From 08ac1eb7832fe99f44b25f192d9931d393a96983 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 2 Apr 2024 08:27:49 -1000
Subject: [PATCH 001/842] Bump ruff and codespell pre-commit checks (#15407)

xref https://github.com/rapidsai/cudf/pull/15345#discussion_r1532379047

Before pursuing migrating isort to ruff, bumping ruff to the latest version

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/15407
---
 .pre-commit-config.yaml                              |  4 ++--
 cpp/include/cudf/io/detail/parquet.hpp               |  4 ++--
 cpp/src/copying/contiguous_split.cu                  |  2 +-
 cpp/src/io/orc/aggregate_orc_metadata.cpp            |  2 +-
 pyproject.toml                                       |  8 +++++---
 python/cudf/benchmarks/common/config.py              |  3 ++-
 python/cudf/cudf/_fuzz_testing/utils.py              |  6 +++---
 python/cudf/cudf/core/buffer/buffer.py               |  2 +-
 python/cudf/cudf/core/buffer/spillable_buffer.py     |  2 +-
 python/cudf/cudf/core/column/__init__.py             |  1 -
 python/cudf/cudf/core/column/methods.py              | 12 ++++--------
 python/cudf/cudf/core/column/string.py               |  6 ++----
 python/cudf/cudf/io/parquet.py                       |  6 +++---
 .../cudf/pandas/scripts/analyze-test-failures.py     |  3 ++-
 .../cudf/pandas/scripts/summarize-test-results.py    |  3 ++-
 python/cudf/cudf/tests/test_index.py                 |  1 +
 python/cudf/cudf/tests/test_monotonic.py             |  1 +
 python/cudf/cudf/tests/test_multiindex.py            |  1 +
 python/cudf/cudf/utils/docutils.py                   |  1 +
 python/cudf/cudf/utils/dtypes.py                     |  2 +-
 20 files changed, 36 insertions(+), 34 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 06fdcb9f761..3e99cf3fa9a 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -113,7 +113,7 @@ repos:
         pass_filenames: false
         verbose: true
   - repo: https://github.com/codespell-project/codespell
-    rev: v2.2.2
+    rev: v2.2.6
     hooks:
       - id: codespell
         additional_dependencies: [tomli]
@@ -129,7 +129,7 @@ repos:
       - id: rapids-dependency-file-generator
         args: ["--clean"]
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.1.13
+    rev: v0.3.4
     hooks:
       - id: ruff
         files: python/.*$
diff --git a/cpp/include/cudf/io/detail/parquet.hpp b/cpp/include/cudf/io/detail/parquet.hpp
index 0b8ee9676de..df870f6f1e4 100644
--- a/cpp/include/cudf/io/detail/parquet.hpp
+++ b/cpp/include/cudf/io/detail/parquet.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -110,7 +110,7 @@ class chunked_reader : private reader {
    * The chunk_read_limit parameter controls the size of the output chunks produces.  If the user
    * specifies 100 MB of data, the reader will attempt to return chunks containing tables that have
    * a total bytes size (over all columns) of 100 MB or less.  This is a soft limit and the code
-   * will not fail if it cannot satisfy the limit.  It will make a best-effort atttempt only.
+   * will not fail if it cannot satisfy the limit.  It will make a best-effort attempt only.
    *
    * The pass_read_limit parameter controls how much temporary memory is used in the process of
    * decoding the file.  The primary contributor to this memory usage is the uncompressed size of
diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu
index 23224d3225d..23bcd344a32 100644
--- a/cpp/src/copying/contiguous_split.cu
+++ b/cpp/src/copying/contiguous_split.cu
@@ -1139,7 +1139,7 @@ struct packed_src_and_dst_pointers {
 
 /**
  * @brief Create an instance of `packed_src_and_dst_pointers` populating destination
- * partitition buffers (if any) from `out_buffers`. In the chunked_pack case
+ * partition buffers (if any) from `out_buffers`. In the chunked_pack case
  * `out_buffers` is empty, and the destination pointer is provided separately
  * to the `copy_partitions` kernel.
  *
diff --git a/cpp/src/io/orc/aggregate_orc_metadata.cpp b/cpp/src/io/orc/aggregate_orc_metadata.cpp
index f5f540bc3a4..d54524f0f0d 100644
--- a/cpp/src/io/orc/aggregate_orc_metadata.cpp
+++ b/cpp/src/io/orc/aggregate_orc_metadata.cpp
@@ -194,7 +194,7 @@ aggregate_orc_metadata::select_stripes(
   } else {
     int64_t count            = 0;
     int64_t stripe_skip_rows = 0;
-    // Iterate all source files, each source file has corelating metadata
+    // Iterate all source files, each source file has correlating metadata
     for (size_t src_file_idx = 0;
          src_file_idx < per_file_metadata.size() && count < rows_to_skip + rows_to_read;
          ++src_file_idx) {
diff --git a/pyproject.toml b/pyproject.toml
index 28eac66c1d6..797b5374cb6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -19,11 +19,14 @@ exclude = [
 skip = "./.git,./.github,./cpp/build,.*egg-info.*,./.mypy_cache,./cpp/tests,./python/cudf/cudf/tests,./java/src/test,./cpp/include/cudf_test/cxxopts.hpp"
 # ignore short words, and typename parameters like OffsetT
 ignore-regex = "\\b(.{1,4}|[A-Z]\\w*T)\\b"
-ignore-words-list = "inout,unparseable,falsy"
+ignore-words-list = "inout,unparseable,falsy,couldn,Couldn"
 builtin = "clear"
 quiet-level = 3
 
 [tool.ruff]
+line-length = 79
+
+[tool.ruff.lint]
 select = ["E", "F", "W", "D201", "D204", "D206", "D207", "D208", "D209", "D210", "D211", "D214", "D215", "D300", "D301", "D403", "D405", "D406", "D407", "D408", "D409", "D410", "D411", "D412", "D414", "D418"]
 ignore = [
     # whitespace before :
@@ -36,9 +39,8 @@ exclude = [
     # TODO: Remove this in a follow-up where we fix __all__.
     "__init__.py",
 ]
-line-length = 79
 
-[tool.ruff.per-file-ignores]
+[tool.ruff.lint.per-file-ignores]
 # Lots of pytest implicitly injected attributes in conftest-patch.py
 "python/cudf/cudf/pandas/scripts/conftest-patch.py" = ["F821"]
 "python/cudf/cudf/pandas/scripts/*" = ["D"]
diff --git a/python/cudf/benchmarks/common/config.py b/python/cudf/benchmarks/common/config.py
index 305a21d0a29..c1e9d4d6116 100644
--- a/python/cudf/benchmarks/common/config.py
+++ b/python/cudf/benchmarks/common/config.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 
 """Module used for global configuration of benchmarks.
 
@@ -20,6 +20,7 @@
 in this file and import them in conftest.py to ensure that they are handled
 appropriately.
 """
+
 import os
 import sys
 
diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py
index 6e53195ac2d..d685174f3c2 100644
--- a/python/cudf/cudf/_fuzz_testing/utils.py
+++ b/python/cudf/cudf/_fuzz_testing/utils.py
@@ -99,9 +99,9 @@ def _generate_rand_meta(obj, dtypes_list, null_frequency_override=None):
                     low=1, high=10
                 )
             else:
-                meta[
-                    "max_types_at_each_level"
-                ] = obj._max_struct_types_at_each_level
+                meta["max_types_at_each_level"] = (
+                    obj._max_struct_types_at_each_level
+                )
 
         elif dtype == "decimal64":
             meta["max_precision"] = cudf.Decimal64Dtype.MAX_PRECISION
diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py
index 8d278c9c065..1631fa00412 100644
--- a/python/cudf/cudf/core/buffer/buffer.py
+++ b/python/cudf/cudf/core/buffer/buffer.py
@@ -181,7 +181,7 @@ def _from_host_memory(cls, data: Any) -> Self:
         Parameters
         ----------
         data : Any
-            An object that represens host memory.
+            An object that represents host memory.
 
         Returns
         -------
diff --git a/python/cudf/cudf/core/buffer/spillable_buffer.py b/python/cudf/cudf/core/buffer/spillable_buffer.py
index b25af13679c..a9569190e75 100644
--- a/python/cudf/cudf/core/buffer/spillable_buffer.py
+++ b/python/cudf/cudf/core/buffer/spillable_buffer.py
@@ -154,7 +154,7 @@ def _from_host_memory(cls, data: Any) -> Self:
         Parameters
         ----------
         data : Any
-            An object that represens host memory.
+            An object that represents host memory.
 
         Returns
         -------
diff --git a/python/cudf/cudf/core/column/__init__.py b/python/cudf/cudf/core/column/__init__.py
index 2a46654ccc2..e7119fcdf47 100644
--- a/python/cudf/cudf/core/column/__init__.py
+++ b/python/cudf/cudf/core/column/__init__.py
@@ -4,7 +4,6 @@
 isort: skip_file
 """
 
-
 from cudf.core.column.categorical import CategoricalColumn
 from cudf.core.column.column import (
     ColumnBase,
diff --git a/python/cudf/cudf/core/column/methods.py b/python/cudf/cudf/core/column/methods.py
index 0f5a0eb086b..e827c7a3dd3 100644
--- a/python/cudf/cudf/core/column/methods.py
+++ b/python/cudf/cudf/core/column/methods.py
@@ -26,8 +26,7 @@ def _return_or_inplace(
         inplace: Literal[True],
         expand: bool = False,
         retain_index: bool = True,
-    ) -> None:
-        ...
+    ) -> None: ...
 
     @overload
     def _return_or_inplace(
@@ -36,8 +35,7 @@ def _return_or_inplace(
         inplace: Literal[False],
         expand: bool = False,
         retain_index: bool = True,
-    ) -> ParentType:
-        ...
+    ) -> ParentType: ...
 
     @overload
     def _return_or_inplace(
@@ -45,8 +43,7 @@ def _return_or_inplace(
         new_col,
         expand: bool = False,
         retain_index: bool = True,
-    ) -> ParentType:
-        ...
+    ) -> ParentType: ...
 
     @overload
     def _return_or_inplace(
@@ -55,8 +52,7 @@ def _return_or_inplace(
         inplace: bool = False,
         expand: bool = False,
         retain_index: bool = True,
-    ) -> Optional[ParentType]:
-        ...
+    ) -> Optional[ParentType]: ...
 
     def _return_or_inplace(
         self, new_col, inplace=False, expand=False, retain_index=True
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index fb76fcdaf39..06d7aa030db 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -257,14 +257,12 @@ def byte_count(self) -> SeriesOrIndex:
     @overload
     def cat(
         self, sep: Optional[str] = None, na_rep: Optional[str] = None
-    ) -> str:
-        ...
+    ) -> str: ...
 
     @overload
     def cat(
         self, others, sep: Optional[str] = None, na_rep: Optional[str] = None
-    ) -> Union[SeriesOrIndex, "cudf.core.column.string.StringColumn"]:
-        ...
+    ) -> Union[SeriesOrIndex, "cudf.core.column.string.StringColumn"]: ...
 
     def cat(self, others=None, sep=None, na_rep=None):
         """
diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
index bead9c352ef..e55898de675 100644
--- a/python/cudf/cudf/io/parquet.py
+++ b/python/cudf/cudf/io/parquet.py
@@ -1220,9 +1220,9 @@ def __init__(
     ) -> None:
         if isinstance(path, str) and path.startswith("s3://"):
             self.fs_meta = {"is_s3": True, "actual_path": path}
-            self.dir_: Optional[
-                tempfile.TemporaryDirectory
-            ] = tempfile.TemporaryDirectory()
+            self.dir_: Optional[tempfile.TemporaryDirectory] = (
+                tempfile.TemporaryDirectory()
+            )
             self.path = self.dir_.name
         else:
             self.fs_meta = {}
diff --git a/python/cudf/cudf/pandas/scripts/analyze-test-failures.py b/python/cudf/cudf/pandas/scripts/analyze-test-failures.py
index f1744c9e92b..8870fbc5c28 100644
--- a/python/cudf/cudf/pandas/scripts/analyze-test-failures.py
+++ b/python/cudf/cudf/pandas/scripts/analyze-test-failures.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES.
 # All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
@@ -11,6 +11,7 @@
 Example:
     python analyze-test-failures.py log.json frame/*
 """
+
 import json
 import sys
 from collections import Counter
diff --git a/python/cudf/cudf/pandas/scripts/summarize-test-results.py b/python/cudf/cudf/pandas/scripts/summarize-test-results.py
index bfc56319d82..ffd2abb960d 100644
--- a/python/cudf/cudf/pandas/scripts/summarize-test-results.py
+++ b/python/cudf/cudf/pandas/scripts/summarize-test-results.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES.
 # All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
@@ -10,6 +10,7 @@
     python summarize-test-results.py log.json --output json
     python summarize-test-results.py log.json --output table
 """
+
 import argparse
 import json
 
diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py
index 05213d7601c..ebbca57bd40 100644
--- a/python/cudf/cudf/tests/test_index.py
+++ b/python/cudf/cudf/tests/test_index.py
@@ -3,6 +3,7 @@
 """
 Test related to Index
 """
+
 import datetime
 import operator
 import re
diff --git a/python/cudf/cudf/tests/test_monotonic.py b/python/cudf/cudf/tests/test_monotonic.py
index 53919a95115..3c627a5fe89 100644
--- a/python/cudf/cudf/tests/test_monotonic.py
+++ b/python/cudf/cudf/tests/test_monotonic.py
@@ -4,6 +4,7 @@
 Tests related to is_unique, is_monotonic_increasing &
 is_monotonic_decreasing attributes
 """
+
 import numpy as np
 import pandas as pd
 import pytest
diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py
index 4926d79e734..76a82afb78e 100644
--- a/python/cudf/cudf/tests/test_multiindex.py
+++ b/python/cudf/cudf/tests/test_multiindex.py
@@ -3,6 +3,7 @@
 """
 Test related to MultiIndex
 """
+
 import datetime
 import itertools
 import operator
diff --git a/python/cudf/cudf/utils/docutils.py b/python/cudf/cudf/utils/docutils.py
index 68447f423a4..4136d97d69f 100644
--- a/python/cudf/cudf/utils/docutils.py
+++ b/python/cudf/cudf/utils/docutils.py
@@ -3,6 +3,7 @@
 """
 Helper functions for parameterized docstring
 """
+
 import functools
 import re
 import string
diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py
index e9dbc23d767..8521239413e 100644
--- a/python/cudf/cudf/utils/dtypes.py
+++ b/python/cudf/cudf/utils/dtypes.py
@@ -587,7 +587,7 @@ def find_common_type(dtypes):
 def _dtype_pandas_compatible(dtype):
     """
     A utility function, that returns `str` instead of `object`
-    dtype when pandas comptibility mode is enabled.
+    dtype when pandas compatibility mode is enabled.
     """
     if cudf.get_option("mode.pandas_compatible") and dtype == cudf.dtype("O"):
         return "str"

From 08d86c92b3e3ccd950e4d63033d44675510cbb74 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Tue, 2 Apr 2024 12:29:43 -0700
Subject: [PATCH 002/842] Fix errors in chunked ORC writer when no tables were
 (successfully) written (#15393)

Closes https://github.com/rapidsai/cudf/issues/15386, https://github.com/rapidsai/cudf/issues/15387

The fixes for the two issues overlap, so I included both in a single PR.

Expanded the `_closed` flag to an enum that tracks if the operations in `close()` should be performed (one or more tables were written to the sink). This way, we don't perform the steps in close when there is no valid file to write the footer for.
This includes:

- No `write` calls;
- All `write` calls failed;

The new enum replaces `skip_close()` that used to fix this issue for a smaller subset of cases.

Additionally, writing of the ORC header has been moved after the encode and uses the new state to only write the header in the first `write` call. This way we don't write anything to the sink if there were no `write` calls with the writer, and if the encode failed in the `write`s.

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/15393
---
 cpp/include/cudf/io/detail/orc.hpp |  8 -----
 cpp/src/io/functions.cpp           | 11 +-----
 cpp/src/io/orc/writer_impl.cu      | 29 +++++++--------
 cpp/src/io/orc/writer_impl.hpp     | 20 +++++------
 cpp/tests/io/orc_test.cpp          | 58 +++++++++++++++++++++++++++---
 5 files changed, 79 insertions(+), 47 deletions(-)

diff --git a/cpp/include/cudf/io/detail/orc.hpp b/cpp/include/cudf/io/detail/orc.hpp
index 3c1486b60c2..c63c952e148 100644
--- a/cpp/include/cudf/io/detail/orc.hpp
+++ b/cpp/include/cudf/io/detail/orc.hpp
@@ -124,14 +124,6 @@ class writer {
    * @brief Finishes the chunked/streamed write process.
    */
   void close();
-
-  /**
-   * @brief Skip work done in `close()`; should be called if `write()` failed.
-   *
-   * Calling skip_close() prevents the writer from writing the (invalid) file footer and the
-   * postscript.
-   */
-  void skip_close();
 };
 }  // namespace orc::detail
 }  // namespace cudf::io
diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp
index b8353d312fe..46c6c67c8df 100644
--- a/cpp/src/io/functions.cpp
+++ b/cpp/src/io/functions.cpp
@@ -436,16 +436,7 @@ void write_orc(orc_writer_options const& options, rmm::cuda_stream_view stream)
 
   auto writer = std::make_unique<orc::detail::writer>(
     std::move(sinks[0]), options, io_detail::single_write_mode::YES, stream);
-  try {
-    writer->write(options.get_table());
-  } catch (...) {
-    // If an exception is thrown, the output is incomplete/corrupted.
-    // Make sure the writer will not close with such corrupted data.
-    // In addition, the writer may throw an exception while trying to close, which would terminate
-    // the process.
-    writer->skip_close();
-    throw;
-  }
+  writer->write(options.get_table());
 }
 
 /**
diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu
index ade0e75de35..750a593920c 100644
--- a/cpp/src/io/orc/writer_impl.cu
+++ b/cpp/src/io/orc/writer_impl.cu
@@ -2438,7 +2438,6 @@ writer::impl::impl(std::unique_ptr<data_sink> sink,
   if (options.get_metadata()) {
     _table_meta = std::make_unique<table_input_metadata>(*options.get_metadata());
   }
-  init_state();
 }
 
 writer::impl::impl(std::unique_ptr<data_sink> sink,
@@ -2460,20 +2459,13 @@ writer::impl::impl(std::unique_ptr<data_sink> sink,
   if (options.get_metadata()) {
     _table_meta = std::make_unique<table_input_metadata>(*options.get_metadata());
   }
-  init_state();
 }
 
 writer::impl::~impl() { close(); }
 
-void writer::impl::init_state()
-{
-  // Write file header
-  _out_sink->host_write(MAGIC, std::strlen(MAGIC));
-}
-
 void writer::impl::write(table_view const& input)
 {
-  CUDF_EXPECTS(not _closed, "Data has already been flushed to out and closed");
+  CUDF_EXPECTS(_state != writer_state::CLOSED, "Data has already been flushed to out and closed");
 
   if (not _table_meta) { _table_meta = make_table_meta(input); }
 
@@ -2516,6 +2508,11 @@ void writer::impl::write(table_view const& input)
     }
   }();
 
+  if (_state == writer_state::NO_DATA_WRITTEN) {
+    // Write the ORC file header if this is the first write
+    _out_sink->host_write(MAGIC, std::strlen(MAGIC));
+  }
+
   // Compression/encoding were all successful. Now write the intermediate results.
   write_orc_data_to_sink(enc_data,
                          segmentation,
@@ -2533,6 +2530,8 @@ void writer::impl::write(table_view const& input)
 
   // Update file-level and compression statistics
   update_statistics(orc_table.num_rows(), std::move(intermediate_stats), compression_stats);
+
+  _state = writer_state::DATA_WRITTEN;
 }
 
 void writer::impl::update_statistics(
@@ -2683,8 +2682,11 @@ void writer::impl::add_table_to_footer_data(orc_table_view const& orc_table,
 
 void writer::impl::close()
 {
-  if (_closed) { return; }
-  _closed = true;
+  if (_state != writer_state::DATA_WRITTEN) {
+    // writer is either closed or no data has been written
+    _state = writer_state::CLOSED;
+    return;
+  }
   PostScript ps;
 
   if (_stats_freq != statistics_freq::STATISTICS_NONE) {
@@ -2769,6 +2771,8 @@ void writer::impl::close()
   pbw.put_byte(ps_length);
   _out_sink->host_write(pbw.data(), pbw.size());
   _out_sink->flush();
+
+  _state = writer_state::CLOSED;
 }
 
 // Forward to implementation
@@ -2795,9 +2799,6 @@ writer::~writer() = default;
 // Forward to implementation
 void writer::write(table_view const& table) { _impl->write(table); }
 
-// Forward to implementation
-void writer::skip_close() { _impl->skip_close(); }
-
 // Forward to implementation
 void writer::close() { _impl->close(); }
 
diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp
index 417d29efb58..bd082befe0c 100644
--- a/cpp/src/io/orc/writer_impl.hpp
+++ b/cpp/src/io/orc/writer_impl.hpp
@@ -227,6 +227,14 @@ struct encoded_footer_statistics {
   std::vector<ColStatsBlob> file_level;
 };
 
+enum class writer_state {
+  NO_DATA_WRITTEN,  // No table data has been written to the sink; if the writer is closed or
+                    // destroyed in this state, it should not write the footer.
+  DATA_WRITTEN,     // At least one table has been written to the sink; when the writer is closed,
+                    // it should write the footer.
+  CLOSED            // Writer has been closed; no further writes are allowed.
+};
+
 /**
  * @brief Implementation for ORC writer
  */
@@ -266,11 +274,6 @@ class writer::impl {
    */
   ~impl();
 
-  /**
-   * @brief Begins the chunked/streamed write process.
-   */
-  void init_state();
-
   /**
    * @brief Writes a single subtable as part of a larger ORC file/table write.
    *
@@ -283,11 +286,6 @@ class writer::impl {
    */
   void close();
 
-  /**
-   * @brief Skip writing the footer when closing/deleting the writer.
-   */
-  void skip_close() { _closed = true; }
-
  private:
   /**
    * @brief Write the intermediate ORC data into the data sink.
@@ -363,7 +361,7 @@ class writer::impl {
   Footer _footer;
   Metadata _orc_meta;
   persisted_statistics _persisted_stripe_statistics;  // Statistics data saved between calls.
-  bool _closed = false;  // To track if the output has been written to sink.
+  writer_state _state = writer_state::NO_DATA_WRITTEN;
 };
 
 }  // namespace cudf::io::orc::detail
diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp
index 24e2e2cfea0..e108e68e1f9 100644
--- a/cpp/tests/io/orc_test.cpp
+++ b/cpp/tests/io/orc_test.cpp
@@ -28,6 +28,7 @@
 #include <cudf/concatenate.hpp>
 #include <cudf/copying.hpp>
 #include <cudf/detail/iterator.cuh>
+#include <cudf/io/data_sink.hpp>
 #include <cudf/io/orc.hpp>
 #include <cudf/io/orc_metadata.hpp>
 #include <cudf/strings/strings_column_view.hpp>
@@ -2100,8 +2101,7 @@ TEST_F(OrcWriterTest, BounceBufferBug)
   auto sequence = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 100; });
 
   constexpr auto num_rows = 150000;
-  column_wrapper<int8_t, typename decltype(sequence)::value_type> col(sequence,
-                                                                      sequence + num_rows);
+  column_wrapper<int8_t> col(sequence, sequence + num_rows);
   table_view expected({col});
 
   auto filepath = temp_env->get_temp_filepath("BounceBufferBug.orc");
@@ -2120,8 +2120,7 @@ TEST_F(OrcReaderTest, SizeTypeRowsOverflow)
   static_assert(total_rows > std::numeric_limits<cudf::size_type>::max());
 
   auto sequence = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 127; });
-  column_wrapper<int8_t, typename decltype(sequence)::value_type> col(sequence,
-                                                                      sequence + num_rows);
+  column_wrapper<int8_t> col(sequence, sequence + num_rows);
   table_view chunk_table({col});
 
   std::vector<char> out_buffer;
@@ -2169,4 +2168,55 @@ TEST_F(OrcReaderTest, SizeTypeRowsOverflow)
   CUDF_TEST_EXPECT_TABLES_EQUAL(expected, got_with_stripe_selection->view());
 }
 
+TEST_F(OrcChunkedWriterTest, NoWriteCloseNotThrow)
+{
+  std::vector<char> out_buffer;
+
+  cudf::io::chunked_orc_writer_options write_opts =
+    cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info{&out_buffer});
+  auto writer = cudf::io::orc_chunked_writer(write_opts);
+
+  EXPECT_NO_THROW(writer.close());
+}
+
+TEST_F(OrcChunkedWriterTest, FailedWriteCloseNotThrow)
+{
+  // A sink that throws on write()
+  class throw_sink : public cudf::io::data_sink {
+   public:
+    void host_write(void const* data, size_t size) override { throw std::runtime_error("write"); }
+    void flush() override {}
+    size_t bytes_written() override { return 0; }
+  };
+
+  auto sequence = thrust::make_counting_iterator(0);
+  column_wrapper<int8_t> col(sequence, sequence + 10);
+  table_view table({col});
+
+  throw_sink sink;
+  cudf::io::chunked_orc_writer_options write_opts =
+    cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info{&sink});
+  auto writer = cudf::io::orc_chunked_writer(write_opts);
+
+  try {
+    writer.write(table);
+  } catch (...) {
+    // ignore the exception; we're testing that close() doesn't throw when the only write() fails
+  }
+
+  EXPECT_NO_THROW(writer.close());
+}
+
+TEST_F(OrcChunkedWriterTest, NoDataInSinkWhenNoWrite)
+{
+  std::vector<char> out_buffer;
+
+  cudf::io::chunked_orc_writer_options write_opts =
+    cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info{&out_buffer});
+  auto writer = cudf::io::orc_chunked_writer(write_opts);
+
+  EXPECT_NO_THROW(writer.close());
+  EXPECT_EQ(out_buffer.size(), 0);
+}
+
 CUDF_TEST_PROGRAM_MAIN()

From 13a5c7be33bec538a9f81872471c29796e67bce5 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 2 Apr 2024 16:54:09 -0400
Subject: [PATCH 003/842] Rework cudf::replace_nulls to use
 strings::detail::copy_if_else (#15286)

Removes the specialized kernels for strings in `cudf::replace_nulls` and replaces them with a call to `cudf::strings::detail::copy_if_else` which is already enabled with offsetalator support and optimized for long strings.
This will also allow `cudf::replace_nulls` to use large strings with no further changes.
Also includes a `replace_nulls` benchmark for strings.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/15286
---
 cpp/benchmarks/CMakeLists.txt    |   3 +-
 cpp/benchmarks/replace/nulls.cpp |  59 ++++++++++++++
 cpp/src/replace/nulls.cu         | 127 +++++--------------------------
 3 files changed, 79 insertions(+), 110 deletions(-)
 create mode 100644 cpp/benchmarks/replace/nulls.cpp

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index c82e475dece..798e4e76141 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -208,8 +208,9 @@ ConfigureNVBench(
 )
 
 # ##################################################################################################
-# * reduction benchmark ---------------------------------------------------------------------------
+# * replace benchmark ---------------------------------------------------------------------------
 ConfigureBench(REPLACE_BENCH replace/clamp.cpp replace/nans.cpp)
+ConfigureNVBench(REPLACE_NVBENCH replace/nulls.cpp)
 
 # ##################################################################################################
 # * filling benchmark -----------------------------------------------------------------------------
diff --git a/cpp/benchmarks/replace/nulls.cpp b/cpp/benchmarks/replace/nulls.cpp
new file mode 100644
index 00000000000..ccd00050789
--- /dev/null
+++ b/cpp/benchmarks/replace/nulls.cpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+
+#include <cudf/column/column.hpp>
+#include <cudf/column/column_view.hpp>
+#include <cudf/replace.hpp>
+#include <cudf/scalar/scalar_factories.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf/table/table.hpp>
+#include <cudf/types.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+static void replace_nulls(nvbench::state& state)
+{
+  auto const n_rows    = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const max_width = static_cast<int32_t>(state.get_int64("row_width"));
+
+  if (static_cast<std::size_t>(n_rows) * static_cast<std::size_t>(max_width) >=
+      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
+    state.skip("Skip benchmarks greater than size_type limit");
+  }
+
+  data_profile const table_profile = data_profile_builder().distribution(
+    cudf::type_id::STRING, distribution_id::NORMAL, 0, max_width);
+
+  auto const input_table = create_random_table(
+    {cudf::type_id::STRING, cudf::type_id::STRING}, row_count{n_rows}, table_profile);
+  auto const input = input_table->view().column(0);
+  auto const repl  = input_table->view().column(1);
+
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
+  auto chars_size = cudf::strings_column_view(input).chars_size(cudf::get_default_stream());
+  state.add_global_memory_reads<nvbench::int8_t>(chars_size);  // all bytes are read;
+  state.add_global_memory_writes<nvbench::int8_t>(chars_size);
+
+  state.exec(nvbench::exec_tag::sync,
+             [&](nvbench::launch& launch) { auto result = cudf::replace_nulls(input, repl); });
+}
+
+NVBENCH_BENCH(replace_nulls)
+  .set_name("replace_nulls")
+  .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048})
+  .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216});
diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu
index 014171f2b40..299cdc6a160 100644
--- a/cpp/src/replace/nulls.cu
+++ b/cpp/src/replace/nulls.cu
@@ -32,8 +32,8 @@
 #include <cudf/null_mask.hpp>
 #include <cudf/replace.hpp>
 #include <cudf/scalar/scalar.hpp>
+#include <cudf/strings/detail/copy_if_else.cuh>
 #include <cudf/strings/detail/replace.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
@@ -56,63 +56,6 @@ namespace {  // anonymous
 
 static constexpr int BLOCK_SIZE = 256;
 
-template <int phase, bool replacement_has_nulls>
-CUDF_KERNEL void replace_nulls_strings(cudf::column_device_view input,
-                                       cudf::column_device_view replacement,
-                                       cudf::bitmask_type* output_valid,
-                                       cudf::size_type* offsets,
-                                       char* chars,
-                                       cudf::size_type* valid_counter)
-{
-  cudf::size_type nrows = input.size();
-  auto i                = cudf::detail::grid_1d::global_thread_id();
-  auto const stride     = cudf::detail::grid_1d::grid_stride();
-
-  uint32_t active_mask = 0xffff'ffff;
-  active_mask          = __ballot_sync(active_mask, i < nrows);
-  auto const lane_id{threadIdx.x % cudf::detail::warp_size};
-  uint32_t valid_sum{0};
-
-  while (i < nrows) {
-    bool input_is_valid  = input.is_valid_nocheck(i);
-    bool output_is_valid = true;
-
-    if (replacement_has_nulls && !input_is_valid) {
-      output_is_valid = replacement.is_valid_nocheck(i);
-    }
-
-    cudf::string_view out;
-    if (input_is_valid) {
-      out = input.element<cudf::string_view>(i);
-    } else if (output_is_valid) {
-      out = replacement.element<cudf::string_view>(i);
-    }
-
-    bool nonzero_output = (input_is_valid || output_is_valid);
-
-    if (phase == 0) {
-      offsets[i]       = nonzero_output ? out.size_bytes() : 0;
-      uint32_t bitmask = __ballot_sync(active_mask, output_is_valid);
-      if (0 == lane_id) {
-        output_valid[cudf::word_index(i)] = bitmask;
-        valid_sum += __popc(bitmask);
-      }
-    } else if (phase == 1) {
-      if (nonzero_output) std::memcpy(chars + offsets[i], out.data(), out.size_bytes());
-    }
-
-    i += stride;
-    active_mask = __ballot_sync(active_mask, i < nrows);
-  }
-
-  // Compute total valid count for this block and add it to global count
-  uint32_t block_valid_count = cudf::detail::single_lane_block_sum_reduce<BLOCK_SIZE, 0>(valid_sum);
-  // one thread computes and adds to output_valid_count
-  if (threadIdx.x == 0) {
-    atomicAdd(valid_counter, static_cast<cudf::size_type>(block_valid_count));
-  }
-}
-
 template <typename Type, bool replacement_has_nulls>
 CUDF_KERNEL void replace_nulls(cudf::column_device_view input,
                                cudf::column_device_view replacement,
@@ -222,58 +165,24 @@ std::unique_ptr<cudf::column> replace_nulls_column_kernel_forwarder::operator()<
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr)
 {
-  rmm::device_scalar<cudf::size_type> valid_counter(0, stream);
-  cudf::size_type* valid_count = valid_counter.data();
-
-  auto replace_first  = replace_nulls_strings<0, false>;
-  auto replace_second = replace_nulls_strings<1, false>;
-  if (replacement.has_nulls()) {
-    replace_first  = replace_nulls_strings<0, true>;
-    replace_second = replace_nulls_strings<1, true>;
+  auto d_input       = cudf::column_device_view::create(input, stream);
+  auto d_replacement = cudf::column_device_view::create(replacement, stream);
+
+  auto lhs_iter =
+    cudf::detail::make_optional_iterator<cudf::string_view>(*d_input, cudf::nullate::YES{});
+  auto rhs_iter = cudf::detail::make_optional_iterator<cudf::string_view>(
+    *d_replacement, cudf::nullate::DYNAMIC{replacement.nullable()});
+
+  auto filter = cudf::detail::validity_accessor<false>{*d_input};
+  auto result = cudf::strings::detail::copy_if_else(
+    lhs_iter, lhs_iter + input.size(), rhs_iter, filter, stream, mr);
+
+  // input is nullable so result should always be nullable here
+  if (!result->nullable()) {
+    result->set_null_mask(
+      cudf::detail::create_null_mask(input.size(), cudf::mask_state::ALL_VALID, stream, mr), 0);
   }
-
-  // Create new offsets column to use in kernel
-  std::unique_ptr<cudf::column> sizes = cudf::make_numeric_column(
-    cudf::data_type(cudf::type_id::INT32), input.size(), cudf::mask_state::UNALLOCATED, stream);
-
-  auto sizes_view         = sizes->mutable_view();
-  auto device_in          = cudf::column_device_view::create(input, stream);
-  auto device_replacement = cudf::column_device_view::create(replacement, stream);
-
-  rmm::device_buffer valid_bits =
-    cudf::detail::create_null_mask(input.size(), cudf::mask_state::UNINITIALIZED, stream, mr);
-
-  // Call first pass kernel to get sizes in offsets
-  cudf::detail::grid_1d grid{input.size(), BLOCK_SIZE, 1};
-  replace_first<<<grid.num_blocks, BLOCK_SIZE, 0, stream.value()>>>(
-    *device_in,
-    *device_replacement,
-    reinterpret_cast<cudf::bitmask_type*>(valid_bits.data()),
-    sizes_view.begin<cudf::size_type>(),
-    nullptr,
-    valid_count);
-
-  auto [offsets, bytes] = cudf::detail::make_offsets_child_column(
-    sizes_view.begin<int32_t>(), sizes_view.end<int32_t>(), stream, mr);
-
-  auto offsets_view = offsets->mutable_view();
-
-  // Allocate chars array and output null mask
-  rmm::device_uvector<char> output_chars(bytes, stream, mr);
-
-  replace_second<<<grid.num_blocks, BLOCK_SIZE, 0, stream.value()>>>(
-    *device_in,
-    *device_replacement,
-    reinterpret_cast<cudf::bitmask_type*>(valid_bits.data()),
-    offsets_view.begin<cudf::size_type>(),
-    output_chars.data(),
-    valid_count);
-
-  return cudf::make_strings_column(input.size(),
-                                   std::move(offsets),
-                                   output_chars.release(),
-                                   input.size() - valid_counter.value(stream),
-                                   std::move(valid_bits));
+  return result;
 }
 
 template <>

From 2584fd9d1e1fffb2aefd0417ba0994d7a563e076 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Tue, 2 Apr 2024 16:39:46 -0700
Subject: [PATCH 004/842] Test static builds in CI and fix nanoarrow configure
 (#15437)

Resolves #15275
Resolves #15434

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Jake Awe (https://github.com/AyodeAwe)
  - Robert Maynard (https://github.com/robertmaynard)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/15437
---
 .github/workflows/pr.yaml                     |  11 ++
 .github/workflows/test.yaml                   |  10 ++
 ci/configure_cpp_static.sh                    |  23 +++
 cpp/cmake/thirdparty/get_nanoarrow.cmake      |  20 +++
 .../thirdparty/patches/nanoarrow_cmake.diff   | 161 ++++++++++++++++++
 dependencies.yaml                             |  18 +-
 6 files changed, 239 insertions(+), 4 deletions(-)
 create mode 100755 ci/configure_cpp_static.sh
 create mode 100644 cpp/cmake/thirdparty/patches/nanoarrow_cmake.diff

diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 303988212d3..2d7ebb62fa8 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -20,6 +20,7 @@ jobs:
       - conda-python-cudf-tests
       - conda-python-other-tests
       - conda-java-tests
+      - static-configure
       - conda-notebook-tests
       - docs-build
       - wheel-build-cudf
@@ -88,6 +89,16 @@ jobs:
       arch: "amd64"
       container_image: "rapidsai/ci-conda:latest"
       run_script: "ci/test_java.sh"
+  static-configure:
+    needs: checks
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06
+    with:
+      build_type: pull-request
+      # Use the wheel container so we can skip conda solves and since our
+      # primary static consumers (Spark) are not in conda anyway.
+      container_image: "rapidsai/ci-wheel:latest"
+      run_script: "ci/configure_cpp_static.sh"
   conda-notebook-tests:
     needs: conda-python-build
     secrets: inherit
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 6f7aef79881..ea47b6ad466 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -43,6 +43,16 @@ jobs:
       arch: "amd64"
       container_image: "rapidsai/ci-conda:latest"
       run_script: "ci/test_cpp_memcheck.sh"
+  static-configure:
+    needs: checks
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06
+    with:
+      build_type: pull-request
+      # Use the wheel container so we can skip conda solves and since our
+      # primary static consumers (Spark) are not in conda anyway.
+      container_image: "rapidsai/ci-wheel:latest"
+      run_script: "ci/configure_cpp_static.sh"
   conda-python-cudf-tests:
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.06
diff --git a/ci/configure_cpp_static.sh b/ci/configure_cpp_static.sh
new file mode 100755
index 00000000000..675e0c3981f
--- /dev/null
+++ b/ci/configure_cpp_static.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+set -euo pipefail
+
+rapids-configure-conda-channels
+
+source rapids-date-string
+
+rapids-logger "Configure static cpp build"
+
+ENV_YAML_DIR="$(mktemp -d)"
+REQUIREMENTS_FILE="${ENV_YAML_DIR}/requirements.txt"
+
+rapids-dependency-file-generator \
+  --output requirements \
+  --file_key test_static_build \
+  --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch)" | tee "${REQUIREMENTS_FILE}"
+
+python -m pip install -r "${REQUIREMENTS_FILE}"
+pyenv rehash
+
+cmake -S cpp -B build_static -GNinja -DBUILD_SHARED_LIBS=OFF -DBUILD_TESTS=OFF
diff --git a/cpp/cmake/thirdparty/get_nanoarrow.cmake b/cpp/cmake/thirdparty/get_nanoarrow.cmake
index be938a89ccd..4316db99a8d 100644
--- a/cpp/cmake/thirdparty/get_nanoarrow.cmake
+++ b/cpp/cmake/thirdparty/get_nanoarrow.cmake
@@ -17,6 +17,25 @@ function(find_and_configure_nanoarrow)
   set(oneValueArgs VERSION FORK PINNED_TAG)
   cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
 
+  # Only run if PKG_VERSION is < 0.5.0
+  if(PKG_VERSION VERSION_LESS 0.5.0)
+    set(patch_files_to_run "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/patches/nanoarrow_cmake.diff")
+    set(patch_issues_to_ref
+        "Fix issues with nanoarrow CMake [https://github.com/apache/arrow-nanoarrow/pull/406]"
+    )
+    set(patch_script "${CMAKE_BINARY_DIR}/rapids-cmake/patches/nanoarrow/patch.cmake")
+    set(log_file "${CMAKE_BINARY_DIR}/rapids-cmake/patches/nanoarrow/log")
+    string(TIMESTAMP current_year "%Y" UTC)
+    configure_file(
+      ${rapids-cmake-dir}/cpm/patches/command_template.cmake.in "${patch_script}" @ONLY
+    )
+  else()
+    message(
+      FATAL_ERROR
+        "Nanoarrow version ${PKG_VERSION} already contains the necessary patch. Please remove this patch from cudf."
+    )
+  endif()
+
   rapids_cpm_find(
     nanoarrow ${PKG_VERSION}
     GLOBAL_TARGETS nanoarrow
@@ -26,6 +45,7 @@ function(find_and_configure_nanoarrow)
     # TODO: Commit hashes are not supported with shallow clones. Can switch this if and when we pin
     # to an actual tag.
     GIT_SHALLOW FALSE
+    PATCH_COMMAND ${CMAKE_COMMAND} -P ${patch_script}
     OPTIONS "BUILD_SHARED_LIBS OFF" "NANOARROW_NAMESPACE cudf"
   )
   set_target_properties(nanoarrow PROPERTIES POSITION_INDEPENDENT_CODE ON)
diff --git a/cpp/cmake/thirdparty/patches/nanoarrow_cmake.diff b/cpp/cmake/thirdparty/patches/nanoarrow_cmake.diff
new file mode 100644
index 00000000000..b53e134ed2c
--- /dev/null
+++ b/cpp/cmake/thirdparty/patches/nanoarrow_cmake.diff
@@ -0,0 +1,161 @@
+diff --git a/CMakeLists.txt b/CMakeLists.txt
+index 8714c70..1feec13 100644
+--- a/CMakeLists.txt
++++ b/CMakeLists.txt
+@@ -49,7 +49,6 @@ else()
+ endif()
+
+ option(NANOARROW_CODE_COVERAGE "Enable coverage reporting" OFF)
+-add_library(coverage_config INTERFACE)
+
+ # Avoids a warning about timestamps on downloaded files (prefer new policy
+ # if available))
+@@ -111,6 +110,8 @@ if(NANOARROW_BUNDLE)
+   if(NANOARROW_BUILD_TESTS)
+     include_directories(${CMAKE_BINARY_DIR}/amalgamation)
+     add_library(nanoarrow ${NANOARROW_C_TEMP})
++    add_library(nanoarrow::nanoarrow ALIAS nanoarrow)
++
+     target_compile_definitions(nanoarrow PUBLIC "$<$<CONFIG:Debug>:NANOARROW_DEBUG>")
+   endif()
+
+@@ -120,6 +121,7 @@ if(NANOARROW_BUNDLE)
+ else()
+   add_library(nanoarrow src/nanoarrow/array.c src/nanoarrow/schema.c
+                         src/nanoarrow/array_stream.c src/nanoarrow/utils.c)
++  add_library(nanoarrow::nanoarrow ALIAS nanoarrow)
+
+   target_include_directories(nanoarrow
+                              PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src>
+@@ -154,13 +156,50 @@ else()
+     endif()
+   endif()
+
+-  install(TARGETS nanoarrow DESTINATION lib)
++  install(TARGETS nanoarrow
++          DESTINATION lib
++          EXPORT nanoarrow-exports)
+   install(DIRECTORY src/
+           DESTINATION include
+           FILES_MATCHING
+-          PATTERN "*.h")
++          PATTERN "*.h*")
+   install(FILES ${CMAKE_CURRENT_BINARY_DIR}/generated/nanoarrow_config.h
+           DESTINATION include/nanoarrow)
++
++  # Generate package files for the build and install trees.
++  include(CMakePackageConfigHelpers)
++  include(GNUInstallDirs)
++
++  foreach(tree_type BUILD INSTALL)
++    if(tree_type STREQUAL "BUILD")
++      set(install_location ".")
++    else()
++      set(install_location "${CMAKE_INSTALL_LIBDIR}/cmake/nanoarrow")
++    endif()
++
++    set(build_location "${PROJECT_BINARY_DIR}/${install_location}")
++    write_basic_package_version_file(
++      "${build_location}/nanoarrow-config-version.cmake"
++      VERSION ${nanoarrow_VERSION}
++      # After 1.0.0, we can use `SameMajorVersion` here.
++      COMPATIBILITY ExactVersion)
++    configure_package_config_file("${CMAKE_CURRENT_LIST_DIR}/cmake/config.cmake.in"
++                                  "${build_location}/nanoarrow-config.cmake"
++                                  INSTALL_DESTINATION "${install_location}")
++
++    if(tree_type STREQUAL "BUILD")
++      export(EXPORT nanoarrow-exports
++             FILE "${build_location}/nanoarrow-targets.cmake"
++             NAMESPACE nanoarrow::)
++
++    else()
++      install(DIRECTORY "${build_location}/" DESTINATION "${install_location}")
++      install(EXPORT nanoarrow-exports
++              DESTINATION "${install_location}"
++              FILE "nanoarrow-targets.cmake"
++              NAMESPACE nanoarrow::)
++    endif()
++  endforeach()
+ endif()
+
+ # Always build integration test if building tests
+@@ -215,34 +254,18 @@ if(NANOARROW_BUILD_TESTS)
+                  src/nanoarrow/integration/c_data_integration_test.cc)
+
+   if(NANOARROW_CODE_COVERAGE)
+-    target_compile_options(coverage_config INTERFACE -O0 -g --coverage)
+-    target_link_options(coverage_config INTERFACE --coverage)
+-    target_link_libraries(nanoarrow coverage_config)
++    target_compile_options(nanoarrow PUBLIC -O0 -g --coverage)
++    target_link_options(nanoarrow PUBLIC --coverage)
+   endif()
+
+-  target_link_libraries(utils_test
+-                        nanoarrow
+-                        gtest_main
+-                        ${NANOARROW_ARROW_TARGET}
+-                        coverage_config)
+-  target_link_libraries(buffer_test nanoarrow gtest_main coverage_config)
+-  target_link_libraries(array_test
+-                        nanoarrow
+-                        gtest_main
+-                        ${NANOARROW_ARROW_TARGET}
+-                        coverage_config)
+-  target_link_libraries(schema_test
+-                        nanoarrow
+-                        gtest_main
+-                        ${NANOARROW_ARROW_TARGET}
+-                        coverage_config)
+-  target_link_libraries(array_stream_test nanoarrow gtest_main coverage_config)
+-  target_link_libraries(nanoarrow_hpp_test nanoarrow gtest_main coverage_config)
+-  target_link_libraries(nanoarrow_testing_test
+-                        nanoarrow
+-                        gtest_main
+-                        nlohmann_json::nlohmann_json
+-                        coverage_config)
++  target_link_libraries(utils_test nanoarrow gtest_main ${NANOARROW_ARROW_TARGET})
++  target_link_libraries(buffer_test nanoarrow gtest_main)
++  target_link_libraries(array_test nanoarrow gtest_main ${NANOARROW_ARROW_TARGET})
++  target_link_libraries(schema_test nanoarrow gtest_main ${NANOARROW_ARROW_TARGET})
++  target_link_libraries(array_stream_test nanoarrow gtest_main)
++  target_link_libraries(nanoarrow_hpp_test nanoarrow gtest_main)
++  target_link_libraries(nanoarrow_testing_test nanoarrow gtest_main
++                        nlohmann_json::nlohmann_json)
+   target_link_libraries(c_data_integration_test nanoarrow nanoarrow_c_data_integration
+                         gtest_main)
+
+diff --git a/cmake/config.cmake.in b/cmake/config.cmake.in
+new file mode 100644
+index 0000000..021dc31
+--- /dev/null
++++ b/cmake/config.cmake.in
+@@ -0,0 +1,28 @@
++# Licensed to the Apache Software Foundation (ASF) under one
++# or more contributor license agreements.  See the NOTICE file
++# distributed with this work for additional information
++# regarding copyright ownership.  The ASF licenses this file
++# to you under the Apache License, Version 2.0 (the
++# "License"); you may not use this file except in compliance
++# with the License.  You may obtain a copy of the License at
++#
++# http://www.apache.org/licenses/LICENSE-2.0
++#
++# Unless required by applicable law or agreed to in writing,
++# software distributed under the License is distributed on an
++# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
++# KIND, either express or implied.  See the License for the
++# specific language governing permissions and limitations
++# under the License.
++
++
++@PACKAGE_INIT@
++
++cmake_minimum_required(VERSION @CMAKE_MINIMUM_REQUIRED_VERSION@)
++
++include("${CMAKE_CURRENT_LIST_DIR}/nanoarrow-targets.cmake" REQUIRED)
++include("${CMAKE_CURRENT_LIST_DIR}/nanoarrow-config-version.cmake" REQUIRED)
++
++set(${CMAKE_FIND_PACKAGE_NAME}_CONFIG "${CMAKE_CURRENT_LIST_FILE}")
++include(FindPackageHandleStandardArgs)
++find_package_handle_standard_args(${CMAKE_FIND_PACKAGE_NAME} CONFIG_MODE)
diff --git a/dependencies.yaml b/dependencies.yaml
index 85f5a86d938..5bb555df818 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -6,6 +6,7 @@ files:
       cuda: ["11.8", "12.2"]
       arch: [x86_64]
     includes:
+      - build_base
       - build_all
       - build_cpp
       - build_wheels
@@ -27,6 +28,10 @@ files:
       - test_python_cudf
       - test_python_dask_cudf
       - depends_on_cupy
+  test_static_build:
+    output: none
+    includes:
+      - build_base
   test_cpp:
     output: none
     includes:
@@ -45,6 +50,7 @@ files:
   test_java:
     output: none
     includes:
+      - build_base
       - build_all
       - cuda
       - cuda_version
@@ -75,6 +81,7 @@ files:
     extras:
       table: build-system
     includes:
+      - build_base
       - build_python_common
       - build_python_cudf
   py_run_cudf:
@@ -144,6 +151,7 @@ files:
     extras:
       table: build-system
     includes:
+      - build_base
       - build_python_common
   py_run_cudf_kafka:
     output: pyproject
@@ -191,12 +199,16 @@ channels:
   - conda-forge
   - nvidia
 dependencies:
-  build_all:
+  build_base:
     common:
-      - output_types: conda
+      - output_types: [conda, requirements, pyproject]
         packages:
           - &cmake_ver cmake>=3.26.4
           - &ninja ninja
+  build_all:
+    common:
+      - output_types: conda
+        packages:
           - c-compiler
           - cxx-compiler
           - dlpack>=0.8,<1.0
@@ -254,9 +266,7 @@ dependencies:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
-          - *cmake_ver
           - cython>=3.0.3
-          - *ninja
           # Hard pin the patch version used during the build. This must be kept
           # in sync with the version pinned in get_arrow.cmake.
           - pyarrow==14.0.2.*

From 082f6c91eb3906dbdf785348160ad5631ec91458 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Wed, 3 Apr 2024 11:27:47 -0400
Subject: [PATCH 005/842] Use offsetalator in cudf::strings::replace functions
 (#14824)

Adds offsetalator in place of hardcoded offset size_type arrays to the strings replace functions.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/14824
---
 cpp/src/strings/replace/multi.cu         | 236 +++----
 cpp/src/strings/replace/replace.cu       | 791 +++++++++--------------
 cpp/src/strings/replace/replace_nulls.cu |  12 +-
 cpp/src/strings/replace/replace_slice.cu |  25 +-
 4 files changed, 463 insertions(+), 601 deletions(-)

diff --git a/cpp/src/strings/replace/multi.cu b/cpp/src/strings/replace/multi.cu
index 8b5a4317b50..c93add01f69 100644
--- a/cpp/src/strings/replace/multi.cu
+++ b/cpp/src/strings/replace/multi.cu
@@ -14,13 +14,14 @@
  * limitations under the License.
  */
 
+#include "strings/split/split.cuh"
+
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
-#include <cudf/detail/get_value.cuh>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/detail/utilities/algorithm.cuh>
 #include <cudf/detail/utilities/cuda.cuh>
-#include <cudf/strings/detail/char_tables.hpp>
 #include <cudf/strings/detail/replace.hpp>
 #include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/detail/strings_column_factories.cuh>
@@ -42,6 +43,7 @@
 #include <thrust/execution_policy.h>
 #include <thrust/for_each.h>
 #include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
 #include <thrust/optional.h>
 #include <thrust/scan.h>
 #include <thrust/transform.h>
@@ -67,7 +69,7 @@ constexpr size_type AVG_CHAR_BYTES_THRESHOLD = 256;
  * @brief Type used for holding the target position (first) and the
  * target index (second).
  */
-using target_pair = thrust::pair<size_type, size_type>;
+using target_pair = thrust::tuple<int64_t, size_type>;
 
 /**
  * @brief Helper functions for performing character-parallel replace
@@ -75,12 +77,6 @@ using target_pair = thrust::pair<size_type, size_type>;
 struct replace_multi_parallel_fn {
   __device__ char const* get_base_ptr() const { return d_strings.head<char>(); }
 
-  __device__ size_type const* get_offsets_ptr() const
-  {
-    return d_strings.child(strings_column_view::offsets_column_index).data<size_type>() +
-           d_strings.offset();
-  }
-
   __device__ string_view const get_string(size_type idx) const
   {
     return d_strings.element<string_view>(idx);
@@ -100,11 +96,12 @@ struct replace_multi_parallel_fn {
    * @param idx Index of the byte position in the chars column
    * @param chars_bytes Number of bytes in the chars column
    */
-  __device__ thrust::optional<size_type> has_target(size_type idx, size_type chars_bytes) const
+  __device__ size_type target_index(int64_t idx, int64_t chars_bytes) const
   {
-    auto const d_offsets = get_offsets_ptr();
+    auto const d_offsets = d_strings_offsets;
     auto const d_chars   = get_base_ptr() + d_offsets[0] + idx;
     size_type str_idx    = -1;
+    string_view d_str{};
     for (std::size_t t = 0; t < d_targets.size(); ++t) {
       auto const d_tgt = d_targets[t];
       if (!d_tgt.empty() && (idx + d_tgt.size_bytes() <= chars_bytes) &&
@@ -113,12 +110,24 @@ struct replace_multi_parallel_fn {
           auto const idx_itr =
             thrust::upper_bound(thrust::seq, d_offsets, d_offsets + d_strings.size(), idx);
           str_idx = thrust::distance(d_offsets, idx_itr) - 1;
+          d_str   = get_string(str_idx - d_offsets[0]);
         }
-        auto const d_str = get_string(str_idx - d_offsets[0]);
         if ((d_chars + d_tgt.size_bytes()) <= (d_str.data() + d_str.size_bytes())) { return t; }
       }
     }
-    return thrust::nullopt;
+    return -1;
+  }
+
+  __device__ bool has_target(int64_t idx, int64_t chars_bytes) const
+  {
+    auto const d_chars = get_base_ptr() + d_strings_offsets[0] + idx;
+    for (auto& d_tgt : d_targets) {
+      if (!d_tgt.empty() && (idx + d_tgt.size_bytes() <= chars_bytes) &&
+          (d_tgt.compare(d_chars, d_tgt.size_bytes()) == 0)) {
+        return true;
+      }
+    }
+    return false;
   }
 
   /**
@@ -133,28 +142,32 @@ struct replace_multi_parallel_fn {
    * @return Number of substrings resulting from the replace operations on this row
    */
   __device__ size_type count_strings(size_type idx,
-                                     target_pair const* d_positions,
-                                     size_type const* d_targets_offsets) const
+                                     int64_t const* d_positions,
+                                     size_type const* d_indices,
+                                     cudf::detail::input_offsetalator d_targets_offsets) const
   {
     if (!is_valid(idx)) { return 0; }
 
-    auto const d_str             = get_string(idx);
-    auto const d_str_end         = d_str.data() + d_str.size_bytes();
-    auto const base_ptr          = get_base_ptr();
-    auto const targets_positions = cudf::device_span<target_pair const>(
-      d_positions + d_targets_offsets[idx], d_targets_offsets[idx + 1] - d_targets_offsets[idx]);
+    auto const d_str     = get_string(idx);
+    auto const d_str_end = d_str.data() + d_str.size_bytes();
+    auto const base_ptr  = get_base_ptr();
+
+    auto const target_offset = d_targets_offsets[idx];
+    auto const targets_size  = static_cast<size_type>(d_targets_offsets[idx + 1] - target_offset);
+    auto const positions     = d_positions + target_offset;
+    auto const indices       = d_indices + target_offset;
 
     size_type count = 1;  // always at least one string
     auto str_ptr    = d_str.data();
-    for (auto d_pair : targets_positions) {
-      auto const d_pos   = d_pair.first;
-      auto const d_tgt   = d_targets[d_pair.second];
-      auto const tgt_ptr = base_ptr + d_pos;
+    for (std::size_t i = 0; i < targets_size; ++i) {
+      auto const tgt_idx = indices[i];
+      auto const d_tgt   = d_targets[tgt_idx];
+      auto const tgt_ptr = base_ptr + positions[i];
       if (str_ptr <= tgt_ptr && tgt_ptr < d_str_end) {
         auto const keep_size = static_cast<size_type>(thrust::distance(str_ptr, tgt_ptr));
         if (keep_size > 0) { count++; }  // don't bother counting empty strings
 
-        auto const d_repl = get_replacement_string(d_pair.second);
+        auto const d_repl = get_replacement_string(tgt_idx);
         if (!d_repl.empty()) { count++; }
 
         str_ptr += keep_size + d_tgt.size_bytes();
@@ -182,9 +195,10 @@ struct replace_multi_parallel_fn {
    * @return The size in bytes of the output string for this row
    */
   __device__ size_type get_strings(size_type idx,
-                                   size_type const* d_offsets,
-                                   target_pair const* d_positions,
-                                   size_type const* d_targets_offsets,
+                                   cudf::detail::input_offsetalator const d_offsets,
+                                   int64_t const* d_positions,
+                                   size_type const* d_indices,
+                                   cudf::detail::input_offsetalator d_targets_offsets,
                                    string_index_pair* d_all_strings) const
   {
     if (!is_valid(idx)) { return 0; }
@@ -194,22 +208,24 @@ struct replace_multi_parallel_fn {
     auto const d_str_end = d_str.data() + d_str.size_bytes();
     auto const base_ptr  = get_base_ptr();
 
-    auto const targets_positions = cudf::device_span<target_pair const>(
-      d_positions + d_targets_offsets[idx], d_targets_offsets[idx + 1] - d_targets_offsets[idx]);
+    auto const target_offset = d_targets_offsets[idx];
+    auto const targets_size  = static_cast<size_type>(d_targets_offsets[idx + 1] - target_offset);
+    auto const positions     = d_positions + target_offset;
+    auto const indices       = d_indices + target_offset;
 
     size_type output_idx  = 0;
     size_type output_size = 0;
     auto str_ptr          = d_str.data();
-    for (auto d_pair : targets_positions) {
-      auto const d_pos   = d_pair.first;
-      auto const d_tgt   = d_targets[d_pair.second];
-      auto const tgt_ptr = base_ptr + d_pos;
+    for (std::size_t i = 0; i < targets_size; ++i) {
+      auto const tgt_idx = indices[i];
+      auto const d_tgt   = d_targets[tgt_idx];
+      auto const tgt_ptr = base_ptr + positions[i];
       if (str_ptr <= tgt_ptr && tgt_ptr < d_str_end) {
         auto const keep_size = static_cast<size_type>(thrust::distance(str_ptr, tgt_ptr));
         if (keep_size > 0) { d_output[output_idx++] = string_index_pair{str_ptr, keep_size}; }
         output_size += keep_size;
 
-        auto const d_repl = get_replacement_string(d_pair.second);
+        auto const d_repl = get_replacement_string(tgt_idx);
         if (!d_repl.empty()) {
           d_output[output_idx++] = string_index_pair{d_repl.data(), d_repl.size_bytes()};
         }
@@ -228,14 +244,19 @@ struct replace_multi_parallel_fn {
   }
 
   replace_multi_parallel_fn(column_device_view const& d_strings,
+                            cudf::detail::input_offsetalator d_strings_offsets,
                             device_span<string_view const> d_targets,
                             device_span<string_view const> d_replacements)
-    : d_strings(d_strings), d_targets{d_targets}, d_replacements{d_replacements}
+    : d_strings(d_strings),
+      d_strings_offsets(d_strings_offsets),
+      d_targets{d_targets},
+      d_replacements{d_replacements}
   {
   }
 
  protected:
   column_device_view d_strings;
+  cudf::detail::input_offsetalator d_strings_offsets;
   device_span<string_view const> d_targets;
   device_span<string_view const> d_replacements;
 };
@@ -247,17 +268,16 @@ struct replace_multi_parallel_fn {
  * (this happens sometimes when passing device lambdas to thrust algorithms)
  */
 struct pair_generator {
-  __device__ target_pair operator()(int idx) const
+  __device__ target_pair operator()(int64_t idx) const
   {
-    auto pos = fn.has_target(idx, chars_bytes);
-    return target_pair{idx, pos.value_or(-1)};
+    return thrust::make_tuple(idx, fn.target_index(idx, chars_bytes));
   }
   replace_multi_parallel_fn fn;
-  size_type chars_bytes;
+  int64_t chars_bytes;
 };
 
 struct copy_if_fn {
-  __device__ bool operator()(target_pair pos) { return pos.second >= 0; }
+  __device__ bool operator()(target_pair pos) { return thrust::get<1>(pos) >= 0; }
 };
 
 std::unique_ptr<column> replace_character_parallel(strings_column_view const& input,
@@ -270,105 +290,91 @@ std::unique_ptr<column> replace_character_parallel(strings_column_view const& in
 
   auto const strings_count = input.size();
   auto const chars_bytes =
-    cudf::detail::get_value<size_type>(input.offsets(), input.offset() + strings_count, stream) -
-    cudf::detail::get_value<size_type>(input.offsets(), input.offset(), stream);
+    get_offset_value(input.offsets(), input.offset() + strings_count, stream) -
+    get_offset_value(input.offsets(), input.offset(), stream);
 
   auto d_targets =
     create_string_vector_from_column(targets, stream, rmm::mr::get_current_device_resource());
   auto d_replacements =
     create_string_vector_from_column(repls, stream, rmm::mr::get_current_device_resource());
 
-  replace_multi_parallel_fn fn{*d_strings, d_targets, d_replacements};
+  replace_multi_parallel_fn fn{
+    *d_strings,
+    cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset()),
+    d_targets,
+    d_replacements,
+  };
+
+  // Count the number of targets in the entire column.
+  // Note this may over-count in the case where a target spans adjacent strings.
+  auto target_count = thrust::count_if(
+    rmm::exec_policy_nosync(stream),
+    thrust::make_counting_iterator<int64_t>(0),
+    thrust::make_counting_iterator<int64_t>(chars_bytes),
+    [fn, chars_bytes] __device__(int64_t idx) { return fn.has_target(idx, chars_bytes); });
 
-  // count the number of targets in the entire column
-  auto const target_count = thrust::count_if(rmm::exec_policy(stream),
-                                             thrust::make_counting_iterator<size_type>(0),
-                                             thrust::make_counting_iterator<size_type>(chars_bytes),
-                                             [fn, chars_bytes] __device__(size_type idx) {
-                                               return fn.has_target(idx, chars_bytes).has_value();
-                                             });
   // Create a vector of every target position in the chars column.
-  // These may include overlapping targets which will be resolved later.
-  auto targets_positions = rmm::device_uvector<target_pair>(target_count, stream);
+  // These may also include overlapping targets which will be resolved later.
+  auto targets_positions = rmm::device_uvector<int64_t>(target_count, stream);
+  auto targets_indices   = rmm::device_uvector<size_type>(target_count, stream);
+
+  // cudf::detail::make_counting_transform_iterator hardcodes size_type
+  auto const copy_itr = thrust::make_transform_iterator(thrust::counting_iterator<int64_t>(0),
+                                                        pair_generator{fn, chars_bytes});
+  auto const out_itr  = thrust::make_zip_iterator(
+    thrust::make_tuple(targets_positions.begin(), targets_indices.begin()));
+  auto const copy_end =
+    cudf::detail::copy_if_safe(copy_itr, copy_itr + chars_bytes, out_itr, copy_if_fn{}, stream);
+
+  // adjust target count since the copy-if may have eliminated some invalid targets
+  target_count = std::min(static_cast<int64_t>(std::distance(out_itr, copy_end)), target_count);
+  targets_positions.resize(target_count, stream);
+  targets_indices.resize(target_count, stream);
   auto d_positions       = targets_positions.data();
-
-  auto const copy_itr =
-    cudf::detail::make_counting_transform_iterator(0, pair_generator{fn, chars_bytes});
-  auto const copy_end = thrust::copy_if(
-    rmm::exec_policy(stream), copy_itr, copy_itr + chars_bytes, d_positions, copy_if_fn{});
+  auto d_targets_indices = targets_indices.data();
 
   // create a vector of offsets to each string's set of target positions
-  auto const targets_offsets = [&] {
-    auto string_indices = rmm::device_uvector<size_type>(target_count, stream);
-
-    auto const pos_itr = cudf::detail::make_counting_transform_iterator(
-      0, cuda::proclaim_return_type<int64_t>([d_positions] __device__(auto idx) -> int64_t {
-        return d_positions[idx].first;
-      }));
-    auto pos_count = std::distance(d_positions, copy_end);
-
-    auto begin =
-      cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset());
-    auto end = begin + input.offsets().size();
-    thrust::upper_bound(
-      rmm::exec_policy(stream), begin, end, pos_itr, pos_itr + pos_count, string_indices.begin());
-
-    // compute offsets per string
-    auto targets_offsets   = rmm::device_uvector<size_type>(strings_count + 1, stream);
-    auto d_targets_offsets = targets_offsets.data();
-
-    // memset to zero-out the target counts for any null-entries or strings with no targets
-    thrust::uninitialized_fill(
-      rmm::exec_policy(stream), targets_offsets.begin(), targets_offsets.end(), 0);
-
-    // next, count the number of targets per string
-    auto d_string_indices = string_indices.data();
-    thrust::for_each_n(rmm::exec_policy(stream),
-                       thrust::make_counting_iterator<size_type>(0),
-                       target_count,
-                       [d_string_indices, d_targets_offsets] __device__(size_type idx) {
-                         auto const str_idx = d_string_indices[idx] - 1;
-                         atomicAdd(d_targets_offsets + str_idx, 1);
-                       });
-    // finally, convert the counts into offsets
-    thrust::exclusive_scan(rmm::exec_policy(stream),
-                           targets_offsets.begin(),
-                           targets_offsets.end(),
-                           targets_offsets.begin());
-    return targets_offsets;
-  }();
-  auto const d_targets_offsets = targets_offsets.data();
+  auto const targets_offsets = create_offsets_from_positions(
+    input, targets_positions, stream, rmm::mr::get_current_device_resource());
+  auto const d_targets_offsets =
+    cudf::detail::offsetalator_factory::make_input_iterator(targets_offsets->view());
 
   // compute the number of string segments produced by replace in each string
   auto counts = rmm::device_uvector<size_type>(strings_count, stream);
-  thrust::transform(rmm::exec_policy(stream),
-                    thrust::make_counting_iterator<size_type>(0),
-                    thrust::make_counting_iterator<size_type>(strings_count),
+  thrust::transform(rmm::exec_policy_nosync(stream),
+                    thrust::counting_iterator<size_type>(0),
+                    thrust::counting_iterator<size_type>(strings_count),
                     counts.begin(),
                     cuda::proclaim_return_type<size_type>(
-                      [fn, d_positions, d_targets_offsets] __device__(size_type idx) -> size_type {
-                        return fn.count_strings(idx, d_positions, d_targets_offsets);
+                      [fn, d_positions, d_targets_indices, d_targets_offsets] __device__(
+                        size_type idx) -> size_type {
+                        return fn.count_strings(
+                          idx, d_positions, d_targets_indices, d_targets_offsets);
                       }));
 
   // create offsets from the counts
-  auto offsets =
-    std::get<0>(cudf::detail::make_offsets_child_column(counts.begin(), counts.end(), stream, mr));
-  auto const total_strings =
-    cudf::detail::get_value<size_type>(offsets->view(), strings_count, stream);
-  auto const d_strings_offsets = offsets->view().data<size_type>();
+  auto [offsets, total_strings] =
+    cudf::detail::make_offsets_child_column(counts.begin(), counts.end(), stream, mr);
+  auto const d_strings_offsets =
+    cudf::detail::offsetalator_factory::make_input_iterator(offsets->view());
 
   // build a vector of all the positions for all the strings
   auto indices   = rmm::device_uvector<string_index_pair>(total_strings, stream);
   auto d_indices = indices.data();
   auto d_sizes   = counts.data();  // reusing this vector to hold output sizes now
   thrust::for_each_n(
-    rmm::exec_policy(stream),
+    rmm::exec_policy_nosync(stream),
     thrust::make_counting_iterator<size_type>(0),
     strings_count,
-    [fn, d_strings_offsets, d_positions, d_targets_offsets, d_indices, d_sizes] __device__(
-      size_type idx) {
-      d_sizes[idx] =
-        fn.get_strings(idx, d_strings_offsets, d_positions, d_targets_offsets, d_indices);
+    [fn,
+     d_strings_offsets,
+     d_positions,
+     d_targets_indices,
+     d_targets_offsets,
+     d_indices,
+     d_sizes] __device__(size_type idx) {
+      d_sizes[idx] = fn.get_strings(
+        idx, d_strings_offsets, d_positions, d_targets_indices, d_targets_offsets, d_indices);
     });
 
   // use this utility to gather the string parts into a contiguous chars column
@@ -376,8 +382,8 @@ std::unique_ptr<column> replace_character_parallel(strings_column_view const& in
   auto chars_data = chars->release().data;
 
   // create offsets from the sizes
-  offsets =
-    std::get<0>(cudf::detail::make_offsets_child_column(counts.begin(), counts.end(), stream, mr));
+  offsets = std::get<0>(
+    cudf::strings::detail::make_offsets_child_column(counts.begin(), counts.end(), stream, mr));
 
   // build the strings columns from the chars and offsets
   return make_strings_column(strings_count,
diff --git a/cpp/src/strings/replace/replace.cu b/cpp/src/strings/replace/replace.cu
index 1f752f543d0..2c548f2f7cd 100644
--- a/cpp/src/strings/replace/replace.cu
+++ b/cpp/src/strings/replace/replace.cu
@@ -14,20 +14,21 @@
  * limitations under the License.
  */
 
+#include "strings/split/split.cuh"
+
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
-#include <cudf/detail/get_value.cuh>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
-#include <cudf/strings/detail/char_tables.hpp>
+#include <cudf/detail/utilities/algorithm.cuh>
 #include <cudf/strings/detail/replace.hpp>
 #include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_column_factories.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/replace.hpp>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
-#include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
@@ -39,11 +40,7 @@
 #include <thrust/distance.h>
 #include <thrust/execution_policy.h>
 #include <thrust/for_each.h>
-#include <thrust/iterator/constant_iterator.h>
 #include <thrust/iterator/counting_iterator.h>
-#include <thrust/iterator/transform_iterator.h>
-#include <thrust/remove.h>
-#include <thrust/scan.h>
 #include <thrust/transform.h>
 
 namespace cudf {
@@ -52,505 +49,375 @@ namespace detail {
 namespace {
 
 /**
- * @brief Average string byte-length threshold for deciding character-level vs row-level parallel
- * algorithm.
+ * @brief Threshold to decide on using string or character-parallel functions.
+ *
+ * If the average byte length of a string in a column exceeds this value then
+ * the character-parallel function is used.
+ * Otherwise, a regular string-parallel function is used.
  *
- * This value was determined by running the replace string scalar benchmark against different
- * power-of-2 string lengths and observing the point at which the performance only improved for
- * all trials.
+ * This value was found using the replace-multi benchmark results using an
+ * RTX A6000.
  */
-constexpr size_type BYTES_PER_VALID_ROW_THRESHOLD = 64;
+constexpr size_type AVG_CHAR_BYTES_THRESHOLD = 256;
 
 /**
- * @brief Function logic for the row-level parallelism replace API.
- *
- * This will perform a replace operation on each string.
+ * @brief Helper functions for performing character-parallel replace
  */
-struct replace_row_parallel_fn {
-  column_device_view const d_strings;
-  string_view const d_target;
-  string_view const d_repl;
-  int32_t const max_repl;
-  int32_t* d_offsets{};
-  char* d_chars{};
+struct replace_parallel_chars_fn {
+  __device__ inline char const* get_base_ptr() const { return d_strings.head<char>(); }
 
-  __device__ void operator()(size_type idx)
+  __device__ inline string_view const get_string(size_type idx) const
   {
-    if (d_strings.is_null(idx)) {
-      if (!d_chars) d_offsets[idx] = 0;
-      return;
-    }
-    auto const d_str   = d_strings.element<string_view>(idx);
-    char const* in_ptr = d_str.data();
-
-    char* out_ptr = d_chars ? d_chars + d_offsets[idx] : nullptr;
-    auto max_n    = (max_repl < 0) ? d_str.length() : max_repl;
-    auto bytes    = d_str.size_bytes();
-    auto position = d_str.find(d_target);
-
-    size_type last_pos = 0;
-    while ((position != string_view::npos) && (max_n > 0)) {
-      if (out_ptr) {
-        auto const curr_pos = d_str.byte_offset(position);
-        out_ptr = copy_and_increment(out_ptr, in_ptr + last_pos, curr_pos - last_pos);  // copy left
-        out_ptr = copy_string(out_ptr, d_repl);                                         // copy repl
-        last_pos = curr_pos + d_target.size_bytes();
-      } else {
-        bytes += d_repl.size_bytes() - d_target.size_bytes();
-      }
-      position = d_str.find(d_target, position + d_target.length());
-      --max_n;
-    }
-    if (out_ptr)  // copy whats left (or right depending on your point of view)
-      memcpy(out_ptr, in_ptr + last_pos, d_str.size_bytes() - last_pos);
-    else
-      d_offsets[idx] = bytes;
+    return d_strings.element<string_view>(idx);
   }
-};
 
-/**
- * @brief Functor for detecting falsely-overlapped target positions.
- *
- * This functor examines target positions that have been flagged as potentially overlapped by
- * a previous target position and identifies the overlaps that are false. A false overlap can occur
- * when a target position is overlapped by another target position that is itself overlapped.
- *
- * For example, a target string of "+++" and string to search of "++++++" will generate 4 potential
- * target positions at char offsets 0 through 3. The targets at offsets 1, 2, and 3 will be flagged
- * as potential overlaps since a prior target position is within range of the target string length.
- * The targets at offset 1 and 2 are true overlaps, since the footprint of the valid target at
- * offset 0 overlaps with them. The target at offset 3 is not truly overlapped because it is only
- * overlapped by invalid targets, targets that were themselves overlapped by a valid target.
- */
-struct target_false_overlap_filter_fn {
-  size_type const* const d_overlap_pos_indices{};
-  size_type const* const d_target_positions{};
-  size_type const target_size{};
+  __device__ inline bool is_valid(size_type idx) const { return d_strings.is_valid(idx); }
 
-  __device__ bool operator()(size_type overlap_idx) const
+  /**
+   * @brief Returns true if the target string is found at the given byte position
+   * in the input strings column and is legally within a string row
+   *
+   * @param idx Index of the byte position in the chars column
+   */
+  __device__ bool is_target_within_row(int64_t idx) const
   {
-    if (overlap_idx == 0) {
-      // The first overlap has no prior overlap to chain, so it should be kept as an overlap.
-      return false;
+    auto const d_offsets = d_strings_offsets;
+    auto const d_chars   = get_base_ptr() + idx;
+    auto const d_tgt     = d_target;
+    auto const chars_end = chars_bytes + d_offsets[0];
+    if (!d_tgt.empty() && (idx + d_tgt.size_bytes() <= chars_end) &&
+        (d_tgt.compare(d_chars, d_tgt.size_bytes()) == 0)) {
+      auto const idx_itr =
+        thrust::upper_bound(thrust::seq, d_offsets, d_offsets + d_strings.size(), idx);
+      auto str_idx = static_cast<size_type>(thrust::distance(d_offsets, idx_itr) - 1);
+      auto d_str   = get_string(str_idx);
+      if ((d_chars + d_tgt.size_bytes()) <= (d_str.data() + d_str.size_bytes())) { return true; }
     }
+    return false;
+  }
 
-    size_type const this_pos_idx = d_overlap_pos_indices[overlap_idx];
-
-    // Searching backwards for the first target position index of an overlap that is not adjacent
-    // to its overlap predecessor. The result will be the first overlap in this chain of overlaps.
-    size_type first_overlap_idx = overlap_idx;
-    size_type first_pos_idx     = this_pos_idx;
-    while (first_overlap_idx > 0) {
-      size_type prev_pos_idx = d_overlap_pos_indices[--first_overlap_idx];
-      if (prev_pos_idx + 1 != first_pos_idx) { break; }
-      first_pos_idx = prev_pos_idx;
-    }
+  /**
+   * @brief Returns true if the target string found at the given byte position
+   *
+   * @param idx Index of the byte position in the chars column
+   */
+  __device__ bool has_target(int64_t idx) const
+  {
+    auto const d_chars = get_base_ptr() + d_strings_offsets[0] + idx;
+    return (!d_target.empty() && (idx + d_target.size_bytes() <= chars_bytes) &&
+            (d_target.compare(d_chars, d_target.size_bytes()) == 0));
+  }
 
-    // The prior target position to the first overlapped position in the chain is a valid target.
-    size_type valid_pos_idx = first_pos_idx - 1;
-    size_type valid_pos     = d_target_positions[valid_pos_idx];
-
-    // Walk forward from this valid target. Any targets within the range of this valid one are true
-    // overlaps. The first overlap beyond the range of this valid target is another valid target,
-    // as it was falsely overlapped by a target that was itself overlapped. Repeat until we get to
-    // the overlapped position being queried by this call.
-    while (valid_pos_idx < this_pos_idx) {
-      size_type next_pos_idx = valid_pos_idx + 1;
-      size_type next_pos     = d_target_positions[next_pos_idx];
-      // Every target position within the range of a valid target position is a true overlap.
-      while (next_pos < valid_pos + target_size) {
-        if (next_pos_idx == this_pos_idx) { return false; }
-        next_pos = d_target_positions[++next_pos_idx];
+  /**
+   * @brief Count the number of strings that will be produced by the replace
+   *
+   * This includes segments of the string that are not replaced as well as those
+   * that are replaced.
+   *
+   * @param idx Index of the row in d_strings to be processed
+   * @param d_positions Positions of the targets found in the chars column
+   * @param d_targets_offsets Offsets identify which target positions go with the current string
+   * @return Number of substrings resulting from the replace operations on this row
+   */
+  __device__ size_type count_strings(size_type idx,
+                                     int64_t const* d_positions,
+                                     cudf::detail::input_offsetalator d_targets_offsets) const
+  {
+    if (!is_valid(idx)) { return 0; }
+
+    auto const d_str     = get_string(idx);
+    auto const d_str_end = d_str.data() + d_str.size_bytes();
+    auto const base_ptr  = get_base_ptr();
+    auto max_n           = (maxrepl < 0) ? d_str.length() : maxrepl;
+
+    auto const target_offset = d_targets_offsets[idx];
+    auto const targets_size  = static_cast<size_type>(d_targets_offsets[idx + 1] - target_offset);
+    auto const positions     = d_positions + target_offset;
+
+    size_type count = 1;  // always at least one string
+    auto str_ptr    = d_str.data();
+    for (std::size_t i = 0; (i < targets_size) && (max_n > 0); ++i) {
+      auto const tgt_ptr = base_ptr + positions[i];
+      if (str_ptr <= tgt_ptr && tgt_ptr < d_str_end) {
+        auto const keep_size = static_cast<size_type>(thrust::distance(str_ptr, tgt_ptr));
+        if (keep_size > 0) { count++; }  // don't bother counting empty strings
+        if (!d_replacement.empty()) { count++; }
+        str_ptr += keep_size + d_target.size_bytes();
+        --max_n;
       }
-      valid_pos_idx = next_pos_idx;
-      valid_pos     = next_pos;
     }
-
-    // This was overlapped only by false overlaps and therefore is a valid target.
-    return true;
+    return count;
   }
-};
 
-/**
- * @brief Functor for replacing each target string with the replacement string.
- *
- * This will perform a replace operation at each target position.
- */
-struct target_replacer_fn {
-  device_span<size_type const> const d_target_positions;
-  char const* const d_in_chars{};
-  char* const d_out_chars{};
-  size_type const target_size{};
-  string_view const d_repl;
-  int32_t const in_char_offset = 0;
-
-  __device__ void operator()(size_type input_idx) const
+  /**
+   * @brief Retrieve the strings for each row
+   *
+   * This will return string segments as string_index_pair objects for
+   * parts of the string that are not replaced interlaced with the
+   * appropriate replacement string where replacement targets are found.
+   *
+   * This function is called only once to produce both the string_index_pair objects
+   * and the output row size in bytes.
+   *
+   * @param idx Index of the row in d_strings
+   * @param d_offsets Offsets to identify where to store the results of the replace for this string
+   * @param d_positions The target positions found in the chars column
+   * @param d_targets_offsets The offsets to identify which target positions go with this string
+   * @param d_all_strings The output of all the produced string segments
+   * @return The size in bytes of the output string for this row
+   */
+  __device__ size_type get_strings(size_type idx,
+                                   cudf::detail::input_offsetalator const d_offsets,
+                                   int64_t const* d_positions,
+                                   cudf::detail::input_offsetalator d_targets_offsets,
+                                   string_index_pair* d_all_strings) const
   {
-    // Calculate the adjustment from input index to output index for each prior target position.
-    auto const repl_size         = d_repl.size_bytes();
-    auto const idx_delta_per_pos = repl_size - target_size;
-
-    // determine the number of target positions at or before this character position
-    size_type const* next_target_pos_ptr = thrust::upper_bound(
-      thrust::seq, d_target_positions.begin(), d_target_positions.end(), input_idx);
-    size_type const num_prev_targets = next_target_pos_ptr - d_target_positions.data();
-    size_type output_idx = input_idx - in_char_offset + idx_delta_per_pos * num_prev_targets;
-
-    if (num_prev_targets == 0) {
-      // not within a target string
-      d_out_chars[output_idx] = d_in_chars[input_idx];
-    } else {
-      // check if this input position is within a target string
-      size_type const prev_target_pos = *(next_target_pos_ptr - 1);
-      size_type target_idx            = input_idx - prev_target_pos;
-      if (target_idx < target_size) {
-        // within the target string, so the original calculation was off by one target string
-        output_idx -= idx_delta_per_pos;
-
-        // Copy the corresponding byte from the replacement string. If the replacement string is
-        // larger than the target string then the thread reading the last target byte is
-        // responsible for copying the remainder of the replacement string.
-        if (target_idx < repl_size) {
-          d_out_chars[output_idx++] = d_repl.data()[target_idx++];
-          if (target_idx == target_size) {
-            memcpy(d_out_chars + output_idx, d_repl.data() + target_idx, repl_size - target_idx);
-          }
+    if (!is_valid(idx)) { return 0; }
+
+    auto const d_output  = d_all_strings + d_offsets[idx];
+    auto const d_str     = get_string(idx);
+    auto const d_str_end = d_str.data() + d_str.size_bytes();
+    auto const base_ptr  = get_base_ptr();
+    auto max_n           = (maxrepl < 0) ? d_str.length() : maxrepl;
+
+    auto const target_offset = d_targets_offsets[idx];
+    auto const targets_size  = static_cast<size_type>(d_targets_offsets[idx + 1] - target_offset);
+    auto const positions     = d_positions + target_offset;
+
+    size_type output_idx  = 0;
+    size_type output_size = 0;
+    auto str_ptr          = d_str.data();
+    for (std::size_t i = 0; (i < targets_size) && (max_n > 0); ++i) {
+      auto const tgt_ptr = base_ptr + positions[i];
+      if (str_ptr <= tgt_ptr && tgt_ptr < d_str_end) {
+        auto const keep_size = static_cast<size_type>(thrust::distance(str_ptr, tgt_ptr));
+        if (keep_size > 0) { d_output[output_idx++] = string_index_pair{str_ptr, keep_size}; }
+        output_size += keep_size;
+
+        if (!d_replacement.empty()) {
+          d_output[output_idx++] =
+            string_index_pair{d_replacement.data(), d_replacement.size_bytes()};
         }
-      } else {
-        // not within a target string
-        d_out_chars[output_idx] = d_in_chars[input_idx];
+        output_size += d_replacement.size_bytes();
+
+        str_ptr += keep_size + d_target.size_bytes();
+        --max_n;
       }
     }
+    // include any leftover parts of the string
+    if (str_ptr <= d_str_end) {
+      auto const left_size = static_cast<size_type>(thrust::distance(str_ptr, d_str_end));
+      d_output[output_idx] = string_index_pair{str_ptr, left_size};
+      output_size += left_size;
+    }
+    return output_size;
   }
+
+  replace_parallel_chars_fn(column_device_view const& d_strings,
+                            cudf::detail::input_offsetalator d_strings_offsets,
+                            int64_t chars_bytes,
+                            string_view d_target,
+                            string_view d_replacement,
+                            cudf::size_type maxrepl)
+    : d_strings(d_strings),
+      d_strings_offsets(d_strings_offsets),
+      chars_bytes(chars_bytes),
+      d_target{d_target},
+      d_replacement{d_replacement},
+      maxrepl(maxrepl)
+  {
+  }
+
+ protected:
+  column_device_view d_strings;
+  cudf::detail::input_offsetalator d_strings_offsets;
+  int64_t chars_bytes;
+  string_view d_target;
+  string_view d_replacement;
+  cudf::size_type maxrepl;
 };
 
-/**
- * @brief Filter target positions that are overlapped by other, valid target positions.
- *
- * This performs an in-place modification of the target positions to remove any target positions
- * that are overlapped by other, valid target positions. For example, if the target string is "++"
- * and the string to search is "+++" then there will be two potential targets at character offsets
- * 0 and 1. The target at offset 0 is valid and overlaps the target at offset 1, invalidating the
- * target at offset 1.
- *
- * @param[in,out] d_target_positions Potential target positions to filter in-place.
- * @param[in]     target_count       Number of potential target positions.
- * @param[in]     target_size        Size of the target string in bytes.
- * @param[in]     stream             CUDA stream to use for device operations.
- * @return Number of target positions after filtering.
- */
-size_type filter_overlap_target_positions(size_type* d_target_positions,
-                                          size_type target_count,
-                                          size_type target_size,
-                                          rmm::cuda_stream_view stream)
+std::unique_ptr<column> replace_character_parallel(strings_column_view const& input,
+                                                   string_view const& d_target,
+                                                   string_view const& d_replacement,
+                                                   cudf::size_type maxrepl,
+                                                   rmm::cuda_stream_view stream,
+                                                   rmm::mr::device_memory_resource* mr)
 {
-  auto overlap_detector = [d_target_positions, target_size] __device__(size_type pos_idx) -> bool {
-    return (pos_idx > 0)
-             ? d_target_positions[pos_idx] - d_target_positions[pos_idx - 1] < target_size
-             : false;
-  };
-
-  // count the potential number of overlapped target positions
-  size_type overlap_count =
-    thrust::count_if(rmm::exec_policy(stream),
-                     thrust::make_counting_iterator<size_type>(0),
-                     thrust::make_counting_iterator<size_type>(target_count),
-                     overlap_detector);
-  if (overlap_count == 0) { return target_count; }
-
-  // create a vector indexing the potential overlapped target positions
-  rmm::device_uvector<size_type> potential_overlapped_pos_indices(overlap_count, stream);
-  auto d_potential_overlapped_pos_indices = potential_overlapped_pos_indices.data();
-  thrust::copy_if(rmm::exec_policy(stream),
-                  thrust::make_counting_iterator<size_type>(0),
-                  thrust::make_counting_iterator<size_type>(target_count),
-                  d_potential_overlapped_pos_indices,
-                  overlap_detector);
-
-  // filter out the false overlaps that are actually valid
-  rmm::device_uvector<size_type> overlapped_pos_indices(overlap_count, stream);
-  auto d_overlapped_pos_indices = overlapped_pos_indices.data();
-  auto overlap_end =
-    thrust::remove_copy_if(rmm::exec_policy(stream),
-                           d_potential_overlapped_pos_indices,
-                           d_potential_overlapped_pos_indices + overlap_count,
-                           thrust::make_counting_iterator<size_type>(0),
-                           d_overlapped_pos_indices,
-                           target_false_overlap_filter_fn{
-                             d_potential_overlapped_pos_indices, d_target_positions, target_size});
-  overlap_count = cudf::distance(d_overlapped_pos_indices, overlap_end);
-
-  // In-place remove any target positions that are overlapped by valid target positions
-  auto target_pos_end = thrust::remove_if(
-    rmm::exec_policy(stream),
-    d_target_positions,
-    d_target_positions + target_count,
+  auto d_strings = column_device_view::create(input.parent(), stream);
+
+  auto const strings_count = input.size();
+  auto const chars_offset  = get_offset_value(input.offsets(), input.offset(), stream);
+  auto const chars_bytes =
+    get_offset_value(input.offsets(), input.offset() + strings_count, stream) - chars_offset;
+
+  auto const offsets_begin =
+    cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset());
+
+  replace_parallel_chars_fn fn{
+    *d_strings, offsets_begin, chars_bytes, d_target, d_replacement, maxrepl};
+
+  // Count the number of targets in the entire column.
+  // Note this may over-count in the case where a target spans adjacent strings.
+  auto target_count = thrust::count_if(rmm::exec_policy_nosync(stream),
+                                       thrust::make_counting_iterator<int64_t>(0),
+                                       thrust::make_counting_iterator<int64_t>(chars_bytes),
+                                       [fn] __device__(int64_t idx) { return fn.has_target(idx); });
+
+  // Create a vector of every target position in the chars column.
+  // These may also include overlapping targets which will be resolved later.
+  auto targets_positions = rmm::device_uvector<int64_t>(target_count, stream);
+  auto const copy_itr    = thrust::counting_iterator<int64_t>(chars_offset);
+  auto const copy_end    = cudf::detail::copy_if_safe(
+    copy_itr,
+    copy_itr + chars_bytes + chars_offset,
+    targets_positions.begin(),
+    [fn] __device__(int64_t idx) { return fn.is_target_within_row(idx); },
+    stream);
+
+  // adjust target count since the copy-if may have eliminated some invalid targets
+  target_count = std::min(std::distance(targets_positions.begin(), copy_end), target_count);
+  targets_positions.resize(target_count, stream);
+  auto d_positions = targets_positions.data();
+
+  // create a vector of offsets to each string's set of target positions
+  auto const targets_offsets = create_offsets_from_positions(
+    input, targets_positions, stream, rmm::mr::get_current_device_resource());
+  auto const d_targets_offsets =
+    cudf::detail::offsetalator_factory::make_input_iterator(targets_offsets->view());
+
+  // compute the number of string segments produced by replace in each string
+  auto counts = rmm::device_uvector<size_type>(strings_count, stream);
+  thrust::transform(rmm::exec_policy_nosync(stream),
+                    thrust::counting_iterator<size_type>(0),
+                    thrust::counting_iterator<size_type>(strings_count),
+                    counts.begin(),
+                    cuda::proclaim_return_type<size_type>(
+                      [fn, d_positions, d_targets_offsets] __device__(size_type idx) -> size_type {
+                        return fn.count_strings(idx, d_positions, d_targets_offsets);
+                      }));
+
+  // create offsets from the counts
+  auto [offsets, total_strings] =
+    cudf::detail::make_offsets_child_column(counts.begin(), counts.end(), stream, mr);
+  auto const d_strings_offsets =
+    cudf::detail::offsetalator_factory::make_input_iterator(offsets->view());
+
+  // build a vector of all the positions for all the strings
+  auto indices   = rmm::device_uvector<string_index_pair>(total_strings, stream);
+  auto d_indices = indices.data();
+  auto d_sizes   = counts.data();  // reusing this vector to hold output sizes now
+  thrust::for_each_n(
+    rmm::exec_policy_nosync(stream),
     thrust::make_counting_iterator<size_type>(0),
-    [d_overlapped_pos_indices, overlap_count] __device__(size_type target_position_idx) -> bool {
-      return thrust::binary_search(thrust::seq,
-                                   d_overlapped_pos_indices,
-                                   d_overlapped_pos_indices + overlap_count,
-                                   target_position_idx);
+    strings_count,
+    [fn, d_strings_offsets, d_positions, d_targets_offsets, d_indices, d_sizes] __device__(
+      size_type idx) {
+      d_sizes[idx] =
+        fn.get_strings(idx, d_strings_offsets, d_positions, d_targets_offsets, d_indices);
     });
-  return cudf::distance(d_target_positions, target_pos_end);
-}
 
-/**
- * @brief Filter target positions to remove any invalid target positions.
- *
- * This performs an in-place modification of the target positions to remove any target positions
- * that are invalid, either by the target string overlapping a row boundary or being overlapped by
- * another valid target string.
- *
- * @param[in,out] target_positions Potential target positions to filter in-place.
- * @param[in]     d_offsets_span   Memory range encompassing the string column offsets.
- * @param[in]     target_size      Size of the target string in bytes.
- * @param[in]     stream           CUDA stream to use for device operations.
- * @return Number of target positions after filtering.
- */
-size_type filter_false_target_positions(rmm::device_uvector<size_type>& target_positions,
-                                        device_span<int32_t const> d_offsets_span,
-                                        size_type target_size,
-                                        rmm::cuda_stream_view stream)
-{
-  // In-place remove any positions for target strings that crossed string boundaries.
-  auto d_target_positions = target_positions.data();
-  auto target_pos_end =
-    thrust::remove_if(rmm::exec_policy(stream),
-                      d_target_positions,
-                      d_target_positions + target_positions.size(),
-                      [d_offsets_span, target_size] __device__(size_type target_pos) -> bool {
-                        // find the end of the string containing the start of this target
-                        size_type const* offset_ptr = thrust::upper_bound(
-                          thrust::seq, d_offsets_span.begin(), d_offsets_span.end(), target_pos);
-                        return target_pos + target_size > *offset_ptr;
-                      });
-  auto const target_count = cudf::distance(d_target_positions, target_pos_end);
-  if (target_count == 0) { return 0; }
-
-  // Filter out target positions that are the result of overlapping target matches.
-  return (target_count > 1)
-           ? filter_overlap_target_positions(d_target_positions, target_count, target_size, stream)
-           : target_count;
-}
+  // use this utility to gather the string parts into a contiguous chars column
+  auto chars      = make_strings_column(indices.begin(), indices.end(), stream, mr);
+  auto chars_data = chars->release().data;
 
-/**
- * @brief Filter target positions beyond the maximum target replacements per row limit.
- *
- * This performs an in-place modification of the target positions to remove any target positions
- * corresponding to targets that should not be replaced due to the maximum target replacement per
- * row limit.
- *
- * @param[in,out] target_positions Target positions to filter in-place.
- * @param[in]     target_count     Number of target positions.
- * @param[in]     d_offsets_span   Memory range encompassing the string column offsets.
- * @param[in]     max_repl_per_row Maximum target replacements per row limit.
- * @param[in]     stream           CUDA stream to use for device operations.
- * @return Number of target positions after filtering.
- */
-size_type filter_maxrepl_target_positions(size_type* d_target_positions,
-                                          size_type target_count,
-                                          device_span<int32_t const> d_offsets_span,
-                                          size_type max_repl_per_row,
-                                          rmm::cuda_stream_view stream)
-{
-  auto pos_to_row_fn = cuda::proclaim_return_type<size_type>(
-    [d_offsets_span] __device__(size_type target_pos) -> size_type {
-      auto upper_bound =
-        thrust::upper_bound(thrust::seq, d_offsets_span.begin(), d_offsets_span.end(), target_pos);
-      return thrust::distance(d_offsets_span.begin(), upper_bound);
-    });
+  // create offsets from the sizes
+  offsets = std::get<0>(
+    cudf::strings::detail::make_offsets_child_column(counts.begin(), counts.end(), stream, mr));
 
-  // compute the match count per row for each target position
-  rmm::device_uvector<size_type> match_counts(target_count, stream);
-  auto d_match_counts = match_counts.data();
-  thrust::inclusive_scan_by_key(
-    rmm::exec_policy(stream),
-    thrust::make_transform_iterator(d_target_positions, pos_to_row_fn),
-    thrust::make_transform_iterator(d_target_positions + target_count, pos_to_row_fn),
-    thrust::make_constant_iterator<size_type>(1),
-    d_match_counts);
-
-  // In-place remove any positions that exceed the per-row match limit
-  auto target_pos_end =
-    thrust::remove_if(rmm::exec_policy(stream),
-                      d_target_positions,
-                      d_target_positions + target_count,
-                      d_match_counts,
-                      [max_repl_per_row] __device__(size_type match_count) -> bool {
-                        return match_count > max_repl_per_row;
-                      });
-
-  return cudf::distance(d_target_positions, target_pos_end);
+  // build the strings columns from the chars and offsets
+  return make_strings_column(strings_count,
+                             std::move(offsets),
+                             std::move(chars_data.release()[0]),
+                             input.null_count(),
+                             cudf::detail::copy_bitmask(input.parent(), stream, mr));
 }
 
 /**
- * @brief Scalar string replacement using a character-level parallel algorithm.
- *
- * Replaces occurrences of the target string with the replacement string using an algorithm with
- * character-level parallelism. This algorithm will perform well when the strings in the string
- * column are relatively long.
- * @see BYTES_PER_VALID_ROW_THRESHOLD
+ * @brief Function logic for the replace_string_parallel
  *
- * @param strings     String column to search for target strings.
- * @param chars_start Offset of the first character in the string column.
- * @param chars_end   Offset beyond the last character in the string column to search.
- * @param d_target    String to search for within the string column.
- * @param d_repl      Replacement string if target string is found.
- * @param maxrepl     Maximum times to replace if target appears multiple times in a string.
- * @param stream      CUDA stream to use for device operations
- * @param mr          Device memory resource used to allocate the returned column's device memory
- * @return New strings column.
+ * Performs the multi-replace operation with a thread per string.
+ * This performs best on smaller strings. @see AVG_CHAR_BYTES_THRESHOLD
  */
-std::unique_ptr<column> replace_char_parallel(strings_column_view const& strings,
-                                              size_type chars_start,
-                                              size_type chars_end,
-                                              string_view const& d_target,
-                                              string_view const& d_repl,
-                                              int32_t maxrepl,
-                                              rmm::cuda_stream_view stream,
-                                              rmm::mr::device_memory_resource* mr)
-{
-  auto const strings_count = strings.size();
-  auto const offset_count  = strings_count + 1;
-  auto const d_offsets   = strings.offsets().begin<int32_t>() + strings.offset();  // TODO: PR 14824
-  auto const d_in_chars  = strings.chars_begin(stream);
-  auto const chars_bytes = chars_end - chars_start;
-  auto const target_size = d_target.size_bytes();
-
-  // detect a target match at the specified byte position
-  device_span<char const> const d_chars_span(d_in_chars, chars_end);
-  auto target_detector = [d_chars_span, d_target] __device__(size_type char_idx) {
-    auto target_size = d_target.size_bytes();
-    auto target_ptr  = d_chars_span.begin() + char_idx;
-    return target_ptr + target_size <= d_chars_span.end() &&
-           d_target.compare(target_ptr, target_size) == 0;
-  };
-
-  // Count target string matches across all character positions, ignoring string boundaries and
-  // overlapping target strings. This may produce false-positives.
-  size_type target_count = thrust::count_if(rmm::exec_policy(stream),
-                                            thrust::make_counting_iterator<size_type>(chars_start),
-                                            thrust::make_counting_iterator<size_type>(chars_end),
-                                            target_detector);
-  if (target_count == 0) {
-    // nothing to replace, copy the input column
-    return std::make_unique<cudf::column>(strings.parent(), stream, mr);
-  }
+struct replace_fn {
+  column_device_view const d_strings;
+  string_view d_target;
+  string_view d_replacement;
+  cudf::size_type maxrepl;
+  cudf::size_type* d_offsets{};
+  char* d_chars{};
 
-  // create a vector of the potential target match positions
-  rmm::device_uvector<size_type> target_positions(target_count, stream);
-  auto d_target_positions = target_positions.data();
-  thrust::copy_if(rmm::exec_policy(stream),
-                  thrust::make_counting_iterator<size_type>(chars_start),
-                  thrust::make_counting_iterator<size_type>(chars_end),
-                  d_target_positions,
-                  target_detector);
-
-  device_span<int32_t const> d_offsets_span(d_offsets, offset_count);
-  if (target_size > 1) {
-    target_count =
-      filter_false_target_positions(target_positions, d_offsets_span, target_size, stream);
-    if (target_count == 0) {
-      // nothing to replace, copy the input column
-      return std::make_unique<cudf::column>(strings.parent(), stream, mr);
+  __device__ void operator()(size_type idx)
+  {
+    if (d_strings.is_null(idx)) {
+      if (!d_chars) { d_offsets[idx] = 0; }
+      return;
     }
-  }
+    auto const d_str   = d_strings.element<string_view>(idx);
+    char const* in_ptr = d_str.data();
 
-  // filter out any target positions that exceed the per-row match limit
-  if (maxrepl > 0 && target_count > maxrepl) {
-    target_count = filter_maxrepl_target_positions(
-      d_target_positions, target_count, d_offsets_span, maxrepl, stream);
+    size_type bytes = d_str.size_bytes();
+    size_type spos  = 0;
+    size_type lpos  = 0;
+    char* out_ptr   = d_chars ? d_chars + d_offsets[idx] : nullptr;
+    auto max_n      = (maxrepl < 0) ? d_str.length() : maxrepl;
+
+    // check each character against each target
+    while (spos < d_str.size_bytes() && (max_n > 0)) {
+      auto const d_tgt = d_target;
+      if ((d_tgt.size_bytes() <= (d_str.size_bytes() - spos)) &&    // check fit
+          (d_tgt.compare(in_ptr + spos, d_tgt.size_bytes()) == 0))  // and match
+      {
+        auto const d_repl = d_replacement;
+        bytes += d_repl.size_bytes() - d_tgt.size_bytes();
+        if (out_ptr) {
+          out_ptr = copy_and_increment(out_ptr, in_ptr + lpos, spos - lpos);
+          out_ptr = copy_string(out_ptr, d_repl);
+          lpos    = spos + d_tgt.size_bytes();
+        }
+        spos += d_tgt.size_bytes() - 1;
+        --max_n;
+      }
+      ++spos;
+    }
+    if (out_ptr) {  // copy remainder
+      memcpy(out_ptr, in_ptr + lpos, d_str.size_bytes() - lpos);
+    } else {
+      d_offsets[idx] = bytes;
+    }
   }
+};
 
-  // build the offsets column
-  auto offsets_column = make_numeric_column(
-    data_type{type_id::INT32}, offset_count, mask_state::UNALLOCATED, stream, mr);
-  auto offsets_view     = offsets_column->mutable_view();
-  auto delta_per_target = d_repl.size_bytes() - target_size;
-  device_span<size_type const> d_target_positions_span(d_target_positions, target_count);
-  auto offsets_update_fn = cuda::proclaim_return_type<int32_t>(
-    [d_target_positions_span, delta_per_target, chars_start] __device__(int32_t offset) -> int32_t {
-      // determine the number of target positions occurring before this offset
-      size_type const* next_target_pos_ptr = thrust::lower_bound(
-        thrust::seq, d_target_positions_span.begin(), d_target_positions_span.end(), offset);
-      size_type num_prev_targets =
-        thrust::distance(d_target_positions_span.data(), next_target_pos_ptr);
-      return offset - chars_start + delta_per_target * num_prev_targets;
-    });
-  thrust::transform(rmm::exec_policy(stream),
-                    d_offsets_span.begin(),
-                    d_offsets_span.end(),
-                    offsets_view.begin<int32_t>(),
-                    offsets_update_fn);
-
-  // build the characters column
-  rmm::device_uvector<char> chars(chars_bytes + (delta_per_target * target_count), stream, mr);
-  auto d_out_chars = chars.data();
-  thrust::for_each_n(
-    rmm::exec_policy(stream),
-    thrust::make_counting_iterator<size_type>(chars_start),
-    chars_bytes,
-    target_replacer_fn{
-      d_target_positions_span, d_in_chars, d_out_chars, target_size, d_repl, chars_start});
-
-  // free the target positions buffer as it is no longer needed
-  (void)target_positions.release();
-
-  return make_strings_column(strings_count,
-                             std::move(offsets_column),
-                             chars.release(),
-                             strings.null_count(),
-                             cudf::detail::copy_bitmask(strings.parent(), stream, mr));
-}
-
-/**
- * @brief Scalar string replacement using a row-level parallel algorithm.
- *
- * Replaces occurrences of the target string with the replacement string using an algorithm with
- * row-level parallelism. This algorithm will perform well when the strings in the string
- * column are relatively short.
- * @see BYTES_PER_VALID_ROW_THRESHOLD
- *
- * @param strings     String column to search for target strings.
- * @param d_target    String to search for within the string column.
- * @param d_repl      Replacement string if target string is found.
- * @param maxrepl     Maximum times to replace if target appears multiple times in a string.
- * @param stream      CUDA stream to use for device operations
- * @param mr          Device memory resource used to allocate the returned column's device memory
- * @return New strings column.
- */
-std::unique_ptr<column> replace_row_parallel(strings_column_view const& strings,
-                                             string_view const& d_target,
-                                             string_view const& d_repl,
-                                             int32_t maxrepl,
-                                             rmm::cuda_stream_view stream,
-                                             rmm::mr::device_memory_resource* mr)
+std::unique_ptr<column> replace_string_parallel(strings_column_view const& input,
+                                                string_view const& d_target,
+                                                string_view const& d_replacement,
+                                                cudf::size_type maxrepl,
+                                                rmm::cuda_stream_view stream,
+                                                rmm::mr::device_memory_resource* mr)
 {
-  auto d_strings = column_device_view::create(strings.parent(), stream);
+  auto d_strings = column_device_view::create(input.parent(), stream);
 
-  // this utility calls the given functor to build the offsets and chars columns
   auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(
-    replace_row_parallel_fn{*d_strings, d_target, d_repl, maxrepl}, strings.size(), stream, mr);
+    replace_fn{*d_strings, d_target, d_replacement, maxrepl}, input.size(), stream, mr);
 
-  return make_strings_column(strings.size(),
+  return make_strings_column(input.size(),
                              std::move(offsets_column),
                              chars.release(),
-                             strings.null_count(),
-                             cudf::detail::copy_bitmask(strings.parent(), stream, mr));
+                             input.null_count(),
+                             cudf::detail::copy_bitmask(input.parent(), stream, mr));
 }
 
 }  // namespace
 
-std::unique_ptr<column> replace(strings_column_view const& strings,
+std::unique_ptr<column> replace(strings_column_view const& input,
                                 string_scalar const& target,
                                 string_scalar const& repl,
-                                int32_t maxrepl,
+                                cudf::size_type maxrepl,
                                 rmm::cuda_stream_view stream,
                                 rmm::mr::device_memory_resource* mr)
 {
-  if (strings.is_empty()) return make_empty_column(type_id::STRING);
-  if (maxrepl == 0) return std::make_unique<cudf::column>(strings.parent(), stream, mr);
+  if (input.is_empty()) { return make_empty_column(type_id::STRING); }
+  if (maxrepl == 0) { return std::make_unique<cudf::column>(input.parent(), stream, mr); }
   CUDF_EXPECTS(repl.is_valid(stream), "Parameter repl must be valid.");
   CUDF_EXPECTS(target.is_valid(stream), "Parameter target must be valid.");
   CUDF_EXPECTS(target.size() > 0, "Parameter target must not be empty string.");
@@ -558,25 +425,11 @@ std::unique_ptr<column> replace(strings_column_view const& strings,
   string_view d_target(target.data(), target.size());
   string_view d_repl(repl.data(), repl.size());
 
-  // determine range of characters in the base column
-  auto const strings_count = strings.size();
-  auto const offset_count  = strings_count + 1;
-  auto const d_offsets     = strings.offsets().data<int32_t>() + strings.offset();
-  size_type const chars_start =
-    (strings.offset() == 0)
-      ? 0
-      : cudf::detail::get_value<int32_t>(strings.offsets(), strings.offset(), stream);
-  size_type const chars_end   = (offset_count == strings.offsets().size())
-                                  ? strings.chars_size(stream)
-                                  : cudf::detail::get_value<int32_t>(
-                                    strings.offsets(), strings.offset() + strings_count, stream);
-  size_type const chars_bytes = chars_end - chars_start;
-
-  auto const avg_bytes_per_row = chars_bytes / std::max(strings_count - strings.null_count(), 1);
-  return (avg_bytes_per_row < BYTES_PER_VALID_ROW_THRESHOLD)
-           ? replace_row_parallel(strings, d_target, d_repl, maxrepl, stream, mr)
-           : replace_char_parallel(
-               strings, chars_start, chars_end, d_target, d_repl, maxrepl, stream, mr);
+  return (input.size() == input.null_count() ||
+          ((input.chars_size(stream) / (input.size() - input.null_count())) <
+           AVG_CHAR_BYTES_THRESHOLD))
+           ? replace_string_parallel(input, d_target, d_repl, maxrepl, stream, mr)
+           : replace_character_parallel(input, d_target, d_repl, maxrepl, stream, mr);
 }
 
 }  // namespace detail
diff --git a/cpp/src/strings/replace/replace_nulls.cu b/cpp/src/strings/replace/replace_nulls.cu
index 26fb1c7819f..bbca4997f57 100644
--- a/cpp/src/strings/replace/replace_nulls.cu
+++ b/cpp/src/strings/replace/replace_nulls.cu
@@ -36,18 +36,18 @@ namespace cudf {
 namespace strings {
 namespace detail {
 
-std::unique_ptr<column> replace_nulls(strings_column_view const& strings,
+std::unique_ptr<column> replace_nulls(strings_column_view const& input,
                                       string_scalar const& repl,
                                       rmm::cuda_stream_view stream,
                                       rmm::mr::device_memory_resource* mr)
 {
-  size_type strings_count = strings.size();
-  if (strings_count == 0) return make_empty_column(type_id::STRING);
+  size_type strings_count = input.size();
+  if (strings_count == 0) { return make_empty_column(type_id::STRING); }
   CUDF_EXPECTS(repl.is_valid(stream), "Parameter repl must be valid.");
 
   string_view d_repl(repl.data(), repl.size());
 
-  auto strings_column = column_device_view::create(strings.parent(), stream);
+  auto strings_column = column_device_view::create(input.parent(), stream);
   auto d_strings      = *strings_column;
 
   // build offsets column
@@ -58,12 +58,12 @@ std::unique_ptr<column> replace_nulls(strings_column_view const& strings,
     }));
   auto [offsets_column, bytes] = cudf::strings::detail::make_offsets_child_column(
     offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr);
-  auto d_offsets = offsets_column->view().data<int32_t>();
+  auto d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(offsets_column->view());
 
   // build chars column
   rmm::device_uvector<char> chars(bytes, stream, mr);
   auto d_chars = chars.data();
-  thrust::for_each_n(rmm::exec_policy(stream),
+  thrust::for_each_n(rmm::exec_policy_nosync(stream),
                      thrust::make_counting_iterator<size_type>(0),
                      strings_count,
                      [d_strings, d_repl, d_offsets, d_chars] __device__(size_type idx) {
diff --git a/cpp/src/strings/replace/replace_slice.cu b/cpp/src/strings/replace/replace_slice.cu
index 041801336e6..c11664c86d4 100644
--- a/cpp/src/strings/replace/replace_slice.cu
+++ b/cpp/src/strings/replace/replace_slice.cu
@@ -50,7 +50,7 @@ struct replace_slice_fn {
   __device__ void operator()(size_type idx)
   {
     if (d_strings.is_null(idx)) {
-      if (!d_chars) d_offsets[idx] = 0;
+      if (!d_chars) { d_offsets[idx] = 0; }
       return;
     }
     auto const d_str   = d_strings.element<string_view>(idx);
@@ -75,34 +75,37 @@ struct replace_slice_fn {
 
 }  // namespace
 
-std::unique_ptr<column> replace_slice(strings_column_view const& strings,
+std::unique_ptr<column> replace_slice(strings_column_view const& input,
                                       string_scalar const& repl,
                                       size_type start,
                                       size_type stop,
                                       rmm::cuda_stream_view stream,
                                       rmm::mr::device_memory_resource* mr)
 {
-  if (strings.is_empty()) return make_empty_column(type_id::STRING);
+  if (input.is_empty()) { return make_empty_column(type_id::STRING); }
   CUDF_EXPECTS(repl.is_valid(stream), "Parameter repl must be valid.");
-  if (stop > 0) CUDF_EXPECTS(start <= stop, "Parameter start must be less than or equal to stop.");
+  if (stop > 0) {
+    CUDF_EXPECTS(start <= stop, "Parameter start must be less than or equal to stop.");
+  }
 
   string_view d_repl(repl.data(), repl.size());
 
-  auto d_strings = column_device_view::create(strings.parent(), stream);
+  auto d_strings = column_device_view::create(input.parent(), stream);
 
   // this utility calls the given functor to build the offsets and chars columns
   auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(
-    replace_slice_fn{*d_strings, d_repl, start, stop}, strings.size(), stream, mr);
+    replace_slice_fn{*d_strings, d_repl, start, stop}, input.size(), stream, mr);
 
-  return make_strings_column(strings.size(),
+  return make_strings_column(input.size(),
                              std::move(offsets_column),
                              chars.release(),
-                             strings.null_count(),
-                             cudf::detail::copy_bitmask(strings.parent(), stream, mr));
+                             input.null_count(),
+                             cudf::detail::copy_bitmask(input.parent(), stream, mr));
 }
+
 }  // namespace detail
 
-std::unique_ptr<column> replace_slice(strings_column_view const& strings,
+std::unique_ptr<column> replace_slice(strings_column_view const& input,
                                       string_scalar const& repl,
                                       size_type start,
                                       size_type stop,
@@ -110,7 +113,7 @@ std::unique_ptr<column> replace_slice(strings_column_view const& strings,
                                       rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::replace_slice(strings, repl, start, stop, stream, mr);
+  return detail::replace_slice(input, repl, start, stop, stream, mr);
 }
 
 }  // namespace strings

From 5192b608eeed4bda9317c657253c3a5630aa4c5d Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 3 Apr 2024 09:11:37 -1000
Subject: [PATCH 006/842] Align date_range defaults with pandas, support tz
 (#15139)

Precursor to https://github.com/rapidsai/cudf/issues/15116

* Aligns `date_range` signature with pandas, _technically_ an API breakage with `closed` changing defaults even though it still isn't supported
* Copies pandas behavior of allowing `date_range` with just two of `start/end/periods`
* Supports `tz` arg now

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15139
---
 python/cudf/cudf/core/tools/datetimes.py | 49 +++++++++++++-----------
 python/cudf/cudf/tests/test_datetime.py  | 16 ++++++++
 2 files changed, 43 insertions(+), 22 deletions(-)

diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py
index 65f97c99934..ed8fca88acd 100644
--- a/python/cudf/cudf/core/tools/datetimes.py
+++ b/python/cudf/cudf/core/tools/datetimes.py
@@ -799,9 +799,11 @@ def date_range(
     periods=None,
     freq=None,
     tz=None,
-    normalize=False,
+    normalize: bool = False,
     name=None,
-    closed=None,
+    closed: Literal["left", "right", "both", "neither"] = "both",
+    *,
+    unit: Optional[str] = None,
 ):
     """Return a fixed frequency DatetimeIndex.
 
@@ -837,8 +839,13 @@ def date_range(
     name : str, default None
         Name of the resulting DatetimeIndex
 
-    closed : {None, 'left', 'right'}, optional
-        Not Supported
+    closed : {"left", "right", "both", "neither"}, default "both"
+        Whether to set each bound as closed or open.
+        Currently only "both" is supported
+
+    unit : str, default None
+        Specify the desired resolution of the result. Currently
+        not supported.
 
     Returns
     -------
@@ -875,11 +882,15 @@ def date_range(
                 '2026-04-23 08:00:00'],
                 dtype='datetime64[ns]')
     """
-    if tz is not None:
-        raise NotImplementedError("tz is currently unsupported.")
+    if closed != "both":
+        raise NotImplementedError(f"{closed=} is currently unsupported.")
+    if unit is not None:
+        raise NotImplementedError(f"{unit=} is currently unsupported.")
+    if normalize is not False:
+        raise NotImplementedError(f"{normalize=} is currently unsupported.")
 
-    if closed is not None:
-        raise NotImplementedError("closed is currently unsupported.")
+    if freq is None and any(arg is None for arg in (start, end, periods)):
+        freq = "D"
 
     if (start, end, periods, freq).count(None) > 1:
         raise ValueError(
@@ -894,7 +905,7 @@ def date_range(
             FutureWarning,
         )
 
-    dtype = np.dtype("<M8[ns]")
+    dtype = np.dtype("datetime64[ns]")
 
     if freq is None:
         # `start`, `end`, `periods` is specified, we treat the timestamps as
@@ -903,7 +914,7 @@ def date_range(
         end = cudf.Scalar(end, dtype=dtype).value.astype("int64")
         arr = cp.linspace(start=start, stop=end, num=periods)
         result = cudf.core.column.as_column(arr).astype("datetime64[ns]")
-        return cudf.DatetimeIndex._from_data({name: result})
+        return cudf.DatetimeIndex._from_data({name: result}).tz_localize(tz)
 
     # The code logic below assumes `freq` is defined. It is first normalized
     # into `DateOffset` for further computation with timestamps.
@@ -912,8 +923,8 @@ def date_range(
         offset = freq
     elif isinstance(freq, str):
         offset = pd.tseries.frequencies.to_offset(freq)
-        if not isinstance(offset, pd.tseries.offsets.Tick) and not isinstance(
-            offset, pd.tseries.offsets.Week
+        if not isinstance(
+            offset, (pd.tseries.offsets.Tick, pd.tseries.offsets.Week)
         ):
             raise ValueError(
                 f"Unrecognized frequency string {freq}. cuDF does "
@@ -923,7 +934,7 @@ def date_range(
     else:
         raise TypeError("`freq` must be a `str` or cudf.DateOffset object.")
 
-    if _has_mixed_freqeuency(offset):
+    if _has_fixed_frequency(offset) and _has_non_fixed_frequency(offset):
         raise NotImplementedError(
             "Mixing fixed and non-fixed frequency offset is unsupported."
         )
@@ -1001,7 +1012,9 @@ def date_range(
         arr = cp.arange(start=start, stop=stop, step=step, dtype="int64")
         res = cudf.core.column.as_column(arr).astype("datetime64[ns]")
 
-    return cudf.DatetimeIndex._from_data({name: res}, freq=freq)
+    return cudf.DatetimeIndex._from_data({name: res}, freq=freq).tz_localize(
+        tz
+    )
 
 
 def _has_fixed_frequency(freq: DateOffset) -> bool:
@@ -1026,14 +1039,6 @@ def _has_non_fixed_frequency(freq: DateOffset) -> bool:
     return len(freq.kwds.keys() & non_fixed_frequencies) > 0
 
 
-def _has_mixed_freqeuency(freq: DateOffset) -> bool:
-    """Utility to determine if `freq` contains mixed fixed and non-fixed
-    frequency offset. e.g. {months=1, days=5}
-    """
-
-    return _has_fixed_frequency(freq) and _has_non_fixed_frequency(freq)
-
-
 def _offset_to_nanoseconds_lower_bound(offset: DateOffset) -> int:
     """Given a DateOffset, which can consist of either fixed frequency or
     non-fixed frequency offset, convert to the smallest possible fixed
diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py
index 7c209078fd2..37ba7acf044 100644
--- a/python/cudf/cudf/tests/test_datetime.py
+++ b/python/cudf/cudf/tests/test_datetime.py
@@ -2357,3 +2357,19 @@ def test_timezone_array_notimplemented():
 def test_to_datetime_errors_ignore_deprecated():
     with pytest.warns(FutureWarning):
         cudf.to_datetime("2001-01-01 00:04:45", errors="ignore")
+
+
+def test_date_range_freq_default():
+    result = pd.date_range("2020-01-01", periods=2, name="foo")
+    expected = cudf.date_range("2020-01-01", periods=2, name="foo")
+    assert_eq(result, expected)
+
+
+def test_date_range_tz():
+    result = pd.date_range("2020-01-01", periods=2, tz="UTC")
+    expected = cudf.date_range("2020-01-01", periods=2, tz="UTC")
+    assert_eq(result, expected)
+
+    result = pd.date_range("2020-01-01", "2020-01-02", periods=2, tz="UTC")
+    expected = cudf.date_range("2020-01-01", "2020-01-02", periods=2, tz="UTC")
+    assert_eq(result, expected)

From fbaad8a480d3b2755afe04431c5abe6c098224b4 Mon Sep 17 00:00:00 2001
From: Tanmay Gujar <tanmaygujar999@gmail.com>
Date: Wed, 3 Apr 2024 18:10:19 -0400
Subject: [PATCH 007/842] [FEA] Performance improvement for mixed left
 semi/anti join (#15288)

Current implementation of mixed semi/anti join probes the built hash table twice -- once to find the output table size and once to build the output. Since the upper bound on output table size is O(N) where N is the size of the left table, we can avoid probing twice and achieve a faster join implementation.

This implementation reserves the required upper memory bound, builds the output, and then collects the relevant output rows. This probes the hash table only once.

This PR also removes the size kernels for mixed semi join and output size parameters passed to the mixed semi join.

Closes #15250

# Benchmark Results from cudf repository

## mixed_left_semi_join_32bit (New implementation)

### [0] NVIDIA TITAN V
```
| Key Type | Payload Type | Nullable | Build Table Size | Probe Table Size | Samples |  CPU Time  | Noise |  GPU Time  | Noise |
|----------|--------------|----------|------------------|------------------|---------|------------|-------|------------|-------|
|      I32 |          I32 |        0 |           100000 |           100000 |   1920x | 266.239 us | 3.43% | 261.324 us | 2.84% |
|      I32 |          I32 |        0 |           100000 |           400000 |   1024x | 495.434 us | 1.18% | 490.544 us | 0.63% |
|      I32 |          I32 |        0 |         10000000 |         10000000 |     24x |  20.919 ms | 0.04% |  20.914 ms | 0.03% |
|      I32 |          I32 |        0 |         10000000 |         40000000 |     11x |  54.697 ms | 0.03% |  54.692 ms | 0.03% |
|      I32 |          I32 |        0 |         10000000 |        100000000 |     11x | 122.171 ms | 0.03% | 122.166 ms | 0.03% |
|      I32 |          I32 |        0 |         80000000 |        100000000 |     11x | 192.979 ms | 0.01% | 192.975 ms | 0.01% |
|      I32 |          I32 |        0 |        100000000 |        100000000 |     11x | 212.878 ms | 0.01% | 212.874 ms | 0.01% |
|      I32 |          I32 |        0 |         10000000 |        240000000 |     11x | 279.794 ms | 0.01% | 279.790 ms | 0.01% |
|      I32 |          I32 |        0 |         80000000 |        240000000 |     11x | 351.186 ms | 0.01% | 351.183 ms | 0.01% |
|      I32 |          I32 |        0 |        100000000 |        240000000 |     11x | 370.794 ms | 0.01% | 370.790 ms | 0.01% |
```

## mixed_left_semi_join_32bit (Old implementation)

### [0] NVIDIA TITAN V
```
| Key Type | Payload Type | Nullable | Build Table Size | Probe Table Size | Samples |  CPU Time  | Noise |  GPU Time  | Noise |
|----------|--------------|----------|------------------|------------------|---------|------------|-------|------------|-------|
|      I32 |          I32 |        0 |           100000 |           100000 |   1392x | 368.030 us | 3.05% | 363.065 us | 2.70% |
|      I32 |          I32 |        0 |           100000 |           400000 |    832x | 832.492 us | 0.84% | 827.586 us | 0.60% |
|      I32 |          I32 |        0 |         10000000 |         10000000 |     16x |  32.310 ms | 0.03% |  32.305 ms | 0.03% |
|      I32 |          I32 |        0 |         10000000 |         40000000 |     11x | 100.222 ms | 0.03% | 100.218 ms | 0.03% |
|      I32 |          I32 |        0 |         10000000 |        100000000 |     11x | 235.874 ms | 0.01% | 235.870 ms | 0.01% |
|      I32 |          I32 |        0 |         80000000 |        100000000 |     11x | 307.042 ms | 0.01% | 307.038 ms | 0.01% |
|      I32 |          I32 |        0 |        100000000 |        100000000 |     11x | 326.797 ms | 0.01% | 326.794 ms | 0.01% |
|      I32 |          I32 |        0 |         10000000 |        240000000 |     11x | 552.730 ms | 0.01% | 552.728 ms | 0.01% |
|      I32 |          I32 |        0 |         80000000 |        240000000 |     11x | 624.958 ms | 0.01% | 624.956 ms | 0.01% |
|      I32 |          I32 |        0 |        100000000 |        240000000 |     11x | 644.148 ms | 0.00% | 644.146 ms | 0.00% |
```

Authors:
  - Tanmay Gujar (https://github.com/tgujar)
  - Yunsong Wang (https://github.com/PointKernel)

Approvers:
  - Jason Lowe (https://github.com/jlowe)
  - Yunsong Wang (https://github.com/PointKernel)
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/15288
---
 cpp/CMakeLists.txt                            |   1 -
 cpp/include/cudf/join.hpp                     |  90 +----
 cpp/src/join/mixed_join_kernels_semi.cu       |  31 +-
 cpp/src/join/mixed_join_kernels_semi.cuh      |  64 +---
 cpp/src/join/mixed_join_semi.cu               | 360 ++----------------
 cpp/src/join/mixed_join_size_kernels_semi.cu  | 125 ------
 cpp/tests/join/mixed_join_tests.cu            |  41 --
 java/src/main/java/ai/rapids/cudf/Table.java  | 146 -------
 java/src/main/native/src/TableJni.cpp         |  60 ---
 .../test/java/ai/rapids/cudf/TableTest.java   | 116 ------
 10 files changed, 42 insertions(+), 992 deletions(-)
 delete mode 100644 cpp/src/join/mixed_join_size_kernels_semi.cu

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index f1d43e3c35f..7c32474ea56 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -453,7 +453,6 @@ add_library(
   src/join/mixed_join_semi.cu
   src/join/mixed_join_size_kernel.cu
   src/join/mixed_join_size_kernel_nulls.cu
-  src/join/mixed_join_size_kernels_semi.cu
   src/join/semi_join.cu
   src/json/json_path.cu
   src/lists/contains.cu
diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp
index b7a3129cfec..e343ad9ee32 100644
--- a/cpp/include/cudf/join.hpp
+++ b/cpp/include/cudf/join.hpp
@@ -944,9 +944,6 @@ mixed_full_join(
  * @param right_conditional The right table used for the conditional join
  * @param binary_predicate The condition on which to join
  * @param compare_nulls Whether or not null values join to each other or not
- * @param output_size_data An optional pair of values indicating the exact output size and the
- * number of matches for each row in the larger of the two input tables, left or right (may be
- * precomputed using the corresponding mixed_full_join_size API).
  * @param mr Device memory resource used to allocate the returned table and columns' device memory
  *
  * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct
@@ -958,8 +955,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_left_semi_join(
   table_view const& left_conditional,
   table_view const& right_conditional,
   ast::expression const& binary_predicate,
-  null_equality compare_nulls = null_equality::EQUAL,
-  std::optional<std::pair<std::size_t, device_span<size_type const>>> output_size_data = {},
+  null_equality compare_nulls         = null_equality::EQUAL,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -996,9 +992,6 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_left_semi_join(
  * @param right_conditional The right table used for the conditional join
  * @param binary_predicate The condition on which to join
  * @param compare_nulls Whether or not null values join to each other or not
- * @param output_size_data An optional pair of values indicating the exact output size and the
- * number of matches for each row in the larger of the two input tables, left or right (may be
- * precomputed using the corresponding mixed_full_join_size API).
  * @param mr Device memory resource used to allocate the returned table and columns' device memory
  *
  * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct
@@ -1010,8 +1003,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_left_anti_join(
   table_view const& left_conditional,
   table_view const& right_conditional,
   ast::expression const& binary_predicate,
-  null_equality compare_nulls = null_equality::EQUAL,
-  std::optional<std::pair<std::size_t, device_span<size_type const>>> output_size_data = {},
+  null_equality compare_nulls         = null_equality::EQUAL,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -1094,84 +1086,6 @@ std::pair<std::size_t, std::unique_ptr<rmm::device_uvector<size_type>>> mixed_le
   null_equality compare_nulls         = null_equality::EQUAL,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
-/**
- * @brief Returns the exact number of matches (rows) when performing a mixed
- * left semi join between the specified tables where the columns of the
- * equality table are equal and the predicate evaluates to true on the
- * conditional tables.
- *
- * If the provided predicate returns NULL for a pair of rows (left, right),
- * that pair is not included in the output. It is the user's responsibility to
- * choose a suitable compare_nulls value AND use appropriate null-safe
- * operators in the expression.
- *
- * @throw cudf::logic_error If the binary predicate outputs a non-boolean result.
- * @throw cudf::logic_error If the number of rows in left_equality and left_conditional do not
- * match.
- * @throw cudf::logic_error If the number of rows in right_equality and right_conditional do not
- * match.
- *
- * @param left_equality The left table used for the equality join
- * @param right_equality The right table used for the equality join
- * @param left_conditional The left table used for the conditional join
- * @param right_conditional The right table used for the conditional join
- * @param binary_predicate The condition on which to join
- * @param compare_nulls Whether or not null values join to each other or not
- * @param mr Device memory resource used to allocate the returned table and columns' device memory
- *
- * @return A pair containing the size that would result from performing the
- * requested join and the number of matches for each row in one of the two
- * tables. Which of the two tables is an implementation detail and should not
- * be relied upon, simply passed to the corresponding `mixed_left_join` API as
- * is.
- */
-std::pair<std::size_t, std::unique_ptr<rmm::device_uvector<size_type>>> mixed_left_semi_join_size(
-  table_view const& left_equality,
-  table_view const& right_equality,
-  table_view const& left_conditional,
-  table_view const& right_conditional,
-  ast::expression const& binary_predicate,
-  null_equality compare_nulls         = null_equality::EQUAL,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
-
-/**
- * @brief Returns the exact number of matches (rows) when performing a mixed
- * left anti join between the specified tables.
- *
- * If the provided predicate returns NULL for a pair of rows (left, right),
- * that pair is not included in the output. It is the user's responsibility to
- * choose a suitable compare_nulls value AND use appropriate null-safe
- * operators in the expression.
- *
- * @throw cudf::logic_error If the binary predicate outputs a non-boolean result.
- * @throw cudf::logic_error If the number of rows in left_equality and left_conditional do not
- * match.
- * @throw cudf::logic_error If the number of rows in right_equality and right_conditional do not
- * match.
- *
- * @param left_equality The left table used for the equality join
- * @param right_equality The right table used for the equality join
- * @param left_conditional The left table used for the conditional join
- * @param right_conditional The right table used for the conditional join
- * @param binary_predicate The condition on which to join
- * @param compare_nulls Whether or not null values join to each other or not
- * @param mr Device memory resource used to allocate the returned table and columns' device memory
- *
- * @return A pair containing the size that would result from performing the
- * requested join and the number of matches for each row in one of the two
- * tables. Which of the two tables is an implementation detail and should not
- * be relied upon, simply passed to the corresponding `mixed_left_join` API as
- * is.
- */
-std::pair<std::size_t, std::unique_ptr<rmm::device_uvector<size_type>>> mixed_left_anti_join_size(
-  table_view const& left_equality,
-  table_view const& right_equality,
-  table_view const& left_conditional,
-  table_view const& right_conditional,
-  ast::expression const& binary_predicate,
-  null_equality compare_nulls         = null_equality::EQUAL,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
-
 /**
  * @brief Returns the exact number of matches (rows) when performing a
  * conditional inner join between the specified tables where the predicate
diff --git a/cpp/src/join/mixed_join_kernels_semi.cu b/cpp/src/join/mixed_join_kernels_semi.cu
index 5a543997a50..01e3fe09b38 100644
--- a/cpp/src/join/mixed_join_kernels_semi.cu
+++ b/cpp/src/join/mixed_join_kernels_semi.cu
@@ -41,12 +41,9 @@ __attribute__((visibility("hidden"))) __launch_bounds__(block_size) __global__
                        table_device_view build,
                        row_hash const hash_probe,
                        row_equality const equality_probe,
-                       join_kind const join_type,
                        cudf::detail::semi_map_type::device_view hash_table_view,
-                       size_type* join_output_l,
-                       cudf::ast::detail::expression_device_view device_expression_data,
-                       cudf::size_type const* join_result_offsets,
-                       bool const swap_tables)
+                       cudf::device_span<bool> left_table_keep_mask,
+                       cudf::ast::detail::expression_device_view device_expression_data)
 {
   // Normally the casting of a shared memory array is used to create multiple
   // arrays of different types from the shared memory buffer, but here it is
@@ -60,7 +57,7 @@ __attribute__((visibility("hidden"))) __launch_bounds__(block_size) __global__
 
   cudf::size_type const left_num_rows  = left_table.num_rows();
   cudf::size_type const right_num_rows = right_table.num_rows();
-  auto const outer_num_rows            = (swap_tables ? right_num_rows : left_num_rows);
+  auto const outer_num_rows            = left_num_rows;
 
   cudf::size_type outer_row_index = threadIdx.x + blockIdx.x * block_size;
 
@@ -70,12 +67,10 @@ __attribute__((visibility("hidden"))) __launch_bounds__(block_size) __global__
   if (outer_row_index < outer_num_rows) {
     // Figure out the number of elements for this key.
     auto equality = single_expression_equality<has_nulls>{
-      evaluator, thread_intermediate_storage, swap_tables, equality_probe};
+      evaluator, thread_intermediate_storage, false, equality_probe};
 
-    if ((join_type == join_kind::LEFT_ANTI_JOIN) !=
-        (hash_table_view.contains(outer_row_index, hash_probe, equality))) {
-      *(join_output_l + join_result_offsets[outer_row_index]) = outer_row_index;
-    }
+    left_table_keep_mask[outer_row_index] =
+      hash_table_view.contains(outer_row_index, hash_probe, equality);
   }
 }
 
@@ -86,12 +81,9 @@ template __global__ void mixed_join_semi<DEFAULT_JOIN_BLOCK_SIZE, true>(
   table_device_view build,
   row_hash const hash_probe,
   row_equality const equality_probe,
-  join_kind const join_type,
   cudf::detail::semi_map_type::device_view hash_table_view,
-  size_type* join_output_l,
-  cudf::ast::detail::expression_device_view device_expression_data,
-  cudf::size_type const* join_result_offsets,
-  bool const swap_tables);
+  cudf::device_span<bool> left_table_keep_mask,
+  cudf::ast::detail::expression_device_view device_expression_data);
 
 template __global__ void mixed_join_semi<DEFAULT_JOIN_BLOCK_SIZE, false>(
   table_device_view left_table,
@@ -100,12 +92,9 @@ template __global__ void mixed_join_semi<DEFAULT_JOIN_BLOCK_SIZE, false>(
   table_device_view build,
   row_hash const hash_probe,
   row_equality const equality_probe,
-  join_kind const join_type,
   cudf::detail::semi_map_type::device_view hash_table_view,
-  size_type* join_output_l,
-  cudf::ast::detail::expression_device_view device_expression_data,
-  cudf::size_type const* join_result_offsets,
-  bool const swap_tables);
+  cudf::device_span<bool> left_table_keep_mask,
+  cudf::ast::detail::expression_device_view device_expression_data);
 
 }  // namespace detail
 
diff --git a/cpp/src/join/mixed_join_kernels_semi.cuh b/cpp/src/join/mixed_join_kernels_semi.cuh
index f411d36f0a8..4ea404d451c 100644
--- a/cpp/src/join/mixed_join_kernels_semi.cuh
+++ b/cpp/src/join/mixed_join_kernels_semi.cuh
@@ -27,53 +27,7 @@ namespace cudf {
 namespace detail {
 
 /**
- * @brief Computes the output size of joining the left table to the right table for semi/anti joins.
- *
- * This method probes the hash table with each row in the probe table using a
- * custom equality comparator that also checks that the conditional expression
- * evaluates to true between the left/right tables when a match is found
- * between probe and build rows.
- *
- * @tparam block_size The number of threads per block for this kernel
- * @tparam has_nulls Whether or not the inputs may contain nulls.
- *
- * @param[in] left_table The left table
- * @param[in] right_table The right table
- * @param[in] probe The table with which to probe the hash table for matches.
- * @param[in] build The table with which the hash table was built.
- * @param[in] hash_probe The hasher used for the probe table.
- * @param[in] equality_probe The equality comparator used when probing the hash table.
- * @param[in] join_type The type of join to be performed
- * @param[in] hash_table_view The hash table built from `build`.
- * @param[in] device_expression_data Container of device data required to evaluate the desired
- * expression.
- * @param[in] swap_tables If true, the kernel was launched with one thread per right row and
- * the kernel needs to internally loop over left rows. Otherwise, loop over right rows.
- * @param[out] output_size The resulting output size
- * @param[out] matches_per_row The number of matches in one pair of
- * equality/conditional tables for each row in the other pair of tables. If
- * swap_tables is true, matches_per_row corresponds to the right_table,
- * otherwise it corresponds to the left_table. Note that corresponding swap of
- * left/right tables to determine which is the build table and which is the
- * probe table has already happened on the host.
- */
-template <int block_size, bool has_nulls>
-__global__ void compute_mixed_join_output_size_semi(
-  table_device_view left_table,
-  table_device_view right_table,
-  table_device_view probe,
-  table_device_view build,
-  row_hash const hash_probe,
-  row_equality const equality_probe,
-  join_kind const join_type,
-  cudf::detail::semi_map_type::device_view hash_table_view,
-  ast::detail::expression_device_view device_expression_data,
-  bool const swap_tables,
-  std::size_t* output_size,
-  cudf::device_span<cudf::size_type> matches_per_row);
-
-/**
- * @brief Performs a semi/anti join using the combination of a hash lookup to
+ * @brief Performs a semi join using the combination of a hash lookup to
  * identify equal rows between one pair of tables and the evaluation of an
  * expression containing an arbitrary expression.
  *
@@ -91,16 +45,11 @@ __global__ void compute_mixed_join_output_size_semi(
  * @param[in] build The table with which the hash table was built.
  * @param[in] hash_probe The hasher used for the probe table.
  * @param[in] equality_probe The equality comparator used when probing the hash table.
- * @param[in] join_type The type of join to be performed
  * @param[in] hash_table_view The hash table built from `build`.
- * @param[out] join_output_l The left result of the join operation
+ * @param[out] left_table_keep_mask The result of the join operation with "true" element indicating
+ * the corresponding index from left table is present in output
  * @param[in] device_expression_data Container of device data required to evaluate the desired
  * expression.
- * @param[in] join_result_offsets The starting indices in join_output[l|r]
- * where the matches for each row begin. Equivalent to a prefix sum of
- * matches_per_row.
- * @param[in] swap_tables If true, the kernel was launched with one thread per right row and
- * the kernel needs to internally loop over left rows. Otherwise, loop over right rows.
  */
 template <cudf::size_type block_size, bool has_nulls>
 __global__ void mixed_join_semi(table_device_view left_table,
@@ -109,12 +58,9 @@ __global__ void mixed_join_semi(table_device_view left_table,
                                 table_device_view build,
                                 row_hash const hash_probe,
                                 row_equality const equality_probe,
-                                join_kind const join_type,
                                 cudf::detail::semi_map_type::device_view hash_table_view,
-                                size_type* join_output_l,
-                                cudf::ast::detail::expression_device_view device_expression_data,
-                                cudf::size_type const* join_result_offsets,
-                                bool const swap_tables);
+                                cudf::device_span<bool> left_table_keep_mask,
+                                cudf::ast::detail::expression_device_view device_expression_data);
 
 }  // namespace detail
 
diff --git a/cpp/src/join/mixed_join_semi.cu b/cpp/src/join/mixed_join_semi.cu
index edf6c32eadf..d654f580cad 100644
--- a/cpp/src/join/mixed_join_semi.cu
+++ b/cpp/src/join/mixed_join_semi.cu
@@ -92,7 +92,6 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_join_semi(
   ast::expression const& binary_predicate,
   null_equality compare_nulls,
   join_kind join_type,
-  std::optional<std::pair<std::size_t, device_span<size_type const>>> output_size_data,
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr)
 {
@@ -107,12 +106,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_join_semi(
 
   auto const right_num_rows{right_conditional.num_rows()};
   auto const left_num_rows{left_conditional.num_rows()};
-  auto const swap_tables = (join_type == join_kind::INNER_JOIN) && (right_num_rows > left_num_rows);
-
-  // The "outer" table is the larger of the two tables. The kernels are
-  // launched with one thread per row of the outer table, which also means that
-  // it is the probe table for the hash
-  auto const outer_num_rows{swap_tables ? right_num_rows : left_num_rows};
+  auto const outer_num_rows{left_num_rows};
 
   // We can immediately filter out cases where the right table is empty. In
   // some cases, we return all the rows of the left table with a corresponding
@@ -155,8 +149,8 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_join_semi(
   // TODO: The non-conditional join impls start with a dictionary matching,
   // figure out what that is and what it's needed for (and if conditional joins
   // need to do the same).
-  auto& probe                 = swap_tables ? right_equality : left_equality;
-  auto& build                 = swap_tables ? left_equality : right_equality;
+  auto& probe                 = left_equality;
+  auto& build                 = right_equality;
   auto probe_view             = table_device_view::create(probe, stream);
   auto build_view             = table_device_view::create(build, stream);
   auto left_conditional_view  = table_device_view::create(left_conditional, stream);
@@ -197,8 +191,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_join_semi(
   auto const equality_build_equality =
     row_comparator_build.equal_to<false>(build_nulls, compare_nulls);
   auto const preprocessed_build_condtional =
-    experimental::row::equality::preprocessed_table::create(
-      swap_tables ? left_conditional : right_conditional, stream);
+    experimental::row::equality::preprocessed_table::create(right_conditional, stream);
   auto const row_comparator_conditional_build =
     cudf::experimental::row::equality::two_table_comparator{preprocessed_build_condtional,
                                                             preprocessed_build_condtional};
@@ -225,84 +218,14 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_join_semi(
 
   auto hash_table_view = hash_table.get_device_view();
 
-  // For inner joins we support optimizing the join by launching one thread for
-  // whichever table is larger rather than always using the left table.
   detail::grid_1d const config(outer_num_rows, DEFAULT_JOIN_BLOCK_SIZE);
   auto const shmem_size_per_block = parser.shmem_per_thread * config.num_threads_per_block;
-  join_kind const kernel_join_type =
-    join_type == join_kind::FULL_JOIN ? join_kind::LEFT_JOIN : join_type;
-
-  // If the join size data was not provided as an input, compute it here.
-  std::size_t join_size;
-  // Using an optional because we only need to allocate a new vector if one was
-  // not passed as input, and rmm::device_uvector is not default constructible
-  std::optional<rmm::device_uvector<size_type>> matches_per_row{};
-  device_span<size_type const> matches_per_row_span{};
 
   auto const row_hash   = cudf::experimental::row::hash::row_hasher{preprocessed_probe};
   auto const hash_probe = row_hash.device_hasher(has_nulls);
 
-  if (output_size_data.has_value()) {
-    join_size            = output_size_data->first;
-    matches_per_row_span = output_size_data->second;
-  } else {
-    // Allocate storage for the counter used to get the size of the join output
-    rmm::device_scalar<std::size_t> size(0, stream, mr);
-
-    matches_per_row =
-      rmm::device_uvector<size_type>{static_cast<std::size_t>(outer_num_rows), stream, mr};
-    // Note that the view goes out of scope after this else statement, but the
-    // data owned by matches_per_row stays alive so the data pointer is valid.
-    auto mutable_matches_per_row_span = cudf::device_span<size_type>{
-      matches_per_row->begin(), static_cast<std::size_t>(outer_num_rows)};
-    matches_per_row_span = cudf::device_span<size_type const>{
-      matches_per_row->begin(), static_cast<std::size_t>(outer_num_rows)};
-    if (has_nulls) {
-      compute_mixed_join_output_size_semi<DEFAULT_JOIN_BLOCK_SIZE, true>
-        <<<config.num_blocks, config.num_threads_per_block, shmem_size_per_block, stream.value()>>>(
-          *left_conditional_view,
-          *right_conditional_view,
-          *probe_view,
-          *build_view,
-          hash_probe,
-          equality_probe,
-          kernel_join_type,
-          hash_table_view,
-          parser.device_expression_data,
-          swap_tables,
-          size.data(),
-          mutable_matches_per_row_span);
-    } else {
-      compute_mixed_join_output_size_semi<DEFAULT_JOIN_BLOCK_SIZE, false>
-        <<<config.num_blocks, config.num_threads_per_block, shmem_size_per_block, stream.value()>>>(
-          *left_conditional_view,
-          *right_conditional_view,
-          *probe_view,
-          *build_view,
-          hash_probe,
-          equality_probe,
-          kernel_join_type,
-          hash_table_view,
-          parser.device_expression_data,
-          swap_tables,
-          size.data(),
-          mutable_matches_per_row_span);
-    }
-    join_size = size.value(stream);
-  }
-
-  if (join_size == 0) { return std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr); }
-
-  // Given the number of matches per row, we need to compute the offsets for insertion.
-  auto join_result_offsets =
-    rmm::device_uvector<size_type>{static_cast<std::size_t>(outer_num_rows), stream, mr};
-  thrust::exclusive_scan(rmm::exec_policy{stream},
-                         matches_per_row_span.begin(),
-                         matches_per_row_span.end(),
-                         join_result_offsets.begin());
-
-  auto left_indices = std::make_unique<rmm::device_uvector<size_type>>(join_size, stream, mr);
-  auto const& join_output_l = left_indices->data();
+  // Vector used to indicate indices from left/probe table which are present in output
+  auto left_table_keep_mask = rmm::device_uvector<bool>(probe.num_rows(), stream);
 
   if (has_nulls) {
     mixed_join_semi<DEFAULT_JOIN_BLOCK_SIZE, true>
@@ -313,12 +236,9 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_join_semi(
         *build_view,
         hash_probe,
         equality_probe,
-        kernel_join_type,
         hash_table_view,
-        join_output_l,
-        parser.device_expression_data,
-        join_result_offsets.data(),
-        swap_tables);
+        cudf::device_span<bool>(left_table_keep_mask),
+        parser.device_expression_data);
   } else {
     mixed_join_semi<DEFAULT_JOIN_BLOCK_SIZE, false>
       <<<config.num_blocks, config.num_threads_per_block, shmem_size_per_block, stream.value()>>>(
@@ -328,235 +248,30 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_join_semi(
         *build_view,
         hash_probe,
         equality_probe,
-        kernel_join_type,
         hash_table_view,
-        join_output_l,
-        parser.device_expression_data,
-        join_result_offsets.data(),
-        swap_tables);
+        cudf::device_span<bool>(left_table_keep_mask),
+        parser.device_expression_data);
   }
 
-  return left_indices;
-}
-
-std::pair<std::size_t, std::unique_ptr<rmm::device_uvector<size_type>>>
-compute_mixed_join_output_size_semi(table_view const& left_equality,
-                                    table_view const& right_equality,
-                                    table_view const& left_conditional,
-                                    table_view const& right_conditional,
-                                    ast::expression const& binary_predicate,
-                                    null_equality compare_nulls,
-                                    join_kind join_type,
-                                    rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr)
-{
-  CUDF_EXPECTS(
-    (join_type != join_kind::INNER_JOIN) && (join_type != join_kind::LEFT_JOIN) &&
-      (join_type != join_kind::FULL_JOIN),
-    "Inner, left, and full join size estimation should use compute_mixed_join_output_size.");
-
-  CUDF_EXPECTS(left_conditional.num_rows() == left_equality.num_rows(),
-               "The left conditional and equality tables must have the same number of rows.");
-  CUDF_EXPECTS(right_conditional.num_rows() == right_equality.num_rows(),
-               "The right conditional and equality tables must have the same number of rows.");
-
-  auto const right_num_rows{right_conditional.num_rows()};
-  auto const left_num_rows{left_conditional.num_rows()};
-  auto const swap_tables = (join_type == join_kind::INNER_JOIN) && (right_num_rows > left_num_rows);
-
-  // The "outer" table is the larger of the two tables. The kernels are
-  // launched with one thread per row of the outer table, which also means that
-  // it is the probe table for the hash
-  auto const outer_num_rows{swap_tables ? right_num_rows : left_num_rows};
-
-  auto matches_per_row = std::make_unique<rmm::device_uvector<size_type>>(
-    static_cast<std::size_t>(outer_num_rows), stream, mr);
-  auto matches_per_row_span = cudf::device_span<size_type>{
-    matches_per_row->begin(), static_cast<std::size_t>(outer_num_rows)};
-
-  // We can immediately filter out cases where one table is empty. In
-  // some cases, we return all the rows of the other table with a corresponding
-  // null index for the empty table; in others, we return an empty output.
-  if (right_num_rows == 0) {
-    switch (join_type) {
-      // Left, left anti, and full all return all the row indices from left
-      // with a corresponding NULL from the right.
-      case join_kind::LEFT_ANTI_JOIN: {
-        thrust::fill(matches_per_row->begin(), matches_per_row->end(), 1);
-        return {left_num_rows, std::move(matches_per_row)};
-      }
-      // Inner and left semi joins return empty output because no matches can exist.
-      case join_kind::LEFT_SEMI_JOIN: return {0, std::move(matches_per_row)};
-      default: CUDF_FAIL("Invalid join kind."); break;
-    }
-  } else if (left_num_rows == 0) {
-    switch (join_type) {
-      // Left, left anti, left semi, and inner joins all return empty sets.
-      case join_kind::LEFT_ANTI_JOIN:
-      case join_kind::LEFT_SEMI_JOIN: {
-        thrust::fill(matches_per_row->begin(), matches_per_row->end(), 0);
-        return {0, std::move(matches_per_row)};
-      }
-      default: CUDF_FAIL("Invalid join kind."); break;
-    }
-  }
-
-  // If evaluating the expression may produce null outputs we create a nullable
-  // output column and follow the null-supporting expression evaluation code
-  // path.
-  auto const has_nulls = cudf::nullate::DYNAMIC{
-    cudf::has_nulls(left_equality) || cudf::has_nulls(right_equality) ||
-    binary_predicate.may_evaluate_null(left_conditional, right_conditional, stream)};
-
-  auto const parser = ast::detail::expression_parser{
-    binary_predicate, left_conditional, right_conditional, has_nulls, stream, mr};
-  CUDF_EXPECTS(parser.output_type().id() == type_id::BOOL8,
-               "The expression must produce a boolean output.");
-
-  // TODO: The non-conditional join impls start with a dictionary matching,
-  // figure out what that is and what it's needed for (and if conditional joins
-  // need to do the same).
-  auto& probe                 = swap_tables ? right_equality : left_equality;
-  auto& build                 = swap_tables ? left_equality : right_equality;
-  auto probe_view             = table_device_view::create(probe, stream);
-  auto build_view             = table_device_view::create(build, stream);
-  auto left_conditional_view  = table_device_view::create(left_conditional, stream);
-  auto right_conditional_view = table_device_view::create(right_conditional, stream);
-
-  auto const preprocessed_build =
-    experimental::row::equality::preprocessed_table::create(build, stream);
-  auto const preprocessed_probe =
-    experimental::row::equality::preprocessed_table::create(probe, stream);
-  auto const row_comparator =
-    cudf::experimental::row::equality::two_table_comparator{preprocessed_probe, preprocessed_build};
-  auto const equality_probe = row_comparator.equal_to<false>(has_nulls, compare_nulls);
-
-  semi_map_type hash_table{compute_hash_table_size(build.num_rows()),
-                           cuco::empty_key{std::numeric_limits<hash_value_type>::max()},
-                           cuco::empty_value{cudf::detail::JoinNoneValue},
-                           cudf::detail::cuco_allocator{stream},
-                           stream.value()};
-
-  // Create hash table containing all keys found in right table
-  // TODO: To add support for nested columns we will need to flatten in many
-  // places. However, this probably isn't worth adding any time soon since we
-  // won't be able to support AST conditions for those types anyway.
-  auto const build_nulls    = cudf::nullate::DYNAMIC{cudf::has_nulls(build)};
-  auto const row_hash_build = cudf::experimental::row::hash::row_hasher{preprocessed_build};
-  auto const hash_build     = row_hash_build.device_hasher(build_nulls);
-  // Since we may see multiple rows that are identical in the equality tables
-  // but differ in the conditional tables, the equality comparator used for
-  // insertion must account for both sets of tables. An alternative solution
-  // would be to use a multimap, but that solution would store duplicates where
-  // equality and conditional rows are equal, so this approach is preferable.
-  // One way to make this solution even more efficient would be to only include
-  // the columns of the conditional table that are used by the expression, but
-  // that requires additional plumbing through the AST machinery and is out of
-  // scope for now.
-  auto const row_comparator_build =
-    cudf::experimental::row::equality::two_table_comparator{preprocessed_build, preprocessed_build};
-  auto const equality_build_equality =
-    row_comparator_build.equal_to<false>(build_nulls, compare_nulls);
-  auto const preprocessed_build_condtional =
-    experimental::row::equality::preprocessed_table::create(
-      swap_tables ? left_conditional : right_conditional, stream);
-  auto const row_comparator_conditional_build =
-    cudf::experimental::row::equality::two_table_comparator{preprocessed_build_condtional,
-                                                            preprocessed_build_condtional};
-  auto const equality_build_conditional =
-    row_comparator_conditional_build.equal_to<false>(build_nulls, compare_nulls);
-  double_row_equality equality_build{equality_build_equality, equality_build_conditional};
-  make_pair_function_semi pair_func_build{};
-
-  auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func_build);
-
-  // skip rows that are null here.
-  if ((compare_nulls == null_equality::EQUAL) or (not nullable(build))) {
-    hash_table.insert(iter, iter + right_num_rows, hash_build, equality_build, stream.value());
-  } else {
-    thrust::counting_iterator<cudf::size_type> stencil(0);
-    auto const [row_bitmask, _] =
-      cudf::detail::bitmask_and(build, stream, rmm::mr::get_current_device_resource());
-    row_is_valid pred{static_cast<bitmask_type const*>(row_bitmask.data())};
-
-    // insert valid rows
-    hash_table.insert_if(
-      iter, iter + right_num_rows, stencil, pred, hash_build, equality_build, stream.value());
-  }
-
-  auto hash_table_view = hash_table.get_device_view();
-
-  // For inner joins we support optimizing the join by launching one thread for
-  // whichever table is larger rather than always using the left table.
-  detail::grid_1d const config(outer_num_rows, DEFAULT_JOIN_BLOCK_SIZE);
-  auto const shmem_size_per_block = parser.shmem_per_thread * config.num_threads_per_block;
-
-  // Allocate storage for the counter used to get the size of the join output
-  rmm::device_scalar<std::size_t> size(0, stream, mr);
-
-  auto const row_hash   = cudf::experimental::row::hash::row_hasher{preprocessed_probe};
-  auto const hash_probe = row_hash.device_hasher(has_nulls);
-
-  // Determine number of output rows without actually building the output to simply
-  // find what the size of the output will be.
-  if (has_nulls) {
-    compute_mixed_join_output_size_semi<DEFAULT_JOIN_BLOCK_SIZE, true>
-      <<<config.num_blocks, config.num_threads_per_block, shmem_size_per_block, stream.value()>>>(
-        *left_conditional_view,
-        *right_conditional_view,
-        *probe_view,
-        *build_view,
-        hash_probe,
-        equality_probe,
-        join_type,
-        hash_table_view,
-        parser.device_expression_data,
-        swap_tables,
-        size.data(),
-        matches_per_row_span);
-  } else {
-    compute_mixed_join_output_size_semi<DEFAULT_JOIN_BLOCK_SIZE, false>
-      <<<config.num_blocks, config.num_threads_per_block, shmem_size_per_block, stream.value()>>>(
-        *left_conditional_view,
-        *right_conditional_view,
-        *probe_view,
-        *build_view,
-        hash_probe,
-        equality_probe,
-        join_type,
-        hash_table_view,
-        parser.device_expression_data,
-        swap_tables,
-        size.data(),
-        matches_per_row_span);
-  }
-
-  return {size.value(stream), std::move(matches_per_row)};
+  auto gather_map = std::make_unique<rmm::device_uvector<size_type>>(probe.num_rows(), stream, mr);
+
+  // gather_map_end will be the end of valid data in gather_map
+  auto gather_map_end =
+    thrust::copy_if(rmm::exec_policy(stream),
+                    thrust::counting_iterator<size_type>(0),
+                    thrust::counting_iterator<size_type>(probe.num_rows()),
+                    left_table_keep_mask.begin(),
+                    gather_map->begin(),
+                    [join_type] __device__(bool keep_row) {
+                      return keep_row == (join_type == detail::join_kind::LEFT_SEMI_JOIN);
+                    });
+
+  gather_map->resize(thrust::distance(gather_map->begin(), gather_map_end), stream);
+  return gather_map;
 }
 
 }  // namespace detail
 
-std::pair<std::size_t, std::unique_ptr<rmm::device_uvector<size_type>>> mixed_left_semi_join_size(
-  table_view const& left_equality,
-  table_view const& right_equality,
-  table_view const& left_conditional,
-  table_view const& right_conditional,
-  ast::expression const& binary_predicate,
-  null_equality compare_nulls,
-  rmm::mr::device_memory_resource* mr)
-{
-  CUDF_FUNC_RANGE();
-  return detail::compute_mixed_join_output_size_semi(left_equality,
-                                                     right_equality,
-                                                     left_conditional,
-                                                     right_conditional,
-                                                     binary_predicate,
-                                                     compare_nulls,
-                                                     detail::join_kind::LEFT_SEMI_JOIN,
-                                                     cudf::get_default_stream(),
-                                                     mr);
-}
-
 std::unique_ptr<rmm::device_uvector<size_type>> mixed_left_semi_join(
   table_view const& left_equality,
   table_view const& right_equality,
@@ -564,7 +279,6 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_left_semi_join(
   table_view const& right_conditional,
   ast::expression const& binary_predicate,
   null_equality compare_nulls,
-  std::optional<std::pair<std::size_t, device_span<size_type const>>> output_size_data,
   rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
@@ -575,32 +289,10 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_left_semi_join(
                                  binary_predicate,
                                  compare_nulls,
                                  detail::join_kind::LEFT_SEMI_JOIN,
-                                 output_size_data,
                                  cudf::get_default_stream(),
                                  mr);
 }
 
-std::pair<std::size_t, std::unique_ptr<rmm::device_uvector<size_type>>> mixed_left_anti_join_size(
-  table_view const& left_equality,
-  table_view const& right_equality,
-  table_view const& left_conditional,
-  table_view const& right_conditional,
-  ast::expression const& binary_predicate,
-  null_equality compare_nulls,
-  rmm::mr::device_memory_resource* mr)
-{
-  CUDF_FUNC_RANGE();
-  return detail::compute_mixed_join_output_size_semi(left_equality,
-                                                     right_equality,
-                                                     left_conditional,
-                                                     right_conditional,
-                                                     binary_predicate,
-                                                     compare_nulls,
-                                                     detail::join_kind::LEFT_ANTI_JOIN,
-                                                     cudf::get_default_stream(),
-                                                     mr);
-}
-
 std::unique_ptr<rmm::device_uvector<size_type>> mixed_left_anti_join(
   table_view const& left_equality,
   table_view const& right_equality,
@@ -608,7 +300,6 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_left_anti_join(
   table_view const& right_conditional,
   ast::expression const& binary_predicate,
   null_equality compare_nulls,
-  std::optional<std::pair<std::size_t, device_span<size_type const>>> output_size_data,
   rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
@@ -619,7 +310,6 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_left_anti_join(
                                  binary_predicate,
                                  compare_nulls,
                                  detail::join_kind::LEFT_ANTI_JOIN,
-                                 output_size_data,
                                  cudf::get_default_stream(),
                                  mr);
 }
diff --git a/cpp/src/join/mixed_join_size_kernels_semi.cu b/cpp/src/join/mixed_join_size_kernels_semi.cu
deleted file mode 100644
index 7a22ac60710..00000000000
--- a/cpp/src/join/mixed_join_size_kernels_semi.cu
+++ /dev/null
@@ -1,125 +0,0 @@
-/*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "join/join_common_utils.cuh"
-#include "join/join_common_utils.hpp"
-#include "join/mixed_join_common_utils.cuh"
-
-#include <cudf/ast/detail/expression_evaluator.cuh>
-#include <cudf/ast/detail/expression_parser.hpp>
-#include <cudf/detail/utilities/cuda.cuh>
-#include <cudf/table/table_device_view.cuh>
-#include <cudf/utilities/span.hpp>
-
-#include <cub/cub.cuh>
-
-namespace cudf {
-namespace detail {
-
-namespace cg = cooperative_groups;
-
-#pragma GCC diagnostic ignored "-Wattributes"
-
-template <int block_size, bool has_nulls>
-__attribute__((visibility("hidden"))) __launch_bounds__(block_size) __global__
-  void compute_mixed_join_output_size_semi(
-    table_device_view left_table,
-    table_device_view right_table,
-    table_device_view probe,
-    table_device_view build,
-    row_hash const hash_probe,
-    row_equality const equality_probe,
-    join_kind const join_type,
-    cudf::detail::semi_map_type::device_view hash_table_view,
-    ast::detail::expression_device_view device_expression_data,
-    bool const swap_tables,
-    std::size_t* output_size,
-    cudf::device_span<cudf::size_type> matches_per_row)
-{
-  // The (required) extern storage of the shared memory array leads to
-  // conflicting declarations between different templates. The easiest
-  // workaround is to declare an arbitrary (here char) array type then cast it
-  // after the fact to the appropriate type.
-  extern __shared__ char raw_intermediate_storage[];
-  cudf::ast::detail::IntermediateDataType<has_nulls>* intermediate_storage =
-    reinterpret_cast<cudf::ast::detail::IntermediateDataType<has_nulls>*>(raw_intermediate_storage);
-  auto thread_intermediate_storage =
-    intermediate_storage + (threadIdx.x * device_expression_data.num_intermediates);
-
-  std::size_t thread_counter{0};
-  cudf::size_type const start_idx      = threadIdx.x + blockIdx.x * block_size;
-  cudf::size_type const stride         = block_size * gridDim.x;
-  cudf::size_type const left_num_rows  = left_table.num_rows();
-  cudf::size_type const right_num_rows = right_table.num_rows();
-  auto const outer_num_rows            = (swap_tables ? right_num_rows : left_num_rows);
-
-  auto evaluator = cudf::ast::detail::expression_evaluator<has_nulls>(
-    left_table, right_table, device_expression_data);
-
-  // TODO: Address asymmetry in operator.
-  auto equality = single_expression_equality<has_nulls>{
-    evaluator, thread_intermediate_storage, swap_tables, equality_probe};
-
-  for (cudf::size_type outer_row_index = start_idx; outer_row_index < outer_num_rows;
-       outer_row_index += stride) {
-    matches_per_row[outer_row_index] =
-      ((join_type == join_kind::LEFT_ANTI_JOIN) !=
-       (hash_table_view.contains(outer_row_index, hash_probe, equality)));
-    thread_counter += matches_per_row[outer_row_index];
-  }
-
-  using BlockReduce = cub::BlockReduce<cudf::size_type, block_size>;
-  __shared__ typename BlockReduce::TempStorage temp_storage;
-  std::size_t block_counter = BlockReduce(temp_storage).Sum(thread_counter);
-
-  // Add block counter to global counter
-  if (threadIdx.x == 0) {
-    cuda::atomic_ref<std::size_t, cuda::thread_scope_device> ref{*output_size};
-    ref.fetch_add(block_counter, cuda::std::memory_order_relaxed);
-  }
-}
-
-template __global__ void compute_mixed_join_output_size_semi<DEFAULT_JOIN_BLOCK_SIZE, true>(
-  table_device_view left_table,
-  table_device_view right_table,
-  table_device_view probe,
-  table_device_view build,
-  row_hash const hash_probe,
-  row_equality const equality_probe,
-  join_kind const join_type,
-  cudf::detail::semi_map_type::device_view hash_table_view,
-  ast::detail::expression_device_view device_expression_data,
-  bool const swap_tables,
-  std::size_t* output_size,
-  cudf::device_span<cudf::size_type> matches_per_row);
-
-template __global__ void compute_mixed_join_output_size_semi<DEFAULT_JOIN_BLOCK_SIZE, false>(
-  table_device_view left_table,
-  table_device_view right_table,
-  table_device_view probe,
-  table_device_view build,
-  row_hash const hash_probe,
-  row_equality const equality_probe,
-  join_kind const join_type,
-  cudf::detail::semi_map_type::device_view hash_table_view,
-  ast::detail::expression_device_view device_expression_data,
-  bool const swap_tables,
-  std::size_t* output_size,
-  cudf::device_span<cudf::size_type> matches_per_row);
-
-}  // namespace detail
-
-}  // namespace cudf
diff --git a/cpp/tests/join/mixed_join_tests.cu b/cpp/tests/join/mixed_join_tests.cu
index cc37dadffd8..6c147c8a128 100644
--- a/cpp/tests/join/mixed_join_tests.cu
+++ b/cpp/tests/join/mixed_join_tests.cu
@@ -657,10 +657,6 @@ struct MixedJoinSingleReturnTest : public MixedJoinTest<T> {
                      std::vector<cudf::size_type> expected_outputs,
                      cudf::null_equality compare_nulls = cudf::null_equality::EQUAL)
   {
-    auto [result_size, actual_counts] = this->join_size(
-      left_equality, right_equality, left_conditional, right_conditional, predicate, compare_nulls);
-    EXPECT_TRUE(result_size == expected_outputs.size());
-
     auto result = this->join(
       left_equality, right_equality, left_conditional, right_conditional, predicate, compare_nulls);
     std::vector<cudf::size_type> resulting_indices;
@@ -751,19 +747,6 @@ struct MixedJoinSingleReturnTest : public MixedJoinTest<T> {
                                 cudf::table_view right_conditional,
                                 cudf::ast::operation predicate,
                                 cudf::null_equality compare_nulls = cudf::null_equality::EQUAL) = 0;
-
-  /**
-   * This method must be implemented by subclasses for specific types of joins.
-   * It should be a simply forwarding of arguments to the appropriate cudf
-   * mixed join size computation API.
-   */
-  virtual std::pair<std::size_t, std::unique_ptr<rmm::device_uvector<cudf::size_type>>> join_size(
-    cudf::table_view left_equality,
-    cudf::table_view right_equality,
-    cudf::table_view left_conditional,
-    cudf::table_view right_conditional,
-    cudf::ast::operation predicate,
-    cudf::null_equality compare_nulls = cudf::null_equality::EQUAL) = 0;
 };
 
 /**
@@ -781,18 +764,6 @@ struct MixedLeftSemiJoinTest : public MixedJoinSingleReturnTest<T> {
     return cudf::mixed_left_semi_join(
       left_equality, right_equality, left_conditional, right_conditional, predicate, compare_nulls);
   }
-
-  std::pair<std::size_t, std::unique_ptr<rmm::device_uvector<cudf::size_type>>> join_size(
-    cudf::table_view left_equality,
-    cudf::table_view right_equality,
-    cudf::table_view left_conditional,
-    cudf::table_view right_conditional,
-    cudf::ast::operation predicate,
-    cudf::null_equality compare_nulls = cudf::null_equality::EQUAL) override
-  {
-    return cudf::mixed_left_semi_join_size(
-      left_equality, right_equality, left_conditional, right_conditional, predicate, compare_nulls);
-  }
 };
 
 TYPED_TEST_SUITE(MixedLeftSemiJoinTest, cudf::test::IntegralTypesNotBool);
@@ -874,18 +845,6 @@ struct MixedLeftAntiJoinTest : public MixedJoinSingleReturnTest<T> {
     return cudf::mixed_left_anti_join(
       left_equality, right_equality, left_conditional, right_conditional, predicate, compare_nulls);
   }
-
-  std::pair<std::size_t, std::unique_ptr<rmm::device_uvector<cudf::size_type>>> join_size(
-    cudf::table_view left_equality,
-    cudf::table_view right_equality,
-    cudf::table_view left_conditional,
-    cudf::table_view right_conditional,
-    cudf::ast::operation predicate,
-    cudf::null_equality compare_nulls = cudf::null_equality::EQUAL) override
-  {
-    return cudf::mixed_left_anti_join_size(
-      left_equality, right_equality, left_conditional, right_conditional, predicate, compare_nulls);
-  }
 };
 
 TYPED_TEST_SUITE(MixedLeftAntiJoinTest, cudf::test::IntegralTypesNotBool);
diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java
index 5ce2f9d2d6e..4038b3a40b8 100644
--- a/java/src/main/java/ai/rapids/cudf/Table.java
+++ b/java/src/main/java/ai/rapids/cudf/Table.java
@@ -732,32 +732,14 @@ private static native long[] mixedFullJoinGatherMaps(long leftKeysTable, long ri
                                                        long leftConditionTable, long rightConditionTable,
                                                        long condition, boolean compareNullsEqual);
 
-  private static native long[] mixedLeftSemiJoinSize(long leftKeysTable, long rightKeysTable,
-                                                     long leftConditionTable, long rightConditionTable,
-                                                     long condition, boolean compareNullsEqual);
-
   private static native long[] mixedLeftSemiJoinGatherMap(long leftKeysTable, long rightKeysTable,
                                                           long leftConditionTable, long rightConditionTable,
                                                           long condition, boolean compareNullsEqual);
 
-  private static native long[] mixedLeftSemiJoinGatherMapWithSize(long leftKeysTable, long rightKeysTable,
-                                                                  long leftConditionTable, long rightConditionTable,
-                                                                  long condition, boolean compareNullsEqual,
-                                                                  long outputRowCount, long matchesColumnView);
-
-  private static native long[] mixedLeftAntiJoinSize(long leftKeysTable, long rightKeysTable,
-                                                     long leftConditionTable, long rightConditionTable,
-                                                     long condition, boolean compareNullsEqual);
-
   private static native long[] mixedLeftAntiJoinGatherMap(long leftKeysTable, long rightKeysTable,
                                                           long leftConditionTable, long rightConditionTable,
                                                           long condition, boolean compareNullsEqual);
 
-  private static native long[] mixedLeftAntiJoinGatherMapWithSize(long leftKeysTable, long rightKeysTable,
-                                                                  long leftConditionTable, long rightConditionTable,
-                                                                  long condition, boolean compareNullsEqual,
-                                                                  long outputRowCount, long matchesColumnView);
-
   private static native long[] crossJoin(long leftTable, long rightTable) throws CudfException;
 
   private static native long[] concatenate(long[] cudfTablePointers) throws CudfException;
@@ -3747,34 +3729,6 @@ public GatherMap conditionalLeftSemiJoinGatherMap(Table rightTable,
     return buildSingleJoinGatherMap(gatherMapData);
   }
 
-  /**
-   * Computes output size information for a left semi join between two tables using a mix of
-   * equality and inequality conditions. The entire join condition is assumed to be a logical AND
-   * of the equality condition and inequality condition.
-   * NOTE: It is the responsibility of the caller to close the resulting size information object
-   * or native resources can be leaked!
-   * @param leftKeys the left table's key columns for the equality condition
-   * @param rightKeys the right table's key columns for the equality condition
-   * @param leftConditional the left table's columns needed to evaluate the inequality condition
-   * @param rightConditional the right table's columns needed to evaluate the inequality condition
-   * @param condition the inequality condition of the join
-   * @param nullEquality whether nulls should compare as equal
-   * @return size information for the join
-   */
-  public static MixedJoinSize mixedLeftSemiJoinSize(Table leftKeys, Table rightKeys,
-                                                    Table leftConditional, Table rightConditional,
-                                                    CompiledExpression condition,
-                                                    NullEquality nullEquality) {
-    long[] mixedSizeInfo = mixedLeftSemiJoinSize(
-        leftKeys.getNativeView(), rightKeys.getNativeView(),
-        leftConditional.getNativeView(), rightConditional.getNativeView(),
-        condition.getNativeHandle(), nullEquality == NullEquality.EQUAL);
-    assert mixedSizeInfo.length == 2;
-    long outputRowCount = mixedSizeInfo[0];
-    long matchesColumnHandle = mixedSizeInfo[1];
-    return new MixedJoinSize(outputRowCount, new ColumnVector(matchesColumnHandle));
-  }
-
   /**
    * Computes the gather map that can be used to manifest the result of a left semi join between
    * two tables using a mix of equality and inequality conditions. The entire join condition is
@@ -3804,42 +3758,6 @@ public static GatherMap mixedLeftSemiJoinGatherMap(Table leftKeys, Table rightKe
     return buildSingleJoinGatherMap(gatherMapData);
   }
 
-  /**
-   * Computes the gather map that can be used to manifest the result of a left semi join between
-   * two tables using a mix of equality and inequality conditions. The entire join condition is
-   * assumed to be a logical AND of the equality condition and inequality condition.
-   * A {@link GatherMap} instance will be returned that can be used to gather
-   * the left table to produce the result of the left semi join.
-   *
-   * It is the responsibility of the caller to close the resulting gather map instances.
-   *
-   * This interface allows passing the size result from
-   * {@link #mixedLeftSemiJoinSize(Table, Table, Table, Table, CompiledExpression, NullEquality)}
-   * when the output size was computed previously.
-   *
-   * @param leftKeys the left table's key columns for the equality condition
-   * @param rightKeys the right table's key columns for the equality condition
-   * @param leftConditional the left table's columns needed to evaluate the inequality condition
-   * @param rightConditional the right table's columns needed to evaluate the inequality condition
-   * @param condition the inequality condition of the join
-   * @param nullEquality whether nulls should compare as equal
-   * @param joinSize mixed join size result
-   * @return left and right table gather maps
-   */
-  public static GatherMap mixedLeftSemiJoinGatherMap(Table leftKeys, Table rightKeys,
-                                                     Table leftConditional, Table rightConditional,
-                                                     CompiledExpression condition,
-                                                     NullEquality nullEquality,
-                                                     MixedJoinSize joinSize) {
-    long[] gatherMapData = mixedLeftSemiJoinGatherMapWithSize(
-        leftKeys.getNativeView(), rightKeys.getNativeView(),
-        leftConditional.getNativeView(), rightConditional.getNativeView(),
-        condition.getNativeHandle(),
-        nullEquality == NullEquality.EQUAL,
-        joinSize.getOutputRowCount(), joinSize.getMatches().getNativeView());
-    return buildSingleJoinGatherMap(gatherMapData);
-  }
-
   /**
    * Computes the gather map that can be used to manifest the result of a left anti-join between
    * two tables. It is assumed this table instance holds the key columns from the left table, and
@@ -3919,34 +3837,6 @@ public GatherMap conditionalLeftAntiJoinGatherMap(Table rightTable,
     return buildSingleJoinGatherMap(gatherMapData);
   }
 
-  /**
-   * Computes output size information for a left anti join between two tables using a mix of
-   * equality and inequality conditions. The entire join condition is assumed to be a logical AND
-   * of the equality condition and inequality condition.
-   * NOTE: It is the responsibility of the caller to close the resulting size information object
-   * or native resources can be leaked!
-   * @param leftKeys the left table's key columns for the equality condition
-   * @param rightKeys the right table's key columns for the equality condition
-   * @param leftConditional the left table's columns needed to evaluate the inequality condition
-   * @param rightConditional the right table's columns needed to evaluate the inequality condition
-   * @param condition the inequality condition of the join
-   * @param nullEquality whether nulls should compare as equal
-   * @return size information for the join
-   */
-  public static MixedJoinSize mixedLeftAntiJoinSize(Table leftKeys, Table rightKeys,
-                                                    Table leftConditional, Table rightConditional,
-                                                    CompiledExpression condition,
-                                                    NullEquality nullEquality) {
-    long[] mixedSizeInfo = mixedLeftAntiJoinSize(
-        leftKeys.getNativeView(), rightKeys.getNativeView(),
-        leftConditional.getNativeView(), rightConditional.getNativeView(),
-        condition.getNativeHandle(), nullEquality == NullEquality.EQUAL);
-    assert mixedSizeInfo.length == 2;
-    long outputRowCount = mixedSizeInfo[0];
-    long matchesColumnHandle = mixedSizeInfo[1];
-    return new MixedJoinSize(outputRowCount, new ColumnVector(matchesColumnHandle));
-  }
-
   /**
    * Computes the gather map that can be used to manifest the result of a left anti join between
    * two tables using a mix of equality and inequality conditions. The entire join condition is
@@ -3976,42 +3866,6 @@ public static GatherMap mixedLeftAntiJoinGatherMap(Table leftKeys, Table rightKe
     return buildSingleJoinGatherMap(gatherMapData);
   }
 
-  /**
-   * Computes the gather map that can be used to manifest the result of a left anti join between
-   * two tables using a mix of equality and inequality conditions. The entire join condition is
-   * assumed to be a logical AND of the equality condition and inequality condition.
-   * A {@link GatherMap} instance will be returned that can be used to gather
-   * the left table to produce the result of the left anti join.
-   *
-   * It is the responsibility of the caller to close the resulting gather map instances.
-   *
-   * This interface allows passing the size result from
-   * {@link #mixedLeftAntiJoinSize(Table, Table, Table, Table, CompiledExpression, NullEquality)}
-   * when the output size was computed previously.
-   *
-   * @param leftKeys the left table's key columns for the equality condition
-   * @param rightKeys the right table's key columns for the equality condition
-   * @param leftConditional the left table's columns needed to evaluate the inequality condition
-   * @param rightConditional the right table's columns needed to evaluate the inequality condition
-   * @param condition the inequality condition of the join
-   * @param nullEquality whether nulls should compare as equal
-   * @param joinSize mixed join size result
-   * @return left and right table gather maps
-   */
-  public static GatherMap mixedLeftAntiJoinGatherMap(Table leftKeys, Table rightKeys,
-                                                     Table leftConditional, Table rightConditional,
-                                                     CompiledExpression condition,
-                                                     NullEquality nullEquality,
-                                                     MixedJoinSize joinSize) {
-    long[] gatherMapData = mixedLeftAntiJoinGatherMapWithSize(
-        leftKeys.getNativeView(), rightKeys.getNativeView(),
-        leftConditional.getNativeView(), rightConditional.getNativeView(),
-        condition.getNativeHandle(),
-        nullEquality == NullEquality.EQUAL,
-        joinSize.getOutputRowCount(), joinSize.getMatches().getNativeView());
-    return buildSingleJoinGatherMap(gatherMapData);
-  }
-
   /**
    * Construct a table from a packed representation.
    * @param metadata host-based metadata for the table
diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
index 51b8eb853de..e8616710217 100644
--- a/java/src/main/native/src/TableJni.cpp
+++ b/java/src/main/native/src/TableJni.cpp
@@ -2838,20 +2838,6 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_conditionalLeftSemiJoinGa
       });
 }
 
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_mixedLeftSemiJoinSize(
-    JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jlong j_left_condition,
-    jlong j_right_condition, jlong j_condition, jboolean j_nulls_equal) {
-  return cudf::jni::mixed_join_size(
-      env, j_left_keys, j_right_keys, j_left_condition, j_right_condition, j_condition,
-      j_nulls_equal,
-      [](cudf::table_view const &left_keys, cudf::table_view const &right_keys,
-         cudf::table_view const &left_condition, cudf::table_view const &right_condition,
-         cudf::ast::expression const &condition, cudf::null_equality nulls_equal) {
-        return cudf::mixed_left_semi_join_size(left_keys, right_keys, left_condition,
-                                               right_condition, condition, nulls_equal);
-      });
-}
-
 JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_mixedLeftSemiJoinGatherMap(
     JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jlong j_left_condition,
     jlong j_right_condition, jlong j_condition, jboolean j_nulls_equal) {
@@ -2866,22 +2852,6 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_mixedLeftSemiJoinGatherMa
       });
 }
 
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_mixedLeftSemiJoinGatherMapWithSize(
-    JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jlong j_left_condition,
-    jlong j_right_condition, jlong j_condition, jboolean j_nulls_equal, jlong j_output_row_count,
-    jlong j_matches_view) {
-  auto size_info = cudf::jni::get_mixed_size_info(env, j_output_row_count, j_matches_view);
-  return cudf::jni::mixed_join_gather_single_map(
-      env, j_left_keys, j_right_keys, j_left_condition, j_right_condition, j_condition,
-      j_nulls_equal,
-      [&size_info](cudf::table_view const &left_keys, cudf::table_view const &right_keys,
-                   cudf::table_view const &left_condition, cudf::table_view const &right_condition,
-                   cudf::ast::expression const &condition, cudf::null_equality nulls_equal) {
-        return cudf::mixed_left_semi_join(left_keys, right_keys, left_condition, right_condition,
-                                          condition, nulls_equal, size_info);
-      });
-}
-
 JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftAntiJoinGatherMap(
     JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jboolean compare_nulls_equal) {
   return cudf::jni::join_gather_single_map(
@@ -2930,20 +2900,6 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_conditionalLeftAntiJoinGa
       });
 }
 
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_mixedLeftAntiJoinSize(
-    JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jlong j_left_condition,
-    jlong j_right_condition, jlong j_condition, jboolean j_nulls_equal) {
-  return cudf::jni::mixed_join_size(
-      env, j_left_keys, j_right_keys, j_left_condition, j_right_condition, j_condition,
-      j_nulls_equal,
-      [](cudf::table_view const &left_keys, cudf::table_view const &right_keys,
-         cudf::table_view const &left_condition, cudf::table_view const &right_condition,
-         cudf::ast::expression const &condition, cudf::null_equality nulls_equal) {
-        return cudf::mixed_left_anti_join_size(left_keys, right_keys, left_condition,
-                                               right_condition, condition, nulls_equal);
-      });
-}
-
 JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_mixedLeftAntiJoinGatherMap(
     JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jlong j_left_condition,
     jlong j_right_condition, jlong j_condition, jboolean j_nulls_equal) {
@@ -2958,22 +2914,6 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_mixedLeftAntiJoinGatherMa
       });
 }
 
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_mixedLeftAntiJoinGatherMapWithSize(
-    JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jlong j_left_condition,
-    jlong j_right_condition, jlong j_condition, jboolean j_nulls_equal, jlong j_output_row_count,
-    jlong j_matches_view) {
-  auto size_info = cudf::jni::get_mixed_size_info(env, j_output_row_count, j_matches_view);
-  return cudf::jni::mixed_join_gather_single_map(
-      env, j_left_keys, j_right_keys, j_left_condition, j_right_condition, j_condition,
-      j_nulls_equal,
-      [&size_info](cudf::table_view const &left_keys, cudf::table_view const &right_keys,
-                   cudf::table_view const &left_condition, cudf::table_view const &right_condition,
-                   cudf::ast::expression const &condition, cudf::null_equality nulls_equal) {
-        return cudf::mixed_left_anti_join(left_keys, right_keys, left_condition, right_condition,
-                                          condition, nulls_equal, size_info);
-      });
-}
-
 JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_crossJoin(JNIEnv *env, jclass,
                                                                  jlong left_table,
                                                                  jlong right_table) {
diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java
index 30905783c7f..8560a9caad7 100644
--- a/java/src/test/java/ai/rapids/cudf/TableTest.java
+++ b/java/src/test/java/ai/rapids/cudf/TableTest.java
@@ -3058,64 +3058,6 @@ void testMixedLeftSemiJoinGatherMapNulls() {
     }
   }
 
-  @Test
-  void testMixedLeftSemiJoinGatherMapWithSize() {
-    BinaryOperation expr = new BinaryOperation(BinaryOperator.GREATER,
-        new ColumnReference(1, TableReference.LEFT),
-        new ColumnReference(1, TableReference.RIGHT));
-    try (CompiledExpression condition = expr.compile();
-         Table left = new Table.TestBuilder()
-             .column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8)
-             .column(1, 2, 3, 4, 5, 6, 7, 8, 9, 0)
-             .build();
-         Table leftKeys = new Table(left.getColumn(0));
-         Table right = new Table.TestBuilder()
-             .column(6, 5, 9, 8, 10, 32)
-             .column(0, 1, 2, 3, 4, 5)
-             .column(7, 8, 9, 0, 1, 2).build();
-         Table rightKeys = new Table(right.getColumn(0));
-         Table expected = new Table.TestBuilder()
-             .column(2, 7, 8)
-             .build();
-         MixedJoinSize sizeInfo = Table.mixedLeftSemiJoinSize(leftKeys, rightKeys, left, right,
-             condition, NullEquality.UNEQUAL)) {
-      assertEquals(expected.getRowCount(), sizeInfo.getOutputRowCount());
-      try (GatherMap map = Table.mixedLeftSemiJoinGatherMap(leftKeys, rightKeys, left, right,
-          condition, NullEquality.UNEQUAL, sizeInfo)) {
-        verifySemiJoinGatherMap(map, expected);
-      }
-    }
-  }
-
-  @Test
-  void testMixedLeftSemiJoinGatherMapNullsWithSize() {
-    BinaryOperation expr = new BinaryOperation(BinaryOperator.GREATER,
-        new ColumnReference(1, TableReference.LEFT),
-        new ColumnReference(1, TableReference.RIGHT));
-    try (CompiledExpression condition = expr.compile();
-         Table left = new Table.TestBuilder()
-             .column(null, 3, 9, 0, 1, 7, 4, null, 5, 8)
-             .column(   1, 2, 3, 4, 5, 6, 7,    8, 9, 0)
-             .build();
-         Table leftKeys = new Table(left.getColumn(0));
-         Table right = new Table.TestBuilder()
-             .column(null, 5, null, 8, 10, 32)
-             .column(   0, 1,    2, 3,  4,  5)
-             .column(   7, 8,    9, 0,  1,  2).build();
-         Table rightKeys = new Table(right.getColumn(0));
-         Table expected = new Table.TestBuilder()
-             .column(0, 7, 8)
-             .build();
-         MixedJoinSize sizeInfo = Table.mixedLeftSemiJoinSize(leftKeys, rightKeys, left, right,
-             condition, NullEquality.EQUAL)) {
-      assertEquals(expected.getRowCount(), sizeInfo.getOutputRowCount());
-      try (GatherMap map = Table.mixedLeftSemiJoinGatherMap(leftKeys, rightKeys, left, right,
-          condition, NullEquality.EQUAL, sizeInfo)) {
-        verifySemiJoinGatherMap(map, expected);
-      }
-    }
-  }
-
   @Test
   void testMixedLeftAntiJoinGatherMap() {
     BinaryOperation expr = new BinaryOperation(BinaryOperator.GREATER,
@@ -3166,64 +3108,6 @@ void testMixedLeftAntiJoinGatherMapNulls() {
     }
   }
 
-  @Test
-  void testMixedLeftAntiJoinGatherMapWithSize() {
-    BinaryOperation expr = new BinaryOperation(BinaryOperator.GREATER,
-        new ColumnReference(1, TableReference.LEFT),
-        new ColumnReference(1, TableReference.RIGHT));
-    try (CompiledExpression condition = expr.compile();
-         Table left = new Table.TestBuilder()
-             .column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8)
-             .column(1, 2, 3, 4, 5, 6, 7, 8, 9, 0)
-             .build();
-         Table leftKeys = new Table(left.getColumn(0));
-         Table right = new Table.TestBuilder()
-             .column(6, 5, 9, 8, 10, 32)
-             .column(0, 1, 2, 3, 4, 5)
-             .column(7, 8, 9, 0, 1, 2).build();
-         Table rightKeys = new Table(right.getColumn(0));
-         Table expected = new Table.TestBuilder()
-             .column(0, 1, 3, 4, 5, 6, 9)
-             .build();
-         MixedJoinSize sizeInfo = Table.mixedLeftAntiJoinSize(leftKeys, rightKeys, left, right,
-             condition, NullEquality.UNEQUAL)) {
-      assertEquals(expected.getRowCount(), sizeInfo.getOutputRowCount());
-      try (GatherMap map = Table.mixedLeftAntiJoinGatherMap(leftKeys, rightKeys, left, right,
-          condition, NullEquality.UNEQUAL, sizeInfo)) {
-        verifySemiJoinGatherMap(map, expected);
-      }
-    }
-  }
-
-  @Test
-  void testMixedLeftAntiJoinGatherMapNullsWithSize() {
-    BinaryOperation expr = new BinaryOperation(BinaryOperator.GREATER,
-        new ColumnReference(1, TableReference.LEFT),
-        new ColumnReference(1, TableReference.RIGHT));
-    try (CompiledExpression condition = expr.compile();
-         Table left = new Table.TestBuilder()
-             .column(null, 3, 9, 0, 1, 7, 4, null, 5, 8)
-             .column(   1, 2, 3, 4, 5, 6, 7,    8, 9, 0)
-             .build();
-         Table leftKeys = new Table(left.getColumn(0));
-         Table right = new Table.TestBuilder()
-             .column(null, 5, null, 8, 10, 32)
-             .column(   0, 1,    2, 3,  4,  5)
-             .column(   7, 8,    9, 0,  1,  2).build();
-         Table rightKeys = new Table(right.getColumn(0));
-         Table expected = new Table.TestBuilder()
-             .column(1, 2, 3, 4, 5, 6, 9)
-             .build();
-         MixedJoinSize sizeInfo = Table.mixedLeftAntiJoinSize(leftKeys, rightKeys, left, right,
-             condition, NullEquality.EQUAL)) {
-      assertEquals(expected.getRowCount(), sizeInfo.getOutputRowCount());
-      try (GatherMap map = Table.mixedLeftAntiJoinGatherMap(leftKeys, rightKeys, left, right,
-          condition, NullEquality.EQUAL, sizeInfo)) {
-        verifySemiJoinGatherMap(map, expected);
-      }
-    }
-  }
-
   @Test
   void testLeftSemiJoinGatherMap() {
     try (Table leftKeys = new Table.TestBuilder().column(2, 3, 9, 0, 1, 7, 4, 6, 5, 8).build();

From 61dbfe8dc7635264465ce46d7de9e87ca0353267 Mon Sep 17 00:00:00 2001
From: Robert Maynard <rmaynard@nvidia.com>
Date: Thu, 4 Apr 2024 15:22:48 -0400
Subject: [PATCH 008/842] Allow jit compilation when using a splayed CUDA
 toolkit (#15451)

The `JitifyPreprocessKernels.cmake` module now handles when `CUDAToolkit_INCLUDE_DIRS` has multiple values correctly, allowing for compilation with splayed CUDA Toolkit installs.

Authors:
  - Robert Maynard (https://github.com/robertmaynard)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15451
---
 cpp/cmake/Modules/JitifyPreprocessKernels.cmake | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/cpp/cmake/Modules/JitifyPreprocessKernels.cmake b/cpp/cmake/Modules/JitifyPreprocessKernels.cmake
index 8c4e2b47fca..752c2028350 100644
--- a/cpp/cmake/Modules/JitifyPreprocessKernels.cmake
+++ b/cpp/cmake/Modules/JitifyPreprocessKernels.cmake
@@ -23,8 +23,9 @@ target_link_libraries(jitify_preprocess PUBLIC ${CMAKE_DL_LIBS})
 function(jit_preprocess_files)
   cmake_parse_arguments(ARG "" "SOURCE_DIRECTORY" "FILES" ${ARGN})
 
-  foreach(inc IN LISTS libcudacxx_raw_includes)
-    list(APPEND libcudacxx_includes "-I${inc}")
+  set(includes)
+  foreach(inc IN LISTS libcudacxx_raw_includes CUDAToolkit_INCLUDE_DIRS)
+    list(APPEND includes "-I${inc}")
   endforeach()
   foreach(ARG_FILE ${ARG_FILES})
     set(ARG_OUTPUT ${CUDF_GENERATED_INCLUDE_DIR}/include/jit_preprocessed_files/${ARG_FILE}.jit.hpp)
@@ -44,8 +45,7 @@ function(jit_preprocess_files)
         $<TARGET_FILE:jitify_preprocess> ${ARG_FILE} -o
         ${CUDF_GENERATED_INCLUDE_DIR}/include/jit_preprocessed_files -i -m -std=c++17
         -remove-unused-globals -D_FILE_OFFSET_BITS=64 -D__CUDACC_RTC__ -I${CUDF_SOURCE_DIR}/include
-        -I${CUDF_SOURCE_DIR}/src ${libcudacxx_includes} -I${CUDAToolkit_INCLUDE_DIRS}
-        --no-preinclude-workarounds --no-replace-pragma-once
+        -I${CUDF_SOURCE_DIR}/src ${includes} --no-preinclude-workarounds --no-replace-pragma-once
       COMMENT "Custom command to JIT-compile files."
     )
   endforeach()

From c0f84bf5bbc7262015c42588fc1f4fd2b8e1b6c1 Mon Sep 17 00:00:00 2001
From: Robert Maynard <rmaynard@nvidia.com>
Date: Thu, 4 Apr 2024 15:24:04 -0400
Subject: [PATCH 009/842] Allow consumers of static builds to find nanoarrow
 (#15456)

Allows consumers like spark-rapids to bring in libcudf static builds from the install and build trees.

Authors:
  - Robert Maynard (https://github.com/robertmaynard)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/15456
---
 cpp/cmake/thirdparty/get_nanoarrow.cmake      |  1 +
 .../thirdparty/patches/nanoarrow_cmake.diff   | 39 +++++++++++++++----
 2 files changed, 32 insertions(+), 8 deletions(-)

diff --git a/cpp/cmake/thirdparty/get_nanoarrow.cmake b/cpp/cmake/thirdparty/get_nanoarrow.cmake
index 4316db99a8d..884e5a2f368 100644
--- a/cpp/cmake/thirdparty/get_nanoarrow.cmake
+++ b/cpp/cmake/thirdparty/get_nanoarrow.cmake
@@ -49,6 +49,7 @@ function(find_and_configure_nanoarrow)
     OPTIONS "BUILD_SHARED_LIBS OFF" "NANOARROW_NAMESPACE cudf"
   )
   set_target_properties(nanoarrow PROPERTIES POSITION_INDEPENDENT_CODE ON)
+  rapids_export_find_package_root(BUILD nanoarrow "${nanoarrow_BINARY_DIR}" EXPORT_SET cudf-exports)
 endfunction()
 
 find_and_configure_nanoarrow(
diff --git a/cpp/cmake/thirdparty/patches/nanoarrow_cmake.diff b/cpp/cmake/thirdparty/patches/nanoarrow_cmake.diff
index b53e134ed2c..1262a38c0a4 100644
--- a/cpp/cmake/thirdparty/patches/nanoarrow_cmake.diff
+++ b/cpp/cmake/thirdparty/patches/nanoarrow_cmake.diff
@@ -1,5 +1,5 @@
 diff --git a/CMakeLists.txt b/CMakeLists.txt
-index 8714c70..1feec13 100644
+index 8714c70..6a9e505 100644
 --- a/CMakeLists.txt
 +++ b/CMakeLists.txt
 @@ -49,7 +49,6 @@ else()
@@ -10,7 +10,15 @@ index 8714c70..1feec13 100644
 
  # Avoids a warning about timestamps on downloaded files (prefer new policy
  # if available))
-@@ -111,6 +110,8 @@ if(NANOARROW_BUNDLE)
+@@ -59,6 +58,7 @@ endif()
+
+ configure_file(src/nanoarrow/nanoarrow_config.h.in generated/nanoarrow_config.h)
+
++include(GNUInstallDirs)
+ if(NANOARROW_BUNDLE)
+   # Combine all headers into amalgamation/nanoarrow.h in the build directory
+   file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/amalgamation)
+@@ -111,6 +111,8 @@ if(NANOARROW_BUNDLE)
    if(NANOARROW_BUILD_TESTS)
      include_directories(${CMAKE_BINARY_DIR}/amalgamation)
      add_library(nanoarrow ${NANOARROW_C_TEMP})
@@ -19,7 +27,7 @@ index 8714c70..1feec13 100644
      target_compile_definitions(nanoarrow PUBLIC "$<$<CONFIG:Debug>:NANOARROW_DEBUG>")
    endif()
 
-@@ -120,6 +121,7 @@ if(NANOARROW_BUNDLE)
+@@ -120,10 +122,11 @@ if(NANOARROW_BUNDLE)
  else()
    add_library(nanoarrow src/nanoarrow/array.c src/nanoarrow/schema.c
                          src/nanoarrow/array_stream.c src/nanoarrow/utils.c)
@@ -27,25 +35,31 @@ index 8714c70..1feec13 100644
 
    target_include_directories(nanoarrow
                               PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src>
-@@ -154,13 +156,50 @@ else()
+-                                    $<INSTALL_INTERFACE:include>)
++                                    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
+   target_include_directories(nanoarrow
+                              PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/generated>
+   )
+@@ -154,13 +157,49 @@ else()
      endif()
    endif()
 
 -  install(TARGETS nanoarrow DESTINATION lib)
 +  install(TARGETS nanoarrow
-+          DESTINATION lib
++          DESTINATION "${CMAKE_INSTALL_LIBDIR}"
 +          EXPORT nanoarrow-exports)
    install(DIRECTORY src/
-           DESTINATION include
+-          DESTINATION include
++          DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}"
            FILES_MATCHING
 -          PATTERN "*.h")
 +          PATTERN "*.h*")
    install(FILES ${CMAKE_CURRENT_BINARY_DIR}/generated/nanoarrow_config.h
-           DESTINATION include/nanoarrow)
+-          DESTINATION include/nanoarrow)
++          DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/nanoarrow")
 +
 +  # Generate package files for the build and install trees.
 +  include(CMakePackageConfigHelpers)
-+  include(GNUInstallDirs)
 +
 +  foreach(tree_type BUILD INSTALL)
 +    if(tree_type STREQUAL "BUILD")
@@ -80,6 +94,15 @@ index 8714c70..1feec13 100644
  endif()
 
  # Always build integration test if building tests
+@@ -171,7 +210,7 @@ if(NANOARROW_BUILD_TESTS OR NANOARROW_BUILD_INTEGRATION_TESTS)
+               src/nanoarrow/integration/c_data_integration.cc)
+   target_include_directories(nanoarrow_c_data_integration
+                              PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src>
+-                                    $<INSTALL_INTERFACE:include>)
++                                    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
+   target_link_libraries(nanoarrow_c_data_integration PRIVATE nanoarrow nlohmann_json)
+ endif()
+
 @@ -215,34 +254,18 @@ if(NANOARROW_BUILD_TESTS)
                   src/nanoarrow/integration/c_data_integration_test.cc)
 

From 8509054861f57379524982cc70db294d85a0dc5c Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 4 Apr 2024 16:09:45 -0400
Subject: [PATCH 010/842] Remove deprecated hash() and
 spark_murmurhash3_x86_32() (#15375)

Remove deprecated libcudf hash functions. The `cudf::hash()` and `cudf::hashing::spark_murmurhash3_x86_32()` were deprecated in previous releases. The `cudf::hash_partition()` function still relies on the enum `hash_id` so it has been moved from `hashing.cpp` to `partitioning.hpp`.
Calls to `cudf::hashing::spark_murmurhash3_x86_32()` were also removed from the JNI code.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - https://github.com/nvdbaranec
  - Jason Lowe (https://github.com/jlowe)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15375
---
 cpp/CMakeLists.txt                            |   2 -
 cpp/include/cudf/hashing.hpp                  |  52 --
 cpp/include/cudf/hashing/detail/hashing.hpp   |   5 -
 cpp/include/cudf/partitioning.hpp             |  10 +-
 cpp/src/hash/hashing.cu                       |  53 --
 cpp/src/hash/spark_murmurhash3_x86_32.cu      | 442 --------------
 .../hashing/spark_murmurhash3_x86_32_test.cpp | 576 ------------------
 .../partitioning/hash_partition_test.cpp      |  15 -
 .../java/ai/rapids/cudf/ColumnVector.java     |  44 +-
 .../main/java/ai/rapids/cudf/HashType.java    |   6 +-
 java/src/main/native/src/ColumnVectorJni.cpp  |  10 +-
 .../java/ai/rapids/cudf/ColumnVectorTest.java | 219 -------
 12 files changed, 18 insertions(+), 1416 deletions(-)
 delete mode 100644 cpp/src/hash/hashing.cu
 delete mode 100644 cpp/src/hash/spark_murmurhash3_x86_32.cu
 delete mode 100644 cpp/tests/hashing/spark_murmurhash3_x86_32_test.cpp

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 7c32474ea56..7d62e0acb10 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -346,7 +346,6 @@ add_library(
   src/groupby/sort/group_replace_nulls.cu
   src/groupby/sort/group_sum_scan.cu
   src/groupby/sort/sort_helper.cu
-  src/hash/hashing.cu
   src/hash/md5_hash.cu
   src/hash/murmurhash3_x86_32.cu
   src/hash/murmurhash3_x64_128.cu
@@ -355,7 +354,6 @@ add_library(
   src/hash/sha256_hash.cu
   src/hash/sha384_hash.cu
   src/hash/sha512_hash.cu
-  src/hash/spark_murmurhash3_x86_32.cu
   src/hash/xxhash_64.cu
   src/interop/dlpack.cpp
   src/interop/from_arrow.cu
diff --git a/cpp/include/cudf/hashing.hpp b/cpp/include/cudf/hashing.hpp
index 64a78da1803..83962b50a10 100644
--- a/cpp/include/cudf/hashing.hpp
+++ b/cpp/include/cudf/hashing.hpp
@@ -34,42 +34,11 @@ namespace cudf {
  */
 using hash_value_type = uint32_t;
 
-/**
- * @brief Identifies the hash function to be used
- *
- */
-enum class hash_id {
-  HASH_IDENTITY = 0,   ///< Identity hash function that simply returns the key to be hashed
-  HASH_MURMUR3,        ///< Murmur3 hash function
-  HASH_SPARK_MURMUR3,  ///< Spark Murmur3 hash function
-  HASH_MD5             ///< MD5 hash function
-};
-
 /**
  * @brief The default seed value for hash functions
  */
 static constexpr uint32_t DEFAULT_HASH_SEED = 0;
 
-/**
- * @brief Computes the hash value of each row in the input set of columns.
- *
- * @deprecated Since 23.08
- *
- * @param input The table of columns to hash
- * @param hash_function The hash function enum to use
- * @param seed Optional seed value to use for the hash function
- * @param stream CUDA stream used for device memory operations and kernel launches
- * @param mr Device memory resource used to allocate the returned column's device memory
- *
- * @returns A column where each row is the hash of a column from the input
- */
-[[deprecated]] std::unique_ptr<column> hash(
-  table_view const& input,
-  hash_id hash_function               = hash_id::HASH_MURMUR3,
-  uint32_t seed                       = DEFAULT_HASH_SEED,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
-
 //! Hash APIs
 namespace hashing {
 
@@ -112,27 +81,6 @@ std::unique_ptr<table> murmurhash3_x64_128(
   rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
-/**
- * @brief Computes the MurmurHash3 32-bit hash value of each row in the given table
- *
- * @deprecated Since 24.04
- *
- * This function computes the hash similar to MurmurHash3_x86_32 with special processing
- * to match Spark's implementation results.
- *
- * @param input The table of columns to hash
- * @param seed Optional seed value to use for the hash function
- * @param stream CUDA stream used for device memory operations and kernel launches
- * @param mr Device memory resource used to allocate the returned column's device memory
- *
- * @returns A column where each row is the hash of a row from the input
- */
-[[deprecated]] std::unique_ptr<column> spark_murmurhash3_x86_32(
-  table_view const& input,
-  uint32_t seed                       = DEFAULT_HASH_SEED,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
-
 /**
  * @brief Computes the MD5 hash value of each row in the given table
  *
diff --git a/cpp/include/cudf/hashing/detail/hashing.hpp b/cpp/include/cudf/hashing/detail/hashing.hpp
index eaeb5d6b068..88a43a64638 100644
--- a/cpp/include/cudf/hashing/detail/hashing.hpp
+++ b/cpp/include/cudf/hashing/detail/hashing.hpp
@@ -37,11 +37,6 @@ std::unique_ptr<table> murmurhash3_x64_128(table_view const& input,
                                            rmm::cuda_stream_view,
                                            rmm::mr::device_memory_resource* mr);
 
-std::unique_ptr<column> spark_murmurhash3_x86_32(table_view const& input,
-                                                 uint32_t seed,
-                                                 rmm::cuda_stream_view,
-                                                 rmm::mr::device_memory_resource* mr);
-
 std::unique_ptr<column> md5(table_view const& input,
                             rmm::cuda_stream_view stream,
                             rmm::mr::device_memory_resource* mr);
diff --git a/cpp/include/cudf/partitioning.hpp b/cpp/include/cudf/partitioning.hpp
index 2c91bdf64f5..7033aa500a2 100644
--- a/cpp/include/cudf/partitioning.hpp
+++ b/cpp/include/cudf/partitioning.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -33,6 +33,14 @@ namespace cudf {
  * @brief Column partitioning APIs
  */
 
+/**
+ * @brief Identifies the hash function to be used in hash partitioning
+ */
+enum class hash_id {
+  HASH_IDENTITY = 0,  ///< Identity hash function that simply returns the key to be hashed
+  HASH_MURMUR3        ///< Murmur3 hash function
+};
+
 /**
  * @brief Partitions rows of `t` according to the mapping specified by
  * `partition_map`.
diff --git a/cpp/src/hash/hashing.cu b/cpp/src/hash/hashing.cu
deleted file mode 100644
index 68e02ef3cf4..00000000000
--- a/cpp/src/hash/hashing.cu
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include <cudf/detail/nvtx/ranges.hpp>
-#include <cudf/hashing/detail/hashing.hpp>
-#include <cudf/utilities/default_stream.hpp>
-
-#include <rmm/cuda_stream_view.hpp>
-
-namespace cudf {
-namespace hashing {
-namespace detail {
-
-std::unique_ptr<column> hash(table_view const& input,
-                             hash_id hash_function,
-                             uint32_t seed,
-                             rmm::cuda_stream_view stream,
-                             rmm::mr::device_memory_resource* mr)
-{
-  switch (hash_function) {
-    case (hash_id::HASH_MURMUR3): return murmurhash3_x86_32(input, seed, stream, mr);
-    case (hash_id::HASH_SPARK_MURMUR3): return spark_murmurhash3_x86_32(input, seed, stream, mr);
-    case (hash_id::HASH_MD5): return md5(input, stream, mr);
-    default: CUDF_FAIL("Unsupported hash function.");
-  }
-}
-
-}  // namespace detail
-}  // namespace hashing
-
-std::unique_ptr<column> hash(table_view const& input,
-                             hash_id hash_function,
-                             uint32_t seed,
-                             rmm::cuda_stream_view stream,
-                             rmm::mr::device_memory_resource* mr)
-{
-  CUDF_FUNC_RANGE();
-  return hashing::detail::hash(input, hash_function, seed, stream, mr);
-}
-
-}  // namespace cudf
diff --git a/cpp/src/hash/spark_murmurhash3_x86_32.cu b/cpp/src/hash/spark_murmurhash3_x86_32.cu
deleted file mode 100644
index c7992b4afa0..00000000000
--- a/cpp/src/hash/spark_murmurhash3_x86_32.cu
+++ /dev/null
@@ -1,442 +0,0 @@
-/*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include <cudf/column/column_factories.hpp>
-#include <cudf/detail/nvtx/ranges.hpp>
-#include <cudf/detail/utilities/vector_factories.hpp>
-#include <cudf/hashing/detail/hash_functions.cuh>
-#include <cudf/hashing/detail/hashing.hpp>
-#include <cudf/table/experimental/row_operators.cuh>
-#include <cudf/table/table_device_view.cuh>
-
-#include <rmm/cuda_stream_view.hpp>
-#include <rmm/exec_policy.hpp>
-
-#include <thrust/iterator/counting_iterator.h>
-#include <thrust/tabulate.h>
-
-namespace cudf {
-namespace hashing {
-namespace detail {
-
-namespace {
-
-using spark_hash_value_type = int32_t;
-
-template <typename Key, CUDF_ENABLE_IF(not cudf::is_nested<Key>())>
-struct Spark_MurmurHash3_x86_32 {
-  using result_type = spark_hash_value_type;
-
-  constexpr Spark_MurmurHash3_x86_32() = default;
-  constexpr Spark_MurmurHash3_x86_32(uint32_t seed) : m_seed(seed) {}
-
-  [[nodiscard]] __device__ inline uint32_t fmix32(uint32_t h) const
-  {
-    h ^= h >> 16;
-    h *= 0x85ebca6b;
-    h ^= h >> 13;
-    h *= 0xc2b2ae35;
-    h ^= h >> 16;
-    return h;
-  }
-
-  [[nodiscard]] __device__ inline uint32_t getblock32(std::byte const* data,
-                                                      cudf::size_type offset) const
-  {
-    // Read a 4-byte value from the data pointer as individual bytes for safe
-    // unaligned access (very likely for string types).
-    auto block = reinterpret_cast<uint8_t const*>(data + offset);
-    return block[0] | (block[1] << 8) | (block[2] << 16) | (block[3] << 24);
-  }
-
-  [[nodiscard]] result_type __device__ inline operator()(Key const& key) const
-  {
-    return compute(key);
-  }
-
-  template <typename T>
-  result_type __device__ inline compute(T const& key) const
-  {
-    return compute_bytes(reinterpret_cast<std::byte const*>(&key), sizeof(T));
-  }
-
-  result_type __device__ inline compute_remaining_bytes(std::byte const* data,
-                                                        cudf::size_type len,
-                                                        cudf::size_type tail_offset,
-                                                        result_type h) const
-  {
-    // Process remaining bytes that do not fill a four-byte chunk using Spark's approach
-    // (does not conform to normal MurmurHash3).
-    for (auto i = tail_offset; i < len; i++) {
-      // We require a two-step cast to get the k1 value from the byte. First,
-      // we must cast to a signed int8_t. Then, the sign bit is preserved when
-      // casting to uint32_t under 2's complement. Java preserves the sign when
-      // casting byte-to-int, but C++ does not.
-      uint32_t k1 = static_cast<uint32_t>(std::to_integer<int8_t>(data[i]));
-      k1 *= c1;
-      k1 = rotate_bits_left(k1, rot_c1);
-      k1 *= c2;
-      h ^= k1;
-      h = rotate_bits_left(static_cast<uint32_t>(h), rot_c2);
-      h = h * 5 + c3;
-    }
-    return h;
-  }
-
-  result_type __device__ compute_bytes(std::byte const* data, cudf::size_type const len) const
-  {
-    constexpr cudf::size_type BLOCK_SIZE = 4;
-    cudf::size_type const nblocks        = len / BLOCK_SIZE;
-    cudf::size_type const tail_offset    = nblocks * BLOCK_SIZE;
-    result_type h                        = m_seed;
-
-    // Process all four-byte chunks.
-    for (cudf::size_type i = 0; i < nblocks; i++) {
-      uint32_t k1 = getblock32(data, i * BLOCK_SIZE);
-      k1 *= c1;
-      k1 = rotate_bits_left(k1, rot_c1);
-      k1 *= c2;
-      h ^= k1;
-      h = rotate_bits_left(static_cast<uint32_t>(h), rot_c2);
-      h = h * 5 + c3;
-    }
-
-    h = compute_remaining_bytes(data, len, tail_offset, h);
-
-    // Finalize hash.
-    h ^= len;
-    h = fmix32(h);
-    return h;
-  }
-
- private:
-  uint32_t m_seed{cudf::DEFAULT_HASH_SEED};
-  static constexpr uint32_t c1     = 0xcc9e2d51;
-  static constexpr uint32_t c2     = 0x1b873593;
-  static constexpr uint32_t c3     = 0xe6546b64;
-  static constexpr uint32_t rot_c1 = 15;
-  static constexpr uint32_t rot_c2 = 13;
-};
-
-template <>
-spark_hash_value_type __device__ inline Spark_MurmurHash3_x86_32<bool>::operator()(
-  bool const& key) const
-{
-  return compute<uint32_t>(key);
-}
-
-template <>
-spark_hash_value_type __device__ inline Spark_MurmurHash3_x86_32<int8_t>::operator()(
-  int8_t const& key) const
-{
-  return compute<uint32_t>(key);
-}
-
-template <>
-spark_hash_value_type __device__ inline Spark_MurmurHash3_x86_32<uint8_t>::operator()(
-  uint8_t const& key) const
-{
-  return compute<uint32_t>(key);
-}
-
-template <>
-spark_hash_value_type __device__ inline Spark_MurmurHash3_x86_32<int16_t>::operator()(
-  int16_t const& key) const
-{
-  return compute<uint32_t>(key);
-}
-
-template <>
-spark_hash_value_type __device__ inline Spark_MurmurHash3_x86_32<uint16_t>::operator()(
-  uint16_t const& key) const
-{
-  return compute<uint32_t>(key);
-}
-
-template <>
-spark_hash_value_type __device__ inline Spark_MurmurHash3_x86_32<float>::operator()(
-  float const& key) const
-{
-  return compute<float>(normalize_nans(key));
-}
-
-template <>
-spark_hash_value_type __device__ inline Spark_MurmurHash3_x86_32<double>::operator()(
-  double const& key) const
-{
-  return compute<double>(normalize_nans(key));
-}
-
-template <>
-spark_hash_value_type __device__ inline Spark_MurmurHash3_x86_32<cudf::string_view>::operator()(
-  cudf::string_view const& key) const
-{
-  auto const data = reinterpret_cast<std::byte const*>(key.data());
-  auto const len  = key.size_bytes();
-  return compute_bytes(data, len);
-}
-
-template <>
-spark_hash_value_type __device__ inline Spark_MurmurHash3_x86_32<numeric::decimal32>::operator()(
-  numeric::decimal32 const& key) const
-{
-  return compute<uint64_t>(key.value());
-}
-
-template <>
-spark_hash_value_type __device__ inline Spark_MurmurHash3_x86_32<numeric::decimal64>::operator()(
-  numeric::decimal64 const& key) const
-{
-  return compute<uint64_t>(key.value());
-}
-
-template <>
-spark_hash_value_type __device__ inline Spark_MurmurHash3_x86_32<numeric::decimal128>::operator()(
-  numeric::decimal128 const& key) const
-{
-  // Generates the Spark MurmurHash3 hash value, mimicking the conversion:
-  // java.math.BigDecimal.valueOf(unscaled_value, _scale).unscaledValue().toByteArray()
-  // https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala#L381
-  __int128_t const val               = key.value();
-  constexpr cudf::size_type key_size = sizeof(__int128_t);
-  std::byte const* data              = reinterpret_cast<std::byte const*>(&val);
-
-  // Small negative values start with 0xff..., small positive values start with 0x00...
-  bool const is_negative     = val < 0;
-  std::byte const zero_value = is_negative ? std::byte{0xff} : std::byte{0x00};
-
-  // If the value can be represented with a shorter than 16-byte integer, the
-  // leading bytes of the little-endian value are truncated and are not hashed.
-  auto const reverse_begin = thrust::reverse_iterator(data + key_size);
-  auto const reverse_end   = thrust::reverse_iterator(data);
-  auto const first_nonzero_byte =
-    thrust::find_if_not(thrust::seq, reverse_begin, reverse_end, [zero_value](std::byte const& v) {
-      return v == zero_value;
-    }).base();
-  // Max handles special case of 0 and -1 which would shorten to 0 length otherwise
-  cudf::size_type length =
-    std::max(1, static_cast<cudf::size_type>(thrust::distance(data, first_nonzero_byte)));
-
-  // Preserve the 2's complement sign bit by adding a byte back on if necessary.
-  // e.g. 0x0000ff would shorten to 0x00ff. The 0x00 byte is retained to
-  // preserve the sign bit, rather than leaving an "f" at the front which would
-  // change the sign bit. However, 0x00007f would shorten to 0x7f. No extra byte
-  // is needed because the leftmost bit matches the sign bit. Similarly for
-  // negative values: 0xffff00 --> 0xff00 and 0xffff80 --> 0x80.
-  if ((length < key_size) && (is_negative ^ bool(data[length - 1] & std::byte{0x80}))) { ++length; }
-
-  // Convert to big endian by reversing the range of nonzero bytes. Only those bytes are hashed.
-  __int128_t big_endian_value = 0;
-  auto big_endian_data        = reinterpret_cast<std::byte*>(&big_endian_value);
-  thrust::reverse_copy(thrust::seq, data, data + length, big_endian_data);
-  return compute_bytes(big_endian_data, length);
-}
-
-/**
- * @brief Computes the hash value of a row in the given table.
- *
- * This functor uses Spark conventions for Murmur hashing, which differs from
- * the Murmur implementation used in the rest of libcudf. These differences
- * include:
- * - Serially using the output hash as an input seed for the next item
- * - Ignorance of null values
- *
- * The serial use of hashes as seeds means that data of different nested types
- * can exhibit hash collisions. For example, a row of an integer column
- * containing a 1 will have the same hash as a lists column of integers
- * containing a list of [1] and a struct column of a single integer column
- * containing a struct of {1}.
- *
- * As a consequence of ignoring null values, inputs like [1], [1, null], and
- * [null, 1] have the same hash (an expected hash collision). This kind of
- * collision can also occur across a table of nullable columns and with nulls
- * in structs ({1, null} and {null, 1} have the same hash). The seed value (the
- * previous element's hash value) is returned as the hash if an element is
- * null.
- *
- * For additional differences such as special tail processing and decimal type
- * handling, refer to the Spark_MurmurHash3_x86_32 functor.
- *
- * @tparam hash_function Hash functor to use for hashing elements. Must be Spark_MurmurHash3_x86_32.
- * @tparam Nullate A cudf::nullate type describing whether to check for nulls.
- */
-template <template <typename> class hash_function, typename Nullate>
-class spark_murmur_device_row_hasher {
-  friend class cudf::experimental::row::hash::row_hasher;  ///< Allow row_hasher to access private
-                                                           ///< members.
-
- public:
-  /**
-   * @brief Return the hash value of a row in the given table.
-   *
-   * @param row_index The row index to compute the hash value of
-   * @return The hash value of the row
-   */
-  __device__ auto operator()(size_type row_index) const noexcept
-  {
-    return cudf::detail::accumulate(
-      _table.begin(),
-      _table.end(),
-      _seed,
-      [row_index, nulls = this->_check_nulls] __device__(auto hash, auto column) {
-        return cudf::type_dispatcher(
-          column.type(), element_hasher_adapter<hash_function>{nulls, hash}, column, row_index);
-      });
-  }
-
- private:
-  /**
-   * @brief Computes the hash value of an element in the given column.
-   *
-   * When the column is non-nested, this is a simple wrapper around the element_hasher.
-   * When the column is nested, this uses a seed value to serially compute each
-   * nested element, with the output hash becoming the seed for the next value.
-   * This requires constructing a new hash functor for each nested element,
-   * using the new seed from the previous element's hash. The hash of a null
-   * element is the input seed (the previous element's hash).
-   */
-  template <template <typename> class hash_fn>
-  class element_hasher_adapter {
-   public:
-    __device__ element_hasher_adapter(Nullate check_nulls, uint32_t seed) noexcept
-      : _check_nulls(check_nulls), _seed(seed)
-    {
-    }
-
-    using hash_functor = cudf::experimental::row::hash::element_hasher<hash_fn, Nullate>;
-
-    template <typename T, CUDF_ENABLE_IF(not cudf::is_nested<T>())>
-    __device__ spark_hash_value_type operator()(column_device_view const& col,
-                                                size_type row_index) const noexcept
-    {
-      auto const hasher = hash_functor{_check_nulls, _seed, _seed};
-      return hasher.template operator()<T>(col, row_index);
-    }
-
-    template <typename T, CUDF_ENABLE_IF(cudf::is_nested<T>())>
-    __device__ spark_hash_value_type operator()(column_device_view const& col,
-                                                size_type row_index) const noexcept
-    {
-      column_device_view curr_col = col.slice(row_index, 1);
-      while (curr_col.type().id() == type_id::STRUCT || curr_col.type().id() == type_id::LIST) {
-        if (curr_col.type().id() == type_id::STRUCT) {
-          if (curr_col.num_child_columns() == 0) { return _seed; }
-          // Non-empty structs are assumed to be decomposed and contain only one child
-          curr_col = cudf::detail::structs_column_device_view(curr_col).get_sliced_child(0);
-        } else if (curr_col.type().id() == type_id::LIST) {
-          curr_col = cudf::detail::lists_column_device_view(curr_col).get_sliced_child();
-        }
-      }
-
-      return cudf::detail::accumulate(
-        thrust::counting_iterator(0),
-        thrust::counting_iterator(curr_col.size()),
-        _seed,
-        [curr_col, nulls = this->_check_nulls] __device__(auto hash, auto element_index) {
-          auto const hasher = hash_functor{nulls, hash, hash};
-          return cudf::type_dispatcher<cudf::experimental::dispatch_void_if_nested>(
-            curr_col.type(), hasher, curr_col, element_index);
-        });
-    }
-
-    Nullate const _check_nulls;  ///< Whether to check for nulls
-    uint32_t const _seed;        ///< The seed to use for hashing, also returned for null elements
-  };
-
-  CUDF_HOST_DEVICE spark_murmur_device_row_hasher(Nullate check_nulls,
-                                                  table_device_view t,
-                                                  uint32_t seed = DEFAULT_HASH_SEED) noexcept
-    : _check_nulls{check_nulls}, _table{t}, _seed(seed)
-  {
-    // Error out if passed an unsupported hash_function
-    static_assert(
-      std::is_base_of_v<Spark_MurmurHash3_x86_32<int>, hash_function<int>>,
-      "spark_murmur_device_row_hasher only supports the Spark_MurmurHash3_x86_32 hash function");
-  }
-
-  Nullate const _check_nulls;
-  table_device_view const _table;
-  uint32_t const _seed;
-};
-
-void check_hash_compatibility(table_view const& input)
-{
-  using column_checker_fn_t = std::function<void(column_view const&)>;
-
-  column_checker_fn_t check_column = [&](column_view const& c) {
-    if (c.type().id() == type_id::LIST) {
-      auto const& list_col = lists_column_view(c);
-      CUDF_EXPECTS(list_col.child().type().id() != type_id::STRUCT,
-                   "Cannot compute hash of a table with a LIST of STRUCT columns.");
-      check_column(list_col.child());
-    } else if (c.type().id() == type_id::STRUCT) {
-      for (auto child = c.child_begin(); child != c.child_end(); ++child) {
-        check_column(*child);
-      }
-    }
-  };
-
-  for (column_view const& c : input) {
-    check_column(c);
-  }
-}
-
-}  // namespace
-
-std::unique_ptr<column> spark_murmurhash3_x86_32(table_view const& input,
-                                                 uint32_t seed,
-                                                 rmm::cuda_stream_view stream,
-                                                 rmm::mr::device_memory_resource* mr)
-{
-  auto output = make_numeric_column(data_type(type_to_id<spark_hash_value_type>()),
-                                    input.num_rows(),
-                                    mask_state::UNALLOCATED,
-                                    stream,
-                                    mr);
-
-  // Return early if there's nothing to hash
-  if (input.num_columns() == 0 || input.num_rows() == 0) { return output; }
-
-  // Lists of structs are not supported
-  check_hash_compatibility(input);
-
-  bool const nullable   = has_nested_nulls(input);
-  auto const row_hasher = cudf::experimental::row::hash::row_hasher(input, stream);
-  auto output_view      = output->mutable_view();
-
-  // Compute the hash value for each row
-  thrust::tabulate(
-    rmm::exec_policy(stream),
-    output_view.begin<spark_hash_value_type>(),
-    output_view.end<spark_hash_value_type>(),
-    row_hasher.device_hasher<Spark_MurmurHash3_x86_32, spark_murmur_device_row_hasher>(nullable,
-                                                                                       seed));
-
-  return output;
-}
-
-}  // namespace detail
-
-std::unique_ptr<column> spark_murmurhash3_x86_32(table_view const& input,
-                                                 uint32_t seed,
-                                                 rmm::cuda_stream_view stream,
-                                                 rmm::mr::device_memory_resource* mr)
-{
-  CUDF_FUNC_RANGE();
-  return detail::spark_murmurhash3_x86_32(input, seed, stream, mr);
-}
-
-}  // namespace hashing
-}  // namespace cudf
diff --git a/cpp/tests/hashing/spark_murmurhash3_x86_32_test.cpp b/cpp/tests/hashing/spark_murmurhash3_x86_32_test.cpp
deleted file mode 100644
index e8bbfaa2cba..00000000000
--- a/cpp/tests/hashing/spark_murmurhash3_x86_32_test.cpp
+++ /dev/null
@@ -1,576 +0,0 @@
-/*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cudf_test/base_fixture.hpp>
-#include <cudf_test/column_utilities.hpp>
-#include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/iterator_utilities.hpp>
-#include <cudf_test/type_lists.hpp>
-
-#include <cudf/detail/iterator.cuh>
-#include <cudf/fixed_point/fixed_point.hpp>
-#include <cudf/hashing.hpp>
-
-constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS};
-
-template <typename T>
-class SparkMurmurHashTestTyped : public cudf::test::BaseFixture {};
-
-TYPED_TEST_SUITE(SparkMurmurHashTestTyped, cudf::test::FixedWidthTypes);
-
-TYPED_TEST(SparkMurmurHashTestTyped, Equality)
-{
-  cudf::test::fixed_width_column_wrapper<TypeParam, int32_t> const col{0, 127, 1, 2, 8};
-  auto const input = cudf::table_view({col});
-
-  // Hash of same input should be equal
-  auto const spark_output1 = cudf::hashing::spark_murmurhash3_x86_32(input, 0);
-  auto const spark_output2 = cudf::hashing::spark_murmurhash3_x86_32(input);
-
-  EXPECT_EQ(input.num_rows(), spark_output1->size());
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(spark_output1->view(), spark_output2->view());
-}
-
-TYPED_TEST(SparkMurmurHashTestTyped, EqualityNulls)
-{
-  using T = TypeParam;
-
-  // Nulls with different values should be equal
-  cudf::test::fixed_width_column_wrapper<T, int32_t> const col1({0, 127, 1, 2, 8}, {0, 1, 1, 1, 1});
-  cudf::test::fixed_width_column_wrapper<T, int32_t> const col2({1, 127, 1, 2, 8}, {0, 1, 1, 1, 1});
-
-  auto const input1 = cudf::table_view({col1});
-  auto const input2 = cudf::table_view({col2});
-
-  auto const spark_output1 = cudf::hashing::spark_murmurhash3_x86_32(input1, 0);
-  auto const spark_output2 = cudf::hashing::spark_murmurhash3_x86_32(input2);
-
-  EXPECT_EQ(input1.num_rows(), spark_output1->size());
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(spark_output1->view(), spark_output2->view());
-}
-
-template <typename T>
-class SparkMurmurHashTestFloatTyped : public cudf::test::BaseFixture {};
-
-TYPED_TEST_SUITE(SparkMurmurHashTestFloatTyped, cudf::test::FloatingPointTypes);
-
-TYPED_TEST(SparkMurmurHashTestFloatTyped, TestExtremes)
-{
-  using T = TypeParam;
-  T min   = std::numeric_limits<T>::min();
-  T max   = std::numeric_limits<T>::max();
-  T nan   = std::numeric_limits<T>::quiet_NaN();
-  T inf   = std::numeric_limits<T>::infinity();
-
-  cudf::test::fixed_width_column_wrapper<T> const col(
-    {T(0.0), T(100.0), T(-100.0), min, max, nan, inf, -inf});
-  cudf::test::fixed_width_column_wrapper<T> const col_neg_zero(
-    {T(-0.0), T(100.0), T(-100.0), min, max, nan, inf, -inf});
-  cudf::test::fixed_width_column_wrapper<T> const col_neg_nan(
-    {T(0.0), T(100.0), T(-100.0), min, max, -nan, inf, -inf});
-
-  auto const table_col          = cudf::table_view({col});
-  auto const table_col_neg_zero = cudf::table_view({col_neg_zero});
-  auto const table_col_neg_nan  = cudf::table_view({col_neg_nan});
-
-  // Spark hash is sensitive to 0 and -0
-  auto const spark_col         = cudf::hashing::spark_murmurhash3_x86_32(table_col, 0);
-  auto const spark_col_neg_nan = cudf::hashing::spark_murmurhash3_x86_32(table_col_neg_nan);
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*spark_col, *spark_col_neg_nan);
-}
-
-class SparkMurmurHashTest : public cudf::test::BaseFixture {};
-
-TEST_F(SparkMurmurHashTest, MultiValueNulls)
-{
-  // Nulls with different values should be equal
-  cudf::test::strings_column_wrapper const strings_col1(
-    {"",
-     "The quick brown fox",
-     "jumps over the lazy dog.",
-     "All work and no play makes Jack a dull boy",
-     R"(!"#$%&'()*+,-./0123456789:;<=>?@[\]^_`{|}~)"},
-    {0, 1, 1, 0, 1});
-  cudf::test::strings_column_wrapper const strings_col2(
-    {"different but null",
-     "The quick brown fox",
-     "jumps over the lazy dog.",
-     "I am Jack's complete lack of null value",
-     R"(!"#$%&'()*+,-./0123456789:;<=>?@[\]^_`{|}~)"},
-    {0, 1, 1, 0, 1});
-
-  // Nulls with different values should be equal
-  using limits = std::numeric_limits<int32_t>;
-  cudf::test::fixed_width_column_wrapper<int32_t> const ints_col1(
-    {0, 100, -100, limits::min(), limits::max()}, {1, 0, 0, 1, 1});
-  cudf::test::fixed_width_column_wrapper<int32_t> const ints_col2(
-    {0, -200, 200, limits::min(), limits::max()}, {1, 0, 0, 1, 1});
-
-  // Nulls with different values should be equal
-  // Different truth values should be equal
-  cudf::test::fixed_width_column_wrapper<bool> const bools_col1({0, 1, 0, 1, 1}, {1, 1, 0, 0, 1});
-  cudf::test::fixed_width_column_wrapper<bool> const bools_col2({0, 2, 1, 0, 255}, {1, 1, 0, 0, 1});
-
-  // Nulls with different values should be equal
-  using ts = cudf::timestamp_s;
-  cudf::test::fixed_width_column_wrapper<ts, ts::duration> const secs_col1(
-    {ts::duration::zero(),
-     static_cast<ts::duration>(100),
-     static_cast<ts::duration>(-100),
-     ts::duration::min(),
-     ts::duration::max()},
-    {1, 0, 0, 1, 1});
-  cudf::test::fixed_width_column_wrapper<ts, ts::duration> const secs_col2(
-    {ts::duration::zero(),
-     static_cast<ts::duration>(-200),
-     static_cast<ts::duration>(200),
-     ts::duration::min(),
-     ts::duration::max()},
-    {1, 0, 0, 1, 1});
-
-  auto const input1        = cudf::table_view({strings_col1, ints_col1, bools_col1, secs_col1});
-  auto const input2        = cudf::table_view({strings_col2, ints_col2, bools_col2, secs_col2});
-  auto const spark_output1 = cudf::hashing::spark_murmurhash3_x86_32(input1, 0);
-  auto const spark_output2 = cudf::hashing::spark_murmurhash3_x86_32(input2);
-
-  EXPECT_EQ(input1.num_rows(), spark_output1->size());
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(spark_output1->view(), spark_output2->view());
-}
-
-TEST_F(SparkMurmurHashTest, MultiValueWithSeeds)
-{
-  // The hash values were determined by running the following Scala code in Apache Spark.
-  // Note that Spark >= 3.2 normalizes the float/double value of -0. to +0. and both values hash
-  // to the same result. This is normalized in the calling code (Spark RAPIDS plugin) for Spark
-  // >= 3.2. However, the reference values for -0. below must be obtained with Spark < 3.2 and
-  // libcudf will continue to implement the Spark < 3.2 behavior until Spark >= 3.2 is required and
-  // the workaround in the calling code is removed. This also affects the combined hash values.
-
-  /*
-  import org.apache.spark.sql.functions._
-  import org.apache.spark.sql.types._
-  import org.apache.spark.sql.Row
-  import org.apache.spark.sql.catalyst.util.DateTimeUtils
-
-  val schema = new StructType()
-      .add("structs", new StructType()
-          .add("a", IntegerType)
-          .add("b", StringType)
-          .add("c", new StructType()
-              .add("x", FloatType)
-              .add("y", LongType)))
-      .add("strings", StringType)
-      .add("doubles", DoubleType)
-      .add("timestamps", TimestampType)
-      .add("decimal64", DecimalType(18, 7))
-      .add("longs", LongType)
-      .add("floats", FloatType)
-      .add("dates", DateType)
-      .add("decimal32", DecimalType(9, 3))
-      .add("ints", IntegerType)
-      .add("shorts", ShortType)
-      .add("bytes", ByteType)
-      .add("bools", BooleanType)
-      .add("decimal128", DecimalType(38, 11))
-
-  val data = Seq(
-      Row(Row(0, "a", Row(0f, 0L)), "", 0.toDouble,
-          DateTimeUtils.toJavaTimestamp(0), BigDecimal(0), 0.toLong, 0.toFloat,
-          DateTimeUtils.toJavaDate(0), BigDecimal(0), 0, 0.toShort, 0.toByte,
-          false, BigDecimal(0)),
-      Row(Row(100, "bc", Row(100f, 100L)), "The quick brown fox", -(0.toDouble),
-          DateTimeUtils.toJavaTimestamp(100), BigDecimal("0.00001"), 100.toLong, -(0.toFloat),
-          DateTimeUtils.toJavaDate(100), BigDecimal("0.1"), 100, 100.toShort, 100.toByte,
-          true, BigDecimal("0.000000001")),
-      Row(Row(-100, "def", Row(-100f, -100L)), "jumps over the lazy dog.", -Double.NaN,
-          DateTimeUtils.toJavaTimestamp(-100), BigDecimal("-0.00001"), -100.toLong, -Float.NaN,
-          DateTimeUtils.toJavaDate(-100), BigDecimal("-0.1"), -100, -100.toShort, -100.toByte,
-          true, BigDecimal("-0.00000000001")),
-      Row(Row(0x12345678, "ghij", Row(Float.PositiveInfinity, 0x123456789abcdefL)),
-          "All work and no play makes Jack a dull boy", Double.MinValue,
-          DateTimeUtils.toJavaTimestamp(Long.MinValue/1000000), BigDecimal("-99999999999.9999999"),
-          Long.MinValue, Float.MinValue, DateTimeUtils.toJavaDate(Int.MinValue/100),
-          BigDecimal("-999999.999"), Int.MinValue, Short.MinValue, Byte.MinValue, true,
-          BigDecimal("-9999999999999999.99999999999")),
-      Row(Row(-0x76543210, "klmno", Row(Float.NegativeInfinity, -0x123456789abcdefL)),
-          "!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\ud720\ud721", Double.MaxValue,
-          DateTimeUtils.toJavaTimestamp(Long.MaxValue/1000000), BigDecimal("99999999999.9999999"),
-          Long.MaxValue, Float.MaxValue, DateTimeUtils.toJavaDate(Int.MaxValue/100),
-          BigDecimal("999999.999"), Int.MaxValue, Short.MaxValue, Byte.MaxValue, false,
-          BigDecimal("99999999999999999999999999.99999999999")))
-
-  val df = spark.createDataFrame(sc.parallelize(data), schema)
-  df.columns.foreach(c => println(s"$c => ${df.select(hash(col(c))).collect.mkString(",")}"))
-  println(s"combined => ${df.select(hash(col("*"))).collect.mkString(",")}")
-  */
-
-  cudf::test::fixed_width_column_wrapper<int32_t> const hash_structs_expected(
-    {-105406170, 90479889, -678041645, 1667387937, 301478567});
-  cudf::test::fixed_width_column_wrapper<int32_t> const hash_strings_expected(
-    {142593372, 1217302703, -715697185, -2061143941, -111635966});
-  cudf::test::fixed_width_column_wrapper<int32_t> const hash_doubles_expected(
-    {-1670924195, -853646085, -1281358385, 1897734433, -508695674});
-  cudf::test::fixed_width_column_wrapper<int32_t> const hash_timestamps_expected(
-    {-1670924195, 1114849490, 904948192, -1832979433, 1752430209});
-  cudf::test::fixed_width_column_wrapper<int32_t> const hash_decimal64_expected(
-    {-1670924195, 1114849490, 904948192, 1962370902, -1795328666});
-  cudf::test::fixed_width_column_wrapper<int32_t> const hash_longs_expected(
-    {-1670924195, 1114849490, 904948192, -853646085, -1604625029});
-  cudf::test::fixed_width_column_wrapper<int32_t> const hash_floats_expected(
-    {933211791, 723455942, -349261430, -1225560532, -338752985});
-  cudf::test::fixed_width_column_wrapper<int32_t> const hash_dates_expected(
-    {933211791, 751823303, -1080202046, -1906567553, -1503850410});
-  cudf::test::fixed_width_column_wrapper<int32_t> const hash_decimal32_expected(
-    {-1670924195, 1114849490, 904948192, -1454351396, -193774131});
-  cudf::test::fixed_width_column_wrapper<int32_t> const hash_ints_expected(
-    {933211791, 751823303, -1080202046, 723455942, 133916647});
-  cudf::test::fixed_width_column_wrapper<int32_t> const hash_shorts_expected(
-    {933211791, 751823303, -1080202046, -1871935946, 1249274084});
-  cudf::test::fixed_width_column_wrapper<int32_t> const hash_bytes_expected(
-    {933211791, 751823303, -1080202046, 1110053733, 1135925485});
-  cudf::test::fixed_width_column_wrapper<int32_t> const hash_bools_expected(
-    {933211791, -559580957, -559580957, -559580957, 933211791});
-  cudf::test::fixed_width_column_wrapper<int32_t> const hash_decimal128_expected(
-    {-783713497, -295670906, 1398487324, -52622807, -1359749815});
-  cudf::test::fixed_width_column_wrapper<int32_t> const hash_combined_expected(
-    {401603227, 588162166, 552160517, 1132537411, -326043017});
-
-  using double_limits = std::numeric_limits<double>;
-  using long_limits   = std::numeric_limits<int64_t>;
-  using float_limits  = std::numeric_limits<float>;
-  using int_limits    = std::numeric_limits<int32_t>;
-  cudf::test::fixed_width_column_wrapper<int32_t> a_col{0, 100, -100, 0x1234'5678, -0x7654'3210};
-  cudf::test::strings_column_wrapper b_col{"a", "bc", "def", "ghij", "klmno"};
-  cudf::test::fixed_width_column_wrapper<float> x_col{
-    0.f, 100.f, -100.f, float_limits::infinity(), -float_limits::infinity()};
-  cudf::test::fixed_width_column_wrapper<int64_t> y_col{
-    0L, 100L, -100L, 0x0123'4567'89ab'cdefL, -0x0123'4567'89ab'cdefL};
-  cudf::test::structs_column_wrapper c_col{{x_col, y_col}};
-  cudf::test::structs_column_wrapper const structs_col{{a_col, b_col, c_col}};
-
-  cudf::test::strings_column_wrapper const strings_col(
-    {"",
-     "The quick brown fox",
-     "jumps over the lazy dog.",
-     "All work and no play makes Jack a dull boy",
-     "!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\ud720\ud721"});
-  cudf::test::fixed_width_column_wrapper<double> const doubles_col(
-    {0., -0., -double_limits::quiet_NaN(), double_limits::lowest(), double_limits::max()});
-  cudf::test::fixed_width_column_wrapper<cudf::timestamp_ms, cudf::timestamp_ms::rep> const
-    timestamps_col({0L, 100L, -100L, long_limits::min() / 1000000, long_limits::max() / 1000000});
-  cudf::test::fixed_point_column_wrapper<int64_t> const decimal64_col(
-    {0L, 100L, -100L, -999999999999999999L, 999999999999999999L}, numeric::scale_type{-7});
-  cudf::test::fixed_width_column_wrapper<int64_t> const longs_col(
-    {0L, 100L, -100L, long_limits::min(), long_limits::max()});
-  cudf::test::fixed_width_column_wrapper<float> const floats_col(
-    {0.f, -0.f, -float_limits::quiet_NaN(), float_limits::lowest(), float_limits::max()});
-  cudf::test::fixed_width_column_wrapper<cudf::timestamp_D, cudf::timestamp_D::rep> dates_col(
-    {0, 100, -100, int_limits::min() / 100, int_limits::max() / 100});
-  cudf::test::fixed_point_column_wrapper<int32_t> const decimal32_col(
-    {0, 100, -100, -999999999, 999999999}, numeric::scale_type{-3});
-  cudf::test::fixed_width_column_wrapper<int32_t> const ints_col(
-    {0, 100, -100, int_limits::min(), int_limits::max()});
-  cudf::test::fixed_width_column_wrapper<int16_t> const shorts_col({0, 100, -100, -32768, 32767});
-  cudf::test::fixed_width_column_wrapper<int8_t> const bytes_col({0, 100, -100, -128, 127});
-  cudf::test::fixed_width_column_wrapper<bool> const bools_col1({0, 1, 1, 1, 0});
-  cudf::test::fixed_width_column_wrapper<bool> const bools_col2({0, 1, 2, 255, 0});
-  cudf::test::fixed_point_column_wrapper<__int128_t> const decimal128_col(
-    {static_cast<__int128>(0),
-     static_cast<__int128>(100),
-     static_cast<__int128>(-1),
-     (static_cast<__int128>(0xFFFF'FFFF'FCC4'D1C3u) << 64 | 0x602F'7FC3'1800'0001u),
-     (static_cast<__int128>(0x0785'EE10'D5DA'46D9u) << 64 | 0x00F4'369F'FFFF'FFFFu)},
-    numeric::scale_type{-11});
-
-  auto const hash_structs =
-    cudf::hashing::spark_murmurhash3_x86_32(cudf::table_view({structs_col}), 42);
-  auto const hash_strings =
-    cudf::hashing::spark_murmurhash3_x86_32(cudf::table_view({strings_col}), 42);
-  auto const hash_doubles =
-    cudf::hashing::spark_murmurhash3_x86_32(cudf::table_view({doubles_col}), 42);
-  auto const hash_timestamps =
-    cudf::hashing::spark_murmurhash3_x86_32(cudf::table_view({timestamps_col}), 42);
-  auto const hash_decimal64 =
-    cudf::hashing::spark_murmurhash3_x86_32(cudf::table_view({decimal64_col}), 42);
-  auto const hash_longs =
-    cudf::hashing::spark_murmurhash3_x86_32(cudf::table_view({longs_col}), 42);
-  auto const hash_floats =
-    cudf::hashing::spark_murmurhash3_x86_32(cudf::table_view({floats_col}), 42);
-  auto const hash_dates =
-    cudf::hashing::spark_murmurhash3_x86_32(cudf::table_view({dates_col}), 42);
-  auto const hash_decimal32 =
-    cudf::hashing::spark_murmurhash3_x86_32(cudf::table_view({decimal32_col}), 42);
-  auto const hash_ints = cudf::hashing::spark_murmurhash3_x86_32(cudf::table_view({ints_col}), 42);
-  auto const hash_shorts =
-    cudf::hashing::spark_murmurhash3_x86_32(cudf::table_view({shorts_col}), 42);
-  auto const hash_bytes =
-    cudf::hashing::spark_murmurhash3_x86_32(cudf::table_view({bytes_col}), 42);
-  auto const hash_bools1 =
-    cudf::hashing::spark_murmurhash3_x86_32(cudf::table_view({bools_col1}), 42);
-  auto const hash_bools2 =
-    cudf::hashing::spark_murmurhash3_x86_32(cudf::table_view({bools_col2}), 42);
-  auto const hash_decimal128 =
-    cudf::hashing::spark_murmurhash3_x86_32(cudf::table_view({decimal128_col}), 42);
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_structs, hash_structs_expected, verbosity);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_strings, hash_strings_expected, verbosity);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_doubles, hash_doubles_expected, verbosity);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_timestamps, hash_timestamps_expected, verbosity);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_decimal64, hash_decimal64_expected, verbosity);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_longs, hash_longs_expected, verbosity);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_floats, hash_floats_expected, verbosity);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_dates, hash_dates_expected, verbosity);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_decimal32, hash_decimal32_expected, verbosity);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_ints, hash_ints_expected, verbosity);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_shorts, hash_shorts_expected, verbosity);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_bytes, hash_bytes_expected, verbosity);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_bools1, hash_bools_expected, verbosity);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_bools2, hash_bools_expected, verbosity);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_decimal128, hash_decimal128_expected, verbosity);
-
-  auto const combined_table = cudf::table_view({structs_col,
-                                                strings_col,
-                                                doubles_col,
-                                                timestamps_col,
-                                                decimal64_col,
-                                                longs_col,
-                                                floats_col,
-                                                dates_col,
-                                                decimal32_col,
-                                                ints_col,
-                                                shorts_col,
-                                                bytes_col,
-                                                bools_col2,
-                                                decimal128_col});
-  auto const hash_combined  = cudf::hashing::spark_murmurhash3_x86_32(combined_table, 42);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_combined, hash_combined_expected, verbosity);
-}
-
-TEST_F(SparkMurmurHashTest, StringsWithSeed)
-{
-  // The hash values were determined by running the following Scala code in Apache Spark:
-  // val strs = Seq("", "The quick brown fox",
-  //              "jumps over the lazy dog.",
-  //              "All work and no play makes Jack a dull boy",
-  //              "!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\ud720\ud721")
-  // println(strs.map(org.apache.spark.unsafe.types.UTF8String.fromString)
-  //   .map(org.apache.spark.sql.catalyst.expressions.Murmur3HashFunction.hash(
-  //     _, org.apache.spark.sql.types.StringType, 314)))
-
-  cudf::test::fixed_width_column_wrapper<int32_t> const hash_strings_expected_seed_314(
-    {1467149710, 723257560, -1620282500, -2001858707, 1588473657});
-
-  cudf::test::strings_column_wrapper const strings_col(
-    {"",
-     "The quick brown fox",
-     "jumps over the lazy dog.",
-     "All work and no play makes Jack a dull boy",
-     "!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\ud720\ud721"});
-
-  auto const hash_strings =
-    cudf::hashing::spark_murmurhash3_x86_32(cudf::table_view({strings_col}), 314);
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*hash_strings, hash_strings_expected_seed_314, verbosity);
-}
-
-TEST_F(SparkMurmurHashTest, ListValues)
-{
-  /*
-  import org.apache.spark.sql.functions._
-  import org.apache.spark.sql.types.{ArrayType, IntegerType, StructType}
-  import org.apache.spark.sql.Row
-
-  val schema = new StructType()
-    .add("lists",ArrayType(ArrayType(IntegerType)))
-
-  val data = Seq(
-    Row(null),
-    Row(List(null)),
-    Row(List(List())),
-    Row(List(List(1))),
-    Row(List(List(1, 2))),
-    Row(List(List(1, 2, 3))),
-    Row(List(List(1, 2), List(3))),
-    Row(List(List(1), List(2, 3))),
-    Row(List(List(1), List(null, 2, 3))),
-    Row(List(List(1, 2), List(3), List(null))),
-    Row(List(List(1, 2), null, List(3))),
-  )
-
-  val df = spark.createDataFrame(
-    spark.sparkContext.parallelize(data), schema)
-
-  val df2 = df.selectExpr("lists", "hash(lists) as hash")
-  df2.printSchema()
-  df2.show(false)
-  */
-
-  auto const null = -1;
-  auto nested_list =
-    cudf::test::lists_column_wrapper<int>({{},
-                                           {1},
-                                           {1, 2},
-                                           {1, 2, 3},
-                                           {1, 2},
-                                           {3},
-                                           {1},
-                                           {2, 3},
-                                           {1},
-                                           {{null, 2, 3}, cudf::test::iterators::nulls_at({0})},
-                                           {1, 2},
-                                           {3},
-                                           {{null}, cudf::test::iterators::nulls_at({0})},
-                                           {1, 2},
-                                           {},
-                                           {3}},
-                                          cudf::test::iterators::nulls_at({0, 14}));
-  auto offsets =
-    cudf::test::fixed_width_column_wrapper<cudf::size_type>{0, 0, 0, 1, 2, 3, 4, 6, 8, 10, 13, 16};
-  auto list_validity = cudf::test::iterators::nulls_at({0});
-  auto [null_mask, null_count] =
-    cudf::test::detail::make_null_mask(list_validity, list_validity + 11);
-  auto list_column = cudf::make_lists_column(
-    11, offsets.release(), nested_list.release(), null_count, std::move(null_mask));
-
-  auto expect = cudf::test::fixed_width_column_wrapper<int32_t>{42,
-                                                                42,
-                                                                42,
-                                                                -559580957,
-                                                                -222940379,
-                                                                -912918097,
-                                                                -912918097,
-                                                                -912918097,
-                                                                -912918097,
-                                                                -912918097,
-                                                                -912918097};
-
-  auto output = cudf::hashing::spark_murmurhash3_x86_32(cudf::table_view({*list_column}), 42);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expect, output->view(), verbosity);
-}
-
-TEST_F(SparkMurmurHashTest, StructOfListValues)
-{
-  /*
-  import org.apache.spark.sql.functions._
-  import org.apache.spark.sql.types.{ArrayType, IntegerType, StructType}
-  import org.apache.spark.sql.Row
-
-  val schema = new StructType()
-    .add("structs", new StructType()
-        .add("a", ArrayType(IntegerType))
-        .add("b", ArrayType(IntegerType)))
-
-  val data = Seq(
-    Row(Row(List(), List())),
-    Row(Row(List(0), List(0))),
-    Row(Row(List(1, null), null)),
-    Row(Row(List(1, null), List())),
-    Row(Row(List(), List(null, 1))),
-    Row(Row(null, List(1))),
-    Row(Row(List(2, 3), List(4, 5))),
-  )
-
-  val df = spark.createDataFrame(
-    spark.sparkContext.parallelize(data), schema)
-
-  val df2 = df.selectExpr("lists", "hash(lists) as hash")
-  df2.printSchema()
-  df2.show(false)
-  */
-
-  auto const null = -1;
-  auto col1 =
-    cudf::test::lists_column_wrapper<int>({{},
-                                           {0},
-                                           {{1, null}, cudf::test::iterators::nulls_at({1})},
-                                           {{1, null}, cudf::test::iterators::nulls_at({1})},
-                                           {},
-                                           {} /*NULL*/,
-                                           {2, 3}},
-                                          cudf::test::iterators::nulls_at({5}));
-  auto col2 = cudf::test::lists_column_wrapper<int>(
-    {{}, {0}, {} /*NULL*/, {}, {{null, 1}, cudf::test::iterators::nulls_at({0})}, {1}, {4, 5}},
-    cudf::test::iterators::nulls_at({2}));
-  auto struct_column = cudf::test::structs_column_wrapper{{col1, col2}};
-
-  auto expect = cudf::test::fixed_width_column_wrapper<int32_t>{
-    42, 59727262, -559580957, -559580957, -559580957, -559580957, 170038658};
-
-  auto output = cudf::hashing::spark_murmurhash3_x86_32(cudf::table_view({struct_column}), 42);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expect, output->view(), verbosity);
-}
-
-TEST_F(SparkMurmurHashTest, ListOfStructValues)
-{
-  /*
-  import org.apache.spark.sql.functions._
-  import org.apache.spark.sql.types.{ArrayType, IntegerType, StructType}
-  import org.apache.spark.sql.Row
-
-  val schema = new StructType()
-    .add("lists", ArrayType(new StructType()
-      .add("a", IntegerType)
-      .add("b", IntegerType)))
-
-  val data = Seq(
-    Row(List(Row(0, 0))),
-    Row(List(null)),
-    Row(List(Row(null, null))),
-    Row(List(Row(1, null))),
-    Row(List(Row(null, 1))),
-    Row(List(Row(null, 1), Row(2, 3))),
-    Row(List(Row(2, 3), null)),
-    Row(List(Row(2, 3), Row(4, 5))),
-  )
-
-  val df = spark.createDataFrame(
-    spark.sparkContext.parallelize(data), schema)
-
-  val df2 = df.selectExpr("lists", "hash(lists) as hash")
-  df2.printSchema()
-  df2.show(false)
-  */
-
-  auto const null = -1;
-  auto col1       = cudf::test::fixed_width_column_wrapper<int32_t>(
-    {0, null, null, 1, null, null, 2, 2, null, 2, 4},
-    cudf::test::iterators::nulls_at({1, 2, 4, 5, 8}));
-  auto col2 = cudf::test::fixed_width_column_wrapper<int32_t>(
-    {0, null, null, null, 1, 1, 3, 3, null, 3, 5}, cudf::test::iterators::nulls_at({1, 2, 3, 8}));
-  auto struct_column =
-    cudf::test::structs_column_wrapper{{col1, col2}, {1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1}};
-  auto offsets =
-    cudf::test::fixed_width_column_wrapper<cudf::size_type>{0, 1, 2, 3, 4, 5, 7, 9, 11};
-  auto list_nullmask = std::vector<bool>(1, 8);
-  auto [null_mask, null_count] =
-    cudf::test::detail::make_null_mask(list_nullmask.begin(), list_nullmask.end());
-  auto list_column = cudf::make_lists_column(
-    8, offsets.release(), struct_column.release(), null_count, std::move(null_mask));
-
-  // TODO: Lists of structs are not yet supported. Once support is added,
-  // remove this EXPECT_THROW and uncomment the rest of this test.
-  EXPECT_THROW(cudf::hashing::spark_murmurhash3_x86_32(cudf::table_view({*list_column}), 42),
-               cudf::logic_error);
-
-  /*
-  auto expect = cudf::test::fixed_width_column_wrapper<int32_t>{
-    59727262, 42, 42, -559580957, -559580957, -912918097, 1092624418, 170038658};
-
-  auto output = cudf::hashing::spark_murmurhash3_x86_32(cudf::table_view({*list_column}), 42);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expect, output->view(), verbosity);
-  */
-}
diff --git a/cpp/tests/partitioning/hash_partition_test.cpp b/cpp/tests/partitioning/hash_partition_test.cpp
index 4177ee9bc98..521e1193036 100644
--- a/cpp/tests/partitioning/hash_partition_test.cpp
+++ b/cpp/tests/partitioning/hash_partition_test.cpp
@@ -193,21 +193,6 @@ TEST_F(HashPartition, IdentityHashFailure)
     cudf::logic_error);
 }
 
-TEST_F(HashPartition, UnsupportedHashFunction)
-{
-  fixed_width_column_wrapper<float> floats({1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f});
-  fixed_width_column_wrapper<int16_t> integers({1, 2, 3, 4, 5, 6, 7, 8});
-  strings_column_wrapper strings({"a", "bb", "ccc", "d", "ee", "fff", "gg", "h"});
-  auto input = cudf::table_view({floats, integers, strings});
-
-  auto columns_to_hash = std::vector<cudf::size_type>({2});
-
-  cudf::size_type const num_partitions = 3;
-  EXPECT_THROW(
-    cudf::hash_partition(input, columns_to_hash, num_partitions, cudf::hash_id::HASH_MD5),
-    cudf::logic_error);
-}
-
 TEST_F(HashPartition, CustomSeedValue)
 {
   fixed_width_column_wrapper<float> floats({1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f});
diff --git a/java/src/main/java/ai/rapids/cudf/ColumnVector.java b/java/src/main/java/ai/rapids/cudf/ColumnVector.java
index ba58f53931b..5a0fbd224ad 100644
--- a/java/src/main/java/ai/rapids/cudf/ColumnVector.java
+++ b/java/src/main/java/ai/rapids/cudf/ColumnVector.java
@@ -772,42 +772,7 @@ public static ColumnVector md5Hash(ColumnView... columns) {
           "Unsupported nested type column";
       columnViews[i] = columns[i].getNativeView();
     }
-    return new ColumnVector(hash(columnViews, HashType.HASH_MD5.getNativeId(), 0));
-  }
-
-  /**
-   * Create a new vector containing spark's 32-bit murmur3 hash of each row in the table.
-   * Spark's murmur3 hash uses a different tail processing algorithm.
-   *
-   * @param seed integer seed for the murmur3 hash function
-   * @param columns array of columns to hash, must have identical number of rows.
-   * @return the new ColumnVector of 32-bit values representing each row's hash value.
-   */
-  public static ColumnVector spark32BitMurmurHash3(int seed, ColumnView columns[]) {
-    if (columns.length < 1) {
-      throw new IllegalArgumentException("Murmur3 hashing requires at least 1 column of input");
-    }
-    long[] columnViews = new long[columns.length];
-    long size = columns[0].getRowCount();
-
-    for(int i = 0; i < columns.length; i++) {
-      assert columns[i] != null : "Column vectors passed may not be null";
-      assert columns[i].getRowCount() == size : "Row count mismatch, all columns must be the same size";
-      assert !columns[i].getType().isDurationType() : "Unsupported column type Duration";
-      columnViews[i] = columns[i].getNativeView();
-    }
-    return new ColumnVector(hash(columnViews, HashType.HASH_SPARK_MURMUR3.getNativeId(), seed));
-  }
-
-  /**
-   * Create a new vector containing spark's 32-bit murmur3 hash of each row in the table with the
-   * seed set to 0. Spark's murmur3 hash uses a different tail processing algorithm.
-   *
-   * @param columns array of columns to hash, must have identical number of rows.
-   * @return the new ColumnVector of 32-bit values representing each row's hash value.
-   */
-  public static ColumnVector spark32BitMurmurHash3(ColumnView columns[]) {
-    return spark32BitMurmurHash3(0, columns);
+    return new ColumnVector(md5(columnViews));
   }
 
   /**
@@ -914,15 +879,12 @@ private static native long stringConcatenationSepCol(long[] columnViews,
                                                        boolean separate_nulls);
 
   /**
-   * Native method to hash each row of the given table. Hashing function dispatched on the
-   * native side using the hashId.
+   * Native method to MD5 hash each row of the given table
    *
    * @param viewHandles array of native handles to the cudf::column_view columns being operated on.
-   * @param hashId integer native ID of the hashing function identifier HashType.
-   * @param seed integer seed for the hash. Only used by serial murmur3 hash.
    * @return native handle of the resulting cudf column containing the hex-string hashing results.
    */
-  private static native long hash(long[] viewHandles, int hashId, int seed) throws CudfException;
+  private static native long md5(long[] viewHandles) throws CudfException;
 
   /////////////////////////////////////////////////////////////////////////////
   // INTERNAL/NATIVE ACCESS
diff --git a/java/src/main/java/ai/rapids/cudf/HashType.java b/java/src/main/java/ai/rapids/cudf/HashType.java
index 081e8aa6700..50d6b866579 100644
--- a/java/src/main/java/ai/rapids/cudf/HashType.java
+++ b/java/src/main/java/ai/rapids/cudf/HashType.java
@@ -1,6 +1,6 @@
 /*
  *
- *  Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ *  Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -23,9 +23,7 @@
  */
 public enum HashType {
   IDENTITY(0),
-  MURMUR3(1),
-  HASH_SPARK_MURMUR3(2),
-  HASH_MD5(3);
+  MURMUR3(1);
 
   private static final HashType[] HASH_TYPES = HashType.values();
   final int nativeId;
diff --git a/java/src/main/native/src/ColumnVectorJni.cpp b/java/src/main/native/src/ColumnVectorJni.cpp
index 8fb7df78c09..e8a89f82a13 100644
--- a/java/src/main/native/src/ColumnVectorJni.cpp
+++ b/java/src/main/native/src/ColumnVectorJni.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -305,16 +305,14 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_concatenate(JNIEnv *env
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_hash(JNIEnv *env, jobject j_object,
-                                                              jlongArray column_handles,
-                                                              jint hash_function_id, jint seed) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_md5(JNIEnv *env, jobject j_object,
+                                                             jlongArray column_handles) {
   JNI_NULL_CHECK(env, column_handles, "array of column handles is null", 0);
 
   try {
     auto column_views =
         cudf::jni::native_jpointerArray<cudf::column_view>{env, column_handles}.get_dereferenced();
-    return release_as_jlong(cudf::hash(cudf::table_view{column_views},
-                                       static_cast<cudf::hash_id>(hash_function_id), seed));
+    return release_as_jlong(cudf::hashing::md5(cudf::table_view{column_views}));
   }
   CATCH_STD(env, 0);
 }
diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
index bac4d1e4b3e..1d6a3b3304a 100644
--- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
+++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
@@ -530,225 +530,6 @@ void testMD5HashLists() {
     }
   }
 
-  @Test
-  void testSpark32BitMurmur3HashStrings() {
-    try (ColumnVector v0 = ColumnVector.fromStrings(
-           "a", "B\nc",  "dE\"\u0100\t\u0101 \ud720\ud721\\Fg2\'",
-           "A very long (greater than 128 bytes/char string) to test a multi hash-step data point " +
-           "in the MD5 hash function. This string needed to be longer.A 60 character string to " +
-           "test MD5's message padding algorithm",
-           "hiJ\ud720\ud721\ud720\ud721", null);
-         ColumnVector result = ColumnVector.spark32BitMurmurHash3(42, new ColumnVector[]{v0});
-         ColumnVector expected = ColumnVector.fromBoxedInts(1485273170, 1709559900, 1423943036, 176121990, 1199621434, 42)) {
-      assertColumnsAreEqual(expected, result);
-    }
-  }
-
-  @Test
-  void testSpark32BitMurmur3HashInts() {
-    try (ColumnVector v0 = ColumnVector.fromBoxedInts(0, 100, null, null, Integer.MIN_VALUE, null);
-         ColumnVector v1 = ColumnVector.fromBoxedInts(0, null, -100, null, null, Integer.MAX_VALUE);
-         ColumnVector result = ColumnVector.spark32BitMurmurHash3(42, new ColumnVector[]{v0, v1});
-         ColumnVector expected = ColumnVector.fromBoxedInts(59727262, 751823303, -1080202046, 42, 723455942, 133916647)) {
-      assertColumnsAreEqual(expected, result);
-    }
-  }
-
-  @Test
-  void testSpark32BitMurmur3HashDoubles() {
-    try (ColumnVector v = ColumnVector.fromBoxedDoubles(
-          0.0, null, 100.0, -100.0, Double.MIN_NORMAL, Double.MAX_VALUE,
-          POSITIVE_DOUBLE_NAN_UPPER_RANGE, POSITIVE_DOUBLE_NAN_LOWER_RANGE,
-          NEGATIVE_DOUBLE_NAN_UPPER_RANGE, NEGATIVE_DOUBLE_NAN_LOWER_RANGE,
-          Double.POSITIVE_INFINITY, Double.NEGATIVE_INFINITY);
-         ColumnVector result = ColumnVector.spark32BitMurmurHash3(new ColumnVector[]{v});
-         ColumnVector expected = ColumnVector.fromBoxedInts(1669671676, 0, -544903190, -1831674681, 150502665, 474144502, 1428788237, 1428788237, 1428788237, 1428788237, 420913893, 1915664072)) {
-      assertColumnsAreEqual(expected, result);
-    }
-  }
-
-  @Test
-  void testSpark32BitMurmur3HashTimestamps() {
-    // The hash values were derived from Apache Spark in a manner similar to the one documented at
-    // https://github.com/rapidsai/cudf/blob/aa7ca46dcd9e/cpp/tests/hashing/hash_test.cpp#L281-L307
-    try (ColumnVector v = ColumnVector.timestampMicroSecondsFromBoxedLongs(
-        0L, null, 100L, -100L, 0x123456789abcdefL, null, -0x123456789abcdefL);
-         ColumnVector result = ColumnVector.spark32BitMurmurHash3(42, new ColumnVector[]{v});
-         ColumnVector expected = ColumnVector.fromBoxedInts(-1670924195, 42, 1114849490, 904948192, 657182333, 42, -57193045)) {
-      assertColumnsAreEqual(expected, result);
-    }
-  }
-
-  @Test
-  void testSpark32BitMurmur3HashDecimal64() {
-    // The hash values were derived from Apache Spark in a manner similar to the one documented at
-    // https://github.com/rapidsai/cudf/blob/aa7ca46dcd9e/cpp/tests/hashing/hash_test.cpp#L281-L307
-    try (ColumnVector v = ColumnVector.decimalFromLongs(-7,
-        0L, 100L, -100L, 0x123456789abcdefL, -0x123456789abcdefL);
-         ColumnVector result = ColumnVector.spark32BitMurmurHash3(42, new ColumnVector[]{v});
-         ColumnVector expected = ColumnVector.fromBoxedInts(-1670924195, 1114849490, 904948192, 657182333, -57193045)) {
-      assertColumnsAreEqual(expected, result);
-    }
-  }
-
-  @Test
-  void testSpark32BitMurmur3HashDecimal32() {
-    // The hash values were derived from Apache Spark in a manner similar to the one documented at
-    // https://github.com/rapidsai/cudf/blob/aa7ca46dcd9e/cpp/tests/hashing/hash_test.cpp#L281-L307
-    try (ColumnVector v = ColumnVector.decimalFromInts(-3,
-        0, 100, -100, 0x12345678, -0x12345678);
-         ColumnVector result = ColumnVector.spark32BitMurmurHash3(42, new ColumnVector[]{v});
-         ColumnVector expected = ColumnVector.fromBoxedInts(-1670924195, 1114849490, 904948192, -958054811, -1447702630)) {
-      assertColumnsAreEqual(expected, result);
-    }
-  }
-
-  @Test
-  void testSpark32BitMurmur3HashDates() {
-    // The hash values were derived from Apache Spark in a manner similar to the one documented at
-    // https://github.com/rapidsai/cudf/blob/aa7ca46dcd9e/cpp/tests/hashing/hash_test.cpp#L281-L307
-    try (ColumnVector v = ColumnVector.timestampDaysFromBoxedInts(
-        0, null, 100, -100, 0x12345678, null, -0x12345678);
-         ColumnVector result = ColumnVector.spark32BitMurmurHash3(42, new ColumnVector[]{v});
-         ColumnVector expected = ColumnVector.fromBoxedInts(933211791, 42, 751823303, -1080202046, -1721170160, 42, 1852996993)) {
-      assertColumnsAreEqual(expected, result);
-    }
-  }
-
-  @Test
-  void testSpark32BitMurmur3HashFloats() {
-    try (ColumnVector v = ColumnVector.fromBoxedFloats(
-          0f, 100f, -100f, Float.MIN_NORMAL, Float.MAX_VALUE, null,
-          POSITIVE_FLOAT_NAN_LOWER_RANGE, POSITIVE_FLOAT_NAN_UPPER_RANGE,
-          NEGATIVE_FLOAT_NAN_LOWER_RANGE, NEGATIVE_FLOAT_NAN_UPPER_RANGE,
-          Float.POSITIVE_INFINITY, Float.NEGATIVE_INFINITY);
-         ColumnVector result = ColumnVector.spark32BitMurmurHash3(411, new ColumnVector[]{v});
-         ColumnVector expected = ColumnVector.fromBoxedInts(-235179434, 1812056886, 2028471189, 1775092689, -1531511762, 411, -1053523253, -1053523253, -1053523253, -1053523253, -1526256646, 930080402)){
-      assertColumnsAreEqual(expected, result);
-    }
-  }
-
-  @Test
-  void testSpark32BitMurmur3HashBools() {
-    try (ColumnVector v0 = ColumnVector.fromBoxedBooleans(null, true, false, true, null, false);
-         ColumnVector v1 = ColumnVector.fromBoxedBooleans(null, true, false, null, false, true);
-         ColumnVector result = ColumnVector.spark32BitMurmurHash3(0, new ColumnVector[]{v0, v1});
-         ColumnVector expected = ColumnVector.fromBoxedInts(0, -1589400010, -239939054, -68075478, 593689054, -1194558265)) {
-      assertColumnsAreEqual(expected, result);
-    }
-  }
-
-  @Test
-  void testSpark32BitMurmur3HashMixed() {
-    try (ColumnVector strings = ColumnVector.fromStrings(
-          "a", "B\n", "dE\"\u0100\t\u0101 \ud720\ud721",
-          "A very long (greater than 128 bytes/char string) to test a multi hash-step data point " +
-          "in the MD5 hash function. This string needed to be longer.",
-          null, null);
-         ColumnVector integers = ColumnVector.fromBoxedInts(0, 100, -100, Integer.MIN_VALUE, Integer.MAX_VALUE, null);
-         ColumnVector doubles = ColumnVector.fromBoxedDoubles(
-          0.0, 100.0, -100.0, POSITIVE_DOUBLE_NAN_LOWER_RANGE, POSITIVE_DOUBLE_NAN_UPPER_RANGE, null);
-         ColumnVector floats = ColumnVector.fromBoxedFloats(
-          0f, 100f, -100f, NEGATIVE_FLOAT_NAN_LOWER_RANGE, NEGATIVE_FLOAT_NAN_UPPER_RANGE, null);
-         ColumnVector bools = ColumnVector.fromBoxedBooleans(true, false, null, false, true, null);
-         ColumnVector result = ColumnVector.spark32BitMurmurHash3(1868, new ColumnVector[]{strings, integers, doubles, floats, bools});
-         ColumnVector expected = ColumnVector.fromBoxedInts(1936985022, 720652989, 339312041, 1400354989, 769988643, 1868)) {
-      assertColumnsAreEqual(expected, result);
-    }
-  }
-
-  @Test
-  void testSpark32BitMurmur3HashStruct() {
-    try (ColumnVector strings = ColumnVector.fromStrings(
-        "a", "B\n", "dE\"\u0100\t\u0101 \ud720\ud721",
-        "A very long (greater than 128 bytes/char string) to test a multi hash-step data point " +
-            "in the MD5 hash function. This string needed to be longer.",
-        null, null);
-         ColumnVector integers = ColumnVector.fromBoxedInts(0, 100, -100, Integer.MIN_VALUE, Integer.MAX_VALUE, null);
-         ColumnVector doubles = ColumnVector.fromBoxedDoubles(
-             0.0, 100.0, -100.0, POSITIVE_DOUBLE_NAN_LOWER_RANGE, POSITIVE_DOUBLE_NAN_UPPER_RANGE, null);
-         ColumnVector floats = ColumnVector.fromBoxedFloats(
-             0f, 100f, -100f, NEGATIVE_FLOAT_NAN_LOWER_RANGE, NEGATIVE_FLOAT_NAN_UPPER_RANGE, null);
-         ColumnVector bools = ColumnVector.fromBoxedBooleans(true, false, null, false, true, null);
-         ColumnView structs = ColumnView.makeStructView(strings, integers, doubles, floats, bools);
-         ColumnVector result = ColumnVector.spark32BitMurmurHash3(1868, new ColumnView[]{structs});
-         ColumnVector expected = ColumnVector.spark32BitMurmurHash3(1868, new ColumnVector[]{strings, integers, doubles, floats, bools})) {
-      assertColumnsAreEqual(expected, result);
-    }
-  }
-
-  @Test
-  void testSpark32BitMurmur3HashNestedStruct() {
-    try (ColumnVector strings = ColumnVector.fromStrings(
-        "a", "B\n", "dE\"\u0100\t\u0101 \ud720\ud721",
-        "A very long (greater than 128 bytes/char string) to test a multi hash-step data point " +
-            "in the MD5 hash function. This string needed to be longer.",
-        null, null);
-         ColumnVector integers = ColumnVector.fromBoxedInts(0, 100, -100, Integer.MIN_VALUE, Integer.MAX_VALUE, null);
-         ColumnVector doubles = ColumnVector.fromBoxedDoubles(
-             0.0, 100.0, -100.0, POSITIVE_DOUBLE_NAN_LOWER_RANGE, POSITIVE_DOUBLE_NAN_UPPER_RANGE, null);
-         ColumnVector floats = ColumnVector.fromBoxedFloats(
-             0f, 100f, -100f, NEGATIVE_FLOAT_NAN_LOWER_RANGE, NEGATIVE_FLOAT_NAN_UPPER_RANGE, null);
-         ColumnVector bools = ColumnVector.fromBoxedBooleans(true, false, null, false, true, null);
-         ColumnView structs1 = ColumnView.makeStructView(strings, integers);
-         ColumnView structs2 = ColumnView.makeStructView(structs1, doubles);
-         ColumnView structs3 = ColumnView.makeStructView(bools);
-         ColumnView structs = ColumnView.makeStructView(structs2, floats, structs3);
-         ColumnVector expected = ColumnVector.spark32BitMurmurHash3(1868, new ColumnVector[]{strings, integers, doubles, floats, bools});
-         ColumnVector result = ColumnVector.spark32BitMurmurHash3(1868, new ColumnView[]{structs})) {
-      assertColumnsAreEqual(expected, result);
-    }
-  }
-
-  @Test
-  void testSpark32BitMurmur3HashListsAndNestedLists() {
-    try (ColumnVector stringListCV = ColumnVector.fromLists(
-             new ListType(true, new BasicType(true, DType.STRING)),
-             Arrays.asList(null, "a"),
-             Arrays.asList("B\n", ""),
-             Arrays.asList("dE\"\u0100\t\u0101", " \ud720\ud721"),
-             Collections.singletonList("A very long (greater than 128 bytes/char string) to test a multi" +
-             " hash-step data point in the Murmur3 hash function. This string needed to be longer."),
-             Collections.singletonList(""),
-             null);
-         ColumnVector strings1 = ColumnVector.fromStrings(
-             "a", "B\n", "dE\"\u0100\t\u0101",
-             "A very long (greater than 128 bytes/char string) to test a multi hash-step data point " +
-             "in the Murmur3 hash function. This string needed to be longer.", null, null);
-         ColumnVector strings2 = ColumnVector.fromStrings(
-             null, "", " \ud720\ud721", null, "", null);
-         ColumnView stringStruct = ColumnView.makeStructView(strings1, strings2);
-         ColumnVector stringExpected = ColumnVector.spark32BitMurmurHash3(1868, new ColumnView[]{stringStruct});
-         ColumnVector stringResult = ColumnVector.spark32BitMurmurHash3(1868, new ColumnView[]{stringListCV});
-         ColumnVector intListCV = ColumnVector.fromLists(
-             new ListType(true, new BasicType(true, DType.INT32)),
-             null,
-             Arrays.asList(0, -2, 3),
-             Collections.singletonList(Integer.MAX_VALUE),
-             Arrays.asList(5, -6, null),
-             Collections.singletonList(Integer.MIN_VALUE),
-             null);
-         ColumnVector integers1 = ColumnVector.fromBoxedInts(null, 0, null, 5, Integer.MIN_VALUE, null);
-         ColumnVector integers2 = ColumnVector.fromBoxedInts(null, -2, Integer.MAX_VALUE, null, null, null);
-         ColumnVector integers3 = ColumnVector.fromBoxedInts(null, 3, null, -6, null, null);
-         ColumnVector intExpected =
-             ColumnVector.spark32BitMurmurHash3(1868, new ColumnVector[]{integers1, integers2, integers3});
-         ColumnVector intResult = ColumnVector.spark32BitMurmurHash3(1868, new ColumnVector[]{intListCV});
-         ColumnVector doubles = ColumnVector.fromBoxedDoubles(
-          0.0, 100.0, -100.0, POSITIVE_DOUBLE_NAN_LOWER_RANGE, POSITIVE_DOUBLE_NAN_UPPER_RANGE, null);
-         ColumnVector floats = ColumnVector.fromBoxedFloats(
-          0f, 100f, -100f, NEGATIVE_FLOAT_NAN_LOWER_RANGE, NEGATIVE_FLOAT_NAN_UPPER_RANGE, null);
-         ColumnView structCV = ColumnView.makeStructView(intListCV, stringListCV, doubles, floats);
-         ColumnVector nestedExpected =
-             ColumnVector.spark32BitMurmurHash3(1868, new ColumnView[]{intListCV, strings1, strings2, doubles, floats});
-         ColumnVector nestedResult =
-             ColumnVector.spark32BitMurmurHash3(1868, new ColumnView[]{structCV})) {
-      assertColumnsAreEqual(stringExpected, stringResult);
-      assertColumnsAreEqual(intExpected, intResult);
-      assertColumnsAreEqual(nestedExpected, nestedResult);
-    }
-  }
-
   @Test
   void isNotNullTestEmptyColumn() {
     try (ColumnVector v = ColumnVector.fromBoxedInts();

From 43994fadf6c9c2bd6b599c79999f62a23d57b18a Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 4 Apr 2024 16:49:28 -0400
Subject: [PATCH 011/842] Fix base_normalator::integer_sizeof_fn integer
 dispatch (#15457)

Fixes the `cudf::detail::base_normalator::integer_sizeof_fn` dispatch function to support only integers. Also remove the `constexpr` since the non-integer path can throw an exception.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Vukasin Milovanovic (https://github.com/vuule)
  - Karthikeyan (https://github.com/karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/15457
---
 cpp/include/cudf/detail/normalizing_iterator.cuh | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/cpp/include/cudf/detail/normalizing_iterator.cuh b/cpp/include/cudf/detail/normalizing_iterator.cuh
index 8f90afc3e57..32df13104e0 100644
--- a/cpp/include/cudf/detail/normalizing_iterator.cuh
+++ b/cpp/include/cudf/detail/normalizing_iterator.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -204,8 +204,8 @@ struct alignas(16) base_normalator {
 
  private:
   struct integer_sizeof_fn {
-    template <typename T, CUDF_ENABLE_IF(not cudf::is_fixed_width<T>())>
-    CUDF_HOST_DEVICE constexpr std::size_t operator()() const
+    template <typename T, CUDF_ENABLE_IF(not cudf::is_integral_not_bool<T>())>
+    CUDF_HOST_DEVICE std::size_t operator()() const
     {
 #ifndef __CUDA_ARCH__
       CUDF_FAIL("only integral types are supported");
@@ -213,8 +213,8 @@ struct alignas(16) base_normalator {
       CUDF_UNREACHABLE("only integral types are supported");
 #endif
     }
-    template <typename T, CUDF_ENABLE_IF(cudf::is_fixed_width<T>())>
-    CUDF_HOST_DEVICE constexpr std::size_t operator()() const noexcept
+    template <typename T, CUDF_ENABLE_IF(cudf::is_integral_not_bool<T>())>
+    CUDF_HOST_DEVICE std::size_t operator()() const noexcept
     {
       return sizeof(T);
     }

From d7b8fc4de4107b6ee95cdeb26e7efecd3adf9325 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 4 Apr 2024 16:50:23 -0400
Subject: [PATCH 012/842] Remove empty elements from exploded character-ngrams
 output (#15371)

Fixes `character_ngrams` function to not include empty entries when `as_list=False`. That is, the exploded view (non-list result) should not contain empty or NA elements.

This PR changes the `nvtext::generate_character_ngrams()` API to return a lists column instead of a flat strings column. The python code had been converting the return object into lists column and then exploding it if `as_list=False`. Returning as a list column simplifies the logic and prevents the double conversion. There is almost no impact to the nvtext code since the offsets for the output lists column were already being generated.

All tests were updated to expect the new result. Also changed some exception types from `cudf::logic_error` to `std::invalid_argument` as appropriate.

Continues work of abandoned PR #14685

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15371
---
 cpp/include/nvtext/generate_ngrams.hpp        | 18 +++--
 cpp/src/text/generate_ngrams.cu               | 57 ++++++++--------
 cpp/tests/text/ngrams_tests.cpp               | 66 ++++++++++---------
 python/cudf/cudf/core/column/string.py        | 25 ++-----
 .../cudf/cudf/tests/text/test_text_methods.py | 10 +--
 5 files changed, 83 insertions(+), 93 deletions(-)

diff --git a/cpp/include/nvtext/generate_ngrams.hpp b/cpp/include/nvtext/generate_ngrams.hpp
index 46f2c0e7bc9..e3d667f0292 100644
--- a/cpp/include/nvtext/generate_ngrams.hpp
+++ b/cpp/include/nvtext/generate_ngrams.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -62,21 +62,19 @@ std::unique_ptr<cudf::column> generate_ngrams(
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
- * @brief Generates ngrams of characters within each string.
+ * @brief Generates ngrams of characters within each string
  *
- * Each character of a string used to build ngrams.
+ * Each character of a string is used to build ngrams for the output row.
  * Ngrams are not created across strings.
  *
  * ```
- * ["ab", "cde", "fgh"] would generate bigrams as ["ab", "cd", "de", "fg", "gh"]
+ * ["ab", "cde", "fgh"] would generate bigrams as
+ * [["ab"], ["cd", "de"], ["fg", "gh"]]
  * ```
  *
- * The size of the output column will be the total number of ngrams generated from
- * the input strings column.
- *
- * All null row entries are ignored and the output contains all valid rows.
+ * All null row entries are ignored and the corresponding output row will be empty.
  *
- * @throw cudf::logic_error if `ngrams < 2`
+ * @throw std::invalid_argument if `ngrams < 2`
  * @throw cudf::logic_error if there are not enough characters to generate any ngrams
  *
  * @param input Strings column to produce ngrams from
@@ -84,7 +82,7 @@ std::unique_ptr<cudf::column> generate_ngrams(
  *               Default is 2 = bigram.
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned column's device memory
- * @return New strings columns of tokens
+ * @return Lists column of strings
  */
 std::unique_ptr<cudf::column> generate_character_ngrams(
   cudf::strings_column_view const& input,
diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu
index 3290b58101d..d2a0ef71e4a 100644
--- a/cpp/src/text/generate_ngrams.cu
+++ b/cpp/src/text/generate_ngrams.cu
@@ -40,6 +40,8 @@
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/transform_scan.h>
 
+#include <stdexcept>
+
 namespace nvtext {
 namespace detail {
 namespace {
@@ -90,9 +92,12 @@ std::unique_ptr<cudf::column> generate_ngrams(cudf::strings_column_view const& s
                                               rmm::cuda_stream_view stream,
                                               rmm::mr::device_memory_resource* mr)
 {
-  CUDF_EXPECTS(separator.is_valid(stream), "Parameter separator must be valid");
+  CUDF_EXPECTS(
+    separator.is_valid(stream), "Parameter separator must be valid", std::invalid_argument);
   cudf::string_view const d_separator(separator.data(), separator.size());
-  CUDF_EXPECTS(ngrams > 1, "Parameter ngrams should be an integer value of 2 or greater");
+  CUDF_EXPECTS(ngrams > 1,
+               "Parameter ngrams should be an integer value of 2 or greater",
+               std::invalid_argument);
 
   auto strings_count = strings.size();
   if (strings_count == 0)  // if no strings, return an empty column
@@ -196,47 +201,45 @@ struct character_ngram_generator_fn {
 };
 }  // namespace
 
-std::unique_ptr<cudf::column> generate_character_ngrams(cudf::strings_column_view const& strings,
+std::unique_ptr<cudf::column> generate_character_ngrams(cudf::strings_column_view const& input,
                                                         cudf::size_type ngrams,
                                                         rmm::cuda_stream_view stream,
                                                         rmm::mr::device_memory_resource* mr)
 {
-  CUDF_EXPECTS(ngrams > 1, "Parameter ngrams should be an integer value of 2 or greater");
+  CUDF_EXPECTS(ngrams >= 2,
+               "Parameter ngrams should be an integer value of 2 or greater",
+               std::invalid_argument);
 
-  auto const strings_count = strings.size();
-  if (strings_count == 0)  // if no strings, return an empty column
+  auto const strings_count = input.size();
+  if (strings_count == 0) {  // if no strings, return an empty column
     return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});
+  }
 
-  auto const strings_column = cudf::column_device_view::create(strings.parent(), stream);
-  auto const d_strings      = *strings_column;
+  auto const d_strings = cudf::column_device_view::create(input.parent(), stream);
 
-  // create a vector of ngram offsets for each string
-  rmm::device_uvector<cudf::size_type> ngram_offsets(strings_count + 1, stream);
-  thrust::transform_exclusive_scan(
-    rmm::exec_policy(stream),
-    thrust::make_counting_iterator<cudf::size_type>(0),
-    thrust::make_counting_iterator<cudf::size_type>(strings_count + 1),
-    ngram_offsets.begin(),
+  auto sizes_itr = cudf::detail::make_counting_transform_iterator(
+    0,
     cuda::proclaim_return_type<cudf::size_type>(
-      [d_strings, strings_count, ngrams] __device__(auto idx) {
-        if (d_strings.is_null(idx) || (idx == strings_count)) return 0;
+      [d_strings = *d_strings, ngrams] __device__(auto idx) {
+        if (d_strings.is_null(idx)) { return 0; }
         auto const length = d_strings.element<cudf::string_view>(idx).length();
         return std::max(0, static_cast<cudf::size_type>(length + 1 - ngrams));
-      }),
-    cudf::size_type{0},
-    thrust::plus<cudf::size_type>());
-
-  // total ngrams count is the last entry
-  cudf::size_type const total_ngrams = ngram_offsets.back_element(stream);
+      }));
+  auto [offsets, total_ngrams] =
+    cudf::detail::make_offsets_child_column(sizes_itr, sizes_itr + input.size(), stream, mr);
+  auto d_offsets = offsets->view().data<cudf::size_type>();
   CUDF_EXPECTS(total_ngrams > 0,
                "Insufficient number of characters in each string to generate ngrams");
 
-  character_ngram_generator_fn generator{d_strings, ngrams, ngram_offsets.data()};
+  character_ngram_generator_fn generator{*d_strings, ngrams, d_offsets};
   auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(
     generator, strings_count, total_ngrams, stream, mr);
 
-  return cudf::make_strings_column(
+  auto output = cudf::make_strings_column(
     total_ngrams, std::move(offsets_column), chars.release(), 0, rmm::device_buffer{});
+
+  return make_lists_column(
+    input.size(), std::move(offsets), std::move(output), 0, rmm::device_buffer{}, stream, mr);
 }
 
 namespace {
@@ -277,7 +280,9 @@ std::unique_ptr<cudf::column> hash_character_ngrams(cudf::strings_column_view co
                                                     rmm::cuda_stream_view stream,
                                                     rmm::mr::device_memory_resource* mr)
 {
-  CUDF_EXPECTS(ngrams >= 2, "Parameter ngrams should be an integer value of 2 or greater");
+  CUDF_EXPECTS(ngrams >= 2,
+               "Parameter ngrams should be an integer value of 2 or greater",
+               std::invalid_argument);
 
   auto output_type = cudf::data_type{cudf::type_to_id<cudf::hash_value_type>()};
   if (input.is_empty()) { return cudf::make_empty_column(output_type); }
diff --git a/cpp/tests/text/ngrams_tests.cpp b/cpp/tests/text/ngrams_tests.cpp
index c5a5a342471..1acb4fc4265 100644
--- a/cpp/tests/text/ngrams_tests.cpp
+++ b/cpp/tests/text/ngrams_tests.cpp
@@ -17,6 +17,7 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/iterator_utilities.hpp>
 #include <cudf_test/testing_main.hpp>
 
 #include <cudf/column/column.hpp>
@@ -50,29 +51,24 @@ TEST_F(TextGenerateNgramsTest, Ngrams)
     auto const results = nvtext::generate_ngrams(strings_view, 3, separator);
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
   }
+  using LCW = cudf::test::lists_column_wrapper<cudf::string_view>;
   {
-    cudf::test::strings_column_wrapper expected{"th",
-                                                "he",
-                                                "fo",
-                                                "ox",
-                                                "ju",
-                                                "um",
-                                                "mp",
-                                                "pe",
-                                                "ed",
-                                                "ov",
-                                                "ve",
-                                                "er",
-                                                "th",
-                                                "hé",
-                                                "do",
-                                                "og"};
+    LCW expected({LCW({"th", "he"}),
+                  LCW({"fo", "ox"}),
+                  LCW({"ju", "um", "mp", "pe", "ed"}),
+                  LCW({"ov", "ve", "er"}),
+                  LCW({"th", "hé"}),
+                  LCW({"do", "og"})});
     auto const results = nvtext::generate_character_ngrams(strings_view, 2);
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
   }
   {
-    cudf::test::strings_column_wrapper expected{
-      "the", "fox", "jum", "ump", "mpe", "ped", "ove", "ver", "thé", "dog"};
+    LCW expected({LCW({"the"}),
+                  LCW({"fox"}),
+                  LCW({"jum", "ump", "mpe", "ped"}),
+                  LCW({"ove", "ver"}),
+                  LCW({"thé"}),
+                  LCW({"dog"})});
     auto const results = nvtext::generate_character_ngrams(strings_view, 3);
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
   }
@@ -80,24 +76,29 @@ TEST_F(TextGenerateNgramsTest, Ngrams)
 
 TEST_F(TextGenerateNgramsTest, NgramsWithNulls)
 {
-  std::vector<char const*> h_strings{"the", "fox", "", "jumped", "over", nullptr, "the", "dog"};
-  cudf::test::strings_column_wrapper strings(
-    h_strings.begin(),
-    h_strings.end(),
-    thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
+  auto validity = cudf::test::iterators::null_at(5);
+  cudf::test::strings_column_wrapper input({"the", "fox", "", "jumped", "over", "", "the", "dog"},
+                                           validity);
   auto const separator = cudf::string_scalar("_");
 
-  cudf::strings_column_view strings_view(strings);
+  cudf::strings_column_view sv(input);
   {
-    auto const results = nvtext::generate_ngrams(strings_view, 3, separator);
+    auto const results = nvtext::generate_ngrams(sv, 3, separator);
     cudf::test::strings_column_wrapper expected{
       "the_fox_jumped", "fox_jumped_over", "jumped_over_the", "over_the_dog"};
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
   }
   {
-    cudf::test::strings_column_wrapper expected{
-      "the", "fox", "jum", "ump", "mpe", "ped", "ove", "ver", "the", "dog"};
-    auto const results = nvtext::generate_character_ngrams(strings_view, 3);
+    using LCW = cudf::test::lists_column_wrapper<cudf::string_view>;
+    LCW expected({LCW({"the"}),
+                  LCW({"fox"}),
+                  LCW{},
+                  LCW({"jum", "ump", "mpe", "ped"}),
+                  LCW({"ove", "ver"}),
+                  LCW{},
+                  LCW({"the"}),
+                  LCW({"dog"})});
+    auto const results = nvtext::generate_character_ngrams(sv, 3);
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
   }
 }
@@ -121,9 +122,12 @@ TEST_F(TextGenerateNgramsTest, Errors)
   auto const separator = cudf::string_scalar("_");
   // invalid parameter value
   EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings), 1, separator),
-               cudf::logic_error);
+               std::invalid_argument);
   EXPECT_THROW(nvtext::generate_character_ngrams(cudf::strings_column_view(strings), 1),
-               cudf::logic_error);
+               std::invalid_argument);
+  auto const invalid_separator = cudf::string_scalar("", false);
+  EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings), 2, invalid_separator),
+               std::invalid_argument);
   // not enough strings to generate ngrams
   EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings), 3, separator),
                cudf::logic_error);
@@ -165,7 +169,7 @@ TEST_F(TextGenerateNgramsTest, NgramsHashErrors)
   auto view  = cudf::strings_column_view(input);
 
   // invalid parameter value
-  EXPECT_THROW(nvtext::hash_character_ngrams(view, 1), cudf::logic_error);
+  EXPECT_THROW(nvtext::hash_character_ngrams(view, 1), std::invalid_argument);
   // strings not long enough to generate ngrams
   EXPECT_THROW(nvtext::hash_character_ngrams(view), cudf::logic_error);
 }
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 06d7aa030db..0862995bc46 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -4830,27 +4830,14 @@ def character_ngrams(
         2         [xyz]
         dtype: list
         """
-        ngrams = libstrings.generate_character_ngrams(self._column, n)
-
-        # convert the output to a list by just generating the
-        # offsets for the output list column
-        sn = (self.len() - (n - 1)).clip(0, None).fillna(0)  # type: ignore
-        sizes = libcudf.concat.concat_columns(
-            [column.as_column(0, dtype=np.int32, length=1), sn._column]
-        )
-        oc = libcudf.reduce.scan("cumsum", sizes, True)
-        lc = cudf.core.column.ListColumn(
-            size=self._column.size,
-            dtype=cudf.ListDtype(self._column.dtype),
-            mask=self._column.mask,
-            offset=0,
-            null_count=self._column.null_count,
-            children=(oc, ngrams),
+        result = self._return_or_inplace(
+            libstrings.generate_character_ngrams(self._column, n),
+            retain_index=True,
         )
-        result = self._return_or_inplace(lc, retain_index=True)
-
         if isinstance(result, cudf.Series) and not as_list:
-            return result.explode()
+            # before exploding, removes those lists which have 0 length
+            result = result[result.list.len() > 0]
+            return result.explode()  # type: ignore
         return result
 
     def hash_character_ngrams(
diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py
index 2dccd583b23..6ecead862bb 100644
--- a/python/cudf/cudf/tests/text/test_text_methods.py
+++ b/python/cudf/cudf/tests/text/test_text_methods.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 
 import random
 import string
@@ -330,9 +330,8 @@ def test_ngrams(n, separator, expected_values):
                 "he",
                 "er",
                 "re",
-                cudf.NA,
             ],
-            [1, 1, 1, 2, 3, 4, 4, 4, 5, 5, 5, 6],
+            [1, 1, 1, 2, 3, 4, 4, 4, 5, 5, 5],
             False,
         ),
         (
@@ -340,15 +339,12 @@ def test_ngrams(n, separator, expected_values):
             [
                 "thi",
                 "his",
-                cudf.NA,
-                cudf.NA,
                 "boo",
                 "ook",
                 "her",
                 "ere",
-                cudf.NA,
             ],
-            [1, 1, 2, 3, 4, 4, 5, 5, 6],
+            [1, 1, 4, 4, 5, 5],
             False,
         ),
         (

From 4e44d5d3c80852a15ae28d5afa0b13646ca3a4fd Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 4 Apr 2024 16:51:40 -0400
Subject: [PATCH 013/842] Large strings support in cudf::concatenate (#15195)

Enables `cudf::concatenate` to create and return a large strings column (offsets are INT64).

This also introduces the `LIBCUDF_LARGE_STRINGS_ENABLED` environment variable and utilities around it.
One internal utility checks the value so appropriate logic can either throw an overflow exception or build INT64 offsets as appropriate.

The `cudf::test::large_strings_enabler` is introduced to set/unset the env var for individual tests are needed.
A follow on PR will attempt to consolidate these kinds of tests with a specialized test fixture using this utility class.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Mike Wilson (https://github.com/hyperbolic2346)

URL: https://github.com/rapidsai/cudf/pull/15195
---
 cpp/include/cudf/strings/detail/utilities.hpp | 27 +++++++
 cpp/include/cudf_test/column_utilities.hpp    | 25 ++++++
 cpp/src/strings/copying/concatenate.cu        |  6 +-
 cpp/src/strings/utilities.cu                  | 35 ++++++++-
 cpp/tests/copying/concatenate_tests.cpp       | 76 +++++++++++--------
 cpp/tests/utilities/column_utilities.cu       | 11 +++
 6 files changed, 142 insertions(+), 38 deletions(-)

diff --git a/cpp/include/cudf/strings/detail/utilities.hpp b/cpp/include/cudf/strings/detail/utilities.hpp
index 8d8065dbcaf..cf9a13e9742 100644
--- a/cpp/include/cudf/strings/detail/utilities.hpp
+++ b/cpp/include/cudf/strings/detail/utilities.hpp
@@ -27,6 +27,24 @@ namespace cudf {
 namespace strings {
 namespace detail {
 
+/**
+ * @brief Create an offsets column to be a child of a strings column
+ *
+ * This will return the properly typed column to be filled in by the caller
+ * given the number of bytes to address.
+ *
+ * @param chars_bytes Number of bytes for the chars in the strings column
+ * @param count Number of elements for the offsets column.
+ *              This is the number of rows in the parent strings column +1.
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return The offsets child column for a strings column
+ */
+std::unique_ptr<column> create_offsets_child_column(int64_t chars_bytes,
+                                                    size_type count,
+                                                    rmm::cuda_stream_view stream,
+                                                    rmm::mr::device_memory_resource* mr);
+
 /**
  * @brief Creates a string_view vector from a strings column.
  *
@@ -52,6 +70,15 @@ rmm::device_uvector<string_view> create_string_vector_from_column(
  */
 int64_t get_offset64_threshold();
 
+/**
+ * @brief Checks if large strings is enabled
+ *
+ * This checks the setting in the environment variable LIBCUDF_LARGE_STRINGS_ENABLED.
+ *
+ * @return true if large strings are supported
+ */
+bool is_large_strings_enabled();
+
 /**
  * @brief Return a normalized offset value from a strings offsets column
  *
diff --git a/cpp/include/cudf_test/column_utilities.hpp b/cpp/include/cudf_test/column_utilities.hpp
index a8957473175..c83599a8072 100644
--- a/cpp/include/cudf_test/column_utilities.hpp
+++ b/cpp/include/cudf_test/column_utilities.hpp
@@ -210,6 +210,29 @@ template <>
 std::pair<thrust::host_vector<std::string>, std::vector<bitmask_type>> to_host(column_view c);
 //! @endcond
 
+/**
+ * @brief For enabling large strings testing in specific tests
+ */
+struct large_strings_enabler {
+  /**
+   * @brief Create large strings enable object
+   *
+   * @param default_enable Default enables large strings support
+   */
+  large_strings_enabler(bool default_enable = true);
+  ~large_strings_enabler();
+
+  /**
+   * @brief Enable large strings support
+   */
+  void enable();
+
+  /**
+   * @brief Disable large strings support
+   */
+  void disable();
+};
+
 }  // namespace cudf::test
 
 // Macros for showing line of failure.
@@ -242,3 +265,5 @@ std::pair<thrust::host_vector<std::string>, std::vector<bitmask_type>> to_host(c
     SCOPED_TRACE(" <--  line of failure\n");                        \
     cudf::test::detail::expect_equal_buffers(lhs, rhs, size_bytes); \
   } while (0)
+
+#define CUDF_TEST_ENABLE_LARGE_STRINGS() cudf::test::large_strings_enabler ls___
diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu
index c4564b1105b..de7067f0bed 100644
--- a/cpp/src/strings/copying/concatenate.cu
+++ b/cpp/src/strings/copying/concatenate.cu
@@ -220,9 +220,6 @@ std::unique_ptr<column> concatenate(host_span<column_view const> columns,
   CUDF_EXPECTS(offsets_count <= static_cast<std::size_t>(std::numeric_limits<size_type>::max()),
                "total number of strings exceeds the column size limit",
                std::overflow_error);
-  CUDF_EXPECTS(total_bytes <= static_cast<std::size_t>(std::numeric_limits<size_type>::max()),
-               "total size of strings exceeds the column size limit",
-               std::overflow_error);
 
   bool const has_nulls =
     std::any_of(columns.begin(), columns.end(), [](auto const& col) { return col.has_nulls(); });
@@ -232,8 +229,7 @@ std::unique_ptr<column> concatenate(host_span<column_view const> columns,
   auto d_new_chars = output_chars.data();
 
   // create output offsets column
-  auto offsets_column = make_numeric_column(
-    data_type{type_id::INT32}, offsets_count, mask_state::UNALLOCATED, stream, mr);
+  auto offsets_column = create_offsets_child_column(total_bytes, offsets_count, stream, mr);
   auto itr_new_offsets =
     cudf::detail::offsetalator_factory::make_output_iterator(offsets_column->mutable_view());
 
diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu
index 0a7353821b0..c83f827f290 100644
--- a/cpp/src/strings/utilities.cu
+++ b/cpp/src/strings/utilities.cu
@@ -22,6 +22,7 @@
 #include <cudf/detail/get_value.cuh>
 #include <cudf/strings/detail/char_tables.hpp>
 #include <cudf/strings/detail/utilities.cuh>
+#include <cudf/strings/detail/utilities.hpp>
 #include <cudf/utilities/error.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -31,6 +32,9 @@
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/transform.h>
 
+#include <cstdlib>
+#include <string>
+
 namespace cudf {
 namespace strings {
 namespace detail {
@@ -65,6 +69,27 @@ rmm::device_uvector<string_view> create_string_vector_from_column(
   return strings_vector;
 }
 
+/**
+ * @copydoc cudf::strings::detail::create_offsets_child_column
+ */
+std::unique_ptr<column> create_offsets_child_column(int64_t chars_bytes,
+                                                    size_type count,
+                                                    rmm::cuda_stream_view stream,
+                                                    rmm::mr::device_memory_resource* mr)
+{
+  auto const threshold = get_offset64_threshold();
+  if (!is_large_strings_enabled()) {
+    CUDF_EXPECTS(
+      chars_bytes < threshold, "Size of output exceeds the column size limit", std::overflow_error);
+  }
+  return make_numeric_column(
+    chars_bytes < threshold ? data_type{type_id::INT32} : data_type{type_id::INT64},
+    count,
+    mask_state::UNALLOCATED,
+    stream,
+    mr);
+}
+
 namespace {
 // The device variables are created here to avoid using a singleton that may cause issues
 // with RMM initialize/finalize. See PR #3159 for details on this approach.
@@ -123,13 +148,19 @@ special_case_mapping const* get_special_case_mapping_table()
 
 int64_t get_offset64_threshold()
 {
-  auto const threshold  = std::getenv("LIBCUDF_LARGE_STRINGS_THRESHOLD");
-  std::size_t const rtn = threshold != nullptr ? std::atol(threshold) : 0;
+  auto const threshold = std::getenv("LIBCUDF_LARGE_STRINGS_THRESHOLD");
+  int64_t const rtn    = threshold != nullptr ? std::atol(threshold) : 0L;
   return (rtn > 0 && rtn < std::numeric_limits<int32_t>::max())
            ? rtn
            : std::numeric_limits<int32_t>::max();
 }
 
+bool is_large_strings_enabled()
+{
+  auto const env = std::getenv("LIBCUDF_LARGE_STRINGS_ENABLED");
+  return env != nullptr && std::string(env) == "1";
+}
+
 int64_t get_offset_value(cudf::column_view const& offsets,
                          size_type index,
                          rmm::cuda_stream_view stream)
diff --git a/cpp/tests/copying/concatenate_tests.cpp b/cpp/tests/copying/concatenate_tests.cpp
index 0f7c1053adf..3e2e332936e 100644
--- a/cpp/tests/copying/concatenate_tests.cpp
+++ b/cpp/tests/copying/concatenate_tests.cpp
@@ -32,6 +32,8 @@
 #include <cudf/table/table.hpp>
 #include <cudf/utilities/default_stream.hpp>
 
+#include <thrust/iterator/constant_iterator.h>
+
 #include <numeric>
 #include <stdexcept>
 #include <string>
@@ -164,37 +166,6 @@ TEST_F(StringColumnTest, ConcatenateColumnView)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
 }
 
-TEST_F(StringColumnTest, ConcatenateColumnViewLarge)
-{
-  // Test large concatenate, causes out of bound device memory errors if kernel
-  // indexing is not int64_t.
-  // 1.5GB bytes, 5k columns
-  constexpr size_t num_strings        = 10000;
-  constexpr size_t string_length      = 150000;
-  constexpr size_t strings_per_column = 2;
-  constexpr size_t num_columns        = num_strings / strings_per_column;
-
-  std::vector<std::string> strings;
-  std::vector<char const*> h_strings;
-  std::vector<cudf::test::strings_column_wrapper> strings_column_wrappers;
-  std::vector<cudf::column_view> strings_columns;
-
-  std::string s(string_length, 'a');
-  for (size_t i = 0; i < num_strings; ++i)
-    h_strings.push_back(s.data());
-
-  for (size_t i = 0; i < num_columns; ++i)
-    strings_column_wrappers.push_back(cudf::test::strings_column_wrapper(
-      h_strings.data() + i * strings_per_column, h_strings.data() + (i + 1) * strings_per_column));
-  for (auto& wrapper : strings_column_wrappers)
-    strings_columns.push_back(wrapper);
-
-  auto results = cudf::concatenate(strings_columns);
-
-  cudf::test::strings_column_wrapper expected(h_strings.begin(), h_strings.end());
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
-}
-
 TEST_F(StringColumnTest, ConcatenateManyColumns)
 {
   std::vector<char const*> h_strings{
@@ -226,6 +197,49 @@ TEST_F(StringColumnTest, ConcatenateTooLarge)
   EXPECT_THROW(cudf::concatenate(input_cols), std::overflow_error);
 }
 
+TEST_F(StringColumnTest, ConcatenateLargeStrings)
+{
+  CUDF_TEST_ENABLE_LARGE_STRINGS();
+  auto itr = thrust::constant_iterator<std::string_view>(
+    "abcdefghijklmnopqrstuvwxyABCDEFGHIJKLMNOPQRSTUVWXY");                // 50 bytes
+  auto input = cudf::test::strings_column_wrapper(itr, itr + 5'000'000);  // 250MB
+  auto view  = cudf::column_view(input);
+  std::vector<cudf::column_view> input_cols;
+  std::vector<cudf::size_type> splits;
+  int const multiplier = 10;
+  for (int i = 0; i < multiplier; ++i) {  // 2500MB > 2GB
+    input_cols.push_back(view);
+    splits.push_back(view.size() * (i + 1));
+  }
+  splits.pop_back();  // remove last entry
+  auto result = cudf::concatenate(input_cols);
+  auto sv     = cudf::strings_column_view(result->view());
+  EXPECT_EQ(sv.size(), view.size() * multiplier);
+  EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64});
+
+  // verify results in sections
+  auto sliced = cudf::split(result->view(), splits);
+  for (auto c : sliced) {
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input);
+  }
+
+  // also test with large strings column as input
+  {
+    input_cols.clear();
+    input_cols.push_back(input);           // regular column
+    input_cols.push_back(result->view());  // large column
+    result = cudf::concatenate(input_cols);
+    sv     = cudf::strings_column_view(result->view());
+    EXPECT_EQ(sv.size(), view.size() * (multiplier + 1));
+    EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64});
+    splits.push_back(view.size() * multiplier);
+    sliced = cudf::split(result->view(), splits);
+    for (auto c : sliced) {
+      CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input);
+    }
+  }
+}
+
 struct TableTest : public cudf::test::BaseFixture {};
 
 TEST_F(TableTest, ConcatenateTables)
diff --git a/cpp/tests/utilities/column_utilities.cu b/cpp/tests/utilities/column_utilities.cu
index 2cd7dc1574c..047b096a283 100644
--- a/cpp/tests/utilities/column_utilities.cu
+++ b/cpp/tests/utilities/column_utilities.cu
@@ -1011,5 +1011,16 @@ std::pair<thrust::host_vector<std::string>, std::vector<bitmask_type>> to_host(c
   return {std::move(host_data), bitmask_to_host(c)};
 }
 
+large_strings_enabler::large_strings_enabler(bool default_enable)
+{
+  default_enable ? enable() : disable();
+}
+
+large_strings_enabler::~large_strings_enabler() { disable(); }
+
+void large_strings_enabler::enable() { setenv("LIBCUDF_LARGE_STRINGS_ENABLED", "1", 1); }
+
+void large_strings_enabler::disable() { setenv("LIBCUDF_LARGE_STRINGS_ENABLED", "0", 1); }
+
 }  // namespace test
 }  // namespace cudf

From 0ed224d94a915eee4ce7cdc2d837c1be1c93afcc Mon Sep 17 00:00:00 2001
From: "Richard (Rick) Zamora" <rzamora217@gmail.com>
Date: Thu, 4 Apr 2024 20:42:36 -0500
Subject: [PATCH 014/842] Support implicit array conversion with query-planning
 enabled (#15378)

when query-planning is enabled, implicit conversion is not yet supported from a cudf-backed collection to a dask array. [Some cuml + dask CI failures are related to this limitation](https://github.com/rapidsai/cuml/pull/5815#issuecomment-2011030249). This PR adds basic support for implicit conversion.

Authors:
  - Richard (Rick) Zamora (https://github.com/rjzamora)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/15378
---
 .../dask_cudf/dask_cudf/expr/_collection.py   | 31 +++++++++++++++++
 python/dask_cudf/dask_cudf/tests/test_core.py | 34 +++++++++++++++++++
 2 files changed, 65 insertions(+)

diff --git a/python/dask_cudf/dask_cudf/expr/_collection.py b/python/dask_cudf/dask_cudf/expr/_collection.py
index b2f92aeddda..799e6eddab3 100644
--- a/python/dask_cudf/dask_cudf/expr/_collection.py
+++ b/python/dask_cudf/dask_cudf/expr/_collection.py
@@ -108,3 +108,34 @@ class Index(DXIndex):
 get_collection_type.register(cudf.DataFrame, lambda _: DataFrame)
 get_collection_type.register(cudf.Series, lambda _: Series)
 get_collection_type.register(cudf.BaseIndex, lambda _: Index)
+
+
+##
+## Support conversion to GPU-backed Array collections
+##
+
+
+try:
+    from dask_expr._backends import create_array_collection
+
+    @get_collection_type.register_lazy("cupy")
+    def _register_cupy():
+        import cupy
+
+        @get_collection_type.register(cupy.ndarray)
+        def get_collection_type_cupy_array(_):
+            return create_array_collection
+
+    @get_collection_type.register_lazy("cupyx")
+    def _register_cupyx():
+        # Needed for cuml
+        from cupyx.scipy.sparse import spmatrix
+
+        @get_collection_type.register(spmatrix)
+        def get_collection_type_csr_matrix(_):
+            return create_array_collection
+
+except ImportError:
+    # Older version of dask-expr.
+    # Implicit conversion to array wont work.
+    pass
diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py
index 8a2f3414fd1..c6918c94559 100644
--- a/python/dask_cudf/dask_cudf/tests/test_core.py
+++ b/python/dask_cudf/dask_cudf/tests/test_core.py
@@ -913,3 +913,37 @@ def test_categorical_dtype_round_trip():
     actual = ds.compute()
     expected = pds.compute()
     assert actual.dtype.ordered == expected.dtype.ordered
+
+
+def test_implicit_array_conversion_cupy():
+    s = cudf.Series(range(10))
+    ds = dask_cudf.from_cudf(s, npartitions=2)
+
+    def func(x):
+        return x.values
+
+    # Need to compute the dask collection for now.
+    # See: https://github.com/dask/dask/issues/11017
+    result = ds.map_partitions(func, meta=s.values).compute()
+    expect = func(s)
+
+    dask.array.assert_eq(result, expect)
+
+
+def test_implicit_array_conversion_cupy_sparse():
+    cupyx = pytest.importorskip("cupyx")
+
+    s = cudf.Series(range(10), dtype="float32")
+    ds = dask_cudf.from_cudf(s, npartitions=2)
+
+    def func(x):
+        return cupyx.scipy.sparse.csr_matrix(x.values)
+
+    # Need to compute the dask collection for now.
+    # See: https://github.com/dask/dask/issues/11017
+    result = ds.map_partitions(func, meta=s.values).compute()
+    expect = func(s)
+
+    # NOTE: The calculation here doesn't need to make sense.
+    # We just need to make sure we get the right type back.
+    assert type(result) == type(expect)

From a00c3c916947d16fbf997095a32a02ca510b78e5 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 4 Apr 2024 22:20:32 -1000
Subject: [PATCH 015/842] Cleanup some timedelta/datetime column logic (#14715)

Remove private `_time_unit` attribute in favor of the public one and perform dtype validation earlier in `__init__`

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Ashwin Srinath (https://github.com/shwina)
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/14715
---
 python/cudf/cudf/core/_internals/timezones.py |  6 +--
 python/cudf/cudf/core/column/column.py        |  8 +--
 python/cudf/cudf/core/column/datetime.py      | 35 ++++++++-----
 python/cudf/cudf/core/column/timedelta.py     | 49 ++++++++-----------
 .../cudf/tests/series/test_datetimelike.py    | 15 ++++++
 5 files changed, 61 insertions(+), 52 deletions(-)

diff --git a/python/cudf/cudf/core/_internals/timezones.py b/python/cudf/cudf/core/_internals/timezones.py
index 4e2fad08d56..4888cdd9ac9 100644
--- a/python/cudf/cudf/core/_internals/timezones.py
+++ b/python/cudf/cudf/core/_internals/timezones.py
@@ -114,7 +114,7 @@ def _find_ambiguous_and_nonexistent(
     tz_data_for_zone = get_tz_data(zone_name)
     transition_times = tz_data_for_zone["transition_times"]
     offsets = tz_data_for_zone["offsets"].astype(
-        f"timedelta64[{data._time_unit}]"
+        f"timedelta64[{data.time_unit}]"
     )
 
     if len(offsets) == 1:  # no transitions
@@ -183,7 +183,7 @@ def localize(
             "Already localized. "
             "Use `tz_convert` to convert between time zones."
         )
-    dtype = pd.DatetimeTZDtype(data._time_unit, zone_name)
+    dtype = pd.DatetimeTZDtype(data.time_unit, zone_name)
     ambiguous, nonexistent = _find_ambiguous_and_nonexistent(data, zone_name)
     localized = cast(
         DatetimeColumn,
@@ -230,7 +230,7 @@ def convert(data: DatetimeTZColumn, zone_name: str) -> DatetimeTZColumn:
         DatetimeTZColumn,
         build_column(
             data=utc_time.base_data,
-            dtype=pd.DatetimeTZDtype(data._time_unit, zone_name),
+            dtype=pd.DatetimeTZDtype(data.time_unit, zone_name),
             mask=utc_time.base_mask,
             size=utc_time.size,
             offset=utc_time.offset,
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 2541e076250..835da36fbfd 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -537,13 +537,7 @@ def element_indexing(self, index: int):
             idx = len(self) + idx
         if idx > len(self) - 1 or idx < 0:
             raise IndexError("single positional indexer is out-of-bounds")
-        result = libcudf.copying.get_element(self, idx).value
-        if cudf.get_option("mode.pandas_compatible"):
-            if isinstance(result, np.datetime64):
-                return pd.Timestamp(result)
-            elif isinstance(result, np.timedelta64):
-                return pd.Timedelta(result)
-        return result
+        return libcudf.copying.get_element(self, idx).value
 
     def slice(
         self, start: int, stop: int, stride: Optional[int] = None
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index 9a5d9dcd47a..b84c1dc7ccd 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -3,6 +3,7 @@
 from __future__ import annotations
 
 import datetime
+import functools
 import locale
 import re
 from locale import nl_langinfo
@@ -241,6 +242,8 @@ def __init__(
         null_count: Optional[int] = None,
     ):
         dtype = cudf.dtype(dtype)
+        if dtype.kind != "M":
+            raise TypeError(f"{self.dtype} is not a supported datetime type")
 
         if data.size % dtype.itemsize:
             raise ValueError("Buffer size must be divisible by element size")
@@ -256,26 +259,26 @@ def __init__(
             null_count=null_count,
         )
 
-        if self.dtype.type is not np.datetime64:
-            raise TypeError(f"{self.dtype} is not a supported datetime type")
-
-        self._time_unit, _ = np.datetime_data(self.dtype)
-
     def __contains__(self, item: ScalarLike) -> bool:
         try:
-            item_as_dt64 = np.datetime64(item, self._time_unit)
-        except ValueError:
-            # If item cannot be converted to datetime type
-            # np.datetime64 raises ValueError, hence `item`
-            # cannot exist in `self`.
+            ts = pd.Timestamp(item).as_unit(self.time_unit)
+        except Exception:
+            # pandas can raise a variety of errors
+            # item cannot exist in self.
             return False
-        return item_as_dt64.astype("int64") in self.as_numerical_column(
+        if ts.tzinfo is None and isinstance(self.dtype, pd.DatetimeTZDtype):
+            return False
+        elif ts.tzinfo is not None:
+            ts = ts.tz_convert(None)
+        return ts.to_numpy().astype("int64") in self.as_numerical_column(
             "int64"
         )
 
-    @property
+    @functools.cached_property
     def time_unit(self) -> str:
-        return self._time_unit
+        if isinstance(self.dtype, pd.DatetimeTZDtype):
+            return self.dtype.unit
+        return np.datetime_data(self.dtype)[0]
 
     @property
     def year(self) -> ColumnBase:
@@ -322,6 +325,12 @@ def values(self):
             "DateTime Arrays is not yet implemented in cudf"
         )
 
+    def element_indexing(self, index: int):
+        result = super().element_indexing(index)
+        if cudf.get_option("mode.pandas_compatible"):
+            return pd.Timestamp(result)
+        return result
+
     def get_dt_field(self, field: str) -> ColumnBase:
         return libcudf.datetime.extract_datetime_component(self, field)
 
diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py
index 0d24e8e5120..c5ed889b5dc 100644
--- a/python/cudf/cudf/core/column/timedelta.py
+++ b/python/cudf/cudf/core/column/timedelta.py
@@ -3,6 +3,7 @@
 from __future__ import annotations
 
 import datetime
+import functools
 from typing import Any, Optional, Sequence, cast
 
 import numpy as np
@@ -19,13 +20,6 @@
 from cudf.utils.dtypes import np_to_pa_dtype
 from cudf.utils.utils import _all_bools_with_nulls
 
-_dtype_to_format_conversion = {
-    "timedelta64[ns]": "%D days %H:%M:%S",
-    "timedelta64[us]": "%D days %H:%M:%S",
-    "timedelta64[ms]": "%D days %H:%M:%S",
-    "timedelta64[s]": "%D days %H:%M:%S",
-}
-
 _unit_to_nanoseconds_conversion = {
     "ns": 1,
     "us": 1_000,
@@ -87,6 +81,8 @@ def __init__(
         null_count: Optional[int] = None,
     ):
         dtype = cudf.dtype(dtype)
+        if dtype.kind != "m":
+            raise TypeError(f"{self.dtype} is not a supported duration type")
 
         if data.size % dtype.itemsize:
             raise ValueError("Buffer size must be divisible by element size")
@@ -102,14 +98,9 @@ def __init__(
             null_count=null_count,
         )
 
-        if self.dtype.type is not np.timedelta64:
-            raise TypeError(f"{self.dtype} is not a supported duration type")
-
-        self._time_unit, _ = np.datetime_data(self.dtype)
-
     def __contains__(self, item: DatetimeLikeScalar) -> bool:
         try:
-            item = np.timedelta64(item, self._time_unit)
+            item = np.timedelta64(item, self.time_unit)
         except ValueError:
             # If item cannot be converted to duration type
             # np.timedelta64 raises ValueError, hence `item`
@@ -126,6 +117,12 @@ def values(self):
             "TimeDelta Arrays is not yet implemented in cudf"
         )
 
+    def element_indexing(self, index: int):
+        result = super().element_indexing(index)
+        if cudf.get_option("mode.pandas_compatible"):
+            return pd.Timedelta(result)
+        return result
+
     @acquire_spill_lock()
     def to_arrow(self) -> pa.Array:
         mask = None
@@ -219,16 +216,12 @@ def normalize_binop_value(self, other) -> ColumnBinaryOperand:
             "Cannot perform binary operation on timezone-naive columns"
             " and timezone-aware timestamps."
         )
-        if isinstance(other, pd.Timestamp):
-            if other.tz is not None:
+        if isinstance(other, datetime.datetime):
+            if other.tzinfo is not None:
                 raise NotImplementedError(tz_error_msg)
-            other = other.to_datetime64()
-        elif isinstance(other, pd.Timedelta):
-            other = other.to_timedelta64()
+            other = pd.Timestamp(other).to_datetime64()
         elif isinstance(other, datetime.timedelta):
-            other = np.timedelta64(other)
-        elif isinstance(other, datetime.datetime) and other.tzinfo is not None:
-            raise NotImplementedError(tz_error_msg)
+            other = pd.Timedelta(other).to_timedelta64()
 
         if isinstance(other, np.timedelta64):
             other_time_unit = cudf.utils.dtypes.get_time_unit(other)
@@ -245,13 +238,13 @@ def normalize_binop_value(self, other) -> ColumnBinaryOperand:
             else:
                 common_dtype = determine_out_dtype(self.dtype, other.dtype)
             return cudf.Scalar(other.astype(common_dtype))
-        elif np.isscalar(other):
+        elif is_scalar(other):
             return cudf.Scalar(other)
         return NotImplemented
 
-    @property
+    @functools.cached_property
     def time_unit(self) -> str:
-        return self._time_unit
+        return np.datetime_data(self.dtype)[0]
 
     def fillna(
         self,
@@ -292,9 +285,7 @@ def as_string_column(
         self, dtype: Dtype, format: str | None = None
     ) -> "cudf.core.column.StringColumn":
         if format is None:
-            format = _dtype_to_format_conversion.get(
-                self.dtype.name, "%D days %H:%M:%S"
-            )
+            format = "%D days %H:%M:%S"
         if len(self) > 0:
             return string._timedelta_to_str_typecast_functions[
                 cudf.dtype(self.dtype)
@@ -479,7 +470,7 @@ def components(self, index=None) -> "cudf.DataFrame":
                     _unit_to_nanoseconds_conversion[value[1]], "ns"
                 ).astype(self.dtype)
             )
-            if self._time_unit == value[1]:
+            if self.time_unit == value[1]:
                 break
 
         for name in keys_list:
@@ -571,7 +562,7 @@ def nanoseconds(self) -> "cudf.core.column.NumericalColumn":
         # performing division operation to extract the number
         # of nanoseconds.
 
-        if self._time_unit != "ns":
+        if self.time_unit != "ns":
             res_col = column.as_column(0, length=len(self), dtype="int64")
             if self.nullable:
                 res_col = res_col.set_mask(self.mask)
diff --git a/python/cudf/cudf/tests/series/test_datetimelike.py b/python/cudf/cudf/tests/series/test_datetimelike.py
index 98be7045923..6ee339ee3ea 100644
--- a/python/cudf/cudf/tests/series/test_datetimelike.py
+++ b/python/cudf/cudf/tests/series/test_datetimelike.py
@@ -203,3 +203,18 @@ def test_tz_aware_attributes_local():
     result = dti.hour
     expected = cudf.Index([9, 9, 9], dtype="int16")
     assert_eq(result, expected)
+
+
+@pytest.mark.parametrize(
+    "item, expected",
+    [
+        ["2020-01-01", False],
+        ["2020-01-01T00:00:00+00:00", True],
+        ["2020-01-01T00:00:00-08:00", False],
+        ["2019-12-31T16:00:00-08:00", True],
+    ],
+)
+def test_contains_tz_aware(item, expected):
+    dti = cudf.date_range("2020", periods=2, freq="D").tz_localize("UTC")
+    result = item in dti
+    assert result == expected

From 9ae32fef59172bf5901e14553b106cf840d524c6 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Fri, 5 Apr 2024 11:59:05 -0400
Subject: [PATCH 016/842] Fix debug build errors from to_arrow_device_test.cpp
 (#15463)

Fixes debug build failures resulting from changes from #15047. Here are some of the errors reported by the compiler:
```
Building CXX object tests/CMakeFiles/INTEROP_TEST.dir/interop/to_arrow_device_test.cpp.o
FAILED: tests/CMakeFiles/INTEROP_TEST.dir/interop/to_arrow_device_test.cpp.o
/usr/local/bin/g++ -DFMT_HEADER_ONLY=1 -DLIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE -DNANOARROW_DEBUG -DSPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_INFO -DSPDLOG_FMT_EXTERNAL -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_CUDA -DTHRUST_DISABLE_ABI_NAMESPACE -DTHRUST_HOST_SYSTEM=THRUST_HOST_SYSTEM_CPP -DTHRUST_IGNORE_ABI_NAMESPACE_ERROR -I/cudf/cpp -I/cudf/cpp/src -I/cudf/cpp/build/_deps/dlpack-src/include -I/cudf/cpp/build/_deps/jitify-src -I/cudf/cpp/include -I/cudf/cpp/build/include -I/cudf/cpp/build/_deps/cccl-src/thrust/thrust/cmake/../.. -I/cudf/cpp/build/_deps/cccl-src/libcudacxx/lib/cmake/libcudacxx/../../../include -I/cudf/cpp/build/_deps/cccl-src/cub/cub/cmake/../.. -I/cudf/cpp/build/_deps/nvtx3-src/c/include -I/cudf/cpp/build/_deps/nanoarrow-src/src -I/cudf/cpp/build/_deps/nanoarrow-build/generated -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /conda/envs/rapids/include -fdiagnostics-color=always  -I/conda/envs/rapids/targets/x86_64-linux/include  -L/conda/envs/rapids/targets/x86_64-linux/lib -L/conda/envs/rapids/targets/x86_64-linux/lib/stubs -g -std=gnu++17 -fPIE -Wall -Werror -Wno-unknown-pragmas -Wno-error=deprecated-declarations -pthread -MD -MT tests/CMakeFiles/INTEROP_TEST.dir/interop/to_arrow_device_test.cpp.o -MF tests/CMakeFiles/INTEROP_TEST.dir/interop/to_arrow_device_test.cpp.o.d -o tests/CMakeFiles/INTEROP_TEST.dir/interop/to_arrow_device_test.cpp.o -c /cudf/cpp/tests/interop/to_arrow_device_test.cpp
In file included from /cudf/cpp/tests/interop/to_arrow_device_test.cpp:17:
/cudf/cpp/tests/interop/nanoarrow_utils.hpp: In function 'void populate_list_from_col(ArrowArray*, cudf::lists_column_view)':
/cudf/cpp/tests/interop/nanoarrow_utils.hpp:220:26: error: ignoring return value of 'ArrowErrorCode ArrowBufferSetAllocator(ArrowBuffer*, ArrowBufferAllocator)' declared with attribute 'warn_unused_result' [-Werror=unused-result]
  220 |   ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 0), noop_alloc);
      |   ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/cudf/cpp/tests/interop/nanoarrow_utils.hpp:224:26: error: ignoring return value of 'ArrowErrorCode ArrowBufferSetAllocator(ArrowBuffer*, ArrowBufferAllocator)' declared with attribute 'warn_unused_result' [-Werror=unused-result]
  224 |   ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 1), noop_alloc);
      |   ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/cudf/cpp/tests/interop/to_arrow_device_test.cpp: In member function 'void BaseArrowFixture::compare_arrays(const ArrowSchema*, const ArrowArray*, const ArrowArray*)':
/cudf/cpp/tests/interop/to_arrow_device_test.cpp:268:24: error: ignoring return value of 'ArrowErrorCode cudfArrowSchemaViewInit(ArrowSchemaView*, const ArrowSchema*, ArrowError*)' declared with attribute 'warn_unused_result' [-Werror=unused-result]
  268 |     ArrowSchemaViewInit(&schema_view, schema, nullptr);
/cudf/cpp/tests/interop/to_arrow_device_test.cpp: In member function 'virtual void ToArrowDeviceTest_DateTimeTable_Test::TestBody()':
/cudf/cpp/tests/interop/to_arrow_device_test.cpp:353:27: error: ignoring return value of 'ArrowErrorCode cudfArrowSchemaSetTypeStruct(ArrowSchema*, int64_t)' declared with attribute 'warn_unused_result' [-Werror=unused-result]
  353 |   ArrowSchemaSetTypeStruct(expected_schema.get(), 1);
/cudf/cpp/tests/interop/to_arrow_device_test.cpp:355:29: error: ignoring return value of 'ArrowErrorCode cudfArrowSchemaSetTypeDateTime(ArrowSchema*, ArrowType, ArrowTimeUnit, const char*)' declared with attribute 'warn_unused_result' [-Werror=unused-result]

(many more)
```
Warning are turned into errors so the build fails.
Fix simply adds the `NANOARROW_THROW_NOT_OK` to the offending calls.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Mike Wilson (https://github.com/hyperbolic2346)

URL: https://github.com/rapidsai/cudf/pull/15463
---
 cpp/tests/interop/nanoarrow_utils.hpp      |  20 +--
 cpp/tests/interop/to_arrow_device_test.cpp | 140 ++++++++++++---------
 2 files changed, 88 insertions(+), 72 deletions(-)

diff --git a/cpp/tests/interop/nanoarrow_utils.hpp b/cpp/tests/interop/nanoarrow_utils.hpp
index e7ffa9e40f4..c4b53282402 100644
--- a/cpp/tests/interop/nanoarrow_utils.hpp
+++ b/cpp/tests/interop/nanoarrow_utils.hpp
@@ -65,10 +65,10 @@ std::enable_if_t<cudf::is_fixed_width<T>() and !std::is_same_v<T, bool>, void> p
 {
   arr->length     = view.size();
   arr->null_count = view.null_count();
-  ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 0), noop_alloc);
+  NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 0), noop_alloc));
   ArrowArrayValidityBitmap(arr)->buffer.data =
     const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(view.null_mask()));
-  ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 1), noop_alloc);
+  NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 1), noop_alloc));
   ArrowArrayBuffer(arr, 1)->data = const_cast<uint8_t*>(view.data<uint8_t>());
 }
 
@@ -109,20 +109,20 @@ std::enable_if_t<std::is_same_v<T, bool>, void> populate_from_col(ArrowArray* ar
 {
   arr->length     = view.size();
   arr->null_count = view.null_count();
-  ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 0), noop_alloc);
+  NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 0), noop_alloc));
   ArrowArrayValidityBitmap(arr)->buffer.data =
     const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(view.null_mask()));
 
   auto bitmask = cudf::bools_to_mask(view);
   auto ptr     = reinterpret_cast<uint8_t*>(bitmask.first->data());
-  ArrowBufferSetAllocator(
+  NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator(
     ArrowArrayBuffer(arr, 1),
     ArrowBufferDeallocator(
       [](ArrowBufferAllocator* alloc, uint8_t*, int64_t) {
         auto buf = reinterpret_cast<std::unique_ptr<rmm::device_buffer>*>(alloc->private_data);
         delete buf;
       },
-      new std::unique_ptr<rmm::device_buffer>(std::move(bitmask.first))));
+      new std::unique_ptr<rmm::device_buffer>(std::move(bitmask.first)))));
   ArrowArrayBuffer(arr, 1)->data = ptr;
 }
 
@@ -160,14 +160,14 @@ std::enable_if_t<std::is_same_v<T, cudf::string_view>, void> populate_from_col(
 {
   arr->length     = view.size();
   arr->null_count = view.null_count();
-  ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 0), noop_alloc);
+  NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 0), noop_alloc));
   ArrowArrayValidityBitmap(arr)->buffer.data =
     const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(view.null_mask()));
 
   cudf::strings_column_view sview{view};
-  ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 1), noop_alloc);
+  NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 1), noop_alloc));
   ArrowArrayBuffer(arr, 1)->data = const_cast<uint8_t*>(sview.offsets().data<uint8_t>());
-  ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 2), noop_alloc);
+  NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 2), noop_alloc));
   ArrowArrayBuffer(arr, 2)->data = const_cast<uint8_t*>(view.data<uint8_t>());
 }
 
@@ -217,10 +217,10 @@ void populate_list_from_col(ArrowArray* arr, cudf::lists_column_view view)
   arr->length     = view.size();
   arr->null_count = view.null_count();
 
-  ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 0), noop_alloc);
+  NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 0), noop_alloc));
   ArrowArrayValidityBitmap(arr)->buffer.data =
     const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(view.null_mask()));
 
-  ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 1), noop_alloc);
+  NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 1), noop_alloc));
   ArrowArrayBuffer(arr, 1)->data = const_cast<uint8_t*>(view.offsets().data<uint8_t>());
 }
diff --git a/cpp/tests/interop/to_arrow_device_test.cpp b/cpp/tests/interop/to_arrow_device_test.cpp
index 243aa4e81af..16aab53a249 100644
--- a/cpp/tests/interop/to_arrow_device_test.cpp
+++ b/cpp/tests/interop/to_arrow_device_test.cpp
@@ -265,7 +265,7 @@ struct BaseArrowFixture : public cudf::test::BaseFixture {
                       const ArrowArray* actual)
   {
     ArrowSchemaView schema_view;
-    ArrowSchemaViewInit(&schema_view, schema, nullptr);
+    NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&schema_view, schema, nullptr));
 
     EXPECT_EQ(expected->length, actual->length);
     EXPECT_EQ(expected->null_count, actual->null_count);
@@ -350,11 +350,11 @@ TEST_F(ToArrowDeviceTest, DateTimeTable)
     cudf::to_arrow_schema(input.view(), std::vector<cudf::column_metadata>{{"a"}});
   nanoarrow::UniqueSchema expected_schema;
   ArrowSchemaInit(expected_schema.get());
-  ArrowSchemaSetTypeStruct(expected_schema.get(), 1);
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1));
   ArrowSchemaInit(expected_schema->children[0]);
-  ArrowSchemaSetTypeDateTime(
-    expected_schema->children[0], NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_MILLI, nullptr);
-  ArrowSchemaSetName(expected_schema->children[0], "a");
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDateTime(
+    expected_schema->children[0], NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_MILLI, nullptr));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a"));
   expected_schema->children[0]->flags = 0;
 
   compare_schemas(expected_schema.get(), got_arrow_schema.get());
@@ -395,7 +395,7 @@ TYPED_TEST(ToArrowDeviceTestDurationsTest, DurationTable)
 
   nanoarrow::UniqueSchema expected_schema;
   ArrowSchemaInit(expected_schema.get());
-  ArrowSchemaSetTypeStruct(expected_schema.get(), 1);
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1));
 
   ArrowSchemaInit(expected_schema->children[0]);
   const ArrowTimeUnit arrow_unit = [&] {
@@ -407,9 +407,9 @@ TYPED_TEST(ToArrowDeviceTestDurationsTest, DurationTable)
       default: CUDF_FAIL("Unsupported duration unit in arrow");
     }
   }();
-  ArrowSchemaSetTypeDateTime(
-    expected_schema->children[0], NANOARROW_TYPE_DURATION, arrow_unit, nullptr);
-  ArrowSchemaSetName(expected_schema->children[0], "a");
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDateTime(
+    expected_schema->children[0], NANOARROW_TYPE_DURATION, arrow_unit, nullptr));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a"));
   expected_schema->children[0]->flags = 0;
 
   auto got_arrow_schema =
@@ -450,19 +450,22 @@ TEST_F(ToArrowDeviceTest, NestedList)
 
   nanoarrow::UniqueSchema expected_schema;
   ArrowSchemaInit(expected_schema.get());
-  ArrowSchemaSetTypeStruct(expected_schema.get(), 1);
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1));
 
-  ArrowSchemaInitFromType(expected_schema->children[0], NANOARROW_TYPE_LIST);
-  ArrowSchemaSetName(expected_schema->children[0], "a");
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaInitFromType(expected_schema->children[0], NANOARROW_TYPE_LIST));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a"));
   expected_schema->children[0]->flags = ARROW_FLAG_NULLABLE;
 
-  ArrowSchemaInitFromType(expected_schema->children[0]->children[0], NANOARROW_TYPE_LIST);
-  ArrowSchemaSetName(expected_schema->children[0]->children[0], "element");
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaInitFromType(expected_schema->children[0]->children[0], NANOARROW_TYPE_LIST));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0]->children[0], "element"));
   expected_schema->children[0]->children[0]->flags = 0;
 
-  ArrowSchemaInitFromType(expected_schema->children[0]->children[0]->children[0],
-                          NANOARROW_TYPE_INT64);
-  ArrowSchemaSetName(expected_schema->children[0]->children[0]->children[0], "element");
+  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(
+    expected_schema->children[0]->children[0]->children[0], NANOARROW_TYPE_INT64));
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaSetName(expected_schema->children[0]->children[0]->children[0], "element"));
   expected_schema->children[0]->children[0]->children[0]->flags = ARROW_FLAG_NULLABLE;
 
   auto got_arrow_schema =
@@ -481,7 +484,8 @@ TEST_F(ToArrowDeviceTest, NestedList)
   populate_list_from_col(top_list->children[0], nested_view);
   populate_from_col<int64_t>(top_list->children[0]->children[0], nested_view.child());
 
-  ArrowArrayFinishBuilding(expected_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr);
+  NANOARROW_THROW_NOT_OK(
+    ArrowArrayFinishBuilding(expected_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr));
 
   auto got_arrow_array = cudf::to_arrow_device(std::move(input));
   EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id);
@@ -537,44 +541,49 @@ TEST_F(ToArrowDeviceTest, StructColumn)
 
   nanoarrow::UniqueSchema expected_schema;
   ArrowSchemaInit(expected_schema.get());
-  ArrowSchemaSetTypeStruct(expected_schema.get(), 1);
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1));
 
   ArrowSchemaInit(expected_schema->children[0]);
-  ArrowSchemaSetTypeStruct(expected_schema->children[0], 5);
-  ArrowSchemaSetName(expected_schema->children[0], "a");
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema->children[0], 5));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a"));
   expected_schema->children[0]->flags = 0;
 
   auto child = expected_schema->children[0];
-  ArrowSchemaInitFromType(child->children[0], NANOARROW_TYPE_STRING);
-  ArrowSchemaSetName(child->children[0], "string");
+  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(child->children[0], NANOARROW_TYPE_STRING));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[0], "string"));
   child->children[0]->flags = 0;
 
-  ArrowSchemaInitFromType(child->children[1], NANOARROW_TYPE_INT32);
-  ArrowSchemaSetName(child->children[1], "integral");
+  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(child->children[1], NANOARROW_TYPE_INT32));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[1], "integral"));
   child->children[1]->flags = 0;
 
-  ArrowSchemaInitFromType(child->children[2], NANOARROW_TYPE_BOOL);
-  ArrowSchemaSetName(child->children[2], "bool");
+  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(child->children[2], NANOARROW_TYPE_BOOL));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[2], "bool"));
   child->children[2]->flags = 0;
 
-  ArrowSchemaInitFromType(child->children[3], NANOARROW_TYPE_LIST);
-  ArrowSchemaSetName(child->children[3], "nested_list");
+  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(child->children[3], NANOARROW_TYPE_LIST));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[3], "nested_list"));
   child->children[3]->flags = 0;
-  ArrowSchemaInitFromType(child->children[3]->children[0], NANOARROW_TYPE_LIST);
-  ArrowSchemaSetName(child->children[3]->children[0], "element");
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaInitFromType(child->children[3]->children[0], NANOARROW_TYPE_LIST));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[3]->children[0], "element"));
   child->children[3]->children[0]->flags = 0;
-  ArrowSchemaInitFromType(child->children[3]->children[0]->children[0], NANOARROW_TYPE_INT64);
-  ArrowSchemaSetName(child->children[3]->children[0]->children[0], "element");
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaInitFromType(child->children[3]->children[0]->children[0], NANOARROW_TYPE_INT64));
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaSetName(child->children[3]->children[0]->children[0], "element"));
   child->children[3]->children[0]->children[0]->flags = 0;
 
   ArrowSchemaInit(child->children[4]);
-  ArrowSchemaSetTypeStruct(child->children[4], 2);
-  ArrowSchemaSetName(child->children[4], "struct");
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(child->children[4], 2));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[4], "struct"));
 
-  ArrowSchemaInitFromType(child->children[4]->children[0], NANOARROW_TYPE_STRING);
-  ArrowSchemaSetName(child->children[4]->children[0], "string2");
-  ArrowSchemaInitFromType(child->children[4]->children[1], NANOARROW_TYPE_INT32);
-  ArrowSchemaSetName(child->children[4]->children[1], "integral2");
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaInitFromType(child->children[4]->children[0], NANOARROW_TYPE_STRING));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[4]->children[0], "string2"));
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaInitFromType(child->children[4]->children[1], NANOARROW_TYPE_INT32));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[4]->children[1], "integral2"));
 
   auto got_arrow_schema =
     cudf::to_arrow_schema(input.view(), std::vector<cudf::column_metadata>{metadata});
@@ -582,7 +591,8 @@ TEST_F(ToArrowDeviceTest, StructColumn)
   ArrowSchemaRelease(got_arrow_schema.get());
 
   nanoarrow::UniqueArray expected_array;
-  ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr);
+  NANOARROW_THROW_NOT_OK(
+    ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr));
 
   expected_array->length = input.num_rows();
 
@@ -591,7 +601,7 @@ TEST_F(ToArrowDeviceTest, StructColumn)
   array_a->length     = view_a.size();
   array_a->null_count = view_a.null_count();
 
-  ArrowBufferSetAllocator(ArrowArrayBuffer(array_a, 0), noop_alloc);
+  NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator(ArrowArrayBuffer(array_a, 0), noop_alloc));
   ArrowArrayValidityBitmap(array_a)->buffer.data =
     const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(view_a.null_mask()));
 
@@ -609,14 +619,15 @@ TEST_F(ToArrowDeviceTest, StructColumn)
   array_struct->length     = view_struct.size();
   array_struct->null_count = view_struct.null_count();
 
-  ArrowBufferSetAllocator(ArrowArrayBuffer(array_struct, 0), noop_alloc);
+  NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator(ArrowArrayBuffer(array_struct, 0), noop_alloc));
   ArrowArrayValidityBitmap(array_struct)->buffer.data =
     const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(view_struct.null_mask()));
 
   populate_from_col<cudf::string_view>(array_struct->children[0], view_struct.child(0));
   populate_from_col<int32_t>(array_struct->children[1], view_struct.child(1));
 
-  ArrowArrayFinishBuilding(expected_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr);
+  NANOARROW_THROW_NOT_OK(
+    ArrowArrayFinishBuilding(expected_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr));
 
   auto got_arrow_array = cudf::to_arrow_device(std::move(input));
   EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id);
@@ -642,13 +653,13 @@ TEST_F(ToArrowDeviceTest, FixedPoint64Table)
 
     nanoarrow::UniqueSchema expected_schema;
     ArrowSchemaInit(expected_schema.get());
-    ArrowSchemaSetTypeStruct(expected_schema.get(), 1);
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1));
     ArrowSchemaInit(expected_schema->children[0]);
-    ArrowSchemaSetTypeDecimal(expected_schema->children[0],
-                              NANOARROW_TYPE_DECIMAL128,
-                              cudf::detail::max_precision<int64_t>(),
-                              -scale);
-    ArrowSchemaSetName(expected_schema->children[0], "a");
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDecimal(expected_schema->children[0],
+                                                     NANOARROW_TYPE_DECIMAL128,
+                                                     cudf::detail::max_precision<int64_t>(),
+                                                     -scale));
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a"));
     expected_schema->children[0]->flags = 0;
 
     auto got_arrow_schema =
@@ -665,16 +676,18 @@ TEST_F(ToArrowDeviceTest, FixedPoint64Table)
 
     cudf::get_default_stream().synchronize();
     nanoarrow::UniqueArray expected_array;
-    ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr);
+    NANOARROW_THROW_NOT_OK(
+      ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr));
     expected_array->length = input.num_rows();
 
     expected_array->children[0]->length = input.num_rows();
-    ArrowBufferSetAllocator(ArrowArrayBuffer(expected_array->children[0], 0), noop_alloc);
+    NANOARROW_THROW_NOT_OK(
+      ArrowBufferSetAllocator(ArrowArrayBuffer(expected_array->children[0], 0), noop_alloc));
     ArrowArrayValidityBitmap(expected_array->children[0])->buffer.data =
       const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(input.view().column(0).null_mask()));
 
     auto data_ptr = reinterpret_cast<uint8_t*>(result_dev_data->data());
-    ArrowBufferSetAllocator(
+    NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator(
       ArrowArrayBuffer(expected_array->children[0], 1),
       ArrowBufferDeallocator(
         [](ArrowBufferAllocator* alloc, uint8_t*, int64_t) {
@@ -682,9 +695,10 @@ TEST_F(ToArrowDeviceTest, FixedPoint64Table)
             reinterpret_cast<std::unique_ptr<rmm::device_uvector<int64_t>>*>(alloc->private_data);
           delete buf;
         },
-        new std::unique_ptr<rmm::device_uvector<int64_t>>(std::move(result_dev_data))));
+        new std::unique_ptr<rmm::device_uvector<int64_t>>(std::move(result_dev_data)))));
     ArrowArrayBuffer(expected_array->children[0], 1)->data = data_ptr;
-    ArrowArrayFinishBuilding(expected_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr);
+    NANOARROW_THROW_NOT_OK(
+      ArrowArrayFinishBuilding(expected_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr));
 
     auto got_arrow_array = cudf::to_arrow_device(std::move(input));
     ASSERT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id);
@@ -708,13 +722,13 @@ TEST_F(ToArrowDeviceTest, FixedPoint128Table)
 
     nanoarrow::UniqueSchema expected_schema;
     ArrowSchemaInit(expected_schema.get());
-    ArrowSchemaSetTypeStruct(expected_schema.get(), 1);
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1));
     ArrowSchemaInit(expected_schema->children[0]);
-    ArrowSchemaSetTypeDecimal(expected_schema->children[0],
-                              NANOARROW_TYPE_DECIMAL128,
-                              cudf::detail::max_precision<__int128_t>(),
-                              -scale);
-    ArrowSchemaSetName(expected_schema->children[0], "a");
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDecimal(expected_schema->children[0],
+                                                     NANOARROW_TYPE_DECIMAL128,
+                                                     cudf::detail::max_precision<__int128_t>(),
+                                                     -scale));
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a"));
     expected_schema->children[0]->flags = 0;
 
     auto got_arrow_schema =
@@ -723,11 +737,13 @@ TEST_F(ToArrowDeviceTest, FixedPoint128Table)
     ArrowSchemaRelease(got_arrow_schema.get());
 
     nanoarrow::UniqueArray expected_array;
-    ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr);
+    NANOARROW_THROW_NOT_OK(
+      ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr));
     expected_array->length = input.num_rows();
 
     populate_from_col<__int128_t>(expected_array->children[0], input.view().column(0));
-    ArrowArrayFinishBuilding(expected_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr);
+    NANOARROW_THROW_NOT_OK(
+      ArrowArrayFinishBuilding(expected_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr));
 
     auto got_arrow_array = cudf::to_arrow_device(std::move(input));
     EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id);

From 363db505e46970668207e6d28f22653a831cc3d5 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 5 Apr 2024 08:47:24 -1000
Subject: [PATCH 017/842] Use cached_property for NumericColumn.nan_count
 instead of ._nan_count variable (#15466)

Small cleanup that results in the same functionality

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15466
---
 python/cudf/cudf/core/column/numerical.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index b2bd73c9856..f42c87de3fd 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+import functools
 from typing import (
     Any,
     Callable,
@@ -75,7 +76,6 @@ class NumericalColumn(NumericalBaseColumn):
     mask : Buffer, optional
     """
 
-    _nan_count: Optional[int]
     _VALID_BINARY_OPERATIONS = BinaryOperand._SUPPORTED_BINARY_OPERATIONS
 
     def __init__(
@@ -93,7 +93,6 @@ def __init__(
             raise ValueError("Buffer size must be divisible by element size")
         if size is None:
             size = (data.size // dtype.itemsize) - offset
-        self._nan_count = None
         super().__init__(
             data,
             size=size,
@@ -105,7 +104,10 @@ def __init__(
 
     def _clear_cache(self):
         super()._clear_cache()
-        self._nan_count = None
+        try:
+            del self.nan_count
+        except AttributeError:
+            pass
 
     def __contains__(self, item: ScalarLike) -> bool:
         """
@@ -424,14 +426,12 @@ def any(self, skipna: bool = True) -> bool:
 
         return libcudf.reduce.reduce("any", result_col, dtype=np.bool_)
 
-    @property
+    @functools.cached_property
     def nan_count(self) -> int:
         if self.dtype.kind != "f":
-            self._nan_count = 0
-        elif self._nan_count is None:
-            nan_col = libcudf.unary.is_nan(self)
-            self._nan_count = nan_col.sum()
-        return self._nan_count
+            return 0
+        nan_col = libcudf.unary.is_nan(self)
+        return nan_col.sum()
 
     def _process_values_for_isin(
         self, values: Sequence

From 6319ab708f2dff9fd7a62a5c77fd3b387bde1bb8 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Fri, 5 Apr 2024 14:30:26 -0500
Subject: [PATCH 018/842] Enable all tests for `arm` arch (#15402)

This PR enables running all pytests for `arm64` jobs.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Jake Awe (https://github.com/AyodeAwe)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15402
---
 ci/test_wheel_cudf.sh       | 39 ++++++++++++++++---------------------
 ci/wheel_smoke_test_cudf.py | 13 -------------
 2 files changed, 17 insertions(+), 35 deletions(-)
 delete mode 100644 ci/wheel_smoke_test_cudf.py

diff --git a/ci/test_wheel_cudf.sh b/ci/test_wheel_cudf.sh
index a6f122491b0..fdb61278d36 100755
--- a/ci/test_wheel_cudf.sh
+++ b/ci/test_wheel_cudf.sh
@@ -13,26 +13,21 @@ RESULTS_DIR=${RAPIDS_TESTS_DIR:-"$(mktemp -d)"}
 RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${RESULTS_DIR}/test-results"}/
 mkdir -p "${RAPIDS_TESTS_DIR}"
 
-# Run smoke tests for aarch64 pull requests
-if [[ "$(arch)" == "aarch64" && ${RAPIDS_BUILD_TYPE} == "pull-request" ]]; then
-    rapids-logger "Run smoke tests for cudf"
-    python ./ci/wheel_smoke_test_cudf.py
-else
-    rapids-logger "pytest pylibcudf"
-    pushd python/cudf/cudf/pylibcudf_tests
-    python -m pytest \
-      --cache-clear \
-      --dist=worksteal \
-      .
-    popd
 
-    rapids-logger "pytest cudf"
-    pushd python/cudf/cudf/tests
-    python -m pytest \
-      --cache-clear \
-      --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf.xml" \
-      --numprocesses=8 \
-      --dist=worksteal \
-      .
-    popd
-fi
+rapids-logger "pytest pylibcudf"
+pushd python/cudf/cudf/pylibcudf_tests
+python -m pytest \
+  --cache-clear \
+  --dist=worksteal \
+  .
+popd
+
+rapids-logger "pytest cudf"
+pushd python/cudf/cudf/tests
+python -m pytest \
+  --cache-clear \
+  --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf.xml" \
+  --numprocesses=8 \
+  --dist=worksteal \
+  .
+popd
diff --git a/ci/wheel_smoke_test_cudf.py b/ci/wheel_smoke_test_cudf.py
deleted file mode 100644
index a11a97039af..00000000000
--- a/ci/wheel_smoke_test_cudf.py
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
-
-import cudf
-import pyarrow as pa
-
-if __name__ == '__main__':
-    n_legs = pa.array([2, 4, 5, 100])
-    animals = pa.array(["Flamingo", "Horse", "Brittle stars", "Centipede"])
-    names = ["n_legs", "animals"]
-    foo = pa.table([n_legs, animals], names=names)
-    df = cudf.DataFrame.from_arrow(foo)
-    assert df.loc[df["animals"] == "Centipede"]["n_legs"].iloc[0] == 100
-    assert df.loc[df["animals"] == "Flamingo"]["n_legs"].iloc[0] == 2

From 4b951ef093a7cf0ff8da3fa3c0f1c87ef719ba5c Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Fri, 5 Apr 2024 16:23:24 -0500
Subject: [PATCH 019/842] Add custom status check workflow (#15464)

This PR adds a custom workflow that creates a custom github status check to `cudf` that will run after `workflow_run` event is triggered.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - AJ Schmidt (https://github.com/ajschmidt8)
  - Ray Douglass (https://github.com/raydouglass)

URL: https://github.com/rapidsai/cudf/pull/15464
---
 .github/workflows/status.yaml | 115 ++++++++++++++++++++++++++++++++++
 1 file changed, 115 insertions(+)
 create mode 100644 .github/workflows/status.yaml

diff --git a/.github/workflows/status.yaml b/.github/workflows/status.yaml
new file mode 100644
index 00000000000..0aad4c8a23e
--- /dev/null
+++ b/.github/workflows/status.yaml
@@ -0,0 +1,115 @@
+name: Custom GH Status from Workflow Artifacts
+
+on:
+  workflow_run:
+    workflows: ["pr"]
+    types:
+      - completed
+
+jobs:
+  process_artifacts:
+    if: ${{ github.event.workflow_run.conclusion == 'success' }}
+    runs-on: ubuntu-latest
+    outputs:
+      artifact_downloaded: ${{ steps.download_artifact.outputs.artifact_downloaded }}
+    permissions:
+      actions: read
+      checks: read
+      contents: read
+      deployments: read
+      id-token: write
+      issues: read
+      discussions: read
+      packages: read
+      pages: read
+      pull-requests: read
+      repository-projects: read
+      security-events: read
+      statuses: write
+    steps:
+      - name: Download artifact
+        id: download_artifact
+        uses: actions/github-script@v7
+        with:
+          retries: 3
+          script: |
+            const fs = require('fs');
+            const path = require('path');
+            const artifactName = 'gh-status';
+
+            const allArtifacts = await github.rest.actions.listWorkflowRunArtifacts({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                run_id: context.payload.workflow_run.id,
+              });
+            // Find the specific artifact
+            const artifact = allArtifacts.data.artifacts.find(artifact => artifact.name === artifactName);
+            if (!artifact) {
+              core.info(`Artifact "${artifactName}" not found. Exiting safely.`);
+              core.setOutput('artifact_downloaded', 'false');
+              return;
+            }
+            core.setOutput('artifact_downloaded', 'true');
+            // Download the artifact
+            const download = await github.rest.actions.downloadArtifact({
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              artifact_id: artifact.id,
+              archive_format: 'zip',
+            });
+
+            // Write the artifact to a file
+            fs.writeFileSync(`${process.env.GITHUB_WORKSPACE}/${artifactName}.zip`, Buffer.from(download.data));
+
+      - name: 'Unzip artifact'
+        if: ${{ steps.download_artifact.outputs.artifact_downloaded == 'true' }}
+        run: unzip 'gh-status.zip'
+
+      - name: Create status
+        if: ${{ steps.download_artifact.outputs.artifact_downloaded == 'true' }}
+        uses: actions/github-script@v7
+        env:
+          WORKFLOW_RUN_ID: ${{ github.event.workflow_run.id }}
+          COMMIT_SHA: ${{ github.event.workflow_run.head_sha }}
+          ATTEMPTS: ${{ github.event.workflow_run.run_attempt }}
+        with:
+          retries: 3
+          script: |
+            // Load the JSON content
+            const contentJSON = require('./gh-status.json');
+            const {
+                job_name: JOB_NAME,
+                context: CUSTOM_CONTEXT = 'Custom CI Status Check',
+                description: CUSTOM_DESCRIPTION = 'Custom CI Status description',
+                target_url: CUSTOM_TARGET_URL,
+                state: CUSTOM_STATE = 'success'
+            } = contentJSON;
+
+            // Fetch the first job ID from the workflow run
+            const jobs = await github.rest.actions.listJobsForWorkflowRun({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                run_id: process.env.WORKFLOW_RUN_ID,
+            });
+            const job = jobs.data.jobs.find(job => job.name === JOB_NAME);
+            const JOB_ID = job ? job.id : null;
+
+            // Set default target URL if not defined
+            const targetUrl = CUSTOM_TARGET_URL || `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${process.env.WORKFLOW_RUN_ID}/attempts/${process.env.ATTEMPTS}#summary-${JOB_ID}`;
+
+            console.log("job id: ", JOB_ID);
+            console.log("state: ", CUSTOM_STATE);
+            console.log("target url: ", targetUrl);
+            console.log("description: ", CUSTOM_DESCRIPTION);
+            console.log("context: ", CUSTOM_CONTEXT);
+
+            // Create status
+            await github.rest.repos.createCommitStatus({
+                owner: context.repo.owner,
+                repo: context.repo.repo,
+                sha: process.env.COMMIT_SHA,
+                state: CUSTOM_STATE,
+                target_url: targetUrl,
+                description: CUSTOM_DESCRIPTION,
+                context: CUSTOM_CONTEXT,
+            });

From c5eb3240387222373043ddf881d18fb5d18e0834 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 5 Apr 2024 13:01:10 -1000
Subject: [PATCH 020/842] Refactor numpy array input in as_column (#14651)

Simplifies the numpy array input logic to `as_column` to be

```
if object/string dtype like:
    # parse with pandas with inference
elif numeric-like dtype or datelike with nat:
    # parse with pyarrow (due to np.nan/np.nat/nan_is_null handling)
else:
    # create column from buffer
```

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/14651
---
 python/cudf/cudf/core/column/column.py | 160 +++++++++----------------
 python/cudf/cudf/tests/test_column.py  |   2 +-
 python/cudf/cudf/tests/test_concat.py  |   2 +-
 python/cudf/cudf/tests/test_joining.py |   6 +-
 python/cudf/cudf/tests/test_series.py  |  20 ++--
 5 files changed, 72 insertions(+), 118 deletions(-)

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 835da36fbfd..518513c66f0 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -1702,6 +1702,14 @@ def _make_copy_replacing_NaT_with_null(column):
     return out_col
 
 
+def check_invalid_array(shape: tuple, dtype):
+    """Invalid ndarrays properties that are not supported"""
+    if len(shape) > 1:
+        raise ValueError("Data must be 1-dimensional")
+    elif dtype == "float16":
+        raise TypeError("Unsupported type float16")
+
+
 def as_memoryview(arbitrary: Any) -> Optional[memoryview]:
     try:
         return memoryview(arbitrary)
@@ -1777,12 +1785,9 @@ def as_column(
     elif hasattr(arbitrary, "__cuda_array_interface__"):
         desc = arbitrary.__cuda_array_interface__
         shape = desc["shape"]
-        if len(shape) > 1:
-            raise ValueError("Data must be 1-dimensional")
         current_dtype = np.dtype(desc["typestr"])
 
-        if current_dtype == "float16":
-            raise TypeError("Unsupported type float16")
+        check_invalid_array(shape, current_dtype)
 
         arb_dtype = cudf.dtype(current_dtype)
 
@@ -1962,7 +1967,7 @@ def as_column(
             inferred_dtype = infer_dtype(arbitrary)
             if inferred_dtype in ("mixed-integer", "mixed-integer-float"):
                 raise MixedTypeError("Cannot create column with mixed types")
-            elif inferred_dtype not in (
+            elif dtype is None and inferred_dtype not in (
                 "mixed",
                 "decimal",
                 "string",
@@ -2026,117 +2031,64 @@ def as_column(
             return ColumnBase.from_scalar(arbitrary, length)
 
     elif hasattr(arbitrary, "__array_interface__"):
-        # CUDF assumes values are always contiguous
         desc = arbitrary.__array_interface__
-        shape = desc["shape"]
-        arb_dtype = np.dtype(desc["typestr"])
+        check_invalid_array(desc["shape"], np.dtype(desc["typestr"]))
+
         # CUDF assumes values are always contiguous
-        if len(shape) > 1:
-            raise ValueError("Data must be 1-dimensional")
+        arbitrary = np.asarray(arbitrary, order="C")
 
-        arbitrary = np.asarray(arbitrary)
+        if arbitrary.ndim == 0:
+            # TODO: Or treat as scalar?
+            arbitrary = arbitrary[np.newaxis]
 
-        # Handle case that `arbitrary` elements are cupy arrays
-        if (
-            shape
-            and shape[0]
-            and hasattr(arbitrary[0], "__cuda_array_interface__")
-        ):
+        if arbitrary.dtype.kind in "OSU":
+            if pd.isna(arbitrary).any():
+                arbitrary = pa.array(arbitrary)
+            else:
+                # Let pandas potentially infer object type
+                # e.g. np.array([pd.Timestamp(...)], dtype=object) -> datetime64
+                arbitrary = pd.Series(arbitrary)
+            return as_column(arbitrary, dtype=dtype, nan_as_null=nan_as_null)
+        elif arbitrary.dtype.kind in "biuf":
+            from_pandas = nan_as_null is None or nan_as_null
             return as_column(
-                cupy.asarray(arbitrary, dtype=arbitrary[0].dtype),
-                nan_as_null=nan_as_null,
+                pa.array(arbitrary, from_pandas=from_pandas),
                 dtype=dtype,
-                length=length,
+                nan_as_null=nan_as_null,
             )
-
-        if not arbitrary.flags["C_CONTIGUOUS"]:
-            arbitrary = np.ascontiguousarray(arbitrary)
-
-        delayed_cast = False
-        if dtype is not None:
-            try:
-                dtype = np.dtype(dtype)
-            except TypeError:
-                # Some `dtype`'s can't be parsed by `np.dtype`
-                # for which we will have to cast after the column
-                # has been constructed.
-                delayed_cast = True
-            else:
-                arbitrary = arbitrary.astype(dtype)
-
-        if arb_dtype.kind == "M":
+        elif arbitrary.dtype.kind in "mM":
             time_unit = get_time_unit(arbitrary)
-            cast_dtype = time_unit in ("D", "W", "M", "Y")
-
-            if cast_dtype:
-                arbitrary = arbitrary.astype(cudf.dtype("datetime64[s]"))
+            if time_unit in ("D", "W", "M", "Y"):
+                # TODO: Raise in these cases instead of downcasting to s?
+                new_type = f"{arbitrary.dtype.type.__name__}[s]"
+                arbitrary = arbitrary.astype(new_type)
+            elif time_unit == "generic":
+                # TODO: This should probably be in cudf.dtype
+                raise TypeError(
+                    f"{arbitrary.dtype.type.__name__} must have a unit specified"
+                )
 
-            buffer = as_buffer(arbitrary.view("|u1"))
-            if nan_as_null is None or nan_as_null is True:
-                data = build_column(buffer, dtype=arbitrary.dtype)
-                data = _make_copy_replacing_NaT_with_null(data)
-                mask = data.mask
-            else:
-                bool_mask = as_column(~np.isnat(arbitrary))
+            is_nat = np.isnat(arbitrary)
+            mask = None
+            if is_nat.any():
+                if nan_as_null is None or nan_as_null:
+                    # Convert NaT to NA, which pyarrow does by default
+                    return as_column(
+                        pa.array(arbitrary),
+                        dtype=dtype,
+                        nan_as_null=nan_as_null,
+                    )
+                # Consider NaT as NA in the mask
+                # but maintain NaT as a value
+                bool_mask = as_column(~is_nat)
                 mask = as_buffer(bools_to_mask(bool_mask))
-
-            data = build_column(data=buffer, mask=mask, dtype=arbitrary.dtype)
-        elif arb_dtype.kind == "m":
-            time_unit = get_time_unit(arbitrary)
-            cast_dtype = time_unit in ("D", "W", "M", "Y")
-
-            if cast_dtype:
-                arbitrary = arbitrary.astype(cudf.dtype("timedelta64[s]"))
-
             buffer = as_buffer(arbitrary.view("|u1"))
-            if nan_as_null is None or nan_as_null is True:
-                data = build_column(buffer, dtype=arbitrary.dtype)
-                data = _make_copy_replacing_NaT_with_null(data)
-                mask = data.mask
-            else:
-                bool_mask = as_column(~np.isnat(arbitrary))
-                mask = as_buffer(bools_to_mask(bool_mask))
-
-            data = cudf.core.column.timedelta.TimeDeltaColumn(
-                data=buffer,
-                size=len(arbitrary),
-                mask=mask,
-                dtype=arbitrary.dtype,
-            )
-        elif (
-            arbitrary.size != 0
-            and arb_dtype.kind in ("O")
-            and isinstance(arbitrary[0], pd.Interval)
-        ):
-            # changing from pd array to series,possible arrow bug
-            interval_series = pd.Series(arbitrary)
-            data = as_column(
-                pa.Array.from_pandas(interval_series),
-                dtype=arbitrary.dtype,
-            )
-            if dtype is not None:
-                data = data.astype(dtype)
-        elif arb_dtype.kind in ("O", "U"):
-            data = as_column(pa.array(arbitrary), dtype=dtype)
-            # There is no cast operation available for pa.Array from int to
-            # str, Hence instead of handling in pa.Array block, we
-            # will have to type-cast here.
-            if dtype is not None:
-                data = data.astype(dtype)
-        elif arb_dtype.kind in ("f"):
-            if arb_dtype == np.dtype("float16"):
-                raise TypeError("Unsupported type float16")
-            arb_dtype = cudf.dtype(arb_dtype if dtype is None else dtype)
-            data = as_column(
-                cupy.asarray(arbitrary, dtype=arb_dtype),
-                nan_as_null=nan_as_null,
-            )
+            col = build_column(data=buffer, mask=mask, dtype=arbitrary.dtype)
+            if dtype:
+                col = col.astype(dtype)
+            return col
         else:
-            data = as_column(cupy.asarray(arbitrary), nan_as_null=nan_as_null)
-
-        if delayed_cast:
-            data = data.astype(cudf.dtype(dtype))
-
+            raise NotImplementedError(f"{arbitrary.dtype} not supported")
     elif (view := as_memoryview(arbitrary)) is not None:
         return as_column(
             np.asarray(view), dtype=dtype, nan_as_null=nan_as_null
diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py
index 8e8555b2005..2f70f955fa9 100644
--- a/python/cudf/cudf/tests/test_column.py
+++ b/python/cudf/cudf/tests/test_column.py
@@ -176,7 +176,7 @@ def test_column_series_multi_dim(data):
 @pytest.mark.parametrize(
     ("data", "error"),
     [
-        ([1, "1.0", "2", -3], pa.lib.ArrowInvalid),
+        ([1, "1.0", "2", -3], cudf.errors.MixedTypeError),
         ([np.nan, 0, "null", cp.nan], pa.lib.ArrowInvalid),
         (
             [np.int32(4), np.float64(1.5), np.float32(1.290994), np.int8(0)],
diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py
index cdb47ea79d8..3d638da924b 100644
--- a/python/cudf/cudf/tests/test_concat.py
+++ b/python/cudf/cudf/tests/test_concat.py
@@ -1705,7 +1705,7 @@ def test_concat_decimal_numeric_series(s1, s2, s3, expected):
             cudf.Series(
                 np.arange(
                     "2005-02-01T12", "2005-02-01T15", dtype="datetime64[h]"
-                ),
+                ).astype("datetime64[s]"),
                 dtype="datetime64[s]",
             ),
             cudf.Series(
diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py
index c063043b72a..f36774daab2 100644
--- a/python/cudf/cudf/tests/test_joining.py
+++ b/python/cudf/cudf/tests/test_joining.py
@@ -1527,7 +1527,7 @@ def test_categorical_typecast_outer():
         result = left.merge(right, how="outer", on="key")
 
 
-@pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["object"])
+@pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["str"])
 def test_categorical_typecast_inner_one_cat(dtype):
     data = np.array([1, 2, 3], dtype=dtype)
 
@@ -1538,7 +1538,7 @@ def test_categorical_typecast_inner_one_cat(dtype):
     assert result["key"].dtype == left["key"].dtype.categories.dtype
 
 
-@pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["object"])
+@pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["str"])
 def test_categorical_typecast_left_one_cat(dtype):
     data = np.array([1, 2, 3], dtype=dtype)
 
@@ -1549,7 +1549,7 @@ def test_categorical_typecast_left_one_cat(dtype):
     assert result["key"].dtype == left["key"].dtype
 
 
-@pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["object"])
+@pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["str"])
 def test_categorical_typecast_outer_one_cat(dtype):
     data = np.array([1, 2, 3], dtype=dtype)
 
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index 48194494260..b45857e28ad 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -2637,24 +2637,26 @@ def test_series_setitem_mixed_bool_dtype():
 @pytest.mark.parametrize(
     "nat, value",
     [
-        [np.datetime64("nat"), np.datetime64("2020-01-01")],
-        [np.timedelta64("nat"), np.timedelta64(1)],
+        [np.datetime64("nat", "ns"), np.datetime64("2020-01-01", "ns")],
+        [np.timedelta64("nat", "ns"), np.timedelta64(1, "ns")],
     ],
 )
 @pytest.mark.parametrize("nan_as_null", [True, False])
-def test_series_np_array_nat_nan_as_nulls(nat, value, request, nan_as_null):
+def test_series_np_array_nat_nan_as_nulls(nat, value, nan_as_null):
     expected = np.array([nat, value])
-    if expected.dtype.kind == "m":
-        request.applymarker(
-            pytest.mark.xfail(
-                raises=TypeError, reason="timedelta64 not supported by cupy"
-            )
-        )
     ser = cudf.Series(expected, nan_as_null=nan_as_null)
     assert ser[0] is pd.NaT
     assert ser[1] == value
 
 
+def test_series_unitness_np_datetimelike_units():
+    data = np.array([np.timedelta64(1)])
+    with pytest.raises(TypeError):
+        cudf.Series(data)
+    with pytest.raises(TypeError):
+        pd.Series(data)
+
+
 def test_series_duplicate_index_reindex():
     gs = cudf.Series([0, 1, 2, 3], index=[0, 0, 1, 1])
     ps = gs.to_pandas()

From 102d564db21df1d805c2d06571e75a96fa6d822f Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Mon, 8 Apr 2024 07:21:21 -0500
Subject: [PATCH 021/842] Enable test-reporting for pandas pytests in CI
 (#15369)

This PR enables pandas test-reporting for pandas pytests in CI by comparing against the results available in nightlies as a baseline.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Ray Douglass (https://github.com/raydouglass)

URL: https://github.com/rapidsai/cudf/pull/15369
---
 .github/workflows/pr.yaml                     | 43 ++++---------------
 ci/cudf_pandas_scripts/pandas-tests/diff.sh   | 29 +++++++++----
 .../pandas-tests/job-summary.py               |  4 +-
 3 files changed, 32 insertions(+), 44 deletions(-)

diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 2d7ebb62fa8..345ccbea45b 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -30,8 +30,7 @@ jobs:
       - devcontainer
       - unit-tests-cudf-pandas
       - pandas-tests
-      #- pandas-tests-diff
-      #- pandas-tests-diff-comment
+      - pandas-tests-diff
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.06
   checks:
@@ -180,35 +179,11 @@ jobs:
       script: ci/cudf_pandas_scripts/pandas-tests/run.sh pr
       # Hide test failures because they exceed the GITHUB_STEP_SUMMARY output limit.
       test_summary_show: "none"
-  #pandas-tests-diff:
-  #  # diff the results of running the Pandas unit tests and publish a job summary
-  #  needs: [pandas-tests-main, pandas-tests-pr]
-  #  secrets: inherit
-  #  # This branch exports a `job_output` output that the downstream job reads.
-  #  uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06
-  #  with:
-  #    node_type: cpu4
-  #    build_type: pull-request
-  #    run_script: ci/cudf_pandas_scripts/pandas-tests/diff.sh
-  #pandas-tests-diff-comment:
-  #  # Post comment of pass/fail rate on PR
-  #  runs-on: ubuntu-latest
-  #  needs: pandas-tests-diff
-  #  steps:
-  #    - uses: actions/github-script@v6
-  #      with:
-  #        script: |
-  #          const branch = process.env.GITHUB_REF_NAME;
-  #          const prBranchPattern = new RegExp("^pull-request/[0-9]+$");
-  #          if (!branch.match(prBranchPattern)) {
-  #            throw new Error(`${branch} does not match PR branch pattern.`);
-  #          }
-  #          const summary_url = `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}`;
-  #          const prNumber = branch.split("/")[1];
-  #          const summary_comment = `${{ needs.pandas-tests-diff.outputs.job_output }}`;
-  #          github.rest.issues.createComment({
-  #            issue_number: prNumber,
-  #            owner: context.repo.owner,
-  #            repo: context.repo.repo,
-  #            body: `${summary_comment}\n\nHere is [a link to the full test summary](${summary_url}).\n`
-  #          })
+  pandas-tests-diff:
+    # diff the results of running the Pandas unit tests and publish a job summary
+    needs: pandas-tests
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@patch-1
+    with:
+        node_type: cpu4
+        build_type: pull-request
+        run_script: "ci/cudf_pandas_scripts/pandas-tests/diff.sh"
diff --git a/ci/cudf_pandas_scripts/pandas-tests/diff.sh b/ci/cudf_pandas_scripts/pandas-tests/diff.sh
index 37adabdb9c6..ae5a249bcbd 100755
--- a/ci/cudf_pandas_scripts/pandas-tests/diff.sh
+++ b/ci/cudf_pandas_scripts/pandas-tests/diff.sh
@@ -1,5 +1,5 @@
 #!/usr/bin/env bash
-# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES.
 # All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
@@ -7,18 +7,31 @@
 # branch and the PR branch:
 
 # Hard-coded needs to match the version deduced by rapids-upload-artifacts-dir
+GH_JOB_NAME="pandas-tests-diff / build"
+rapids-logger "Github job name: ${GH_JOB_NAME}"
+
 MAIN_ARTIFACT=$(rapids-s3-path)cuda12_$(arch)_py310.main-results.json
-PR_ARTIFACT=$(rapids-s3-path)cuda12_$(arch)_py310.pr-results.json
-aws s3 cp $MAIN_ARTIFACT main-results.json
+PR_ARTIFACT=$(rapids-s3-path)cuda12_$(arch)_py39.pr-results.json
+
+rapids-logger "Fetching latest available results from nightly"
+aws s3api list-objects-v2 --bucket rapids-downloads --prefix "nightly/" --query "sort_by(Contents[?ends_with(Key, '.main-results.json')], &LastModified)[::-1].[Key]" --output text > s3_output.txt
+cat s3_output.txt
+read -r COMPARE_ENV < s3_output.txt
+export COMPARE_ENV
+rapids-logger "Latest available results from nightly: ${COMPARE_ENV}"
+
+aws s3 cp "s3://rapids-downloads/${COMPARE_ENV}" main-results.json
 aws s3 cp $PR_ARTIFACT pr-results.json
 
 # Compute the diff and prepare job summary:
 python -m pip install pandas tabulate
 python ci/cudf_pandas_scripts/pandas-tests/job-summary.py main-results.json pr-results.json | tee summary.txt >> "$GITHUB_STEP_SUMMARY"
 
-COMMENT=$(head -1 summary.txt)
-
+COMMENT=$(head -1 summary.txt | grep -oP '\d+/\d+ \(\d+\.\d+%\).*?(a decrease by|an increase by) \d+\.\d+%')
 echo "$COMMENT"
-
-# Magic name that the custom-job.yaml workflow reads and re-exports
-echo "job_output=${COMMENT}" >> "${GITHUB_OUTPUT}"
+jq --arg COMMENT "$COMMENT" --arg GH_JOB_NAME "$GH_JOB_NAME" -n \
+  '{"context": "Pandas tests",
+    "description": $COMMENT,
+    "state":"success",
+    "job_name": $GH_JOB_NAME}' \
+    > gh-status.json
diff --git a/ci/cudf_pandas_scripts/pandas-tests/job-summary.py b/ci/cudf_pandas_scripts/pandas-tests/job-summary.py
index 1e83e51ab04..93a815838b7 100644
--- a/ci/cudf_pandas_scripts/pandas-tests/job-summary.py
+++ b/ci/cudf_pandas_scripts/pandas-tests/job-summary.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES.
 # All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
@@ -40,7 +40,7 @@ def get_total_and_passed(results):
     "Merging this PR would result in "
     f"{pr_passed}/{pr_total} ({passing_percentage:.2f}%) "
     "Pandas tests passing, "
-    f"{rate_change_type} in the test pass rate by "
+    f"{rate_change_type} by "
     f"{pass_rate_change:.2f}%. "
     f"Trunk stats: {main_passed}/{main_total}."
 )

From bd249cce41a2475edb8c60525f665695854ae38e Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 8 Apr 2024 04:17:08 -1000
Subject: [PATCH 022/842] Remove prior test skipping in run-pandas-tests with
 testing 2.2.1 (#15440)

Now that pandas 2.2.1 is used when running the pandas test suite with `cudf.pandas`, some of the previously skipped tests can now be enabled now that deterministic data is used in the test suite and some tests were refactored.

Also cleaned up some redundant/old configs in this file

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15440
---
 .../cudf/pandas/scripts/run-pandas-tests.sh   | 101 +-----------------
 1 file changed, 3 insertions(+), 98 deletions(-)

diff --git a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
index 06df7b36f7d..eeb9f2b6368 100755
--- a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
+++ b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
@@ -60,8 +60,6 @@ if [ ! -d "pandas-tests" ]; then
 [tool.pytest.ini_options]
 xfail_strict = true
 filterwarnings = [
-  "error:Sparse:FutureWarning",
-  "error:The SparseArray:FutureWarning",
   # Will be fixed in numba 0.56: https://github.com/numba/numba/issues/7758
   "ignore:`np.MachAr` is deprecated:DeprecationWarning:numba",
 ]
@@ -72,7 +70,7 @@ markers = [
   "db: tests requiring a database (mysql or postgres)",
   "clipboard: mark a pd.read_clipboard test",
   "arm_slow: mark a test as slow for arm64 architecture",
-  "arraymanager: mark a test to run with ArrayManager enabled",
+  "skip_ubsan: Tests known to fail UBSAN check",
 ]
 EOF
     # append the contents of patch-confest.py to conftest.py
@@ -100,104 +98,11 @@ cat ../python/cudf/cudf/pandas/scripts/conftest-patch.py >> pandas-tests/conftes
 # Run the tests
 cd pandas-tests/
 
-# TODO: Get a postgres & mysql container set up on the CI
-# test_overwrite_warns unsafely patchs over Series.mean affecting other tests when run in parallel
-# test_complex_series_frame_alignment randomly selects a DataFrames and axis to test but particular random selection(s) always fails
-# test_numpy_ufuncs_basic compares floating point values to unbounded precision, sometimes leading to failures
-TEST_NUMPY_UFUNCS_BASIC_FLAKY="not test_numpy_ufuncs_basic[float-exp] \
-and not test_numpy_ufuncs_basic[float-exp2] \
-and not test_numpy_ufuncs_basic[float-expm1] \
-and not test_numpy_ufuncs_basic[float-log] \
-and not test_numpy_ufuncs_basic[float-log2] \
-and not test_numpy_ufuncs_basic[float-log10] \
-and not test_numpy_ufuncs_basic[float-log1p] \
-and not test_numpy_ufuncs_basic[float-sqrt] \
-and not test_numpy_ufuncs_basic[float-sin] \
-and not test_numpy_ufuncs_basic[float-cos] \
-and not test_numpy_ufuncs_basic[float-tan] \
-and not test_numpy_ufuncs_basic[float-arcsin] \
-and not test_numpy_ufuncs_basic[float-arccos] \
-and not test_numpy_ufuncs_basic[float-arctan] \
-and not test_numpy_ufuncs_basic[float-sinh] \
-and not test_numpy_ufuncs_basic[float-cosh] \
-and not test_numpy_ufuncs_basic[float-tanh] \
-and not test_numpy_ufuncs_basic[float-arcsinh] \
-and not test_numpy_ufuncs_basic[float-arccosh] \
-and not test_numpy_ufuncs_basic[float-arctanh] \
-and not test_numpy_ufuncs_basic[float-deg2rad] \
-and not test_numpy_ufuncs_basic[float-rad2deg] \
-and not test_numpy_ufuncs_basic[num_float64-exp] \
-and not test_numpy_ufuncs_basic[num_float64-exp2] \
-and not test_numpy_ufuncs_basic[num_float64-expm1] \
-and not test_numpy_ufuncs_basic[num_float64-log] \
-and not test_numpy_ufuncs_basic[num_float64-log2] \
-and not test_numpy_ufuncs_basic[num_float64-log10] \
-and not test_numpy_ufuncs_basic[num_float64-log1p] \
-and not test_numpy_ufuncs_basic[num_float64-sqrt] \
-and not test_numpy_ufuncs_basic[num_float64-sin] \
-and not test_numpy_ufuncs_basic[num_float64-cos] \
-and not test_numpy_ufuncs_basic[num_float64-tan] \
-and not test_numpy_ufuncs_basic[num_float64-arcsin] \
-and not test_numpy_ufuncs_basic[num_float64-arccos] \
-and not test_numpy_ufuncs_basic[num_float64-arctan] \
-and not test_numpy_ufuncs_basic[num_float64-sinh] \
-and not test_numpy_ufuncs_basic[num_float64-cosh] \
-and not test_numpy_ufuncs_basic[num_float64-tanh] \
-and not test_numpy_ufuncs_basic[num_float64-arcsinh] \
-and not test_numpy_ufuncs_basic[num_float64-arccosh] \
-and not test_numpy_ufuncs_basic[num_float64-arctanh] \
-and not test_numpy_ufuncs_basic[num_float64-deg2rad] \
-and not test_numpy_ufuncs_basic[num_float64-rad2deg] \
-and not test_numpy_ufuncs_basic[num_float32-exp] \
-and not test_numpy_ufuncs_basic[num_float32-exp2] \
-and not test_numpy_ufuncs_basic[num_float32-expm1] \
-and not test_numpy_ufuncs_basic[num_float32-log] \
-and not test_numpy_ufuncs_basic[num_float32-log2] \
-and not test_numpy_ufuncs_basic[num_float32-log10] \
-and not test_numpy_ufuncs_basic[num_float32-log1p] \
-and not test_numpy_ufuncs_basic[num_float32-sqrt] \
-and not test_numpy_ufuncs_basic[num_float32-sin] \
-and not test_numpy_ufuncs_basic[num_float32-cos] \
-and not test_numpy_ufuncs_basic[num_float32-tan] \
-and not test_numpy_ufuncs_basic[num_float32-arcsin] \
-and not test_numpy_ufuncs_basic[num_float32-arccos] \
-and not test_numpy_ufuncs_basic[num_float32-arctan] \
-and not test_numpy_ufuncs_basic[num_float32-sinh] \
-and not test_numpy_ufuncs_basic[num_float32-cosh] \
-and not test_numpy_ufuncs_basic[num_float32-tanh] \
-and not test_numpy_ufuncs_basic[num_float32-arcsinh] \
-and not test_numpy_ufuncs_basic[num_float32-arccosh] \
-and not test_numpy_ufuncs_basic[num_float32-arctanh] \
-and not test_numpy_ufuncs_basic[num_float32-deg2rad] \
-and not test_numpy_ufuncs_basic[num_float32-rad2deg] \
-and not test_numpy_ufuncs_basic[nullable_float-exp] \
-and not test_numpy_ufuncs_basic[nullable_float-exp2] \
-and not test_numpy_ufuncs_basic[nullable_float-expm1] \
-and not test_numpy_ufuncs_basic[nullable_float-log] \
-and not test_numpy_ufuncs_basic[nullable_float-log2] \
-and not test_numpy_ufuncs_basic[nullable_float-log10] \
-and not test_numpy_ufuncs_basic[nullable_float-log1p] \
-and not test_numpy_ufuncs_basic[nullable_float-sqrt] \
-and not test_numpy_ufuncs_basic[nullable_float-sin] \
-and not test_numpy_ufuncs_basic[nullable_float-cos] \
-and not test_numpy_ufuncs_basic[nullable_float-tan] \
-and not test_numpy_ufuncs_basic[nullable_float-arcsin] \
-and not test_numpy_ufuncs_basic[nullable_float-arccos] \
-and not test_numpy_ufuncs_basic[nullable_float-arctan] \
-and not test_numpy_ufuncs_basic[nullable_float-sinh] \
-and not test_numpy_ufuncs_basic[nullable_float-cosh] \
-and not test_numpy_ufuncs_basic[nullable_float-tanh] \
-and not test_numpy_ufuncs_basic[nullable_float-arcsinh] \
-and not test_numpy_ufuncs_basic[nullable_float-arccosh] \
-and not test_numpy_ufuncs_basic[nullable_float-arctanh] \
-and not test_numpy_ufuncs_basic[nullable_float-deg2rad] \
-and not test_numpy_ufuncs_basic[nullable_float-rad2deg]"
-
+# TODO: Remove "not db" once a postgres & mysql container is set up on the CI
 PANDAS_CI="1" timeout 30m python -m pytest -p cudf.pandas \
     -v -m "not single_cpu and not db" \
-    -k "not test_overwrite_warns and not test_complex_series_frame_alignment and not test_to_parquet_gcs_new_file and not test_qcut_nat and not test_add and not test_ismethods and $TEST_NUMPY_UFUNCS_BASIC_FLAKY" \
+    -k "not test_to_parquet_gcs_new_file and not test_qcut_nat and not test_add and not test_ismethods" \
     --import-mode=importlib \
-    -o xfail_strict=True \
     ${PYTEST_IGNORES} \
     "$@" || [ $? = 1 ]  # Exit success if exit code was 1 (permit test failures but not other errors)
 

From 3896222052a5aeff8198dca9ab02c053d62ff7c7 Mon Sep 17 00:00:00 2001
From: "Richard (Rick) Zamora" <rzamora217@gmail.com>
Date: Mon, 8 Apr 2024 10:20:09 -0500
Subject: [PATCH 023/842] Patch dask-expr `var` logic in dask-cudf (#15347)

The `var` logic in dask-expr relies on pandas -> numpy conversion that does not work for cudf -> cupy when null values are present. This PR copies over the custom `var` logic being used in dask-cudf for the legacy API.

Authors:
  - Richard (Rick) Zamora (https://github.com/rjzamora)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Ray Douglass (https://github.com/raydouglass)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15347
---
 ci/build_docs.sh                              |   3 -
 python/dask_cudf/dask_cudf/expr/_expr.py      | 106 +++++++++++++++---
 .../dask_cudf/tests/test_reductions.py        |  10 ++
 3 files changed, 103 insertions(+), 16 deletions(-)

diff --git a/ci/build_docs.sh b/ci/build_docs.sh
index db0109015b8..668d52e530b 100755
--- a/ci/build_docs.sh
+++ b/ci/build_docs.sh
@@ -41,9 +41,6 @@ mkdir -p "${RAPIDS_DOCS_DIR}/libcudf/html"
 mv html/* "${RAPIDS_DOCS_DIR}/libcudf/html"
 popd
 
-# TODO: Remove this once dask-expr works in the 10min notebook
-export DASK_DATAFRAME__QUERY_PLANNING=False
-
 rapids-logger "Build Python docs"
 pushd docs/cudf
 make dirhtml
diff --git a/python/dask_cudf/dask_cudf/expr/_expr.py b/python/dask_cudf/dask_cudf/expr/_expr.py
index 6def6e23b12..ff037b9520c 100644
--- a/python/dask_cudf/dask_cudf/expr/_expr.py
+++ b/python/dask_cudf/dask_cudf/expr/_expr.py
@@ -1,7 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
+import functools
 
 from dask_expr._cumulative import CumulativeBlockwise
-from dask_expr._reductions import Var
+from dask_expr._expr import Expr, VarColumns
+from dask_expr._reductions import Reduction, Var
+
+from dask.dataframe.core import is_dataframe_like, make_meta, meta_nonempty
 
 ##
 ## Custom expression patching
@@ -25,19 +29,95 @@ def _kwargs(self) -> dict:
 CumulativeBlockwise._kwargs = PatchCumulativeBlockwise._kwargs
 
 
-# This patch accounts for differences between
-# numpy and cupy behavior. It may make sense
-# to move this logic upstream.
-_dx_reduction_aggregate = Var.reduction_aggregate
+# The upstream Var code uses `Series.values`, and relies on numpy
+# for most of the logic. Unfortunately, cudf -> cupy conversion
+# is not supported for data containing null values. Therefore,
+# we must implement our own version of Var for now. This logic
+# is mostly copied from dask-cudf.
+
+
+class VarCudf(Reduction):
+    # Uses the parallel version of Welford's online algorithm (Chan '79)
+    # (http://i.stanford.edu/pub/cstr/reports/cs/tr/79/773/CS-TR-79-773.pdf)
+    _parameters = ["frame", "skipna", "ddof", "numeric_only", "split_every"]
+    _defaults = {
+        "skipna": True,
+        "ddof": 1,
+        "numeric_only": False,
+        "split_every": False,
+    }
+
+    @functools.cached_property
+    def _meta(self):
+        return make_meta(
+            meta_nonempty(self.frame._meta).var(
+                skipna=self.skipna, numeric_only=self.numeric_only
+            )
+        )
+
+    @property
+    def chunk_kwargs(self):
+        return dict(skipna=self.skipna, numeric_only=self.numeric_only)
+
+    @property
+    def combine_kwargs(self):
+        return {}
+
+    @property
+    def aggregate_kwargs(self):
+        return dict(ddof=self.ddof)
+
+    @classmethod
+    def reduction_chunk(cls, x, skipna=True, numeric_only=False):
+        kwargs = {"numeric_only": numeric_only} if is_dataframe_like(x) else {}
+        if skipna or numeric_only:
+            n = x.count(**kwargs)
+            kwargs["skipna"] = skipna
+            avg = x.mean(**kwargs)
+        else:
+            # Not skipping nulls, so might as well
+            # avoid the full `count` operation
+            n = len(x)
+            kwargs["skipna"] = skipna
+            avg = x.sum(**kwargs) / n
+        if numeric_only:
+            # Workaround for cudf bug
+            # (see: https://github.com/rapidsai/cudf/issues/13731)
+            x = x[n.index]
+        m2 = ((x - avg) ** 2).sum(**kwargs)
+        return n, avg, m2
+
+    @classmethod
+    def reduction_combine(cls, parts):
+        n, avg, m2 = parts[0]
+        for i in range(1, len(parts)):
+            n_a, avg_a, m2_a = n, avg, m2
+            n_b, avg_b, m2_b = parts[i]
+            n = n_a + n_b
+            avg = (n_a * avg_a + n_b * avg_b) / n
+            delta = avg_b - avg_a
+            m2 = m2_a + m2_b + delta**2 * n_a * n_b / n
+        return n, avg, m2
+
+    @classmethod
+    def reduction_aggregate(cls, vals, ddof=1):
+        vals = cls.reduction_combine(vals)
+        n, _, m2 = vals
+        return m2 / (n - ddof)
 
 
-def _reduction_aggregate(*args, **kwargs):
-    result = _dx_reduction_aggregate(*args, **kwargs)
-    if result.ndim == 0:
-        # cupy will sometimes produce a 0d array, and
-        # we need to convert it to a scalar.
-        return result.item()
-    return result
+def _patched_var(
+    self, axis=0, skipna=True, ddof=1, numeric_only=False, split_every=False
+):
+    if axis == 0:
+        if hasattr(self._meta, "to_pandas"):
+            return VarCudf(self, skipna, ddof, numeric_only, split_every)
+        else:
+            return Var(self, skipna, ddof, numeric_only, split_every)
+    elif axis == 1:
+        return VarColumns(self, skipna, ddof, numeric_only)
+    else:
+        raise ValueError(f"axis={axis} not supported. Please specify 0 or 1")
 
 
-Var.reduction_aggregate = staticmethod(_reduction_aggregate)
+Expr.var = _patched_var
diff --git a/python/dask_cudf/dask_cudf/tests/test_reductions.py b/python/dask_cudf/dask_cudf/tests/test_reductions.py
index c3056f2607c..88b15718382 100644
--- a/python/dask_cudf/dask_cudf/tests/test_reductions.py
+++ b/python/dask_cudf/dask_cudf/tests/test_reductions.py
@@ -84,3 +84,13 @@ def test_rowwise_reductions(data, op):
             check_exact=False,
             check_dtype=op not in ("var", "std"),
         )
+
+
+@pytest.mark.parametrize("skipna", [True, False])
+def test_var_nulls(skipna):
+    # Copied from 10min example notebook
+    # See: https://github.com/rapidsai/cudf/pull/15347
+    s = cudf.Series([1, 2, 3, None, 4])
+    ds = dask_cudf.from_cudf(s, npartitions=2)
+    dd.assert_eq(s.var(skipna=skipna), ds.var(skipna=skipna))
+    dd.assert_eq(s.std(skipna=skipna), ds.std(skipna=skipna))

From 7750afc81f02089faa66289e96dcfb8ecb3623bd Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Mon, 8 Apr 2024 11:46:10 -0400
Subject: [PATCH 024/842] Remove deprecated strings offsets_begin (#15454)

Removes the deprecated `cudf::strings_column_view::offsets_begin()` and `cudf::strings_column_view::offsets_end()` member functions. These are replaced with offsetalator wrapper calls instead.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Shruti Shivakumar (https://github.com/shrshi)

URL: https://github.com/rapidsai/cudf/pull/15454
---
 .../cudf/strings/strings_column_view.hpp      | 22 -------------------
 cpp/src/strings/strings_column_view.cpp       | 10 ---------
 2 files changed, 32 deletions(-)

diff --git a/cpp/include/cudf/strings/strings_column_view.hpp b/cpp/include/cudf/strings/strings_column_view.hpp
index 1156f0a5b73..1e9e73cef4c 100644
--- a/cpp/include/cudf/strings/strings_column_view.hpp
+++ b/cpp/include/cudf/strings/strings_column_view.hpp
@@ -85,28 +85,6 @@ class strings_column_view : private column_view {
    */
   [[nodiscard]] column_view offsets() const;
 
-  /**
-   * @brief Return an iterator for the offsets child column.
-   *
-   * @deprecated Since 24.04
-   *
-   * This automatically applies the offset of the parent.
-   *
-   * @return Iterator pointing to the first offset value.
-   */
-  [[deprecated]] offset_iterator offsets_begin() const;
-
-  /**
-   * @brief Return an end iterator for the offsets child column.
-   *
-   * @deprecated Since 24.04
-   *
-   * This automatically applies the offset of the parent.
-   *
-   * @return Iterator pointing 1 past the last offset value.
-   */
-  [[deprecated]] offset_iterator offsets_end() const;
-
   /**
    * @brief Returns the number of bytes in the chars child column.
    *
diff --git a/cpp/src/strings/strings_column_view.cpp b/cpp/src/strings/strings_column_view.cpp
index 3ae97a00bbf..32671669093 100644
--- a/cpp/src/strings/strings_column_view.cpp
+++ b/cpp/src/strings/strings_column_view.cpp
@@ -35,16 +35,6 @@ column_view strings_column_view::offsets() const
   return child(offsets_column_index);
 }
 
-strings_column_view::offset_iterator strings_column_view::offsets_begin() const
-{
-  return offsets().begin<int32_t>() + offset();
-}
-
-strings_column_view::offset_iterator strings_column_view::offsets_end() const
-{
-  return offsets().begin<int32_t>() + offset() + size() + 1;
-}
-
 int64_t strings_column_view::chars_size(rmm::cuda_stream_view stream) const noexcept
 {
   if (size() == 0) { return 0L; }

From 44e0640bed93a5915346e38ff5380e2eef9a1e27 Mon Sep 17 00:00:00 2001
From: "Richard (Rick) Zamora" <rzamora217@gmail.com>
Date: Mon, 8 Apr 2024 10:58:39 -0500
Subject: [PATCH 025/842] Avoid "p2p" shuffle as a default when `dask_cudf` is
 imported (#15469)

I was looking through some dask-related test failures in https://github.com/rapidsai/cuml/pull/5819 and noticed that the "p2p" shuffle is causing some problems when query-planning is enabled. This PR sets the global default to "tasks". It *may* make sense to roll back this change once we fix the underlying problem(s), but I doubt it.

Authors:
  - Richard (Rick) Zamora (https://github.com/rjzamora)

Approvers:
  - Peter Andreas Entschev (https://github.com/pentschev)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15469
---
 python/dask_cudf/dask_cudf/expr/__init__.py   |  3 +++
 .../dask_cudf/tests/test_distributed.py       | 25 ++++++++++++++++---
 2 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/python/dask_cudf/dask_cudf/expr/__init__.py b/python/dask_cudf/dask_cudf/expr/__init__.py
index 826f514a674..a76b655ef42 100644
--- a/python/dask_cudf/dask_cudf/expr/__init__.py
+++ b/python/dask_cudf/dask_cudf/expr/__init__.py
@@ -8,6 +8,9 @@
 
 # Register custom expressions and collections
 if QUERY_PLANNING_ON:
+    # Broadly avoid "p2p" and "disk" defaults for now
+    config.set({"dataframe.shuffle.method": "tasks"})
+
     try:
         import dask_cudf.expr._collection
         import dask_cudf.expr._expr
diff --git a/python/dask_cudf/dask_cudf/tests/test_distributed.py b/python/dask_cudf/dask_cudf/tests/test_distributed.py
index 39eadb45c91..07fdb25dff9 100644
--- a/python/dask_cudf/dask_cudf/tests/test_distributed.py
+++ b/python/dask_cudf/dask_cudf/tests/test_distributed.py
@@ -16,9 +16,9 @@
 dask_cuda = pytest.importorskip("dask_cuda")
 
 
-def more_than_two_gpus():
+def at_least_n_gpus(n):
     ngpus = len(numba.cuda.gpus)
-    return ngpus >= 2
+    return ngpus >= n
 
 
 @pytest.mark.parametrize("delayed", [True, False])
@@ -54,7 +54,7 @@ def test_merge():
 
 
 @pytest.mark.skipif(
-    not more_than_two_gpus(), reason="Machine does not have more than two GPUs"
+    not at_least_n_gpus(2), reason="Machine does not have two GPUs"
 )
 def test_ucx_seriesgroupby():
     pytest.importorskip("ucp")
@@ -97,3 +97,22 @@ def test_p2p_shuffle():
                 ddf.compute().sort_values("x"),
                 check_index=False,
             )
+
+
+@pytest.mark.skipif(
+    not at_least_n_gpus(3),
+    reason="Machine does not have three GPUs",
+)
+def test_unique():
+    # Using `"p2p"` can produce dispatching problems
+    # TODO: Test "p2p" after dask > 2024.4.1 is required
+    # See: https://github.com/dask/dask/pull/11040
+    with dask_cuda.LocalCUDACluster(n_workers=3) as cluster:
+        with Client(cluster):
+            df = cudf.DataFrame({"x": ["a", "b", "c", "a", "a"]})
+            ddf = dask_cudf.from_cudf(df, npartitions=2)
+            dd.assert_eq(
+                df.x.unique(),
+                ddf.x.unique().compute(),
+                check_index=False,
+            )

From 6b3fd6a77e329f4e1db12ac2c0c9d1ad653cee98 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 8 Apr 2024 06:27:04 -1000
Subject: [PATCH 026/842] Enable tests/interchange/test_impl.py in cudf.pandas
 tests (#15443)

closes #15423

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15443
---
 python/cudf/cudf/pandas/scripts/run-pandas-tests.sh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
index eeb9f2b6368..1ba2ac39ab2 100755
--- a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
+++ b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
@@ -23,7 +23,6 @@ set -euo pipefail
 PANDAS_VERSION=$(python -c "import pandas; print(pandas.__version__)")
 
 PYTEST_IGNORES="--ignore=tests/io/test_user_agent.py \
---ignore=tests/interchange/test_impl.py \
 --ignore=tests/window/test_dtypes.py \
 --ignore=tests/strings/test_api.py \
 --ignore=tests/window/test_numba.py \

From e6cfd4503af063d3bba28954ab7ec67dbbb44e71 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Mon, 8 Apr 2024 11:35:01 -0500
Subject: [PATCH 027/842] Fix an issue with creating a series from scalar when
 `dtype='category'` (#15476)

## Description
When `dtype='category'` we seem to error:
```

File "/pyenv/versions/3.11.9/lib/python3.11/site-packages/cuml/preprocessing/LabelEncoder.py", line 218, in transform
2024-04-05T19:37:35.8255262Z E                 y = cudf.Series('a', dtype="category")
2024-04-05T19:37:35.8257445Z E                 ^^^^^^^^^^^^^^^^^
2024-04-05T19:37:35.8260865Z E               File "/pyenv/versions/3.11.9/lib/python3.11/site-packages/nvtx/nvtx.py", line 116, in inner
2024-04-05T19:37:35.8264174Z E                 result = func(*args, **kwargs)
2024-04-05T19:37:35.8266324Z E                 ^^^^^^^^^^^^^^^^^
2024-04-05T19:37:35.8270003Z E               File "/pyenv/versions/3.11.9/lib/python3.11/site-packages/cudf/core/series.py", line 648, in __init__
2024-04-05T19:37:35.8273382Z E                 column = as_column(
2024-04-05T19:37:35.8275420Z E                 ^^^^^^^^^^^^^^^^^
2024-04-05T19:37:35.8279989Z E               File "/pyenv/versions/3.11.9/lib/python3.11/site-packages/cudf/core/column/column.py", line 2022, in as_column
2024-04-05T19:37:35.8281584Z E                 arbitrary = cudf.Scalar(arbitrary, dtype=dtype)
2024-04-05T19:37:35.8282461Z E                 ^^^^^^^^^^^^^^^^^
2024-04-05T19:37:35.8283768Z E               File "/pyenv/versions/3.11.9/lib/python3.11/site-packages/cudf/core/scalar.py", line 57, in __call__
2024-04-05T19:37:35.8285137Z E                 obj = super().__call__(value, dtype=dtype)
2024-04-05T19:37:35.8285959Z E                 ^^^^^^^^^^^^^^^^^
2024-04-05T19:37:35.8287757Z E               File "/pyenv/versions/3.11.9/lib/python3.11/site-packages/cudf/core/scalar.py", line 128, in __init__
2024-04-05T19:37:35.8289232Z E                 self._host_value, self._host_dtype = self._preprocess_host_value(
2024-04-05T19:37:35.8290183Z E                 ^^^^^^^^^^^^^^^^^
2024-04-05T19:37:35.8291705Z E               File "/pyenv/versions/3.11.9/lib/python3.11/site-packages/cudf/core/scalar.py", line 222, in _preprocess_host_value
2024-04-05T19:37:35.8293212Z E                 value = to_cudf_compatible_scalar(value, dtype=dtype)
2024-04-05T19:37:35.8294438Z E                 ^^^^^^^^^^^^^^^^^
2024-04-05T19:37:35.8296026Z E               File "/pyenv/versions/3.11.9/lib/python3.11/site-packages/cudf/utils/dtypes.py", line 257, in to_cudf_compatible_scalar
2024-04-05T19:37:35.8297604Z E                 if isinstance(val, str) and np.dtype(dtype).kind == "M":
2024-04-05T19:37:35.8298543Z E                 ^^^^^^^^^^^^^^^^^
2024-04-05T19:37:35.8308752Z E             TypeError: data type 'category' not understood
```
## Checklist
- [x] I am familiar with the [Contributing
Guidelines](https://github.com/rapidsai/cudf/blob/HEAD/CONTRIBUTING.md).
- [x] New or existing tests cover these changes.
- [x] The documentation is up to date with these changes.
---
 python/cudf/cudf/core/column/column.py     | 2 +-
 python/cudf/cudf/tests/test_categorical.py | 8 ++++++++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index f13d8cf12f7..6103bbfc971 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -2009,7 +2009,7 @@ def as_column(
             length = 1
         elif length < 0:
             raise ValueError(f"{length=} must be >=0.")
-        if isinstance(arbitrary, pd.Interval):
+        if isinstance(arbitrary, pd.Interval) or _is_categorical_dtype(dtype):
             # No cudf.Scalar support yet
             return as_column(
                 pd.Series([arbitrary] * length),
diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py
index ad32ebce01b..cc3e20b5bac 100644
--- a/python/cudf/cudf/tests/test_categorical.py
+++ b/python/cudf/cudf/tests/test_categorical.py
@@ -846,3 +846,11 @@ def test_empty_series_category_cast(ordered):
 
     assert_eq(expected, actual)
     assert_eq(expected.dtype.ordered, actual.dtype.ordered)
+
+
+@pytest.mark.parametrize("scalar", [1, "a", None, 10.2])
+def test_cat_from_scalar(scalar):
+    ps = pd.Series(scalar, dtype="category")
+    gs = cudf.Series(scalar, dtype="category")
+
+    assert_eq(ps, gs)

From 33771bb935a1863f9ee8e0b62cdaed995725903c Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 8 Apr 2024 11:18:29 -1000
Subject: [PATCH 028/842] Enable tests/io/test_user_agent.py in cudf pandas
 tests (#15442)

This test was renamed in `test_https_headers.py` in 2.2.0. https://github.com/pandas-dev/pandas/pull/56057

Also this test now "runs" but is skipped because it's marked as a `single_cpu` test which we skip when running these tests.

closes https://github.com/rapidsai/cudf/issues/15422

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15442
---
 python/cudf/cudf/pandas/scripts/run-pandas-tests.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
index 1ba2ac39ab2..9eb77358c41 100755
--- a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
+++ b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
@@ -22,7 +22,7 @@ set -euo pipefail
 # of Pandas installed.
 PANDAS_VERSION=$(python -c "import pandas; print(pandas.__version__)")
 
-PYTEST_IGNORES="--ignore=tests/io/test_user_agent.py \
+PYTEST_IGNORES="--ignore=tests/interchange/test_impl.py \
 --ignore=tests/window/test_dtypes.py \
 --ignore=tests/strings/test_api.py \
 --ignore=tests/window/test_numba.py \

From 2d73f11c00597294519535f2668a67e6b710de1c Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 8 Apr 2024 11:23:15 -1000
Subject: [PATCH 029/842] Enable tests/strings/test_api.py and
 tests/io/pytables in cudf.pandas tests (#15461)

closes https://github.com/rapidsai/cudf/issues/15425
closes https://github.com/rapidsai/cudf/issues/15427

The `tests/io/pytables` tests are technically skipped since they are marked `single_cpu` and we run `-m not single_cpu`

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15461
---
 python/cudf/cudf/pandas/scripts/run-pandas-tests.sh | 2 --
 1 file changed, 2 deletions(-)

diff --git a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
index 9eb77358c41..b549f87230a 100755
--- a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
+++ b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
@@ -24,10 +24,8 @@ PANDAS_VERSION=$(python -c "import pandas; print(pandas.__version__)")
 
 PYTEST_IGNORES="--ignore=tests/interchange/test_impl.py \
 --ignore=tests/window/test_dtypes.py \
---ignore=tests/strings/test_api.py \
 --ignore=tests/window/test_numba.py \
 --ignore=tests/window \
---ignore=tests/io/pytables \
 --ignore=tests/plotting \
 --ignore=tests/scalar \
 --ignore=tests/series/test_arithmetic.py \

From b037ddfb04d8b69214f0847ffe5905048c522511 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 8 Apr 2024 12:26:45 -1000
Subject: [PATCH 030/842] Ignore pandas tests for cudf.pandas that need
 motoserver (#15468)

These test `ERROR` because they expect a connection to a mock S3 server. Ignoring these test for now until that is set up

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15468
---
 .../cudf/pandas/scripts/run-pandas-tests.sh   | 43 ++++++++++++++++++-
 1 file changed, 42 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
index b549f87230a..0ccec2663cb 100755
--- a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
+++ b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
@@ -95,10 +95,51 @@ cat ../python/cudf/cudf/pandas/scripts/conftest-patch.py >> pandas-tests/conftes
 # Run the tests
 cd pandas-tests/
 
+
+# TODO: Needs motoserver/moto container running on http://localhost:5000
+TEST_THAT_NEED_MOTO_SERVER="not test_styler_to_s3 \
+and not test_with_s3_url[None] \
+and not test_with_s3_url[gzip] \
+and not test_with_s3_url[bz2] \
+and not test_with_s3_url[zip] \
+and not test_with_s3_url[xz] \
+and not test_with_s3_url[tar] \
+and not test_s3_permission_output[etree] \
+and not test_read_s3_jsonl \
+and not test_s3_parser_consistency \
+and not test_to_s3 \
+and not test_parse_public_s3a_bucket \
+and not test_parse_public_s3_bucket_nrows \
+and not test_parse_public_s3_bucket_chunked \
+and not test_parse_public_s3_bucket_chunked_python \
+and not test_parse_public_s3_bucket_python \
+and not test_infer_s3_compression \
+and not test_parse_public_s3_bucket_nrows_python \
+and not test_read_s3_fails_private \
+and not test_read_csv_handles_boto_s3_object \
+and not test_read_csv_chunked_download \
+and not test_read_s3_with_hash_in_key \
+and not test_read_feather_s3_file_path \
+and not test_parse_public_s3_bucket \
+and not test_parse_private_s3_bucket \
+and not test_parse_public_s3n_bucket \
+and not test_read_with_creds_from_pub_bucket \
+and not test_read_without_creds_from_pub_bucket \
+and not test_from_s3_csv \
+and not test_s3_protocols[s3] \
+and not test_s3_protocols[s3a] \
+and not test_s3_protocols[s3n] \
+and not test_s3_parquet \
+and not test_s3_roundtrip_explicit_fs \
+and not test_s3_roundtrip \
+and not test_s3_roundtrip_for_dir[partition_col0] \
+and not test_s3_roundtrip_for_dir[partition_col1] \
+and not test_s3_roundtrip"
+
 # TODO: Remove "not db" once a postgres & mysql container is set up on the CI
 PANDAS_CI="1" timeout 30m python -m pytest -p cudf.pandas \
     -v -m "not single_cpu and not db" \
-    -k "not test_to_parquet_gcs_new_file and not test_qcut_nat and not test_add and not test_ismethods" \
+    -k "not test_to_parquet_gcs_new_file and not test_qcut_nat and not test_add and not test_ismethods and $TEST_THAT_NEED_MOTO_SERVER" \
     --import-mode=importlib \
     ${PYTEST_IGNORES} \
     "$@" || [ $? = 1 ]  # Exit success if exit code was 1 (permit test failures but not other errors)

From 1862cdc089c3a77ccec70411e5cd6dac292a8029 Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar <shruti.shivakumar@gmail.com>
Date: Mon, 8 Apr 2024 20:50:37 -0400
Subject: [PATCH 031/842] Introduce benchmark suite for JSON reader options
 (#15124)

The goal of this piece of work is to analyze the performance of the reader for JSON lines. This PR establishes a baseline for the performance of single quote normalization, white space normalization, mixed type as string parsing and recovery mode options when the input JSON is valid, and does not have any single quotes.
Modifying the data generation to produce inputs with single quotes/mixed types/invalid lines will be the focus of follow-on PRs.
Addresses #15041

Authors:
  - Shruti Shivakumar (https://github.com/shrshi)
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - Robert Maynard (https://github.com/robertmaynard)
  - Vukasin Milovanovic (https://github.com/vuule)
  - Yunsong Wang (https://github.com/PointKernel)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/15124
---
 cpp/benchmarks/CMakeLists.txt                 |   1 +
 cpp/benchmarks/io/json/json_reader_option.cpp | 197 ++++++++++++++++++
 cpp/benchmarks/io/nvbench_helpers.hpp         |  67 +++++-
 3 files changed, 264 insertions(+), 1 deletion(-)
 create mode 100644 cpp/benchmarks/io/json/json_reader_option.cpp

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 798e4e76141..b384f6d5674 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -331,6 +331,7 @@ ConfigureNVBench(
 ConfigureBench(JSON_BENCH json/json.cu)
 ConfigureNVBench(FST_NVBENCH io/fst.cu)
 ConfigureNVBench(JSON_READER_NVBENCH io/json/nested_json.cpp io/json/json_reader_input.cpp)
+ConfigureNVBench(JSON_READER_OPTION io/json/json_reader_option.cpp)
 ConfigureNVBench(JSON_WRITER_NVBENCH io/json/json_writer.cpp)
 
 # ##################################################################################################
diff --git a/cpp/benchmarks/io/json/json_reader_option.cpp b/cpp/benchmarks/io/json/json_reader_option.cpp
new file mode 100644
index 00000000000..ed1008d053a
--- /dev/null
+++ b/cpp/benchmarks/io/json/json_reader_option.cpp
@@ -0,0 +1,197 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+#include <benchmarks/io/cuio_common.hpp>
+#include <benchmarks/io/nvbench_helpers.hpp>
+
+#include <cudf/detail/utilities/integer_utils.hpp>
+#include <cudf/io/json.hpp>
+#include <cudf/utilities/default_stream.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to
+// run on most GPUs, but large enough to allow highest throughput
+constexpr size_t data_size         = 512 << 20;
+constexpr cudf::size_type num_cols = 64;
+
+template <json_lines JsonLines>
+void BM_json_read_options(nvbench::state& state, nvbench::type_list<nvbench::enum_type<JsonLines>>)
+{
+  constexpr auto json_lines_bool = JsonLines == json_lines::YES;
+
+  cuio_source_sink_pair source_sink(io_type::HOST_BUFFER);
+  auto const data_types = get_type_or_group({static_cast<int32_t>(data_type::INTEGRAL),
+                                             static_cast<int32_t>(data_type::FLOAT),
+                                             static_cast<int32_t>(data_type::DECIMAL),
+                                             static_cast<int32_t>(data_type::STRING),
+                                             static_cast<int32_t>(data_type::LIST),
+                                             static_cast<int32_t>(data_type::STRUCT)});
+
+  auto const tbl = create_random_table(
+    cycle_dtypes(data_types, num_cols), table_size_bytes{data_size}, data_profile_builder());
+  auto const view = tbl->view();
+  cudf::io::json_writer_options const write_opts =
+    cudf::io::json_writer_options::builder(source_sink.make_sink_info(), view)
+      .lines(json_lines_bool)
+      .na_rep("null")
+      .rows_per_chunk(100'000);
+  cudf::io::write_json(write_opts);
+
+  cudf::io::json_reader_options read_options =
+    cudf::io::json_reader_options::builder(source_sink.make_source_info()).lines(json_lines_bool);
+
+  auto mem_stats_logger = cudf::memory_stats_logger();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
+  state.exec(
+    nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) {
+      try_drop_l3_cache();
+      timer.start();
+      auto const result        = cudf::io::read_json(read_options);
+      auto const num_rows_read = result.tbl->num_rows();
+      auto const num_cols_read = result.tbl->num_columns();
+      timer.stop();
+      CUDF_EXPECTS(num_rows_read == view.num_rows(), "Benchmark did not read the entire table");
+      CUDF_EXPECTS(num_cols_read == num_cols, "Unexpected number of columns");
+    });
+
+  auto const elapsed_time   = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
+  auto const data_processed = data_size * num_cols / view.num_columns();
+  state.add_element_count(static_cast<double>(data_processed) / elapsed_time, "bytes_per_second");
+  state.add_buffer_size(
+    mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
+  state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size");
+}
+
+template <row_selection RowSelection,
+          normalize_single_quotes NormalizeSingleQuotes,
+          normalize_whitespace NormalizeWhitespace,
+          mixed_types_as_string MixedTypesAsString,
+          recovery_mode RecoveryMode>
+void BM_jsonlines_read_options(nvbench::state& state,
+                               nvbench::type_list<nvbench::enum_type<RowSelection>,
+                                                  nvbench::enum_type<NormalizeSingleQuotes>,
+                                                  nvbench::enum_type<NormalizeWhitespace>,
+                                                  nvbench::enum_type<MixedTypesAsString>,
+                                                  nvbench::enum_type<RecoveryMode>>)
+{
+  constexpr auto normalize_single_quotes_bool =
+    NormalizeSingleQuotes == normalize_single_quotes::YES;
+  constexpr auto normalize_whitespace_bool  = NormalizeWhitespace == normalize_whitespace::YES;
+  constexpr auto mixed_types_as_string_bool = MixedTypesAsString == mixed_types_as_string::YES;
+  constexpr auto recovery_mode_enum         = RecoveryMode == recovery_mode::RECOVER_WITH_NULL
+                                                ? cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL
+                                                : cudf::io::json_recovery_mode_t::FAIL;
+  size_t const num_chunks                   = state.get_int64("num_chunks");
+  if (num_chunks > 1 && RowSelection == row_selection::ALL) {
+    state.skip(
+      "No point running the same benchmark multiple times for different num_chunks when all rows "
+      "are being selected anyway");
+    return;
+  }
+
+  cuio_source_sink_pair source_sink(io_type::HOST_BUFFER);
+  auto const data_types = get_type_or_group({static_cast<int32_t>(data_type::INTEGRAL),
+                                             static_cast<int32_t>(data_type::FLOAT),
+                                             static_cast<int32_t>(data_type::DECIMAL),
+                                             static_cast<int32_t>(data_type::STRING),
+                                             static_cast<int32_t>(data_type::LIST),
+                                             static_cast<int32_t>(data_type::STRUCT)});
+
+  auto const tbl = create_random_table(
+    cycle_dtypes(data_types, num_cols), table_size_bytes{data_size}, data_profile_builder());
+  auto const view = tbl->view();
+  cudf::io::json_writer_options const write_opts =
+    cudf::io::json_writer_options::builder(source_sink.make_sink_info(), view)
+      .lines(true)
+      .na_rep("null")
+      .rows_per_chunk(100'000);
+  cudf::io::write_json(write_opts);
+
+  cudf::io::json_reader_options read_options =
+    cudf::io::json_reader_options::builder(source_sink.make_source_info())
+      .lines(true)
+      .normalize_single_quotes(normalize_single_quotes_bool)
+      .normalize_whitespace(normalize_whitespace_bool)
+      .mixed_types_as_string(mixed_types_as_string_bool)
+      .recovery_mode(recovery_mode_enum);
+
+  size_t const chunk_size = cudf::util::div_rounding_up_safe(source_sink.size(), num_chunks);
+  auto mem_stats_logger   = cudf::memory_stats_logger();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
+  state.exec(
+    nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) {
+      try_drop_l3_cache();
+      cudf::size_type num_rows_read = 0;
+      cudf::size_type num_cols_read = 0;
+      timer.start();
+      switch (RowSelection) {
+        case row_selection::ALL: {
+          auto const result = cudf::io::read_json(read_options);
+          num_rows_read     = result.tbl->num_rows();
+          num_cols_read     = result.tbl->num_columns();
+          break;
+        }
+        case row_selection::BYTE_RANGE: {
+          for (uint64_t chunk = 0; chunk < num_chunks; chunk++) {
+            read_options.set_byte_range_offset(chunk * chunk_size);
+            read_options.set_byte_range_size(chunk_size);
+            auto const result = cudf::io::read_json(read_options);
+            num_rows_read += result.tbl->num_rows();
+            num_cols_read = result.tbl->num_columns();
+            if (num_cols_read)
+              CUDF_EXPECTS(num_cols_read == num_cols, "Unexpected number of columns");
+          }
+          break;
+        }
+        default: CUDF_FAIL("Unsupported row selection method");
+      }
+      timer.stop();
+      CUDF_EXPECTS(num_rows_read == view.num_rows(), "Benchmark did not read the entire table");
+    });
+
+  auto const elapsed_time   = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
+  auto const data_processed = data_size * num_cols / view.num_columns();
+  state.add_element_count(static_cast<double>(data_processed) / elapsed_time, "bytes_per_second");
+  state.add_buffer_size(
+    mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
+  state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size");
+}
+
+NVBENCH_BENCH_TYPES(
+  BM_jsonlines_read_options,
+  NVBENCH_TYPE_AXES(
+    nvbench::enum_type_list<row_selection::ALL, row_selection::BYTE_RANGE>,
+    nvbench::enum_type_list<normalize_single_quotes::NO, normalize_single_quotes::YES>,
+    nvbench::enum_type_list<normalize_whitespace::NO, normalize_whitespace::YES>,
+    nvbench::enum_type_list<mixed_types_as_string::NO, mixed_types_as_string::YES>,
+    nvbench::enum_type_list<recovery_mode::RECOVER_WITH_NULL, recovery_mode::FAIL>))
+  .set_name("jsonlines_reader")
+  .set_type_axes_names({"row_selection",
+                        "normalize_single_quotes",
+                        "normalize_whitespace",
+                        "mixed_types_as_string",
+                        "recovery_mode"})
+  .set_min_samples(6)
+  .add_int64_axis("num_chunks", nvbench::range(1, 5, 1));
+
+NVBENCH_BENCH_TYPES(BM_json_read_options,
+                    NVBENCH_TYPE_AXES(nvbench::enum_type_list<json_lines::YES, json_lines::NO>))
+  .set_name("json_reader")
+  .set_type_axes_names({"json_lines"})
+  .set_min_samples(6);
diff --git a/cpp/benchmarks/io/nvbench_helpers.hpp b/cpp/benchmarks/io/nvbench_helpers.hpp
index dd96f6fa4cd..8b79912c7ee 100644
--- a/cpp/benchmarks/io/nvbench_helpers.hpp
+++ b/cpp/benchmarks/io/nvbench_helpers.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -169,3 +169,68 @@ NVBENCH_DECLARE_ENUM_TYPE_STRINGS(
     }
   },
   [](auto) { return std::string{}; })
+
+enum class json_lines : bool { YES, NO };
+
+enum class normalize_single_quotes : bool { YES, NO };
+
+enum class normalize_whitespace : bool { YES, NO };
+
+enum class mixed_types_as_string : bool { YES, NO };
+
+enum class recovery_mode : bool { FAIL, RECOVER_WITH_NULL };
+
+NVBENCH_DECLARE_ENUM_TYPE_STRINGS(
+  json_lines,
+  [](auto value) {
+    switch (value) {
+      case json_lines::YES: return "YES";
+      case json_lines::NO: return "NO";
+      default: return "Unknown";
+    }
+  },
+  [](auto) { return std::string{}; })
+
+NVBENCH_DECLARE_ENUM_TYPE_STRINGS(
+  normalize_single_quotes,
+  [](auto value) {
+    switch (value) {
+      case normalize_single_quotes::YES: return "YES";
+      case normalize_single_quotes::NO: return "NO";
+      default: return "Unknown";
+    }
+  },
+  [](auto) { return std::string{}; })
+
+NVBENCH_DECLARE_ENUM_TYPE_STRINGS(
+  normalize_whitespace,
+  [](auto value) {
+    switch (value) {
+      case normalize_whitespace::YES: return "YES";
+      case normalize_whitespace::NO: return "NO";
+      default: return "Unknown";
+    }
+  },
+  [](auto) { return std::string{}; })
+
+NVBENCH_DECLARE_ENUM_TYPE_STRINGS(
+  mixed_types_as_string,
+  [](auto value) {
+    switch (value) {
+      case mixed_types_as_string::YES: return "YES";
+      case mixed_types_as_string::NO: return "NO";
+      default: return "Unknown";
+    }
+  },
+  [](auto) { return std::string{}; })
+
+NVBENCH_DECLARE_ENUM_TYPE_STRINGS(
+  recovery_mode,
+  [](auto value) {
+    switch (value) {
+      case recovery_mode::FAIL: return "FAIL";
+      case recovery_mode::RECOVER_WITH_NULL: return "RECOVER_WITH_NULL";
+      default: return "Unknown";
+    }
+  },
+  [](auto) { return std::string{}; })

From f1a3db28e1e5efe9f144f95a7392549ea2c221b1 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 9 Apr 2024 02:46:13 -1000
Subject: [PATCH 032/842] Enable tests/scalar and test/series in cudf.pandas
 tests (#15486)

Locally these don't seem to hang or crash workers

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15486
---
 python/cudf/cudf/pandas/scripts/run-pandas-tests.sh | 2 --
 1 file changed, 2 deletions(-)

diff --git a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
index 0ccec2663cb..f14490eee7d 100755
--- a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
+++ b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
@@ -27,8 +27,6 @@ PYTEST_IGNORES="--ignore=tests/interchange/test_impl.py \
 --ignore=tests/window/test_numba.py \
 --ignore=tests/window \
 --ignore=tests/plotting \
---ignore=tests/scalar \
---ignore=tests/series/test_arithmetic.py \
 --ignore=tests/tslibs/test_parsing.py \
 --ignore=tests/io/parser/common/test_read_errors.py"
 

From 54eff4ef3a6cb2e0e10f1064eb2071653a3c9bc8 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 9 Apr 2024 04:19:47 -1000
Subject: [PATCH 033/842] Avoid .ordered and .categories from being settable in
 CategoricalColumn and CategoricalDtype (#15475)

A rehash of https://github.com/rapidsai/cudf/pull/14979

The `CategoricalDtype.ordered` behavior matches `pandas.CategoricalDtype.ordered` behavior.

Also combines `as_ordered` and `as_unordred` into 1 method, and avoids to `as_index` casts that are already performed elsewhere

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15475
---
 python/cudf/cudf/core/column/categorical.py | 53 +++++++--------------
 python/cudf/cudf/core/dtypes.py             |  4 --
 python/cudf/cudf/core/index.py              |  6 +--
 python/cudf/cudf/tests/test_categorical.py  |  5 ++
 4 files changed, 26 insertions(+), 42 deletions(-)

diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index 88bb4521a5b..e4620ee5bc4 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -110,7 +110,7 @@ def categories(self) -> "cudf.core.index.Index":
         """
         The categories of this categorical.
         """
-        return cudf.core.index.as_index(self._column.categories)
+        return self._column.dtype.categories
 
     @property
     def codes(self) -> "cudf.Series":
@@ -165,7 +165,7 @@ def as_ordered(self) -> Optional[SeriesOrIndex]:
         dtype: category
         Categories (3, int64): [1 < 2 < 10]
         """
-        return self._return_or_inplace(self._column.as_ordered())
+        return self._return_or_inplace(self._column.as_ordered(ordered=True))
 
     def as_unordered(self) -> Optional[SeriesOrIndex]:
         """
@@ -212,8 +212,7 @@ def as_unordered(self) -> Optional[SeriesOrIndex]:
         dtype: category
         Categories (3, int64): [1, 2, 10]
         """
-
-        return self._return_or_inplace(self._column.as_unordered())
+        return self._return_or_inplace(self._column.as_ordered(ordered=False))
 
     def add_categories(self, new_categories: Any) -> Optional[SeriesOrIndex]:
         """
@@ -631,10 +630,6 @@ def codes(self) -> NumericalColumn:
     def ordered(self) -> bool:
         return self.dtype.ordered
 
-    @ordered.setter
-    def ordered(self, value: bool):
-        self.dtype.ordered = value
-
     def __setitem__(self, key, value):
         if cudf.api.types.is_scalar(
             value
@@ -1170,9 +1165,11 @@ def _get_decategorized_column(self) -> ColumnBase:
     def copy(self, deep: bool = True) -> Self:
         result_col = super().copy(deep=deep)
         if deep:
-            result_col.categories = libcudf.copying.copy_column(
-                self.dtype._categories
+            dtype_copy = CategoricalDtype(
+                categories=self.categories.copy(),
+                ordered=self.ordered,
             )
+            result_col = cast(Self, result_col._with_type_metadata(dtype_copy))
         return result_col
 
     @cached_property
@@ -1411,31 +1408,17 @@ def reorder_categories(
             )
         return self._set_categories(new_categories, ordered=ordered)
 
-    def as_ordered(self):
-        out_col = self
-        if not out_col.ordered:
-            out_col = column.build_categorical_column(
-                categories=self.categories,
-                codes=self.codes,
-                mask=self.base_mask,
-                size=self.base_size,
-                offset=self.offset,
-                ordered=True,
-            )
-        return out_col
-
-    def as_unordered(self):
-        out_col = self
-        if out_col.ordered:
-            out_col = column.build_categorical_column(
-                categories=self.categories,
-                codes=self.codes,
-                mask=self.base_mask,
-                size=self.base_size,
-                offset=self.offset,
-                ordered=False,
-            )
-        return out_col
+    def as_ordered(self, ordered: bool):
+        if self.dtype.ordered == ordered:
+            return self
+        return column.build_categorical_column(
+            categories=self.categories,
+            codes=self.codes,
+            mask=self.base_mask,
+            size=self.base_size,
+            offset=self.offset,
+            ordered=ordered,
+        )
 
 
 def _create_empty_categorical_column(
diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index 3bd342e24c2..73617763221 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -205,10 +205,6 @@ def ordered(self) -> bool:
         """
         return self._ordered
 
-    @ordered.setter
-    def ordered(self, value) -> None:
-        self._ordered = value
-
     @classmethod
     def from_pandas(cls, dtype: pd.CategoricalDtype) -> "CategoricalDtype":
         """
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index bd9dc1ae3da..0a7435bd241 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -2624,9 +2624,9 @@ def __init__(
         elif isinstance(dtype, (pd.CategoricalDtype, cudf.CategoricalDtype)):
             data = data.set_categories(dtype.categories, ordered=ordered)
         elif ordered is True and data.ordered is False:
-            data = data.as_ordered()
+            data = data.as_ordered(ordered=True)
         elif ordered is False and data.ordered is True:
-            data = data.as_unordered()
+            data = data.as_ordered(ordered=False)
         super().__init__(data, **kwargs)
 
     @property  # type: ignore
@@ -2643,7 +2643,7 @@ def categories(self):
         """
         The categories of this categorical.
         """
-        return as_index(self._values.categories)
+        return self.dtype.categories
 
     def _is_boolean(self):
         return False
diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py
index cc3e20b5bac..e21fd53bee4 100644
--- a/python/cudf/cudf/tests/test_categorical.py
+++ b/python/cudf/cudf/tests/test_categorical.py
@@ -848,6 +848,11 @@ def test_empty_series_category_cast(ordered):
     assert_eq(expected.dtype.ordered, actual.dtype.ordered)
 
 
+def test_categorical_dtype_ordered_not_settable():
+    with pytest.raises(AttributeError):
+        cudf.CategoricalDtype().ordered = False
+
+
 @pytest.mark.parametrize("scalar", [1, "a", None, 10.2])
 def test_cat_from_scalar(scalar):
     ps = pd.Series(scalar, dtype="category")

From 338cc98ff08fcfe8ab1a47a4db5373c7bce74538 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <3190405+shwina@users.noreply.github.com>
Date: Tue, 9 Apr 2024 10:52:50 -0400
Subject: [PATCH 034/842] `ModuleAccelerator` performance: cache the result of
 checking if a caller is in the denylist (#15056)

Authors:
  - Ashwin Srinath (https://github.com/shwina)
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15056
---
 python/cudf/cudf/pandas/module_accelerator.py | 24 ++++++++++++-------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/python/cudf/cudf/pandas/module_accelerator.py b/python/cudf/cudf/pandas/module_accelerator.py
index e97d6e4af24..1d431c6d882 100644
--- a/python/cudf/cudf/pandas/module_accelerator.py
+++ b/python/cudf/cudf/pandas/module_accelerator.py
@@ -17,7 +17,7 @@
 from abc import abstractmethod
 from importlib._bootstrap import _ImportLockContext as ImportLock
 from types import ModuleType
-from typing import Any, ContextManager, Dict, List, NamedTuple
+from typing import Any, ContextManager, Dict, NamedTuple, Tuple
 
 from typing_extensions import Self
 
@@ -377,7 +377,7 @@ class ModuleAccelerator(ModuleAcceleratorBase):
     attempts to call the fast version first).
     """
 
-    _denylist: List[str]
+    _denylist: Tuple[str]
     _use_fast_lib: bool
     _use_fast_lib_lock: threading.RLock
     _module_cache_prefix: str = "_slow_lib_"
@@ -407,7 +407,7 @@ def __new__(
             if mod.startswith(self.slow_lib):
                 sys.modules[self._module_cache_prefix + mod] = sys.modules[mod]
                 del sys.modules[mod]
-        self._denylist = [*slow_module.__path__, *fast_module.__path__]
+        self._denylist = (*slow_module.__path__, *fast_module.__path__)
 
         # Lock to manage temporarily disabling delivering wrapped attributes
         self._use_fast_lib_lock = threading.RLock()
@@ -551,17 +551,13 @@ def getattr_real_or_wrapped(
             # release the lock after reading this value)
             use_real = not loader._use_fast_lib
         if not use_real:
-            CUDF_PANDAS_PATH = __file__.rsplit("/", 1)[0]
             # Only need to check the denylist if we're not turned off.
             frame = sys._getframe()
             # We cannot possibly be at the top level.
             assert frame.f_back
             calling_module = pathlib.PurePath(frame.f_back.f_code.co_filename)
-            use_real = not calling_module.is_relative_to(
-                CUDF_PANDAS_PATH
-            ) and any(
-                calling_module.is_relative_to(path)
-                for path in loader._denylist
+            use_real = _caller_in_denylist(
+                calling_module, tuple(loader._denylist)
             )
         try:
             if use_real:
@@ -623,3 +619,13 @@ def disable_module_accelerator() -> contextlib.ExitStack:
                 stack.enter_context(finder.disabled())
         return stack.pop_all()
     assert False  # pacify type checker
+
+
+# because this function gets called so often and is quite
+# expensive to run, we cache the results:
+@functools.lru_cache(maxsize=1024)
+def _caller_in_denylist(calling_module, denylist):
+    CUDF_PANDAS_PATH = __file__.rsplit("/", 1)[0]
+    return not calling_module.is_relative_to(CUDF_PANDAS_PATH) and any(
+        calling_module.is_relative_to(path) for path in denylist
+    )

From 67246587438241ececa23661e6d7966bab1abdcc Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 9 Apr 2024 04:56:06 -1000
Subject: [PATCH 035/842] Enable tests/windows/ in cudf.pandas tests (#15444)

closes #15424
closes #15426

In the test suite, the `window/numba` tests are skipped since they are marked `single_cpu` and these test are run with `-m not single_cpu`

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15444
---
 python/cudf/cudf/pandas/scripts/run-pandas-tests.sh | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
index f14490eee7d..2f6c4ac5b13 100755
--- a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
+++ b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
@@ -22,11 +22,7 @@ set -euo pipefail
 # of Pandas installed.
 PANDAS_VERSION=$(python -c "import pandas; print(pandas.__version__)")
 
-PYTEST_IGNORES="--ignore=tests/interchange/test_impl.py \
---ignore=tests/window/test_dtypes.py \
---ignore=tests/window/test_numba.py \
---ignore=tests/window \
---ignore=tests/plotting \
+PYTEST_IGNORES="--ignore=tests/plotting \
 --ignore=tests/tslibs/test_parsing.py \
 --ignore=tests/io/parser/common/test_read_errors.py"
 

From a2f625ac7eb1bba5f8a21b48dd268334a53c572f Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Tue, 9 Apr 2024 16:46:05 +0100
Subject: [PATCH 036/842] Only use data_type constructor with scale for decimal
 types (#15472)

If we pass a scale parameter to cudf::data_type, the type_id must name a decimal type. This is asserted in debug mode.

Without this change, one cannot use the cython wrappers when build with CMAKE_BUILD_TYPE=Debug.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Bradley Dice (https://github.com/bdice)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15472
---
 python/cudf/cudf/_lib/pylibcudf/types.pyx | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/_lib/pylibcudf/types.pyx b/python/cudf/cudf/_lib/pylibcudf/types.pyx
index d8b92283412..baf92223714 100644
--- a/python/cudf/cudf/_lib/pylibcudf/types.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/types.pyx
@@ -28,7 +28,14 @@ cdef class DataType:
         The scale associated with the data. Only used for decimal data types.
     """
     def __cinit__(self, type_id id, int32_t scale=0):
-        self.c_obj = data_type(id, scale)
+        if (
+            id == type_id.DECIMAL32
+            or id == type_id.DECIMAL64
+            or id == type_id.DECIMAL128
+        ):
+            self.c_obj = data_type(id, scale)
+        else:
+            self.c_obj = data_type(id)
 
     # TODO: Consider making both id and scale cached properties.
     cpdef type_id id(self):

From 72b2759b3987baa8fd3b07fab2ef5c7942d057aa Mon Sep 17 00:00:00 2001
From: "Richard (Rick) Zamora" <rzamora217@gmail.com>
Date: Tue, 9 Apr 2024 10:58:21 -0500
Subject: [PATCH 037/842] Support orc and text IO with dask-expr using legacy
 conversion (#15439)

Related to orc and text support in https://github.com/rapidsai/cudf/issues/15027

Follow-up work can to enable predicate pushdown and column projection with ORC, but the goal of this PR is basic functionality (and parity with the legacy API).

Authors:
  - Richard (Rick) Zamora (https://github.com/rjzamora)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15439
---
 python/dask_cudf/dask_cudf/__init__.py              |  3 ++-
 python/dask_cudf/dask_cudf/backends.py              |  9 +++++++++
 python/dask_cudf/dask_cudf/expr/_collection.py      | 12 ++++++++++++
 python/dask_cudf/dask_cudf/io/orc.py                |  4 ++--
 python/dask_cudf/dask_cudf/io/tests/test_json.py    |  4 ++--
 python/dask_cudf/dask_cudf/io/tests/test_orc.py     |  4 ++--
 python/dask_cudf/dask_cudf/io/tests/test_parquet.py |  5 +++--
 python/dask_cudf/dask_cudf/io/tests/test_text.py    |  4 ++--
 8 files changed, 34 insertions(+), 11 deletions(-)

diff --git a/python/dask_cudf/dask_cudf/__init__.py b/python/dask_cudf/dask_cudf/__init__.py
index c66e85ed2af..04c2ad65b99 100644
--- a/python/dask_cudf/dask_cudf/__init__.py
+++ b/python/dask_cudf/dask_cudf/__init__.py
@@ -51,8 +51,9 @@ def inner_func(*args, **kwargs):
     from .expr._collection import DataFrame, Index, Series
 
     groupby_agg = raise_not_implemented_error("groupby_agg")
-    read_text = raise_not_implemented_error("read_text")
+    read_text = DataFrame.read_text
     to_orc = raise_not_implemented_error("to_orc")
+
 else:
     from .core import DataFrame, Index, Series
     from .groupby import groupby_agg
diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py
index d05be30602e..5401bcd3767 100644
--- a/python/dask_cudf/dask_cudf/backends.py
+++ b/python/dask_cudf/dask_cudf/backends.py
@@ -699,6 +699,15 @@ def read_json(*args, engine="auto", **kwargs):
             **kwargs,
         )
 
+    @staticmethod
+    def read_orc(*args, **kwargs):
+        from dask_expr import from_legacy_dataframe
+
+        from dask_cudf.io.orc import read_orc as legacy_read_orc
+
+        ddf = legacy_read_orc(*args, **kwargs)
+        return from_legacy_dataframe(ddf)
+
 
 # Import/register cudf-specific classes for dask-expr
 try:
diff --git a/python/dask_cudf/dask_cudf/expr/_collection.py b/python/dask_cudf/dask_cudf/expr/_collection.py
index 799e6eddab3..516e35a4335 100644
--- a/python/dask_cudf/dask_cudf/expr/_collection.py
+++ b/python/dask_cudf/dask_cudf/expr/_collection.py
@@ -81,6 +81,18 @@ def groupby(
             **kwargs,
         )
 
+    def to_orc(self, *args, **kwargs):
+        return self.to_legacy_dataframe().to_orc(*args, **kwargs)
+
+    @staticmethod
+    def read_text(*args, **kwargs):
+        from dask_expr import from_legacy_dataframe
+
+        from dask_cudf.io.text import read_text as legacy_read_text
+
+        ddf = legacy_read_text(*args, **kwargs)
+        return from_legacy_dataframe(ddf)
+
 
 class Series(VarMixin, DXSeries):
     def groupby(self, by, **kwargs):
diff --git a/python/dask_cudf/dask_cudf/io/orc.py b/python/dask_cudf/dask_cudf/io/orc.py
index 49fea0d7602..bed69f038b0 100644
--- a/python/dask_cudf/dask_cudf/io/orc.py
+++ b/python/dask_cudf/dask_cudf/io/orc.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from io import BufferedWriter, IOBase
 
@@ -100,7 +100,7 @@ def read_orc(path, columns=None, filters=None, storage_options=None, **kwargs):
             **kwargs,
         )
 
-    name = "read-orc-" + tokenize(fs_token, path, columns, **kwargs)
+    name = "read-orc-" + tokenize(fs_token, path, columns, filters, **kwargs)
     dsk = {}
     N = 0
     for path, n in zip(paths, nstripes_per_file):
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_json.py b/python/dask_cudf/dask_cudf/io/tests/test_json.py
index 8dcf3f05e89..a09dfbff188 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_json.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_json.py
@@ -12,8 +12,8 @@
 import dask_cudf
 from dask_cudf.tests.utils import skip_dask_expr
 
-# No dask-expr support for dask_expr<=1.0.5
-pytestmark = skip_dask_expr(lt_version="1.0.5+a")
+# No dask-expr support for dask_expr<1.0.6
+pytestmark = skip_dask_expr(lt_version="1.0.6")
 
 
 def test_read_json_backend_dispatch(tmp_path):
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_orc.py b/python/dask_cudf/dask_cudf/io/tests/test_orc.py
index 8ccb7a7bfe7..7be6c712511 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_orc.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_orc.py
@@ -14,8 +14,8 @@
 import dask_cudf
 from dask_cudf.tests.utils import skip_dask_expr
 
-# No dask-expr support
-pytestmark = skip_dask_expr()
+# No dask-expr support for dask_expr<1.0.6
+pytestmark = skip_dask_expr(lt_version="1.0.6")
 
 cur_dir = os.path.dirname(__file__)
 sample_orc = os.path.join(cur_dir, "data/orc/sample.orc")
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
index df41ef77b7c..68460653119 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
@@ -185,7 +185,6 @@ def test_dask_timeseries_from_dask(tmpdir, index, divisions):
     )
 
 
-@xfail_dask_expr("Categorical column support")
 @pytest.mark.parametrize("index", [False, None])
 @pytest.mark.parametrize("divisions", [False, True])
 def test_dask_timeseries_from_daskcudf(tmpdir, index, divisions):
@@ -193,7 +192,9 @@ def test_dask_timeseries_from_daskcudf(tmpdir, index, divisions):
     ddf2 = dask_cudf.from_cudf(
         cudf.datasets.timeseries(freq="D"), npartitions=4
     )
-    ddf2.name = ddf2.name.astype("object")
+    # Use assign in lieu of `ddf2.name = ...`
+    # See: https://github.com/dask/dask-expr/issues/1010
+    ddf2 = ddf2.assign(name=ddf2.name.astype("object"))
     ddf2.to_parquet(fn, write_index=index)
     read_df = dask_cudf.read_parquet(
         fn, index=index, calculate_divisions=divisions
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_text.py b/python/dask_cudf/dask_cudf/io/tests/test_text.py
index d3dcd386d0d..e3a9d380857 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_text.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_text.py
@@ -11,8 +11,8 @@
 import dask_cudf
 from dask_cudf.tests.utils import skip_dask_expr
 
-# No dask-expr support
-pytestmark = skip_dask_expr()
+# No dask-expr support for dask_expr<1.0.6
+pytestmark = skip_dask_expr(lt_version="1.0.6")
 
 cur_dir = os.path.dirname(__file__)
 text_file = os.path.join(cur_dir, "data/text/sample.pgn")

From 3b48f8b0290dc41073538487ad53c8923be2f0f8 Mon Sep 17 00:00:00 2001
From: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com>
Date: Tue, 9 Apr 2024 13:08:34 -0500
Subject: [PATCH 038/842] Fixed page data truncation in parquet writer under
 certain conditions. (#15474)

Fixes https://github.com/rapidsai/cudf/issues/15473

The issue is that in some cases, for example where we have all nulls, we can fail to update the size of the page output buffer, resulting in a missing byte expected by some readers.   Specifically, we poke the value of dict_bits into the output buffer here:

https://github.com/rapidsai/cudf/blob/6319ab708f2dff9fd7a62a5c77fd3b387bde1bb8/cpp/src/io/parquet/page_enc.cu#L1892

But, if we have no leaf values (for example, because everything in the page is null) `s->cur` never gets updated here, because we never enter the containing loop.

https://github.com/rapidsai/cudf/blob/6319ab708f2dff9fd7a62a5c77fd3b387bde1bb8/cpp/src/io/parquet/page_enc.cu#L1948

The fix is to just always update `s->cur` after this if-else block

https://github.com/rapidsai/cudf/blob/6319ab708f2dff9fd7a62a5c77fd3b387bde1bb8/cpp/src/io/parquet/page_enc.cu#L1891

Note that this was already handled by our reader.  But some third party readers (Trino) are expecting that data to be there and crash if it's not.

Authors:
  - https://github.com/nvdbaranec

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Muhammad Haseeb (https://github.com/mhaseeb123)

URL: https://github.com/rapidsai/cudf/pull/15474
---
 cpp/src/io/parquet/page_enc.cu | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu
index d881ab6f9b7..114e47b325b 100644
--- a/cpp/src/io/parquet/page_enc.cu
+++ b/cpp/src/io/parquet/page_enc.cu
@@ -1896,6 +1896,7 @@ CUDF_KERNEL void __launch_bounds__(block_size, 8)
       s->rle_out     = dst + RLE_LENGTH_FIELD_LEN;
       s->rle_len_pos = dst;
     }
+    s->cur             = s->rle_out;
     s->page_start_val  = row_to_value_idx(s->page.start_row, s->col);
     s->chunk_start_val = row_to_value_idx(s->ck.start_row, s->col);
   }

From 15c148dcbba087ed1be32e0cef7188c9b609e7dc Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Tue, 9 Apr 2024 17:50:26 -0700
Subject: [PATCH 039/842] Fix for logical and syntactical errors in libcudf c++
 examples (#15346)

This PR fixes a couple of fatal compile and runtime errors in `libcudf/strings` examples

Authors:
  - Muhammad Haseeb (https://github.com/mhaseeb123)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Mark Harris (https://github.com/harrism)
  - David Wendt (https://github.com/davidwendt)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/15346
---
 cpp/examples/build.sh                    | 4 +++-
 cpp/examples/strings/common.hpp          | 4 +++-
 cpp/examples/strings/custom_optimized.cu | 8 ++++++--
 3 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/cpp/examples/build.sh b/cpp/examples/build.sh
index 001cdeec694..424da35ad18 100755
--- a/cpp/examples/build.sh
+++ b/cpp/examples/build.sh
@@ -1,9 +1,11 @@
 #!/bin/bash
 
-# Copyright (c) 2021-2023, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
 # libcudf examples build script
 
+set -euo pipefail
+
 # Parallelism control
 PARALLEL_LEVEL=${PARALLEL_LEVEL:-4}
 
diff --git a/cpp/examples/strings/common.hpp b/cpp/examples/strings/common.hpp
index 0dbe6fe2b7b..65a9c100c7c 100644
--- a/cpp/examples/strings/common.hpp
+++ b/cpp/examples/strings/common.hpp
@@ -19,6 +19,7 @@
 #include <cudf/column/column_view.hpp>
 #include <cudf/io/csv.hpp>
 #include <cudf/io/datasource.hpp>
+#include <cudf/strings/strings_column_view.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 
@@ -110,7 +111,8 @@ int main(int argc, char const** argv)
 
   std::chrono::duration<double> elapsed = std::chrono::steady_clock::now() - st;
   std::cout << "Wall time: " << elapsed.count() << " seconds\n";
-  std::cout << "Output size " << result->view().child(1).size() << " bytes\n";
+  auto const scv = cudf::strings_column_view(result->view());
+  std::cout << "Output size " << scv.chars_size(rmm::cuda_stream_default) << " bytes\n";
 
   return 0;
 }
diff --git a/cpp/examples/strings/custom_optimized.cu b/cpp/examples/strings/custom_optimized.cu
index cefa3346150..62ca19a5ca9 100644
--- a/cpp/examples/strings/custom_optimized.cu
+++ b/cpp/examples/strings/custom_optimized.cu
@@ -153,8 +153,12 @@ std::unique_ptr<cudf::column> redact_strings(cudf::column_view const& names,
   redact_kernel<<<blocks, block_size, 0, stream.value()>>>(
     *d_names, *d_visibilities, offsets.data(), chars.data());
 
-  // create column from offsets and chars vectors (no copy is performed)
-  auto result = cudf::make_strings_column(names.size(), std::move(offsets), chars.release(), {}, 0);
+  // create column from offsets vector (move only)
+  auto offsets_column = std::make_unique<cudf::column>(std::move(offsets), rmm::device_buffer{}, 0);
+
+  // create column for chars vector (no copy is performed)
+  auto result = cudf::make_strings_column(
+    names.size(), std::move(offsets_column), chars.release(), 0, rmm::device_buffer{});
 
   // wait for all of the above to finish
   stream.synchronize();

From b06536d3c061d62286c6844ed8d6a69cf906dc3d Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Wed, 10 Apr 2024 08:56:47 -0500
Subject: [PATCH 040/842] Make improvements in pandas-test reporting (#15485)

This PR fixes an issue where `listJobsForWorkflowRun` returns only 30 jobs details by default and we need to paginate and load the rest all of the job details to be able to filter jobs.

This PR also address review comments in https://github.com/rapidsai/cudf/pull/15369/

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Ray Douglass (https://github.com/raydouglass)

URL: https://github.com/rapidsai/cudf/pull/15485
---
 .github/workflows/status.yaml               | 13 +++++++++----
 .github/workflows/test.yaml                 |  2 +-
 ci/cudf_pandas_scripts/pandas-tests/diff.sh |  9 +++++----
 3 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/status.yaml b/.github/workflows/status.yaml
index 0aad4c8a23e..781264bc55e 100644
--- a/.github/workflows/status.yaml
+++ b/.github/workflows/status.yaml
@@ -85,13 +85,18 @@ jobs:
                 state: CUSTOM_STATE = 'success'
             } = contentJSON;
 
-            // Fetch the first job ID from the workflow run
-            const jobs = await github.rest.actions.listJobsForWorkflowRun({
+            // Fetch all jobs using pagination
+            const jobs = await github.paginate(
+              github.rest.actions.listJobsForWorkflowRun,
+              {
                 owner: context.repo.owner,
                 repo: context.repo.repo,
                 run_id: process.env.WORKFLOW_RUN_ID,
-            });
-            const job = jobs.data.jobs.find(job => job.name === JOB_NAME);
+              }
+            );
+
+            // Fetch the first job ID from the workflow run
+            const job = jobs.find(job => job.name === JOB_NAME);
             const JOB_ID = job ? job.id : null;
 
             // Set default target URL if not defined
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index ea47b6ad466..65aef37697e 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -130,7 +130,7 @@ jobs:
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06
     with:
-      matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
+      matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(min_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
       build_type: nightly
       branch: ${{ inputs.branch }}
       date: ${{ inputs.date }}
diff --git a/ci/cudf_pandas_scripts/pandas-tests/diff.sh b/ci/cudf_pandas_scripts/pandas-tests/diff.sh
index ae5a249bcbd..cf80f383db4 100755
--- a/ci/cudf_pandas_scripts/pandas-tests/diff.sh
+++ b/ci/cudf_pandas_scripts/pandas-tests/diff.sh
@@ -10,12 +10,13 @@
 GH_JOB_NAME="pandas-tests-diff / build"
 rapids-logger "Github job name: ${GH_JOB_NAME}"
 
-MAIN_ARTIFACT=$(rapids-s3-path)cuda12_$(arch)_py310.main-results.json
-PR_ARTIFACT=$(rapids-s3-path)cuda12_$(arch)_py39.pr-results.json
+PY_VER="39"
+MAIN_ARTIFACT=$(rapids-s3-path)cuda12_$(arch)_py${PY_VER}.main-results.json
+PR_ARTIFACT=$(rapids-s3-path)cuda12_$(arch)_py${PY_VER}.pr-results.json
 
 rapids-logger "Fetching latest available results from nightly"
-aws s3api list-objects-v2 --bucket rapids-downloads --prefix "nightly/" --query "sort_by(Contents[?ends_with(Key, '.main-results.json')], &LastModified)[::-1].[Key]" --output text > s3_output.txt
-cat s3_output.txt
+aws s3api list-objects-v2 --bucket rapids-downloads --prefix "nightly/" --query "sort_by(Contents[?ends_with(Key, '_py${PY_VER}.main-results.json')], &LastModified)[::-1].[Key]" --output text > s3_output.txt
+
 read -r COMPARE_ENV < s3_output.txt
 export COMPARE_ENV
 rapids-logger "Latest available results from nightly: ${COMPARE_ENV}"

From 94726ad056e2473c836f47d310e2584bdf44d1f9 Mon Sep 17 00:00:00 2001
From: Ray Douglass <ray@raydouglass.com>
Date: Wed, 10 Apr 2024 10:12:23 -0400
Subject: [PATCH 041/842] Update Changelog [skip ci]

---
 CHANGELOG.md | 297 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 297 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index bce764f59e3..7ecad2c9c39 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,300 @@
+# cuDF 24.04.00 (10 Apr 2024)
+
+## 🚨 Breaking Changes
+
+- Restructure pylibcudf/arrow interop facilities ([#15325](https://github.com/rapidsai/cudf/pull/15325)) [@vyasr](https://github.com/vyasr)
+- Change exceptions thrown by copying APIs ([#15319](https://github.com/rapidsai/cudf/pull/15319)) [@vyasr](https://github.com/vyasr)
+- Change strings_column_view::char_size to return int64 ([#15197](https://github.com/rapidsai/cudf/pull/15197)) [@davidwendt](https://github.com/davidwendt)
+- Upgrade to `arrow-14.0.2` ([#15108](https://github.com/rapidsai/cudf/pull/15108)) [@galipremsagar](https://github.com/galipremsagar)
+- Add support for `pandas-2.2` in `cudf` ([#15100](https://github.com/rapidsai/cudf/pull/15100)) [@galipremsagar](https://github.com/galipremsagar)
+- Deprecate cudf::hashing::spark_murmurhash3_x86_32 ([#15074](https://github.com/rapidsai/cudf/pull/15074)) [@davidwendt](https://github.com/davidwendt)
+- Align MultiIndex.get_indexder with pandas 2.2 change ([#15059](https://github.com/rapidsai/cudf/pull/15059)) [@mroeschke](https://github.com/mroeschke)
+- Raise an error on import for unsupported GPUs. ([#15053](https://github.com/rapidsai/cudf/pull/15053)) [@bdice](https://github.com/bdice)
+- Deprecate datelike isin casting strings to dates to match pandas 2.2 ([#15046](https://github.com/rapidsai/cudf/pull/15046)) [@mroeschke](https://github.com/mroeschke)
+- Align concat Series name behavior in pandas 2.2 ([#15032](https://github.com/rapidsai/cudf/pull/15032)) [@mroeschke](https://github.com/mroeschke)
+- Add `future_stack` to `DataFrame.stack` ([#15015](https://github.com/rapidsai/cudf/pull/15015)) [@galipremsagar](https://github.com/galipremsagar)
+- Deprecate groupby fillna ([#15000](https://github.com/rapidsai/cudf/pull/15000)) [@mroeschke](https://github.com/mroeschke)
+- Deprecate replace with categorical columns ([#14988](https://github.com/rapidsai/cudf/pull/14988)) [@mroeschke](https://github.com/mroeschke)
+- Deprecate delim_whitespace in read_csv for pandas 2.2 ([#14986](https://github.com/rapidsai/cudf/pull/14986)) [@mroeschke](https://github.com/mroeschke)
+- Deprecate parameters similar to pandas 2.2 ([#14984](https://github.com/rapidsai/cudf/pull/14984)) [@mroeschke](https://github.com/mroeschke)
+- Add missing atomic operators, refactor atomic operators, move atomic operators to detail namespace. ([#14962](https://github.com/rapidsai/cudf/pull/14962)) [@bdice](https://github.com/bdice)
+- Add `pandas-2.x` support in `cudf` ([#14916](https://github.com/rapidsai/cudf/pull/14916)) [@galipremsagar](https://github.com/galipremsagar)
+- Use cuco::static_set in the hash-based groupby ([#14813](https://github.com/rapidsai/cudf/pull/14813)) [@PointKernel](https://github.com/PointKernel)
+
+## 🐛 Bug Fixes
+
+- Fix an issue with creating a series from scalar when `dtype=&#39;category&#39;` ([#15476](https://github.com/rapidsai/cudf/pull/15476)) [@galipremsagar](https://github.com/galipremsagar)
+- Update pre-commit-hooks to v0.0.3 ([#15355](https://github.com/rapidsai/cudf/pull/15355)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- [BUG][JNI] Trigger MemoryBuffer.onClosed after memory is freed ([#15351](https://github.com/rapidsai/cudf/pull/15351)) [@abellina](https://github.com/abellina)
+- Fix an issue with multiple short list rowgroups using the Parquet chunked reader. ([#15342](https://github.com/rapidsai/cudf/pull/15342)) [@nvdbaranec](https://github.com/nvdbaranec)
+- Avoid importing dask-expr if &quot;query-planning&quot; config is `False` ([#15340](https://github.com/rapidsai/cudf/pull/15340)) [@rjzamora](https://github.com/rjzamora)
+- Fix gtests/ERROR_TEST errors when run in Debug ([#15317](https://github.com/rapidsai/cudf/pull/15317)) [@davidwendt](https://github.com/davidwendt)
+- Fix OOB read in `inflate_kernel` ([#15309](https://github.com/rapidsai/cudf/pull/15309)) [@vuule](https://github.com/vuule)
+- Work around a cuFile error when running CSV tests with memcheck ([#15293](https://github.com/rapidsai/cudf/pull/15293)) [@vuule](https://github.com/vuule)
+- Fix Doxygen upload directory ([#15291](https://github.com/rapidsai/cudf/pull/15291)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Fix Doxygen check ([#15289](https://github.com/rapidsai/cudf/pull/15289)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Reintroduce PANDAS_GE_220 import ([#15287](https://github.com/rapidsai/cudf/pull/15287)) [@wence-](https://github.com/wence-)
+- Fix mean computation for the geometric distribution in the data generator ([#15282](https://github.com/rapidsai/cudf/pull/15282)) [@vuule](https://github.com/vuule)
+- Fix Parquet decimal64 stats ([#15281](https://github.com/rapidsai/cudf/pull/15281)) [@etseidl](https://github.com/etseidl)
+- Make linking of nvtx3-cpp BUILD_LOCAL_INTERFACE ([#15271](https://github.com/rapidsai/cudf/pull/15271)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Workaround compute-sanitizer memcheck bug ([#15259](https://github.com/rapidsai/cudf/pull/15259)) [@davidwendt](https://github.com/davidwendt)
+- Cleanup `hostdevice_vector` and add more APIs ([#15252](https://github.com/rapidsai/cudf/pull/15252)) [@ttnghia](https://github.com/ttnghia)
+- Fix number of rows in randomly generated lists columns ([#15248](https://github.com/rapidsai/cudf/pull/15248)) [@vuule](https://github.com/vuule)
+- Fix wrong output for `collect_list`/`collect_set` of lists column ([#15243](https://github.com/rapidsai/cudf/pull/15243)) [@ttnghia](https://github.com/ttnghia)
+- Fix testchunkedPackTwoPasses to copy from the bounce buffer ([#15220](https://github.com/rapidsai/cudf/pull/15220)) [@abellina](https://github.com/abellina)
+- Fix accessing `.columns` by an external API ([#15212](https://github.com/rapidsai/cudf/pull/15212)) [@galipremsagar](https://github.com/galipremsagar)
+- [JNI] Disable testChunkedPackTwoPasses for now ([#15210](https://github.com/rapidsai/cudf/pull/15210)) [@abellina](https://github.com/abellina)
+- Update labeler and codeowner configs for CMake files ([#15208](https://github.com/rapidsai/cudf/pull/15208)) [@PointKernel](https://github.com/PointKernel)
+- Avoid dict normalization in ``__dask_tokenize__`` ([#15187](https://github.com/rapidsai/cudf/pull/15187)) [@rjzamora](https://github.com/rjzamora)
+- Fix memcheck error in distinct inner join ([#15164](https://github.com/rapidsai/cudf/pull/15164)) [@PointKernel](https://github.com/PointKernel)
+- Remove unneeded script parameters in test_cpp_memcheck.sh ([#15158](https://github.com/rapidsai/cudf/pull/15158)) [@davidwendt](https://github.com/davidwendt)
+- Fix `ListColumn.to_pandas()` to retain `list` type ([#15155](https://github.com/rapidsai/cudf/pull/15155)) [@galipremsagar](https://github.com/galipremsagar)
+- Avoid factorization in MultiIndex.to_pandas ([#15150](https://github.com/rapidsai/cudf/pull/15150)) [@mroeschke](https://github.com/mroeschke)
+- Fix GroupBy.get_group and GroupBy.indices ([#15143](https://github.com/rapidsai/cudf/pull/15143)) [@wence-](https://github.com/wence-)
+- Remove `const` from `range_window_bounds::_extent`. ([#15138](https://github.com/rapidsai/cudf/pull/15138)) [@mythrocks](https://github.com/mythrocks)
+- DataFrame.columns = ... retains RangeIndex &amp; set dtype ([#15129](https://github.com/rapidsai/cudf/pull/15129)) [@mroeschke](https://github.com/mroeschke)
+- Correctly handle output for `GroupBy.apply` when chunk results are reindexed series ([#15109](https://github.com/rapidsai/cudf/pull/15109)) [@brandon-b-miller](https://github.com/brandon-b-miller)
+- Fix Series.groupby.shift with a MultiIndex ([#15098](https://github.com/rapidsai/cudf/pull/15098)) [@mroeschke](https://github.com/mroeschke)
+- Fix reductions when DataFrame has MulitIndex columns ([#15097](https://github.com/rapidsai/cudf/pull/15097)) [@mroeschke](https://github.com/mroeschke)
+- Fix deprecation warnings for deprecated hash() calls ([#15095](https://github.com/rapidsai/cudf/pull/15095)) [@davidwendt](https://github.com/davidwendt)
+- Add support for arrow `large_string` in `cudf` ([#15093](https://github.com/rapidsai/cudf/pull/15093)) [@galipremsagar](https://github.com/galipremsagar)
+- Fix `sort_values` pytest failure with pandas-2.x regression ([#15092](https://github.com/rapidsai/cudf/pull/15092)) [@galipremsagar](https://github.com/galipremsagar)
+- Resolve path parsing issues in `get_json_object` ([#15082](https://github.com/rapidsai/cudf/pull/15082)) [@SurajAralihalli](https://github.com/SurajAralihalli)
+- Fix bugs in handling of delta encodings ([#15075](https://github.com/rapidsai/cudf/pull/15075)) [@etseidl](https://github.com/etseidl)
+- Fix `is_device_write_preferred` in `void_sink` and `user_sink_wrapper` ([#15064](https://github.com/rapidsai/cudf/pull/15064)) [@vuule](https://github.com/vuule)
+- Eliminate duplicate allocation of nested string columns ([#15061](https://github.com/rapidsai/cudf/pull/15061)) [@vuule](https://github.com/vuule)
+- Raise an error on import for unsupported GPUs. ([#15053](https://github.com/rapidsai/cudf/pull/15053)) [@bdice](https://github.com/bdice)
+- Align concat Series name behavior in pandas 2.2 ([#15032](https://github.com/rapidsai/cudf/pull/15032)) [@mroeschke](https://github.com/mroeschke)
+- Fix `Index.difference` to handle duplicate values when one of the inputs is empty ([#15016](https://github.com/rapidsai/cudf/pull/15016)) [@galipremsagar](https://github.com/galipremsagar)
+- Add `future_stack` to `DataFrame.stack` ([#15015](https://github.com/rapidsai/cudf/pull/15015)) [@galipremsagar](https://github.com/galipremsagar)
+- Fix handling of values=None in pylibcudf GroupBy.get_groups ([#14998](https://github.com/rapidsai/cudf/pull/14998)) [@shwina](https://github.com/shwina)
+- Fix `DataFrame.sort_index` to respect `ignore_index` on all axis ([#14995](https://github.com/rapidsai/cudf/pull/14995)) [@galipremsagar](https://github.com/galipremsagar)
+- Raise for pyarrow array that is tz-aware ([#14980](https://github.com/rapidsai/cudf/pull/14980)) [@mroeschke](https://github.com/mroeschke)
+- Direct ``SeriesGroupBy.aggregate`` to ``SeriesGroupBy.agg`` ([#14971](https://github.com/rapidsai/cudf/pull/14971)) [@rjzamora](https://github.com/rjzamora)
+- Respect IntervalDtype and CategoricalDtype objects passed by users ([#14961](https://github.com/rapidsai/cudf/pull/14961)) [@mroeschke](https://github.com/mroeschke)
+- unset `CUDF_SPILL` after a pytest ([#14958](https://github.com/rapidsai/cudf/pull/14958)) [@galipremsagar](https://github.com/galipremsagar)
+- Fix Null literals to be not parsed as string when mixed types as string is enabled in JSON reader ([#14939](https://github.com/rapidsai/cudf/pull/14939)) [@karthikeyann](https://github.com/karthikeyann)
+- Fix chunked reads of Parquet delta encoded pages ([#14921](https://github.com/rapidsai/cudf/pull/14921)) [@etseidl](https://github.com/etseidl)
+- Fix reading offset for data stream in ORC reader ([#14911](https://github.com/rapidsai/cudf/pull/14911)) [@ttnghia](https://github.com/ttnghia)
+- Enable sanitizer check for a test case testORCReadAndWriteForDecimal128 ([#14897](https://github.com/rapidsai/cudf/pull/14897)) [@res-life](https://github.com/res-life)
+- Fix dask token normalization ([#14829](https://github.com/rapidsai/cudf/pull/14829)) [@rjzamora](https://github.com/rjzamora)
+- Fix 24.04 versions ([#14825](https://github.com/rapidsai/cudf/pull/14825)) [@raydouglass](https://github.com/raydouglass)
+- Ensure slow private attrs are maybe proxies ([#14380](https://github.com/rapidsai/cudf/pull/14380)) [@mroeschke](https://github.com/mroeschke)
+
+## 📖 Documentation
+
+- Ignore DLManagedTensor in the docs build ([#15392](https://github.com/rapidsai/cudf/pull/15392)) [@davidwendt](https://github.com/davidwendt)
+- Revert &quot;Temporarily disable docs errors. ([#15265)&quot; (#15269](https://github.com/rapidsai/cudf/pull/15265)&quot; (#15269)) [@bdice](https://github.com/bdice)
+- Temporarily disable docs errors. ([#15265](https://github.com/rapidsai/cudf/pull/15265)) [@bdice](https://github.com/bdice)
+- Update `developer_guide.md` with new guidance on quoted internal includes ([#15238](https://github.com/rapidsai/cudf/pull/15238)) [@harrism](https://github.com/harrism)
+- Fix broken link for developer guide ([#15025](https://github.com/rapidsai/cudf/pull/15025)) [@sanjana098](https://github.com/sanjana098)
+- [DOC] Update typo in docs example of structs_column_wrapper ([#14949](https://github.com/rapidsai/cudf/pull/14949)) [@karthikeyann](https://github.com/karthikeyann)
+- Update cudf.pandas FAQ. ([#14940](https://github.com/rapidsai/cudf/pull/14940)) [@bdice](https://github.com/bdice)
+- Optimize doc builds ([#14856](https://github.com/rapidsai/cudf/pull/14856)) [@vyasr](https://github.com/vyasr)
+- Add developer guideline to use east const. ([#14836](https://github.com/rapidsai/cudf/pull/14836)) [@bdice](https://github.com/bdice)
+- Document how cuDF is pronounced ([#14753](https://github.com/rapidsai/cudf/pull/14753)) [@pentschev](https://github.com/pentschev)
+- Notes convert to Pandas-compat ([#12641](https://github.com/rapidsai/cudf/pull/12641)) [@Touutae-lab](https://github.com/Touutae-lab)
+
+## 🚀 New Features
+
+- Address inconsistency in single quote normalization in JSON reader ([#15324](https://github.com/rapidsai/cudf/pull/15324)) [@shrshi](https://github.com/shrshi)
+- Use JNI pinned pool resource with cuIO ([#15255](https://github.com/rapidsai/cudf/pull/15255)) [@abellina](https://github.com/abellina)
+- Add DELTA_BYTE_ARRAY encoder for Parquet ([#15239](https://github.com/rapidsai/cudf/pull/15239)) [@etseidl](https://github.com/etseidl)
+- Migrate filling operations to pylibcudf ([#15225](https://github.com/rapidsai/cudf/pull/15225)) [@brandon-b-miller](https://github.com/brandon-b-miller)
+- [JNI] rmm based pinned pool ([#15219](https://github.com/rapidsai/cudf/pull/15219)) [@abellina](https://github.com/abellina)
+- Implement zero-copy host buffer source instead of using an arrow implementation ([#15189](https://github.com/rapidsai/cudf/pull/15189)) [@vuule](https://github.com/vuule)
+- Enable creation of columns from scalar ([#15181](https://github.com/rapidsai/cudf/pull/15181)) [@vyasr](https://github.com/vyasr)
+- Use NVTX from GitHub. ([#15178](https://github.com/rapidsai/cudf/pull/15178)) [@bdice](https://github.com/bdice)
+- Implement `segmented_row_bit_count` for computing row sizes by segments of rows ([#15169](https://github.com/rapidsai/cudf/pull/15169)) [@ttnghia](https://github.com/ttnghia)
+- Implement search using pylibcudf ([#15166](https://github.com/rapidsai/cudf/pull/15166)) [@vyasr](https://github.com/vyasr)
+- Add distinct left join ([#15149](https://github.com/rapidsai/cudf/pull/15149)) [@PointKernel](https://github.com/PointKernel)
+- Add cardinality control for groupby benchs with flat types ([#15134](https://github.com/rapidsai/cudf/pull/15134)) [@PointKernel](https://github.com/PointKernel)
+- Add ability to request Parquet encodings on a per-column basis ([#15081](https://github.com/rapidsai/cudf/pull/15081)) [@etseidl](https://github.com/etseidl)
+- Automate include grouping order in .clang-format ([#15063](https://github.com/rapidsai/cudf/pull/15063)) [@harrism](https://github.com/harrism)
+- Requesting a clean build directory also clears Jitify cache ([#15052](https://github.com/rapidsai/cudf/pull/15052)) [@robertmaynard](https://github.com/robertmaynard)
+- API for JSON unquoted whitespace normalization ([#15033](https://github.com/rapidsai/cudf/pull/15033)) [@shrshi](https://github.com/shrshi)
+- Implement concatenate, lists.explode, merge, sorting, and stream compaction in pylibcudf ([#15011](https://github.com/rapidsai/cudf/pull/15011)) [@vyasr](https://github.com/vyasr)
+- Implement replace in pylibcudf ([#15005](https://github.com/rapidsai/cudf/pull/15005)) [@vyasr](https://github.com/vyasr)
+- Add distinct key inner join ([#14990](https://github.com/rapidsai/cudf/pull/14990)) [@PointKernel](https://github.com/PointKernel)
+- Implement rolling in pylibcudf ([#14982](https://github.com/rapidsai/cudf/pull/14982)) [@vyasr](https://github.com/vyasr)
+- Implement joins in pylibcudf ([#14972](https://github.com/rapidsai/cudf/pull/14972)) [@vyasr](https://github.com/vyasr)
+- Implement scans and reductions in pylibcudf ([#14970](https://github.com/rapidsai/cudf/pull/14970)) [@vyasr](https://github.com/vyasr)
+- Rewrite cudf internals using pylibcudf groupby ([#14946](https://github.com/rapidsai/cudf/pull/14946)) [@vyasr](https://github.com/vyasr)
+- Implement groupby in pylibcudf ([#14945](https://github.com/rapidsai/cudf/pull/14945)) [@vyasr](https://github.com/vyasr)
+- Support casting of Map type to string in JSON reader ([#14936](https://github.com/rapidsai/cudf/pull/14936)) [@karthikeyann](https://github.com/karthikeyann)
+- POC for whitespace removal in input JSON data using FST ([#14931](https://github.com/rapidsai/cudf/pull/14931)) [@shrshi](https://github.com/shrshi)
+- Support for LZ4 compression in ORC and Parquet ([#14906](https://github.com/rapidsai/cudf/pull/14906)) [@vuule](https://github.com/vuule)
+- Remove supports_streams from cuDF custom memory resources. ([#14857](https://github.com/rapidsai/cudf/pull/14857)) [@harrism](https://github.com/harrism)
+- Migrate unary operations to pylibcudf ([#14850](https://github.com/rapidsai/cudf/pull/14850)) [@vyasr](https://github.com/vyasr)
+- Migrate binary operations to pylibcudf ([#14821](https://github.com/rapidsai/cudf/pull/14821)) [@vyasr](https://github.com/vyasr)
+- Add row index and stripe size options to Python ORC chunked writer ([#14785](https://github.com/rapidsai/cudf/pull/14785)) [@vuule](https://github.com/vuule)
+- Support CUDA 12.2 ([#14712](https://github.com/rapidsai/cudf/pull/14712)) [@jameslamb](https://github.com/jameslamb)
+
+## 🛠️ Improvements
+
+- Use `conda env create --yes` instead of `--force` ([#15403](https://github.com/rapidsai/cudf/pull/15403)) [@bdice](https://github.com/bdice)
+- Restructure pylibcudf/arrow interop facilities ([#15325](https://github.com/rapidsai/cudf/pull/15325)) [@vyasr](https://github.com/vyasr)
+- Change exceptions thrown by copying APIs ([#15319](https://github.com/rapidsai/cudf/pull/15319)) [@vyasr](https://github.com/vyasr)
+- Enable branch testing for `cudf.pandas` ([#15316](https://github.com/rapidsai/cudf/pull/15316)) [@galipremsagar](https://github.com/galipremsagar)
+- Replace black with ruff-format ([#15312](https://github.com/rapidsai/cudf/pull/15312)) [@mroeschke](https://github.com/mroeschke)
+- This fixes an NPE when trying to read empty JSON data by adding a new API for missing information ([#15307](https://github.com/rapidsai/cudf/pull/15307)) [@revans2](https://github.com/revans2)
+- Address poor performance of Parquet string decoding ([#15304](https://github.com/rapidsai/cudf/pull/15304)) [@etseidl](https://github.com/etseidl)
+- Update script input name ([#15301](https://github.com/rapidsai/cudf/pull/15301)) [@AyodeAwe](https://github.com/AyodeAwe)
+- Make test_read_parquet_partitioned_filtered data deterministic ([#15296](https://github.com/rapidsai/cudf/pull/15296)) [@mroeschke](https://github.com/mroeschke)
+- Add timeout for `cudf.pandas` pandas tests ([#15284](https://github.com/rapidsai/cudf/pull/15284)) [@galipremsagar](https://github.com/galipremsagar)
+- Add upper bound to prevent usage of NumPy 2 ([#15283](https://github.com/rapidsai/cudf/pull/15283)) [@bdice](https://github.com/bdice)
+- Fix cudf::test::to_host return of host_vector ([#15263](https://github.com/rapidsai/cudf/pull/15263)) [@davidwendt](https://github.com/davidwendt)
+- Implement grouped product scan ([#15254](https://github.com/rapidsai/cudf/pull/15254)) [@wence-](https://github.com/wence-)
+- Add CUDA 12.4 to supported PTX versions ([#15247](https://github.com/rapidsai/cudf/pull/15247)) [@brandon-b-miller](https://github.com/brandon-b-miller)
+- Implement DataFrame|Series.squeeze ([#15244](https://github.com/rapidsai/cudf/pull/15244)) [@mroeschke](https://github.com/mroeschke)
+- Roll back ipow changes due to register pressure. ([#15242](https://github.com/rapidsai/cudf/pull/15242)) [@pmattione-nvidia](https://github.com/pmattione-nvidia)
+- Remove create_chars_child_column utility ([#15241](https://github.com/rapidsai/cudf/pull/15241)) [@davidwendt](https://github.com/davidwendt)
+- Update dlpack to version 0.8 ([#15237](https://github.com/rapidsai/cudf/pull/15237)) [@dantegd](https://github.com/dantegd)
+- Improve performance in JSON reader when `mixed_types_as_string` option is enabled ([#15236](https://github.com/rapidsai/cudf/pull/15236)) [@shrshi](https://github.com/shrshi)
+- Remove row conversion code from libcudf ([#15234](https://github.com/rapidsai/cudf/pull/15234)) [@ttnghia](https://github.com/ttnghia)
+- Use variable substitution for RAPIDS version in Doxyfile ([#15231](https://github.com/rapidsai/cudf/pull/15231)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Add ListColumns.to_pandas(arrow_type=) ([#15228](https://github.com/rapidsai/cudf/pull/15228)) [@mroeschke](https://github.com/mroeschke)
+- Treat dask-cudf CI artifacts as pure wheels ([#15223](https://github.com/rapidsai/cudf/pull/15223)) [@bdice](https://github.com/bdice)
+- Clean up usage of __CUDA_ARCH__ and other macros. ([#15218](https://github.com/rapidsai/cudf/pull/15218)) [@bdice](https://github.com/bdice)
+- DOC: use constants in performance-comparisons.ipynb ([#15215](https://github.com/rapidsai/cudf/pull/15215)) [@raybellwaves](https://github.com/raybellwaves)
+- Rewrite conversion in terms of column ([#15213](https://github.com/rapidsai/cudf/pull/15213)) [@vyasr](https://github.com/vyasr)
+- Switch `pytest-xdist` algo to `worksteal` ([#15207](https://github.com/rapidsai/cudf/pull/15207)) [@galipremsagar](https://github.com/galipremsagar)
+- Deprecate strings_column_view::offsets_begin() ([#15205](https://github.com/rapidsai/cudf/pull/15205)) [@davidwendt](https://github.com/davidwendt)
+- Add `get_upstream_resource` method to `stream_checking_resource_adaptor` ([#15203](https://github.com/rapidsai/cudf/pull/15203)) [@miscco](https://github.com/miscco)
+- Tune up row size estimation in the data generator ([#15202](https://github.com/rapidsai/cudf/pull/15202)) [@vuule](https://github.com/vuule)
+- Fix `offset` value for generating test data in `parquet_chunked_reader_test.cu` ([#15200](https://github.com/rapidsai/cudf/pull/15200)) [@ttnghia](https://github.com/ttnghia)
+- Change strings_column_view::char_size to return int64 ([#15197](https://github.com/rapidsai/cudf/pull/15197)) [@davidwendt](https://github.com/davidwendt)
+- Fix includes for row_operators.cuh ([#15194](https://github.com/rapidsai/cudf/pull/15194)) [@davidwendt](https://github.com/davidwendt)
+- Generalize GHA selectors for pure Python testing ([#15191](https://github.com/rapidsai/cudf/pull/15191)) [@bdice](https://github.com/bdice)
+- Improvements for `__cuda_array_interface__` tests ([#15188](https://github.com/rapidsai/cudf/pull/15188)) [@bdice](https://github.com/bdice)
+- Allow to_pandas to return pandas.ArrowDtype ([#15182](https://github.com/rapidsai/cudf/pull/15182)) [@mroeschke](https://github.com/mroeschke)
+- Ignore `byte_range` in `read_json` when the size is not smaller than the input data ([#15180](https://github.com/rapidsai/cudf/pull/15180)) [@vuule](https://github.com/vuule)
+- Expose new stable_sort and finish stream_compaction in pylibcudf ([#15175](https://github.com/rapidsai/cudf/pull/15175)) [@wence-](https://github.com/wence-)
+- [ci] update matrix filters for dask-cudf builds ([#15174](https://github.com/rapidsai/cudf/pull/15174)) [@jameslamb](https://github.com/jameslamb)
+- Change make_strings_children to return uvector ([#15171](https://github.com/rapidsai/cudf/pull/15171)) [@davidwendt](https://github.com/davidwendt)
+- Don&#39;t override to_pandas for Datelike columns ([#15167](https://github.com/rapidsai/cudf/pull/15167)) [@mroeschke](https://github.com/mroeschke)
+- Drop python-snappy from dependencies. ([#15161](https://github.com/rapidsai/cudf/pull/15161)) [@bdice](https://github.com/bdice)
+- Add microkernels for fixed-width and fixed-width dictionary in Parquet decode ([#15159](https://github.com/rapidsai/cudf/pull/15159)) [@abellina](https://github.com/abellina)
+- Make HostColumnVector.DataType accessor methods public ([#15157](https://github.com/rapidsai/cudf/pull/15157)) [@jbrennan333](https://github.com/jbrennan333)
+- Java bindings for left outer distinct join ([#15154](https://github.com/rapidsai/cudf/pull/15154)) [@jlowe](https://github.com/jlowe)
+- Forward-merge branch-24.02 to branch-24.04 ([#15153](https://github.com/rapidsai/cudf/pull/15153)) [@bdice](https://github.com/bdice)
+- Enable pandas pytests for `cudf.pandas` ([#15147](https://github.com/rapidsai/cudf/pull/15147)) [@galipremsagar](https://github.com/galipremsagar)
+- Add java option to keep quotes for JSON reads ([#15146](https://github.com/rapidsai/cudf/pull/15146)) [@revans2](https://github.com/revans2)
+- Change cross-pandas-version testing in `cudf` ([#15145](https://github.com/rapidsai/cudf/pull/15145)) [@galipremsagar](https://github.com/galipremsagar)
+- Use `hostdevice_vector` in `kernel_error`  to avoid the pageable copy ([#15140](https://github.com/rapidsai/cudf/pull/15140)) [@vuule](https://github.com/vuule)
+- Clean up Columns.astype &amp; cudf.dtype ([#15125](https://github.com/rapidsai/cudf/pull/15125)) [@mroeschke](https://github.com/mroeschke)
+- Simplify some to_pandas implementations ([#15123](https://github.com/rapidsai/cudf/pull/15123)) [@mroeschke](https://github.com/mroeschke)
+- Java: Add leak tracking for Scalar instances ([#15121](https://github.com/rapidsai/cudf/pull/15121)) [@jlowe](https://github.com/jlowe)
+- Remove calls to strings_column_view::offsets_begin() ([#15112](https://github.com/rapidsai/cudf/pull/15112)) [@davidwendt](https://github.com/davidwendt)
+- Add support for Python 3.11, require NumPy 1.23+ ([#15111](https://github.com/rapidsai/cudf/pull/15111)) [@jameslamb](https://github.com/jameslamb)
+- Compile-time ipow computation with array lookup ([#15110](https://github.com/rapidsai/cudf/pull/15110)) [@pmattione-nvidia](https://github.com/pmattione-nvidia)
+- Upgrade to `arrow-14.0.2` ([#15108](https://github.com/rapidsai/cudf/pull/15108)) [@galipremsagar](https://github.com/galipremsagar)
+- Dynamically set version in RAPIDS doc builds ([#15101](https://github.com/rapidsai/cudf/pull/15101)) [@jakirkham](https://github.com/jakirkham)
+- Add support for `pandas-2.2` in `cudf` ([#15100](https://github.com/rapidsai/cudf/pull/15100)) [@galipremsagar](https://github.com/galipremsagar)
+- Update devcontainers to CUDA Toolkit 12.2 ([#15099](https://github.com/rapidsai/cudf/pull/15099)) [@trxcllnt](https://github.com/trxcllnt)
+- Fix `datetime` binop pytest failures in pandas-2.2 ([#15090](https://github.com/rapidsai/cudf/pull/15090)) [@galipremsagar](https://github.com/galipremsagar)
+- Validate types in pylibcudf Column/Table constructors ([#15088](https://github.com/rapidsai/cudf/pull/15088)) [@wence-](https://github.com/wence-)
+- xfail test_join_ordering_pandas_compat for pandas 2.2 ([#15080](https://github.com/rapidsai/cudf/pull/15080)) [@mroeschke](https://github.com/mroeschke)
+- Add general purpose host memory allocator reference to cuIO with a demo of pooled-pinned allocation. ([#15079](https://github.com/rapidsai/cudf/pull/15079)) [@nvdbaranec](https://github.com/nvdbaranec)
+- Adjust test_binops for pandas 2.2 ([#15078](https://github.com/rapidsai/cudf/pull/15078)) [@mroeschke](https://github.com/mroeschke)
+- Remove offsets_begin() call from nvtext::generate_ngrams ([#15077](https://github.com/rapidsai/cudf/pull/15077)) [@davidwendt](https://github.com/davidwendt)
+- Use offsetalator in cudf::detail::has_nonempty_null_rows ([#15076](https://github.com/rapidsai/cudf/pull/15076)) [@davidwendt](https://github.com/davidwendt)
+- Deprecate cudf::hashing::spark_murmurhash3_x86_32 ([#15074](https://github.com/rapidsai/cudf/pull/15074)) [@davidwendt](https://github.com/davidwendt)
+- Fix cudf::test::to_host to handle both offset types for strings columns ([#15073](https://github.com/rapidsai/cudf/pull/15073)) [@davidwendt](https://github.com/davidwendt)
+- Add condition for test_groupby_nulls_basic in pandas 2.2 ([#15072](https://github.com/rapidsai/cudf/pull/15072)) [@mroeschke](https://github.com/mroeschke)
+- xfail tests in test_udf_masked_ops due to pandas 2.2 bug ([#15071](https://github.com/rapidsai/cudf/pull/15071)) [@mroeschke](https://github.com/mroeschke)
+- target branch-24.04 for GitHub Actions workflows ([#15069](https://github.com/rapidsai/cudf/pull/15069)) [@jameslamb](https://github.com/jameslamb)
+- Implement stable version of `cudf::sort` ([#15066](https://github.com/rapidsai/cudf/pull/15066)) [@wence-](https://github.com/wence-)
+- Fix ORC and JSON tests failures for pandas 2.2 ([#15062](https://github.com/rapidsai/cudf/pull/15062)) [@mroeschke](https://github.com/mroeschke)
+- Adjust test_joining for pandas 2.2 ([#15060](https://github.com/rapidsai/cudf/pull/15060)) [@mroeschke](https://github.com/mroeschke)
+- Align MultiIndex.get_indexder with pandas 2.2 change ([#15059](https://github.com/rapidsai/cudf/pull/15059)) [@mroeschke](https://github.com/mroeschke)
+- Fix test_resample index dtype checking for pandas 2.2 ([#15058](https://github.com/rapidsai/cudf/pull/15058)) [@mroeschke](https://github.com/mroeschke)
+- Split out strings/replace.cu and rework its gtests ([#15054](https://github.com/rapidsai/cudf/pull/15054)) [@davidwendt](https://github.com/davidwendt)
+- Avoid incompatible value type setting in test_rolling for pandas 2.2 ([#15050](https://github.com/rapidsai/cudf/pull/15050)) [@mroeschke](https://github.com/mroeschke)
+- Change chained replace inplace test to COW test for pandas 2.2 ([#15049](https://github.com/rapidsai/cudf/pull/15049)) [@mroeschke](https://github.com/mroeschke)
+- Deprecate datelike isin casting strings to dates to match pandas 2.2 ([#15046](https://github.com/rapidsai/cudf/pull/15046)) [@mroeschke](https://github.com/mroeschke)
+- Avoid chained indexing in test_indexing for pandas 2.2 ([#15045](https://github.com/rapidsai/cudf/pull/15045)) [@mroeschke](https://github.com/mroeschke)
+- Avoid pandas 2.2 `DeprecationWarning` in test_hdf ([#15044](https://github.com/rapidsai/cudf/pull/15044)) [@mroeschke](https://github.com/mroeschke)
+- Use appropriate make_offsets_child_column for building lists columns ([#15043](https://github.com/rapidsai/cudf/pull/15043)) [@davidwendt](https://github.com/davidwendt)
+- Factor out position-offsets logic from strings split_helper utility ([#15040](https://github.com/rapidsai/cudf/pull/15040)) [@davidwendt](https://github.com/davidwendt)
+- Forward-merge branch-24.02 to branch-24.04 ([#15039](https://github.com/rapidsai/cudf/pull/15039)) [@bdice](https://github.com/bdice)
+- Clean up nvtx macros ([#15038](https://github.com/rapidsai/cudf/pull/15038)) [@PointKernel](https://github.com/PointKernel)
+- Add xfailures for test_applymap for pandas 2.2 ([#15034](https://github.com/rapidsai/cudf/pull/15034)) [@mroeschke](https://github.com/mroeschke)
+- Expose libcudf filter expression in read_parquet ([#15028](https://github.com/rapidsai/cudf/pull/15028)) [@wence-](https://github.com/wence-)
+- Adjust tests in test_dataframe.py for pandas 2.2 ([#15023](https://github.com/rapidsai/cudf/pull/15023)) [@mroeschke](https://github.com/mroeschke)
+- Adjust test_datetime_infer_format for pandas 2.2 ([#15021](https://github.com/rapidsai/cudf/pull/15021)) [@mroeschke](https://github.com/mroeschke)
+- Performance optimizations for parquet sub-rowgroup reader. ([#15020](https://github.com/rapidsai/cudf/pull/15020)) [@nvdbaranec](https://github.com/nvdbaranec)
+- JNI bindings for distinct_hash_join ([#15019](https://github.com/rapidsai/cudf/pull/15019)) [@jlowe](https://github.com/jlowe)
+- Change copy_if_safe to call thrust instead of the overload function ([#15018](https://github.com/rapidsai/cudf/pull/15018)) [@davidwendt](https://github.com/davidwendt)
+- Improve performance of copy_if_else for long strings ([#15017](https://github.com/rapidsai/cudf/pull/15017)) [@davidwendt](https://github.com/davidwendt)
+- Fix is_string_dtype test for pandas 2.2 ([#15012](https://github.com/rapidsai/cudf/pull/15012)) [@mroeschke](https://github.com/mroeschke)
+- Rework cudf::strings::detail::copy_range for offsetalator ([#15010](https://github.com/rapidsai/cudf/pull/15010)) [@davidwendt](https://github.com/davidwendt)
+- Use offsetalator in cudf::get_json_object() ([#15009](https://github.com/rapidsai/cudf/pull/15009)) [@davidwendt](https://github.com/davidwendt)
+- Align integral types in ORC to specs ([#15008](https://github.com/rapidsai/cudf/pull/15008)) [@vuule](https://github.com/vuule)
+- Clean up detail sequence header inclusion ([#15007](https://github.com/rapidsai/cudf/pull/15007)) [@PointKernel](https://github.com/PointKernel)
+- Add groupby.apply(include_groups=) to match pandas 2.2 deprecation ([#15006](https://github.com/rapidsai/cudf/pull/15006)) [@mroeschke](https://github.com/mroeschke)
+- Use offsetalator in cudf::interleave_columns() ([#15004](https://github.com/rapidsai/cudf/pull/15004)) [@davidwendt](https://github.com/davidwendt)
+- Use offsetalator in cudf::row_bit_count() ([#15003](https://github.com/rapidsai/cudf/pull/15003)) [@davidwendt](https://github.com/davidwendt)
+- Use offsetalator in cudf::strings::wrap() ([#15002](https://github.com/rapidsai/cudf/pull/15002)) [@davidwendt](https://github.com/davidwendt)
+- Use offsetalator in cudf::strings::reverse ([#15001](https://github.com/rapidsai/cudf/pull/15001)) [@davidwendt](https://github.com/davidwendt)
+- Deprecate groupby fillna ([#15000](https://github.com/rapidsai/cudf/pull/15000)) [@mroeschke](https://github.com/mroeschke)
+- Ensure to_* IO methods respect pandas 2.2 keyword only deprecation ([#14999](https://github.com/rapidsai/cudf/pull/14999)) [@mroeschke](https://github.com/mroeschke)
+- Remove unneeded calls to create_chars_child_column utility ([#14997](https://github.com/rapidsai/cudf/pull/14997)) [@davidwendt](https://github.com/davidwendt)
+- Add environment-agnostic scripts for running ctests and pytests ([#14992](https://github.com/rapidsai/cudf/pull/14992)) [@trxcllnt](https://github.com/trxcllnt)
+- Filter all `DeprecationWarning`&#39;s by `ArrowTable.to_pandas()` ([#14989](https://github.com/rapidsai/cudf/pull/14989)) [@galipremsagar](https://github.com/galipremsagar)
+- Deprecate replace with categorical columns ([#14988](https://github.com/rapidsai/cudf/pull/14988)) [@mroeschke](https://github.com/mroeschke)
+- Deprecate delim_whitespace in read_csv for pandas 2.2 ([#14986](https://github.com/rapidsai/cudf/pull/14986)) [@mroeschke](https://github.com/mroeschke)
+- Deprecate parameters similar to pandas 2.2 ([#14984](https://github.com/rapidsai/cudf/pull/14984)) [@mroeschke](https://github.com/mroeschke)
+- Ensure that `ctest` is called with `--no-tests=error`. ([#14983](https://github.com/rapidsai/cudf/pull/14983)) [@bdice](https://github.com/bdice)
+- Deprecate non-integer `periods` in `date_range` and `interval_range` ([#14976](https://github.com/rapidsai/cudf/pull/14976)) [@galipremsagar](https://github.com/galipremsagar)
+- Update ops-bot.yaml ([#14974](https://github.com/rapidsai/cudf/pull/14974)) [@AyodeAwe](https://github.com/AyodeAwe)
+- Use page statistics in Parquet reader ([#14973](https://github.com/rapidsai/cudf/pull/14973)) [@etseidl](https://github.com/etseidl)
+- Use fused types for overloaded function signatures ([#14969](https://github.com/rapidsai/cudf/pull/14969)) [@vyasr](https://github.com/vyasr)
+- Deprecate certain frequency strings ([#14967](https://github.com/rapidsai/cudf/pull/14967)) [@galipremsagar](https://github.com/galipremsagar)
+- Update copyrights for 24.04. ([#14964](https://github.com/rapidsai/cudf/pull/14964)) [@bdice](https://github.com/bdice)
+- Add missing atomic operators, refactor atomic operators, move atomic operators to detail namespace. ([#14962](https://github.com/rapidsai/cudf/pull/14962)) [@bdice](https://github.com/bdice)
+- Introduce `GetJsonObjectOptions` in `getJSONObject` Java API ([#14956](https://github.com/rapidsai/cudf/pull/14956)) [@SurajAralihalli](https://github.com/SurajAralihalli)
+- JNI JSON read with DataSource and infered schema, along with basic java nested Schema JSON reads ([#14954](https://github.com/rapidsai/cudf/pull/14954)) [@revans2](https://github.com/revans2)
+- Make codecov only informational (always pass). ([#14952](https://github.com/rapidsai/cudf/pull/14952)) [@bdice](https://github.com/bdice)
+- Replace legacy cudf and dask_cudf imports as (d)gd ([#14944](https://github.com/rapidsai/cudf/pull/14944)) [@mroeschke](https://github.com/mroeschke)
+- Replace _is_datetime64tz/interval_dtype with isinstance ([#14943](https://github.com/rapidsai/cudf/pull/14943)) [@mroeschke](https://github.com/mroeschke)
+- Update tests for pandas 2. ([#14941](https://github.com/rapidsai/cudf/pull/14941)) [@bdice](https://github.com/bdice)
+- Use more public pandas APIs ([#14929](https://github.com/rapidsai/cudf/pull/14929)) [@mroeschke](https://github.com/mroeschke)
+- Replace local copyright check with pre-commit-hooks verify-copyright ([#14917](https://github.com/rapidsai/cudf/pull/14917)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Add `pandas-2.x` support in `cudf` ([#14916](https://github.com/rapidsai/cudf/pull/14916)) [@galipremsagar](https://github.com/galipremsagar)
+- Use offsetalator in nvtext::byte_pair_encoding ([#14888](https://github.com/rapidsai/cudf/pull/14888)) [@davidwendt](https://github.com/davidwendt)
+- De-DOS line-endings ([#14880](https://github.com/rapidsai/cudf/pull/14880)) [@wence-](https://github.com/wence-)
+- Add detail `cuco_allocator` ([#14877](https://github.com/rapidsai/cudf/pull/14877)) [@PointKernel](https://github.com/PointKernel)
+- Move all core types to using enum class in Cython ([#14876](https://github.com/rapidsai/cudf/pull/14876)) [@vyasr](https://github.com/vyasr)
+- Read `cudf.__version__` in Sphinx build ([#14872](https://github.com/rapidsai/cudf/pull/14872)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Use int64 offset types for accessing code-points in nvtext::normalize ([#14868](https://github.com/rapidsai/cudf/pull/14868)) [@davidwendt](https://github.com/davidwendt)
+- Read version from VERSION file in CMake ([#14867](https://github.com/rapidsai/cudf/pull/14867)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Update conda-cpp-post-build-checks to branch-24.04. ([#14854](https://github.com/rapidsai/cudf/pull/14854)) [@bdice](https://github.com/bdice)
+- Update cudf for compatibility with the latest cuco ([#14849](https://github.com/rapidsai/cudf/pull/14849)) [@PointKernel](https://github.com/PointKernel)
+- Remove deprecated strings functions ([#14848](https://github.com/rapidsai/cudf/pull/14848)) [@davidwendt](https://github.com/davidwendt)
+- Fix CI workflows for pandas-tests and add test summary. ([#14847](https://github.com/rapidsai/cudf/pull/14847)) [@bdice](https://github.com/bdice)
+- Use offsetalator in cudf::strings::copy_slice ([#14844](https://github.com/rapidsai/cudf/pull/14844)) [@davidwendt](https://github.com/davidwendt)
+- Fix V2 Parquet page alignment for use with zStandard compression ([#14841](https://github.com/rapidsai/cudf/pull/14841)) [@etseidl](https://github.com/etseidl)
+- Fix calls to deprecated strings factory API in examples. ([#14838](https://github.com/rapidsai/cudf/pull/14838)) [@bdice](https://github.com/bdice)
+- Update pre-commit hooks ([#14837](https://github.com/rapidsai/cudf/pull/14837)) [@bdice](https://github.com/bdice)
+- Use `rapids_cuda_set_runtime` to determine cuda runtime usage by target ([#14833](https://github.com/rapidsai/cudf/pull/14833)) [@vyasr](https://github.com/vyasr)
+- Remove get_mem_info functions from custom memory resources ([#14832](https://github.com/rapidsai/cudf/pull/14832)) [@harrism](https://github.com/harrism)
+- Fix debug build by splitting row_operator_tests_utilities.cu ([#14826](https://github.com/rapidsai/cudf/pull/14826)) [@davidwendt](https://github.com/davidwendt)
+- Remove -DNVBench_ENABLE_CUPTI=OFF. ([#14820](https://github.com/rapidsai/cudf/pull/14820)) [@bdice](https://github.com/bdice)
+- Use cuco::static_set in the hash-based groupby ([#14813](https://github.com/rapidsai/cudf/pull/14813)) [@PointKernel](https://github.com/PointKernel)
+- Branch 24.04 merge branch 24.02 ([#14809](https://github.com/rapidsai/cudf/pull/14809)) [@vyasr](https://github.com/vyasr)
+- Branch 24.04 merge branch 24.02 ([#14806](https://github.com/rapidsai/cudf/pull/14806)) [@vyasr](https://github.com/vyasr)
+- Introduce basic &quot;cudf&quot; backend for Dask Expressions ([#14805](https://github.com/rapidsai/cudf/pull/14805)) [@rjzamora](https://github.com/rjzamora)
+- Remove `build_struct|list_column` ([#14786](https://github.com/rapidsai/cudf/pull/14786)) [@mroeschke](https://github.com/mroeschke)
+- Use offsetalator in nvtext tokenize functions ([#14783](https://github.com/rapidsai/cudf/pull/14783)) [@davidwendt](https://github.com/davidwendt)
+- Reduce execution time of Python ORC tests ([#14776](https://github.com/rapidsai/cudf/pull/14776)) [@vuule](https://github.com/vuule)
+- Use offsetalator in cudf::strings::split functions ([#14757](https://github.com/rapidsai/cudf/pull/14757)) [@davidwendt](https://github.com/davidwendt)
+- Use offsetalator in cudf::strings::findall ([#14745](https://github.com/rapidsai/cudf/pull/14745)) [@davidwendt](https://github.com/davidwendt)
+- Use offsetalator in cudf::strings::url_decode ([#14744](https://github.com/rapidsai/cudf/pull/14744)) [@davidwendt](https://github.com/davidwendt)
+- Use get_offset_value utility in strings shift function ([#14743](https://github.com/rapidsai/cudf/pull/14743)) [@davidwendt](https://github.com/davidwendt)
+- Use as_column instead of full ([#14698](https://github.com/rapidsai/cudf/pull/14698)) [@mroeschke](https://github.com/mroeschke)
+- List all notable breaking changes ([#13535](https://github.com/rapidsai/cudf/pull/13535)) [@galipremsagar](https://github.com/galipremsagar)
+
 # cuDF 24.02.00 (12 Feb 2024)
 
 ## 🚨 Breaking Changes

From 460b41edadc90a43b02b1f1e7dc23190cc14d0b4 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 10 Apr 2024 05:47:58 -1000
Subject: [PATCH 042/842] Use less _is_categorical_dtype (#15148)

Rehash of https://github.com/rapidsai/cudf/pull/14942

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15148
---
 python/cudf/cudf/_fuzz_testing/csv.py  |  2 +-
 python/cudf/cudf/_fuzz_testing/json.py |  2 +-
 python/cudf/cudf/_lib/csv.pyx          | 15 +++---
 python/cudf/cudf/core/column/column.py |  7 +--
 python/cudf/cudf/core/dtypes.py        | 10 +++-
 python/cudf/cudf/testing/testing.py    | 24 +++++-----
 python/cudf/cudf/tests/test_column.py  |  4 +-
 python/cudf/cudf/tests/test_concat.py  | 66 ++++++++------------------
 python/cudf/cudf/tests/test_csv.py     | 22 +++++++--
 python/cudf/cudf/utils/dtypes.py       |  4 +-
 10 files changed, 78 insertions(+), 78 deletions(-)

diff --git a/python/cudf/cudf/_fuzz_testing/csv.py b/python/cudf/cudf/_fuzz_testing/csv.py
index 5b49143fd5a..67211a1c4bf 100644
--- a/python/cudf/cudf/_fuzz_testing/csv.py
+++ b/python/cudf/cudf/_fuzz_testing/csv.py
@@ -99,7 +99,7 @@ def set_rand_params(self, params):
                     if dtype_val is not None:
                         dtype_val = {
                             col_name: "category"
-                            if cudf.utils.dtypes._is_categorical_dtype(dtype)
+                            if isinstance(dtype, cudf.CategoricalDtype)
                             else pandas_dtypes_to_np_dtypes[dtype]
                             for col_name, dtype in dtype_val.items()
                         }
diff --git a/python/cudf/cudf/_fuzz_testing/json.py b/python/cudf/cudf/_fuzz_testing/json.py
index bffd508b2ef..e987529c8ba 100644
--- a/python/cudf/cudf/_fuzz_testing/json.py
+++ b/python/cudf/cudf/_fuzz_testing/json.py
@@ -27,7 +27,7 @@ def _get_dtype_param_value(dtype_val):
     if dtype_val is not None and isinstance(dtype_val, abc.Mapping):
         processed_dtypes = {}
         for col_name, dtype in dtype_val.items():
-            if cudf.utils.dtypes._is_categorical_dtype(dtype):
+            if isinstance(dtype, cudf.CategoricalDtype):
                 processed_dtypes[col_name] = "category"
             else:
                 processed_dtypes[col_name] = str(
diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx
index 0f0bc3ce81a..b2e4d442bd2 100644
--- a/python/cudf/cudf/_lib/csv.pyx
+++ b/python/cudf/cudf/_lib/csv.pyx
@@ -434,7 +434,7 @@ def read_csv(
     if dtype is not None:
         if isinstance(dtype, abc.Mapping):
             for k, v in dtype.items():
-                if cudf.api.types._is_categorical_dtype(v):
+                if isinstance(cudf.dtype(v), cudf.CategoricalDtype):
                     df._data[str(k)] = df._data[str(k)].astype(v)
         elif (
             cudf.api.types.is_scalar(dtype) or
@@ -442,11 +442,11 @@ def read_csv(
                 np.dtype, pd.api.extensions.ExtensionDtype, type
             ))
         ):
-            if cudf.api.types._is_categorical_dtype(dtype):
+            if isinstance(cudf.dtype(dtype), cudf.CategoricalDtype):
                 df = df.astype(dtype)
         elif isinstance(dtype, abc.Collection):
             for index, col_dtype in enumerate(dtype):
-                if cudf.api.types._is_categorical_dtype(col_dtype):
+                if isinstance(cudf.dtype(col_dtype), cudf.CategoricalDtype):
                     col_name = df._data.names[index]
                     df._data[col_name] = df._data[col_name].astype(col_dtype)
 
@@ -554,11 +554,10 @@ cdef data_type _get_cudf_data_type_from_dtype(object dtype) except *:
     # TODO: Remove this work-around Dictionary types
     # in libcudf are fully mapped to categorical columns:
     # https://github.com/rapidsai/cudf/issues/3960
-    if cudf.api.types._is_categorical_dtype(dtype):
-        if isinstance(dtype, str):
-            dtype = "str"
-        else:
-            dtype = dtype.categories.dtype
+    if isinstance(dtype, cudf.CategoricalDtype):
+        dtype = dtype.categories.dtype
+    elif dtype == "category":
+        dtype = "str"
 
     if isinstance(dtype, str):
         if str(dtype) == "date32":
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 67f44ad2f48..c8a6493ddda 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -52,7 +52,6 @@
 from cudf._lib.types import size_type_dtype
 from cudf._typing import ColumnLike, Dtype, ScalarLike
 from cudf.api.types import (
-    _is_categorical_dtype,
     _is_non_decimal_numeric_dtype,
     _is_pandas_nullable_extension_dtype,
     infer_dtype,
@@ -1381,7 +1380,7 @@ def column_empty_like(
 
     if (
         hasattr(column, "dtype")
-        and _is_categorical_dtype(column.dtype)
+        and isinstance(column.dtype, cudf.CategoricalDtype)
         and dtype == column.dtype
     ):
         catcolumn = cast("cudf.core.column.CategoricalColumn", column)
@@ -2008,7 +2007,9 @@ def as_column(
             length = 1
         elif length < 0:
             raise ValueError(f"{length=} must be >=0.")
-        if isinstance(arbitrary, pd.Interval) or _is_categorical_dtype(dtype):
+        if isinstance(
+            arbitrary, pd.Interval
+        ) or cudf.api.types._is_categorical_dtype(dtype):
             # No cudf.Scalar support yet
             return as_column(
                 pd.Series([arbitrary] * length),
diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index 73617763221..9bb1995b836 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -51,6 +51,11 @@ def dtype(arbitrary):
             raise TypeError(f"Unsupported type {np_dtype}")
         return np_dtype
 
+    if isinstance(arbitrary, str) and arbitrary in {"hex", "hex32", "hex64"}:
+        # read_csv only accepts "hex"
+        # e.g. test_csv_reader_hexadecimals, test_csv_reader_hexadecimal_overflow
+        return arbitrary
+
     # use `pandas_dtype` to try and interpret
     # `arbitrary` as a Pandas extension type.
     #  Return the corresponding NumPy/cuDF type.
@@ -999,7 +1004,10 @@ def _is_categorical_dtype(obj):
             pd.Series,
         ),
     ):
-        return _is_categorical_dtype(obj.dtype)
+        try:
+            return isinstance(cudf.dtype(obj.dtype), cudf.CategoricalDtype)
+        except TypeError:
+            return False
     if hasattr(obj, "type"):
         if obj.type is pd.CategoricalDtype.type:
             return True
diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py
index fc253c5c197..dffbbe92fc1 100644
--- a/python/cudf/cudf/testing/testing.py
+++ b/python/cudf/cudf/testing/testing.py
@@ -8,11 +8,7 @@
 
 import cudf
 from cudf._lib.unary import is_nan
-from cudf.api.types import (
-    _is_categorical_dtype,
-    is_numeric_dtype,
-    is_string_dtype,
-)
+from cudf.api.types import is_numeric_dtype, is_string_dtype
 from cudf.core.missing import NA, NaT
 
 
@@ -86,7 +82,7 @@ def _check_types(
     if (
         exact
         and not isinstance(left, cudf.MultiIndex)
-        and _is_categorical_dtype(left)
+        and isinstance(left.dtype, cudf.CategoricalDtype)
     ):
         if left.dtype != right.dtype:
             raise_assert_detail(
@@ -144,8 +140,8 @@ def assert_column_equal(
     """
     if check_dtype is True:
         if (
-            _is_categorical_dtype(left)
-            and _is_categorical_dtype(right)
+            isinstance(left.dtype, cudf.CategoricalDtype)
+            and isinstance(right.dtype, cudf.CategoricalDtype)
             and not check_categorical
         ):
             pass
@@ -173,7 +169,9 @@ def assert_column_equal(
             return
 
     if check_exact and check_categorical:
-        if _is_categorical_dtype(left) and _is_categorical_dtype(right):
+        if isinstance(left.dtype, cudf.CategoricalDtype) and isinstance(
+            right.dtype, cudf.CategoricalDtype
+        ):
             left_cat = left.categories
             right_cat = right.categories
 
@@ -207,8 +205,8 @@ def assert_column_equal(
 
     if (
         not check_dtype
-        and _is_categorical_dtype(left)
-        and _is_categorical_dtype(right)
+        and isinstance(left.dtype, cudf.CategoricalDtype)
+        and isinstance(right.dtype, cudf.CategoricalDtype)
     ):
         left = left.astype(left.categories.dtype)
         right = right.astype(right.categories.dtype)
@@ -258,7 +256,9 @@ def assert_column_equal(
                 raise e
             else:
                 columns_equal = False
-            if _is_categorical_dtype(left) and _is_categorical_dtype(right):
+            if isinstance(left.dtype, cudf.CategoricalDtype) and isinstance(
+                right.dtype, cudf.CategoricalDtype
+            ):
                 left = left.astype(left.categories.dtype)
                 right = right.astype(right.categories.dtype)
     if not columns_equal:
diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py
index 2f70f955fa9..dace8009041 100644
--- a/python/cudf/cudf/tests/test_column.py
+++ b/python/cudf/cudf/tests/test_column.py
@@ -81,7 +81,7 @@ def test_column_offset_and_size(pandas_input, offset, size):
         children=col.base_children,
     )
 
-    if cudf.api.types._is_categorical_dtype(col.dtype):
+    if isinstance(col.dtype, cudf.CategoricalDtype):
         assert col.size == col.codes.size
         assert col.size == (col.codes.data.size / col.codes.dtype.itemsize)
     elif cudf.api.types.is_string_dtype(col.dtype):
@@ -120,7 +120,7 @@ def column_slicing_test(col, offset, size, cast_to_float=False):
     else:
         pd_series = series.to_pandas()
 
-    if cudf.api.types._is_categorical_dtype(col.dtype):
+    if isinstance(col.dtype, cudf.CategoricalDtype):
         # The cudf.Series is constructed from an already sliced column, whereas
         # the pandas.Series is constructed from the unsliced series and then
         # sliced, so the indexes should be different and we must ignore it.
diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py
index 3d638da924b..87b3beb5589 100644
--- a/python/cudf/cudf/tests/test_concat.py
+++ b/python/cudf/cudf/tests/test_concat.py
@@ -9,7 +9,6 @@
 import pytest
 
 import cudf
-from cudf.api.types import _is_categorical_dtype
 from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype
 from cudf.testing._utils import (
     assert_eq,
@@ -609,8 +608,8 @@ def test_concat_empty_dataframes(df, other, ignore_index):
     actual = cudf.concat(other_gd, ignore_index=ignore_index)
     if expected.shape != df.shape:
         for key, col in actual[actual.columns].items():
-            if _is_categorical_dtype(col.dtype):
-                if not _is_categorical_dtype(expected[key].dtype):
+            if isinstance(col.dtype, cudf.CategoricalDtype):
+                if not isinstance(expected[key].dtype, pd.CategoricalDtype):
                     # TODO: Pandas bug:
                     # https://github.com/pandas-dev/pandas/issues/42840
                     expected[key] = expected[key].fillna("-1").astype("str")
@@ -1195,10 +1194,10 @@ def test_concat_join_series(ignore_index, sort, join, axis):
 @pytest.mark.parametrize("ignore_index", [True, False])
 @pytest.mark.parametrize("sort", [True, False])
 @pytest.mark.parametrize("join", ["inner", "outer"])
-@pytest.mark.parametrize("axis", [0])
 def test_concat_join_empty_dataframes(
-    df, other, ignore_index, axis, join, sort
+    request, df, other, ignore_index, join, sort
 ):
+    axis = 0
     other_pd = [df] + other
     gdf = cudf.from_pandas(df)
     other_gd = [gdf] + [cudf.from_pandas(o) for o in other]
@@ -1209,50 +1208,27 @@ def test_concat_join_empty_dataframes(
     actual = cudf.concat(
         other_gd, ignore_index=ignore_index, axis=axis, join=join, sort=sort
     )
-    if expected.shape != df.shape:
-        if axis == 0:
-            for key, col in actual[actual.columns].items():
-                if _is_categorical_dtype(col.dtype):
-                    if not _is_categorical_dtype(expected[key].dtype):
-                        # TODO: Pandas bug:
-                        # https://github.com/pandas-dev/pandas/issues/42840
-                        expected[key] = (
-                            expected[key].fillna("-1").astype("str")
-                        )
-                    else:
-                        expected[key] = (
-                            expected[key]
-                            .cat.add_categories(["-1"])
-                            .fillna("-1")
-                            .astype("str")
-                        )
-                    actual[key] = col.astype("str").fillna("-1")
-                else:
-                    expected[key] = expected[key].fillna(-1)
-                    actual[key] = col.fillna(-1)
-
-            assert_eq(
-                expected.fillna(-1),
-                actual.fillna(-1),
-                check_dtype=False,
-                check_index_type=False
-                if len(expected) == 0 or actual.empty
-                else True,
-                check_column_type=False,
-            )
-        else:
-            # no need to fill in if axis=1
-            assert_eq(
-                expected,
-                actual,
-                check_index_type=False,
-                check_column_type=False,
+    if (
+        join == "outer"
+        and any(
+            isinstance(dtype, pd.CategoricalDtype)
+            for dtype in df.dtypes.tolist()
+        )
+        and any(
+            isinstance(dtype, pd.CategoricalDtype)
+            for other_df in other
+            for dtype in other_df.dtypes.tolist()
+        )
+    ):
+        request.applymarker(
+            pytest.mark.xfail(
+                reason="https://github.com/pandas-dev/pandas/issues/42840"
             )
+        )
     assert_eq(
         expected,
         actual,
         check_dtype=False,
-        check_index_type=False,
         check_column_type=False,
     )
 
@@ -1332,7 +1308,7 @@ def test_concat_join_empty_dataframes_axis_1(
     if expected.shape != df.shape:
         if axis == 0:
             for key, col in actual[actual.columns].items():
-                if _is_categorical_dtype(col.dtype):
+                if isinstance(expected[key].dtype, pd.CategoricalDtype):
                     expected[key] = expected[key].fillna("-1")
                     actual[key] = col.astype("str").fillna("-1")
             # if not expected.empty:
diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
index 2d728fb94ba..5009a7f2628 100644
--- a/python/cudf/cudf/tests/test_csv.py
+++ b/python/cudf/cudf/tests/test_csv.py
@@ -272,14 +272,30 @@ def test_csv_reader_mixed_data_delimiter_sep(
     gdf1 = read_csv(
         str(fname),
         names=["1", "2", "3", "4", "5", "6", "7"],
-        dtype=["int64", "date", "float64", "int64", "category", "str", "bool"],
+        dtype=[
+            "int64",
+            "datetime64[ns]",
+            "float64",
+            "int64",
+            "category",
+            "str",
+            "bool",
+        ],
         dayfirst=True,
         **cudf_arg,
     )
     gdf2 = read_csv(
         str(fname),
         names=["1", "2", "3", "4", "5", "6", "7"],
-        dtype=["int64", "date", "float64", "int64", "category", "str", "bool"],
+        dtype=[
+            "int64",
+            "datetime64[ns]",
+            "float64",
+            "int64",
+            "category",
+            "str",
+            "bool",
+        ],
         dayfirst=True,
         **pandas_arg,
     )
@@ -368,7 +384,7 @@ def test_csv_reader_skiprows_skipfooter(tmpdir, pd_mixed_dataframe):
     out = read_csv(
         str(fname),
         names=["1", "2", "3"],
-        dtype=["int64", "date", "float64"],
+        dtype=["int64", "datetime64[ns]", "float64"],
         skiprows=1,
         skipfooter=1,
         dayfirst=True,
diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py
index 8521239413e..a33b5ca139c 100644
--- a/python/cudf/cudf/utils/dtypes.py
+++ b/python/cudf/cudf/utils/dtypes.py
@@ -392,9 +392,9 @@ def get_min_float_dtype(col):
 
 
 def is_mixed_with_object_dtype(lhs, rhs):
-    if cudf.api.types._is_categorical_dtype(lhs.dtype):
+    if isinstance(lhs.dtype, cudf.CategoricalDtype):
         return is_mixed_with_object_dtype(lhs.dtype.categories, rhs)
-    elif cudf.api.types._is_categorical_dtype(rhs.dtype):
+    elif isinstance(rhs.dtype, cudf.CategoricalDtype):
         return is_mixed_with_object_dtype(lhs, rhs.dtype.categories)
 
     return (lhs.dtype == "object" and rhs.dtype != "object") or (

From 888e9d5c38cb27402313681744b87462846bc405 Mon Sep 17 00:00:00 2001
From: Paul Mattione <156858817+pmattione-nvidia@users.noreply.github.com>
Date: Wed, 10 Apr 2024 17:56:10 -0400
Subject: [PATCH 043/842] Floating <--> fixed-point conversion must now be
 called explicitly (#15438)

This change makes it so fixed_point objects can no longer be constructed with floating point values, and can no longer be casted to floating point values.  Instead the functions added to unary.hpp must be explicitly called.

In addition to making it more clear when and where these conversions are occurring, this also makes it so that the low-level fixed_point.hpp header won't be inundated with all of the complex lossless conversion code to come.

Authors:
  - Paul Mattione (https://github.com/pmattione-nvidia)

Approvers:
  - Karthikeyan (https://github.com/karthikeyann)
  - Shruti Shivakumar (https://github.com/shrshi)
  - Mark Harris (https://github.com/harrism)

URL: https://github.com/rapidsai/cudf/pull/15438
---
 cpp/include/cudf/fixed_point/fixed_point.hpp  |  49 +----
 cpp/include/cudf/unary.hpp                    |  75 ++++++-
 cpp/include/cudf/utilities/traits.hpp         |   7 +-
 cpp/src/binaryop/compiled/binary_ops.cuh      |  19 +-
 cpp/src/quantiles/quantiles_util.hpp          |   9 +-
 .../quantiles/tdigest/tdigest_aggregation.cu  |  14 +-
 cpp/src/unary/cast_ops.cu                     |  16 +-
 cpp/tests/fixed_point/fixed_point_tests.cpp   | 189 +++++++++---------
 cpp/tests/io/orc_test.cpp                     |   2 +-
 9 files changed, 219 insertions(+), 161 deletions(-)

diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp
index 4445af6c5a8..e39d75757e8 100644
--- a/cpp/include/cudf/fixed_point/fixed_point.hpp
+++ b/cpp/include/cudf/fixed_point/fixed_point.hpp
@@ -67,18 +67,6 @@ constexpr inline auto is_supported_representation_type()
          cuda::std::is_same_v<T, __int128_t>;
 }
 
-/**
- * @brief Returns `true` if the value type is supported for constructing a `fixed_point`
- *
- * @tparam T The construction value type
- * @return `true` if the value type is supported to construct a `fixed_point` type
- */
-template <typename T>
-constexpr inline auto is_supported_construction_value_type()
-{
-  return cuda::std::is_integral<T>() || cuda::std::is_floating_point_v<T>;
-}
-
 /** @} */  // end of group
 
 // Helper functions for `fixed_point` type
@@ -222,23 +210,8 @@ class fixed_point {
   scale_type _scale;
 
  public:
-  using rep = Rep;  ///< The representation type
-
-  /**
-   * @brief Constructor that will perform shifting to store value appropriately (from floating point
-   * types)
-   *
-   * @tparam T The floating point type that you are constructing from
-   * @param value The value that will be constructed from
-   * @param scale The exponent that is applied to Rad to perform shifting
-   */
-  template <typename T,
-            typename cuda::std::enable_if_t<cuda::std::is_floating_point<T>() &&
-                                            is_supported_representation_type<Rep>()>* = nullptr>
-  CUDF_HOST_DEVICE inline explicit fixed_point(T const& value, scale_type const& scale)
-    : _value{static_cast<Rep>(detail::shift<Rep, Rad>(value, scale))}, _scale{scale}
-  {
-  }
+  using rep                 = Rep;  ///< The representation type
+  static constexpr auto rad = Rad;  ///< The base
 
   /**
    * @brief Constructor that will perform shifting to store value appropriately (from integral
@@ -249,7 +222,7 @@ class fixed_point {
    * @param scale The exponent that is applied to Rad to perform shifting
    */
   template <typename T,
-            typename cuda::std::enable_if_t<cuda::std::is_integral<T>() &&
+            typename cuda::std::enable_if_t<cuda::std::is_integral_v<T> &&
                                             is_supported_representation_type<Rep>()>* = nullptr>
   CUDF_HOST_DEVICE inline explicit fixed_point(T const& value, scale_type const& scale)
     // `value` is cast to `Rep` to avoid overflow in cases where
@@ -275,8 +248,7 @@ class fixed_point {
    * @tparam T The value type being constructing from
    * @param value The value that will be constructed from
    */
-  template <typename T,
-            typename cuda::std::enable_if_t<is_supported_construction_value_type<T>()>* = nullptr>
+  template <typename T, typename cuda::std::enable_if_t<cuda::std::is_integral_v<T>>* = nullptr>
   CUDF_HOST_DEVICE inline fixed_point(T const& value)
     : _value{static_cast<Rep>(value)}, _scale{scale_type{0}}
   {
@@ -288,19 +260,6 @@ class fixed_point {
    */
   CUDF_HOST_DEVICE inline fixed_point() : _scale{scale_type{0}} {}
 
-  /**
-   * @brief Explicit conversion operator for casting to floating point types
-   *
-   * @tparam U The floating point type that is being explicitly converted to
-   * @return The `fixed_point` number in base 10 (aka human readable format)
-   */
-  template <typename U,
-            typename cuda::std::enable_if_t<cuda::std::is_floating_point_v<U>>* = nullptr>
-  explicit constexpr operator U() const
-  {
-    return detail::shift<Rep, Rad>(static_cast<U>(_value), scale_type{-_scale});
-  }
-
   /**
    * @brief Explicit conversion operator for casting to integral types
    *
diff --git a/cpp/include/cudf/unary.hpp b/cpp/include/cudf/unary.hpp
index 64e802d88dd..5ded22488c7 100644
--- a/cpp/include/cudf/unary.hpp
+++ b/cpp/include/cudf/unary.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,8 +16,10 @@
 
 #pragma once
 
+#include <cudf/fixed_point/fixed_point.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/traits.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
 
@@ -31,6 +33,77 @@ namespace cudf {
  * @brief Column APIs for unary ops
  */
 
+/**
+ * @brief Convert a floating-point value to fixed point
+ *
+ * @note This conversion was moved from fixed-point member functions to free functions.
+ * This is so that the complex conversion code is not included into many parts of the
+ * code base that don't need it, and so that it's more obvious to pinpoint where these
+ * conversions are occurring.
+ *
+ * @tparam Fixed The fixed-point type to convert to
+ * @tparam Floating The floating-point type to convert from
+ * @param floating The floating-point value to convert
+ * @param scale The desired scale of the fixed-point value
+ * @return The converted fixed-point value
+ */
+template <typename Fixed,
+          typename Floating,
+          typename cuda::std::enable_if_t<is_fixed_point<Fixed>() &&
+                                          cuda::std::is_floating_point_v<Floating>>* = nullptr>
+CUDF_HOST_DEVICE Fixed convert_floating_to_fixed(Floating floating, numeric::scale_type scale)
+{
+  using Rep          = typename Fixed::rep;
+  auto const shifted = numeric::detail::shift<Rep, Fixed::rad>(floating, scale);
+  numeric::scaled_integer<Rep> scaled{static_cast<Rep>(shifted), scale};
+  return Fixed(scaled);
+}
+
+/**
+ * @brief Convert a fixed-point value to floating point
+ *
+ * @note This conversion was moved from fixed-point member functions to free functions.
+ * This is so that the complex conversion code is not included into many parts of the
+ * code base that don't need it, and so that it's more obvious to pinpoint where these
+ * conversions are occurring.
+ *
+ * @tparam Floating The floating-point type to convert to
+ * @tparam Fixed The fixed-point type to convert from
+ * @param fixed The fixed-point value to convert
+ * @return The converted floating-point value
+ */
+template <typename Floating,
+          typename Fixed,
+          typename cuda::std::enable_if_t<cuda::std::is_floating_point_v<Floating> &&
+                                          is_fixed_point<Fixed>()>* = nullptr>
+CUDF_HOST_DEVICE Floating convert_fixed_to_floating(Fixed fixed)
+{
+  using Rep         = typename Fixed::rep;
+  auto const casted = static_cast<Floating>(fixed.value());
+  auto const scale  = numeric::scale_type{-fixed.scale()};
+  return numeric::detail::shift<Rep, Fixed::rad>(casted, scale);
+}
+
+/**
+ * @brief Convert a value to floating point
+ *
+ * @tparam Floating The floating-point type to convert to
+ * @tparam Input The input type to convert from
+ * @param input The input value to convert
+ * @return The converted floating-point value
+ */
+template <typename Floating,
+          typename Input,
+          typename cuda::std::enable_if_t<cuda::std::is_floating_point_v<Floating>>* = nullptr>
+CUDF_HOST_DEVICE Floating convert_to_floating(Input input)
+{
+  if constexpr (is_fixed_point<Input>()) {
+    return convert_fixed_to_floating<Floating>(input);
+  } else {
+    return static_cast<Floating>(input);
+  }
+}
+
 /**
  * @brief Types of unary operations that can be performed on data.
  */
diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp
index 2dda0740b96..d191e44228a 100644
--- a/cpp/include/cudf/utilities/traits.hpp
+++ b/cpp/include/cudf/utilities/traits.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -397,7 +397,10 @@ template <typename T>
 constexpr inline bool is_fixed_point()
 {
   return std::is_same_v<numeric::decimal32, T> || std::is_same_v<numeric::decimal64, T> ||
-         std::is_same_v<numeric::decimal128, T>;
+         std::is_same_v<numeric::decimal128, T> ||
+         std::is_same_v<numeric::fixed_point<int32_t, numeric::Radix::BASE_2>, T> ||
+         std::is_same_v<numeric::fixed_point<int64_t, numeric::Radix::BASE_2>, T> ||
+         std::is_same_v<numeric::fixed_point<__int128_t, numeric::Radix::BASE_2>, T>;
 }
 
 /**
diff --git a/cpp/src/binaryop/compiled/binary_ops.cuh b/cpp/src/binaryop/compiled/binary_ops.cuh
index d605c877d3f..0bc144baa83 100644
--- a/cpp/src/binaryop/compiled/binary_ops.cuh
+++ b/cpp/src/binaryop/compiled/binary_ops.cuh
@@ -22,6 +22,7 @@
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_view.hpp>
 #include <cudf/detail/utilities/integer_utils.hpp>
+#include <cudf/unary.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
@@ -69,13 +70,17 @@ struct typed_casted_writer {
     if constexpr (mutable_column_device_view::has_element_accessor<Element>() and
                   std::is_constructible_v<Element, FromType>) {
       col.element<Element>(i) = static_cast<Element>(val);
-    } else if constexpr (is_fixed_point<Element>() and
-                         (is_fixed_point<FromType>() or
-                          std::is_constructible_v<Element, FromType>)) {
-      if constexpr (is_fixed_point<FromType>())
-        col.data<Element::rep>()[i] = val.rescaled(numeric::scale_type{col.type().scale()}).value();
-      else
-        col.data<Element::rep>()[i] = Element{val, numeric::scale_type{col.type().scale()}}.value();
+    } else if constexpr (is_fixed_point<Element>()) {
+      auto const scale = numeric::scale_type{col.type().scale()};
+      if constexpr (is_fixed_point<FromType>()) {
+        col.data<Element::rep>()[i] = val.rescaled(scale).value();
+      } else if constexpr (cuda::std::is_constructible_v<Element, FromType>) {
+        col.data<Element::rep>()[i] = Element{val, scale}.value();
+      } else if constexpr (cuda::std::is_floating_point_v<FromType>) {
+        col.data<Element::rep>()[i] = convert_floating_to_fixed<Element>(val, scale).value();
+      }
+    } else if constexpr (cuda::std::is_floating_point_v<Element> and is_fixed_point<FromType>()) {
+      col.data<Element>()[i] = convert_fixed_to_floating<Element>(val);
     }
   }
 };
diff --git a/cpp/src/quantiles/quantiles_util.hpp b/cpp/src/quantiles/quantiles_util.hpp
index 5efafdd0be6..47864c25c5f 100644
--- a/cpp/src/quantiles/quantiles_util.hpp
+++ b/cpp/src/quantiles/quantiles_util.hpp
@@ -16,6 +16,7 @@
 
 #include <cudf/detail/utilities/assert.cuh>
 #include <cudf/types.hpp>
+#include <cudf/unary.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/traits.hpp>
 
@@ -46,8 +47,8 @@ CUDF_HOST_DEVICE inline Result linear(T lhs, T rhs, double frac)
   // Underflow may occur when converting int64 to double
   // detail: https://github.com/rapidsai/cudf/issues/1417
 
-  auto dlhs             = static_cast<double>(lhs);
-  auto drhs             = static_cast<double>(rhs);
+  auto dlhs             = convert_to_floating<double>(lhs);
+  auto drhs             = convert_to_floating<double>(rhs);
   double one_minus_frac = 1.0 - frac;
   return static_cast<Result>(one_minus_frac * dlhs + frac * drhs);
 }
@@ -56,8 +57,8 @@ template <typename Result, typename T>
 CUDF_HOST_DEVICE inline Result midpoint(T lhs, T rhs)
 {
   // TODO: try std::midpoint (C++20) if available
-  auto dlhs = static_cast<double>(lhs);
-  auto drhs = static_cast<double>(rhs);
+  auto dlhs = convert_to_floating<double>(lhs);
+  auto drhs = convert_to_floating<double>(rhs);
   return static_cast<Result>(dlhs / 2 + drhs / 2);
 }
 
diff --git a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu
index 56e1bfbe003..8544d9caa56 100644
--- a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu
+++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu
@@ -28,6 +28,7 @@
 #include <cudf/detail/tdigest/tdigest.hpp>
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/lists/lists_column_view.hpp>
+#include <cudf/unary.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/device_uvector.hpp>
@@ -73,7 +74,7 @@ struct make_centroid {
   centroid operator() __device__(size_type index) const
   {
     auto const is_valid = col.is_valid(index);
-    auto const mean     = is_valid ? static_cast<double>(col.element<T>(index)) : 0.0;
+    auto const mean     = is_valid ? convert_to_floating<double>(col.element<T>(index)) : 0.0;
     auto const weight   = is_valid ? 1.0 : 0.0;
     return {mean, weight, is_valid};
   }
@@ -87,7 +88,7 @@ struct make_centroid_no_nulls {
 
   centroid operator() __device__(size_type index) const
   {
-    return {static_cast<double>(col.element<T>(index)), 1.0, true};
+    return {convert_to_floating<double>(col.element<T>(index)), 1.0, true};
   }
 };
 
@@ -808,8 +809,9 @@ struct get_scalar_minmax_grouped {
     auto const valid_count = group_valid_counts[group_index];
     return valid_count > 0
              ? thrust::make_tuple(
-                 static_cast<double>(col.element<T>(group_offsets[group_index])),
-                 static_cast<double>(col.element<T>(group_offsets[group_index] + valid_count - 1)))
+                 convert_to_floating<double>(col.element<T>(group_offsets[group_index])),
+                 convert_to_floating<double>(
+                   col.element<T>(group_offsets[group_index] + valid_count - 1)))
              : thrust::make_tuple(0.0, 0.0);
   }
 };
@@ -823,8 +825,8 @@ struct get_scalar_minmax {
   __device__ thrust::tuple<double, double> operator()(size_type)
   {
     return valid_count > 0
-             ? thrust::make_tuple(static_cast<double>(col.element<T>(0)),
-                                  static_cast<double>(col.element<T>(valid_count - 1)))
+             ? thrust::make_tuple(convert_to_floating<double>(col.element<T>(0)),
+                                  convert_to_floating<double>(col.element<T>(valid_count - 1)))
              : thrust::make_tuple(0.0, 0.0);
   }
 };
diff --git a/cpp/src/unary/cast_ops.cu b/cpp/src/unary/cast_ops.cu
index 47a0cb393aa..b6c9b3caa20 100644
--- a/cpp/src/unary/cast_ops.cu
+++ b/cpp/src/unary/cast_ops.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -116,8 +116,12 @@ struct fixed_point_unary_cast {
     std::enable_if_t<(cudf::is_fixed_point<_SourceT>() && cudf::is_numeric<TargetT>())>* = nullptr>
   __device__ inline TargetT operator()(DeviceT const element)
   {
-    auto const fp = SourceT{numeric::scaled_integer<DeviceT>{element, scale}};
-    return static_cast<TargetT>(fp);
+    auto const fixed_point = SourceT{numeric::scaled_integer<DeviceT>{element, scale}};
+    if constexpr (cuda::std::is_floating_point_v<TargetT>) {
+      return convert_fixed_to_floating<TargetT>(fixed_point);
+    } else {
+      return static_cast<TargetT>(fixed_point);
+    }
   }
 
   template <
@@ -126,7 +130,11 @@ struct fixed_point_unary_cast {
     std::enable_if_t<(cudf::is_numeric<_SourceT>() && cudf::is_fixed_point<TargetT>())>* = nullptr>
   __device__ inline DeviceT operator()(SourceT const element)
   {
-    return TargetT{element, scale}.value();
+    if constexpr (cuda::std::is_floating_point_v<SourceT>) {
+      return convert_floating_to_fixed<TargetT>(element, scale).value();
+    } else {
+      return TargetT{element, scale}.value();
+    }
   }
 };
 
diff --git a/cpp/tests/fixed_point/fixed_point_tests.cpp b/cpp/tests/fixed_point/fixed_point_tests.cpp
index 1c1680fcd6e..73de1fbaa68 100644
--- a/cpp/tests/fixed_point/fixed_point_tests.cpp
+++ b/cpp/tests/fixed_point/fixed_point_tests.cpp
@@ -23,6 +23,7 @@
 #include <cudf/binaryop.hpp>
 #include <cudf/fixed_point/fixed_point.hpp>
 #include <cudf/null_mask.hpp>
+#include <cudf/unary.hpp>
 
 #include <algorithm>
 #include <limits>
@@ -45,67 +46,71 @@ TYPED_TEST(FixedPointTestAllReps, SimpleDecimalXXConstruction)
 {
   using decimalXX = fixed_point<TypeParam, Radix::BASE_10>;
 
-  decimalXX num0{1.234567, scale_type{0}};
-  decimalXX num1{1.234567, scale_type{-1}};
-  decimalXX num2{1.234567, scale_type{-2}};
-  decimalXX num3{1.234567, scale_type{-3}};
-  decimalXX num4{1.234567, scale_type{-4}};
-  decimalXX num5{1.234567, scale_type{-5}};
-  decimalXX num6{1.234567, scale_type{-6}};
-
-  EXPECT_EQ(1, static_cast<double>(num0));
-  EXPECT_EQ(1.2, static_cast<double>(num1));
-  EXPECT_EQ(1.23, static_cast<double>(num2));
-  EXPECT_EQ(1.234, static_cast<double>(num3));
-  EXPECT_EQ(1.2345, static_cast<double>(num4));
-  EXPECT_EQ(1.23456, static_cast<double>(num5));
-  EXPECT_EQ(1.234567, static_cast<double>(num6));
+  auto num0 = cudf::convert_floating_to_fixed<decimalXX>(1.234567, scale_type(0));
+  auto num1 = cudf::convert_floating_to_fixed<decimalXX>(1.234567, scale_type(-1));
+  auto num2 = cudf::convert_floating_to_fixed<decimalXX>(1.234567, scale_type(-2));
+  auto num3 = cudf::convert_floating_to_fixed<decimalXX>(1.234567, scale_type(-3));
+  auto num4 = cudf::convert_floating_to_fixed<decimalXX>(1.234567, scale_type(-4));
+  auto num5 = cudf::convert_floating_to_fixed<decimalXX>(1.234567, scale_type(-5));
+  auto num6 = cudf::convert_floating_to_fixed<decimalXX>(1.234567, scale_type(-6));
+
+  EXPECT_EQ(1, cudf::convert_fixed_to_floating<double>(num0));
+  EXPECT_EQ(1.2, cudf::convert_fixed_to_floating<double>(num1));
+  EXPECT_EQ(1.23, cudf::convert_fixed_to_floating<double>(num2));
+  EXPECT_EQ(1.234, cudf::convert_fixed_to_floating<double>(num3));
+  EXPECT_EQ(1.2345, cudf::convert_fixed_to_floating<double>(num4));
+  EXPECT_EQ(1.23456, cudf::convert_fixed_to_floating<double>(num5));
+  EXPECT_EQ(1.234567, cudf::convert_fixed_to_floating<double>(num6));
 }
 
 TYPED_TEST(FixedPointTestAllReps, SimpleNegativeDecimalXXConstruction)
 {
   using decimalXX = fixed_point<TypeParam, Radix::BASE_10>;
 
-  decimalXX num0{-1.234567, scale_type{0}};
-  decimalXX num1{-1.234567, scale_type{-1}};
-  decimalXX num2{-1.234567, scale_type{-2}};
-  decimalXX num3{-1.234567, scale_type{-3}};
-  decimalXX num4{-1.234567, scale_type{-4}};
-  decimalXX num5{-1.234567, scale_type{-5}};
-  decimalXX num6{-1.234567, scale_type{-6}};
-
-  EXPECT_EQ(-1, static_cast<double>(num0));
-  EXPECT_EQ(-1.2, static_cast<double>(num1));
-  EXPECT_EQ(-1.23, static_cast<double>(num2));
-  EXPECT_EQ(-1.234, static_cast<double>(num3));
-  EXPECT_EQ(-1.2345, static_cast<double>(num4));
-  EXPECT_EQ(-1.23456, static_cast<double>(num5));
-  EXPECT_EQ(-1.234567, static_cast<double>(num6));
+  auto num0 = cudf::convert_floating_to_fixed<decimalXX>(-1.234567, scale_type(0));
+  auto num1 = cudf::convert_floating_to_fixed<decimalXX>(-1.234567, scale_type(-1));
+  auto num2 = cudf::convert_floating_to_fixed<decimalXX>(-1.234567, scale_type(-2));
+  auto num3 = cudf::convert_floating_to_fixed<decimalXX>(-1.234567, scale_type(-3));
+  auto num4 = cudf::convert_floating_to_fixed<decimalXX>(-1.234567, scale_type(-4));
+  auto num5 = cudf::convert_floating_to_fixed<decimalXX>(-1.234567, scale_type(-5));
+  auto num6 = cudf::convert_floating_to_fixed<decimalXX>(-1.234567, scale_type(-6));
+
+  EXPECT_EQ(-1, cudf::convert_fixed_to_floating<double>(num0));
+  EXPECT_EQ(-1.2, cudf::convert_fixed_to_floating<double>(num1));
+  EXPECT_EQ(-1.23, cudf::convert_fixed_to_floating<double>(num2));
+  EXPECT_EQ(-1.234, cudf::convert_fixed_to_floating<double>(num3));
+  EXPECT_EQ(-1.2345, cudf::convert_fixed_to_floating<double>(num4));
+  EXPECT_EQ(-1.23456, cudf::convert_fixed_to_floating<double>(num5));
+  EXPECT_EQ(-1.234567, cudf::convert_fixed_to_floating<double>(num6));
 }
 
 TYPED_TEST(FixedPointTestAllReps, PaddedDecimalXXConstruction)
 {
   using decimalXX = fixed_point<TypeParam, Radix::BASE_10>;
 
-  decimalXX a{1.1, scale_type{-1}};
-  decimalXX b{1.01, scale_type{-2}};
-  decimalXX c{1.001, scale_type{-3}};
-  decimalXX d{1.0001, scale_type{-4}};
-  decimalXX e{1.00001, scale_type{-5}};
-  decimalXX f{1.000001, scale_type{-6}};
-
-  decimalXX x{1.000123, scale_type{-8}};
-  decimalXX y{0.000123, scale_type{-8}};
-
-  EXPECT_EQ(1.1, static_cast<double>(a));
-  EXPECT_EQ(1.01, static_cast<double>(b));
-  EXPECT_EQ(1, static_cast<double>(c));  // intentional (inherited problem from floating point)
-  EXPECT_EQ(1.0001, static_cast<double>(d));
-  EXPECT_EQ(1.00001, static_cast<double>(e));
-  EXPECT_EQ(1, static_cast<double>(f));  // intentional (inherited problem from floating point)
-
-  EXPECT_TRUE(1.000123 - static_cast<double>(x) < std::numeric_limits<double>::epsilon());
-  EXPECT_EQ(0.000123, static_cast<double>(y));
+  auto a = cudf::convert_floating_to_fixed<decimalXX>(1.1, scale_type(-1));
+  auto b = cudf::convert_floating_to_fixed<decimalXX>(1.01, scale_type(-2));
+  auto c = cudf::convert_floating_to_fixed<decimalXX>(1.001, scale_type(-3));
+  auto d = cudf::convert_floating_to_fixed<decimalXX>(1.0001, scale_type(-4));
+  auto e = cudf::convert_floating_to_fixed<decimalXX>(1.00001, scale_type(-5));
+  auto f = cudf::convert_floating_to_fixed<decimalXX>(1.000001, scale_type(-6));
+  auto x = cudf::convert_floating_to_fixed<decimalXX>(1.000123, scale_type(-8));
+  auto y = cudf::convert_floating_to_fixed<decimalXX>(0.000123, scale_type(-8));
+
+  EXPECT_EQ(1.1, cudf::convert_fixed_to_floating<double>(a));
+  EXPECT_EQ(1.01, cudf::convert_fixed_to_floating<double>(b));
+  EXPECT_EQ(1,
+            cudf::convert_fixed_to_floating<double>(
+              c));  // intentional (inherited problem from floating point)
+  EXPECT_EQ(1.0001, cudf::convert_fixed_to_floating<double>(d));
+  EXPECT_EQ(1.00001, cudf::convert_fixed_to_floating<double>(e));
+  EXPECT_EQ(1,
+            cudf::convert_fixed_to_floating<double>(
+              f));  // intentional (inherited problem from floating point)
+
+  EXPECT_TRUE(1.000123 - cudf::convert_fixed_to_floating<double>(x) <
+              std::numeric_limits<double>::epsilon());
+  EXPECT_EQ(0.000123, cudf::convert_fixed_to_floating<double>(y));
 }
 
 TYPED_TEST(FixedPointTestAllReps, SimpleBinaryFPConstruction)
@@ -118,34 +123,34 @@ TYPED_TEST(FixedPointTestAllReps, SimpleBinaryFPConstruction)
   binary_fp num3{10, scale_type{3}};
   binary_fp num4{10, scale_type{4}};
 
-  binary_fp num5{1.24, scale_type{0}};
-  binary_fp num6{1.24, scale_type{-1}};
-  binary_fp num7{1.32, scale_type{-2}};
-  binary_fp num8{1.41, scale_type{-3}};
-  binary_fp num9{1.45, scale_type{-4}};
-
-  EXPECT_EQ(10, static_cast<double>(num0));
-  EXPECT_EQ(10, static_cast<double>(num1));
-  EXPECT_EQ(8, static_cast<double>(num2));
-  EXPECT_EQ(8, static_cast<double>(num3));
-  EXPECT_EQ(0, static_cast<double>(num4));
-
-  EXPECT_EQ(1, static_cast<double>(num5));
-  EXPECT_EQ(1, static_cast<double>(num6));
-  EXPECT_EQ(1.25, static_cast<double>(num7));
-  EXPECT_EQ(1.375, static_cast<double>(num8));
-  EXPECT_EQ(1.4375, static_cast<double>(num9));
+  auto num5 = cudf::convert_floating_to_fixed<binary_fp>(1.24, scale_type(0));
+  auto num6 = cudf::convert_floating_to_fixed<binary_fp>(1.24, scale_type(-1));
+  auto num7 = cudf::convert_floating_to_fixed<binary_fp>(1.32, scale_type(-2));
+  auto num8 = cudf::convert_floating_to_fixed<binary_fp>(1.41, scale_type(-3));
+  auto num9 = cudf::convert_floating_to_fixed<binary_fp>(1.45, scale_type(-4));
+
+  EXPECT_EQ(10, cudf::convert_fixed_to_floating<double>(num0));
+  EXPECT_EQ(10, cudf::convert_fixed_to_floating<double>(num1));
+  EXPECT_EQ(8, cudf::convert_fixed_to_floating<double>(num2));
+  EXPECT_EQ(8, cudf::convert_fixed_to_floating<double>(num3));
+  EXPECT_EQ(0, cudf::convert_fixed_to_floating<double>(num4));
+
+  EXPECT_EQ(1, cudf::convert_fixed_to_floating<double>(num5));
+  EXPECT_EQ(1, cudf::convert_fixed_to_floating<double>(num6));
+  EXPECT_EQ(1.25, cudf::convert_fixed_to_floating<double>(num7));
+  EXPECT_EQ(1.375, cudf::convert_fixed_to_floating<double>(num8));
+  EXPECT_EQ(1.4375, cudf::convert_fixed_to_floating<double>(num9));
 }
 
 TYPED_TEST(FixedPointTestAllReps, MoreSimpleBinaryFPConstruction)
 {
   using binary_fp = fixed_point<TypeParam, Radix::BASE_2>;
 
-  binary_fp num0{1.25, scale_type{-2}};
-  binary_fp num1{2.1, scale_type{-4}};
+  auto num0 = cudf::convert_floating_to_fixed<binary_fp>(1.25, scale_type(-2));
+  auto num1 = cudf::convert_floating_to_fixed<binary_fp>(2.1, scale_type(-4));
 
-  EXPECT_EQ(1.25, static_cast<double>(num0));
-  EXPECT_EQ(2.0625, static_cast<double>(num1));
+  EXPECT_EQ(1.25, cudf::convert_fixed_to_floating<double>(num0));
+  EXPECT_EQ(2.0625, cudf::convert_fixed_to_floating<double>(num1));
 }
 
 TYPED_TEST(FixedPointTestAllReps, SimpleDecimalXXMath)
@@ -166,7 +171,7 @@ TYPED_TEST(FixedPointTestAllReps, SimpleDecimalXXMath)
   EXPECT_EQ(TWO / ONE, TWO);
   EXPECT_EQ(SIX / TWO, THREE);
 
-  decimalXX a{1.23, scale_type{-2}};
+  auto a = cudf::convert_floating_to_fixed<decimalXX>(1.23, scale_type(-2));
   decimalXX b{0, scale_type{0}};
 
   EXPECT_EQ(a + b, a);
@@ -211,8 +216,8 @@ TYPED_TEST(FixedPointTestAllReps, DecimalXXTrickyDivision)
   EXPECT_EQ(SIXTY_1 / TEN_0, ONE_1);
   EXPECT_EQ(SIXTY_1 / TEN_1, SIX_0);
 
-  decimalXX A{34.56, scale_type{-2}};
-  decimalXX B{1.234, scale_type{-3}};
+  auto A = cudf::convert_floating_to_fixed<decimalXX>(34.56, scale_type(-2));
+  auto B = cudf::convert_floating_to_fixed<decimalXX>(1.234, scale_type(-3));
   decimalXX C{1, scale_type{-2}};
 
   EXPECT_EQ(static_cast<int32_t>(A / B), 20);
@@ -255,17 +260,17 @@ TYPED_TEST(FixedPointTestAllReps, ArithmeticWithDifferentScales)
   using decimalXX = fixed_point<TypeParam, Radix::BASE_10>;
 
   decimalXX a{1, scale_type{0}};
-  decimalXX b{1.2, scale_type{-1}};
-  decimalXX c{1.23, scale_type{-2}};
-  decimalXX d{1.111, scale_type{-3}};
+  auto b = cudf::convert_floating_to_fixed<decimalXX>(1.2, scale_type(-1));
+  auto c = cudf::convert_floating_to_fixed<decimalXX>(1.23, scale_type(-2));
+  auto d = cudf::convert_floating_to_fixed<decimalXX>(1.111, scale_type(-3));
 
-  decimalXX x{2.2, scale_type{-1}};
-  decimalXX y{3.43, scale_type{-2}};
-  decimalXX z{4.541, scale_type{-3}};
+  auto x = cudf::convert_floating_to_fixed<decimalXX>(2.2, scale_type(-1));
+  auto y = cudf::convert_floating_to_fixed<decimalXX>(3.43, scale_type(-2));
+  auto z = cudf::convert_floating_to_fixed<decimalXX>(4.541, scale_type(-3));
 
-  decimalXX xx{0.2, scale_type{-1}};
-  decimalXX yy{0.03, scale_type{-2}};
-  decimalXX zz{0.119, scale_type{-3}};
+  auto xx = cudf::convert_floating_to_fixed<decimalXX>(0.2, scale_type(-1));
+  auto yy = cudf::convert_floating_to_fixed<decimalXX>(0.03, scale_type(-2));
+  auto zz = cudf::convert_floating_to_fixed<decimalXX>(0.119, scale_type(-3));
 
   EXPECT_EQ(a + b, x);
   EXPECT_EQ(a + b + c, y);
@@ -280,12 +285,12 @@ TYPED_TEST(FixedPointTestAllReps, RescaledTest)
   using decimalXX = fixed_point<TypeParam, Radix::BASE_10>;
 
   decimalXX num0{1, scale_type{0}};
-  decimalXX num1{1.2, scale_type{-1}};
-  decimalXX num2{1.23, scale_type{-2}};
-  decimalXX num3{1.234, scale_type{-3}};
-  decimalXX num4{1.2345, scale_type{-4}};
-  decimalXX num5{1.23456, scale_type{-5}};
-  decimalXX num6{1.234567, scale_type{-6}};
+  auto num1 = cudf::convert_floating_to_fixed<decimalXX>(1.2, scale_type(-1));
+  auto num2 = cudf::convert_floating_to_fixed<decimalXX>(1.23, scale_type(-2));
+  auto num3 = cudf::convert_floating_to_fixed<decimalXX>(1.234, scale_type(-3));
+  auto num4 = cudf::convert_floating_to_fixed<decimalXX>(1.2345, scale_type(-4));
+  auto num5 = cudf::convert_floating_to_fixed<decimalXX>(1.23456, scale_type(-5));
+  auto num6 = cudf::convert_floating_to_fixed<decimalXX>(1.234567, scale_type(-6));
 
   EXPECT_EQ(num0, num6.rescaled(scale_type{0}));
   EXPECT_EQ(num1, num6.rescaled(scale_type{-1}));
@@ -314,7 +319,7 @@ TYPED_TEST(FixedPointTestAllReps, BoolConversion)
 {
   using decimalXX = fixed_point<TypeParam, Radix::BASE_10>;
 
-  decimalXX truthy_value{1.234567, scale_type{0}};
+  auto truthy_value = cudf::convert_floating_to_fixed<decimalXX>(1.234567, scale_type(0));
   decimalXX falsy_value{0, scale_type{0}};
 
   // Test explicit conversions
@@ -442,12 +447,14 @@ void float_vector_test(ValueType const initial_value,
   std::vector<decimal32> vec1(size);
   std::vector<ValueType> vec2(size);
 
-  std::iota(std::begin(vec1), std::end(vec1), decimal32{initial_value, scale_type{scale}});
+  auto decimal_input = cudf::convert_floating_to_fixed<decimal32>(initial_value, scale_type{scale});
+  std::iota(std::begin(vec1), std::end(vec1), decimal_input);
   std::iota(std::begin(vec2), std::end(vec2), initial_value);
 
   auto equal = std::equal(
     std::cbegin(vec1), std::cend(vec1), std::cbegin(vec2), [](auto const& a, auto const& b) {
-      return static_cast<double>(a) - b <= std::numeric_limits<ValueType>::epsilon();
+      return cudf::convert_fixed_to_floating<double>(a) - b <=
+             std::numeric_limits<ValueType>::epsilon();
     });
 
   EXPECT_TRUE(equal);
diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp
index e108e68e1f9..a544a812efb 100644
--- a/cpp/tests/io/orc_test.cpp
+++ b/cpp/tests/io/orc_test.cpp
@@ -548,7 +548,7 @@ TEST_F(OrcWriterTest, SlicedTable)
   int32_col col0(seq_col0.begin(), seq_col0.end());
   str_col col1(strings.begin(), strings.end());
   float32_col col2(seq_col2.begin(), seq_col2.end());
-  float32_col col3(seq_col3, seq_col3 + num_rows);
+  dec64_col col3(seq_col3, seq_col3 + num_rows);
 
   list_col<int64_t> col4{
     {9, 8}, {7, 6, 5}, {}, {4}, {3, 2, 1, 0}, {20, 21, 22, 23, 24}, {}, {66, 666}};

From af33b0aba4dafe82cb5d25811e5e737af6c7faad Mon Sep 17 00:00:00 2001
From: Robert Maynard <rmaynard@nvidia.com>
Date: Thu, 11 Apr 2024 16:13:09 -0400
Subject: [PATCH 044/842] nanoarrow uses package override for proper pinned
 versions generation (#15515)

The usage of `PATCH_COMMAND` with `rapids_cpm_find` isn't capturable by `+rapids_cpm_generate_pinned_versions`. So we use a nanoarrow json override file to hold the patch we need applied and the custom SHA1 to check out.

Authors:
  - Robert Maynard (https://github.com/robertmaynard)

Approvers:
  - Jason Lowe (https://github.com/jlowe)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15515
---
 cpp/cmake/thirdparty/get_nanoarrow.cmake      | 36 ++++---------------
 .../patches/nanoarrow_override.json           | 18 ++++++++++
 2 files changed, 24 insertions(+), 30 deletions(-)
 create mode 100644 cpp/cmake/thirdparty/patches/nanoarrow_override.json

diff --git a/cpp/cmake/thirdparty/get_nanoarrow.cmake b/cpp/cmake/thirdparty/get_nanoarrow.cmake
index 884e5a2f368..dc0b8d09746 100644
--- a/cpp/cmake/thirdparty/get_nanoarrow.cmake
+++ b/cpp/cmake/thirdparty/get_nanoarrow.cmake
@@ -14,44 +14,20 @@
 
 # This function finds nanoarrow and sets any additional necessary environment variables.
 function(find_and_configure_nanoarrow)
-  set(oneValueArgs VERSION FORK PINNED_TAG)
-  cmake_parse_arguments(PKG "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  include(${rapids-cmake-dir}/cpm/package_override.cmake)
 
-  # Only run if PKG_VERSION is < 0.5.0
-  if(PKG_VERSION VERSION_LESS 0.5.0)
-    set(patch_files_to_run "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/patches/nanoarrow_cmake.diff")
-    set(patch_issues_to_ref
-        "Fix issues with nanoarrow CMake [https://github.com/apache/arrow-nanoarrow/pull/406]"
-    )
-    set(patch_script "${CMAKE_BINARY_DIR}/rapids-cmake/patches/nanoarrow/patch.cmake")
-    set(log_file "${CMAKE_BINARY_DIR}/rapids-cmake/patches/nanoarrow/log")
-    string(TIMESTAMP current_year "%Y" UTC)
-    configure_file(
-      ${rapids-cmake-dir}/cpm/patches/command_template.cmake.in "${patch_script}" @ONLY
-    )
-  else()
-    message(
-      FATAL_ERROR
-        "Nanoarrow version ${PKG_VERSION} already contains the necessary patch. Please remove this patch from cudf."
-    )
-  endif()
+  set(cudf_patch_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/patches")
+  rapids_cpm_package_override("${cudf_patch_dir}/nanoarrow_override.json")
 
+  # The git_repo and git_tag are provided by the nanoarrow_override file
   rapids_cpm_find(
-    nanoarrow ${PKG_VERSION}
+    nanoarrow 0.4.0
     GLOBAL_TARGETS nanoarrow
     CPM_ARGS
-    GIT_REPOSITORY https://github.com/${PKG_FORK}/arrow-nanoarrow.git
-    GIT_TAG ${PKG_PINNED_TAG}
-    # TODO: Commit hashes are not supported with shallow clones. Can switch this if and when we pin
-    # to an actual tag.
-    GIT_SHALLOW FALSE
-    PATCH_COMMAND ${CMAKE_COMMAND} -P ${patch_script}
     OPTIONS "BUILD_SHARED_LIBS OFF" "NANOARROW_NAMESPACE cudf"
   )
   set_target_properties(nanoarrow PROPERTIES POSITION_INDEPENDENT_CODE ON)
   rapids_export_find_package_root(BUILD nanoarrow "${nanoarrow_BINARY_DIR}" EXPORT_SET cudf-exports)
 endfunction()
 
-find_and_configure_nanoarrow(
-  VERSION 0.4.0 FORK apache PINNED_TAG c97720003ff863b81805bcdb9f7c91306ab6b6a8
-)
+find_and_configure_nanoarrow()
diff --git a/cpp/cmake/thirdparty/patches/nanoarrow_override.json b/cpp/cmake/thirdparty/patches/nanoarrow_override.json
new file mode 100644
index 00000000000..0b83d1808cb
--- /dev/null
+++ b/cpp/cmake/thirdparty/patches/nanoarrow_override.json
@@ -0,0 +1,18 @@
+
+{
+  "packages" : {
+    "nanoarrow" : {
+      "version" : "0.4.0",
+      "git_url" : "https://github.com/apache/arrow-nanoarrow.git",
+      "git_tag" : "c97720003ff863b81805bcdb9f7c91306ab6b6a8",
+      "git_shallow" : false,
+      "patches" : [
+        {
+          "file" : "${current_json_dir}/nanoarrow_cmake.diff",
+          "issue" : "Fix add support for global setup to initialize RMM in nvbench [https://github.com/NVIDIA/nvbench/pull/123]",
+          "fixed_in" : "0.5.0"
+        }
+      ]
+    }
+  }
+}

From 8506ea6dd12cd1bde91550366d846737bc7fdb7c Mon Sep 17 00:00:00 2001
From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com>
Date: Thu, 11 Apr 2024 18:07:22 -0500
Subject: [PATCH 045/842] Migrate string `case` operations to `pylibcudf`
 (#15489)

This PR creates `pylibcudf` `case` APIs and migrates the cuDF cython to leverage them. Part of https://github.com/rapidsai/cudf/issues/15162.

Authors:
  - https://github.com/brandon-b-miller
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15489
---
 .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt   |  2 +
 python/cudf/cudf/_lib/pylibcudf/__init__.pxd  |  2 +
 python/cudf/cudf/_lib/pylibcudf/__init__.py   |  2 +
 .../_lib/pylibcudf/strings/CMakeLists.txt     | 21 ++++++++
 .../cudf/_lib/pylibcudf/strings/__init__.pxd  |  3 ++
 .../cudf/_lib/pylibcudf/strings/__init__.py   |  3 ++
 .../cudf/cudf/_lib/pylibcudf/strings/case.pxd |  8 +++
 .../cudf/cudf/_lib/pylibcudf/strings/case.pyx | 30 +++++++++++
 python/cudf/cudf/_lib/strings/case.pyx        | 50 +++++++------------
 .../cudf/pylibcudf_tests/test_string_case.py  | 35 +++++++++++++
 10 files changed, 124 insertions(+), 32 deletions(-)
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/__init__.py
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/case.pxd
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/case.pyx
 create mode 100644 python/cudf/cudf/pylibcudf_tests/test_string_case.py

diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
index 81d15cf95b4..c2b7cb7ca3d 100644
--- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
@@ -44,3 +44,5 @@ rapids_cython_create_modules(
   LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_ ASSOCIATED_TARGETS cudf
 )
 link_to_pyarrow_headers(pylibcudf_interop)
+
+add_subdirectory(strings)
diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd
index 48c23a9dd4c..5adefa5fd93 100644
--- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd
@@ -17,6 +17,7 @@ from . cimport (
     search,
     sorting,
     stream_compaction,
+    strings,
     types,
     unary,
 )
@@ -48,6 +49,7 @@ __all__ = [
     "rolling",
     "search",
     "stream_compaction",
+    "strings",
     "sorting",
     "types",
     "unary",
diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py
index 8ccb0ecc341..89f874f5fa5 100644
--- a/python/cudf/cudf/_lib/pylibcudf/__init__.py
+++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py
@@ -17,6 +17,7 @@
     search,
     sorting,
     stream_compaction,
+    strings,
     types,
     unary,
 )
@@ -48,6 +49,7 @@
     "rolling",
     "search",
     "stream_compaction",
+    "strings",
     "sorting",
     "types",
     "unary",
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt
new file mode 100644
index 00000000000..3a2a9e1e7eb
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt
@@ -0,0 +1,21 @@
+# =============================================================================
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+# =============================================================================
+
+set(cython_sources case.pyx)
+set(linked_libraries cudf::cudf)
+rapids_cython_create_modules(
+  CXX
+  SOURCE_FILES "${cython_sources}"
+  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_ ASSOCIATED_TARGETS cudf
+)
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd
new file mode 100644
index 00000000000..ff87549b5b5
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd
@@ -0,0 +1,3 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from . import case
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py
new file mode 100644
index 00000000000..ff87549b5b5
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py
@@ -0,0 +1,3 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from . import case
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/case.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/case.pxd
new file mode 100644
index 00000000000..225d566fe06
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/strings/case.pxd
@@ -0,0 +1,8 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from cudf._lib.pylibcudf.column cimport Column
+
+
+cpdef Column to_lower(Column input)
+cpdef Column to_upper(Column input)
+cpdef Column swapcase(Column input)
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/case.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/case.pyx
new file mode 100644
index 00000000000..69910fd8c50
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/strings/case.pyx
@@ -0,0 +1,30 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+
+from cudf._lib.cpp.column.column cimport column
+from cudf._lib.cpp.strings cimport case as cpp_case
+from cudf._lib.pylibcudf.column cimport Column
+
+
+cpdef Column to_lower(Column input):
+    cdef unique_ptr[column] c_result
+    with nogil:
+        c_result = cpp_case.to_lower(input.view())
+
+    return Column.from_libcudf(move(c_result))
+
+cpdef Column to_upper(Column input):
+    cdef unique_ptr[column] c_result
+    with nogil:
+        c_result = cpp_case.to_upper(input.view())
+
+    return Column.from_libcudf(move(c_result))
+
+cpdef Column swapcase(Column input):
+    cdef unique_ptr[column] c_result
+    with nogil:
+        c_result = cpp_case.swapcase(input.view())
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/cudf/cudf/_lib/strings/case.pyx b/python/cudf/cudf/_lib/strings/case.pyx
index 09af1178946..38f242a67d6 100644
--- a/python/cudf/cudf/_lib/strings/case.pyx
+++ b/python/cudf/cudf/_lib/strings/case.pyx
@@ -1,48 +1,34 @@
-# Copyright (c) 2018-2022, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
 
 from cudf.core.buffer import acquire_spill_lock
 
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
-
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.strings.case cimport (
-    swapcase as cpp_swapcase,
-    to_lower as cpp_to_lower,
-    to_upper as cpp_to_upper,
-)
+
+from cudf._lib.pylibcudf.strings import case
 
 
 @acquire_spill_lock()
 def to_upper(Column source_strings):
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    with nogil:
-        c_result = move(cpp_to_upper(source_view))
-
-    return Column.from_unique_ptr(move(c_result))
+    return Column.from_pylibcudf(
+            case.to_upper(
+                source_strings.to_pylibcudf(mode='read')
+            )
+    )
 
 
 @acquire_spill_lock()
 def to_lower(Column source_strings):
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    with nogil:
-        c_result = move(cpp_to_lower(source_view))
-
-    return Column.from_unique_ptr(move(c_result))
+    return Column.from_pylibcudf(
+            case.to_lower(
+                source_strings.to_pylibcudf(mode='read')
+            )
+    )
 
 
 @acquire_spill_lock()
 def swapcase(Column source_strings):
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    with nogil:
-        c_result = move(cpp_swapcase(source_view))
-
-    return Column.from_unique_ptr(move(c_result))
+    return Column.from_pylibcudf(
+            case.swapcase(
+                source_strings.to_pylibcudf(mode='read')
+            )
+    )
diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_case.py b/python/cudf/cudf/pylibcudf_tests/test_string_case.py
new file mode 100644
index 00000000000..ae01d953df5
--- /dev/null
+++ b/python/cudf/cudf/pylibcudf_tests/test_string_case.py
@@ -0,0 +1,35 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pytest
+from utils import assert_column_eq
+
+import cudf._lib.pylibcudf as plc
+
+
+@pytest.fixture(scope="module")
+def string_col():
+    return pa.array(
+        ["AbC", "de", "FGHI", "j", "kLm", "nOPq", None, "RsT", None, "uVw"]
+    )
+
+
+def test_to_upper(string_col):
+    plc_col = plc.interop.from_arrow(string_col)
+    got = plc.strings.case.to_upper(plc_col)
+    expected = pa.compute.utf8_upper(string_col)
+    assert_column_eq(got, expected)
+
+
+def test_to_lower(string_col):
+    plc_col = plc.interop.from_arrow(string_col)
+    got = plc.strings.case.to_lower(plc_col)
+    expected = pa.compute.utf8_lower(string_col)
+    assert_column_eq(got, expected)
+
+
+def test_swapcase(string_col):
+    plc_col = plc.interop.from_arrow(string_col)
+    got = plc.strings.case.swapcase(plc_col)
+    expected = pa.compute.utf8_swapcase(string_col)
+    assert_column_eq(got, expected)

From ff22a7ac0d565be2b2221c6080966eb0338676ee Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <3190405+shwina@users.noreply.github.com>
Date: Thu, 11 Apr 2024 21:01:40 -0400
Subject: [PATCH 046/842] Fix and clarify notes on result ordering (#13255)

I noticed when answering #13254 that the code example in this section of our documentation was incorrect and the text itself could use some improving.

Authors:
  - Ashwin Srinath (https://github.com/shwina)
  - Lawrence Mitchell (https://github.com/wence-)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/13255
---
 .../source/user_guide/pandas-comparison.md    | 27 +++++++++++++------
 1 file changed, 19 insertions(+), 8 deletions(-)

diff --git a/docs/cudf/source/user_guide/pandas-comparison.md b/docs/cudf/source/user_guide/pandas-comparison.md
index 549d91b771a..4aaaa8a93df 100644
--- a/docs/cudf/source/user_guide/pandas-comparison.md
+++ b/docs/cudf/source/user_guide/pandas-comparison.md
@@ -87,9 +87,17 @@ using `.from_arrow()` or `.from_pandas()`.
 
 ## Result ordering
 
-By default, `join` (or `merge`), `value_counts` and `groupby` operations in cuDF
-do *not* guarantee output ordering.
-Compare the results obtained from Pandas and cuDF below:
+In Pandas, `join` (or `merge`), `value_counts` and `groupby` operations provide
+certain guarantees about the order of rows in the result returned.  In a Pandas
+`join`, the order of join keys is (depending on the particular style of join
+being performed) either preserved or sorted lexicographically by default.
+`groupby` sorts the group keys, and preserves the order of rows within each
+group. In some cases, disabling this option in Pandas can yield better
+performance.
+
+By contrast, cuDF's default behavior is to return rows in a
+non-deterministic order to maximize performance.  Compare the results
+obtained from Pandas and cuDF below:
 
 ```{code} python
 >>> import cupy as cp
@@ -114,13 +122,16 @@ a
 4  342.000000
 ```
 
-To match Pandas behavior, you must explicitly pass `sort=True`
-or enable the `mode.pandas_compatible` option when trying to
-match Pandas behavior with `sort=False`:
+In most cases, the rows of a DataFrame are accessed by index labels
+rather than by position, so the order in which rows are returned
+doesn't matter. However, if you require that results be returned in a
+predictable (sorted) order, you can pass the `sort=True` option
+explicitly or enable the `mode.pandas_compatible` option when trying
+to match Pandas behavior with `sort=False`:
 
 ```{code} python
->>> df.to_pandas().groupby("a", sort=True).mean().head()
-            b
+>>> df.groupby("a", sort=True).mean().head()
+         b
 a
 0   70.000000
 1  356.333333

From f19d4eb9f2ccbe1833aa8112c053e622bc138301 Mon Sep 17 00:00:00 2001
From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com>
Date: Thu, 11 Apr 2024 23:07:23 -0500
Subject: [PATCH 047/842] Fix async synchronization issues in json_column.cu
 (#15497)

Fixes #15390
This change fixes async synchronization issues in json_column.cu.
Related file json_tree.cu does not have async synchronization issues.

Summary of changes:
changed debug print async to sync,
added synchronize after multiple async calls
changed h_chars to async since subsequent call is sync (it will also help because chars array is usually large).
changed is_str_column_all_nulls to sync.

Authors:
  - Karthikeyan (https://github.com/karthikeyann)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Robert (Bobby) Evans (https://github.com/revans2)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/15497
---
 cpp/src/io/json/json_column.cu | 30 +++++++++++++++++-------------
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu
index bc5c45d8980..9d40c657396 100644
--- a/cpp/src/io/json/json_column.cu
+++ b/cpp/src/io/json/json_column.cu
@@ -76,16 +76,16 @@ void print_tree(host_span<SymbolT const> input,
                 tree_meta_t const& d_gpu_tree,
                 rmm::cuda_stream_view stream)
 {
-  print_vec(cudf::detail::make_std_vector_async(d_gpu_tree.node_categories, stream),
+  print_vec(cudf::detail::make_std_vector_sync(d_gpu_tree.node_categories, stream),
             "node_categories",
             to_cat);
-  print_vec(cudf::detail::make_std_vector_async(d_gpu_tree.parent_node_ids, stream),
+  print_vec(cudf::detail::make_std_vector_sync(d_gpu_tree.parent_node_ids, stream),
             "parent_node_ids",
             to_int);
   print_vec(
-    cudf::detail::make_std_vector_async(d_gpu_tree.node_levels, stream), "node_levels", to_int);
-  auto node_range_begin = cudf::detail::make_std_vector_async(d_gpu_tree.node_range_begin, stream);
-  auto node_range_end   = cudf::detail::make_std_vector_async(d_gpu_tree.node_range_end, stream);
+    cudf::detail::make_std_vector_sync(d_gpu_tree.node_levels, stream), "node_levels", to_int);
+  auto node_range_begin = cudf::detail::make_std_vector_sync(d_gpu_tree.node_range_begin, stream);
+  auto node_range_end   = cudf::detail::make_std_vector_sync(d_gpu_tree.node_range_end, stream);
   print_vec(node_range_begin, "node_range_begin", to_int);
   print_vec(node_range_end, "node_range_end", to_int);
   for (int i = 0; i < int(node_range_begin.size()); i++) {
@@ -333,10 +333,11 @@ rmm::device_uvector<NodeIndexT> get_values_column_indices(TreeDepthT const row_a
  * @param stream CUDA stream
  * @return Vector of strings
  */
-std::vector<std::string> copy_strings_to_host(device_span<SymbolT const> input,
-                                              device_span<SymbolOffsetT const> node_range_begin,
-                                              device_span<SymbolOffsetT const> node_range_end,
-                                              rmm::cuda_stream_view stream)
+std::vector<std::string> copy_strings_to_host_sync(
+  device_span<SymbolT const> input,
+  device_span<SymbolOffsetT const> node_range_begin,
+  device_span<SymbolOffsetT const> node_range_end,
+  rmm::cuda_stream_view stream)
 {
   CUDF_FUNC_RANGE();
   auto const num_strings = node_range_begin.size();
@@ -371,12 +372,13 @@ std::vector<std::string> copy_strings_to_host(device_span<SymbolT const> input,
   auto to_host        = [stream](auto const& col) {
     if (col.is_empty()) return std::vector<std::string>{};
     auto const scv     = cudf::strings_column_view(col);
-    auto const h_chars = cudf::detail::make_std_vector_sync<char>(
+    auto const h_chars = cudf::detail::make_std_vector_async<char>(
       cudf::device_span<char const>(scv.chars_begin(stream), scv.chars_size(stream)), stream);
-    auto const h_offsets = cudf::detail::make_std_vector_sync(
+    auto const h_offsets = cudf::detail::make_std_vector_async(
       cudf::device_span<cudf::size_type const>(scv.offsets().data<cudf::size_type>() + scv.offset(),
                                                scv.size() + 1),
       stream);
+    stream.synchronize();
 
     // build std::string vector from chars and offsets
     std::vector<std::string> host_data;
@@ -528,8 +530,9 @@ void make_device_json_column(device_span<SymbolT const> input,
   auto column_range_beg =
     cudf::detail::make_std_vector_async(d_column_tree.node_range_begin, stream);
   auto max_row_offsets = cudf::detail::make_std_vector_async(d_max_row_offsets, stream);
-  std::vector<std::string> column_names = copy_strings_to_host(
+  std::vector<std::string> column_names = copy_strings_to_host_sync(
     input, d_column_tree.node_range_begin, d_column_tree.node_range_end, stream);
+  stream.synchronize();
   // array of arrays column names
   if (is_array_of_arrays) {
     TreeDepthT const row_array_children_level = is_enabled_lines ? 1 : 2;
@@ -537,6 +540,7 @@ void make_device_json_column(device_span<SymbolT const> input,
       get_values_column_indices(row_array_children_level, tree, col_ids, num_columns, stream);
     auto h_values_column_indices =
       cudf::detail::make_std_vector_async(values_column_indices, stream);
+    stream.synchronize();
     std::transform(unique_col_ids.begin(),
                    unique_col_ids.end(),
                    column_names.begin(),
@@ -609,7 +613,7 @@ void make_device_json_column(device_span<SymbolT const> input,
 
   std::vector<uint8_t> is_str_column_all_nulls{};
   if (is_enabled_mixed_types_as_string) {
-    is_str_column_all_nulls = cudf::detail::make_std_vector_async(
+    is_str_column_all_nulls = cudf::detail::make_std_vector_sync(
       is_all_nulls_each_column(input, d_column_tree, tree, col_ids, options, stream), stream);
   }
 

From 6f8ff799bfc9e921bcde97c46cf3454c6ae45c6d Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Fri, 12 Apr 2024 11:40:18 -0500
Subject: [PATCH 048/842] Move to pandas-tests to a dedicated workflow file and
 trigger it from branch.yaml (#15516)

This PR moves pandas-tests to a dedicated workflow file and trigger's it from `branch.yaml`

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Ray Douglass (https://github.com/raydouglass)

Approvers:
  - Ray Douglass (https://github.com/raydouglass)

URL: https://github.com/rapidsai/cudf/pull/15516
---
 .github/workflows/build.yaml                | 18 ++++++++++++++
 .github/workflows/pandas-tests.yaml         | 27 +++++++++++++++++++++
 .github/workflows/pr.yaml                   |  2 +-
 .github/workflows/test.yaml                 | 11 ---------
 ci/cudf_pandas_scripts/pandas-tests/diff.sh |  8 +++---
 ci/cudf_pandas_scripts/pandas-tests/run.sh  | 11 +++++----
 6 files changed, 57 insertions(+), 20 deletions(-)
 create mode 100644 .github/workflows/pandas-tests.yaml

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 67c451fbd6e..6942ef0009d 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -108,3 +108,21 @@ jobs:
       sha: ${{ inputs.sha }}
       date: ${{ inputs.date }}
       package-name: dask_cudf
+  trigger-pandas-tests:
+    if: inputs.build_type == 'nightly'
+    needs: wheel-build-cudf
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout code repo
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.sha }}
+          persist-credentials: false
+      - name: Trigger pandas-tests
+        env:
+          GH_TOKEN: ${{ github.token }}
+        run: |
+          gh workflow run pandas-tests.yaml \
+            -f branch=${{ inputs.branch }} \
+            -f sha=${{ inputs.sha }} \
+            -f date=${{ inputs.date }}
diff --git a/.github/workflows/pandas-tests.yaml b/.github/workflows/pandas-tests.yaml
new file mode 100644
index 00000000000..60544294809
--- /dev/null
+++ b/.github/workflows/pandas-tests.yaml
@@ -0,0 +1,27 @@
+name: Pandas Test Job
+
+on:
+  workflow_dispatch:
+    inputs:
+      branch:
+        required: true
+        type: string
+      date:
+        required: true
+        type: string
+      sha:
+        required: true
+        type: string
+
+jobs:
+  pandas-tests:
+      # run the Pandas unit tests
+      secrets: inherit
+      uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06
+      with:
+        matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and .CUDA_VER == "12.2.2" ))
+        build_type: nightly
+        branch: ${{ inputs.branch }}
+        date: ${{ inputs.date }}
+        sha: ${{ inputs.sha }}
+        script: ci/cudf_pandas_scripts/pandas-tests/run.sh main
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 345ccbea45b..f84b1f42928 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -174,7 +174,7 @@ jobs:
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06
     with:
-      matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
+      matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and .CUDA_VER == "12.2.2" ))
       build_type: pull-request
       script: ci/cudf_pandas_scripts/pandas-tests/run.sh pr
       # Hide test failures because they exceed the GITHUB_STEP_SUMMARY output limit.
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 65aef37697e..c5ae2f3b5a8 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -125,14 +125,3 @@ jobs:
       date: ${{ inputs.date }}
       sha: ${{ inputs.sha }}
       script: ci/cudf_pandas_scripts/run_tests.sh
-  pandas-tests:
-    # run the Pandas unit tests
-    secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06
-    with:
-      matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(min_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
-      build_type: nightly
-      branch: ${{ inputs.branch }}
-      date: ${{ inputs.date }}
-      sha: ${{ inputs.sha }}
-      script: ci/cudf_pandas_scripts/pandas-tests/run.sh main
diff --git a/ci/cudf_pandas_scripts/pandas-tests/diff.sh b/ci/cudf_pandas_scripts/pandas-tests/diff.sh
index cf80f383db4..f87a3a36fcc 100755
--- a/ci/cudf_pandas_scripts/pandas-tests/diff.sh
+++ b/ci/cudf_pandas_scripts/pandas-tests/diff.sh
@@ -8,14 +8,16 @@
 
 # Hard-coded needs to match the version deduced by rapids-upload-artifacts-dir
 GH_JOB_NAME="pandas-tests-diff / build"
+RAPIDS_FULL_VERSION=$(<./VERSION)
 rapids-logger "Github job name: ${GH_JOB_NAME}"
+rapids-logger "Rapids version: ${RAPIDS_FULL_VERSION}"
 
 PY_VER="39"
-MAIN_ARTIFACT=$(rapids-s3-path)cuda12_$(arch)_py${PY_VER}.main-results.json
-PR_ARTIFACT=$(rapids-s3-path)cuda12_$(arch)_py${PY_VER}.pr-results.json
+MAIN_ARTIFACT=$(rapids-s3-path)cuda12_$(arch)_py${PY_VER}.main-${RAPIDS_FULL_VERSION}-results.json
+PR_ARTIFACT=$(rapids-s3-path)cuda12_$(arch)_py${PY_VER}.pr-${RAPIDS_FULL_VERSION}-results.json
 
 rapids-logger "Fetching latest available results from nightly"
-aws s3api list-objects-v2 --bucket rapids-downloads --prefix "nightly/" --query "sort_by(Contents[?ends_with(Key, '_py${PY_VER}.main-results.json')], &LastModified)[::-1].[Key]" --output text > s3_output.txt
+aws s3api list-objects-v2 --bucket rapids-downloads --prefix "nightly/" --query "sort_by(Contents[?ends_with(Key, '_py${PY_VER}.main-${RAPIDS_FULL_VERSION}-results.json')], &LastModified)[::-1].[Key]" --output text > s3_output.txt
 
 read -r COMPARE_ENV < s3_output.txt
 export COMPARE_ENV
diff --git a/ci/cudf_pandas_scripts/pandas-tests/run.sh b/ci/cudf_pandas_scripts/pandas-tests/run.sh
index 1f70ca78c41..d13d31ad09f 100755
--- a/ci/cudf_pandas_scripts/pandas-tests/run.sh
+++ b/ci/cudf_pandas_scripts/pandas-tests/run.sh
@@ -6,8 +6,8 @@
 set -euo pipefail
 
 PANDAS_TESTS_BRANCH=${1}
-
-rapids-logger "Running Pandas tests using $PANDAS_TESTS_BRANCH branch"
+RAPIDS_FULL_VERSION=$(<./VERSION)
+rapids-logger "Running Pandas tests using $PANDAS_TESTS_BRANCH branch and rapids-version $RAPIDS_FULL_VERSION"
 rapids-logger "PR number: ${RAPIDS_REF_NAME:-"unknown"}"
 
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
@@ -27,9 +27,10 @@ bash python/cudf/cudf/pandas/scripts/run-pandas-tests.sh \
   --dist worksteal \
   --report-log=${PANDAS_TESTS_BRANCH}.json 2>&1
 
+SUMMARY_FILE_NAME=${PANDAS_TESTS_BRANCH}-24.06-results.json
 # summarize the results and save them to artifacts:
-python python/cudf/cudf/pandas/scripts/summarize-test-results.py --output json pandas-testing/${PANDAS_TESTS_BRANCH}.json > pandas-testing/${PANDAS_TESTS_BRANCH}-results.json
+python python/cudf/cudf/pandas/scripts/summarize-test-results.py --output json pandas-testing/${PANDAS_TESTS_BRANCH}.json > pandas-testing/${SUMMARY_FILE_NAME}
 RAPIDS_ARTIFACTS_DIR=${RAPIDS_ARTIFACTS_DIR:-"${PWD}/artifacts"}
 mkdir -p "${RAPIDS_ARTIFACTS_DIR}"
-mv pandas-testing/${PANDAS_TESTS_BRANCH}-results.json ${RAPIDS_ARTIFACTS_DIR}/
-rapids-upload-to-s3 ${RAPIDS_ARTIFACTS_DIR}/${PANDAS_TESTS_BRANCH}-results.json "${RAPIDS_ARTIFACTS_DIR}"
+mv pandas-testing/${SUMMARY_FILE_NAME} ${RAPIDS_ARTIFACTS_DIR}/
+rapids-upload-to-s3 ${RAPIDS_ARTIFACTS_DIR}/${SUMMARY_FILE_NAME} "${RAPIDS_ARTIFACTS_DIR}"

From 2e00cb1ebd7bee4a4085d1e691ad3b626bc10d0e Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Fri, 12 Apr 2024 18:06:27 +0100
Subject: [PATCH 049/842] cudf.pandas: Series dt accessor is
 CombinedDatetimelikeProperties (#15523)

On the pandas Series type (not an instance) the dt attribute returns a CombinedDatetimelikeProperties object, which advertises the attributes of all possible datetime like dtypes. Previously we were proxying this with a DatatimeProperties object, which doesn't advertise as many properties. To allow wrapping libraries like dask that introspect the object to work correctly, advertise like pandas on the type. The instance still produces an object of the correct type due to dynamic lookup and/or metaclass magic in cudf.pandas and pandas respectively.

- Closes #15522

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Peter Andreas Entschev (https://github.com/pentschev)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15523
---
 python/cudf/cudf/pandas/_wrappers/pandas.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py
index b7c8e92e8db..3c82d571939 100644
--- a/python/cudf/cudf/pandas/_wrappers/pandas.py
+++ b/python/cudf/cudf/pandas/_wrappers/pandas.py
@@ -174,7 +174,7 @@ def _DataFrame__dir__(self):
         "__arrow_array__": arrow_array_method,
         "__cuda_array_interface__": cuda_array_interface,
         "__iter__": custom_iter,
-        "dt": _AccessorAttr(DatetimeProperties),
+        "dt": _AccessorAttr(CombinedDatetimelikeProperties),
         "str": _AccessorAttr(StringMethods),
         "cat": _AccessorAttr(_CategoricalAccessor),
         "_constructor": _FastSlowAttribute("_constructor"),
@@ -208,7 +208,7 @@ def Index__new__(cls, *args, **kwargs):
         "__array_function__": array_function_method,
         "__arrow_array__": arrow_array_method,
         "__cuda_array_interface__": cuda_array_interface,
-        "dt": _AccessorAttr(DatetimeProperties),
+        "dt": _AccessorAttr(CombinedDatetimelikeProperties),
         "str": _AccessorAttr(StringMethods),
         "cat": _AccessorAttr(_CategoricalAccessor),
         "__iter__": custom_iter,

From f5df665da989b88853381bfb776224d17b38ce47 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Fri, 12 Apr 2024 17:44:37 -0400
Subject: [PATCH 050/842] Performance improvement in libcudf case conversion
 for long strings (#15441)

Improves logic efficiency overall strings case conversion and reworks the specialized kernels for long strings to improve parallelization within each string.
Closes #15406

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Yunsong Wang (https://github.com/PointKernel)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/15441
---
 cpp/src/strings/case.cu          | 232 ++++++++++++++++++++++---------
 cpp/tests/strings/case_tests.cpp |   7 +-
 2 files changed, 168 insertions(+), 71 deletions(-)

diff --git a/cpp/src/strings/case.cu b/cpp/src/strings/case.cu
index 8d8930013cf..a7fd244f8a5 100644
--- a/cpp/src/strings/case.cu
+++ b/cpp/src/strings/case.cu
@@ -19,6 +19,7 @@
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/detail/offsets_iterator_factory.cuh>
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/strings/case.hpp>
 #include <cudf/strings/detail/char_tables.hpp>
@@ -34,6 +35,9 @@
 
 #include <cuda/atomic>
 #include <cuda/functional>
+#include <thrust/for_each.h>
+#include <thrust/merge.h>
+#include <thrust/transform.h>
 
 namespace cudf {
 namespace strings {
@@ -110,23 +114,22 @@ struct convert_char_fn {
  *
  * This can be used in calls to make_strings_children.
  */
-struct upper_lower_fn {
+struct base_upper_lower_fn {
   convert_char_fn converter;
-  column_device_view d_strings;
   size_type* d_offsets{};
   char* d_chars{};
 
-  __device__ void operator()(size_type idx) const
+  base_upper_lower_fn(convert_char_fn converter) : converter(converter) {}
+
+  __device__ inline void process_string(string_view d_str, size_type idx) const
   {
-    if (d_strings.is_null(idx)) {
-      if (!d_chars) d_offsets[idx] = 0;
-      return;
-    }
-    auto const d_str = d_strings.element<string_view>(idx);
-    size_type bytes  = 0;
-    char* d_buffer   = d_chars ? d_chars + d_offsets[idx] : nullptr;
-    for (auto itr = d_str.begin(); itr != d_str.end(); ++itr) {
-      auto const size = converter.process_character(*itr, d_buffer);
+    size_type bytes = 0;
+    char* d_buffer  = d_chars ? d_chars + d_offsets[idx] : nullptr;
+    for (auto itr = d_str.data(); itr < (d_str.data() + d_str.size_bytes()); ++itr) {
+      if (is_utf8_continuation_char(static_cast<u_char>(*itr))) continue;
+      char_utf8 chr = 0;
+      to_char_utf8(itr, chr);
+      auto const size = converter.process_character(chr, d_buffer);
       if (d_buffer) {
         d_buffer += size;
       } else {
@@ -137,45 +140,116 @@ struct upper_lower_fn {
   }
 };
 
+struct upper_lower_fn : public base_upper_lower_fn {
+  column_device_view d_strings;
+
+  upper_lower_fn(convert_char_fn converter, column_device_view const& d_strings)
+    : base_upper_lower_fn{converter}, d_strings{d_strings}
+  {
+  }
+
+  __device__ void operator()(size_type idx) const
+  {
+    if (d_strings.is_null(idx)) {
+      if (!d_chars) { d_offsets[idx] = 0; }
+      return;
+    }
+    auto const d_str = d_strings.element<string_view>(idx);
+    process_string(d_str, idx);
+  }
+};
+
+// Long strings are divided into smaller strings using this value as a guide.
+// Generally strings are split into sub-blocks of bytes of this size but
+// care is taken to not sub-block in the middle of a multi-byte character.
+constexpr size_type LS_SUB_BLOCK_SIZE = 32;
+
 /**
- * @brief Count output bytes in warp-parallel threads
+ * @brief Produces sub-offsets for the chars in the given strings column
+ */
+struct sub_offset_fn {
+  char const* d_input_chars;
+  int64_t first_offset;
+  int64_t last_offset;
+
+  __device__ int64_t operator()(int64_t idx) const
+  {
+    auto const end = d_input_chars + last_offset;
+    auto position  = (idx + 1) * LS_SUB_BLOCK_SIZE;
+    auto begin     = d_input_chars + first_offset + position;
+    while ((begin < end) && is_utf8_continuation_char(static_cast<u_char>(*begin))) {
+      ++begin;
+      ++position;
+    }
+    return (begin < end) ? position + first_offset : last_offset;
+  }
+};
+
+/**
+ * @brief Specialized case conversion for long strings
  *
- * This executes as one warp per string and just computes the output sizes.
+ * This is needed since the offset count can exceed size_type.
+ * Also, nulls are ignored since this purely builds the output chars.
+ * The d_offsets are only temporary to help address the sub-blocks.
  */
-struct count_bytes_fn {
+struct upper_lower_ls_fn : public base_upper_lower_fn {
   convert_char_fn converter;
-  column_device_view d_strings;
-  size_type* d_offsets;
+  char const* d_input_chars;
+  int64_t* d_input_offsets;  // includes column offset
 
+  upper_lower_ls_fn(convert_char_fn converter, char const* d_input_chars, int64_t* d_input_offsets)
+    : base_upper_lower_fn{converter}, d_input_chars{d_input_chars}, d_input_offsets{d_input_offsets}
+  {
+  }
+
+  // idx is row index
   __device__ void operator()(size_type idx) const
   {
-    auto const str_idx  = idx / cudf::detail::warp_size;
-    auto const lane_idx = idx % cudf::detail::warp_size;
-
-    // initialize the output for the atomicAdd
-    if (lane_idx == 0) { d_offsets[str_idx] = 0; }
-    __syncwarp();
-
-    if (d_strings.is_null(str_idx)) { return; }
-    auto const d_str   = d_strings.element<string_view>(str_idx);
-    auto const str_ptr = d_str.data();
-
-    size_type size = 0;
-    for (auto i = lane_idx; i < d_str.size_bytes(); i += cudf::detail::warp_size) {
-      auto const chr = str_ptr[i];
-      if (is_utf8_continuation_char(chr)) { continue; }
-      char_utf8 u8 = 0;
-      to_char_utf8(str_ptr + i, u8);
-      size += converter.process_character(u8);
-    }
-    // this is every so slightly faster than using the cub::warp_reduce
-    if (size > 0) {
-      cuda::atomic_ref<size_type, cuda::thread_scope_block> ref{*(d_offsets + str_idx)};
-      ref.fetch_add(size, cuda::std::memory_order_relaxed);
-    }
+    auto const offset = d_input_offsets[idx];
+    auto const d_str  = string_view{d_input_chars + offset,
+                                   static_cast<size_type>(d_input_offsets[idx + 1] - offset)};
+    process_string(d_str, idx);
   }
 };
 
+/**
+ * @brief Count output bytes in warp-parallel threads
+ *
+ * This executes as one warp per string and just computes the output sizes.
+ */
+CUDF_KERNEL void count_bytes_kernel(convert_char_fn converter,
+                                    column_device_view d_strings,
+                                    size_type* d_sizes)
+{
+  auto idx = cudf::detail::grid_1d::global_thread_id();
+  if (idx >= (d_strings.size() * cudf::detail::warp_size)) { return; }
+
+  auto const str_idx  = idx / cudf::detail::warp_size;
+  auto const lane_idx = idx % cudf::detail::warp_size;
+
+  // initialize the output for the atomicAdd
+  if (lane_idx == 0) { d_sizes[str_idx] = 0; }
+  __syncwarp();
+
+  if (d_strings.is_null(str_idx)) { return; }
+  auto const d_str   = d_strings.element<string_view>(str_idx);
+  auto const str_ptr = d_str.data();
+
+  size_type size = 0;
+  for (auto i = lane_idx; i < d_str.size_bytes(); i += cudf::detail::warp_size) {
+    auto const chr = str_ptr[i];
+    if (is_utf8_continuation_char(chr)) { continue; }
+    char_utf8 u8 = 0;
+    to_char_utf8(str_ptr + i, u8);
+    size += converter.process_character(u8);
+  }
+  // this is slightly faster than using the cub::warp_reduce
+  if (size > 0) {
+    cuda::atomic_ref<size_type, cuda::thread_scope_block> ref{*(d_sizes + str_idx)};
+    ref.fetch_add(size, cuda::std::memory_order_relaxed);
+  }
+}
+
 /**
  * @brief Special functor for processing ASCII-only data
  */
@@ -208,11 +282,18 @@ std::unique_ptr<column> convert_case(strings_column_view const& input,
   auto const d_cases   = get_character_cases_table();
   auto const d_special = get_special_case_mapping_table();
 
+  auto const first_offset = (input.offset() == 0) ? 0L
+                                                  : cudf::strings::detail::get_offset_value(
+                                                      input.offsets(), input.offset(), stream);
+  auto const last_offset =
+    cudf::strings::detail::get_offset_value(input.offsets(), input.size() + input.offset(), stream);
+  auto const chars_size = last_offset - first_offset;
+
   convert_char_fn ccfn{case_flag, d_flags, d_cases, d_special};
   upper_lower_fn converter{ccfn, *d_strings};
 
   // For smaller strings, use the regular string-parallel algorithm
-  if ((input.chars_size(stream) / (input.size() - input.null_count())) < AVG_CHAR_BYTES_THRESHOLD) {
+  if ((chars_size / (input.size() - input.null_count())) < AVG_CHAR_BYTES_THRESHOLD) {
     auto [offsets, chars] =
       cudf::strings::detail::make_strings_children(converter, input.size(), stream, mr);
     return make_strings_column(input.size(),
@@ -235,9 +316,8 @@ std::unique_ptr<column> convert_case(strings_column_view const& input,
                        [] __device__(auto chr) { return is_utf8_continuation_char(chr); })) > 0;
   if (!multi_byte_chars) {
     // optimization for ASCII-only case: copy the input column and inplace replace each character
-    auto result           = std::make_unique<column>(input.parent(), stream, mr);
-    auto d_chars          = result->mutable_view().head<char>();
-    auto const chars_size = strings_column_view(result->view()).chars_size(stream);
+    auto result  = std::make_unique<column>(input.parent(), stream, mr);
+    auto d_chars = result->mutable_view().head<char>();
     thrust::transform(
       rmm::exec_policy(stream), d_chars, d_chars + chars_size, d_chars, ascii_converter_fn{ccfn});
     result->set_null_count(input.null_count());
@@ -245,30 +325,46 @@ std::unique_ptr<column> convert_case(strings_column_view const& input,
   }
 
   // This will use a warp-parallel algorithm to compute the output sizes for each string
-  // and then uses the normal string parallel functor to build the output.
-  auto offsets = make_numeric_column(
-    data_type{type_to_id<size_type>()}, input.size() + 1, mask_state::UNALLOCATED, stream, mr);
-  auto d_offsets = offsets->mutable_view().data<size_type>();
-
-  // first pass, compute output sizes
   // note: tried to use segmented-reduce approach instead here and it was consistently slower
-  count_bytes_fn counter{ccfn, *d_strings, d_offsets};
-  auto const count_itr = thrust::make_counting_iterator<size_type>(0);
-  thrust::for_each_n(
-    rmm::exec_policy(stream), count_itr, input.size() * cudf::detail::warp_size, counter);
-
-  // convert sizes to offsets
-  auto const bytes =
-    cudf::detail::sizes_to_offsets(d_offsets, d_offsets + input.size() + 1, d_offsets, stream);
-  CUDF_EXPECTS(bytes <= std::numeric_limits<size_type>::max(),
-               "Size of output exceeds the column size limit",
-               std::overflow_error);
-
-  rmm::device_uvector<char> chars(bytes, stream, mr);
-  // second pass, write output
-  converter.d_offsets = d_offsets;
-  converter.d_chars   = chars.data();
-  thrust::for_each_n(rmm::exec_policy(stream), count_itr, input.size(), converter);
+  auto [offsets, bytes] = [&] {
+    rmm::device_uvector<size_type> sizes(input.size(), stream);
+    constexpr int block_size = 512;
+    cudf::detail::grid_1d grid{input.size() * cudf::detail::warp_size, block_size};
+    count_bytes_kernel<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
+      ccfn, *d_strings, sizes.data());
+    // convert sizes to offsets
+    return cudf::strings::detail::make_offsets_child_column(sizes.begin(), sizes.end(), stream, mr);
+  }();
+
+  // build sub-offsets
+  auto const input_chars = input.chars_begin(stream);
+  auto const sub_count   = chars_size / LS_SUB_BLOCK_SIZE;
+  auto tmp_offsets       = rmm::device_uvector<int64_t>(sub_count + input.size() + 1, stream);
+  {
+    rmm::device_uvector<size_type> sub_offsets(sub_count, stream);
+    auto const count_itr = thrust::make_counting_iterator<size_type>(0);
+    thrust::transform(rmm::exec_policy_nosync(stream),
+                      count_itr,
+                      count_itr + sub_count,
+                      sub_offsets.data(),
+                      sub_offset_fn{input_chars, first_offset, last_offset});
+
+    // merge them with input offsets
+    auto input_offsets =
+      cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset());
+    thrust::merge(rmm::exec_policy_nosync(stream),
+                  input_offsets,
+                  input_offsets + input.size() + 1,
+                  sub_offsets.begin(),
+                  sub_offsets.end(),
+                  tmp_offsets.begin());
+  }
+
+  // run case conversion over the new sub-strings
+  auto const tmp_size = static_cast<size_type>(tmp_offsets.size()) - 1;
+  upper_lower_ls_fn sub_conv{ccfn, input_chars, tmp_offsets.data()};
+  auto chars =
+    std::get<1>(cudf::strings::detail::make_strings_children(sub_conv, tmp_size, stream, mr));
 
   return make_strings_column(input.size(),
                              std::move(offsets),
diff --git a/cpp/tests/strings/case_tests.cpp b/cpp/tests/strings/case_tests.cpp
index 1d82d785ae8..bb0e77a29d0 100644
--- a/cpp/tests/strings/case_tests.cpp
+++ b/cpp/tests/strings/case_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -235,7 +235,7 @@ TEST_F(StringsCaseTest, LongStrings)
 {
   // average string length >= AVG_CHAR_BYTES_THRESHOLD as defined in case.cu
   cudf::test::strings_column_wrapper input{
-    "ABCDÉFGHIJKLMNOPQRSTUVWXYZabcdéfghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=- ",
+    "abcdéfghijklmnopqrstuvwxyzABCDÉFGHIJKLMNOPQRSTUVWXYZ1234567890!@#$%^&*()_+=- ",
     "ABCDÉFGHIJKLMNOPQRSTUVWXYZabcdéfghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=- ",
     "ABCDÉFGHIJKLMNOPQRSTUVWXYZabcdéfghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=- ",
     "ABCDÉFGHIJKLMNOPQRSTUVWXYZabcdéfghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=-"};
@@ -256,7 +256,8 @@ TEST_F(StringsCaseTest, LongStrings)
   results = cudf::strings::to_upper(view);
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
 
-  results = cudf::strings::to_upper(cudf::strings_column_view(cudf::slice(input, {1, 3}).front()));
+  view    = cudf::strings_column_view(cudf::slice(input, {1, 3}).front());
+  results = cudf::strings::to_upper(view);
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, cudf::slice(expected, {1, 3}).front());
 }
 

From 9cc87f01810d598eca4b80ce95b4c1eb72617a3a Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 12 Apr 2024 14:30:27 -1000
Subject: [PATCH 051/842] Skip pandas unit tests that crash pytest workers in
 `cudf.pandas` (#15521)

While enabling some ignored pandas unit tests for `cudf.pandas`, tests were passing in the specified 30 minute allotment, but it appears some of these newly enabled tests are still causing pytest workers to crash. I think it's OK to lose some testing coverage of these tests if it means test runners are not crashing

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15521
---
 python/cudf/cudf/pandas/scripts/run-pandas-tests.sh | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
index 2f6c4ac5b13..e21c4572e44 100755
--- a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
+++ b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
@@ -130,10 +130,15 @@ and not test_s3_roundtrip_for_dir[partition_col0] \
 and not test_s3_roundtrip_for_dir[partition_col1] \
 and not test_s3_roundtrip"
 
+TEST_THAT_CRASH_PYTEST_WORKERS="not test_bitmasks_pyarrow \
+and not test_large_string_pyarrow \
+and not test_interchange_from_corrected_buffer_dtypes \
+and not test_eof_states"
+
 # TODO: Remove "not db" once a postgres & mysql container is set up on the CI
 PANDAS_CI="1" timeout 30m python -m pytest -p cudf.pandas \
     -v -m "not single_cpu and not db" \
-    -k "not test_to_parquet_gcs_new_file and not test_qcut_nat and not test_add and not test_ismethods and $TEST_THAT_NEED_MOTO_SERVER" \
+    -k "not test_to_parquet_gcs_new_file and not test_qcut_nat and not test_add and not test_ismethods and $TEST_THAT_NEED_MOTO_SERVER and $TEST_THAT_CRASH_PYTEST_WORKERS" \
     --import-mode=importlib \
     ${PYTEST_IGNORES} \
     "$@" || [ $? = 1 ]  # Exit success if exit code was 1 (permit test failures but not other errors)

From c8cb4953550dc7b0e0f30c9d33ef55e25f935ef4 Mon Sep 17 00:00:00 2001
From: Ray Bell <rayjohnbell0@gmail.com>
Date: Sat, 13 Apr 2024 13:47:11 -0400
Subject: [PATCH 052/842] Update CONTRIBUTING.md to use latest cuda env
 (#15467)

clean commit of https://github.com/rapidsai/cudf/pull/15401

Authors:
  - Ray Bell (https://github.com/raybellwaves)
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/15467
---
 CONTRIBUTING.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index dce92d7e613..757eaa44510 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -105,7 +105,7 @@ Instructions for a minimal build environment without conda are included below.
 # create the conda environment (assuming in base `cudf` directory)
 # note: RAPIDS currently doesn't support `channel_priority: strict`;
 # use `channel_priority: flexible` instead
-conda env create --name cudf_dev --file conda/environments/all_cuda-118_arch-x86_64.yaml
+conda env create --name cudf_dev --file conda/environments/all_cuda-122_arch-x86_64.yaml
 # activate the environment
 conda activate cudf_dev
 ```

From 8beb4cea15602c081e3f948ceee181730d74a296 Mon Sep 17 00:00:00 2001
From: Ray Bell <rayjohnbell0@gmail.com>
Date: Mon, 15 Apr 2024 07:36:14 -0400
Subject: [PATCH 053/842] rm-dup-doc in frame.py (#15530)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extended view of the diff

![Screenshot 2024-04-13 at 11 02 32 PM](https://github.com/rapidsai/cudf/assets/17162724/e6cd36b1-73b3-4910-b186-eb0906ea1fa6)

Found a couple of others whilst looking into https://github.com/rapidsai/cudf/issues/15487

Authors:
  - Ray Bell (https://github.com/raybellwaves)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15530
---
 python/cudf/cudf/core/frame.py         | 12 ------------
 python/cudf/cudf/core/indexed_frame.py |  5 -----
 2 files changed, 17 deletions(-)

diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 809bdb4e6d1..01842b5f0a9 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -1808,12 +1808,6 @@ def all(self, axis=0, skipna=True, **kwargs):
         b    False
         dtype: bool
 
-        .. pandas-compat::
-            **DataFrame.all, Series.all**
-
-            Parameters currently not supported are `axis`, `bool_only`,
-            `level`.
-
         .. pandas-compat::
             **DataFrame.all, Series.all**
 
@@ -1867,12 +1861,6 @@ def any(self, axis=0, skipna=True, **kwargs):
         b    True
         dtype: bool
 
-        .. pandas-compat::
-            **DataFrame.any, Series.any**
-
-            Parameters currently not supported are `axis`, `bool_only`,
-            `level`.
-
         .. pandas-compat::
             **DataFrame.any, Series.any**
 
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index ca9d5590044..c412b7a7e47 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -1530,11 +1530,6 @@ def median(
         >>> ser.median()
         17.0
 
-        .. pandas-compat::
-            **DataFrame.median, Series.median**
-
-            Parameters currently not supported are `level` and `numeric_only`.
-
         .. pandas-compat::
             **DataFrame.median, Series.median**
 

From 1403e1b0b378397261d7cfa0025f791bb289f1e8 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Mon, 15 Apr 2024 08:20:27 -0500
Subject: [PATCH 054/842] Remove version hard-coding (#15529)

This PR removes version hard-coding introduced in https://github.com/rapidsai/cudf/pull/15516

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Ray Douglass (https://github.com/raydouglass)

URL: https://github.com/rapidsai/cudf/pull/15529
---
 ci/cudf_pandas_scripts/pandas-tests/run.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/cudf_pandas_scripts/pandas-tests/run.sh b/ci/cudf_pandas_scripts/pandas-tests/run.sh
index d13d31ad09f..abde5e5d160 100755
--- a/ci/cudf_pandas_scripts/pandas-tests/run.sh
+++ b/ci/cudf_pandas_scripts/pandas-tests/run.sh
@@ -27,7 +27,7 @@ bash python/cudf/cudf/pandas/scripts/run-pandas-tests.sh \
   --dist worksteal \
   --report-log=${PANDAS_TESTS_BRANCH}.json 2>&1
 
-SUMMARY_FILE_NAME=${PANDAS_TESTS_BRANCH}-24.06-results.json
+SUMMARY_FILE_NAME=${PANDAS_TESTS_BRANCH}-${RAPIDS_FULL_VERSION}-results.json
 # summarize the results and save them to artifacts:
 python python/cudf/cudf/pandas/scripts/summarize-test-results.py --output json pandas-testing/${PANDAS_TESTS_BRANCH}.json > pandas-testing/${SUMMARY_FILE_NAME}
 RAPIDS_ARTIFACTS_DIR=${RAPIDS_ARTIFACTS_DIR:-"${PWD}/artifacts"}

From 64229b91283c3c7e1237962b294e1c38d1bffb35 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 15 Apr 2024 06:26:27 -1000
Subject: [PATCH 055/842] Make some private class properties not settable
 (#15527)

It appears these properties do not map to public APIs, and for better state management, these `@property`s are better left not settable.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/15527
---
 python/cudf/cudf/core/column/categorical.py  | 6 ------
 python/cudf/cudf/core/single_column_frame.py | 5 -----
 2 files changed, 11 deletions(-)

diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index e4620ee5bc4..e3e73035046 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -614,12 +614,6 @@ def children(self) -> Tuple[NumericalColumn]:
     def categories(self) -> ColumnBase:
         return self.dtype.categories._values
 
-    @categories.setter
-    def categories(self, value):
-        self._dtype = CategoricalDtype(
-            categories=value, ordered=self.dtype.ordered
-        )
-
     @property
     def codes(self) -> NumericalColumn:
         if self._codes is None:
diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py
index 19dde2e51b9..829790007c9 100644
--- a/python/cudf/cudf/core/single_column_frame.py
+++ b/python/cudf/cudf/core/single_column_frame.py
@@ -103,11 +103,6 @@ def _num_columns(self):
     def _column(self):
         return self._data[self.name]
 
-    @_column.setter  # type: ignore
-    @_cudf_nvtx_annotate
-    def _column(self, value):
-        self._data[self.name] = value
-
     @property  # type: ignore
     @_cudf_nvtx_annotate
     def values(self):  # noqa: D102

From ca7d85b2beb3d82161ceda642038f3f082900650 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Mon, 15 Apr 2024 11:26:51 -0500
Subject: [PATCH 056/842] Relax protobuf lower bound to 3.20. (#15506)

This PR drops the lower bound of protobuf to 3.20, to make cuDF compatible with the versions used in Google Colab.

I tested this manually in Google Colab, which uses protobuf 3.20, and cuDF 24.02 seemed to work fine when reading ORC statistics (the only runtime feature in cuDF that needs protobuf). Note: cuDF 24.02 was built was a newer protobuf/protoc, version 4.x.

I will test this by forcing protobuf 3.20 in CI, and then revert those changes if tests pass.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Nick Becker (https://github.com/beckernick)
  - Ray Douglass (https://github.com/raydouglass)
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/15506
---
 conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +-
 conda/environments/all_cuda-122_arch-x86_64.yaml | 2 +-
 conda/recipes/cudf/meta.yaml                     | 2 +-
 dependencies.yaml                                | 2 +-
 python/cudf/pyproject.toml                       | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 82d7104b0da..e629f8b633e 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -68,7 +68,7 @@ dependencies:
 - pandoc
 - pip
 - pre-commit
-- protobuf>=4.21,<5
+- protobuf>=3.20,<5
 - ptxcompiler
 - pyarrow==14.0.2.*
 - pydata-sphinx-theme!=0.14.2
diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml
index 0fd87e91745..f135a88cac2 100644
--- a/conda/environments/all_cuda-122_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-122_arch-x86_64.yaml
@@ -66,7 +66,7 @@ dependencies:
 - pandoc
 - pip
 - pre-commit
-- protobuf>=4.21,<5
+- protobuf>=3.20,<5
 - pyarrow==14.0.2.*
 - pydata-sphinx-theme!=0.14.2
 - pynvjitlink
diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml
index 7633fbb00a3..cd9237bd7cb 100644
--- a/conda/recipes/cudf/meta.yaml
+++ b/conda/recipes/cudf/meta.yaml
@@ -78,7 +78,7 @@ requirements:
     {% endif %}
     - cuda-version ={{ cuda_version }}
   run:
-    - {{ pin_compatible('protobuf', min_pin='x.x', max_pin='x') }}
+    - protobuf >=3.20,<5.0a0
     - python
     - typing_extensions >=4.0.0
     - pandas >=2.0,<2.2.2dev0
diff --git a/dependencies.yaml b/dependencies.yaml
index 5bb555df818..8cd4c798c38 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -286,7 +286,7 @@ dependencies:
       - output_types: conda
         packages:
           - &rmm_conda rmm==24.6.*
-          - &protobuf protobuf>=4.21,<5
+          - &protobuf protobuf>=3.20,<5
           - pip
           - pip:
               - git+https://github.com/python-streamz/streamz.git@master
diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml
index 003a92988de..434383bc208 100644
--- a/python/cudf/pyproject.toml
+++ b/python/cudf/pyproject.toml
@@ -34,7 +34,7 @@ dependencies = [
     "nvtx>=0.2.1",
     "packaging",
     "pandas>=2.0,<2.2.2dev0",
-    "protobuf>=4.21,<5",
+    "protobuf>=3.20,<5",
     "ptxcompiler",
     "pyarrow>=14.0.1,<15.0.0a0",
     "rich",

From 74b39e213a4e6a6a1cf9f0e8d19a112fc6639214 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Mon, 15 Apr 2024 14:57:25 -0400
Subject: [PATCH 057/842] Fix exponent overflow in strings-to-double conversion
 (#15517)

Adds a check when computing the exponent in the strings-to-double conversion to prevent an integer overflow.

Closes #15508

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/15517
---
 .../detail/convert/string_to_float.cuh        |  5 ++-
 cpp/tests/strings/floats_tests.cpp            | 36 ++++++++++---------
 2 files changed, 23 insertions(+), 18 deletions(-)

diff --git a/cpp/include/cudf/strings/detail/convert/string_to_float.cuh b/cpp/include/cudf/strings/detail/convert/string_to_float.cuh
index ab934750f9e..bbf56cf1446 100644
--- a/cpp/include/cudf/strings/detail/convert/string_to_float.cuh
+++ b/cpp/include/cudf/strings/detail/convert/string_to_float.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -102,6 +102,9 @@ __device__ inline double stod(string_view const& d_str)
           ch = *in_ptr++;
           if (ch < '0' || ch > '9') break;
           exp_ten = (exp_ten * 10) + (int)(ch - '0');
+          // Prevent integer overflow in exp_ten. 100,000,000 is the largest
+          // power of ten that can be multiplied by 10 without overflow.
+          if (exp_ten >= 100'000'000) { break; }
         }
       }
     }
diff --git a/cpp/tests/strings/floats_tests.cpp b/cpp/tests/strings/floats_tests.cpp
index f668c384787..9fa1a3325b4 100644
--- a/cpp/tests/strings/floats_tests.cpp
+++ b/cpp/tests/strings/floats_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,6 +17,7 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/iterator_utilities.hpp>
 
 #include <cudf/strings/convert/convert_floats.hpp>
 #include <cudf/strings/strings_column_view.hpp>
@@ -25,8 +26,6 @@
 
 #include <vector>
 
-constexpr cudf::test::debug_output_level verbosity{cudf::test::debug_output_level::ALL_ERRORS};
-
 struct StringsConvertTest : public cudf::test::BaseFixture {};
 
 TEST_F(StringsConvertTest, IsFloat)
@@ -89,7 +88,7 @@ TEST_F(StringsConvertTest, ToFloats32)
     h_expected.begin(),
     h_expected.end(),
     thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
-  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, verbosity);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
 }
 
 TEST_F(StringsConvertTest, FromFloats32)
@@ -118,38 +117,41 @@ TEST_F(StringsConvertTest, FromFloats32)
     h_expected.end(),
     thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; }));
 
-  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, verbosity);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
 }
 
 TEST_F(StringsConvertTest, ToFloats64)
 {
   // clang-format off
   std::vector<const char*> h_strings{
-    "1234",   nullptr,    "-876",     "543.2",         "-0.12",   ".25",
+    "1234",   "",         "-876",     "543.2",         "-0.12",   ".25",
     "-.002",  "",         "-0.0",     "1.28e256",      "NaN",     "abc123",
     "123abc", "456e",     "-1.78e+5", "-122.33644782", "12e+309", "1.7976931348623159E308",
     "-Inf",   "-INFINITY", "1.0",     "1.7976931348623157e+308",  "1.7976931348623157e-307",
     // subnormal numbers:           v--- smallest double               v--- result is 0
-    "4e-308", "3.3333333333e-320", "4.940656458412465441765688e-324", "1.e-324" };
+    "4e-308", "3.3333333333e-320", "4.940656458412465441765688e-324", "1.e-324",
+    // another very small number
+    "9.299999257686047e-0005603333574677677" };
   // clang-format on
-  cudf::test::strings_column_wrapper strings(
-    h_strings.begin(),
-    h_strings.end(),
-    thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
+  auto validity = cudf::test::iterators::null_at(1);
+  cudf::test::strings_column_wrapper strings(h_strings.begin(), h_strings.end(), validity);
 
   std::vector<double> h_expected;
   std::for_each(h_strings.begin(), h_strings.end(), [&](char const* str) {
-    h_expected.push_back(str ? std::atof(str) : 0);
+    h_expected.push_back(std::atof(str));
   });
 
   auto strings_view = cudf::strings_column_view(strings);
   auto results = cudf::strings::to_floats(strings_view, cudf::data_type{cudf::type_id::FLOAT64});
 
   cudf::test::fixed_width_column_wrapper<double> expected(
-    h_expected.begin(),
-    h_expected.end(),
-    thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
-  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, verbosity);
+    h_expected.begin(), h_expected.end(), validity);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
+
+  results = cudf::strings::is_float(strings_view);
+  cudf::test::fixed_width_column_wrapper<bool> is_expected(
+    {1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, validity);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, is_expected);
 }
 
 TEST_F(StringsConvertTest, FromFloats64)
@@ -178,7 +180,7 @@ TEST_F(StringsConvertTest, FromFloats64)
     h_expected.end(),
     thrust::make_transform_iterator(h_expected.begin(), [](auto str) { return str != nullptr; }));
 
-  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected, verbosity);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
 }
 
 TEST_F(StringsConvertTest, ZeroSizeStringsColumnFloat)

From 89196900f5739a39bd9861d3b494b47ff75e7f71 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Mon, 15 Apr 2024 20:42:00 -0500
Subject: [PATCH 058/842] Remove conda channel setup from wheel CI image
 script. (#15539)

The new `configure_cpp_static.sh` script added in https://github.com/rapidsai/cudf/pull/15437 is calling `rapids-configure-conda-channels`. However, it is doing so on a `ci-wheel` image, which fails. This is causing CI issues and needs to be removed.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Ray Douglass (https://github.com/raydouglass)

URL: https://github.com/rapidsai/cudf/pull/15539
---
 ci/configure_cpp_static.sh | 2 --
 1 file changed, 2 deletions(-)

diff --git a/ci/configure_cpp_static.sh b/ci/configure_cpp_static.sh
index 675e0c3981f..d1f9e0d1399 100755
--- a/ci/configure_cpp_static.sh
+++ b/ci/configure_cpp_static.sh
@@ -3,8 +3,6 @@
 
 set -euo pipefail
 
-rapids-configure-conda-channels
-
 source rapids-date-string
 
 rapids-logger "Configure static cpp build"

From c1dcc31c07e858dfc0f24ff77e5b111551ad8a0e Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Tue, 16 Apr 2024 12:54:56 +0100
Subject: [PATCH 059/842] Handle case of scan aggregation in groupby-transform
 (#15450)

When performing a groupby-transform with a scan aggregation, the intermediate result obtained from calling groupby-agg is already the correct shape and does not need to be broadcast to align with the grouping keys.

To handle this, make sure that if the requested transform is a scan that we don't try and broadcast.

While here, tighten up the input checking: transform only applies to a single aggregation, rather than the more general interface offered by agg.

- Closes #12621
- Closes #15448

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/15450
---
 python/cudf/cudf/core/groupby/groupby.py      | 12 +++++-
 .../cudf/cudf/tests/groupby/test_transform.py | 43 +++++++++++++++++++
 2 files changed, 54 insertions(+), 1 deletion(-)
 create mode 100644 python/cudf/cudf/tests/groupby/test_transform.py

diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index 945e546af1a..dd4924676f3 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -1767,13 +1767,23 @@ def transform(self, function):
         --------
         agg
         """
+        if not (isinstance(function, str) or callable(function)):
+            raise TypeError(
+                "Aggregation must be a named aggregation or a callable"
+            )
         try:
             result = self.agg(function)
         except TypeError as e:
             raise NotImplementedError(
                 "Currently, `transform()` supports only aggregations."
             ) from e
-
+        # If the aggregation is a scan, don't broadcast
+        if libgroupby._is_all_scan_aggregate([[function]]):
+            if len(result) != len(self.obj):
+                raise AssertionError(
+                    "Unexpected result length for scan transform"
+                )
+            return result
         return self._broadcast(result)
 
     def rolling(self, *args, **kwargs):
diff --git a/python/cudf/cudf/tests/groupby/test_transform.py b/python/cudf/cudf/tests/groupby/test_transform.py
new file mode 100644
index 00000000000..78d7fbfd879
--- /dev/null
+++ b/python/cudf/cudf/tests/groupby/test_transform.py
@@ -0,0 +1,43 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+import itertools
+
+import pytest
+
+import cudf
+from cudf.testing._utils import assert_eq
+
+
+@pytest.fixture(params=[False, True], ids=["no-null-keys", "null-keys"])
+def keys_null(request):
+    return request.param
+
+
+@pytest.fixture(params=[False, True], ids=["no-null-values", "null-values"])
+def values_null(request):
+    return request.param
+
+
+@pytest.fixture
+def df(keys_null, values_null):
+    keys = ["a", "b", "a", "c", "b", "b", "c", "a"]
+    r = range(len(keys))
+    if keys_null:
+        keys[::3] = itertools.repeat(None, len(r[::3]))
+    values = list(range(len(keys)))
+    if values_null:
+        values[1::3] = itertools.repeat(None, len(r[1::3]))
+    return cudf.DataFrame({"key": keys, "values": values})
+
+
+@pytest.mark.parametrize("agg", ["cumsum", "cumprod", "max", "sum", "prod"])
+def test_transform_broadcast(agg, df):
+    pf = df.to_pandas()
+    got = df.groupby("key").transform(agg)
+    expect = pf.groupby("key").transform(agg)
+    assert_eq(got, expect, check_dtype=False)
+
+
+def test_transform_invalid():
+    df = cudf.DataFrame({"key": [1, 1], "values": [4, 5]})
+    with pytest.raises(TypeError):
+        df.groupby("key").transform({"values": "cumprod"})

From 77abf03a21ac22aaed48eb8ad627bbb37e81315c Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Tue, 16 Apr 2024 09:05:18 -0500
Subject: [PATCH 060/842] Remove legacy JSON reader from Python (#15538)

This PR removes the `engine="cudf_legacy"` option from Python.

This is a part of https://github.com/rapidsai/cudf/issues/15537.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15538
---
 python/cudf/cudf/io/json.py         | 23 ++---------------
 python/cudf/cudf/tests/test_json.py | 39 +----------------------------
 python/cudf/cudf/utils/ioutils.py   |  2 +-
 3 files changed, 4 insertions(+), 60 deletions(-)

diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py
index b2f3fd09146..5ef25a99590 100644
--- a/python/cudf/cudf/io/json.py
+++ b/python/cudf/cudf/io/json.py
@@ -38,25 +38,6 @@ def read_json(
             f"or a bool, or None. Got {type(dtype)}"
         )
 
-    if engine == "cudf_experimental":
-        raise ValueError(
-            "engine='cudf_experimental' support has been removed, "
-            "use `engine='cudf'`"
-        )
-
-    if engine == "cudf_legacy":
-        # TODO: Deprecated in 23.02, please
-        # give some time until(more than couple of
-        # releases from now) `cudf_legacy`
-        # support can be removed completely.
-        warnings.warn(
-            "engine='cudf_legacy' is a deprecated engine."
-            "This will be removed in a future release."
-            "Please switch to using engine='cudf'.",
-            FutureWarning,
-        )
-    if engine == "cudf_legacy" and not lines:
-        raise ValueError(f"{engine} engine only supports JSON Lines format")
     if engine == "auto":
         engine = "cudf" if lines else "pandas"
     if engine != "cudf" and keep_quotes:
@@ -64,7 +45,7 @@ def read_json(
             "keep_quotes='True' is supported only with engine='cudf'"
         )
 
-    if engine == "cudf_legacy" or engine == "cudf":
+    if engine == "cudf":
         if dtype is None:
             dtype = True
 
@@ -117,7 +98,7 @@ def read_json(
             lines,
             compression,
             byte_range,
-            engine == "cudf_legacy",
+            False,
             keep_quotes,
             mixed_types_as_string,
         )
diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py
index 40935733f34..3033a3e75e3 100644
--- a/python/cudf/cudf/tests/test_json.py
+++ b/python/cudf/cudf/tests/test_json.py
@@ -495,9 +495,6 @@ def test_json_lines_compression(tmpdir, ext, out_comp, in_comp):
 
 
 @pytest.mark.filterwarnings("ignore:Using CPU")
-@pytest.mark.filterwarnings(
-    "ignore:engine='cudf_legacy' is a deprecated engine."
-)
 def test_json_engine_selection():
     json = "[1, 2, 3]"
 
@@ -519,10 +516,6 @@ def test_json_engine_selection():
     for col_name in df.columns:
         assert isinstance(col_name, int)
 
-    # should raise an exception
-    with pytest.raises(ValueError):
-        cudf.read_json(StringIO(json), lines=False, engine="cudf_legacy")
-
 
 def test_json_bool_values():
     buffer = "[true,1]\n[false,false]\n[true,true]"
@@ -541,30 +534,6 @@ def test_json_bool_values():
     np.testing.assert_array_equal(pd_df.dtypes, cu_df.dtypes)
 
 
-@pytest.mark.filterwarnings(
-    "ignore:engine='cudf_legacy' is a deprecated engine."
-)
-@pytest.mark.parametrize(
-    "buffer",
-    [
-        "[1.0,]\n[null, ]",
-        '{"0":1.0,"1":}\n{"0":null,"1": }',
-        '{ "0" : 1.0 , "1" : }\n{ "0" : null , "1" : }',
-        '{"0":1.0}\n{"1":}',
-    ],
-)
-def test_json_null_literal(buffer):
-    df = cudf.read_json(StringIO(buffer), lines=True, engine="cudf_legacy")
-
-    # first column contains a null field, type should be set to float
-    # second column contains only empty fields, type should be set to int8
-    np.testing.assert_array_equal(df.dtypes, ["float64", "int8"])
-    np.testing.assert_array_equal(
-        df["0"].to_numpy(na_value=np.nan), [1.0, np.nan]
-    )
-    np.testing.assert_array_equal(df["1"].to_numpy(na_value=0), [0, 0])
-
-
 def test_json_bad_protocol_string():
     test_string = StringIO('{"field": "s3://path"}')
 
@@ -739,14 +708,8 @@ def test_default_integer_bitwidth(default_integer_bitwidth, engine):
 @pytest.mark.parametrize(
     "engine",
     [
-        pytest.param(
-            "cudf_legacy",
-            marks=pytest.mark.skip(
-                reason="cannot partially set dtypes for cudf json engine"
-            ),
-        ),
-        "pandas",
         "cudf",
+        "pandas",
     ],
 )
 def test_default_integer_bitwidth_partial(default_integer_bitwidth, engine):
diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
index 0a0ee4f592c..8c58f2b859e 100644
--- a/python/cudf/cudf/utils/ioutils.py
+++ b/python/cudf/cudf/utils/ioutils.py
@@ -543,7 +543,7 @@
     function or `StringIO`). Multiple inputs may be provided as a list. If a
     list is specified each list entry may be of a different input type as long
     as each input is of a valid type and all input JSON schema(s) match.
-engine : {{ 'auto', 'cudf', 'cudf_legacy', 'pandas' }}, default 'auto'
+engine : {{ 'auto', 'cudf', 'pandas' }}, default 'auto'
     Parser engine to use. If 'auto' is passed, the engine will be
     automatically selected based on the other parameters. See notes below.
 orient : string

From feb96cbe39e36d35d673c51270f7316708d24f67 Mon Sep 17 00:00:00 2001
From: Paul Mattione <156858817+pmattione-nvidia@users.noreply.github.com>
Date: Tue, 16 Apr 2024 12:33:52 -0400
Subject: [PATCH 061/842] Benchmark decimal <--> floating conversions. (#15334)

Adds benchmarks for decimal <--> floating conversions.  Does so for float <--> decimal32 & decimal64, and for double <--> decimal32, decimal64, and decimal128.  Within a column data tends to be in a similar range of values, so this provides separate tests for different representative ranges of powers-of-10.

Note that with the current conversion algorithm, the max supported scale of a decimal is the max power of 10 that that type can hold, so scale 9 for decimal32, 19 for decimal64, and 38 for decimal128. Thus only these ranges of floats/doubles are tested.

Also adds the ability to generate decimals with a specific (rather than random) scale factor. This expands the API, it does not replace the existing one.  All existing tests that generate a column of random decimals will continue to do so with a random scale factor, this capability is opt-in.  The machinery for this was already there, but only partially; this change fills it in.

Authors:
  - Paul Mattione (https://github.com/pmattione-nvidia)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - MithunR (https://github.com/mythrocks)

URL: https://github.com/rapidsai/cudf/pull/15334
---
 cpp/benchmarks/CMakeLists.txt               |   5 +
 cpp/benchmarks/common/generate_input.cu     |   5 +-
 cpp/benchmarks/common/generate_input.hpp    |  42 ++++-
 cpp/benchmarks/decimal/convert_floating.cpp | 167 ++++++++++++++++++++
 4 files changed, 211 insertions(+), 8 deletions(-)
 create mode 100644 cpp/benchmarks/decimal/convert_floating.cpp

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index b384f6d5674..571780888c0 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -339,6 +339,11 @@ ConfigureNVBench(JSON_WRITER_NVBENCH io/json/json_writer.cpp)
 ConfigureNVBench(MULTIBYTE_SPLIT_NVBENCH io/text/multibyte_split.cpp)
 target_link_libraries(MULTIBYTE_SPLIT_NVBENCH PRIVATE ZLIB::ZLIB)
 
+# ##################################################################################################
+# * decimal benchmark
+# ---------------------------------------------------------------------------------
+ConfigureNVBench(DECIMAL_NVBENCH decimal/convert_floating.cpp)
+
 add_custom_target(
   run_benchmarks
   DEPENDS CUDF_BENCHMARKS
diff --git a/cpp/benchmarks/common/generate_input.cu b/cpp/benchmarks/common/generate_input.cu
index 9857aac4473..6df2cb44adc 100644
--- a/cpp/benchmarks/common/generate_input.cu
+++ b/cpp/benchmarks/common/generate_input.cu
@@ -324,10 +324,11 @@ struct random_value_fn<T, std::enable_if_t<cudf::is_fixed_point<T>()>> {
   distribution_fn<DeviceType> dist;
   std::optional<numeric::scale_type> scale;
 
-  random_value_fn(distribution_params<DeviceType> const& desc)
+  random_value_fn(distribution_params<T> const& desc)
     : lower_bound{desc.lower_bound},
       upper_bound{desc.upper_bound},
-      dist{make_distribution<DeviceType>(desc.id, desc.lower_bound, desc.upper_bound)}
+      dist{make_distribution<DeviceType>(desc.id, lower_bound, upper_bound)},
+      scale{desc.scale}
   {
   }
 
diff --git a/cpp/benchmarks/common/generate_input.hpp b/cpp/benchmarks/common/generate_input.hpp
index 31dc2673d70..68d3dc492f5 100644
--- a/cpp/benchmarks/common/generate_input.hpp
+++ b/cpp/benchmarks/common/generate_input.hpp
@@ -182,9 +182,17 @@ struct distribution_params<T, std::enable_if_t<std::is_same_v<T, cudf::struct_vi
   cudf::size_type max_depth;
 };
 
-// Present for compilation only. To be implemented once reader/writers support the fixed width type.
+/**
+ * @brief Fixed-point values are parameterized with a distribution type, scale, and bounds of the
+ * same type.
+ */
 template <typename T>
-struct distribution_params<T, std::enable_if_t<cudf::is_fixed_point<T>()>> {};
+struct distribution_params<T, std::enable_if_t<cudf::is_fixed_point<T>()>> {
+  distribution_id id;
+  typename T::rep lower_bound;
+  typename T::rep upper_bound;
+  std::optional<numeric::scale_type> scale;
+};
 
 /**
  * @brief Returns a vector of types, corresponding to the input type or a type group.
@@ -226,7 +234,7 @@ class data_profile {
     cudf::type_id::INT32, {distribution_id::GEOMETRIC, 0, 64}, 2};
   distribution_params<cudf::struct_view> struct_dist_desc{
     {cudf::type_id::INT32, cudf::type_id::FLOAT32, cudf::type_id::STRING}, 2};
-  std::map<cudf::type_id, distribution_params<__uint128_t>> decimal_params;
+  std::map<cudf::type_id, distribution_params<numeric::decimal128>> decimal_params;
 
   double bool_probability_true           = 0.5;
   std::optional<double> null_probability = 0.01;
@@ -300,16 +308,21 @@ class data_profile {
   }
 
   template <typename T, std::enable_if_t<cudf::is_fixed_point<T>()>* = nullptr>
-  distribution_params<typename T::rep> get_distribution_params() const
+  distribution_params<T> get_distribution_params() const
   {
     using rep = typename T::rep;
     auto it   = decimal_params.find(cudf::type_to_id<T>());
     if (it == decimal_params.end()) {
       auto const range = default_range<rep>();
-      return distribution_params<rep>{default_distribution_id<rep>(), range.first, range.second};
+      auto const scale = std::optional<numeric::scale_type>{};
+      return distribution_params<T>{
+        default_distribution_id<rep>(), range.first, range.second, scale};
     } else {
       auto& desc = it->second;
-      return {desc.id, static_cast<rep>(desc.lower_bound), static_cast<rep>(desc.upper_bound)};
+      return {desc.id,
+              static_cast<rep>(desc.lower_bound),
+              static_cast<rep>(desc.upper_bound),
+              desc.scale};
     }
   }
 
@@ -359,6 +372,23 @@ class data_profile {
     }
   }
 
+  // Users should pass integral values for bounds when setting the parameters for fixed-point.
+  // Otherwise the call with have no effect.
+  template <typename T,
+            typename Type_enum,
+            std::enable_if_t<cuda::std::is_integral_v<T>, T>* = nullptr>
+  void set_distribution_params(Type_enum type_or_group,
+                               distribution_id dist,
+                               T lower_bound,
+                               T upper_bound,
+                               numeric::scale_type scale)
+  {
+    for (auto tid : get_type_or_group(static_cast<int32_t>(type_or_group))) {
+      decimal_params[tid] = {
+        dist, static_cast<__int128_t>(lower_bound), static_cast<__int128_t>(upper_bound), scale};
+    }
+  }
+
   template <typename T, typename Type_enum, std::enable_if_t<cudf::is_chrono<T>(), T>* = nullptr>
   void set_distribution_params(Type_enum type_or_group,
                                distribution_id dist,
diff --git a/cpp/benchmarks/decimal/convert_floating.cpp b/cpp/benchmarks/decimal/convert_floating.cpp
new file mode 100644
index 00000000000..a367036c494
--- /dev/null
+++ b/cpp/benchmarks/decimal/convert_floating.cpp
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+
+#include <cudf/types.hpp>
+#include <cudf/unary.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+#include <type_traits>
+
+// This benchmark compares the cost of converting decimal <--> floating point
+template <typename InputType, typename OutputType>
+void bench_cast_decimal(nvbench::state& state, nvbench::type_list<InputType, OutputType>)
+{
+  static constexpr bool is_input_floating  = std::is_floating_point_v<InputType>;
+  static constexpr bool is_output_floating = std::is_floating_point_v<OutputType>;
+
+  static constexpr bool is_double =
+    std::is_same_v<InputType, double> || std::is_same_v<OutputType, double>;
+  static constexpr bool is_32bit =
+    std::is_same_v<InputType, numeric::decimal32> || std::is_same_v<OutputType, numeric::decimal32>;
+  static constexpr bool is_128bit = std::is_same_v<InputType, numeric::decimal128> ||
+                                    std::is_same_v<OutputType, numeric::decimal128>;
+
+  // Skip floating --> floating and decimal --> decimal
+  if constexpr (is_input_floating == is_output_floating) {
+    state.skip("Meaningless conversion.");
+    return;
+  }
+
+  // Skip float <--> dec128
+  if constexpr (!is_double && is_128bit) {
+    state.skip("Ignoring float <--> dec128.");
+    return;
+  }
+
+  // Get settings
+  auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const exp_mode = state.get_int64("exp_range");
+
+  // Exponent range: Range size is 10^6
+  // These probe the edges of the float and double ranges, as well as more common values
+  int const exp_min_array[] = {-307, -37, -14, -3, 8, 31, 301};
+  int const exp_range_size  = 6;
+  int const exp_min         = exp_min_array[exp_mode];
+  int const exp_max         = exp_min + exp_range_size;
+
+  // With exp range size of 6, decimal output (generated or casted-to) has 7 digits of precision
+  int const extra_digits_precision = 1;
+
+  // Exclude end range of double from float test
+  if (!is_double && ((exp_mode == 0) || (exp_mode == 6))) {
+    state.skip("Range beyond end of float tests.");
+    return;
+  }
+
+  // The current float <--> decimal conversion algorithm is limited
+  static constexpr bool is_64bit = !is_32bit && !is_128bit;
+  if (is_32bit && (exp_mode != 3)) {
+    state.skip("Decimal32 conversion only works up to scale factors of 10^9.");
+    return;
+  }
+  if (is_64bit && ((exp_mode < 2) || (exp_mode > 4))) {
+    state.skip("Decimal64 conversion only works up to scale factors of 10^18.");
+    return;
+  }
+  if (is_128bit && ((exp_mode == 0) || (exp_mode == 6))) {
+    state.skip("Decimal128 conversion only works up to scale factors of 10^38.");
+    return;
+  }
+
+  // Type IDs
+  auto const input_id  = cudf::type_to_id<InputType>();
+  auto const output_id = cudf::type_to_id<OutputType>();
+
+  // Create data profile and scale
+  auto const [output_scale, profile] = [&]() {
+    if constexpr (is_input_floating) {
+      // Range for generated floating point values
+      auto get_pow10 = [](auto exp10) {
+        return std::pow(static_cast<InputType>(10), static_cast<InputType>(exp10));
+      };
+      InputType const floating_range_min = get_pow10(exp_min);
+      InputType const floating_range_max = get_pow10(exp_max);
+
+      // With exp range size of 6, output has 7 decimal digits of precision
+      auto const decimal_output_scale = exp_min - extra_digits_precision;
+
+      // Input distribution
+      data_profile const profile = data_profile_builder().distribution(
+        input_id, distribution_id::NORMAL, floating_range_min, floating_range_max);
+
+      return std::pair{decimal_output_scale, profile};
+
+    } else {  // Generating decimals
+
+      using decimal_rep_type = typename InputType::rep;
+
+      // For exp range size 6 and precision 7, generates ints between 10 and 10^7,
+      // with scale factor of: exp_max - 7. This matches floating point generation.
+      int const digits_precision     = exp_range_size + extra_digits_precision;
+      auto const decimal_input_scale = numeric::scale_type{exp_max - digits_precision};
+
+      // Range for generated integer values
+      auto get_pow10 = [](auto exp10) {
+        return numeric::detail::ipow<decimal_rep_type, numeric::Radix::BASE_10>(exp10);
+      };
+      auto const decimal_range_min = get_pow10(digits_precision - exp_range_size);
+      auto const decimal_range_max = get_pow10(digits_precision);
+
+      // Input distribution
+      data_profile const profile = data_profile_builder().distribution(input_id,
+                                                                       distribution_id::NORMAL,
+                                                                       decimal_range_min,
+                                                                       decimal_range_max,
+                                                                       decimal_input_scale);
+
+      return std::pair{0, profile};
+    }
+  }();
+
+  // Generate input data
+  auto const input_col  = create_random_column(input_id, row_count{num_rows}, profile);
+  auto const input_view = input_col->view();
+
+  // Output type
+  auto const output_type =
+    !is_input_floating ? cudf::data_type(output_id) : cudf::data_type(output_id, output_scale);
+
+  // Stream
+  auto stream = cudf::get_default_stream();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
+
+  // Run benchmark
+  state.exec(nvbench::exec_tag::sync,
+             [&](nvbench::launch&) { cudf::cast(input_view, output_type); });
+
+  // Throughput statistics
+  state.add_element_count(num_rows);
+  state.add_global_memory_reads<InputType>(num_rows);
+  state.add_global_memory_writes<OutputType>(num_rows);
+}
+
+// Data types
+using data_types =
+  nvbench::type_list<float, double, numeric::decimal32, numeric::decimal64, numeric::decimal128>;
+
+NVBENCH_BENCH_TYPES(bench_cast_decimal, NVBENCH_TYPE_AXES(data_types, data_types))
+  .set_name("decimal_floating_conversion")
+  .set_type_axes_names({"InputType", "OutputType"})
+  .add_int64_power_of_two_axis("num_rows", {28})
+  .add_int64_axis("exp_range", nvbench::range(0, 6));

From 61e116eb873fca6f611c43aa909c177aeacb6f02 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Tue, 16 Apr 2024 11:08:09 -0700
Subject: [PATCH 062/842] Removing all batching code from parquet writer
 (#15528)

Fixes #13440. Removing the manually disabled batching code from parquet writer.

Authors:
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/15528
---
 cpp/src/io/parquet/writer_impl.cu  | 143 ++++++++++-------------------
 cpp/src/io/parquet/writer_impl.hpp |   2 -
 2 files changed, 46 insertions(+), 99 deletions(-)

diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index 5a8d96975ce..fd8d4f8bd7f 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -1396,16 +1396,13 @@ void init_encoder_pages(hostdevice_2dvector<EncColumnChunk>& chunks,
 }
 
 /**
- * @brief Encode a batch of pages.
+ * @brief Encode pages.
  *
  * @throws rmm::bad_alloc if there is insufficient space for temporary buffers
  *
  * @param chunks column chunk array
  * @param pages encoder pages array
- * @param pages_in_batch number of pages in this batch
- * @param first_page_in_batch first page in batch
- * @param rowgroups_in_batch number of rowgroups in this batch
- * @param first_rowgroup first rowgroup in batch
+ * @param num_rowgroups number of rowgroups
  * @param page_stats optional page-level statistics (nullptr if none)
  * @param chunk_stats optional chunk-level statistics (nullptr if none)
  * @param column_stats optional page-level statistics for column index (nullptr if none)
@@ -1417,10 +1414,6 @@ void init_encoder_pages(hostdevice_2dvector<EncColumnChunk>& chunks,
  */
 void encode_pages(hostdevice_2dvector<EncColumnChunk>& chunks,
                   device_span<EncPage> pages,
-                  uint32_t pages_in_batch,
-                  uint32_t first_page_in_batch,
-                  uint32_t rowgroups_in_batch,
-                  uint32_t first_rowgroup,
                   statistics_chunk const* page_stats,
                   statistics_chunk const* chunk_stats,
                   statistics_chunk const* column_stats,
@@ -1430,14 +1423,12 @@ void encode_pages(hostdevice_2dvector<EncColumnChunk>& chunks,
                   bool write_v2_headers,
                   rmm::cuda_stream_view stream)
 {
-  auto batch_pages = pages.subspan(first_page_in_batch, pages_in_batch);
+  auto const num_pages = pages.size();
+  auto pages_stats     = (page_stats != nullptr)
+                           ? device_span<statistics_chunk const>(page_stats, num_pages)
+                           : device_span<statistics_chunk const>();
 
-  auto batch_pages_stats =
-    (page_stats != nullptr)
-      ? device_span<statistics_chunk const>(page_stats + first_page_in_batch, pages_in_batch)
-      : device_span<statistics_chunk const>();
-
-  uint32_t max_comp_pages = (compression != Compression::UNCOMPRESSED) ? pages_in_batch : 0;
+  uint32_t max_comp_pages = (compression != Compression::UNCOMPRESSED) ? num_pages : 0;
 
   rmm::device_uvector<device_span<uint8_t const>> comp_in(max_comp_pages, stream);
   rmm::device_uvector<device_span<uint8_t>> comp_out(max_comp_pages, stream);
@@ -1447,7 +1438,7 @@ void encode_pages(hostdevice_2dvector<EncColumnChunk>& chunks,
                comp_res.end(),
                compression_result{0, compression_status::FAILURE});
 
-  EncodePages(batch_pages, write_v2_headers, comp_in, comp_out, comp_res, stream);
+  EncodePages(pages, write_v2_headers, comp_in, comp_out, comp_res, stream);
   switch (compression) {
     case Compression::SNAPPY:
       if (nvcomp::is_compression_disabled(nvcomp::compression_type::SNAPPY)) {
@@ -1480,25 +1471,23 @@ void encode_pages(hostdevice_2dvector<EncColumnChunk>& chunks,
   // TBD: Not clear if the official spec actually allows dynamically turning off compression at the
   // chunk-level
 
-  auto d_chunks_in_batch = chunks.device_view().subspan(first_rowgroup, rowgroups_in_batch);
-  DecideCompression(d_chunks_in_batch.flat_view(), stream);
-  EncodePageHeaders(batch_pages, comp_res, batch_pages_stats, chunk_stats, stream);
-  GatherPages(d_chunks_in_batch.flat_view(), pages, stream);
+  auto d_chunks = chunks.device_view();
+  DecideCompression(d_chunks.flat_view(), stream);
+  EncodePageHeaders(pages, comp_res, pages_stats, chunk_stats, stream);
+  GatherPages(d_chunks.flat_view(), pages, stream);
 
   // By now, the var_bytes has been calculated in InitPages, and the histograms in EncodePages.
   // EncodeColumnIndexes can encode the histograms in the ColumnIndex, and also sum up var_bytes
   // and the histograms for inclusion in the chunk's SizeStats.
   if (column_stats != nullptr) {
-    EncodeColumnIndexes(d_chunks_in_batch.flat_view(),
-                        {column_stats, pages.size()},
-                        column_index_truncate_length,
-                        stream);
+    EncodeColumnIndexes(
+      d_chunks.flat_view(), {column_stats, pages.size()}, column_index_truncate_length, stream);
   }
 
-  auto h_chunks_in_batch = chunks.host_view().subspan(first_rowgroup, rowgroups_in_batch);
-  CUDF_CUDA_TRY(cudaMemcpyAsync(h_chunks_in_batch.data(),
-                                d_chunks_in_batch.data(),
-                                d_chunks_in_batch.flat_view().size_bytes(),
+  auto h_chunks = chunks.host_view();
+  CUDF_CUDA_TRY(cudaMemcpyAsync(h_chunks.data(),
+                                d_chunks.data(),
+                                d_chunks.flat_view().size_bytes(),
                                 cudaMemcpyDefault,
                                 stream.value()));
 
@@ -1959,33 +1948,23 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
     std::fill_n(std::back_inserter(rg_to_part), num_rg_in_part[p], p);
   }
 
-  // Batch processing is no longer supported.
-  // This line disables batch processing (so batch size will no longer be limited at 1GB as before).
-  // TODO: All the relevant code will be removed in the follow-up work:
-  // https://github.com/rapidsai/cudf/issues/13440
-  auto const max_bytes_in_batch = std::numeric_limits<size_t>::max();
-
-  // Initialize batches of rowgroups to encode (mainly to limit peak memory usage)
-  std::vector<size_type> batch_list;
-  size_type num_pages           = 0;
-  size_t max_uncomp_bfr_size    = 0;
-  size_t max_comp_bfr_size      = 0;
-  size_t max_chunk_bfr_size     = 0;
-  size_type max_pages_in_batch  = 0;
-  size_t bytes_in_batch         = 0;
-  size_t comp_bytes_in_batch    = 0;
+  // Initialize rowgroups to encode
+  size_type num_pages        = 0;
+  size_t max_uncomp_bfr_size = 0;
+  size_t max_comp_bfr_size   = 0;
+  size_t max_chunk_bfr_size  = 0;
+
   size_t column_index_bfr_size  = 0;
   size_t def_histogram_bfr_size = 0;
   size_t rep_histogram_bfr_size = 0;
-  for (size_type r = 0, groups_in_batch = 0, pages_in_batch = 0; r <= num_rowgroups; r++) {
-    size_t rowgroup_size      = 0;
-    size_t comp_rowgroup_size = 0;
+  size_t rowgroup_size          = 0;
+  size_t comp_rowgroup_size     = 0;
+  for (size_type r = 0; r <= num_rowgroups; r++) {
     if (r < num_rowgroups) {
       for (int i = 0; i < num_columns; i++) {
         EncColumnChunk* ck = &chunks[r][i];
         ck->first_page     = num_pages;
         num_pages += ck->num_pages;
-        pages_in_batch += ck->num_pages;
         rowgroup_size += ck->bfr_size;
         comp_rowgroup_size += ck->compressed_size;
         max_chunk_bfr_size =
@@ -2007,29 +1986,17 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
         }
       }
     }
-    // TBD: We may want to also shorten the batch if we have enough pages (not just based on size)
-    if ((r == num_rowgroups) ||
-        (groups_in_batch != 0 && bytes_in_batch + rowgroup_size > max_bytes_in_batch)) {
-      max_uncomp_bfr_size = std::max(max_uncomp_bfr_size, bytes_in_batch);
-      max_comp_bfr_size   = std::max(max_comp_bfr_size, comp_bytes_in_batch);
-      max_pages_in_batch  = std::max(max_pages_in_batch, pages_in_batch);
-      if (groups_in_batch != 0) {
-        batch_list.push_back(groups_in_batch);
-        groups_in_batch = 0;
-      }
-      bytes_in_batch      = 0;
-      comp_bytes_in_batch = 0;
-      pages_in_batch      = 0;
+    // write bfr sizes if this is the last rowgroup
+    if (r == num_rowgroups) {
+      max_uncomp_bfr_size = rowgroup_size;
+      max_comp_bfr_size   = comp_rowgroup_size;
     }
-    bytes_in_batch += rowgroup_size;
-    comp_bytes_in_batch += comp_rowgroup_size;
-    groups_in_batch++;
   }
 
   // Clear compressed buffer size if compression has been turned off
   if (compression == Compression::UNCOMPRESSED) { max_comp_bfr_size = 0; }
 
-  // Initialize data pointers in batch
+  // Initialize data pointers
   uint32_t const num_stats_bfr =
     (stats_granularity != statistics_freq::STATISTICS_NONE) ? num_pages + num_chunks : 0;
 
@@ -2055,10 +2022,10 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
   auto bfr_i = static_cast<uint8_t*>(col_idx_bfr.data());
   auto bfr_r = rep_level_histogram.data();
   auto bfr_d = def_level_histogram.data();
-  for (auto b = 0, r = 0; b < static_cast<size_type>(batch_list.size()); b++) {
+  if (num_rowgroups != 0) {
     auto bfr   = static_cast<uint8_t*>(uncomp_bfr.data());
     auto bfr_c = static_cast<uint8_t*>(comp_bfr.data());
-    for (auto j = 0; j < batch_list[b]; j++, r++) {
+    for (auto r = 0; r < num_rowgroups; r++) {
       for (auto i = 0; i < num_columns; i++) {
         EncColumnChunk& ck   = chunks[r][i];
         ck.uncompressed_bfr  = bfr;
@@ -2108,22 +2075,11 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
   std::optional<writer_compression_statistics> comp_stats;
   if (collect_compression_statistics) { comp_stats = writer_compression_statistics{}; }
 
-  // Encode row groups in batches
-  for (auto b = 0, batch_r_start = 0; b < static_cast<size_type>(batch_list.size()); b++) {
-    // Count pages in this batch
-    auto const rnext               = batch_r_start + batch_list[b];
-    auto const first_page_in_batch = chunks[batch_r_start][0].first_page;
-    auto const first_page_in_next_batch =
-      (rnext < num_rowgroups) ? chunks[rnext][0].first_page : num_pages;
-    auto const pages_in_batch = first_page_in_next_batch - first_page_in_batch;
-
+  // Encode row groups
+  if (num_rowgroups != 0) {
     encode_pages(
       chunks,
       {pages.data(), pages.size()},
-      pages_in_batch,
-      first_page_in_batch,
-      batch_list[b],
-      batch_r_start,
       (stats_granularity == statistics_freq::STATISTICS_PAGE) ? page_stats.data() : nullptr,
       (stats_granularity != statistics_freq::STATISTICS_NONE) ? page_stats.data() + num_pages
                                                               : nullptr,
@@ -2152,7 +2108,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
       }
     }
 
-    for (int r = batch_r_start; r < rnext; r++) {
+    for (int r = 0; r < num_rowgroups; r++) {
       int p           = rg_to_part[r];
       int global_r    = global_rowgroup_base[p] + r - first_rg_in_part[p];
       auto& row_group = agg_meta->file(p).row_groups[global_r];
@@ -2192,7 +2148,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
       auto h_def_ptr = h_def_histogram.data();
       auto h_rep_ptr = h_rep_histogram.data();
 
-      for (int r = batch_r_start; r < rnext; r++) {
+      for (int r = 0; r < num_rowgroups; r++) {
         int const p        = rg_to_part[r];
         int const global_r = global_rowgroup_base[p] + r - first_rg_in_part[p];
         auto& row_group    = agg_meta->file(p).row_groups[global_r];
@@ -2239,8 +2195,6 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
         }
       }
     }
-
-    batch_r_start = rnext;
   }
 
   auto bounce_buffer =
@@ -2251,7 +2205,6 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
                     std::move(chunks),
                     std::move(global_rowgroup_base),
                     std::move(first_rg_in_part),
-                    std::move(batch_list),
                     std::move(rg_to_part),
                     std::move(comp_stats),
                     std::move(uncomp_bfr),
@@ -2358,7 +2311,6 @@ void writer::impl::write(table_view const& input, std::vector<partition_info> co
                          chunks,
                          global_rowgroup_base,
                          first_rg_in_part,
-                         batch_list,
                          rg_to_part,
                          comp_stats,
                          uncomp_bfr,   // unused, but contains data for later write to sink
@@ -2402,7 +2354,6 @@ void writer::impl::write(table_view const& input, std::vector<partition_info> co
                              chunks,
                              global_rowgroup_base,
                              first_rg_in_part,
-                             batch_list,
                              rg_to_part,
                              bounce_buffer);
 
@@ -2417,18 +2368,17 @@ void writer::impl::write_parquet_data_to_sink(
   host_2dspan<EncColumnChunk const> chunks,
   host_span<size_t const> global_rowgroup_base,
   host_span<int const> first_rg_in_part,
-  host_span<size_type const> batch_list,
   host_span<int const> rg_to_part,
   host_span<uint8_t> bounce_buffer)
 {
-  _agg_meta              = std::move(updated_agg_meta);
-  auto const num_columns = chunks.size().second;
+  _agg_meta                = std::move(updated_agg_meta);
+  auto const num_rowgroups = chunks.size().first;
+  auto const num_columns   = chunks.size().second;
 
-  for (auto b = 0, r = 0; b < static_cast<size_type>(batch_list.size()); b++) {
-    auto const rnext = r + batch_list[b];
+  if (num_rowgroups != 0) {
     std::vector<std::future<void>> write_tasks;
 
-    for (; r < rnext; r++) {
+    for (auto r = 0; r < static_cast<int>(num_rowgroups); r++) {
       int const p        = rg_to_part[r];
       int const global_r = global_rowgroup_base[p] + r - first_rg_in_part[p];
       auto& row_group    = _agg_meta->file(p).row_groups[global_r];
@@ -2472,10 +2422,9 @@ void writer::impl::write_parquet_data_to_sink(
     auto const h_pages = cudf::detail::make_host_vector_sync(pages, _stream);
 
     // add column and offset indexes to metadata
-    for (auto b = 0, r = 0; b < static_cast<size_type>(batch_list.size()); b++) {
-      auto const rnext   = r + batch_list[b];
-      auto curr_page_idx = chunks[r][0].first_page;
-      for (; r < rnext; r++) {
+    if (num_rowgroups != 0) {
+      auto curr_page_idx = chunks[0][0].first_page;
+      for (auto r = 0; r < static_cast<int>(num_rowgroups); r++) {
         int const p           = rg_to_part[r];
         int const global_r    = global_rowgroup_base[p] + r - first_rg_in_part[p];
         auto const& row_group = _agg_meta->file(p).row_groups[global_r];
diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp
index 2f6608b0ae7..3cbb7630fab 100644
--- a/cpp/src/io/parquet/writer_impl.hpp
+++ b/cpp/src/io/parquet/writer_impl.hpp
@@ -129,7 +129,6 @@ class writer::impl {
    * @param chunks Column chunks
    * @param global_rowgroup_base Numbers of rowgroups in each file/partition
    * @param first_rg_in_part The first rowgroup in each partition
-   * @param batch_list The batches of rowgroups to encode
    * @param rg_to_part A map from rowgroup to partition
    * @param[out] bounce_buffer Temporary host output buffer
    */
@@ -138,7 +137,6 @@ class writer::impl {
                                   host_2dspan<EncColumnChunk const> chunks,
                                   host_span<size_t const> global_rowgroup_base,
                                   host_span<int const> first_rg_in_part,
-                                  host_span<size_type const> batch_list,
                                   host_span<int const> rg_to_part,
                                   host_span<uint8_t> bounce_buffer);
 

From c32274f3c869ae054df8e588375be6dd852e7161 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Tue, 16 Apr 2024 13:11:39 -0500
Subject: [PATCH 063/842] Remove checks dependency from static-configure test
 job. (#15542)

The `static-configure` CI job had an erroneous dependency on `checks`. That job exists for PRs but not nightly tests, and caused nightly failures.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Ray Douglass (https://github.com/raydouglass)

URL: https://github.com/rapidsai/cudf/pull/15542
---
 .github/workflows/test.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index c5ae2f3b5a8..170f45e23fd 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -44,7 +44,6 @@ jobs:
       container_image: "rapidsai/ci-conda:latest"
       run_script: "ci/test_cpp_memcheck.sh"
   static-configure:
-    needs: checks
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06
     with:

From f0be36bedd9a7d7c03d4b90666136070d650f22c Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Tue, 16 Apr 2024 13:54:39 -0500
Subject: [PATCH 064/842] Switch back to 24.06 branch for pandas tests (#15543)

This PR switches the `custom-build` shared workflow branch back to `branch-24.06` which now contains the necessary upload artifact changes.

Authors:
   - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
   - AJ Schmidt (https://github.com/ajschmidt8)
---
 .github/workflows/pr.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index f84b1f42928..f9d5976f1fe 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -182,7 +182,7 @@ jobs:
   pandas-tests-diff:
     # diff the results of running the Pandas unit tests and publish a job summary
     needs: pandas-tests
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@patch-1
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06
     with:
         node_type: cpu4
         build_type: pull-request

From b9d9af16df48b6e9f7c72cc10d1462210105c285 Mon Sep 17 00:00:00 2001
From: Robert Maynard <rmaynard@nvidia.com>
Date: Tue, 16 Apr 2024 16:53:41 -0400
Subject: [PATCH 065/842] Extend cudf devcontainers to specify jitify2 kernel
 cache (#15068)

This ensures that inside devcontainers, the helper clean commands will also remvoe the jitify2 cache as it is part of the build directory.

Authors:
  - Robert Maynard (https://github.com/robertmaynard)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15068
---
 .devcontainer/Dockerfile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
index c19bb68986f..8190b5d0297 100644
--- a/.devcontainer/Dockerfile
+++ b/.devcontainer/Dockerfile
@@ -33,3 +33,4 @@ ENV SCCACHE_REGION="us-east-2"
 ENV SCCACHE_BUCKET="rapids-sccache-devs"
 ENV VAULT_HOST="https://vault.ops.k8s.rapids.ai"
 ENV HISTFILE="/home/coder/.cache/._bash_history"
+ENV LIBCUDF_KERNEL_CACHE_PATH="/home/coder/cudf/cpp/build/${PYTHON_PACKAGE_MANAGER}/cuda-${CUDA_VERSION}/latest/jitify_cache"

From 690e55807925bf1a69e0ab4932723dc204e53bdd Mon Sep 17 00:00:00 2001
From: Ed Seidl <etseidl@users.noreply.github.com>
Date: Tue, 16 Apr 2024 14:06:19 -0700
Subject: [PATCH 066/842] Fix for some compiler warnings in
 parquet/page_decode.cuh (#15518)

Clangd generates several warnings/errors in cpp/src/io/parquet/page_decode.cuh. One is in regards to a lambda argument shadowing a captured value. The others involve the use of `thrust::optional::value()` in device code...unlike pretty much every other member function, `value()` lacks the `__device__` decorator. This PR replaces two usages of `value()` with `operator*()` which does have the `__device__` decorator.

Authors:
  - Ed Seidl (https://github.com/etseidl)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Nghia Truong (https://github.com/ttnghia)
  - Paul Mattione (https://github.com/pmattione-nvidia)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: https://github.com/rapidsai/cudf/pull/15518
---
 cpp/src/io/parquet/page_decode.cuh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh
index fa1de5f301d..83bf7fb0d73 100644
--- a/cpp/src/io/parquet/page_decode.cuh
+++ b/cpp/src/io/parquet/page_decode.cuh
@@ -924,7 +924,7 @@ inline __device__ uint32_t InitLevelSection(page_state_s* s,
 
   auto start = cur;
 
-  auto init_rle = [s, lvl, end, level_bits](uint8_t const* cur, uint8_t const* end) {
+  auto init_rle = [s, lvl, level_bits](uint8_t const* cur, uint8_t const* end) {
     uint32_t const run      = get_vlq32(cur, end);
     s->initial_rle_run[lvl] = run;
     if (!(run & 1)) {
@@ -1160,7 +1160,7 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s,
             int32_t units = 0;
             // Duration types are not included because no scaling is done when reading
             if (s->col.logical_type.has_value()) {
-              auto const& lt = s->col.logical_type.value();
+              auto const& lt = *s->col.logical_type;
               if (lt.is_timestamp_millis()) {
                 units = cudf::timestamp_ms::period::den;
               } else if (lt.is_timestamp_micros()) {
@@ -1217,7 +1217,7 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s,
       } else if (data_type == INT32) {
         // check for smaller bitwidths
         if (s->col.logical_type.has_value()) {
-          auto const& lt = s->col.logical_type.value();
+          auto const& lt = *s->col.logical_type;
           if (lt.type == LogicalType::INTEGER) {
             s->dtype_len = lt.bit_width() / 8;
           } else if (lt.is_time_millis()) {

From b378b13560165c476ab730fb53638b67dbc469fa Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 16 Apr 2024 11:57:07 -1000
Subject: [PATCH 067/842] Enable more ignored pandas unit tests for cudf.pandas
 (#15535)

If these test actually crash pytest workers, will add to `TEST_THAT_CRASH_PYTEST_WORKERS`

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15535
---
 python/cudf/cudf/pandas/scripts/run-pandas-tests.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
index e21c4572e44..07ec5c8bc0c 100755
--- a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
+++ b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
@@ -138,7 +138,7 @@ and not test_eof_states"
 # TODO: Remove "not db" once a postgres & mysql container is set up on the CI
 PANDAS_CI="1" timeout 30m python -m pytest -p cudf.pandas \
     -v -m "not single_cpu and not db" \
-    -k "not test_to_parquet_gcs_new_file and not test_qcut_nat and not test_add and not test_ismethods and $TEST_THAT_NEED_MOTO_SERVER and $TEST_THAT_CRASH_PYTEST_WORKERS" \
+    -k "$TEST_THAT_NEED_MOTO_SERVER and $TEST_THAT_CRASH_PYTEST_WORKERS" \
     --import-mode=importlib \
     ${PYTEST_IGNORES} \
     "$@" || [ $? = 1 ]  # Exit success if exit code was 1 (permit test failures but not other errors)

From 02f8e2fc882ae58cc74053ea631e27ab27dfbe53 Mon Sep 17 00:00:00 2001
From: Robert Maynard <rmaynard@nvidia.com>
Date: Tue, 16 Apr 2024 21:19:32 -0400
Subject: [PATCH 068/842] Fea/move to latest nanoarrow (#15526)

Move to the latest nightly build of nano arrow so that we don't need to keep patches around for it.

Authors:
  - Robert Maynard (https://github.com/robertmaynard)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15526
---
 cpp/cmake/thirdparty/get_nanoarrow.cmake      |  13 +-
 .../thirdparty/patches/nanoarrow_cmake.diff   | 184 ------------------
 .../patches/nanoarrow_override.json           |  18 --
 3 files changed, 6 insertions(+), 209 deletions(-)
 delete mode 100644 cpp/cmake/thirdparty/patches/nanoarrow_cmake.diff
 delete mode 100644 cpp/cmake/thirdparty/patches/nanoarrow_override.json

diff --git a/cpp/cmake/thirdparty/get_nanoarrow.cmake b/cpp/cmake/thirdparty/get_nanoarrow.cmake
index dc0b8d09746..025bff7d8f0 100644
--- a/cpp/cmake/thirdparty/get_nanoarrow.cmake
+++ b/cpp/cmake/thirdparty/get_nanoarrow.cmake
@@ -14,16 +14,15 @@
 
 # This function finds nanoarrow and sets any additional necessary environment variables.
 function(find_and_configure_nanoarrow)
-  include(${rapids-cmake-dir}/cpm/package_override.cmake)
-
-  set(cudf_patch_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/patches")
-  rapids_cpm_package_override("${cudf_patch_dir}/nanoarrow_override.json")
-
-  # The git_repo and git_tag are provided by the nanoarrow_override file
+  # Currently we need to always build nanoarrow so we don't pickup a previous installed version
+  set(CPM_DOWNLOAD_nanoarrow ON)
   rapids_cpm_find(
-    nanoarrow 0.4.0
+    nanoarrow 0.5.0
     GLOBAL_TARGETS nanoarrow
     CPM_ARGS
+    GIT_REPOSITORY https://github.com/apache/arrow-nanoarrow.git
+    GIT_TAG 11e73a8c85b45e3d49c8c541b4e1497a649fe03c
+    GIT_SHALLOW FALSE
     OPTIONS "BUILD_SHARED_LIBS OFF" "NANOARROW_NAMESPACE cudf"
   )
   set_target_properties(nanoarrow PROPERTIES POSITION_INDEPENDENT_CODE ON)
diff --git a/cpp/cmake/thirdparty/patches/nanoarrow_cmake.diff b/cpp/cmake/thirdparty/patches/nanoarrow_cmake.diff
deleted file mode 100644
index 1262a38c0a4..00000000000
--- a/cpp/cmake/thirdparty/patches/nanoarrow_cmake.diff
+++ /dev/null
@@ -1,184 +0,0 @@
-diff --git a/CMakeLists.txt b/CMakeLists.txt
-index 8714c70..6a9e505 100644
---- a/CMakeLists.txt
-+++ b/CMakeLists.txt
-@@ -49,7 +49,6 @@ else()
- endif()
-
- option(NANOARROW_CODE_COVERAGE "Enable coverage reporting" OFF)
--add_library(coverage_config INTERFACE)
-
- # Avoids a warning about timestamps on downloaded files (prefer new policy
- # if available))
-@@ -59,6 +58,7 @@ endif()
-
- configure_file(src/nanoarrow/nanoarrow_config.h.in generated/nanoarrow_config.h)
-
-+include(GNUInstallDirs)
- if(NANOARROW_BUNDLE)
-   # Combine all headers into amalgamation/nanoarrow.h in the build directory
-   file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/amalgamation)
-@@ -111,6 +111,8 @@ if(NANOARROW_BUNDLE)
-   if(NANOARROW_BUILD_TESTS)
-     include_directories(${CMAKE_BINARY_DIR}/amalgamation)
-     add_library(nanoarrow ${NANOARROW_C_TEMP})
-+    add_library(nanoarrow::nanoarrow ALIAS nanoarrow)
-+
-     target_compile_definitions(nanoarrow PUBLIC "$<$<CONFIG:Debug>:NANOARROW_DEBUG>")
-   endif()
-
-@@ -120,10 +122,11 @@ if(NANOARROW_BUNDLE)
- else()
-   add_library(nanoarrow src/nanoarrow/array.c src/nanoarrow/schema.c
-                         src/nanoarrow/array_stream.c src/nanoarrow/utils.c)
-+  add_library(nanoarrow::nanoarrow ALIAS nanoarrow)
-
-   target_include_directories(nanoarrow
-                              PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src>
--                                    $<INSTALL_INTERFACE:include>)
-+                                    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
-   target_include_directories(nanoarrow
-                              PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_BINARY_DIR}/generated>
-   )
-@@ -154,13 +157,49 @@ else()
-     endif()
-   endif()
-
--  install(TARGETS nanoarrow DESTINATION lib)
-+  install(TARGETS nanoarrow
-+          DESTINATION "${CMAKE_INSTALL_LIBDIR}"
-+          EXPORT nanoarrow-exports)
-   install(DIRECTORY src/
--          DESTINATION include
-+          DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}"
-           FILES_MATCHING
--          PATTERN "*.h")
-+          PATTERN "*.h*")
-   install(FILES ${CMAKE_CURRENT_BINARY_DIR}/generated/nanoarrow_config.h
--          DESTINATION include/nanoarrow)
-+          DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/nanoarrow")
-+
-+  # Generate package files for the build and install trees.
-+  include(CMakePackageConfigHelpers)
-+
-+  foreach(tree_type BUILD INSTALL)
-+    if(tree_type STREQUAL "BUILD")
-+      set(install_location ".")
-+    else()
-+      set(install_location "${CMAKE_INSTALL_LIBDIR}/cmake/nanoarrow")
-+    endif()
-+
-+    set(build_location "${PROJECT_BINARY_DIR}/${install_location}")
-+    write_basic_package_version_file(
-+      "${build_location}/nanoarrow-config-version.cmake"
-+      VERSION ${nanoarrow_VERSION}
-+      # After 1.0.0, we can use `SameMajorVersion` here.
-+      COMPATIBILITY ExactVersion)
-+    configure_package_config_file("${CMAKE_CURRENT_LIST_DIR}/cmake/config.cmake.in"
-+                                  "${build_location}/nanoarrow-config.cmake"
-+                                  INSTALL_DESTINATION "${install_location}")
-+
-+    if(tree_type STREQUAL "BUILD")
-+      export(EXPORT nanoarrow-exports
-+             FILE "${build_location}/nanoarrow-targets.cmake"
-+             NAMESPACE nanoarrow::)
-+
-+    else()
-+      install(DIRECTORY "${build_location}/" DESTINATION "${install_location}")
-+      install(EXPORT nanoarrow-exports
-+              DESTINATION "${install_location}"
-+              FILE "nanoarrow-targets.cmake"
-+              NAMESPACE nanoarrow::)
-+    endif()
-+  endforeach()
- endif()
-
- # Always build integration test if building tests
-@@ -171,7 +210,7 @@ if(NANOARROW_BUILD_TESTS OR NANOARROW_BUILD_INTEGRATION_TESTS)
-               src/nanoarrow/integration/c_data_integration.cc)
-   target_include_directories(nanoarrow_c_data_integration
-                              PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/src>
--                                    $<INSTALL_INTERFACE:include>)
-+                                    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)
-   target_link_libraries(nanoarrow_c_data_integration PRIVATE nanoarrow nlohmann_json)
- endif()
-
-@@ -215,34 +254,18 @@ if(NANOARROW_BUILD_TESTS)
-                  src/nanoarrow/integration/c_data_integration_test.cc)
-
-   if(NANOARROW_CODE_COVERAGE)
--    target_compile_options(coverage_config INTERFACE -O0 -g --coverage)
--    target_link_options(coverage_config INTERFACE --coverage)
--    target_link_libraries(nanoarrow coverage_config)
-+    target_compile_options(nanoarrow PUBLIC -O0 -g --coverage)
-+    target_link_options(nanoarrow PUBLIC --coverage)
-   endif()
-
--  target_link_libraries(utils_test
--                        nanoarrow
--                        gtest_main
--                        ${NANOARROW_ARROW_TARGET}
--                        coverage_config)
--  target_link_libraries(buffer_test nanoarrow gtest_main coverage_config)
--  target_link_libraries(array_test
--                        nanoarrow
--                        gtest_main
--                        ${NANOARROW_ARROW_TARGET}
--                        coverage_config)
--  target_link_libraries(schema_test
--                        nanoarrow
--                        gtest_main
--                        ${NANOARROW_ARROW_TARGET}
--                        coverage_config)
--  target_link_libraries(array_stream_test nanoarrow gtest_main coverage_config)
--  target_link_libraries(nanoarrow_hpp_test nanoarrow gtest_main coverage_config)
--  target_link_libraries(nanoarrow_testing_test
--                        nanoarrow
--                        gtest_main
--                        nlohmann_json::nlohmann_json
--                        coverage_config)
-+  target_link_libraries(utils_test nanoarrow gtest_main ${NANOARROW_ARROW_TARGET})
-+  target_link_libraries(buffer_test nanoarrow gtest_main)
-+  target_link_libraries(array_test nanoarrow gtest_main ${NANOARROW_ARROW_TARGET})
-+  target_link_libraries(schema_test nanoarrow gtest_main ${NANOARROW_ARROW_TARGET})
-+  target_link_libraries(array_stream_test nanoarrow gtest_main)
-+  target_link_libraries(nanoarrow_hpp_test nanoarrow gtest_main)
-+  target_link_libraries(nanoarrow_testing_test nanoarrow gtest_main
-+                        nlohmann_json::nlohmann_json)
-   target_link_libraries(c_data_integration_test nanoarrow nanoarrow_c_data_integration
-                         gtest_main)
-
-diff --git a/cmake/config.cmake.in b/cmake/config.cmake.in
-new file mode 100644
-index 0000000..021dc31
---- /dev/null
-+++ b/cmake/config.cmake.in
-@@ -0,0 +1,28 @@
-+# Licensed to the Apache Software Foundation (ASF) under one
-+# or more contributor license agreements.  See the NOTICE file
-+# distributed with this work for additional information
-+# regarding copyright ownership.  The ASF licenses this file
-+# to you under the Apache License, Version 2.0 (the
-+# "License"); you may not use this file except in compliance
-+# with the License.  You may obtain a copy of the License at
-+#
-+# http://www.apache.org/licenses/LICENSE-2.0
-+#
-+# Unless required by applicable law or agreed to in writing,
-+# software distributed under the License is distributed on an
-+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-+# KIND, either express or implied.  See the License for the
-+# specific language governing permissions and limitations
-+# under the License.
-+
-+
-+@PACKAGE_INIT@
-+
-+cmake_minimum_required(VERSION @CMAKE_MINIMUM_REQUIRED_VERSION@)
-+
-+include("${CMAKE_CURRENT_LIST_DIR}/nanoarrow-targets.cmake" REQUIRED)
-+include("${CMAKE_CURRENT_LIST_DIR}/nanoarrow-config-version.cmake" REQUIRED)
-+
-+set(${CMAKE_FIND_PACKAGE_NAME}_CONFIG "${CMAKE_CURRENT_LIST_FILE}")
-+include(FindPackageHandleStandardArgs)
-+find_package_handle_standard_args(${CMAKE_FIND_PACKAGE_NAME} CONFIG_MODE)
diff --git a/cpp/cmake/thirdparty/patches/nanoarrow_override.json b/cpp/cmake/thirdparty/patches/nanoarrow_override.json
deleted file mode 100644
index 0b83d1808cb..00000000000
--- a/cpp/cmake/thirdparty/patches/nanoarrow_override.json
+++ /dev/null
@@ -1,18 +0,0 @@
-
-{
-  "packages" : {
-    "nanoarrow" : {
-      "version" : "0.4.0",
-      "git_url" : "https://github.com/apache/arrow-nanoarrow.git",
-      "git_tag" : "c97720003ff863b81805bcdb9f7c91306ab6b6a8",
-      "git_shallow" : false,
-      "patches" : [
-        {
-          "file" : "${current_json_dir}/nanoarrow_cmake.diff",
-          "issue" : "Fix add support for global setup to initialize RMM in nvbench [https://github.com/NVIDIA/nvbench/pull/123]",
-          "fixed_in" : "0.5.0"
-        }
-      ]
-    }
-  }
-}

From 9192d259633c382c6f98f956dc7f43d754ebbf44 Mon Sep 17 00:00:00 2001
From: Mark Harris <783069+harrism@users.noreply.github.com>
Date: Wed, 17 Apr 2024 22:21:38 +1000
Subject: [PATCH 069/842] Convert libcudf resource parameters to
 rmm::device_async_resource_ref (#15507)

Closes https://github.com/rapidsai/cudf/issues/15498

For reviewers:
Almost all of the thousands of changes are simple textual replace of `rmm::mr::device_memory_resource *` with `rmm::device_async_resource_ref`.

I think the only substantial changes that are different are in `contiguous_split.cu` (which was assigning `nullptr` to the MR pointer -- I have changed these cases to use a `std::optional<rmm::device_async_resource_ref>`), and in JNI code.

~I still need to figure out how to build and test the JNI bindings. And figure out necessary Cython changes.~

JNI is passing CI now. Cython required no changes.

Authors:
  - Mark Harris (https://github.com/harrism)

Approvers:
  - Jason Lowe (https://github.com/jlowe)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/15507
---
 cpp/benchmarks/copying/shift.cu               |  10 +-
 cpp/benchmarks/fixture/benchmark_fixture.hpp  |   1 +
 .../cudf/ast/detail/expression_parser.hpp     |  10 +-
 cpp/include/cudf/binaryop.hpp                 |  23 +--
 cpp/include/cudf/column/column.hpp            |   9 +-
 cpp/include/cudf/column/column_factories.hpp  |  75 ++++----
 cpp/include/cudf/concatenate.hpp              |  15 +-
 cpp/include/cudf/contiguous_split.hpp         |  10 +-
 cpp/include/cudf/copying.hpp                  |  77 ++++----
 cpp/include/cudf/datetime.hpp                 |  43 ++---
 cpp/include/cudf/detail/binaryop.hpp          |  19 +-
 .../detail/calendrical_month_sequence.cuh     |   5 +-
 cpp/include/cudf/detail/concatenate.hpp       |  11 +-
 cpp/include/cudf/detail/concatenate_masks.hpp |   7 +-
 cpp/include/cudf/detail/contiguous_split.hpp  |   7 +-
 cpp/include/cudf/detail/copy.hpp              |  35 ++--
 cpp/include/cudf/detail/copy_if.cuh           |   7 +-
 cpp/include/cudf/detail/copy_if_else.cuh      |   3 +-
 cpp/include/cudf/detail/copy_range.cuh        |   3 +-
 cpp/include/cudf/detail/datetime.hpp          |  66 +++----
 .../cudf/detail/distinct_hash_join.cuh        |   5 +-
 cpp/include/cudf/detail/fill.hpp              |   5 +-
 cpp/include/cudf/detail/gather.cuh            |  17 +-
 cpp/include/cudf/detail/gather.hpp            |   9 +-
 cpp/include/cudf/detail/groupby.hpp           |   5 +-
 .../detail/groupby/group_replace_nulls.hpp    |   5 +-
 .../cudf/detail/groupby/sort_helper.hpp       |  11 +-
 .../cudf/detail/hash_reduce_by_row.cuh        |   3 +-
 cpp/include/cudf/detail/interop.hpp           |  14 +-
 cpp/include/cudf/detail/join.hpp              |  13 +-
 cpp/include/cudf/detail/label_bins.hpp        |   5 +-
 cpp/include/cudf/detail/merge.hpp             |   7 +-
 cpp/include/cudf/detail/null_mask.cuh         |  11 +-
 cpp/include/cudf/detail/null_mask.hpp         |  25 +--
 cpp/include/cudf/detail/quantiles.hpp         |  11 +-
 cpp/include/cudf/detail/repeat.hpp            |  11 +-
 cpp/include/cudf/detail/replace.hpp           |  27 +--
 cpp/include/cudf/detail/reshape.hpp           |   7 +-
 cpp/include/cudf/detail/rolling.hpp           |   7 +-
 cpp/include/cudf/detail/round.hpp             |   7 +-
 cpp/include/cudf/detail/scan.hpp              |  13 +-
 cpp/include/cudf/detail/scatter.cuh           |  17 +-
 cpp/include/cudf/detail/scatter.hpp           |  19 +-
 cpp/include/cudf/detail/search.hpp            |  15 +-
 cpp/include/cudf/detail/sequence.hpp          |  13 +-
 .../cudf/detail/sizes_to_offsets_iterator.cuh |   3 +-
 cpp/include/cudf/detail/sorting.hpp           |  23 +--
 cpp/include/cudf/detail/stream_compaction.hpp |  21 ++-
 cpp/include/cudf/detail/structs/utilities.hpp |  11 +-
 cpp/include/cudf/detail/tdigest/tdigest.hpp   |  17 +-
 cpp/include/cudf/detail/timezone.hpp          |   7 +-
 cpp/include/cudf/detail/transform.hpp         |  19 +-
 cpp/include/cudf/detail/transpose.hpp         |   5 +-
 cpp/include/cudf/detail/unary.hpp             |  15 +-
 .../detail/utilities/vector_factories.hpp     |  23 +--
 cpp/include/cudf/detail/valid_if.cuh          |   3 +-
 .../cudf/dictionary/detail/concatenate.hpp    |   5 +-
 cpp/include/cudf/dictionary/detail/encode.hpp |   7 +-
 cpp/include/cudf/dictionary/detail/merge.hpp  |   5 +-
 .../cudf/dictionary/detail/replace.hpp        |   7 +-
 cpp/include/cudf/dictionary/detail/search.hpp |   9 +-
 .../cudf/dictionary/detail/update_keys.hpp    |  17 +-
 .../cudf/dictionary/dictionary_factories.hpp  |  11 +-
 cpp/include/cudf/dictionary/encode.hpp        |  13 +-
 cpp/include/cudf/dictionary/search.hpp        |   7 +-
 cpp/include/cudf/dictionary/update_keys.hpp   |  23 +--
 cpp/include/cudf/filling.hpp                  |  27 +--
 cpp/include/cudf/groupby.hpp                  |  25 +--
 cpp/include/cudf/hashing.hpp                  |  43 ++---
 cpp/include/cudf/hashing/detail/hashing.hpp   |  19 +-
 cpp/include/cudf/interop.hpp                  |  22 ++-
 cpp/include/cudf/io/avro.hpp                  |   5 +-
 cpp/include/cudf/io/csv.hpp                   |  11 +-
 cpp/include/cudf/io/detail/avro.hpp           |   5 +-
 cpp/include/cudf/io/detail/csv.hpp            |   7 +-
 cpp/include/cudf/io/detail/json.hpp           |   9 +-
 cpp/include/cudf/io/detail/orc.hpp            |   3 +-
 cpp/include/cudf/io/detail/parquet.hpp        |   5 +-
 cpp/include/cudf/io/detail/tokenize_json.hpp  |   5 +-
 cpp/include/cudf/io/json.hpp                  |   9 +-
 cpp/include/cudf/io/orc.hpp                   |   5 +-
 cpp/include/cudf/io/parquet.hpp               |  13 +-
 .../cudf/io/text/detail/tile_state.hpp        |   4 +-
 cpp/include/cudf/io/text/detail/trie.hpp      |   7 +-
 cpp/include/cudf/io/text/multibyte_split.hpp  |  11 +-
 cpp/include/cudf/join.hpp                     |  90 ++++-----
 cpp/include/cudf/json/json.hpp                |   9 +-
 cpp/include/cudf/labeling/label_bins.hpp      |   5 +-
 cpp/include/cudf/lists/combine.hpp            |   7 +-
 cpp/include/cudf/lists/contains.hpp           |  27 +--
 cpp/include/cudf/lists/count_elements.hpp     |   7 +-
 cpp/include/cudf/lists/detail/combine.hpp     |   8 +-
 cpp/include/cudf/lists/detail/concatenate.hpp |   5 +-
 cpp/include/cudf/lists/detail/contains.hpp    |  20 +-
 cpp/include/cudf/lists/detail/copying.hpp     |   5 +-
 cpp/include/cudf/lists/detail/extract.hpp     |  12 +-
 cpp/include/cudf/lists/detail/gather.cuh      |  15 +-
 .../cudf/lists/detail/interleave_columns.hpp  |   5 +-
 .../lists/detail/lists_column_factories.hpp   |   9 +-
 cpp/include/cudf/lists/detail/reverse.hpp     |   6 +-
 cpp/include/cudf/lists/detail/scatter.cuh     |   9 +-
 .../cudf/lists/detail/scatter_helper.cuh      |   5 +-
 .../cudf/lists/detail/set_operations.hpp      |   9 +-
 cpp/include/cudf/lists/detail/sorting.hpp     |   7 +-
 .../cudf/lists/detail/stream_compaction.hpp   |   9 +-
 cpp/include/cudf/lists/explode.hpp            |  11 +-
 cpp/include/cudf/lists/extract.hpp            |  11 +-
 cpp/include/cudf/lists/filling.hpp            |  11 +-
 cpp/include/cudf/lists/gather.hpp             |   9 +-
 cpp/include/cudf/lists/reverse.hpp            |   7 +-
 cpp/include/cudf/lists/set_operations.hpp     |  35 ++--
 cpp/include/cudf/lists/sorting.hpp            |  11 +-
 cpp/include/cudf/lists/stream_compaction.hpp  |  15 +-
 cpp/include/cudf/merge.hpp                    |   5 +-
 cpp/include/cudf/null_mask.hpp                |  23 +--
 cpp/include/cudf/partitioning.hpp             |  15 +-
 cpp/include/cudf/quantiles.hpp                |  15 +-
 cpp/include/cudf/reduction.hpp                |  17 +-
 .../cudf/reduction/detail/histogram.hpp       |   5 +-
 .../cudf/reduction/detail/reduction.cuh       |   9 +-
 .../cudf/reduction/detail/reduction.hpp       |   8 +-
 .../reduction/detail/reduction_functions.hpp  |  37 ++--
 .../detail/segmented_reduction_functions.hpp  |  25 +--
 cpp/include/cudf/replace.hpp                  |  39 ++--
 cpp/include/cudf/reshape.hpp                  |   9 +-
 cpp/include/cudf/rolling.hpp                  |  33 ++--
 cpp/include/cudf/round.hpp                    |   7 +-
 cpp/include/cudf/scalar/scalar.hpp            | 173 +++++++++---------
 cpp/include/cudf/scalar/scalar_factories.hpp  |  51 +++---
 cpp/include/cudf/search.hpp                   |  15 +-
 cpp/include/cudf/sorting.hpp                  |  25 +--
 cpp/include/cudf/stream_compaction.hpp        |  43 ++---
 cpp/include/cudf/strings/attributes.hpp       |   9 +-
 cpp/include/cudf/strings/capitalize.hpp       |  15 +-
 cpp/include/cudf/strings/case.hpp             |  15 +-
 .../cudf/strings/char_types/char_types.hpp    |   7 +-
 cpp/include/cudf/strings/combine.hpp          |  27 +--
 cpp/include/cudf/strings/contains.hpp         |  19 +-
 .../cudf/strings/convert/convert_booleans.hpp |  11 +-
 .../cudf/strings/convert/convert_datetime.hpp |  19 +-
 .../strings/convert/convert_durations.hpp     |  13 +-
 .../strings/convert/convert_fixed_point.hpp   |  17 +-
 .../cudf/strings/convert/convert_floats.hpp   |  15 +-
 .../cudf/strings/convert/convert_integers.hpp |  31 ++--
 .../cudf/strings/convert/convert_ipv4.hpp     |  15 +-
 .../cudf/strings/convert/convert_lists.hpp    |   5 +-
 .../cudf/strings/convert/convert_urls.hpp     |  11 +-
 cpp/include/cudf/strings/detail/combine.hpp   |  15 +-
 .../cudf/strings/detail/concatenate.hpp       |   5 +-
 .../cudf/strings/detail/converters.hpp        |  51 +++---
 .../cudf/strings/detail/copy_if_else.cuh      |   3 +-
 .../cudf/strings/detail/copy_range.hpp        |   3 +-
 cpp/include/cudf/strings/detail/copying.hpp   |   7 +-
 cpp/include/cudf/strings/detail/fill.hpp      |   5 +-
 cpp/include/cudf/strings/detail/gather.cuh    |   7 +-
 cpp/include/cudf/strings/detail/merge.cuh     |   3 +-
 cpp/include/cudf/strings/detail/replace.hpp   |  17 +-
 cpp/include/cudf/strings/detail/scan.hpp      |   5 +-
 cpp/include/cudf/strings/detail/scatter.cuh   |   3 +-
 .../cudf/strings/detail/strings_children.cuh  |   7 +-
 .../detail/strings_column_factories.cuh       |   5 +-
 cpp/include/cudf/strings/detail/utilities.hpp |   5 +-
 cpp/include/cudf/strings/extract.hpp          |  11 +-
 cpp/include/cudf/strings/find.hpp             |  49 ++---
 cpp/include/cudf/strings/find_multiple.hpp    |   7 +-
 cpp/include/cudf/strings/findall.hpp          |   7 +-
 cpp/include/cudf/strings/padding.hpp          |  15 +-
 cpp/include/cudf/strings/repeat_strings.hpp   |  15 +-
 cpp/include/cudf/strings/replace.hpp          |  23 +--
 cpp/include/cudf/strings/replace_re.hpp       |  15 +-
 cpp/include/cudf/strings/reverse.hpp          |   7 +-
 cpp/include/cudf/strings/slice.hpp            |   9 +-
 cpp/include/cudf/strings/split/partition.hpp  |  15 +-
 cpp/include/cudf/strings/split/split.hpp      |  35 ++--
 cpp/include/cudf/strings/split/split_re.hpp   |  27 +--
 cpp/include/cudf/strings/strip.hpp            |  11 +-
 cpp/include/cudf/strings/translate.hpp        |  15 +-
 cpp/include/cudf/strings/wrap.hpp             |   7 +-
 .../cudf/structs/detail/concatenate.hpp       |   6 +-
 cpp/include/cudf/structs/detail/scan.hpp      |   5 +-
 cpp/include/cudf/table/table.hpp              |  11 +-
 cpp/include/cudf/timezone.hpp                 |   5 +-
 cpp/include/cudf/transform.hpp                |  20 +-
 cpp/include/cudf/transpose.hpp                |   5 +-
 cpp/include/cudf/unary.hpp                    |  25 +--
 cpp/include/cudf_test/base_fixture.hpp        |   9 +-
 cpp/include/nvtext/byte_pair_encoding.hpp     |  18 +-
 cpp/include/nvtext/detail/generate_ngrams.hpp |   7 +-
 cpp/include/nvtext/detail/load_hash_file.hpp  |   3 +-
 cpp/include/nvtext/detail/tokenize.hpp        |  19 +-
 cpp/include/nvtext/edit_distance.hpp          |  12 +-
 cpp/include/nvtext/generate_ngrams.hpp        |  18 +-
 cpp/include/nvtext/jaccard.hpp                |   8 +-
 cpp/include/nvtext/minhash.hpp                |  20 +-
 cpp/include/nvtext/ngrams_tokenize.hpp        |   8 +-
 cpp/include/nvtext/normalize.hpp              |  12 +-
 cpp/include/nvtext/replace.hpp                |   8 +-
 cpp/include/nvtext/stemmer.hpp                |  16 +-
 cpp/include/nvtext/subword_tokenize.hpp       |   8 +-
 cpp/include/nvtext/tokenize.hpp               |  36 ++--
 cpp/src/binaryop/binaryop.cpp                 |  29 +--
 cpp/src/binaryop/compiled/binary_ops.cu       |  19 +-
 cpp/src/binaryop/compiled/binary_ops.hpp      |  15 +-
 cpp/src/bitmask/null_mask.cu                  |  25 +--
 cpp/src/column/column.cu                      |   9 +-
 cpp/src/column/column_factories.cpp           |  16 +-
 cpp/src/column/column_factories.cu            |  16 +-
 cpp/src/copying/concatenate.cu                |  19 +-
 cpp/src/copying/contiguous_split.cu           |  49 ++---
 cpp/src/copying/copy.cpp                      |   7 +-
 cpp/src/copying/copy.cu                       |  39 ++--
 cpp/src/copying/copy_range.cu                 |  11 +-
 cpp/src/copying/gather.cu                     |   7 +-
 cpp/src/copying/get_element.cu                |  17 +-
 cpp/src/copying/pack.cpp                      |   7 +-
 cpp/src/copying/purge_nonempty_nulls.cu       |   8 +-
 cpp/src/copying/reverse.cu                    |   9 +-
 cpp/src/copying/sample.cu                     |   5 +-
 cpp/src/copying/scatter.cu                    |  37 ++--
 cpp/src/copying/segmented_shift.cu            |  11 +-
 cpp/src/copying/shift.cu                      |  11 +-
 cpp/src/datetime/datetime_ops.cu              |  89 +++++----
 cpp/src/datetime/timezone.cpp                 |   6 +-
 cpp/src/dictionary/add_keys.cu                |   7 +-
 cpp/src/dictionary/decode.cu                  |   7 +-
 cpp/src/dictionary/detail/concatenate.cu      |   5 +-
 cpp/src/dictionary/detail/merge.cu            |   5 +-
 cpp/src/dictionary/dictionary_factories.cu    |  11 +-
 cpp/src/dictionary/encode.cu                  |   7 +-
 cpp/src/dictionary/remove_keys.cu             |  13 +-
 cpp/src/dictionary/replace.cu                 |  13 +-
 cpp/src/dictionary/search.cu                  |  19 +-
 cpp/src/dictionary/set_keys.cu                |  15 +-
 cpp/src/filling/calendrical_month_sequence.cu |   7 +-
 cpp/src/filling/fill.cu                       |  13 +-
 cpp/src/filling/repeat.cu                     |   9 +-
 cpp/src/filling/sequence.cu                   |  15 +-
 cpp/src/groupby/common/utils.hpp              |   6 +-
 cpp/src/groupby/groupby.cu                    |  17 +-
 cpp/src/groupby/hash/groupby.cu               |  11 +-
 cpp/src/groupby/sort/aggregate.cpp            |   3 +-
 cpp/src/groupby/sort/functors.hpp             |   9 +-
 cpp/src/groupby/sort/group_argmax.cu          |   3 +-
 cpp/src/groupby/sort/group_argmin.cu          |   3 +-
 cpp/src/groupby/sort/group_collect.cu         |   7 +-
 cpp/src/groupby/sort/group_correlation.cu     |   5 +-
 cpp/src/groupby/sort/group_count.cu           |   5 +-
 cpp/src/groupby/sort/group_count_scan.cu      |   3 +-
 cpp/src/groupby/sort/group_histogram.cu       |   7 +-
 cpp/src/groupby/sort/group_m2.cu              |   7 +-
 cpp/src/groupby/sort/group_max.cu             |   3 +-
 cpp/src/groupby/sort/group_max_scan.cu        |   3 +-
 cpp/src/groupby/sort/group_merge_lists.cu     |   5 +-
 cpp/src/groupby/sort/group_merge_m2.cu        |   5 +-
 cpp/src/groupby/sort/group_min.cu             |   3 +-
 cpp/src/groupby/sort/group_min_scan.cu        |   3 +-
 cpp/src/groupby/sort/group_nth_element.cu     |   3 +-
 cpp/src/groupby/sort/group_nunique.cu         |   5 +-
 cpp/src/groupby/sort/group_product.cu         |   3 +-
 cpp/src/groupby/sort/group_product_scan.cu    |   3 +-
 cpp/src/groupby/sort/group_quantiles.cu       |   5 +-
 cpp/src/groupby/sort/group_rank_scan.cu       |  17 +-
 cpp/src/groupby/sort/group_reductions.hpp     |  43 ++---
 cpp/src/groupby/sort/group_replace_nulls.cu   |   5 +-
 cpp/src/groupby/sort/group_scan.hpp           |  29 +--
 cpp/src/groupby/sort/group_scan_util.cuh      |   9 +-
 .../sort/group_single_pass_reduction_util.cuh |   7 +-
 cpp/src/groupby/sort/group_std.cu             |   7 +-
 cpp/src/groupby/sort/group_sum.cu             |   3 +-
 cpp/src/groupby/sort/group_sum_scan.cu        |   3 +-
 cpp/src/groupby/sort/scan.cpp                 |   3 +-
 cpp/src/groupby/sort/sort_helper.cu           |   9 +-
 cpp/src/hash/md5_hash.cu                      |   5 +-
 cpp/src/hash/murmurhash3_x64_128.cu           |   7 +-
 cpp/src/hash/murmurhash3_x86_32.cu            |   7 +-
 cpp/src/hash/sha1_hash.cu                     |   5 +-
 cpp/src/hash/sha224_hash.cu                   |   5 +-
 cpp/src/hash/sha256_hash.cu                   |   5 +-
 cpp/src/hash/sha384_hash.cu                   |   5 +-
 cpp/src/hash/sha512_hash.cu                   |   5 +-
 cpp/src/hash/sha_hash.cuh                     |   3 +-
 cpp/src/hash/xxhash_64.cu                     |   7 +-
 cpp/src/interop/dlpack.cpp                    |   9 +-
 cpp/src/interop/from_arrow.cu                 |  40 ++--
 cpp/src/interop/to_arrow_device.cu            |  34 ++--
 cpp/src/io/avro/reader_impl.cu                |   5 +-
 cpp/src/io/csv/durations.cu                   |   7 +-
 cpp/src/io/csv/durations.hpp                  |   5 +-
 cpp/src/io/csv/reader_impl.cu                 |   7 +-
 cpp/src/io/csv/writer_impl.cu                 |  11 +-
 cpp/src/io/functions.cpp                      |  21 ++-
 cpp/src/io/json/json_column.cu                |   7 +-
 cpp/src/io/json/json_normalization.cu         |   5 +-
 cpp/src/io/json/json_tree.cu                  |  11 +-
 cpp/src/io/json/legacy/read_json.hpp          |   3 +-
 cpp/src/io/json/legacy/reader_impl.cu         |   5 +-
 cpp/src/io/json/nested_json.hpp               |  12 +-
 cpp/src/io/json/nested_json_gpu.cu            |   9 +-
 cpp/src/io/json/read_json.cu                  |   3 +-
 cpp/src/io/json/read_json.hpp                 |   3 +-
 cpp/src/io/json/write_json.cu                 |  15 +-
 cpp/src/io/orc/reader_impl.cu                 |   6 +-
 cpp/src/io/orc/reader_impl.hpp                |   5 +-
 cpp/src/io/orc/reader_impl_helpers.cpp        |   4 +-
 cpp/src/io/orc/reader_impl_helpers.hpp        |   3 +-
 cpp/src/io/orc/reader_impl_preprocess.cu      |   5 +-
 cpp/src/io/parquet/predicate_pushdown.cpp     |   7 +-
 cpp/src/io/parquet/reader.cpp                 |   8 +-
 cpp/src/io/parquet/reader_impl.cpp            |   6 +-
 cpp/src/io/parquet/reader_impl.hpp            |   8 +-
 cpp/src/io/text/multibyte_split.cu            |   9 +-
 cpp/src/io/utilities/column_buffer.cpp        |   9 +-
 cpp/src/io/utilities/column_buffer.hpp        |  16 +-
 cpp/src/io/utilities/data_casting.cu          |   5 +-
 cpp/src/io/utilities/output_builder.cuh       |   7 +-
 cpp/src/io/utilities/string_parsing.hpp       |   3 +-
 cpp/src/join/conditional_join.cu              |  23 +--
 cpp/src/join/conditional_join.hpp             |   7 +-
 cpp/src/join/cross_join.cu                    |   7 +-
 cpp/src/join/distinct_hash_join.cu            |  13 +-
 cpp/src/join/hash_join.cu                     |  25 +--
 cpp/src/join/join.cu                          |  15 +-
 cpp/src/join/join_common_utils.cuh            |   5 +-
 cpp/src/join/join_utils.cu                    |   7 +-
 cpp/src/join/mixed_join.cu                    |  15 +-
 cpp/src/join/mixed_join_semi.cu               |   7 +-
 cpp/src/join/semi_join.cu                     |   7 +-
 cpp/src/json/json_path.cu                     |   5 +-
 cpp/src/labeling/label_bins.cu                |   9 +-
 .../combine/concatenate_list_elements.cu      |  13 +-
 cpp/src/lists/combine/concatenate_rows.cu     |   7 +-
 cpp/src/lists/contains.cu                     |  25 +--
 cpp/src/lists/copying/concatenate.cu          |   7 +-
 cpp/src/lists/copying/copying.cu              |   5 +-
 cpp/src/lists/copying/gather.cu               |   7 +-
 cpp/src/lists/copying/scatter_helper.cu       |  14 +-
 cpp/src/lists/copying/segmented_gather.cu     |   5 +-
 cpp/src/lists/count_elements.cu               |   7 +-
 cpp/src/lists/explode.cu                      |  25 +--
 cpp/src/lists/extract.cu                      |  17 +-
 cpp/src/lists/interleave_columns.cu           |  13 +-
 cpp/src/lists/lists_column_factories.cu       |  11 +-
 cpp/src/lists/reverse.cu                      |   7 +-
 cpp/src/lists/segmented_sort.cu               |  13 +-
 cpp/src/lists/sequences.cu                    |  17 +-
 cpp/src/lists/set_operations.cu               |  17 +-
 .../stream_compaction/apply_boolean_mask.cu   |   7 +-
 cpp/src/lists/stream_compaction/distinct.cu   |   5 +-
 cpp/src/lists/utilities.cu                    |  10 +-
 cpp/src/lists/utilities.hpp                   |   9 +-
 cpp/src/merge/merge.cu                        |  19 +-
 cpp/src/partitioning/partitioning.cu          |  17 +-
 cpp/src/partitioning/round_robin.cu           |   9 +-
 cpp/src/quantiles/quantile.cu                 |   9 +-
 cpp/src/quantiles/quantiles.cu                |   7 +-
 cpp/src/quantiles/tdigest/tdigest.cu          |  13 +-
 .../quantiles/tdigest/tdigest_aggregation.cu  |  23 +--
 cpp/src/reductions/all.cu                     |   8 +-
 cpp/src/reductions/any.cu                     |   8 +-
 cpp/src/reductions/collect_ops.cu             |  12 +-
 cpp/src/reductions/compound.cuh               |  14 +-
 cpp/src/reductions/histogram.cu               |  10 +-
 cpp/src/reductions/max.cu                     |   5 +-
 cpp/src/reductions/mean.cu                    |   5 +-
 cpp/src/reductions/min.cu                     |   6 +-
 cpp/src/reductions/minmax.cu                  |  15 +-
 cpp/src/reductions/nth_element.cu             |   3 +-
 cpp/src/reductions/product.cu                 |   5 +-
 cpp/src/reductions/reductions.cpp             |  11 +-
 cpp/src/reductions/scan/rank_scan.cu          |  11 +-
 cpp/src/reductions/scan/scan.cpp              |   8 +-
 cpp/src/reductions/scan/scan.cuh              |   7 +-
 cpp/src/reductions/scan/scan_exclusive.cu     |   5 +-
 cpp/src/reductions/scan/scan_inclusive.cu     |  13 +-
 cpp/src/reductions/segmented/all.cu           |   6 +-
 cpp/src/reductions/segmented/any.cu           |   6 +-
 cpp/src/reductions/segmented/compound.cuh     |  14 +-
 cpp/src/reductions/segmented/counts.cu        |   6 +-
 cpp/src/reductions/segmented/counts.hpp       |   5 +-
 cpp/src/reductions/segmented/max.cu           |   6 +-
 cpp/src/reductions/segmented/mean.cu          |   5 +-
 cpp/src/reductions/segmented/min.cu           |   6 +-
 cpp/src/reductions/segmented/nunique.cu       |   5 +-
 cpp/src/reductions/segmented/product.cu       |   6 +-
 cpp/src/reductions/segmented/reductions.cpp   |  15 +-
 cpp/src/reductions/segmented/simple.cuh       |  31 ++--
 cpp/src/reductions/segmented/std.cu           |   5 +-
 cpp/src/reductions/segmented/sum.cu           |   6 +-
 .../reductions/segmented/sum_of_squares.cu    |   5 +-
 .../reductions/segmented/update_validity.cu   |   6 +-
 .../reductions/segmented/update_validity.hpp  |   5 +-
 cpp/src/reductions/segmented/var.cu           |   5 +-
 cpp/src/reductions/simple.cuh                 |  39 ++--
 cpp/src/reductions/std.cu                     |   5 +-
 cpp/src/reductions/sum.cu                     |   5 +-
 cpp/src/reductions/sum_of_squares.cu          |   5 +-
 cpp/src/reductions/var.cu                     |   5 +-
 cpp/src/replace/clamp.cu                      |  25 +--
 cpp/src/replace/nans.cu                       |  17 +-
 cpp/src/replace/nulls.cu                      |  31 ++--
 cpp/src/replace/replace.cu                    |  13 +-
 cpp/src/reshape/byte_cast.cu                  |  15 +-
 cpp/src/reshape/interleave_columns.cu         |  15 +-
 cpp/src/reshape/tile.cu                       |   7 +-
 cpp/src/rolling/detail/lead_lag_nested.cuh    |   3 +-
 cpp/src/rolling/detail/nth_element.cuh        |   5 +-
 .../detail/optimized_unbounded_window.cpp     |  10 +-
 .../detail/optimized_unbounded_window.hpp     |   5 +-
 cpp/src/rolling/detail/rolling.cuh            |  17 +-
 cpp/src/rolling/detail/rolling.hpp            |  12 +-
 .../rolling/detail/rolling_collect_list.cu    |   5 +-
 .../rolling/detail/rolling_collect_list.cuh   |   7 +-
 .../rolling/detail/rolling_fixed_window.cu    |   4 +-
 .../rolling/detail/rolling_variable_window.cu |   4 +-
 cpp/src/rolling/grouped_rolling.cu            |  40 ++--
 cpp/src/rolling/rolling.cu                    |  10 +-
 cpp/src/round/round.cu                        |  11 +-
 cpp/src/scalar/scalar.cpp                     |  73 ++++----
 cpp/src/scalar/scalar_factories.cpp           |  35 ++--
 cpp/src/search/contains_column.cu             |  11 +-
 cpp/src/search/contains_table.cu              |   3 +-
 cpp/src/search/search_ordered.cu              |  13 +-
 cpp/src/sort/rank.cu                          |   5 +-
 cpp/src/sort/segmented_sort.cu                |  11 +-
 cpp/src/sort/segmented_sort_impl.cuh          |   7 +-
 cpp/src/sort/sort.cu                          |  13 +-
 cpp/src/sort/sort_column.cu                   |   6 +-
 cpp/src/sort/sort_column_impl.cuh             |   3 +-
 cpp/src/sort/sort_impl.cuh                    |   7 +-
 cpp/src/sort/stable_segmented_sort.cu         |  12 +-
 cpp/src/sort/stable_sort.cu                   |  13 +-
 cpp/src/sort/stable_sort_column.cu            |   6 +-
 .../stream_compaction/apply_boolean_mask.cu   |   7 +-
 cpp/src/stream_compaction/distinct.cu         |   9 +-
 cpp/src/stream_compaction/distinct_helpers.cu |   6 +-
 .../stream_compaction/distinct_helpers.hpp    |   5 +-
 cpp/src/stream_compaction/drop_nans.cu        |   9 +-
 cpp/src/stream_compaction/drop_nulls.cu       |   9 +-
 cpp/src/stream_compaction/stable_distinct.cu  |   8 +-
 cpp/src/stream_compaction/unique.cu           |   7 +-
 cpp/src/strings/attributes.cu                 |  17 +-
 cpp/src/strings/capitalize.cu                 |  15 +-
 cpp/src/strings/case.cu                       |  15 +-
 cpp/src/strings/char_types/char_types.cu      |   9 +-
 cpp/src/strings/combine/concatenate.cu        |   9 +-
 cpp/src/strings/combine/join.cu               |   5 +-
 cpp/src/strings/combine/join_list_elements.cu |   9 +-
 cpp/src/strings/contains.cu                   |  15 +-
 cpp/src/strings/convert/convert_booleans.cu   |   9 +-
 cpp/src/strings/convert/convert_datetime.cu   |  15 +-
 cpp/src/strings/convert/convert_durations.cu  |  11 +-
 .../strings/convert/convert_fixed_point.cu    |  25 +--
 cpp/src/strings/convert/convert_floats.cu     |  17 +-
 cpp/src/strings/convert/convert_hex.cu        |  15 +-
 cpp/src/strings/convert/convert_integers.cu   |  25 +--
 cpp/src/strings/convert/convert_ipv4.cu       |  13 +-
 cpp/src/strings/convert/convert_lists.cu      |   5 +-
 cpp/src/strings/convert/convert_urls.cu       |   9 +-
 cpp/src/strings/copying/concatenate.cu        |   3 +-
 cpp/src/strings/copying/copy_range.cu         |   3 +-
 cpp/src/strings/copying/copying.cu            |   3 +-
 cpp/src/strings/copying/shift.cu              |   3 +-
 cpp/src/strings/count_matches.cu              |   4 +-
 cpp/src/strings/count_matches.hpp             |   5 +-
 cpp/src/strings/extract/extract.cu            |   5 +-
 cpp/src/strings/extract/extract_all.cu        |   5 +-
 cpp/src/strings/filling/fill.cu               |   3 +-
 cpp/src/strings/filter_chars.cu               |   5 +-
 cpp/src/strings/like.cu                       |  13 +-
 cpp/src/strings/padding.cu                    |   9 +-
 cpp/src/strings/regex/utilities.cuh           |   3 +-
 cpp/src/strings/repeat_strings.cu             |  15 +-
 cpp/src/strings/replace/backref_re.cu         |   5 +-
 cpp/src/strings/replace/find_replace.cu       |   3 +-
 cpp/src/strings/replace/multi.cu              |   9 +-
 cpp/src/strings/replace/multi_re.cu           |   5 +-
 cpp/src/strings/replace/replace.cu            |   9 +-
 cpp/src/strings/replace/replace_nulls.cu      |   3 +-
 cpp/src/strings/replace/replace_re.cu         |   5 +-
 cpp/src/strings/replace/replace_slice.cu      |   5 +-
 cpp/src/strings/reverse.cu                    |   5 +-
 cpp/src/strings/scan/scan_inclusive.cu        |   9 +-
 cpp/src/strings/search/find.cu                |  45 ++---
 cpp/src/strings/search/find_multiple.cu       |   7 +-
 cpp/src/strings/search/findall.cu             |   7 +-
 cpp/src/strings/slice.cu                      |  11 +-
 cpp/src/strings/split/partition.cu            |  11 +-
 cpp/src/strings/split/split.cu                |  15 +-
 cpp/src/strings/split/split.cuh               |   5 +-
 cpp/src/strings/split/split_re.cu             |  21 ++-
 cpp/src/strings/split/split_record.cu         |  11 +-
 cpp/src/strings/strings_column_factories.cu   |   5 +-
 cpp/src/strings/strings_scalar_factories.cpp  |   5 +-
 cpp/src/strings/strip.cu                      |   7 +-
 cpp/src/strings/translate.cu                  |   5 +-
 cpp/src/strings/utilities.cu                  |   5 +-
 cpp/src/strings/wrap.cu                       |   5 +-
 cpp/src/structs/copying/concatenate.cu        |   5 +-
 cpp/src/structs/scan/scan_inclusive.cu        |   7 +-
 cpp/src/structs/structs_column_factories.cu   |   3 +-
 cpp/src/structs/utilities.cpp                 |  18 +-
 cpp/src/table/row_operators.cu                |   9 +-
 cpp/src/table/table.cpp                       |   7 +-
 cpp/src/text/bpe/byte_pair_encoding.cu        |   5 +-
 cpp/src/text/bpe/load_merge_pairs.cu          |  11 +-
 cpp/src/text/detokenize.cu                    |   5 +-
 cpp/src/text/edit_distance.cu                 |   9 +-
 cpp/src/text/generate_ngrams.cu               |  13 +-
 cpp/src/text/jaccard.cu                       |   5 +-
 cpp/src/text/minhash.cu                       |  21 ++-
 cpp/src/text/ngrams_tokenize.cu               |   5 +-
 cpp/src/text/normalize.cu                     |   9 +-
 cpp/src/text/replace.cu                       |   9 +-
 cpp/src/text/stemmer.cu                       |  15 +-
 cpp/src/text/subword/load_hash_file.cu        |   5 +-
 cpp/src/text/subword/subword_tokenize.cu      |   7 +-
 cpp/src/text/tokenize.cu                      |  25 +--
 cpp/src/text/vocabulary_tokenize.cu           |   9 +-
 cpp/src/transform/bools_to_mask.cu            |   7 +-
 cpp/src/transform/compute_column.cu           |   5 +-
 cpp/src/transform/encode.cu                   |  10 +-
 cpp/src/transform/mask_to_bools.cu            |   5 +-
 cpp/src/transform/nans_to_nulls.cu            |  11 +-
 cpp/src/transform/one_hot_encode.cu           |   5 +-
 cpp/src/transform/row_bit_count.cu            |   9 +-
 cpp/src/transform/transform.cpp               |   5 +-
 cpp/src/transpose/transpose.cu                |   7 +-
 cpp/src/unary/cast_ops.cu                     |  21 ++-
 cpp/src/unary/math_ops.cu                     |  33 ++--
 cpp/src/unary/nan_ops.cu                      |  15 +-
 cpp/src/unary/null_ops.cu                     |  12 +-
 cpp/src/unary/unary_ops.cuh                   |   5 +-
 cpp/tests/copying/shift_tests.cpp             |   9 +-
 cpp/tests/io/json_chunked_reader.cpp          |   4 +-
 cpp/tests/join/join_tests.cpp                 |   6 +-
 cpp/tests/join/semi_anti_join_tests.cpp       |   6 +-
 .../main/native/include/maps_column_view.hpp  |  20 +-
 java/src/main/native/src/RmmJni.cpp           |  10 +-
 java/src/main/native/src/maps_column_view.cu  |  23 +--
 539 files changed, 3613 insertions(+), 3012 deletions(-)

diff --git a/cpp/benchmarks/copying/shift.cu b/cpp/benchmarks/copying/shift.cu
index e1169e3bcd6..efc385cf10b 100644
--- a/cpp/benchmarks/copying/shift.cu
+++ b/cpp/benchmarks/copying/shift.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,11 +21,13 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 template <typename T, typename ScalarType = cudf::scalar_type_t<T>>
 std::unique_ptr<cudf::scalar> make_scalar(
-  T value                             = 0,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
+  T value                           = 0,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
 {
   auto s = new ScalarType(value, true, stream, mr);
   return std::unique_ptr<cudf::scalar>(s);
diff --git a/cpp/benchmarks/fixture/benchmark_fixture.hpp b/cpp/benchmarks/fixture/benchmark_fixture.hpp
index adde0ae1720..8c8d6756b00 100644
--- a/cpp/benchmarks/fixture/benchmark_fixture.hpp
+++ b/cpp/benchmarks/fixture/benchmark_fixture.hpp
@@ -120,6 +120,7 @@ class memory_stats_logger {
   }
 
  private:
+  // TODO change to resource_ref once set_current_device_resource supports it
   rmm::mr::device_memory_resource* existing_mr;
   rmm::mr::statistics_resource_adaptor<rmm::mr::device_memory_resource> statistics_mr;
 };
diff --git a/cpp/include/cudf/ast/detail/expression_parser.hpp b/cpp/include/cudf/ast/detail/expression_parser.hpp
index a36a831a7aa..38f7ac5291f 100644
--- a/cpp/include/cudf/ast/detail/expression_parser.hpp
+++ b/cpp/include/cudf/ast/detail/expression_parser.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,8 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 #include <thrust/scan.h>
 
 #include <functional>
@@ -118,7 +120,7 @@ class expression_parser {
                     std::optional<std::reference_wrapper<cudf::table_view const>> right,
                     bool has_nulls,
                     rmm::cuda_stream_view stream,
-                    rmm::mr::device_memory_resource* mr)
+                    rmm::device_async_resource_ref mr)
     : _left{left},
       _right{right},
       _expression_count{0},
@@ -139,7 +141,7 @@ class expression_parser {
                     cudf::table_view const& table,
                     bool has_nulls,
                     rmm::cuda_stream_view stream,
-                    rmm::mr::device_memory_resource* mr)
+                    rmm::device_async_resource_ref mr)
     : expression_parser(expr, table, {}, has_nulls, stream, mr)
   {
   }
@@ -240,7 +242,7 @@ class expression_parser {
     data_pointers.push_back(v.data());
   }
 
-  void move_to_device(rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr)
+  void move_to_device(rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)
   {
     std::vector<cudf::size_type> sizes;
     std::vector<void const*> data_pointers;
diff --git a/cpp/include/cudf/binaryop.hpp b/cpp/include/cudf/binaryop.hpp
index 9df4b4eb00f..20550e92f9f 100644
--- a/cpp/include/cudf/binaryop.hpp
+++ b/cpp/include/cudf/binaryop.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
 #include <cudf/scalar/scalar.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <memory>
 
@@ -116,8 +117,8 @@ std::unique_ptr<column> binary_operation(
   column_view const& rhs,
   binary_operator op,
   data_type output_type,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Performs a binary operation between a column and a scalar.
@@ -147,8 +148,8 @@ std::unique_ptr<column> binary_operation(
   scalar const& rhs,
   binary_operator op,
   data_type output_type,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Performs a binary operation between two columns.
@@ -177,8 +178,8 @@ std::unique_ptr<column> binary_operation(
   column_view const& rhs,
   binary_operator op,
   data_type output_type,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Performs a binary operation between two columns using a
@@ -208,8 +209,8 @@ std::unique_ptr<column> binary_operation(
   column_view const& rhs,
   std::string const& ptx,
   data_type output_type,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Computes the `scale` for a `fixed_point` number based on given binary operator `op`
@@ -249,8 +250,8 @@ namespace binops {
 std::pair<rmm::device_buffer, size_type> scalar_col_valid_mask_and(
   column_view const& col,
   scalar const& s,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 namespace compiled {
 namespace detail {
diff --git a/cpp/include/cudf/column/column.hpp b/cpp/include/cudf/column/column.hpp
index 023e58c5300..22db25bdc83 100644
--- a/cpp/include/cudf/column/column.hpp
+++ b/cpp/include/cudf/column/column.hpp
@@ -24,6 +24,7 @@
 #include <rmm/device_buffer.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <memory>
 #include <type_traits>
@@ -63,8 +64,8 @@ class column {
    * @param mr Device memory resource to use for all device memory allocations
    */
   column(column const& other,
-         rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-         rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+         rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+         rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
   /**
    * @brief Move the contents from `other` to create a new column.
@@ -141,8 +142,8 @@ class column {
    * @param mr Device memory resource to use for all device memory allocations
    */
   explicit column(column_view view,
-                  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-                  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+                  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+                  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
   /**
    * @brief Returns the column's logical element type
diff --git a/cpp/include/cudf/column/column_factories.hpp b/cpp/include/cudf/column/column_factories.hpp
index 96322159f0f..dc4700576e6 100644
--- a/cpp/include/cudf/column/column_factories.hpp
+++ b/cpp/include/cudf/column/column_factories.hpp
@@ -23,6 +23,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/pair.h>
 
@@ -75,9 +76,9 @@ std::unique_ptr<column> make_empty_column(type_id id);
 std::unique_ptr<column> make_numeric_column(
   data_type type,
   size_type size,
-  mask_state state                    = mask_state::UNALLOCATED,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  mask_state state                  = mask_state::UNALLOCATED,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Construct column with sufficient uninitialized storage to hold `size` elements of the
@@ -102,8 +103,8 @@ std::unique_ptr<column> make_numeric_column(
   size_type size,
   B&& null_mask,
   size_type null_count,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
 {
   CUDF_EXPECTS(is_numeric(type), "Invalid, non-numeric type.");
   return std::make_unique<column>(type,
@@ -133,9 +134,9 @@ std::unique_ptr<column> make_numeric_column(
 std::unique_ptr<column> make_fixed_point_column(
   data_type type,
   size_type size,
-  mask_state state                    = mask_state::UNALLOCATED,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  mask_state state                  = mask_state::UNALLOCATED,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Construct column with sufficient uninitialized storage to hold `size` elements of the
@@ -159,8 +160,8 @@ std::unique_ptr<column> make_fixed_point_column(
   size_type size,
   B&& null_mask,
   size_type null_count,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
 {
   CUDF_EXPECTS(is_fixed_point(type), "Invalid, non-fixed_point type.");
   return std::make_unique<column>(type,
@@ -191,9 +192,9 @@ std::unique_ptr<column> make_fixed_point_column(
 std::unique_ptr<column> make_timestamp_column(
   data_type type,
   size_type size,
-  mask_state state                    = mask_state::UNALLOCATED,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  mask_state state                  = mask_state::UNALLOCATED,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Construct column with sufficient uninitialized storage to hold `size` elements of the
@@ -218,8 +219,8 @@ std::unique_ptr<column> make_timestamp_column(
   size_type size,
   B&& null_mask,
   size_type null_count,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
 {
   CUDF_EXPECTS(is_timestamp(type), "Invalid, non-timestamp type.");
   return std::make_unique<column>(type,
@@ -250,9 +251,9 @@ std::unique_ptr<column> make_timestamp_column(
 std::unique_ptr<column> make_duration_column(
   data_type type,
   size_type size,
-  mask_state state                    = mask_state::UNALLOCATED,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  mask_state state                  = mask_state::UNALLOCATED,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Construct column with sufficient uninitialized storage to hold `size` elements of the
@@ -277,8 +278,8 @@ std::unique_ptr<column> make_duration_column(
   size_type size,
   B&& null_mask,
   size_type null_count,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
 {
   CUDF_EXPECTS(is_duration(type), "Invalid, non-duration type.");
   return std::make_unique<column>(type,
@@ -309,9 +310,9 @@ std::unique_ptr<column> make_duration_column(
 std::unique_ptr<column> make_fixed_width_column(
   data_type type,
   size_type size,
-  mask_state state                    = mask_state::UNALLOCATED,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  mask_state state                  = mask_state::UNALLOCATED,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Construct column with sufficient uninitialized storage to hold `size` elements of the
@@ -336,8 +337,8 @@ std::unique_ptr<column> make_fixed_width_column(
   size_type size,
   B&& null_mask,
   size_type null_count,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
 {
   CUDF_EXPECTS(is_fixed_width(type), "Invalid, non-fixed-width type.");
   if (is_timestamp(type)) {
@@ -375,8 +376,8 @@ std::unique_ptr<column> make_fixed_width_column(
  */
 std::unique_ptr<column> make_strings_column(
   cudf::device_span<thrust::pair<char const*, size_type> const> strings,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Construct a STRING type column given a device span of string_view.
@@ -407,8 +408,8 @@ std::unique_ptr<column> make_strings_column(
 std::unique_ptr<column> make_strings_column(
   cudf::device_span<string_view const> string_views,
   string_view const null_placeholder,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Construct a STRING type column given offsets column, chars columns, and null mask and null
@@ -495,8 +496,8 @@ std::unique_ptr<cudf::column> make_lists_column(
   std::unique_ptr<column> child_column,
   size_type null_count,
   rmm::device_buffer&& null_mask,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Construct a STRUCT column using specified child columns as members.
@@ -526,8 +527,8 @@ std::unique_ptr<cudf::column> make_structs_column(
   std::vector<std::unique_ptr<column>>&& child_columns,
   size_type null_count,
   rmm::device_buffer&& null_mask,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Construct a column with size elements that are all equal to the given scalar.
@@ -546,8 +547,8 @@ std::unique_ptr<cudf::column> make_structs_column(
 std::unique_ptr<column> make_column_from_scalar(
   scalar const& s,
   size_type size,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Construct a dictionary column with size elements that are all equal to the given scalar.
@@ -566,8 +567,8 @@ std::unique_ptr<column> make_column_from_scalar(
 std::unique_ptr<column> make_dictionary_from_scalar(
   scalar const& s,
   size_type size,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
 }  // namespace cudf
diff --git a/cpp/include/cudf/concatenate.hpp b/cpp/include/cudf/concatenate.hpp
index 9ee55275a5e..e7b55a2e6d0 100644
--- a/cpp/include/cudf/concatenate.hpp
+++ b/cpp/include/cudf/concatenate.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,7 @@
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <memory>
 
@@ -46,8 +47,8 @@ namespace cudf {
  */
 rmm::device_buffer concatenate_masks(
   host_span<column_view const> views,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Concatenates multiple columns into a single column
@@ -63,8 +64,8 @@ rmm::device_buffer concatenate_masks(
  */
 std::unique_ptr<column> concatenate(
   host_span<column_view const> columns_to_concat,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Columns of `tables_to_concat` are concatenated vertically to return a
@@ -92,8 +93,8 @@ std::unique_ptr<column> concatenate(
  */
 std::unique_ptr<table> concatenate(
   host_span<table_view const> tables_to_concat,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
 }  // namespace cudf
diff --git a/cpp/include/cudf/contiguous_split.hpp b/cpp/include/cudf/contiguous_split.hpp
index 1bbbf73bd5d..0d4f20d1ef2 100644
--- a/cpp/include/cudf/contiguous_split.hpp
+++ b/cpp/include/cudf/contiguous_split.hpp
@@ -19,6 +19,8 @@
 #include <cudf/table/table.hpp>
 #include <cudf/types.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 #include <memory>
 #include <vector>
 
@@ -119,7 +121,7 @@ struct packed_table {
 std::vector<packed_table> contiguous_split(
   cudf::table_view const& input,
   std::vector<size_type> const& splits,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 namespace detail {
 struct contiguous_split_state;
@@ -196,7 +198,7 @@ class chunked_pack {
   explicit chunked_pack(
     cudf::table_view const& input,
     std::size_t user_buffer_size,
-    rmm::mr::device_memory_resource* temp_mr = rmm::mr::get_current_device_resource());
+    rmm::device_async_resource_ref temp_mr = rmm::mr::get_current_device_resource());
 
   /**
    * @brief Destructor that will be implemented as default. Declared with definition here because
@@ -261,7 +263,7 @@ class chunked_pack {
   [[nodiscard]] static std::unique_ptr<chunked_pack> create(
     cudf::table_view const& input,
     std::size_t user_buffer_size,
-    rmm::mr::device_memory_resource* temp_mr = rmm::mr::get_current_device_resource());
+    rmm::device_async_resource_ref temp_mr = rmm::mr::get_current_device_resource());
 
  private:
   // internal state of contiguous split
@@ -281,7 +283,7 @@ class chunked_pack {
  *         and device memory respectively
  */
 packed_columns pack(cudf::table_view const& input,
-                    rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+                    rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Produce the metadata used for packing a table stored in a contiguous buffer.
diff --git a/cpp/include/cudf/copying.hpp b/cpp/include/cudf/copying.hpp
index df96efdaffc..b17cafb05ab 100644
--- a/cpp/include/cudf/copying.hpp
+++ b/cpp/include/cudf/copying.hpp
@@ -25,6 +25,7 @@
 #include <cudf/types.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <memory>
 #include <vector>
@@ -84,9 +85,9 @@ enum class out_of_bounds_policy : bool {
 std::unique_ptr<table> gather(
   table_view const& source_table,
   column_view const& gather_map,
-  out_of_bounds_policy bounds_policy  = out_of_bounds_policy::DONT_CHECK,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  out_of_bounds_policy bounds_policy = out_of_bounds_policy::DONT_CHECK,
+  rmm::cuda_stream_view stream       = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr  = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Reverses the rows within a table.
@@ -105,8 +106,8 @@ std::unique_ptr<table> gather(
  */
 std::unique_ptr<table> reverse(
   table_view const& source_table,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Reverses the elements of a column
@@ -125,8 +126,8 @@ std::unique_ptr<table> reverse(
  */
 std::unique_ptr<column> reverse(
   column_view const& source_column,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Scatters the rows of the source table into a copy of the target table
@@ -174,8 +175,8 @@ std::unique_ptr<table> scatter(
   table_view const& source,
   column_view const& scatter_map,
   table_view const& target,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Scatters a row of scalar values into a copy of the target table
@@ -217,8 +218,8 @@ std::unique_ptr<table> scatter(
   std::vector<std::reference_wrapper<scalar const>> const& source,
   column_view const& indices,
   table_view const& target,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Indicates when to allocate a mask, based on an existing mask.
@@ -264,9 +265,9 @@ std::unique_ptr<column> empty_like(scalar const& input);
  */
 std::unique_ptr<column> allocate_like(
   column_view const& input,
-  mask_allocation_policy mask_alloc   = mask_allocation_policy::RETAIN,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  mask_allocation_policy mask_alloc = mask_allocation_policy::RETAIN,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Creates an uninitialized new column of the specified size and same type as the `input`.
@@ -287,9 +288,9 @@ std::unique_ptr<column> allocate_like(
 std::unique_ptr<column> allocate_like(
   column_view const& input,
   size_type size,
-  mask_allocation_policy mask_alloc   = mask_allocation_policy::RETAIN,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  mask_allocation_policy mask_alloc = mask_allocation_policy::RETAIN,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Creates a table of empty columns with the same types as the `input_table`
@@ -380,8 +381,8 @@ std::unique_ptr<column> copy_range(
   size_type source_begin,
   size_type source_end,
   size_type target_begin,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Creates a new column by shifting all values by an offset.
@@ -424,8 +425,8 @@ std::unique_ptr<column> shift(
   column_view const& input,
   size_type offset,
   scalar const& fill_value,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Slices a `column_view` into a set of `column_view`s according to a set of indices.
@@ -627,8 +628,8 @@ std::unique_ptr<column> copy_if_else(
   column_view const& lhs,
   column_view const& rhs,
   column_view const& boolean_mask,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief   Returns a new column, where each element is selected from either @p lhs or
@@ -653,8 +654,8 @@ std::unique_ptr<column> copy_if_else(
   scalar const& lhs,
   column_view const& rhs,
   column_view const& boolean_mask,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief   Returns a new column, where each element is selected from either @p lhs or
@@ -679,8 +680,8 @@ std::unique_ptr<column> copy_if_else(
   column_view const& lhs,
   scalar const& rhs,
   column_view const& boolean_mask,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief   Returns a new column, where each element is selected from either @p lhs or
@@ -703,8 +704,8 @@ std::unique_ptr<column> copy_if_else(
   scalar const& lhs,
   scalar const& rhs,
   column_view const& boolean_mask,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Scatters rows from the input table to rows of the output corresponding
@@ -747,8 +748,8 @@ std::unique_ptr<table> boolean_mask_scatter(
   table_view const& input,
   table_view const& target,
   column_view const& boolean_mask,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Scatters scalar values to rows of the output corresponding
@@ -786,8 +787,8 @@ std::unique_ptr<table> boolean_mask_scatter(
   std::vector<std::reference_wrapper<scalar const>> const& input,
   table_view const& target,
   column_view const& boolean_mask,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Get the element at specified index from a column
@@ -806,8 +807,8 @@ std::unique_ptr<table> boolean_mask_scatter(
 std::unique_ptr<scalar> get_element(
   column_view const& input,
   size_type index,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Indicates whether a row can be sampled more than once.
@@ -851,7 +852,7 @@ std::unique_ptr<table> sample(
   sample_with_replacement replacement = sample_with_replacement::FALSE,
   int64_t const seed                  = 0,
   rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr   = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Checks if a column or its descendants have non-empty null rows
@@ -967,8 +968,8 @@ bool may_have_nonempty_nulls(column_view const& input);
  */
 std::unique_ptr<column> purge_nonempty_nulls(
   column_view const& input,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */
 }  // namespace cudf
diff --git a/cpp/include/cudf/datetime.hpp b/cpp/include/cudf/datetime.hpp
index 44736ca0762..06b7d24f6cd 100644
--- a/cpp/include/cudf/datetime.hpp
+++ b/cpp/include/cudf/datetime.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
 #include <cudf/types.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <memory>
 
@@ -47,7 +48,7 @@ namespace datetime {
  */
 std::unique_ptr<cudf::column> extract_year(
   cudf::column_view const& column,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief  Extracts month from any datetime type and returns an int16_t
@@ -61,7 +62,7 @@ std::unique_ptr<cudf::column> extract_year(
  */
 std::unique_ptr<cudf::column> extract_month(
   cudf::column_view const& column,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief  Extracts day from any datetime type and returns an int16_t
@@ -75,7 +76,7 @@ std::unique_ptr<cudf::column> extract_month(
  */
 std::unique_ptr<cudf::column> extract_day(
   cudf::column_view const& column,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief  Extracts a weekday from any datetime type and returns an int16_t
@@ -89,7 +90,7 @@ std::unique_ptr<cudf::column> extract_day(
  */
 std::unique_ptr<cudf::column> extract_weekday(
   cudf::column_view const& column,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief  Extracts hour from any datetime type and returns an int16_t
@@ -103,7 +104,7 @@ std::unique_ptr<cudf::column> extract_weekday(
  */
 std::unique_ptr<cudf::column> extract_hour(
   cudf::column_view const& column,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief  Extracts minute from any datetime type and returns an int16_t
@@ -117,7 +118,7 @@ std::unique_ptr<cudf::column> extract_hour(
  */
 std::unique_ptr<cudf::column> extract_minute(
   cudf::column_view const& column,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief  Extracts second from any datetime type and returns an int16_t
@@ -131,7 +132,7 @@ std::unique_ptr<cudf::column> extract_minute(
  */
 std::unique_ptr<cudf::column> extract_second(
   cudf::column_view const& column,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief  Extracts millisecond fraction from any datetime type and returns an int16_t
@@ -148,7 +149,7 @@ std::unique_ptr<cudf::column> extract_second(
  */
 std::unique_ptr<cudf::column> extract_millisecond_fraction(
   cudf::column_view const& column,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief  Extracts microsecond fraction from any datetime type and returns an int16_t
@@ -165,7 +166,7 @@ std::unique_ptr<cudf::column> extract_millisecond_fraction(
  */
 std::unique_ptr<cudf::column> extract_microsecond_fraction(
   cudf::column_view const& column,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief  Extracts nanosecond fraction from any datetime type and returns an int16_t
@@ -182,7 +183,7 @@ std::unique_ptr<cudf::column> extract_microsecond_fraction(
  */
 std::unique_ptr<cudf::column> extract_nanosecond_fraction(
   cudf::column_view const& column,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
 /**
@@ -203,7 +204,7 @@ std::unique_ptr<cudf::column> extract_nanosecond_fraction(
  */
 std::unique_ptr<cudf::column> last_day_of_month(
   cudf::column_view const& column,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief  Computes the day number since the start of the year from the datetime and
@@ -217,7 +218,7 @@ std::unique_ptr<cudf::column> last_day_of_month(
  */
 std::unique_ptr<cudf::column> day_of_year(
   cudf::column_view const& column,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief  Adds or subtracts a number of months from the datetime type and returns a
@@ -252,7 +253,7 @@ std::unique_ptr<cudf::column> day_of_year(
 std::unique_ptr<cudf::column> add_calendrical_months(
   cudf::column_view const& timestamps,
   cudf::column_view const& months,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief  Adds or subtracts a number of months from the datetime type and returns a
@@ -287,7 +288,7 @@ std::unique_ptr<cudf::column> add_calendrical_months(
 std::unique_ptr<cudf::column> add_calendrical_months(
   cudf::column_view const& timestamps,
   cudf::scalar const& months,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief  Check if the year of the given date is a leap year
@@ -304,7 +305,7 @@ std::unique_ptr<cudf::column> add_calendrical_months(
  */
 std::unique_ptr<cudf::column> is_leap_year(
   cudf::column_view const& column,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Extract the number of days in the month
@@ -320,7 +321,7 @@ std::unique_ptr<cudf::column> is_leap_year(
  */
 std::unique_ptr<cudf::column> days_in_month(
   cudf::column_view const& column,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief  Returns the quarter of the date
@@ -336,7 +337,7 @@ std::unique_ptr<cudf::column> days_in_month(
  */
 std::unique_ptr<cudf::column> extract_quarter(
   cudf::column_view const& column,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Fixed frequencies supported by datetime rounding functions ceil, floor, round.
@@ -365,7 +366,7 @@ enum class rounding_frequency : int32_t {
 std::unique_ptr<cudf::column> ceil_datetimes(
   cudf::column_view const& column,
   rounding_frequency freq,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Round datetimes down to the nearest multiple of the given frequency.
@@ -380,7 +381,7 @@ std::unique_ptr<cudf::column> ceil_datetimes(
 std::unique_ptr<cudf::column> floor_datetimes(
   cudf::column_view const& column,
   rounding_frequency freq,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Round datetimes to the nearest multiple of the given frequency.
@@ -395,7 +396,7 @@ std::unique_ptr<cudf::column> floor_datetimes(
 std::unique_ptr<cudf::column> round_datetimes(
   cudf::column_view const& column,
   rounding_frequency freq,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
 
diff --git a/cpp/include/cudf/detail/binaryop.hpp b/cpp/include/cudf/detail/binaryop.hpp
index e5609568d10..de1fde8bc96 100644
--- a/cpp/include/cudf/detail/binaryop.hpp
+++ b/cpp/include/cudf/detail/binaryop.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 //! Inner interfaces and implementations
@@ -26,7 +27,7 @@ namespace detail {
 
 /**
  * @copydoc cudf::binary_operation(column_view const&, column_view const&,
- * std::string const&, data_type, rmm::mr::device_memory_resource *)
+ * std::string const&, data_type, rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
@@ -35,11 +36,11 @@ std::unique_ptr<column> binary_operation(column_view const& lhs,
                                          std::string const& ptx,
                                          data_type output_type,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr);
+                                         rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::binary_operation(scalar const&, column_view const&, binary_operator,
- * data_type, rmm::mr::device_memory_resource *)
+ * data_type, rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
@@ -48,11 +49,11 @@ std::unique_ptr<column> binary_operation(scalar const& lhs,
                                          binary_operator op,
                                          data_type output_type,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr);
+                                         rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::binary_operation(column_view const&, scalar const&, binary_operator,
- * data_type, rmm::mr::device_memory_resource *)
+ * data_type, rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
@@ -61,11 +62,11 @@ std::unique_ptr<column> binary_operation(column_view const& lhs,
                                          binary_operator op,
                                          data_type output_type,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr);
+                                         rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::binary_operation(column_view const&, column_view const&,
- * binary_operator, data_type, rmm::mr::device_memory_resource *)
+ * binary_operator, data_type, rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
@@ -74,6 +75,6 @@ std::unique_ptr<column> binary_operation(column_view const& lhs,
                                          binary_operator op,
                                          data_type output_type,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr);
+                                         rmm::device_async_resource_ref mr);
 }  // namespace detail
 }  // namespace cudf
diff --git a/cpp/include/cudf/detail/calendrical_month_sequence.cuh b/cpp/include/cudf/detail/calendrical_month_sequence.cuh
index 59fb6758973..a9cf54e29b8 100644
--- a/cpp/include/cudf/detail/calendrical_month_sequence.cuh
+++ b/cpp/include/cudf/detail/calendrical_month_sequence.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -25,6 +25,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/transform.h>
@@ -38,7 +39,7 @@ struct calendrical_month_sequence_functor {
     scalar const& input,
     size_type months,
     rmm::cuda_stream_view stream,
-    rmm::mr::device_memory_resource* mr)
+    rmm::device_async_resource_ref mr)
   {
     // Return empty column if n = 0
     if (n == 0) return cudf::make_empty_column(input.type());
diff --git a/cpp/include/cudf/detail/concatenate.hpp b/cpp/include/cudf/detail/concatenate.hpp
index 442814bc4fd..3e039175542 100644
--- a/cpp/include/cudf/detail/concatenate.hpp
+++ b/cpp/include/cudf/detail/concatenate.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <vector>
 
@@ -29,22 +30,22 @@ namespace cudf {
 //! Inner interfaces and implementations
 namespace detail {
 /**
- * @copydoc cudf::concatenate(host_span<column_view const>,rmm::mr::device_memory_resource*)
+ * @copydoc cudf::concatenate(host_span<column_view const>,rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
 std::unique_ptr<column> concatenate(host_span<column_view const> columns_to_concat,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr);
+                                    rmm::device_async_resource_ref mr);
 
 /**
- * @copydoc cudf::concatenate(host_span<table_view const>,rmm::mr::device_memory_resource*)
+ * @copydoc cudf::concatenate(host_span<table_view const>,rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
 std::unique_ptr<table> concatenate(host_span<table_view const> tables_to_concat,
                                    rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr);
+                                   rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace cudf
diff --git a/cpp/include/cudf/detail/concatenate_masks.hpp b/cpp/include/cudf/detail/concatenate_masks.hpp
index e7086ea17a5..dd2fb471a7d 100644
--- a/cpp/include/cudf/detail/concatenate_masks.hpp
+++ b/cpp/include/cudf/detail/concatenate_masks.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 //! Inner interfaces and implementations
@@ -59,13 +60,13 @@ size_type concatenate_masks(host_span<column_view const> views,
                             rmm::cuda_stream_view stream);
 
 /**
- * @copydoc cudf::concatenate_masks(host_span<column_view const>, rmm::mr::device_memory_resource*)
+ * @copydoc cudf::concatenate_masks(host_span<column_view const>, rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
 rmm::device_buffer concatenate_masks(host_span<column_view const> views,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr);
+                                     rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace cudf
diff --git a/cpp/include/cudf/detail/contiguous_split.hpp b/cpp/include/cudf/detail/contiguous_split.hpp
index d9a35470b7d..de00b61cdca 100644
--- a/cpp/include/cudf/detail/contiguous_split.hpp
+++ b/cpp/include/cudf/detail/contiguous_split.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace detail {
@@ -33,7 +34,7 @@ namespace detail {
 std::vector<packed_table> contiguous_split(cudf::table_view const& input,
                                            std::vector<size_type> const& splits,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr);
+                                           rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::pack
@@ -42,7 +43,7 @@ std::vector<packed_table> contiguous_split(cudf::table_view const& input,
  **/
 packed_columns pack(cudf::table_view const& input,
                     rmm::cuda_stream_view stream,
-                    rmm::mr::device_memory_resource* mr);
+                    rmm::device_async_resource_ref mr);
 
 // opaque implementation of `metadata_builder` since it needs to use
 // `serialized_column`, which is only defined in pack.cpp
diff --git a/cpp/include/cudf/detail/copy.hpp b/cpp/include/cudf/detail/copy.hpp
index 115822163c3..f7430eb090d 100644
--- a/cpp/include/cudf/detail/copy.hpp
+++ b/cpp/include/cudf/detail/copy.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,6 +24,7 @@
 #include <cudf/utilities/traits.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <initializer_list>
 
@@ -123,7 +124,7 @@ std::vector<table_view> split(table_view const& input,
 
 /**
  * @copydoc cudf::shift(column_view const&,size_type,scalar const&,
- * rmm::mr::device_memory_resource*)
+ * rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
@@ -131,7 +132,7 @@ std::unique_ptr<column> shift(column_view const& input,
                               size_type offset,
                               scalar const& fill_value,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr);
+                              rmm::device_async_resource_ref mr);
 
 /**
  * @brief Performs segmented shifts for specified values.
@@ -171,11 +172,11 @@ std::unique_ptr<column> segmented_shift(column_view const& segmented_values,
                                         size_type offset,
                                         scalar const& fill_value,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr);
+                                        rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::allocate_like(column_view const&, size_type, mask_allocation_policy,
- * rmm::mr::device_memory_resource*)
+ * rmm::device_async_resource_ref)
  *
  * @param[in] stream CUDA stream used for device memory operations and kernel launches.
  */
@@ -183,11 +184,11 @@ std::unique_ptr<column> allocate_like(column_view const& input,
                                       size_type size,
                                       mask_allocation_policy mask_alloc,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr);
+                                      rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::copy_if_else( column_view const&, column_view const&,
- * column_view const&, rmm::mr::device_memory_resource*)
+ * column_view const&, rmm::device_async_resource_ref)
  *
  * @param[in] stream CUDA stream used for device memory operations and kernel launches.
  */
@@ -195,11 +196,11 @@ std::unique_ptr<column> copy_if_else(column_view const& lhs,
                                      column_view const& rhs,
                                      column_view const& boolean_mask,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr);
+                                     rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::copy_if_else( scalar const&, column_view const&,
- * column_view const&, rmm::mr::device_memory_resource*)
+ * column_view const&, rmm::device_async_resource_ref)
  *
  * @param[in] stream CUDA stream used for device memory operations and kernel launches.
  */
@@ -207,11 +208,11 @@ std::unique_ptr<column> copy_if_else(scalar const& lhs,
                                      column_view const& rhs,
                                      column_view const& boolean_mask,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr);
+                                     rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::copy_if_else( column_view const&, scalar const&,
- * column_view const&, rmm::mr::device_memory_resource*)
+ * column_view const&, rmm::device_async_resource_ref)
  *
  * @param[in] stream CUDA stream used for device memory operations and kernel launches.
  */
@@ -219,11 +220,11 @@ std::unique_ptr<column> copy_if_else(column_view const& lhs,
                                      scalar const& rhs,
                                      column_view const& boolean_mask,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr);
+                                     rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::copy_if_else( scalar const&, scalar const&,
- * column_view const&, rmm::mr::device_memory_resource*)
+ * column_view const&, rmm::device_async_resource_ref)
  *
  * @param[in] stream CUDA stream used for device memory operations and kernel launches.
  */
@@ -231,7 +232,7 @@ std::unique_ptr<column> copy_if_else(scalar const& lhs,
                                      scalar const& rhs,
                                      column_view const& boolean_mask,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr);
+                                     rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::sample
@@ -243,7 +244,7 @@ std::unique_ptr<table> sample(table_view const& input,
                               sample_with_replacement replacement,
                               int64_t const seed,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr);
+                              rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::get_element
@@ -253,7 +254,7 @@ std::unique_ptr<table> sample(table_view const& input,
 std::unique_ptr<scalar> get_element(column_view const& input,
                                     size_type index,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr);
+                                    rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::has_nonempty_nulls
@@ -276,7 +277,7 @@ bool may_have_nonempty_nulls(column_view const& input, rmm::cuda_stream_view str
  */
 std::unique_ptr<column> purge_nonempty_nulls(column_view const& input,
                                              rmm::cuda_stream_view stream,
-                                             rmm::mr::device_memory_resource* mr);
+                                             rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace cudf
diff --git a/cpp/include/cudf/detail/copy_if.cuh b/cpp/include/cudf/detail/copy_if.cuh
index 3af050a5da6..c98057d077a 100644
--- a/cpp/include/cudf/detail/copy_if.cuh
+++ b/cpp/include/cudf/detail/copy_if.cuh
@@ -37,6 +37,7 @@
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cub/cub.cuh>
 #include <cuda/atomic>
@@ -239,7 +240,7 @@ struct scatter_gather_functor {
                                            Filter filter,
                                            cudf::size_type per_thread,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
   {
     auto output_column = cudf::detail::allocate_like(
       input, output_size, cudf::mask_allocation_policy::RETAIN, stream, mr);
@@ -286,7 +287,7 @@ struct scatter_gather_functor {
                                            Filter filter,
                                            cudf::size_type,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
   {
     rmm::device_uvector<cudf::size_type> indices(output_size, stream);
 
@@ -325,7 +326,7 @@ template <typename Filter>
 std::unique_ptr<table> copy_if(table_view const& input,
                                Filter filter,
                                rmm::cuda_stream_view stream,
-                               rmm::mr::device_memory_resource* mr)
+                               rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
 
diff --git a/cpp/include/cudf/detail/copy_if_else.cuh b/cpp/include/cudf/detail/copy_if_else.cuh
index 6162fa5ecf1..ac5cb0ad141 100644
--- a/cpp/include/cudf/detail/copy_if_else.cuh
+++ b/cpp/include/cudf/detail/copy_if_else.cuh
@@ -23,6 +23,7 @@
 #include <cudf/detail/utilities/integer_utils.hpp>
 
 #include <rmm/device_scalar.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/iterator_traits.h>
 #include <thrust/optional.h>
@@ -152,7 +153,7 @@ std::unique_ptr<column> copy_if_else(bool nullable,
                                      FilterFn filter,
                                      cudf::data_type output_type,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
 {
   // This is the type of the thrust::optional element in the passed iterators
   using Element = typename thrust::iterator_traits<LeftIter>::value_type::value_type;
diff --git a/cpp/include/cudf/detail/copy_range.cuh b/cpp/include/cudf/detail/copy_range.cuh
index 9f8b0f8b619..1b3b2056c6c 100644
--- a/cpp/include/cudf/detail/copy_range.cuh
+++ b/cpp/include/cudf/detail/copy_range.cuh
@@ -27,6 +27,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_scalar.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cub/cub.cuh>
 #include <cuda_runtime.h>
@@ -203,7 +204,7 @@ std::unique_ptr<column> copy_range(column_view const& source,
                                    size_type source_end,
                                    size_type target_begin,
                                    rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr);
+                                   rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace cudf
diff --git a/cpp/include/cudf/detail/datetime.hpp b/cpp/include/cudf/detail/datetime.hpp
index c5160958165..a93c06d4371 100644
--- a/cpp/include/cudf/detail/datetime.hpp
+++ b/cpp/include/cudf/detail/datetime.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,156 +19,158 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 #include <memory>
 
 namespace cudf {
 namespace datetime {
 namespace detail {
 /**
- * @copydoc cudf::extract_year(cudf::column_view const&, rmm::mr::device_memory_resource *)
+ * @copydoc cudf::extract_year(cudf::column_view const&, rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
 std::unique_ptr<cudf::column> extract_year(cudf::column_view const& column,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr);
+                                           rmm::device_async_resource_ref mr);
 
 /**
- * @copydoc cudf::extract_month(cudf::column_view const&, rmm::mr::device_memory_resource *)
+ * @copydoc cudf::extract_month(cudf::column_view const&, rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
 std::unique_ptr<cudf::column> extract_month(cudf::column_view const& column,
                                             rmm::cuda_stream_view stream,
-                                            rmm::mr::device_memory_resource* mr);
+                                            rmm::device_async_resource_ref mr);
 
 /**
- * @copydoc cudf::extract_day(cudf::column_view const&, rmm::mr::device_memory_resource *)
+ * @copydoc cudf::extract_day(cudf::column_view const&, rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
 std::unique_ptr<cudf::column> extract_day(cudf::column_view const& column,
                                           rmm::cuda_stream_view stream,
-                                          rmm::mr::device_memory_resource* mr);
+                                          rmm::device_async_resource_ref mr);
 
 /**
- * @copydoc cudf::extract_weekday(cudf::column_view const&, rmm::mr::device_memory_resource *)
+ * @copydoc cudf::extract_weekday(cudf::column_view const&, rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
 std::unique_ptr<cudf::column> extract_weekday(cudf::column_view const& column,
                                               rmm::cuda_stream_view stream,
-                                              rmm::mr::device_memory_resource* mr);
+                                              rmm::device_async_resource_ref mr);
 
 /**
- * @copydoc cudf::extract_hour(cudf::column_view const&, rmm::mr::device_memory_resource *)
+ * @copydoc cudf::extract_hour(cudf::column_view const&, rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
 std::unique_ptr<cudf::column> extract_hour(cudf::column_view const& column,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr);
+                                           rmm::device_async_resource_ref mr);
 
 /**
- * @copydoc cudf::extract_minute(cudf::column_view const&, rmm::mr::device_memory_resource *)
+ * @copydoc cudf::extract_minute(cudf::column_view const&, rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
 std::unique_ptr<cudf::column> extract_minute(cudf::column_view const& column,
                                              rmm::cuda_stream_view stream,
-                                             rmm::mr::device_memory_resource* mr);
+                                             rmm::device_async_resource_ref mr);
 
 /**
- * @copydoc cudf::extract_second(cudf::column_view const&, rmm::mr::device_memory_resource *)
+ * @copydoc cudf::extract_second(cudf::column_view const&, rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
 std::unique_ptr<cudf::column> extract_second(cudf::column_view const& column,
                                              rmm::cuda_stream_view stream,
-                                             rmm::mr::device_memory_resource* mr);
+                                             rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::extract_millisecond_fraction(cudf::column_view const&,
- * rmm::mr::device_memory_resource *)
+ * rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
 std::unique_ptr<cudf::column> extract_millisecond_fraction(cudf::column_view const& column,
                                                            rmm::cuda_stream_view stream,
-                                                           rmm::mr::device_memory_resource* mr);
+                                                           rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::extract_microsecond_fraction(cudf::column_view const&,
- * rmm::mr::device_memory_resource *)
+ * rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
 std::unique_ptr<cudf::column> extract_microsecond_fraction(cudf::column_view const& column,
                                                            rmm::cuda_stream_view stream,
-                                                           rmm::mr::device_memory_resource* mr);
+                                                           rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::extract_nanosecond_fraction(cudf::column_view const&,
- * rmm::mr::device_memory_resource *)
+ * rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
 std::unique_ptr<cudf::column> extract_nanosecond_fraction(cudf::column_view const& column,
                                                           rmm::cuda_stream_view stream,
-                                                          rmm::mr::device_memory_resource* mr);
+                                                          rmm::device_async_resource_ref mr);
 
 /**
- * @copydoc cudf::last_day_of_month(cudf::column_view const&, rmm::mr::device_memory_resource *)
+ * @copydoc cudf::last_day_of_month(cudf::column_view const&, rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
 std::unique_ptr<cudf::column> last_day_of_month(cudf::column_view const& column,
                                                 rmm::cuda_stream_view stream,
-                                                rmm::mr::device_memory_resource* mr);
+                                                rmm::device_async_resource_ref mr);
 
 /**
- * @copydoc cudf::day_of_year(cudf::column_view const&, rmm::mr::device_memory_resource *)
+ * @copydoc cudf::day_of_year(cudf::column_view const&, rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
 std::unique_ptr<cudf::column> day_of_year(cudf::column_view const& column,
                                           rmm::cuda_stream_view stream,
-                                          rmm::mr::device_memory_resource* mr);
+                                          rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::add_calendrical_months(cudf::column_view const&, cudf::column_view const&,
- * rmm::mr::device_memory_resource *)
+ * rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
 std::unique_ptr<cudf::column> add_calendrical_months(cudf::column_view const& timestamps,
                                                      cudf::column_view const& months,
                                                      rmm::cuda_stream_view stream,
-                                                     rmm::mr::device_memory_resource* mr);
+                                                     rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::add_calendrical_months(cudf::column_view const&, cudf::scalar const&,
- * rmm::mr::device_memory_resource *)
+ * rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
 std::unique_ptr<cudf::column> add_calendrical_months(cudf::column_view const& timestamps,
                                                      cudf::scalar const& months,
                                                      rmm::cuda_stream_view stream,
-                                                     rmm::mr::device_memory_resource* mr);
+                                                     rmm::device_async_resource_ref mr);
 
 /**
- * @copydoc cudf::is_leap_year(cudf::column_view const&, rmm::mr::device_memory_resource *)
+ * @copydoc cudf::is_leap_year(cudf::column_view const&, rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
 std::unique_ptr<cudf::column> is_leap_year(cudf::column_view const& column,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr);
+                                           rmm::device_async_resource_ref mr);
 
 std::unique_ptr<cudf::column> extract_quarter(cudf::column_view const& column,
                                               rmm::cuda_stream_view stream,
-                                              rmm::mr::device_memory_resource* mr);
+                                              rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace datetime
diff --git a/cpp/include/cudf/detail/distinct_hash_join.cuh b/cpp/include/cudf/detail/distinct_hash_join.cuh
index e874151ed36..93d52d5dda3 100644
--- a/cpp/include/cudf/detail/distinct_hash_join.cuh
+++ b/cpp/include/cudf/detail/distinct_hash_join.cuh
@@ -21,6 +21,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuco/static_set.cuh>
 
@@ -148,12 +149,12 @@ struct distinct_hash_join {
    */
   std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
             std::unique_ptr<rmm::device_uvector<size_type>>>
-  inner_join(rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const;
+  inner_join(rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const;
 
   /**
    * @copydoc cudf::distinct_hash_join::left_join
    */
   std::unique_ptr<rmm::device_uvector<size_type>> left_join(
-    rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const;
+    rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const;
 };
 }  // namespace cudf::detail
diff --git a/cpp/include/cudf/detail/fill.hpp b/cpp/include/cudf/detail/fill.hpp
index caaccfb4851..6996cda6974 100644
--- a/cpp/include/cudf/detail/fill.hpp
+++ b/cpp/include/cudf/detail/fill.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <memory>
 
@@ -48,7 +49,7 @@ std::unique_ptr<column> fill(column_view const& input,
                              size_type end,
                              scalar const& value,
                              rmm::cuda_stream_view stream,
-                             rmm::mr::device_memory_resource* mr);
+                             rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace cudf
diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh
index 6492aa23e80..c9d350ce983 100644
--- a/cpp/include/cudf/detail/gather.cuh
+++ b/cpp/include/cudf/detail/gather.cuh
@@ -38,6 +38,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/functional.h>
 #include <thrust/gather.h>
@@ -174,7 +175,7 @@ struct column_gatherer {
                                      MapIterator gather_map_end,
                                      bool nullify_out_of_bounds,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     column_gatherer_impl<Element> gatherer{};
 
@@ -214,7 +215,7 @@ struct column_gatherer_impl<Element, std::enable_if_t<is_rep_layout_compatible<E
                                      MapIterator gather_map_end,
                                      bool nullify_out_of_bounds,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     auto const num_rows = cudf::distance(gather_map_begin, gather_map_end);
     auto const policy   = cudf::mask_allocation_policy::NEVER;
@@ -260,7 +261,7 @@ struct column_gatherer_impl<string_view> {
                                      MapItType gather_map_end,
                                      bool nullify_out_of_bounds,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     if (true == nullify_out_of_bounds) {
       return cudf::strings::detail::gather<true>(
@@ -334,7 +335,7 @@ struct column_gatherer_impl<list_view> {
                                      MapItRoot gather_map_end,
                                      bool nullify_out_of_bounds,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     lists_column_view list(column);
     auto gather_map_size = std::distance(gather_map_begin, gather_map_end);
@@ -397,7 +398,7 @@ struct column_gatherer_impl<dictionary32> {
                                      MapItType gather_map_end,
                                      bool nullify_out_of_bounds,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     dictionary_column_view dictionary(source_column);
     auto output_count = std::distance(gather_map_begin, gather_map_end);
@@ -448,7 +449,7 @@ struct column_gatherer_impl<struct_view> {
                                      MapItRoot gather_map_end,
                                      bool nullify_out_of_bounds,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     auto const gather_map_size = std::distance(gather_map_begin, gather_map_end);
     if (gather_map_size == 0) { return empty_like(column); }
@@ -554,7 +555,7 @@ void gather_bitmask(table_view const& source,
                     std::vector<std::unique_ptr<column>>& target,
                     gather_bitmask_op op,
                     rmm::cuda_stream_view stream,
-                    rmm::mr::device_memory_resource* mr)
+                    rmm::device_async_resource_ref mr)
 {
   if (target.empty()) { return; }
 
@@ -652,7 +653,7 @@ std::unique_ptr<table> gather(table_view const& source_table,
                               MapIterator gather_map_end,
                               out_of_bounds_policy bounds_policy,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr)
+                              rmm::device_async_resource_ref mr)
 {
   std::vector<std::unique_ptr<column>> destination_columns;
 
diff --git a/cpp/include/cudf/detail/gather.hpp b/cpp/include/cudf/detail/gather.hpp
index 034eb6c1282..36824f56895 100644
--- a/cpp/include/cudf/detail/gather.hpp
+++ b/cpp/include/cudf/detail/gather.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,6 +23,7 @@
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <memory>
 
@@ -66,12 +67,12 @@ std::unique_ptr<table> gather(table_view const& source_table,
                               out_of_bounds_policy bounds_policy,
                               negative_index_policy neg_indices,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr);
+                              rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::detail::gather(table_view const&,column_view const&,table_view
  * const&,cudf::out_of_bounds_policy,cudf::detail::negative_index_policy,rmm::cuda_stream_view,
- * rmm::mr::device_memory_resource*)
+ * rmm::device_async_resource_ref)
  *
  * @throws cudf::logic_error if `gather_map` span size is larger than max of `size_type`.
  */
@@ -80,7 +81,7 @@ std::unique_ptr<table> gather(table_view const& source_table,
                               out_of_bounds_policy bounds_policy,
                               negative_index_policy neg_indices,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr);
+                              rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace cudf
diff --git a/cpp/include/cudf/detail/groupby.hpp b/cpp/include/cudf/detail/groupby.hpp
index 0afa69be1a3..5a8c9b0a27f 100644
--- a/cpp/include/cudf/detail/groupby.hpp
+++ b/cpp/include/cudf/detail/groupby.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <memory>
 #include <utility>
@@ -45,7 +46,7 @@ std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby(
   host_span<aggregation_request const> requests,
   null_policy include_null_keys,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr);
+  rmm::device_async_resource_ref mr);
 }  // namespace hash
 
 }  // namespace detail
diff --git a/cpp/include/cudf/detail/groupby/group_replace_nulls.hpp b/cpp/include/cudf/detail/groupby/group_replace_nulls.hpp
index e081a626c75..389c7952875 100644
--- a/cpp/include/cudf/detail/groupby/group_replace_nulls.hpp
+++ b/cpp/include/cudf/detail/groupby/group_replace_nulls.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,6 +23,7 @@
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 namespace cudf {
 namespace groupby {
 namespace detail {
@@ -40,7 +41,7 @@ std::unique_ptr<column> group_replace_nulls(cudf::column_view const& grouped_val
                                             device_span<size_type const> group_labels,
                                             cudf::replace_policy replace_policy,
                                             rmm::cuda_stream_view stream,
-                                            rmm::mr::device_memory_resource* mr);
+                                            rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace groupby
diff --git a/cpp/include/cudf/detail/groupby/sort_helper.hpp b/cpp/include/cudf/detail/groupby/sort_helper.hpp
index 7b386eb5f03..567efedb9b2 100644
--- a/cpp/include/cudf/detail/groupby/sort_helper.hpp
+++ b/cpp/include/cudf/detail/groupby/sort_helper.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,6 +23,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace groupby {
@@ -87,7 +88,7 @@ struct sort_groupby_helper {
    */
   std::unique_ptr<column> sorted_values(column_view const& values,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr);
+                                        rmm::device_async_resource_ref mr);
 
   /**
    * @brief Groups a column of values according to `keys`
@@ -101,7 +102,7 @@ struct sort_groupby_helper {
    */
   std::unique_ptr<column> grouped_values(column_view const& values,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr);
+                                         rmm::device_async_resource_ref mr);
 
   /**
    * @brief Get a table of sorted unique keys
@@ -109,7 +110,7 @@ struct sort_groupby_helper {
    * @return a new table in which each row is a unique row in the sorted key table.
    */
   std::unique_ptr<table> unique_keys(rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr);
+                                     rmm::device_async_resource_ref mr);
 
   /**
    * @brief Get a table of sorted keys
@@ -117,7 +118,7 @@ struct sort_groupby_helper {
    * @return a new table containing the sorted keys.
    */
   std::unique_ptr<table> sorted_keys(rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr);
+                                     rmm::device_async_resource_ref mr);
 
   /**
    * @brief Get the number of groups in `keys`
diff --git a/cpp/include/cudf/detail/hash_reduce_by_row.cuh b/cpp/include/cudf/detail/hash_reduce_by_row.cuh
index 1df6848c575..dfe79646167 100644
--- a/cpp/include/cudf/detail/hash_reduce_by_row.cuh
+++ b/cpp/include/cudf/detail/hash_reduce_by_row.cuh
@@ -22,6 +22,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuco/static_map.cuh>
 #include <thrust/for_each.h>
@@ -124,7 +125,7 @@ rmm::device_uvector<OutputType> hash_reduce_by_row(
   ReduceFuncBuilder func_builder,
   OutputType init,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   auto const map_dview  = map.get_device_view();
   auto const row_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_input);
diff --git a/cpp/include/cudf/detail/interop.hpp b/cpp/include/cudf/detail/interop.hpp
index 296b68d22a9..5b2b9b5e69d 100644
--- a/cpp/include/cudf/detail/interop.hpp
+++ b/cpp/include/cudf/detail/interop.hpp
@@ -23,6 +23,8 @@
 #pragma nv_diag_suppress 611
 #pragma nv_diag_suppress 2810
 #endif
+#include <rmm/resource_ref.hpp>
+
 #include <arrow/api.h>
 #ifdef __CUDACC__
 #pragma nv_diag_default 611
@@ -47,7 +49,7 @@ namespace detail {
  */
 std::unique_ptr<table> from_dlpack(DLManagedTensor const* managed_tensor,
                                    rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr);
+                                   rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::to_dlpack
@@ -56,7 +58,7 @@ std::unique_ptr<table> from_dlpack(DLManagedTensor const* managed_tensor,
  */
 DLManagedTensor* to_dlpack(table_view const& input,
                            rmm::cuda_stream_view stream,
-                           rmm::mr::device_memory_resource* mr);
+                           rmm::device_async_resource_ref mr);
 
 // Creating arrow as per given type_id and buffer arguments
 template <typename... Ts>
@@ -127,19 +129,19 @@ std::shared_ptr<arrow::Scalar> to_arrow(cudf::scalar const& input,
                                         arrow::MemoryPool* ar_mr);
 /**
  * @copydoc cudf::from_arrow(arrow::Table const& input_table, rmm::cuda_stream_view stream,
- * rmm::mr::device_memory_resource* mr)
+ * rmm::device_async_resource_ref mr)
  */
 std::unique_ptr<table> from_arrow(arrow::Table const& input_table,
                                   rmm::cuda_stream_view stream,
-                                  rmm::mr::device_memory_resource* mr);
+                                  rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::from_arrow(arrow::Scalar const& input, rmm::cuda_stream_view stream,
- * rmm::mr::device_memory_resource* mr)
+ * rmm::device_async_resource_ref mr)
  */
 std::unique_ptr<cudf::scalar> from_arrow(arrow::Scalar const& input,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr);
+                                         rmm::device_async_resource_ref mr);
 
 /**
  * @brief Return a maximum precision for a given type.
diff --git a/cpp/include/cudf/detail/join.hpp b/cpp/include/cudf/detail/join.hpp
index 27d14874bce..aabfff746ea 100644
--- a/cpp/include/cudf/detail/join.hpp
+++ b/cpp/include/cudf/detail/join.hpp
@@ -24,6 +24,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
 #include <rmm/device_uvector.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuco/static_multimap.cuh>
 
@@ -105,7 +106,7 @@ struct hash_join {
   inner_join(cudf::table_view const& probe,
              std::optional<std::size_t> output_size,
              rmm::cuda_stream_view stream,
-             rmm::mr::device_memory_resource* mr) const;
+             rmm::device_async_resource_ref mr) const;
 
   /**
    * @copydoc cudf::hash_join::left_join
@@ -115,7 +116,7 @@ struct hash_join {
   left_join(cudf::table_view const& probe,
             std::optional<std::size_t> output_size,
             rmm::cuda_stream_view stream,
-            rmm::mr::device_memory_resource* mr) const;
+            rmm::device_async_resource_ref mr) const;
 
   /**
    * @copydoc cudf::hash_join::full_join
@@ -125,7 +126,7 @@ struct hash_join {
   full_join(cudf::table_view const& probe,
             std::optional<std::size_t> output_size,
             rmm::cuda_stream_view stream,
-            rmm::mr::device_memory_resource* mr) const;
+            rmm::device_async_resource_ref mr) const;
 
   /**
    * @copydoc cudf::hash_join::inner_join_size
@@ -144,7 +145,7 @@ struct hash_join {
    */
   std::size_t full_join_size(cudf::table_view const& probe,
                              rmm::cuda_stream_view stream,
-                             rmm::mr::device_memory_resource* mr) const;
+                             rmm::device_async_resource_ref mr) const;
 
  private:
   /**
@@ -169,7 +170,7 @@ struct hash_join {
                      join_kind join,
                      std::optional<std::size_t> output_size,
                      rmm::cuda_stream_view stream,
-                     rmm::mr::device_memory_resource* mr) const;
+                     rmm::device_async_resource_ref mr) const;
 
   /**
    * @copydoc cudf::detail::hash_join::probe_join_indices
@@ -184,7 +185,7 @@ struct hash_join {
                     join_kind join,
                     std::optional<std::size_t> output_size,
                     rmm::cuda_stream_view stream,
-                    rmm::mr::device_memory_resource* mr) const;
+                    rmm::device_async_resource_ref mr) const;
 };
 }  // namespace detail
 }  // namespace cudf
diff --git a/cpp/include/cudf/detail/label_bins.hpp b/cpp/include/cudf/detail/label_bins.hpp
index 50eeba58cdd..9f6dcce448d 100644
--- a/cpp/include/cudf/detail/label_bins.hpp
+++ b/cpp/include/cudf/detail/label_bins.hpp
@@ -25,6 +25,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 
@@ -40,7 +41,7 @@ namespace detail {
 /**
  * @copydoc cudf::label_bins(column_view const& input, column_view const& left_edges, inclusive
  * left_inclusive, column_view const& right_edges, inclusive right_inclusive, rmm::cuda_stream_view,
- * rmm::mr::device_memory_resource* mr)
+ * rmm::device_async_resource_ref mr)
  *
  * @param stream Stream view on which to allocate resources and queue execution.
  */
@@ -50,7 +51,7 @@ std::unique_ptr<column> label_bins(column_view const& input,
                                    column_view const& right_edges,
                                    inclusive right_inclusive,
                                    rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr);
+                                   rmm::device_async_resource_ref mr);
 
 /** @} */  // end of group
 }  // namespace detail
diff --git a/cpp/include/cudf/detail/merge.hpp b/cpp/include/cudf/detail/merge.hpp
index 2167a484214..837eda0d7b5 100644
--- a/cpp/include/cudf/detail/merge.hpp
+++ b/cpp/include/cudf/detail/merge.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,6 +17,7 @@
 #pragma once
 
 #include <rmm/device_uvector.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/pair.h>
 
@@ -45,7 +46,7 @@ using index_vector = rmm::device_uvector<index_type>;
  *            std::vector<cudf::size_type> const& key_cols,
  *            std::vector<cudf::order> const& column_order,
  *            std::vector<cudf::null_order> const& null_precedence,
- *            rmm::mr::device_memory_resource* mr)
+ *            rmm::device_async_resource_ref mr)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches
  */
@@ -54,7 +55,7 @@ std::unique_ptr<cudf::table> merge(std::vector<table_view> const& tables_to_merg
                                    std::vector<cudf::order> const& column_order,
                                    std::vector<cudf::null_order> const& null_precedence,
                                    rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr);
+                                   rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace cudf
diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh
index db373f47a01..e62675cbc8c 100644
--- a/cpp/include/cudf/detail/null_mask.cuh
+++ b/cpp/include/cudf/detail/null_mask.cuh
@@ -26,6 +26,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cub/block/block_reduce.cuh>
 #include <cub/device/device_segmented_reduce.cuh>
@@ -110,7 +111,7 @@ CUDF_KERNEL void offset_bitmask_binop(Binop op,
 
 /**
  * @copydoc bitmask_binop(Binop op, host_span<bitmask_type const* const>, host_span<size_type>
- * const, size_type, rmm::mr::device_memory_resource *)
+ * const, size_type, rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches
  */
@@ -120,7 +121,7 @@ std::pair<rmm::device_buffer, size_type> bitmask_binop(Binop op,
                                                        host_span<size_type const> masks_begin_bits,
                                                        size_type mask_size_bits,
                                                        rmm::cuda_stream_view stream,
-                                                       rmm::mr::device_memory_resource* mr)
+                                                       rmm::device_async_resource_ref mr)
 {
   auto dest_mask = rmm::device_buffer{bitmask_allocation_size_bytes(mask_size_bits), stream, mr};
   auto null_count =
@@ -163,7 +164,7 @@ size_type inplace_bitmask_binop(Binop op,
   CUDF_EXPECTS(std::all_of(masks.begin(), masks.end(), [](auto p) { return p != nullptr; }),
                "Mask pointer cannot be null");
 
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource();
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource();
   rmm::device_scalar<size_type> d_counter{0, stream, mr};
   rmm::device_uvector<bitmask_type const*> d_masks(masks.size(), stream, mr);
   rmm::device_uvector<size_type> d_begin_bits(masks_begin_bits.size(), stream, mr);
@@ -282,7 +283,7 @@ rmm::device_uvector<size_type> segmented_count_bits(bitmask_type const* bitmask,
                                                     OffsetIterator last_bit_indices_begin,
                                                     count_bits_policy count_bits,
                                                     rmm::cuda_stream_view stream,
-                                                    rmm::mr::device_memory_resource* mr)
+                                                    rmm::device_async_resource_ref mr)
 {
   auto const num_ranges =
     static_cast<size_type>(std::distance(first_bit_indices_begin, first_bit_indices_end));
@@ -541,7 +542,7 @@ std::pair<rmm::device_buffer, size_type> segmented_null_mask_reduction(
   null_policy null_handling,
   std::optional<bool> valid_initial_value,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   auto const segments_begin =
     thrust::make_zip_iterator(first_bit_indices_begin, last_bit_indices_begin);
diff --git a/cpp/include/cudf/detail/null_mask.hpp b/cpp/include/cudf/detail/null_mask.hpp
index 74e2ccd2ea1..04d8d663acb 100644
--- a/cpp/include/cudf/detail/null_mask.hpp
+++ b/cpp/include/cudf/detail/null_mask.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,7 @@
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <vector>
 
@@ -28,14 +29,14 @@ namespace cudf {
 namespace detail {
 
 /**
- * @copydoc cudf::create_null_mask(size_type, mask_state, rmm::mr::device_memory_resource*)
+ * @copydoc cudf::create_null_mask(size_type, mask_state, rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
 rmm::device_buffer create_null_mask(size_type size,
                                     mask_state state,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr);
+                                    rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::set_null_mask(bitmask_type*, size_type, size_type, bool)
@@ -194,7 +195,7 @@ std::vector<size_type> segmented_null_count(bitmask_type const* bitmask,
 
 /**
  * @copydoc cudf::copy_bitmask(bitmask_type const*, size_type, size_type,
- *rmm::mr::device_memory_resource*)
+ *rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
@@ -202,20 +203,20 @@ rmm::device_buffer copy_bitmask(bitmask_type const* mask,
                                 size_type begin_bit,
                                 size_type end_bit,
                                 rmm::cuda_stream_view stream,
-                                rmm::mr::device_memory_resource* mr);
+                                rmm::device_async_resource_ref mr);
 
 /**
- * @copydoc cudf::copy_bitmask(column_view const& view, rmm::mr::device_memory_resource*)
+ * @copydoc cudf::copy_bitmask(column_view const& view, rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
 rmm::device_buffer copy_bitmask(column_view const& view,
                                 rmm::cuda_stream_view stream,
-                                rmm::mr::device_memory_resource* mr);
+                                rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc bitmask_and(host_span<bitmask_type const* const>, host_span<size_type> const,
- * size_type, rmm::mr::device_memory_resource *)
+ * size_type, rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches
  */
@@ -223,7 +224,7 @@ std::pair<rmm::device_buffer, size_type> bitmask_and(host_span<bitmask_type cons
                                                      host_span<size_type const> masks_begin_bits,
                                                      size_type mask_size_bits,
                                                      rmm::cuda_stream_view stream,
-                                                     rmm::mr::device_memory_resource* mr);
+                                                     rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::bitmask_and
@@ -232,7 +233,7 @@ std::pair<rmm::device_buffer, size_type> bitmask_and(host_span<bitmask_type cons
  */
 std::pair<rmm::device_buffer, size_type> bitmask_and(table_view const& view,
                                                      rmm::cuda_stream_view stream,
-                                                     rmm::mr::device_memory_resource* mr);
+                                                     rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::bitmask_or
@@ -241,7 +242,7 @@ std::pair<rmm::device_buffer, size_type> bitmask_and(table_view const& view,
  */
 std::pair<rmm::device_buffer, size_type> bitmask_or(table_view const& view,
                                                     rmm::cuda_stream_view stream,
-                                                    rmm::mr::device_memory_resource* mr);
+                                                    rmm::device_async_resource_ref mr);
 
 /**
  * @brief Performs a bitwise AND of the specified bitmasks,
@@ -274,7 +275,7 @@ cudf::size_type inplace_bitmask_and(device_span<bitmask_type> dest_mask,
 void set_all_valid_null_masks(column_view const& input,
                               column& output,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr);
+                              rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 
diff --git a/cpp/include/cudf/detail/quantiles.hpp b/cpp/include/cudf/detail/quantiles.hpp
index ac37d923d85..6c188d2ca68 100644
--- a/cpp/include/cudf/detail/quantiles.hpp
+++ b/cpp/include/cudf/detail/quantiles.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace detail {
@@ -35,7 +36,7 @@ std::unique_ptr<column> quantile(column_view const& input,
                                  column_view const& ordered_indices,
                                  bool exact,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr);
+                                 rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::quantiles()
@@ -49,18 +50,18 @@ std::unique_ptr<table> quantiles(table_view const& input,
                                  std::vector<order> const& column_order,
                                  std::vector<null_order> const& null_precedence,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr);
+                                 rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::percentile_approx(tdigest_column_view const&, column_view const&,
- * rmm::mr::device_memory_resource*)
+ * rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
 std::unique_ptr<column> percentile_approx(tdigest::tdigest_column_view const& input,
                                           column_view const& percentiles,
                                           rmm::cuda_stream_view stream,
-                                          rmm::mr::device_memory_resource* mr);
+                                          rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace cudf
diff --git a/cpp/include/cudf/detail/repeat.hpp b/cpp/include/cudf/detail/repeat.hpp
index 883d5d158fb..abb9e45a95c 100644
--- a/cpp/include/cudf/detail/repeat.hpp
+++ b/cpp/include/cudf/detail/repeat.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <memory>
 
@@ -28,7 +29,7 @@ namespace detail {
 
 /**
  * @copydoc cudf::repeat(table_view const&, column_view const&, bool,
- * rmm::mr::device_memory_resource*)
+ * rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
@@ -36,18 +37,18 @@ std::unique_ptr<table> repeat(table_view const& input_table,
                               column_view const& count,
                               bool check_count,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr);
+                              rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::repeat(table_view const&, size_type,
- * rmm::mr::device_memory_resource*)
+ * rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
 std::unique_ptr<table> repeat(table_view const& input_table,
                               size_type count,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr);
+                              rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace cudf
diff --git a/cpp/include/cudf/detail/replace.hpp b/cpp/include/cudf/detail/replace.hpp
index da83f7b285d..46203bdf2f0 100644
--- a/cpp/include/cudf/detail/replace.hpp
+++ b/cpp/include/cudf/detail/replace.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <memory>
 
@@ -27,58 +28,58 @@ namespace cudf {
 namespace detail {
 /**
  * @copydoc cudf::replace_nulls(column_view const&, column_view const&,
- * rmm::mr::device_memory_resource*)
+ * rmm::device_async_resource_ref)
  *
  * @param[in] stream CUDA stream used for device memory operations and kernel launches.
  */
 std::unique_ptr<column> replace_nulls(column_view const& input,
                                       cudf::column_view const& replacement,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr);
+                                      rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::replace_nulls(column_view const&, scalar const&,
- * rmm::mr::device_memory_resource*)
+ * rmm::device_async_resource_ref)
  *
  * @param[in] stream CUDA stream used for device memory operations and kernel launches.
  */
 std::unique_ptr<column> replace_nulls(column_view const& input,
                                       scalar const& replacement,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr);
+                                      rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::replace_nulls(column_view const&, replace_policy const&,
- * rmm::mr::device_memory_resource*)
+ * rmm::device_async_resource_ref)
  *
  * @param[in] stream CUDA stream used for device memory operations and kernel launches.
  */
 std::unique_ptr<column> replace_nulls(column_view const& input,
                                       replace_policy const& replace_policy,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr);
+                                      rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::replace_nans(column_view const&, column_view const&,
- * rmm::mr::device_memory_resource*)
+ * rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
 std::unique_ptr<column> replace_nans(column_view const& input,
                                      column_view const& replacement,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr);
+                                     rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::replace_nans(column_view const&, scalar const&,
- * rmm::mr::device_memory_resource*)
+ * rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
 std::unique_ptr<column> replace_nans(column_view const& input,
                                      scalar const& replacement,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr);
+                                     rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::find_and_replace_all
@@ -89,7 +90,7 @@ std::unique_ptr<column> find_and_replace_all(column_view const& input_col,
                                              column_view const& values_to_replace,
                                              column_view const& replacement_values,
                                              rmm::cuda_stream_view stream,
-                                             rmm::mr::device_memory_resource* mr);
+                                             rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::normalize_nans_and_zeros
@@ -98,7 +99,7 @@ std::unique_ptr<column> find_and_replace_all(column_view const& input_col,
  */
 std::unique_ptr<column> normalize_nans_and_zeros(column_view const& input,
                                                  rmm::cuda_stream_view stream,
-                                                 rmm::mr::device_memory_resource* mr);
+                                                 rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace cudf
diff --git a/cpp/include/cudf/detail/reshape.hpp b/cpp/include/cudf/detail/reshape.hpp
index 5ab53690a23..7a1c3d6c4f0 100644
--- a/cpp/include/cudf/detail/reshape.hpp
+++ b/cpp/include/cudf/detail/reshape.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <memory>
 
@@ -33,7 +34,7 @@ namespace detail {
 std::unique_ptr<table> tile(table_view const& input,
                             size_type count,
                             rmm::cuda_stream_view,
-                            rmm::mr::device_memory_resource* mr);
+                            rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::interleave_columns
@@ -42,7 +43,7 @@ std::unique_ptr<table> tile(table_view const& input,
  */
 std::unique_ptr<column> interleave_columns(table_view const& input,
                                            rmm::cuda_stream_view,
-                                           rmm::mr::device_memory_resource* mr);
+                                           rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace cudf
diff --git a/cpp/include/cudf/detail/rolling.hpp b/cpp/include/cudf/detail/rolling.hpp
index da90217c254..ea6f38c421c 100644
--- a/cpp/include/cudf/detail/rolling.hpp
+++ b/cpp/include/cudf/detail/rolling.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <memory>
 
@@ -35,7 +36,7 @@ namespace detail {
  *            column_view const& following_window,
  *            size_type min_periods,
  *            rolling_aggregation const& agg,
- *            rmm::mr::device_memory_resource* mr)
+ *            rmm::device_async_resource_ref mr)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
@@ -45,7 +46,7 @@ std::unique_ptr<column> rolling_window(column_view const& input,
                                        size_type min_periods,
                                        rolling_aggregation const& agg,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr);
+                                       rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace cudf
diff --git a/cpp/include/cudf/detail/round.hpp b/cpp/include/cudf/detail/round.hpp
index cdfc7caef37..1a9c5c82c65 100644
--- a/cpp/include/cudf/detail/round.hpp
+++ b/cpp/include/cudf/detail/round.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 //! Inner interfaces and implementations
@@ -27,7 +28,7 @@ namespace detail {
 
 /**
  * @copydoc cudf::round(column_view const&, int32_t, rounding_method,
- * rmm::mr::device_memory_resource*)
+ * rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
@@ -35,7 +36,7 @@ std::unique_ptr<column> round(column_view const& input,
                               int32_t decimal_places,
                               rounding_method method,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr);
+                              rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace cudf
diff --git a/cpp/include/cudf/detail/scan.hpp b/cpp/include/cudf/detail/scan.hpp
index f4b2d51d0cb..54c25d0157c 100644
--- a/cpp/include/cudf/detail/scan.hpp
+++ b/cpp/include/cudf/detail/scan.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
 #include <cudf/detail/aggregation/aggregation.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace detail {
@@ -50,7 +51,7 @@ std::unique_ptr<column> scan_exclusive(column_view const& input,
                                        scan_aggregation const& agg,
                                        null_policy null_handling,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr);
+                                       rmm::device_async_resource_ref mr);
 
 /**
  * @brief Computes the inclusive scan of a column.
@@ -76,7 +77,7 @@ std::unique_ptr<column> scan_inclusive(column_view const& input,
                                        scan_aggregation const& agg,
                                        null_policy null_handling,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr);
+                                       rmm::device_async_resource_ref mr);
 
 /**
  * @brief Generate row ranks for a column.
@@ -88,7 +89,7 @@ std::unique_ptr<column> scan_inclusive(column_view const& input,
  */
 std::unique_ptr<column> inclusive_rank_scan(column_view const& order_by,
                                             rmm::cuda_stream_view stream,
-                                            rmm::mr::device_memory_resource* mr);
+                                            rmm::device_async_resource_ref mr);
 
 /**
  * @brief Generate row dense ranks for a column.
@@ -100,7 +101,7 @@ std::unique_ptr<column> inclusive_rank_scan(column_view const& order_by,
  */
 std::unique_ptr<column> inclusive_dense_rank_scan(column_view const& order_by,
                                                   rmm::cuda_stream_view stream,
-                                                  rmm::mr::device_memory_resource* mr);
+                                                  rmm::device_async_resource_ref mr);
 
 /**
  * @brief Generate row ONE_NORMALIZED percent ranks for a column.
@@ -113,7 +114,7 @@ std::unique_ptr<column> inclusive_dense_rank_scan(column_view const& order_by,
  * @return rank values.
  */
 std::unique_ptr<column> inclusive_one_normalized_percent_rank_scan(
-  column_view const& order_by, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr);
+  column_view const& order_by, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace cudf
diff --git a/cpp/include/cudf/detail/scatter.cuh b/cpp/include/cudf/detail/scatter.cuh
index dbf7bfa9527..7eb661f7833 100644
--- a/cpp/include/cudf/detail/scatter.cuh
+++ b/cpp/include/cudf/detail/scatter.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -33,6 +33,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/count.h>
 #include <thrust/distance.h>
@@ -145,7 +146,7 @@ struct column_scatterer_impl<Element, std::enable_if_t<cudf::is_fixed_width<Elem
                                      MapIterator scatter_map_end,
                                      column_view const& target,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr) const
+                                     rmm::device_async_resource_ref mr) const
   {
     auto result      = std::make_unique<column>(target, stream, mr);
     auto result_view = result->mutable_view();
@@ -170,7 +171,7 @@ struct column_scatterer_impl<string_view> {
                                      MapIterator scatter_map_end,
                                      column_view const& target,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr) const
+                                     rmm::device_async_resource_ref mr) const
   {
     auto d_column    = column_device_view::create(source, stream);
     auto const begin = d_column->begin<string_view>();
@@ -187,7 +188,7 @@ struct column_scatterer_impl<list_view> {
                                      MapIterator scatter_map_end,
                                      column_view const& target,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr) const
+                                     rmm::device_async_resource_ref mr) const
   {
     return cudf::lists::detail::scatter(
       source, scatter_map_begin, scatter_map_end, target, stream, mr);
@@ -202,7 +203,7 @@ struct column_scatterer_impl<dictionary32> {
                                      MapIterator scatter_map_end,
                                      column_view const& target_in,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr) const
+                                     rmm::device_async_resource_ref mr) const
   {
     if (target_in.is_empty())  // empty begets empty
       return make_empty_column(type_id::DICTIONARY32);
@@ -261,7 +262,7 @@ struct column_scatterer {
                                      MapIterator scatter_map_end,
                                      column_view const& target,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr) const
+                                     rmm::device_async_resource_ref mr) const
   {
     column_scatterer_impl<Element> scatterer{};
     return scatterer(source, scatter_map_begin, scatter_map_end, target, stream, mr);
@@ -276,7 +277,7 @@ struct column_scatterer_impl<struct_view> {
                                      MapItRoot scatter_map_end,
                                      column_view const& target,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr) const
+                                     rmm::device_async_resource_ref mr) const
   {
     CUDF_EXPECTS(source.num_children() == target.num_children(),
                  "Scatter source and target are not of the same type.");
@@ -391,7 +392,7 @@ std::unique_ptr<table> scatter(table_view const& source,
                                MapIterator scatter_map_end,
                                table_view const& target,
                                rmm::cuda_stream_view stream,
-                               rmm::mr::device_memory_resource* mr)
+                               rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
 
diff --git a/cpp/include/cudf/detail/scatter.hpp b/cpp/include/cudf/detail/scatter.hpp
index 94c795f31b2..95ed6af8c3c 100644
--- a/cpp/include/cudf/detail/scatter.hpp
+++ b/cpp/include/cudf/detail/scatter.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <memory>
 
@@ -63,11 +64,11 @@ std::unique_ptr<table> scatter(table_view const& source,
                                column_view const& scatter_map,
                                table_view const& target,
                                rmm::cuda_stream_view stream,
-                               rmm::mr::device_memory_resource* mr);
+                               rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::detail::scatter(table_view const&,column_view const&,table_view
- * const&,bool,rmm::cuda_stream_view,rmm::mr::device_memory_resource*)
+ * const&,bool,rmm::cuda_stream_view,rmm::device_async_resource_ref)
  *
  * @throws cudf::logic_error if `scatter_map` span size is larger than max of `size_type`.
  */
@@ -75,7 +76,7 @@ std::unique_ptr<table> scatter(table_view const& source,
                                device_span<size_type const> const scatter_map,
                                table_view const& target,
                                rmm::cuda_stream_view stream,
-                               rmm::mr::device_memory_resource* mr);
+                               rmm::device_async_resource_ref mr);
 
 /**
  * @brief Scatters a row of scalar values into a copy of the target table
@@ -110,13 +111,13 @@ std::unique_ptr<table> scatter(std::vector<std::reference_wrapper<scalar const>>
                                column_view const& indices,
                                table_view const& target,
                                rmm::cuda_stream_view stream,
-                               rmm::mr::device_memory_resource* mr);
+                               rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::boolean_mask_scatter(
                       table_view const& source, table_view const& target,
  *                    column_view const& boolean_mask,
- *                    rmm::mr::device_memory_resource *mr)
+ *                    rmm::device_async_resource_ref mr)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
@@ -124,14 +125,14 @@ std::unique_ptr<table> boolean_mask_scatter(table_view const& source,
                                             table_view const& target,
                                             column_view const& boolean_mask,
                                             rmm::cuda_stream_view stream,
-                                            rmm::mr::device_memory_resource* mr);
+                                            rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::boolean_mask_scatter(
  *                    std::vector<std::reference_wrapper<scalar>> const& source,
  *                    table_view const& target,
  *                    column_view const& boolean_mask,
- *                    rmm::mr::device_memory_resource *mr)
+ *                    rmm::device_async_resource_ref mr)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
@@ -140,7 +141,7 @@ std::unique_ptr<table> boolean_mask_scatter(
   table_view const& target,
   column_view const& boolean_mask,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr);
+  rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace cudf
diff --git a/cpp/include/cudf/detail/search.hpp b/cpp/include/cudf/detail/search.hpp
index 4277baf3edd..e60b18f4c8d 100644
--- a/cpp/include/cudf/detail/search.hpp
+++ b/cpp/include/cudf/detail/search.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,6 +23,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf::detail {
 /**
@@ -35,7 +36,7 @@ std::unique_ptr<column> lower_bound(table_view const& haystack,
                                     std::vector<order> const& column_order,
                                     std::vector<null_order> const& null_precedence,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr);
+                                    rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::upper_bound
@@ -47,24 +48,24 @@ std::unique_ptr<column> upper_bound(table_view const& haystack,
                                     std::vector<order> const& column_order,
                                     std::vector<null_order> const& null_precedence,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr);
+                                    rmm::device_async_resource_ref mr);
 
 /**
- * @copydoc cudf::contains(column_view const&, scalar const&, rmm::mr::device_memory_resource*)
+ * @copydoc cudf::contains(column_view const&, scalar const&, rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
 bool contains(column_view const& haystack, scalar const& needle, rmm::cuda_stream_view stream);
 
 /**
- * @copydoc cudf::contains(column_view const&, column_view const&, rmm::mr::device_memory_resource*)
+ * @copydoc cudf::contains(column_view const&, column_view const&, rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
 std::unique_ptr<column> contains(column_view const& haystack,
                                  column_view const& needles,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr);
+                                 rmm::device_async_resource_ref mr);
 
 /**
  * @brief Check if rows in the given `needles` table exist in the `haystack` table.
@@ -96,6 +97,6 @@ rmm::device_uvector<bool> contains(table_view const& haystack,
                                    null_equality compare_nulls,
                                    nan_equality compare_nans,
                                    rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr);
+                                   rmm::device_async_resource_ref mr);
 
 }  // namespace cudf::detail
diff --git a/cpp/include/cudf/detail/sequence.hpp b/cpp/include/cudf/detail/sequence.hpp
index 6f2a43b54de..a18a9d3b200 100644
--- a/cpp/include/cudf/detail/sequence.hpp
+++ b/cpp/include/cudf/detail/sequence.hpp
@@ -21,12 +21,13 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace detail {
 /**
  * @copydoc cudf::sequence(size_type size, scalar const& init, scalar const& step,
- *                                       rmm::mr::device_memory_resource* mr =
+ *                                       rmm::device_async_resource_ref mr =
  *rmm::mr::get_current_device_resource())
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
@@ -35,11 +36,11 @@ std::unique_ptr<column> sequence(size_type size,
                                  scalar const& init,
                                  scalar const& step,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr);
+                                 rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::sequence(size_type size, scalar const& init,
-                                         rmm::mr::device_memory_resource* mr =
+                                         rmm::device_async_resource_ref mr =
  rmm::mr::get_current_device_resource())
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
@@ -47,13 +48,13 @@ std::unique_ptr<column> sequence(size_type size,
 std::unique_ptr<column> sequence(size_type size,
                                  scalar const& init,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr);
+                                 rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::calendrical_month_sequence(size_type size,
  *                                           scalar const& init,
  *                                           size_type months,
- *                                           rmm::mr::device_memory_resource* mr)
+ *                                           rmm::device_async_resource_ref mr)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
@@ -61,7 +62,7 @@ std::unique_ptr<cudf::column> calendrical_month_sequence(size_type size,
                                                          scalar const& init,
                                                          size_type months,
                                                          rmm::cuda_stream_view stream,
-                                                         rmm::mr::device_memory_resource* mr);
+                                                         rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace cudf
diff --git a/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh b/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh
index 08917bfce24..63e4fca8915 100644
--- a/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh
+++ b/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh
@@ -23,6 +23,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/distance.h>
@@ -300,7 +301,7 @@ std::pair<std::unique_ptr<column>, size_type> make_offsets_child_column(
   InputIterator begin,
   InputIterator end,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   auto count          = static_cast<size_type>(std::distance(begin, end));
   auto offsets_column = make_numeric_column(
diff --git a/cpp/include/cudf/detail/sorting.hpp b/cpp/include/cudf/detail/sorting.hpp
index 97cc054da57..4ddba38a7e9 100644
--- a/cpp/include/cudf/detail/sorting.hpp
+++ b/cpp/include/cudf/detail/sorting.hpp
@@ -21,6 +21,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <memory>
 #include <vector>
@@ -37,7 +38,7 @@ std::unique_ptr<column> sorted_order(table_view const& input,
                                      std::vector<order> const& column_order,
                                      std::vector<null_order> const& null_precedence,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr);
+                                     rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::stable_sorted_order
@@ -48,7 +49,7 @@ std::unique_ptr<column> stable_sorted_order(table_view const& input,
                                             std::vector<order> const& column_order,
                                             std::vector<null_order> const& null_precedence,
                                             rmm::cuda_stream_view stream,
-                                            rmm::mr::device_memory_resource* mr);
+                                            rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::sort_by_key
@@ -60,7 +61,7 @@ std::unique_ptr<table> sort_by_key(table_view const& values,
                                    std::vector<order> const& column_order,
                                    std::vector<null_order> const& null_precedence,
                                    rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr);
+                                   rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::rank
@@ -74,7 +75,7 @@ std::unique_ptr<column> rank(column_view const& input,
                              null_order null_precedence,
                              bool percentage,
                              rmm::cuda_stream_view stream,
-                             rmm::mr::device_memory_resource* mr);
+                             rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::stable_sort_by_key
@@ -86,7 +87,7 @@ std::unique_ptr<table> stable_sort_by_key(table_view const& values,
                                           std::vector<order> const& column_order,
                                           std::vector<null_order> const& null_precedence,
                                           rmm::cuda_stream_view stream,
-                                          rmm::mr::device_memory_resource* mr);
+                                          rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::segmented_sorted_order
@@ -98,7 +99,7 @@ std::unique_ptr<column> segmented_sorted_order(table_view const& keys,
                                                std::vector<order> const& column_order,
                                                std::vector<null_order> const& null_precedence,
                                                rmm::cuda_stream_view stream,
-                                               rmm::mr::device_memory_resource* mr);
+                                               rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::stable_segmented_sorted_order
@@ -111,7 +112,7 @@ std::unique_ptr<column> stable_segmented_sorted_order(
   std::vector<order> const& column_order,
   std::vector<null_order> const& null_precedence,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr);
+  rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::segmented_sort_by_key
@@ -124,7 +125,7 @@ std::unique_ptr<table> segmented_sort_by_key(table_view const& values,
                                              std::vector<order> const& column_order,
                                              std::vector<null_order> const& null_precedence,
                                              rmm::cuda_stream_view stream,
-                                             rmm::mr::device_memory_resource* mr);
+                                             rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::stable_segmented_sort_by_key
@@ -137,7 +138,7 @@ std::unique_ptr<table> stable_segmented_sort_by_key(table_view const& values,
                                                     std::vector<order> const& column_order,
                                                     std::vector<null_order> const& null_precedence,
                                                     rmm::cuda_stream_view stream,
-                                                    rmm::mr::device_memory_resource* mr);
+                                                    rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::sort
@@ -148,7 +149,7 @@ std::unique_ptr<table> sort(table_view const& values,
                             std::vector<order> const& column_order,
                             std::vector<null_order> const& null_precedence,
                             rmm::cuda_stream_view stream,
-                            rmm::mr::device_memory_resource* mr);
+                            rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::stable_sort
@@ -159,7 +160,7 @@ std::unique_ptr<table> stable_sort(table_view const& values,
                                    std::vector<order> const& column_order,
                                    std::vector<null_order> const& null_precedence,
                                    rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr);
+                                   rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace cudf
diff --git a/cpp/include/cudf/detail/stream_compaction.hpp b/cpp/include/cudf/detail/stream_compaction.hpp
index 7f366c06a1c..e2974789ea1 100644
--- a/cpp/include/cudf/detail/stream_compaction.hpp
+++ b/cpp/include/cudf/detail/stream_compaction.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,12 +23,13 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace detail {
 /**
  * @copydoc cudf::drop_nulls(table_view const&, std::vector<size_type> const&,
- *                           cudf::size_type, rmm::mr::device_memory_resource*)
+ *                           cudf::size_type, rmm::device_async_resource_ref)
  *
  * @param[in] stream CUDA stream used for device memory operations and kernel launches.
  */
@@ -36,11 +37,11 @@ std::unique_ptr<table> drop_nulls(table_view const& input,
                                   std::vector<size_type> const& keys,
                                   cudf::size_type keep_threshold,
                                   rmm::cuda_stream_view stream,
-                                  rmm::mr::device_memory_resource* mr);
+                                  rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::drop_nans(table_view const&, std::vector<size_type> const&,
- *                          cudf::size_type, rmm::mr::device_memory_resource*)
+ *                          cudf::size_type, rmm::device_async_resource_ref)
  *
  * @param[in] stream CUDA stream used for device memory operations and kernel launches.
  */
@@ -48,7 +49,7 @@ std::unique_ptr<table> drop_nans(table_view const& input,
                                  std::vector<size_type> const& keys,
                                  cudf::size_type keep_threshold,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr);
+                                 rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::apply_boolean_mask
@@ -58,7 +59,7 @@ std::unique_ptr<table> drop_nans(table_view const& input,
 std::unique_ptr<table> apply_boolean_mask(table_view const& input,
                                           column_view const& boolean_mask,
                                           rmm::cuda_stream_view stream,
-                                          rmm::mr::device_memory_resource* mr);
+                                          rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::unique
@@ -70,7 +71,7 @@ std::unique_ptr<table> unique(table_view const& input,
                               duplicate_keep_option keep,
                               null_equality nulls_equal,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr);
+                              rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::distinct
@@ -83,7 +84,7 @@ std::unique_ptr<table> distinct(table_view const& input,
                                 null_equality nulls_equal,
                                 nan_equality nans_equal,
                                 rmm::cuda_stream_view stream,
-                                rmm::mr::device_memory_resource* mr);
+                                rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::stable_distinct
@@ -96,7 +97,7 @@ std::unique_ptr<table> stable_distinct(table_view const& input,
                                        null_equality nulls_equal,
                                        nan_equality nans_equal,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr);
+                                       rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::distinct_indices
@@ -108,7 +109,7 @@ rmm::device_uvector<size_type> distinct_indices(table_view const& input,
                                                 null_equality nulls_equal,
                                                 nan_equality nans_equal,
                                                 rmm::cuda_stream_view stream,
-                                                rmm::mr::device_memory_resource* mr);
+                                                rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::unique_count(column_view const&, null_policy, nan_policy)
diff --git a/cpp/include/cudf/detail/structs/utilities.hpp b/cpp/include/cudf/detail/structs/utilities.hpp
index c0a79142cef..e736514ac29 100644
--- a/cpp/include/cudf/detail/structs/utilities.hpp
+++ b/cpp/include/cudf/detail/structs/utilities.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,6 +23,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf::structs::detail {
 
@@ -175,7 +176,7 @@ class flattened_table {
   std::vector<null_order> const& null_precedence,
   column_nullability nullability,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr);
+  rmm::device_async_resource_ref mr);
 
 /**
  * @brief Superimpose nulls from a given null mask into the input column, using bitwise AND.
@@ -197,7 +198,7 @@ class flattened_table {
                                                         size_type null_count,
                                                         std::unique_ptr<column>&& input,
                                                         rmm::cuda_stream_view stream,
-                                                        rmm::mr::device_memory_resource* mr);
+                                                        rmm::device_async_resource_ref mr);
 
 /**
  * @brief Push down nulls from the given input column into its children columns, using bitwise AND.
@@ -222,7 +223,7 @@ class flattened_table {
  *         to be kept alive.
  */
 [[nodiscard]] std::pair<column_view, temporary_nullable_data> push_down_nulls(
-  column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr);
+  column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr);
 
 /**
  * @brief Push down nulls from columns of the input table into their children columns, using
@@ -249,7 +250,7 @@ class flattened_table {
  *         to be kept alive.
  */
 [[nodiscard]] std::pair<table_view, temporary_nullable_data> push_down_nulls(
-  table_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr);
+  table_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr);
 
 /**
  * @brief Checks if a column or any of its children is a struct column with structs that are null.
diff --git a/cpp/include/cudf/detail/tdigest/tdigest.hpp b/cpp/include/cudf/detail/tdigest/tdigest.hpp
index b529d4a2c53..bfd12c18fff 100644
--- a/cpp/include/cudf/detail/tdigest/tdigest.hpp
+++ b/cpp/include/cudf/detail/tdigest/tdigest.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,7 @@
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace tdigest {
@@ -70,7 +71,7 @@ std::unique_ptr<column> group_tdigest(column_view const& values,
                                       size_type num_groups,
                                       int max_centroids,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr);
+                                      rmm::device_async_resource_ref mr);
 
 /**
  * @brief Merges tdigests within the same group to generate a new tdigest.
@@ -113,7 +114,7 @@ std::unique_ptr<column> group_merge_tdigest(column_view const& values,
                                             size_type num_groups,
                                             int max_centroids,
                                             rmm::cuda_stream_view stream,
-                                            rmm::mr::device_memory_resource* mr);
+                                            rmm::device_async_resource_ref mr);
 
 /**
  * @brief Create a tdigest column from its constituent components.
@@ -139,7 +140,7 @@ std::unique_ptr<column> make_tdigest_column(size_type num_rows,
                                             std::unique_ptr<column>&& min_values,
                                             std::unique_ptr<column>&& max_values,
                                             rmm::cuda_stream_view stream,
-                                            rmm::mr::device_memory_resource* mr);
+                                            rmm::device_async_resource_ref mr);
 
 /**
  * @brief Create an empty tdigest column.
@@ -152,7 +153,7 @@ std::unique_ptr<column> make_tdigest_column(size_type num_rows,
  * @returns An empty tdigest column.
  */
 std::unique_ptr<column> make_empty_tdigest_column(rmm::cuda_stream_view stream,
-                                                  rmm::mr::device_memory_resource* mr);
+                                                  rmm::device_async_resource_ref mr);
 
 /**
  * @brief Create an empty tdigest scalar.
@@ -165,7 +166,7 @@ std::unique_ptr<column> make_empty_tdigest_column(rmm::cuda_stream_view stream,
  * @returns An empty tdigest scalar.
  */
 std::unique_ptr<scalar> make_empty_tdigest_scalar(rmm::cuda_stream_view stream,
-                                                  rmm::mr::device_memory_resource* mr);
+                                                  rmm::device_async_resource_ref mr);
 
 /**
  * @brief Generate a tdigest scalar from a set of numeric input values.
@@ -199,7 +200,7 @@ std::unique_ptr<scalar> make_empty_tdigest_scalar(rmm::cuda_stream_view stream,
 std::unique_ptr<scalar> reduce_tdigest(column_view const& values,
                                        int max_centroids,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr);
+                                       rmm::device_async_resource_ref mr);
 
 /**
  * @brief Merges multiple tdigest columns to generate a new tdigest scalar.
@@ -233,7 +234,7 @@ std::unique_ptr<scalar> reduce_tdigest(column_view const& values,
 std::unique_ptr<scalar> reduce_merge_tdigest(column_view const& input,
                                              int max_centroids,
                                              rmm::cuda_stream_view stream,
-                                             rmm::mr::device_memory_resource* mr);
+                                             rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace tdigest
diff --git a/cpp/include/cudf/detail/timezone.hpp b/cpp/include/cudf/detail/timezone.hpp
index f7f97c0a7c2..037164aa297 100644
--- a/cpp/include/cudf/detail/timezone.hpp
+++ b/cpp/include/cudf/detail/timezone.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,12 +18,13 @@
 #include <cudf/timezone.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf::detail {
 
 /**
  * @copydoc cudf::make_timezone_transition_table(std::optional<std::string_view>, std::string_view,
- * rmm::mr::device_memory_resource*)
+ * rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
@@ -31,6 +32,6 @@ std::unique_ptr<table> make_timezone_transition_table(
   std::optional<std::string_view> tzif_dir,
   std::string_view timezone_name,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 }  // namespace cudf::detail
diff --git a/cpp/include/cudf/detail/transform.hpp b/cpp/include/cudf/detail/transform.hpp
index 965fea84860..47e13fa2e5e 100644
--- a/cpp/include/cudf/detail/transform.hpp
+++ b/cpp/include/cudf/detail/transform.hpp
@@ -21,6 +21,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace detail {
@@ -34,7 +35,7 @@ std::unique_ptr<column> transform(column_view const& input,
                                   data_type output_type,
                                   bool is_ptx,
                                   rmm::cuda_stream_view stream,
-                                  rmm::mr::device_memory_resource* mr);
+                                  rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::compute_column
@@ -44,7 +45,7 @@ std::unique_ptr<column> transform(column_view const& input,
 std::unique_ptr<column> compute_column(table_view const& table,
                                        ast::expression const& expr,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr);
+                                       rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::nans_to_nulls
@@ -52,7 +53,7 @@ std::unique_ptr<column> compute_column(table_view const& table,
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
 std::pair<std::unique_ptr<rmm::device_buffer>, size_type> nans_to_nulls(
-  column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr);
+  column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::bools_to_mask
@@ -60,7 +61,7 @@ std::pair<std::unique_ptr<rmm::device_buffer>, size_type> nans_to_nulls(
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
 std::pair<std::unique_ptr<rmm::device_buffer>, cudf::size_type> bools_to_mask(
-  column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr);
+  column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::encode
@@ -68,7 +69,7 @@ std::pair<std::unique_ptr<rmm::device_buffer>, cudf::size_type> bools_to_mask(
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
 std::pair<std::unique_ptr<cudf::table>, std::unique_ptr<cudf::column>> encode(
-  cudf::table_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr);
+  cudf::table_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::one_hot_encode
@@ -78,7 +79,7 @@ std::pair<std::unique_ptr<cudf::table>, std::unique_ptr<cudf::column>> encode(
 std::pair<std::unique_ptr<column>, table_view> one_hot_encode(column_view const& input,
                                                               column_view const& categories,
                                                               rmm::cuda_stream_view stream,
-                                                              rmm::mr::device_memory_resource* mr);
+                                                              rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::mask_to_bools
@@ -89,7 +90,7 @@ std::unique_ptr<column> mask_to_bools(bitmask_type const* null_mask,
                                       size_type begin_bit,
                                       size_type end_bit,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr);
+                                      rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::row_bit_count
@@ -98,7 +99,7 @@ std::unique_ptr<column> mask_to_bools(bitmask_type const* null_mask,
  */
 std::unique_ptr<column> row_bit_count(table_view const& t,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr);
+                                      rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::segmented_row_bit_count
@@ -108,7 +109,7 @@ std::unique_ptr<column> row_bit_count(table_view const& t,
 std::unique_ptr<column> segmented_row_bit_count(table_view const& t,
                                                 size_type segment_length,
                                                 rmm::cuda_stream_view stream,
-                                                rmm::mr::device_memory_resource* mr);
+                                                rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace cudf
diff --git a/cpp/include/cudf/detail/transpose.hpp b/cpp/include/cudf/detail/transpose.hpp
index d0be51860b2..1f8effc8103 100644
--- a/cpp/include/cudf/detail/transpose.hpp
+++ b/cpp/include/cudf/detail/transpose.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace detail {
@@ -30,7 +31,7 @@ namespace detail {
  */
 std::pair<std::unique_ptr<column>, table_view> transpose(table_view const& input,
                                                          rmm::cuda_stream_view stream,
-                                                         rmm::mr::device_memory_resource* mr);
+                                                         rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace cudf
diff --git a/cpp/include/cudf/detail/unary.hpp b/cpp/include/cudf/detail/unary.hpp
index 12f864de572..5245cfdf079 100644
--- a/cpp/include/cudf/detail/unary.hpp
+++ b/cpp/include/cudf/detail/unary.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/transform.h>
 
@@ -50,7 +51,7 @@ std::unique_ptr<column> true_if(InputIterator begin,
                                 size_type size,
                                 Predicate p,
                                 rmm::cuda_stream_view stream,
-                                rmm::mr::device_memory_resource* mr)
+                                rmm::device_async_resource_ref mr)
 {
   auto output =
     make_numeric_column(data_type(type_id::BOOL8), size, mask_state::UNALLOCATED, stream, mr);
@@ -68,14 +69,14 @@ std::unique_ptr<column> true_if(InputIterator begin,
 std::unique_ptr<cudf::column> unary_operation(cudf::column_view const& input,
                                               cudf::unary_operator op,
                                               rmm::cuda_stream_view stream,
-                                              rmm::mr::device_memory_resource* mr);
+                                              rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::is_valid
  */
 std::unique_ptr<cudf::column> is_valid(cudf::column_view const& input,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr);
+                                       rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::cast
@@ -83,21 +84,21 @@ std::unique_ptr<cudf::column> is_valid(cudf::column_view const& input,
 std::unique_ptr<column> cast(column_view const& input,
                              data_type type,
                              rmm::cuda_stream_view stream,
-                             rmm::mr::device_memory_resource* mr);
+                             rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::is_nan
  */
 std::unique_ptr<column> is_nan(cudf::column_view const& input,
                                rmm::cuda_stream_view stream,
-                               rmm::mr::device_memory_resource* mr);
+                               rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::is_not_nan
  */
 std::unique_ptr<column> is_not_nan(cudf::column_view const& input,
                                    rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr);
+                                   rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace cudf
diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp
index 90ad98741ad..293a4096c57 100644
--- a/cpp/include/cudf/detail/utilities/vector_factories.hpp
+++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -28,6 +28,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/host_vector.h>
 
@@ -50,7 +51,7 @@ namespace detail {
 template <typename T>
 rmm::device_uvector<T> make_zeroed_device_uvector_async(std::size_t size,
                                                         rmm::cuda_stream_view stream,
-                                                        rmm::mr::device_memory_resource* mr)
+                                                        rmm::device_async_resource_ref mr)
 {
   rmm::device_uvector<T> ret(size, stream, mr);
   CUDF_CUDA_TRY(cudaMemsetAsync(ret.data(), 0, size * sizeof(T), stream.value()));
@@ -71,7 +72,7 @@ rmm::device_uvector<T> make_zeroed_device_uvector_async(std::size_t size,
 template <typename T>
 rmm::device_uvector<T> make_zeroed_device_uvector_sync(std::size_t size,
                                                        rmm::cuda_stream_view stream,
-                                                       rmm::mr::device_memory_resource* mr)
+                                                       rmm::device_async_resource_ref mr)
 {
   rmm::device_uvector<T> ret(size, stream, mr);
   CUDF_CUDA_TRY(cudaMemsetAsync(ret.data(), 0, size * sizeof(T), stream.value()));
@@ -94,7 +95,7 @@ rmm::device_uvector<T> make_zeroed_device_uvector_sync(std::size_t size,
 template <typename T>
 rmm::device_uvector<T> make_device_uvector_async(host_span<T const> source_data,
                                                  rmm::cuda_stream_view stream,
-                                                 rmm::mr::device_memory_resource* mr)
+                                                 rmm::device_async_resource_ref mr)
 {
   rmm::device_uvector<T> ret(source_data.size(), stream, mr);
   CUDF_CUDA_TRY(cudaMemcpyAsync(ret.data(),
@@ -123,7 +124,7 @@ template <
   std::enable_if_t<
     std::is_convertible_v<Container, host_span<typename Container::value_type const>>>* = nullptr>
 rmm::device_uvector<typename Container::value_type> make_device_uvector_async(
-  Container const& c, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr)
+  Container const& c, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)
 {
   return make_device_uvector_async(host_span<typename Container::value_type const>{c}, stream, mr);
 }
@@ -143,7 +144,7 @@ rmm::device_uvector<typename Container::value_type> make_device_uvector_async(
 template <typename T>
 rmm::device_uvector<T> make_device_uvector_async(device_span<T const> source_data,
                                                  rmm::cuda_stream_view stream,
-                                                 rmm::mr::device_memory_resource* mr)
+                                                 rmm::device_async_resource_ref mr)
 {
   rmm::device_uvector<T> ret(source_data.size(), stream, mr);
   CUDF_CUDA_TRY(cudaMemcpyAsync(ret.data(),
@@ -172,7 +173,7 @@ template <
   std::enable_if_t<
     std::is_convertible_v<Container, device_span<typename Container::value_type const>>>* = nullptr>
 rmm::device_uvector<typename Container::value_type> make_device_uvector_async(
-  Container const& c, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr)
+  Container const& c, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)
 {
   return make_device_uvector_async(
     device_span<typename Container::value_type const>{c}, stream, mr);
@@ -193,7 +194,7 @@ rmm::device_uvector<typename Container::value_type> make_device_uvector_async(
 template <typename T>
 rmm::device_uvector<T> make_device_uvector_sync(host_span<T const> source_data,
                                                 rmm::cuda_stream_view stream,
-                                                rmm::mr::device_memory_resource* mr)
+                                                rmm::device_async_resource_ref mr)
 {
   auto ret = make_device_uvector_async(source_data, stream, mr);
   stream.synchronize();
@@ -218,7 +219,7 @@ template <
   std::enable_if_t<
     std::is_convertible_v<Container, host_span<typename Container::value_type const>>>* = nullptr>
 rmm::device_uvector<typename Container::value_type> make_device_uvector_sync(
-  Container const& c, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr)
+  Container const& c, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)
 {
   return make_device_uvector_sync(host_span<typename Container::value_type const>{c}, stream, mr);
 }
@@ -238,7 +239,7 @@ rmm::device_uvector<typename Container::value_type> make_device_uvector_sync(
 template <typename T>
 rmm::device_uvector<T> make_device_uvector_sync(device_span<T const> source_data,
                                                 rmm::cuda_stream_view stream,
-                                                rmm::mr::device_memory_resource* mr)
+                                                rmm::device_async_resource_ref mr)
 {
   auto ret = make_device_uvector_async(source_data, stream, mr);
   stream.synchronize();
@@ -263,7 +264,7 @@ template <
   std::enable_if_t<
     std::is_convertible_v<Container, device_span<typename Container::value_type const>>>* = nullptr>
 rmm::device_uvector<typename Container::value_type> make_device_uvector_sync(
-  Container const& c, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr)
+  Container const& c, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)
 {
   return make_device_uvector_sync(device_span<typename Container::value_type const>{c}, stream, mr);
 }
diff --git a/cpp/include/cudf/detail/valid_if.cuh b/cpp/include/cudf/detail/valid_if.cuh
index d0073177445..66163d6059a 100644
--- a/cpp/include/cudf/detail/valid_if.cuh
+++ b/cpp/include/cudf/detail/valid_if.cuh
@@ -25,6 +25,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_scalar.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/distance.h>
 
@@ -90,7 +91,7 @@ std::pair<rmm::device_buffer, size_type> valid_if(InputIterator begin,
                                                   InputIterator end,
                                                   Predicate p,
                                                   rmm::cuda_stream_view stream,
-                                                  rmm::mr::device_memory_resource* mr)
+                                                  rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(begin <= end, "Invalid range.");
 
diff --git a/cpp/include/cudf/dictionary/detail/concatenate.hpp b/cpp/include/cudf/dictionary/detail/concatenate.hpp
index d74429484ce..55f3825b3ec 100644
--- a/cpp/include/cudf/dictionary/detail/concatenate.hpp
+++ b/cpp/include/cudf/dictionary/detail/concatenate.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,7 @@
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace dictionary {
@@ -39,7 +40,7 @@ namespace detail {
  */
 std::unique_ptr<column> concatenate(host_span<column_view const> columns,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr);
+                                    rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace dictionary
diff --git a/cpp/include/cudf/dictionary/detail/encode.hpp b/cpp/include/cudf/dictionary/detail/encode.hpp
index 2aad7dd80ed..3b5a3bbab56 100644
--- a/cpp/include/cudf/dictionary/detail/encode.hpp
+++ b/cpp/include/cudf/dictionary/detail/encode.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace dictionary {
@@ -54,7 +55,7 @@ namespace detail {
 std::unique_ptr<column> encode(column_view const& column,
                                data_type indices_type,
                                rmm::cuda_stream_view stream,
-                               rmm::mr::device_memory_resource* mr);
+                               rmm::device_async_resource_ref mr);
 
 /**
  * @brief Create a column by gathering the keys from the provided
@@ -73,7 +74,7 @@ std::unique_ptr<column> encode(column_view const& column,
  */
 std::unique_ptr<column> decode(dictionary_column_view const& dictionary_column,
                                rmm::cuda_stream_view stream,
-                               rmm::mr::device_memory_resource* mr);
+                               rmm::device_async_resource_ref mr);
 
 /**
  * @brief Return minimal integer type for the given number of elements.
diff --git a/cpp/include/cudf/dictionary/detail/merge.hpp b/cpp/include/cudf/dictionary/detail/merge.hpp
index cad495d0097..c4229690ff5 100644
--- a/cpp/include/cudf/dictionary/detail/merge.hpp
+++ b/cpp/include/cudf/dictionary/detail/merge.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
 #include <cudf/dictionary/dictionary_column_view.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace dictionary {
@@ -44,7 +45,7 @@ std::unique_ptr<column> merge(dictionary_column_view const& lcol,
                               dictionary_column_view const& rcol,
                               cudf::detail::index_vector const& row_order,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr);
+                              rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace dictionary
diff --git a/cpp/include/cudf/dictionary/detail/replace.hpp b/cpp/include/cudf/dictionary/detail/replace.hpp
index 0778baa84d6..81a91d57169 100644
--- a/cpp/include/cudf/dictionary/detail/replace.hpp
+++ b/cpp/include/cudf/dictionary/detail/replace.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace dictionary {
@@ -42,7 +43,7 @@ namespace detail {
 std::unique_ptr<column> replace_nulls(dictionary_column_view const& input,
                                       dictionary_column_view const& replacement,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr);
+                                      rmm::device_async_resource_ref mr);
 
 /**
  * @brief Create a new dictionary column by replacing nulls with a
@@ -59,7 +60,7 @@ std::unique_ptr<column> replace_nulls(dictionary_column_view const& input,
 std::unique_ptr<column> replace_nulls(dictionary_column_view const& input,
                                       scalar const& replacement,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr);
+                                      rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace dictionary
diff --git a/cpp/include/cudf/dictionary/detail/search.hpp b/cpp/include/cudf/dictionary/detail/search.hpp
index 62059306b9a..2563b96b214 100644
--- a/cpp/include/cudf/dictionary/detail/search.hpp
+++ b/cpp/include/cudf/dictionary/detail/search.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace dictionary {
@@ -27,14 +28,14 @@ namespace detail {
 
 /**
  * @copydoc cudf::dictionary::get_index(dictionary_column_view const&,scalar
- * const&,rmm::mr::device_memory_resource*)
+ * const&,rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
 std::unique_ptr<scalar> get_index(dictionary_column_view const& dictionary,
                                   scalar const& key,
                                   rmm::cuda_stream_view stream,
-                                  rmm::mr::device_memory_resource* mr);
+                                  rmm::device_async_resource_ref mr);
 
 /**
  * @brief Get the index for a key if it were added to the given dictionary.
@@ -58,7 +59,7 @@ std::unique_ptr<scalar> get_index(dictionary_column_view const& dictionary,
 std::unique_ptr<scalar> get_insert_index(dictionary_column_view const& dictionary,
                                          scalar const& key,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr);
+                                         rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace dictionary
diff --git a/cpp/include/cudf/dictionary/detail/update_keys.hpp b/cpp/include/cudf/dictionary/detail/update_keys.hpp
index 6fd743ad526..e8486a80afc 100644
--- a/cpp/include/cudf/dictionary/detail/update_keys.hpp
+++ b/cpp/include/cudf/dictionary/detail/update_keys.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace dictionary {
@@ -35,7 +36,7 @@ namespace detail {
 std::unique_ptr<column> add_keys(dictionary_column_view const& dictionary_column,
                                  column_view const& new_keys,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr);
+                                 rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::dictionary::remove_keys(dictionary_column_view const&,column_view
@@ -46,7 +47,7 @@ std::unique_ptr<column> add_keys(dictionary_column_view const& dictionary_column
 std::unique_ptr<column> remove_keys(dictionary_column_view const& dictionary_column,
                                     column_view const& keys_to_remove,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr);
+                                    rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::dictionary::remove_unused_keys(dictionary_column_view
@@ -56,7 +57,7 @@ std::unique_ptr<column> remove_keys(dictionary_column_view const& dictionary_col
  */
 std::unique_ptr<column> remove_unused_keys(dictionary_column_view const& dictionary_column,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr);
+                                           rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::dictionary::set_keys(dictionary_column_view
@@ -67,7 +68,7 @@ std::unique_ptr<column> remove_unused_keys(dictionary_column_view const& diction
 std::unique_ptr<column> set_keys(dictionary_column_view const& dictionary_column,
                                  column_view const& keys,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr);
+                                 rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc
@@ -78,7 +79,7 @@ std::unique_ptr<column> set_keys(dictionary_column_view const& dictionary_column
 std::vector<std::unique_ptr<column>> match_dictionaries(
   cudf::host_span<dictionary_column_view const> input,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr);
+  rmm::device_async_resource_ref mr);
 
 /**
  * @brief Create new dictionaries that have keys merged from dictionary columns
@@ -100,9 +101,7 @@ std::vector<std::unique_ptr<column>> match_dictionaries(
  * @return New dictionary columns and updated cudf::table_views.
  */
 std::pair<std::vector<std::unique_ptr<column>>, std::vector<table_view>> match_dictionaries(
-  std::vector<table_view> tables,
-  rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr);
+  std::vector<table_view> tables, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace dictionary
diff --git a/cpp/include/cudf/dictionary/dictionary_factories.hpp b/cpp/include/cudf/dictionary/dictionary_factories.hpp
index 821981ad148..7cdfa3bf9e5 100644
--- a/cpp/include/cudf/dictionary/dictionary_factories.hpp
+++ b/cpp/include/cudf/dictionary/dictionary_factories.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 /**
@@ -65,8 +66,8 @@ namespace cudf {
 std::unique_ptr<column> make_dictionary_column(
   column_view const& keys_column,
   column_view const& indices_column,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Construct a dictionary column by taking ownership of the provided keys
@@ -117,8 +118,8 @@ std::unique_ptr<column> make_dictionary_column(std::unique_ptr<column> keys_colu
 std::unique_ptr<column> make_dictionary_column(
   std::unique_ptr<column> keys_column,
   std::unique_ptr<column> indices_column,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
 }  // namespace cudf
diff --git a/cpp/include/cudf/dictionary/encode.hpp b/cpp/include/cudf/dictionary/encode.hpp
index 959b785bf87..768e2be2b0d 100644
--- a/cpp/include/cudf/dictionary/encode.hpp
+++ b/cpp/include/cudf/dictionary/encode.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
 #include <cudf/dictionary/dictionary_column_view.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace dictionary {
@@ -59,9 +60,9 @@ namespace dictionary {
  */
 std::unique_ptr<column> encode(
   column_view const& column,
-  data_type indices_type              = data_type{type_id::UINT32},
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  data_type indices_type            = data_type{type_id::UINT32},
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Create a column by gathering the keys from the provided
@@ -80,8 +81,8 @@ std::unique_ptr<column> encode(
  */
 std::unique_ptr<column> decode(
   dictionary_column_view const& dictionary_column,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
 }  // namespace dictionary
diff --git a/cpp/include/cudf/dictionary/search.hpp b/cpp/include/cudf/dictionary/search.hpp
index 1b72cf42acd..1dff6dc1d5d 100644
--- a/cpp/include/cudf/dictionary/search.hpp
+++ b/cpp/include/cudf/dictionary/search.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
 #include <cudf/scalar/scalar.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace dictionary {
@@ -44,8 +45,8 @@ namespace dictionary {
 std::unique_ptr<scalar> get_index(
   dictionary_column_view const& dictionary,
   scalar const& key,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
 }  // namespace dictionary
diff --git a/cpp/include/cudf/dictionary/update_keys.hpp b/cpp/include/cudf/dictionary/update_keys.hpp
index 40504c22edd..ce7057359a1 100644
--- a/cpp/include/cudf/dictionary/update_keys.hpp
+++ b/cpp/include/cudf/dictionary/update_keys.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace dictionary {
@@ -59,8 +60,8 @@ namespace dictionary {
 std::unique_ptr<column> add_keys(
   dictionary_column_view const& dictionary_column,
   column_view const& new_keys,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Create a new dictionary column by removing the specified keys
@@ -91,8 +92,8 @@ std::unique_ptr<column> add_keys(
 std::unique_ptr<column> remove_keys(
   dictionary_column_view const& dictionary_column,
   column_view const& keys_to_remove,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Create a new dictionary column by removing any keys
@@ -113,8 +114,8 @@ std::unique_ptr<column> remove_keys(
  */
 std::unique_ptr<column> remove_unused_keys(
   dictionary_column_view const& dictionary_column,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Create a new dictionary column by applying only the specified keys
@@ -147,8 +148,8 @@ std::unique_ptr<column> remove_unused_keys(
 std::unique_ptr<column> set_keys(
   dictionary_column_view const& dictionary_column,
   column_view const& keys,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Create new dictionaries that have keys merged from the input dictionaries.
@@ -163,8 +164,8 @@ std::unique_ptr<column> set_keys(
  */
 std::vector<std::unique_ptr<column>> match_dictionaries(
   cudf::host_span<dictionary_column_view const> input,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
 }  // namespace dictionary
diff --git a/cpp/include/cudf/filling.hpp b/cpp/include/cudf/filling.hpp
index 1268f488919..90139e8634a 100644
--- a/cpp/include/cudf/filling.hpp
+++ b/cpp/include/cudf/filling.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <memory>
 
@@ -91,8 +92,8 @@ std::unique_ptr<column> fill(
   size_type begin,
   size_type end,
   scalar const& value,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Repeat rows of a Table.
@@ -125,8 +126,8 @@ std::unique_ptr<column> fill(
 std::unique_ptr<table> repeat(
   table_view const& input_table,
   column_view const& count,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Repeat rows of a Table.
@@ -150,8 +151,8 @@ std::unique_ptr<table> repeat(
 std::unique_ptr<table> repeat(
   table_view const& input_table,
   size_type count,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Fills a column with a sequence of value specified by an initial value and a step.
@@ -181,8 +182,8 @@ std::unique_ptr<column> sequence(
   size_type size,
   scalar const& init,
   scalar const& step,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Fills a column with a sequence of value specified by an initial value and a step of 1.
@@ -208,8 +209,8 @@ std::unique_ptr<column> sequence(
 std::unique_ptr<column> sequence(
   size_type size,
   scalar const& init,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Generate a sequence of timestamps beginning at `init` and incrementing by `months` for
@@ -239,8 +240,8 @@ std::unique_ptr<cudf::column> calendrical_month_sequence(
   size_type size,
   scalar const& init,
   size_type months,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
 }  // namespace cudf
diff --git a/cpp/include/cudf/groupby.hpp b/cpp/include/cudf/groupby.hpp
index 1c31e8777a8..831ef68ed15 100644
--- a/cpp/include/cudf/groupby.hpp
+++ b/cpp/include/cudf/groupby.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -25,6 +25,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <memory>
 #include <utility>
@@ -184,17 +185,17 @@ class groupby {
    */
   std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> aggregate(
     host_span<aggregation_request const> requests,
-    rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+    rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
   /**
-   * @copydoc aggregate(host_span<aggregation_request const>, rmm::mr::device_memory_resource*)
+   * @copydoc aggregate(host_span<aggregation_request const>, rmm::device_async_resource_ref)
    *
    * @param stream CUDA stream used for device memory operations and kernel launches.
    */
   std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> aggregate(
     host_span<aggregation_request const> requests,
     rmm::cuda_stream_view stream,
-    rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+    rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
   /**
    * @brief Performs grouped scans on the specified values.
    *
@@ -248,7 +249,7 @@ class groupby {
    */
   std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> scan(
     host_span<scan_request const> requests,
-    rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+    rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
   /**
    * @brief Performs grouped shifts for specified values.
@@ -304,7 +305,7 @@ class groupby {
     table_view const& values,
     host_span<size_type const> offsets,
     std::vector<std::reference_wrapper<scalar const>> const& fill_values,
-    rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+    rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
   /**
    * @brief The grouped data corresponding to a groupby operation on a set of values.
@@ -332,8 +333,8 @@ class groupby {
    * returned groups
    * @return A `groups` object representing grouped keys and values
    */
-  groups get_groups(cudf::table_view values             = {},
-                    rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  groups get_groups(cudf::table_view values           = {},
+                    rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
   /**
    * @brief Performs grouped replace nulls on @p value
@@ -373,7 +374,7 @@ class groupby {
   std::pair<std::unique_ptr<table>, std::unique_ptr<table>> replace_nulls(
     table_view const& values,
     host_span<cudf::replace_policy const> replace_policies,
-    rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+    rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
  private:
   table_view _keys;                                      ///< Keys that determine grouping
@@ -404,18 +405,18 @@ class groupby {
   std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> dispatch_aggregation(
     host_span<aggregation_request const> requests,
     rmm::cuda_stream_view stream,
-    rmm::mr::device_memory_resource* mr);
+    rmm::device_async_resource_ref mr);
 
   // Sort-based groupby
   std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> sort_aggregate(
     host_span<aggregation_request const> requests,
     rmm::cuda_stream_view stream,
-    rmm::mr::device_memory_resource* mr);
+    rmm::device_async_resource_ref mr);
 
   std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> sort_scan(
     host_span<scan_request const> requests,
     rmm::cuda_stream_view stream,
-    rmm::mr::device_memory_resource* mr);
+    rmm::device_async_resource_ref mr);
 };
 /** @} */
 }  // namespace groupby
diff --git a/cpp/include/cudf/hashing.hpp b/cpp/include/cudf/hashing.hpp
index 83962b50a10..3c2f6dfe0d5 100644
--- a/cpp/include/cudf/hashing.hpp
+++ b/cpp/include/cudf/hashing.hpp
@@ -19,6 +19,7 @@
 #include <cudf/table/table_view.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 
@@ -58,9 +59,9 @@ namespace hashing {
  */
 std::unique_ptr<column> murmurhash3_x86_32(
   table_view const& input,
-  uint32_t seed                       = DEFAULT_HASH_SEED,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  uint32_t seed                     = DEFAULT_HASH_SEED,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Computes the MurmurHash3 64-bit hash value of each row in the given table
@@ -77,9 +78,9 @@ std::unique_ptr<column> murmurhash3_x86_32(
  */
 std::unique_ptr<table> murmurhash3_x64_128(
   table_view const& input,
-  uint64_t seed                       = DEFAULT_HASH_SEED,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  uint64_t seed                     = DEFAULT_HASH_SEED,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Computes the MD5 hash value of each row in the given table
@@ -92,8 +93,8 @@ std::unique_ptr<table> murmurhash3_x64_128(
  */
 std::unique_ptr<column> md5(
   table_view const& input,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Computes the SHA-1 hash value of each row in the given table
@@ -106,8 +107,8 @@ std::unique_ptr<column> md5(
  */
 std::unique_ptr<column> sha1(
   table_view const& input,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Computes the SHA-224 hash value of each row in the given table
@@ -120,8 +121,8 @@ std::unique_ptr<column> sha1(
  */
 std::unique_ptr<column> sha224(
   table_view const& input,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Computes the SHA-256 hash value of each row in the given table
@@ -134,8 +135,8 @@ std::unique_ptr<column> sha224(
  */
 std::unique_ptr<column> sha256(
   table_view const& input,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Computes the SHA-384 hash value of each row in the given table
@@ -148,8 +149,8 @@ std::unique_ptr<column> sha256(
  */
 std::unique_ptr<column> sha384(
   table_view const& input,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Computes the SHA-512 hash value of each row in the given table
@@ -162,8 +163,8 @@ std::unique_ptr<column> sha384(
  */
 std::unique_ptr<column> sha512(
   table_view const& input,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Computes the XXHash_64 hash value of each row in the given table
@@ -179,9 +180,9 @@ std::unique_ptr<column> sha512(
  */
 std::unique_ptr<column> xxhash_64(
   table_view const& input,
-  uint64_t seed                       = DEFAULT_HASH_SEED,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  uint64_t seed                     = DEFAULT_HASH_SEED,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 }  // namespace hashing
 
diff --git a/cpp/include/cudf/hashing/detail/hashing.hpp b/cpp/include/cudf/hashing/detail/hashing.hpp
index 88a43a64638..77266ceb48f 100644
--- a/cpp/include/cudf/hashing/detail/hashing.hpp
+++ b/cpp/include/cudf/hashing/detail/hashing.hpp
@@ -19,6 +19,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cstddef>
 #include <functional>
@@ -30,41 +31,41 @@ namespace detail {
 std::unique_ptr<column> murmurhash3_x86_32(table_view const& input,
                                            uint32_t seed,
                                            rmm::cuda_stream_view,
-                                           rmm::mr::device_memory_resource* mr);
+                                           rmm::device_async_resource_ref mr);
 
 std::unique_ptr<table> murmurhash3_x64_128(table_view const& input,
                                            uint64_t seed,
                                            rmm::cuda_stream_view,
-                                           rmm::mr::device_memory_resource* mr);
+                                           rmm::device_async_resource_ref mr);
 
 std::unique_ptr<column> md5(table_view const& input,
                             rmm::cuda_stream_view stream,
-                            rmm::mr::device_memory_resource* mr);
+                            rmm::device_async_resource_ref mr);
 
 std::unique_ptr<column> sha1(table_view const& input,
                              rmm::cuda_stream_view stream,
-                             rmm::mr::device_memory_resource* mr);
+                             rmm::device_async_resource_ref mr);
 
 std::unique_ptr<column> sha224(table_view const& input,
                                rmm::cuda_stream_view stream,
-                               rmm::mr::device_memory_resource* mr);
+                               rmm::device_async_resource_ref mr);
 
 std::unique_ptr<column> sha256(table_view const& input,
                                rmm::cuda_stream_view stream,
-                               rmm::mr::device_memory_resource* mr);
+                               rmm::device_async_resource_ref mr);
 
 std::unique_ptr<column> sha384(table_view const& input,
                                rmm::cuda_stream_view stream,
-                               rmm::mr::device_memory_resource* mr);
+                               rmm::device_async_resource_ref mr);
 
 std::unique_ptr<column> sha512(table_view const& input,
                                rmm::cuda_stream_view stream,
-                               rmm::mr::device_memory_resource* mr);
+                               rmm::device_async_resource_ref mr);
 
 std::unique_ptr<column> xxhash_64(table_view const& input,
                                   uint64_t seed,
                                   rmm::cuda_stream_view,
-                                  rmm::mr::device_memory_resource* mr);
+                                  rmm::device_async_resource_ref mr);
 
 /* Copyright 2005-2014 Daniel James.
  *
diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp
index 871f48e3aac..dc4d66a8f6e 100644
--- a/cpp/include/cudf/interop.hpp
+++ b/cpp/include/cudf/interop.hpp
@@ -23,6 +23,8 @@
 #pragma nv_diag_suppress 611
 #pragma nv_diag_suppress 2810
 #endif
+#include <rmm/resource_ref.hpp>
+
 #include <arrow/api.h>
 #ifdef __CUDACC__
 #pragma nv_diag_default 611
@@ -70,7 +72,7 @@ namespace cudf {
  */
 std::unique_ptr<table> from_dlpack(
   DLManagedTensor const* managed_tensor,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Convert a cudf table into a DLPack DLTensor
@@ -92,7 +94,7 @@ std::unique_ptr<table> from_dlpack(
  */
 DLManagedTensor* to_dlpack(
   table_view const& input,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
 
@@ -224,8 +226,8 @@ unique_schema_t to_arrow_schema(cudf::table_view const& input,
  */
 unique_device_array_t to_arrow_device(
   cudf::table&& table,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Create `ArrowDeviceArray` from cudf column and metadata
@@ -253,8 +255,8 @@ unique_device_array_t to_arrow_device(
  */
 unique_device_array_t to_arrow_device(
   cudf::column&& col,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Create `cudf::table` from given arrow Table input
@@ -267,8 +269,8 @@ unique_device_array_t to_arrow_device(
 
 std::unique_ptr<table> from_arrow(
   arrow::Table const& input,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Create `cudf::scalar` from given arrow Scalar input
@@ -281,8 +283,8 @@ std::unique_ptr<table> from_arrow(
 
 std::unique_ptr<cudf::scalar> from_arrow(
   arrow::Scalar const& input,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
 }  // namespace cudf
diff --git a/cpp/include/cudf/io/avro.hpp b/cpp/include/cudf/io/avro.hpp
index 89207302850..8bc74eb574c 100644
--- a/cpp/include/cudf/io/avro.hpp
+++ b/cpp/include/cudf/io/avro.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@
 #include <cudf/types.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <memory>
 #include <string>
@@ -216,7 +217,7 @@ class avro_reader_options_builder {
  */
 table_with_metadata read_avro(
   avro_reader_options const& options,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
 }  // namespace io
diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp
index 435583e805d..fdceda40e92 100644
--- a/cpp/include/cudf/io/csv.hpp
+++ b/cpp/include/cudf/io/csv.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@
 #include <cudf/utilities/error.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <memory>
 #include <string>
@@ -1315,8 +1316,8 @@ class csv_reader_options_builder {
  */
 table_with_metadata read_csv(
   csv_reader_options options,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
 /**
@@ -1721,8 +1722,8 @@ class csv_writer_options_builder {
  * @param mr Device memory resource to use for device memory allocation
  */
 void write_csv(csv_writer_options const& options,
-               rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-               rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+               rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+               rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
 }  // namespace io
diff --git a/cpp/include/cudf/io/detail/avro.hpp b/cpp/include/cudf/io/detail/avro.hpp
index fede8e62d9f..fe9f935d2cc 100644
--- a/cpp/include/cudf/io/detail/avro.hpp
+++ b/cpp/include/cudf/io/detail/avro.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace io {
@@ -39,7 +40,7 @@ namespace avro {
 table_with_metadata read_avro(std::unique_ptr<cudf::io::datasource>&& source,
                               avro_reader_options const& options,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr);
+                              rmm::device_async_resource_ref mr);
 
 }  // namespace avro
 }  // namespace detail
diff --git a/cpp/include/cudf/io/detail/csv.hpp b/cpp/include/cudf/io/detail/csv.hpp
index 40ddcf385b0..50c1a7c163d 100644
--- a/cpp/include/cudf/io/detail/csv.hpp
+++ b/cpp/include/cudf/io/detail/csv.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
 #include <cudf/io/csv.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace io {
@@ -38,7 +39,7 @@ namespace csv {
 table_with_metadata read_csv(std::unique_ptr<cudf::io::datasource>&& source,
                              csv_reader_options const& options,
                              rmm::cuda_stream_view stream,
-                             rmm::mr::device_memory_resource* mr);
+                             rmm::device_async_resource_ref mr);
 
 /**
  * @brief Write an entire dataset to CSV format.
@@ -55,7 +56,7 @@ void write_csv(data_sink* sink,
                host_span<std::string const> column_names,
                csv_writer_options const& options,
                rmm::cuda_stream_view stream,
-               rmm::mr::device_memory_resource* mr);
+               rmm::device_async_resource_ref mr);
 
 }  // namespace csv
 }  // namespace detail
diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp
index 3f7f7e9bb32..cf8e23c2d93 100644
--- a/cpp/include/cudf/io/detail/json.hpp
+++ b/cpp/include/cudf/io/detail/json.hpp
@@ -19,6 +19,7 @@
 #include <cudf/io/json.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf::io::json::detail {
 
@@ -35,7 +36,7 @@ namespace cudf::io::json::detail {
 table_with_metadata read_json(host_span<std::unique_ptr<datasource>> sources,
                               json_reader_options const& options,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr);
+                              rmm::device_async_resource_ref mr);
 
 /**
  * @brief Write an entire dataset to JSON format.
@@ -50,7 +51,7 @@ void write_json(data_sink* sink,
                 table_view const& table,
                 json_writer_options const& options,
                 rmm::cuda_stream_view stream,
-                rmm::mr::device_memory_resource* mr);
+                rmm::device_async_resource_ref mr);
 
 /**
  * @brief Normalize single quotes to double quotes using FST
@@ -61,7 +62,7 @@ void write_json(data_sink* sink,
  */
 rmm::device_uvector<char> normalize_single_quotes(rmm::device_uvector<char>&& inbuf,
                                                   rmm::cuda_stream_view stream,
-                                                  rmm::mr::device_memory_resource* mr);
+                                                  rmm::device_async_resource_ref mr);
 
 /**
  * @brief Normalize unquoted whitespace (space and tab characters) using FST
@@ -72,5 +73,5 @@ rmm::device_uvector<char> normalize_single_quotes(rmm::device_uvector<char>&& in
  */
 rmm::device_uvector<char> normalize_whitespace(rmm::device_uvector<char>&& inbuf,
                                                rmm::cuda_stream_view stream,
-                                               rmm::mr::device_memory_resource* mr);
+                                               rmm::device_async_resource_ref mr);
 }  // namespace cudf::io::json::detail
diff --git a/cpp/include/cudf/io/detail/orc.hpp b/cpp/include/cudf/io/detail/orc.hpp
index c63c952e148..9aeb9ae4267 100644
--- a/cpp/include/cudf/io/detail/orc.hpp
+++ b/cpp/include/cudf/io/detail/orc.hpp
@@ -23,6 +23,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <memory>
 #include <string>
@@ -57,7 +58,7 @@ class reader {
   explicit reader(std::vector<std::unique_ptr<cudf::io::datasource>>&& sources,
                   orc_reader_options const& options,
                   rmm::cuda_stream_view stream,
-                  rmm::mr::device_memory_resource* mr);
+                  rmm::device_async_resource_ref mr);
 
   /**
    * @brief Destructor explicitly declared to avoid inlining in header
diff --git a/cpp/include/cudf/io/detail/parquet.hpp b/cpp/include/cudf/io/detail/parquet.hpp
index df870f6f1e4..55338d422ad 100644
--- a/cpp/include/cudf/io/detail/parquet.hpp
+++ b/cpp/include/cudf/io/detail/parquet.hpp
@@ -27,6 +27,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <string>
 #include <vector>
@@ -65,7 +66,7 @@ class reader {
   explicit reader(std::vector<std::unique_ptr<cudf::io::datasource>>&& sources,
                   parquet_reader_options const& options,
                   rmm::cuda_stream_view stream,
-                  rmm::mr::device_memory_resource* mr);
+                  rmm::device_async_resource_ref mr);
 
   /**
    * @brief Destructor explicitly-declared to avoid inlined in header
@@ -145,7 +146,7 @@ class chunked_reader : private reader {
                           std::vector<std::unique_ptr<cudf::io::datasource>>&& sources,
                           parquet_reader_options const& options,
                           rmm::cuda_stream_view stream,
-                          rmm::mr::device_memory_resource* mr);
+                          rmm::device_async_resource_ref mr);
 
   /**
    * @brief Destructor explicitly-declared to avoid inlined in header.
diff --git a/cpp/include/cudf/io/detail/tokenize_json.hpp b/cpp/include/cudf/io/detail/tokenize_json.hpp
index b2ea29a85c3..d08c4e7c65a 100644
--- a/cpp/include/cudf/io/detail/tokenize_json.hpp
+++ b/cpp/include/cudf/io/detail/tokenize_json.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf::io::json {
 
@@ -133,7 +134,7 @@ std::pair<rmm::device_uvector<PdaTokenT>, rmm::device_uvector<SymbolOffsetT>> ge
   device_span<SymbolT const> json_in,
   cudf::io::json_reader_options const& options,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr);
+  rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 
diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
index 1f2628deea7..d8330b78f0e 100644
--- a/cpp/include/cudf/io/json.hpp
+++ b/cpp/include/cudf/io/json.hpp
@@ -22,6 +22,7 @@
 #include <cudf/types.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <map>
 #include <string>
@@ -612,8 +613,8 @@ class json_reader_options_builder {
  */
 table_with_metadata read_json(
   json_reader_options options,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
 
@@ -959,8 +960,8 @@ class json_writer_options_builder {
  * @param mr Device memory resource to use for device memory allocation
  */
 void write_json(json_writer_options const& options,
-                rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-                rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+                rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+                rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
 }  // namespace io
diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp
index 5cc9ea81f29..bceb258cb38 100644
--- a/cpp/include/cudf/io/orc.hpp
+++ b/cpp/include/cudf/io/orc.hpp
@@ -22,6 +22,7 @@
 #include <cudf/types.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <memory>
 #include <optional>
@@ -402,8 +403,8 @@ class orc_reader_options_builder {
  */
 table_with_metadata read_orc(
   orc_reader_options const& options,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
 /**
diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp
index dc035db8d39..f58bc48a37d 100644
--- a/cpp/include/cudf/io/parquet.hpp
+++ b/cpp/include/cudf/io/parquet.hpp
@@ -23,6 +23,7 @@
 #include <cudf/types.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <iostream>
 #include <memory>
@@ -409,8 +410,8 @@ class parquet_reader_options_builder {
  */
 table_with_metadata read_parquet(
   parquet_reader_options const& options,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief The chunked parquet reader class to read Parquet file iteratively in to a series of
@@ -446,8 +447,8 @@ class chunked_parquet_reader {
   chunked_parquet_reader(
     std::size_t chunk_read_limit,
     parquet_reader_options const& options,
-    rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-    rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+    rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+    rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
   /**
    * @brief Constructor for chunked reader.
@@ -472,8 +473,8 @@ class chunked_parquet_reader {
     std::size_t chunk_read_limit,
     std::size_t pass_read_limit,
     parquet_reader_options const& options,
-    rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-    rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+    rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+    rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
   /**
    * @brief Destructor, destroying the internal reader instance.
diff --git a/cpp/include/cudf/io/text/detail/tile_state.hpp b/cpp/include/cudf/io/text/detail/tile_state.hpp
index d42624aa9b7..aa9185b4983 100644
--- a/cpp/include/cudf/io/text/detail/tile_state.hpp
+++ b/cpp/include/cudf/io/text/detail/tile_state.hpp
@@ -16,6 +16,8 @@
 
 #pragma once
 
+#include <rmm/resource_ref.hpp>
+
 #include <cub/block/block_scan.cuh>
 #include <cuda/atomic>
 
@@ -81,7 +83,7 @@ struct scan_tile_state {
 
   scan_tile_state(cudf::size_type num_tiles,
                   rmm::cuda_stream_view stream,
-                  rmm::mr::device_memory_resource* mr)
+                  rmm::device_async_resource_ref mr)
     : tile_status(rmm::device_uvector<cuda::atomic<scan_tile_status, cuda::thread_scope_device>>(
         num_tiles, stream, mr)),
       tile_state_partial(rmm::device_uvector<T>(num_tiles, stream, mr)),
diff --git a/cpp/include/cudf/io/text/detail/trie.hpp b/cpp/include/cudf/io/text/detail/trie.hpp
index 7bb2e4e2ece..e0b9c7635e3 100644
--- a/cpp/include/cudf/io/text/detail/trie.hpp
+++ b/cpp/include/cudf/io/text/detail/trie.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <algorithm>
 #include <queue>
@@ -165,7 +166,7 @@ struct trie {
    */
   static trie create(std::string const& pattern,
                      rmm::cuda_stream_view stream,
-                     rmm::mr::device_memory_resource* mr)
+                     rmm::device_async_resource_ref mr)
 
   {
     return create(std::vector<std::string>{pattern}, stream, mr);
@@ -181,7 +182,7 @@ struct trie {
    */
   static trie create(std::vector<std::string> const& patterns,
                      rmm::cuda_stream_view stream,
-                     rmm::mr::device_memory_resource* mr)
+                     rmm::device_async_resource_ref mr)
   {
     std::vector<char> tokens;
     std::vector<uint8_t> transitions;
diff --git a/cpp/include/cudf/io/text/multibyte_split.hpp b/cpp/include/cudf/io/text/multibyte_split.hpp
index a7edc9be0e4..7abae7c754b 100644
--- a/cpp/include/cudf/io/text/multibyte_split.hpp
+++ b/cpp/include/cudf/io/text/multibyte_split.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@
 
 #include <rmm/mr/device/device_memory_resource.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <memory>
 #include <optional>
@@ -85,18 +86,18 @@ struct parse_options {
 std::unique_ptr<cudf::column> multibyte_split(
   data_chunk_source const& source,
   std::string const& delimiter,
-  parse_options options               = {},
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  parse_options options             = {},
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 std::unique_ptr<cudf::column> multibyte_split(
   data_chunk_source const& source,
   std::string const& delimiter,
   std::optional<byte_range_info> byte_range,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 std::unique_ptr<cudf::column> multibyte_split(data_chunk_source const& source,
                                               std::string const& delimiter,
-                                              rmm::mr::device_memory_resource* mr);
+                                              rmm::device_async_resource_ref mr);
 
 }  // namespace text
 }  // namespace io
diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp
index e343ad9ee32..825f758adbd 100644
--- a/cpp/include/cudf/join.hpp
+++ b/cpp/include/cudf/join.hpp
@@ -26,6 +26,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <optional>
 #include <utility>
@@ -95,8 +96,8 @@ std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
           std::unique_ptr<rmm::device_uvector<size_type>>>
 inner_join(cudf::table_view const& left_keys,
            cudf::table_view const& right_keys,
-           null_equality compare_nulls         = null_equality::EQUAL,
-           rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+           null_equality compare_nulls       = null_equality::EQUAL,
+           rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns a pair of row index vectors corresponding to a
@@ -135,8 +136,8 @@ std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
           std::unique_ptr<rmm::device_uvector<size_type>>>
 left_join(cudf::table_view const& left_keys,
           cudf::table_view const& right_keys,
-          null_equality compare_nulls         = null_equality::EQUAL,
-          rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+          null_equality compare_nulls       = null_equality::EQUAL,
+          rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns a pair of row index vectors corresponding to a
@@ -174,8 +175,8 @@ std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
           std::unique_ptr<rmm::device_uvector<size_type>>>
 full_join(cudf::table_view const& left_keys,
           cudf::table_view const& right_keys,
-          null_equality compare_nulls         = null_equality::EQUAL,
-          rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+          null_equality compare_nulls       = null_equality::EQUAL,
+          rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns a vector of row indices corresponding to a left semi-join
@@ -202,8 +203,8 @@ full_join(cudf::table_view const& left_keys,
 std::unique_ptr<rmm::device_uvector<size_type>> left_semi_join(
   cudf::table_view const& left_keys,
   cudf::table_view const& right_keys,
-  null_equality compare_nulls         = null_equality::EQUAL,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  null_equality compare_nulls       = null_equality::EQUAL,
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns a vector of row indices corresponding to a left anti join
@@ -233,8 +234,8 @@ std::unique_ptr<rmm::device_uvector<size_type>> left_semi_join(
 std::unique_ptr<rmm::device_uvector<size_type>> left_anti_join(
   cudf::table_view const& left_keys,
   cudf::table_view const& right_keys,
-  null_equality compare_nulls         = null_equality::EQUAL,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  null_equality compare_nulls       = null_equality::EQUAL,
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Performs a cross join on two tables (`left`, `right`)
@@ -261,7 +262,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> left_anti_join(
 std::unique_ptr<cudf::table> cross_join(
   cudf::table_view const& left,
   cudf::table_view const& right,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief The enum class to specify if any of the input join tables (`build` table and any later
@@ -340,7 +341,7 @@ class hash_join {
   inner_join(cudf::table_view const& probe,
              std::optional<std::size_t> output_size = {},
              rmm::cuda_stream_view stream           = cudf::get_default_stream(),
-             rmm::mr::device_memory_resource* mr    = rmm::mr::get_current_device_resource()) const;
+             rmm::device_async_resource_ref mr      = rmm::mr::get_current_device_resource()) const;
 
   /**
    * Returns the row indices that can be used to construct the result of performing
@@ -365,7 +366,7 @@ class hash_join {
   left_join(cudf::table_view const& probe,
             std::optional<std::size_t> output_size = {},
             rmm::cuda_stream_view stream           = cudf::get_default_stream(),
-            rmm::mr::device_memory_resource* mr    = rmm::mr::get_current_device_resource()) const;
+            rmm::device_async_resource_ref mr      = rmm::mr::get_current_device_resource()) const;
 
   /**
    * Returns the row indices that can be used to construct the result of performing
@@ -390,7 +391,7 @@ class hash_join {
   full_join(cudf::table_view const& probe,
             std::optional<std::size_t> output_size = {},
             rmm::cuda_stream_view stream           = cudf::get_default_stream(),
-            rmm::mr::device_memory_resource* mr    = rmm::mr::get_current_device_resource()) const;
+            rmm::device_async_resource_ref mr      = rmm::mr::get_current_device_resource()) const;
 
   /**
    * Returns the exact number of matches (rows) when performing an inner join with the specified
@@ -441,8 +442,8 @@ class hash_join {
    */
   std::size_t full_join_size(
     cudf::table_view const& probe,
-    rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-    rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const;
+    rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+    rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) const;
 
  private:
   const std::unique_ptr<impl_type const> _impl;
@@ -497,8 +498,8 @@ class distinct_hash_join {
    */
   std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
             std::unique_ptr<rmm::device_uvector<size_type>>>
-  inner_join(rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-             rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const;
+  inner_join(rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+             rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) const;
 
   /**
    * @brief Returns the build table indices that can be used to construct the result of performing
@@ -515,8 +516,8 @@ class distinct_hash_join {
    * join between two tables with `build` and `probe` as the join keys.
    */
   std::unique_ptr<rmm::device_uvector<size_type>> left_join(
-    rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-    rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const;
+    rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+    rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) const;
 
  private:
   using impl_type = typename cudf::detail::distinct_hash_join<HasNested>;  ///< Implementation type
@@ -561,12 +562,11 @@ class distinct_hash_join {
  */
 std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
           std::unique_ptr<rmm::device_uvector<size_type>>>
-conditional_inner_join(
-  table_view const& left,
-  table_view const& right,
-  ast::expression const& binary_predicate,
-  std::optional<std::size_t> output_size = {},
-  rmm::mr::device_memory_resource* mr    = rmm::mr::get_current_device_resource());
+conditional_inner_join(table_view const& left,
+                       table_view const& right,
+                       ast::expression const& binary_predicate,
+                       std::optional<std::size_t> output_size = {},
+                       rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns a pair of row index vectors corresponding to all pairs
@@ -611,7 +611,7 @@ conditional_left_join(table_view const& left,
                       table_view const& right,
                       ast::expression const& binary_predicate,
                       std::optional<std::size_t> output_size = {},
-                      rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+                      rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns a pair of row index vectors corresponding to all pairs
@@ -653,7 +653,7 @@ std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
 conditional_full_join(table_view const& left,
                       table_view const& right,
                       ast::expression const& binary_predicate,
-                      rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+                      rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns an index vector corresponding to all rows in the left table
@@ -692,7 +692,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> conditional_left_semi_join(
   table_view const& right,
   ast::expression const& binary_predicate,
   std::optional<std::size_t> output_size = {},
-  rmm::mr::device_memory_resource* mr    = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr      = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns an index vector corresponding to all rows in the left table
@@ -731,7 +731,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> conditional_left_anti_join(
   table_view const& right,
   ast::expression const& binary_predicate,
   std::optional<std::size_t> output_size = {},
-  rmm::mr::device_memory_resource* mr    = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr      = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns a pair of row index vectors corresponding to all pairs of
@@ -789,7 +789,7 @@ mixed_inner_join(
   ast::expression const& binary_predicate,
   null_equality compare_nulls = null_equality::EQUAL,
   std::optional<std::pair<std::size_t, device_span<size_type const>>> output_size_data = {},
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns a pair of row index vectors corresponding to all pairs of
@@ -849,7 +849,7 @@ mixed_left_join(
   ast::expression const& binary_predicate,
   null_equality compare_nulls = null_equality::EQUAL,
   std::optional<std::pair<std::size_t, device_span<size_type const>>> output_size_data = {},
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns a pair of row index vectors corresponding to all pairs of
@@ -909,7 +909,7 @@ mixed_full_join(
   ast::expression const& binary_predicate,
   null_equality compare_nulls = null_equality::EQUAL,
   std::optional<std::pair<std::size_t, device_span<size_type const>>> output_size_data = {},
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns an index vector corresponding to all rows in the left tables
@@ -955,8 +955,8 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_left_semi_join(
   table_view const& left_conditional,
   table_view const& right_conditional,
   ast::expression const& binary_predicate,
-  null_equality compare_nulls         = null_equality::EQUAL,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  null_equality compare_nulls       = null_equality::EQUAL,
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns an index vector corresponding to all rows in the left tables
@@ -1003,8 +1003,8 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_left_anti_join(
   table_view const& left_conditional,
   table_view const& right_conditional,
   ast::expression const& binary_predicate,
-  null_equality compare_nulls         = null_equality::EQUAL,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  null_equality compare_nulls       = null_equality::EQUAL,
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns the exact number of matches (rows) when performing a
@@ -1043,8 +1043,8 @@ std::pair<std::size_t, std::unique_ptr<rmm::device_uvector<size_type>>> mixed_in
   table_view const& left_conditional,
   table_view const& right_conditional,
   ast::expression const& binary_predicate,
-  null_equality compare_nulls         = null_equality::EQUAL,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  null_equality compare_nulls       = null_equality::EQUAL,
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns the exact number of matches (rows) when performing a
@@ -1083,8 +1083,8 @@ std::pair<std::size_t, std::unique_ptr<rmm::device_uvector<size_type>>> mixed_le
   table_view const& left_conditional,
   table_view const& right_conditional,
   ast::expression const& binary_predicate,
-  null_equality compare_nulls         = null_equality::EQUAL,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  null_equality compare_nulls       = null_equality::EQUAL,
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns the exact number of matches (rows) when performing a
@@ -1107,7 +1107,7 @@ std::size_t conditional_inner_join_size(
   table_view const& left,
   table_view const& right,
   ast::expression const& binary_predicate,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns the exact number of matches (rows) when performing a
@@ -1130,7 +1130,7 @@ std::size_t conditional_left_join_size(
   table_view const& left,
   table_view const& right,
   ast::expression const& binary_predicate,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns the exact number of matches (rows) when performing a
@@ -1153,7 +1153,7 @@ std::size_t conditional_left_semi_join_size(
   table_view const& left,
   table_view const& right,
   ast::expression const& binary_predicate,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns the exact number of matches (rows) when performing a
@@ -1176,6 +1176,6 @@ std::size_t conditional_left_anti_join_size(
   table_view const& left,
   table_view const& right,
   ast::expression const& binary_predicate,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 /** @} */  // end of group
 }  // namespace cudf
diff --git a/cpp/include/cudf/json/json.hpp b/cpp/include/cudf/json/json.hpp
index 944e0c26dd6..385e8e54bdc 100644
--- a/cpp/include/cudf/json/json.hpp
+++ b/cpp/include/cudf/json/json.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/optional.h>
 
@@ -167,9 +168,9 @@ class get_json_object_options {
 std::unique_ptr<cudf::column> get_json_object(
   cudf::strings_column_view const& col,
   cudf::string_scalar const& json_path,
-  get_json_object_options options     = get_json_object_options{},
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  get_json_object_options options   = get_json_object_options{},
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of doxygen group
 }  // namespace cudf
diff --git a/cpp/include/cudf/labeling/label_bins.hpp b/cpp/include/cudf/labeling/label_bins.hpp
index d8ea262dfe1..9091e31a9ea 100644
--- a/cpp/include/cudf/labeling/label_bins.hpp
+++ b/cpp/include/cudf/labeling/label_bins.hpp
@@ -22,6 +22,7 @@
 
 #include <rmm/mr/device/device_memory_resource.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 
@@ -74,8 +75,8 @@ std::unique_ptr<column> label_bins(
   inclusive left_inclusive,
   column_view const& right_edges,
   inclusive right_inclusive,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
 }  // namespace cudf
diff --git a/cpp/include/cudf/lists/combine.hpp b/cpp/include/cudf/lists/combine.hpp
index 0d9c1c157eb..853562acfff 100644
--- a/cpp/include/cudf/lists/combine.hpp
+++ b/cpp/include/cudf/lists/combine.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
 #include <cudf/lists/lists_column_view.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 
@@ -66,7 +67,7 @@ std::unique_ptr<column> concatenate_rows(
   table_view const& input,
   concatenate_null_policy null_policy = concatenate_null_policy::IGNORE,
   rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr   = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Concatenating multiple lists on the same row of a lists column into a single list.
@@ -97,7 +98,7 @@ std::unique_ptr<column> concatenate_list_elements(
   column_view const& input,
   concatenate_null_policy null_policy = concatenate_null_policy::IGNORE,
   rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr   = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
 }  // namespace lists
diff --git a/cpp/include/cudf/lists/contains.hpp b/cpp/include/cudf/lists/contains.hpp
index 7cf67ec9205..060882555aa 100644
--- a/cpp/include/cudf/lists/contains.hpp
+++ b/cpp/include/cudf/lists/contains.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
 #include <cudf/lists/lists_column_view.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace lists {
@@ -49,8 +50,8 @@ namespace lists {
 std::unique_ptr<column> contains(
   cudf::lists_column_view const& lists,
   cudf::scalar const& search_key,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Create a column of `bool` values indicating whether the list rows of the first
@@ -73,8 +74,8 @@ std::unique_ptr<column> contains(
 std::unique_ptr<column> contains(
   cudf::lists_column_view const& lists,
   cudf::column_view const& search_keys,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Create a column of `bool` values indicating whether each row in the `lists` column
@@ -95,8 +96,8 @@ std::unique_ptr<column> contains(
  */
 std::unique_ptr<column> contains_nulls(
   cudf::lists_column_view const& lists,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Option to choose whether `index_of()` returns the first or last match
@@ -138,9 +139,9 @@ enum class duplicate_find_option : int32_t {
 std::unique_ptr<column> index_of(
   cudf::lists_column_view const& lists,
   cudf::scalar const& search_key,
-  duplicate_find_option find_option   = duplicate_find_option::FIND_FIRST,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  duplicate_find_option find_option = duplicate_find_option::FIND_FIRST,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Create a column of values indicating the position of a search key
@@ -175,9 +176,9 @@ std::unique_ptr<column> index_of(
 std::unique_ptr<column> index_of(
   cudf::lists_column_view const& lists,
   cudf::column_view const& search_keys,
-  duplicate_find_option find_option   = duplicate_find_option::FIND_FIRST,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  duplicate_find_option find_option = duplicate_find_option::FIND_FIRST,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
 }  // namespace lists
diff --git a/cpp/include/cudf/lists/count_elements.hpp b/cpp/include/cudf/lists/count_elements.hpp
index e4bd0dca9ae..2b9f5aa5607 100644
--- a/cpp/include/cudf/lists/count_elements.hpp
+++ b/cpp/include/cudf/lists/count_elements.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
 #include <cudf/lists/lists_column_view.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace lists {
@@ -51,8 +52,8 @@ namespace lists {
  */
 std::unique_ptr<column> count_elements(
   lists_column_view const& input,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of lists_elements group
 
diff --git a/cpp/include/cudf/lists/detail/combine.hpp b/cpp/include/cudf/lists/detail/combine.hpp
index 4bc45e48a9f..bd4c01bbb4b 100644
--- a/cpp/include/cudf/lists/detail/combine.hpp
+++ b/cpp/include/cudf/lists/detail/combine.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,6 +19,8 @@
 #include <cudf/lists/combine.hpp>
 #include <cudf/lists/lists_column_view.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 namespace cudf {
 namespace lists {
 namespace detail {
@@ -30,7 +32,7 @@ namespace detail {
 std::unique_ptr<column> concatenate_rows(table_view const& input,
                                          concatenate_null_policy null_policy,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr);
+                                         rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::lists::concatenate_list_elements
@@ -40,7 +42,7 @@ std::unique_ptr<column> concatenate_rows(table_view const& input,
 std::unique_ptr<column> concatenate_list_elements(column_view const& input,
                                                   concatenate_null_policy null_policy,
                                                   rmm::cuda_stream_view stream,
-                                                  rmm::mr::device_memory_resource* mr);
+                                                  rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace lists
diff --git a/cpp/include/cudf/lists/detail/concatenate.hpp b/cpp/include/cudf/lists/detail/concatenate.hpp
index a1f149d4ccf..d67958ef260 100644
--- a/cpp/include/cudf/lists/detail/concatenate.hpp
+++ b/cpp/include/cudf/lists/detail/concatenate.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace lists {
@@ -45,7 +46,7 @@ namespace detail {
  */
 std::unique_ptr<column> concatenate(host_span<column_view const> columns,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr);
+                                    rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace lists
diff --git a/cpp/include/cudf/lists/detail/contains.hpp b/cpp/include/cudf/lists/detail/contains.hpp
index 58ec18cb9ef..638cc7afb81 100644
--- a/cpp/include/cudf/lists/detail/contains.hpp
+++ b/cpp/include/cudf/lists/detail/contains.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,6 +18,8 @@
 #include <cudf/lists/contains.hpp>
 #include <cudf/lists/lists_column_view.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 namespace cudf {
 namespace lists {
 namespace detail {
@@ -26,49 +28,49 @@ namespace detail {
  * @copydoc cudf::lists::index_of(cudf::lists_column_view const&,
  *                                cudf::scalar const&,
  *                                duplicate_find_option,
- *                                rmm::mr::device_memory_resource*)
+ *                                rmm::device_async_resource_ref)
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
 std::unique_ptr<column> index_of(cudf::lists_column_view const& lists,
                                  cudf::scalar const& search_key,
                                  cudf::lists::duplicate_find_option find_option,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr);
+                                 rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::lists::index_of(cudf::lists_column_view const&,
  *                                cudf::column_view const&,
  *                                duplicate_find_option,
- *                                rmm::mr::device_memory_resource*)
+ *                                rmm::device_async_resource_ref)
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
 std::unique_ptr<column> index_of(cudf::lists_column_view const& lists,
                                  cudf::column_view const& search_keys,
                                  cudf::lists::duplicate_find_option find_option,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr);
+                                 rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::lists::contains(cudf::lists_column_view const&,
  *                                cudf::scalar const&,
- *                                rmm::mr::device_memory_resource*)
+ *                                rmm::device_async_resource_ref)
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
 std::unique_ptr<column> contains(cudf::lists_column_view const& lists,
                                  cudf::scalar const& search_key,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr);
+                                 rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::lists::contains(cudf::lists_column_view const&,
  *                                cudf::column_view const&,
- *                                rmm::mr::device_memory_resource*)
+ *                                rmm::device_async_resource_ref)
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
 std::unique_ptr<column> contains(cudf::lists_column_view const& lists,
                                  cudf::column_view const& search_keys,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr);
+                                 rmm::device_async_resource_ref mr);
 }  // namespace detail
 }  // namespace lists
 }  // namespace cudf
diff --git a/cpp/include/cudf/lists/detail/copying.hpp b/cpp/include/cudf/lists/detail/copying.hpp
index 3760294f079..18a70bba5e9 100644
--- a/cpp/include/cudf/lists/detail/copying.hpp
+++ b/cpp/include/cudf/lists/detail/copying.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,6 +18,7 @@
 #include <cudf/lists/lists_column_view.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace lists {
@@ -46,7 +47,7 @@ std::unique_ptr<cudf::column> copy_slice(lists_column_view const& lists,
                                          size_type start,
                                          size_type end,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr);
+                                         rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace lists
diff --git a/cpp/include/cudf/lists/detail/extract.hpp b/cpp/include/cudf/lists/detail/extract.hpp
index 013f9b491dd..6f983d44bc9 100644
--- a/cpp/include/cudf/lists/detail/extract.hpp
+++ b/cpp/include/cudf/lists/detail/extract.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,29 +18,31 @@
 #include <cudf/lists/extract.hpp>
 #include <cudf/lists/lists_column_view.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 namespace cudf {
 namespace lists {
 namespace detail {
 
 /**
  * @copydoc cudf::lists::extract_list_element(lists_column_view, size_type,
- * rmm::mr::device_memory_resource*)
+ * rmm::device_async_resource_ref)
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
 std::unique_ptr<column> extract_list_element(lists_column_view lists_column,
                                              size_type const index,
                                              rmm::cuda_stream_view stream,
-                                             rmm::mr::device_memory_resource* mr);
+                                             rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::lists::extract_list_element(lists_column_view, column_view const&,
- * rmm::mr::device_memory_resource*)
+ * rmm::device_async_resource_ref)
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
 std::unique_ptr<column> extract_list_element(lists_column_view lists_column,
                                              column_view const& indices,
                                              rmm::cuda_stream_view stream,
-                                             rmm::mr::device_memory_resource* mr);
+                                             rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace lists
diff --git a/cpp/include/cudf/lists/detail/gather.cuh b/cpp/include/cudf/lists/detail/gather.cuh
index 03428bc347f..0cd77556f33 100644
--- a/cpp/include/cudf/lists/detail/gather.cuh
+++ b/cpp/include/cudf/lists/detail/gather.cuh
@@ -25,6 +25,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/functional.h>
@@ -59,7 +60,7 @@ struct gather_data {
  *                                 MapItType gather_map,
  *                                 size_type gather_map_size,
  *                                 rmm::cuda_stream_view stream,
- *                                 rmm::mr::device_memory_resource* mr)
+ *                                 rmm::device_async_resource_ref mr)
  *
  * @param prev_base_offsets The buffer backing the base offsets used in the gather map. We can
  *                          free this buffer before allocating the new one to keep peak memory
@@ -71,7 +72,7 @@ gather_data make_gather_data(cudf::lists_column_view const& source_column,
                              size_type gather_map_size,
                              rmm::device_uvector<int32_t>&& prev_base_offsets,
                              rmm::cuda_stream_view stream,
-                             rmm::mr::device_memory_resource* mr)
+                             rmm::device_async_resource_ref mr)
 {
   // size of the gather map is the # of output rows
   size_type output_count = gather_map_size;
@@ -252,7 +253,7 @@ gather_data make_gather_data(cudf::lists_column_view const& source_column,
                              MapItType gather_map,
                              size_type gather_map_size,
                              rmm::cuda_stream_view stream,
-                             rmm::mr::device_memory_resource* mr)
+                             rmm::device_async_resource_ref mr)
 {
   return make_gather_data<NullifyOutOfBounds, MapItType>(
     source_column,
@@ -278,7 +279,7 @@ gather_data make_gather_data(cudf::lists_column_view const& source_column,
 std::unique_ptr<column> gather_list_nested(lists_column_view const& list,
                                            gather_data& gd,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr);
+                                           rmm::device_async_resource_ref mr);
 
 /**
  * @brief Gather a leaf column from a hierarchy of list columns.
@@ -295,13 +296,13 @@ std::unique_ptr<column> gather_list_nested(lists_column_view const& list,
 std::unique_ptr<column> gather_list_leaf(column_view const& column,
                                          gather_data const& gd,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr);
+                                         rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::lists::segmented_gather(lists_column_view const& source_column,
  *                                        lists_column_view const& gather_map_list,
  *                                        out_of_bounds_policy bounds_policy,
- *                                        rmm::mr::device_memory_resource* mr)
+ *                                        rmm::device_async_resource_ref mr)
  *
  * @param stream CUDA stream on which to execute kernels
  */
@@ -309,7 +310,7 @@ std::unique_ptr<column> segmented_gather(lists_column_view const& source_column,
                                          lists_column_view const& gather_map_list,
                                          out_of_bounds_policy bounds_policy,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr);
+                                         rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace lists
diff --git a/cpp/include/cudf/lists/detail/interleave_columns.hpp b/cpp/include/cudf/lists/detail/interleave_columns.hpp
index a5cf67c95b9..3aff93840a9 100644
--- a/cpp/include/cudf/lists/detail/interleave_columns.hpp
+++ b/cpp/include/cudf/lists/detail/interleave_columns.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
 #include <cudf/table/table_view.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace lists {
@@ -47,7 +48,7 @@ namespace detail {
 std::unique_ptr<column> interleave_columns(table_view const& input,
                                            bool has_null_mask,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr);
+                                           rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace lists
diff --git a/cpp/include/cudf/lists/detail/lists_column_factories.hpp b/cpp/include/cudf/lists/detail/lists_column_factories.hpp
index 7b821a00b0d..192aee8d811 100644
--- a/cpp/include/cudf/lists/detail/lists_column_factories.hpp
+++ b/cpp/include/cudf/lists/detail/lists_column_factories.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace lists {
@@ -38,7 +39,7 @@ namespace detail {
 std::unique_ptr<cudf::column> make_lists_column_from_scalar(list_scalar const& value,
                                                             size_type size,
                                                             rmm::cuda_stream_view stream,
-                                                            rmm::mr::device_memory_resource* mr);
+                                                            rmm::device_async_resource_ref mr);
 
 /**
  * @brief Create an empty lists column.
@@ -51,7 +52,7 @@ std::unique_ptr<cudf::column> make_lists_column_from_scalar(list_scalar const& v
  */
 std::unique_ptr<column> make_empty_lists_column(data_type child_type,
                                                 rmm::cuda_stream_view stream,
-                                                rmm::mr::device_memory_resource* mr);
+                                                rmm::device_async_resource_ref mr);
 
 /**
  * @brief Create a lists column with all null rows.
@@ -64,7 +65,7 @@ std::unique_ptr<column> make_empty_lists_column(data_type child_type,
 std::unique_ptr<column> make_all_nulls_lists_column(size_type size,
                                                     data_type child_type,
                                                     rmm::cuda_stream_view stream,
-                                                    rmm::mr::device_memory_resource* mr);
+                                                    rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace lists
diff --git a/cpp/include/cudf/lists/detail/reverse.hpp b/cpp/include/cudf/lists/detail/reverse.hpp
index 6e3b952a3b0..d099a0708b9 100644
--- a/cpp/include/cudf/lists/detail/reverse.hpp
+++ b/cpp/include/cudf/lists/detail/reverse.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,6 +17,8 @@
 
 #include <cudf/lists/reverse.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 namespace cudf::lists::detail {
 
 /**
@@ -25,6 +27,6 @@ namespace cudf::lists::detail {
  */
 std::unique_ptr<column> reverse(lists_column_view const& input,
                                 rmm::cuda_stream_view stream,
-                                rmm::mr::device_memory_resource* mr);
+                                rmm::device_async_resource_ref mr);
 
 }  // namespace cudf::lists::detail
diff --git a/cpp/include/cudf/lists/detail/scatter.cuh b/cpp/include/cudf/lists/detail/scatter.cuh
index 5fc52ff1c04..d0d5b1ad823 100644
--- a/cpp/include/cudf/lists/detail/scatter.cuh
+++ b/cpp/include/cudf/lists/detail/scatter.cuh
@@ -30,6 +30,7 @@
 
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/distance.h>
@@ -53,7 +54,7 @@ rmm::device_uvector<unbound_list_view> list_vector_from_column(
   IndexIterator index_begin,
   IndexIterator index_end,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   auto n_rows = thrust::distance(index_begin, index_end);
 
@@ -98,7 +99,7 @@ std::unique_ptr<column> scatter_impl(rmm::device_uvector<unbound_list_view> cons
                                      column_view const& source,
                                      column_view const& target,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(column_types_equal(source, target), "Mismatched column types.");
 
@@ -177,7 +178,7 @@ std::unique_ptr<column> scatter(column_view const& source,
                                 MapIterator scatter_map_end,
                                 column_view const& target,
                                 rmm::cuda_stream_view stream,
-                                rmm::mr::device_memory_resource* mr)
+                                rmm::device_async_resource_ref mr)
 {
   auto const num_rows = target.size();
   if (num_rows == 0) { return cudf::empty_like(target); }
@@ -233,7 +234,7 @@ std::unique_ptr<column> scatter(scalar const& slr,
                                 MapIterator scatter_map_end,
                                 column_view const& target,
                                 rmm::cuda_stream_view stream,
-                                rmm::mr::device_memory_resource* mr)
+                                rmm::device_async_resource_ref mr)
 {
   auto const num_rows = target.size();
   if (num_rows == 0) { return cudf::empty_like(target); }
diff --git a/cpp/include/cudf/lists/detail/scatter_helper.cuh b/cpp/include/cudf/lists/detail/scatter_helper.cuh
index 605f76871b5..fc44e0bc290 100644
--- a/cpp/include/cudf/lists/detail/scatter_helper.cuh
+++ b/cpp/include/cudf/lists/detail/scatter_helper.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,6 +23,7 @@
 
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <memory>
 
@@ -136,7 +137,7 @@ std::unique_ptr<column> build_lists_child_column_recursive(
   cudf::lists_column_view const& source_lists_column_view,
   cudf::lists_column_view const& target_lists_column_view,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr);
+  rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace lists
diff --git a/cpp/include/cudf/lists/detail/set_operations.hpp b/cpp/include/cudf/lists/detail/set_operations.hpp
index 51fc58bee07..8746b1ba62a 100644
--- a/cpp/include/cudf/lists/detail/set_operations.hpp
+++ b/cpp/include/cudf/lists/detail/set_operations.hpp
@@ -22,6 +22,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf::lists::detail {
 
@@ -35,7 +36,7 @@ std::unique_ptr<column> have_overlap(lists_column_view const& lhs,
                                      null_equality nulls_equal,
                                      nan_equality nans_equal,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr);
+                                     rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::list::intersect_distinct
@@ -47,7 +48,7 @@ std::unique_ptr<column> intersect_distinct(lists_column_view const& lhs,
                                            null_equality nulls_equal,
                                            nan_equality nans_equal,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr);
+                                           rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::list::union_distinct
@@ -59,7 +60,7 @@ std::unique_ptr<column> union_distinct(lists_column_view const& lhs,
                                        null_equality nulls_equal,
                                        nan_equality nans_equal,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr);
+                                       rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::list::difference_distinct
@@ -71,7 +72,7 @@ std::unique_ptr<column> difference_distinct(lists_column_view const& lhs,
                                             null_equality nulls_equal,
                                             nan_equality nans_equal,
                                             rmm::cuda_stream_view stream,
-                                            rmm::mr::device_memory_resource* mr);
+                                            rmm::device_async_resource_ref mr);
 
 /** @} */  // end of group
 }  // namespace cudf::lists::detail
diff --git a/cpp/include/cudf/lists/detail/sorting.hpp b/cpp/include/cudf/lists/detail/sorting.hpp
index c378ca8cf06..e428ea84ce6 100644
--- a/cpp/include/cudf/lists/detail/sorting.hpp
+++ b/cpp/include/cudf/lists/detail/sorting.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,6 +18,7 @@
 #include <cudf/lists/lists_column_view.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace lists {
@@ -32,7 +33,7 @@ std::unique_ptr<column> sort_lists(lists_column_view const& input,
                                    order column_order,
                                    null_order null_precedence,
                                    rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr);
+                                   rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::lists::stable_sort_lists
@@ -43,7 +44,7 @@ std::unique_ptr<column> stable_sort_lists(lists_column_view const& input,
                                           order column_order,
                                           null_order null_precedence,
                                           rmm::cuda_stream_view stream,
-                                          rmm::mr::device_memory_resource* mr);
+                                          rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace lists
diff --git a/cpp/include/cudf/lists/detail/stream_compaction.hpp b/cpp/include/cudf/lists/detail/stream_compaction.hpp
index 7ab9cf9a343..f5e5b29bc8f 100644
--- a/cpp/include/cudf/lists/detail/stream_compaction.hpp
+++ b/cpp/include/cudf/lists/detail/stream_compaction.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,19 +19,20 @@
 #include <cudf/lists/lists_column_view.hpp>
 
 #include <rmm/mr/device/device_memory_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf::lists::detail {
 
 /**
  * @copydoc cudf::lists::apply_boolean_mask(lists_column_view const&, lists_column_view const&,
- * rmm::mr::device_memory_resource*)
+ * rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches
  */
 std::unique_ptr<column> apply_boolean_mask(lists_column_view const& input,
                                            lists_column_view const& boolean_mask,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr);
+                                           rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::list::distinct
@@ -42,6 +43,6 @@ std::unique_ptr<column> distinct(lists_column_view const& input,
                                  null_equality nulls_equal,
                                  nan_equality nans_equal,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr);
+                                 rmm::device_async_resource_ref mr);
 
 }  // namespace cudf::lists::detail
diff --git a/cpp/include/cudf/lists/explode.hpp b/cpp/include/cudf/lists/explode.hpp
index adf46805855..81d82dcfa09 100644
--- a/cpp/include/cudf/lists/explode.hpp
+++ b/cpp/include/cudf/lists/explode.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,7 @@
 #include <cudf/types.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <memory>
 
@@ -72,7 +73,7 @@ namespace cudf {
 std::unique_ptr<table> explode(
   table_view const& input_table,
   size_type explode_column_idx,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Explodes a list column's elements and includes a position column.
@@ -116,7 +117,7 @@ std::unique_ptr<table> explode(
 std::unique_ptr<table> explode_position(
   table_view const& input_table,
   size_type explode_column_idx,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Explodes a list column's elements retaining any null entries or empty lists inside.
@@ -158,7 +159,7 @@ std::unique_ptr<table> explode_position(
 std::unique_ptr<table> explode_outer(
   table_view const& input_table,
   size_type explode_column_idx,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Explodes a list column's elements retaining any null entries or empty lists and includes a
@@ -202,7 +203,7 @@ std::unique_ptr<table> explode_outer(
 std::unique_ptr<table> explode_outer_position(
   table_view const& input_table,
   size_type explode_column_idx,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
 
diff --git a/cpp/include/cudf/lists/extract.hpp b/cpp/include/cudf/lists/extract.hpp
index 14c0f59e17d..096d276fcfb 100644
--- a/cpp/include/cudf/lists/extract.hpp
+++ b/cpp/include/cudf/lists/extract.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
 #include <cudf/lists/lists_column_view.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace lists {
@@ -66,8 +67,8 @@ namespace lists {
 std::unique_ptr<column> extract_list_element(
   lists_column_view const& lists_column,
   size_type index,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Create a column where each row is a single element from the corresponding sublist
@@ -107,8 +108,8 @@ std::unique_ptr<column> extract_list_element(
 std::unique_ptr<column> extract_list_element(
   lists_column_view const& lists_column,
   column_view const& indices,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
 }  // namespace lists
diff --git a/cpp/include/cudf/lists/filling.hpp b/cpp/include/cudf/lists/filling.hpp
index 3730e16482d..1d840c76bf8 100644
--- a/cpp/include/cudf/lists/filling.hpp
+++ b/cpp/include/cudf/lists/filling.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <memory>
 
@@ -66,8 +67,8 @@ namespace cudf::lists {
 std::unique_ptr<column> sequences(
   column_view const& starts,
   column_view const& sizes,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Create a lists column in which each row contains a sequence of values specified by a tuple
@@ -108,8 +109,8 @@ std::unique_ptr<column> sequences(
   column_view const& starts,
   column_view const& steps,
   column_view const& sizes,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
 }  // namespace cudf::lists
diff --git a/cpp/include/cudf/lists/gather.hpp b/cpp/include/cudf/lists/gather.hpp
index 5e6ab6816e6..a0d79c05098 100644
--- a/cpp/include/cudf/lists/gather.hpp
+++ b/cpp/include/cudf/lists/gather.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,7 @@
 #include <cudf/lists/lists_column_view.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace lists {
@@ -73,9 +74,9 @@ namespace lists {
 std::unique_ptr<column> segmented_gather(
   lists_column_view const& source_column,
   lists_column_view const& gather_map_list,
-  out_of_bounds_policy bounds_policy  = out_of_bounds_policy::DONT_CHECK,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  out_of_bounds_policy bounds_policy = out_of_bounds_policy::DONT_CHECK,
+  rmm::cuda_stream_view stream       = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr  = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
 }  // namespace lists
diff --git a/cpp/include/cudf/lists/reverse.hpp b/cpp/include/cudf/lists/reverse.hpp
index 864cd796f72..34c40c5a3ba 100644
--- a/cpp/include/cudf/lists/reverse.hpp
+++ b/cpp/include/cudf/lists/reverse.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
 #include <cudf/lists/lists_column_view.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <memory>
 
@@ -48,8 +49,8 @@ namespace cudf::lists {
  */
 std::unique_ptr<column> reverse(
   lists_column_view const& input,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of doxygen group
 
diff --git a/cpp/include/cudf/lists/set_operations.hpp b/cpp/include/cudf/lists/set_operations.hpp
index 6fb8989f0bb..b8abfd62461 100644
--- a/cpp/include/cudf/lists/set_operations.hpp
+++ b/cpp/include/cudf/lists/set_operations.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,7 @@
 #include <cudf/types.hpp>
 
 #include <rmm/mr/device/device_memory_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf::lists {
 /**
@@ -59,10 +60,10 @@ namespace cudf::lists {
 std::unique_ptr<column> have_overlap(
   lists_column_view const& lhs,
   lists_column_view const& rhs,
-  null_equality nulls_equal           = null_equality::EQUAL,
-  nan_equality nans_equal             = nan_equality::ALL_EQUAL,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  null_equality nulls_equal         = null_equality::EQUAL,
+  nan_equality nans_equal           = nan_equality::ALL_EQUAL,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Create a lists column of distinct elements common to two input lists columns.
@@ -96,10 +97,10 @@ std::unique_ptr<column> have_overlap(
 std::unique_ptr<column> intersect_distinct(
   lists_column_view const& lhs,
   lists_column_view const& rhs,
-  null_equality nulls_equal           = null_equality::EQUAL,
-  nan_equality nans_equal             = nan_equality::ALL_EQUAL,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  null_equality nulls_equal         = null_equality::EQUAL,
+  nan_equality nans_equal           = nan_equality::ALL_EQUAL,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Create a lists column of distinct elements found in either of two input lists columns.
@@ -133,10 +134,10 @@ std::unique_ptr<column> intersect_distinct(
 std::unique_ptr<column> union_distinct(
   lists_column_view const& lhs,
   lists_column_view const& rhs,
-  null_equality nulls_equal           = null_equality::EQUAL,
-  nan_equality nans_equal             = nan_equality::ALL_EQUAL,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  null_equality nulls_equal         = null_equality::EQUAL,
+  nan_equality nans_equal           = nan_equality::ALL_EQUAL,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Create a lists column of distinct elements found only in the left input column.
@@ -170,10 +171,10 @@ std::unique_ptr<column> union_distinct(
 std::unique_ptr<column> difference_distinct(
   lists_column_view const& lhs,
   lists_column_view const& rhs,
-  null_equality nulls_equal           = null_equality::EQUAL,
-  nan_equality nans_equal             = nan_equality::ALL_EQUAL,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  null_equality nulls_equal         = null_equality::EQUAL,
+  nan_equality nans_equal           = nan_equality::ALL_EQUAL,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
 }  // namespace cudf::lists
diff --git a/cpp/include/cudf/lists/sorting.hpp b/cpp/include/cudf/lists/sorting.hpp
index 39a52c75a98..78cea191bc5 100644
--- a/cpp/include/cudf/lists/sorting.hpp
+++ b/cpp/include/cudf/lists/sorting.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
 #include <cudf/lists/lists_column_view.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace lists {
@@ -55,8 +56,8 @@ std::unique_ptr<column> sort_lists(
   lists_column_view const& source_column,
   order column_order,
   null_order null_precedence,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Segmented sort of the elements within a list in each row of a list column using stable
@@ -68,8 +69,8 @@ std::unique_ptr<column> stable_sort_lists(
   lists_column_view const& source_column,
   order column_order,
   null_order null_precedence,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
 }  // namespace lists
diff --git a/cpp/include/cudf/lists/stream_compaction.hpp b/cpp/include/cudf/lists/stream_compaction.hpp
index 3ac4f6861ec..31f09d37560 100644
--- a/cpp/include/cudf/lists/stream_compaction.hpp
+++ b/cpp/include/cudf/lists/stream_compaction.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
 
 #include <rmm/mr/device/device_memory_resource.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf::lists {
 
@@ -61,8 +62,8 @@ namespace cudf::lists {
 std::unique_ptr<column> apply_boolean_mask(
   lists_column_view const& input,
   lists_column_view const& boolean_mask,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Create a new list column without duplicate elements in each list.
@@ -86,10 +87,10 @@ std::unique_ptr<column> apply_boolean_mask(
  */
 std::unique_ptr<column> distinct(
   lists_column_view const& input,
-  null_equality nulls_equal           = null_equality::EQUAL,
-  nan_equality nans_equal             = nan_equality::ALL_EQUAL,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  null_equality nulls_equal         = null_equality::EQUAL,
+  nan_equality nans_equal           = nan_equality::ALL_EQUAL,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
 
diff --git a/cpp/include/cudf/merge.hpp b/cpp/include/cudf/merge.hpp
index 8886ec24bfe..29aa3ffe934 100644
--- a/cpp/include/cudf/merge.hpp
+++ b/cpp/include/cudf/merge.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
 #include <cudf/types.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <memory>
 #include <vector>
@@ -105,7 +106,7 @@ std::unique_ptr<cudf::table> merge(
   std::vector<cudf::size_type> const& key_cols,
   std::vector<cudf::order> const& column_order,
   std::vector<cudf::null_order> const& null_precedence = {},
-  rmm::mr::device_memory_resource* mr                  = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr                    = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
 }  // namespace cudf
diff --git a/cpp/include/cudf/null_mask.hpp b/cpp/include/cudf/null_mask.hpp
index 524296e60ca..9e375df140b 100644
--- a/cpp/include/cudf/null_mask.hpp
+++ b/cpp/include/cudf/null_mask.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,7 @@
 
 #include <rmm/device_buffer.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <vector>
 
@@ -89,8 +90,8 @@ size_type num_bitmask_words(size_type number_of_bits);
 rmm::device_buffer create_null_mask(
   size_type size,
   mask_state state,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Sets a pre-allocated bitmask buffer to a given state in the range
@@ -132,8 +133,8 @@ rmm::device_buffer copy_bitmask(
   bitmask_type const* mask,
   size_type begin_bit,
   size_type end_bit,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Copies `view`'s bitmask from the bits
@@ -149,8 +150,8 @@ rmm::device_buffer copy_bitmask(
  */
 rmm::device_buffer copy_bitmask(
   column_view const& view,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Performs bitwise AND of the bitmasks of columns of a table. Returns
@@ -166,8 +167,8 @@ rmm::device_buffer copy_bitmask(
  */
 std::pair<rmm::device_buffer, size_type> bitmask_and(
   table_view const& view,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Performs bitwise OR of the bitmasks of columns of a table. Returns
@@ -183,8 +184,8 @@ std::pair<rmm::device_buffer, size_type> bitmask_and(
  */
 std::pair<rmm::device_buffer, size_type> bitmask_or(
   table_view const& view,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Given a validity bitmask, counts the number of null elements (unset bits)
diff --git a/cpp/include/cudf/partitioning.hpp b/cpp/include/cudf/partitioning.hpp
index 7033aa500a2..9ed56297908 100644
--- a/cpp/include/cudf/partitioning.hpp
+++ b/cpp/include/cudf/partitioning.hpp
@@ -21,6 +21,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <memory>
 #include <vector>
@@ -78,7 +79,7 @@ std::pair<std::unique_ptr<table>, std::vector<size_type>> partition(
   table_view const& t,
   column_view const& partition_map,
   size_type num_partitions,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Partitions rows from the input table into multiple output tables.
@@ -104,10 +105,10 @@ std::pair<std::unique_ptr<table>, std::vector<size_type>> hash_partition(
   table_view const& input,
   std::vector<size_type> const& columns_to_hash,
   int num_partitions,
-  hash_id hash_function               = hash_id::HASH_MURMUR3,
-  uint32_t seed                       = DEFAULT_HASH_SEED,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  hash_id hash_function             = hash_id::HASH_MURMUR3,
+  uint32_t seed                     = DEFAULT_HASH_SEED,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Round-robin partition.
@@ -249,8 +250,8 @@ std::pair<std::unique_ptr<table>, std::vector<size_type>> hash_partition(
 std::pair<std::unique_ptr<cudf::table>, std::vector<cudf::size_type>> round_robin_partition(
   table_view const& input,
   cudf::size_type num_partitions,
-  cudf::size_type start_partition     = 0,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  cudf::size_type start_partition   = 0,
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
 }  // namespace cudf
diff --git a/cpp/include/cudf/quantiles.hpp b/cpp/include/cudf/quantiles.hpp
index 1f3c26fa077..a1c98ee4e9d 100644
--- a/cpp/include/cudf/quantiles.hpp
+++ b/cpp/include/cudf/quantiles.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@
 #include <cudf/types.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 /**
@@ -56,10 +57,10 @@ namespace cudf {
 std::unique_ptr<column> quantile(
   column_view const& input,
   std::vector<double> const& q,
-  interpolation interp                = interpolation::LINEAR,
-  column_view const& ordered_indices  = {},
-  bool exact                          = true,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  interpolation interp               = interpolation::LINEAR,
+  column_view const& ordered_indices = {},
+  bool exact                         = true,
+  rmm::device_async_resource_ref mr  = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns the rows of the input corresponding to the requested quantiles.
@@ -98,7 +99,7 @@ std::unique_ptr<table> quantiles(
   cudf::sorted is_input_sorted                   = sorted::NO,
   std::vector<order> const& column_order         = {},
   std::vector<null_order> const& null_precedence = {},
-  rmm::mr::device_memory_resource* mr            = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr              = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Calculate approximate percentiles on an input tdigest column.
@@ -125,7 +126,7 @@ std::unique_ptr<table> quantiles(
 std::unique_ptr<column> percentile_approx(
   tdigest::tdigest_column_view const& input,
   column_view const& percentiles,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
 }  // namespace cudf
diff --git a/cpp/include/cudf/reduction.hpp b/cpp/include/cudf/reduction.hpp
index 52aebeb55e5..5adf89d1706 100644
--- a/cpp/include/cudf/reduction.hpp
+++ b/cpp/include/cudf/reduction.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
 #include <cudf/scalar/scalar.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <optional>
 
@@ -81,7 +82,7 @@ std::unique_ptr<scalar> reduce(
   column_view const& col,
   reduce_aggregation const& agg,
   data_type output_dtype,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief  Computes the reduction of the values in all rows of a column with an initial value
@@ -103,7 +104,7 @@ std::unique_ptr<scalar> reduce(
   reduce_aggregation const& agg,
   data_type output_dtype,
   std::optional<std::reference_wrapper<scalar const>> init,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief  Compute reduction of each segment in the input column
@@ -153,7 +154,7 @@ std::unique_ptr<column> segmented_reduce(
   segmented_reduce_aggregation const& agg,
   data_type output_dtype,
   null_policy null_handling,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief  Compute reduction of each segment in the input column with an initial value. Only SUM,
@@ -178,7 +179,7 @@ std::unique_ptr<column> segmented_reduce(
   data_type output_dtype,
   null_policy null_handling,
   std::optional<std::reference_wrapper<scalar const>> init,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief  Computes the scan of a column.
@@ -201,8 +202,8 @@ std::unique_ptr<column> scan(
   column_view const& input,
   scan_aggregation const& agg,
   scan_type inclusive,
-  null_policy null_handling           = null_policy::EXCLUDE,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  null_policy null_handling         = null_policy::EXCLUDE,
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Determines the minimum and maximum values of a column.
@@ -215,7 +216,7 @@ std::unique_ptr<column> scan(
  */
 std::pair<std::unique_ptr<scalar>, std::unique_ptr<scalar>> minmax(
   column_view const& col,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
 
diff --git a/cpp/include/cudf/reduction/detail/histogram.hpp b/cpp/include/cudf/reduction/detail/histogram.hpp
index 97c711fda4e..f23c5a14e33 100644
--- a/cpp/include/cudf/reduction/detail/histogram.hpp
+++ b/cpp/include/cudf/reduction/detail/histogram.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <memory>
 #include <optional>
@@ -42,7 +43,7 @@ namespace cudf::reduction::detail {
 compute_row_frequencies(table_view const& input,
                         std::optional<column_view> const& partial_counts,
                         rmm::cuda_stream_view stream,
-                        rmm::mr::device_memory_resource* mr);
+                        rmm::device_async_resource_ref mr);
 
 /**
  * @brief Create an empty histogram column.
diff --git a/cpp/include/cudf/reduction/detail/reduction.cuh b/cpp/include/cudf/reduction/detail/reduction.cuh
index 9807d4cb4ea..7d1754d86f2 100644
--- a/cpp/include/cudf/reduction/detail/reduction.cuh
+++ b/cpp/include/cudf/reduction/detail/reduction.cuh
@@ -26,6 +26,7 @@
 #include <rmm/device_buffer.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cub/device/device_reduce.cuh>
 #include <thrust/for_each.h>
@@ -62,7 +63,7 @@ std::unique_ptr<scalar> reduce(InputIterator d_in,
                                op::simple_op<Op> op,
                                std::optional<OutputType> init,
                                rmm::cuda_stream_view stream,
-                               rmm::mr::device_memory_resource* mr)
+                               rmm::device_async_resource_ref mr)
 {
   auto const binary_op     = cudf::detail::cast_functor<OutputType>(op.get_binary_op());
   auto const initial_value = init.value_or(op.template get_identity<OutputType>());
@@ -105,7 +106,7 @@ std::unique_ptr<scalar> reduce(InputIterator d_in,
                                op::simple_op<Op> op,
                                std::optional<OutputType> init,
                                rmm::cuda_stream_view stream,
-                               rmm::mr::device_memory_resource* mr)
+                               rmm::device_async_resource_ref mr)
 {
   CUDF_FAIL(
     "This function should never be called. fixed_point reduce should always go through the reduce "
@@ -122,7 +123,7 @@ std::unique_ptr<scalar> reduce(InputIterator d_in,
                                op::simple_op<Op> op,
                                std::optional<OutputType> init,
                                rmm::cuda_stream_view stream,
-                               rmm::mr::device_memory_resource* mr)
+                               rmm::device_async_resource_ref mr)
 {
   auto const binary_op     = cudf::detail::cast_functor<OutputType>(op.get_binary_op());
   auto const initial_value = init.value_or(op.template get_identity<OutputType>());
@@ -188,7 +189,7 @@ std::unique_ptr<scalar> reduce(InputIterator d_in,
                                cudf::size_type valid_count,
                                cudf::size_type ddof,
                                rmm::cuda_stream_view stream,
-                               rmm::mr::device_memory_resource* mr)
+                               rmm::device_async_resource_ref mr)
 {
   auto const binary_op     = cudf::detail::cast_functor<IntermediateType>(op.get_binary_op());
   auto const initial_value = op.template get_identity<IntermediateType>();
diff --git a/cpp/include/cudf/reduction/detail/reduction.hpp b/cpp/include/cudf/reduction/detail/reduction.hpp
index 4cbfb82ae6b..78f90a1e2c9 100644
--- a/cpp/include/cudf/reduction/detail/reduction.hpp
+++ b/cpp/include/cudf/reduction/detail/reduction.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,13 +20,15 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/types.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 #include <optional>
 
 namespace cudf::reduction::detail {
 
 /**
  * @copydoc cudf::reduce(column_view const&, reduce_aggregation const&, data_type,
- * std::optional<std::reference_wrapper<scalar const>>, rmm::mr::device_memory_resource*)
+ * std::optional<std::reference_wrapper<scalar const>>, rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
@@ -35,6 +37,6 @@ std::unique_ptr<scalar> reduce(column_view const& col,
                                data_type output_dtype,
                                std::optional<std::reference_wrapper<scalar const>> init,
                                rmm::cuda_stream_view stream,
-                               rmm::mr::device_memory_resource* mr);
+                               rmm::device_async_resource_ref mr);
 
 }  // namespace cudf::reduction::detail
diff --git a/cpp/include/cudf/reduction/detail/reduction_functions.hpp b/cpp/include/cudf/reduction/detail/reduction_functions.hpp
index 704332c8e1d..31d465619b9 100644
--- a/cpp/include/cudf/reduction/detail/reduction_functions.hpp
+++ b/cpp/include/cudf/reduction/detail/reduction_functions.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <optional>
 
@@ -47,7 +48,7 @@ std::unique_ptr<scalar> sum(column_view const& col,
                             data_type const output_dtype,
                             std::optional<std::reference_wrapper<scalar const>> init,
                             rmm::cuda_stream_view stream,
-                            rmm::mr::device_memory_resource* mr);
+                            rmm::device_async_resource_ref mr);
 
 /**
  * @brief Computes minimum of elements in input column
@@ -67,7 +68,7 @@ std::unique_ptr<scalar> min(column_view const& col,
                             data_type const output_dtype,
                             std::optional<std::reference_wrapper<scalar const>> init,
                             rmm::cuda_stream_view stream,
-                            rmm::mr::device_memory_resource* mr);
+                            rmm::device_async_resource_ref mr);
 
 /**
  * @brief Computes maximum of elements in input column
@@ -87,7 +88,7 @@ std::unique_ptr<scalar> max(column_view const& col,
                             data_type const output_dtype,
                             std::optional<std::reference_wrapper<scalar const>> init,
                             rmm::cuda_stream_view stream,
-                            rmm::mr::device_memory_resource* mr);
+                            rmm::device_async_resource_ref mr);
 
 /**
  * @brief Computes any of elements in input column is true when typecasted to bool
@@ -108,7 +109,7 @@ std::unique_ptr<scalar> any(column_view const& col,
                             data_type const output_dtype,
                             std::optional<std::reference_wrapper<scalar const>> init,
                             rmm::cuda_stream_view stream,
-                            rmm::mr::device_memory_resource* mr);
+                            rmm::device_async_resource_ref mr);
 
 /**
  * @brief Computes all of elements in input column is true when typecasted to bool
@@ -129,7 +130,7 @@ std::unique_ptr<scalar> all(column_view const& col,
                             data_type const output_dtype,
                             std::optional<std::reference_wrapper<scalar const>> init,
                             rmm::cuda_stream_view stream,
-                            rmm::mr::device_memory_resource* mr);
+                            rmm::device_async_resource_ref mr);
 
 /**
  * @brief Compute frequency for each unique element in the input column.
@@ -144,7 +145,7 @@ std::unique_ptr<scalar> all(column_view const& col,
  */
 std::unique_ptr<scalar> histogram(column_view const& input,
                                   rmm::cuda_stream_view stream,
-                                  rmm::mr::device_memory_resource* mr);
+                                  rmm::device_async_resource_ref mr);
 
 /**
  * @brief Merge multiple histograms together.
@@ -156,7 +157,7 @@ std::unique_ptr<scalar> histogram(column_view const& input,
  */
 std::unique_ptr<scalar> merge_histogram(column_view const& input,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr);
+                                        rmm::device_async_resource_ref mr);
 
 /**
  * @brief Computes product of elements in input column
@@ -177,7 +178,7 @@ std::unique_ptr<scalar> product(column_view const& col,
                                 data_type const output_dtype,
                                 std::optional<std::reference_wrapper<scalar const>> init,
                                 rmm::cuda_stream_view stream,
-                                rmm::mr::device_memory_resource* mr);
+                                rmm::device_async_resource_ref mr);
 
 /**
  * @brief Computes sum of squares of elements in input column
@@ -196,7 +197,7 @@ std::unique_ptr<scalar> product(column_view const& col,
 std::unique_ptr<scalar> sum_of_squares(column_view const& col,
                                        data_type const output_dtype,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr);
+                                       rmm::device_async_resource_ref mr);
 
 /**
  * @brief Computes mean of elements in input column
@@ -215,7 +216,7 @@ std::unique_ptr<scalar> sum_of_squares(column_view const& col,
 std::unique_ptr<scalar> mean(column_view const& col,
                              data_type const output_dtype,
                              rmm::cuda_stream_view stream,
-                             rmm::mr::device_memory_resource* mr);
+                             rmm::device_async_resource_ref mr);
 
 /**
  * @brief Computes variance of elements in input column
@@ -237,7 +238,7 @@ std::unique_ptr<scalar> variance(column_view const& col,
                                  data_type const output_dtype,
                                  size_type ddof,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr);
+                                 rmm::device_async_resource_ref mr);
 
 /**
  * @brief Computes standard deviation of elements in input column
@@ -259,7 +260,7 @@ std::unique_ptr<scalar> standard_deviation(column_view const& col,
                                            data_type const output_dtype,
                                            size_type ddof,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr);
+                                           rmm::device_async_resource_ref mr);
 
 /**
  * @brief Returns nth element in input column
@@ -289,7 +290,7 @@ std::unique_ptr<scalar> nth_element(column_view const& col,
                                     size_type n,
                                     null_policy null_handling,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr);
+                                    rmm::device_async_resource_ref mr);
 
 /**
  * @brief Collect input column into a (list) scalar
@@ -303,7 +304,7 @@ std::unique_ptr<scalar> nth_element(column_view const& col,
 std::unique_ptr<scalar> collect_list(column_view const& col,
                                      null_policy null_handling,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr);
+                                     rmm::device_async_resource_ref mr);
 
 /**
  * @brief Merge a bunch of list scalars into single list scalar
@@ -315,7 +316,7 @@ std::unique_ptr<scalar> collect_list(column_view const& col,
  */
 std::unique_ptr<scalar> merge_lists(lists_column_view const& col,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr);
+                                    rmm::device_async_resource_ref mr);
 
 /**
  * @brief Collect input column into a (list) scalar without duplicated elements
@@ -333,7 +334,7 @@ std::unique_ptr<scalar> collect_set(column_view const& col,
                                     null_equality nulls_equal,
                                     nan_equality nans_equal,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr);
+                                    rmm::device_async_resource_ref mr);
 
 /**
  * @brief Merge a bunch of list scalars into single list scalar then drop duplicated elements
@@ -349,7 +350,7 @@ std::unique_ptr<scalar> merge_sets(lists_column_view const& col,
                                    null_equality nulls_equal,
                                    nan_equality nans_equal,
                                    rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr);
+                                   rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace reduction
diff --git a/cpp/include/cudf/reduction/detail/segmented_reduction_functions.hpp b/cpp/include/cudf/reduction/detail/segmented_reduction_functions.hpp
index 3902a7200a9..770ac6580ef 100644
--- a/cpp/include/cudf/reduction/detail/segmented_reduction_functions.hpp
+++ b/cpp/include/cudf/reduction/detail/segmented_reduction_functions.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <optional>
 
@@ -57,7 +58,7 @@ std::unique_ptr<column> segmented_sum(column_view const& col,
                                       null_policy null_handling,
                                       std::optional<std::reference_wrapper<scalar const>> init,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr);
+                                      rmm::device_async_resource_ref mr);
 
 /**
  * @brief Computes product of each segment in the input column
@@ -87,7 +88,7 @@ std::unique_ptr<column> segmented_product(column_view const& col,
                                           null_policy null_handling,
                                           std::optional<std::reference_wrapper<scalar const>> init,
                                           rmm::cuda_stream_view stream,
-                                          rmm::mr::device_memory_resource* mr);
+                                          rmm::device_async_resource_ref mr);
 
 /**
  * @brief Compute minimum of each segment in the input column
@@ -116,7 +117,7 @@ std::unique_ptr<column> segmented_min(column_view const& col,
                                       null_policy null_handling,
                                       std::optional<std::reference_wrapper<scalar const>> init,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr);
+                                      rmm::device_async_resource_ref mr);
 
 /**
  * @brief Compute maximum of each segment in the input column
@@ -145,7 +146,7 @@ std::unique_ptr<column> segmented_max(column_view const& col,
                                       null_policy null_handling,
                                       std::optional<std::reference_wrapper<scalar const>> init,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr);
+                                      rmm::device_async_resource_ref mr);
 
 /**
  * @brief Compute if any of the values in the segment are true when typecasted to bool
@@ -175,7 +176,7 @@ std::unique_ptr<column> segmented_any(column_view const& col,
                                       null_policy null_handling,
                                       std::optional<std::reference_wrapper<scalar const>> init,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr);
+                                      rmm::device_async_resource_ref mr);
 
 /**
  * @brief Compute if all of the values in the segment are true when typecasted to bool
@@ -205,7 +206,7 @@ std::unique_ptr<column> segmented_all(column_view const& col,
                                       null_policy null_handling,
                                       std::optional<std::reference_wrapper<scalar const>> init,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr);
+                                      rmm::device_async_resource_ref mr);
 
 /**
  * @brief Computes mean of elements of segments in the input column
@@ -233,7 +234,7 @@ std::unique_ptr<column> segmented_mean(column_view const& col,
                                        data_type const output_dtype,
                                        null_policy null_handling,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr);
+                                       rmm::device_async_resource_ref mr);
 
 /**
  * @brief Computes sum of squares of elements of segments in the input column
@@ -261,7 +262,7 @@ std::unique_ptr<column> segmented_sum_of_squares(column_view const& col,
                                                  data_type const output_dtype,
                                                  null_policy null_handling,
                                                  rmm::cuda_stream_view stream,
-                                                 rmm::mr::device_memory_resource* mr);
+                                                 rmm::device_async_resource_ref mr);
 
 /**
  * @brief Computes the standard deviation of elements of segments in the input column
@@ -292,7 +293,7 @@ std::unique_ptr<column> segmented_standard_deviation(column_view const& col,
                                                      null_policy null_handling,
                                                      size_type ddof,
                                                      rmm::cuda_stream_view stream,
-                                                     rmm::mr::device_memory_resource* mr);
+                                                     rmm::device_async_resource_ref mr);
 
 /**
  * @brief Computes the variance of elements of segments in the input column
@@ -323,7 +324,7 @@ std::unique_ptr<column> segmented_variance(column_view const& col,
                                            null_policy null_handling,
                                            size_type ddof,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr);
+                                           rmm::device_async_resource_ref mr);
 
 /**
  * @brief Counts the number of unique values within each segment of a column
@@ -351,7 +352,7 @@ std::unique_ptr<column> segmented_nunique(column_view const& col,
                                           device_span<size_type const> offsets,
                                           null_policy null_handling,
                                           rmm::cuda_stream_view stream,
-                                          rmm::mr::device_memory_resource* mr);
+                                          rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace reduction
diff --git a/cpp/include/cudf/replace.hpp b/cpp/include/cudf/replace.hpp
index 3405dc8b796..ae20e72f023 100644
--- a/cpp/include/cudf/replace.hpp
+++ b/cpp/include/cudf/replace.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <memory>
 
@@ -55,8 +56,8 @@ enum class replace_policy : bool { PRECEDING, FOLLOWING };
 std::unique_ptr<column> replace_nulls(
   column_view const& input,
   column_view const& replacement,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Replaces all null values in a column with a scalar.
@@ -74,8 +75,8 @@ std::unique_ptr<column> replace_nulls(
 std::unique_ptr<column> replace_nulls(
   column_view const& input,
   scalar const& replacement,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Replaces all null values in a column with the first non-null value that precedes/follows.
@@ -93,8 +94,8 @@ std::unique_ptr<column> replace_nulls(
 std::unique_ptr<column> replace_nulls(
   column_view const& input,
   replace_policy const& replace_policy,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Replaces all NaN values in a column with corresponding values from another column
@@ -121,8 +122,8 @@ std::unique_ptr<column> replace_nulls(
 std::unique_ptr<column> replace_nans(
   column_view const& input,
   column_view const& replacement,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Replaces all NaN values in a column with a scalar
@@ -148,8 +149,8 @@ std::unique_ptr<column> replace_nans(
 std::unique_ptr<column> replace_nans(
   column_view const& input,
   scalar const& replacement,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Return a copy of `input_col` replacing any `values_to_replace[i]`
@@ -167,8 +168,8 @@ std::unique_ptr<column> find_and_replace_all(
   column_view const& input_col,
   column_view const& values_to_replace,
   column_view const& replacement_values,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Replaces values less than `lo` in `input` with `lo_replace`,
@@ -222,8 +223,8 @@ std::unique_ptr<column> clamp(
   scalar const& lo_replace,
   scalar const& hi,
   scalar const& hi_replace,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Replaces values less than `lo` in `input` with `lo`,
@@ -268,8 +269,8 @@ std::unique_ptr<column> clamp(
   column_view const& input,
   scalar const& lo,
   scalar const& hi,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Copies from a column of floating-point elements and replaces `-NaN` and `-0.0` with `+NaN`
@@ -288,8 +289,8 @@ std::unique_ptr<column> clamp(
  */
 std::unique_ptr<column> normalize_nans_and_zeros(
   column_view const& input,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Modifies a column of floating-point elements to replace all `-NaN` and `-0.0` with `+NaN`
diff --git a/cpp/include/cudf/reshape.hpp b/cpp/include/cudf/reshape.hpp
index 42cfb890a31..26316be7fd4 100644
--- a/cpp/include/cudf/reshape.hpp
+++ b/cpp/include/cudf/reshape.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,7 @@
 #include <cudf/types.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <memory>
 
@@ -52,7 +53,7 @@ namespace cudf {
  */
 std::unique_ptr<column> interleave_columns(
   table_view const& input,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Repeats the rows from `input` table `count` times to form a new table.
@@ -75,7 +76,7 @@ std::unique_ptr<column> interleave_columns(
 std::unique_ptr<table> tile(
   table_view const& input,
   size_type count,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Configures whether byte casting flips endianness
@@ -100,7 +101,7 @@ enum class flip_endianness : bool { NO, YES };
 std::unique_ptr<column> byte_cast(
   column_view const& input_column,
   flip_endianness endian_configuration,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
 
diff --git a/cpp/include/cudf/rolling.hpp b/cpp/include/cudf/rolling.hpp
index ec93c709163..2cd34f48265 100644
--- a/cpp/include/cudf/rolling.hpp
+++ b/cpp/include/cudf/rolling.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
 #include <cudf/types.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <memory>
 
@@ -66,7 +67,7 @@ std::unique_ptr<column> rolling_window(
   size_type following_window,
   size_type min_periods,
   rolling_aggregation const& agg,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief  @copybrief rolling_window
@@ -76,7 +77,7 @@ std::unique_ptr<column> rolling_window(
  *            size_type following_window,
  *            size_type min_periods,
  *            rolling_aggregation const& agg,
- *            rmm::mr::device_memory_resource* mr)
+ *            rmm::device_async_resource_ref mr)
  *
  * @param default_outputs A column of per-row default values to be returned instead
  *                        of nulls. Used for LEAD()/LAG(), if the row offset crosses
@@ -89,7 +90,7 @@ std::unique_ptr<column> rolling_window(
   size_type following_window,
   size_type min_periods,
   rolling_aggregation const& agg,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Abstraction for window boundary sizes
@@ -237,7 +238,7 @@ std::unique_ptr<column> grouped_rolling_window(
   size_type following_window,
   size_type min_periods,
   rolling_aggregation const& aggr,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief  @copybrief grouped_rolling_window
@@ -248,7 +249,7 @@ std::unique_ptr<column> grouped_rolling_window(
  *            size_type following_window,
  *            size_type min_periods,
  *            rolling_aggregation const& aggr,
- *            rmm::mr::device_memory_resource* mr)
+ *            rmm::device_async_resource_ref mr)
  */
 std::unique_ptr<column> grouped_rolling_window(
   table_view const& group_keys,
@@ -257,7 +258,7 @@ std::unique_ptr<column> grouped_rolling_window(
   window_bounds following_window,
   size_type min_periods,
   rolling_aggregation const& aggr,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief  @copybrief grouped_rolling_window
@@ -268,7 +269,7 @@ std::unique_ptr<column> grouped_rolling_window(
  *            size_type following_window,
  *            size_type min_periods,
  *            rolling_aggregation const& aggr,
- *            rmm::mr::device_memory_resource* mr)
+ *            rmm::device_async_resource_ref mr)
  *
  * @param default_outputs A column of per-row default values to be returned instead
  *                        of nulls. Used for LEAD()/LAG(), if the row offset crosses
@@ -282,7 +283,7 @@ std::unique_ptr<column> grouped_rolling_window(
   size_type following_window,
   size_type min_periods,
   rolling_aggregation const& aggr,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief  @copybrief grouped_rolling_window
@@ -294,7 +295,7 @@ std::unique_ptr<column> grouped_rolling_window(
  *            size_type following_window,
  *            size_type min_periods,
  *            rolling_aggregation const& aggr,
- *            rmm::mr::device_memory_resource* mr)
+ *            rmm::device_async_resource_ref mr)
  */
 std::unique_ptr<column> grouped_rolling_window(
   table_view const& group_keys,
@@ -304,7 +305,7 @@ std::unique_ptr<column> grouped_rolling_window(
   window_bounds following_window,
   size_type min_periods,
   rolling_aggregation const& aggr,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief  Applies a grouping-aware, timestamp-based rolling window function to the values in a
@@ -399,7 +400,7 @@ std::unique_ptr<column> grouped_time_range_rolling_window(
   size_type following_window_in_days,
   size_type min_periods,
   rolling_aggregation const& aggr,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief  Applies a grouping-aware, timestamp-based rolling window function to the values in a
@@ -414,7 +415,7 @@ std::unique_ptr<column> grouped_time_range_rolling_window(
  *                size_type following_window_in_days,
  *                size_type min_periods,
  *                rolling_aggregation const& aggr,
- *                rmm::mr::device_memory_resource* mr)
+ *                rmm::device_async_resource_ref mr)
  *
  * The `preceding_window_in_days` and `following_window_in_days` are specified as a `window_bounds`
  * and supports "unbounded" windows, if set to `window_bounds::unbounded()`.
@@ -428,7 +429,7 @@ std::unique_ptr<column> grouped_time_range_rolling_window(
   window_bounds following_window_in_days,
   size_type min_periods,
   rolling_aggregation const& aggr,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief  Applies a grouping-aware, value range-based rolling window function to the values in a
@@ -548,7 +549,7 @@ std::unique_ptr<column> grouped_range_rolling_window(
   range_window_bounds const& following,
   size_type min_periods,
   rolling_aggregation const& aggr,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief  Applies a variable-size rolling window function to the values in a column.
@@ -591,7 +592,7 @@ std::unique_ptr<column> rolling_window(
   column_view const& following_window,
   size_type min_periods,
   rolling_aggregation const& agg,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
 }  // namespace cudf
diff --git a/cpp/include/cudf/round.hpp b/cpp/include/cudf/round.hpp
index ee088628b94..85935f8f05c 100644
--- a/cpp/include/cudf/round.hpp
+++ b/cpp/include/cudf/round.hpp
@@ -19,6 +19,7 @@
 #include <cudf/column/column.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 
@@ -72,9 +73,9 @@ enum class rounding_method : int32_t { HALF_UP, HALF_EVEN };
  */
 std::unique_ptr<column> round(
   column_view const& input,
-  int32_t decimal_places              = 0,
-  rounding_method method              = rounding_method::HALF_UP,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  int32_t decimal_places            = 0,
+  rounding_method method            = rounding_method::HALF_UP,
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
 }  // namespace cudf
diff --git a/cpp/include/cudf/scalar/scalar.hpp b/cpp/include/cudf/scalar/scalar.hpp
index 08bffab5067..da1d0d743a7 100644
--- a/cpp/include/cudf/scalar/scalar.hpp
+++ b/cpp/include/cudf/scalar/scalar.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -25,6 +25,7 @@
 #include <rmm/device_buffer.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 /**
  * @file
@@ -112,8 +113,8 @@ class scalar {
    * @param mr Device memory resource to use for device memory allocation.
    */
   scalar(scalar const& other,
-         rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-         rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+         rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+         rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
   /**
    * @brief Construct a new scalar object.
@@ -127,9 +128,9 @@ class scalar {
    * @param mr Device memory resource to use for device memory allocation.
    */
   scalar(data_type type,
-         bool is_valid                       = false,
-         rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-         rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+         bool is_valid                     = false,
+         rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+         rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 };
 
 namespace detail {
@@ -164,8 +165,8 @@ class fixed_width_scalar : public scalar {
    * @param mr Device memory resource to use for device memory allocation.
    */
   fixed_width_scalar(fixed_width_scalar const& other,
-                     rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-                     rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+                     rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+                     rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
   /**
    * @brief Set the value of the scalar.
@@ -214,9 +215,9 @@ class fixed_width_scalar : public scalar {
    * @param mr Device memory resource to use for device memory allocation.
    */
   fixed_width_scalar(T value,
-                     bool is_valid                       = true,
-                     rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-                     rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+                     bool is_valid                     = true,
+                     rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+                     rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
   /**
    * @brief Construct a new fixed width scalar object from existing device memory.
@@ -227,9 +228,9 @@ class fixed_width_scalar : public scalar {
    * @param mr Device memory resource to use for device memory allocation.
    */
   fixed_width_scalar(rmm::device_scalar<T>&& data,
-                     bool is_valid                       = true,
-                     rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-                     rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+                     bool is_valid                     = true,
+                     rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+                     rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 };
 
 }  // namespace detail
@@ -264,8 +265,8 @@ class numeric_scalar : public detail::fixed_width_scalar<T> {
    * @param mr Device memory resource to use for device memory allocation.
    */
   numeric_scalar(numeric_scalar const& other,
-                 rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-                 rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+                 rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+                 rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
   /**
    * @brief Construct a new numeric scalar object.
@@ -276,9 +277,9 @@ class numeric_scalar : public detail::fixed_width_scalar<T> {
    * @param mr Device memory resource to use for device memory allocation.
    */
   numeric_scalar(T value,
-                 bool is_valid                       = true,
-                 rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-                 rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+                 bool is_valid                     = true,
+                 rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+                 rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
   /**
    * @brief Construct a new numeric scalar object from existing device memory.
@@ -289,9 +290,9 @@ class numeric_scalar : public detail::fixed_width_scalar<T> {
    * @param mr Device memory resource to use for device memory allocation.
    */
   numeric_scalar(rmm::device_scalar<T>&& data,
-                 bool is_valid                       = true,
-                 rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-                 rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+                 bool is_valid                     = true,
+                 rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+                 rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 };
 
 /**
@@ -327,8 +328,8 @@ class fixed_point_scalar : public scalar {
    * @param mr Device memory resource to use for device memory allocation.
    */
   fixed_point_scalar(fixed_point_scalar const& other,
-                     rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-                     rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+                     rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+                     rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
   /**
    * @brief Construct a new fixed_point scalar object from already shifted value and scale.
@@ -341,9 +342,9 @@ class fixed_point_scalar : public scalar {
    */
   fixed_point_scalar(rep_type value,
                      numeric::scale_type scale,
-                     bool is_valid                       = true,
-                     rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-                     rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+                     bool is_valid                     = true,
+                     rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+                     rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
   /**
    * @brief Construct a new fixed_point scalar object from a value and default 0-scale.
@@ -354,9 +355,9 @@ class fixed_point_scalar : public scalar {
    * @param mr Device memory resource to use for device memory allocation.
    */
   fixed_point_scalar(rep_type value,
-                     bool is_valid                       = true,
-                     rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-                     rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+                     bool is_valid                     = true,
+                     rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+                     rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
   /**
    * @brief Construct a new fixed_point scalar object from a fixed_point number.
@@ -367,9 +368,9 @@ class fixed_point_scalar : public scalar {
    * @param mr Device memory resource to use for device memory allocation.
    */
   fixed_point_scalar(T value,
-                     bool is_valid                       = true,
-                     rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-                     rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+                     bool is_valid                     = true,
+                     rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+                     rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
   /**
    * @brief Construct a new fixed_point scalar object from existing device memory.
@@ -382,9 +383,9 @@ class fixed_point_scalar : public scalar {
    */
   fixed_point_scalar(rmm::device_scalar<rep_type>&& data,
                      numeric::scale_type scale,
-                     bool is_valid                       = true,
-                     rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-                     rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+                     bool is_valid                     = true,
+                     rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+                     rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
   /**
    * @brief Get the value of the scalar.
@@ -451,8 +452,8 @@ class string_scalar : public scalar {
    * @param mr Device memory resource to use for device memory allocation.
    */
   string_scalar(string_scalar const& other,
-                rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-                rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+                rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+                rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
   /**
    * @brief Construct a new string scalar object.
@@ -465,9 +466,9 @@ class string_scalar : public scalar {
    * @param mr Device memory resource to use for device memory allocation.
    */
   string_scalar(std::string const& string,
-                bool is_valid                       = true,
-                rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-                rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+                bool is_valid                     = true,
+                rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+                rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
   /**
    * @brief Construct a new string scalar object from string_view.
@@ -480,9 +481,9 @@ class string_scalar : public scalar {
    * @param mr Device memory resource to use for device memory allocation.
    */
   string_scalar(value_type const& source,
-                bool is_valid                       = true,
-                rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-                rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+                bool is_valid                     = true,
+                rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+                rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
   /**
    * @brief Construct a new string scalar object from string_view in device memory.
@@ -495,9 +496,9 @@ class string_scalar : public scalar {
    * @param mr Device memory resource to use for device memory allocation.
    */
   string_scalar(rmm::device_scalar<value_type>& data,
-                bool is_valid                       = true,
-                rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-                rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+                bool is_valid                     = true,
+                rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+                rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
   /**
    * @brief Construct a new string scalar object by moving an existing string data buffer.
@@ -511,9 +512,9 @@ class string_scalar : public scalar {
    * @param mr Device memory resource to use for device memory allocation.
    */
   string_scalar(rmm::device_buffer&& data,
-                bool is_valid                       = true,
-                rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-                rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+                bool is_valid                     = true,
+                rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+                rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
   /**
    * @brief Explicit conversion operator to get the value of the scalar in a host std::string.
@@ -584,8 +585,8 @@ class chrono_scalar : public detail::fixed_width_scalar<T> {
    * @param mr Device memory resource to use for device memory allocation.
    */
   chrono_scalar(chrono_scalar const& other,
-                rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-                rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+                rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+                rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
   /**
    * @brief Construct a new chrono scalar object.
@@ -596,9 +597,9 @@ class chrono_scalar : public detail::fixed_width_scalar<T> {
    * @param mr Device memory resource to use for device memory allocation.
    */
   chrono_scalar(T value,
-                bool is_valid                       = true,
-                rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-                rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+                bool is_valid                     = true,
+                rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+                rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
   /**
    * @brief Construct a new chrono scalar object from existing device memory.
@@ -609,9 +610,9 @@ class chrono_scalar : public detail::fixed_width_scalar<T> {
    * @param mr Device memory resource to use for device memory allocation.
    */
   chrono_scalar(rmm::device_scalar<T>&& data,
-                bool is_valid                       = true,
-                rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-                rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+                bool is_valid                     = true,
+                rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+                rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 };
 
 /**
@@ -643,8 +644,8 @@ class timestamp_scalar : public chrono_scalar<T> {
    * @param mr Device memory resource to use for device memory allocation.
    */
   timestamp_scalar(timestamp_scalar const& other,
-                   rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-                   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+                   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+                   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
   /**
    * @brief Construct a new timestamp scalar object from a duration that is
@@ -659,8 +660,8 @@ class timestamp_scalar : public chrono_scalar<T> {
   template <typename Duration2>
   timestamp_scalar(Duration2 const& value,
                    bool is_valid,
-                   rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-                   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+                   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+                   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
   /**
    * @brief Returns the duration in number of ticks since the UNIX epoch.
@@ -699,8 +700,8 @@ class duration_scalar : public chrono_scalar<T> {
    * @param mr Device memory resource to use for device memory allocation.
    */
   duration_scalar(duration_scalar const& other,
-                  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-                  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+                  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+                  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
   /**
    * @brief Construct a new duration scalar object from tick counts.
@@ -712,8 +713,8 @@ class duration_scalar : public chrono_scalar<T> {
    */
   duration_scalar(rep_type value,
                   bool is_valid,
-                  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-                  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+                  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+                  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
   /**
    * @brief Returns the duration in number of ticks.
@@ -748,8 +749,8 @@ class list_scalar : public scalar {
    * @param mr Device memory resource to use for device memory allocation.
    */
   list_scalar(list_scalar const& other,
-              rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-              rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+              rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+              rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
   /**
    * @brief Construct a new list scalar object from column_view.
@@ -762,9 +763,9 @@ class list_scalar : public scalar {
    * @param mr Device memory resource to use for device memory allocation.
    */
   list_scalar(cudf::column_view const& data,
-              bool is_valid                       = true,
-              rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-              rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+              bool is_valid                     = true,
+              rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+              rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
   /**
    * @brief Construct a new list scalar object from existing column.
@@ -775,9 +776,9 @@ class list_scalar : public scalar {
    * @param mr Device memory resource to use for device memory allocation.
    */
   list_scalar(cudf::column&& data,
-              bool is_valid                       = true,
-              rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-              rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+              bool is_valid                     = true,
+              rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+              rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
   /**
    * @brief Returns a non-owning, immutable view to underlying device data.
@@ -813,8 +814,8 @@ class struct_scalar : public scalar {
    * @param mr Device memory resource to use for device memory allocation.
    */
   struct_scalar(struct_scalar const& other,
-                rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-                rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+                rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+                rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
   /**
    * @brief Construct a new struct scalar object from table_view.
@@ -827,9 +828,9 @@ class struct_scalar : public scalar {
    * @param mr Device memory resource to use for device memory allocation.
    */
   struct_scalar(table_view const& data,
-                bool is_valid                       = true,
-                rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-                rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+                bool is_valid                     = true,
+                rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+                rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
   /**
    * @brief Construct a new struct scalar object from a host_span of column_views.
@@ -842,9 +843,9 @@ class struct_scalar : public scalar {
    * @param mr Device memory resource to use for device memory allocation.
    */
   struct_scalar(host_span<column_view const> data,
-                bool is_valid                       = true,
-                rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-                rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+                bool is_valid                     = true,
+                rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+                rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
   /**
    * @brief Construct a new struct scalar object from an existing table in device memory.
@@ -858,9 +859,9 @@ class struct_scalar : public scalar {
    * @param mr Device memory resource to use for device memory allocation.
    */
   struct_scalar(table&& data,
-                bool is_valid                       = true,
-                rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-                rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+                bool is_valid                     = true,
+                rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+                rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
   /**
    * @brief Returns a non-owning, immutable view to underlying device data.
@@ -888,7 +889,7 @@ class struct_scalar : public scalar {
   static table init_data(table&& data,
                          bool is_valid,
                          rmm::cuda_stream_view stream,
-                         rmm::mr::device_memory_resource* mr);
+                         rmm::device_async_resource_ref mr);
 };
 
 /** @} */  // end of group
diff --git a/cpp/include/cudf/scalar/scalar_factories.hpp b/cpp/include/cudf/scalar/scalar_factories.hpp
index 78b6c4fd0e9..7dd4674a2fd 100644
--- a/cpp/include/cudf/scalar/scalar_factories.hpp
+++ b/cpp/include/cudf/scalar/scalar_factories.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 /**
@@ -43,8 +44,8 @@ namespace cudf {
  */
 std::unique_ptr<scalar> make_numeric_scalar(
   data_type type,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Construct scalar with uninitialized storage to hold a value of the
@@ -60,8 +61,8 @@ std::unique_ptr<scalar> make_numeric_scalar(
  */
 std::unique_ptr<scalar> make_timestamp_scalar(
   data_type type,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Construct scalar with uninitialized storage to hold a value of the
@@ -77,8 +78,8 @@ std::unique_ptr<scalar> make_timestamp_scalar(
  */
 std::unique_ptr<scalar> make_duration_scalar(
   data_type type,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Construct scalar with uninitialized storage to hold a value of the
@@ -94,8 +95,8 @@ std::unique_ptr<scalar> make_duration_scalar(
  */
 std::unique_ptr<scalar> make_fixed_width_scalar(
   data_type type,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Construct STRING type scalar given a `std::string`.
@@ -111,8 +112,8 @@ std::unique_ptr<scalar> make_fixed_width_scalar(
  */
 std::unique_ptr<scalar> make_string_scalar(
   std::string const& string,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Constructs default constructed scalar of type `type`
@@ -126,8 +127,8 @@ std::unique_ptr<scalar> make_string_scalar(
  */
 std::unique_ptr<scalar> make_default_constructed_scalar(
   data_type type,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Creates an empty (invalid) scalar of the same type as the `input` column_view.
@@ -141,8 +142,8 @@ std::unique_ptr<scalar> make_default_constructed_scalar(
  */
 std::unique_ptr<scalar> make_empty_scalar_like(
   column_view const& input,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Construct scalar using the given value of fixed width type
@@ -156,8 +157,8 @@ std::unique_ptr<scalar> make_empty_scalar_like(
 template <typename T>
 std::unique_ptr<scalar> make_fixed_width_scalar(
   T value,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
 {
   return std::make_unique<scalar_type_t<T>>(value, true, stream, mr);
 }
@@ -176,8 +177,8 @@ template <typename T>
 std::unique_ptr<scalar> make_fixed_point_scalar(
   typename T::rep value,
   numeric::scale_type scale,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
 {
   return std::make_unique<scalar_type_t<T>>(value, scale, true, stream, mr);
 }
@@ -192,8 +193,8 @@ std::unique_ptr<scalar> make_fixed_point_scalar(
  */
 std::unique_ptr<scalar> make_list_scalar(
   column_view elements,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Construct a struct scalar using the given table_view.
@@ -207,8 +208,8 @@ std::unique_ptr<scalar> make_list_scalar(
  */
 std::unique_ptr<scalar> make_struct_scalar(
   table_view const& data,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Construct a struct scalar using the given span of column views.
@@ -222,8 +223,8 @@ std::unique_ptr<scalar> make_struct_scalar(
  */
 std::unique_ptr<scalar> make_struct_scalar(
   host_span<column_view const> data,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
 }  // namespace cudf
diff --git a/cpp/include/cudf/search.hpp b/cpp/include/cudf/search.hpp
index 49acce6a63b..2e50ba2d687 100644
--- a/cpp/include/cudf/search.hpp
+++ b/cpp/include/cudf/search.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@
 #include <cudf/types.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <vector>
 
@@ -72,8 +73,8 @@ std::unique_ptr<column> lower_bound(
   table_view const& needles,
   std::vector<order> const& column_order,
   std::vector<null_order> const& null_precedence,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Find largest indices in a sorted table where values should be inserted to maintain order.
@@ -114,8 +115,8 @@ std::unique_ptr<column> upper_bound(
   table_view const& needles,
   std::vector<order> const& column_order,
   std::vector<null_order> const& null_precedence,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Check if the given `needle` value exists in the `haystack` column.
@@ -163,8 +164,8 @@ bool contains(column_view const& haystack,
 std::unique_ptr<column> contains(
   column_view const& haystack,
   column_view const& needles,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
 }  // namespace cudf
diff --git a/cpp/include/cudf/sorting.hpp b/cpp/include/cudf/sorting.hpp
index 42bcb5da8e3..79a00cbce42 100644
--- a/cpp/include/cudf/sorting.hpp
+++ b/cpp/include/cudf/sorting.hpp
@@ -21,6 +21,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <memory>
 #include <vector>
@@ -54,7 +55,7 @@ std::unique_ptr<column> sorted_order(
   std::vector<order> const& column_order         = {},
   std::vector<null_order> const& null_precedence = {},
   rmm::cuda_stream_view stream                   = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr            = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr              = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Computes the row indices that would produce `input` in a stable
@@ -69,7 +70,7 @@ std::unique_ptr<column> stable_sorted_order(
   std::vector<order> const& column_order         = {},
   std::vector<null_order> const& null_precedence = {},
   rmm::cuda_stream_view stream                   = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr            = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr              = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Checks whether the rows of a `table` are sorted in a lexicographical
@@ -113,7 +114,7 @@ std::unique_ptr<table> sort(
   std::vector<order> const& column_order         = {},
   std::vector<null_order> const& null_precedence = {},
   rmm::cuda_stream_view stream                   = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr            = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr              = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Performs a stable lexicographic sort of the rows of a table
@@ -125,7 +126,7 @@ std::unique_ptr<table> stable_sort(
   std::vector<order> const& column_order         = {},
   std::vector<null_order> const& null_precedence = {},
   rmm::cuda_stream_view stream                   = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr            = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr              = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Performs a key-value sort.
@@ -155,7 +156,7 @@ std::unique_ptr<table> sort_by_key(
   std::vector<order> const& column_order         = {},
   std::vector<null_order> const& null_precedence = {},
   rmm::cuda_stream_view stream                   = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr            = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr              = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Performs a key-value stable sort.
@@ -168,7 +169,7 @@ std::unique_ptr<table> stable_sort_by_key(
   std::vector<order> const& column_order         = {},
   std::vector<null_order> const& null_precedence = {},
   rmm::cuda_stream_view stream                   = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr            = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr              = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Computes the ranks of input column in sorted order.
@@ -207,8 +208,8 @@ std::unique_ptr<column> rank(
   null_policy null_handling,
   null_order null_precedence,
   bool percentage,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns sorted order after sorting each segment in the table.
@@ -259,7 +260,7 @@ std::unique_ptr<column> segmented_sorted_order(
   std::vector<order> const& column_order         = {},
   std::vector<null_order> const& null_precedence = {},
   rmm::cuda_stream_view stream                   = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr            = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr              = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns sorted order after stably sorting each segment in the table.
@@ -272,7 +273,7 @@ std::unique_ptr<column> stable_segmented_sorted_order(
   std::vector<order> const& column_order         = {},
   std::vector<null_order> const& null_precedence = {},
   rmm::cuda_stream_view stream                   = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr            = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr              = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Performs a lexicographic segmented sort of a table
@@ -328,7 +329,7 @@ std::unique_ptr<table> segmented_sort_by_key(
   std::vector<order> const& column_order         = {},
   std::vector<null_order> const& null_precedence = {},
   rmm::cuda_stream_view stream                   = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr            = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr              = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Performs a stably lexicographic segmented sort of a table
@@ -342,7 +343,7 @@ std::unique_ptr<table> stable_segmented_sort_by_key(
   std::vector<order> const& column_order         = {},
   std::vector<null_order> const& null_precedence = {},
   rmm::cuda_stream_view stream                   = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr            = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr              = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
 }  // namespace cudf
diff --git a/cpp/include/cudf/stream_compaction.hpp b/cpp/include/cudf/stream_compaction.hpp
index 3e7bdf13707..c386b3a22b4 100644
--- a/cpp/include/cudf/stream_compaction.hpp
+++ b/cpp/include/cudf/stream_compaction.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <memory>
 #include <vector>
@@ -73,7 +74,7 @@ std::unique_ptr<table> drop_nulls(
   table_view const& input,
   std::vector<size_type> const& keys,
   cudf::size_type keep_threshold,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Filters a table to remove null elements.
@@ -104,7 +105,7 @@ std::unique_ptr<table> drop_nulls(
 std::unique_ptr<table> drop_nulls(
   table_view const& input,
   std::vector<size_type> const& keys,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Filters a table to remove NANs with threshold count.
@@ -147,7 +148,7 @@ std::unique_ptr<table> drop_nans(
   table_view const& input,
   std::vector<size_type> const& keys,
   cudf::size_type keep_threshold,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Filters a table to remove NANs.
@@ -179,7 +180,7 @@ std::unique_ptr<table> drop_nans(
 std::unique_ptr<table> drop_nans(
   table_view const& input,
   std::vector<size_type> const& keys,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Filters `input` using `boolean_mask` of boolean values as a mask.
@@ -205,7 +206,7 @@ std::unique_ptr<table> drop_nans(
 std::unique_ptr<table> apply_boolean_mask(
   table_view const& input,
   column_view const& boolean_mask,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Choices for drop_duplicates API for retainment of duplicate rows
@@ -248,8 +249,8 @@ std::unique_ptr<table> unique(
   table_view const& input,
   std::vector<size_type> const& keys,
   duplicate_keep_option keep,
-  null_equality nulls_equal           = null_equality::EQUAL,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  null_equality nulls_equal         = null_equality::EQUAL,
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Create a new table without duplicate rows.
@@ -273,10 +274,10 @@ std::unique_ptr<table> unique(
 std::unique_ptr<table> distinct(
   table_view const& input,
   std::vector<size_type> const& keys,
-  duplicate_keep_option keep          = duplicate_keep_option::KEEP_ANY,
-  null_equality nulls_equal           = null_equality::EQUAL,
-  nan_equality nans_equal             = nan_equality::ALL_EQUAL,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  duplicate_keep_option keep        = duplicate_keep_option::KEEP_ANY,
+  null_equality nulls_equal         = null_equality::EQUAL,
+  nan_equality nans_equal           = nan_equality::ALL_EQUAL,
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Create a column of indices of all distinct rows in the input table.
@@ -294,11 +295,11 @@ std::unique_ptr<table> distinct(
  */
 std::unique_ptr<column> distinct_indices(
   table_view const& input,
-  duplicate_keep_option keep          = duplicate_keep_option::KEEP_ANY,
-  null_equality nulls_equal           = null_equality::EQUAL,
-  nan_equality nans_equal             = nan_equality::ALL_EQUAL,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  duplicate_keep_option keep        = duplicate_keep_option::KEEP_ANY,
+  null_equality nulls_equal         = null_equality::EQUAL,
+  nan_equality nans_equal           = nan_equality::ALL_EQUAL,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Create a new table without duplicate rows, preserving input order.
@@ -325,10 +326,10 @@ std::unique_ptr<column> distinct_indices(
 std::unique_ptr<table> stable_distinct(
   table_view const& input,
   std::vector<size_type> const& keys,
-  duplicate_keep_option keep          = duplicate_keep_option::KEEP_ANY,
-  null_equality nulls_equal           = null_equality::EQUAL,
-  nan_equality nans_equal             = nan_equality::ALL_EQUAL,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  duplicate_keep_option keep        = duplicate_keep_option::KEEP_ANY,
+  null_equality nulls_equal         = null_equality::EQUAL,
+  nan_equality nans_equal           = nan_equality::ALL_EQUAL,
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Count the number of consecutive groups of equivalent rows in a column.
diff --git a/cpp/include/cudf/strings/attributes.hpp b/cpp/include/cudf/strings/attributes.hpp
index 85086e44a26..26f906b3102 100644
--- a/cpp/include/cudf/strings/attributes.hpp
+++ b/cpp/include/cudf/strings/attributes.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
 #include <cudf/strings/strings_column_view.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 
@@ -47,7 +48,7 @@ namespace strings {
  */
 std::unique_ptr<column> count_characters(
   strings_column_view const& input,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns a column containing byte lengths
@@ -65,7 +66,7 @@ std::unique_ptr<column> count_characters(
  */
 std::unique_ptr<column> count_bytes(
   strings_column_view const& input,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Creates a numeric column with code point values (integers) for each
@@ -85,7 +86,7 @@ std::unique_ptr<column> count_bytes(
  */
 std::unique_ptr<column> code_points(
   strings_column_view const& input,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of strings_apis group
 
diff --git a/cpp/include/cudf/strings/capitalize.hpp b/cpp/include/cudf/strings/capitalize.hpp
index 57375e9ac6a..f8cbdc09748 100644
--- a/cpp/include/cudf/strings/capitalize.hpp
+++ b/cpp/include/cudf/strings/capitalize.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,7 @@
 #include <cudf/strings/strings_column_view.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace strings {
@@ -60,9 +61,9 @@ namespace strings {
  */
 std::unique_ptr<column> capitalize(
   strings_column_view const& input,
-  string_scalar const& delimiters     = string_scalar("", true, cudf::get_default_stream()),
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  string_scalar const& delimiters   = string_scalar("", true, cudf::get_default_stream()),
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Modifies first character of each word to upper-case and lower-cases the rest.
@@ -95,7 +96,7 @@ std::unique_ptr<column> title(
   strings_column_view const& input,
   string_character_types sequence_type = string_character_types::ALPHA,
   rmm::cuda_stream_view stream         = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr  = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr    = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Checks if the strings in the input column are title formatted.
@@ -123,8 +124,8 @@ std::unique_ptr<column> title(
  */
 std::unique_ptr<column> is_title(
   strings_column_view const& input,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/case.hpp b/cpp/include/cudf/strings/case.hpp
index 94191686a92..5403fa8db7e 100644
--- a/cpp/include/cudf/strings/case.hpp
+++ b/cpp/include/cudf/strings/case.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
 #include <cudf/strings/strings_column_view.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace strings {
@@ -44,8 +45,8 @@ namespace strings {
  */
 std::unique_ptr<column> to_lower(
   strings_column_view const& strings,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Converts a column of strings to upper case.
@@ -63,8 +64,8 @@ std::unique_ptr<column> to_lower(
  */
 std::unique_ptr<column> to_upper(
   strings_column_view const& strings,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns a column of strings converting lower case characters to
@@ -83,8 +84,8 @@ std::unique_ptr<column> to_upper(
  */
 std::unique_ptr<column> swapcase(
   strings_column_view const& strings,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/char_types/char_types.hpp b/cpp/include/cudf/strings/char_types/char_types.hpp
index c6db5dab08a..da7a238a400 100644
--- a/cpp/include/cudf/strings/char_types/char_types.hpp
+++ b/cpp/include/cudf/strings/char_types/char_types.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,7 @@
 #include <cudf/strings/strings_column_view.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace strings {
@@ -67,7 +68,7 @@ std::unique_ptr<column> all_characters_of_type(
   string_character_types types,
   string_character_types verify_types = string_character_types::ALL_TYPES,
   rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr   = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Filter specific character types from a column of strings.
@@ -114,7 +115,7 @@ std::unique_ptr<column> filter_characters_of_type(
   string_scalar const& replacement     = string_scalar(""),
   string_character_types types_to_keep = string_character_types::ALL_TYPES,
   rmm::cuda_stream_view stream         = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr  = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr    = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/combine.hpp b/cpp/include/cudf/strings/combine.hpp
index 568e8ac50ec..8cc735831b8 100644
--- a/cpp/include/cudf/strings/combine.hpp
+++ b/cpp/include/cudf/strings/combine.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@
 #include <cudf/table/table_view.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace strings {
@@ -77,10 +78,10 @@ enum class output_if_empty_list {
  */
 std::unique_ptr<column> join_strings(
   strings_column_view const& input,
-  string_scalar const& separator      = string_scalar(""),
-  string_scalar const& narep          = string_scalar("", false),
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  string_scalar const& separator    = string_scalar(""),
+  string_scalar const& narep        = string_scalar("", false),
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Concatenates a list of strings columns using separators for each row
@@ -148,7 +149,7 @@ std::unique_ptr<column> concatenate(
   string_scalar const& col_narep       = string_scalar("", false),
   separator_on_nulls separate_nulls    = separator_on_nulls::YES,
   rmm::cuda_stream_view stream         = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr  = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr    = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Row-wise concatenates the given list of strings columns and
@@ -199,11 +200,11 @@ std::unique_ptr<column> concatenate(
  */
 std::unique_ptr<column> concatenate(
   table_view const& strings_columns,
-  string_scalar const& separator      = string_scalar(""),
-  string_scalar const& narep          = string_scalar("", false),
-  separator_on_nulls separate_nulls   = separator_on_nulls::YES,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  string_scalar const& separator    = string_scalar(""),
+  string_scalar const& narep        = string_scalar("", false),
+  separator_on_nulls separate_nulls = separator_on_nulls::YES,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Given a lists column of strings (each row is a list of strings), concatenates the strings
@@ -270,7 +271,7 @@ std::unique_ptr<column> join_list_elements(
   separator_on_nulls separate_nulls      = separator_on_nulls::YES,
   output_if_empty_list empty_list_policy = output_if_empty_list::EMPTY_STRING,
   rmm::cuda_stream_view stream           = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr    = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr      = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Given a lists column of strings (each row is a list of strings), concatenates the strings
@@ -329,7 +330,7 @@ std::unique_ptr<column> join_list_elements(
   separator_on_nulls separate_nulls      = separator_on_nulls::YES,
   output_if_empty_list empty_list_policy = output_if_empty_list::EMPTY_STRING,
   rmm::cuda_stream_view stream           = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr    = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr      = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/contains.hpp b/cpp/include/cudf/strings/contains.hpp
index 341c146df92..f79a0f19e9c 100644
--- a/cpp/include/cudf/strings/contains.hpp
+++ b/cpp/include/cudf/strings/contains.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,7 @@
 #include <cudf/strings/strings_column_view.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace strings {
@@ -59,8 +60,8 @@ struct regex_program;
 std::unique_ptr<column> contains_re(
   strings_column_view const& input,
   regex_program const& prog,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns a boolean column identifying rows which
@@ -87,8 +88,8 @@ std::unique_ptr<column> contains_re(
 std::unique_ptr<column> matches_re(
   strings_column_view const& input,
   regex_program const& prog,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns the number of times the given regex_program's pattern
@@ -115,8 +116,8 @@ std::unique_ptr<column> matches_re(
 std::unique_ptr<column> count_re(
   strings_column_view const& input,
   regex_program const& prog,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns a boolean column identifying rows which
@@ -163,7 +164,7 @@ std::unique_ptr<column> like(
   string_scalar const& pattern,
   string_scalar const& escape_character = string_scalar(""),
   rmm::cuda_stream_view stream          = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr   = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr     = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns a boolean column identifying rows which
@@ -204,7 +205,7 @@ std::unique_ptr<column> like(
   strings_column_view const& patterns,
   string_scalar const& escape_character = string_scalar(""),
   rmm::cuda_stream_view stream          = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr   = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr     = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/convert/convert_booleans.hpp b/cpp/include/cudf/strings/convert/convert_booleans.hpp
index 9e9f25e800a..9c922361914 100644
--- a/cpp/include/cudf/strings/convert/convert_booleans.hpp
+++ b/cpp/include/cudf/strings/convert/convert_booleans.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
 #include <cudf/strings/strings_column_view.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace strings {
@@ -44,8 +45,8 @@ namespace strings {
 std::unique_ptr<column> to_booleans(
   strings_column_view const& input,
   string_scalar const& true_string,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns a new strings column converting the boolean values from the
@@ -66,8 +67,8 @@ std::unique_ptr<column> from_booleans(
   column_view const& booleans,
   string_scalar const& true_string,
   string_scalar const& false_string,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/convert/convert_datetime.hpp b/cpp/include/cudf/strings/convert/convert_datetime.hpp
index 81cce14b53b..b89384d718b 100644
--- a/cpp/include/cudf/strings/convert/convert_datetime.hpp
+++ b/cpp/include/cudf/strings/convert/convert_datetime.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
 #include <cudf/strings/strings_column_view.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <string>
 #include <vector>
@@ -88,8 +89,8 @@ std::unique_ptr<column> to_timestamps(
   strings_column_view const& input,
   data_type timestamp_type,
   std::string_view format,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Verifies the given strings column can be parsed to timestamps using the provided format
@@ -135,8 +136,8 @@ std::unique_ptr<column> to_timestamps(
 std::unique_ptr<column> is_timestamp(
   strings_column_view const& input,
   std::string_view format,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns a new strings column converting a timestamp column into
@@ -246,11 +247,11 @@ std::unique_ptr<column> is_timestamp(
  */
 std::unique_ptr<column> from_timestamps(
   column_view const& timestamps,
-  std::string_view format             = "%Y-%m-%dT%H:%M:%SZ",
-  strings_column_view const& names    = strings_column_view(column_view{
+  std::string_view format           = "%Y-%m-%dT%H:%M:%SZ",
+  strings_column_view const& names  = strings_column_view(column_view{
     data_type{type_id::STRING}, 0, nullptr, nullptr, 0}),
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/convert/convert_durations.hpp b/cpp/include/cudf/strings/convert/convert_durations.hpp
index a1f4e4ead1d..2db719a4f1f 100644
--- a/cpp/include/cudf/strings/convert/convert_durations.hpp
+++ b/cpp/include/cudf/strings/convert/convert_durations.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
 #include <cudf/strings/strings_column_view.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace strings {
@@ -76,8 +77,8 @@ std::unique_ptr<column> to_durations(
   strings_column_view const& input,
   data_type duration_type,
   std::string_view format,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns a new strings column converting a duration column into
@@ -126,9 +127,9 @@ std::unique_ptr<column> to_durations(
  */
 std::unique_ptr<column> from_durations(
   column_view const& durations,
-  std::string_view format             = "%D days %H:%M:%S",
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  std::string_view format           = "%D days %H:%M:%S",
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/convert/convert_fixed_point.hpp b/cpp/include/cudf/strings/convert/convert_fixed_point.hpp
index 8f37715967a..9911bea1948 100644
--- a/cpp/include/cudf/strings/convert/convert_fixed_point.hpp
+++ b/cpp/include/cudf/strings/convert/convert_fixed_point.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
 #include <cudf/strings/strings_column_view.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace strings {
@@ -62,8 +63,8 @@ namespace strings {
 std::unique_ptr<column> to_fixed_point(
   strings_column_view const& input,
   data_type output_type,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns a new strings column converting the fixed-point values
@@ -92,8 +93,8 @@ std::unique_ptr<column> to_fixed_point(
  */
 std::unique_ptr<column> from_fixed_point(
   column_view const& input,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns a boolean column identifying strings in which all
@@ -123,9 +124,9 @@ std::unique_ptr<column> from_fixed_point(
  */
 std::unique_ptr<column> is_fixed_point(
   strings_column_view const& input,
-  data_type decimal_type              = data_type{type_id::DECIMAL64},
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  data_type decimal_type            = data_type{type_id::DECIMAL64},
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/convert/convert_floats.hpp b/cpp/include/cudf/strings/convert/convert_floats.hpp
index a35cb68ef4e..feb5b528686 100644
--- a/cpp/include/cudf/strings/convert/convert_floats.hpp
+++ b/cpp/include/cudf/strings/convert/convert_floats.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
 #include <cudf/strings/strings_column_view.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace strings {
@@ -48,8 +49,8 @@ namespace strings {
 std::unique_ptr<column> to_floats(
   strings_column_view const& strings,
   data_type output_type,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns a new strings column converting the float values from the
@@ -71,8 +72,8 @@ std::unique_ptr<column> to_floats(
  */
 std::unique_ptr<column> from_floats(
   column_view const& floats,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns a boolean column identifying strings in which all
@@ -97,8 +98,8 @@ std::unique_ptr<column> from_floats(
  */
 std::unique_ptr<column> is_float(
   strings_column_view const& input,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/convert/convert_integers.hpp b/cpp/include/cudf/strings/convert/convert_integers.hpp
index 74ec5d315a2..82696811fdc 100644
--- a/cpp/include/cudf/strings/convert/convert_integers.hpp
+++ b/cpp/include/cudf/strings/convert/convert_integers.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
 #include <cudf/strings/strings_column_view.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace strings {
@@ -55,8 +56,8 @@ namespace strings {
 std::unique_ptr<column> to_integers(
   strings_column_view const& input,
   data_type output_type,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns a new strings column converting the integer values from the
@@ -76,8 +77,8 @@ std::unique_ptr<column> to_integers(
  */
 std::unique_ptr<column> from_integers(
   column_view const& integers,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns a boolean column identifying strings in which all
@@ -105,8 +106,8 @@ std::unique_ptr<column> from_integers(
  */
 std::unique_ptr<column> is_integer(
   strings_column_view const& input,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns a boolean column identifying strings in which all
@@ -139,8 +140,8 @@ std::unique_ptr<column> is_integer(
 std::unique_ptr<column> is_integer(
   strings_column_view const& input,
   data_type int_type,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns a new integer numeric column parsing hexadecimal values from the
@@ -169,8 +170,8 @@ std::unique_ptr<column> is_integer(
 std::unique_ptr<column> hex_to_integers(
   strings_column_view const& input,
   data_type output_type,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns a boolean column identifying strings in which all
@@ -196,8 +197,8 @@ std::unique_ptr<column> hex_to_integers(
  */
 std::unique_ptr<column> is_hex(
   strings_column_view const& input,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns a new strings column converting integer columns to hexadecimal
@@ -229,8 +230,8 @@ std::unique_ptr<column> is_hex(
  */
 std::unique_ptr<column> integers_to_hex(
   column_view const& input,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/convert/convert_ipv4.hpp b/cpp/include/cudf/strings/convert/convert_ipv4.hpp
index 25ad7b86748..64f8a412ce9 100644
--- a/cpp/include/cudf/strings/convert/convert_ipv4.hpp
+++ b/cpp/include/cudf/strings/convert/convert_ipv4.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
 #include <cudf/strings/strings_column_view.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace strings {
@@ -55,8 +56,8 @@ namespace strings {
  */
 std::unique_ptr<column> ipv4_to_integers(
   strings_column_view const& input,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Converts integers into IPv4 addresses as strings.
@@ -80,8 +81,8 @@ std::unique_ptr<column> ipv4_to_integers(
  */
 std::unique_ptr<column> integers_to_ipv4(
   column_view const& integers,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns a boolean column identifying strings in which all
@@ -107,8 +108,8 @@ std::unique_ptr<column> integers_to_ipv4(
  */
 std::unique_ptr<column> is_ipv4(
   strings_column_view const& input,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/convert/convert_lists.hpp b/cpp/include/cudf/strings/convert/convert_lists.hpp
index dedf4e95138..a88bbe99492 100644
--- a/cpp/include/cudf/strings/convert/convert_lists.hpp
+++ b/cpp/include/cudf/strings/convert/convert_lists.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,7 @@
 #include <cudf/strings/strings_column_view.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace strings {
@@ -63,7 +64,7 @@ std::unique_ptr<column> format_list_column(
   strings_column_view const& separators = strings_column_view(column_view{
     data_type{type_id::STRING}, 0, nullptr, nullptr, 0}),
   rmm::cuda_stream_view stream          = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr   = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr     = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/convert/convert_urls.hpp b/cpp/include/cudf/strings/convert/convert_urls.hpp
index 902835081af..30988d2ff0a 100644
--- a/cpp/include/cudf/strings/convert/convert_urls.hpp
+++ b/cpp/include/cudf/strings/convert/convert_urls.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
 #include <cudf/strings/strings_column_view.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace strings {
@@ -46,8 +47,8 @@ namespace strings {
  */
 std::unique_ptr<column> url_encode(
   strings_column_view const& input,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Encodes each string using URL encoding.
@@ -69,8 +70,8 @@ std::unique_ptr<column> url_encode(
  */
 std::unique_ptr<column> url_decode(
   strings_column_view const& input,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/detail/combine.hpp b/cpp/include/cudf/strings/detail/combine.hpp
index 3b8ed0f4e0d..25214055787 100644
--- a/cpp/include/cudf/strings/detail/combine.hpp
+++ b/cpp/include/cudf/strings/detail/combine.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,6 +23,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace strings {
@@ -30,7 +31,7 @@ namespace detail {
 
 /**
  * @copydoc concatenate(table_view const&,string_scalar const&,string_scalar
- * const&,rmm::mr::device_memory_resource*)
+ * const&,rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
@@ -39,11 +40,11 @@ std::unique_ptr<column> concatenate(table_view const& strings_columns,
                                     string_scalar const& narep,
                                     separator_on_nulls separate_nulls,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr);
+                                    rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc join_strings(table_view const&,string_scalar const&,string_scalar
- * const&,rmm::mr::device_memory_resource*)
+ * const&,rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
@@ -51,11 +52,11 @@ std::unique_ptr<column> join_strings(strings_column_view const& strings,
                                      string_scalar const& separator,
                                      string_scalar const& narep,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr);
+                                     rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc join_list_elements(table_view const&,string_scalar const&,string_scalar
- * const&,separator_on_nulls,output_if_empty_list,rmm::mr::device_memory_resource*)
+ * const&,separator_on_nulls,output_if_empty_list,rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
@@ -65,7 +66,7 @@ std::unique_ptr<column> join_list_elements(lists_column_view const& lists_string
                                            separator_on_nulls separate_nulls,
                                            output_if_empty_list empty_list_policy,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr);
+                                           rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/detail/concatenate.hpp b/cpp/include/cudf/strings/detail/concatenate.hpp
index 511e240886a..b5dd5b9516a 100644
--- a/cpp/include/cudf/strings/detail/concatenate.hpp
+++ b/cpp/include/cudf/strings/detail/concatenate.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace strings {
@@ -44,7 +45,7 @@ namespace detail {
  */
 std::unique_ptr<column> concatenate(host_span<column_view const> columns,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr);
+                                    rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/detail/converters.hpp b/cpp/include/cudf/strings/detail/converters.hpp
index 3337815342c..d212239264b 100644
--- a/cpp/include/cudf/strings/detail/converters.hpp
+++ b/cpp/include/cudf/strings/detail/converters.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,63 +20,64 @@
 #include <cudf/strings/strings_column_view.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace strings {
 namespace detail {
 
 /**
- * @copydoc to_integers(strings_column_view const&,data_type,rmm::mr::device_memory_resource*)
+ * @copydoc to_integers(strings_column_view const&,data_type,rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
 std::unique_ptr<column> to_integers(strings_column_view const& strings,
                                     data_type output_type,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr);
+                                    rmm::device_async_resource_ref mr);
 
 /**
- * @copydoc from_integers(strings_column_view const&,rmm::mr::device_memory_resource*)
+ * @copydoc from_integers(strings_column_view const&,rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
 std::unique_ptr<column> from_integers(column_view const& integers,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr);
+                                      rmm::device_async_resource_ref mr);
 
 /**
- * @copydoc to_floats(strings_column_view const&,data_type,rmm::mr::device_memory_resource*)
+ * @copydoc to_floats(strings_column_view const&,data_type,rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
 std::unique_ptr<column> to_floats(strings_column_view const& strings,
                                   data_type output_type,
                                   rmm::cuda_stream_view stream,
-                                  rmm::mr::device_memory_resource* mr);
+                                  rmm::device_async_resource_ref mr);
 
 /**
- * @copydoc from_floats(strings_column_view const&,rmm::mr::device_memory_resource*)
+ * @copydoc from_floats(strings_column_view const&,rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
 std::unique_ptr<column> from_floats(column_view const& floats,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr);
+                                    rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc to_booleans(strings_column_view const&,string_scalar
- * const&,rmm::mr::device_memory_resource*)
+ * const&,rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
 std::unique_ptr<column> to_booleans(strings_column_view const& strings,
                                     string_scalar const& true_string,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr);
+                                    rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc from_booleans(strings_column_view const&,string_scalar const&,string_scalar
- * const&,rmm::mr::device_memory_resource*)
+ * const&,rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
@@ -84,11 +85,11 @@ std::unique_ptr<column> from_booleans(column_view const& booleans,
                                       string_scalar const& true_string,
                                       string_scalar const& false_string,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr);
+                                      rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc to_timestamps(strings_column_view const&,data_type,std::string_view,
- * rmm::mr::device_memory_resource*)
+ * rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
@@ -96,11 +97,11 @@ std::unique_ptr<cudf::column> to_timestamps(strings_column_view const& strings,
                                             data_type timestamp_type,
                                             std::string_view format,
                                             rmm::cuda_stream_view stream,
-                                            rmm::mr::device_memory_resource* mr);
+                                            rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc from_timestamps(strings_column_view const&,std::string_view,
- * strings_column_view const&,rmm::mr::device_memory_resource*)
+ * strings_column_view const&,rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
@@ -108,11 +109,11 @@ std::unique_ptr<column> from_timestamps(column_view const& timestamps,
                                         std::string_view format,
                                         strings_column_view const& names,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr);
+                                        rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc to_durations(strings_column_view const&,data_type,std::string_view,
- * rmm::mr::device_memory_resource*)
+ * rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
@@ -120,37 +121,37 @@ std::unique_ptr<column> to_durations(strings_column_view const& strings,
                                      data_type duration_type,
                                      std::string_view format,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr);
+                                     rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc from_durations(strings_column_view const&,std::string_view.
- * rmm::mr::device_memory_resource*)
+ * rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
 std::unique_ptr<column> from_durations(column_view const& durations,
                                        std::string_view format,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr);
+                                       rmm::device_async_resource_ref mr);
 
 /**
- * @copydoc to_fixed_point(strings_column_view const&,data_type,rmm::mr::device_memory_resource*)
+ * @copydoc to_fixed_point(strings_column_view const&,data_type,rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
 std::unique_ptr<column> to_fixed_point(strings_column_view const& strings,
                                        data_type output_type,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr);
+                                       rmm::device_async_resource_ref mr);
 
 /**
- * @copydoc from_fixed_point(strings_column_view const&,rmm::mr::device_memory_resource*)
+ * @copydoc from_fixed_point(strings_column_view const&,rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
 std::unique_ptr<column> from_fixed_point(column_view const& integers,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr);
+                                         rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/detail/copy_if_else.cuh b/cpp/include/cudf/strings/detail/copy_if_else.cuh
index 08ba99e90d8..4db7651330b 100644
--- a/cpp/include/cudf/strings/detail/copy_if_else.cuh
+++ b/cpp/include/cudf/strings/detail/copy_if_else.cuh
@@ -22,6 +22,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/iterator/counting_iterator.h>
@@ -59,7 +60,7 @@ std::unique_ptr<cudf::column> copy_if_else(StringIterLeft lhs_begin,
                                            StringIterRight rhs_begin,
                                            Filter filter_fn,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
 {
   auto strings_count = std::distance(lhs_begin, lhs_end);
   if (strings_count == 0) { return make_empty_column(type_id::STRING); }
diff --git a/cpp/include/cudf/strings/detail/copy_range.hpp b/cpp/include/cudf/strings/detail/copy_range.hpp
index e18f1fdc5ad..192c5b833c6 100644
--- a/cpp/include/cudf/strings/detail/copy_range.hpp
+++ b/cpp/include/cudf/strings/detail/copy_range.hpp
@@ -19,6 +19,7 @@
 #include <cudf/types.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace strings {
@@ -53,7 +54,7 @@ std::unique_ptr<column> copy_range(strings_column_view const& source,
                                    size_type source_end,
                                    size_type target_begin,
                                    rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr);
+                                   rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/detail/copying.hpp b/cpp/include/cudf/strings/detail/copying.hpp
index 7e82ad4c679..240cac17188 100644
--- a/cpp/include/cudf/strings/detail/copying.hpp
+++ b/cpp/include/cudf/strings/detail/copying.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace strings {
@@ -53,7 +54,7 @@ std::unique_ptr<cudf::column> copy_slice(strings_column_view const& strings,
                                          size_type start,
                                          size_type end,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr);
+                                         rmm::device_async_resource_ref mr);
 
 /**
  * @brief Returns a new strings column created by shifting the rows by a specified offset.
@@ -80,7 +81,7 @@ std::unique_ptr<column> shift(strings_column_view const& input,
                               size_type offset,
                               scalar const& fill_value,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr);
+                              rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/detail/fill.hpp b/cpp/include/cudf/strings/detail/fill.hpp
index 43e3f6198f3..c5d005fbf75 100644
--- a/cpp/include/cudf/strings/detail/fill.hpp
+++ b/cpp/include/cudf/strings/detail/fill.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace strings {
@@ -47,7 +48,7 @@ std::unique_ptr<column> fill(strings_column_view const& strings,
                              size_type end,
                              string_scalar const& value,
                              rmm::cuda_stream_view stream,
-                             rmm::mr::device_memory_resource* mr);
+                             rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh
index 06d959acffb..94bce6bddd5 100644
--- a/cpp/include/cudf/strings/detail/gather.cuh
+++ b/cpp/include/cudf/strings/detail/gather.cuh
@@ -27,6 +27,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/advance.h>
@@ -227,7 +228,7 @@ rmm::device_uvector<char> gather_chars(StringIterator strings_begin,
                                        cudf::detail::input_offsetalator const offsets,
                                        size_type chars_bytes,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr)
+                                       rmm::device_async_resource_ref mr)
 {
   auto const output_count = std::distance(map_begin, map_end);
   if (output_count == 0) return rmm::device_uvector<char>(0, stream, mr);
@@ -290,7 +291,7 @@ std::unique_ptr<cudf::column> gather(strings_column_view const& strings,
                                      MapIterator begin,
                                      MapIterator end,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
 {
   auto const output_count = std::distance(begin, end);
   if (output_count == 0) return make_empty_column(type_id::STRING);
@@ -354,7 +355,7 @@ std::unique_ptr<cudf::column> gather(strings_column_view const& strings,
                                      MapIterator end,
                                      bool nullify_out_of_bounds,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
 {
   if (nullify_out_of_bounds) return gather<true>(strings, begin, end, stream, mr);
   return gather<false>(strings, begin, end, stream, mr);
diff --git a/cpp/include/cudf/strings/detail/merge.cuh b/cpp/include/cudf/strings/detail/merge.cuh
index f05e957783f..457c2b7f740 100644
--- a/cpp/include/cudf/strings/detail/merge.cuh
+++ b/cpp/include/cudf/strings/detail/merge.cuh
@@ -26,6 +26,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/for_each.h>
@@ -56,7 +57,7 @@ std::unique_ptr<column> merge(strings_column_view const& lhs,
                               row_order_iterator begin,
                               row_order_iterator end,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr)
+                              rmm::device_async_resource_ref mr)
 {
   using cudf::detail::side;
   size_type strings_count = static_cast<size_type>(std::distance(begin, end));
diff --git a/cpp/include/cudf/strings/detail/replace.hpp b/cpp/include/cudf/strings/detail/replace.hpp
index 0f050f057fa..aad89beb47e 100644
--- a/cpp/include/cudf/strings/detail/replace.hpp
+++ b/cpp/include/cudf/strings/detail/replace.hpp
@@ -21,6 +21,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace strings {
@@ -28,24 +29,24 @@ namespace detail {
 
 /**
  * @copydoc cudf::strings::replace(strings_column_view const&, string_scalar const&,
- * string_scalar const&, int32_t, rmm::cuda_stream_view, rmm::mr::device_memory_resource*)
+ * string_scalar const&, int32_t, rmm::cuda_stream_view, rmm::device_async_resource_ref)
  */
 std::unique_ptr<column> replace(strings_column_view const& strings,
                                 string_scalar const& target,
                                 string_scalar const& repl,
                                 int32_t maxrepl,
                                 rmm::cuda_stream_view stream,
-                                rmm::mr::device_memory_resource* mr);
+                                rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::strings::replace(strings_column_view const&, strings_column_view const&,
- * strings_column_view const&, rmm::cuda_stream_view, rmm::mr::device_memory_resource*)
+ * strings_column_view const&, rmm::cuda_stream_view, rmm::device_async_resource_ref)
  */
 std::unique_ptr<column> replace(strings_column_view const& strings,
                                 strings_column_view const& targets,
                                 strings_column_view const& repls,
                                 rmm::cuda_stream_view stream,
-                                rmm::mr::device_memory_resource* mr);
+                                rmm::device_async_resource_ref mr);
 
 /**
  * @brief Replaces any null string entries with the given string.
@@ -68,18 +69,18 @@ std::unique_ptr<column> replace(strings_column_view const& strings,
 std::unique_ptr<column> replace_nulls(strings_column_view const& strings,
                                       string_scalar const& repl,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr);
+                                      rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::strings::replace_slice(strings_column_view const&, string_scalar const&,
- * size_type, size_type, rmm::cuda_stream_view, rmm::mr::device_memory_resource*)
+ * size_type, size_type, rmm::cuda_stream_view, rmm::device_async_resource_ref)
  */
 std::unique_ptr<column> replace_slice(strings_column_view const& strings,
                                       string_scalar const& repl,
                                       size_type start,
                                       size_type stop,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr);
+                                      rmm::device_async_resource_ref mr);
 
 /**
  * @brief Return a copy of `input` replacing any `values_to_replace[i]`
@@ -97,7 +98,7 @@ std::unique_ptr<cudf::column> find_and_replace_all(
   cudf::strings_column_view const& values_to_replace,
   cudf::strings_column_view const& replacement_values,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr);
+  rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/detail/scan.hpp b/cpp/include/cudf/strings/detail/scan.hpp
index 611e32e28cd..f32afa64a72 100644
--- a/cpp/include/cudf/strings/detail/scan.hpp
+++ b/cpp/include/cudf/strings/detail/scan.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace strings {
@@ -40,7 +41,7 @@ template <typename Op>
 std::unique_ptr<column> scan_inclusive(column_view const& input,
                                        bitmask_type const* mask,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr);
+                                       rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/detail/scatter.cuh b/cpp/include/cudf/strings/detail/scatter.cuh
index 8b8c11dcd5c..87f0e7ae47c 100644
--- a/cpp/include/cudf/strings/detail/scatter.cuh
+++ b/cpp/include/cudf/strings/detail/scatter.cuh
@@ -24,6 +24,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/distance.h>
@@ -63,7 +64,7 @@ std::unique_ptr<column> scatter(SourceIterator begin,
                                 MapIterator scatter_map,
                                 strings_column_view const& target,
                                 rmm::cuda_stream_view stream,
-                                rmm::mr::device_memory_resource* mr)
+                                rmm::device_async_resource_ref mr)
 {
   if (target.is_empty()) return make_empty_column(type_id::STRING);
 
diff --git a/cpp/include/cudf/strings/detail/strings_children.cuh b/cpp/include/cudf/strings/detail/strings_children.cuh
index 49c4be88ca5..7136df325f4 100644
--- a/cpp/include/cudf/strings/detail/strings_children.cuh
+++ b/cpp/include/cudf/strings/detail/strings_children.cuh
@@ -23,6 +23,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/for_each.h>
 #include <thrust/iterator/counting_iterator.h>
@@ -56,7 +57,7 @@ auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn,
                            size_type exec_size,
                            size_type strings_count,
                            rmm::cuda_stream_view stream,
-                           rmm::mr::device_memory_resource* mr)
+                           rmm::device_async_resource_ref mr)
 {
   auto offsets_column = make_numeric_column(
     data_type{type_to_id<size_type>()}, strings_count + 1, mask_state::UNALLOCATED, stream, mr);
@@ -116,7 +117,7 @@ template <typename SizeAndExecuteFunction>
 auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn,
                            size_type strings_count,
                            rmm::cuda_stream_view stream,
-                           rmm::mr::device_memory_resource* mr)
+                           rmm::device_async_resource_ref mr)
 {
   return make_strings_children(size_and_exec_fn, strings_count, strings_count, stream, mr);
 }
@@ -142,7 +143,7 @@ std::pair<std::unique_ptr<column>, int64_t> make_offsets_child_column(
   InputIterator begin,
   InputIterator end,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   auto constexpr size_type_max = static_cast<int64_t>(std::numeric_limits<size_type>::max());
   auto const lcount            = static_cast<int64_t>(std::distance(begin, end));
diff --git a/cpp/include/cudf/strings/detail/strings_column_factories.cuh b/cpp/include/cudf/strings/detail/strings_column_factories.cuh
index 8e19f08a5cc..079b6a73e0b 100644
--- a/cpp/include/cudf/strings/detail/strings_column_factories.cuh
+++ b/cpp/include/cudf/strings/detail/strings_column_factories.cuh
@@ -25,6 +25,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/copy.h>
@@ -73,7 +74,7 @@ template <typename IndexPairIterator>
 std::unique_ptr<column> make_strings_column(IndexPairIterator begin,
                                             IndexPairIterator end,
                                             rmm::cuda_stream_view stream,
-                                            rmm::mr::device_memory_resource* mr)
+                                            rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   size_type strings_count = thrust::distance(begin, end);
@@ -163,7 +164,7 @@ std::unique_ptr<column> make_strings_column(CharIterator chars_begin,
                                             size_type null_count,
                                             rmm::device_buffer&& null_mask,
                                             rmm::cuda_stream_view stream,
-                                            rmm::mr::device_memory_resource* mr)
+                                            rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   size_type strings_count = thrust::distance(offsets_begin, offsets_end) - 1;
diff --git a/cpp/include/cudf/strings/detail/utilities.hpp b/cpp/include/cudf/strings/detail/utilities.hpp
index cf9a13e9742..4467a9d0023 100644
--- a/cpp/include/cudf/strings/detail/utilities.hpp
+++ b/cpp/include/cudf/strings/detail/utilities.hpp
@@ -22,6 +22,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace strings {
@@ -43,7 +44,7 @@ namespace detail {
 std::unique_ptr<column> create_offsets_child_column(int64_t chars_bytes,
                                                     size_type count,
                                                     rmm::cuda_stream_view stream,
-                                                    rmm::mr::device_memory_resource* mr);
+                                                    rmm::device_async_resource_ref mr);
 
 /**
  * @brief Creates a string_view vector from a strings column.
@@ -56,7 +57,7 @@ std::unique_ptr<column> create_offsets_child_column(int64_t chars_bytes,
 rmm::device_uvector<string_view> create_string_vector_from_column(
   cudf::strings_column_view const strings,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr);
+  rmm::device_async_resource_ref mr);
 
 /**
  * @brief Return the threshold size for a strings column to use int64 offsets
diff --git a/cpp/include/cudf/strings/extract.hpp b/cpp/include/cudf/strings/extract.hpp
index a4db1ac46da..4138e1e59d5 100644
--- a/cpp/include/cudf/strings/extract.hpp
+++ b/cpp/include/cudf/strings/extract.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
 #include <cudf/table/table.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace strings {
@@ -62,8 +63,8 @@ struct regex_program;
 std::unique_ptr<table> extract(
   strings_column_view const& input,
   regex_program const& prog,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns a lists column of strings where each string column row corresponds to the
@@ -98,8 +99,8 @@ std::unique_ptr<table> extract(
 std::unique_ptr<column> extract_all_record(
   strings_column_view const& input,
   regex_program const& prog,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/find.hpp b/cpp/include/cudf/strings/find.hpp
index c1aa8b294b3..c116dbc2fe1 100644
--- a/cpp/include/cudf/strings/find.hpp
+++ b/cpp/include/cudf/strings/find.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
 #include <cudf/strings/strings_column_view.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace strings {
@@ -55,10 +56,10 @@ namespace strings {
 std::unique_ptr<column> find(
   strings_column_view const& input,
   string_scalar const& target,
-  size_type start                     = 0,
-  size_type stop                      = -1,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  size_type start                   = 0,
+  size_type stop                    = -1,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns a column of character position values where the target
@@ -86,10 +87,10 @@ std::unique_ptr<column> find(
 std::unique_ptr<column> rfind(
   strings_column_view const& input,
   string_scalar const& target,
-  size_type start                     = 0,
-  size_type stop                      = -1,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  size_type start                   = 0,
+  size_type stop                    = -1,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns a column of character position values where the target
@@ -114,9 +115,9 @@ std::unique_ptr<column> rfind(
 std::unique_ptr<column> find(
   strings_column_view const& input,
   strings_column_view const& target,
-  size_type start                     = 0,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  size_type start                   = 0,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns a column of boolean values for each string where true indicates
@@ -136,8 +137,8 @@ std::unique_ptr<column> find(
 std::unique_ptr<column> contains(
   strings_column_view const& input,
   string_scalar const& target,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns a column of boolean values for each string where true indicates
@@ -161,8 +162,8 @@ std::unique_ptr<column> contains(
 std::unique_ptr<column> contains(
   strings_column_view const& input,
   strings_column_view const& targets,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns a column of boolean values for each string where true indicates
@@ -183,8 +184,8 @@ std::unique_ptr<column> contains(
 std::unique_ptr<column> starts_with(
   strings_column_view const& input,
   string_scalar const& target,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns a column of boolean values for each string where true indicates
@@ -209,8 +210,8 @@ std::unique_ptr<column> starts_with(
 std::unique_ptr<column> starts_with(
   strings_column_view const& input,
   strings_column_view const& targets,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns a column of boolean values for each string where true indicates
@@ -231,8 +232,8 @@ std::unique_ptr<column> starts_with(
 std::unique_ptr<column> ends_with(
   strings_column_view const& input,
   string_scalar const& target,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns a column of boolean values for each string where true indicates
@@ -257,8 +258,8 @@ std::unique_ptr<column> ends_with(
 std::unique_ptr<column> ends_with(
   strings_column_view const& input,
   strings_column_view const& targets,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 /** @} */  // end of doxygen group
 }  // namespace strings
 }  // namespace cudf
diff --git a/cpp/include/cudf/strings/find_multiple.hpp b/cpp/include/cudf/strings/find_multiple.hpp
index 06b851c5012..c2e82aa6f1a 100644
--- a/cpp/include/cudf/strings/find_multiple.hpp
+++ b/cpp/include/cudf/strings/find_multiple.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
 #include <cudf/strings/strings_column_view.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace strings {
@@ -57,8 +58,8 @@ namespace strings {
 std::unique_ptr<column> find_multiple(
   strings_column_view const& input,
   strings_column_view const& targets,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/findall.hpp b/cpp/include/cudf/strings/findall.hpp
index 379b9624dc6..abc1d28ee4c 100644
--- a/cpp/include/cudf/strings/findall.hpp
+++ b/cpp/include/cudf/strings/findall.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
 #include <cudf/table/table.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace strings {
@@ -64,8 +65,8 @@ struct regex_program;
 std::unique_ptr<column> findall(
   strings_column_view const& input,
   regex_program const& prog,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/padding.hpp b/cpp/include/cudf/strings/padding.hpp
index f0cb351eeda..f1382d6ea29 100644
--- a/cpp/include/cudf/strings/padding.hpp
+++ b/cpp/include/cudf/strings/padding.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,7 @@
 #include <cudf/strings/strings_column_view.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace strings {
@@ -58,10 +59,10 @@ namespace strings {
 std::unique_ptr<column> pad(
   strings_column_view const& input,
   size_type width,
-  side_type side                      = side_type::RIGHT,
-  std::string_view fill_char          = " ",
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  side_type side                    = side_type::RIGHT,
+  std::string_view fill_char        = " ",
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Add '0' as padding to the left of each string.
@@ -90,8 +91,8 @@ std::unique_ptr<column> pad(
 std::unique_ptr<column> zfill(
   strings_column_view const& input,
   size_type width,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/repeat_strings.hpp b/cpp/include/cudf/strings/repeat_strings.hpp
index 7dc9c33f579..cbf1edc8331 100644
--- a/cpp/include/cudf/strings/repeat_strings.hpp
+++ b/cpp/include/cudf/strings/repeat_strings.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
 #include <cudf/strings/strings_column_view.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace strings {
@@ -59,8 +60,8 @@ namespace strings {
 std::unique_ptr<string_scalar> repeat_string(
   string_scalar const& input,
   size_type repeat_times,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Repeat each string in the given strings column a given number of times
@@ -90,8 +91,8 @@ std::unique_ptr<string_scalar> repeat_string(
 std::unique_ptr<column> repeat_strings(
   strings_column_view const& input,
   size_type repeat_times,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Repeat each string in the given strings column by the numbers of times given in another
@@ -127,8 +128,8 @@ std::unique_ptr<column> repeat_strings(
 std::unique_ptr<column> repeat_strings(
   strings_column_view const& input,
   column_view const& repeat_times,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/replace.hpp b/cpp/include/cudf/strings/replace.hpp
index 2476a41e886..9525db44b69 100644
--- a/cpp/include/cudf/strings/replace.hpp
+++ b/cpp/include/cudf/strings/replace.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
 #include <cudf/strings/strings_column_view.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace strings {
@@ -67,9 +68,9 @@ std::unique_ptr<column> replace(
   strings_column_view const& input,
   string_scalar const& target,
   string_scalar const& repl,
-  cudf::size_type maxrepl             = -1,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  cudf::size_type maxrepl           = -1,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief This function replaces each string in the column with the provided
@@ -107,11 +108,11 @@ std::unique_ptr<column> replace(
  */
 std::unique_ptr<column> replace_slice(
   strings_column_view const& input,
-  string_scalar const& repl           = string_scalar(""),
-  size_type start                     = 0,
-  size_type stop                      = -1,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  string_scalar const& repl         = string_scalar(""),
+  size_type start                   = 0,
+  size_type stop                    = -1,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Replaces substrings matching a list of targets with the corresponding
@@ -156,8 +157,8 @@ std::unique_ptr<column> replace(
   strings_column_view const& input,
   strings_column_view const& targets,
   strings_column_view const& repls,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/replace_re.hpp b/cpp/include/cudf/strings/replace_re.hpp
index 77db2882253..f61f9585144 100644
--- a/cpp/include/cudf/strings/replace_re.hpp
+++ b/cpp/include/cudf/strings/replace_re.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,7 @@
 #include <cudf/strings/strings_column_view.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <optional>
 
@@ -59,7 +60,7 @@ std::unique_ptr<column> replace_re(
   string_scalar const& replacement           = string_scalar(""),
   std::optional<size_type> max_replace_count = std::nullopt,
   rmm::cuda_stream_view stream               = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr        = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr          = rmm::mr::get_current_device_resource());
 
 /**
  * @brief For each string, replaces any character sequence matching the given patterns
@@ -81,9 +82,9 @@ std::unique_ptr<column> replace_re(
   strings_column_view const& input,
   std::vector<std::string> const& patterns,
   strings_column_view const& replacements,
-  regex_flags const flags             = regex_flags::DEFAULT,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  regex_flags const flags           = regex_flags::DEFAULT,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief For each string, replaces any character sequence matching the given regex
@@ -107,8 +108,8 @@ std::unique_ptr<column> replace_with_backrefs(
   strings_column_view const& input,
   regex_program const& prog,
   std::string_view replacement,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 }  // namespace strings
 }  // namespace cudf
diff --git a/cpp/include/cudf/strings/reverse.hpp b/cpp/include/cudf/strings/reverse.hpp
index 4fc8fbf67c2..86656693c8b 100644
--- a/cpp/include/cudf/strings/reverse.hpp
+++ b/cpp/include/cudf/strings/reverse.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
 #include <cudf/strings/strings_column_view.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace strings {
@@ -47,8 +48,8 @@ namespace strings {
  */
 std::unique_ptr<column> reverse(
   strings_column_view const& input,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/slice.hpp b/cpp/include/cudf/strings/slice.hpp
index f106663be9b..e2be6abd344 100644
--- a/cpp/include/cudf/strings/slice.hpp
+++ b/cpp/include/cudf/strings/slice.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
 #include <cudf/strings/strings_column_view.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace strings {
@@ -64,7 +65,7 @@ std::unique_ptr<column> slice_strings(
   numeric_scalar<size_type> const& stop  = numeric_scalar<size_type>(0, false),
   numeric_scalar<size_type> const& step  = numeric_scalar<size_type>(1),
   rmm::cuda_stream_view stream           = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr    = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr      = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns a new strings column that contains substrings of the
@@ -108,8 +109,8 @@ std::unique_ptr<column> slice_strings(
   strings_column_view const& input,
   column_view const& starts,
   column_view const& stops,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/split/partition.hpp b/cpp/include/cudf/strings/split/partition.hpp
index 25eedf1e86b..0a837034ba1 100644
--- a/cpp/include/cudf/strings/split/partition.hpp
+++ b/cpp/include/cudf/strings/split/partition.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
 #include <cudf/table/table.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace strings {
@@ -60,9 +61,9 @@ namespace strings {
  */
 std::unique_ptr<table> partition(
   strings_column_view const& input,
-  string_scalar const& delimiter      = string_scalar(""),
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  string_scalar const& delimiter    = string_scalar(""),
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns a set of 3 columns by splitting each string using the
@@ -94,9 +95,9 @@ std::unique_ptr<table> partition(
  */
 std::unique_ptr<table> rpartition(
   strings_column_view const& input,
-  string_scalar const& delimiter      = string_scalar(""),
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  string_scalar const& delimiter    = string_scalar(""),
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/split/split.hpp b/cpp/include/cudf/strings/split/split.hpp
index a34a59577a0..d5c44406ca7 100644
--- a/cpp/include/cudf/strings/split/split.hpp
+++ b/cpp/include/cudf/strings/split/split.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
 #include <cudf/table/table.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace strings {
@@ -54,10 +55,10 @@ namespace strings {
  */
 std::unique_ptr<table> split(
   strings_column_view const& strings_column,
-  string_scalar const& delimiter      = string_scalar(""),
-  size_type maxsplit                  = -1,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  string_scalar const& delimiter    = string_scalar(""),
+  size_type maxsplit                = -1,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns a list of columns by splitting each string using the
@@ -84,10 +85,10 @@ std::unique_ptr<table> split(
  */
 std::unique_ptr<table> rsplit(
   strings_column_view const& strings_column,
-  string_scalar const& delimiter      = string_scalar(""),
-  size_type maxsplit                  = -1,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  string_scalar const& delimiter    = string_scalar(""),
+  size_type maxsplit                = -1,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Splits individual strings elements into a list of strings.
@@ -158,10 +159,10 @@ std::unique_ptr<table> rsplit(
  */
 std::unique_ptr<column> split_record(
   strings_column_view const& strings,
-  string_scalar const& delimiter      = string_scalar(""),
-  size_type maxsplit                  = -1,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  string_scalar const& delimiter    = string_scalar(""),
+  size_type maxsplit                = -1,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief  Splits individual strings elements into a list of strings starting
@@ -237,10 +238,10 @@ std::unique_ptr<column> split_record(
  */
 std::unique_ptr<column> rsplit_record(
   strings_column_view const& strings,
-  string_scalar const& delimiter      = string_scalar(""),
-  size_type maxsplit                  = -1,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  string_scalar const& delimiter    = string_scalar(""),
+  size_type maxsplit                = -1,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/split/split_re.hpp b/cpp/include/cudf/strings/split/split_re.hpp
index f1736cb7e0c..81595fa7ed4 100644
--- a/cpp/include/cudf/strings/split/split_re.hpp
+++ b/cpp/include/cudf/strings/split/split_re.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
 #include <cudf/table/table.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace strings {
@@ -82,9 +83,9 @@ struct regex_program;
 std::unique_ptr<table> split_re(
   strings_column_view const& input,
   regex_program const& prog,
-  size_type maxsplit                  = -1,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  size_type maxsplit                = -1,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Splits strings elements into a table of strings columns using a
@@ -138,9 +139,9 @@ std::unique_ptr<table> split_re(
 std::unique_ptr<table> rsplit_re(
   strings_column_view const& input,
   regex_program const& prog,
-  size_type maxsplit                  = -1,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  size_type maxsplit                = -1,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Splits strings elements into a list column of strings
@@ -196,9 +197,9 @@ std::unique_ptr<table> rsplit_re(
 std::unique_ptr<column> split_record_re(
   strings_column_view const& input,
   regex_program const& prog,
-  size_type maxsplit                  = -1,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  size_type maxsplit                = -1,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Splits strings elements into a list column of strings using the given
@@ -256,9 +257,9 @@ std::unique_ptr<column> split_record_re(
 std::unique_ptr<column> rsplit_record_re(
   strings_column_view const& input,
   regex_program const& prog,
-  size_type maxsplit                  = -1,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  size_type maxsplit                = -1,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/strip.hpp b/cpp/include/cudf/strings/strip.hpp
index 556d6805ac3..6fb9bbc45e6 100644
--- a/cpp/include/cudf/strings/strip.hpp
+++ b/cpp/include/cudf/strings/strip.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,7 @@
 #include <cudf/strings/strings_column_view.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace strings {
@@ -63,10 +64,10 @@ namespace strings {
  */
 std::unique_ptr<column> strip(
   strings_column_view const& input,
-  side_type side                      = side_type::BOTH,
-  string_scalar const& to_strip       = string_scalar(""),
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  side_type side                    = side_type::BOTH,
+  string_scalar const& to_strip     = string_scalar(""),
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/translate.hpp b/cpp/include/cudf/strings/translate.hpp
index 4bd09352b09..9cd6b7d5974 100644
--- a/cpp/include/cudf/strings/translate.hpp
+++ b/cpp/include/cudf/strings/translate.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,7 @@
 #include <cudf/strings/strings_column_view.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <vector>
 
@@ -56,8 +57,8 @@ namespace strings {
 std::unique_ptr<column> translate(
   strings_column_view const& input,
   std::vector<std::pair<char_utf8, char_utf8>> const& chars_table,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Removes or keeps the specified character ranges in cudf::strings::filter_characters
@@ -101,10 +102,10 @@ enum class filter_type : bool {
 std::unique_ptr<column> filter_characters(
   strings_column_view const& input,
   std::vector<std::pair<cudf::char_utf8, cudf::char_utf8>> characters_to_filter,
-  filter_type keep_characters         = filter_type::KEEP,
-  string_scalar const& replacement    = string_scalar(""),
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  filter_type keep_characters       = filter_type::KEEP,
+  string_scalar const& replacement  = string_scalar(""),
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/wrap.hpp b/cpp/include/cudf/strings/wrap.hpp
index efdc3e62aff..c05c33fbac8 100644
--- a/cpp/include/cudf/strings/wrap.hpp
+++ b/cpp/include/cudf/strings/wrap.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
 #include <cudf/strings/strings_column_view.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace strings {
@@ -66,8 +67,8 @@ namespace strings {
 std::unique_ptr<column> wrap(
   strings_column_view const& input,
   size_type width,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/structs/detail/concatenate.hpp b/cpp/include/cudf/structs/detail/concatenate.hpp
index 82ccca188e2..5dc3169c0c4 100644
--- a/cpp/include/cudf/structs/detail/concatenate.hpp
+++ b/cpp/include/cudf/structs/detail/concatenate.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,8 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/span.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 namespace cudf {
 namespace structs {
 namespace detail {
@@ -50,7 +52,7 @@ namespace detail {
  */
 std::unique_ptr<column> concatenate(host_span<column_view const> columns,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr);
+                                    rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace structs
diff --git a/cpp/include/cudf/structs/detail/scan.hpp b/cpp/include/cudf/structs/detail/scan.hpp
index 531e0a6c65f..c97a8452ecd 100644
--- a/cpp/include/cudf/structs/detail/scan.hpp
+++ b/cpp/include/cudf/structs/detail/scan.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace structs {
@@ -38,7 +39,7 @@ namespace detail {
 template <typename Op>
 std::unique_ptr<column> scan_inclusive(column_view const& input,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr);
+                                       rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace structs
diff --git a/cpp/include/cudf/table/table.hpp b/cpp/include/cudf/table/table.hpp
index 439b02c2d53..8efe6eb8c72 100644
--- a/cpp/include/cudf/table/table.hpp
+++ b/cpp/include/cudf/table/table.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <memory>
 #include <vector>
@@ -56,8 +57,8 @@ class table {
    * @param mr Device memory resource to use for all device memory allocations
    */
   explicit table(table const& other,
-                 rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-                 rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+                 rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+                 rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
   /**
    * @brief Moves the contents from a vector of `unique_ptr`s to columns to
    * construct a new table.
@@ -75,8 +76,8 @@ class table {
    * @param mr Device memory resource used for allocating the device memory for the new columns
    */
   table(table_view view,
-        rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-        rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+        rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+        rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
   /**
    * @brief Returns the number of columns in the table
diff --git a/cpp/include/cudf/timezone.hpp b/cpp/include/cudf/timezone.hpp
index 56678c73811..7f65128526e 100644
--- a/cpp/include/cudf/timezone.hpp
+++ b/cpp/include/cudf/timezone.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,6 +16,7 @@
 #pragma once
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <memory>
 #include <optional>
@@ -49,6 +50,6 @@ static constexpr uint32_t solar_cycle_entry_count = 2 * solar_cycle_years;
 std::unique_ptr<table> make_timezone_transition_table(
   std::optional<std::string_view> tzif_dir,
   std::string_view timezone_name,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 }  // namespace cudf
diff --git a/cpp/include/cudf/transform.hpp b/cpp/include/cudf/transform.hpp
index 49ec3d7c0d5..7bb9fb7a42e 100644
--- a/cpp/include/cudf/transform.hpp
+++ b/cpp/include/cudf/transform.hpp
@@ -20,6 +20,7 @@
 #include <cudf/types.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <memory>
 
@@ -54,7 +55,7 @@ std::unique_ptr<column> transform(
   std::string const& unary_udf,
   data_type output_type,
   bool is_ptx,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Creates a null_mask from `input` by converting `NaN` to null and
@@ -69,7 +70,7 @@ std::unique_ptr<column> transform(
  */
 std::pair<std::unique_ptr<rmm::device_buffer>, size_type> nans_to_nulls(
   column_view const& input,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Compute a new column by evaluating an expression tree on a table.
@@ -87,7 +88,7 @@ std::pair<std::unique_ptr<rmm::device_buffer>, size_type> nans_to_nulls(
 std::unique_ptr<column> compute_column(
   table_view const& table,
   ast::expression const& expr,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Creates a bitmask from a column of boolean elements.
@@ -106,7 +107,7 @@ std::unique_ptr<column> compute_column(
  */
 std::pair<std::unique_ptr<rmm::device_buffer>, cudf::size_type> bools_to_mask(
   column_view const& input,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Encode the rows of the given table as integers
@@ -134,7 +135,7 @@ std::pair<std::unique_ptr<rmm::device_buffer>, cudf::size_type> bools_to_mask(
  */
 std::pair<std::unique_ptr<cudf::table>, std::unique_ptr<cudf::column>> encode(
   cudf::table_view const& input,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Encodes `input` by generating a new column for each value in `categories` indicating the
@@ -166,7 +167,7 @@ std::pair<std::unique_ptr<cudf::table>, std::unique_ptr<cudf::column>> encode(
 std::pair<std::unique_ptr<column>, table_view> one_hot_encode(
   column_view const& input,
   column_view const& categories,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Creates a boolean column from given bitmask.
@@ -193,7 +194,7 @@ std::unique_ptr<column> mask_to_bools(
   bitmask_type const* bitmask,
   size_type begin_bit,
   size_type end_bit,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns an approximate cumulative size in bits of all columns in the `table_view` for
@@ -221,8 +222,7 @@ std::unique_ptr<column> mask_to_bools(
  * @return A 32-bit integer column containing the per-row bit counts
  */
 std::unique_ptr<column> row_bit_count(
-  table_view const& t,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  table_view const& t, rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns an approximate cumulative size in bits of all columns in the `table_view` for
@@ -245,7 +245,7 @@ std::unique_ptr<column> row_bit_count(
 std::unique_ptr<column> segmented_row_bit_count(
   table_view const& t,
   size_type segment_length,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
 }  // namespace cudf
diff --git a/cpp/include/cudf/transpose.hpp b/cpp/include/cudf/transpose.hpp
index e5d083ae7b3..c01a04afe87 100644
--- a/cpp/include/cudf/transpose.hpp
+++ b/cpp/include/cudf/transpose.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
 #include <cudf/table/table_view.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 /**
@@ -44,7 +45,7 @@ namespace cudf {
  */
 std::pair<std::unique_ptr<column>, table_view> transpose(
   table_view const& input,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
 }  // namespace cudf
diff --git a/cpp/include/cudf/unary.hpp b/cpp/include/cudf/unary.hpp
index 5ded22488c7..74c8bc67d3a 100644
--- a/cpp/include/cudf/unary.hpp
+++ b/cpp/include/cudf/unary.hpp
@@ -22,6 +22,7 @@
 #include <cudf/utilities/traits.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <memory>
 
@@ -147,8 +148,8 @@ enum class unary_operator : int32_t {
 std::unique_ptr<cudf::column> unary_operation(
   cudf::column_view const& input,
   cudf::unary_operator op,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Creates a column of `type_id::BOOL8` elements where for every element in `input` `true`
@@ -163,8 +164,8 @@ std::unique_ptr<cudf::column> unary_operation(
  */
 std::unique_ptr<cudf::column> is_null(
   cudf::column_view const& input,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Creates a column of `type_id::BOOL8` elements where for every element in `input` `true`
@@ -179,8 +180,8 @@ std::unique_ptr<cudf::column> is_null(
  */
 std::unique_ptr<cudf::column> is_valid(
   cudf::column_view const& input,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief  Casts data from dtype specified in input to dtype specified in output.
@@ -198,8 +199,8 @@ std::unique_ptr<cudf::column> is_valid(
 std::unique_ptr<column> cast(
   column_view const& input,
   data_type out_type,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Creates a column of `type_id::BOOL8` elements indicating the presence of `NaN` values
@@ -216,8 +217,8 @@ std::unique_ptr<column> cast(
  */
 std::unique_ptr<column> is_nan(
   cudf::column_view const& input,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Creates a column of `type_id::BOOL8` elements indicating the absence of `NaN` values
@@ -235,8 +236,8 @@ std::unique_ptr<column> is_nan(
  */
 std::unique_ptr<column> is_not_nan(
   cudf::column_view const& input,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
 }  // namespace cudf
diff --git a/cpp/include/cudf_test/base_fixture.hpp b/cpp/include/cudf_test/base_fixture.hpp
index 14b94e061ae..18f75bbc842 100644
--- a/cpp/include/cudf_test/base_fixture.hpp
+++ b/cpp/include/cudf_test/base_fixture.hpp
@@ -23,6 +23,7 @@
 
 #include <rmm/mr/device/device_memory_resource.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace test {
@@ -36,7 +37,7 @@ namespace test {
  * ```
  */
 class BaseFixture : public ::testing::Test {
-  rmm::mr::device_memory_resource* _mr{rmm::mr::get_current_device_resource()};
+  rmm::device_async_resource_ref _mr{rmm::mr::get_current_device_resource()};
 
  public:
   /**
@@ -44,7 +45,7 @@ class BaseFixture : public ::testing::Test {
    * all tests inheriting from this fixture
    * @return pointer to memory resource
    */
-  rmm::mr::device_memory_resource* mr() { return _mr; }
+  rmm::device_async_resource_ref mr() { return _mr; }
 };
 
 /**
@@ -57,7 +58,7 @@ class BaseFixture : public ::testing::Test {
  */
 template <typename T>
 class BaseFixtureWithParam : public ::testing::TestWithParam<T> {
-  rmm::mr::device_memory_resource* _mr{rmm::mr::get_current_device_resource()};
+  rmm::device_async_resource_ref _mr{rmm::mr::get_current_device_resource()};
 
  public:
   /**
@@ -65,7 +66,7 @@ class BaseFixtureWithParam : public ::testing::TestWithParam<T> {
    * all tests inheriting from this fixture
    * @return pointer to memory resource
    */
-  rmm::mr::device_memory_resource* mr() const { return _mr; }
+  rmm::device_async_resource_ref mr() const { return _mr; }
 };
 
 /**
diff --git a/cpp/include/nvtext/byte_pair_encoding.hpp b/cpp/include/nvtext/byte_pair_encoding.hpp
index 4d6d8335eac..375d44e367a 100644
--- a/cpp/include/nvtext/byte_pair_encoding.hpp
+++ b/cpp/include/nvtext/byte_pair_encoding.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,8 @@
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 namespace nvtext {
 
 /**
@@ -45,8 +47,8 @@ struct bpe_merge_pairs {
    * @param mr Device memory resource used to allocate the device memory
    */
   bpe_merge_pairs(std::unique_ptr<cudf::column>&& input,
-                  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-                  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+                  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+                  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
   /**
    * @brief Construct a new bpe merge pairs object
@@ -56,8 +58,8 @@ struct bpe_merge_pairs {
    * @param mr Device memory resource used to allocate the device memory
    */
   bpe_merge_pairs(cudf::strings_column_view const& input,
-                  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-                  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+                  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+                  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
   ~bpe_merge_pairs();
   bpe_merge_pairs();
@@ -94,8 +96,8 @@ struct bpe_merge_pairs {
  */
 std::unique_ptr<bpe_merge_pairs> load_merge_pairs(
   cudf::strings_column_view const& merge_pairs,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Byte pair encode the input strings.
@@ -127,7 +129,7 @@ std::unique_ptr<cudf::column> byte_pair_encoding(
   cudf::strings_column_view const& input,
   bpe_merge_pairs const& merges_pairs,
   cudf::string_scalar const& separator = cudf::string_scalar(" "),
-  rmm::mr::device_memory_resource* mr  = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr    = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
 }  // namespace nvtext
diff --git a/cpp/include/nvtext/detail/generate_ngrams.hpp b/cpp/include/nvtext/detail/generate_ngrams.hpp
index 835124141d4..c4b89b6d495 100644
--- a/cpp/include/nvtext/detail/generate_ngrams.hpp
+++ b/cpp/include/nvtext/detail/generate_ngrams.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,20 +18,21 @@
 #include <nvtext/generate_ngrams.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace nvtext {
 namespace detail {
 
 /**
  * @copydoc hash_character_ngrams(cudf::strings_column_view const&,
- * cudf::size_type, rmm::mr::device_memory_resource*)
+ * cudf::size_type, rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for allocating/copying device memory and launching kernels
  */
 std::unique_ptr<cudf::column> hash_character_ngrams(cudf::strings_column_view const& strings,
                                                     cudf::size_type ngrams,
                                                     rmm::cuda_stream_view stream,
-                                                    rmm::mr::device_memory_resource* mr);
+                                                    rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace nvtext
diff --git a/cpp/include/nvtext/detail/load_hash_file.hpp b/cpp/include/nvtext/detail/load_hash_file.hpp
index f4107adb07e..0c27981f80b 100644
--- a/cpp/include/nvtext/detail/load_hash_file.hpp
+++ b/cpp/include/nvtext/detail/load_hash_file.hpp
@@ -20,6 +20,7 @@
 #include <nvtext/subword_tokenize.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cstdint>
 #include <cstring>
@@ -43,7 +44,7 @@ namespace detail {
 std::unique_ptr<hashed_vocabulary> load_vocabulary_file(
   std::string const& filename_hashed_vocabulary,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr);
+  rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace nvtext
diff --git a/cpp/include/nvtext/detail/tokenize.hpp b/cpp/include/nvtext/detail/tokenize.hpp
index 80a6edc496b..d48027e4631 100644
--- a/cpp/include/nvtext/detail/tokenize.hpp
+++ b/cpp/include/nvtext/detail/tokenize.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,52 +21,53 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace nvtext {
 namespace detail {
 /**
  * @copydoc nvtext::tokenize(strings_column_view const&,string_scalar
- * const&,rmm::mr::device_memory_resource*)
+ * const&,rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches
  */
 std::unique_ptr<cudf::column> tokenize(cudf::strings_column_view const& strings,
                                        cudf::string_scalar const& delimiter,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr);
+                                       rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc nvtext::tokenize(strings_column_view const&,strings_column_view
- * const&,rmm::mr::device_memory_resource*)
+ * const&,rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches
  */
 std::unique_ptr<cudf::column> tokenize(cudf::strings_column_view const& strings,
                                        cudf::strings_column_view const& delimiters,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr);
+                                       rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc nvtext::count_tokens(strings_column_view const&, string_scalar
- * const&,rmm::mr::device_memory_resource*)
+ * const&,rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches
  */
 std::unique_ptr<cudf::column> count_tokens(cudf::strings_column_view const& strings,
                                            cudf::string_scalar const& delimiter,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr);
+                                           rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc nvtext::count_tokens(strings_column_view const&,strings_column_view
- * const&,rmm::mr::device_memory_resource*)
+ * const&,rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches
  */
 std::unique_ptr<cudf::column> count_tokens(cudf::strings_column_view const& strings,
                                            cudf::strings_column_view const& delimiters,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr);
+                                           rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace nvtext
diff --git a/cpp/include/nvtext/edit_distance.hpp b/cpp/include/nvtext/edit_distance.hpp
index 9a24662455b..bfdfb4d1a1c 100644
--- a/cpp/include/nvtext/edit_distance.hpp
+++ b/cpp/include/nvtext/edit_distance.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,6 +19,8 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 //! NVText APIs
 namespace nvtext {
 /**
@@ -60,8 +62,8 @@ namespace nvtext {
 std::unique_ptr<cudf::column> edit_distance(
   cudf::strings_column_view const& input,
   cudf::strings_column_view const& targets,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Compute the edit distance between all the strings in the input column.
@@ -98,8 +100,8 @@ std::unique_ptr<cudf::column> edit_distance(
  */
 std::unique_ptr<cudf::column> edit_distance_matrix(
   cudf::strings_column_view const& input,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
 }  // namespace nvtext
diff --git a/cpp/include/nvtext/generate_ngrams.hpp b/cpp/include/nvtext/generate_ngrams.hpp
index e3d667f0292..bebe2e46023 100644
--- a/cpp/include/nvtext/generate_ngrams.hpp
+++ b/cpp/include/nvtext/generate_ngrams.hpp
@@ -19,6 +19,8 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 namespace nvtext {
 /**
  * @addtogroup nvtext_ngrams
@@ -58,8 +60,8 @@ std::unique_ptr<cudf::column> generate_ngrams(
   cudf::strings_column_view const& input,
   cudf::size_type ngrams,
   cudf::string_scalar const& separator,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Generates ngrams of characters within each string
@@ -86,9 +88,9 @@ std::unique_ptr<cudf::column> generate_ngrams(
  */
 std::unique_ptr<cudf::column> generate_character_ngrams(
   cudf::strings_column_view const& input,
-  cudf::size_type ngrams              = 2,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  cudf::size_type ngrams            = 2,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Hashes ngrams of characters within each string
@@ -121,9 +123,9 @@ std::unique_ptr<cudf::column> generate_character_ngrams(
  */
 std::unique_ptr<cudf::column> hash_character_ngrams(
   cudf::strings_column_view const& input,
-  cudf::size_type ngrams              = 5,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  cudf::size_type ngrams            = 5,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
 }  // namespace nvtext
diff --git a/cpp/include/nvtext/jaccard.hpp b/cpp/include/nvtext/jaccard.hpp
index 19d6c111200..649c17f0b1c 100644
--- a/cpp/include/nvtext/jaccard.hpp
+++ b/cpp/include/nvtext/jaccard.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,6 +18,8 @@
 #include <cudf/column/column.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 namespace nvtext {
 /**
  * @addtogroup nvtext_jaccard
@@ -72,8 +74,8 @@ std::unique_ptr<cudf::column> jaccard_index(
   cudf::strings_column_view const& input1,
   cudf::strings_column_view const& input2,
   cudf::size_type width,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
 }  // namespace nvtext
diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp
index 47c625b5079..7d3f6059454 100644
--- a/cpp/include/nvtext/minhash.hpp
+++ b/cpp/include/nvtext/minhash.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,8 @@
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/span.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 namespace nvtext {
 /**
  * @addtogroup nvtext_minhash
@@ -53,7 +55,7 @@ std::unique_ptr<cudf::column> minhash(
   cudf::numeric_scalar<uint32_t> seed = 0,
   cudf::size_type width               = 4,
   rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr   = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns the minhash values for each string per seed
@@ -83,9 +85,9 @@ std::unique_ptr<cudf::column> minhash(
 std::unique_ptr<cudf::column> minhash(
   cudf::strings_column_view const& input,
   cudf::device_span<uint32_t const> seeds,
-  cudf::size_type width               = 4,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  cudf::size_type width             = 4,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns the minhash value for each string
@@ -114,7 +116,7 @@ std::unique_ptr<cudf::column> minhash64(
   cudf::numeric_scalar<uint64_t> seed = 0,
   cudf::size_type width               = 4,
   rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr   = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns the minhash values for each string per seed
@@ -144,9 +146,9 @@ std::unique_ptr<cudf::column> minhash64(
 std::unique_ptr<cudf::column> minhash64(
   cudf::strings_column_view const& input,
   cudf::device_span<uint64_t const> seeds,
-  cudf::size_type width               = 4,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  cudf::size_type width             = 4,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
 }  // namespace nvtext
diff --git a/cpp/include/nvtext/ngrams_tokenize.hpp b/cpp/include/nvtext/ngrams_tokenize.hpp
index 9d76ef8689f..09ce323a7ae 100644
--- a/cpp/include/nvtext/ngrams_tokenize.hpp
+++ b/cpp/include/nvtext/ngrams_tokenize.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,6 +19,8 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 namespace nvtext {
 /**
  * @addtogroup nvtext_ngrams
@@ -80,8 +82,8 @@ std::unique_ptr<cudf::column> ngrams_tokenize(
   cudf::size_type ngrams,
   cudf::string_scalar const& delimiter,
   cudf::string_scalar const& separator,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
 }  // namespace nvtext
diff --git a/cpp/include/nvtext/normalize.hpp b/cpp/include/nvtext/normalize.hpp
index 3cbff5c744b..e5967e78318 100644
--- a/cpp/include/nvtext/normalize.hpp
+++ b/cpp/include/nvtext/normalize.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,6 +18,8 @@
 #include <cudf/column/column.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 //! NVText APIs
 namespace nvtext {
 /**
@@ -51,8 +53,8 @@ namespace nvtext {
  */
 std::unique_ptr<cudf::column> normalize_spaces(
   cudf::strings_column_view const& input,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Normalizes strings characters for tokenizing.
@@ -102,8 +104,8 @@ std::unique_ptr<cudf::column> normalize_spaces(
 std::unique_ptr<cudf::column> normalize_characters(
   cudf::strings_column_view const& input,
   bool do_lower_case,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
 }  // namespace nvtext
diff --git a/cpp/include/nvtext/replace.hpp b/cpp/include/nvtext/replace.hpp
index 88cf7d41901..aac21346c72 100644
--- a/cpp/include/nvtext/replace.hpp
+++ b/cpp/include/nvtext/replace.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,6 +19,8 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 //! NVText APIs
 namespace nvtext {
 /**
@@ -88,7 +90,7 @@ std::unique_ptr<cudf::column> replace_tokens(
   cudf::strings_column_view const& replacements,
   cudf::string_scalar const& delimiter = cudf::string_scalar{""},
   rmm::cuda_stream_view stream         = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr  = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr    = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Removes tokens whose lengths are less than a specified number of characters.
@@ -137,7 +139,7 @@ std::unique_ptr<cudf::column> filter_tokens(
   cudf::string_scalar const& replacement = cudf::string_scalar{""},
   cudf::string_scalar const& delimiter   = cudf::string_scalar{""},
   rmm::cuda_stream_view stream           = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr    = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr      = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
 }  // namespace nvtext
diff --git a/cpp/include/nvtext/stemmer.hpp b/cpp/include/nvtext/stemmer.hpp
index 0e1759fdc5a..20b81aba661 100644
--- a/cpp/include/nvtext/stemmer.hpp
+++ b/cpp/include/nvtext/stemmer.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,6 +19,8 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 namespace nvtext {
 /**
  * @addtogroup nvtext_stemmer
@@ -79,8 +81,8 @@ std::unique_ptr<cudf::column> is_letter(
   cudf::strings_column_view const& input,
   letter_type ltype,
   cudf::size_type character_index,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns boolean column indicating if character at `indices[i]` of `input[i]`
@@ -132,8 +134,8 @@ std::unique_ptr<cudf::column> is_letter(
   cudf::strings_column_view const& input,
   letter_type ltype,
   cudf::column_view const& indices,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns the Porter Stemmer measurements of a strings column.
@@ -166,8 +168,8 @@ std::unique_ptr<cudf::column> is_letter(
  */
 std::unique_ptr<cudf::column> porter_stemmer_measure(
   cudf::strings_column_view const& input,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
 }  // namespace nvtext
diff --git a/cpp/include/nvtext/subword_tokenize.hpp b/cpp/include/nvtext/subword_tokenize.hpp
index 72a899d70b4..a4e06495a1d 100644
--- a/cpp/include/nvtext/subword_tokenize.hpp
+++ b/cpp/include/nvtext/subword_tokenize.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,6 +19,8 @@
 #include <cudf/column/column_view.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 namespace nvtext {
 
 /**
@@ -65,7 +67,7 @@ struct hashed_vocabulary {
  */
 std::unique_ptr<hashed_vocabulary> load_vocabulary_file(
   std::string const& filename_hashed_vocabulary,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Result object for the subword_tokenize functions.
@@ -155,7 +157,7 @@ tokenizer_result subword_tokenize(
   uint32_t stride,
   bool do_lower_case,
   bool do_truncate,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
 }  // namespace nvtext
diff --git a/cpp/include/nvtext/tokenize.hpp b/cpp/include/nvtext/tokenize.hpp
index 107fefcc3bf..ea1b9c716f0 100644
--- a/cpp/include/nvtext/tokenize.hpp
+++ b/cpp/include/nvtext/tokenize.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,6 +19,8 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 namespace nvtext {
 /**
  * @addtogroup nvtext_tokenize
@@ -60,7 +62,7 @@ std::unique_ptr<cudf::column> tokenize(
   cudf::strings_column_view const& input,
   cudf::string_scalar const& delimiter = cudf::string_scalar{""},
   rmm::cuda_stream_view stream         = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr  = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr    = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns a single column of strings by tokenizing the input strings
@@ -95,8 +97,8 @@ std::unique_ptr<cudf::column> tokenize(
 std::unique_ptr<cudf::column> tokenize(
   cudf::strings_column_view const& input,
   cudf::strings_column_view const& delimiters,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns the number of tokens in each string of a strings column.
@@ -127,7 +129,7 @@ std::unique_ptr<cudf::column> count_tokens(
   cudf::strings_column_view const& input,
   cudf::string_scalar const& delimiter = cudf::string_scalar{""},
   rmm::cuda_stream_view stream         = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr  = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr    = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns the number of tokens in each string of a strings column
@@ -158,8 +160,8 @@ std::unique_ptr<cudf::column> count_tokens(
 std::unique_ptr<cudf::column> count_tokens(
   cudf::strings_column_view const& input,
   cudf::strings_column_view const& delimiters,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns a single column of strings by converting each character to a string.
@@ -183,8 +185,8 @@ std::unique_ptr<cudf::column> count_tokens(
  */
 std::unique_ptr<cudf::column> character_tokenize(
   cudf::strings_column_view const& input,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Creates a strings column from a strings column of tokens and an
@@ -225,7 +227,7 @@ std::unique_ptr<cudf::column> detokenize(
   cudf::column_view const& row_indices,
   cudf::string_scalar const& separator = cudf::string_scalar(" "),
   rmm::cuda_stream_view stream         = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr  = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr    = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Vocabulary object to be used with nvtext::tokenize_with_vocabulary
@@ -246,8 +248,8 @@ struct tokenize_vocabulary {
    * @param mr Device memory resource used to allocate the returned column's device memory
    */
   tokenize_vocabulary(cudf::strings_column_view const& input,
-                      rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-                      rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+                      rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+                      rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
   ~tokenize_vocabulary();
 
   struct tokenize_vocabulary_impl;
@@ -269,8 +271,8 @@ struct tokenize_vocabulary {
  */
 std::unique_ptr<tokenize_vocabulary> load_vocabulary(
   cudf::strings_column_view const& input,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns the token ids for the input string by looking up each delimited
@@ -301,9 +303,9 @@ std::unique_ptr<cudf::column> tokenize_with_vocabulary(
   cudf::strings_column_view const& input,
   tokenize_vocabulary const& vocabulary,
   cudf::string_scalar const& delimiter,
-  cudf::size_type default_id          = -1,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  cudf::size_type default_id        = -1,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of tokenize group
 }  // namespace nvtext
diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp
index be91c3b4d08..e39a2bb3ae8 100644
--- a/cpp/src/binaryop/binaryop.cpp
+++ b/cpp/src/binaryop/binaryop.cpp
@@ -39,6 +39,7 @@
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/optional.h>
 
@@ -56,7 +57,7 @@ std::pair<rmm::device_buffer, size_type> scalar_col_valid_mask_and(
   column_view const& col,
   scalar const& s,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   if (col.is_empty()) return std::pair(rmm::device_buffer{0, stream, mr}, 0);
 
@@ -179,7 +180,7 @@ void fixed_point_binary_operation_validation(binary_operator op,
 
 /**
  * @copydoc cudf::binary_operation(column_view const&, column_view const&,
- * binary_operator, data_type, rmm::mr::device_memory_resource*)
+ * binary_operator, data_type, rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
@@ -189,7 +190,7 @@ std::unique_ptr<column> binary_operation(LhsType const& lhs,
                                          binary_operator op,
                                          data_type output_type,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr)
+                                         rmm::device_async_resource_ref mr)
 {
   if constexpr (std::is_same_v<LhsType, column_view> and std::is_same_v<RhsType, column_view>)
     CUDF_EXPECTS(lhs.size() == rhs.size(), "Column sizes don't match");
@@ -250,7 +251,7 @@ std::unique_ptr<column> make_fixed_width_column_for_output(scalar const& lhs,
                                                            binary_operator op,
                                                            data_type output_type,
                                                            rmm::cuda_stream_view stream,
-                                                           rmm::mr::device_memory_resource* mr)
+                                                           rmm::device_async_resource_ref mr)
 {
   if (binops::is_null_dependent(op)) {
     return make_fixed_width_column(output_type, rhs.size(), mask_state::ALL_VALID, stream, mr);
@@ -277,7 +278,7 @@ std::unique_ptr<column> make_fixed_width_column_for_output(column_view const& lh
                                                            binary_operator op,
                                                            data_type output_type,
                                                            rmm::cuda_stream_view stream,
-                                                           rmm::mr::device_memory_resource* mr)
+                                                           rmm::device_async_resource_ref mr)
 {
   if (binops::is_null_dependent(op)) {
     return make_fixed_width_column(output_type, lhs.size(), mask_state::ALL_VALID, stream, mr);
@@ -304,7 +305,7 @@ std::unique_ptr<column> make_fixed_width_column_for_output(column_view const& lh
                                                            binary_operator op,
                                                            data_type output_type,
                                                            rmm::cuda_stream_view stream,
-                                                           rmm::mr::device_memory_resource* mr)
+                                                           rmm::device_async_resource_ref mr)
 {
   if (binops::is_null_dependent(op)) {
     return make_fixed_width_column(output_type, rhs.size(), mask_state::ALL_VALID, stream, mr);
@@ -320,7 +321,7 @@ std::unique_ptr<column> binary_operation(scalar const& lhs,
                                          binary_operator op,
                                          data_type output_type,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr)
+                                         rmm::device_async_resource_ref mr)
 {
   return binops::compiled::binary_operation<scalar, column_view>(
     lhs, rhs, op, output_type, stream, mr);
@@ -330,7 +331,7 @@ std::unique_ptr<column> binary_operation(column_view const& lhs,
                                          binary_operator op,
                                          data_type output_type,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr)
+                                         rmm::device_async_resource_ref mr)
 {
   return binops::compiled::binary_operation<column_view, scalar>(
     lhs, rhs, op, output_type, stream, mr);
@@ -340,7 +341,7 @@ std::unique_ptr<column> binary_operation(column_view const& lhs,
                                          binary_operator op,
                                          data_type output_type,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr)
+                                         rmm::device_async_resource_ref mr)
 {
   return binops::compiled::binary_operation<column_view, column_view>(
     lhs, rhs, op, output_type, stream, mr);
@@ -351,7 +352,7 @@ std::unique_ptr<column> binary_operation(column_view const& lhs,
                                          std::string const& ptx,
                                          data_type output_type,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr)
+                                         rmm::device_async_resource_ref mr)
 {
   // Check for datatype
   auto is_type_supported_ptx = [](data_type type) -> bool {
@@ -405,7 +406,7 @@ std::unique_ptr<column> binary_operation(scalar const& lhs,
                                          binary_operator op,
                                          data_type output_type,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr)
+                                         rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::binary_operation(lhs, rhs, op, output_type, stream, mr);
@@ -415,7 +416,7 @@ std::unique_ptr<column> binary_operation(column_view const& lhs,
                                          binary_operator op,
                                          data_type output_type,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr)
+                                         rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::binary_operation(lhs, rhs, op, output_type, stream, mr);
@@ -425,7 +426,7 @@ std::unique_ptr<column> binary_operation(column_view const& lhs,
                                          binary_operator op,
                                          data_type output_type,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr)
+                                         rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::binary_operation(lhs, rhs, op, output_type, stream, mr);
@@ -436,7 +437,7 @@ std::unique_ptr<column> binary_operation(column_view const& lhs,
                                          std::string const& ptx,
                                          data_type output_type,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr)
+                                         rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::binary_operation(lhs, rhs, ptx, output_type, stream, mr);
diff --git a/cpp/src/binaryop/compiled/binary_ops.cu b/cpp/src/binaryop/compiled/binary_ops.cu
index 1429635b803..d3257fadb1d 100644
--- a/cpp/src/binaryop/compiled/binary_ops.cu
+++ b/cpp/src/binaryop/compiled/binary_ops.cu
@@ -27,6 +27,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/functional.h>
@@ -50,7 +51,7 @@ struct scalar_as_column_view {
   template <typename T, CUDF_ENABLE_IF(is_fixed_width<T>())>
   return_type operator()(scalar const& s,
                          rmm::cuda_stream_view stream,
-                         rmm::mr::device_memory_resource*)
+                         rmm::device_async_resource_ref)
   {
     auto& h_scalar_type_view = static_cast<cudf::scalar_type_t<T>&>(const_cast<scalar&>(s));
     auto col_v               = column_view(s.type(),
@@ -61,7 +62,7 @@ struct scalar_as_column_view {
     return std::pair{col_v, std::unique_ptr<column>(nullptr)};
   }
   template <typename T, CUDF_ENABLE_IF(!is_fixed_width<T>())>
-  return_type operator()(scalar const&, rmm::cuda_stream_view, rmm::mr::device_memory_resource*)
+  return_type operator()(scalar const&, rmm::cuda_stream_view, rmm::device_async_resource_ref)
   {
     CUDF_FAIL("Unsupported type");
   }
@@ -69,7 +70,7 @@ struct scalar_as_column_view {
 // specialization for cudf::string_view
 template <>
 scalar_as_column_view::return_type scalar_as_column_view::operator()<cudf::string_view>(
-  scalar const& s, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr)
+  scalar const& s, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)
 {
   using T                  = cudf::string_view;
   auto& h_scalar_type_view = static_cast<cudf::scalar_type_t<T>&>(const_cast<scalar&>(s));
@@ -96,7 +97,7 @@ scalar_as_column_view::return_type scalar_as_column_view::operator()<cudf::strin
 // specializing for struct column
 template <>
 scalar_as_column_view::return_type scalar_as_column_view::operator()<cudf::struct_view>(
-  scalar const& s, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr)
+  scalar const& s, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)
 {
   auto col = make_column_from_scalar(s, 1, stream, mr);
   return std::pair{col->view(), std::move(col)};
@@ -114,7 +115,7 @@ scalar_as_column_view::return_type scalar_as_column_view::operator()<cudf::struc
 auto scalar_to_column_view(
   scalar const& scal,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
 {
   return type_dispatcher(scal.type(), scalar_as_column_view{}, scal, stream, mr);
 }
@@ -216,7 +217,7 @@ struct null_considering_binop {
                                      data_type output_type,
                                      cudf::size_type col_size,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr) const
+                                     rmm::device_async_resource_ref mr) const
   {
     // Create device views for inputs
     auto const lhs_dev_view = get_device_view(lhs);
@@ -263,7 +264,7 @@ std::unique_ptr<column> string_null_min_max(scalar const& lhs,
                                             binary_operator op,
                                             data_type output_type,
                                             rmm::cuda_stream_view stream,
-                                            rmm::mr::device_memory_resource* mr)
+                                            rmm::device_async_resource_ref mr)
 {
   // hard-coded to only work with cudf::string_view so we don't explode compile times
   CUDF_EXPECTS(lhs.type().id() == cudf::type_id::STRING, "Invalid/Unsupported lhs datatype");
@@ -280,7 +281,7 @@ std::unique_ptr<column> string_null_min_max(column_view const& lhs,
                                             binary_operator op,
                                             data_type output_type,
                                             rmm::cuda_stream_view stream,
-                                            rmm::mr::device_memory_resource* mr)
+                                            rmm::device_async_resource_ref mr)
 {
   // hard-coded to only work with cudf::string_view so we don't explode compile times
   CUDF_EXPECTS(lhs.type().id() == cudf::type_id::STRING, "Invalid/Unsupported lhs datatype");
@@ -297,7 +298,7 @@ std::unique_ptr<column> string_null_min_max(column_view const& lhs,
                                             binary_operator op,
                                             data_type output_type,
                                             rmm::cuda_stream_view stream,
-                                            rmm::mr::device_memory_resource* mr)
+                                            rmm::device_async_resource_ref mr)
 {
   // hard-coded to only work with cudf::string_view so we don't explode compile times
   CUDF_EXPECTS(lhs.type().id() == cudf::type_id::STRING, "Invalid/Unsupported lhs datatype");
diff --git a/cpp/src/binaryop/compiled/binary_ops.hpp b/cpp/src/binaryop/compiled/binary_ops.hpp
index 47fd50c5d97..c7eb08cd133 100644
--- a/cpp/src/binaryop/compiled/binary_ops.hpp
+++ b/cpp/src/binaryop/compiled/binary_ops.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <optional>
 
@@ -37,21 +38,21 @@ std::unique_ptr<column> string_null_min_max(scalar const& lhs,
                                             binary_operator op,
                                             data_type output_type,
                                             rmm::cuda_stream_view stream,
-                                            rmm::mr::device_memory_resource* mr);
+                                            rmm::device_async_resource_ref mr);
 
 std::unique_ptr<column> string_null_min_max(column_view const& lhs,
                                             scalar const& rhs,
                                             binary_operator op,
                                             data_type output_type,
                                             rmm::cuda_stream_view stream,
-                                            rmm::mr::device_memory_resource* mr);
+                                            rmm::device_async_resource_ref mr);
 
 std::unique_ptr<column> string_null_min_max(column_view const& lhs,
                                             column_view const& rhs,
                                             binary_operator op,
                                             data_type output_type,
                                             rmm::cuda_stream_view stream,
-                                            rmm::mr::device_memory_resource* mr);
+                                            rmm::device_async_resource_ref mr);
 
 /**
  * @brief Performs a binary operation between a string scalar and a string
@@ -77,7 +78,7 @@ std::unique_ptr<column> binary_operation(scalar const& lhs,
                                          binary_operator op,
                                          data_type output_type,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr);
+                                         rmm::device_async_resource_ref mr);
 
 /**
  * @brief Performs a binary operation between a string column and a string
@@ -103,7 +104,7 @@ std::unique_ptr<column> binary_operation(column_view const& lhs,
                                          binary_operator op,
                                          data_type output_type,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr);
+                                         rmm::device_async_resource_ref mr);
 
 /**
  * @brief Performs a binary operation between two string columns.
@@ -128,7 +129,7 @@ std::unique_ptr<column> binary_operation(column_view const& lhs,
                                          binary_operator op,
                                          data_type output_type,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr);
+                                         rmm::device_async_resource_ref mr);
 
 void binary_operation(mutable_column_view& out,
                       scalar const& lhs,
diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu
index 806beeb4efe..4da2e502ce6 100644
--- a/cpp/src/bitmask/null_mask.cu
+++ b/cpp/src/bitmask/null_mask.cu
@@ -33,6 +33,7 @@
 #include <rmm/device_buffer.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cub/cub.cuh>
 #include <thrust/binary_search.h>
@@ -79,7 +80,7 @@ namespace detail {
 rmm::device_buffer create_null_mask(size_type size,
                                     mask_state state,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr)
+                                    rmm::device_async_resource_ref mr)
 {
   size_type mask_size{0};
 
@@ -157,7 +158,7 @@ void set_null_mask(bitmask_type* bitmask,
 rmm::device_buffer create_null_mask(size_type size,
                                     mask_state state,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr)
+                                    rmm::device_async_resource_ref mr)
 {
   return detail::create_null_mask(size, state, stream, mr);
 }
@@ -211,7 +212,7 @@ rmm::device_buffer copy_bitmask(bitmask_type const* mask,
                                 size_type begin_bit,
                                 size_type end_bit,
                                 rmm::cuda_stream_view stream,
-                                rmm::mr::device_memory_resource* mr)
+                                rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   CUDF_EXPECTS(begin_bit >= 0, "Invalid range.");
@@ -235,7 +236,7 @@ rmm::device_buffer copy_bitmask(bitmask_type const* mask,
 // Create a bitmask from a column view
 rmm::device_buffer copy_bitmask(column_view const& view,
                                 rmm::cuda_stream_view stream,
-                                rmm::mr::device_memory_resource* mr)
+                                rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   rmm::device_buffer null_mask{0, stream, mr};
@@ -432,7 +433,7 @@ std::pair<rmm::device_buffer, size_type> bitmask_and(host_span<bitmask_type cons
                                                      host_span<size_type const> begin_bits,
                                                      size_type mask_size,
                                                      rmm::cuda_stream_view stream,
-                                                     rmm::mr::device_memory_resource* mr)
+                                                     rmm::device_async_resource_ref mr)
 {
   return bitmask_binop(
     [] __device__(bitmask_type left, bitmask_type right) { return left & right; },
@@ -446,7 +447,7 @@ std::pair<rmm::device_buffer, size_type> bitmask_and(host_span<bitmask_type cons
 // Returns the bitwise AND of the null masks of all columns in the table view
 std::pair<rmm::device_buffer, size_type> bitmask_and(table_view const& view,
                                                      rmm::cuda_stream_view stream,
-                                                     rmm::mr::device_memory_resource* mr)
+                                                     rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   rmm::device_buffer null_mask{0, stream, mr};
@@ -479,7 +480,7 @@ std::pair<rmm::device_buffer, size_type> bitmask_and(table_view const& view,
 // Returns the bitwise OR of the null masks of all columns in the table view
 std::pair<rmm::device_buffer, size_type> bitmask_or(table_view const& view,
                                                     rmm::cuda_stream_view stream,
-                                                    rmm::mr::device_memory_resource* mr)
+                                                    rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   rmm::device_buffer null_mask{0, stream, mr};
@@ -512,7 +513,7 @@ std::pair<rmm::device_buffer, size_type> bitmask_or(table_view const& view,
 void set_all_valid_null_masks(column_view const& input,
                               column& output,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr)
+                              rmm::device_async_resource_ref mr)
 {
   if (input.nullable()) {
     auto mask = detail::create_null_mask(output.size(), mask_state::ALL_VALID, stream, mr);
@@ -531,7 +532,7 @@ rmm::device_buffer copy_bitmask(bitmask_type const* mask,
                                 size_type begin_bit,
                                 size_type end_bit,
                                 rmm::cuda_stream_view stream,
-                                rmm::mr::device_memory_resource* mr)
+                                rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::copy_bitmask(mask, begin_bit, end_bit, stream, mr);
@@ -540,7 +541,7 @@ rmm::device_buffer copy_bitmask(bitmask_type const* mask,
 // Create a bitmask from a column view
 rmm::device_buffer copy_bitmask(column_view const& view,
                                 rmm::cuda_stream_view stream,
-                                rmm::mr::device_memory_resource* mr)
+                                rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::copy_bitmask(view, stream, mr);
@@ -548,7 +549,7 @@ rmm::device_buffer copy_bitmask(column_view const& view,
 
 std::pair<rmm::device_buffer, size_type> bitmask_and(table_view const& view,
                                                      rmm::cuda_stream_view stream,
-                                                     rmm::mr::device_memory_resource* mr)
+                                                     rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::bitmask_and(view, stream, mr);
@@ -556,7 +557,7 @@ std::pair<rmm::device_buffer, size_type> bitmask_and(table_view const& view,
 
 std::pair<rmm::device_buffer, size_type> bitmask_or(table_view const& view,
                                                     rmm::cuda_stream_view stream,
-                                                    rmm::mr::device_memory_resource* mr)
+                                                    rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::bitmask_or(view, stream, mr);
diff --git a/cpp/src/column/column.cu b/cpp/src/column/column.cu
index d4a8fff69e2..90f719b9516 100644
--- a/cpp/src/column/column.cu
+++ b/cpp/src/column/column.cu
@@ -35,6 +35,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/transform_iterator.h>
 
@@ -46,9 +47,7 @@
 namespace cudf {
 
 // Copy ctor w/ optional stream/mr
-column::column(column const& other,
-               rmm::cuda_stream_view stream,
-               rmm::mr::device_memory_resource* mr)
+column::column(column const& other, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)
   : _type{other._type},
     _size{other._size},
     _data{other._data, stream, mr},
@@ -160,7 +159,7 @@ namespace {
 struct create_column_from_view {
   cudf::column_view view;
   rmm::cuda_stream_view stream{cudf::get_default_stream()};
-  rmm::mr::device_memory_resource* mr;
+  rmm::device_async_resource_ref mr;
 
   template <typename ColumnType,
             std::enable_if_t<std::is_same_v<ColumnType, cudf::string_view>>* = nullptr>
@@ -254,7 +253,7 @@ struct create_column_from_view {
 }  // anonymous namespace
 
 // Copy from a view
-column::column(column_view view, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr)
+column::column(column_view view, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)
   :  // Move is needed here because the dereference operator of unique_ptr returns
      // an lvalue reference, which would otherwise dispatch to the copy constructor
     column{std::move(*type_dispatcher(view.type(), create_column_from_view{view, stream, mr}))}
diff --git a/cpp/src/column/column_factories.cpp b/cpp/src/column/column_factories.cpp
index d8da6a95aa4..e40056fc8a1 100644
--- a/cpp/src/column/column_factories.cpp
+++ b/cpp/src/column/column_factories.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -25,6 +25,8 @@
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/traits.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 #include <thrust/iterator/constant_iterator.h>
 
 namespace cudf {
@@ -75,7 +77,7 @@ std::unique_ptr<column> make_numeric_column(data_type type,
                                             size_type size,
                                             mask_state state,
                                             rmm::cuda_stream_view stream,
-                                            rmm::mr::device_memory_resource* mr)
+                                            rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   CUDF_EXPECTS(is_numeric(type), "Invalid, non-numeric type.");
@@ -95,7 +97,7 @@ std::unique_ptr<column> make_fixed_point_column(data_type type,
                                                 size_type size,
                                                 mask_state state,
                                                 rmm::cuda_stream_view stream,
-                                                rmm::mr::device_memory_resource* mr)
+                                                rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   CUDF_EXPECTS(is_fixed_point(type), "Invalid, non-fixed_point type.");
@@ -115,7 +117,7 @@ std::unique_ptr<column> make_timestamp_column(data_type type,
                                               size_type size,
                                               mask_state state,
                                               rmm::cuda_stream_view stream,
-                                              rmm::mr::device_memory_resource* mr)
+                                              rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   CUDF_EXPECTS(is_timestamp(type), "Invalid, non-timestamp type.");
@@ -135,7 +137,7 @@ std::unique_ptr<column> make_duration_column(data_type type,
                                              size_type size,
                                              mask_state state,
                                              rmm::cuda_stream_view stream,
-                                             rmm::mr::device_memory_resource* mr)
+                                             rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   CUDF_EXPECTS(is_duration(type), "Invalid, non-duration type.");
@@ -155,7 +157,7 @@ std::unique_ptr<column> make_fixed_width_column(data_type type,
                                                 size_type size,
                                                 mask_state state,
                                                 rmm::cuda_stream_view stream,
-                                                rmm::mr::device_memory_resource* mr)
+                                                rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   CUDF_EXPECTS(is_fixed_width(type), "Invalid, non-fixed-width type.");
@@ -171,7 +173,7 @@ std::unique_ptr<column> make_fixed_width_column(data_type type,
 std::unique_ptr<column> make_dictionary_from_scalar(scalar const& s,
                                                     size_type size,
                                                     rmm::cuda_stream_view stream,
-                                                    rmm::mr::device_memory_resource* mr)
+                                                    rmm::device_async_resource_ref mr)
 {
   if (size == 0) return make_empty_column(type_id::DICTIONARY32);
   CUDF_EXPECTS(size >= 0, "Column size cannot be negative.");
diff --git a/cpp/src/column/column_factories.cu b/cpp/src/column/column_factories.cu
index 0e65a131e67..bad20d6817c 100644
--- a/cpp/src/column/column_factories.cu
+++ b/cpp/src/column/column_factories.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,6 +22,8 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/detail/fill.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 #include <thrust/iterator/constant_iterator.h>
 
 namespace cudf {
@@ -33,7 +35,7 @@ struct column_from_scalar_dispatch {
   std::unique_ptr<cudf::column> operator()(scalar const& value,
                                            size_type size,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr) const
+                                           rmm::device_async_resource_ref mr) const
   {
     if (size == 0) return make_empty_column(value.type());
     if (!value.is_valid(stream))
@@ -51,7 +53,7 @@ std::unique_ptr<cudf::column> column_from_scalar_dispatch::operator()<cudf::stri
   scalar const& value,
   size_type size,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr) const
+  rmm::device_async_resource_ref mr) const
 {
   if (size == 0) return make_empty_column(value.type());
 
@@ -68,7 +70,7 @@ std::unique_ptr<cudf::column> column_from_scalar_dispatch::operator()<cudf::stri
 
 template <>
 std::unique_ptr<cudf::column> column_from_scalar_dispatch::operator()<cudf::dictionary32>(
-  scalar const&, size_type, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) const
+  scalar const&, size_type, rmm::cuda_stream_view, rmm::device_async_resource_ref) const
 {
   CUDF_FAIL("dictionary not supported when creating from scalar");
 }
@@ -78,7 +80,7 @@ std::unique_ptr<cudf::column> column_from_scalar_dispatch::operator()<cudf::list
   scalar const& value,
   size_type size,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr) const
+  rmm::device_async_resource_ref mr) const
 {
   auto lv = static_cast<list_scalar const*>(&value);
   return lists::detail::make_lists_column_from_scalar(*lv, size, stream, mr);
@@ -89,7 +91,7 @@ std::unique_ptr<cudf::column> column_from_scalar_dispatch::operator()<cudf::stru
   scalar const& value,
   size_type size,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr) const
+  rmm::device_async_resource_ref mr) const
 {
   if (size == 0) CUDF_FAIL("0-length struct column is unsupported.");
   auto& ss  = static_cast<scalar_type_t<cudf::struct_view> const&>(value);
@@ -113,7 +115,7 @@ std::unique_ptr<cudf::column> column_from_scalar_dispatch::operator()<cudf::stru
 std::unique_ptr<column> make_column_from_scalar(scalar const& s,
                                                 size_type size,
                                                 rmm::cuda_stream_view stream,
-                                                rmm::mr::device_memory_resource* mr)
+                                                rmm::device_async_resource_ref mr)
 {
   return type_dispatcher(s.type(), column_from_scalar_dispatch{}, s, size, stream, mr);
 }
diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu
index b1d850e0b27..7c57be8e7c0 100644
--- a/cpp/src/copying/concatenate.cu
+++ b/cpp/src/copying/concatenate.cu
@@ -33,6 +33,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/advance.h>
 #include <thrust/binary_search.h>
@@ -241,7 +242,7 @@ template <typename T>
 std::unique_ptr<column> fused_concatenate(host_span<column_view const> views,
                                           bool const has_nulls,
                                           rmm::cuda_stream_view stream,
-                                          rmm::mr::device_memory_resource* mr)
+                                          rmm::device_async_resource_ref mr)
 {
   using mask_policy = cudf::mask_allocation_policy;
 
@@ -288,7 +289,7 @@ template <typename T>
 std::unique_ptr<column> for_each_concatenate(host_span<column_view const> views,
                                              bool const has_nulls,
                                              rmm::cuda_stream_view stream,
-                                             rmm::mr::device_memory_resource* mr)
+                                             rmm::device_async_resource_ref mr)
 {
   size_type const total_element_count =
     std::accumulate(views.begin(), views.end(), 0, [](auto accumulator, auto const& v) {
@@ -321,7 +322,7 @@ std::unique_ptr<column> for_each_concatenate(host_span<column_view const> views,
 struct concatenate_dispatch {
   host_span<column_view const> views;
   rmm::cuda_stream_view stream;
-  rmm::mr::device_memory_resource* mr;
+  rmm::device_async_resource_ref mr;
 
   // fixed width
   template <typename T>
@@ -485,7 +486,7 @@ void bounds_and_type_check(host_span<column_view const> cols, rmm::cuda_stream_v
 // Concatenates the elements from a vector of column_views
 std::unique_ptr<column> concatenate(host_span<column_view const> columns_to_concat,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr)
+                                    rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(not columns_to_concat.empty(), "Unexpected empty list of columns to concatenate.");
 
@@ -504,7 +505,7 @@ std::unique_ptr<column> concatenate(host_span<column_view const> columns_to_conc
 
 std::unique_ptr<table> concatenate(host_span<table_view const> tables_to_concat,
                                    rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr)
+                                   rmm::device_async_resource_ref mr)
 {
   if (tables_to_concat.empty()) { return std::make_unique<table>(); }
 
@@ -533,7 +534,7 @@ std::unique_ptr<table> concatenate(host_span<table_view const> tables_to_concat,
 
 rmm::device_buffer concatenate_masks(host_span<column_view const> views,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
 {
   bool const has_nulls =
     std::any_of(views.begin(), views.end(), [](column_view const col) { return col.has_nulls(); });
@@ -558,7 +559,7 @@ rmm::device_buffer concatenate_masks(host_span<column_view const> views,
 
 rmm::device_buffer concatenate_masks(host_span<column_view const> views,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::concatenate_masks(views, stream, mr);
@@ -567,7 +568,7 @@ rmm::device_buffer concatenate_masks(host_span<column_view const> views,
 // Concatenates the elements from a vector of column_views
 std::unique_ptr<column> concatenate(host_span<column_view const> columns_to_concat,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr)
+                                    rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::concatenate(columns_to_concat, stream, mr);
@@ -575,7 +576,7 @@ std::unique_ptr<column> concatenate(host_span<column_view const> columns_to_conc
 
 std::unique_ptr<table> concatenate(host_span<table_view const> tables_to_concat,
                                    rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr)
+                                   rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::concatenate(tables_to_concat, stream, mr);
diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu
index 23bcd344a32..37db2c74790 100644
--- a/cpp/src/copying/contiguous_split.cu
+++ b/cpp/src/copying/contiguous_split.cu
@@ -31,6 +31,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/binary_search.h>
@@ -48,6 +49,7 @@
 
 #include <cstddef>
 #include <numeric>
+#include <optional>
 #include <stdexcept>
 
 namespace cudf {
@@ -988,7 +990,7 @@ struct packed_split_indices_and_src_buf_info {
                                         std::size_t num_partitions,
                                         cudf::size_type num_src_bufs,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* temp_mr)
+                                        rmm::device_async_resource_ref temp_mr)
     : indices_size(
         cudf::util::round_up_safe((num_partitions + 1) * sizeof(size_type), split_align)),
       src_buf_info_size(
@@ -1046,7 +1048,7 @@ struct packed_partition_buf_size_and_dst_buf_info {
   packed_partition_buf_size_and_dst_buf_info(std::size_t num_partitions,
                                              std::size_t num_bufs,
                                              rmm::cuda_stream_view stream,
-                                             rmm::mr::device_memory_resource* temp_mr)
+                                             rmm::device_async_resource_ref temp_mr)
     : stream(stream),
       buf_sizes_size{cudf::util::round_up_safe(num_partitions * sizeof(std::size_t), split_align)},
       dst_buf_info_size{cudf::util::round_up_safe(num_bufs * sizeof(dst_buf_info), split_align)},
@@ -1097,7 +1099,7 @@ struct packed_src_and_dst_pointers {
                               std::size_t num_partitions,
                               cudf::size_type num_src_bufs,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* temp_mr)
+                              rmm::device_async_resource_ref temp_mr)
     : stream(stream),
       src_bufs_size{cudf::util::round_up_safe(num_src_bufs * sizeof(uint8_t*), split_align)},
       dst_bufs_size{cudf::util::round_up_safe(num_partitions * sizeof(uint8_t*), split_align)},
@@ -1158,7 +1160,7 @@ std::unique_ptr<packed_src_and_dst_pointers> setup_src_and_dst_pointers(
   cudf::size_type num_src_bufs,
   std::vector<rmm::device_buffer>& out_buffers,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* temp_mr)
+  rmm::device_async_resource_ref temp_mr)
 {
   auto src_and_dst_pointers = std::make_unique<packed_src_and_dst_pointers>(
     input, num_partitions, num_src_bufs, stream, temp_mr);
@@ -1195,7 +1197,7 @@ std::unique_ptr<packed_partition_buf_size_and_dst_buf_info> compute_splits(
   cudf::size_type num_src_bufs,
   std::size_t num_bufs,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* temp_mr)
+  rmm::device_async_resource_ref temp_mr)
 {
   auto partition_buf_size_and_dst_buf_info =
     std::make_unique<packed_partition_buf_size_and_dst_buf_info>(
@@ -1366,7 +1368,7 @@ struct chunk_iteration_state {
     std::size_t num_partitions,
     std::size_t user_buffer_size,
     rmm::cuda_stream_view stream,
-    rmm::mr::device_memory_resource* temp_mr);
+    rmm::device_async_resource_ref temp_mr);
 
   /**
    * @brief As of the time of the call, return the starting 1MB batch index, and the
@@ -1426,7 +1428,7 @@ std::unique_ptr<chunk_iteration_state> chunk_iteration_state::create(
   std::size_t num_partitions,
   std::size_t user_buffer_size,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* temp_mr)
+  rmm::device_async_resource_ref temp_mr)
 {
   rmm::device_uvector<size_type> d_batch_offsets(num_bufs + 1, stream, temp_mr);
 
@@ -1646,7 +1648,7 @@ std::unique_ptr<chunk_iteration_state> compute_batches(int num_bufs,
                                                        std::size_t num_partitions,
                                                        std::size_t user_buffer_size,
                                                        rmm::cuda_stream_view stream,
-                                                       rmm::mr::device_memory_resource* temp_mr)
+                                                       rmm::device_async_resource_ref temp_mr)
 {
   // Since we parallelize at one block per copy, performance is vulnerable to situations where we
   // have small numbers of copies to do (a combination of small numbers of splits and/or columns),
@@ -1769,8 +1771,8 @@ struct contiguous_split_state {
   contiguous_split_state(cudf::table_view const& input,
                          std::size_t user_buffer_size,
                          rmm::cuda_stream_view stream,
-                         rmm::mr::device_memory_resource* mr,
-                         rmm::mr::device_memory_resource* temp_mr)
+                         std::optional<rmm::device_async_resource_ref> mr,
+                         rmm::device_async_resource_ref temp_mr)
     : contiguous_split_state(input, {}, user_buffer_size, stream, mr, temp_mr)
   {
   }
@@ -1778,8 +1780,8 @@ struct contiguous_split_state {
   contiguous_split_state(cudf::table_view const& input,
                          std::vector<size_type> const& splits,
                          rmm::cuda_stream_view stream,
-                         rmm::mr::device_memory_resource* mr,
-                         rmm::mr::device_memory_resource* temp_mr)
+                         std::optional<rmm::device_async_resource_ref> mr,
+                         rmm::device_async_resource_ref temp_mr)
     : contiguous_split_state(input, splits, 0, stream, mr, temp_mr)
   {
   }
@@ -1897,8 +1899,8 @@ struct contiguous_split_state {
                          std::vector<size_type> const& splits,
                          std::size_t user_buffer_size,
                          rmm::cuda_stream_view stream,
-                         rmm::mr::device_memory_resource* mr,
-                         rmm::mr::device_memory_resource* temp_mr)
+                         std::optional<rmm::device_async_resource_ref> mr,
+                         rmm::device_async_resource_ref temp_mr)
     : input(input),
       user_buffer_size(user_buffer_size),
       stream(stream),
@@ -1936,7 +1938,8 @@ struct contiguous_split_state {
       std::transform(h_buf_sizes,
                      h_buf_sizes + num_partitions,
                      std::back_inserter(out_buffers),
-                     [stream = stream, mr = mr](std::size_t bytes) {
+                     [stream = stream,
+                      mr = mr.value_or(rmm::mr::get_current_device_resource())](std::size_t bytes) {
                        return rmm::device_buffer{bytes, stream, mr};
                      });
     }
@@ -2014,11 +2017,11 @@ struct contiguous_split_state {
   cudf::table_view const input;        ///< The input table_view to operate on
   std::size_t const user_buffer_size;  ///< The size of the user buffer for the chunked_pack case
   rmm::cuda_stream_view const stream;
-  rmm::mr::device_memory_resource* const mr;  ///< The memory resource for any data returned
+  std::optional<rmm::device_async_resource_ref const> mr;  ///< The resource for any data returned
 
   // this resource defaults to `mr` for the contiguous_split case, but it can be useful for the
   // `chunked_pack` case to allocate scratch/temp memory in a pool
-  rmm::mr::device_memory_resource* const temp_mr;  ///< The memory resource for scratch/temp space
+  rmm::device_async_resource_ref const temp_mr;  ///< The memory resource for scratch/temp space
 
   // whether the table was empty to begin with (0 rows or 0 columns) and should be metadata-only
   bool const is_empty;  ///< True if the source table has 0 rows or 0 columns
@@ -2062,7 +2065,7 @@ struct contiguous_split_state {
 std::vector<packed_table> contiguous_split(cudf::table_view const& input,
                                            std::vector<size_type> const& splits,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
 {
   // `temp_mr` is the same as `mr` for contiguous_split as it allocates all
   // of its memory from the default memory resource in cuDF
@@ -2075,7 +2078,7 @@ std::vector<packed_table> contiguous_split(cudf::table_view const& input,
 
 std::vector<packed_table> contiguous_split(cudf::table_view const& input,
                                            std::vector<size_type> const& splits,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::contiguous_split(input, splits, cudf::get_default_stream(), mr);
@@ -2083,14 +2086,14 @@ std::vector<packed_table> contiguous_split(cudf::table_view const& input,
 
 chunked_pack::chunked_pack(cudf::table_view const& input,
                            std::size_t user_buffer_size,
-                           rmm::mr::device_memory_resource* temp_mr)
+                           rmm::device_async_resource_ref temp_mr)
 {
   CUDF_EXPECTS(user_buffer_size >= desired_batch_size,
                "The output buffer size must be at least 1MB in size");
-  // We pass `nullptr` for the first `mr` in `contiguous_split_state` to indicate
+  // We pass `std::nullopt` for the first `mr` in `contiguous_split_state` to indicate
   // that it does not allocate any user-bound data for the `chunked_pack` case.
   state = std::make_unique<detail::contiguous_split_state>(
-    input, user_buffer_size, cudf::get_default_stream(), nullptr, temp_mr);
+    input, user_buffer_size, cudf::get_default_stream(), std::nullopt, temp_mr);
 }
 
 // required for the unique_ptr to work with a incomplete type (contiguous_split_state)
@@ -2115,7 +2118,7 @@ std::unique_ptr<std::vector<uint8_t>> chunked_pack::build_metadata() const
 
 std::unique_ptr<chunked_pack> chunked_pack::create(cudf::table_view const& input,
                                                    std::size_t user_buffer_size,
-                                                   rmm::mr::device_memory_resource* temp_mr)
+                                                   rmm::device_async_resource_ref temp_mr)
 {
   return std::make_unique<chunked_pack>(input, user_buffer_size, temp_mr);
 }
diff --git a/cpp/src/copying/copy.cpp b/cpp/src/copying/copy.cpp
index cb7d507de81..98ee6aa8f68 100644
--- a/cpp/src/copying/copy.cpp
+++ b/cpp/src/copying/copy.cpp
@@ -26,6 +26,7 @@
 #include <cudf/utilities/traits.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/transform_iterator.h>
 
@@ -119,7 +120,7 @@ std::unique_ptr<column> allocate_like(column_view const& input,
                                       size_type size,
                                       mask_allocation_policy mask_alloc,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr)
+                                      rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   CUDF_EXPECTS(
@@ -177,7 +178,7 @@ std::unique_ptr<table> empty_like(table_view const& input_table)
 std::unique_ptr<column> allocate_like(column_view const& input,
                                       mask_allocation_policy mask_alloc,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr)
+                                      rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::allocate_like(input, input.size(), mask_alloc, stream, mr);
@@ -187,7 +188,7 @@ std::unique_ptr<column> allocate_like(column_view const& input,
                                       size_type size,
                                       mask_allocation_policy mask_alloc,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr)
+                                      rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::allocate_like(input, size, mask_alloc, stream, mr);
diff --git a/cpp/src/copying/copy.cu b/cpp/src/copying/copy.cu
index 8299c211fad..92fb2e61741 100644
--- a/cpp/src/copying/copy.cu
+++ b/cpp/src/copying/copy.cu
@@ -30,6 +30,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/copy.h>
 #include <thrust/distance.h>
@@ -77,7 +78,7 @@ struct copy_if_else_functor_impl<T, std::enable_if_t<is_rep_layout_compatible<T>
                                      bool right_nullable,
                                      Filter filter,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     auto p_lhs      = get_iterable_device_view{}(lhs_h, stream);
     auto p_rhs      = get_iterable_device_view{}(rhs_h, stream);
@@ -110,7 +111,7 @@ struct copy_if_else_functor_impl<string_view> {
                                      bool right_nullable,
                                      Filter filter,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     using T = string_view;
 
@@ -162,7 +163,7 @@ std::unique_ptr<column> scatter_gather_based_if_else(cudf::column_view const& lh
                                                      size_type size,
                                                      Filter is_left,
                                                      rmm::cuda_stream_view stream,
-                                                     rmm::mr::device_memory_resource* mr)
+                                                     rmm::device_async_resource_ref mr)
 {
   auto gather_map = rmm::device_uvector<size_type>{static_cast<std::size_t>(size), stream};
   auto const gather_map_end = thrust::copy_if(rmm::exec_policy(stream),
@@ -196,7 +197,7 @@ std::unique_ptr<column> scatter_gather_based_if_else(cudf::scalar const& lhs,
                                                      size_type size,
                                                      Filter is_left,
                                                      rmm::cuda_stream_view stream,
-                                                     rmm::mr::device_memory_resource* mr)
+                                                     rmm::device_async_resource_ref mr)
 {
   auto scatter_map = rmm::device_uvector<size_type>{static_cast<std::size_t>(size), stream};
   auto const scatter_map_end = thrust::copy_if(rmm::exec_policy(stream),
@@ -225,7 +226,7 @@ std::unique_ptr<column> scatter_gather_based_if_else(cudf::column_view const& lh
                                                      size_type size,
                                                      Filter is_left,
                                                      rmm::cuda_stream_view stream,
-                                                     rmm::mr::device_memory_resource* mr)
+                                                     rmm::device_async_resource_ref mr)
 {
   return scatter_gather_based_if_else(rhs, lhs, size, logical_not{is_left}, stream, mr);
 }
@@ -236,7 +237,7 @@ std::unique_ptr<column> scatter_gather_based_if_else(cudf::scalar const& lhs,
                                                      size_type size,
                                                      Filter is_left,
                                                      rmm::cuda_stream_view stream,
-                                                     rmm::mr::device_memory_resource* mr)
+                                                     rmm::device_async_resource_ref mr)
 {
   auto rhs_col = cudf::make_column_from_scalar(rhs, size, stream, mr);
   return scatter_gather_based_if_else(lhs, rhs_col->view(), size, is_left, stream, mr);
@@ -252,7 +253,7 @@ struct copy_if_else_functor_impl<struct_view> {
                                      bool,
                                      Filter filter,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     return scatter_gather_based_if_else(lhs, rhs, size, filter, stream, mr);
   }
@@ -268,7 +269,7 @@ struct copy_if_else_functor_impl<list_view> {
                                      bool,
                                      Filter filter,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     return scatter_gather_based_if_else(lhs, rhs, size, filter, stream, mr);
   }
@@ -284,7 +285,7 @@ struct copy_if_else_functor_impl<dictionary32> {
                                      bool,
                                      Filter filter,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     return scatter_gather_based_if_else(lhs, rhs, size, filter, stream, mr);
   }
@@ -303,7 +304,7 @@ struct copy_if_else_functor {
                                      bool right_nullable,
                                      Filter filter,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     copy_if_else_functor_impl<T> copier{};
     return copier(lhs, rhs, size, left_nullable, right_nullable, filter, stream, mr);
@@ -318,7 +319,7 @@ std::unique_ptr<column> copy_if_else(Left const& lhs,
                                      bool right_nullable,
                                      column_view const& boolean_mask,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(boolean_mask.type() == data_type(type_id::BOOL8),
                "Boolean mask column must be of type type_id::BOOL8",
@@ -356,7 +357,7 @@ std::unique_ptr<column> copy_if_else(column_view const& lhs,
                                      column_view const& rhs,
                                      column_view const& boolean_mask,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(boolean_mask.size() == lhs.size(),
                "Boolean mask column must be the same size as lhs and rhs columns",
@@ -372,7 +373,7 @@ std::unique_ptr<column> copy_if_else(scalar const& lhs,
                                      column_view const& rhs,
                                      column_view const& boolean_mask,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(boolean_mask.size() == rhs.size(),
                "Boolean mask column must be the same size as rhs column",
@@ -390,7 +391,7 @@ std::unique_ptr<column> copy_if_else(column_view const& lhs,
                                      scalar const& rhs,
                                      column_view const& boolean_mask,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(boolean_mask.size() == lhs.size(),
                "Boolean mask column must be the same size as lhs column",
@@ -408,7 +409,7 @@ std::unique_ptr<column> copy_if_else(scalar const& lhs,
                                      scalar const& rhs,
                                      column_view const& boolean_mask,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(
     lhs.type() == rhs.type(), "Both inputs must be of the same type", cudf::data_type_error);
@@ -422,7 +423,7 @@ std::unique_ptr<column> copy_if_else(column_view const& lhs,
                                      column_view const& rhs,
                                      column_view const& boolean_mask,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::copy_if_else(lhs, rhs, boolean_mask, stream, mr);
@@ -432,7 +433,7 @@ std::unique_ptr<column> copy_if_else(scalar const& lhs,
                                      column_view const& rhs,
                                      column_view const& boolean_mask,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::copy_if_else(lhs, rhs, boolean_mask, stream, mr);
@@ -442,7 +443,7 @@ std::unique_ptr<column> copy_if_else(column_view const& lhs,
                                      scalar const& rhs,
                                      column_view const& boolean_mask,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::copy_if_else(lhs, rhs, boolean_mask, stream, mr);
@@ -452,7 +453,7 @@ std::unique_ptr<column> copy_if_else(scalar const& lhs,
                                      scalar const& rhs,
                                      column_view const& boolean_mask,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::copy_if_else(lhs, rhs, boolean_mask, stream, mr);
diff --git a/cpp/src/copying/copy_range.cu b/cpp/src/copying/copy_range.cu
index e10d7081a55..d2ea7036952 100644
--- a/cpp/src/copying/copy_range.cu
+++ b/cpp/src/copying/copy_range.cu
@@ -34,6 +34,7 @@
 #include <cudf/utilities/traits.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/constant_iterator.h>
 
@@ -98,7 +99,7 @@ struct out_of_place_copy_range_dispatch {
     cudf::size_type source_end,
     cudf::size_type target_begin,
     rmm::cuda_stream_view stream,
-    rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
+    rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
   {
     auto p_ret = std::make_unique<cudf::column>(target, stream, mr);
     if ((!p_ret->nullable()) && source.has_nulls(source_begin, source_end)) {
@@ -129,7 +130,7 @@ std::unique_ptr<cudf::column> out_of_place_copy_range_dispatch::operator()<cudf:
   cudf::size_type source_end,
   cudf::size_type target_begin,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   return cudf::strings::detail::copy_range(
     source, target, source_begin, source_end, target_begin, stream, mr);
@@ -141,7 +142,7 @@ std::unique_ptr<cudf::column> out_of_place_copy_range_dispatch::operator()<cudf:
   cudf::size_type source_end,
   cudf::size_type target_begin,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   // check the keys in the source and target
   cudf::dictionary_column_view const dict_source(source);
@@ -231,7 +232,7 @@ std::unique_ptr<column> copy_range(column_view const& source,
                                    size_type source_end,
                                    size_type target_begin,
                                    rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr)
+                                   rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS((source_begin >= 0) && (source_end <= source.size()) &&
                  (source_begin <= source_end) && (target_begin >= 0) &&
@@ -270,7 +271,7 @@ std::unique_ptr<column> copy_range(column_view const& source,
                                    size_type source_end,
                                    size_type target_begin,
                                    rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr)
+                                   rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::copy_range(source, target, source_begin, source_end, target_begin, stream, mr);
diff --git a/cpp/src/copying/gather.cu b/cpp/src/copying/gather.cu
index 78748e5a00b..5eb039419df 100644
--- a/cpp/src/copying/gather.cu
+++ b/cpp/src/copying/gather.cu
@@ -25,6 +25,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/iterator/transform_iterator.h>
@@ -39,7 +40,7 @@ std::unique_ptr<table> gather(table_view const& source_table,
                               out_of_bounds_policy bounds_policy,
                               negative_index_policy neg_indices,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr)
+                              rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(not gather_map.has_nulls(), "gather_map contains nulls", std::invalid_argument);
 
@@ -66,7 +67,7 @@ std::unique_ptr<table> gather(table_view const& source_table,
                               out_of_bounds_policy bounds_policy,
                               negative_index_policy neg_indices,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr)
+                              rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(gather_map.size() <= static_cast<size_t>(std::numeric_limits<size_type>::max()),
                "gather map size exceeds the column size limit",
@@ -85,7 +86,7 @@ std::unique_ptr<table> gather(table_view const& source_table,
                               column_view const& gather_map,
                               out_of_bounds_policy bounds_policy,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr)
+                              rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
 
diff --git a/cpp/src/copying/get_element.cu b/cpp/src/copying/get_element.cu
index 2e804415439..b8860da479c 100644
--- a/cpp/src/copying/get_element.cu
+++ b/cpp/src/copying/get_element.cu
@@ -29,6 +29,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <stdexcept>
 
@@ -42,7 +43,7 @@ struct get_element_functor {
   std::unique_ptr<scalar> operator()(column_view const& input,
                                      size_type index,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     auto s = make_fixed_width_scalar(data_type(type_to_id<T>()), stream, mr);
 
@@ -65,7 +66,7 @@ struct get_element_functor {
   std::unique_ptr<scalar> operator()(column_view const& input,
                                      size_type index,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     auto device_col = column_device_view::create(input, stream);
 
@@ -89,7 +90,7 @@ struct get_element_functor {
   std::unique_ptr<scalar> operator()(column_view const& input,
                                      size_type index,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     auto dict_view    = dictionary_column_view(input);
     auto indices_iter = detail::indexalator_factory::make_input_iterator(dict_view.indices());
@@ -124,7 +125,7 @@ struct get_element_functor {
   std::unique_ptr<scalar> operator()(column_view const& input,
                                      size_type index,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     bool valid               = is_element_valid_sync(input, index, stream);
     auto const child_col_idx = lists_column_view::child_column_index;
@@ -148,7 +149,7 @@ struct get_element_functor {
   std::unique_ptr<scalar> operator()(column_view const& input,
                                      size_type index,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     using Type = typename T::rep;
 
@@ -178,7 +179,7 @@ struct get_element_functor {
   std::unique_ptr<scalar> operator()(column_view const& input,
                                      size_type index,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     bool valid = is_element_valid_sync(input, index, stream);
     auto row_contents =
@@ -193,7 +194,7 @@ struct get_element_functor {
 std::unique_ptr<scalar> get_element(column_view const& input,
                                     size_type index,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr)
+                                    rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(index >= 0 and index < input.size(), "Index out of bounds", std::out_of_range);
   return type_dispatcher(input.type(), get_element_functor{}, input, index, stream, mr);
@@ -204,7 +205,7 @@ std::unique_ptr<scalar> get_element(column_view const& input,
 std::unique_ptr<scalar> get_element(column_view const& input,
                                     size_type index,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr)
+                                    rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::get_element(input, index, stream, mr);
diff --git a/cpp/src/copying/pack.cpp b/cpp/src/copying/pack.cpp
index e4de4a43b68..b0208a58896 100644
--- a/cpp/src/copying/pack.cpp
+++ b/cpp/src/copying/pack.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace detail {
@@ -144,7 +145,7 @@ void build_column_metadata(metadata_builder& mb,
  */
 packed_columns pack(cudf::table_view const& input,
                     rmm::cuda_stream_view stream,
-                    rmm::mr::device_memory_resource* mr)
+                    rmm::device_async_resource_ref mr)
 {
   // do a contiguous_split with no splits to get the memory for the table
   // arranged as we want it
@@ -260,7 +261,7 @@ void metadata_builder::clear() { return impl->clear(); }
 /**
  * @copydoc cudf::pack
  */
-packed_columns pack(cudf::table_view const& input, rmm::mr::device_memory_resource* mr)
+packed_columns pack(cudf::table_view const& input, rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::pack(input, cudf::get_default_stream(), mr);
diff --git a/cpp/src/copying/purge_nonempty_nulls.cu b/cpp/src/copying/purge_nonempty_nulls.cu
index 620a03d8be5..d69d214a881 100644
--- a/cpp/src/copying/purge_nonempty_nulls.cu
+++ b/cpp/src/copying/purge_nonempty_nulls.cu
@@ -18,6 +18,8 @@
 #include <cudf/detail/offsets_iterator_factory.cuh>
 #include <cudf/utilities/default_stream.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 #include <thrust/count.h>
 #include <thrust/iterator/counting_iterator.h>
 
@@ -87,7 +89,7 @@ bool has_nonempty_nulls(cudf::column_view const& input, rmm::cuda_stream_view st
 
 std::unique_ptr<column> purge_nonempty_nulls(column_view const& input,
                                              rmm::cuda_stream_view stream,
-                                             rmm::mr::device_memory_resource* mr)
+                                             rmm::device_async_resource_ref mr)
 {
   // If not compound types (LIST/STRING/STRUCT/DICTIONARY) then just copy the input into output.
   if (!cudf::is_compound(input.type())) { return std::make_unique<column>(input, stream, mr); }
@@ -132,11 +134,11 @@ bool has_nonempty_nulls(column_view const& input, rmm::cuda_stream_view stream)
 }
 
 /**
- * @copydoc cudf::purge_nonempty_nulls(column_view const&, rmm::mr::device_memory_resource*)
+ * @copydoc cudf::purge_nonempty_nulls(column_view const&, rmm::device_async_resource_ref)
  */
 std::unique_ptr<cudf::column> purge_nonempty_nulls(column_view const& input,
                                                    rmm::cuda_stream_view stream,
-                                                   rmm::mr::device_memory_resource* mr)
+                                                   rmm::device_async_resource_ref mr)
 {
   return detail::purge_nonempty_nulls(input, stream, mr);
 }
diff --git a/cpp/src/copying/reverse.cu b/cpp/src/copying/reverse.cu
index 78d1b54882c..d3d42e35e26 100644
--- a/cpp/src/copying/reverse.cu
+++ b/cpp/src/copying/reverse.cu
@@ -26,6 +26,7 @@
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/iterator/constant_iterator.h>
@@ -37,7 +38,7 @@ namespace cudf {
 namespace detail {
 std::unique_ptr<table> reverse(table_view const& source_table,
                                rmm::cuda_stream_view stream,
-                               rmm::mr::device_memory_resource* mr)
+                               rmm::device_async_resource_ref mr)
 {
   size_type num_rows = source_table.num_rows();
   auto elements      = make_counting_transform_iterator(
@@ -51,7 +52,7 @@ std::unique_ptr<table> reverse(table_view const& source_table,
 
 std::unique_ptr<column> reverse(column_view const& source_column,
                                 rmm::cuda_stream_view stream,
-                                rmm::mr::device_memory_resource* mr)
+                                rmm::device_async_resource_ref mr)
 {
   return std::move(
     cudf::detail::reverse(table_view({source_column}), stream, mr)->release().front());
@@ -60,7 +61,7 @@ std::unique_ptr<column> reverse(column_view const& source_column,
 
 std::unique_ptr<table> reverse(table_view const& source_table,
                                rmm::cuda_stream_view stream,
-                               rmm::mr::device_memory_resource* mr)
+                               rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::reverse(source_table, stream, mr);
@@ -68,7 +69,7 @@ std::unique_ptr<table> reverse(table_view const& source_table,
 
 std::unique_ptr<column> reverse(column_view const& source_column,
                                 rmm::cuda_stream_view stream,
-                                rmm::mr::device_memory_resource* mr)
+                                rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::reverse(source_column, stream, mr);
diff --git a/cpp/src/copying/sample.cu b/cpp/src/copying/sample.cu
index 0211f97deb3..f8e3a9a83e3 100644
--- a/cpp/src/copying/sample.cu
+++ b/cpp/src/copying/sample.cu
@@ -25,6 +25,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/iterator/counting_iterator.h>
@@ -40,7 +41,7 @@ std::unique_ptr<table> sample(table_view const& input,
                               sample_with_replacement replacement,
                               int64_t const seed,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr)
+                              rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(n >= 0, "expected number of samples should be non-negative");
   auto const num_rows = input.num_rows();
@@ -92,7 +93,7 @@ std::unique_ptr<table> sample(table_view const& input,
                               sample_with_replacement replacement,
                               int64_t const seed,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr)
+                              rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::sample(input, n, replacement, seed, stream, mr);
diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu
index 3bc3979ec1b..cfcbe4724df 100644
--- a/cpp/src/copying/scatter.cu
+++ b/cpp/src/copying/scatter.cu
@@ -34,6 +34,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/count.h>
@@ -77,7 +78,7 @@ void scatter_scalar_bitmask_inplace(std::reference_wrapper<scalar const> const&
                                     size_type num_scatter_rows,
                                     column& target,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr)
+                                    rmm::device_async_resource_ref mr)
 {
   constexpr size_type block_size = 256;
   size_type const grid_size      = grid_1d(num_scatter_rows, block_size).num_blocks;
@@ -109,7 +110,7 @@ struct column_scalar_scatterer_impl {
                                      size_type scatter_rows,
                                      column_view const& target,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr) const
+                                     rmm::device_async_resource_ref mr) const
   {
     CUDF_EXPECTS(source.get().type() == target.type(),
                  "scalar and column types must match",
@@ -142,7 +143,7 @@ struct column_scalar_scatterer_impl<string_view, MapIterator> {
                                      size_type scatter_rows,
                                      column_view const& target,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr) const
+                                     rmm::device_async_resource_ref mr) const
   {
     CUDF_EXPECTS(source.get().type() == target.type(),
                  "scalar and column types must match",
@@ -166,7 +167,7 @@ struct column_scalar_scatterer_impl<list_view, MapIterator> {
                                      size_type scatter_rows,
                                      column_view const& target,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr) const
+                                     rmm::device_async_resource_ref mr) const
   {
     CUDF_EXPECTS(source.get().type() == target.type(),
                  "scalar and column types must match",
@@ -186,7 +187,7 @@ struct column_scalar_scatterer_impl<dictionary32, MapIterator> {
                                      size_type scatter_rows,
                                      column_view const& target,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr) const
+                                     rmm::device_async_resource_ref mr) const
   {
     auto dict_target =
       dictionary::detail::add_keys(dictionary_column_view(target),
@@ -238,7 +239,7 @@ struct column_scalar_scatterer {
                                      size_type scatter_rows,
                                      column_view const& target,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr) const
+                                     rmm::device_async_resource_ref mr) const
   {
     column_scalar_scatterer_impl<Element, MapIterator> scatterer{};
     return scatterer(source, scatter_iter, scatter_rows, target, stream, mr);
@@ -252,7 +253,7 @@ struct column_scalar_scatterer_impl<struct_view, MapIterator> {
                                      size_type scatter_rows,
                                      column_view const& target,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr) const
+                                     rmm::device_async_resource_ref mr) const
   {
     CUDF_EXPECTS(source.get().type() == target.type(),
                  "scalar and column types must match",
@@ -306,7 +307,7 @@ std::unique_ptr<table> scatter(table_view const& source,
                                column_view const& scatter_map,
                                table_view const& target,
                                rmm::cuda_stream_view stream,
-                               rmm::mr::device_memory_resource* mr)
+                               rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(source.num_columns() == target.num_columns(),
                "Number of columns in source and target not equal",
@@ -336,7 +337,7 @@ std::unique_ptr<table> scatter(table_view const& source,
                                device_span<size_type const> const scatter_map,
                                table_view const& target,
                                rmm::cuda_stream_view stream,
-                               rmm::mr::device_memory_resource* mr)
+                               rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(scatter_map.size() <= static_cast<size_t>(std::numeric_limits<size_type>::max()),
                "scatter map size exceeds the column size limit",
@@ -353,7 +354,7 @@ std::unique_ptr<table> scatter(std::vector<std::reference_wrapper<scalar const>>
                                column_view const& indices,
                                table_view const& target,
                                rmm::cuda_stream_view stream,
-                               rmm::mr::device_memory_resource* mr)
+                               rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(source.size() == static_cast<size_t>(target.num_columns()),
                "Number of scalars in source and number of columns in target not equal",
@@ -405,7 +406,7 @@ std::unique_ptr<column> boolean_mask_scatter(column_view const& input,
                                              column_view const& target,
                                              column_view const& boolean_mask,
                                              rmm::cuda_stream_view stream,
-                                             rmm::mr::device_memory_resource* mr)
+                                             rmm::device_async_resource_ref mr)
 {
   auto indices = cudf::make_numeric_column(
     data_type{type_id::INT32}, target.size(), mask_state::UNALLOCATED, stream);
@@ -430,7 +431,7 @@ std::unique_ptr<column> boolean_mask_scatter(scalar const& input,
                                              column_view const& target,
                                              column_view const& boolean_mask,
                                              rmm::cuda_stream_view stream,
-                                             rmm::mr::device_memory_resource* mr)
+                                             rmm::device_async_resource_ref mr)
 {
   return detail::copy_if_else(input, target, boolean_mask, stream, mr);
 }
@@ -439,7 +440,7 @@ std::unique_ptr<table> boolean_mask_scatter(table_view const& input,
                                             table_view const& target,
                                             column_view const& boolean_mask,
                                             rmm::cuda_stream_view stream,
-                                            rmm::mr::device_memory_resource* mr)
+                                            rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(input.num_columns() == target.num_columns(),
                "Mismatch in number of input columns and target columns",
@@ -482,7 +483,7 @@ std::unique_ptr<table> boolean_mask_scatter(
   table_view const& target,
   column_view const& boolean_mask,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(static_cast<size_type>(input.size()) == target.num_columns(),
                "Mismatch in number of scalars and target columns",
@@ -527,7 +528,7 @@ std::unique_ptr<table> scatter(table_view const& source,
                                column_view const& scatter_map,
                                table_view const& target,
                                rmm::cuda_stream_view stream,
-                               rmm::mr::device_memory_resource* mr)
+                               rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::scatter(source, scatter_map, target, stream, mr);
@@ -537,7 +538,7 @@ std::unique_ptr<table> scatter(std::vector<std::reference_wrapper<scalar const>>
                                column_view const& indices,
                                table_view const& target,
                                rmm::cuda_stream_view stream,
-                               rmm::mr::device_memory_resource* mr)
+                               rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::scatter(source, indices, target, stream, mr);
@@ -547,7 +548,7 @@ std::unique_ptr<table> boolean_mask_scatter(table_view const& input,
                                             table_view const& target,
                                             column_view const& boolean_mask,
                                             rmm::cuda_stream_view stream,
-                                            rmm::mr::device_memory_resource* mr)
+                                            rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::boolean_mask_scatter(input, target, boolean_mask, stream, mr);
@@ -558,7 +559,7 @@ std::unique_ptr<table> boolean_mask_scatter(
   table_view const& target,
   column_view const& boolean_mask,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::boolean_mask_scatter(input, target, boolean_mask, stream, mr);
diff --git a/cpp/src/copying/segmented_shift.cu b/cpp/src/copying/segmented_shift.cu
index dd2733cf7e9..b7abc60f240 100644
--- a/cpp/src/copying/segmented_shift.cu
+++ b/cpp/src/copying/segmented_shift.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,6 +24,7 @@
 #include <cudf/utilities/traits.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/binary_search.h>
 #include <thrust/execution_policy.h>
@@ -73,7 +74,7 @@ struct segmented_shift_functor<T, std::enable_if_t<is_rep_layout_compatible<T>()
                                      size_type offset,
                                      scalar const& fill_value,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     auto values_device_view = column_device_view::create(segmented_values, stream);
     bool nullable           = not fill_value.is_valid(stream) or segmented_values.nullable();
@@ -102,7 +103,7 @@ struct segmented_shift_functor<string_view> {
                                      size_type offset,
                                      scalar const& fill_value,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     auto values_device_view = column_device_view::create(segmented_values, stream);
     auto input_iterator     = make_optional_iterator<cudf::string_view>(
@@ -129,7 +130,7 @@ struct segmented_shift_functor_forwarder {
                                      size_type offset,
                                      scalar const& fill_value,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     segmented_shift_functor<T> shifter;
     return shifter(segmented_values, segment_offsets, offset, fill_value, stream, mr);
@@ -143,7 +144,7 @@ std::unique_ptr<column> segmented_shift(column_view const& segmented_values,
                                         size_type offset,
                                         scalar const& fill_value,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr)
+                                        rmm::device_async_resource_ref mr)
 {
   if (segmented_values.is_empty()) { return empty_like(segmented_values); }
   if (offset == 0) { return std::make_unique<column>(segmented_values, stream, mr); };
diff --git a/cpp/src/copying/shift.cu b/cpp/src/copying/shift.cu
index 8e013bb1212..bdc741887f7 100644
--- a/cpp/src/copying/shift.cu
+++ b/cpp/src/copying/shift.cu
@@ -30,6 +30,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/copy.h>
 #include <thrust/iterator/counting_iterator.h>
@@ -51,7 +52,7 @@ std::pair<rmm::device_buffer, size_type> create_null_mask(column_device_view con
                                                           size_type offset,
                                                           scalar const& fill_value,
                                                           rmm::cuda_stream_view stream,
-                                                          rmm::mr::device_memory_resource* mr)
+                                                          rmm::device_async_resource_ref mr)
 {
   auto const size = input.size();
   auto func_validity =
@@ -81,7 +82,7 @@ struct shift_functor {
     size_type offset,
     scalar const& fill_value,
     rmm::cuda_stream_view stream,
-    rmm::mr::device_memory_resource* mr)
+    rmm::device_async_resource_ref mr)
   {
     auto output = cudf::strings::detail::shift(
       cudf::strings_column_view(input), offset, fill_value, stream, mr);
@@ -101,7 +102,7 @@ struct shift_functor {
     size_type offset,
     scalar const& fill_value,
     rmm::cuda_stream_view stream,
-    rmm::mr::device_memory_resource* mr)
+    rmm::device_async_resource_ref mr)
   {
     using ScalarType = cudf::scalar_type_t<T>;
     auto& scalar     = static_cast<ScalarType const&>(fill_value);
@@ -155,7 +156,7 @@ std::unique_ptr<column> shift(column_view const& input,
                               size_type offset,
                               scalar const& fill_value,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr)
+                              rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(input.type() == fill_value.type(),
                "shift requires each fill value type to match the corresponding column type.",
@@ -173,7 +174,7 @@ std::unique_ptr<column> shift(column_view const& input,
                               size_type offset,
                               scalar const& fill_value,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr)
+                              rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::shift(input, offset, fill_value, stream, mr);
diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu
index 371663c41ee..7629cad79a9 100644
--- a/cpp/src/datetime/datetime_ops.cu
+++ b/cpp/src/datetime/datetime_ops.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -35,6 +35,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/constant_iterator.h>
 #include <thrust/iterator/permutation_iterator.h>
@@ -254,7 +255,7 @@ struct dispatch_round {
     rounding_frequency component,
     cudf::column_view const& column,
     rmm::cuda_stream_view stream,
-    rmm::mr::device_memory_resource* mr) const
+    rmm::device_async_resource_ref mr) const
   {
     auto size            = column.size();
     auto output_col_type = data_type{cudf::type_to_id<Timestamp>()};
@@ -319,7 +320,7 @@ struct launch_functor {
 template <typename TransformFunctor, cudf::type_id OutputColCudfT>
 std::unique_ptr<column> apply_datetime_op(column_view const& column,
                                           rmm::cuda_stream_view stream,
-                                          rmm::mr::device_memory_resource* mr)
+                                          rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(is_timestamp(column.type()), "Column type should be timestamp");
   auto size            = column.size();
@@ -355,7 +356,7 @@ struct add_calendrical_months_functor {
     column_view timestamp_column,
     MonthIterator months_begin,
     rmm::cuda_stream_view stream,
-    rmm::mr::device_memory_resource* mr) const
+    rmm::device_async_resource_ref mr) const
   {
     auto size            = timestamp_column.size();
     auto output_col_type = timestamp_column.type();
@@ -386,7 +387,7 @@ struct add_calendrical_months_functor {
 std::unique_ptr<column> add_calendrical_months(column_view const& timestamp_column,
                                                column_view const& months_column,
                                                rmm::cuda_stream_view stream,
-                                               rmm::mr::device_memory_resource* mr)
+                                               rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(is_timestamp(timestamp_column.type()), "Column type should be timestamp");
   CUDF_EXPECTS(
@@ -413,7 +414,7 @@ std::unique_ptr<column> add_calendrical_months(column_view const& timestamp_colu
 std::unique_ptr<column> add_calendrical_months(column_view const& timestamp_column,
                                                scalar const& months,
                                                rmm::cuda_stream_view stream,
-                                               rmm::mr::device_memory_resource* mr)
+                                               rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(is_timestamp(timestamp_column.type()), "Column type should be timestamp");
   CUDF_EXPECTS(months.type().id() == type_id::INT16 or months.type().id() == type_id::INT32,
@@ -442,7 +443,7 @@ std::unique_ptr<column> round_general(rounding_function round_kind,
                                       rounding_frequency component,
                                       column_view const& column,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr)
+                                      rmm::device_async_resource_ref mr)
 {
   return cudf::type_dispatcher(
     column.type(), dispatch_round{}, round_kind, component, column, stream, mr);
@@ -450,7 +451,7 @@ std::unique_ptr<column> round_general(rounding_function round_kind,
 
 std::unique_ptr<column> extract_year(column_view const& column,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
 {
   return detail::apply_datetime_op<
     detail::extract_component_operator<detail::datetime_component::YEAR>,
@@ -459,7 +460,7 @@ std::unique_ptr<column> extract_year(column_view const& column,
 
 std::unique_ptr<column> extract_month(column_view const& column,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr)
+                                      rmm::device_async_resource_ref mr)
 {
   return detail::apply_datetime_op<
     detail::extract_component_operator<detail::datetime_component::MONTH>,
@@ -468,7 +469,7 @@ std::unique_ptr<column> extract_month(column_view const& column,
 
 std::unique_ptr<column> extract_day(column_view const& column,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr)
+                                    rmm::device_async_resource_ref mr)
 {
   return detail::apply_datetime_op<
     detail::extract_component_operator<detail::datetime_component::DAY>,
@@ -477,7 +478,7 @@ std::unique_ptr<column> extract_day(column_view const& column,
 
 std::unique_ptr<column> extract_weekday(column_view const& column,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr)
+                                        rmm::device_async_resource_ref mr)
 {
   return detail::apply_datetime_op<
     detail::extract_component_operator<detail::datetime_component::WEEKDAY>,
@@ -486,7 +487,7 @@ std::unique_ptr<column> extract_weekday(column_view const& column,
 
 std::unique_ptr<column> extract_hour(column_view const& column,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
 {
   return detail::apply_datetime_op<
     detail::extract_component_operator<detail::datetime_component::HOUR>,
@@ -495,7 +496,7 @@ std::unique_ptr<column> extract_hour(column_view const& column,
 
 std::unique_ptr<column> extract_minute(column_view const& column,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr)
+                                       rmm::device_async_resource_ref mr)
 {
   return detail::apply_datetime_op<
     detail::extract_component_operator<detail::datetime_component::MINUTE>,
@@ -504,7 +505,7 @@ std::unique_ptr<column> extract_minute(column_view const& column,
 
 std::unique_ptr<column> extract_second(column_view const& column,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr)
+                                       rmm::device_async_resource_ref mr)
 {
   return detail::apply_datetime_op<
     detail::extract_component_operator<detail::datetime_component::SECOND>,
@@ -513,7 +514,7 @@ std::unique_ptr<column> extract_second(column_view const& column,
 
 std::unique_ptr<column> extract_millisecond_fraction(column_view const& column,
                                                      rmm::cuda_stream_view stream,
-                                                     rmm::mr::device_memory_resource* mr)
+                                                     rmm::device_async_resource_ref mr)
 {
   return detail::apply_datetime_op<
     detail::extract_component_operator<detail::datetime_component::MILLISECOND>,
@@ -522,7 +523,7 @@ std::unique_ptr<column> extract_millisecond_fraction(column_view const& column,
 
 std::unique_ptr<column> extract_microsecond_fraction(column_view const& column,
                                                      rmm::cuda_stream_view stream,
-                                                     rmm::mr::device_memory_resource* mr)
+                                                     rmm::device_async_resource_ref mr)
 {
   return detail::apply_datetime_op<
     detail::extract_component_operator<detail::datetime_component::MICROSECOND>,
@@ -531,7 +532,7 @@ std::unique_ptr<column> extract_microsecond_fraction(column_view const& column,
 
 std::unique_ptr<column> extract_nanosecond_fraction(column_view const& column,
                                                     rmm::cuda_stream_view stream,
-                                                    rmm::mr::device_memory_resource* mr)
+                                                    rmm::device_async_resource_ref mr)
 {
   return detail::apply_datetime_op<
     detail::extract_component_operator<detail::datetime_component::NANOSECOND>,
@@ -540,7 +541,7 @@ std::unique_ptr<column> extract_nanosecond_fraction(column_view const& column,
 
 std::unique_ptr<column> last_day_of_month(column_view const& column,
                                           rmm::cuda_stream_view stream,
-                                          rmm::mr::device_memory_resource* mr)
+                                          rmm::device_async_resource_ref mr)
 {
   return detail::apply_datetime_op<detail::extract_last_day_of_month,
                                    cudf::type_id::TIMESTAMP_DAYS>(column, stream, mr);
@@ -548,7 +549,7 @@ std::unique_ptr<column> last_day_of_month(column_view const& column,
 
 std::unique_ptr<column> day_of_year(column_view const& column,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr)
+                                    rmm::device_async_resource_ref mr)
 {
   return detail::apply_datetime_op<detail::extract_day_num_of_year, cudf::type_id::INT16>(
     column, stream, mr);
@@ -556,21 +557,21 @@ std::unique_ptr<column> day_of_year(column_view const& column,
 
 std::unique_ptr<column> is_leap_year(column_view const& column,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
 {
   return apply_datetime_op<is_leap_year_op, type_id::BOOL8>(column, stream, mr);
 }
 
 std::unique_ptr<column> days_in_month(column_view const& column,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr)
+                                      rmm::device_async_resource_ref mr)
 {
   return apply_datetime_op<days_in_month_op, type_id::INT16>(column, stream, mr);
 }
 
 std::unique_ptr<column> extract_quarter(column_view const& column,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr)
+                                        rmm::device_async_resource_ref mr)
 {
   return apply_datetime_op<extract_quarter_op, type_id::INT16>(column, stream, mr);
 }
@@ -579,7 +580,7 @@ std::unique_ptr<column> extract_quarter(column_view const& column,
 
 std::unique_ptr<column> ceil_datetimes(column_view const& column,
                                        rounding_frequency freq,
-                                       rmm::mr::device_memory_resource* mr)
+                                       rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::round_general(
@@ -588,7 +589,7 @@ std::unique_ptr<column> ceil_datetimes(column_view const& column,
 
 std::unique_ptr<column> floor_datetimes(column_view const& column,
                                         rounding_frequency freq,
-                                        rmm::mr::device_memory_resource* mr)
+                                        rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::round_general(
@@ -597,88 +598,85 @@ std::unique_ptr<column> floor_datetimes(column_view const& column,
 
 std::unique_ptr<column> round_datetimes(column_view const& column,
                                         rounding_frequency freq,
-                                        rmm::mr::device_memory_resource* mr)
+                                        rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::round_general(
     detail::rounding_function::ROUND, freq, column, cudf::get_default_stream(), mr);
 }
 
-std::unique_ptr<column> extract_year(column_view const& column, rmm::mr::device_memory_resource* mr)
+std::unique_ptr<column> extract_year(column_view const& column, rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::extract_year(column, cudf::get_default_stream(), mr);
 }
 
-std::unique_ptr<column> extract_month(column_view const& column,
-                                      rmm::mr::device_memory_resource* mr)
+std::unique_ptr<column> extract_month(column_view const& column, rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::extract_month(column, cudf::get_default_stream(), mr);
 }
 
-std::unique_ptr<column> extract_day(column_view const& column, rmm::mr::device_memory_resource* mr)
+std::unique_ptr<column> extract_day(column_view const& column, rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::extract_day(column, cudf::get_default_stream(), mr);
 }
 
 std::unique_ptr<column> extract_weekday(column_view const& column,
-                                        rmm::mr::device_memory_resource* mr)
+                                        rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::extract_weekday(column, cudf::get_default_stream(), mr);
 }
 
-std::unique_ptr<column> extract_hour(column_view const& column, rmm::mr::device_memory_resource* mr)
+std::unique_ptr<column> extract_hour(column_view const& column, rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::extract_hour(column, cudf::get_default_stream(), mr);
 }
 
-std::unique_ptr<column> extract_minute(column_view const& column,
-                                       rmm::mr::device_memory_resource* mr)
+std::unique_ptr<column> extract_minute(column_view const& column, rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::extract_minute(column, cudf::get_default_stream(), mr);
 }
 
-std::unique_ptr<column> extract_second(column_view const& column,
-                                       rmm::mr::device_memory_resource* mr)
+std::unique_ptr<column> extract_second(column_view const& column, rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::extract_second(column, cudf::get_default_stream(), mr);
 }
 
 std::unique_ptr<column> extract_millisecond_fraction(column_view const& column,
-                                                     rmm::mr::device_memory_resource* mr)
+                                                     rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::extract_millisecond_fraction(column, cudf::get_default_stream(), mr);
 }
 
 std::unique_ptr<column> extract_microsecond_fraction(column_view const& column,
-                                                     rmm::mr::device_memory_resource* mr)
+                                                     rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::extract_microsecond_fraction(column, cudf::get_default_stream(), mr);
 }
 
 std::unique_ptr<column> extract_nanosecond_fraction(column_view const& column,
-                                                    rmm::mr::device_memory_resource* mr)
+                                                    rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::extract_nanosecond_fraction(column, cudf::get_default_stream(), mr);
 }
 
 std::unique_ptr<column> last_day_of_month(column_view const& column,
-                                          rmm::mr::device_memory_resource* mr)
+                                          rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::last_day_of_month(column, cudf::get_default_stream(), mr);
 }
 
-std::unique_ptr<column> day_of_year(column_view const& column, rmm::mr::device_memory_resource* mr)
+std::unique_ptr<column> day_of_year(column_view const& column, rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::day_of_year(column, cudf::get_default_stream(), mr);
@@ -686,7 +684,7 @@ std::unique_ptr<column> day_of_year(column_view const& column, rmm::mr::device_m
 
 std::unique_ptr<cudf::column> add_calendrical_months(cudf::column_view const& timestamp_column,
                                                      cudf::column_view const& months_column,
-                                                     rmm::mr::device_memory_resource* mr)
+                                                     rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::add_calendrical_months(
@@ -695,27 +693,26 @@ std::unique_ptr<cudf::column> add_calendrical_months(cudf::column_view const& ti
 
 std::unique_ptr<cudf::column> add_calendrical_months(cudf::column_view const& timestamp_column,
                                                      cudf::scalar const& months,
-                                                     rmm::mr::device_memory_resource* mr)
+                                                     rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::add_calendrical_months(timestamp_column, months, cudf::get_default_stream(), mr);
 }
 
-std::unique_ptr<column> is_leap_year(column_view const& column, rmm::mr::device_memory_resource* mr)
+std::unique_ptr<column> is_leap_year(column_view const& column, rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::is_leap_year(column, cudf::get_default_stream(), mr);
 }
 
-std::unique_ptr<column> days_in_month(column_view const& column,
-                                      rmm::mr::device_memory_resource* mr)
+std::unique_ptr<column> days_in_month(column_view const& column, rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::days_in_month(column, cudf::get_default_stream(), mr);
 }
 
 std::unique_ptr<column> extract_quarter(column_view const& column,
-                                        rmm::mr::device_memory_resource* mr)
+                                        rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::extract_quarter(column, cudf::get_default_stream(), mr);
diff --git a/cpp/src/datetime/timezone.cpp b/cpp/src/datetime/timezone.cpp
index a75eea7172f..a3471485293 100644
--- a/cpp/src/datetime/timezone.cpp
+++ b/cpp/src/datetime/timezone.cpp
@@ -19,6 +19,8 @@
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/table/table.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 #include <algorithm>
 #include <filesystem>
 #include <fstream>
@@ -379,7 +381,7 @@ static int64_t get_transition_time(dst_transition_s const& trans, int year)
 
 std::unique_ptr<table> make_timezone_transition_table(std::optional<std::string_view> tzif_dir,
                                                       std::string_view timezone_name,
-                                                      rmm::mr::device_memory_resource* mr)
+                                                      rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::make_timezone_transition_table(
@@ -391,7 +393,7 @@ namespace detail {
 std::unique_ptr<table> make_timezone_transition_table(std::optional<std::string_view> tzif_dir,
                                                       std::string_view timezone_name,
                                                       rmm::cuda_stream_view stream,
-                                                      rmm::mr::device_memory_resource* mr)
+                                                      rmm::device_async_resource_ref mr)
 {
   if (timezone_name == "UTC" || timezone_name.empty()) {
     // Return an empty table for UTC
diff --git a/cpp/src/dictionary/add_keys.cu b/cpp/src/dictionary/add_keys.cu
index 3973100aced..5fd21ee0094 100644
--- a/cpp/src/dictionary/add_keys.cu
+++ b/cpp/src/dictionary/add_keys.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -31,6 +31,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace dictionary {
@@ -49,7 +50,7 @@ namespace detail {
 std::unique_ptr<column> add_keys(dictionary_column_view const& dictionary_column,
                                  column_view const& new_keys,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(!new_keys.has_nulls(), "Keys must not have nulls");
   auto old_keys = dictionary_column.keys();  // [a,b,c,d,f]
@@ -131,7 +132,7 @@ std::unique_ptr<column> add_keys(dictionary_column_view const& dictionary_column
 std::unique_ptr<column> add_keys(dictionary_column_view const& dictionary_column,
                                  column_view const& keys,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::add_keys(dictionary_column, keys, stream, mr);
diff --git a/cpp/src/dictionary/decode.cu b/cpp/src/dictionary/decode.cu
index 8ce741c4a91..9f05593fc40 100644
--- a/cpp/src/dictionary/decode.cu
+++ b/cpp/src/dictionary/decode.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -25,6 +25,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace dictionary {
@@ -46,7 +47,7 @@ struct indices_handler_fn {
  */
 std::unique_ptr<column> decode(dictionary_column_view const& source,
                                rmm::cuda_stream_view stream,
-                               rmm::mr::device_memory_resource* mr)
+                               rmm::device_async_resource_ref mr)
 {
   if (source.is_empty()) return make_empty_column(type_id::EMPTY);
 
@@ -77,7 +78,7 @@ std::unique_ptr<column> decode(dictionary_column_view const& source,
 
 std::unique_ptr<column> decode(dictionary_column_view const& source,
                                rmm::cuda_stream_view stream,
-                               rmm::mr::device_memory_resource* mr)
+                               rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::decode(source, stream, mr);
diff --git a/cpp/src/dictionary/detail/concatenate.cu b/cpp/src/dictionary/detail/concatenate.cu
index 17295fb0345..62a6c816493 100644
--- a/cpp/src/dictionary/detail/concatenate.cu
+++ b/cpp/src/dictionary/detail/concatenate.cu
@@ -31,6 +31,7 @@
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/binary_search.h>
@@ -140,7 +141,7 @@ struct dispatch_compute_indices {
              offsets_pair const* d_offsets,
              size_type const* d_map_to_keys,
              rmm::cuda_stream_view stream,
-             rmm::mr::device_memory_resource* mr)
+             rmm::device_async_resource_ref mr)
   {
     auto keys_view     = column_device_view::create(all_keys, stream);
     auto indices_view  = column_device_view::create(all_indices, stream);
@@ -206,7 +207,7 @@ struct dispatch_compute_indices {
 
 std::unique_ptr<column> concatenate(host_span<column_view const> columns,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr)
+                                    rmm::device_async_resource_ref mr)
 {
   // exception here is the same behavior as in cudf::concatenate
   CUDF_EXPECTS(not columns.empty(), "Unexpected empty list of columns to concatenate.");
diff --git a/cpp/src/dictionary/detail/merge.cu b/cpp/src/dictionary/detail/merge.cu
index 2fe21680873..c65aa5d1101 100644
--- a/cpp/src/dictionary/detail/merge.cu
+++ b/cpp/src/dictionary/detail/merge.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -25,6 +25,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/transform.h>
 
@@ -36,7 +37,7 @@ std::unique_ptr<column> merge(dictionary_column_view const& lcol,
                               dictionary_column_view const& rcol,
                               cudf::detail::index_vector const& row_order,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr)
+                              rmm::device_async_resource_ref mr)
 {
   auto const lcol_iter = cudf::detail::indexalator_factory::make_input_iterator(lcol.indices());
   auto const rcol_iter = cudf::detail::indexalator_factory::make_input_iterator(rcol.indices());
diff --git a/cpp/src/dictionary/dictionary_factories.cu b/cpp/src/dictionary/dictionary_factories.cu
index f70423a13a9..37f8fa7a05b 100644
--- a/cpp/src/dictionary/dictionary_factories.cu
+++ b/cpp/src/dictionary/dictionary_factories.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,6 +23,7 @@
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace {
@@ -30,7 +31,7 @@ struct dispatch_create_indices {
   template <typename IndexType, std::enable_if_t<is_index_type<IndexType>()>* = nullptr>
   std::unique_ptr<column> operator()(column_view const& indices,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     CUDF_EXPECTS(std::is_unsigned<IndexType>(), "indices must be an unsigned type");
     column_view indices_view{
@@ -40,7 +41,7 @@ struct dispatch_create_indices {
   template <typename IndexType, std::enable_if_t<!is_index_type<IndexType>()>* = nullptr>
   std::unique_ptr<column> operator()(column_view const&,
                                      rmm::cuda_stream_view,
-                                     rmm::mr::device_memory_resource*)
+                                     rmm::device_async_resource_ref)
   {
     CUDF_FAIL("indices must be an integer type.");
   }
@@ -50,7 +51,7 @@ struct dispatch_create_indices {
 std::unique_ptr<column> make_dictionary_column(column_view const& keys_column,
                                                column_view const& indices_column,
                                                rmm::cuda_stream_view stream,
-                                               rmm::mr::device_memory_resource* mr)
+                                               rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(!keys_column.has_nulls(), "keys column must not have nulls");
   if (keys_column.is_empty()) return make_empty_column(type_id::DICTIONARY32);
@@ -117,7 +118,7 @@ struct make_unsigned_fn {
 std::unique_ptr<column> make_dictionary_column(std::unique_ptr<column> keys,
                                                std::unique_ptr<column> indices,
                                                rmm::cuda_stream_view stream,
-                                               rmm::mr::device_memory_resource* mr)
+                                               rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(!keys->has_nulls(), "keys column must not have nulls");
 
diff --git a/cpp/src/dictionary/encode.cu b/cpp/src/dictionary/encode.cu
index c92b57f0cac..ff29d83b80a 100644
--- a/cpp/src/dictionary/encode.cu
+++ b/cpp/src/dictionary/encode.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -29,6 +29,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace dictionary {
@@ -41,7 +42,7 @@ namespace detail {
 std::unique_ptr<column> encode(column_view const& input_column,
                                data_type indices_type,
                                rmm::cuda_stream_view stream,
-                               rmm::mr::device_memory_resource* mr)
+                               rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(is_unsigned(indices_type), "indices must be type unsigned integer");
   CUDF_EXPECTS(input_column.type().id() != type_id::DICTIONARY32,
@@ -90,7 +91,7 @@ data_type get_indices_type_for_size(size_type keys_size)
 std::unique_ptr<column> encode(column_view const& input_column,
                                data_type indices_type,
                                rmm::cuda_stream_view stream,
-                               rmm::mr::device_memory_resource* mr)
+                               rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::encode(input_column, indices_type, stream, mr);
diff --git a/cpp/src/dictionary/remove_keys.cu b/cpp/src/dictionary/remove_keys.cu
index 86b70f1119b..718ca419289 100644
--- a/cpp/src/dictionary/remove_keys.cu
+++ b/cpp/src/dictionary/remove_keys.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -29,6 +29,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/fill.h>
 #include <thrust/iterator/counting_iterator.h>
@@ -59,7 +60,7 @@ template <typename KeysKeeper>
 std::unique_ptr<column> remove_keys_fn(dictionary_column_view const& dictionary_column,
                                        KeysKeeper keys_to_keep_fn,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr)
+                                       rmm::device_async_resource_ref mr)
 {
   auto const keys_view    = dictionary_column.keys();
   auto const indices_type = dictionary_column.indices().type();
@@ -150,7 +151,7 @@ std::unique_ptr<column> remove_keys_fn(dictionary_column_view const& dictionary_
 std::unique_ptr<column> remove_keys(dictionary_column_view const& dictionary_column,
                                     column_view const& keys_to_remove,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr)
+                                    rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(!keys_to_remove.has_nulls(), "keys_to_remove must not have nulls");
   auto const keys_view = dictionary_column.keys();
@@ -166,7 +167,7 @@ std::unique_ptr<column> remove_keys(dictionary_column_view const& dictionary_col
 
 std::unique_ptr<column> remove_unused_keys(dictionary_column_view const& dictionary_column,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
 {
   // locate the keys to remove
   auto const keys_size     = dictionary_column.keys_size();
@@ -196,7 +197,7 @@ std::unique_ptr<column> remove_unused_keys(dictionary_column_view const& diction
 std::unique_ptr<column> remove_keys(dictionary_column_view const& dictionary_column,
                                     column_view const& keys_to_remove,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr)
+                                    rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::remove_keys(dictionary_column, keys_to_remove, stream, mr);
@@ -204,7 +205,7 @@ std::unique_ptr<column> remove_keys(dictionary_column_view const& dictionary_col
 
 std::unique_ptr<column> remove_unused_keys(dictionary_column_view const& dictionary_column,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::remove_unused_keys(dictionary_column, stream, mr);
diff --git a/cpp/src/dictionary/replace.cu b/cpp/src/dictionary/replace.cu
index 7069993866c..bb6b08c243d 100644
--- a/cpp/src/dictionary/replace.cu
+++ b/cpp/src/dictionary/replace.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -26,6 +26,7 @@
 #include <cudf/dictionary/dictionary_factories.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace dictionary {
@@ -52,7 +53,7 @@ template <typename ReplacementIter>
 std::unique_ptr<column> replace_indices(column_view const& input,
                                         ReplacementIter replacement_iter,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr)
+                                        rmm::device_async_resource_ref mr)
 {
   auto const input_view = column_device_view::create(input, stream);
   auto const d_input    = *input_view;
@@ -74,12 +75,12 @@ std::unique_ptr<column> replace_indices(column_view const& input,
 
 /**
  * @copydoc cudf::dictionary::detail::replace_nulls(cudf::column_view const&,cudf::column_view
- * const& rmm::cuda_stream_view, rmm::mr::device_memory_resource*)
+ * const& rmm::cuda_stream_view, rmm::device_async_resource_ref)
  */
 std::unique_ptr<column> replace_nulls(dictionary_column_view const& input,
                                       dictionary_column_view const& replacement,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr)
+                                      rmm::device_async_resource_ref mr)
 {
   if (input.is_empty()) { return cudf::empty_like(input.parent()); }
   if (!input.has_nulls()) { return std::make_unique<cudf::column>(input.parent(), stream, mr); }
@@ -107,12 +108,12 @@ std::unique_ptr<column> replace_nulls(dictionary_column_view const& input,
 
 /**
  * @copydoc cudf::dictionary::detail::replace_nulls(cudf::column_view const&,cudf::scalar
- * const&, rmm::cuda_stream_view, rmm::mr::device_memory_resource*)
+ * const&, rmm::cuda_stream_view, rmm::device_async_resource_ref)
  */
 std::unique_ptr<column> replace_nulls(dictionary_column_view const& input,
                                       scalar const& replacement,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr)
+                                      rmm::device_async_resource_ref mr)
 {
   if (input.is_empty()) { return cudf::empty_like(input.parent()); }
   if (!input.has_nulls() || !replacement.is_valid(stream)) {
diff --git a/cpp/src/dictionary/search.cu b/cpp/src/dictionary/search.cu
index e35aded1984..680eadddba8 100644
--- a/cpp/src/dictionary/search.cu
+++ b/cpp/src/dictionary/search.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,6 +24,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/binary_search.h>
 #include <thrust/distance.h>
@@ -40,7 +41,7 @@ struct dispatch_scalar_index {
   std::unique_ptr<scalar> operator()(size_type index,
                                      bool is_valid,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     return std::make_unique<numeric_scalar<IndexType>>(index, is_valid, stream, mr);
   }
@@ -69,7 +70,7 @@ struct find_index_fn {
   std::unique_ptr<scalar> operator()(dictionary_column_view const& input,
                                      scalar const& key,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr) const
+                                     rmm::device_async_resource_ref mr) const
   {
     if (!key.is_valid(stream))
       return type_dispatcher(input.indices().type(), dispatch_scalar_index{}, 0, false, stream, mr);
@@ -96,7 +97,7 @@ struct find_index_fn {
   std::unique_ptr<scalar> operator()(dictionary_column_view const&,
                                      scalar const&,
                                      rmm::cuda_stream_view,
-                                     rmm::mr::device_memory_resource*) const
+                                     rmm::device_async_resource_ref) const
   {
     CUDF_FAIL(
       "dictionary, list_view, and struct_view columns cannot be the keys column of a dictionary");
@@ -111,7 +112,7 @@ struct find_insert_index_fn {
   std::unique_ptr<scalar> operator()(dictionary_column_view const& input,
                                      scalar const& key,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr) const
+                                     rmm::device_async_resource_ref mr) const
   {
     if (!key.is_valid(stream))
       return type_dispatcher(input.indices().type(), dispatch_scalar_index{}, 0, false, stream, mr);
@@ -138,7 +139,7 @@ struct find_insert_index_fn {
   std::unique_ptr<scalar> operator()(dictionary_column_view const&,
                                      scalar const&,
                                      rmm::cuda_stream_view,
-                                     rmm::mr::device_memory_resource*) const
+                                     rmm::device_async_resource_ref) const
   {
     CUDF_FAIL("dictionary, list_view, and struct_view columns cannot be the keys for a dictionary");
   }
@@ -149,7 +150,7 @@ struct find_insert_index_fn {
 std::unique_ptr<scalar> get_index(dictionary_column_view const& dictionary,
                                   scalar const& key,
                                   rmm::cuda_stream_view stream,
-                                  rmm::mr::device_memory_resource* mr)
+                                  rmm::device_async_resource_ref mr)
 {
   if (dictionary.is_empty())
     return std::make_unique<numeric_scalar<uint32_t>>(0, false, stream, mr);
@@ -160,7 +161,7 @@ std::unique_ptr<scalar> get_index(dictionary_column_view const& dictionary,
 std::unique_ptr<scalar> get_insert_index(dictionary_column_view const& dictionary,
                                          scalar const& key,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr)
+                                         rmm::device_async_resource_ref mr)
 {
   if (dictionary.is_empty())
     return std::make_unique<numeric_scalar<uint32_t>>(0, false, stream, mr);
@@ -175,7 +176,7 @@ std::unique_ptr<scalar> get_insert_index(dictionary_column_view const& dictionar
 std::unique_ptr<scalar> get_index(dictionary_column_view const& dictionary,
                                   scalar const& key,
                                   rmm::cuda_stream_view stream,
-                                  rmm::mr::device_memory_resource* mr)
+                                  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::get_index(dictionary, key, stream, mr);
diff --git a/cpp/src/dictionary/set_keys.cu b/cpp/src/dictionary/set_keys.cu
index b49cf7850b1..b56eec9401a 100644
--- a/cpp/src/dictionary/set_keys.cu
+++ b/cpp/src/dictionary/set_keys.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -32,6 +32,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/binary_search.h>
 #include <thrust/distance.h>
@@ -61,7 +62,7 @@ struct dispatch_compute_indices {
   operator()(dictionary_column_view const& input,
              column_view const& new_keys,
              rmm::cuda_stream_view stream,
-             rmm::mr::device_memory_resource* mr)
+             rmm::device_async_resource_ref mr)
   {
     auto dictionary_view = column_device_view::create(input.parent(), stream);
     auto dictionary_itr  = make_dictionary_iterator<Element>(*dictionary_view);
@@ -119,7 +120,7 @@ struct dispatch_compute_indices {
 std::unique_ptr<column> set_keys(dictionary_column_view const& dictionary_column,
                                  column_view const& new_keys,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(!new_keys.has_nulls(), "keys parameter must not have nulls");
   auto keys = dictionary_column.keys();
@@ -177,7 +178,7 @@ std::unique_ptr<column> set_keys(dictionary_column_view const& dictionary_column
 std::vector<std::unique_ptr<column>> match_dictionaries(
   cudf::host_span<dictionary_column_view const> input,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   std::vector<column_view> keys(input.size());
   std::transform(input.begin(), input.end(), keys.begin(), [](auto& col) { return col.keys(); });
@@ -191,7 +192,7 @@ std::vector<std::unique_ptr<column>> match_dictionaries(
 }
 
 std::pair<std::vector<std::unique_ptr<column>>, std::vector<table_view>> match_dictionaries(
-  std::vector<table_view> tables, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr)
+  std::vector<table_view> tables, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)
 {
   // Make a copy of all the column views from each table_view
   std::vector<std::vector<column_view>> updated_columns;
@@ -242,7 +243,7 @@ std::pair<std::vector<std::unique_ptr<column>>, std::vector<table_view>> match_d
 std::unique_ptr<column> set_keys(dictionary_column_view const& dictionary_column,
                                  column_view const& keys,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::set_keys(dictionary_column, keys, stream, mr);
@@ -251,7 +252,7 @@ std::unique_ptr<column> set_keys(dictionary_column_view const& dictionary_column
 std::vector<std::unique_ptr<column>> match_dictionaries(
   cudf::host_span<dictionary_column_view const> input,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::match_dictionaries(input, stream, mr);
diff --git a/cpp/src/filling/calendrical_month_sequence.cu b/cpp/src/filling/calendrical_month_sequence.cu
index 80badb7d566..3e6d693dde5 100644
--- a/cpp/src/filling/calendrical_month_sequence.cu
+++ b/cpp/src/filling/calendrical_month_sequence.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,6 +23,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace detail {
@@ -30,7 +31,7 @@ std::unique_ptr<cudf::column> calendrical_month_sequence(size_type size,
                                                          scalar const& init,
                                                          size_type months,
                                                          rmm::cuda_stream_view stream,
-                                                         rmm::mr::device_memory_resource* mr)
+                                                         rmm::device_async_resource_ref mr)
 {
   return type_dispatcher(
     init.type(), calendrical_month_sequence_functor{}, size, init, months, stream, mr);
@@ -41,7 +42,7 @@ std::unique_ptr<cudf::column> calendrical_month_sequence(size_type size,
                                                          scalar const& init,
                                                          size_type months,
                                                          rmm::cuda_stream_view stream,
-                                                         rmm::mr::device_memory_resource* mr)
+                                                         rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::calendrical_month_sequence(size, init, months, stream, mr);
diff --git a/cpp/src/filling/fill.cu b/cpp/src/filling/fill.cu
index 42d1f7592ec..c4d786bd73b 100644
--- a/cpp/src/filling/fill.cu
+++ b/cpp/src/filling/fill.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -36,6 +36,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/constant_iterator.h>
 
@@ -107,7 +108,7 @@ struct out_of_place_fill_range_dispatch {
   std::unique_ptr<cudf::column> operator()(cudf::size_type begin,
                                            cudf::size_type end,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
   {
     CUDF_EXPECTS(input.type() == value.type(), "Data type mismatch.");
     auto p_ret = std::make_unique<cudf::column>(input, stream, mr);
@@ -134,7 +135,7 @@ std::unique_ptr<cudf::column> out_of_place_fill_range_dispatch::operator()<cudf:
   cudf::size_type begin,
   cudf::size_type end,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(input.type() == value.type(), "Data type mismatch.");
   using ScalarType = cudf::scalar_type_t<cudf::string_view>;
@@ -148,7 +149,7 @@ std::unique_ptr<cudf::column> out_of_place_fill_range_dispatch::operator()<cudf:
   cudf::size_type begin,
   cudf::size_type end,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   if (input.is_empty()) return std::make_unique<cudf::column>(input, stream, mr);
   cudf::dictionary_column_view const target(input);
@@ -233,7 +234,7 @@ std::unique_ptr<column> fill(column_view const& input,
                              size_type end,
                              scalar const& value,
                              rmm::cuda_stream_view stream,
-                             rmm::mr::device_memory_resource* mr)
+                             rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS((begin >= 0) && (end <= input.size()) && (begin <= end), "Range is out of bounds.");
 
@@ -258,7 +259,7 @@ std::unique_ptr<column> fill(column_view const& input,
                              size_type end,
                              scalar const& value,
                              rmm::cuda_stream_view stream,
-                             rmm::mr::device_memory_resource* mr)
+                             rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::fill(input, begin, end, value, stream, mr);
diff --git a/cpp/src/filling/repeat.cu b/cpp/src/filling/repeat.cu
index 87cc0f21d0e..ff4005d9366 100644
--- a/cpp/src/filling/repeat.cu
+++ b/cpp/src/filling/repeat.cu
@@ -33,6 +33,7 @@
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/binary_search.h>
@@ -102,7 +103,7 @@ namespace detail {
 std::unique_ptr<table> repeat(table_view const& input_table,
                               column_view const& count,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr)
+                              rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(input_table.num_rows() == count.size(), "in and count must have equal size");
   CUDF_EXPECTS(not count.has_nulls(), "count cannot contain nulls");
@@ -131,7 +132,7 @@ std::unique_ptr<table> repeat(table_view const& input_table,
 std::unique_ptr<table> repeat(table_view const& input_table,
                               size_type count,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr)
+                              rmm::device_async_resource_ref mr)
 {
   if ((input_table.num_rows() == 0) || (count == 0)) { return cudf::empty_like(input_table); }
 
@@ -154,7 +155,7 @@ std::unique_ptr<table> repeat(table_view const& input_table,
 std::unique_ptr<table> repeat(table_view const& input_table,
                               column_view const& count,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr)
+                              rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::repeat(input_table, count, stream, mr);
@@ -163,7 +164,7 @@ std::unique_ptr<table> repeat(table_view const& input_table,
 std::unique_ptr<table> repeat(table_view const& input_table,
                               size_type count,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr)
+                              rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::repeat(input_table, count, stream, mr);
diff --git a/cpp/src/filling/sequence.cu b/cpp/src/filling/sequence.cu
index 99a17f8b0e0..f7067c3a91b 100644
--- a/cpp/src/filling/sequence.cu
+++ b/cpp/src/filling/sequence.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -27,6 +27,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/sequence.h>
 #include <thrust/tabulate.h>
@@ -66,7 +67,7 @@ struct sequence_functor {
                                      scalar const& init,
                                      scalar const& step,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     auto result = make_fixed_width_column(init.type(), size, mask_state::UNALLOCATED, stream, mr);
     auto result_device_view = mutable_column_device_view::create(*result, stream);
@@ -92,7 +93,7 @@ struct sequence_functor {
   std::unique_ptr<column> operator()(size_type size,
                                      scalar const& init,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     auto result = make_fixed_width_column(init.type(), size, mask_state::UNALLOCATED, stream, mr);
     auto result_device_view = mutable_column_device_view::create(*result, stream);
@@ -125,7 +126,7 @@ std::unique_ptr<column> sequence(size_type size,
                                  scalar const& init,
                                  scalar const& step,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(init.type() == step.type(), "init and step must be of the same type.");
   CUDF_EXPECTS(size >= 0, "size must be >= 0");
@@ -137,7 +138,7 @@ std::unique_ptr<column> sequence(size_type size,
 std::unique_ptr<column> sequence(size_type size,
                                  scalar const& init,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(size >= 0, "size must be >= 0");
   CUDF_EXPECTS(is_numeric(init.type()), "init scalar type must be numeric");
@@ -151,7 +152,7 @@ std::unique_ptr<column> sequence(size_type size,
                                  scalar const& init,
                                  scalar const& step,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::sequence(size, init, step, stream, mr);
@@ -160,7 +161,7 @@ std::unique_ptr<column> sequence(size_type size,
 std::unique_ptr<column> sequence(size_type size,
                                  scalar const& init,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::sequence(size, init, stream, mr);
diff --git a/cpp/src/groupby/common/utils.hpp b/cpp/src/groupby/common/utils.hpp
index 09b85c74f08..82c3c08b501 100644
--- a/cpp/src/groupby/common/utils.hpp
+++ b/cpp/src/groupby/common/utils.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,8 @@
 #include <cudf/detail/groupby.hpp>
 #include <cudf/utilities/span.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 #include <memory>
 #include <vector>
 
@@ -31,7 +33,7 @@ template <typename RequestType>
 inline std::vector<aggregation_result> extract_results(host_span<RequestType const> requests,
                                                        cudf::detail::result_cache& cache,
                                                        rmm::cuda_stream_view stream,
-                                                       rmm::mr::device_memory_resource* mr)
+                                                       rmm::device_async_resource_ref mr)
 {
   std::vector<aggregation_result> results(requests.size());
   std::unordered_map<std::pair<column_view, std::reference_wrapper<aggregation const>>,
diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu
index e3c021eb66a..73cb4efd283 100644
--- a/cpp/src/groupby/groupby.cu
+++ b/cpp/src/groupby/groupby.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -39,6 +39,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 
@@ -65,7 +66,7 @@ groupby::groupby(table_view const& keys,
 std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby::dispatch_aggregation(
   host_span<aggregation_request const> requests,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   // If sort groupby has been called once on this groupby object, then
   // always use sort groupby from now on. Because once keys are sorted,
@@ -193,7 +194,7 @@ void verify_valid_requests(host_span<RequestType const> requests)
 
 // Compute aggregation requests
 std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby::aggregate(
-  host_span<aggregation_request const> requests, rmm::mr::device_memory_resource* mr)
+  host_span<aggregation_request const> requests, rmm::device_async_resource_ref mr)
 {
   return aggregate(requests, cudf::get_default_stream(), mr);
 }
@@ -202,7 +203,7 @@ std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby::aggr
 std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby::aggregate(
   host_span<aggregation_request const> requests,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   CUDF_EXPECTS(
@@ -220,7 +221,7 @@ std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby::aggr
 
 // Compute scan requests
 std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby::scan(
-  host_span<scan_request const> requests, rmm::mr::device_memory_resource* mr)
+  host_span<scan_request const> requests, rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   CUDF_EXPECTS(
@@ -236,7 +237,7 @@ std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby::scan
   return sort_scan(requests, cudf::get_default_stream(), mr);
 }
 
-groupby::groups groupby::get_groups(table_view values, rmm::mr::device_memory_resource* mr)
+groupby::groups groupby::get_groups(table_view values, rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   auto const stream = cudf::get_default_stream();
@@ -262,7 +263,7 @@ groupby::groups groupby::get_groups(table_view values, rmm::mr::device_memory_re
 std::pair<std::unique_ptr<table>, std::unique_ptr<table>> groupby::replace_nulls(
   table_view const& values,
   host_span<cudf::replace_policy const> replace_policies,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   CUDF_EXPECTS(_keys.num_rows() == values.num_rows(),
@@ -306,7 +307,7 @@ std::pair<std::unique_ptr<table>, std::unique_ptr<table>> groupby::shift(
   table_view const& values,
   host_span<size_type const> offsets,
   std::vector<std::reference_wrapper<scalar const>> const& fill_values,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   CUDF_EXPECTS(values.num_columns() == static_cast<size_type>(fill_values.size()),
diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu
index acc1b087510..4f75ab19c66 100644
--- a/cpp/src/groupby/hash/groupby.cu
+++ b/cpp/src/groupby/hash/groupby.cu
@@ -44,6 +44,7 @@
 #include <cudf/utilities/traits.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuco/static_set.cuh>
 #include <thrust/for_each.h>
@@ -190,7 +191,7 @@ class hash_compound_agg_finalizer final : public cudf::detail::aggregation_final
   SetType set;
   bitmask_type const* __restrict__ row_bitmask;
   rmm::cuda_stream_view stream;
-  rmm::mr::device_memory_resource* mr;
+  rmm::device_async_resource_ref mr;
 
  public:
   using cudf::detail::aggregation_finalizer::visit;
@@ -202,7 +203,7 @@ class hash_compound_agg_finalizer final : public cudf::detail::aggregation_final
                               SetType set,
                               bitmask_type const* row_bitmask,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr)
+                              rmm::device_async_resource_ref mr)
     : col(col),
       sparse_results(sparse_results),
       dense_results(dense_results),
@@ -398,7 +399,7 @@ void sparse_to_dense_results(table_view const& keys,
                              bool keys_have_nulls,
                              null_policy include_null_keys,
                              rmm::cuda_stream_view stream,
-                             rmm::mr::device_memory_resource* mr)
+                             rmm::device_async_resource_ref mr)
 {
   auto row_bitmask =
     cudf::detail::bitmask_and(keys, stream, rmm::mr::get_current_device_resource()).first;
@@ -551,7 +552,7 @@ std::unique_ptr<table> groupby(table_view const& keys,
                                bool const keys_have_nulls,
                                null_policy const include_null_keys,
                                rmm::cuda_stream_view stream,
-                               rmm::mr::device_memory_resource* mr)
+                               rmm::device_async_resource_ref mr)
 {
   auto const num_keys            = keys.num_rows();
   auto const null_keys_are_equal = null_equality::EQUAL;
@@ -654,7 +655,7 @@ std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby(
   host_span<aggregation_request const> requests,
   null_policy include_null_keys,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   cudf::detail::result_cache cache(requests.size());
 
diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp
index 2d6f99de25a..ba59616babe 100644
--- a/cpp/src/groupby/sort/aggregate.cpp
+++ b/cpp/src/groupby/sort/aggregate.cpp
@@ -37,6 +37,7 @@
 #include <cudf/types.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <memory>
 #include <unordered_map>
@@ -797,7 +798,7 @@ void aggregate_result_functor::operator()<aggregation::MERGE_TDIGEST>(aggregatio
 std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby::sort_aggregate(
   host_span<aggregation_request const> requests,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   // We're going to start by creating a cache of results so that aggs that
   // depend on other aggs will not have to be recalculated. e.g. mean depends on
diff --git a/cpp/src/groupby/sort/functors.hpp b/cpp/src/groupby/sort/functors.hpp
index be36956b929..057085fe85d 100644
--- a/cpp/src/groupby/sort/functors.hpp
+++ b/cpp/src/groupby/sort/functors.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@
 #include <cudf/types.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <memory>
 
@@ -42,7 +43,7 @@ struct store_result_functor {
                        sort::sort_groupby_helper& helper,
                        cudf::detail::result_cache& cache,
                        rmm::cuda_stream_view stream,
-                       rmm::mr::device_memory_resource* mr,
+                       rmm::device_async_resource_ref mr,
                        sorted keys_are_sorted = sorted::NO)
     : helper(helper),
       cache(cache),
@@ -98,8 +99,8 @@ struct store_result_functor {
   cudf::detail::result_cache& cache;  ///< cache of results to store into
   column_view const& values;          ///< Column of values to group and aggregate
 
-  rmm::cuda_stream_view stream;         ///< CUDA stream on which to execute kernels
-  rmm::mr::device_memory_resource* mr;  ///< Memory resource to allocate space for results
+  rmm::cuda_stream_view stream;       ///< CUDA stream on which to execute kernels
+  rmm::device_async_resource_ref mr;  ///< Memory resource to allocate space for results
 
   sorted keys_are_sorted;                  ///< Whether the keys are sorted
   std::unique_ptr<column> sorted_values;   ///< Memoised grouped and sorted values
diff --git a/cpp/src/groupby/sort/group_argmax.cu b/cpp/src/groupby/sort/group_argmax.cu
index a9c098bcf61..a1d197b1307 100644
--- a/cpp/src/groupby/sort/group_argmax.cu
+++ b/cpp/src/groupby/sort/group_argmax.cu
@@ -20,6 +20,7 @@
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/gather.h>
 
@@ -31,7 +32,7 @@ std::unique_ptr<column> group_argmax(column_view const& values,
                                      cudf::device_span<size_type const> group_labels,
                                      column_view const& key_sort_order,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
 {
   auto indices = type_dispatcher(values.type(),
                                  group_reduction_dispatcher<aggregation::ARGMAX>{},
diff --git a/cpp/src/groupby/sort/group_argmin.cu b/cpp/src/groupby/sort/group_argmin.cu
index 53a514ac8a7..03243bef836 100644
--- a/cpp/src/groupby/sort/group_argmin.cu
+++ b/cpp/src/groupby/sort/group_argmin.cu
@@ -20,6 +20,7 @@
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/gather.h>
 
@@ -31,7 +32,7 @@ std::unique_ptr<column> group_argmin(column_view const& values,
                                      cudf::device_span<size_type const> group_labels,
                                      column_view const& key_sort_order,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
 {
   auto indices = type_dispatcher(values.type(),
                                  group_reduction_dispatcher<aggregation::ARGMIN>{},
diff --git a/cpp/src/groupby/sort/group_collect.cu b/cpp/src/groupby/sort/group_collect.cu
index f95ad72f453..555c5d3ad41 100644
--- a/cpp/src/groupby/sort/group_collect.cu
+++ b/cpp/src/groupby/sort/group_collect.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,6 +23,7 @@
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/copy.h>
 #include <thrust/count.h>
@@ -50,7 +51,7 @@ std::pair<std::unique_ptr<column>, std::unique_ptr<column>> purge_null_entries(
   column_view const& offsets,
   size_type num_groups,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   auto values_device_view = column_device_view::create(values, stream);
 
@@ -91,7 +92,7 @@ std::unique_ptr<column> group_collect(column_view const& values,
                                       size_type num_groups,
                                       null_policy null_handling,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr)
+                                      rmm::device_async_resource_ref mr)
 {
   auto [child_column,
         offsets_column] = [null_handling, num_groups, &values, &group_offsets, stream, mr] {
diff --git a/cpp/src/groupby/sort/group_correlation.cu b/cpp/src/groupby/sort/group_correlation.cu
index 4389b833c33..152aa98a8b9 100644
--- a/cpp/src/groupby/sort/group_correlation.cu
+++ b/cpp/src/groupby/sort/group_correlation.cu
@@ -26,6 +26,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/discard_iterator.h>
@@ -120,7 +121,7 @@ std::unique_ptr<column> group_covariance(column_view const& values_0,
                                          size_type min_periods,
                                          size_type ddof,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr)
+                                         rmm::device_async_resource_ref mr)
 {
   using result_type = id_to_type<type_id::FLOAT64>;
   static_assert(
@@ -181,7 +182,7 @@ std::unique_ptr<column> group_correlation(column_view const& covariance,
                                           column_view const& stddev_0,
                                           column_view const& stddev_1,
                                           rmm::cuda_stream_view stream,
-                                          rmm::mr::device_memory_resource* mr)
+                                          rmm::device_async_resource_ref mr)
 {
   using result_type = id_to_type<type_id::FLOAT64>;
   CUDF_EXPECTS(covariance.type().id() == type_id::FLOAT64, "Covariance result must be FLOAT64");
diff --git a/cpp/src/groupby/sort/group_count.cu b/cpp/src/groupby/sort/group_count.cu
index 2f289c8c8a7..56a4943e272 100644
--- a/cpp/src/groupby/sort/group_count.cu
+++ b/cpp/src/groupby/sort/group_count.cu
@@ -22,6 +22,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/adjacent_difference.h>
@@ -37,7 +38,7 @@ std::unique_ptr<column> group_count_valid(column_view const& values,
                                           cudf::device_span<size_type const> group_labels,
                                           size_type num_groups,
                                           rmm::cuda_stream_view stream,
-                                          rmm::mr::device_memory_resource* mr)
+                                          rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(num_groups >= 0, "number of groups cannot be negative");
   CUDF_EXPECTS(static_cast<size_t>(values.size()) == group_labels.size(),
@@ -80,7 +81,7 @@ std::unique_ptr<column> group_count_valid(column_view const& values,
 std::unique_ptr<column> group_count_all(cudf::device_span<size_type const> group_offsets,
                                         size_type num_groups,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr)
+                                        rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(num_groups >= 0, "number of groups cannot be negative");
 
diff --git a/cpp/src/groupby/sort/group_count_scan.cu b/cpp/src/groupby/sort/group_count_scan.cu
index 2e8fd41d984..c076f21e1f8 100644
--- a/cpp/src/groupby/sort/group_count_scan.cu
+++ b/cpp/src/groupby/sort/group_count_scan.cu
@@ -21,6 +21,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/constant_iterator.h>
 #include <thrust/scan.h>
@@ -30,7 +31,7 @@ namespace groupby {
 namespace detail {
 std::unique_ptr<column> count_scan(cudf::device_span<size_type const> group_labels,
                                    rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr)
+                                   rmm::device_async_resource_ref mr)
 {
   std::unique_ptr<column> result = make_fixed_width_column(
     data_type{type_id::INT32}, group_labels.size(), mask_state::UNALLOCATED, stream, mr);
diff --git a/cpp/src/groupby/sort/group_histogram.cu b/cpp/src/groupby/sort/group_histogram.cu
index 67c30adcd47..1000ec0d470 100644
--- a/cpp/src/groupby/sort/group_histogram.cu
+++ b/cpp/src/groupby/sort/group_histogram.cu
@@ -26,6 +26,7 @@
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/device_buffer.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/gather.h>
 
@@ -38,7 +39,7 @@ std::unique_ptr<column> build_histogram(column_view const& values,
                                         std::optional<column_view> const& partial_counts,
                                         size_type num_groups,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr)
+                                        rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(static_cast<size_t>(values.size()) == group_labels.size(),
                "Size of values column should be the same as that of group labels.",
@@ -89,7 +90,7 @@ std::unique_ptr<column> group_histogram(column_view const& values,
                                         cudf::device_span<size_type const> group_labels,
                                         size_type num_groups,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr)
+                                        rmm::device_async_resource_ref mr)
 {
   // Empty group should be handled before reaching here.
   CUDF_EXPECTS(num_groups > 0, "Group should not be empty.", std::invalid_argument);
@@ -101,7 +102,7 @@ std::unique_ptr<column> group_merge_histogram(column_view const& values,
                                               cudf::device_span<size_type const> group_offsets,
                                               size_type num_groups,
                                               rmm::cuda_stream_view stream,
-                                              rmm::mr::device_memory_resource* mr)
+                                              rmm::device_async_resource_ref mr)
 {
   // Empty group should be handled before reaching here.
   CUDF_EXPECTS(num_groups > 0, "Group should not be empty.", std::invalid_argument);
diff --git a/cpp/src/groupby/sort/group_m2.cu b/cpp/src/groupby/sort/group_m2.cu
index 70b05100fb0..77f33486284 100644
--- a/cpp/src/groupby/sort/group_m2.cu
+++ b/cpp/src/groupby/sort/group_m2.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -27,6 +27,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/discard_iterator.h>
 #include <thrust/reduce.h>
@@ -88,7 +89,7 @@ struct m2_functor {
     column_view const& group_means,
     cudf::device_span<size_type const> group_labels,
     rmm::cuda_stream_view stream,
-    rmm::mr::device_memory_resource* mr)
+    rmm::device_async_resource_ref mr)
   {
     using result_type = cudf::detail::target_type_t<T, aggregation::Kind::M2>;
     auto result       = make_numeric_column(data_type(type_to_id<result_type>()),
@@ -133,7 +134,7 @@ std::unique_ptr<column> group_m2(column_view const& values,
                                  column_view const& group_means,
                                  cudf::device_span<size_type const> group_labels,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   auto values_type = cudf::is_dictionary(values.type())
                        ? dictionary_column_view(values).keys().type()
diff --git a/cpp/src/groupby/sort/group_max.cu b/cpp/src/groupby/sort/group_max.cu
index 148188f5fdf..60b071c25ff 100644
--- a/cpp/src/groupby/sort/group_max.cu
+++ b/cpp/src/groupby/sort/group_max.cu
@@ -17,6 +17,7 @@
 #include "groupby/sort/group_single_pass_reduction_util.cuh"
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace groupby {
@@ -25,7 +26,7 @@ std::unique_ptr<column> group_max(column_view const& values,
                                   size_type num_groups,
                                   cudf::device_span<size_type const> group_labels,
                                   rmm::cuda_stream_view stream,
-                                  rmm::mr::device_memory_resource* mr)
+                                  rmm::device_async_resource_ref mr)
 {
   auto values_type = cudf::is_dictionary(values.type())
                        ? dictionary_column_view(values).keys().type()
diff --git a/cpp/src/groupby/sort/group_max_scan.cu b/cpp/src/groupby/sort/group_max_scan.cu
index 8679ab09df6..270059cfcad 100644
--- a/cpp/src/groupby/sort/group_max_scan.cu
+++ b/cpp/src/groupby/sort/group_max_scan.cu
@@ -17,6 +17,7 @@
 #include "groupby/sort/group_scan_util.cuh"
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace groupby {
@@ -25,7 +26,7 @@ std::unique_ptr<column> max_scan(column_view const& values,
                                  size_type num_groups,
                                  cudf::device_span<size_type const> group_labels,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   return type_dispatcher(values.type(),
                          group_scan_dispatcher<aggregation::MAX>{},
diff --git a/cpp/src/groupby/sort/group_merge_lists.cu b/cpp/src/groupby/sort/group_merge_lists.cu
index 2c72128dbfb..92cce1aa00e 100644
--- a/cpp/src/groupby/sort/group_merge_lists.cu
+++ b/cpp/src/groupby/sort/group_merge_lists.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/gather.h>
 
@@ -30,7 +31,7 @@ std::unique_ptr<column> group_merge_lists(column_view const& values,
                                           cudf::device_span<size_type const> group_offsets,
                                           size_type num_groups,
                                           rmm::cuda_stream_view stream,
-                                          rmm::mr::device_memory_resource* mr)
+                                          rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(values.type().id() == type_id::LIST,
                "Input to `group_merge_lists` must be a lists column.");
diff --git a/cpp/src/groupby/sort/group_merge_m2.cu b/cpp/src/groupby/sort/group_merge_m2.cu
index a580c9dac9d..4ad8fa5ff07 100644
--- a/cpp/src/groupby/sort/group_merge_m2.cu
+++ b/cpp/src/groupby/sort/group_merge_m2.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -25,6 +25,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/functional.h>
 #include <thrust/iterator/counting_iterator.h>
@@ -131,7 +132,7 @@ std::unique_ptr<column> group_merge_m2(column_view const& values,
                                        cudf::device_span<size_type const> group_offsets,
                                        size_type num_groups,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr)
+                                       rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(values.type().id() == type_id::STRUCT,
                "Input to `group_merge_m2` must be a structs column.");
diff --git a/cpp/src/groupby/sort/group_min.cu b/cpp/src/groupby/sort/group_min.cu
index 3939fc41b65..22aaf664168 100644
--- a/cpp/src/groupby/sort/group_min.cu
+++ b/cpp/src/groupby/sort/group_min.cu
@@ -17,6 +17,7 @@
 #include "groupby/sort/group_single_pass_reduction_util.cuh"
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace groupby {
@@ -25,7 +26,7 @@ std::unique_ptr<column> group_min(column_view const& values,
                                   size_type num_groups,
                                   cudf::device_span<size_type const> group_labels,
                                   rmm::cuda_stream_view stream,
-                                  rmm::mr::device_memory_resource* mr)
+                                  rmm::device_async_resource_ref mr)
 {
   auto values_type = cudf::is_dictionary(values.type())
                        ? dictionary_column_view(values).keys().type()
diff --git a/cpp/src/groupby/sort/group_min_scan.cu b/cpp/src/groupby/sort/group_min_scan.cu
index 7d2a88fb038..4ddc10a2e5a 100644
--- a/cpp/src/groupby/sort/group_min_scan.cu
+++ b/cpp/src/groupby/sort/group_min_scan.cu
@@ -17,6 +17,7 @@
 #include "groupby/sort/group_scan_util.cuh"
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace groupby {
@@ -25,7 +26,7 @@ std::unique_ptr<column> min_scan(column_view const& values,
                                  size_type num_groups,
                                  cudf::device_span<size_type const> group_labels,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   return type_dispatcher(values.type(),
                          group_scan_dispatcher<aggregation::MIN>{},
diff --git a/cpp/src/groupby/sort/group_nth_element.cu b/cpp/src/groupby/sort/group_nth_element.cu
index 694c052e42d..1bc1eef908c 100644
--- a/cpp/src/groupby/sort/group_nth_element.cu
+++ b/cpp/src/groupby/sort/group_nth_element.cu
@@ -26,6 +26,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/iterator/constant_iterator.h>
@@ -49,7 +50,7 @@ std::unique_ptr<column> group_nth_element(column_view const& values,
                                           size_type n,
                                           null_policy null_handling,
                                           rmm::cuda_stream_view stream,
-                                          rmm::mr::device_memory_resource* mr)
+                                          rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(static_cast<size_t>(values.size()) == group_labels.size(),
                "Size of values column should be same as that of group labels");
diff --git a/cpp/src/groupby/sort/group_nunique.cu b/cpp/src/groupby/sort/group_nunique.cu
index 1a5f1691d5b..de11e70719a 100644
--- a/cpp/src/groupby/sort/group_nunique.cu
+++ b/cpp/src/groupby/sort/group_nunique.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/discard_iterator.h>
@@ -78,7 +79,7 @@ std::unique_ptr<column> group_nunique(column_view const& values,
                                       cudf::device_span<size_type const> group_offsets,
                                       null_policy null_handling,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr)
+                                      rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(num_groups >= 0, "number of groups cannot be negative");
   CUDF_EXPECTS(static_cast<size_t>(values.size()) == group_labels.size(),
diff --git a/cpp/src/groupby/sort/group_product.cu b/cpp/src/groupby/sort/group_product.cu
index c53362f2095..83ca1059325 100644
--- a/cpp/src/groupby/sort/group_product.cu
+++ b/cpp/src/groupby/sort/group_product.cu
@@ -20,6 +20,7 @@
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace groupby {
@@ -28,7 +29,7 @@ std::unique_ptr<column> group_product(column_view const& values,
                                       size_type num_groups,
                                       cudf::device_span<size_type const> group_labels,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr)
+                                      rmm::device_async_resource_ref mr)
 {
   auto values_type = cudf::is_dictionary(values.type())
                        ? dictionary_column_view(values).keys().type()
diff --git a/cpp/src/groupby/sort/group_product_scan.cu b/cpp/src/groupby/sort/group_product_scan.cu
index e1a615730dd..40c53ceeff1 100644
--- a/cpp/src/groupby/sort/group_product_scan.cu
+++ b/cpp/src/groupby/sort/group_product_scan.cu
@@ -17,6 +17,7 @@
 #include "groupby/sort/group_scan_util.cuh"
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace groupby {
@@ -25,7 +26,7 @@ std::unique_ptr<column> product_scan(column_view const& values,
                                      size_type num_groups,
                                      cudf::device_span<size_type const> group_labels,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
 {
   return type_dispatcher(values.type(),
                          group_scan_dispatcher<aggregation::PRODUCT>{},
diff --git a/cpp/src/groupby/sort/group_quantiles.cu b/cpp/src/groupby/sort/group_quantiles.cu
index a6bc2d5b38d..3156dfaadd0 100644
--- a/cpp/src/groupby/sort/group_quantiles.cu
+++ b/cpp/src/groupby/sort/group_quantiles.cu
@@ -29,6 +29,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/execution_policy.h>
 #include <thrust/for_each.h>
@@ -90,7 +91,7 @@ struct quantiles_functor {
     device_span<double const> quantile,
     interpolation interpolation,
     rmm::cuda_stream_view stream,
-    rmm::mr::device_memory_resource* mr)
+    rmm::device_async_resource_ref mr)
   {
     using ResultType = cudf::detail::target_type_t<T, aggregation::QUANTILE>;
 
@@ -161,7 +162,7 @@ std::unique_ptr<column> group_quantiles(column_view const& values,
                                         std::vector<double> const& quantiles,
                                         interpolation interp,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr)
+                                        rmm::device_async_resource_ref mr)
 {
   auto dv_quantiles = cudf::detail::make_device_uvector_async(
     quantiles, stream, rmm::mr::get_current_device_resource());
diff --git a/cpp/src/groupby/sort/group_rank_scan.cu b/cpp/src/groupby/sort/group_rank_scan.cu
index 5cf7844410e..0b65889f127 100644
--- a/cpp/src/groupby/sort/group_rank_scan.cu
+++ b/cpp/src/groupby/sort/group_rank_scan.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -27,6 +27,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/functional.h>
 #include <thrust/iterator/reverse_iterator.h>
@@ -100,7 +101,7 @@ std::unique_ptr<column> rank_generator(column_view const& grouped_values,
                                        scan_operator scan_op,
                                        bool has_nulls,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr)
+                                       rmm::device_async_resource_ref mr)
 {
   auto const grouped_values_view = table_view{{grouped_values}};
   auto const comparator =
@@ -155,7 +156,7 @@ std::unique_ptr<column> min_rank_scan(column_view const& grouped_values,
                                       device_span<size_type const> group_labels,
                                       device_span<size_type const> group_offsets,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr)
+                                      rmm::device_async_resource_ref mr)
 {
   return rank_generator<true>(
     grouped_values,
@@ -176,7 +177,7 @@ std::unique_ptr<column> max_rank_scan(column_view const& grouped_values,
                                       device_span<size_type const> group_labels,
                                       device_span<size_type const> group_offsets,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr)
+                                      rmm::device_async_resource_ref mr)
 {
   return rank_generator<false>(
     grouped_values,
@@ -197,7 +198,7 @@ std::unique_ptr<column> first_rank_scan(column_view const& grouped_values,
                                         device_span<size_type const> group_labels,
                                         device_span<size_type const> group_offsets,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr)
+                                        rmm::device_async_resource_ref mr)
 {
   auto ranks = make_fixed_width_column(
     data_type{type_to_id<size_type>()}, group_labels.size(), mask_state::UNALLOCATED, stream, mr);
@@ -218,7 +219,7 @@ std::unique_ptr<column> average_rank_scan(column_view const& grouped_values,
                                           device_span<size_type const> group_labels,
                                           device_span<size_type const> group_offsets,
                                           rmm::cuda_stream_view stream,
-                                          rmm::mr::device_memory_resource* mr)
+                                          rmm::device_async_resource_ref mr)
 {
   auto max_rank = max_rank_scan(grouped_values,
                                 value_order,
@@ -251,7 +252,7 @@ std::unique_ptr<column> dense_rank_scan(column_view const& grouped_values,
                                         device_span<size_type const> group_labels,
                                         device_span<size_type const> group_offsets,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr)
+                                        rmm::device_async_resource_ref mr)
 {
   return rank_generator<true>(
     grouped_values,
@@ -272,7 +273,7 @@ std::unique_ptr<column> group_rank_to_percentage(rank_method const method,
                                                  device_span<size_type const> group_labels,
                                                  device_span<size_type const> group_offsets,
                                                  rmm::cuda_stream_view stream,
-                                                 rmm::mr::device_memory_resource* mr)
+                                                 rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(percentage != rank_percentage::NONE, "Percentage cannot be NONE");
   auto ranks = make_fixed_width_column(
diff --git a/cpp/src/groupby/sort/group_reductions.hpp b/cpp/src/groupby/sort/group_reductions.hpp
index 3aa79f226a3..5e76dc3135a 100644
--- a/cpp/src/groupby/sort/group_reductions.hpp
+++ b/cpp/src/groupby/sort/group_reductions.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,7 @@
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <memory>
 
@@ -52,7 +53,7 @@ std::unique_ptr<column> group_sum(column_view const& values,
                                   size_type num_groups,
                                   cudf::device_span<size_type const> group_labels,
                                   rmm::cuda_stream_view stream,
-                                  rmm::mr::device_memory_resource* mr);
+                                  rmm::device_async_resource_ref mr);
 
 /**
  * @brief Internal API to calculate groupwise product
@@ -75,7 +76,7 @@ std::unique_ptr<column> group_product(column_view const& values,
                                       size_type num_groups,
                                       cudf::device_span<size_type const> group_labels,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr);
+                                      rmm::device_async_resource_ref mr);
 
 /**
  * @brief Internal API to calculate groupwise minimum value
@@ -98,7 +99,7 @@ std::unique_ptr<column> group_min(column_view const& values,
                                   size_type num_groups,
                                   cudf::device_span<size_type const> group_labels,
                                   rmm::cuda_stream_view stream,
-                                  rmm::mr::device_memory_resource* mr);
+                                  rmm::device_async_resource_ref mr);
 
 /**
  * @brief Internal API to calculate groupwise maximum value
@@ -121,7 +122,7 @@ std::unique_ptr<column> group_max(column_view const& values,
                                   size_type num_groups,
                                   cudf::device_span<size_type const> group_labels,
                                   rmm::cuda_stream_view stream,
-                                  rmm::mr::device_memory_resource* mr);
+                                  rmm::device_async_resource_ref mr);
 
 /**
  * @brief Internal API to calculate group-wise indices of maximum values.
@@ -146,7 +147,7 @@ std::unique_ptr<column> group_argmax(column_view const& values,
                                      cudf::device_span<size_type const> group_labels,
                                      column_view const& key_sort_order,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr);
+                                     rmm::device_async_resource_ref mr);
 
 /**
  * @brief Internal API to calculate group-wise indices of minimum values.
@@ -171,7 +172,7 @@ std::unique_ptr<column> group_argmin(column_view const& values,
                                      cudf::device_span<size_type const> group_labels,
                                      column_view const& key_sort_order,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr);
+                                     rmm::device_async_resource_ref mr);
 
 /**
  * @brief Internal API to calculate number of non-null values in each group of
@@ -195,7 +196,7 @@ std::unique_ptr<column> group_count_valid(column_view const& values,
                                           cudf::device_span<size_type const> group_labels,
                                           size_type num_groups,
                                           rmm::cuda_stream_view stream,
-                                          rmm::mr::device_memory_resource* mr);
+                                          rmm::device_async_resource_ref mr);
 
 /**
  * @brief Internal API to calculate number of values in each group of @p values
@@ -215,7 +216,7 @@ std::unique_ptr<column> group_count_valid(column_view const& values,
 std::unique_ptr<column> group_count_all(cudf::device_span<size_type const> group_offsets,
                                         size_type num_groups,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr);
+                                        rmm::device_async_resource_ref mr);
 /**
  * @brief Internal API to compute histogram for each group in @p values.
  *
@@ -242,7 +243,7 @@ std::unique_ptr<column> group_histogram(column_view const& values,
                                         cudf::device_span<size_type const> group_labels,
                                         size_type num_groups,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr);
+                                        rmm::device_async_resource_ref mr);
 
 /**
  * @brief Internal API to calculate sum of squares of differences from means.
@@ -266,7 +267,7 @@ std::unique_ptr<column> group_m2(column_view const& values,
                                  column_view const& group_means,
                                  cudf::device_span<size_type const> group_labels,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr);
+                                 rmm::device_async_resource_ref mr);
 
 /**
  * @brief Internal API to calculate groupwise variance
@@ -296,7 +297,7 @@ std::unique_ptr<column> group_var(column_view const& values,
                                   cudf::device_span<size_type const> group_labels,
                                   size_type ddof,
                                   rmm::cuda_stream_view stream,
-                                  rmm::mr::device_memory_resource* mr);
+                                  rmm::device_async_resource_ref mr);
 
 /**
  * @brief Internal API to calculate groupwise quantiles
@@ -326,7 +327,7 @@ std::unique_ptr<column> group_quantiles(column_view const& values,
                                         std::vector<double> const& quantiles,
                                         interpolation interp,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr);
+                                        rmm::device_async_resource_ref mr);
 
 /**
  * @brief Internal API to calculate number of unique values in each group of
@@ -358,7 +359,7 @@ std::unique_ptr<column> group_nunique(column_view const& values,
                                       cudf::device_span<size_type const> group_offsets,
                                       null_policy null_handling,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr);
+                                      rmm::device_async_resource_ref mr);
 
 /**
  * @brief Internal API to calculate nth values in each group of  @p values
@@ -393,7 +394,7 @@ std::unique_ptr<column> group_nth_element(column_view const& values,
                                           size_type n,
                                           null_policy null_handling,
                                           rmm::cuda_stream_view stream,
-                                          rmm::mr::device_memory_resource* mr);
+                                          rmm::device_async_resource_ref mr);
 /**
  * @brief Internal API to collect grouped values into a lists column
  *
@@ -418,7 +419,7 @@ std::unique_ptr<column> group_collect(column_view const& values,
                                       size_type num_groups,
                                       null_policy null_handling,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr);
+                                      rmm::device_async_resource_ref mr);
 
 /**
  * @brief Internal API to merge grouped lists into one list.
@@ -441,7 +442,7 @@ std::unique_ptr<column> group_merge_lists(column_view const& values,
                                           cudf::device_span<size_type const> group_offsets,
                                           size_type num_groups,
                                           rmm::cuda_stream_view stream,
-                                          rmm::mr::device_memory_resource* mr);
+                                          rmm::device_async_resource_ref mr);
 
 /**
  * @brief Internal API to merge grouped M2 values corresponding to the same key.
@@ -467,7 +468,7 @@ std::unique_ptr<column> group_merge_m2(column_view const& values,
                                        cudf::device_span<size_type const> group_offsets,
                                        size_type num_groups,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr);
+                                       rmm::device_async_resource_ref mr);
 
 /**
  * @brief Internal API to merge multiple output of HISTOGRAM aggregation.
@@ -494,7 +495,7 @@ std::unique_ptr<column> group_merge_histogram(column_view const& values,
                                               cudf::device_span<size_type const> group_offsets,
                                               size_type num_groups,
                                               rmm::cuda_stream_view stream,
-                                              rmm::mr::device_memory_resource* mr);
+                                              rmm::device_async_resource_ref mr);
 
 /**
  * @brief Internal API to find covariance of child columns of a non-nullable struct column.
@@ -521,7 +522,7 @@ std::unique_ptr<column> group_covariance(column_view const& values_0,
                                          size_type min_periods,
                                          size_type ddof,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr);
+                                         rmm::device_async_resource_ref mr);
 
 /**
  * @brief Internal API to find correlation from covariance and standard deviation.
@@ -536,7 +537,7 @@ std::unique_ptr<column> group_correlation(column_view const& covariance,
                                           column_view const& stddev_0,
                                           column_view const& stddev_1,
                                           rmm::cuda_stream_view stream,
-                                          rmm::mr::device_memory_resource* mr);
+                                          rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace groupby
diff --git a/cpp/src/groupby/sort/group_replace_nulls.cu b/cpp/src/groupby/sort/group_replace_nulls.cu
index 49557164230..566507da230 100644
--- a/cpp/src/groupby/sort/group_replace_nulls.cu
+++ b/cpp/src/groupby/sort/group_replace_nulls.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,7 @@
 #include <cudf/replace.hpp>
 
 #include <rmm/device_uvector.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/functional.h>
 #include <thrust/iterator/counting_iterator.h>
@@ -40,7 +41,7 @@ std::unique_ptr<column> group_replace_nulls(cudf::column_view const& grouped_val
                                             device_span<size_type const> group_labels,
                                             cudf::replace_policy replace_policy,
                                             rmm::cuda_stream_view stream,
-                                            rmm::mr::device_memory_resource* mr)
+                                            rmm::device_async_resource_ref mr)
 {
   cudf::size_type size = grouped_value.size();
 
diff --git a/cpp/src/groupby/sort/group_scan.hpp b/cpp/src/groupby/sort/group_scan.hpp
index fd53046f7e2..6f2daae5f9d 100644
--- a/cpp/src/groupby/sort/group_scan.hpp
+++ b/cpp/src/groupby/sort/group_scan.hpp
@@ -21,6 +21,7 @@
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <memory>
 
@@ -40,7 +41,7 @@ std::unique_ptr<column> sum_scan(column_view const& values,
                                  size_type num_groups,
                                  device_span<size_type const> group_labels,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr);
+                                 rmm::device_async_resource_ref mr);
 
 /**
  * @brief Internal API to calculate groupwise cumulative product
@@ -57,7 +58,7 @@ std::unique_ptr<column> product_scan(column_view const& values,
                                      size_type num_groups,
                                      device_span<size_type const> group_labels,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr);
+                                     rmm::device_async_resource_ref mr);
 
 /**
  * @brief Internal API to calculate groupwise cumulative minimum value
@@ -72,7 +73,7 @@ std::unique_ptr<column> min_scan(column_view const& values,
                                  size_type num_groups,
                                  device_span<size_type const> group_labels,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr);
+                                 rmm::device_async_resource_ref mr);
 
 /**
  * @brief Internal API to calculate groupwise cumulative maximum value
@@ -87,7 +88,7 @@ std::unique_ptr<column> max_scan(column_view const& values,
                                  size_type num_groups,
                                  device_span<size_type const> group_labels,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr);
+                                 rmm::device_async_resource_ref mr);
 
 /**
  * @brief Internal API to calculate cumulative number of values in each group
@@ -99,7 +100,7 @@ std::unique_ptr<column> max_scan(column_view const& values,
  */
 std::unique_ptr<column> count_scan(device_span<size_type const> group_labels,
                                    rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr);
+                                   rmm::device_async_resource_ref mr);
 
 /**
  * @brief Internal API to calculate groupwise min rank value
@@ -118,7 +119,7 @@ std::unique_ptr<column> min_rank_scan(column_view const& grouped_values,
                                       device_span<size_type const> group_labels,
                                       device_span<size_type const> group_offsets,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr);
+                                      rmm::device_async_resource_ref mr);
 
 /**
  * @brief Internal API to calculate groupwise max rank value
@@ -128,14 +129,14 @@ std::unique_ptr<column> min_rank_scan(column_view const& grouped_values,
  *                                      device_span<size_type const> group_labels,
  *                                      device_span<size_type const> group_offsets,
  *                                      rmm::cuda_stream_view stream,
- *                                      rmm::mr::device_memory_resource* mr)
+ *                                      rmm::device_async_resource_ref mr)
  */
 std::unique_ptr<column> max_rank_scan(column_view const& grouped_values,
                                       column_view const& value_order,
                                       device_span<size_type const> group_labels,
                                       device_span<size_type const> group_offsets,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr);
+                                      rmm::device_async_resource_ref mr);
 
 /**
  * @brief Internal API to calculate groupwise first rank value
@@ -145,14 +146,14 @@ std::unique_ptr<column> max_rank_scan(column_view const& grouped_values,
  *                                      device_span<size_type const> group_labels,
  *                                      device_span<size_type const> group_offsets,
  *                                      rmm::cuda_stream_view stream,
- *                                      rmm::mr::device_memory_resource* mr)
+ *                                      rmm::device_async_resource_ref mr)
  */
 std::unique_ptr<column> first_rank_scan(column_view const& grouped_values,
                                         column_view const& value_order,
                                         device_span<size_type const> group_labels,
                                         device_span<size_type const> group_offsets,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr);
+                                        rmm::device_async_resource_ref mr);
 
 /**
  * @brief Internal API to calculate groupwise average rank value
@@ -162,14 +163,14 @@ std::unique_ptr<column> first_rank_scan(column_view const& grouped_values,
  *                                      device_span<size_type const> group_labels,
  *                                      device_span<size_type const> group_offsets,
  *                                      rmm::cuda_stream_view stream,
- *                                      rmm::mr::device_memory_resource* mr)
+ *                                      rmm::device_async_resource_ref mr)
  */
 std::unique_ptr<column> average_rank_scan(column_view const& grouped_values,
                                           column_view const& value_order,
                                           device_span<size_type const> group_labels,
                                           device_span<size_type const> group_offsets,
                                           rmm::cuda_stream_view stream,
-                                          rmm::mr::device_memory_resource* mr);
+                                          rmm::device_async_resource_ref mr);
 
 /**
  * @brief Internal API to calculate groupwise dense rank value
@@ -186,7 +187,7 @@ std::unique_ptr<column> dense_rank_scan(column_view const& grouped_values,
                                         device_span<size_type const> group_labels,
                                         device_span<size_type const> group_offsets,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr);
+                                        rmm::device_async_resource_ref mr);
 
 /**
  * @brief Convert groupwise rank to groupwise percentage rank
@@ -209,7 +210,7 @@ std::unique_ptr<column> group_rank_to_percentage(rank_method const method,
                                                  device_span<size_type const> group_labels,
                                                  device_span<size_type const> group_offsets,
                                                  rmm::cuda_stream_view stream,
-                                                 rmm::mr::device_memory_resource* mr);
+                                                 rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace groupby
diff --git a/cpp/src/groupby/sort/group_scan_util.cuh b/cpp/src/groupby/sort/group_scan_util.cuh
index 2ebc8ba7d5d..b360ba2c45d 100644
--- a/cpp/src/groupby/sort/group_scan_util.cuh
+++ b/cpp/src/groupby/sort/group_scan_util.cuh
@@ -34,6 +34,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/functional.h>
 #include <thrust/iterator/counting_iterator.h>
@@ -60,7 +61,7 @@ struct group_scan_dispatcher {
                                      size_type num_groups,
                                      cudf::device_span<cudf::size_type const> group_labels,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     return group_scan_functor<K, T>::invoke(values, num_groups, group_labels, stream, mr);
   }
@@ -89,7 +90,7 @@ struct group_scan_functor<K, T, std::enable_if_t<is_group_scan_supported<K, T>()
                                         size_type num_groups,
                                         cudf::device_span<cudf::size_type const> group_labels,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr)
+                                        rmm::device_async_resource_ref mr)
   {
     using DeviceType       = device_storage_type_t<T>;
     using OpType           = cudf::detail::corresponding_operator_t<K>;
@@ -145,7 +146,7 @@ struct group_scan_functor<K,
                                         size_type num_groups,
                                         cudf::device_span<cudf::size_type const> group_labels,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr)
+                                        rmm::device_async_resource_ref mr)
   {
     using OpType = cudf::detail::corresponding_operator_t<K>;
 
@@ -191,7 +192,7 @@ struct group_scan_functor<K,
                                         size_type num_groups,
                                         cudf::device_span<cudf::size_type const> group_labels,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr)
+                                        rmm::device_async_resource_ref mr)
   {
     if (values.is_empty()) { return cudf::empty_like(values); }
 
diff --git a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
index 42d4b654346..5e892710d3b 100644
--- a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
+++ b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
@@ -30,6 +30,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/functional.h>
 #include <thrust/iterator/counting_iterator.h>
@@ -116,7 +117,7 @@ struct group_reduction_dispatcher {
                                      size_type num_groups,
                                      cudf::device_span<cudf::size_type const> group_labels,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     return group_reduction_functor<K, T>::invoke(values, num_groups, group_labels, stream, mr);
   }
@@ -149,7 +150,7 @@ struct group_reduction_functor<
                                         size_type num_groups,
                                         cudf::device_span<cudf::size_type const> group_labels,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr)
+                                        rmm::device_async_resource_ref mr)
 
   {
     using SourceDType = device_storage_type_t<T>;
@@ -218,7 +219,7 @@ struct group_reduction_functor<
                                         size_type num_groups,
                                         cudf::device_span<cudf::size_type const> group_labels,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr)
+                                        rmm::device_async_resource_ref mr)
   {
     // This is be expected to be size_type.
     using ResultType = cudf::detail::target_type_t<T, K>;
diff --git a/cpp/src/groupby/sort/group_std.cu b/cpp/src/groupby/sort/group_std.cu
index 30b6f67dffe..70f64186f21 100644
--- a/cpp/src/groupby/sort/group_std.cu
+++ b/cpp/src/groupby/sort/group_std.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -29,6 +29,7 @@
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/for_each.h>
 #include <thrust/iterator/counting_iterator.h>
@@ -104,7 +105,7 @@ struct var_functor {
     cudf::device_span<size_type const> group_labels,
     size_type ddof,
     rmm::cuda_stream_view stream,
-    rmm::mr::device_memory_resource* mr)
+    rmm::device_async_resource_ref mr)
   {
     using ResultType = cudf::detail::target_type_t<T, aggregation::Kind::VARIANCE>;
 
@@ -175,7 +176,7 @@ std::unique_ptr<column> group_var(column_view const& values,
                                   cudf::device_span<size_type const> group_labels,
                                   size_type ddof,
                                   rmm::cuda_stream_view stream,
-                                  rmm::mr::device_memory_resource* mr)
+                                  rmm::device_async_resource_ref mr)
 {
   auto values_type = cudf::is_dictionary(values.type())
                        ? dictionary_column_view(values).keys().type()
diff --git a/cpp/src/groupby/sort/group_sum.cu b/cpp/src/groupby/sort/group_sum.cu
index 0af7cb22159..316b6f395bb 100644
--- a/cpp/src/groupby/sort/group_sum.cu
+++ b/cpp/src/groupby/sort/group_sum.cu
@@ -20,6 +20,7 @@
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace groupby {
@@ -28,7 +29,7 @@ std::unique_ptr<column> group_sum(column_view const& values,
                                   size_type num_groups,
                                   cudf::device_span<size_type const> group_labels,
                                   rmm::cuda_stream_view stream,
-                                  rmm::mr::device_memory_resource* mr)
+                                  rmm::device_async_resource_ref mr)
 {
   auto values_type = cudf::is_dictionary(values.type())
                        ? dictionary_column_view(values).keys().type()
diff --git a/cpp/src/groupby/sort/group_sum_scan.cu b/cpp/src/groupby/sort/group_sum_scan.cu
index 2efa1185899..01c4d0c2c4a 100644
--- a/cpp/src/groupby/sort/group_sum_scan.cu
+++ b/cpp/src/groupby/sort/group_sum_scan.cu
@@ -17,6 +17,7 @@
 #include "groupby/sort/group_scan_util.cuh"
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace groupby {
@@ -25,7 +26,7 @@ std::unique_ptr<column> sum_scan(column_view const& values,
                                  size_type num_groups,
                                  cudf::device_span<size_type const> group_labels,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   return type_dispatcher(values.type(),
                          group_scan_dispatcher<aggregation::SUM>{},
diff --git a/cpp/src/groupby/sort/scan.cpp b/cpp/src/groupby/sort/scan.cpp
index 45c232aa3aa..f211c61b3b7 100644
--- a/cpp/src/groupby/sort/scan.cpp
+++ b/cpp/src/groupby/sort/scan.cpp
@@ -35,6 +35,7 @@
 #include <cudf/utilities/error.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <memory>
 
@@ -207,7 +208,7 @@ void scan_result_functor::operator()<aggregation::RANK>(aggregation const& agg)
 std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby::sort_scan(
   host_span<scan_request const> requests,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   // We're going to start by creating a cache of results so that aggs that
   // depend on other aggs will not have to be recalculated. e.g. mean depends on
diff --git a/cpp/src/groupby/sort/sort_helper.cu b/cpp/src/groupby/sort/sort_helper.cu
index 1e6c7a9393f..4da1da089cd 100644
--- a/cpp/src/groupby/sort/sort_helper.cu
+++ b/cpp/src/groupby/sort/sort_helper.cu
@@ -35,6 +35,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/distance.h>
@@ -248,7 +249,7 @@ column_view sort_groupby_helper::keys_bitmask_column(rmm::cuda_stream_view strea
 }
 
 sort_groupby_helper::column_ptr sort_groupby_helper::sorted_values(
-  column_view const& values, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr)
+  column_view const& values, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)
 {
   column_ptr values_sort_order =
     cudf::detail::stable_sorted_order(table_view({unsorted_keys_labels(stream), values}),
@@ -272,7 +273,7 @@ sort_groupby_helper::column_ptr sort_groupby_helper::sorted_values(
 }
 
 sort_groupby_helper::column_ptr sort_groupby_helper::grouped_values(
-  column_view const& values, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr)
+  column_view const& values, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)
 {
   auto gather_map = key_sort_order(stream);
 
@@ -287,7 +288,7 @@ sort_groupby_helper::column_ptr sort_groupby_helper::grouped_values(
 }
 
 std::unique_ptr<table> sort_groupby_helper::unique_keys(rmm::cuda_stream_view stream,
-                                                        rmm::mr::device_memory_resource* mr)
+                                                        rmm::device_async_resource_ref mr)
 {
   auto idx_data = key_sort_order(stream).data<size_type>();
 
@@ -305,7 +306,7 @@ std::unique_ptr<table> sort_groupby_helper::unique_keys(rmm::cuda_stream_view st
 }
 
 std::unique_ptr<table> sort_groupby_helper::sorted_keys(rmm::cuda_stream_view stream,
-                                                        rmm::mr::device_memory_resource* mr)
+                                                        rmm::device_async_resource_ref mr)
 {
   return cudf::detail::gather(_keys,
                               key_sort_order(stream),
diff --git a/cpp/src/hash/md5_hash.cu b/cpp/src/hash/md5_hash.cu
index b34455905d9..8f490ada8ff 100644
--- a/cpp/src/hash/md5_hash.cu
+++ b/cpp/src/hash/md5_hash.cu
@@ -29,6 +29,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/for_each.h>
 #include <thrust/iterator/constant_iterator.h>
@@ -284,7 +285,7 @@ inline bool md5_leaf_type_check(data_type dt)
 
 std::unique_ptr<column> md5(table_view const& input,
                             rmm::cuda_stream_view stream,
-                            rmm::mr::device_memory_resource* mr)
+                            rmm::device_async_resource_ref mr)
 {
   if (input.num_columns() == 0 || input.num_rows() == 0) {
     // Return the MD5 hash of a zero-length input.
@@ -349,7 +350,7 @@ std::unique_ptr<column> md5(table_view const& input,
 
 std::unique_ptr<column> md5(table_view const& input,
                             rmm::cuda_stream_view stream,
-                            rmm::mr::device_memory_resource* mr)
+                            rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::md5(input, stream, mr);
diff --git a/cpp/src/hash/murmurhash3_x64_128.cu b/cpp/src/hash/murmurhash3_x64_128.cu
index 1fc469686e1..6c91532a193 100644
--- a/cpp/src/hash/murmurhash3_x64_128.cu
+++ b/cpp/src/hash/murmurhash3_x64_128.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/for_each.h>
 
@@ -109,7 +110,7 @@ class murmur_device_row_hasher {
 std::unique_ptr<table> murmurhash3_x64_128(table_view const& input,
                                            uint64_t seed,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
 {
   auto output1 = make_numeric_column(
     data_type(type_id::UINT64), input.num_rows(), mask_state::UNALLOCATED, stream, mr);
@@ -140,7 +141,7 @@ std::unique_ptr<table> murmurhash3_x64_128(table_view const& input,
 std::unique_ptr<table> murmurhash3_x64_128(table_view const& input,
                                            uint64_t seed,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::murmurhash3_x64_128(input, seed, stream, mr);
diff --git a/cpp/src/hash/murmurhash3_x86_32.cu b/cpp/src/hash/murmurhash3_x86_32.cu
index a6ab301a86e..eac72f5d995 100644
--- a/cpp/src/hash/murmurhash3_x86_32.cu
+++ b/cpp/src/hash/murmurhash3_x86_32.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,6 +23,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/tabulate.h>
 
@@ -33,7 +34,7 @@ namespace detail {
 std::unique_ptr<column> murmurhash3_x86_32(table_view const& input,
                                            uint32_t seed,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
 {
   auto output = make_numeric_column(data_type(type_to_id<hash_value_type>()),
                                     input.num_rows(),
@@ -62,7 +63,7 @@ std::unique_ptr<column> murmurhash3_x86_32(table_view const& input,
 std::unique_ptr<column> murmurhash3_x86_32(table_view const& input,
                                            uint32_t seed,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::murmurhash3_x86_32(input, seed, stream, mr);
diff --git a/cpp/src/hash/sha1_hash.cu b/cpp/src/hash/sha1_hash.cu
index 71253d279b9..f7609eb26af 100644
--- a/cpp/src/hash/sha1_hash.cu
+++ b/cpp/src/hash/sha1_hash.cu
@@ -23,6 +23,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <memory>
 
@@ -62,7 +63,7 @@ struct SHA1Hash : HashBase<SHA1Hash> {
 
 std::unique_ptr<column> sha1(table_view const& input,
                              rmm::cuda_stream_view stream,
-                             rmm::mr::device_memory_resource* mr)
+                             rmm::device_async_resource_ref mr)
 {
   return sha_hash<SHA1Hash>(input, stream, mr);
 }
@@ -71,7 +72,7 @@ std::unique_ptr<column> sha1(table_view const& input,
 
 std::unique_ptr<column> sha1(table_view const& input,
                              rmm::cuda_stream_view stream,
-                             rmm::mr::device_memory_resource* mr)
+                             rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::sha1(input, stream, mr);
diff --git a/cpp/src/hash/sha224_hash.cu b/cpp/src/hash/sha224_hash.cu
index 61480a78776..cf04504a489 100644
--- a/cpp/src/hash/sha224_hash.cu
+++ b/cpp/src/hash/sha224_hash.cu
@@ -23,6 +23,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <memory>
 
@@ -63,7 +64,7 @@ struct SHA224Hash : HashBase<SHA224Hash> {
 
 std::unique_ptr<column> sha224(table_view const& input,
                                rmm::cuda_stream_view stream,
-                               rmm::mr::device_memory_resource* mr)
+                               rmm::device_async_resource_ref mr)
 {
   return sha_hash<SHA224Hash>(input, stream, mr);
 }
@@ -72,7 +73,7 @@ std::unique_ptr<column> sha224(table_view const& input,
 
 std::unique_ptr<column> sha224(table_view const& input,
                                rmm::cuda_stream_view stream,
-                               rmm::mr::device_memory_resource* mr)
+                               rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::sha224(input, stream, mr);
diff --git a/cpp/src/hash/sha256_hash.cu b/cpp/src/hash/sha256_hash.cu
index b15cfe09d52..664913c0f4c 100644
--- a/cpp/src/hash/sha256_hash.cu
+++ b/cpp/src/hash/sha256_hash.cu
@@ -23,6 +23,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <memory>
 
@@ -63,7 +64,7 @@ struct SHA256Hash : HashBase<SHA256Hash> {
 
 std::unique_ptr<column> sha256(table_view const& input,
                                rmm::cuda_stream_view stream,
-                               rmm::mr::device_memory_resource* mr)
+                               rmm::device_async_resource_ref mr)
 {
   return sha_hash<SHA256Hash>(input, stream, mr);
 }
@@ -72,7 +73,7 @@ std::unique_ptr<column> sha256(table_view const& input,
 
 std::unique_ptr<column> sha256(table_view const& input,
                                rmm::cuda_stream_view stream,
-                               rmm::mr::device_memory_resource* mr)
+                               rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::sha256(input, stream, mr);
diff --git a/cpp/src/hash/sha384_hash.cu b/cpp/src/hash/sha384_hash.cu
index 3075d2c62f8..92192f501ec 100644
--- a/cpp/src/hash/sha384_hash.cu
+++ b/cpp/src/hash/sha384_hash.cu
@@ -23,6 +23,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <memory>
 
@@ -70,7 +71,7 @@ struct SHA384Hash : HashBase<SHA384Hash> {
 
 std::unique_ptr<column> sha384(table_view const& input,
                                rmm::cuda_stream_view stream,
-                               rmm::mr::device_memory_resource* mr)
+                               rmm::device_async_resource_ref mr)
 {
   return sha_hash<SHA384Hash>(input, stream, mr);
 }
@@ -79,7 +80,7 @@ std::unique_ptr<column> sha384(table_view const& input,
 
 std::unique_ptr<column> sha384(table_view const& input,
                                rmm::cuda_stream_view stream,
-                               rmm::mr::device_memory_resource* mr)
+                               rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::sha384(input, stream, mr);
diff --git a/cpp/src/hash/sha512_hash.cu b/cpp/src/hash/sha512_hash.cu
index d073cf1edca..244206aeeb9 100644
--- a/cpp/src/hash/sha512_hash.cu
+++ b/cpp/src/hash/sha512_hash.cu
@@ -23,6 +23,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <memory>
 
@@ -70,7 +71,7 @@ struct SHA512Hash : HashBase<SHA512Hash> {
 
 std::unique_ptr<column> sha512(table_view const& input,
                                rmm::cuda_stream_view stream,
-                               rmm::mr::device_memory_resource* mr)
+                               rmm::device_async_resource_ref mr)
 {
   return sha_hash<SHA512Hash>(input, stream, mr);
 }
@@ -79,7 +80,7 @@ std::unique_ptr<column> sha512(table_view const& input,
 
 std::unique_ptr<column> sha512(table_view const& input,
                                rmm::cuda_stream_view stream,
-                               rmm::mr::device_memory_resource* mr)
+                               rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::sha512(input, stream, mr);
diff --git a/cpp/src/hash/sha_hash.cuh b/cpp/src/hash/sha_hash.cuh
index 0a22ee34918..005578cb2c2 100644
--- a/cpp/src/hash/sha_hash.cuh
+++ b/cpp/src/hash/sha_hash.cuh
@@ -28,6 +28,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/execution_policy.h>
 #include <thrust/fill.h>
@@ -503,7 +504,7 @@ bool inline sha_leaf_type_check(data_type dt)
 template <typename Hasher>
 std::unique_ptr<column> sha_hash(table_view const& input,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   if (input.num_rows() == 0) { return cudf::make_empty_column(cudf::type_id::STRING); }
 
diff --git a/cpp/src/hash/xxhash_64.cu b/cpp/src/hash/xxhash_64.cu
index e17bc134420..4366c12b453 100644
--- a/cpp/src/hash/xxhash_64.cu
+++ b/cpp/src/hash/xxhash_64.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,6 +23,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/tabulate.h>
 
@@ -298,7 +299,7 @@ class device_row_hasher {
 std::unique_ptr<column> xxhash_64(table_view const& input,
                                   uint64_t seed,
                                   rmm::cuda_stream_view stream,
-                                  rmm::mr::device_memory_resource* mr)
+                                  rmm::device_async_resource_ref mr)
 {
   auto output = make_numeric_column(data_type(type_to_id<hash_value_type>()),
                                     input.num_rows(),
@@ -327,7 +328,7 @@ std::unique_ptr<column> xxhash_64(table_view const& input,
 std::unique_ptr<column> xxhash_64(table_view const& input,
                                   uint64_t seed,
                                   rmm::cuda_stream_view stream,
-                                  rmm::mr::device_memory_resource* mr)
+                                  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::xxhash_64(input, seed, stream, mr);
diff --git a/cpp/src/interop/dlpack.cpp b/cpp/src/interop/dlpack.cpp
index 9f36280930d..3109a36cbcf 100644
--- a/cpp/src/interop/dlpack.cpp
+++ b/cpp/src/interop/dlpack.cpp
@@ -24,6 +24,7 @@
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <dlpack/dlpack.h>
 
@@ -133,7 +134,7 @@ struct dltensor_context {
 namespace detail {
 std::unique_ptr<table> from_dlpack(DLManagedTensor const* managed_tensor,
                                    rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr)
+                                   rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(nullptr != managed_tensor, "managed_tensor is null");
   auto const& tensor = managed_tensor->dl_tensor;
@@ -219,7 +220,7 @@ std::unique_ptr<table> from_dlpack(DLManagedTensor const* managed_tensor,
 
 DLManagedTensor* to_dlpack(table_view const& input,
                            rmm::cuda_stream_view stream,
-                           rmm::mr::device_memory_resource* mr)
+                           rmm::device_async_resource_ref mr)
 {
   auto const num_rows = input.num_rows();
   auto const num_cols = input.num_columns();
@@ -298,13 +299,13 @@ DLManagedTensor* to_dlpack(table_view const& input,
 }  // namespace detail
 
 std::unique_ptr<table> from_dlpack(DLManagedTensor const* managed_tensor,
-                                   rmm::mr::device_memory_resource* mr)
+                                   rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::from_dlpack(managed_tensor, cudf::get_default_stream(), mr);
 }
 
-DLManagedTensor* to_dlpack(table_view const& input, rmm::mr::device_memory_resource* mr)
+DLManagedTensor* to_dlpack(table_view const& input, rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::to_dlpack(input, cudf::get_default_stream(), mr);
diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu
index 2a524c773c0..f100ca0cc2b 100644
--- a/cpp/src/interop/from_arrow.cu
+++ b/cpp/src/interop/from_arrow.cu
@@ -34,6 +34,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/gather.h>
 
@@ -100,7 +101,7 @@ struct dispatch_to_cudf_column {
    */
   std::unique_ptr<rmm::device_buffer> get_mask_buffer(arrow::Array const& array,
                                                       rmm::cuda_stream_view stream,
-                                                      rmm::mr::device_memory_resource* mr)
+                                                      rmm::device_async_resource_ref mr)
   {
     if (array.null_bitmap_data() == nullptr) {
       return std::make_unique<rmm::device_buffer>(0, stream, mr);
@@ -126,7 +127,7 @@ struct dispatch_to_cudf_column {
 
   template <typename T, CUDF_ENABLE_IF(not is_rep_layout_compatible<T>())>
   std::unique_ptr<column> operator()(
-    arrow::Array const&, data_type, bool, rmm::cuda_stream_view, rmm::mr::device_memory_resource*)
+    arrow::Array const&, data_type, bool, rmm::cuda_stream_view, rmm::device_async_resource_ref)
   {
     CUDF_FAIL("Unsupported type in from_arrow.");
   }
@@ -136,7 +137,7 @@ struct dispatch_to_cudf_column {
                                      data_type type,
                                      bool skip_mask,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     auto data_buffer         = array.data()->buffers[1];
     size_type const num_rows = array.length();
@@ -186,7 +187,7 @@ std::unique_ptr<column> get_column(arrow::Array const& array,
                                    data_type type,
                                    bool skip_mask,
                                    rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr);
+                                   rmm::device_async_resource_ref mr);
 
 template <>
 std::unique_ptr<column> dispatch_to_cudf_column::operator()<numeric::decimal128>(
@@ -194,7 +195,7 @@ std::unique_ptr<column> dispatch_to_cudf_column::operator()<numeric::decimal128>
   data_type type,
   bool skip_mask,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   using DeviceType = __int128_t;
 
@@ -230,12 +231,11 @@ std::unique_ptr<column> dispatch_to_cudf_column::operator()<numeric::decimal128>
 }
 
 template <>
-std::unique_ptr<column> dispatch_to_cudf_column::operator()<bool>(
-  arrow::Array const& array,
-  data_type,
-  bool skip_mask,
-  rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+std::unique_ptr<column> dispatch_to_cudf_column::operator()<bool>(arrow::Array const& array,
+                                                                  data_type,
+                                                                  bool skip_mask,
+                                                                  rmm::cuda_stream_view stream,
+                                                                  rmm::device_async_resource_ref mr)
 {
   auto data_buffer = array.data()->buffers[1];
   // mask-to-bools expects the mask to be bitmask_type aligned/padded
@@ -273,7 +273,7 @@ std::unique_ptr<column> dispatch_to_cudf_column::operator()<cudf::string_view>(
   data_type,
   bool,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   if (array.length() == 0) { return make_empty_column(type_id::STRING); }
   auto str_array    = static_cast<arrow::StringArray const*>(&array);
@@ -311,7 +311,7 @@ std::unique_ptr<column> dispatch_to_cudf_column::operator()<cudf::dictionary32>(
   data_type,
   bool,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   auto dict_array  = static_cast<arrow::DictionaryArray const*>(&array);
   auto dict_type   = arrow_to_cudf_type(*(dict_array->dictionary()->type()));
@@ -344,7 +344,7 @@ std::unique_ptr<column> dispatch_to_cudf_column::operator()<cudf::struct_view>(
   data_type,
   bool,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   auto struct_array = static_cast<arrow::StructArray const*>(&array);
   std::vector<std::unique_ptr<column>> child_columns;
@@ -377,7 +377,7 @@ std::unique_ptr<column> dispatch_to_cudf_column::operator()<cudf::list_view>(
   data_type,
   bool,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   auto list_array   = static_cast<arrow::ListArray const*>(&array);
   auto offset_array = std::make_unique<arrow::Int32Array>(
@@ -412,7 +412,7 @@ std::unique_ptr<column> get_column(arrow::Array const& array,
                                    data_type type,
                                    bool skip_mask,
                                    rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr)
+                                   rmm::device_async_resource_ref mr)
 {
   return type.id() != type_id::EMPTY
            ? type_dispatcher(type, dispatch_to_cudf_column{}, array, type, skip_mask, stream, mr)
@@ -423,7 +423,7 @@ std::unique_ptr<column> get_column(arrow::Array const& array,
 
 std::unique_ptr<table> from_arrow(arrow::Table const& input_table,
                                   rmm::cuda_stream_view stream,
-                                  rmm::mr::device_memory_resource* mr)
+                                  rmm::device_async_resource_ref mr)
 {
   if (input_table.num_columns() == 0) { return std::make_unique<table>(); }
   std::vector<std::unique_ptr<column>> columns;
@@ -464,7 +464,7 @@ std::unique_ptr<table> from_arrow(arrow::Table const& input_table,
 
 std::unique_ptr<cudf::scalar> from_arrow(arrow::Scalar const& input,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr)
+                                         rmm::device_async_resource_ref mr)
 {
   auto maybe_array = arrow::MakeArrayFromScalar(input, 1);
   if (!maybe_array.ok()) { CUDF_FAIL("Failed to create array"); }
@@ -484,7 +484,7 @@ std::unique_ptr<cudf::scalar> from_arrow(arrow::Scalar const& input,
 
 std::unique_ptr<table> from_arrow(arrow::Table const& input_table,
                                   rmm::cuda_stream_view stream,
-                                  rmm::mr::device_memory_resource* mr)
+                                  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
 
@@ -493,7 +493,7 @@ std::unique_ptr<table> from_arrow(arrow::Table const& input_table,
 
 std::unique_ptr<cudf::scalar> from_arrow(arrow::Scalar const& input,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr)
+                                         rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
 
diff --git a/cpp/src/interop/to_arrow_device.cu b/cpp/src/interop/to_arrow_device.cu
index e824412e71c..1754d1493bd 100644
--- a/cpp/src/interop/to_arrow_device.cu
+++ b/cpp/src/interop/to_arrow_device.cu
@@ -34,6 +34,7 @@
 #include <rmm/device_scalar.hpp>
 #include <rmm/exec_policy.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/for_each.h>
 #include <thrust/iterator/counting_iterator.h>
@@ -288,10 +289,7 @@ int initialize_array(ArrowArray* arr, ArrowType storage_type, cudf::column const
 
 struct dispatch_to_arrow_device {
   template <typename T, CUDF_ENABLE_IF(not is_rep_layout_compatible<T>())>
-  int operator()(cudf::column&&,
-                 rmm::cuda_stream_view,
-                 rmm::mr::device_memory_resource*,
-                 ArrowArray*)
+  int operator()(cudf::column&&, rmm::cuda_stream_view, rmm::device_async_resource_ref, ArrowArray*)
   {
     CUDF_FAIL("Unsupported type for to_arrow_device");
   }
@@ -299,7 +297,7 @@ struct dispatch_to_arrow_device {
   template <typename T, CUDF_ENABLE_IF(is_rep_layout_compatible<T>())>
   int operator()(cudf::column&& column,
                  rmm::cuda_stream_view stream,
-                 rmm::mr::device_memory_resource* mr,
+                 rmm::device_async_resource_ref mr,
                  ArrowArray* out)
   {
     nanoarrow::UniqueArray tmp;
@@ -337,7 +335,7 @@ template <typename DeviceType>
 int decimals_to_arrow(cudf::column&& input,
                       int32_t precision,
                       rmm::cuda_stream_view stream,
-                      rmm::mr::device_memory_resource* mr,
+                      rmm::device_async_resource_ref mr,
                       ArrowArray* out)
 {
   nanoarrow::UniqueArray tmp;
@@ -387,7 +385,7 @@ int decimals_to_arrow(cudf::column&& input,
 template <>
 int dispatch_to_arrow_device::operator()<numeric::decimal32>(cudf::column&& column,
                                                              rmm::cuda_stream_view stream,
-                                                             rmm::mr::device_memory_resource* mr,
+                                                             rmm::device_async_resource_ref mr,
                                                              ArrowArray* out)
 {
   using DeviceType = int32_t;
@@ -398,7 +396,7 @@ int dispatch_to_arrow_device::operator()<numeric::decimal32>(cudf::column&& colu
 template <>
 int dispatch_to_arrow_device::operator()<numeric::decimal64>(cudf::column&& column,
                                                              rmm::cuda_stream_view stream,
-                                                             rmm::mr::device_memory_resource* mr,
+                                                             rmm::device_async_resource_ref mr,
                                                              ArrowArray* out)
 {
   using DeviceType = int64_t;
@@ -409,7 +407,7 @@ int dispatch_to_arrow_device::operator()<numeric::decimal64>(cudf::column&& colu
 template <>
 int dispatch_to_arrow_device::operator()<numeric::decimal128>(cudf::column&& column,
                                                               rmm::cuda_stream_view stream,
-                                                              rmm::mr::device_memory_resource* mr,
+                                                              rmm::device_async_resource_ref mr,
                                                               ArrowArray* out)
 {
   using DeviceType = __int128_t;
@@ -420,7 +418,7 @@ int dispatch_to_arrow_device::operator()<numeric::decimal128>(cudf::column&& col
 template <>
 int dispatch_to_arrow_device::operator()<bool>(cudf::column&& column,
                                                rmm::cuda_stream_view stream,
-                                               rmm::mr::device_memory_resource* mr,
+                                               rmm::device_async_resource_ref mr,
                                                ArrowArray* out)
 {
   nanoarrow::UniqueArray tmp;
@@ -442,7 +440,7 @@ int dispatch_to_arrow_device::operator()<bool>(cudf::column&& column,
 template <>
 int dispatch_to_arrow_device::operator()<cudf::string_view>(cudf::column&& column,
                                                             rmm::cuda_stream_view stream,
-                                                            rmm::mr::device_memory_resource* mr,
+                                                            rmm::device_async_resource_ref mr,
                                                             ArrowArray* out)
 {
   nanoarrow::UniqueArray tmp;
@@ -478,19 +476,19 @@ int dispatch_to_arrow_device::operator()<cudf::string_view>(cudf::column&& colum
 template <>
 int dispatch_to_arrow_device::operator()<cudf::list_view>(cudf::column&& column,
                                                           rmm::cuda_stream_view stream,
-                                                          rmm::mr::device_memory_resource* mr,
+                                                          rmm::device_async_resource_ref mr,
                                                           ArrowArray* out);
 
 template <>
 int dispatch_to_arrow_device::operator()<cudf::dictionary32>(cudf::column&& column,
                                                              rmm::cuda_stream_view stream,
-                                                             rmm::mr::device_memory_resource* mr,
+                                                             rmm::device_async_resource_ref mr,
                                                              ArrowArray* out);
 
 template <>
 int dispatch_to_arrow_device::operator()<cudf::struct_view>(cudf::column&& column,
                                                             rmm::cuda_stream_view stream,
-                                                            rmm::mr::device_memory_resource* mr,
+                                                            rmm::device_async_resource_ref mr,
                                                             ArrowArray* out)
 {
   nanoarrow::UniqueArray tmp;
@@ -523,7 +521,7 @@ int dispatch_to_arrow_device::operator()<cudf::struct_view>(cudf::column&& colum
 template <>
 int dispatch_to_arrow_device::operator()<cudf::list_view>(cudf::column&& column,
                                                           rmm::cuda_stream_view stream,
-                                                          rmm::mr::device_memory_resource* mr,
+                                                          rmm::device_async_resource_ref mr,
                                                           ArrowArray* out)
 {
   nanoarrow::UniqueArray tmp;
@@ -557,7 +555,7 @@ int dispatch_to_arrow_device::operator()<cudf::list_view>(cudf::column&& column,
 template <>
 int dispatch_to_arrow_device::operator()<cudf::dictionary32>(cudf::column&& column,
                                                              rmm::cuda_stream_view stream,
-                                                             rmm::mr::device_memory_resource* mr,
+                                                             rmm::device_async_resource_ref mr,
                                                              ArrowArray* out)
 {
   nanoarrow::UniqueArray tmp;
@@ -639,7 +637,7 @@ unique_schema_t to_arrow_schema(cudf::table_view const& input,
 
 unique_device_array_t to_arrow_device(cudf::table&& table,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr)
+                                      rmm::device_async_resource_ref mr)
 {
   nanoarrow::UniqueArray tmp;
   NANOARROW_THROW_NOT_OK(ArrowArrayInitFromType(tmp.get(), NANOARROW_TYPE_STRUCT));
@@ -689,7 +687,7 @@ unique_device_array_t to_arrow_device(cudf::table&& table,
 
 unique_device_array_t to_arrow_device(cudf::column&& col,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr)
+                                      rmm::device_async_resource_ref mr)
 {
   nanoarrow::UniqueArray tmp;
   if (col.type().id() == cudf::type_id::EMPTY) {
diff --git a/cpp/src/io/avro/reader_impl.cu b/cpp/src/io/avro/reader_impl.cu
index 03fd663040a..814efe2b5a1 100644
--- a/cpp/src/io/avro/reader_impl.cu
+++ b/cpp/src/io/avro/reader_impl.cu
@@ -33,6 +33,7 @@
 #include <rmm/device_buffer.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/equal.h>
 #include <thrust/functional.h>
@@ -373,7 +374,7 @@ std::vector<column_buffer> decode_data(metadata& meta,
                                        std::vector<std::pair<int, std::string>> const& selection,
                                        std::vector<data_type> const& column_types,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr)
+                                       rmm::device_async_resource_ref mr)
 {
   auto out_buffers = std::vector<column_buffer>();
 
@@ -483,7 +484,7 @@ std::vector<column_buffer> decode_data(metadata& meta,
 table_with_metadata read_avro(std::unique_ptr<cudf::io::datasource>&& source,
                               avro_reader_options const& options,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr)
+                              rmm::device_async_resource_ref mr)
 {
   auto skip_rows = options.get_skip_rows();
   auto num_rows  = options.get_num_rows();
diff --git a/cpp/src/io/csv/durations.cu b/cpp/src/io/csv/durations.cu
index 76b1b46dc61..918951d5902 100644
--- a/cpp/src/io/csv/durations.cu
+++ b/cpp/src/io/csv/durations.cu
@@ -24,6 +24,7 @@
 #include <cudf/types.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/for_each.h>
 #include <thrust/iterator/counting_iterator.h>
@@ -174,7 +175,7 @@ struct dispatch_from_durations_fn {
   template <typename T, std::enable_if_t<cudf::is_duration<T>()>* = nullptr>
   std::unique_ptr<column> operator()(column_view const& durations,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr) const
+                                     rmm::device_async_resource_ref mr) const
   {
     size_type strings_count = durations.size();
     auto column             = column_device_view::create(durations, stream);
@@ -211,7 +212,7 @@ struct dispatch_from_durations_fn {
   template <typename T, std::enable_if_t<not cudf::is_duration<T>()>* = nullptr>
   std::unique_ptr<column> operator()(column_view const&,
                                      rmm::cuda_stream_view,
-                                     rmm::mr::device_memory_resource*) const
+                                     rmm::device_async_resource_ref) const
   {
     CUDF_FAIL("Values for from_durations function must be a duration type.");
   }
@@ -221,7 +222,7 @@ struct dispatch_from_durations_fn {
 
 std::unique_ptr<column> pandas_format_durations(column_view const& durations,
                                                 rmm::cuda_stream_view stream,
-                                                rmm::mr::device_memory_resource* mr)
+                                                rmm::device_async_resource_ref mr)
 {
   size_type strings_count = durations.size();
   if (strings_count == 0) return make_empty_column(type_id::STRING);
diff --git a/cpp/src/io/csv/durations.hpp b/cpp/src/io/csv/durations.hpp
index ac925011c58..f671f435eeb 100644
--- a/cpp/src/io/csv/durations.hpp
+++ b/cpp/src/io/csv/durations.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <memory>
 
@@ -30,7 +31,7 @@ namespace csv {
 
 std::unique_ptr<column> pandas_format_durations(column_view const& durations,
                                                 rmm::cuda_stream_view stream,
-                                                rmm::mr::device_memory_resource* mr);
+                                                rmm::device_async_resource_ref mr);
 
 }  // namespace csv
 }  // namespace detail
diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu
index 02daf4655db..67c1194578a 100644
--- a/cpp/src/io/csv/reader_impl.cu
+++ b/cpp/src/io/csv/reader_impl.cu
@@ -39,6 +39,7 @@
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/host_vector.h>
 #include <thrust/iterator/counting_iterator.h>
@@ -574,7 +575,7 @@ std::vector<column_buffer> decode_data(parse_options const& parse_opts,
                                        int32_t num_actual_columns,
                                        int32_t num_active_columns,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr)
+                                       rmm::device_async_resource_ref mr)
 {
   // Alloc output; columns' data memory is still expected for empty dataframe
   std::vector<column_buffer> out_buffers;
@@ -667,7 +668,7 @@ table_with_metadata read_csv(cudf::io::datasource* source,
                              csv_reader_options const& reader_opts,
                              parse_options const& parse_opts,
                              rmm::cuda_stream_view stream,
-                             rmm::mr::device_memory_resource* mr)
+                             rmm::device_async_resource_ref mr)
 {
   std::vector<char> header;
 
@@ -995,7 +996,7 @@ parse_options make_parse_options(csv_reader_options const& reader_opts,
 table_with_metadata read_csv(std::unique_ptr<cudf::io::datasource>&& source,
                              csv_reader_options const& options,
                              rmm::cuda_stream_view stream,
-                             rmm::mr::device_memory_resource* mr)
+                             rmm::device_async_resource_ref mr)
 {
   auto parse_options = make_parse_options(options, stream);
 
diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu
index c143d258448..335ce77e3e3 100644
--- a/cpp/src/io/csv/writer_impl.cu
+++ b/cpp/src/io/csv/writer_impl.cu
@@ -41,6 +41,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/execution_policy.h>
 #include <thrust/host_vector.h>
@@ -140,7 +141,7 @@ struct column_to_strings_fn {
 
   explicit column_to_strings_fn(csv_writer_options const& options,
                                 rmm::cuda_stream_view stream,
-                                rmm::mr::device_memory_resource* mr)
+                                rmm::device_async_resource_ref mr)
     : options_(options), stream_(stream), mr_(mr)
   {
   }
@@ -277,7 +278,7 @@ struct column_to_strings_fn {
  private:
   csv_writer_options const& options_;
   rmm::cuda_stream_view stream_;
-  rmm::mr::device_memory_resource* mr_;
+  rmm::device_async_resource_ref mr_;
 };
 }  // unnamed namespace
 
@@ -288,7 +289,7 @@ void write_chunked_begin(data_sink* out_sink,
                          host_span<std::string const> user_column_names,
                          csv_writer_options const& options,
                          rmm::cuda_stream_view stream,
-                         rmm::mr::device_memory_resource* mr)
+                         rmm::device_async_resource_ref mr)
 {
   if (options.is_enabled_include_header()) {
     // need to generate column names if names are not provided
@@ -354,7 +355,7 @@ void write_chunked(data_sink* out_sink,
                    strings_column_view const& str_column_view,
                    csv_writer_options const& options,
                    rmm::cuda_stream_view stream,
-                   rmm::mr::device_memory_resource* mr)
+                   rmm::device_async_resource_ref mr)
 {
   // algorithm outline:
   //
@@ -410,7 +411,7 @@ void write_csv(data_sink* out_sink,
                host_span<std::string const> user_column_names,
                csv_writer_options const& options,
                rmm::cuda_stream_view stream,
-               rmm::mr::device_memory_resource* mr)
+               rmm::device_async_resource_ref mr)
 {
   // write header: column names separated by delimiter:
   // (even for tables with no rows)
diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp
index 46c6c67c8df..f0a37839810 100644
--- a/cpp/src/io/functions.cpp
+++ b/cpp/src/io/functions.cpp
@@ -36,6 +36,8 @@
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 #include <algorithm>
 
 namespace cudf::io {
@@ -156,8 +158,7 @@ std::vector<std::unique_ptr<data_sink>> make_datasinks(sink_info const& info)
 
 }  // namespace
 
-table_with_metadata read_avro(avro_reader_options const& options,
-                              rmm::mr::device_memory_resource* mr)
+table_with_metadata read_avro(avro_reader_options const& options, rmm::device_async_resource_ref mr)
 {
   namespace avro = cudf::io::detail::avro;
 
@@ -201,7 +202,7 @@ compression_type infer_compression_type(compression_type compression, source_inf
 
 table_with_metadata read_json(json_reader_options options,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr)
+                              rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
 
@@ -216,7 +217,7 @@ table_with_metadata read_json(json_reader_options options,
 
 void write_json(json_writer_options const& options,
                 rmm::cuda_stream_view stream,
-                rmm::mr::device_memory_resource* mr)
+                rmm::device_async_resource_ref mr)
 {
   auto sinks = make_datasinks(options.get_sink());
   CUDF_EXPECTS(sinks.size() == 1, "Multiple sinks not supported for JSON writing");
@@ -231,7 +232,7 @@ void write_json(json_writer_options const& options,
 
 table_with_metadata read_csv(csv_reader_options options,
                              rmm::cuda_stream_view stream,
-                             rmm::mr::device_memory_resource* mr)
+                             rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
 
@@ -253,7 +254,7 @@ table_with_metadata read_csv(csv_reader_options options,
 // Freeform API wraps the detail writer class API
 void write_csv(csv_writer_options const& options,
                rmm::cuda_stream_view stream,
-               rmm::mr::device_memory_resource* mr)
+               rmm::device_async_resource_ref mr)
 {
   using namespace cudf::io::detail;
 
@@ -413,7 +414,7 @@ orc_metadata read_orc_metadata(source_info const& src_info, rmm::cuda_stream_vie
  */
 table_with_metadata read_orc(orc_reader_options const& options,
                              rmm::cuda_stream_view stream,
-                             rmm::mr::device_memory_resource* mr)
+                             rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
 
@@ -481,7 +482,7 @@ namespace detail_parquet = cudf::io::parquet::detail;
 
 table_with_metadata read_parquet(parquet_reader_options const& options,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
 
@@ -569,7 +570,7 @@ std::unique_ptr<std::vector<uint8_t>> write_parquet(parquet_writer_options const
 chunked_parquet_reader::chunked_parquet_reader(std::size_t chunk_read_limit,
                                                parquet_reader_options const& options,
                                                rmm::cuda_stream_view stream,
-                                               rmm::mr::device_memory_resource* mr)
+                                               rmm::device_async_resource_ref mr)
   : reader{std::make_unique<detail_parquet::chunked_reader>(
       chunk_read_limit, 0, make_datasources(options.get_source()), options, stream, mr)}
 {
@@ -582,7 +583,7 @@ chunked_parquet_reader::chunked_parquet_reader(std::size_t chunk_read_limit,
                                                std::size_t pass_read_limit,
                                                parquet_reader_options const& options,
                                                rmm::cuda_stream_view stream,
-                                               rmm::mr::device_memory_resource* mr)
+                                               rmm::device_async_resource_ref mr)
   : reader{std::make_unique<detail_parquet::chunked_reader>(chunk_read_limit,
                                                             pass_read_limit,
                                                             make_datasources(options.get_source()),
diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu
index 9d40c657396..7117af8948b 100644
--- a/cpp/src/io/json/json_column.cu
+++ b/cpp/src/io/json/json_column.cu
@@ -31,6 +31,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/atomic>
 #include <cuda/functional>
@@ -481,7 +482,7 @@ void make_device_json_column(device_span<SymbolT const> input,
                              bool is_array_of_arrays,
                              cudf::io::json_reader_options const& options,
                              rmm::cuda_stream_view stream,
-                             rmm::mr::device_memory_resource* mr)
+                             rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
 
@@ -893,7 +894,7 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
   cudf::io::parse_options const& options,
   std::optional<schema_element> schema,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   auto validity_size_check = [](device_json_column& json_col) {
@@ -1050,7 +1051,7 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
 table_with_metadata device_parse_nested_json(device_span<SymbolT const> d_input,
                                              cudf::io::json_reader_options const& options,
                                              rmm::cuda_stream_view stream,
-                                             rmm::mr::device_memory_resource* mr)
+                                             rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
 
diff --git a/cpp/src/io/json/json_normalization.cu b/cpp/src/io/json/json_normalization.cu
index b3a029224d7..eb06ea0177e 100644
--- a/cpp/src/io/json/json_normalization.cu
+++ b/cpp/src/io/json/json_normalization.cu
@@ -23,6 +23,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/discard_iterator.h>
 
@@ -299,7 +300,7 @@ namespace detail {
 
 rmm::device_uvector<SymbolT> normalize_single_quotes(rmm::device_uvector<SymbolT>&& inbuf,
                                                      rmm::cuda_stream_view stream,
-                                                     rmm::mr::device_memory_resource* mr)
+                                                     rmm::device_async_resource_ref mr)
 {
   auto parser = fst::detail::make_fst(
     fst::detail::make_symbol_group_lut(normalize_quotes::qna_sgs),
@@ -323,7 +324,7 @@ rmm::device_uvector<SymbolT> normalize_single_quotes(rmm::device_uvector<SymbolT
 
 rmm::device_uvector<SymbolT> normalize_whitespace(rmm::device_uvector<SymbolT>&& inbuf,
                                                   rmm::cuda_stream_view stream,
-                                                  rmm::mr::device_memory_resource* mr)
+                                                  rmm::device_async_resource_ref mr)
 {
   auto parser = fst::detail::make_fst(
     fst::detail::make_symbol_group_lut(normalize_whitespace::wna_sgs),
diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu
index 1b7976dab89..ad807b57766 100644
--- a/cpp/src/io/json/json_tree.cu
+++ b/cpp/src/io/json/json_tree.cu
@@ -31,6 +31,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cub/device/device_radix_sort.cuh>
 #include <cuco/static_set.cuh>
@@ -218,7 +219,7 @@ tree_meta_t get_tree_representation(device_span<PdaTokenT const> tokens,
                                     device_span<SymbolOffsetT const> token_indices,
                                     bool is_strict_nested_boundaries,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr)
+                                    rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   // Whether a token does represent a node in the tree representation
@@ -634,7 +635,7 @@ std::pair<rmm::device_uvector<size_type>, rmm::device_uvector<size_type>> hash_n
   bool is_array_of_arrays,
   bool is_enabled_lines,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   auto const num_nodes = parent_node_ids.size();
@@ -779,7 +780,7 @@ std::pair<rmm::device_uvector<NodeIndexT>, rmm::device_uvector<NodeIndexT>> gene
   bool is_array_of_arrays,
   bool is_enabled_lines,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   auto const num_nodes = d_tree.node_categories.size();
@@ -848,7 +849,7 @@ rmm::device_uvector<size_type> compute_row_offsets(rmm::device_uvector<NodeIndex
                                                    bool is_array_of_arrays,
                                                    bool is_enabled_lines,
                                                    rmm::cuda_stream_view stream,
-                                                   rmm::mr::device_memory_resource* mr)
+                                                   rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   auto const num_nodes = d_tree.node_categories.size();
@@ -947,7 +948,7 @@ records_orient_tree_traversal(device_span<SymbolT const> d_input,
                               bool is_array_of_arrays,
                               bool is_enabled_lines,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr)
+                              rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   auto [new_col_id, new_parent_col_id] =
diff --git a/cpp/src/io/json/legacy/read_json.hpp b/cpp/src/io/json/legacy/read_json.hpp
index 32d05c432b4..2c02fdd402f 100644
--- a/cpp/src/io/json/legacy/read_json.hpp
+++ b/cpp/src/io/json/legacy/read_json.hpp
@@ -17,6 +17,7 @@
 #include <cudf/types.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/mr/memory_resource.h>
 
@@ -32,6 +33,6 @@ namespace cudf::io::json::detail::legacy {
 table_with_metadata read_json(host_span<std::unique_ptr<datasource>> sources,
                               json_reader_options const& reader_opts,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr);
+                              rmm::device_async_resource_ref mr);
 
 }  // namespace cudf::io::json::detail::legacy
diff --git a/cpp/src/io/json/legacy/reader_impl.cu b/cpp/src/io/json/legacy/reader_impl.cu
index f9d0f6895b9..846b3cfab4e 100644
--- a/cpp/src/io/json/legacy/reader_impl.cu
+++ b/cpp/src/io/json/legacy/reader_impl.cu
@@ -39,6 +39,7 @@
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/for_each.h>
 #include <thrust/functional.h>
@@ -486,7 +487,7 @@ table_with_metadata convert_data_to_table(parse_options_view const& parse_opts,
                                           device_span<uint64_t const> rec_starts,
                                           device_span<char const> data,
                                           rmm::cuda_stream_view stream,
-                                          rmm::mr::device_memory_resource* mr)
+                                          rmm::device_async_resource_ref mr)
 {
   auto const num_columns = dtypes.size();
   auto const num_records = rec_starts.size();
@@ -598,7 +599,7 @@ table_with_metadata convert_data_to_table(parse_options_view const& parse_opts,
 table_with_metadata read_json(host_span<std::unique_ptr<datasource>> sources,
                               json_reader_options const& reader_opts,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr)
+                              rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(not sources.empty(), "No sources were defined");
   CUDF_EXPECTS(sources.size() == 1 or reader_opts.get_compression() == compression_type::NONE,
diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp
index 5d54e340e2b..a302785cee8 100644
--- a/cpp/src/io/json/nested_json.hpp
+++ b/cpp/src/io/json/nested_json.hpp
@@ -22,6 +22,8 @@
 #include <cudf/utilities/bit.hpp>
 #include <cudf/utilities/error.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 #include <map>
 #include <vector>
 
@@ -172,7 +174,7 @@ struct device_json_column {
    * @param stream The CUDA stream to which kernels are dispatched
    * @param mr Optional, resource with which to allocate
    */
-  device_json_column(rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr)
+  device_json_column(rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)
     : string_offsets(0, stream),
       string_lengths(0, stream),
       child_offsets(0, stream, mr),
@@ -232,7 +234,7 @@ tree_meta_t get_tree_representation(device_span<PdaTokenT const> tokens,
                                     device_span<SymbolOffsetT const> token_indices,
                                     bool is_strict_nested_boundaries,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr);
+                                    rmm::device_async_resource_ref mr);
 
 /**
  * @brief Traverse the tree representation of the JSON input in records orient format and populate
@@ -253,7 +255,7 @@ records_orient_tree_traversal(device_span<SymbolT const> d_input,
                               bool is_array_of_arrays,
                               bool is_enabled_lines,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr);
+                              rmm::device_async_resource_ref mr);
 
 /**
  * @brief Searches for and selects nodes at level `row_array_children_level`. For each selected
@@ -307,7 +309,7 @@ cudf::io::parse_options parsing_options(cudf::io::json_reader_options const& opt
 table_with_metadata device_parse_nested_json(device_span<SymbolT const> input,
                                              cudf::io::json_reader_options const& options,
                                              rmm::cuda_stream_view stream,
-                                             rmm::mr::device_memory_resource* mr);
+                                             rmm::device_async_resource_ref mr);
 
 /**
  * @brief Get the path data type of a column by path if present in input schema
@@ -347,7 +349,7 @@ struct path_from_tree {
 table_with_metadata host_parse_nested_json(device_span<SymbolT const> input,
                                            cudf::io::json_reader_options const& options,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr);
+                                           rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 
diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu
index 4ddbe735963..8da1bb3ddfc 100644
--- a/cpp/src/io/json/nested_json_gpu.cu
+++ b/cpp/src/io/json/nested_json_gpu.cu
@@ -36,6 +36,7 @@
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/device_vector.h>
 #include <thrust/iterator/discard_iterator.h>
@@ -1531,7 +1532,7 @@ std::pair<rmm::device_uvector<PdaTokenT>, rmm::device_uvector<SymbolOffsetT>> ge
   device_span<SymbolT const> json_in,
   cudf::io::json_reader_options const& options,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   check_input_size(json_in.size());
 
@@ -1664,7 +1665,7 @@ void make_json_column(json_column& root_column,
                       cudf::io::json_reader_options const& options,
                       bool include_quote_char,
                       rmm::cuda_stream_view stream,
-                      rmm::mr::device_memory_resource* mr)
+                      rmm::device_async_resource_ref mr)
 {
   // Range of encapsulating function that parses to internal columnar data representation
   CUDF_FUNC_RANGE();
@@ -2064,7 +2065,7 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> json_column_to
   cudf::io::json_reader_options const& options,
   std::optional<schema_element> schema,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   // Range of orchestrating/encapsulating function
   CUDF_FUNC_RANGE();
@@ -2222,7 +2223,7 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> json_column_to
 table_with_metadata host_parse_nested_json(device_span<SymbolT const> d_input,
                                            cudf::io::json_reader_options const& options,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
 {
   // Range of orchestrating/encapsulating function
   CUDF_FUNC_RANGE();
diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu
index b03e0dd452b..3ea8639641c 100644
--- a/cpp/src/io/json/read_json.cu
+++ b/cpp/src/io/json/read_json.cu
@@ -25,6 +25,7 @@
 #include <cudf/utilities/error.hpp>
 
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/constant_iterator.h>
 #include <thrust/scatter.h>
@@ -205,7 +206,7 @@ auto get_record_range_raw_input(host_span<std::unique_ptr<datasource>> sources,
 table_with_metadata read_json(host_span<std::unique_ptr<datasource>> sources,
                               json_reader_options const& reader_opts,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr)
+                              rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
 
diff --git a/cpp/src/io/json/read_json.hpp b/cpp/src/io/json/read_json.hpp
index d05134fa837..0c30b4cad46 100644
--- a/cpp/src/io/json/read_json.hpp
+++ b/cpp/src/io/json/read_json.hpp
@@ -23,6 +23,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <memory>
 
@@ -31,7 +32,7 @@ namespace cudf::io::json::detail {
 table_with_metadata read_json(host_span<std::unique_ptr<datasource>> sources,
                               json_reader_options const& reader_opts,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr);
+                              rmm::device_async_resource_ref mr);
 
 size_type find_first_delimiter(device_span<char const> d_data,
                                char const delimiter,
diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu
index 8c3aceeefd4..596b3381eaf 100644
--- a/cpp/src/io/json/write_json.cu
+++ b/cpp/src/io/json/write_json.cu
@@ -47,6 +47,7 @@
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/for_each.h>
@@ -167,7 +168,7 @@ struct escape_strings_fn {
 
   std::unique_ptr<column> get_escaped_strings(column_view const& column_v,
                                               rmm::cuda_stream_view stream,
-                                              rmm::mr::device_memory_resource* mr)
+                                              rmm::device_async_resource_ref mr)
   {
     auto [offsets_column, chars] =
       cudf::strings::detail::make_strings_children(*this, column_v.size(), stream, mr);
@@ -256,7 +257,7 @@ std::unique_ptr<column> struct_to_strings(table_view const& strings_columns,
                                           string_scalar const& narep,
                                           bool include_nulls,
                                           rmm::cuda_stream_view stream,
-                                          rmm::mr::device_memory_resource* mr)
+                                          rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   CUDF_EXPECTS(column_names.type().id() == type_id::STRING, "Column names must be of type string");
@@ -373,7 +374,7 @@ std::unique_ptr<column> join_list_of_strings(lists_column_view const& lists_stri
                                              string_view const element_separator,
                                              string_view const element_narep,
                                              rmm::cuda_stream_view stream,
-                                             rmm::mr::device_memory_resource* mr)
+                                             rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
 
@@ -497,7 +498,7 @@ struct column_to_strings_fn {
 
   explicit column_to_strings_fn(json_writer_options const& options,
                                 rmm::cuda_stream_view stream,
-                                rmm::mr::device_memory_resource* mr)
+                                rmm::device_async_resource_ref mr)
     : options_(options),
       stream_(stream),
       mr_(mr),
@@ -740,7 +741,7 @@ struct column_to_strings_fn {
  private:
   json_writer_options const& options_;
   rmm::cuda_stream_view stream_;
-  rmm::mr::device_memory_resource* mr_;
+  rmm::device_async_resource_ref mr_;
   string_scalar const narep;  // "null"
   // struct convert constants
   string_scalar const struct_value_separator;  // ","
@@ -804,7 +805,7 @@ void write_chunked(data_sink* out_sink,
                    int const skip_last_chars,
                    json_writer_options const& options,
                    rmm::cuda_stream_view stream,
-                   rmm::mr::device_memory_resource* mr)
+                   rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   CUDF_EXPECTS(str_column_view.size() > 0, "Unexpected empty strings column.");
@@ -828,7 +829,7 @@ void write_json(data_sink* out_sink,
                 table_view const& table,
                 json_writer_options const& options,
                 rmm::cuda_stream_view stream,
-                rmm::mr::device_memory_resource* mr)
+                rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   std::vector<column_name_info> user_column_names = [&]() {
diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu
index f078e20f7e6..77151f5b7b8 100644
--- a/cpp/src/io/orc/reader_impl.cu
+++ b/cpp/src/io/orc/reader_impl.cu
@@ -18,12 +18,14 @@
 #include "reader_impl_chunking.hpp"
 #include "reader_impl_helpers.hpp"
 
+#include <rmm/resource_ref.hpp>
+
 namespace cudf::io::orc::detail {
 
 reader::impl::impl(std::vector<std::unique_ptr<datasource>>&& sources,
                    orc_reader_options const& options,
                    rmm::cuda_stream_view stream,
-                   rmm::mr::device_memory_resource* mr)
+                   rmm::device_async_resource_ref mr)
   : _stream(stream),
     _mr(mr),
     _timestamp_type{options.get_timestamp_type()},
@@ -119,7 +121,7 @@ table_with_metadata reader::impl::read_chunk_internal()
 reader::reader(std::vector<std::unique_ptr<cudf::io::datasource>>&& sources,
                orc_reader_options const& options,
                rmm::cuda_stream_view stream,
-               rmm::mr::device_memory_resource* mr)
+               rmm::device_async_resource_ref mr)
   : _impl{std::make_unique<impl>(std::move(sources), options, stream, mr)}
 {
 }
diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp
index ab8eaebeb61..8b859da07e9 100644
--- a/cpp/src/io/orc/reader_impl.hpp
+++ b/cpp/src/io/orc/reader_impl.hpp
@@ -24,6 +24,7 @@
 #include <cudf/io/orc.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <memory>
 #include <optional>
@@ -50,7 +51,7 @@ class reader::impl {
   explicit impl(std::vector<std::unique_ptr<datasource>>&& sources,
                 orc_reader_options const& options,
                 rmm::cuda_stream_view stream,
-                rmm::mr::device_memory_resource* mr);
+                rmm::device_async_resource_ref mr);
 
   /**
    * @brief Read an entire set or a subset of data and returns a set of columns
@@ -93,7 +94,7 @@ class reader::impl {
   table_with_metadata read_chunk_internal();
 
   rmm::cuda_stream_view const _stream;
-  rmm::mr::device_memory_resource* const _mr;
+  rmm::device_async_resource_ref const _mr;
 
   // Reader configs
   data_type const _timestamp_type;  // Override output timestamp resolution
diff --git a/cpp/src/io/orc/reader_impl_helpers.cpp b/cpp/src/io/orc/reader_impl_helpers.cpp
index ea4e5dcfaab..c943ae17d97 100644
--- a/cpp/src/io/orc/reader_impl_helpers.cpp
+++ b/cpp/src/io/orc/reader_impl_helpers.cpp
@@ -16,6 +16,8 @@
 
 #include "reader_impl_helpers.hpp"
 
+#include <rmm/resource_ref.hpp>
+
 namespace cudf::io::orc::detail {
 
 std::unique_ptr<column> create_empty_column(size_type orc_col_id,
@@ -111,7 +113,7 @@ column_buffer assemble_buffer(size_type orc_col_id,
                               column_hierarchy const& selected_columns,
                               std::vector<std::vector<column_buffer>>& col_buffers,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr)
+                              rmm::device_async_resource_ref mr)
 {
   auto const col_id = col_meta.orc_col_map[level][orc_col_id];
   auto& col_buffer  = col_buffers[level][col_id];
diff --git a/cpp/src/io/orc/reader_impl_helpers.hpp b/cpp/src/io/orc/reader_impl_helpers.hpp
index 22482bad486..6645eecbd29 100644
--- a/cpp/src/io/orc/reader_impl_helpers.hpp
+++ b/cpp/src/io/orc/reader_impl_helpers.hpp
@@ -23,6 +23,7 @@
 #include <cudf/io/orc.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <memory>
 #include <vector>
@@ -147,6 +148,6 @@ column_buffer assemble_buffer(size_type orc_col_id,
                               column_hierarchy const& selected_columns,
                               std::vector<std::vector<column_buffer>>& col_buffers,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr);
+                              rmm::device_async_resource_ref mr);
 
 }  // namespace cudf::io::orc::detail
diff --git a/cpp/src/io/orc/reader_impl_preprocess.cu b/cpp/src/io/orc/reader_impl_preprocess.cu
index 6c59f83bc46..04cb223c696 100644
--- a/cpp/src/io/orc/reader_impl_preprocess.cu
+++ b/cpp/src/io/orc/reader_impl_preprocess.cu
@@ -34,6 +34,7 @@
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/copy.h>
 #include <thrust/fill.h>
@@ -401,7 +402,7 @@ rmm::device_buffer decompress_stripe_data(
 void update_null_mask(cudf::detail::hostdevice_2dvector<gpu::ColumnDesc>& chunks,
                       host_span<column_buffer> out_buffers,
                       rmm::cuda_stream_view stream,
-                      rmm::mr::device_memory_resource* mr)
+                      rmm::device_async_resource_ref mr)
 {
   auto const num_stripes = chunks.size().first;
   auto const num_columns = chunks.size().second;
@@ -492,7 +493,7 @@ void decode_stream_data(std::size_t num_dicts,
                         cudf::detail::device_2dspan<gpu::RowGroup> row_groups,
                         std::vector<column_buffer>& out_buffers,
                         rmm::cuda_stream_view stream,
-                        rmm::mr::device_memory_resource* mr)
+                        rmm::device_async_resource_ref mr)
 {
   auto const num_stripes = chunks.size().first;
   auto const num_columns = chunks.size().second;
diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp
index f43a8fd24c4..9869dafadfb 100644
--- a/cpp/src/io/parquet/predicate_pushdown.cpp
+++ b/cpp/src/io/parquet/predicate_pushdown.cpp
@@ -29,6 +29,7 @@
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <algorithm>
 #include <list>
@@ -129,7 +130,7 @@ struct stats_caster {
     size_t col_idx,
     cudf::data_type dtype,
     rmm::cuda_stream_view stream,
-    rmm::mr::device_memory_resource* mr) const
+    rmm::device_async_resource_ref mr) const
   {
     // List, Struct, Dictionary types are not supported
     if constexpr (cudf::is_compound<T>() && !std::is_same_v<T, string_view>) {
@@ -165,7 +166,7 @@ struct stats_caster {
 
         static auto make_strings_children(host_span<string_view> host_strings,
                                           rmm::cuda_stream_view stream,
-                                          rmm::mr::device_memory_resource* mr)
+                                          rmm::device_async_resource_ref mr)
         {
           std::vector<char> chars{};
           std::vector<cudf::size_type> offsets(1, 0);
@@ -182,7 +183,7 @@ struct stats_caster {
 
         auto to_device(cudf::data_type dtype,
                        rmm::cuda_stream_view stream,
-                       rmm::mr::device_memory_resource* mr)
+                       rmm::device_async_resource_ref mr)
         {
           if constexpr (std::is_same_v<T, string_view>) {
             auto [d_chars, d_offsets] = make_strings_children(val, stream, mr);
diff --git a/cpp/src/io/parquet/reader.cpp b/cpp/src/io/parquet/reader.cpp
index 17d7c07bc91..170f7503134 100644
--- a/cpp/src/io/parquet/reader.cpp
+++ b/cpp/src/io/parquet/reader.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,6 +16,8 @@
 
 #include "reader_impl.hpp"
 
+#include <rmm/resource_ref.hpp>
+
 namespace cudf::io::parquet::detail {
 
 reader::reader() = default;
@@ -23,7 +25,7 @@ reader::reader() = default;
 reader::reader(std::vector<std::unique_ptr<datasource>>&& sources,
                parquet_reader_options const& options,
                rmm::cuda_stream_view stream,
-               rmm::mr::device_memory_resource* mr)
+               rmm::device_async_resource_ref mr)
   : _impl(std::make_unique<impl>(std::move(sources), options, stream, mr))
 {
 }
@@ -47,7 +49,7 @@ chunked_reader::chunked_reader(std::size_t chunk_read_limit,
                                std::vector<std::unique_ptr<datasource>>&& sources,
                                parquet_reader_options const& options,
                                rmm::cuda_stream_view stream,
-                               rmm::mr::device_memory_resource* mr)
+                               rmm::device_async_resource_ref mr)
 {
   _impl = std::make_unique<impl>(
     chunk_read_limit, pass_read_limit, std::move(sources), options, stream, mr);
diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp
index 2356878f6ba..e7409f45e13 100644
--- a/cpp/src/io/parquet/reader_impl.cpp
+++ b/cpp/src/io/parquet/reader_impl.cpp
@@ -23,6 +23,8 @@
 #include <cudf/detail/utilities/stream_pool.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 #include <bitset>
 #include <numeric>
 
@@ -362,7 +364,7 @@ void reader::impl::decode_page_data(bool uses_custom_row_bounds, size_t skip_row
 reader::impl::impl(std::vector<std::unique_ptr<datasource>>&& sources,
                    parquet_reader_options const& options,
                    rmm::cuda_stream_view stream,
-                   rmm::mr::device_memory_resource* mr)
+                   rmm::device_async_resource_ref mr)
   : impl(0 /*chunk_read_limit*/,
          0 /*input_pass_read_limit*/,
          std::forward<std::vector<std::unique_ptr<cudf::io::datasource>>>(sources),
@@ -377,7 +379,7 @@ reader::impl::impl(std::size_t chunk_read_limit,
                    std::vector<std::unique_ptr<datasource>>&& sources,
                    parquet_reader_options const& options,
                    rmm::cuda_stream_view stream,
-                   rmm::mr::device_memory_resource* mr)
+                   rmm::device_async_resource_ref mr)
   : _stream{stream},
     _mr{mr},
     _sources{std::move(sources)},
diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp
index 185419a5b46..6c6cedf4e76 100644
--- a/cpp/src/io/parquet/reader_impl.hpp
+++ b/cpp/src/io/parquet/reader_impl.hpp
@@ -31,6 +31,8 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
+#include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <memory>
 #include <optional>
@@ -57,7 +59,7 @@ class reader::impl {
   explicit impl(std::vector<std::unique_ptr<datasource>>&& sources,
                 parquet_reader_options const& options,
                 rmm::cuda_stream_view stream,
-                rmm::mr::device_memory_resource* mr);
+                rmm::device_async_resource_ref mr);
 
   /**
    * @brief Read an entire set or a subset of data and returns a set of columns
@@ -108,7 +110,7 @@ class reader::impl {
                 std::vector<std::unique_ptr<datasource>>&& sources,
                 parquet_reader_options const& options,
                 rmm::cuda_stream_view stream,
-                rmm::mr::device_memory_resource* mr);
+                rmm::device_async_resource_ref mr);
 
   /**
    * @copydoc cudf::io::chunked_parquet_reader::has_next
@@ -346,7 +348,7 @@ class reader::impl {
 
  private:
   rmm::cuda_stream_view _stream;
-  rmm::mr::device_memory_resource* _mr = nullptr;
+  rmm::device_async_resource_ref _mr{rmm::mr::get_current_device_resource()};
 
   std::vector<std::unique_ptr<datasource>> _sources;
   std::unique_ptr<aggregate_reader_metadata> _metadata;
diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu
index 8e37564fc35..976d735e010 100644
--- a/cpp/src/io/text/multibyte_split.cu
+++ b/cpp/src/io/text/multibyte_split.cu
@@ -37,6 +37,7 @@
 #include <rmm/exec_policy.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cub/block/block_load.cuh>
 #include <cub/block/block_scan.cuh>
@@ -306,7 +307,7 @@ std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source
                                               byte_range_info byte_range,
                                               bool strip_delimiters,
                                               rmm::cuda_stream_view stream,
-                                              rmm::mr::device_memory_resource* mr)
+                                              rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
 
@@ -565,7 +566,7 @@ std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source
 std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source const& source,
                                               std::string const& delimiter,
                                               std::optional<byte_range_info> byte_range,
-                                              rmm::mr::device_memory_resource* mr)
+                                              rmm::device_async_resource_ref mr)
 {
   return multibyte_split(
     source, delimiter, parse_options{byte_range.value_or(create_byte_range_info_max())}, mr);
@@ -574,7 +575,7 @@ std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source
 std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source const& source,
                                               std::string const& delimiter,
                                               parse_options options,
-                                              rmm::mr::device_memory_resource* mr)
+                                              rmm::device_async_resource_ref mr)
 {
   auto stream = cudf::get_default_stream();
 
@@ -586,7 +587,7 @@ std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source
 
 std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source const& source,
                                               std::string const& delimiter,
-                                              rmm::mr::device_memory_resource* mr)
+                                              rmm::device_async_resource_ref mr)
 {
   return multibyte_split(source, delimiter, parse_options{}, mr);
 }
diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp
index 96503e4907b..5dc2291abdc 100644
--- a/cpp/src/io/utilities/column_buffer.cpp
+++ b/cpp/src/io/utilities/column_buffer.cpp
@@ -26,6 +26,7 @@
 #include <cudf/types.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <iomanip>
 #include <sstream>
@@ -102,7 +103,7 @@ void copy_buffer_data(string_policy const& buff, string_policy& new_buff)
 template <class string_policy>
 void column_buffer_base<string_policy>::create(size_type _size,
                                                rmm::cuda_stream_view stream,
-                                               rmm::mr::device_memory_resource* mr)
+                                               rmm::device_async_resource_ref mr)
 {
   size = _size;
   _mr  = mr;
@@ -286,7 +287,7 @@ template <class string_policy>
 std::unique_ptr<column> empty_like(column_buffer_base<string_policy>& buffer,
                                    column_name_info* schema_info,
                                    rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr)
+                                   rmm::device_async_resource_ref mr)
 {
   if (schema_info != nullptr) { schema_info->name = buffer.name; }
 
@@ -357,12 +358,12 @@ template std::unique_ptr<column> make_column<pointer_type>(
 template std::unique_ptr<column> empty_like<string_type>(string_column_buffer& buffer,
                                                          column_name_info* schema_info,
                                                          rmm::cuda_stream_view stream,
-                                                         rmm::mr::device_memory_resource* mr);
+                                                         rmm::device_async_resource_ref mr);
 
 template std::unique_ptr<column> empty_like<pointer_type>(pointer_column_buffer& buffer,
                                                           column_name_info* schema_info,
                                                           rmm::cuda_stream_view stream,
-                                                          rmm::mr::device_memory_resource* mr);
+                                                          rmm::device_async_resource_ref mr);
 
 template std::string type_to_name<string_type>(string_column_buffer const& buffer);
 template std::string type_to_name<pointer_type>(pointer_column_buffer const& buffer);
diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp
index 57ee1043ee9..ace1396bc09 100644
--- a/cpp/src/io/utilities/column_buffer.hpp
+++ b/cpp/src/io/utilities/column_buffer.hpp
@@ -31,6 +31,8 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
 #include <rmm/device_uvector.hpp>
+#include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/pair.h>
 
@@ -50,7 +52,7 @@ namespace detail {
 inline rmm::device_buffer create_data(data_type type,
                                       size_type size,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr)
+                                      rmm::device_async_resource_ref mr)
 {
   std::size_t data_size = size_of(type) * size;
 
@@ -96,7 +98,7 @@ class column_buffer_base {
                      size_type _size,
                      bool _is_nullable,
                      rmm::cuda_stream_view stream,
-                     rmm::mr::device_memory_resource* mr)
+                     rmm::device_async_resource_ref mr)
     : column_buffer_base(_type, _is_nullable)
   {
   }
@@ -111,7 +113,7 @@ class column_buffer_base {
 
   // instantiate a column of known type with a specified size.  Allows deferred creation for
   // preprocessing steps such as in the Parquet reader
-  void create(size_type _size, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr);
+  void create(size_type _size, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr);
 
   // Create a new column_buffer that has empty data but with the same basic information as the
   // input column, including same type, nullability, name, and user_data.
@@ -140,7 +142,7 @@ class column_buffer_base {
   rmm::device_buffer _data{};
   rmm::device_buffer _null_mask{};
   size_type _null_count{0};
-  rmm::mr::device_memory_resource* _mr;
+  rmm::device_async_resource_ref _mr{rmm::mr::get_current_device_resource()};
 
  public:
   data_type type{type_id::EMPTY};
@@ -174,7 +176,7 @@ class gather_column_buffer : public column_buffer_base<gather_column_buffer> {
                        size_type _size,
                        bool _is_nullable,
                        rmm::cuda_stream_view stream,
-                       rmm::mr::device_memory_resource* mr)
+                       rmm::device_async_resource_ref mr)
     : column_buffer_base<gather_column_buffer>(_type, _size, _is_nullable, stream, mr)
   {
     create(_size, stream, mr);
@@ -208,7 +210,7 @@ class inline_column_buffer : public column_buffer_base<inline_column_buffer> {
                        size_type _size,
                        bool _is_nullable,
                        rmm::cuda_stream_view stream,
-                       rmm::mr::device_memory_resource* mr)
+                       rmm::device_async_resource_ref mr)
     : column_buffer_base<inline_column_buffer>(_type, _size, _is_nullable, stream, mr)
   {
     create(_size, stream, mr);
@@ -251,7 +253,7 @@ template <class string_policy>
 std::unique_ptr<column> empty_like(column_buffer_base<string_policy>& buffer,
                                    column_name_info* schema_info,
                                    rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr);
+                                   rmm::device_async_resource_ref mr);
 
 /**
  * @brief Given a column_buffer, produce a formatted name string describing the type.
diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu
index 4b5d47e71fb..c9e507925ec 100644
--- a/cpp/src/io/utilities/data_casting.cu
+++ b/cpp/src/io/utilities/data_casting.cu
@@ -31,6 +31,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cub/cub.cuh>
 #include <thrust/copy.h>
@@ -796,7 +797,7 @@ static std::unique_ptr<column> parse_string(string_view_pair_it str_tuples,
                                             rmm::device_scalar<size_type>& d_null_count,
                                             cudf::io::parse_options_view const& options,
                                             rmm::cuda_stream_view stream,
-                                            rmm::mr::device_memory_resource* mr)
+                                            rmm::device_async_resource_ref mr)
 {
   //  CUDF_FUNC_RANGE();
 
@@ -914,7 +915,7 @@ std::unique_ptr<column> parse_data(
   size_type null_count,
   cudf::io::parse_options_view const& options,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
 
diff --git a/cpp/src/io/utilities/output_builder.cuh b/cpp/src/io/utilities/output_builder.cuh
index 1858912a871..a7517983cd3 100644
--- a/cpp/src/io/utilities/output_builder.cuh
+++ b/cpp/src/io/utilities/output_builder.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/copy.h>
 
@@ -206,7 +207,7 @@ class output_builder {
   output_builder(size_type max_write_size,
                  size_type max_growth,
                  rmm::cuda_stream_view stream,
-                 rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
+                 rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
     : _size{0}, _max_write_size{max_write_size}, _max_growth{max_growth}
   {
     CUDF_EXPECTS(max_write_size > 0, "Internal error");
@@ -307,7 +308,7 @@ class output_builder {
    * @return The output vector.
    */
   rmm::device_uvector<T> gather(rmm::cuda_stream_view stream,
-                                rmm::mr::device_memory_resource* mr) const
+                                rmm::device_async_resource_ref mr) const
   {
     rmm::device_uvector<T> output{size(), stream, mr};
     auto output_it = output.begin();
diff --git a/cpp/src/io/utilities/string_parsing.hpp b/cpp/src/io/utilities/string_parsing.hpp
index a98660c98a9..612889af74b 100644
--- a/cpp/src/io/utilities/string_parsing.hpp
+++ b/cpp/src/io/utilities/string_parsing.hpp
@@ -21,6 +21,7 @@
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/zip_iterator.h>
 #include <thrust/tuple.h>
@@ -74,6 +75,6 @@ std::unique_ptr<column> parse_data(
   size_type null_count,
   cudf::io::parse_options_view const& options,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr);
+  rmm::device_async_resource_ref mr);
 }  // namespace json::detail
 }  // namespace cudf::io
diff --git a/cpp/src/join/conditional_join.cu b/cpp/src/join/conditional_join.cu
index cc729ad5e8b..095093d08e5 100644
--- a/cpp/src/join/conditional_join.cu
+++ b/cpp/src/join/conditional_join.cu
@@ -30,6 +30,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <optional>
 
@@ -44,7 +45,7 @@ conditional_join(table_view const& left,
                  join_kind join_type,
                  std::optional<std::size_t> output_size,
                  rmm::cuda_stream_view stream,
-                 rmm::mr::device_memory_resource* mr)
+                 rmm::device_async_resource_ref mr)
 {
   // We can immediately filter out cases where the right table is empty. In
   // some cases, we return all the rows of the left table with a corresponding
@@ -197,7 +198,7 @@ std::size_t compute_conditional_join_output_size(table_view const& left,
                                                  ast::expression const& binary_predicate,
                                                  join_kind join_type,
                                                  rmm::cuda_stream_view stream,
-                                                 rmm::mr::device_memory_resource* mr)
+                                                 rmm::device_async_resource_ref mr)
 {
   // Until we add logic to handle the number of non-matches in the right table,
   // full joins are not supported in this function. Note that this does not
@@ -293,7 +294,7 @@ conditional_inner_join(table_view const& left,
                        table_view const& right,
                        ast::expression const& binary_predicate,
                        std::optional<std::size_t> output_size,
-                       rmm::mr::device_memory_resource* mr)
+                       rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::conditional_join(left,
@@ -311,7 +312,7 @@ conditional_left_join(table_view const& left,
                       table_view const& right,
                       ast::expression const& binary_predicate,
                       std::optional<std::size_t> output_size,
-                      rmm::mr::device_memory_resource* mr)
+                      rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::conditional_join(left,
@@ -328,7 +329,7 @@ std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
 conditional_full_join(table_view const& left,
                       table_view const& right,
                       ast::expression const& binary_predicate,
-                      rmm::mr::device_memory_resource* mr)
+                      rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::conditional_join(left,
@@ -345,7 +346,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> conditional_left_semi_join(
   table_view const& right,
   ast::expression const& binary_predicate,
   std::optional<std::size_t> output_size,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return std::move(detail::conditional_join(left,
@@ -363,7 +364,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> conditional_left_anti_join(
   table_view const& right,
   ast::expression const& binary_predicate,
   std::optional<std::size_t> output_size,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return std::move(detail::conditional_join(left,
@@ -379,7 +380,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> conditional_left_anti_join(
 std::size_t conditional_inner_join_size(table_view const& left,
                                         table_view const& right,
                                         ast::expression const& binary_predicate,
-                                        rmm::mr::device_memory_resource* mr)
+                                        rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::compute_conditional_join_output_size(
@@ -389,7 +390,7 @@ std::size_t conditional_inner_join_size(table_view const& left,
 std::size_t conditional_left_join_size(table_view const& left,
                                        table_view const& right,
                                        ast::expression const& binary_predicate,
-                                       rmm::mr::device_memory_resource* mr)
+                                       rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::compute_conditional_join_output_size(
@@ -399,7 +400,7 @@ std::size_t conditional_left_join_size(table_view const& left,
 std::size_t conditional_left_semi_join_size(table_view const& left,
                                             table_view const& right,
                                             ast::expression const& binary_predicate,
-                                            rmm::mr::device_memory_resource* mr)
+                                            rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return std::move(detail::compute_conditional_join_output_size(left,
@@ -413,7 +414,7 @@ std::size_t conditional_left_semi_join_size(table_view const& left,
 std::size_t conditional_left_anti_join_size(table_view const& left,
                                             table_view const& right,
                                             ast::expression const& binary_predicate,
-                                            rmm::mr::device_memory_resource* mr)
+                                            rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return std::move(detail::compute_conditional_join_output_size(left,
diff --git a/cpp/src/join/conditional_join.hpp b/cpp/src/join/conditional_join.hpp
index 9bc6024ee7e..06eb83d6ba8 100644
--- a/cpp/src/join/conditional_join.hpp
+++ b/cpp/src/join/conditional_join.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,6 +23,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <optional>
 
@@ -49,7 +50,7 @@ conditional_join(table_view const& left,
                  join_kind JoinKind,
                  std::optional<std::size_t> output_size,
                  rmm::cuda_stream_view stream,
-                 rmm::mr::device_memory_resource* mr);
+                 rmm::device_async_resource_ref mr);
 
 /**
  * @brief Computes the size of a join operation between two tables without
@@ -68,7 +69,7 @@ std::size_t compute_conditional_join_output_size(table_view const& left,
                                                  ast::expression const& binary_predicate,
                                                  join_kind JoinKind,
                                                  rmm::cuda_stream_view stream,
-                                                 rmm::mr::device_memory_resource* mr);
+                                                 rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace cudf
diff --git a/cpp/src/join/cross_join.cu b/cpp/src/join/cross_join.cu
index 07057acf37e..a2ee3a7796b 100644
--- a/cpp/src/join/cross_join.cu
+++ b/cpp/src/join/cross_join.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -29,6 +29,7 @@
 #include <cudf/utilities/error.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace detail {
@@ -40,7 +41,7 @@ namespace detail {
 std::unique_ptr<cudf::table> cross_join(cudf::table_view const& left,
                                         cudf::table_view const& right,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr)
+                                        rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(0 != left.num_columns(), "Left table is empty");
   CUDF_EXPECTS(0 != right.num_columns(), "Right table is empty");
@@ -74,7 +75,7 @@ std::unique_ptr<cudf::table> cross_join(cudf::table_view const& left,
 
 std::unique_ptr<cudf::table> cross_join(cudf::table_view const& left,
                                         cudf::table_view const& right,
-                                        rmm::mr::device_memory_resource* mr)
+                                        rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::cross_join(left, right, cudf::get_default_stream(), mr);
diff --git a/cpp/src/join/distinct_hash_join.cu b/cpp/src/join/distinct_hash_join.cu
index 8bd42d867a3..a3652942973 100644
--- a/cpp/src/join/distinct_hash_join.cu
+++ b/cpp/src/join/distinct_hash_join.cu
@@ -28,6 +28,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cooperative_groups.h>
 #include <cub/block/block_scan.cuh>
@@ -309,7 +310,7 @@ template <cudf::has_nested HasNested>
 std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
           std::unique_ptr<rmm::device_uvector<size_type>>>
 distinct_hash_join<HasNested>::inner_join(rmm::cuda_stream_view stream,
-                                          rmm::mr::device_memory_resource* mr) const
+                                          rmm::device_async_resource_ref mr) const
 {
   cudf::scoped_range range{"distinct_hash_join::inner_join"};
 
@@ -352,7 +353,7 @@ distinct_hash_join<HasNested>::inner_join(rmm::cuda_stream_view stream,
 
 template <cudf::has_nested HasNested>
 std::unique_ptr<rmm::device_uvector<size_type>> distinct_hash_join<HasNested>::left_join(
-  rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const
+  rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const
 {
   cudf::scoped_range range{"distinct_hash_join::left_join"};
 
@@ -419,7 +420,7 @@ template <>
 std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
           std::unique_ptr<rmm::device_uvector<size_type>>>
 distinct_hash_join<cudf::has_nested::YES>::inner_join(rmm::cuda_stream_view stream,
-                                                      rmm::mr::device_memory_resource* mr) const
+                                                      rmm::device_async_resource_ref mr) const
 {
   return _impl->inner_join(stream, mr);
 }
@@ -428,7 +429,7 @@ template <>
 std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
           std::unique_ptr<rmm::device_uvector<size_type>>>
 distinct_hash_join<cudf::has_nested::NO>::inner_join(rmm::cuda_stream_view stream,
-                                                     rmm::mr::device_memory_resource* mr) const
+                                                     rmm::device_async_resource_ref mr) const
 {
   return _impl->inner_join(stream, mr);
 }
@@ -436,14 +437,14 @@ distinct_hash_join<cudf::has_nested::NO>::inner_join(rmm::cuda_stream_view strea
 template <>
 std::unique_ptr<rmm::device_uvector<size_type>>
 distinct_hash_join<cudf::has_nested::YES>::left_join(rmm::cuda_stream_view stream,
-                                                     rmm::mr::device_memory_resource* mr) const
+                                                     rmm::device_async_resource_ref mr) const
 {
   return _impl->left_join(stream, mr);
 }
 
 template <>
 std::unique_ptr<rmm::device_uvector<size_type>> distinct_hash_join<cudf::has_nested::NO>::left_join(
-  rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) const
+  rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const
 {
   return _impl->left_join(stream, mr);
 }
diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu
index 17616818a58..fbe16378e8c 100644
--- a/cpp/src/join/hash_join.cu
+++ b/cpp/src/join/hash_join.cu
@@ -26,6 +26,7 @@
 #include <rmm/device_buffer.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/count.h>
 #include <thrust/functional.h>
@@ -157,7 +158,7 @@ probe_join_hash_table(
   null_equality compare_nulls,
   std::optional<std::size_t> output_size,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   // Use the output size directly if provided. Otherwise, compute the exact output size
   auto const probe_join_type =
@@ -267,7 +268,7 @@ std::size_t get_full_join_size(
   bool has_nulls,
   null_equality compare_nulls,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   std::size_t join_size = compute_join_output_size(build_table,
                                                    probe_table,
@@ -396,7 +397,7 @@ std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
 hash_join<Hasher>::inner_join(cudf::table_view const& probe,
                               std::optional<std::size_t> output_size,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr) const
+                              rmm::device_async_resource_ref mr) const
 {
   CUDF_FUNC_RANGE();
   return compute_hash_join(probe, cudf::detail::join_kind::INNER_JOIN, output_size, stream, mr);
@@ -408,7 +409,7 @@ std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
 hash_join<Hasher>::left_join(cudf::table_view const& probe,
                              std::optional<std::size_t> output_size,
                              rmm::cuda_stream_view stream,
-                             rmm::mr::device_memory_resource* mr) const
+                             rmm::device_async_resource_ref mr) const
 {
   CUDF_FUNC_RANGE();
   return compute_hash_join(probe, cudf::detail::join_kind::LEFT_JOIN, output_size, stream, mr);
@@ -420,7 +421,7 @@ std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
 hash_join<Hasher>::full_join(cudf::table_view const& probe,
                              std::optional<std::size_t> output_size,
                              rmm::cuda_stream_view stream,
-                             rmm::mr::device_memory_resource* mr) const
+                             rmm::device_async_resource_ref mr) const
 {
   CUDF_FUNC_RANGE();
   return compute_hash_join(probe, cudf::detail::join_kind::FULL_JOIN, output_size, stream, mr);
@@ -481,7 +482,7 @@ std::size_t hash_join<Hasher>::left_join_size(cudf::table_view const& probe,
 template <typename Hasher>
 std::size_t hash_join<Hasher>::full_join_size(cudf::table_view const& probe,
                                               rmm::cuda_stream_view stream,
-                                              rmm::mr::device_memory_resource* mr) const
+                                              rmm::device_async_resource_ref mr) const
 {
   CUDF_FUNC_RANGE();
 
@@ -512,7 +513,7 @@ hash_join<Hasher>::probe_join_indices(cudf::table_view const& probe_table,
                                       cudf::detail::join_kind join,
                                       std::optional<std::size_t> output_size,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr) const
+                                      rmm::device_async_resource_ref mr) const
 {
   // Trivial left join case - exit early
   if (_is_empty and join != cudf::detail::join_kind::INNER_JOIN) {
@@ -553,7 +554,7 @@ hash_join<Hasher>::compute_hash_join(cudf::table_view const& probe,
                                      cudf::detail::join_kind join,
                                      std::optional<std::size_t> output_size,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr) const
+                                     rmm::device_async_resource_ref mr) const
 {
   CUDF_EXPECTS(0 != probe.num_columns(), "Hash join probe table is empty");
 
@@ -603,7 +604,7 @@ std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
 hash_join::inner_join(cudf::table_view const& probe,
                       std::optional<std::size_t> output_size,
                       rmm::cuda_stream_view stream,
-                      rmm::mr::device_memory_resource* mr) const
+                      rmm::device_async_resource_ref mr) const
 {
   return _impl->inner_join(probe, output_size, stream, mr);
 }
@@ -613,7 +614,7 @@ std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
 hash_join::left_join(cudf::table_view const& probe,
                      std::optional<std::size_t> output_size,
                      rmm::cuda_stream_view stream,
-                     rmm::mr::device_memory_resource* mr) const
+                     rmm::device_async_resource_ref mr) const
 {
   return _impl->left_join(probe, output_size, stream, mr);
 }
@@ -623,7 +624,7 @@ std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
 hash_join::full_join(cudf::table_view const& probe,
                      std::optional<std::size_t> output_size,
                      rmm::cuda_stream_view stream,
-                     rmm::mr::device_memory_resource* mr) const
+                     rmm::device_async_resource_ref mr) const
 {
   return _impl->full_join(probe, output_size, stream, mr);
 }
@@ -642,7 +643,7 @@ std::size_t hash_join::left_join_size(cudf::table_view const& probe,
 
 std::size_t hash_join::full_join_size(cudf::table_view const& probe,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr) const
+                                      rmm::device_async_resource_ref mr) const
 {
   return _impl->full_join_size(probe, stream, mr);
 }
diff --git a/cpp/src/join/join.cu b/cpp/src/join/join.cu
index ae025b1a213..bc7f09763ec 100644
--- a/cpp/src/join/join.cu
+++ b/cpp/src/join/join.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,6 +23,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace detail {
@@ -33,7 +34,7 @@ inner_join(table_view const& left_input,
            table_view const& right_input,
            null_equality compare_nulls,
            rmm::cuda_stream_view stream,
-           rmm::mr::device_memory_resource* mr)
+           rmm::device_async_resource_ref mr)
 {
   // Make sure any dictionary columns have matched key sets.
   // This will return any new dictionary columns created as well as updated table_views.
@@ -68,7 +69,7 @@ left_join(table_view const& left_input,
           table_view const& right_input,
           null_equality compare_nulls,
           rmm::cuda_stream_view stream,
-          rmm::mr::device_memory_resource* mr)
+          rmm::device_async_resource_ref mr)
 {
   // Make sure any dictionary columns have matched key sets.
   // This will return any new dictionary columns created as well as updated table_views.
@@ -93,7 +94,7 @@ full_join(table_view const& left_input,
           table_view const& right_input,
           null_equality compare_nulls,
           rmm::cuda_stream_view stream,
-          rmm::mr::device_memory_resource* mr)
+          rmm::device_async_resource_ref mr)
 {
   // Make sure any dictionary columns have matched key sets.
   // This will return any new dictionary columns created as well as updated table_views.
@@ -119,7 +120,7 @@ std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
 inner_join(table_view const& left,
            table_view const& right,
            null_equality compare_nulls,
-           rmm::mr::device_memory_resource* mr)
+           rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::inner_join(left, right, compare_nulls, cudf::get_default_stream(), mr);
@@ -130,7 +131,7 @@ std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
 left_join(table_view const& left,
           table_view const& right,
           null_equality compare_nulls,
-          rmm::mr::device_memory_resource* mr)
+          rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::left_join(left, right, compare_nulls, cudf::get_default_stream(), mr);
@@ -141,7 +142,7 @@ std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
 full_join(table_view const& left,
           table_view const& right,
           null_equality compare_nulls,
-          rmm::mr::device_memory_resource* mr)
+          rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::full_join(left, right, compare_nulls, cudf::get_default_stream(), mr);
diff --git a/cpp/src/join/join_common_utils.cuh b/cpp/src/join/join_common_utils.cuh
index 9da41e296e6..9758919c5b4 100644
--- a/cpp/src/join/join_common_utils.cuh
+++ b/cpp/src/join/join_common_utils.cuh
@@ -24,6 +24,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cub/cub.cuh>
 #include <thrust/iterator/counting_iterator.h>
@@ -146,7 +147,7 @@ std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
           std::unique_ptr<rmm::device_uvector<size_type>>>
 get_trivial_left_join_indices(table_view const& left,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr);
+                              rmm::device_async_resource_ref mr);
 
 /**
  * @brief Builds the hash table based on the given `build_table`.
@@ -245,7 +246,7 @@ get_left_join_indices_complement(std::unique_ptr<rmm::device_uvector<size_type>>
                                  size_type left_table_row_count,
                                  size_type right_table_row_count,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr);
+                                 rmm::device_async_resource_ref mr);
 
 /**
  * @brief Device functor to determine if an index is contained in a range.
diff --git a/cpp/src/join/join_utils.cu b/cpp/src/join/join_utils.cu
index 7fa6642b19f..8d916da9f2c 100644
--- a/cpp/src/join/join_utils.cu
+++ b/cpp/src/join/join_utils.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,6 +17,7 @@
 #include "join_common_utils.cuh"
 
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/copy.h>
 #include <thrust/functional.h>
@@ -53,7 +54,7 @@ std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
           std::unique_ptr<rmm::device_uvector<size_type>>>
 get_trivial_left_join_indices(table_view const& left,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr)
+                              rmm::device_async_resource_ref mr)
 {
   auto left_indices = std::make_unique<rmm::device_uvector<size_type>>(left.num_rows(), stream, mr);
   thrust::sequence(rmm::exec_policy(stream), left_indices->begin(), left_indices->end(), 0);
@@ -93,7 +94,7 @@ get_left_join_indices_complement(std::unique_ptr<rmm::device_uvector<size_type>>
                                  size_type left_table_row_count,
                                  size_type right_table_row_count,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   // Get array of indices that do not appear in right_indices
 
diff --git a/cpp/src/join/mixed_join.cu b/cpp/src/join/mixed_join.cu
index 6223114fcd0..42e0e4f45ee 100644
--- a/cpp/src/join/mixed_join.cu
+++ b/cpp/src/join/mixed_join.cu
@@ -32,6 +32,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/fill.h>
 #include <thrust/scan.h>
@@ -54,7 +55,7 @@ mixed_join(
   join_kind join_type,
   std::optional<std::pair<std::size_t, device_span<size_type const>>> const& output_size_data,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(left_conditional.num_rows() == left_equality.num_rows(),
                "The left conditional and equality tables must have the same number of rows.");
@@ -304,7 +305,7 @@ compute_mixed_join_output_size(table_view const& left_equality,
                                null_equality compare_nulls,
                                join_kind join_type,
                                rmm::cuda_stream_view stream,
-                               rmm::mr::device_memory_resource* mr)
+                               rmm::device_async_resource_ref mr)
 {
   // Until we add logic to handle the number of non-matches in the right table,
   // full joins are not supported in this function. Note that this does not
@@ -483,7 +484,7 @@ mixed_inner_join(
   ast::expression const& binary_predicate,
   null_equality compare_nulls,
   std::optional<std::pair<std::size_t, device_span<size_type const>>> const output_size_data,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::mixed_join(left_equality,
@@ -505,7 +506,7 @@ std::pair<std::size_t, std::unique_ptr<rmm::device_uvector<size_type>>> mixed_in
   table_view const& right_conditional,
   ast::expression const& binary_predicate,
   null_equality compare_nulls,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::compute_mixed_join_output_size(left_equality,
@@ -529,7 +530,7 @@ mixed_left_join(
   ast::expression const& binary_predicate,
   null_equality compare_nulls,
   std::optional<std::pair<std::size_t, device_span<size_type const>>> const output_size_data,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::mixed_join(left_equality,
@@ -551,7 +552,7 @@ std::pair<std::size_t, std::unique_ptr<rmm::device_uvector<size_type>>> mixed_le
   table_view const& right_conditional,
   ast::expression const& binary_predicate,
   null_equality compare_nulls,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::compute_mixed_join_output_size(left_equality,
@@ -575,7 +576,7 @@ mixed_full_join(
   ast::expression const& binary_predicate,
   null_equality compare_nulls,
   std::optional<std::pair<std::size_t, device_span<size_type const>>> const output_size_data,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::mixed_join(left_equality,
diff --git a/cpp/src/join/mixed_join_semi.cu b/cpp/src/join/mixed_join_semi.cu
index d654f580cad..8500b248fcf 100644
--- a/cpp/src/join/mixed_join_semi.cu
+++ b/cpp/src/join/mixed_join_semi.cu
@@ -34,6 +34,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/fill.h>
 #include <thrust/iterator/counting_iterator.h>
@@ -93,7 +94,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_join_semi(
   null_equality compare_nulls,
   join_kind join_type,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS((join_type != join_kind::INNER_JOIN) && (join_type != join_kind::LEFT_JOIN) &&
                  (join_type != join_kind::FULL_JOIN),
@@ -279,7 +280,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_left_semi_join(
   table_view const& right_conditional,
   ast::expression const& binary_predicate,
   null_equality compare_nulls,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::mixed_join_semi(left_equality,
@@ -300,7 +301,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_left_anti_join(
   table_view const& right_conditional,
   ast::expression const& binary_predicate,
   null_equality compare_nulls,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::mixed_join_semi(left_equality,
diff --git a/cpp/src/join/semi_join.cu b/cpp/src/join/semi_join.cu
index b0e5282d97f..91d98d5e8d3 100644
--- a/cpp/src/join/semi_join.cu
+++ b/cpp/src/join/semi_join.cu
@@ -29,6 +29,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/copy.h>
 #include <thrust/distance.h>
@@ -47,7 +48,7 @@ std::unique_ptr<rmm::device_uvector<cudf::size_type>> left_semi_anti_join(
   cudf::table_view const& right_keys,
   null_equality compare_nulls,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(0 != left_keys.num_columns(), "Left table is empty");
   CUDF_EXPECTS(0 != right_keys.num_columns(), "Right table is empty");
@@ -97,7 +98,7 @@ std::unique_ptr<rmm::device_uvector<cudf::size_type>> left_semi_join(
   cudf::table_view const& left,
   cudf::table_view const& right,
   null_equality compare_nulls,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::left_semi_anti_join(
@@ -108,7 +109,7 @@ std::unique_ptr<rmm::device_uvector<cudf::size_type>> left_anti_join(
   cudf::table_view const& left,
   cudf::table_view const& right,
   null_equality compare_nulls,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::left_semi_anti_join(
diff --git a/cpp/src/json/json_path.cu b/cpp/src/json/json_path.cu
index ff42d9c8620..d1a1097de35 100644
--- a/cpp/src/json/json_path.cu
+++ b/cpp/src/json/json_path.cu
@@ -37,6 +37,7 @@
 
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/optional.h>
 #include <thrust/pair.h>
@@ -977,7 +978,7 @@ std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& c
                                               cudf::string_scalar const& json_path,
                                               get_json_object_options options,
                                               rmm::cuda_stream_view stream,
-                                              rmm::mr::device_memory_resource* mr)
+                                              rmm::device_async_resource_ref mr)
 {
   // preprocess the json_path into a command buffer
   auto preprocess = build_command_buffer(json_path, stream);
@@ -1062,7 +1063,7 @@ std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& c
                                               cudf::string_scalar const& json_path,
                                               get_json_object_options options,
                                               rmm::cuda_stream_view stream,
-                                              rmm::mr::device_memory_resource* mr)
+                                              rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::get_json_object(col, json_path, options, stream, mr);
diff --git a/cpp/src/labeling/label_bins.cu b/cpp/src/labeling/label_bins.cu
index 9fecaa1ddb2..1bfa7f39190 100644
--- a/cpp/src/labeling/label_bins.cu
+++ b/cpp/src/labeling/label_bins.cu
@@ -31,6 +31,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/advance.h>
 #include <thrust/binary_search.h>
@@ -110,7 +111,7 @@ std::unique_ptr<column> label_bins(column_view const& input,
                                    column_view const& left_edges,
                                    column_view const& right_edges,
                                    rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr)
+                                   rmm::device_async_resource_ref mr)
 {
   auto output = make_numeric_column(
     data_type(type_to_id<size_type>()), input.size(), mask_state::UNALLOCATED, stream, mr);
@@ -176,7 +177,7 @@ struct bin_type_dispatcher {
     column_view const& right_edges,
     inclusive right_inclusive,
     rmm::cuda_stream_view stream,
-    rmm::mr::device_memory_resource* mr)
+    rmm::device_async_resource_ref mr)
   {
     if ((left_inclusive == inclusive::YES) && (right_inclusive == inclusive::YES))
       return label_bins<T, thrust::less_equal<T>, thrust::less_equal<T>>(
@@ -204,7 +205,7 @@ std::unique_ptr<column> label_bins(column_view const& input,
                                    column_view const& right_edges,
                                    inclusive right_inclusive,
                                    rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr)
+                                   rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE()
   CUDF_EXPECTS((input.type() == left_edges.type()) && (input.type() == right_edges.type()),
@@ -237,7 +238,7 @@ std::unique_ptr<column> label_bins(column_view const& input,
                                    column_view const& right_edges,
                                    inclusive right_inclusive,
                                    rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr)
+                                   rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::label_bins(
diff --git a/cpp/src/lists/combine/concatenate_list_elements.cu b/cpp/src/lists/combine/concatenate_list_elements.cu
index 579ad8e7dff..58ec053712d 100644
--- a/cpp/src/lists/combine/concatenate_list_elements.cu
+++ b/cpp/src/lists/combine/concatenate_list_elements.cu
@@ -30,6 +30,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/execution_policy.h>
@@ -52,7 +53,7 @@ namespace {
 std::unique_ptr<column> concatenate_lists_ignore_null(column_view const& input,
                                                       bool build_null_mask,
                                                       rmm::cuda_stream_view stream,
-                                                      rmm::mr::device_memory_resource* mr)
+                                                      rmm::device_async_resource_ref mr)
 {
   auto const num_rows = input.size();
 
@@ -119,7 +120,7 @@ std::unique_ptr<column> concatenate_lists_ignore_null(column_view const& input,
 std::pair<std::unique_ptr<column>, rmm::device_uvector<int8_t>>
 generate_list_offsets_and_validities(column_view const& input,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
 {
   auto const num_rows = input.size();
 
@@ -174,7 +175,7 @@ std::unique_ptr<column> gather_list_entries(column_view const& input,
                                             size_type num_rows,
                                             size_type num_output_entries,
                                             rmm::cuda_stream_view stream,
-                                            rmm::mr::device_memory_resource* mr)
+                                            rmm::device_async_resource_ref mr)
 {
   auto const child_col      = lists_column_view(input).child();
   auto const entry_col      = lists_column_view(child_col).child();
@@ -213,7 +214,7 @@ std::unique_ptr<column> gather_list_entries(column_view const& input,
 
 std::unique_ptr<column> concatenate_lists_nullifying_rows(column_view const& input,
                                                           rmm::cuda_stream_view stream,
-                                                          rmm::mr::device_memory_resource* mr)
+                                                          rmm::device_async_resource_ref mr)
 {
   // Generate offsets and validities of the output lists column.
   auto [list_offsets, list_validities] = generate_list_offsets_and_validities(input, stream, mr);
@@ -247,7 +248,7 @@ std::unique_ptr<column> concatenate_lists_nullifying_rows(column_view const& inp
 std::unique_ptr<column> concatenate_list_elements(column_view const& input,
                                                   concatenate_null_policy null_policy,
                                                   rmm::cuda_stream_view stream,
-                                                  rmm::mr::device_memory_resource* mr)
+                                                  rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(input.type().id() == type_id::LIST,
                "Input column must be a lists column.",
@@ -274,7 +275,7 @@ std::unique_ptr<column> concatenate_list_elements(column_view const& input,
 std::unique_ptr<column> concatenate_list_elements(column_view const& input,
                                                   concatenate_null_policy null_policy,
                                                   rmm::cuda_stream_view stream,
-                                                  rmm::mr::device_memory_resource* mr)
+                                                  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::concatenate_list_elements(input, null_policy, stream, mr);
diff --git a/cpp/src/lists/combine/concatenate_rows.cu b/cpp/src/lists/combine/concatenate_rows.cu
index baecef3b92d..38d299763a1 100644
--- a/cpp/src/lists/combine/concatenate_rows.cu
+++ b/cpp/src/lists/combine/concatenate_rows.cu
@@ -26,6 +26,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/iterator/counting_iterator.h>
@@ -75,7 +76,7 @@ generate_regrouped_offsets_and_null_mask(table_device_view const& input,
                                          concatenate_null_policy null_policy,
                                          device_span<size_type const> row_null_counts,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr)
+                                         rmm::device_async_resource_ref mr)
 {
   // outgoing offsets.
   auto offsets = cudf::make_fixed_width_column(
@@ -194,7 +195,7 @@ rmm::device_uvector<size_type> generate_null_counts(table_device_view const& inp
 std::unique_ptr<column> concatenate_rows(table_view const& input,
                                          concatenate_null_policy null_policy,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr)
+                                         rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(input.num_columns() > 0, "The input table must have at least one column.");
 
@@ -314,7 +315,7 @@ std::unique_ptr<column> concatenate_rows(table_view const& input,
 std::unique_ptr<column> concatenate_rows(table_view const& input,
                                          concatenate_null_policy null_policy,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr)
+                                         rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::concatenate_rows(input, null_policy, stream, mr);
diff --git a/cpp/src/lists/contains.cu b/cpp/src/lists/contains.cu
index 378cf678f1f..4737b077deb 100644
--- a/cpp/src/lists/contains.cu
+++ b/cpp/src/lists/contains.cu
@@ -30,6 +30,7 @@
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/execution_policy.h>
@@ -184,7 +185,7 @@ std::unique_ptr<column> dispatch_index_of(lists_column_view const& lists,
                                           column_view const& search_keys,
                                           duplicate_find_option find_option,
                                           rmm::cuda_stream_view stream,
-                                          rmm::mr::device_memory_resource* mr)
+                                          rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(cudf::type_dispatcher(search_keys.type(), is_supported_type_fn{}),
                "Unsupported type in `dispatch_index_of` function.");
@@ -245,7 +246,7 @@ std::unique_ptr<column> dispatch_index_of(lists_column_view const& lists,
  */
 std::unique_ptr<column> to_contains(std::unique_ptr<column>&& key_positions,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr)
+                                    rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(key_positions->type().id() == type_to_id<size_type>(),
                "Expected input column of type cudf::size_type.");
@@ -274,7 +275,7 @@ std::unique_ptr<column> index_of(lists_column_view const& lists,
                                  cudf::scalar const& search_key,
                                  duplicate_find_option find_option,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   if (!search_key.is_valid(stream)) {
     return make_numeric_column(
@@ -298,7 +299,7 @@ std::unique_ptr<column> index_of(lists_column_view const& lists,
                                  column_view const& search_keys,
                                  duplicate_find_option find_option,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(search_keys.size() == lists.size(),
                "Number of search keys must match list column size.");
@@ -308,7 +309,7 @@ std::unique_ptr<column> index_of(lists_column_view const& lists,
 std::unique_ptr<column> contains(lists_column_view const& lists,
                                  cudf::scalar const& search_key,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   auto key_indices = detail::index_of(lists,
                                       search_key,
@@ -321,7 +322,7 @@ std::unique_ptr<column> contains(lists_column_view const& lists,
 std::unique_ptr<column> contains(lists_column_view const& lists,
                                  column_view const& search_keys,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(search_keys.size() == lists.size(),
                "Number of search keys must match list column size.");
@@ -336,7 +337,7 @@ std::unique_ptr<column> contains(lists_column_view const& lists,
 
 std::unique_ptr<column> contains_nulls(lists_column_view const& lists,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr)
+                                       rmm::device_async_resource_ref mr)
 {
   auto const lists_cv      = lists.parent();
   auto output              = make_numeric_column(data_type{type_to_id<bool>()},
@@ -370,7 +371,7 @@ std::unique_ptr<column> contains_nulls(lists_column_view const& lists,
 std::unique_ptr<column> contains(lists_column_view const& lists,
                                  cudf::scalar const& search_key,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::contains(lists, search_key, stream, mr);
@@ -379,7 +380,7 @@ std::unique_ptr<column> contains(lists_column_view const& lists,
 std::unique_ptr<column> contains(lists_column_view const& lists,
                                  column_view const& search_keys,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::contains(lists, search_keys, stream, mr);
@@ -387,7 +388,7 @@ std::unique_ptr<column> contains(lists_column_view const& lists,
 
 std::unique_ptr<column> contains_nulls(lists_column_view const& lists,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr)
+                                       rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::contains_nulls(lists, stream, mr);
@@ -397,7 +398,7 @@ std::unique_ptr<column> index_of(lists_column_view const& lists,
                                  cudf::scalar const& search_key,
                                  duplicate_find_option find_option,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::index_of(lists, search_key, find_option, stream, mr);
@@ -407,7 +408,7 @@ std::unique_ptr<column> index_of(lists_column_view const& lists,
                                  column_view const& search_keys,
                                  duplicate_find_option find_option,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::index_of(lists, search_keys, find_option, stream, mr);
diff --git a/cpp/src/lists/copying/concatenate.cu b/cpp/src/lists/copying/concatenate.cu
index 5407b88236f..3d609a262b9 100644
--- a/cpp/src/lists/copying/concatenate.cu
+++ b/cpp/src/lists/copying/concatenate.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -27,6 +27,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/transform.h>
 
@@ -56,7 +57,7 @@ namespace {
 std::unique_ptr<column> merge_offsets(host_span<lists_column_view const> columns,
                                       size_type total_list_count,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr)
+                                      rmm::device_async_resource_ref mr)
 {
   // outgoing offsets
   auto merged_offsets = cudf::make_fixed_width_column(
@@ -96,7 +97,7 @@ std::unique_ptr<column> merge_offsets(host_span<lists_column_view const> columns
  */
 std::unique_ptr<column> concatenate(host_span<column_view const> columns,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr)
+                                    rmm::device_async_resource_ref mr)
 {
   std::vector<lists_column_view> lists_columns;
   lists_columns.reserve(columns.size());
diff --git a/cpp/src/lists/copying/copying.cu b/cpp/src/lists/copying/copying.cu
index 2d3826c8004..162c6140656 100644
--- a/cpp/src/lists/copying/copying.cu
+++ b/cpp/src/lists/copying/copying.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,6 +23,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/transform.h>
@@ -38,7 +39,7 @@ std::unique_ptr<cudf::column> copy_slice(lists_column_view const& lists,
                                          size_type start,
                                          size_type end,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr)
+                                         rmm::device_async_resource_ref mr)
 {
   if (lists.is_empty() or start == end) { return cudf::empty_like(lists.parent()); }
   if (end < 0 || end > lists.size()) end = lists.size();
diff --git a/cpp/src/lists/copying/gather.cu b/cpp/src/lists/copying/gather.cu
index bd270b69656..cadeb273a65 100644
--- a/cpp/src/lists/copying/gather.cu
+++ b/cpp/src/lists/copying/gather.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,6 +18,7 @@
 #include <cudf/lists/detail/gather.cuh>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/binary_search.h>
 #include <thrust/distance.h>
@@ -92,7 +93,7 @@ struct list_gatherer {
 std::unique_ptr<column> gather_list_leaf(column_view const& column,
                                          gather_data const& gd,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr)
+                                         rmm::device_async_resource_ref mr)
 {
   // gather map iterator for this level (N)
   auto gather_map_begin = thrust::make_transform_iterator(
@@ -121,7 +122,7 @@ std::unique_ptr<column> gather_list_leaf(column_view const& column,
 std::unique_ptr<column> gather_list_nested(cudf::lists_column_view const& list,
                                            gather_data& gd,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
 {
   // gather map iterator for this level (N)
   auto gather_map_begin = thrust::make_transform_iterator(
diff --git a/cpp/src/lists/copying/scatter_helper.cu b/cpp/src/lists/copying/scatter_helper.cu
index 1ec66b4f98e..b754fef24e5 100644
--- a/cpp/src/lists/copying/scatter_helper.cu
+++ b/cpp/src/lists/copying/scatter_helper.cu
@@ -23,6 +23,8 @@
 #include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/utilities/span.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 #include <cuda/functional>
 #include <thrust/binary_search.h>
 #include <thrust/distance.h>
@@ -54,7 +56,7 @@ std::pair<rmm::device_buffer, size_type> construct_child_nullmask(
   cudf::detail::lists_column_device_view const& target_lists,
   size_type num_child_rows,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   auto is_valid_predicate = [d_list_vector  = parent_list_vector.begin(),
                              d_offsets      = parent_list_offsets.template data<size_type>(),
@@ -160,7 +162,7 @@ struct list_child_constructor {
     cudf::lists_column_view const& source_lists_column_view,
     cudf::lists_column_view const& target_lists_column_view,
     rmm::cuda_stream_view stream,
-    rmm::mr::device_memory_resource* mr) const
+    rmm::device_async_resource_ref mr) const
   {
     auto source_column_device_view =
       column_device_view::create(source_lists_column_view.parent(), stream);
@@ -219,7 +221,7 @@ struct list_child_constructor {
     cudf::lists_column_view const& source_lists_column_view,
     cudf::lists_column_view const& target_lists_column_view,
     rmm::cuda_stream_view stream,
-    rmm::mr::device_memory_resource* mr) const
+    rmm::device_async_resource_ref mr) const
   {
     auto source_column_device_view =
       column_device_view::create(source_lists_column_view.parent(), stream);
@@ -282,7 +284,7 @@ struct list_child_constructor {
     cudf::lists_column_view const& source_lists_column_view,
     cudf::lists_column_view const& target_lists_column_view,
     rmm::cuda_stream_view stream,
-    rmm::mr::device_memory_resource* mr) const
+    rmm::device_async_resource_ref mr) const
   {
     auto source_column_device_view =
       column_device_view::create(source_lists_column_view.parent(), stream);
@@ -378,7 +380,7 @@ struct list_child_constructor {
     cudf::lists_column_view const& source_lists_column_view,
     cudf::lists_column_view const& target_lists_column_view,
     rmm::cuda_stream_view stream,
-    rmm::mr::device_memory_resource* mr) const
+    rmm::device_async_resource_ref mr) const
   {
     auto const source_column_device_view =
       column_device_view::create(source_lists_column_view.parent(), stream);
@@ -468,7 +470,7 @@ std::unique_ptr<column> build_lists_child_column_recursive(
   cudf::lists_column_view const& source_lists_column_view,
   cudf::lists_column_view const& target_lists_column_view,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   return cudf::type_dispatcher<dispatch_storage_type>(child_column_type,
                                                       list_child_constructor{},
diff --git a/cpp/src/lists/copying/segmented_gather.cu b/cpp/src/lists/copying/segmented_gather.cu
index 156f868c5bd..89b1a126fc5 100644
--- a/cpp/src/lists/copying/segmented_gather.cu
+++ b/cpp/src/lists/copying/segmented_gather.cu
@@ -22,6 +22,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/binary_search.h>
@@ -35,7 +36,7 @@ std::unique_ptr<column> segmented_gather(lists_column_view const& value_column,
                                          lists_column_view const& gather_map,
                                          out_of_bounds_policy bounds_policy,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr)
+                                         rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(is_index_type(gather_map.child().type()),
                "Gather map should be list column of index type");
@@ -120,7 +121,7 @@ std::unique_ptr<column> segmented_gather(lists_column_view const& source_column,
                                          lists_column_view const& gather_map_list,
                                          out_of_bounds_policy bounds_policy,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr)
+                                         rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::segmented_gather(source_column, gather_map_list, bounds_policy, stream, mr);
diff --git a/cpp/src/lists/count_elements.cu b/cpp/src/lists/count_elements.cu
index 2fd0851067a..19c434d10e1 100644
--- a/cpp/src/lists/count_elements.cu
+++ b/cpp/src/lists/count_elements.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -27,6 +27,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/transform.h>
@@ -45,7 +46,7 @@ namespace detail {
  */
 std::unique_ptr<column> count_elements(lists_column_view const& input,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr)
+                                       rmm::device_async_resource_ref mr)
 {
   auto device_column = cudf::column_device_view::create(input.parent(), stream);
   auto d_column      = *device_column;
@@ -74,7 +75,7 @@ std::unique_ptr<column> count_elements(lists_column_view const& input,
 
 std::unique_ptr<column> count_elements(lists_column_view const& input,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr)
+                                       rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::count_elements(input, stream, mr);
diff --git a/cpp/src/lists/explode.cu b/cpp/src/lists/explode.cu
index 5f1d30321a2..370d7480578 100644
--- a/cpp/src/lists/explode.cu
+++ b/cpp/src/lists/explode.cu
@@ -26,6 +26,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/advance.h>
@@ -59,7 +60,7 @@ std::unique_ptr<table> build_table(
   thrust::optional<cudf::device_span<size_type const>> explode_col_gather_map,
   thrust::optional<rmm::device_uvector<size_type>> position_array,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   auto select_iter = thrust::make_transform_iterator(
     thrust::make_counting_iterator(0),
@@ -113,7 +114,7 @@ std::unique_ptr<table> build_table(
 std::unique_ptr<table> explode(table_view const& input_table,
                                size_type const explode_column_idx,
                                rmm::cuda_stream_view stream,
-                               rmm::mr::device_memory_resource* mr)
+                               rmm::device_async_resource_ref mr)
 {
   lists_column_view explode_col{input_table.column(explode_column_idx)};
   auto sliced_child = explode_col.get_sliced_child(stream);
@@ -151,7 +152,7 @@ std::unique_ptr<table> explode(table_view const& input_table,
 std::unique_ptr<table> explode_position(table_view const& input_table,
                                         size_type const explode_column_idx,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr)
+                                        rmm::device_async_resource_ref mr)
 {
   lists_column_view explode_col{input_table.column(explode_column_idx)};
   auto sliced_child = explode_col.get_sliced_child(stream);
@@ -202,7 +203,7 @@ std::unique_ptr<table> explode_outer(table_view const& input_table,
                                      size_type const explode_column_idx,
                                      bool include_position,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
 {
   lists_column_view explode_col{input_table.column(explode_column_idx)};
   auto sliced_child  = explode_col.get_sliced_child(stream);
@@ -299,11 +300,11 @@ std::unique_ptr<table> explode_outer(table_view const& input_table,
 }  // namespace detail
 
 /**
- * @copydoc cudf::explode(table_view const&, size_type, rmm::mr::device_memory_resource*)
+ * @copydoc cudf::explode(table_view const&, size_type, rmm::device_async_resource_ref)
  */
 std::unique_ptr<table> explode(table_view const& input_table,
                                size_type explode_column_idx,
-                               rmm::mr::device_memory_resource* mr)
+                               rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   CUDF_EXPECTS(input_table.column(explode_column_idx).type().id() == type_id::LIST,
@@ -312,11 +313,11 @@ std::unique_ptr<table> explode(table_view const& input_table,
 }
 
 /**
- * @copydoc cudf::explode_position(table_view const&, size_type, rmm::mr::device_memory_resource*)
+ * @copydoc cudf::explode_position(table_view const&, size_type, rmm::device_async_resource_ref)
  */
 std::unique_ptr<table> explode_position(table_view const& input_table,
                                         size_type explode_column_idx,
-                                        rmm::mr::device_memory_resource* mr)
+                                        rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   CUDF_EXPECTS(input_table.column(explode_column_idx).type().id() == type_id::LIST,
@@ -325,11 +326,11 @@ std::unique_ptr<table> explode_position(table_view const& input_table,
 }
 
 /**
- * @copydoc cudf::explode_outer(table_view const&, size_type, rmm::mr::device_memory_resource*)
+ * @copydoc cudf::explode_outer(table_view const&, size_type, rmm::device_async_resource_ref)
  */
 std::unique_ptr<table> explode_outer(table_view const& input_table,
                                      size_type explode_column_idx,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   CUDF_EXPECTS(input_table.column(explode_column_idx).type().id() == type_id::LIST,
@@ -340,11 +341,11 @@ std::unique_ptr<table> explode_outer(table_view const& input_table,
 
 /**
  * @copydoc cudf::explode_outer_position(table_view const&, size_type,
- * rmm::mr::device_memory_resource*)
+ * rmm::device_async_resource_ref)
  */
 std::unique_ptr<table> explode_outer_position(table_view const& input_table,
                                               size_type explode_column_idx,
-                                              rmm::mr::device_memory_resource* mr)
+                                              rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   CUDF_EXPECTS(input_table.column(explode_column_idx).type().id() == type_id::LIST,
diff --git a/cpp/src/lists/extract.cu b/cpp/src/lists/extract.cu
index 365e9ef8255..c0ce86fb56e 100644
--- a/cpp/src/lists/extract.cu
+++ b/cpp/src/lists/extract.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -29,6 +29,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/copy.h>
 #include <thrust/fill.h>
@@ -118,7 +119,7 @@ template <typename index_t>
 std::unique_ptr<column> extract_list_element_impl(lists_column_view lists_column,
                                                   index_t const& index,
                                                   rmm::cuda_stream_view stream,
-                                                  rmm::mr::device_memory_resource* mr)
+                                                  rmm::device_async_resource_ref mr)
 {
   auto const num_lists = lists_column.size();
   if (num_lists == 0) { return empty_like(lists_column.child()); }
@@ -174,7 +175,7 @@ std::unique_ptr<column> extract_list_element_impl(lists_column_view lists_column
 std::unique_ptr<column> extract_list_element(lists_column_view lists_column,
                                              size_type const index,
                                              rmm::cuda_stream_view stream,
-                                             rmm::mr::device_memory_resource* mr)
+                                             rmm::device_async_resource_ref mr)
 {
   return detail::extract_list_element_impl(lists_column, index, stream, mr);
 }
@@ -182,7 +183,7 @@ std::unique_ptr<column> extract_list_element(lists_column_view lists_column,
 std::unique_ptr<column> extract_list_element(lists_column_view lists_column,
                                              column_view const& indices,
                                              rmm::cuda_stream_view stream,
-                                             rmm::mr::device_memory_resource* mr)
+                                             rmm::device_async_resource_ref mr)
 {
   return detail::extract_list_element_impl(lists_column, indices, stream, mr);
 }
@@ -192,12 +193,12 @@ std::unique_ptr<column> extract_list_element(lists_column_view lists_column,
 /**
  * @copydoc cudf::lists::extract_list_element(lists_column_view const&,
  *                                            size_type,
- *                                            rmm::mr::device_memory_resource*)
+ *                                            rmm::device_async_resource_ref)
  */
 std::unique_ptr<column> extract_list_element(lists_column_view const& lists_column,
                                              size_type index,
                                              rmm::cuda_stream_view stream,
-                                             rmm::mr::device_memory_resource* mr)
+                                             rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::extract_list_element(lists_column, index, stream, mr);
@@ -206,12 +207,12 @@ std::unique_ptr<column> extract_list_element(lists_column_view const& lists_colu
 /**
  * @copydoc cudf::lists::extract_list_element(lists_column_view const&,
  *                                            column_view const&,
- *                                            rmm::mr::device_memory_resource*)
+ *                                            rmm::device_async_resource_ref)
  */
 std::unique_ptr<column> extract_list_element(lists_column_view const& lists_column,
                                              column_view const& indices,
                                              rmm::cuda_stream_view stream,
-                                             rmm::mr::device_memory_resource* mr)
+                                             rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   CUDF_EXPECTS(indices.size() == lists_column.size(),
diff --git a/cpp/src/lists/interleave_columns.cu b/cpp/src/lists/interleave_columns.cu
index 478b6c9a209..88eccf13f72 100644
--- a/cpp/src/lists/interleave_columns.cu
+++ b/cpp/src/lists/interleave_columns.cu
@@ -29,6 +29,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/copy.h>
@@ -51,7 +52,7 @@ std::pair<std::unique_ptr<column>, rmm::device_uvector<int8_t>>
 generate_list_offsets_and_validities(table_view const& input,
                                      bool has_null_mask,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
 {
   auto const num_cols         = input.num_columns();
   auto const num_rows         = input.num_rows();
@@ -99,7 +100,7 @@ generate_list_offsets_and_validities(table_view const& input,
  */
 std::unique_ptr<column> concatenate_and_gather_lists(host_span<column_view const> columns_to_concat,
                                                      rmm::cuda_stream_view stream,
-                                                     rmm::mr::device_memory_resource* mr)
+                                                     rmm::device_async_resource_ref mr)
 {
   // Concatenate all columns into a single (temporary) column.
   auto const concatenated_col =
@@ -218,7 +219,7 @@ struct interleave_list_entries_impl<T, std::enable_if_t<std::is_same_v<T, cudf::
                                      size_type num_output_entries,
                                      bool data_has_null_mask,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr) const noexcept
+                                     rmm::device_async_resource_ref mr) const noexcept
   {
     auto const table_dv_ptr = table_device_view::create(input, stream);
     auto comp_fn            = compute_string_sizes_and_interleave_lists_fn{
@@ -250,7 +251,7 @@ struct interleave_list_entries_impl<T, std::enable_if_t<cudf::is_fixed_width<T>(
                                      size_type num_output_entries,
                                      bool data_has_null_mask,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr) const noexcept
+                                     rmm::device_async_resource_ref mr) const noexcept
   {
     auto const num_cols     = input.num_columns();
     auto const table_dv_ptr = table_device_view::create(input, stream);
@@ -329,7 +330,7 @@ struct interleave_list_entries_fn {
                                      size_type num_output_entries,
                                      bool data_has_null_mask,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr) const
+                                     rmm::device_async_resource_ref mr) const
   {
     return interleave_list_entries_impl<T>{}(input,
                                              output_list_offsets,
@@ -350,7 +351,7 @@ struct interleave_list_entries_fn {
 std::unique_ptr<column> interleave_columns(table_view const& input,
                                            bool has_null_mask,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
 {
   auto const entry_type = lists_column_view(*input.begin()).child().type();
   for (auto const& col : input) {
diff --git a/cpp/src/lists/lists_column_factories.cu b/cpp/src/lists/lists_column_factories.cu
index 278e5af07b2..66ad1c35c33 100644
--- a/cpp/src/lists/lists_column_factories.cu
+++ b/cpp/src/lists/lists_column_factories.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -25,6 +25,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/constant_iterator.h>
 #include <thrust/sequence.h>
@@ -36,7 +37,7 @@ namespace detail {
 std::unique_ptr<cudf::column> make_lists_column_from_scalar(list_scalar const& value,
                                                             size_type size,
                                                             rmm::cuda_stream_view stream,
-                                                            rmm::mr::device_memory_resource* mr)
+                                                            rmm::device_async_resource_ref mr)
 {
   if (size == 0) {
     return make_lists_column(0,
@@ -84,7 +85,7 @@ std::unique_ptr<cudf::column> make_lists_column_from_scalar(list_scalar const& v
 
 std::unique_ptr<column> make_empty_lists_column(data_type child_type,
                                                 rmm::cuda_stream_view stream,
-                                                rmm::mr::device_memory_resource* mr)
+                                                rmm::device_async_resource_ref mr)
 {
   auto offsets = make_empty_column(data_type(type_to_id<size_type>()));
   auto child   = make_empty_column(child_type);
@@ -95,7 +96,7 @@ std::unique_ptr<column> make_empty_lists_column(data_type child_type,
 std::unique_ptr<column> make_all_nulls_lists_column(size_type size,
                                                     data_type child_type,
                                                     rmm::cuda_stream_view stream,
-                                                    rmm::mr::device_memory_resource* mr)
+                                                    rmm::device_async_resource_ref mr)
 {
   auto offsets = [&] {
     auto offsets_buff =
@@ -120,7 +121,7 @@ std::unique_ptr<column> make_lists_column(size_type num_rows,
                                           size_type null_count,
                                           rmm::device_buffer&& null_mask,
                                           rmm::cuda_stream_view stream,
-                                          rmm::mr::device_memory_resource* mr)
+                                          rmm::device_async_resource_ref mr)
 {
   if (null_count > 0) { CUDF_EXPECTS(null_mask.size() > 0, "Column with nulls must be nullable."); }
   CUDF_EXPECTS(
diff --git a/cpp/src/lists/reverse.cu b/cpp/src/lists/reverse.cu
index 6c00f8b64b4..d913ce070ae 100644
--- a/cpp/src/lists/reverse.cu
+++ b/cpp/src/lists/reverse.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -27,6 +27,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/for_each.h>
 #include <thrust/iterator/counting_iterator.h>
@@ -36,7 +37,7 @@ namespace detail {
 
 std::unique_ptr<column> reverse(lists_column_view const& input,
                                 rmm::cuda_stream_view stream,
-                                rmm::mr::device_memory_resource* mr)
+                                rmm::device_async_resource_ref mr)
 {
   if (input.is_empty()) { return cudf::empty_like(input.parent()); }
 
@@ -88,7 +89,7 @@ std::unique_ptr<column> reverse(lists_column_view const& input,
 
 std::unique_ptr<column> reverse(lists_column_view const& input,
                                 rmm::cuda_stream_view stream,
-                                rmm::mr::device_memory_resource* mr)
+                                rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::reverse(input, stream, mr);
diff --git a/cpp/src/lists/segmented_sort.cu b/cpp/src/lists/segmented_sort.cu
index 0b70773f4b2..f920fb916eb 100644
--- a/cpp/src/lists/segmented_sort.cu
+++ b/cpp/src/lists/segmented_sort.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -27,6 +27,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/transform.h>
 
@@ -43,7 +44,7 @@ namespace {
  */
 std::unique_ptr<column> build_output_offsets(lists_column_view const& input,
                                              rmm::cuda_stream_view stream,
-                                             rmm::mr::device_memory_resource* mr)
+                                             rmm::device_async_resource_ref mr)
 {
   auto output_offset = make_numeric_column(
     input.offsets().type(), input.size() + 1, mask_state::UNALLOCATED, stream, mr);
@@ -63,7 +64,7 @@ std::unique_ptr<column> sort_lists(lists_column_view const& input,
                                    order column_order,
                                    null_order null_precedence,
                                    rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr)
+                                   rmm::device_async_resource_ref mr)
 {
   if (input.is_empty()) return empty_like(input.parent());
 
@@ -91,7 +92,7 @@ std::unique_ptr<column> stable_sort_lists(lists_column_view const& input,
                                           order column_order,
                                           null_order null_precedence,
                                           rmm::cuda_stream_view stream,
-                                          rmm::mr::device_memory_resource* mr)
+                                          rmm::device_async_resource_ref mr)
 {
   if (input.is_empty()) { return empty_like(input.parent()); }
 
@@ -120,7 +121,7 @@ std::unique_ptr<column> sort_lists(lists_column_view const& input,
                                    order column_order,
                                    null_order null_precedence,
                                    rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr)
+                                   rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::sort_lists(input, column_order, null_precedence, stream, mr);
@@ -130,7 +131,7 @@ std::unique_ptr<column> stable_sort_lists(lists_column_view const& input,
                                           order column_order,
                                           null_order null_precedence,
                                           rmm::cuda_stream_view stream,
-                                          rmm::mr::device_memory_resource* mr)
+                                          rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::stable_sort_lists(input, column_order, null_precedence, stream, mr);
diff --git a/cpp/src/lists/sequences.cu b/cpp/src/lists/sequences.cu
index f92ba782da7..cb14ae7619b 100644
--- a/cpp/src/lists/sequences.cu
+++ b/cpp/src/lists/sequences.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -26,6 +26,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/binary_search.h>
 #include <thrust/distance.h>
@@ -88,7 +89,7 @@ struct sequences_dispatcher {
                                      std::optional<column_view> const& steps,
                                      size_type const* offsets,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     return sequences_functor<T>::invoke(n_lists, n_elements, starts, steps, offsets, stream, mr);
   }
@@ -108,7 +109,7 @@ struct sequences_functor<T, std::enable_if_t<is_supported<T>()>> {
                                         std::optional<column_view> const& steps,
                                         size_type const* offsets,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr)
+                                        rmm::device_async_resource_ref mr)
   {
     auto result =
       make_fixed_width_column(starts.type(), n_elements, mask_state::UNALLOCATED, stream, mr);
@@ -132,7 +133,7 @@ std::unique_ptr<column> sequences(column_view const& starts,
                                   std::optional<column_view> const& steps,
                                   column_view const& sizes,
                                   rmm::cuda_stream_view stream,
-                                  rmm::mr::device_memory_resource* mr)
+                                  rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(!starts.has_nulls() && !sizes.has_nulls(),
                "starts and sizes input columns must not have nulls.");
@@ -190,7 +191,7 @@ std::unique_ptr<column> sequences(column_view const& starts,
 std::unique_ptr<column> sequences(column_view const& starts,
                                   column_view const& sizes,
                                   rmm::cuda_stream_view stream,
-                                  rmm::mr::device_memory_resource* mr)
+                                  rmm::device_async_resource_ref mr)
 {
   return sequences(starts, std::nullopt, sizes, stream, mr);
 }
@@ -199,7 +200,7 @@ std::unique_ptr<column> sequences(column_view const& starts,
                                   column_view const& steps,
                                   column_view const& sizes,
                                   rmm::cuda_stream_view stream,
-                                  rmm::mr::device_memory_resource* mr)
+                                  rmm::device_async_resource_ref mr)
 {
   return sequences(starts, std::optional<column_view>{steps}, sizes, stream, mr);
 }
@@ -209,7 +210,7 @@ std::unique_ptr<column> sequences(column_view const& starts,
 std::unique_ptr<column> sequences(column_view const& starts,
                                   column_view const& sizes,
                                   rmm::cuda_stream_view stream,
-                                  rmm::mr::device_memory_resource* mr)
+                                  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::sequences(starts, sizes, stream, mr);
@@ -219,7 +220,7 @@ std::unique_ptr<column> sequences(column_view const& starts,
                                   column_view const& steps,
                                   column_view const& sizes,
                                   rmm::cuda_stream_view stream,
-                                  rmm::mr::device_memory_resource* mr)
+                                  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::sequences(starts, steps, sizes, stream, mr);
diff --git a/cpp/src/lists/set_operations.cu b/cpp/src/lists/set_operations.cu
index 5735c84e3d3..f3352a3a52d 100644
--- a/cpp/src/lists/set_operations.cu
+++ b/cpp/src/lists/set_operations.cu
@@ -31,6 +31,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/distance.h>
 #include <thrust/functional.h>
@@ -62,7 +63,7 @@ std::unique_ptr<column> have_overlap(lists_column_view const& lhs,
                                      null_equality nulls_equal,
                                      nan_equality nans_equal,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
 {
   check_compatibility(lhs, rhs);
 
@@ -132,7 +133,7 @@ std::unique_ptr<column> intersect_distinct(lists_column_view const& lhs,
                                            null_equality nulls_equal,
                                            nan_equality nans_equal,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
 {
   check_compatibility(lhs, rhs);
 
@@ -193,7 +194,7 @@ std::unique_ptr<column> union_distinct(lists_column_view const& lhs,
                                        null_equality nulls_equal,
                                        nan_equality nans_equal,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr)
+                                       rmm::device_async_resource_ref mr)
 {
   check_compatibility(lhs, rhs);
 
@@ -214,7 +215,7 @@ std::unique_ptr<column> difference_distinct(lists_column_view const& lhs,
                                             null_equality nulls_equal,
                                             nan_equality nans_equal,
                                             rmm::cuda_stream_view stream,
-                                            rmm::mr::device_memory_resource* mr)
+                                            rmm::device_async_resource_ref mr)
 {
   check_compatibility(lhs, rhs);
 
@@ -279,7 +280,7 @@ std::unique_ptr<column> have_overlap(lists_column_view const& lhs,
                                      null_equality nulls_equal,
                                      nan_equality nans_equal,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::have_overlap(lhs, rhs, nulls_equal, nans_equal, stream, mr);
@@ -290,7 +291,7 @@ std::unique_ptr<column> intersect_distinct(lists_column_view const& lhs,
                                            null_equality nulls_equal,
                                            nan_equality nans_equal,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::intersect_distinct(lhs, rhs, nulls_equal, nans_equal, stream, mr);
@@ -301,7 +302,7 @@ std::unique_ptr<column> union_distinct(lists_column_view const& lhs,
                                        null_equality nulls_equal,
                                        nan_equality nans_equal,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr)
+                                       rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::union_distinct(lhs, rhs, nulls_equal, nans_equal, stream, mr);
@@ -312,7 +313,7 @@ std::unique_ptr<column> difference_distinct(lists_column_view const& lhs,
                                             null_equality nulls_equal,
                                             nan_equality nans_equal,
                                             rmm::cuda_stream_view stream,
-                                            rmm::mr::device_memory_resource* mr)
+                                            rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::difference_distinct(lhs, rhs, nulls_equal, nans_equal, stream, mr);
diff --git a/cpp/src/lists/stream_compaction/apply_boolean_mask.cu b/cpp/src/lists/stream_compaction/apply_boolean_mask.cu
index ce972d89150..71aafa3ce12 100644
--- a/cpp/src/lists/stream_compaction/apply_boolean_mask.cu
+++ b/cpp/src/lists/stream_compaction/apply_boolean_mask.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -29,6 +29,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/reduce.h>
 #include <thrust/scan.h>
@@ -39,7 +40,7 @@ namespace detail {
 std::unique_ptr<column> apply_boolean_mask(lists_column_view const& input,
                                            lists_column_view const& boolean_mask,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(boolean_mask.child().type().id() == type_id::BOOL8, "Mask must be of type BOOL8.");
   CUDF_EXPECTS(input.size() == boolean_mask.size(),
@@ -102,7 +103,7 @@ std::unique_ptr<column> apply_boolean_mask(lists_column_view const& input,
 std::unique_ptr<column> apply_boolean_mask(lists_column_view const& input,
                                            lists_column_view const& boolean_mask,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::apply_boolean_mask(input, boolean_mask, stream, mr);
diff --git a/cpp/src/lists/stream_compaction/distinct.cu b/cpp/src/lists/stream_compaction/distinct.cu
index c8d9c15706f..40dee010bd5 100644
--- a/cpp/src/lists/stream_compaction/distinct.cu
+++ b/cpp/src/lists/stream_compaction/distinct.cu
@@ -26,6 +26,7 @@
 #include <cudf/table/table_view.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <memory>
 #include <utility>
@@ -37,7 +38,7 @@ std::unique_ptr<column> distinct(lists_column_view const& input,
                                  null_equality nulls_equal,
                                  nan_equality nans_equal,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   // Algorithm:
   // - Generate labels for the child elements.
@@ -77,7 +78,7 @@ std::unique_ptr<column> distinct(lists_column_view const& input,
                                  null_equality nulls_equal,
                                  nan_equality nans_equal,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::distinct(input, nulls_equal, nans_equal, stream, mr);
diff --git a/cpp/src/lists/utilities.cu b/cpp/src/lists/utilities.cu
index 2c4966c969e..7fb960f02ca 100644
--- a/cpp/src/lists/utilities.cu
+++ b/cpp/src/lists/utilities.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,12 +20,14 @@
 #include <cudf/copying.hpp>
 #include <cudf/detail/labeling/label_segments.cuh>
 
+#include <rmm/resource_ref.hpp>
+
 namespace cudf::lists::detail {
 
 std::unique_ptr<column> generate_labels(lists_column_view const& input,
                                         size_type n_elements,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr)
+                                        rmm::device_async_resource_ref mr)
 {
   auto labels = make_numeric_column(
     data_type(type_to_id<size_type>()), n_elements, cudf::mask_state::UNALLOCATED, stream, mr);
@@ -38,7 +40,7 @@ std::unique_ptr<column> generate_labels(lists_column_view const& input,
 std::unique_ptr<column> reconstruct_offsets(column_view const& labels,
                                             size_type n_lists,
                                             rmm::cuda_stream_view stream,
-                                            rmm::mr::device_memory_resource* mr)
+                                            rmm::device_async_resource_ref mr)
 
 {
   auto out_offsets = make_numeric_column(
@@ -56,7 +58,7 @@ std::unique_ptr<column> reconstruct_offsets(column_view const& labels,
 
 std::unique_ptr<column> get_normalized_offsets(lists_column_view const& input,
                                                rmm::cuda_stream_view stream,
-                                               rmm::mr::device_memory_resource* mr)
+                                               rmm::device_async_resource_ref mr)
 {
   if (input.is_empty()) { return empty_like(input.offsets()); }
 
diff --git a/cpp/src/lists/utilities.hpp b/cpp/src/lists/utilities.hpp
index c881e828677..218ad7872e9 100644
--- a/cpp/src/lists/utilities.hpp
+++ b/cpp/src/lists/utilities.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf::lists::detail {
 
@@ -36,7 +37,7 @@ namespace cudf::lists::detail {
 std::unique_ptr<column> generate_labels(lists_column_view const& input,
                                         size_type n_elements,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr);
+                                        rmm::device_async_resource_ref mr);
 
 /**
  * @brief Reconstruct an offsets column from the input list labels column.
@@ -50,7 +51,7 @@ std::unique_ptr<column> generate_labels(lists_column_view const& input,
 std::unique_ptr<column> reconstruct_offsets(column_view const& labels,
                                             size_type n_lists,
                                             rmm::cuda_stream_view stream,
-                                            rmm::mr::device_memory_resource* mr);
+                                            rmm::device_async_resource_ref mr);
 
 /**
  * @brief Generate 0-based list offsets from the offsets of the input lists column.
@@ -62,6 +63,6 @@ std::unique_ptr<column> reconstruct_offsets(column_view const& labels,
  */
 std::unique_ptr<column> get_normalized_offsets(lists_column_view const& input,
                                                rmm::cuda_stream_view stream,
-                                               rmm::mr::device_memory_resource* mr);
+                                               rmm::device_async_resource_ref mr);
 
 }  // namespace cudf::lists::detail
diff --git a/cpp/src/merge/merge.cu b/cpp/src/merge/merge.cu
index 8be503025bd..4463b16df78 100644
--- a/cpp/src/merge/merge.cu
+++ b/cpp/src/merge/merge.cu
@@ -38,6 +38,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/binary_search.h>
@@ -347,7 +348,7 @@ struct column_merger {
   std::unique_ptr<column> operator()(column_view const&,
                                      column_view const&,
                                      rmm::cuda_stream_view,
-                                     rmm::mr::device_memory_resource*) const
+                                     rmm::device_async_resource_ref) const
   {
     CUDF_FAIL("Unsupported type for merge.");
   }
@@ -359,7 +360,7 @@ struct column_merger {
     column_view const& lcol,
     column_view const& rcol,
     rmm::cuda_stream_view stream,
-    rmm::mr::device_memory_resource* mr) const
+    rmm::device_async_resource_ref mr) const
   {
     auto lsz         = lcol.size();
     auto merged_size = lsz + rcol.size();
@@ -431,7 +432,7 @@ std::unique_ptr<column> column_merger::operator()<cudf::string_view>(
   column_view const& lcol,
   column_view const& rcol,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr) const
+  rmm::device_async_resource_ref mr) const
 {
   auto column = strings::detail::merge<index_type>(strings_column_view(lcol),
                                                    strings_column_view(rcol),
@@ -453,7 +454,7 @@ std::unique_ptr<column> column_merger::operator()<cudf::dictionary32>(
   column_view const& lcol,
   column_view const& rcol,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr) const
+  rmm::device_async_resource_ref mr) const
 {
   auto result = cudf::dictionary::detail::merge(
     cudf::dictionary_column_view(lcol), cudf::dictionary_column_view(rcol), row_order_, stream, mr);
@@ -473,7 +474,7 @@ std::unique_ptr<column> column_merger::operator()<cudf::list_view>(
   column_view const& lcol,
   column_view const& rcol,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr) const
+  rmm::device_async_resource_ref mr) const
 {
   std::vector<column_view> columns{lcol, rcol};
   auto concatenated_list = cudf::lists::detail::concatenate(columns, stream, mr);
@@ -501,7 +502,7 @@ std::unique_ptr<column> column_merger::operator()<cudf::struct_view>(
   column_view const& lcol,
   column_view const& rcol,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr) const
+  rmm::device_async_resource_ref mr) const
 {
   // merge each child.
   auto const lhs = structs_column_view{lcol};
@@ -550,7 +551,7 @@ table_ptr_type merge(cudf::table_view const& left_table,
                      std::vector<cudf::order> const& column_order,
                      std::vector<cudf::null_order> const& null_precedence,
                      rmm::cuda_stream_view stream,
-                     rmm::mr::device_memory_resource* mr)
+                     rmm::device_async_resource_ref mr)
 {
   // collect index columns for lhs, rhs, resp.
   //
@@ -620,7 +621,7 @@ table_ptr_type merge(std::vector<table_view> const& tables_to_merge,
                      std::vector<cudf::order> const& column_order,
                      std::vector<cudf::null_order> const& null_precedence,
                      rmm::cuda_stream_view stream,
-                     rmm::mr::device_memory_resource* mr)
+                     rmm::device_async_resource_ref mr)
 {
   if (tables_to_merge.empty()) { return std::make_unique<cudf::table>(); }
 
@@ -702,7 +703,7 @@ std::unique_ptr<cudf::table> merge(std::vector<table_view> const& tables_to_merg
                                    std::vector<cudf::size_type> const& key_cols,
                                    std::vector<cudf::order> const& column_order,
                                    std::vector<cudf::null_order> const& null_precedence,
-                                   rmm::mr::device_memory_resource* mr)
+                                   rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::merge(
diff --git a/cpp/src/partitioning/partitioning.cu b/cpp/src/partitioning/partitioning.cu
index 0d2daaddb8c..f10388794fc 100644
--- a/cpp/src/partitioning/partitioning.cu
+++ b/cpp/src/partitioning/partitioning.cu
@@ -31,6 +31,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cub/block/block_scan.cuh>
 #include <cub/device/device_histogram.cuh>
@@ -413,7 +414,7 @@ struct copy_block_partitions_dispatcher {
                                      size_type const* scanned_block_partition_sizes,
                                      size_type grid_size,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     rmm::device_buffer output(input.size() * sizeof(DataType), stream, mr);
 
@@ -441,7 +442,7 @@ struct copy_block_partitions_dispatcher {
                                      size_type const* scanned_block_partition_sizes,
                                      size_type grid_size,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     // Use move_to_output_buffer to create an equivalent gather map
     auto gather_map = compute_gather_map(input.size(),
@@ -471,7 +472,7 @@ std::pair<std::unique_ptr<table>, std::vector<size_type>> hash_partition_table(
   size_type num_partitions,
   uint32_t seed,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   auto const num_rows = table_to_hash.num_rows();
 
@@ -658,7 +659,7 @@ struct dispatch_map_type {
              column_view const& partition_map,
              size_type num_partitions,
              rmm::cuda_stream_view stream,
-             rmm::mr::device_memory_resource* mr) const
+             rmm::device_async_resource_ref mr) const
   {
     // Build a histogram of the number of rows in each partition
     rmm::device_uvector<size_type> histogram(num_partitions + 1, stream);
@@ -761,7 +762,7 @@ std::pair<std::unique_ptr<table>, std::vector<size_type>> hash_partition(
   int num_partitions,
   uint32_t seed,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   auto table_to_hash = input.select(columns_to_hash);
 
@@ -785,7 +786,7 @@ std::pair<std::unique_ptr<table>, std::vector<size_type>> partition(
   column_view const& partition_map,
   size_type num_partitions,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(t.num_rows() == partition_map.size(),
                "Size mismatch between table and partition map.");
@@ -809,7 +810,7 @@ std::pair<std::unique_ptr<table>, std::vector<size_type>> hash_partition(
   hash_id hash_function,
   uint32_t seed,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
 
@@ -833,7 +834,7 @@ std::pair<std::unique_ptr<table>, std::vector<size_type>> partition(
   table_view const& t,
   column_view const& partition_map,
   size_type num_partitions,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::partition(t, partition_map, num_partitions, cudf::get_default_stream(), mr);
diff --git a/cpp/src/partitioning/round_robin.cu b/cpp/src/partitioning/round_robin.cu
index 3283a7c35ee..82b169c78ed 100644
--- a/cpp/src/partitioning/round_robin.cu
+++ b/cpp/src/partitioning/round_robin.cu
@@ -30,6 +30,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/copy.h>
@@ -83,7 +84,7 @@ std::pair<std::unique_ptr<cudf::table>, std::vector<cudf::size_type>> degenerate
   cudf::size_type num_partitions,
   cudf::size_type start_partition,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   auto nrows = input.num_rows();
 
@@ -157,7 +158,7 @@ std::pair<std::unique_ptr<table>, std::vector<cudf::size_type>> round_robin_part
   cudf::size_type num_partitions,
   cudf::size_type start_partition,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   auto nrows = input.num_rows();
 
@@ -270,8 +271,8 @@ std::pair<std::unique_ptr<table>, std::vector<cudf::size_type>> round_robin_part
 std::pair<std::unique_ptr<cudf::table>, std::vector<cudf::size_type>> round_robin_partition(
   table_view const& input,
   cudf::size_type num_partitions,
-  cudf::size_type start_partition     = 0,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
+  cudf::size_type start_partition   = 0,
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
 {
   CUDF_FUNC_RANGE();
   return detail::round_robin_partition(
diff --git a/cpp/src/quantiles/quantile.cu b/cpp/src/quantiles/quantile.cu
index cba7203483b..b25254cfe49 100644
--- a/cpp/src/quantiles/quantile.cu
+++ b/cpp/src/quantiles/quantile.cu
@@ -33,6 +33,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/iterator/counting_iterator.h>
@@ -53,7 +54,7 @@ struct quantile_functor {
   interpolation interp;
   bool retain_types;
   rmm::cuda_stream_view stream;
-  rmm::mr::device_memory_resource* mr;
+  rmm::device_async_resource_ref mr;
 
   template <typename T>
   std::enable_if_t<not std::is_arithmetic_v<T> and not cudf::is_fixed_point<T>(),
@@ -145,7 +146,7 @@ std::unique_ptr<column> quantile(column_view const& input,
                                  interpolation interp,
                                  bool retain_types,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   auto functor = quantile_functor<exact, SortMapIterator>{
     ordered_indices, size, q, interp, retain_types, stream, mr};
@@ -163,7 +164,7 @@ std::unique_ptr<column> quantile(column_view const& input,
                                  column_view const& indices,
                                  bool exact,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   if (indices.is_empty()) {
     auto begin = thrust::make_counting_iterator<size_type>(0);
@@ -193,7 +194,7 @@ std::unique_ptr<column> quantile(column_view const& input,
                                  interpolation interp,
                                  column_view const& ordered_indices,
                                  bool exact,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::quantile(input, q, interp, ordered_indices, exact, cudf::get_default_stream(), mr);
diff --git a/cpp/src/quantiles/quantiles.cu b/cpp/src/quantiles/quantiles.cu
index 8fee821dfc4..c0f536536ce 100644
--- a/cpp/src/quantiles/quantiles.cu
+++ b/cpp/src/quantiles/quantiles.cu
@@ -27,6 +27,7 @@
 #include <cudf/utilities/error.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/iterator/counting_iterator.h>
@@ -43,7 +44,7 @@ std::unique_ptr<table> quantiles(table_view const& input,
                                  std::vector<double> const& q,
                                  interpolation interp,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   auto quantile_idx_lookup = cuda::proclaim_return_type<size_type>(
     [sortmap, interp, size = input.num_rows()] __device__(double q) {
@@ -71,7 +72,7 @@ std::unique_ptr<table> quantiles(table_view const& input,
                                  std::vector<order> const& column_order,
                                  std::vector<null_order> const& null_precedence,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   if (q.empty()) { return empty_like(input); }
 
@@ -99,7 +100,7 @@ std::unique_ptr<table> quantiles(table_view const& input,
                                  cudf::sorted is_input_sorted,
                                  std::vector<order> const& column_order,
                                  std::vector<null_order> const& null_precedence,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::quantiles(input,
diff --git a/cpp/src/quantiles/tdigest/tdigest.cu b/cpp/src/quantiles/tdigest/tdigest.cu
index 96b0355c6e5..da36b7ab1da 100644
--- a/cpp/src/quantiles/tdigest/tdigest.cu
+++ b/cpp/src/quantiles/tdigest/tdigest.cu
@@ -27,6 +27,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/advance.h>
@@ -184,7 +185,7 @@ CUDF_KERNEL void compute_percentiles_kernel(device_span<size_type const> tdigest
 std::unique_ptr<column> compute_approx_percentiles(tdigest_column_view const& input,
                                                    column_view const& percentiles,
                                                    rmm::cuda_stream_view stream,
-                                                   rmm::mr::device_memory_resource* mr)
+                                                   rmm::device_async_resource_ref mr)
 {
   tdigest_column_view tdv(input);
 
@@ -259,7 +260,7 @@ std::unique_ptr<column> make_tdigest_column(size_type num_rows,
                                             std::unique_ptr<column>&& min_values,
                                             std::unique_ptr<column>&& max_values,
                                             rmm::cuda_stream_view stream,
-                                            rmm::mr::device_memory_resource* mr)
+                                            rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(tdigest_offsets->size() == num_rows + 1,
                "Encountered unexpected offset count in make_tdigest_column");
@@ -291,7 +292,7 @@ std::unique_ptr<column> make_tdigest_column(size_type num_rows,
 }
 
 std::unique_ptr<column> make_empty_tdigest_column(rmm::cuda_stream_view stream,
-                                                  rmm::mr::device_memory_resource* mr)
+                                                  rmm::device_async_resource_ref mr)
 {
   auto offsets = cudf::make_fixed_width_column(
     data_type(type_id::INT32), 2, mask_state::UNALLOCATED, stream, mr);
@@ -334,7 +335,7 @@ std::unique_ptr<column> make_empty_tdigest_column(rmm::cuda_stream_view stream,
  * @returns An empty tdigest scalar.
  */
 std::unique_ptr<scalar> make_empty_tdigest_scalar(rmm::cuda_stream_view stream,
-                                                  rmm::mr::device_memory_resource* mr)
+                                                  rmm::device_async_resource_ref mr)
 {
   auto contents = make_empty_tdigest_column(stream, mr)->release();
   return std::make_unique<struct_scalar>(
@@ -346,7 +347,7 @@ std::unique_ptr<scalar> make_empty_tdigest_scalar(rmm::cuda_stream_view stream,
 std::unique_ptr<column> percentile_approx(tdigest_column_view const& input,
                                           column_view const& percentiles,
                                           rmm::cuda_stream_view stream,
-                                          rmm::mr::device_memory_resource* mr)
+                                          rmm::device_async_resource_ref mr)
 {
   tdigest_column_view tdv(input);
   CUDF_EXPECTS(percentiles.type().id() == type_id::FLOAT64,
@@ -407,7 +408,7 @@ std::unique_ptr<column> percentile_approx(tdigest_column_view const& input,
 
 std::unique_ptr<column> percentile_approx(tdigest_column_view const& input,
                                           column_view const& percentiles,
-                                          rmm::mr::device_memory_resource* mr)
+                                          rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return tdigest::percentile_approx(input, percentiles, cudf::get_default_stream(), mr);
diff --git a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu
index 8544d9caa56..229af89fc46 100644
--- a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu
+++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu
@@ -33,6 +33,7 @@
 
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/advance.h>
@@ -333,7 +334,7 @@ __device__ double scale_func_k1(double quantile, double delta_norm)
 // convert a single-row tdigest column to a scalar.
 std::unique_ptr<scalar> to_tdigest_scalar(std::unique_ptr<column>&& tdigest,
                                           rmm::cuda_stream_view stream,
-                                          rmm::mr::device_memory_resource* mr)
+                                          rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(tdigest->size() == 1,
                "Encountered invalid tdigest column when converting to scalar");
@@ -517,7 +518,7 @@ generate_group_cluster_info(int delta,
                             CumulativeWeight cumulative_weight,
                             bool has_nulls,
                             rmm::cuda_stream_view stream,
-                            rmm::mr::device_memory_resource* mr)
+                            rmm::device_async_resource_ref mr)
 {
   constexpr size_type block_size = 256;
   cudf::detail::grid_1d const grid(num_groups, block_size);
@@ -581,7 +582,7 @@ std::unique_ptr<column> build_output_column(size_type num_rows,
                                             std::unique_ptr<column>&& max_col,
                                             bool has_nulls,
                                             rmm::cuda_stream_view stream,
-                                            rmm::mr::device_memory_resource* mr)
+                                            rmm::device_async_resource_ref mr)
 {
   // whether or not this weight is a stub
   auto is_stub_weight = [weights = weights->view().begin<double>()] __device__(size_type i) {
@@ -732,7 +733,7 @@ std::unique_ptr<column> compute_tdigests(int delta,
                                          size_type total_clusters,
                                          bool has_nulls,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr)
+                                         rmm::device_async_resource_ref mr)
 {
   // the output for each group is a column of data that represents the tdigest. since we want 1 row
   // per group, each row will be a list the length of the tdigest for that group. so our output
@@ -841,7 +842,7 @@ struct typed_group_tdigest {
                                      size_type num_groups,
                                      int delta,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     // first, generate cluster weight information for each input group
     auto [group_cluster_wl, group_cluster_offsets, total_clusters] = generate_group_cluster_info(
@@ -907,7 +908,7 @@ struct typed_reduce_tdigest {
   std::unique_ptr<scalar> operator()(column_view const& col,
                                      int delta,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     // treat this the same as the groupby path with a single group.  Note:  even though
     // there is only 1 group there are still multiple keys within the group that represent
@@ -1029,7 +1030,7 @@ std::unique_ptr<column> merge_tdigests(tdigest_column_view const& tdv,
                                        size_type num_groups,
                                        int max_centroids,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr)
+                                       rmm::device_async_resource_ref mr)
 {
   // thrust::merge and thrust::merge_by_key don't provide what we need.  What we would need is an
   // algorithm like a super-merge that takes two layers of keys: one which identifies the outer
@@ -1211,7 +1212,7 @@ std::unique_ptr<column> merge_tdigests(tdigest_column_view const& tdv,
 std::unique_ptr<scalar> reduce_tdigest(column_view const& col,
                                        int max_centroids,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr)
+                                       rmm::device_async_resource_ref mr)
 {
   if (col.size() == 0) { return cudf::tdigest::detail::make_empty_tdigest_scalar(stream, mr); }
 
@@ -1234,7 +1235,7 @@ struct group_offsets_fn {
 std::unique_ptr<scalar> reduce_merge_tdigest(column_view const& input,
                                              int max_centroids,
                                              rmm::cuda_stream_view stream,
-                                             rmm::mr::device_memory_resource* mr)
+                                             rmm::device_async_resource_ref mr)
 {
   tdigest_column_view tdv(input);
 
@@ -1264,7 +1265,7 @@ std::unique_ptr<column> group_tdigest(column_view const& col,
                                       size_type num_groups,
                                       int max_centroids,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr)
+                                      rmm::device_async_resource_ref mr)
 {
   if (col.size() == 0) { return cudf::tdigest::detail::make_empty_tdigest_column(stream, mr); }
 
@@ -1287,7 +1288,7 @@ std::unique_ptr<column> group_merge_tdigest(column_view const& input,
                                             size_type num_groups,
                                             int max_centroids,
                                             rmm::cuda_stream_view stream,
-                                            rmm::mr::device_memory_resource* mr)
+                                            rmm::device_async_resource_ref mr)
 {
   tdigest_column_view tdv(input);
 
diff --git a/cpp/src/reductions/all.cu b/cpp/src/reductions/all.cu
index 6cea4e4ada3..11b0e2732fe 100644
--- a/cpp/src/reductions/all.cu
+++ b/cpp/src/reductions/all.cu
@@ -19,6 +19,8 @@
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/reduction/detail/reduction_functions.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 #include <cuda/atomic>
 #include <thrust/for_each.h>
 #include <thrust/iterator/counting_iterator.h>
@@ -55,7 +57,7 @@ struct all_fn {
   template <typename T, std::enable_if_t<std::is_arithmetic_v<T>>* = nullptr>
   std::unique_ptr<scalar> operator()(column_view const& input,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     auto const d_dict = cudf::column_device_view::create(input, stream);
     auto const iter   = [&] {
@@ -74,7 +76,7 @@ struct all_fn {
   template <typename T, std::enable_if_t<!std::is_arithmetic_v<T>>* = nullptr>
   std::unique_ptr<scalar> operator()(column_view const&,
                                      rmm::cuda_stream_view,
-                                     rmm::mr::device_memory_resource*)
+                                     rmm::device_async_resource_ref)
   {
     CUDF_FAIL("Unexpected key type for dictionary in reduction all()");
   }
@@ -86,7 +88,7 @@ std::unique_ptr<cudf::scalar> all(column_view const& col,
                                   cudf::data_type const output_dtype,
                                   std::optional<std::reference_wrapper<scalar const>> init,
                                   rmm::cuda_stream_view stream,
-                                  rmm::mr::device_memory_resource* mr)
+                                  rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(output_dtype == cudf::data_type(cudf::type_id::BOOL8),
                "all() operation can be applied with output type `BOOL8` only");
diff --git a/cpp/src/reductions/any.cu b/cpp/src/reductions/any.cu
index c0c044a1e6f..0ebeb7a48b9 100644
--- a/cpp/src/reductions/any.cu
+++ b/cpp/src/reductions/any.cu
@@ -19,6 +19,8 @@
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/reduction/detail/reduction_functions.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 #include <cuda/atomic>
 #include <thrust/for_each.h>
 #include <thrust/iterator/counting_iterator.h>
@@ -55,7 +57,7 @@ struct any_fn {
   template <typename T, std::enable_if_t<std::is_arithmetic_v<T>>* = nullptr>
   std::unique_ptr<scalar> operator()(column_view const& input,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     auto const d_dict = cudf::column_device_view::create(input, stream);
     auto const iter   = [&] {
@@ -74,7 +76,7 @@ struct any_fn {
   template <typename T, std::enable_if_t<!std::is_arithmetic_v<T>>* = nullptr>
   std::unique_ptr<scalar> operator()(column_view const&,
                                      rmm::cuda_stream_view,
-                                     rmm::mr::device_memory_resource*)
+                                     rmm::device_async_resource_ref)
   {
     CUDF_FAIL("Unexpected key type for dictionary in reduction any()");
   }
@@ -86,7 +88,7 @@ std::unique_ptr<cudf::scalar> any(column_view const& col,
                                   cudf::data_type const output_dtype,
                                   std::optional<std::reference_wrapper<scalar const>> init,
                                   rmm::cuda_stream_view stream,
-                                  rmm::mr::device_memory_resource* mr)
+                                  rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(output_dtype == cudf::data_type(cudf::type_id::BOOL8),
                "any() operation can be applied with output type `bool8` only");
diff --git a/cpp/src/reductions/collect_ops.cu b/cpp/src/reductions/collect_ops.cu
index 743eddbffaf..c1a1f117ee1 100644
--- a/cpp/src/reductions/collect_ops.cu
+++ b/cpp/src/reductions/collect_ops.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,6 +23,8 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 namespace cudf {
 namespace reduction {
 namespace detail {
@@ -45,7 +47,7 @@ bool need_handle_nulls(column_view const& input, null_policy null_handling)
 std::unique_ptr<scalar> collect_list(column_view const& col,
                                      null_policy null_handling,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
 {
   if (need_handle_nulls(col, null_handling)) {
     auto d_view             = column_device_view::create(col, stream);
@@ -61,7 +63,7 @@ std::unique_ptr<scalar> collect_list(column_view const& col,
 
 std::unique_ptr<scalar> merge_lists(lists_column_view const& col,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr)
+                                    rmm::device_async_resource_ref mr)
 {
   auto flatten_col = col.get_sliced_child(stream);
   return make_list_scalar(flatten_col, stream, mr);
@@ -72,7 +74,7 @@ std::unique_ptr<scalar> collect_set(column_view const& col,
                                     null_equality nulls_equal,
                                     nan_equality nans_equal,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr)
+                                    rmm::device_async_resource_ref mr)
 {
   // `input_as_collect_list` is the result of the input column that has been processed to obey
   // the given null handling behavior.
@@ -101,7 +103,7 @@ std::unique_ptr<scalar> merge_sets(lists_column_view const& col,
                                    null_equality nulls_equal,
                                    nan_equality nans_equal,
                                    rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr)
+                                   rmm::device_async_resource_ref mr)
 {
   auto flatten_col    = col.get_sliced_child(stream);
   auto distinct_table = cudf::detail::distinct(table_view{{flatten_col}},
diff --git a/cpp/src/reductions/compound.cuh b/cpp/src/reductions/compound.cuh
index 3428130d912..aa71546f049 100644
--- a/cpp/src/reductions/compound.cuh
+++ b/cpp/src/reductions/compound.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,6 +22,8 @@
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 #include <thrust/iterator/transform_iterator.h>
 
 namespace cudf {
@@ -48,7 +50,7 @@ std::unique_ptr<scalar> compound_reduction(column_view const& col,
                                            data_type const output_dtype,
                                            size_type ddof,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
 {
   auto const valid_count = col.size() - col.null_count();
 
@@ -101,7 +103,7 @@ struct result_type_dispatcher {
                                      cudf::data_type const output_dtype,
                                      size_type ddof,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     return compound_reduction<ElementType, ResultType, Op>(col, output_dtype, ddof, stream, mr);
   }
@@ -111,7 +113,7 @@ struct result_type_dispatcher {
                                      cudf::data_type const output_dtype,
                                      size_type ddof,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     CUDF_FAIL("Unsupported output data type");
   }
@@ -134,7 +136,7 @@ struct element_type_dispatcher {
                                      cudf::data_type const output_dtype,
                                      size_type ddof,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     return cudf::type_dispatcher(
       output_dtype, result_type_dispatcher<ElementType, Op>(), col, output_dtype, ddof, stream, mr);
@@ -145,7 +147,7 @@ struct element_type_dispatcher {
                                      cudf::data_type const output_dtype,
                                      size_type ddof,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     CUDF_FAIL(
       "Reduction operators other than `min` and `max`"
diff --git a/cpp/src/reductions/histogram.cu b/cpp/src/reductions/histogram.cu
index 3e46a34cc6a..bebb9d14923 100644
--- a/cpp/src/reductions/histogram.cu
+++ b/cpp/src/reductions/histogram.cu
@@ -21,6 +21,8 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/structs/structs_column_view.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 #include <cuda/atomic>
 #include <cuda/functional>
 #include <thrust/copy.h>
@@ -114,7 +116,7 @@ auto gather_histogram(table_view const& input,
                       device_span<size_type const> distinct_indices,
                       std::unique_ptr<column>&& distinct_counts,
                       rmm::cuda_stream_view stream,
-                      rmm::mr::device_memory_resource* mr)
+                      rmm::device_async_resource_ref mr)
 {
   auto distinct_rows = cudf::detail::gather(input,
                                             distinct_indices,
@@ -152,7 +154,7 @@ std::pair<std::unique_ptr<rmm::device_uvector<size_type>>, std::unique_ptr<colum
 compute_row_frequencies(table_view const& input,
                         std::optional<column_view> const& partial_counts,
                         rmm::cuda_stream_view stream,
-                        rmm::mr::device_memory_resource* mr)
+                        rmm::device_async_resource_ref mr)
 {
   auto const has_nested_columns = cudf::detail::has_nested_columns(input);
 
@@ -236,7 +238,7 @@ compute_row_frequencies(table_view const& input,
 
 std::unique_ptr<cudf::scalar> histogram(column_view const& input,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr)
+                                        rmm::device_async_resource_ref mr)
 {
   // Empty group should be handled before reaching here.
   CUDF_EXPECTS(input.size() > 0, "Input should not be empty.", std::invalid_argument);
@@ -249,7 +251,7 @@ std::unique_ptr<cudf::scalar> histogram(column_view const& input,
 
 std::unique_ptr<cudf::scalar> merge_histogram(column_view const& input,
                                               rmm::cuda_stream_view stream,
-                                              rmm::mr::device_memory_resource* mr)
+                                              rmm::device_async_resource_ref mr)
 {
   // Empty group should be handled before reaching here.
   CUDF_EXPECTS(input.size() > 0, "Input should not be empty.", std::invalid_argument);
diff --git a/cpp/src/reductions/max.cu b/cpp/src/reductions/max.cu
index 1cf2b6f53b6..682889f0fee 100644
--- a/cpp/src/reductions/max.cu
+++ b/cpp/src/reductions/max.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
 #include <cudf/reduction/detail/reduction_functions.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace reduction {
@@ -29,7 +30,7 @@ std::unique_ptr<cudf::scalar> max(column_view const& col,
                                   cudf::data_type const output_dtype,
                                   std::optional<std::reference_wrapper<scalar const>> init,
                                   rmm::cuda_stream_view stream,
-                                  rmm::mr::device_memory_resource* mr)
+                                  rmm::device_async_resource_ref mr)
 {
   auto const input_type =
     cudf::is_dictionary(col.type()) ? cudf::dictionary_column_view(col).keys().type() : col.type();
diff --git a/cpp/src/reductions/mean.cu b/cpp/src/reductions/mean.cu
index e64660932ce..e8a10f02cc1 100644
--- a/cpp/src/reductions/mean.cu
+++ b/cpp/src/reductions/mean.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
 #include <cudf/reduction/detail/reduction_functions.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace reduction {
@@ -28,7 +29,7 @@ namespace detail {
 std::unique_ptr<cudf::scalar> mean(column_view const& col,
                                    cudf::data_type const output_dtype,
                                    rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr)
+                                   rmm::device_async_resource_ref mr)
 {
   auto col_type =
     cudf::is_dictionary(col.type()) ? dictionary_column_view(col).keys().type() : col.type();
diff --git a/cpp/src/reductions/min.cu b/cpp/src/reductions/min.cu
index 792965e8b99..7986bda5751 100644
--- a/cpp/src/reductions/min.cu
+++ b/cpp/src/reductions/min.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,6 +19,8 @@
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/reduction/detail/reduction_functions.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 namespace cudf {
 namespace reduction {
 namespace detail {
@@ -26,7 +28,7 @@ std::unique_ptr<cudf::scalar> min(column_view const& col,
                                   data_type const output_dtype,
                                   std::optional<std::reference_wrapper<scalar const>> init,
                                   rmm::cuda_stream_view stream,
-                                  rmm::mr::device_memory_resource* mr)
+                                  rmm::device_async_resource_ref mr)
 {
   auto const input_type =
     cudf::is_dictionary(col.type()) ? cudf::dictionary_column_view(col).keys().type() : col.type();
diff --git a/cpp/src/reductions/minmax.cu b/cpp/src/reductions/minmax.cu
index c4eb09110c6..62a1f4aab7c 100644
--- a/cpp/src/reductions/minmax.cu
+++ b/cpp/src/reductions/minmax.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -26,6 +26,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/extrema.h>
 #include <thrust/functional.h>
@@ -190,7 +191,7 @@ struct minmax_functor {
             std::enable_if_t<is_supported<T>() and !std::is_same_v<T, cudf::string_view> and
                              !cudf::is_dictionary<T>()>* = nullptr>
   std::pair<std::unique_ptr<scalar>, std::unique_ptr<scalar>> operator()(
-    cudf::column_view const& col, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr)
+    cudf::column_view const& col, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)
   {
     using storage_type = device_storage_type_t<T>;
     // compute minimum and maximum values
@@ -210,7 +211,7 @@ struct minmax_functor {
    */
   template <typename T, std::enable_if_t<std::is_same_v<T, cudf::string_view>>* = nullptr>
   std::pair<std::unique_ptr<scalar>, std::unique_ptr<scalar>> operator()(
-    cudf::column_view const& col, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr)
+    cudf::column_view const& col, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)
   {
     // compute minimum and maximum values
     auto dev_result = reduce<cudf::string_view>(col, stream);
@@ -229,7 +230,7 @@ struct minmax_functor {
    */
   template <typename T, std::enable_if_t<cudf::is_dictionary<T>()>* = nullptr>
   std::pair<std::unique_ptr<scalar>, std::unique_ptr<scalar>> operator()(
-    cudf::column_view const& col, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr)
+    cudf::column_view const& col, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)
   {
     // compute minimum and maximum values
     auto dev_result = reduce<T>(col, stream);
@@ -246,7 +247,7 @@ struct minmax_functor {
 
   template <typename T, std::enable_if_t<!is_supported<T>()>* = nullptr>
   std::pair<std::unique_ptr<scalar>, std::unique_ptr<scalar>> operator()(
-    cudf::column_view const&, rmm::cuda_stream_view, rmm::mr::device_memory_resource*)
+    cudf::column_view const&, rmm::cuda_stream_view, rmm::device_async_resource_ref)
   {
     CUDF_FAIL("type not supported for minmax() operation");
   }
@@ -260,7 +261,7 @@ struct minmax_functor {
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
 std::pair<std::unique_ptr<scalar>, std::unique_ptr<scalar>> minmax(
-  cudf::column_view const& col, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr)
+  cudf::column_view const& col, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)
 {
   if (col.null_count() == col.size()) {
     // this handles empty and all-null columns
@@ -274,7 +275,7 @@ std::pair<std::unique_ptr<scalar>, std::unique_ptr<scalar>> minmax(
 }  // namespace detail
 
 std::pair<std::unique_ptr<scalar>, std::unique_ptr<scalar>> minmax(
-  column_view const& col, rmm::mr::device_memory_resource* mr)
+  column_view const& col, rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::minmax(col, cudf::get_default_stream(), mr);
diff --git a/cpp/src/reductions/nth_element.cu b/cpp/src/reductions/nth_element.cu
index 88a1778bb7b..e266f477c5d 100644
--- a/cpp/src/reductions/nth_element.cu
+++ b/cpp/src/reductions/nth_element.cu
@@ -23,6 +23,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/binary_search.h>
@@ -35,7 +36,7 @@ std::unique_ptr<cudf::scalar> nth_element(column_view const& col,
                                           size_type n,
                                           null_policy null_handling,
                                           rmm::cuda_stream_view stream,
-                                          rmm::mr::device_memory_resource* mr)
+                                          rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(n >= -col.size() and n < col.size(), "Index out of bounds");
   auto wrap_n = [n](size_type size) { return (n < 0 ? size + n : n); };
diff --git a/cpp/src/reductions/product.cu b/cpp/src/reductions/product.cu
index 2e483813939..28ff8db3708 100644
--- a/cpp/src/reductions/product.cu
+++ b/cpp/src/reductions/product.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
 #include <cudf/reduction/detail/reduction_functions.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace reduction {
@@ -29,7 +30,7 @@ std::unique_ptr<cudf::scalar> product(column_view const& col,
                                       cudf::data_type const output_dtype,
                                       std::optional<std::reference_wrapper<scalar const>> init,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr)
+                                      rmm::device_async_resource_ref mr)
 {
   return cudf::type_dispatcher(
     cudf::is_dictionary(col.type()) ? dictionary_column_view(col).keys().type() : col.type(),
diff --git a/cpp/src/reductions/reductions.cpp b/cpp/src/reductions/reductions.cpp
index cd1669d1d6b..d764ea7559f 100644
--- a/cpp/src/reductions/reductions.cpp
+++ b/cpp/src/reductions/reductions.cpp
@@ -30,6 +30,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace reduction {
@@ -38,14 +39,14 @@ struct reduce_dispatch_functor {
   column_view const col;
   data_type output_dtype;
   std::optional<std::reference_wrapper<scalar const>> init;
-  rmm::mr::device_memory_resource* mr;
+  rmm::device_async_resource_ref mr;
   rmm::cuda_stream_view stream;
 
   reduce_dispatch_functor(column_view const& col,
                           data_type output_dtype,
                           std::optional<std::reference_wrapper<scalar const>> init,
                           rmm::cuda_stream_view stream,
-                          rmm::mr::device_memory_resource* mr)
+                          rmm::device_async_resource_ref mr)
     : col(col), output_dtype(output_dtype), init(init), mr(mr), stream(stream)
   {
   }
@@ -151,7 +152,7 @@ std::unique_ptr<scalar> reduce(column_view const& col,
                                data_type output_dtype,
                                std::optional<std::reference_wrapper<scalar const>> init,
                                rmm::cuda_stream_view stream,
-                               rmm::mr::device_memory_resource* mr)
+                               rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(!init.has_value() || col.type() == init.value().get().type(),
                "column and initial value must be the same type");
@@ -204,7 +205,7 @@ std::unique_ptr<scalar> reduce(column_view const& col,
 std::unique_ptr<scalar> reduce(column_view const& col,
                                reduce_aggregation const& agg,
                                data_type output_dtype,
-                               rmm::mr::device_memory_resource* mr)
+                               rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return reduction::detail::reduce(
@@ -215,7 +216,7 @@ std::unique_ptr<scalar> reduce(column_view const& col,
                                reduce_aggregation const& agg,
                                data_type output_dtype,
                                std::optional<std::reference_wrapper<scalar const>> init,
-                               rmm::mr::device_memory_resource* mr)
+                               rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return reduction::detail::reduce(col, agg, output_dtype, init, cudf::get_default_stream(), mr);
diff --git a/cpp/src/reductions/scan/rank_scan.cu b/cpp/src/reductions/scan/rank_scan.cu
index 538763099d3..0befb6ac7d7 100644
--- a/cpp/src/reductions/scan/rank_scan.cu
+++ b/cpp/src/reductions/scan/rank_scan.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,6 +23,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/scan.h>
 #include <thrust/tabulate.h>
@@ -66,7 +67,7 @@ std::unique_ptr<column> rank_generator(column_view const& order_by,
                                        value_resolver resolver,
                                        scan_operator scan_op,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr)
+                                       rmm::device_async_resource_ref mr)
 {
   auto const order_by_tview = table_view{{order_by}};
   auto comp = cudf::experimental::row::equality::self_comparator(order_by_tview, stream);
@@ -105,7 +106,7 @@ std::unique_ptr<column> rank_generator(column_view const& order_by,
 
 std::unique_ptr<column> inclusive_dense_rank_scan(column_view const& order_by,
                                                   rmm::cuda_stream_view stream,
-                                                  rmm::mr::device_memory_resource* mr)
+                                                  rmm::device_async_resource_ref mr)
 {
   return rank_generator(
     order_by,
@@ -117,7 +118,7 @@ std::unique_ptr<column> inclusive_dense_rank_scan(column_view const& order_by,
 
 std::unique_ptr<column> inclusive_rank_scan(column_view const& order_by,
                                             rmm::cuda_stream_view stream,
-                                            rmm::mr::device_memory_resource* mr)
+                                            rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(!cudf::structs::detail::is_or_has_nested_lists(order_by),
                "Unsupported list type in rank scan.");
@@ -130,7 +131,7 @@ std::unique_ptr<column> inclusive_rank_scan(column_view const& order_by,
 }
 
 std::unique_ptr<column> inclusive_one_normalized_percent_rank_scan(
-  column_view const& order_by, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr)
+  column_view const& order_by, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)
 {
   auto const rank_column =
     inclusive_rank_scan(order_by, stream, rmm::mr::get_current_device_resource());
diff --git a/cpp/src/reductions/scan/scan.cpp b/cpp/src/reductions/scan/scan.cpp
index 2871ee283ba..b6e8690a6c9 100644
--- a/cpp/src/reductions/scan/scan.cpp
+++ b/cpp/src/reductions/scan/scan.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,8 @@
 #include <cudf/reduction.hpp>
 #include <cudf/utilities/default_stream.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 namespace cudf {
 
 namespace detail {
@@ -29,7 +31,7 @@ std::unique_ptr<column> scan(column_view const& input,
                              scan_type inclusive,
                              null_policy null_handling,
                              rmm::cuda_stream_view stream,
-                             rmm::mr::device_memory_resource* mr)
+                             rmm::device_async_resource_ref mr)
 {
   if (agg.kind == aggregation::RANK) {
     CUDF_EXPECTS(inclusive == scan_type::INCLUSIVE,
@@ -58,7 +60,7 @@ std::unique_ptr<column> scan(column_view const& input,
                              scan_aggregation const& agg,
                              scan_type inclusive,
                              null_policy null_handling,
-                             rmm::mr::device_memory_resource* mr)
+                             rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::scan(input, agg, inclusive, null_handling, cudf::get_default_stream(), mr);
diff --git a/cpp/src/reductions/scan/scan.cuh b/cpp/src/reductions/scan/scan.cuh
index e575bde0ce0..aeb9e516cd4 100644
--- a/cpp/src/reductions/scan/scan.cuh
+++ b/cpp/src/reductions/scan/scan.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,6 +23,7 @@
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <utility>
 
@@ -33,14 +34,14 @@ namespace detail {
 std::pair<rmm::device_buffer, size_type> mask_scan(column_view const& input_view,
                                                    scan_type inclusive,
                                                    rmm::cuda_stream_view stream,
-                                                   rmm::mr::device_memory_resource* mr);
+                                                   rmm::device_async_resource_ref mr);
 
 template <template <typename> typename DispatchFn>
 std::unique_ptr<column> scan_agg_dispatch(column_view const& input,
                                           scan_aggregation const& agg,
                                           bitmask_type const* output_mask,
                                           rmm::cuda_stream_view stream,
-                                          rmm::mr::device_memory_resource* mr)
+                                          rmm::device_async_resource_ref mr)
 {
   switch (agg.kind) {
     case aggregation::SUM:
diff --git a/cpp/src/reductions/scan/scan_exclusive.cu b/cpp/src/reductions/scan/scan_exclusive.cu
index 47301ad91f6..7224bf47390 100644
--- a/cpp/src/reductions/scan/scan_exclusive.cu
+++ b/cpp/src/reductions/scan/scan_exclusive.cu
@@ -26,6 +26,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/scan.h>
@@ -56,7 +57,7 @@ struct scan_dispatcher {
   std::unique_ptr<column> operator()(column_view const& input,
                                      bitmask_type const*,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     auto output_column =
       detail::allocate_like(input, input.size(), mask_allocation_policy::NEVER, stream, mr);
@@ -89,7 +90,7 @@ std::unique_ptr<column> scan_exclusive(column_view const& input,
                                        scan_aggregation const& agg,
                                        null_policy null_handling,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr)
+                                       rmm::device_async_resource_ref mr)
 {
   auto [mask, null_count] = [&] {
     if (null_handling == null_policy::EXCLUDE) {
diff --git a/cpp/src/reductions/scan/scan_inclusive.cu b/cpp/src/reductions/scan/scan_inclusive.cu
index 7edf89a0c91..ad2eaa6a471 100644
--- a/cpp/src/reductions/scan/scan_inclusive.cu
+++ b/cpp/src/reductions/scan/scan_inclusive.cu
@@ -30,6 +30,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/find.h>
 #include <thrust/functional.h>
@@ -45,7 +46,7 @@ namespace detail {
 std::pair<rmm::device_buffer, size_type> mask_scan(column_view const& input_view,
                                                    scan_type inclusive,
                                                    rmm::cuda_stream_view stream,
-                                                   rmm::mr::device_memory_resource* mr)
+                                                   rmm::device_async_resource_ref mr)
 {
   rmm::device_buffer mask =
     detail::create_null_mask(input_view.size(), mask_state::UNINITIALIZED, stream, mr);
@@ -74,7 +75,7 @@ struct scan_functor {
   static std::unique_ptr<column> invoke(column_view const& input_view,
                                         bitmask_type const*,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr)
+                                        rmm::device_async_resource_ref mr)
   {
     auto output_column = detail::allocate_like(
       input_view, input_view.size(), mask_allocation_policy::NEVER, stream, mr);
@@ -99,7 +100,7 @@ struct scan_functor<Op, cudf::string_view> {
   static std::unique_ptr<column> invoke(column_view const& input_view,
                                         bitmask_type const* mask,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr)
+                                        rmm::device_async_resource_ref mr)
   {
     return cudf::strings::detail::scan_inclusive<Op>(input_view, mask, stream, mr);
   }
@@ -110,7 +111,7 @@ struct scan_functor<Op, cudf::struct_view> {
   static std::unique_ptr<column> invoke(column_view const& input,
                                         bitmask_type const*,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr)
+                                        rmm::device_async_resource_ref mr)
   {
     return cudf::structs::detail::scan_inclusive<Op>(input, stream, mr);
   }
@@ -150,7 +151,7 @@ struct scan_dispatcher {
   std::unique_ptr<column> operator()(column_view const& input,
                                      bitmask_type const* output_mask,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     return scan_functor<Op, T>::invoke(input, output_mask, stream, mr);
   }
@@ -168,7 +169,7 @@ std::unique_ptr<column> scan_inclusive(column_view const& input,
                                        scan_aggregation const& agg,
                                        null_policy null_handling,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr)
+                                       rmm::device_async_resource_ref mr)
 {
   auto [mask, null_count] = [&] {
     if (null_handling == null_policy::EXCLUDE) {
diff --git a/cpp/src/reductions/segmented/all.cu b/cpp/src/reductions/segmented/all.cu
index b81a088155c..489fc6a283c 100644
--- a/cpp/src/reductions/segmented/all.cu
+++ b/cpp/src/reductions/segmented/all.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,6 +18,8 @@
 
 #include <cudf/reduction/detail/reduction_functions.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 namespace cudf {
 namespace reduction {
 namespace detail {
@@ -29,7 +31,7 @@ std::unique_ptr<cudf::column> segmented_all(
   null_policy null_handling,
   std::optional<std::reference_wrapper<scalar const>> init,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(output_dtype == cudf::data_type(cudf::type_id::BOOL8),
                "segmented_all() operation requires output type `BOOL8`");
diff --git a/cpp/src/reductions/segmented/any.cu b/cpp/src/reductions/segmented/any.cu
index 9210fbd3c7c..a9a8528548a 100644
--- a/cpp/src/reductions/segmented/any.cu
+++ b/cpp/src/reductions/segmented/any.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,6 +18,8 @@
 
 #include <cudf/reduction/detail/reduction_functions.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 namespace cudf {
 namespace reduction {
 namespace detail {
@@ -29,7 +31,7 @@ std::unique_ptr<cudf::column> segmented_any(
   null_policy null_handling,
   std::optional<std::reference_wrapper<scalar const>> init,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(output_dtype == cudf::data_type(cudf::type_id::BOOL8),
                "segmented_any() operation requires output type `BOOL8`");
diff --git a/cpp/src/reductions/segmented/compound.cuh b/cpp/src/reductions/segmented/compound.cuh
index 395ad4c1dc9..035a8bdcd75 100644
--- a/cpp/src/reductions/segmented/compound.cuh
+++ b/cpp/src/reductions/segmented/compound.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -25,6 +25,8 @@
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 #include <thrust/adjacent_difference.h>
 #include <thrust/iterator/transform_iterator.h>
 
@@ -54,7 +56,7 @@ std::unique_ptr<column> compound_segmented_reduction(column_view const& col,
                                                      null_policy null_handling,
                                                      size_type ddof,
                                                      rmm::cuda_stream_view stream,
-                                                     rmm::mr::device_memory_resource* mr)
+                                                     rmm::device_async_resource_ref mr)
 {
   auto d_col              = cudf::column_device_view::create(col, stream);
   auto compound_op        = Op{};
@@ -109,7 +111,7 @@ struct compound_float_output_dispatcher {
                                      null_policy null_handling,
                                      size_type ddof,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     return compound_segmented_reduction<ElementType, ResultType, Op>(
       col, offsets, null_handling, ddof, stream, mr);
@@ -121,7 +123,7 @@ struct compound_float_output_dispatcher {
                                      null_policy,
                                      size_type,
                                      rmm::cuda_stream_view,
-                                     rmm::mr::device_memory_resource*)
+                                     rmm::device_async_resource_ref)
   {
     CUDF_FAIL("Unsupported output data type");
   }
@@ -144,7 +146,7 @@ struct compound_segmented_dispatcher {
                                      null_policy null_handling,
                                      size_type ddof,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     return cudf::type_dispatcher(output_dtype,
                                  compound_float_output_dispatcher<ElementType, Op>(),
@@ -163,7 +165,7 @@ struct compound_segmented_dispatcher {
                                      null_policy,
                                      size_type,
                                      rmm::cuda_stream_view,
-                                     rmm::mr::device_memory_resource*)
+                                     rmm::device_async_resource_ref)
   {
     CUDF_FAIL("Compound operators are not supported for non-arithmetic types");
   }
diff --git a/cpp/src/reductions/segmented/counts.cu b/cpp/src/reductions/segmented/counts.cu
index b9064ad3ffe..79737828678 100644
--- a/cpp/src/reductions/segmented/counts.cu
+++ b/cpp/src/reductions/segmented/counts.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,6 +18,8 @@
 
 #include <cudf/detail/null_mask.cuh>
 
+#include <rmm/resource_ref.hpp>
+
 #include <thrust/adjacent_difference.h>
 
 namespace cudf {
@@ -29,7 +31,7 @@ rmm::device_uvector<size_type> segmented_counts(bitmask_type const* null_mask,
                                                 device_span<size_type const> offsets,
                                                 null_policy null_handling,
                                                 rmm::cuda_stream_view stream,
-                                                rmm::mr::device_memory_resource* mr)
+                                                rmm::device_async_resource_ref mr)
 {
   auto const num_segments = offsets.size() - 1;
 
diff --git a/cpp/src/reductions/segmented/counts.hpp b/cpp/src/reductions/segmented/counts.hpp
index c5ee1fadae7..f249644e564 100644
--- a/cpp/src/reductions/segmented/counts.hpp
+++ b/cpp/src/reductions/segmented/counts.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 class column_device_view;
@@ -48,7 +49,7 @@ rmm::device_uvector<size_type> segmented_counts(bitmask_type const* null_mask,
                                                 device_span<size_type const> offsets,
                                                 null_policy null_handling,
                                                 rmm::cuda_stream_view stream,
-                                                rmm::mr::device_memory_resource* mr);
+                                                rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace reduction
diff --git a/cpp/src/reductions/segmented/max.cu b/cpp/src/reductions/segmented/max.cu
index c07c8fb2269..1c79edcc08c 100644
--- a/cpp/src/reductions/segmented/max.cu
+++ b/cpp/src/reductions/segmented/max.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,6 +18,8 @@
 
 #include <cudf/reduction/detail/reduction_functions.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 namespace cudf {
 namespace reduction {
 namespace detail {
@@ -29,7 +31,7 @@ std::unique_ptr<cudf::column> segmented_max(
   null_policy null_handling,
   std::optional<std::reference_wrapper<scalar const>> init,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(col.type() == output_dtype,
                "segmented_max() operation requires matching output type");
diff --git a/cpp/src/reductions/segmented/mean.cu b/cpp/src/reductions/segmented/mean.cu
index 99f1533a154..8df6bee97e9 100644
--- a/cpp/src/reductions/segmented/mean.cu
+++ b/cpp/src/reductions/segmented/mean.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
 #include <cudf/reduction/detail/segmented_reduction_functions.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace reduction {
@@ -29,7 +30,7 @@ std::unique_ptr<cudf::column> segmented_mean(column_view const& col,
                                              cudf::data_type const output_dtype,
                                              null_policy null_handling,
                                              rmm::cuda_stream_view stream,
-                                             rmm::mr::device_memory_resource* mr)
+                                             rmm::device_async_resource_ref mr)
 {
   using reducer            = compound::detail::compound_segmented_dispatcher<op::mean>;
   constexpr size_type ddof = 1;  // ddof for mean calculation
diff --git a/cpp/src/reductions/segmented/min.cu b/cpp/src/reductions/segmented/min.cu
index f1597f90267..ae1d5ae42a4 100644
--- a/cpp/src/reductions/segmented/min.cu
+++ b/cpp/src/reductions/segmented/min.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,6 +18,8 @@
 
 #include <cudf/reduction/detail/reduction_functions.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 namespace cudf {
 namespace reduction {
 namespace detail {
@@ -29,7 +31,7 @@ std::unique_ptr<cudf::column> segmented_min(
   null_policy null_handling,
   std::optional<std::reference_wrapper<scalar const>> init,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(col.type() == output_dtype,
                "segmented_min() operation requires matching output type");
diff --git a/cpp/src/reductions/segmented/nunique.cu b/cpp/src/reductions/segmented/nunique.cu
index bd1efb41df8..d4fcf89e161 100644
--- a/cpp/src/reductions/segmented/nunique.cu
+++ b/cpp/src/reductions/segmented/nunique.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -26,6 +26,7 @@
 #include <cudf/types.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/transform.h>
@@ -54,7 +55,7 @@ std::unique_ptr<cudf::column> segmented_nunique(column_view const& col,
                                                 device_span<size_type const> offsets,
                                                 null_policy null_handling,
                                                 rmm::cuda_stream_view stream,
-                                                rmm::mr::device_memory_resource* mr)
+                                                rmm::device_async_resource_ref mr)
 {
   // only support non-nested types
   CUDF_EXPECTS(!cudf::is_nested(col.type()),
diff --git a/cpp/src/reductions/segmented/product.cu b/cpp/src/reductions/segmented/product.cu
index ea9c6f484c0..1b82e7e5aec 100644
--- a/cpp/src/reductions/segmented/product.cu
+++ b/cpp/src/reductions/segmented/product.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,6 +18,8 @@
 
 #include <cudf/reduction/detail/reduction_functions.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 namespace cudf {
 namespace reduction {
 namespace detail {
@@ -28,7 +30,7 @@ std::unique_ptr<cudf::column> segmented_product(
   null_policy null_handling,
   std::optional<std::reference_wrapper<scalar const>> init,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   using reducer = simple::detail::column_type_dispatcher<op::product>;
   return cudf::type_dispatcher(
diff --git a/cpp/src/reductions/segmented/reductions.cpp b/cpp/src/reductions/segmented/reductions.cpp
index cee82560794..dee16b3e503 100644
--- a/cpp/src/reductions/segmented/reductions.cpp
+++ b/cpp/src/reductions/segmented/reductions.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,6 +24,7 @@
 #include <cudf/utilities/error.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace reduction {
@@ -35,7 +36,7 @@ struct segmented_reduce_dispatch_functor {
   null_policy null_handling;
   std::optional<std::reference_wrapper<scalar const>> init;
   rmm::cuda_stream_view stream;
-  rmm::mr::device_memory_resource* mr;
+  rmm::device_async_resource_ref mr;
 
   segmented_reduce_dispatch_functor(column_view const& segmented_values,
                                     device_span<size_type const> offsets,
@@ -43,7 +44,7 @@ struct segmented_reduce_dispatch_functor {
                                     null_policy null_handling,
                                     std::optional<std::reference_wrapper<scalar const>> init,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr)
+                                    rmm::device_async_resource_ref mr)
     : col(segmented_values),
       offsets(offsets),
       output_dtype(output_dtype),
@@ -59,7 +60,7 @@ struct segmented_reduce_dispatch_functor {
                                     data_type output_dtype,
                                     null_policy null_handling,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr)
+                                    rmm::device_async_resource_ref mr)
     : segmented_reduce_dispatch_functor(
         segmented_values, offsets, output_dtype, null_handling, std::nullopt, stream, mr)
   {
@@ -109,7 +110,7 @@ std::unique_ptr<column> segmented_reduce(column_view const& segmented_values,
                                          null_policy null_handling,
                                          std::optional<std::reference_wrapper<scalar const>> init,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr)
+                                         rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(!init.has_value() || segmented_values.type() == init.value().get().type(),
                "column and initial value must be the same type");
@@ -135,7 +136,7 @@ std::unique_ptr<column> segmented_reduce(column_view const& segmented_values,
                                          segmented_reduce_aggregation const& agg,
                                          data_type output_dtype,
                                          null_policy null_handling,
-                                         rmm::mr::device_memory_resource* mr)
+                                         rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return reduction::detail::segmented_reduce(segmented_values,
@@ -154,7 +155,7 @@ std::unique_ptr<column> segmented_reduce(column_view const& segmented_values,
                                          data_type output_dtype,
                                          null_policy null_handling,
                                          std::optional<std::reference_wrapper<scalar const>> init,
-                                         rmm::mr::device_memory_resource* mr)
+                                         rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return reduction::detail::segmented_reduce(segmented_values,
diff --git a/cpp/src/reductions/segmented/simple.cuh b/cpp/src/reductions/segmented/simple.cuh
index 4d4c6661428..da59df6b314 100644
--- a/cpp/src/reductions/segmented/simple.cuh
+++ b/cpp/src/reductions/segmented/simple.cuh
@@ -33,6 +33,7 @@
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/iterator/counting_iterator.h>
@@ -71,7 +72,7 @@ std::unique_ptr<column> simple_segmented_reduction(
   null_policy null_handling,
   std::optional<std::reference_wrapper<scalar const>> init,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   auto dcol               = cudf::column_device_view::create(col, stream);
   auto simple_op          = Op{};
@@ -157,7 +158,7 @@ std::unique_ptr<column> string_segmented_reduction(column_view const& col,
                                                    device_span<size_type const> offsets,
                                                    null_policy null_handling,
                                                    rmm::cuda_stream_view stream,
-                                                   rmm::mr::device_memory_resource* mr)
+                                                   rmm::device_async_resource_ref mr)
 {
   // Pass to simple_segmented_reduction, get indices to gather, perform gather here.
   auto device_col = cudf::column_device_view::create(col, stream);
@@ -201,7 +202,7 @@ std::unique_ptr<column> string_segmented_reduction(column_view const& col,
                                                    device_span<size_type const> offsets,
                                                    null_policy null_handling,
                                                    rmm::cuda_stream_view stream,
-                                                   rmm::mr::device_memory_resource* mr)
+                                                   rmm::device_async_resource_ref mr)
 {
   CUDF_FAIL("Segmented reduction on string column only supports min and max reduction.");
 }
@@ -226,7 +227,7 @@ std::unique_ptr<column> fixed_point_segmented_reduction(
   null_policy null_handling,
   std::optional<std::reference_wrapper<scalar const>> init,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   using RepType = device_storage_type_t<InputType>;
   auto result =
@@ -296,7 +297,7 @@ struct bool_result_column_dispatcher {
                                      null_policy null_handling,
                                      std::optional<std::reference_wrapper<scalar const>> init,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     return simple_segmented_reduction<ElementType, bool, Op>(
       col, offsets, null_handling, init, stream, mr);
@@ -308,7 +309,7 @@ struct bool_result_column_dispatcher {
                                      null_policy,
                                      std::optional<std::reference_wrapper<scalar const>>,
                                      rmm::cuda_stream_view,
-                                     rmm::mr::device_memory_resource*)
+                                     rmm::device_async_resource_ref)
   {
     CUDF_FAIL("Reduction operator not supported for this type");
   }
@@ -341,7 +342,7 @@ struct same_column_type_dispatcher {
                                      null_policy null_handling,
                                      std::optional<std::reference_wrapper<scalar const>> init,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     return simple_segmented_reduction<ElementType, ElementType, Op>(
       col, offsets, null_handling, init, stream, mr);
@@ -354,7 +355,7 @@ struct same_column_type_dispatcher {
                                      null_policy null_handling,
                                      std::optional<std::reference_wrapper<scalar const>> init,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     if (init.has_value()) { CUDF_FAIL("Initial value not supported for strings"); }
 
@@ -368,7 +369,7 @@ struct same_column_type_dispatcher {
                                      null_policy null_handling,
                                      std::optional<std::reference_wrapper<scalar const>> init,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     return fixed_point_segmented_reduction<ElementType, Op>(
       col, offsets, null_handling, init, stream, mr);
@@ -380,7 +381,7 @@ struct same_column_type_dispatcher {
                                      null_policy,
                                      std::optional<std::reference_wrapper<scalar const>>,
                                      rmm::cuda_stream_view,
-                                     rmm::mr::device_memory_resource*)
+                                     rmm::device_async_resource_ref)
   {
     CUDF_FAIL("Reduction operator not supported for this type");
   }
@@ -412,7 +413,7 @@ struct column_type_dispatcher {
                                          null_policy null_handling,
                                          std::optional<std::reference_wrapper<scalar const>> init,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr)
+                                         rmm::device_async_resource_ref mr)
   {
     // Floats are computed in double precision and then cast to the output type
     auto result = simple_segmented_reduction<ElementType, double, Op>(
@@ -439,7 +440,7 @@ struct column_type_dispatcher {
                                          null_policy null_handling,
                                          std::optional<std::reference_wrapper<scalar const>> init,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr)
+                                         rmm::device_async_resource_ref mr)
   {
     // Integers are computed in int64 precision and then cast to the output type.
     auto result = simple_segmented_reduction<ElementType, int64_t, Op>(
@@ -468,7 +469,7 @@ struct column_type_dispatcher {
                                      null_policy null_handling,
                                      std::optional<std::reference_wrapper<scalar const>> init,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     // If the output type matches the input type, then reduce using that type
     if (output_type.id() == cudf::type_to_id<ElementType>()) {
@@ -486,7 +487,7 @@ struct column_type_dispatcher {
                                      null_policy null_handling,
                                      std::optional<std::reference_wrapper<scalar const>> init,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     CUDF_EXPECTS(output_type == col.type(), "Output type must be same as input column type.");
     return fixed_point_segmented_reduction<ElementType, Op>(
@@ -502,7 +503,7 @@ struct column_type_dispatcher {
                                      null_policy,
                                      std::optional<std::reference_wrapper<scalar const>>,
                                      rmm::cuda_stream_view,
-                                     rmm::mr::device_memory_resource*)
+                                     rmm::device_async_resource_ref)
   {
     CUDF_FAIL("Reduction operator not supported for this type");
   }
diff --git a/cpp/src/reductions/segmented/std.cu b/cpp/src/reductions/segmented/std.cu
index 5f5ced63b8f..0a7eb007f68 100644
--- a/cpp/src/reductions/segmented/std.cu
+++ b/cpp/src/reductions/segmented/std.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
 #include <cudf/reduction/detail/segmented_reduction_functions.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace reduction {
@@ -30,7 +31,7 @@ std::unique_ptr<cudf::column> segmented_standard_deviation(column_view const& co
                                                            null_policy null_handling,
                                                            size_type ddof,
                                                            rmm::cuda_stream_view stream,
-                                                           rmm::mr::device_memory_resource* mr)
+                                                           rmm::device_async_resource_ref mr)
 {
   using reducer = compound::detail::compound_segmented_dispatcher<op::standard_deviation>;
   return cudf::type_dispatcher(
diff --git a/cpp/src/reductions/segmented/sum.cu b/cpp/src/reductions/segmented/sum.cu
index 7e84961dee0..bb06f6d7c8e 100644
--- a/cpp/src/reductions/segmented/sum.cu
+++ b/cpp/src/reductions/segmented/sum.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,6 +18,8 @@
 
 #include <cudf/reduction/detail/reduction_functions.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 namespace cudf {
 namespace reduction {
 namespace detail {
@@ -29,7 +31,7 @@ std::unique_ptr<cudf::column> segmented_sum(
   null_policy null_handling,
   std::optional<std::reference_wrapper<scalar const>> init,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   using reducer = simple::detail::column_type_dispatcher<op::sum>;
   return cudf::type_dispatcher(
diff --git a/cpp/src/reductions/segmented/sum_of_squares.cu b/cpp/src/reductions/segmented/sum_of_squares.cu
index 6c3f286fd8d..25d52f9bc79 100644
--- a/cpp/src/reductions/segmented/sum_of_squares.cu
+++ b/cpp/src/reductions/segmented/sum_of_squares.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
 #include <cudf/reduction/detail/segmented_reduction_functions.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace reduction {
@@ -29,7 +30,7 @@ std::unique_ptr<cudf::column> segmented_sum_of_squares(column_view const& col,
                                                        cudf::data_type const output_dtype,
                                                        null_policy null_handling,
                                                        rmm::cuda_stream_view stream,
-                                                       rmm::mr::device_memory_resource* mr)
+                                                       rmm::device_async_resource_ref mr)
 {
   using reducer = simple::detail::column_type_dispatcher<op::sum_of_squares>;
   return cudf::type_dispatcher(
diff --git a/cpp/src/reductions/segmented/update_validity.cu b/cpp/src/reductions/segmented/update_validity.cu
index 7bf75d53ada..92cfe5417ef 100644
--- a/cpp/src/reductions/segmented/update_validity.cu
+++ b/cpp/src/reductions/segmented/update_validity.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,8 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/utilities/span.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 namespace cudf {
 namespace reduction {
 namespace detail {
@@ -30,7 +32,7 @@ void segmented_update_validity(column& result,
                                null_policy null_handling,
                                std::optional<std::reference_wrapper<scalar const>> init,
                                rmm::cuda_stream_view stream,
-                               rmm::mr::device_memory_resource* mr)
+                               rmm::device_async_resource_ref mr)
 {
   auto [output_null_mask, output_null_count] = cudf::detail::segmented_null_mask_reduction(
     col.null_mask(),
diff --git a/cpp/src/reductions/segmented/update_validity.hpp b/cpp/src/reductions/segmented/update_validity.hpp
index 0003b98308a..c143e1a4761 100644
--- a/cpp/src/reductions/segmented/update_validity.hpp
+++ b/cpp/src/reductions/segmented/update_validity.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <optional>
 
@@ -51,7 +52,7 @@ void segmented_update_validity(column& result,
                                null_policy null_handling,
                                std::optional<std::reference_wrapper<scalar const>> init,
                                rmm::cuda_stream_view stream,
-                               rmm::mr::device_memory_resource* mr);
+                               rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace reduction
diff --git a/cpp/src/reductions/segmented/var.cu b/cpp/src/reductions/segmented/var.cu
index 4ac815b542f..35f2771dfcf 100644
--- a/cpp/src/reductions/segmented/var.cu
+++ b/cpp/src/reductions/segmented/var.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
 #include <cudf/reduction/detail/segmented_reduction_functions.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace reduction {
@@ -30,7 +31,7 @@ std::unique_ptr<cudf::column> segmented_variance(column_view const& col,
                                                  null_policy null_handling,
                                                  size_type ddof,
                                                  rmm::cuda_stream_view stream,
-                                                 rmm::mr::device_memory_resource* mr)
+                                                 rmm::device_async_resource_ref mr)
 {
   using reducer = compound::detail::compound_segmented_dispatcher<op::variance>;
   return cudf::type_dispatcher(
diff --git a/cpp/src/reductions/simple.cuh b/cpp/src/reductions/simple.cuh
index 006c6dc3034..372ceccf60b 100644
--- a/cpp/src/reductions/simple.cuh
+++ b/cpp/src/reductions/simple.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -32,6 +32,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
@@ -59,7 +60,7 @@ template <typename ElementType, typename ResultType, typename Op>
 std::unique_ptr<scalar> simple_reduction(column_view const& col,
                                          std::optional<std::reference_wrapper<scalar const>> init,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr)
+                                         rmm::device_async_resource_ref mr)
 {
   // reduction by iterator
   auto dcol      = cudf::column_device_view::create(col, stream);
@@ -112,7 +113,7 @@ std::unique_ptr<scalar> fixed_point_reduction(
   column_view const& col,
   std::optional<std::reference_wrapper<scalar const>> init,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   using Type = device_storage_type_t<DecimalXX>;
 
@@ -155,7 +156,7 @@ std::unique_ptr<scalar> dictionary_reduction(
   column_view const& col,
   std::optional<std::reference_wrapper<scalar const>> init,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   if (init.has_value()) { CUDF_FAIL("Initial value not supported for dictionary reductions"); }
 
@@ -218,7 +219,7 @@ struct cast_numeric_scalar_fn {
   template <typename ResultType, std::enable_if_t<is_supported<ResultType>()>* = nullptr>
   std::unique_ptr<scalar> operator()(numeric_scalar<InputType>* input,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     auto d_input  = cudf::get_scalar_device_view(*input);
     auto result   = std::make_unique<numeric_scalar<ResultType>>(ResultType{}, true, stream, mr);
@@ -231,7 +232,7 @@ struct cast_numeric_scalar_fn {
   template <typename ResultType, std::enable_if_t<not is_supported<ResultType>()>* = nullptr>
   std::unique_ptr<scalar> operator()(numeric_scalar<InputType>*,
                                      rmm::cuda_stream_view,
-                                     rmm::mr::device_memory_resource*)
+                                     rmm::device_async_resource_ref)
   {
     CUDF_FAIL("input data type is not convertible to output data type");
   }
@@ -250,7 +251,7 @@ struct bool_result_element_dispatcher {
   std::unique_ptr<scalar> operator()(column_view const& col,
                                      std::optional<std::reference_wrapper<scalar const>> init,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     return simple_reduction<ElementType, bool, Op>(col, init, stream, mr);
   }
@@ -260,7 +261,7 @@ struct bool_result_element_dispatcher {
   std::unique_ptr<scalar> operator()(column_view const&,
                                      std::optional<std::reference_wrapper<scalar const>>,
                                      rmm::cuda_stream_view,
-                                     rmm::mr::device_memory_resource*)
+                                     rmm::device_async_resource_ref)
   {
     CUDF_FAIL("Reduction operator not supported for this type");
   }
@@ -286,7 +287,7 @@ struct same_element_type_dispatcher {
   std::unique_ptr<scalar> resolve_key(column_view const& keys,
                                       scalar const& keys_index,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr)
+                                      rmm::device_async_resource_ref mr)
   {
     auto& index = static_cast<numeric_scalar<IndexType> const&>(keys_index);
     return cudf::detail::get_element(keys, index.value(stream), stream, mr);
@@ -296,7 +297,7 @@ struct same_element_type_dispatcher {
   std::unique_ptr<scalar> resolve_key(column_view const&,
                                       scalar const&,
                                       rmm::cuda_stream_view,
-                                      rmm::mr::device_memory_resource*)
+                                      rmm::device_async_resource_ref)
   {
     CUDF_FAIL("index type expected for dictionary column");
   }
@@ -309,7 +310,7 @@ struct same_element_type_dispatcher {
   std::unique_ptr<scalar> operator()(column_view const& input,
                                      std::optional<std::reference_wrapper<scalar const>> init,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     if (init.has_value()) { CUDF_FAIL("Initial value not supported for nested type reductions"); }
 
@@ -334,7 +335,7 @@ struct same_element_type_dispatcher {
   std::unique_ptr<scalar> operator()(column_view const& col,
                                      std::optional<std::reference_wrapper<scalar const>> init,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     if (!cudf::is_dictionary(col.type())) {
       return simple_reduction<ElementType, ElementType, Op>(col, init, stream, mr);
@@ -351,7 +352,7 @@ struct same_element_type_dispatcher {
   std::unique_ptr<scalar> operator()(column_view const& col,
                                      std::optional<std::reference_wrapper<scalar const>> init,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     return fixed_point_reduction<ElementType, Op>(col, init, stream, mr);
   }
@@ -360,7 +361,7 @@ struct same_element_type_dispatcher {
   std::unique_ptr<scalar> operator()(column_view const&,
                                      std::optional<std::reference_wrapper<scalar const>>,
                                      rmm::cuda_stream_view,
-                                     rmm::mr::device_memory_resource*)
+                                     rmm::device_async_resource_ref)
   {
     CUDF_FAIL("Reduction operator not supported for this type");
   }
@@ -386,7 +387,7 @@ struct element_type_dispatcher {
                                          data_type const output_type,
                                          std::optional<std::reference_wrapper<scalar const>> init,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr)
+                                         rmm::device_async_resource_ref mr)
   {
     auto result = !cudf::is_dictionary(col.type())
                     ? simple_reduction<ElementType, double, Op>(col, init, stream, mr)
@@ -409,7 +410,7 @@ struct element_type_dispatcher {
                                          data_type const output_type,
                                          std::optional<std::reference_wrapper<scalar const>> init,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr)
+                                         rmm::device_async_resource_ref mr)
   {
     auto result = !cudf::is_dictionary(col.type())
                     ? simple_reduction<ElementType, int64_t, Op>(col, init, stream, mr)
@@ -439,7 +440,7 @@ struct element_type_dispatcher {
                                      data_type const output_type,
                                      std::optional<std::reference_wrapper<scalar const>> init,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     if (output_type.id() == cudf::type_to_id<ElementType>())
       return !cudf::is_dictionary(col.type())
@@ -457,7 +458,7 @@ struct element_type_dispatcher {
                                      data_type const output_type,
                                      std::optional<std::reference_wrapper<scalar const>> init,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     CUDF_EXPECTS(output_type == col.type(), "Output type must be same as input column type.");
 
@@ -471,7 +472,7 @@ struct element_type_dispatcher {
                                      data_type const,
                                      std::optional<std::reference_wrapper<scalar const>> init,
                                      rmm::cuda_stream_view,
-                                     rmm::mr::device_memory_resource*)
+                                     rmm::device_async_resource_ref)
   {
     CUDF_FAIL("Reduction operator not supported for this type");
   }
diff --git a/cpp/src/reductions/std.cu b/cpp/src/reductions/std.cu
index 9df83634667..9c78b35313b 100644
--- a/cpp/src/reductions/std.cu
+++ b/cpp/src/reductions/std.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
 #include <cudf/reduction/detail/reduction_functions.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace reduction {
@@ -29,7 +30,7 @@ std::unique_ptr<cudf::scalar> standard_deviation(column_view const& col,
                                                  cudf::data_type const output_dtype,
                                                  size_type ddof,
                                                  rmm::cuda_stream_view stream,
-                                                 rmm::mr::device_memory_resource* mr)
+                                                 rmm::device_async_resource_ref mr)
 {
   // TODO: add cuda version check when the fix is available
 #if !defined(__CUDACC_DEBUG__)
diff --git a/cpp/src/reductions/sum.cu b/cpp/src/reductions/sum.cu
index 85c6b32dbaf..51b251a836e 100644
--- a/cpp/src/reductions/sum.cu
+++ b/cpp/src/reductions/sum.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
 #include <cudf/reduction/detail/reduction_functions.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace reduction {
@@ -29,7 +30,7 @@ std::unique_ptr<cudf::scalar> sum(column_view const& col,
                                   cudf::data_type const output_dtype,
                                   std::optional<std::reference_wrapper<scalar const>> init,
                                   rmm::cuda_stream_view stream,
-                                  rmm::mr::device_memory_resource* mr)
+                                  rmm::device_async_resource_ref mr)
 {
   return cudf::type_dispatcher(
     cudf::is_dictionary(col.type()) ? dictionary_column_view(col).keys().type() : col.type(),
diff --git a/cpp/src/reductions/sum_of_squares.cu b/cpp/src/reductions/sum_of_squares.cu
index 7b85c4e6dc9..dc0eae56e98 100644
--- a/cpp/src/reductions/sum_of_squares.cu
+++ b/cpp/src/reductions/sum_of_squares.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
 #include <cudf/reduction/detail/reduction_functions.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace reduction {
@@ -28,7 +29,7 @@ namespace detail {
 std::unique_ptr<cudf::scalar> sum_of_squares(column_view const& col,
                                              cudf::data_type const output_dtype,
                                              rmm::cuda_stream_view stream,
-                                             rmm::mr::device_memory_resource* mr)
+                                             rmm::device_async_resource_ref mr)
 {
   return cudf::type_dispatcher(
     cudf::is_dictionary(col.type()) ? dictionary_column_view(col).keys().type() : col.type(),
diff --git a/cpp/src/reductions/var.cu b/cpp/src/reductions/var.cu
index d559531dc59..aaab9dd4604 100644
--- a/cpp/src/reductions/var.cu
+++ b/cpp/src/reductions/var.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
 #include <cudf/reduction/detail/reduction_functions.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace reduction {
@@ -29,7 +30,7 @@ std::unique_ptr<cudf::scalar> variance(column_view const& col,
                                        cudf::data_type const output_dtype,
                                        size_type ddof,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr)
+                                       rmm::device_async_resource_ref mr)
 {
   // TODO: add cuda version check when the fix is available
 #if !defined(__CUDACC_DEBUG__)
diff --git a/cpp/src/replace/clamp.cu b/cpp/src/replace/clamp.cu
index 3cd1fdd20a2..fe5a9cfbd71 100644
--- a/cpp/src/replace/clamp.cu
+++ b/cpp/src/replace/clamp.cu
@@ -36,6 +36,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/for_each.h>
@@ -93,7 +94,7 @@ std::unique_ptr<cudf::column> clamp_string_column(strings_column_view const& inp
                                                   OptionalScalarIterator hi_itr,
                                                   ReplaceScalarIterator hi_replace_itr,
                                                   rmm::cuda_stream_view stream,
-                                                  rmm::mr::device_memory_resource* mr)
+                                                  rmm::device_async_resource_ref mr)
 {
   auto input_device_column = column_device_view::create(input.parent(), stream);
   auto d_input             = *input_device_column;
@@ -118,7 +119,7 @@ std::enable_if_t<cudf::is_fixed_width<T>(), std::unique_ptr<cudf::column>> clamp
   OptionalScalarIterator hi_itr,
   ReplaceScalarIterator hi_replace_itr,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   auto output =
     detail::allocate_like(input, input.size(), mask_allocation_policy::NEVER, stream, mr);
@@ -168,7 +169,7 @@ std::enable_if_t<std::is_same_v<T, string_view>, std::unique_ptr<cudf::column>>
   OptionalScalarIterator hi_itr,
   ReplaceScalarIterator hi_replace_itr,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   return clamp_string_column(input, lo_itr, lo_replace_itr, hi_itr, hi_replace_itr, stream, mr);
 }
@@ -182,7 +183,7 @@ std::unique_ptr<column> clamp(column_view const& input,
                               OptionalScalarIterator hi_itr,
                               ReplaceScalarIterator hi_replace_itr,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr)
+                              rmm::device_async_resource_ref mr)
 {
   return clamper<T>(input, lo_itr, lo_replace_itr, hi_itr, hi_replace_itr, stream, mr);
 }
@@ -195,7 +196,7 @@ struct dispatch_clamp {
                                      scalar const& hi,
                                      scalar const& hi_replace,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     CUDF_EXPECTS(lo.type() == input.type(), "mismatching types of scalar and input");
 
@@ -216,7 +217,7 @@ std::unique_ptr<column> dispatch_clamp::operator()<cudf::list_view>(
   scalar const& hi,
   scalar const& hi_replace,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   CUDF_FAIL("clamp for list_view not supported");
 }
@@ -228,7 +229,7 @@ std::unique_ptr<column> dispatch_clamp::operator()<struct_view>(column_view cons
                                                                 scalar const& hi,
                                                                 scalar const& hi_replace,
                                                                 rmm::cuda_stream_view stream,
-                                                                rmm::mr::device_memory_resource* mr)
+                                                                rmm::device_async_resource_ref mr)
 {
   CUDF_FAIL("clamp for struct_view not supported");
 }
@@ -241,7 +242,7 @@ std::unique_ptr<column> dispatch_clamp::operator()<cudf::dictionary32>(
   scalar const& hi,
   scalar const& hi_replace,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   // add lo_replace and hi_replace to keys
   auto matched_column = [&] {
@@ -309,7 +310,7 @@ std::unique_ptr<column> dispatch_clamp::operator()<cudf::dictionary32>(
                                       scalar const& lo_replace,
                                       scalar const& hi,
                                       scalar const& hi_replace,
-                                      rmm::mr::device_memory_resource* mr);
+                                      rmm::device_async_resource_ref mr);
  *
  * @param[in] stream CUDA stream used for device memory operations and kernel launches.
  */
@@ -319,7 +320,7 @@ std::unique_ptr<column> clamp(column_view const& input,
                               scalar const& hi,
                               scalar const& hi_replace,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr)
+                              rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(lo.type() == hi.type(), "mismatching types of limit scalars");
   CUDF_EXPECTS(lo_replace.type() == hi_replace.type(), "mismatching types of replace scalars");
@@ -350,7 +351,7 @@ std::unique_ptr<column> clamp(column_view const& input,
                               scalar const& hi,
                               scalar const& hi_replace,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr)
+                              rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::clamp(input, lo, lo_replace, hi, hi_replace, stream, mr);
@@ -361,7 +362,7 @@ std::unique_ptr<column> clamp(column_view const& input,
                               scalar const& lo,
                               scalar const& hi,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr)
+                              rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::clamp(input, lo, lo, hi, hi, stream, mr);
diff --git a/cpp/src/replace/nans.cu b/cpp/src/replace/nans.cu
index 2fcb934ba65..eba6f6b436e 100644
--- a/cpp/src/replace/nans.cu
+++ b/cpp/src/replace/nans.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -28,6 +28,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/transform.h>
@@ -44,7 +45,7 @@ struct replace_nans_functor {
     Replacement const& replacement,
     bool replacement_nullable,
     rmm::cuda_stream_view stream,
-    rmm::mr::device_memory_resource* mr)
+    rmm::device_async_resource_ref mr)
   {
     CUDF_EXPECTS(input.type() == replacement.type(),
                  "Input and replacement must be of the same type");
@@ -84,7 +85,7 @@ struct replace_nans_functor {
 std::unique_ptr<column> replace_nans(column_view const& input,
                                      column_view const& replacement,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(input.size() == replacement.size(),
                "Input and replacement must be of the same size");
@@ -101,7 +102,7 @@ std::unique_ptr<column> replace_nans(column_view const& input,
 std::unique_ptr<column> replace_nans(column_view const& input,
                                      scalar const& replacement,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
 {
   return type_dispatcher(
     input.type(), replace_nans_functor{}, input, replacement, true, stream, mr);
@@ -112,7 +113,7 @@ std::unique_ptr<column> replace_nans(column_view const& input,
 std::unique_ptr<column> replace_nans(column_view const& input,
                                      column_view const& replacement,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::replace_nans(input, replacement, stream, mr);
@@ -121,7 +122,7 @@ std::unique_ptr<column> replace_nans(column_view const& input,
 std::unique_ptr<column> replace_nans(column_view const& input,
                                      scalar const& replacement,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::replace_nans(input, replacement, stream, mr);
@@ -197,7 +198,7 @@ void normalize_nans_and_zeros(mutable_column_view in_out, rmm::cuda_stream_view
 
 std::unique_ptr<column> normalize_nans_and_zeros(column_view const& input,
                                                  rmm::cuda_stream_view stream,
-                                                 rmm::mr::device_memory_resource* mr)
+                                                 rmm::device_async_resource_ref mr)
 {
   // output. copies the input
   auto out = std::make_unique<column>(input, stream, mr);
@@ -224,7 +225,7 @@ std::unique_ptr<column> normalize_nans_and_zeros(column_view const& input,
  */
 std::unique_ptr<column> normalize_nans_and_zeros(column_view const& input,
                                                  rmm::cuda_stream_view stream,
-                                                 rmm::mr::device_memory_resource* mr)
+                                                 rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::normalize_nans_and_zeros(input, stream, mr);
diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu
index 299cdc6a160..fe3d20e372e 100644
--- a/cpp/src/replace/nulls.cu
+++ b/cpp/src/replace/nulls.cu
@@ -42,6 +42,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/functional.h>
 #include <thrust/iterator/counting_iterator.h>
@@ -113,7 +114,7 @@ struct replace_nulls_column_kernel_forwarder {
   std::unique_ptr<cudf::column> operator()(cudf::column_view const& input,
                                            cudf::column_view const& replacement,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
   {
     cudf::size_type nrows = input.size();
     cudf::detail::grid_1d grid{nrows, BLOCK_SIZE};
@@ -152,7 +153,7 @@ struct replace_nulls_column_kernel_forwarder {
   std::unique_ptr<cudf::column> operator()(cudf::column_view const&,
                                            cudf::column_view const&,
                                            rmm::cuda_stream_view,
-                                           rmm::mr::device_memory_resource*)
+                                           rmm::device_async_resource_ref)
   {
     CUDF_FAIL("No specialization exists for the given type.");
   }
@@ -163,7 +164,7 @@ std::unique_ptr<cudf::column> replace_nulls_column_kernel_forwarder::operator()<
   cudf::column_view const& input,
   cudf::column_view const& replacement,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   auto d_input       = cudf::column_device_view::create(input, stream);
   auto d_replacement = cudf::column_device_view::create(replacement, stream);
@@ -190,7 +191,7 @@ std::unique_ptr<cudf::column> replace_nulls_column_kernel_forwarder::operator()<
   cudf::column_view const& input,
   cudf::column_view const& replacement,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   cudf::dictionary_column_view dict_input(input);
   cudf::dictionary_column_view dict_repl(replacement);
@@ -213,7 +214,7 @@ struct replace_nulls_scalar_kernel_forwarder {
   std::unique_ptr<cudf::column> operator()(cudf::column_view const& input,
                                            cudf::scalar const& replacement,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
   {
     CUDF_EXPECTS(input.type() == replacement.type(), "Data type mismatch");
     std::unique_ptr<cudf::column> output = cudf::detail::allocate_like(
@@ -238,7 +239,7 @@ struct replace_nulls_scalar_kernel_forwarder {
   std::unique_ptr<cudf::column> operator()(cudf::column_view const&,
                                            cudf::scalar const&,
                                            rmm::cuda_stream_view,
-                                           rmm::mr::device_memory_resource*)
+                                           rmm::device_async_resource_ref)
   {
     CUDF_FAIL("No specialization exists for the given type.");
   }
@@ -249,7 +250,7 @@ std::unique_ptr<cudf::column> replace_nulls_scalar_kernel_forwarder::operator()<
   cudf::column_view const& input,
   cudf::scalar const& replacement,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(input.type() == replacement.type(), "Data type mismatch");
   cudf::strings_column_view input_s(input);
@@ -262,7 +263,7 @@ std::unique_ptr<cudf::column> replace_nulls_scalar_kernel_forwarder::operator()<
   cudf::column_view const& input,
   cudf::scalar const& replacement,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   cudf::dictionary_column_view dict_input(input);
   return cudf::dictionary::detail::replace_nulls(dict_input, replacement, stream, mr);
@@ -275,7 +276,7 @@ std::unique_ptr<cudf::column> replace_nulls_scalar_kernel_forwarder::operator()<
 std::unique_ptr<cudf::column> replace_nulls_policy_impl(cudf::column_view const& input,
                                                         cudf::replace_policy const& replace_policy,
                                                         rmm::cuda_stream_view stream,
-                                                        rmm::mr::device_memory_resource* mr)
+                                                        rmm::device_async_resource_ref mr)
 {
   auto device_in = cudf::column_device_view::create(input, stream);
   auto index     = thrust::make_counting_iterator<cudf::size_type>(0);
@@ -315,7 +316,7 @@ namespace detail {
 std::unique_ptr<cudf::column> replace_nulls(cudf::column_view const& input,
                                             cudf::column_view const& replacement,
                                             rmm::cuda_stream_view stream,
-                                            rmm::mr::device_memory_resource* mr)
+                                            rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(input.type() == replacement.type(), "Data type mismatch");
   CUDF_EXPECTS(replacement.size() == input.size(), "Column size mismatch");
@@ -330,7 +331,7 @@ std::unique_ptr<cudf::column> replace_nulls(cudf::column_view const& input,
 std::unique_ptr<cudf::column> replace_nulls(cudf::column_view const& input,
                                             cudf::scalar const& replacement,
                                             rmm::cuda_stream_view stream,
-                                            rmm::mr::device_memory_resource* mr)
+                                            rmm::device_async_resource_ref mr)
 {
   if (input.is_empty()) { return cudf::empty_like(input); }
   if (!input.has_nulls() || !replacement.is_valid(stream)) {
@@ -344,7 +345,7 @@ std::unique_ptr<cudf::column> replace_nulls(cudf::column_view const& input,
 std::unique_ptr<cudf::column> replace_nulls(cudf::column_view const& input,
                                             cudf::replace_policy const& replace_policy,
                                             rmm::cuda_stream_view stream,
-                                            rmm::mr::device_memory_resource* mr)
+                                            rmm::device_async_resource_ref mr)
 {
   if (input.is_empty()) { return cudf::empty_like(input); }
   if (!input.has_nulls()) { return std::make_unique<cudf::column>(input, stream, mr); }
@@ -357,7 +358,7 @@ std::unique_ptr<cudf::column> replace_nulls(cudf::column_view const& input,
 std::unique_ptr<cudf::column> replace_nulls(cudf::column_view const& input,
                                             cudf::column_view const& replacement,
                                             rmm::cuda_stream_view stream,
-                                            rmm::mr::device_memory_resource* mr)
+                                            rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::replace_nulls(input, replacement, stream, mr);
@@ -366,7 +367,7 @@ std::unique_ptr<cudf::column> replace_nulls(cudf::column_view const& input,
 std::unique_ptr<cudf::column> replace_nulls(cudf::column_view const& input,
                                             cudf::scalar const& replacement,
                                             rmm::cuda_stream_view stream,
-                                            rmm::mr::device_memory_resource* mr)
+                                            rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::replace_nulls(input, replacement, stream, mr);
@@ -375,7 +376,7 @@ std::unique_ptr<cudf::column> replace_nulls(cudf::column_view const& input,
 std::unique_ptr<cudf::column> replace_nulls(column_view const& input,
                                             replace_policy const& replace_policy,
                                             rmm::cuda_stream_view stream,
-                                            rmm::mr::device_memory_resource* mr)
+                                            rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::replace_nulls(input, replace_policy, stream, mr);
diff --git a/cpp/src/replace/replace.cu b/cpp/src/replace/replace.cu
index 91a0ced791a..7bc0bd7e0be 100644
--- a/cpp/src/replace/replace.cu
+++ b/cpp/src/replace/replace.cu
@@ -52,6 +52,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_scalar.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/distance.h>
 #include <thrust/execution_policy.h>
@@ -178,7 +179,7 @@ struct replace_kernel_forwarder {
                                            cudf::column_view const& values_to_replace,
                                            cudf::column_view const& replacement_values,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
   {
     rmm::device_scalar<cudf::size_type> valid_counter(0, stream);
     cudf::size_type* valid_count = valid_counter.data();
@@ -226,7 +227,7 @@ struct replace_kernel_forwarder {
                                            cudf::column_view const&,
                                            cudf::column_view const&,
                                            rmm::cuda_stream_view,
-                                           rmm::mr::device_memory_resource*)
+                                           rmm::device_async_resource_ref)
   {
     CUDF_FAIL("No specialization exists for this type");
   }
@@ -238,7 +239,7 @@ std::unique_ptr<cudf::column> replace_kernel_forwarder::operator()<cudf::string_
   cudf::column_view const& values_to_replace,
   cudf::column_view const& replacement_values,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   return cudf::strings::detail::find_and_replace_all(
     input_col, values_to_replace, replacement_values, stream, mr);
@@ -250,7 +251,7 @@ std::unique_ptr<cudf::column> replace_kernel_forwarder::operator()<cudf::diction
   cudf::column_view const& values_to_replace,
   cudf::column_view const& replacement_values,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   auto input        = cudf::dictionary_column_view(input_col);
   auto values       = cudf::dictionary_column_view(values_to_replace);
@@ -297,7 +298,7 @@ std::unique_ptr<cudf::column> find_and_replace_all(cudf::column_view const& inpu
                                                    cudf::column_view const& values_to_replace,
                                                    cudf::column_view const& replacement_values,
                                                    rmm::cuda_stream_view stream,
-                                                   rmm::mr::device_memory_resource* mr)
+                                                   rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(values_to_replace.size() == replacement_values.size(),
                "values_to_replace and replacement_values size mismatch.");
@@ -337,7 +338,7 @@ std::unique_ptr<cudf::column> find_and_replace_all(cudf::column_view const& inpu
                                                    cudf::column_view const& values_to_replace,
                                                    cudf::column_view const& replacement_values,
                                                    rmm::cuda_stream_view stream,
-                                                   rmm::mr::device_memory_resource* mr)
+                                                   rmm::device_async_resource_ref mr)
 {
   return detail::find_and_replace_all(input_col, values_to_replace, replacement_values, stream, mr);
 }
diff --git a/cpp/src/reshape/byte_cast.cu b/cpp/src/reshape/byte_cast.cu
index 6ed28e693fd..1b05a9744fa 100644
--- a/cpp/src/reshape/byte_cast.cu
+++ b/cpp/src/reshape/byte_cast.cu
@@ -27,6 +27,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/copy.h>
 #include <thrust/for_each.h>
@@ -56,7 +57,7 @@ struct byte_list_conversion_dispatcher {
   std::unique_ptr<column> operator()(column_view const& input,
                                      flip_endianness configuration,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr) const
+                                     rmm::device_async_resource_ref mr) const
   {
     return byte_list_conversion_fn<T>::invoke(input, configuration, stream, mr);
   }
@@ -67,7 +68,7 @@ struct byte_list_conversion_fn<T, std::enable_if_t<cudf::is_numeric<T>()>> {
   static std::unique_ptr<column> invoke(column_view const& input,
                                         flip_endianness configuration,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr)
+                                        rmm::device_async_resource_ref mr)
   {
     if (input.size() == 0) {
       return cudf::lists::detail::make_empty_lists_column(output_type, stream, mr);
@@ -124,7 +125,7 @@ struct byte_list_conversion_fn<T, std::enable_if_t<std::is_same_v<T, cudf::strin
   static std::unique_ptr<column> invoke(column_view const& input,
                                         flip_endianness,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr)
+                                        rmm::device_async_resource_ref mr)
   {
     if (input.size() == 0) {
       return cudf::lists::detail::make_empty_lists_column(output_type, stream, mr);
@@ -162,14 +163,14 @@ struct byte_list_conversion_fn<T, std::enable_if_t<std::is_same_v<T, cudf::strin
 }  // namespace
 
 /**
- * @copydoc cudf::byte_cast(column_view const&, flip_endianness, rmm::mr::device_memory_resource*)
+ * @copydoc cudf::byte_cast(column_view const&, flip_endianness, rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
 std::unique_ptr<column> byte_cast(column_view const& input,
                                   flip_endianness endian_configuration,
                                   rmm::cuda_stream_view stream,
-                                  rmm::mr::device_memory_resource* mr)
+                                  rmm::device_async_resource_ref mr)
 {
   return type_dispatcher(
     input.type(), byte_list_conversion_dispatcher{}, input, endian_configuration, stream, mr);
@@ -178,11 +179,11 @@ std::unique_ptr<column> byte_cast(column_view const& input,
 }  // namespace detail
 
 /**
- * @copydoc cudf::byte_cast(column_view const&, flip_endianness, rmm::mr::device_memory_resource*)
+ * @copydoc cudf::byte_cast(column_view const&, flip_endianness, rmm::device_async_resource_ref)
  */
 std::unique_ptr<column> byte_cast(column_view const& input,
                                   flip_endianness endian_configuration,
-                                  rmm::mr::device_memory_resource* mr)
+                                  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::byte_cast(input, endian_configuration, cudf::get_default_stream(), mr);
diff --git a/cpp/src/reshape/interleave_columns.cu b/cpp/src/reshape/interleave_columns.cu
index 72227ab5dda..3d1421120fd 100644
--- a/cpp/src/reshape/interleave_columns.cu
+++ b/cpp/src/reshape/interleave_columns.cu
@@ -30,6 +30,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/for_each.h>
@@ -55,7 +56,7 @@ struct interleave_columns_functor {
   std::unique_ptr<cudf::column> operator()(table_view const& input,
                                            bool create_mask,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
   {
     return interleave_columns_impl<T>{}(input, create_mask, stream, mr);
   }
@@ -66,7 +67,7 @@ struct interleave_columns_impl<T, std::enable_if_t<std::is_same_v<T, cudf::list_
   std::unique_ptr<column> operator()(table_view const& lists_columns,
                                      bool create_mask,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     return lists::detail::interleave_columns(lists_columns, create_mask, stream, mr);
   }
@@ -77,7 +78,7 @@ struct interleave_columns_impl<T, std::enable_if_t<std::is_same_v<T, cudf::struc
   std::unique_ptr<cudf::column> operator()(table_view const& structs_columns,
                                            bool create_mask,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
   {
     // We can safely call `column(0)` as the number of columns is known to be non zero.
     auto const num_children = structs_columns.column(0).num_children();
@@ -144,7 +145,7 @@ struct interleave_columns_impl<T, std::enable_if_t<std::is_same_v<T, cudf::strin
   std::unique_ptr<cudf::column> operator()(table_view const& strings_columns,
                                            bool create_mask,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
   {
     auto num_columns = strings_columns.num_columns();
     if (num_columns == 1)  // Single strings column returns a copy
@@ -226,7 +227,7 @@ struct interleave_columns_impl<T, std::enable_if_t<cudf::is_fixed_width<T>()>> {
   std::unique_ptr<cudf::column> operator()(table_view const& input,
                                            bool create_mask,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
   {
     auto arch_column = input.column(0);
     auto output_size = input.num_columns() * input.num_rows();
@@ -273,7 +274,7 @@ struct interleave_columns_impl<T, std::enable_if_t<cudf::is_fixed_width<T>()>> {
 
 std::unique_ptr<column> interleave_columns(table_view const& input,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(input.num_columns() > 0, "input must have at least one column to determine dtype.");
 
@@ -293,7 +294,7 @@ std::unique_ptr<column> interleave_columns(table_view const& input,
 }  // namespace detail
 
 std::unique_ptr<column> interleave_columns(table_view const& input,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::interleave_columns(input, cudf::get_default_stream(), mr);
diff --git a/cpp/src/reshape/tile.cu b/cpp/src/reshape/tile.cu
index 9d76c509333..1c4019b2c73 100644
--- a/cpp/src/reshape/tile.cu
+++ b/cpp/src/reshape/tile.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -25,6 +25,7 @@
 #include <cudf/utilities/error.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
@@ -44,7 +45,7 @@ namespace detail {
 std::unique_ptr<table> tile(table_view const& in,
                             size_type count,
                             rmm::cuda_stream_view stream,
-                            rmm::mr::device_memory_resource* mr)
+                            rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(count >= 0, "Count cannot be negative");
 
@@ -62,7 +63,7 @@ std::unique_ptr<table> tile(table_view const& in,
 
 std::unique_ptr<table> tile(table_view const& in,
                             size_type count,
-                            rmm::mr::device_memory_resource* mr)
+                            rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::tile(in, count, cudf::get_default_stream(), mr);
diff --git a/cpp/src/rolling/detail/lead_lag_nested.cuh b/cpp/src/rolling/detail/lead_lag_nested.cuh
index 66104fe5c77..269868910c7 100644
--- a/cpp/src/rolling/detail/lead_lag_nested.cuh
+++ b/cpp/src/rolling/detail/lead_lag_nested.cuh
@@ -27,6 +27,7 @@
 
 #include <rmm/exec_policy.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/binary_search.h>
@@ -94,7 +95,7 @@ std::unique_ptr<column> compute_lead_lag_for_nested(aggregation::Kind op,
                                                     FollowingIter following,
                                                     size_type row_offset,
                                                     rmm::cuda_stream_view stream,
-                                                    rmm::mr::device_memory_resource* mr)
+                                                    rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(op == aggregation::LEAD || op == aggregation::LAG,
                "Unexpected aggregation type in compute_lead_lag_for_nested");
diff --git a/cpp/src/rolling/detail/nth_element.cuh b/cpp/src/rolling/detail/nth_element.cuh
index bd3cbb39168..571f4c02cb5 100644
--- a/cpp/src/rolling/detail/nth_element.cuh
+++ b/cpp/src/rolling/detail/nth_element.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,6 +23,7 @@
 #include <cudf/utilities/bit.hpp>
 
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/copy.h>
 #include <thrust/execution_policy.h>
@@ -150,7 +151,7 @@ std::unique_ptr<column> nth_element(size_type n,
                                     FollowingIter following,
                                     size_type min_periods,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr)
+                                    rmm::device_async_resource_ref mr)
 {
   auto const gather_iter = cudf::detail::make_counting_transform_iterator(
     0,
diff --git a/cpp/src/rolling/detail/optimized_unbounded_window.cpp b/cpp/src/rolling/detail/optimized_unbounded_window.cpp
index f1a5c4c78a8..3e085fa963c 100644
--- a/cpp/src/rolling/detail/optimized_unbounded_window.cpp
+++ b/cpp/src/rolling/detail/optimized_unbounded_window.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -26,6 +26,8 @@
 #include <cudf/unary.hpp>
 #include <cudf/utilities/default_stream.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 namespace cudf::detail {
 
 bool can_optimize_unbounded_window(bool unbounded_preceding,
@@ -94,7 +96,7 @@ std::unique_ptr<column> aggregation_based_rolling_window(table_view const& group
                                                          column_view const& input,
                                                          rolling_aggregation const& aggr,
                                                          rmm::cuda_stream_view stream,
-                                                         rmm::mr::device_memory_resource* mr)
+                                                         rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(group_keys.num_columns() > 0,
                "Ungrouped rolling window not supported in aggregation path.");
@@ -127,7 +129,7 @@ std::unique_ptr<column> aggregation_based_rolling_window(table_view const& group
 std::unique_ptr<column> reduction_based_rolling_window(column_view const& input,
                                                        rolling_aggregation const& aggr,
                                                        rmm::cuda_stream_view stream,
-                                                       rmm::mr::device_memory_resource* mr)
+                                                       rmm::device_async_resource_ref mr)
 {
   auto const reduce_results = [&] {
     auto const return_dtype = cudf::detail::target_type(input.type(), aggr.kind);
@@ -152,7 +154,7 @@ std::unique_ptr<column> optimized_unbounded_window(table_view const& group_keys,
                                                    column_view const& input,
                                                    rolling_aggregation const& aggr,
                                                    rmm::cuda_stream_view stream,
-                                                   rmm::mr::device_memory_resource* mr)
+                                                   rmm::device_async_resource_ref mr)
 {
   return group_keys.num_columns() > 0
            ? aggregation_based_rolling_window(group_keys, input, aggr, stream, mr)
diff --git a/cpp/src/rolling/detail/optimized_unbounded_window.hpp b/cpp/src/rolling/detail/optimized_unbounded_window.hpp
index 5964390398c..153586b187f 100644
--- a/cpp/src/rolling/detail/optimized_unbounded_window.hpp
+++ b/cpp/src/rolling/detail/optimized_unbounded_window.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,6 +18,7 @@
 #include <cudf/types.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace rmm::mr {
 class device_memory_resource;
@@ -51,6 +52,6 @@ std::unique_ptr<column> optimized_unbounded_window(table_view const& group_keys,
                                                    column_view const& input,
                                                    rolling_aggregation const& aggr,
                                                    rmm::cuda_stream_view stream,
-                                                   rmm::mr::device_memory_resource* mr);
+                                                   rmm::device_async_resource_ref mr);
 }  // namespace detail
 }  // namespace cudf
diff --git a/cpp/src/rolling/detail/rolling.cuh b/cpp/src/rolling/detail/rolling.cuh
index af6d6d7f157..c18bb9d9885 100644
--- a/cpp/src/rolling/detail/rolling.cuh
+++ b/cpp/src/rolling/detail/rolling.cuh
@@ -50,6 +50,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/std/climits>
 #include <cuda/std/limits>
@@ -849,7 +850,7 @@ class rolling_aggregation_postprocessor final : public cudf::detail::aggregation
                                     int _min_periods,
                                     std::unique_ptr<column>&& _intermediate,
                                     rmm::cuda_stream_view _stream,
-                                    rmm::mr::device_memory_resource* _mr)
+                                    rmm::device_async_resource_ref _mr)
     :
 
       input(_input),
@@ -990,7 +991,7 @@ class rolling_aggregation_postprocessor final : public cudf::detail::aggregation
   std::unique_ptr<column> intermediate;
   std::unique_ptr<column> result;
   rmm::cuda_stream_view stream;
-  rmm::mr::device_memory_resource* mr;
+  rmm::device_async_resource_ref mr;
 };
 
 /**
@@ -1095,7 +1096,7 @@ struct rolling_window_launcher {
              int min_periods,
              [[maybe_unused]] rolling_aggregation const& agg,
              rmm::cuda_stream_view stream,
-             rmm::mr::device_memory_resource* mr)
+             rmm::device_async_resource_ref mr)
   {
     auto const do_rolling = [&](auto const& device_op) {
       auto output = make_fixed_width_column(
@@ -1164,7 +1165,7 @@ struct rolling_window_launcher {
              int,
              rolling_aggregation const&,
              rmm::cuda_stream_view,
-             rmm::mr::device_memory_resource*)
+             rmm::device_async_resource_ref)
   {
     CUDF_FAIL("Invalid aggregation type/pair");
   }
@@ -1188,7 +1189,7 @@ struct dispatch_rolling {
                                      size_type min_periods,
                                      rolling_aggregation const& agg,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     // do any preprocessing of aggregations (eg, MIN -> ARGMIN, COLLECT_LIST -> nothing)
     rolling_aggregation_preprocessor preprocessor;
@@ -1237,7 +1238,7 @@ std::unique_ptr<column> rolling_window_udf(column_view const& input,
                                            size_type min_periods,
                                            rolling_aggregation const& agg,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
 {
   static_assert(warp_size == cudf::detail::size_in_bits<cudf::bitmask_type>(),
                 "bitmask_type size does not match CUDA warp size");
@@ -1308,7 +1309,7 @@ std::unique_ptr<column> rolling_window_udf(column_view const& input,
  *                               FollowingWindowIterator following_window_begin,
  *                               size_type min_periods,
  *                               rolling_aggregation const& agg,
- *                               rmm::mr::device_memory_resource* mr)
+ *                               rmm::device_async_resource_ref mr)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
@@ -1320,7 +1321,7 @@ std::unique_ptr<column> rolling_window(column_view const& input,
                                        size_type min_periods,
                                        rolling_aggregation const& agg,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr)
+                                       rmm::device_async_resource_ref mr)
 {
   static_assert(warp_size == cudf::detail::size_in_bits<cudf::bitmask_type>(),
                 "bitmask_type size does not match CUDA warp size");
diff --git a/cpp/src/rolling/detail/rolling.hpp b/cpp/src/rolling/detail/rolling.hpp
index d2dfa2f9df5..2624d982712 100644
--- a/cpp/src/rolling/detail/rolling.hpp
+++ b/cpp/src/rolling/detail/rolling.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,8 @@
 #include <cudf/detail/utilities/device_operators.cuh>
 #include <cudf/utilities/traits.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 namespace cudf {
 // helper functions - used in the rolling window implementation and tests
 
@@ -64,7 +66,7 @@ struct rolling_store_output_functor<_T, true> {
  *                               size_type following_window,
  *                               size_type min_periods,
  *                               rolling_aggregation const& agg,
- *                               rmm::mr::device_memory_resource* mr)
+ *                               rmm::device_async_resource_ref mr)
  *
  * @param stream CUDA stream to use for device memory operations
  */
@@ -75,7 +77,7 @@ std::unique_ptr<column> rolling_window(column_view const& input,
                                        size_type min_periods,
                                        rolling_aggregation const& agg,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr);
+                                       rmm::device_async_resource_ref mr);
 
 /**
  * @copydoc cudf::rolling_window(column_view const& input,
@@ -83,7 +85,7 @@ std::unique_ptr<column> rolling_window(column_view const& input,
  *                               column_view const& following_window,
  *                               size_type min_periods,
  *                               rolling_aggregation const& agg,
- *                               rmm::mr::device_memory_resource* mr);
+ *                               rmm::device_async_resource_ref mr);
  *
  * @param stream CUDA stream to use for device memory operations
  */
@@ -93,7 +95,7 @@ std::unique_ptr<column> rolling_window(column_view const& input,
                                        size_type min_periods,
                                        rolling_aggregation const& agg,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr);
+                                       rmm::device_async_resource_ref mr);
 }  // namespace detail
 
 }  // namespace cudf
diff --git a/cpp/src/rolling/detail/rolling_collect_list.cu b/cpp/src/rolling/detail/rolling_collect_list.cu
index 85dced0efe3..b259bd51fc4 100644
--- a/cpp/src/rolling/detail/rolling_collect_list.cu
+++ b/cpp/src/rolling/detail/rolling_collect_list.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
 #include <cudf/detail/iterator.cuh>
 
 #include <rmm/device_uvector.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/copy.h>
 #include <thrust/count.h>
@@ -114,7 +115,7 @@ std::pair<std::unique_ptr<column>, std::unique_ptr<column>> purge_null_entries(
   column_view const& offsets,
   size_type num_child_nulls,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   auto input_device_view = column_device_view::create(input, stream);
 
diff --git a/cpp/src/rolling/detail/rolling_collect_list.cuh b/cpp/src/rolling/detail/rolling_collect_list.cuh
index 0ce14792cfa..7630898f820 100644
--- a/cpp/src/rolling/detail/rolling_collect_list.cuh
+++ b/cpp/src/rolling/detail/rolling_collect_list.cuh
@@ -24,6 +24,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/extrema.h>
@@ -50,7 +51,7 @@ std::unique_ptr<column> create_collect_offsets(size_type input_size,
                                                FollowingIter following_begin,
                                                size_type min_periods,
                                                rmm::cuda_stream_view stream,
-                                               rmm::mr::device_memory_resource* mr)
+                                               rmm::device_async_resource_ref mr)
 {
   // Materialize offsets column.
   auto static constexpr size_data_type = data_type{type_to_id<size_type>()};
@@ -148,7 +149,7 @@ std::pair<std::unique_ptr<column>, std::unique_ptr<column>> purge_null_entries(
   column_view const& offsets,
   size_type num_child_nulls,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr);
+  rmm::device_async_resource_ref mr);
 
 template <typename PrecedingIter, typename FollowingIter>
 std::unique_ptr<column> rolling_collect_list(column_view const& input,
@@ -158,7 +159,7 @@ std::unique_ptr<column> rolling_collect_list(column_view const& input,
                                              size_type min_periods,
                                              null_policy null_handling,
                                              rmm::cuda_stream_view stream,
-                                             rmm::mr::device_memory_resource* mr)
+                                             rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(default_outputs.is_empty(),
                "COLLECT_LIST window function does not support default values.");
diff --git a/cpp/src/rolling/detail/rolling_fixed_window.cu b/cpp/src/rolling/detail/rolling_fixed_window.cu
index f51937f7a0e..df0e72748ce 100644
--- a/cpp/src/rolling/detail/rolling_fixed_window.cu
+++ b/cpp/src/rolling/detail/rolling_fixed_window.cu
@@ -21,6 +21,8 @@
 #include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/utilities/default_stream.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 #include <cuda/functional>
 #include <thrust/extrema.h>
 
@@ -34,7 +36,7 @@ std::unique_ptr<column> rolling_window(column_view const& input,
                                        size_type min_periods,
                                        rolling_aggregation const& agg,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr)
+                                       rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
 
diff --git a/cpp/src/rolling/detail/rolling_variable_window.cu b/cpp/src/rolling/detail/rolling_variable_window.cu
index bb73f305c7b..83e8faec291 100644
--- a/cpp/src/rolling/detail/rolling_variable_window.cu
+++ b/cpp/src/rolling/detail/rolling_variable_window.cu
@@ -19,6 +19,8 @@
 #include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/utilities/default_stream.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 #include <cuda/functional>
 #include <thrust/extrema.h>
 #include <thrust/iterator/constant_iterator.h>
@@ -32,7 +34,7 @@ std::unique_ptr<column> rolling_window(column_view const& input,
                                        size_type min_periods,
                                        rolling_aggregation const& agg,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr)
+                                       rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
 
diff --git a/cpp/src/rolling/grouped_rolling.cu b/cpp/src/rolling/grouped_rolling.cu
index 89a51ad1d87..d461ed7a109 100644
--- a/cpp/src/rolling/grouped_rolling.cu
+++ b/cpp/src/rolling/grouped_rolling.cu
@@ -29,6 +29,8 @@
 #include <cudf/unary.hpp>
 #include <cudf/utilities/default_stream.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 #include <cuda/functional>
 #include <thrust/binary_search.h>
 #include <thrust/execution_policy.h>
@@ -44,7 +46,7 @@ std::unique_ptr<column> grouped_rolling_window(table_view const& group_keys,
                                                size_type following_window,
                                                size_type min_periods,
                                                rolling_aggregation const& aggr,
-                                               rmm::mr::device_memory_resource* mr)
+                                               rmm::device_async_resource_ref mr)
 {
   return grouped_rolling_window(group_keys,
                                 input,
@@ -61,7 +63,7 @@ std::unique_ptr<column> grouped_rolling_window(table_view const& group_keys,
                                                window_bounds following_window,
                                                size_type min_periods,
                                                rolling_aggregation const& aggr,
-                                               rmm::mr::device_memory_resource* mr)
+                                               rmm::device_async_resource_ref mr)
 {
   return grouped_rolling_window(group_keys,
                                 input,
@@ -80,7 +82,7 @@ std::unique_ptr<column> grouped_rolling_window(table_view const& group_keys,
                                                size_type following_window,
                                                size_type min_periods,
                                                rolling_aggregation const& aggr,
-                                               rmm::mr::device_memory_resource* mr)
+                                               rmm::device_async_resource_ref mr)
 {
   return grouped_rolling_window(group_keys,
                                 input,
@@ -205,7 +207,7 @@ std::unique_ptr<column> grouped_rolling_window(table_view const& group_keys,
                                                size_type min_periods,
                                                rolling_aggregation const& aggr,
                                                rmm::cuda_stream_view stream,
-                                               rmm::mr::device_memory_resource* mr)
+                                               rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
 
@@ -304,7 +306,7 @@ std::unique_ptr<column> grouped_rolling_window(table_view const& group_keys,
                                                window_bounds following_window_bounds,
                                                size_type min_periods,
                                                rolling_aggregation const& aggr,
-                                               rmm::mr::device_memory_resource* mr)
+                                               rmm::device_async_resource_ref mr)
 {
   return detail::grouped_rolling_window(group_keys,
                                         input,
@@ -439,7 +441,7 @@ std::unique_ptr<column> range_window_ASC(column_view const& input,
                                          size_type min_periods,
                                          rolling_aggregation const& aggr,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr)
+                                         rmm::device_async_resource_ref mr)
 {
   auto [h_nulls_begin_idx, h_nulls_end_idx] = get_null_bounds_for_orderby_column(orderby_column);
   auto const p_orderby_device_view = cudf::column_device_view::create(orderby_column, stream);
@@ -614,7 +616,7 @@ std::unique_ptr<column> range_window_ASC(column_view const& input,
                                          size_type min_periods,
                                          rolling_aggregation const& aggr,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr)
+                                         rmm::device_async_resource_ref mr)
 {
   auto [null_start, null_end] =
     get_null_bounds_for_orderby_column(orderby_column, group_offsets, stream);
@@ -728,7 +730,7 @@ std::unique_ptr<column> range_window_DESC(column_view const& input,
                                           size_type min_periods,
                                           rolling_aggregation const& aggr,
                                           rmm::cuda_stream_view stream,
-                                          rmm::mr::device_memory_resource* mr)
+                                          rmm::device_async_resource_ref mr)
 {
   auto [h_nulls_begin_idx, h_nulls_end_idx] = get_null_bounds_for_orderby_column(orderby_column);
   auto const p_orderby_device_view = cudf::column_device_view::create(orderby_column, stream);
@@ -823,7 +825,7 @@ std::unique_ptr<column> range_window_DESC(column_view const& input,
                                           size_type min_periods,
                                           rolling_aggregation const& aggr,
                                           rmm::cuda_stream_view stream,
-                                          rmm::mr::device_memory_resource* mr)
+                                          rmm::device_async_resource_ref mr)
 {
   auto [null_start, null_end] =
     get_null_bounds_for_orderby_column(orderby_column, group_offsets, stream);
@@ -935,7 +937,7 @@ std::unique_ptr<column> grouped_range_rolling_window_impl(
   size_type min_periods,
   rolling_aggregation const& aggr,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   auto [preceding_value, following_value] = [&] {
     if constexpr (std::is_same_v<OrderByT, cudf::string_view>) {
@@ -1024,7 +1026,7 @@ struct dispatch_grouped_range_rolling_window {
              size_type min_periods,
              rolling_aggregation const& aggr,
              rmm::cuda_stream_view stream,
-             rmm::mr::device_memory_resource* mr) const
+             rmm::device_async_resource_ref mr) const
   {
     return grouped_range_rolling_window_impl<OrderByColumnType>(input,
                                                                 orderby_column,
@@ -1120,7 +1122,7 @@ namespace detail {
  *               range_window_bounds const& following,
  *               size_type min_periods,
  *               rolling_aggregation const& aggr,
- *               rmm::mr::device_memory_resource* mr );
+ *               rmm::device_async_resource_ref mr );
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
@@ -1133,7 +1135,7 @@ std::unique_ptr<column> grouped_range_rolling_window(table_view const& group_key
                                                      size_type min_periods,
                                                      rolling_aggregation const& aggr,
                                                      rmm::cuda_stream_view stream,
-                                                     rmm::mr::device_memory_resource* mr)
+                                                     rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
 
@@ -1187,7 +1189,7 @@ std::unique_ptr<column> grouped_range_rolling_window(table_view const& group_key
  *              size_type following_window_in_days,
  *              size_type min_periods,
  *              rolling_aggregation const& aggr,
- *              rmm::mr::device_memory_resource* mr);
+ *              rmm::device_async_resource_ref mr);
  */
 std::unique_ptr<column> grouped_time_range_rolling_window(table_view const& group_keys,
                                                           column_view const& timestamp_column,
@@ -1197,7 +1199,7 @@ std::unique_ptr<column> grouped_time_range_rolling_window(table_view const& grou
                                                           size_type following_window_in_days,
                                                           size_type min_periods,
                                                           rolling_aggregation const& aggr,
-                                                          rmm::mr::device_memory_resource* mr)
+                                                          rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   auto preceding = to_range_bounds(preceding_window_in_days, timestamp_column.type());
@@ -1225,7 +1227,7 @@ std::unique_ptr<column> grouped_time_range_rolling_window(table_view const& grou
  *            window_bounds following_window_in_days,
  *            size_type min_periods,
  *            rolling_aggregation const& aggr,
- *            rmm::mr::device_memory_resource* mr);
+ *            rmm::device_async_resource_ref mr);
  */
 std::unique_ptr<column> grouped_time_range_rolling_window(table_view const& group_keys,
                                                           column_view const& timestamp_column,
@@ -1235,7 +1237,7 @@ std::unique_ptr<column> grouped_time_range_rolling_window(table_view const& grou
                                                           window_bounds following_window_in_days,
                                                           size_type min_periods,
                                                           rolling_aggregation const& aggr,
-                                                          rmm::mr::device_memory_resource* mr)
+                                                          rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   range_window_bounds preceding =
@@ -1265,7 +1267,7 @@ std::unique_ptr<column> grouped_time_range_rolling_window(table_view const& grou
  *               range_window_bounds const& following,
  *               size_type min_periods,
  *               rolling_aggregation const& aggr,
- *               rmm::mr::device_memory_resource* mr );
+ *               rmm::device_async_resource_ref mr );
  */
 std::unique_ptr<column> grouped_range_rolling_window(table_view const& group_keys,
                                                      column_view const& timestamp_column,
@@ -1275,7 +1277,7 @@ std::unique_ptr<column> grouped_range_rolling_window(table_view const& group_key
                                                      range_window_bounds const& following,
                                                      size_type min_periods,
                                                      rolling_aggregation const& aggr,
-                                                     rmm::mr::device_memory_resource* mr)
+                                                     rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::grouped_range_rolling_window(group_keys,
diff --git a/cpp/src/rolling/rolling.cu b/cpp/src/rolling/rolling.cu
index 5c78cc4382d..a308ed8a7a6 100644
--- a/cpp/src/rolling/rolling.cu
+++ b/cpp/src/rolling/rolling.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,8 @@
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/utilities/default_stream.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 namespace cudf {
 
 // Applies a fixed-size rolling window function to the values in a column, with default output
@@ -30,7 +32,7 @@ std::unique_ptr<column> rolling_window(column_view const& input,
                                        size_type following_window,
                                        size_type min_periods,
                                        rolling_aggregation const& agg,
-                                       rmm::mr::device_memory_resource* mr)
+                                       rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::rolling_window(input,
@@ -49,7 +51,7 @@ std::unique_ptr<column> rolling_window(column_view const& input,
                                        size_type following_window,
                                        size_type min_periods,
                                        rolling_aggregation const& agg,
-                                       rmm::mr::device_memory_resource* mr)
+                                       rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   auto defaults =
@@ -70,7 +72,7 @@ std::unique_ptr<column> rolling_window(column_view const& input,
                                        column_view const& following_window,
                                        size_type min_periods,
                                        rolling_aggregation const& agg,
-                                       rmm::mr::device_memory_resource* mr)
+                                       rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::rolling_window(
diff --git a/cpp/src/round/round.cu b/cpp/src/round/round.cu
index 8336e1ef2b0..369ed039b66 100644
--- a/cpp/src/round/round.cu
+++ b/cpp/src/round/round.cu
@@ -34,6 +34,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/transform.h>
 #include <thrust/uninitialized_fill.h>
@@ -213,7 +214,7 @@ template <typename T,
 std::unique_ptr<column> round_with(column_view const& input,
                                    int32_t decimal_places,
                                    rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr)
+                                   rmm::device_async_resource_ref mr)
 {
   using Functor = RoundFunctor<T>;
 
@@ -245,7 +246,7 @@ template <typename T,
 std::unique_ptr<column> round_with(column_view const& input,
                                    int32_t decimal_places,
                                    rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr)
+                                   rmm::device_async_resource_ref mr)
 {
   using namespace numeric;
   using Type                   = device_storage_type_t<T>;
@@ -309,7 +310,7 @@ struct round_type_dispatcher {
     int32_t decimal_places,
     cudf::rounding_method method,
     rmm::cuda_stream_view stream,
-    rmm::mr::device_memory_resource* mr)
+    rmm::device_async_resource_ref mr)
   {
     // clang-format off
     switch (method) {
@@ -335,7 +336,7 @@ std::unique_ptr<column> round(column_view const& input,
                               int32_t decimal_places,
                               cudf::rounding_method method,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr)
+                              rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(cudf::is_numeric(input.type()) || cudf::is_fixed_point(input.type()),
                "Only integral/floating point/fixed point currently supported.");
@@ -357,7 +358,7 @@ std::unique_ptr<column> round(column_view const& input,
 std::unique_ptr<column> round(column_view const& input,
                               int32_t decimal_places,
                               rounding_method method,
-                              rmm::mr::device_memory_resource* mr)
+                              rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::round(input, decimal_places, method, cudf::get_default_stream(), mr);
diff --git a/cpp/src/scalar/scalar.cpp b/cpp/src/scalar/scalar.cpp
index 2fa008d9062..07425a92413 100644
--- a/cpp/src/scalar/scalar.cpp
+++ b/cpp/src/scalar/scalar.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,6 +24,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 
@@ -34,14 +35,12 @@ namespace cudf {
 scalar::scalar(data_type type,
                bool is_valid,
                rmm::cuda_stream_view stream,
-               rmm::mr::device_memory_resource* mr)
+               rmm::device_async_resource_ref mr)
   : _type(type), _is_valid(is_valid, stream, mr)
 {
 }
 
-scalar::scalar(scalar const& other,
-               rmm::cuda_stream_view stream,
-               rmm::mr::device_memory_resource* mr)
+scalar::scalar(scalar const& other, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)
   : _type(other.type()), _is_valid(other._is_valid, stream, mr)
 {
 }
@@ -62,7 +61,7 @@ bool const* scalar::validity_data() const { return _is_valid.data(); }
 string_scalar::string_scalar(std::string const& string,
                              bool is_valid,
                              rmm::cuda_stream_view stream,
-                             rmm::mr::device_memory_resource* mr)
+                             rmm::device_async_resource_ref mr)
   : scalar(data_type(type_id::STRING), is_valid, stream, mr),
     _data(string.data(), string.size(), stream, mr)
 {
@@ -74,7 +73,7 @@ string_scalar::string_scalar(std::string const& string,
 
 string_scalar::string_scalar(string_scalar const& other,
                              rmm::cuda_stream_view stream,
-                             rmm::mr::device_memory_resource* mr)
+                             rmm::device_async_resource_ref mr)
   : scalar(other, stream, mr), _data(other._data, stream, mr)
 {
 }
@@ -82,7 +81,7 @@ string_scalar::string_scalar(string_scalar const& other,
 string_scalar::string_scalar(rmm::device_scalar<value_type>& data,
                              bool is_valid,
                              rmm::cuda_stream_view stream,
-                             rmm::mr::device_memory_resource* mr)
+                             rmm::device_async_resource_ref mr)
   : string_scalar(data.value(stream), is_valid, stream, mr)
 {
 }
@@ -90,7 +89,7 @@ string_scalar::string_scalar(rmm::device_scalar<value_type>& data,
 string_scalar::string_scalar(value_type const& source,
                              bool is_valid,
                              rmm::cuda_stream_view stream,
-                             rmm::mr::device_memory_resource* mr)
+                             rmm::device_async_resource_ref mr)
   : scalar(data_type(type_id::STRING), is_valid, stream, mr),
     _data(source.data(), source.size_bytes(), stream, mr)
 {
@@ -99,7 +98,7 @@ string_scalar::string_scalar(value_type const& source,
 string_scalar::string_scalar(rmm::device_buffer&& data,
                              bool is_valid,
                              rmm::cuda_stream_view stream,
-                             rmm::mr::device_memory_resource* mr)
+                             rmm::device_async_resource_ref mr)
   : scalar(data_type(type_id::STRING), is_valid, stream, mr), _data(std::move(data))
 {
 }
@@ -130,7 +129,7 @@ fixed_point_scalar<T>::fixed_point_scalar(rep_type value,
                                           numeric::scale_type scale,
                                           bool is_valid,
                                           rmm::cuda_stream_view stream,
-                                          rmm::mr::device_memory_resource* mr)
+                                          rmm::device_async_resource_ref mr)
   : scalar{data_type{type_to_id<T>(), static_cast<int32_t>(scale)}, is_valid, stream, mr},
     _data{value, stream, mr}
 {
@@ -140,7 +139,7 @@ template <typename T>
 fixed_point_scalar<T>::fixed_point_scalar(rep_type value,
                                           bool is_valid,
                                           rmm::cuda_stream_view stream,
-                                          rmm::mr::device_memory_resource* mr)
+                                          rmm::device_async_resource_ref mr)
   : scalar{data_type{type_to_id<T>(), 0}, is_valid, stream, mr}, _data{value, stream, mr}
 {
 }
@@ -149,7 +148,7 @@ template <typename T>
 fixed_point_scalar<T>::fixed_point_scalar(T value,
                                           bool is_valid,
                                           rmm::cuda_stream_view stream,
-                                          rmm::mr::device_memory_resource* mr)
+                                          rmm::device_async_resource_ref mr)
   : scalar{data_type{type_to_id<T>(), value.scale()}, is_valid, stream, mr},
     _data{value.value(), stream, mr}
 {
@@ -160,7 +159,7 @@ fixed_point_scalar<T>::fixed_point_scalar(rmm::device_scalar<rep_type>&& data,
                                           numeric::scale_type scale,
                                           bool is_valid,
                                           rmm::cuda_stream_view stream,
-                                          rmm::mr::device_memory_resource* mr)
+                                          rmm::device_async_resource_ref mr)
   : scalar{data_type{type_to_id<T>(), scale}, is_valid, stream, mr}, _data{std::move(data)}
 {
 }
@@ -168,7 +167,7 @@ fixed_point_scalar<T>::fixed_point_scalar(rmm::device_scalar<rep_type>&& data,
 template <typename T>
 fixed_point_scalar<T>::fixed_point_scalar(fixed_point_scalar<T> const& other,
                                           rmm::cuda_stream_view stream,
-                                          rmm::mr::device_memory_resource* mr)
+                                          rmm::device_async_resource_ref mr)
   : scalar{other, stream, mr}, _data(other._data, stream, mr)
 {
 }
@@ -223,7 +222,7 @@ template <typename T>
 fixed_width_scalar<T>::fixed_width_scalar(T value,
                                           bool is_valid,
                                           rmm::cuda_stream_view stream,
-                                          rmm::mr::device_memory_resource* mr)
+                                          rmm::device_async_resource_ref mr)
   : scalar(data_type(type_to_id<T>()), is_valid, stream, mr), _data(value, stream, mr)
 {
 }
@@ -232,7 +231,7 @@ template <typename T>
 fixed_width_scalar<T>::fixed_width_scalar(rmm::device_scalar<T>&& data,
                                           bool is_valid,
                                           rmm::cuda_stream_view stream,
-                                          rmm::mr::device_memory_resource* mr)
+                                          rmm::device_async_resource_ref mr)
   : scalar(data_type(type_to_id<T>()), is_valid, stream, mr), _data{std::move(data)}
 {
 }
@@ -240,7 +239,7 @@ fixed_width_scalar<T>::fixed_width_scalar(rmm::device_scalar<T>&& data,
 template <typename T>
 fixed_width_scalar<T>::fixed_width_scalar(fixed_width_scalar<T> const& other,
                                           rmm::cuda_stream_view stream,
-                                          rmm::mr::device_memory_resource* mr)
+                                          rmm::device_async_resource_ref mr)
   : scalar{other, stream, mr}, _data(other._data, stream, mr)
 {
 }
@@ -313,7 +312,7 @@ template <typename T>
 numeric_scalar<T>::numeric_scalar(T value,
                                   bool is_valid,
                                   rmm::cuda_stream_view stream,
-                                  rmm::mr::device_memory_resource* mr)
+                                  rmm::device_async_resource_ref mr)
   : detail::fixed_width_scalar<T>(value, is_valid, stream, mr)
 {
 }
@@ -322,7 +321,7 @@ template <typename T>
 numeric_scalar<T>::numeric_scalar(rmm::device_scalar<T>&& data,
                                   bool is_valid,
                                   rmm::cuda_stream_view stream,
-                                  rmm::mr::device_memory_resource* mr)
+                                  rmm::device_async_resource_ref mr)
   : detail::fixed_width_scalar<T>(std::forward<rmm::device_scalar<T>>(data), is_valid, stream, mr)
 {
 }
@@ -330,7 +329,7 @@ numeric_scalar<T>::numeric_scalar(rmm::device_scalar<T>&& data,
 template <typename T>
 numeric_scalar<T>::numeric_scalar(numeric_scalar<T> const& other,
                                   rmm::cuda_stream_view stream,
-                                  rmm::mr::device_memory_resource* mr)
+                                  rmm::device_async_resource_ref mr)
   : detail::fixed_width_scalar<T>{other, stream, mr}
 {
 }
@@ -360,7 +359,7 @@ template <typename T>
 chrono_scalar<T>::chrono_scalar(T value,
                                 bool is_valid,
                                 rmm::cuda_stream_view stream,
-                                rmm::mr::device_memory_resource* mr)
+                                rmm::device_async_resource_ref mr)
   : detail::fixed_width_scalar<T>(value, is_valid, stream, mr)
 {
 }
@@ -369,7 +368,7 @@ template <typename T>
 chrono_scalar<T>::chrono_scalar(rmm::device_scalar<T>&& data,
                                 bool is_valid,
                                 rmm::cuda_stream_view stream,
-                                rmm::mr::device_memory_resource* mr)
+                                rmm::device_async_resource_ref mr)
   : detail::fixed_width_scalar<T>(std::forward<rmm::device_scalar<T>>(data), is_valid, stream, mr)
 {
 }
@@ -377,7 +376,7 @@ chrono_scalar<T>::chrono_scalar(rmm::device_scalar<T>&& data,
 template <typename T>
 chrono_scalar<T>::chrono_scalar(chrono_scalar<T> const& other,
                                 rmm::cuda_stream_view stream,
-                                rmm::mr::device_memory_resource* mr)
+                                rmm::device_async_resource_ref mr)
   : detail::fixed_width_scalar<T>{other, stream, mr}
 {
 }
@@ -405,7 +404,7 @@ template <typename T>
 duration_scalar<T>::duration_scalar(rep_type value,
                                     bool is_valid,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr)
+                                    rmm::device_async_resource_ref mr)
   : chrono_scalar<T>(T{value}, is_valid, stream, mr)
 {
 }
@@ -413,7 +412,7 @@ duration_scalar<T>::duration_scalar(rep_type value,
 template <typename T>
 duration_scalar<T>::duration_scalar(duration_scalar<T> const& other,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr)
+                                    rmm::device_async_resource_ref mr)
   : chrono_scalar<T>{other, stream, mr}
 {
 }
@@ -464,7 +463,7 @@ template <typename D>
 timestamp_scalar<T>::timestamp_scalar(D const& value,
                                       bool is_valid,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr)
+                                      rmm::device_async_resource_ref mr)
   : chrono_scalar<T>(T{typename T::duration{value}}, is_valid, stream, mr)
 {
 }
@@ -472,14 +471,14 @@ timestamp_scalar<T>::timestamp_scalar(D const& value,
 template <typename T>
 timestamp_scalar<T>::timestamp_scalar(timestamp_scalar<T> const& other,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr)
+                                      rmm::device_async_resource_ref mr)
   : chrono_scalar<T>{other, stream, mr}
 {
 }
 
 #define TS_CTOR(TimestampType, DurationType)                  \
   template timestamp_scalar<TimestampType>::timestamp_scalar( \
-    DurationType const&, bool, rmm::cuda_stream_view, rmm::mr::device_memory_resource*);
+    DurationType const&, bool, rmm::cuda_stream_view, rmm::device_async_resource_ref);
 
 /**
  * @brief These are the valid combinations of duration types to timestamp types.
@@ -508,7 +507,7 @@ TS_CTOR(timestamp_ns, int64_t)
 list_scalar::list_scalar(cudf::column_view const& data,
                          bool is_valid,
                          rmm::cuda_stream_view stream,
-                         rmm::mr::device_memory_resource* mr)
+                         rmm::device_async_resource_ref mr)
   : scalar(data_type(type_id::LIST), is_valid, stream, mr), _data(data, stream, mr)
 {
 }
@@ -516,14 +515,14 @@ list_scalar::list_scalar(cudf::column_view const& data,
 list_scalar::list_scalar(cudf::column&& data,
                          bool is_valid,
                          rmm::cuda_stream_view stream,
-                         rmm::mr::device_memory_resource* mr)
+                         rmm::device_async_resource_ref mr)
   : scalar(data_type(type_id::LIST), is_valid, stream, mr), _data(std::move(data))
 {
 }
 
 list_scalar::list_scalar(list_scalar const& other,
                          rmm::cuda_stream_view stream,
-                         rmm::mr::device_memory_resource* mr)
+                         rmm::device_async_resource_ref mr)
   : scalar{other, stream, mr}, _data(other._data, stream, mr)
 {
 }
@@ -532,7 +531,7 @@ column_view list_scalar::view() const { return _data.view(); }
 
 struct_scalar::struct_scalar(struct_scalar const& other,
                              rmm::cuda_stream_view stream,
-                             rmm::mr::device_memory_resource* mr)
+                             rmm::device_async_resource_ref mr)
   : scalar{other, stream, mr}, _data(other._data, stream, mr)
 {
 }
@@ -540,7 +539,7 @@ struct_scalar::struct_scalar(struct_scalar const& other,
 struct_scalar::struct_scalar(table_view const& data,
                              bool is_valid,
                              rmm::cuda_stream_view stream,
-                             rmm::mr::device_memory_resource* mr)
+                             rmm::device_async_resource_ref mr)
   : scalar(data_type(type_id::STRUCT), is_valid, stream, mr),
     _data{init_data(table{data, stream, mr}, is_valid, stream, mr)}
 {
@@ -550,7 +549,7 @@ struct_scalar::struct_scalar(table_view const& data,
 struct_scalar::struct_scalar(host_span<column_view const> data,
                              bool is_valid,
                              rmm::cuda_stream_view stream,
-                             rmm::mr::device_memory_resource* mr)
+                             rmm::device_async_resource_ref mr)
   : scalar(data_type(type_id::STRUCT), is_valid, stream, mr),
     _data{
       init_data(table{table_view{std::vector<column_view>{data.begin(), data.end()}}, stream, mr},
@@ -564,7 +563,7 @@ struct_scalar::struct_scalar(host_span<column_view const> data,
 struct_scalar::struct_scalar(table&& data,
                              bool is_valid,
                              rmm::cuda_stream_view stream,
-                             rmm::mr::device_memory_resource* mr)
+                             rmm::device_async_resource_ref mr)
   : scalar(data_type(type_id::STRUCT), is_valid, stream, mr),
     _data{init_data(std::move(data), is_valid, stream, mr)}
 {
@@ -584,7 +583,7 @@ void struct_scalar::assert_valid_size()
 table struct_scalar::init_data(table&& data,
                                bool is_valid,
                                rmm::cuda_stream_view stream,
-                               rmm::mr::device_memory_resource* mr)
+                               rmm::device_async_resource_ref mr)
 {
   if (is_valid) { return std::move(data); }
 
diff --git a/cpp/src/scalar/scalar_factories.cpp b/cpp/src/scalar/scalar_factories.cpp
index 2336b9075de..d59c5c9fc85 100644
--- a/cpp/src/scalar/scalar_factories.cpp
+++ b/cpp/src/scalar/scalar_factories.cpp
@@ -23,6 +23,7 @@
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace {
@@ -31,7 +32,7 @@ struct scalar_construction_helper {
             typename ScalarType                                                = scalar_type_t<T>,
             std::enable_if_t<is_fixed_width<T>() and not is_fixed_point<T>()>* = nullptr>
   std::unique_ptr<scalar> operator()(rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr) const
+                                     rmm::device_async_resource_ref mr) const
   {
     using Type = device_storage_type_t<T>;
     auto s     = new ScalarType(Type{}, false, stream, mr);
@@ -42,7 +43,7 @@ struct scalar_construction_helper {
             typename ScalarType                    = scalar_type_t<T>,
             std::enable_if_t<is_fixed_point<T>()>* = nullptr>
   std::unique_ptr<scalar> operator()(rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr) const
+                                     rmm::device_async_resource_ref mr) const
   {
     using Type = device_storage_type_t<T>;
     auto s     = new ScalarType(Type{}, numeric::scale_type{0}, false, stream, mr);
@@ -60,7 +61,7 @@ struct scalar_construction_helper {
 // Allocate storage for a single numeric element
 std::unique_ptr<scalar> make_numeric_scalar(data_type type,
                                             rmm::cuda_stream_view stream,
-                                            rmm::mr::device_memory_resource* mr)
+                                            rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(is_numeric(type), "Invalid, non-numeric type.");
 
@@ -70,7 +71,7 @@ std::unique_ptr<scalar> make_numeric_scalar(data_type type,
 // Allocate storage for a single timestamp element
 std::unique_ptr<scalar> make_timestamp_scalar(data_type type,
                                               rmm::cuda_stream_view stream,
-                                              rmm::mr::device_memory_resource* mr)
+                                              rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(is_timestamp(type), "Invalid, non-timestamp type.");
 
@@ -80,7 +81,7 @@ std::unique_ptr<scalar> make_timestamp_scalar(data_type type,
 // Allocate storage for a single duration element
 std::unique_ptr<scalar> make_duration_scalar(data_type type,
                                              rmm::cuda_stream_view stream,
-                                             rmm::mr::device_memory_resource* mr)
+                                             rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(is_duration(type), "Invalid, non-duration type.");
 
@@ -90,7 +91,7 @@ std::unique_ptr<scalar> make_duration_scalar(data_type type,
 // Allocate storage for a single fixed width element
 std::unique_ptr<scalar> make_fixed_width_scalar(data_type type,
                                                 rmm::cuda_stream_view stream,
-                                                rmm::mr::device_memory_resource* mr)
+                                                rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(is_fixed_width(type), "Invalid, non-fixed-width type.");
 
@@ -99,21 +100,21 @@ std::unique_ptr<scalar> make_fixed_width_scalar(data_type type,
 
 std::unique_ptr<scalar> make_list_scalar(column_view elements,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr)
+                                         rmm::device_async_resource_ref mr)
 {
   return std::make_unique<list_scalar>(elements, true, stream, mr);
 }
 
 std::unique_ptr<scalar> make_struct_scalar(table_view const& data,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
 {
   return std::make_unique<struct_scalar>(data, true, stream, mr);
 }
 
 std::unique_ptr<scalar> make_struct_scalar(host_span<column_view const> data,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
 {
   return std::make_unique<struct_scalar>(data, true, stream, mr);
 }
@@ -124,14 +125,14 @@ struct default_scalar_functor {
 
   template <typename T, std::enable_if_t<not is_fixed_point<T>()>* = nullptr>
   std::unique_ptr<cudf::scalar> operator()(rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
   {
     return make_fixed_width_scalar(data_type(type_to_id<T>()), stream, mr);
   }
 
   template <typename T, std::enable_if_t<is_fixed_point<T>()>* = nullptr>
   std::unique_ptr<cudf::scalar> operator()(rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
   {
     auto const scale_ = numeric::scale_type{type.scale()};
     auto s            = make_fixed_point_scalar<T>(0, scale_, stream, mr);
@@ -142,28 +143,28 @@ struct default_scalar_functor {
 
 template <>
 std::unique_ptr<cudf::scalar> default_scalar_functor::operator()<string_view>(
-  rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr)
+  rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)
 {
   return std::unique_ptr<scalar>(new string_scalar("", false, stream, mr));
 }
 
 template <>
 std::unique_ptr<cudf::scalar> default_scalar_functor::operator()<dictionary32>(
-  rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr)
+  rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)
 {
   CUDF_FAIL("dictionary type not supported");
 }
 
 template <>
 std::unique_ptr<cudf::scalar> default_scalar_functor::operator()<list_view>(
-  rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr)
+  rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)
 {
   CUDF_FAIL("list_view type not supported");
 }
 
 template <>
 std::unique_ptr<cudf::scalar> default_scalar_functor::operator()<struct_view>(
-  rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr)
+  rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)
 {
   CUDF_FAIL("struct_view type not supported");
 }
@@ -172,14 +173,14 @@ std::unique_ptr<cudf::scalar> default_scalar_functor::operator()<struct_view>(
 
 std::unique_ptr<scalar> make_default_constructed_scalar(data_type type,
                                                         rmm::cuda_stream_view stream,
-                                                        rmm::mr::device_memory_resource* mr)
+                                                        rmm::device_async_resource_ref mr)
 {
   return type_dispatcher(type, default_scalar_functor{type}, stream, mr);
 }
 
 std::unique_ptr<scalar> make_empty_scalar_like(column_view const& column,
                                                rmm::cuda_stream_view stream,
-                                               rmm::mr::device_memory_resource* mr)
+                                               rmm::device_async_resource_ref mr)
 {
   std::unique_ptr<scalar> result;
   switch (column.type().id()) {
diff --git a/cpp/src/search/contains_column.cu b/cpp/src/search/contains_column.cu
index b8c7d058535..8f05196a71c 100644
--- a/cpp/src/search/contains_column.cu
+++ b/cpp/src/search/contains_column.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@
 #include <cudf/table/table_view.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace detail {
@@ -33,7 +34,7 @@ struct contains_column_dispatch {
   std::unique_ptr<column> operator()(column_view const& haystack,
                                      column_view const& needles,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr) const
+                                     rmm::device_async_resource_ref mr) const
   {
     auto result_v = detail::contains(table_view{{haystack}},
                                      table_view{{needles}},
@@ -51,7 +52,7 @@ std::unique_ptr<column> contains_column_dispatch::operator()<dictionary32>(
   column_view const& haystack_in,
   column_view const& needles_in,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr) const
+  rmm::device_async_resource_ref mr) const
 {
   dictionary_column_view const haystack(haystack_in);
   dictionary_column_view const needles(needles_in);
@@ -79,7 +80,7 @@ std::unique_ptr<column> contains_column_dispatch::operator()<dictionary32>(
 std::unique_ptr<column> contains(column_view const& haystack,
                                  column_view const& needles,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   return cudf::type_dispatcher(
     haystack.type(), contains_column_dispatch{}, haystack, needles, stream, mr);
@@ -90,7 +91,7 @@ std::unique_ptr<column> contains(column_view const& haystack,
 std::unique_ptr<column> contains(column_view const& haystack,
                                  column_view const& needles,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::contains(haystack, needles, stream, mr);
diff --git a/cpp/src/search/contains_table.cu b/cpp/src/search/contains_table.cu
index f7b6d8fdb72..13417fdab63 100644
--- a/cpp/src/search/contains_table.cu
+++ b/cpp/src/search/contains_table.cu
@@ -25,6 +25,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuco/static_set.cuh>
 #include <cuda/functional>
@@ -187,7 +188,7 @@ rmm::device_uvector<bool> contains(table_view const& haystack,
                                    null_equality compare_nulls,
                                    nan_equality compare_nans,
                                    rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr)
+                                   rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(cudf::have_same_types(haystack, needles), "Column types mismatch");
 
diff --git a/cpp/src/search/search_ordered.cu b/cpp/src/search/search_ordered.cu
index 3b5dbef0401..328d3f0cee4 100644
--- a/cpp/src/search/search_ordered.cu
+++ b/cpp/src/search/search_ordered.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -25,6 +25,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/binary_search.h>
 
@@ -38,7 +39,7 @@ std::unique_ptr<column> search_ordered(table_view const& haystack,
                                        std::vector<order> const& column_order,
                                        std::vector<null_order> const& null_precedence,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr)
+                                       rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(
     column_order.empty() or static_cast<std::size_t>(haystack.num_columns()) == column_order.size(),
@@ -121,7 +122,7 @@ std::unique_ptr<column> lower_bound(table_view const& haystack,
                                     std::vector<order> const& column_order,
                                     std::vector<null_order> const& null_precedence,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr)
+                                    rmm::device_async_resource_ref mr)
 {
   return search_ordered(haystack, needles, true, column_order, null_precedence, stream, mr);
 }
@@ -131,7 +132,7 @@ std::unique_ptr<column> upper_bound(table_view const& haystack,
                                     std::vector<order> const& column_order,
                                     std::vector<null_order> const& null_precedence,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr)
+                                    rmm::device_async_resource_ref mr)
 {
   return search_ordered(haystack, needles, false, column_order, null_precedence, stream, mr);
 }
@@ -145,7 +146,7 @@ std::unique_ptr<column> lower_bound(table_view const& haystack,
                                     std::vector<order> const& column_order,
                                     std::vector<null_order> const& null_precedence,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr)
+                                    rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::lower_bound(haystack, needles, column_order, null_precedence, stream, mr);
@@ -156,7 +157,7 @@ std::unique_ptr<column> upper_bound(table_view const& haystack,
                                     std::vector<order> const& column_order,
                                     std::vector<null_order> const& null_precedence,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr)
+                                    rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::upper_bound(haystack, needles, column_order, null_precedence, stream, mr);
diff --git a/cpp/src/sort/rank.cu b/cpp/src/sort/rank.cu
index cbd0207c20e..c5dcc7c240d 100644
--- a/cpp/src/sort/rank.cu
+++ b/cpp/src/sort/rank.cu
@@ -30,6 +30,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <cuda/std/type_traits>
@@ -270,7 +271,7 @@ std::unique_ptr<column> rank(column_view const& input,
                              null_order null_precedence,
                              bool percentage,
                              rmm::cuda_stream_view stream,
-                             rmm::mr::device_memory_resource* mr)
+                             rmm::device_async_resource_ref mr)
 {
   data_type const output_type         = (percentage or method == rank_method::AVERAGE)
                                           ? data_type(type_id::FLOAT64)
@@ -373,7 +374,7 @@ std::unique_ptr<column> rank(column_view const& input,
                              null_order null_precedence,
                              bool percentage,
                              rmm::cuda_stream_view stream,
-                             rmm::mr::device_memory_resource* mr)
+                             rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::rank(
diff --git a/cpp/src/sort/segmented_sort.cu b/cpp/src/sort/segmented_sort.cu
index d9457341bd2..408ac29b8a9 100644
--- a/cpp/src/sort/segmented_sort.cu
+++ b/cpp/src/sort/segmented_sort.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,6 +23,7 @@
 
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/binary_search.h>
 #include <thrust/iterator/counting_iterator.h>
@@ -57,7 +58,7 @@ std::unique_ptr<column> segmented_sorted_order(table_view const& keys,
                                                std::vector<order> const& column_order,
                                                std::vector<null_order> const& null_precedence,
                                                rmm::cuda_stream_view stream,
-                                               rmm::mr::device_memory_resource* mr)
+                                               rmm::device_async_resource_ref mr)
 {
   return segmented_sorted_order_common<sort_method::UNSTABLE>(
     keys, segment_offsets, column_order, null_precedence, stream, mr);
@@ -69,7 +70,7 @@ std::unique_ptr<table> segmented_sort_by_key(table_view const& values,
                                              std::vector<order> const& column_order,
                                              std::vector<null_order> const& null_precedence,
                                              rmm::cuda_stream_view stream,
-                                             rmm::mr::device_memory_resource* mr)
+                                             rmm::device_async_resource_ref mr)
 {
   return segmented_sort_by_key_common<sort_method::UNSTABLE>(
     values, keys, segment_offsets, column_order, null_precedence, stream, mr);
@@ -82,7 +83,7 @@ std::unique_ptr<column> segmented_sorted_order(table_view const& keys,
                                                std::vector<order> const& column_order,
                                                std::vector<null_order> const& null_precedence,
                                                rmm::cuda_stream_view stream,
-                                               rmm::mr::device_memory_resource* mr)
+                                               rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::segmented_sorted_order(
@@ -95,7 +96,7 @@ std::unique_ptr<table> segmented_sort_by_key(table_view const& values,
                                              std::vector<order> const& column_order,
                                              std::vector<null_order> const& null_precedence,
                                              rmm::cuda_stream_view stream,
-                                             rmm::mr::device_memory_resource* mr)
+                                             rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::segmented_sort_by_key(
diff --git a/cpp/src/sort/segmented_sort_impl.cuh b/cpp/src/sort/segmented_sort_impl.cuh
index 796e178fecd..6d472925b30 100644
--- a/cpp/src/sort/segmented_sort_impl.cuh
+++ b/cpp/src/sort/segmented_sort_impl.cuh
@@ -27,6 +27,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cub/device/device_segmented_sort.cuh>
 
@@ -160,7 +161,7 @@ std::unique_ptr<column> fast_segmented_sorted_order(column_view const& input,
                                                     column_view const& segment_offsets,
                                                     order const& column_order,
                                                     rmm::cuda_stream_view stream,
-                                                    rmm::mr::device_memory_resource* mr)
+                                                    rmm::device_async_resource_ref mr)
 {
   // Unfortunately, CUB's segmented sort functions cannot accept iterators.
   // We have to build a pre-filled sequence of indices as input.
@@ -227,7 +228,7 @@ std::unique_ptr<column> segmented_sorted_order_common(
   std::vector<order> const& column_order,
   std::vector<null_order> const& null_precedence,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   if (keys.num_rows() == 0 || keys.num_columns() == 0) {
     return cudf::make_empty_column(type_to_id<size_type>());
@@ -304,7 +305,7 @@ std::unique_ptr<table> segmented_sort_by_key_common(table_view const& values,
                                                     std::vector<order> const& column_order,
                                                     std::vector<null_order> const& null_precedence,
                                                     rmm::cuda_stream_view stream,
-                                                    rmm::mr::device_memory_resource* mr)
+                                                    rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(values.num_rows() == keys.num_rows(),
                "Mismatch in number of rows for values and keys");
diff --git a/cpp/src/sort/sort.cu b/cpp/src/sort/sort.cu
index adffc06ab93..7216bc99e08 100644
--- a/cpp/src/sort/sort.cu
+++ b/cpp/src/sort/sort.cu
@@ -26,6 +26,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/functional.h>
 #include <thrust/sort.h>
@@ -36,7 +37,7 @@ std::unique_ptr<column> sorted_order(table_view const& input,
                                      std::vector<order> const& column_order,
                                      std::vector<null_order> const& null_precedence,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
 {
   return sorted_order<sort_method::UNSTABLE>(input, column_order, null_precedence, stream, mr);
 }
@@ -46,7 +47,7 @@ std::unique_ptr<table> sort_by_key(table_view const& values,
                                    std::vector<order> const& column_order,
                                    std::vector<null_order> const& null_precedence,
                                    rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr)
+                                   rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(values.num_rows() == keys.num_rows(),
                "Mismatch in number of rows for values and keys");
@@ -66,7 +67,7 @@ std::unique_ptr<table> sort(table_view const& input,
                             std::vector<order> const& column_order,
                             std::vector<null_order> const& null_precedence,
                             rmm::cuda_stream_view stream,
-                            rmm::mr::device_memory_resource* mr)
+                            rmm::device_async_resource_ref mr)
 {
   // fast-path sort conditions: single, non-floating-point, fixed-width column with no nulls
   if (inplace_column_sort_fn<sort_method::UNSTABLE>::is_usable(input)) {
@@ -88,7 +89,7 @@ std::unique_ptr<column> sorted_order(table_view const& input,
                                      std::vector<order> const& column_order,
                                      std::vector<null_order> const& null_precedence,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::sorted_order(input, column_order, null_precedence, stream, mr);
@@ -98,7 +99,7 @@ std::unique_ptr<table> sort(table_view const& input,
                             std::vector<order> const& column_order,
                             std::vector<null_order> const& null_precedence,
                             rmm::cuda_stream_view stream,
-                            rmm::mr::device_memory_resource* mr)
+                            rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::sort(input, column_order, null_precedence, stream, mr);
@@ -109,7 +110,7 @@ std::unique_ptr<table> sort_by_key(table_view const& values,
                                    std::vector<order> const& column_order,
                                    std::vector<null_order> const& null_precedence,
                                    rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr)
+                                   rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::sort_by_key(values, keys, column_order, null_precedence, stream, mr);
diff --git a/cpp/src/sort/sort_column.cu b/cpp/src/sort/sort_column.cu
index 7db44476988..99a45bf91a3 100644
--- a/cpp/src/sort/sort_column.cu
+++ b/cpp/src/sort/sort_column.cu
@@ -21,6 +21,8 @@
 #include <cudf/column/column_view.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 #include <thrust/sequence.h>
 
 namespace cudf {
@@ -28,14 +30,14 @@ namespace detail {
 
 /**
  * @copydoc
- * sorted_order(column_view&,order,null_order,rmm::cuda_stream_view,rmm::mr::device_memory_resource*)
+ * sorted_order(column_view&,order,null_order,rmm::cuda_stream_view,rmm::device_async_resource_ref)
  */
 template <>
 std::unique_ptr<column> sorted_order<sort_method::UNSTABLE>(column_view const& input,
                                                             order column_order,
                                                             null_order null_precedence,
                                                             rmm::cuda_stream_view stream,
-                                                            rmm::mr::device_memory_resource* mr)
+                                                            rmm::device_async_resource_ref mr)
 {
   auto sorted_indices = cudf::make_numeric_column(
     data_type(type_to_id<size_type>()), input.size(), mask_state::UNALLOCATED, stream, mr);
diff --git a/cpp/src/sort/sort_column_impl.cuh b/cpp/src/sort/sort_column_impl.cuh
index 7af24f22b67..564791e0b49 100644
--- a/cpp/src/sort/sort_column_impl.cuh
+++ b/cpp/src/sort/sort_column_impl.cuh
@@ -25,6 +25,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/sequence.h>
 #include <thrust/sort.h>
@@ -52,7 +53,7 @@ std::unique_ptr<column> sorted_order(column_view const& input,
                                      order column_order,
                                      null_order null_precedence,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr);
+                                     rmm::device_async_resource_ref mr);
 
 /**
  * @brief Comparator functor needed for single column sort.
diff --git a/cpp/src/sort/sort_impl.cuh b/cpp/src/sort/sort_impl.cuh
index e0331d65053..20e977e9fd5 100644
--- a/cpp/src/sort/sort_impl.cuh
+++ b/cpp/src/sort/sort_impl.cuh
@@ -21,12 +21,15 @@
 
 #include <cudf/column/column_factories.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 namespace cudf {
 namespace detail {
 
 /**
  * @copydoc
- * sorted_order(table_view&,std::vector<order>,std::vector<null_order>,rmm::mr::device_memory_resource*)
+ * sorted_order(table_view&,std::vector<order>,std::vector<null_order>,rmm::device_async_resource_ref
+ * )
  *
  * @tparam stable Whether to use stable sort
  * @param stream CUDA stream used for device memory operations and kernel launches
@@ -36,7 +39,7 @@ std::unique_ptr<column> sorted_order(table_view input,
                                      std::vector<order> const& column_order,
                                      std::vector<null_order> const& null_precedence,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
 {
   if (input.num_rows() == 0 or input.num_columns() == 0) {
     return cudf::make_numeric_column(
diff --git a/cpp/src/sort/stable_segmented_sort.cu b/cpp/src/sort/stable_segmented_sort.cu
index 4725d65e05d..61e37205c98 100644
--- a/cpp/src/sort/stable_segmented_sort.cu
+++ b/cpp/src/sort/stable_segmented_sort.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,8 @@
 #include <cudf/sorting.hpp>
 #include <cudf/utilities/default_stream.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 namespace cudf {
 namespace detail {
 
@@ -30,7 +32,7 @@ std::unique_ptr<column> stable_segmented_sorted_order(
   std::vector<order> const& column_order,
   std::vector<null_order> const& null_precedence,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   return segmented_sorted_order_common<sort_method::STABLE>(
     keys, segment_offsets, column_order, null_precedence, stream, mr);
@@ -42,7 +44,7 @@ std::unique_ptr<table> stable_segmented_sort_by_key(table_view const& values,
                                                     std::vector<order> const& column_order,
                                                     std::vector<null_order> const& null_precedence,
                                                     rmm::cuda_stream_view stream,
-                                                    rmm::mr::device_memory_resource* mr)
+                                                    rmm::device_async_resource_ref mr)
 {
   return segmented_sort_by_key_common<sort_method::STABLE>(
     values, keys, segment_offsets, column_order, null_precedence, stream, mr);
@@ -56,7 +58,7 @@ std::unique_ptr<column> stable_segmented_sorted_order(
   std::vector<order> const& column_order,
   std::vector<null_order> const& null_precedence,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::stable_segmented_sorted_order(
@@ -69,7 +71,7 @@ std::unique_ptr<table> stable_segmented_sort_by_key(table_view const& values,
                                                     std::vector<order> const& column_order,
                                                     std::vector<null_order> const& null_precedence,
                                                     rmm::cuda_stream_view stream,
-                                                    rmm::mr::device_memory_resource* mr)
+                                                    rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::stable_segmented_sort_by_key(
diff --git a/cpp/src/sort/stable_sort.cu b/cpp/src/sort/stable_sort.cu
index 0bfe2cfef16..ce05a755756 100644
--- a/cpp/src/sort/stable_sort.cu
+++ b/cpp/src/sort/stable_sort.cu
@@ -26,6 +26,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace detail {
@@ -33,7 +34,7 @@ std::unique_ptr<column> stable_sorted_order(table_view const& input,
                                             std::vector<order> const& column_order,
                                             std::vector<null_order> const& null_precedence,
                                             rmm::cuda_stream_view stream,
-                                            rmm::mr::device_memory_resource* mr)
+                                            rmm::device_async_resource_ref mr)
 {
   return sorted_order<sort_method::STABLE>(input, column_order, null_precedence, stream, mr);
 }
@@ -42,7 +43,7 @@ std::unique_ptr<table> stable_sort(table_view const& input,
                                    std::vector<order> const& column_order,
                                    std::vector<null_order> const& null_precedence,
                                    rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr)
+                                   rmm::device_async_resource_ref mr)
 {
   if (inplace_column_sort_fn<sort_method::STABLE>::is_usable(input)) {
     auto output = std::make_unique<column>(input.column(0), stream, mr);
@@ -62,7 +63,7 @@ std::unique_ptr<table> stable_sort_by_key(table_view const& values,
                                           std::vector<order> const& column_order,
                                           std::vector<null_order> const& null_precedence,
                                           rmm::cuda_stream_view stream,
-                                          rmm::mr::device_memory_resource* mr)
+                                          rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(values.num_rows() == keys.num_rows(),
                "Mismatch in number of rows for values and keys");
@@ -83,7 +84,7 @@ std::unique_ptr<column> stable_sorted_order(table_view const& input,
                                             std::vector<order> const& column_order,
                                             std::vector<null_order> const& null_precedence,
                                             rmm::cuda_stream_view stream,
-                                            rmm::mr::device_memory_resource* mr)
+                                            rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::stable_sorted_order(input, column_order, null_precedence, stream, mr);
@@ -93,7 +94,7 @@ std::unique_ptr<table> stable_sort(table_view const& input,
                                    std::vector<order> const& column_order,
                                    std::vector<null_order> const& null_precedence,
                                    rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr)
+                                   rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::stable_sort(input, column_order, null_precedence, stream, mr);
@@ -104,7 +105,7 @@ std::unique_ptr<table> stable_sort_by_key(table_view const& values,
                                           std::vector<order> const& column_order,
                                           std::vector<null_order> const& null_precedence,
                                           rmm::cuda_stream_view stream,
-                                          rmm::mr::device_memory_resource* mr)
+                                          rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::stable_sort_by_key(values, keys, column_order, null_precedence, stream, mr);
diff --git a/cpp/src/sort/stable_sort_column.cu b/cpp/src/sort/stable_sort_column.cu
index 25a6c92034a..bdb631a8154 100644
--- a/cpp/src/sort/stable_sort_column.cu
+++ b/cpp/src/sort/stable_sort_column.cu
@@ -21,6 +21,8 @@
 #include <cudf/column/column_view.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 #include <thrust/sequence.h>
 
 namespace cudf {
@@ -28,14 +30,14 @@ namespace detail {
 
 /**
  * @copydoc
- * sorted_order(column_view&,order,null_order,rmm::cuda_stream_view,rmm::mr::device_memory_resource*)
+ * sorted_order(column_view&,order,null_order,rmm::cuda_stream_view,rmm::device_async_resource_ref)
  */
 template <>
 std::unique_ptr<column> sorted_order<sort_method::STABLE>(column_view const& input,
                                                           order column_order,
                                                           null_order null_precedence,
                                                           rmm::cuda_stream_view stream,
-                                                          rmm::mr::device_memory_resource* mr)
+                                                          rmm::device_async_resource_ref mr)
 {
   auto sorted_indices = cudf::make_numeric_column(
     data_type(type_to_id<size_type>()), input.size(), mask_state::UNALLOCATED, stream, mr);
diff --git a/cpp/src/stream_compaction/apply_boolean_mask.cu b/cpp/src/stream_compaction/apply_boolean_mask.cu
index 8f707f6d15d..cdca9517d94 100644
--- a/cpp/src/stream_compaction/apply_boolean_mask.cu
+++ b/cpp/src/stream_compaction/apply_boolean_mask.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -27,6 +27,7 @@
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <algorithm>
 
@@ -65,7 +66,7 @@ namespace detail {
 std::unique_ptr<table> apply_boolean_mask(table_view const& input,
                                           column_view const& boolean_mask,
                                           rmm::cuda_stream_view stream,
-                                          rmm::mr::device_memory_resource* mr)
+                                          rmm::device_async_resource_ref mr)
 {
   if (boolean_mask.is_empty()) { return empty_like(input); }
 
@@ -90,7 +91,7 @@ std::unique_ptr<table> apply_boolean_mask(table_view const& input,
  */
 std::unique_ptr<table> apply_boolean_mask(table_view const& input,
                                           column_view const& boolean_mask,
-                                          rmm::mr::device_memory_resource* mr)
+                                          rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::apply_boolean_mask(input, boolean_mask, cudf::get_default_stream(), mr);
diff --git a/cpp/src/stream_compaction/distinct.cu b/cpp/src/stream_compaction/distinct.cu
index 11e2e77c253..a6f15cc49ec 100644
--- a/cpp/src/stream_compaction/distinct.cu
+++ b/cpp/src/stream_compaction/distinct.cu
@@ -26,6 +26,7 @@
 #include <cudf/types.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/copy.h>
@@ -44,7 +45,7 @@ rmm::device_uvector<size_type> distinct_indices(table_view const& input,
                                                 null_equality nulls_equal,
                                                 nan_equality nans_equal,
                                                 rmm::cuda_stream_view stream,
-                                                rmm::mr::device_memory_resource* mr)
+                                                rmm::device_async_resource_ref mr)
 {
   if (input.num_rows() == 0 or input.num_columns() == 0) {
     return rmm::device_uvector<size_type>(0, stream, mr);
@@ -145,7 +146,7 @@ std::unique_ptr<table> distinct(table_view const& input,
                                 null_equality nulls_equal,
                                 nan_equality nans_equal,
                                 rmm::cuda_stream_view stream,
-                                rmm::mr::device_memory_resource* mr)
+                                rmm::device_async_resource_ref mr)
 {
   if (input.num_rows() == 0 or input.num_columns() == 0 or keys.empty()) {
     return empty_like(input);
@@ -172,7 +173,7 @@ std::unique_ptr<table> distinct(table_view const& input,
                                 duplicate_keep_option keep,
                                 null_equality nulls_equal,
                                 nan_equality nans_equal,
-                                rmm::mr::device_memory_resource* mr)
+                                rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::distinct(
@@ -184,7 +185,7 @@ std::unique_ptr<column> distinct_indices(table_view const& input,
                                          null_equality nulls_equal,
                                          nan_equality nans_equal,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr)
+                                         rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   auto indices = detail::distinct_indices(input, keep, nulls_equal, nans_equal, stream, mr);
diff --git a/cpp/src/stream_compaction/distinct_helpers.cu b/cpp/src/stream_compaction/distinct_helpers.cu
index 8f36ec98f4a..13e89b15bb7 100644
--- a/cpp/src/stream_compaction/distinct_helpers.cu
+++ b/cpp/src/stream_compaction/distinct_helpers.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,6 +18,8 @@
 
 #include <cudf/detail/hash_reduce_by_row.cuh>
 
+#include <rmm/resource_ref.hpp>
+
 namespace cudf::detail {
 
 namespace {
@@ -88,7 +90,7 @@ rmm::device_uvector<size_type> reduce_by_row(
   null_equality nulls_equal,
   nan_equality nans_equal,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(keep != duplicate_keep_option::KEEP_ANY,
                "This function should not be called with KEEP_ANY");
diff --git a/cpp/src/stream_compaction/distinct_helpers.hpp b/cpp/src/stream_compaction/distinct_helpers.hpp
index b667d0b04f0..40f97e00ce5 100644
--- a/cpp/src/stream_compaction/distinct_helpers.hpp
+++ b/cpp/src/stream_compaction/distinct_helpers.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf::detail {
 
@@ -82,6 +83,6 @@ rmm::device_uvector<size_type> reduce_by_row(
   null_equality nulls_equal,
   nan_equality nans_equal,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr);
+  rmm::device_async_resource_ref mr);
 
 }  // namespace cudf::detail
diff --git a/cpp/src/stream_compaction/drop_nans.cu b/cpp/src/stream_compaction/drop_nans.cu
index a645b46f7a7..b46381c8ff6 100644
--- a/cpp/src/stream_compaction/drop_nans.cu
+++ b/cpp/src/stream_compaction/drop_nans.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -25,6 +25,7 @@
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/count.h>
 #include <thrust/execution_policy.h>
@@ -89,7 +90,7 @@ std::unique_ptr<table> drop_nans(table_view const& input,
                                  std::vector<size_type> const& keys,
                                  cudf::size_type keep_threshold,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   auto keys_view = input.select(keys);
   if (keys_view.num_columns() == 0 || keys_view.num_rows() == 0) {
@@ -116,7 +117,7 @@ std::unique_ptr<table> drop_nans(table_view const& input,
 std::unique_ptr<table> drop_nans(table_view const& input,
                                  std::vector<size_type> const& keys,
                                  cudf::size_type keep_threshold,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::drop_nans(input, keys, keep_threshold, cudf::get_default_stream(), mr);
@@ -126,7 +127,7 @@ std::unique_ptr<table> drop_nans(table_view const& input,
  */
 std::unique_ptr<table> drop_nans(table_view const& input,
                                  std::vector<size_type> const& keys,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::drop_nans(input, keys, keys.size(), cudf::get_default_stream(), mr);
diff --git a/cpp/src/stream_compaction/drop_nulls.cu b/cpp/src/stream_compaction/drop_nulls.cu
index 6ea1fd4c31f..cb7cd61bf02 100644
--- a/cpp/src/stream_compaction/drop_nulls.cu
+++ b/cpp/src/stream_compaction/drop_nulls.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,6 +24,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/count.h>
 #include <thrust/execution_policy.h>
@@ -68,7 +69,7 @@ std::unique_ptr<table> drop_nulls(table_view const& input,
                                   std::vector<size_type> const& keys,
                                   cudf::size_type keep_threshold,
                                   rmm::cuda_stream_view stream,
-                                  rmm::mr::device_memory_resource* mr)
+                                  rmm::device_async_resource_ref mr)
 {
   auto keys_view = input.select(keys);
   if (keys_view.num_columns() == 0 || keys_view.num_rows() == 0 || not cudf::has_nulls(keys_view)) {
@@ -89,7 +90,7 @@ std::unique_ptr<table> drop_nulls(table_view const& input,
 std::unique_ptr<table> drop_nulls(table_view const& input,
                                   std::vector<size_type> const& keys,
                                   cudf::size_type keep_threshold,
-                                  rmm::mr::device_memory_resource* mr)
+                                  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::drop_nulls(input, keys, keep_threshold, cudf::get_default_stream(), mr);
@@ -99,7 +100,7 @@ std::unique_ptr<table> drop_nulls(table_view const& input,
  */
 std::unique_ptr<table> drop_nulls(table_view const& input,
                                   std::vector<size_type> const& keys,
-                                  rmm::mr::device_memory_resource* mr)
+                                  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::drop_nulls(input, keys, keys.size(), cudf::get_default_stream(), mr);
diff --git a/cpp/src/stream_compaction/stable_distinct.cu b/cpp/src/stream_compaction/stable_distinct.cu
index 63167b45b2d..27b5a92ab69 100644
--- a/cpp/src/stream_compaction/stable_distinct.cu
+++ b/cpp/src/stream_compaction/stable_distinct.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,8 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/span.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 #include <thrust/iterator/constant_iterator.h>
 #include <thrust/scatter.h>
 #include <thrust/uninitialized_fill.h>
@@ -34,7 +36,7 @@ std::unique_ptr<table> stable_distinct(table_view const& input,
                                        null_equality nulls_equal,
                                        nan_equality nans_equal,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr)
+                                       rmm::device_async_resource_ref mr)
 {
   if (input.num_rows() == 0 or input.num_columns() == 0 or keys.empty()) {
     return empty_like(input);
@@ -77,7 +79,7 @@ std::unique_ptr<table> stable_distinct(table_view const& input,
                                        duplicate_keep_option keep,
                                        null_equality nulls_equal,
                                        nan_equality nans_equal,
-                                       rmm::mr::device_memory_resource* mr)
+                                       rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::stable_distinct(
diff --git a/cpp/src/stream_compaction/unique.cu b/cpp/src/stream_compaction/unique.cu
index db67daaa324..c1f8b17938c 100644
--- a/cpp/src/stream_compaction/unique.cu
+++ b/cpp/src/stream_compaction/unique.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -36,6 +36,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/copy.h>
 #include <thrust/distance.h>
@@ -52,7 +53,7 @@ std::unique_ptr<table> unique(table_view const& input,
                               duplicate_keep_option keep,
                               null_equality nulls_equal,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr)
+                              rmm::device_async_resource_ref mr)
 {
   // If keep is KEEP_ANY, just alias it to KEEP_FIRST.
   if (keep == duplicate_keep_option::KEEP_ANY) { keep = duplicate_keep_option::KEEP_FIRST; }
@@ -119,7 +120,7 @@ std::unique_ptr<table> unique(table_view const& input,
                               std::vector<size_type> const& keys,
                               duplicate_keep_option const keep,
                               null_equality nulls_equal,
-                              rmm::mr::device_memory_resource* mr)
+                              rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::unique(input, keys, keep, nulls_equal, cudf::get_default_stream(), mr);
diff --git a/cpp/src/strings/attributes.cu b/cpp/src/strings/attributes.cu
index 073ed74d8c9..778f546990d 100644
--- a/cpp/src/strings/attributes.cu
+++ b/cpp/src/strings/attributes.cu
@@ -30,6 +30,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cub/warp/warp_reduce.cuh>
 #include <cuda/functional>
@@ -75,7 +76,7 @@ template <typename UnaryFunction>
 std::unique_ptr<column> counts_fn(strings_column_view const& strings,
                                   UnaryFunction& ufn,
                                   rmm::cuda_stream_view stream,
-                                  rmm::mr::device_memory_resource* mr)
+                                  rmm::device_async_resource_ref mr)
 {
   // create output column
   auto results   = make_numeric_column(data_type{type_to_id<size_type>()},
@@ -136,7 +137,7 @@ CUDF_KERNEL void count_characters_parallel_fn(column_device_view const d_strings
 
 std::unique_ptr<column> count_characters_parallel(strings_column_view const& input,
                                                   rmm::cuda_stream_view stream,
-                                                  rmm::mr::device_memory_resource* mr)
+                                                  rmm::device_async_resource_ref mr)
 {
   // create output column
   auto results = make_numeric_column(data_type{type_to_id<size_type>()},
@@ -165,7 +166,7 @@ std::unique_ptr<column> count_characters_parallel(strings_column_view const& inp
 
 std::unique_ptr<column> count_characters(strings_column_view const& input,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr)
+                                         rmm::device_async_resource_ref mr)
 {
   if ((input.size() == input.null_count()) ||
       ((input.chars_size(stream) / (input.size() - input.null_count())) <
@@ -180,7 +181,7 @@ std::unique_ptr<column> count_characters(strings_column_view const& input,
 
 std::unique_ptr<column> count_bytes(strings_column_view const& input,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr)
+                                    rmm::device_async_resource_ref mr)
 {
   auto ufn = cuda::proclaim_return_type<size_type>(
     [] __device__(string_view const& d_str) { return d_str.size_bytes(); });
@@ -219,7 +220,7 @@ namespace detail {
 //
 std::unique_ptr<column> code_points(strings_column_view const& input,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr)
+                                    rmm::device_async_resource_ref mr)
 {
   auto strings_column = column_device_view::create(input.parent(), stream);
   auto d_column       = *strings_column;
@@ -263,21 +264,21 @@ std::unique_ptr<column> code_points(strings_column_view const& input,
 // external APIS
 
 std::unique_ptr<column> count_characters(strings_column_view const& input,
-                                         rmm::mr::device_memory_resource* mr)
+                                         rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::count_characters(input, cudf::get_default_stream(), mr);
 }
 
 std::unique_ptr<column> count_bytes(strings_column_view const& input,
-                                    rmm::mr::device_memory_resource* mr)
+                                    rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::count_bytes(input, cudf::get_default_stream(), mr);
 }
 
 std::unique_ptr<column> code_points(strings_column_view const& input,
-                                    rmm::mr::device_memory_resource* mr)
+                                    rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::code_points(input, cudf::get_default_stream(), mr);
diff --git a/cpp/src/strings/capitalize.cu b/cpp/src/strings/capitalize.cu
index 3889bd31b4d..2bb85bf2c5c 100644
--- a/cpp/src/strings/capitalize.cu
+++ b/cpp/src/strings/capitalize.cu
@@ -27,6 +27,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/pair.h>
@@ -227,7 +228,7 @@ template <typename CapitalFn>
 std::unique_ptr<column> capitalizer(CapitalFn cfn,
                                     strings_column_view const& input,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr)
+                                    rmm::device_async_resource_ref mr)
 {
   auto [offsets_column, chars] =
     cudf::strings::detail::make_strings_children(cfn, input.size(), stream, mr);
@@ -244,7 +245,7 @@ std::unique_ptr<column> capitalizer(CapitalFn cfn,
 std::unique_ptr<column> capitalize(strings_column_view const& input,
                                    string_scalar const& delimiters,
                                    rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr)
+                                   rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(delimiters.is_valid(stream), "Delimiter must be a valid string");
   if (input.is_empty()) return make_empty_column(type_id::STRING);
@@ -256,7 +257,7 @@ std::unique_ptr<column> capitalize(strings_column_view const& input,
 std::unique_ptr<column> title(strings_column_view const& input,
                               string_character_types sequence_type,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr)
+                              rmm::device_async_resource_ref mr)
 {
   if (input.is_empty()) return make_empty_column(type_id::STRING);
   auto d_column = column_device_view::create(input.parent(), stream);
@@ -265,7 +266,7 @@ std::unique_ptr<column> title(strings_column_view const& input,
 
 std::unique_ptr<column> is_title(strings_column_view const& input,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   if (input.is_empty()) return make_empty_column(type_id::BOOL8);
   auto results  = make_numeric_column(data_type{type_id::BOOL8},
@@ -289,7 +290,7 @@ std::unique_ptr<column> is_title(strings_column_view const& input,
 std::unique_ptr<column> capitalize(strings_column_view const& input,
                                    string_scalar const& delimiter,
                                    rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr)
+                                   rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::capitalize(input, delimiter, stream, mr);
@@ -298,7 +299,7 @@ std::unique_ptr<column> capitalize(strings_column_view const& input,
 std::unique_ptr<column> title(strings_column_view const& input,
                               string_character_types sequence_type,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr)
+                              rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::title(input, sequence_type, stream, mr);
@@ -306,7 +307,7 @@ std::unique_ptr<column> title(strings_column_view const& input,
 
 std::unique_ptr<column> is_title(strings_column_view const& input,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::is_title(input, stream, mr);
diff --git a/cpp/src/strings/case.cu b/cpp/src/strings/case.cu
index a7fd244f8a5..82b590f81b3 100644
--- a/cpp/src/strings/case.cu
+++ b/cpp/src/strings/case.cu
@@ -32,6 +32,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/atomic>
 #include <cuda/functional>
@@ -271,7 +272,7 @@ struct ascii_converter_fn {
 std::unique_ptr<column> convert_case(strings_column_view const& input,
                                      character_flags_table_type case_flag,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
 {
   if (input.size() == input.null_count()) {
     return std::make_unique<column>(input.parent(), stream, mr);
@@ -377,7 +378,7 @@ std::unique_ptr<column> convert_case(strings_column_view const& input,
 
 std::unique_ptr<column> to_lower(strings_column_view const& strings,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   character_flags_table_type case_flag = IS_UPPER(0xFF);  // convert only upper case characters
   return convert_case(strings, case_flag, stream, mr);
@@ -386,7 +387,7 @@ std::unique_ptr<column> to_lower(strings_column_view const& strings,
 //
 std::unique_ptr<column> to_upper(strings_column_view const& strings,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   character_flags_table_type case_flag = IS_LOWER(0xFF);  // convert only lower case characters
   return convert_case(strings, case_flag, stream, mr);
@@ -395,7 +396,7 @@ std::unique_ptr<column> to_upper(strings_column_view const& strings,
 //
 std::unique_ptr<column> swapcase(strings_column_view const& strings,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   // convert only upper or lower case characters
   character_flags_table_type case_flag = IS_LOWER(0xFF) | IS_UPPER(0xFF);
@@ -408,7 +409,7 @@ std::unique_ptr<column> swapcase(strings_column_view const& strings,
 
 std::unique_ptr<column> to_lower(strings_column_view const& strings,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::to_lower(strings, stream, mr);
@@ -416,7 +417,7 @@ std::unique_ptr<column> to_lower(strings_column_view const& strings,
 
 std::unique_ptr<column> to_upper(strings_column_view const& strings,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::to_upper(strings, stream, mr);
@@ -424,7 +425,7 @@ std::unique_ptr<column> to_upper(strings_column_view const& strings,
 
 std::unique_ptr<column> swapcase(strings_column_view const& strings,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::swapcase(strings, stream, mr);
diff --git a/cpp/src/strings/char_types/char_types.cu b/cpp/src/strings/char_types/char_types.cu
index b8c0dfd27e6..28068cf7e78 100644
--- a/cpp/src/strings/char_types/char_types.cu
+++ b/cpp/src/strings/char_types/char_types.cu
@@ -29,6 +29,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/transform.h>
@@ -87,7 +88,7 @@ std::unique_ptr<column> all_characters_of_type(strings_column_view const& input,
                                                string_character_types types,
                                                string_character_types verify_types,
                                                rmm::cuda_stream_view stream,
-                                               rmm::mr::device_memory_resource* mr)
+                                               rmm::device_async_resource_ref mr)
 {
   auto d_strings = column_device_view::create(input.parent(), stream);
 
@@ -175,7 +176,7 @@ std::unique_ptr<column> filter_characters_of_type(strings_column_view const& str
                                                   string_scalar const& replacement,
                                                   string_character_types types_to_keep,
                                                   rmm::cuda_stream_view stream,
-                                                  rmm::mr::device_memory_resource* mr)
+                                                  rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(replacement.is_valid(stream), "Parameter replacement must be valid");
   if (types_to_remove == ALL_TYPES)
@@ -219,7 +220,7 @@ std::unique_ptr<column> all_characters_of_type(strings_column_view const& input,
                                                string_character_types types,
                                                string_character_types verify_types,
                                                rmm::cuda_stream_view stream,
-                                               rmm::mr::device_memory_resource* mr)
+                                               rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::all_characters_of_type(input, types, verify_types, stream, mr);
@@ -230,7 +231,7 @@ std::unique_ptr<column> filter_characters_of_type(strings_column_view const& inp
                                                   string_scalar const& replacement,
                                                   string_character_types types_to_keep,
                                                   rmm::cuda_stream_view stream,
-                                                  rmm::mr::device_memory_resource* mr)
+                                                  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::filter_characters_of_type(
diff --git a/cpp/src/strings/combine/concatenate.cu b/cpp/src/strings/combine/concatenate.cu
index 14f530971f5..33d2de3cd07 100644
--- a/cpp/src/strings/combine/concatenate.cu
+++ b/cpp/src/strings/combine/concatenate.cu
@@ -32,6 +32,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/execution_policy.h>
 #include <thrust/iterator/counting_iterator.h>
@@ -122,7 +123,7 @@ std::unique_ptr<column> concatenate(table_view const& strings_columns,
                                     string_scalar const& narep,
                                     separator_on_nulls separate_nulls,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr)
+                                    rmm::device_async_resource_ref mr)
 {
   auto const num_columns = strings_columns.num_columns();
   CUDF_EXPECTS(num_columns > 1, "At least two columns must be specified");
@@ -206,7 +207,7 @@ std::unique_ptr<column> concatenate(table_view const& strings_columns,
                                     string_scalar const& col_narep,
                                     separator_on_nulls separate_nulls,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr)
+                                    rmm::device_async_resource_ref mr)
 {
   auto const num_columns = strings_columns.num_columns();
   CUDF_EXPECTS(num_columns > 0, "At least one column must be specified");
@@ -262,7 +263,7 @@ std::unique_ptr<column> concatenate(table_view const& strings_columns,
                                     string_scalar const& narep,
                                     separator_on_nulls separate_nulls,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr)
+                                    rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::concatenate(strings_columns, separator, narep, separate_nulls, stream, mr);
@@ -274,7 +275,7 @@ std::unique_ptr<column> concatenate(table_view const& strings_columns,
                                     string_scalar const& col_narep,
                                     separator_on_nulls separate_nulls,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr)
+                                    rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::concatenate(
diff --git a/cpp/src/strings/combine/join.cu b/cpp/src/strings/combine/join.cu
index c6290ceb6c2..d1d9afbb85f 100644
--- a/cpp/src/strings/combine/join.cu
+++ b/cpp/src/strings/combine/join.cu
@@ -32,6 +32,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/for_each.h>
 #include <thrust/iterator/counting_iterator.h>
@@ -131,7 +132,7 @@ std::unique_ptr<column> join_strings(strings_column_view const& input,
                                      string_scalar const& separator,
                                      string_scalar const& narep,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
 {
   if (input.is_empty()) { return make_empty_column(type_id::STRING); }
 
@@ -191,7 +192,7 @@ std::unique_ptr<column> join_strings(strings_column_view const& strings,
                                      string_scalar const& separator,
                                      string_scalar const& narep,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::join_strings(strings, separator, narep, stream, mr);
diff --git a/cpp/src/strings/combine/join_list_elements.cu b/cpp/src/strings/combine/join_list_elements.cu
index 170e621e05c..a54ea5263fe 100644
--- a/cpp/src/strings/combine/join_list_elements.cu
+++ b/cpp/src/strings/combine/join_list_elements.cu
@@ -30,6 +30,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 
@@ -178,7 +179,7 @@ std::unique_ptr<column> join_list_elements(lists_column_view const& lists_string
                                            separator_on_nulls separate_nulls,
                                            output_if_empty_list empty_list_policy,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(lists_strings_column.child().type().id() == type_id::STRING,
                "The input column must be a column of lists of strings");
@@ -251,7 +252,7 @@ std::unique_ptr<column> join_list_elements(lists_column_view const& lists_string
                                            separator_on_nulls separate_nulls,
                                            output_if_empty_list empty_list_policy,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(lists_strings_column.child().type().id() == type_id::STRING,
                "The input column must be a column of lists of strings");
@@ -302,7 +303,7 @@ std::unique_ptr<column> join_list_elements(lists_column_view const& lists_string
                                            separator_on_nulls separate_nulls,
                                            output_if_empty_list empty_list_policy,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::join_list_elements(
@@ -316,7 +317,7 @@ std::unique_ptr<column> join_list_elements(lists_column_view const& lists_string
                                            separator_on_nulls separate_nulls,
                                            output_if_empty_list empty_list_policy,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::join_list_elements(lists_strings_column,
diff --git a/cpp/src/strings/contains.cu b/cpp/src/strings/contains.cu
index 3f0ebc5962b..718ac41e36c 100644
--- a/cpp/src/strings/contains.cu
+++ b/cpp/src/strings/contains.cu
@@ -29,6 +29,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace strings {
@@ -60,7 +61,7 @@ std::unique_ptr<column> contains_impl(strings_column_view const& input,
                                       regex_program const& prog,
                                       bool const beginning_only,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr)
+                                      rmm::device_async_resource_ref mr)
 {
   auto results = make_numeric_column(data_type{type_id::BOOL8},
                                      input.size(),
@@ -88,7 +89,7 @@ std::unique_ptr<column> contains_impl(strings_column_view const& input,
 std::unique_ptr<column> contains_re(strings_column_view const& input,
                                     regex_program const& prog,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr)
+                                    rmm::device_async_resource_ref mr)
 {
   return contains_impl(input, prog, false, stream, mr);
 }
@@ -96,7 +97,7 @@ std::unique_ptr<column> contains_re(strings_column_view const& input,
 std::unique_ptr<column> matches_re(strings_column_view const& input,
                                    regex_program const& prog,
                                    rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr)
+                                   rmm::device_async_resource_ref mr)
 {
   return contains_impl(input, prog, true, stream, mr);
 }
@@ -104,7 +105,7 @@ std::unique_ptr<column> matches_re(strings_column_view const& input,
 std::unique_ptr<column> count_re(strings_column_view const& input,
                                  regex_program const& prog,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   // create device object from regex_program
   auto d_prog = regex_device_builder::create_prog_device(prog, stream);
@@ -126,7 +127,7 @@ std::unique_ptr<column> count_re(strings_column_view const& input,
 std::unique_ptr<column> contains_re(strings_column_view const& input,
                                     regex_program const& prog,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr)
+                                    rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::contains_re(input, prog, stream, mr);
@@ -135,7 +136,7 @@ std::unique_ptr<column> contains_re(strings_column_view const& input,
 std::unique_ptr<column> matches_re(strings_column_view const& input,
                                    regex_program const& prog,
                                    rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr)
+                                   rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::matches_re(input, prog, stream, mr);
@@ -144,7 +145,7 @@ std::unique_ptr<column> matches_re(strings_column_view const& input,
 std::unique_ptr<column> count_re(strings_column_view const& input,
                                  regex_program const& prog,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::count_re(input, prog, stream, mr);
diff --git a/cpp/src/strings/convert/convert_booleans.cu b/cpp/src/strings/convert/convert_booleans.cu
index d1de345a709..bf73800ad06 100644
--- a/cpp/src/strings/convert/convert_booleans.cu
+++ b/cpp/src/strings/convert/convert_booleans.cu
@@ -30,6 +30,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/for_each.h>
 #include <thrust/iterator/counting_iterator.h>
@@ -42,7 +43,7 @@ namespace detail {
 std::unique_ptr<column> to_booleans(strings_column_view const& input,
                                     string_scalar const& true_string,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr)
+                                    rmm::device_async_resource_ref mr)
 {
   size_type strings_count = input.size();
   if (strings_count == 0) {
@@ -85,7 +86,7 @@ std::unique_ptr<column> to_booleans(strings_column_view const& input,
 std::unique_ptr<column> to_booleans(strings_column_view const& input,
                                     string_scalar const& true_string,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr)
+                                    rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::to_booleans(input, true_string, stream, mr);
@@ -123,7 +124,7 @@ std::unique_ptr<column> from_booleans(column_view const& booleans,
                                       string_scalar const& true_string,
                                       string_scalar const& false_string,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr)
+                                      rmm::device_async_resource_ref mr)
 {
   size_type strings_count = booleans.size();
   if (strings_count == 0) return make_empty_column(type_id::STRING);
@@ -160,7 +161,7 @@ std::unique_ptr<column> from_booleans(column_view const& booleans,
                                       string_scalar const& true_string,
                                       string_scalar const& false_string,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr)
+                                      rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::from_booleans(booleans, true_string, false_string, stream, mr);
diff --git a/cpp/src/strings/convert/convert_datetime.cu b/cpp/src/strings/convert/convert_datetime.cu
index f54eb082959..d6449fbb6c8 100644
--- a/cpp/src/strings/convert/convert_datetime.cu
+++ b/cpp/src/strings/convert/convert_datetime.cu
@@ -34,6 +34,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/execution_policy.h>
 #include <thrust/for_each.h>
@@ -437,7 +438,7 @@ std::unique_ptr<cudf::column> to_timestamps(strings_column_view const& input,
                                             data_type timestamp_type,
                                             std::string_view format,
                                             rmm::cuda_stream_view stream,
-                                            rmm::mr::device_memory_resource* mr)
+                                            rmm::device_async_resource_ref mr)
 {
   if (input.is_empty())
     return make_empty_column(timestamp_type);  // make_timestamp_column(timestamp_type, 0);
@@ -675,7 +676,7 @@ struct check_datetime_format {
 std::unique_ptr<cudf::column> is_timestamp(strings_column_view const& input,
                                            std::string_view const& format,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
 {
   size_type strings_count = input.size();
   if (strings_count == 0) return make_empty_column(type_id::BOOL8);
@@ -711,7 +712,7 @@ std::unique_ptr<cudf::column> to_timestamps(strings_column_view const& input,
                                             data_type timestamp_type,
                                             std::string_view format,
                                             rmm::cuda_stream_view stream,
-                                            rmm::mr::device_memory_resource* mr)
+                                            rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::to_timestamps(input, timestamp_type, format, stream, mr);
@@ -720,7 +721,7 @@ std::unique_ptr<cudf::column> to_timestamps(strings_column_view const& input,
 std::unique_ptr<cudf::column> is_timestamp(strings_column_view const& input,
                                            std::string_view format,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::is_timestamp(input, format, stream, mr);
@@ -1106,7 +1107,7 @@ struct dispatch_from_timestamps_fn {
                               column_device_view const& d_format_names,
                               device_span<format_item const> d_format_items,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr) const
+                              rmm::device_async_resource_ref mr) const
   {
     return make_strings_children(
       datetime_formatter_fn<T>{d_timestamps, d_format_names, d_format_items},
@@ -1129,7 +1130,7 @@ std::unique_ptr<column> from_timestamps(column_view const& timestamps,
                                         std::string_view format,
                                         strings_column_view const& names,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr)
+                                        rmm::device_async_resource_ref mr)
 {
   if (timestamps.is_empty()) return make_empty_column(type_id::STRING);
 
@@ -1171,7 +1172,7 @@ std::unique_ptr<column> from_timestamps(column_view const& timestamps,
                                         std::string_view format,
                                         strings_column_view const& names,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr)
+                                        rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::from_timestamps(timestamps, format, names, stream, mr);
diff --git a/cpp/src/strings/convert/convert_durations.cu b/cpp/src/strings/convert/convert_durations.cu
index 8076c5c484b..77c750848cf 100644
--- a/cpp/src/strings/convert/convert_durations.cu
+++ b/cpp/src/strings/convert/convert_durations.cu
@@ -23,6 +23,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/execution_policy.h>
 #include <thrust/for_each.h>
@@ -400,7 +401,7 @@ struct dispatch_from_durations_fn {
   std::unique_ptr<column> operator()(column_view const& durations,
                                      std::string_view format,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr) const
+                                     rmm::device_async_resource_ref mr) const
   {
     CUDF_EXPECTS(!format.empty(), "Format parameter must not be empty.");
 
@@ -681,7 +682,7 @@ struct dispatch_to_durations_fn {
 std::unique_ptr<column> from_durations(column_view const& durations,
                                        std::string_view format,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr)
+                                       rmm::device_async_resource_ref mr)
 {
   size_type strings_count = durations.size();
   if (strings_count == 0) return make_empty_column(type_id::STRING);
@@ -694,7 +695,7 @@ std::unique_ptr<column> to_durations(strings_column_view const& input,
                                      data_type duration_type,
                                      std::string_view format,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
 {
   size_type strings_count = input.size();
   if (strings_count == 0) {
@@ -724,7 +725,7 @@ std::unique_ptr<column> to_durations(strings_column_view const& input,
 std::unique_ptr<column> from_durations(column_view const& durations,
                                        std::string_view format,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr)
+                                       rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::from_durations(durations, format, stream, mr);
@@ -734,7 +735,7 @@ std::unique_ptr<column> to_durations(strings_column_view const& input,
                                      data_type duration_type,
                                      std::string_view format,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::to_durations(input, duration_type, format, stream, mr);
diff --git a/cpp/src/strings/convert/convert_fixed_point.cu b/cpp/src/strings/convert/convert_fixed_point.cu
index fb8ebf55ef1..446baa8dea9 100644
--- a/cpp/src/strings/convert/convert_fixed_point.cu
+++ b/cpp/src/strings/convert/convert_fixed_point.cu
@@ -31,6 +31,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/std/climits>
 #include <cuda/std/limits>
@@ -133,7 +134,7 @@ struct dispatch_to_fixed_point_fn {
   std::unique_ptr<column> operator()(strings_column_view const& input,
                                      data_type output_type,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr) const
+                                     rmm::device_async_resource_ref mr) const
   {
     using DecimalType = device_storage_type_t<T>;
 
@@ -162,7 +163,7 @@ struct dispatch_to_fixed_point_fn {
   std::unique_ptr<column> operator()(strings_column_view const&,
                                      data_type,
                                      rmm::cuda_stream_view,
-                                     rmm::mr::device_memory_resource*) const
+                                     rmm::device_async_resource_ref) const
   {
     CUDF_FAIL("Output for to_fixed_point must be a decimal type.");
   }
@@ -174,7 +175,7 @@ struct dispatch_to_fixed_point_fn {
 std::unique_ptr<column> to_fixed_point(strings_column_view const& input,
                                        data_type output_type,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr)
+                                       rmm::device_async_resource_ref mr)
 {
   if (input.is_empty()) return make_empty_column(output_type);
   return type_dispatcher(output_type, dispatch_to_fixed_point_fn{}, input, output_type, stream, mr);
@@ -186,7 +187,7 @@ std::unique_ptr<column> to_fixed_point(strings_column_view const& input,
 std::unique_ptr<column> to_fixed_point(strings_column_view const& input,
                                        data_type output_type,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr)
+                                       rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::to_fixed_point(input, output_type, stream, mr);
@@ -237,7 +238,7 @@ struct dispatch_from_fixed_point_fn {
   template <typename T, std::enable_if_t<cudf::is_fixed_point<T>()>* = nullptr>
   std::unique_ptr<column> operator()(column_view const& input,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr) const
+                                     rmm::device_async_resource_ref mr) const
   {
     using DecimalType = device_storage_type_t<T>;  // underlying value type
 
@@ -256,7 +257,7 @@ struct dispatch_from_fixed_point_fn {
   template <typename T, std::enable_if_t<not cudf::is_fixed_point<T>()>* = nullptr>
   std::unique_ptr<column> operator()(column_view const&,
                                      rmm::cuda_stream_view,
-                                     rmm::mr::device_memory_resource*) const
+                                     rmm::device_async_resource_ref) const
   {
     CUDF_FAIL("Values for from_fixed_point function must be a decimal type.");
   }
@@ -266,7 +267,7 @@ struct dispatch_from_fixed_point_fn {
 
 std::unique_ptr<column> from_fixed_point(column_view const& input,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr)
+                                         rmm::device_async_resource_ref mr)
 {
   if (input.is_empty()) return make_empty_column(type_id::STRING);
   return type_dispatcher(input.type(), dispatch_from_fixed_point_fn{}, input, stream, mr);
@@ -278,7 +279,7 @@ std::unique_ptr<column> from_fixed_point(column_view const& input,
 
 std::unique_ptr<column> from_fixed_point(column_view const& input,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr)
+                                         rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::from_fixed_point(input, stream, mr);
@@ -292,7 +293,7 @@ struct dispatch_is_fixed_point_fn {
   std::unique_ptr<column> operator()(strings_column_view const& input,
                                      data_type decimal_type,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr) const
+                                     rmm::device_async_resource_ref mr) const
   {
     using DecimalType = device_storage_type_t<T>;
 
@@ -321,7 +322,7 @@ struct dispatch_is_fixed_point_fn {
   std::unique_ptr<column> operator()(strings_column_view const&,
                                      data_type,
                                      rmm::cuda_stream_view,
-                                     rmm::mr::device_memory_resource*) const
+                                     rmm::device_async_resource_ref) const
   {
     CUDF_FAIL("is_fixed_point is expecting a decimal type");
   }
@@ -332,7 +333,7 @@ struct dispatch_is_fixed_point_fn {
 std::unique_ptr<column> is_fixed_point(strings_column_view const& input,
                                        data_type decimal_type,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr)
+                                       rmm::device_async_resource_ref mr)
 {
   if (input.is_empty()) return cudf::make_empty_column(type_id::BOOL8);
   return type_dispatcher(
@@ -343,7 +344,7 @@ std::unique_ptr<column> is_fixed_point(strings_column_view const& input,
 std::unique_ptr<column> is_fixed_point(strings_column_view const& input,
                                        data_type decimal_type,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr)
+                                       rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::is_fixed_point(input, decimal_type, stream, mr);
diff --git a/cpp/src/strings/convert/convert_floats.cu b/cpp/src/strings/convert/convert_floats.cu
index df019ca236a..c6061f7d8e6 100644
--- a/cpp/src/strings/convert/convert_floats.cu
+++ b/cpp/src/strings/convert/convert_floats.cu
@@ -30,6 +30,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/for_each.h>
 #include <thrust/iterator/counting_iterator.h>
@@ -94,7 +95,7 @@ struct dispatch_to_floats_fn {
 std::unique_ptr<column> to_floats(strings_column_view const& input,
                                   data_type output_type,
                                   rmm::cuda_stream_view stream,
-                                  rmm::mr::device_memory_resource* mr)
+                                  rmm::device_async_resource_ref mr)
 {
   size_type strings_count = input.size();
   if (strings_count == 0) {
@@ -123,7 +124,7 @@ std::unique_ptr<column> to_floats(strings_column_view const& input,
 std::unique_ptr<column> to_floats(strings_column_view const& input,
                                   data_type output_type,
                                   rmm::cuda_stream_view stream,
-                                  rmm::mr::device_memory_resource* mr)
+                                  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::to_floats(input, output_type, stream, mr);
@@ -394,7 +395,7 @@ struct dispatch_from_floats_fn {
   template <typename FloatType, std::enable_if_t<std::is_floating_point_v<FloatType>>* = nullptr>
   std::unique_ptr<column> operator()(column_view const& floats,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr) const
+                                     rmm::device_async_resource_ref mr) const
   {
     size_type strings_count = floats.size();
     auto column             = column_device_view::create(floats, stream);
@@ -417,7 +418,7 @@ struct dispatch_from_floats_fn {
   template <typename T, std::enable_if_t<not std::is_floating_point_v<T>>* = nullptr>
   std::unique_ptr<column> operator()(column_view const&,
                                      rmm::cuda_stream_view,
-                                     rmm::mr::device_memory_resource*) const
+                                     rmm::device_async_resource_ref) const
   {
     CUDF_FAIL("Values for from_floats function must be a float type.");
   }
@@ -428,7 +429,7 @@ struct dispatch_from_floats_fn {
 // This will convert all float column types into a strings column.
 std::unique_ptr<column> from_floats(column_view const& floats,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr)
+                                    rmm::device_async_resource_ref mr)
 {
   size_type strings_count = floats.size();
   if (strings_count == 0) return make_empty_column(type_id::STRING);
@@ -441,7 +442,7 @@ std::unique_ptr<column> from_floats(column_view const& floats,
 // external API
 std::unique_ptr<column> from_floats(column_view const& floats,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr)
+                                    rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::from_floats(floats, stream, mr);
@@ -450,7 +451,7 @@ std::unique_ptr<column> from_floats(column_view const& floats,
 namespace detail {
 std::unique_ptr<column> is_float(strings_column_view const& input,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   auto strings_column = column_device_view::create(input.parent(), stream);
   auto d_column       = *strings_column;
@@ -480,7 +481,7 @@ std::unique_ptr<column> is_float(strings_column_view const& input,
 // external API
 std::unique_ptr<column> is_float(strings_column_view const& input,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::is_float(input, stream, mr);
diff --git a/cpp/src/strings/convert/convert_hex.cu b/cpp/src/strings/convert/convert_hex.cu
index 332bc9837c1..95af378fc3f 100644
--- a/cpp/src/strings/convert/convert_hex.cu
+++ b/cpp/src/strings/convert/convert_hex.cu
@@ -29,6 +29,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/distance.h>
 #include <thrust/execution_policy.h>
@@ -176,7 +177,7 @@ struct dispatch_integers_to_hex_fn {
             std::enable_if_t<cudf::is_integral_not_bool<IntegerType>()>* = nullptr>
   std::unique_ptr<column> operator()(column_view const& input,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr) const
+                                     rmm::device_async_resource_ref mr) const
   {
     auto const d_column = column_device_view::create(input, stream);
 
@@ -204,7 +205,7 @@ struct dispatch_integers_to_hex_fn {
 std::unique_ptr<column> hex_to_integers(strings_column_view const& strings,
                                         data_type output_type,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr)
+                                        rmm::device_async_resource_ref mr)
 {
   size_type strings_count = strings.size();
   if (strings_count == 0) return make_empty_column(output_type);
@@ -226,7 +227,7 @@ std::unique_ptr<column> hex_to_integers(strings_column_view const& strings,
 
 std::unique_ptr<column> is_hex(strings_column_view const& strings,
                                rmm::cuda_stream_view stream,
-                               rmm::mr::device_memory_resource* mr)
+                               rmm::device_async_resource_ref mr)
 {
   auto strings_column = column_device_view::create(strings.parent(), stream);
   auto d_column       = *strings_column;
@@ -264,7 +265,7 @@ std::unique_ptr<column> is_hex(strings_column_view const& strings,
 
 std::unique_ptr<column> integers_to_hex(column_view const& input,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr)
+                                        rmm::device_async_resource_ref mr)
 {
   if (input.is_empty()) { return cudf::make_empty_column(type_id::STRING); }
   return type_dispatcher(input.type(), dispatch_integers_to_hex_fn{}, input, stream, mr);
@@ -276,7 +277,7 @@ std::unique_ptr<column> integers_to_hex(column_view const& input,
 std::unique_ptr<column> hex_to_integers(strings_column_view const& strings,
                                         data_type output_type,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr)
+                                        rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::hex_to_integers(strings, output_type, stream, mr);
@@ -284,7 +285,7 @@ std::unique_ptr<column> hex_to_integers(strings_column_view const& strings,
 
 std::unique_ptr<column> is_hex(strings_column_view const& strings,
                                rmm::cuda_stream_view stream,
-                               rmm::mr::device_memory_resource* mr)
+                               rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::is_hex(strings, stream, mr);
@@ -292,7 +293,7 @@ std::unique_ptr<column> is_hex(strings_column_view const& strings,
 
 std::unique_ptr<column> integers_to_hex(column_view const& input,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr)
+                                        rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::integers_to_hex(input, stream, mr);
diff --git a/cpp/src/strings/convert/convert_integers.cu b/cpp/src/strings/convert/convert_integers.cu
index eb2e9c28134..f3e639817a6 100644
--- a/cpp/src/strings/convert/convert_integers.cu
+++ b/cpp/src/strings/convert/convert_integers.cu
@@ -31,6 +31,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/execution_policy.h>
 #include <thrust/for_each.h>
@@ -114,7 +115,7 @@ struct dispatch_is_integer_fn {
   template <typename T, std::enable_if_t<cudf::is_integral_not_bool<T>()>* = nullptr>
   std::unique_ptr<column> operator()(strings_column_view const& input,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr) const
+                                     rmm::device_async_resource_ref mr) const
   {
     auto const d_column = column_device_view::create(input.parent(), stream);
     auto results        = make_numeric_column(data_type{type_id::BOOL8},
@@ -148,7 +149,7 @@ struct dispatch_is_integer_fn {
   template <typename T, std::enable_if_t<not cudf::is_integral_not_bool<T>()>* = nullptr>
   std::unique_ptr<column> operator()(strings_column_view const&,
                                      rmm::cuda_stream_view,
-                                     rmm::mr::device_memory_resource*) const
+                                     rmm::device_async_resource_ref) const
   {
     CUDF_FAIL("is_integer is expecting an integer type");
   }
@@ -158,7 +159,7 @@ struct dispatch_is_integer_fn {
 
 std::unique_ptr<column> is_integer(strings_column_view const& input,
                                    rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr)
+                                   rmm::device_async_resource_ref mr)
 {
   auto const d_column = column_device_view::create(input.parent(), stream);
   auto results        = make_numeric_column(data_type{type_id::BOOL8},
@@ -193,7 +194,7 @@ std::unique_ptr<column> is_integer(strings_column_view const& input,
 std::unique_ptr<column> is_integer(strings_column_view const& input,
                                    data_type int_type,
                                    rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr)
+                                   rmm::device_async_resource_ref mr)
 {
   if (input.is_empty()) { return cudf::make_empty_column(type_id::BOOL8); }
   return type_dispatcher(int_type, dispatch_is_integer_fn{}, input, stream, mr);
@@ -204,7 +205,7 @@ std::unique_ptr<column> is_integer(strings_column_view const& input,
 // external APIs
 std::unique_ptr<column> is_integer(strings_column_view const& input,
                                    rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr)
+                                   rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::is_integer(input, stream, mr);
@@ -213,7 +214,7 @@ std::unique_ptr<column> is_integer(strings_column_view const& input,
 std::unique_ptr<column> is_integer(strings_column_view const& input,
                                    data_type int_type,
                                    rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr)
+                                   rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::is_integer(input, int_type, stream, mr);
@@ -271,7 +272,7 @@ struct dispatch_to_integers_fn {
 std::unique_ptr<column> to_integers(strings_column_view const& input,
                                     data_type output_type,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr)
+                                    rmm::device_async_resource_ref mr)
 {
   size_type strings_count = input.size();
   if (strings_count == 0) {
@@ -302,7 +303,7 @@ std::unique_ptr<column> to_integers(strings_column_view const& input,
 std::unique_ptr<column> to_integers(strings_column_view const& input,
                                     data_type output_type,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr)
+                                    rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::to_integers(input, output_type, stream, mr);
@@ -353,7 +354,7 @@ struct dispatch_from_integers_fn {
             std::enable_if_t<cudf::is_integral_not_bool<IntegerType>()>* = nullptr>
   std::unique_ptr<column> operator()(column_view const& integers,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr) const
+                                     rmm::device_async_resource_ref mr) const
   {
     size_type strings_count = integers.size();
     auto column             = column_device_view::create(integers, stream);
@@ -376,7 +377,7 @@ struct dispatch_from_integers_fn {
   template <typename T, std::enable_if_t<not cudf::is_integral_not_bool<T>()>* = nullptr>
   std::unique_ptr<column> operator()(column_view const&,
                                      rmm::cuda_stream_view,
-                                     rmm::mr::device_memory_resource*) const
+                                     rmm::device_async_resource_ref) const
   {
     CUDF_FAIL("Values for from_integers function must be an integer type.");
   }
@@ -386,7 +387,7 @@ struct dispatch_from_integers_fn {
 // This will convert all integer column types into a strings column.
 std::unique_ptr<column> from_integers(column_view const& integers,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr)
+                                      rmm::device_async_resource_ref mr)
 {
   size_type strings_count = integers.size();
   if (strings_count == 0) return make_empty_column(type_id::STRING);
@@ -399,7 +400,7 @@ std::unique_ptr<column> from_integers(column_view const& integers,
 // external API
 std::unique_ptr<column> from_integers(column_view const& integers,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr)
+                                      rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::from_integers(integers, stream, mr);
diff --git a/cpp/src/strings/convert/convert_ipv4.cu b/cpp/src/strings/convert/convert_ipv4.cu
index ce7f98067ef..3d259f0ab82 100644
--- a/cpp/src/strings/convert/convert_ipv4.cu
+++ b/cpp/src/strings/convert/convert_ipv4.cu
@@ -26,6 +26,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/transform.h>
@@ -74,7 +75,7 @@ struct ipv4_to_integers_fn {
 // Convert strings column of IPv4 addresses to integers column
 std::unique_ptr<column> ipv4_to_integers(strings_column_view const& input,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr)
+                                         rmm::device_async_resource_ref mr)
 {
   size_type strings_count = input.size();
   if (strings_count == 0) {
@@ -106,7 +107,7 @@ std::unique_ptr<column> ipv4_to_integers(strings_column_view const& input,
 // external API
 std::unique_ptr<column> ipv4_to_integers(strings_column_view const& input,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr)
+                                         rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::ipv4_to_integers(input, stream, mr);
@@ -159,7 +160,7 @@ struct integers_to_ipv4_fn {
 // Convert integers into IPv4 addresses
 std::unique_ptr<column> integers_to_ipv4(column_view const& integers,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr)
+                                         rmm::device_async_resource_ref mr)
 {
   if (integers.is_empty()) return make_empty_column(type_id::STRING);
 
@@ -178,7 +179,7 @@ std::unique_ptr<column> integers_to_ipv4(column_view const& integers,
 
 std::unique_ptr<column> is_ipv4(strings_column_view const& input,
                                 rmm::cuda_stream_view stream,
-                                rmm::mr::device_memory_resource* mr)
+                                rmm::device_async_resource_ref mr)
 {
   auto strings_column = column_device_view::create(input.parent(), stream);
   auto d_column       = *strings_column;
@@ -227,7 +228,7 @@ std::unique_ptr<column> is_ipv4(strings_column_view const& input,
 
 std::unique_ptr<column> integers_to_ipv4(column_view const& integers,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr)
+                                         rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::integers_to_ipv4(integers, stream, mr);
@@ -235,7 +236,7 @@ std::unique_ptr<column> integers_to_ipv4(column_view const& integers,
 
 std::unique_ptr<column> is_ipv4(strings_column_view const& input,
                                 rmm::cuda_stream_view stream,
-                                rmm::mr::device_memory_resource* mr)
+                                rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::is_ipv4(input, stream, mr);
diff --git a/cpp/src/strings/convert/convert_lists.cu b/cpp/src/strings/convert/convert_lists.cu
index d6c24b6981b..ed898bd6f72 100644
--- a/cpp/src/strings/convert/convert_lists.cu
+++ b/cpp/src/strings/convert/convert_lists.cu
@@ -23,6 +23,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace strings {
@@ -193,7 +194,7 @@ std::unique_ptr<column> format_list_column(lists_column_view const& input,
                                            string_scalar const& na_rep,
                                            strings_column_view const& separators,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
 {
   if (input.is_empty()) return make_empty_column(data_type{type_id::STRING});
 
@@ -234,7 +235,7 @@ std::unique_ptr<column> format_list_column(lists_column_view const& input,
                                            string_scalar const& na_rep,
                                            strings_column_view const& separators,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::format_list_column(input, na_rep, separators, stream, mr);
diff --git a/cpp/src/strings/convert/convert_urls.cu b/cpp/src/strings/convert/convert_urls.cu
index f5aeeb8d130..644ffbb4bd1 100644
--- a/cpp/src/strings/convert/convert_urls.cu
+++ b/cpp/src/strings/convert/convert_urls.cu
@@ -31,6 +31,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cub/cub.cuh>
 
@@ -125,7 +126,7 @@ struct url_encoder_fn {
 //
 std::unique_ptr<column> url_encode(strings_column_view const& input,
                                    rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr)
+                                   rmm::device_async_resource_ref mr)
 {
   if (input.is_empty()) return make_empty_column(type_id::STRING);
 
@@ -146,7 +147,7 @@ std::unique_ptr<column> url_encode(strings_column_view const& input,
 // external API
 std::unique_ptr<column> url_encode(strings_column_view const& input,
                                    rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr)
+                                   rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::url_encode(input, stream, mr);
@@ -369,7 +370,7 @@ CUDF_KERNEL void url_decode_char_replacer(column_device_view const in_strings,
 //
 std::unique_ptr<column> url_decode(strings_column_view const& strings,
                                    rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr)
+                                   rmm::device_async_resource_ref mr)
 {
   size_type strings_count = strings.size();
   if (strings_count == 0) return make_empty_column(type_id::STRING);
@@ -416,7 +417,7 @@ std::unique_ptr<column> url_decode(strings_column_view const& strings,
 
 std::unique_ptr<column> url_decode(strings_column_view const& input,
                                    rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr)
+                                   rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::url_decode(input, stream, mr);
diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu
index de7067f0bed..5daacbdc2fa 100644
--- a/cpp/src/strings/copying/concatenate.cu
+++ b/cpp/src/strings/copying/concatenate.cu
@@ -28,6 +28,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/advance.h>
 #include <thrust/binary_search.h>
@@ -203,7 +204,7 @@ CUDF_KERNEL void fused_concatenate_string_chars_kernel(column_device_view const*
 
 std::unique_ptr<column> concatenate(host_span<column_view const> columns,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr)
+                                    rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   // Compute output sizes
diff --git a/cpp/src/strings/copying/copy_range.cu b/cpp/src/strings/copying/copy_range.cu
index f4c86389534..9f8c47602f8 100644
--- a/cpp/src/strings/copying/copy_range.cu
+++ b/cpp/src/strings/copying/copy_range.cu
@@ -25,6 +25,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/for_each.h>
 #include <thrust/iterator/counting_iterator.h>
@@ -65,7 +66,7 @@ std::unique_ptr<column> copy_range(strings_column_view const& source,
                                    size_type source_end,
                                    size_type target_begin,
                                    rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr)
+                                   rmm::device_async_resource_ref mr)
 {
   auto target_end = target_begin + (source_end - source_begin);
   CUDF_EXPECTS(
diff --git a/cpp/src/strings/copying/copying.cu b/cpp/src/strings/copying/copying.cu
index 6f045fa7ea8..e8b411d50a6 100644
--- a/cpp/src/strings/copying/copying.cu
+++ b/cpp/src/strings/copying/copying.cu
@@ -25,6 +25,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/transform.h>
@@ -37,7 +38,7 @@ std::unique_ptr<cudf::column> copy_slice(strings_column_view const& input,
                                          size_type start,
                                          size_type end,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr)
+                                         rmm::device_async_resource_ref mr)
 {
   if (input.is_empty()) { return make_empty_column(type_id::STRING); }
   CUDF_EXPECTS(((start >= 0) && (start < end)), "Invalid start parameter value.");
diff --git a/cpp/src/strings/copying/shift.cu b/cpp/src/strings/copying/shift.cu
index 3a83cdab045..562ee6a7088 100644
--- a/cpp/src/strings/copying/shift.cu
+++ b/cpp/src/strings/copying/shift.cu
@@ -25,6 +25,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/transform.h>
@@ -92,7 +93,7 @@ std::unique_ptr<column> shift(strings_column_view const& input,
                               size_type offset,
                               scalar const& fill_value,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr)
+                              rmm::device_async_resource_ref mr)
 {
   auto d_fill_str = static_cast<string_scalar const&>(fill_value).value(stream);
 
diff --git a/cpp/src/strings/count_matches.cu b/cpp/src/strings/count_matches.cu
index 8a32a46cc2b..e8672ea5335 100644
--- a/cpp/src/strings/count_matches.cu
+++ b/cpp/src/strings/count_matches.cu
@@ -21,6 +21,8 @@
 #include <cudf/column/column_factories.hpp>
 #include <cudf/strings/string_view.cuh>
 
+#include <rmm/resource_ref.hpp>
+
 namespace cudf {
 namespace strings {
 namespace detail {
@@ -60,7 +62,7 @@ std::unique_ptr<column> count_matches(column_device_view const& d_strings,
                                       reprog_device& d_prog,
                                       size_type output_size,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr)
+                                      rmm::device_async_resource_ref mr)
 {
   assert(output_size >= d_strings.size() and "Unexpected output size");
 
diff --git a/cpp/src/strings/count_matches.hpp b/cpp/src/strings/count_matches.hpp
index a4f76c1c5e3..4a5efac37fd 100644
--- a/cpp/src/strings/count_matches.hpp
+++ b/cpp/src/strings/count_matches.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,6 +19,7 @@
 #include <cudf/column/column.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 
@@ -45,7 +46,7 @@ std::unique_ptr<column> count_matches(column_device_view const& d_strings,
                                       reprog_device& d_prog,
                                       size_type output_size,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr);
+                                      rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace strings
diff --git a/cpp/src/strings/extract/extract.cu b/cpp/src/strings/extract/extract.cu
index ffd4e03ea87..b18b50d1b43 100644
--- a/cpp/src/strings/extract/extract.cu
+++ b/cpp/src/strings/extract/extract.cu
@@ -29,6 +29,7 @@
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/execution_policy.h>
@@ -91,7 +92,7 @@ struct extract_fn {
 std::unique_ptr<table> extract(strings_column_view const& input,
                                regex_program const& prog,
                                rmm::cuda_stream_view stream,
-                               rmm::mr::device_memory_resource* mr)
+                               rmm::device_async_resource_ref mr)
 {
   // create device object from regex_program
   auto d_prog = regex_device_builder::create_prog_device(prog, stream);
@@ -135,7 +136,7 @@ std::unique_ptr<table> extract(strings_column_view const& input,
 std::unique_ptr<table> extract(strings_column_view const& input,
                                regex_program const& prog,
                                rmm::cuda_stream_view stream,
-                               rmm::mr::device_memory_resource* mr)
+                               rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::extract(input, prog, stream, mr);
diff --git a/cpp/src/strings/extract/extract_all.cu b/cpp/src/strings/extract/extract_all.cu
index 3a02acb7050..27691068d5a 100644
--- a/cpp/src/strings/extract/extract_all.cu
+++ b/cpp/src/strings/extract/extract_all.cu
@@ -30,6 +30,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/functional.h>
 #include <thrust/transform_scan.h>
@@ -104,7 +105,7 @@ struct extract_fn {
 std::unique_ptr<column> extract_all_record(strings_column_view const& input,
                                            regex_program const& prog,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
 {
   auto const strings_count = input.size();
   auto const d_strings     = column_device_view::create(input.parent(), stream);
@@ -164,7 +165,7 @@ std::unique_ptr<column> extract_all_record(strings_column_view const& input,
 std::unique_ptr<column> extract_all_record(strings_column_view const& input,
                                            regex_program const& prog,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::extract_all_record(input, prog, stream, mr);
diff --git a/cpp/src/strings/filling/fill.cu b/cpp/src/strings/filling/fill.cu
index 685c3eec744..b48d56a595c 100644
--- a/cpp/src/strings/filling/fill.cu
+++ b/cpp/src/strings/filling/fill.cu
@@ -24,6 +24,7 @@
 #include <cudf/utilities/error.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 
@@ -62,7 +63,7 @@ std::unique_ptr<column> fill(strings_column_view const& input,
                              size_type end,
                              string_scalar const& value,
                              rmm::cuda_stream_view stream,
-                             rmm::mr::device_memory_resource* mr)
+                             rmm::device_async_resource_ref mr)
 {
   auto const strings_count = input.size();
   if (strings_count == 0) { return make_empty_column(type_id::STRING); }
diff --git a/cpp/src/strings/filter_chars.cu b/cpp/src/strings/filter_chars.cu
index aaaa751c3f9..32717dac78d 100644
--- a/cpp/src/strings/filter_chars.cu
+++ b/cpp/src/strings/filter_chars.cu
@@ -31,6 +31,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/execution_policy.h>
 #include <thrust/find.h>
@@ -118,7 +119,7 @@ std::unique_ptr<column> filter_characters(
   filter_type keep_characters,
   string_scalar const& replacement,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   size_type strings_count = strings.size();
   if (strings_count == 0) return make_empty_column(type_id::STRING);
@@ -160,7 +161,7 @@ std::unique_ptr<column> filter_characters(
   filter_type keep_characters,
   string_scalar const& replacement,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::filter_characters(
diff --git a/cpp/src/strings/like.cu b/cpp/src/strings/like.cu
index 93e00592ef2..4df1b9b4ffe 100644
--- a/cpp/src/strings/like.cu
+++ b/cpp/src/strings/like.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,6 +24,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/constant_iterator.h>
 #include <thrust/iterator/counting_iterator.h>
@@ -123,7 +124,7 @@ std::unique_ptr<column> like(strings_column_view const& input,
                              PatternIterator const patterns_itr,
                              string_view const& d_escape,
                              rmm::cuda_stream_view stream,
-                             rmm::mr::device_memory_resource* mr)
+                             rmm::device_async_resource_ref mr)
 {
   auto results = make_numeric_column(data_type{type_id::BOOL8},
                                      input.size(),
@@ -151,7 +152,7 @@ std::unique_ptr<column> like(strings_column_view const& input,
                              string_scalar const& pattern,
                              string_scalar const& escape_character,
                              rmm::cuda_stream_view stream,
-                             rmm::mr::device_memory_resource* mr)
+                             rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(pattern.is_valid(stream), "Parameter pattern must be valid");
   CUDF_EXPECTS(escape_character.is_valid(stream), "Parameter escape_character must be valid");
@@ -166,7 +167,7 @@ std::unique_ptr<column> like(strings_column_view const& input,
                              strings_column_view const& patterns,
                              string_scalar const& escape_character,
                              rmm::cuda_stream_view stream,
-                             rmm::mr::device_memory_resource* mr)
+                             rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(patterns.size() == input.size(), "Number of patterns must match the input size");
   CUDF_EXPECTS(patterns.has_nulls() == false, "Parameter patterns must not contain nulls");
@@ -186,7 +187,7 @@ std::unique_ptr<column> like(strings_column_view const& input,
                              string_scalar const& pattern,
                              string_scalar const& escape_character,
                              rmm::cuda_stream_view stream,
-                             rmm::mr::device_memory_resource* mr)
+                             rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::like(input, pattern, escape_character, stream, mr);
@@ -196,7 +197,7 @@ std::unique_ptr<column> like(strings_column_view const& input,
                              strings_column_view const& patterns,
                              string_scalar const& escape_character,
                              rmm::cuda_stream_view stream,
-                             rmm::mr::device_memory_resource* mr)
+                             rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::like(input, patterns, escape_character, stream, mr);
diff --git a/cpp/src/strings/padding.cu b/cpp/src/strings/padding.cu
index 85d47af87f6..d8a3055772e 100644
--- a/cpp/src/strings/padding.cu
+++ b/cpp/src/strings/padding.cu
@@ -26,6 +26,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace strings {
@@ -103,7 +104,7 @@ std::unique_ptr<column> pad(strings_column_view const& input,
                             side_type side,
                             std::string_view fill_char,
                             rmm::cuda_stream_view stream,
-                            rmm::mr::device_memory_resource* mr)
+                            rmm::device_async_resource_ref mr)
 {
   if (input.is_empty()) return make_empty_column(type_id::STRING);
   CUDF_EXPECTS(!fill_char.empty(), "fill_char parameter must not be empty");
@@ -146,7 +147,7 @@ struct zfill_fn : base_fn<zfill_fn> {
 std::unique_ptr<column> zfill(strings_column_view const& input,
                               size_type width,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr)
+                              rmm::device_async_resource_ref mr)
 {
   if (input.is_empty()) return make_empty_column(type_id::STRING);
 
@@ -170,7 +171,7 @@ std::unique_ptr<column> pad(strings_column_view const& input,
                             side_type side,
                             std::string_view fill_char,
                             rmm::cuda_stream_view stream,
-                            rmm::mr::device_memory_resource* mr)
+                            rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::pad(input, width, side, fill_char, stream, mr);
@@ -179,7 +180,7 @@ std::unique_ptr<column> pad(strings_column_view const& input,
 std::unique_ptr<column> zfill(strings_column_view const& input,
                               size_type width,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr)
+                              rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::zfill(input, width, stream, mr);
diff --git a/cpp/src/strings/regex/utilities.cuh b/cpp/src/strings/regex/utilities.cuh
index ae8211ac916..cfe53937e66 100644
--- a/cpp/src/strings/regex/utilities.cuh
+++ b/cpp/src/strings/regex/utilities.cuh
@@ -25,6 +25,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/scan.h>
 
@@ -113,7 +114,7 @@ auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn,
                            reprog_device& d_prog,
                            size_type strings_count,
                            rmm::cuda_stream_view stream,
-                           rmm::mr::device_memory_resource* mr)
+                           rmm::device_async_resource_ref mr)
 {
   auto offsets = make_numeric_column(
     data_type{type_id::INT32}, strings_count + 1, mask_state::UNALLOCATED, stream, mr);
diff --git a/cpp/src/strings/repeat_strings.cu b/cpp/src/strings/repeat_strings.cu
index 690a72c098f..97168a7fbd7 100644
--- a/cpp/src/strings/repeat_strings.cu
+++ b/cpp/src/strings/repeat_strings.cu
@@ -28,6 +28,7 @@
 #include <cudf/utilities/error.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/for_each.h>
 #include <thrust/functional.h>
@@ -42,7 +43,7 @@ namespace detail {
 std::unique_ptr<string_scalar> repeat_string(string_scalar const& input,
                                              size_type repeat_times,
                                              rmm::cuda_stream_view stream,
-                                             rmm::mr::device_memory_resource* mr)
+                                             rmm::device_async_resource_ref mr)
 {
   if (!input.is_valid(stream)) { return std::make_unique<string_scalar>("", false, stream, mr); }
   if (input.size() == 0 || repeat_times <= 0) {
@@ -79,7 +80,7 @@ namespace {
 auto generate_empty_output(strings_column_view const& input,
                            size_type strings_count,
                            rmm::cuda_stream_view stream,
-                           rmm::mr::device_memory_resource* mr)
+                           rmm::device_async_resource_ref mr)
 {
   auto offsets_column = make_numeric_column(
     data_type{type_to_id<size_type>()}, strings_count + 1, mask_state::UNALLOCATED, stream, mr);
@@ -143,7 +144,7 @@ struct compute_size_and_repeat_fn {
 std::unique_ptr<column> repeat_strings(strings_column_view const& input,
                                        size_type repeat_times,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr)
+                                       rmm::device_async_resource_ref mr)
 {
   auto const strings_count = input.size();
   if (strings_count == 0) { return make_empty_column(type_id::STRING); }
@@ -220,7 +221,7 @@ struct compute_sizes_and_repeat_fn {
 std::unique_ptr<column> repeat_strings(strings_column_view const& input,
                                        column_view const& repeat_times,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr)
+                                       rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(input.size() == repeat_times.size(), "The input columns must have the same size.");
   CUDF_EXPECTS(cudf::is_index_type(repeat_times.type()),
@@ -256,7 +257,7 @@ std::unique_ptr<column> repeat_strings(strings_column_view const& input,
 std::unique_ptr<string_scalar> repeat_string(string_scalar const& input,
                                              size_type repeat_times,
                                              rmm::cuda_stream_view stream,
-                                             rmm::mr::device_memory_resource* mr)
+                                             rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::repeat_string(input, repeat_times, stream, mr);
@@ -265,7 +266,7 @@ std::unique_ptr<string_scalar> repeat_string(string_scalar const& input,
 std::unique_ptr<column> repeat_strings(strings_column_view const& input,
                                        size_type repeat_times,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr)
+                                       rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::repeat_strings(input, repeat_times, stream, mr);
@@ -274,7 +275,7 @@ std::unique_ptr<column> repeat_strings(strings_column_view const& input,
 std::unique_ptr<column> repeat_strings(strings_column_view const& input,
                                        column_view const& repeat_times,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr)
+                                       rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::repeat_strings(input, repeat_times, stream, mr);
diff --git a/cpp/src/strings/replace/backref_re.cu b/cpp/src/strings/replace/backref_re.cu
index 8e20db18f43..86afe4c8b9b 100644
--- a/cpp/src/strings/replace/backref_re.cu
+++ b/cpp/src/strings/replace/backref_re.cu
@@ -30,6 +30,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <regex>
 
@@ -105,7 +106,7 @@ std::unique_ptr<column> replace_with_backrefs(strings_column_view const& input,
                                               regex_program const& prog,
                                               std::string_view replacement,
                                               rmm::cuda_stream_view stream,
-                                              rmm::mr::device_memory_resource* mr)
+                                              rmm::device_async_resource_ref mr)
 {
   if (input.is_empty()) return make_empty_column(type_id::STRING);
 
@@ -148,7 +149,7 @@ std::unique_ptr<column> replace_with_backrefs(strings_column_view const& strings
                                               regex_program const& prog,
                                               std::string_view replacement,
                                               rmm::cuda_stream_view stream,
-                                              rmm::mr::device_memory_resource* mr)
+                                              rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::replace_with_backrefs(strings, prog, replacement, stream, mr);
diff --git a/cpp/src/strings/replace/find_replace.cu b/cpp/src/strings/replace/find_replace.cu
index 818bfa58427..79bf6e3c910 100644
--- a/cpp/src/strings/replace/find_replace.cu
+++ b/cpp/src/strings/replace/find_replace.cu
@@ -21,6 +21,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_scalar.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/execution_policy.h>
 #include <thrust/transform.h>
@@ -65,7 +66,7 @@ std::unique_ptr<cudf::column> find_and_replace_all(
   cudf::strings_column_view const& values_to_replace,
   cudf::strings_column_view const& replacement_values,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   auto d_input             = cudf::column_device_view::create(input.parent(), stream);
   auto d_values_to_replace = cudf::column_device_view::create(values_to_replace.parent(), stream);
diff --git a/cpp/src/strings/replace/multi.cu b/cpp/src/strings/replace/multi.cu
index c93add01f69..2eb03bd10a4 100644
--- a/cpp/src/strings/replace/multi.cu
+++ b/cpp/src/strings/replace/multi.cu
@@ -34,6 +34,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/binary_search.h>
@@ -284,7 +285,7 @@ std::unique_ptr<column> replace_character_parallel(strings_column_view const& in
                                                    strings_column_view const& targets,
                                                    strings_column_view const& repls,
                                                    rmm::cuda_stream_view stream,
-                                                   rmm::mr::device_memory_resource* mr)
+                                                   rmm::device_async_resource_ref mr)
 {
   auto d_strings = column_device_view::create(input.parent(), stream);
 
@@ -452,7 +453,7 @@ std::unique_ptr<column> replace_string_parallel(strings_column_view const& input
                                                 strings_column_view const& targets,
                                                 strings_column_view const& repls,
                                                 rmm::cuda_stream_view stream,
-                                                rmm::mr::device_memory_resource* mr)
+                                                rmm::device_async_resource_ref mr)
 {
   auto d_strings      = column_device_view::create(input.parent(), stream);
   auto d_targets      = column_device_view::create(targets.parent(), stream);
@@ -474,7 +475,7 @@ std::unique_ptr<column> replace(strings_column_view const& input,
                                 strings_column_view const& targets,
                                 strings_column_view const& repls,
                                 rmm::cuda_stream_view stream,
-                                rmm::mr::device_memory_resource* mr)
+                                rmm::device_async_resource_ref mr)
 {
   if (input.is_empty()) { return make_empty_column(type_id::STRING); }
   CUDF_EXPECTS(((targets.size() > 0) && (targets.null_count() == 0)),
@@ -499,7 +500,7 @@ std::unique_ptr<column> replace(strings_column_view const& strings,
                                 strings_column_view const& targets,
                                 strings_column_view const& repls,
                                 rmm::cuda_stream_view stream,
-                                rmm::mr::device_memory_resource* mr)
+                                rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::replace(strings, targets, repls, stream, mr);
diff --git a/cpp/src/strings/replace/multi_re.cu b/cpp/src/strings/replace/multi_re.cu
index 743e5894112..5172dba3fc3 100644
--- a/cpp/src/strings/replace/multi_re.cu
+++ b/cpp/src/strings/replace/multi_re.cu
@@ -31,6 +31,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/execution_policy.h>
 #include <thrust/fill.h>
@@ -140,7 +141,7 @@ std::unique_ptr<column> replace_re(strings_column_view const& input,
                                    strings_column_view const& replacements,
                                    regex_flags const flags,
                                    rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr)
+                                   rmm::device_async_resource_ref mr)
 {
   if (input.is_empty()) { return make_empty_column(type_id::STRING); }
   if (patterns.empty()) {  // if no patterns; just return a copy
@@ -207,7 +208,7 @@ std::unique_ptr<column> replace_re(strings_column_view const& strings,
                                    strings_column_view const& replacements,
                                    regex_flags const flags,
                                    rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr)
+                                   rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::replace_re(strings, patterns, replacements, flags, stream, mr);
diff --git a/cpp/src/strings/replace/replace.cu b/cpp/src/strings/replace/replace.cu
index 2c548f2f7cd..857bc7fb41c 100644
--- a/cpp/src/strings/replace/replace.cu
+++ b/cpp/src/strings/replace/replace.cu
@@ -32,6 +32,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/binary_search.h>
@@ -242,7 +243,7 @@ std::unique_ptr<column> replace_character_parallel(strings_column_view const& in
                                                    string_view const& d_replacement,
                                                    cudf::size_type maxrepl,
                                                    rmm::cuda_stream_view stream,
-                                                   rmm::mr::device_memory_resource* mr)
+                                                   rmm::device_async_resource_ref mr)
 {
   auto d_strings = column_device_view::create(input.parent(), stream);
 
@@ -393,7 +394,7 @@ std::unique_ptr<column> replace_string_parallel(strings_column_view const& input
                                                 string_view const& d_replacement,
                                                 cudf::size_type maxrepl,
                                                 rmm::cuda_stream_view stream,
-                                                rmm::mr::device_memory_resource* mr)
+                                                rmm::device_async_resource_ref mr)
 {
   auto d_strings = column_device_view::create(input.parent(), stream);
 
@@ -414,7 +415,7 @@ std::unique_ptr<column> replace(strings_column_view const& input,
                                 string_scalar const& repl,
                                 cudf::size_type maxrepl,
                                 rmm::cuda_stream_view stream,
-                                rmm::mr::device_memory_resource* mr)
+                                rmm::device_async_resource_ref mr)
 {
   if (input.is_empty()) { return make_empty_column(type_id::STRING); }
   if (maxrepl == 0) { return std::make_unique<cudf::column>(input.parent(), stream, mr); }
@@ -441,7 +442,7 @@ std::unique_ptr<column> replace(strings_column_view const& strings,
                                 string_scalar const& repl,
                                 cudf::size_type maxrepl,
                                 rmm::cuda_stream_view stream,
-                                rmm::mr::device_memory_resource* mr)
+                                rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::replace(strings, target, repl, maxrepl, stream, mr);
diff --git a/cpp/src/strings/replace/replace_nulls.cu b/cpp/src/strings/replace/replace_nulls.cu
index bbca4997f57..ffd9e6c2553 100644
--- a/cpp/src/strings/replace/replace_nulls.cu
+++ b/cpp/src/strings/replace/replace_nulls.cu
@@ -28,6 +28,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/for_each.h>
@@ -39,7 +40,7 @@ namespace detail {
 std::unique_ptr<column> replace_nulls(strings_column_view const& input,
                                       string_scalar const& repl,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr)
+                                      rmm::device_async_resource_ref mr)
 {
   size_type strings_count = input.size();
   if (strings_count == 0) { return make_empty_column(type_id::STRING); }
diff --git a/cpp/src/strings/replace/replace_re.cu b/cpp/src/strings/replace/replace_re.cu
index bded196946f..1290302340b 100644
--- a/cpp/src/strings/replace/replace_re.cu
+++ b/cpp/src/strings/replace/replace_re.cu
@@ -29,6 +29,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace strings {
@@ -102,7 +103,7 @@ std::unique_ptr<column> replace_re(strings_column_view const& input,
                                    string_scalar const& replacement,
                                    std::optional<size_type> max_replace_count,
                                    rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr)
+                                   rmm::device_async_resource_ref mr)
 {
   if (input.is_empty()) return make_empty_column(type_id::STRING);
 
@@ -135,7 +136,7 @@ std::unique_ptr<column> replace_re(strings_column_view const& strings,
                                    string_scalar const& replacement,
                                    std::optional<size_type> max_replace_count,
                                    rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr)
+                                   rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::replace_re(strings, prog, replacement, max_replace_count, stream, mr);
diff --git a/cpp/src/strings/replace/replace_slice.cu b/cpp/src/strings/replace/replace_slice.cu
index c11664c86d4..90540b39189 100644
--- a/cpp/src/strings/replace/replace_slice.cu
+++ b/cpp/src/strings/replace/replace_slice.cu
@@ -27,6 +27,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 
@@ -80,7 +81,7 @@ std::unique_ptr<column> replace_slice(strings_column_view const& input,
                                       size_type start,
                                       size_type stop,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr)
+                                      rmm::device_async_resource_ref mr)
 {
   if (input.is_empty()) { return make_empty_column(type_id::STRING); }
   CUDF_EXPECTS(repl.is_valid(stream), "Parameter repl must be valid.");
@@ -110,7 +111,7 @@ std::unique_ptr<column> replace_slice(strings_column_view const& input,
                                       size_type start,
                                       size_type stop,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr)
+                                      rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::replace_slice(input, repl, start, stop, stream, mr);
diff --git a/cpp/src/strings/reverse.cu b/cpp/src/strings/reverse.cu
index f9aec41b5e3..cbd231bc5f3 100644
--- a/cpp/src/strings/reverse.cu
+++ b/cpp/src/strings/reverse.cu
@@ -27,6 +27,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace strings {
@@ -58,7 +59,7 @@ struct reverse_characters_fn {
 
 std::unique_ptr<column> reverse(strings_column_view const& input,
                                 rmm::cuda_stream_view stream,
-                                rmm::mr::device_memory_resource* mr)
+                                rmm::device_async_resource_ref mr)
 {
   if (input.is_empty()) { return make_empty_column(type_id::STRING); }
 
@@ -81,7 +82,7 @@ std::unique_ptr<column> reverse(strings_column_view const& input,
 
 std::unique_ptr<column> reverse(strings_column_view const& input,
                                 rmm::cuda_stream_view stream,
-                                rmm::mr::device_memory_resource* mr)
+                                rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::reverse(input, stream, mr);
diff --git a/cpp/src/strings/scan/scan_inclusive.cu b/cpp/src/strings/scan/scan_inclusive.cu
index 0cf492fa295..b3e45f65a21 100644
--- a/cpp/src/strings/scan/scan_inclusive.cu
+++ b/cpp/src/strings/scan/scan_inclusive.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,6 +24,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/scan.h>
@@ -82,7 +83,7 @@ template <typename Op>
 std::unique_ptr<column> scan_inclusive(column_view const& input,
                                        bitmask_type const* mask,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr)
+                                       rmm::device_async_resource_ref mr)
 {
   auto d_input = column_device_view::create(input, stream);
 
@@ -120,12 +121,12 @@ std::unique_ptr<column> scan_inclusive(column_view const& input,
 template std::unique_ptr<column> scan_inclusive<DeviceMin>(column_view const& input,
                                                            bitmask_type const* mask,
                                                            rmm::cuda_stream_view stream,
-                                                           rmm::mr::device_memory_resource* mr);
+                                                           rmm::device_async_resource_ref mr);
 
 template std::unique_ptr<column> scan_inclusive<DeviceMax>(column_view const& input,
                                                            bitmask_type const* mask,
                                                            rmm::cuda_stream_view stream,
-                                                           rmm::mr::device_memory_resource* mr);
+                                                           rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace strings
diff --git a/cpp/src/strings/search/find.cu b/cpp/src/strings/search/find.cu
index 598d48157d9..bbd98c4e9ff 100644
--- a/cpp/src/strings/search/find.cu
+++ b/cpp/src/strings/search/find.cu
@@ -30,6 +30,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/atomic>
 #include <thrust/binary_search.h>
@@ -208,7 +209,7 @@ std::unique_ptr<column> find_fn(strings_column_view const& input,
                                 size_type start,
                                 size_type stop,
                                 rmm::cuda_stream_view stream,
-                                rmm::mr::device_memory_resource* mr)
+                                rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(target.is_valid(stream), "Parameter target must be valid.");
   CUDF_EXPECTS(start >= 0, "Parameter start must be positive integer or zero.");
@@ -252,7 +253,7 @@ std::unique_ptr<column> find(strings_column_view const& input,
                              size_type start,
                              size_type stop,
                              rmm::cuda_stream_view stream,
-                             rmm::mr::device_memory_resource* mr)
+                             rmm::device_async_resource_ref mr)
 {
   return find_fn<true>(input, target, start, stop, stream, mr);
 }
@@ -262,7 +263,7 @@ std::unique_ptr<column> rfind(strings_column_view const& input,
                               size_type start,
                               size_type stop,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr)
+                              rmm::device_async_resource_ref mr)
 {
   return find_fn<false>(input, target, start, stop, stream, mr);
 }
@@ -272,7 +273,7 @@ std::unique_ptr<column> find(strings_column_view const& input,
                              strings_column_view const& target,
                              size_type start,
                              rmm::cuda_stream_view stream,
-                             rmm::mr::device_memory_resource* mr)
+                             rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(start >= 0, "Parameter start must be positive integer or zero.");
   CUDF_EXPECTS(input.size() == target.size(), "input and target columns must be the same size");
@@ -305,7 +306,7 @@ std::unique_ptr<column> find(strings_column_view const& strings,
                              size_type start,
                              size_type stop,
                              rmm::cuda_stream_view stream,
-                             rmm::mr::device_memory_resource* mr)
+                             rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::find(strings, target, start, stop, stream, mr);
@@ -316,7 +317,7 @@ std::unique_ptr<column> rfind(strings_column_view const& strings,
                               size_type start,
                               size_type stop,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr)
+                              rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::rfind(strings, target, start, stop, stream, mr);
@@ -326,7 +327,7 @@ std::unique_ptr<column> find(strings_column_view const& input,
                              strings_column_view const& target,
                              size_type start,
                              rmm::cuda_stream_view stream,
-                             rmm::mr::device_memory_resource* mr)
+                             rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::find<true>(input, target, start, stream, mr);
@@ -375,7 +376,7 @@ CUDF_KERNEL void contains_warp_parallel_fn(column_device_view const d_strings,
 std::unique_ptr<column> contains_warp_parallel(strings_column_view const& input,
                                                string_scalar const& target,
                                                rmm::cuda_stream_view stream,
-                                               rmm::mr::device_memory_resource* mr)
+                                               rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(target.is_valid(stream), "Parameter target must be valid.");
   auto d_target = string_view(target.data(), target.size());
@@ -427,7 +428,7 @@ std::unique_ptr<column> contains_fn(strings_column_view const& strings,
                                     string_scalar const& target,
                                     BoolFunction pfn,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr)
+                                    rmm::device_async_resource_ref mr)
 {
   auto strings_count = strings.size();
   if (strings_count == 0) return make_empty_column(type_id::BOOL8);
@@ -488,7 +489,7 @@ std::unique_ptr<column> contains_fn(strings_column_view const& strings,
                                     strings_column_view const& targets,
                                     BoolFunction pfn,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr)
+                                    rmm::device_async_resource_ref mr)
 {
   if (strings.is_empty()) return make_empty_column(type_id::BOOL8);
 
@@ -533,7 +534,7 @@ std::unique_ptr<column> contains_fn(strings_column_view const& strings,
 std::unique_ptr<column> contains(strings_column_view const& input,
                                  string_scalar const& target,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   // use warp parallel when the average string width is greater than the threshold
   if ((input.null_count() < input.size()) &&
@@ -551,7 +552,7 @@ std::unique_ptr<column> contains(strings_column_view const& input,
 std::unique_ptr<column> contains(strings_column_view const& strings,
                                  strings_column_view const& targets,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   auto pfn = [] __device__(string_view d_string, string_view d_target) {
     return d_string.find(d_target) != string_view::npos;
@@ -562,7 +563,7 @@ std::unique_ptr<column> contains(strings_column_view const& strings,
 std::unique_ptr<column> starts_with(strings_column_view const& strings,
                                     string_scalar const& target,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr)
+                                    rmm::device_async_resource_ref mr)
 {
   auto pfn = [] __device__(string_view d_string, string_view d_target) {
     return (d_target.size_bytes() <= d_string.size_bytes()) &&
@@ -574,7 +575,7 @@ std::unique_ptr<column> starts_with(strings_column_view const& strings,
 std::unique_ptr<column> starts_with(strings_column_view const& strings,
                                     strings_column_view const& targets,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr)
+                                    rmm::device_async_resource_ref mr)
 {
   auto pfn = [] __device__(string_view d_string, string_view d_target) {
     return (d_target.size_bytes() <= d_string.size_bytes()) &&
@@ -586,7 +587,7 @@ std::unique_ptr<column> starts_with(strings_column_view const& strings,
 std::unique_ptr<column> ends_with(strings_column_view const& strings,
                                   string_scalar const& target,
                                   rmm::cuda_stream_view stream,
-                                  rmm::mr::device_memory_resource* mr)
+                                  rmm::device_async_resource_ref mr)
 {
   auto pfn = [] __device__(string_view d_string, string_view d_target) {
     auto const str_size = d_string.size_bytes();
@@ -601,7 +602,7 @@ std::unique_ptr<column> ends_with(strings_column_view const& strings,
 std::unique_ptr<column> ends_with(strings_column_view const& strings,
                                   strings_column_view const& targets,
                                   rmm::cuda_stream_view stream,
-                                  rmm::mr::device_memory_resource* mr)
+                                  rmm::device_async_resource_ref mr)
 {
   auto pfn = [] __device__(string_view d_string, string_view d_target) {
     auto const str_size = d_string.size_bytes();
@@ -620,7 +621,7 @@ std::unique_ptr<column> ends_with(strings_column_view const& strings,
 std::unique_ptr<column> contains(strings_column_view const& strings,
                                  string_scalar const& target,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::contains(strings, target, stream, mr);
@@ -629,7 +630,7 @@ std::unique_ptr<column> contains(strings_column_view const& strings,
 std::unique_ptr<column> contains(strings_column_view const& strings,
                                  strings_column_view const& targets,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::contains(strings, targets, stream, mr);
@@ -638,7 +639,7 @@ std::unique_ptr<column> contains(strings_column_view const& strings,
 std::unique_ptr<column> starts_with(strings_column_view const& strings,
                                     string_scalar const& target,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr)
+                                    rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::starts_with(strings, target, stream, mr);
@@ -647,7 +648,7 @@ std::unique_ptr<column> starts_with(strings_column_view const& strings,
 std::unique_ptr<column> starts_with(strings_column_view const& strings,
                                     strings_column_view const& targets,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr)
+                                    rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::starts_with(strings, targets, stream, mr);
@@ -656,7 +657,7 @@ std::unique_ptr<column> starts_with(strings_column_view const& strings,
 std::unique_ptr<column> ends_with(strings_column_view const& strings,
                                   string_scalar const& target,
                                   rmm::cuda_stream_view stream,
-                                  rmm::mr::device_memory_resource* mr)
+                                  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::ends_with(strings, target, stream, mr);
@@ -665,7 +666,7 @@ std::unique_ptr<column> ends_with(strings_column_view const& strings,
 std::unique_ptr<column> ends_with(strings_column_view const& strings,
                                   strings_column_view const& targets,
                                   rmm::cuda_stream_view stream,
-                                  rmm::mr::device_memory_resource* mr)
+                                  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::ends_with(strings, targets, stream, mr);
diff --git a/cpp/src/strings/search/find_multiple.cu b/cpp/src/strings/search/find_multiple.cu
index fcaec835f4d..223a941a88a 100644
--- a/cpp/src/strings/search/find_multiple.cu
+++ b/cpp/src/strings/search/find_multiple.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -27,6 +27,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/transform.h>
@@ -37,7 +38,7 @@ namespace detail {
 std::unique_ptr<column> find_multiple(strings_column_view const& input,
                                       strings_column_view const& targets,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr)
+                                      rmm::device_async_resource_ref mr)
 {
   auto const strings_count = input.size();
   auto const targets_count = targets.size();
@@ -89,7 +90,7 @@ std::unique_ptr<column> find_multiple(strings_column_view const& input,
 std::unique_ptr<column> find_multiple(strings_column_view const& input,
                                       strings_column_view const& targets,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr)
+                                      rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::find_multiple(input, targets, stream, mr);
diff --git a/cpp/src/strings/search/findall.cu b/cpp/src/strings/search/findall.cu
index 4e8e3a6a449..0d0962258cf 100644
--- a/cpp/src/strings/search/findall.cu
+++ b/cpp/src/strings/search/findall.cu
@@ -31,6 +31,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/pair.h>
 
@@ -78,7 +79,7 @@ std::unique_ptr<column> findall_util(column_device_view const& d_strings,
                                      int64_t total_matches,
                                      size_type const* d_offsets,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
 {
   rmm::device_uvector<string_index_pair> indices(total_matches, stream);
 
@@ -94,7 +95,7 @@ std::unique_ptr<column> findall_util(column_device_view const& d_strings,
 std::unique_ptr<column> findall(strings_column_view const& input,
                                 regex_program const& prog,
                                 rmm::cuda_stream_view stream,
-                                rmm::mr::device_memory_resource* mr)
+                                rmm::device_async_resource_ref mr)
 {
   auto const strings_count = input.size();
   auto const d_strings     = column_device_view::create(input.parent(), stream);
@@ -128,7 +129,7 @@ std::unique_ptr<column> findall(strings_column_view const& input,
 std::unique_ptr<column> findall(strings_column_view const& input,
                                 regex_program const& prog,
                                 rmm::cuda_stream_view stream,
-                                rmm::mr::device_memory_resource* mr)
+                                rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::findall(input, prog, stream, mr);
diff --git a/cpp/src/strings/slice.cu b/cpp/src/strings/slice.cu
index 98f3c9cae0d..d080065b330 100644
--- a/cpp/src/strings/slice.cu
+++ b/cpp/src/strings/slice.cu
@@ -28,6 +28,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/constant_iterator.h>
 #include <thrust/iterator/counting_iterator.h>
@@ -156,7 +157,7 @@ std::unique_ptr<column> compute_substrings_from_fn(column_device_view const& d_c
                                                    IndexIterator starts,
                                                    IndexIterator stops,
                                                    rmm::cuda_stream_view stream,
-                                                   rmm::mr::device_memory_resource* mr)
+                                                   rmm::device_async_resource_ref mr)
 {
   auto results = rmm::device_uvector<string_view>(d_column.size(), stream);
   thrust::transform(rmm::exec_policy(stream),
@@ -175,7 +176,7 @@ std::unique_ptr<column> slice_strings(strings_column_view const& strings,
                                       numeric_scalar<size_type> const& stop,
                                       numeric_scalar<size_type> const& step,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr)
+                                      rmm::device_async_resource_ref mr)
 {
   if (strings.is_empty()) return make_empty_column(type_id::STRING);
 
@@ -218,7 +219,7 @@ std::unique_ptr<column> slice_strings(strings_column_view const& strings,
                                       column_view const& starts_column,
                                       column_view const& stops_column,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr)
+                                      rmm::device_async_resource_ref mr)
 {
   size_type strings_count = strings.size();
   if (strings_count == 0) return make_empty_column(type_id::STRING);
@@ -249,7 +250,7 @@ std::unique_ptr<column> slice_strings(strings_column_view const& strings,
                                       numeric_scalar<size_type> const& stop,
                                       numeric_scalar<size_type> const& step,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr)
+                                      rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::slice_strings(strings, start, stop, step, stream, mr);
@@ -259,7 +260,7 @@ std::unique_ptr<column> slice_strings(strings_column_view const& strings,
                                       column_view const& starts_column,
                                       column_view const& stops_column,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr)
+                                      rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::slice_strings(strings, starts_column, stops_column, stream, mr);
diff --git a/cpp/src/strings/split/partition.cu b/cpp/src/strings/split/partition.cu
index 16e6402cfef..93d55c494fe 100644
--- a/cpp/src/strings/split/partition.cu
+++ b/cpp/src/strings/split/partition.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -27,6 +27,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/for_each.h>
 #include <thrust/iterator/counting_iterator.h>
@@ -184,7 +185,7 @@ struct rpartition_fn : public partition_fn {
 std::unique_ptr<table> partition(strings_column_view const& strings,
                                  string_scalar const& delimiter,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(delimiter.is_valid(stream), "Parameter delimiter must be valid");
   auto strings_count = strings.size();
@@ -211,7 +212,7 @@ std::unique_ptr<table> partition(strings_column_view const& strings,
 std::unique_ptr<table> rpartition(strings_column_view const& strings,
                                   string_scalar const& delimiter,
                                   rmm::cuda_stream_view stream,
-                                  rmm::mr::device_memory_resource* mr)
+                                  rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(delimiter.is_valid(stream), "Parameter delimiter must be valid");
   auto strings_count = strings.size();
@@ -242,7 +243,7 @@ std::unique_ptr<table> rpartition(strings_column_view const& strings,
 std::unique_ptr<table> partition(strings_column_view const& input,
                                  string_scalar const& delimiter,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::partition(input, delimiter, stream, mr);
@@ -251,7 +252,7 @@ std::unique_ptr<table> partition(strings_column_view const& input,
 std::unique_ptr<table> rpartition(strings_column_view const& input,
                                   string_scalar const& delimiter,
                                   rmm::cuda_stream_view stream,
-                                  rmm::mr::device_memory_resource* mr)
+                                  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::rpartition(input, delimiter, stream, mr);
diff --git a/cpp/src/strings/split/split.cu b/cpp/src/strings/split/split.cu
index 1416b293b75..2c6a0b2cf22 100644
--- a/cpp/src/strings/split/split.cu
+++ b/cpp/src/strings/split/split.cu
@@ -31,6 +31,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/fill.h>
@@ -111,7 +112,7 @@ template <typename Tokenizer>
 std::unique_ptr<table> split_fn(strings_column_view const& input,
                                 Tokenizer tokenizer,
                                 rmm::cuda_stream_view stream,
-                                rmm::mr::device_memory_resource* mr)
+                                rmm::device_async_resource_ref mr)
 {
   std::vector<std::unique_ptr<column>> results;
   if (input.size() == input.null_count()) {
@@ -329,7 +330,7 @@ template <typename Tokenizer>
 std::unique_ptr<table> whitespace_split_fn(size_type strings_count,
                                            Tokenizer tokenizer,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
 {
   // compute the number of tokens per string
   rmm::device_uvector<size_type> token_counts(strings_count, stream);
@@ -386,7 +387,7 @@ std::unique_ptr<table> whitespace_split_fn(size_type strings_count,
 std::unique_ptr<column> create_offsets_from_positions(strings_column_view const& input,
                                                       device_span<int64_t const> const& positions,
                                                       rmm::cuda_stream_view stream,
-                                                      rmm::mr::device_memory_resource* mr)
+                                                      rmm::device_async_resource_ref mr)
 {
   auto const d_offsets =
     cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset());
@@ -427,7 +428,7 @@ std::unique_ptr<table> split(strings_column_view const& strings_column,
                              string_scalar const& delimiter,
                              size_type maxsplit,
                              rmm::cuda_stream_view stream,
-                             rmm::mr::device_memory_resource* mr)
+                             rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(delimiter.is_valid(stream), "Parameter delimiter must be valid");
 
@@ -450,7 +451,7 @@ std::unique_ptr<table> rsplit(strings_column_view const& strings_column,
                               string_scalar const& delimiter,
                               size_type maxsplit,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr)
+                              rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(delimiter.is_valid(stream), "Parameter delimiter must be valid");
 
@@ -477,7 +478,7 @@ std::unique_ptr<table> split(strings_column_view const& strings_column,
                              string_scalar const& delimiter,
                              size_type maxsplit,
                              rmm::cuda_stream_view stream,
-                             rmm::mr::device_memory_resource* mr)
+                             rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::split(strings_column, delimiter, maxsplit, stream, mr);
@@ -487,7 +488,7 @@ std::unique_ptr<table> rsplit(strings_column_view const& strings_column,
                               string_scalar const& delimiter,
                               size_type maxsplit,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr)
+                              rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::rsplit(strings_column, delimiter, maxsplit, stream, mr);
diff --git a/cpp/src/strings/split/split.cuh b/cpp/src/strings/split/split.cuh
index 5f3c9372c39..160d1be3978 100644
--- a/cpp/src/strings/split/split.cuh
+++ b/cpp/src/strings/split/split.cuh
@@ -27,6 +27,7 @@
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/atomic>
 #include <thrust/binary_search.h>
@@ -294,7 +295,7 @@ struct rsplit_tokenizer_fn : base_split_tokenizer<rsplit_tokenizer_fn> {
 std::unique_ptr<column> create_offsets_from_positions(strings_column_view const& input,
                                                       device_span<int64_t const> const& positions,
                                                       rmm::cuda_stream_view stream,
-                                                      rmm::mr::device_memory_resource* mr);
+                                                      rmm::device_async_resource_ref mr);
 
 /**
  * @brief Helper function used by split/rsplit and split_record/rsplit_record
@@ -315,7 +316,7 @@ std::pair<std::unique_ptr<column>, rmm::device_uvector<string_index_pair>> split
   strings_column_view const& input,
   Tokenizer tokenizer,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   auto const strings_count = input.size();
   auto const chars_bytes =
diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu
index 16725fe006a..4dfb3e9ea62 100644
--- a/cpp/src/strings/split/split_re.cu
+++ b/cpp/src/strings/split/split_re.cu
@@ -29,6 +29,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/distance.h>
 #include <thrust/functional.h>
@@ -187,7 +188,7 @@ std::unique_ptr<table> split_re(strings_column_view const& input,
                                 split_direction direction,
                                 size_type maxsplit,
                                 rmm::cuda_stream_view stream,
-                                rmm::mr::device_memory_resource* mr)
+                                rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(!prog.pattern().empty(), "Parameter pattern must not be empty");
 
@@ -258,7 +259,7 @@ std::unique_ptr<column> split_record_re(strings_column_view const& input,
                                         split_direction direction,
                                         size_type maxsplit,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr)
+                                        rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(!prog.pattern().empty(), "Parameter pattern must not be empty");
 
@@ -298,7 +299,7 @@ std::unique_ptr<table> split_re(strings_column_view const& input,
                                 regex_program const& prog,
                                 size_type maxsplit,
                                 rmm::cuda_stream_view stream,
-                                rmm::mr::device_memory_resource* mr)
+                                rmm::device_async_resource_ref mr)
 {
   return split_re(input, prog, split_direction::FORWARD, maxsplit, stream, mr);
 }
@@ -307,7 +308,7 @@ std::unique_ptr<column> split_record_re(strings_column_view const& input,
                                         regex_program const& prog,
                                         size_type maxsplit,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr)
+                                        rmm::device_async_resource_ref mr)
 {
   return split_record_re(input, prog, split_direction::FORWARD, maxsplit, stream, mr);
 }
@@ -316,7 +317,7 @@ std::unique_ptr<table> rsplit_re(strings_column_view const& input,
                                  regex_program const& prog,
                                  size_type maxsplit,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   return split_re(input, prog, split_direction::BACKWARD, maxsplit, stream, mr);
 }
@@ -325,7 +326,7 @@ std::unique_ptr<column> rsplit_record_re(strings_column_view const& input,
                                          regex_program const& prog,
                                          size_type maxsplit,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr)
+                                         rmm::device_async_resource_ref mr)
 {
   return split_record_re(input, prog, split_direction::BACKWARD, maxsplit, stream, mr);
 }
@@ -338,7 +339,7 @@ std::unique_ptr<table> split_re(strings_column_view const& input,
                                 regex_program const& prog,
                                 size_type maxsplit,
                                 rmm::cuda_stream_view stream,
-                                rmm::mr::device_memory_resource* mr)
+                                rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::split_re(input, prog, maxsplit, stream, mr);
@@ -348,7 +349,7 @@ std::unique_ptr<column> split_record_re(strings_column_view const& input,
                                         regex_program const& prog,
                                         size_type maxsplit,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr)
+                                        rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::split_record_re(input, prog, maxsplit, stream, mr);
@@ -358,7 +359,7 @@ std::unique_ptr<table> rsplit_re(strings_column_view const& input,
                                  regex_program const& prog,
                                  size_type maxsplit,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::rsplit_re(input, prog, maxsplit, stream, mr);
@@ -368,7 +369,7 @@ std::unique_ptr<column> rsplit_record_re(strings_column_view const& input,
                                          regex_program const& prog,
                                          size_type maxsplit,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr)
+                                         rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::rsplit_record_re(input, prog, maxsplit, stream, mr);
diff --git a/cpp/src/strings/split/split_record.cu b/cpp/src/strings/split/split_record.cu
index 0971069592e..3e8be750b9e 100644
--- a/cpp/src/strings/split/split_record.cu
+++ b/cpp/src/strings/split/split_record.cu
@@ -29,6 +29,7 @@
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/for_each.h>
@@ -46,7 +47,7 @@ template <typename Tokenizer>
 std::unique_ptr<column> split_record_fn(strings_column_view const& input,
                                         Tokenizer tokenizer,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr)
+                                        rmm::device_async_resource_ref mr)
 {
   if (input.is_empty()) {
     return cudf::lists::detail::make_empty_lists_column(data_type{type_id::STRING}, stream, mr);
@@ -142,7 +143,7 @@ template <typename TokenReader>
 std::unique_ptr<column> whitespace_split_record_fn(strings_column_view const& input,
                                                    TokenReader reader,
                                                    rmm::cuda_stream_view stream,
-                                                   rmm::mr::device_memory_resource* mr)
+                                                   rmm::device_async_resource_ref mr)
 {
   // create offsets column by counting the number of tokens per string
   auto sizes_itr = cudf::detail::make_counting_transform_iterator(
@@ -176,7 +177,7 @@ std::unique_ptr<column> split_record(strings_column_view const& strings,
                                      string_scalar const& delimiter,
                                      size_type maxsplit,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(delimiter.is_valid(stream), "Parameter delimiter must be valid");
 
@@ -210,7 +211,7 @@ std::unique_ptr<column> split_record(strings_column_view const& strings,
                                      string_scalar const& delimiter,
                                      size_type maxsplit,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::split_record<detail::Direction::FORWARD>(strings, delimiter, maxsplit, stream, mr);
@@ -220,7 +221,7 @@ std::unique_ptr<column> rsplit_record(strings_column_view const& strings,
                                       string_scalar const& delimiter,
                                       size_type maxsplit,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr)
+                                      rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::split_record<detail::Direction::BACKWARD>(
diff --git a/cpp/src/strings/strings_column_factories.cu b/cpp/src/strings/strings_column_factories.cu
index 0f1b9e3baae..a298285f841 100644
--- a/cpp/src/strings/strings_column_factories.cu
+++ b/cpp/src/strings/strings_column_factories.cu
@@ -25,6 +25,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/transform_iterator.h>
 #include <thrust/pair.h>
@@ -49,7 +50,7 @@ struct string_view_to_pair {
 std::unique_ptr<column> make_strings_column(
   device_span<thrust::pair<char const*, size_type> const> strings,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
 
@@ -59,7 +60,7 @@ std::unique_ptr<column> make_strings_column(
 std::unique_ptr<column> make_strings_column(device_span<string_view const> string_views,
                                             string_view null_placeholder,
                                             rmm::cuda_stream_view stream,
-                                            rmm::mr::device_memory_resource* mr)
+                                            rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
 
diff --git a/cpp/src/strings/strings_scalar_factories.cpp b/cpp/src/strings/strings_scalar_factories.cpp
index 9c7f905cb0b..233fee14694 100644
--- a/cpp/src/strings/strings_scalar_factories.cpp
+++ b/cpp/src/strings/strings_scalar_factories.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,12 +17,13 @@
 #include <cudf/scalar/scalar.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 // Create a strings-type column from array of pointer/size pairs
 std::unique_ptr<scalar> make_string_scalar(std::string const& string,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
 {
   auto s = new string_scalar(string, true, stream, mr);
   return std::unique_ptr<scalar>(s);
diff --git a/cpp/src/strings/strip.cu b/cpp/src/strings/strip.cu
index 26df76850f7..639097abe63 100644
--- a/cpp/src/strings/strip.cu
+++ b/cpp/src/strings/strip.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -26,6 +26,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace strings {
@@ -60,7 +61,7 @@ std::unique_ptr<column> strip(strings_column_view const& input,
                               side_type side,
                               string_scalar const& to_strip,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr)
+                              rmm::device_async_resource_ref mr)
 {
   if (input.is_empty()) return make_empty_column(type_id::STRING);
 
@@ -87,7 +88,7 @@ std::unique_ptr<column> strip(strings_column_view const& input,
                               side_type side,
                               string_scalar const& to_strip,
                               rmm::cuda_stream_view stream,
-                              rmm::mr::device_memory_resource* mr)
+                              rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::strip(input, side, to_strip, stream, mr);
diff --git a/cpp/src/strings/translate.cu b/cpp/src/strings/translate.cu
index a8603f47226..fcf55429e09 100644
--- a/cpp/src/strings/translate.cu
+++ b/cpp/src/strings/translate.cu
@@ -28,6 +28,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/binary_search.h>
 #include <thrust/execution_policy.h>
@@ -89,7 +90,7 @@ struct translate_fn {
 std::unique_ptr<column> translate(strings_column_view const& strings,
                                   std::vector<std::pair<char_utf8, char_utf8>> const& chars_table,
                                   rmm::cuda_stream_view stream,
-                                  rmm::mr::device_memory_resource* mr)
+                                  rmm::device_async_resource_ref mr)
 {
   if (strings.is_empty()) return make_empty_column(type_id::STRING);
 
@@ -127,7 +128,7 @@ std::unique_ptr<column> translate(strings_column_view const& strings,
 std::unique_ptr<column> translate(strings_column_view const& input,
                                   std::vector<std::pair<uint32_t, uint32_t>> const& chars_table,
                                   rmm::cuda_stream_view stream,
-                                  rmm::mr::device_memory_resource* mr)
+                                  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::translate(input, chars_table, stream, mr);
diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu
index c83f827f290..18e726a6d7d 100644
--- a/cpp/src/strings/utilities.cu
+++ b/cpp/src/strings/utilities.cu
@@ -28,6 +28,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/transform.h>
@@ -45,7 +46,7 @@ namespace detail {
 rmm::device_uvector<string_view> create_string_vector_from_column(
   cudf::strings_column_view const input,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   auto d_strings = column_device_view::create(input.parent(), stream);
 
@@ -75,7 +76,7 @@ rmm::device_uvector<string_view> create_string_vector_from_column(
 std::unique_ptr<column> create_offsets_child_column(int64_t chars_bytes,
                                                     size_type count,
                                                     rmm::cuda_stream_view stream,
-                                                    rmm::mr::device_memory_resource* mr)
+                                                    rmm::device_async_resource_ref mr)
 {
   auto const threshold = get_offset64_threshold();
   if (!is_large_strings_enabled()) {
diff --git a/cpp/src/strings/wrap.cu b/cpp/src/strings/wrap.cu
index 0b3b6e78f82..dff1891c3cc 100644
--- a/cpp/src/strings/wrap.cu
+++ b/cpp/src/strings/wrap.cu
@@ -29,6 +29,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/for_each.h>
 #include <thrust/iterator/counting_iterator.h>
@@ -95,7 +96,7 @@ template <typename device_execute_functor>
 std::unique_ptr<column> wrap(strings_column_view const& strings,
                              size_type width,
                              rmm::cuda_stream_view stream,
-                             rmm::mr::device_memory_resource* mr)
+                             rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(width > 0, "Positive wrap width required");
 
@@ -139,7 +140,7 @@ std::unique_ptr<column> wrap(strings_column_view const& strings,
 std::unique_ptr<column> wrap(strings_column_view const& strings,
                              size_type width,
                              rmm::cuda_stream_view stream,
-                             rmm::mr::device_memory_resource* mr)
+                             rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::wrap<detail::execute_wrap>(strings, width, stream, mr);
diff --git a/cpp/src/structs/copying/concatenate.cu b/cpp/src/structs/copying/concatenate.cu
index e010ad9dd41..2ccf071711a 100644
--- a/cpp/src/structs/copying/concatenate.cu
+++ b/cpp/src/structs/copying/concatenate.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -25,6 +25,7 @@
 #include <cudf/structs/structs_column_view.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <algorithm>
 #include <memory>
@@ -39,7 +40,7 @@ namespace detail {
  */
 std::unique_ptr<column> concatenate(host_span<column_view const> columns,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr)
+                                    rmm::device_async_resource_ref mr)
 {
   // get ordered children
   auto ordered_children = extract_ordered_struct_children(columns, stream);
diff --git a/cpp/src/structs/scan/scan_inclusive.cu b/cpp/src/structs/scan/scan_inclusive.cu
index 410a7d9348e..a6ccea5fca1 100644
--- a/cpp/src/structs/scan/scan_inclusive.cu
+++ b/cpp/src/structs/scan/scan_inclusive.cu
@@ -24,6 +24,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/scan.h>
@@ -40,7 +41,7 @@ namespace {
 template <typename Op>
 std::unique_ptr<column> scan_inclusive(column_view const& input,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr)
+                                       rmm::device_async_resource_ref mr)
 {
   // Create a gather map containing indices of the prefix min/max elements.
   auto gather_map = rmm::device_uvector<size_type>(input.size(), stream);
@@ -78,11 +79,11 @@ std::unique_ptr<column> scan_inclusive(column_view const& input,
 
 template std::unique_ptr<column> scan_inclusive<DeviceMin>(column_view const& input_view,
                                                            rmm::cuda_stream_view stream,
-                                                           rmm::mr::device_memory_resource* mr);
+                                                           rmm::device_async_resource_ref mr);
 
 template std::unique_ptr<column> scan_inclusive<DeviceMax>(column_view const& input_view,
                                                            rmm::cuda_stream_view stream,
-                                                           rmm::mr::device_memory_resource* mr);
+                                                           rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace structs
diff --git a/cpp/src/structs/structs_column_factories.cu b/cpp/src/structs/structs_column_factories.cu
index d94a33ce9fb..bbe2bb96fde 100644
--- a/cpp/src/structs/structs_column_factories.cu
+++ b/cpp/src/structs/structs_column_factories.cu
@@ -19,6 +19,7 @@
 #include <cudf/types.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 
@@ -33,7 +34,7 @@ std::unique_ptr<cudf::column> make_structs_column(
   size_type null_count,
   rmm::device_buffer&& null_mask,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(null_count <= 0 || !null_mask.is_empty(),
                "Struct column with nulls must be nullable.");
diff --git a/cpp/src/structs/utilities.cpp b/cpp/src/structs/utilities.cpp
index f47d066852c..81806c92e23 100644
--- a/cpp/src/structs/utilities.cpp
+++ b/cpp/src/structs/utilities.cpp
@@ -28,6 +28,8 @@
 #include <cudf/utilities/span.hpp>
 #include <cudf/utilities/traits.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
 
@@ -92,7 +94,7 @@ struct table_flattener {
   std::vector<null_order> const& null_precedence;
   column_nullability nullability;
   rmm::cuda_stream_view stream;
-  rmm::mr::device_memory_resource* mr;
+  rmm::device_async_resource_ref mr;
 
   temporary_nullable_data nullable_data;
   std::vector<std::unique_ptr<column>> validity_as_column;
@@ -105,7 +107,7 @@ struct table_flattener {
                   std::vector<null_order> const& null_precedence,
                   column_nullability nullability,
                   rmm::cuda_stream_view stream,
-                  rmm::mr::device_memory_resource* mr)
+                  rmm::device_async_resource_ref mr)
     : column_order{column_order},
       null_precedence{null_precedence},
       nullability{nullability},
@@ -202,7 +204,7 @@ std::unique_ptr<flattened_table> flatten_nested_columns(
   std::vector<null_order> const& null_precedence,
   column_nullability nullability,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   auto const has_struct = std::any_of(input.begin(), input.end(), is_struct);
   if (not has_struct) {
@@ -228,7 +230,7 @@ std::unique_ptr<column> superimpose_nulls_no_sanitize(bitmask_type const* null_m
                                                       size_type null_count,
                                                       std::unique_ptr<column>&& input,
                                                       rmm::cuda_stream_view stream,
-                                                      rmm::mr::device_memory_resource* mr)
+                                                      rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   if (input->type().id() == cudf::type_id::EMPTY) {
@@ -280,7 +282,7 @@ std::unique_ptr<column> superimpose_nulls_no_sanitize(bitmask_type const* null_m
  * @copydoc cudf::structs::detail::push_down_nulls
  */
 std::pair<column_view, temporary_nullable_data> push_down_nulls_no_sanitize(
-  column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr)
+  column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)
 {
   auto ret_nullable_data = temporary_nullable_data{};
   if (input.type().id() != type_id::STRUCT) {
@@ -371,7 +373,7 @@ std::unique_ptr<column> superimpose_nulls(bitmask_type const* null_mask,
                                           size_type null_count,
                                           std::unique_ptr<column>&& input,
                                           rmm::cuda_stream_view stream,
-                                          rmm::mr::device_memory_resource* mr)
+                                          rmm::device_async_resource_ref mr)
 {
   input = superimpose_nulls_no_sanitize(null_mask, null_count, std::move(input), stream, mr);
 
@@ -389,7 +391,7 @@ std::unique_ptr<column> superimpose_nulls(bitmask_type const* null_mask,
 
 std::pair<column_view, temporary_nullable_data> push_down_nulls(column_view const& input,
                                                                 rmm::cuda_stream_view stream,
-                                                                rmm::mr::device_memory_resource* mr)
+                                                                rmm::device_async_resource_ref mr)
 {
   auto output = push_down_nulls_no_sanitize(input, stream, mr);
 
@@ -410,7 +412,7 @@ std::pair<column_view, temporary_nullable_data> push_down_nulls(column_view cons
 
 std::pair<table_view, temporary_nullable_data> push_down_nulls(table_view const& table,
                                                                rmm::cuda_stream_view stream,
-                                                               rmm::mr::device_memory_resource* mr)
+                                                               rmm::device_async_resource_ref mr)
 {
   auto processed_columns = std::vector<column_view>{};
   auto nullable_data     = temporary_nullable_data{};
diff --git a/cpp/src/table/row_operators.cu b/cpp/src/table/row_operators.cu
index 71b437cb47d..13c31e8ae4c 100644
--- a/cpp/src/table/row_operators.cu
+++ b/cpp/src/table/row_operators.cu
@@ -31,6 +31,7 @@
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/transform_iterator.h>
 
@@ -414,7 +415,7 @@ auto replace_child(column_view const& input,
                    column_view const& new_child,
                    std::vector<std::unique_ptr<column>>& out_cols,
                    rmm::cuda_stream_view stream,
-                   rmm::mr::device_memory_resource* mr)
+                   rmm::device_async_resource_ref mr)
 {
   auto const make_output = [&input](auto const& offsets_cv, auto const& child_cv) {
     return column_view{data_type{type_id::LIST},
@@ -463,7 +464,7 @@ auto replace_child(column_view const& input,
 auto compute_ranks(column_view const& input,
                    null_order column_null_order,
                    rmm::cuda_stream_view stream,
-                   rmm::mr::device_memory_resource* mr)
+                   rmm::device_async_resource_ref mr)
 {
   return cudf::detail::rank(input,
                             rank_method::DENSE,
@@ -496,7 +497,7 @@ std::pair<column_view, std::vector<std::unique_ptr<column>>> transform_lists_of_
   column_view const& input,
   null_order column_null_order,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   std::vector<std::unique_ptr<column>> out_cols;
 
@@ -563,7 +564,7 @@ transform_lists_of_structs(column_view const& lhs,
                            column_view const& rhs,
                            null_order column_null_order,
                            rmm::cuda_stream_view stream,
-                           rmm::mr::device_memory_resource* mr)
+                           rmm::device_async_resource_ref mr)
 {
   std::vector<std::unique_ptr<column>> out_cols_lhs;
   std::vector<std::unique_ptr<column>> out_cols_rhs;
diff --git a/cpp/src/table/table.cpp b/cpp/src/table/table.cpp
index 7e9ed4270c7..9dac7be5efe 100644
--- a/cpp/src/table/table.cpp
+++ b/cpp/src/table/table.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,11 +20,12 @@
 #include <cudf/utilities/error.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 
 // Copy the columns from another table
-table::table(table const& other, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr)
+table::table(table const& other, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)
   : _num_rows{other.num_rows()}
 {
   CUDF_FUNC_RANGE();
@@ -51,7 +52,7 @@ table::table(std::vector<std::unique_ptr<column>>&& columns) : _columns{std::mov
 }
 
 // Copy the contents of a `table_view`
-table::table(table_view view, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr)
+table::table(table_view view, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)
   : _num_rows{view.num_rows()}
 {
   CUDF_FUNC_RANGE();
diff --git a/cpp/src/text/bpe/byte_pair_encoding.cu b/cpp/src/text/bpe/byte_pair_encoding.cu
index 363e15d74c1..e196eee275f 100644
--- a/cpp/src/text/bpe/byte_pair_encoding.cu
+++ b/cpp/src/text/bpe/byte_pair_encoding.cu
@@ -34,6 +34,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/copy.h>
 #include <thrust/distance.h>
@@ -341,7 +342,7 @@ std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const
                                                  bpe_merge_pairs const& merge_pairs,
                                                  cudf::string_scalar const& separator,
                                                  rmm::cuda_stream_view stream,
-                                                 rmm::mr::device_memory_resource* mr)
+                                                 rmm::device_async_resource_ref mr)
 {
   if (input.is_empty() || input.chars_size(stream) == 0) {
     return cudf::make_empty_column(cudf::type_id::STRING);
@@ -458,7 +459,7 @@ std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const
 std::unique_ptr<cudf::column> byte_pair_encoding(cudf::strings_column_view const& input,
                                                  bpe_merge_pairs const& merges_table,
                                                  cudf::string_scalar const& separator,
-                                                 rmm::mr::device_memory_resource* mr)
+                                                 rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::byte_pair_encoding(input, merges_table, separator, cudf::get_default_stream(), mr);
diff --git a/cpp/src/text/bpe/load_merge_pairs.cu b/cpp/src/text/bpe/load_merge_pairs.cu
index 1658f20182b..f34c5c4f7f6 100644
--- a/cpp/src/text/bpe/load_merge_pairs.cu
+++ b/cpp/src/text/bpe/load_merge_pairs.cu
@@ -28,6 +28,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 
@@ -99,7 +100,7 @@ std::unique_ptr<bpe_merge_pairs::bpe_merge_pairs_impl> create_bpe_merge_pairs_im
 std::unique_ptr<bpe_merge_pairs::bpe_merge_pairs_impl> create_bpe_merge_pairs_impl(
   cudf::strings_column_view const& input,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   auto pairs   = cudf::strings::split_record(input, cudf::string_scalar(" "), 1, stream, mr);
   auto content = pairs->release();
@@ -110,7 +111,7 @@ std::unique_ptr<bpe_merge_pairs::bpe_merge_pairs_impl> create_bpe_merge_pairs_im
 
 std::unique_ptr<bpe_merge_pairs> load_merge_pairs(cudf::strings_column_view const& merge_pairs,
                                                   rmm::cuda_stream_view stream,
-                                                  rmm::mr::device_memory_resource* mr)
+                                                  rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(!merge_pairs.is_empty(), "Merge pairs must not be empty");
   CUDF_EXPECTS(!merge_pairs.has_nulls(), "Merge pairs may not contain nulls");
@@ -121,7 +122,7 @@ std::unique_ptr<bpe_merge_pairs> load_merge_pairs(cudf::strings_column_view cons
 
 std::unique_ptr<bpe_merge_pairs> load_merge_pairs(cudf::strings_column_view const& merge_pairs,
                                                   rmm::cuda_stream_view stream,
-                                                  rmm::mr::device_memory_resource* mr)
+                                                  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::load_merge_pairs(merge_pairs, stream, mr);
@@ -142,14 +143,14 @@ bpe_merge_pairs::bpe_merge_pairs_impl::bpe_merge_pairs_impl(
 
 bpe_merge_pairs::bpe_merge_pairs(std::unique_ptr<cudf::column>&& input,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource*)
+                                 rmm::device_async_resource_ref)
   : impl(detail::create_bpe_merge_pairs_impl(std::move(input), stream).release())
 {
 }
 
 bpe_merge_pairs::bpe_merge_pairs(cudf::strings_column_view const& input,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
   : impl(detail::create_bpe_merge_pairs_impl(input, stream, mr).release())
 {
 }
diff --git a/cpp/src/text/detokenize.cu b/cpp/src/text/detokenize.cu
index b9964352c74..63fe3113697 100644
--- a/cpp/src/text/detokenize.cu
+++ b/cpp/src/text/detokenize.cu
@@ -32,6 +32,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/copy.h>
 #include <thrust/count.h>
@@ -132,7 +133,7 @@ std::unique_ptr<cudf::column> detokenize(cudf::strings_column_view const& string
                                          cudf::column_view const& row_indices,
                                          cudf::string_scalar const& separator,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr)
+                                         rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(separator.is_valid(stream), "Parameter separator must be valid");
   CUDF_EXPECTS(row_indices.size() == strings.size(),
@@ -173,7 +174,7 @@ std::unique_ptr<cudf::column> detokenize(cudf::strings_column_view const& input,
                                          cudf::column_view const& row_indices,
                                          cudf::string_scalar const& separator,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr)
+                                         rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::detokenize(input, row_indices, separator, stream, mr);
diff --git a/cpp/src/text/edit_distance.cu b/cpp/src/text/edit_distance.cu
index 606bebe2174..8d857175407 100644
--- a/cpp/src/text/edit_distance.cu
+++ b/cpp/src/text/edit_distance.cu
@@ -28,6 +28,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/execution_policy.h>
 #include <thrust/for_each.h>
@@ -138,7 +139,7 @@ struct edit_distance_matrix_levenshtein_algorithm {
 std::unique_ptr<cudf::column> edit_distance(cudf::strings_column_view const& strings,
                                             cudf::strings_column_view const& targets,
                                             rmm::cuda_stream_view stream,
-                                            rmm::mr::device_memory_resource* mr)
+                                            rmm::device_async_resource_ref mr)
 {
   auto const strings_count = strings.size();
   if (strings_count == 0) {
@@ -203,7 +204,7 @@ std::unique_ptr<cudf::column> edit_distance(cudf::strings_column_view const& str
  */
 std::unique_ptr<cudf::column> edit_distance_matrix(cudf::strings_column_view const& strings,
                                                    rmm::cuda_stream_view stream,
-                                                   rmm::mr::device_memory_resource* mr)
+                                                   rmm::device_async_resource_ref mr)
 {
   cudf::size_type strings_count = strings.size();
   if (strings_count == 0) {
@@ -301,7 +302,7 @@ std::unique_ptr<cudf::column> edit_distance_matrix(cudf::strings_column_view con
 std::unique_ptr<cudf::column> edit_distance(cudf::strings_column_view const& input,
                                             cudf::strings_column_view const& targets,
                                             rmm::cuda_stream_view stream,
-                                            rmm::mr::device_memory_resource* mr)
+                                            rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::edit_distance(input, targets, stream, mr);
@@ -312,7 +313,7 @@ std::unique_ptr<cudf::column> edit_distance(cudf::strings_column_view const& inp
  */
 std::unique_ptr<cudf::column> edit_distance_matrix(cudf::strings_column_view const& input,
                                                    rmm::cuda_stream_view stream,
-                                                   rmm::mr::device_memory_resource* mr)
+                                                   rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::edit_distance_matrix(input, stream, mr);
diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu
index d2a0ef71e4a..d9fcd7dfd05 100644
--- a/cpp/src/text/generate_ngrams.cu
+++ b/cpp/src/text/generate_ngrams.cu
@@ -34,6 +34,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/functional.h>
@@ -90,7 +91,7 @@ std::unique_ptr<cudf::column> generate_ngrams(cudf::strings_column_view const& s
                                               cudf::size_type ngrams,
                                               cudf::string_scalar const& separator,
                                               rmm::cuda_stream_view stream,
-                                              rmm::mr::device_memory_resource* mr)
+                                              rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(
     separator.is_valid(stream), "Parameter separator must be valid", std::invalid_argument);
@@ -154,7 +155,7 @@ std::unique_ptr<cudf::column> generate_ngrams(cudf::strings_column_view const& s
                                               cudf::size_type ngrams,
                                               cudf::string_scalar const& separator,
                                               rmm::cuda_stream_view stream,
-                                              rmm::mr::device_memory_resource* mr)
+                                              rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::generate_ngrams(strings, ngrams, separator, stream, mr);
@@ -204,7 +205,7 @@ struct character_ngram_generator_fn {
 std::unique_ptr<cudf::column> generate_character_ngrams(cudf::strings_column_view const& input,
                                                         cudf::size_type ngrams,
                                                         rmm::cuda_stream_view stream,
-                                                        rmm::mr::device_memory_resource* mr)
+                                                        rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(ngrams >= 2,
                "Parameter ngrams should be an integer value of 2 or greater",
@@ -278,7 +279,7 @@ struct character_ngram_hash_fn {
 std::unique_ptr<cudf::column> hash_character_ngrams(cudf::strings_column_view const& input,
                                                     cudf::size_type ngrams,
                                                     rmm::cuda_stream_view stream,
-                                                    rmm::mr::device_memory_resource* mr)
+                                                    rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(ngrams >= 2,
                "Parameter ngrams should be an integer value of 2 or greater",
@@ -325,7 +326,7 @@ std::unique_ptr<cudf::column> hash_character_ngrams(cudf::strings_column_view co
 std::unique_ptr<cudf::column> generate_character_ngrams(cudf::strings_column_view const& strings,
                                                         cudf::size_type ngrams,
                                                         rmm::cuda_stream_view stream,
-                                                        rmm::mr::device_memory_resource* mr)
+                                                        rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::generate_character_ngrams(strings, ngrams, stream, mr);
@@ -334,7 +335,7 @@ std::unique_ptr<cudf::column> generate_character_ngrams(cudf::strings_column_vie
 std::unique_ptr<cudf::column> hash_character_ngrams(cudf::strings_column_view const& strings,
                                                     cudf::size_type ngrams,
                                                     rmm::cuda_stream_view stream,
-                                                    rmm::mr::device_memory_resource* mr)
+                                                    rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::hash_character_ngrams(strings, ngrams, stream, mr);
diff --git a/cpp/src/text/jaccard.cu b/cpp/src/text/jaccard.cu
index 612eb52af01..9cf934165f6 100644
--- a/cpp/src/text/jaccard.cu
+++ b/cpp/src/text/jaccard.cu
@@ -30,6 +30,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cub/cub.cuh>
 #include <thrust/binary_search.h>
@@ -247,7 +248,7 @@ std::unique_ptr<cudf::column> jaccard_index(cudf::strings_column_view const& inp
                                             cudf::strings_column_view const& input2,
                                             cudf::size_type width,
                                             rmm::cuda_stream_view stream,
-                                            rmm::mr::device_memory_resource* mr)
+                                            rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(
     input1.size() == input2.size(), "input columns must be the same size", std::invalid_argument);
@@ -297,7 +298,7 @@ std::unique_ptr<cudf::column> jaccard_index(cudf::strings_column_view const& inp
                                             cudf::strings_column_view const& input2,
                                             cudf::size_type width,
                                             rmm::cuda_stream_view stream,
-                                            rmm::mr::device_memory_resource* mr)
+                                            rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::jaccard_index(input1, input2, width, stream, mr);
diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu
index 8d22c784584..4318123627d 100644
--- a/cpp/src/text/minhash.cu
+++ b/cpp/src/text/minhash.cu
@@ -33,6 +33,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/atomic>
 #include <thrust/execution_policy.h>
@@ -127,7 +128,7 @@ std::unique_ptr<cudf::column> minhash_fn(cudf::strings_column_view const& input,
                                          cudf::device_span<hash_value_type const> seeds,
                                          cudf::size_type width,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr)
+                                         rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(!seeds.empty(), "Parameter seeds cannot be empty", std::invalid_argument);
   CUDF_EXPECTS(width >= 2,
@@ -162,7 +163,7 @@ std::unique_ptr<cudf::column> build_list_result(cudf::strings_column_view const&
                                                 std::unique_ptr<cudf::column>&& hashes,
                                                 cudf::size_type seeds_size,
                                                 rmm::cuda_stream_view stream,
-                                                rmm::mr::device_memory_resource* mr)
+                                                rmm::device_async_resource_ref mr)
 {
   // build the offsets for the output lists column
   auto const zero = cudf::numeric_scalar<cudf::size_type>(0, true, stream);
@@ -190,7 +191,7 @@ std::unique_ptr<cudf::column> minhash(cudf::strings_column_view const& input,
                                       cudf::numeric_scalar<uint32_t> const& seed,
                                       cudf::size_type width,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr)
+                                      rmm::device_async_resource_ref mr)
 {
   using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32<cudf::string_view>;
   auto const seeds   = cudf::device_span<uint32_t const>{seed.data(), 1};
@@ -203,7 +204,7 @@ std::unique_ptr<cudf::column> minhash(cudf::strings_column_view const& input,
                                       cudf::device_span<uint32_t const> seeds,
                                       cudf::size_type width,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr)
+                                      rmm::device_async_resource_ref mr)
 {
   using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32<cudf::string_view>;
   auto hashes        = detail::minhash_fn<HashFunction>(input, seeds, width, stream, mr);
@@ -214,7 +215,7 @@ std::unique_ptr<cudf::column> minhash64(cudf::strings_column_view const& input,
                                         cudf::numeric_scalar<uint64_t> const& seed,
                                         cudf::size_type width,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr)
+                                        rmm::device_async_resource_ref mr)
 {
   using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128<cudf::string_view>;
   auto const seeds   = cudf::device_span<uint64_t const>{seed.data(), 1};
@@ -227,7 +228,7 @@ std::unique_ptr<cudf::column> minhash64(cudf::strings_column_view const& input,
                                         cudf::device_span<uint64_t const> seeds,
                                         cudf::size_type width,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr)
+                                        rmm::device_async_resource_ref mr)
 {
   using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128<cudf::string_view>;
   auto hashes        = detail::minhash_fn<HashFunction>(input, seeds, width, stream, mr);
@@ -239,7 +240,7 @@ std::unique_ptr<cudf::column> minhash(cudf::strings_column_view const& input,
                                       cudf::numeric_scalar<uint32_t> seed,
                                       cudf::size_type width,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr)
+                                      rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::minhash(input, seed, width, stream, mr);
@@ -249,7 +250,7 @@ std::unique_ptr<cudf::column> minhash(cudf::strings_column_view const& input,
                                       cudf::device_span<uint32_t const> seeds,
                                       cudf::size_type width,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr)
+                                      rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::minhash(input, seeds, width, stream, mr);
@@ -259,7 +260,7 @@ std::unique_ptr<cudf::column> minhash64(cudf::strings_column_view const& input,
                                         cudf::numeric_scalar<uint64_t> seed,
                                         cudf::size_type width,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr)
+                                        rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::minhash64(input, seed, width, stream, mr);
@@ -269,7 +270,7 @@ std::unique_ptr<cudf::column> minhash64(cudf::strings_column_view const& input,
                                         cudf::device_span<uint64_t const> seeds,
                                         cudf::size_type width,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr)
+                                        rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::minhash64(input, seeds, width, stream, mr);
diff --git a/cpp/src/text/ngrams_tokenize.cu b/cpp/src/text/ngrams_tokenize.cu
index 75ad542548b..95dd8ff3d6c 100644
--- a/cpp/src/text/ngrams_tokenize.cu
+++ b/cpp/src/text/ngrams_tokenize.cu
@@ -33,6 +33,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/for_each.h>
@@ -141,7 +142,7 @@ std::unique_ptr<cudf::column> ngrams_tokenize(cudf::strings_column_view const& s
                                               cudf::string_scalar const& delimiter,
                                               cudf::string_scalar const& separator,
                                               rmm::cuda_stream_view stream,
-                                              rmm::mr::device_memory_resource* mr)
+                                              rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(delimiter.is_valid(stream), "Parameter delimiter must be valid");
   cudf::string_view d_delimiter(delimiter.data(), delimiter.size());
@@ -248,7 +249,7 @@ std::unique_ptr<cudf::column> ngrams_tokenize(cudf::strings_column_view const& s
                                               cudf::string_scalar const& delimiter,
                                               cudf::string_scalar const& separator,
                                               rmm::cuda_stream_view stream,
-                                              rmm::mr::device_memory_resource* mr)
+                                              rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::ngrams_tokenize(strings, ngrams, delimiter, separator, stream, mr);
diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu
index c06a24382ed..e5e72d3a33e 100644
--- a/cpp/src/text/normalize.cu
+++ b/cpp/src/text/normalize.cu
@@ -36,6 +36,7 @@
 #include <nvtext/normalize.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/execution_policy.h>
 #include <thrust/for_each.h>
@@ -174,7 +175,7 @@ struct codepoint_to_utf8_fn {
 // detail API
 std::unique_ptr<cudf::column> normalize_spaces(cudf::strings_column_view const& strings,
                                                rmm::cuda_stream_view stream,
-                                               rmm::mr::device_memory_resource* mr)
+                                               rmm::device_async_resource_ref mr)
 {
   if (strings.is_empty()) return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});
 
@@ -198,7 +199,7 @@ std::unique_ptr<cudf::column> normalize_spaces(cudf::strings_column_view const&
 std::unique_ptr<cudf::column> normalize_characters(cudf::strings_column_view const& strings,
                                                    bool do_lower_case,
                                                    rmm::cuda_stream_view stream,
-                                                   rmm::mr::device_memory_resource* mr)
+                                                   rmm::device_async_resource_ref mr)
 {
   if (strings.is_empty()) return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});
 
@@ -240,7 +241,7 @@ std::unique_ptr<cudf::column> normalize_characters(cudf::strings_column_view con
 
 std::unique_ptr<cudf::column> normalize_spaces(cudf::strings_column_view const& input,
                                                rmm::cuda_stream_view stream,
-                                               rmm::mr::device_memory_resource* mr)
+                                               rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::normalize_spaces(input, stream, mr);
@@ -252,7 +253,7 @@ std::unique_ptr<cudf::column> normalize_spaces(cudf::strings_column_view const&
 std::unique_ptr<cudf::column> normalize_characters(cudf::strings_column_view const& input,
                                                    bool do_lower_case,
                                                    rmm::cuda_stream_view stream,
-                                                   rmm::mr::device_memory_resource* mr)
+                                                   rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::normalize_characters(input, do_lower_case, stream, mr);
diff --git a/cpp/src/text/replace.cu b/cpp/src/text/replace.cu
index 5aed701c037..f61fa544e73 100644
--- a/cpp/src/text/replace.cu
+++ b/cpp/src/text/replace.cu
@@ -32,6 +32,7 @@
 #include <nvtext/replace.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/distance.h>
 #include <thrust/execution_policy.h>
@@ -202,7 +203,7 @@ std::unique_ptr<cudf::column> replace_tokens(cudf::strings_column_view const& st
                                              cudf::strings_column_view const& replacements,
                                              cudf::string_scalar const& delimiter,
                                              rmm::cuda_stream_view stream,
-                                             rmm::mr::device_memory_resource* mr)
+                                             rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(!targets.has_nulls(), "Parameter targets must not have nulls");
   CUDF_EXPECTS(!replacements.has_nulls(), "Parameter replacements must not have nulls");
@@ -244,7 +245,7 @@ std::unique_ptr<cudf::column> filter_tokens(cudf::strings_column_view const& str
                                             cudf::string_scalar const& replacement,
                                             cudf::string_scalar const& delimiter,
                                             rmm::cuda_stream_view stream,
-                                            rmm::mr::device_memory_resource* mr)
+                                            rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(replacement.is_valid(stream), "Parameter replacement must be valid");
   CUDF_EXPECTS(delimiter.is_valid(stream), "Parameter delimiter must be valid");
@@ -281,7 +282,7 @@ std::unique_ptr<cudf::column> replace_tokens(cudf::strings_column_view const& in
                                              cudf::strings_column_view const& replacements,
                                              cudf::string_scalar const& delimiter,
                                              rmm::cuda_stream_view stream,
-                                             rmm::mr::device_memory_resource* mr)
+                                             rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::replace_tokens(input, targets, replacements, delimiter, stream, mr);
@@ -292,7 +293,7 @@ std::unique_ptr<cudf::column> filter_tokens(cudf::strings_column_view const& inp
                                             cudf::string_scalar const& replacement,
                                             cudf::string_scalar const& delimiter,
                                             rmm::cuda_stream_view stream,
-                                            rmm::mr::device_memory_resource* mr)
+                                            rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::filter_tokens(input, min_token_length, replacement, delimiter, stream, mr);
diff --git a/cpp/src/text/stemmer.cu b/cpp/src/text/stemmer.cu
index 5c67b2e5f54..4746b6b74b9 100644
--- a/cpp/src/text/stemmer.cu
+++ b/cpp/src/text/stemmer.cu
@@ -30,6 +30,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/for_each.h>
 #include <thrust/iterator/constant_iterator.h>
@@ -99,7 +100,7 @@ std::unique_ptr<cudf::column> is_letter(cudf::strings_column_view const& strings
                                         letter_type ltype,
                                         PositionIterator position_itr,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr)
+                                        rmm::device_async_resource_ref mr)
 {
   if (strings.is_empty()) return cudf::make_empty_column(cudf::data_type{cudf::type_id::BOOL8});
 
@@ -133,7 +134,7 @@ struct dispatch_is_letter_fn {
                                            letter_type ltype,
                                            cudf::column_view const& indices,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr) const
+                                           rmm::device_async_resource_ref mr) const
   {
     CUDF_EXPECTS(strings.size() == indices.size(),
                  "strings column and indices column must be the same size");
@@ -211,7 +212,7 @@ struct porter_stemmer_measure_fn {
 
 std::unique_ptr<cudf::column> porter_stemmer_measure(cudf::strings_column_view const& strings,
                                                      rmm::cuda_stream_view stream,
-                                                     rmm::mr::device_memory_resource* mr)
+                                                     rmm::device_async_resource_ref mr)
 {
   if (strings.is_empty()) {
     return cudf::make_empty_column(cudf::data_type{cudf::type_to_id<cudf::size_type>()});
@@ -240,7 +241,7 @@ std::unique_ptr<cudf::column> is_letter(cudf::strings_column_view const& strings
                                         letter_type ltype,
                                         cudf::column_view const& indices,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr)
+                                        rmm::device_async_resource_ref mr)
 {
   return cudf::type_dispatcher(
     indices.type(), dispatch_is_letter_fn{}, strings, ltype, indices, stream, mr);
@@ -254,7 +255,7 @@ std::unique_ptr<cudf::column> is_letter(cudf::strings_column_view const& input,
                                         letter_type ltype,
                                         cudf::size_type character_index,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr)
+                                        rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::is_letter(
@@ -265,7 +266,7 @@ std::unique_ptr<cudf::column> is_letter(cudf::strings_column_view const& input,
                                         letter_type ltype,
                                         cudf::column_view const& indices,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr)
+                                        rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::is_letter(input, ltype, indices, stream, mr);
@@ -276,7 +277,7 @@ std::unique_ptr<cudf::column> is_letter(cudf::strings_column_view const& input,
  */
 std::unique_ptr<cudf::column> porter_stemmer_measure(cudf::strings_column_view const& input,
                                                      rmm::cuda_stream_view stream,
-                                                     rmm::mr::device_memory_resource* mr)
+                                                     rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::porter_stemmer_measure(input, stream, mr);
diff --git a/cpp/src/text/subword/load_hash_file.cu b/cpp/src/text/subword/load_hash_file.cu
index 0b4f9f729c3..a08fdea3e84 100644
--- a/cpp/src/text/subword/load_hash_file.cu
+++ b/cpp/src/text/subword/load_hash_file.cu
@@ -28,6 +28,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/fill.h>
 
@@ -180,7 +181,7 @@ uint64_t str_to_uint64(std::string const& str, uint64_t line_no)
 std::unique_ptr<hashed_vocabulary> load_vocabulary_file(
   std::string const& filename_hashed_vocabulary,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   hashed_vocabulary result;
   std::ifstream hash_file(filename_hashed_vocabulary);
@@ -288,7 +289,7 @@ std::unique_ptr<hashed_vocabulary> load_vocabulary_file(
 }  // namespace detail
 
 std::unique_ptr<hashed_vocabulary> load_vocabulary_file(
-  std::string const& filename_hashed_vocabulary, rmm::mr::device_memory_resource* mr)
+  std::string const& filename_hashed_vocabulary, rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::load_vocabulary_file(filename_hashed_vocabulary, cudf::get_default_stream(), mr);
diff --git a/cpp/src/text/subword/subword_tokenize.cu b/cpp/src/text/subword/subword_tokenize.cu
index a623450ecad..e05427eb6ac 100644
--- a/cpp/src/text/subword/subword_tokenize.cu
+++ b/cpp/src/text/subword/subword_tokenize.cu
@@ -31,6 +31,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/for_each.h>
 #include <thrust/functional.h>
@@ -139,7 +140,7 @@ CUDF_KERNEL void kernel_compute_tensor_metadata(
 tokenizer_result build_empty_result(cudf::size_type size,
                                     uint32_t max_sequence_length,
                                     rmm::cuda_stream_view stream,
-                                    rmm::mr::device_memory_resource* mr)
+                                    rmm::device_async_resource_ref mr)
 {
   auto zero = cudf::numeric_scalar<uint32_t>(0, true, stream);
   auto ids  = cudf::detail::sequence(size * max_sequence_length, zero, zero, stream, mr);
@@ -166,7 +167,7 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings,
                                   bool do_lower_case,
                                   bool do_truncate,
                                   rmm::cuda_stream_view stream,
-                                  rmm::mr::device_memory_resource* mr)
+                                  rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(stride <= max_sequence_length,
                "stride must be less than or equal to max_sequence_length");
@@ -292,7 +293,7 @@ tokenizer_result subword_tokenize(cudf::strings_column_view const& strings,
                                   uint32_t stride,
                                   bool do_lower_case,
                                   bool do_truncate,
-                                  rmm::mr::device_memory_resource* mr)
+                                  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::subword_tokenize(strings,
diff --git a/cpp/src/text/tokenize.cu b/cpp/src/text/tokenize.cu
index 82c51e72b31..0b16305a81a 100644
--- a/cpp/src/text/tokenize.cu
+++ b/cpp/src/text/tokenize.cu
@@ -33,6 +33,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/copy.h>
 #include <thrust/count.h>
@@ -48,7 +49,7 @@ template <typename TokenCounter>
 std::unique_ptr<cudf::column> token_count_fn(cudf::size_type strings_count,
                                              TokenCounter tokenizer,
                                              rmm::cuda_stream_view stream,
-                                             rmm::mr::device_memory_resource* mr)
+                                             rmm::device_async_resource_ref mr)
 {
   // create output column
   auto token_counts =
@@ -72,7 +73,7 @@ template <typename Tokenizer>
 std::unique_ptr<cudf::column> tokenize_fn(cudf::size_type strings_count,
                                           Tokenizer tokenizer,
                                           rmm::cuda_stream_view stream,
-                                          rmm::mr::device_memory_resource* mr)
+                                          rmm::device_async_resource_ref mr)
 {
   // get the number of tokens in each string
   auto const token_counts =
@@ -106,7 +107,7 @@ std::unique_ptr<cudf::column> tokenize_fn(cudf::size_type strings_count,
 std::unique_ptr<cudf::column> tokenize(cudf::strings_column_view const& strings,
                                        cudf::string_scalar const& delimiter,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr)
+                                       rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(delimiter.is_valid(stream), "Parameter delimiter must be valid");
   cudf::string_view d_delimiter(delimiter.data(), delimiter.size());
@@ -118,7 +119,7 @@ std::unique_ptr<cudf::column> tokenize(cudf::strings_column_view const& strings,
 std::unique_ptr<cudf::column> count_tokens(cudf::strings_column_view const& strings,
                                            cudf::string_scalar const& delimiter,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(delimiter.is_valid(stream), "Parameter delimiter must be valid");
   cudf::string_view d_delimiter(delimiter.data(), delimiter.size());
@@ -131,7 +132,7 @@ std::unique_ptr<cudf::column> count_tokens(cudf::strings_column_view const& stri
 std::unique_ptr<cudf::column> tokenize(cudf::strings_column_view const& strings,
                                        cudf::strings_column_view const& delimiters,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr)
+                                       rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(delimiters.size() > 0, "Parameter delimiters must not be empty");
   CUDF_EXPECTS(!delimiters.has_nulls(), "Parameter delimiters must not have nulls");
@@ -150,7 +151,7 @@ std::unique_ptr<cudf::column> tokenize(cudf::strings_column_view const& strings,
 std::unique_ptr<cudf::column> count_tokens(cudf::strings_column_view const& strings,
                                            cudf::strings_column_view const& delimiters,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(delimiters.size() > 0, "Parameter delimiters must not be empty");
   CUDF_EXPECTS(!delimiters.has_nulls(), "Parameter delimiters must not have nulls");
@@ -168,7 +169,7 @@ std::unique_ptr<cudf::column> count_tokens(cudf::strings_column_view const& stri
 // tokenize on every character
 std::unique_ptr<cudf::column> character_tokenize(cudf::strings_column_view const& strings_column,
                                                  rmm::cuda_stream_view stream,
-                                                 rmm::mr::device_memory_resource* mr)
+                                                 rmm::device_async_resource_ref mr)
 {
   auto strings_count = strings_column.size();
   if (strings_count == 0) {
@@ -230,7 +231,7 @@ std::unique_ptr<cudf::column> character_tokenize(cudf::strings_column_view const
 std::unique_ptr<cudf::column> tokenize(cudf::strings_column_view const& input,
                                        cudf::string_scalar const& delimiter,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr)
+                                       rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::tokenize(input, delimiter, stream, mr);
@@ -239,7 +240,7 @@ std::unique_ptr<cudf::column> tokenize(cudf::strings_column_view const& input,
 std::unique_ptr<cudf::column> tokenize(cudf::strings_column_view const& input,
                                        cudf::strings_column_view const& delimiters,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr)
+                                       rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::tokenize(input, delimiters, stream, mr);
@@ -248,7 +249,7 @@ std::unique_ptr<cudf::column> tokenize(cudf::strings_column_view const& input,
 std::unique_ptr<cudf::column> count_tokens(cudf::strings_column_view const& input,
                                            cudf::string_scalar const& delimiter,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::count_tokens(input, delimiter, stream, mr);
@@ -257,7 +258,7 @@ std::unique_ptr<cudf::column> count_tokens(cudf::strings_column_view const& inpu
 std::unique_ptr<cudf::column> count_tokens(cudf::strings_column_view const& input,
                                            cudf::strings_column_view const& delimiters,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::count_tokens(input, delimiters, stream, mr);
@@ -265,7 +266,7 @@ std::unique_ptr<cudf::column> count_tokens(cudf::strings_column_view const& inpu
 
 std::unique_ptr<cudf::column> character_tokenize(cudf::strings_column_view const& input,
                                                  rmm::cuda_stream_view stream,
-                                                 rmm::mr::device_memory_resource* mr)
+                                                 rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::character_tokenize(input, stream, mr);
diff --git a/cpp/src/text/vocabulary_tokenize.cu b/cpp/src/text/vocabulary_tokenize.cu
index c99adda3fad..8913ce22da8 100644
--- a/cpp/src/text/vocabulary_tokenize.cu
+++ b/cpp/src/text/vocabulary_tokenize.cu
@@ -36,6 +36,7 @@
 #include <nvtext/tokenize.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cub/cub.cuh>
 #include <cuco/static_map.cuh>
@@ -134,7 +135,7 @@ struct key_pair {
 
 tokenize_vocabulary::tokenize_vocabulary(cudf::strings_column_view const& input,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr)
+                                         rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(not input.is_empty(), "vocabulary must not be empty");
   CUDF_EXPECTS(not input.has_nulls(), "vocabulary must not have nulls");
@@ -165,7 +166,7 @@ tokenize_vocabulary::~tokenize_vocabulary() { delete _impl; }
 
 std::unique_ptr<tokenize_vocabulary> load_vocabulary(cudf::strings_column_view const& input,
                                                      rmm::cuda_stream_view stream,
-                                                     rmm::mr::device_memory_resource* mr)
+                                                     rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return std::make_unique<tokenize_vocabulary>(input, stream, mr);
@@ -358,7 +359,7 @@ std::unique_ptr<cudf::column> tokenize_with_vocabulary(cudf::strings_column_view
                                                        cudf::string_scalar const& delimiter,
                                                        cudf::size_type default_id,
                                                        rmm::cuda_stream_view stream,
-                                                       rmm::mr::device_memory_resource* mr)
+                                                       rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(delimiter.is_valid(stream), "Parameter delimiter must be valid");
 
@@ -467,7 +468,7 @@ std::unique_ptr<cudf::column> tokenize_with_vocabulary(cudf::strings_column_view
                                                        cudf::string_scalar const& delimiter,
                                                        cudf::size_type default_id,
                                                        rmm::cuda_stream_view stream,
-                                                       rmm::mr::device_memory_resource* mr)
+                                                       rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::tokenize_with_vocabulary(input, vocabulary, delimiter, default_id, stream, mr);
diff --git a/cpp/src/transform/bools_to_mask.cu b/cpp/src/transform/bools_to_mask.cu
index e558b51fbb0..c12f65deb46 100644
--- a/cpp/src/transform/bools_to_mask.cu
+++ b/cpp/src/transform/bools_to_mask.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -27,11 +27,12 @@
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace detail {
 std::pair<std::unique_ptr<rmm::device_buffer>, cudf::size_type> bools_to_mask(
-  column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr)
+  column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(input.type().id() == type_id::BOOL8, "Input is not of type bool");
 
@@ -58,7 +59,7 @@ std::pair<std::unique_ptr<rmm::device_buffer>, cudf::size_type> bools_to_mask(
 }  // namespace detail
 
 std::pair<std::unique_ptr<rmm::device_buffer>, cudf::size_type> bools_to_mask(
-  column_view const& input, rmm::mr::device_memory_resource* mr)
+  column_view const& input, rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::bools_to_mask(input, cudf::get_default_stream(), mr);
diff --git a/cpp/src/transform/compute_column.cu b/cpp/src/transform/compute_column.cu
index eaf47adec10..7960731f3a1 100644
--- a/cpp/src/transform/compute_column.cu
+++ b/cpp/src/transform/compute_column.cu
@@ -34,6 +34,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace detail {
@@ -83,7 +84,7 @@ __launch_bounds__(max_block_size) CUDF_KERNEL
 std::unique_ptr<column> compute_column(table_view const& table,
                                        ast::expression const& expr,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr)
+                                       rmm::device_async_resource_ref mr)
 {
   // If evaluating the expression may produce null outputs we create a nullable
   // output column and follow the null-supporting expression evaluation code
@@ -137,7 +138,7 @@ std::unique_ptr<column> compute_column(table_view const& table,
 
 std::unique_ptr<column> compute_column(table_view const& table,
                                        ast::expression const& expr,
-                                       rmm::mr::device_memory_resource* mr)
+                                       rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::compute_column(table, expr, cudf::get_default_stream(), mr);
diff --git a/cpp/src/transform/encode.cu b/cpp/src/transform/encode.cu
index c0e0c83c416..7a044b9f6f7 100644
--- a/cpp/src/transform/encode.cu
+++ b/cpp/src/transform/encode.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -31,6 +31,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <memory>
 #include <numeric>
@@ -40,8 +41,9 @@
 namespace cudf {
 namespace detail {
 
-std::pair<std::unique_ptr<table>, std::unique_ptr<column>> encode(
-  table_view const& input_table, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr)
+std::pair<std::unique_ptr<table>, std::unique_ptr<column>> encode(table_view const& input_table,
+                                                                  rmm::cuda_stream_view stream,
+                                                                  rmm::device_async_resource_ref mr)
 {
   auto const num_cols = input_table.num_columns();
 
@@ -70,7 +72,7 @@ std::pair<std::unique_ptr<table>, std::unique_ptr<column>> encode(
 }  // namespace detail
 
 std::pair<std::unique_ptr<cudf::table>, std::unique_ptr<cudf::column>> encode(
-  cudf::table_view const& input, rmm::mr::device_memory_resource* mr)
+  cudf::table_view const& input, rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::encode(input, cudf::get_default_stream(), mr);
diff --git a/cpp/src/transform/mask_to_bools.cu b/cpp/src/transform/mask_to_bools.cu
index 73c1a83cfe1..adf5db02d9c 100644
--- a/cpp/src/transform/mask_to_bools.cu
+++ b/cpp/src/transform/mask_to_bools.cu
@@ -24,6 +24,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/constant_iterator.h>
 #include <thrust/iterator/counting_iterator.h>
@@ -35,7 +36,7 @@ std::unique_ptr<column> mask_to_bools(bitmask_type const* bitmask,
                                       size_type begin_bit,
                                       size_type end_bit,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr)
+                                      rmm::device_async_resource_ref mr)
 {
   auto const length = end_bit - begin_bit;
   CUDF_EXPECTS(length >= 0, "begin_bit should be less than or equal to end_bit");
@@ -61,7 +62,7 @@ std::unique_ptr<column> mask_to_bools(bitmask_type const* bitmask,
 std::unique_ptr<column> mask_to_bools(bitmask_type const* bitmask,
                                       size_type begin_bit,
                                       size_type end_bit,
-                                      rmm::mr::device_memory_resource* mr)
+                                      rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::mask_to_bools(bitmask, begin_bit, end_bit, cudf::get_default_stream(), mr);
diff --git a/cpp/src/transform/nans_to_nulls.cu b/cpp/src/transform/nans_to_nulls.cu
index 3c02409f778..fd4f33c594c 100644
--- a/cpp/src/transform/nans_to_nulls.cu
+++ b/cpp/src/transform/nans_to_nulls.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -26,6 +26,7 @@
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 
@@ -37,7 +38,7 @@ struct dispatch_nan_to_null {
                    std::pair<std::unique_ptr<rmm::device_buffer>, cudf::size_type>>
   operator()(column_view const& input,
              rmm::cuda_stream_view stream,
-             rmm::mr::device_memory_resource* mr)
+             rmm::device_async_resource_ref mr)
   {
     auto input_device_view_ptr = column_device_view::create(input, stream);
     auto input_device_view     = *input_device_view_ptr;
@@ -75,14 +76,14 @@ struct dispatch_nan_to_null {
                    std::pair<std::unique_ptr<rmm::device_buffer>, cudf::size_type>>
   operator()(column_view const& input,
              rmm::cuda_stream_view stream,
-             rmm::mr::device_memory_resource* mr)
+             rmm::device_async_resource_ref mr)
   {
     CUDF_FAIL("Input column can't be a non-floating type");
   }
 };
 
 std::pair<std::unique_ptr<rmm::device_buffer>, cudf::size_type> nans_to_nulls(
-  column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr)
+  column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)
 {
   if (input.is_empty()) { return std::pair(std::make_unique<rmm::device_buffer>(), 0); }
 
@@ -92,7 +93,7 @@ std::pair<std::unique_ptr<rmm::device_buffer>, cudf::size_type> nans_to_nulls(
 }  // namespace detail
 
 std::pair<std::unique_ptr<rmm::device_buffer>, cudf::size_type> nans_to_nulls(
-  column_view const& input, rmm::mr::device_memory_resource* mr)
+  column_view const& input, rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::nans_to_nulls(input, cudf::get_default_stream(), mr);
diff --git a/cpp/src/transform/one_hot_encode.cu b/cpp/src/transform/one_hot_encode.cu
index 72f864346a4..570060b3870 100644
--- a/cpp/src/transform/one_hot_encode.cu
+++ b/cpp/src/transform/one_hot_encode.cu
@@ -27,6 +27,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/transform.h>
@@ -58,7 +59,7 @@ struct ohe_equality_functor {
 std::pair<std::unique_ptr<column>, table_view> one_hot_encode(column_view const& input,
                                                               column_view const& categories,
                                                               rmm::cuda_stream_view stream,
-                                                              rmm::mr::device_memory_resource* mr)
+                                                              rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(input.type() == categories.type(), "Mismatch type between input and categories.");
 
@@ -110,7 +111,7 @@ std::pair<std::unique_ptr<column>, table_view> one_hot_encode(column_view const&
 
 std::pair<std::unique_ptr<column>, table_view> one_hot_encode(column_view const& input,
                                                               column_view const& categories,
-                                                              rmm::mr::device_memory_resource* mr)
+                                                              rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::one_hot_encode(input, categories, cudf::get_default_stream(), mr);
diff --git a/cpp/src/transform/row_bit_count.cu b/cpp/src/transform/row_bit_count.cu
index 78bd558501b..bfac7ab586e 100644
--- a/cpp/src/transform/row_bit_count.cu
+++ b/cpp/src/transform/row_bit_count.cu
@@ -31,6 +31,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/iterator/counting_iterator.h>
@@ -477,7 +478,7 @@ CUDF_KERNEL void compute_segment_sizes(device_span<column_device_view const> col
 std::unique_ptr<column> segmented_row_bit_count(table_view const& t,
                                                 size_type segment_length,
                                                 rmm::cuda_stream_view stream,
-                                                rmm::mr::device_memory_resource* mr)
+                                                rmm::device_async_resource_ref mr)
 {
   // If there is no rows, segment_length will not be checked.
   if (t.num_rows() <= 0) { return cudf::make_empty_column(type_id::INT32); }
@@ -557,7 +558,7 @@ std::unique_ptr<column> segmented_row_bit_count(table_view const& t,
 
 std::unique_ptr<column> row_bit_count(table_view const& t,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr)
+                                      rmm::device_async_resource_ref mr)
 {
   return segmented_row_bit_count(t, 1, stream, mr);
 }
@@ -566,13 +567,13 @@ std::unique_ptr<column> row_bit_count(table_view const& t,
 
 std::unique_ptr<column> segmented_row_bit_count(table_view const& t,
                                                 size_type segment_length,
-                                                rmm::mr::device_memory_resource* mr)
+                                                rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::segmented_row_bit_count(t, segment_length, cudf::get_default_stream(), mr);
 }
 
-std::unique_ptr<column> row_bit_count(table_view const& t, rmm::mr::device_memory_resource* mr)
+std::unique_ptr<column> row_bit_count(table_view const& t, rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::row_bit_count(t, cudf::get_default_stream(), mr);
diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp
index 6f61ed80dd8..072eb73453b 100644
--- a/cpp/src/transform/transform.cpp
+++ b/cpp/src/transform/transform.cpp
@@ -28,6 +28,7 @@
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <jit_preprocessed_files/transform/jit/kernel.cu.jit.hpp>
 
@@ -73,7 +74,7 @@ std::unique_ptr<column> transform(column_view const& input,
                                   data_type output_type,
                                   bool is_ptx,
                                   rmm::cuda_stream_view stream,
-                                  rmm::mr::device_memory_resource* mr)
+                                  rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(is_fixed_width(input.type()), "Unexpected non-fixed-width type.");
 
@@ -96,7 +97,7 @@ std::unique_ptr<column> transform(column_view const& input,
                                   std::string const& unary_udf,
                                   data_type output_type,
                                   bool is_ptx,
-                                  rmm::mr::device_memory_resource* mr)
+                                  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::transform(input, unary_udf, output_type, is_ptx, cudf::get_default_stream(), mr);
diff --git a/cpp/src/transpose/transpose.cu b/cpp/src/transpose/transpose.cu
index 45c2e650095..abde43535be 100644
--- a/cpp/src/transpose/transpose.cu
+++ b/cpp/src/transpose/transpose.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -26,6 +26,7 @@
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
@@ -34,7 +35,7 @@ namespace cudf {
 namespace detail {
 std::pair<std::unique_ptr<column>, table_view> transpose(table_view const& input,
                                                          rmm::cuda_stream_view stream,
-                                                         rmm::mr::device_memory_resource* mr)
+                                                         rmm::device_async_resource_ref mr)
 {
   // If there are no rows in the input, return successfully
   if (input.num_columns() == 0 || input.num_rows() == 0) {
@@ -60,7 +61,7 @@ std::pair<std::unique_ptr<column>, table_view> transpose(table_view const& input
 }  // namespace detail
 
 std::pair<std::unique_ptr<column>, table_view> transpose(table_view const& input,
-                                                         rmm::mr::device_memory_resource* mr)
+                                                         rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::transpose(input, cudf::get_default_stream(), mr);
diff --git a/cpp/src/unary/cast_ops.cu b/cpp/src/unary/cast_ops.cu
index b6c9b3caa20..98c412f805d 100644
--- a/cpp/src/unary/cast_ops.cu
+++ b/cpp/src/unary/cast_ops.cu
@@ -29,6 +29,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/transform.h>
 
@@ -184,7 +185,7 @@ template <typename T, std::enable_if_t<is_fixed_point<T>()>* = nullptr>
 std::unique_ptr<column> rescale(column_view input,
                                 numeric::scale_type scale,
                                 rmm::cuda_stream_view stream,
-                                rmm::mr::device_memory_resource* mr)
+                                rmm::device_async_resource_ref mr)
 {
   using namespace numeric;
   using RepType = device_storage_type_t<T>;
@@ -229,7 +230,7 @@ struct dispatch_unary_cast_to {
             std::enable_if_t<is_supported_non_fixed_point_cast<SourceT, TargetT>()>* = nullptr>
   std::unique_ptr<column> operator()(data_type type,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     auto const size = input.size();
     auto output =
@@ -256,7 +257,7 @@ struct dispatch_unary_cast_to {
     std::enable_if_t<cudf::is_fixed_point<SourceT>() && cudf::is_numeric<TargetT>()>* = nullptr>
   std::unique_ptr<column> operator()(data_type type,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     auto const size = input.size();
     auto output =
@@ -286,7 +287,7 @@ struct dispatch_unary_cast_to {
     std::enable_if_t<cudf::is_numeric<SourceT>() && cudf::is_fixed_point<TargetT>()>* = nullptr>
   std::unique_ptr<column> operator()(data_type type,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     auto const size = input.size();
     auto output =
@@ -316,7 +317,7 @@ struct dispatch_unary_cast_to {
                              std::is_same_v<SourceT, TargetT>>* = nullptr>
   std::unique_ptr<column> operator()(data_type type,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     if (input.type() == type) {
       return std::make_unique<column>(input, stream, mr);  // TODO add test for this
@@ -331,7 +332,7 @@ struct dispatch_unary_cast_to {
                              not std::is_same_v<SourceT, TargetT>>* = nullptr>
   std::unique_ptr<column> operator()(data_type type,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     using namespace numeric;
     using SourceDeviceT = device_storage_type_t<SourceT>;
@@ -374,7 +375,7 @@ struct dispatch_unary_cast_to {
             std::enable_if_t<not is_supported_cast<SourceT, TargetT>()>* = nullptr>
   std::unique_ptr<column> operator()(data_type,
                                      rmm::cuda_stream_view,
-                                     rmm::mr::device_memory_resource*)
+                                     rmm::device_async_resource_ref)
 
   {
     if (!cudf::is_fixed_width<TargetT>())
@@ -396,7 +397,7 @@ struct dispatch_unary_cast_from {
   template <typename T, std::enable_if_t<cudf::is_fixed_width<T>()>* = nullptr>
   std::unique_ptr<column> operator()(data_type type,
                                      rmm::cuda_stream_view stream,
-                                     rmm::mr::device_memory_resource* mr)
+                                     rmm::device_async_resource_ref mr)
   {
     return type_dispatcher(type, dispatch_unary_cast_to<T>{input}, type, stream, mr);
   }
@@ -412,7 +413,7 @@ struct dispatch_unary_cast_from {
 std::unique_ptr<column> cast(column_view const& input,
                              data_type type,
                              rmm::cuda_stream_view stream,
-                             rmm::mr::device_memory_resource* mr)
+                             rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(is_fixed_width(type), "Unary cast type must be fixed-width.");
 
@@ -424,7 +425,7 @@ std::unique_ptr<column> cast(column_view const& input,
 std::unique_ptr<column> cast(column_view const& input,
                              data_type type,
                              rmm::cuda_stream_view stream,
-                             rmm::mr::device_memory_resource* mr)
+                             rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::cast(input, type, stream, mr);
diff --git a/cpp/src/unary/math_ops.cu b/cpp/src/unary/math_ops.cu
index 88922362319..ab17da5f8c4 100644
--- a/cpp/src/unary/math_ops.cu
+++ b/cpp/src/unary/math_ops.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -25,6 +25,7 @@
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/transform.h>
 
@@ -280,7 +281,7 @@ struct fixed_point_abs {
 template <typename T, template <typename> typename FixedPointFunctor>
 std::unique_ptr<column> unary_op_with(column_view const& input,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr)
+                                      rmm::device_async_resource_ref mr)
 {
   using Type                     = device_storage_type_t<T>;
   using FixedPointUnaryOpFunctor = FixedPointFunctor<Type>;
@@ -322,7 +323,7 @@ std::unique_ptr<cudf::column> transform_fn(InputIterator begin,
                                            rmm::device_buffer&& null_mask,
                                            size_type null_count,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
 {
   auto const size = cudf::distance(begin, end);
 
@@ -344,7 +345,7 @@ std::unique_ptr<cudf::column> transform_fn(InputIterator begin,
 template <typename T, typename UFN>
 std::unique_ptr<cudf::column> transform_fn(cudf::dictionary_column_view const& input,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
 {
   auto dictionary_view = cudf::column_device_view::create(input.parent(), stream);
   auto dictionary_itr  = dictionary::detail::make_dictionary_iterator<T>(*dictionary_view);
@@ -365,7 +366,7 @@ struct MathOpDispatcher {
   template <typename T, std::enable_if_t<std::is_arithmetic_v<T>>* = nullptr>
   std::unique_ptr<cudf::column> operator()(cudf::column_view const& input,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
   {
     return transform_fn<T, UFN>(input.begin<T>(),
                                 input.end<T>(),
@@ -379,7 +380,7 @@ struct MathOpDispatcher {
     template <typename T, std::enable_if_t<std::is_arithmetic_v<T>>* = nullptr>
     std::unique_ptr<cudf::column> operator()(cudf::dictionary_column_view const& input,
                                              rmm::cuda_stream_view stream,
-                                             rmm::mr::device_memory_resource* mr)
+                                             rmm::device_async_resource_ref mr)
     {
       return transform_fn<T, UFN>(input, stream, mr);
     }
@@ -396,7 +397,7 @@ struct MathOpDispatcher {
     std::enable_if_t<!std::is_arithmetic_v<T> and std::is_same_v<T, dictionary32>>* = nullptr>
   std::unique_ptr<cudf::column> operator()(cudf::column_view const& input,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
   {
     if (input.is_empty()) return empty_like(input);
     auto dictionary_col = dictionary_column_view(input);
@@ -418,7 +419,7 @@ struct BitwiseOpDispatcher {
   template <typename T, std::enable_if_t<std::is_integral_v<T>>* = nullptr>
   std::unique_ptr<cudf::column> operator()(cudf::column_view const& input,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
   {
     return transform_fn<T, UFN>(input.begin<T>(),
                                 input.end<T>(),
@@ -432,7 +433,7 @@ struct BitwiseOpDispatcher {
     template <typename T, std::enable_if_t<std::is_integral_v<T>>* = nullptr>
     std::unique_ptr<cudf::column> operator()(cudf::dictionary_column_view const& input,
                                              rmm::cuda_stream_view stream,
-                                             rmm::mr::device_memory_resource* mr)
+                                             rmm::device_async_resource_ref mr)
     {
       return transform_fn<T, UFN>(input, stream, mr);
     }
@@ -448,7 +449,7 @@ struct BitwiseOpDispatcher {
             std::enable_if_t<!std::is_integral_v<T> and std::is_same_v<T, dictionary32>>* = nullptr>
   std::unique_ptr<cudf::column> operator()(cudf::column_view const& input,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
   {
     if (input.is_empty()) return empty_like(input);
     auto dictionary_col = dictionary_column_view(input);
@@ -478,7 +479,7 @@ struct LogicalOpDispatcher {
   template <typename T, std::enable_if_t<is_supported<T>()>* = nullptr>
   std::unique_ptr<cudf::column> operator()(cudf::column_view const& input,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
   {
     return transform_fn<bool, UFN>(input.begin<T>(),
                                    input.end<T>(),
@@ -493,7 +494,7 @@ struct LogicalOpDispatcher {
     template <typename T, std::enable_if_t<is_supported<T>()>* = nullptr>
     std::unique_ptr<cudf::column> operator()(cudf::dictionary_column_view const& input,
                                              rmm::cuda_stream_view stream,
-                                             rmm::mr::device_memory_resource* mr)
+                                             rmm::device_async_resource_ref mr)
     {
       auto dictionary_view = cudf::column_device_view::create(input.parent(), stream);
       auto dictionary_itr  = dictionary::detail::make_dictionary_iterator<T>(*dictionary_view);
@@ -516,7 +517,7 @@ struct LogicalOpDispatcher {
             std::enable_if_t<!is_supported<T>() and std::is_same_v<T, dictionary32>>* = nullptr>
   std::unique_ptr<cudf::column> operator()(cudf::column_view const& input,
                                            rmm::cuda_stream_view stream,
-                                           rmm::mr::device_memory_resource* mr)
+                                           rmm::device_async_resource_ref mr)
   {
     if (input.is_empty()) return make_empty_column(cudf::data_type{cudf::type_id::BOOL8});
     auto dictionary_col = dictionary_column_view(input);
@@ -545,7 +546,7 @@ struct FixedPointOpDispatcher {
     column_view const& input,
     cudf::unary_operator op,
     rmm::cuda_stream_view stream,
-    rmm::mr::device_memory_resource* mr)
+    rmm::device_async_resource_ref mr)
   {
     // clang-format off
     switch (op) {
@@ -563,7 +564,7 @@ struct FixedPointOpDispatcher {
 std::unique_ptr<cudf::column> unary_operation(cudf::column_view const& input,
                                               cudf::unary_operator op,
                                               rmm::cuda_stream_view stream,
-                                              rmm::mr::device_memory_resource* mr)
+                                              rmm::device_async_resource_ref mr)
 {
   if (cudf::is_fixed_point(input.type()))
     return type_dispatcher(input.type(), detail::FixedPointOpDispatcher{}, input, op, stream, mr);
@@ -647,7 +648,7 @@ std::unique_ptr<cudf::column> unary_operation(cudf::column_view const& input,
 std::unique_ptr<cudf::column> unary_operation(cudf::column_view const& input,
                                               cudf::unary_operator op,
                                               rmm::cuda_stream_view stream,
-                                              rmm::mr::device_memory_resource* mr)
+                                              rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::unary_operation(input, op, stream, mr);
diff --git a/cpp/src/unary/nan_ops.cu b/cpp/src/unary/nan_ops.cu
index 092ad3b6731..08aa8755624 100644
--- a/cpp/src/unary/nan_ops.cu
+++ b/cpp/src/unary/nan_ops.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,6 +24,7 @@
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace detail {
@@ -33,7 +34,7 @@ struct nan_dispatcher {
     cudf::column_view const& input,
     Predicate predicate,
     rmm::cuda_stream_view stream,
-    rmm::mr::device_memory_resource* mr)
+    rmm::device_async_resource_ref mr)
   {
     auto input_device_view = column_device_view::create(input, stream);
 
@@ -61,7 +62,7 @@ struct nan_dispatcher {
     cudf::column_view const& input,
     Predicate predicate,
     rmm::cuda_stream_view stream,
-    rmm::mr::device_memory_resource* mr)
+    rmm::device_async_resource_ref mr)
   {
     CUDF_FAIL("NAN is not supported in a Non-floating point type column");
   }
@@ -69,7 +70,7 @@ struct nan_dispatcher {
 
 std::unique_ptr<column> is_nan(cudf::column_view const& input,
                                rmm::cuda_stream_view stream,
-                               rmm::mr::device_memory_resource* mr)
+                               rmm::device_async_resource_ref mr)
 {
   auto predicate = [] __device__(auto element_validity_pair) {
     return element_validity_pair.second and std::isnan(element_validity_pair.first);
@@ -80,7 +81,7 @@ std::unique_ptr<column> is_nan(cudf::column_view const& input,
 
 std::unique_ptr<column> is_not_nan(cudf::column_view const& input,
                                    rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr)
+                                   rmm::device_async_resource_ref mr)
 {
   auto predicate = [] __device__(auto element_validity_pair) {
     return !element_validity_pair.second or !std::isnan(element_validity_pair.first);
@@ -93,7 +94,7 @@ std::unique_ptr<column> is_not_nan(cudf::column_view const& input,
 
 std::unique_ptr<column> is_nan(cudf::column_view const& input,
                                rmm::cuda_stream_view stream,
-                               rmm::mr::device_memory_resource* mr)
+                               rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::is_nan(input, stream, mr);
@@ -101,7 +102,7 @@ std::unique_ptr<column> is_nan(cudf::column_view const& input,
 
 std::unique_ptr<column> is_not_nan(cudf::column_view const& input,
                                    rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr)
+                                   rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::is_not_nan(input, stream, mr);
diff --git a/cpp/src/unary/null_ops.cu b/cpp/src/unary/null_ops.cu
index 6bdd65dd42d..a223a090128 100644
--- a/cpp/src/unary/null_ops.cu
+++ b/cpp/src/unary/null_ops.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,13 +19,15 @@
 #include <cudf/detail/unary.hpp>
 #include <cudf/utilities/default_stream.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 #include <thrust/iterator/counting_iterator.h>
 
 namespace cudf {
 namespace detail {
 std::unique_ptr<column> is_null(cudf::column_view const& input,
                                 rmm::cuda_stream_view stream,
-                                rmm::mr::device_memory_resource* mr)
+                                rmm::device_async_resource_ref mr)
 {
   auto input_device_view = column_device_view::create(input, stream);
   auto device_view       = *input_device_view;
@@ -40,7 +42,7 @@ std::unique_ptr<column> is_null(cudf::column_view const& input,
 
 std::unique_ptr<column> is_valid(cudf::column_view const& input,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   auto input_device_view = column_device_view::create(input, stream);
   auto device_view       = *input_device_view;
@@ -57,7 +59,7 @@ std::unique_ptr<column> is_valid(cudf::column_view const& input,
 
 std::unique_ptr<column> is_null(cudf::column_view const& input,
                                 rmm::cuda_stream_view stream,
-                                rmm::mr::device_memory_resource* mr)
+                                rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::is_null(input, stream, mr);
@@ -65,7 +67,7 @@ std::unique_ptr<column> is_null(cudf::column_view const& input,
 
 std::unique_ptr<column> is_valid(cudf::column_view const& input,
                                  rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource* mr)
+                                 rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::is_valid(input, stream, mr);
diff --git a/cpp/src/unary/unary_ops.cuh b/cpp/src/unary/unary_ops.cuh
index d0003bb6b41..61c41705665 100644
--- a/cpp/src/unary/unary_ops.cuh
+++ b/cpp/src/unary/unary_ops.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,6 +24,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/transform.h>
 
@@ -34,7 +35,7 @@ struct launcher {
   static std::unique_ptr<cudf::column> launch(cudf::column_view const& input,
                                               cudf::unary_operator op,
                                               rmm::cuda_stream_view stream,
-                                              rmm::mr::device_memory_resource* mr)
+                                              rmm::device_async_resource_ref mr)
   {
     std::unique_ptr<cudf::column> output = [&] {
       if (op == cudf::unary_operator::NOT) {
diff --git a/cpp/tests/copying/shift_tests.cpp b/cpp/tests/copying/shift_tests.cpp
index f904696593c..9c2b16df1e1 100644
--- a/cpp/tests/copying/shift_tests.cpp
+++ b/cpp/tests/copying/shift_tests.cpp
@@ -26,6 +26,7 @@
 #include <cudf/utilities/traits.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <limits>
 #include <memory>
@@ -35,8 +36,8 @@ using TestTypes = cudf::test::Types<int32_t>;
 
 template <typename T, typename ScalarType = cudf::scalar_type_t<T>>
 std::unique_ptr<cudf::scalar> make_scalar(
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
 {
   auto s = new ScalarType(cudf::test::make_type_param_scalar<T>(0), false, stream, mr);
   return std::unique_ptr<cudf::scalar>(s);
@@ -45,8 +46,8 @@ std::unique_ptr<cudf::scalar> make_scalar(
 template <typename T, typename ScalarType = cudf::scalar_type_t<T>>
 std::unique_ptr<cudf::scalar> make_scalar(
   T value,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
 {
   auto s = new ScalarType(value, true, stream, mr);
   return std::unique_ptr<cudf::scalar>(s);
diff --git a/cpp/tests/io/json_chunked_reader.cpp b/cpp/tests/io/json_chunked_reader.cpp
index 8d8fdd2a0e1..ef69ee5239d 100644
--- a/cpp/tests/io/json_chunked_reader.cpp
+++ b/cpp/tests/io/json_chunked_reader.cpp
@@ -22,6 +22,8 @@
 #include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/table_utilities.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 /**
  * @brief Base test fixture for JSON reader tests
  */
@@ -35,7 +37,7 @@ std::vector<cudf::io::table_with_metadata> skeleton_for_parellel_chunk_reader(
   cudf::io::json_reader_options const& reader_opts,
   int32_t chunk_size,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   using namespace cudf::io::json::detail;
   using cudf::size_type;
diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp
index b42f378d872..c35ad5319e4 100644
--- a/cpp/tests/join/join_tests.cpp
+++ b/cpp/tests/join/join_tests.cpp
@@ -38,6 +38,8 @@
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 #include <limits>
 
 template <typename T>
@@ -59,7 +61,7 @@ template <std::pair<std::unique_ptr<rmm::device_uvector<cudf::size_type>>,
             cudf::table_view const& left_keys,
             cudf::table_view const& right_keys,
             cudf::null_equality compare_nulls,
-            rmm::mr::device_memory_resource* mr),
+            rmm::device_async_resource_ref mr),
           cudf::out_of_bounds_policy oob_policy = cudf::out_of_bounds_policy::DONT_CHECK>
 std::unique_ptr<cudf::table> join_and_gather(
   cudf::table_view const& left_input,
@@ -67,7 +69,7 @@ std::unique_ptr<cudf::table> join_and_gather(
   std::vector<cudf::size_type> const& left_on,
   std::vector<cudf::size_type> const& right_on,
   cudf::null_equality compare_nulls,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
 {
   auto left_selected  = left_input.select(left_on);
   auto right_selected = right_input.select(right_on);
diff --git a/cpp/tests/join/semi_anti_join_tests.cpp b/cpp/tests/join/semi_anti_join_tests.cpp
index 5cdf5b2a374..61bb3069308 100644
--- a/cpp/tests/join/semi_anti_join_tests.cpp
+++ b/cpp/tests/join/semi_anti_join_tests.cpp
@@ -29,6 +29,8 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 #include <thrust/iterator/transform_iterator.h>
 
 template <typename T>
@@ -50,14 +52,14 @@ template <std::unique_ptr<rmm::device_uvector<cudf::size_type>> (*join_impl)(
   cudf::table_view const& left_keys,
   cudf::table_view const& right_keys,
   cudf::null_equality compare_nulls,
-  rmm::mr::device_memory_resource* mr)>
+  rmm::device_async_resource_ref mr)>
 std::unique_ptr<cudf::table> join_and_gather(
   cudf::table_view const& left_input,
   cudf::table_view const& right_input,
   std::vector<cudf::size_type> const& left_on,
   std::vector<cudf::size_type> const& right_on,
   cudf::null_equality compare_nulls,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
 {
   auto left_selected      = left_input.select(left_on);
   auto right_selected     = right_input.select(right_on);
diff --git a/java/src/main/native/include/maps_column_view.hpp b/java/src/main/native/include/maps_column_view.hpp
index 5ac8d5c5713..7d19615053d 100644
--- a/java/src/main/native/include/maps_column_view.hpp
+++ b/java/src/main/native/include/maps_column_view.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,8 @@
 #include <cudf/lists/lists_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 
@@ -81,9 +83,9 @@ class maps_column_view {
    * @param mr Device memory resource used to allocate the returned column's device memory.
    * @return std::unique_ptr<column> Column of values corresponding the value of the lookup key.
    */
-  std::unique_ptr<column> get_values_for(
-      column_view const &keys, rmm::cuda_stream_view stream = cudf::get_default_stream(),
-      rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) const;
+  std::unique_ptr<column>
+  get_values_for(column_view const &keys, rmm::cuda_stream_view stream = cudf::get_default_stream(),
+                 rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) const;
 
   /**
    * @brief Map lookup by a scalar key.
@@ -99,9 +101,9 @@ class maps_column_view {
    * @param mr Device memory resource used to allocate the returned column's device memory.
    * @return std::unique_ptr<column>
    */
-  std::unique_ptr<column> get_values_for(
-      scalar const &key, rmm::cuda_stream_view stream = cudf::get_default_stream(),
-      rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) const;
+  std::unique_ptr<column>
+  get_values_for(scalar const &key, rmm::cuda_stream_view stream = cudf::get_default_stream(),
+                 rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) const;
 
   /**
    * @brief Check if each map row contains a specified scalar key.
@@ -121,7 +123,7 @@ class maps_column_view {
    */
   std::unique_ptr<column>
   contains(scalar const &key, rmm::cuda_stream_view stream = cudf::get_default_stream(),
-           rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) const;
+           rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) const;
 
   /**
    * @brief Check if each map row contains keys specified by a column
@@ -142,7 +144,7 @@ class maps_column_view {
 
   std::unique_ptr<column>
   contains(column_view const &key, rmm::cuda_stream_view stream = cudf::get_default_stream(),
-           rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()) const;
+           rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) const;
 
 private:
   lists_column_view keys_, values_;
diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp
index 68af350d5fe..8d7ac8890cc 100644
--- a/java/src/main/native/src/RmmJni.cpp
+++ b/java/src/main/native/src/RmmJni.cpp
@@ -22,6 +22,7 @@
 #include <mutex>
 
 #include <cudf/io/memory_resource.hpp>
+#include <rmm/aligned.hpp>
 #include <rmm/mr/device/aligned_resource_adaptor.hpp>
 #include <rmm/mr/device/arena_memory_resource.hpp>
 #include <rmm/mr/device/cuda_async_memory_resource.hpp>
@@ -33,6 +34,7 @@
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/mr/device/pool_memory_resource.hpp>
 #include <rmm/mr/pinned_host_memory_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include "cudf_jni_apis.hpp"
 
@@ -572,9 +574,9 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_allocInternal(JNIEnv *env, jclas
                                                               jlong stream) {
   try {
     cudf::jni::auto_set_device(env);
-    rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource();
+    rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource();
     auto c_stream = rmm::cuda_stream_view(reinterpret_cast<cudaStream_t>(stream));
-    void *ret = mr->allocate(size, c_stream);
+    void *ret = mr.allocate_async(size, rmm::CUDA_ALLOCATION_ALIGNMENT, c_stream);
     return reinterpret_cast<jlong>(ret);
   }
   CATCH_STD(env, 0)
@@ -584,10 +586,10 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_free(JNIEnv *env, jclass clazz, j
                                                     jlong size, jlong stream) {
   try {
     cudf::jni::auto_set_device(env);
-    rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource();
+    rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource();
     void *cptr = reinterpret_cast<void *>(ptr);
     auto c_stream = rmm::cuda_stream_view(reinterpret_cast<cudaStream_t>(stream));
-    mr->deallocate(cptr, size, c_stream);
+    mr.deallocate_async(cptr, size, rmm::CUDA_ALLOCATION_ALIGNMENT, c_stream);
   }
   CATCH_STD(env, )
 }
diff --git a/java/src/main/native/src/maps_column_view.cu b/java/src/main/native/src/maps_column_view.cu
index 1af7689f972..d5600e48a5c 100644
--- a/java/src/main/native/src/maps_column_view.cu
+++ b/java/src/main/native/src/maps_column_view.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
 #include <cudf/scalar/scalar.hpp>
 #include <maps_column_view.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace cudf::jni {
 
@@ -49,7 +50,7 @@ maps_column_view::maps_column_view(lists_column_view const &lists_of_structs,
 template <typename KeyT>
 std::unique_ptr<column> get_values_for_impl(maps_column_view const &maps_view,
                                             KeyT const &lookup_keys, rmm::cuda_stream_view stream,
-                                            rmm::mr::device_memory_resource *mr) {
+                                            rmm::device_async_resource_ref mr) {
   auto const keys_ = maps_view.keys();
   auto const values_ = maps_view.values();
   CUDF_EXPECTS(lookup_keys.type().id() == keys_.child().type().id(),
@@ -65,25 +66,25 @@ std::unique_ptr<column> get_values_for_impl(maps_column_view const &maps_view,
   return lists::detail::extract_list_element(values_, key_indices->view(), stream, mr);
 }
 
-std::unique_ptr<column>
-maps_column_view::get_values_for(column_view const &lookup_keys, rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource *mr) const {
+std::unique_ptr<column> maps_column_view::get_values_for(column_view const &lookup_keys,
+                                                         rmm::cuda_stream_view stream,
+                                                         rmm::device_async_resource_ref mr) const {
   CUDF_EXPECTS(lookup_keys.size() == size(),
                "Lookup keys must have the same size as the map column.");
 
   return get_values_for_impl(*this, lookup_keys, stream, mr);
 }
 
-std::unique_ptr<column>
-maps_column_view::get_values_for(scalar const &lookup_key, rmm::cuda_stream_view stream,
-                                 rmm::mr::device_memory_resource *mr) const {
+std::unique_ptr<column> maps_column_view::get_values_for(scalar const &lookup_key,
+                                                         rmm::cuda_stream_view stream,
+                                                         rmm::device_async_resource_ref mr) const {
   return get_values_for_impl(*this, lookup_key, stream, mr);
 }
 
 template <typename KeyT>
 std::unique_ptr<column> contains_impl(maps_column_view const &maps_view, KeyT const &lookup_keys,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource *mr) {
+                                      rmm::device_async_resource_ref mr) {
   auto const keys = maps_view.keys();
   CUDF_EXPECTS(lookup_keys.type().id() == keys.child().type().id(),
                "Lookup keys must have the same type as the keys of the map column.");
@@ -96,7 +97,7 @@ std::unique_ptr<column> contains_impl(maps_column_view const &maps_view, KeyT co
 
 std::unique_ptr<column> maps_column_view::contains(column_view const &lookup_keys,
                                                    rmm::cuda_stream_view stream,
-                                                   rmm::mr::device_memory_resource *mr) const {
+                                                   rmm::device_async_resource_ref mr) const {
   CUDF_EXPECTS(lookup_keys.size() == size(),
                "Lookup keys must have the same size as the map column.");
 
@@ -105,7 +106,7 @@ std::unique_ptr<column> maps_column_view::contains(column_view const &lookup_key
 
 std::unique_ptr<column> maps_column_view::contains(scalar const &lookup_key,
                                                    rmm::cuda_stream_view stream,
-                                                   rmm::mr::device_memory_resource *mr) const {
+                                                   rmm::device_async_resource_ref mr) const {
   return contains_impl(*this, lookup_key, stream, mr);
 }
 

From 96b6bec7721fa32352bbe47d6618110a8de7d293 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 17 Apr 2024 05:22:44 -1000
Subject: [PATCH 070/842] Enable pandas plotting unit tests for cudf.pandas
 (#15547)

Locally, these tests ran without any crashed workers/hangs

closes #15428

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15547
---
 python/cudf/cudf/pandas/scripts/run-pandas-tests.sh | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
index 07ec5c8bc0c..784d90a40ed 100755
--- a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
+++ b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
@@ -22,8 +22,7 @@ set -euo pipefail
 # of Pandas installed.
 PANDAS_VERSION=$(python -c "import pandas; print(pandas.__version__)")
 
-PYTEST_IGNORES="--ignore=tests/plotting \
---ignore=tests/tslibs/test_parsing.py \
+PYTEST_IGNORES="--ignore=tests/tslibs/test_parsing.py \
 --ignore=tests/io/parser/common/test_read_errors.py"
 
 mkdir -p pandas-testing

From 041eaa4ac31e3f39713225d143e7c4dfb489b33a Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Wed, 17 Apr 2024 12:33:22 -0500
Subject: [PATCH 071/842] Rename JSON_READER_OPTION to
 JSON_READER_OPTION_NVBENCH. (#15553)

This renames a benchmark executable for `JSON_READER_OPTION` to indicate that it is an NVBench executable. This naming pattern is significant for our automated benchmarking tools.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Yunsong Wang (https://github.com/PointKernel)

URL: https://github.com/rapidsai/cudf/pull/15553
---
 cpp/benchmarks/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 571780888c0..2c78a31f0f8 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -331,7 +331,7 @@ ConfigureNVBench(
 ConfigureBench(JSON_BENCH json/json.cu)
 ConfigureNVBench(FST_NVBENCH io/fst.cu)
 ConfigureNVBench(JSON_READER_NVBENCH io/json/nested_json.cpp io/json/json_reader_input.cpp)
-ConfigureNVBench(JSON_READER_OPTION io/json/json_reader_option.cpp)
+ConfigureNVBench(JSON_READER_OPTION_NVBENCH io/json/json_reader_option.cpp)
 ConfigureNVBench(JSON_WRITER_NVBENCH io/json/json_writer.cpp)
 
 # ##################################################################################################

From e928c4a01bfe528839b812aad8b5135029a0fa78 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 17 Apr 2024 09:17:42 -1000
Subject: [PATCH 072/842] Clean up special casing in `as_column` for non-typed
 input (#15276)

Redo at https://github.com/rapidsai/cudf/pull/14636

Clean up special casing for non-typed inputs to essentially do:

```
try:
     arbitrary = pa.array(arbitrary)
except:
     arbitrary = pd.Series(arbitrary)
return as_column(arbitrary)
```

Additionally, this change matches a behavior with pandas that will parse string data with `dtype=datetime64` type similar to the 2.2 behavior (fail if the resolution of the type doesn't match the string data)

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15276
---
 python/cudf/cudf/core/column/column.py  | 246 ++++++++----------------
 python/cudf/cudf/core/index.py          |  10 +-
 python/cudf/cudf/tests/test_binops.py   |  36 ++--
 python/cudf/cudf/tests/test_column.py   |   2 +-
 python/cudf/cudf/tests/test_datetime.py | 138 ++++++++-----
 python/cudf/cudf/tests/test_series.py   |   4 +-
 python/cudf/cudf/utils/docutils.py      |   9 +-
 7 files changed, 215 insertions(+), 230 deletions(-)

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index c8a6493ddda..b5890f7aad4 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -4,7 +4,6 @@
 
 import builtins
 import pickle
-import warnings
 from collections import abc
 from functools import cached_property
 from itertools import chain
@@ -56,7 +55,6 @@
     _is_pandas_nullable_extension_dtype,
     infer_dtype,
     is_bool_dtype,
-    is_datetime64_dtype,
     is_dtype_equal,
     is_integer_dtype,
     is_scalar,
@@ -82,12 +80,13 @@
 from cudf.utils.dtypes import (
     _maybe_convert_to_default_type,
     cudf_dtype_from_pa_type,
+    cudf_dtype_to_pa_type,
     find_common_type,
     get_time_unit,
+    is_column_like,
     is_mixed_with_object_dtype,
     min_scalar_type,
     min_unsigned_type,
-    np_to_pa_dtype,
 )
 from cudf.utils.utils import _array_ufunc, mask_dtype
 
@@ -1923,7 +1922,7 @@ def as_column(
                 # pandas arrays define __arrow_array__ for better
                 # pyarrow.array conversion
                 arbitrary = arbitrary.array
-            data = as_column(
+            return as_column(
                 pa.array(arbitrary, from_pandas=True),
                 nan_as_null=nan_as_null,
                 dtype=dtype,
@@ -1932,7 +1931,7 @@ def as_column(
         elif isinstance(
             arbitrary.dtype, (pd.CategoricalDtype, pd.IntervalDtype)
         ):
-            data = as_column(
+            return as_column(
                 pa.array(arbitrary, from_pandas=True),
                 nan_as_null=nan_as_null,
                 dtype=dtype,
@@ -1956,7 +1955,7 @@ def as_column(
                 arbitrary = np.asarray(arbitrary)
             else:
                 arbitrary = cupy.asarray(arbitrary)
-            data = as_column(
+            return as_column(
                 arbitrary, nan_as_null=nan_as_null, dtype=dtype, length=length
             )
         elif arbitrary.dtype.kind == "O":
@@ -1988,7 +1987,7 @@ def as_column(
                 arbitrary,
                 from_pandas=True,
             )
-            data = as_column(
+            return as_column(
                 pyarrow_array,
                 dtype=dtype,
                 nan_as_null=nan_as_null,
@@ -1999,9 +1998,6 @@ def as_column(
                 f"{type(arbitrary).__name__} with "
                 f"{type(arbitrary.dtype).__name__} is not supported."
             )
-        if dtype is not None:
-            data = data.astype(dtype)
-
     elif is_scalar(arbitrary) and not isinstance(arbitrary, memoryview):
         if length is None:
             length = 1
@@ -2094,6 +2090,13 @@ def as_column(
         return as_column(
             np.asarray(view), dtype=dtype, nan_as_null=nan_as_null
         )
+    elif hasattr(arbitrary, "__array__"):
+        # e.g. test_cuda_array_interface_pytorch
+        try:
+            arbitrary = cupy.asarray(arbitrary)
+        except (ValueError, TypeError):
+            arbitrary = np.asarray(arbitrary)
+        return as_column(arbitrary, dtype=dtype, nan_as_null=nan_as_null)
     # Start of arbitrary that's not handed above but dtype provided
     elif isinstance(dtype, pd.DatetimeTZDtype):
         raise NotImplementedError(
@@ -2126,9 +2129,20 @@ def as_column(
             pd.IntervalDtype,
             cudf.IntervalDtype,
         ),
-    ) or dtype in {"category", "interval", "str", str, np.str_}:
+    ) or dtype in {
+        "category",
+        "interval",
+        "str",
+        str,
+        np.str_,
+        object,
+        np.dtype(object),
+    }:
         if isinstance(dtype, (cudf.CategoricalDtype, cudf.IntervalDtype)):
             dtype = dtype.to_pandas()
+        elif dtype == object:
+            # Unlike pandas, interpret object as "str" instead of "python object"
+            dtype = "str"
         ser = pd.Series(arbitrary, dtype=dtype)
         return as_column(ser, nan_as_null=nan_as_null)
     elif isinstance(dtype, (cudf.StructDtype, cudf.ListDtype)):
@@ -2140,166 +2154,72 @@ def as_column(
                 return cudf.core.column.ListColumn.from_sequences(arbitrary)
             raise
         return as_column(data, nan_as_null=nan_as_null)
-    else:
-        pa_type = None
+    elif not isinstance(arbitrary, (abc.Iterable, abc.Sequence)):
+        # TODO: This validation should probably be done earlier?
+        raise TypeError(
+            f"{type(arbitrary).__name__} must be an iterable or sequence."
+        )
+    from_pandas = nan_as_null is None or nan_as_null
+    if dtype is not None:
+        dtype = cudf.dtype(dtype)
         try:
-            if dtype is not None:
-                if is_datetime64_dtype(dtype):
-                    # Error checking only, actual construction happens
-                    # below.
-                    pa_array = pa.array(arbitrary)
-                    if (
-                        isinstance(pa_array.type, pa.TimestampType)
-                        and pa_array.type.tz is not None
-                    ):
-                        raise NotImplementedError(
-                            "cuDF does not yet support timezone-aware "
-                            "datetimes"
-                        )
-                if is_bool_dtype(dtype):
-                    # Need this special case handling for bool dtypes,
-                    # since 'boolean' & 'pd.BooleanDtype' are not
-                    # understood by np.dtype below.
-                    dtype = "bool"
-                np_dtype = np.dtype(dtype)
-                if np_dtype.kind in {"m", "M"}:
-                    unit = np.datetime_data(np_dtype)[0]
-                    if unit not in {"ns", "us", "ms", "s", "D"}:
-                        raise NotImplementedError(
-                            f"{dtype=} is not supported."
-                        )
-                pa_type = np_to_pa_dtype(np_dtype)
-            else:
-                # By default cudf constructs a 64-bit column. Setting
-                # the `default_*_bitwidth` to 32 will result in a 32-bit
-                # column being created.
-                if (
-                    cudf.get_option("default_integer_bitwidth")
-                    and infer_dtype(arbitrary) == "integer"
-                ):
-                    pa_type = np_to_pa_dtype(
-                        _maybe_convert_to_default_type("int")
-                    )
-                if cudf.get_option("default_float_bitwidth") and infer_dtype(
-                    arbitrary
-                ) in (
-                    "floating",
-                    "mixed-integer-float",
-                ):
-                    pa_type = np_to_pa_dtype(
-                        _maybe_convert_to_default_type("float")
-                    )
-
-            pyarrow_array = pa.array(
+            arbitrary = pa.array(
                 arbitrary,
-                type=pa_type,
-                from_pandas=True if nan_as_null is None else nan_as_null,
+                type=cudf_dtype_to_pa_type(dtype),
+                from_pandas=from_pandas,
             )
-
-            if (
-                isinstance(pyarrow_array, pa.NullArray)
-                and pa_type is None
-                and dtype is None
-                and getattr(arbitrary, "dtype", None) == cudf.dtype("object")
-            ):
-                # pa.array constructor returns a NullArray
-                # for empty arrays, instead of a StringArray.
-                # This issue is only specific to this dtype,
-                # all other dtypes, result in their corresponding
-                # arrow array creation.
-                dtype = cudf.dtype("str")
-                pyarrow_array = pyarrow_array.cast(np_to_pa_dtype(dtype))
-
+        except (pa.ArrowInvalid, pa.ArrowTypeError):
+            if not isinstance(dtype, np.dtype):
+                dtype = dtype.to_pandas()
+            arbitrary = pd.Series(arbitrary, dtype=dtype)
+        return as_column(arbitrary, nan_as_null=nan_as_null, dtype=dtype)
+    else:
+        arbitrary = list(arbitrary)
+        for element in arbitrary:
+            # Carve-outs that cannot be parsed by pyarrow/pandas
+            if is_column_like(element):
+                # e.g. test_nested_series_from_sequence_data
+                return cudf.core.column.ListColumn.from_sequences(arbitrary)
+            elif isinstance(element, cupy.ndarray):
+                # e.g. test_series_from_cupy_scalars
+                return as_column(
+                    cupy.array(arbitrary),
+                    dtype=dtype,
+                    nan_as_null=nan_as_null,
+                    length=length,
+                )
+            elif not any(element is na for na in (None, pd.NA, np.nan)):
+                # Might have NA + element like above, but short-circuit if
+                # an element pyarrow/pandas might be able to parse
+                break
+        try:
+            arbitrary = pa.array(arbitrary, from_pandas=from_pandas)
             if (
                 cudf.get_option("mode.pandas_compatible")
-                and pa.types.is_integer(pyarrow_array.type)
-                and pyarrow_array.null_count
+                and pa.types.is_integer(arbitrary.type)
+                and arbitrary.null_count > 0
             ):
-                pyarrow_array = pyarrow_array.cast("float64").fill_null(np.nan)
-
-            data = as_column(
-                pyarrow_array,
-                dtype=dtype,
-                nan_as_null=nan_as_null,
-            )
-        except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError) as e:
-            if isinstance(e, MixedTypeError):
-                raise TypeError(str(e))
+                arbitrary = arbitrary.cast(pa.float64())
+            if cudf.get_option(
+                "default_integer_bitwidth"
+            ) and pa.types.is_integer(arbitrary.type):
+                dtype = _maybe_convert_to_default_type("int")
+            elif cudf.get_option(
+                "default_float_bitwidth"
+            ) and pa.types.is_floating(arbitrary.type):
+                dtype = _maybe_convert_to_default_type("float")
+        except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError):
+            arbitrary = pd.Series(arbitrary)
+            if cudf.get_option(
+                "default_integer_bitwidth"
+            ) and arbitrary.dtype.kind in set("iu"):
+                dtype = _maybe_convert_to_default_type("int")
             elif (
-                isinstance(arbitrary, Sequence)
-                and len(arbitrary) > 0
-                and any(
-                    cudf.utils.dtypes.is_column_like(arb) for arb in arbitrary
-                )
+                cudf.get_option("default_float_bitwidth")
+                and arbitrary.dtype.kind == "f"
             ):
-                # TODO: I think can be removed; covered by
-                # elif isinstance(dtype, (cudf.StructDtype, cudf.ListDtype)):
-                # above
-                return cudf.core.column.ListColumn.from_sequences(arbitrary)
-            elif isinstance(arbitrary, abc.Iterable) or isinstance(
-                arbitrary, abc.Sequence
-            ):
-                data = as_column(
-                    _construct_array(arbitrary, dtype),
-                    dtype=dtype,
-                    nan_as_null=nan_as_null,
-                )
-            else:
-                raise e
-    return data
-
-
-def _construct_array(
-    arbitrary: Any, dtype: Optional[Dtype]
-) -> Union[np.ndarray, cupy.ndarray, pd.api.extensions.ExtensionArray]:
-    """
-    Construct a CuPy/NumPy/Pandas array from `arbitrary`
-    """
-    try:
-        dtype = dtype if dtype is None else cudf.dtype(dtype)
-        arbitrary = cupy.asarray(arbitrary, dtype=dtype)
-    except (TypeError, ValueError):
-        native_dtype = dtype
-        inferred_dtype = infer_dtype(arbitrary, skipna=False)
-        if (
-            dtype is None
-            and not cudf._lib.scalar._is_null_host_scalar(arbitrary)
-            and inferred_dtype
-            in (
-                "mixed",
-                "mixed-integer",
-            )
-        ):
-            native_dtype = "object"
-        if inferred_dtype == "interval":
-            # Only way to construct an Interval column.
-            return pd.array(arbitrary)
-        elif (
-            inferred_dtype == "string" and getattr(dtype, "kind", None) == "M"
-        ):
-            # We may have date-like strings with timezones
-            try:
-                with warnings.catch_warnings():
-                    # Need to ignore userwarnings when
-                    # datetime format cannot be inferred.
-                    warnings.simplefilter("ignore", UserWarning)
-                    pd_arbitrary = pd.to_datetime(arbitrary)
-                if isinstance(pd_arbitrary.dtype, pd.DatetimeTZDtype):
-                    raise NotImplementedError(
-                        "cuDF does not yet support timezone-aware datetimes"
-                    )
-                return pd_arbitrary.to_numpy()
-            except pd.errors.OutOfBoundsDatetime:
-                # https://github.com/pandas-dev/pandas/issues/55096
-                pass
-
-        arbitrary = np.asarray(
-            arbitrary,
-            dtype=native_dtype
-            if native_dtype is None
-            else np.dtype(native_dtype),
-        )
-    return arbitrary
+                dtype = _maybe_convert_to_default_type("float")
+        return as_column(arbitrary, nan_as_null=nan_as_null, dtype=dtype)
 
 
 def _mask_from_cuda_array_interface_desc(obj) -> Union[Buffer, None]:
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 0a7435bd241..bbe496333cd 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -1714,7 +1714,15 @@ def __init__(
             raise TypeError("dtype must be a datetime type")
 
         name = _setdefault_name(data, name=name)["name"]
-        data = column.as_column(data, dtype=dtype)
+        data = column.as_column(data)
+
+        # TODO: Remove this if statement and fix tests now that
+        # there's timezone support
+        if isinstance(data.dtype, pd.DatetimeTZDtype):
+            raise NotImplementedError(
+                "cuDF does not yet support timezone-aware datetimes"
+            )
+        data = data.astype(dtype)
 
         if copy:
             data = data.copy()
diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py
index 438f3e35ec8..5d0c403daa2 100644
--- a/python/cudf/cudf/tests/test_binops.py
+++ b/python/cudf/cudf/tests/test_binops.py
@@ -1710,12 +1710,17 @@ def test_scalar_null_binops(op, dtype_l, dtype_r):
     ],
 )
 @pytest.mark.parametrize(
-    "dtype",
-    ["datetime64[ns]", "datetime64[us]", "datetime64[ms]", "datetime64[s]"],
+    "dtype, components",
+    [
+        ["datetime64[ns]", "00.012345678"],
+        ["datetime64[us]", "00.012345"],
+        ["datetime64[ms]", "00.012"],
+        ["datetime64[s]", "00"],
+    ],
 )
 @pytest.mark.parametrize("op", [operator.add, operator.sub])
 def test_datetime_dateoffset_binaryop(
-    request, n_periods, frequency, dtype, op
+    request, n_periods, frequency, dtype, components, op
 ):
     request.applymarker(
         pytest.mark.xfail(
@@ -1728,9 +1733,9 @@ def test_datetime_dateoffset_binaryop(
     )
 
     date_col = [
-        "2000-01-01 00:00:00.012345678",
-        "2000-01-31 00:00:00.012345678",
-        "2000-02-29 00:00:00.012345678",
+        f"2000-01-01 00:00:{components}",
+        f"2000-01-31 00:00:{components}",
+        f"2000-02-29 00:00:{components}",
     ]
     gsr = cudf.Series(date_col, dtype=dtype)
     psr = gsr.to_pandas()
@@ -1807,14 +1812,21 @@ def test_datetime_dateoffset_binaryop_multiple(date_col, kwargs, op):
     ],
 )
 @pytest.mark.parametrize(
-    "dtype",
-    ["datetime64[ns]", "datetime64[us]", "datetime64[ms]", "datetime64[s]"],
+    "dtype, components",
+    [
+        ["datetime64[ns]", "00.012345678"],
+        ["datetime64[us]", "00.012345"],
+        ["datetime64[ms]", "00.012"],
+        ["datetime64[s]", "00"],
+    ],
 )
-def test_datetime_dateoffset_binaryop_reflected(n_periods, frequency, dtype):
+def test_datetime_dateoffset_binaryop_reflected(
+    n_periods, frequency, dtype, components
+):
     date_col = [
-        "2000-01-01 00:00:00.012345678",
-        "2000-01-31 00:00:00.012345678",
-        "2000-02-29 00:00:00.012345678",
+        f"2000-01-01 00:00:{components}",
+        f"2000-01-31 00:00:{components}",
+        f"2000-02-29 00:00:{components}",
     ]
     gsr = cudf.Series(date_col, dtype=dtype)
     psr = gsr.to_pandas()  # converts to nanos
diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py
index dace8009041..a8a297c155f 100644
--- a/python/cudf/cudf/tests/test_column.py
+++ b/python/cudf/cudf/tests/test_column.py
@@ -177,7 +177,7 @@ def test_column_series_multi_dim(data):
     ("data", "error"),
     [
         ([1, "1.0", "2", -3], cudf.errors.MixedTypeError),
-        ([np.nan, 0, "null", cp.nan], pa.lib.ArrowInvalid),
+        ([np.nan, 0, "null", cp.nan], cudf.errors.MixedTypeError),
         (
             [np.int32(4), np.float64(1.5), np.float32(1.290994), np.int8(0)],
             None,
diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py
index 37ba7acf044..46a0dcd315d 100644
--- a/python/cudf/cudf/tests/test_datetime.py
+++ b/python/cudf/cudf/tests/test_datetime.py
@@ -2191,9 +2191,8 @@ def test_datetime_index_freq_error(data, dtype, freq):
 
 
 def test_strings_with_utc_offset_not_implemented():
-    with pytest.warns(DeprecationWarning, match="parsing timezone"):  # cupy
-        with pytest.raises(NotImplementedError):
-            DatetimeIndex(["2022-07-22 00:00:00+02:00"])
+    with pytest.raises(NotImplementedError):
+        DatetimeIndex(["2022-07-22 00:00:00+02:00"])
 
 
 @pytest.mark.parametrize("code", ["z", "Z"])
@@ -2227,78 +2226,116 @@ def test_args_not_datetime_typerror(arg):
 
 
 @pytest.mark.parametrize(
-    "data",
+    "data, dtype",
     [
         [
-            "2000-01-01 00:00:00.000000000",
-            "2000-01-01 00:00:00.000000000",
-            "2000-01-01 00:00:00.000000000",
+            [
+                "2000-01-01 00:00:00.000000000",
+                "2000-01-01 00:00:00.000000000",
+                "2000-01-01 00:00:00.000000000",
+            ],
+            "datetime64[s]",
         ],
         [
-            "2000-01-01 00:00:00.000000000",
-            None,
-            "2000-01-01 00:00:00.000000000",
+            [
+                "2000-01-01 00:00:00.000000000",
+                None,
+                "2000-01-01 00:00:00.000000000",
+            ],
+            "datetime64[s]",
         ],
         [
-            "2000-01-01 00:00:00.001000000",
-            "2000-01-01 00:00:00.000000000",
-            "2000-01-01 00:00:00.000000000",
+            [
+                "2000-01-01 00:00:00.001000000",
+                "2000-01-01 00:00:00.000000000",
+                "2000-01-01 00:00:00.000000000",
+            ],
+            "datetime64[us]",
         ],
         [
-            "2000-01-01 00:00:00.010000000",
-            "2000-01-01 00:00:00.020000000",
-            "2000-01-01 00:00:00.030000000",
+            [
+                "2000-01-01 00:00:00.010000000",
+                "2000-01-01 00:00:00.020000000",
+                "2000-01-01 00:00:00.030000000",
+            ],
+            "datetime64[ms]",
         ],
         [
-            "2000-01-01 00:00:00.010000000",
-            "2000-01-01 00:00:00.020000000",
-            None,
+            [
+                "2000-01-01 00:00:00.010000000",
+                "2000-01-01 00:00:00.020000000",
+                None,
+            ],
+            "datetime64[ms]",
         ],
         [
-            "2000-01-01 00:00:00.000001000",
-            "2000-01-01 00:00:00.000000000",
-            "2000-01-01 00:00:00.000004000",
+            [
+                "2000-01-01 00:00:00.000001000",
+                "2000-01-01 00:00:00.000000000",
+                "2000-01-01 00:00:00.000004000",
+            ],
+            "datetime64[us]",
         ],
         [
-            None,
-            "2000-01-01 00:00:00.000000000",
-            "2000-01-01 00:00:00.000004000",
+            [
+                None,
+                "2000-01-01 00:00:00.000000000",
+                "2000-01-01 00:00:00.000004000",
+            ],
+            "datetime64[us]",
         ],
         [
-            "2000-01-01 00:00:00.000000010",
-            "2000-01-01 00:00:00.000000002",
-            "2000-01-01 00:00:00.000000000",
+            [
+                "2000-01-01 00:00:00.000000010",
+                "2000-01-01 00:00:00.000000002",
+                "2000-01-01 00:00:00.000000000",
+            ],
+            "datetime64[ns]",
         ],
         [
-            "2000-01-01 00:00:00.000000010",
-            None,
-            "2000-01-01 00:00:00.000000000",
+            [
+                "2000-01-01 00:00:00.000000010",
+                None,
+                "2000-01-01 00:00:00.000000000",
+            ],
+            "datetime64[ns]",
         ],
         [
-            "2000-01-01 00:00:01.000000000",
-            "2000-01-01 00:00:40.000000000",
-            "2000-01-01 00:00:59.000000000",
+            [
+                "2000-01-01 00:00:01.000000000",
+                "2000-01-01 00:00:40.000000000",
+                "2000-01-01 00:00:59.000000000",
+            ],
+            "datetime64[s]",
         ],
         [
-            "2000-01-01 00:10:00.000000000",
-            "2000-01-01 00:30:40.000000000",
-            "2000-01-01 00:59:00.000000000",
+            [
+                "2000-01-01 00:10:00.000000000",
+                "2000-01-01 00:30:40.000000000",
+                "2000-01-01 00:59:00.000000000",
+            ],
+            "datetime64[s]",
         ],
         [
-            "2000-01-01 07:00:00.000000000",
-            "2000-01-01 08:00:00.000000000",
-            None,
+            [
+                "2000-01-01 07:00:00.000000000",
+                "2000-01-01 08:00:00.000000000",
+                None,
+            ],
+            "datetime64[s]",
         ],
-        [None, None, None],
-        [],
+        [[None, None, None], "datetime64[s]"],
+        [[], "datetime64[s]"],
         [
-            "2000-01-01 00:10:00.123456789",
-            "2000-01-01 00:30:40.123123456",
-            "2000-01-01 00:59:00.675347634",
+            [
+                "2000-01-01 00:10:00.123456789",
+                "2000-01-01 00:30:40.123123456",
+                "2000-01-01 00:59:00.675347634",
+            ],
+            "datetime64[ns]",
         ],
     ],
 )
-@pytest.mark.parametrize("dtype", DATETIME_TYPES)
 def test_datetime_to_str(data, dtype):
     gs = cudf.Series(data, dtype=dtype)
     ps = gs.to_pandas()
@@ -2311,6 +2348,15 @@ def test_datetime_to_str(data, dtype):
     assert_eq(actual.to_pandas(nullable=True), expected)
 
 
+def test_datetime_string_to_datetime_resolution_loss_raises():
+    data = ["2020-01-01 00:00:00.00001"]
+    dtype = "datetime64[s]"
+    with pytest.raises(ValueError):
+        cudf.Series(data, dtype=dtype)
+    with pytest.raises(ValueError):
+        pd.Series(data, dtype=dtype)
+
+
 def test_dateimeindex_from_noniso_string():
     data = ["20160920", "20160925"]
     gdti = cudf.DatetimeIndex(data)
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index b45857e28ad..642dbde3790 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -2218,7 +2218,7 @@ def __getitem__(self, key):
 
 
 def test_series_constructor_error_mixed_type():
-    with pytest.raises(pa.ArrowTypeError):
+    with pytest.raises(MixedTypeError):
         cudf.Series(["abc", np.nan, "123"], nan_as_null=False)
 
 
@@ -2537,7 +2537,7 @@ def test_nan_as_null_from_arrow_objects(klass, data):
 @pytest.mark.parametrize("reso", ["M", "ps"])
 @pytest.mark.parametrize("typ", ["M", "m"])
 def test_series_invalid_reso_dtype(reso, typ):
-    with pytest.raises(NotImplementedError):
+    with pytest.raises(TypeError):
         cudf.Series([], dtype=f"{typ}8[{reso}]")
 
 
diff --git a/python/cudf/cudf/utils/docutils.py b/python/cudf/cudf/utils/docutils.py
index 4136d97d69f..336b92dba4f 100644
--- a/python/cudf/cudf/utils/docutils.py
+++ b/python/cudf/cudf/utils/docutils.py
@@ -210,12 +210,11 @@ def wrapper(func):
 
         Describing a timestamp ``Series``.
 
-        >>> import numpy as np
         >>> s = cudf.Series([
-        ...   np.datetime64("2000-01-01"),
-        ...   np.datetime64("2010-01-01"),
-        ...   np.datetime64("2010-01-01")
-        ... ])
+        ...   "2000-01-01",
+        ...   "2010-01-01",
+        ...   "2010-01-01"
+        ... ], dtype="datetime64[s]")
         >>> s
         0   2000-01-01
         1   2010-01-01

From 9f2fdf84f59d8093d4ec7b91932c6b17a8193fd7 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Wed, 17 Apr 2024 15:25:11 -0500
Subject: [PATCH 073/842] Upgrade upper bound pinning to `pandas-2.2.2`
 (#15554)

This PR upgrades the pandas upper bound pinning to allow installation of newly released `2.2.2` version.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Charles Blackmon-Luca (https://github.com/charlesbluca)
  - Matthew Roeschke (https://github.com/mroeschke)
  - Bradley Dice (https://github.com/bdice)
  - Jake Awe (https://github.com/AyodeAwe)

URL: https://github.com/rapidsai/cudf/pull/15554
---
 conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +-
 conda/environments/all_cuda-122_arch-x86_64.yaml | 2 +-
 conda/recipes/cudf/meta.yaml                     | 2 +-
 dependencies.yaml                                | 2 +-
 python/cudf/cudf/core/_compat.py                 | 2 +-
 python/cudf/pyproject.toml                       | 2 +-
 python/dask_cudf/pyproject.toml                  | 2 +-
 7 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index e629f8b633e..ef971d10f19 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -64,7 +64,7 @@ dependencies:
 - nvcomp==3.0.6
 - nvtx>=0.2.1
 - packaging
-- pandas>=2.0,<2.2.2dev0
+- pandas>=2.0,<2.2.3dev0
 - pandoc
 - pip
 - pre-commit
diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml
index f135a88cac2..688e41ec1ba 100644
--- a/conda/environments/all_cuda-122_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-122_arch-x86_64.yaml
@@ -62,7 +62,7 @@ dependencies:
 - nvcomp==3.0.6
 - nvtx>=0.2.1
 - packaging
-- pandas>=2.0,<2.2.2dev0
+- pandas>=2.0,<2.2.3dev0
 - pandoc
 - pip
 - pre-commit
diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml
index cd9237bd7cb..5512ef11057 100644
--- a/conda/recipes/cudf/meta.yaml
+++ b/conda/recipes/cudf/meta.yaml
@@ -81,7 +81,7 @@ requirements:
     - protobuf >=3.20,<5.0a0
     - python
     - typing_extensions >=4.0.0
-    - pandas >=2.0,<2.2.2dev0
+    - pandas >=2.0,<2.2.3dev0
     - cupy >=12.0.0
     - numba >=0.57
     - {{ pin_compatible('numpy', max_pin='x') }}
diff --git a/dependencies.yaml b/dependencies.yaml
index 8cd4c798c38..147a89076c4 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -514,7 +514,7 @@ dependencies:
         packages:
           - fsspec>=0.6.0
           - numpy>=1.23,<2.0a0
-          - pandas>=2.0,<2.2.2dev0
+          - pandas>=2.0,<2.2.3dev0
   run_cudf:
     common:
       - output_types: [conda, requirements, pyproject]
diff --git a/python/cudf/cudf/core/_compat.py b/python/cudf/cudf/core/_compat.py
index fba3a98e56d..e2bdecbe67a 100644
--- a/python/cudf/cudf/core/_compat.py
+++ b/python/cudf/cudf/core/_compat.py
@@ -3,7 +3,7 @@
 import pandas as pd
 from packaging import version
 
-PANDAS_CURRENT_SUPPORTED_VERSION = version.parse("2.2.1")
+PANDAS_CURRENT_SUPPORTED_VERSION = version.parse("2.2.2")
 PANDAS_VERSION = version.parse(pd.__version__)
 
 
diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml
index 434383bc208..adab199dcf4 100644
--- a/python/cudf/pyproject.toml
+++ b/python/cudf/pyproject.toml
@@ -33,7 +33,7 @@ dependencies = [
     "numpy>=1.23,<2.0a0",
     "nvtx>=0.2.1",
     "packaging",
-    "pandas>=2.0,<2.2.2dev0",
+    "pandas>=2.0,<2.2.3dev0",
     "protobuf>=3.20,<5",
     "ptxcompiler",
     "pyarrow>=14.0.1,<15.0.0a0",
diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml
index d0743516c4d..fcf83e82989 100644
--- a/python/dask_cudf/pyproject.toml
+++ b/python/dask_cudf/pyproject.toml
@@ -22,7 +22,7 @@ dependencies = [
     "cupy-cuda11x>=12.0.0",
     "fsspec>=0.6.0",
     "numpy>=1.23,<2.0a0",
-    "pandas>=2.0,<2.2.2dev0",
+    "pandas>=2.0,<2.2.3dev0",
     "rapids-dask-dependency==24.6.*",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [

From f222b4adc78187539092ad14de9d407451975514 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Wed, 17 Apr 2024 15:44:22 -0700
Subject: [PATCH 074/842] Bind `read_parquet_metadata` API to libcudf instead
 of pyarrow and extract `RowGroup` information (#15398)

The `cudf.io.read_parquet_metadata` is now bound to corresponding libcudf API instead of relying on pyarrow. The libcudf API now also returns high level `RowGroup` metadata to solve #11214. Added additional tests and doc updates as well.

More metadata information such `min, max` values for each column in each row group can also be extracted and returned if needed. Thoughts?

Recommend: Closing #15320 without merging in favor of this PR.

Authors:
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Vukasin Milovanovic (https://github.com/vuule)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/15398
---
 cpp/include/cudf/io/parquet_metadata.hpp      | 41 ++++++++++-
 cpp/src/io/parquet/reader_impl.cpp            |  3 +-
 cpp/src/io/parquet/reader_impl_helpers.cpp    | 20 ++++++
 cpp/src/io/parquet/reader_impl_helpers.hpp    |  8 +++
 .../cudf/_lib/cpp/io/parquet_metadata.pxd     | 32 +++++++++
 python/cudf/cudf/_lib/parquet.pyx             | 69 +++++++++++++++++++
 python/cudf/cudf/io/parquet.py                | 42 +++++++++--
 python/cudf/cudf/tests/test_parquet.py        | 47 +++++++++++--
 python/cudf/cudf/utils/ioutils.py             |  4 +-
 9 files changed, 249 insertions(+), 17 deletions(-)
 create mode 100644 python/cudf/cudf/_lib/cpp/io/parquet_metadata.pxd

diff --git a/cpp/include/cudf/io/parquet_metadata.hpp b/cpp/include/cudf/io/parquet_metadata.hpp
index 3149b5b5945..e0c406c180c 100644
--- a/cpp/include/cudf/io/parquet_metadata.hpp
+++ b/cpp/include/cudf/io/parquet_metadata.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -59,6 +59,13 @@ enum class TypeKind : int8_t {
  */
 struct parquet_column_schema {
  public:
+  /**
+   * @brief Default constructor.
+   *
+   * This has been added since Cython requires a default constructor to create objects on stack.
+   */
+  explicit parquet_column_schema() = default;
+
   /**
    * @brief constructor
    *
@@ -134,6 +141,13 @@ struct parquet_column_schema {
  */
 struct parquet_schema {
  public:
+  /**
+   * @brief Default constructor.
+   *
+   * This has been added since Cython requires a default constructor to create objects on stack.
+   */
+  explicit parquet_schema() = default;
+
   /**
    * @brief constructor
    *
@@ -165,6 +179,15 @@ class parquet_metadata {
  public:
   /// Key-value metadata in the file footer.
   using key_value_metadata = std::unordered_map<std::string, std::string>;
+  /// row group metadata from each RowGroup element.
+  using row_group_metadata = std::unordered_map<std::string, int64_t>;
+
+  /**
+   * @brief Default constructor.
+   *
+   * This has been added since Cython requires a default constructor to create objects on stack.
+   */
+  explicit parquet_metadata() = default;
 
   /**
    * @brief constructor
@@ -173,15 +196,18 @@ class parquet_metadata {
    * @param num_rows number of rows
    * @param num_rowgroups number of row groups
    * @param file_metadata key-value metadata in the file footer
+   * @param rg_metadata vector of maps containing metadata for each row group
    */
   parquet_metadata(parquet_schema schema,
                    int64_t num_rows,
                    size_type num_rowgroups,
-                   key_value_metadata file_metadata)
+                   key_value_metadata file_metadata,
+                   std::vector<row_group_metadata> rg_metadata)
     : _schema{std::move(schema)},
       _num_rows{num_rows},
       _num_rowgroups{num_rowgroups},
-      _file_metadata{std::move(file_metadata)}
+      _file_metadata{std::move(file_metadata)},
+      _rowgroup_metadata{std::move(rg_metadata)}
   {
   }
 
@@ -207,6 +233,7 @@ class parquet_metadata {
    * @return Number of row groups
    */
   [[nodiscard]] auto num_rowgroups() const { return _num_rowgroups; }
+
   /**
    * @brief Returns the Key value metadata in the file footer.
    *
@@ -214,11 +241,19 @@ class parquet_metadata {
    */
   [[nodiscard]] auto const& metadata() const { return _file_metadata; }
 
+  /**
+   * @brief Returns the row group metadata in the file footer.
+   *
+   * @return vector of row group metadata as maps
+   */
+  [[nodiscard]] auto const& rowgroup_metadata() const { return _rowgroup_metadata; }
+
  private:
   parquet_schema _schema;
   int64_t _num_rows;
   size_type _num_rowgroups;
   key_value_metadata _file_metadata;
+  std::vector<row_group_metadata> _rowgroup_metadata;
 };
 
 /**
diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp
index e7409f45e13..a524e7c6dcc 100644
--- a/cpp/src/io/parquet/reader_impl.cpp
+++ b/cpp/src/io/parquet/reader_impl.cpp
@@ -609,7 +609,8 @@ parquet_metadata read_parquet_metadata(host_span<std::unique_ptr<datasource> con
   return parquet_metadata{parquet_schema{walk_schema(&metadata, 0)},
                           metadata.get_num_rows(),
                           metadata.get_num_row_groups(),
-                          metadata.get_key_value_metadata()[0]};
+                          metadata.get_key_value_metadata()[0],
+                          metadata.get_rowgroup_metadata()};
 }
 
 }  // namespace cudf::io::parquet::detail
diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp
index bfc69264ab2..402ccef7a15 100644
--- a/cpp/src/io/parquet/reader_impl_helpers.cpp
+++ b/cpp/src/io/parquet/reader_impl_helpers.cpp
@@ -560,6 +560,26 @@ ColumnChunkMetaData const& aggregate_reader_metadata::get_column_metadata(size_t
   return col->meta_data;
 }
 
+std::vector<std::unordered_map<std::string, int64_t>>
+aggregate_reader_metadata::get_rowgroup_metadata() const
+{
+  std::vector<std::unordered_map<std::string, int64_t>> rg_metadata;
+
+  std::for_each(
+    per_file_metadata.cbegin(), per_file_metadata.cend(), [&rg_metadata](auto const& pfm) {
+      std::transform(pfm.row_groups.cbegin(),
+                     pfm.row_groups.cend(),
+                     std::back_inserter(rg_metadata),
+                     [](auto const& rg) {
+                       std::unordered_map<std::string, int64_t> rg_meta_map;
+                       rg_meta_map["num_rows"]        = rg.num_rows;
+                       rg_meta_map["total_byte_size"] = rg.total_byte_size;
+                       return rg_meta_map;
+                     });
+    });
+  return rg_metadata;
+}
+
 std::string aggregate_reader_metadata::get_pandas_index() const
 {
   // Assumes that all input files have the same metadata
diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp
index 8295654764e..09f65f9c388 100644
--- a/cpp/src/io/parquet/reader_impl_helpers.hpp
+++ b/cpp/src/io/parquet/reader_impl_helpers.hpp
@@ -166,6 +166,13 @@ class aggregate_reader_metadata {
                                                                size_type src_idx,
                                                                int schema_idx) const;
 
+  /**
+   * @brief Extracts high-level metadata for all row groups
+   *
+   * @return List of maps containing metadata information for each row group
+   */
+  [[nodiscard]] std::vector<std::unordered_map<std::string, int64_t>> get_rowgroup_metadata() const;
+
   [[nodiscard]] auto get_num_rows() const { return num_rows; }
 
   [[nodiscard]] auto get_num_row_groups() const { return num_row_groups; }
@@ -178,6 +185,7 @@ class aggregate_reader_metadata {
   [[nodiscard]] auto const& get_key_value_metadata() const& { return keyval_maps; }
 
   [[nodiscard]] auto&& get_key_value_metadata() && { return std::move(keyval_maps); }
+
   /**
    * @brief Gets the concrete nesting depth of output cudf columns
    *
diff --git a/python/cudf/cudf/_lib/cpp/io/parquet_metadata.pxd b/python/cudf/cudf/_lib/cpp/io/parquet_metadata.pxd
new file mode 100644
index 00000000000..e9def2aea5d
--- /dev/null
+++ b/python/cudf/cudf/_lib/cpp/io/parquet_metadata.pxd
@@ -0,0 +1,32 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libc.stdint cimport int64_t
+from libcpp.string cimport string
+from libcpp.unordered_map cimport unordered_map
+from libcpp.vector cimport vector
+
+cimport cudf._lib.cpp.io.types as cudf_io_types
+from cudf._lib.cpp.types cimport size_type
+
+
+cdef extern from "cudf/io/parquet_metadata.hpp" namespace "cudf::io" nogil:
+    cdef cppclass parquet_column_schema:
+        parquet_column_schema() except+
+        string name() except+
+        size_type num_children() except+
+        parquet_column_schema child(int idx) except+
+        vector[parquet_column_schema] children() except+
+
+    cdef cppclass parquet_schema:
+        parquet_schema() except+
+        parquet_column_schema root() except+
+
+    cdef cppclass parquet_metadata:
+        parquet_metadata() except+
+        parquet_schema schema() except+
+        int64_t num_rows() except+
+        size_type num_rowgroups() except+
+        unordered_map[string, string] metadata() except+
+        vector[unordered_map[string, int64_t]] rowgroup_metadata() except+
+
+    cdef parquet_metadata read_parquet_metadata(cudf_io_types.source_info src) except+
diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
index ce1cba59bec..9ce9aad18f7 100644
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ b/python/cudf/cudf/_lib/parquet.pyx
@@ -46,6 +46,10 @@ from cudf._lib.cpp.io.parquet cimport (
     read_parquet as parquet_reader,
     write_parquet as parquet_writer,
 )
+from cudf._lib.cpp.io.parquet_metadata cimport (
+    parquet_metadata,
+    read_parquet_metadata as parquet_metadata_reader,
+)
 from cudf._lib.cpp.io.types cimport column_in_metadata, table_input_metadata
 from cudf._lib.cpp.table.table_view cimport table_view
 from cudf._lib.cpp.types cimport data_type, size_type
@@ -316,6 +320,71 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
         df._data.label_dtype = cudf.dtype(column_index_type)
     return df
 
+cpdef read_parquet_metadata(filepaths_or_buffers):
+    """
+    Cython function to call into libcudf API, see `read_parquet_metadata`.
+
+    See Also
+    --------
+    cudf.io.parquet.read_parquet
+    cudf.io.parquet.to_parquet
+    """
+    # Convert NativeFile buffers to NativeFileDatasource
+    for i, datasource in enumerate(filepaths_or_buffers):
+        if isinstance(datasource, NativeFile):
+            filepaths_or_buffers[i] = NativeFileDatasource(datasource)
+
+    cdef cudf_io_types.source_info source = make_source_info(filepaths_or_buffers)
+
+    args = move(source)
+
+    cdef parquet_metadata c_result
+
+    # Read Parquet metadata
+    with nogil:
+        c_result = move(parquet_metadata_reader(args))
+
+    # access and return results
+    num_rows = c_result.num_rows()
+    num_rowgroups = c_result.num_rowgroups()
+
+    # extract row group metadata and sanitize keys
+    row_group_metadata = [{k.decode(): v for k, v in metadata}
+                          for metadata in c_result.rowgroup_metadata()]
+
+    # read all column names including index column, if any
+    col_names = [info.name().decode() for info in c_result.schema().root().children()]
+
+    # access the Parquet file_footer to find the index
+    index_col = None
+    cdef unordered_map[string, string] file_footer = c_result.metadata()
+
+    # get index column name(s)
+    index_col_names = None
+    json_str = file_footer[b'pandas'].decode('utf-8')
+    meta = None
+    if json_str != "":
+        meta = json.loads(json_str)
+        file_is_range_index, index_col, _ = _parse_metadata(meta)
+        if not file_is_range_index and index_col is not None \
+                and index_col_names is None:
+            index_col_names = {}
+            for idx_col in index_col:
+                for c in meta['columns']:
+                    if c['field_name'] == idx_col:
+                        index_col_names[idx_col] = c['name']
+
+    # remove the index column from the list of column names
+    # only if index_col_names is not None
+    if index_col_names is not None:
+        col_names = [name for name in col_names if name not in index_col_names]
+
+    # num_columns = length of list(col_names)
+    num_columns = len(col_names)
+
+    # return the metadata
+    return num_rows, num_rowgroups, col_names, num_columns, row_group_metadata
+
 
 @acquire_spill_lock()
 def write_parquet(
diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
index e55898de675..e7f1ad0751f 100644
--- a/python/cudf/cudf/io/parquet.py
+++ b/python/cudf/cudf/io/parquet.py
@@ -267,17 +267,45 @@ def write_to_dataset(
 
 @ioutils.doc_read_parquet_metadata()
 @_cudf_nvtx_annotate
-def read_parquet_metadata(path):
+def read_parquet_metadata(filepath_or_buffer):
     """{docstring}"""
-    import pyarrow.parquet as pq
+    # Multiple sources are passed as a list. If a single source is passed,
+    # wrap it in a list for unified processing downstream.
+    if not is_list_like(filepath_or_buffer):
+        filepath_or_buffer = [filepath_or_buffer]
 
-    pq_file = pq.ParquetFile(path)
+    # Start by trying to construct a filesystem object
+    fs, paths = ioutils._get_filesystem_and_paths(
+        path_or_data=filepath_or_buffer, storage_options=None
+    )
 
-    num_rows = pq_file.metadata.num_rows
-    num_row_groups = pq_file.num_row_groups
-    col_names = pq_file.schema.names
+    # Check if filepath or buffer
+    filepath_or_buffer = paths if paths else filepath_or_buffer
+
+    # List of filepaths or buffers
+    filepaths_or_buffers = []
+
+    for source in filepath_or_buffer:
+        tmp_source, compression = ioutils.get_reader_filepath_or_buffer(
+            path_or_data=source,
+            compression=None,
+            fs=fs,
+            use_python_file_object=True,
+            open_file_options=None,
+            storage_options=None,
+            bytes_per_thread=None,
+        )
+
+        if compression is not None:
+            raise ValueError(
+                "URL content-encoding decompression is not supported"
+            )
+        if isinstance(tmp_source, list):
+            filepath_or_buffer.extend(tmp_source)
+        else:
+            filepaths_or_buffers.append(tmp_source)
 
-    return num_rows, num_row_groups, col_names
+    return libparquet.read_parquet_metadata(filepaths_or_buffers)
 
 
 @_cudf_nvtx_annotate
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index 9ba71b28637..56a4281aad9 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -415,8 +415,15 @@ def num_row_groups(rows, group_size):
     row_group_size = 5
     pdf.to_parquet(fname, compression="snappy", row_group_size=row_group_size)
 
-    num_rows, row_groups, col_names = cudf.io.read_parquet_metadata(fname)
+    (
+        num_rows,
+        row_groups,
+        col_names,
+        num_columns,
+        _,  # rowgroup_metadata
+    ) = cudf.io.read_parquet_metadata(fname)
 
+    assert num_columns == len(pdf.columns)
     assert num_rows == len(pdf.index)
     assert row_groups == num_row_groups(num_rows, row_group_size)
     for a, b in zip(col_names, pdf.columns):
@@ -561,7 +568,9 @@ def test_parquet_read_row_groups(tmpdir, pdf, row_group_size):
     fname = tmpdir.join("row_group.parquet")
     pdf.to_parquet(fname, compression="gzip", row_group_size=row_group_size)
 
-    num_rows, row_groups, col_names = cudf.io.read_parquet_metadata(fname)
+    num_rows, row_groups, col_names, _, _ = cudf.io.read_parquet_metadata(
+        fname
+    )
 
     gdf = [cudf.read_parquet(fname, row_groups=[i]) for i in range(row_groups)]
     gdf = cudf.concat(gdf)
@@ -586,7 +595,9 @@ def test_parquet_read_row_groups_non_contiguous(tmpdir, pdf, row_group_size):
     fname = tmpdir.join("row_group.parquet")
     pdf.to_parquet(fname, compression="gzip", row_group_size=row_group_size)
 
-    num_rows, row_groups, col_names = cudf.io.read_parquet_metadata(fname)
+    num_rows, row_groups, col_names, _, _ = cudf.io.read_parquet_metadata(
+        fname
+    )
 
     # alternate rows between the two sources
     gdf = cudf.read_parquet(
@@ -1803,7 +1814,9 @@ def test_parquet_writer_row_group_size(tmpdir, row_group_size_kwargs):
         writer.write_table(gdf)
 
     # Simple check for multiple row-groups
-    nrows, nrow_groups, columns = cudf.io.parquet.read_parquet_metadata(fname)
+    nrows, nrow_groups, columns, _, _ = cudf.io.parquet.read_parquet_metadata(
+        fname
+    )
     assert nrows == size
     assert nrow_groups > 1
     assert columns == ["a", "b"]
@@ -2853,7 +2866,9 @@ def test_to_parquet_row_group_size(
         fname, row_group_size_bytes=size_bytes, row_group_size_rows=size_rows
     )
 
-    num_rows, row_groups, col_names = cudf.io.read_parquet_metadata(fname)
+    num_rows, row_groups, col_names, _, _ = cudf.io.read_parquet_metadata(
+        fname
+    )
     # 8 bytes per row, as the column is int64
     expected_num_rows = max(
         math.ceil(num_rows / size_rows), math.ceil(8 * num_rows / size_bytes)
@@ -2861,6 +2876,28 @@ def test_to_parquet_row_group_size(
     assert expected_num_rows == row_groups
 
 
+@pytest.mark.parametrize("size_rows", [500_000, 100_000, 10_000])
+def test_parquet_row_group_metadata(tmpdir, large_int64_gdf, size_rows):
+    fname = tmpdir.join("row_group_size.parquet")
+    large_int64_gdf.to_parquet(fname, row_group_size_rows=size_rows)
+
+    # read file metadata from parquet
+    (
+        num_rows,
+        row_groups,
+        _,  # col_names
+        _,  # num_columns
+        row_group_metadata,
+    ) = cudf.io.read_parquet_metadata(fname)
+
+    # length(RowGroupsMetaData) == number of row groups
+    assert len(row_group_metadata) == row_groups
+    # sum of rows in row groups == total rows
+    assert num_rows == sum(
+        [row_group["num_rows"] for row_group in row_group_metadata]
+    )
+
+
 def test_parquet_reader_decimal_columns():
     df = cudf.DataFrame(
         {
diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
index 8c58f2b859e..66e14f4b9de 100644
--- a/python/cudf/cudf/utils/ioutils.py
+++ b/python/cudf/cudf/utils/ioutils.py
@@ -101,11 +101,13 @@
 Total number of rows
 Number of row groups
 List of column names
+Number of columns
+List of metadata of row groups
 
 Examples
 --------
 >>> import cudf
->>> num_rows, num_row_groups, names = cudf.io.read_parquet_metadata(filename)
+>>> num_rows, num_row_groups, names, num_columns, row_group_metadata = cudf.io.read_parquet_metadata(filename)
 >>> df = [cudf.read_parquet(fname, row_group=i) for i in range(row_groups)]
 >>> df = cudf.concat(df)
 >>> df

From eaae68d8b099e90a2e3bcc968f98c652d36bb844 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Wed, 17 Apr 2024 18:33:41 -0500
Subject: [PATCH 075/842] Deprecate legacy JSON reader options. (#15558)

This PR deprecates the option for using the legacy JSON reader, so it can be removed in the next RAPIDS release.

This work follows up on a task from https://github.com/rapidsai/cudf/issues/15537

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: https://github.com/rapidsai/cudf/pull/15558
---
 cpp/include/cudf/io/json.hpp | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
index d8330b78f0e..a6112b8db4c 100644
--- a/cpp/include/cudf/io/json.hpp
+++ b/cpp/include/cudf/io/json.hpp
@@ -251,9 +251,11 @@ class json_reader_options {
   /**
    * @brief Whether the legacy reader should be used.
    *
+   * @deprecated Since 24.06
+   *
    * @returns true if the legacy reader will be used, false otherwise
    */
-  bool is_enabled_legacy() const { return _legacy; }
+  [[deprecated]] bool is_enabled_legacy() const { return _legacy; }
 
   /**
    * @brief Whether the reader should keep quotes of string values.
@@ -350,9 +352,11 @@ class json_reader_options {
   /**
    * @brief Set whether to use the legacy reader.
    *
+   * @deprecated Since 24.06
+   *
    * @param val Boolean value to enable/disable the legacy reader
    */
-  void enable_legacy(bool val) { _legacy = val; }
+  [[deprecated]] void enable_legacy(bool val) { _legacy = val; }
 
   /**
    * @brief Set whether the reader should keep quotes of string values.
@@ -519,10 +523,12 @@ class json_reader_options_builder {
   /**
    * @brief Set whether to use the legacy reader.
    *
+   * @deprecated Since 24.06
+   *
    * @param val Boolean value to enable/disable legacy parsing
    * @return this for chaining
    */
-  json_reader_options_builder& legacy(bool val)
+  [[deprecated]] json_reader_options_builder& legacy(bool val)
   {
     options._legacy = val;
     return *this;

From 0935d389192824ac1c9ea3e79df01db3b33feaef Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 18 Apr 2024 05:05:10 -1000
Subject: [PATCH 076/842] Fix millisecond resampling in cudf Python (#15560)

closes #15551

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - https://github.com/brandon-b-miller
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15560
---
 python/cudf/cudf/core/resample.py         | 44 +++++++++++------------
 python/cudf/cudf/core/tools/datetimes.py  | 15 --------
 python/cudf/cudf/tests/test_resampling.py | 14 ++++++++
 3 files changed, 35 insertions(+), 38 deletions(-)

diff --git a/python/cudf/cudf/core/resample.py b/python/cudf/cudf/core/resample.py
index 1a79b122561..cdd4ec6f8e5 100644
--- a/python/cudf/cudf/core/resample.py
+++ b/python/cudf/cudf/core/resample.py
@@ -30,7 +30,6 @@
     SeriesGroupBy,
     _Grouping,
 )
-from cudf.core.tools.datetimes import _offset_alias_to_code, _unit_dtype_map
 
 
 class _Resampler(GroupBy):
@@ -247,47 +246,46 @@ def _handle_frequency_grouper(self, by):
         # column to have the same dtype, so we compute a `result_type`
         # and cast them both to that type.
         try:
-            result_type = np.dtype(
-                _unit_dtype_map[_offset_alias_to_code[offset.name]]
-            )
-        except KeyError:
+            result_type = np.dtype(f"datetime64[{offset.rule_code}]")
+            # TODO: Ideally, we can avoid one cast by having `date_range`
+            # generate timestamps of a given dtype.  Currently, it can
+            # only generate timestamps with 'ns' precision
+            cast_key_column = key_column.astype(result_type)
+            cast_bin_labels = bin_labels.astype(result_type)
+        except TypeError:
             # unsupported resolution (we don't support resolutions >s)
             # fall back to using datetime64[s]
             result_type = np.dtype("datetime64[s]")
-
-        # TODO: Ideally, we can avoid one cast by having `date_range`
-        # generate timestamps of a given dtype.  Currently, it can
-        # only generate timestamps with 'ns' precision
-        key_column = key_column.astype(result_type)
-        bin_labels = bin_labels.astype(result_type)
+            cast_key_column = key_column.astype(result_type)
+            cast_bin_labels = bin_labels.astype(result_type)
 
         # bin the key column:
         bin_numbers = cudf._lib.labeling.label_bins(
-            key_column,
-            left_edges=bin_labels[:-1]._column,
+            cast_key_column,
+            left_edges=cast_bin_labels[:-1]._column,
             left_inclusive=(closed == "left"),
-            right_edges=bin_labels[1:]._column,
+            right_edges=cast_bin_labels[1:]._column,
             right_inclusive=(closed == "right"),
         )
 
         if label == "right":
-            bin_labels = bin_labels[1:]
+            cast_bin_labels = cast_bin_labels[1:]
         else:
-            bin_labels = bin_labels[:-1]
+            cast_bin_labels = cast_bin_labels[:-1]
 
         # if we have more labels than bins, remove the extras labels:
         nbins = bin_numbers.max() + 1
-        if len(bin_labels) > nbins:
-            bin_labels = bin_labels[:nbins]
+        if len(cast_bin_labels) > nbins:
+            cast_bin_labels = cast_bin_labels[:nbins]
 
-        bin_labels.name = self.names[0]
-        self.bin_labels = bin_labels
+        cast_bin_labels.name = self.names[0]
+        self.bin_labels = cast_bin_labels
 
         # replace self._key_columns with the binned key column:
         self._key_columns = [
-            bin_labels._gather(bin_numbers, check_bounds=False)._column.astype(
-                result_type
-            )
+            cast_bin_labels._gather(
+                bin_numbers, check_bounds=False
+            )._column.astype(result_type)
         ]
 
 
diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py
index ed8fca88acd..907f3b586d1 100644
--- a/python/cudf/cudf/core/tools/datetimes.py
+++ b/python/cudf/cudf/core/tools/datetimes.py
@@ -55,21 +55,6 @@
     "D": "datetime64[s]",
 }
 
-_offset_alias_to_code = {
-    "W": "W",
-    "D": "D",
-    "H": "h",
-    "h": "h",
-    "T": "m",
-    "min": "m",
-    "s": "s",
-    "S": "s",
-    "U": "us",
-    "us": "us",
-    "N": "ns",
-    "ns": "ns",
-}
-
 
 def to_datetime(
     arg,
diff --git a/python/cudf/cudf/tests/test_resampling.py b/python/cudf/cudf/tests/test_resampling.py
index ad6e0ac52c5..d7a3fea1273 100644
--- a/python/cudf/cudf/tests/test_resampling.py
+++ b/python/cudf/cudf/tests/test_resampling.py
@@ -162,3 +162,17 @@ def test_resampling_frequency_conversion(in_freq, sampling_freq, out_freq):
     assert_resample_results_equal(expect, got)
 
     assert got.index.dtype == np.dtype(f"datetime64[{out_freq}]")
+
+
+def test_resampling_downsampling_ms():
+    pdf = pd.DataFrame(
+        {
+            "time": pd.date_range("2020-01-01", periods=5, freq="1ns"),
+            "sign": range(5),
+        }
+    )
+    gdf = cudf.from_pandas(pdf)
+    expected = pdf.resample("10ms", on="time").mean()
+    result = gdf.resample("10ms", on="time").mean()
+    result.index = result.index.astype("datetime64[ns]")
+    assert_eq(result, expected, check_freq=False)

From ae9e552697c2b13f0fc8161e088f2abeb83fbf36 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Thu, 18 Apr 2024 10:22:07 -0500
Subject: [PATCH 077/842] Use same .clang-format in cuDF JNI (#15557)

Closes #15546.

Currently cuDF JNI uses its own `.clang-format` settings. These settings organize includes differently than the rest of the cuDF C++ codebase, so we would like to align them.

This PR removes the JNI's custom `.clang-format` and instead uses the same settings as the rest of cuDF.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Mark Harris (https://github.com/harrism)

URL: https://github.com/rapidsai/cudf/pull/15557
---
 java/src/main/native/.clang-format            |  204 -
 java/src/main/native/clang-format.README      |   13 -
 java/src/main/native/include/jni_utils.hpp    |  783 ++--
 .../main/native/include/maps_column_view.hpp  |   53 +-
 .../native/src/Aggregation128UtilsJni.cpp     |   14 +-
 java/src/main/native/src/AggregationJni.cpp   |  161 +-
 java/src/main/native/src/ChunkedPackJni.cpp   |   44 +-
 java/src/main/native/src/ChunkedReaderJni.cpp |  105 +-
 java/src/main/native/src/ColumnVectorJni.cpp  |  279 +-
 java/src/main/native/src/ColumnViewJni.cpp    | 1812 +++++----
 java/src/main/native/src/ColumnViewJni.cu     |  214 +-
 java/src/main/native/src/ColumnViewJni.hpp    |   22 +-
 .../main/native/src/CompiledExpression.cpp    |  217 +-
 .../main/native/src/ContiguousTableJni.cpp    |  100 +-
 java/src/main/native/src/CuFileJni.cpp        |  199 +-
 java/src/main/native/src/CudaJni.cpp          |  210 +-
 java/src/main/native/src/CudfJni.cpp          |  101 +-
 .../main/native/src/DataSourceHelperJni.cpp   |  185 +-
 java/src/main/native/src/HashJoinJni.cpp      |   24 +-
 .../src/HostMemoryBufferNativeUtilsJni.cpp    |   41 +-
 java/src/main/native/src/NvcompJni.cpp        |  183 +-
 java/src/main/native/src/NvtxRangeJni.cpp     |   16 +-
 .../main/native/src/NvtxUniqueRangeJni.cpp    |   21 +-
 .../native/src/PackedColumnMetadataJni.cpp    |   19 +-
 java/src/main/native/src/RmmJni.cpp           |  610 +--
 java/src/main/native/src/ScalarJni.cpp        |  353 +-
 java/src/main/native/src/TableJni.cpp         | 3357 ++++++++++-------
 .../main/native/src/aggregation128_utils.cu   |  113 +-
 .../main/native/src/aggregation128_utils.hpp  |   24 +-
 .../native/src/check_nvcomp_output_sizes.cu   |   27 +-
 .../native/src/check_nvcomp_output_sizes.hpp  |   13 +-
 .../main/native/src/csv_chunked_writer.hpp    |   28 +-
 java/src/main/native/src/cudf_jni_apis.hpp    |   61 +-
 java/src/main/native/src/dtype_utils.hpp      |   19 +-
 .../src/main/native/src/jni_compiled_expr.hpp |   35 +-
 .../main/native/src/jni_writer_data_sink.hpp  |  131 +-
 java/src/main/native/src/maps_column_view.cu  |   72 +-
 java/src/main/native/src/nvtx_common.hpp      |    8 +-
 38 files changed, 5533 insertions(+), 4338 deletions(-)
 delete mode 100644 java/src/main/native/.clang-format
 delete mode 100644 java/src/main/native/clang-format.README

diff --git a/java/src/main/native/.clang-format b/java/src/main/native/.clang-format
deleted file mode 100644
index e0866533a36..00000000000
--- a/java/src/main/native/.clang-format
+++ /dev/null
@@ -1,204 +0,0 @@
----
-# Reference: https://clang.llvm.org/docs/ClangFormatStyleOptions.html
-Language:        Cpp
-# BasedOnStyle:  LLVM
-# no indentation (-2 from indent, which is 2)
-AccessModifierOffset: -2
-AlignAfterOpenBracket: Align
-# int aaaa = 12;
-# int b    = 23;
-# int ccc  = 23;
-# leaving OFF
-AlignConsecutiveAssignments: false
-# int         aaaa = 12;
-# float       b = 23;
-# std::string ccc = 23;
-# leaving OFF
-AlignConsecutiveDeclarations: false
-##define A                                                                      \
-#  int aaaa;                                                                    \
-#  int b;                                                                       \
-#  int dddddddddd;
-# leaving ON
-AlignEscapedNewlines: Right
-# int aaa = bbbbbbbbbbbbbbb +
-#           ccccccccccccccc;
-# leaving ON
-AlignOperands:   true
-# true:                                   false:
-# int a;     // My comment a      vs.     int a; // My comment a
-# int b = 2; // comment  b                int b = 2; // comment about b
-# leaving ON
-AlignTrailingComments: true
-# squeezes a long declaration's arguments to the next line:
-#true:
-#void myFunction(
-#	int a, int b, int c, int d, int e);
-#
-#false:
-#void myFunction(int a,
-#				int b,
-#				int c,
-#				int d,
-#				int e);
-# leaving ON
-AllowAllParametersOfDeclarationOnNextLine: true
-# changed to ON, as we use short blocks on same lines
-AllowShortBlocksOnASingleLine: true
-# set this to ON, we use this in a few places
-AllowShortCaseLabelsOnASingleLine: true
-# set this to ON
-AllowShortFunctionsOnASingleLine: Inline
-AllowShortIfStatementsOnASingleLine: false
-AllowShortLoopsOnASingleLine: false
-# Deprecated option.
-# PenaltyReturnTypeOnItsOwnLine applies, as we set this to None,
-# where it tries to break after the return type automatically
-AlwaysBreakAfterDefinitionReturnType: None
-AlwaysBreakAfterReturnType: None
-AlwaysBreakBeforeMultilineStrings: false
-AlwaysBreakTemplateDeclarations: MultiLine
-
-# if all the arguments for a function don't fit in a single line,
-# with a value of "false", it'll split each argument into different lines
-BinPackArguments: true
-BinPackParameters: true
-
-# if this is set to Custom, the BraceWrapping flags apply
-BreakBeforeBraces: Custom
-BraceWrapping:
-  AfterClass:      false
-  AfterControlStatement: false
-  AfterEnum:       false
-  AfterFunction:   false
-  AfterNamespace:  false
-  AfterObjCDeclaration: false
-  AfterStruct:     false
-  AfterUnion:      false
-  AfterExternBlock: false
-  BeforeCatch:     false
-  BeforeElse:      false
-  IndentBraces:    false
-  SplitEmptyFunction: false
-  SplitEmptyRecord: false
-  SplitEmptyNamespace: false
-
-# will break after operators when a line is too long
-BreakBeforeBinaryOperators: None
-# not in docs.. so that's nice
-BreakBeforeInheritanceComma: false
-# This will break inheritance list and align on colon,
-# it also places each inherited class in a different line.
-# Leaving ON
-BreakInheritanceList: BeforeColon
-
-#
-#true:
-#veryVeryVeryVeryVeryVeryVeryVeryVeryVeryVeryLongDescription
-#	? firstValue
-#	: SecondValueVeryVeryVeryVeryLong;
-#
-#false:
-#veryVeryVeryVeryVeryVeryVeryVeryVeryVeryVeryLongDescription ?
-#	firstValue :
-#	SecondValueVeryVeryVeryVeryLong;
-BreakBeforeTernaryOperators: false
-
-BreakConstructorInitializersBeforeComma: false
-BreakConstructorInitializers: BeforeColon
-BreakAfterJavaFieldAnnotations: true
-BreakStringLiterals: true
-# So the line lengths in cudf are not following a limit, at the moment.
-# Usually it's a long comment that makes the line length inconsistent.
-# Command I used to find max line lengths (from cpp directory):
-#   find include src tests|grep "\." |xargs -I{} bash -c "awk '{print length}' {} | sort -rn | head -1"|sort -n
-# I picked 100, as it seemed somewhere around median
-ColumnLimit:     100
-# TODO: not aware of any of these at this time
-CommentPragmas:  '^ IWYU pragma:'
-# So it doesn't put subsequent namespaces in the same line
-CompactNamespaces: false
-ConstructorInitializerAllOnOneLineOrOnePerLine: false
-ConstructorInitializerIndentWidth: 4
-ContinuationIndentWidth: 4
-# TODO: adds spaces around the element list
-# in initializer: vector<T> x{ {}, ..., {} }
-Cpp11BracedListStyle: true
-DerivePointerAlignment: false
-DisableFormat:   false
-ExperimentalAutoDetectBinPacking: false
-# } // namespace a => useful
-FixNamespaceComments: true
-ForEachMacros:
-  - foreach
-  - Q_FOREACH
-  - BOOST_FOREACH
-IncludeBlocks:   Regroup
-IncludeCategories:
-  - Regex:           '<[[:alnum:]]+>'
-    Priority:        0
-  - Regex:           '<[[:alnum:].]+>'
-    Priority:        1
-  - Regex:           '<.*>'
-    Priority:        2
-  - Regex:           '.*/.*'
-    Priority:        3
-  - Regex:           '.*'
-    Priority:        4
-# if a header matches this in an include group, it will be moved up to the
-# top of the group.
-IncludeIsMainRegex: '(Test)?$'
-IndentCaseLabels: true
-IndentPPDirectives: None
-IndentWidth:     2
-IndentWrappedFunctionNames: false
-JavaScriptQuotes: Leave
-JavaScriptWrapImports: true
-KeepEmptyLinesAtTheStartOfBlocks: true
-MacroBlockBegin: ''
-MacroBlockEnd:   ''
-MaxEmptyLinesToKeep: 1
-NamespaceIndentation: None
-ObjCBinPackProtocolList: Auto
-ObjCBlockIndentWidth: 2
-ObjCSpaceAfterProperty: false
-ObjCSpaceBeforeProtocolList: true
-
-# Penalties: leaving unchanged for now
-# https://stackoverflow.com/questions/26635370/in-clang-format-what-do-the-penalties-do
-PenaltyBreakAssignment: 2
-PenaltyBreakBeforeFirstCallParameter: 19
-PenaltyBreakComment: 300
-PenaltyBreakFirstLessLess: 120
-PenaltyBreakString: 1000
-PenaltyBreakTemplateDeclaration: 10
-PenaltyExcessCharacter: 1000000
-# As currently set, we don't see return types being
-# left on their own line, leaving at 60
-PenaltyReturnTypeOnItsOwnLine: 60
-
-# char* foo vs char *foo, picking Right aligned
-PointerAlignment: Right
-ReflowComments:  true
-# leaving ON, but this could be something to turn OFF
-SortIncludes:    true
-SortUsingDeclarations: true
-SpaceAfterCStyleCast: false
-SpaceAfterTemplateKeyword: true
-SpaceBeforeAssignmentOperators: true
-SpaceBeforeCpp11BracedList: false
-SpaceBeforeCtorInitializerColon: true
-SpaceBeforeInheritanceColon: true
-SpaceBeforeParens: ControlStatements
-SpaceBeforeRangeBasedForLoopColon: true
-SpaceInEmptyParentheses: false
-SpacesBeforeTrailingComments: 1
-SpacesInAngles:  false
-SpacesInContainerLiterals: true
-SpacesInCStyleCastParentheses: false
-SpacesInParentheses: false
-SpacesInSquareBrackets: false
-Standard:        Cpp11
-TabWidth:        8
-UseTab:          Never
-...
diff --git a/java/src/main/native/clang-format.README b/java/src/main/native/clang-format.README
deleted file mode 100644
index 6c13289720a..00000000000
--- a/java/src/main/native/clang-format.README
+++ /dev/null
@@ -1,13 +0,0 @@
-README
-======
-
-To apply code formatting to a file you are working on, currently you can do this manually using
-clang-format-7:
-
-This will edit the file, and print to stdout:
-
-  clang-format [file]
-
-This will edit the file in place, do this if you are sure of what you are doing:
-
-  clang-format -i [file]
diff --git a/java/src/main/native/include/jni_utils.hpp b/java/src/main/native/include/jni_utils.hpp
index f342fca8933..96ad1f23b8c 100644
--- a/java/src/main/native/include/jni_utils.hpp
+++ b/java/src/main/native/include/jni_utils.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,48 +15,48 @@
  */
 #pragma once
 
-#include <algorithm>
-#include <memory>
-#include <vector>
+#include <cudf/utilities/error.hpp>
+
+#include <rmm/detail/error.hpp>
 
 #include <jni.h>
 
-#include <cudf/utilities/error.hpp>
-#include <rmm/detail/error.hpp>
+#include <algorithm>
+#include <memory>
+#include <vector>
 
 namespace cudf {
 namespace jni {
 
 constexpr jint MINIMUM_JNI_VERSION = JNI_VERSION_1_6;
 
-constexpr char const *CUDA_ERROR_CLASS = "ai/rapids/cudf/CudaException";
-constexpr char const *CUDA_FATAL_ERROR_CLASS = "ai/rapids/cudf/CudaFatalException";
-constexpr char const *CUDF_ERROR_CLASS = "ai/rapids/cudf/CudfException";
-constexpr char const *CUDF_OVERFLOW_ERROR_CLASS = "ai/rapids/cudf/CudfColumnSizeOverflowException";
-constexpr char const *CUDF_DTYPE_ERROR_CLASS = "ai/rapids/cudf/CudfException";
-constexpr char const *INDEX_OOB_CLASS = "java/lang/ArrayIndexOutOfBoundsException";
-constexpr char const *ILLEGAL_ARG_CLASS = "java/lang/IllegalArgumentException";
-constexpr char const *NPE_CLASS = "java/lang/NullPointerException";
-constexpr char const *OOM_CLASS = "java/lang/OutOfMemoryError";
+constexpr char const* CUDA_ERROR_CLASS          = "ai/rapids/cudf/CudaException";
+constexpr char const* CUDA_FATAL_ERROR_CLASS    = "ai/rapids/cudf/CudaFatalException";
+constexpr char const* CUDF_ERROR_CLASS          = "ai/rapids/cudf/CudfException";
+constexpr char const* CUDF_OVERFLOW_ERROR_CLASS = "ai/rapids/cudf/CudfColumnSizeOverflowException";
+constexpr char const* CUDF_DTYPE_ERROR_CLASS    = "ai/rapids/cudf/CudfException";
+constexpr char const* INDEX_OOB_CLASS           = "java/lang/ArrayIndexOutOfBoundsException";
+constexpr char const* ILLEGAL_ARG_CLASS         = "java/lang/IllegalArgumentException";
+constexpr char const* NPE_CLASS                 = "java/lang/NullPointerException";
+constexpr char const* OOM_CLASS                 = "java/lang/OutOfMemoryError";
 
 /**
  * @brief indicates that a JNI error of some kind was thrown and the main
  * function should return.
  */
 class jni_exception : public std::runtime_error {
-public:
-  jni_exception(char const *const message) : std::runtime_error(message) {}
-  jni_exception(std::string const &message) : std::runtime_error(message) {}
+ public:
+  jni_exception(char const* const message) : std::runtime_error(message) {}
+  jni_exception(std::string const& message) : std::runtime_error(message) {}
 };
 
 /**
  * @brief throw a java exception and a C++ one for flow control.
  */
-inline void throw_java_exception(JNIEnv *const env, const char *class_name, const char *message) {
+inline void throw_java_exception(JNIEnv* const env, const char* class_name, const char* message)
+{
   jclass ex_class = env->FindClass(class_name);
-  if (ex_class != NULL) {
-    env->ThrowNew(ex_class, message);
-  }
+  if (ex_class != NULL) { env->ThrowNew(ex_class, message); }
   throw jni_exception(message);
 }
 
@@ -64,7 +64,8 @@ inline void throw_java_exception(JNIEnv *const env, const char *class_name, cons
  * @brief check if an java exceptions have been thrown and if so throw a C++
  * exception so the flow control stop processing.
  */
-inline void check_java_exception(JNIEnv *const env) {
+inline void check_java_exception(JNIEnv* const env)
+{
   if (env->ExceptionCheck()) {
     // Not going to try to get the message out of the Exception, too complex and
     // might fail.
@@ -78,7 +79,9 @@ inline void check_java_exception(JNIEnv *const env) {
  * This is useful when, for instance, converting a cudf::column pointer
  * to a jlong, for use in JNI.
  */
-template <typename T> jlong ptr_as_jlong(T *ptr) {
+template <typename T>
+jlong ptr_as_jlong(T* ptr)
+{
   return reinterpret_cast<jlong>(ptr);
 }
 
@@ -86,7 +89,9 @@ template <typename T> jlong ptr_as_jlong(T *ptr) {
  * @brief Helper to release the data held by a unique_ptr, and return
  * the pointer as a jlong.
  */
-template <typename T> jlong release_as_jlong(std::unique_ptr<T> &&ptr) {
+template <typename T>
+jlong release_as_jlong(std::unique_ptr<T>&& ptr)
+{
   return ptr_as_jlong(ptr.release());
 }
 
@@ -94,96 +99,112 @@ template <typename T> jlong release_as_jlong(std::unique_ptr<T> &&ptr) {
  * @brief Helper to release the data held by a unique_ptr, and return
  * the pointer as a jlong.
  */
-template <typename T> jlong release_as_jlong(std::unique_ptr<T> &ptr) {
+template <typename T>
+jlong release_as_jlong(std::unique_ptr<T>& ptr)
+{
   return release_as_jlong(std::move(ptr));
 }
 
 class native_jdoubleArray_accessor {
-public:
-  jdouble *getArrayElements(JNIEnv *const env, jdoubleArray arr) const {
+ public:
+  jdouble* getArrayElements(JNIEnv* const env, jdoubleArray arr) const
+  {
     return env->GetDoubleArrayElements(arr, NULL);
   }
 
-  jdoubleArray newArray(JNIEnv *const env, int len) const { return env->NewDoubleArray(len); }
+  jdoubleArray newArray(JNIEnv* const env, int len) const { return env->NewDoubleArray(len); }
 
-  void setArrayRegion(JNIEnv *const env, jdoubleArray jarr, int start, int len,
-                      jdouble const *arr) const {
+  void setArrayRegion(
+    JNIEnv* const env, jdoubleArray jarr, int start, int len, jdouble const* arr) const
+  {
     env->SetDoubleArrayRegion(jarr, start, len, arr);
   }
 
-  void releaseArrayElements(JNIEnv *const env, jdoubleArray jarr, jdouble *arr, jint mode) const {
+  void releaseArrayElements(JNIEnv* const env, jdoubleArray jarr, jdouble* arr, jint mode) const
+  {
     env->ReleaseDoubleArrayElements(jarr, arr, mode);
   }
 };
 
 class native_jlongArray_accessor {
-public:
-  jlong *getArrayElements(JNIEnv *const env, jlongArray arr) const {
+ public:
+  jlong* getArrayElements(JNIEnv* const env, jlongArray arr) const
+  {
     return env->GetLongArrayElements(arr, NULL);
   }
 
-  jlongArray newArray(JNIEnv *const env, int len) const { return env->NewLongArray(len); }
+  jlongArray newArray(JNIEnv* const env, int len) const { return env->NewLongArray(len); }
 
-  void setArrayRegion(JNIEnv *const env, jlongArray jarr, int start, int len,
-                      jlong const *arr) const {
+  void setArrayRegion(
+    JNIEnv* const env, jlongArray jarr, int start, int len, jlong const* arr) const
+  {
     env->SetLongArrayRegion(jarr, start, len, arr);
   }
 
-  void releaseArrayElements(JNIEnv *const env, jlongArray jarr, jlong *arr, jint mode) const {
+  void releaseArrayElements(JNIEnv* const env, jlongArray jarr, jlong* arr, jint mode) const
+  {
     env->ReleaseLongArrayElements(jarr, arr, mode);
   }
 };
 
 class native_jintArray_accessor {
-public:
-  jint *getArrayElements(JNIEnv *const env, jintArray arr) const {
+ public:
+  jint* getArrayElements(JNIEnv* const env, jintArray arr) const
+  {
     return env->GetIntArrayElements(arr, NULL);
   }
 
-  jintArray newArray(JNIEnv *const env, int len) const { return env->NewIntArray(len); }
+  jintArray newArray(JNIEnv* const env, int len) const { return env->NewIntArray(len); }
 
-  void setArrayRegion(JNIEnv *const env, jintArray jarr, int start, int len,
-                      jint const *arr) const {
+  void setArrayRegion(JNIEnv* const env, jintArray jarr, int start, int len, jint const* arr) const
+  {
     env->SetIntArrayRegion(jarr, start, len, arr);
   }
 
-  void releaseArrayElements(JNIEnv *const env, jintArray jarr, jint *arr, jint mode) const {
+  void releaseArrayElements(JNIEnv* const env, jintArray jarr, jint* arr, jint mode) const
+  {
     env->ReleaseIntArrayElements(jarr, arr, mode);
   }
 };
 
 class native_jbyteArray_accessor {
-public:
-  jbyte *getArrayElements(JNIEnv *const env, jbyteArray arr) const {
+ public:
+  jbyte* getArrayElements(JNIEnv* const env, jbyteArray arr) const
+  {
     return env->GetByteArrayElements(arr, NULL);
   }
 
-  jbyteArray newArray(JNIEnv *const env, int len) const { return env->NewByteArray(len); }
+  jbyteArray newArray(JNIEnv* const env, int len) const { return env->NewByteArray(len); }
 
-  void setArrayRegion(JNIEnv *const env, jbyteArray jarr, int start, int len,
-                      jbyte const *arr) const {
+  void setArrayRegion(
+    JNIEnv* const env, jbyteArray jarr, int start, int len, jbyte const* arr) const
+  {
     env->SetByteArrayRegion(jarr, start, len, arr);
   }
 
-  void releaseArrayElements(JNIEnv *const env, jbyteArray jarr, jbyte *arr, jint mode) const {
+  void releaseArrayElements(JNIEnv* const env, jbyteArray jarr, jbyte* arr, jint mode) const
+  {
     env->ReleaseByteArrayElements(jarr, arr, mode);
   }
 };
 
 class native_jbooleanArray_accessor {
-public:
-  jboolean *getArrayElements(JNIEnv *const env, jbooleanArray arr) const {
+ public:
+  jboolean* getArrayElements(JNIEnv* const env, jbooleanArray arr) const
+  {
     return env->GetBooleanArrayElements(arr, NULL);
   }
 
-  jbooleanArray newArray(JNIEnv *const env, int len) const { return env->NewBooleanArray(len); }
+  jbooleanArray newArray(JNIEnv* const env, int len) const { return env->NewBooleanArray(len); }
 
-  void setArrayRegion(JNIEnv *const env, jbooleanArray jarr, int start, int len,
-                      jboolean const *arr) const {
+  void setArrayRegion(
+    JNIEnv* const env, jbooleanArray jarr, int start, int len, jboolean const* arr) const
+  {
     env->SetBooleanArrayRegion(jarr, start, len, arr);
   }
 
-  void releaseArrayElements(JNIEnv *const env, jbooleanArray jarr, jboolean *arr, jint mode) const {
+  void releaseArrayElements(JNIEnv* const env, jbooleanArray jarr, jboolean* arr, jint mode) const
+  {
     env->ReleaseBooleanArrayElements(jarr, arr, mode);
   }
 };
@@ -194,47 +215,52 @@ class native_jbooleanArray_accessor {
  * By default any changes to the array will be committed back when
  * the destructor is called unless cancel is called first.
  */
-template <typename N_TYPE, typename J_ARRAY_TYPE, typename ACCESSOR> class native_jArray {
-private:
+template <typename N_TYPE, typename J_ARRAY_TYPE, typename ACCESSOR>
+class native_jArray {
+ private:
   ACCESSOR access{};
-  JNIEnv *const env;
+  JNIEnv* const env;
   J_ARRAY_TYPE orig;
   int len;
-  mutable N_TYPE *data_ptr;
+  mutable N_TYPE* data_ptr;
 
-  void init_data_ptr() const {
+  void init_data_ptr() const
+  {
     if (orig != nullptr && data_ptr == nullptr) {
       data_ptr = access.getArrayElements(env, orig);
       check_java_exception(env);
     }
   }
 
-public:
-  native_jArray(native_jArray const &) = delete;
-  native_jArray &operator=(native_jArray const &) = delete;
+ public:
+  native_jArray(native_jArray const&)            = delete;
+  native_jArray& operator=(native_jArray const&) = delete;
 
-  native_jArray(JNIEnv *const env, J_ARRAY_TYPE orig)
-      : env(env), orig(orig), len(0), data_ptr(NULL) {
+  native_jArray(JNIEnv* const env, J_ARRAY_TYPE orig) : env(env), orig(orig), len(0), data_ptr(NULL)
+  {
     if (orig != NULL) {
       len = env->GetArrayLength(orig);
       check_java_exception(env);
     }
   }
 
-  native_jArray(JNIEnv *const env, int len)
-      : env(env), orig(access.newArray(env, len)), len(len), data_ptr(NULL) {
+  native_jArray(JNIEnv* const env, int len)
+    : env(env), orig(access.newArray(env, len)), len(len), data_ptr(NULL)
+  {
     check_java_exception(env);
   }
 
-  native_jArray(JNIEnv *const env, N_TYPE const *arr, int len)
-      : env(env), orig(access.newArray(env, len)), len(len), data_ptr(NULL) {
+  native_jArray(JNIEnv* const env, N_TYPE const* arr, int len)
+    : env(env), orig(access.newArray(env, len)), len(len), data_ptr(NULL)
+  {
     check_java_exception(env);
     access.setArrayRegion(env, orig, 0, len, arr);
     check_java_exception(env);
   }
 
-  native_jArray(JNIEnv *const env, const std::vector<N_TYPE> &arr)
-      : env(env), orig(access.newArray(env, arr.size())), len(arr.size()), data_ptr(NULL) {
+  native_jArray(JNIEnv* const env, const std::vector<N_TYPE>& arr)
+    : env(env), orig(access.newArray(env, arr.size())), len(arr.size()), data_ptr(NULL)
+  {
     check_java_exception(env);
     access.setArrayRegion(env, orig, 0, len, arr.data());
     check_java_exception(env);
@@ -244,43 +270,39 @@ template <typename N_TYPE, typename J_ARRAY_TYPE, typename ACCESSOR> class nativ
 
   int size() const noexcept { return len; }
 
-  N_TYPE operator[](int index) const {
-    if (orig == NULL) {
-      throw_java_exception(env, NPE_CLASS, "pointer is NULL");
-    }
-    if (index < 0 || index >= len) {
-      throw_java_exception(env, INDEX_OOB_CLASS, "NOT IN BOUNDS");
-    }
+  N_TYPE operator[](int index) const
+  {
+    if (orig == NULL) { throw_java_exception(env, NPE_CLASS, "pointer is NULL"); }
+    if (index < 0 || index >= len) { throw_java_exception(env, INDEX_OOB_CLASS, "NOT IN BOUNDS"); }
     return data()[index];
   }
 
-  N_TYPE &operator[](int index) {
-    if (orig == NULL) {
-      throw_java_exception(env, NPE_CLASS, "pointer is NULL");
-    }
-    if (index < 0 || index >= len) {
-      throw_java_exception(env, INDEX_OOB_CLASS, "NOT IN BOUNDS");
-    }
+  N_TYPE& operator[](int index)
+  {
+    if (orig == NULL) { throw_java_exception(env, NPE_CLASS, "pointer is NULL"); }
+    if (index < 0 || index >= len) { throw_java_exception(env, INDEX_OOB_CLASS, "NOT IN BOUNDS"); }
     return data()[index];
   }
 
-  const N_TYPE *const data() const {
+  const N_TYPE* const data() const
+  {
     init_data_ptr();
     return data_ptr;
   }
 
-  N_TYPE *data() {
+  N_TYPE* data()
+  {
     init_data_ptr();
     return data_ptr;
   }
 
-  const N_TYPE *const begin() const { return data(); }
+  const N_TYPE* const begin() const { return data(); }
 
-  N_TYPE *begin() { return data(); }
+  N_TYPE* begin() { return data(); }
 
-  const N_TYPE *const end() const { return data() + size(); }
+  const N_TYPE* const end() const { return data() + size(); }
 
-  N_TYPE *end() { return data() + size(); }
+  N_TYPE* end() { return data() + size(); }
 
   const J_ARRAY_TYPE get_jArray() const { return orig; }
 
@@ -292,7 +314,9 @@ template <typename N_TYPE, typename J_ARRAY_TYPE, typename ACCESSOR> class nativ
    * @tparam target_t Target data type
    * @return std::vector<target_t> Vector with the copied contents
    */
-  template <typename target_t = N_TYPE> std::vector<target_t> to_vector() const {
+  template <typename target_t = N_TYPE>
+  std::vector<target_t> to_vector() const
+  {
     std::vector<target_t> ret;
     ret.reserve(size());
     std::copy(begin(), end(), std::back_inserter(ret));
@@ -303,14 +327,16 @@ template <typename N_TYPE, typename J_ARRAY_TYPE, typename ACCESSOR> class nativ
    * @brief if data has been written back into this array, don't commit
    * it.
    */
-  void cancel() {
+  void cancel()
+  {
     if (data_ptr != NULL && orig != NULL) {
       access.releaseArrayElements(env, orig, data_ptr, JNI_ABORT);
       data_ptr = NULL;
     }
   }
 
-  void commit() {
+  void commit()
+  {
     if (data_ptr != NULL && orig != NULL) {
       access.releaseArrayElements(env, orig, data_ptr, 0);
       data_ptr = NULL;
@@ -321,9 +347,9 @@ template <typename N_TYPE, typename J_ARRAY_TYPE, typename ACCESSOR> class nativ
 };
 
 using native_jdoubleArray = native_jArray<jdouble, jdoubleArray, native_jdoubleArray_accessor>;
-using native_jlongArray = native_jArray<jlong, jlongArray, native_jlongArray_accessor>;
-using native_jintArray = native_jArray<jint, jintArray, native_jintArray_accessor>;
-using native_jbyteArray = native_jArray<jbyte, jbyteArray, native_jbyteArray_accessor>;
+using native_jlongArray   = native_jArray<jlong, jlongArray, native_jlongArray_accessor>;
+using native_jintArray    = native_jArray<jint, jintArray, native_jintArray_accessor>;
+using native_jbyteArray   = native_jArray<jbyte, jbyteArray, native_jbyteArray_accessor>;
 
 /**
  * @brief Specialization of native_jArray for jboolean
@@ -332,19 +358,23 @@ using native_jbyteArray = native_jArray<jbyte, jbyteArray, native_jbyteArray_acc
  * value is chosen depending on the jboolean value.
  */
 struct native_jbooleanArray
-    : native_jArray<jboolean, jbooleanArray, native_jbooleanArray_accessor> {
-  native_jbooleanArray(JNIEnv *const env, jbooleanArray orig)
-      : native_jArray<jboolean, jbooleanArray, native_jbooleanArray_accessor>(env, orig) {}
+  : native_jArray<jboolean, jbooleanArray, native_jbooleanArray_accessor> {
+  native_jbooleanArray(JNIEnv* const env, jbooleanArray orig)
+    : native_jArray<jboolean, jbooleanArray, native_jbooleanArray_accessor>(env, orig)
+  {
+  }
 
-  native_jbooleanArray(native_jbooleanArray const &) = delete;
-  native_jbooleanArray &operator=(native_jbooleanArray const &) = delete;
+  native_jbooleanArray(native_jbooleanArray const&)            = delete;
+  native_jbooleanArray& operator=(native_jbooleanArray const&) = delete;
 
   template <typename target_t>
-  std::vector<target_t> transform_if_else(target_t const &if_true, target_t const &if_false) const {
+  std::vector<target_t> transform_if_else(target_t const& if_true, target_t const& if_false) const
+  {
     std::vector<target_t> ret;
     ret.reserve(size());
-    std::transform(begin(), end(), std::back_inserter(ret),
-                   [&](jboolean const &b) { return b ? if_true : if_false; });
+    std::transform(begin(), end(), std::back_inserter(ret), [&](jboolean const& b) {
+      return b ? if_true : if_false;
+    });
     return ret;
   }
 };
@@ -355,58 +385,58 @@ struct native_jbooleanArray
  * By default any changes to the array will be committed back when
  * the destructor is called unless cancel is called first.
  */
-template <typename T> class native_jpointerArray {
-private:
+template <typename T>
+class native_jpointerArray {
+ private:
   native_jlongArray wrapped;
-  JNIEnv *const env;
+  JNIEnv* const env;
 
-public:
-  native_jpointerArray(native_jpointerArray const &) = delete;
-  native_jpointerArray &operator=(native_jpointerArray const &) = delete;
+ public:
+  native_jpointerArray(native_jpointerArray const&)            = delete;
+  native_jpointerArray& operator=(native_jpointerArray const&) = delete;
 
-  native_jpointerArray(JNIEnv *const env, jlongArray orig) : wrapped(env, orig), env(env) {}
+  native_jpointerArray(JNIEnv* const env, jlongArray orig) : wrapped(env, orig), env(env) {}
 
-  native_jpointerArray(JNIEnv *const env, int len) : wrapped(env, len), env(env) {}
+  native_jpointerArray(JNIEnv* const env, int len) : wrapped(env, len), env(env) {}
 
-  native_jpointerArray(JNIEnv *const env, T *arr, int len) : wrapped(env, arr, len), env(env) {}
+  native_jpointerArray(JNIEnv* const env, T* arr, int len) : wrapped(env, arr, len), env(env) {}
 
   bool is_null() const noexcept { return wrapped.is_null(); }
 
   int size() const noexcept { return wrapped.size(); }
 
-  T *operator[](int index) const {
-    if (data() == NULL) {
-      throw_java_exception(env, NPE_CLASS, "pointer is NULL");
-    }
+  T* operator[](int index) const
+  {
+    if (data() == NULL) { throw_java_exception(env, NPE_CLASS, "pointer is NULL"); }
     if (index < 0 || index >= wrapped.size()) {
       throw_java_exception(env, INDEX_OOB_CLASS, "NOT IN BOUNDS");
     }
     return data()[index];
   }
 
-  T *&operator[](int index) {
-    if (data() == NULL) {
-      throw_java_exception(env, NPE_CLASS, "pointer is NULL");
-    }
+  T*& operator[](int index)
+  {
+    if (data() == NULL) { throw_java_exception(env, NPE_CLASS, "pointer is NULL"); }
     if (index < 0 || index >= wrapped.size()) {
       throw_java_exception(env, INDEX_OOB_CLASS, "NOT IN BOUNDS");
     }
     return data()[index];
   }
 
-  T *const *data() const { return reinterpret_cast<T *const *>(wrapped.data()); }
+  T* const* data() const { return reinterpret_cast<T* const*>(wrapped.data()); }
 
-  T **data() { return reinterpret_cast<T **>(wrapped.data()); }
+  T** data() { return reinterpret_cast<T**>(wrapped.data()); }
 
-  T *const *begin() const { return data(); }
-  T *const *end() const { return data() + size(); }
+  T* const* begin() const { return data(); }
+  T* const* end() const { return data() + size(); }
 
   const jlongArray get_jArray() const { return wrapped.get_jArray(); }
 
   jlongArray get_jArray() { return wrapped.get_jArray(); }
 
-  void assert_no_nulls() const {
-    if (std::any_of(data(), data() + size(), [](T *const ptr) { return ptr == nullptr; })) {
+  void assert_no_nulls() const
+  {
+    if (std::any_of(data(), data() + size(), [](T* const ptr) { return ptr == nullptr; })) {
       throw_java_exception(env, NPE_CLASS, "pointer is NULL");
     }
   }
@@ -414,12 +444,13 @@ template <typename T> class native_jpointerArray {
   /**
    * @brief Convert from `T*[]` to `vector<T>`.
    */
-  std::vector<T> get_dereferenced() const {
+  std::vector<T> get_dereferenced() const
+  {
     assert_no_nulls();
     auto ret = std::vector<T>{};
     ret.reserve(size());
-    std::transform(data(), data() + size(), std::back_inserter(ret),
-                   [](T *const &p) { return *p; });
+    std::transform(
+      data(), data() + size(), std::back_inserter(ret), [](T* const& p) { return *p; });
     return ret;
   }
 
@@ -439,73 +470,82 @@ template <typename T> class native_jpointerArray {
  * By default any changes to the array will be committed back when
  * released unless cancel is called first.
  */
-template <typename T, typename D = std::default_delete<T>> class unique_jpointerArray {
-private:
+template <typename T, typename D = std::default_delete<T>>
+class unique_jpointerArray {
+ private:
   std::unique_ptr<native_jpointerArray<T>> wrapped;
   D del;
 
-public:
-  unique_jpointerArray(unique_jpointerArray const &) = delete;
-  unique_jpointerArray &operator=(unique_jpointerArray const &) = delete;
+ public:
+  unique_jpointerArray(unique_jpointerArray const&)            = delete;
+  unique_jpointerArray& operator=(unique_jpointerArray const&) = delete;
 
-  unique_jpointerArray(JNIEnv *const env, jlongArray orig)
-      : wrapped(new native_jpointerArray<T>(env, orig)) {}
+  unique_jpointerArray(JNIEnv* const env, jlongArray orig)
+    : wrapped(new native_jpointerArray<T>(env, orig))
+  {
+  }
 
-  unique_jpointerArray(JNIEnv *const env, jlongArray orig, const D &del)
-      : wrapped(new native_jpointerArray<T>(env, orig)), del(del) {}
+  unique_jpointerArray(JNIEnv* const env, jlongArray orig, const D& del)
+    : wrapped(new native_jpointerArray<T>(env, orig)), del(del)
+  {
+  }
 
-  unique_jpointerArray(JNIEnv *const env, int len)
-      : wrapped(new native_jpointerArray<T>(env, len)) {}
+  unique_jpointerArray(JNIEnv* const env, int len) : wrapped(new native_jpointerArray<T>(env, len))
+  {
+  }
 
-  unique_jpointerArray(JNIEnv *const env, int len, const D &del)
-      : wrapped(new native_jpointerArray<T>(env, len)), del(del) {}
+  unique_jpointerArray(JNIEnv* const env, int len, const D& del)
+    : wrapped(new native_jpointerArray<T>(env, len)), del(del)
+  {
+  }
 
-  unique_jpointerArray(JNIEnv *const env, T *arr, int len)
-      : wrapped(new native_jpointerArray<T>(env, arr, len)) {}
+  unique_jpointerArray(JNIEnv* const env, T* arr, int len)
+    : wrapped(new native_jpointerArray<T>(env, arr, len))
+  {
+  }
 
-  unique_jpointerArray(JNIEnv *const env, T *arr, int len, const D &del)
-      : wrapped(new native_jpointerArray<T>(env, arr, len)), del(del) {}
+  unique_jpointerArray(JNIEnv* const env, T* arr, int len, const D& del)
+    : wrapped(new native_jpointerArray<T>(env, arr, len)), del(del)
+  {
+  }
 
   bool is_null() const noexcept { return wrapped == NULL || wrapped->is_null(); }
 
   int size() const noexcept { return wrapped == NULL ? 0 : wrapped->size(); }
 
-  void reset(int index, T *new_ptr = NULL) {
-    if (wrapped == NULL) {
-      throw std::logic_error("using unique_jpointerArray after release");
-    }
-    T *old = (*wrapped)[index];
+  void reset(int index, T* new_ptr = NULL)
+  {
+    if (wrapped == NULL) { throw std::logic_error("using unique_jpointerArray after release"); }
+    T* old = (*wrapped)[index];
     if (old != new_ptr) {
       (*wrapped)[index] = new_ptr;
       del(old);
     }
   }
 
-  T *get(int index) {
-    if (wrapped == NULL) {
-      throw std::logic_error("using unique_jpointerArray after release");
-    }
+  T* get(int index)
+  {
+    if (wrapped == NULL) { throw std::logic_error("using unique_jpointerArray after release"); }
     return (*wrapped)[index];
   }
 
-  T *const *get() {
-    if (wrapped == NULL) {
-      throw std::logic_error("using unique_jpointerArray after release");
-    }
+  T* const* get()
+  {
+    if (wrapped == NULL) { throw std::logic_error("using unique_jpointerArray after release"); }
     return wrapped->data();
   }
 
-  jlongArray release() {
-    if (wrapped == NULL) {
-      return NULL;
-    }
+  jlongArray release()
+  {
+    if (wrapped == NULL) { return NULL; }
     wrapped->commit();
     jlongArray ret = wrapped->get_jArray();
     wrapped.reset(NULL);
     return ret;
   }
 
-  ~unique_jpointerArray() {
+  ~unique_jpointerArray()
+  {
     if (wrapped != NULL) {
       for (int i = 0; i < wrapped->size(); i++) {
         reset(i, NULL);
@@ -518,57 +558,62 @@ template <typename T, typename D = std::default_delete<T>> class unique_jpointer
  * @brief RAII for jstring to be sure it is handled correctly.
  */
 class native_jstring {
-private:
-  JNIEnv *env;
+ private:
+  JNIEnv* env;
   jstring orig;
-  mutable const char *cstr;
+  mutable const char* cstr;
   mutable size_t cstr_length;
 
-  void init_cstr() const {
+  void init_cstr() const
+  {
     if (orig != NULL && cstr == NULL) {
       cstr_length = env->GetStringUTFLength(orig);
-      cstr = env->GetStringUTFChars(orig, 0);
+      cstr        = env->GetStringUTFChars(orig, 0);
       check_java_exception(env);
     }
   }
 
-public:
-  native_jstring(native_jstring const &) = delete;
-  native_jstring &operator=(native_jstring const &) = delete;
+ public:
+  native_jstring(native_jstring const&)            = delete;
+  native_jstring& operator=(native_jstring const&) = delete;
 
-  native_jstring(native_jstring &&other) noexcept
-      : env(other.env), orig(other.orig), cstr(other.cstr), cstr_length(other.cstr_length) {
+  native_jstring(native_jstring&& other) noexcept
+    : env(other.env), orig(other.orig), cstr(other.cstr), cstr_length(other.cstr_length)
+  {
     other.cstr = NULL;
   }
 
-  native_jstring(JNIEnv *const env, jstring orig)
-      : env(env), orig(orig), cstr(NULL), cstr_length(0) {}
+  native_jstring(JNIEnv* const env, jstring orig) : env(env), orig(orig), cstr(NULL), cstr_length(0)
+  {
+  }
 
-  native_jstring &operator=(native_jstring const &&other) {
-    if (orig != NULL && cstr != NULL) {
-      env->ReleaseStringUTFChars(orig, cstr);
-    }
-    this->env = other.env;
-    this->orig = other.orig;
-    this->cstr = other.cstr;
+  native_jstring& operator=(native_jstring const&& other)
+  {
+    if (orig != NULL && cstr != NULL) { env->ReleaseStringUTFChars(orig, cstr); }
+    this->env         = other.env;
+    this->orig        = other.orig;
+    this->cstr        = other.cstr;
     this->cstr_length = other.cstr_length;
-    other.cstr = NULL;
+    other.cstr        = NULL;
     return *this;
   }
 
   bool is_null() const noexcept { return orig == NULL; }
 
-  const char *get() const {
+  const char* get() const
+  {
     init_cstr();
     return cstr;
   }
 
-  size_t size_bytes() const {
+  size_t size_bytes() const
+  {
     init_cstr();
     return cstr_length;
   }
 
-  bool is_empty() const {
+  bool is_empty() const
+  {
     if (cstr != NULL) {
       return cstr_length <= 0;
     } else if (orig != NULL) {
@@ -581,24 +626,25 @@ class native_jstring {
 
   const jstring get_jstring() const { return orig; }
 
-  ~native_jstring() {
-    if (orig != NULL && cstr != NULL) {
-      env->ReleaseStringUTFChars(orig, cstr);
-    }
+  ~native_jstring()
+  {
+    if (orig != NULL && cstr != NULL) { env->ReleaseStringUTFChars(orig, cstr); }
   }
 };
 
 /**
  * @brief jobjectArray wrapper to make accessing it more convenient.
  */
-template <typename T> class native_jobjectArray {
-private:
-  JNIEnv *const env;
+template <typename T>
+class native_jobjectArray {
+ private:
+  JNIEnv* const env;
   jobjectArray orig;
   int len;
 
-public:
-  native_jobjectArray(JNIEnv *const env, jobjectArray orig) : env(env), orig(orig), len(0) {
+ public:
+  native_jobjectArray(JNIEnv* const env, jobjectArray orig) : env(env), orig(orig), len(0)
+  {
     if (orig != NULL) {
       len = env->GetArrayLength(orig);
       check_java_exception(env);
@@ -611,19 +657,17 @@ template <typename T> class native_jobjectArray {
 
   T operator[](int index) const { return get(index); }
 
-  T get(int index) const {
-    if (orig == NULL) {
-      throw_java_exception(env, NPE_CLASS, "jobjectArray pointer is NULL");
-    }
+  T get(int index) const
+  {
+    if (orig == NULL) { throw_java_exception(env, NPE_CLASS, "jobjectArray pointer is NULL"); }
     T ret = static_cast<T>(env->GetObjectArrayElement(orig, index));
     check_java_exception(env);
     return ret;
   }
 
-  void set(int index, const T &val) {
-    if (orig == NULL) {
-      throw_java_exception(env, NPE_CLASS, "jobjectArray pointer is NULL");
-    }
+  void set(int index, const T& val)
+  {
+    if (orig == NULL) { throw_java_exception(env, NPE_CLASS, "jobjectArray pointer is NULL"); }
     env->SetObjectArrayElement(orig, index, val);
     check_java_exception(env);
   }
@@ -636,14 +680,15 @@ template <typename T> class native_jobjectArray {
  * and convenient.
  */
 class native_jstringArray {
-private:
-  JNIEnv *const env;
+ private:
+  JNIEnv* const env;
   native_jobjectArray<jstring> arr;
   mutable std::vector<native_jstring> cache;
   mutable std::vector<std::string> cpp_cache;
-  mutable std::vector<const char *> c_cache;
+  mutable std::vector<const char*> c_cache;
 
-  void init_cache() const {
+  void init_cache() const
+  {
     if (!arr.is_null() && cache.empty()) {
       int size = this->size();
       cache.reserve(size);
@@ -653,7 +698,8 @@ class native_jstringArray {
     }
   }
 
-  void init_c_cache() const {
+  void init_c_cache() const
+  {
     if (!arr.is_null() && c_cache.empty()) {
       init_cache();
       int size = this->size();
@@ -664,7 +710,8 @@ class native_jstringArray {
     }
   }
 
-  void init_cpp_cache() const {
+  void init_cpp_cache() const
+  {
     if (!arr.is_null() && cpp_cache.empty()) {
       init_cache();
       int size = this->size();
@@ -675,32 +722,30 @@ class native_jstringArray {
     }
   }
 
-  void update_caches(int index, jstring val) {
+  void update_caches(int index, jstring val)
+  {
     if (!cache.empty()) {
       cache[index] = native_jstring(env, val);
-      if (!c_cache.empty()) {
-        c_cache[index] = cache[index].get();
-      }
+      if (!c_cache.empty()) { c_cache[index] = cache[index].get(); }
 
-      if (!cpp_cache.empty()) {
-        cpp_cache[index] = cache[index].get();
-      }
+      if (!cpp_cache.empty()) { cpp_cache[index] = cache[index].get(); }
     } else if (!c_cache.empty() || !cpp_cache.empty()) {
       // Illegal state
       throw std::logic_error("CACHING IS MESSED UP");
     }
   }
 
-public:
-  native_jstringArray(JNIEnv *const env, jobjectArray orig) : env(env), arr(env, orig) {}
+ public:
+  native_jstringArray(JNIEnv* const env, jobjectArray orig) : env(env), arr(env, orig) {}
 
   bool is_null() const noexcept { return arr.is_null(); }
 
   int size() const noexcept { return arr.size(); }
 
-  native_jstring &operator[](int index) const { return get(index); }
+  native_jstring& operator[](int index) const { return get(index); }
 
-  native_jstring &get(int index) const {
+  native_jstring& get(int index) const
+  {
     if (arr.is_null()) {
       throw_java_exception(env, cudf::jni::NPE_CLASS, "jstringArray pointer is NULL");
     }
@@ -708,27 +753,32 @@ class native_jstringArray {
     return cache[index];
   }
 
-  const char **const as_c_array() const {
+  const char** const as_c_array() const
+  {
     init_c_cache();
     return c_cache.data();
   }
 
-  const std::vector<std::string> as_cpp_vector() const {
+  const std::vector<std::string> as_cpp_vector() const
+  {
     init_cpp_cache();
     return cpp_cache;
   }
 
-  void set(int index, jstring val) {
+  void set(int index, jstring val)
+  {
     arr.set(index, val);
     update_caches(index, val);
   }
 
-  void set(int index, const native_jstring &val) {
+  void set(int index, const native_jstring& val)
+  {
     arr.set(index, val.get_jstring());
     update_caches(index, val.get_jstring());
   }
 
-  void set(int index, const char *val) {
+  void set(int index, const char* val)
+  {
     jstring str = env->NewStringUTF(val);
     check_java_exception(env);
     arr.set(index, str);
@@ -739,8 +789,9 @@ class native_jstringArray {
 /**
  * @brief create a cuda exception from a given cudaError_t
  */
-inline jthrowable cuda_exception(JNIEnv *const env, cudaError_t status, jthrowable cause = NULL) {
-  const char *ex_class_name;
+inline jthrowable cuda_exception(JNIEnv* const env, cudaError_t status, jthrowable cause = NULL)
+{
+  const char* ex_class_name;
 
   // Calls cudaGetLastError twice. It is nearly certain that a fatal error occurred if the second
   // call doesn't return with cudaSuccess.
@@ -755,19 +806,13 @@ inline jthrowable cuda_exception(JNIEnv *const env, cudaError_t status, jthrowab
   }
 
   jclass ex_class = env->FindClass(ex_class_name);
-  if (ex_class == NULL) {
-    return NULL;
-  }
+  if (ex_class == NULL) { return NULL; }
   jmethodID ctor_id =
-      env->GetMethodID(ex_class, "<init>", "(Ljava/lang/String;ILjava/lang/Throwable;)V");
-  if (ctor_id == NULL) {
-    return NULL;
-  }
+    env->GetMethodID(ex_class, "<init>", "(Ljava/lang/String;ILjava/lang/Throwable;)V");
+  if (ctor_id == NULL) { return NULL; }
 
   jstring msg = env->NewStringUTF(cudaGetErrorString(status));
-  if (msg == NULL) {
-    return NULL;
-  }
+  if (msg == NULL) { return NULL; }
 
   jint err_code = static_cast<jint>(status);
 
@@ -775,168 +820,146 @@ inline jthrowable cuda_exception(JNIEnv *const env, cudaError_t status, jthrowab
   return (jthrowable)ret;
 }
 
-inline void jni_cuda_check(JNIEnv *const env, cudaError_t cuda_status) {
+inline void jni_cuda_check(JNIEnv* const env, cudaError_t cuda_status)
+{
   if (cudaSuccess != cuda_status) {
     jthrowable jt = cuda_exception(env, cuda_status);
-    if (jt != NULL) {
-      env->Throw(jt);
-    }
+    if (jt != NULL) { env->Throw(jt); }
     throw jni_exception(std::string("CUDA ERROR: code ") +
                         std::to_string(static_cast<int>(cuda_status)));
   }
 }
 
-inline auto add_global_ref(JNIEnv *env, jobject jobj) {
+inline auto add_global_ref(JNIEnv* env, jobject jobj)
+{
   auto new_global_ref = env->NewGlobalRef(jobj);
-  if (new_global_ref == nullptr) {
-    throw cudf::jni::jni_exception("global ref");
-  }
+  if (new_global_ref == nullptr) { throw cudf::jni::jni_exception("global ref"); }
   return new_global_ref;
 }
 
-inline nullptr_t del_global_ref(JNIEnv *env, jobject jobj) {
-  if (jobj != nullptr) {
-    env->DeleteGlobalRef(jobj);
-  }
+inline nullptr_t del_global_ref(JNIEnv* env, jobject jobj)
+{
+  if (jobj != nullptr) { env->DeleteGlobalRef(jobj); }
   return nullptr;
 }
 
-} // namespace jni
-} // namespace cudf
+}  // namespace jni
+}  // namespace cudf
 
-#define JNI_EXCEPTION_OCCURRED_CHECK(env, ret_val)                                                 \
-  {                                                                                                \
-    if (env->ExceptionOccurred()) {                                                                \
-      return ret_val;                                                                              \
-    }                                                                                              \
+#define JNI_EXCEPTION_OCCURRED_CHECK(env, ret_val)    \
+  {                                                   \
+    if (env->ExceptionOccurred()) { return ret_val; } \
   }
 
-#define JNI_THROW_NEW(env, class_name, message, ret_val)                                           \
-  {                                                                                                \
-    jclass ex_class = env->FindClass(class_name);                                                  \
-    if (ex_class == NULL) {                                                                        \
-      return ret_val;                                                                              \
-    }                                                                                              \
-    env->ThrowNew(ex_class, message);                                                              \
-    return ret_val;                                                                                \
+#define JNI_THROW_NEW(env, class_name, message, ret_val) \
+  {                                                      \
+    jclass ex_class = env->FindClass(class_name);        \
+    if (ex_class == NULL) { return ret_val; }            \
+    env->ThrowNew(ex_class, message);                    \
+    return ret_val;                                      \
   }
 
 // Throw a new exception only if one is not pending then always return with the specified value
-#define JNI_CHECK_THROW_CUDF_EXCEPTION(env, class_name, message, stacktrace, ret_val)              \
-  {                                                                                                \
-    JNI_EXCEPTION_OCCURRED_CHECK(env, ret_val);                                                    \
-    auto const ex_class = env->FindClass(class_name);                                              \
-    if (ex_class == nullptr) {                                                                     \
-      return ret_val;                                                                              \
-    }                                                                                              \
-    auto const ctor_id =                                                                           \
-        env->GetMethodID(ex_class, "<init>", "(Ljava/lang/String;Ljava/lang/String;)V");           \
-    if (ctor_id == nullptr) {                                                                      \
-      return ret_val;                                                                              \
-    }                                                                                              \
-    auto const empty_str = std::string{""};                                                        \
-    auto const jmessage = env->NewStringUTF(message == nullptr ? empty_str.c_str() : message);     \
-    if (jmessage == nullptr) {                                                                     \
-      return ret_val;                                                                              \
-    }                                                                                              \
-    auto const jstacktrace =                                                                       \
-        env->NewStringUTF(stacktrace == nullptr ? empty_str.c_str() : stacktrace);                 \
-    if (jstacktrace == nullptr) {                                                                  \
-      return ret_val;                                                                              \
-    }                                                                                              \
-    auto const jobj = env->NewObject(ex_class, ctor_id, jmessage, jstacktrace);                    \
-    if (jobj == nullptr) {                                                                         \
-      return ret_val;                                                                              \
-    }                                                                                              \
-    env->Throw(reinterpret_cast<jthrowable>(jobj));                                                \
-    return ret_val;                                                                                \
+#define JNI_CHECK_THROW_CUDF_EXCEPTION(env, class_name, message, stacktrace, ret_val)           \
+  {                                                                                             \
+    JNI_EXCEPTION_OCCURRED_CHECK(env, ret_val);                                                 \
+    auto const ex_class = env->FindClass(class_name);                                           \
+    if (ex_class == nullptr) { return ret_val; }                                                \
+    auto const ctor_id =                                                                        \
+      env->GetMethodID(ex_class, "<init>", "(Ljava/lang/String;Ljava/lang/String;)V");          \
+    if (ctor_id == nullptr) { return ret_val; }                                                 \
+    auto const empty_str = std::string{""};                                                     \
+    auto const jmessage  = env->NewStringUTF(message == nullptr ? empty_str.c_str() : message); \
+    if (jmessage == nullptr) { return ret_val; }                                                \
+    auto const jstacktrace =                                                                    \
+      env->NewStringUTF(stacktrace == nullptr ? empty_str.c_str() : stacktrace);                \
+    if (jstacktrace == nullptr) { return ret_val; }                                             \
+    auto const jobj = env->NewObject(ex_class, ctor_id, jmessage, jstacktrace);                 \
+    if (jobj == nullptr) { return ret_val; }                                                    \
+    env->Throw(reinterpret_cast<jthrowable>(jobj));                                             \
+    return ret_val;                                                                             \
   }
 
 // Throw a new exception only if one is not pending then always return with the specified value
-#define JNI_CHECK_THROW_CUDA_EXCEPTION(env, class_name, message, stacktrace, error_code, ret_val)  \
-  {                                                                                                \
-    JNI_EXCEPTION_OCCURRED_CHECK(env, ret_val);                                                    \
-    auto const ex_class = env->FindClass(class_name);                                              \
-    if (ex_class == nullptr) {                                                                     \
-      return ret_val;                                                                              \
-    }                                                                                              \
-    auto const ctor_id =                                                                           \
-        env->GetMethodID(ex_class, "<init>", "(Ljava/lang/String;Ljava/lang/String;I)V");          \
-    if (ctor_id == nullptr) {                                                                      \
-      return ret_val;                                                                              \
-    }                                                                                              \
-    auto const empty_str = std::string{""};                                                        \
-    auto const jmessage = env->NewStringUTF(message == nullptr ? empty_str.c_str() : message);     \
-    if (jmessage == nullptr) {                                                                     \
-      return ret_val;                                                                              \
-    }                                                                                              \
-    auto const jstacktrace =                                                                       \
-        env->NewStringUTF(stacktrace == nullptr ? empty_str.c_str() : stacktrace);                 \
-    if (jstacktrace == nullptr) {                                                                  \
-      return ret_val;                                                                              \
-    }                                                                                              \
-    auto const jerror_code = static_cast<jint>(error_code);                                        \
-    auto const jobj = env->NewObject(ex_class, ctor_id, jmessage, jstacktrace, jerror_code);       \
-    if (jobj == nullptr) {                                                                         \
-      return ret_val;                                                                              \
-    }                                                                                              \
-    env->Throw(reinterpret_cast<jthrowable>(jobj));                                                \
-    return ret_val;                                                                                \
-  }
-
-#define JNI_NULL_CHECK(env, obj, error_msg, ret_val)                                               \
-  {                                                                                                \
-    if ((obj) == 0) {                                                                              \
-      JNI_THROW_NEW(env, cudf::jni::NPE_CLASS, error_msg, ret_val);                                \
-    }                                                                                              \
-  }
-
-#define JNI_ARG_CHECK(env, obj, error_msg, ret_val)                                                \
-  {                                                                                                \
-    if (!(obj)) {                                                                                  \
-      JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, error_msg, ret_val);                        \
-    }                                                                                              \
-  }
-
-#define CATCH_STD_CLASS(env, class_name, ret_val)                                                  \
-  catch (const rmm::out_of_memory &e) {                                                            \
-    JNI_EXCEPTION_OCCURRED_CHECK(env, ret_val);                                                    \
-    auto const what =                                                                              \
-        std::string("Could not allocate native memory: ") + (e.what() == nullptr ? "" : e.what()); \
-    JNI_THROW_NEW(env, cudf::jni::OOM_CLASS, what.c_str(), ret_val);                               \
-  }                                                                                                \
-  catch (const cudf::fatal_cuda_error &e) {                                                        \
-    JNI_CHECK_THROW_CUDA_EXCEPTION(env, cudf::jni::CUDA_FATAL_ERROR_CLASS, e.what(),               \
-                                   e.stacktrace(), e.error_code(), ret_val);                       \
-  }                                                                                                \
-  catch (const cudf::cuda_error &e) {                                                              \
-    JNI_CHECK_THROW_CUDA_EXCEPTION(env, cudf::jni::CUDA_ERROR_CLASS, e.what(), e.stacktrace(),     \
-                                   e.error_code(), ret_val);                                       \
-  }                                                                                                \
-  catch (const cudf::data_type_error &e) {                                                         \
-    JNI_CHECK_THROW_CUDF_EXCEPTION(env, cudf::jni::CUDF_DTYPE_ERROR_CLASS, e.what(),               \
-                                   e.stacktrace(), ret_val);                                       \
-  }                                                                                                \
-  catch (std::overflow_error const &e) {                                                           \
-    JNI_CHECK_THROW_CUDF_EXCEPTION(env, cudf::jni::CUDF_OVERFLOW_ERROR_CLASS, e.what(),            \
-                                   "No native stacktrace is available.", ret_val);                 \
-  }                                                                                                \
-  catch (const std::exception &e) {                                                                \
-    char const *stacktrace = "No native stacktrace is available.";                                 \
-    if (auto const cudf_ex = dynamic_cast<cudf::logic_error const *>(&e); cudf_ex != nullptr) {    \
-      stacktrace = cudf_ex->stacktrace();                                                          \
-    }                                                                                              \
-    /* Double check whether the thrown exception is unrecoverable CUDA error or not. */            \
-    /* Like cudf::detail::throw_cuda_error, it is nearly certain that a fatal error  */            \
-    /* occurred if the second call doesn't return with cudaSuccess. */                             \
-    cudaGetLastError();                                                                            \
-    auto const last = cudaFree(0);                                                                 \
-    if (cudaSuccess != last && last == cudaDeviceSynchronize()) {                                  \
-      /* Throw CudaFatalException since the thrown exception is unrecoverable CUDA error */        \
-      JNI_CHECK_THROW_CUDA_EXCEPTION(env, cudf::jni::CUDA_FATAL_ERROR_CLASS, e.what(), stacktrace, \
-                                     last, ret_val);                                               \
-    }                                                                                              \
-    JNI_CHECK_THROW_CUDF_EXCEPTION(env, class_name, e.what(), stacktrace, ret_val);                \
+#define JNI_CHECK_THROW_CUDA_EXCEPTION(env, class_name, message, stacktrace, error_code, ret_val)   \
+  {                                                                                                 \
+    JNI_EXCEPTION_OCCURRED_CHECK(env, ret_val);                                                     \
+    auto const ex_class = env->FindClass(class_name);                                               \
+    if (ex_class == nullptr) { return ret_val; }                                                    \
+    auto const ctor_id =                                                                            \
+      env->GetMethodID(ex_class, "<init>", "(Ljava/lang/String;Ljava/lang/String;I)V");             \
+    if (ctor_id == nullptr) { return ret_val; }                                                     \
+    auto const empty_str = std::string{""};                                                         \
+    auto const jmessage  = env->NewStringUTF(message == nullptr ? empty_str.c_str() : message);     \
+    if (jmessage == nullptr) { return ret_val; }                                                    \
+    auto const jstacktrace =                                                                        \
+      env->NewStringUTF(stacktrace == nullptr ? empty_str.c_str() : stacktrace);                    \
+    if (jstacktrace == nullptr) { return ret_val; }                                                 \
+    auto const jerror_code = static_cast<jint>(error_code);                                         \
+    auto const jobj        = env->NewObject(ex_class, ctor_id, jmessage, jstacktrace, jerror_code); \
+    if (jobj == nullptr) { return ret_val; }                                                        \
+    env->Throw(reinterpret_cast<jthrowable>(jobj));                                                 \
+    return ret_val;                                                                                 \
+  }
+
+#define JNI_NULL_CHECK(env, obj, error_msg, ret_val)                                  \
+  {                                                                                   \
+    if ((obj) == 0) { JNI_THROW_NEW(env, cudf::jni::NPE_CLASS, error_msg, ret_val); } \
+  }
+
+#define JNI_ARG_CHECK(env, obj, error_msg, ret_val)                                       \
+  {                                                                                       \
+    if (!(obj)) { JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, error_msg, ret_val); } \
+  }
+
+#define CATCH_STD_CLASS(env, class_name, ret_val)                                                 \
+  catch (const rmm::out_of_memory& e)                                                             \
+  {                                                                                               \
+    JNI_EXCEPTION_OCCURRED_CHECK(env, ret_val);                                                   \
+    auto const what =                                                                             \
+      std::string("Could not allocate native memory: ") + (e.what() == nullptr ? "" : e.what());  \
+    JNI_THROW_NEW(env, cudf::jni::OOM_CLASS, what.c_str(), ret_val);                              \
+  }                                                                                               \
+  catch (const cudf::fatal_cuda_error& e)                                                         \
+  {                                                                                               \
+    JNI_CHECK_THROW_CUDA_EXCEPTION(                                                               \
+      env, cudf::jni::CUDA_FATAL_ERROR_CLASS, e.what(), e.stacktrace(), e.error_code(), ret_val); \
+  }                                                                                               \
+  catch (const cudf::cuda_error& e)                                                               \
+  {                                                                                               \
+    JNI_CHECK_THROW_CUDA_EXCEPTION(                                                               \
+      env, cudf::jni::CUDA_ERROR_CLASS, e.what(), e.stacktrace(), e.error_code(), ret_val);       \
+  }                                                                                               \
+  catch (const cudf::data_type_error& e)                                                          \
+  {                                                                                               \
+    JNI_CHECK_THROW_CUDF_EXCEPTION(                                                               \
+      env, cudf::jni::CUDF_DTYPE_ERROR_CLASS, e.what(), e.stacktrace(), ret_val);                 \
+  }                                                                                               \
+  catch (std::overflow_error const& e)                                                            \
+  {                                                                                               \
+    JNI_CHECK_THROW_CUDF_EXCEPTION(env,                                                           \
+                                   cudf::jni::CUDF_OVERFLOW_ERROR_CLASS,                          \
+                                   e.what(),                                                      \
+                                   "No native stacktrace is available.",                          \
+                                   ret_val);                                                      \
+  }                                                                                               \
+  catch (const std::exception& e)                                                                 \
+  {                                                                                               \
+    char const* stacktrace = "No native stacktrace is available.";                                \
+    if (auto const cudf_ex = dynamic_cast<cudf::logic_error const*>(&e); cudf_ex != nullptr) {    \
+      stacktrace = cudf_ex->stacktrace();                                                         \
+    }                                                                                             \
+    /* Double check whether the thrown exception is unrecoverable CUDA error or not. */           \
+    /* Like cudf::detail::throw_cuda_error, it is nearly certain that a fatal error  */           \
+    /* occurred if the second call doesn't return with cudaSuccess. */                            \
+    cudaGetLastError();                                                                           \
+    auto const last = cudaFree(0);                                                                \
+    if (cudaSuccess != last && last == cudaDeviceSynchronize()) {                                 \
+      /* Throw CudaFatalException since the thrown exception is unrecoverable CUDA error */       \
+      JNI_CHECK_THROW_CUDA_EXCEPTION(                                                             \
+        env, cudf::jni::CUDA_FATAL_ERROR_CLASS, e.what(), stacktrace, last, ret_val);             \
+    }                                                                                             \
+    JNI_CHECK_THROW_CUDF_EXCEPTION(env, class_name, e.what(), stacktrace, ret_val);               \
   }
 
 #define CATCH_STD(env, ret_val) CATCH_STD_CLASS(env, cudf::jni::CUDF_ERROR_CLASS, ret_val)
diff --git a/java/src/main/native/include/maps_column_view.hpp b/java/src/main/native/include/maps_column_view.hpp
index 7d19615053d..be25dbd2e55 100644
--- a/java/src/main/native/include/maps_column_view.hpp
+++ b/java/src/main/native/include/maps_column_view.hpp
@@ -19,6 +19,7 @@
 #include <cudf/column/column_view.hpp>
 #include <cudf/lists/lists_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
@@ -38,16 +39,16 @@ namespace jni {
  * retrieve the corresponding value.
  */
 class maps_column_view {
-public:
-  maps_column_view(lists_column_view const &lists_of_structs,
+ public:
+  maps_column_view(lists_column_view const& lists_of_structs,
                    rmm::cuda_stream_view stream = cudf::get_default_stream());
 
   // Rule of 5.
-  maps_column_view(maps_column_view const &maps_view) = default;
-  maps_column_view(maps_column_view &&maps_view) = default;
-  maps_column_view &operator=(maps_column_view const &) = default;
-  maps_column_view &operator=(maps_column_view &&) = default;
-  ~maps_column_view() = default;
+  maps_column_view(maps_column_view const& maps_view)  = default;
+  maps_column_view(maps_column_view&& maps_view)       = default;
+  maps_column_view& operator=(maps_column_view const&) = default;
+  maps_column_view& operator=(maps_column_view&&)      = default;
+  ~maps_column_view()                                  = default;
 
   /**
    * @brief Returns number of map rows in the column.
@@ -59,14 +60,14 @@ class maps_column_view {
    *
    * Note: Keys are not deduped. Repeated keys are returned in order.
    */
-  lists_column_view const &keys() const { return keys_; }
+  lists_column_view const& keys() const { return keys_; }
 
   /**
    * @brief Getter for values as a list column.
    *
    * Note: Values for repeated keys are not dropped.
    */
-  lists_column_view const &values() const { return values_; }
+  lists_column_view const& values() const { return values_; }
 
   /**
    * @brief Map lookup by a column of keys.
@@ -83,9 +84,10 @@ class maps_column_view {
    * @param mr Device memory resource used to allocate the returned column's device memory.
    * @return std::unique_ptr<column> Column of values corresponding the value of the lookup key.
    */
-  std::unique_ptr<column>
-  get_values_for(column_view const &keys, rmm::cuda_stream_view stream = cudf::get_default_stream(),
-                 rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) const;
+  std::unique_ptr<column> get_values_for(
+    column_view const& keys,
+    rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+    rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) const;
 
   /**
    * @brief Map lookup by a scalar key.
@@ -101,9 +103,10 @@ class maps_column_view {
    * @param mr Device memory resource used to allocate the returned column's device memory.
    * @return std::unique_ptr<column>
    */
-  std::unique_ptr<column>
-  get_values_for(scalar const &key, rmm::cuda_stream_view stream = cudf::get_default_stream(),
-                 rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) const;
+  std::unique_ptr<column> get_values_for(
+    scalar const& key,
+    rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+    rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) const;
 
   /**
    * @brief Check if each map row contains a specified scalar key.
@@ -121,9 +124,10 @@ class maps_column_view {
    * @param mr Device memory resource used to allocate the returned column's device memory.
    * @return std::unique_ptr<column>
    */
-  std::unique_ptr<column>
-  contains(scalar const &key, rmm::cuda_stream_view stream = cudf::get_default_stream(),
-           rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) const;
+  std::unique_ptr<column> contains(
+    scalar const& key,
+    rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+    rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) const;
 
   /**
    * @brief Check if each map row contains keys specified by a column
@@ -142,13 +146,14 @@ class maps_column_view {
    * @return std::unique_ptr<column>
    */
 
-  std::unique_ptr<column>
-  contains(column_view const &key, rmm::cuda_stream_view stream = cudf::get_default_stream(),
-           rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) const;
+  std::unique_ptr<column> contains(
+    column_view const& key,
+    rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+    rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) const;
 
-private:
+ private:
   lists_column_view keys_, values_;
 };
 
-} // namespace jni
-} // namespace cudf
+}  // namespace jni
+}  // namespace cudf
diff --git a/java/src/main/native/src/Aggregation128UtilsJni.cpp b/java/src/main/native/src/Aggregation128UtilsJni.cpp
index 71c36cb724a..ed8a8dc1e5c 100644
--- a/java/src/main/native/src/Aggregation128UtilsJni.cpp
+++ b/java/src/main/native/src/Aggregation128UtilsJni.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,11 +21,12 @@
 extern "C" {
 
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Aggregation128Utils_extractInt32Chunk(
-    JNIEnv *env, jclass, jlong j_column_view, jint j_out_dtype, jint j_chunk_idx) {
+  JNIEnv* env, jclass, jlong j_column_view, jint j_out_dtype, jint j_chunk_idx)
+{
   JNI_NULL_CHECK(env, j_column_view, "column is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto cview = reinterpret_cast<cudf::column_view const *>(j_column_view);
+    auto cview = reinterpret_cast<cudf::column_view const*>(j_column_view);
     auto dtype = cudf::jni::make_data_type(j_out_dtype, 0);
     return cudf::jni::release_as_jlong(cudf::jni::extract_chunk32(*cview, dtype, j_chunk_idx));
   }
@@ -33,13 +34,14 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Aggregation128Utils_extractInt32Chun
 }
 
 JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Aggregation128Utils_combineInt64SumChunks(
-    JNIEnv *env, jclass, jlong j_table_view, jint j_dtype, jint j_scale) {
+  JNIEnv* env, jclass, jlong j_table_view, jint j_dtype, jint j_scale)
+{
   JNI_NULL_CHECK(env, j_table_view, "table is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto tview = reinterpret_cast<cudf::table_view const *>(j_table_view);
+    auto tview = reinterpret_cast<cudf::table_view const*>(j_table_view);
     std::unique_ptr<cudf::table> result =
-        cudf::jni::assemble128_from_sum(*tview, cudf::jni::make_data_type(j_dtype, j_scale));
+      cudf::jni::assemble128_from_sum(*tview, cudf::jni::make_data_type(j_dtype, j_scale));
     return cudf::jni::convert_table_for_return(env, result);
   }
   CATCH_STD(env, 0);
diff --git a/java/src/main/native/src/AggregationJni.cpp b/java/src/main/native/src/AggregationJni.cpp
index bc62e95c36a..c40f1c55500 100644
--- a/java/src/main/native/src/AggregationJni.cpp
+++ b/java/src/main/native/src/AggregationJni.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,85 +14,91 @@
  * limitations under the License.
  */
 
-#include <cudf/aggregation.hpp>
-
 #include "cudf_jni_apis.hpp"
 
+#include <cudf/aggregation.hpp>
+
 extern "C" {
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Aggregation_close(JNIEnv *env, jclass class_object,
-                                                             jlong ptr) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Aggregation_close(JNIEnv* env,
+                                                             jclass class_object,
+                                                             jlong ptr)
+{
   try {
     cudf::jni::auto_set_device(env);
-    auto to_del = reinterpret_cast<cudf::aggregation *>(ptr);
+    auto to_del = reinterpret_cast<cudf::aggregation*>(ptr);
     delete to_del;
   }
   CATCH_STD(env, );
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Aggregation_createNoParamAgg(JNIEnv *env,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Aggregation_createNoParamAgg(JNIEnv* env,
                                                                          jclass class_object,
-                                                                         jint kind) {
+                                                                         jint kind)
+{
   try {
     cudf::jni::auto_set_device(env);
     auto ret = [&] {
       // These numbers come from Aggregation.java and must stay in sync
       switch (kind) {
-        case 0: // SUM
+        case 0:  // SUM
           return cudf::make_sum_aggregation();
-        case 1: // PRODUCT
+        case 1:  // PRODUCT
           return cudf::make_product_aggregation();
-        case 2: // MIN
+        case 2:  // MIN
           return cudf::make_min_aggregation();
-        case 3: // MAX
+        case 3:  // MAX
           return cudf::make_max_aggregation();
         // case 4 COUNT
-        case 5: // ANY
+        case 5:  // ANY
           return cudf::make_any_aggregation();
-        case 6: // ALL
+        case 6:  // ALL
           return cudf::make_all_aggregation();
-        case 7: // SUM_OF_SQUARES
+        case 7:  // SUM_OF_SQUARES
           return cudf::make_sum_of_squares_aggregation();
-        case 8: // MEAN
+        case 8:  // MEAN
           return cudf::make_mean_aggregation();
         // case 9: VARIANCE
         // case 10: STD
-        case 11: // MEDIAN
+        case 11:  // MEDIAN
           return cudf::make_median_aggregation();
         // case 12: QUANTILE
-        case 13: // ARGMAX
+        case 13:  // ARGMAX
           return cudf::make_argmax_aggregation();
-        case 14: // ARGMIN
+        case 14:  // ARGMIN
           return cudf::make_argmin_aggregation();
         // case 15: NUNIQUE
         // case 16: NTH_ELEMENT
-        case 17: // ROW_NUMBER
+        case 17:  // ROW_NUMBER
           return cudf::make_row_number_aggregation();
         // case 18: COLLECT_LIST
         // case 19: COLLECT_SET
-        case 20: // MERGE_LISTS
+        case 20:  // MERGE_LISTS
           return cudf::make_merge_lists_aggregation();
         // case 21: MERGE_SETS
         // case 22: LEAD
         // case 23: LAG
         // case 24: PTX
         // case 25: CUDA
-        case 26: // M2
+        case 26:  // M2
           return cudf::make_m2_aggregation();
-        case 27: // MERGE_M2
+        case 27:  // MERGE_M2
           return cudf::make_merge_m2_aggregation();
-        case 28: // RANK
-          return cudf::make_rank_aggregation(cudf::rank_method::MIN, {},
-                                             cudf::null_policy::INCLUDE);
-        case 29: // DENSE_RANK
-          return cudf::make_rank_aggregation(cudf::rank_method::DENSE, {},
-                                             cudf::null_policy::INCLUDE);
-        case 30: // ANSI SQL PERCENT_RANK
-          return cudf::make_rank_aggregation(cudf::rank_method::MIN, {}, cudf::null_policy::INCLUDE,
-                                             {}, cudf::rank_percentage::ONE_NORMALIZED);
-        case 33: // HISTOGRAM
+        case 28:  // RANK
+          return cudf::make_rank_aggregation(
+            cudf::rank_method::MIN, {}, cudf::null_policy::INCLUDE);
+        case 29:  // DENSE_RANK
+          return cudf::make_rank_aggregation(
+            cudf::rank_method::DENSE, {}, cudf::null_policy::INCLUDE);
+        case 30:  // ANSI SQL PERCENT_RANK
+          return cudf::make_rank_aggregation(cudf::rank_method::MIN,
+                                             {},
+                                             cudf::null_policy::INCLUDE,
+                                             {},
+                                             cudf::rank_percentage::ONE_NORMALIZED);
+        case 33:  // HISTOGRAM
           return cudf::make_histogram_aggregation();
-        case 34: // MERGE_HISTOGRAM
+        case 34:  // MERGE_HISTOGRAM
           return cudf::make_merge_histogram_aggregation();
 
         default: throw std::logic_error("Unsupported No Parameter Aggregation Operation");
@@ -104,33 +110,36 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Aggregation_createNoParamAgg(JNIEnv
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Aggregation_createNthAgg(JNIEnv *env,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Aggregation_createNthAgg(JNIEnv* env,
                                                                      jclass class_object,
                                                                      jint offset,
-                                                                     jboolean include_nulls) {
+                                                                     jboolean include_nulls)
+{
   try {
     cudf::jni::auto_set_device(env);
 
     std::unique_ptr<cudf::aggregation> ret = cudf::make_nth_element_aggregation(
-        offset, include_nulls ? cudf::null_policy::INCLUDE : cudf::null_policy::EXCLUDE);
+      offset, include_nulls ? cudf::null_policy::INCLUDE : cudf::null_policy::EXCLUDE);
     return reinterpret_cast<jlong>(ret.release());
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Aggregation_createDdofAgg(JNIEnv *env,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Aggregation_createDdofAgg(JNIEnv* env,
                                                                       jclass class_object,
-                                                                      jint kind, jint ddof) {
+                                                                      jint kind,
+                                                                      jint ddof)
+{
   try {
     cudf::jni::auto_set_device(env);
 
     std::unique_ptr<cudf::aggregation> ret;
     // These numbers come from Aggregation.java and must stay in sync
     switch (kind) {
-      case 9: // VARIANCE
+      case 9:  // VARIANCE
         ret = cudf::make_variance_aggregation(ddof);
         break;
-      case 10: // STD
+      case 10:  // STD
         ret = cudf::make_std_aggregation(ddof);
         break;
       default: throw std::logic_error("Unsupported DDOF Aggregation Operation");
@@ -140,19 +149,21 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Aggregation_createDdofAgg(JNIEnv *en
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Aggregation_createTDigestAgg(JNIEnv *env,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Aggregation_createTDigestAgg(JNIEnv* env,
                                                                          jclass class_object,
-                                                                         jint kind, jint delta) {
+                                                                         jint kind,
+                                                                         jint delta)
+{
   try {
     cudf::jni::auto_set_device(env);
 
     std::unique_ptr<cudf::aggregation> ret;
     // These numbers come from Aggregation.java and must stay in sync
     switch (kind) {
-      case 31: // TDIGEST
+      case 31:  // TDIGEST
         ret = cudf::make_tdigest_aggregation<cudf::groupby_aggregation>(delta);
         break;
-      case 32: // MERGE_TDIGEST
+      case 32:  // MERGE_TDIGEST
         ret = cudf::make_merge_tdigest_aggregation<cudf::groupby_aggregation>(delta);
         break;
       default: throw std::logic_error("Unsupported TDigest Aggregation Operation");
@@ -162,22 +173,23 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Aggregation_createTDigestAgg(JNIEnv
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Aggregation_createCountLikeAgg(JNIEnv *env,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Aggregation_createCountLikeAgg(JNIEnv* env,
                                                                            jclass class_object,
                                                                            jint kind,
-                                                                           jboolean include_nulls) {
+                                                                           jboolean include_nulls)
+{
   try {
     cudf::jni::auto_set_device(env);
 
     cudf::null_policy policy =
-        include_nulls ? cudf::null_policy::INCLUDE : cudf::null_policy::EXCLUDE;
+      include_nulls ? cudf::null_policy::INCLUDE : cudf::null_policy::EXCLUDE;
     std::unique_ptr<cudf::aggregation> ret;
     // These numbers come from Aggregation.java and must stay in sync
     switch (kind) {
-      case 4: // COUNT
+      case 4:  // COUNT
         ret = cudf::make_count_aggregation(policy);
         break;
-      case 15: // NUNIQUE
+      case 15:  // NUNIQUE
         ret = cudf::make_nunique_aggregation(policy);
         break;
       default: throw std::logic_error("Unsupported Count Like Aggregation Operation");
@@ -187,10 +199,11 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Aggregation_createCountLikeAgg(JNIEn
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Aggregation_createQuantAgg(JNIEnv *env,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Aggregation_createQuantAgg(JNIEnv* env,
                                                                        jclass class_object,
                                                                        jint j_method,
-                                                                       jdoubleArray j_quantiles) {
+                                                                       jdoubleArray j_quantiles)
+{
   JNI_NULL_CHECK(env, j_quantiles, "quantiles are null", 0);
   try {
     cudf::jni::auto_set_device(env);
@@ -206,19 +219,21 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Aggregation_createQuantAgg(JNIEnv *e
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Aggregation_createLeadLagAgg(JNIEnv *env,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Aggregation_createLeadLagAgg(JNIEnv* env,
                                                                          jclass class_object,
-                                                                         jint kind, jint offset) {
+                                                                         jint kind,
+                                                                         jint offset)
+{
   try {
     cudf::jni::auto_set_device(env);
 
     std::unique_ptr<cudf::aggregation> ret;
     // These numbers come from Aggregation.java and must stay in sync
     switch (kind) {
-      case 22: // LEAD
+      case 22:  // LEAD
         ret = cudf::make_lead_aggregation(offset);
         break;
-      case 23: // LAG
+      case 23:  // LAG
         ret = cudf::make_lag_aggregation(offset);
         break;
       default: throw std::logic_error("Unsupported Lead/Lag Aggregation Operation");
@@ -228,53 +243,57 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Aggregation_createLeadLagAgg(JNIEnv
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Aggregation_createCollectListAgg(
-    JNIEnv *env, jclass class_object, jboolean include_nulls) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Aggregation_createCollectListAgg(JNIEnv* env,
+                                                                             jclass class_object,
+                                                                             jboolean include_nulls)
+{
   try {
     cudf::jni::auto_set_device(env);
     cudf::null_policy policy =
-        include_nulls ? cudf::null_policy::INCLUDE : cudf::null_policy::EXCLUDE;
+      include_nulls ? cudf::null_policy::INCLUDE : cudf::null_policy::EXCLUDE;
     std::unique_ptr<cudf::aggregation> ret = cudf::make_collect_list_aggregation(policy);
     return reinterpret_cast<jlong>(ret.release());
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Aggregation_createCollectSetAgg(JNIEnv *env,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Aggregation_createCollectSetAgg(JNIEnv* env,
                                                                             jclass class_object,
                                                                             jboolean include_nulls,
                                                                             jboolean nulls_equal,
-                                                                            jboolean nans_equal) {
+                                                                            jboolean nans_equal)
+{
   try {
     cudf::jni::auto_set_device(env);
     cudf::null_policy null_policy =
-        include_nulls ? cudf::null_policy::INCLUDE : cudf::null_policy::EXCLUDE;
+      include_nulls ? cudf::null_policy::INCLUDE : cudf::null_policy::EXCLUDE;
     cudf::null_equality null_equality =
-        nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL;
+      nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL;
     cudf::nan_equality nan_equality =
-        nans_equal ? cudf::nan_equality::ALL_EQUAL : cudf::nan_equality::UNEQUAL;
+      nans_equal ? cudf::nan_equality::ALL_EQUAL : cudf::nan_equality::UNEQUAL;
     std::unique_ptr<cudf::aggregation> ret =
-        cudf::make_collect_set_aggregation(null_policy, null_equality, nan_equality);
+      cudf::make_collect_set_aggregation(null_policy, null_equality, nan_equality);
     return reinterpret_cast<jlong>(ret.release());
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Aggregation_createMergeSetsAgg(JNIEnv *env,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Aggregation_createMergeSetsAgg(JNIEnv* env,
                                                                            jclass class_object,
                                                                            jboolean nulls_equal,
-                                                                           jboolean nans_equal) {
+                                                                           jboolean nans_equal)
+{
   try {
     cudf::jni::auto_set_device(env);
     cudf::null_equality null_equality =
-        nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL;
+      nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL;
     cudf::nan_equality nan_equality =
-        nans_equal ? cudf::nan_equality::ALL_EQUAL : cudf::nan_equality::UNEQUAL;
+      nans_equal ? cudf::nan_equality::ALL_EQUAL : cudf::nan_equality::UNEQUAL;
     std::unique_ptr<cudf::aggregation> ret =
-        cudf::make_merge_sets_aggregation(null_equality, nan_equality);
+      cudf::make_merge_sets_aggregation(null_equality, nan_equality);
     return reinterpret_cast<jlong>(ret.release());
   }
   CATCH_STD(env, 0);
 }
 
-} // extern "C"
+}  // extern "C"
diff --git a/java/src/main/native/src/ChunkedPackJni.cpp b/java/src/main/native/src/ChunkedPackJni.cpp
index 746a67e1463..2512d74a113 100644
--- a/java/src/main/native/src/ChunkedPackJni.cpp
+++ b/java/src/main/native/src/ChunkedPackJni.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,59 +17,65 @@
 #include "cudf_jni_apis.hpp"
 
 extern "C" {
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_ChunkedPack_chunkedPackDelete(JNIEnv *env, jclass,
-                                                                         jlong chunked_pack) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_ChunkedPack_chunkedPackDelete(JNIEnv* env,
+                                                                         jclass,
+                                                                         jlong chunked_pack)
+{
   try {
     cudf::jni::auto_set_device(env);
-    auto cs = reinterpret_cast<cudf::chunked_pack *>(chunked_pack);
+    auto cs = reinterpret_cast<cudf::chunked_pack*>(chunked_pack);
     delete cs;
   }
   CATCH_STD(env, );
 }
 
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ChunkedPack_chunkedPackGetTotalContiguousSize(
-    JNIEnv *env, jclass, jlong chunked_pack) {
+  JNIEnv* env, jclass, jlong chunked_pack)
+{
   try {
     cudf::jni::auto_set_device(env);
-    auto cs = reinterpret_cast<cudf::chunked_pack *>(chunked_pack);
+    auto cs = reinterpret_cast<cudf::chunked_pack*>(chunked_pack);
     return cs->get_total_contiguous_size();
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jboolean JNICALL Java_ai_rapids_cudf_ChunkedPack_chunkedPackHasNext(JNIEnv *env, jclass,
-                                                                              jlong chunked_pack) {
+JNIEXPORT jboolean JNICALL Java_ai_rapids_cudf_ChunkedPack_chunkedPackHasNext(JNIEnv* env,
+                                                                              jclass,
+                                                                              jlong chunked_pack)
+{
   try {
     cudf::jni::auto_set_device(env);
-    auto cs = reinterpret_cast<cudf::chunked_pack *>(chunked_pack);
+    auto cs = reinterpret_cast<cudf::chunked_pack*>(chunked_pack);
     return cs->has_next();
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ChunkedPack_chunkedPackNext(JNIEnv *env, jclass,
-                                                                        jlong chunked_pack,
-                                                                        jlong user_ptr,
-                                                                        jlong user_ptr_size) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ChunkedPack_chunkedPackNext(
+  JNIEnv* env, jclass, jlong chunked_pack, jlong user_ptr, jlong user_ptr_size)
+{
   try {
     cudf::jni::auto_set_device(env);
-    auto cs = reinterpret_cast<cudf::chunked_pack *>(chunked_pack);
-    auto user_buffer_span = cudf::device_span<uint8_t>(reinterpret_cast<uint8_t *>(user_ptr),
+    auto cs               = reinterpret_cast<cudf::chunked_pack*>(chunked_pack);
+    auto user_buffer_span = cudf::device_span<uint8_t>(reinterpret_cast<uint8_t*>(user_ptr),
                                                        static_cast<std::size_t>(user_ptr_size));
     return cs->next(user_buffer_span);
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL
-Java_ai_rapids_cudf_ChunkedPack_chunkedPackBuildMetadata(JNIEnv *env, jclass, jlong chunked_pack) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ChunkedPack_chunkedPackBuildMetadata(JNIEnv* env,
+                                                                                 jclass,
+                                                                                 jlong chunked_pack)
+{
   try {
     cudf::jni::auto_set_device(env);
-    auto cs = reinterpret_cast<cudf::chunked_pack *>(chunked_pack);
+    auto cs = reinterpret_cast<cudf::chunked_pack*>(chunked_pack);
     std::unique_ptr<std::vector<uint8_t>> result = cs->build_metadata();
     return reinterpret_cast<jlong>(result.release());
   }
   CATCH_STD(env, 0);
 }
 
-} // extern "C"
+}  // extern "C"
diff --git a/java/src/main/native/src/ChunkedReaderJni.cpp b/java/src/main/native/src/ChunkedReaderJni.cpp
index 5ce23bbe712..7681008f584 100644
--- a/java/src/main/native/src/ChunkedReaderJni.cpp
+++ b/java/src/main/native/src/ChunkedReaderJni.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,20 +14,21 @@
  * limitations under the License.
  */
 
-#include <memory>
-#include <vector>
+#include "cudf_jni_apis.hpp"
+#include "jni_utils.hpp"
 
 #include <cudf/column/column.hpp>
 #include <cudf/io/parquet.hpp>
 #include <cudf/table/table.hpp>
 
-#include "cudf_jni_apis.hpp"
-#include "jni_utils.hpp"
+#include <memory>
+#include <vector>
 
 // This function is defined in `TableJni.cpp`.
-jlongArray
-cudf::jni::convert_table_for_return(JNIEnv *env, std::unique_ptr<cudf::table> &&table_result,
-                                    std::vector<std::unique_ptr<cudf::column>> &&extra_columns);
+jlongArray cudf::jni::convert_table_for_return(
+  JNIEnv* env,
+  std::unique_ptr<cudf::table>&& table_result,
+  std::vector<std::unique_ptr<cudf::column>>&& extra_columns);
 
 // This file is for the code related to chunked reader (Parquet, ORC, etc.).
 
@@ -35,18 +36,28 @@ extern "C" {
 
 // This function should take all the parameters that `Table.readParquet` takes,
 // plus one more parameter `long chunkSizeByteLimit`.
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ParquetChunkedReader_create(
-    JNIEnv *env, jclass, jlong chunk_read_limit, jlong pass_read_limit,
-    jobjectArray filter_col_names, jbooleanArray j_col_binary_read, jstring inp_file_path,
-    jlong buffer, jlong buffer_length, jint unit) {
+JNIEXPORT jlong JNICALL
+Java_ai_rapids_cudf_ParquetChunkedReader_create(JNIEnv* env,
+                                                jclass,
+                                                jlong chunk_read_limit,
+                                                jlong pass_read_limit,
+                                                jobjectArray filter_col_names,
+                                                jbooleanArray j_col_binary_read,
+                                                jstring inp_file_path,
+                                                jlong buffer,
+                                                jlong buffer_length,
+                                                jint unit)
+{
   JNI_NULL_CHECK(env, j_col_binary_read, "Null col_binary_read", 0);
   bool read_buffer = true;
   if (buffer == 0) {
     JNI_NULL_CHECK(env, inp_file_path, "Input file or buffer must be supplied", 0);
     read_buffer = false;
   } else if (inp_file_path != nullptr) {
-    JNI_THROW_NEW(env, "java/lang/IllegalArgumentException",
-                  "Cannot pass in both a buffer and an inp_file_path", 0);
+    JNI_THROW_NEW(env,
+                  "java/lang/IllegalArgumentException",
+                  "Cannot pass in both a buffer and an inp_file_path",
+                  0);
   } else if (buffer_length <= 0) {
     JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "An empty buffer is not supported", 0);
   }
@@ -66,29 +77,35 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ParquetChunkedReader_create(
     cudf::jni::native_jbooleanArray n_col_binary_read(env, j_col_binary_read);
     (void)n_col_binary_read;
 
-    auto const source = read_buffer ?
-                            cudf::io::source_info(reinterpret_cast<char *>(buffer),
-                                                  static_cast<std::size_t>(buffer_length)) :
-                            cudf::io::source_info(filename.get());
+    auto const source = read_buffer ? cudf::io::source_info(reinterpret_cast<char*>(buffer),
+                                                            static_cast<std::size_t>(buffer_length))
+                                    : cudf::io::source_info(filename.get());
 
     auto opts_builder = cudf::io::parquet_reader_options::builder(source);
     if (n_filter_col_names.size() > 0) {
       opts_builder = opts_builder.columns(n_filter_col_names.as_cpp_vector());
     }
     auto const read_opts = opts_builder.convert_strings_to_categories(false)
-                               .timestamp_type(cudf::data_type(static_cast<cudf::type_id>(unit)))
-                               .build();
+                             .timestamp_type(cudf::data_type(static_cast<cudf::type_id>(unit)))
+                             .build();
 
     return reinterpret_cast<jlong>(
-        new cudf::io::chunked_parquet_reader(static_cast<std::size_t>(chunk_read_limit),
-                                             static_cast<std::size_t>(pass_read_limit), read_opts));
+      new cudf::io::chunked_parquet_reader(static_cast<std::size_t>(chunk_read_limit),
+                                           static_cast<std::size_t>(pass_read_limit),
+                                           read_opts));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ParquetChunkedReader_createWithDataSource(
-    JNIEnv *env, jclass, jlong chunk_read_limit, jobjectArray filter_col_names,
-    jbooleanArray j_col_binary_read, jint unit, jlong ds_handle) {
+JNIEXPORT jlong JNICALL
+Java_ai_rapids_cudf_ParquetChunkedReader_createWithDataSource(JNIEnv* env,
+                                                              jclass,
+                                                              jlong chunk_read_limit,
+                                                              jobjectArray filter_col_names,
+                                                              jbooleanArray j_col_binary_read,
+                                                              jint unit,
+                                                              jlong ds_handle)
+{
   JNI_NULL_CHECK(env, j_col_binary_read, "Null col_binary_read", 0);
   JNI_NULL_CHECK(env, ds_handle, "Null DataSouurce", 0);
 
@@ -103,7 +120,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ParquetChunkedReader_createWithDataS
     cudf::jni::native_jbooleanArray n_col_binary_read(env, j_col_binary_read);
     (void)n_col_binary_read;
 
-    auto ds = reinterpret_cast<cudf::io::datasource *>(ds_handle);
+    auto ds = reinterpret_cast<cudf::io::datasource*>(ds_handle);
     cudf::io::source_info source{ds};
 
     auto opts_builder = cudf::io::parquet_reader_options::builder(source);
@@ -111,49 +128,55 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ParquetChunkedReader_createWithDataS
       opts_builder = opts_builder.columns(n_filter_col_names.as_cpp_vector());
     }
     auto const read_opts = opts_builder.convert_strings_to_categories(false)
-                               .timestamp_type(cudf::data_type(static_cast<cudf::type_id>(unit)))
-                               .build();
+                             .timestamp_type(cudf::data_type(static_cast<cudf::type_id>(unit)))
+                             .build();
 
-    return reinterpret_cast<jlong>(new cudf::io::chunked_parquet_reader(
-        static_cast<std::size_t>(chunk_read_limit), read_opts));
+    return reinterpret_cast<jlong>(
+      new cudf::io::chunked_parquet_reader(static_cast<std::size_t>(chunk_read_limit), read_opts));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jboolean JNICALL Java_ai_rapids_cudf_ParquetChunkedReader_hasNext(JNIEnv *env, jclass,
-                                                                            jlong handle) {
+JNIEXPORT jboolean JNICALL Java_ai_rapids_cudf_ParquetChunkedReader_hasNext(JNIEnv* env,
+                                                                            jclass,
+                                                                            jlong handle)
+{
   JNI_NULL_CHECK(env, handle, "handle is null", false);
 
   try {
     cudf::jni::auto_set_device(env);
-    auto const reader_ptr = reinterpret_cast<cudf::io::chunked_parquet_reader *const>(handle);
+    auto const reader_ptr = reinterpret_cast<cudf::io::chunked_parquet_reader* const>(handle);
     return reader_ptr->has_next();
   }
   CATCH_STD(env, false);
 }
 
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ParquetChunkedReader_readChunk(JNIEnv *env, jclass,
-                                                                                jlong handle) {
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ParquetChunkedReader_readChunk(JNIEnv* env,
+                                                                                jclass,
+                                                                                jlong handle)
+{
   JNI_NULL_CHECK(env, handle, "handle is null", 0);
 
   try {
     cudf::jni::auto_set_device(env);
-    auto const reader_ptr = reinterpret_cast<cudf::io::chunked_parquet_reader *const>(handle);
-    auto chunk = reader_ptr->read_chunk();
+    auto const reader_ptr = reinterpret_cast<cudf::io::chunked_parquet_reader* const>(handle);
+    auto chunk            = reader_ptr->read_chunk();
     return chunk.tbl ? cudf::jni::convert_table_for_return(env, chunk.tbl) : nullptr;
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_ParquetChunkedReader_close(JNIEnv *env, jclass,
-                                                                      jlong handle) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_ParquetChunkedReader_close(JNIEnv* env,
+                                                                      jclass,
+                                                                      jlong handle)
+{
   JNI_NULL_CHECK(env, handle, "handle is null", );
 
   try {
     cudf::jni::auto_set_device(env);
-    delete reinterpret_cast<cudf::io::chunked_parquet_reader *>(handle);
+    delete reinterpret_cast<cudf::io::chunked_parquet_reader*>(handle);
   }
   CATCH_STD(env, );
 }
 
-} // extern "C"
+}  // extern "C"
diff --git a/java/src/main/native/src/ColumnVectorJni.cpp b/java/src/main/native/src/ColumnVectorJni.cpp
index e8a89f82a13..30a04e37d2c 100644
--- a/java/src/main/native/src/ColumnVectorJni.cpp
+++ b/java/src/main/native/src/ColumnVectorJni.cpp
@@ -14,9 +14,10 @@
  * limitations under the License.
  */
 
-#include <algorithm>
+#include "cudf_jni_apis.hpp"
+#include "dtype_utils.hpp"
+#include "jni_utils.hpp"
 
-#include <arrow/api.h>
 #include <cudf/column/column_factories.hpp>
 #include <cudf/concatenate.hpp>
 #include <cudf/copying.hpp>
@@ -33,90 +34,96 @@
 #include <cudf/strings/combine.hpp>
 #include <cudf/structs/structs_column_view.hpp>
 #include <cudf/utilities/bit.hpp>
+
 #include <rmm/mr/device/per_device_resource.hpp>
 
-#include "cudf_jni_apis.hpp"
-#include "dtype_utils.hpp"
-#include "jni_utils.hpp"
+#include <arrow/api.h>
+
+#include <algorithm>
 
 using cudf::jni::ptr_as_jlong;
 using cudf::jni::release_as_jlong;
 
 extern "C" {
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_sequence(JNIEnv *env, jclass,
-                                                                  jlong j_initial_val, jlong j_step,
-                                                                  jint row_count) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_sequence(
+  JNIEnv* env, jclass, jlong j_initial_val, jlong j_step, jint row_count)
+{
   JNI_NULL_CHECK(env, j_initial_val, "scalar is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto initial_val = reinterpret_cast<cudf::scalar const *>(j_initial_val);
-    auto step = reinterpret_cast<cudf::scalar const *>(j_step);
-    return release_as_jlong(step ? cudf::sequence(row_count, *initial_val, *step) :
-                                   cudf::sequence(row_count, *initial_val));
+    auto initial_val = reinterpret_cast<cudf::scalar const*>(j_initial_val);
+    auto step        = reinterpret_cast<cudf::scalar const*>(j_step);
+    return release_as_jlong(step ? cudf::sequence(row_count, *initial_val, *step)
+                                 : cudf::sequence(row_count, *initial_val));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_sequences(JNIEnv *env, jclass,
-                                                                   jlong j_start_handle,
-                                                                   jlong j_size_handle,
-                                                                   jlong j_step_handle) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_sequences(
+  JNIEnv* env, jclass, jlong j_start_handle, jlong j_size_handle, jlong j_step_handle)
+{
   JNI_NULL_CHECK(env, j_start_handle, "start is null", 0);
   JNI_NULL_CHECK(env, j_size_handle, "size is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto start = reinterpret_cast<cudf::column_view const *>(j_start_handle);
-    auto size = reinterpret_cast<cudf::column_view const *>(j_size_handle);
-    auto step = reinterpret_cast<cudf::column_view const *>(j_step_handle);
+    auto start = reinterpret_cast<cudf::column_view const*>(j_start_handle);
+    auto size  = reinterpret_cast<cudf::column_view const*>(j_size_handle);
+    auto step  = reinterpret_cast<cudf::column_view const*>(j_step_handle);
     auto ret =
-        step ? cudf::lists::sequences(*start, *step, *size) : cudf::lists::sequences(*start, *size);
+      step ? cudf::lists::sequences(*start, *step, *size) : cudf::lists::sequences(*start, *size);
     return release_as_jlong(ret);
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_fromArrow(
-    JNIEnv *env, jclass, jint j_type, jlong j_col_length, jlong j_null_count, jobject j_data_obj,
-    jobject j_validity_obj, jobject j_offsets_obj) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_fromArrow(JNIEnv* env,
+                                                                   jclass,
+                                                                   jint j_type,
+                                                                   jlong j_col_length,
+                                                                   jlong j_null_count,
+                                                                   jobject j_data_obj,
+                                                                   jobject j_validity_obj,
+                                                                   jobject j_offsets_obj)
+{
   try {
     cudf::jni::auto_set_device(env);
     cudf::type_id n_type = static_cast<cudf::type_id>(j_type);
     // not all the buffers are used for all types
-    void const *data_address = 0;
-    int data_length = 0;
+    void const* data_address = 0;
+    int data_length          = 0;
     if (j_data_obj != 0) {
       data_address = env->GetDirectBufferAddress(j_data_obj);
-      data_length = env->GetDirectBufferCapacity(j_data_obj);
+      data_length  = env->GetDirectBufferCapacity(j_data_obj);
     }
-    void const *validity_address = 0;
-    int validity_length = 0;
+    void const* validity_address = 0;
+    int validity_length          = 0;
     if (j_validity_obj != 0) {
       validity_address = env->GetDirectBufferAddress(j_validity_obj);
-      validity_length = env->GetDirectBufferCapacity(j_validity_obj);
+      validity_length  = env->GetDirectBufferCapacity(j_validity_obj);
     }
-    void const *offsets_address = 0;
-    int offsets_length = 0;
+    void const* offsets_address = 0;
+    int offsets_length          = 0;
     if (j_offsets_obj != 0) {
       offsets_address = env->GetDirectBufferAddress(j_offsets_obj);
-      offsets_length = env->GetDirectBufferCapacity(j_offsets_obj);
+      offsets_length  = env->GetDirectBufferCapacity(j_offsets_obj);
     }
     auto data_buffer =
-        arrow::Buffer::Wrap(static_cast<const char *>(data_address), static_cast<int>(data_length));
-    auto null_buffer = arrow::Buffer::Wrap(static_cast<const char *>(validity_address),
+      arrow::Buffer::Wrap(static_cast<const char*>(data_address), static_cast<int>(data_length));
+    auto null_buffer    = arrow::Buffer::Wrap(static_cast<const char*>(validity_address),
                                            static_cast<int>(validity_length));
-    auto offsets_buffer = arrow::Buffer::Wrap(static_cast<const char *>(offsets_address),
+    auto offsets_buffer = arrow::Buffer::Wrap(static_cast<const char*>(offsets_address),
                                               static_cast<int>(offsets_length));
 
     std::shared_ptr<arrow::Array> arrow_array;
     switch (n_type) {
       case cudf::type_id::DECIMAL32:
-        JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "Don't support converting DECIMAL32 yet",
-                      0);
+        JNI_THROW_NEW(
+          env, cudf::jni::ILLEGAL_ARG_CLASS, "Don't support converting DECIMAL32 yet", 0);
         break;
       case cudf::type_id::DECIMAL64:
-        JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "Don't support converting DECIMAL64 yet",
-                      0);
+        JNI_THROW_NEW(
+          env, cudf::jni::ILLEGAL_ARG_CLASS, "Don't support converting DECIMAL64 yet", 0);
         break;
       case cudf::type_id::STRUCT:
         JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "Don't support converting STRUCT yet", 0);
@@ -125,23 +132,23 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_fromArrow(
         JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "Don't support converting LIST yet", 0);
         break;
       case cudf::type_id::DICTIONARY32:
-        JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS,
-                      "Don't support converting DICTIONARY32 yet", 0);
+        JNI_THROW_NEW(
+          env, cudf::jni::ILLEGAL_ARG_CLASS, "Don't support converting DICTIONARY32 yet", 0);
         break;
       case cudf::type_id::STRING:
-        arrow_array = std::make_shared<arrow::StringArray>(j_col_length, offsets_buffer,
-                                                           data_buffer, null_buffer, j_null_count);
+        arrow_array = std::make_shared<arrow::StringArray>(
+          j_col_length, offsets_buffer, data_buffer, null_buffer, j_null_count);
         break;
       default:
         // this handles the primitive types
-        arrow_array = cudf::detail::to_arrow_array(n_type, j_col_length, data_buffer, null_buffer,
-                                                   j_null_count);
+        arrow_array = cudf::detail::to_arrow_array(
+          n_type, j_col_length, data_buffer, null_buffer, j_null_count);
     }
-    auto name_and_type = arrow::field("col", arrow_array->type());
+    auto name_and_type                                = arrow::field("col", arrow_array->type());
     std::vector<std::shared_ptr<arrow::Field>> fields = {name_and_type};
-    std::shared_ptr<arrow::Schema> schema = std::make_shared<arrow::Schema>(fields);
+    std::shared_ptr<arrow::Schema> schema             = std::make_shared<arrow::Schema>(fields);
     auto arrow_table =
-        arrow::Table::Make(schema, std::vector<std::shared_ptr<arrow::Array>>{arrow_array});
+      arrow::Table::Make(schema, std::vector<std::shared_ptr<arrow::Array>>{arrow_array});
     auto retCols = cudf::from_arrow(*(arrow_table))->release();
     if (retCols.size() != 1) {
       JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Must result in one column", 0);
@@ -151,135 +158,155 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_fromArrow(
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_stringConcatenation(
-    JNIEnv *env, jclass, jlongArray column_handles, jlong separator, jlong narep,
-    jboolean separate_nulls) {
+JNIEXPORT jlong JNICALL
+Java_ai_rapids_cudf_ColumnVector_stringConcatenation(JNIEnv* env,
+                                                     jclass,
+                                                     jlongArray column_handles,
+                                                     jlong separator,
+                                                     jlong narep,
+                                                     jboolean separate_nulls)
+{
   JNI_NULL_CHECK(env, column_handles, "array of column handles is null", 0);
   JNI_NULL_CHECK(env, separator, "separator string scalar object is null", 0);
   JNI_NULL_CHECK(env, narep, "narep string scalar object is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    const auto &separator_scalar = *reinterpret_cast<cudf::string_scalar *>(separator);
-    const auto &narep_scalar = *reinterpret_cast<cudf::string_scalar *>(narep);
-    auto null_policy = separate_nulls ? cudf::strings::separator_on_nulls::YES :
-                                        cudf::strings::separator_on_nulls::NO;
+    const auto& separator_scalar = *reinterpret_cast<cudf::string_scalar*>(separator);
+    const auto& narep_scalar     = *reinterpret_cast<cudf::string_scalar*>(narep);
+    auto null_policy             = separate_nulls ? cudf::strings::separator_on_nulls::YES
+                                                  : cudf::strings::separator_on_nulls::NO;
 
     cudf::jni::native_jpointerArray<cudf::column_view> n_cudf_columns(env, column_handles);
     auto column_views = n_cudf_columns.get_dereferenced();
     return release_as_jlong(cudf::strings::concatenate(
-        cudf::table_view(column_views), separator_scalar, narep_scalar, null_policy));
+      cudf::table_view(column_views), separator_scalar, narep_scalar, null_policy));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_stringConcatenationSepCol(
-    JNIEnv *env, jclass, jlongArray column_handles, jlong sep_handle, jlong separator_narep,
-    jlong col_narep, jboolean separate_nulls) {
+JNIEXPORT jlong JNICALL
+Java_ai_rapids_cudf_ColumnVector_stringConcatenationSepCol(JNIEnv* env,
+                                                           jclass,
+                                                           jlongArray column_handles,
+                                                           jlong sep_handle,
+                                                           jlong separator_narep,
+                                                           jlong col_narep,
+                                                           jboolean separate_nulls)
+{
   JNI_NULL_CHECK(env, column_handles, "array of column handles is null", 0);
   JNI_NULL_CHECK(env, sep_handle, "separator column handle is null", 0);
   JNI_NULL_CHECK(env, separator_narep, "separator narep string scalar object is null", 0);
   JNI_NULL_CHECK(env, col_narep, "column narep string scalar object is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    const auto &separator_narep_scalar = *reinterpret_cast<cudf::string_scalar *>(separator_narep);
-    const auto &col_narep_scalar = *reinterpret_cast<cudf::string_scalar *>(col_narep);
-    auto null_policy = separate_nulls ? cudf::strings::separator_on_nulls::YES :
-                                        cudf::strings::separator_on_nulls::NO;
+    const auto& separator_narep_scalar = *reinterpret_cast<cudf::string_scalar*>(separator_narep);
+    const auto& col_narep_scalar       = *reinterpret_cast<cudf::string_scalar*>(col_narep);
+    auto null_policy                   = separate_nulls ? cudf::strings::separator_on_nulls::YES
+                                                        : cudf::strings::separator_on_nulls::NO;
 
     cudf::jni::native_jpointerArray<cudf::column_view> n_cudf_columns(env, column_handles);
-    auto column_views = n_cudf_columns.get_dereferenced();
-    cudf::column_view *column = reinterpret_cast<cudf::column_view *>(sep_handle);
+    auto column_views         = n_cudf_columns.get_dereferenced();
+    cudf::column_view* column = reinterpret_cast<cudf::column_view*>(sep_handle);
     cudf::strings_column_view strings_column(*column);
     return release_as_jlong(cudf::strings::concatenate(cudf::table_view(column_views),
-                                                       strings_column, separator_narep_scalar,
-                                                       col_narep_scalar, null_policy));
+                                                       strings_column,
+                                                       separator_narep_scalar,
+                                                       col_narep_scalar,
+                                                       null_policy));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_concatListByRow(JNIEnv *env, jclass,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_concatListByRow(JNIEnv* env,
+                                                                         jclass,
                                                                          jlongArray column_handles,
-                                                                         jboolean ignore_null) {
+                                                                         jboolean ignore_null)
+{
   JNI_NULL_CHECK(env, column_handles, "array of column handles is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto null_policy = ignore_null ? cudf::lists::concatenate_null_policy::IGNORE :
-                                     cudf::lists::concatenate_null_policy::NULLIFY_OUTPUT_ROW;
+    auto null_policy = ignore_null ? cudf::lists::concatenate_null_policy::IGNORE
+                                   : cudf::lists::concatenate_null_policy::NULLIFY_OUTPUT_ROW;
 
     cudf::jni::native_jpointerArray<cudf::column_view> n_cudf_columns(env, column_handles);
     auto column_views = n_cudf_columns.get_dereferenced();
     return release_as_jlong(
-        cudf::lists::concatenate_rows(cudf::table_view(column_views), null_policy));
+      cudf::lists::concatenate_rows(cudf::table_view(column_views), null_policy));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_makeList(JNIEnv *env, jobject j_object,
-                                                                  jlongArray handles, jlong j_type,
-                                                                  jint scale, jlong row_count) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_makeList(
+  JNIEnv* env, jobject j_object, jlongArray handles, jlong j_type, jint scale, jlong row_count)
+{
   using ScalarType = cudf::scalar_type_t<cudf::size_type>;
   JNI_NULL_CHECK(env, handles, "native view handles are null", 0)
   try {
     cudf::jni::auto_set_device(env);
-    auto children = cudf::jni::native_jpointerArray<cudf::column_view>(env, handles);
+    auto children        = cudf::jni::native_jpointerArray<cudf::column_view>(env, handles);
     auto children_vector = children.get_dereferenced();
-    auto zero = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32));
+    auto zero            = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32));
     zero->set_valid_async(true);
-    static_cast<ScalarType *>(zero.get())->set_value(0);
+    static_cast<ScalarType*>(zero.get())->set_value(0);
 
     if (children.size() == 0) {
       // special case because cudf::interleave_columns does not support no columns
-      auto offsets = cudf::make_column_from_scalar(*zero, row_count + 1);
+      auto offsets                = cudf::make_column_from_scalar(*zero, row_count + 1);
       cudf::data_type n_data_type = cudf::jni::make_data_type(j_type, scale);
-      auto empty_col = cudf::make_empty_column(n_data_type);
+      auto empty_col              = cudf::make_empty_column(n_data_type);
       return release_as_jlong(cudf::make_lists_column(
-          row_count, std::move(offsets), std::move(empty_col), 0, rmm::device_buffer()));
+        row_count, std::move(offsets), std::move(empty_col), 0, rmm::device_buffer()));
     } else {
       auto count = cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32));
       count->set_valid_async(true);
-      static_cast<ScalarType *>(count.get())->set_value(children.size());
+      static_cast<ScalarType*>(count.get())->set_value(children.size());
 
       std::unique_ptr<cudf::column> offsets = cudf::sequence(row_count + 1, *zero, *count);
       auto data_col = cudf::interleave_columns(cudf::table_view(children_vector));
       return release_as_jlong(cudf::make_lists_column(
-          row_count, std::move(offsets), std::move(data_col), 0, rmm::device_buffer()));
+        row_count, std::move(offsets), std::move(data_col), 0, rmm::device_buffer()));
     }
   }
   CATCH_STD(env, 0);
 }
 
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_makeListFromOffsets(
-    JNIEnv *env, jobject j_object, jlong child_handle, jlong offsets_handle, jlong row_count) {
+  JNIEnv* env, jobject j_object, jlong child_handle, jlong offsets_handle, jlong row_count)
+{
   JNI_NULL_CHECK(env, child_handle, "child_handle is null", 0)
   JNI_NULL_CHECK(env, offsets_handle, "offsets_handle is null", 0)
   try {
     cudf::jni::auto_set_device(env);
-    auto const child_cv = reinterpret_cast<cudf::column_view const *>(child_handle);
-    auto const offsets_cv = reinterpret_cast<cudf::column_view const *>(offsets_handle);
+    auto const child_cv   = reinterpret_cast<cudf::column_view const*>(child_handle);
+    auto const offsets_cv = reinterpret_cast<cudf::column_view const*>(offsets_handle);
     CUDF_EXPECTS(offsets_cv->type().id() == cudf::type_id::INT32,
                  "Input offsets does not have type INT32.");
 
-    return release_as_jlong(cudf::make_lists_column(
-        static_cast<cudf::size_type>(row_count), std::make_unique<cudf::column>(*offsets_cv),
-        std::make_unique<cudf::column>(*child_cv), 0, {}));
+    return release_as_jlong(cudf::make_lists_column(static_cast<cudf::size_type>(row_count),
+                                                    std::make_unique<cudf::column>(*offsets_cv),
+                                                    std::make_unique<cudf::column>(*child_cv),
+                                                    0,
+                                                    {}));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_fromScalar(JNIEnv *env, jclass,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_fromScalar(JNIEnv* env,
+                                                                    jclass,
                                                                     jlong j_scalar,
-                                                                    jint row_count) {
+                                                                    jint row_count)
+{
   JNI_NULL_CHECK(env, j_scalar, "scalar is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto scalar_val = reinterpret_cast<cudf::scalar const *>(j_scalar);
+    auto scalar_val = reinterpret_cast<cudf::scalar const*>(j_scalar);
     if (scalar_val->type().id() == cudf::type_id::STRUCT && row_count == 0) {
       // Specialize the creation of empty struct column, since libcudf doesn't support it.
-      auto struct_scalar = reinterpret_cast<cudf::struct_scalar const *>(j_scalar);
-      auto children = cudf::empty_like(struct_scalar->view())->release();
-      auto mask_buffer = cudf::create_null_mask(0, cudf::mask_state::UNALLOCATED);
+      auto struct_scalar = reinterpret_cast<cudf::struct_scalar const*>(j_scalar);
+      auto children      = cudf::empty_like(struct_scalar->view())->release();
+      auto mask_buffer   = cudf::create_null_mask(0, cudf::mask_state::UNALLOCATED);
       return release_as_jlong(
-          cudf::make_structs_column(0, std::move(children), 0, std::move(mask_buffer)));
+        cudf::make_structs_column(0, std::move(children), 0, std::move(mask_buffer)));
     } else {
       return release_as_jlong(cudf::make_column_from_scalar(*scalar_val, row_count));
     }
@@ -287,31 +314,36 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_fromScalar(JNIEnv *env,
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_concatenate(JNIEnv *env, jclass clazz,
-                                                                     jlongArray column_handles) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_concatenate(JNIEnv* env,
+                                                                     jclass clazz,
+                                                                     jlongArray column_handles)
+{
   JNI_NULL_CHECK(env, column_handles, "input columns are null", 0);
   using cudf::column;
   using cudf::column_view;
   try {
     cudf::jni::auto_set_device(env);
     auto columns =
-        cudf::jni::native_jpointerArray<column_view>{env, column_handles}.get_dereferenced();
+      cudf::jni::native_jpointerArray<column_view>{env, column_handles}.get_dereferenced();
     auto const is_lists_column = columns[0].type().id() == cudf::type_id::LIST;
     return release_as_jlong(
-        is_lists_column ? cudf::lists::detail::concatenate(columns, cudf::get_default_stream(),
-                                                           rmm::mr::get_current_device_resource()) :
-                          cudf::concatenate(columns));
+      is_lists_column
+        ? cudf::lists::detail::concatenate(
+            columns, cudf::get_default_stream(), rmm::mr::get_current_device_resource())
+        : cudf::concatenate(columns));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_md5(JNIEnv *env, jobject j_object,
-                                                             jlongArray column_handles) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_md5(JNIEnv* env,
+                                                             jobject j_object,
+                                                             jlongArray column_handles)
+{
   JNI_NULL_CHECK(env, column_handles, "array of column handles is null", 0);
 
   try {
     auto column_views =
-        cudf::jni::native_jpointerArray<cudf::column_view>{env, column_handles}.get_dereferenced();
+      cudf::jni::native_jpointerArray<cudf::column_view>{env, column_handles}.get_dereferenced();
     return release_as_jlong(cudf::hashing::md5(cudf::table_view{column_views}));
   }
   CATCH_STD(env, 0);
@@ -323,46 +355,50 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_md5(JNIEnv *env, jobjec
 // only be called from the CudfColumn child class.
 ////////
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_ColumnVector_deleteCudfColumn(JNIEnv *env,
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_ColumnVector_deleteCudfColumn(JNIEnv* env,
                                                                          jobject j_object,
-                                                                         jlong handle) {
+                                                                         jlong handle)
+{
   JNI_NULL_CHECK(env, handle, "column handle is null", );
   try {
     cudf::jni::auto_set_device(env);
-    delete reinterpret_cast<cudf::column *>(handle);
+    delete reinterpret_cast<cudf::column*>(handle);
   }
   CATCH_STD(env, )
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_ColumnVector_setNativeNullCountColumn(JNIEnv *env,
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_ColumnVector_setNativeNullCountColumn(JNIEnv* env,
                                                                                  jobject j_object,
                                                                                  jlong handle,
-                                                                                 jint null_count) {
+                                                                                 jint null_count)
+{
   JNI_NULL_CHECK(env, handle, "native handle is null", );
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column *column = reinterpret_cast<cudf::column *>(handle);
+    cudf::column* column = reinterpret_cast<cudf::column*>(handle);
     column->set_null_count(null_count);
   }
   CATCH_STD(env, );
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_getNativeColumnView(JNIEnv *env,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_getNativeColumnView(JNIEnv* env,
                                                                              jobject j_object,
-                                                                             jlong handle) {
+                                                                             jlong handle)
+{
   JNI_NULL_CHECK(env, handle, "native handle is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column *column = reinterpret_cast<cudf::column *>(handle);
+    cudf::column* column = reinterpret_cast<cudf::column*>(handle);
     return ptr_as_jlong(new cudf::column_view{*column});
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_makeEmptyCudfColumn(JNIEnv *env, jclass,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_makeEmptyCudfColumn(JNIEnv* env,
+                                                                             jclass,
                                                                              jint j_type,
-                                                                             jint scale) {
-
+                                                                             jint scale)
+{
   try {
     cudf::jni::auto_set_device(env);
     cudf::data_type n_data_type = cudf::jni::make_data_type(j_type, scale);
@@ -371,15 +407,16 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_makeEmptyCudfColumn(JNI
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jint JNICALL Java_ai_rapids_cudf_ColumnVector_getNativeNullCountColumn(JNIEnv *env,
+JNIEXPORT jint JNICALL Java_ai_rapids_cudf_ColumnVector_getNativeNullCountColumn(JNIEnv* env,
                                                                                  jobject j_object,
-                                                                                 jlong handle) {
+                                                                                 jlong handle)
+{
   JNI_NULL_CHECK(env, handle, "native handle is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column *column = reinterpret_cast<cudf::column *>(handle);
+    cudf::column* column = reinterpret_cast<cudf::column*>(handle);
     return static_cast<jint>(column->null_count());
   }
   CATCH_STD(env, 0);
 }
-} // extern "C"
+}  // extern "C"
diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp
index dd3859a4160..086d4672788 100644
--- a/java/src/main/native/src/ColumnViewJni.cpp
+++ b/java/src/main/native/src/ColumnViewJni.cpp
@@ -15,9 +15,11 @@
  */
 
 #include "ColumnViewJni.hpp"
-#include <numeric>
 
-#include <jni.h>
+#include "cudf_jni_apis.hpp"
+#include "dtype_utils.hpp"
+#include "jni_utils.hpp"
+#include "maps_column_view.hpp"
 
 #include <cudf/aggregation.hpp>
 #include <cudf/binaryop.hpp>
@@ -81,17 +83,17 @@
 #include <cudf/utilities/bit.hpp>
 #include <cudf/utilities/default_stream.hpp>
 
-#include "cudf_jni_apis.hpp"
-#include "dtype_utils.hpp"
-#include "jni_utils.hpp"
-#include "maps_column_view.hpp"
+#include <jni.h>
+
+#include <numeric>
 
 using cudf::jni::ptr_as_jlong;
 using cudf::jni::release_as_jlong;
 
 namespace {
 
-std::size_t pad_size(std::size_t size, bool const should_pad_for_cpu) {
+std::size_t pad_size(std::size_t size, bool const should_pad_for_cpu)
+{
   if (should_pad_for_cpu) {
     constexpr std::size_t ALIGN = sizeof(std::max_align_t);
     return (size + (ALIGN - 1)) & ~(ALIGN - 1);
@@ -100,9 +102,10 @@ std::size_t pad_size(std::size_t size, bool const should_pad_for_cpu) {
   }
 }
 
-std::size_t calc_device_memory_size(cudf::column_view const &view, bool const pad_for_cpu) {
+std::size_t calc_device_memory_size(cudf::column_view const& view, bool const pad_for_cpu)
+{
   std::size_t total = 0;
-  auto row_count = view.size();
+  auto row_count    = view.size();
 
   if (view.nullable()) {
     total += pad_size(cudf::bitmask_allocation_size_bytes(row_count), pad_for_cpu);
@@ -116,249 +119,274 @@ std::size_t calc_device_memory_size(cudf::column_view const &view, bool const pa
     total += pad_size(scv.chars_size(cudf::get_default_stream()), pad_for_cpu);
   }
 
-  return std::accumulate(view.child_begin(), view.child_end(), total,
-                         [pad_for_cpu](std::size_t t, cudf::column_view const &v) {
+  return std::accumulate(view.child_begin(),
+                         view.child_end(),
+                         total,
+                         [pad_for_cpu](std::size_t t, cudf::column_view const& v) {
                            return t + calc_device_memory_size(v, pad_for_cpu);
                          });
 }
 
-} // anonymous namespace
+}  // anonymous namespace
 
 extern "C" {
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_upperStrings(JNIEnv *env, jobject j_object,
-                                                                    jlong handle) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_upperStrings(JNIEnv* env,
+                                                                    jobject j_object,
+                                                                    jlong handle)
+{
   JNI_NULL_CHECK(env, handle, "column is null", 0);
 
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column_view *column = reinterpret_cast<cudf::column_view *>(handle);
+    cudf::column_view* column = reinterpret_cast<cudf::column_view*>(handle);
     cudf::strings_column_view strings_column(*column);
     return release_as_jlong(cudf::strings::to_upper(strings_column));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_lowerStrings(JNIEnv *env, jobject j_object,
-                                                                    jlong handle) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_lowerStrings(JNIEnv* env,
+                                                                    jobject j_object,
+                                                                    jlong handle)
+{
   JNI_NULL_CHECK(env, handle, "column is null", 0);
 
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column_view *column = reinterpret_cast<cudf::column_view *>(handle);
+    cudf::column_view* column = reinterpret_cast<cudf::column_view*>(handle);
     cudf::strings_column_view strings_column(*column);
     return release_as_jlong(cudf::strings::to_lower(strings_column));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_replaceNullsScalar(JNIEnv *env, jclass,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_replaceNullsScalar(JNIEnv* env,
+                                                                          jclass,
                                                                           jlong j_col,
-                                                                          jlong j_scalar) {
+                                                                          jlong j_scalar)
+{
   JNI_NULL_CHECK(env, j_col, "column is null", 0);
   JNI_NULL_CHECK(env, j_scalar, "scalar is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column_view col = *reinterpret_cast<cudf::column_view *>(j_col);
-    auto val = reinterpret_cast<cudf::scalar *>(j_scalar);
+    cudf::column_view col = *reinterpret_cast<cudf::column_view*>(j_col);
+    auto val              = reinterpret_cast<cudf::scalar*>(j_scalar);
     return release_as_jlong(cudf::replace_nulls(col, *val));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_replaceNullsColumn(JNIEnv *env, jclass,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_replaceNullsColumn(JNIEnv* env,
+                                                                          jclass,
                                                                           jlong j_col,
-                                                                          jlong j_replace_col) {
+                                                                          jlong j_replace_col)
+{
   JNI_NULL_CHECK(env, j_col, "column is null", 0);
   JNI_NULL_CHECK(env, j_replace_col, "replacement column is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto col = reinterpret_cast<cudf::column_view *>(j_col);
-    auto replacements = reinterpret_cast<cudf::column_view *>(j_replace_col);
+    auto col          = reinterpret_cast<cudf::column_view*>(j_col);
+    auto replacements = reinterpret_cast<cudf::column_view*>(j_replace_col);
     return release_as_jlong(cudf::replace_nulls(*col, *replacements));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_replaceNullsPolicy(JNIEnv *env, jclass,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_replaceNullsPolicy(JNIEnv* env,
+                                                                          jclass,
                                                                           jlong j_col,
-                                                                          jboolean is_preceding) {
+                                                                          jboolean is_preceding)
+{
   JNI_NULL_CHECK(env, j_col, "column is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column_view col = *reinterpret_cast<cudf::column_view *>(j_col);
+    cudf::column_view col = *reinterpret_cast<cudf::column_view*>(j_col);
     return release_as_jlong(cudf::replace_nulls(
-        col, is_preceding ? cudf::replace_policy::PRECEDING : cudf::replace_policy::FOLLOWING));
+      col, is_preceding ? cudf::replace_policy::PRECEDING : cudf::replace_policy::FOLLOWING));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jint JNICALL Java_ai_rapids_cudf_ColumnView_distinctCount(JNIEnv *env, jclass,
+JNIEXPORT jint JNICALL Java_ai_rapids_cudf_ColumnView_distinctCount(JNIEnv* env,
+                                                                    jclass,
                                                                     jlong j_col,
-                                                                    jboolean nulls_included) {
+                                                                    jboolean nulls_included)
+{
   JNI_NULL_CHECK(env, j_col, "column is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column_view col = *reinterpret_cast<cudf::column_view *>(j_col);
+    cudf::column_view col = *reinterpret_cast<cudf::column_view*>(j_col);
 
     return cudf::distinct_count(
-        col, nulls_included ? cudf::null_policy::INCLUDE : cudf::null_policy::EXCLUDE,
-        cudf::nan_policy::NAN_IS_VALID);
+      col,
+      nulls_included ? cudf::null_policy::INCLUDE : cudf::null_policy::EXCLUDE,
+      cudf::nan_policy::NAN_IS_VALID);
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_ifElseVV(JNIEnv *env, jclass,
-                                                                jlong j_pred_vec, jlong j_true_vec,
-                                                                jlong j_false_vec) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_ifElseVV(
+  JNIEnv* env, jclass, jlong j_pred_vec, jlong j_true_vec, jlong j_false_vec)
+{
   JNI_NULL_CHECK(env, j_pred_vec, "predicate column is null", 0);
   JNI_NULL_CHECK(env, j_true_vec, "true column is null", 0);
   JNI_NULL_CHECK(env, j_false_vec, "false column is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto pred_vec = reinterpret_cast<cudf::column_view *>(j_pred_vec);
-    auto true_vec = reinterpret_cast<cudf::column_view *>(j_true_vec);
-    auto false_vec = reinterpret_cast<cudf::column_view *>(j_false_vec);
+    auto pred_vec  = reinterpret_cast<cudf::column_view*>(j_pred_vec);
+    auto true_vec  = reinterpret_cast<cudf::column_view*>(j_true_vec);
+    auto false_vec = reinterpret_cast<cudf::column_view*>(j_false_vec);
     return release_as_jlong(cudf::copy_if_else(*true_vec, *false_vec, *pred_vec));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_ifElseVS(JNIEnv *env, jclass,
-                                                                jlong j_pred_vec, jlong j_true_vec,
-                                                                jlong j_false_scalar) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_ifElseVS(
+  JNIEnv* env, jclass, jlong j_pred_vec, jlong j_true_vec, jlong j_false_scalar)
+{
   JNI_NULL_CHECK(env, j_pred_vec, "predicate column is null", 0);
   JNI_NULL_CHECK(env, j_true_vec, "true column is null", 0);
   JNI_NULL_CHECK(env, j_false_scalar, "false scalar is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto pred_vec = reinterpret_cast<cudf::column_view *>(j_pred_vec);
-    auto true_vec = reinterpret_cast<cudf::column_view *>(j_true_vec);
-    auto false_scalar = reinterpret_cast<cudf::scalar *>(j_false_scalar);
+    auto pred_vec     = reinterpret_cast<cudf::column_view*>(j_pred_vec);
+    auto true_vec     = reinterpret_cast<cudf::column_view*>(j_true_vec);
+    auto false_scalar = reinterpret_cast<cudf::scalar*>(j_false_scalar);
     return release_as_jlong(cudf::copy_if_else(*true_vec, *false_scalar, *pred_vec));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_ifElseSV(JNIEnv *env, jclass,
-                                                                jlong j_pred_vec,
-                                                                jlong j_true_scalar,
-                                                                jlong j_false_vec) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_ifElseSV(
+  JNIEnv* env, jclass, jlong j_pred_vec, jlong j_true_scalar, jlong j_false_vec)
+{
   JNI_NULL_CHECK(env, j_pred_vec, "predicate column is null", 0);
   JNI_NULL_CHECK(env, j_true_scalar, "true scalar is null", 0);
   JNI_NULL_CHECK(env, j_false_vec, "false column is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto pred_vec = reinterpret_cast<cudf::column_view *>(j_pred_vec);
-    auto true_scalar = reinterpret_cast<cudf::scalar *>(j_true_scalar);
-    auto false_vec = reinterpret_cast<cudf::column_view *>(j_false_vec);
+    auto pred_vec    = reinterpret_cast<cudf::column_view*>(j_pred_vec);
+    auto true_scalar = reinterpret_cast<cudf::scalar*>(j_true_scalar);
+    auto false_vec   = reinterpret_cast<cudf::column_view*>(j_false_vec);
     return release_as_jlong(cudf::copy_if_else(*true_scalar, *false_vec, *pred_vec));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_ifElseSS(JNIEnv *env, jclass,
-                                                                jlong j_pred_vec,
-                                                                jlong j_true_scalar,
-                                                                jlong j_false_scalar) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_ifElseSS(
+  JNIEnv* env, jclass, jlong j_pred_vec, jlong j_true_scalar, jlong j_false_scalar)
+{
   JNI_NULL_CHECK(env, j_pred_vec, "predicate column is null", 0);
   JNI_NULL_CHECK(env, j_true_scalar, "true scalar is null", 0);
   JNI_NULL_CHECK(env, j_false_scalar, "false scalar is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto pred_vec = reinterpret_cast<cudf::column_view *>(j_pred_vec);
-    auto true_scalar = reinterpret_cast<cudf::scalar *>(j_true_scalar);
-    auto false_scalar = reinterpret_cast<cudf::scalar *>(j_false_scalar);
+    auto pred_vec     = reinterpret_cast<cudf::column_view*>(j_pred_vec);
+    auto true_scalar  = reinterpret_cast<cudf::scalar*>(j_true_scalar);
+    auto false_scalar = reinterpret_cast<cudf::scalar*>(j_false_scalar);
     return release_as_jlong(cudf::copy_if_else(*true_scalar, *false_scalar, *pred_vec));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getElement(JNIEnv *env, jclass, jlong from,
-                                                                  jint index) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getElement(JNIEnv* env,
+                                                                  jclass,
+                                                                  jlong from,
+                                                                  jint index)
+{
   JNI_NULL_CHECK(env, from, "from column is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto from_vec = reinterpret_cast<cudf::column_view *>(from);
+    auto from_vec = reinterpret_cast<cudf::column_view*>(from);
     return release_as_jlong(cudf::get_element(*from_vec, index));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_reduce(JNIEnv *env, jclass, jlong j_col_view,
-                                                              jlong j_agg, jint j_dtype,
-                                                              jint scale) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_reduce(
+  JNIEnv* env, jclass, jlong j_col_view, jlong j_agg, jint j_dtype, jint scale)
+{
   JNI_NULL_CHECK(env, j_col_view, "column view is null", 0);
   JNI_NULL_CHECK(env, j_agg, "aggregation is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto col = reinterpret_cast<cudf::column_view *>(j_col_view);
-    auto agg = reinterpret_cast<cudf::aggregation *>(j_agg);
+    auto col                  = reinterpret_cast<cudf::column_view*>(j_col_view);
+    auto agg                  = reinterpret_cast<cudf::aggregation*>(j_agg);
     cudf::data_type out_dtype = cudf::jni::make_data_type(j_dtype, scale);
     return release_as_jlong(
-        cudf::reduce(*col, *dynamic_cast<cudf::reduce_aggregation *>(agg), out_dtype));
+      cudf::reduce(*col, *dynamic_cast<cudf::reduce_aggregation*>(agg), out_dtype));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_segmentedReduce(
-    JNIEnv *env, jclass, jlong j_data_view, jlong j_offsets_view, jlong j_agg,
-    jboolean include_nulls, jint j_dtype, jint scale) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_segmentedReduce(JNIEnv* env,
+                                                                       jclass,
+                                                                       jlong j_data_view,
+                                                                       jlong j_offsets_view,
+                                                                       jlong j_agg,
+                                                                       jboolean include_nulls,
+                                                                       jint j_dtype,
+                                                                       jint scale)
+{
   JNI_NULL_CHECK(env, j_data_view, "data column view is null", 0);
   JNI_NULL_CHECK(env, j_offsets_view, "offsets column view is null", 0);
   JNI_NULL_CHECK(env, j_agg, "aggregation is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto data = reinterpret_cast<cudf::column_view *>(j_data_view);
-    auto offsets = reinterpret_cast<cudf::column_view *>(j_offsets_view);
-    auto agg = reinterpret_cast<cudf::aggregation *>(j_agg);
-    auto s_agg = dynamic_cast<cudf::segmented_reduce_aggregation *>(agg);
+    auto data    = reinterpret_cast<cudf::column_view*>(j_data_view);
+    auto offsets = reinterpret_cast<cudf::column_view*>(j_offsets_view);
+    auto agg     = reinterpret_cast<cudf::aggregation*>(j_agg);
+    auto s_agg   = dynamic_cast<cudf::segmented_reduce_aggregation*>(agg);
     JNI_ARG_CHECK(env, s_agg != nullptr, "agg is not a cudf::segmented_reduce_aggregation", 0)
     auto null_policy = include_nulls ? cudf::null_policy::INCLUDE : cudf::null_policy::EXCLUDE;
     cudf::data_type out_dtype = cudf::jni::make_data_type(j_dtype, scale);
     return release_as_jlong(
-        cudf::segmented_reduce(*data, *offsets, *s_agg, out_dtype, null_policy));
+      cudf::segmented_reduce(*data, *offsets, *s_agg, out_dtype, null_policy));
   }
   CATCH_STD(env, 0);
 }
 
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_segmentedGather(
-    JNIEnv *env, jclass, jlong source_column, jlong gather_map_list, jboolean nullify_out_bounds) {
+  JNIEnv* env, jclass, jlong source_column, jlong gather_map_list, jboolean nullify_out_bounds)
+{
   JNI_NULL_CHECK(env, source_column, "source column view is null", 0);
   JNI_NULL_CHECK(env, gather_map_list, "gather map is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto const &src_col =
-        cudf::lists_column_view(*reinterpret_cast<cudf::column_view *>(source_column));
-    auto const &gather_map =
-        cudf::lists_column_view(*reinterpret_cast<cudf::column_view *>(gather_map_list));
-    auto out_bounds_policy = nullify_out_bounds ? cudf::out_of_bounds_policy::NULLIFY :
-                                                  cudf::out_of_bounds_policy::DONT_CHECK;
+    auto const& src_col =
+      cudf::lists_column_view(*reinterpret_cast<cudf::column_view*>(source_column));
+    auto const& gather_map =
+      cudf::lists_column_view(*reinterpret_cast<cudf::column_view*>(gather_map_list));
+    auto out_bounds_policy = nullify_out_bounds ? cudf::out_of_bounds_policy::NULLIFY
+                                                : cudf::out_of_bounds_policy::DONT_CHECK;
     return release_as_jlong(cudf::lists::segmented_gather(src_col, gather_map, out_bounds_policy));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_scan(JNIEnv *env, jclass, jlong j_col_view,
-                                                            jlong j_agg, jboolean is_inclusive,
-                                                            jboolean include_nulls) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_scan(
+  JNIEnv* env, jclass, jlong j_col_view, jlong j_agg, jboolean is_inclusive, jboolean include_nulls)
+{
   JNI_NULL_CHECK(env, j_col_view, "column view is null", 0);
   JNI_NULL_CHECK(env, j_agg, "aggregation is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto col = reinterpret_cast<cudf::column_view *>(j_col_view);
-    auto agg = reinterpret_cast<cudf::aggregation *>(j_agg);
-    auto scan_type = is_inclusive ? cudf::scan_type::INCLUSIVE : cudf::scan_type::EXCLUSIVE;
+    auto col         = reinterpret_cast<cudf::column_view*>(j_col_view);
+    auto agg         = reinterpret_cast<cudf::aggregation*>(j_agg);
+    auto scan_type   = is_inclusive ? cudf::scan_type::INCLUSIVE : cudf::scan_type::EXCLUSIVE;
     auto null_policy = include_nulls ? cudf::null_policy::INCLUDE : cudf::null_policy::EXCLUDE;
     return release_as_jlong(
-        cudf::scan(*col, *dynamic_cast<cudf::scan_aggregation *>(agg), scan_type, null_policy));
+      cudf::scan(*col, *dynamic_cast<cudf::scan_aggregation*>(agg), scan_type, null_policy));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_approxPercentile(JNIEnv *env, jclass clazz,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_approxPercentile(JNIEnv* env,
+                                                                        jclass clazz,
                                                                         jlong input_column,
-                                                                        jlong percentiles_column) {
+                                                                        jlong percentiles_column)
+{
   JNI_NULL_CHECK(env, input_column, "input_column native handle is null", 0);
   JNI_NULL_CHECK(env, percentiles_column, "percentiles_column native handle is null", 0);
   try {
@@ -366,63 +394,70 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_approxPercentile(JNIEnv *
     using tdigest_column_view = cudf::tdigest::tdigest_column_view;
     jni::auto_set_device(env);
     auto const tdigest_view =
-        tdigest_column_view{structs_column_view{*reinterpret_cast<column_view *>(input_column)}};
-    auto const p_percentiles = reinterpret_cast<column_view *>(percentiles_column);
+      tdigest_column_view{structs_column_view{*reinterpret_cast<column_view*>(input_column)}};
+    auto const p_percentiles = reinterpret_cast<column_view*>(percentiles_column);
     return release_as_jlong(percentile_approx(tdigest_view, *p_percentiles));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_quantile(JNIEnv *env, jclass clazz,
-                                                                jlong input_column,
-                                                                jint quantile_method,
-                                                                jdoubleArray jquantiles) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_quantile(
+  JNIEnv* env, jclass clazz, jlong input_column, jint quantile_method, jdoubleArray jquantiles)
+{
   JNI_NULL_CHECK(env, input_column, "native handle is null", 0);
   try {
     cudf::jni::auto_set_device(env);
     cudf::jni::native_jdoubleArray native_quantiles(env, jquantiles);
     std::vector<double> quantiles(native_quantiles.data(),
                                   native_quantiles.data() + native_quantiles.size());
-    cudf::column_view *n_input_column = reinterpret_cast<cudf::column_view *>(input_column);
+    cudf::column_view* n_input_column     = reinterpret_cast<cudf::column_view*>(input_column);
     cudf::interpolation n_quantile_method = static_cast<cudf::interpolation>(quantile_method);
     return release_as_jlong(cudf::quantile(*n_input_column, quantiles, n_quantile_method));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_rollingWindow(
-    JNIEnv *env, jclass clazz, jlong input_col, jlong default_output_col, jint min_periods,
-    jlong agg_ptr, jint preceding, jint following, jlong preceding_col, jlong following_col) {
-
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_rollingWindow(JNIEnv* env,
+                                                                     jclass clazz,
+                                                                     jlong input_col,
+                                                                     jlong default_output_col,
+                                                                     jint min_periods,
+                                                                     jlong agg_ptr,
+                                                                     jint preceding,
+                                                                     jint following,
+                                                                     jlong preceding_col,
+                                                                     jlong following_col)
+{
   JNI_NULL_CHECK(env, input_col, "native handle is null", 0);
   JNI_NULL_CHECK(env, agg_ptr, "aggregation handle is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column_view *n_input_col = reinterpret_cast<cudf::column_view *>(input_col);
-    cudf::column_view *n_default_output_col =
-        reinterpret_cast<cudf::column_view *>(default_output_col);
-    cudf::column_view *n_preceding_col = reinterpret_cast<cudf::column_view *>(preceding_col);
-    cudf::column_view *n_following_col = reinterpret_cast<cudf::column_view *>(following_col);
-    cudf::rolling_aggregation *agg =
-        dynamic_cast<cudf::rolling_aggregation *>(reinterpret_cast<cudf::aggregation *>(agg_ptr));
+    cudf::column_view* n_input_col = reinterpret_cast<cudf::column_view*>(input_col);
+    cudf::column_view* n_default_output_col =
+      reinterpret_cast<cudf::column_view*>(default_output_col);
+    cudf::column_view* n_preceding_col = reinterpret_cast<cudf::column_view*>(preceding_col);
+    cudf::column_view* n_following_col = reinterpret_cast<cudf::column_view*>(following_col);
+    cudf::rolling_aggregation* agg =
+      dynamic_cast<cudf::rolling_aggregation*>(reinterpret_cast<cudf::aggregation*>(agg_ptr));
     JNI_ARG_CHECK(env, agg != nullptr, "aggregation is not an instance of rolling_aggregation", 0);
 
     std::unique_ptr<cudf::column> ret;
     if (n_default_output_col != nullptr) {
       if (n_preceding_col != nullptr && n_following_col != nullptr) {
-        CUDF_FAIL("A default output column is not currently supported with variable length "
-                  "preceding and following");
+        CUDF_FAIL(
+          "A default output column is not currently supported with variable length "
+          "preceding and following");
         // ret = cudf::rolling_window(*n_input_col, *n_default_output_col,
         //        *n_preceding_col, *n_following_col, min_periods, agg);
       } else {
-        ret = cudf::rolling_window(*n_input_col, *n_default_output_col, preceding, following,
-                                   min_periods, *agg);
+        ret = cudf::rolling_window(
+          *n_input_col, *n_default_output_col, preceding, following, min_periods, *agg);
       }
 
     } else {
       if (n_preceding_col != nullptr && n_following_col != nullptr) {
-        ret = cudf::rolling_window(*n_input_col, *n_preceding_col, *n_following_col, min_periods,
-                                   *agg);
+        ret =
+          cudf::rolling_window(*n_input_col, *n_preceding_col, *n_following_col, min_periods, *agg);
       } else {
         ret = cudf::rolling_window(*n_input_col, preceding, following, min_periods, *agg);
       }
@@ -432,301 +467,336 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_rollingWindow(
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_slice(JNIEnv *env, jclass clazz,
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_slice(JNIEnv* env,
+                                                                  jclass clazz,
                                                                   jlong input_column,
-                                                                  jintArray slice_indices) {
+                                                                  jintArray slice_indices)
+{
   JNI_NULL_CHECK(env, input_column, "native handle is null", 0);
   JNI_NULL_CHECK(env, slice_indices, "slice indices are null", 0);
 
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column_view *n_column = reinterpret_cast<cudf::column_view *>(input_column);
+    cudf::column_view* n_column = reinterpret_cast<cudf::column_view*>(input_column);
     cudf::jni::native_jintArray n_slice_indices(env, slice_indices);
     std::vector<cudf::size_type> indices(n_slice_indices.begin(), n_slice_indices.end());
 
     std::vector<cudf::column_view> result = cudf::slice(*n_column, indices);
     cudf::jni::native_jlongArray n_result(env, result.size());
 
-    std::transform(result.begin(), result.end(), n_result.begin(),
-                   [](cudf::column_view const &result_col) {
-                     return ptr_as_jlong(new cudf::column{result_col});
-                   });
+    std::transform(
+      result.begin(), result.end(), n_result.begin(), [](cudf::column_view const& result_col) {
+        return ptr_as_jlong(new cudf::column{result_col});
+      });
 
     return n_result.get_jArray();
   }
   CATCH_STD(env, NULL);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_extractListElement(JNIEnv *env, jclass,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_extractListElement(JNIEnv* env,
+                                                                          jclass,
                                                                           jlong column_view,
-                                                                          jint index) {
+                                                                          jint index)
+{
   JNI_NULL_CHECK(env, column_view, "column is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column_view *cv = reinterpret_cast<cudf::column_view *>(column_view);
+    cudf::column_view* cv = reinterpret_cast<cudf::column_view*>(column_view);
     cudf::lists_column_view lcv(*cv);
     return release_as_jlong(cudf::lists::extract_list_element(lcv, index));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_extractListElementV(JNIEnv *env, jclass,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_extractListElementV(JNIEnv* env,
+                                                                           jclass,
                                                                            jlong column_view,
-                                                                           jlong indices_view) {
+                                                                           jlong indices_view)
+{
   JNI_NULL_CHECK(env, column_view, "column is null", 0);
   JNI_NULL_CHECK(env, indices_view, "indices is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column_view *indices = reinterpret_cast<cudf::column_view *>(indices_view);
-    cudf::column_view *cv = reinterpret_cast<cudf::column_view *>(column_view);
+    cudf::column_view* indices = reinterpret_cast<cudf::column_view*>(indices_view);
+    cudf::column_view* cv      = reinterpret_cast<cudf::column_view*>(column_view);
     cudf::lists_column_view lcv(*cv);
     return release_as_jlong(cudf::lists::extract_list_element(lcv, *indices));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_dropListDuplicates(JNIEnv *env, jclass,
-                                                                          jlong column_view) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_dropListDuplicates(JNIEnv* env,
+                                                                          jclass,
+                                                                          jlong column_view)
+{
   JNI_NULL_CHECK(env, column_view, "column is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto const input_cv = reinterpret_cast<cudf::column_view const *>(column_view);
+    auto const input_cv = reinterpret_cast<cudf::column_view const*>(column_view);
     return release_as_jlong(cudf::lists::distinct(cudf::lists_column_view{*input_cv}));
   }
   CATCH_STD(env, 0);
 }
 
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_dropListDuplicatesWithKeysValues(
-    JNIEnv *env, jclass, jlong keys_vals_handle) {
+  JNIEnv* env, jclass, jlong keys_vals_handle)
+{
   JNI_NULL_CHECK(env, keys_vals_handle, "keys_vals_handle is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto const input_cv = reinterpret_cast<cudf::column_view const *>(keys_vals_handle);
-    JNI_ARG_CHECK(env, input_cv->type().id() == cudf::type_id::LIST,
-                  "Input column is not a lists column.", 0);
+    auto const input_cv = reinterpret_cast<cudf::column_view const*>(keys_vals_handle);
+    JNI_ARG_CHECK(
+      env, input_cv->type().id() == cudf::type_id::LIST, "Input column is not a lists column.", 0);
 
     auto const lists_keys_vals = cudf::lists_column_view(*input_cv);
-    auto const keys_vals = lists_keys_vals.child();
-    JNI_ARG_CHECK(env, keys_vals.type().id() == cudf::type_id::STRUCT,
-                  "Input column has child that is not a structs column.", 0);
-    JNI_ARG_CHECK(env, keys_vals.num_children() == 2,
-                  "Input column has child that does not have 2 children.", 0);
+    auto const keys_vals       = lists_keys_vals.child();
+    JNI_ARG_CHECK(env,
+                  keys_vals.type().id() == cudf::type_id::STRUCT,
+                  "Input column has child that is not a structs column.",
+                  0);
+    JNI_ARG_CHECK(env,
+                  keys_vals.num_children() == 2,
+                  "Input column has child that does not have 2 children.",
+                  0);
 
     return release_as_jlong(
-        cudf::jni::lists_distinct_by_key(lists_keys_vals, cudf::get_default_stream()));
+      cudf::jni::lists_distinct_by_key(lists_keys_vals, cudf::get_default_stream()));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_flattenLists(JNIEnv *env, jclass,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_flattenLists(JNIEnv* env,
+                                                                    jclass,
                                                                     jlong input_handle,
-                                                                    jboolean ignore_null) {
+                                                                    jboolean ignore_null)
+{
   JNI_NULL_CHECK(env, input_handle, "input_handle is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto const null_policy = ignore_null ? cudf::lists::concatenate_null_policy::IGNORE :
-                                           cudf::lists::concatenate_null_policy::NULLIFY_OUTPUT_ROW;
-    auto const input_cv = reinterpret_cast<cudf::column_view const *>(input_handle);
+    auto const null_policy = ignore_null ? cudf::lists::concatenate_null_policy::IGNORE
+                                         : cudf::lists::concatenate_null_policy::NULLIFY_OUTPUT_ROW;
+    auto const input_cv    = reinterpret_cast<cudf::column_view const*>(input_handle);
     return release_as_jlong(cudf::lists::concatenate_list_elements(*input_cv, null_policy));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listContains(JNIEnv *env, jclass,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listContains(JNIEnv* env,
+                                                                    jclass,
                                                                     jlong column_view,
-                                                                    jlong lookup_key) {
+                                                                    jlong lookup_key)
+{
   JNI_NULL_CHECK(env, column_view, "column is null", 0);
   JNI_NULL_CHECK(env, lookup_key, "lookup scalar is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column_view *cv = reinterpret_cast<cudf::column_view *>(column_view);
+    cudf::column_view* cv = reinterpret_cast<cudf::column_view*>(column_view);
     cudf::lists_column_view lcv(*cv);
-    cudf::scalar *lookup_scalar = reinterpret_cast<cudf::scalar *>(lookup_key);
+    cudf::scalar* lookup_scalar = reinterpret_cast<cudf::scalar*>(lookup_key);
     return release_as_jlong(cudf::lists::contains(lcv, *lookup_scalar));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listContainsNulls(JNIEnv *env, jclass,
-                                                                         jlong column_view) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listContainsNulls(JNIEnv* env,
+                                                                         jclass,
+                                                                         jlong column_view)
+{
   JNI_NULL_CHECK(env, column_view, "column is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto cv = reinterpret_cast<cudf::column_view *>(column_view);
+    auto cv  = reinterpret_cast<cudf::column_view*>(column_view);
     auto lcv = cudf::lists_column_view{*cv};
     return release_as_jlong(cudf::lists::contains_nulls(lcv));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listContainsColumn(JNIEnv *env, jclass,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listContainsColumn(JNIEnv* env,
+                                                                          jclass,
                                                                           jlong column_view,
-                                                                          jlong lookup_key_cv) {
+                                                                          jlong lookup_key_cv)
+{
   JNI_NULL_CHECK(env, column_view, "column is null", 0);
   JNI_NULL_CHECK(env, lookup_key_cv, "lookup column is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column_view *cv = reinterpret_cast<cudf::column_view *>(column_view);
+    cudf::column_view* cv = reinterpret_cast<cudf::column_view*>(column_view);
     cudf::lists_column_view lcv(*cv);
-    cudf::column_view *lookup_cv = reinterpret_cast<cudf::column_view *>(lookup_key_cv);
+    cudf::column_view* lookup_cv = reinterpret_cast<cudf::column_view*>(lookup_key_cv);
     return release_as_jlong(cudf::lists::contains(lcv, *lookup_cv));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listIndexOfScalar(JNIEnv *env, jclass,
-                                                                         jlong column_view,
-                                                                         jlong lookup_key,
-                                                                         jboolean is_find_first) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listIndexOfScalar(
+  JNIEnv* env, jclass, jlong column_view, jlong lookup_key, jboolean is_find_first)
+{
   JNI_NULL_CHECK(env, column_view, "column is null", 0);
   JNI_NULL_CHECK(env, lookup_key, "lookup scalar is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto const cv = reinterpret_cast<cudf::column_view const *>(column_view);
-    auto const lcv = cudf::lists_column_view{*cv};
-    auto const lookup_key_scalar = reinterpret_cast<cudf::scalar const *>(lookup_key);
-    auto const find_option = is_find_first ? cudf::lists::duplicate_find_option::FIND_FIRST :
-                                             cudf::lists::duplicate_find_option::FIND_LAST;
+    auto const cv                = reinterpret_cast<cudf::column_view const*>(column_view);
+    auto const lcv               = cudf::lists_column_view{*cv};
+    auto const lookup_key_scalar = reinterpret_cast<cudf::scalar const*>(lookup_key);
+    auto const find_option       = is_find_first ? cudf::lists::duplicate_find_option::FIND_FIRST
+                                                 : cudf::lists::duplicate_find_option::FIND_LAST;
     return release_as_jlong(cudf::lists::index_of(lcv, *lookup_key_scalar, find_option));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listIndexOfColumn(JNIEnv *env, jclass,
-                                                                         jlong column_view,
-                                                                         jlong lookup_keys,
-                                                                         jboolean is_find_first) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listIndexOfColumn(
+  JNIEnv* env, jclass, jlong column_view, jlong lookup_keys, jboolean is_find_first)
+{
   JNI_NULL_CHECK(env, column_view, "column is null", 0);
   JNI_NULL_CHECK(env, lookup_keys, "lookup key column is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto const cv = reinterpret_cast<cudf::column_view const *>(column_view);
-    auto const lcv = cudf::lists_column_view{*cv};
-    auto const lookup_key_column = reinterpret_cast<cudf::column_view const *>(lookup_keys);
-    auto const find_option = is_find_first ? cudf::lists::duplicate_find_option::FIND_FIRST :
-                                             cudf::lists::duplicate_find_option::FIND_LAST;
+    auto const cv                = reinterpret_cast<cudf::column_view const*>(column_view);
+    auto const lcv               = cudf::lists_column_view{*cv};
+    auto const lookup_key_column = reinterpret_cast<cudf::column_view const*>(lookup_keys);
+    auto const find_option       = is_find_first ? cudf::lists::duplicate_find_option::FIND_FIRST
+                                                 : cudf::lists::duplicate_find_option::FIND_LAST;
     return release_as_jlong(cudf::lists::index_of(lcv, *lookup_key_column, find_option));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listSortRows(JNIEnv *env, jclass,
-                                                                    jlong column_view,
-                                                                    jboolean is_descending,
-                                                                    jboolean is_null_smallest) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listSortRows(
+  JNIEnv* env, jclass, jlong column_view, jboolean is_descending, jboolean is_null_smallest)
+{
   JNI_NULL_CHECK(env, column_view, "column is null", 0);
   try {
     cudf::jni::auto_set_device(env);
     auto sort_order = is_descending ? cudf::order::DESCENDING : cudf::order::ASCENDING;
     auto null_order = is_null_smallest ? cudf::null_order::BEFORE : cudf::null_order::AFTER;
-    auto *cv = reinterpret_cast<cudf::column_view *>(column_view);
+    auto* cv        = reinterpret_cast<cudf::column_view*>(column_view);
     return release_as_jlong(
-        cudf::lists::sort_lists(cudf::lists_column_view(*cv), sort_order, null_order));
+      cudf::lists::sort_lists(cudf::lists_column_view(*cv), sort_order, null_order));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_generateListOffsets(JNIEnv *env, jclass,
-                                                                           jlong handle) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_generateListOffsets(JNIEnv* env,
+                                                                           jclass,
+                                                                           jlong handle)
+{
   JNI_NULL_CHECK(env, handle, "handle is null", 0)
   try {
     cudf::jni::auto_set_device(env);
-    auto const cv = reinterpret_cast<cudf::column_view const *>(handle);
+    auto const cv = reinterpret_cast<cudf::column_view const*>(handle);
     return release_as_jlong(cudf::jni::generate_list_offsets(*cv));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listsHaveOverlap(JNIEnv *env, jclass,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listsHaveOverlap(JNIEnv* env,
+                                                                        jclass,
                                                                         jlong lhs_handle,
-                                                                        jlong rhs_handle) {
+                                                                        jlong rhs_handle)
+{
   JNI_NULL_CHECK(env, lhs_handle, "lhs_handle is null", 0)
   JNI_NULL_CHECK(env, rhs_handle, "rhs_handle is null", 0)
   try {
     cudf::jni::auto_set_device(env);
-    auto const lhs = reinterpret_cast<cudf::column_view const *>(lhs_handle);
-    auto const rhs = reinterpret_cast<cudf::column_view const *>(rhs_handle);
-    auto overlap_result =
-        cudf::lists::have_overlap(cudf::lists_column_view{*lhs}, cudf::lists_column_view{*rhs},
-                                  cudf::null_equality::UNEQUAL, cudf::nan_equality::ALL_EQUAL);
+    auto const lhs      = reinterpret_cast<cudf::column_view const*>(lhs_handle);
+    auto const rhs      = reinterpret_cast<cudf::column_view const*>(rhs_handle);
+    auto overlap_result = cudf::lists::have_overlap(cudf::lists_column_view{*lhs},
+                                                    cudf::lists_column_view{*rhs},
+                                                    cudf::null_equality::UNEQUAL,
+                                                    cudf::nan_equality::ALL_EQUAL);
     cudf::jni::post_process_list_overlap(*lhs, *rhs, overlap_result);
     return release_as_jlong(overlap_result);
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listsIntersectDistinct(JNIEnv *env, jclass,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listsIntersectDistinct(JNIEnv* env,
+                                                                              jclass,
                                                                               jlong lhs_handle,
-                                                                              jlong rhs_handle) {
+                                                                              jlong rhs_handle)
+{
   JNI_NULL_CHECK(env, lhs_handle, "lhs_handle is null", 0)
   JNI_NULL_CHECK(env, rhs_handle, "rhs_handle is null", 0)
   try {
     cudf::jni::auto_set_device(env);
-    auto const lhs = reinterpret_cast<cudf::column_view const *>(lhs_handle);
-    auto const rhs = reinterpret_cast<cudf::column_view const *>(rhs_handle);
-    return release_as_jlong(cudf::lists::intersect_distinct(
-        cudf::lists_column_view{*lhs}, cudf::lists_column_view{*rhs}, cudf::null_equality::EQUAL,
-        cudf::nan_equality::ALL_EQUAL));
+    auto const lhs = reinterpret_cast<cudf::column_view const*>(lhs_handle);
+    auto const rhs = reinterpret_cast<cudf::column_view const*>(rhs_handle);
+    return release_as_jlong(cudf::lists::intersect_distinct(cudf::lists_column_view{*lhs},
+                                                            cudf::lists_column_view{*rhs},
+                                                            cudf::null_equality::EQUAL,
+                                                            cudf::nan_equality::ALL_EQUAL));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listsUnionDistinct(JNIEnv *env, jclass,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listsUnionDistinct(JNIEnv* env,
+                                                                          jclass,
                                                                           jlong lhs_handle,
-                                                                          jlong rhs_handle) {
+                                                                          jlong rhs_handle)
+{
   JNI_NULL_CHECK(env, lhs_handle, "lhs_handle is null", 0)
   JNI_NULL_CHECK(env, rhs_handle, "rhs_handle is null", 0)
   try {
     cudf::jni::auto_set_device(env);
-    auto const lhs = reinterpret_cast<cudf::column_view const *>(lhs_handle);
-    auto const rhs = reinterpret_cast<cudf::column_view const *>(rhs_handle);
-    return release_as_jlong(
-        cudf::lists::union_distinct(cudf::lists_column_view{*lhs}, cudf::lists_column_view{*rhs},
-                                    cudf::null_equality::EQUAL, cudf::nan_equality::ALL_EQUAL));
+    auto const lhs = reinterpret_cast<cudf::column_view const*>(lhs_handle);
+    auto const rhs = reinterpret_cast<cudf::column_view const*>(rhs_handle);
+    return release_as_jlong(cudf::lists::union_distinct(cudf::lists_column_view{*lhs},
+                                                        cudf::lists_column_view{*rhs},
+                                                        cudf::null_equality::EQUAL,
+                                                        cudf::nan_equality::ALL_EQUAL));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listsDifferenceDistinct(JNIEnv *env, jclass,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_listsDifferenceDistinct(JNIEnv* env,
+                                                                               jclass,
                                                                                jlong lhs_handle,
-                                                                               jlong rhs_handle) {
+                                                                               jlong rhs_handle)
+{
   JNI_NULL_CHECK(env, lhs_handle, "lhs_handle is null", 0)
   JNI_NULL_CHECK(env, rhs_handle, "rhs_handle is null", 0)
   try {
     cudf::jni::auto_set_device(env);
-    auto const lhs = reinterpret_cast<cudf::column_view const *>(lhs_handle);
-    auto const rhs = reinterpret_cast<cudf::column_view const *>(rhs_handle);
-    return release_as_jlong(cudf::lists::difference_distinct(
-        cudf::lists_column_view{*lhs}, cudf::lists_column_view{*rhs}, cudf::null_equality::EQUAL,
-        cudf::nan_equality::ALL_EQUAL));
+    auto const lhs = reinterpret_cast<cudf::column_view const*>(lhs_handle);
+    auto const rhs = reinterpret_cast<cudf::column_view const*>(rhs_handle);
+    return release_as_jlong(cudf::lists::difference_distinct(cudf::lists_column_view{*lhs},
+                                                             cudf::lists_column_view{*rhs},
+                                                             cudf::null_equality::EQUAL,
+                                                             cudf::nan_equality::ALL_EQUAL));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_reverseStringsOrLists(JNIEnv *env, jclass,
-                                                                             jlong input_handle) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_reverseStringsOrLists(JNIEnv* env,
+                                                                             jclass,
+                                                                             jlong input_handle)
+{
   JNI_NULL_CHECK(env, input_handle, "input_handle is null", 0)
   try {
     cudf::jni::auto_set_device(env);
 
-    auto const input = reinterpret_cast<cudf::column_view const *>(input_handle);
+    auto const input = reinterpret_cast<cudf::column_view const*>(input_handle);
     switch (input->type().id()) {
       case cudf::type_id::STRING:
         return release_as_jlong(cudf::strings::reverse(cudf::strings_column_view{*input}));
       case cudf::type_id::LIST:
         return release_as_jlong(cudf::lists::reverse(cudf::lists_column_view{*input}));
       default:
-        JNI_THROW_NEW(env, "java/lang/IllegalArgumentException",
-                      "A column of type string or list is required for reverse()", 0);
+        JNI_THROW_NEW(env,
+                      "java/lang/IllegalArgumentException",
+                      "A column of type string or list is required for reverse()",
+                      0);
     }
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplit(JNIEnv *env, jclass,
-                                                                        jlong input_handle,
-                                                                        jstring delimiter_obj,
-                                                                        jint limit) {
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplit(
+  JNIEnv* env, jclass, jlong input_handle, jstring delimiter_obj, jint limit)
+{
   JNI_NULL_CHECK(env, input_handle, "input_handle is null", 0);
 
   if (limit == 0 || limit == 1) {
@@ -734,26 +804,31 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplit(JNIEnv *
     // This is because cudf operates on a different parameter (`max_split`) which is converted from
     // limit. When limit == 0 or limit == 1, max_split will be non-positive and will result in an
     // unlimited split.
-    JNI_THROW_NEW(env, "java/lang/IllegalArgumentException",
-                  "limit == 0 and limit == 1 are not supported", 0);
+    JNI_THROW_NEW(
+      env, "java/lang/IllegalArgumentException", "limit == 0 and limit == 1 are not supported", 0);
   }
 
   try {
     cudf::jni::auto_set_device(env);
-    auto const input = reinterpret_cast<cudf::column_view const *>(input_handle);
+    auto const input          = reinterpret_cast<cudf::column_view const*>(input_handle);
     auto const strings_column = cudf::strings_column_view{*input};
     auto const delimiter_jstr = cudf::jni::native_jstring(env, delimiter_obj);
-    auto const delimiter = std::string(delimiter_jstr.get(), delimiter_jstr.size_bytes());
-    auto const max_split = limit > 1 ? limit - 1 : limit;
+    auto const delimiter      = std::string(delimiter_jstr.get(), delimiter_jstr.size_bytes());
+    auto const max_split      = limit > 1 ? limit - 1 : limit;
     auto result = cudf::strings::split(strings_column, cudf::string_scalar{delimiter}, max_split);
     return cudf::jni::convert_table_for_return(env, std::move(result));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRe(
-    JNIEnv *env, jclass, jlong input_handle, jstring pattern_obj, jint regex_flags,
-    jint capture_groups, jint limit) {
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRe(JNIEnv* env,
+                                                                          jclass,
+                                                                          jlong input_handle,
+                                                                          jstring pattern_obj,
+                                                                          jint regex_flags,
+                                                                          jint capture_groups,
+                                                                          jint limit)
+{
   JNI_NULL_CHECK(env, input_handle, "input_handle is null", 0);
 
   if (limit == 0 || limit == 1) {
@@ -761,30 +836,29 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRe(
     // This is because cudf operates on a different parameter (`max_split`) which is converted from
     // limit. When limit == 0 or limit == 1, max_split will be non-positive and will result in an
     // unlimited split.
-    JNI_THROW_NEW(env, "java/lang/IllegalArgumentException",
-                  "limit == 0 and limit == 1 are not supported", 0);
+    JNI_THROW_NEW(
+      env, "java/lang/IllegalArgumentException", "limit == 0 and limit == 1 are not supported", 0);
   }
 
   try {
     cudf::jni::auto_set_device(env);
-    auto const input = reinterpret_cast<cudf::column_view const *>(input_handle);
+    auto const input          = reinterpret_cast<cudf::column_view const*>(input_handle);
     auto const strings_column = cudf::strings_column_view{*input};
-    auto const pattern_jstr = cudf::jni::native_jstring(env, pattern_obj);
-    auto const pattern = std::string(pattern_jstr.get(), pattern_jstr.size_bytes());
-    auto const max_split = limit > 1 ? limit - 1 : limit;
-    auto const flags = static_cast<cudf::strings::regex_flags>(regex_flags);
-    auto const groups = static_cast<cudf::strings::capture_groups>(capture_groups);
-    auto const regex_prog = cudf::strings::regex_program::create(pattern, flags, groups);
-    auto result = cudf::strings::split_re(strings_column, *regex_prog, max_split);
+    auto const pattern_jstr   = cudf::jni::native_jstring(env, pattern_obj);
+    auto const pattern        = std::string(pattern_jstr.get(), pattern_jstr.size_bytes());
+    auto const max_split      = limit > 1 ? limit - 1 : limit;
+    auto const flags          = static_cast<cudf::strings::regex_flags>(regex_flags);
+    auto const groups         = static_cast<cudf::strings::capture_groups>(capture_groups);
+    auto const regex_prog     = cudf::strings::regex_program::create(pattern, flags, groups);
+    auto result               = cudf::strings::split_re(strings_column, *regex_prog, max_split);
     return cudf::jni::convert_table_for_return(env, std::move(result));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRecord(JNIEnv *env, jclass,
-                                                                         jlong input_handle,
-                                                                         jstring delimiter_obj,
-                                                                         jint limit) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRecord(
+  JNIEnv* env, jclass, jlong input_handle, jstring delimiter_obj, jint limit)
+{
   JNI_NULL_CHECK(env, input_handle, "input_handle is null", 0);
 
   if (limit == 0 || limit == 1) {
@@ -792,27 +866,32 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRecord(JNIEnv
     // This is because cudf operates on a different parameter (`max_split`) which is converted from
     // limit. When limit == 0 or limit == 1, max_split will be non-positive and will result in an
     // unlimited split.
-    JNI_THROW_NEW(env, "java/lang/IllegalArgumentException",
-                  "limit == 0 and limit == 1 are not supported", 0);
+    JNI_THROW_NEW(
+      env, "java/lang/IllegalArgumentException", "limit == 0 and limit == 1 are not supported", 0);
   }
 
   try {
     cudf::jni::auto_set_device(env);
-    auto const input = reinterpret_cast<cudf::column_view const *>(input_handle);
+    auto const input          = reinterpret_cast<cudf::column_view const*>(input_handle);
     auto const strings_column = cudf::strings_column_view{*input};
     auto const delimiter_jstr = cudf::jni::native_jstring(env, delimiter_obj);
-    auto const delimiter = std::string(delimiter_jstr.get(), delimiter_jstr.size_bytes());
-    auto const max_split = limit > 1 ? limit - 1 : limit;
+    auto const delimiter      = std::string(delimiter_jstr.get(), delimiter_jstr.size_bytes());
+    auto const max_split      = limit > 1 ? limit - 1 : limit;
     auto result =
-        cudf::strings::split_record(strings_column, cudf::string_scalar{delimiter}, max_split);
+      cudf::strings::split_record(strings_column, cudf::string_scalar{delimiter}, max_split);
     return release_as_jlong(result);
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRecordRe(
-    JNIEnv *env, jclass, jlong input_handle, jstring pattern_obj, jint regex_flags,
-    jint capture_groups, jint limit) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRecordRe(JNIEnv* env,
+                                                                           jclass,
+                                                                           jlong input_handle,
+                                                                           jstring pattern_obj,
+                                                                           jint regex_flags,
+                                                                           jint capture_groups,
+                                                                           jint limit)
+{
   JNI_NULL_CHECK(env, input_handle, "input_handle is null", 0);
 
   if (limit == 0 || limit == 1) {
@@ -820,99 +899,108 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringSplitRecordRe(
     // This is because cudf operates on a different parameter (`max_split`) which is converted from
     // limit. When limit == 0 or limit == 1, max_split will be non-positive and will result in an
     // unlimited split.
-    JNI_THROW_NEW(env, "java/lang/IllegalArgumentException",
-                  "limit == 0 and limit == 1 are not supported", 0);
+    JNI_THROW_NEW(
+      env, "java/lang/IllegalArgumentException", "limit == 0 and limit == 1 are not supported", 0);
   }
 
   try {
     cudf::jni::auto_set_device(env);
-    auto const input = reinterpret_cast<cudf::column_view const *>(input_handle);
+    auto const input          = reinterpret_cast<cudf::column_view const*>(input_handle);
     auto const strings_column = cudf::strings_column_view{*input};
-    auto const pattern_jstr = cudf::jni::native_jstring(env, pattern_obj);
-    auto const pattern = std::string(pattern_jstr.get(), pattern_jstr.size_bytes());
-    auto const max_split = limit > 1 ? limit - 1 : limit;
-    auto const flags = static_cast<cudf::strings::regex_flags>(regex_flags);
-    auto const groups = static_cast<cudf::strings::capture_groups>(capture_groups);
-    auto const regex_prog = cudf::strings::regex_program::create(pattern, flags, groups);
+    auto const pattern_jstr   = cudf::jni::native_jstring(env, pattern_obj);
+    auto const pattern        = std::string(pattern_jstr.get(), pattern_jstr.size_bytes());
+    auto const max_split      = limit > 1 ? limit - 1 : limit;
+    auto const flags          = static_cast<cudf::strings::regex_flags>(regex_flags);
+    auto const groups         = static_cast<cudf::strings::capture_groups>(capture_groups);
+    auto const regex_prog     = cudf::strings::regex_program::create(pattern, flags, groups);
     auto result = cudf::strings::split_record_re(strings_column, *regex_prog, max_split);
     return release_as_jlong(result);
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_split(JNIEnv *env, jclass clazz,
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_split(JNIEnv* env,
+                                                                  jclass clazz,
                                                                   jlong input_column,
-                                                                  jintArray split_indices) {
+                                                                  jintArray split_indices)
+{
   JNI_NULL_CHECK(env, input_column, "native handle is null", 0);
   JNI_NULL_CHECK(env, split_indices, "split indices are null", 0);
 
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column_view *n_column = reinterpret_cast<cudf::column_view *>(input_column);
+    cudf::column_view* n_column = reinterpret_cast<cudf::column_view*>(input_column);
     cudf::jni::native_jintArray n_split_indices(env, split_indices);
     std::vector<cudf::size_type> indices(n_split_indices.begin(), n_split_indices.end());
 
     std::vector<cudf::column_view> result = cudf::split(*n_column, indices);
     cudf::jni::native_jlongArray n_result(env, result.size());
 
-    std::transform(result.begin(), result.end(), n_result.begin(),
-                   [](cudf::column_view const &result_col) {
-                     return ptr_as_jlong(new cudf::column_view{result_col});
-                   });
+    std::transform(
+      result.begin(), result.end(), n_result.begin(), [](cudf::column_view const& result_col) {
+        return ptr_as_jlong(new cudf::column_view{result_col});
+      });
 
     return n_result.get_jArray();
   }
   CATCH_STD(env, NULL);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_countElements(JNIEnv *env, jclass clazz,
-                                                                     jlong view_handle) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_countElements(JNIEnv* env,
+                                                                     jclass clazz,
+                                                                     jlong view_handle)
+{
   JNI_NULL_CHECK(env, view_handle, "input column is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column_view *n_column = reinterpret_cast<cudf::column_view *>(view_handle);
+    cudf::column_view* n_column = reinterpret_cast<cudf::column_view*>(view_handle);
     return release_as_jlong(cudf::lists::count_elements(cudf::lists_column_view(*n_column)));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_charLengths(JNIEnv *env, jclass clazz,
-                                                                   jlong view_handle) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_charLengths(JNIEnv* env,
+                                                                   jclass clazz,
+                                                                   jlong view_handle)
+{
   JNI_NULL_CHECK(env, view_handle, "input column is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column_view *n_column = reinterpret_cast<cudf::column_view *>(view_handle);
+    cudf::column_view* n_column = reinterpret_cast<cudf::column_view*>(view_handle);
     return release_as_jlong(cudf::strings::count_characters(cudf::strings_column_view(*n_column)));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_byteCount(JNIEnv *env, jclass clazz,
-                                                                 jlong view_handle) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_byteCount(JNIEnv* env,
+                                                                 jclass clazz,
+                                                                 jlong view_handle)
+{
   JNI_NULL_CHECK(env, view_handle, "input column is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column_view *n_column = reinterpret_cast<cudf::column_view *>(view_handle);
+    cudf::column_view* n_column = reinterpret_cast<cudf::column_view*>(view_handle);
     return release_as_jlong(cudf::strings::count_bytes(cudf::strings_column_view(*n_column)));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_codePoints(JNIEnv *env, jclass clazz,
-                                                                  jlong view_handle) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_codePoints(JNIEnv* env,
+                                                                  jclass clazz,
+                                                                  jlong view_handle)
+{
   JNI_NULL_CHECK(env, view_handle, "input column is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto const input = reinterpret_cast<cudf::column_view const *>(view_handle);
+    auto const input = reinterpret_cast<cudf::column_view const*>(view_handle);
     return release_as_jlong(cudf::strings::code_points(cudf::strings_column_view{*input}));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_findAndReplaceAll(JNIEnv *env, jclass clazz,
-                                                                         jlong old_values_handle,
-                                                                         jlong new_values_handle,
-                                                                         jlong input_handle) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_findAndReplaceAll(
+  JNIEnv* env, jclass clazz, jlong old_values_handle, jlong new_values_handle, jlong input_handle)
+{
   JNI_NULL_CHECK(env, old_values_handle, "values column is null", 0);
   JNI_NULL_CHECK(env, new_values_handle, "replace column is null", 0);
   JNI_NULL_CHECK(env, input_handle, "input column is null", 0);
@@ -922,230 +1010,253 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_findAndReplaceAll(JNIEnv
 
   try {
     cudf::jni::auto_set_device(env);
-    column_view *input_column = reinterpret_cast<column_view *>(input_handle);
-    column_view *old_values_column = reinterpret_cast<column_view *>(old_values_handle);
-    column_view *new_values_column = reinterpret_cast<column_view *>(new_values_handle);
+    column_view* input_column      = reinterpret_cast<column_view*>(input_handle);
+    column_view* old_values_column = reinterpret_cast<column_view*>(old_values_handle);
+    column_view* new_values_column = reinterpret_cast<column_view*>(new_values_handle);
     return release_as_jlong(
-        cudf::find_and_replace_all(*input_column, *old_values_column, *new_values_column));
+      cudf::find_and_replace_all(*input_column, *old_values_column, *new_values_column));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_isNullNative(JNIEnv *env, jclass,
-                                                                    jlong handle) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_isNullNative(JNIEnv* env,
+                                                                    jclass,
+                                                                    jlong handle)
+{
   JNI_NULL_CHECK(env, handle, "input column is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    const cudf::column_view *input = reinterpret_cast<cudf::column_view *>(handle);
+    const cudf::column_view* input = reinterpret_cast<cudf::column_view*>(handle);
     return release_as_jlong(cudf::is_null(*input));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_isNotNullNative(JNIEnv *env, jclass,
-                                                                       jlong handle) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_isNotNullNative(JNIEnv* env,
+                                                                       jclass,
+                                                                       jlong handle)
+{
   JNI_NULL_CHECK(env, handle, "input column is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    const cudf::column_view *input = reinterpret_cast<cudf::column_view *>(handle);
+    const cudf::column_view* input = reinterpret_cast<cudf::column_view*>(handle);
     return release_as_jlong(cudf::is_valid(*input));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_isNanNative(JNIEnv *env, jclass,
-                                                                   jlong handle) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_isNanNative(JNIEnv* env,
+                                                                   jclass,
+                                                                   jlong handle)
+{
   JNI_NULL_CHECK(env, handle, "input column is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    const cudf::column_view *input = reinterpret_cast<cudf::column_view *>(handle);
+    const cudf::column_view* input = reinterpret_cast<cudf::column_view*>(handle);
     return release_as_jlong(cudf::is_nan(*input));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_isNotNanNative(JNIEnv *env, jclass,
-                                                                      jlong handle) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_isNotNanNative(JNIEnv* env,
+                                                                      jclass,
+                                                                      jlong handle)
+{
   JNI_NULL_CHECK(env, handle, "input column is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    const cudf::column_view *input = reinterpret_cast<cudf::column_view *>(handle);
+    const cudf::column_view* input = reinterpret_cast<cudf::column_view*>(handle);
     return release_as_jlong(cudf::is_not_nan(*input));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_unaryOperation(JNIEnv *env, jclass,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_unaryOperation(JNIEnv* env,
+                                                                      jclass,
                                                                       jlong input_ptr,
-                                                                      jint int_op) {
+                                                                      jint int_op)
+{
   JNI_NULL_CHECK(env, input_ptr, "input is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column_view *input = reinterpret_cast<cudf::column_view *>(input_ptr);
-    cudf::unary_operator op = static_cast<cudf::unary_operator>(int_op);
+    cudf::column_view* input = reinterpret_cast<cudf::column_view*>(input_ptr);
+    cudf::unary_operator op  = static_cast<cudf::unary_operator>(int_op);
     return release_as_jlong(cudf::unary_operation(*input, op));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_round(JNIEnv *env, jclass, jlong input_ptr,
-                                                             jint decimal_places,
-                                                             jint rounding_method) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_round(
+  JNIEnv* env, jclass, jlong input_ptr, jint decimal_places, jint rounding_method)
+{
   JNI_NULL_CHECK(env, input_ptr, "input is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column_view *input = reinterpret_cast<cudf::column_view *>(input_ptr);
+    cudf::column_view* input     = reinterpret_cast<cudf::column_view*>(input_ptr);
     cudf::rounding_method method = static_cast<cudf::rounding_method>(rounding_method);
     return release_as_jlong(cudf::round(*input, decimal_places, method));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_year(JNIEnv *env, jclass, jlong input_ptr) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_year(JNIEnv* env, jclass, jlong input_ptr)
+{
   JNI_NULL_CHECK(env, input_ptr, "input is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    const cudf::column_view *input = reinterpret_cast<cudf::column_view *>(input_ptr);
+    const cudf::column_view* input = reinterpret_cast<cudf::column_view*>(input_ptr);
     return release_as_jlong(cudf::datetime::extract_year(*input));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_month(JNIEnv *env, jclass, jlong input_ptr) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_month(JNIEnv* env, jclass, jlong input_ptr)
+{
   JNI_NULL_CHECK(env, input_ptr, "input is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    const cudf::column_view *input = reinterpret_cast<cudf::column_view *>(input_ptr);
+    const cudf::column_view* input = reinterpret_cast<cudf::column_view*>(input_ptr);
     return release_as_jlong(cudf::datetime::extract_month(*input));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_day(JNIEnv *env, jclass, jlong input_ptr) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_day(JNIEnv* env, jclass, jlong input_ptr)
+{
   JNI_NULL_CHECK(env, input_ptr, "input is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    const cudf::column_view *input = reinterpret_cast<cudf::column_view *>(input_ptr);
+    const cudf::column_view* input = reinterpret_cast<cudf::column_view*>(input_ptr);
     return release_as_jlong(cudf::datetime::extract_day(*input));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_hour(JNIEnv *env, jclass, jlong input_ptr) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_hour(JNIEnv* env, jclass, jlong input_ptr)
+{
   JNI_NULL_CHECK(env, input_ptr, "input is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    const cudf::column_view *input = reinterpret_cast<cudf::column_view *>(input_ptr);
+    const cudf::column_view* input = reinterpret_cast<cudf::column_view*>(input_ptr);
     return release_as_jlong(cudf::datetime::extract_hour(*input));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_minute(JNIEnv *env, jclass,
-                                                              jlong input_ptr) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_minute(JNIEnv* env, jclass, jlong input_ptr)
+{
   JNI_NULL_CHECK(env, input_ptr, "input is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    const cudf::column_view *input = reinterpret_cast<cudf::column_view *>(input_ptr);
+    const cudf::column_view* input = reinterpret_cast<cudf::column_view*>(input_ptr);
     return release_as_jlong(cudf::datetime::extract_minute(*input));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_second(JNIEnv *env, jclass,
-                                                              jlong input_ptr) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_second(JNIEnv* env, jclass, jlong input_ptr)
+{
   JNI_NULL_CHECK(env, input_ptr, "input is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    const cudf::column_view *input = reinterpret_cast<cudf::column_view *>(input_ptr);
+    const cudf::column_view* input = reinterpret_cast<cudf::column_view*>(input_ptr);
     return release_as_jlong(cudf::datetime::extract_second(*input));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_weekDay(JNIEnv *env, jclass,
-                                                               jlong input_ptr) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_weekDay(JNIEnv* env, jclass, jlong input_ptr)
+{
   JNI_NULL_CHECK(env, input_ptr, "input is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    const cudf::column_view *input = reinterpret_cast<cudf::column_view *>(input_ptr);
+    const cudf::column_view* input = reinterpret_cast<cudf::column_view*>(input_ptr);
     return release_as_jlong(cudf::datetime::extract_weekday(*input));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_lastDayOfMonth(JNIEnv *env, jclass,
-                                                                      jlong input_ptr) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_lastDayOfMonth(JNIEnv* env,
+                                                                      jclass,
+                                                                      jlong input_ptr)
+{
   JNI_NULL_CHECK(env, input_ptr, "input is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    const cudf::column_view *input = reinterpret_cast<cudf::column_view *>(input_ptr);
+    const cudf::column_view* input = reinterpret_cast<cudf::column_view*>(input_ptr);
     return release_as_jlong(cudf::datetime::last_day_of_month(*input));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_dayOfYear(JNIEnv *env, jclass,
-                                                                 jlong input_ptr) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_dayOfYear(JNIEnv* env,
+                                                                 jclass,
+                                                                 jlong input_ptr)
+{
   JNI_NULL_CHECK(env, input_ptr, "input is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    const cudf::column_view *input = reinterpret_cast<cudf::column_view *>(input_ptr);
+    const cudf::column_view* input = reinterpret_cast<cudf::column_view*>(input_ptr);
     return release_as_jlong(cudf::datetime::day_of_year(*input));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_quarterOfYear(JNIEnv *env, jclass,
-                                                                     jlong input_ptr) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_quarterOfYear(JNIEnv* env,
+                                                                     jclass,
+                                                                     jlong input_ptr)
+{
   JNI_NULL_CHECK(env, input_ptr, "input is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    const cudf::column_view *input = reinterpret_cast<cudf::column_view *>(input_ptr);
+    const cudf::column_view* input = reinterpret_cast<cudf::column_view*>(input_ptr);
     return release_as_jlong(cudf::datetime::extract_quarter(*input));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_addCalendricalMonths(JNIEnv *env, jclass,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_addCalendricalMonths(JNIEnv* env,
+                                                                            jclass,
                                                                             jlong ts_ptr,
-                                                                            jlong months_ptr) {
+                                                                            jlong months_ptr)
+{
   JNI_NULL_CHECK(env, ts_ptr, "ts is null", 0);
   JNI_NULL_CHECK(env, months_ptr, "months is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    const cudf::column_view *ts = reinterpret_cast<cudf::column_view *>(ts_ptr);
-    const cudf::column_view *months = reinterpret_cast<cudf::column_view *>(months_ptr);
+    const cudf::column_view* ts     = reinterpret_cast<cudf::column_view*>(ts_ptr);
+    const cudf::column_view* months = reinterpret_cast<cudf::column_view*>(months_ptr);
     return release_as_jlong(cudf::datetime::add_calendrical_months(*ts, *months));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_isLeapYear(JNIEnv *env, jclass,
-                                                                  jlong input_ptr) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_isLeapYear(JNIEnv* env,
+                                                                  jclass,
+                                                                  jlong input_ptr)
+{
   JNI_NULL_CHECK(env, input_ptr, "input is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    const cudf::column_view *input = reinterpret_cast<cudf::column_view *>(input_ptr);
+    const cudf::column_view* input = reinterpret_cast<cudf::column_view*>(input_ptr);
     return release_as_jlong(cudf::datetime::is_leap_year(*input));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_castTo(JNIEnv *env, jclass, jlong handle,
-                                                              jint type, jint scale) {
+JNIEXPORT jlong JNICALL
+Java_ai_rapids_cudf_ColumnView_castTo(JNIEnv* env, jclass, jlong handle, jint type, jint scale)
+{
   JNI_NULL_CHECK(env, handle, "native handle is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column_view *column = reinterpret_cast<cudf::column_view *>(handle);
+    cudf::column_view* column   = reinterpret_cast<cudf::column_view*>(handle);
     cudf::data_type n_data_type = cudf::jni::make_data_type(type, scale);
-    if (n_data_type == column->type()) {
-      return ptr_as_jlong(new cudf::column(*column));
-    }
+    if (n_data_type == column->type()) { return ptr_as_jlong(new cudf::column(*column)); }
     if (n_data_type.id() == cudf::type_id::STRING) {
       switch (column->type().id()) {
         case cudf::type_id::BOOL8: {
-          auto const true_scalar = cudf::string_scalar("true");
+          auto const true_scalar  = cudf::string_scalar("true");
           auto const false_scalar = cudf::string_scalar("false");
           return release_as_jlong(cudf::strings::from_booleans(*column, true_scalar, false_scalar));
         }
@@ -1195,26 +1306,30 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_castTo(JNIEnv *env, jclas
       // "reinterpret" casting will be supported via https://github.com/rapidsai/cudf/pull/5358
       if (n_data_type.id() == cudf::type_id::TIMESTAMP_DAYS) {
         if (column->type().id() != cudf::type_id::INT32) {
-          JNI_THROW_NEW(env, "java/lang/IllegalArgumentException",
-                        "Numeric cast to TIMESTAMP_DAYS requires INT32", 0);
+          JNI_THROW_NEW(env,
+                        "java/lang/IllegalArgumentException",
+                        "Numeric cast to TIMESTAMP_DAYS requires INT32",
+                        0);
         }
       } else {
         if (column->type().id() != cudf::type_id::INT64) {
-          JNI_THROW_NEW(env, "java/lang/IllegalArgumentException",
-                        "Numeric cast to non-day timestamp requires INT64", 0);
+          JNI_THROW_NEW(env,
+                        "java/lang/IllegalArgumentException",
+                        "Numeric cast to non-day timestamp requires INT64",
+                        0);
         }
       }
-      cudf::data_type duration_type = cudf::jni::timestamp_to_duration(n_data_type);
+      cudf::data_type duration_type   = cudf::jni::timestamp_to_duration(n_data_type);
       cudf::column_view duration_view = cudf::column_view(
-          duration_type, column->size(), column->head(), column->null_mask(), column->null_count());
+        duration_type, column->size(), column->head(), column->null_mask(), column->null_count());
       return release_as_jlong(cudf::cast(duration_view, n_data_type));
     } else if (cudf::is_timestamp(column->type()) && cudf::is_numeric(n_data_type)) {
       // This is a temporary workaround to allow Java to cast from timestamp types to integral types
       // without forcing an intermediate duration column to be manifested.  Ultimately this style of
       // "reinterpret" casting will be supported via https://github.com/rapidsai/cudf/pull/5358
-      cudf::data_type duration_type = cudf::jni::timestamp_to_duration(column->type());
+      cudf::data_type duration_type   = cudf::jni::timestamp_to_duration(column->type());
       cudf::column_view duration_view = cudf::column_view(
-          duration_type, column->size(), column->head(), column->null_mask(), column->null_count());
+        duration_type, column->size(), column->head(), column->null_mask(), column->null_count());
       return release_as_jlong(cudf::cast(duration_view, n_data_type));
     } else {
       return release_as_jlong(cudf::cast(*column, n_data_type));
@@ -1223,25 +1338,28 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_castTo(JNIEnv *env, jclas
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_bitCastTo(JNIEnv *env, jclass, jlong handle,
-                                                                 jint type, jint scale) {
+JNIEXPORT jlong JNICALL
+Java_ai_rapids_cudf_ColumnView_bitCastTo(JNIEnv* env, jclass, jlong handle, jint type, jint scale)
+{
   JNI_NULL_CHECK(env, handle, "native handle is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column_view *column = reinterpret_cast<cudf::column_view *>(handle);
+    cudf::column_view* column   = reinterpret_cast<cudf::column_view*>(handle);
     cudf::data_type n_data_type = cudf::jni::make_data_type(type, scale);
     return ptr_as_jlong(new cudf::column_view{cudf::bit_cast(*column, n_data_type)});
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_byteListCast(JNIEnv *env, jobject j_object,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_byteListCast(JNIEnv* env,
+                                                                    jobject j_object,
                                                                     jlong handle,
-                                                                    jboolean endianness_config) {
+                                                                    jboolean endianness_config)
+{
   JNI_NULL_CHECK(env, handle, "native handle is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column_view *column = reinterpret_cast<cudf::column_view *>(handle);
+    cudf::column_view* column = reinterpret_cast<cudf::column_view*>(handle);
     cudf::flip_endianness config(static_cast<cudf::flip_endianness>(endianness_config));
     return release_as_jlong(byte_cast(*column, config));
   }
@@ -1249,78 +1367,86 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_byteListCast(JNIEnv *env,
 }
 
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringTimestampToTimestamp(
-    JNIEnv *env, jobject j_object, jlong handle, jint time_unit, jstring formatObj) {
+  JNIEnv* env, jobject j_object, jlong handle, jint time_unit, jstring formatObj)
+{
   JNI_NULL_CHECK(env, handle, "column is null", 0);
   JNI_NULL_CHECK(env, formatObj, "format is null", 0);
 
   try {
     cudf::jni::auto_set_device(env);
     cudf::jni::native_jstring format(env, formatObj);
-    cudf::column_view *column = reinterpret_cast<cudf::column_view *>(handle);
+    cudf::column_view* column = reinterpret_cast<cudf::column_view*>(handle);
     cudf::strings_column_view strings_column(*column);
 
     return release_as_jlong(cudf::strings::to_timestamps(
-        strings_column, cudf::data_type(static_cast<cudf::type_id>(time_unit)), format.get()));
+      strings_column, cudf::data_type(static_cast<cudf::type_id>(time_unit)), format.get()));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_isTimestamp(JNIEnv *env, jclass,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_isTimestamp(JNIEnv* env,
+                                                                   jclass,
                                                                    jlong handle,
-                                                                   jstring formatObj) {
+                                                                   jstring formatObj)
+{
   JNI_NULL_CHECK(env, handle, "column is null", 0);
   JNI_NULL_CHECK(env, formatObj, "format is null", 0);
 
   try {
     cudf::jni::auto_set_device(env);
     cudf::jni::native_jstring format(env, formatObj);
-    cudf::column_view *column = reinterpret_cast<cudf::column_view *>(handle);
+    cudf::column_view* column = reinterpret_cast<cudf::column_view*>(handle);
     cudf::strings_column_view strings_column(*column);
     return release_as_jlong(cudf::strings::is_timestamp(strings_column, format.get()));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_timestampToStringTimestamp(
-    JNIEnv *env, jobject j_object, jlong handle, jstring j_format) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_timestampToStringTimestamp(JNIEnv* env,
+                                                                                  jobject j_object,
+                                                                                  jlong handle,
+                                                                                  jstring j_format)
+{
   JNI_NULL_CHECK(env, handle, "column is null", 0);
   JNI_NULL_CHECK(env, j_format, "format is null", 0);
 
   try {
     cudf::jni::auto_set_device(env);
     cudf::jni::native_jstring format(env, j_format);
-    cudf::column_view *column = reinterpret_cast<cudf::column_view *>(handle);
+    cudf::column_view* column = reinterpret_cast<cudf::column_view*>(handle);
     return release_as_jlong(cudf::strings::from_timestamps(*column, format.get()));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jboolean JNICALL Java_ai_rapids_cudf_ColumnView_containsScalar(JNIEnv *env,
+JNIEXPORT jboolean JNICALL Java_ai_rapids_cudf_ColumnView_containsScalar(JNIEnv* env,
                                                                          jobject j_object,
                                                                          jlong j_view_handle,
-                                                                         jlong j_scalar_handle) {
+                                                                         jlong j_scalar_handle)
+{
   JNI_NULL_CHECK(env, j_view_handle, "haystack vector is null", false);
   JNI_NULL_CHECK(env, j_scalar_handle, "scalar needle is null", false);
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column_view *column_view = reinterpret_cast<cudf::column_view *>(j_view_handle);
-    cudf::scalar *scalar = reinterpret_cast<cudf::scalar *>(j_scalar_handle);
+    cudf::column_view* column_view = reinterpret_cast<cudf::column_view*>(j_view_handle);
+    cudf::scalar* scalar           = reinterpret_cast<cudf::scalar*>(j_scalar_handle);
 
     return cudf::contains(*column_view, *scalar);
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_containsVector(JNIEnv *env, jobject j_object,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_containsVector(JNIEnv* env,
+                                                                      jobject j_object,
                                                                       jlong j_values_handle,
-                                                                      jlong j_search_space_handle) {
+                                                                      jlong j_search_space_handle)
+{
   JNI_NULL_CHECK(env, j_values_handle, "values vector is null", false);
   JNI_NULL_CHECK(env, j_search_space_handle, "search_space vector is null", false);
   try {
     cudf::jni::auto_set_device(env);
-    auto const search_space_ptr =
-        reinterpret_cast<cudf::column_view const *>(j_search_space_handle);
-    auto const values_ptr = reinterpret_cast<cudf::column_view const *>(j_values_handle);
+    auto const search_space_ptr = reinterpret_cast<cudf::column_view const*>(j_search_space_handle);
+    auto const values_ptr       = reinterpret_cast<cudf::column_view const*>(j_values_handle);
 
     // The C++ API `cudf::contains` requires that the search space is the first parameter.
     return release_as_jlong(cudf::contains(*search_space_ptr, *values_ptr));
@@ -1328,141 +1454,149 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_containsVector(JNIEnv *en
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_transform(JNIEnv *env, jobject j_object,
-                                                                 jlong handle, jstring j_udf,
-                                                                 jboolean j_is_ptx) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_transform(
+  JNIEnv* env, jobject j_object, jlong handle, jstring j_udf, jboolean j_is_ptx)
+{
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column_view *column = reinterpret_cast<cudf::column_view *>(handle);
+    cudf::column_view* column = reinterpret_cast<cudf::column_view*>(handle);
     cudf::jni::native_jstring n_j_udf(env, j_udf);
     std::string n_udf(n_j_udf.get());
     return release_as_jlong(
-        cudf::transform(*column, n_udf, cudf::data_type(cudf::type_id::INT32), j_is_ptx));
+      cudf::transform(*column, n_udf, cudf::data_type(cudf::type_id::INT32), j_is_ptx));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringStartWith(JNIEnv *env,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringStartWith(JNIEnv* env,
                                                                        jobject j_object,
                                                                        jlong j_view_handle,
-                                                                       jlong comp_string) {
+                                                                       jlong comp_string)
+{
   JNI_NULL_CHECK(env, j_view_handle, "column is null", false);
   JNI_NULL_CHECK(env, comp_string, "comparison string scalar is null", false);
 
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column_view *column_view = reinterpret_cast<cudf::column_view *>(j_view_handle);
+    cudf::column_view* column_view = reinterpret_cast<cudf::column_view*>(j_view_handle);
     cudf::strings_column_view strings_column(*column_view);
-    cudf::string_scalar *comp_scalar = reinterpret_cast<cudf::string_scalar *>(comp_string);
+    cudf::string_scalar* comp_scalar = reinterpret_cast<cudf::string_scalar*>(comp_string);
     return release_as_jlong(cudf::strings::starts_with(strings_column, *comp_scalar));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringEndWith(JNIEnv *env, jobject j_object,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringEndWith(JNIEnv* env,
+                                                                     jobject j_object,
                                                                      jlong j_view_handle,
-                                                                     jlong comp_string) {
+                                                                     jlong comp_string)
+{
   JNI_NULL_CHECK(env, j_view_handle, "column is null", false);
   JNI_NULL_CHECK(env, comp_string, "comparison string scalar is null", false);
 
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column_view *column_view = reinterpret_cast<cudf::column_view *>(j_view_handle);
+    cudf::column_view* column_view = reinterpret_cast<cudf::column_view*>(j_view_handle);
     cudf::strings_column_view strings_column(*column_view);
-    cudf::string_scalar *comp_scalar = reinterpret_cast<cudf::string_scalar *>(comp_string);
+    cudf::string_scalar* comp_scalar = reinterpret_cast<cudf::string_scalar*>(comp_string);
     return release_as_jlong(cudf::strings::ends_with(strings_column, *comp_scalar));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringContains(JNIEnv *env, jobject j_object,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringContains(JNIEnv* env,
+                                                                      jobject j_object,
                                                                       jlong j_view_handle,
-                                                                      jlong comp_string) {
+                                                                      jlong comp_string)
+{
   JNI_NULL_CHECK(env, j_view_handle, "column is null", false);
   JNI_NULL_CHECK(env, comp_string, "comparison string scalar is null", false);
 
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column_view *column_view = reinterpret_cast<cudf::column_view *>(j_view_handle);
+    cudf::column_view* column_view = reinterpret_cast<cudf::column_view*>(j_view_handle);
     cudf::strings_column_view strings_column(*column_view);
-    cudf::string_scalar *comp_scalar = reinterpret_cast<cudf::string_scalar *>(comp_string);
+    cudf::string_scalar* comp_scalar = reinterpret_cast<cudf::string_scalar*>(comp_string);
     return release_as_jlong(cudf::strings::contains(strings_column, *comp_scalar));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_matchesRe(JNIEnv *env, jobject j_object,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_matchesRe(JNIEnv* env,
+                                                                 jobject j_object,
                                                                  jlong j_view_handle,
                                                                  jstring pattern_obj,
                                                                  jint regex_flags,
-                                                                 jint capture_groups) {
+                                                                 jint capture_groups)
+{
   JNI_NULL_CHECK(env, j_view_handle, "column is null", false);
   JNI_NULL_CHECK(env, pattern_obj, "pattern is null", false);
 
   try {
     cudf::jni::auto_set_device(env);
-    auto const column_view = reinterpret_cast<cudf::column_view const *>(j_view_handle);
+    auto const column_view    = reinterpret_cast<cudf::column_view const*>(j_view_handle);
     auto const strings_column = cudf::strings_column_view{*column_view};
-    auto const pattern = cudf::jni::native_jstring(env, pattern_obj);
-    auto const flags = static_cast<cudf::strings::regex_flags>(regex_flags);
-    auto const groups = static_cast<cudf::strings::capture_groups>(capture_groups);
-    auto const regex_prog = cudf::strings::regex_program::create(pattern.get(), flags, groups);
+    auto const pattern        = cudf::jni::native_jstring(env, pattern_obj);
+    auto const flags          = static_cast<cudf::strings::regex_flags>(regex_flags);
+    auto const groups         = static_cast<cudf::strings::capture_groups>(capture_groups);
+    auto const regex_prog     = cudf::strings::regex_program::create(pattern.get(), flags, groups);
     return release_as_jlong(cudf::strings::matches_re(strings_column, *regex_prog));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_containsRe(JNIEnv *env, jobject j_object,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_containsRe(JNIEnv* env,
+                                                                  jobject j_object,
                                                                   jlong j_view_handle,
                                                                   jstring pattern_obj,
                                                                   jint regex_flags,
-                                                                  jint capture_groups) {
+                                                                  jint capture_groups)
+{
   JNI_NULL_CHECK(env, j_view_handle, "column is null", false);
   JNI_NULL_CHECK(env, pattern_obj, "pattern is null", false);
 
   try {
     cudf::jni::auto_set_device(env);
-    auto const column_view = reinterpret_cast<cudf::column_view const *>(j_view_handle);
+    auto const column_view    = reinterpret_cast<cudf::column_view const*>(j_view_handle);
     auto const strings_column = cudf::strings_column_view{*column_view};
-    auto const pattern = cudf::jni::native_jstring(env, pattern_obj);
-    auto const flags = static_cast<cudf::strings::regex_flags>(regex_flags);
-    auto const capture = static_cast<cudf::strings::capture_groups>(capture_groups);
-    auto const regex_prog = cudf::strings::regex_program::create(pattern.get(), flags, capture);
+    auto const pattern        = cudf::jni::native_jstring(env, pattern_obj);
+    auto const flags          = static_cast<cudf::strings::regex_flags>(regex_flags);
+    auto const capture        = static_cast<cudf::strings::capture_groups>(capture_groups);
+    auto const regex_prog     = cudf::strings::regex_program::create(pattern.get(), flags, capture);
     return release_as_jlong(cudf::strings::contains_re(strings_column, *regex_prog));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_like(JNIEnv *env, jobject j_object,
-                                                            jlong j_view_handle, jlong pattern,
-                                                            jlong escapeChar) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_like(
+  JNIEnv* env, jobject j_object, jlong j_view_handle, jlong pattern, jlong escapeChar)
+{
   JNI_NULL_CHECK(env, j_view_handle, "column is null", false);
   JNI_NULL_CHECK(env, pattern, "pattern is null", false);
   JNI_NULL_CHECK(env, escapeChar, "escape character is null", false);
 
   try {
     cudf::jni::auto_set_device(env);
-    auto const column_view = reinterpret_cast<cudf::column_view const *>(j_view_handle);
+    auto const column_view    = reinterpret_cast<cudf::column_view const*>(j_view_handle);
     auto const strings_column = cudf::strings_column_view{*column_view};
-    auto const pattern_scalar = reinterpret_cast<cudf::string_scalar const *>(pattern);
-    auto const escape_scalar = reinterpret_cast<cudf::string_scalar const *>(escapeChar);
+    auto const pattern_scalar = reinterpret_cast<cudf::string_scalar const*>(pattern);
+    auto const escape_scalar  = reinterpret_cast<cudf::string_scalar const*>(escapeChar);
     return release_as_jlong(cudf::strings::like(strings_column, *pattern_scalar, *escape_scalar));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_binaryOpVV(JNIEnv *env, jclass,
-                                                                  jlong lhs_view, jlong rhs_view,
-                                                                  jint int_op, jint out_dtype,
-                                                                  jint scale) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_binaryOpVV(
+  JNIEnv* env, jclass, jlong lhs_view, jlong rhs_view, jint int_op, jint out_dtype, jint scale)
+{
   JNI_NULL_CHECK(env, lhs_view, "lhs is null", 0);
   JNI_NULL_CHECK(env, rhs_view, "rhs is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto lhs = reinterpret_cast<cudf::column_view *>(lhs_view);
-    auto rhs = reinterpret_cast<cudf::column_view *>(rhs_view);
+    auto lhs                    = reinterpret_cast<cudf::column_view*>(lhs_view);
+    auto rhs                    = reinterpret_cast<cudf::column_view*>(rhs_view);
     cudf::data_type n_data_type = cudf::jni::make_data_type(out_dtype, scale);
-    cudf::binary_operator op = static_cast<cudf::binary_operator>(int_op);
+    cudf::binary_operator op    = static_cast<cudf::binary_operator>(int_op);
 
     if (lhs->type().id() == cudf::type_id::STRUCT) {
       auto out = make_fixed_width_column(n_data_type, lhs->size(), cudf::mask_state::UNALLOCATED);
@@ -1476,7 +1610,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_binaryOpVV(JNIEnv *env, j
 
       auto out_view = out->mutable_view();
       cudf::binops::compiled::detail::apply_sorting_struct_binary_op(
-          out_view, *lhs, *rhs, false, false, op, cudf::get_default_stream());
+        out_view, *lhs, *rhs, false, false, op, cudf::get_default_stream());
       return release_as_jlong(out);
     }
 
@@ -1485,30 +1619,28 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_binaryOpVV(JNIEnv *env, j
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jint JNICALL Java_ai_rapids_cudf_ColumnView_fixedPointOutputScale(JNIEnv *env, jclass,
-                                                                            jint int_op,
-                                                                            jint lhs_scale,
-                                                                            jint rhs_scale) {
+JNIEXPORT jint JNICALL Java_ai_rapids_cudf_ColumnView_fixedPointOutputScale(
+  JNIEnv* env, jclass, jint int_op, jint lhs_scale, jint rhs_scale)
+{
   try {
     // we just return the scale as the types will be the same as the lhs input
-    return cudf::binary_operation_fixed_point_scale(static_cast<cudf::binary_operator>(int_op),
-                                                    lhs_scale, rhs_scale);
+    return cudf::binary_operation_fixed_point_scale(
+      static_cast<cudf::binary_operator>(int_op), lhs_scale, rhs_scale);
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_binaryOpVS(JNIEnv *env, jclass,
-                                                                  jlong lhs_view, jlong rhs_ptr,
-                                                                  jint int_op, jint out_dtype,
-                                                                  jint scale) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_binaryOpVS(
+  JNIEnv* env, jclass, jlong lhs_view, jlong rhs_ptr, jint int_op, jint out_dtype, jint scale)
+{
   JNI_NULL_CHECK(env, lhs_view, "lhs is null", 0);
   JNI_NULL_CHECK(env, rhs_ptr, "rhs is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto lhs = reinterpret_cast<cudf::column_view *>(lhs_view);
-    cudf::scalar *rhs = reinterpret_cast<cudf::scalar *>(rhs_ptr);
+    auto lhs                    = reinterpret_cast<cudf::column_view*>(lhs_view);
+    cudf::scalar* rhs           = reinterpret_cast<cudf::scalar*>(rhs_ptr);
     cudf::data_type n_data_type = cudf::jni::make_data_type(out_dtype, scale);
-    cudf::binary_operator op = static_cast<cudf::binary_operator>(int_op);
+    cudf::binary_operator op    = static_cast<cudf::binary_operator>(int_op);
 
     if (lhs->type().id() == cudf::type_id::STRUCT) {
       auto out = make_fixed_width_column(n_data_type, lhs->size(), cudf::mask_state::UNALLOCATED);
@@ -1520,10 +1652,10 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_binaryOpVS(JNIEnv *env, j
         out->set_null_mask(std::move(new_mask), new_null_count);
       }
 
-      auto rhsv = cudf::make_column_from_scalar(*rhs, 1);
+      auto rhsv     = cudf::make_column_from_scalar(*rhs, 1);
       auto out_view = out->mutable_view();
       cudf::binops::compiled::detail::apply_sorting_struct_binary_op(
-          out_view, *lhs, rhsv->view(), false, true, op, cudf::get_default_stream());
+        out_view, *lhs, rhsv->view(), false, true, op, cudf::get_default_stream());
       return release_as_jlong(out);
     }
 
@@ -1532,233 +1664,251 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_binaryOpVS(JNIEnv *env, j
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_substringS(JNIEnv *env, jclass,
-                                                                  jlong cv_handle, jint start) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_substringS(JNIEnv* env,
+                                                                  jclass,
+                                                                  jlong cv_handle,
+                                                                  jint start)
+{
   JNI_NULL_CHECK(env, cv_handle, "column is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto const cv = reinterpret_cast<cudf::column_view const *>(cv_handle);
+    auto const cv  = reinterpret_cast<cudf::column_view const*>(cv_handle);
     auto const scv = cudf::strings_column_view{*cv};
     return release_as_jlong(cudf::strings::slice_strings(scv, start));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_substring(JNIEnv *env, jclass,
-                                                                 jlong column_view, jint start,
-                                                                 jint end) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_substring(
+  JNIEnv* env, jclass, jlong column_view, jint start, jint end)
+{
   JNI_NULL_CHECK(env, column_view, "column is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column_view *cv = reinterpret_cast<cudf::column_view *>(column_view);
+    cudf::column_view* cv = reinterpret_cast<cudf::column_view*>(column_view);
     cudf::strings_column_view scv(*cv);
     return release_as_jlong(cudf::strings::slice_strings(scv, start, end));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_substringColumn(JNIEnv *env, jclass,
-                                                                       jlong column_view,
-                                                                       jlong start_column,
-                                                                       jlong end_column) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_substringColumn(
+  JNIEnv* env, jclass, jlong column_view, jlong start_column, jlong end_column)
+{
   JNI_NULL_CHECK(env, column_view, "column is null", 0);
   JNI_NULL_CHECK(env, start_column, "column is null", 0);
   JNI_NULL_CHECK(env, end_column, "column is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column_view *cv = reinterpret_cast<cudf::column_view *>(column_view);
+    cudf::column_view* cv = reinterpret_cast<cudf::column_view*>(column_view);
     cudf::strings_column_view scv(*cv);
-    cudf::column_view *sc = reinterpret_cast<cudf::column_view *>(start_column);
-    cudf::column_view *ec = reinterpret_cast<cudf::column_view *>(end_column);
+    cudf::column_view* sc = reinterpret_cast<cudf::column_view*>(start_column);
+    cudf::column_view* ec = reinterpret_cast<cudf::column_view*>(end_column);
     return release_as_jlong(cudf::strings::slice_strings(scv, *sc, *ec));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_substringLocate(JNIEnv *env, jclass,
-                                                                       jlong column_view,
-                                                                       jlong substring, jint start,
-                                                                       jint end) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_substringLocate(
+  JNIEnv* env, jclass, jlong column_view, jlong substring, jint start, jint end)
+{
   JNI_NULL_CHECK(env, column_view, "column is null", 0);
   JNI_NULL_CHECK(env, substring, "target string scalar is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column_view *cv = reinterpret_cast<cudf::column_view *>(column_view);
+    cudf::column_view* cv = reinterpret_cast<cudf::column_view*>(column_view);
     cudf::strings_column_view scv(*cv);
-    cudf::string_scalar *ss_scalar = reinterpret_cast<cudf::string_scalar *>(substring);
+    cudf::string_scalar* ss_scalar = reinterpret_cast<cudf::string_scalar*>(substring);
     return release_as_jlong(cudf::strings::find(scv, *ss_scalar, start, end));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringReplace(JNIEnv *env, jclass,
-                                                                     jlong column_view,
-                                                                     jlong target, jlong replace) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringReplace(
+  JNIEnv* env, jclass, jlong column_view, jlong target, jlong replace)
+{
   JNI_NULL_CHECK(env, column_view, "column is null", 0);
   JNI_NULL_CHECK(env, target, "target string scalar is null", 0);
   JNI_NULL_CHECK(env, replace, "replace string scalar is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column_view *cv = reinterpret_cast<cudf::column_view *>(column_view);
+    cudf::column_view* cv = reinterpret_cast<cudf::column_view*>(column_view);
     cudf::strings_column_view scv(*cv);
-    cudf::string_scalar *ss_target = reinterpret_cast<cudf::string_scalar *>(target);
-    cudf::string_scalar *ss_replace = reinterpret_cast<cudf::string_scalar *>(replace);
+    cudf::string_scalar* ss_target  = reinterpret_cast<cudf::string_scalar*>(target);
+    cudf::string_scalar* ss_replace = reinterpret_cast<cudf::string_scalar*>(replace);
     return release_as_jlong(cudf::strings::replace(scv, *ss_target, *ss_replace));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringReplaceMulti(JNIEnv *env, jclass,
-                                                                          jlong inputs_cv,
-                                                                          jlong targets_cv,
-                                                                          jlong repls_cv) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringReplaceMulti(
+  JNIEnv* env, jclass, jlong inputs_cv, jlong targets_cv, jlong repls_cv)
+{
   JNI_NULL_CHECK(env, inputs_cv, "column is null", 0);
   JNI_NULL_CHECK(env, targets_cv, "targets string column view is null", 0);
   JNI_NULL_CHECK(env, repls_cv, "repls string column view is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column_view *cv = reinterpret_cast<cudf::column_view *>(inputs_cv);
+    cudf::column_view* cv = reinterpret_cast<cudf::column_view*>(inputs_cv);
     cudf::strings_column_view scv(*cv);
-    cudf::column_view *cvtargets = reinterpret_cast<cudf::column_view *>(targets_cv);
+    cudf::column_view* cvtargets = reinterpret_cast<cudf::column_view*>(targets_cv);
     cudf::strings_column_view scvtargets(*cvtargets);
-    cudf::column_view *cvrepls = reinterpret_cast<cudf::column_view *>(repls_cv);
+    cudf::column_view* cvrepls = reinterpret_cast<cudf::column_view*>(repls_cv);
     cudf::strings_column_view scvrepls(*cvrepls);
     return release_as_jlong(cudf::strings::replace(scv, scvtargets, scvrepls));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_mapLookupForKeys(JNIEnv *env, jclass,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_mapLookupForKeys(JNIEnv* env,
+                                                                        jclass,
                                                                         jlong map_column_view,
-                                                                        jlong lookup_keys) {
+                                                                        jlong lookup_keys)
+{
   JNI_NULL_CHECK(env, map_column_view, "column is null", 0);
   JNI_NULL_CHECK(env, lookup_keys, "lookup key is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto const *cv = reinterpret_cast<cudf::column_view *>(map_column_view);
-    auto const *column_keys = reinterpret_cast<cudf::column_view *>(lookup_keys);
-    auto const maps_view = cudf::jni::maps_column_view{*cv};
+    auto const* cv          = reinterpret_cast<cudf::column_view*>(map_column_view);
+    auto const* column_keys = reinterpret_cast<cudf::column_view*>(lookup_keys);
+    auto const maps_view    = cudf::jni::maps_column_view{*cv};
     return release_as_jlong(maps_view.get_values_for(*column_keys));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_mapLookup(JNIEnv *env, jclass,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_mapLookup(JNIEnv* env,
+                                                                 jclass,
                                                                  jlong map_column_view,
-                                                                 jlong lookup_key) {
+                                                                 jlong lookup_key)
+{
   JNI_NULL_CHECK(env, map_column_view, "column is null", 0);
   JNI_NULL_CHECK(env, lookup_key, "lookup key is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto const *cv = reinterpret_cast<cudf::column_view *>(map_column_view);
-    auto const *scalar_key = reinterpret_cast<cudf::scalar *>(lookup_key);
-    auto const maps_view = cudf::jni::maps_column_view{*cv};
+    auto const* cv         = reinterpret_cast<cudf::column_view*>(map_column_view);
+    auto const* scalar_key = reinterpret_cast<cudf::scalar*>(lookup_key);
+    auto const maps_view   = cudf::jni::maps_column_view{*cv};
     return release_as_jlong(maps_view.get_values_for(*scalar_key));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_mapContainsKeys(JNIEnv *env, jclass,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_mapContainsKeys(JNIEnv* env,
+                                                                       jclass,
                                                                        jlong map_column_view,
-                                                                       jlong lookup_keys) {
+                                                                       jlong lookup_keys)
+{
   JNI_NULL_CHECK(env, map_column_view, "column is null", 0);
   JNI_NULL_CHECK(env, lookup_keys, "lookup key is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto const *cv = reinterpret_cast<cudf::column_view *>(map_column_view);
-    auto const *column_key = reinterpret_cast<cudf::column_view *>(lookup_keys);
-    auto const maps_view = cudf::jni::maps_column_view{*cv};
+    auto const* cv         = reinterpret_cast<cudf::column_view*>(map_column_view);
+    auto const* column_key = reinterpret_cast<cudf::column_view*>(lookup_keys);
+    auto const maps_view   = cudf::jni::maps_column_view{*cv};
     return release_as_jlong(maps_view.contains(*column_key));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_mapContains(JNIEnv *env, jclass,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_mapContains(JNIEnv* env,
+                                                                   jclass,
                                                                    jlong map_column_view,
-                                                                   jlong lookup_key) {
+                                                                   jlong lookup_key)
+{
   JNI_NULL_CHECK(env, map_column_view, "column is null", 0);
   JNI_NULL_CHECK(env, lookup_key, "lookup key is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto const *cv = reinterpret_cast<cudf::column_view *>(map_column_view);
-    auto const *scalar_key = reinterpret_cast<cudf::scalar *>(lookup_key);
-    auto const maps_view = cudf::jni::maps_column_view{*cv};
+    auto const* cv         = reinterpret_cast<cudf::column_view*>(map_column_view);
+    auto const* scalar_key = reinterpret_cast<cudf::scalar*>(lookup_key);
+    auto const maps_view   = cudf::jni::maps_column_view{*cv};
     return release_as_jlong(maps_view.contains(*scalar_key));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_replaceRegex(
-    JNIEnv *env, jclass, jlong j_column_view, jstring j_pattern, jint regex_flags,
-    jint capture_groups, jlong j_repl, jlong j_maxrepl) {
-
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_replaceRegex(JNIEnv* env,
+                                                                    jclass,
+                                                                    jlong j_column_view,
+                                                                    jstring j_pattern,
+                                                                    jint regex_flags,
+                                                                    jint capture_groups,
+                                                                    jlong j_repl,
+                                                                    jlong j_maxrepl)
+{
   JNI_NULL_CHECK(env, j_column_view, "column is null", 0);
   JNI_NULL_CHECK(env, j_pattern, "pattern string is null", 0);
   JNI_NULL_CHECK(env, j_repl, "replace scalar is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto const cv = reinterpret_cast<cudf::column_view const *>(j_column_view);
+    auto const cv             = reinterpret_cast<cudf::column_view const*>(j_column_view);
     auto const strings_column = cudf::strings_column_view{*cv};
-    auto const pattern = cudf::jni::native_jstring(env, j_pattern);
-    auto const flags = static_cast<cudf::strings::regex_flags>(regex_flags);
-    auto const groups = static_cast<cudf::strings::capture_groups>(capture_groups);
-    auto const regex_prog = cudf::strings::regex_program::create(pattern.get(), flags, groups);
-    auto const repl = reinterpret_cast<cudf::string_scalar const *>(j_repl);
+    auto const pattern        = cudf::jni::native_jstring(env, j_pattern);
+    auto const flags          = static_cast<cudf::strings::regex_flags>(regex_flags);
+    auto const groups         = static_cast<cudf::strings::capture_groups>(capture_groups);
+    auto const regex_prog     = cudf::strings::regex_program::create(pattern.get(), flags, groups);
+    auto const repl           = reinterpret_cast<cudf::string_scalar const*>(j_repl);
     return release_as_jlong(
-        cudf::strings::replace_re(strings_column, *regex_prog, *repl, j_maxrepl));
+      cudf::strings::replace_re(strings_column, *regex_prog, *repl, j_maxrepl));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_replaceMultiRegex(JNIEnv *env, jclass,
-                                                                         jlong j_column_view,
-                                                                         jobjectArray j_patterns,
-                                                                         jlong j_repls) {
-
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_replaceMultiRegex(
+  JNIEnv* env, jclass, jlong j_column_view, jobjectArray j_patterns, jlong j_repls)
+{
   JNI_NULL_CHECK(env, j_column_view, "column is null", 0);
   JNI_NULL_CHECK(env, j_patterns, "patterns is null", 0);
   JNI_NULL_CHECK(env, j_repls, "repls is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto cv = reinterpret_cast<cudf::column_view const *>(j_column_view);
+    auto cv = reinterpret_cast<cudf::column_view const*>(j_column_view);
     cudf::strings_column_view scv(*cv);
     cudf::jni::native_jstringArray patterns(env, j_patterns);
-    auto repl_cv = reinterpret_cast<cudf::column_view const *>(j_repls);
+    auto repl_cv = reinterpret_cast<cudf::column_view const*>(j_repls);
     cudf::strings_column_view repl_scv(*repl_cv);
     return release_as_jlong(cudf::strings::replace_re(scv, patterns.as_cpp_vector(), repl_scv));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringReplaceWithBackrefs(
-    JNIEnv *env, jclass, jlong j_column_view, jstring pattern_obj, jint regex_flags,
-    jint capture_groups, jstring replace_obj) {
-
+JNIEXPORT jlong JNICALL
+Java_ai_rapids_cudf_ColumnView_stringReplaceWithBackrefs(JNIEnv* env,
+                                                         jclass,
+                                                         jlong j_column_view,
+                                                         jstring pattern_obj,
+                                                         jint regex_flags,
+                                                         jint capture_groups,
+                                                         jstring replace_obj)
+{
   JNI_NULL_CHECK(env, j_column_view, "column is null", 0);
   JNI_NULL_CHECK(env, pattern_obj, "pattern string is null", 0);
   JNI_NULL_CHECK(env, replace_obj, "replace string is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto const cv = reinterpret_cast<cudf::column_view const *>(j_column_view);
+    auto const cv             = reinterpret_cast<cudf::column_view const*>(j_column_view);
     auto const strings_column = cudf::strings_column_view{*cv};
-    auto const pattern = cudf::jni::native_jstring(env, pattern_obj);
-    auto const flags = static_cast<cudf::strings::regex_flags>(regex_flags);
-    auto const groups = static_cast<cudf::strings::capture_groups>(capture_groups);
-    auto const regex_prog = cudf::strings::regex_program::create(pattern.get(), flags, groups);
+    auto const pattern        = cudf::jni::native_jstring(env, pattern_obj);
+    auto const flags          = static_cast<cudf::strings::regex_flags>(regex_flags);
+    auto const groups         = static_cast<cudf::strings::capture_groups>(capture_groups);
+    auto const regex_prog     = cudf::strings::regex_program::create(pattern.get(), flags, groups);
     cudf::jni::native_jstring ss_replace(env, replace_obj);
     return release_as_jlong(
-        cudf::strings::replace_with_backrefs(strings_column, *regex_prog, ss_replace.get()));
+      cudf::strings::replace_with_backrefs(strings_column, *regex_prog, ss_replace.get()));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_zfill(JNIEnv *env, jclass, jlong column_view,
-                                                             jint j_width) {
-
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_zfill(JNIEnv* env,
+                                                             jclass,
+                                                             jlong column_view,
+                                                             jint j_width)
+{
   JNI_NULL_CHECK(env, column_view, "column is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column_view *cv = reinterpret_cast<cudf::column_view *>(column_view);
+    cudf::column_view* cv = reinterpret_cast<cudf::column_view*>(column_view);
     cudf::strings_column_view scv(*cv);
     cudf::size_type width = reinterpret_cast<cudf::size_type>(j_width);
     return release_as_jlong(cudf::strings::zfill(scv, width));
@@ -1766,17 +1916,16 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_zfill(JNIEnv *env, jclass
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_pad(JNIEnv *env, jclass, jlong column_view,
-                                                           jint j_width, jint j_side,
-                                                           jstring fill_char) {
-
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_pad(
+  JNIEnv* env, jclass, jlong column_view, jint j_width, jint j_side, jstring fill_char)
+{
   JNI_NULL_CHECK(env, column_view, "column is null", 0);
   JNI_NULL_CHECK(env, fill_char, "fill_char is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column_view *cv = reinterpret_cast<cudf::column_view *>(column_view);
+    cudf::column_view* cv = reinterpret_cast<cudf::column_view*>(column_view);
     cudf::strings_column_view scv(*cv);
-    cudf::size_type width = reinterpret_cast<cudf::size_type>(j_width);
+    cudf::size_type width         = reinterpret_cast<cudf::size_type>(j_width);
     cudf::strings::side_type side = static_cast<cudf::strings::side_type>(j_side);
     cudf::jni::native_jstring ss_fill(env, fill_char);
     return release_as_jlong(cudf::strings::pad(scv, width, side, ss_fill.get()));
@@ -1784,113 +1933,125 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_pad(JNIEnv *env, jclass,
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringStrip(JNIEnv *env, jclass,
-                                                                   jlong column_view,
-                                                                   jint strip_type,
-                                                                   jlong to_strip) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringStrip(
+  JNIEnv* env, jclass, jlong column_view, jint strip_type, jlong to_strip)
+{
   JNI_NULL_CHECK(env, column_view, "column is null", 0);
   JNI_NULL_CHECK(env, to_strip, "to_strip scalar is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column_view *cv = reinterpret_cast<cudf::column_view *>(column_view);
+    cudf::column_view* cv = reinterpret_cast<cudf::column_view*>(column_view);
     cudf::strings_column_view scv(*cv);
     cudf::strings::side_type s_striptype = static_cast<cudf::strings::side_type>(strip_type);
-    cudf::string_scalar *ss_tostrip = reinterpret_cast<cudf::string_scalar *>(to_strip);
+    cudf::string_scalar* ss_tostrip      = reinterpret_cast<cudf::string_scalar*>(to_strip);
     return release_as_jlong(cudf::strings::strip(scv, s_striptype, *ss_tostrip));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_extractRe(JNIEnv *env, jclass,
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ColumnView_extractRe(JNIEnv* env,
+                                                                      jclass,
                                                                       jlong j_view_handle,
                                                                       jstring pattern_obj,
                                                                       jint regex_flags,
-                                                                      jint capture_groups) {
+                                                                      jint capture_groups)
+{
   JNI_NULL_CHECK(env, j_view_handle, "column is null", nullptr);
   JNI_NULL_CHECK(env, pattern_obj, "pattern is null", nullptr);
 
   try {
     cudf::jni::auto_set_device(env);
-    auto const column_view = reinterpret_cast<cudf::column_view const *>(j_view_handle);
+    auto const column_view    = reinterpret_cast<cudf::column_view const*>(j_view_handle);
     auto const strings_column = cudf::strings_column_view{*column_view};
-    auto const pattern = cudf::jni::native_jstring(env, pattern_obj);
-    auto const flags = static_cast<cudf::strings::regex_flags>(regex_flags);
-    auto const groups = static_cast<cudf::strings::capture_groups>(capture_groups);
-    auto const regex_prog = cudf::strings::regex_program::create(pattern.get(), flags, groups);
+    auto const pattern        = cudf::jni::native_jstring(env, pattern_obj);
+    auto const flags          = static_cast<cudf::strings::regex_flags>(regex_flags);
+    auto const groups         = static_cast<cudf::strings::capture_groups>(capture_groups);
+    auto const regex_prog     = cudf::strings::regex_program::create(pattern.get(), flags, groups);
     return cudf::jni::convert_table_for_return(env,
                                                cudf::strings::extract(strings_column, *regex_prog));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_extractAllRecord(
-    JNIEnv *env, jclass, jlong j_view_handle, jstring pattern_obj, jint regex_flags,
-    jint capture_groups, jint idx) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_extractAllRecord(JNIEnv* env,
+                                                                        jclass,
+                                                                        jlong j_view_handle,
+                                                                        jstring pattern_obj,
+                                                                        jint regex_flags,
+                                                                        jint capture_groups,
+                                                                        jint idx)
+{
   JNI_NULL_CHECK(env, j_view_handle, "column is null", 0);
   JNI_NULL_CHECK(env, pattern_obj, "pattern is null", 0);
 
   try {
     cudf::jni::auto_set_device(env);
-    auto const column_view = reinterpret_cast<cudf::column_view const *>(j_view_handle);
+    auto const column_view    = reinterpret_cast<cudf::column_view const*>(j_view_handle);
     auto const strings_column = cudf::strings_column_view{*column_view};
-    auto const pattern = cudf::jni::native_jstring(env, pattern_obj);
-    auto const flags = static_cast<cudf::strings::regex_flags>(regex_flags);
-    auto const groups = static_cast<cudf::strings::capture_groups>(capture_groups);
-    auto const regex_prog = cudf::strings::regex_program::create(pattern.get(), flags, groups);
-    auto result = (idx == 0) ? cudf::strings::findall(strings_column, *regex_prog) :
-                               cudf::strings::extract_all_record(strings_column, *regex_prog);
+    auto const pattern        = cudf::jni::native_jstring(env, pattern_obj);
+    auto const flags          = static_cast<cudf::strings::regex_flags>(regex_flags);
+    auto const groups         = static_cast<cudf::strings::capture_groups>(capture_groups);
+    auto const regex_prog     = cudf::strings::regex_program::create(pattern.get(), flags, groups);
+    auto result               = (idx == 0) ? cudf::strings::findall(strings_column, *regex_prog)
+                                           : cudf::strings::extract_all_record(strings_column, *regex_prog);
     return release_as_jlong(result);
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_urlDecode(JNIEnv *env, jclass,
-                                                                 jlong j_view_handle) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_urlDecode(JNIEnv* env,
+                                                                 jclass,
+                                                                 jlong j_view_handle)
+{
   JNI_NULL_CHECK(env, j_view_handle, "column is null", 0);
 
   try {
     cudf::jni::auto_set_device(env);
-    auto view_ptr = reinterpret_cast<cudf::column_view *>(j_view_handle);
+    auto view_ptr = reinterpret_cast<cudf::column_view*>(j_view_handle);
     cudf::strings_column_view strings_view(*view_ptr);
     return release_as_jlong(cudf::strings::url_decode(strings_view));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_urlEncode(JNIEnv *env, jclass,
-                                                                 jlong j_view_handle) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_urlEncode(JNIEnv* env,
+                                                                 jclass,
+                                                                 jlong j_view_handle)
+{
   JNI_NULL_CHECK(env, j_view_handle, "column is null", 0);
 
   try {
     cudf::jni::auto_set_device(env);
-    auto view_ptr = reinterpret_cast<cudf::column_view *>(j_view_handle);
+    auto view_ptr = reinterpret_cast<cudf::column_view*>(j_view_handle);
     cudf::strings_column_view strings_view(*view_ptr);
     return release_as_jlong(cudf::strings::url_encode(strings_view));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_normalizeNANsAndZeros(JNIEnv *env,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_normalizeNANsAndZeros(JNIEnv* env,
                                                                              jclass clazz,
-                                                                             jlong input_column) {
+                                                                             jlong input_column)
+{
   using cudf::column_view;
 
   JNI_NULL_CHECK(env, input_column, "Input column is null", 0);
   try {
     cudf::jni::auto_set_device(env);
     return release_as_jlong(
-        cudf::normalize_nans_and_zeros(*reinterpret_cast<column_view *>(input_column)));
+      cudf::normalize_nans_and_zeros(*reinterpret_cast<column_view*>(input_column)));
   }
   CATCH_STD(env, 0);
 }
 
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_bitwiseMergeAndSetValidity(
-    JNIEnv *env, jobject j_object, jlong base_column, jlongArray column_handles, jint bin_op) {
+  JNIEnv* env, jobject j_object, jlong base_column, jlongArray column_handles, jint bin_op)
+{
   JNI_NULL_CHECK(env, base_column, "base column native handle is null", 0);
   JNI_NULL_CHECK(env, column_handles, "array of column handles is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column_view *original_column = reinterpret_cast<cudf::column_view *>(base_column);
+    cudf::column_view* original_column = reinterpret_cast<cudf::column_view*>(base_column);
     std::unique_ptr<cudf::column> copy(new cudf::column(*original_column));
     cudf::jni::native_jpointerArray<cudf::column_view> n_cudf_columns(env, column_handles);
 
@@ -1904,7 +2065,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_bitwiseMergeAndSetValidit
       case cudf::binary_operator::BITWISE_AND: {
         auto cols = n_cudf_columns.get_dereferenced();
         cols.push_back(copy->view());
-        auto table_view = cudf::table_view{cols};
+        auto table_view                = cudf::table_view{cols};
         auto [new_bitmask, null_count] = cudf::bitmask_and(table_view);
         copy->set_null_mask(std::move(new_bitmask), null_count);
         break;
@@ -1922,9 +2083,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_bitwiseMergeAndSetValidit
       default: JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "Unsupported merge operation", 0);
     }
     auto const copy_cv = copy->view();
-    if (cudf::has_nonempty_nulls(copy_cv)) {
-      copy = cudf::purge_nonempty_nulls(copy_cv);
-    }
+    if (cudf::has_nonempty_nulls(copy_cv)) { copy = cudf::purge_nonempty_nulls(copy_cv); }
 
     return release_as_jlong(copy);
   }
@@ -1932,15 +2091,16 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_bitwiseMergeAndSetValidit
 }
 
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_copyWithBooleanColumnAsValidity(
-    JNIEnv *env, jobject j_object, jlong exemplar_handle, jlong validity_column_handle) {
+  JNIEnv* env, jobject j_object, jlong exemplar_handle, jlong validity_column_handle)
+{
   JNI_NULL_CHECK(env, exemplar_handle, "ColumnView handle is null", 0);
   JNI_NULL_CHECK(env, validity_column_handle, "Validity column handle is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto const exemplar = *reinterpret_cast<cudf::column_view *>(exemplar_handle);
-    auto const validity = *reinterpret_cast<cudf::column_view *>(validity_column_handle);
+    auto const exemplar = *reinterpret_cast<cudf::column_view*>(exemplar_handle);
+    auto const validity = *reinterpret_cast<cudf::column_view*>(validity_column_handle);
     return release_as_jlong(
-        cudf::jni::new_column_with_boolean_column_as_validity(exemplar, validity));
+      cudf::jni::new_column_with_boolean_column_as_validity(exemplar, validity));
   }
   CATCH_STD(env, 0);
 }
@@ -1950,23 +2110,29 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_copyWithBooleanColumnAsVa
 // should typically only be called from the CudfColumn inner class.
 ////////
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_makeCudfColumnView(
-    JNIEnv *env, jclass, jint j_type, jint scale, jlong j_data, jlong j_data_size, jlong j_offset,
-    jlong j_valid, jint j_null_count, jint size, jlongArray j_children) {
-
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_makeCudfColumnView(JNIEnv* env,
+                                                                          jclass,
+                                                                          jint j_type,
+                                                                          jint scale,
+                                                                          jlong j_data,
+                                                                          jlong j_data_size,
+                                                                          jlong j_offset,
+                                                                          jlong j_valid,
+                                                                          jint j_null_count,
+                                                                          jint size,
+                                                                          jlongArray j_children)
+{
   try {
     using cudf::column_view;
     cudf::jni::auto_set_device(env);
-    cudf::type_id n_type = static_cast<cudf::type_id>(j_type);
+    cudf::type_id n_type        = static_cast<cudf::type_id>(j_type);
     cudf::data_type n_data_type = cudf::jni::make_data_type(j_type, scale);
 
-    void *data = reinterpret_cast<void *>(j_data);
-    cudf::bitmask_type *valid = reinterpret_cast<cudf::bitmask_type *>(j_valid);
-    if (valid == nullptr) {
-      j_null_count = 0;
-    }
+    void* data                = reinterpret_cast<void*>(j_data);
+    cudf::bitmask_type* valid = reinterpret_cast<cudf::bitmask_type*>(j_valid);
+    if (valid == nullptr) { j_null_count = 0; }
 
-    if (j_null_count < 0) { // Check for unknown null count.
+    if (j_null_count < 0) {  // Check for unknown null count.
       // Calculate concrete null count.
       j_null_count = cudf::null_count(valid, 0, size);
     }
@@ -1974,37 +2140,51 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_makeCudfColumnView(
     if (n_type == cudf::type_id::STRING) {
       if (size == 0) {
         return ptr_as_jlong(
-            new cudf::column_view(cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0));
+          new cudf::column_view(cudf::data_type{cudf::type_id::STRING}, 0, nullptr, nullptr, 0));
       } else {
         JNI_NULL_CHECK(env, j_offset, "offset is null", 0);
-        cudf::size_type *offsets = reinterpret_cast<cudf::size_type *>(j_offset);
-        cudf::column_view offsets_column(cudf::data_type{cudf::type_id::INT32}, size + 1, offsets,
-                                         nullptr, 0);
-        return ptr_as_jlong(new cudf::column_view(cudf::data_type{cudf::type_id::STRING}, size,
-                                                  data, valid, j_null_count, 0, {offsets_column}));
+        cudf::size_type* offsets = reinterpret_cast<cudf::size_type*>(j_offset);
+        cudf::column_view offsets_column(
+          cudf::data_type{cudf::type_id::INT32}, size + 1, offsets, nullptr, 0);
+        return ptr_as_jlong(new cudf::column_view(cudf::data_type{cudf::type_id::STRING},
+                                                  size,
+                                                  data,
+                                                  valid,
+                                                  j_null_count,
+                                                  0,
+                                                  {offsets_column}));
       }
     } else if (n_type == cudf::type_id::LIST) {
       JNI_NULL_CHECK(env, j_children, "children of a list are null", 0);
       cudf::jni::native_jpointerArray<cudf::column_view> children(env, j_children);
       JNI_ARG_CHECK(env, (children.size() == 1), "LIST children size is not 1", 0);
       cudf::size_type offsets_size = 0;
-      cudf::size_type *offsets = nullptr;
+      cudf::size_type* offsets     = nullptr;
       if (size != 0) {
         JNI_NULL_CHECK(env, j_offset, "offset is null", 0);
         offsets_size = size + 1;
-        offsets = reinterpret_cast<cudf::size_type *>(j_offset);
+        offsets      = reinterpret_cast<cudf::size_type*>(j_offset);
       }
-      cudf::column_view offsets_column(cudf::data_type{cudf::type_id::INT32}, offsets_size, offsets,
-                                       nullptr, 0);
-      return ptr_as_jlong(new cudf::column_view(cudf::data_type{cudf::type_id::LIST}, size, nullptr,
-                                                valid, j_null_count, 0,
+      cudf::column_view offsets_column(
+        cudf::data_type{cudf::type_id::INT32}, offsets_size, offsets, nullptr, 0);
+      return ptr_as_jlong(new cudf::column_view(cudf::data_type{cudf::type_id::LIST},
+                                                size,
+                                                nullptr,
+                                                valid,
+                                                j_null_count,
+                                                0,
                                                 {offsets_column, *children[0]}));
     } else if (n_type == cudf::type_id::STRUCT) {
       JNI_NULL_CHECK(env, j_children, "children of a struct are null", 0);
       cudf::jni::native_jpointerArray<cudf::column_view> children(env, j_children);
       std::vector<column_view> children_vector = children.get_dereferenced();
-      return ptr_as_jlong(new cudf::column_view(cudf::data_type{cudf::type_id::STRUCT}, size,
-                                                nullptr, valid, j_null_count, 0, children_vector));
+      return ptr_as_jlong(new cudf::column_view(cudf::data_type{cudf::type_id::STRUCT},
+                                                size,
+                                                nullptr,
+                                                valid,
+                                                j_null_count,
+                                                0,
+                                                children_vector));
     } else {
       return ptr_as_jlong(new cudf::column_view(n_data_type, size, data, valid, j_null_count));
     }
@@ -2012,69 +2192,79 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_makeCudfColumnView(
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jint JNICALL Java_ai_rapids_cudf_ColumnView_getNativeTypeId(JNIEnv *env, jobject j_object,
-                                                                      jlong handle) {
+JNIEXPORT jint JNICALL Java_ai_rapids_cudf_ColumnView_getNativeTypeId(JNIEnv* env,
+                                                                      jobject j_object,
+                                                                      jlong handle)
+{
   JNI_NULL_CHECK(env, handle, "native handle is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column_view *column = reinterpret_cast<cudf::column_view *>(handle);
+    cudf::column_view* column = reinterpret_cast<cudf::column_view*>(handle);
     return static_cast<jint>(column->type().id());
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jint JNICALL Java_ai_rapids_cudf_ColumnView_getNativeTypeScale(JNIEnv *env, jclass,
-                                                                         jlong handle) {
+JNIEXPORT jint JNICALL Java_ai_rapids_cudf_ColumnView_getNativeTypeScale(JNIEnv* env,
+                                                                         jclass,
+                                                                         jlong handle)
+{
   JNI_NULL_CHECK(env, handle, "native handle is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column_view *column = reinterpret_cast<cudf::column_view *>(handle);
+    cudf::column_view* column = reinterpret_cast<cudf::column_view*>(handle);
     return column->type().scale();
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jint JNICALL Java_ai_rapids_cudf_ColumnView_getNativeRowCount(JNIEnv *env, jclass,
-                                                                        jlong handle) {
+JNIEXPORT jint JNICALL Java_ai_rapids_cudf_ColumnView_getNativeRowCount(JNIEnv* env,
+                                                                        jclass,
+                                                                        jlong handle)
+{
   JNI_NULL_CHECK(env, handle, "native handle is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column_view *column = reinterpret_cast<cudf::column_view *>(handle);
+    cudf::column_view* column = reinterpret_cast<cudf::column_view*>(handle);
     return static_cast<jint>(column->size());
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jint JNICALL Java_ai_rapids_cudf_ColumnView_getNativeNullCount(JNIEnv *env,
+JNIEXPORT jint JNICALL Java_ai_rapids_cudf_ColumnView_getNativeNullCount(JNIEnv* env,
                                                                          jobject j_object,
-                                                                         jlong handle) {
+                                                                         jlong handle)
+{
   JNI_NULL_CHECK(env, handle, "native handle is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column_view *column = reinterpret_cast<cudf::column_view *>(handle);
+    cudf::column_view* column = reinterpret_cast<cudf::column_view*>(handle);
     return static_cast<jint>(column->null_count());
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_ColumnView_deleteColumnView(JNIEnv *env,
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_ColumnView_deleteColumnView(JNIEnv* env,
                                                                        jobject j_object,
-                                                                       jlong handle) {
+                                                                       jlong handle)
+{
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column_view *view = reinterpret_cast<cudf::column_view *>(handle);
+    cudf::column_view* view = reinterpret_cast<cudf::column_view*>(handle);
     delete view;
   }
   CATCH_STD(env, );
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getNativeDataAddress(JNIEnv *env, jclass,
-                                                                            jlong handle) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getNativeDataAddress(JNIEnv* env,
+                                                                            jclass,
+                                                                            jlong handle)
+{
   JNI_NULL_CHECK(env, handle, "native handle is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    jlong result = 0;
-    cudf::column_view *column = reinterpret_cast<cudf::column_view *>(handle);
+    jlong result              = 0;
+    cudf::column_view* column = reinterpret_cast<cudf::column_view*>(handle);
     if (column->type().id() == cudf::type_id::STRING) {
       if (column->size() > 0) {
         cudf::strings_column_view view = cudf::strings_column_view(*column);
@@ -2089,17 +2279,19 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getNativeDataAddress(JNIE
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getNativeDataLength(JNIEnv *env, jclass,
-                                                                           jlong handle) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getNativeDataLength(JNIEnv* env,
+                                                                           jclass,
+                                                                           jlong handle)
+{
   JNI_NULL_CHECK(env, handle, "native handle is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    jlong result = 0;
-    cudf::column_view *column = reinterpret_cast<cudf::column_view *>(handle);
+    jlong result              = 0;
+    cudf::column_view* column = reinterpret_cast<cudf::column_view*>(handle);
     if (column->type().id() == cudf::type_id::STRING) {
       if (column->size() > 0) {
         cudf::strings_column_view view = cudf::strings_column_view(*column);
-        result = view.chars_size(cudf::get_default_stream());
+        result                         = view.chars_size(cudf::get_default_stream());
       }
     } else if (column->type().id() != cudf::type_id::LIST &&
                column->type().id() != cudf::type_id::STRUCT) {
@@ -2110,14 +2302,14 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getNativeDataLength(JNIEn
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jint JNICALL Java_ai_rapids_cudf_ColumnView_getNativeNumChildren(JNIEnv *env,
+JNIEXPORT jint JNICALL Java_ai_rapids_cudf_ColumnView_getNativeNumChildren(JNIEnv* env,
                                                                            jobject j_object,
-                                                                           jlong handle) {
-
+                                                                           jlong handle)
+{
   JNI_NULL_CHECK(env, handle, "native handle is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column_view *column = reinterpret_cast<cudf::column_view *>(handle);
+    cudf::column_view* column = reinterpret_cast<cudf::column_view*>(handle);
     // Strings has children(offsets and chars) but not a nested child() we care about here.
     if (column->type().id() == cudf::type_id::STRING) {
       return 0;
@@ -2133,53 +2325,57 @@ JNIEXPORT jint JNICALL Java_ai_rapids_cudf_ColumnView_getNativeNumChildren(JNIEn
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getChildCvPointer(JNIEnv *env,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getChildCvPointer(JNIEnv* env,
                                                                          jobject j_object,
                                                                          jlong handle,
-                                                                         jint child_index) {
+                                                                         jint child_index)
+{
   JNI_NULL_CHECK(env, handle, "native handle is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column_view *column = reinterpret_cast<cudf::column_view *>(handle);
-    auto const is_list = column->type().id() == cudf::type_id::LIST;
-    auto const child = column->child(child_index + (is_list ? 1 : 0));
+    cudf::column_view* column = reinterpret_cast<cudf::column_view*>(handle);
+    auto const is_list        = column->type().id() == cudf::type_id::LIST;
+    auto const child          = column->child(child_index + (is_list ? 1 : 0));
     return ptr_as_jlong(new cudf::column_view(child));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getListOffsetCvPointer(JNIEnv *env,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getListOffsetCvPointer(JNIEnv* env,
                                                                               jobject j_object,
-                                                                              jlong handle) {
+                                                                              jlong handle)
+{
   JNI_NULL_CHECK(env, handle, "native handle is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column_view *column = reinterpret_cast<cudf::column_view *>(handle);
-    cudf::lists_column_view view = cudf::lists_column_view(*column);
+    cudf::column_view* column      = reinterpret_cast<cudf::column_view*>(handle);
+    cudf::lists_column_view view   = cudf::lists_column_view(*column);
     cudf::column_view offsets_view = view.offsets();
     return ptr_as_jlong(new cudf::column_view(offsets_view));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getNativeOffsetsAddress(JNIEnv *env, jclass,
-                                                                               jlong handle) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getNativeOffsetsAddress(JNIEnv* env,
+                                                                               jclass,
+                                                                               jlong handle)
+{
   JNI_NULL_CHECK(env, handle, "native handle is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    jlong result = 0;
-    cudf::column_view *column = reinterpret_cast<cudf::column_view *>(handle);
+    jlong result              = 0;
+    cudf::column_view* column = reinterpret_cast<cudf::column_view*>(handle);
     if (column->type().id() == cudf::type_id::STRING) {
       if (column->size() > 0) {
         cudf::strings_column_view view = cudf::strings_column_view(*column);
         cudf::column_view offsets_view = view.offsets();
-        result = ptr_as_jlong(offsets_view.data<char>());
+        result                         = ptr_as_jlong(offsets_view.data<char>());
       }
     } else if (column->type().id() == cudf::type_id::LIST) {
       if (column->size() > 0) {
-        cudf::lists_column_view view = cudf::lists_column_view(*column);
+        cudf::lists_column_view view   = cudf::lists_column_view(*column);
         cudf::column_view offsets_view = view.offsets();
-        result = ptr_as_jlong(offsets_view.data<char>());
+        result                         = ptr_as_jlong(offsets_view.data<char>());
       }
     }
     return result;
@@ -2187,24 +2383,26 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getNativeOffsetsAddress(J
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getNativeOffsetsLength(JNIEnv *env, jclass,
-                                                                              jlong handle) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getNativeOffsetsLength(JNIEnv* env,
+                                                                              jclass,
+                                                                              jlong handle)
+{
   JNI_NULL_CHECK(env, handle, "native handle is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    jlong result = 0;
-    cudf::column_view *column = reinterpret_cast<cudf::column_view *>(handle);
+    jlong result              = 0;
+    cudf::column_view* column = reinterpret_cast<cudf::column_view*>(handle);
     if (column->type().id() == cudf::type_id::STRING) {
       if (column->size() > 0) {
         cudf::strings_column_view view = cudf::strings_column_view(*column);
         cudf::column_view offsets_view = view.offsets();
-        result = sizeof(int) * offsets_view.size();
+        result                         = sizeof(int) * offsets_view.size();
       }
     } else if (column->type().id() == cudf::type_id::LIST) {
       if (column->size() > 0) {
-        cudf::lists_column_view view = cudf::lists_column_view(*column);
+        cudf::lists_column_view view   = cudf::lists_column_view(*column);
         cudf::column_view offsets_view = view.offsets();
-        result = sizeof(int) * offsets_view.size();
+        result                         = sizeof(int) * offsets_view.size();
       }
     }
     return result;
@@ -2212,24 +2410,28 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getNativeOffsetsLength(JN
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getNativeValidityAddress(JNIEnv *env, jclass,
-                                                                                jlong handle) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getNativeValidityAddress(JNIEnv* env,
+                                                                                jclass,
+                                                                                jlong handle)
+{
   JNI_NULL_CHECK(env, handle, "native handle is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column_view *column = reinterpret_cast<cudf::column_view *>(handle);
+    cudf::column_view* column = reinterpret_cast<cudf::column_view*>(handle);
     return ptr_as_jlong(column->null_mask());
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getNativeValidityLength(JNIEnv *env, jclass,
-                                                                               jlong handle) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getNativeValidityLength(JNIEnv* env,
+                                                                               jclass,
+                                                                               jlong handle)
+{
   JNI_NULL_CHECK(env, handle, "native handle is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column_view *column = reinterpret_cast<cudf::column_view *>(handle);
-    jlong result = 0;
+    cudf::column_view* column = reinterpret_cast<cudf::column_view*>(handle);
+    jlong result              = 0;
     if (column->null_mask() != nullptr) {
       result = cudf::bitmask_allocation_size_bytes(column->size());
     }
@@ -2238,28 +2440,33 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getNativeValidityLength(J
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getDeviceMemorySize(JNIEnv *env, jclass,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getDeviceMemorySize(JNIEnv* env,
+                                                                           jclass,
                                                                            jlong handle,
-                                                                           jboolean pad_for_cpu) {
+                                                                           jboolean pad_for_cpu)
+{
   JNI_NULL_CHECK(env, handle, "native handle is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto view = reinterpret_cast<cudf::column_view const *>(handle);
+    auto view = reinterpret_cast<cudf::column_view const*>(handle);
     return calc_device_memory_size(*view, pad_for_cpu);
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_hostPaddingSizeInBytes(JNIEnv *env, jclass) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_hostPaddingSizeInBytes(JNIEnv* env, jclass)
+{
   return sizeof(std::max_align_t);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_clamper(JNIEnv *env, jobject j_object,
-                                                               jlong handle, jlong j_lo_scalar,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_clamper(JNIEnv* env,
+                                                               jobject j_object,
+                                                               jlong handle,
+                                                               jlong j_lo_scalar,
                                                                jlong j_lo_replace_scalar,
                                                                jlong j_hi_scalar,
-                                                               jlong j_hi_replace_scalar) {
-
+                                                               jlong j_hi_replace_scalar)
+{
   JNI_NULL_CHECK(env, handle, "native view handle is null", 0)
   JNI_NULL_CHECK(env, j_lo_scalar, "lo scalar is null", 0)
   JNI_NULL_CHECK(env, j_lo_replace_scalar, "lo scalar replace value is null", 0)
@@ -2268,96 +2475,103 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_clamper(JNIEnv *env, jobj
   using cudf::clamp;
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column_view *column_view = reinterpret_cast<cudf::column_view *>(handle);
-    cudf::scalar *lo_scalar = reinterpret_cast<cudf::scalar *>(j_lo_scalar);
-    cudf::scalar *lo_replace_scalar = reinterpret_cast<cudf::scalar *>(j_lo_replace_scalar);
-    cudf::scalar *hi_scalar = reinterpret_cast<cudf::scalar *>(j_hi_scalar);
-    cudf::scalar *hi_replace_scalar = reinterpret_cast<cudf::scalar *>(j_hi_replace_scalar);
+    cudf::column_view* column_view  = reinterpret_cast<cudf::column_view*>(handle);
+    cudf::scalar* lo_scalar         = reinterpret_cast<cudf::scalar*>(j_lo_scalar);
+    cudf::scalar* lo_replace_scalar = reinterpret_cast<cudf::scalar*>(j_lo_replace_scalar);
+    cudf::scalar* hi_scalar         = reinterpret_cast<cudf::scalar*>(j_hi_scalar);
+    cudf::scalar* hi_replace_scalar = reinterpret_cast<cudf::scalar*>(j_hi_replace_scalar);
 
     return release_as_jlong(
-        clamp(*column_view, *lo_scalar, *lo_replace_scalar, *hi_scalar, *hi_replace_scalar));
+      clamp(*column_view, *lo_scalar, *lo_replace_scalar, *hi_scalar, *hi_replace_scalar));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_title(JNIEnv *env, jobject j_object,
-                                                             jlong handle) {
-
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_title(JNIEnv* env,
+                                                             jobject j_object,
+                                                             jlong handle)
+{
   JNI_NULL_CHECK(env, handle, "native view handle is null", 0)
 
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column_view *view = reinterpret_cast<cudf::column_view *>(handle);
+    cudf::column_view* view = reinterpret_cast<cudf::column_view*>(handle);
     return release_as_jlong(cudf::strings::title(*view));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_capitalize(JNIEnv *env, jobject j_object,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_capitalize(JNIEnv* env,
+                                                                  jobject j_object,
                                                                   jlong strs_handle,
-                                                                  jlong delimiters_handle) {
-
+                                                                  jlong delimiters_handle)
+{
   JNI_NULL_CHECK(env, strs_handle, "native view handle is null", 0)
   JNI_NULL_CHECK(env, delimiters_handle, "delimiters scalar handle is null", 0)
 
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column_view *view = reinterpret_cast<cudf::column_view *>(strs_handle);
-    cudf::string_scalar *deli = reinterpret_cast<cudf::string_scalar *>(delimiters_handle);
+    cudf::column_view* view   = reinterpret_cast<cudf::column_view*>(strs_handle);
+    cudf::string_scalar* deli = reinterpret_cast<cudf::string_scalar*>(delimiters_handle);
     return release_as_jlong(cudf::strings::capitalize(*view, *deli));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_joinStrings(JNIEnv *env, jobject j_object,
-                                                                   jlong strs_handle,
-                                                                   jlong separator_handle,
-                                                                   jlong narep_handle) {
-
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_joinStrings(
+  JNIEnv* env, jobject j_object, jlong strs_handle, jlong separator_handle, jlong narep_handle)
+{
   JNI_NULL_CHECK(env, strs_handle, "native view handle is null", 0)
   JNI_NULL_CHECK(env, separator_handle, "separator scalar handle is null", 0)
   JNI_NULL_CHECK(env, narep_handle, "narep scalar handle is null", 0)
 
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column_view *view = reinterpret_cast<cudf::column_view *>(strs_handle);
-    cudf::string_scalar *sep = reinterpret_cast<cudf::string_scalar *>(separator_handle);
-    cudf::string_scalar *narep = reinterpret_cast<cudf::string_scalar *>(narep_handle);
+    cudf::column_view* view    = reinterpret_cast<cudf::column_view*>(strs_handle);
+    cudf::string_scalar* sep   = reinterpret_cast<cudf::string_scalar*>(separator_handle);
+    cudf::string_scalar* narep = reinterpret_cast<cudf::string_scalar*>(narep_handle);
     return release_as_jlong(cudf::strings::join_strings(*view, *sep, *narep));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_makeStructView(JNIEnv *env, jobject j_object,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_makeStructView(JNIEnv* env,
+                                                                      jobject j_object,
                                                                       jlongArray handles,
-                                                                      jlong row_count) {
-
+                                                                      jlong row_count)
+{
   JNI_NULL_CHECK(env, handles, "native view handles are null", 0)
   try {
     cudf::jni::auto_set_device(env);
-    auto children = cudf::jni::native_jpointerArray<cudf::column_view>{env, handles};
+    auto children        = cudf::jni::native_jpointerArray<cudf::column_view>{env, handles};
     auto children_vector = children.get_dereferenced();
-    return ptr_as_jlong(new cudf::column_view(cudf::data_type{cudf::type_id::STRUCT}, row_count,
-                                              nullptr, nullptr, 0, 0, children_vector));
+    return ptr_as_jlong(new cudf::column_view(
+      cudf::data_type{cudf::type_id::STRUCT}, row_count, nullptr, nullptr, 0, 0, children_vector));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_nansToNulls(JNIEnv *env, jobject j_object,
-                                                                   jlong handle) {
-
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_nansToNulls(JNIEnv* env,
+                                                                   jobject j_object,
+                                                                   jlong handle)
+{
   JNI_NULL_CHECK(env, handle, "native view handle is null", 0)
 
   try {
     cudf::jni::auto_set_device(env);
-    auto const input = *reinterpret_cast<cudf::column_view *>(handle);
+    auto const input = *reinterpret_cast<cudf::column_view*>(handle);
     // get a new null mask by setting all the nans to null
     auto [new_nullmask, new_null_count] = cudf::nans_to_nulls(input);
     // create a column_view which is a no-copy wrapper around the original column without the null
     // mask
-    auto const input_without_nullmask = cudf::column_view(
-        input.type(), input.size(), input.head<void>(), nullptr, 0, input.offset(),
-        std::vector<cudf::column_view>{input.child_begin(), input.child_end()});
+    auto const input_without_nullmask =
+      cudf::column_view(input.type(),
+                        input.size(),
+                        input.head<void>(),
+                        nullptr,
+                        0,
+                        input.offset(),
+                        std::vector<cudf::column_view>{input.child_begin(), input.child_end()});
     // create a column by deep copying `input_without_nullmask`.
     auto deep_copy = std::make_unique<cudf::column>(input_without_nullmask);
     deep_copy->set_null_mask(std::move(*new_nullmask), new_null_count);
@@ -2366,99 +2580,106 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_nansToNulls(JNIEnv *env,
   CATCH_STD(env, 0)
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_isFloat(JNIEnv *env, jobject j_object,
-                                                               jlong handle) {
-
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_isFloat(JNIEnv* env,
+                                                               jobject j_object,
+                                                               jlong handle)
+{
   JNI_NULL_CHECK(env, handle, "native view handle is null", 0)
 
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column_view *view = reinterpret_cast<cudf::column_view *>(handle);
+    cudf::column_view* view = reinterpret_cast<cudf::column_view*>(handle);
     return release_as_jlong(cudf::strings::is_float(*view));
   }
   CATCH_STD(env, 0)
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_isInteger(JNIEnv *env, jobject j_object,
-                                                                 jlong handle) {
-
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_isInteger(JNIEnv* env,
+                                                                 jobject j_object,
+                                                                 jlong handle)
+{
   JNI_NULL_CHECK(env, handle, "native view handle is null", 0)
 
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column_view *view = reinterpret_cast<cudf::column_view *>(handle);
+    cudf::column_view* view = reinterpret_cast<cudf::column_view*>(handle);
     return release_as_jlong(cudf::strings::is_integer(*view));
   }
   CATCH_STD(env, 0)
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_isFixedPoint(JNIEnv *env, jobject,
-                                                                    jlong handle, jint j_dtype,
-                                                                    jint scale) {
-
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_isFixedPoint(
+  JNIEnv* env, jobject, jlong handle, jint j_dtype, jint scale)
+{
   JNI_NULL_CHECK(env, handle, "native view handle is null", 0)
 
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column_view *view = reinterpret_cast<cudf::column_view *>(handle);
+    cudf::column_view* view  = reinterpret_cast<cudf::column_view*>(handle);
     cudf::data_type fp_dtype = cudf::jni::make_data_type(j_dtype, scale);
     return release_as_jlong(cudf::strings::is_fixed_point(*view, fp_dtype));
   }
   CATCH_STD(env, 0)
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_isIntegerWithType(JNIEnv *env, jobject,
-                                                                         jlong handle, jint j_dtype,
-                                                                         jint scale) {
-
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_isIntegerWithType(
+  JNIEnv* env, jobject, jlong handle, jint j_dtype, jint scale)
+{
   JNI_NULL_CHECK(env, handle, "native view handle is null", 0)
 
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column_view *view = reinterpret_cast<cudf::column_view *>(handle);
+    cudf::column_view* view   = reinterpret_cast<cudf::column_view*>(handle);
     cudf::data_type int_dtype = cudf::jni::make_data_type(j_dtype, scale);
     return release_as_jlong(cudf::strings::is_integer(*view, int_dtype));
   }
   CATCH_STD(env, 0)
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_copyColumnViewToCV(JNIEnv *env,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_copyColumnViewToCV(JNIEnv* env,
                                                                           jobject j_object,
-                                                                          jlong handle) {
-
+                                                                          jlong handle)
+{
   JNI_NULL_CHECK(env, handle, "native view handle is null", 0)
 
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column_view *view = reinterpret_cast<cudf::column_view *>(handle);
+    cudf::column_view* view = reinterpret_cast<cudf::column_view*>(handle);
     return ptr_as_jlong(new cudf::column(*view));
   }
   CATCH_STD(env, 0)
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getJSONObject(
-    JNIEnv *env, jclass, jlong j_view_handle, jlong j_scalar_handle, jboolean allow_single_quotes,
-    jboolean strip_quotes_from_single_strings, jboolean missing_fields_as_nulls) {
-
+JNIEXPORT jlong JNICALL
+Java_ai_rapids_cudf_ColumnView_getJSONObject(JNIEnv* env,
+                                             jclass,
+                                             jlong j_view_handle,
+                                             jlong j_scalar_handle,
+                                             jboolean allow_single_quotes,
+                                             jboolean strip_quotes_from_single_strings,
+                                             jboolean missing_fields_as_nulls)
+{
   JNI_NULL_CHECK(env, j_view_handle, "view cannot be null", 0);
   JNI_NULL_CHECK(env, j_scalar_handle, "path cannot be null", 0);
 
   try {
     cudf::jni::auto_set_device(env);
-    cudf::column_view *n_column_view = reinterpret_cast<cudf::column_view *>(j_view_handle);
+    cudf::column_view* n_column_view = reinterpret_cast<cudf::column_view*>(j_view_handle);
     cudf::strings_column_view n_strings_col_view(*n_column_view);
-    cudf::string_scalar *n_scalar_path = reinterpret_cast<cudf::string_scalar *>(j_scalar_handle);
-    auto options = cudf::get_json_object_options{};
+    cudf::string_scalar* n_scalar_path = reinterpret_cast<cudf::string_scalar*>(j_scalar_handle);
+    auto options                       = cudf::get_json_object_options{};
     options.set_allow_single_quotes(allow_single_quotes);
     options.set_strip_quotes_from_single_strings(strip_quotes_from_single_strings);
     options.set_missing_fields_as_nulls(missing_fields_as_nulls);
     auto result_col_ptr = [&]() {
       try {
         return cudf::get_json_object(n_strings_col_view, *n_scalar_path, options);
-      } catch (std::invalid_argument const &err) {
+      } catch (std::invalid_argument const& err) {
         auto const null_scalar = cudf::string_scalar(std::string(""), false);
         return cudf::make_column_from_scalar(null_scalar, n_strings_col_view.size());
-      } catch (...) { throw; }
+      } catch (...) {
+        throw;
+      }
     }();
     return release_as_jlong(result_col_ptr);
   }
@@ -2466,64 +2687,82 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getJSONObject(
 }
 
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringConcatenationListElementsSepCol(
-    JNIEnv *env, jclass, jlong column_handle, jlong sep_handle, jlong separator_narep,
-    jlong col_narep, jboolean separate_nulls, jboolean empty_string_output_if_empty_list) {
+  JNIEnv* env,
+  jclass,
+  jlong column_handle,
+  jlong sep_handle,
+  jlong separator_narep,
+  jlong col_narep,
+  jboolean separate_nulls,
+  jboolean empty_string_output_if_empty_list)
+{
   JNI_NULL_CHECK(env, column_handle, "column handle is null", 0);
   JNI_NULL_CHECK(env, sep_handle, "separator column handle is null", 0);
   JNI_NULL_CHECK(env, separator_narep, "separator narep string scalar object is null", 0);
   JNI_NULL_CHECK(env, col_narep, "column narep string scalar object is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    const auto &separator_narep_scalar = *reinterpret_cast<cudf::string_scalar *>(separator_narep);
-    const auto &col_narep_scalar = *reinterpret_cast<cudf::string_scalar *>(col_narep);
-    auto null_policy = separate_nulls ? cudf::strings::separator_on_nulls::YES :
-                                        cudf::strings::separator_on_nulls::NO;
-    auto empty_list_output = empty_string_output_if_empty_list ?
-                                 cudf::strings::output_if_empty_list::EMPTY_STRING :
-                                 cudf::strings::output_if_empty_list::NULL_ELEMENT;
+    const auto& separator_narep_scalar = *reinterpret_cast<cudf::string_scalar*>(separator_narep);
+    const auto& col_narep_scalar       = *reinterpret_cast<cudf::string_scalar*>(col_narep);
+    auto null_policy                   = separate_nulls ? cudf::strings::separator_on_nulls::YES
+                                                        : cudf::strings::separator_on_nulls::NO;
+    auto empty_list_output             = empty_string_output_if_empty_list
+                                           ? cudf::strings::output_if_empty_list::EMPTY_STRING
+                                           : cudf::strings::output_if_empty_list::NULL_ELEMENT;
 
-    cudf::column_view *column = reinterpret_cast<cudf::column_view *>(sep_handle);
+    cudf::column_view* column = reinterpret_cast<cudf::column_view*>(sep_handle);
     cudf::strings_column_view strings_column(*column);
-    cudf::column_view *cv = reinterpret_cast<cudf::column_view *>(column_handle);
+    cudf::column_view* cv = reinterpret_cast<cudf::column_view*>(column_handle);
     cudf::lists_column_view lcv(*cv);
-    return release_as_jlong(
-        cudf::strings::join_list_elements(lcv, strings_column, separator_narep_scalar,
-                                          col_narep_scalar, null_policy, empty_list_output));
+    return release_as_jlong(cudf::strings::join_list_elements(lcv,
+                                                              strings_column,
+                                                              separator_narep_scalar,
+                                                              col_narep_scalar,
+                                                              null_policy,
+                                                              empty_list_output));
   }
   CATCH_STD(env, 0);
 }
 
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringConcatenationListElements(
-    JNIEnv *env, jclass, jlong column_handle, jlong separator, jlong narep, jboolean separate_nulls,
-    jboolean empty_string_output_if_empty_list) {
+  JNIEnv* env,
+  jclass,
+  jlong column_handle,
+  jlong separator,
+  jlong narep,
+  jboolean separate_nulls,
+  jboolean empty_string_output_if_empty_list)
+{
   JNI_NULL_CHECK(env, column_handle, "column handle is null", 0);
   JNI_NULL_CHECK(env, separator, "separator string scalar object is null", 0);
   JNI_NULL_CHECK(env, narep, "separator narep string scalar object is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    const auto &separator_scalar = *reinterpret_cast<cudf::string_scalar *>(separator);
-    const auto &narep_scalar = *reinterpret_cast<cudf::string_scalar *>(narep);
-    auto null_policy = separate_nulls ? cudf::strings::separator_on_nulls::YES :
-                                        cudf::strings::separator_on_nulls::NO;
-    auto empty_list_output = empty_string_output_if_empty_list ?
-                                 cudf::strings::output_if_empty_list::EMPTY_STRING :
-                                 cudf::strings::output_if_empty_list::NULL_ELEMENT;
+    const auto& separator_scalar = *reinterpret_cast<cudf::string_scalar*>(separator);
+    const auto& narep_scalar     = *reinterpret_cast<cudf::string_scalar*>(narep);
+    auto null_policy             = separate_nulls ? cudf::strings::separator_on_nulls::YES
+                                                  : cudf::strings::separator_on_nulls::NO;
+    auto empty_list_output       = empty_string_output_if_empty_list
+                                     ? cudf::strings::output_if_empty_list::EMPTY_STRING
+                                     : cudf::strings::output_if_empty_list::NULL_ELEMENT;
 
-    cudf::column_view *cv = reinterpret_cast<cudf::column_view *>(column_handle);
+    cudf::column_view* cv = reinterpret_cast<cudf::column_view*>(column_handle);
     cudf::lists_column_view lcv(*cv);
-    return release_as_jlong(cudf::strings::join_list_elements(lcv, separator_scalar, narep_scalar,
-                                                              null_policy, empty_list_output));
+    return release_as_jlong(cudf::strings::join_list_elements(
+      lcv, separator_scalar, narep_scalar, null_policy, empty_list_output));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_repeatStrings(JNIEnv *env, jclass,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_repeatStrings(JNIEnv* env,
+                                                                     jclass,
                                                                      jlong strings_handle,
-                                                                     jint repeat_times) {
+                                                                     jint repeat_times)
+{
   JNI_NULL_CHECK(env, strings_handle, "strings_handle is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto const cv = *reinterpret_cast<cudf::column_view *>(strings_handle);
+    auto const cv       = *reinterpret_cast<cudf::column_view*>(strings_handle);
     auto const strs_col = cudf::strings_column_view(cv);
     return release_as_jlong(cudf::strings::repeat_strings(strs_col, repeat_times));
   }
@@ -2531,69 +2770,76 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_repeatStrings(JNIEnv *env
 }
 
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_repeatStringsWithColumnRepeatTimes(
-    JNIEnv *env, jclass, jlong strings_handle, jlong repeat_times_handle) {
+  JNIEnv* env, jclass, jlong strings_handle, jlong repeat_times_handle)
+{
   JNI_NULL_CHECK(env, strings_handle, "strings_handle is null", 0);
   JNI_NULL_CHECK(env, repeat_times_handle, "repeat_times_handle is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto const strings_cv = *reinterpret_cast<cudf::column_view *>(strings_handle);
-    auto const strs_col = cudf::strings_column_view(strings_cv);
-    auto const repeat_times_cv = *reinterpret_cast<cudf::column_view *>(repeat_times_handle);
+    auto const strings_cv      = *reinterpret_cast<cudf::column_view*>(strings_handle);
+    auto const strs_col        = cudf::strings_column_view(strings_cv);
+    auto const repeat_times_cv = *reinterpret_cast<cudf::column_view*>(repeat_times_handle);
     return release_as_jlong(cudf::strings::repeat_strings(strs_col, repeat_times_cv));
   }
   CATCH_STD(env, 0);
 }
 
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_applyBooleanMask(
-    JNIEnv *env, jclass, jlong list_column_handle, jlong boolean_mask_list_column_handle) {
+  JNIEnv* env, jclass, jlong list_column_handle, jlong boolean_mask_list_column_handle)
+{
   JNI_NULL_CHECK(env, list_column_handle, "list handle is null", 0);
   JNI_NULL_CHECK(env, boolean_mask_list_column_handle, "boolean mask handle is null", 0);
   try {
     cudf::jni::auto_set_device(env);
 
-    cudf::column_view const *list_column =
-        reinterpret_cast<cudf::column_view const *>(list_column_handle);
+    cudf::column_view const* list_column =
+      reinterpret_cast<cudf::column_view const*>(list_column_handle);
     cudf::lists_column_view const list_view = cudf::lists_column_view(*list_column);
 
-    cudf::column_view const *boolean_mask_list_column =
-        reinterpret_cast<cudf::column_view const *>(boolean_mask_list_column_handle);
+    cudf::column_view const* boolean_mask_list_column =
+      reinterpret_cast<cudf::column_view const*>(boolean_mask_list_column_handle);
     cudf::lists_column_view const boolean_mask_list_view =
-        cudf::lists_column_view(*boolean_mask_list_column);
+      cudf::lists_column_view(*boolean_mask_list_column);
 
     return release_as_jlong(cudf::lists::apply_boolean_mask(list_view, boolean_mask_list_view));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jboolean JNICALL
-Java_ai_rapids_cudf_ColumnView_hasNonEmptyNulls(JNIEnv *env, jclass, jlong column_view_handle) {
+JNIEXPORT jboolean JNICALL Java_ai_rapids_cudf_ColumnView_hasNonEmptyNulls(JNIEnv* env,
+                                                                           jclass,
+                                                                           jlong column_view_handle)
+{
   JNI_NULL_CHECK(env, column_view_handle, "column_view handle is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto const *cv = reinterpret_cast<cudf::column_view const *>(column_view_handle);
+    auto const* cv = reinterpret_cast<cudf::column_view const*>(column_view_handle);
     return cudf::has_nonempty_nulls(*cv);
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL
-Java_ai_rapids_cudf_ColumnView_purgeNonEmptyNulls(JNIEnv *env, jclass, jlong column_view_handle) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_purgeNonEmptyNulls(JNIEnv* env,
+                                                                          jclass,
+                                                                          jlong column_view_handle)
+{
   JNI_NULL_CHECK(env, column_view_handle, "column_view handle is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto const *cv = reinterpret_cast<cudf::column_view const *>(column_view_handle);
+    auto const* cv = reinterpret_cast<cudf::column_view const*>(column_view_handle);
     return release_as_jlong(cudf::purge_nonempty_nulls(*cv));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_toHex(JNIEnv *env, jclass, jlong input_ptr) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_toHex(JNIEnv* env, jclass, jlong input_ptr)
+{
   JNI_NULL_CHECK(env, input_ptr, "input is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    const cudf::column_view *input = reinterpret_cast<cudf::column_view *>(input_ptr);
+    const cudf::column_view* input = reinterpret_cast<cudf::column_view*>(input_ptr);
     return release_as_jlong(cudf::strings::integers_to_hex(*input));
   }
   CATCH_STD(env, 0);
 }
-} // extern "C"
+}  // extern "C"
diff --git a/java/src/main/native/src/ColumnViewJni.cu b/java/src/main/native/src/ColumnViewJni.cu
index 56aea0b45e2..2dbff923544 100644
--- a/java/src/main/native/src/ColumnViewJni.cu
+++ b/java/src/main/native/src/ColumnViewJni.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include <vector>
+#include "ColumnViewJni.hpp"
 
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
@@ -29,59 +29,64 @@
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/span.hpp>
+
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
+
 #include <thrust/functional.h>
 #include <thrust/logical.h>
 #include <thrust/scan.h>
 #include <thrust/tabulate.h>
 
-#include "ColumnViewJni.hpp"
+#include <vector>
 
 namespace cudf::jni {
 
-std::unique_ptr<cudf::column>
-new_column_with_boolean_column_as_validity(cudf::column_view const &exemplar,
-                                           cudf::column_view const &validity_column) {
+std::unique_ptr<cudf::column> new_column_with_boolean_column_as_validity(
+  cudf::column_view const& exemplar, cudf::column_view const& validity_column)
+{
   CUDF_EXPECTS(validity_column.type().id() == type_id::BOOL8,
                "Validity column must be of type bool");
   CUDF_EXPECTS(validity_column.size() == exemplar.size(),
                "Exemplar and validity columns must have the same size");
 
   auto validity_device_view = cudf::column_device_view::create(validity_column);
-  auto validity_begin = cudf::detail::make_optional_iterator<bool>(
-      *validity_device_view, cudf::nullate::DYNAMIC{validity_column.has_nulls()});
-  auto validity_end = validity_begin + validity_device_view->size();
+  auto validity_begin       = cudf::detail::make_optional_iterator<bool>(
+    *validity_device_view, cudf::nullate::DYNAMIC{validity_column.has_nulls()});
+  auto validity_end            = validity_begin + validity_device_view->size();
   auto [null_mask, null_count] = cudf::detail::valid_if(
-      validity_begin, validity_end,
-      [] __device__(auto optional_bool) { return optional_bool.value_or(false); },
-      cudf::get_default_stream(), rmm::mr::get_current_device_resource());
-  auto const exemplar_without_null_mask = cudf::column_view{
-      exemplar.type(),
-      exemplar.size(),
-      exemplar.head<void>(),
-      nullptr,
-      0,
-      exemplar.offset(),
-      std::vector<cudf::column_view>{exemplar.child_begin(), exemplar.child_end()}};
+    validity_begin,
+    validity_end,
+    [] __device__(auto optional_bool) { return optional_bool.value_or(false); },
+    cudf::get_default_stream(),
+    rmm::mr::get_current_device_resource());
+  auto const exemplar_without_null_mask =
+    cudf::column_view{exemplar.type(),
+                      exemplar.size(),
+                      exemplar.head<void>(),
+                      nullptr,
+                      0,
+                      exemplar.offset(),
+                      std::vector<cudf::column_view>{exemplar.child_begin(), exemplar.child_end()}};
   auto deep_copy = std::make_unique<cudf::column>(exemplar_without_null_mask);
   deep_copy->set_null_mask(std::move(null_mask), null_count);
   return deep_copy;
 }
 
-std::unique_ptr<cudf::column> generate_list_offsets(cudf::column_view const &list_length,
-                                                    rmm::cuda_stream_view stream) {
+std::unique_ptr<cudf::column> generate_list_offsets(cudf::column_view const& list_length,
+                                                    rmm::cuda_stream_view stream)
+{
   CUDF_EXPECTS(list_length.type().id() == cudf::type_id::INT32,
                "Input column does not have type INT32.");
 
   auto const begin_iter = list_length.template begin<cudf::size_type>();
-  auto const end_iter = list_length.template end<cudf::size_type>();
+  auto const end_iter   = list_length.template end<cudf::size_type>();
 
-  auto offsets_column = make_numeric_column(data_type{type_id::INT32}, list_length.size() + 1,
-                                            mask_state::UNALLOCATED, stream);
+  auto offsets_column = make_numeric_column(
+    data_type{type_id::INT32}, list_length.size() + 1, mask_state::UNALLOCATED, stream);
   auto offsets_view = offsets_column->mutable_view();
-  auto d_offsets = offsets_view.template begin<int32_t>();
+  auto d_offsets    = offsets_view.template begin<int32_t>();
 
   thrust::inclusive_scan(rmm::exec_policy(stream), begin_iter, end_iter, d_offsets + 1);
   CUDF_CUDA_TRY(cudaMemsetAsync(d_offsets, 0, sizeof(int32_t), stream));
@@ -97,75 +102,82 @@ namespace {
  * @param list The input list.
  * @return The boolean result indicating if the input list has null elements.
  */
-__device__ bool list_has_nulls(list_device_view list) {
-  return thrust::any_of(thrust::seq, thrust::make_counting_iterator(0),
+__device__ bool list_has_nulls(list_device_view list)
+{
+  return thrust::any_of(thrust::seq,
+                        thrust::make_counting_iterator(0),
                         thrust::make_counting_iterator(list.size()),
                         [&list](auto const idx) { return list.is_null(idx); });
 }
 
-} // namespace
+}  // namespace
 
-void post_process_list_overlap(cudf::column_view const &lhs, cudf::column_view const &rhs,
-                               std::unique_ptr<cudf::column> const &overlap_result,
-                               rmm::cuda_stream_view stream) {
+void post_process_list_overlap(cudf::column_view const& lhs,
+                               cudf::column_view const& rhs,
+                               std::unique_ptr<cudf::column> const& overlap_result,
+                               rmm::cuda_stream_view stream)
+{
   // If both of the input columns do not have nulls, we don't need to do anything here.
   if (!lists_column_view{lhs}.child().has_nulls() && !lists_column_view{rhs}.child().has_nulls()) {
     return;
   }
 
-  auto const overlap_cv = overlap_result->view();
-  auto const lhs_cdv_ptr = column_device_view::create(lhs, stream);
-  auto const rhs_cdv_ptr = column_device_view::create(rhs, stream);
+  auto const overlap_cv      = overlap_result->view();
+  auto const lhs_cdv_ptr     = column_device_view::create(lhs, stream);
+  auto const rhs_cdv_ptr     = column_device_view::create(rhs, stream);
   auto const overlap_cdv_ptr = column_device_view::create(overlap_cv, stream);
 
   // Create a new bitmask to satisfy Spark's arrays_overlap's special behavior.
   auto validity = rmm::device_uvector<bool>(overlap_cv.size(), stream);
-  thrust::tabulate(rmm::exec_policy(stream), validity.begin(), validity.end(),
-                   [lhs = cudf::detail::lists_column_device_view{*lhs_cdv_ptr},
-                    rhs = cudf::detail::lists_column_device_view{*rhs_cdv_ptr},
-                    overlap_result = *overlap_cdv_ptr] __device__(auto const idx) {
-                     if (overlap_result.is_null(idx) ||
-                         overlap_result.template element<bool>(idx)) {
-                       return true;
-                     }
-
-                     // `lhs_list` and `rhs_list` should not be null, otherwise
-                     // `overlap_result[idx]` is null and that has been handled above.
-                     auto const lhs_list = list_device_view{lhs, idx};
-                     auto const rhs_list = list_device_view{rhs, idx};
-
-                     // Only proceed if both lists are non-empty.
-                     if (lhs_list.size() == 0 || rhs_list.size() == 0) {
-                       return true;
-                     }
-
-                     // Only proceed if at least one list has nulls.
-                     if (!list_has_nulls(lhs_list) && !list_has_nulls(rhs_list)) {
-                       return true;
-                     }
-
-                     // Here, the input lists satisfy all the conditions below so we output a
-                     // null:
-                     //  - Both of the input lists have no non-null common element, and
-                     //  - They are both non-empty, and
-                     //  - Either of them contains null elements.
-                     return false;
-                   });
+  thrust::tabulate(
+    rmm::exec_policy(stream),
+    validity.begin(),
+    validity.end(),
+    [lhs            = cudf::detail::lists_column_device_view{*lhs_cdv_ptr},
+     rhs            = cudf::detail::lists_column_device_view{*rhs_cdv_ptr},
+     overlap_result = *overlap_cdv_ptr] __device__(auto const idx) {
+      if (overlap_result.is_null(idx) || overlap_result.template element<bool>(idx)) {
+        return true;
+      }
+
+      // `lhs_list` and `rhs_list` should not be null, otherwise
+      // `overlap_result[idx]` is null and that has been handled above.
+      auto const lhs_list = list_device_view{lhs, idx};
+      auto const rhs_list = list_device_view{rhs, idx};
+
+      // Only proceed if both lists are non-empty.
+      if (lhs_list.size() == 0 || rhs_list.size() == 0) { return true; }
+
+      // Only proceed if at least one list has nulls.
+      if (!list_has_nulls(lhs_list) && !list_has_nulls(rhs_list)) { return true; }
+
+      // Here, the input lists satisfy all the conditions below so we output a
+      // null:
+      //  - Both of the input lists have no non-null common element, and
+      //  - They are both non-empty, and
+      //  - Either of them contains null elements.
+      return false;
+    });
 
   // Create a new nullmask from the validity data.
   auto [new_null_mask, new_null_count] =
-      cudf::detail::valid_if(validity.begin(), validity.end(), thrust::identity{},
-                             cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    cudf::detail::valid_if(validity.begin(),
+                           validity.end(),
+                           thrust::identity{},
+                           cudf::get_default_stream(),
+                           rmm::mr::get_current_device_resource());
 
   if (new_null_count > 0) {
     // If the `overlap_result` column is nullable, perform `bitmask_and` of its nullmask and the
     // new nullmask.
     if (overlap_cv.nullable()) {
       auto [null_mask, null_count] = cudf::detail::bitmask_and(
-          std::vector<bitmask_type const *>{
-              overlap_cv.null_mask(), static_cast<bitmask_type const *>(new_null_mask.data())},
-          std::vector<cudf::size_type>{0, 0}, overlap_cv.size(), stream,
-          rmm::mr::get_current_device_resource());
+        std::vector<bitmask_type const*>{overlap_cv.null_mask(),
+                                         static_cast<bitmask_type const*>(new_null_mask.data())},
+        std::vector<cudf::size_type>{0, 0},
+        overlap_cv.size(),
+        stream,
+        rmm::mr::get_current_device_resource());
       overlap_result->set_null_mask(std::move(null_mask), null_count);
     } else {
       // Just set the output nullmask as the new nullmask.
@@ -174,30 +186,32 @@ void post_process_list_overlap(cudf::column_view const &lhs, cudf::column_view c
   }
 }
 
-std::unique_ptr<cudf::column> lists_distinct_by_key(cudf::lists_column_view const &input,
-                                                    rmm::cuda_stream_view stream) {
-  if (input.is_empty()) {
-    return empty_like(input.parent());
-  }
+std::unique_ptr<cudf::column> lists_distinct_by_key(cudf::lists_column_view const& input,
+                                                    rmm::cuda_stream_view stream)
+{
+  if (input.is_empty()) { return empty_like(input.parent()); }
 
   auto const child = input.get_sliced_child(stream);
 
   // Generate labels for the input list elements.
   auto labels = rmm::device_uvector<cudf::size_type>(child.size(), stream);
-  cudf::detail::label_segments(input.offsets_begin(), input.offsets_end(), labels.begin(),
-                               labels.end(), stream);
+  cudf::detail::label_segments(
+    input.offsets_begin(), input.offsets_end(), labels.begin(), labels.end(), stream);
 
   // Use `cudf::duplicate_keep_option::KEEP_LAST` so this will produce the desired behavior when
   // being called in `create_map` in spark-rapids.
   // Other options comparing nulls and NaNs are set as all-equal.
-  auto out_columns =
-      cudf::detail::stable_distinct(
-          table_view{{column_view{cudf::device_span<cudf::size_type const>{labels}}, child.child(0),
-                      child.child(1)}}, // input table
-          std::vector<size_type>{0, 1}, // key columns
-          cudf::duplicate_keep_option::KEEP_LAST, cudf::null_equality::EQUAL,
-          cudf::nan_equality::ALL_EQUAL, stream, rmm::mr::get_current_device_resource())
-          ->release();
+  auto out_columns = cudf::detail::stable_distinct(
+                       table_view{{column_view{cudf::device_span<cudf::size_type const>{labels}},
+                                   child.child(0),
+                                   child.child(1)}},  // input table
+                       std::vector<size_type>{0, 1},  // key columns
+                       cudf::duplicate_keep_option::KEEP_LAST,
+                       cudf::null_equality::EQUAL,
+                       cudf::nan_equality::ALL_EQUAL,
+                       stream,
+                       rmm::mr::get_current_device_resource())
+                       ->release();
   auto const out_labels = out_columns.front()->view();
 
   // Assemble a structs column of <out_keys, out_vals>.
@@ -205,20 +219,26 @@ std::unique_ptr<cudf::column> lists_distinct_by_key(cudf::lists_column_view cons
   out_structs_members.emplace_back(std::move(out_columns[1]));
   out_structs_members.emplace_back(std::move(out_columns[2]));
   auto out_structs =
-      cudf::make_structs_column(out_labels.size(), std::move(out_structs_members), 0, {});
+    cudf::make_structs_column(out_labels.size(), std::move(out_structs_members), 0, {});
 
   // Assemble a lists column of structs<out_keys, out_vals>.
-  auto out_offsets = make_numeric_column(data_type{type_to_id<size_type>()}, input.size() + 1,
-                                         mask_state::UNALLOCATED, stream);
+  auto out_offsets = make_numeric_column(
+    data_type{type_to_id<size_type>()}, input.size() + 1, mask_state::UNALLOCATED, stream);
   auto const offsets_begin = out_offsets->mutable_view().template begin<size_type>();
-  auto const labels_begin = out_labels.template begin<size_type>();
-  cudf::detail::labels_to_offsets(labels_begin, labels_begin + out_labels.size(), offsets_begin,
-                                  offsets_begin + out_offsets->size(), stream);
+  auto const labels_begin  = out_labels.template begin<size_type>();
+  cudf::detail::labels_to_offsets(labels_begin,
+                                  labels_begin + out_labels.size(),
+                                  offsets_begin,
+                                  offsets_begin + out_offsets->size(),
+                                  stream);
 
   return cudf::make_lists_column(
-      input.size(), std::move(out_offsets), std::move(out_structs), input.null_count(),
-      cudf::detail::copy_bitmask(input.parent(), stream, rmm::mr::get_current_device_resource()),
-      stream);
+    input.size(),
+    std::move(out_offsets),
+    std::move(out_structs),
+    input.null_count(),
+    cudf::detail::copy_bitmask(input.parent(), stream, rmm::mr::get_current_device_resource()),
+    stream);
 }
 
-} // namespace cudf::jni
+}  // namespace cudf::jni
diff --git a/java/src/main/native/src/ColumnViewJni.hpp b/java/src/main/native/src/ColumnViewJni.hpp
index 12061119402..c9eef0139ea 100644
--- a/java/src/main/native/src/ColumnViewJni.hpp
+++ b/java/src/main/native/src/ColumnViewJni.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,6 +17,7 @@
 #include <cudf/column/column.hpp>
 #include <cudf/lists/lists_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+
 #include <rmm/cuda_stream_view.hpp>
 
 namespace cudf::jni {
@@ -34,9 +35,8 @@ namespace cudf::jni {
  * @param bool_column bool column whose value is to be used as the validity.
  * @return Deep copy of the exemplar, with the replaced validity.
  */
-std::unique_ptr<cudf::column>
-new_column_with_boolean_column_as_validity(cudf::column_view const &exemplar,
-                                           cudf::column_view const &bool_column);
+std::unique_ptr<cudf::column> new_column_with_boolean_column_as_validity(
+  cudf::column_view const& exemplar, cudf::column_view const& bool_column);
 
 /**
  * @brief Generates list offsets with lengths of each list.
@@ -49,9 +49,8 @@ new_column_with_boolean_column_as_validity(cudf::column_view const &exemplar,
  * @param list_length The column represents list lengths.
  * @return The column represents list offsets.
  */
-std::unique_ptr<cudf::column>
-generate_list_offsets(cudf::column_view const &list_length,
-                      rmm::cuda_stream_view stream = cudf::get_default_stream());
+std::unique_ptr<cudf::column> generate_list_offsets(
+  cudf::column_view const& list_length, rmm::cuda_stream_view stream = cudf::get_default_stream());
 
 /**
  * @brief Perform a special treatment for the results of `cudf::lists::have_overlap` to produce the
@@ -71,8 +70,9 @@ generate_list_offsets(cudf::column_view const &list_length,
  * @param rhs The input lists column for the other side.
  * @param overlap_result The result column generated by checking list overlap in cudf.
  */
-void post_process_list_overlap(cudf::column_view const &lhs, cudf::column_view const &rhs,
-                               std::unique_ptr<cudf::column> const &overlap_result,
+void post_process_list_overlap(cudf::column_view const& lhs,
+                               cudf::column_view const& rhs,
+                               std::unique_ptr<cudf::column> const& overlap_result,
                                rmm::cuda_stream_view stream = cudf::get_default_stream());
 
 /**
@@ -88,7 +88,7 @@ void post_process_list_overlap(cudf::column_view const &lhs, cudf::column_view c
  *
  * @return A new list columns in which the elements in each list are distinct by key.
  */
-std::unique_ptr<cudf::column> lists_distinct_by_key(cudf::lists_column_view const &input,
+std::unique_ptr<cudf::column> lists_distinct_by_key(cudf::lists_column_view const& input,
                                                     rmm::cuda_stream_view stream);
 
-} // namespace cudf::jni
+}  // namespace cudf::jni
diff --git a/java/src/main/native/src/CompiledExpression.cpp b/java/src/main/native/src/CompiledExpression.cpp
index 56c96b26200..339204b96e6 100644
--- a/java/src/main/native/src/CompiledExpression.cpp
+++ b/java/src/main/native/src/CompiledExpression.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,10 +14,8 @@
  * limitations under the License.
  */
 
-#include <cstdint>
-#include <memory>
-#include <stdexcept>
-#include <vector>
+#include "cudf_jni_apis.hpp"
+#include "jni_compiled_expr.hpp"
 
 #include <cudf/ast/expressions.hpp>
 #include <cudf/scalar/scalar.hpp>
@@ -25,56 +23,65 @@
 #include <cudf/transform.hpp>
 #include <cudf/types.hpp>
 
-#include "cudf_jni_apis.hpp"
-#include "jni_compiled_expr.hpp"
+#include <cstdint>
+#include <memory>
+#include <stdexcept>
+#include <vector>
 
 namespace {
 
 /** Utility class to read data from the serialized AST buffer generated from Java */
 class jni_serialized_ast {
-  jbyte const *data_ptr;      // pointer to the current entity to deserialize
-  jbyte const *const end_ptr; // pointer to the byte immediately after the AST serialized data
+  jbyte const* data_ptr;       // pointer to the current entity to deserialize
+  jbyte const* const end_ptr;  // pointer to the byte immediately after the AST serialized data
 
   /** Throws an error if there is insufficient space left to read the specified number of bytes */
-  void check_for_eof(std::size_t num_bytes_to_read) {
+  void check_for_eof(std::size_t num_bytes_to_read)
+  {
     if (data_ptr + num_bytes_to_read > end_ptr) {
       throw std::runtime_error("Unexpected end of serialized data");
     }
   }
 
-public:
-  jni_serialized_ast(cudf::jni::native_jbyteArray &jni_data)
-      : data_ptr(jni_data.begin()), end_ptr(jni_data.end()) {}
+ public:
+  jni_serialized_ast(cudf::jni::native_jbyteArray& jni_data)
+    : data_ptr(jni_data.begin()), end_ptr(jni_data.end())
+  {
+  }
 
   /** Returns true if there is no data remaining to be read */
   bool at_eof() { return data_ptr == end_ptr; }
 
   /** Read a byte from the serialized AST data buffer */
-  jbyte read_byte() {
+  jbyte read_byte()
+  {
     check_for_eof(sizeof(jbyte));
     return *data_ptr++;
   }
 
   /** Read a multi-byte value from the serialized AST data buffer */
-  template <typename T> T read() {
+  template <typename T>
+  T read()
+  {
     if constexpr (std::is_same_v<T, std::string>) {
       auto const size = read<cudf::size_type>();
       check_for_eof(size);
-      auto const result = std::string(reinterpret_cast<char const *>(data_ptr), size);
+      auto const result = std::string(reinterpret_cast<char const*>(data_ptr), size);
       data_ptr += size;
       return result;
     } else {
       check_for_eof(sizeof(T));
       // use memcpy since data may be misaligned
       T result;
-      memcpy(reinterpret_cast<jbyte *>(&result), data_ptr, sizeof(T));
+      memcpy(reinterpret_cast<jbyte*>(&result), data_ptr, sizeof(T));
       data_ptr += sizeof(T);
       return result;
     }
   }
 
   /** Decode a libcudf data type from the serialized AST data buffer */
-  cudf::data_type read_cudf_type() {
+  cudf::data_type read_cudf_type()
+  {
     auto const dtype_id = static_cast<cudf::type_id>(read_byte());
     switch (dtype_id) {
       case cudf::type_id::INT8:
@@ -116,10 +123,10 @@ class jni_serialized_ast {
  * NOTE: This must be kept in sync with the NodeType enumeration in AstNode.java!
  */
 enum class jni_serialized_expression_type : int8_t {
-  VALID_LITERAL = 0,
-  NULL_LITERAL = 1,
+  VALID_LITERAL    = 0,
+  NULL_LITERAL     = 1,
   COLUMN_REFERENCE = 2,
-  UNARY_OPERATION = 3,
+  UNARY_OPERATION  = 3,
   BINARY_OPERATION = 4
 };
 
@@ -128,7 +135,8 @@ enum class jni_serialized_expression_type : int8_t {
  * corresponding libcudf AST operator.
  * NOTE: This must be kept in sync with the enumeration in UnaryOperator.java!
  */
-cudf::ast::ast_operator jni_to_unary_operator(jbyte jni_op_value) {
+cudf::ast::ast_operator jni_to_unary_operator(jbyte jni_op_value)
+{
   switch (jni_op_value) {
     case 0: return cudf::ast::ast_operator::IDENTITY;
     case 1: return cudf::ast::ast_operator::IS_NULL;
@@ -166,7 +174,8 @@ cudf::ast::ast_operator jni_to_unary_operator(jbyte jni_op_value) {
  * corresponding libcudf AST operator.
  * NOTE: This must be kept in sync with the enumeration in BinaryOperator.java!
  */
-cudf::ast::ast_operator jni_to_binary_operator(jbyte jni_op_value) {
+cudf::ast::ast_operator jni_to_binary_operator(jbyte jni_op_value)
+{
   switch (jni_op_value) {
     case 0: return cudf::ast::ast_operator::ADD;
     case 1: return cudf::ast::ast_operator::SUB;
@@ -200,7 +209,8 @@ cudf::ast::ast_operator jni_to_binary_operator(jbyte jni_op_value) {
  * corresponding libcudf AST table reference.
  * NOTE: This must be kept in sync with the enumeration in TableReference.java!
  */
-cudf::ast::table_reference jni_to_table_reference(jbyte jni_value) {
+cudf::ast::table_reference jni_to_table_reference(jbyte jni_value)
+{
   switch (jni_value) {
     case 0: return cudf::ast::table_reference::LEFT;
     case 1: return cudf::ast::table_reference::RIGHT;
@@ -211,64 +221,72 @@ cudf::ast::table_reference jni_to_table_reference(jbyte jni_value) {
 /** Functor for type-dispatching the creation of an AST literal */
 struct make_literal {
   /** Construct an AST literal from a numeric value */
-  template <typename T, std::enable_if_t<cudf::is_numeric<T>()> * = nullptr>
-  cudf::ast::literal &operator()(cudf::data_type dtype, bool is_valid,
-                                 cudf::jni::ast::compiled_expr &compiled_expr,
-                                 jni_serialized_ast &jni_ast) {
+  template <typename T, std::enable_if_t<cudf::is_numeric<T>()>* = nullptr>
+  cudf::ast::literal& operator()(cudf::data_type dtype,
+                                 bool is_valid,
+                                 cudf::jni::ast::compiled_expr& compiled_expr,
+                                 jni_serialized_ast& jni_ast)
+  {
     std::unique_ptr<cudf::scalar> scalar_ptr = cudf::make_numeric_scalar(dtype);
     scalar_ptr->set_valid_async(is_valid);
     if (is_valid) {
-      T val = jni_ast.read<T>();
+      T val            = jni_ast.read<T>();
       using ScalarType = cudf::scalar_type_t<T>;
-      static_cast<ScalarType *>(scalar_ptr.get())->set_value(val);
+      static_cast<ScalarType*>(scalar_ptr.get())->set_value(val);
     }
 
-    auto &numeric_scalar = static_cast<cudf::numeric_scalar<T> &>(*scalar_ptr);
+    auto& numeric_scalar = static_cast<cudf::numeric_scalar<T>&>(*scalar_ptr);
     return compiled_expr.add_literal(std::make_unique<cudf::ast::literal>(numeric_scalar),
                                      std::move(scalar_ptr));
   }
 
   /** Construct an AST literal from a timestamp value */
-  template <typename T, std::enable_if_t<cudf::is_timestamp<T>()> * = nullptr>
-  cudf::ast::literal &operator()(cudf::data_type dtype, bool is_valid,
-                                 cudf::jni::ast::compiled_expr &compiled_expr,
-                                 jni_serialized_ast &jni_ast) {
+  template <typename T, std::enable_if_t<cudf::is_timestamp<T>()>* = nullptr>
+  cudf::ast::literal& operator()(cudf::data_type dtype,
+                                 bool is_valid,
+                                 cudf::jni::ast::compiled_expr& compiled_expr,
+                                 jni_serialized_ast& jni_ast)
+  {
     std::unique_ptr<cudf::scalar> scalar_ptr = cudf::make_timestamp_scalar(dtype);
     scalar_ptr->set_valid_async(is_valid);
     if (is_valid) {
-      T val = jni_ast.read<T>();
+      T val            = jni_ast.read<T>();
       using ScalarType = cudf::scalar_type_t<T>;
-      static_cast<ScalarType *>(scalar_ptr.get())->set_value(val);
+      static_cast<ScalarType*>(scalar_ptr.get())->set_value(val);
     }
 
-    auto &timestamp_scalar = static_cast<cudf::timestamp_scalar<T> &>(*scalar_ptr);
+    auto& timestamp_scalar = static_cast<cudf::timestamp_scalar<T>&>(*scalar_ptr);
     return compiled_expr.add_literal(std::make_unique<cudf::ast::literal>(timestamp_scalar),
                                      std::move(scalar_ptr));
   }
 
   /** Construct an AST literal from a duration value */
-  template <typename T, std::enable_if_t<cudf::is_duration<T>()> * = nullptr>
-  cudf::ast::literal &operator()(cudf::data_type dtype, bool is_valid,
-                                 cudf::jni::ast::compiled_expr &compiled_expr,
-                                 jni_serialized_ast &jni_ast) {
+  template <typename T, std::enable_if_t<cudf::is_duration<T>()>* = nullptr>
+  cudf::ast::literal& operator()(cudf::data_type dtype,
+                                 bool is_valid,
+                                 cudf::jni::ast::compiled_expr& compiled_expr,
+                                 jni_serialized_ast& jni_ast)
+  {
     std::unique_ptr<cudf::scalar> scalar_ptr = cudf::make_duration_scalar(dtype);
     scalar_ptr->set_valid_async(is_valid);
     if (is_valid) {
-      T val = jni_ast.read<T>();
+      T val            = jni_ast.read<T>();
       using ScalarType = cudf::scalar_type_t<T>;
-      static_cast<ScalarType *>(scalar_ptr.get())->set_value(val);
+      static_cast<ScalarType*>(scalar_ptr.get())->set_value(val);
     }
 
-    auto &duration_scalar = static_cast<cudf::duration_scalar<T> &>(*scalar_ptr);
+    auto& duration_scalar = static_cast<cudf::duration_scalar<T>&>(*scalar_ptr);
     return compiled_expr.add_literal(std::make_unique<cudf::ast::literal>(duration_scalar),
                                      std::move(scalar_ptr));
   }
 
   /** Construct an AST literal from a string value */
-  template <typename T, std::enable_if_t<std::is_same_v<T, cudf::string_view>> * = nullptr>
-  cudf::ast::literal &operator()(cudf::data_type dtype, bool is_valid,
-                                 cudf::jni::ast::compiled_expr &compiled_expr,
-                                 jni_serialized_ast &jni_ast) {
+  template <typename T, std::enable_if_t<std::is_same_v<T, cudf::string_view>>* = nullptr>
+  cudf::ast::literal& operator()(cudf::data_type dtype,
+                                 bool is_valid,
+                                 cudf::jni::ast::compiled_expr& compiled_expr,
+                                 jni_serialized_ast& jni_ast)
+  {
     std::unique_ptr<cudf::scalar> scalar_ptr = [&]() {
       if (is_valid) {
         std::string val = jni_ast.read<std::string>();
@@ -278,64 +296,73 @@ struct make_literal {
       }
     }();
 
-    auto &str_scalar = static_cast<cudf::string_scalar &>(*scalar_ptr);
+    auto& str_scalar = static_cast<cudf::string_scalar&>(*scalar_ptr);
     return compiled_expr.add_literal(std::make_unique<cudf::ast::literal>(str_scalar),
                                      std::move(scalar_ptr));
   }
 
   /** Default functor implementation to catch type dispatch errors */
-  template <typename T, std::enable_if_t<!cudf::is_numeric<T>() && !cudf::is_timestamp<T>() &&
-                                         !cudf::is_duration<T>() &&
-                                         !std::is_same_v<T, cudf::string_view>> * = nullptr>
-  cudf::ast::literal &operator()(cudf::data_type dtype, bool is_valid,
-                                 cudf::jni::ast::compiled_expr &compiled_expr,
-                                 jni_serialized_ast &jni_ast) {
+  template <
+    typename T,
+    std::enable_if_t<!cudf::is_numeric<T>() && !cudf::is_timestamp<T>() &&
+                     !cudf::is_duration<T>() && !std::is_same_v<T, cudf::string_view>>* = nullptr>
+  cudf::ast::literal& operator()(cudf::data_type dtype,
+                                 bool is_valid,
+                                 cudf::jni::ast::compiled_expr& compiled_expr,
+                                 jni_serialized_ast& jni_ast)
+  {
     throw std::logic_error("Unsupported AST literal type");
   }
 };
 
 /** Decode a serialized AST literal */
-cudf::ast::literal &compile_literal(bool is_valid, cudf::jni::ast::compiled_expr &compiled_expr,
-                                    jni_serialized_ast &jni_ast) {
+cudf::ast::literal& compile_literal(bool is_valid,
+                                    cudf::jni::ast::compiled_expr& compiled_expr,
+                                    jni_serialized_ast& jni_ast)
+{
   auto const dtype = jni_ast.read_cudf_type();
   return cudf::type_dispatcher(dtype, make_literal{}, dtype, is_valid, compiled_expr, jni_ast);
 }
 
 /** Decode a serialized AST column reference */
-cudf::ast::column_reference &compile_column_reference(cudf::jni::ast::compiled_expr &compiled_expr,
-                                                      jni_serialized_ast &jni_ast) {
-  auto const table_ref = jni_to_table_reference(jni_ast.read_byte());
+cudf::ast::column_reference& compile_column_reference(cudf::jni::ast::compiled_expr& compiled_expr,
+                                                      jni_serialized_ast& jni_ast)
+{
+  auto const table_ref               = jni_to_table_reference(jni_ast.read_byte());
   cudf::size_type const column_index = jni_ast.read<int>();
   return compiled_expr.add_column_ref(
-      std::make_unique<cudf::ast::column_reference>(column_index, table_ref));
+    std::make_unique<cudf::ast::column_reference>(column_index, table_ref));
 }
 
 // forward declaration
-cudf::ast::expression &compile_expression(cudf::jni::ast::compiled_expr &compiled_expr,
-                                          jni_serialized_ast &jni_ast);
+cudf::ast::expression& compile_expression(cudf::jni::ast::compiled_expr& compiled_expr,
+                                          jni_serialized_ast& jni_ast);
 
 /** Decode a serialized AST unary expression */
-cudf::ast::operation &compile_unary_expression(cudf::jni::ast::compiled_expr &compiled_expr,
-                                               jni_serialized_ast &jni_ast) {
-  auto const ast_op = jni_to_unary_operator(jni_ast.read_byte());
-  cudf::ast::expression &child_expression = compile_expression(compiled_expr, jni_ast);
+cudf::ast::operation& compile_unary_expression(cudf::jni::ast::compiled_expr& compiled_expr,
+                                               jni_serialized_ast& jni_ast)
+{
+  auto const ast_op                       = jni_to_unary_operator(jni_ast.read_byte());
+  cudf::ast::expression& child_expression = compile_expression(compiled_expr, jni_ast);
   return compiled_expr.add_operation(
-      std::make_unique<cudf::ast::operation>(ast_op, child_expression));
+    std::make_unique<cudf::ast::operation>(ast_op, child_expression));
 }
 
 /** Decode a serialized AST binary expression */
-cudf::ast::operation &compile_binary_expression(cudf::jni::ast::compiled_expr &compiled_expr,
-                                                jni_serialized_ast &jni_ast) {
-  auto const ast_op = jni_to_binary_operator(jni_ast.read_byte());
-  cudf::ast::expression &left_child = compile_expression(compiled_expr, jni_ast);
-  cudf::ast::expression &right_child = compile_expression(compiled_expr, jni_ast);
+cudf::ast::operation& compile_binary_expression(cudf::jni::ast::compiled_expr& compiled_expr,
+                                                jni_serialized_ast& jni_ast)
+{
+  auto const ast_op                  = jni_to_binary_operator(jni_ast.read_byte());
+  cudf::ast::expression& left_child  = compile_expression(compiled_expr, jni_ast);
+  cudf::ast::expression& right_child = compile_expression(compiled_expr, jni_ast);
   return compiled_expr.add_operation(
-      std::make_unique<cudf::ast::operation>(ast_op, left_child, right_child));
+    std::make_unique<cudf::ast::operation>(ast_op, left_child, right_child));
 }
 
 /** Decode a serialized AST expression by reading the expression type and dispatching */
-cudf::ast::expression &compile_expression(cudf::jni::ast::compiled_expr &compiled_expr,
-                                          jni_serialized_ast &jni_ast) {
+cudf::ast::expression& compile_expression(cudf::jni::ast::compiled_expr& compiled_expr,
+                                          jni_serialized_ast& jni_ast)
+{
   auto const expression_type = static_cast<jni_serialized_expression_type>(jni_ast.read_byte());
   switch (expression_type) {
     case jni_serialized_expression_type::VALID_LITERAL:
@@ -353,23 +380,24 @@ cudf::ast::expression &compile_expression(cudf::jni::ast::compiled_expr &compile
 }
 
 /** Decode a serialized AST into a native libcudf AST and associated resources */
-std::unique_ptr<cudf::jni::ast::compiled_expr> compile_serialized_ast(jni_serialized_ast &jni_ast) {
+std::unique_ptr<cudf::jni::ast::compiled_expr> compile_serialized_ast(jni_serialized_ast& jni_ast)
+{
   auto jni_expr_ptr = std::make_unique<cudf::jni::ast::compiled_expr>();
   (void)compile_expression(*jni_expr_ptr, jni_ast);
 
-  if (!jni_ast.at_eof()) {
-    throw std::invalid_argument("Extra bytes at end of serialized AST");
-  }
+  if (!jni_ast.at_eof()) { throw std::invalid_argument("Extra bytes at end of serialized AST"); }
 
   return jni_expr_ptr;
 }
 
-} // anonymous namespace
+}  // anonymous namespace
 
 extern "C" {
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ast_CompiledExpression_compile(JNIEnv *env, jclass,
-                                                                           jbyteArray jni_data) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ast_CompiledExpression_compile(JNIEnv* env,
+                                                                           jclass,
+                                                                           jbyteArray jni_data)
+{
   JNI_NULL_CHECK(env, jni_data, "Serialized AST data is null", 0);
   try {
     cudf::jni::auto_set_device(env);
@@ -382,31 +410,34 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ast_CompiledExpression_compile(JNIEn
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ast_CompiledExpression_computeColumn(JNIEnv *env,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ast_CompiledExpression_computeColumn(JNIEnv* env,
                                                                                  jclass,
                                                                                  jlong j_ast,
-                                                                                 jlong j_table) {
+                                                                                 jlong j_table)
+{
   JNI_NULL_CHECK(env, j_ast, "Compiled AST pointer is null", 0);
   JNI_NULL_CHECK(env, j_table, "Table view pointer is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto compiled_expr_ptr = reinterpret_cast<cudf::jni::ast::compiled_expr const *>(j_ast);
-    auto tview_ptr = reinterpret_cast<cudf::table_view const *>(j_table);
+    auto compiled_expr_ptr = reinterpret_cast<cudf::jni::ast::compiled_expr const*>(j_ast);
+    auto tview_ptr         = reinterpret_cast<cudf::table_view const*>(j_table);
     std::unique_ptr<cudf::column> result =
-        cudf::compute_column(*tview_ptr, compiled_expr_ptr->get_top_expression());
+      cudf::compute_column(*tview_ptr, compiled_expr_ptr->get_top_expression());
     return reinterpret_cast<jlong>(result.release());
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_ast_CompiledExpression_destroy(JNIEnv *env, jclass,
-                                                                          jlong jni_handle) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_ast_CompiledExpression_destroy(JNIEnv* env,
+                                                                          jclass,
+                                                                          jlong jni_handle)
+{
   try {
     cudf::jni::auto_set_device(env);
-    auto ptr = reinterpret_cast<cudf::jni::ast::compiled_expr *>(jni_handle);
+    auto ptr = reinterpret_cast<cudf::jni::ast::compiled_expr*>(jni_handle);
     delete ptr;
   }
   CATCH_STD(env, );
 }
 
-} // extern "C"
+}  // extern "C"
diff --git a/java/src/main/native/src/ContiguousTableJni.cpp b/java/src/main/native/src/ContiguousTableJni.cpp
index 8c99c77ca1f..19a10bf25ec 100644
--- a/java/src/main/native/src/ContiguousTableJni.cpp
+++ b/java/src/main/native/src/ContiguousTableJni.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,7 +18,7 @@
 
 namespace {
 
-#define CONTIGUOUS_TABLE_CLASS "ai/rapids/cudf/ContiguousTable"
+#define CONTIGUOUS_TABLE_CLASS                  "ai/rapids/cudf/ContiguousTable"
 #define CONTIGUOUS_TABLE_FACTORY_SIG(param_sig) "(" param_sig ")L" CONTIGUOUS_TABLE_CLASS ";"
 
 jclass Contiguous_table_jclass;
@@ -29,87 +29,85 @@ jclass Contig_split_group_by_result_jclass;
 jfieldID Contig_split_group_by_result_groups_field;
 jfieldID Contig_split_group_by_result_uniq_key_columns_field;
 
-} // anonymous namespace
+}  // anonymous namespace
 
 namespace cudf {
 namespace jni {
 
-bool cache_contiguous_table_jni(JNIEnv *env) {
+bool cache_contiguous_table_jni(JNIEnv* env)
+{
   jclass cls = env->FindClass(CONTIGUOUS_TABLE_CLASS);
-  if (cls == nullptr) {
-    return false;
-  }
+  if (cls == nullptr) { return false; }
 
   From_packed_table_method =
-      env->GetStaticMethodID(cls, "fromPackedTable", CONTIGUOUS_TABLE_FACTORY_SIG("JJJJJ"));
-  if (From_packed_table_method == nullptr) {
-    return false;
-  }
+    env->GetStaticMethodID(cls, "fromPackedTable", CONTIGUOUS_TABLE_FACTORY_SIG("JJJJJ"));
+  if (From_packed_table_method == nullptr) { return false; }
 
   // Convert local reference to global so it cannot be garbage collected.
   Contiguous_table_jclass = static_cast<jclass>(env->NewGlobalRef(cls));
-  if (Contiguous_table_jclass == nullptr) {
-    return false;
-  }
+  if (Contiguous_table_jclass == nullptr) { return false; }
   return true;
 }
 
-void release_contiguous_table_jni(JNIEnv *env) {
+void release_contiguous_table_jni(JNIEnv* env)
+{
   Contiguous_table_jclass = cudf::jni::del_global_ref(env, Contiguous_table_jclass);
 }
 
-bool cache_contig_split_group_by_result_jni(JNIEnv *env) {
+bool cache_contig_split_group_by_result_jni(JNIEnv* env)
+{
   jclass cls = env->FindClass(GROUP_BY_RESULT_CLASS);
-  if (cls == nullptr) {
-    return false;
-  }
+  if (cls == nullptr) { return false; }
 
   Contig_split_group_by_result_groups_field =
-      env->GetFieldID(cls, "groups", "[Lai/rapids/cudf/ContiguousTable;");
-  if (Contig_split_group_by_result_groups_field == nullptr) {
-    return false;
-  }
+    env->GetFieldID(cls, "groups", "[Lai/rapids/cudf/ContiguousTable;");
+  if (Contig_split_group_by_result_groups_field == nullptr) { return false; }
   Contig_split_group_by_result_uniq_key_columns_field =
-      env->GetFieldID(cls, "uniqKeyColumns", "[J");
-  if (Contig_split_group_by_result_uniq_key_columns_field == nullptr) {
-    return false;
-  }
+    env->GetFieldID(cls, "uniqKeyColumns", "[J");
+  if (Contig_split_group_by_result_uniq_key_columns_field == nullptr) { return false; }
 
   // Convert local reference to global so it cannot be garbage collected.
   Contig_split_group_by_result_jclass = static_cast<jclass>(env->NewGlobalRef(cls));
-  if (Contig_split_group_by_result_jclass == nullptr) {
-    return false;
-  }
+  if (Contig_split_group_by_result_jclass == nullptr) { return false; }
   return true;
 }
 
-void release_contig_split_group_by_result_jni(JNIEnv *env) {
+void release_contig_split_group_by_result_jni(JNIEnv* env)
+{
   Contig_split_group_by_result_jclass = del_global_ref(env, Contig_split_group_by_result_jclass);
 }
 
-jobject contig_split_group_by_result_from(JNIEnv *env, jobjectArray &groups) {
+jobject contig_split_group_by_result_from(JNIEnv* env, jobjectArray& groups)
+{
   jobject gbr = env->AllocObject(Contig_split_group_by_result_jclass);
   env->SetObjectField(gbr, Contig_split_group_by_result_groups_field, groups);
   return gbr;
 }
 
-jobject contig_split_group_by_result_from(JNIEnv *env, jobjectArray &groups,
-                                          jlongArray &uniq_key_columns) {
+jobject contig_split_group_by_result_from(JNIEnv* env,
+                                          jobjectArray& groups,
+                                          jlongArray& uniq_key_columns)
+{
   jobject gbr = env->AllocObject(Contig_split_group_by_result_jclass);
   env->SetObjectField(gbr, Contig_split_group_by_result_groups_field, groups);
   env->SetObjectField(gbr, Contig_split_group_by_result_uniq_key_columns_field, uniq_key_columns);
   return gbr;
 }
 
-jobject contiguous_table_from(JNIEnv *env, cudf::packed_columns &split, long row_count) {
-  jlong metadata_address = reinterpret_cast<jlong>(split.metadata.get());
-  jlong data_address = reinterpret_cast<jlong>(split.gpu_data->data());
-  jlong data_size = static_cast<jlong>(split.gpu_data->size());
+jobject contiguous_table_from(JNIEnv* env, cudf::packed_columns& split, long row_count)
+{
+  jlong metadata_address   = reinterpret_cast<jlong>(split.metadata.get());
+  jlong data_address       = reinterpret_cast<jlong>(split.gpu_data->data());
+  jlong data_size          = static_cast<jlong>(split.gpu_data->size());
   jlong rmm_buffer_address = reinterpret_cast<jlong>(split.gpu_data.get());
 
-  jobject contig_table_obj = env->CallStaticObjectMethod(
-      Contiguous_table_jclass, From_packed_table_method, metadata_address, data_address, data_size,
-      rmm_buffer_address, row_count);
+  jobject contig_table_obj = env->CallStaticObjectMethod(Contiguous_table_jclass,
+                                                         From_packed_table_method,
+                                                         metadata_address,
+                                                         data_address,
+                                                         data_size,
+                                                         rmm_buffer_address,
+                                                         row_count);
 
   if (contig_table_obj != nullptr) {
     split.metadata.release();
@@ -119,28 +117,30 @@ jobject contiguous_table_from(JNIEnv *env, cudf::packed_columns &split, long row
   return contig_table_obj;
 }
 
-native_jobjectArray<jobject> contiguous_table_array(JNIEnv *env, jsize length) {
+native_jobjectArray<jobject> contiguous_table_array(JNIEnv* env, jsize length)
+{
   return native_jobjectArray<jobject>(
-      env, env->NewObjectArray(length, Contiguous_table_jclass, nullptr));
+    env, env->NewObjectArray(length, Contiguous_table_jclass, nullptr));
 }
 
-} // namespace jni
-} // namespace cudf
+}  // namespace jni
+}  // namespace cudf
 
 extern "C" {
 
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ContiguousTable_createPackedMetadata(
-    JNIEnv *env, jclass, jlong j_table, jlong j_buffer_addr, jlong j_buffer_length) {
+  JNIEnv* env, jclass, jlong j_table, jlong j_buffer_addr, jlong j_buffer_length)
+{
   JNI_NULL_CHECK(env, j_table, "input table is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto table = reinterpret_cast<cudf::table_view const *>(j_table);
-    auto data_addr = reinterpret_cast<uint8_t const *>(j_buffer_addr);
-    auto data_size = static_cast<size_t>(j_buffer_length);
+    auto table        = reinterpret_cast<cudf::table_view const*>(j_table);
+    auto data_addr    = reinterpret_cast<uint8_t const*>(j_buffer_addr);
+    auto data_size    = static_cast<size_t>(j_buffer_length);
     auto metadata_ptr = new std::vector<uint8_t>(cudf::pack_metadata(*table, data_addr, data_size));
     return reinterpret_cast<jlong>(metadata_ptr);
   }
   CATCH_STD(env, 0);
 }
 
-} // extern "C"
+}  // extern "C"
diff --git a/java/src/main/native/src/CuFileJni.cpp b/java/src/main/native/src/CuFileJni.cpp
index ef165281bf9..382d0e6c9f7 100644
--- a/java/src/main/native/src/CuFileJni.cpp
+++ b/java/src/main/native/src/CuFileJni.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -13,18 +13,18 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include <cstring>
+#include "cudf_jni_apis.hpp"
+#include "jni_utils.hpp"
+
+#include <cudf/utilities/error.hpp>
 
 #include <cufile.h>
 #include <fcntl.h>
-#include <unistd.h>
-
-#include <cudf/utilities/error.hpp>
 #include <sys/stat.h>
 #include <sys/types.h>
+#include <unistd.h>
 
-#include "cudf_jni_apis.hpp"
-#include "jni_utils.hpp"
+#include <cstring>
 
 namespace {
 
@@ -34,10 +34,10 @@ namespace {
  * @param cu_result CUDA driver error code.
  * @return Description for the error.
  */
-char const *GetCuErrorString(CUresult cu_result) {
-  char const *description;
-  if (cuGetErrorName(cu_result, &description) != CUDA_SUCCESS)
-    description = "unknown cuda error";
+char const* GetCuErrorString(CUresult cu_result)
+{
+  char const* description;
+  if (cuGetErrorName(cu_result, &description) != CUDA_SUCCESS) description = "unknown cuda error";
   return description;
 }
 
@@ -49,9 +49,10 @@ char const *GetCuErrorString(CUresult cu_result) {
  * @param error_code Integer error code.
  * @return Description of the error.
  */
-std::string cuFileGetErrorString(int error_code) {
-  return IS_CUFILE_ERR(error_code) ? std::string(CUFILE_ERRSTR(error_code)) :
-                                     std::string(std::strerror(error_code));
+std::string cuFileGetErrorString(int error_code)
+{
+  return IS_CUFILE_ERR(error_code) ? std::string(CUFILE_ERRSTR(error_code))
+                                   : std::string(std::strerror(error_code));
 }
 
 /**
@@ -60,11 +61,10 @@ std::string cuFileGetErrorString(int error_code) {
  * @param status cuFile return status.
  * @return Description of the error.
  */
-std::string cuFileGetErrorString(CUfileError_t status) {
+std::string cuFileGetErrorString(CUfileError_t status)
+{
   std::string error = cuFileGetErrorString(status.err);
-  if (IS_CUDA_ERR(status)) {
-    error.append(".").append(GetCuErrorString(status.cu_err));
-  }
+  if (IS_CUDA_ERR(status)) { error.append(".").append(GetCuErrorString(status.cu_err)); }
   return error;
 }
 
@@ -72,9 +72,10 @@ std::string cuFileGetErrorString(CUfileError_t status) {
  * @brief RAII wrapper for the cuFile driver.
  */
 class cufile_driver {
-public:
+ public:
   /** @brief Construct a new driver instance by opening the cuFile driver. */
-  cufile_driver() {
+  cufile_driver()
+  {
     auto const status = cuFileDriverOpen();
     if (status.err != CU_FILE_SUCCESS) {
       CUDF_FAIL("Failed to initialize cuFile driver: " + cuFileGetErrorString(status));
@@ -82,8 +83,8 @@ class cufile_driver {
   }
 
   // Disable copy (and move) semantics.
-  cufile_driver(cufile_driver const &) = delete;
-  cufile_driver &operator=(cufile_driver const &) = delete;
+  cufile_driver(cufile_driver const&)            = delete;
+  cufile_driver& operator=(cufile_driver const&) = delete;
 
   /** @brief Destroy the driver instance by closing the cuFile driver. */
   ~cufile_driver() { cuFileDriverClose(); }
@@ -91,7 +92,7 @@ class cufile_driver {
 
 /** @brief RAII wrapper for a device buffer used by cuFile. */
 class cufile_buffer {
-public:
+ public:
   /**
    * @brief Construct a new cuFile buffer.
    *
@@ -100,8 +101,9 @@ class cufile_buffer {
    * @param register_buffer Whether to register the buffer with cuFile. This should only be set to
    * true if this buffer is being reused and is 4KiB aligned.
    */
-  cufile_buffer(void *device_pointer, std::size_t size, bool register_buffer = false)
-      : device_pointer_{device_pointer}, size_{size}, register_buffer_{register_buffer} {
+  cufile_buffer(void* device_pointer, std::size_t size, bool register_buffer = false)
+    : device_pointer_{device_pointer}, size_{size}, register_buffer_{register_buffer}
+  {
     if (register_buffer_) {
       auto const status = cuFileBufRegister(device_pointer_, size_, 0);
       if (status.err != CU_FILE_SUCCESS) {
@@ -111,14 +113,13 @@ class cufile_buffer {
   }
 
   // Disable copy (and move) semantics.
-  cufile_buffer(cufile_buffer const &) = delete;
-  cufile_buffer &operator=(cufile_buffer const &) = delete;
+  cufile_buffer(cufile_buffer const&)            = delete;
+  cufile_buffer& operator=(cufile_buffer const&) = delete;
 
   /** @brief Destroy the buffer by de-registering it if necessary. */
-  ~cufile_buffer() {
-    if (register_buffer_) {
-      cuFileBufDeregister(device_pointer_);
-    }
+  ~cufile_buffer()
+  {
+    if (register_buffer_) { cuFileBufDeregister(device_pointer_); }
   }
 
   /**
@@ -126,7 +127,7 @@ class cufile_buffer {
    *
    * @return Pointer to the device buffer.
    */
-  void *device_pointer() const { return device_pointer_; }
+  void* device_pointer() const { return device_pointer_; }
 
   /**
    * @brief Get the size of the underlying device buffer.
@@ -135,9 +136,9 @@ class cufile_buffer {
    */
   std::size_t size() const { return size_; }
 
-private:
+ private:
   /// Pointer to the device buffer.
-  void *device_pointer_;
+  void* device_pointer_;
   /// Size of the device buffer.
   std::size_t size_;
   /// Whether to register the buffer with cuFile.
@@ -146,7 +147,7 @@ class cufile_buffer {
 
 /** @brief RAII wrapper for a file descriptor and the corresponding cuFile handle. */
 class cufile_file {
-public:
+ public:
   /**
    * @brief Construct a file wrapper.
    *
@@ -154,7 +155,8 @@ class cufile_file {
    *
    * @param file_descriptor A valid file descriptor.
    */
-  explicit cufile_file(int file_descriptor) : file_descriptor_{file_descriptor} {
+  explicit cufile_file(int file_descriptor) : file_descriptor_{file_descriptor}
+  {
     CUfileDescr_t cufile_descriptor{CU_FILE_HANDLE_TYPE_OPAQUE_FD, file_descriptor_};
     auto const status = cuFileHandleRegister(&cufile_handle_, &cufile_descriptor);
     if (status.err != CU_FILE_SUCCESS) {
@@ -169,7 +171,8 @@ class cufile_file {
    * @param path Absolute path of the file to read from.
    * @return std::unique_ptr<cufile_file> for reading.
    */
-  static auto make_reader(char const *path) {
+  static auto make_reader(char const* path)
+  {
     auto const file_descriptor = open(path, O_RDONLY | O_DIRECT);
     if (file_descriptor < 0) {
       CUDF_FAIL("Failed to open file to read: " + cuFileGetErrorString(errno));
@@ -183,7 +186,8 @@ class cufile_file {
    * @param path Absolute path of the file to write to.
    * @return std::unique_ptr<cufile_file> for writing.
    */
-  static auto make_writer(char const *path) {
+  static auto make_writer(char const* path)
+  {
     auto const file_descriptor = open(path, O_CREAT | O_WRONLY | O_DIRECT, S_IRUSR | S_IWUSR);
     if (file_descriptor < 0) {
       CUDF_FAIL("Failed to open file to write: " + cuFileGetErrorString(errno));
@@ -192,11 +196,12 @@ class cufile_file {
   }
 
   // Disable copy (and move) semantics.
-  cufile_file(cufile_file const &) = delete;
-  cufile_file &operator=(cufile_file const &) = delete;
+  cufile_file(cufile_file const&)            = delete;
+  cufile_file& operator=(cufile_file const&) = delete;
 
   /** @brief Destroy the file wrapper by de-registering the cuFile handle and closing the file. */
-  ~cufile_file() {
+  ~cufile_file()
+  {
     cuFileHandleDeregister(cufile_handle_);
     close(file_descriptor_);
   }
@@ -207,9 +212,10 @@ class cufile_file {
    * @param buffer Device buffer to read the file content into.
    * @param file_offset Starting offset from which to read the file.
    */
-  void read(cufile_buffer const &buffer, std::size_t file_offset) const {
+  void read(cufile_buffer const& buffer, std::size_t file_offset) const
+  {
     auto const status =
-        cuFileRead(cufile_handle_, buffer.device_pointer(), buffer.size(), file_offset, 0);
+      cuFileRead(cufile_handle_, buffer.device_pointer(), buffer.size(), file_offset, 0);
 
     if (status < 0) {
       if (IS_CUFILE_ERR(status)) {
@@ -230,7 +236,8 @@ class cufile_file {
    * @param size The number of bytes to write.
    * @param file_offset Starting offset from which to write the buffer.
    */
-  void write(cufile_buffer const &buffer, std::size_t size, std::size_t file_offset) {
+  void write(cufile_buffer const& buffer, std::size_t size, std::size_t file_offset)
+  {
     auto const status = cuFileWrite(cufile_handle_, buffer.device_pointer(), size, file_offset, 0);
 
     if (status < 0) {
@@ -252,7 +259,8 @@ class cufile_file {
    * @param size The number of bytes to append.
    * @return The file offset from which the buffer was appended.
    */
-  std::size_t append(cufile_buffer const &buffer, std::size_t size) {
+  std::size_t append(cufile_buffer const& buffer, std::size_t size)
+  {
     struct stat stat_buffer;
     auto const status = fstat(file_descriptor_, &stat_buffer);
     if (status < 0) {
@@ -264,14 +272,14 @@ class cufile_file {
     return file_offset;
   }
 
-private:
+ private:
   /// The underlying file descriptor.
   int file_descriptor_;
   /// The registered cuFile handle.
   CUfileHandle_t cufile_handle_{};
 };
 
-} // anonymous namespace
+}  // anonymous namespace
 
 extern "C" {
 
@@ -281,7 +289,8 @@ extern "C" {
  * @param env The JNI environment.
  * @return Pointer address to the new driver wrapper instance.
  */
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_CuFileDriver_create(JNIEnv *env, jclass) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_CuFileDriver_create(JNIEnv* env, jclass)
+{
   try {
     cudf::jni::auto_set_device(env);
     return reinterpret_cast<jlong>(new cufile_driver());
@@ -295,11 +304,11 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_CuFileDriver_create(JNIEnv *env, jcl
  * @param env The JNI environment.
  * @param pointer Pointer address to the driver wrapper instance.
  */
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_CuFileDriver_destroy(JNIEnv *env, jclass,
-                                                                jlong pointer) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_CuFileDriver_destroy(JNIEnv* env, jclass, jlong pointer)
+{
   try {
     cudf::jni::auto_set_device(env);
-    delete reinterpret_cast<cufile_driver *>(pointer);
+    delete reinterpret_cast<cufile_driver*>(pointer);
   }
   CATCH_STD(env, );
 }
@@ -313,13 +322,13 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_CuFileDriver_destroy(JNIEnv *env, jcl
  * @param register_buffer If true, register the cuFile buffer.
  * @return Pointer address to the new buffer wrapper instance.
  */
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_CuFileBuffer_create(JNIEnv *env, jclass,
-                                                                jlong device_pointer, jlong size,
-                                                                jboolean register_buffer) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_CuFileBuffer_create(
+  JNIEnv* env, jclass, jlong device_pointer, jlong size, jboolean register_buffer)
+{
   try {
     cudf::jni::auto_set_device(env);
-    auto *buffer =
-        new cufile_buffer(reinterpret_cast<void *>(device_pointer), size, register_buffer);
+    auto* buffer =
+      new cufile_buffer(reinterpret_cast<void*>(device_pointer), size, register_buffer);
     return reinterpret_cast<jlong>(buffer);
   }
   CATCH_STD(env, 0);
@@ -331,11 +340,11 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_CuFileBuffer_create(JNIEnv *env, jcl
  * @param env The JNI environment.
  * @param pointer Pointer address to the buffer wrapper instance.
  */
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_CuFileBuffer_destroy(JNIEnv *env, jclass,
-                                                                jlong pointer) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_CuFileBuffer_destroy(JNIEnv* env, jclass, jlong pointer)
+{
   try {
     cudf::jni::auto_set_device(env);
-    delete reinterpret_cast<cufile_buffer *>(pointer);
+    delete reinterpret_cast<cufile_buffer*>(pointer);
   }
   CATCH_STD(env, );
 }
@@ -347,8 +356,10 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_CuFileBuffer_destroy(JNIEnv *env, jcl
  * @param path The file path to read from.
  * @return Pointer address to the new file handle wrapper instance.
  */
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_CuFileReadHandle_create(JNIEnv *env, jclass,
-                                                                    jstring path) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_CuFileReadHandle_create(JNIEnv* env,
+                                                                    jclass,
+                                                                    jstring path)
+{
   try {
     cudf::jni::auto_set_device(env);
     auto file = cufile_file::make_reader(env->GetStringUTFChars(path, nullptr));
@@ -365,14 +376,13 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_CuFileReadHandle_create(JNIEnv *env,
  * @param file_offset The file offset from which to read.
  * @param buffer Pointer to the cuFile buffer object.
  */
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_CuFileReadHandle_readIntoBuffer(JNIEnv *env, jclass,
-                                                                           jlong file,
-                                                                           jlong file_offset,
-                                                                           jlong buffer) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_CuFileReadHandle_readIntoBuffer(
+  JNIEnv* env, jclass, jlong file, jlong file_offset, jlong buffer)
+{
   try {
     cudf::jni::auto_set_device(env);
-    auto *file_ptr = reinterpret_cast<cufile_file *>(file);
-    auto *buffer_ptr = reinterpret_cast<cufile_buffer *>(buffer);
+    auto* file_ptr   = reinterpret_cast<cufile_file*>(file);
+    auto* buffer_ptr = reinterpret_cast<cufile_buffer*>(buffer);
     file_ptr->read(*buffer_ptr, file_offset);
   }
   CATCH_STD(env, );
@@ -385,8 +395,10 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_CuFileReadHandle_readIntoBuffer(JNIEn
  * @param path The file path to write to.
  * @return Pointer address to the new file handle wrapper instance.
  */
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_CuFileWriteHandle_create(JNIEnv *env, jclass,
-                                                                     jstring path) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_CuFileWriteHandle_create(JNIEnv* env,
+                                                                     jclass,
+                                                                     jstring path)
+{
   try {
     cudf::jni::auto_set_device(env);
     auto file = cufile_file::make_writer(env->GetStringUTFChars(path, nullptr));
@@ -405,11 +417,12 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_CuFileWriteHandle_create(JNIEnv *env
  * @param size Number of bytes to write.
  */
 JNIEXPORT void JNICALL Java_ai_rapids_cudf_CuFileWriteHandle_writeFromBuffer(
-    JNIEnv *env, jclass, jlong file, jlong file_offset, jlong buffer, jlong size) {
+  JNIEnv* env, jclass, jlong file, jlong file_offset, jlong buffer, jlong size)
+{
   try {
     cudf::jni::auto_set_device(env);
-    auto *file_ptr = reinterpret_cast<cufile_file *>(file);
-    auto *buffer_ptr = reinterpret_cast<cufile_buffer *>(buffer);
+    auto* file_ptr   = reinterpret_cast<cufile_file*>(file);
+    auto* buffer_ptr = reinterpret_cast<cufile_buffer*>(buffer);
     file_ptr->write(*buffer_ptr, size, file_offset);
   }
   CATCH_STD(env, );
@@ -424,14 +437,13 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_CuFileWriteHandle_writeFromBuffer(
  * @param size Number of bytes to append
  * @return The file offset from which the buffer was appended.
  */
-JNIEXPORT long JNICALL Java_ai_rapids_cudf_CuFileWriteHandle_appendFromBuffer(JNIEnv *env, jclass,
-                                                                              jlong file,
-                                                                              jlong buffer,
-                                                                              jlong size) {
+JNIEXPORT long JNICALL Java_ai_rapids_cudf_CuFileWriteHandle_appendFromBuffer(
+  JNIEnv* env, jclass, jlong file, jlong buffer, jlong size)
+{
   try {
     cudf::jni::auto_set_device(env);
-    auto *file_ptr = reinterpret_cast<cufile_file *>(file);
-    auto *buffer_ptr = reinterpret_cast<cufile_buffer *>(buffer);
+    auto* file_ptr   = reinterpret_cast<cufile_file*>(file);
+    auto* buffer_ptr = reinterpret_cast<cufile_buffer*>(buffer);
     return file_ptr->append(*buffer_ptr, size);
   }
   CATCH_STD(env, -1);
@@ -443,11 +455,11 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_CuFileWriteHandle_appendFromBuffer(JN
  * @param env The JNI environment.
  * @param pointer Pointer address to the file handle wrapper instance.
  */
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_CuFileHandle_destroy(JNIEnv *env, jclass,
-                                                                jlong pointer) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_CuFileHandle_destroy(JNIEnv* env, jclass, jlong pointer)
+{
   try {
     cudf::jni::auto_set_device(env);
-    delete reinterpret_cast<cufile_file *>(pointer);
+    delete reinterpret_cast<cufile_file*>(pointer);
   }
   CATCH_STD(env, );
 }
@@ -461,12 +473,12 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_CuFileHandle_destroy(JNIEnv *env, jcl
  * @param device_pointer Pointer address to the device buffer.
  * @param size Number of bytes to write.
  */
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_CuFile_writeToFile(JNIEnv *env, jclass, jstring path,
-                                                              jlong file_offset,
-                                                              jlong device_pointer, jlong size) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_CuFile_writeToFile(
+  JNIEnv* env, jclass, jstring path, jlong file_offset, jlong device_pointer, jlong size)
+{
   try {
     cudf::jni::auto_set_device(env);
-    cufile_buffer buffer{reinterpret_cast<void *>(device_pointer), static_cast<std::size_t>(size)};
+    cufile_buffer buffer{reinterpret_cast<void*>(device_pointer), static_cast<std::size_t>(size)};
     auto writer = cufile_file::make_writer(env->GetStringUTFChars(path, nullptr));
     writer->write(buffer, size, file_offset);
   }
@@ -482,11 +494,12 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_CuFile_writeToFile(JNIEnv *env, jclas
  * @param size Number of bytes to append.
  * @return The file offset from which the buffer was appended.
  */
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_CuFile_appendToFile(JNIEnv *env, jclass, jstring path,
-                                                                jlong device_pointer, jlong size) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_CuFile_appendToFile(
+  JNIEnv* env, jclass, jstring path, jlong device_pointer, jlong size)
+{
   try {
     cudf::jni::auto_set_device(env);
-    cufile_buffer buffer{reinterpret_cast<void *>(device_pointer), static_cast<std::size_t>(size)};
+    cufile_buffer buffer{reinterpret_cast<void*>(device_pointer), static_cast<std::size_t>(size)};
     auto writer = cufile_file::make_writer(env->GetStringUTFChars(path, nullptr));
     return writer->append(buffer, size);
   }
@@ -502,16 +515,16 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_CuFile_appendToFile(JNIEnv *env, jcl
  * @param path Absolute path of the file to copy from.
  * @param file_offset The file offset from which to copy content.
  */
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_CuFile_readFromFile(JNIEnv *env, jclass,
-                                                               jlong device_pointer, jlong size,
-                                                               jstring path, jlong file_offset) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_CuFile_readFromFile(
+  JNIEnv* env, jclass, jlong device_pointer, jlong size, jstring path, jlong file_offset)
+{
   try {
     cudf::jni::auto_set_device(env);
-    cufile_buffer buffer{reinterpret_cast<void *>(device_pointer), static_cast<std::size_t>(size)};
+    cufile_buffer buffer{reinterpret_cast<void*>(device_pointer), static_cast<std::size_t>(size)};
     auto const reader = cufile_file::make_reader(env->GetStringUTFChars(path, nullptr));
     reader->read(buffer, file_offset);
   }
   CATCH_STD(env, );
 }
 
-} // extern "C"
+}  // extern "C"
diff --git a/java/src/main/native/src/CudaJni.cpp b/java/src/main/native/src/CudaJni.cpp
index 2fe550cdfeb..127a750db43 100644
--- a/java/src/main/native/src/CudaJni.cpp
+++ b/java/src/main/native/src/CudaJni.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,6 +15,7 @@
  */
 
 #include <cudf/utilities/error.hpp>
+
 #include <rmm/device_buffer.hpp>
 
 #ifdef CUDF_JNI_ENABLE_PROFILING
@@ -30,21 +31,20 @@ int Cudf_device{cudaInvalidDeviceId};
 
 thread_local int Thread_device = cudaInvalidDeviceId;
 
-} // anonymous namespace
+}  // anonymous namespace
 
 namespace cudf {
 namespace jni {
 
 /** Set the device to use for cudf */
-void set_cudf_device(int device) {
-  Cudf_device = device;
-}
+void set_cudf_device(int device) { Cudf_device = device; }
 
 /**
  * If a cudf device has been specified then this ensures the calling thread
  * is using the same device.
  */
-void auto_set_device(JNIEnv *env) {
+void auto_set_device(JNIEnv* env)
+{
   if (Cudf_device != cudaInvalidDeviceId) {
     if (Thread_device != Cudf_device) {
       cudaError_t cuda_status = cudaSetDevice(Cudf_device);
@@ -55,17 +55,19 @@ void auto_set_device(JNIEnv *env) {
 }
 
 /** Fills all the bytes in the buffer 'buf' with 'value'. */
-void device_memset_async(JNIEnv *env, rmm::device_buffer &buf, char value) {
-  cudaError_t cuda_status = cudaMemsetAsync((void *)buf.data(), value, buf.size());
+void device_memset_async(JNIEnv* env, rmm::device_buffer& buf, char value)
+{
+  cudaError_t cuda_status = cudaMemsetAsync((void*)buf.data(), value, buf.size());
   jni_cuda_check(env, cuda_status);
 }
 
-} // namespace jni
-} // namespace cudf
+}  // namespace jni
+}  // namespace cudf
 
 extern "C" {
 
-JNIEXPORT jobject JNICALL Java_ai_rapids_cudf_Cuda_memGetInfo(JNIEnv *env, jclass clazz) {
+JNIEXPORT jobject JNICALL Java_ai_rapids_cudf_Cuda_memGetInfo(JNIEnv* env, jclass clazz)
+{
   try {
     cudf::jni::auto_set_device(env);
 
@@ -73,14 +75,10 @@ JNIEXPORT jobject JNICALL Java_ai_rapids_cudf_Cuda_memGetInfo(JNIEnv *env, jclas
     CUDF_CUDA_TRY(cudaMemGetInfo(&free, &total));
 
     jclass info_class = env->FindClass("Lai/rapids/cudf/CudaMemInfo;");
-    if (info_class == NULL) {
-      return NULL;
-    }
+    if (info_class == NULL) { return NULL; }
 
     jmethodID ctor_id = env->GetMethodID(info_class, "<init>", "(JJ)V");
-    if (ctor_id == NULL) {
-      return NULL;
-    }
+    if (ctor_id == NULL) { return NULL; }
 
     jobject info_obj = env->NewObject(info_class, ctor_id, (jlong)free, (jlong)total);
     // No need to check for exceptions of null return value as we are just handing the object back
@@ -90,46 +88,51 @@ JNIEXPORT jobject JNICALL Java_ai_rapids_cudf_Cuda_memGetInfo(JNIEnv *env, jclas
   CATCH_STD(env, nullptr);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Cuda_hostAllocPinned(JNIEnv *env, jclass, jlong size) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Cuda_hostAllocPinned(JNIEnv* env, jclass, jlong size)
+{
   try {
     cudf::jni::auto_set_device(env);
-    void *ret = nullptr;
+    void* ret = nullptr;
     CUDF_CUDA_TRY(cudaMallocHost(&ret, size));
     return reinterpret_cast<jlong>(ret);
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_freePinned(JNIEnv *env, jclass, jlong ptr) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_freePinned(JNIEnv* env, jclass, jlong ptr)
+{
   try {
     cudf::jni::auto_set_device(env);
-    CUDF_CUDA_TRY(cudaFreeHost(reinterpret_cast<void *>(ptr)));
+    CUDF_CUDA_TRY(cudaFreeHost(reinterpret_cast<void*>(ptr)));
   }
   CATCH_STD(env, );
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_memset(JNIEnv *env, jclass, jlong dst, jbyte value,
-                                                       jlong count, jint kind) {
+JNIEXPORT void JNICALL
+Java_ai_rapids_cudf_Cuda_memset(JNIEnv* env, jclass, jlong dst, jbyte value, jlong count, jint kind)
+{
   JNI_NULL_CHECK(env, dst, "dst memory pointer is null", );
   try {
     cudf::jni::auto_set_device(env);
-    CUDF_CUDA_TRY(cudaMemsetAsync((void *)dst, value, count));
+    CUDF_CUDA_TRY(cudaMemsetAsync((void*)dst, value, count));
     CUDF_CUDA_TRY(cudaStreamSynchronize(0));
   }
   CATCH_STD(env, );
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_asyncMemset(JNIEnv *env, jclass, jlong dst,
-                                                            jbyte value, jlong count, jint kind) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_asyncMemset(
+  JNIEnv* env, jclass, jlong dst, jbyte value, jlong count, jint kind)
+{
   JNI_NULL_CHECK(env, dst, "dst memory pointer is null", );
   try {
     cudf::jni::auto_set_device(env);
-    CUDF_CUDA_TRY(cudaMemsetAsync((void *)dst, value, count));
+    CUDF_CUDA_TRY(cudaMemsetAsync((void*)dst, value, count));
   }
   CATCH_STD(env, );
 }
 
-JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Cuda_getDevice(JNIEnv *env, jclass) {
+JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Cuda_getDevice(JNIEnv* env, jclass)
+{
   try {
     cudf::jni::auto_set_device(env);
     jint dev;
@@ -139,7 +142,8 @@ JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Cuda_getDevice(JNIEnv *env, jclass) {
   CATCH_STD(env, -2);
 }
 
-JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Cuda_getDeviceCount(JNIEnv *env, jclass) {
+JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Cuda_getDeviceCount(JNIEnv* env, jclass)
+{
   try {
     cudf::jni::auto_set_device(env);
     jint count;
@@ -149,25 +153,28 @@ JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Cuda_getDeviceCount(JNIEnv *env, jcla
   CATCH_STD(env, -2);
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_setDevice(JNIEnv *env, jclass, jint dev) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_setDevice(JNIEnv* env, jclass, jint dev)
+{
   try {
     if (Cudf_device != cudaInvalidDeviceId && dev != Cudf_device) {
-      cudf::jni::throw_java_exception(env, cudf::jni::CUDF_ERROR_CLASS,
-                                      "Cannot change device after RMM init");
+      cudf::jni::throw_java_exception(
+        env, cudf::jni::CUDF_ERROR_CLASS, "Cannot change device after RMM init");
     }
     CUDF_CUDA_TRY(cudaSetDevice(dev));
   }
   CATCH_STD(env, );
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_autoSetDevice(JNIEnv *env, jclass, jint dev) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_autoSetDevice(JNIEnv* env, jclass, jint dev)
+{
   try {
     cudf::jni::auto_set_device(env);
   }
   CATCH_STD(env, );
 }
 
-JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Cuda_getDriverVersion(JNIEnv *env, jclass) {
+JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Cuda_getDriverVersion(JNIEnv* env, jclass)
+{
   try {
     cudf::jni::auto_set_device(env);
     jint driver_version;
@@ -177,7 +184,8 @@ JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Cuda_getDriverVersion(JNIEnv *env, jc
   CATCH_STD(env, -2);
 }
 
-JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Cuda_getRuntimeVersion(JNIEnv *env, jclass) {
+JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Cuda_getRuntimeVersion(JNIEnv* env, jclass)
+{
   try {
     cudf::jni::auto_set_device(env);
     jint runtime_version;
@@ -187,7 +195,8 @@ JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Cuda_getRuntimeVersion(JNIEnv *env, j
   CATCH_STD(env, -2);
 }
 
-JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Cuda_getNativeComputeMode(JNIEnv *env, jclass) {
+JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Cuda_getNativeComputeMode(JNIEnv* env, jclass)
+{
   try {
     cudf::jni::auto_set_device(env);
     int device;
@@ -199,33 +208,36 @@ JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Cuda_getNativeComputeMode(JNIEnv *env
   CATCH_STD(env, -2);
 }
 
-JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Cuda_getComputeCapabilityMajor(JNIEnv *env, jclass) {
+JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Cuda_getComputeCapabilityMajor(JNIEnv* env, jclass)
+{
   try {
     cudf::jni::auto_set_device(env);
     int device;
     CUDF_CUDA_TRY(::cudaGetDevice(&device));
     int attribute_value;
     CUDF_CUDA_TRY(
-        ::cudaDeviceGetAttribute(&attribute_value, ::cudaDevAttrComputeCapabilityMajor, device));
+      ::cudaDeviceGetAttribute(&attribute_value, ::cudaDevAttrComputeCapabilityMajor, device));
     return attribute_value;
   }
   CATCH_STD(env, -2);
 }
 
-JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Cuda_getComputeCapabilityMinor(JNIEnv *env, jclass) {
+JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Cuda_getComputeCapabilityMinor(JNIEnv* env, jclass)
+{
   try {
     cudf::jni::auto_set_device(env);
     int device;
     CUDF_CUDA_TRY(::cudaGetDevice(&device));
     int attribute_value;
     CUDF_CUDA_TRY(
-        ::cudaDeviceGetAttribute(&attribute_value, ::cudaDevAttrComputeCapabilityMinor, device));
+      ::cudaDeviceGetAttribute(&attribute_value, ::cudaDevAttrComputeCapabilityMinor, device));
     return attribute_value;
   }
   CATCH_STD(env, -2);
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_freeZero(JNIEnv *env, jclass) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_freeZero(JNIEnv* env, jclass)
+{
   try {
     cudf::jni::auto_set_device(env);
     CUDF_CUDA_TRY(cudaFree(0));
@@ -233,19 +245,22 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_freeZero(JNIEnv *env, jclass) {
   CATCH_STD(env, );
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Cuda_createStream(JNIEnv *env, jclass,
-                                                              jboolean isNonBlocking) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Cuda_createStream(JNIEnv* env,
+                                                              jclass,
+                                                              jboolean isNonBlocking)
+{
   try {
     cudf::jni::auto_set_device(env);
     cudaStream_t stream = nullptr;
-    auto flags = isNonBlocking ? cudaStreamNonBlocking : cudaStreamDefault;
+    auto flags          = isNonBlocking ? cudaStreamNonBlocking : cudaStreamDefault;
     CUDF_CUDA_TRY(cudaStreamCreateWithFlags(&stream, flags));
     return reinterpret_cast<jlong>(stream);
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_destroyStream(JNIEnv *env, jclass, jlong jstream) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_destroyStream(JNIEnv* env, jclass, jlong jstream)
+{
   try {
     cudf::jni::auto_set_device(env);
     auto stream = reinterpret_cast<cudaStream_t>(jstream);
@@ -254,19 +269,24 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_destroyStream(JNIEnv *env, jclas
   CATCH_STD(env, );
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_streamWaitEvent(JNIEnv *env, jclass, jlong jstream,
-                                                                jlong jevent) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_streamWaitEvent(JNIEnv* env,
+                                                                jclass,
+                                                                jlong jstream,
+                                                                jlong jevent)
+{
   try {
     cudf::jni::auto_set_device(env);
     auto stream = reinterpret_cast<cudaStream_t>(jstream);
-    auto event = reinterpret_cast<cudaEvent_t>(jevent);
+    auto event  = reinterpret_cast<cudaEvent_t>(jevent);
     CUDF_CUDA_TRY(cudaStreamWaitEvent(stream, event, 0));
   }
   CATCH_STD(env, );
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_streamSynchronize(JNIEnv *env, jclass,
-                                                                  jlong jstream) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_streamSynchronize(JNIEnv* env,
+                                                                  jclass,
+                                                                  jlong jstream)
+{
   try {
     cudf::jni::auto_set_device(env);
     auto stream = reinterpret_cast<cudaStream_t>(jstream);
@@ -275,26 +295,25 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_streamSynchronize(JNIEnv *env, j
   CATCH_STD(env, );
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Cuda_createEvent(JNIEnv *env, jclass,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Cuda_createEvent(JNIEnv* env,
+                                                             jclass,
                                                              jboolean enableTiming,
-                                                             jboolean blockingSync) {
+                                                             jboolean blockingSync)
+{
   try {
     cudf::jni::auto_set_device(env);
-    cudaEvent_t event = nullptr;
+    cudaEvent_t event  = nullptr;
     unsigned int flags = 0;
-    if (!enableTiming) {
-      flags = flags | cudaEventDisableTiming;
-    }
-    if (blockingSync) {
-      flags = flags | cudaEventBlockingSync;
-    }
+    if (!enableTiming) { flags = flags | cudaEventDisableTiming; }
+    if (blockingSync) { flags = flags | cudaEventBlockingSync; }
     CUDF_CUDA_TRY(cudaEventCreateWithFlags(&event, flags));
     return reinterpret_cast<jlong>(event);
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_destroyEvent(JNIEnv *env, jclass, jlong jevent) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_destroyEvent(JNIEnv* env, jclass, jlong jevent)
+{
   try {
     cudf::jni::auto_set_device(env);
     auto event = reinterpret_cast<cudaEvent_t>(jevent);
@@ -303,35 +322,39 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_destroyEvent(JNIEnv *env, jclass
   CATCH_STD(env, );
 }
 
-JNIEXPORT jboolean JNICALL Java_ai_rapids_cudf_Cuda_eventQuery(JNIEnv *env, jclass, jlong jevent) {
+JNIEXPORT jboolean JNICALL Java_ai_rapids_cudf_Cuda_eventQuery(JNIEnv* env, jclass, jlong jevent)
+{
   try {
     cudf::jni::auto_set_device(env);
-    auto event = reinterpret_cast<cudaEvent_t>(jevent);
+    auto event  = reinterpret_cast<cudaEvent_t>(jevent);
     auto result = cudaEventQuery(event);
     if (result == cudaSuccess) {
       return true;
     } else if (result == cudaErrorNotReady) {
       return false;
-    } // else
+    }  // else
     CUDF_CUDA_TRY(result);
   }
   CATCH_STD(env, false);
   return false;
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_eventRecord(JNIEnv *env, jclass, jlong jevent,
-                                                            jlong jstream) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_eventRecord(JNIEnv* env,
+                                                            jclass,
+                                                            jlong jevent,
+                                                            jlong jstream)
+{
   try {
     cudf::jni::auto_set_device(env);
-    auto event = reinterpret_cast<cudaEvent_t>(jevent);
+    auto event  = reinterpret_cast<cudaEvent_t>(jevent);
     auto stream = reinterpret_cast<cudaStream_t>(jstream);
     CUDF_CUDA_TRY(cudaEventRecord(event, stream));
   }
   CATCH_STD(env, );
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_eventSynchronize(JNIEnv *env, jclass,
-                                                                 jlong jevent) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_eventSynchronize(JNIEnv* env, jclass, jlong jevent)
+{
   try {
     cudf::jni::auto_set_device(env);
     auto event = reinterpret_cast<cudaEvent_t>(jevent);
@@ -340,19 +363,17 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_eventSynchronize(JNIEnv *env, jc
   CATCH_STD(env, );
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_memcpyOnStream(JNIEnv *env, jclass, jlong jdst,
-                                                               jlong jsrc, jlong count, jint jkind,
-                                                               jlong jstream) {
-  if (count == 0) {
-    return;
-  }
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_memcpyOnStream(
+  JNIEnv* env, jclass, jlong jdst, jlong jsrc, jlong count, jint jkind, jlong jstream)
+{
+  if (count == 0) { return; }
   JNI_ARG_CHECK(env, jdst != 0, "dst memory pointer is null", );
   JNI_ARG_CHECK(env, jsrc != 0, "src memory pointer is null", );
   try {
     cudf::jni::auto_set_device(env);
-    auto dst = reinterpret_cast<void *>(jdst);
-    auto src = reinterpret_cast<void *>(jsrc);
-    auto kind = static_cast<cudaMemcpyKind>(jkind);
+    auto dst    = reinterpret_cast<void*>(jdst);
+    auto src    = reinterpret_cast<void*>(jsrc);
+    auto kind   = static_cast<cudaMemcpyKind>(jkind);
     auto stream = reinterpret_cast<cudaStream_t>(jstream);
     CUDF_CUDA_TRY(cudaMemcpyAsync(dst, src, count, kind, stream));
     CUDF_CUDA_TRY(cudaStreamSynchronize(stream));
@@ -360,50 +381,51 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_memcpyOnStream(JNIEnv *env, jcla
   CATCH_STD(env, );
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_asyncMemcpyOnStream(JNIEnv *env, jclass, jlong jdst,
-                                                                    jlong jsrc, jlong count,
-                                                                    jint jkind, jlong jstream) {
-  if (count == 0) {
-    return;
-  }
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_asyncMemcpyOnStream(
+  JNIEnv* env, jclass, jlong jdst, jlong jsrc, jlong count, jint jkind, jlong jstream)
+{
+  if (count == 0) { return; }
   JNI_ARG_CHECK(env, jdst != 0, "dst memory pointer is null", );
   JNI_ARG_CHECK(env, jsrc != 0, "src memory pointer is null", );
   try {
     cudf::jni::auto_set_device(env);
-    auto dst = reinterpret_cast<void *>(jdst);
-    auto src = reinterpret_cast<void *>(jsrc);
-    auto kind = static_cast<cudaMemcpyKind>(jkind);
+    auto dst    = reinterpret_cast<void*>(jdst);
+    auto src    = reinterpret_cast<void*>(jsrc);
+    auto kind   = static_cast<cudaMemcpyKind>(jkind);
     auto stream = reinterpret_cast<cudaStream_t>(jstream);
     CUDF_CUDA_TRY(cudaMemcpyAsync(dst, src, count, kind, stream));
   }
   CATCH_STD(env, );
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_profilerStart(JNIEnv *env, jclass clazz) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_profilerStart(JNIEnv* env, jclass clazz)
+{
 #ifdef CUDF_JNI_ENABLE_PROFILING
   try {
     cudaProfilerStart();
   }
   CATCH_STD(env, );
 #else
-  cudf::jni::throw_java_exception(env, cudf::jni::CUDF_ERROR_CLASS,
-                                  "This library was built without CUDA profiler support.");
+  cudf::jni::throw_java_exception(
+    env, cudf::jni::CUDF_ERROR_CLASS, "This library was built without CUDA profiler support.");
 #endif
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_profilerStop(JNIEnv *env, jclass clazz) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_profilerStop(JNIEnv* env, jclass clazz)
+{
 #ifdef CUDF_JNI_ENABLE_PROFILING
   try {
     cudaProfilerStop();
   }
   CATCH_STD(env, );
 #else
-  cudf::jni::throw_java_exception(env, cudf::jni::CUDF_ERROR_CLASS,
-                                  "This library was built without CUDA profiler support.");
+  cudf::jni::throw_java_exception(
+    env, cudf::jni::CUDF_ERROR_CLASS, "This library was built without CUDA profiler support.");
 #endif
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_deviceSynchronize(JNIEnv *env, jclass clazz) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_deviceSynchronize(JNIEnv* env, jclass clazz)
+{
   try {
     cudf::jni::auto_set_device(env);
     CUDF_CUDA_TRY(cudaDeviceSynchronize());
@@ -411,4 +433,4 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cuda_deviceSynchronize(JNIEnv *env, j
   CATCH_STD(env, );
 }
 
-} // extern "C"
+}  // extern "C"
diff --git a/java/src/main/native/src/CudfJni.cpp b/java/src/main/native/src/CudfJni.cpp
index d0a25d449a6..698a8f6ff02 100644
--- a/java/src/main/native/src/CudfJni.cpp
+++ b/java/src/main/native/src/CudfJni.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,27 +14,27 @@
  * limitations under the License.
  */
 
-#include <sstream>
+#include "cudf_jni_apis.hpp"
 
 #include <cudf/copying.hpp>
 #include <cudf/utilities/default_stream.hpp>
 
-#include "cudf_jni_apis.hpp"
+#include <sstream>
 
 namespace {
 
 // handles detaching a thread from the JVM when the thread terminates
 class jvm_detach_on_destruct {
-public:
-  explicit jvm_detach_on_destruct(JavaVM *jvm) : jvm{jvm} {}
+ public:
+  explicit jvm_detach_on_destruct(JavaVM* jvm) : jvm{jvm} {}
 
   ~jvm_detach_on_destruct() { jvm->DetachCurrentThread(); }
 
-private:
-  JavaVM *jvm;
+ private:
+  JavaVM* jvm;
 };
 
-} // anonymous namespace
+}  // anonymous namespace
 
 namespace cudf {
 namespace jni {
@@ -49,74 +49,70 @@ static jclass Host_memory_buffer_jclass;
 static jfieldID Host_buffer_address;
 static jfieldID Host_buffer_length;
 
-#define HOST_MEMORY_BUFFER_CLASS "ai/rapids/cudf/HostMemoryBuffer"
+#define HOST_MEMORY_BUFFER_CLASS          "ai/rapids/cudf/HostMemoryBuffer"
 #define HOST_MEMORY_BUFFER_SIG(param_sig) "(" param_sig ")L" HOST_MEMORY_BUFFER_CLASS ";"
 
-static bool cache_host_memory_buffer_jni(JNIEnv *env) {
+static bool cache_host_memory_buffer_jni(JNIEnv* env)
+{
   jclass cls = env->FindClass(HOST_MEMORY_BUFFER_CLASS);
-  if (cls == nullptr) {
-    return false;
-  }
+  if (cls == nullptr) { return false; }
 
   Host_buffer_address = env->GetFieldID(cls, "address", "J");
-  if (Host_buffer_address == nullptr) {
-    return false;
-  }
+  if (Host_buffer_address == nullptr) { return false; }
 
   Host_buffer_length = env->GetFieldID(cls, "length", "J");
-  if (Host_buffer_length == nullptr) {
-    return false;
-  }
+  if (Host_buffer_length == nullptr) { return false; }
 
   // Convert local reference to global so it cannot be garbage collected.
   Host_memory_buffer_jclass = static_cast<jclass>(env->NewGlobalRef(cls));
-  if (Host_memory_buffer_jclass == nullptr) {
-    return false;
-  }
+  if (Host_memory_buffer_jclass == nullptr) { return false; }
   return true;
 }
 
-static void release_host_memory_buffer_jni(JNIEnv *env) {
+static void release_host_memory_buffer_jni(JNIEnv* env)
+{
   Host_memory_buffer_jclass = del_global_ref(env, Host_memory_buffer_jclass);
 }
 
-jobject allocate_host_buffer(JNIEnv *env, jlong amount, jboolean prefer_pinned,
-                             jobject host_memory_allocator) {
+jobject allocate_host_buffer(JNIEnv* env,
+                             jlong amount,
+                             jboolean prefer_pinned,
+                             jobject host_memory_allocator)
+{
   auto const host_memory_allocator_class = env->GetObjectClass(host_memory_allocator);
   auto const allocateMethodId =
-      env->GetMethodID(host_memory_allocator_class, "allocate", HOST_MEMORY_BUFFER_SIG("JZ"));
+    env->GetMethodID(host_memory_allocator_class, "allocate", HOST_MEMORY_BUFFER_SIG("JZ"));
   jobject ret =
-      env->CallObjectMethod(host_memory_allocator, allocateMethodId, amount, prefer_pinned);
+    env->CallObjectMethod(host_memory_allocator, allocateMethodId, amount, prefer_pinned);
 
-  if (env->ExceptionCheck()) {
-    throw std::runtime_error("allocateHostBuffer threw an exception");
-  }
+  if (env->ExceptionCheck()) { throw std::runtime_error("allocateHostBuffer threw an exception"); }
   return ret;
 }
 
-jlong get_host_buffer_address(JNIEnv *env, jobject buffer) {
+jlong get_host_buffer_address(JNIEnv* env, jobject buffer)
+{
   return env->GetLongField(buffer, Host_buffer_address);
 }
 
-jlong get_host_buffer_length(JNIEnv *env, jobject buffer) {
+jlong get_host_buffer_length(JNIEnv* env, jobject buffer)
+{
   return env->GetLongField(buffer, Host_buffer_length);
 }
 
 // Get the JNI environment, attaching the current thread to the JVM if necessary. If the thread
 // needs to be attached, the thread will automatically detach when the thread terminates.
-JNIEnv *get_jni_env(JavaVM *jvm) {
-  JNIEnv *env = nullptr;
-  jint rc = jvm->GetEnv(reinterpret_cast<void **>(&env), MINIMUM_JNI_VERSION);
-  if (rc == JNI_OK) {
-    return env;
-  }
+JNIEnv* get_jni_env(JavaVM* jvm)
+{
+  JNIEnv* env = nullptr;
+  jint rc     = jvm->GetEnv(reinterpret_cast<void**>(&env), MINIMUM_JNI_VERSION);
+  if (rc == JNI_OK) { return env; }
   if (rc == JNI_EDETACHED) {
     JavaVMAttachArgs attach_args;
     attach_args.version = MINIMUM_JNI_VERSION;
-    attach_args.name = const_cast<char *>("cudf thread");
-    attach_args.group = NULL;
+    attach_args.name    = const_cast<char*>("cudf thread");
+    attach_args.group   = NULL;
 
-    if (jvm->AttachCurrentThreadAsDaemon(reinterpret_cast<void **>(&env), &attach_args) == JNI_OK) {
+    if (jvm->AttachCurrentThreadAsDaemon(reinterpret_cast<void**>(&env), &attach_args) == JNI_OK) {
       // use thread_local object to detach the thread from the JVM when thread terminates.
       thread_local jvm_detach_on_destruct detacher(jvm);
     } else {
@@ -129,14 +125,15 @@ JNIEnv *get_jni_env(JavaVM *jvm) {
   throw std::runtime_error("error detecting thread attach state with JVM");
 }
 
-} // namespace jni
-} // namespace cudf
+}  // namespace jni
+}  // namespace cudf
 
 extern "C" {
 
-JNIEXPORT jint JNI_OnLoad(JavaVM *vm, void *) {
-  JNIEnv *env;
-  if (vm->GetEnv(reinterpret_cast<void **>(&env), cudf::jni::MINIMUM_JNI_VERSION) != JNI_OK) {
+JNIEXPORT jint JNI_OnLoad(JavaVM* vm, void*)
+{
+  JNIEnv* env;
+  if (vm->GetEnv(reinterpret_cast<void**>(&env), cudf::jni::MINIMUM_JNI_VERSION) != JNI_OK) {
     return JNI_ERR;
   }
 
@@ -186,9 +183,10 @@ JNIEXPORT jint JNI_OnLoad(JavaVM *vm, void *) {
   return cudf::jni::MINIMUM_JNI_VERSION;
 }
 
-JNIEXPORT void JNI_OnUnload(JavaVM *vm, void *) {
-  JNIEnv *env = nullptr;
-  if (vm->GetEnv(reinterpret_cast<void **>(&env), cudf::jni::MINIMUM_JNI_VERSION) != JNI_OK) {
+JNIEXPORT void JNI_OnUnload(JavaVM* vm, void*)
+{
+  JNIEnv* env = nullptr;
+  if (vm->GetEnv(reinterpret_cast<void**>(&env), cudf::jni::MINIMUM_JNI_VERSION) != JNI_OK) {
     return;
   }
 
@@ -198,8 +196,9 @@ JNIEXPORT void JNI_OnUnload(JavaVM *vm, void *) {
   cudf::jni::release_host_memory_buffer_jni(env);
 }
 
-JNIEXPORT jboolean JNICALL Java_ai_rapids_cudf_Cuda_isPtdsEnabled(JNIEnv *env, jclass) {
+JNIEXPORT jboolean JNICALL Java_ai_rapids_cudf_Cuda_isPtdsEnabled(JNIEnv* env, jclass)
+{
   return cudf::jni::is_ptds_enabled;
 }
 
-} // extern "C"
+}  // extern "C"
diff --git a/java/src/main/native/src/DataSourceHelperJni.cpp b/java/src/main/native/src/DataSourceHelperJni.cpp
index 8d0e4d36413..af064a4f428 100644
--- a/java/src/main/native/src/DataSourceHelperJni.cpp
+++ b/java/src/main/native/src/DataSourceHelperJni.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,11 +14,11 @@
  * limitations under the License.
  */
 
-#include <cudf/io/datasource.hpp>
-
 #include "cudf_jni_apis.hpp"
 #include "jni_utils.hpp"
 
+#include <cudf/io/datasource.hpp>
+
 namespace {
 
 #define DATA_SOURCE_CLASS "ai/rapids/cudf/DataSource"
@@ -29,136 +29,127 @@ jmethodID hostReadBuff_method;
 jmethodID onHostBufferDone_method;
 jmethodID deviceRead_method;
 
-} // anonymous namespace
+}  // anonymous namespace
 
 namespace cudf {
 namespace jni {
-bool cache_data_source_jni(JNIEnv *env) {
+bool cache_data_source_jni(JNIEnv* env)
+{
   jclass cls = env->FindClass(DATA_SOURCE_CLASS);
-  if (cls == nullptr) {
-    return false;
-  }
+  if (cls == nullptr) { return false; }
 
   hostRead_method = env->GetMethodID(cls, "hostRead", "(JJJ)J");
-  if (hostRead_method == nullptr) {
-    return false;
-  }
+  if (hostRead_method == nullptr) { return false; }
 
   hostReadBuff_method = env->GetMethodID(cls, "hostReadBuff", "(JJ)[J");
-  if (hostReadBuff_method == nullptr) {
-    return false;
-  }
+  if (hostReadBuff_method == nullptr) { return false; }
 
   onHostBufferDone_method = env->GetMethodID(cls, "onHostBufferDone", "(J)V");
-  if (onHostBufferDone_method == nullptr) {
-    return false;
-  }
+  if (onHostBufferDone_method == nullptr) { return false; }
 
   deviceRead_method = env->GetMethodID(cls, "deviceRead", "(JJJJ)J");
-  if (deviceRead_method == nullptr) {
-    return false;
-  }
+  if (deviceRead_method == nullptr) { return false; }
 
   // Convert local reference to global so it cannot be garbage collected.
   DataSource_jclass = static_cast<jclass>(env->NewGlobalRef(cls));
-  if (DataSource_jclass == nullptr) {
-    return false;
-  }
+  if (DataSource_jclass == nullptr) { return false; }
   return true;
 }
 
-void release_data_source_jni(JNIEnv *env) {
+void release_data_source_jni(JNIEnv* env)
+{
   DataSource_jclass = cudf::jni::del_global_ref(env, DataSource_jclass);
 }
 
 class host_buffer_done_callback {
-public:
-  explicit host_buffer_done_callback(JavaVM *jvm, jobject ds, long id) : jvm(jvm), ds(ds), id(id) {}
+ public:
+  explicit host_buffer_done_callback(JavaVM* jvm, jobject ds, long id) : jvm(jvm), ds(ds), id(id) {}
 
-  host_buffer_done_callback(host_buffer_done_callback const &other) = delete;
-  host_buffer_done_callback(host_buffer_done_callback &&other)
-      : jvm(other.jvm), ds(other.ds), id(other.id) {
+  host_buffer_done_callback(host_buffer_done_callback const& other) = delete;
+  host_buffer_done_callback(host_buffer_done_callback&& other)
+    : jvm(other.jvm), ds(other.ds), id(other.id)
+  {
     other.jvm = nullptr;
-    other.ds = nullptr;
-    other.id = -1;
+    other.ds  = nullptr;
+    other.id  = -1;
   }
 
-  host_buffer_done_callback &operator=(host_buffer_done_callback &&other) = delete;
-  host_buffer_done_callback &operator=(host_buffer_done_callback const &other) = delete;
+  host_buffer_done_callback& operator=(host_buffer_done_callback&& other)      = delete;
+  host_buffer_done_callback& operator=(host_buffer_done_callback const& other) = delete;
 
-  ~host_buffer_done_callback() {
+  ~host_buffer_done_callback()
+  {
     // because we are in a destructor we cannot throw an exception, so for now we are
     // just going to keep the java exceptions around and have them be thrown when this
     // thread returns to the JVM. It might be kind of confusing, but we will not lose
     // them.
     if (jvm != nullptr) {
       // We cannot throw an exception in the destructor, so this is really best effort
-      JNIEnv *env = nullptr;
-      if (jvm->GetEnv(reinterpret_cast<void **>(&env), cudf::jni::MINIMUM_JNI_VERSION) == JNI_OK) {
+      JNIEnv* env = nullptr;
+      if (jvm->GetEnv(reinterpret_cast<void**>(&env), cudf::jni::MINIMUM_JNI_VERSION) == JNI_OK) {
         env->CallVoidMethod(this->ds, onHostBufferDone_method, id);
       }
     }
   }
 
-private:
-  JavaVM *jvm;
+ private:
+  JavaVM* jvm;
   jobject ds;
   long id;
 };
 
 class jni_datasource : public cudf::io::datasource {
-public:
-  explicit jni_datasource(JNIEnv *env, jobject ds, size_t ds_size, bool device_read_supported,
-                          size_t device_read_cutoff)
-      : ds_size(ds_size), device_read_supported(device_read_supported),
-        device_read_cutoff(device_read_cutoff) {
-    if (env->GetJavaVM(&jvm) < 0) {
-      throw std::runtime_error("GetJavaVM failed");
-    }
+ public:
+  explicit jni_datasource(
+    JNIEnv* env, jobject ds, size_t ds_size, bool device_read_supported, size_t device_read_cutoff)
+    : ds_size(ds_size),
+      device_read_supported(device_read_supported),
+      device_read_cutoff(device_read_cutoff)
+  {
+    if (env->GetJavaVM(&jvm) < 0) { throw std::runtime_error("GetJavaVM failed"); }
     this->ds = add_global_ref(env, ds);
   }
 
-  virtual ~jni_datasource() {
-    JNIEnv *env = nullptr;
-    if (jvm->GetEnv(reinterpret_cast<void **>(&env), cudf::jni::MINIMUM_JNI_VERSION) == JNI_OK) {
+  virtual ~jni_datasource()
+  {
+    JNIEnv* env = nullptr;
+    if (jvm->GetEnv(reinterpret_cast<void**>(&env), cudf::jni::MINIMUM_JNI_VERSION) == JNI_OK) {
       ds = del_global_ref(env, ds);
     }
     ds = nullptr;
   }
 
-  std::unique_ptr<buffer> host_read(size_t offset, size_t size) override {
-    JNIEnv *env = nullptr;
-    if (jvm->GetEnv(reinterpret_cast<void **>(&env), cudf::jni::MINIMUM_JNI_VERSION) != JNI_OK) {
+  std::unique_ptr<buffer> host_read(size_t offset, size_t size) override
+  {
+    JNIEnv* env = nullptr;
+    if (jvm->GetEnv(reinterpret_cast<void**>(&env), cudf::jni::MINIMUM_JNI_VERSION) != JNI_OK) {
       throw cudf::jni::jni_exception("Could not load JNIEnv");
     }
 
     jlongArray jbuffer_info =
-        static_cast<jlongArray>(env->CallObjectMethod(this->ds, hostReadBuff_method, offset, size));
-    if (env->ExceptionOccurred()) {
-      throw cudf::jni::jni_exception("Java exception in hostRead");
-    }
+      static_cast<jlongArray>(env->CallObjectMethod(this->ds, hostReadBuff_method, offset, size));
+    if (env->ExceptionOccurred()) { throw cudf::jni::jni_exception("Java exception in hostRead"); }
 
     cudf::jni::native_jlongArray buffer_info(env, jbuffer_info);
-    auto ptr = reinterpret_cast<uint8_t *>(buffer_info[0]);
+    auto ptr      = reinterpret_cast<uint8_t*>(buffer_info[0]);
     size_t length = buffer_info[1];
-    long id = buffer_info[2];
+    long id       = buffer_info[2];
 
     cudf::jni::host_buffer_done_callback cb(this->jvm, this->ds, id);
-    return std::make_unique<owning_buffer<cudf::jni::host_buffer_done_callback>>(std::move(cb), ptr,
-                                                                                 length);
+    return std::make_unique<owning_buffer<cudf::jni::host_buffer_done_callback>>(
+      std::move(cb), ptr, length);
   }
 
-  size_t host_read(size_t offset, size_t size, uint8_t *dst) override {
-    JNIEnv *env = nullptr;
-    if (jvm->GetEnv(reinterpret_cast<void **>(&env), cudf::jni::MINIMUM_JNI_VERSION) != JNI_OK) {
+  size_t host_read(size_t offset, size_t size, uint8_t* dst) override
+  {
+    JNIEnv* env = nullptr;
+    if (jvm->GetEnv(reinterpret_cast<void**>(&env), cudf::jni::MINIMUM_JNI_VERSION) != JNI_OK) {
       throw cudf::jni::jni_exception("Could not load JNIEnv");
     }
 
     jlong amount_read =
-        env->CallLongMethod(this->ds, hostRead_method, offset, size, reinterpret_cast<jlong>(dst));
-    if (env->ExceptionOccurred()) {
-      throw cudf::jni::jni_exception("Java exception in hostRead");
-    }
+      env->CallLongMethod(this->ds, hostRead_method, offset, size, reinterpret_cast<jlong>(dst));
+    if (env->ExceptionOccurred()) { throw cudf::jni::jni_exception("Java exception in hostRead"); }
     return amount_read;
   }
 
@@ -166,28 +157,38 @@ class jni_datasource : public cudf::io::datasource {
 
   bool supports_device_read() const override { return device_read_supported; }
 
-  bool is_device_read_preferred(size_t size) const override {
+  bool is_device_read_preferred(size_t size) const override
+  {
     return device_read_supported && size >= device_read_cutoff;
   }
 
-  size_t device_read(size_t offset, size_t size, uint8_t *dst,
-                     rmm::cuda_stream_view stream) override {
-    JNIEnv *env = nullptr;
-    if (jvm->GetEnv(reinterpret_cast<void **>(&env), cudf::jni::MINIMUM_JNI_VERSION) != JNI_OK) {
+  size_t device_read(size_t offset,
+                     size_t size,
+                     uint8_t* dst,
+                     rmm::cuda_stream_view stream) override
+  {
+    JNIEnv* env = nullptr;
+    if (jvm->GetEnv(reinterpret_cast<void**>(&env), cudf::jni::MINIMUM_JNI_VERSION) != JNI_OK) {
       throw cudf::jni::jni_exception("Could not load JNIEnv");
     }
 
-    jlong amount_read =
-        env->CallLongMethod(this->ds, deviceRead_method, offset, size, reinterpret_cast<jlong>(dst),
-                            reinterpret_cast<jlong>(stream.value()));
+    jlong amount_read = env->CallLongMethod(this->ds,
+                                            deviceRead_method,
+                                            offset,
+                                            size,
+                                            reinterpret_cast<jlong>(dst),
+                                            reinterpret_cast<jlong>(stream.value()));
     if (env->ExceptionOccurred()) {
       throw cudf::jni::jni_exception("Java exception in deviceRead");
     }
     return amount_read;
   }
 
-  std::future<size_t> device_read_async(size_t offset, size_t size, uint8_t *dst,
-                                        rmm::cuda_stream_view stream) override {
+  std::future<size_t> device_read_async(size_t offset,
+                                        size_t size,
+                                        uint8_t* dst,
+                                        rmm::cuda_stream_view stream) override
+  {
     auto amount_read = device_read(offset, size, dst, stream);
     // This is a bit ugly, but we don't have a good way or a need to return
     // a future for the read
@@ -196,42 +197,48 @@ class jni_datasource : public cudf::io::datasource {
     return ret.get_future();
   }
 
-private:
+ private:
   size_t ds_size;
   bool device_read_supported;
   size_t device_read_cutoff;
-  JavaVM *jvm;
+  JavaVM* jvm;
   jobject ds;
 };
-} // namespace jni
-} // namespace cudf
+}  // namespace jni
+}  // namespace cudf
 
 extern "C" {
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_DataSourceHelper_createWrapperDataSource(
-    JNIEnv *env, jclass, jobject ds, jlong ds_size, jboolean device_read_supported,
-    jlong device_read_cutoff) {
+JNIEXPORT jlong JNICALL
+Java_ai_rapids_cudf_DataSourceHelper_createWrapperDataSource(JNIEnv* env,
+                                                             jclass,
+                                                             jobject ds,
+                                                             jlong ds_size,
+                                                             jboolean device_read_supported,
+                                                             jlong device_read_cutoff)
+{
   JNI_NULL_CHECK(env, ds, "Null data source", 0);
   try {
     cudf::jni::auto_set_device(env);
     auto source =
-        new cudf::jni::jni_datasource(env, ds, ds_size, device_read_supported, device_read_cutoff);
+      new cudf::jni::jni_datasource(env, ds, ds_size, device_read_supported, device_read_cutoff);
     return reinterpret_cast<jlong>(source);
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_DataSourceHelper_destroyWrapperDataSource(JNIEnv *env,
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_DataSourceHelper_destroyWrapperDataSource(JNIEnv* env,
                                                                                      jclass,
-                                                                                     jlong handle) {
+                                                                                     jlong handle)
+{
   try {
     cudf::jni::auto_set_device(env);
     if (handle != 0) {
-      auto source = reinterpret_cast<cudf::jni::jni_datasource *>(handle);
+      auto source = reinterpret_cast<cudf::jni::jni_datasource*>(handle);
       delete (source);
     }
   }
   CATCH_STD(env, );
 }
 
-} // extern "C"
+}  // extern "C"
diff --git a/java/src/main/native/src/HashJoinJni.cpp b/java/src/main/native/src/HashJoinJni.cpp
index 0f78aef64bc..d4aa08e9a2d 100644
--- a/java/src/main/native/src/HashJoinJni.cpp
+++ b/java/src/main/native/src/HashJoinJni.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,32 +14,36 @@
  * limitations under the License.
  */
 
-#include <cudf/join.hpp>
-
 #include "cudf_jni_apis.hpp"
 
+#include <cudf/join.hpp>
+
 extern "C" {
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_HashJoin_create(JNIEnv *env, jclass, jlong j_table,
-                                                            jboolean j_nulls_equal) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_HashJoin_create(JNIEnv* env,
+                                                            jclass,
+                                                            jlong j_table,
+                                                            jboolean j_nulls_equal)
+{
   JNI_NULL_CHECK(env, j_table, "table handle is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto tview = reinterpret_cast<cudf::table_view const *>(j_table);
-    auto nulleq = j_nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL;
+    auto tview         = reinterpret_cast<cudf::table_view const*>(j_table);
+    auto nulleq        = j_nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL;
     auto hash_join_ptr = new cudf::hash_join(*tview, nulleq);
     return reinterpret_cast<jlong>(hash_join_ptr);
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_HashJoin_destroy(JNIEnv *env, jclass, jlong j_handle) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_HashJoin_destroy(JNIEnv* env, jclass, jlong j_handle)
+{
   try {
     cudf::jni::auto_set_device(env);
-    auto hash_join_ptr = reinterpret_cast<cudf::hash_join *>(j_handle);
+    auto hash_join_ptr = reinterpret_cast<cudf::hash_join*>(j_handle);
     delete hash_join_ptr;
   }
   CATCH_STD(env, );
 }
 
-} // extern "C"
+}  // extern "C"
diff --git a/java/src/main/native/src/HostMemoryBufferNativeUtilsJni.cpp b/java/src/main/native/src/HostMemoryBufferNativeUtilsJni.cpp
index f9e05d27798..b175920ab4e 100644
--- a/java/src/main/native/src/HostMemoryBufferNativeUtilsJni.cpp
+++ b/java/src/main/native/src/HostMemoryBufferNativeUtilsJni.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,40 +14,39 @@
  * limitations under the License.
  */
 
+#include "jni_utils.hpp"
+
 #include <errno.h>
 #include <fcntl.h>
 #include <jni.h>
 #include <string.h>
-#include <unistd.h>
-
 #include <sys/mman.h>
 #include <sys/types.h>
-
-#include "jni_utils.hpp"
+#include <unistd.h>
 
 extern "C" {
 
 JNIEXPORT jobject JNICALL Java_ai_rapids_cudf_HostMemoryBufferNativeUtils_wrapRangeInBuffer(
-    JNIEnv *env, jclass, jlong addr, jlong len) {
-  return env->NewDirectByteBuffer(reinterpret_cast<void *>(addr), len);
+  JNIEnv* env, jclass, jlong addr, jlong len)
+{
+  return env->NewDirectByteBuffer(reinterpret_cast<void*>(addr), len);
 }
 
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_HostMemoryBufferNativeUtils_mmap(
-    JNIEnv *env, jclass, jstring jpath, jint mode, jlong offset, jlong length) {
+  JNIEnv* env, jclass, jstring jpath, jint mode, jlong offset, jlong length)
+{
   JNI_NULL_CHECK(env, jpath, "path is null", 0);
   JNI_ARG_CHECK(env, (mode == 0 || mode == 1), "bad mode value", 0);
   try {
     cudf::jni::native_jstring path(env, jpath);
 
     int fd = open(path.get(), (mode == 0) ? O_RDONLY : O_RDWR);
-    if (fd == -1) {
-      cudf::jni::throw_java_exception(env, "java/io/IOException", strerror(errno));
-    }
+    if (fd == -1) { cudf::jni::throw_java_exception(env, "java/io/IOException", strerror(errno)); }
 
-    void *address = mmap(NULL, length, (mode == 0) ? PROT_READ : PROT_READ | PROT_WRITE, MAP_SHARED,
-                         fd, offset);
+    void* address =
+      mmap(NULL, length, (mode == 0) ? PROT_READ : PROT_READ | PROT_WRITE, MAP_SHARED, fd, offset);
     if (address == MAP_FAILED) {
-      char const *error_msg = strerror(errno);
+      char const* error_msg = strerror(errno);
       close(fd);
       cudf::jni::throw_java_exception(env, "java/io/IOException", error_msg);
     }
@@ -58,17 +57,17 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_HostMemoryBufferNativeUtils_mmap(
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_HostMemoryBufferNativeUtils_munmap(JNIEnv *env, jclass,
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_HostMemoryBufferNativeUtils_munmap(JNIEnv* env,
+                                                                              jclass,
                                                                               jlong address,
-                                                                              jlong length) {
+                                                                              jlong length)
+{
   JNI_NULL_CHECK(env, address, "address is NULL", );
   try {
-    int rc = munmap(reinterpret_cast<void *>(address), length);
-    if (rc == -1) {
-      cudf::jni::throw_java_exception(env, "java/io/IOException", strerror(errno));
-    }
+    int rc = munmap(reinterpret_cast<void*>(address), length);
+    if (rc == -1) { cudf::jni::throw_java_exception(env, "java/io/IOException", strerror(errno)); }
   }
   CATCH_STD(env, );
 }
 
-} // extern "C"
+}  // extern "C"
diff --git a/java/src/main/native/src/NvcompJni.cpp b/java/src/main/native/src/NvcompJni.cpp
index e616b7f66be..47a24653549 100644
--- a/java/src/main/native/src/NvcompJni.cpp
+++ b/java/src/main/native/src/NvcompJni.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -13,22 +13,23 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include <nvcomp.h>
+#include "check_nvcomp_output_sizes.hpp"
+#include "cudf_jni_apis.hpp"
 
-#include <nvcomp/lz4.h>
 #include <rmm/device_uvector.hpp>
 
-#include "check_nvcomp_output_sizes.hpp"
-#include "cudf_jni_apis.hpp"
+#include <nvcomp.h>
+#include <nvcomp/lz4.h>
 
 namespace {
 
-constexpr char const *NVCOMP_ERROR_CLASS = "ai/rapids/cudf/nvcomp/NvcompException";
-constexpr char const *NVCOMP_CUDA_ERROR_CLASS = "ai/rapids/cudf/nvcomp/NvcompCudaException";
-constexpr char const *ILLEGAL_ARG_CLASS = "java/lang/IllegalArgumentException";
-constexpr char const *UNSUPPORTED_CLASS = "java/lang/UnsupportedOperationException";
+constexpr char const* NVCOMP_ERROR_CLASS      = "ai/rapids/cudf/nvcomp/NvcompException";
+constexpr char const* NVCOMP_CUDA_ERROR_CLASS = "ai/rapids/cudf/nvcomp/NvcompCudaException";
+constexpr char const* ILLEGAL_ARG_CLASS       = "java/lang/IllegalArgumentException";
+constexpr char const* UNSUPPORTED_CLASS       = "java/lang/UnsupportedOperationException";
 
-void check_nvcomp_status(JNIEnv *env, nvcompStatus_t status) {
+void check_nvcomp_status(JNIEnv* env, nvcompStatus_t status)
+{
   switch (status) {
     case nvcompSuccess: break;
     case nvcompErrorInvalidValue:
@@ -52,19 +53,20 @@ void check_nvcomp_status(JNIEnv *env, nvcompStatus_t status) {
   }
 }
 
-} // anonymous namespace
+}  // anonymous namespace
 
 extern "C" {
 
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4CompressGetTempSize(
-    JNIEnv *env, jclass, jlong j_batch_size, jlong j_max_chunk_size) {
+  JNIEnv* env, jclass, jlong j_batch_size, jlong j_max_chunk_size)
+{
   try {
     cudf::jni::auto_set_device(env);
-    auto batch_size = static_cast<std::size_t>(j_batch_size);
-    auto max_chunk_size = static_cast<std::size_t>(j_max_chunk_size);
+    auto batch_size       = static_cast<std::size_t>(j_batch_size);
+    auto max_chunk_size   = static_cast<std::size_t>(j_max_chunk_size);
     std::size_t temp_size = 0;
-    auto status = nvcompBatchedLZ4CompressGetTempSize(batch_size, max_chunk_size,
-                                                      nvcompBatchedLZ4DefaultOpts, &temp_size);
+    auto status           = nvcompBatchedLZ4CompressGetTempSize(
+      batch_size, max_chunk_size, nvcompBatchedLZ4DefaultOpts, &temp_size);
     check_nvcomp_status(env, status);
     return static_cast<jlong>(temp_size);
   }
@@ -72,49 +74,68 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4CompressG
 }
 
 JNIEXPORT jlong JNICALL
-Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4CompressGetMaxOutputChunkSize(
-    JNIEnv *env, jclass, jlong j_max_chunk_size) {
+Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4CompressGetMaxOutputChunkSize(JNIEnv* env,
+                                                                             jclass,
+                                                                             jlong j_max_chunk_size)
+{
   try {
     cudf::jni::auto_set_device(env);
-    auto max_chunk_size = static_cast<std::size_t>(j_max_chunk_size);
+    auto max_chunk_size         = static_cast<std::size_t>(j_max_chunk_size);
     std::size_t max_output_size = 0;
-    auto status = nvcompBatchedLZ4CompressGetMaxOutputChunkSize(
-        max_chunk_size, nvcompBatchedLZ4DefaultOpts, &max_output_size);
+    auto status                 = nvcompBatchedLZ4CompressGetMaxOutputChunkSize(
+      max_chunk_size, nvcompBatchedLZ4DefaultOpts, &max_output_size);
     check_nvcomp_status(env, status);
     return static_cast<jlong>(max_output_size);
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4CompressAsync(
-    JNIEnv *env, jclass, jlong j_in_ptrs, jlong j_in_sizes, jlong j_chunk_size, jlong j_batch_size,
-    jlong j_temp_ptr, jlong j_temp_size, jlong j_out_ptrs, jlong j_compressed_sizes_out_ptr,
-    jlong j_stream) {
+JNIEXPORT void JNICALL
+Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4CompressAsync(JNIEnv* env,
+                                                             jclass,
+                                                             jlong j_in_ptrs,
+                                                             jlong j_in_sizes,
+                                                             jlong j_chunk_size,
+                                                             jlong j_batch_size,
+                                                             jlong j_temp_ptr,
+                                                             jlong j_temp_size,
+                                                             jlong j_out_ptrs,
+                                                             jlong j_compressed_sizes_out_ptr,
+                                                             jlong j_stream)
+{
   try {
     cudf::jni::auto_set_device(env);
-    auto in_ptrs = reinterpret_cast<void const *const *>(j_in_ptrs);
-    auto in_sizes = reinterpret_cast<std::size_t const *>(j_in_sizes);
-    auto chunk_size = static_cast<std::size_t>(j_chunk_size);
-    auto batch_size = static_cast<std::size_t>(j_batch_size);
-    auto temp_ptr = reinterpret_cast<void *>(j_temp_ptr);
-    auto temp_size = static_cast<std::size_t>(j_temp_size);
-    auto out_ptrs = reinterpret_cast<void *const *>(j_out_ptrs);
-    auto compressed_out_sizes = reinterpret_cast<std::size_t *>(j_compressed_sizes_out_ptr);
-    auto stream = reinterpret_cast<cudaStream_t>(j_stream);
-    auto status = nvcompBatchedLZ4CompressAsync(in_ptrs, in_sizes, chunk_size, batch_size, temp_ptr,
-                                                temp_size, out_ptrs, compressed_out_sizes,
-                                                nvcompBatchedLZ4DefaultOpts, stream);
+    auto in_ptrs              = reinterpret_cast<void const* const*>(j_in_ptrs);
+    auto in_sizes             = reinterpret_cast<std::size_t const*>(j_in_sizes);
+    auto chunk_size           = static_cast<std::size_t>(j_chunk_size);
+    auto batch_size           = static_cast<std::size_t>(j_batch_size);
+    auto temp_ptr             = reinterpret_cast<void*>(j_temp_ptr);
+    auto temp_size            = static_cast<std::size_t>(j_temp_size);
+    auto out_ptrs             = reinterpret_cast<void* const*>(j_out_ptrs);
+    auto compressed_out_sizes = reinterpret_cast<std::size_t*>(j_compressed_sizes_out_ptr);
+    auto stream               = reinterpret_cast<cudaStream_t>(j_stream);
+    auto status               = nvcompBatchedLZ4CompressAsync(in_ptrs,
+                                                in_sizes,
+                                                chunk_size,
+                                                batch_size,
+                                                temp_ptr,
+                                                temp_size,
+                                                out_ptrs,
+                                                compressed_out_sizes,
+                                                nvcompBatchedLZ4DefaultOpts,
+                                                stream);
     check_nvcomp_status(env, status);
   }
   CATCH_STD(env, );
 }
 
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4DecompressGetTempSize(
-    JNIEnv *env, jclass, jlong j_batch_size, jlong j_chunk_size) {
+  JNIEnv* env, jclass, jlong j_batch_size, jlong j_chunk_size)
+{
   try {
     cudf::jni::auto_set_device(env);
-    auto batch_size = static_cast<std::size_t>(j_batch_size);
-    auto chunk_size = static_cast<std::size_t>(j_chunk_size);
+    auto batch_size       = static_cast<std::size_t>(j_batch_size);
+    auto chunk_size       = static_cast<std::size_t>(j_chunk_size);
     std::size_t temp_size = 0;
     auto status = nvcompBatchedLZ4DecompressGetTempSize(batch_size, chunk_size, &temp_size);
     check_nvcomp_status(env, status);
@@ -123,49 +144,71 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4Decompres
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4DecompressAsync(
-    JNIEnv *env, jclass, jlong j_in_ptrs, jlong j_in_sizes, jlong j_out_sizes, jlong j_batch_size,
-    jlong j_temp_ptr, jlong j_temp_size, jlong j_out_ptrs, jlong j_stream) {
+JNIEXPORT void JNICALL
+Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4DecompressAsync(JNIEnv* env,
+                                                               jclass,
+                                                               jlong j_in_ptrs,
+                                                               jlong j_in_sizes,
+                                                               jlong j_out_sizes,
+                                                               jlong j_batch_size,
+                                                               jlong j_temp_ptr,
+                                                               jlong j_temp_size,
+                                                               jlong j_out_ptrs,
+                                                               jlong j_stream)
+{
   try {
     cudf::jni::auto_set_device(env);
-    auto compressed_ptrs = reinterpret_cast<void const *const *>(j_in_ptrs);
-    auto compressed_sizes = reinterpret_cast<std::size_t const *>(j_in_sizes);
-    auto uncompressed_sizes = reinterpret_cast<std::size_t const *>(j_out_sizes);
-    auto batch_size = static_cast<std::size_t>(j_batch_size);
-    auto temp_ptr = reinterpret_cast<void *>(j_temp_ptr);
-    auto temp_size = static_cast<std::size_t>(j_temp_size);
-    auto uncompressed_ptrs = reinterpret_cast<void *const *>(j_out_ptrs);
-    auto stream = reinterpret_cast<cudaStream_t>(j_stream);
-    auto uncompressed_statuses = rmm::device_uvector<nvcompStatus_t>(batch_size, stream);
+    auto compressed_ptrs           = reinterpret_cast<void const* const*>(j_in_ptrs);
+    auto compressed_sizes          = reinterpret_cast<std::size_t const*>(j_in_sizes);
+    auto uncompressed_sizes        = reinterpret_cast<std::size_t const*>(j_out_sizes);
+    auto batch_size                = static_cast<std::size_t>(j_batch_size);
+    auto temp_ptr                  = reinterpret_cast<void*>(j_temp_ptr);
+    auto temp_size                 = static_cast<std::size_t>(j_temp_size);
+    auto uncompressed_ptrs         = reinterpret_cast<void* const*>(j_out_ptrs);
+    auto stream                    = reinterpret_cast<cudaStream_t>(j_stream);
+    auto uncompressed_statuses     = rmm::device_uvector<nvcompStatus_t>(batch_size, stream);
     auto actual_uncompressed_sizes = rmm::device_uvector<std::size_t>(batch_size, stream);
-    auto status = nvcompBatchedLZ4DecompressAsync(
-        compressed_ptrs, compressed_sizes, uncompressed_sizes, actual_uncompressed_sizes.data(),
-        batch_size, temp_ptr, temp_size, uncompressed_ptrs, uncompressed_statuses.data(), stream);
+    auto status                    = nvcompBatchedLZ4DecompressAsync(compressed_ptrs,
+                                                  compressed_sizes,
+                                                  uncompressed_sizes,
+                                                  actual_uncompressed_sizes.data(),
+                                                  batch_size,
+                                                  temp_ptr,
+                                                  temp_size,
+                                                  uncompressed_ptrs,
+                                                  uncompressed_statuses.data(),
+                                                  stream);
     check_nvcomp_status(env, status);
-    if (!cudf::java::check_nvcomp_output_sizes(uncompressed_sizes, actual_uncompressed_sizes.data(),
-                                               batch_size, stream)) {
-      cudf::jni::throw_java_exception(env, NVCOMP_ERROR_CLASS,
-                                      "nvcomp decompress output size mismatch");
+    if (!cudf::java::check_nvcomp_output_sizes(
+          uncompressed_sizes, actual_uncompressed_sizes.data(), batch_size, stream)) {
+      cudf::jni::throw_java_exception(
+        env, NVCOMP_ERROR_CLASS, "nvcomp decompress output size mismatch");
     }
   }
   CATCH_STD(env, );
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4GetDecompressSizeAsync(
-    JNIEnv *env, jclass, jlong j_in_ptrs, jlong j_in_sizes, jlong j_out_sizes, jlong j_batch_size,
-    jlong j_stream) {
+JNIEXPORT void JNICALL
+Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4GetDecompressSizeAsync(JNIEnv* env,
+                                                                      jclass,
+                                                                      jlong j_in_ptrs,
+                                                                      jlong j_in_sizes,
+                                                                      jlong j_out_sizes,
+                                                                      jlong j_batch_size,
+                                                                      jlong j_stream)
+{
   try {
     cudf::jni::auto_set_device(env);
-    auto compressed_ptrs = reinterpret_cast<void const *const *>(j_in_ptrs);
-    auto compressed_sizes = reinterpret_cast<std::size_t const *>(j_in_sizes);
-    auto uncompressed_sizes = reinterpret_cast<std::size_t *>(j_out_sizes);
-    auto batch_size = static_cast<std::size_t>(j_batch_size);
-    auto stream = reinterpret_cast<cudaStream_t>(j_stream);
-    auto status = nvcompBatchedLZ4GetDecompressSizeAsync(compressed_ptrs, compressed_sizes,
-                                                         uncompressed_sizes, batch_size, stream);
+    auto compressed_ptrs    = reinterpret_cast<void const* const*>(j_in_ptrs);
+    auto compressed_sizes   = reinterpret_cast<std::size_t const*>(j_in_sizes);
+    auto uncompressed_sizes = reinterpret_cast<std::size_t*>(j_out_sizes);
+    auto batch_size         = static_cast<std::size_t>(j_batch_size);
+    auto stream             = reinterpret_cast<cudaStream_t>(j_stream);
+    auto status             = nvcompBatchedLZ4GetDecompressSizeAsync(
+      compressed_ptrs, compressed_sizes, uncompressed_sizes, batch_size, stream);
     check_nvcomp_status(env, status);
   }
   CATCH_STD(env, );
 }
 
-} // extern "C"
+}  // extern "C"
diff --git a/java/src/main/native/src/NvtxRangeJni.cpp b/java/src/main/native/src/NvtxRangeJni.cpp
index 2529acfb91d..4ba6be31b87 100644
--- a/java/src/main/native/src/NvtxRangeJni.cpp
+++ b/java/src/main/native/src/NvtxRangeJni.cpp
@@ -14,15 +14,18 @@
  * limitations under the License.
  */
 
-#include <nvtx3/nvtx3.hpp>
-
 #include "jni_utils.hpp"
 #include "nvtx_common.hpp"
 
+#include <nvtx3/nvtx3.hpp>
+
 extern "C" {
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_NvtxRange_push(JNIEnv *env, jclass clazz, jstring name,
-                                                          jint color_bits) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_NvtxRange_push(JNIEnv* env,
+                                                          jclass clazz,
+                                                          jstring name,
+                                                          jint color_bits)
+{
   try {
     cudf::jni::native_jstring range_name(env, name);
     nvtx3::color range_color(static_cast<nvtx3::color::value_type>(color_bits));
@@ -32,11 +35,12 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_NvtxRange_push(JNIEnv *env, jclass cl
   CATCH_STD(env, );
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_NvtxRange_pop(JNIEnv *env, jclass clazz) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_NvtxRange_pop(JNIEnv* env, jclass clazz)
+{
   try {
     nvtxDomainRangePop(nvtx3::domain::get<cudf::jni::java_domain>());
   }
   CATCH_STD(env, );
 }
 
-} // extern "C"
+}  // extern "C"
diff --git a/java/src/main/native/src/NvtxUniqueRangeJni.cpp b/java/src/main/native/src/NvtxUniqueRangeJni.cpp
index 924b5a564e6..2ff96f96497 100644
--- a/java/src/main/native/src/NvtxUniqueRangeJni.cpp
+++ b/java/src/main/native/src/NvtxUniqueRangeJni.cpp
@@ -14,28 +14,33 @@
  * limitations under the License.
  */
 
-#include <nvtx3/nvtx3.hpp>
-
 #include "jni_utils.hpp"
 #include "nvtx_common.hpp"
 
+#include <nvtx3/nvtx3.hpp>
+
 extern "C" {
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_NvtxUniqueRange_start(JNIEnv *env, jclass clazz,
-                                                                  jstring name, jint color_bits) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_NvtxUniqueRange_start(JNIEnv* env,
+                                                                  jclass clazz,
+                                                                  jstring name,
+                                                                  jint color_bits)
+{
   try {
     cudf::jni::native_jstring range_name(env, name);
     nvtx3::color range_color(static_cast<nvtx3::color::value_type>(color_bits));
     nvtx3::event_attributes attr{range_color, range_name.get()};
     auto nvtxRangeId =
-        nvtxDomainRangeStartEx(nvtx3::domain::get<cudf::jni::java_domain>(), attr.get());
+      nvtxDomainRangeStartEx(nvtx3::domain::get<cudf::jni::java_domain>(), attr.get());
     return static_cast<jlong>(nvtxRangeId);
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_NvtxUniqueRange_end(JNIEnv *env, jclass clazz,
-                                                               jlong nvtxRangeId) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_NvtxUniqueRange_end(JNIEnv* env,
+                                                               jclass clazz,
+                                                               jlong nvtxRangeId)
+{
   try {
     nvtxDomainRangeEnd(nvtx3::domain::get<cudf::jni::java_domain>(),
                        static_cast<nvtxRangeId_t>(nvtxRangeId));
@@ -43,4 +48,4 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_NvtxUniqueRange_end(JNIEnv *env, jcla
   CATCH_STD(env, );
 }
 
-} // extern "C"
+}  // extern "C"
diff --git a/java/src/main/native/src/PackedColumnMetadataJni.cpp b/java/src/main/native/src/PackedColumnMetadataJni.cpp
index 7ec3e1294ce..c7c95558e71 100644
--- a/java/src/main/native/src/PackedColumnMetadataJni.cpp
+++ b/java/src/main/native/src/PackedColumnMetadataJni.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,23 +19,26 @@
 extern "C" {
 
 JNIEXPORT jobject JNICALL Java_ai_rapids_cudf_PackedColumnMetadata_createMetadataDirectBuffer(
-    JNIEnv *env, jclass, jlong j_metadata_ptr) {
+  JNIEnv* env, jclass, jlong j_metadata_ptr)
+{
   JNI_NULL_CHECK(env, j_metadata_ptr, "metadata is null", nullptr);
   try {
-    auto metadata = reinterpret_cast<std::vector<uint8_t> *>(j_metadata_ptr);
-    return env->NewDirectByteBuffer(const_cast<uint8_t *>(metadata->data()), metadata->size());
+    auto metadata = reinterpret_cast<std::vector<uint8_t>*>(j_metadata_ptr);
+    return env->NewDirectByteBuffer(const_cast<uint8_t*>(metadata->data()), metadata->size());
   }
   CATCH_STD(env, nullptr);
 }
 
-JNIEXPORT void JNICALL
-Java_ai_rapids_cudf_PackedColumnMetadata_closeMetadata(JNIEnv *env, jclass, jlong j_metadata_ptr) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_PackedColumnMetadata_closeMetadata(JNIEnv* env,
+                                                                              jclass,
+                                                                              jlong j_metadata_ptr)
+{
   JNI_NULL_CHECK(env, j_metadata_ptr, "metadata is null", );
   try {
-    auto metadata = reinterpret_cast<std::vector<uint8_t> *>(j_metadata_ptr);
+    auto metadata = reinterpret_cast<std::vector<uint8_t>*>(j_metadata_ptr);
     delete metadata;
   }
   CATCH_STD(env, );
 }
 
-} // extern "C"
+}  // extern "C"
diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp
index 8d7ac8890cc..68453c924d6 100644
--- a/java/src/main/native/src/RmmJni.cpp
+++ b/java/src/main/native/src/RmmJni.cpp
@@ -14,14 +14,10 @@
  * limitations under the License.
  */
 
-#include <atomic>
-#include <ctime>
-#include <fstream>
-#include <iostream>
-#include <limits>
-#include <mutex>
+#include "cudf_jni_apis.hpp"
 
 #include <cudf/io/memory_resource.hpp>
+
 #include <rmm/aligned.hpp>
 #include <rmm/mr/device/aligned_resource_adaptor.hpp>
 #include <rmm/mr/device/arena_memory_resource.hpp>
@@ -36,7 +32,12 @@
 #include <rmm/mr/pinned_host_memory_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-#include "cudf_jni_apis.hpp"
+#include <atomic>
+#include <ctime>
+#include <fstream>
+#include <iostream>
+#include <limits>
+#include <mutex>
 
 using rmm::mr::device_memory_resource;
 using rmm::mr::logging_resource_adaptor;
@@ -44,14 +45,14 @@ using rmm_pinned_pool_t = rmm::mr::pool_memory_resource<rmm::mr::pinned_host_mem
 
 namespace {
 
-constexpr char const *RMM_EXCEPTION_CLASS = "ai/rapids/cudf/RmmException";
+constexpr char const* RMM_EXCEPTION_CLASS = "ai/rapids/cudf/RmmException";
 
 /**
  * @brief Base class so we can template tracking_resource_adaptor but
  * still hold all instances of it without issues.
  */
 class base_tracking_resource_adaptor : public device_memory_resource {
-public:
+ public:
   virtual std::size_t get_total_allocated() = 0;
 
   virtual std::size_t get_max_total_allocated() = 0;
@@ -71,7 +72,7 @@ class base_tracking_resource_adaptor : public device_memory_resource {
  */
 template <typename Upstream>
 class tracking_resource_adaptor final : public base_tracking_resource_adaptor {
-public:
+ public:
   /**
    * @brief Constructs a new tracking resource adaptor that delegates to
    * `mr` for all allocation operations while tracking the amount of memory
@@ -81,28 +82,32 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor {
    * @param size_alignment The alignment to which the `mr` resource will
    * round up all memory allocation size requests.
    */
-  tracking_resource_adaptor(Upstream *mr, std::size_t size_alignment)
-      : resource{mr}, size_align{size_alignment} {}
+  tracking_resource_adaptor(Upstream* mr, std::size_t size_alignment)
+    : resource{mr}, size_align{size_alignment}
+  {
+  }
 
-  Upstream *get_wrapped_resource() { return resource; }
+  Upstream* get_wrapped_resource() { return resource; }
 
   std::size_t get_total_allocated() override { return total_allocated.load(); }
 
   std::size_t get_max_total_allocated() override { return max_total_allocated; }
 
-  void reset_scoped_max_total_allocated(std::size_t initial_value) override {
+  void reset_scoped_max_total_allocated(std::size_t initial_value) override
+  {
     std::scoped_lock lock(max_total_allocated_mutex);
-    scoped_allocated = initial_value;
+    scoped_allocated           = initial_value;
     scoped_max_total_allocated = initial_value;
   }
 
-  std::size_t get_scoped_max_total_allocated() override {
+  std::size_t get_scoped_max_total_allocated() override
+  {
     std::scoped_lock lock(max_total_allocated_mutex);
     return scoped_max_total_allocated;
   }
 
-private:
-  Upstream *const resource;
+ private:
+  Upstream* const resource;
   std::size_t const size_align;
   // sum of what is currently allocated
   std::atomic_size_t total_allocated{0};
@@ -120,7 +125,8 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor {
 
   std::mutex max_total_allocated_mutex;
 
-  void *do_allocate(std::size_t num_bytes, rmm::cuda_stream_view stream) override {
+  void* do_allocate(std::size_t num_bytes, rmm::cuda_stream_view stream) override
+  {
     // adjust size of allocation based on specified size alignment
     num_bytes = (num_bytes + size_align - 1) / size_align * size_align;
 
@@ -129,13 +135,14 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor {
       total_allocated += num_bytes;
       scoped_allocated += num_bytes;
       std::scoped_lock lock(max_total_allocated_mutex);
-      max_total_allocated = std::max(total_allocated.load(), max_total_allocated);
+      max_total_allocated        = std::max(total_allocated.load(), max_total_allocated);
       scoped_max_total_allocated = std::max(scoped_allocated.load(), scoped_max_total_allocated);
     }
     return result;
   }
 
-  void do_deallocate(void *p, std::size_t size, rmm::cuda_stream_view stream) override {
+  void do_deallocate(void* p, std::size_t size, rmm::cuda_stream_view stream) override
+  {
     size = (size + size_align - 1) / size_align * size_align;
 
     resource->deallocate(p, size, stream);
@@ -148,8 +155,9 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor {
 };
 
 template <typename Upstream>
-tracking_resource_adaptor<Upstream> *make_tracking_adaptor(Upstream *upstream,
-                                                           std::size_t size_alignment) {
+tracking_resource_adaptor<Upstream>* make_tracking_adaptor(Upstream* upstream,
+                                                           std::size_t size_alignment)
+{
   return new tracking_resource_adaptor<Upstream>{upstream, size_alignment};
 }
 
@@ -158,24 +166,23 @@ tracking_resource_adaptor<Upstream> *make_tracking_adaptor(Upstream *upstream,
  * for most operations but will call Java to handle certain situations (e.g.: allocation failure).
  */
 class java_event_handler_memory_resource : public device_memory_resource {
-public:
-  java_event_handler_memory_resource(JNIEnv *env, jobject jhandler, jlongArray jalloc_thresholds,
+ public:
+  java_event_handler_memory_resource(JNIEnv* env,
+                                     jobject jhandler,
+                                     jlongArray jalloc_thresholds,
                                      jlongArray jdealloc_thresholds,
-                                     device_memory_resource *resource_to_wrap,
-                                     base_tracking_resource_adaptor *tracker)
-      : resource(resource_to_wrap), tracker(tracker) {
-    if (env->GetJavaVM(&jvm) < 0) {
-      throw std::runtime_error("GetJavaVM failed");
-    }
+                                     device_memory_resource* resource_to_wrap,
+                                     base_tracking_resource_adaptor* tracker)
+    : resource(resource_to_wrap), tracker(tracker)
+  {
+    if (env->GetJavaVM(&jvm) < 0) { throw std::runtime_error("GetJavaVM failed"); }
 
     jclass cls = env->GetObjectClass(jhandler);
-    if (cls == nullptr) {
-      throw cudf::jni::jni_exception("class not found");
-    }
+    if (cls == nullptr) { throw cudf::jni::jni_exception("class not found"); }
     on_alloc_fail_method = env->GetMethodID(cls, "onAllocFailure", "(JI)Z");
     if (on_alloc_fail_method == nullptr) {
       use_old_alloc_fail_interface = true;
-      on_alloc_fail_method = env->GetMethodID(cls, "onAllocFailure", "(J)Z");
+      on_alloc_fail_method         = env->GetMethodID(cls, "onAllocFailure", "(J)Z");
       if (on_alloc_fail_method == nullptr) {
         throw cudf::jni::jni_exception("onAllocFailure method");
       }
@@ -197,22 +204,23 @@ class java_event_handler_memory_resource : public device_memory_resource {
     handler_obj = cudf::jni::add_global_ref(env, jhandler);
   }
 
-  virtual ~java_event_handler_memory_resource() {
+  virtual ~java_event_handler_memory_resource()
+  {
     // This should normally be called by a JVM thread. If the JVM environment is missing then this
     // is likely being triggered by the C++ runtime during shutdown. In that case the JVM may
     // already be destroyed and this thread should not try to attach to get an environment.
-    JNIEnv *env = nullptr;
-    if (jvm->GetEnv(reinterpret_cast<void **>(&env), cudf::jni::MINIMUM_JNI_VERSION) == JNI_OK) {
+    JNIEnv* env = nullptr;
+    if (jvm->GetEnv(reinterpret_cast<void**>(&env), cudf::jni::MINIMUM_JNI_VERSION) == JNI_OK) {
       handler_obj = cudf::jni::del_global_ref(env, handler_obj);
     }
     handler_obj = nullptr;
   }
 
-  device_memory_resource *get_wrapped_resource() { return resource; }
+  device_memory_resource* get_wrapped_resource() { return resource; }
 
-private:
-  device_memory_resource *const resource;
-  base_tracking_resource_adaptor *const tracker;
+ private:
+  device_memory_resource* const resource;
+  base_tracking_resource_adaptor* const tracker;
   jmethodID on_alloc_fail_method;
   bool use_old_alloc_fail_interface;
   jmethodID on_alloc_threshold_method;
@@ -222,8 +230,10 @@ class java_event_handler_memory_resource : public device_memory_resource {
   std::vector<std::size_t> alloc_thresholds{};
   std::vector<std::size_t> dealloc_thresholds{};
 
-  static void update_thresholds(JNIEnv *env, std::vector<std::size_t> &thresholds,
-                                jlongArray from_java) {
+  static void update_thresholds(JNIEnv* env,
+                                std::vector<std::size_t>& thresholds,
+                                jlongArray from_java)
+  {
     thresholds.clear();
     if (from_java != nullptr) {
       cudf::jni::native_jlongArray jvalues(env, from_java);
@@ -234,17 +244,19 @@ class java_event_handler_memory_resource : public device_memory_resource {
     }
   }
 
-  bool on_alloc_fail(std::size_t num_bytes, int retry_count) {
-    JNIEnv *env = cudf::jni::get_jni_env(jvm);
+  bool on_alloc_fail(std::size_t num_bytes, int retry_count)
+  {
+    JNIEnv* env     = cudf::jni::get_jni_env(jvm);
     jboolean result = false;
     if (!use_old_alloc_fail_interface) {
-      result =
-          env->CallBooleanMethod(handler_obj, on_alloc_fail_method, static_cast<jlong>(num_bytes),
-                                 static_cast<jint>(retry_count));
+      result = env->CallBooleanMethod(handler_obj,
+                                      on_alloc_fail_method,
+                                      static_cast<jlong>(num_bytes),
+                                      static_cast<jint>(retry_count));
 
     } else {
       result =
-          env->CallBooleanMethod(handler_obj, on_alloc_fail_method, static_cast<jlong>(num_bytes));
+        env->CallBooleanMethod(handler_obj, on_alloc_fail_method, static_cast<jlong>(num_bytes));
     }
     if (env->ExceptionCheck()) {
       throw std::runtime_error("onAllocFailure handler threw an exception");
@@ -252,16 +264,20 @@ class java_event_handler_memory_resource : public device_memory_resource {
     return result;
   }
 
-  void check_for_threshold_callback(std::size_t low, std::size_t high,
-                                    std::vector<std::size_t> const &thresholds,
-                                    jmethodID callback_method, char const *callback_name,
-                                    std::size_t current_total) {
+  void check_for_threshold_callback(std::size_t low,
+                                    std::size_t high,
+                                    std::vector<std::size_t> const& thresholds,
+                                    jmethodID callback_method,
+                                    char const* callback_name,
+                                    std::size_t current_total)
+  {
     if (high >= thresholds.front() && low < thresholds.back()) {
       // could use binary search, but assumption is threshold count is very small
-      auto it = std::find_if(thresholds.begin(), thresholds.end(),
-                             [=](std::size_t t) -> bool { return low < t && high >= t; });
+      auto it = std::find_if(thresholds.begin(), thresholds.end(), [=](std::size_t t) -> bool {
+        return low < t && high >= t;
+      });
       if (it != thresholds.end()) {
-        JNIEnv *env = cudf::jni::get_jni_env(jvm);
+        JNIEnv* env = cudf::jni::get_jni_env(jvm);
         env->CallVoidMethod(handler_obj, callback_method, current_total);
         if (env->ExceptionCheck()) {
           throw std::runtime_error("onAllocThreshold handler threw an exception");
@@ -270,13 +286,14 @@ class java_event_handler_memory_resource : public device_memory_resource {
     }
   }
 
-protected:
-  JavaVM *jvm;
+ protected:
+  JavaVM* jvm;
   jobject handler_obj;
 
-  void *do_allocate(std::size_t num_bytes, rmm::cuda_stream_view stream) override {
+  void* do_allocate(std::size_t num_bytes, rmm::cuda_stream_view stream) override
+  {
     std::size_t total_before;
-    void *result;
+    void* result;
     // a non-zero retry_count signifies that the `on_alloc_fail`
     // callback is being invoked while re-attempting an allocation
     // that had previously failed.
@@ -284,20 +301,22 @@ class java_event_handler_memory_resource : public device_memory_resource {
     while (true) {
       try {
         total_before = tracker->get_total_allocated();
-        result = resource->allocate(num_bytes, stream);
+        result       = resource->allocate(num_bytes, stream);
         break;
-      } catch (rmm::out_of_memory const &e) {
-        if (!on_alloc_fail(num_bytes, retry_count++)) {
-          throw;
-        }
+      } catch (rmm::out_of_memory const& e) {
+        if (!on_alloc_fail(num_bytes, retry_count++)) { throw; }
       }
     }
     auto total_after = tracker->get_total_allocated();
 
     try {
-      check_for_threshold_callback(total_before, total_after, alloc_thresholds,
-                                   on_alloc_threshold_method, "onAllocThreshold", total_after);
-    } catch (std::exception const &e) {
+      check_for_threshold_callback(total_before,
+                                   total_after,
+                                   alloc_thresholds,
+                                   on_alloc_threshold_method,
+                                   "onAllocThreshold",
+                                   total_after);
+    } catch (std::exception const& e) {
       // Free the allocation as app will think the exception means the memory was not allocated.
       resource->deallocate(result, num_bytes, stream);
       throw;
@@ -306,33 +325,36 @@ class java_event_handler_memory_resource : public device_memory_resource {
     return result;
   }
 
-  void do_deallocate(void *p, std::size_t size, rmm::cuda_stream_view stream) override {
+  void do_deallocate(void* p, std::size_t size, rmm::cuda_stream_view stream) override
+  {
     auto total_before = tracker->get_total_allocated();
     resource->deallocate(p, size, stream);
     auto total_after = tracker->get_total_allocated();
-    check_for_threshold_callback(total_after, total_before, dealloc_thresholds,
-                                 on_dealloc_threshold_method, "onDeallocThreshold", total_after);
+    check_for_threshold_callback(total_after,
+                                 total_before,
+                                 dealloc_thresholds,
+                                 on_dealloc_threshold_method,
+                                 "onDeallocThreshold",
+                                 total_after);
   }
 };
 
 class java_debug_event_handler_memory_resource final : public java_event_handler_memory_resource {
-public:
-  java_debug_event_handler_memory_resource(JNIEnv *env, jobject jhandler,
+ public:
+  java_debug_event_handler_memory_resource(JNIEnv* env,
+                                           jobject jhandler,
                                            jlongArray jalloc_thresholds,
                                            jlongArray jdealloc_thresholds,
-                                           device_memory_resource *resource_to_wrap,
-                                           base_tracking_resource_adaptor *tracker)
-      : java_event_handler_memory_resource(env, jhandler, jalloc_thresholds, jdealloc_thresholds,
-                                           resource_to_wrap, tracker) {
+                                           device_memory_resource* resource_to_wrap,
+                                           base_tracking_resource_adaptor* tracker)
+    : java_event_handler_memory_resource(
+        env, jhandler, jalloc_thresholds, jdealloc_thresholds, resource_to_wrap, tracker)
+  {
     jclass cls = env->GetObjectClass(jhandler);
-    if (cls == nullptr) {
-      throw cudf::jni::jni_exception("class not found");
-    }
+    if (cls == nullptr) { throw cudf::jni::jni_exception("class not found"); }
 
     on_allocated_method = env->GetMethodID(cls, "onAllocated", "(J)V");
-    if (on_allocated_method == nullptr) {
-      throw cudf::jni::jni_exception("onAllocated method");
-    }
+    if (on_allocated_method == nullptr) { throw cudf::jni::jni_exception("onAllocated method"); }
 
     on_deallocated_method = env->GetMethodID(cls, "onDeallocated", "(J)V");
     if (on_deallocated_method == nullptr) {
@@ -340,36 +362,41 @@ class java_debug_event_handler_memory_resource final : public java_event_handler
     }
   }
 
-private:
+ private:
   jmethodID on_allocated_method;
   jmethodID on_deallocated_method;
 
-  void on_allocated_callback(std::size_t num_bytes, rmm::cuda_stream_view stream) {
-    JNIEnv *env = cudf::jni::get_jni_env(jvm);
+  void on_allocated_callback(std::size_t num_bytes, rmm::cuda_stream_view stream)
+  {
+    JNIEnv* env = cudf::jni::get_jni_env(jvm);
     env->CallVoidMethod(handler_obj, on_allocated_method, num_bytes);
     if (env->ExceptionCheck()) {
       throw std::runtime_error("onAllocated handler threw an exception");
     }
   }
 
-  void on_deallocated_callback(void *p, std::size_t size, rmm::cuda_stream_view stream) {
-    JNIEnv *env = cudf::jni::get_jni_env(jvm);
+  void on_deallocated_callback(void* p, std::size_t size, rmm::cuda_stream_view stream)
+  {
+    JNIEnv* env = cudf::jni::get_jni_env(jvm);
     env->CallVoidMethod(handler_obj, on_deallocated_method, size);
   }
 
-  void *do_allocate(std::size_t num_bytes, rmm::cuda_stream_view stream) override {
-    void *result = java_event_handler_memory_resource::do_allocate(num_bytes, stream);
+  void* do_allocate(std::size_t num_bytes, rmm::cuda_stream_view stream) override
+  {
+    void* result = java_event_handler_memory_resource::do_allocate(num_bytes, stream);
     on_allocated_callback(num_bytes, stream);
     return result;
   }
 
-  void do_deallocate(void *p, std::size_t size, rmm::cuda_stream_view stream) override {
+  void do_deallocate(void* p, std::size_t size, rmm::cuda_stream_view stream) override
+  {
     java_event_handler_memory_resource::do_deallocate(p, size, stream);
     on_deallocated_callback(p, size, stream);
   }
 };
 
-inline auto &prior_cuio_host_mr() {
+inline auto& prior_cuio_host_mr()
+{
   static rmm::host_async_resource_ref _prior_cuio_host_mr = cudf::io::get_host_memory_resource();
   return _prior_cuio_host_mr;
 }
@@ -384,18 +411,19 @@ inline auto &prior_cuio_host_mr() {
  * Most of this comes directly from `pinned_host_memory_resource` in RMM.
  */
 class pinned_fallback_host_memory_resource {
-private:
-  rmm_pinned_pool_t *_pool;
-  void *pool_begin_;
-  void *pool_end_;
-
-public:
-  pinned_fallback_host_memory_resource(rmm_pinned_pool_t *pool) : _pool(pool) {
+ private:
+  rmm_pinned_pool_t* _pool;
+  void* pool_begin_;
+  void* pool_end_;
+
+ public:
+  pinned_fallback_host_memory_resource(rmm_pinned_pool_t* pool) : _pool(pool)
+  {
     // allocate from the pinned pool the full size to figure out
     // our beginning and end address.
     auto pool_size = pool->pool_size();
-    pool_begin_ = pool->allocate(pool_size);
-    pool_end_ = static_cast<void *>(static_cast<uint8_t *>(pool_begin_) + pool_size);
+    pool_begin_    = pool->allocate(pool_size);
+    pool_end_      = static_cast<void*>(static_cast<uint8_t*>(pool_begin_) + pool_size);
     pool->deallocate(pool_begin_, pool_size);
   }
 
@@ -415,11 +443,12 @@ class pinned_fallback_host_memory_resource {
    *
    * @return Pointer to the newly allocated memory.
    */
-  void *allocate(std::size_t bytes,
-                 [[maybe_unused]] std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) {
+  void* allocate(std::size_t bytes,
+                 [[maybe_unused]] std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT)
+  {
     try {
       return _pool->allocate(bytes, alignment);
-    } catch (const std::exception &unused) {
+    } catch (const std::exception& unused) {
       // try to allocate using the underlying pinned resource
       return prior_cuio_host_mr().allocate(bytes, alignment);
     }
@@ -436,8 +465,10 @@ class pinned_fallback_host_memory_resource {
    * @param bytes Size of the allocation.
    * @param alignment Alignment in bytes. Default alignment is used if unspecified.
    */
-  void deallocate(void *ptr, std::size_t bytes,
-                  std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) noexcept {
+  void deallocate(void* ptr,
+                  std::size_t bytes,
+                  std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) noexcept
+  {
     if (ptr >= pool_begin_ && ptr <= pool_end_) {
       _pool->deallocate(ptr, bytes, alignment);
     } else {
@@ -459,7 +490,8 @@ class pinned_fallback_host_memory_resource {
    * @param stream CUDA stream on which to perform the allocation (ignored).
    * @return Pointer to the newly allocated memory.
    */
-  void *allocate_async(std::size_t bytes, [[maybe_unused]] cuda::stream_ref stream) {
+  void* allocate_async(std::size_t bytes, [[maybe_unused]] cuda::stream_ref stream)
+  {
     return allocate(bytes);
   }
 
@@ -478,8 +510,10 @@ class pinned_fallback_host_memory_resource {
    * @param stream CUDA stream on which to perform the allocation (ignored).
    * @return Pointer to the newly allocated memory.
    */
-  void *allocate_async(std::size_t bytes, std::size_t alignment,
-                       [[maybe_unused]] cuda::stream_ref stream) {
+  void* allocate_async(std::size_t bytes,
+                       std::size_t alignment,
+                       [[maybe_unused]] cuda::stream_ref stream)
+  {
     return allocate(bytes, alignment);
   }
 
@@ -492,8 +526,10 @@ class pinned_fallback_host_memory_resource {
    * @param bytes Size of the allocation.
    * @param stream CUDA stream on which to perform the deallocation (ignored).
    */
-  void deallocate_async(void *ptr, std::size_t bytes,
-                        [[maybe_unused]] cuda::stream_ref stream) noexcept {
+  void deallocate_async(void* ptr,
+                        std::size_t bytes,
+                        [[maybe_unused]] cuda::stream_ref stream) noexcept
+  {
     return deallocate(ptr, bytes);
   }
 
@@ -508,8 +544,11 @@ class pinned_fallback_host_memory_resource {
    * @param alignment Alignment in bytes.
    * @param stream CUDA stream on which to perform the deallocation (ignored).
    */
-  void deallocate_async(void *ptr, std::size_t bytes, std::size_t alignment,
-                        [[maybe_unused]] cuda::stream_ref stream) noexcept {
+  void deallocate_async(void* ptr,
+                        std::size_t bytes,
+                        std::size_t alignment,
+                        [[maybe_unused]] cuda::stream_ref stream) noexcept
+  {
     return deallocate(ptr, bytes, alignment);
   }
   // NOLINTEND(bugprone-easily-swappable-parameters)
@@ -517,44 +556,49 @@ class pinned_fallback_host_memory_resource {
   /**
    * @briefreturn{true if the specified resource is the same type as this resource.}
    */
-  bool operator==(const pinned_fallback_host_memory_resource &) const { return true; }
+  bool operator==(const pinned_fallback_host_memory_resource&) const { return true; }
 
   /**
    * @briefreturn{true if the specified resource is not the same type as this resource, otherwise
    * false.}
    */
-  bool operator!=(const pinned_fallback_host_memory_resource &) const { return false; }
+  bool operator!=(const pinned_fallback_host_memory_resource&) const { return false; }
 
   /**
    * @brief Enables the `cuda::mr::device_accessible` property
    *
    * This property declares that a `pinned_host_memory_resource` provides device accessible memory
    */
-  friend void get_property(pinned_fallback_host_memory_resource const &,
-                           cuda::mr::device_accessible) noexcept {}
+  friend void get_property(pinned_fallback_host_memory_resource const&,
+                           cuda::mr::device_accessible) noexcept
+  {
+  }
 
   /**
    * @brief Enables the `cuda::mr::host_accessible` property
    *
    * This property declares that a `pinned_host_memory_resource` provides host accessible memory
    */
-  friend void get_property(pinned_fallback_host_memory_resource const &,
-                           cuda::mr::host_accessible) noexcept {}
+  friend void get_property(pinned_fallback_host_memory_resource const&,
+                           cuda::mr::host_accessible) noexcept
+  {
+  }
 };
 
 // carryover from RMM pinned_host_memory_resource
-static_assert(
-    cuda::mr::async_resource_with<pinned_fallback_host_memory_resource, cuda::mr::device_accessible,
-                                  cuda::mr::host_accessible>);
+static_assert(cuda::mr::async_resource_with<pinned_fallback_host_memory_resource,
+                                            cuda::mr::device_accessible,
+                                            cuda::mr::host_accessible>);
 
 // we set this to our fallback resource if we have set it.
 std::unique_ptr<pinned_fallback_host_memory_resource> pinned_fallback_mr;
 
-} // anonymous namespace
+}  // anonymous namespace
 
 extern "C" {
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_initDefaultCudaDevice(JNIEnv *env, jclass clazz) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_initDefaultCudaDevice(JNIEnv* env, jclass clazz)
+{
   // make sure the CUDA device is setup in the context
   cudaError_t cuda_status = cudaFree(0);
   cudf::jni::jni_cuda_check(env, cuda_status);
@@ -566,66 +610,78 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_initDefaultCudaDevice(JNIEnv *env
   cudf::jni::set_cudf_device(device_id);
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_cleanupDefaultCudaDevice(JNIEnv *env, jclass clazz) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_cleanupDefaultCudaDevice(JNIEnv* env, jclass clazz)
+{
   cudf::jni::set_cudf_device(cudaInvalidDeviceId);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_allocInternal(JNIEnv *env, jclass clazz, jlong size,
-                                                              jlong stream) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_allocInternal(JNIEnv* env,
+                                                              jclass clazz,
+                                                              jlong size,
+                                                              jlong stream)
+{
   try {
     cudf::jni::auto_set_device(env);
     rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource();
     auto c_stream = rmm::cuda_stream_view(reinterpret_cast<cudaStream_t>(stream));
-    void *ret = mr.allocate_async(size, rmm::CUDA_ALLOCATION_ALIGNMENT, c_stream);
+    void* ret     = mr.allocate_async(size, rmm::CUDA_ALLOCATION_ALIGNMENT, c_stream);
     return reinterpret_cast<jlong>(ret);
   }
   CATCH_STD(env, 0)
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_free(JNIEnv *env, jclass clazz, jlong ptr,
-                                                    jlong size, jlong stream) {
+JNIEXPORT void JNICALL
+Java_ai_rapids_cudf_Rmm_free(JNIEnv* env, jclass clazz, jlong ptr, jlong size, jlong stream)
+{
   try {
     cudf::jni::auto_set_device(env);
     rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource();
-    void *cptr = reinterpret_cast<void *>(ptr);
+    void* cptr                        = reinterpret_cast<void*>(ptr);
     auto c_stream = rmm::cuda_stream_view(reinterpret_cast<cudaStream_t>(stream));
     mr.deallocate_async(cptr, size, rmm::CUDA_ALLOCATION_ALIGNMENT, c_stream);
   }
   CATCH_STD(env, )
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_freeDeviceBuffer(JNIEnv *env, jclass clazz,
-                                                                jlong ptr) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_freeDeviceBuffer(JNIEnv* env,
+                                                                jclass clazz,
+                                                                jlong ptr)
+{
   try {
     cudf::jni::auto_set_device(env);
-    rmm::device_buffer *cptr = reinterpret_cast<rmm::device_buffer *>(ptr);
+    rmm::device_buffer* cptr = reinterpret_cast<rmm::device_buffer*>(ptr);
     delete cptr;
   }
   CATCH_STD(env, );
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_allocCudaInternal(JNIEnv *env, jclass clazz,
-                                                                  jlong size, jlong stream) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_allocCudaInternal(JNIEnv* env,
+                                                                  jclass clazz,
+                                                                  jlong size,
+                                                                  jlong stream)
+{
   try {
     cudf::jni::auto_set_device(env);
-    void *ptr{nullptr};
+    void* ptr{nullptr};
     RMM_CUDA_TRY_ALLOC(cudaMalloc(&ptr, size));
     return reinterpret_cast<jlong>(ptr);
   }
   CATCH_STD(env, 0)
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_freeCuda(JNIEnv *env, jclass clazz, jlong ptr,
-                                                        jlong size, jlong stream) {
+JNIEXPORT void JNICALL
+Java_ai_rapids_cudf_Rmm_freeCuda(JNIEnv* env, jclass clazz, jlong ptr, jlong size, jlong stream)
+{
   try {
     cudf::jni::auto_set_device(env);
-    void *cptr = reinterpret_cast<void *>(ptr);
+    void* cptr = reinterpret_cast<void*>(ptr);
     RMM_ASSERT_CUDA_SUCCESS(cudaFree(cptr));
   }
   CATCH_STD(env, )
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_newCudaMemoryResource(JNIEnv *env, jclass clazz) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_newCudaMemoryResource(JNIEnv* env, jclass clazz)
+{
   try {
     cudf::jni::auto_set_device(env);
     auto ret = new rmm::mr::cuda_memory_resource();
@@ -634,18 +690,20 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_newCudaMemoryResource(JNIEnv *en
   CATCH_STD(env, 0)
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_releaseCudaMemoryResource(JNIEnv *env, jclass clazz,
-                                                                         jlong ptr) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_releaseCudaMemoryResource(JNIEnv* env,
+                                                                         jclass clazz,
+                                                                         jlong ptr)
+{
   try {
     cudf::jni::auto_set_device(env);
-    auto mr = reinterpret_cast<rmm::mr::cuda_memory_resource *>(ptr);
+    auto mr = reinterpret_cast<rmm::mr::cuda_memory_resource*>(ptr);
     delete mr;
   }
   CATCH_STD(env, )
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_newManagedMemoryResource(JNIEnv *env,
-                                                                         jclass clazz) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_newManagedMemoryResource(JNIEnv* env, jclass clazz)
+{
   try {
     cudf::jni::auto_set_device(env);
     auto ret = new rmm::mr::managed_memory_resource();
@@ -654,70 +712,77 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_newManagedMemoryResource(JNIEnv
   CATCH_STD(env, 0)
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_releaseManagedMemoryResource(JNIEnv *env,
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_releaseManagedMemoryResource(JNIEnv* env,
                                                                             jclass clazz,
-                                                                            jlong ptr) {
+                                                                            jlong ptr)
+{
   try {
     cudf::jni::auto_set_device(env);
-    auto mr = reinterpret_cast<rmm::mr::managed_memory_resource *>(ptr);
+    auto mr = reinterpret_cast<rmm::mr::managed_memory_resource*>(ptr);
     delete mr;
   }
   CATCH_STD(env, )
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_newPoolMemoryResource(JNIEnv *env, jclass clazz,
-                                                                      jlong child, jlong init,
-                                                                      jlong max) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_newPoolMemoryResource(
+  JNIEnv* env, jclass clazz, jlong child, jlong init, jlong max)
+{
   JNI_NULL_CHECK(env, child, "child is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto wrapped = reinterpret_cast<rmm::mr::device_memory_resource *>(child);
+    auto wrapped = reinterpret_cast<rmm::mr::device_memory_resource*>(child);
     auto ret =
-        new rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>(wrapped, init, max);
+      new rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>(wrapped, init, max);
     return reinterpret_cast<jlong>(ret);
   }
   CATCH_STD(env, 0)
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_releasePoolMemoryResource(JNIEnv *env, jclass clazz,
-                                                                         jlong ptr) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_releasePoolMemoryResource(JNIEnv* env,
+                                                                         jclass clazz,
+                                                                         jlong ptr)
+{
   try {
     cudf::jni::auto_set_device(env);
     auto mr =
-        reinterpret_cast<rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource> *>(ptr);
+      reinterpret_cast<rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>*>(ptr);
     delete mr;
   }
   CATCH_STD(env, )
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_newArenaMemoryResource(JNIEnv *env, jclass clazz,
-                                                                       jlong child, jlong init,
-                                                                       jboolean dump_on_oom) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_newArenaMemoryResource(
+  JNIEnv* env, jclass clazz, jlong child, jlong init, jboolean dump_on_oom)
+{
   JNI_NULL_CHECK(env, child, "child is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto wrapped = reinterpret_cast<rmm::mr::device_memory_resource *>(child);
-    auto ret = new rmm::mr::arena_memory_resource<rmm::mr::device_memory_resource>(wrapped, init,
-                                                                                   dump_on_oom);
+    auto wrapped = reinterpret_cast<rmm::mr::device_memory_resource*>(child);
+    auto ret     = new rmm::mr::arena_memory_resource<rmm::mr::device_memory_resource>(
+      wrapped, init, dump_on_oom);
     return reinterpret_cast<jlong>(ret);
   }
   CATCH_STD(env, 0)
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_releaseArenaMemoryResource(JNIEnv *env, jclass clazz,
-                                                                          jlong ptr) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_releaseArenaMemoryResource(JNIEnv* env,
+                                                                          jclass clazz,
+                                                                          jlong ptr)
+{
   try {
     cudf::jni::auto_set_device(env);
     auto mr =
-        reinterpret_cast<rmm::mr::arena_memory_resource<rmm::mr::device_memory_resource> *>(ptr);
+      reinterpret_cast<rmm::mr::arena_memory_resource<rmm::mr::device_memory_resource>*>(ptr);
     delete mr;
   }
   CATCH_STD(env, )
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_newCudaAsyncMemoryResource(JNIEnv *env,
-                                                                           jclass clazz, jlong init,
-                                                                           jlong release) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_newCudaAsyncMemoryResource(JNIEnv* env,
+                                                                           jclass clazz,
+                                                                           jlong init,
+                                                                           jlong release)
+{
   try {
     cudf::jni::auto_set_device(env);
     auto ret = new rmm::mr::cuda_async_memory_resource(init, release);
@@ -726,71 +791,70 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_newCudaAsyncMemoryResource(JNIEn
   CATCH_STD(env, 0)
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_releaseCudaAsyncMemoryResource(JNIEnv *env,
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_releaseCudaAsyncMemoryResource(JNIEnv* env,
                                                                               jclass clazz,
-                                                                              jlong ptr) {
+                                                                              jlong ptr)
+{
   try {
     cudf::jni::auto_set_device(env);
-    auto mr = reinterpret_cast<rmm::mr::cuda_async_memory_resource *>(ptr);
+    auto mr = reinterpret_cast<rmm::mr::cuda_async_memory_resource*>(ptr);
     delete mr;
   }
   CATCH_STD(env, )
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_newLimitingResourceAdaptor(JNIEnv *env,
-                                                                           jclass clazz,
-                                                                           jlong child, jlong limit,
-                                                                           jlong align) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_newLimitingResourceAdaptor(
+  JNIEnv* env, jclass clazz, jlong child, jlong limit, jlong align)
+{
   JNI_NULL_CHECK(env, child, "child is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto wrapped = reinterpret_cast<rmm::mr::device_memory_resource *>(child);
-    auto ret = new rmm::mr::limiting_resource_adaptor<rmm::mr::device_memory_resource>(
-        wrapped, limit, align);
+    auto wrapped = reinterpret_cast<rmm::mr::device_memory_resource*>(child);
+    auto ret     = new rmm::mr::limiting_resource_adaptor<rmm::mr::device_memory_resource>(
+      wrapped, limit, align);
     return reinterpret_cast<jlong>(ret);
   }
   CATCH_STD(env, 0)
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_releaseLimitingResourceAdaptor(JNIEnv *env,
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_releaseLimitingResourceAdaptor(JNIEnv* env,
                                                                               jclass clazz,
-                                                                              jlong ptr) {
+                                                                              jlong ptr)
+{
   try {
     cudf::jni::auto_set_device(env);
     auto mr =
-        reinterpret_cast<rmm::mr::limiting_resource_adaptor<rmm::mr::device_memory_resource> *>(
-            ptr);
+      reinterpret_cast<rmm::mr::limiting_resource_adaptor<rmm::mr::device_memory_resource>*>(ptr);
     delete mr;
   }
   CATCH_STD(env, )
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_newLoggingResourceAdaptor(JNIEnv *env, jclass clazz,
-                                                                          jlong child, jint type,
-                                                                          jstring jpath,
-                                                                          jboolean auto_flush) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_newLoggingResourceAdaptor(
+  JNIEnv* env, jclass clazz, jlong child, jint type, jstring jpath, jboolean auto_flush)
+{
   JNI_NULL_CHECK(env, child, "child is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto wrapped = reinterpret_cast<rmm::mr::device_memory_resource *>(child);
+    auto wrapped = reinterpret_cast<rmm::mr::device_memory_resource*>(child);
     switch (type) {
-      case 1: // File
+      case 1:  // File
       {
         cudf::jni::native_jstring path(env, jpath);
         auto ret = new logging_resource_adaptor<rmm::mr::device_memory_resource>(
-            wrapped, path.get(), auto_flush);
+          wrapped, path.get(), auto_flush);
         return reinterpret_cast<jlong>(ret);
       }
-      case 2: // stdout
+      case 2:  // stdout
       {
-        auto ret = new logging_resource_adaptor<rmm::mr::device_memory_resource>(wrapped, std::cout,
-                                                                                 auto_flush);
+        auto ret = new logging_resource_adaptor<rmm::mr::device_memory_resource>(
+          wrapped, std::cout, auto_flush);
         return reinterpret_cast<jlong>(ret);
       }
-      case 3: // stderr
+      case 3:  // stderr
       {
-        auto ret = new logging_resource_adaptor<rmm::mr::device_memory_resource>(wrapped, std::cerr,
-                                                                                 auto_flush);
+        auto ret = new logging_resource_adaptor<rmm::mr::device_memory_resource>(
+          wrapped, std::cerr, auto_flush);
         return reinterpret_cast<jlong>(ret);
       }
       default: throw std::logic_error("unsupported logging location type");
@@ -799,108 +863,121 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_newLoggingResourceAdaptor(JNIEnv
   CATCH_STD(env, 0)
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_releaseLoggingResourceAdaptor(JNIEnv *env,
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_releaseLoggingResourceAdaptor(JNIEnv* env,
                                                                              jclass clazz,
-                                                                             jlong ptr) {
+                                                                             jlong ptr)
+{
   try {
     cudf::jni::auto_set_device(env);
     auto mr =
-        reinterpret_cast<rmm::mr::logging_resource_adaptor<rmm::mr::device_memory_resource> *>(ptr);
+      reinterpret_cast<rmm::mr::logging_resource_adaptor<rmm::mr::device_memory_resource>*>(ptr);
     delete mr;
   }
   CATCH_STD(env, )
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_newTrackingResourceAdaptor(JNIEnv *env,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_newTrackingResourceAdaptor(JNIEnv* env,
                                                                            jclass clazz,
                                                                            jlong child,
-                                                                           jlong align) {
+                                                                           jlong align)
+{
   JNI_NULL_CHECK(env, child, "child is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto wrapped = reinterpret_cast<rmm::mr::device_memory_resource *>(child);
-    auto ret = new tracking_resource_adaptor<rmm::mr::device_memory_resource>(wrapped, align);
+    auto wrapped = reinterpret_cast<rmm::mr::device_memory_resource*>(child);
+    auto ret     = new tracking_resource_adaptor<rmm::mr::device_memory_resource>(wrapped, align);
     return reinterpret_cast<jlong>(ret);
   }
   CATCH_STD(env, 0)
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_releaseTrackingResourceAdaptor(JNIEnv *env,
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_releaseTrackingResourceAdaptor(JNIEnv* env,
                                                                               jclass clazz,
-                                                                              jlong ptr) {
+                                                                              jlong ptr)
+{
   try {
     cudf::jni::auto_set_device(env);
-    auto mr = reinterpret_cast<tracking_resource_adaptor<rmm::mr::device_memory_resource> *>(ptr);
+    auto mr = reinterpret_cast<tracking_resource_adaptor<rmm::mr::device_memory_resource>*>(ptr);
     delete mr;
   }
   CATCH_STD(env, )
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_nativeGetTotalBytesAllocated(JNIEnv *env,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_nativeGetTotalBytesAllocated(JNIEnv* env,
                                                                              jclass clazz,
-                                                                             jlong ptr) {
+                                                                             jlong ptr)
+{
   JNI_NULL_CHECK(env, ptr, "adaptor is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto mr = reinterpret_cast<tracking_resource_adaptor<rmm::mr::device_memory_resource> *>(ptr);
+    auto mr = reinterpret_cast<tracking_resource_adaptor<rmm::mr::device_memory_resource>*>(ptr);
     return mr->get_total_allocated();
   }
   CATCH_STD(env, 0)
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_nativeGetMaxTotalBytesAllocated(JNIEnv *env,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_nativeGetMaxTotalBytesAllocated(JNIEnv* env,
                                                                                 jclass clazz,
-                                                                                jlong ptr) {
+                                                                                jlong ptr)
+{
   JNI_NULL_CHECK(env, ptr, "adaptor is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto mr = reinterpret_cast<tracking_resource_adaptor<rmm::mr::device_memory_resource> *>(ptr);
+    auto mr = reinterpret_cast<tracking_resource_adaptor<rmm::mr::device_memory_resource>*>(ptr);
     return mr->get_max_total_allocated();
   }
   CATCH_STD(env, 0)
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_nativeResetScopedMaxTotalBytesAllocated(JNIEnv *env,
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_nativeResetScopedMaxTotalBytesAllocated(JNIEnv* env,
                                                                                        jclass clazz,
                                                                                        jlong ptr,
-                                                                                       jlong init) {
+                                                                                       jlong init)
+{
   JNI_NULL_CHECK(env, ptr, "adaptor is null", );
   try {
     cudf::jni::auto_set_device(env);
-    auto mr = reinterpret_cast<tracking_resource_adaptor<rmm::mr::device_memory_resource> *>(ptr);
+    auto mr = reinterpret_cast<tracking_resource_adaptor<rmm::mr::device_memory_resource>*>(ptr);
     mr->reset_scoped_max_total_allocated(init);
   }
   CATCH_STD(env, )
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_nativeGetScopedMaxTotalBytesAllocated(JNIEnv *env,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_nativeGetScopedMaxTotalBytesAllocated(JNIEnv* env,
                                                                                       jclass clazz,
-                                                                                      jlong ptr) {
+                                                                                      jlong ptr)
+{
   JNI_NULL_CHECK(env, ptr, "adaptor is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto mr = reinterpret_cast<tracking_resource_adaptor<rmm::mr::device_memory_resource> *>(ptr);
+    auto mr = reinterpret_cast<tracking_resource_adaptor<rmm::mr::device_memory_resource>*>(ptr);
     return mr->get_scoped_max_total_allocated();
   }
   CATCH_STD(env, 0)
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_newEventHandlerResourceAdaptor(
-    JNIEnv *env, jclass, jlong child, jlong tracker, jobject handler_obj,
-    jlongArray jalloc_thresholds, jlongArray jdealloc_thresholds, jboolean enable_debug) {
+JNIEXPORT jlong JNICALL
+Java_ai_rapids_cudf_Rmm_newEventHandlerResourceAdaptor(JNIEnv* env,
+                                                       jclass,
+                                                       jlong child,
+                                                       jlong tracker,
+                                                       jobject handler_obj,
+                                                       jlongArray jalloc_thresholds,
+                                                       jlongArray jdealloc_thresholds,
+                                                       jboolean enable_debug)
+{
   JNI_NULL_CHECK(env, child, "child is null", 0);
   JNI_NULL_CHECK(env, tracker, "tracker is null", 0);
   try {
-    auto wrapped = reinterpret_cast<rmm::mr::device_memory_resource *>(child);
-    auto t =
-        reinterpret_cast<tracking_resource_adaptor<rmm::mr::device_memory_resource> *>(tracker);
+    auto wrapped = reinterpret_cast<rmm::mr::device_memory_resource*>(child);
+    auto t = reinterpret_cast<tracking_resource_adaptor<rmm::mr::device_memory_resource>*>(tracker);
     if (enable_debug) {
-      auto ret = new java_debug_event_handler_memory_resource(env, handler_obj, jalloc_thresholds,
-                                                              jdealloc_thresholds, wrapped, t);
+      auto ret = new java_debug_event_handler_memory_resource(
+        env, handler_obj, jalloc_thresholds, jdealloc_thresholds, wrapped, t);
       return reinterpret_cast<jlong>(ret);
     } else {
-      auto ret = new java_event_handler_memory_resource(env, handler_obj, jalloc_thresholds,
-                                                        jdealloc_thresholds, wrapped, t);
+      auto ret = new java_event_handler_memory_resource(
+        env, handler_obj, jalloc_thresholds, jdealloc_thresholds, wrapped, t);
       return reinterpret_cast<jlong>(ret);
     }
   }
@@ -908,34 +985,38 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_newEventHandlerResourceAdaptor(
 }
 
 JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_releaseEventHandlerResourceAdaptor(
-    JNIEnv *env, jclass clazz, jlong ptr, jboolean enable_debug) {
+  JNIEnv* env, jclass clazz, jlong ptr, jboolean enable_debug)
+{
   try {
     cudf::jni::auto_set_device(env);
     if (enable_debug) {
-      auto mr = reinterpret_cast<java_debug_event_handler_memory_resource *>(ptr);
+      auto mr = reinterpret_cast<java_debug_event_handler_memory_resource*>(ptr);
       delete mr;
     } else {
-      auto mr = reinterpret_cast<java_event_handler_memory_resource *>(ptr);
+      auto mr = reinterpret_cast<java_event_handler_memory_resource*>(ptr);
       delete mr;
     }
   }
   CATCH_STD(env, )
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_setCurrentDeviceResourceInternal(JNIEnv *env,
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_setCurrentDeviceResourceInternal(JNIEnv* env,
                                                                                 jclass clazz,
-                                                                                jlong new_handle) {
+                                                                                jlong new_handle)
+{
   try {
     cudf::jni::auto_set_device(env);
-    auto mr = reinterpret_cast<rmm::mr::device_memory_resource *>(new_handle);
+    auto mr = reinterpret_cast<rmm::mr::device_memory_resource*>(new_handle);
     rmm::mr::set_current_device_resource(mr);
   }
   CATCH_STD(env, )
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_newPinnedPoolMemoryResource(JNIEnv *env,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_newPinnedPoolMemoryResource(JNIEnv* env,
                                                                             jclass clazz,
-                                                                            jlong init, jlong max) {
+                                                                            jlong init,
+                                                                            jlong max)
+{
   try {
     cudf::jni::auto_set_device(env);
     auto pool = new rmm_pinned_pool_t(new rmm::mr::pinned_host_memory_resource(), init, max);
@@ -944,12 +1025,13 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_newPinnedPoolMemoryResource(JNIE
   CATCH_STD(env, 0)
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_setCuioPinnedPoolMemoryResource(JNIEnv *env,
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_setCuioPinnedPoolMemoryResource(JNIEnv* env,
                                                                                jclass clazz,
-                                                                               jlong pool_ptr) {
+                                                                               jlong pool_ptr)
+{
   try {
     cudf::jni::auto_set_device(env);
-    auto pool = reinterpret_cast<rmm_pinned_pool_t *>(pool_ptr);
+    auto pool = reinterpret_cast<rmm_pinned_pool_t*>(pool_ptr);
     // create a pinned fallback pool that will allocate pinned memory
     // if the regular pinned pool is exhausted
     pinned_fallback_mr.reset(new pinned_fallback_host_memory_resource(pool));
@@ -959,57 +1041,67 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_setCuioPinnedPoolMemoryResource(J
   CATCH_STD(env, )
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_releasePinnedPoolMemoryResource(JNIEnv *env,
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_releasePinnedPoolMemoryResource(JNIEnv* env,
                                                                                jclass clazz,
-                                                                               jlong pool_ptr) {
+                                                                               jlong pool_ptr)
+{
   try {
     cudf::jni::auto_set_device(env);
     // set the cuio host memory resource to what it was before, or the same
     // if we didn't overwrite it with setCuioPinnedPoolMemoryResource
     cudf::io::set_host_memory_resource(prior_cuio_host_mr());
     pinned_fallback_mr.reset();
-    delete reinterpret_cast<rmm_pinned_pool_t *>(pool_ptr);
+    delete reinterpret_cast<rmm_pinned_pool_t*>(pool_ptr);
   }
   CATCH_STD(env, )
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_allocFromPinnedPool(JNIEnv *env, jclass clazz,
-                                                                    jlong pool_ptr, jlong size) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_allocFromPinnedPool(JNIEnv* env,
+                                                                    jclass clazz,
+                                                                    jlong pool_ptr,
+                                                                    jlong size)
+{
   try {
     cudf::jni::auto_set_device(env);
-    auto pool = reinterpret_cast<rmm_pinned_pool_t *>(pool_ptr);
-    void *ret = pool->allocate(size);
+    auto pool = reinterpret_cast<rmm_pinned_pool_t*>(pool_ptr);
+    void* ret = pool->allocate(size);
     return reinterpret_cast<jlong>(ret);
-  } catch (const std::exception &unused) { return -1; }
+  } catch (const std::exception& unused) {
+    return -1;
+  }
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_freeFromPinnedPool(JNIEnv *env, jclass clazz,
-                                                                  jlong pool_ptr, jlong ptr,
-                                                                  jlong size) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_freeFromPinnedPool(
+  JNIEnv* env, jclass clazz, jlong pool_ptr, jlong ptr, jlong size)
+{
   try {
     cudf::jni::auto_set_device(env);
-    auto pool = reinterpret_cast<rmm_pinned_pool_t *>(pool_ptr);
-    void *cptr = reinterpret_cast<void *>(ptr);
+    auto pool  = reinterpret_cast<rmm_pinned_pool_t*>(pool_ptr);
+    void* cptr = reinterpret_cast<void*>(ptr);
     pool->deallocate(cptr, size);
   }
   CATCH_STD(env, )
 }
 
 // only for tests
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_allocFromFallbackPinnedPool(JNIEnv *env,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_allocFromFallbackPinnedPool(JNIEnv* env,
                                                                             jclass clazz,
-                                                                            jlong size) {
+                                                                            jlong size)
+{
   cudf::jni::auto_set_device(env);
-  void *ret = cudf::io::get_host_memory_resource().allocate(size);
+  void* ret = cudf::io::get_host_memory_resource().allocate(size);
   return reinterpret_cast<jlong>(ret);
 }
 
 // only for tests
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_freeFromFallbackPinnedPool(JNIEnv *env, jclass clazz,
-                                                                          jlong ptr, jlong size) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_freeFromFallbackPinnedPool(JNIEnv* env,
+                                                                          jclass clazz,
+                                                                          jlong ptr,
+                                                                          jlong size)
+{
   try {
     cudf::jni::auto_set_device(env);
-    void *cptr = reinterpret_cast<void *>(ptr);
+    void* cptr = reinterpret_cast<void*>(ptr);
     cudf::io::get_host_memory_resource().deallocate(cptr, size);
   }
   CATCH_STD(env, )
diff --git a/java/src/main/native/src/ScalarJni.cpp b/java/src/main/native/src/ScalarJni.cpp
index e47728f6acc..6a1ad1a9f32 100644
--- a/java/src/main/native/src/ScalarJni.cpp
+++ b/java/src/main/native/src/ScalarJni.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +14,9 @@
  * limitations under the License.
  */
 
+#include "cudf_jni_apis.hpp"
+#include "dtype_utils.hpp"
+
 #include <cudf/binaryop.hpp>
 #include <cudf/column/column_factories.hpp>
 #include <cudf/fixed_point/fixed_point.hpp>
@@ -21,135 +24,149 @@
 #include <cudf/strings/repeat_strings.hpp>
 #include <cudf/types.hpp>
 
-#include "cudf_jni_apis.hpp"
-#include "dtype_utils.hpp"
-
 using cudf::jni::release_as_jlong;
 
 extern "C" {
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Scalar_closeScalar(JNIEnv *env, jclass,
-                                                              jlong scalar_handle) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Scalar_closeScalar(JNIEnv* env,
+                                                              jclass,
+                                                              jlong scalar_handle)
+{
   try {
     cudf::jni::auto_set_device(env);
-    cudf::scalar *s = reinterpret_cast<cudf::scalar *>(scalar_handle);
+    cudf::scalar* s = reinterpret_cast<cudf::scalar*>(scalar_handle);
     delete s;
   }
   CATCH_STD(env, );
 }
 
-JNIEXPORT jboolean JNICALL Java_ai_rapids_cudf_Scalar_isScalarValid(JNIEnv *env, jclass,
-                                                                    jlong scalar_handle) {
+JNIEXPORT jboolean JNICALL Java_ai_rapids_cudf_Scalar_isScalarValid(JNIEnv* env,
+                                                                    jclass,
+                                                                    jlong scalar_handle)
+{
   try {
     cudf::jni::auto_set_device(env);
-    cudf::scalar *s = reinterpret_cast<cudf::scalar *>(scalar_handle);
+    cudf::scalar* s = reinterpret_cast<cudf::scalar*>(scalar_handle);
     return static_cast<jboolean>(s->is_valid());
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jbyte JNICALL Java_ai_rapids_cudf_Scalar_getByte(JNIEnv *env, jclass,
-                                                           jlong scalar_handle) {
+JNIEXPORT jbyte JNICALL Java_ai_rapids_cudf_Scalar_getByte(JNIEnv* env, jclass, jlong scalar_handle)
+{
   try {
     cudf::jni::auto_set_device(env);
     using ScalarType = cudf::scalar_type_t<int8_t>;
-    auto s = reinterpret_cast<ScalarType *>(scalar_handle);
+    auto s           = reinterpret_cast<ScalarType*>(scalar_handle);
     return static_cast<jbyte>(s->value());
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jshort JNICALL Java_ai_rapids_cudf_Scalar_getShort(JNIEnv *env, jclass,
-                                                             jlong scalar_handle) {
+JNIEXPORT jshort JNICALL Java_ai_rapids_cudf_Scalar_getShort(JNIEnv* env,
+                                                             jclass,
+                                                             jlong scalar_handle)
+{
   try {
     cudf::jni::auto_set_device(env);
     using ScalarType = cudf::scalar_type_t<int16_t>;
-    auto s = reinterpret_cast<ScalarType *>(scalar_handle);
+    auto s           = reinterpret_cast<ScalarType*>(scalar_handle);
     return static_cast<jshort>(s->value());
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Scalar_getInt(JNIEnv *env, jclass, jlong scalar_handle) {
+JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Scalar_getInt(JNIEnv* env, jclass, jlong scalar_handle)
+{
   try {
     cudf::jni::auto_set_device(env);
     using ScalarType = cudf::scalar_type_t<int32_t>;
-    auto s = reinterpret_cast<ScalarType *>(scalar_handle);
+    auto s           = reinterpret_cast<ScalarType*>(scalar_handle);
     return static_cast<jint>(s->value());
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_getLong(JNIEnv *env, jclass,
-                                                           jlong scalar_handle) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_getLong(JNIEnv* env, jclass, jlong scalar_handle)
+{
   try {
     cudf::jni::auto_set_device(env);
     using ScalarType = cudf::scalar_type_t<int64_t>;
-    auto s = reinterpret_cast<ScalarType *>(scalar_handle);
+    auto s           = reinterpret_cast<ScalarType*>(scalar_handle);
     return static_cast<jlong>(s->value());
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jfloat JNICALL Java_ai_rapids_cudf_Scalar_getFloat(JNIEnv *env, jclass,
-                                                             jlong scalar_handle) {
+JNIEXPORT jfloat JNICALL Java_ai_rapids_cudf_Scalar_getFloat(JNIEnv* env,
+                                                             jclass,
+                                                             jlong scalar_handle)
+{
   try {
     cudf::jni::auto_set_device(env);
     using ScalarType = cudf::scalar_type_t<float>;
-    auto s = reinterpret_cast<ScalarType *>(scalar_handle);
+    auto s           = reinterpret_cast<ScalarType*>(scalar_handle);
     return static_cast<jfloat>(s->value());
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jdouble JNICALL Java_ai_rapids_cudf_Scalar_getDouble(JNIEnv *env, jclass,
-                                                               jlong scalar_handle) {
+JNIEXPORT jdouble JNICALL Java_ai_rapids_cudf_Scalar_getDouble(JNIEnv* env,
+                                                               jclass,
+                                                               jlong scalar_handle)
+{
   try {
     cudf::jni::auto_set_device(env);
     using ScalarType = cudf::scalar_type_t<double>;
-    auto s = reinterpret_cast<ScalarType *>(scalar_handle);
+    auto s           = reinterpret_cast<ScalarType*>(scalar_handle);
     return static_cast<jdouble>(s->value());
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jbyteArray JNICALL Java_ai_rapids_cudf_Scalar_getBigIntegerBytes(JNIEnv *env, jclass,
-                                                                           jlong scalar_handle) {
+JNIEXPORT jbyteArray JNICALL Java_ai_rapids_cudf_Scalar_getBigIntegerBytes(JNIEnv* env,
+                                                                           jclass,
+                                                                           jlong scalar_handle)
+{
   try {
     cudf::jni::auto_set_device(env);
     using ScalarType = cudf::scalar_type_t<__int128_t>;
-    auto s = reinterpret_cast<ScalarType *>(scalar_handle);
-    auto val = s->value();
-    jbyte const *ptr = reinterpret_cast<jbyte const *>(&val);
+    auto s           = reinterpret_cast<ScalarType*>(scalar_handle);
+    auto val         = s->value();
+    jbyte const* ptr = reinterpret_cast<jbyte const*>(&val);
     cudf::jni::native_jbyteArray jbytes{env, ptr, sizeof(__int128_t)};
     return jbytes.get_jArray();
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jbyteArray JNICALL Java_ai_rapids_cudf_Scalar_getUTF8(JNIEnv *env, jclass,
-                                                                jlong scalar_handle) {
+JNIEXPORT jbyteArray JNICALL Java_ai_rapids_cudf_Scalar_getUTF8(JNIEnv* env,
+                                                                jclass,
+                                                                jlong scalar_handle)
+{
   try {
     cudf::jni::auto_set_device(env);
-    auto s = reinterpret_cast<cudf::string_scalar *>(scalar_handle);
+    auto s = reinterpret_cast<cudf::string_scalar*>(scalar_handle);
     std::string val{s->to_string()};
     if (val.size() > 0x7FFFFFFF) {
-      cudf::jni::throw_java_exception(env, "java/lang/IllegalArgumentException",
-                                      "string scalar too large");
+      cudf::jni::throw_java_exception(
+        env, "java/lang/IllegalArgumentException", "string scalar too large");
     }
-    cudf::jni::native_jbyteArray jbytes{env, reinterpret_cast<jbyte const *>(val.data()),
-                                        static_cast<int>(val.size())};
+    cudf::jni::native_jbyteArray jbytes{
+      env, reinterpret_cast<jbyte const*>(val.data()), static_cast<int>(val.size())};
     return jbytes.get_jArray();
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_getListAsColumnView(JNIEnv *env, jclass,
-                                                                       jlong scalar_handle) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_getListAsColumnView(JNIEnv* env,
+                                                                       jclass,
+                                                                       jlong scalar_handle)
+{
   JNI_NULL_CHECK(env, scalar_handle, "scalar handle is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto s = reinterpret_cast<cudf::list_scalar *>(scalar_handle);
+    auto s = reinterpret_cast<cudf::list_scalar*>(scalar_handle);
     // Creates a column view in heap with the stack one, to let JVM take care of its
     // life cycle.
     return reinterpret_cast<jlong>(new cudf::column_view(s->view()));
@@ -158,12 +175,13 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_getListAsColumnView(JNIEnv *e
 }
 
 JNIEXPORT jlongArray JNICALL
-Java_ai_rapids_cudf_Scalar_getChildrenFromStructScalar(JNIEnv *env, jclass, jlong scalar_handle) {
+Java_ai_rapids_cudf_Scalar_getChildrenFromStructScalar(JNIEnv* env, jclass, jlong scalar_handle)
+{
   JNI_NULL_CHECK(env, scalar_handle, "scalar handle is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    const auto s = reinterpret_cast<cudf::struct_scalar *>(scalar_handle);
-    const cudf::table_view &table = s->view();
+    const auto s                  = reinterpret_cast<cudf::struct_scalar*>(scalar_handle);
+    const cudf::table_view& table = s->view();
     cudf::jni::native_jpointerArray<cudf::column_view> column_handles(env, table.num_columns());
     for (int i = 0; i < table.num_columns(); i++) {
       column_handles[i] = new cudf::column_view(table.column(i));
@@ -173,215 +191,246 @@ Java_ai_rapids_cudf_Scalar_getChildrenFromStructScalar(JNIEnv *env, jclass, jlon
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeBool8Scalar(JNIEnv *env, jclass,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeBool8Scalar(JNIEnv* env,
+                                                                   jclass,
                                                                    jboolean value,
-                                                                   jboolean is_valid) {
+                                                                   jboolean is_valid)
+{
   try {
     cudf::jni::auto_set_device(env);
     std::unique_ptr<cudf::scalar> s =
-        cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::BOOL8));
+      cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::BOOL8));
     s->set_valid_async(is_valid);
     if (is_valid) {
       using ScalarType = cudf::scalar_type_t<int8_t>;
-      int8_t val = value ? 1 : 0;
-      static_cast<ScalarType *>(s.get())->set_value(val);
+      int8_t val       = value ? 1 : 0;
+      static_cast<ScalarType*>(s.get())->set_value(val);
     }
     return reinterpret_cast<jlong>(s.release());
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeInt8Scalar(JNIEnv *env, jclass, jbyte value,
-                                                                  jboolean is_valid) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeInt8Scalar(JNIEnv* env,
+                                                                  jclass,
+                                                                  jbyte value,
+                                                                  jboolean is_valid)
+{
   try {
     cudf::jni::auto_set_device(env);
     std::unique_ptr<cudf::scalar> s =
-        cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT8));
+      cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT8));
     s->set_valid_async(is_valid);
     if (is_valid) {
       using ScalarType = cudf::scalar_type_t<int8_t>;
-      static_cast<ScalarType *>(s.get())->set_value(static_cast<int8_t>(value));
+      static_cast<ScalarType*>(s.get())->set_value(static_cast<int8_t>(value));
     }
     return reinterpret_cast<jlong>(s.release());
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeUint8Scalar(JNIEnv *env, jclass, jbyte value,
-                                                                   jboolean is_valid) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeUint8Scalar(JNIEnv* env,
+                                                                   jclass,
+                                                                   jbyte value,
+                                                                   jboolean is_valid)
+{
   try {
     cudf::jni::auto_set_device(env);
     std::unique_ptr<cudf::scalar> s =
-        cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::UINT8));
+      cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::UINT8));
     s->set_valid_async(is_valid);
     if (is_valid) {
       using ScalarType = cudf::scalar_type_t<uint8_t>;
-      static_cast<ScalarType *>(s.get())->set_value(static_cast<uint8_t>(value));
+      static_cast<ScalarType*>(s.get())->set_value(static_cast<uint8_t>(value));
     }
     return reinterpret_cast<jlong>(s.release());
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeInt16Scalar(JNIEnv *env, jclass,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeInt16Scalar(JNIEnv* env,
+                                                                   jclass,
                                                                    jshort value,
-                                                                   jboolean is_valid) {
+                                                                   jboolean is_valid)
+{
   try {
     cudf::jni::auto_set_device(env);
     std::unique_ptr<cudf::scalar> s =
-        cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT16));
+      cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT16));
     s->set_valid_async(is_valid);
     if (is_valid) {
       using ScalarType = cudf::scalar_type_t<int16_t>;
-      static_cast<ScalarType *>(s.get())->set_value(static_cast<int16_t>(value));
+      static_cast<ScalarType*>(s.get())->set_value(static_cast<int16_t>(value));
     }
     return reinterpret_cast<jlong>(s.release());
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeUint16Scalar(JNIEnv *env, jclass,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeUint16Scalar(JNIEnv* env,
+                                                                    jclass,
                                                                     jshort value,
-                                                                    jboolean is_valid) {
+                                                                    jboolean is_valid)
+{
   try {
     cudf::jni::auto_set_device(env);
     std::unique_ptr<cudf::scalar> s =
-        cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::UINT16));
+      cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::UINT16));
     s->set_valid_async(is_valid);
     if (is_valid) {
       using ScalarType = cudf::scalar_type_t<uint16_t>;
-      static_cast<ScalarType *>(s.get())->set_value(static_cast<uint16_t>(value));
+      static_cast<ScalarType*>(s.get())->set_value(static_cast<uint16_t>(value));
     }
     return reinterpret_cast<jlong>(s.release());
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeDurationDaysScalar(JNIEnv *env, jclass,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeDurationDaysScalar(JNIEnv* env,
+                                                                          jclass,
                                                                           jint value,
-                                                                          jboolean is_valid) {
+                                                                          jboolean is_valid)
+{
   try {
     cudf::jni::auto_set_device(env);
     std::unique_ptr<cudf::scalar> s =
-        cudf::make_duration_scalar(cudf::data_type(cudf::type_id::DURATION_DAYS));
+      cudf::make_duration_scalar(cudf::data_type(cudf::type_id::DURATION_DAYS));
     s->set_valid_async(is_valid);
     if (is_valid) {
       using ScalarType = cudf::scalar_type_t<int32_t>;
-      static_cast<ScalarType *>(s.get())->set_value(static_cast<int32_t>(value));
+      static_cast<ScalarType*>(s.get())->set_value(static_cast<int32_t>(value));
     }
     return reinterpret_cast<jlong>(s.release());
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeInt32Scalar(JNIEnv *env, jclass, jint value,
-                                                                   jboolean is_valid) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeInt32Scalar(JNIEnv* env,
+                                                                   jclass,
+                                                                   jint value,
+                                                                   jboolean is_valid)
+{
   try {
     cudf::jni::auto_set_device(env);
     std::unique_ptr<cudf::scalar> s =
-        cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32));
+      cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT32));
     s->set_valid_async(is_valid);
     if (is_valid) {
       using ScalarType = cudf::scalar_type_t<int32_t>;
-      static_cast<ScalarType *>(s.get())->set_value(static_cast<int32_t>(value));
+      static_cast<ScalarType*>(s.get())->set_value(static_cast<int32_t>(value));
     }
     return reinterpret_cast<jlong>(s.release());
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeUint32Scalar(JNIEnv *env, jclass, jint value,
-                                                                    jboolean is_valid) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeUint32Scalar(JNIEnv* env,
+                                                                    jclass,
+                                                                    jint value,
+                                                                    jboolean is_valid)
+{
   try {
     cudf::jni::auto_set_device(env);
     std::unique_ptr<cudf::scalar> s =
-        cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::UINT32));
+      cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::UINT32));
     s->set_valid_async(is_valid);
     if (is_valid) {
       using ScalarType = cudf::scalar_type_t<uint32_t>;
-      static_cast<ScalarType *>(s.get())->set_value(static_cast<uint32_t>(value));
+      static_cast<ScalarType*>(s.get())->set_value(static_cast<uint32_t>(value));
     }
     return reinterpret_cast<jlong>(s.release());
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeInt64Scalar(JNIEnv *env, jclass, jlong value,
-                                                                   jboolean is_valid) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeInt64Scalar(JNIEnv* env,
+                                                                   jclass,
+                                                                   jlong value,
+                                                                   jboolean is_valid)
+{
   try {
     cudf::jni::auto_set_device(env);
     std::unique_ptr<cudf::scalar> s =
-        cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT64));
+      cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::INT64));
     s->set_valid_async(is_valid);
     if (is_valid) {
       using ScalarType = cudf::scalar_type_t<int64_t>;
-      static_cast<ScalarType *>(s.get())->set_value(static_cast<int64_t>(value));
+      static_cast<ScalarType*>(s.get())->set_value(static_cast<int64_t>(value));
     }
     return reinterpret_cast<jlong>(s.release());
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeUint64Scalar(JNIEnv *env, jclass,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeUint64Scalar(JNIEnv* env,
+                                                                    jclass,
                                                                     jlong value,
-                                                                    jboolean is_valid) {
+                                                                    jboolean is_valid)
+{
   try {
     cudf::jni::auto_set_device(env);
     std::unique_ptr<cudf::scalar> s =
-        cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::UINT64));
+      cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::UINT64));
     s->set_valid_async(is_valid);
     if (is_valid) {
       using ScalarType = cudf::scalar_type_t<uint64_t>;
-      static_cast<ScalarType *>(s.get())->set_value(static_cast<uint64_t>(value));
+      static_cast<ScalarType*>(s.get())->set_value(static_cast<uint64_t>(value));
     }
     return reinterpret_cast<jlong>(s.release());
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeFloat32Scalar(JNIEnv *env, jclass,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeFloat32Scalar(JNIEnv* env,
+                                                                     jclass,
                                                                      jfloat value,
-                                                                     jboolean is_valid) {
+                                                                     jboolean is_valid)
+{
   try {
     cudf::jni::auto_set_device(env);
     std::unique_ptr<cudf::scalar> s =
-        cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::FLOAT32));
+      cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::FLOAT32));
     s->set_valid_async(is_valid);
     if (is_valid) {
       using ScalarType = cudf::scalar_type_t<float>;
-      static_cast<ScalarType *>(s.get())->set_value(static_cast<float>(value));
+      static_cast<ScalarType*>(s.get())->set_value(static_cast<float>(value));
     }
     return reinterpret_cast<jlong>(s.release());
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeFloat64Scalar(JNIEnv *env, jclass,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeFloat64Scalar(JNIEnv* env,
+                                                                     jclass,
                                                                      jdouble value,
-                                                                     jboolean is_valid) {
+                                                                     jboolean is_valid)
+{
   try {
     cudf::jni::auto_set_device(env);
     std::unique_ptr<cudf::scalar> s =
-        cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::FLOAT64));
+      cudf::make_numeric_scalar(cudf::data_type(cudf::type_id::FLOAT64));
     s->set_valid_async(is_valid);
     if (is_valid) {
       using ScalarType = cudf::scalar_type_t<double>;
-      static_cast<ScalarType *>(s.get())->set_value(static_cast<double>(value));
+      static_cast<ScalarType*>(s.get())->set_value(static_cast<double>(value));
     }
     return reinterpret_cast<jlong>(s.release());
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeStringScalar(JNIEnv *env, jclass,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeStringScalar(JNIEnv* env,
+                                                                    jclass,
                                                                     jbyteArray value,
-                                                                    jboolean is_valid) {
+                                                                    jboolean is_valid)
+{
   try {
     cudf::jni::auto_set_device(env);
     std::string strval;
     if (is_valid) {
       cudf::jni::native_jbyteArray jbytes{env, value};
-      strval.assign(reinterpret_cast<char *>(jbytes.data()), jbytes.size());
+      strval.assign(reinterpret_cast<char*>(jbytes.data()), jbytes.size());
     }
 
     auto s = new cudf::string_scalar{strval, static_cast<bool>(is_valid)};
@@ -390,117 +439,116 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeStringScalar(JNIEnv *env,
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeTimestampDaysScalar(JNIEnv *env, jclass,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeTimestampDaysScalar(JNIEnv* env,
+                                                                           jclass,
                                                                            jint value,
-                                                                           jboolean is_valid) {
+                                                                           jboolean is_valid)
+{
   try {
     cudf::jni::auto_set_device(env);
     std::unique_ptr<cudf::scalar> s =
-        cudf::make_timestamp_scalar(cudf::data_type(cudf::type_id::TIMESTAMP_DAYS));
+      cudf::make_timestamp_scalar(cudf::data_type(cudf::type_id::TIMESTAMP_DAYS));
     s->set_valid_async(is_valid);
     if (is_valid) {
       using ScalarType = cudf::scalar_type_t<int32_t>;
-      static_cast<ScalarType *>(s.get())->set_value(static_cast<int32_t>(value));
+      static_cast<ScalarType*>(s.get())->set_value(static_cast<int32_t>(value));
     }
     return reinterpret_cast<jlong>(s.release());
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeDurationTimeScalar(JNIEnv *env, jclass,
-                                                                          jint jdtype_id,
-                                                                          jlong value,
-                                                                          jboolean is_valid) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeDurationTimeScalar(
+  JNIEnv* env, jclass, jint jdtype_id, jlong value, jboolean is_valid)
+{
   try {
     cudf::jni::auto_set_device(env);
-    auto dtype_id = static_cast<cudf::type_id>(jdtype_id);
+    auto dtype_id                   = static_cast<cudf::type_id>(jdtype_id);
     std::unique_ptr<cudf::scalar> s = cudf::make_duration_scalar(cudf::data_type(dtype_id));
     s->set_valid_async(is_valid);
     if (is_valid) {
       using ScalarType = cudf::scalar_type_t<int64_t>;
-      static_cast<ScalarType *>(s.get())->set_value(static_cast<int64_t>(value));
+      static_cast<ScalarType*>(s.get())->set_value(static_cast<int64_t>(value));
     }
     return reinterpret_cast<jlong>(s.release());
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeTimestampTimeScalar(JNIEnv *env, jclass,
-                                                                           jint jdtype_id,
-                                                                           jlong value,
-                                                                           jboolean is_valid) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeTimestampTimeScalar(
+  JNIEnv* env, jclass, jint jdtype_id, jlong value, jboolean is_valid)
+{
   try {
     cudf::jni::auto_set_device(env);
-    auto dtype_id = static_cast<cudf::type_id>(jdtype_id);
+    auto dtype_id                   = static_cast<cudf::type_id>(jdtype_id);
     std::unique_ptr<cudf::scalar> s = cudf::make_timestamp_scalar(cudf::data_type(dtype_id));
     s->set_valid_async(is_valid);
     if (is_valid) {
       using ScalarType = cudf::scalar_type_t<int64_t>;
-      static_cast<ScalarType *>(s.get())->set_value(static_cast<int64_t>(value));
+      static_cast<ScalarType*>(s.get())->set_value(static_cast<int64_t>(value));
     }
     return reinterpret_cast<jlong>(s.release());
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeDecimal32Scalar(JNIEnv *env, jclass,
-                                                                       jint value, jint scale,
-                                                                       jboolean is_valid) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeDecimal32Scalar(
+  JNIEnv* env, jclass, jint value, jint scale, jboolean is_valid)
+{
   try {
     cudf::jni::auto_set_device(env);
     auto const value_ = static_cast<int32_t>(value);
     auto const scale_ = numeric::scale_type{static_cast<int32_t>(scale)};
     std::unique_ptr<cudf::scalar> s =
-        cudf::make_fixed_point_scalar<numeric::decimal32>(value_, scale_);
+      cudf::make_fixed_point_scalar<numeric::decimal32>(value_, scale_);
     s->set_valid_async(is_valid);
     return reinterpret_cast<jlong>(s.release());
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeDecimal64Scalar(JNIEnv *env, jclass,
-                                                                       jlong value, jint scale,
-                                                                       jboolean is_valid) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeDecimal64Scalar(
+  JNIEnv* env, jclass, jlong value, jint scale, jboolean is_valid)
+{
   try {
     cudf::jni::auto_set_device(env);
     auto const value_ = static_cast<int64_t>(value);
     auto const scale_ = numeric::scale_type{static_cast<int32_t>(scale)};
     std::unique_ptr<cudf::scalar> s =
-        cudf::make_fixed_point_scalar<numeric::decimal64>(value_, scale_);
+      cudf::make_fixed_point_scalar<numeric::decimal64>(value_, scale_);
     s->set_valid_async(is_valid);
     return reinterpret_cast<jlong>(s.release());
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeDecimal128Scalar(JNIEnv *env, jclass,
-                                                                        jbyteArray value,
-                                                                        jint scale,
-                                                                        jboolean is_valid) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeDecimal128Scalar(
+  JNIEnv* env, jclass, jbyteArray value, jint scale, jboolean is_valid)
+{
   try {
     cudf::jni::auto_set_device(env);
     auto const scale_ = numeric::scale_type{static_cast<int32_t>(scale)};
     cudf::jni::native_jbyteArray jbytes{env, value};
-    auto const value_ = reinterpret_cast<__int128_t *>(jbytes.data());
+    auto const value_ = reinterpret_cast<__int128_t*>(jbytes.data());
     std::unique_ptr<cudf::scalar> s =
-        cudf::make_fixed_point_scalar<numeric::decimal128>(*value_, scale_);
+      cudf::make_fixed_point_scalar<numeric::decimal128>(*value_, scale_);
     s->set_valid_async(is_valid);
     return reinterpret_cast<jlong>(s.release());
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_binaryOpSV(JNIEnv *env, jclass, jlong lhs_ptr,
-                                                              jlong rhs_view, jint int_op,
-                                                              jint out_dtype, jint scale) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_binaryOpSV(
+  JNIEnv* env, jclass, jlong lhs_ptr, jlong rhs_view, jint int_op, jint out_dtype, jint scale)
+{
   JNI_NULL_CHECK(env, lhs_ptr, "lhs is null", 0);
   JNI_NULL_CHECK(env, rhs_view, "rhs is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    cudf::scalar *lhs = reinterpret_cast<cudf::scalar *>(lhs_ptr);
-    auto rhs = reinterpret_cast<cudf::column_view *>(rhs_view);
+    cudf::scalar* lhs           = reinterpret_cast<cudf::scalar*>(lhs_ptr);
+    auto rhs                    = reinterpret_cast<cudf::column_view*>(rhs_view);
     cudf::data_type n_data_type = cudf::jni::make_data_type(out_dtype, scale);
-    cudf::binary_operator op = static_cast<cudf::binary_operator>(int_op);
+    cudf::binary_operator op    = static_cast<cudf::binary_operator>(int_op);
 
     if (lhs->type().id() == cudf::type_id::STRUCT) {
       auto out = make_fixed_width_column(n_data_type, rhs->size(), cudf::mask_state::UNALLOCATED);
@@ -512,10 +560,10 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_binaryOpSV(JNIEnv *env, jclas
         out->set_null_mask(std::move(new_mask), new_null_count);
       }
 
-      auto lhs_col = cudf::make_column_from_scalar(*lhs, 1);
+      auto lhs_col  = cudf::make_column_from_scalar(*lhs, 1);
       auto out_view = out->mutable_view();
       cudf::binops::compiled::detail::apply_sorting_struct_binary_op(
-          out_view, lhs_col->view(), *rhs, true, false, op, cudf::get_default_stream());
+        out_view, lhs_col->view(), *rhs, true, false, op, cudf::get_default_stream());
       return release_as_jlong(out);
     }
 
@@ -524,28 +572,32 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_binaryOpSV(JNIEnv *env, jclas
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeListScalar(JNIEnv *env, jclass,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeListScalar(JNIEnv* env,
+                                                                  jclass,
                                                                   jlong view_handle,
-                                                                  jboolean is_valid) {
+                                                                  jboolean is_valid)
+{
   JNI_NULL_CHECK(env, view_handle, "Column view should NOT be null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto col_view = reinterpret_cast<cudf::column_view *>(view_handle);
+    auto col_view = reinterpret_cast<cudf::column_view*>(view_handle);
 
     // Instead of calling the `cudf::empty_like` to create an empty column when `is_valid`
     // is false, always passes the input view to the scalar, to avoid copying the column
     // twice.
     // Let the Java layer make sure the view is empty when `is_valid` is false.
-    cudf::scalar *s = new cudf::list_scalar(*col_view);
+    cudf::scalar* s = new cudf::list_scalar(*col_view);
     s->set_valid_async(is_valid);
     return reinterpret_cast<jlong>(s);
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeStructScalar(JNIEnv *env, jclass,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeStructScalar(JNIEnv* env,
+                                                                    jclass,
                                                                     jlongArray handles,
-                                                                    jboolean is_valid) {
+                                                                    jboolean is_valid)
+{
   JNI_NULL_CHECK(env, handles, "native view handles are null", 0)
   try {
     cudf::jni::auto_set_device(env);
@@ -553,24 +605,29 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_makeStructScalar(JNIEnv *env,
     cudf::jni::native_jpointerArray<cudf::column_view> column_pointers(env, handles);
     std::vector<cudf::column_view> columns;
     columns.reserve(column_pointers.size());
-    std::transform(column_pointers.data(), column_pointers.data() + column_pointers.size(),
-                   std::back_inserter(columns), [](auto const &col_ptr) { return *col_ptr; });
+    std::transform(column_pointers.data(),
+                   column_pointers.data() + column_pointers.size(),
+                   std::back_inserter(columns),
+                   [](auto const& col_ptr) { return *col_ptr; });
     auto s = std::make_unique<cudf::struct_scalar>(
-        cudf::host_span<cudf::column_view const>{columns}, is_valid);
+      cudf::host_span<cudf::column_view const>{columns}, is_valid);
     return reinterpret_cast<jlong>(s.release());
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_repeatString(JNIEnv *env, jclass, jlong handle,
-                                                                jint repeat_times) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Scalar_repeatString(JNIEnv* env,
+                                                                jclass,
+                                                                jlong handle,
+                                                                jint repeat_times)
+{
   JNI_NULL_CHECK(env, handle, "scalar handle is null", 0)
   try {
     cudf::jni::auto_set_device(env);
-    auto const str = *reinterpret_cast<cudf::string_scalar *>(handle);
+    auto const str = *reinterpret_cast<cudf::string_scalar*>(handle);
     return reinterpret_cast<jlong>(cudf::strings::repeat_string(str, repeat_times).release());
   }
   CATCH_STD(env, 0);
 }
 
-} // extern "C"
+}  // extern "C"
diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
index e8616710217..e411b1d5362 100644
--- a/java/src/main/native/src/TableJni.cpp
+++ b/java/src/main/native/src/TableJni.cpp
@@ -13,10 +13,13 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include <algorithm>
+#include "csv_chunked_writer.hpp"
+#include "cudf_jni_apis.hpp"
+#include "dtype_utils.hpp"
+#include "jni_compiled_expr.hpp"
+#include "jni_utils.hpp"
+#include "jni_writer_data_sink.hpp"
 
-#include <arrow/io/api.h>
-#include <arrow/ipc/api.h>
 #include <cudf/aggregation.hpp>
 #include <cudf/column/column.hpp>
 #include <cudf/concatenate.hpp>
@@ -44,16 +47,16 @@
 #include <cudf/stream_compaction.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/span.hpp>
+
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
+
 #include <thrust/iterator/counting_iterator.h>
 
-#include "csv_chunked_writer.hpp"
-#include "cudf_jni_apis.hpp"
-#include "dtype_utils.hpp"
-#include "jni_compiled_expr.hpp"
-#include "jni_utils.hpp"
-#include "jni_writer_data_sink.hpp"
+#include <arrow/io/api.h>
+#include <arrow/ipc/api.h>
+
+#include <algorithm>
 
 namespace cudf {
 namespace jni {
@@ -67,9 +70,11 @@ namespace jni {
  */
 struct jni_table_writer_handle_base {
   explicit jni_table_writer_handle_base(
-      std::unique_ptr<jni_writer_data_sink> &&sink_,
-      std::shared_ptr<cudf::io::writer_compression_statistics> &&stats_)
-      : sink{std::move(sink_)}, stats{std::move(stats_)} {}
+    std::unique_ptr<jni_writer_data_sink>&& sink_,
+    std::shared_ptr<cudf::io::writer_compression_statistics>&& stats_)
+    : sink{std::move(sink_)}, stats{std::move(stats_)}
+  {
+  }
 
   std::unique_ptr<jni_writer_data_sink> sink;
   std::shared_ptr<cudf::io::writer_compression_statistics> stats;
@@ -77,13 +82,17 @@ struct jni_table_writer_handle_base {
 
 template <typename Writer>
 struct jni_table_writer_handle final : public jni_table_writer_handle_base {
-  explicit jni_table_writer_handle(std::unique_ptr<Writer> &&writer_)
-      : jni_table_writer_handle_base(nullptr, nullptr), writer{std::move(writer_)} {}
+  explicit jni_table_writer_handle(std::unique_ptr<Writer>&& writer_)
+    : jni_table_writer_handle_base(nullptr, nullptr), writer{std::move(writer_)}
+  {
+  }
   explicit jni_table_writer_handle(
-      std::unique_ptr<Writer> &&writer_, std::unique_ptr<jni_writer_data_sink> &&sink_,
-      std::shared_ptr<cudf::io::writer_compression_statistics> &&stats_)
-      : jni_table_writer_handle_base(std::move(sink_), std::move(stats_)),
-        writer{std::move(writer_)} {}
+    std::unique_ptr<Writer>&& writer_,
+    std::unique_ptr<jni_writer_data_sink>&& sink_,
+    std::shared_ptr<cudf::io::writer_compression_statistics>&& stats_)
+    : jni_table_writer_handle_base(std::move(sink_), std::move(stats_)), writer{std::move(writer_)}
+  {
+  }
 
   std::unique_ptr<Writer> writer;
 };
@@ -92,16 +101,20 @@ typedef jni_table_writer_handle<cudf::io::parquet_chunked_writer> native_parquet
 typedef jni_table_writer_handle<cudf::io::orc_chunked_writer> native_orc_writer_handle;
 
 class native_arrow_ipc_writer_handle final {
-public:
-  explicit native_arrow_ipc_writer_handle(const std::vector<std::string> &col_names,
-                                          const std::string &file_name)
-      : initialized(false), column_names(col_names), file_name(file_name) {}
+ public:
+  explicit native_arrow_ipc_writer_handle(const std::vector<std::string>& col_names,
+                                          const std::string& file_name)
+    : initialized(false), column_names(col_names), file_name(file_name)
+  {
+  }
 
-  explicit native_arrow_ipc_writer_handle(const std::vector<std::string> &col_names,
-                                          const std::shared_ptr<arrow::io::OutputStream> &sink)
-      : initialized(false), column_names(col_names), file_name(""), sink(sink) {}
+  explicit native_arrow_ipc_writer_handle(const std::vector<std::string>& col_names,
+                                          const std::shared_ptr<arrow::io::OutputStream>& sink)
+    : initialized(false), column_names(col_names), file_name(""), sink(sink)
+  {
+  }
 
-private:
+ private:
   bool initialized;
   std::vector<std::string> column_names;
   std::vector<cudf::column_metadata> columns_meta;
@@ -109,23 +122,20 @@ class native_arrow_ipc_writer_handle final {
   std::shared_ptr<arrow::io::OutputStream> sink;
   std::shared_ptr<arrow::ipc::RecordBatchWriter> writer;
 
-public:
-  void write(std::shared_ptr<arrow::Table> &arrow_tab, int64_t max_chunk) {
+ public:
+  void write(std::shared_ptr<arrow::Table>& arrow_tab, int64_t max_chunk)
+  {
     if (!initialized) {
       if (!sink) {
         auto tmp_sink = arrow::io::FileOutputStream::Open(file_name);
-        if (!tmp_sink.ok()) {
-          throw std::runtime_error(tmp_sink.status().message());
-        }
+        if (!tmp_sink.ok()) { throw std::runtime_error(tmp_sink.status().message()); }
         sink = *tmp_sink;
       }
 
       // There is an option to have a file writer too, with metadata
       auto tmp_writer = arrow::ipc::MakeStreamWriter(sink, arrow_tab->schema());
-      if (!tmp_writer.ok()) {
-        throw std::runtime_error(tmp_writer.status().message());
-      }
-      writer = *tmp_writer;
+      if (!tmp_writer.ok()) { throw std::runtime_error(tmp_writer.status().message()); }
+      writer      = *tmp_writer;
       initialized = true;
     }
     if (arrow_tab->num_rows() == 0) {
@@ -133,7 +143,7 @@ class native_arrow_ipc_writer_handle final {
       // empty table, so need to write an empty batch explicitly.
       // For more please see https://issues.apache.org/jira/browse/ARROW-17912.
       auto empty_batch = arrow::RecordBatch::MakeEmpty(arrow_tab->schema());
-      auto status = writer->WriteRecordBatch(*(*empty_batch));
+      auto status      = writer->WriteRecordBatch(*(*empty_batch));
       if (!status.ok()) {
         throw std::runtime_error("writer failed to write batch with the following error: " +
                                  status.ToString());
@@ -147,7 +157,8 @@ class native_arrow_ipc_writer_handle final {
     }
   }
 
-  void close() {
+  void close()
+  {
     if (initialized) {
       {
         auto status = writer->Close();
@@ -167,7 +178,8 @@ class native_arrow_ipc_writer_handle final {
     initialized = false;
   }
 
-  std::vector<cudf::column_metadata> get_column_metadata(const cudf::table_view &tview) {
+  std::vector<cudf::column_metadata> get_column_metadata(const cudf::table_view& tview)
+  {
     if (!column_names.empty() && columns_meta.empty()) {
       // Rebuild the structure of column meta according to table schema.
       // All the tables written by this writer should share the same schema,
@@ -187,13 +199,13 @@ class native_arrow_ipc_writer_handle final {
     return columns_meta;
   }
 
-private:
-  cudf::column_metadata build_one_column_meta(const cudf::column_view &cview, size_t &idx,
-                                              const bool consume_name = true) {
+ private:
+  cudf::column_metadata build_one_column_meta(const cudf::column_view& cview,
+                                              size_t& idx,
+                                              const bool consume_name = true)
+  {
     auto col_meta = cudf::column_metadata{};
-    if (consume_name) {
-      col_meta.name = get_column_name(idx++);
-    }
+    if (consume_name) { col_meta.name = get_column_name(idx++); }
     // Process children
     if (cview.type().id() == cudf::type_id::LIST) {
       // list type:
@@ -213,7 +225,8 @@ class native_arrow_ipc_writer_handle final {
     return col_meta;
   }
 
-  std::string &get_column_name(const size_t idx) {
+  std::string& get_column_name(const size_t idx)
+  {
     if (idx < 0 || idx >= column_names.size()) {
       throw cudf::jni::jni_exception("Missing names for columns or nested struct columns");
     }
@@ -222,49 +235,47 @@ class native_arrow_ipc_writer_handle final {
 };
 
 class jni_arrow_output_stream final : public arrow::io::OutputStream {
-public:
-  explicit jni_arrow_output_stream(JNIEnv *env, jobject callback, jobject host_memory_allocator) {
-    if (env->GetJavaVM(&jvm) < 0) {
-      throw std::runtime_error("GetJavaVM failed");
-    }
+ public:
+  explicit jni_arrow_output_stream(JNIEnv* env, jobject callback, jobject host_memory_allocator)
+  {
+    if (env->GetJavaVM(&jvm) < 0) { throw std::runtime_error("GetJavaVM failed"); }
 
     jclass cls = env->GetObjectClass(callback);
-    if (cls == nullptr) {
-      throw cudf::jni::jni_exception("class not found");
-    }
+    if (cls == nullptr) { throw cudf::jni::jni_exception("class not found"); }
 
     handle_buffer_method =
-        env->GetMethodID(cls, "handleBuffer", "(Lai/rapids/cudf/HostMemoryBuffer;J)V");
-    if (handle_buffer_method == nullptr) {
-      throw cudf::jni::jni_exception("handleBuffer method");
-    }
-    this->callback = add_global_ref(env, callback);
+      env->GetMethodID(cls, "handleBuffer", "(Lai/rapids/cudf/HostMemoryBuffer;J)V");
+    if (handle_buffer_method == nullptr) { throw cudf::jni::jni_exception("handleBuffer method"); }
+    this->callback              = add_global_ref(env, callback);
     this->host_memory_allocator = add_global_ref(env, host_memory_allocator);
   }
 
-  virtual ~jni_arrow_output_stream() {
+  virtual ~jni_arrow_output_stream()
+  {
     // This should normally be called by a JVM thread. If the JVM environment is missing then this
     // is likely being triggered by the C++ runtime during shutdown. In that case the JVM may
     // already be destroyed and this thread should not try to attach to get an environment.
-    JNIEnv *env = nullptr;
-    if (jvm->GetEnv(reinterpret_cast<void **>(&env), cudf::jni::MINIMUM_JNI_VERSION) == JNI_OK) {
-      callback = del_global_ref(env, callback);
-      current_buffer = del_global_ref(env, current_buffer);
+    JNIEnv* env = nullptr;
+    if (jvm->GetEnv(reinterpret_cast<void**>(&env), cudf::jni::MINIMUM_JNI_VERSION) == JNI_OK) {
+      callback              = del_global_ref(env, callback);
+      current_buffer        = del_global_ref(env, current_buffer);
       host_memory_allocator = del_global_ref(env, host_memory_allocator);
     }
-    callback = nullptr;
-    current_buffer = nullptr;
+    callback              = nullptr;
+    current_buffer        = nullptr;
     host_memory_allocator = nullptr;
   }
 
-  arrow::Status Write(const std::shared_ptr<arrow::Buffer> &data) override {
+  arrow::Status Write(const std::shared_ptr<arrow::Buffer>& data) override
+  {
     return Write(data->data(), data->size());
   }
 
-  arrow::Status Write(const void *data, int64_t nbytes) override {
-    JNIEnv *env = cudf::jni::get_jni_env(jvm);
-    int64_t left_to_copy = nbytes;
-    const char *copy_from = static_cast<const char *>(data);
+  arrow::Status Write(const void* data, int64_t nbytes) override
+  {
+    JNIEnv* env           = cudf::jni::get_jni_env(jvm);
+    int64_t left_to_copy  = nbytes;
+    const char* copy_from = static_cast<const char*>(data);
     while (left_to_copy > 0) {
       long buffer_amount_available = current_buffer_len - current_buffer_written;
       if (buffer_amount_available <= 0) {
@@ -273,8 +284,8 @@ class jni_arrow_output_stream final : public arrow::io::OutputStream {
         buffer_amount_available = current_buffer_len - current_buffer_written;
       }
       long amount_to_copy =
-          left_to_copy < buffer_amount_available ? left_to_copy : buffer_amount_available;
-      char *copy_to = current_buffer_data + current_buffer_written;
+        left_to_copy < buffer_amount_available ? left_to_copy : buffer_amount_available;
+      char* copy_to = current_buffer_data + current_buffer_written;
 
       std::memcpy(copy_to, copy_from, amount_to_copy);
       copy_from = copy_from + amount_to_copy;
@@ -285,25 +296,28 @@ class jni_arrow_output_stream final : public arrow::io::OutputStream {
     return arrow::Status::OK();
   }
 
-  arrow::Status Flush() override {
+  arrow::Status Flush() override
+  {
     if (current_buffer_written > 0) {
-      JNIEnv *env = cudf::jni::get_jni_env(jvm);
+      JNIEnv* env = cudf::jni::get_jni_env(jvm);
       handle_buffer(env, current_buffer, current_buffer_written);
-      current_buffer = del_global_ref(env, current_buffer);
-      current_buffer_len = 0;
-      current_buffer_data = nullptr;
+      current_buffer         = del_global_ref(env, current_buffer);
+      current_buffer_len     = 0;
+      current_buffer_data    = nullptr;
       current_buffer_written = 0;
     }
     return arrow::Status::OK();
   }
 
-  arrow::Status Close() override {
-    auto ret = Flush();
+  arrow::Status Close() override
+  {
+    auto ret  = Flush();
     is_closed = true;
     return ret;
   }
 
-  arrow::Status Abort() override {
+  arrow::Status Abort() override
+  {
     is_closed = true;
     return arrow::Status::OK();
   }
@@ -312,99 +326,93 @@ class jni_arrow_output_stream final : public arrow::io::OutputStream {
 
   bool closed() const override { return is_closed; }
 
-private:
-  void rotate_buffer(JNIEnv *env) {
-    if (current_buffer != nullptr) {
-      handle_buffer(env, current_buffer, current_buffer_written);
-    }
-    current_buffer = del_global_ref(env, current_buffer);
-    jobject tmp_buffer = allocate_host_buffer(env, alloc_size, true, host_memory_allocator);
-    current_buffer = add_global_ref(env, tmp_buffer);
-    current_buffer_len = get_host_buffer_length(env, current_buffer);
-    current_buffer_data = reinterpret_cast<char *>(get_host_buffer_address(env, current_buffer));
+ private:
+  void rotate_buffer(JNIEnv* env)
+  {
+    if (current_buffer != nullptr) { handle_buffer(env, current_buffer, current_buffer_written); }
+    current_buffer         = del_global_ref(env, current_buffer);
+    jobject tmp_buffer     = allocate_host_buffer(env, alloc_size, true, host_memory_allocator);
+    current_buffer         = add_global_ref(env, tmp_buffer);
+    current_buffer_len     = get_host_buffer_length(env, current_buffer);
+    current_buffer_data    = reinterpret_cast<char*>(get_host_buffer_address(env, current_buffer));
     current_buffer_written = 0;
   }
 
-  void handle_buffer(JNIEnv *env, jobject buffer, jlong len) {
+  void handle_buffer(JNIEnv* env, jobject buffer, jlong len)
+  {
     env->CallVoidMethod(callback, handle_buffer_method, buffer, len);
-    if (env->ExceptionCheck()) {
-      throw std::runtime_error("handleBuffer threw an exception");
-    }
+    if (env->ExceptionCheck()) { throw std::runtime_error("handleBuffer threw an exception"); }
   }
 
-  JavaVM *jvm;
+  JavaVM* jvm;
   jobject callback;
   jmethodID handle_buffer_method;
-  jobject current_buffer = nullptr;
-  char *current_buffer_data = nullptr;
-  long current_buffer_len = 0;
+  jobject current_buffer      = nullptr;
+  char* current_buffer_data   = nullptr;
+  long current_buffer_len     = 0;
   long current_buffer_written = 0;
-  int64_t total_written = 0;
-  long alloc_size = MINIMUM_WRITE_BUFFER_SIZE;
-  bool is_closed = false;
+  int64_t total_written       = 0;
+  long alloc_size             = MINIMUM_WRITE_BUFFER_SIZE;
+  bool is_closed              = false;
   jobject host_memory_allocator;
 };
 
 class jni_arrow_input_stream final : public arrow::io::InputStream {
-public:
-  explicit jni_arrow_input_stream(JNIEnv *env, jobject callback)
-      : mm(arrow::default_cpu_memory_manager()) {
-    if (env->GetJavaVM(&jvm) < 0) {
-      throw std::runtime_error("GetJavaVM failed");
-    }
+ public:
+  explicit jni_arrow_input_stream(JNIEnv* env, jobject callback)
+    : mm(arrow::default_cpu_memory_manager())
+  {
+    if (env->GetJavaVM(&jvm) < 0) { throw std::runtime_error("GetJavaVM failed"); }
 
     jclass cls = env->GetObjectClass(callback);
-    if (cls == nullptr) {
-      throw cudf::jni::jni_exception("class not found");
-    }
+    if (cls == nullptr) { throw cudf::jni::jni_exception("class not found"); }
 
     read_into_method = env->GetMethodID(cls, "readInto", "(JJ)J");
-    if (read_into_method == nullptr) {
-      throw cudf::jni::jni_exception("readInto method");
-    }
+    if (read_into_method == nullptr) { throw cudf::jni::jni_exception("readInto method"); }
 
     this->callback = add_global_ref(env, callback);
   }
 
-  virtual ~jni_arrow_input_stream() {
+  virtual ~jni_arrow_input_stream()
+  {
     // This should normally be called by a JVM thread. If the JVM environment is missing then this
     // is likely being triggered by the C++ runtime during shutdown. In that case the JVM may
     // already be destroyed and this thread should not try to attach to get an environment.
-    JNIEnv *env = nullptr;
-    if (jvm->GetEnv(reinterpret_cast<void **>(&env), cudf::jni::MINIMUM_JNI_VERSION) == JNI_OK) {
+    JNIEnv* env = nullptr;
+    if (jvm->GetEnv(reinterpret_cast<void**>(&env), cudf::jni::MINIMUM_JNI_VERSION) == JNI_OK) {
       callback = del_global_ref(env, callback);
     }
     callback = nullptr;
   }
 
-  arrow::Result<int64_t> Read(int64_t nbytes, void *out) override {
-    JNIEnv *env = cudf::jni::get_jni_env(jvm);
-    jlong ret = read_into(env, ptr_as_jlong(out), nbytes);
+  arrow::Result<int64_t> Read(int64_t nbytes, void* out) override
+  {
+    JNIEnv* env = cudf::jni::get_jni_env(jvm);
+    jlong ret   = read_into(env, ptr_as_jlong(out), nbytes);
     total_read += ret;
     return ret;
   }
 
-  arrow::Result<std::shared_ptr<arrow::Buffer>> Read(int64_t nbytes) override {
-    JNIEnv *env = cudf::jni::get_jni_env(jvm);
+  arrow::Result<std::shared_ptr<arrow::Buffer>> Read(int64_t nbytes) override
+  {
+    JNIEnv* env = cudf::jni::get_jni_env(jvm);
     arrow::Result<std::shared_ptr<arrow::ResizableBuffer>> tmp_buffer =
-        arrow::AllocateResizableBuffer(nbytes);
-    if (!tmp_buffer.ok()) {
-      return tmp_buffer;
-    }
-    jlong amount_read = read_into(env, ptr_as_jlong((*tmp_buffer)->data()), nbytes);
+      arrow::AllocateResizableBuffer(nbytes);
+    if (!tmp_buffer.ok()) { return tmp_buffer; }
+    jlong amount_read  = read_into(env, ptr_as_jlong((*tmp_buffer)->data()), nbytes);
     arrow::Status stat = (*tmp_buffer)->Resize(amount_read);
-    if (!stat.ok()) {
-      return stat;
-    }
+    if (!stat.ok()) { return stat; }
     return tmp_buffer;
   }
 
-  arrow::Status Close() override {
+  arrow::Status Close() override
+  {
     is_closed = true;
     return arrow::Status::OK();
   }
 
-  arrow::Status Abort() override {
+  arrow::Status Abort() override
+  {
     is_closed = true;
     return arrow::Status::OK();
   }
@@ -413,57 +421,51 @@ class jni_arrow_input_stream final : public arrow::io::InputStream {
 
   bool closed() const override { return is_closed; }
 
-private:
-  jlong read_into(JNIEnv *env, jlong addr, jlong len) {
+ private:
+  jlong read_into(JNIEnv* env, jlong addr, jlong len)
+  {
     jlong ret = env->CallLongMethod(callback, read_into_method, addr, len);
-    if (env->ExceptionCheck()) {
-      throw std::runtime_error("readInto threw an exception");
-    }
+    if (env->ExceptionCheck()) { throw std::runtime_error("readInto threw an exception"); }
     return ret;
   }
 
-  JavaVM *jvm;
+  JavaVM* jvm;
   jobject callback;
   jmethodID read_into_method;
   int64_t total_read = 0;
-  bool is_closed = false;
+  bool is_closed     = false;
   std::vector<uint8_t> tmp_buffer;
   std::shared_ptr<arrow::MemoryManager> mm;
 };
 
 class native_arrow_ipc_reader_handle final {
-public:
-  explicit native_arrow_ipc_reader_handle(const std::string &file_name) {
+ public:
+  explicit native_arrow_ipc_reader_handle(const std::string& file_name)
+  {
     auto tmp_source = arrow::io::ReadableFile::Open(file_name);
-    if (!tmp_source.ok()) {
-      throw std::runtime_error(tmp_source.status().message());
-    }
-    source = *tmp_source;
+    if (!tmp_source.ok()) { throw std::runtime_error(tmp_source.status().message()); }
+    source          = *tmp_source;
     auto tmp_reader = arrow::ipc::RecordBatchStreamReader::Open(source);
-    if (!tmp_reader.ok()) {
-      throw std::runtime_error(tmp_reader.status().message());
-    }
+    if (!tmp_reader.ok()) { throw std::runtime_error(tmp_reader.status().message()); }
     reader = *tmp_reader;
   }
 
   explicit native_arrow_ipc_reader_handle(std::shared_ptr<arrow::io::InputStream> source)
-      : source(source) {
+    : source(source)
+  {
     auto tmp_reader = arrow::ipc::RecordBatchStreamReader::Open(source);
-    if (!tmp_reader.ok()) {
-      throw std::runtime_error(tmp_reader.status().message());
-    }
+    if (!tmp_reader.ok()) { throw std::runtime_error(tmp_reader.status().message()); }
     reader = *tmp_reader;
   }
 
-  std::shared_ptr<arrow::Table> next(int32_t row_target) {
+  std::shared_ptr<arrow::Table> next(int32_t row_target)
+  {
     int64_t total_rows = 0;
-    bool done = false;
+    bool done          = false;
     std::vector<std::shared_ptr<arrow::RecordBatch>> batches;
     while (!done) {
       arrow::Result<std::shared_ptr<arrow::RecordBatch>> batch = reader->Next();
-      if (!batch.ok()) {
-        throw std::runtime_error(batch.status().message());
-      }
+      if (!batch.ok()) { throw std::runtime_error(batch.status().message()); }
       if (!*batch) {
         done = true;
       } else {
@@ -477,17 +479,16 @@ class native_arrow_ipc_reader_handle final {
       return std::unique_ptr<arrow::Table>();
     }
     arrow::Result<std::shared_ptr<arrow::Table>> tmp =
-        arrow::Table::FromRecordBatches(reader->schema(), batches);
-    if (!tmp.ok()) {
-      throw std::runtime_error(tmp.status().message());
-    }
+      arrow::Table::FromRecordBatches(reader->schema(), batches);
+    if (!tmp.ok()) { throw std::runtime_error(tmp.status().message()); }
     return *tmp;
   }
 
   std::shared_ptr<arrow::io::InputStream> source;
   std::shared_ptr<arrow::ipc::RecordBatchReader> reader;
 
-  void close() {
+  void close()
+  {
     auto status = source->Close();
     if (!status.ok()) {
       throw std::runtime_error("Closing source failed with the following error: " +
@@ -496,33 +497,44 @@ class native_arrow_ipc_reader_handle final {
   }
 };
 
-jlongArray convert_table_for_return(JNIEnv *env, std::unique_ptr<cudf::table> &&table_result,
-                                    std::vector<std::unique_ptr<cudf::column>> &&extra_columns) {
+jlongArray convert_table_for_return(JNIEnv* env,
+                                    std::unique_ptr<cudf::table>&& table_result,
+                                    std::vector<std::unique_ptr<cudf::column>>&& extra_columns)
+{
   std::vector<std::unique_ptr<cudf::column>> ret = table_result->release();
-  int table_cols = ret.size();
-  int num_columns = table_cols + extra_columns.size();
+  int table_cols                                 = ret.size();
+  int num_columns                                = table_cols + extra_columns.size();
   cudf::jni::native_jlongArray outcol_handles(env, num_columns);
-  std::transform(ret.begin(), ret.end(), outcol_handles.begin(),
-                 [](auto &col) { return release_as_jlong(col); });
-  std::transform(extra_columns.begin(), extra_columns.end(), outcol_handles.begin() + table_cols,
-                 [](auto &col) { return release_as_jlong(col); });
+  std::transform(ret.begin(), ret.end(), outcol_handles.begin(), [](auto& col) {
+    return release_as_jlong(col);
+  });
+  std::transform(
+    extra_columns.begin(), extra_columns.end(), outcol_handles.begin() + table_cols, [](auto& col) {
+      return release_as_jlong(col);
+    });
   return outcol_handles.get_jArray();
 }
 
-jlongArray convert_table_for_return(JNIEnv *env, std::unique_ptr<cudf::table> &table_result,
-                                    std::vector<std::unique_ptr<cudf::column>> &&extra_columns) {
+jlongArray convert_table_for_return(JNIEnv* env,
+                                    std::unique_ptr<cudf::table>& table_result,
+                                    std::vector<std::unique_ptr<cudf::column>>&& extra_columns)
+{
   return convert_table_for_return(env, std::move(table_result), std::move(extra_columns));
 }
 
-jlongArray convert_table_for_return(JNIEnv *env, std::unique_ptr<cudf::table> &first_table,
-                                    std::unique_ptr<cudf::table> &second_table) {
+jlongArray convert_table_for_return(JNIEnv* env,
+                                    std::unique_ptr<cudf::table>& first_table,
+                                    std::unique_ptr<cudf::table>& second_table)
+{
   return convert_table_for_return(env, first_table, second_table->release());
 }
 
 // Convert the JNI boolean array of key column sort order to a vector of cudf::order
 // for groupby.
-std::vector<cudf::order> resolve_column_order(JNIEnv *env, jbooleanArray jkeys_sort_desc,
-                                              int key_size) {
+std::vector<cudf::order> resolve_column_order(JNIEnv* env,
+                                              jbooleanArray jkeys_sort_desc,
+                                              int key_size)
+{
   cudf::jni::native_jbooleanArray keys_sort_desc(env, jkeys_sort_desc);
   auto keys_sort_num = keys_sort_desc.size();
   // The number of column order should be 0 or equal to the number of key.
@@ -532,18 +544,21 @@ std::vector<cudf::order> resolve_column_order(JNIEnv *env, jbooleanArray jkeys_s
 
   std::vector<cudf::order> column_order(keys_sort_num);
   if (keys_sort_num > 0) {
-    std::transform(keys_sort_desc.data(), keys_sort_desc.data() + keys_sort_num,
-                   column_order.begin(), [](jboolean is_desc) {
-                     return is_desc ? cudf::order::DESCENDING : cudf::order::ASCENDING;
-                   });
+    std::transform(
+      keys_sort_desc.data(),
+      keys_sort_desc.data() + keys_sort_num,
+      column_order.begin(),
+      [](jboolean is_desc) { return is_desc ? cudf::order::DESCENDING : cudf::order::ASCENDING; });
   }
   return column_order;
 }
 
 // Convert the JNI boolean array of key column null order to a vector of cudf::null_order
 // for groupby.
-std::vector<cudf::null_order> resolve_null_precedence(JNIEnv *env, jbooleanArray jkeys_null_first,
-                                                      int key_size) {
+std::vector<cudf::null_order> resolve_null_precedence(JNIEnv* env,
+                                                      jbooleanArray jkeys_null_first,
+                                                      int key_size)
+{
   cudf::jni::native_jbooleanArray keys_null_first(env, jkeys_null_first);
   auto null_order_num = keys_null_first.size();
   // The number of null order should be 0 or equal to the number of key.
@@ -553,8 +568,10 @@ std::vector<cudf::null_order> resolve_null_precedence(JNIEnv *env, jbooleanArray
 
   std::vector<cudf::null_order> null_precedence(null_order_num);
   if (null_order_num > 0) {
-    std::transform(keys_null_first.data(), keys_null_first.data() + null_order_num,
-                   null_precedence.begin(), [](jboolean null_before) {
+    std::transform(keys_null_first.data(),
+                   keys_null_first.data() + null_order_num,
+                   null_precedence.begin(),
+                   [](jboolean null_before) {
                      return null_before ? cudf::null_order::BEFORE : cudf::null_order::AFTER;
                    });
   }
@@ -563,49 +580,63 @@ std::vector<cudf::null_order> resolve_null_precedence(JNIEnv *env, jbooleanArray
 
 namespace {
 
-int set_column_metadata(
-    cudf::io::column_in_metadata &column_metadata, std::vector<std::string> &col_names,
-    cudf::jni::native_jbooleanArray &nullability, cudf::jni::native_jbooleanArray &is_int96,
-    cudf::jni::native_jintArray &precisions, cudf::jni::native_jbooleanArray &is_map,
-    cudf::jni::native_jbooleanArray &hasParquetFieldIds,
-    cudf::jni::native_jintArray &parquetFieldIds, cudf::jni::native_jintArray &children,
-    int num_children, int read_index, cudf::jni::native_jbooleanArray &is_binary) {
+int set_column_metadata(cudf::io::column_in_metadata& column_metadata,
+                        std::vector<std::string>& col_names,
+                        cudf::jni::native_jbooleanArray& nullability,
+                        cudf::jni::native_jbooleanArray& is_int96,
+                        cudf::jni::native_jintArray& precisions,
+                        cudf::jni::native_jbooleanArray& is_map,
+                        cudf::jni::native_jbooleanArray& hasParquetFieldIds,
+                        cudf::jni::native_jintArray& parquetFieldIds,
+                        cudf::jni::native_jintArray& children,
+                        int num_children,
+                        int read_index,
+                        cudf::jni::native_jbooleanArray& is_binary)
+{
   int write_index = 0;
   for (int i = 0; i < num_children; i++, write_index++) {
     cudf::io::column_in_metadata child;
     child.set_name(col_names[read_index]).set_nullability(nullability[read_index]);
-    if (precisions[read_index] > -1) {
-      child.set_decimal_precision(precisions[read_index]);
-    }
-    if (!is_int96.is_null()) {
-      child.set_int96_timestamps(is_int96[read_index]);
-    }
-    if (!is_binary.is_null()) {
-      child.set_output_as_binary(is_binary[read_index]);
-    }
-    if (is_map[read_index]) {
-      child.set_list_column_as_map();
-    }
+    if (precisions[read_index] > -1) { child.set_decimal_precision(precisions[read_index]); }
+    if (!is_int96.is_null()) { child.set_int96_timestamps(is_int96[read_index]); }
+    if (!is_binary.is_null()) { child.set_output_as_binary(is_binary[read_index]); }
+    if (is_map[read_index]) { child.set_list_column_as_map(); }
     if (!parquetFieldIds.is_null() && hasParquetFieldIds[read_index]) {
       child.set_parquet_field_id(parquetFieldIds[read_index]);
     }
     column_metadata.add_child(child);
     int childs_children = children[read_index++];
     if (childs_children > 0) {
-      read_index = set_column_metadata(
-          column_metadata.child(write_index), col_names, nullability, is_int96, precisions, is_map,
-          hasParquetFieldIds, parquetFieldIds, children, childs_children, read_index, is_binary);
+      read_index = set_column_metadata(column_metadata.child(write_index),
+                                       col_names,
+                                       nullability,
+                                       is_int96,
+                                       precisions,
+                                       is_map,
+                                       hasParquetFieldIds,
+                                       parquetFieldIds,
+                                       children,
+                                       childs_children,
+                                       read_index,
+                                       is_binary);
     }
   }
   return read_index;
 }
 
-void createTableMetaData(JNIEnv *env, jint num_children, jobjectArray &j_col_names,
-                         jintArray &j_children, jbooleanArray &j_col_nullability,
-                         jbooleanArray &j_is_int96, jintArray &j_precisions,
-                         jbooleanArray &j_is_map, cudf::io::table_input_metadata &metadata,
-                         jbooleanArray &j_hasParquetFieldIds, jintArray &j_parquetFieldIds,
-                         jbooleanArray &j_is_binary) {
+void createTableMetaData(JNIEnv* env,
+                         jint num_children,
+                         jobjectArray& j_col_names,
+                         jintArray& j_children,
+                         jbooleanArray& j_col_nullability,
+                         jbooleanArray& j_is_int96,
+                         jintArray& j_precisions,
+                         jbooleanArray& j_is_map,
+                         cudf::io::table_input_metadata& metadata,
+                         jbooleanArray& j_hasParquetFieldIds,
+                         jintArray& j_parquetFieldIds,
+                         jbooleanArray& j_is_binary)
+{
   cudf::jni::auto_set_device(env);
   cudf::jni::native_jstringArray col_names(env, j_col_names);
   cudf::jni::native_jbooleanArray col_nullability(env, j_col_nullability);
@@ -622,11 +653,11 @@ void createTableMetaData(JNIEnv *env, jint num_children, jobjectArray &j_col_nam
   int top_level_children = num_children;
 
   metadata.column_metadata.resize(top_level_children);
-  int read_index = 0; // the read_index, which will be used to read the arrays
+  int read_index = 0;  // the read_index, which will be used to read the arrays
   for (int i = read_index, write_index = 0; i < top_level_children; i++, write_index++) {
     metadata.column_metadata[write_index]
-        .set_name(cpp_names[read_index])
-        .set_nullability(col_nullability[read_index]);
+      .set_name(cpp_names[read_index])
+      .set_nullability(col_nullability[read_index]);
     if (precisions[read_index] > -1) {
       metadata.column_metadata[write_index].set_decimal_precision(precisions[read_index]);
     }
@@ -636,37 +667,46 @@ void createTableMetaData(JNIEnv *env, jint num_children, jobjectArray &j_col_nam
     if (!is_binary.is_null()) {
       metadata.column_metadata[write_index].set_output_as_binary(is_binary[read_index]);
     }
-    if (is_map[read_index]) {
-      metadata.column_metadata[write_index].set_list_column_as_map();
-    }
+    if (is_map[read_index]) { metadata.column_metadata[write_index].set_list_column_as_map(); }
     if (!parquetFieldIds.is_null() && hasParquetFieldIds[read_index]) {
       metadata.column_metadata[write_index].set_parquet_field_id(parquetFieldIds[read_index]);
     }
     int childs_children = children[read_index++];
     if (childs_children > 0) {
-      read_index =
-          set_column_metadata(metadata.column_metadata[write_index], cpp_names, col_nullability,
-                              is_int96, precisions, is_map, hasParquetFieldIds, parquetFieldIds,
-                              children, childs_children, read_index, is_binary);
+      read_index = set_column_metadata(metadata.column_metadata[write_index],
+                                       cpp_names,
+                                       col_nullability,
+                                       is_int96,
+                                       precisions,
+                                       is_map,
+                                       hasParquetFieldIds,
+                                       parquetFieldIds,
+                                       children,
+                                       childs_children,
+                                       read_index,
+                                       is_binary);
     }
   }
 }
 
 // Check that window parameters are valid.
-bool valid_window_parameters(native_jintArray const &values,
-                             native_jpointerArray<cudf::aggregation> const &ops,
-                             native_jintArray const &min_periods, native_jintArray const &preceding,
-                             native_jintArray const &following) {
+bool valid_window_parameters(native_jintArray const& values,
+                             native_jpointerArray<cudf::aggregation> const& ops,
+                             native_jintArray const& min_periods,
+                             native_jintArray const& preceding,
+                             native_jintArray const& following)
+{
   return values.size() == ops.size() && values.size() == min_periods.size() &&
          values.size() == preceding.size() && values.size() == following.size();
 }
 
 // Check that window parameters are valid.
-bool valid_window_parameters(native_jintArray const &values,
-                             native_jpointerArray<cudf::aggregation> const &ops,
-                             native_jintArray const &min_periods,
-                             native_jpointerArray<cudf::scalar> const &preceding,
-                             native_jpointerArray<cudf::scalar> const &following) {
+bool valid_window_parameters(native_jintArray const& values,
+                             native_jpointerArray<cudf::aggregation> const& ops,
+                             native_jintArray const& min_periods,
+                             native_jpointerArray<cudf::scalar> const& preceding,
+                             native_jpointerArray<cudf::scalar> const& following)
+{
   return values.size() == ops.size() && values.size() == min_periods.size() &&
          values.size() == preceding.size() && values.size() == following.size();
 }
@@ -678,12 +718,13 @@ bool valid_window_parameters(native_jintArray const &values,
 //   2: Host address of the rmm::device_buffer instance that owns the left gather map data
 //   3: Device address of the gather map for the right table
 //   4: Host address of the rmm::device_buffer instance that owns the right gather map data
-jlongArray gather_maps_to_java(JNIEnv *env,
-                               std::pair<std::unique_ptr<rmm::device_uvector<cudf::size_type>>,
-                                         std::unique_ptr<rmm::device_uvector<cudf::size_type>>>
-                                   maps) {
+jlongArray gather_maps_to_java(
+  JNIEnv* env,
+  std::pair<std::unique_ptr<rmm::device_uvector<cudf::size_type>>,
+            std::unique_ptr<rmm::device_uvector<cudf::size_type>>> maps)
+{
   // release the underlying device buffer to Java
-  auto left_map_buffer = std::make_unique<rmm::device_buffer>(maps.first->release());
+  auto left_map_buffer  = std::make_unique<rmm::device_buffer>(maps.first->release());
   auto right_map_buffer = std::make_unique<rmm::device_buffer>(maps.second->release());
   cudf::jni::native_jlongArray result(env, 5);
   result[0] = static_cast<jlong>(left_map_buffer->size());
@@ -699,27 +740,29 @@ jlongArray gather_maps_to_java(JNIEnv *env,
 //   0: Size of the gather map in bytes
 //   1: Device address of the gather map
 //   2: Host address of the rmm::device_buffer instance that owns the gather map data
-jlongArray gather_map_to_java(JNIEnv *env,
-                              std::unique_ptr<rmm::device_uvector<cudf::size_type>> map) {
+jlongArray gather_map_to_java(JNIEnv* env,
+                              std::unique_ptr<rmm::device_uvector<cudf::size_type>> map)
+{
   // release the underlying device buffer to Java
   cudf::jni::native_jlongArray result(env, 3);
-  result[0] = static_cast<jlong>(map->size() * sizeof(cudf::size_type));
+  result[0]              = static_cast<jlong>(map->size() * sizeof(cudf::size_type));
   auto gather_map_buffer = std::make_unique<rmm::device_buffer>(map->release());
-  result[1] = ptr_as_jlong(gather_map_buffer->data());
-  result[2] = release_as_jlong(gather_map_buffer);
+  result[1]              = ptr_as_jlong(gather_map_buffer->data());
+  result[2]              = release_as_jlong(gather_map_buffer);
   return result.get_jArray();
 }
 
 // Generate gather maps needed to manifest the result of an equi-join between two tables.
 template <typename T>
-jlongArray join_gather_maps(JNIEnv *env, jlong j_left_keys, jlong j_right_keys,
-                            jboolean compare_nulls_equal, T join_func) {
+jlongArray join_gather_maps(
+  JNIEnv* env, jlong j_left_keys, jlong j_right_keys, jboolean compare_nulls_equal, T join_func)
+{
   JNI_NULL_CHECK(env, j_left_keys, "left_table is null", NULL);
   JNI_NULL_CHECK(env, j_right_keys, "right_table is null", NULL);
   try {
     cudf::jni::auto_set_device(env);
-    auto left_keys = reinterpret_cast<cudf::table_view const *>(j_left_keys);
-    auto right_keys = reinterpret_cast<cudf::table_view const *>(j_right_keys);
+    auto left_keys  = reinterpret_cast<cudf::table_view const*>(j_left_keys);
+    auto right_keys = reinterpret_cast<cudf::table_view const*>(j_right_keys);
     auto nulleq = compare_nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL;
     return gather_maps_to_java(env, join_func(*left_keys, *right_keys, nulleq));
   }
@@ -729,14 +772,17 @@ jlongArray join_gather_maps(JNIEnv *env, jlong j_left_keys, jlong j_right_keys,
 // Generate gather maps needed to manifest the result of an equi-join between a left table and
 // a hash table built from the join's right table.
 template <typename T>
-jlongArray hash_join_gather_maps(JNIEnv *env, jlong j_left_keys, jlong j_right_hash_join,
-                                 T join_func) {
+jlongArray hash_join_gather_maps(JNIEnv* env,
+                                 jlong j_left_keys,
+                                 jlong j_right_hash_join,
+                                 T join_func)
+{
   JNI_NULL_CHECK(env, j_left_keys, "left table is null", NULL);
   JNI_NULL_CHECK(env, j_right_hash_join, "hash join is null", NULL);
   try {
     cudf::jni::auto_set_device(env);
-    auto left_keys = reinterpret_cast<cudf::table_view const *>(j_left_keys);
-    auto hash_join = reinterpret_cast<cudf::hash_join const *>(j_right_hash_join);
+    auto left_keys = reinterpret_cast<cudf::table_view const*>(j_left_keys);
+    auto hash_join = reinterpret_cast<cudf::hash_join const*>(j_right_hash_join);
     return gather_maps_to_java(env, join_func(*left_keys, *hash_join));
   }
   CATCH_STD(env, NULL);
@@ -744,32 +790,34 @@ jlongArray hash_join_gather_maps(JNIEnv *env, jlong j_left_keys, jlong j_right_h
 
 // Generate gather maps needed to manifest the result of a conditional join between two tables.
 template <typename T>
-jlongArray cond_join_gather_maps(JNIEnv *env, jlong j_left_table, jlong j_right_table,
-                                 jlong j_condition, T join_func) {
+jlongArray cond_join_gather_maps(
+  JNIEnv* env, jlong j_left_table, jlong j_right_table, jlong j_condition, T join_func)
+{
   JNI_NULL_CHECK(env, j_left_table, "left_table is null", NULL);
   JNI_NULL_CHECK(env, j_right_table, "right_table is null", NULL);
   JNI_NULL_CHECK(env, j_condition, "condition is null", NULL);
   try {
     cudf::jni::auto_set_device(env);
-    auto left_table = reinterpret_cast<cudf::table_view const *>(j_left_table);
-    auto right_table = reinterpret_cast<cudf::table_view const *>(j_right_table);
-    auto condition = reinterpret_cast<cudf::jni::ast::compiled_expr const *>(j_condition);
+    auto left_table  = reinterpret_cast<cudf::table_view const*>(j_left_table);
+    auto right_table = reinterpret_cast<cudf::table_view const*>(j_right_table);
+    auto condition   = reinterpret_cast<cudf::jni::ast::compiled_expr const*>(j_condition);
     return gather_maps_to_java(
-        env, join_func(*left_table, *right_table, condition->get_top_expression()));
+      env, join_func(*left_table, *right_table, condition->get_top_expression()));
   }
   CATCH_STD(env, NULL);
 }
 
 // Generate a gather map needed to manifest the result of a semi/anti join between two tables.
 template <typename T>
-jlongArray join_gather_single_map(JNIEnv *env, jlong j_left_keys, jlong j_right_keys,
-                                  jboolean compare_nulls_equal, T join_func) {
+jlongArray join_gather_single_map(
+  JNIEnv* env, jlong j_left_keys, jlong j_right_keys, jboolean compare_nulls_equal, T join_func)
+{
   JNI_NULL_CHECK(env, j_left_keys, "left_table is null", NULL);
   JNI_NULL_CHECK(env, j_right_keys, "right_table is null", NULL);
   try {
     cudf::jni::auto_set_device(env);
-    auto left_keys = reinterpret_cast<cudf::table_view const *>(j_left_keys);
-    auto right_keys = reinterpret_cast<cudf::table_view const *>(j_right_keys);
+    auto left_keys  = reinterpret_cast<cudf::table_view const*>(j_left_keys);
+    auto right_keys = reinterpret_cast<cudf::table_view const*>(j_right_keys);
     auto nulleq = compare_nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL;
     return gather_map_to_java(env, join_func(*left_keys, *right_keys, nulleq));
   }
@@ -779,26 +827,33 @@ jlongArray join_gather_single_map(JNIEnv *env, jlong j_left_keys, jlong j_right_
 // Generate a gather map needed to manifest the result of a conditional semi/anti join
 // between two tables.
 template <typename T>
-jlongArray cond_join_gather_single_map(JNIEnv *env, jlong j_left_table, jlong j_right_table,
-                                       jlong j_condition, T join_func) {
+jlongArray cond_join_gather_single_map(
+  JNIEnv* env, jlong j_left_table, jlong j_right_table, jlong j_condition, T join_func)
+{
   JNI_NULL_CHECK(env, j_left_table, "left_table is null", NULL);
   JNI_NULL_CHECK(env, j_right_table, "right_table is null", NULL);
   JNI_NULL_CHECK(env, j_condition, "condition is null", NULL);
   try {
     cudf::jni::auto_set_device(env);
-    auto left_table = reinterpret_cast<cudf::table_view const *>(j_left_table);
-    auto right_table = reinterpret_cast<cudf::table_view const *>(j_right_table);
-    auto condition = reinterpret_cast<cudf::jni::ast::compiled_expr *>(j_condition);
+    auto left_table  = reinterpret_cast<cudf::table_view const*>(j_left_table);
+    auto right_table = reinterpret_cast<cudf::table_view const*>(j_right_table);
+    auto condition   = reinterpret_cast<cudf::jni::ast::compiled_expr*>(j_condition);
     return gather_map_to_java(
-        env, join_func(*left_table, *right_table, condition->get_top_expression()));
+      env, join_func(*left_table, *right_table, condition->get_top_expression()));
   }
   CATCH_STD(env, NULL);
 }
 
 template <typename T>
-jlongArray mixed_join_size(JNIEnv *env, jlong j_left_keys, jlong j_right_keys,
-                           jlong j_left_condition, jlong j_right_condition, jlong j_condition,
-                           jboolean j_nulls_equal, T join_size_func) {
+jlongArray mixed_join_size(JNIEnv* env,
+                           jlong j_left_keys,
+                           jlong j_right_keys,
+                           jlong j_left_condition,
+                           jlong j_right_condition,
+                           jlong j_condition,
+                           jboolean j_nulls_equal,
+                           T join_size_func)
+{
   JNI_NULL_CHECK(env, j_left_keys, "left keys table is null", 0);
   JNI_NULL_CHECK(env, j_right_keys, "right keys table is null", 0);
   JNI_NULL_CHECK(env, j_left_condition, "left condition table is null", 0);
@@ -806,16 +861,19 @@ jlongArray mixed_join_size(JNIEnv *env, jlong j_left_keys, jlong j_right_keys,
   JNI_NULL_CHECK(env, j_condition, "condition is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto const left_keys = reinterpret_cast<cudf::table_view const *>(j_left_keys);
-    auto const right_keys = reinterpret_cast<cudf::table_view const *>(j_right_keys);
-    auto const left_condition = reinterpret_cast<cudf::table_view const *>(j_left_condition);
-    auto const right_condition = reinterpret_cast<cudf::table_view const *>(j_right_condition);
-    auto const condition = reinterpret_cast<cudf::jni::ast::compiled_expr const *>(j_condition);
+    auto const left_keys       = reinterpret_cast<cudf::table_view const*>(j_left_keys);
+    auto const right_keys      = reinterpret_cast<cudf::table_view const*>(j_right_keys);
+    auto const left_condition  = reinterpret_cast<cudf::table_view const*>(j_left_condition);
+    auto const right_condition = reinterpret_cast<cudf::table_view const*>(j_right_condition);
+    auto const condition = reinterpret_cast<cudf::jni::ast::compiled_expr const*>(j_condition);
     auto const nulls_equal =
-        j_nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL;
-    auto [join_size, matches_per_row] =
-        join_size_func(*left_keys, *right_keys, *left_condition, *right_condition,
-                       condition->get_top_expression(), nulls_equal);
+      j_nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL;
+    auto [join_size, matches_per_row] = join_size_func(*left_keys,
+                                                       *right_keys,
+                                                       *left_condition,
+                                                       *right_condition,
+                                                       condition->get_top_expression(),
+                                                       nulls_equal);
     if (matches_per_row->size() > std::numeric_limits<cudf::size_type>::max()) {
       throw std::runtime_error("Too many values in device buffer to convert into a column");
     }
@@ -823,17 +881,26 @@ jlongArray mixed_join_size(JNIEnv *env, jlong j_left_keys, jlong j_right_keys,
     auto col_data = matches_per_row->release();
     cudf::jni::native_jlongArray result(env, 2);
     result[0] = static_cast<jlong>(join_size);
-    result[1] = ptr_as_jlong(new cudf::column{cudf::data_type{cudf::type_id::INT32}, col_size,
-                                              std::move(col_data), rmm::device_buffer{}, 0});
+    result[1] = ptr_as_jlong(new cudf::column{cudf::data_type{cudf::type_id::INT32},
+                                              col_size,
+                                              std::move(col_data),
+                                              rmm::device_buffer{},
+                                              0});
     return result.get_jArray();
   }
   CATCH_STD(env, NULL);
 }
 
 template <typename T>
-jlongArray mixed_join_gather_maps(JNIEnv *env, jlong j_left_keys, jlong j_right_keys,
-                                  jlong j_left_condition, jlong j_right_condition,
-                                  jlong j_condition, jboolean j_nulls_equal, T join_func) {
+jlongArray mixed_join_gather_maps(JNIEnv* env,
+                                  jlong j_left_keys,
+                                  jlong j_right_keys,
+                                  jlong j_left_condition,
+                                  jlong j_right_condition,
+                                  jlong j_condition,
+                                  jboolean j_nulls_equal,
+                                  T join_func)
+{
   JNI_NULL_CHECK(env, j_left_keys, "left keys table is null", 0);
   JNI_NULL_CHECK(env, j_right_keys, "right keys table is null", 0);
   JNI_NULL_CHECK(env, j_left_condition, "left condition table is null", 0);
@@ -841,24 +908,34 @@ jlongArray mixed_join_gather_maps(JNIEnv *env, jlong j_left_keys, jlong j_right_
   JNI_NULL_CHECK(env, j_condition, "condition is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto const left_keys = reinterpret_cast<cudf::table_view const *>(j_left_keys);
-    auto const right_keys = reinterpret_cast<cudf::table_view const *>(j_right_keys);
-    auto const left_condition = reinterpret_cast<cudf::table_view const *>(j_left_condition);
-    auto const right_condition = reinterpret_cast<cudf::table_view const *>(j_right_condition);
-    auto const condition = reinterpret_cast<cudf::jni::ast::compiled_expr const *>(j_condition);
+    auto const left_keys       = reinterpret_cast<cudf::table_view const*>(j_left_keys);
+    auto const right_keys      = reinterpret_cast<cudf::table_view const*>(j_right_keys);
+    auto const left_condition  = reinterpret_cast<cudf::table_view const*>(j_left_condition);
+    auto const right_condition = reinterpret_cast<cudf::table_view const*>(j_right_condition);
+    auto const condition = reinterpret_cast<cudf::jni::ast::compiled_expr const*>(j_condition);
     auto const nulls_equal =
-        j_nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL;
+      j_nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL;
     return gather_maps_to_java(env,
-                               join_func(*left_keys, *right_keys, *left_condition, *right_condition,
-                                         condition->get_top_expression(), nulls_equal));
+                               join_func(*left_keys,
+                                         *right_keys,
+                                         *left_condition,
+                                         *right_condition,
+                                         condition->get_top_expression(),
+                                         nulls_equal));
   }
   CATCH_STD(env, NULL);
 }
 
 template <typename T>
-jlongArray mixed_join_gather_single_map(JNIEnv *env, jlong j_left_keys, jlong j_right_keys,
-                                        jlong j_left_condition, jlong j_right_condition,
-                                        jlong j_condition, jboolean j_nulls_equal, T join_func) {
+jlongArray mixed_join_gather_single_map(JNIEnv* env,
+                                        jlong j_left_keys,
+                                        jlong j_right_keys,
+                                        jlong j_left_condition,
+                                        jlong j_right_condition,
+                                        jlong j_condition,
+                                        jboolean j_nulls_equal,
+                                        T join_func)
+{
   JNI_NULL_CHECK(env, j_left_keys, "left keys table is null", 0);
   JNI_NULL_CHECK(env, j_right_keys, "right keys table is null", 0);
   JNI_NULL_CHECK(env, j_left_condition, "left condition table is null", 0);
@@ -866,35 +943,46 @@ jlongArray mixed_join_gather_single_map(JNIEnv *env, jlong j_left_keys, jlong j_
   JNI_NULL_CHECK(env, j_condition, "condition is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto const left_keys = reinterpret_cast<cudf::table_view const *>(j_left_keys);
-    auto const right_keys = reinterpret_cast<cudf::table_view const *>(j_right_keys);
-    auto const left_condition = reinterpret_cast<cudf::table_view const *>(j_left_condition);
-    auto const right_condition = reinterpret_cast<cudf::table_view const *>(j_right_condition);
-    auto const condition = reinterpret_cast<cudf::jni::ast::compiled_expr const *>(j_condition);
+    auto const left_keys       = reinterpret_cast<cudf::table_view const*>(j_left_keys);
+    auto const right_keys      = reinterpret_cast<cudf::table_view const*>(j_right_keys);
+    auto const left_condition  = reinterpret_cast<cudf::table_view const*>(j_left_condition);
+    auto const right_condition = reinterpret_cast<cudf::table_view const*>(j_right_condition);
+    auto const condition = reinterpret_cast<cudf::jni::ast::compiled_expr const*>(j_condition);
     auto const nulls_equal =
-        j_nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL;
+      j_nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL;
     return gather_map_to_java(env,
-                              join_func(*left_keys, *right_keys, *left_condition, *right_condition,
-                                        condition->get_top_expression(), nulls_equal));
+                              join_func(*left_keys,
+                                        *right_keys,
+                                        *left_condition,
+                                        *right_condition,
+                                        condition->get_top_expression(),
+                                        nulls_equal));
   }
   CATCH_STD(env, NULL);
 }
 
-std::pair<std::size_t, cudf::device_span<cudf::size_type const>>
-get_mixed_size_info(JNIEnv *env, jlong j_output_row_count, jlong j_matches_view) {
+std::pair<std::size_t, cudf::device_span<cudf::size_type const>> get_mixed_size_info(
+  JNIEnv* env, jlong j_output_row_count, jlong j_matches_view)
+{
   auto const row_count = static_cast<std::size_t>(j_output_row_count);
-  auto const matches = reinterpret_cast<cudf::column_view const *>(j_matches_view);
-  return std::make_pair(row_count, cudf::device_span<cudf::size_type const>(
-                                       matches->template data<cudf::size_type>(), matches->size()));
+  auto const matches   = reinterpret_cast<cudf::column_view const*>(j_matches_view);
+  return std::make_pair(row_count,
+                        cudf::device_span<cudf::size_type const>(
+                          matches->template data<cudf::size_type>(), matches->size()));
 }
 
-cudf::column_view remove_validity_from_col(cudf::column_view column_view) {
+cudf::column_view remove_validity_from_col(cudf::column_view column_view)
+{
   if (!cudf::is_compound(column_view.type())) {
     if (column_view.nullable() && column_view.null_count() == 0) {
       // null_mask is allocated but no nulls present therefore we create a new column_view without
       // the null_mask to avoid things blowing up in reading the parquet file
-      return cudf::column_view(column_view.type(), column_view.size(), column_view.head(), nullptr,
-                               0, column_view.offset());
+      return cudf::column_view(column_view.type(),
+                               column_view.size(),
+                               column_view.head(),
+                               nullptr,
+                               0,
+                               column_view.offset());
     } else {
       return cudf::column_view(column_view);
     }
@@ -905,17 +993,27 @@ cudf::column_view remove_validity_from_col(cudf::column_view column_view) {
       children.push_back(remove_validity_from_col(*it));
     }
     if (!column_view.nullable() || column_view.null_count() != 0) {
-      return cudf::column_view(column_view.type(), column_view.size(), column_view.head(),
-                               column_view.null_mask(), column_view.null_count(),
-                               column_view.offset(), children);
+      return cudf::column_view(column_view.type(),
+                               column_view.size(),
+                               column_view.head(),
+                               column_view.null_mask(),
+                               column_view.null_count(),
+                               column_view.offset(),
+                               children);
     } else {
-      return cudf::column_view(column_view.type(), column_view.size(), column_view.head(), nullptr,
-                               0, column_view.offset(), children);
+      return cudf::column_view(column_view.type(),
+                               column_view.size(),
+                               column_view.head(),
+                               nullptr,
+                               0,
+                               column_view.offset(),
+                               children);
     }
   }
 }
 
-cudf::table_view remove_validity_if_needed(cudf::table_view *input_table_view) {
+cudf::table_view remove_validity_if_needed(cudf::table_view* input_table_view)
+{
   std::vector<cudf::column_view> views;
   views.reserve(input_table_view->num_columns());
   for (auto it = input_table_view->begin(); it != input_table_view->end(); it++) {
@@ -925,11 +1023,12 @@ cudf::table_view remove_validity_if_needed(cudf::table_view *input_table_view) {
   return cudf::table_view(views);
 }
 
-cudf::io::schema_element read_schema_element(int &index,
-                                             cudf::jni::native_jintArray const &children,
-                                             cudf::jni::native_jstringArray const &names,
-                                             cudf::jni::native_jintArray const &types,
-                                             cudf::jni::native_jintArray const &scales) {
+cudf::io::schema_element read_schema_element(int& index,
+                                             cudf::jni::native_jintArray const& children,
+                                             cudf::jni::native_jstringArray const& names,
+                                             cudf::jni::native_jintArray const& types,
+                                             cudf::jni::native_jintArray const& scales)
+{
   auto d_type = cudf::data_type{static_cast<cudf::type_id>(types[index]), scales[index]};
   if (d_type.id() == cudf::type_id::STRUCT || d_type.id() == cudf::type_id::LIST) {
     std::map<std::string, cudf::io::schema_element> child_elems;
@@ -938,8 +1037,8 @@ cudf::io::schema_element read_schema_element(int &index,
     index++;
     for (int i = 0; i < num_children; i++) {
       child_elems.insert(
-          std::pair{names.get(index).get(),
-                    cudf::jni::read_schema_element(index, children, names, types, scales)});
+        std::pair{names.get(index).get(),
+                  cudf::jni::read_schema_element(index, children, names, types, scales)});
     }
     return cudf::io::schema_element{d_type, std::move(child_elems)};
   } else {
@@ -952,26 +1051,27 @@ cudf::io::schema_element read_schema_element(int &index,
   }
 }
 
-void append_flattened_child_counts(cudf::io::column_name_info const &info,
-                                   std::vector<int> &counts) {
+void append_flattened_child_counts(cudf::io::column_name_info const& info, std::vector<int>& counts)
+{
   counts.push_back(info.children.size());
-  for (cudf::io::column_name_info const &child : info.children) {
+  for (cudf::io::column_name_info const& child : info.children) {
     append_flattened_child_counts(child, counts);
   }
 }
 
-void append_flattened_child_names(cudf::io::column_name_info const &info,
-                                  std::vector<std::string> &names) {
+void append_flattened_child_names(cudf::io::column_name_info const& info,
+                                  std::vector<std::string>& names)
+{
   names.push_back(info.name);
-  for (cudf::io::column_name_info const &child : info.children) {
+  for (cudf::io::column_name_info const& child : info.children) {
     append_flattened_child_names(child, names);
   }
 }
 
-} // namespace
+}  // namespace
 
-} // namespace jni
-} // namespace cudf
+}  // namespace jni
+}  // namespace cudf
 
 using cudf::jni::convert_table_for_return;
 using cudf::jni::ptr_as_jlong;
@@ -980,24 +1080,28 @@ using cudf::jni::release_as_jlong;
 extern "C" {
 
 // This is a method purely added for testing remove_validity_if_needed method
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_removeNullMasksIfNeeded(JNIEnv *env, jclass,
-                                                                               jlong j_table_view) {
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_removeNullMasksIfNeeded(JNIEnv* env,
+                                                                               jclass,
+                                                                               jlong j_table_view)
+{
   JNI_NULL_CHECK(env, j_table_view, "table view handle is null", 0);
   try {
-    cudf::table_view *tview = reinterpret_cast<cudf::table_view *>(j_table_view);
+    cudf::table_view* tview = reinterpret_cast<cudf::table_view*>(j_table_view);
     cudf::table_view result = cudf::jni::remove_validity_if_needed(tview);
     cudf::table m_tbl(result);
     std::vector<std::unique_ptr<cudf::column>> cols = m_tbl.release();
     auto results = cudf::jni::native_jlongArray(env, cols.size());
-    std::transform(cols.begin(), cols.end(), results.begin(),
-                   [](auto &col) { return release_as_jlong(col); });
+    std::transform(
+      cols.begin(), cols.end(), results.begin(), [](auto& col) { return release_as_jlong(col); });
     return results.get_jArray();
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_createCudfTableView(JNIEnv *env, jclass,
-                                                                      jlongArray j_cudf_columns) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_createCudfTableView(JNIEnv* env,
+                                                                      jclass,
+                                                                      jlongArray j_cudf_columns)
+{
   JNI_NULL_CHECK(env, j_cudf_columns, "columns are null", 0);
 
   try {
@@ -1010,27 +1114,31 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_createCudfTableView(JNIEnv *en
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_deleteCudfTable(JNIEnv *env, jclass,
-                                                                 jlong j_cudf_table_view) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_deleteCudfTable(JNIEnv* env,
+                                                                 jclass,
+                                                                 jlong j_cudf_table_view)
+{
   JNI_NULL_CHECK(env, j_cudf_table_view, "table view handle is null", );
   try {
     cudf::jni::auto_set_device(env);
-    delete reinterpret_cast<cudf::table_view *>(j_cudf_table_view);
+    delete reinterpret_cast<cudf::table_view*>(j_cudf_table_view);
   }
   CATCH_STD(env, );
 }
 
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_columnViewsFromPacked(JNIEnv *env, jclass,
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_columnViewsFromPacked(JNIEnv* env,
+                                                                             jclass,
                                                                              jobject buffer_obj,
-                                                                             jlong j_data_address) {
+                                                                             jlong j_data_address)
+{
   // The GPU data address can be null when the table is empty, so it is not null-checked here.
   JNI_NULL_CHECK(env, buffer_obj, "metadata is null", nullptr);
   try {
     cudf::jni::auto_set_device(env);
-    void const *metadata_address = env->GetDirectBufferAddress(buffer_obj);
+    void const* metadata_address = env->GetDirectBufferAddress(buffer_obj);
     JNI_NULL_CHECK(env, metadata_address, "metadata buffer address is null", nullptr);
-    cudf::table_view table = cudf::unpack(static_cast<uint8_t const *>(metadata_address),
-                                          reinterpret_cast<uint8_t const *>(j_data_address));
+    cudf::table_view table = cudf::unpack(static_cast<uint8_t const*>(metadata_address),
+                                          reinterpret_cast<uint8_t const*>(j_data_address));
     cudf::jni::native_jlongArray views(env, table.num_columns());
     for (int i = 0; i < table.num_columns(); i++) {
       // TODO Exception handling is not ideal, if no exceptions are thrown ownership of the new cv
@@ -1051,12 +1159,13 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_columnViewsFromPacked(JNI
   CATCH_STD(env, nullptr);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_sortOrder(JNIEnv *env, jclass,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_sortOrder(JNIEnv* env,
+                                                            jclass,
                                                             jlong j_input_table,
                                                             jlongArray j_sort_keys_columns,
                                                             jbooleanArray j_is_descending,
-                                                            jbooleanArray j_are_nulls_smallest) {
-
+                                                            jbooleanArray j_are_nulls_smallest)
+{
   // input validations & verifications
   JNI_NULL_CHECK(env, j_input_table, "input table is null", 0);
   JNI_NULL_CHECK(env, j_sort_keys_columns, "sort keys columns is null", 0);
@@ -1071,19 +1180,21 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_sortOrder(JNIEnv *env, jclass,
     const cudf::jni::native_jbooleanArray n_is_descending(env, j_is_descending);
     jsize num_columns_is_desc = n_is_descending.size();
 
-    JNI_ARG_CHECK(env, num_columns_is_desc == num_columns,
-                  "columns and is_descending lengths don't match", 0);
+    JNI_ARG_CHECK(
+      env, num_columns_is_desc == num_columns, "columns and is_descending lengths don't match", 0);
 
     const cudf::jni::native_jbooleanArray n_are_nulls_smallest(env, j_are_nulls_smallest);
     jsize num_columns_null_smallest = n_are_nulls_smallest.size();
 
-    JNI_ARG_CHECK(env, num_columns_null_smallest == num_columns,
-                  "columns and is_descending lengths don't match", 0);
+    JNI_ARG_CHECK(env,
+                  num_columns_null_smallest == num_columns,
+                  "columns and is_descending lengths don't match",
+                  0);
 
     std::vector<cudf::order> order =
-        n_is_descending.transform_if_else(cudf::order::DESCENDING, cudf::order::ASCENDING);
+      n_is_descending.transform_if_else(cudf::order::DESCENDING, cudf::order::ASCENDING);
     std::vector<cudf::null_order> null_order =
-        n_are_nulls_smallest.transform_if_else(cudf::null_order::BEFORE, cudf::null_order::AFTER);
+      n_are_nulls_smallest.transform_if_else(cudf::null_order::BEFORE, cudf::null_order::AFTER);
 
     std::vector<cudf::column_view> sort_keys = n_sort_keys_columns.get_dereferenced();
     return release_as_jlong(cudf::sorted_order(cudf::table_view{sort_keys}, order, null_order));
@@ -1091,12 +1202,13 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_sortOrder(JNIEnv *env, jclass,
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_orderBy(JNIEnv *env, jclass,
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_orderBy(JNIEnv* env,
+                                                               jclass,
                                                                jlong j_input_table,
                                                                jlongArray j_sort_keys_columns,
                                                                jbooleanArray j_is_descending,
-                                                               jbooleanArray j_are_nulls_smallest) {
-
+                                                               jbooleanArray j_are_nulls_smallest)
+{
   // input validations & verifications
   JNI_NULL_CHECK(env, j_input_table, "input table is null", NULL);
   JNI_NULL_CHECK(env, j_sort_keys_columns, "sort keys columns is null", NULL);
@@ -1111,36 +1223,39 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_orderBy(JNIEnv *env, jcla
     const cudf::jni::native_jbooleanArray n_is_descending(env, j_is_descending);
     jsize num_columns_is_desc = n_is_descending.size();
 
-    JNI_ARG_CHECK(env, num_columns_is_desc == num_columns,
-                  "columns and is_descending lengths don't match", 0);
+    JNI_ARG_CHECK(
+      env, num_columns_is_desc == num_columns, "columns and is_descending lengths don't match", 0);
 
     const cudf::jni::native_jbooleanArray n_are_nulls_smallest(env, j_are_nulls_smallest);
     jsize num_columns_null_smallest = n_are_nulls_smallest.size();
 
-    JNI_ARG_CHECK(env, num_columns_null_smallest == num_columns,
-                  "columns and areNullsSmallest lengths don't match", 0);
+    JNI_ARG_CHECK(env,
+                  num_columns_null_smallest == num_columns,
+                  "columns and areNullsSmallest lengths don't match",
+                  0);
 
     std::vector<cudf::order> order =
-        n_is_descending.transform_if_else(cudf::order::DESCENDING, cudf::order::ASCENDING);
+      n_is_descending.transform_if_else(cudf::order::DESCENDING, cudf::order::ASCENDING);
 
     std::vector<cudf::null_order> null_order =
-        n_are_nulls_smallest.transform_if_else(cudf::null_order::BEFORE, cudf::null_order::AFTER);
+      n_are_nulls_smallest.transform_if_else(cudf::null_order::BEFORE, cudf::null_order::AFTER);
 
     std::vector<cudf::column_view> sort_keys = n_sort_keys_columns.get_dereferenced();
     auto sorted_col = cudf::sorted_order(cudf::table_view{sort_keys}, order, null_order);
 
-    auto const input_table = reinterpret_cast<cudf::table_view const *>(j_input_table);
+    auto const input_table = reinterpret_cast<cudf::table_view const*>(j_input_table);
     return convert_table_for_return(env, cudf::gather(*input_table, sorted_col->view()));
   }
   CATCH_STD(env, NULL);
 }
 
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_merge(JNIEnv *env, jclass,
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_merge(JNIEnv* env,
+                                                             jclass,
                                                              jlongArray j_table_handles,
                                                              jintArray j_sort_key_indexes,
                                                              jbooleanArray j_is_descending,
-                                                             jbooleanArray j_are_nulls_smallest) {
-
+                                                             jbooleanArray j_are_nulls_smallest)
+{
   // input validations & verifications
   JNI_NULL_CHECK(env, j_table_handles, "input tables are null", NULL);
   JNI_NULL_CHECK(env, j_sort_key_indexes, "key indexes is null", NULL);
@@ -1156,20 +1271,24 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_merge(JNIEnv *env, jclass
     const cudf::jni::native_jbooleanArray n_is_descending(env, j_is_descending);
     jsize num_columns_is_desc = n_is_descending.size();
 
-    JNI_ARG_CHECK(env, num_columns_is_desc == num_columns,
-                  "columns and is_descending lengths don't match", NULL);
+    JNI_ARG_CHECK(env,
+                  num_columns_is_desc == num_columns,
+                  "columns and is_descending lengths don't match",
+                  NULL);
 
     const cudf::jni::native_jbooleanArray n_are_nulls_smallest(env, j_are_nulls_smallest);
     jsize num_columns_null_smallest = n_are_nulls_smallest.size();
 
-    JNI_ARG_CHECK(env, num_columns_null_smallest == num_columns,
-                  "columns and areNullsSmallest lengths don't match", NULL);
+    JNI_ARG_CHECK(env,
+                  num_columns_null_smallest == num_columns,
+                  "columns and areNullsSmallest lengths don't match",
+                  NULL);
 
     std::vector<int> indexes = n_sort_key_indexes.to_vector<int>();
     std::vector<cudf::order> order =
-        n_is_descending.transform_if_else(cudf::order::DESCENDING, cudf::order::ASCENDING);
+      n_is_descending.transform_if_else(cudf::order::DESCENDING, cudf::order::ASCENDING);
     std::vector<cudf::null_order> null_order =
-        n_are_nulls_smallest.transform_if_else(cudf::null_order::BEFORE, cudf::null_order::AFTER);
+      n_are_nulls_smallest.transform_if_else(cudf::null_order::BEFORE, cudf::null_order::AFTER);
     std::vector<cudf::table_view> tables = n_table_handles.get_dereferenced();
 
     return convert_table_for_return(env, cudf::merge(tables, indexes, order, null_order));
@@ -1177,11 +1296,23 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_merge(JNIEnv *env, jclass
   CATCH_STD(env, NULL);
 }
 
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readCSVFromDataSource(
-    JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales,
-    jobjectArray filter_col_names, jint header_row, jbyte delim, jint j_quote_style, jbyte quote,
-    jbyte comment, jobjectArray null_values, jobjectArray true_values, jobjectArray false_values,
-    jlong ds_handle) {
+JNIEXPORT jlongArray JNICALL
+Java_ai_rapids_cudf_Table_readCSVFromDataSource(JNIEnv* env,
+                                                jclass,
+                                                jobjectArray col_names,
+                                                jintArray j_types,
+                                                jintArray j_scales,
+                                                jobjectArray filter_col_names,
+                                                jint header_row,
+                                                jbyte delim,
+                                                jint j_quote_style,
+                                                jbyte quote,
+                                                jbyte comment,
+                                                jobjectArray null_values,
+                                                jobjectArray true_values,
+                                                jobjectArray false_values,
+                                                jlong ds_handle)
+{
   JNI_NULL_CHECK(env, null_values, "null_values must be supplied, even if it is empty", NULL);
   JNI_NULL_CHECK(env, ds_handle, "no data source handle given", NULL);
 
@@ -1199,8 +1330,11 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readCSVFromDataSource(
         JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "types and scales must match size", NULL);
       }
       data_types.reserve(n_types.size());
-      std::transform(n_types.begin(), n_types.end(), n_scales.begin(),
-                     std::back_inserter(data_types), [](auto type, auto scale) {
+      std::transform(n_types.begin(),
+                     n_types.end(),
+                     n_scales.begin(),
+                     std::back_inserter(data_types),
+                     [](auto type, auto scale) {
                        return cudf::data_type{static_cast<cudf::type_id>(type), scale};
                      });
     }
@@ -1210,37 +1344,50 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readCSVFromDataSource(
     cudf::jni::native_jstringArray n_false_values(env, false_values);
     cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names);
 
-    auto ds = reinterpret_cast<cudf::io::datasource *>(ds_handle);
+    auto ds = reinterpret_cast<cudf::io::datasource*>(ds_handle);
     cudf::io::source_info source{ds};
 
     auto const quote_style = static_cast<cudf::io::quote_style>(j_quote_style);
 
     cudf::io::csv_reader_options opts = cudf::io::csv_reader_options::builder(source)
-                                            .delimiter(delim)
-                                            .header(header_row)
-                                            .names(n_col_names.as_cpp_vector())
-                                            .dtypes(data_types)
-                                            .use_cols_names(n_filter_col_names.as_cpp_vector())
-                                            .true_values(n_true_values.as_cpp_vector())
-                                            .false_values(n_false_values.as_cpp_vector())
-                                            .na_values(n_null_values.as_cpp_vector())
-                                            .keep_default_na(false)
-                                            .na_filter(n_null_values.size() > 0)
-                                            .quoting(quote_style)
-                                            .quotechar(quote)
-                                            .comment(comment)
-                                            .build();
+                                          .delimiter(delim)
+                                          .header(header_row)
+                                          .names(n_col_names.as_cpp_vector())
+                                          .dtypes(data_types)
+                                          .use_cols_names(n_filter_col_names.as_cpp_vector())
+                                          .true_values(n_true_values.as_cpp_vector())
+                                          .false_values(n_false_values.as_cpp_vector())
+                                          .na_values(n_null_values.as_cpp_vector())
+                                          .keep_default_na(false)
+                                          .na_filter(n_null_values.size() > 0)
+                                          .quoting(quote_style)
+                                          .quotechar(quote)
+                                          .comment(comment)
+                                          .build();
 
     return convert_table_for_return(env, cudf::io::read_csv(opts).tbl);
   }
   CATCH_STD(env, NULL);
 }
 
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readCSV(
-    JNIEnv *env, jclass, jobjectArray col_names, jintArray j_types, jintArray j_scales,
-    jobjectArray filter_col_names, jstring inputfilepath, jlong buffer, jlong buffer_length,
-    jint header_row, jbyte delim, jint j_quote_style, jbyte quote, jbyte comment,
-    jobjectArray null_values, jobjectArray true_values, jobjectArray false_values) {
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readCSV(JNIEnv* env,
+                                                               jclass,
+                                                               jobjectArray col_names,
+                                                               jintArray j_types,
+                                                               jintArray j_scales,
+                                                               jobjectArray filter_col_names,
+                                                               jstring inputfilepath,
+                                                               jlong buffer,
+                                                               jlong buffer_length,
+                                                               jint header_row,
+                                                               jbyte delim,
+                                                               jint j_quote_style,
+                                                               jbyte quote,
+                                                               jbyte comment,
+                                                               jobjectArray null_values,
+                                                               jobjectArray true_values,
+                                                               jobjectArray false_values)
+{
   JNI_NULL_CHECK(env, null_values, "null_values must be supplied, even if it is empty", NULL);
 
   bool read_buffer = true;
@@ -1248,8 +1395,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readCSV(
     JNI_NULL_CHECK(env, inputfilepath, "input file or buffer must be supplied", NULL);
     read_buffer = false;
   } else if (inputfilepath != NULL) {
-    JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS,
-                  "cannot pass in both a buffer and an inputfilepath", NULL);
+    JNI_THROW_NEW(
+      env, cudf::jni::ILLEGAL_ARG_CLASS, "cannot pass in both a buffer and an inputfilepath", NULL);
   } else if (buffer_length <= 0) {
     JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "An empty buffer is not supported", NULL);
   }
@@ -1268,8 +1415,11 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readCSV(
         JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "types and scales must match size", NULL);
       }
       data_types.reserve(n_types.size());
-      std::transform(n_types.begin(), n_types.end(), n_scales.begin(),
-                     std::back_inserter(data_types), [](auto type, auto scale) {
+      std::transform(n_types.begin(),
+                     n_types.end(),
+                     n_scales.begin(),
+                     std::back_inserter(data_types),
+                     [](auto type, auto scale) {
                        return cudf::data_type{static_cast<cudf::type_id>(type), scale};
                      });
     }
@@ -1284,36 +1434,45 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readCSV(
     cudf::jni::native_jstringArray n_false_values(env, false_values);
     cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names);
 
-    auto source = read_buffer ? cudf::io::source_info{reinterpret_cast<char *>(buffer),
-                                                      static_cast<std::size_t>(buffer_length)} :
-                                cudf::io::source_info{filename.get()};
+    auto source            = read_buffer ? cudf::io::source_info{reinterpret_cast<char*>(buffer),
+                                                      static_cast<std::size_t>(buffer_length)}
+                                         : cudf::io::source_info{filename.get()};
     auto const quote_style = static_cast<cudf::io::quote_style>(j_quote_style);
 
     cudf::io::csv_reader_options opts = cudf::io::csv_reader_options::builder(source)
-                                            .delimiter(delim)
-                                            .header(header_row)
-                                            .names(n_col_names.as_cpp_vector())
-                                            .dtypes(data_types)
-                                            .use_cols_names(n_filter_col_names.as_cpp_vector())
-                                            .true_values(n_true_values.as_cpp_vector())
-                                            .false_values(n_false_values.as_cpp_vector())
-                                            .na_values(n_null_values.as_cpp_vector())
-                                            .keep_default_na(false)
-                                            .na_filter(n_null_values.size() > 0)
-                                            .quoting(quote_style)
-                                            .quotechar(quote)
-                                            .comment(comment)
-                                            .build();
+                                          .delimiter(delim)
+                                          .header(header_row)
+                                          .names(n_col_names.as_cpp_vector())
+                                          .dtypes(data_types)
+                                          .use_cols_names(n_filter_col_names.as_cpp_vector())
+                                          .true_values(n_true_values.as_cpp_vector())
+                                          .false_values(n_false_values.as_cpp_vector())
+                                          .na_values(n_null_values.as_cpp_vector())
+                                          .keep_default_na(false)
+                                          .na_filter(n_null_values.size() > 0)
+                                          .quoting(quote_style)
+                                          .quotechar(quote)
+                                          .comment(comment)
+                                          .build();
 
     return convert_table_for_return(env, cudf::io::read_csv(opts).tbl);
   }
   CATCH_STD(env, NULL);
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeCSVToFile(
-    JNIEnv *env, jclass, jlong j_table_handle, jobjectArray j_column_names, jboolean include_header,
-    jstring j_row_delimiter, jbyte j_field_delimiter, jstring j_null_value, jstring j_true_value,
-    jstring j_false_value, jint j_quote_style, jstring j_output_path) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeCSVToFile(JNIEnv* env,
+                                                                jclass,
+                                                                jlong j_table_handle,
+                                                                jobjectArray j_column_names,
+                                                                jboolean include_header,
+                                                                jstring j_row_delimiter,
+                                                                jbyte j_field_delimiter,
+                                                                jstring j_null_value,
+                                                                jstring j_true_value,
+                                                                jstring j_false_value,
+                                                                jint j_quote_style,
+                                                                jstring j_output_path)
+{
   JNI_NULL_CHECK(env, j_table_handle, "table handle cannot be null.", );
   JNI_NULL_CHECK(env, j_column_names, "column name array cannot be null", );
   JNI_NULL_CHECK(env, j_row_delimiter, "row delimiter cannot be null", );
@@ -1327,37 +1486,47 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeCSVToFile(
     cudf::jni::auto_set_device(env);
 
     auto const native_output_path = cudf::jni::native_jstring{env, j_output_path};
-    auto const output_path = native_output_path.get();
+    auto const output_path        = native_output_path.get();
 
-    auto const table = reinterpret_cast<cudf::table_view *>(j_table_handle);
+    auto const table          = reinterpret_cast<cudf::table_view*>(j_table_handle);
     auto const n_column_names = cudf::jni::native_jstringArray{env, j_column_names};
-    auto const column_names = n_column_names.as_cpp_vector();
+    auto const column_names   = n_column_names.as_cpp_vector();
 
     auto const line_terminator = cudf::jni::native_jstring{env, j_row_delimiter};
-    auto const na_rep = cudf::jni::native_jstring{env, j_null_value};
-    auto const true_value = cudf::jni::native_jstring{env, j_true_value};
-    auto const false_value = cudf::jni::native_jstring{env, j_false_value};
-    auto const quote_style = static_cast<cudf::io::quote_style>(j_quote_style);
+    auto const na_rep          = cudf::jni::native_jstring{env, j_null_value};
+    auto const true_value      = cudf::jni::native_jstring{env, j_true_value};
+    auto const false_value     = cudf::jni::native_jstring{env, j_false_value};
+    auto const quote_style     = static_cast<cudf::io::quote_style>(j_quote_style);
 
     auto options = cudf::io::csv_writer_options::builder(cudf::io::sink_info{output_path}, *table)
-                       .names(column_names)
-                       .include_header(static_cast<bool>(include_header))
-                       .line_terminator(line_terminator.get())
-                       .inter_column_delimiter(j_field_delimiter)
-                       .na_rep(na_rep.get())
-                       .true_value(true_value.get())
-                       .false_value(false_value.get())
-                       .quoting(quote_style);
+                     .names(column_names)
+                     .include_header(static_cast<bool>(include_header))
+                     .line_terminator(line_terminator.get())
+                     .inter_column_delimiter(j_field_delimiter)
+                     .na_rep(na_rep.get())
+                     .true_value(true_value.get())
+                     .false_value(false_value.get())
+                     .quoting(quote_style);
 
     cudf::io::write_csv(options.build());
   }
   CATCH_STD(env, );
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_startWriteCSVToBuffer(
-    JNIEnv *env, jclass, jobjectArray j_column_names, jboolean include_header,
-    jstring j_row_delimiter, jbyte j_field_delimiter, jstring j_null_value, jstring j_true_value,
-    jstring j_false_value, jint j_quote_style, jobject j_buffer, jobject host_memory_allocator) {
+JNIEXPORT jlong JNICALL
+Java_ai_rapids_cudf_Table_startWriteCSVToBuffer(JNIEnv* env,
+                                                jclass,
+                                                jobjectArray j_column_names,
+                                                jboolean include_header,
+                                                jstring j_row_delimiter,
+                                                jbyte j_field_delimiter,
+                                                jstring j_null_value,
+                                                jstring j_true_value,
+                                                jstring j_false_value,
+                                                jint j_quote_style,
+                                                jobject j_buffer,
+                                                jobject host_memory_allocator)
+{
   JNI_NULL_CHECK(env, j_column_names, "column name array cannot be null", 0);
   JNI_NULL_CHECK(env, j_row_delimiter, "row delimiter cannot be null", 0);
   JNI_NULL_CHECK(env, j_field_delimiter, "field delimiter cannot be null", 0);
@@ -1368,42 +1537,44 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_startWriteCSVToBuffer(
     cudf::jni::auto_set_device(env);
 
     auto data_sink =
-        std::make_unique<cudf::jni::jni_writer_data_sink>(env, j_buffer, host_memory_allocator);
+      std::make_unique<cudf::jni::jni_writer_data_sink>(env, j_buffer, host_memory_allocator);
 
     auto const n_column_names = cudf::jni::native_jstringArray{env, j_column_names};
-    auto const column_names = n_column_names.as_cpp_vector();
+    auto const column_names   = n_column_names.as_cpp_vector();
 
     auto const line_terminator = cudf::jni::native_jstring{env, j_row_delimiter};
-    auto const na_rep = cudf::jni::native_jstring{env, j_null_value};
-    auto const true_value = cudf::jni::native_jstring{env, j_true_value};
-    auto const false_value = cudf::jni::native_jstring{env, j_false_value};
-    auto const quote_style = static_cast<cudf::io::quote_style>(j_quote_style);
+    auto const na_rep          = cudf::jni::native_jstring{env, j_null_value};
+    auto const true_value      = cudf::jni::native_jstring{env, j_true_value};
+    auto const false_value     = cudf::jni::native_jstring{env, j_false_value};
+    auto const quote_style     = static_cast<cudf::io::quote_style>(j_quote_style);
 
     auto options = cudf::io::csv_writer_options::builder(cudf::io::sink_info{data_sink.get()},
                                                          cudf::table_view{})
-                       .names(column_names)
-                       .include_header(static_cast<bool>(include_header))
-                       .line_terminator(line_terminator.get())
-                       .inter_column_delimiter(j_field_delimiter)
-                       .na_rep(na_rep.get())
-                       .true_value(true_value.get())
-                       .false_value(false_value.get())
-                       .quoting(quote_style)
-                       .build();
+                     .names(column_names)
+                     .include_header(static_cast<bool>(include_header))
+                     .line_terminator(line_terminator.get())
+                     .inter_column_delimiter(j_field_delimiter)
+                     .na_rep(na_rep.get())
+                     .true_value(true_value.get())
+                     .false_value(false_value.get())
+                     .quoting(quote_style)
+                     .build();
 
     return ptr_as_jlong(new cudf::jni::io::csv_chunked_writer{options, data_sink});
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeCSVChunkToBuffer(JNIEnv *env, jclass,
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeCSVChunkToBuffer(JNIEnv* env,
+                                                                       jclass,
                                                                        jlong j_writer_handle,
-                                                                       jlong j_table_handle) {
+                                                                       jlong j_table_handle)
+{
   JNI_NULL_CHECK(env, j_writer_handle, "writer handle cannot be null.", );
   JNI_NULL_CHECK(env, j_table_handle, "table handle cannot be null.", );
 
-  auto const table = reinterpret_cast<cudf::table_view *>(j_table_handle);
-  auto writer = reinterpret_cast<cudf::jni::io::csv_chunked_writer *>(j_writer_handle);
+  auto const table = reinterpret_cast<cudf::table_view*>(j_table_handle);
+  auto writer      = reinterpret_cast<cudf::jni::io::csv_chunked_writer*>(j_writer_handle);
 
   try {
     cudf::jni::auto_set_device(env);
@@ -1412,13 +1583,15 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeCSVChunkToBuffer(JNIEnv *e
   CATCH_STD(env, );
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_endWriteCSVToBuffer(JNIEnv *env, jclass,
-                                                                     jlong j_writer_handle) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_endWriteCSVToBuffer(JNIEnv* env,
+                                                                     jclass,
+                                                                     jlong j_writer_handle)
+{
   JNI_NULL_CHECK(env, j_writer_handle, "writer handle cannot be null.", );
 
   using cudf::jni::io::csv_chunked_writer;
   auto writer =
-      std::unique_ptr<csv_chunked_writer>{reinterpret_cast<csv_chunked_writer *>(j_writer_handle)};
+    std::unique_ptr<csv_chunked_writer>{reinterpret_cast<csv_chunked_writer*>(j_writer_handle)};
 
   try {
     cudf::jni::auto_set_device(env);
@@ -1427,44 +1600,57 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_endWriteCSVToBuffer(JNIEnv *env
   CATCH_STD(env, );
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource(
-    JNIEnv *env, jclass, jboolean day_first, jboolean lines, jboolean recover_with_null,
-    jboolean normalize_single_quotes, jboolean normalize_whitespace, jboolean mixed_types_as_string,
-    jboolean keep_quotes, jlong ds_handle) {
-
+JNIEXPORT jlong JNICALL
+Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource(JNIEnv* env,
+                                                         jclass,
+                                                         jboolean day_first,
+                                                         jboolean lines,
+                                                         jboolean recover_with_null,
+                                                         jboolean normalize_single_quotes,
+                                                         jboolean normalize_whitespace,
+                                                         jboolean mixed_types_as_string,
+                                                         jboolean keep_quotes,
+                                                         jlong ds_handle)
+{
   JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0);
 
   try {
     cudf::jni::auto_set_device(env);
-    auto ds = reinterpret_cast<cudf::io::datasource *>(ds_handle);
+    auto ds = reinterpret_cast<cudf::io::datasource*>(ds_handle);
     cudf::io::source_info source{ds};
 
-    auto const recovery_mode = recover_with_null ?
-                                   cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL :
-                                   cudf::io::json_recovery_mode_t::FAIL;
+    auto const recovery_mode = recover_with_null ? cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL
+                                                 : cudf::io::json_recovery_mode_t::FAIL;
     cudf::io::json_reader_options_builder opts =
-        cudf::io::json_reader_options::builder(source)
-            .dayfirst(static_cast<bool>(day_first))
-            .lines(static_cast<bool>(lines))
-            .recovery_mode(recovery_mode)
-            .normalize_single_quotes(static_cast<bool>(normalize_single_quotes))
-            .normalize_whitespace(static_cast<bool>(normalize_whitespace))
-            .mixed_types_as_string(mixed_types_as_string)
-            .keep_quotes(keep_quotes);
+      cudf::io::json_reader_options::builder(source)
+        .dayfirst(static_cast<bool>(day_first))
+        .lines(static_cast<bool>(lines))
+        .recovery_mode(recovery_mode)
+        .normalize_single_quotes(static_cast<bool>(normalize_single_quotes))
+        .normalize_whitespace(static_cast<bool>(normalize_whitespace))
+        .mixed_types_as_string(mixed_types_as_string)
+        .keep_quotes(keep_quotes);
 
     auto result =
-        std::make_unique<cudf::io::table_with_metadata>(cudf::io::read_json(opts.build()));
+      std::make_unique<cudf::io::table_with_metadata>(cudf::io::read_json(opts.build()));
 
     return reinterpret_cast<jlong>(result.release());
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON(
-    JNIEnv *env, jclass, jlong buffer, jlong buffer_length, jboolean day_first, jboolean lines,
-    jboolean recover_with_null, jboolean normalize_single_quotes, jboolean normalize_whitespace,
-    jboolean mixed_types_as_string, jboolean keep_quotes) {
-
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env,
+                                                                   jclass,
+                                                                   jlong buffer,
+                                                                   jlong buffer_length,
+                                                                   jboolean day_first,
+                                                                   jboolean lines,
+                                                                   jboolean recover_with_null,
+                                                                   jboolean normalize_single_quotes,
+                                                                   jboolean normalize_whitespace,
+                                                                   jboolean mixed_types_as_string,
+                                                                   jboolean keep_quotes)
+{
   JNI_NULL_CHECK(env, buffer, "buffer cannot be null", 0);
   if (buffer_length <= 0) {
     JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "An empty buffer is not supported", 0);
@@ -1473,50 +1659,52 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON(
   try {
     cudf::jni::auto_set_device(env);
 
-    auto source = cudf::io::source_info{reinterpret_cast<char *>(buffer),
+    auto source = cudf::io::source_info{reinterpret_cast<char*>(buffer),
                                         static_cast<std::size_t>(buffer_length)};
 
-    auto const recovery_mode = recover_with_null ?
-                                   cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL :
-                                   cudf::io::json_recovery_mode_t::FAIL;
+    auto const recovery_mode = recover_with_null ? cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL
+                                                 : cudf::io::json_recovery_mode_t::FAIL;
     cudf::io::json_reader_options_builder opts =
-        cudf::io::json_reader_options::builder(source)
-            .dayfirst(static_cast<bool>(day_first))
-            .lines(static_cast<bool>(lines))
-            .recovery_mode(recovery_mode)
-            .normalize_single_quotes(static_cast<bool>(normalize_single_quotes))
-            .normalize_whitespace(static_cast<bool>(normalize_whitespace))
-            .mixed_types_as_string(mixed_types_as_string)
-            .keep_quotes(keep_quotes);
+      cudf::io::json_reader_options::builder(source)
+        .dayfirst(static_cast<bool>(day_first))
+        .lines(static_cast<bool>(lines))
+        .recovery_mode(recovery_mode)
+        .normalize_single_quotes(static_cast<bool>(normalize_single_quotes))
+        .normalize_whitespace(static_cast<bool>(normalize_whitespace))
+        .mixed_types_as_string(mixed_types_as_string)
+        .keep_quotes(keep_quotes);
 
     auto result =
-        std::make_unique<cudf::io::table_with_metadata>(cudf::io::read_json(opts.build()));
+      std::make_unique<cudf::io::table_with_metadata>(cudf::io::read_json(opts.build()));
 
     return reinterpret_cast<jlong>(result.release());
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_TableWithMeta_close(JNIEnv *env, jclass, jlong handle) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_TableWithMeta_close(JNIEnv* env, jclass, jlong handle)
+{
   JNI_NULL_CHECK(env, handle, "handle is null", );
 
   try {
     cudf::jni::auto_set_device(env);
-    delete reinterpret_cast<cudf::io::table_with_metadata *>(handle);
+    delete reinterpret_cast<cudf::io::table_with_metadata*>(handle);
   }
   CATCH_STD(env, );
 }
 
-JNIEXPORT jintArray JNICALL
-Java_ai_rapids_cudf_TableWithMeta_getFlattenedChildCounts(JNIEnv *env, jclass, jlong handle) {
+JNIEXPORT jintArray JNICALL Java_ai_rapids_cudf_TableWithMeta_getFlattenedChildCounts(JNIEnv* env,
+                                                                                      jclass,
+                                                                                      jlong handle)
+{
   JNI_NULL_CHECK(env, handle, "handle is null", nullptr);
 
   try {
     cudf::jni::auto_set_device(env);
-    auto ptr = reinterpret_cast<cudf::io::table_with_metadata *>(handle);
+    auto ptr = reinterpret_cast<cudf::io::table_with_metadata*>(handle);
     std::vector<int> counts;
     counts.push_back(ptr->metadata.schema_info.size());
-    for (cudf::io::column_name_info const &child : ptr->metadata.schema_info) {
+    for (cudf::io::column_name_info const& child : ptr->metadata.schema_info) {
       cudf::jni::append_flattened_child_counts(child, counts);
     }
 
@@ -1532,21 +1720,22 @@ Java_ai_rapids_cudf_TableWithMeta_getFlattenedChildCounts(JNIEnv *env, jclass, j
 }
 
 JNIEXPORT jobjectArray JNICALL
-Java_ai_rapids_cudf_TableWithMeta_getFlattenedColumnNames(JNIEnv *env, jclass, jlong handle) {
+Java_ai_rapids_cudf_TableWithMeta_getFlattenedColumnNames(JNIEnv* env, jclass, jlong handle)
+{
   JNI_NULL_CHECK(env, handle, "handle is null", nullptr);
 
   try {
     cudf::jni::auto_set_device(env);
-    auto ptr = reinterpret_cast<cudf::io::table_with_metadata *>(handle);
+    auto ptr = reinterpret_cast<cudf::io::table_with_metadata*>(handle);
     std::vector<std::string> names;
     names.push_back("ROOT");
-    for (cudf::io::column_name_info const &child : ptr->metadata.schema_info) {
+    for (cudf::io::column_name_info const& child : ptr->metadata.schema_info) {
       cudf::jni::append_flattened_child_names(child, names);
     }
 
     auto length = names.size();
-    auto ret = static_cast<jobjectArray>(
-        env->NewObjectArray(length, env->FindClass("java/lang/String"), nullptr));
+    auto ret    = static_cast<jobjectArray>(
+      env->NewObjectArray(length, env->FindClass("java/lang/String"), nullptr));
     for (size_t i = 0; i < length; i++) {
       env->SetObjectArrayElement(ret, i, env->NewStringUTF(names[i].c_str()));
     }
@@ -1556,13 +1745,15 @@ Java_ai_rapids_cudf_TableWithMeta_getFlattenedColumnNames(JNIEnv *env, jclass, j
   CATCH_STD(env, nullptr);
 }
 
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_TableWithMeta_releaseTable(JNIEnv *env, jclass,
-                                                                            jlong handle) {
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_TableWithMeta_releaseTable(JNIEnv* env,
+                                                                            jclass,
+                                                                            jlong handle)
+{
   JNI_NULL_CHECK(env, handle, "handle is null", nullptr);
 
   try {
     cudf::jni::auto_set_device(env);
-    auto ptr = reinterpret_cast<cudf::io::table_with_metadata *>(handle);
+    auto ptr = reinterpret_cast<cudf::io::table_with_metadata*>(handle);
     if (ptr->tbl) {
       return convert_table_for_return(env, ptr->tbl);
     } else {
@@ -1572,12 +1763,22 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_TableWithMeta_releaseTable(JNIE
   CATCH_STD(env, nullptr);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource(
-    JNIEnv *env, jclass, jintArray j_num_children, jobjectArray col_names, jintArray j_types,
-    jintArray j_scales, jboolean day_first, jboolean lines, jboolean recover_with_null,
-    jboolean normalize_single_quotes, jboolean normalize_whitespace, jboolean mixed_types_as_string,
-    jboolean keep_quotes, jlong ds_handle) {
-
+JNIEXPORT jlong JNICALL
+Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env,
+                                                 jclass,
+                                                 jintArray j_num_children,
+                                                 jobjectArray col_names,
+                                                 jintArray j_types,
+                                                 jintArray j_scales,
+                                                 jboolean day_first,
+                                                 jboolean lines,
+                                                 jboolean recover_with_null,
+                                                 jboolean normalize_single_quotes,
+                                                 jboolean normalize_whitespace,
+                                                 jboolean mixed_types_as_string,
+                                                 jboolean keep_quotes,
+                                                 jlong ds_handle)
+{
   JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0);
 
   try {
@@ -1596,41 +1797,41 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource(
       JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "types and num children must match null", 0);
     }
 
-    auto ds = reinterpret_cast<cudf::io::datasource *>(ds_handle);
+    auto ds = reinterpret_cast<cudf::io::datasource*>(ds_handle);
     cudf::io::source_info source{ds};
 
     cudf::io::json_recovery_mode_t recovery_mode =
-        recover_with_null ? cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL :
-                            cudf::io::json_recovery_mode_t::FAIL;
+      recover_with_null ? cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL
+                        : cudf::io::json_recovery_mode_t::FAIL;
     cudf::io::json_reader_options_builder opts =
-        cudf::io::json_reader_options::builder(source)
-            .dayfirst(static_cast<bool>(day_first))
-            .lines(static_cast<bool>(lines))
-            .recovery_mode(recovery_mode)
-            .normalize_single_quotes(static_cast<bool>(normalize_single_quotes))
-            .normalize_whitespace(static_cast<bool>(normalize_whitespace))
-            .mixed_types_as_string(mixed_types_as_string)
-            .keep_quotes(keep_quotes);
+      cudf::io::json_reader_options::builder(source)
+        .dayfirst(static_cast<bool>(day_first))
+        .lines(static_cast<bool>(lines))
+        .recovery_mode(recovery_mode)
+        .normalize_single_quotes(static_cast<bool>(normalize_single_quotes))
+        .normalize_whitespace(static_cast<bool>(normalize_whitespace))
+        .mixed_types_as_string(mixed_types_as_string)
+        .keep_quotes(keep_quotes);
 
     if (!n_types.is_null()) {
       if (n_types.size() != n_scales.size()) {
         JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "types and scales must match size", 0);
       }
       if (n_col_names.size() != n_types.size()) {
-        JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "types and column names must match size",
-                      0);
+        JNI_THROW_NEW(
+          env, cudf::jni::ILLEGAL_ARG_CLASS, "types and column names must match size", 0);
       }
       if (n_children.size() != n_types.size()) {
-        JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "types and num children must match size",
-                      0);
+        JNI_THROW_NEW(
+          env, cudf::jni::ILLEGAL_ARG_CLASS, "types and num children must match size", 0);
       }
 
       std::map<std::string, cudf::io::schema_element> data_types;
       int at = 0;
       while (at < n_types.size()) {
         data_types.insert(std::pair{
-            n_col_names.get(at).get(),
-            cudf::jni::read_schema_element(at, n_children, n_col_names, n_types, n_scales)});
+          n_col_names.get(at).get(),
+          cudf::jni::read_schema_element(at, n_children, n_col_names, n_types, n_scales)});
       }
       opts.dtypes(data_types);
     } else {
@@ -1638,27 +1839,37 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSONFromDataSource(
     }
 
     auto result =
-        std::make_unique<cudf::io::table_with_metadata>(cudf::io::read_json(opts.build()));
+      std::make_unique<cudf::io::table_with_metadata>(cudf::io::read_json(opts.build()));
 
     return reinterpret_cast<jlong>(result.release());
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(
-    JNIEnv *env, jclass, jintArray j_num_children, jobjectArray col_names, jintArray j_types,
-    jintArray j_scales, jstring inputfilepath, jlong buffer, jlong buffer_length,
-    jboolean day_first, jboolean lines, jboolean recover_with_null,
-    jboolean normalize_single_quotes, jboolean normalize_whitespace, jboolean mixed_types_as_string,
-    jboolean keep_quotes) {
-
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env,
+                                                           jclass,
+                                                           jintArray j_num_children,
+                                                           jobjectArray col_names,
+                                                           jintArray j_types,
+                                                           jintArray j_scales,
+                                                           jstring inputfilepath,
+                                                           jlong buffer,
+                                                           jlong buffer_length,
+                                                           jboolean day_first,
+                                                           jboolean lines,
+                                                           jboolean recover_with_null,
+                                                           jboolean normalize_single_quotes,
+                                                           jboolean normalize_whitespace,
+                                                           jboolean mixed_types_as_string,
+                                                           jboolean keep_quotes)
+{
   bool read_buffer = true;
   if (buffer == 0) {
     JNI_NULL_CHECK(env, inputfilepath, "input file or buffer must be supplied", 0);
     read_buffer = false;
   } else if (inputfilepath != NULL) {
-    JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS,
-                  "cannot pass in both a buffer and an inputfilepath", 0);
+    JNI_THROW_NEW(
+      env, cudf::jni::ILLEGAL_ARG_CLASS, "cannot pass in both a buffer and an inputfilepath", 0);
   } else if (buffer_length <= 0) {
     JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "An empty buffer is not supported", 0);
   }
@@ -1684,42 +1895,42 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(
       JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "inputfilepath can't be empty", 0);
     }
 
-    auto source = read_buffer ? cudf::io::source_info{reinterpret_cast<char *>(buffer),
-                                                      static_cast<std::size_t>(buffer_length)} :
-                                cudf::io::source_info{filename.get()};
+    auto source = read_buffer ? cudf::io::source_info{reinterpret_cast<char*>(buffer),
+                                                      static_cast<std::size_t>(buffer_length)}
+                              : cudf::io::source_info{filename.get()};
 
     cudf::io::json_recovery_mode_t recovery_mode =
-        recover_with_null ? cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL :
-                            cudf::io::json_recovery_mode_t::FAIL;
+      recover_with_null ? cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL
+                        : cudf::io::json_recovery_mode_t::FAIL;
     cudf::io::json_reader_options_builder opts =
-        cudf::io::json_reader_options::builder(source)
-            .dayfirst(static_cast<bool>(day_first))
-            .lines(static_cast<bool>(lines))
-            .recovery_mode(recovery_mode)
-            .normalize_single_quotes(static_cast<bool>(normalize_single_quotes))
-            .normalize_whitespace(static_cast<bool>(normalize_whitespace))
-            .mixed_types_as_string(mixed_types_as_string)
-            .keep_quotes(keep_quotes);
+      cudf::io::json_reader_options::builder(source)
+        .dayfirst(static_cast<bool>(day_first))
+        .lines(static_cast<bool>(lines))
+        .recovery_mode(recovery_mode)
+        .normalize_single_quotes(static_cast<bool>(normalize_single_quotes))
+        .normalize_whitespace(static_cast<bool>(normalize_whitespace))
+        .mixed_types_as_string(mixed_types_as_string)
+        .keep_quotes(keep_quotes);
 
     if (!n_types.is_null()) {
       if (n_types.size() != n_scales.size()) {
         JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "types and scales must match size", 0);
       }
       if (n_col_names.size() != n_types.size()) {
-        JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "types and column names must match size",
-                      0);
+        JNI_THROW_NEW(
+          env, cudf::jni::ILLEGAL_ARG_CLASS, "types and column names must match size", 0);
       }
       if (n_children.size() != n_types.size()) {
-        JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "types and num children must match size",
-                      0);
+        JNI_THROW_NEW(
+          env, cudf::jni::ILLEGAL_ARG_CLASS, "types and num children must match size", 0);
       }
 
       std::map<std::string, cudf::io::schema_element> data_types;
       int at = 0;
       while (at < n_types.size()) {
         data_types.insert(std::pair{
-            n_col_names.get(at).get(),
-            cudf::jni::read_schema_element(at, n_children, n_col_names, n_types, n_scales)});
+          n_col_names.get(at).get(),
+          cudf::jni::read_schema_element(at, n_children, n_col_names, n_types, n_scales)});
       }
       opts.dtypes(data_types);
     } else {
@@ -1727,17 +1938,21 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(
     }
 
     auto result =
-        std::make_unique<cudf::io::table_with_metadata>(cudf::io::read_json(opts.build()));
+      std::make_unique<cudf::io::table_with_metadata>(cudf::io::read_json(opts.build()));
 
     return reinterpret_cast<jlong>(result.release());
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquetFromDataSource(
-    JNIEnv *env, jclass, jobjectArray filter_col_names, jbooleanArray j_col_binary_read, jint unit,
-    jlong ds_handle) {
-
+JNIEXPORT jlongArray JNICALL
+Java_ai_rapids_cudf_Table_readParquetFromDataSource(JNIEnv* env,
+                                                    jclass,
+                                                    jobjectArray filter_col_names,
+                                                    jbooleanArray j_col_binary_read,
+                                                    jint unit,
+                                                    jlong ds_handle)
+{
   JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0);
   JNI_NULL_CHECK(env, j_col_binary_read, "null col_binary_read", 0);
 
@@ -1747,7 +1962,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquetFromDataSource
     cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names);
     cudf::jni::native_jbooleanArray n_col_binary_read(env, j_col_binary_read);
 
-    auto ds = reinterpret_cast<cudf::io::datasource *>(ds_handle);
+    auto ds = reinterpret_cast<cudf::io::datasource*>(ds_handle);
     cudf::io::source_info source{ds};
 
     auto builder = cudf::io::parquet_reader_options::builder(source);
@@ -1756,26 +1971,31 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquetFromDataSource
     }
 
     cudf::io::parquet_reader_options opts =
-        builder.convert_strings_to_categories(false)
-            .timestamp_type(cudf::data_type(static_cast<cudf::type_id>(unit)))
-            .build();
+      builder.convert_strings_to_categories(false)
+        .timestamp_type(cudf::data_type(static_cast<cudf::type_id>(unit)))
+        .build();
     return convert_table_for_return(env, cudf::io::read_parquet(opts).tbl);
   }
   CATCH_STD(env, NULL);
 }
 
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet(
-    JNIEnv *env, jclass, jobjectArray filter_col_names, jbooleanArray j_col_binary_read,
-    jstring inputfilepath, jlong buffer, jlong buffer_length, jint unit) {
-
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet(JNIEnv* env,
+                                                                   jclass,
+                                                                   jobjectArray filter_col_names,
+                                                                   jbooleanArray j_col_binary_read,
+                                                                   jstring inputfilepath,
+                                                                   jlong buffer,
+                                                                   jlong buffer_length,
+                                                                   jint unit)
+{
   JNI_NULL_CHECK(env, j_col_binary_read, "null col_binary_read", 0);
   bool read_buffer = true;
   if (buffer == 0) {
     JNI_NULL_CHECK(env, inputfilepath, "input file or buffer must be supplied", NULL);
     read_buffer = false;
   } else if (inputfilepath != NULL) {
-    JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS,
-                  "cannot pass in both a buffer and an inputfilepath", NULL);
+    JNI_THROW_NEW(
+      env, cudf::jni::ILLEGAL_ARG_CLASS, "cannot pass in both a buffer and an inputfilepath", NULL);
   } else if (buffer_length <= 0) {
     JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "An empty buffer is not supported", NULL);
   }
@@ -1790,9 +2010,9 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet(
     cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names);
     cudf::jni::native_jbooleanArray n_col_binary_read(env, j_col_binary_read);
 
-    auto source = read_buffer ? cudf::io::source_info(reinterpret_cast<char *>(buffer),
-                                                      static_cast<std::size_t>(buffer_length)) :
-                                cudf::io::source_info(filename.get());
+    auto source = read_buffer ? cudf::io::source_info(reinterpret_cast<char*>(buffer),
+                                                      static_cast<std::size_t>(buffer_length))
+                              : cudf::io::source_info(filename.get());
 
     auto builder = cudf::io::parquet_reader_options::builder(source);
     if (n_filter_col_names.size() > 0) {
@@ -1800,17 +2020,17 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet(
     }
 
     cudf::io::parquet_reader_options opts =
-        builder.convert_strings_to_categories(false)
-            .timestamp_type(cudf::data_type(static_cast<cudf::type_id>(unit)))
-            .build();
+      builder.convert_strings_to_categories(false)
+        .timestamp_type(cudf::data_type(static_cast<cudf::type_id>(unit)))
+        .build();
     return convert_table_for_return(env, cudf::io::read_parquet(opts).tbl);
   }
   CATCH_STD(env, NULL);
 }
 
 JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readAvroFromDataSource(
-    JNIEnv *env, jclass, jobjectArray filter_col_names, jlong ds_handle) {
-
+  JNIEnv* env, jclass, jobjectArray filter_col_names, jlong ds_handle)
+{
   JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0);
 
   try {
@@ -1818,28 +2038,30 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readAvroFromDataSource(
 
     cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names);
 
-    auto ds = reinterpret_cast<cudf::io::datasource *>(ds_handle);
+    auto ds = reinterpret_cast<cudf::io::datasource*>(ds_handle);
     cudf::io::source_info source{ds};
 
     cudf::io::avro_reader_options opts = cudf::io::avro_reader_options::builder(source)
-                                             .columns(n_filter_col_names.as_cpp_vector())
-                                             .build();
+                                           .columns(n_filter_col_names.as_cpp_vector())
+                                           .build();
     return convert_table_for_return(env, cudf::io::read_avro(opts).tbl);
   }
   CATCH_STD(env, NULL);
 }
 
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readAvro(JNIEnv *env, jclass,
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readAvro(JNIEnv* env,
+                                                                jclass,
                                                                 jobjectArray filter_col_names,
-                                                                jstring inputfilepath, jlong buffer,
-                                                                jlong buffer_length) {
-
+                                                                jstring inputfilepath,
+                                                                jlong buffer,
+                                                                jlong buffer_length)
+{
   const bool read_buffer = (buffer != 0);
   if (!read_buffer) {
     JNI_NULL_CHECK(env, inputfilepath, "input file or buffer must be supplied", NULL);
   } else if (inputfilepath != NULL) {
-    JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS,
-                  "cannot pass in both a buffer and an inputfilepath", NULL);
+    JNI_THROW_NEW(
+      env, cudf::jni::ILLEGAL_ARG_CLASS, "cannot pass in both a buffer and an inputfilepath", NULL);
   } else if (buffer_length <= 0) {
     JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "An empty buffer is not supported", NULL);
   }
@@ -1853,24 +2075,38 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readAvro(JNIEnv *env, jcl
 
     cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names);
 
-    auto source = read_buffer ? cudf::io::source_info(reinterpret_cast<char *>(buffer),
-                                                      static_cast<std::size_t>(buffer_length)) :
-                                cudf::io::source_info(filename.get());
+    auto source = read_buffer ? cudf::io::source_info(reinterpret_cast<char*>(buffer),
+                                                      static_cast<std::size_t>(buffer_length))
+                              : cudf::io::source_info(filename.get());
 
     cudf::io::avro_reader_options opts = cudf::io::avro_reader_options::builder(source)
-                                             .columns(n_filter_col_names.as_cpp_vector())
-                                             .build();
+                                           .columns(n_filter_col_names.as_cpp_vector())
+                                           .build();
     return convert_table_for_return(env, cudf::io::read_avro(opts).tbl);
   }
   CATCH_STD(env, NULL);
 }
 
-JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetBufferBegin(
-    JNIEnv *env, jclass, jobjectArray j_col_names, jint j_num_children, jintArray j_children,
-    jbooleanArray j_col_nullability, jobjectArray j_metadata_keys, jobjectArray j_metadata_values,
-    jint j_compression, jint j_stats_freq, jbooleanArray j_isInt96, jintArray j_precisions,
-    jbooleanArray j_is_map, jbooleanArray j_is_binary, jbooleanArray j_hasParquetFieldIds,
-    jintArray j_parquetFieldIds, jobject consumer, jobject host_memory_allocator) {
+JNIEXPORT long JNICALL
+Java_ai_rapids_cudf_Table_writeParquetBufferBegin(JNIEnv* env,
+                                                  jclass,
+                                                  jobjectArray j_col_names,
+                                                  jint j_num_children,
+                                                  jintArray j_children,
+                                                  jbooleanArray j_col_nullability,
+                                                  jobjectArray j_metadata_keys,
+                                                  jobjectArray j_metadata_values,
+                                                  jint j_compression,
+                                                  jint j_stats_freq,
+                                                  jbooleanArray j_isInt96,
+                                                  jintArray j_precisions,
+                                                  jbooleanArray j_is_map,
+                                                  jbooleanArray j_is_binary,
+                                                  jbooleanArray j_hasParquetFieldIds,
+                                                  jintArray j_parquetFieldIds,
+                                                  jobject consumer,
+                                                  jobject host_memory_allocator)
+{
   JNI_NULL_CHECK(env, j_col_names, "null columns", 0);
   JNI_NULL_CHECK(env, j_col_nullability, "null nullability", 0);
   JNI_NULL_CHECK(env, j_metadata_keys, "null metadata keys", 0);
@@ -1878,23 +2114,34 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetBufferBegin(
   JNI_NULL_CHECK(env, consumer, "null consumer", 0);
   try {
     std::unique_ptr<cudf::jni::jni_writer_data_sink> data_sink(
-        new cudf::jni::jni_writer_data_sink(env, consumer, host_memory_allocator));
+      new cudf::jni::jni_writer_data_sink(env, consumer, host_memory_allocator));
 
     using namespace cudf::io;
     using namespace cudf::jni;
     sink_info sink{data_sink.get()};
     table_input_metadata metadata;
-    createTableMetaData(env, j_num_children, j_col_names, j_children, j_col_nullability, j_isInt96,
-                        j_precisions, j_is_map, metadata, j_hasParquetFieldIds, j_parquetFieldIds,
+    createTableMetaData(env,
+                        j_num_children,
+                        j_col_names,
+                        j_children,
+                        j_col_nullability,
+                        j_isInt96,
+                        j_precisions,
+                        j_is_map,
+                        metadata,
+                        j_hasParquetFieldIds,
+                        j_parquetFieldIds,
                         j_is_binary);
 
-    auto meta_keys = cudf::jni::native_jstringArray{env, j_metadata_keys}.as_cpp_vector();
+    auto meta_keys   = cudf::jni::native_jstringArray{env, j_metadata_keys}.as_cpp_vector();
     auto meta_values = cudf::jni::native_jstringArray{env, j_metadata_values}.as_cpp_vector();
 
     std::map<std::string, std::string> kv_metadata;
-    std::transform(meta_keys.begin(), meta_keys.end(), meta_values.begin(),
+    std::transform(meta_keys.begin(),
+                   meta_keys.end(),
+                   meta_values.begin(),
                    std::inserter(kv_metadata, kv_metadata.end()),
-                   [](auto const &key, auto const &value) {
+                   [](auto const& key, auto const& value) {
                      // The metadata value will be ignored if it is empty.
                      // We modify it into a space character to workaround such issue.
                      return std::make_pair(key, value.empty() ? std::string(" ") : value);
@@ -1902,27 +2149,40 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetBufferBegin(
 
     auto stats = std::make_shared<cudf::io::writer_compression_statistics>();
     chunked_parquet_writer_options opts =
-        chunked_parquet_writer_options::builder(sink)
-            .metadata(std::move(metadata))
-            .compression(static_cast<compression_type>(j_compression))
-            .stats_level(static_cast<statistics_freq>(j_stats_freq))
-            .key_value_metadata({kv_metadata})
-            .compression_statistics(stats)
-            .build();
+      chunked_parquet_writer_options::builder(sink)
+        .metadata(std::move(metadata))
+        .compression(static_cast<compression_type>(j_compression))
+        .stats_level(static_cast<statistics_freq>(j_stats_freq))
+        .key_value_metadata({kv_metadata})
+        .compression_statistics(stats)
+        .build();
     auto writer_ptr = std::make_unique<cudf::io::parquet_chunked_writer>(opts);
-    cudf::jni::native_parquet_writer_handle *ret = new cudf::jni::native_parquet_writer_handle(
-        std::move(writer_ptr), std::move(data_sink), std::move(stats));
+    cudf::jni::native_parquet_writer_handle* ret = new cudf::jni::native_parquet_writer_handle(
+      std::move(writer_ptr), std::move(data_sink), std::move(stats));
     return ptr_as_jlong(ret);
   }
   CATCH_STD(env, 0)
 }
 
-JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetFileBegin(
-    JNIEnv *env, jclass, jobjectArray j_col_names, jint j_num_children, jintArray j_children,
-    jbooleanArray j_col_nullability, jobjectArray j_metadata_keys, jobjectArray j_metadata_values,
-    jint j_compression, jint j_stats_freq, jbooleanArray j_isInt96, jintArray j_precisions,
-    jbooleanArray j_is_map, jbooleanArray j_is_binary, jbooleanArray j_hasParquetFieldIds,
-    jintArray j_parquetFieldIds, jstring j_output_path) {
+JNIEXPORT long JNICALL
+Java_ai_rapids_cudf_Table_writeParquetFileBegin(JNIEnv* env,
+                                                jclass,
+                                                jobjectArray j_col_names,
+                                                jint j_num_children,
+                                                jintArray j_children,
+                                                jbooleanArray j_col_nullability,
+                                                jobjectArray j_metadata_keys,
+                                                jobjectArray j_metadata_values,
+                                                jint j_compression,
+                                                jint j_stats_freq,
+                                                jbooleanArray j_isInt96,
+                                                jintArray j_precisions,
+                                                jbooleanArray j_is_map,
+                                                jbooleanArray j_is_binary,
+                                                jbooleanArray j_hasParquetFieldIds,
+                                                jintArray j_parquetFieldIds,
+                                                jstring j_output_path)
+{
   JNI_NULL_CHECK(env, j_col_names, "null columns", 0);
   JNI_NULL_CHECK(env, j_col_nullability, "null nullability", 0);
   JNI_NULL_CHECK(env, j_metadata_keys, "null metadata keys", 0);
@@ -1934,17 +2194,28 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetFileBegin(
     using namespace cudf::io;
     using namespace cudf::jni;
     table_input_metadata metadata;
-    createTableMetaData(env, j_num_children, j_col_names, j_children, j_col_nullability, j_isInt96,
-                        j_precisions, j_is_map, metadata, j_hasParquetFieldIds, j_parquetFieldIds,
+    createTableMetaData(env,
+                        j_num_children,
+                        j_col_names,
+                        j_children,
+                        j_col_nullability,
+                        j_isInt96,
+                        j_precisions,
+                        j_is_map,
+                        metadata,
+                        j_hasParquetFieldIds,
+                        j_parquetFieldIds,
                         j_is_binary);
 
-    auto meta_keys = cudf::jni::native_jstringArray{env, j_metadata_keys}.as_cpp_vector();
+    auto meta_keys   = cudf::jni::native_jstringArray{env, j_metadata_keys}.as_cpp_vector();
     auto meta_values = cudf::jni::native_jstringArray{env, j_metadata_values}.as_cpp_vector();
 
     std::map<std::string, std::string> kv_metadata;
-    std::transform(meta_keys.begin(), meta_keys.end(), meta_values.begin(),
+    std::transform(meta_keys.begin(),
+                   meta_keys.end(),
+                   meta_values.begin(),
                    std::inserter(kv_metadata, kv_metadata.end()),
-                   [](auto const &key, auto const &value) {
+                   [](auto const& key, auto const& value) {
                      // The metadata value will be ignored if it is empty.
                      // We modify it into a space character to workaround such issue.
                      return std::make_pair(key, value.empty() ? std::string(" ") : value);
@@ -1953,33 +2224,33 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeParquetFileBegin(
     sink_info sink{output_path.get()};
     auto stats = std::make_shared<cudf::io::writer_compression_statistics>();
     chunked_parquet_writer_options opts =
-        chunked_parquet_writer_options::builder(sink)
-            .metadata(std::move(metadata))
-            .compression(static_cast<compression_type>(j_compression))
-            .stats_level(static_cast<statistics_freq>(j_stats_freq))
-            .key_value_metadata({kv_metadata})
-            .compression_statistics(stats)
-            .build();
+      chunked_parquet_writer_options::builder(sink)
+        .metadata(std::move(metadata))
+        .compression(static_cast<compression_type>(j_compression))
+        .stats_level(static_cast<statistics_freq>(j_stats_freq))
+        .key_value_metadata({kv_metadata})
+        .compression_statistics(stats)
+        .build();
 
     auto writer_ptr = std::make_unique<cudf::io::parquet_chunked_writer>(opts);
-    cudf::jni::native_parquet_writer_handle *ret = new cudf::jni::native_parquet_writer_handle(
-        std::move(writer_ptr), nullptr, std::move(stats));
+    cudf::jni::native_parquet_writer_handle* ret =
+      new cudf::jni::native_parquet_writer_handle(std::move(writer_ptr), nullptr, std::move(stats));
     return ptr_as_jlong(ret);
   }
   CATCH_STD(env, 0)
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeParquetChunk(JNIEnv *env, jclass,
-                                                                   jlong j_state, jlong j_table,
-                                                                   jlong mem_size) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeParquetChunk(
+  JNIEnv* env, jclass, jlong j_state, jlong j_table, jlong mem_size)
+{
   JNI_NULL_CHECK(env, j_table, "null table", );
   JNI_NULL_CHECK(env, j_state, "null state", );
 
   using namespace cudf::io;
-  cudf::table_view *tview_with_empty_nullmask = reinterpret_cast<cudf::table_view *>(j_table);
+  cudf::table_view* tview_with_empty_nullmask = reinterpret_cast<cudf::table_view*>(j_table);
   cudf::table_view tview = cudf::jni::remove_validity_if_needed(tview_with_empty_nullmask);
-  cudf::jni::native_parquet_writer_handle *state =
-      reinterpret_cast<cudf::jni::native_parquet_writer_handle *>(j_state);
+  cudf::jni::native_parquet_writer_handle* state =
+    reinterpret_cast<cudf::jni::native_parquet_writer_handle*>(j_state);
 
   if (state->sink) {
     long alloc_size = std::max(cudf::jni::MINIMUM_WRITE_BUFFER_SIZE, mem_size / 2);
@@ -1992,13 +2263,13 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeParquetChunk(JNIEnv *env,
   CATCH_STD(env, )
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeParquetEnd(JNIEnv *env, jclass,
-                                                                 jlong j_state) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeParquetEnd(JNIEnv* env, jclass, jlong j_state)
+{
   JNI_NULL_CHECK(env, j_state, "null state", );
 
   using namespace cudf::io;
-  cudf::jni::native_parquet_writer_handle *state =
-      reinterpret_cast<cudf::jni::native_parquet_writer_handle *>(j_state);
+  cudf::jni::native_parquet_writer_handle* state =
+    reinterpret_cast<cudf::jni::native_parquet_writer_handle*>(j_state);
   std::unique_ptr<cudf::jni::native_parquet_writer_handle> make_sure_we_delete(state);
   try {
     cudf::jni::auto_set_device(env);
@@ -2007,10 +2278,15 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeParquetEnd(JNIEnv *env, jc
   CATCH_STD(env, )
 }
 
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readORCFromDataSource(
-    JNIEnv *env, jclass, jobjectArray filter_col_names, jboolean usingNumPyTypes, jint unit,
-    jobjectArray dec128_col_names, jlong ds_handle) {
-
+JNIEXPORT jlongArray JNICALL
+Java_ai_rapids_cudf_Table_readORCFromDataSource(JNIEnv* env,
+                                                jclass,
+                                                jobjectArray filter_col_names,
+                                                jboolean usingNumPyTypes,
+                                                jint unit,
+                                                jobjectArray dec128_col_names,
+                                                jlong ds_handle)
+{
   JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0);
 
   try {
@@ -2020,7 +2296,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readORCFromDataSource(
 
     cudf::jni::native_jstringArray n_dec128_col_names(env, dec128_col_names);
 
-    auto ds = reinterpret_cast<cudf::io::datasource *>(ds_handle);
+    auto ds = reinterpret_cast<cudf::io::datasource*>(ds_handle);
     cudf::io::source_info source{ds};
 
     auto builder = cudf::io::orc_reader_options::builder(source);
@@ -2029,26 +2305,33 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readORCFromDataSource(
     }
 
     cudf::io::orc_reader_options opts =
-        builder.use_index(false)
-            .use_np_dtypes(static_cast<bool>(usingNumPyTypes))
-            .timestamp_type(cudf::data_type(static_cast<cudf::type_id>(unit)))
-            .decimal128_columns(n_dec128_col_names.as_cpp_vector())
-            .build();
+      builder.use_index(false)
+        .use_np_dtypes(static_cast<bool>(usingNumPyTypes))
+        .timestamp_type(cudf::data_type(static_cast<cudf::type_id>(unit)))
+        .decimal128_columns(n_dec128_col_names.as_cpp_vector())
+        .build();
     return convert_table_for_return(env, cudf::io::read_orc(opts).tbl);
   }
   CATCH_STD(env, NULL);
 }
 
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readORC(
-    JNIEnv *env, jclass, jobjectArray filter_col_names, jstring inputfilepath, jlong buffer,
-    jlong buffer_length, jboolean usingNumPyTypes, jint unit, jobjectArray dec128_col_names) {
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readORC(JNIEnv* env,
+                                                               jclass,
+                                                               jobjectArray filter_col_names,
+                                                               jstring inputfilepath,
+                                                               jlong buffer,
+                                                               jlong buffer_length,
+                                                               jboolean usingNumPyTypes,
+                                                               jint unit,
+                                                               jobjectArray dec128_col_names)
+{
   bool read_buffer = true;
   if (buffer == 0) {
     JNI_NULL_CHECK(env, inputfilepath, "input file or buffer must be supplied", NULL);
     read_buffer = false;
   } else if (inputfilepath != NULL) {
-    JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS,
-                  "cannot pass in both a buffer and an inputfilepath", NULL);
+    JNI_THROW_NEW(
+      env, cudf::jni::ILLEGAL_ARG_CLASS, "cannot pass in both a buffer and an inputfilepath", NULL);
   } else if (buffer_length <= 0) {
     JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "An empty buffer is not supported", NULL);
   }
@@ -2064,9 +2347,9 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readORC(
 
     cudf::jni::native_jstringArray n_dec128_col_names(env, dec128_col_names);
 
-    auto source = read_buffer ?
-                      cudf::io::source_info(reinterpret_cast<char *>(buffer), buffer_length) :
-                      cudf::io::source_info(filename.get());
+    auto source = read_buffer
+                    ? cudf::io::source_info(reinterpret_cast<char*>(buffer), buffer_length)
+                    : cudf::io::source_info(filename.get());
 
     auto builder = cudf::io::orc_reader_options::builder(source);
     if (n_filter_col_names.size() > 0) {
@@ -2074,21 +2357,31 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readORC(
     }
 
     cudf::io::orc_reader_options opts =
-        builder.use_index(false)
-            .use_np_dtypes(static_cast<bool>(usingNumPyTypes))
-            .timestamp_type(cudf::data_type(static_cast<cudf::type_id>(unit)))
-            .decimal128_columns(n_dec128_col_names.as_cpp_vector())
-            .build();
+      builder.use_index(false)
+        .use_np_dtypes(static_cast<bool>(usingNumPyTypes))
+        .timestamp_type(cudf::data_type(static_cast<cudf::type_id>(unit)))
+        .decimal128_columns(n_dec128_col_names.as_cpp_vector())
+        .build();
     return convert_table_for_return(env, cudf::io::read_orc(opts).tbl);
   }
   CATCH_STD(env, NULL);
 }
 
-JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeORCBufferBegin(
-    JNIEnv *env, jclass, jobjectArray j_col_names, jint j_num_children, jintArray j_children,
-    jbooleanArray j_col_nullability, jobjectArray j_metadata_keys, jobjectArray j_metadata_values,
-    jint j_compression, jintArray j_precisions, jbooleanArray j_is_map, jobject consumer,
-    jobject host_memory_allocator) {
+JNIEXPORT long JNICALL
+Java_ai_rapids_cudf_Table_writeORCBufferBegin(JNIEnv* env,
+                                              jclass,
+                                              jobjectArray j_col_names,
+                                              jint j_num_children,
+                                              jintArray j_children,
+                                              jbooleanArray j_col_nullability,
+                                              jobjectArray j_metadata_keys,
+                                              jobjectArray j_metadata_values,
+                                              jint j_compression,
+                                              jintArray j_precisions,
+                                              jbooleanArray j_is_map,
+                                              jobject consumer,
+                                              jobject host_memory_allocator)
+{
   JNI_NULL_CHECK(env, j_col_names, "null columns", 0);
   JNI_NULL_CHECK(env, j_col_nullability, "null nullability", 0);
   JNI_NULL_CHECK(env, j_metadata_keys, "null metadata keys", 0);
@@ -2103,46 +2396,66 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeORCBufferBegin(
     jbooleanArray j_is_int96 = NULL;
     // ORC has no `j_parquetFieldIds`, but `createTableMetaData` needs a lvalue.
     jbooleanArray j_hasParquetFieldIds = NULL;
-    jintArray j_parquetFieldIds = NULL;
+    jintArray j_parquetFieldIds        = NULL;
     // temp stub
     jbooleanArray j_is_binary = NULL;
 
-    createTableMetaData(env, j_num_children, j_col_names, j_children, j_col_nullability, j_is_int96,
-                        j_precisions, j_is_map, metadata, j_hasParquetFieldIds, j_parquetFieldIds,
+    createTableMetaData(env,
+                        j_num_children,
+                        j_col_names,
+                        j_children,
+                        j_col_nullability,
+                        j_is_int96,
+                        j_precisions,
+                        j_is_map,
+                        metadata,
+                        j_hasParquetFieldIds,
+                        j_parquetFieldIds,
                         j_is_binary);
 
-    auto meta_keys = cudf::jni::native_jstringArray{env, j_metadata_keys}.as_cpp_vector();
+    auto meta_keys   = cudf::jni::native_jstringArray{env, j_metadata_keys}.as_cpp_vector();
     auto meta_values = cudf::jni::native_jstringArray{env, j_metadata_values}.as_cpp_vector();
 
     std::map<std::string, std::string> kv_metadata;
-    std::transform(meta_keys.begin(), meta_keys.end(), meta_values.begin(),
+    std::transform(meta_keys.begin(),
+                   meta_keys.end(),
+                   meta_values.begin(),
                    std::inserter(kv_metadata, kv_metadata.end()),
-                   [](const std::string &k, const std::string &v) { return std::make_pair(k, v); });
+                   [](const std::string& k, const std::string& v) { return std::make_pair(k, v); });
 
     std::unique_ptr<cudf::jni::jni_writer_data_sink> data_sink(
-        new cudf::jni::jni_writer_data_sink(env, consumer, host_memory_allocator));
+      new cudf::jni::jni_writer_data_sink(env, consumer, host_memory_allocator));
     sink_info sink{data_sink.get()};
 
-    auto stats = std::make_shared<cudf::io::writer_compression_statistics>();
+    auto stats                      = std::make_shared<cudf::io::writer_compression_statistics>();
     chunked_orc_writer_options opts = chunked_orc_writer_options::builder(sink)
-                                          .metadata(std::move(metadata))
-                                          .compression(static_cast<compression_type>(j_compression))
-                                          .enable_statistics(ORC_STATISTICS_ROW_GROUP)
-                                          .key_value_metadata(kv_metadata)
-                                          .compression_statistics(stats)
-                                          .build();
-    auto writer_ptr = std::make_unique<cudf::io::orc_chunked_writer>(opts);
-    cudf::jni::native_orc_writer_handle *ret = new cudf::jni::native_orc_writer_handle(
-        std::move(writer_ptr), std::move(data_sink), std::move(stats));
+                                        .metadata(std::move(metadata))
+                                        .compression(static_cast<compression_type>(j_compression))
+                                        .enable_statistics(ORC_STATISTICS_ROW_GROUP)
+                                        .key_value_metadata(kv_metadata)
+                                        .compression_statistics(stats)
+                                        .build();
+    auto writer_ptr                          = std::make_unique<cudf::io::orc_chunked_writer>(opts);
+    cudf::jni::native_orc_writer_handle* ret = new cudf::jni::native_orc_writer_handle(
+      std::move(writer_ptr), std::move(data_sink), std::move(stats));
     return ptr_as_jlong(ret);
   }
   CATCH_STD(env, 0)
 }
 
-JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeORCFileBegin(
-    JNIEnv *env, jclass, jobjectArray j_col_names, jint j_num_children, jintArray j_children,
-    jbooleanArray j_col_nullability, jobjectArray j_metadata_keys, jobjectArray j_metadata_values,
-    jint j_compression, jintArray j_precisions, jbooleanArray j_is_map, jstring j_output_path) {
+JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeORCFileBegin(JNIEnv* env,
+                                                                   jclass,
+                                                                   jobjectArray j_col_names,
+                                                                   jint j_num_children,
+                                                                   jintArray j_children,
+                                                                   jbooleanArray j_col_nullability,
+                                                                   jobjectArray j_metadata_keys,
+                                                                   jobjectArray j_metadata_values,
+                                                                   jint j_compression,
+                                                                   jintArray j_precisions,
+                                                                   jbooleanArray j_is_map,
+                                                                   jstring j_output_path)
+{
   JNI_NULL_CHECK(env, j_col_names, "null columns", 0);
   JNI_NULL_CHECK(env, j_col_nullability, "null nullability", 0);
   JNI_NULL_CHECK(env, j_metadata_keys, "null metadata keys", 0);
@@ -2158,48 +2471,60 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeORCFileBegin(
     jbooleanArray j_is_int96 = NULL;
     // ORC has no `j_parquetFieldIds`, but `createTableMetaData` needs a lvalue.
     jbooleanArray j_hasParquetFieldIds = NULL;
-    jintArray j_parquetFieldIds = NULL;
+    jintArray j_parquetFieldIds        = NULL;
     // temp stub
     jbooleanArray j_is_binary = NULL;
-    createTableMetaData(env, j_num_children, j_col_names, j_children, j_col_nullability, j_is_int96,
-                        j_precisions, j_is_map, metadata, j_hasParquetFieldIds, j_parquetFieldIds,
+    createTableMetaData(env,
+                        j_num_children,
+                        j_col_names,
+                        j_children,
+                        j_col_nullability,
+                        j_is_int96,
+                        j_precisions,
+                        j_is_map,
+                        metadata,
+                        j_hasParquetFieldIds,
+                        j_parquetFieldIds,
                         j_is_binary);
 
-    auto meta_keys = cudf::jni::native_jstringArray{env, j_metadata_keys}.as_cpp_vector();
+    auto meta_keys   = cudf::jni::native_jstringArray{env, j_metadata_keys}.as_cpp_vector();
     auto meta_values = cudf::jni::native_jstringArray{env, j_metadata_values}.as_cpp_vector();
 
     std::map<std::string, std::string> kv_metadata;
-    std::transform(meta_keys.begin(), meta_keys.end(), meta_values.begin(),
+    std::transform(meta_keys.begin(),
+                   meta_keys.end(),
+                   meta_values.begin(),
                    std::inserter(kv_metadata, kv_metadata.end()),
-                   [](const std::string &k, const std::string &v) { return std::make_pair(k, v); });
+                   [](const std::string& k, const std::string& v) { return std::make_pair(k, v); });
 
     sink_info sink{output_path.get()};
-    auto stats = std::make_shared<cudf::io::writer_compression_statistics>();
+    auto stats                      = std::make_shared<cudf::io::writer_compression_statistics>();
     chunked_orc_writer_options opts = chunked_orc_writer_options::builder(sink)
-                                          .metadata(std::move(metadata))
-                                          .compression(static_cast<compression_type>(j_compression))
-                                          .enable_statistics(ORC_STATISTICS_ROW_GROUP)
-                                          .key_value_metadata(kv_metadata)
-                                          .compression_statistics(stats)
-                                          .build();
+                                        .metadata(std::move(metadata))
+                                        .compression(static_cast<compression_type>(j_compression))
+                                        .enable_statistics(ORC_STATISTICS_ROW_GROUP)
+                                        .key_value_metadata(kv_metadata)
+                                        .compression_statistics(stats)
+                                        .build();
     auto writer_ptr = std::make_unique<cudf::io::orc_chunked_writer>(opts);
-    cudf::jni::native_orc_writer_handle *ret =
-        new cudf::jni::native_orc_writer_handle(std::move(writer_ptr), nullptr, std::move(stats));
+    cudf::jni::native_orc_writer_handle* ret =
+      new cudf::jni::native_orc_writer_handle(std::move(writer_ptr), nullptr, std::move(stats));
     return ptr_as_jlong(ret);
   }
   CATCH_STD(env, 0)
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeORCChunk(JNIEnv *env, jclass, jlong j_state,
-                                                               jlong j_table, jlong mem_size) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeORCChunk(
+  JNIEnv* env, jclass, jlong j_state, jlong j_table, jlong mem_size)
+{
   JNI_NULL_CHECK(env, j_table, "null table", );
   JNI_NULL_CHECK(env, j_state, "null state", );
 
   using namespace cudf::io;
-  cudf::table_view *tview_orig = reinterpret_cast<cudf::table_view *>(j_table);
-  cudf::table_view tview = cudf::jni::remove_validity_if_needed(tview_orig);
-  cudf::jni::native_orc_writer_handle *state =
-      reinterpret_cast<cudf::jni::native_orc_writer_handle *>(j_state);
+  cudf::table_view* tview_orig = reinterpret_cast<cudf::table_view*>(j_table);
+  cudf::table_view tview       = cudf::jni::remove_validity_if_needed(tview_orig);
+  cudf::jni::native_orc_writer_handle* state =
+    reinterpret_cast<cudf::jni::native_orc_writer_handle*>(j_state);
 
   if (state->sink) {
     long alloc_size = std::max(cudf::jni::MINIMUM_WRITE_BUFFER_SIZE, mem_size / 2);
@@ -2212,12 +2537,13 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeORCChunk(JNIEnv *env, jcla
   CATCH_STD(env, )
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeORCEnd(JNIEnv *env, jclass, jlong j_state) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeORCEnd(JNIEnv* env, jclass, jlong j_state)
+{
   JNI_NULL_CHECK(env, j_state, "null state", );
 
   using namespace cudf::io;
-  cudf::jni::native_orc_writer_handle *state =
-      reinterpret_cast<cudf::jni::native_orc_writer_handle *>(j_state);
+  cudf::jni::native_orc_writer_handle* state =
+    reinterpret_cast<cudf::jni::native_orc_writer_handle*>(j_state);
   std::unique_ptr<cudf::jni::native_orc_writer_handle> make_sure_we_delete(state);
   try {
     cudf::jni::auto_set_device(env);
@@ -2226,25 +2552,24 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeORCEnd(JNIEnv *env, jclass
   CATCH_STD(env, )
 }
 
-JNIEXPORT jdoubleArray JNICALL Java_ai_rapids_cudf_TableWriter_getWriteStatistics(JNIEnv *env,
+JNIEXPORT jdoubleArray JNICALL Java_ai_rapids_cudf_TableWriter_getWriteStatistics(JNIEnv* env,
                                                                                   jclass,
-                                                                                  jlong j_state) {
+                                                                                  jlong j_state)
+{
   JNI_NULL_CHECK(env, j_state, "null state", nullptr);
 
   using namespace cudf::io;
-  auto const state = reinterpret_cast<cudf::jni::jni_table_writer_handle_base const *>(j_state);
+  auto const state = reinterpret_cast<cudf::jni::jni_table_writer_handle_base const*>(j_state);
   try {
     cudf::jni::auto_set_device(env);
-    if (!state->stats) {
-      return nullptr;
-    }
+    if (!state->stats) { return nullptr; }
 
-    auto const &stats = *state->stats;
-    auto output = cudf::jni::native_jdoubleArray(env, 4);
-    output[0] = static_cast<jdouble>(stats.num_compressed_bytes());
-    output[1] = static_cast<jdouble>(stats.num_failed_bytes());
-    output[2] = static_cast<jdouble>(stats.num_skipped_bytes());
-    output[3] = static_cast<jdouble>(stats.compression_ratio());
+    auto const& stats = *state->stats;
+    auto output       = cudf::jni::native_jdoubleArray(env, 4);
+    output[0]         = static_cast<jdouble>(stats.num_compressed_bytes());
+    output[1]         = static_cast<jdouble>(stats.num_failed_bytes());
+    output[2]         = static_cast<jdouble>(stats.num_skipped_bytes());
+    output[3]         = static_cast<jdouble>(stats.compression_ratio());
 
     return output.get_jArray();
   }
@@ -2252,8 +2577,8 @@ JNIEXPORT jdoubleArray JNICALL Java_ai_rapids_cudf_TableWriter_getWriteStatistic
 }
 
 JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeArrowIPCBufferBegin(
-    JNIEnv *env, jclass, jobjectArray j_col_names, jobject consumer,
-    jobject host_memory_allocator) {
+  JNIEnv* env, jclass, jobjectArray j_col_names, jobject consumer, jobject host_memory_allocator)
+{
   JNI_NULL_CHECK(env, j_col_names, "null columns", 0);
   JNI_NULL_CHECK(env, consumer, "null consumer", 0);
   try {
@@ -2261,18 +2586,20 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeArrowIPCBufferBegin(
     cudf::jni::native_jstringArray col_names(env, j_col_names);
 
     std::shared_ptr<cudf::jni::jni_arrow_output_stream> data_sink(
-        new cudf::jni::jni_arrow_output_stream(env, consumer, host_memory_allocator));
+      new cudf::jni::jni_arrow_output_stream(env, consumer, host_memory_allocator));
 
-    cudf::jni::native_arrow_ipc_writer_handle *ret =
-        new cudf::jni::native_arrow_ipc_writer_handle(col_names.as_cpp_vector(), data_sink);
+    cudf::jni::native_arrow_ipc_writer_handle* ret =
+      new cudf::jni::native_arrow_ipc_writer_handle(col_names.as_cpp_vector(), data_sink);
     return ptr_as_jlong(ret);
   }
   CATCH_STD(env, 0)
 }
 
-JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeArrowIPCFileBegin(JNIEnv *env, jclass,
+JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeArrowIPCFileBegin(JNIEnv* env,
+                                                                        jclass,
                                                                         jobjectArray j_col_names,
-                                                                        jstring j_output_path) {
+                                                                        jstring j_output_path)
+{
   JNI_NULL_CHECK(env, j_col_names, "null columns", 0);
   JNI_NULL_CHECK(env, j_output_path, "null output path", 0);
   try {
@@ -2280,22 +2607,24 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeArrowIPCFileBegin(JNIEnv *
     cudf::jni::native_jstringArray col_names(env, j_col_names);
     cudf::jni::native_jstring output_path(env, j_output_path);
 
-    cudf::jni::native_arrow_ipc_writer_handle *ret =
-        new cudf::jni::native_arrow_ipc_writer_handle(col_names.as_cpp_vector(), output_path.get());
+    cudf::jni::native_arrow_ipc_writer_handle* ret =
+      new cudf::jni::native_arrow_ipc_writer_handle(col_names.as_cpp_vector(), output_path.get());
     return ptr_as_jlong(ret);
   }
   CATCH_STD(env, 0)
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_convertCudfToArrowTable(JNIEnv *env, jclass,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_convertCudfToArrowTable(JNIEnv* env,
+                                                                          jclass,
                                                                           jlong j_state,
-                                                                          jlong j_table) {
+                                                                          jlong j_table)
+{
   JNI_NULL_CHECK(env, j_table, "null table", 0);
   JNI_NULL_CHECK(env, j_state, "null state", 0);
 
-  cudf::table_view *tview = reinterpret_cast<cudf::table_view *>(j_table);
-  cudf::jni::native_arrow_ipc_writer_handle *state =
-      reinterpret_cast<cudf::jni::native_arrow_ipc_writer_handle *>(j_state);
+  cudf::table_view* tview = reinterpret_cast<cudf::table_view*>(j_table);
+  cudf::jni::native_arrow_ipc_writer_handle* state =
+    reinterpret_cast<cudf::jni::native_arrow_ipc_writer_handle*>(j_state);
 
   try {
     cudf::jni::auto_set_device(env);
@@ -2311,17 +2640,16 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_convertCudfToArrowTable(JNIEnv
   CATCH_STD(env, 0)
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeArrowIPCArrowChunk(JNIEnv *env, jclass,
-                                                                         jlong j_state,
-                                                                         jlong arrow_table_handle,
-                                                                         jlong max_chunk) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeArrowIPCArrowChunk(
+  JNIEnv* env, jclass, jlong j_state, jlong arrow_table_handle, jlong max_chunk)
+{
   JNI_NULL_CHECK(env, arrow_table_handle, "null arrow table", );
   JNI_NULL_CHECK(env, j_state, "null state", );
 
-  std::shared_ptr<arrow::Table> *handle =
-      reinterpret_cast<std::shared_ptr<arrow::Table> *>(arrow_table_handle);
-  cudf::jni::native_arrow_ipc_writer_handle *state =
-      reinterpret_cast<cudf::jni::native_arrow_ipc_writer_handle *>(j_state);
+  std::shared_ptr<arrow::Table>* handle =
+    reinterpret_cast<std::shared_ptr<arrow::Table>*>(arrow_table_handle);
+  cudf::jni::native_arrow_ipc_writer_handle* state =
+    reinterpret_cast<cudf::jni::native_arrow_ipc_writer_handle*>(j_state);
 
   try {
     cudf::jni::auto_set_device(env);
@@ -2330,12 +2658,14 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeArrowIPCArrowChunk(JNIEnv
   CATCH_STD(env, )
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeArrowIPCEnd(JNIEnv *env, jclass,
-                                                                  jlong j_state) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeArrowIPCEnd(JNIEnv* env,
+                                                                  jclass,
+                                                                  jlong j_state)
+{
   JNI_NULL_CHECK(env, j_state, "null state", );
 
-  cudf::jni::native_arrow_ipc_writer_handle *state =
-      reinterpret_cast<cudf::jni::native_arrow_ipc_writer_handle *>(j_state);
+  cudf::jni::native_arrow_ipc_writer_handle* state =
+    reinterpret_cast<cudf::jni::native_arrow_ipc_writer_handle*>(j_state);
   std::unique_ptr<cudf::jni::native_arrow_ipc_writer_handle> make_sure_we_delete(state);
   try {
     cudf::jni::auto_set_device(env);
@@ -2344,8 +2674,10 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_writeArrowIPCEnd(JNIEnv *env, j
   CATCH_STD(env, )
 }
 
-JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_readArrowIPCFileBegin(JNIEnv *env, jclass,
-                                                                       jstring j_input_path) {
+JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_readArrowIPCFileBegin(JNIEnv* env,
+                                                                       jclass,
+                                                                       jstring j_input_path)
+{
   JNI_NULL_CHECK(env, j_input_path, "null input path", 0);
   try {
     cudf::jni::auto_set_device(env);
@@ -2355,25 +2687,29 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_readArrowIPCFileBegin(JNIEnv *e
   CATCH_STD(env, 0)
 }
 
-JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_readArrowIPCBufferBegin(JNIEnv *env, jclass,
-                                                                         jobject provider) {
+JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_readArrowIPCBufferBegin(JNIEnv* env,
+                                                                         jclass,
+                                                                         jobject provider)
+{
   JNI_NULL_CHECK(env, provider, "null provider", 0);
   try {
     cudf::jni::auto_set_device(env);
     std::shared_ptr<cudf::jni::jni_arrow_input_stream> data_source(
-        new cudf::jni::jni_arrow_input_stream(env, provider));
+      new cudf::jni::jni_arrow_input_stream(env, provider));
     return ptr_as_jlong(new cudf::jni::native_arrow_ipc_reader_handle(data_source));
   }
   CATCH_STD(env, 0)
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readArrowIPCChunkToArrowTable(JNIEnv *env, jclass,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readArrowIPCChunkToArrowTable(JNIEnv* env,
+                                                                                jclass,
                                                                                 jlong j_state,
-                                                                                jint row_target) {
+                                                                                jint row_target)
+{
   JNI_NULL_CHECK(env, j_state, "null state", 0);
 
-  cudf::jni::native_arrow_ipc_reader_handle *state =
-      reinterpret_cast<cudf::jni::native_arrow_ipc_reader_handle *>(j_state);
+  cudf::jni::native_arrow_ipc_reader_handle* state =
+    reinterpret_cast<cudf::jni::native_arrow_ipc_reader_handle*>(j_state);
 
   try {
     cudf::jni::auto_set_device(env);
@@ -2385,10 +2721,12 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readArrowIPCChunkToArrowTable(
   CATCH_STD(env, 0)
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_closeArrowTable(JNIEnv *env, jclass,
-                                                                 jlong arrow_table_handle) {
-  std::shared_ptr<arrow::Table> *handle =
-      reinterpret_cast<std::shared_ptr<arrow::Table> *>(arrow_table_handle);
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_closeArrowTable(JNIEnv* env,
+                                                                 jclass,
+                                                                 jlong arrow_table_handle)
+{
+  std::shared_ptr<arrow::Table>* handle =
+    reinterpret_cast<std::shared_ptr<arrow::Table>*>(arrow_table_handle);
 
   try {
     cudf::jni::auto_set_device(env);
@@ -2398,11 +2736,12 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_closeArrowTable(JNIEnv *env, jc
 }
 
 JNIEXPORT jlongArray JNICALL
-Java_ai_rapids_cudf_Table_convertArrowTableToCudf(JNIEnv *env, jclass, jlong arrow_table_handle) {
+Java_ai_rapids_cudf_Table_convertArrowTableToCudf(JNIEnv* env, jclass, jlong arrow_table_handle)
+{
   JNI_NULL_CHECK(env, arrow_table_handle, "null arrow handle", 0);
 
-  std::shared_ptr<arrow::Table> *handle =
-      reinterpret_cast<std::shared_ptr<arrow::Table> *>(arrow_table_handle);
+  std::shared_ptr<arrow::Table>* handle =
+    reinterpret_cast<std::shared_ptr<arrow::Table>*>(arrow_table_handle);
 
   try {
     cudf::jni::auto_set_device(env);
@@ -2411,12 +2750,12 @@ Java_ai_rapids_cudf_Table_convertArrowTableToCudf(JNIEnv *env, jclass, jlong arr
   CATCH_STD(env, 0)
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_readArrowIPCEnd(JNIEnv *env, jclass,
-                                                                 jlong j_state) {
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_readArrowIPCEnd(JNIEnv* env, jclass, jlong j_state)
+{
   JNI_NULL_CHECK(env, j_state, "null state", );
 
-  cudf::jni::native_arrow_ipc_reader_handle *state =
-      reinterpret_cast<cudf::jni::native_arrow_ipc_reader_handle *>(j_state);
+  cudf::jni::native_arrow_ipc_reader_handle* state =
+    reinterpret_cast<cudf::jni::native_arrow_ipc_reader_handle*>(j_state);
   std::unique_ptr<cudf::jni::native_arrow_ipc_reader_handle> make_sure_we_delete(state);
   try {
     cudf::jni::auto_set_device(env);
@@ -2426,523 +2765,772 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Table_readArrowIPCEnd(JNIEnv *env, jc
 }
 
 JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftJoinGatherMaps(
-    JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jboolean compare_nulls_equal) {
+  JNIEnv* env, jclass, jlong j_left_keys, jlong j_right_keys, jboolean compare_nulls_equal)
+{
   return cudf::jni::join_gather_maps(
-      env, j_left_keys, j_right_keys, compare_nulls_equal,
-      [](cudf::table_view const &left, cudf::table_view const &right, cudf::null_equality nulleq) {
-        return cudf::left_join(left, right, nulleq);
-      });
+    env,
+    j_left_keys,
+    j_right_keys,
+    compare_nulls_equal,
+    [](cudf::table_view const& left, cudf::table_view const& right, cudf::null_equality nulleq) {
+      return cudf::left_join(left, right, nulleq);
+    });
 }
 
 JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftDistinctJoinGatherMap(
-    JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jboolean compare_nulls_equal) {
+  JNIEnv* env, jclass, jlong j_left_keys, jlong j_right_keys, jboolean compare_nulls_equal)
+{
   return cudf::jni::join_gather_single_map(
-      env, j_left_keys, j_right_keys, compare_nulls_equal,
-      [](cudf::table_view const &left, cudf::table_view const &right, cudf::null_equality nulleq) {
-        auto has_nulls = cudf::has_nested_nulls(left) || cudf::has_nested_nulls(right) ?
-                             cudf::nullable_join::YES :
-                             cudf::nullable_join::NO;
-        if (cudf::detail::has_nested_columns(right)) {
-          cudf::distinct_hash_join<cudf::has_nested::YES> hash(right, left, has_nulls, nulleq);
-          return hash.left_join();
-        } else {
-          cudf::distinct_hash_join<cudf::has_nested::NO> hash(right, left, has_nulls, nulleq);
-          return hash.left_join();
-        }
-      });
+    env,
+    j_left_keys,
+    j_right_keys,
+    compare_nulls_equal,
+    [](cudf::table_view const& left, cudf::table_view const& right, cudf::null_equality nulleq) {
+      auto has_nulls = cudf::has_nested_nulls(left) || cudf::has_nested_nulls(right)
+                         ? cudf::nullable_join::YES
+                         : cudf::nullable_join::NO;
+      if (cudf::detail::has_nested_columns(right)) {
+        cudf::distinct_hash_join<cudf::has_nested::YES> hash(right, left, has_nulls, nulleq);
+        return hash.left_join();
+      } else {
+        cudf::distinct_hash_join<cudf::has_nested::NO> hash(right, left, has_nulls, nulleq);
+        return hash.left_join();
+      }
+    });
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_leftJoinRowCount(JNIEnv *env, jclass,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_leftJoinRowCount(JNIEnv* env,
+                                                                   jclass,
                                                                    jlong j_left_table,
-                                                                   jlong j_right_hash_join) {
+                                                                   jlong j_right_hash_join)
+{
   JNI_NULL_CHECK(env, j_left_table, "left table is null", 0);
   JNI_NULL_CHECK(env, j_right_hash_join, "right hash join is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto left_table = reinterpret_cast<cudf::table_view const *>(j_left_table);
-    auto hash_join = reinterpret_cast<cudf::hash_join const *>(j_right_hash_join);
-    auto row_count = hash_join->left_join_size(*left_table);
+    auto left_table = reinterpret_cast<cudf::table_view const*>(j_left_table);
+    auto hash_join  = reinterpret_cast<cudf::hash_join const*>(j_right_hash_join);
+    auto row_count  = hash_join->left_join_size(*left_table);
     return static_cast<jlong>(row_count);
   }
   CATCH_STD(env, 0);
 }
 
 JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftHashJoinGatherMaps(
-    JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join) {
+  JNIEnv* env, jclass, jlong j_left_table, jlong j_right_hash_join)
+{
   return cudf::jni::hash_join_gather_maps(
-      env, j_left_table, j_right_hash_join,
-      [](cudf::table_view const &left, cudf::hash_join const &hash) {
-        return hash.left_join(left);
-      });
+    env,
+    j_left_table,
+    j_right_hash_join,
+    [](cudf::table_view const& left, cudf::hash_join const& hash) { return hash.left_join(left); });
 }
 
 JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftHashJoinGatherMapsWithCount(
-    JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join, jlong j_output_row_count) {
+  JNIEnv* env, jclass, jlong j_left_table, jlong j_right_hash_join, jlong j_output_row_count)
+{
   auto output_row_count = static_cast<std::size_t>(j_output_row_count);
   return cudf::jni::hash_join_gather_maps(
-      env, j_left_table, j_right_hash_join,
-      [output_row_count](cudf::table_view const &left, cudf::hash_join const &hash) {
-        return hash.left_join(left, output_row_count);
-      });
+    env,
+    j_left_table,
+    j_right_hash_join,
+    [output_row_count](cudf::table_view const& left, cudf::hash_join const& hash) {
+      return hash.left_join(left, output_row_count);
+    });
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_conditionalLeftJoinRowCount(JNIEnv *env, jclass,
-                                                                              jlong j_left_table,
-                                                                              jlong j_right_table,
-                                                                              jlong j_condition) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_conditionalLeftJoinRowCount(
+  JNIEnv* env, jclass, jlong j_left_table, jlong j_right_table, jlong j_condition)
+{
   JNI_NULL_CHECK(env, j_left_table, "left_table is null", 0);
   JNI_NULL_CHECK(env, j_right_table, "right_table is null", 0);
   JNI_NULL_CHECK(env, j_condition, "condition is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto left_table = reinterpret_cast<cudf::table_view const *>(j_left_table);
-    auto right_table = reinterpret_cast<cudf::table_view const *>(j_right_table);
-    auto condition = reinterpret_cast<cudf::jni::ast::compiled_expr const *>(j_condition);
-    auto row_count = cudf::conditional_left_join_size(*left_table, *right_table,
-                                                      condition->get_top_expression());
+    auto left_table  = reinterpret_cast<cudf::table_view const*>(j_left_table);
+    auto right_table = reinterpret_cast<cudf::table_view const*>(j_right_table);
+    auto condition   = reinterpret_cast<cudf::jni::ast::compiled_expr const*>(j_condition);
+    auto row_count =
+      cudf::conditional_left_join_size(*left_table, *right_table, condition->get_top_expression());
     return static_cast<jlong>(row_count);
   }
   CATCH_STD(env, 0);
 }
 
 JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_conditionalLeftJoinGatherMaps(
-    JNIEnv *env, jclass, jlong j_left_table, jlong j_right_table, jlong j_condition) {
-  return cudf::jni::cond_join_gather_maps(
-      env, j_left_table, j_right_table, j_condition,
-      [](cudf::table_view const &left, cudf::table_view const &right,
-         cudf::ast::expression const &cond_expr) {
-        return cudf::conditional_left_join(left, right, cond_expr);
-      });
+  JNIEnv* env, jclass, jlong j_left_table, jlong j_right_table, jlong j_condition)
+{
+  return cudf::jni::cond_join_gather_maps(env,
+                                          j_left_table,
+                                          j_right_table,
+                                          j_condition,
+                                          [](cudf::table_view const& left,
+                                             cudf::table_view const& right,
+                                             cudf::ast::expression const& cond_expr) {
+                                            return cudf::conditional_left_join(
+                                              left, right, cond_expr);
+                                          });
 }
 
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_conditionalLeftJoinGatherMapsWithCount(
-    JNIEnv *env, jclass, jlong j_left_table, jlong j_right_table, jlong j_condition,
-    jlong j_row_count) {
+JNIEXPORT jlongArray JNICALL
+Java_ai_rapids_cudf_Table_conditionalLeftJoinGatherMapsWithCount(JNIEnv* env,
+                                                                 jclass,
+                                                                 jlong j_left_table,
+                                                                 jlong j_right_table,
+                                                                 jlong j_condition,
+                                                                 jlong j_row_count)
+{
   auto row_count = static_cast<std::size_t>(j_row_count);
-  return cudf::jni::cond_join_gather_maps(
-      env, j_left_table, j_right_table, j_condition,
-      [row_count](cudf::table_view const &left, cudf::table_view const &right,
-                  cudf::ast::expression const &cond_expr) {
-        return cudf::conditional_left_join(left, right, cond_expr, row_count);
-      });
-}
-
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_mixedLeftJoinSize(
-    JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jlong j_left_condition,
-    jlong j_right_condition, jlong j_condition, jboolean j_nulls_equal) {
+  return cudf::jni::cond_join_gather_maps(env,
+                                          j_left_table,
+                                          j_right_table,
+                                          j_condition,
+                                          [row_count](cudf::table_view const& left,
+                                                      cudf::table_view const& right,
+                                                      cudf::ast::expression const& cond_expr) {
+                                            return cudf::conditional_left_join(
+                                              left, right, cond_expr, row_count);
+                                          });
+}
+
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_mixedLeftJoinSize(JNIEnv* env,
+                                                                         jclass,
+                                                                         jlong j_left_keys,
+                                                                         jlong j_right_keys,
+                                                                         jlong j_left_condition,
+                                                                         jlong j_right_condition,
+                                                                         jlong j_condition,
+                                                                         jboolean j_nulls_equal)
+{
   return cudf::jni::mixed_join_size(
-      env, j_left_keys, j_right_keys, j_left_condition, j_right_condition, j_condition,
-      j_nulls_equal,
-      [](cudf::table_view const &left_keys, cudf::table_view const &right_keys,
-         cudf::table_view const &left_condition, cudf::table_view const &right_condition,
-         cudf::ast::expression const &condition, cudf::null_equality nulls_equal) {
-        return cudf::mixed_left_join_size(left_keys, right_keys, left_condition, right_condition,
-                                          condition, nulls_equal);
-      });
+    env,
+    j_left_keys,
+    j_right_keys,
+    j_left_condition,
+    j_right_condition,
+    j_condition,
+    j_nulls_equal,
+    [](cudf::table_view const& left_keys,
+       cudf::table_view const& right_keys,
+       cudf::table_view const& left_condition,
+       cudf::table_view const& right_condition,
+       cudf::ast::expression const& condition,
+       cudf::null_equality nulls_equal) {
+      return cudf::mixed_left_join_size(
+        left_keys, right_keys, left_condition, right_condition, condition, nulls_equal);
+    });
 }
 
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_mixedLeftJoinGatherMaps(
-    JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jlong j_left_condition,
-    jlong j_right_condition, jlong j_condition, jboolean j_nulls_equal) {
+JNIEXPORT jlongArray JNICALL
+Java_ai_rapids_cudf_Table_mixedLeftJoinGatherMaps(JNIEnv* env,
+                                                  jclass,
+                                                  jlong j_left_keys,
+                                                  jlong j_right_keys,
+                                                  jlong j_left_condition,
+                                                  jlong j_right_condition,
+                                                  jlong j_condition,
+                                                  jboolean j_nulls_equal)
+{
   return cudf::jni::mixed_join_gather_maps(
-      env, j_left_keys, j_right_keys, j_left_condition, j_right_condition, j_condition,
-      j_nulls_equal,
-      [](cudf::table_view const &left_keys, cudf::table_view const &right_keys,
-         cudf::table_view const &left_condition, cudf::table_view const &right_condition,
-         cudf::ast::expression const &condition, cudf::null_equality nulls_equal) {
-        return cudf::mixed_left_join(left_keys, right_keys, left_condition, right_condition,
-                                     condition, nulls_equal);
-      });
+    env,
+    j_left_keys,
+    j_right_keys,
+    j_left_condition,
+    j_right_condition,
+    j_condition,
+    j_nulls_equal,
+    [](cudf::table_view const& left_keys,
+       cudf::table_view const& right_keys,
+       cudf::table_view const& left_condition,
+       cudf::table_view const& right_condition,
+       cudf::ast::expression const& condition,
+       cudf::null_equality nulls_equal) {
+      return cudf::mixed_left_join(
+        left_keys, right_keys, left_condition, right_condition, condition, nulls_equal);
+    });
 }
 
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_mixedLeftJoinGatherMapsWithSize(
-    JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jlong j_left_condition,
-    jlong j_right_condition, jlong j_condition, jboolean j_nulls_equal, jlong j_output_row_count,
-    jlong j_matches_view) {
+JNIEXPORT jlongArray JNICALL
+Java_ai_rapids_cudf_Table_mixedLeftJoinGatherMapsWithSize(JNIEnv* env,
+                                                          jclass,
+                                                          jlong j_left_keys,
+                                                          jlong j_right_keys,
+                                                          jlong j_left_condition,
+                                                          jlong j_right_condition,
+                                                          jlong j_condition,
+                                                          jboolean j_nulls_equal,
+                                                          jlong j_output_row_count,
+                                                          jlong j_matches_view)
+{
   auto size_info = cudf::jni::get_mixed_size_info(env, j_output_row_count, j_matches_view);
   return cudf::jni::mixed_join_gather_maps(
-      env, j_left_keys, j_right_keys, j_left_condition, j_right_condition, j_condition,
-      j_nulls_equal,
-      [&size_info](cudf::table_view const &left_keys, cudf::table_view const &right_keys,
-                   cudf::table_view const &left_condition, cudf::table_view const &right_condition,
-                   cudf::ast::expression const &condition, cudf::null_equality nulls_equal) {
-        return cudf::mixed_left_join(left_keys, right_keys, left_condition, right_condition,
-                                     condition, nulls_equal, size_info);
-      });
+    env,
+    j_left_keys,
+    j_right_keys,
+    j_left_condition,
+    j_right_condition,
+    j_condition,
+    j_nulls_equal,
+    [&size_info](cudf::table_view const& left_keys,
+                 cudf::table_view const& right_keys,
+                 cudf::table_view const& left_condition,
+                 cudf::table_view const& right_condition,
+                 cudf::ast::expression const& condition,
+                 cudf::null_equality nulls_equal) {
+      return cudf::mixed_left_join(
+        left_keys, right_keys, left_condition, right_condition, condition, nulls_equal, size_info);
+    });
 }
 
 JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_innerJoinGatherMaps(
-    JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jboolean compare_nulls_equal) {
+  JNIEnv* env, jclass, jlong j_left_keys, jlong j_right_keys, jboolean compare_nulls_equal)
+{
   return cudf::jni::join_gather_maps(
-      env, j_left_keys, j_right_keys, compare_nulls_equal,
-      [](cudf::table_view const &left, cudf::table_view const &right, cudf::null_equality nulleq) {
-        return cudf::inner_join(left, right, nulleq);
-      });
+    env,
+    j_left_keys,
+    j_right_keys,
+    compare_nulls_equal,
+    [](cudf::table_view const& left, cudf::table_view const& right, cudf::null_equality nulleq) {
+      return cudf::inner_join(left, right, nulleq);
+    });
 }
 
 JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_innerDistinctJoinGatherMaps(
-    JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jboolean compare_nulls_equal) {
+  JNIEnv* env, jclass, jlong j_left_keys, jlong j_right_keys, jboolean compare_nulls_equal)
+{
   return cudf::jni::join_gather_maps(
-      env, j_left_keys, j_right_keys, compare_nulls_equal,
-      [](cudf::table_view const &left, cudf::table_view const &right, cudf::null_equality nulleq) {
-        auto has_nulls = cudf::has_nested_nulls(left) || cudf::has_nested_nulls(right) ?
-                             cudf::nullable_join::YES :
-                             cudf::nullable_join::NO;
-        std::pair<std::unique_ptr<rmm::device_uvector<cudf::size_type>>,
-                  std::unique_ptr<rmm::device_uvector<cudf::size_type>>>
-            maps;
-        if (cudf::detail::has_nested_columns(right)) {
-          cudf::distinct_hash_join<cudf::has_nested::YES> hash(right, left, has_nulls, nulleq);
-          maps = hash.inner_join();
-        } else {
-          cudf::distinct_hash_join<cudf::has_nested::NO> hash(right, left, has_nulls, nulleq);
-          maps = hash.inner_join();
-        }
-        // Unique join returns {right map, left map} but all the other joins
-        // return {left map, right map}. Swap here to make it consistent.
-        return std::make_pair(std::move(maps.second), std::move(maps.first));
-      });
+    env,
+    j_left_keys,
+    j_right_keys,
+    compare_nulls_equal,
+    [](cudf::table_view const& left, cudf::table_view const& right, cudf::null_equality nulleq) {
+      auto has_nulls = cudf::has_nested_nulls(left) || cudf::has_nested_nulls(right)
+                         ? cudf::nullable_join::YES
+                         : cudf::nullable_join::NO;
+      std::pair<std::unique_ptr<rmm::device_uvector<cudf::size_type>>,
+                std::unique_ptr<rmm::device_uvector<cudf::size_type>>>
+        maps;
+      if (cudf::detail::has_nested_columns(right)) {
+        cudf::distinct_hash_join<cudf::has_nested::YES> hash(right, left, has_nulls, nulleq);
+        maps = hash.inner_join();
+      } else {
+        cudf::distinct_hash_join<cudf::has_nested::NO> hash(right, left, has_nulls, nulleq);
+        maps = hash.inner_join();
+      }
+      // Unique join returns {right map, left map} but all the other joins
+      // return {left map, right map}. Swap here to make it consistent.
+      return std::make_pair(std::move(maps.second), std::move(maps.first));
+    });
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_innerJoinRowCount(JNIEnv *env, jclass,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_innerJoinRowCount(JNIEnv* env,
+                                                                    jclass,
                                                                     jlong j_left_table,
-                                                                    jlong j_right_hash_join) {
+                                                                    jlong j_right_hash_join)
+{
   JNI_NULL_CHECK(env, j_left_table, "left table is null", 0);
   JNI_NULL_CHECK(env, j_right_hash_join, "right hash join is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto left_table = reinterpret_cast<cudf::table_view const *>(j_left_table);
-    auto hash_join = reinterpret_cast<cudf::hash_join const *>(j_right_hash_join);
-    auto row_count = hash_join->inner_join_size(*left_table);
+    auto left_table = reinterpret_cast<cudf::table_view const*>(j_left_table);
+    auto hash_join  = reinterpret_cast<cudf::hash_join const*>(j_right_hash_join);
+    auto row_count  = hash_join->inner_join_size(*left_table);
     return static_cast<jlong>(row_count);
   }
   CATCH_STD(env, 0);
 }
 
 JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_innerHashJoinGatherMaps(
-    JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join) {
+  JNIEnv* env, jclass, jlong j_left_table, jlong j_right_hash_join)
+{
   return cudf::jni::hash_join_gather_maps(
-      env, j_left_table, j_right_hash_join,
-      [](cudf::table_view const &left, cudf::hash_join const &hash) {
-        return hash.inner_join(left);
-      });
+    env,
+    j_left_table,
+    j_right_hash_join,
+    [](cudf::table_view const& left, cudf::hash_join const& hash) {
+      return hash.inner_join(left);
+    });
 }
 
 JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_innerHashJoinGatherMapsWithCount(
-    JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join, jlong j_output_row_count) {
+  JNIEnv* env, jclass, jlong j_left_table, jlong j_right_hash_join, jlong j_output_row_count)
+{
   auto output_row_count = static_cast<std::size_t>(j_output_row_count);
   return cudf::jni::hash_join_gather_maps(
-      env, j_left_table, j_right_hash_join,
-      [output_row_count](cudf::table_view const &left, cudf::hash_join const &hash) {
-        return hash.inner_join(left, output_row_count);
-      });
+    env,
+    j_left_table,
+    j_right_hash_join,
+    [output_row_count](cudf::table_view const& left, cudf::hash_join const& hash) {
+      return hash.inner_join(left, output_row_count);
+    });
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_conditionalInnerJoinRowCount(JNIEnv *env, jclass,
-                                                                               jlong j_left_table,
-                                                                               jlong j_right_table,
-                                                                               jlong j_condition) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_conditionalInnerJoinRowCount(
+  JNIEnv* env, jclass, jlong j_left_table, jlong j_right_table, jlong j_condition)
+{
   JNI_NULL_CHECK(env, j_left_table, "left_table is null", 0);
   JNI_NULL_CHECK(env, j_right_table, "right_table is null", 0);
   JNI_NULL_CHECK(env, j_condition, "condition is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto left_table = reinterpret_cast<cudf::table_view const *>(j_left_table);
-    auto right_table = reinterpret_cast<cudf::table_view const *>(j_right_table);
-    auto condition = reinterpret_cast<cudf::jni::ast::compiled_expr const *>(j_condition);
-    auto row_count = cudf::conditional_inner_join_size(*left_table, *right_table,
-                                                       condition->get_top_expression());
+    auto left_table  = reinterpret_cast<cudf::table_view const*>(j_left_table);
+    auto right_table = reinterpret_cast<cudf::table_view const*>(j_right_table);
+    auto condition   = reinterpret_cast<cudf::jni::ast::compiled_expr const*>(j_condition);
+    auto row_count =
+      cudf::conditional_inner_join_size(*left_table, *right_table, condition->get_top_expression());
     return static_cast<jlong>(row_count);
   }
   CATCH_STD(env, 0);
 }
 
 JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_conditionalInnerJoinGatherMaps(
-    JNIEnv *env, jclass, jlong j_left_table, jlong j_right_table, jlong j_condition) {
-  return cudf::jni::cond_join_gather_maps(
-      env, j_left_table, j_right_table, j_condition,
-      [](cudf::table_view const &left, cudf::table_view const &right,
-         cudf::ast::expression const &cond_expr) {
-        return cudf::conditional_inner_join(left, right, cond_expr);
-      });
+  JNIEnv* env, jclass, jlong j_left_table, jlong j_right_table, jlong j_condition)
+{
+  return cudf::jni::cond_join_gather_maps(env,
+                                          j_left_table,
+                                          j_right_table,
+                                          j_condition,
+                                          [](cudf::table_view const& left,
+                                             cudf::table_view const& right,
+                                             cudf::ast::expression const& cond_expr) {
+                                            return cudf::conditional_inner_join(
+                                              left, right, cond_expr);
+                                          });
 }
 
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_conditionalInnerJoinGatherMapsWithCount(
-    JNIEnv *env, jclass, jlong j_left_table, jlong j_right_table, jlong j_condition,
-    jlong j_row_count) {
+JNIEXPORT jlongArray JNICALL
+Java_ai_rapids_cudf_Table_conditionalInnerJoinGatherMapsWithCount(JNIEnv* env,
+                                                                  jclass,
+                                                                  jlong j_left_table,
+                                                                  jlong j_right_table,
+                                                                  jlong j_condition,
+                                                                  jlong j_row_count)
+{
   auto row_count = static_cast<std::size_t>(j_row_count);
-  return cudf::jni::cond_join_gather_maps(
-      env, j_left_table, j_right_table, j_condition,
-      [row_count](cudf::table_view const &left, cudf::table_view const &right,
-                  cudf::ast::expression const &cond_expr) {
-        return cudf::conditional_inner_join(left, right, cond_expr, row_count);
-      });
-}
-
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_mixedInnerJoinSize(
-    JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jlong j_left_condition,
-    jlong j_right_condition, jlong j_condition, jboolean j_nulls_equal) {
+  return cudf::jni::cond_join_gather_maps(env,
+                                          j_left_table,
+                                          j_right_table,
+                                          j_condition,
+                                          [row_count](cudf::table_view const& left,
+                                                      cudf::table_view const& right,
+                                                      cudf::ast::expression const& cond_expr) {
+                                            return cudf::conditional_inner_join(
+                                              left, right, cond_expr, row_count);
+                                          });
+}
+
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_mixedInnerJoinSize(JNIEnv* env,
+                                                                          jclass,
+                                                                          jlong j_left_keys,
+                                                                          jlong j_right_keys,
+                                                                          jlong j_left_condition,
+                                                                          jlong j_right_condition,
+                                                                          jlong j_condition,
+                                                                          jboolean j_nulls_equal)
+{
   return cudf::jni::mixed_join_size(
-      env, j_left_keys, j_right_keys, j_left_condition, j_right_condition, j_condition,
-      j_nulls_equal,
-      [](cudf::table_view const &left_keys, cudf::table_view const &right_keys,
-         cudf::table_view const &left_condition, cudf::table_view const &right_condition,
-         cudf::ast::expression const &condition, cudf::null_equality nulls_equal) {
-        return cudf::mixed_inner_join_size(left_keys, right_keys, left_condition, right_condition,
-                                           condition, nulls_equal);
-      });
+    env,
+    j_left_keys,
+    j_right_keys,
+    j_left_condition,
+    j_right_condition,
+    j_condition,
+    j_nulls_equal,
+    [](cudf::table_view const& left_keys,
+       cudf::table_view const& right_keys,
+       cudf::table_view const& left_condition,
+       cudf::table_view const& right_condition,
+       cudf::ast::expression const& condition,
+       cudf::null_equality nulls_equal) {
+      return cudf::mixed_inner_join_size(
+        left_keys, right_keys, left_condition, right_condition, condition, nulls_equal);
+    });
 }
 
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_mixedInnerJoinGatherMaps(
-    JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jlong j_left_condition,
-    jlong j_right_condition, jlong j_condition, jboolean j_nulls_equal) {
+JNIEXPORT jlongArray JNICALL
+Java_ai_rapids_cudf_Table_mixedInnerJoinGatherMaps(JNIEnv* env,
+                                                   jclass,
+                                                   jlong j_left_keys,
+                                                   jlong j_right_keys,
+                                                   jlong j_left_condition,
+                                                   jlong j_right_condition,
+                                                   jlong j_condition,
+                                                   jboolean j_nulls_equal)
+{
   return cudf::jni::mixed_join_gather_maps(
-      env, j_left_keys, j_right_keys, j_left_condition, j_right_condition, j_condition,
-      j_nulls_equal,
-      [](cudf::table_view const &left_keys, cudf::table_view const &right_keys,
-         cudf::table_view const &left_condition, cudf::table_view const &right_condition,
-         cudf::ast::expression const &condition, cudf::null_equality nulls_equal) {
-        return cudf::mixed_inner_join(left_keys, right_keys, left_condition, right_condition,
-                                      condition, nulls_equal);
-      });
+    env,
+    j_left_keys,
+    j_right_keys,
+    j_left_condition,
+    j_right_condition,
+    j_condition,
+    j_nulls_equal,
+    [](cudf::table_view const& left_keys,
+       cudf::table_view const& right_keys,
+       cudf::table_view const& left_condition,
+       cudf::table_view const& right_condition,
+       cudf::ast::expression const& condition,
+       cudf::null_equality nulls_equal) {
+      return cudf::mixed_inner_join(
+        left_keys, right_keys, left_condition, right_condition, condition, nulls_equal);
+    });
 }
 
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_mixedInnerJoinGatherMapsWithSize(
-    JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jlong j_left_condition,
-    jlong j_right_condition, jlong j_condition, jboolean j_nulls_equal, jlong j_output_row_count,
-    jlong j_matches_view) {
+JNIEXPORT jlongArray JNICALL
+Java_ai_rapids_cudf_Table_mixedInnerJoinGatherMapsWithSize(JNIEnv* env,
+                                                           jclass,
+                                                           jlong j_left_keys,
+                                                           jlong j_right_keys,
+                                                           jlong j_left_condition,
+                                                           jlong j_right_condition,
+                                                           jlong j_condition,
+                                                           jboolean j_nulls_equal,
+                                                           jlong j_output_row_count,
+                                                           jlong j_matches_view)
+{
   auto size_info = cudf::jni::get_mixed_size_info(env, j_output_row_count, j_matches_view);
   return cudf::jni::mixed_join_gather_maps(
-      env, j_left_keys, j_right_keys, j_left_condition, j_right_condition, j_condition,
-      j_nulls_equal,
-      [&size_info](cudf::table_view const &left_keys, cudf::table_view const &right_keys,
-                   cudf::table_view const &left_condition, cudf::table_view const &right_condition,
-                   cudf::ast::expression const &condition, cudf::null_equality nulls_equal) {
-        return cudf::mixed_inner_join(left_keys, right_keys, left_condition, right_condition,
-                                      condition, nulls_equal, size_info);
-      });
+    env,
+    j_left_keys,
+    j_right_keys,
+    j_left_condition,
+    j_right_condition,
+    j_condition,
+    j_nulls_equal,
+    [&size_info](cudf::table_view const& left_keys,
+                 cudf::table_view const& right_keys,
+                 cudf::table_view const& left_condition,
+                 cudf::table_view const& right_condition,
+                 cudf::ast::expression const& condition,
+                 cudf::null_equality nulls_equal) {
+      return cudf::mixed_inner_join(
+        left_keys, right_keys, left_condition, right_condition, condition, nulls_equal, size_info);
+    });
 }
 
 JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_fullJoinGatherMaps(
-    JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jboolean compare_nulls_equal) {
+  JNIEnv* env, jclass, jlong j_left_keys, jlong j_right_keys, jboolean compare_nulls_equal)
+{
   return cudf::jni::join_gather_maps(
-      env, j_left_keys, j_right_keys, compare_nulls_equal,
-      [](cudf::table_view const &left, cudf::table_view const &right, cudf::null_equality nulleq) {
-        return cudf::full_join(left, right, nulleq);
-      });
+    env,
+    j_left_keys,
+    j_right_keys,
+    compare_nulls_equal,
+    [](cudf::table_view const& left, cudf::table_view const& right, cudf::null_equality nulleq) {
+      return cudf::full_join(left, right, nulleq);
+    });
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_fullJoinRowCount(JNIEnv *env, jclass,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_fullJoinRowCount(JNIEnv* env,
+                                                                   jclass,
                                                                    jlong j_left_table,
-                                                                   jlong j_right_hash_join) {
+                                                                   jlong j_right_hash_join)
+{
   JNI_NULL_CHECK(env, j_left_table, "left table is null", 0);
   JNI_NULL_CHECK(env, j_right_hash_join, "right hash join is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto left_table = reinterpret_cast<cudf::table_view const *>(j_left_table);
-    auto hash_join = reinterpret_cast<cudf::hash_join const *>(j_right_hash_join);
-    auto row_count = hash_join->full_join_size(*left_table);
+    auto left_table = reinterpret_cast<cudf::table_view const*>(j_left_table);
+    auto hash_join  = reinterpret_cast<cudf::hash_join const*>(j_right_hash_join);
+    auto row_count  = hash_join->full_join_size(*left_table);
     return static_cast<jlong>(row_count);
   }
   CATCH_STD(env, 0);
 }
 
 JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_fullHashJoinGatherMaps(
-    JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join) {
+  JNIEnv* env, jclass, jlong j_left_table, jlong j_right_hash_join)
+{
   return cudf::jni::hash_join_gather_maps(
-      env, j_left_table, j_right_hash_join,
-      [](cudf::table_view const &left, cudf::hash_join const &hash) {
-        return hash.full_join(left);
-      });
+    env,
+    j_left_table,
+    j_right_hash_join,
+    [](cudf::table_view const& left, cudf::hash_join const& hash) { return hash.full_join(left); });
 }
 
 JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_fullHashJoinGatherMapsWithCount(
-    JNIEnv *env, jclass, jlong j_left_table, jlong j_right_hash_join, jlong j_output_row_count) {
+  JNIEnv* env, jclass, jlong j_left_table, jlong j_right_hash_join, jlong j_output_row_count)
+{
   auto output_row_count = static_cast<std::size_t>(j_output_row_count);
   return cudf::jni::hash_join_gather_maps(
-      env, j_left_table, j_right_hash_join,
-      [output_row_count](cudf::table_view const &left, cudf::hash_join const &hash) {
-        return hash.full_join(left, output_row_count);
-      });
+    env,
+    j_left_table,
+    j_right_hash_join,
+    [output_row_count](cudf::table_view const& left, cudf::hash_join const& hash) {
+      return hash.full_join(left, output_row_count);
+    });
 }
 
 JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_conditionalFullJoinGatherMaps(
-    JNIEnv *env, jclass, jlong j_left_table, jlong j_right_table, jlong j_condition) {
-  return cudf::jni::cond_join_gather_maps(
-      env, j_left_table, j_right_table, j_condition,
-      [](cudf::table_view const &left, cudf::table_view const &right,
-         cudf::ast::expression const &cond_expr) {
-        return cudf::conditional_full_join(left, right, cond_expr);
-      });
+  JNIEnv* env, jclass, jlong j_left_table, jlong j_right_table, jlong j_condition)
+{
+  return cudf::jni::cond_join_gather_maps(env,
+                                          j_left_table,
+                                          j_right_table,
+                                          j_condition,
+                                          [](cudf::table_view const& left,
+                                             cudf::table_view const& right,
+                                             cudf::ast::expression const& cond_expr) {
+                                            return cudf::conditional_full_join(
+                                              left, right, cond_expr);
+                                          });
 }
 
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_mixedFullJoinGatherMaps(
-    JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jlong j_left_condition,
-    jlong j_right_condition, jlong j_condition, jboolean j_nulls_equal) {
+JNIEXPORT jlongArray JNICALL
+Java_ai_rapids_cudf_Table_mixedFullJoinGatherMaps(JNIEnv* env,
+                                                  jclass,
+                                                  jlong j_left_keys,
+                                                  jlong j_right_keys,
+                                                  jlong j_left_condition,
+                                                  jlong j_right_condition,
+                                                  jlong j_condition,
+                                                  jboolean j_nulls_equal)
+{
   return cudf::jni::mixed_join_gather_maps(
-      env, j_left_keys, j_right_keys, j_left_condition, j_right_condition, j_condition,
-      j_nulls_equal,
-      [](cudf::table_view const &left_keys, cudf::table_view const &right_keys,
-         cudf::table_view const &left_condition, cudf::table_view const &right_condition,
-         cudf::ast::expression const &condition, cudf::null_equality nulls_equal) {
-        return cudf::mixed_full_join(left_keys, right_keys, left_condition, right_condition,
-                                     condition, nulls_equal);
-      });
+    env,
+    j_left_keys,
+    j_right_keys,
+    j_left_condition,
+    j_right_condition,
+    j_condition,
+    j_nulls_equal,
+    [](cudf::table_view const& left_keys,
+       cudf::table_view const& right_keys,
+       cudf::table_view const& left_condition,
+       cudf::table_view const& right_condition,
+       cudf::ast::expression const& condition,
+       cudf::null_equality nulls_equal) {
+      return cudf::mixed_full_join(
+        left_keys, right_keys, left_condition, right_condition, condition, nulls_equal);
+    });
 }
 
 JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftSemiJoinGatherMap(
-    JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jboolean compare_nulls_equal) {
+  JNIEnv* env, jclass, jlong j_left_keys, jlong j_right_keys, jboolean compare_nulls_equal)
+{
   return cudf::jni::join_gather_single_map(
-      env, j_left_keys, j_right_keys, compare_nulls_equal,
-      [](cudf::table_view const &left, cudf::table_view const &right, cudf::null_equality nulleq) {
-        return cudf::left_semi_join(left, right, nulleq);
-      });
+    env,
+    j_left_keys,
+    j_right_keys,
+    compare_nulls_equal,
+    [](cudf::table_view const& left, cudf::table_view const& right, cudf::null_equality nulleq) {
+      return cudf::left_semi_join(left, right, nulleq);
+    });
 }
 
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_conditionalLeftSemiJoinRowCount(
-    JNIEnv *env, jclass, jlong j_left_table, jlong j_right_table, jlong j_condition) {
+  JNIEnv* env, jclass, jlong j_left_table, jlong j_right_table, jlong j_condition)
+{
   JNI_NULL_CHECK(env, j_left_table, "left_table is null", 0);
   JNI_NULL_CHECK(env, j_right_table, "right_table is null", 0);
   JNI_NULL_CHECK(env, j_condition, "condition is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto left_table = reinterpret_cast<cudf::table_view const *>(j_left_table);
-    auto right_table = reinterpret_cast<cudf::table_view const *>(j_right_table);
-    auto condition = reinterpret_cast<cudf::jni::ast::compiled_expr const *>(j_condition);
-    auto row_count = cudf::conditional_left_semi_join_size(*left_table, *right_table,
-                                                           condition->get_top_expression());
+    auto left_table  = reinterpret_cast<cudf::table_view const*>(j_left_table);
+    auto right_table = reinterpret_cast<cudf::table_view const*>(j_right_table);
+    auto condition   = reinterpret_cast<cudf::jni::ast::compiled_expr const*>(j_condition);
+    auto row_count   = cudf::conditional_left_semi_join_size(
+      *left_table, *right_table, condition->get_top_expression());
     return static_cast<jlong>(row_count);
   }
   CATCH_STD(env, 0);
 }
 
 JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_conditionalLeftSemiJoinGatherMap(
-    JNIEnv *env, jclass, jlong j_left_table, jlong j_right_table, jlong j_condition) {
-  return cudf::jni::cond_join_gather_single_map(
-      env, j_left_table, j_right_table, j_condition,
-      [](cudf::table_view const &left, cudf::table_view const &right,
-         cudf::ast::expression const &cond_expr) {
-        return cudf::conditional_left_semi_join(left, right, cond_expr);
-      });
+  JNIEnv* env, jclass, jlong j_left_table, jlong j_right_table, jlong j_condition)
+{
+  return cudf::jni::cond_join_gather_single_map(env,
+                                                j_left_table,
+                                                j_right_table,
+                                                j_condition,
+                                                [](cudf::table_view const& left,
+                                                   cudf::table_view const& right,
+                                                   cudf::ast::expression const& cond_expr) {
+                                                  return cudf::conditional_left_semi_join(
+                                                    left, right, cond_expr);
+                                                });
 }
 
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_conditionalLeftSemiJoinGatherMapWithCount(
-    JNIEnv *env, jclass, jlong j_left_table, jlong j_right_table, jlong j_condition,
-    jlong j_row_count) {
+JNIEXPORT jlongArray JNICALL
+Java_ai_rapids_cudf_Table_conditionalLeftSemiJoinGatherMapWithCount(JNIEnv* env,
+                                                                    jclass,
+                                                                    jlong j_left_table,
+                                                                    jlong j_right_table,
+                                                                    jlong j_condition,
+                                                                    jlong j_row_count)
+{
   auto row_count = static_cast<std::size_t>(j_row_count);
   return cudf::jni::cond_join_gather_single_map(
-      env, j_left_table, j_right_table, j_condition,
-      [row_count](cudf::table_view const &left, cudf::table_view const &right,
-                  cudf::ast::expression const &cond_expr) {
-        return cudf::conditional_left_semi_join(left, right, cond_expr, row_count);
-      });
+    env,
+    j_left_table,
+    j_right_table,
+    j_condition,
+    [row_count](cudf::table_view const& left,
+                cudf::table_view const& right,
+                cudf::ast::expression const& cond_expr) {
+      return cudf::conditional_left_semi_join(left, right, cond_expr, row_count);
+    });
 }
 
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_mixedLeftSemiJoinGatherMap(
-    JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jlong j_left_condition,
-    jlong j_right_condition, jlong j_condition, jboolean j_nulls_equal) {
+JNIEXPORT jlongArray JNICALL
+Java_ai_rapids_cudf_Table_mixedLeftSemiJoinGatherMap(JNIEnv* env,
+                                                     jclass,
+                                                     jlong j_left_keys,
+                                                     jlong j_right_keys,
+                                                     jlong j_left_condition,
+                                                     jlong j_right_condition,
+                                                     jlong j_condition,
+                                                     jboolean j_nulls_equal)
+{
   return cudf::jni::mixed_join_gather_single_map(
-      env, j_left_keys, j_right_keys, j_left_condition, j_right_condition, j_condition,
-      j_nulls_equal,
-      [](cudf::table_view const &left_keys, cudf::table_view const &right_keys,
-         cudf::table_view const &left_condition, cudf::table_view const &right_condition,
-         cudf::ast::expression const &condition, cudf::null_equality nulls_equal) {
-        return cudf::mixed_left_semi_join(left_keys, right_keys, left_condition, right_condition,
-                                          condition, nulls_equal);
-      });
+    env,
+    j_left_keys,
+    j_right_keys,
+    j_left_condition,
+    j_right_condition,
+    j_condition,
+    j_nulls_equal,
+    [](cudf::table_view const& left_keys,
+       cudf::table_view const& right_keys,
+       cudf::table_view const& left_condition,
+       cudf::table_view const& right_condition,
+       cudf::ast::expression const& condition,
+       cudf::null_equality nulls_equal) {
+      return cudf::mixed_left_semi_join(
+        left_keys, right_keys, left_condition, right_condition, condition, nulls_equal);
+    });
 }
 
 JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftAntiJoinGatherMap(
-    JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jboolean compare_nulls_equal) {
+  JNIEnv* env, jclass, jlong j_left_keys, jlong j_right_keys, jboolean compare_nulls_equal)
+{
   return cudf::jni::join_gather_single_map(
-      env, j_left_keys, j_right_keys, compare_nulls_equal,
-      [](cudf::table_view const &left, cudf::table_view const &right, cudf::null_equality nulleq) {
-        return cudf::left_anti_join(left, right, nulleq);
-      });
+    env,
+    j_left_keys,
+    j_right_keys,
+    compare_nulls_equal,
+    [](cudf::table_view const& left, cudf::table_view const& right, cudf::null_equality nulleq) {
+      return cudf::left_anti_join(left, right, nulleq);
+    });
 }
 
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_conditionalLeftAntiJoinRowCount(
-    JNIEnv *env, jclass, jlong j_left_table, jlong j_right_table, jlong j_condition) {
+  JNIEnv* env, jclass, jlong j_left_table, jlong j_right_table, jlong j_condition)
+{
   JNI_NULL_CHECK(env, j_left_table, "left_table is null", 0);
   JNI_NULL_CHECK(env, j_right_table, "right_table is null", 0);
   JNI_NULL_CHECK(env, j_condition, "condition is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto left_table = reinterpret_cast<cudf::table_view const *>(j_left_table);
-    auto right_table = reinterpret_cast<cudf::table_view const *>(j_right_table);
-    auto condition = reinterpret_cast<cudf::jni::ast::compiled_expr const *>(j_condition);
-    auto row_count = cudf::conditional_left_anti_join_size(*left_table, *right_table,
-                                                           condition->get_top_expression());
+    auto left_table  = reinterpret_cast<cudf::table_view const*>(j_left_table);
+    auto right_table = reinterpret_cast<cudf::table_view const*>(j_right_table);
+    auto condition   = reinterpret_cast<cudf::jni::ast::compiled_expr const*>(j_condition);
+    auto row_count   = cudf::conditional_left_anti_join_size(
+      *left_table, *right_table, condition->get_top_expression());
     return static_cast<jlong>(row_count);
   }
   CATCH_STD(env, 0);
 }
 
 JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_conditionalLeftAntiJoinGatherMap(
-    JNIEnv *env, jclass, jlong j_left_table, jlong j_right_table, jlong j_condition) {
-  return cudf::jni::cond_join_gather_single_map(
-      env, j_left_table, j_right_table, j_condition,
-      [](cudf::table_view const &left, cudf::table_view const &right,
-         cudf::ast::expression const &cond_expr) {
-        return cudf::conditional_left_anti_join(left, right, cond_expr);
-      });
+  JNIEnv* env, jclass, jlong j_left_table, jlong j_right_table, jlong j_condition)
+{
+  return cudf::jni::cond_join_gather_single_map(env,
+                                                j_left_table,
+                                                j_right_table,
+                                                j_condition,
+                                                [](cudf::table_view const& left,
+                                                   cudf::table_view const& right,
+                                                   cudf::ast::expression const& cond_expr) {
+                                                  return cudf::conditional_left_anti_join(
+                                                    left, right, cond_expr);
+                                                });
 }
 
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_conditionalLeftAntiJoinGatherMapWithCount(
-    JNIEnv *env, jclass, jlong j_left_table, jlong j_right_table, jlong j_condition,
-    jlong j_row_count) {
+JNIEXPORT jlongArray JNICALL
+Java_ai_rapids_cudf_Table_conditionalLeftAntiJoinGatherMapWithCount(JNIEnv* env,
+                                                                    jclass,
+                                                                    jlong j_left_table,
+                                                                    jlong j_right_table,
+                                                                    jlong j_condition,
+                                                                    jlong j_row_count)
+{
   auto row_count = static_cast<std::size_t>(j_row_count);
   return cudf::jni::cond_join_gather_single_map(
-      env, j_left_table, j_right_table, j_condition,
-      [row_count](cudf::table_view const &left, cudf::table_view const &right,
-                  cudf::ast::expression const &cond_expr) {
-        return cudf::conditional_left_anti_join(left, right, cond_expr, row_count);
-      });
+    env,
+    j_left_table,
+    j_right_table,
+    j_condition,
+    [row_count](cudf::table_view const& left,
+                cudf::table_view const& right,
+                cudf::ast::expression const& cond_expr) {
+      return cudf::conditional_left_anti_join(left, right, cond_expr, row_count);
+    });
 }
 
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_mixedLeftAntiJoinGatherMap(
-    JNIEnv *env, jclass, jlong j_left_keys, jlong j_right_keys, jlong j_left_condition,
-    jlong j_right_condition, jlong j_condition, jboolean j_nulls_equal) {
+JNIEXPORT jlongArray JNICALL
+Java_ai_rapids_cudf_Table_mixedLeftAntiJoinGatherMap(JNIEnv* env,
+                                                     jclass,
+                                                     jlong j_left_keys,
+                                                     jlong j_right_keys,
+                                                     jlong j_left_condition,
+                                                     jlong j_right_condition,
+                                                     jlong j_condition,
+                                                     jboolean j_nulls_equal)
+{
   return cudf::jni::mixed_join_gather_single_map(
-      env, j_left_keys, j_right_keys, j_left_condition, j_right_condition, j_condition,
-      j_nulls_equal,
-      [](cudf::table_view const &left_keys, cudf::table_view const &right_keys,
-         cudf::table_view const &left_condition, cudf::table_view const &right_condition,
-         cudf::ast::expression const &condition, cudf::null_equality nulls_equal) {
-        return cudf::mixed_left_anti_join(left_keys, right_keys, left_condition, right_condition,
-                                          condition, nulls_equal);
-      });
-}
-
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_crossJoin(JNIEnv *env, jclass,
+    env,
+    j_left_keys,
+    j_right_keys,
+    j_left_condition,
+    j_right_condition,
+    j_condition,
+    j_nulls_equal,
+    [](cudf::table_view const& left_keys,
+       cudf::table_view const& right_keys,
+       cudf::table_view const& left_condition,
+       cudf::table_view const& right_condition,
+       cudf::ast::expression const& condition,
+       cudf::null_equality nulls_equal) {
+      return cudf::mixed_left_anti_join(
+        left_keys, right_keys, left_condition, right_condition, condition, nulls_equal);
+    });
+}
+
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_crossJoin(JNIEnv* env,
+                                                                 jclass,
                                                                  jlong left_table,
-                                                                 jlong right_table) {
+                                                                 jlong right_table)
+{
   JNI_NULL_CHECK(env, left_table, "left_table is null", NULL);
   JNI_NULL_CHECK(env, right_table, "right_table is null", NULL);
 
   try {
     cudf::jni::auto_set_device(env);
-    auto const left = reinterpret_cast<cudf::table_view const *>(left_table);
-    auto const right = reinterpret_cast<cudf::table_view const *>(right_table);
+    auto const left  = reinterpret_cast<cudf::table_view const*>(left_table);
+    auto const right = reinterpret_cast<cudf::table_view const*>(right_table);
     return convert_table_for_return(env, cudf::cross_join(*left, *right));
   }
   CATCH_STD(env, NULL);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_interleaveColumns(JNIEnv *env, jclass,
-                                                                    jlongArray j_cudf_table_view) {
-
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_interleaveColumns(JNIEnv* env,
+                                                                    jclass,
+                                                                    jlongArray j_cudf_table_view)
+{
   JNI_NULL_CHECK(env, j_cudf_table_view, "table is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    cudf::table_view *table_view = reinterpret_cast<cudf::table_view *>(j_cudf_table_view);
+    cudf::table_view* table_view = reinterpret_cast<cudf::table_view*>(j_cudf_table_view);
     return release_as_jlong(cudf::interleave_columns(*table_view));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_concatenate(JNIEnv *env, jclass,
-                                                                   jlongArray table_handles) {
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_concatenate(JNIEnv* env,
+                                                                   jclass,
+                                                                   jlongArray table_handles)
+{
   JNI_NULL_CHECK(env, table_handles, "input tables are null", NULL);
   try {
     cudf::jni::auto_set_device(env);
@@ -2953,12 +3541,13 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_concatenate(JNIEnv *env,
   CATCH_STD(env, NULL);
 }
 
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_partition(JNIEnv *env, jclass,
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_partition(JNIEnv* env,
+                                                                 jclass,
                                                                  jlong input_table,
                                                                  jlong partition_column,
                                                                  jint number_of_partitions,
-                                                                 jintArray output_offsets) {
-
+                                                                 jintArray output_offsets)
+{
   JNI_NULL_CHECK(env, input_table, "input table is null", NULL);
   JNI_NULL_CHECK(env, partition_column, "partition_column is null", NULL);
   JNI_NULL_CHECK(env, output_offsets, "output_offsets is null", NULL);
@@ -2966,11 +3555,11 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_partition(JNIEnv *env, jc
 
   try {
     cudf::jni::auto_set_device(env);
-    auto const n_input_table = reinterpret_cast<cudf::table_view const *>(input_table);
-    auto const n_part_column = reinterpret_cast<cudf::column_view const *>(partition_column);
+    auto const n_input_table = reinterpret_cast<cudf::table_view const*>(input_table);
+    auto const n_part_column = reinterpret_cast<cudf::column_view const*>(partition_column);
 
     auto [partitioned_table, partition_offsets] =
-        cudf::partition(*n_input_table, *n_part_column, number_of_partitions);
+      cudf::partition(*n_input_table, *n_part_column, number_of_partitions);
 
     // for what ever reason partition returns the length of the result at then
     // end and hash partition/round robin do not, so skip the last entry for
@@ -2983,10 +3572,15 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_partition(JNIEnv *env, jc
   CATCH_STD(env, NULL);
 }
 
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_hashPartition(
-    JNIEnv *env, jclass, jlong input_table, jintArray columns_to_hash, jint hash_function,
-    jint number_of_partitions, jint seed, jintArray output_offsets) {
-
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_hashPartition(JNIEnv* env,
+                                                                     jclass,
+                                                                     jlong input_table,
+                                                                     jintArray columns_to_hash,
+                                                                     jint hash_function,
+                                                                     jint number_of_partitions,
+                                                                     jint seed,
+                                                                     jintArray output_offsets)
+{
   JNI_NULL_CHECK(env, input_table, "input table is null", NULL);
   JNI_NULL_CHECK(env, columns_to_hash, "columns_to_hash is null", NULL);
   JNI_NULL_CHECK(env, output_offsets, "output_offsets is null", NULL);
@@ -2994,9 +3588,9 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_hashPartition(
 
   try {
     cudf::jni::auto_set_device(env);
-    auto const hash_func = static_cast<cudf::hash_id>(hash_function);
-    auto const hash_seed = static_cast<uint32_t>(seed);
-    auto const n_input_table = reinterpret_cast<cudf::table_view const *>(input_table);
+    auto const hash_func     = static_cast<cudf::hash_id>(hash_function);
+    auto const hash_seed     = static_cast<uint32_t>(seed);
+    auto const n_input_table = reinterpret_cast<cudf::table_view const*>(input_table);
     cudf::jni::native_jintArray n_columns_to_hash(env, columns_to_hash);
     JNI_ARG_CHECK(env, n_columns_to_hash.size() > 0, "columns_to_hash is zero", NULL);
 
@@ -3004,7 +3598,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_hashPartition(
                                                      n_columns_to_hash.end());
 
     auto [partitioned_table, partition_offsets] = cudf::hash_partition(
-        *n_input_table, columns_to_hash_vec, number_of_partitions, hash_func, hash_seed);
+      *n_input_table, columns_to_hash_vec, number_of_partitions, hash_func, hash_seed);
 
     cudf::jni::native_jintArray n_output_offsets(env, output_offsets);
     std::copy(partition_offsets.begin(), partition_offsets.end(), n_output_offsets.begin());
@@ -3014,9 +3608,13 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_hashPartition(
   CATCH_STD(env, NULL);
 }
 
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_roundRobinPartition(
-    JNIEnv *env, jclass, jlong input_table, jint num_partitions, jint start_partition,
-    jintArray output_offsets) {
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_roundRobinPartition(JNIEnv* env,
+                                                                           jclass,
+                                                                           jlong input_table,
+                                                                           jint num_partitions,
+                                                                           jint start_partition,
+                                                                           jintArray output_offsets)
+{
   JNI_NULL_CHECK(env, input_table, "input table is null", NULL);
   JNI_NULL_CHECK(env, output_offsets, "output_offsets is null", NULL);
   JNI_ARG_CHECK(env, num_partitions > 0, "num_partitions <= 0", NULL);
@@ -3024,10 +3622,10 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_roundRobinPartition(
 
   try {
     cudf::jni::auto_set_device(env);
-    auto n_input_table = reinterpret_cast<cudf::table_view *>(input_table);
+    auto n_input_table = reinterpret_cast<cudf::table_view*>(input_table);
 
     auto [partitioned_table, partition_offsets] =
-        cudf::round_robin_partition(*n_input_table, num_partitions, start_partition);
+      cudf::round_robin_partition(*n_input_table, num_partitions, start_partition);
 
     cudf::jni::native_jintArray n_output_offsets(env, output_offsets);
     std::copy(partition_offsets.begin(), partition_offsets.end(), n_output_offsets.begin());
@@ -3037,10 +3635,18 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_roundRobinPartition(
   CATCH_STD(env, NULL);
 }
 
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_groupByAggregate(
-    JNIEnv *env, jclass, jlong input_table, jintArray keys, jintArray aggregate_column_indices,
-    jlongArray agg_instances, jboolean ignore_null_keys, jboolean jkey_sorted,
-    jbooleanArray jkeys_sort_desc, jbooleanArray jkeys_null_first) {
+JNIEXPORT jlongArray JNICALL
+Java_ai_rapids_cudf_Table_groupByAggregate(JNIEnv* env,
+                                           jclass,
+                                           jlong input_table,
+                                           jintArray keys,
+                                           jintArray aggregate_column_indices,
+                                           jlongArray agg_instances,
+                                           jboolean ignore_null_keys,
+                                           jboolean jkey_sorted,
+                                           jbooleanArray jkeys_sort_desc,
+                                           jbooleanArray jkeys_null_first)
+{
   JNI_NULL_CHECK(env, input_table, "input table is null", NULL);
   JNI_NULL_CHECK(env, keys, "input keys are null", NULL);
   JNI_NULL_CHECK(env, aggregate_column_indices, "input aggregate_column_indices are null", NULL);
@@ -3048,7 +3654,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_groupByAggregate(
 
   try {
     cudf::jni::auto_set_device(env);
-    cudf::table_view *n_input_table = reinterpret_cast<cudf::table_view *>(input_table);
+    cudf::table_view* n_input_table = reinterpret_cast<cudf::table_view*>(input_table);
     cudf::jni::native_jintArray n_keys(env, keys);
     cudf::jni::native_jintArray n_values(env, aggregate_column_indices);
     cudf::jni::native_jpointerArray<cudf::aggregation> n_agg_instances(env, agg_instances);
@@ -3059,11 +3665,14 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_groupByAggregate(
     }
 
     cudf::table_view n_keys_table(n_keys_cols);
-    auto column_order = cudf::jni::resolve_column_order(env, jkeys_sort_desc, n_keys.size());
+    auto column_order    = cudf::jni::resolve_column_order(env, jkeys_sort_desc, n_keys.size());
     auto null_precedence = cudf::jni::resolve_null_precedence(env, jkeys_null_first, n_keys.size());
     cudf::groupby::groupby grouper(
-        n_keys_table, ignore_null_keys ? cudf::null_policy::EXCLUDE : cudf::null_policy::INCLUDE,
-        jkey_sorted ? cudf::sorted::YES : cudf::sorted::NO, column_order, null_precedence);
+      n_keys_table,
+      ignore_null_keys ? cudf::null_policy::EXCLUDE : cudf::null_policy::INCLUDE,
+      jkey_sorted ? cudf::sorted::YES : cudf::sorted::NO,
+      column_order,
+      null_precedence);
 
     // Aggregates are passed in already grouped by column, so we just need to fill it in
     // as we go.
@@ -3074,12 +3683,11 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_groupByAggregate(
       cudf::groupby::aggregation_request req;
       int col_index = n_values[i];
 
-      cudf::groupby_aggregation *agg =
-          dynamic_cast<cudf::groupby_aggregation *>(n_agg_instances[i]);
-      JNI_ARG_CHECK(env, agg != nullptr, "aggregation is not an instance of groupby_aggregation",
-                    nullptr);
+      cudf::groupby_aggregation* agg = dynamic_cast<cudf::groupby_aggregation*>(n_agg_instances[i]);
+      JNI_ARG_CHECK(
+        env, agg != nullptr, "aggregation is not an instance of groupby_aggregation", nullptr);
       std::unique_ptr<cudf::groupby_aggregation> cloned(
-          dynamic_cast<cudf::groupby_aggregation *>(agg->clone().release()));
+        dynamic_cast<cudf::groupby_aggregation*>(agg->clone().release()));
 
       if (col_index == previous_index) {
         requests.back().aggregations.push_back(std::move(cloned));
@@ -3092,7 +3700,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_groupByAggregate(
     }
 
     std::pair<std::unique_ptr<cudf::table>, std::vector<cudf::groupby::aggregation_result>> result =
-        grouper.aggregate(requests);
+      grouper.aggregate(requests);
 
     std::vector<std::unique_ptr<cudf::column>> result_columns;
     int agg_result_size = result.second.size();
@@ -3107,10 +3715,18 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_groupByAggregate(
   CATCH_STD(env, NULL);
 }
 
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_groupByScan(
-    JNIEnv *env, jclass, jlong input_table, jintArray keys, jintArray aggregate_column_indices,
-    jlongArray agg_instances, jboolean ignore_null_keys, jboolean jkey_sorted,
-    jbooleanArray jkeys_sort_desc, jbooleanArray jkeys_null_first) {
+JNIEXPORT jlongArray JNICALL
+Java_ai_rapids_cudf_Table_groupByScan(JNIEnv* env,
+                                      jclass,
+                                      jlong input_table,
+                                      jintArray keys,
+                                      jintArray aggregate_column_indices,
+                                      jlongArray agg_instances,
+                                      jboolean ignore_null_keys,
+                                      jboolean jkey_sorted,
+                                      jbooleanArray jkeys_sort_desc,
+                                      jbooleanArray jkeys_null_first)
+{
   JNI_NULL_CHECK(env, input_table, "input table is null", NULL);
   JNI_NULL_CHECK(env, keys, "input keys are null", NULL);
   JNI_NULL_CHECK(env, aggregate_column_indices, "input aggregate_column_indices are null", NULL);
@@ -3118,7 +3734,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_groupByScan(
 
   try {
     cudf::jni::auto_set_device(env);
-    cudf::table_view *n_input_table = reinterpret_cast<cudf::table_view *>(input_table);
+    cudf::table_view* n_input_table = reinterpret_cast<cudf::table_view*>(input_table);
     cudf::jni::native_jintArray n_keys(env, keys);
     cudf::jni::native_jintArray n_values(env, aggregate_column_indices);
     cudf::jni::native_jpointerArray<cudf::aggregation> n_agg_instances(env, agg_instances);
@@ -3129,11 +3745,14 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_groupByScan(
     }
 
     cudf::table_view n_keys_table(n_keys_cols);
-    auto column_order = cudf::jni::resolve_column_order(env, jkeys_sort_desc, n_keys.size());
+    auto column_order    = cudf::jni::resolve_column_order(env, jkeys_sort_desc, n_keys.size());
     auto null_precedence = cudf::jni::resolve_null_precedence(env, jkeys_null_first, n_keys.size());
     cudf::groupby::groupby grouper(
-        n_keys_table, ignore_null_keys ? cudf::null_policy::EXCLUDE : cudf::null_policy::INCLUDE,
-        jkey_sorted ? cudf::sorted::YES : cudf::sorted::NO, column_order, null_precedence);
+      n_keys_table,
+      ignore_null_keys ? cudf::null_policy::EXCLUDE : cudf::null_policy::INCLUDE,
+      jkey_sorted ? cudf::sorted::YES : cudf::sorted::NO,
+      column_order,
+      null_precedence);
 
     // Aggregates are passed in already grouped by column, so we just need to fill it in
     // as we go.
@@ -3144,12 +3763,12 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_groupByScan(
       cudf::groupby::scan_request req;
       int col_index = n_values[i];
 
-      cudf::groupby_scan_aggregation *agg =
-          dynamic_cast<cudf::groupby_scan_aggregation *>(n_agg_instances[i]);
-      JNI_ARG_CHECK(env, agg != nullptr,
-                    "aggregation is not an instance of groupby_scan_aggregation", nullptr);
+      cudf::groupby_scan_aggregation* agg =
+        dynamic_cast<cudf::groupby_scan_aggregation*>(n_agg_instances[i]);
+      JNI_ARG_CHECK(
+        env, agg != nullptr, "aggregation is not an instance of groupby_scan_aggregation", nullptr);
       std::unique_ptr<cudf::groupby_scan_aggregation> cloned(
-          dynamic_cast<cudf::groupby_scan_aggregation *>(agg->clone().release()));
+        dynamic_cast<cudf::groupby_scan_aggregation*>(agg->clone().release()));
 
       if (col_index == previous_index) {
         requests.back().aggregations.push_back(std::move(cloned));
@@ -3162,7 +3781,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_groupByScan(
     }
 
     std::pair<std::unique_ptr<cudf::table>, std::vector<cudf::groupby::aggregation_result>> result =
-        grouper.scan(requests);
+      grouper.scan(requests);
 
     std::vector<std::unique_ptr<cudf::column>> result_columns;
     int agg_result_size = result.second.size();
@@ -3177,10 +3796,18 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_groupByScan(
   CATCH_STD(env, NULL);
 }
 
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_groupByReplaceNulls(
-    JNIEnv *env, jclass, jlong input_table, jintArray keys, jintArray replace_column_indices,
-    jbooleanArray is_preceding, jboolean ignore_null_keys, jboolean jkey_sorted,
-    jbooleanArray jkeys_sort_desc, jbooleanArray jkeys_null_first) {
+JNIEXPORT jlongArray JNICALL
+Java_ai_rapids_cudf_Table_groupByReplaceNulls(JNIEnv* env,
+                                              jclass,
+                                              jlong input_table,
+                                              jintArray keys,
+                                              jintArray replace_column_indices,
+                                              jbooleanArray is_preceding,
+                                              jboolean ignore_null_keys,
+                                              jboolean jkey_sorted,
+                                              jbooleanArray jkeys_sort_desc,
+                                              jbooleanArray jkeys_null_first)
+{
   JNI_NULL_CHECK(env, input_table, "input table is null", NULL);
   JNI_NULL_CHECK(env, keys, "input keys are null", NULL);
   JNI_NULL_CHECK(env, replace_column_indices, "input replace_column_indices are null", NULL);
@@ -3188,7 +3815,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_groupByReplaceNulls(
 
   try {
     cudf::jni::auto_set_device(env);
-    cudf::table_view *n_input_table = reinterpret_cast<cudf::table_view *>(input_table);
+    cudf::table_view* n_input_table = reinterpret_cast<cudf::table_view*>(input_table);
     cudf::jni::native_jintArray n_keys(env, keys);
     cudf::jni::native_jintArray n_values(env, replace_column_indices);
     cudf::jni::native_jbooleanArray n_is_preceding(env, is_preceding);
@@ -3199,11 +3826,14 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_groupByReplaceNulls(
     }
 
     cudf::table_view n_keys_table(n_keys_cols);
-    auto column_order = cudf::jni::resolve_column_order(env, jkeys_sort_desc, n_keys.size());
+    auto column_order    = cudf::jni::resolve_column_order(env, jkeys_sort_desc, n_keys.size());
     auto null_precedence = cudf::jni::resolve_null_precedence(env, jkeys_null_first, n_keys.size());
     cudf::groupby::groupby grouper(
-        n_keys_table, ignore_null_keys ? cudf::null_policy::EXCLUDE : cudf::null_policy::INCLUDE,
-        jkey_sorted ? cudf::sorted::YES : cudf::sorted::NO, column_order, null_precedence);
+      n_keys_table,
+      ignore_null_keys ? cudf::null_policy::EXCLUDE : cudf::null_policy::INCLUDE,
+      jkey_sorted ? cudf::sorted::YES : cudf::sorted::NO,
+      column_order,
+      null_precedence);
 
     // Aggregates are passed in already grouped by column, so we just need to fill it in
     // as we go.
@@ -3215,7 +3845,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_groupByReplaceNulls(
     cudf::table_view n_replace_table(n_replace_cols);
 
     std::vector<cudf::replace_policy> policies = n_is_preceding.transform_if_else(
-        cudf::replace_policy::PRECEDING, cudf::replace_policy::FOLLOWING);
+      cudf::replace_policy::PRECEDING, cudf::replace_policy::FOLLOWING);
 
     auto [keys, results] = grouper.replace_nulls(n_replace_table, policies);
     return convert_table_for_return(env, keys, results);
@@ -3223,48 +3853,51 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_groupByReplaceNulls(
   CATCH_STD(env, NULL);
 }
 
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_filter(JNIEnv *env, jclass,
-                                                              jlong input_jtable, jlong mask_jcol) {
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_filter(JNIEnv* env,
+                                                              jclass,
+                                                              jlong input_jtable,
+                                                              jlong mask_jcol)
+{
   JNI_NULL_CHECK(env, input_jtable, "input table is null", 0);
   JNI_NULL_CHECK(env, mask_jcol, "mask column is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto const input = reinterpret_cast<cudf::table_view const *>(input_jtable);
-    auto const mask = reinterpret_cast<cudf::column_view const *>(mask_jcol);
+    auto const input = reinterpret_cast<cudf::table_view const*>(input_jtable);
+    auto const mask  = reinterpret_cast<cudf::column_view const*>(mask_jcol);
     return convert_table_for_return(env, cudf::apply_boolean_mask(*input, *mask));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Table_distinctCount(JNIEnv *env, jclass,
+JNIEXPORT jint JNICALL Java_ai_rapids_cudf_Table_distinctCount(JNIEnv* env,
+                                                               jclass,
                                                                jlong input_jtable,
-                                                               jboolean nulls_equal) {
+                                                               jboolean nulls_equal)
+{
   JNI_NULL_CHECK(env, input_jtable, "input table is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto const input = reinterpret_cast<cudf::table_view const *>(input_jtable);
+    auto const input = reinterpret_cast<cudf::table_view const*>(input_jtable);
 
-    return cudf::distinct_count(*input, nulls_equal ? cudf::null_equality::EQUAL :
-                                                      cudf::null_equality::UNEQUAL);
+    return cudf::distinct_count(
+      *input, nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL);
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_dropDuplicates(JNIEnv *env, jclass,
-                                                                      jlong input_jtable,
-                                                                      jintArray key_columns,
-                                                                      jint keep,
-                                                                      jboolean nulls_equal) {
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_dropDuplicates(
+  JNIEnv* env, jclass, jlong input_jtable, jintArray key_columns, jint keep, jboolean nulls_equal)
+{
   JNI_NULL_CHECK(env, input_jtable, "input table is null", 0);
   JNI_NULL_CHECK(env, key_columns, "input key_columns is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto const input = reinterpret_cast<cudf::table_view const *>(input_jtable);
+    auto const input = reinterpret_cast<cudf::table_view const*>(input_jtable);
 
     static_assert(sizeof(jint) == sizeof(cudf::size_type), "Integer types mismatched.");
     auto const native_keys_indices = cudf::jni::native_jintArray(env, key_columns);
     auto const keys_indices =
-        std::vector<cudf::size_type>(native_keys_indices.begin(), native_keys_indices.end());
+      std::vector<cudf::size_type>(native_keys_indices.begin(), native_keys_indices.end());
     auto const keep_option = [&] {
       switch (keep) {
         case 0: return cudf::duplicate_keep_option::KEEP_ANY;
@@ -3272,54 +3905,60 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_dropDuplicates(JNIEnv *en
         case 2: return cudf::duplicate_keep_option::KEEP_LAST;
         case 3: return cudf::duplicate_keep_option::KEEP_NONE;
         default:
-          JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "Invalid `keep` option",
+          JNI_THROW_NEW(env,
+                        cudf::jni::ILLEGAL_ARG_CLASS,
+                        "Invalid `keep` option",
                         cudf::duplicate_keep_option::KEEP_ANY);
       }
     }();
 
     auto result =
-        cudf::distinct(*input, keys_indices, keep_option,
-                       nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL,
-                       cudf::nan_equality::ALL_EQUAL, rmm::mr::get_current_device_resource());
+      cudf::distinct(*input,
+                     keys_indices,
+                     keep_option,
+                     nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL,
+                     cudf::nan_equality::ALL_EQUAL,
+                     rmm::mr::get_current_device_resource());
     return convert_table_for_return(env, result);
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_gather(JNIEnv *env, jclass, jlong j_input,
-                                                              jlong j_map, jboolean check_bounds) {
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_gather(
+  JNIEnv* env, jclass, jlong j_input, jlong j_map, jboolean check_bounds)
+{
   JNI_NULL_CHECK(env, j_input, "input table is null", 0);
   JNI_NULL_CHECK(env, j_map, "map column is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto const input = reinterpret_cast<cudf::table_view const *>(j_input);
-    auto const map = reinterpret_cast<cudf::column_view const *>(j_map);
+    auto const input = reinterpret_cast<cudf::table_view const*>(j_input);
+    auto const map   = reinterpret_cast<cudf::column_view const*>(j_map);
     auto bounds_policy =
-        check_bounds ? cudf::out_of_bounds_policy::NULLIFY : cudf::out_of_bounds_policy::DONT_CHECK;
+      check_bounds ? cudf::out_of_bounds_policy::NULLIFY : cudf::out_of_bounds_policy::DONT_CHECK;
     return convert_table_for_return(env, cudf::gather(*input, *map, bounds_policy));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_scatterTable(JNIEnv *env, jclass,
-                                                                    jlong j_input, jlong j_map,
-                                                                    jlong j_target) {
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_scatterTable(
+  JNIEnv* env, jclass, jlong j_input, jlong j_map, jlong j_target)
+{
   JNI_NULL_CHECK(env, j_input, "input table is null", 0);
   JNI_NULL_CHECK(env, j_map, "map column is null", 0);
   JNI_NULL_CHECK(env, j_target, "target table is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto const input = reinterpret_cast<cudf::table_view const *>(j_input);
-    auto const map = reinterpret_cast<cudf::column_view const *>(j_map);
-    auto const target = reinterpret_cast<cudf::table_view const *>(j_target);
+    auto const input  = reinterpret_cast<cudf::table_view const*>(j_input);
+    auto const map    = reinterpret_cast<cudf::column_view const*>(j_map);
+    auto const target = reinterpret_cast<cudf::table_view const*>(j_target);
     return convert_table_for_return(env, cudf::scatter(*input, *map, *target));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_scatterScalars(JNIEnv *env, jclass,
-                                                                      jlongArray j_input,
-                                                                      jlong j_map, jlong j_target) {
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_scatterScalars(
+  JNIEnv* env, jclass, jlongArray j_input, jlong j_map, jlong j_target)
+{
   JNI_NULL_CHECK(env, j_input, "input scalars array is null", 0);
   JNI_NULL_CHECK(env, j_map, "map column is null", 0);
   JNI_NULL_CHECK(env, j_target, "target table is null", 0);
@@ -3327,81 +3966,94 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_scatterScalars(JNIEnv *en
     cudf::jni::auto_set_device(env);
     auto const scalars_array = cudf::jni::native_jpointerArray<cudf::scalar>(env, j_input);
     std::vector<std::reference_wrapper<cudf::scalar const>> input;
-    std::transform(scalars_array.begin(), scalars_array.end(), std::back_inserter(input),
-                   [](auto &scalar) { return std::ref(*scalar); });
-    auto const map = reinterpret_cast<cudf::column_view const *>(j_map);
-    auto const target = reinterpret_cast<cudf::table_view const *>(j_target);
+    std::transform(
+      scalars_array.begin(), scalars_array.end(), std::back_inserter(input), [](auto& scalar) {
+        return std::ref(*scalar);
+      });
+    auto const map    = reinterpret_cast<cudf::column_view const*>(j_map);
+    auto const target = reinterpret_cast<cudf::table_view const*>(j_target);
     return convert_table_for_return(env, cudf::scatter(input, *map, *target));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_repeatStaticCount(JNIEnv *env, jclass,
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_repeatStaticCount(JNIEnv* env,
+                                                                         jclass,
                                                                          jlong input_jtable,
-                                                                         jint count) {
+                                                                         jint count)
+{
   JNI_NULL_CHECK(env, input_jtable, "input table is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto const input = reinterpret_cast<cudf::table_view const *>(input_jtable);
+    auto const input = reinterpret_cast<cudf::table_view const*>(input_jtable);
     return convert_table_for_return(env, cudf::repeat(*input, count));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_repeatColumnCount(JNIEnv *env, jclass,
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_repeatColumnCount(JNIEnv* env,
+                                                                         jclass,
                                                                          jlong input_jtable,
-                                                                         jlong count_jcol) {
+                                                                         jlong count_jcol)
+{
   JNI_NULL_CHECK(env, input_jtable, "input table is null", 0);
   JNI_NULL_CHECK(env, count_jcol, "count column is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto const input = reinterpret_cast<cudf::table_view const *>(input_jtable);
-    auto const count = reinterpret_cast<cudf::column_view const *>(count_jcol);
+    auto const input = reinterpret_cast<cudf::table_view const*>(input_jtable);
+    auto const count = reinterpret_cast<cudf::column_view const*>(count_jcol);
     return convert_table_for_return(env, cudf::repeat(*input, *count));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_bound(JNIEnv *env, jclass, jlong input_jtable,
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_bound(JNIEnv* env,
+                                                        jclass,
+                                                        jlong input_jtable,
                                                         jlong values_jtable,
                                                         jbooleanArray desc_flags,
                                                         jbooleanArray are_nulls_smallest,
-                                                        jboolean is_upper_bound) {
+                                                        jboolean is_upper_bound)
+{
   JNI_NULL_CHECK(env, input_jtable, "input table is null", 0);
   JNI_NULL_CHECK(env, values_jtable, "values table is null", 0);
   using cudf::column;
   using cudf::table_view;
   try {
     cudf::jni::auto_set_device(env);
-    table_view *input = reinterpret_cast<table_view *>(input_jtable);
-    table_view *values = reinterpret_cast<table_view *>(values_jtable);
+    table_view* input  = reinterpret_cast<table_view*>(input_jtable);
+    table_view* values = reinterpret_cast<table_view*>(values_jtable);
     cudf::jni::native_jbooleanArray const n_desc_flags(env, desc_flags);
     cudf::jni::native_jbooleanArray const n_are_nulls_smallest(env, are_nulls_smallest);
 
     std::vector<cudf::order> column_desc_flags{
-        n_desc_flags.transform_if_else(cudf::order::DESCENDING, cudf::order::ASCENDING)};
+      n_desc_flags.transform_if_else(cudf::order::DESCENDING, cudf::order::ASCENDING)};
     std::vector<cudf::null_order> column_null_orders{
-        n_are_nulls_smallest.transform_if_else(cudf::null_order::BEFORE, cudf::null_order::AFTER)};
+      n_are_nulls_smallest.transform_if_else(cudf::null_order::BEFORE, cudf::null_order::AFTER)};
 
-    JNI_ARG_CHECK(env, (column_desc_flags.size() == column_null_orders.size()),
-                  "null-order and sort-order size mismatch", 0);
+    JNI_ARG_CHECK(env,
+                  (column_desc_flags.size() == column_null_orders.size()),
+                  "null-order and sort-order size mismatch",
+                  0);
 
     return release_as_jlong(
-        is_upper_bound ? cudf::upper_bound(*input, *values, column_desc_flags, column_null_orders) :
-                         cudf::lower_bound(*input, *values, column_desc_flags, column_null_orders));
+      is_upper_bound ? cudf::upper_bound(*input, *values, column_desc_flags, column_null_orders)
+                     : cudf::lower_bound(*input, *values, column_desc_flags, column_null_orders));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jobjectArray JNICALL Java_ai_rapids_cudf_Table_contiguousSplit(JNIEnv *env, jclass,
+JNIEXPORT jobjectArray JNICALL Java_ai_rapids_cudf_Table_contiguousSplit(JNIEnv* env,
+                                                                         jclass,
                                                                          jlong input_table,
-                                                                         jintArray split_indices) {
+                                                                         jintArray split_indices)
+{
   JNI_NULL_CHECK(env, input_table, "native handle is null", 0);
   JNI_NULL_CHECK(env, split_indices, "split indices are null", 0);
 
   try {
     cudf::jni::auto_set_device(env);
-    cudf::table_view *n_table = reinterpret_cast<cudf::table_view *>(input_table);
+    cudf::table_view* n_table = reinterpret_cast<cudf::table_view*>(input_table);
     cudf::jni::native_jintArray n_split_indices(env, split_indices);
 
     std::vector<cudf::size_type> indices(n_split_indices.data(),
@@ -3409,42 +4061,50 @@ JNIEXPORT jobjectArray JNICALL Java_ai_rapids_cudf_Table_contiguousSplit(JNIEnv
 
     std::vector<cudf::packed_table> result = cudf::contiguous_split(*n_table, indices);
     cudf::jni::native_jobjectArray<jobject> n_result =
-        cudf::jni::contiguous_table_array(env, result.size());
+      cudf::jni::contiguous_table_array(env, result.size());
     for (size_t i = 0; i < result.size(); i++) {
       n_result.set(
-          i, cudf::jni::contiguous_table_from(env, result[i].data, result[i].table.num_rows()));
+        i, cudf::jni::contiguous_table_from(env, result[i].data, result[i].table.num_rows()));
     }
     return n_result.wrapped();
   }
   CATCH_STD(env, NULL);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_makeChunkedPack(JNIEnv *env, jclass,
-                                                                  jlong input_table,
-                                                                  jlong bounce_buffer_size,
-                                                                  jlong memoryResourceHandle) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_makeChunkedPack(
+  JNIEnv* env, jclass, jlong input_table, jlong bounce_buffer_size, jlong memoryResourceHandle)
+{
   JNI_NULL_CHECK(env, input_table, "native handle is null", 0);
 
   try {
     cudf::jni::auto_set_device(env);
-    cudf::table_view *n_table = reinterpret_cast<cudf::table_view *>(input_table);
+    cudf::table_view* n_table = reinterpret_cast<cudf::table_view*>(input_table);
     // `temp_mr` is the memory resource that `cudf::chunked_pack` will use to create temporary
     // and scratch memory only.
-    auto temp_mr = memoryResourceHandle != 0 ?
-                       reinterpret_cast<rmm::mr::device_memory_resource *>(memoryResourceHandle) :
-                       rmm::mr::get_current_device_resource();
+    auto temp_mr      = memoryResourceHandle != 0
+                          ? reinterpret_cast<rmm::mr::device_memory_resource*>(memoryResourceHandle)
+                          : rmm::mr::get_current_device_resource();
     auto chunked_pack = cudf::chunked_pack::create(*n_table, bounce_buffer_size, temp_mr);
     return reinterpret_cast<jlong>(chunked_pack.release());
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_rollingWindowAggregate(
-    JNIEnv *env, jclass, jlong j_input_table, jintArray j_keys, jlongArray j_default_output,
-    jintArray j_aggregate_column_indices, jlongArray j_agg_instances, jintArray j_min_periods,
-    jintArray j_preceding, jintArray j_following, jbooleanArray j_unbounded_preceding,
-    jbooleanArray j_unbounded_following, jboolean ignore_null_keys) {
-
+JNIEXPORT jlongArray JNICALL
+Java_ai_rapids_cudf_Table_rollingWindowAggregate(JNIEnv* env,
+                                                 jclass,
+                                                 jlong j_input_table,
+                                                 jintArray j_keys,
+                                                 jlongArray j_default_output,
+                                                 jintArray j_aggregate_column_indices,
+                                                 jlongArray j_agg_instances,
+                                                 jintArray j_min_periods,
+                                                 jintArray j_preceding,
+                                                 jintArray j_following,
+                                                 jbooleanArray j_unbounded_preceding,
+                                                 jbooleanArray j_unbounded_following,
+                                                 jboolean ignore_null_keys)
+{
   JNI_NULL_CHECK(env, j_input_table, "input table is null", NULL);
   JNI_NULL_CHECK(env, j_keys, "input keys are null", NULL);
   JNI_NULL_CHECK(env, j_aggregate_column_indices, "input aggregate_column_indices are null", NULL);
@@ -3457,7 +4117,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_rollingWindowAggregate(
     using cudf::jni::valid_window_parameters;
 
     // Convert from j-types to native.
-    cudf::table_view *input_table{reinterpret_cast<cudf::table_view *>(j_input_table)};
+    cudf::table_view* input_table{reinterpret_cast<cudf::table_view*>(j_input_table)};
     cudf::jni::native_jintArray keys{env, j_keys};
     cudf::jni::native_jintArray values{env, j_aggregate_column_indices};
     cudf::jni::native_jpointerArray<cudf::aggregation> agg_instances(env, j_agg_instances);
@@ -3469,37 +4129,47 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_rollingWindowAggregate(
     cudf::jni::native_jbooleanArray unbounded_following{env, j_unbounded_following};
 
     if (not valid_window_parameters(values, agg_instances, min_periods, preceding, following)) {
-      JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS,
+      JNI_THROW_NEW(env,
+                    cudf::jni::ILLEGAL_ARG_CLASS,
                     "Number of aggregation columns must match number of agg ops, and window-specs",
                     nullptr);
     }
 
     // Extract table-view.
     cudf::table_view groupby_keys{
-        input_table->select(std::vector<cudf::size_type>(keys.data(), keys.data() + keys.size()))};
+      input_table->select(std::vector<cudf::size_type>(keys.data(), keys.data() + keys.size()))};
 
     std::vector<std::unique_ptr<cudf::column>> result_columns;
     for (int i(0); i < values.size(); ++i) {
-      cudf::rolling_aggregation *agg = dynamic_cast<cudf::rolling_aggregation *>(agg_instances[i]);
-      JNI_ARG_CHECK(env, agg != nullptr, "aggregation is not an instance of rolling_aggregation",
-                    nullptr);
-
-      int agg_column_index = values[i];
-      auto const preceding_window_bounds = unbounded_preceding[i] ?
-                                               cudf::window_bounds::unbounded() :
-                                               cudf::window_bounds::get(preceding[i]);
-      auto const following_window_bounds = unbounded_following[i] ?
-                                               cudf::window_bounds::unbounded() :
-                                               cudf::window_bounds::get(following[i]);
+      cudf::rolling_aggregation* agg = dynamic_cast<cudf::rolling_aggregation*>(agg_instances[i]);
+      JNI_ARG_CHECK(
+        env, agg != nullptr, "aggregation is not an instance of rolling_aggregation", nullptr);
+
+      int agg_column_index               = values[i];
+      auto const preceding_window_bounds = unbounded_preceding[i]
+                                             ? cudf::window_bounds::unbounded()
+                                             : cudf::window_bounds::get(preceding[i]);
+      auto const following_window_bounds = unbounded_following[i]
+                                             ? cudf::window_bounds::unbounded()
+                                             : cudf::window_bounds::get(following[i]);
 
       if (default_output[i] != nullptr) {
-        result_columns.emplace_back(cudf::grouped_rolling_window(
-            groupby_keys, input_table->column(agg_column_index), *default_output[i],
-            preceding_window_bounds, following_window_bounds, min_periods[i], *agg));
+        result_columns.emplace_back(
+          cudf::grouped_rolling_window(groupby_keys,
+                                       input_table->column(agg_column_index),
+                                       *default_output[i],
+                                       preceding_window_bounds,
+                                       following_window_bounds,
+                                       min_periods[i],
+                                       *agg));
       } else {
-        result_columns.emplace_back(cudf::grouped_rolling_window(
-            groupby_keys, input_table->column(agg_column_index), preceding_window_bounds,
-            following_window_bounds, min_periods[i], *agg));
+        result_columns.emplace_back(
+          cudf::grouped_rolling_window(groupby_keys,
+                                       input_table->column(agg_column_index),
+                                       preceding_window_bounds,
+                                       following_window_bounds,
+                                       min_periods[i],
+                                       *agg));
       }
     }
 
@@ -3509,13 +4179,22 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_rollingWindowAggregate(
   CATCH_STD(env, NULL);
 }
 
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_rangeRollingWindowAggregate(
-    JNIEnv *env, jclass, jlong j_input_table, jintArray j_keys, jintArray j_orderby_column_indices,
-    jbooleanArray j_is_orderby_ascending, jintArray j_aggregate_column_indices,
-    jlongArray j_agg_instances, jintArray j_min_periods, jlongArray j_preceding,
-    jlongArray j_following, jintArray j_preceding_extent, jintArray j_following_extent,
-    jboolean ignore_null_keys) {
-
+JNIEXPORT jlongArray JNICALL
+Java_ai_rapids_cudf_Table_rangeRollingWindowAggregate(JNIEnv* env,
+                                                      jclass,
+                                                      jlong j_input_table,
+                                                      jintArray j_keys,
+                                                      jintArray j_orderby_column_indices,
+                                                      jbooleanArray j_is_orderby_ascending,
+                                                      jintArray j_aggregate_column_indices,
+                                                      jlongArray j_agg_instances,
+                                                      jintArray j_min_periods,
+                                                      jlongArray j_preceding,
+                                                      jlongArray j_following,
+                                                      jintArray j_preceding_extent,
+                                                      jintArray j_following_extent,
+                                                      jboolean ignore_null_keys)
+{
   JNI_NULL_CHECK(env, j_input_table, "input table is null", NULL);
   JNI_NULL_CHECK(env, j_keys, "input keys are null", NULL);
   JNI_NULL_CHECK(env, j_orderby_column_indices, "input orderby_column_indices are null", NULL);
@@ -3531,7 +4210,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_rangeRollingWindowAggrega
     using cudf::jni::valid_window_parameters;
 
     // Convert from j-types to native.
-    cudf::table_view *input_table{reinterpret_cast<cudf::table_view *>(j_input_table)};
+    cudf::table_view* input_table{reinterpret_cast<cudf::table_view*>(j_input_table)};
     cudf::jni::native_jintArray keys{env, j_keys};
     cudf::jni::native_jintArray orderbys{env, j_orderby_column_indices};
     cudf::jni::native_jbooleanArray orderbys_ascending{env, j_is_orderby_ascending};
@@ -3544,21 +4223,22 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_rangeRollingWindowAggrega
     cudf::jni::native_jpointerArray<cudf::scalar> following(env, j_following);
 
     if (not valid_window_parameters(values, agg_instances, min_periods, preceding, following)) {
-      JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS,
+      JNI_THROW_NEW(env,
+                    cudf::jni::ILLEGAL_ARG_CLASS,
                     "Number of aggregation columns must match number of agg ops, and window-specs",
                     nullptr);
     }
 
     // Extract table-view.
     cudf::table_view groupby_keys{
-        input_table->select(std::vector<cudf::size_type>(keys.data(), keys.data() + keys.size()))};
+      input_table->select(std::vector<cudf::size_type>(keys.data(), keys.data() + keys.size()))};
 
     std::vector<std::unique_ptr<cudf::column>> result_columns;
     for (int i(0); i < values.size(); ++i) {
-      int agg_column_index = values[i];
-      cudf::column_view const &order_by_column = input_table->column(orderbys[i]);
-      cudf::data_type order_by_type = order_by_column.type();
-      cudf::data_type duration_type = order_by_type;
+      int agg_column_index                     = values[i];
+      cudf::column_view const& order_by_column = input_table->column(orderbys[i]);
+      cudf::data_type order_by_type            = order_by_column.type();
+      cudf::data_type duration_type            = order_by_type;
 
       // Range extents are defined as:
       // a) 0 == CURRENT ROW
@@ -3566,8 +4246,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_rangeRollingWindowAggrega
       // c) 2 == UNBOUNDED
       // Must set unbounded_type for only the BOUNDED case.
       auto constexpr CURRENT_ROW = 0;
-      auto constexpr BOUNDED = 1;
-      auto constexpr UNBOUNDED = 2;
+      auto constexpr BOUNDED     = 1;
+      auto constexpr UNBOUNDED   = 2;
       if (preceding_extent[i] != BOUNDED || following_extent[i] != BOUNDED) {
         switch (order_by_type.id()) {
           case cudf::type_id::TIMESTAMP_DAYS:
@@ -3589,11 +4269,11 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_rangeRollingWindowAggrega
         }
       }
 
-      cudf::rolling_aggregation *agg = dynamic_cast<cudf::rolling_aggregation *>(agg_instances[i]);
-      JNI_ARG_CHECK(env, agg != nullptr, "aggregation is not an instance of rolling_aggregation",
-                    nullptr);
+      cudf::rolling_aggregation* agg = dynamic_cast<cudf::rolling_aggregation*>(agg_instances[i]);
+      JNI_ARG_CHECK(
+        env, agg != nullptr, "aggregation is not an instance of rolling_aggregation", nullptr);
 
-      auto const make_window_bounds = [&](auto const &range_extent, auto const *p_scalar) {
+      auto const make_window_bounds = [&](auto const& range_extent, auto const* p_scalar) {
         if (range_extent == CURRENT_ROW) {
           return cudf::range_window_bounds::current_row(duration_type);
         } else if (range_extent == UNBOUNDED) {
@@ -3604,11 +4284,14 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_rangeRollingWindowAggrega
       };
 
       result_columns.emplace_back(cudf::grouped_range_rolling_window(
-          groupby_keys, order_by_column,
-          orderbys_ascending[i] ? cudf::order::ASCENDING : cudf::order::DESCENDING,
-          input_table->column(agg_column_index),
-          make_window_bounds(preceding_extent[i], preceding[i]),
-          make_window_bounds(following_extent[i], following[i]), min_periods[i], *agg));
+        groupby_keys,
+        order_by_column,
+        orderbys_ascending[i] ? cudf::order::ASCENDING : cudf::order::DESCENDING,
+        input_table->column(agg_column_index),
+        make_window_bounds(preceding_extent[i], preceding[i]),
+        make_window_bounds(following_extent[i], following[i]),
+        min_periods[i],
+        *agg));
     }
 
     auto result_table = std::make_unique<cudf::table>(std::move(result_columns));
@@ -3617,72 +4300,88 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_rangeRollingWindowAggrega
   CATCH_STD(env, NULL);
 }
 
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_explode(JNIEnv *env, jclass,
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_explode(JNIEnv* env,
+                                                               jclass,
                                                                jlong input_jtable,
-                                                               jint column_index) {
+                                                               jint column_index)
+{
   JNI_NULL_CHECK(env, input_jtable, "explode: input table is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto const input_table = reinterpret_cast<cudf::table_view const *>(input_jtable);
-    auto const col_index = static_cast<cudf::size_type>(column_index);
+    auto const input_table = reinterpret_cast<cudf::table_view const*>(input_jtable);
+    auto const col_index   = static_cast<cudf::size_type>(column_index);
     return convert_table_for_return(env, cudf::explode(*input_table, col_index));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_explodePosition(JNIEnv *env, jclass,
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_explodePosition(JNIEnv* env,
+                                                                       jclass,
                                                                        jlong input_jtable,
-                                                                       jint column_index) {
+                                                                       jint column_index)
+{
   JNI_NULL_CHECK(env, input_jtable, "explode: input table is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto const input_table = reinterpret_cast<cudf::table_view const *>(input_jtable);
-    auto const col_index = static_cast<cudf::size_type>(column_index);
+    auto const input_table = reinterpret_cast<cudf::table_view const*>(input_jtable);
+    auto const col_index   = static_cast<cudf::size_type>(column_index);
     return convert_table_for_return(env, cudf::explode_position(*input_table, col_index));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_explodeOuter(JNIEnv *env, jclass,
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_explodeOuter(JNIEnv* env,
+                                                                    jclass,
                                                                     jlong input_jtable,
-                                                                    jint column_index) {
+                                                                    jint column_index)
+{
   JNI_NULL_CHECK(env, input_jtable, "explode: input table is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto const input_table = reinterpret_cast<cudf::table_view const *>(input_jtable);
-    auto const col_index = static_cast<cudf::size_type>(column_index);
+    auto const input_table = reinterpret_cast<cudf::table_view const*>(input_jtable);
+    auto const col_index   = static_cast<cudf::size_type>(column_index);
     return convert_table_for_return(env, cudf::explode_outer(*input_table, col_index));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_explodeOuterPosition(JNIEnv *env, jclass,
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_explodeOuterPosition(JNIEnv* env,
+                                                                            jclass,
                                                                             jlong input_jtable,
-                                                                            jint column_index) {
+                                                                            jint column_index)
+{
   JNI_NULL_CHECK(env, input_jtable, "explode: input table is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto const input_table = reinterpret_cast<cudf::table_view const *>(input_jtable);
-    auto const col_index = static_cast<cudf::size_type>(column_index);
+    auto const input_table = reinterpret_cast<cudf::table_view const*>(input_jtable);
+    auto const col_index   = static_cast<cudf::size_type>(column_index);
     return convert_table_for_return(env, cudf::explode_outer_position(*input_table, col_index));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_rowBitCount(JNIEnv *env, jclass, jlong j_table) {
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_rowBitCount(JNIEnv* env, jclass, jlong j_table)
+{
   JNI_NULL_CHECK(env, j_table, "table is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto const input_table = reinterpret_cast<cudf::table_view const *>(j_table);
+    auto const input_table = reinterpret_cast<cudf::table_view const*>(j_table);
     return release_as_jlong(cudf::row_bit_count(*input_table));
   }
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jobject JNICALL Java_ai_rapids_cudf_Table_contiguousSplitGroups(
-    JNIEnv *env, jclass, jlong jinput_table, jintArray jkey_indices, jboolean jignore_null_keys,
-    jboolean jkey_sorted, jbooleanArray jkeys_sort_desc, jbooleanArray jkeys_null_first,
-    jboolean genUniqKeys) {
+JNIEXPORT jobject JNICALL
+Java_ai_rapids_cudf_Table_contiguousSplitGroups(JNIEnv* env,
+                                                jclass,
+                                                jlong jinput_table,
+                                                jintArray jkey_indices,
+                                                jboolean jignore_null_keys,
+                                                jboolean jkey_sorted,
+                                                jbooleanArray jkeys_sort_desc,
+                                                jbooleanArray jkeys_null_first,
+                                                jboolean genUniqKeys)
+{
   JNI_NULL_CHECK(env, jinput_table, "table native handle is null", 0);
   JNI_NULL_CHECK(env, jkey_indices, "key indices are null", 0);
   // Two main steps to split the groups in the input table.
@@ -3693,7 +4392,7 @@ JNIEXPORT jobject JNICALL Java_ai_rapids_cudf_Table_contiguousSplitGroups(
   try {
     cudf::jni::auto_set_device(env);
     cudf::jni::native_jintArray n_key_indices(env, jkey_indices);
-    auto const input_table = reinterpret_cast<cudf::table_view const *>(jinput_table);
+    auto const input_table = reinterpret_cast<cudf::table_view const*>(jinput_table);
 
     // Prepares arguments for the groupby:
     //   (keys, null_handling, keys_are_sorted, column_order, null_precedence)
@@ -3701,15 +4400,15 @@ JNIEXPORT jobject JNICALL Java_ai_rapids_cudf_Table_contiguousSplitGroups(
                                              n_key_indices.data() + n_key_indices.size());
     auto keys = input_table->select(key_indices);
     auto null_handling =
-        jignore_null_keys ? cudf::null_policy::EXCLUDE : cudf::null_policy::INCLUDE;
+      jignore_null_keys ? cudf::null_policy::EXCLUDE : cudf::null_policy::INCLUDE;
     auto keys_are_sorted = jkey_sorted ? cudf::sorted::YES : cudf::sorted::NO;
     auto column_order = cudf::jni::resolve_column_order(env, jkeys_sort_desc, key_indices.size());
     auto null_precedence =
-        cudf::jni::resolve_null_precedence(env, jkeys_null_first, key_indices.size());
+      cudf::jni::resolve_null_precedence(env, jkeys_null_first, key_indices.size());
 
     // Constructs a groupby
-    cudf::groupby::groupby grouper(keys, null_handling, keys_are_sorted, column_order,
-                                   null_precedence);
+    cudf::groupby::groupby grouper(
+      keys, null_handling, keys_are_sorted, column_order, null_precedence);
 
     // 1) Gets the groups(keys, offsets, values) from groupby.
     //
@@ -3736,14 +4435,14 @@ JNIEXPORT jobject JNICALL Java_ai_rapids_cudf_Table_contiguousSplitGroups(
     // original order of columns (same order with that in input table).
     std::vector<cudf::column_view> grouped_cols(key_indices.size() + num_value_cols);
     // key columns
-    auto key_view = groups.keys->view();
+    auto key_view    = groups.keys->view();
     auto key_view_it = key_view.begin();
     for (auto key_id : key_indices) {
       grouped_cols.at(key_id) = std::move(*key_view_it);
       key_view_it++;
     }
     // value columns
-    auto value_view = groups.values->view();
+    auto value_view    = groups.values->view();
     auto value_view_it = value_view.begin();
     for (auto value_id : value_indices) {
       grouped_cols.at(value_id) = std::move(*value_view_it);
@@ -3752,11 +4451,11 @@ JNIEXPORT jobject JNICALL Java_ai_rapids_cudf_Table_contiguousSplitGroups(
     cudf::table_view grouped_table(grouped_cols);
     // When no key columns, uses the input table instead, because the output
     // of 'get_groups' is empty.
-    auto &grouped_view = key_indices.empty() ? *input_table : grouped_table;
+    auto& grouped_view = key_indices.empty() ? *input_table : grouped_table;
 
     // Resolves the split indices from offsets vector directly to avoid copying. Since
     // the offsets vector may be very large if there are too many small groups.
-    std::vector<cudf::size_type> &split_indices = groups.offsets;
+    std::vector<cudf::size_type>& split_indices = groups.offsets;
     // Offsets layout is [0, split indices..., num_rows] or [0] for empty keys, so
     // need to removes the first and last elements. First remove last one.
     split_indices.pop_back();
@@ -3765,23 +4464,21 @@ JNIEXPORT jobject JNICALL Java_ai_rapids_cudf_Table_contiguousSplitGroups(
     std::unique_ptr<cudf::table> group_by_result_table;
     if (genUniqKeys) {
       // generate gather map column from `split_indices`
-      auto begin = std::cbegin(split_indices);
-      auto end = std::cend(split_indices);
+      auto begin      = std::cbegin(split_indices);
+      auto end        = std::cend(split_indices);
       auto const size = cudf::distance(begin, end);
-      auto const vec = thrust::host_vector<cudf::size_type>(begin, end);
-      auto buf = rmm::device_buffer{vec.data(), size * sizeof(cudf::size_type),
-                                    cudf::get_default_stream()};
+      auto const vec  = thrust::host_vector<cudf::size_type>(begin, end);
+      auto buf =
+        rmm::device_buffer{vec.data(), size * sizeof(cudf::size_type), cudf::get_default_stream()};
       auto gather_map_col = std::make_unique<cudf::column>(
-          cudf::data_type{cudf::type_id::INT32}, size, std::move(buf), rmm::device_buffer{}, 0);
+        cudf::data_type{cudf::type_id::INT32}, size, std::move(buf), rmm::device_buffer{}, 0);
 
       // gather the first key in each group to remove duplicated ones.
       group_by_result_table = cudf::gather(groups.keys->view(), gather_map_col->view());
     }
 
     // remove the first 0 if it exists
-    if (!split_indices.empty()) {
-      split_indices.erase(split_indices.begin());
-    }
+    if (!split_indices.empty()) { split_indices.erase(split_indices.begin()); }
 
     // 2) Splits the groups.
     std::vector<cudf::packed_table> result = cudf::contiguous_split(grouped_view, split_indices);
@@ -3791,10 +4488,10 @@ JNIEXPORT jobject JNICALL Java_ai_rapids_cudf_Table_contiguousSplitGroups(
 
     //  Returns the split result.
     cudf::jni::native_jobjectArray<jobject> n_result =
-        cudf::jni::contiguous_table_array(env, result.size());
+      cudf::jni::contiguous_table_array(env, result.size());
     for (size_t i = 0; i < result.size(); i++) {
       n_result.set(
-          i, cudf::jni::contiguous_table_from(env, result[i].data, result[i].table.num_rows()));
+        i, cudf::jni::contiguous_table_from(env, result[i].data, result[i].table.num_rows()));
     }
 
     jobjectArray groups_array = n_result.wrapped();
@@ -3809,17 +4506,17 @@ JNIEXPORT jobject JNICALL Java_ai_rapids_cudf_Table_contiguousSplitGroups(
   CATCH_STD(env, NULL);
 }
 
-JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_sample(JNIEnv *env, jclass, jlong j_input,
-                                                              jlong n, jboolean replacement,
-                                                              jlong seed) {
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_sample(
+  JNIEnv* env, jclass, jlong j_input, jlong n, jboolean replacement, jlong seed)
+{
   JNI_NULL_CHECK(env, j_input, "input table is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    auto const input = reinterpret_cast<cudf::table_view const *>(j_input);
+    auto const input = reinterpret_cast<cudf::table_view const*>(j_input);
     auto sample_with_replacement =
-        replacement ? cudf::sample_with_replacement::TRUE : cudf::sample_with_replacement::FALSE;
+      replacement ? cudf::sample_with_replacement::TRUE : cudf::sample_with_replacement::FALSE;
     return convert_table_for_return(env, cudf::sample(*input, n, sample_with_replacement, seed));
   }
   CATCH_STD(env, 0);
 }
-} // extern "C"
+}  // extern "C"
diff --git a/java/src/main/native/src/aggregation128_utils.cu b/java/src/main/native/src/aggregation128_utils.cu
index d722aaa84fe..a32e7d27085 100644
--- a/java/src/main/native/src/aggregation128_utils.cu
+++ b/java/src/main/native/src/aggregation128_utils.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,118 +14,131 @@
  * limitations under the License.
  */
 
-#include <cstddef>
-#include <utility>
-#include <vector>
+#include "aggregation128_utils.hpp"
 
-#include <cuda/functional>
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/utilities/error.hpp>
+
 #include <rmm/exec_policy.hpp>
+
+#include <cuda/functional>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/permutation_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
 
-#include "aggregation128_utils.hpp"
+#include <cstddef>
+#include <utility>
+#include <vector>
 
 namespace {
 
 // Functor to reassemble a 128-bit value from four 64-bit chunks with overflow detection.
 class chunk_assembler : public thrust::unary_function<cudf::size_type, __int128_t> {
-public:
-  chunk_assembler(bool *overflows, uint64_t const *chunks0, uint64_t const *chunks1,
-                  uint64_t const *chunks2, int64_t const *chunks3)
-      : overflows(overflows), chunks0(chunks0), chunks1(chunks1), chunks2(chunks2),
-        chunks3(chunks3) {}
+ public:
+  chunk_assembler(bool* overflows,
+                  uint64_t const* chunks0,
+                  uint64_t const* chunks1,
+                  uint64_t const* chunks2,
+                  int64_t const* chunks3)
+    : overflows(overflows), chunks0(chunks0), chunks1(chunks1), chunks2(chunks2), chunks3(chunks3)
+  {
+  }
 
-  __device__ __int128_t operator()(cudf::size_type i) const {
+  __device__ __int128_t operator()(cudf::size_type i) const
+  {
     // Starting with the least significant input and moving to the most significant, propagate the
     // upper 32-bits of the previous column into the next column, i.e.: propagate the "carry" bits
     // of each 64-bit chunk into the next chunk.
-    uint64_t const c0 = chunks0[i];
-    uint64_t const c1 = chunks1[i] + (c0 >> 32);
-    uint64_t const c2 = chunks2[i] + (c1 >> 32);
-    int64_t const c3 = chunks3[i] + (c2 >> 32);
+    uint64_t const c0      = chunks0[i];
+    uint64_t const c1      = chunks1[i] + (c0 >> 32);
+    uint64_t const c2      = chunks2[i] + (c1 >> 32);
+    int64_t const c3       = chunks3[i] + (c2 >> 32);
     uint64_t const lower64 = (c1 << 32) | static_cast<uint32_t>(c0);
-    int64_t const upper64 = (c3 << 32) | static_cast<uint32_t>(c2);
+    int64_t const upper64  = (c3 << 32) | static_cast<uint32_t>(c2);
 
     // check for overflow by ensuring the sign bit matches the top carry bits
     int32_t const replicated_sign_bit = static_cast<int32_t>(c3) >> 31;
-    int32_t const top_carry_bits = static_cast<int32_t>(c3 >> 32);
-    overflows[i] = (replicated_sign_bit != top_carry_bits);
+    int32_t const top_carry_bits      = static_cast<int32_t>(c3 >> 32);
+    overflows[i]                      = (replicated_sign_bit != top_carry_bits);
 
     return (static_cast<__int128_t>(upper64) << 64) | lower64;
   }
 
-private:
+ private:
   // output column for overflow detected
-  bool *const overflows;
+  bool* const overflows;
 
   // input columns for the four 64-bit values
-  uint64_t const *const chunks0;
-  uint64_t const *const chunks1;
-  uint64_t const *const chunks2;
-  int64_t const *const chunks3;
+  uint64_t const* const chunks0;
+  uint64_t const* const chunks1;
+  uint64_t const* const chunks2;
+  int64_t const* const chunks3;
 };
 
-} // anonymous namespace
+}  // anonymous namespace
 
 namespace cudf::jni {
 
 // Extract a 32-bit chunk from a 128-bit value.
-std::unique_ptr<cudf::column> extract_chunk32(cudf::column_view const &in_col, cudf::data_type type,
-                                              int chunk_idx, rmm::cuda_stream_view stream) {
+std::unique_ptr<cudf::column> extract_chunk32(cudf::column_view const& in_col,
+                                              cudf::data_type type,
+                                              int chunk_idx,
+                                              rmm::cuda_stream_view stream)
+{
   CUDF_EXPECTS(in_col.type().id() == cudf::type_id::DECIMAL128, "not a 128-bit type");
   CUDF_EXPECTS(chunk_idx >= 0 && chunk_idx < 4, "invalid chunk index");
   CUDF_EXPECTS(type.id() == cudf::type_id::INT32 || type.id() == cudf::type_id::UINT32,
                "not a 32-bit integer type");
   auto const num_rows = in_col.size();
   auto out_col =
-      cudf::make_fixed_width_column(type, num_rows, copy_bitmask(in_col), in_col.null_count());
-  auto out_view = out_col->mutable_view();
+    cudf::make_fixed_width_column(type, num_rows, copy_bitmask(in_col), in_col.null_count());
+  auto out_view       = out_col->mutable_view();
   auto const in_begin = in_col.begin<int32_t>();
 
   // Build an iterator for every fourth 32-bit value, i.e.: one "chunk" of a __int128_t value
   thrust::transform_iterator transform_iter{
-      thrust::counting_iterator{0},
-      cuda::proclaim_return_type<cudf::size_type>([] __device__(auto i) { return i * 4; })};
+    thrust::counting_iterator{0},
+    cuda::proclaim_return_type<cudf::size_type>([] __device__(auto i) { return i * 4; })};
   thrust::permutation_iterator stride_iter{in_begin + chunk_idx, transform_iter};
 
-  thrust::copy(rmm::exec_policy(stream), stride_iter, stride_iter + num_rows,
-               out_view.data<int32_t>());
+  thrust::copy(
+    rmm::exec_policy(stream), stride_iter, stride_iter + num_rows, out_view.data<int32_t>());
   return out_col;
 }
 
 // Reassemble a column of 128-bit values from four 64-bit integer columns with overflow detection.
-std::unique_ptr<cudf::table> assemble128_from_sum(cudf::table_view const &chunks_table,
+std::unique_ptr<cudf::table> assemble128_from_sum(cudf::table_view const& chunks_table,
                                                   cudf::data_type output_type,
-                                                  rmm::cuda_stream_view stream) {
+                                                  rmm::cuda_stream_view stream)
+{
   CUDF_EXPECTS(output_type.id() == cudf::type_id::DECIMAL128, "not a 128-bit type");
   CUDF_EXPECTS(chunks_table.num_columns() == 4, "must be 4 column table");
   auto const num_rows = chunks_table.num_rows();
-  auto const chunks0 = chunks_table.column(0);
-  auto const chunks1 = chunks_table.column(1);
-  auto const chunks2 = chunks_table.column(2);
-  auto const chunks3 = chunks_table.column(3);
+  auto const chunks0  = chunks_table.column(0);
+  auto const chunks1  = chunks_table.column(1);
+  auto const chunks2  = chunks_table.column(2);
+  auto const chunks3  = chunks_table.column(3);
   CUDF_EXPECTS(cudf::size_of(chunks0.type()) == 8 && cudf::size_of(chunks1.type()) == 8 &&
-                   cudf::size_of(chunks2.type()) == 8 &&
-                   chunks3.type().id() == cudf::type_id::INT64,
+                 cudf::size_of(chunks2.type()) == 8 && chunks3.type().id() == cudf::type_id::INT64,
                "chunks type mismatch");
   std::vector<std::unique_ptr<cudf::column>> columns;
-  columns.push_back(cudf::make_fixed_width_column(cudf::data_type{cudf::type_id::BOOL8}, num_rows,
-                                                  copy_bitmask(chunks0), chunks0.null_count()));
-  columns.push_back(cudf::make_fixed_width_column(output_type, num_rows, copy_bitmask(chunks0),
-                                                  chunks0.null_count()));
+  columns.push_back(cudf::make_fixed_width_column(
+    cudf::data_type{cudf::type_id::BOOL8}, num_rows, copy_bitmask(chunks0), chunks0.null_count()));
+  columns.push_back(cudf::make_fixed_width_column(
+    output_type, num_rows, copy_bitmask(chunks0), chunks0.null_count()));
   auto overflows_view = columns[0]->mutable_view();
   auto assembled_view = columns[1]->mutable_view();
-  thrust::transform(rmm::exec_policy(stream), thrust::make_counting_iterator<cudf::size_type>(0),
+  thrust::transform(rmm::exec_policy(stream),
+                    thrust::make_counting_iterator<cudf::size_type>(0),
                     thrust::make_counting_iterator<cudf::size_type>(num_rows),
                     assembled_view.begin<__int128_t>(),
-                    chunk_assembler(overflows_view.begin<bool>(), chunks0.begin<uint64_t>(),
-                                    chunks1.begin<uint64_t>(), chunks2.begin<uint64_t>(),
+                    chunk_assembler(overflows_view.begin<bool>(),
+                                    chunks0.begin<uint64_t>(),
+                                    chunks1.begin<uint64_t>(),
+                                    chunks2.begin<uint64_t>(),
                                     chunks3.begin<int64_t>()));
   return std::make_unique<cudf::table>(std::move(columns));
 }
 
-} // namespace cudf::jni
+}  // namespace cudf::jni
diff --git a/java/src/main/native/src/aggregation128_utils.hpp b/java/src/main/native/src/aggregation128_utils.hpp
index a1437606cdf..94860cea53b 100644
--- a/java/src/main/native/src/aggregation128_utils.hpp
+++ b/java/src/main/native/src/aggregation128_utils.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,13 +14,14 @@
  * limitations under the License.
  */
 
-#include <memory>
-
 #include <cudf/column/column_view.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/utilities/default_stream.hpp>
+
 #include <rmm/cuda_stream_view.hpp>
 
+#include <memory>
+
 namespace cudf::jni {
 
 /**
@@ -39,9 +40,11 @@ namespace cudf::jni {
  * @param stream    CUDA stream to use
  * @return          A column containing the extracted 32-bit integer values
  */
-std::unique_ptr<cudf::column>
-extract_chunk32(cudf::column_view const &col, cudf::data_type dtype, int chunk_idx,
-                rmm::cuda_stream_view stream = cudf::get_default_stream());
+std::unique_ptr<cudf::column> extract_chunk32(
+  cudf::column_view const& col,
+  cudf::data_type dtype,
+  int chunk_idx,
+  rmm::cuda_stream_view stream = cudf::get_default_stream());
 
 /**
  * @brief Reassemble a 128-bit column from four 64-bit integer columns with overflow detection.
@@ -63,8 +66,9 @@ extract_chunk32(cudf::column_view const &col, cudf::data_type dtype, int chunk_i
  *                     requested type. The boolean value will be true if an overflow was detected
  *                     for that row's value.
  */
-std::unique_ptr<cudf::table>
-assemble128_from_sum(cudf::table_view const &chunks_table, cudf::data_type output_type,
-                     rmm::cuda_stream_view stream = cudf::get_default_stream());
+std::unique_ptr<cudf::table> assemble128_from_sum(
+  cudf::table_view const& chunks_table,
+  cudf::data_type output_type,
+  rmm::cuda_stream_view stream = cudf::get_default_stream());
 
-} // namespace cudf::jni
+}  // namespace cudf::jni
diff --git a/java/src/main/native/src/check_nvcomp_output_sizes.cu b/java/src/main/native/src/check_nvcomp_output_sizes.cu
index 9d29e66ec59..8e0df7dd89a 100644
--- a/java/src/main/native/src/check_nvcomp_output_sizes.cu
+++ b/java/src/main/native/src/check_nvcomp_output_sizes.cu
@@ -13,20 +13,21 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+#include "check_nvcomp_output_sizes.hpp"
+
 #include <cudf/utilities/error.hpp>
+
 #include <nvtx3/nvtx3.hpp>
 #include <thrust/device_ptr.h>
 #include <thrust/equal.h>
 
-#include "check_nvcomp_output_sizes.hpp"
-
 namespace {
 
 struct java_domain {
-  static constexpr char const *name{"Java"};
+  static constexpr char const* name{"Java"};
 };
 
-} // anonymous namespace
+}  // anonymous namespace
 
 namespace cudf {
 namespace java {
@@ -35,13 +36,17 @@ namespace java {
  * Check that the vector of expected uncompressed sizes matches the vector of actual compressed
  * sizes. Both vectors are assumed to be in device memory and contain num_chunks elements.
  */
-bool check_nvcomp_output_sizes(std::size_t const *dev_uncompressed_sizes,
-                               std::size_t const *dev_actual_uncompressed_sizes,
-                               std::size_t num_chunks, rmm::cuda_stream_view stream) {
+bool check_nvcomp_output_sizes(std::size_t const* dev_uncompressed_sizes,
+                               std::size_t const* dev_actual_uncompressed_sizes,
+                               std::size_t num_chunks,
+                               rmm::cuda_stream_view stream)
+{
   NVTX3_FUNC_RANGE_IN(java_domain);
-  return thrust::equal(rmm::exec_policy(stream), dev_uncompressed_sizes,
-                       dev_uncompressed_sizes + num_chunks, dev_actual_uncompressed_sizes);
+  return thrust::equal(rmm::exec_policy(stream),
+                       dev_uncompressed_sizes,
+                       dev_uncompressed_sizes + num_chunks,
+                       dev_actual_uncompressed_sizes);
 }
 
-} // namespace java
-} // namespace cudf
+}  // namespace java
+}  // namespace cudf
diff --git a/java/src/main/native/src/check_nvcomp_output_sizes.hpp b/java/src/main/native/src/check_nvcomp_output_sizes.hpp
index 00b36471a85..594be6c7c96 100644
--- a/java/src/main/native/src/check_nvcomp_output_sizes.hpp
+++ b/java/src/main/native/src/check_nvcomp_output_sizes.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -26,8 +26,9 @@ namespace java {
  * Check that the vector of expected uncompressed sizes matches the vector of actual compressed
  * sizes. Both vectors are assumed to be in device memory and contain num_chunks elements.
  */
-bool check_nvcomp_output_sizes(std::size_t const *dev_uncompressed_sizes,
-                               std::size_t const *dev_actual_uncompressed_sizes,
-                               std::size_t num_chunks, rmm::cuda_stream_view stream);
-} // namespace java
-} // namespace cudf
+bool check_nvcomp_output_sizes(std::size_t const* dev_uncompressed_sizes,
+                               std::size_t const* dev_actual_uncompressed_sizes,
+                               std::size_t num_chunks,
+                               rmm::cuda_stream_view stream);
+}  // namespace java
+}  // namespace cudf
diff --git a/java/src/main/native/src/csv_chunked_writer.hpp b/java/src/main/native/src/csv_chunked_writer.hpp
index 1f1e73a1a4b..ee05aa95328 100644
--- a/java/src/main/native/src/csv_chunked_writer.hpp
+++ b/java/src/main/native/src/csv_chunked_writer.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,11 +15,11 @@
  */
 #pragma once
 
-#include <cassert>
+#include "jni_writer_data_sink.hpp"
 
 #include <cudf/io/csv.hpp>
 
-#include "jni_writer_data_sink.hpp"
+#include <cassert>
 
 namespace cudf::jni::io {
 
@@ -27,17 +27,17 @@ namespace cudf::jni::io {
  * @brief Class to write multiple Tables into the jni_writer_data_sink.
  */
 class csv_chunked_writer {
-
   cudf::io::csv_writer_options _options;
   std::unique_ptr<cudf::jni::jni_writer_data_sink> _sink;
 
-  bool _first_write_completed = false; ///< Decides if header should be written.
+  bool _first_write_completed = false;  ///< Decides if header should be written.
 
-public:
+ public:
   explicit csv_chunked_writer(cudf::io::csv_writer_options options,
-                              std::unique_ptr<cudf::jni::jni_writer_data_sink> &sink)
-      : _options{options}, _sink{std::move(sink)} {
-    auto const &sink_info = _options.get_sink();
+                              std::unique_ptr<cudf::jni::jni_writer_data_sink>& sink)
+    : _options{options}, _sink{std::move(sink)}
+  {
+    auto const& sink_info = _options.get_sink();
     // Assert invariants.
     CUDF_EXPECTS(sink_info.type() != cudf::io::io_type::FILEPATH,
                  "Currently, chunked CSV writes to files is not supported.");
@@ -52,9 +52,10 @@ class csv_chunked_writer {
     CUDF_EXPECTS(sink_info.user_sinks()[0] == _sink.get(), "Sink mismatch.");
   }
 
-  void write(cudf::table_view const &table) {
+  void write(cudf::table_view const& table)
+  {
     if (_first_write_completed) {
-      _options.enable_include_header(false); // Don't write header after the first write.
+      _options.enable_include_header(false);  // Don't write header after the first write.
     }
 
     _options.set_table(table);
@@ -64,10 +65,11 @@ class csv_chunked_writer {
     _first_write_completed = true;
   }
 
-  void close() {
+  void close()
+  {
     // Flush pending writes to sink.
     _sink->flush();
   }
 };
 
-} // namespace cudf::jni::io
+}  // namespace cudf::jni::io
diff --git a/java/src/main/native/src/cudf_jni_apis.hpp b/java/src/main/native/src/cudf_jni_apis.hpp
index bd82bbd2899..022493f04ab 100644
--- a/java/src/main/native/src/cudf_jni_apis.hpp
+++ b/java/src/main/native/src/cudf_jni_apis.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,11 +15,11 @@
  */
 #pragma once
 
+#include "jni_utils.hpp"
+
 #include <cudf/contiguous_split.hpp>
 #include <cudf/detail/aggregation/aggregation.hpp>
 
-#include "jni_utils.hpp"
-
 namespace cudf {
 namespace jni {
 
@@ -34,29 +34,31 @@ namespace jni {
  * @param table_result the table to convert for return
  * @param extra_columns columns not in the table that will be appended to the result.
  */
-jlongArray
-convert_table_for_return(JNIEnv *env, std::unique_ptr<cudf::table> &table_result,
-                         std::vector<std::unique_ptr<cudf::column>> &&extra_columns = {});
+jlongArray convert_table_for_return(
+  JNIEnv* env,
+  std::unique_ptr<cudf::table>& table_result,
+  std::vector<std::unique_ptr<cudf::column>>&& extra_columns = {});
 
 /**
  * @copydoc convert_table_for_return(JNIEnv*, std::unique_ptr<cudf::table>&,
  *                                   std::vector<std::unique_ptr<cudf::column>>&&)
  */
-jlongArray
-convert_table_for_return(JNIEnv *env, std::unique_ptr<cudf::table> &&table_result,
-                         std::vector<std::unique_ptr<cudf::column>> &&extra_columns = {});
+jlongArray convert_table_for_return(
+  JNIEnv* env,
+  std::unique_ptr<cudf::table>&& table_result,
+  std::vector<std::unique_ptr<cudf::column>>&& extra_columns = {});
 
 //
 // ContiguousTable APIs
 //
 
-bool cache_contiguous_table_jni(JNIEnv *env);
+bool cache_contiguous_table_jni(JNIEnv* env);
 
-void release_contiguous_table_jni(JNIEnv *env);
+void release_contiguous_table_jni(JNIEnv* env);
 
-jobject contiguous_table_from(JNIEnv *env, cudf::packed_columns &split, long row_count);
+jobject contiguous_table_from(JNIEnv* env, cudf::packed_columns& split, long row_count);
 
-native_jobjectArray<jobject> contiguous_table_array(JNIEnv *env, jsize length);
+native_jobjectArray<jobject> contiguous_table_array(JNIEnv* env, jsize length);
 
 /**
  * @brief Cache the JNI jclass and JNI jfield of Java `ContigSplitGroupByResult`
@@ -64,14 +66,14 @@ native_jobjectArray<jobject> contiguous_table_array(JNIEnv *env, jsize length);
  * @param env the JNI Env pointer
  * @return if success
  */
-bool cache_contig_split_group_by_result_jni(JNIEnv *env);
+bool cache_contig_split_group_by_result_jni(JNIEnv* env);
 
 /**
  * @brief Release the JNI jclass and JNI jfield of Java `ContigSplitGroupByResult`
  *
  * @param env the JNI Env pointer
  */
-void release_contig_split_group_by_result_jni(JNIEnv *env);
+void release_contig_split_group_by_result_jni(JNIEnv* env);
 
 /**
  * @brief Construct a Java `ContigSplitGroupByResult` from contiguous tables.
@@ -80,7 +82,7 @@ void release_contig_split_group_by_result_jni(JNIEnv *env);
  * @param groups the contiguous tables
  * @return a Java `ContigSplitGroupByResult`
  */
-jobject contig_split_group_by_result_from(JNIEnv *env, jobjectArray &groups);
+jobject contig_split_group_by_result_from(JNIEnv* env, jobjectArray& groups);
 
 /**
  * @brief Construct a Java `ContigSplitGroupByResult` from contiguous tables.
@@ -90,8 +92,9 @@ jobject contig_split_group_by_result_from(JNIEnv *env, jobjectArray &groups);
  * @param groups the contiguous tables
  * @return a Java `ContigSplitGroupByResult`
  */
-jobject contig_split_group_by_result_from(JNIEnv *env, jobjectArray &groups,
-                                          jlongArray &uniq_key_columns);
+jobject contig_split_group_by_result_from(JNIEnv* env,
+                                          jobjectArray& groups,
+                                          jlongArray& uniq_key_columns);
 
 //
 // HostMemoryBuffer APIs
@@ -100,22 +103,24 @@ jobject contig_split_group_by_result_from(JNIEnv *env, jobjectArray &groups,
 /**
  * Allocate a HostMemoryBuffer
  */
-jobject allocate_host_buffer(JNIEnv *env, jlong amount, jboolean prefer_pinned,
+jobject allocate_host_buffer(JNIEnv* env,
+                             jlong amount,
+                             jboolean prefer_pinned,
                              jobject host_memory_allocator);
 
 /**
  * Get the address of a HostMemoryBuffer
  */
-jlong get_host_buffer_address(JNIEnv *env, jobject buffer);
+jlong get_host_buffer_address(JNIEnv* env, jobject buffer);
 
 /**
  * Get the length of a HostMemoryBuffer
  */
-jlong get_host_buffer_length(JNIEnv *env, jobject buffer);
+jlong get_host_buffer_length(JNIEnv* env, jobject buffer);
 
 // Get the JNI environment, attaching the current thread to the JVM if necessary. If the thread
 // needs to be attached, the thread will automatically detach when the thread terminates.
-JNIEnv *get_jni_env(JavaVM *jvm);
+JNIEnv* get_jni_env(JavaVM* jvm);
 
 /** Set the device to use for cudf */
 void set_cudf_device(int device);
@@ -125,22 +130,22 @@ void set_cudf_device(int device);
  * set the device, throw an exception, or do nothing depending on how the application has
  * configured it via Cuda.setAutoSetDeviceMode.
  */
-void auto_set_device(JNIEnv *env);
+void auto_set_device(JNIEnv* env);
 
 /**
  * Fills all the bytes in the buffer 'buf' with 'value'.
  * The operation has not necessarily completed when this returns, but it could overlap with
  * operations occurring on other streams.
  */
-void device_memset_async(JNIEnv *env, rmm::device_buffer &buf, char value);
+void device_memset_async(JNIEnv* env, rmm::device_buffer& buf, char value);
 
 //
 // DataSource APIs
 //
 
-bool cache_data_source_jni(JNIEnv *env);
+bool cache_data_source_jni(JNIEnv* env);
 
-void release_data_source_jni(JNIEnv *env);
+void release_data_source_jni(JNIEnv* env);
 
-} // namespace jni
-} // namespace cudf
+}  // namespace jni
+}  // namespace cudf
diff --git a/java/src/main/native/src/dtype_utils.hpp b/java/src/main/native/src/dtype_utils.hpp
index 4de8a94182c..90408782dd0 100644
--- a/java/src/main/native/src/dtype_utils.hpp
+++ b/java/src/main/native/src/dtype_utils.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,15 +15,16 @@
  */
 #pragma once
 
-#include <jni.h>
-
 #include <cudf/types.hpp>
 
+#include <jni.h>
+
 namespace cudf {
 namespace jni {
 
 // convert a timestamp type to the corresponding duration type
-inline cudf::data_type timestamp_to_duration(cudf::data_type dt) {
+inline cudf::data_type timestamp_to_duration(cudf::data_type dt)
+{
   cudf::type_id duration_type_id;
   switch (dt.id()) {
     case cudf::type_id::TIMESTAMP_DAYS: duration_type_id = cudf::type_id::DURATION_DAYS; break;
@@ -44,13 +45,15 @@ inline cudf::data_type timestamp_to_duration(cudf::data_type dt) {
   return cudf::data_type(duration_type_id);
 }
 
-inline bool is_decimal_type(cudf::type_id n_type) {
+inline bool is_decimal_type(cudf::type_id n_type)
+{
   return n_type == cudf::type_id::DECIMAL32 || n_type == cudf::type_id::DECIMAL64 ||
          n_type == cudf::type_id::DECIMAL128;
 }
 
 // create data_type including scale for decimal type
-inline cudf::data_type make_data_type(jint out_dtype, jint scale) {
+inline cudf::data_type make_data_type(jint out_dtype, jint scale)
+{
   cudf::type_id n_type = static_cast<cudf::type_id>(out_dtype);
   cudf::data_type n_data_type;
   if (is_decimal_type(n_type)) {
@@ -61,5 +64,5 @@ inline cudf::data_type make_data_type(jint out_dtype, jint scale) {
   return n_data_type;
 }
 
-} // namespace jni
-} // namespace cudf
+}  // namespace jni
+}  // namespace cudf
diff --git a/java/src/main/native/src/jni_compiled_expr.hpp b/java/src/main/native/src/jni_compiled_expr.hpp
index 74010f71011..dad2c33b731 100644
--- a/java/src/main/native/src/jni_compiled_expr.hpp
+++ b/java/src/main/native/src/jni_compiled_expr.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,6 +16,11 @@
 
 #pragma once
 
+#include <cudf/ast/expressions.hpp>
+#include <cudf/scalar/scalar.hpp>
+
+#include <memory>
+#include <utility>
 #include <vector>
 
 namespace cudf {
@@ -38,29 +43,31 @@ class compiled_expr {
   /** GPU scalar instances that correspond to literal nodes */
   std::vector<std::unique_ptr<cudf::scalar>> scalars;
 
-public:
-  cudf::ast::literal &add_literal(std::unique_ptr<cudf::ast::literal> literal_ptr,
-                                  std::unique_ptr<cudf::scalar> scalar_ptr) {
+ public:
+  cudf::ast::literal& add_literal(std::unique_ptr<cudf::ast::literal> literal_ptr,
+                                  std::unique_ptr<cudf::scalar> scalar_ptr)
+  {
     expressions.push_back(std::move(literal_ptr));
     scalars.push_back(std::move(scalar_ptr));
-    return static_cast<cudf::ast::literal &>(*expressions.back());
+    return static_cast<cudf::ast::literal&>(*expressions.back());
   }
 
-  cudf::ast::column_reference &
-  add_column_ref(std::unique_ptr<cudf::ast::column_reference> ref_ptr) {
+  cudf::ast::column_reference& add_column_ref(std::unique_ptr<cudf::ast::column_reference> ref_ptr)
+  {
     expressions.push_back(std::move(ref_ptr));
-    return static_cast<cudf::ast::column_reference &>(*expressions.back());
+    return static_cast<cudf::ast::column_reference&>(*expressions.back());
   }
 
-  cudf::ast::operation &add_operation(std::unique_ptr<cudf::ast::operation> expr_ptr) {
+  cudf::ast::operation& add_operation(std::unique_ptr<cudf::ast::operation> expr_ptr)
+  {
     expressions.push_back(std::move(expr_ptr));
-    return static_cast<cudf::ast::operation &>(*expressions.back());
+    return static_cast<cudf::ast::operation&>(*expressions.back());
   }
 
   /** Return the expression node at the top of the tree */
-  cudf::ast::expression &get_top_expression() const { return *expressions.back(); }
+  cudf::ast::expression& get_top_expression() const { return *expressions.back(); }
 };
 
-} // namespace ast
-} // namespace jni
-} // namespace cudf
+}  // namespace ast
+}  // namespace jni
+}  // namespace cudf
diff --git a/java/src/main/native/src/jni_writer_data_sink.hpp b/java/src/main/native/src/jni_writer_data_sink.hpp
index efac6112c25..52756266beb 100644
--- a/java/src/main/native/src/jni_writer_data_sink.hpp
+++ b/java/src/main/native/src/jni_writer_data_sink.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,56 +15,53 @@
  */
 #pragma once
 
-#include <cudf/io/data_sink.hpp>
-
 #include "cudf_jni_apis.hpp"
 #include "jni_utils.hpp"
 
+#include <cudf/io/data_sink.hpp>
+
 namespace cudf::jni {
 
-constexpr long MINIMUM_WRITE_BUFFER_SIZE = 10 * 1024 * 1024; // 10 MB
+constexpr long MINIMUM_WRITE_BUFFER_SIZE = 10 * 1024 * 1024;  // 10 MB
 
 class jni_writer_data_sink final : public cudf::io::data_sink {
-public:
-  explicit jni_writer_data_sink(JNIEnv *env, jobject callback, jobject host_memory_allocator) {
-    if (env->GetJavaVM(&jvm) < 0) {
-      throw std::runtime_error("GetJavaVM failed");
-    }
+ public:
+  explicit jni_writer_data_sink(JNIEnv* env, jobject callback, jobject host_memory_allocator)
+  {
+    if (env->GetJavaVM(&jvm) < 0) { throw std::runtime_error("GetJavaVM failed"); }
 
     jclass cls = env->GetObjectClass(callback);
-    if (cls == nullptr) {
-      throw cudf::jni::jni_exception("class not found");
-    }
+    if (cls == nullptr) { throw cudf::jni::jni_exception("class not found"); }
 
     handle_buffer_method =
-        env->GetMethodID(cls, "handleBuffer", "(Lai/rapids/cudf/HostMemoryBuffer;J)V");
-    if (handle_buffer_method == nullptr) {
-      throw cudf::jni::jni_exception("handleBuffer method");
-    }
+      env->GetMethodID(cls, "handleBuffer", "(Lai/rapids/cudf/HostMemoryBuffer;J)V");
+    if (handle_buffer_method == nullptr) { throw cudf::jni::jni_exception("handleBuffer method"); }
 
-    this->callback = add_global_ref(env, callback);
+    this->callback              = add_global_ref(env, callback);
     this->host_memory_allocator = add_global_ref(env, host_memory_allocator);
   }
 
-  virtual ~jni_writer_data_sink() {
+  virtual ~jni_writer_data_sink()
+  {
     // This should normally be called by a JVM thread. If the JVM environment is missing then this
     // is likely being triggered by the C++ runtime during shutdown. In that case the JVM may
     // already be destroyed and this thread should not try to attach to get an environment.
-    JNIEnv *env = nullptr;
-    if (jvm->GetEnv(reinterpret_cast<void **>(&env), cudf::jni::MINIMUM_JNI_VERSION) == JNI_OK) {
-      callback = del_global_ref(env, callback);
-      current_buffer = del_global_ref(env, current_buffer);
+    JNIEnv* env = nullptr;
+    if (jvm->GetEnv(reinterpret_cast<void**>(&env), cudf::jni::MINIMUM_JNI_VERSION) == JNI_OK) {
+      callback              = del_global_ref(env, callback);
+      current_buffer        = del_global_ref(env, current_buffer);
       host_memory_allocator = del_global_ref(env, host_memory_allocator);
     }
-    callback = nullptr;
-    current_buffer = nullptr;
+    callback              = nullptr;
+    current_buffer        = nullptr;
     host_memory_allocator = nullptr;
   }
 
-  void host_write(void const *data, size_t size) override {
-    JNIEnv *env = cudf::jni::get_jni_env(jvm);
-    long left_to_copy = static_cast<long>(size);
-    const char *copy_from = static_cast<const char *>(data);
+  void host_write(void const* data, size_t size) override
+  {
+    JNIEnv* env           = cudf::jni::get_jni_env(jvm);
+    long left_to_copy     = static_cast<long>(size);
+    const char* copy_from = static_cast<const char*>(data);
     while (left_to_copy > 0) {
       long buffer_amount_available = current_buffer_len - current_buffer_written;
       if (buffer_amount_available <= 0) {
@@ -73,8 +70,8 @@ class jni_writer_data_sink final : public cudf::io::data_sink {
         buffer_amount_available = current_buffer_len - current_buffer_written;
       }
       long amount_to_copy =
-          left_to_copy < buffer_amount_available ? left_to_copy : buffer_amount_available;
-      char *copy_to = current_buffer_data + current_buffer_written;
+        left_to_copy < buffer_amount_available ? left_to_copy : buffer_amount_available;
+      char* copy_to = current_buffer_data + current_buffer_written;
 
       std::memcpy(copy_to, copy_from, amount_to_copy);
       copy_from = copy_from + amount_to_copy;
@@ -86,10 +83,11 @@ class jni_writer_data_sink final : public cudf::io::data_sink {
 
   bool supports_device_write() const override { return true; }
 
-  void device_write(void const *gpu_data, size_t size, rmm::cuda_stream_view stream) override {
-    JNIEnv *env = cudf::jni::get_jni_env(jvm);
-    long left_to_copy = static_cast<long>(size);
-    const char *copy_from = static_cast<const char *>(gpu_data);
+  void device_write(void const* gpu_data, size_t size, rmm::cuda_stream_view stream) override
+  {
+    JNIEnv* env           = cudf::jni::get_jni_env(jvm);
+    long left_to_copy     = static_cast<long>(size);
+    const char* copy_from = static_cast<const char*>(gpu_data);
     while (left_to_copy > 0) {
       long buffer_amount_available = current_buffer_len - current_buffer_written;
       if (buffer_amount_available <= 0) {
@@ -99,11 +97,11 @@ class jni_writer_data_sink final : public cudf::io::data_sink {
         buffer_amount_available = current_buffer_len - current_buffer_written;
       }
       long amount_to_copy =
-          left_to_copy < buffer_amount_available ? left_to_copy : buffer_amount_available;
-      char *copy_to = current_buffer_data + current_buffer_written;
+        left_to_copy < buffer_amount_available ? left_to_copy : buffer_amount_available;
+      char* copy_to = current_buffer_data + current_buffer_written;
 
-      CUDF_CUDA_TRY(cudaMemcpyAsync(copy_to, copy_from, amount_to_copy, cudaMemcpyDeviceToHost,
-                                    stream.value()));
+      CUDF_CUDA_TRY(cudaMemcpyAsync(
+        copy_to, copy_from, amount_to_copy, cudaMemcpyDeviceToHost, stream.value()));
 
       copy_from = copy_from + amount_to_copy;
       current_buffer_written += amount_to_copy;
@@ -113,20 +111,23 @@ class jni_writer_data_sink final : public cudf::io::data_sink {
     stream.synchronize();
   }
 
-  std::future<void> device_write_async(void const *gpu_data, size_t size,
-                                       rmm::cuda_stream_view stream) override {
+  std::future<void> device_write_async(void const* gpu_data,
+                                       size_t size,
+                                       rmm::cuda_stream_view stream) override
+  {
     // Call the sync version until figuring out how to write asynchronously.
     device_write(gpu_data, size, stream);
     return std::async(std::launch::deferred, [] {});
   }
 
-  void flush() override {
+  void flush() override
+  {
     if (current_buffer_written > 0) {
-      JNIEnv *env = cudf::jni::get_jni_env(jvm);
+      JNIEnv* env = cudf::jni::get_jni_env(jvm);
       handle_buffer(env, current_buffer, current_buffer_written);
-      current_buffer = del_global_ref(env, current_buffer);
-      current_buffer_len = 0;
-      current_buffer_data = nullptr;
+      current_buffer         = del_global_ref(env, current_buffer);
+      current_buffer_len     = 0;
+      current_buffer_data    = nullptr;
       current_buffer_written = 0;
     }
   }
@@ -135,36 +136,34 @@ class jni_writer_data_sink final : public cudf::io::data_sink {
 
   void set_alloc_size(long size) { this->alloc_size = size; }
 
-private:
-  void rotate_buffer(JNIEnv *env) {
-    if (current_buffer != nullptr) {
-      handle_buffer(env, current_buffer, current_buffer_written);
-    }
-    current_buffer = del_global_ref(env, current_buffer);
-    jobject tmp_buffer = allocate_host_buffer(env, alloc_size, true, host_memory_allocator);
-    current_buffer = add_global_ref(env, tmp_buffer);
-    current_buffer_len = get_host_buffer_length(env, current_buffer);
-    current_buffer_data = reinterpret_cast<char *>(get_host_buffer_address(env, current_buffer));
+ private:
+  void rotate_buffer(JNIEnv* env)
+  {
+    if (current_buffer != nullptr) { handle_buffer(env, current_buffer, current_buffer_written); }
+    current_buffer         = del_global_ref(env, current_buffer);
+    jobject tmp_buffer     = allocate_host_buffer(env, alloc_size, true, host_memory_allocator);
+    current_buffer         = add_global_ref(env, tmp_buffer);
+    current_buffer_len     = get_host_buffer_length(env, current_buffer);
+    current_buffer_data    = reinterpret_cast<char*>(get_host_buffer_address(env, current_buffer));
     current_buffer_written = 0;
   }
 
-  void handle_buffer(JNIEnv *env, jobject buffer, jlong len) {
+  void handle_buffer(JNIEnv* env, jobject buffer, jlong len)
+  {
     env->CallVoidMethod(callback, handle_buffer_method, buffer, len);
-    if (env->ExceptionCheck()) {
-      throw std::runtime_error("handleBuffer threw an exception");
-    }
+    if (env->ExceptionCheck()) { throw std::runtime_error("handleBuffer threw an exception"); }
   }
 
-  JavaVM *jvm;
+  JavaVM* jvm;
   jobject callback;
   jmethodID handle_buffer_method;
-  jobject current_buffer = nullptr;
-  char *current_buffer_data = nullptr;
-  long current_buffer_len = 0;
+  jobject current_buffer      = nullptr;
+  char* current_buffer_data   = nullptr;
+  long current_buffer_len     = 0;
   long current_buffer_written = 0;
-  size_t total_written = 0;
-  long alloc_size = MINIMUM_WRITE_BUFFER_SIZE;
+  size_t total_written        = 0;
+  long alloc_size             = MINIMUM_WRITE_BUFFER_SIZE;
   jobject host_memory_allocator;
 };
 
-} // namespace cudf::jni
+}  // namespace cudf::jni
diff --git a/java/src/main/native/src/maps_column_view.cu b/java/src/main/native/src/maps_column_view.cu
index d5600e48a5c..d3ee52c074c 100644
--- a/java/src/main/native/src/maps_column_view.cu
+++ b/java/src/main/native/src/maps_column_view.cu
@@ -18,14 +18,17 @@
 #include <cudf/lists/detail/contains.hpp>
 #include <cudf/lists/detail/extract.hpp>
 #include <cudf/scalar/scalar.hpp>
-#include <maps_column_view.hpp>
+
 #include <rmm/exec_policy.hpp>
 #include <rmm/resource_ref.hpp>
 
+#include <maps_column_view.hpp>
+
 namespace cudf::jni {
 
 namespace {
-column_view make_lists(column_view const &lists_child, lists_column_view const &lists_of_structs) {
+column_view make_lists(column_view const& lists_child, lists_column_view const& lists_of_structs)
+{
   return column_view{data_type{type_id::LIST},
                      lists_of_structs.size(),
                      nullptr,
@@ -34,12 +37,13 @@ column_view make_lists(column_view const &lists_child, lists_column_view const &
                      lists_of_structs.offset(),
                      {lists_of_structs.offsets(), lists_child}};
 }
-} // namespace
+}  // namespace
 
-maps_column_view::maps_column_view(lists_column_view const &lists_of_structs,
+maps_column_view::maps_column_view(lists_column_view const& lists_of_structs,
                                    rmm::cuda_stream_view stream)
-    : keys_{make_lists(lists_of_structs.child().child(0), lists_of_structs)},
-      values_{make_lists(lists_of_structs.child().child(1), lists_of_structs)} {
+  : keys_{make_lists(lists_of_structs.child().child(0), lists_of_structs)},
+    values_{make_lists(lists_of_structs.child().child(1), lists_of_structs)}
+{
   auto const structs = lists_of_structs.child();
   CUDF_EXPECTS(structs.type().id() == type_id::STRUCT,
                "maps_column_view input must have exactly 1 child (STRUCT) column.");
@@ -48,66 +52,78 @@ maps_column_view::maps_column_view(lists_column_view const &lists_of_structs,
 }
 
 template <typename KeyT>
-std::unique_ptr<column> get_values_for_impl(maps_column_view const &maps_view,
-                                            KeyT const &lookup_keys, rmm::cuda_stream_view stream,
-                                            rmm::device_async_resource_ref mr) {
-  auto const keys_ = maps_view.keys();
+std::unique_ptr<column> get_values_for_impl(maps_column_view const& maps_view,
+                                            KeyT const& lookup_keys,
+                                            rmm::cuda_stream_view stream,
+                                            rmm::device_async_resource_ref mr)
+{
+  auto const keys_   = maps_view.keys();
   auto const values_ = maps_view.values();
   CUDF_EXPECTS(lookup_keys.type().id() == keys_.child().type().id(),
                "Lookup keys must have the same type as the keys of the map column.");
-  auto key_indices =
-      lists::detail::index_of(keys_, lookup_keys, lists::duplicate_find_option::FIND_LAST, stream,
-                              rmm::mr::get_current_device_resource());
-  auto constexpr absent_offset = size_type{-1};
+  auto key_indices              = lists::detail::index_of(keys_,
+                                             lookup_keys,
+                                             lists::duplicate_find_option::FIND_LAST,
+                                             stream,
+                                             rmm::mr::get_current_device_resource());
+  auto constexpr absent_offset  = size_type{-1};
   auto constexpr nullity_offset = std::numeric_limits<size_type>::min();
-  thrust::replace(rmm::exec_policy(stream), key_indices->mutable_view().template begin<size_type>(),
-                  key_indices->mutable_view().template end<size_type>(), absent_offset,
+  thrust::replace(rmm::exec_policy(stream),
+                  key_indices->mutable_view().template begin<size_type>(),
+                  key_indices->mutable_view().template end<size_type>(),
+                  absent_offset,
                   nullity_offset);
   return lists::detail::extract_list_element(values_, key_indices->view(), stream, mr);
 }
 
-std::unique_ptr<column> maps_column_view::get_values_for(column_view const &lookup_keys,
+std::unique_ptr<column> maps_column_view::get_values_for(column_view const& lookup_keys,
                                                          rmm::cuda_stream_view stream,
-                                                         rmm::device_async_resource_ref mr) const {
+                                                         rmm::device_async_resource_ref mr) const
+{
   CUDF_EXPECTS(lookup_keys.size() == size(),
                "Lookup keys must have the same size as the map column.");
 
   return get_values_for_impl(*this, lookup_keys, stream, mr);
 }
 
-std::unique_ptr<column> maps_column_view::get_values_for(scalar const &lookup_key,
+std::unique_ptr<column> maps_column_view::get_values_for(scalar const& lookup_key,
                                                          rmm::cuda_stream_view stream,
-                                                         rmm::device_async_resource_ref mr) const {
+                                                         rmm::device_async_resource_ref mr) const
+{
   return get_values_for_impl(*this, lookup_key, stream, mr);
 }
 
 template <typename KeyT>
-std::unique_ptr<column> contains_impl(maps_column_view const &maps_view, KeyT const &lookup_keys,
+std::unique_ptr<column> contains_impl(maps_column_view const& maps_view,
+                                      KeyT const& lookup_keys,
                                       rmm::cuda_stream_view stream,
-                                      rmm::device_async_resource_ref mr) {
+                                      rmm::device_async_resource_ref mr)
+{
   auto const keys = maps_view.keys();
   CUDF_EXPECTS(lookup_keys.type().id() == keys.child().type().id(),
                "Lookup keys must have the same type as the keys of the map column.");
   auto const contains =
-      lists::detail::contains(keys, lookup_keys, stream, rmm::mr::get_current_device_resource());
+    lists::detail::contains(keys, lookup_keys, stream, rmm::mr::get_current_device_resource());
   // Replace nulls with BOOL8{false};
   auto const scalar_false = numeric_scalar<bool>{false, true, stream};
   return detail::replace_nulls(contains->view(), scalar_false, stream, mr);
 }
 
-std::unique_ptr<column> maps_column_view::contains(column_view const &lookup_keys,
+std::unique_ptr<column> maps_column_view::contains(column_view const& lookup_keys,
                                                    rmm::cuda_stream_view stream,
-                                                   rmm::device_async_resource_ref mr) const {
+                                                   rmm::device_async_resource_ref mr) const
+{
   CUDF_EXPECTS(lookup_keys.size() == size(),
                "Lookup keys must have the same size as the map column.");
 
   return contains_impl(*this, lookup_keys, stream, mr);
 }
 
-std::unique_ptr<column> maps_column_view::contains(scalar const &lookup_key,
+std::unique_ptr<column> maps_column_view::contains(scalar const& lookup_key,
                                                    rmm::cuda_stream_view stream,
-                                                   rmm::device_async_resource_ref mr) const {
+                                                   rmm::device_async_resource_ref mr) const
+{
   return contains_impl(*this, lookup_key, stream, mr);
 }
 
-} // namespace cudf::jni
+}  // namespace cudf::jni
diff --git a/java/src/main/native/src/nvtx_common.hpp b/java/src/main/native/src/nvtx_common.hpp
index 8b5b04f3370..69bcdfb8521 100644
--- a/java/src/main/native/src/nvtx_common.hpp
+++ b/java/src/main/native/src/nvtx_common.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,8 +20,8 @@ namespace cudf {
 namespace jni {
 
 struct java_domain {
-  static constexpr char const *name{"Java"};
+  static constexpr char const* name{"Java"};
 };
 
-} // namespace jni
-} // namespace cudf
+}  // namespace jni
+}  // namespace cudf

From 9dac831cb51c90e6d30d6b6c6366b8afd01047aa Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 18 Apr 2024 05:55:25 -1000
Subject: [PATCH 078/842] Clean up __cuda_array_interface__ handling in
 as_column (#15477)

Removes some unnecessary type cast checking and NaT handling as cupy does not support datelike types https://github.com/cupy/cupy/issues/2622

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15477
---
 python/cudf/cudf/core/column/column.py | 98 +++++++-------------------
 1 file changed, 24 insertions(+), 74 deletions(-)

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index b5890f7aad4..7e48552742c 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -1679,27 +1679,6 @@ def build_categorical_column(
     return cast("cudf.core.column.CategoricalColumn", result)
 
 
-def _make_copy_replacing_NaT_with_null(column):
-    """Return a copy with NaT values replaced with nulls."""
-    if np.issubdtype(column.dtype, np.timedelta64):
-        na_value = np.timedelta64("NaT", column.time_unit)
-    elif np.issubdtype(column.dtype, np.datetime64):
-        na_value = np.datetime64("NaT", column.time_unit)
-    else:
-        raise ValueError("This type does not support replacing NaT with null.")
-
-    null = column_empty_like(column, masked=True, newsize=1)
-    out_col = cudf._lib.replace.replace(
-        column,
-        build_column(
-            as_buffer(np.array([na_value], dtype=column.dtype).view("|u1")),
-            dtype=column.dtype,
-        ),
-        null,
-    )
-    return out_col
-
-
 def check_invalid_array(shape: tuple, dtype):
     """Invalid ndarrays properties that are not supported"""
     if len(shape) > 1:
@@ -1782,50 +1761,30 @@ def as_column(
         return arbitrary
     elif hasattr(arbitrary, "__cuda_array_interface__"):
         desc = arbitrary.__cuda_array_interface__
-        shape = desc["shape"]
-        current_dtype = np.dtype(desc["typestr"])
-
-        check_invalid_array(shape, current_dtype)
-
-        arb_dtype = cudf.dtype(current_dtype)
+        check_invalid_array(desc["shape"], np.dtype(desc["typestr"]))
 
         if desc.get("mask", None) is not None:
             # Extract and remove the mask from arbitrary before
             # passing to cupy.asarray
-            mask = _mask_from_cuda_array_interface_desc(arbitrary)
-            arbitrary = SimpleNamespace(__cuda_array_interface__=desc.copy())
-            arbitrary.__cuda_array_interface__["mask"] = None
-            desc = arbitrary.__cuda_array_interface__
+            cai_copy = desc.copy()
+            mask = _mask_from_cuda_array_interface_desc(
+                arbitrary, cai_copy.pop("mask")
+            )
+            arbitrary = SimpleNamespace(__cuda_array_interface__=cai_copy)
         else:
             mask = None
 
         arbitrary = cupy.asarray(arbitrary)
-
-        if arb_dtype != current_dtype:
-            arbitrary = arbitrary.astype(arb_dtype)
-            current_dtype = arb_dtype
-
-        if (
-            desc["strides"] is not None
-            and not (arbitrary.itemsize,) == arbitrary.strides
-        ):
-            arbitrary = cupy.ascontiguousarray(arbitrary)
+        arbitrary = cupy.ascontiguousarray(arbitrary)
 
         data = as_buffer(arbitrary, exposed=cudf.get_option("copy_on_write"))
-        col = build_column(data, dtype=current_dtype, mask=mask)
-
+        col = build_column(data, dtype=arbitrary.dtype, mask=mask)
+        if (
+            nan_as_null or (mask is None and nan_as_null is None)
+        ) and col.dtype.kind == "f":
+            col = col.nans_to_nulls()
         if dtype is not None:
             col = col.astype(dtype)
-
-        if isinstance(col, cudf.core.column.CategoricalColumn):
-            return col
-        elif np.issubdtype(col.dtype, np.floating):
-            if nan_as_null or (mask is None and nan_as_null is None):
-                mask = libcudf.transform.nans_to_nulls(col.fillna(np.nan))
-                col = col.set_mask(mask)
-        elif np.issubdtype(col.dtype, np.datetime64):
-            if nan_as_null or (mask is None and nan_as_null is None):
-                col = _make_copy_replacing_NaT_with_null(col)
         return col
 
     elif isinstance(arbitrary, (pa.Array, pa.ChunkedArray)):
@@ -2222,27 +2181,18 @@ def as_column(
         return as_column(arbitrary, nan_as_null=nan_as_null, dtype=dtype)
 
 
-def _mask_from_cuda_array_interface_desc(obj) -> Union[Buffer, None]:
-    desc = obj.__cuda_array_interface__
-    mask = desc.get("mask", None)
-
-    if mask is not None:
-        desc = mask.__cuda_array_interface__
-        ptr = desc["data"][0]
-        nelem = desc["shape"][0]
-        typestr = desc["typestr"]
-        typecode = typestr[1]
-        if typecode == "t":
-            mask_size = bitmask_allocation_size_bytes(nelem)
-            mask = as_buffer(data=ptr, size=mask_size, owner=obj)
-        elif typecode == "b":
-            col = as_column(mask)
-            mask = bools_to_mask(col)
-        else:
-            raise NotImplementedError(
-                f"Cannot infer mask from typestr {typestr}"
-            )
-    return mask
+def _mask_from_cuda_array_interface_desc(obj, cai_mask) -> Buffer:
+    desc = cai_mask.__cuda_array_interface__
+    typestr = desc["typestr"]
+    typecode = typestr[1]
+    if typecode == "t":
+        mask_size = bitmask_allocation_size_bytes(desc["shape"][0])
+        return as_buffer(data=desc["data"][0], size=mask_size, owner=obj)
+    elif typecode == "b":
+        col = as_column(cai_mask)
+        return bools_to_mask(col)
+    else:
+        raise NotImplementedError(f"Cannot infer mask from typestr {typestr}")
 
 
 def serialize_columns(columns) -> Tuple[List[dict], List]:

From a9350669b607810a66f5ecc2133703c2a8e18c7c Mon Sep 17 00:00:00 2001
From: Ray Bell <rayjohnbell0@gmail.com>
Date: Thu, 18 Apr 2024 12:37:01 -0400
Subject: [PATCH 079/842] add correct labels to pandas_function_request.md
 (#15381)

The should correct the labels added on a the "Request a Missing Pandas Function" template.

"? - Needs Triage" -> "Needs Triage" (https://github.com/rapidsai/cudf/issues?q=is%3Aopen+is%3Aissue+label%3A%22Needs+Triage%22)

and adds the "pandas" label (https://github.com/rapidsai/cudf/issues?q=is%3Aopen+is%3Aissue+label%3Apandas)

Authors:
  - Ray Bell (https://github.com/raybellwaves)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Jake Awe (https://github.com/AyodeAwe)

URL: https://github.com/rapidsai/cudf/pull/15381
---
 .github/ISSUE_TEMPLATE/pandas_function_request.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/ISSUE_TEMPLATE/pandas_function_request.md b/.github/ISSUE_TEMPLATE/pandas_function_request.md
index 1cecca72953..19f1377dfe7 100644
--- a/.github/ISSUE_TEMPLATE/pandas_function_request.md
+++ b/.github/ISSUE_TEMPLATE/pandas_function_request.md
@@ -2,7 +2,7 @@
 name: Request a Missing Pandas Function
 about: Request GPU support for a function executed on the CPU in pandas accelerator mode.
 title: "[FEA]"
-labels: "? - Needs Triage, feature request"
+labels: "Needs Triage, feature request, cudf.pandas"
 assignees: ''
 
 ---

From cb8e434e9f2abec93af5877af062688069e5d164 Mon Sep 17 00:00:00 2001
From: Ray Bell <rayjohnbell0@gmail.com>
Date: Thu, 18 Apr 2024 13:22:58 -0400
Subject: [PATCH 080/842] DOC: add pandas intersphinx mapping (#15531)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This PR add pandas to intersphinx mapping to make it easy to link to pandas docs from the RAPIDS docs.

There is likely other opportunities to use the pandas intersphinx mapping e.g. https://github.com/rapidsai/cudf/pull/15383 but I think they can be subsequent PRs.

I've tested this locally and confirm it works as expected (i.e. the note in the docstring at https://docs.rapids.ai/api/cudf/stable/user_guide/api_docs/api/cudf.dataframe.query/#cudf.DataFrame.query is now hyperlinked to https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.query.html

![Screenshot 2024-04-14 at 2 15 33 AM](https://github.com/rapidsai/cudf/assets/17162724/193076e2-202e-4e74-9305-be1dbcdfa82b)

Apologies about the other linting. I can revert if need be

Authors:
  - Ray Bell (https://github.com/raybellwaves)
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15531
---
 docs/cudf/source/conf.py           | 24 ++++++++++++++++--------
 python/cudf/cudf/core/dataframe.py |  2 +-
 2 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py
index b891ff99d47..bcefa3fbdf8 100644
--- a/docs/cudf/source/conf.py
+++ b/docs/cudf/source/conf.py
@@ -142,8 +142,6 @@ def clean_all_xml_files(path):
                 tree.write(fn)
 
 
-
-
 # Breathe Configuration
 breathe_projects = {"libcudf": "../../../cpp/doxygen/xml"}
 for project_path in breathe_projects.values():
@@ -187,7 +185,9 @@ def clean_all_xml_files(path):
 # The short X.Y version.
 version = f"{CUDF_VERSION.major:02}.{CUDF_VERSION.minor:02}"
 # The full version.
-release = f"{CUDF_VERSION.major:02}.{CUDF_VERSION.minor:02}.{CUDF_VERSION.micro:02}"
+release = (
+    f"{CUDF_VERSION.major:02}.{CUDF_VERSION.minor:02}.{CUDF_VERSION.micro:02}"
+)
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
@@ -308,7 +308,10 @@ def clean_all_xml_files(path):
     "dlpack": ("https://dmlc.github.io/dlpack/latest/", None),
     "nanoarrow": ("https://arrow.apache.org/nanoarrow/latest", None),
     "numpy": ("https://numpy.org/doc/stable", None),
-    "pandas": ("https://pandas.pydata.org/docs/", None),
+    "pandas": (
+        "https://pandas.pydata.org/pandas-docs/stable/",
+        None,
+    ),
     "pyarrow": ("https://arrow.apache.org/docs/", None),
     "python": ("https://docs.python.org/3", None),
     "rmm": ("https://docs.rapids.ai/api/rmm/nightly/", None),
@@ -380,7 +383,7 @@ def _generate_namespaces(namespaces):
     "type_id",
     # Unknown base types
     "int32_t",
-    "void"
+    "void",
 }
 
 
@@ -448,9 +451,14 @@ def _cached_intersphinx_lookup(env, node, contnode):
 
 def on_missing_reference(app, env, node, contnode):
     # These variables are defined outside the function to speed up the build.
-    global _all_namespaces, _names_to_skip_in_cpp, \
-        _names_to_skip_in_pylibcudf, _intersphinx_extra_prefixes, \
-        _domain_objects, _prefixed_domain_objects, _intersphinx_cache
+    global \
+        _all_namespaces, \
+        _names_to_skip_in_cpp, \
+        _names_to_skip_in_pylibcudf, \
+        _intersphinx_extra_prefixes, \
+        _domain_objects, \
+        _prefixed_domain_objects, \
+        _intersphinx_cache
 
     # Precompute and cache domains for faster lookups
     if _domain_objects is None:
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 2a4f93c1716..99e4588d608 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -4320,7 +4320,7 @@ def query(self, expr, local_dict=None):
         """
         Query with a boolean expression using Numba to compile a GPU kernel.
 
-        See pandas.DataFrame.query.
+        See :meth:`pandas.DataFrame.query`.
 
         Parameters
         ----------

From b8d003e9e992cd0621368a698f76336ad87f7180 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Thu, 18 Apr 2024 12:48:05 -0700
Subject: [PATCH 081/842] Fix CMake files in libcudf C++ examples to use
 existing libcudf build if present (#15348)

This PR fixes the CMake artifacts for libcudf examples and includes CI updates to create executable `libcudf-example` conda package to run from CI

Authors:
  - Muhammad Haseeb (https://github.com/mhaseeb123)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Robert Maynard (https://github.com/robertmaynard)
  - Jake Awe (https://github.com/AyodeAwe)

URL: https://github.com/rapidsai/cudf/pull/15348
---
 .gitignore                                    |  1 +
 ci/release/update-version.sh                  |  2 +-
 ci/run_cudf_examples.sh                       | 20 +++++++++++++
 ci/test_cpp.sh                                |  6 ++++
 ci/test_cpp_common.sh                         |  2 +-
 .../libcudf/install_libcudf_example.sh        |  5 ++--
 conda/recipes/libcudf/meta.yaml               |  2 +-
 cpp/examples/basic/CMakeLists.txt             | 11 ++++++-
 cpp/examples/build.sh                         | 29 +++++++++++++++++++
 cpp/examples/fetch_dependencies.cmake         | 11 +++++--
 cpp/examples/nested_types/CMakeLists.txt      | 11 ++++++-
 cpp/examples/set_cuda_architecture.cmake      | 28 ++++++++++++++++++
 cpp/examples/strings/CMakeLists.txt           | 15 ++++++++--
 cpp/examples/versions.cmake                   | 15 ++++++++++
 14 files changed, 146 insertions(+), 12 deletions(-)
 create mode 100755 ci/run_cudf_examples.sh
 mode change 100644 => 100755 ci/test_cpp_common.sh
 create mode 100644 cpp/examples/set_cuda_architecture.cmake
 create mode 100644 cpp/examples/versions.cmake

diff --git a/.gitignore b/.gitignore
index 471d4100458..313bb1c3789 100644
--- a/.gitignore
+++ b/.gitignore
@@ -78,6 +78,7 @@ CMakeFiles/
 Debug
 build/
 cpp/build/
+cpp/examples/*/install/
 cpp/include/cudf/ipc_generated/*.h
 cpp/thirdparty/googletest/
 
diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index 7cacdfd39c3..99f9c698217 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -70,7 +70,7 @@ sed_runner "s/version == ${CURRENT_SHORT_TAG}/version == ${NEXT_SHORT_TAG}/g" RE
 sed_runner "s/cudf=${CURRENT_SHORT_TAG}/cudf=${NEXT_SHORT_TAG}/g" README.md
 
 # Libcudf examples update
-sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_TAG}/" cpp/examples/fetch_dependencies.cmake
+sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_TAG}/" cpp/examples/versions.cmake
 
 # CI files
 for FILE in .github/workflows/*.yaml; do
diff --git a/ci/run_cudf_examples.sh b/ci/run_cudf_examples.sh
new file mode 100755
index 00000000000..71af6446748
--- /dev/null
+++ b/ci/run_cudf_examples.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+set -uo pipefail
+
+EXITCODE=0
+trap "EXITCODE=1" ERR
+
+# Support customizing the examples' install location
+cd "${INSTALL_PREFIX:-${CONDA_PREFIX:-/usr}}/bin/examples/libcudf/";
+
+compute-sanitizer --tool memcheck basic_example
+
+compute-sanitizer --tool memcheck deduplication
+
+compute-sanitizer --tool memcheck custom_optimized names.csv
+compute-sanitizer --tool memcheck custom_prealloc names.csv
+compute-sanitizer --tool memcheck custom_with_malloc names.csv
+
+exit ${EXITCODE}
diff --git a/ci/test_cpp.sh b/ci/test_cpp.sh
index 995c8d7d71f..7865849bb74 100755
--- a/ci/test_cpp.sh
+++ b/ci/test_cpp.sh
@@ -17,6 +17,12 @@ rapids-logger "Run libcudf gtests"
 ./ci/run_cudf_ctests.sh -j20
 SUITEERROR=$?
 
+if (( ${SUITEERROR} == 0 )); then
+    rapids-logger "Run libcudf examples"
+    ./ci/run_cudf_examples.sh
+    SUITEERROR=$?
+fi
+
 if (( ${SUITEERROR} == 0 )); then
     rapids-logger "Run libcudf_kafka gtests"
     ./ci/run_cudf_kafka_ctests.sh -j20
diff --git a/ci/test_cpp_common.sh b/ci/test_cpp_common.sh
old mode 100644
new mode 100755
index e1b2a367187..da847137a2b
--- a/ci/test_cpp_common.sh
+++ b/ci/test_cpp_common.sh
@@ -31,7 +31,7 @@ rapids-print-env
 
 rapids-mamba-retry install \
   --channel "${CPP_CHANNEL}" \
-  libcudf libcudf_kafka libcudf-tests
+  libcudf libcudf_kafka libcudf-tests libcudf-example
 
 rapids-logger "Check GPU usage"
 nvidia-smi
diff --git a/conda/recipes/libcudf/install_libcudf_example.sh b/conda/recipes/libcudf/install_libcudf_example.sh
index e249688a03b..1a52dec99e3 100644
--- a/conda/recipes/libcudf/install_libcudf_example.sh
+++ b/conda/recipes/libcudf/install_libcudf_example.sh
@@ -1,4 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2018-2022, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
 
-./cpp/examples/build.sh
+# build and install libcudf examples
+./cpp/examples/build.sh --install
diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml
index 63eb83084dd..3af0b7885c3 100644
--- a/conda/recipes/libcudf/meta.yaml
+++ b/conda/recipes/libcudf/meta.yaml
@@ -195,7 +195,7 @@ outputs:
       license: Apache-2.0
       license_family: APACHE
       license_file: LICENSE
-      summary: libcudf_example library
+      summary: libcudf example executables
   - name: libcudf-tests
     version: {{ version }}
     script: install_libcudf_tests.sh
diff --git a/cpp/examples/basic/CMakeLists.txt b/cpp/examples/basic/CMakeLists.txt
index 759a43b5627..a3fe699667a 100644
--- a/cpp/examples/basic/CMakeLists.txt
+++ b/cpp/examples/basic/CMakeLists.txt
@@ -1,7 +1,13 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 cmake_minimum_required(VERSION 3.26.4)
 
+include(../set_cuda_architecture.cmake)
+
+# initialize cuda architecture
+rapids_cuda_init_architectures(basic_example)
+rapids_cuda_set_architectures(RAPIDS)
+
 project(
   basic_example
   VERSION 0.0.1
@@ -14,3 +20,6 @@ include(../fetch_dependencies.cmake)
 add_executable(basic_example src/process_csv.cpp)
 target_link_libraries(basic_example PRIVATE cudf::cudf)
 target_compile_features(basic_example PRIVATE cxx_std_17)
+
+install(TARGETS basic_example DESTINATION bin/examples/libcudf)
+install(FILES ${CMAKE_CURRENT_LIST_DIR}/4stock_5day.csv DESTINATION bin/examples/libcudf)
diff --git a/cpp/examples/build.sh b/cpp/examples/build.sh
index 424da35ad18..9802c876930 100755
--- a/cpp/examples/build.sh
+++ b/cpp/examples/build.sh
@@ -8,9 +8,34 @@ set -euo pipefail
 
 # Parallelism control
 PARALLEL_LEVEL=${PARALLEL_LEVEL:-4}
+# Installation disabled by default
+INSTALL_EXAMPLES=false
+
+# Check for -i or --install flags to enable installation
+ARGS=$(getopt -o i --long install -- "$@")
+eval set -- "$ARGS"
+while [ : ]; do
+  case "$1" in
+    -i | --install)
+        INSTALL_EXAMPLES=true
+        shift
+        ;;
+    --) shift;
+        break
+        ;;
+  esac
+done
 
 # Root of examples
 EXAMPLES_DIR=$(dirname "$(realpath "$0")")
+
+# Set up default libcudf build directory and install prefix if conda build
+if [ "${CONDA_BUILD:-"0"}" == "1" ]; then
+  LIB_BUILD_DIR="${LIB_BUILD_DIR:-${SRC_DIR/cpp/build}}"
+  INSTALL_PREFIX="${INSTALL_PREFIX:-${PREFIX}}"
+fi
+
+# libcudf build directory
 LIB_BUILD_DIR=${LIB_BUILD_DIR:-$(readlink -f "${EXAMPLES_DIR}/../build")}
 
 ################################################################################
@@ -25,6 +50,10 @@ build_example() {
   cmake -S ${example_dir} -B ${build_dir} -Dcudf_ROOT="${LIB_BUILD_DIR}"
   # Build
   cmake --build ${build_dir} -j${PARALLEL_LEVEL}
+  # Install if needed
+  if [ "$INSTALL_EXAMPLES" = true ]; then
+    cmake --install ${build_dir} --prefix ${INSTALL_PREFIX:-${example_dir}/install}
+  fi
 }
 
 build_example basic
diff --git a/cpp/examples/fetch_dependencies.cmake b/cpp/examples/fetch_dependencies.cmake
index e4c11bbdeca..851405caf55 100644
--- a/cpp/examples/fetch_dependencies.cmake
+++ b/cpp/examples/fetch_dependencies.cmake
@@ -11,7 +11,10 @@
 # or implied. See the License for the specific language governing permissions and limitations under
 # the License.
 # =============================================================================
-set(CPM_DOWNLOAD_VERSION v0.35.3)
+
+include(${CMAKE_CURRENT_LIST_DIR}/versions.cmake)
+
+set(CPM_DOWNLOAD_VERSION v0.38.5)
 file(
   DOWNLOAD
   https://github.com/cpm-cmake/CPM.cmake/releases/download/${CPM_DOWNLOAD_VERSION}/get_cpm.cmake
@@ -19,9 +22,11 @@ file(
 )
 include(${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake)
 
-set(CUDF_TAG branch-24.06)
+# find or build it via CPM
 CPMFindPackage(
-  NAME cudf GIT_REPOSITORY https://github.com/rapidsai/cudf
+  NAME cudf
+  FIND_PACKAGE_ARGUMENTS "PATHS ${cudf_ROOT} ${cudf_ROOT}/latest" GIT_REPOSITORY
+                         https://github.com/rapidsai/cudf
   GIT_TAG ${CUDF_TAG}
   GIT_SHALLOW
     TRUE
diff --git a/cpp/examples/nested_types/CMakeLists.txt b/cpp/examples/nested_types/CMakeLists.txt
index cb9430db237..8a900f6b5ae 100644
--- a/cpp/examples/nested_types/CMakeLists.txt
+++ b/cpp/examples/nested_types/CMakeLists.txt
@@ -1,7 +1,13 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 
 cmake_minimum_required(VERSION 3.26.4)
 
+include(../set_cuda_architecture.cmake)
+
+# initialize cuda architecture
+rapids_cuda_init_architectures(nested_types)
+rapids_cuda_set_architectures(RAPIDS)
+
 project(
   nested_types
   VERSION 0.0.1
@@ -14,3 +20,6 @@ include(../fetch_dependencies.cmake)
 add_executable(deduplication deduplication.cpp)
 target_link_libraries(deduplication PRIVATE cudf::cudf)
 target_compile_features(deduplication PRIVATE cxx_std_17)
+
+install(TARGETS deduplication DESTINATION bin/examples/libcudf)
+install(FILES ${CMAKE_CURRENT_LIST_DIR}/example.json DESTINATION bin/examples/libcudf)
diff --git a/cpp/examples/set_cuda_architecture.cmake b/cpp/examples/set_cuda_architecture.cmake
new file mode 100644
index 00000000000..bed6cd2f357
--- /dev/null
+++ b/cpp/examples/set_cuda_architecture.cmake
@@ -0,0 +1,28 @@
+# =============================================================================
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+# =============================================================================
+
+include(${CMAKE_CURRENT_LIST_DIR}/versions.cmake)
+
+if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/libcudf_cpp_examples_RAPIDS.cmake)
+  file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/${CUDF_TAG}/RAPIDS.cmake
+       ${CMAKE_CURRENT_BINARY_DIR}/libcudf_cpp_examples_RAPIDS.cmake
+  )
+endif()
+include(${CMAKE_CURRENT_BINARY_DIR}/libcudf_cpp_examples_RAPIDS.cmake)
+
+include(rapids-cmake)
+include(rapids-cpm)
+include(rapids-cuda)
+include(rapids-export)
+include(rapids-find)
diff --git a/cpp/examples/strings/CMakeLists.txt b/cpp/examples/strings/CMakeLists.txt
index c90fa9dde16..a5654870544 100644
--- a/cpp/examples/strings/CMakeLists.txt
+++ b/cpp/examples/strings/CMakeLists.txt
@@ -1,7 +1,13 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 
 cmake_minimum_required(VERSION 3.26.4)
 
+include(../set_cuda_architecture.cmake)
+
+# initialize cuda architecture
+rapids_cuda_init_architectures(strings_examples)
+rapids_cuda_set_architectures(RAPIDS)
+
 project(
   strings_examples
   VERSION 0.0.1
@@ -12,22 +18,27 @@ include(../fetch_dependencies.cmake)
 
 list(APPEND CUDF_CUDA_FLAGS --expt-extended-lambda --expt-relaxed-constexpr)
 
-#
 add_executable(libcudf_apis libcudf_apis.cpp)
 target_compile_features(libcudf_apis PRIVATE cxx_std_17)
 target_link_libraries(libcudf_apis PRIVATE cudf::cudf nvToolsExt)
+install(TARGETS libcudf_apis DESTINATION bin/examples/libcudf)
 
 add_executable(custom_with_malloc custom_with_malloc.cu)
 target_compile_features(custom_with_malloc PRIVATE cxx_std_17)
 target_compile_options(custom_with_malloc PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:${CUDF_CUDA_FLAGS}>")
 target_link_libraries(custom_with_malloc PRIVATE cudf::cudf nvToolsExt)
+install(TARGETS custom_with_malloc DESTINATION bin/examples/libcudf)
 
 add_executable(custom_prealloc custom_prealloc.cu)
 target_compile_features(custom_prealloc PRIVATE cxx_std_17)
 target_compile_options(custom_prealloc PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:${CUDF_CUDA_FLAGS}>")
 target_link_libraries(custom_prealloc PRIVATE cudf::cudf nvToolsExt)
+install(TARGETS custom_prealloc DESTINATION bin/examples/libcudf)
 
 add_executable(custom_optimized custom_optimized.cu)
 target_compile_features(custom_optimized PRIVATE cxx_std_17)
 target_compile_options(custom_optimized PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:${CUDF_CUDA_FLAGS}>")
 target_link_libraries(custom_optimized PRIVATE cudf::cudf nvToolsExt)
+install(TARGETS custom_optimized DESTINATION bin/examples/libcudf)
+
+install(FILES ${CMAKE_CURRENT_LIST_DIR}/names.csv DESTINATION bin/examples/libcudf)
diff --git a/cpp/examples/versions.cmake b/cpp/examples/versions.cmake
new file mode 100644
index 00000000000..dff66b4d7d8
--- /dev/null
+++ b/cpp/examples/versions.cmake
@@ -0,0 +1,15 @@
+# =============================================================================
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+# =============================================================================
+
+set(CUDF_TAG branch-24.06)

From 7b9e8158d38a250217b328ee005d9cf8581bec9f Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 18 Apr 2024 17:07:04 -0400
Subject: [PATCH 082/842] Fix deprecation warnings for json legacy reader
 (#15563)

Fixes deprecation warnings caused by changes in #15558
Most are in the `json_test.cpp` and appear like this
```
[150+7+50=206] Building CXX object tests/CMakeFiles/JSON_TEST.dir/io/json_test.cpp.o
/cudf/cpp/tests/io/json_test.cpp: In member function 'virtual void JsonReaderParamTest_BasicJsonLines_Test::TestBody()':
/cudf/cpp/tests/io/json_test.cpp:320:14: warning: 'cudf::io::json_reader_options_builder& cudf::io::json_reader_options_builder::legacy(bool)' is deprecated [-Wdeprecated-declarations]
  317 |     cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()})
      |     ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  318 |       .dtypes(std::vector<data_type>{dtype<int32_t>(), dtype<double>()})
      |       ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  319 |       .lines(true)
      |       ~~~~~~~~~~~~
  320 |       .legacy(is_legacy_test(test_opt));
      |       ~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~
In file included from /cudf/cpp/tests/io/json_test.cpp:30:

```
Compiler warnings usually result in errors when building libcudf.

This PR removes calls and references to legacy JSON reader features where possible.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/15563
---
 cpp/src/io/json/read_json.cu       |   4 +
 cpp/tests/io/json_test.cpp         | 164 +++++++++--------------------
 cpp/tests/streams/io/json_test.cpp |   3 +-
 3 files changed, 54 insertions(+), 117 deletions(-)

diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu
index 3ea8639641c..81ef3a51afc 100644
--- a/cpp/src/io/json/read_json.cu
+++ b/cpp/src/io/json/read_json.cu
@@ -210,9 +210,13 @@ table_with_metadata read_json(host_span<std::unique_ptr<datasource>> sources,
 {
   CUDF_FUNC_RANGE();
 
+  // TODO remove this if-statement once legacy is removed
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
   if (reader_opts.is_enabled_legacy()) {
     return legacy::read_json(sources, reader_opts, stream, mr);
   }
+#pragma GCC diagnostic pop
 
   if (reader_opts.get_byte_range_offset() != 0 or reader_opts.get_byte_range_size() != 0) {
     CUDF_EXPECTS(reader_opts.is_enabled_lines(),
diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp
index bae71d3c2a8..81cedf3d23e 100644
--- a/cpp/tests/io/json_test.cpp
+++ b/cpp/tests/io/json_test.cpp
@@ -169,26 +169,15 @@ struct JsonReaderTest : public cudf::test::BaseFixture {};
  * @brief Enum class to be used to specify the test case of parametrized tests
  */
 enum class json_test_t {
-  // Run test with the existing JSON lines reader using row-orient input data
-  legacy_lines_row_orient,
-  // Run test with the existing JSON lines reader using record-orient input data
-  legacy_lines_record_orient,
   // Run test with the nested JSON lines reader using record-orient input data
   json_experimental_record_orient,
   // Run test with the nested JSON lines reader using row-orient input data
   json_experimental_row_orient
 };
 
-constexpr bool is_legacy_test(json_test_t test_opt)
-{
-  return test_opt == json_test_t::legacy_lines_row_orient or
-         test_opt == json_test_t::legacy_lines_record_orient;
-}
-
 constexpr bool is_row_orient_test(json_test_t test_opt)
 {
-  return test_opt == json_test_t::legacy_lines_row_orient or
-         test_opt == json_test_t::json_experimental_row_orient;
+  return test_opt == json_test_t::json_experimental_row_orient;
 }
 
 /**
@@ -198,17 +187,10 @@ struct JsonReaderParamTest : public cudf::test::BaseFixture,
                              public testing::WithParamInterface<json_test_t> {};
 
 /**
- * @brief Test fixture for parametrized JSON reader tests, testing record orient-only for legacy
- * JSON lines reader and the nested reader
+ * @brief Test fixture for parametrized JSON reader tests with both orients
  */
-struct JsonReaderDualTest : public cudf::test::BaseFixture,
-                            public testing::WithParamInterface<json_test_t> {};
-
-/**
- * @brief Test fixture for parametrized JSON reader tests that only tests the new nested JSON reader
- */
-struct JsonReaderNoLegacy : public cudf::test::BaseFixture,
-                            public testing::WithParamInterface<json_test_t> {};
+struct JsonReaderRecordTest : public cudf::test::BaseFixture,
+                              public testing::WithParamInterface<json_test_t> {};
 
 /**
  * @brief Generates a JSON lines string that uses the record orient
@@ -244,9 +226,7 @@ struct JsonFixedPointReaderTest : public JsonReaderTest {};
 
 template <typename DecimalType>
 struct JsonValidFixedPointReaderTest : public JsonFixedPointReaderTest<DecimalType> {
-  void run_test(std::vector<std::string> const& reference_strings,
-                numeric::scale_type scale,
-                bool use_legacy_parser)
+  void run_test(std::vector<std::string> const& reference_strings, numeric::scale_type scale)
   {
     cudf::test::strings_column_wrapper const strings(reference_strings.begin(),
                                                      reference_strings.end());
@@ -263,8 +243,7 @@ struct JsonValidFixedPointReaderTest : public JsonFixedPointReaderTest<DecimalTy
     cudf::io::json_reader_options const in_opts =
       cudf::io::json_reader_options::builder(cudf::io::source_info{buffer.c_str(), buffer.size()})
         .dtypes({data_type{type_to_id<DecimalType>(), scale}})
-        .lines(true)
-        .legacy(use_legacy_parser);
+        .lines(true);
 
     auto const result      = cudf::io::read_json(in_opts);
     auto const result_view = result.tbl->view();
@@ -277,8 +256,8 @@ struct JsonValidFixedPointReaderTest : public JsonFixedPointReaderTest<DecimalTy
   void run_tests(std::vector<std::string> const& reference_strings, numeric::scale_type scale)
   {
     // Test both parsers
-    run_test(reference_strings, scale, false);
-    run_test(reference_strings, scale, true);
+    run_test(reference_strings, scale);
+    run_test(reference_strings, scale);
   }
 };
 
@@ -288,22 +267,13 @@ TYPED_TEST_SUITE(JsonValidFixedPointReaderTest, cudf::test::FixedPointTypes);
 // Parametrize qualifying JSON tests for executing both nested reader and legacy JSON lines reader
 INSTANTIATE_TEST_CASE_P(JsonReaderParamTest,
                         JsonReaderParamTest,
-                        ::testing::Values(json_test_t::legacy_lines_row_orient,
-                                          json_test_t::legacy_lines_record_orient,
-                                          json_test_t::json_experimental_record_orient,
+                        ::testing::Values(json_test_t::json_experimental_record_orient,
                                           json_test_t::json_experimental_row_orient));
 
 // Parametrize qualifying JSON tests for executing both nested reader and legacy JSON lines reader
-INSTANTIATE_TEST_CASE_P(JsonReaderDualTest,
-                        JsonReaderDualTest,
-                        ::testing::Values(json_test_t::legacy_lines_record_orient,
-                                          json_test_t::json_experimental_record_orient));
-
-// Parametrize qualifying JSON tests for executing nested reader only
-INSTANTIATE_TEST_CASE_P(JsonReaderNoLegacy,
-                        JsonReaderNoLegacy,
-                        ::testing::Values(json_test_t::json_experimental_row_orient,
-                                          json_test_t::json_experimental_record_orient));
+INSTANTIATE_TEST_CASE_P(JsonReaderRecordTest,
+                        JsonReaderRecordTest,
+                        ::testing::Values(json_test_t::json_experimental_record_orient));
 
 TEST_P(JsonReaderParamTest, BasicJsonLines)
 {
@@ -316,8 +286,7 @@ TEST_P(JsonReaderParamTest, BasicJsonLines)
   cudf::io::json_reader_options in_options =
     cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()})
       .dtypes(std::vector<data_type>{dtype<int32_t>(), dtype<double>()})
-      .lines(true)
-      .legacy(is_legacy_test(test_opt));
+      .lines(true);
   cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
 
   EXPECT_EQ(result.tbl->num_columns(), 2);
@@ -359,8 +328,7 @@ TEST_P(JsonReaderParamTest, FloatingPoint)
   cudf::io::json_reader_options in_options =
     cudf::io::json_reader_options::builder(cudf::io::source_info{filepath})
       .dtypes({dtype<float>()})
-      .lines(true)
-      .legacy(is_legacy_test(test_opt));
+      .lines(true);
 
   cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
 
@@ -384,8 +352,7 @@ TEST_P(JsonReaderParamTest, JsonLinesStrings)
   cudf::io::json_reader_options in_options =
     cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()})
       .dtypes({{"2", dtype<cudf::string_view>()}, {"0", dtype<int32_t>()}, {"1", dtype<double>()}})
-      .lines(true)
-      .legacy(is_legacy_test(test_opt));
+      .lines(true);
 
   cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
 
@@ -454,8 +421,7 @@ TEST_P(JsonReaderParamTest, MultiColumn)
                dtype<int64_t>(),
                dtype<float>(),
                dtype<double>()})
-      .lines(true)
-      .legacy(is_legacy_test(test_opt));
+      .lines(true);
   cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
 
   auto const view = result.tbl->view();
@@ -504,8 +470,7 @@ TEST_P(JsonReaderParamTest, Booleans)
   cudf::io::json_reader_options in_options =
     cudf::io::json_reader_options::builder(cudf::io::source_info{filepath})
       .dtypes({dtype<bool>()})
-      .lines(true)
-      .legacy(is_legacy_test(test_opt));
+      .lines(true);
   cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
 
   // Booleans are the same (integer) data type, but valued at 0 or 1
@@ -548,8 +513,7 @@ TEST_P(JsonReaderParamTest, Dates)
     cudf::io::json_reader_options::builder(cudf::io::source_info{filepath})
       .dtypes({data_type{type_id::TIMESTAMP_MILLISECONDS}})
       .lines(true)
-      .dayfirst(true)
-      .legacy(is_legacy_test(test_opt));
+      .dayfirst(true);
   cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
 
   auto const view = result.tbl->view();
@@ -604,8 +568,7 @@ TEST_P(JsonReaderParamTest, Durations)
   cudf::io::json_reader_options in_options =
     cudf::io::json_reader_options::builder(cudf::io::source_info{filepath})
       .dtypes({data_type{type_id::DURATION_NANOSECONDS}})
-      .lines(true)
-      .legacy(is_legacy_test(test_opt));
+      .lines(true);
   cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
 
   auto const view = result.tbl->view();
@@ -642,8 +605,7 @@ TEST_P(JsonReaderParamTest, JsonLinesDtypeInference)
 
   cudf::io::json_reader_options in_options =
     cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()})
-      .lines(true)
-      .legacy(is_legacy_test(test_opt));
+      .lines(true);
 
   cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
 
@@ -678,9 +640,7 @@ TEST_P(JsonReaderParamTest, JsonLinesFileInput)
   outfile.close();
 
   cudf::io::json_reader_options in_options =
-    cudf::io::json_reader_options::builder(cudf::io::source_info{fname})
-      .lines(true)
-      .legacy(is_legacy_test(test_opt));
+    cudf::io::json_reader_options::builder(cudf::io::source_info{fname}).lines(true);
 
   cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
 
@@ -707,7 +667,6 @@ TEST_F(JsonReaderTest, JsonLinesByteRange)
   cudf::io::json_reader_options in_options =
     cudf::io::json_reader_options::builder(cudf::io::source_info{fname})
       .lines(true)
-      .legacy(true)  // Support in new reader coming in https://github.com/rapidsai/cudf/pull/12498
       .byte_range_offset(11)
       .byte_range_size(20);
 
@@ -722,18 +681,15 @@ TEST_F(JsonReaderTest, JsonLinesByteRange)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), int64_wrapper{{3000, 4000, 5000}});
 }
 
-TEST_P(JsonReaderDualTest, JsonLinesObjects)
+TEST_P(JsonReaderRecordTest, JsonLinesObjects)
 {
-  auto const test_opt     = GetParam();
   const std::string fname = temp_env->get_temp_dir() + "JsonLinesObjectsTest.json";
   std::ofstream outfile(fname, std::ofstream::out);
   outfile << " {\"co\\\"l1\" : 1, \"col2\" : 2.0} \n";
   outfile.close();
 
   cudf::io::json_reader_options in_options =
-    cudf::io::json_reader_options::builder(cudf::io::source_info{fname})
-      .lines(true)
-      .legacy(is_legacy_test(test_opt));
+    cudf::io::json_reader_options::builder(cudf::io::source_info{fname}).lines(true);
 
   cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
 
@@ -741,7 +697,7 @@ TEST_P(JsonReaderDualTest, JsonLinesObjects)
   EXPECT_EQ(result.tbl->num_rows(), 1);
 
   EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT64);
-  EXPECT_EQ(result.metadata.schema_info[0].name, is_legacy_test(test_opt) ? "co\\\"l1" : "co\"l1");
+  EXPECT_EQ(result.metadata.schema_info[0].name, "co\"l1");
   EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::FLOAT64);
   EXPECT_EQ(result.metadata.schema_info[1].name, "col2");
 
@@ -749,14 +705,13 @@ TEST_P(JsonReaderDualTest, JsonLinesObjects)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), float64_wrapper{{2.0}});
 }
 
-TEST_P(JsonReaderDualTest, JsonLinesObjectsStrings)
+TEST_P(JsonReaderRecordTest, JsonLinesObjectsStrings)
 {
   auto const test_opt    = GetParam();
   auto test_json_objects = [test_opt](std::string const& data) {
     cudf::io::json_reader_options in_options =
       cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()})
-        .lines(true)
-        .legacy(is_legacy_test(test_opt));
+        .lines(true);
 
     cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
 
@@ -786,17 +741,15 @@ TEST_P(JsonReaderDualTest, JsonLinesObjectsStrings)
     "{\"col3\":\"bbb\", \"col1\":200, \"col2\":2.2}\n");
 }
 
-TEST_P(JsonReaderDualTest, JsonLinesObjectsMissingData)
+TEST_P(JsonReaderRecordTest, JsonLinesObjectsMissingData)
 {
-  auto const test_opt = GetParam();
-  // Note: columns will be ordered based on which fields appear first
+  //  Note: columns will be ordered based on which fields appear first
   std::string const data =
     "{              \"col2\":1.1, \"col3\":\"aaa\"}\n"
     "{\"col1\":200,               \"col3\":\"bbb\"}\n";
   cudf::io::json_reader_options in_options =
     cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()})
-      .lines(true)
-      .legacy(is_legacy_test(test_opt));
+      .lines(true);
 
   cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
 
@@ -823,17 +776,15 @@ TEST_P(JsonReaderDualTest, JsonLinesObjectsMissingData)
                                  cudf::test::strings_column_wrapper({"aaa", "bbb"}));
 }
 
-TEST_P(JsonReaderDualTest, JsonLinesObjectsOutOfOrder)
+TEST_P(JsonReaderRecordTest, JsonLinesObjectsOutOfOrder)
 {
-  auto const test_opt = GetParam();
   std::string const data =
     "{\"col1\":100, \"col2\":1.1, \"col3\":\"aaa\"}\n"
     "{\"col3\":\"bbb\", \"col1\":200, \"col2\":2.2}\n";
 
   cudf::io::json_reader_options in_options =
     cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()})
-      .lines(true)
-      .legacy(is_legacy_test(test_opt));
+      .lines(true);
 
   cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
 
@@ -919,8 +870,7 @@ TEST_F(JsonReaderTest, ArrowFileSource)
   cudf::io::json_reader_options in_options =
     cudf::io::json_reader_options::builder(cudf::io::source_info{&arrow_source})
       .dtypes({dtype<int8_t>()})
-      .lines(true)
-      .legacy(true);  // Support in new reader coming in https://github.com/rapidsai/cudf/pull/12498
+      .lines(true);
 
   cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
 
@@ -952,8 +902,7 @@ TEST_P(JsonReaderParamTest, InvalidFloatingPoint)
   cudf::io::json_reader_options in_options =
     cudf::io::json_reader_options::builder(cudf::io::source_info{filepath})
       .dtypes({dtype<float>()})
-      .lines(true)
-      .legacy(is_legacy_test(test_opt));
+      .lines(true);
   cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
 
   EXPECT_EQ(result.tbl->num_columns(), 1);
@@ -972,8 +921,7 @@ TEST_P(JsonReaderParamTest, StringInference)
 
   cudf::io::json_reader_options in_options =
     cudf::io::json_reader_options::builder(cudf::io::source_info{data.c_str(), data.size()})
-      .lines(true)
-      .legacy(is_legacy_test(test_opt));
+      .lines(true);
   cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
 
   EXPECT_EQ(result.tbl->num_columns(), 1);
@@ -1054,9 +1002,7 @@ TEST_P(JsonReaderParamTest, ParseInRangeIntegers)
     outfile << line.str();
   }
   cudf::io::json_reader_options in_options =
-    cudf::io::json_reader_options::builder(cudf::io::source_info{filepath})
-      .lines(true)
-      .legacy(is_legacy_test(test_opt));
+    cudf::io::json_reader_options::builder(cudf::io::source_info{filepath}).lines(true);
 
   cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
 
@@ -1158,9 +1104,7 @@ TEST_P(JsonReaderParamTest, ParseOutOfRangeIntegers)
     outfile << line.str();
   }
   cudf::io::json_reader_options in_options =
-    cudf::io::json_reader_options::builder(cudf::io::source_info{filepath})
-      .lines(true)
-      .legacy(is_legacy_test(test_opt));
+    cudf::io::json_reader_options::builder(cudf::io::source_info{filepath}).lines(true);
 
   cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
 
@@ -1198,9 +1142,7 @@ TEST_P(JsonReaderParamTest, JsonLinesMultipleFileInputs)
   outfile2.close();
 
   cudf::io::json_reader_options in_options =
-    cudf::io::json_reader_options::builder(cudf::io::source_info{{file1, file2}})
-      .lines(true)
-      .legacy(is_legacy_test(test_opt));
+    cudf::io::json_reader_options::builder(cudf::io::source_info{{file1, file2}}).lines(true);
 
   cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
 
@@ -1217,7 +1159,7 @@ TEST_P(JsonReaderParamTest, JsonLinesMultipleFileInputs)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), float64_wrapper{{1.1, 2.2, 3.3, 4.4}});
 }
 
-TEST_P(JsonReaderNoLegacy, JsonLinesMultipleFileInputsNoNL)
+TEST_P(JsonReaderParamTest, JsonLinesMultipleFileInputsNoNL)
 {
   auto const test_opt = GetParam();
   // Strings for the two separate input files in row-orient that do not end with a newline
@@ -1239,9 +1181,7 @@ TEST_P(JsonReaderNoLegacy, JsonLinesMultipleFileInputsNoNL)
   outfile2.close();
 
   cudf::io::json_reader_options in_options =
-    cudf::io::json_reader_options::builder(cudf::io::source_info{{file1, file2}})
-      .lines(true)
-      .legacy(is_legacy_test(test_opt));
+    cudf::io::json_reader_options::builder(cudf::io::source_info{{file1, file2}}).lines(true);
 
   cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
 
@@ -1258,15 +1198,16 @@ TEST_P(JsonReaderNoLegacy, JsonLinesMultipleFileInputsNoNL)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), float64_wrapper{{1.1, 2.2, 3.3, 4.4}});
 }
 
-TEST_F(JsonReaderTest, BadDtypeParams)
+// This can be removed once the legacy option has been removed.
+// The read_json only throws with legacy(true)
+TEST_F(JsonReaderTest, DISABLED_BadDtypeParams)
 {
   std::string buffer = "[1,2,3,4]";
 
   cudf::io::json_reader_options options_vec =
     cudf::io::json_reader_options::builder(cudf::io::source_info{buffer.c_str(), buffer.size()})
       .lines(true)
-      .dtypes({dtype<int8_t>()})
-      .legacy(true);
+      .dtypes({dtype<int8_t>()});
 
   // should throw because there are four columns and only one dtype
   EXPECT_THROW(cudf::io::read_json(options_vec), cudf::logic_error);
@@ -1274,7 +1215,6 @@ TEST_F(JsonReaderTest, BadDtypeParams)
   cudf::io::json_reader_options options_map =
     cudf::io::json_reader_options::builder(cudf::io::source_info{buffer.c_str(), buffer.size()})
       .lines(true)
-      .legacy(true)
       .dtypes(std::map<std::string, cudf::data_type>{{"0", dtype<int8_t>()},
                                                      {"1", dtype<int8_t>()},
                                                      {"2", dtype<int8_t>()},
@@ -1328,7 +1268,6 @@ TEST_F(JsonReaderTest, JsonExperimentalLines)
   auto const table = cudf::io::read_json(json_lines_options);
 
   // Read test data via legacy, non-nested JSON lines reader
-  json_lines_options.enable_legacy(true);
   auto const legacy_reader_table = cudf::io::read_json(json_lines_options);
 
   // Verify that the data read via non-nested JSON lines reader matches the data read via nested
@@ -1433,8 +1372,7 @@ TEST_F(JsonReaderTest, ErrorStrings)
   cudf::io::json_reader_options const in_opts =
     cudf::io::json_reader_options::builder(cudf::io::source_info{buffer.c_str(), buffer.size()})
       .dtypes({data_type{cudf::type_id::STRING}})
-      .lines(true)
-      .legacy(false);
+      .lines(true);
 
   auto const result      = cudf::io::read_json(in_opts);
   auto const result_view = result.tbl->view().column(0);
@@ -1506,7 +1444,6 @@ TEST_F(JsonReaderTest, ExperimentalLinesNoOmissions)
     auto const table = cudf::io::read_json(json_lines_options);
 
     // Read test data via legacy, non-nested JSON lines reader
-    json_lines_options.enable_legacy(true);
     auto const legacy_reader_table = cudf::io::read_json(json_lines_options);
 
     // Verify that the data read via non-nested JSON lines reader matches the data read via
@@ -1592,8 +1529,7 @@ TEST_P(JsonReaderParamTest, JsonDtypeSchema)
   cudf::io::json_reader_options in_options =
     cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()})
       .dtypes(dtype_schema)
-      .lines(true)
-      .legacy(is_legacy_test(test_opt));
+      .lines(true);
 
   cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
 
@@ -1789,8 +1725,7 @@ TEST_P(JsonReaderParamTest, JsonDtypeParsing)
     cudf::io::json_reader_options in_options =
       cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()})
         .dtypes(dtype_schema)
-        .lines(true)
-        .legacy(is_legacy_test(test_opt));
+        .lines(true);
 
     cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
 
@@ -1824,13 +1759,12 @@ TYPED_TEST(JsonValidFixedPointReaderTest, SingleColumnPositiveScale)
 
 TYPED_TEST(JsonFixedPointReaderTest, EmptyValues)
 {
-  auto const buffer = std::string{"{\"col0\":}"};
+  auto const buffer = std::string{"{\"col0\":\"\"}"};
 
   cudf::io::json_reader_options const in_opts =
     cudf::io::json_reader_options::builder(cudf::io::source_info{buffer.c_str(), buffer.size()})
       .dtypes({data_type{type_to_id<TypeParam>(), 0}})
-      .lines(true)
-      .legacy(true);  // Legacy behavior; not aligned with JSON specs
+      .lines(true);
 
   auto const result      = cudf::io::read_json(in_opts);
   auto const result_view = result.tbl->view();
@@ -1838,7 +1772,7 @@ TYPED_TEST(JsonFixedPointReaderTest, EmptyValues)
   ASSERT_EQ(result_view.num_columns(), 1);
   EXPECT_EQ(result_view.num_rows(), 1);
   EXPECT_EQ(result.metadata.schema_info[0].name, "col0");
-  EXPECT_EQ(result_view.column(0).null_count(), 1);
+  EXPECT_EQ(result_view.column(0).null_count(), 0);
 }
 
 TEST_F(JsonReaderTest, UnsupportedMultipleFileInputs)
diff --git a/cpp/tests/streams/io/json_test.cpp b/cpp/tests/streams/io/json_test.cpp
index 21da19a5a38..f98e685ed0c 100644
--- a/cpp/tests/streams/io/json_test.cpp
+++ b/cpp/tests/streams/io/json_test.cpp
@@ -37,8 +37,7 @@ TEST_F(JSONTest, JSONreader)
     cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()})
       .dtypes(std::vector<cudf::data_type>{cudf::data_type{cudf::type_id::INT32},
                                            cudf::data_type{cudf::type_id::FLOAT64}})
-      .lines(true)
-      .legacy(true);
+      .lines(true);
   cudf::io::table_with_metadata result =
     cudf::io::read_json(in_options, cudf::test::get_default_stream());
 }

From d1b92e2ec3b943a99299db24873a89fe31e3c0e3 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 18 Apr 2024 18:17:27 -0400
Subject: [PATCH 083/842] Large strings support in regex replace APIs (#15524)

Updates the `replace_re()` and `replace_with_backrefs()` internal logic to support large strings.
These functions use a regex-specific version of make-strings-children.

Depends on #15363

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: https://github.com/rapidsai/cudf/pull/15524
---
 cpp/src/strings/regex/utilities.cuh    | 19 +++++++++----------
 cpp/src/strings/replace/backref_re.cuh |  7 ++++---
 cpp/src/strings/replace/replace_re.cu  |  7 ++++---
 3 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/cpp/src/strings/regex/utilities.cuh b/cpp/src/strings/regex/utilities.cuh
index cfe53937e66..afbfe9de049 100644
--- a/cpp/src/strings/regex/utilities.cuh
+++ b/cpp/src/strings/regex/utilities.cuh
@@ -19,8 +19,10 @@
 #include "strings/regex/regex.cuh"
 
 #include <cudf/column/column_factories.hpp>
+#include <cudf/detail/offsets_iterator_factory.cuh>
 #include <cudf/detail/sizes_to_offsets_iterator.cuh>
 #include <cudf/detail/utilities/cuda.cuh>
+#include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/detail/utilities.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -116,10 +118,8 @@ auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn,
                            rmm::cuda_stream_view stream,
                            rmm::device_async_resource_ref mr)
 {
-  auto offsets = make_numeric_column(
-    data_type{type_id::INT32}, strings_count + 1, mask_state::UNALLOCATED, stream, mr);
-  auto d_offsets             = offsets->mutable_view().template data<int32_t>();
-  size_and_exec_fn.d_offsets = d_offsets;
+  auto output_sizes        = rmm::device_uvector<size_type>(strings_count, stream);
+  size_and_exec_fn.d_sizes = output_sizes.data();
 
   auto [buffer_size, thread_count] = d_prog.compute_strided_working_memory(strings_count);
 
@@ -133,12 +133,11 @@ auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn,
     for_each_kernel<<<grid.num_blocks, grid.num_threads_per_block, shmem_size, stream.value()>>>(
       size_and_exec_fn, d_prog, strings_count);
   }
-
-  auto const char_bytes =
-    cudf::detail::sizes_to_offsets(d_offsets, d_offsets + strings_count + 1, d_offsets, stream);
-  CUDF_EXPECTS(char_bytes <= std::numeric_limits<size_type>::max(),
-               "Size of output exceeds the column size limit",
-               std::overflow_error);
+  // Convert the sizes to offsets
+  auto [offsets, char_bytes] = cudf::strings::detail::make_offsets_child_column(
+    output_sizes.begin(), output_sizes.end(), stream, mr);
+  size_and_exec_fn.d_offsets =
+    cudf::detail::offsetalator_factory::make_input_iterator(offsets->view());
 
   // Now build the chars column
   rmm::device_uvector<char> chars(char_bytes, stream, mr);
diff --git a/cpp/src/strings/replace/backref_re.cuh b/cpp/src/strings/replace/backref_re.cuh
index edd85f29e6c..b5b75cf8f40 100644
--- a/cpp/src/strings/replace/backref_re.cuh
+++ b/cpp/src/strings/replace/backref_re.cuh
@@ -45,13 +45,14 @@ struct backrefs_fn {
   string_view const d_repl;  // string replacement template
   Iterator backrefs_begin;
   Iterator backrefs_end;
-  size_type* d_offsets{};
+  size_type* d_sizes{};
   char* d_chars{};
+  cudf::detail::input_offsetalator d_offsets;
 
   __device__ void operator()(size_type const idx, reprog_device const prog, int32_t const prog_idx)
   {
     if (d_strings.is_null(idx)) {
-      if (!d_chars) d_offsets[idx] = 0;
+      if (!d_chars) { d_sizes[idx] = 0; }
       return;
     }
     auto const d_str  = d_strings.element<string_view>(idx);
@@ -113,7 +114,7 @@ struct backrefs_fn {
       thrust::copy_n(
         thrust::seq, in_ptr + itr.byte_offset(), d_str.size_bytes() - itr.byte_offset(), out_ptr);
     } else {
-      d_offsets[idx] = nbytes;
+      d_sizes[idx] = nbytes;
     }
   }
 };
diff --git a/cpp/src/strings/replace/replace_re.cu b/cpp/src/strings/replace/replace_re.cu
index 1290302340b..fd988855424 100644
--- a/cpp/src/strings/replace/replace_re.cu
+++ b/cpp/src/strings/replace/replace_re.cu
@@ -43,13 +43,14 @@ struct replace_regex_fn {
   column_device_view const d_strings;
   string_view const d_repl;
   size_type const maxrepl;
-  size_type* d_offsets{};
+  size_type* d_sizes{};
   char* d_chars{};
+  cudf::detail::input_offsetalator d_offsets;
 
   __device__ void operator()(size_type const idx, reprog_device const prog, int32_t const prog_idx)
   {
     if (d_strings.is_null(idx)) {
-      if (!d_chars) d_offsets[idx] = 0;
+      if (!d_chars) { d_sizes[idx] = 0; }
       return;
     }
 
@@ -90,7 +91,7 @@ struct replace_regex_fn {
                      d_str.size_bytes() - last_pos.byte_offset(),  //             ^   ^
                      out_ptr);
     } else {
-      d_offsets[idx] = nbytes;
+      d_sizes[idx] = nbytes;
     }
   }
 };

From e0c4280e44d25006dca37d5e2e6c7f77dce3fd56 Mon Sep 17 00:00:00 2001
From: Ed Seidl <etseidl@users.noreply.github.com>
Date: Thu, 18 Apr 2024 16:50:25 -0700
Subject: [PATCH 084/842] Add option to Parquet writer to skip compressing
 individual columns (#15411)

#15081 added the ability to select per-column encodings in the Parquet writer. Some Parquet encodings (e.g `DELTA_BINARY_PACKED`) do not mix well with compression (see [PARQUET-2414](https://issues.apache.org/jira/browse/PARQUET-2414) for example). This PR adds the ability to turn off compression for select columns. This uses the same mechanism as encoding selection, so an example use would be:
```c++
  cudf::io::table_input_metadata table_metadata(table);
  table_metadata.column_metadata[0]
    .set_name("int_delta_binary")
    .set_encoding(cudf::io::column_encoding::DELTA_BINARY_PACKED)
    .set_skip_compression(true);
```

Authors:
  - Ed Seidl (https://github.com/etseidl)
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/15411
---
 cpp/include/cudf/io/types.hpp        | 21 ++++++++++++++
 cpp/src/io/parquet/page_enc.cu       |  5 +++-
 cpp/src/io/parquet/parquet_gpu.hpp   |  1 +
 cpp/src/io/parquet/writer_impl.cu    |  4 +++
 cpp/tests/io/parquet_writer_test.cpp | 42 ++++++++++++++++++++++++++++
 5 files changed, 72 insertions(+), 1 deletion(-)

diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp
index 64d627483e6..65d4a4417f0 100644
--- a/cpp/include/cudf/io/types.hpp
+++ b/cpp/include/cudf/io/types.hpp
@@ -602,6 +602,7 @@ class column_in_metadata {
   bool _list_column_is_map  = false;
   bool _use_int96_timestamp = false;
   bool _output_as_binary    = false;
+  bool _skip_compression    = false;
   std::optional<uint8_t> _decimal_precision;
   std::optional<int32_t> _parquet_field_id;
   std::vector<column_in_metadata> children;
@@ -722,6 +723,19 @@ class column_in_metadata {
     return *this;
   }
 
+  /**
+   * @brief Specifies whether this column should not be compressed regardless of the compression
+   * codec specified for the file.
+   *
+   * @param skip If `true` do not compress this column
+   * @return this for chaining
+   */
+  column_in_metadata& set_skip_compression(bool skip) noexcept
+  {
+    _skip_compression = skip;
+    return *this;
+  }
+
   /**
    * @brief Sets the encoding to use for this column.
    *
@@ -844,6 +858,13 @@ class column_in_metadata {
    */
   [[nodiscard]] bool is_enabled_output_as_binary() const noexcept { return _output_as_binary; }
 
+  /**
+   * @brief Get whether to skip compressing this column
+   *
+   * @return Boolean indicating whether to skip compression of this column
+   */
+  [[nodiscard]] bool is_enabled_skip_compression() const noexcept { return _skip_compression; }
+
   /**
    * @brief Get the encoding that was set for this column.
    *
diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu
index 114e47b325b..2db6dc4270d 100644
--- a/cpp/src/io/parquet/page_enc.cu
+++ b/cpp/src/io/parquet/page_enc.cu
@@ -1591,7 +1591,9 @@ __device__ void finish_page_encode(state_buf* s,
     }
     pages[blockIdx.x] = s->page;
     if (not comp_results.empty()) {
-      comp_results[blockIdx.x]   = {0, compression_status::FAILURE};
+      auto const status =
+        s->col.skip_compression ? compression_status::SKIPPED : compression_status::FAILURE;
+      comp_results[blockIdx.x]   = {0, status};
       pages[blockIdx.x].comp_res = &comp_results[blockIdx.x];
     }
   }
@@ -2495,6 +2497,7 @@ CUDF_KERNEL void __launch_bounds__(decide_compression_block_size)
     if (auto comp_res = curr_page.comp_res; comp_res != nullptr) {
       auto const lvl_bytes = curr_page.is_v2() ? curr_page.level_bytes() : 0;
       compressed_data_size += comp_res->bytes_written + lvl_bytes;
+      // TODO: would this be better as a ballot?
       if (comp_res->status != compression_status::SUCCESS) {
         atomicOr(&compression_error[warp_id], 1);
       }
diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp
index 200a8ec9ddb..b165c60b2cf 100644
--- a/cpp/src/io/parquet/parquet_gpu.hpp
+++ b/cpp/src/io/parquet/parquet_gpu.hpp
@@ -475,6 +475,7 @@ struct parquet_column_device_view : stats_column_desc {
                                //!< nullability of parent_column. May be different from
                                //!< col.nullable() in case of chunked writing.
   bool output_as_byte_array;   //!< Indicates this list column is being written as a byte array
+  bool skip_compression;       //!< Skip compression for this column
   column_encoding requested_encoding;  //!< User specified encoding for this column.
 };
 
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index fd8d4f8bd7f..823a08084ee 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -274,6 +274,7 @@ struct schema_tree_node : public SchemaElement {
   statistics_dtype stats_dtype;
   int32_t ts_scale;
   column_encoding requested_encoding;
+  bool skip_compression;
 
   // TODO(fut): Think about making schema a class that holds a vector of schema_tree_nodes. The
   // function construct_schema_tree could be its constructor. It can have method to get the per
@@ -698,6 +699,7 @@ std::vector<schema_tree_node> construct_schema_tree(
         set_field_id(col_schema, col_meta);
         set_encoding(col_schema, col_meta);
         col_schema.output_as_byte_array = col_meta.is_enabled_output_as_binary();
+        col_schema.skip_compression     = col_meta.is_enabled_skip_compression();
         schema.push_back(col_schema);
       } else if (col->type().id() == type_id::STRUCT) {
         // if struct, add current and recursively call for all children
@@ -833,6 +835,7 @@ std::vector<schema_tree_node> construct_schema_tree(
         col_schema.leaf_column = col;
         set_field_id(col_schema, col_meta);
         set_encoding(col_schema, col_meta);
+        col_schema.skip_compression = col_meta.is_enabled_skip_compression();
         schema.push_back(col_schema);
       }
     };
@@ -1023,6 +1026,7 @@ parquet_column_device_view parquet_column_view::get_device_view(rmm::cuda_stream
   desc.max_def_level      = _max_def_level;
   desc.max_rep_level      = _max_rep_level;
   desc.requested_encoding = schema_node.requested_encoding;
+  desc.skip_compression   = schema_node.skip_compression;
   return desc;
 }
 
diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp
index ffa672fb564..caddfee9f02 100644
--- a/cpp/tests/io/parquet_writer_test.cpp
+++ b/cpp/tests/io/parquet_writer_test.cpp
@@ -24,8 +24,11 @@
 
 #include <cudf/io/data_sink.hpp>
 #include <cudf/io/parquet.hpp>
+#include <cudf/io/types.hpp>
 #include <cudf/unary.hpp>
 
+#include <src/io/parquet/parquet_common.hpp>
+
 #include <fstream>
 
 using cudf::test::iterators::no_nulls;
@@ -1321,6 +1324,45 @@ TEST_F(ParquetWriterTest, CompStatsEmptyTable)
   expect_compression_stats_empty(stats);
 }
 
+TEST_F(ParquetWriterTest, SkipCompression)
+{
+  constexpr auto page_rows      = 1000;
+  constexpr auto row_group_rows = 2 * page_rows;
+  constexpr auto num_rows       = 2 * row_group_rows;
+
+  auto sequence = thrust::make_counting_iterator(0);
+  column_wrapper<int> col(sequence, sequence + num_rows, no_nulls());
+
+  auto expected          = table_view{{col, col}};
+  auto expected_metadata = cudf::io::table_input_metadata{expected};
+  expected_metadata.column_metadata[0].set_skip_compression(true);
+
+  auto const filepath = temp_env->get_temp_filepath("SkipCompression.parquet");
+  cudf::io::parquet_writer_options out_opts =
+    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected)
+      .compression(cudf::io::compression_type::ZSTD)
+      .max_page_size_rows(page_rows)
+      .row_group_size_rows(row_group_rows)
+      .max_page_fragment_size(page_rows)
+      .metadata(std::move(expected_metadata));
+
+  cudf::io::write_parquet(out_opts);
+
+  cudf::io::parquet_reader_options read_opts =
+    cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath});
+  auto result = cudf::io::read_parquet(read_opts);
+
+  CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, expected);
+
+  // check metadata to make sure column 0 is not compressed and column 1 is
+  auto const source = cudf::io::datasource::create(filepath);
+  cudf::io::parquet::detail::FileMetaData fmd;
+  read_footer(source, &fmd);
+
+  EXPECT_EQ(fmd.row_groups[0].columns[0].meta_data.codec, cudf::io::parquet::detail::UNCOMPRESSED);
+  EXPECT_EQ(fmd.row_groups[0].columns[1].meta_data.codec, cudf::io::parquet::detail::ZSTD);
+}
+
 TEST_F(ParquetWriterTest, NoNullsAsNonNullable)
 {
   column_wrapper<int32_t> col{{1, 2, 3}, no_nulls()};

From 045f29d1f37ce3fe9dedcfd9ed1141c7a70243ba Mon Sep 17 00:00:00 2001
From: Jason Lowe <jlowe@nvidia.com>
Date: Fri, 19 Apr 2024 10:24:14 -0500
Subject: [PATCH 085/842] Refactor JNI native dependency loading to allow
 returning of library path (#15566)

Adds a method to NativeDepsLoader that allows loading a dependency and determining the temporary file path the dependency was loaded from.  Also refactors the methods to take the dependency preservation flag as a parameter rather than lower-level functions directly reading the global flag, making them more flexible for reuse.

Authors:
  - Jason Lowe (https://github.com/jlowe)

Approvers:
  - Robert (Bobby) Evans (https://github.com/revans2)

URL: https://github.com/rapidsai/cudf/pull/15566
---
 .../java/ai/rapids/cudf/NativeDepsLoader.java | 84 ++++++++++++++++---
 1 file changed, 73 insertions(+), 11 deletions(-)

diff --git a/java/src/main/java/ai/rapids/cudf/NativeDepsLoader.java b/java/src/main/java/ai/rapids/cudf/NativeDepsLoader.java
index 27322cca436..7ee590e3c82 100755
--- a/java/src/main/java/ai/rapids/cudf/NativeDepsLoader.java
+++ b/java/src/main/java/ai/rapids/cudf/NativeDepsLoader.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -77,7 +77,7 @@ public class NativeDepsLoader {
   public static synchronized void loadNativeDeps() {
     if (!loaded) {
       try {
-        loadNativeDeps(loadOrder);
+        loadNativeDeps(loadOrder, preserveDepsAfterLoad);
         loaded = true;
       } catch (Throwable t) {
         log.error("Could not load cudf jni library...", t);
@@ -122,11 +122,53 @@ public static synchronized void loadNativeDeps() {
    * @throws IOException on any error trying to load the libraries.
    */
   public static void loadNativeDeps(String[] loadOrder) throws IOException {
+    loadNativeDeps(loadOrder, preserveDepsAfterLoad);
+  }
+
+  /**
+   * Allows other libraries to reuse the same native deps loading logic. Libraries will be searched
+   * for under ${os.arch}/${os.name}/ in the class path using the class loader for this class.
+   * <br/>
+   * Because this just loads the libraries and loading the libraries themselves needs to be a
+   * singleton operation it is recommended that any library using this provide their own wrapper
+   * function similar to
+   * <pre>
+   *     private static boolean loaded = false;
+   *     static synchronized void loadNativeDeps() {
+   *         if (!loaded) {
+   *             try {
+   *                 // If you also depend on the cudf liobrary being loaded, be sure it is loaded
+   *                 // first
+   *                 ai.rapids.cudf.NativeDepsLoader.loadNativeDeps();
+   *                 ai.rapids.cudf.NativeDepsLoader.loadNativeDeps(new String[]{...});
+   *                 loaded = true;
+   *             } catch (Throwable t) {
+   *                 log.error("Could not load ...", t);
+   *             }
+   *         }
+   *     }
+   * </pre>
+   * This function should be called from the static initialization block of any class that uses
+   * JNI. For example
+   * <pre>
+   *     public class UsesJNI {
+   *         static {
+   *             MyNativeDepsLoader.loadNativeDeps();
+   *         }
+   *     }
+   * </pre>
+   * @param loadOrder the base name of the libraries. For example libfoo.so would be passed in as
+   *                  "foo".  The libraries are loaded in the order provided.
+   * @param preserveDeps if false the dependencies will be deleted immediately after loading
+   *                     rather than on exit.
+   * @throws IOException on any error trying to load the libraries.
+   */
+  public static void loadNativeDeps(String[] loadOrder, boolean preserveDeps) throws IOException {
     String os = System.getProperty("os.name");
     String arch = System.getProperty("os.arch");
 
     for (String toLoad : loadOrder) {
-      loadDep(os, arch, toLoad);
+      loadDep(os, arch, toLoad, preserveDeps);
     }
   }
 
@@ -134,9 +176,11 @@ public static void loadNativeDeps(String[] loadOrder) throws IOException {
    * Load native dependencies in stages, where the dependency libraries in each stage
    * are loaded only after all libraries in earlier stages have completed loading.
    * @param loadOrder array of stages with an array of dependency library names in each stage
+   * @param preserveDeps if false the dependencies will be deleted immediately after loading
+   *                     rather than on exit.
    * @throws IOException on any error trying to load the libraries
    */
-  private static void loadNativeDeps(String[][] loadOrder) throws IOException {
+  private static void loadNativeDeps(String[][] loadOrder, boolean preserveDeps) throws IOException {
     String os = System.getProperty("os.name");
     String arch = System.getProperty("os.arch");
 
@@ -161,7 +205,7 @@ private static void loadNativeDeps(String[][] loadOrder) throws IOException {
       // Submit all dependencies in the stage to be loaded in parallel
       loadCompletionFutures.clear();
       for (Future<File> fileFuture : stageFileFutures) {
-        loadCompletionFutures.add(executor.submit(() -> loadDep(fileFuture)));
+        loadCompletionFutures.add(executor.submit(() -> loadDep(fileFuture, preserveDeps)));
       }
 
       // Wait for all dependencies in this stage to have been loaded
@@ -177,28 +221,46 @@ private static void loadNativeDeps(String[][] loadOrder) throws IOException {
     executor.shutdownNow();
   }
 
-  private static void loadDep(String os, String arch, String baseName) throws IOException {
+  /**
+   * Allows other libraries to reuse the same native deps loading logic. Library will be searched
+   * for under ${os.arch}/${os.name}/ in the class path using the class loader for this class.
+   * @param depName the base name of the library. For example libfoo.so would be passed in as
+   *                "foo".  The libraries are loaded in the order provided.
+   * @param preserveDep if false the dependencies will be deleted immediately after loading
+   *                    rather than on exit.
+   * @return path where the dependency was loaded
+   * @throws IOException on any error trying to load the libraries.
+   */
+  public static File loadNativeDep(String depName, boolean preserveDep) throws IOException {
+    String os = System.getProperty("os.name");
+    String arch = System.getProperty("os.arch");
+    return loadDep(os, arch, depName, preserveDep);
+  }
+
+  private static File loadDep(String os, String arch, String baseName, boolean preserveDep)
+      throws IOException {
     File path = createFile(os, arch, baseName);
-    loadDep(path);
+    loadDep(path, preserveDep);
+    return path;
   }
 
   /** Load a library at the specified path */
-  private static void loadDep(File path) {
+  private static void loadDep(File path, boolean preserveDep) {
     System.load(path.getAbsolutePath());
-    if (!preserveDepsAfterLoad) {
+    if (!preserveDep) {
       path.delete();
     }
   }
 
   /** Load a library, waiting for the specified future to produce the path before loading */
-  private static void loadDep(Future<File> fileFuture) {
+  private static void loadDep(Future<File> fileFuture, boolean preserveDep) {
     File path;
     try {
       path = fileFuture.get();
     } catch (ExecutionException | InterruptedException e) {
       throw new RuntimeException("Error loading dependencies", e);
     }
-    loadDep(path);
+    loadDep(path, preserveDep);
   }
 
   /** Extract the contents of a library resource into a temporary file */

From 088be5aecee1f2bc00f7d4acdb095894b3defcb7 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Fri, 19 Apr 2024 11:26:12 -0500
Subject: [PATCH 086/842] Rename experimental JSON tests. (#15568)

This PR renames the "experimental" JSON reader tests. These are now production grade and not experimental.

This task is tracked in https://github.com/rapidsai/cudf/issues/15537.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/15568
---
 cpp/tests/io/json_test.cpp        | 20 ++++++++++----------
 cpp/tests/io/nested_json_test.cpp |  2 +-
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp
index 81cedf3d23e..ee1207f04a2 100644
--- a/cpp/tests/io/json_test.cpp
+++ b/cpp/tests/io/json_test.cpp
@@ -170,14 +170,14 @@ struct JsonReaderTest : public cudf::test::BaseFixture {};
  */
 enum class json_test_t {
   // Run test with the nested JSON lines reader using record-orient input data
-  json_experimental_record_orient,
+  json_record_orient,
   // Run test with the nested JSON lines reader using row-orient input data
-  json_experimental_row_orient
+  json_row_orient
 };
 
 constexpr bool is_row_orient_test(json_test_t test_opt)
 {
-  return test_opt == json_test_t::json_experimental_row_orient;
+  return test_opt == json_test_t::json_row_orient;
 }
 
 /**
@@ -267,13 +267,13 @@ TYPED_TEST_SUITE(JsonValidFixedPointReaderTest, cudf::test::FixedPointTypes);
 // Parametrize qualifying JSON tests for executing both nested reader and legacy JSON lines reader
 INSTANTIATE_TEST_CASE_P(JsonReaderParamTest,
                         JsonReaderParamTest,
-                        ::testing::Values(json_test_t::json_experimental_record_orient,
-                                          json_test_t::json_experimental_row_orient));
+                        ::testing::Values(json_test_t::json_record_orient,
+                                          json_test_t::json_row_orient));
 
 // Parametrize qualifying JSON tests for executing both nested reader and legacy JSON lines reader
 INSTANTIATE_TEST_CASE_P(JsonReaderRecordTest,
                         JsonReaderRecordTest,
-                        ::testing::Values(json_test_t::json_experimental_record_orient));
+                        ::testing::Values(json_test_t::json_record_orient));
 
 TEST_P(JsonReaderParamTest, BasicJsonLines)
 {
@@ -1223,9 +1223,9 @@ TEST_F(JsonReaderTest, DISABLED_BadDtypeParams)
   EXPECT_THROW(cudf::io::read_json(options_map), cudf::logic_error);
 }
 
-TEST_F(JsonReaderTest, JsonExperimentalBasic)
+TEST_F(JsonReaderTest, JsonBasic)
 {
-  std::string const fname = temp_env->get_temp_dir() + "JsonExperimentalBasic.json";
+  std::string const fname = temp_env->get_temp_dir() + "JsonBasic.json";
   std::ofstream outfile(fname, std::ofstream::out);
   outfile << R"([{"a":"11", "b":"1.1"},{"a":"22", "b":"2.2"}])";
   outfile.close();
@@ -1249,7 +1249,7 @@ TEST_F(JsonReaderTest, JsonExperimentalBasic)
                                  cudf::test::strings_column_wrapper({"1.1", "2.2"}));
 }
 
-TEST_F(JsonReaderTest, JsonExperimentalLines)
+TEST_F(JsonReaderTest, JsonLines)
 {
   std::string const json_string =
     R"({"a":"a0"}
@@ -1416,7 +1416,7 @@ TEST_F(JsonReaderTest, TokenAllocation)
   }
 }
 
-TEST_F(JsonReaderTest, ExperimentalLinesNoOmissions)
+TEST_F(JsonReaderTest, LinesNoOmissions)
 {
   std::array<std::string const, 4> const json_inputs
     // single column
diff --git a/cpp/tests/io/nested_json_test.cpp b/cpp/tests/io/nested_json_test.cpp
index 97e1a78f909..2e2d5cae34c 100644
--- a/cpp/tests/io/nested_json_test.cpp
+++ b/cpp/tests/io/nested_json_test.cpp
@@ -621,7 +621,7 @@ TEST_F(JsonTest, TokenStream2)
 }
 
 struct JsonParserTest : public cudf::test::BaseFixture, public testing::WithParamInterface<bool> {};
-INSTANTIATE_TEST_SUITE_P(Experimental, JsonParserTest, testing::Bool());
+INSTANTIATE_TEST_SUITE_P(IsFullGPU, JsonParserTest, testing::Bool());
 
 TEST_P(JsonParserTest, ExtractColumn)
 {

From 21350fc2ac070315d110fca55cb6781ed7905596 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 19 Apr 2024 07:17:09 -1000
Subject: [PATCH 087/842] Allow apply udf to reference global modules in
 cudf.pandas (#15569)

closes #15548

`_replace_closurevars` creates a new function by replacing objects with their fast versions. When creating the new function, it populates `globals` from the result of `inspect.getclosurevars`, but it don't think it comprehensively returns _all_ the globals accessible to the function (`function.__globals__`)

To minimize the change, the "fast globals" are still sourced from `inspect.getclosurevars`, and those update the `old_function.__globals__` when creating a new function.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15569
---
 python/cudf/cudf/pandas/fast_slow_proxy.py        |  9 ++++++---
 python/cudf/cudf_pandas_tests/test_cudf_pandas.py | 12 ++++++++++++
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py
index e811ba1351a..9d8c174b297 100644
--- a/python/cudf/cudf/pandas/fast_slow_proxy.py
+++ b/python/cudf/cudf/pandas/fast_slow_proxy.py
@@ -1108,7 +1108,7 @@ def _replace_closurevars(
         if any(c == types.CellType() for c in f.__closure__):
             return f
 
-    f_nonlocals, f_globals, f_builtins, _ = inspect.getclosurevars(f)
+    f_nonlocals, f_globals, _, _ = inspect.getclosurevars(f)
 
     g_globals = _transform_arg(f_globals, attribute_name, seen)
     g_nonlocals = _transform_arg(f_nonlocals, attribute_name, seen)
@@ -1121,11 +1121,14 @@ def _replace_closurevars(
         return f
 
     g_closure = tuple(types.CellType(val) for val in g_nonlocals.values())
-    g_globals["__builtins__"] = f_builtins
+
+    # https://github.com/rapidsai/cudf/issues/15548
+    new_g_globals = f.__globals__.copy()
+    new_g_globals.update(g_globals)
 
     g = types.FunctionType(
         f.__code__,
-        g_globals,
+        new_g_globals,
         name=f.__name__,
         argdefs=f.__defaults__,
         closure=g_closure,
diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
index f017b46866f..90356a01404 100644
--- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
+++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
@@ -1208,3 +1208,15 @@ def test_pickle_groupby(dataframe):
 def test_isinstance_base_offset():
     offset = xpd.tseries.frequencies.to_offset("1s")
     assert isinstance(offset, xpd.tseries.offsets.BaseOffset)
+
+
+def test_apply_slow_path_udf_references_global_module():
+    def my_apply(df, unused):
+        # `datetime` Raised `KeyError: __import__`
+        datetime.datetime.strptime(df["Minute"], "%H:%M:%S")
+        return pd.to_numeric(1)
+
+    df = xpd.DataFrame({"Minute": ["09:00:00"]})
+    result = df.apply(my_apply, axis=1, unused=True)
+    expected = xpd.Series([1])
+    tm.assert_series_equal(result, expected)

From 40d3dd7681b22103457e0e3d511d1f6860e28d77 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Fri, 19 Apr 2024 14:21:04 -0700
Subject: [PATCH 088/842] Ignore new cupy warning (#15574)

cupy 13.1 added a warning about the jitify cache warming up that we must silence in our test suite for it to pass.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - https://github.com/brandon-b-miller
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/15574
---
 python/cudf/cudf/tests/pytest.ini | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/tests/pytest.ini b/python/cudf/cudf/tests/pytest.ini
index 36ccb434bb2..710473acb85 100644
--- a/python/cudf/cudf/tests/pytest.ini
+++ b/python/cudf/cudf/tests/pytest.ini
@@ -8,5 +8,7 @@ filterwarnings =
     error
     ignore:::.*xdist.*
     ignore:::.*pytest.*
+    # Deprecation warning from Pyarrow Table.to_pandas() with pandas-2.2+
     ignore:Passing a BlockManager to DataFrame is deprecated:DeprecationWarning
-    # Above deprecation warning comes from Pyarrow Table.to_pandas() with pandas-2.2+
+    # PerformanceWarning from cupy warming up the JIT cache
+    ignore:Jitify is performing a one-time only warm-up to populate the persistent cache:cupy._util.PerformanceWarning

From d37636dc8d17571bd3e7a17e2da9d26f99b5490d Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Fri, 19 Apr 2024 18:59:54 -0500
Subject: [PATCH 089/842] Remove protobuf and use parsed ORC statistics from
 libcudf (#15564)

This PR removes the cuDF Python dependencies on `protobuf` and `protoc-wheel`. Closes #15511.

The only use case for the `protobuf` dependency was reading ORC file/stripe statistics. However, we have code in libcudf that can do this without requiring `protobuf`.

In this PR, we expose the C++ code for parsing ORC statistics from libcudf to Cython and remove all references to `protobuf`.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)
  - Jake Awe (https://github.com/AyodeAwe)
  - David Wendt (https://github.com/davidwendt)
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15564
---
 .gitignore                                    |   3 -
 .../all_cuda-118_arch-x86_64.yaml             |   1 -
 .../all_cuda-122_arch-x86_64.yaml             |   1 -
 conda/recipes/cudf/meta.yaml                  |   2 -
 cpp/include/cudf/io/orc_metadata.hpp          |  26 +--
 dependencies.yaml                             |   5 -
 pyproject.toml                                |   1 -
 python/cudf/CMakeLists.txt                    |   3 -
 .../cudf/cmake/Modules/ProtobufHelpers.cmake  |  50 ------
 python/cudf/cudf/_lib/cpp/io/orc_metadata.pxd |  64 ++++++-
 python/cudf/cudf/_lib/orc.pyx                 | 161 +++++++++++++++++-
 python/cudf/cudf/_lib/variant.pxd             |  15 ++
 python/cudf/cudf/io/orc.py                    |  48 +++---
 python/cudf/cudf/utils/metadata/__init__.py   |   1 -
 .../metadata/orc_column_statistics.proto      |  62 -------
 python/cudf/pyproject.toml                    |   2 -
 16 files changed, 263 insertions(+), 182 deletions(-)
 delete mode 100644 python/cudf/cmake/Modules/ProtobufHelpers.cmake
 create mode 100644 python/cudf/cudf/_lib/variant.pxd
 delete mode 100644 python/cudf/cudf/utils/metadata/__init__.py
 delete mode 100644 python/cudf/cudf/utils/metadata/orc_column_statistics.proto

diff --git a/.gitignore b/.gitignore
index 313bb1c3789..c89fb49697a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -161,9 +161,6 @@ ENV/
 # Dask
 dask-worker-space/
 
-# protobuf
-**/*_pb2.py
-
 # Sphinx docs & build artifacts
 docs/cudf/source/api_docs/generated/*
 docs/cudf/source/user_guide/api_docs/api/*
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index ef971d10f19..e8816da3a2a 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -68,7 +68,6 @@ dependencies:
 - pandoc
 - pip
 - pre-commit
-- protobuf>=3.20,<5
 - ptxcompiler
 - pyarrow==14.0.2.*
 - pydata-sphinx-theme!=0.14.2
diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml
index 688e41ec1ba..8044fc35a19 100644
--- a/conda/environments/all_cuda-122_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-122_arch-x86_64.yaml
@@ -66,7 +66,6 @@ dependencies:
 - pandoc
 - pip
 - pre-commit
-- protobuf>=3.20,<5
 - pyarrow==14.0.2.*
 - pydata-sphinx-theme!=0.14.2
 - pynvjitlink
diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml
index 5512ef11057..ae2d938250b 100644
--- a/conda/recipes/cudf/meta.yaml
+++ b/conda/recipes/cudf/meta.yaml
@@ -59,7 +59,6 @@ requirements:
     - cuda-version ={{ cuda_version }}
     - sysroot_{{ target_platform }} {{ sysroot_version }}
   host:
-    - protobuf ==4.24.*
     - python
     - cython >=3.0.3
     - scikit-build-core >=0.7.0
@@ -78,7 +77,6 @@ requirements:
     {% endif %}
     - cuda-version ={{ cuda_version }}
   run:
-    - protobuf >=3.20,<5.0a0
     - python
     - typing_extensions >=4.0.0
     - pandas >=2.0,<2.2.3dev0
diff --git a/cpp/include/cudf/io/orc_metadata.hpp b/cpp/include/cudf/io/orc_metadata.hpp
index 8f3eb1dff3c..35196a19349 100644
--- a/cpp/include/cudf/io/orc_metadata.hpp
+++ b/cpp/include/cudf/io/orc_metadata.hpp
@@ -154,6 +154,21 @@ struct timestamp_statistics : minmax_statistics<int64_t> {
   std::optional<uint32_t> maximum_nanos;  ///< nanoseconds part of the maximum
 };
 
+/**
+ * @brief Variant type for ORC type-specific column statistics.
+ *
+ * The variant can hold any of the supported column statistics types.
+ */
+using statistics_type = std::variant<no_statistics,
+                                     integer_statistics,
+                                     double_statistics,
+                                     string_statistics,
+                                     bucket_statistics,
+                                     decimal_statistics,
+                                     date_statistics,
+                                     binary_statistics,
+                                     timestamp_statistics>;
+
 //! Orc I/O interfaces
 namespace orc {
 // forward declare the type that ProtobufReader uses. The `cudf::io::column_statistics` objects,
@@ -171,16 +186,7 @@ struct column_statistics;
 struct column_statistics {
   std::optional<uint64_t> number_of_values;  ///< number of statistics
   std::optional<bool> has_null;              ///< column has any nulls
-  std::variant<no_statistics,
-               integer_statistics,
-               double_statistics,
-               string_statistics,
-               bucket_statistics,
-               decimal_statistics,
-               date_statistics,
-               binary_statistics,
-               timestamp_statistics>
-    type_specific_stats;  ///< type-specific statistics
+  statistics_type type_specific_stats;       ///< type-specific statistics
 
   /**
    * @brief Construct a new column statistics object
diff --git a/dependencies.yaml b/dependencies.yaml
index 147a89076c4..2ed2525fc1e 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -286,13 +286,9 @@ dependencies:
       - output_types: conda
         packages:
           - &rmm_conda rmm==24.6.*
-          - &protobuf protobuf>=3.20,<5
           - pip
           - pip:
               - git+https://github.com/python-streamz/streamz.git@master
-      - output_types: [requirements, pyproject]
-        packages:
-          - protoc-wheel
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
@@ -525,7 +521,6 @@ dependencies:
           - packaging
           - rich
           - typing_extensions>=4.0.0
-          - *protobuf
       - output_types: conda
         packages:
           - *rmm_conda
diff --git a/pyproject.toml b/pyproject.toml
index 797b5374cb6..d343b237ee7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,7 +8,6 @@ follow_imports = "skip"
 exclude = [
     "cudf/_lib/",
     "cudf/cudf/tests/",
-    "cudf/cudf/utils/metadata/orc_column_statistics_pb2.py",
     "custreamz/custreamz/tests/",
     "dask_cudf/dask_cudf/tests/",
  ]
diff --git a/python/cudf/CMakeLists.txt b/python/cudf/CMakeLists.txt
index 23edbbc636c..ecadbf5cbbc 100644
--- a/python/cudf/CMakeLists.txt
+++ b/python/cudf/CMakeLists.txt
@@ -96,9 +96,6 @@ include(cmake/Modules/LinkPyarrowHeaders.cmake)
 add_subdirectory(cudf/_lib)
 add_subdirectory(udf_cpp)
 
-include(cmake/Modules/ProtobufHelpers.cmake)
-codegen_protoc(cudf/utils/metadata/orc_column_statistics.proto)
-
 if(DEFINED cython_lib_dir)
   rapids_cython_add_rpath_entries(TARGET cudf PATHS "${cython_lib_dir}")
 endif()
diff --git a/python/cudf/cmake/Modules/ProtobufHelpers.cmake b/python/cudf/cmake/Modules/ProtobufHelpers.cmake
deleted file mode 100644
index 70b8879cf18..00000000000
--- a/python/cudf/cmake/Modules/ProtobufHelpers.cmake
+++ /dev/null
@@ -1,50 +0,0 @@
-# =============================================================================
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
-# in compliance with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License
-# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
-# or implied. See the License for the specific language governing permissions and limitations under
-# the License.
-# =============================================================================
-include_guard(GLOBAL)
-
-# Compile protobuf files to Python. All arguments are assumed to be .proto files.
-function(codegen_protoc)
-  # Allow user to provide path to protoc executable as an environment variable.
-  if(DEFINED ENV{PROTOC})
-    set(protoc_COMMAND $ENV{PROTOC})
-  else()
-    find_program(protoc_COMMAND protoc REQUIRED)
-  endif()
-
-  foreach(_proto_path IN LISTS ARGV)
-    string(REPLACE "\.proto" "_pb2\.py" pb2_py_path "${_proto_path}")
-    set(pb2_py_path "${CMAKE_CURRENT_SOURCE_DIR}/${pb2_py_path}")
-    # Note: If we ever need to process larger numbers of protobuf files we should consider switching
-    # to protobuf_generate_python from the FindProtobuf module.
-    execute_process(
-      COMMAND ${protoc_COMMAND} --python_out=. "${_proto_path}"
-      WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} COMMAND_ERROR_IS_FATAL ANY
-    )
-    # Mark entire file to skip formatting.
-    file(READ "${pb2_py_path}" pb2_py)
-    file(
-      WRITE "${pb2_py_path}"
-      [=[
-# fmt: off
-]=]
-    )
-    file(APPEND "${pb2_py_path}" "${pb2_py}")
-    file(
-      APPEND "${pb2_py_path}"
-      [=[
-# fmt: on
-]=]
-    )
-  endforeach()
-endfunction()
diff --git a/python/cudf/cudf/_lib/cpp/io/orc_metadata.pxd b/python/cudf/cudf/_lib/cpp/io/orc_metadata.pxd
index 57be1b1c90c..aad4f1c6870 100644
--- a/python/cudf/cudf/_lib/cpp/io/orc_metadata.pxd
+++ b/python/cudf/cudf/_lib/cpp/io/orc_metadata.pxd
@@ -1,19 +1,73 @@
-# Copyright (c) 2020-2021, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
+from libc.stdint cimport int32_t, int64_t, uint32_t, uint64_t
+from libcpp cimport bool
+from libcpp.optional cimport optional
 from libcpp.string cimport string
 from libcpp.vector cimport vector
 
 cimport cudf._lib.cpp.io.types as cudf_io_types
+from cudf._lib.variant cimport monostate, variant
 
 
 cdef extern from "cudf/io/orc_metadata.hpp" \
         namespace "cudf::io" nogil:
 
-    cdef cppclass raw_orc_statistics:
+    ctypedef monostate no_statistics
+
+    cdef cppclass minmax_statistics[T]:
+        optional[T] minimum
+        optional[T] maximum
+
+    cdef cppclass sum_statistics[T]:
+        optional[T] sum
+
+    cdef cppclass integer_statistics(
+        minmax_statistics[int64_t], sum_statistics[int64_t]
+    ):
+        pass
+
+    cdef cppclass double_statistics(
+        minmax_statistics[double], sum_statistics[double]
+    ):
+        pass
+
+    cdef cppclass string_statistics(
+        minmax_statistics[string], sum_statistics[int64_t]
+    ):
+        pass
+
+    cdef cppclass bucket_statistics:
+        vector[int64_t] count
+
+    cdef cppclass decimal_statistics(
+        minmax_statistics[string], sum_statistics[string]
+    ):
+        pass
+
+    ctypedef minmax_statistics[int32_t] date_statistics
+
+    ctypedef sum_statistics[int64_t] binary_statistics
+
+    cdef cppclass timestamp_statistics(minmax_statistics[int64_t]):
+        optional[int64_t] minimum_utc
+        optional[int64_t] maximum_utc
+        optional[uint32_t] minimum_nanos
+        optional[uint32_t] maximum_nanos
+
+    # This is a std::variant of all the statistics types
+    ctypedef variant statistics_type
+
+    cdef cppclass column_statistics:
+        optional[uint64_t] number_of_values
+        optional[bool] has_null
+        statistics_type type_specific_stats
+
+    cdef cppclass parsed_orc_statistics:
         vector[string] column_names
-        vector[string] file_stats
-        vector[vector[string]] stripes_stats
+        vector[column_statistics] file_stats
+        vector[vector[column_statistics]] stripes_stats
 
-    cdef raw_orc_statistics read_raw_orc_statistics(
+    cdef parsed_orc_statistics read_parsed_orc_statistics(
         cudf_io_types.source_info src_info
     ) except +
diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx
index 3fc9823b914..836880a6f2c 100644
--- a/python/cudf/cudf/_lib/orc.pyx
+++ b/python/cudf/cudf/_lib/orc.pyx
@@ -11,6 +11,7 @@ from libcpp.string cimport string
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
 
+import datetime
 from collections import OrderedDict
 
 cimport cudf._lib.cpp.lists.lists_column_view as cpp_lists_column_view
@@ -32,8 +33,19 @@ from cudf._lib.cpp.io.orc cimport (
     write_orc as libcudf_write_orc,
 )
 from cudf._lib.cpp.io.orc_metadata cimport (
-    raw_orc_statistics,
-    read_raw_orc_statistics as libcudf_read_raw_orc_statistics,
+    binary_statistics,
+    bucket_statistics,
+    column_statistics,
+    date_statistics,
+    decimal_statistics,
+    double_statistics,
+    integer_statistics,
+    no_statistics,
+    parsed_orc_statistics,
+    read_parsed_orc_statistics as libcudf_read_parsed_orc_statistics,
+    statistics_type,
+    string_statistics,
+    timestamp_statistics,
 )
 from cudf._lib.cpp.io.types cimport (
     column_in_metadata,
@@ -51,6 +63,7 @@ from cudf._lib.io.utils cimport (
     make_source_info,
     update_column_struct_field_names,
 )
+from cudf._lib.variant cimport get_if as std_get_if, holds_alternative
 
 from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES
 
@@ -62,9 +75,128 @@ from pyarrow.lib import NativeFile
 from cudf._lib.utils import _index_level_name, generate_pandas_metadata
 
 
-cpdef read_raw_orc_statistics(filepath_or_buffer):
+cdef _parse_column_type_statistics(column_statistics stats):
+    # Initialize stats to return and parse stats blob
+    column_stats = {}
+
+    if stats.number_of_values.has_value():
+        column_stats["number_of_values"] = stats.number_of_values.value()
+
+    if stats.has_null.has_value():
+        column_stats["has_null"] = stats.has_null.value()
+
+    cdef statistics_type type_specific_stats = stats.type_specific_stats
+
+    cdef integer_statistics* int_stats
+    cdef double_statistics* dbl_stats
+    cdef string_statistics* str_stats
+    cdef bucket_statistics* bucket_stats
+    cdef decimal_statistics* dec_stats
+    cdef date_statistics* date_stats
+    cdef binary_statistics* bin_stats
+    cdef timestamp_statistics* ts_stats
+
+    if holds_alternative[no_statistics](type_specific_stats):
+        return column_stats
+    elif int_stats := std_get_if[integer_statistics](&type_specific_stats):
+        if int_stats.minimum.has_value():
+            column_stats["minimum"] = int_stats.minimum.value()
+        else:
+            column_stats["minimum"] = None
+        if int_stats.maximum.has_value():
+            column_stats["maximum"] = int_stats.maximum.value()
+        else:
+            column_stats["maximum"] = None
+        if int_stats.sum.has_value():
+            column_stats["sum"] = int_stats.sum.value()
+        else:
+            column_stats["sum"] = None
+    elif dbl_stats := std_get_if[double_statistics](&type_specific_stats):
+        if dbl_stats.minimum.has_value():
+            column_stats["minimum"] = dbl_stats.minimum.value()
+        else:
+            column_stats["minimum"] = None
+        if dbl_stats.maximum.has_value():
+            column_stats["maximum"] = dbl_stats.maximum.value()
+        else:
+            column_stats["maximum"] = None
+        if dbl_stats.sum.has_value():
+            column_stats["sum"] = dbl_stats.sum.value()
+        else:
+            column_stats["sum"] = None
+    elif str_stats := std_get_if[string_statistics](&type_specific_stats):
+        if str_stats.minimum.has_value():
+            column_stats["minimum"] = str_stats.minimum.value().decode("utf-8")
+        else:
+            column_stats["minimum"] = None
+        if str_stats.maximum.has_value():
+            column_stats["maximum"] = str_stats.maximum.value().decode("utf-8")
+        else:
+            column_stats["maximum"] = None
+        if str_stats.sum.has_value():
+            column_stats["sum"] = str_stats.sum.value()
+        else:
+            column_stats["sum"] = None
+    elif bucket_stats := std_get_if[bucket_statistics](&type_specific_stats):
+        column_stats["true_count"] = bucket_stats.count[0]
+        column_stats["false_count"] = (
+            column_stats["number_of_values"]
+            - column_stats["true_count"]
+        )
+    elif dec_stats := std_get_if[decimal_statistics](&type_specific_stats):
+        if dec_stats.minimum.has_value():
+            column_stats["minimum"] = dec_stats.minimum.value().decode("utf-8")
+        else:
+            column_stats["minimum"] = None
+        if dec_stats.maximum.has_value():
+            column_stats["maximum"] = dec_stats.maximum.value().decode("utf-8")
+        else:
+            column_stats["maximum"] = None
+        if dec_stats.sum.has_value():
+            column_stats["sum"] = dec_stats.sum.value().decode("utf-8")
+        else:
+            column_stats["sum"] = None
+    elif date_stats := std_get_if[date_statistics](&type_specific_stats):
+        if date_stats.minimum.has_value():
+            column_stats["minimum"] = datetime.datetime.fromtimestamp(
+                datetime.timedelta(date_stats.minimum.value()).total_seconds(),
+                datetime.timezone.utc,
+            )
+        else:
+            column_stats["minimum"] = None
+        if date_stats.maximum.has_value():
+            column_stats["maximum"] = datetime.datetime.fromtimestamp(
+                datetime.timedelta(date_stats.maximum.value()).total_seconds(),
+                datetime.timezone.utc,
+            )
+        else:
+            column_stats["maximum"] = None
+    elif bin_stats := std_get_if[binary_statistics](&type_specific_stats):
+        if bin_stats.sum.has_value():
+            column_stats["sum"] = bin_stats.sum.value()
+        else:
+            column_stats["sum"] = None
+    elif ts_stats := std_get_if[timestamp_statistics](&type_specific_stats):
+        # Before ORC-135, the local timezone offset was included and they were
+        # stored as minimum and maximum. After ORC-135, the timestamp is
+        # adjusted to UTC before being converted to milliseconds and stored
+        # in minimumUtc and maximumUtc.
+        # TODO: Support minimum and maximum by reading writer's local timezone
+        if ts_stats.minimum_utc.has_value() and ts_stats.maximum_utc.has_value():
+            column_stats["minimum"] = datetime.datetime.fromtimestamp(
+                ts_stats.minimum_utc.value() / 1000, datetime.timezone.utc
+            )
+            column_stats["maximum"] = datetime.datetime.fromtimestamp(
+                ts_stats.maximum_utc.value() / 1000, datetime.timezone.utc
+            )
+    else:
+        raise ValueError("Unsupported statistics type")
+    return column_stats
+
+
+cpdef read_parsed_orc_statistics(filepath_or_buffer):
     """
-    Cython function to call into libcudf API, see `read_raw_orc_statistics`.
+    Cython function to call into libcudf API, see `read_parsed_orc_statistics`.
 
     See Also
     --------
@@ -75,10 +207,25 @@ cpdef read_raw_orc_statistics(filepath_or_buffer):
     if isinstance(filepath_or_buffer, NativeFile):
         filepath_or_buffer = NativeFileDatasource(filepath_or_buffer)
 
-    cdef raw_orc_statistics raw = (
-        libcudf_read_raw_orc_statistics(make_source_info([filepath_or_buffer]))
+    cdef parsed_orc_statistics parsed = (
+        libcudf_read_parsed_orc_statistics(make_source_info([filepath_or_buffer]))
     )
-    return (raw.column_names, raw.file_stats, raw.stripes_stats)
+
+    cdef vector[column_statistics] file_stats = parsed.file_stats
+    cdef vector[vector[column_statistics]] stripes_stats = parsed.stripes_stats
+
+    parsed_file_stats = [
+        _parse_column_type_statistics(file_stats[column_index])
+        for column_index in range(file_stats.size())
+    ]
+
+    parsed_stripes_stats = [
+        [_parse_column_type_statistics(stripes_stats[stripe_index][column_index])
+         for column_index in range(stripes_stats[stripe_index].size())]
+        for stripe_index in range(stripes_stats.size())
+    ]
+
+    return parsed.column_names, parsed_file_stats, parsed_stripes_stats
 
 
 cpdef read_orc(object filepaths_or_buffers,
diff --git a/python/cudf/cudf/_lib/variant.pxd b/python/cudf/cudf/_lib/variant.pxd
new file mode 100644
index 00000000000..f686bf18bf7
--- /dev/null
+++ b/python/cudf/cudf/_lib/variant.pxd
@@ -0,0 +1,15 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp cimport bool
+
+
+cdef extern from "<variant>" namespace "std" nogil:
+    cdef cppclass variant:
+        variant& operator=(variant&)
+        size_t index()
+
+    cdef cppclass monostate:
+        pass
+
+    cdef T* get_if[T](...)
+    cdef bool holds_alternative[T](...)
diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py
index d135a31438e..7082a85237a 100644
--- a/python/cudf/cudf/io/orc.py
+++ b/python/cudf/cudf/io/orc.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 
 import datetime
 import warnings
@@ -10,9 +10,6 @@
 from cudf._lib import orc as liborc
 from cudf.api.types import is_list_like
 from cudf.utils import ioutils
-from cudf.utils.metadata import (  # type: ignore
-    orc_column_statistics_pb2 as cs_pb2,
-)
 
 
 def _make_empty_df(filepath_or_buffer, columns):
@@ -173,45 +170,38 @@ def read_orc_statistics(
     files_statistics = []
     stripes_statistics = []
     for source in filepaths_or_buffers:
-        path_or_buf, compression = ioutils.get_reader_filepath_or_buffer(
+        path_or_buf, _ = ioutils.get_reader_filepath_or_buffer(
             path_or_data=source, compression=None, **kwargs
         )
-        if compression is not None:
-            ValueError("URL content-encoding decompression is not supported")
-
-        # Read in statistics and unpack
         (
             column_names,
-            raw_file_statistics,
-            raw_stripes_statistics,
-        ) = liborc.read_raw_orc_statistics(path_or_buf)
+            parsed_file_statistics,
+            parsed_stripes_statistics,
+        ) = liborc.read_parsed_orc_statistics(path_or_buf)
 
         # Parse column names
         column_names = [
             column_name.decode("utf-8") for column_name in column_names
         ]
 
-        # Parse statistics
-        cs = cs_pb2.ColumnStatistics()
-
+        # Parse file statistics
         file_statistics = {
-            column_names[i]: _parse_column_statistics(cs, raw_file_stats)
-            for i, raw_file_stats in enumerate(raw_file_statistics)
-            if columns is None or column_names[i] in columns
+            column_name: column_stats
+            for column_name, column_stats in zip(
+                column_names, parsed_file_statistics
+            )
+            if columns is None or column_name in columns
         }
-        if any(
-            not parsed_statistics
-            for parsed_statistics in file_statistics.values()
-        ):
-            continue
-        else:
-            files_statistics.append(file_statistics)
+        files_statistics.append(file_statistics)
 
-        for raw_stripe_statistics in raw_stripes_statistics:
+        # Parse stripe statistics
+        for parsed_stripe_statistics in parsed_stripes_statistics:
             stripe_statistics = {
-                column_names[i]: _parse_column_statistics(cs, raw_file_stats)
-                for i, raw_file_stats in enumerate(raw_stripe_statistics)
-                if columns is None or column_names[i] in columns
+                column_name: column_stats
+                for column_name, column_stats in zip(
+                    column_names, parsed_stripe_statistics
+                )
+                if columns is None or column_name in columns
             }
             if any(
                 not parsed_statistics
diff --git a/python/cudf/cudf/utils/metadata/__init__.py b/python/cudf/cudf/utils/metadata/__init__.py
deleted file mode 100644
index ccbb16256fb..00000000000
--- a/python/cudf/cudf/utils/metadata/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
diff --git a/python/cudf/cudf/utils/metadata/orc_column_statistics.proto b/python/cudf/cudf/utils/metadata/orc_column_statistics.proto
deleted file mode 100644
index 1bc0fa6f6bd..00000000000
--- a/python/cudf/cudf/utils/metadata/orc_column_statistics.proto
+++ /dev/null
@@ -1,62 +0,0 @@
-syntax = "proto2";
-
-message IntegerStatistics  {
-  optional sint64 minimum = 1;
-  optional sint64 maximum = 2;
-  optional sint64 sum = 3;
-}
-
-message DoubleStatistics {
-  optional double minimum = 1;
-  optional double maximum = 2;
-  optional double sum = 3;
-}
-
-message StringStatistics {
-  optional string minimum = 1;
-  optional string maximum = 2;
-  // sum will store the total length of all strings in a stripe
-  optional sint64 sum = 3;
-}
-
-message BucketStatistics {
-  repeated uint64 count = 1 [packed=true];
-}
-
-message DecimalStatistics {
-  optional string minimum = 1;
-  optional string maximum = 2;
-  optional string sum = 3;
-}
-
-message DateStatistics {
-  // min,max values saved as days since epoch
-  optional sint32 minimum = 1;
-  optional sint32 maximum = 2;
-}
-
-message TimestampStatistics {
-  // min,max values saved as milliseconds since epoch
-  optional sint64 minimum = 1;
-  optional sint64 maximum = 2;
-  optional sint64 minimumUtc = 3;
-  optional sint64 maximumUtc = 4;
-}
-
-message BinaryStatistics {
-  // sum will store the total binary blob length in a stripe
-  optional sint64 sum = 1;
-}
-
-message ColumnStatistics {
-  optional uint64 numberOfValues = 1;
-  optional IntegerStatistics intStatistics = 2;
-  optional DoubleStatistics doubleStatistics = 3;
-  optional StringStatistics stringStatistics = 4;
-  optional BucketStatistics bucketStatistics = 5;
-  optional DecimalStatistics decimalStatistics = 6;
-  optional DateStatistics dateStatistics = 7;
-  optional BinaryStatistics binaryStatistics = 8;
-  optional TimestampStatistics timestampStatistics = 9;
-  optional bool hasNull = 10;
-}
diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml
index adab199dcf4..fc3a243572f 100644
--- a/python/cudf/pyproject.toml
+++ b/python/cudf/pyproject.toml
@@ -7,7 +7,6 @@ requires = [
     "cython>=3.0.3",
     "ninja",
     "numpy==1.23.*",
-    "protoc-wheel",
     "pyarrow==14.0.2.*",
     "rmm==24.6.*",
     "scikit-build-core[pyproject]>=0.7.0",
@@ -34,7 +33,6 @@ dependencies = [
     "nvtx>=0.2.1",
     "packaging",
     "pandas>=2.0,<2.2.3dev0",
-    "protobuf>=3.20,<5",
     "ptxcompiler",
     "pyarrow>=14.0.1,<15.0.0a0",
     "rich",

From 96903bb99476f876bbcadbdb50f8a9e9b80eeff4 Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Sat, 20 Apr 2024 02:23:32 +0200
Subject: [PATCH 090/842] Unify Copy-On-Write and Spilling (#15436)

This is the final step to unify COW and spilling. Now, `SpillableBuffer` inherits from `ExposureTrackedBuffer` so the final class hierarchy becomes:
```
SpillableBufferOwner -> BufferOwner
SpillableBuffer -> ExposureTrackedBuffer -> Buffer
```

Additionally, spill-on-demand is now set globally using `set_spill_on_demand_globally()` instead of in the `SpillManager` constructor.

Authors:
  - Mads R. B. Kristensen (https://github.com/madsbk)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15436
---
 python/cudf/cudf/core/buffer/buffer.py        |  69 +++++----
 .../core/buffer/exposure_tracked_buffer.py    |  18 +--
 python/cudf/cudf/core/buffer/spill_manager.py | 101 +++++++++----
 .../cudf/cudf/core/buffer/spillable_buffer.py |  59 ++++----
 python/cudf/cudf/core/buffer/utils.py         |   4 +-
 python/cudf/cudf/options.py                   |  15 +-
 python/cudf/cudf/tests/test_copying.py        |  13 +-
 python/cudf/cudf/tests/test_spilling.py       | 140 +++++++++++++++---
 8 files changed, 284 insertions(+), 135 deletions(-)

diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py
index 1631fa00412..b2aba4f978b 100644
--- a/python/cudf/cudf/core/buffer/buffer.py
+++ b/python/cudf/cudf/core/buffer/buffer.py
@@ -106,8 +106,25 @@ class BufferOwner(Serializable):
     been accessed outside of BufferOwner. In this case, we have no control
     over knowing if the data is being modified by a third party.
 
-    Use `_from_device_memory` and `_from_host_memory` to create
+    Use `from_device_memory` and `from_host_memory` to create
     a new instance from either device or host memory respectively.
+
+    Parameters
+    ----------
+    ptr
+        An integer representing a pointer to memory.
+    size
+        The size of the memory in nbytes
+    owner
+        Python object to which the lifetime of the memory allocation is tied.
+        This buffer will keep a reference to `owner`.
+    exposed
+        Pointer to the underlying memory
+
+    Raises
+    ------
+    ValueError
+        If size is negative
     """
 
     _ptr: int
@@ -117,14 +134,25 @@ class BufferOwner(Serializable):
     # The set of buffers that point to this owner.
     _slices: weakref.WeakSet[Buffer]
 
-    def __init__(self):
-        raise ValueError(
-            f"do not create a {self.__class__} directly, please "
-            "use the factory function `cudf.core.buffer.as_buffer`"
-        )
+    def __init__(
+        self,
+        *,
+        ptr: int,
+        size: int,
+        owner: object,
+        exposed: bool,
+    ):
+        if size < 0:
+            raise ValueError("size cannot be negative")
+
+        self._ptr = ptr
+        self._size = size
+        self._owner = owner
+        self._exposed = exposed
+        self._slices = weakref.WeakSet()
 
     @classmethod
-    def _from_device_memory(cls, data: Any, exposed: bool) -> Self:
+    def from_device_memory(cls, data: Any, exposed: bool) -> Self:
         """Create from an object providing a `__cuda_array_interface__`.
 
         No data is being copied.
@@ -151,24 +179,15 @@ def _from_device_memory(cls, data: Any, exposed: bool) -> Self:
             If the resulting buffer has negative size
         """
 
-        # Bypass `__init__` and initialize attributes manually
-        ret = cls.__new__(cls)
-        ret._owner = data
-        ret._exposed = exposed
-        ret._slices = weakref.WeakSet()
         if isinstance(data, rmm.DeviceBuffer):  # Common case shortcut
-            ret._ptr = data.ptr
-            ret._size = data.size
+            ptr = data.ptr
+            size = data.size
         else:
-            ret._ptr, ret._size = get_ptr_and_size(
-                data.__cuda_array_interface__
-            )
-        if ret.size < 0:
-            raise ValueError("size cannot be negative")
-        return ret
+            ptr, size = get_ptr_and_size(data.__cuda_array_interface__)
+        return cls(ptr=ptr, size=size, owner=data, exposed=exposed)
 
     @classmethod
-    def _from_host_memory(cls, data: Any) -> Self:
+    def from_host_memory(cls, data: Any) -> Self:
         """Create an owner from a buffer or array like object
 
         Data must implement `__array_interface__`, the buffer protocol, and/or
@@ -196,7 +215,7 @@ def _from_host_memory(cls, data: Any) -> Self:
         # Copy to device memory
         buf = rmm.DeviceBuffer(ptr=ptr, size=size)
         # Create from device memory
-        return cls._from_device_memory(buf, exposed=False)
+        return cls.from_device_memory(buf, exposed=False)
 
     @property
     def size(self) -> int:
@@ -375,7 +394,7 @@ def copy(self, deep: bool = True) -> Self:
             )
 
         # Otherwise, we create a new copy of the memory
-        owner = self._owner._from_device_memory(
+        owner = self._owner.from_device_memory(
             rmm.DeviceBuffer(
                 ptr=self._owner.get_ptr(mode="read") + self._offset,
                 size=self.size,
@@ -439,9 +458,9 @@ def deserialize(cls, header: dict, frames: list) -> Self:
 
         owner_type: BufferOwner = pickle.loads(header["owner-type-serialized"])
         if hasattr(frame, "__cuda_array_interface__"):
-            owner = owner_type._from_device_memory(frame, exposed=False)
+            owner = owner_type.from_device_memory(frame, exposed=False)
         else:
-            owner = owner_type._from_host_memory(frame)
+            owner = owner_type.from_host_memory(frame)
         return cls(
             owner=owner,
             offset=0,
diff --git a/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py b/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py
index 4c08016adbb..15f00fc670d 100644
--- a/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py
+++ b/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py
@@ -23,8 +23,6 @@ class ExposureTrackedBuffer(Buffer):
         The size of the slice (in bytes)
     """
 
-    _owner: BufferOwner
-
     def __init__(
         self,
         owner: BufferOwner,
@@ -32,11 +30,7 @@ def __init__(
         size: Optional[int] = None,
     ) -> None:
         super().__init__(owner=owner, offset=offset, size=size)
-        self._owner._slices.add(self)
-
-    @property
-    def exposed(self) -> bool:
-        return self._owner.exposed
+        self.owner._slices.add(self)
 
     def get_ptr(self, *, mode: Literal["read", "write"]) -> int:
         if mode == "write" and cudf.get_option("copy_on_write"):
@@ -72,7 +66,7 @@ def copy(self, deep: bool = True) -> Self:
             copy-on-write option (see above).
         """
         if cudf.get_option("copy_on_write"):
-            return super().copy(deep=deep or self.exposed)
+            return super().copy(deep=deep or self.owner.exposed)
         return super().copy(deep=deep)
 
     @property
@@ -98,11 +92,11 @@ def make_single_owner_inplace(self) -> None:
             Buffer representing the same device memory as `data`
         """
 
-        if len(self._owner._slices) > 1:
-            # If this is not the only slice pointing to `self._owner`, we
-            # point to a new deep copy of the owner.
+        if len(self.owner._slices) > 1:
+            # If this is not the only slice pointing to `self.owner`, we
+            # point to a new copy of our slice of `self.owner`.
             t = self.copy(deep=True)
-            self._owner = t._owner
+            self._owner = t.owner
             self._offset = t._offset
             self._size = t._size
             self._owner._slices.add(self)
diff --git a/python/cudf/cudf/core/buffer/spill_manager.py b/python/cudf/cudf/core/buffer/spill_manager.py
index 3e654e01401..cd81149bdb8 100644
--- a/python/cudf/cudf/core/buffer/spill_manager.py
+++ b/python/cudf/cudf/core/buffer/spill_manager.py
@@ -10,6 +10,7 @@
 import warnings
 import weakref
 from collections import defaultdict
+from contextlib import contextmanager
 from dataclasses import dataclass
 from functools import partial
 from typing import Dict, List, Optional, Tuple
@@ -201,10 +202,6 @@ class SpillManager:
     This class implements tracking of all known spillable buffers, on-demand
     spilling of said buffers, and (optionally) maintains a memory usage limit.
 
-    When `spill_on_demand=True`, the manager registers an RMM out-of-memory
-    error handler, which will spill spillable buffers in order to free up
-    memory.
-
     When `device_memory_limit=<limit-in-bytes>`, the manager will try keep
     the device memory usage below the specified limit by spilling of spillable
     buffers continuously, which will introduce a modest overhead.
@@ -213,8 +210,6 @@ class SpillManager:
 
     Parameters
     ----------
-    spill_on_demand : bool
-        Enable spill on demand.
     device_memory_limit: int, optional
         If not None, this is the device memory limit in bytes that triggers
         device to host spilling. The global manager sets this to the value
@@ -230,30 +225,15 @@ class SpillManager:
     def __init__(
         self,
         *,
-        spill_on_demand: bool = False,
         device_memory_limit: Optional[int] = None,
         statistic_level: int = 0,
     ) -> None:
         self._lock = threading.Lock()
         self._buffers = weakref.WeakValueDictionary()
         self._id_counter = 0
-        self._spill_on_demand = spill_on_demand
         self._device_memory_limit = device_memory_limit
         self.statistics = SpillStatistics(statistic_level)
 
-        if self._spill_on_demand:
-            # Set the RMM out-of-memory handle if not already set
-            mr = rmm.mr.get_current_device_resource()
-            if all(
-                not isinstance(m, rmm.mr.FailureCallbackResourceAdaptor)
-                for m in get_rmm_memory_resource_stack(mr)
-            ):
-                rmm.mr.set_current_device_resource(
-                    rmm.mr.FailureCallbackResourceAdaptor(
-                        mr, self._out_of_memory_handle
-                    )
-                )
-
     def _out_of_memory_handle(self, nbytes: int, *, retry_once=True) -> bool:
         """Try to handle an out-of-memory error by spilling
 
@@ -408,8 +388,7 @@ def __repr__(self) -> str:
             dev_limit = format_bytes(self._device_memory_limit)
 
         return (
-            f"<SpillManager spill_on_demand={self._spill_on_demand} "
-            f"device_memory_limit={dev_limit} | "
+            f"<SpillManager device_memory_limit={dev_limit} | "
             f"{format_bytes(spilled)} spilled | "
             f"{format_bytes(unspilled)} ({unspillable_ratio:.0%}) "
             f"unspilled (unspillable)>"
@@ -442,12 +421,82 @@ def get_global_manager() -> Optional[SpillManager]:
     """Get the global manager or None if spilling is disabled"""
     global _global_manager_uninitialized
     if _global_manager_uninitialized:
-        manager = None
         if get_option("spill"):
             manager = SpillManager(
-                spill_on_demand=get_option("spill_on_demand"),
                 device_memory_limit=get_option("spill_device_limit"),
                 statistic_level=get_option("spill_stats"),
             )
-        set_global_manager(manager)
+            set_global_manager(manager)
+            if get_option("spill_on_demand"):
+                set_spill_on_demand_globally()
+        else:
+            set_global_manager(None)
     return _global_manager
+
+
+def set_spill_on_demand_globally() -> None:
+    """Enable spill on demand in the current global spill manager.
+
+    Warning: this modifies the current RMM memory resource. A memory resource
+    to handle out-of-memory errors is pushed onto the RMM memory resource stack.
+
+    Raises
+    ------
+    ValueError
+        If no global spill manager exists (spilling is disabled).
+    ValueError
+        If a failure callback resource is already in the resource stack.
+    """
+
+    manager = get_global_manager()
+    if manager is None:
+        raise ValueError(
+            "Cannot enable spill on demand with no global spill manager"
+        )
+    mr = rmm.mr.get_current_device_resource()
+    if any(
+        isinstance(m, rmm.mr.FailureCallbackResourceAdaptor)
+        for m in get_rmm_memory_resource_stack(mr)
+    ):
+        raise ValueError(
+            "Spill on demand (or another failure callback resource) "
+            "is already registered"
+        )
+    rmm.mr.set_current_device_resource(
+        rmm.mr.FailureCallbackResourceAdaptor(
+            mr, manager._out_of_memory_handle
+        )
+    )
+
+
+@contextmanager
+def spill_on_demand_globally():
+    """Context to enable spill on demand temporarily.
+
+    Warning: this modifies the current RMM memory resource. A memory resource
+    to handle out-of-memory errors is pushed onto the RMM memory resource stack
+    when entering the context and popped again when exiting.
+
+    Raises
+    ------
+    ValueError
+        If no global spill manager exists (spilling is disabled).
+    ValueError
+        If a failure callback resource is already in the resource stack.
+    ValueError
+        If the RMM memory source stack was changed while in the context.
+    """
+    set_spill_on_demand_globally()
+    # Save the new memory resource stack for later cleanup
+    mr_stack = get_rmm_memory_resource_stack(
+        rmm.mr.get_current_device_resource()
+    )
+    try:
+        yield
+    finally:
+        mr = rmm.mr.get_current_device_resource()
+        if mr_stack != get_rmm_memory_resource_stack(mr):
+            raise ValueError(
+                "RMM memory source stack was changed while in the context"
+            )
+        rmm.mr.set_current_device_resource(mr_stack[1])
diff --git a/python/cudf/cudf/core/buffer/spillable_buffer.py b/python/cudf/cudf/core/buffer/spillable_buffer.py
index a9569190e75..a1af3ba8c9d 100644
--- a/python/cudf/cudf/core/buffer/spillable_buffer.py
+++ b/python/cudf/cudf/core/buffer/spillable_buffer.py
@@ -20,6 +20,7 @@
     cuda_array_interface_wrapper,
     host_memory_allocation,
 )
+from cudf.core.buffer.exposure_tracked_buffer import ExposureTrackedBuffer
 from cudf.utils.nvtx_annotation import _get_color_for_nvtx, annotate
 from cudf.utils.string import format_bytes
 
@@ -93,8 +94,8 @@ class SpillableBufferOwner(BufferOwner):
     def _finalize_init(self, ptr_desc: Dict[str, Any]) -> None:
         """Finish initialization of the spillable buffer
 
-        This implements the common initialization that `_from_device_memory`
-        and `_from_host_memory` are missing.
+        This implements the common initialization that `from_device_memory`
+        and `from_host_memory` are missing.
 
         Parameters
         ----------
@@ -119,7 +120,7 @@ def _finalize_init(self, ptr_desc: Dict[str, Any]) -> None:
         self._manager.add(self)
 
     @classmethod
-    def _from_device_memory(cls, data: Any, exposed: bool) -> Self:
+    def from_device_memory(cls, data: Any, exposed: bool) -> Self:
         """Create a spillabe buffer from device memory.
 
         No data is being copied.
@@ -136,12 +137,12 @@ def _from_device_memory(cls, data: Any, exposed: bool) -> Self:
         SpillableBufferOwner
             Buffer representing the same device memory as `data`
         """
-        ret = super()._from_device_memory(data, exposed=exposed)
+        ret = super().from_device_memory(data, exposed=exposed)
         ret._finalize_init(ptr_desc={"type": "gpu"})
         return ret
 
     @classmethod
-    def _from_host_memory(cls, data: Any) -> Self:
+    def from_host_memory(cls, data: Any) -> Self:
         """Create a spillabe buffer from host memory.
 
         Data must implement `__array_interface__`, the buffer protocol, and/or
@@ -170,11 +171,7 @@ def _from_host_memory(cls, data: Any) -> Self:
         data = data.cast("B")  # Make sure itemsize==1
 
         # Create an already spilled buffer
-        ret = cls.__new__(cls)
-        ret._owner = None
-        ret._ptr = 0
-        ret._size = data.nbytes
-        ret._exposed = False
+        ret = cls(ptr=0, size=data.nbytes, owner=None, exposed=False)
         ret._finalize_init(ptr_desc={"type": "cpu", "memoryview": data})
         return ret
 
@@ -372,21 +369,8 @@ def __str__(self) -> str:
         )
 
 
-class SpillableBuffer(Buffer):
-    """A slice of a spillable buffer
-
-    This buffer applies the slicing and then delegates all
-    operations to its owning buffer.
-
-    Parameters
-    ----------
-    owner : SpillableBufferOwner
-        The owner of the view
-    offset : int
-        Memory offset into the owning buffer
-    size : int
-        Size of the view (in bytes)
-    """
+class SpillableBuffer(ExposureTrackedBuffer):
+    """A slice of a spillable buffer"""
 
     _owner: SpillableBufferOwner
 
@@ -397,10 +381,6 @@ def spill(self, target: str = "cpu") -> None:
     def is_spilled(self) -> bool:
         return self._owner.is_spilled
 
-    @property
-    def exposed(self) -> bool:
-        return self._owner.exposed
-
     @property
     def spillable(self) -> bool:
         return self._owner.spillable
@@ -412,9 +392,6 @@ def memory_info(self) -> Tuple[int, int, str]:
         (ptr, _, device_type) = self._owner.memory_info()
         return (ptr + self._offset, self.nbytes, device_type)
 
-    def mark_exposed(self) -> None:
-        self._owner.mark_exposed()
-
     def serialize(self) -> Tuple[dict, list]:
         """Serialize the Buffer
 
@@ -449,7 +426,7 @@ def serialize(self) -> Tuple[dict, list]:
                 ptr, size, _ = self.memory_info()
                 frames = [
                     Buffer(
-                        owner=BufferOwner._from_device_memory(
+                        owner=BufferOwner.from_device_memory(
                             cuda_array_interface_wrapper(
                                 ptr=ptr,
                                 size=size,
@@ -461,6 +438,22 @@ def serialize(self) -> Tuple[dict, list]:
                 ]
             return header, frames
 
+    def copy(self, deep: bool = True) -> Self:
+        from cudf.core.buffer.utils import acquire_spill_lock
+
+        if not deep:
+            return super().copy(deep=False)
+
+        if self.is_spilled:
+            # In this case, we make the new copy point to the same spilled
+            # data in host memory. We can do this since spilled data is never
+            # modified.
+            owner = self._owner.from_host_memory(self.memoryview())
+            return self.__class__(owner=owner, offset=0, size=owner.size)
+
+        with acquire_spill_lock():
+            return super().copy(deep=deep)
+
     @property
     def __cuda_array_interface__(self) -> dict:
         return {
diff --git a/python/cudf/cudf/core/buffer/utils.py b/python/cudf/cudf/core/buffer/utils.py
index c2ec7effd13..3346d05ed4a 100644
--- a/python/cudf/cudf/core/buffer/utils.py
+++ b/python/cudf/cudf/core/buffer/utils.py
@@ -133,13 +133,13 @@ def as_buffer(
     if not hasattr(data, "__cuda_array_interface__"):
         if exposed:
             raise ValueError("cannot created exposed host memory")
-        return buffer_class(owner=owner_class._from_host_memory(data))
+        return buffer_class(owner=owner_class.from_host_memory(data))
 
     # Check if `data` is owned by a known class
     owner = get_buffer_owner(data)
     if owner is None:  # `data` is new device memory
         return buffer_class(
-            owner=owner_class._from_device_memory(data, exposed=exposed)
+            owner=owner_class.from_device_memory(data, exposed=exposed)
         )
 
     # At this point, we know that `data` is owned by a known class, which
diff --git a/python/cudf/cudf/options.py b/python/cudf/cudf/options.py
index 7a0db49bd20..efa8eabd8b8 100644
--- a/python/cudf/cudf/options.py
+++ b/python/cudf/cudf/options.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 
 import os
 import textwrap
@@ -152,11 +152,6 @@ def _validator(val):
 
 
 def _cow_validator(val):
-    if get_option("spill") and val:
-        raise ValueError(
-            "Copy-on-write is not supported when spilling is enabled. "
-            "Please set `spill` to `False`"
-        )
     if val not in {False, True}:
         raise ValueError(
             f"{val} is not a valid option. Must be one of {{False, True}}."
@@ -164,14 +159,6 @@ def _cow_validator(val):
 
 
 def _spill_validator(val):
-    try:
-        if get_option("copy_on_write") and val:
-            raise ValueError(
-                "Spilling is not supported when copy-on-write is enabled. "
-                "Please set `copy_on_write` to `False`"
-            )
-    except KeyError:
-        pass
     if val not in {False, True}:
         raise ValueError(
             f"{val} is not a valid option. Must be one of {{False, True}}."
diff --git a/python/cudf/cudf/tests/test_copying.py b/python/cudf/cudf/tests/test_copying.py
index e737a73e86b..0bc9ffa8004 100644
--- a/python/cudf/cudf/tests/test_copying.py
+++ b/python/cudf/cudf/tests/test_copying.py
@@ -7,8 +7,11 @@
 
 import cudf
 from cudf import Series
+from cudf.core.buffer.spill_manager import get_global_manager
 from cudf.testing._utils import NUMERIC_TYPES, OTHER_TYPES, assert_eq
 
+pytestmark = pytest.mark.spilling
+
 
 @pytest.mark.parametrize("dtype", NUMERIC_TYPES + OTHER_TYPES)
 def test_repeat(dtype):
@@ -302,6 +305,8 @@ def test_series_zero_copy_cow_on():
 
 
 def test_series_zero_copy_cow_off():
+    is_spill_enabled = get_global_manager() is not None
+
     with cudf.option_context("copy_on_write", False):
         s = cudf.Series([1, 2, 3, 4, 5])
         s1 = s.copy(deep=False)
@@ -334,8 +339,12 @@ def test_series_zero_copy_cow_off():
         assert_eq(s, cudf.Series([20, 10, 10, 4, 5]))
         assert_eq(s1, cudf.Series([20, 10, 10, 4, 5]))
         assert_eq(cp_array, cp.array([20, 10, 10, 4, 5]))
-        assert_eq(s2, cudf.Series([20, 10, 10, 4, 5]))
-        assert_eq(s3, cudf.Series([20, 10, 10, 4, 5]))
+        if not is_spill_enabled:
+            # Since spilling might make a copy of the data, we cannot
+            # expect the two series to be a zero-copy of the cupy array
+            # when spilling is enabled globally.
+            assert_eq(s2, cudf.Series([20, 10, 10, 4, 5]))
+            assert_eq(s3, cudf.Series([20, 10, 10, 4, 5]))
 
         s4 = cudf.Series([10, 20, 30, 40, 50])
         s5 = cudf.Series(s4)
diff --git a/python/cudf/cudf/tests/test_spilling.py b/python/cudf/cudf/tests/test_spilling.py
index f18cb32a091..913a958b4c2 100644
--- a/python/cudf/cudf/tests/test_spilling.py
+++ b/python/cudf/cudf/tests/test_spilling.py
@@ -32,6 +32,7 @@
     get_global_manager,
     get_rmm_memory_resource_stack,
     set_global_manager,
+    spill_on_demand_globally,
 )
 from cudf.core.buffer.spillable_buffer import (
     SpillableBuffer,
@@ -47,6 +48,22 @@
     )
 
 
+@contextlib.contextmanager
+def set_rmm_memory_pool(nbytes: int):
+    mr = rmm.mr.get_current_device_resource()
+    rmm.mr.set_current_device_resource(
+        rmm.mr.PoolMemoryResource(
+            mr,
+            initial_pool_size=nbytes,
+            maximum_pool_size=nbytes,
+        )
+    )
+    try:
+        yield
+    finally:
+        rmm.mr.set_current_device_resource(mr)
+
+
 def single_column_df(target="gpu") -> cudf.DataFrame:
     """Create a standard single column dataframe used for testing
 
@@ -120,18 +137,18 @@ def test_spillable_buffer(manager: SpillManager):
     buf = as_buffer(data=rmm.DeviceBuffer(size=10), exposed=False)
     assert isinstance(buf, SpillableBuffer)
     assert buf.spillable
-    buf.mark_exposed()
-    assert buf.exposed
+    buf.owner.mark_exposed()
+    assert buf.owner.exposed
     assert not buf.spillable
     buf = as_buffer(data=rmm.DeviceBuffer(size=10), exposed=False)
     # Notice, accessing `__cuda_array_interface__` itself doesn't
     # expose the pointer, only accessing the "data" field exposes
     # the pointer.
     iface = buf.__cuda_array_interface__
-    assert not buf.exposed
+    assert not buf.owner.exposed
     assert buf.spillable
     iface["data"][0]  # Expose pointer
-    assert buf.exposed
+    assert buf.owner.exposed
     assert not buf.spillable
 
 
@@ -141,7 +158,6 @@ def test_spillable_buffer(manager: SpillManager):
         "get_ptr",
         "memoryview",
         "is_spilled",
-        "exposed",
         "spillable",
         "spill_lock",
         "spill",
@@ -210,7 +226,7 @@ def test_spilling_buffer(manager: SpillManager):
     buf = as_buffer(rmm.DeviceBuffer(size=10), exposed=False)
     buf.spill(target="cpu")
     assert buf.is_spilled
-    buf.mark_exposed()  # Expose pointer and trigger unspill
+    buf.owner.mark_exposed()  # Expose pointer and trigger unspill
     assert not buf.is_spilled
     with pytest.raises(ValueError, match="unspillable buffer"):
         buf.spill(target="cpu")
@@ -237,7 +253,7 @@ def _get_manager_in_env(monkeypatch, var_vals):
 def test_environment_variables_spill_off(monkeypatch):
     with _get_manager_in_env(
         monkeypatch,
-        [("CUDF_SPILL", "off"), ("CUDF_SPILL_ON_DEMAND", "off")],
+        [("CUDF_SPILL", "off")],
     ) as manager:
         assert manager is None
 
@@ -245,10 +261,9 @@ def test_environment_variables_spill_off(monkeypatch):
 def test_environment_variables_spill_on(monkeypatch):
     with _get_manager_in_env(
         monkeypatch,
-        [("CUDF_SPILL", "on")],
+        [("CUDF_SPILL", "on"), ("CUDF_SPILL_ON_DEMAND", "off")],
     ) as manager:
         assert isinstance(manager, SpillManager)
-        assert manager._spill_on_demand is True
         assert manager._device_memory_limit is None
         assert manager.statistics.level == 0
 
@@ -256,7 +271,11 @@ def test_environment_variables_spill_on(monkeypatch):
 def test_environment_variables_device_limit(monkeypatch):
     with _get_manager_in_env(
         monkeypatch,
-        [("CUDF_SPILL", "on"), ("CUDF_SPILL_DEVICE_LIMIT", "1000")],
+        [
+            ("CUDF_SPILL", "on"),
+            ("CUDF_SPILL_ON_DEMAND", "off"),
+            ("CUDF_SPILL_DEVICE_LIMIT", "1000"),
+        ],
     ) as manager:
         assert isinstance(manager, SpillManager)
         assert manager._device_memory_limit == 1000
@@ -269,6 +288,7 @@ def test_environment_variables_spill_stats(monkeypatch, level):
         monkeypatch,
         [
             ("CUDF_SPILL", "on"),
+            ("CUDF_SPILL_ON_DEMAND", "off"),
             ("CUDF_SPILL_DEVICE_LIMIT", "1000"),
             ("CUDF_SPILL_STATS", f"{level}"),
         ],
@@ -529,12 +549,8 @@ def test_serialize_cuda_dataframe(manager: SpillManager):
     assert_eq(df1, df2)
 
 
-@pytest.mark.skip(
-    reason="This test is not safe because other tests may have enabled"
-    "spilling and already modified rmm's global state"
-)
 def test_get_rmm_memory_resource_stack():
-    mr1 = rmm.mr.get_current_device_resource()
+    mr1 = rmm.mr.CudaMemoryResource()
     assert all(
         not isinstance(m, rmm.mr.FailureCallbackResourceAdaptor)
         for m in get_rmm_memory_resource_stack(mr1)
@@ -560,9 +576,9 @@ def test_df_transpose(manager: SpillManager):
     df1 = cudf.DataFrame({"a": [1, 2]})
     df2 = df1.transpose()
     # For now, all buffers are marked as exposed
-    assert df1._data._data["a"].data.exposed
-    assert df2._data._data[0].data.exposed
-    assert df2._data._data[1].data.exposed
+    assert df1._data._data["a"].data.owner.exposed
+    assert df2._data._data[0].data.owner.exposed
+    assert df2._data._data[1].data.owner.exposed
 
 
 def test_as_buffer_of_spillable_buffer(manager: SpillManager):
@@ -651,7 +667,7 @@ def test_statistics_expose(manager: SpillManager):
     ]
 
     # Expose the first buffer
-    buffers[0].mark_exposed()
+    buffers[0].owner.mark_exposed()
     assert len(manager.statistics.exposes) == 1
     stat = list(manager.statistics.exposes.values())[0]
     assert stat.count == 1
@@ -660,7 +676,7 @@ def test_statistics_expose(manager: SpillManager):
 
     # Expose all 10 buffers
     for i in range(10):
-        buffers[i].mark_exposed()
+        buffers[i].owner.mark_exposed()
 
     # The rest of the ptr accesses should accumulate to a single stat
     # because they resolve to the same traceback.
@@ -680,9 +696,91 @@ def test_statistics_expose(manager: SpillManager):
 
     # Expose the new buffers and check that they are counted as spilled
     for i in range(10):
-        buffers[i].mark_exposed()
+        buffers[i].owner.mark_exposed()
     assert len(manager.statistics.exposes) == 3
     stat = list(manager.statistics.exposes.values())[2]
     assert stat.count == 10
     assert stat.total_nbytes == buffers[0].nbytes * 10
     assert stat.spilled_nbytes == buffers[0].nbytes * 10
+
+
+def test_spill_on_demand(manager: SpillManager):
+    with set_rmm_memory_pool(1024):
+        a = as_buffer(data=rmm.DeviceBuffer(size=1024))
+        assert isinstance(a, SpillableBuffer)
+        assert not a.is_spilled
+
+        with pytest.raises(MemoryError, match="Maximum pool size exceeded"):
+            as_buffer(data=rmm.DeviceBuffer(size=1024))
+
+        with spill_on_demand_globally():
+            b = as_buffer(data=rmm.DeviceBuffer(size=1024))
+            assert a.is_spilled
+            assert not b.is_spilled
+
+        with pytest.raises(MemoryError, match="Maximum pool size exceeded"):
+            as_buffer(data=rmm.DeviceBuffer(size=1024))
+
+
+def test_spilling_and_copy_on_write(manager: SpillManager):
+    with cudf.option_context("copy_on_write", True):
+        a: SpillableBuffer = as_buffer(data=rmm.DeviceBuffer(size=10))
+
+        b = a.copy(deep=False)
+        assert a.owner == b.owner
+        a.spill(target="cpu")
+        assert a.is_spilled
+        assert b.is_spilled
+
+        # Write access trigger copy of `a` into `b` but since `a` is spilled
+        # the copy is done in host memory and `a` remains spilled.
+        with acquire_spill_lock():
+            b.get_ptr(mode="write")
+        assert a.is_spilled
+        assert not b.is_spilled
+
+        # Deep copy of the spilled buffer `a`
+        b = a.copy(deep=True)
+        assert a.owner != b.owner
+        assert a.is_spilled
+        assert b.is_spilled
+        a.spill(target="gpu")
+        assert not a.is_spilled
+        assert b.is_spilled
+
+        # Deep copy of the unspilled buffer `a`
+        b = a.copy(deep=True)
+        assert a.spillable
+        assert not a.is_spilled
+        assert not b.is_spilled
+
+        b = a.copy(deep=False)
+        assert a.owner == b.owner
+        # Write access trigger copy of `a` into `b` in device memory
+        with acquire_spill_lock():
+            b.get_ptr(mode="write")
+        assert a.owner != b.owner
+        assert not a.is_spilled
+        assert not b.is_spilled
+        # And `a` and `b` is now seperated with there one spilling status
+        a.spill(target="cpu")
+        assert a.is_spilled
+        assert not b.is_spilled
+        b.spill(target="cpu")
+        assert a.is_spilled
+        assert b.is_spilled
+
+        # Read access with a spill lock unspill `a` and allows copy-on-write
+        with acquire_spill_lock():
+            a.get_ptr(mode="read")
+        b = a.copy(deep=False)
+        assert a.owner == b.owner
+        assert not a.is_spilled
+
+        # Read access without a spill lock exposes `a` and forces a deep copy
+        a.get_ptr(mode="read")
+        b = a.copy(deep=False)
+        assert a.owner != b.owner
+        assert not a.is_spilled
+        assert a.owner.exposed
+        assert not b.owner.exposed

From 14854b14fe2878f801319eca8d6cd1d5685b9ca6 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 19 Apr 2024 14:33:20 -1000
Subject: [PATCH 091/842] Enable test_parsing in cudf.pandas tests (#15460)

closes https://github.com/rapidsai/cudf/issues/15432

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15460
---
 python/cudf/cudf/pandas/scripts/run-pandas-tests.sh | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
index 784d90a40ed..af7fa72d44e 100755
--- a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
+++ b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
@@ -22,8 +22,7 @@ set -euo pipefail
 # of Pandas installed.
 PANDAS_VERSION=$(python -c "import pandas; print(pandas.__version__)")
 
-PYTEST_IGNORES="--ignore=tests/tslibs/test_parsing.py \
---ignore=tests/io/parser/common/test_read_errors.py"
+PYTEST_IGNORES="--ignore=tests/io/parser/common/test_read_errors.py"
 
 mkdir -p pandas-testing
 cd pandas-testing

From 9fa247ff7db104517f4e9dab0fc3c321e76cccdf Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Mon, 22 Apr 2024 08:28:42 -0400
Subject: [PATCH 092/842] Add to_arrow_device() functions that accept views
 (#15465)

Adds the following new interop functions
```
unique_device_array_t to_arrow_device(cudf::table_view const& table,
                                      rmm::cuda_stream_view stream,
                                      rmm::device_async_resource_ref mr);
unique_device_array_t to_arrow_device(cudf::column_view const& col,
                                      rmm::cuda_stream_view stream,
                                      rmm::device_async_resource_ref mr);
```
Also refactors some common code with the ownership transfer version of these APIs.
And moves the `to_arrow_schema()` functions to a separate .cpp file.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Matt Topol (https://github.com/zeroshade)
  - Nghia Truong (https://github.com/ttnghia)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15465
---
 cpp/CMakeLists.txt                         |   2 +
 cpp/include/cudf/interop.hpp               |  66 +-
 cpp/src/interop/to_arrow_device.cu         | 745 ++++++++++-----------
 cpp/src/interop/to_arrow_schema.cpp        | 231 +++++++
 cpp/src/interop/to_arrow_utilities.cpp     |  44 ++
 cpp/src/interop/to_arrow_utilities.hpp     |  34 +
 cpp/tests/interop/to_arrow_device_test.cpp |  78 ++-
 7 files changed, 801 insertions(+), 399 deletions(-)
 create mode 100644 cpp/src/interop/to_arrow_schema.cpp
 create mode 100644 cpp/src/interop/to_arrow_utilities.cpp
 create mode 100644 cpp/src/interop/to_arrow_utilities.hpp

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 7d62e0acb10..3c7e10c9bc4 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -359,6 +359,8 @@ add_library(
   src/interop/from_arrow.cu
   src/interop/to_arrow.cu
   src/interop/to_arrow_device.cu
+  src/interop/to_arrow_schema.cpp
+  src/interop/to_arrow_utilities.cpp
   src/interop/detail/arrow_allocator.cpp
   src/io/avro/avro.cpp
   src/io/avro/avro_gpu.cu
diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp
index dc4d66a8f6e..defc1fc834c 100644
--- a/cpp/include/cudf/interop.hpp
+++ b/cpp/include/cudf/interop.hpp
@@ -258,6 +258,70 @@ unique_device_array_t to_arrow_device(
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @brief Create `ArrowDeviceArray` from a table view
+ *
+ * Populates the C struct ArrowDeviceArray performing copies only if necessary.
+ * This wraps the data on the GPU device and gives a view of the table data
+ * to the ArrowDeviceArray struct. If the caller frees the data referenced by
+ * the table_view, using the returned object results in undefined behavior.
+ *
+ * After calling this function, the release callback on the returned ArrowDeviceArray
+ * must be called to clean up any memory created during conversion.
+ *
+ * @note For decimals, since the precision is not stored for them in libcudf
+ * it will be converted to an Arrow decimal128 with the widest-precision the cudf decimal type
+ * supports. For example, numeric::decimal32 will be converted to Arrow decimal128 of the precision
+ * 9 which is the maximum precision for 32-bit types. Similarly, numeric::decimal128 will be
+ * converted to Arrow decimal128 of the precision 38.
+ *
+ * Copies will be performed in the cases where cudf differs from Arrow:
+ * - BOOL8: Arrow uses a bitmap and cudf uses 1 byte per value
+ * - DECIMAL32 and DECIMAL64: Converted to Arrow decimal128
+ * - STRING: Arrow expects a single value int32 offset child array for empty strings columns
+ *
+ * @param table Input table
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used for any allocations during conversion
+ * @return ArrowDeviceArray which will have ownership of any copied data
+ */
+unique_device_array_t to_arrow_device(
+  cudf::table_view const& table,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Create `ArrowDeviceArray` from a column view
+ *
+ * Populates the C struct ArrowDeviceArray performing copies only if necessary.
+ * This wraps the data on the GPU device and gives a view of the column data
+ * to the ArrowDeviceArray struct. If the caller frees the data referenced by
+ * the column_view, using the returned object results in undefined behavior.
+ *
+ * After calling this function, the release callback on the returned ArrowDeviceArray
+ * must be called to clean up any memory created during conversion.
+ *
+ * @note For decimals, since the precision is not stored for them in libcudf
+ * it will be converted to an Arrow decimal128 with the widest-precision the cudf decimal type
+ * supports. For example, numeric::decimal32 will be converted to Arrow decimal128 of the precision
+ * 9 which is the maximum precision for 32-bit types. Similar, numeric::decimal128 will be
+ * converted to Arrow decimal128 of the precision 38.
+ *
+ * Copies will be performed in the cases where cudf differs from Arrow:
+ * - BOOL8: Arrow uses a bitmap and cudf uses 1 byte per value
+ * - DECIMAL32 and DECIMAL64: Converted to Arrow decimal128
+ * - STRING: Arrow expects a single value int32 offset child array for empty strings columns
+ *
+ * @param col Input column
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used for any allocations during conversion
+ * @return ArrowDeviceArray which will have ownership of any copied data
+ */
+unique_device_array_t to_arrow_device(
+  cudf::column_view const& col,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+
 /**
  * @brief Create `cudf::table` from given arrow Table input
  *
@@ -266,7 +330,6 @@ unique_device_array_t to_arrow_device(
  * @param mr    Device memory resource used to allocate `cudf::table`
  * @return cudf table generated from given arrow Table
  */
-
 std::unique_ptr<table> from_arrow(
   arrow::Table const& input,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
@@ -280,7 +343,6 @@ std::unique_ptr<table> from_arrow(
  * @param mr    Device memory resource used to allocate `cudf::scalar`
  * @return cudf scalar generated from given arrow Scalar
  */
-
 std::unique_ptr<cudf::scalar> from_arrow(
   arrow::Scalar const& input,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
diff --git a/cpp/src/interop/to_arrow_device.cu b/cpp/src/interop/to_arrow_device.cu
index 1754d1493bd..737f8c7f625 100644
--- a/cpp/src/interop/to_arrow_device.cu
+++ b/cpp/src/interop/to_arrow_device.cu
@@ -14,11 +14,14 @@
  * limitations under the License.
  */
 
+#include "to_arrow_utilities.hpp"
+
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_view.hpp>
 #include <cudf/detail/interop.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/null_mask.hpp>
+#include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/interop.hpp>
 #include <cudf/interop/detail/arrow.hpp>
@@ -45,198 +48,10 @@
 namespace cudf {
 namespace detail {
 namespace {
+
 static constexpr int validity_buffer_idx         = 0;
 static constexpr int fixed_width_data_buffer_idx = 1;
 
-ArrowType id_to_arrow_type(cudf::type_id id)
-{
-  switch (id) {
-    case cudf::type_id::BOOL8: return NANOARROW_TYPE_BOOL;
-    case cudf::type_id::INT8: return NANOARROW_TYPE_INT8;
-    case cudf::type_id::INT16: return NANOARROW_TYPE_INT16;
-    case cudf::type_id::INT32: return NANOARROW_TYPE_INT32;
-    case cudf::type_id::INT64: return NANOARROW_TYPE_INT64;
-    case cudf::type_id::UINT8: return NANOARROW_TYPE_UINT8;
-    case cudf::type_id::UINT16: return NANOARROW_TYPE_UINT16;
-    case cudf::type_id::UINT32: return NANOARROW_TYPE_UINT32;
-    case cudf::type_id::UINT64: return NANOARROW_TYPE_UINT64;
-    case cudf::type_id::FLOAT32: return NANOARROW_TYPE_FLOAT;
-    case cudf::type_id::FLOAT64: return NANOARROW_TYPE_DOUBLE;
-    case cudf::type_id::TIMESTAMP_DAYS: return NANOARROW_TYPE_DATE32;
-    default: CUDF_FAIL("Unsupported type_id conversion to arrow type");
-  }
-}
-
-struct dispatch_to_arrow_type {
-  template <typename T, CUDF_ENABLE_IF(not is_rep_layout_compatible<T>())>
-  int operator()(column_view, column_metadata const&, ArrowSchema*)
-  {
-    CUDF_FAIL("Unsupported type for to_arrow_schema");
-  }
-
-  template <typename T, CUDF_ENABLE_IF(is_rep_layout_compatible<T>())>
-  int operator()(column_view input_view, column_metadata const&, ArrowSchema* out)
-  {
-    cudf::type_id id = input_view.type().id();
-    switch (id) {
-      case cudf::type_id::TIMESTAMP_SECONDS:
-        return ArrowSchemaSetTypeDateTime(
-          out, NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_SECOND, nullptr);
-      case cudf::type_id::TIMESTAMP_MILLISECONDS:
-        return ArrowSchemaSetTypeDateTime(
-          out, NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_MILLI, nullptr);
-      case cudf::type_id::TIMESTAMP_MICROSECONDS:
-        return ArrowSchemaSetTypeDateTime(
-          out, NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_MICRO, nullptr);
-      case cudf::type_id::TIMESTAMP_NANOSECONDS:
-        return ArrowSchemaSetTypeDateTime(
-          out, NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_NANO, nullptr);
-      case cudf::type_id::DURATION_SECONDS:
-        return ArrowSchemaSetTypeDateTime(
-          out, NANOARROW_TYPE_DURATION, NANOARROW_TIME_UNIT_SECOND, nullptr);
-      case cudf::type_id::DURATION_MILLISECONDS:
-        return ArrowSchemaSetTypeDateTime(
-          out, NANOARROW_TYPE_DURATION, NANOARROW_TIME_UNIT_MILLI, nullptr);
-      case cudf::type_id::DURATION_MICROSECONDS:
-        return ArrowSchemaSetTypeDateTime(
-          out, NANOARROW_TYPE_DURATION, NANOARROW_TIME_UNIT_MICRO, nullptr);
-      case cudf::type_id::DURATION_NANOSECONDS:
-        return ArrowSchemaSetTypeDateTime(
-          out, NANOARROW_TYPE_DURATION, NANOARROW_TIME_UNIT_NANO, nullptr);
-      default: return ArrowSchemaSetType(out, id_to_arrow_type(id));
-    }
-  }
-};
-
-template <typename DeviceType>
-int decimals_to_arrow(column_view input, ArrowSchema* out)
-{
-  // Arrow doesn't support decimal32/decimal64 currently. decimal128
-  // is the smallest that arrow supports besides float32/float64 so we
-  // upcast to decimal128.
-  return ArrowSchemaSetTypeDecimal(out,
-                                   NANOARROW_TYPE_DECIMAL128,
-                                   cudf::detail::max_precision<DeviceType>(),
-                                   -input.type().scale());
-}
-
-template <>
-int dispatch_to_arrow_type::operator()<numeric::decimal32>(column_view input,
-                                                           column_metadata const&,
-                                                           ArrowSchema* out)
-{
-  using DeviceType = int32_t;
-  return decimals_to_arrow<DeviceType>(input, out);
-}
-
-template <>
-int dispatch_to_arrow_type::operator()<numeric::decimal64>(column_view input,
-                                                           column_metadata const&,
-                                                           ArrowSchema* out)
-{
-  using DeviceType = int64_t;
-  return decimals_to_arrow<DeviceType>(input, out);
-}
-
-template <>
-int dispatch_to_arrow_type::operator()<numeric::decimal128>(column_view input,
-                                                            column_metadata const&,
-                                                            ArrowSchema* out)
-{
-  using DeviceType = __int128_t;
-  return decimals_to_arrow<DeviceType>(input, out);
-}
-
-template <>
-int dispatch_to_arrow_type::operator()<cudf::string_view>(column_view input,
-                                                          column_metadata const&,
-                                                          ArrowSchema* out)
-{
-  return ArrowSchemaSetType(out, NANOARROW_TYPE_STRING);
-}
-
-// these forward declarations are needed due to the recursive calls to them
-// inside their definitions and in struct_vew for handling children
-template <>
-int dispatch_to_arrow_type::operator()<cudf::list_view>(column_view input,
-                                                        column_metadata const& metadata,
-                                                        ArrowSchema* out);
-
-template <>
-int dispatch_to_arrow_type::operator()<cudf::dictionary32>(column_view input,
-                                                           column_metadata const& metadata,
-                                                           ArrowSchema* out);
-
-template <>
-int dispatch_to_arrow_type::operator()<cudf::struct_view>(column_view input,
-                                                          column_metadata const& metadata,
-                                                          ArrowSchema* out)
-{
-  CUDF_EXPECTS(metadata.children_meta.size() == static_cast<std::size_t>(input.num_children()),
-               "Number of field names and number of children doesn't match\n");
-
-  NANOARROW_RETURN_NOT_OK(ArrowSchemaSetTypeStruct(out, input.num_children()));
-  for (int i = 0; i < input.num_children(); ++i) {
-    auto child = out->children[i];
-    auto col   = input.child(i);
-    ArrowSchemaInit(child);
-    NANOARROW_RETURN_NOT_OK(ArrowSchemaSetName(child, metadata.children_meta[i].name.c_str()));
-
-    child->flags = col.has_nulls() ? ARROW_FLAG_NULLABLE : 0;
-
-    if (col.type().id() == cudf::type_id::EMPTY) {
-      NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(child, NANOARROW_TYPE_NA));
-      continue;
-    }
-
-    NANOARROW_RETURN_NOT_OK(cudf::type_dispatcher(
-      col.type(), detail::dispatch_to_arrow_type{}, col, metadata.children_meta[i], child));
-  }
-
-  return NANOARROW_OK;
-}
-
-template <>
-int dispatch_to_arrow_type::operator()<cudf::list_view>(column_view input,
-                                                        column_metadata const& metadata,
-                                                        ArrowSchema* out)
-{
-  NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(out, NANOARROW_TYPE_LIST));
-  auto child = input.child(cudf::lists_column_view::child_column_index);
-  ArrowSchemaInit(out->children[0]);
-  if (child.type().id() == cudf::type_id::EMPTY) {
-    return ArrowSchemaSetType(out->children[0], NANOARROW_TYPE_NA);
-  }
-  auto child_meta =
-    metadata.children_meta.empty() ? column_metadata{"element"} : metadata.children_meta[0];
-
-  out->flags = input.has_nulls() ? ARROW_FLAG_NULLABLE : 0;
-  NANOARROW_RETURN_NOT_OK(ArrowSchemaSetName(out->children[0], child_meta.name.c_str()));
-  out->children[0]->flags = child.has_nulls() ? ARROW_FLAG_NULLABLE : 0;
-  return cudf::type_dispatcher(
-    child.type(), detail::dispatch_to_arrow_type{}, child, child_meta, out->children[0]);
-}
-
-template <>
-int dispatch_to_arrow_type::operator()<cudf::dictionary32>(column_view input,
-                                                           column_metadata const& metadata,
-                                                           ArrowSchema* out)
-{
-  cudf::dictionary_column_view dview{input};
-
-  NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(out, id_to_arrow_type(dview.indices().type().id())));
-  NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateDictionary(out));
-  ArrowSchemaInit(out->dictionary);
-
-  auto dict_keys = dview.keys();
-  return cudf::type_dispatcher(
-    dict_keys.type(),
-    detail::dispatch_to_arrow_type{},
-    dict_keys,
-    metadata.children_meta.empty() ? column_metadata{"keys"} : metadata.children_meta[0],
-    out->dictionary);
-}
-
 template <typename T>
 void device_buffer_finalize(ArrowBufferAllocator* allocator, uint8_t*, int64_t)
 {
@@ -244,6 +59,14 @@ void device_buffer_finalize(ArrowBufferAllocator* allocator, uint8_t*, int64_t)
   delete unique_buffer;
 }
 
+int initialize_array(ArrowArray* arr, ArrowType storage_type, cudf::column_view column)
+{
+  NANOARROW_RETURN_NOT_OK(ArrowArrayInitFromType(arr, storage_type));
+  arr->length     = column.size();
+  arr->null_count = column.null_count();
+  return NANOARROW_OK;
+}
+
 template <typename>
 struct is_device_scalar : public std::false_type {};
 
@@ -279,19 +102,26 @@ int set_buffer(std::unique_ptr<T> device_buf, int64_t i, ArrowArray* out)
   return NANOARROW_OK;
 }
 
-int initialize_array(ArrowArray* arr, ArrowType storage_type, cudf::column const& column)
+ArrowType id_to_arrow_storage_type(cudf::type_id id)
 {
-  NANOARROW_RETURN_NOT_OK(ArrowArrayInitFromType(arr, storage_type));
-  arr->length     = column.size();
-  arr->null_count = column.null_count();
-  return NANOARROW_OK;
+  switch (id) {
+    case cudf::type_id::TIMESTAMP_SECONDS:
+    case cudf::type_id::TIMESTAMP_MILLISECONDS:
+    case cudf::type_id::TIMESTAMP_MICROSECONDS:
+    case cudf::type_id::TIMESTAMP_NANOSECONDS: return NANOARROW_TYPE_INT64;
+    case cudf::type_id::DURATION_SECONDS:
+    case cudf::type_id::DURATION_MILLISECONDS:
+    case cudf::type_id::DURATION_MICROSECONDS:
+    case cudf::type_id::DURATION_NANOSECONDS: return NANOARROW_TYPE_INT64;
+    default: return id_to_arrow_type(id);
+  }
 }
 
 struct dispatch_to_arrow_device {
   template <typename T, CUDF_ENABLE_IF(not is_rep_layout_compatible<T>())>
   int operator()(cudf::column&&, rmm::cuda_stream_view, rmm::device_async_resource_ref, ArrowArray*)
   {
-    CUDF_FAIL("Unsupported type for to_arrow_device");
+    CUDF_FAIL("Unsupported type for to_arrow_device", cudf::data_type_error);
   }
 
   template <typename T, CUDF_ENABLE_IF(is_rep_layout_compatible<T>())>
@@ -302,38 +132,34 @@ struct dispatch_to_arrow_device {
   {
     nanoarrow::UniqueArray tmp;
 
-    const ArrowType storage_type = [&] {
-      switch (column.type().id()) {
-        case cudf::type_id::TIMESTAMP_SECONDS:
-        case cudf::type_id::TIMESTAMP_MILLISECONDS:
-        case cudf::type_id::TIMESTAMP_MICROSECONDS:
-        case cudf::type_id::TIMESTAMP_NANOSECONDS: return NANOARROW_TYPE_INT64;
-        case cudf::type_id::DURATION_SECONDS:
-        case cudf::type_id::DURATION_MILLISECONDS:
-        case cudf::type_id::DURATION_MICROSECONDS:
-        case cudf::type_id::DURATION_NANOSECONDS: return NANOARROW_TYPE_INT64;
-        default: return id_to_arrow_type(column.type().id());
-      }
-    }();
+    auto const storage_type = id_to_arrow_storage_type(column.type().id());
     NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), storage_type, column));
 
     auto contents = column.release();
+    NANOARROW_RETURN_NOT_OK(set_contents(contents, tmp.get()));
+
+    ArrowArrayMove(tmp.get(), out);
+    return NANOARROW_OK;
+  }
+
+  int set_null_mask(column::contents& contents, ArrowArray* out)
+  {
     if (contents.null_mask) {
-      NANOARROW_RETURN_NOT_OK(
-        set_buffer(std::move(contents.null_mask), validity_buffer_idx, tmp.get()));
+      NANOARROW_RETURN_NOT_OK(set_buffer(std::move(contents.null_mask), validity_buffer_idx, out));
     }
+    return NANOARROW_OK;
+  }
 
-    NANOARROW_RETURN_NOT_OK(
-      set_buffer(std::move(contents.data), fixed_width_data_buffer_idx, tmp.get()));
-
-    ArrowArrayMove(tmp.get(), out);
+  int set_contents(column::contents& contents, ArrowArray* out)
+  {
+    NANOARROW_RETURN_NOT_OK(set_null_mask(contents, out));
+    NANOARROW_RETURN_NOT_OK(set_buffer(std::move(contents.data), fixed_width_data_buffer_idx, out));
     return NANOARROW_OK;
   }
 };
 
 template <typename DeviceType>
-int decimals_to_arrow(cudf::column&& input,
-                      int32_t precision,
+int decimals_to_arrow(cudf::column_view input,
                       rmm::cuda_stream_view stream,
                       rmm::device_async_resource_ref mr,
                       ArrowArray* out)
@@ -341,42 +167,28 @@ int decimals_to_arrow(cudf::column&& input,
   nanoarrow::UniqueArray tmp;
   NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_DECIMAL128, input));
 
-  if constexpr (!std::is_same_v<DeviceType, __int128_t>) {
-    constexpr size_type BIT_WIDTH_RATIO = sizeof(__int128_t) / sizeof(DeviceType);
-    auto buf =
-      std::make_unique<rmm::device_uvector<DeviceType>>(input.size() * BIT_WIDTH_RATIO, stream, mr);
-
-    auto count = thrust::make_counting_iterator(0);
-
-    thrust::for_each(rmm::exec_policy(stream, mr),
-                     count,
-                     count + input.size(),
-                     [in  = input.view().begin<DeviceType>(),
-                      out = buf->data(),
-                      BIT_WIDTH_RATIO] __device__(auto in_idx) {
-                       auto const out_idx = in_idx * BIT_WIDTH_RATIO;
-                       // the lowest order bits are the value, the remainder
-                       // simply matches the sign bit to satisfy the two's
-                       // complement integer representation of negative numbers.
-                       out[out_idx] = in[in_idx];
+  constexpr size_type BIT_WIDTH_RATIO = sizeof(__int128_t) / sizeof(DeviceType);
+  auto buf =
+    std::make_unique<rmm::device_uvector<DeviceType>>(input.size() * BIT_WIDTH_RATIO, stream, mr);
+
+  auto count = thrust::counting_iterator<size_type>(0);
+
+  thrust::for_each(
+    rmm::exec_policy(stream, mr),
+    count,
+    count + input.size(),
+    [in = input.begin<DeviceType>(), out = buf->data(), BIT_WIDTH_RATIO] __device__(auto in_idx) {
+      auto const out_idx = in_idx * BIT_WIDTH_RATIO;
+      // the lowest order bits are the value, the remainder
+      // simply matches the sign bit to satisfy the two's
+      // complement integer representation of negative numbers.
+      out[out_idx] = in[in_idx];
 #pragma unroll BIT_WIDTH_RATIO - 1
-                       for (auto i = 1; i < BIT_WIDTH_RATIO; ++i) {
-                         out[out_idx + i] = in[in_idx] < 0 ? -1 : 0;
-                       }
-                     });
-    NANOARROW_RETURN_NOT_OK(set_buffer(std::move(buf), fixed_width_data_buffer_idx, tmp.get()));
-  }
-
-  auto contents = input.release();
-  if (contents.null_mask) {
-    NANOARROW_RETURN_NOT_OK(
-      set_buffer(std::move(contents.null_mask), validity_buffer_idx, tmp.get()));
-  }
-
-  if constexpr (std::is_same_v<DeviceType, __int128_t>) {
-    NANOARROW_RETURN_NOT_OK(
-      set_buffer(std::move(contents.data), fixed_width_data_buffer_idx, tmp.get()));
-  }
+      for (auto i = 1; i < BIT_WIDTH_RATIO; ++i) {
+        out[out_idx + i] = in[in_idx] < 0 ? -1 : 0;
+      }
+    });
+  NANOARROW_RETURN_NOT_OK(set_buffer(std::move(buf), fixed_width_data_buffer_idx, tmp.get()));
 
   ArrowArrayMove(tmp.get(), out);
   return NANOARROW_OK;
@@ -389,8 +201,10 @@ int dispatch_to_arrow_device::operator()<numeric::decimal32>(cudf::column&& colu
                                                              ArrowArray* out)
 {
   using DeviceType = int32_t;
-  return decimals_to_arrow<DeviceType>(
-    std::move(column), cudf::detail::max_precision<DeviceType>(), stream, mr, out);
+  NANOARROW_RETURN_NOT_OK(decimals_to_arrow<DeviceType>(column.view(), stream, mr, out));
+  auto contents = column.release();
+  NANOARROW_RETURN_NOT_OK(set_null_mask(contents, out));
+  return NANOARROW_OK;
 }
 
 template <>
@@ -400,8 +214,10 @@ int dispatch_to_arrow_device::operator()<numeric::decimal64>(cudf::column&& colu
                                                              ArrowArray* out)
 {
   using DeviceType = int64_t;
-  return decimals_to_arrow<DeviceType>(
-    std::move(column), cudf::detail::max_precision<DeviceType>(), stream, mr, out);
+  NANOARROW_RETURN_NOT_OK(decimals_to_arrow<DeviceType>(column.view(), stream, mr, out));
+  auto contents = column.release();
+  NANOARROW_RETURN_NOT_OK(set_null_mask(contents, out));
+  return NANOARROW_OK;
 }
 
 template <>
@@ -410,9 +226,12 @@ int dispatch_to_arrow_device::operator()<numeric::decimal128>(cudf::column&& col
                                                               rmm::device_async_resource_ref mr,
                                                               ArrowArray* out)
 {
-  using DeviceType = __int128_t;
-  return decimals_to_arrow<DeviceType>(
-    std::move(column), cudf::detail::max_precision<DeviceType>(), stream, mr, out);
+  nanoarrow::UniqueArray tmp;
+  NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_DECIMAL128, column));
+  auto contents = column.release();
+  NANOARROW_RETURN_NOT_OK(set_contents(contents, tmp.get()));
+  ArrowArrayMove(tmp.get(), out);
+  return NANOARROW_OK;
 }
 
 template <>
@@ -426,10 +245,7 @@ int dispatch_to_arrow_device::operator()<bool>(cudf::column&& column,
 
   auto bitmask  = bools_to_mask(column.view(), stream, mr);
   auto contents = column.release();
-  if (contents.null_mask) {
-    NANOARROW_RETURN_NOT_OK(
-      set_buffer(std::move(contents.null_mask), validity_buffer_idx, tmp.get()));
-  }
+  NANOARROW_RETURN_NOT_OK(set_null_mask(contents, tmp.get()));
   NANOARROW_RETURN_NOT_OK(
     set_buffer(std::move(bitmask.first), fixed_width_data_buffer_idx, tmp.get()));
 
@@ -459,10 +275,7 @@ int dispatch_to_arrow_device::operator()<cudf::string_view>(cudf::column&& colum
   }
 
   auto contents = column.release();
-  if (contents.null_mask) {
-    NANOARROW_RETURN_NOT_OK(
-      set_buffer(std::move(contents.null_mask), validity_buffer_idx, tmp.get()));
-  }
+  NANOARROW_RETURN_NOT_OK(set_null_mask(contents, tmp.get()));
 
   auto offsets_contents =
     contents.children[cudf::strings_column_view::offsets_column_index]->release();
@@ -496,22 +309,13 @@ int dispatch_to_arrow_device::operator()<cudf::struct_view>(cudf::column&& colum
   NANOARROW_RETURN_NOT_OK(ArrowArrayAllocateChildren(tmp.get(), column.num_children()));
 
   auto contents = column.release();
-  if (contents.null_mask) {
-    NANOARROW_RETURN_NOT_OK(
-      set_buffer(std::move(contents.null_mask), validity_buffer_idx, tmp.get()));
-  }
+  NANOARROW_RETURN_NOT_OK(set_null_mask(contents, tmp.get()));
 
   for (size_t i = 0; i < size_t(tmp->n_children); ++i) {
     ArrowArray* child_ptr = tmp->children[i];
     auto& child           = contents.children[i];
-    if (child->type().id() == cudf::type_id::EMPTY) {
-      NANOARROW_RETURN_NOT_OK(ArrowArrayInitFromType(child_ptr, NANOARROW_TYPE_NA));
-      child_ptr->length     = child->size();
-      child_ptr->null_count = child->size();
-    } else {
-      NANOARROW_RETURN_NOT_OK(cudf::type_dispatcher(
-        child->type(), dispatch_to_arrow_device{}, std::move(*child), stream, mr, child_ptr));
-    }
+    NANOARROW_RETURN_NOT_OK(cudf::type_dispatcher(
+      child->type(), dispatch_to_arrow_device{}, std::move(*child), stream, mr, child_ptr));
   }
 
   ArrowArrayMove(tmp.get(), out);
@@ -529,24 +333,15 @@ int dispatch_to_arrow_device::operator()<cudf::list_view>(cudf::column&& column,
   NANOARROW_RETURN_NOT_OK(ArrowArrayAllocateChildren(tmp.get(), 1));
 
   auto contents = column.release();
-  if (contents.null_mask) {
-    NANOARROW_RETURN_NOT_OK(
-      set_buffer(std::move(contents.null_mask), validity_buffer_idx, tmp.get()));
-  }
+  NANOARROW_RETURN_NOT_OK(set_null_mask(contents, tmp.get()));
 
   auto offsets_contents =
     contents.children[cudf::lists_column_view::offsets_column_index]->release();
   NANOARROW_RETURN_NOT_OK(set_buffer(std::move(offsets_contents.data), 1, tmp.get()));
 
   auto& child = contents.children[cudf::lists_column_view::child_column_index];
-  if (child->type().id() == cudf::type_id::EMPTY) {
-    NANOARROW_RETURN_NOT_OK(ArrowArrayInitFromType(tmp->children[0], NANOARROW_TYPE_NA));
-    tmp->children[0]->length     = 0;
-    tmp->children[0]->null_count = 0;
-  } else {
-    NANOARROW_RETURN_NOT_OK(cudf::type_dispatcher(
-      child->type(), dispatch_to_arrow_device{}, std::move(*child), stream, mr, tmp->children[0]));
-  }
+  NANOARROW_RETURN_NOT_OK(cudf::type_dispatcher(
+    child->type(), dispatch_to_arrow_device{}, std::move(*child), stream, mr, tmp->children[0]));
 
   ArrowArrayMove(tmp.get(), out);
   return NANOARROW_OK;
@@ -566,10 +361,7 @@ int dispatch_to_arrow_device::operator()<cudf::dictionary32>(cudf::column&& colu
   NANOARROW_RETURN_NOT_OK(ArrowArrayAllocateDictionary(tmp.get()));
 
   auto contents = column.release();
-  if (contents.null_mask) {
-    NANOARROW_RETURN_NOT_OK(
-      set_buffer(std::move(contents.null_mask), validity_buffer_idx, tmp.get()));
-  }
+  NANOARROW_RETURN_NOT_OK(set_null_mask(contents, tmp.get()));
 
   auto indices_contents =
     contents.children[cudf::dictionary_column_view::indices_column_index]->release();
@@ -584,6 +376,205 @@ int dispatch_to_arrow_device::operator()<cudf::dictionary32>(cudf::column&& colu
   return NANOARROW_OK;
 }
 
+struct dispatch_to_arrow_device_view {
+  cudf::column_view column;
+  rmm::cuda_stream_view stream;
+  rmm::device_async_resource_ref mr;
+
+  template <typename T, CUDF_ENABLE_IF(not is_rep_layout_compatible<T>())>
+  int operator()(ArrowArray*) const
+  {
+    CUDF_FAIL("Unsupported type for to_arrow_device", cudf::data_type_error);
+  }
+
+  template <typename T, CUDF_ENABLE_IF(is_rep_layout_compatible<T>())>
+  int operator()(ArrowArray* out) const
+  {
+    nanoarrow::UniqueArray tmp;
+
+    auto const storage_type = id_to_arrow_storage_type(column.type().id());
+    NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), storage_type, column));
+    NANOARROW_RETURN_NOT_OK(set_null_mask(column, tmp.get()));
+    NANOARROW_RETURN_NOT_OK(set_view_to_buffer(column, tmp.get()));
+
+    ArrowArrayMove(tmp.get(), out);
+    return NANOARROW_OK;
+  }
+
+  int set_buffer_view(void const* in_ptr, size_t size, int64_t i, ArrowArray* out) const
+  {
+    ArrowBuffer* buf = ArrowArrayBuffer(out, i);
+    buf->size_bytes  = size;
+
+    // reset the deallocator to do nothing since this is a non-owning view
+    NANOARROW_RETURN_NOT_OK(ArrowBufferSetAllocator(
+      buf, ArrowBufferDeallocator([](ArrowBufferAllocator*, uint8_t*, int64_t) {}, nullptr)));
+
+    buf->data = const_cast<uint8_t*>(reinterpret_cast<uint8_t const*>(in_ptr));
+    return NANOARROW_OK;
+  }
+
+  int set_null_mask(column_view column, ArrowArray* out) const
+  {
+    if (column.nullable()) {
+      NANOARROW_RETURN_NOT_OK(set_buffer_view(column.null_mask(),
+                                              bitmask_allocation_size_bytes(column.size()),
+                                              validity_buffer_idx,
+                                              out));
+    }
+    return NANOARROW_OK;
+  }
+
+  int set_view_to_buffer(column_view column, ArrowArray* out) const
+  {
+    auto const type_size = cudf::size_of(column.type());
+    return set_buffer_view(column.head<uint8_t>() + (type_size * column.offset()),
+                           column.size() * type_size,
+                           fixed_width_data_buffer_idx,
+                           out);
+  }
+};
+
+template <>
+int dispatch_to_arrow_device_view::operator()<numeric::decimal32>(ArrowArray* out) const
+{
+  using DeviceType = int32_t;
+  NANOARROW_RETURN_NOT_OK(decimals_to_arrow<DeviceType>(column, stream, mr, out));
+  NANOARROW_RETURN_NOT_OK(set_null_mask(column, out));
+  return NANOARROW_OK;
+}
+
+template <>
+int dispatch_to_arrow_device_view::operator()<numeric::decimal64>(ArrowArray* out) const
+{
+  using DeviceType = int64_t;
+  NANOARROW_RETURN_NOT_OK(decimals_to_arrow<DeviceType>(column, stream, mr, out));
+  NANOARROW_RETURN_NOT_OK(set_null_mask(column, out));
+  return NANOARROW_OK;
+}
+
+template <>
+int dispatch_to_arrow_device_view::operator()<numeric::decimal128>(ArrowArray* out) const
+{
+  nanoarrow::UniqueArray tmp;
+
+  NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_DECIMAL128, column));
+  NANOARROW_RETURN_NOT_OK(set_null_mask(column, tmp.get()));
+  NANOARROW_RETURN_NOT_OK(set_view_to_buffer(column, tmp.get()));
+
+  ArrowArrayMove(tmp.get(), out);
+  return NANOARROW_OK;
+}
+
+template <>
+int dispatch_to_arrow_device_view::operator()<bool>(ArrowArray* out) const
+{
+  nanoarrow::UniqueArray tmp;
+  NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_BOOL, column));
+
+  auto bitmask = bools_to_mask(column, stream, mr);
+  NANOARROW_RETURN_NOT_OK(
+    set_buffer(std::move(bitmask.first), fixed_width_data_buffer_idx, tmp.get()));
+  NANOARROW_RETURN_NOT_OK(set_null_mask(column, tmp.get()));
+
+  ArrowArrayMove(tmp.get(), out);
+  return NANOARROW_OK;
+}
+
+template <>
+int dispatch_to_arrow_device_view::operator()<cudf::string_view>(ArrowArray* out) const
+{
+  nanoarrow::UniqueArray tmp;
+  NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_STRING, column));
+
+  if (column.size() == 0) {
+    // https://github.com/rapidsai/cudf/pull/15047#discussion_r1546528552
+    auto zero = std::make_unique<rmm::device_scalar<int32_t>>(0, stream, mr);
+    NANOARROW_RETURN_NOT_OK(set_buffer(std::move(zero), fixed_width_data_buffer_idx, tmp.get()));
+    ArrowArrayMove(tmp.get(), out);
+    return NANOARROW_OK;
+  }
+
+  NANOARROW_RETURN_NOT_OK(set_null_mask(column, tmp.get()));
+
+  auto const scv = cudf::strings_column_view(column);
+  NANOARROW_RETURN_NOT_OK(set_view_to_buffer(scv.offsets(), tmp.get()));
+  NANOARROW_RETURN_NOT_OK(
+    set_buffer_view(scv.chars_begin(stream), scv.chars_size(stream), 2, tmp.get()));
+
+  ArrowArrayMove(tmp.get(), out);
+  return NANOARROW_OK;
+}
+
+template <>
+int dispatch_to_arrow_device_view::operator()<cudf::list_view>(ArrowArray* out) const;
+
+template <>
+int dispatch_to_arrow_device_view::operator()<cudf::dictionary32>(ArrowArray* out) const;
+
+template <>
+int dispatch_to_arrow_device_view::operator()<cudf::struct_view>(ArrowArray* out) const
+{
+  nanoarrow::UniqueArray tmp;
+
+  NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_STRUCT, column));
+  NANOARROW_RETURN_NOT_OK(ArrowArrayAllocateChildren(tmp.get(), column.num_children()));
+  NANOARROW_RETURN_NOT_OK(set_null_mask(column, tmp.get()));
+
+  for (size_t i = 0; i < size_t(tmp->n_children); ++i) {
+    ArrowArray* child_ptr = tmp->children[i];
+    auto const child      = column.child(i);
+    NANOARROW_RETURN_NOT_OK(cudf::type_dispatcher(
+      child.type(), dispatch_to_arrow_device_view{child, stream, mr}, child_ptr));
+  }
+
+  ArrowArrayMove(tmp.get(), out);
+  return NANOARROW_OK;
+}
+
+template <>
+int dispatch_to_arrow_device_view::operator()<cudf::list_view>(ArrowArray* out) const
+{
+  nanoarrow::UniqueArray tmp;
+
+  NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_LIST, column));
+  NANOARROW_RETURN_NOT_OK(ArrowArrayAllocateChildren(tmp.get(), 1));
+  NANOARROW_RETURN_NOT_OK(set_null_mask(column, tmp.get()));
+
+  auto const lcv = cudf::lists_column_view(column);
+  NANOARROW_RETURN_NOT_OK(set_view_to_buffer(lcv.offsets(), tmp.get()));
+
+  auto child = lcv.child();
+  NANOARROW_RETURN_NOT_OK(cudf::type_dispatcher(
+    child.type(), dispatch_to_arrow_device_view{child, stream, mr}, tmp->children[0]));
+
+  ArrowArrayMove(tmp.get(), out);
+  return NANOARROW_OK;
+}
+
+template <>
+int dispatch_to_arrow_device_view::operator()<cudf::dictionary32>(ArrowArray* out) const
+{
+  nanoarrow::UniqueArray tmp;
+
+  NANOARROW_RETURN_NOT_OK(initialize_array(
+    tmp.get(),
+    id_to_arrow_type(column.child(cudf::dictionary_column_view::indices_column_index).type().id()),
+    column));
+  NANOARROW_RETURN_NOT_OK(ArrowArrayAllocateDictionary(tmp.get()));
+  NANOARROW_RETURN_NOT_OK(set_null_mask(column, tmp.get()));
+
+  auto const dcv = cudf::dictionary_column_view(column);
+  NANOARROW_RETURN_NOT_OK(set_view_to_buffer(dcv.indices(), tmp.get()));
+
+  auto keys = dcv.keys();
+  NANOARROW_RETURN_NOT_OK(cudf::type_dispatcher(
+    keys.type(), dispatch_to_arrow_device_view{keys, stream, mr}, tmp->dictionary));
+
+  ArrowArrayMove(tmp.get(), out);
+  return NANOARROW_OK;
+}
+
 struct ArrowDeviceArrayPrivateData {
   ArrowArray parent;
   cudaEvent_t sync_event;
@@ -592,49 +583,38 @@ struct ArrowDeviceArrayPrivateData {
 void ArrowDeviceArrayRelease(ArrowArray* array)
 {
   auto private_data = reinterpret_cast<ArrowDeviceArrayPrivateData*>(array->private_data);
-  cudaEventDestroy(private_data->sync_event);
+  RMM_ASSERT_CUDA_SUCCESS(cudaEventDestroy(private_data->sync_event));
   ArrowArrayRelease(&private_data->parent);
   delete private_data;
   array->release = nullptr;
 }
 
-}  // namespace
-}  // namespace detail
-
-unique_schema_t to_arrow_schema(cudf::table_view const& input,
-                                cudf::host_span<column_metadata const> metadata)
+unique_device_array_t create_device_array(nanoarrow::UniqueArray&& out,
+                                          rmm::cuda_stream_view stream)
 {
-  CUDF_EXPECTS((metadata.size() == static_cast<std::size_t>(input.num_columns())),
-               "columns' metadata should be equal to the number of columns in table");
-
-  nanoarrow::UniqueSchema result;
-  ArrowSchemaInit(result.get());
-  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(result.get(), input.num_columns()));
-
-  for (int i = 0; i < input.num_columns(); ++i) {
-    auto child = result->children[i];
-    auto col   = input.column(i);
-    ArrowSchemaInit(child);
-    NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child, metadata[i].name.c_str()));
-    child->flags = col.has_nulls() ? ARROW_FLAG_NULLABLE : 0;
-
-    if (col.type().id() == cudf::type_id::EMPTY) {
-      NANOARROW_THROW_NOT_OK(ArrowSchemaSetType(child, NANOARROW_TYPE_NA));
-      continue;
-    }
+  NANOARROW_THROW_NOT_OK(
+    ArrowArrayFinishBuilding(out.get(), NANOARROW_VALIDATION_LEVEL_MINIMAL, nullptr));
 
-    NANOARROW_THROW_NOT_OK(
-      cudf::type_dispatcher(col.type(), detail::dispatch_to_arrow_type{}, col, metadata[i], child));
-  }
+  auto private_data = std::make_unique<detail::ArrowDeviceArrayPrivateData>();
+  CUDF_CUDA_TRY(cudaEventCreate(&private_data->sync_event));
+  CUDF_CUDA_TRY(cudaEventRecord(private_data->sync_event, stream.value()));
 
-  unique_schema_t out(new ArrowSchema, [](ArrowSchema* schema) {
-    if (schema->release != nullptr) { ArrowSchemaRelease(schema); }
-    delete schema;
+  ArrowArrayMove(out.get(), &private_data->parent);
+  unique_device_array_t result(new ArrowDeviceArray, [](ArrowDeviceArray* arr) {
+    if (arr->array.release != nullptr) { ArrowArrayRelease(&arr->array); }
+    delete arr;
   });
-  result.move(out.get());
-  return out;
+  result->device_id          = rmm::get_current_cuda_device().value();
+  result->device_type        = ARROW_DEVICE_CUDA;
+  result->sync_event         = private_data->sync_event;
+  result->array              = private_data->parent;  // makes a shallow copy
+  result->array.private_data = private_data.release();
+  result->array.release      = &detail::ArrowDeviceArrayRelease;
+  return result;
 }
 
+}  // namespace
+
 unique_device_array_t to_arrow_device(cudf::table&& table,
                                       rmm::cuda_stream_view stream,
                                       rmm::device_async_resource_ref mr)
@@ -650,76 +630,89 @@ unique_device_array_t to_arrow_device(cudf::table&& table,
   for (size_t i = 0; i < cols.size(); ++i) {
     auto child = tmp->children[i];
     auto col   = cols[i].get();
-
-    if (col->type().id() == cudf::type_id::EMPTY) {
-      NANOARROW_THROW_NOT_OK(ArrowArrayInitFromType(child, NANOARROW_TYPE_NA));
-      child->length     = col->size();
-      child->null_count = col->size();
-      continue;
-    }
-
     NANOARROW_THROW_NOT_OK(cudf::type_dispatcher(
       col->type(), detail::dispatch_to_arrow_device{}, std::move(*col), stream, mr, child));
   }
 
-  NANOARROW_THROW_NOT_OK(
-    ArrowArrayFinishBuilding(tmp.get(), NANOARROW_VALIDATION_LEVEL_MINIMAL, nullptr));
+  return create_device_array(std::move(tmp), stream);
+}
 
-  auto private_data = std::make_unique<detail::ArrowDeviceArrayPrivateData>();
-  cudaEventCreate(&private_data->sync_event);
+unique_device_array_t to_arrow_device(cudf::column&& col,
+                                      rmm::cuda_stream_view stream,
+                                      rmm::device_async_resource_ref mr)
+{
+  nanoarrow::UniqueArray tmp;
 
-  auto status = cudaEventRecord(private_data->sync_event, stream);
-  if (status != cudaSuccess) { CUDF_FAIL("could not create event to sync on"); }
+  NANOARROW_THROW_NOT_OK(cudf::type_dispatcher(
+    col.type(), detail::dispatch_to_arrow_device{}, std::move(col), stream, mr, tmp.get()));
 
-  ArrowArrayMove(tmp.get(), &private_data->parent);
-  unique_device_array_t result(new ArrowDeviceArray, [](ArrowDeviceArray* arr) {
-    if (arr->array.release != nullptr) { ArrowArrayRelease(&arr->array); }
-    delete arr;
-  });
-  result->device_id          = rmm::get_current_cuda_device().value();
-  result->device_type        = ARROW_DEVICE_CUDA;
-  result->sync_event         = &private_data->sync_event;
-  result->array              = private_data->parent;
-  result->array.private_data = private_data.release();
-  result->array.release      = &detail::ArrowDeviceArrayRelease;
-  return result;
+  return create_device_array(std::move(tmp), stream);
 }
 
-unique_device_array_t to_arrow_device(cudf::column&& col,
+unique_device_array_t to_arrow_device(cudf::table_view const& table,
                                       rmm::cuda_stream_view stream,
                                       rmm::device_async_resource_ref mr)
 {
   nanoarrow::UniqueArray tmp;
-  if (col.type().id() == cudf::type_id::EMPTY) {
-    NANOARROW_THROW_NOT_OK(ArrowArrayInitFromType(tmp.get(), NANOARROW_TYPE_NA));
-    tmp->length     = col.size();
-    tmp->null_count = col.size();
+  NANOARROW_THROW_NOT_OK(ArrowArrayInitFromType(tmp.get(), NANOARROW_TYPE_STRUCT));
+
+  NANOARROW_THROW_NOT_OK(ArrowArrayAllocateChildren(tmp.get(), table.num_columns()));
+  tmp->length     = table.num_rows();
+  tmp->null_count = 0;
+
+  for (cudf::size_type i = 0; i < table.num_columns(); ++i) {
+    auto child = tmp->children[i];
+    auto col   = table.column(i);
+    NANOARROW_THROW_NOT_OK(cudf::type_dispatcher(
+      col.type(), detail::dispatch_to_arrow_device_view{col, stream, mr}, child));
   }
 
+  return create_device_array(std::move(tmp), stream);
+}
+
+unique_device_array_t to_arrow_device(cudf::column_view const& col,
+                                      rmm::cuda_stream_view stream,
+                                      rmm::device_async_resource_ref mr)
+{
+  nanoarrow::UniqueArray tmp;
+
   NANOARROW_THROW_NOT_OK(cudf::type_dispatcher(
-    col.type(), detail::dispatch_to_arrow_device{}, std::move(col), stream, mr, tmp.get()));
+    col.type(), detail::dispatch_to_arrow_device_view{col, stream, mr}, tmp.get()));
 
-  NANOARROW_THROW_NOT_OK(
-    ArrowArrayFinishBuilding(tmp.get(), NANOARROW_VALIDATION_LEVEL_MINIMAL, nullptr));
+  return create_device_array(std::move(tmp), stream);
+}
 
-  auto private_data = std::make_unique<detail::ArrowDeviceArrayPrivateData>();
-  cudaEventCreate(&private_data->sync_event);
+}  // namespace detail
 
-  auto status = cudaEventRecord(private_data->sync_event, stream);
-  if (status != cudaSuccess) { CUDF_FAIL("could not create event to sync on"); }
+unique_device_array_t to_arrow_device(cudf::table&& table,
+                                      rmm::cuda_stream_view stream,
+                                      rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::to_arrow_device(std::move(table), stream, mr);
+}
 
-  ArrowArrayMove(tmp.get(), &private_data->parent);
-  unique_device_array_t result(new ArrowDeviceArray, [](ArrowDeviceArray* arr) {
-    if (arr->array.release != nullptr) { ArrowArrayRelease(&arr->array); }
-    delete arr;
-  });
-  result->device_id          = rmm::get_current_cuda_device().value();
-  result->device_type        = ARROW_DEVICE_CUDA;
-  result->sync_event         = &private_data->sync_event;
-  result->array              = private_data->parent;
-  result->array.private_data = private_data.release();
-  result->array.release      = &detail::ArrowDeviceArrayRelease;
-  return result;
+unique_device_array_t to_arrow_device(cudf::column&& col,
+                                      rmm::cuda_stream_view stream,
+                                      rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::to_arrow_device(std::move(col), stream, mr);
+}
+
+unique_device_array_t to_arrow_device(cudf::table_view const& table,
+                                      rmm::cuda_stream_view stream,
+                                      rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::to_arrow_device(table, stream, mr);
 }
 
+unique_device_array_t to_arrow_device(cudf::column_view const& col,
+                                      rmm::cuda_stream_view stream,
+                                      rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::to_arrow_device(col, stream, mr);
+}
 }  // namespace cudf
diff --git a/cpp/src/interop/to_arrow_schema.cpp b/cpp/src/interop/to_arrow_schema.cpp
new file mode 100644
index 00000000000..6f943593dce
--- /dev/null
+++ b/cpp/src/interop/to_arrow_schema.cpp
@@ -0,0 +1,231 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "to_arrow_utilities.hpp"
+
+#include <cudf/column/column_view.hpp>
+#include <cudf/detail/interop.hpp>
+#include <cudf/dictionary/dictionary_column_view.hpp>
+#include <cudf/interop.hpp>
+#include <cudf/interop/detail/arrow.hpp>
+#include <cudf/lists/lists_column_view.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf/table/table_view.hpp>
+#include <cudf/types.hpp>
+#include <cudf/utilities/traits.hpp>
+#include <cudf/utilities/type_dispatcher.hpp>
+
+#include <nanoarrow/nanoarrow.h>
+#include <nanoarrow/nanoarrow.hpp>
+
+namespace cudf {
+namespace detail {
+namespace {
+
+struct dispatch_to_arrow_type {
+  template <typename T, CUDF_ENABLE_IF(not is_rep_layout_compatible<T>())>
+  int operator()(column_view, column_metadata const&, ArrowSchema*)
+  {
+    CUDF_FAIL("Unsupported type for to_arrow_schema", cudf::data_type_error);
+  }
+
+  template <typename T, CUDF_ENABLE_IF(is_rep_layout_compatible<T>())>
+  int operator()(column_view input_view, column_metadata const&, ArrowSchema* out)
+  {
+    cudf::type_id id = input_view.type().id();
+    switch (id) {
+      case cudf::type_id::TIMESTAMP_SECONDS:
+        return ArrowSchemaSetTypeDateTime(
+          out, NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_SECOND, nullptr);
+      case cudf::type_id::TIMESTAMP_MILLISECONDS:
+        return ArrowSchemaSetTypeDateTime(
+          out, NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_MILLI, nullptr);
+      case cudf::type_id::TIMESTAMP_MICROSECONDS:
+        return ArrowSchemaSetTypeDateTime(
+          out, NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_MICRO, nullptr);
+      case cudf::type_id::TIMESTAMP_NANOSECONDS:
+        return ArrowSchemaSetTypeDateTime(
+          out, NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_NANO, nullptr);
+      case cudf::type_id::DURATION_SECONDS:
+        return ArrowSchemaSetTypeDateTime(
+          out, NANOARROW_TYPE_DURATION, NANOARROW_TIME_UNIT_SECOND, nullptr);
+      case cudf::type_id::DURATION_MILLISECONDS:
+        return ArrowSchemaSetTypeDateTime(
+          out, NANOARROW_TYPE_DURATION, NANOARROW_TIME_UNIT_MILLI, nullptr);
+      case cudf::type_id::DURATION_MICROSECONDS:
+        return ArrowSchemaSetTypeDateTime(
+          out, NANOARROW_TYPE_DURATION, NANOARROW_TIME_UNIT_MICRO, nullptr);
+      case cudf::type_id::DURATION_NANOSECONDS:
+        return ArrowSchemaSetTypeDateTime(
+          out, NANOARROW_TYPE_DURATION, NANOARROW_TIME_UNIT_NANO, nullptr);
+      default: return ArrowSchemaSetType(out, id_to_arrow_type(id));
+    }
+  }
+};
+
+template <typename DeviceType>
+int decimals_to_arrow(column_view input, ArrowSchema* out)
+{
+  // Arrow doesn't support decimal32/decimal64 currently. decimal128
+  // is the smallest that arrow supports besides float32/float64 so we
+  // upcast to decimal128.
+  return ArrowSchemaSetTypeDecimal(out,
+                                   NANOARROW_TYPE_DECIMAL128,
+                                   cudf::detail::max_precision<DeviceType>(),
+                                   -input.type().scale());
+}
+
+template <>
+int dispatch_to_arrow_type::operator()<numeric::decimal32>(column_view input,
+                                                           column_metadata const&,
+                                                           ArrowSchema* out)
+{
+  using DeviceType = int32_t;
+  return decimals_to_arrow<DeviceType>(input, out);
+}
+
+template <>
+int dispatch_to_arrow_type::operator()<numeric::decimal64>(column_view input,
+                                                           column_metadata const&,
+                                                           ArrowSchema* out)
+{
+  using DeviceType = int64_t;
+  return decimals_to_arrow<DeviceType>(input, out);
+}
+
+template <>
+int dispatch_to_arrow_type::operator()<numeric::decimal128>(column_view input,
+                                                            column_metadata const&,
+                                                            ArrowSchema* out)
+{
+  using DeviceType = __int128_t;
+  return decimals_to_arrow<DeviceType>(input, out);
+}
+
+template <>
+int dispatch_to_arrow_type::operator()<cudf::string_view>(column_view input,
+                                                          column_metadata const&,
+                                                          ArrowSchema* out)
+{
+  return ArrowSchemaSetType(out, NANOARROW_TYPE_STRING);
+}
+
+// these forward declarations are needed due to the recursive calls to them
+// inside their definitions and in struct_vew for handling children
+template <>
+int dispatch_to_arrow_type::operator()<cudf::list_view>(column_view input,
+                                                        column_metadata const& metadata,
+                                                        ArrowSchema* out);
+
+template <>
+int dispatch_to_arrow_type::operator()<cudf::dictionary32>(column_view input,
+                                                           column_metadata const& metadata,
+                                                           ArrowSchema* out);
+
+template <>
+int dispatch_to_arrow_type::operator()<cudf::struct_view>(column_view input,
+                                                          column_metadata const& metadata,
+                                                          ArrowSchema* out)
+{
+  CUDF_EXPECTS(metadata.children_meta.size() == static_cast<std::size_t>(input.num_children()),
+               "Number of field names and number of children doesn't match\n");
+
+  NANOARROW_RETURN_NOT_OK(ArrowSchemaSetTypeStruct(out, input.num_children()));
+  for (int i = 0; i < input.num_children(); ++i) {
+    auto child = out->children[i];
+    auto col   = input.child(i);
+    ArrowSchemaInit(child);
+    NANOARROW_RETURN_NOT_OK(ArrowSchemaSetName(child, metadata.children_meta[i].name.c_str()));
+
+    child->flags = col.has_nulls() ? ARROW_FLAG_NULLABLE : 0;
+
+    NANOARROW_RETURN_NOT_OK(cudf::type_dispatcher(
+      col.type(), detail::dispatch_to_arrow_type{}, col, metadata.children_meta[i], child));
+  }
+
+  return NANOARROW_OK;
+}
+
+template <>
+int dispatch_to_arrow_type::operator()<cudf::list_view>(column_view input,
+                                                        column_metadata const& metadata,
+                                                        ArrowSchema* out)
+{
+  NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(out, NANOARROW_TYPE_LIST));
+  auto child = input.child(cudf::lists_column_view::child_column_index);
+  ArrowSchemaInit(out->children[0]);
+  auto child_meta =
+    metadata.children_meta.empty() ? column_metadata{"element"} : metadata.children_meta[0];
+
+  out->flags = input.has_nulls() ? ARROW_FLAG_NULLABLE : 0;
+  NANOARROW_RETURN_NOT_OK(ArrowSchemaSetName(out->children[0], child_meta.name.c_str()));
+  out->children[0]->flags = child.has_nulls() ? ARROW_FLAG_NULLABLE : 0;
+  return cudf::type_dispatcher(
+    child.type(), detail::dispatch_to_arrow_type{}, child, child_meta, out->children[0]);
+}
+
+template <>
+int dispatch_to_arrow_type::operator()<cudf::dictionary32>(column_view input,
+                                                           column_metadata const& metadata,
+                                                           ArrowSchema* out)
+{
+  cudf::dictionary_column_view dview{input};
+
+  NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(out, id_to_arrow_type(dview.indices().type().id())));
+  NANOARROW_RETURN_NOT_OK(ArrowSchemaAllocateDictionary(out));
+  ArrowSchemaInit(out->dictionary);
+
+  auto dict_keys = dview.keys();
+  return cudf::type_dispatcher(
+    dict_keys.type(),
+    detail::dispatch_to_arrow_type{},
+    dict_keys,
+    metadata.children_meta.empty() ? column_metadata{"keys"} : metadata.children_meta[0],
+    out->dictionary);
+}
+}  // namespace
+}  // namespace detail
+
+unique_schema_t to_arrow_schema(cudf::table_view const& input,
+                                cudf::host_span<column_metadata const> metadata)
+{
+  CUDF_EXPECTS((metadata.size() == static_cast<std::size_t>(input.num_columns())),
+               "columns' metadata should be equal to the number of columns in table");
+
+  nanoarrow::UniqueSchema result;
+  ArrowSchemaInit(result.get());
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(result.get(), input.num_columns()));
+
+  for (int i = 0; i < input.num_columns(); ++i) {
+    auto child = result->children[i];
+    auto col   = input.column(i);
+    ArrowSchemaInit(child);
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child, metadata[i].name.c_str()));
+    child->flags = col.has_nulls() ? ARROW_FLAG_NULLABLE : 0;
+
+    NANOARROW_THROW_NOT_OK(
+      cudf::type_dispatcher(col.type(), detail::dispatch_to_arrow_type{}, col, metadata[i], child));
+  }
+
+  unique_schema_t out(new ArrowSchema, [](ArrowSchema* schema) {
+    if (schema->release != nullptr) { ArrowSchemaRelease(schema); }
+    delete schema;
+  });
+  result.move(out.get());
+  return out;
+}
+
+}  // namespace cudf
diff --git a/cpp/src/interop/to_arrow_utilities.cpp b/cpp/src/interop/to_arrow_utilities.cpp
new file mode 100644
index 00000000000..04d17847273
--- /dev/null
+++ b/cpp/src/interop/to_arrow_utilities.cpp
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "to_arrow_utilities.hpp"
+
+#include <cudf/utilities/error.hpp>
+
+namespace cudf {
+namespace detail {
+
+ArrowType id_to_arrow_type(cudf::type_id id)
+{
+  switch (id) {
+    case cudf::type_id::BOOL8: return NANOARROW_TYPE_BOOL;
+    case cudf::type_id::INT8: return NANOARROW_TYPE_INT8;
+    case cudf::type_id::INT16: return NANOARROW_TYPE_INT16;
+    case cudf::type_id::INT32: return NANOARROW_TYPE_INT32;
+    case cudf::type_id::INT64: return NANOARROW_TYPE_INT64;
+    case cudf::type_id::UINT8: return NANOARROW_TYPE_UINT8;
+    case cudf::type_id::UINT16: return NANOARROW_TYPE_UINT16;
+    case cudf::type_id::UINT32: return NANOARROW_TYPE_UINT32;
+    case cudf::type_id::UINT64: return NANOARROW_TYPE_UINT64;
+    case cudf::type_id::FLOAT32: return NANOARROW_TYPE_FLOAT;
+    case cudf::type_id::FLOAT64: return NANOARROW_TYPE_DOUBLE;
+    case cudf::type_id::TIMESTAMP_DAYS: return NANOARROW_TYPE_DATE32;
+    default: CUDF_FAIL("Unsupported type_id conversion to arrow type", cudf::data_type_error);
+  }
+}
+
+}  // namespace detail
+}  // namespace cudf
diff --git a/cpp/src/interop/to_arrow_utilities.hpp b/cpp/src/interop/to_arrow_utilities.hpp
new file mode 100644
index 00000000000..3c01c726a7b
--- /dev/null
+++ b/cpp/src/interop/to_arrow_utilities.hpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cudf/types.hpp>
+
+#include <nanoarrow/nanoarrow_types.h>
+
+namespace cudf {
+namespace detail {
+
+/**
+ * @brief Map cudf column type id to ArrowType id
+ *
+ * @param id Column type id
+ * @return ArrowType id
+ */
+ArrowType id_to_arrow_type(cudf::type_id id);
+
+}  // namespace detail
+}  // namespace cudf
diff --git a/cpp/tests/interop/to_arrow_device_test.cpp b/cpp/tests/interop/to_arrow_device_test.cpp
index 16aab53a249..d6eae8dece1 100644
--- a/cpp/tests/interop/to_arrow_device_test.cpp
+++ b/cpp/tests/interop/to_arrow_device_test.cpp
@@ -327,14 +327,16 @@ TEST_F(ToArrowDeviceTest, EmptyTable)
   auto got_arrow_schema = cudf::to_arrow_schema(table->view(), meta);
 
   compare_schemas(schema.get(), got_arrow_schema.get());
-  ArrowSchemaRelease(got_arrow_schema.get());
 
-  auto got_arrow_device = cudf::to_arrow_device(std::move(*table));
+  auto got_arrow_device = cudf::to_arrow_device(table->view());
   EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_device->device_id);
   EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_device->device_type);
+  compare_arrays(schema.get(), arr.get(), &got_arrow_device->array);
 
+  got_arrow_device = cudf::to_arrow_device(std::move(*table));
+  EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_device->device_id);
+  EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_device->device_type);
   compare_arrays(schema.get(), arr.get(), &got_arrow_device->array);
-  ArrowArrayRelease(&got_arrow_device->array);
 }
 
 TEST_F(ToArrowDeviceTest, DateTimeTable)
@@ -358,10 +360,9 @@ TEST_F(ToArrowDeviceTest, DateTimeTable)
   expected_schema->children[0]->flags = 0;
 
   compare_schemas(expected_schema.get(), got_arrow_schema.get());
-  ArrowSchemaRelease(got_arrow_schema.get());
 
   auto data_ptr        = input.get_column(0).view().data<int64_t>();
-  auto got_arrow_array = cudf::to_arrow_device(std::move(input));
+  auto got_arrow_array = cudf::to_arrow_device(input.view());
   EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id);
   EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type);
 
@@ -377,7 +378,21 @@ TEST_F(ToArrowDeviceTest, DateTimeTable)
   EXPECT_EQ(nullptr, got_arrow_array->array.children[0]->buffers[0]);
   EXPECT_EQ(data_ptr, got_arrow_array->array.children[0]->buffers[1]);
 
-  ArrowArrayRelease(&got_arrow_array->array);
+  got_arrow_array = cudf::to_arrow_device(std::move(input));
+  EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id);
+  EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type);
+
+  EXPECT_EQ(data.size(), got_arrow_array->array.length);
+  EXPECT_EQ(0, got_arrow_array->array.null_count);
+  EXPECT_EQ(0, got_arrow_array->array.offset);
+  EXPECT_EQ(1, got_arrow_array->array.n_children);
+  EXPECT_EQ(nullptr, got_arrow_array->array.buffers[0]);
+
+  EXPECT_EQ(data.size(), got_arrow_array->array.children[0]->length);
+  EXPECT_EQ(0, got_arrow_array->array.children[0]->null_count);
+  EXPECT_EQ(0, got_arrow_array->array.children[0]->offset);
+  EXPECT_EQ(nullptr, got_arrow_array->array.children[0]->buffers[0]);
+  EXPECT_EQ(data_ptr, got_arrow_array->array.children[0]->buffers[1]);
 }
 
 TYPED_TEST(ToArrowDeviceTestDurationsTest, DurationTable)
@@ -415,10 +430,9 @@ TYPED_TEST(ToArrowDeviceTestDurationsTest, DurationTable)
   auto got_arrow_schema =
     cudf::to_arrow_schema(input.view(), std::vector<cudf::column_metadata>{{"a"}});
   BaseArrowFixture::compare_schemas(expected_schema.get(), got_arrow_schema.get());
-  ArrowSchemaRelease(got_arrow_schema.get());
 
   auto data_ptr        = input.get_column(0).view().data<int64_t>();
-  auto got_arrow_array = cudf::to_arrow_device(std::move(input));
+  auto got_arrow_array = cudf::to_arrow_device(input.view());
   EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id);
   EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type);
 
@@ -434,7 +448,21 @@ TYPED_TEST(ToArrowDeviceTestDurationsTest, DurationTable)
   EXPECT_EQ(nullptr, got_arrow_array->array.children[0]->buffers[0]);
   EXPECT_EQ(data_ptr, got_arrow_array->array.children[0]->buffers[1]);
 
-  ArrowArrayRelease(&got_arrow_array->array);
+  got_arrow_array = cudf::to_arrow_device(std::move(input));
+  EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id);
+  EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type);
+
+  EXPECT_EQ(data.size(), got_arrow_array->array.length);
+  EXPECT_EQ(0, got_arrow_array->array.null_count);
+  EXPECT_EQ(0, got_arrow_array->array.offset);
+  EXPECT_EQ(1, got_arrow_array->array.n_children);
+  EXPECT_EQ(nullptr, got_arrow_array->array.buffers[0]);
+
+  EXPECT_EQ(data.size(), got_arrow_array->array.children[0]->length);
+  EXPECT_EQ(0, got_arrow_array->array.children[0]->null_count);
+  EXPECT_EQ(0, got_arrow_array->array.children[0]->offset);
+  EXPECT_EQ(nullptr, got_arrow_array->array.children[0]->buffers[0]);
+  EXPECT_EQ(data_ptr, got_arrow_array->array.children[0]->buffers[1]);
 }
 
 TEST_F(ToArrowDeviceTest, NestedList)
@@ -471,7 +499,6 @@ TEST_F(ToArrowDeviceTest, NestedList)
   auto got_arrow_schema =
     cudf::to_arrow_schema(input.view(), std::vector<cudf::column_metadata>{{"a"}});
   compare_schemas(expected_schema.get(), got_arrow_schema.get());
-  ArrowSchemaRelease(got_arrow_schema.get());
 
   nanoarrow::UniqueArray expected_array;
   EXPECT_EQ(NANOARROW_OK,
@@ -487,12 +514,15 @@ TEST_F(ToArrowDeviceTest, NestedList)
   NANOARROW_THROW_NOT_OK(
     ArrowArrayFinishBuilding(expected_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr));
 
-  auto got_arrow_array = cudf::to_arrow_device(std::move(input));
+  auto got_arrow_array = cudf::to_arrow_device(input.view());
   EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id);
   EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type);
+  compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array);
 
+  got_arrow_array = cudf::to_arrow_device(std::move(input));
+  EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id);
+  EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type);
   compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array);
-  ArrowArrayRelease(&got_arrow_array->array);
 }
 
 TEST_F(ToArrowDeviceTest, StructColumn)
@@ -588,7 +618,6 @@ TEST_F(ToArrowDeviceTest, StructColumn)
   auto got_arrow_schema =
     cudf::to_arrow_schema(input.view(), std::vector<cudf::column_metadata>{metadata});
   compare_schemas(expected_schema.get(), got_arrow_schema.get());
-  ArrowSchemaRelease(got_arrow_schema.get());
 
   nanoarrow::UniqueArray expected_array;
   NANOARROW_THROW_NOT_OK(
@@ -629,12 +658,15 @@ TEST_F(ToArrowDeviceTest, StructColumn)
   NANOARROW_THROW_NOT_OK(
     ArrowArrayFinishBuilding(expected_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr));
 
-  auto got_arrow_array = cudf::to_arrow_device(std::move(input));
+  auto got_arrow_array = cudf::to_arrow_device(input.view());
   EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id);
   EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type);
+  compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array);
 
+  got_arrow_array = cudf::to_arrow_device(std::move(input));
+  EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id);
+  EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type);
   compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array);
-  ArrowArrayRelease(&got_arrow_array->array);
 }
 
 template <typename T>
@@ -665,7 +697,6 @@ TEST_F(ToArrowDeviceTest, FixedPoint64Table)
     auto got_arrow_schema =
       cudf::to_arrow_schema(input.view(), std::vector<cudf::column_metadata>{{"a"}});
     compare_schemas(expected_schema.get(), got_arrow_schema.get());
-    ArrowSchemaRelease(got_arrow_schema.get());
 
     auto result_dev_data = std::make_unique<rmm::device_uvector<int64_t>>(
       expect_data.size(), cudf::get_default_stream());
@@ -700,12 +731,15 @@ TEST_F(ToArrowDeviceTest, FixedPoint64Table)
     NANOARROW_THROW_NOT_OK(
       ArrowArrayFinishBuilding(expected_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr));
 
-    auto got_arrow_array = cudf::to_arrow_device(std::move(input));
+    auto got_arrow_array = cudf::to_arrow_device(input.view());
     ASSERT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id);
     ASSERT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type);
+    compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array);
 
+    got_arrow_array = cudf::to_arrow_device(std::move(input));
+    ASSERT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id);
+    ASSERT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type);
     compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array);
-    ArrowArrayRelease(&got_arrow_array->array);
   }
 }
 
@@ -734,7 +768,6 @@ TEST_F(ToArrowDeviceTest, FixedPoint128Table)
     auto got_arrow_schema =
       cudf::to_arrow_schema(input.view(), std::vector<cudf::column_metadata>{{"a"}});
     compare_schemas(expected_schema.get(), got_arrow_schema.get());
-    ArrowSchemaRelease(got_arrow_schema.get());
 
     nanoarrow::UniqueArray expected_array;
     NANOARROW_THROW_NOT_OK(
@@ -745,11 +778,14 @@ TEST_F(ToArrowDeviceTest, FixedPoint128Table)
     NANOARROW_THROW_NOT_OK(
       ArrowArrayFinishBuilding(expected_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr));
 
-    auto got_arrow_array = cudf::to_arrow_device(std::move(input));
+    auto got_arrow_array = cudf::to_arrow_device(input.view());
     EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id);
     EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type);
+    compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array);
 
+    got_arrow_array = cudf::to_arrow_device(std::move(input));
+    EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id);
+    EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type);
     compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array);
-    ArrowArrayRelease(&got_arrow_array->array);
   }
 }

From a2c81e71fd9a7bbb0a89eee8a456d0066fa3ecbb Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Mon, 22 Apr 2024 08:29:31 -0400
Subject: [PATCH 093/842] Large strings support in cudf::merge (#15374)

Enable large strings support in `cudf::merge`.
Simplifies the strings specialization to use the gather-based strings factory function which is already optimized for long strings and is now appropriately enabled for large strings.
Also moved source from `include/cudf/strings/detail/merge.cuh` to `src/strings/merge/merge.cu` file since the template implemenation was not actually required.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Robert Maynard (https://github.com/robertmaynard)
  - Bradley Dice (https://github.com/bdice)
  - https://github.com/nvdbaranec
  - Vukasin Milovanovic (https://github.com/vuule)

URL: https://github.com/rapidsai/cudf/pull/15374
---
 cpp/CMakeLists.txt                            |   1 +
 cpp/benchmarks/CMakeLists.txt                 |   4 +-
 cpp/benchmarks/merge/merge_strings.cpp        |  64 ++++++++++
 cpp/include/cudf/strings/detail/merge.cuh     | 111 ------------------
 cpp/include/cudf/strings/detail/merge.hpp     |  41 +++++++
 .../cudf/strings/detail/strings_children.cuh  |  26 ++--
 .../detail/strings_column_factories.cuh       |  29 ++---
 cpp/src/merge/merge.cu                        |  16 +--
 cpp/src/strings/merge/merge.cu                |  74 ++++++++++++
 cpp/tests/merge/merge_string_test.cpp         |  57 +++++++++
 10 files changed, 267 insertions(+), 156 deletions(-)
 create mode 100644 cpp/benchmarks/merge/merge_strings.cpp
 delete mode 100644 cpp/include/cudf/strings/detail/merge.cuh
 create mode 100644 cpp/include/cudf/strings/detail/merge.hpp
 create mode 100644 cpp/src/strings/merge/merge.cu

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 3c7e10c9bc4..60d0094efac 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -586,6 +586,7 @@ add_library(
   src/strings/filling/fill.cu
   src/strings/filter_chars.cu
   src/strings/like.cu
+  src/strings/merge/merge.cu
   src/strings/padding.cu
   src/strings/regex/regcomp.cpp
   src/strings/regex/regexec.cpp
diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 2c78a31f0f8..d36ecfd3a21 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -236,7 +236,9 @@ ConfigureNVBench(HASHING_NVBENCH hashing/hash.cpp)
 # ##################################################################################################
 # * merge benchmark -------------------------------------------------------------------------------
 ConfigureBench(MERGE_BENCH merge/merge.cpp)
-ConfigureNVBench(MERGE_NVBENCH merge/merge_structs.cpp merge/merge_lists.cpp)
+ConfigureNVBench(
+  MERGE_NVBENCH merge/merge_lists.cpp merge/merge_structs.cpp merge/merge_strings.cpp
+)
 
 # ##################################################################################################
 # * null_mask benchmark ---------------------------------------------------------------------------
diff --git a/cpp/benchmarks/merge/merge_strings.cpp b/cpp/benchmarks/merge/merge_strings.cpp
new file mode 100644
index 00000000000..3d0f1865490
--- /dev/null
+++ b/cpp/benchmarks/merge/merge_strings.cpp
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+
+#include <cudf/merge.hpp>
+#include <cudf/sorting.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf/utilities/default_stream.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+void nvbench_merge_strings(nvbench::state& state)
+{
+  auto stream = cudf::get_default_stream();
+
+  auto const num_rows  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
+  if (static_cast<std::size_t>(2 * num_rows) * static_cast<std::size_t>(row_width) >=
+      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
+    state.skip("Skip benchmarks greater than size_type limit");
+  }
+
+  data_profile const table_profile =
+    data_profile_builder()
+      .distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width)
+      .no_validity();
+  auto const source_tables = create_random_table(
+    {cudf::type_id::STRING, cudf::type_id::STRING}, row_count{num_rows}, table_profile);
+
+  auto const sorted_lhs = cudf::sort(cudf::table_view({source_tables->view().column(0)}));
+  auto const sorted_rhs = cudf::sort(cudf::table_view({source_tables->view().column(1)}));
+  auto const lhs        = sorted_lhs->view().column(0);
+  auto const rhs        = sorted_rhs->view().column(0);
+
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
+  auto chars_size = cudf::strings_column_view(lhs).chars_size(stream) +
+                    cudf::strings_column_view(rhs).chars_size(stream);
+  state.add_global_memory_reads<nvbench::int8_t>(chars_size);   // all bytes are read
+  state.add_global_memory_writes<nvbench::int8_t>(chars_size);  // all bytes are written
+
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    [[maybe_unused]] auto result = cudf::merge(
+      {cudf::table_view({lhs}), cudf::table_view({rhs})}, {0}, {cudf::order::ASCENDING});
+  });
+}
+
+NVBENCH_BENCH(nvbench_merge_strings)
+  .set_name("merge_strings")
+  .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096})
+  .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216});
diff --git a/cpp/include/cudf/strings/detail/merge.cuh b/cpp/include/cudf/strings/detail/merge.cuh
deleted file mode 100644
index 457c2b7f740..00000000000
--- a/cpp/include/cudf/strings/detail/merge.cuh
+++ /dev/null
@@ -1,111 +0,0 @@
-/*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-#include <cudf/column/column.hpp>
-#include <cudf/column/column_device_view.cuh>
-#include <cudf/column/column_factories.hpp>
-#include <cudf/detail/merge.hpp>
-#include <cudf/detail/null_mask.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
-#include <cudf/strings/string_view.cuh>
-#include <cudf/strings/strings_column_view.hpp>
-
-#include <rmm/cuda_stream_view.hpp>
-#include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
-
-#include <cuda/functional>
-#include <thrust/for_each.h>
-#include <thrust/iterator/counting_iterator.h>
-#include <thrust/iterator/transform_iterator.h>
-#include <thrust/tuple.h>
-
-namespace cudf {
-namespace strings {
-namespace detail {
-/**
- * @brief Merges two strings columns.
- *
- * Caller must set the validity mask in the output column.
- *
- * @tparam row_order_iterator This must be an iterator for type thrust::tuple<side,size_type>.
- *
- * @param lhs First column.
- * @param rhs Second column.
- * @param row_order Indexes for each column.
- * @param stream CUDA stream used for device memory operations and kernel launches.
- * @param mr Device memory resource used to allocate the returned column's device memory.
- * @return New strings column.
- */
-template <typename index_type, typename row_order_iterator>
-std::unique_ptr<column> merge(strings_column_view const& lhs,
-                              strings_column_view const& rhs,
-                              row_order_iterator begin,
-                              row_order_iterator end,
-                              rmm::cuda_stream_view stream,
-                              rmm::device_async_resource_ref mr)
-{
-  using cudf::detail::side;
-  size_type strings_count = static_cast<size_type>(std::distance(begin, end));
-  if (strings_count == 0) return make_empty_column(type_id::STRING);
-
-  auto lhs_column = column_device_view::create(lhs.parent(), stream);
-  auto d_lhs      = *lhs_column;
-  auto rhs_column = column_device_view::create(rhs.parent(), stream);
-  auto d_rhs      = *rhs_column;
-
-  // caller will set the null mask
-  rmm::device_buffer null_mask{0, stream, mr};
-  size_type null_count = lhs.null_count() + rhs.null_count();
-  if (null_count > 0)
-    null_mask = cudf::detail::create_null_mask(strings_count, mask_state::ALL_VALID, stream, mr);
-
-  // build offsets column
-  auto offsets_transformer =
-    cuda::proclaim_return_type<size_type>([d_lhs, d_rhs] __device__(auto index_pair) {
-      auto const [side, index] = index_pair;
-      if (side == side::LEFT ? d_lhs.is_null(index) : d_rhs.is_null(index)) return 0;
-      auto d_str =
-        side == side::LEFT ? d_lhs.element<string_view>(index) : d_rhs.element<string_view>(index);
-      return d_str.size_bytes();
-    });
-  auto offsets_transformer_itr = thrust::make_transform_iterator(begin, offsets_transformer);
-  auto [offsets_column, bytes] = cudf::detail::make_offsets_child_column(
-    offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr);
-  auto d_offsets = offsets_column->view().template data<int32_t>();
-
-  // create the chars column
-  rmm::device_uvector<char> chars(bytes, stream, mr);
-  auto d_chars = chars.data();
-  thrust::for_each_n(rmm::exec_policy(stream),
-                     thrust::make_counting_iterator<size_type>(0),
-                     strings_count,
-                     [d_lhs, d_rhs, begin, d_offsets, d_chars] __device__(size_type idx) {
-                       auto const [side, index] = begin[idx];
-                       if (side == side::LEFT ? d_lhs.is_null(index) : d_rhs.is_null(index)) return;
-                       auto d_str = side == side::LEFT ? d_lhs.element<string_view>(index)
-                                                       : d_rhs.element<string_view>(index);
-                       memcpy(d_chars + d_offsets[idx], d_str.data(), d_str.size_bytes());
-                     });
-
-  return make_strings_column(
-    strings_count, std::move(offsets_column), chars.release(), null_count, std::move(null_mask));
-}
-
-}  // namespace detail
-}  // namespace strings
-}  // namespace cudf
diff --git a/cpp/include/cudf/strings/detail/merge.hpp b/cpp/include/cudf/strings/detail/merge.hpp
new file mode 100644
index 00000000000..35fd9c0593d
--- /dev/null
+++ b/cpp/include/cudf/strings/detail/merge.hpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cudf/column/column.hpp>
+#include <cudf/detail/merge.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+
+namespace cudf ::strings ::detail {
+/**
+ * @brief Merges two strings columns
+ *
+ * @param lhs First column
+ * @param rhs Second column
+ * @param row_order Indices for each column
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New strings column
+ */
+std::unique_ptr<column> merge(strings_column_view const& lhs,
+                              strings_column_view const& rhs,
+                              cudf::detail::index_vector const& row_order,
+                              rmm::cuda_stream_view stream,
+                              rmm::device_async_resource_ref mr);
+
+}  // namespace cudf::strings::detail
diff --git a/cpp/include/cudf/strings/detail/strings_children.cuh b/cpp/include/cudf/strings/detail/strings_children.cuh
index 7136df325f4..35812c0573d 100644
--- a/cpp/include/cudf/strings/detail/strings_children.cuh
+++ b/cpp/include/cudf/strings/detail/strings_children.cuh
@@ -164,22 +164,22 @@ std::pair<std::unique_ptr<column>, int64_t> make_offsets_child_column(
     });
   auto input_itr = cudf::detail::make_counting_transform_iterator(0, map_fn);
   // Use the sizes-to-offsets iterator to compute the total number of elements
-  auto const total_elements =
+  auto const total_bytes =
     cudf::detail::sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets, stream);
 
-  // TODO: replace exception with if-statement when enabling creating INT64 offsets
-  CUDF_EXPECTS(total_elements <= size_type_max,
-               "Size of output exceeds the character size limit",
+  auto const threshold = get_offset64_threshold();
+  CUDF_EXPECTS(is_large_strings_enabled() || (total_bytes < threshold),
+               "Size of output exceeds the column size limit",
                std::overflow_error);
-  // if (total_elements >= get_offset64_threshold()) {
-  //   // recompute as int64 offsets when above the threshold
-  //   offsets_column = make_numeric_column(
-  //     data_type{type_id::INT64}, strings_count + 1, mask_state::UNALLOCATED, stream, mr);
-  //   auto d_offsets64 = offsets_column->mutable_view().template data<int64_t>();
-  //   sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets64, stream);
-  // }
-
-  return std::pair(std::move(offsets_column), total_elements);
+  if (total_bytes >= get_offset64_threshold()) {
+    // recompute as int64 offsets when above the threshold
+    offsets_column = make_numeric_column(
+      data_type{type_id::INT64}, strings_count + 1, mask_state::UNALLOCATED, stream, mr);
+    auto d_offsets64 = offsets_column->mutable_view().template data<int64_t>();
+    cudf::detail::sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets64, stream);
+  }
+
+  return std::pair(std::move(offsets_column), total_bytes);
 }
 
 }  // namespace detail
diff --git a/cpp/include/cudf/strings/detail/strings_column_factories.cuh b/cpp/include/cudf/strings/detail/strings_column_factories.cuh
index 079b6a73e0b..a3221038eed 100644
--- a/cpp/include/cudf/strings/detail/strings_column_factories.cuh
+++ b/cpp/include/cudf/strings/detail/strings_column_factories.cuh
@@ -86,9 +86,10 @@ std::unique_ptr<column> make_strings_column(IndexPairIterator begin,
       return (item.first != nullptr ? static_cast<size_type>(item.second) : size_type{0});
     });
   auto offsets_transformer_itr = thrust::make_transform_iterator(begin, offsets_transformer);
-  auto [offsets_column, bytes] = cudf::detail::make_offsets_child_column(
+  auto [offsets_column, bytes] = cudf::strings::detail::make_offsets_child_column(
     offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr);
-  auto offsets_view = offsets_column->view();
+  auto const d_offsets =
+    cudf::detail::offsetalator_factory::make_input_iterator(offsets_column->view());
 
   // create null mask
   auto validator = [] __device__(string_index_pair const item) { return item.first != nullptr; };
@@ -98,11 +99,10 @@ std::unique_ptr<column> make_strings_column(IndexPairIterator begin,
     (null_count > 0) ? std::move(new_nulls.first) : rmm::device_buffer{0, stream, mr};
 
   // build chars column
-  auto chars_data = [offsets_view, bytes = bytes, begin, strings_count, null_count, stream, mr] {
+  auto chars_data = [d_offsets, bytes = bytes, begin, strings_count, null_count, stream, mr] {
     auto const avg_bytes_per_row = bytes / std::max(strings_count - null_count, 1);
     // use a character-parallel kernel for long string lengths
     if (avg_bytes_per_row > FACTORY_BYTES_PER_ROW_THRESHOLD) {
-      auto const d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(offsets_view);
       auto const str_begin = thrust::make_transform_iterator(
         begin, cuda::proclaim_return_type<string_view>([] __device__(auto ip) {
           return string_view{ip.first, ip.second};
@@ -121,12 +121,11 @@ std::unique_ptr<column> make_strings_column(IndexPairIterator begin,
       auto d_chars    = chars_data.data();
       auto copy_chars = [d_chars] __device__(auto item) {
         string_index_pair const str = thrust::get<0>(item);
-        size_type const offset      = thrust::get<1>(item);
+        int64_t const offset        = thrust::get<1>(item);
         if (str.first != nullptr) memcpy(d_chars + offset, str.first, str.second);
       };
       thrust::for_each_n(rmm::exec_policy(stream),
-                         thrust::make_zip_iterator(
-                           thrust::make_tuple(begin, offsets_view.template begin<size_type>())),
+                         thrust::make_zip_iterator(thrust::make_tuple(begin, d_offsets)),
                          strings_count,
                          copy_chars);
       return chars_data;
@@ -168,21 +167,15 @@ std::unique_ptr<column> make_strings_column(CharIterator chars_begin,
 {
   CUDF_FUNC_RANGE();
   size_type strings_count = thrust::distance(offsets_begin, offsets_end) - 1;
-  size_type bytes         = std::distance(chars_begin, chars_end) * sizeof(char);
-  if (strings_count == 0) return make_empty_column(type_id::STRING);
+  if (strings_count == 0) { return make_empty_column(type_id::STRING); }
 
+  int64_t const bytes = std::distance(chars_begin, chars_end) * sizeof(char);
   CUDF_EXPECTS(bytes >= 0, "invalid offsets data");
 
   // build offsets column -- this is the number of strings + 1
-  auto offsets_column = make_numeric_column(
-    data_type{type_to_id<size_type>()}, strings_count + 1, mask_state::UNALLOCATED, stream, mr);
-  auto offsets_view = offsets_column->mutable_view();
-  thrust::transform(rmm::exec_policy(stream),
-                    offsets_begin,
-                    offsets_end,
-                    offsets_view.data<int32_t>(),
-                    cuda::proclaim_return_type<int32_t>(
-                      [] __device__(auto offset) { return static_cast<int32_t>(offset); }));
+  auto [offsets_column, computed_bytes] =
+    cudf::strings::detail::make_offsets_child_column(offsets_begin, offsets_end, stream, mr);
+  CUDF_EXPECTS(bytes == computed_bytes, "unexpected byte count");
 
   // build chars column
   rmm::device_uvector<char> chars_data(bytes, stream, mr);
diff --git a/cpp/src/merge/merge.cu b/cpp/src/merge/merge.cu
index 4463b16df78..5a3be259ed9 100644
--- a/cpp/src/merge/merge.cu
+++ b/cpp/src/merge/merge.cu
@@ -27,7 +27,7 @@
 #include <cudf/dictionary/detail/update_keys.hpp>
 #include <cudf/lists/detail/concatenate.hpp>
 #include <cudf/lists/lists_column_view.hpp>
-#include <cudf/strings/detail/merge.cuh>
+#include <cudf/strings/detail/merge.hpp>
 #include <cudf/structs/structs_column_view.hpp>
 #include <cudf/table/experimental/row_operators.cuh>
 #include <cudf/table/table.hpp>
@@ -434,18 +434,8 @@ std::unique_ptr<column> column_merger::operator()<cudf::string_view>(
   rmm::cuda_stream_view stream,
   rmm::device_async_resource_ref mr) const
 {
-  auto column = strings::detail::merge<index_type>(strings_column_view(lcol),
-                                                   strings_column_view(rcol),
-                                                   row_order_.begin(),
-                                                   row_order_.end(),
-                                                   stream,
-                                                   mr);
-  if (lcol.has_nulls() || rcol.has_nulls()) {
-    auto merged_view = column->mutable_view();
-    materialize_bitmask(
-      lcol, rcol, merged_view.null_mask(), merged_view.size(), row_order_.data(), stream);
-  }
-  return column;
+  return strings::detail::merge(
+    strings_column_view(lcol), strings_column_view(rcol), row_order_, stream, mr);
 }
 
 // specialization for dictionary
diff --git a/cpp/src/strings/merge/merge.cu b/cpp/src/strings/merge/merge.cu
new file mode 100644
index 00000000000..28e171f157e
--- /dev/null
+++ b/cpp/src/strings/merge/merge.cu
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/column/column.hpp>
+#include <cudf/column/column_device_view.cuh>
+#include <cudf/strings/detail/merge.hpp>
+#include <cudf/strings/detail/strings_column_factories.cuh>
+#include <cudf/strings/string_view.cuh>
+#include <cudf/strings/strings_column_view.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/transform.h>
+
+namespace cudf {
+namespace strings {
+namespace detail {
+std::unique_ptr<column> merge(strings_column_view const& lhs,
+                              strings_column_view const& rhs,
+                              cudf::detail::index_vector const& row_order,
+                              rmm::cuda_stream_view stream,
+                              rmm::device_async_resource_ref mr)
+{
+  using cudf::detail::side;
+  if (row_order.is_empty()) { return make_empty_column(type_id::STRING); }
+  auto const strings_count = static_cast<cudf::size_type>(row_order.size());
+
+  auto const lhs_column = column_device_view::create(lhs.parent(), stream);
+  auto const d_lhs      = *lhs_column;
+  auto const rhs_column = column_device_view::create(rhs.parent(), stream);
+  auto const d_rhs      = *rhs_column;
+
+  auto const begin = row_order.begin();
+
+  // build vector of strings
+  rmm::device_uvector<string_index_pair> indices(strings_count, stream);
+  thrust::transform(rmm::exec_policy_nosync(stream),
+                    thrust::make_counting_iterator<size_type>(0),
+                    thrust::make_counting_iterator<size_type>(strings_count),
+                    indices.begin(),
+                    [d_lhs, d_rhs, begin] __device__(size_type idx) {
+                      auto const [s, index] = begin[idx];
+                      if (s == side::LEFT ? d_lhs.is_null(index) : d_rhs.is_null(index)) {
+                        return string_index_pair{nullptr, 0};
+                      }
+                      auto d_str = (s == side::LEFT) ? d_lhs.element<string_view>(index)
+                                                     : d_rhs.element<string_view>(index);
+                      return d_str.size_bytes() == 0
+                               ? string_index_pair{"", 0}  // ensures empty != null
+                               : string_index_pair{d_str.data(), d_str.size_bytes()};
+                    });
+
+  // convert vector into strings column
+  return make_strings_column(indices.begin(), indices.end(), stream, mr);
+}
+
+}  // namespace detail
+}  // namespace strings
+}  // namespace cudf
diff --git a/cpp/tests/merge/merge_string_test.cpp b/cpp/tests/merge/merge_string_test.cpp
index 28179a7341c..d7368d31944 100644
--- a/cpp/tests/merge/merge_string_test.cpp
+++ b/cpp/tests/merge/merge_string_test.cpp
@@ -411,3 +411,60 @@ TYPED_TEST(MergeStringTest, Merge2StringKeyNullColumns)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_column_view2, output_column_view2);
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_column_view3, output_column_view3);
 }
+
+class MergeLargeStringsTest : public cudf::test::BaseFixture {};
+
+TEST_F(MergeLargeStringsTest, MergeLargeStrings)
+{
+  CUDF_TEST_ENABLE_LARGE_STRINGS();
+  auto itr = thrust::constant_iterator<std::string_view>(
+    "abcdefghijklmnopqrstuvwxyABCDEFGHIJKLMNOPQRSTUVWXY");                      // 50 bytes
+  auto const input = cudf::test::strings_column_wrapper(itr, itr + 5'000'000);  // 250MB
+  auto input_views = std::vector<cudf::table_view>();
+  auto const view  = cudf::table_view({input});
+  std::vector<cudf::size_type> splits;
+  int const multiplier = 10;
+  for (int i = 0; i < multiplier; ++i) {  // 2500MB > 2GB
+    input_views.push_back(view);
+    splits.push_back(view.num_rows() * (i + 1));
+  }
+  splits.pop_back();  // remove last entry
+  auto const column_order    = std::vector<cudf::order>{cudf::order::ASCENDING};
+  auto const null_precedence = std::vector<cudf::null_order>{cudf::null_order::AFTER};
+
+  auto result = cudf::merge(input_views, {0}, column_order, null_precedence);
+  auto sv     = cudf::strings_column_view(result->view().column(0));
+  EXPECT_EQ(sv.size(), view.num_rows() * multiplier);
+  EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64});
+
+  auto sliced = cudf::split(sv.parent(), splits);
+  for (auto c : sliced) {
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input);
+  }
+
+  // also test with large strings column as input
+  input_views.clear();
+  input_views.push_back(view);            // regular column
+  input_views.push_back(result->view());  // large column
+  result = cudf::merge(input_views, {0}, column_order, null_precedence);
+  sv     = cudf::strings_column_view(result->view().column(0));
+  EXPECT_EQ(sv.size(), view.num_rows() * (multiplier + 1));
+  EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64});
+  splits.push_back(view.num_rows() * multiplier);
+  sliced = cudf::split(sv.parent(), splits);
+  for (auto c : sliced) {
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input);
+  }
+
+  // also check merge still returns 32-bit offsets for regular columns
+  input_views.clear();
+  input_views.push_back(view);
+  input_views.push_back(view);
+  result = cudf::merge(input_views, {0}, column_order, null_precedence);
+  sv     = cudf::strings_column_view(result->view().column(0));
+  EXPECT_EQ(sv.size(), view.num_rows() * 2);
+  EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT32});
+  sliced = cudf::split(sv.parent(), {view.num_rows()});
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(sliced[0], input);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(sliced[1], input);
+}

From 13af97572fa108073ce3f335802949106c9ad790 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Mon, 22 Apr 2024 09:50:01 -0400
Subject: [PATCH 094/842] Add cuda-sanitizer-api dependency for test-cpp matrix
 11.4 (#15573)

Fixes dependency issue with nightly builds running 11.4.3 cpp tests requiring the compute-sanitizer tool.

Closes #15571

Authors:
  - David Wendt (https://github.com/davidwendt)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Ray Douglass (https://github.com/raydouglass)

URL: https://github.com/rapidsai/cudf/pull/15573
---
 dependencies.yaml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/dependencies.yaml b/dependencies.yaml
index 2ed2525fc1e..14c698000cb 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -599,6 +599,10 @@ dependencies:
               cuda: "11.8"
             packages:
               - cuda-sanitizer-api=11.8.86
+          - matrix:
+              cuda: "11.4"
+            packages:
+              - cuda-sanitizer-api=11.4.120
           - matrix:  # Fallback for CUDA 11 or no matrix
             packages:
   test_java:

From 475f5e5fcb5b703ffdf1e491b7f2230c514a41fc Mon Sep 17 00:00:00 2001
From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com>
Date: Mon, 22 Apr 2024 11:18:44 -0400
Subject: [PATCH 095/842] Remove index name overrides in dask-cudf pyarrow
 table dispatch (#15514)

Looks like these overrides should be safe to remove now that https://github.com/rapidsai/cudf/issues/14159 is closed out.

This should unblock the GPU CI failures we're seeing on Dask with 24.06 in https://github.com/dask/dask/pull/11045.

Authors:
  - Charles Blackmon-Luca (https://github.com/charlesbluca)
  - Richard (Rick) Zamora (https://github.com/rjzamora)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Richard (Rick) Zamora (https://github.com/rjzamora)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15514
---
 python/cudf/cudf/core/dataframe.py            | 46 ++++++++++++-------
 python/cudf/cudf/tests/test_dataframe.py      | 22 +++++++++
 python/dask_cudf/dask_cudf/backends.py        | 22 +--------
 .../dask_cudf/tests/test_dispatch.py          | 11 ++++-
 4 files changed, 62 insertions(+), 39 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 99e4588d608..7b7fc87a6dc 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -5450,9 +5450,11 @@ def from_arrow(cls, table):
         """
         index_col = None
         col_index_names = None
+        physical_column_md = []
         if isinstance(table, pa.Table) and isinstance(
             table.schema.pandas_metadata, dict
         ):
+            physical_column_md = table.schema.pandas_metadata["columns"]
             index_col = table.schema.pandas_metadata["index_columns"]
             if "column_indexes" in table.schema.pandas_metadata:
                 col_index_names = []
@@ -5480,7 +5482,18 @@ def from_arrow(cls, table):
                     # https://github.com/apache/arrow/issues/15178
                     out = out.set_index(idx)
             else:
-                out = out.set_index(index_col[0])
+                out = out.set_index(index_col)
+
+        if (
+            "__index_level_0__" in out.index.names
+            and len(out.index.names) == 1
+        ):
+            real_index_name = None
+            for md in physical_column_md:
+                if md["field_name"] == "__index_level_0__":
+                    real_index_name = md["name"]
+                    break
+            out.index.name = real_index_name
 
         return out
 
@@ -5530,42 +5543,43 @@ def to_arrow(self, preserve_index=None):
         write_index = preserve_index is not False
         keep_range_index = write_index and preserve_index is None
         index = self.index
+        index_levels = [self.index]
         if write_index:
             if isinstance(index, cudf.RangeIndex) and keep_range_index:
-                descr = {
-                    "kind": "range",
-                    "name": index.name,
-                    "start": index._start,
-                    "stop": index._stop,
-                    "step": 1,
-                }
+                index_descr = [
+                    {
+                        "kind": "range",
+                        "name": index.name,
+                        "start": index._start,
+                        "stop": index._stop,
+                        "step": 1,
+                    }
+                ]
             else:
                 if isinstance(index, cudf.RangeIndex):
                     index = index._as_int_index()
                     index.name = "__index_level_0__"
                 if isinstance(index, MultiIndex):
-                    gen_names = tuple(
-                        f"level_{i}" for i, _ in enumerate(index._data.names)
-                    )
+                    index_descr = list(index._data.names)
+                    index_levels = index.levels
                 else:
-                    gen_names = (
+                    index_descr = (
                         index.names if index.name is not None else ("index",)
                     )
-                for gen_name, col_name in zip(gen_names, index._data.names):
+                for gen_name, col_name in zip(index_descr, index._data.names):
                     data._insert(
                         data.shape[1],
                         gen_name,
                         index._data[col_name],
                     )
-                descr = gen_names[0]
-            index_descr.append(descr)
 
         out = super(DataFrame, data).to_arrow()
+        # import pdb; pdb.set_trace()
         metadata = pa.pandas_compat.construct_metadata(
             columns_to_convert=[self[col] for col in self._data.names],
             df=self,
             column_names=out.schema.names,
-            index_levels=[index],
+            index_levels=index_levels,
             index_descriptors=index_descr,
             preserve_index=preserve_index,
             types=out.schema.types,
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index ead1ab2da6c..df0e22c5e43 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -2769,6 +2769,28 @@ def test_arrow_pandas_compat(pdf, gdf, preserve_index):
     assert_eq(pdf2, gdf2)
 
 
+@pytest.mark.parametrize(
+    "index", [None, cudf.RangeIndex(3, name="a"), "a", "b", ["a", "b"]]
+)
+@pytest.mark.parametrize("preserve_index", [True, False, None])
+def test_arrow_round_trip(preserve_index, index):
+    data = {"a": [4, 5, 6], "b": ["cat", "dog", "bird"]}
+    if isinstance(index, (list, str)):
+        gdf = cudf.DataFrame(data).set_index(index)
+    else:
+        gdf = cudf.DataFrame(data, index=index)
+
+    table = gdf.to_arrow(preserve_index=preserve_index)
+    table_pd = pa.Table.from_pandas(
+        gdf.to_pandas(), preserve_index=preserve_index
+    )
+
+    gdf_out = cudf.DataFrame.from_arrow(table)
+    pdf_out = table_pd.to_pandas()
+
+    assert_eq(gdf_out, pdf_out)
+
+
 @pytest.mark.parametrize("dtype", NUMERIC_TYPES + ["bool"])
 def test_cuda_array_interface(dtype):
     np_data = np.arange(10).astype(dtype)
diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py
index 5401bcd3767..94528325aea 100644
--- a/python/dask_cudf/dask_cudf/backends.py
+++ b/python/dask_cudf/dask_cudf/backends.py
@@ -384,18 +384,6 @@ def _cudf_to_table(obj, preserve_index=None, **kwargs):
             "Ignoring the following arguments to "
             f"`to_pyarrow_table_dispatch`: {list(kwargs)}"
         )
-
-    # TODO: Remove this logic when cudf#14159 is resolved
-    # (see: https://github.com/rapidsai/cudf/issues/14159)
-    if preserve_index and isinstance(obj.index, cudf.RangeIndex):
-        obj = obj.copy()
-        obj.index.name = (
-            obj.index.name
-            if obj.index.name is not None
-            else "__index_level_0__"
-        )
-        obj.index = obj.index._as_int_index()
-
     return obj.to_arrow(preserve_index=preserve_index)
 
 
@@ -408,15 +396,7 @@ def _table_to_cudf(obj, table, self_destruct=None, **kwargs):
             f"Ignoring the following arguments to "
             f"`from_pyarrow_table_dispatch`: {list(kwargs)}"
         )
-    result = obj.from_arrow(table)
-
-    # TODO: Remove this logic when cudf#14159 is resolved
-    # (see: https://github.com/rapidsai/cudf/issues/14159)
-    if "__index_level_0__" in result.index.names:
-        assert len(result.index.names) == 1
-        result.index.name = None
-
-    return result
+    return obj.from_arrow(table)
 
 
 @union_categoricals_dispatch.register((cudf.Series, cudf.BaseIndex))
diff --git a/python/dask_cudf/dask_cudf/tests/test_dispatch.py b/python/dask_cudf/dask_cudf/tests/test_dispatch.py
index 76703206726..a12481a7bb4 100644
--- a/python/dask_cudf/dask_cudf/tests/test_dispatch.py
+++ b/python/dask_cudf/dask_cudf/tests/test_dispatch.py
@@ -25,17 +25,24 @@ def test_is_categorical_dispatch():
 
 
 @pytest.mark.parametrize("preserve_index", [True, False])
-def test_pyarrow_conversion_dispatch(preserve_index):
+@pytest.mark.parametrize("index", [None, cudf.RangeIndex(10, name="foo")])
+def test_pyarrow_conversion_dispatch(preserve_index, index):
     from dask.dataframe.dispatch import (
         from_pyarrow_table_dispatch,
         to_pyarrow_table_dispatch,
     )
 
-    df1 = cudf.DataFrame(np.random.randn(10, 3), columns=list("abc"))
+    df1 = cudf.DataFrame(
+        np.random.randn(10, 3), columns=list("abc"), index=index
+    )
     df2 = from_pyarrow_table_dispatch(
         df1, to_pyarrow_table_dispatch(df1, preserve_index=preserve_index)
     )
 
+    # preserve_index=False doesn't retain index metadata
+    if not preserve_index and index is not None:
+        df1.index.name = None
+
     assert type(df1) == type(df2)
     assert_eq(df1, df2)
 

From 818b29d2ee49a7cc6de910951f64c36c55cc6d08 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 22 Apr 2024 13:07:33 -1000
Subject: [PATCH 096/842] Clean up index methods (#15496)

- Removed `_index_from_columns` in favor of an inline call
- Renamed `_setdefault_name` to `_getdefault_name` and to not modify `kwargs`

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15496
---
 python/cudf/cudf/core/groupby/groupby.py |  4 ++-
 python/cudf/cudf/core/index.py           | 31 +++++++++---------------
 python/cudf/cudf/core/indexed_frame.py   | 10 +++++---
 3 files changed, 20 insertions(+), 25 deletions(-)

diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index dd4924676f3..3e4b8192888 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -1201,7 +1201,9 @@ def _grouped(self, *, include_groups: bool = True):
         offsets, grouped_key_cols, grouped_value_cols = self._groupby.groups(
             [*self.obj._index._columns, *self.obj._columns]
         )
-        grouped_keys = cudf.core.index._index_from_columns(grouped_key_cols)
+        grouped_keys = cudf.core.index._index_from_data(
+            dict(enumerate(grouped_key_cols))
+        )
         if isinstance(self.grouping.keys, cudf.MultiIndex):
             grouped_keys.names = self.grouping.keys.names
             to_drop = self.grouping.keys.names
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index bbe496333cd..6f08b1d83b3 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -149,13 +149,6 @@ def _index_from_data(data: MutableMapping, name: Any = no_default):
     return index_class_type._from_data(data, name)
 
 
-def _index_from_columns(
-    columns: List[cudf.core.column.ColumnBase], name: Any = no_default
-):
-    """Construct an index from ``columns``, with levels named 0, 1, 2..."""
-    return _index_from_data(dict(zip(range(len(columns)), columns)), name=name)
-
-
 class RangeIndex(BaseIndex, BinaryOperand):
     """
     Immutable Index implementing a monotonic integer range.
@@ -988,8 +981,7 @@ class Index(SingleColumnFrame, BaseIndex, metaclass=IndexMeta):
 
     @_cudf_nvtx_annotate
     def __init__(self, data, **kwargs):
-        kwargs = _setdefault_name(data, **kwargs)
-        name = kwargs.get("name")
+        name = _getdefault_name(data, name=kwargs.get("name"))
         super().__init__({name: data})
 
     @_cudf_nvtx_annotate
@@ -1397,8 +1389,7 @@ def __repr__(self):
     def __getitem__(self, index):
         res = self._get_elements_from_column(index)
         if isinstance(res, ColumnBase):
-            res = as_index(res)
-            res.name = self.name
+            res = as_index(res, name=self.name)
         return res
 
     @property  # type: ignore
@@ -1713,7 +1704,7 @@ def __init__(
         if dtype.kind != "M":
             raise TypeError("dtype must be a datetime type")
 
-        name = _setdefault_name(data, name=name)["name"]
+        name = _getdefault_name(data, name=name)
         data = column.as_column(data)
 
         # TODO: Remove this if statement and fix tests now that
@@ -2432,7 +2423,7 @@ def __init__(
         if dtype.kind != "m":
             raise TypeError("dtype must be a timedelta type")
 
-        name = _setdefault_name(data, name=name)["name"]
+        name = _getdefault_name(data, name=name)
         data = column.as_column(data, dtype=dtype)
 
         if copy:
@@ -2601,7 +2592,7 @@ def __init__(
                 )
         if copy:
             data = column.as_column(data, dtype=dtype).copy(deep=True)
-        kwargs = _setdefault_name(data, name=name)
+        name = _getdefault_name(data, name=name)
         if isinstance(data, CategoricalColumn):
             data = data
         elif isinstance(data, pd.Series) and (
@@ -2635,7 +2626,7 @@ def __init__(
             data = data.as_ordered(ordered=True)
         elif ordered is False and data.ordered is True:
             data = data.as_ordered(ordered=False)
-        super().__init__(data, **kwargs)
+        super().__init__(data, name=name)
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
@@ -2821,7 +2812,7 @@ def __init__(
         copy: bool = False,
         name=None,
     ):
-        name = _setdefault_name(data, name=name)["name"]
+        name = _getdefault_name(data, name=name)
 
         if dtype is not None:
             dtype = cudf.dtype(dtype)
@@ -3053,10 +3044,10 @@ def as_index(
     return idx
 
 
-def _setdefault_name(values, **kwargs):
-    if kwargs.get("name") is None:
-        kwargs["name"] = getattr(values, "name", None)
-    return kwargs
+def _getdefault_name(values, name):
+    if name is None:
+        return getattr(values, "name", None)
+    return name
 
 
 @_cudf_nvtx_annotate
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index c412b7a7e47..48e80d8162f 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -56,7 +56,7 @@
 from cudf.core.dtypes import ListDtype
 from cudf.core.frame import Frame
 from cudf.core.groupby.groupby import GroupBy
-from cudf.core.index import Index, RangeIndex, _index_from_columns
+from cudf.core.index import Index, RangeIndex, _index_from_data
 from cudf.core.missing import NA
 from cudf.core.multiindex import MultiIndex
 from cudf.core.resample import _Resampler
@@ -331,7 +331,9 @@ def _from_columns_like_self(
         if index_names is not None:
             n_index_columns = len(index_names)
             data_columns = columns[n_index_columns:]
-            index = _index_from_columns(columns[:n_index_columns])
+            index = _index_from_data(
+                dict(enumerate(columns[:n_index_columns]))
+            )
             if isinstance(index, cudf.MultiIndex):
                 index.names = index_names
             else:
@@ -4348,8 +4350,8 @@ def _reset_index(self, level, drop, col_level=0, col_fill=""):
             index_names,
         ) = self._index._split_columns_by_levels(level)
         if index_columns:
-            index = _index_from_columns(
-                index_columns,
+            index = _index_from_data(
+                dict(enumerate(index_columns)),
                 name=self._index.name,
             )
             if isinstance(index, MultiIndex):

From 7804ba7f817b3fccf13b0084e2d7e0ac2257ff5a Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 22 Apr 2024 18:09:58 -1000
Subject: [PATCH 097/842] Preserve RangeIndex.step in to_arrow/from_arrow
 (#15581)

Noticed that step was hardcoded to `1` when it should reflect `RangeIndex.step`

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15581
---
 python/cudf/cudf/core/dataframe.py       | 15 ++++++++-------
 python/cudf/cudf/tests/test_dataframe.py | 10 +++++++++-
 2 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 7b7fc87a6dc..45bb66d5d4b 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -5466,10 +5466,12 @@ def from_arrow(cls, table):
             out._data._level_names = col_index_names
         if index_col:
             if isinstance(index_col[0], dict):
+                range_meta = index_col[0]
                 idx = cudf.RangeIndex(
-                    index_col[0]["start"],
-                    index_col[0]["stop"],
-                    name=index_col[0]["name"],
+                    start=range_meta["start"],
+                    stop=range_meta["stop"],
+                    step=range_meta["step"],
+                    name=range_meta["name"],
                 )
                 if len(idx) == len(out):
                     # `idx` is generated from arrow `pandas_metadata`
@@ -5550,9 +5552,9 @@ def to_arrow(self, preserve_index=None):
                     {
                         "kind": "range",
                         "name": index.name,
-                        "start": index._start,
-                        "stop": index._stop,
-                        "step": 1,
+                        "start": index.start,
+                        "stop": index.stop,
+                        "step": index.step,
                     }
                 ]
             else:
@@ -5574,7 +5576,6 @@ def to_arrow(self, preserve_index=None):
                     )
 
         out = super(DataFrame, data).to_arrow()
-        # import pdb; pdb.set_trace()
         metadata = pa.pandas_compat.construct_metadata(
             columns_to_convert=[self[col] for col in self._data.names],
             df=self,
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index df0e22c5e43..59e8b41e51a 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -2770,7 +2770,15 @@ def test_arrow_pandas_compat(pdf, gdf, preserve_index):
 
 
 @pytest.mark.parametrize(
-    "index", [None, cudf.RangeIndex(3, name="a"), "a", "b", ["a", "b"]]
+    "index",
+    [
+        None,
+        cudf.RangeIndex(3, name="a"),
+        "a",
+        "b",
+        ["a", "b"],
+        cudf.RangeIndex(0, 5, 2, name="a"),
+    ],
 )
 @pytest.mark.parametrize("preserve_index", [True, False, None])
 def test_arrow_round_trip(preserve_index, index):

From 73306f1750f4859f03611658c4d11e2e1c82eb86 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 23 Apr 2024 10:31:13 -0400
Subject: [PATCH 098/842] Large strings support for cudf::fill (#15555)

Updates the `cudf::fill` strings specialization logic to use gather-based `make_strings_column` instead of the `make_strings_children` since the gather-based function already efficiently supports longs.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - Mark Harris (https://github.com/harrism)

URL: https://github.com/rapidsai/cudf/pull/15555
---
 cpp/src/strings/filling/fill.cu | 66 ++++++++++++---------------------
 1 file changed, 24 insertions(+), 42 deletions(-)

diff --git a/cpp/src/strings/filling/fill.cu b/cpp/src/strings/filling/fill.cu
index b48d56a595c..878d0fe11ba 100644
--- a/cpp/src/strings/filling/fill.cu
+++ b/cpp/src/strings/filling/fill.cu
@@ -15,10 +15,8 @@
  */
 
 #include <cudf/column/column_device_view.cuh>
-#include <cudf/detail/valid_if.cuh>
 #include <cudf/strings/detail/fill.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
-#include <cudf/strings/detail/utilities.cuh>
+#include <cudf/strings/detail/strings_column_factories.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/error.hpp>
@@ -27,35 +25,34 @@
 #include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
+#include <thrust/transform.h>
 
 namespace cudf {
 namespace strings {
 namespace detail {
 namespace {
+
 struct fill_fn {
   column_device_view const d_strings;
   size_type const begin;
   size_type const end;
-  string_view const d_value;
-  size_type* d_offsets{};
-  char* d_chars{};
-
-  __device__ string_view resolve_string_at(size_type idx) const
-  {
-    if ((begin <= idx) && (idx < end)) { return d_value; }
-    return d_strings.is_valid(idx) ? d_strings.element<string_view>(idx) : string_view{};
-  }
+  string_scalar_device_view const d_value;
 
-  __device__ void operator()(size_type idx) const
+  __device__ string_index_pair operator()(size_type idx) const
   {
-    auto const d_str = resolve_string_at(idx);
-    if (!d_chars) {
-      d_offsets[idx] = d_str.size_bytes();
+    auto d_str = string_view();
+    if ((begin <= idx) && (idx < end)) {
+      if (!d_value.is_valid()) { return string_index_pair{nullptr, 0}; }
+      d_str = d_value.value();
     } else {
-      copy_string(d_chars + d_offsets[idx], d_str);
+      if (d_strings.is_null(idx)) { return string_index_pair{nullptr, 0}; }
+      d_str = d_strings.element<string_view>(idx);
     }
+    return !d_str.empty() ? string_index_pair{d_str.data(), d_str.size_bytes()}
+                          : string_index_pair{"", 0};
   }
 };
+
 }  // namespace
 
 std::unique_ptr<column> fill(strings_column_view const& input,
@@ -72,33 +69,18 @@ std::unique_ptr<column> fill(strings_column_view const& input,
   CUDF_EXPECTS(begin <= end, "Parameters [begin,end) have invalid range values");
   if (begin == end) { return std::make_unique<column>(input.parent(), stream, mr); }
 
-  auto strings_column  = column_device_view::create(input.parent(), stream);
-  auto const d_strings = *strings_column;
-  auto const is_valid  = value.is_valid(stream);
-
-  // create resulting null mask
-  auto [null_mask, null_count] = [begin, end, is_valid, d_strings, stream, mr] {
-    if (begin == 0 and end == d_strings.size() and is_valid) {
-      return std::pair(rmm::device_buffer{}, 0);
-    }
-    return cudf::detail::valid_if(
-      thrust::make_counting_iterator<size_type>(0),
-      thrust::make_counting_iterator<size_type>(d_strings.size()),
-      [d_strings, begin, end, is_valid] __device__(size_type idx) {
-        return ((begin <= idx) && (idx < end)) ? is_valid : d_strings.is_valid(idx);
-      },
-      stream,
-      mr);
-  }();
-
-  auto const d_value = const_cast<string_scalar&>(value);
-  auto const d_str   = is_valid ? d_value.value(stream) : string_view{};
-  auto fn            = fill_fn{d_strings, begin, end, d_str};
+  auto const d_strings = column_device_view::create(input.parent(), stream);
+  auto const d_value   = cudf::get_scalar_device_view(const_cast<string_scalar&>(value));
 
-  auto [offsets_column, chars] = make_strings_children(fn, strings_count, stream, mr);
+  auto fn = fill_fn{*d_strings, begin, end, d_value};
+  rmm::device_uvector<string_index_pair> indices(strings_count, stream);
+  thrust::transform(rmm::exec_policy_nosync(stream),
+                    thrust::counting_iterator<size_type>(0),
+                    thrust::counting_iterator<size_type>(strings_count),
+                    indices.begin(),
+                    fn);
 
-  return make_strings_column(
-    strings_count, std::move(offsets_column), chars.release(), null_count, std::move(null_mask));
+  return make_strings_column(indices.begin(), indices.end(), stream, mr);
 }
 
 }  // namespace detail

From 7341866495b03bdf3f01f8f4e57953741c77e7aa Mon Sep 17 00:00:00 2001
From: Robert Maynard <rmaynard@nvidia.com>
Date: Tue, 23 Apr 2024 15:03:38 -0400
Subject: [PATCH 099/842] Remove public gtest dependency from libcudf conda
 package (#15534)

Reworks the cudftestutil and dependency chain to remove the public gtest dependency in libcudf conda package.
The libcudftestutil was previously made static due to issues using a static system GTest that wasn't build with `fPIC`. Using  a GTest from `rapids-cmake` which is built with `fPIC` enabled, removes this restriction and allows us to remove the public depedency.

Some notes:
-  We need to align all of RAPIDS C++ projects on static GTest from `rapids-cmake`
- None of the compiled components / classes of `libcudftestutils` publically depend on GTest
- Two of the libcudftestutils header only components bring include gtest. Since these headers aren't required to be used we are going to consider them optional.
- Therefore using these optional `libcudftestutils` header will require downstream users to bring in GTest.

Fixes #13381

Authors:
  - Robert Maynard (https://github.com/robertmaynard)
  - Bradley Dice (https://github.com/bdice)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Nghia Truong (https://github.com/ttnghia)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Jake Awe (https://github.com/AyodeAwe)

URL: https://github.com/rapidsai/cudf/pull/15534
---
 .../all_cuda-118_arch-x86_64.yaml             |  3 -
 .../all_cuda-122_arch-x86_64.yaml             |  3 -
 conda/recipes/libcudf/conda_build_config.yaml |  6 --
 conda/recipes/libcudf/meta.yaml               | 11 ---
 cpp/CMakeLists.txt                            | 12 ++-
 cpp/benchmarks/CMakeLists.txt                 |  2 +-
 cpp/cmake/thirdparty/get_gtest.cmake          | 19 +----
 cpp/include/cudf_test/column_wrapper.hpp      |  1 -
 cpp/include/cudf_test/cudf_gtest.hpp          | 82 +------------------
 cpp/tests/CMakeLists.txt                      |  5 +-
 cpp/tests/groupby/groupby_test_util.cpp       |  3 +-
 .../{base_fixture.cpp => random_seed.cpp}     |  0
 dependencies.yaml                             |  6 --
 13 files changed, 14 insertions(+), 139 deletions(-)
 rename cpp/tests/utilities/{base_fixture.cpp => random_seed.cpp} (100%)

diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index e8816da3a2a..7a5fef9f25e 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -8,7 +8,6 @@ channels:
 - nvidia
 dependencies:
 - aiobotocore>=2.2.0
-- benchmark==1.8.0
 - boto3>=1.21.21
 - botocore>=1.24.21
 - breathe>=4.35.0
@@ -34,8 +33,6 @@ dependencies:
 - fmt>=10.1.1,<11
 - fsspec>=0.6.0
 - gcc_linux-64=11.*
-- gmock>=1.13.0
-- gtest>=1.13.0
 - hypothesis
 - identify>=2.5.20
 - ipython
diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml
index 8044fc35a19..48453e18bb0 100644
--- a/conda/environments/all_cuda-122_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-122_arch-x86_64.yaml
@@ -8,7 +8,6 @@ channels:
 - nvidia
 dependencies:
 - aiobotocore>=2.2.0
-- benchmark==1.8.0
 - boto3>=1.21.21
 - botocore>=1.24.21
 - breathe>=4.35.0
@@ -35,8 +34,6 @@ dependencies:
 - fmt>=10.1.1,<11
 - fsspec>=0.6.0
 - gcc_linux-64=11.*
-- gmock>=1.13.0
-- gtest>=1.13.0
 - hypothesis
 - identify>=2.5.20
 - ipython
diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml
index 53770956ebe..b7fbaab9306 100644
--- a/conda/recipes/libcudf/conda_build_config.yaml
+++ b/conda/recipes/libcudf/conda_build_config.yaml
@@ -16,12 +16,6 @@ sysroot_version:
 cmake_version:
   - ">=3.26.4"
 
-gbench_version:
-  - "==1.8.0"
-
-gtest_version:
-  - ">=1.13.0"
-
 libarrow_version:
   - "==14.0.2"
 
diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml
index 3af0b7885c3..695c515b9d4 100644
--- a/conda/recipes/libcudf/meta.yaml
+++ b/conda/recipes/libcudf/meta.yaml
@@ -69,9 +69,6 @@ requirements:
     - librdkafka {{ librdkafka_version }}
     - fmt {{ fmt_version }}
     - spdlog {{ spdlog_version }}
-    - benchmark {{ gbench_version }}
-    - gtest {{ gtest_version }}
-    - gmock {{ gtest_version }}
     - zlib {{ zlib_version }}
 
 outputs:
@@ -108,8 +105,6 @@ outputs:
         - librmm ={{ minor_version }}
         - libkvikio ={{ minor_version }}
         - dlpack {{ dlpack_version }}
-        - gtest {{ gtest_version }}
-        - gmock {{ gtest_version }}
     test:
       commands:
         - test -f $PREFIX/lib/libcudf.so
@@ -221,9 +216,6 @@ outputs:
         {% else %}
         - libcurand-dev
         {% endif %}
-        - benchmark {{ gbench_version }}
-        - gtest {{ gtest_version }}
-        - gmock {{ gtest_version }}
       run:
         - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}
         - {{ pin_subpackage('libcudf', exact=True) }}
@@ -233,9 +225,6 @@ outputs:
         {% else %}
         - libcurand
         {% endif %}
-        - benchmark {{ gbench_version }}
-        - gtest {{ gtest_version }}
-        - gmock {{ gtest_version }}
     about:
       home: https://rapids.ai/
       license: Apache-2.0
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 60d0094efac..b6a61368fe7 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -847,14 +847,12 @@ if(CUDF_BUILD_TESTUTIL)
 
   add_library(cudf::cudftest_default_stream ALIAS cudftest_default_stream)
 
-  # Needs to be static so that we support usage of static builds of gtest which doesn't compile with
-  # fPIC enabled and therefore can't be embedded into shared libraries.
   add_library(
-    cudftestutil STATIC
+    cudftestutil SHARED
     tests/io/metadata_utilities.cpp
-    tests/utilities/base_fixture.cpp
     tests/utilities/column_utilities.cu
     tests/utilities/debug_utilities.cu
+    tests/utilities/random_seed.cpp
     tests/utilities/table_utilities.cu
     tests/utilities/tdigest_utilities.cu
   )
@@ -879,8 +877,8 @@ if(CUDF_BUILD_TESTUTIL)
 
   target_link_libraries(
     cudftestutil
-    PUBLIC GTest::gmock GTest::gtest Threads::Threads cudf cudftest_default_stream
-    PRIVATE $<TARGET_NAME_IF_EXISTS:conda_env>
+    PUBLIC Threads::Threads cudf cudftest_default_stream
+    PRIVATE GTest::gmock GTest::gtest $<TARGET_NAME_IF_EXISTS:conda_env>
   )
 
   target_include_directories(
@@ -959,7 +957,7 @@ endif()
 if(CUDF_BUILD_BENCHMARKS)
   # Find or install GoogleBench
   include(${rapids-cmake-dir}/cpm/gbench.cmake)
-  rapids_cpm_gbench()
+  rapids_cpm_gbench(BUILD_STATIC)
 
   # Find or install nvbench
   include(cmake/thirdparty/get_nvbench.cmake)
diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index d36ecfd3a21..5fd328dfc68 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -40,7 +40,7 @@ target_include_directories(
 
 # Use an OBJECT library so we only compile these helper source files only once
 add_library(
-  cudf_benchmark_common OBJECT "${CUDF_SOURCE_DIR}/tests/utilities/base_fixture.cpp"
+  cudf_benchmark_common OBJECT "${CUDF_SOURCE_DIR}/tests/utilities/random_seed.cpp"
                                synchronization/synchronization.cpp io/cuio_common.cpp
 )
 target_link_libraries(cudf_benchmark_common PRIVATE cudf_datagen $<TARGET_NAME_IF_EXISTS:conda_env>)
diff --git a/cpp/cmake/thirdparty/get_gtest.cmake b/cpp/cmake/thirdparty/get_gtest.cmake
index cfb219448f1..10e6b026d9a 100644
--- a/cpp/cmake/thirdparty/get_gtest.cmake
+++ b/cpp/cmake/thirdparty/get_gtest.cmake
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2021-2023, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -17,22 +17,7 @@ function(find_and_configure_gtest)
   include(${rapids-cmake-dir}/cpm/gtest.cmake)
 
   # Find or install GoogleTest
-  rapids_cpm_gtest(BUILD_EXPORT_SET cudf-testing-exports INSTALL_EXPORT_SET cudf-testing-exports)
-
-  if(GTest_ADDED)
-    rapids_export(
-      BUILD GTest
-      VERSION ${GTest_VERSION}
-      EXPORT_SET GTestTargets
-      GLOBAL_TARGETS gtest gmock gtest_main gmock_main
-      NAMESPACE GTest::
-    )
-
-    include("${rapids-cmake-dir}/export/find_package_root.cmake")
-    rapids_export_find_package_root(
-      BUILD GTest [=[${CMAKE_CURRENT_LIST_DIR}]=] EXPORT_SET cudf-testing-exports
-    )
-  endif()
+  rapids_cpm_gtest(BUILD_STATIC)
 
 endfunction()
 
diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp
index 151fe50be4f..dc873658abf 100644
--- a/cpp/include/cudf_test/column_wrapper.hpp
+++ b/cpp/include/cudf_test/column_wrapper.hpp
@@ -17,7 +17,6 @@
 #pragma once
 
 #include <cudf_test/column_utilities.hpp>
-#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/default_stream.hpp>
 
 #include <cudf/column/column.hpp>
diff --git a/cpp/include/cudf_test/cudf_gtest.hpp b/cpp/include/cudf_test/cudf_gtest.hpp
index fa76204d622..89394fbd1c3 100644
--- a/cpp/include/cudf_test/cudf_gtest.hpp
+++ b/cpp/include/cudf_test/cudf_gtest.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,86 +16,6 @@
 
 #pragma once
 
-#ifdef GTEST_INCLUDE_GTEST_GTEST_H_
-#error "Don't include gtest/gtest.h directly, include cudf_gtest.hpp instead"
-#endif
-
-/**
- * @file cudf_gtest.hpp
- * @brief Work around for GTests( <=v1.10 ) emulation of variadic templates in
- * @verbatim ::Testing::Types @endverbatim
- *
- * @note Instead of including `gtest/gtest.h`, all libcudf test files should
- * include `cudf_gtest.hpp` instead.
- *
- * Removes the 50 type limit in a type-parameterized test list.
- *
- * Uses macros to rename GTests's emulated variadic template types and then
- * redefines them properly.
- */
-
-// @cond
-#if __has_include(<gtest/internal/gtest-type-util.h.pump>)
-// gtest doesn't provide a version header so we need to
-// use a file existence trick.
-// gtest-type-util.h.pump only exists in versions < 1.11
-#define Types      Types_NOT_USED
-#define Types0     Types0_NOT_USED
-#define TypeList   TypeList_NOT_USED
-#define Templates  Templates_NOT_USED
-#define Templates0 Templates0_NOT_USED
-#include <gtest/internal/gtest-type-util.h>
-#undef Types
-#undef Types0
-#undef TypeList
-#undef Templates
-#undef Templates0
-
-namespace testing {
-template <class... TYPES>
-struct Types {
-  using type = Types;
-};
-
-template <class T, class... TYPES>
-struct Types<T, TYPES...> {
-  using Head = T;
-  using Tail = Types<TYPES...>;
-
-  using type = Types;
-};
-
-namespace internal {
-using Types0 = Types<>;
-
-template <GTEST_TEMPLATE_... TYPES>
-struct Templates {};
-
-template <GTEST_TEMPLATE_ HEAD, GTEST_TEMPLATE_... TAIL>
-struct Templates<HEAD, TAIL...> {
-  using Head = internal::TemplateSel<HEAD>;
-  using Tail = Templates<TAIL...>;
-
-  using type = Templates;
-};
-
-using Templates0 = Templates<>;
-
-template <typename T>
-struct TypeList {
-  using type = Types<T>;
-};
-
-template <class... TYPES>
-struct TypeList<Types<TYPES...>> {
-  using type = Types<TYPES...>;
-};
-
-}  // namespace internal
-}  // namespace testing
-#endif  // gtest < 1.11
-// @endcond
-
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>
 
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 053fcc0989a..d0c2b3d2bce 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -55,8 +55,9 @@ function(ConfigureTest CMAKE_TEST_NAME)
   )
 
   target_link_libraries(
-    ${CMAKE_TEST_NAME} PRIVATE cudftestutil GTest::gmock_main GTest::gtest_main nvtx3-cpp
-                               $<TARGET_NAME_IF_EXISTS:conda_env> "${_CUDF_TEST_EXTRA_LIB}"
+    ${CMAKE_TEST_NAME}
+    PRIVATE cudftestutil GTest::gmock GTest::gmock_main GTest::gtest GTest::gtest_main nvtx3-cpp
+            $<TARGET_NAME_IF_EXISTS:conda_env> "${_CUDF_TEST_EXTRA_LIB}"
   )
   rapids_cuda_set_runtime(${CMAKE_TEST_NAME} USE_STATIC ${CUDA_STATIC_RUNTIME})
   rapids_test_add(
diff --git a/cpp/tests/groupby/groupby_test_util.cpp b/cpp/tests/groupby/groupby_test_util.cpp
index de51012e8e1..8bd109fca53 100644
--- a/cpp/tests/groupby/groupby_test_util.cpp
+++ b/cpp/tests/groupby/groupby_test_util.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,6 +18,7 @@
 
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/table_utilities.hpp>
 
 #include <cudf/column/column_view.hpp>
diff --git a/cpp/tests/utilities/base_fixture.cpp b/cpp/tests/utilities/random_seed.cpp
similarity index 100%
rename from cpp/tests/utilities/base_fixture.cpp
rename to cpp/tests/utilities/random_seed.cpp
diff --git a/dependencies.yaml b/dependencies.yaml
index 14c698000cb..de5b3569933 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -247,9 +247,6 @@ dependencies:
       - output_types: conda
         packages:
           - fmt>=10.1.1,<11
-          - &gbench benchmark==1.8.0
-          - &gtest gtest>=1.13.0
-          - &gmock gmock>=1.13.0
           - librmm==24.6.*
           - libkvikio==24.6.*
           - librdkafka>=1.9.0,<1.10.0a0
@@ -585,9 +582,6 @@ dependencies:
       - output_types: conda
         packages:
           - *cmake_ver
-          - *gbench
-          - *gtest
-          - *gmock
     specific:
       - output_types: conda
         matrices:

From 702706d7c2e86e900ffbca0568d6ff9d2e415975 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 23 Apr 2024 15:37:49 -0400
Subject: [PATCH 100/842] Add experimental make_strings_children utility
 (#15363)

Adds new `cudf::strings::detail::experimental::make_strings_children` which uses the offsetalator to build output columns. The current `d_offsets` member required by the given functors no longer stores sizes and offsets but is now split into `d_sizes` and `d_offsets` where `d_sizes` is computed in the first pass and then `d_offsets` is set to an offsetalator for building output in `d_chars`.

Once all the uses of `make_strings_children` (~50 or so) are converted to use the experimental implementation, this will replace the old implementation and the 'experimental' namespace will be removed.

This PR includes 2 changes, `repeat_strings` and `concatenate` (per row) since each use different overloaded `make_strings_children` functions to verify the code does not break any current tests.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Karthikeyan (https://github.com/karthikeyann)
  - Shruti Shivakumar (https://github.com/shrshi)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/15363
---
 .../strings/detail/strings_children_ex.cuh    | 186 ++++++++++++++++++
 cpp/src/strings/combine/concatenate.cu        |  21 +-
 cpp/src/strings/repeat_strings.cu             |  58 +++---
 3 files changed, 232 insertions(+), 33 deletions(-)
 create mode 100644 cpp/include/cudf/strings/detail/strings_children_ex.cuh

diff --git a/cpp/include/cudf/strings/detail/strings_children_ex.cuh b/cpp/include/cudf/strings/detail/strings_children_ex.cuh
new file mode 100644
index 00000000000..6028c7e2437
--- /dev/null
+++ b/cpp/include/cudf/strings/detail/strings_children_ex.cuh
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cudf/column/column.hpp>
+#include <cudf/detail/offsets_iterator_factory.cuh>
+#include <cudf/detail/utilities/cuda.cuh>
+#include <cudf/strings/detail/strings_children.cuh>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <thrust/for_each.h>
+#include <thrust/iterator/counting_iterator.h>
+
+namespace cudf {
+namespace strings {
+namespace detail {
+namespace experimental {
+
+/**
+ * @brief Kernel used by make_strings_children for calling the given functor
+ *
+ * @tparam SizeAndExecuteFunction Functor type to call in each thread
+ *
+ * @param fn Functor to call in each thread
+ * @param exec_size Total number of threads to be processed by this kernel
+ */
+template <typename SizeAndExecuteFunction>
+CUDF_KERNEL void strings_children_kernel(SizeAndExecuteFunction fn, size_type exec_size)
+{
+  auto tid = cudf::detail::grid_1d::global_thread_id();
+  if (tid < exec_size) { fn(tid); }
+}
+
+/**
+ * @brief Creates child offsets and chars data by applying the template function that
+ * can be used for computing the output size of each string as well as create the output
+ *
+ * The `size_and_exec_fn` is expected declare an operator() function with a size_type parameter
+ * and 3 member variables:
+ * - `d_sizes`: output size in bytes of each output row for the 1st pass call
+ * - `d_chars`: output buffer for new string data for the 2nd pass call
+ * - `d_offsets`: used for addressing the specific output row data in `d_chars`
+ *
+ * The 1st pass call computes the output sizes and is identified by `d_chars==nullptr`.
+ * Null rows should be set with an output size of 0.
+ *
+ * @code{.cpp}
+ * struct size_and_exec_fn {
+ *  size_type* d_sizes;
+ *  char* d_chars;
+ *  input_offsetalator d_offsets;
+ *
+ *   __device__ void operator()(size_type thread_idx)
+ *   {
+ *     // functor-specific logic to resolve out_idx from thread_idx
+ *     if( !d_chars ) {
+ *       d_sizes[out_idx] = output_size;
+ *     } else {
+ *       auto d_output = d_chars + d_offsets[out_idx];
+ *       // write characters to d_output
+ *     }
+ *   }
+ * };
+ * @endcode
+ *
+ * @tparam SizeAndExecuteFunction Functor type with an operator() function accepting
+ *         an index parameter and three member variables: `size_type* d_sizes`
+ *         `char* d_chars`, and `input_offsetalator d_offsets`.
+ *
+ * @param size_and_exec_fn This is called twice. Once for the output size of each string
+ *        and once again to fill in the memory pointed to by d_chars.
+ * @param exec_size Number of threads for executing the `size_and_exec_fn` function
+ * @param strings_count Number of strings
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned columns' device memory
+ * @return Offsets child column and chars vector for creating a strings column
+ */
+template <typename SizeAndExecuteFunction>
+auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn,
+                           size_type exec_size,
+                           size_type strings_count,
+                           rmm::cuda_stream_view stream,
+                           rmm::device_async_resource_ref mr)
+{
+  // This is called twice -- once for computing sizes and once for writing chars.
+  // Reducing the number of places size_and_exec_fn is inlined speeds up compile time.
+  auto for_each_fn = [exec_size, stream](SizeAndExecuteFunction& size_and_exec_fn) {
+    auto constexpr block_size = 256;
+    auto grid                 = cudf::detail::grid_1d{exec_size, block_size};
+    strings_children_kernel<<<grid.num_blocks, block_size, 0, stream.value()>>>(size_and_exec_fn,
+                                                                                exec_size);
+  };
+
+  // Compute the output sizes
+  auto output_sizes        = rmm::device_uvector<size_type>(strings_count, stream);
+  size_and_exec_fn.d_sizes = output_sizes.data();
+  size_and_exec_fn.d_chars = nullptr;
+  for_each_fn(size_and_exec_fn);
+
+  // Convert the sizes to offsets
+  auto [offsets_column, bytes] = cudf::strings::detail::make_offsets_child_column(
+    output_sizes.begin(), output_sizes.end(), stream, mr);
+  size_and_exec_fn.d_offsets =
+    cudf::detail::offsetalator_factory::make_input_iterator(offsets_column->view());
+
+  // Now build the chars column
+  rmm::device_uvector<char> chars(bytes, stream, mr);
+  size_and_exec_fn.d_chars = chars.data();
+
+  // Execute the function fn again to fill in the chars data.
+  if (bytes > 0) { for_each_fn(size_and_exec_fn); }
+
+  return std::pair(std::move(offsets_column), std::move(chars));
+}
+
+/**
+ * @brief Creates child offsets and chars columns by applying the template function that
+ * can be used for computing the output size of each string as well as create the output
+ *
+ * The `size_and_exec_fn` is expected declare an operator() function with a size_type parameter
+ * and 3 member variables:
+ * - `d_sizes`: output size in bytes of each output row for the 1st pass call
+ * - `d_chars`: output buffer for new string data for the 2nd pass call
+ * - `d_offsets`: used for addressing the specific output row data in `d_chars`
+ *
+ * The 1st pass call computes the output sizes and is identified by `d_chars==nullptr`.
+ * Null rows should be set with an output size of 0.
+ *
+ * @code{.cpp}
+ * struct size_and_exec_fn {
+ *  size_type* d_sizes;
+ *  char* d_chars;
+ *  input_offsetalator d_offsets;
+ *
+ *   __device__ void operator()(size_type idx)
+ *   {
+ *     if( !d_chars ) {
+ *       d_sizes[idx] = output_size;
+ *     } else {
+ *       auto d_output = d_chars + d_offsets[idx];
+ *       // write characters to d_output
+ *     }
+ *   }
+ * };
+ * @endcode
+ *
+ * @tparam SizeAndExecuteFunction Functor type with an operator() function accepting
+ *         an index parameter and three member variables: `size_type* d_sizes`
+ *         `char* d_chars`, and `input_offsetalator d_offsets`.
+ *
+ * @param size_and_exec_fn This is called twice. Once for the output size of each string
+ *        and once again to fill in the memory pointed to by `d_chars`.
+ * @param strings_count Number of strings
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned columns' device memory
+ * @return Offsets child column and chars vector for creating a strings column
+ */
+template <typename SizeAndExecuteFunction>
+auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn,
+                           size_type strings_count,
+                           rmm::cuda_stream_view stream,
+                           rmm::device_async_resource_ref mr)
+{
+  return make_strings_children(size_and_exec_fn, strings_count, strings_count, stream, mr);
+}
+
+}  // namespace experimental
+}  // namespace detail
+}  // namespace strings
+}  // namespace cudf
diff --git a/cpp/src/strings/combine/concatenate.cu b/cpp/src/strings/combine/concatenate.cu
index 33d2de3cd07..97008fa94f8 100644
--- a/cpp/src/strings/combine/concatenate.cu
+++ b/cpp/src/strings/combine/concatenate.cu
@@ -17,11 +17,12 @@
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/detail/offsets_iterator.cuh>
 #include <cudf/detail/valid_if.cuh>
 #include <cudf/scalar/scalar_device_view.cuh>
 #include <cudf/strings/combine.hpp>
 #include <cudf/strings/detail/combine.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
@@ -49,8 +50,9 @@ struct concat_strings_base {
   table_device_view const d_table;
   string_scalar_device_view const d_narep;
   separator_on_nulls separate_nulls;
-  size_type* d_offsets{};
-  char* d_chars{};
+  size_type* d_sizes;
+  char* d_chars;
+  cudf::detail::input_offsetalator d_offsets;
 
   /**
    * @brief Concatenate each table row to a single output string.
@@ -68,7 +70,7 @@ struct concat_strings_base {
         thrust::any_of(thrust::seq, d_table.begin(), d_table.end(), [idx](auto const& col) {
           return col.is_null(idx);
         })) {
-      if (!d_chars) d_offsets[idx] = 0;
+      if (!d_chars) { d_sizes[idx] = 0; }
       return;
     }
 
@@ -95,7 +97,7 @@ struct concat_strings_base {
         write_separator || (separate_nulls == separator_on_nulls::YES) || !null_element;
     }
 
-    if (!d_chars) d_offsets[idx] = bytes;
+    if (!d_chars) { d_sizes[idx] = bytes; }
   }
 };
 
@@ -113,7 +115,7 @@ struct concat_strings_fn : concat_strings_base {
   {
   }
 
-  __device__ void operator()(size_type idx) { process_row(idx, d_separator); }
+  __device__ void operator()(std::size_t idx) { process_row(idx, d_separator); }
 };
 
 }  // namespace
@@ -143,7 +145,7 @@ std::unique_ptr<column> concatenate(table_view const& strings_columns,
   // Create device views from the strings columns.
   auto d_table = table_device_view::create(strings_columns, stream);
   concat_strings_fn fn{*d_table, d_separator, d_narep, separate_nulls};
-  auto [offsets_column, chars] = make_strings_children(fn, strings_count, stream, mr);
+  auto [offsets_column, chars] = experimental::make_strings_children(fn, strings_count, stream, mr);
 
   // create resulting null mask
   auto [null_mask, null_count] = cudf::detail::valid_if(
@@ -188,7 +190,7 @@ struct multi_separator_concat_fn : concat_strings_base {
   __device__ void operator()(size_type idx)
   {
     if (d_separators.is_null(idx) && !d_separator_narep.is_valid()) {
-      if (!d_chars) d_offsets[idx] = 0;
+      if (!d_chars) { d_sizes[idx] = 0; }
       return;
     }
 
@@ -235,7 +237,8 @@ std::unique_ptr<column> concatenate(table_view const& strings_columns,
 
   multi_separator_concat_fn mscf{
     *d_table, separator_col_view, separator_rep, col_rep, separate_nulls};
-  auto [offsets_column, chars] = make_strings_children(mscf, strings_count, stream, mr);
+  auto [offsets_column, chars] =
+    experimental::make_strings_children(mscf, strings_count, stream, mr);
 
   // Create resulting null mask
   auto [null_mask, null_count] = cudf::detail::valid_if(
diff --git a/cpp/src/strings/repeat_strings.cu b/cpp/src/strings/repeat_strings.cu
index 97168a7fbd7..de1d5e38e00 100644
--- a/cpp/src/strings/repeat_strings.cu
+++ b/cpp/src/strings/repeat_strings.cu
@@ -20,7 +20,8 @@
 #include <cudf/detail/indexalator.cuh>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/detail/offsets_iterator.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/repeat_strings.hpp>
 #include <cudf/strings/strings_column_view.hpp>
@@ -107,22 +108,26 @@ struct compute_size_and_repeat_fn {
   column_device_view const strings_dv;
   size_type const repeat_times;
   bool const has_nulls;
-
-  size_type* d_offsets{nullptr};
-
-  // If d_chars == nullptr: only compute sizes of the output strings.
-  // If d_chars != nullptr: only repeat strings.
-  char* d_chars{nullptr};
-
-  // `idx` will be in the range of [0, repeat_times * strings_count).
-  __device__ void operator()(size_type const idx) const noexcept
+  size_type* d_sizes;
+  char* d_chars;
+  cudf::detail::input_offsetalator d_offsets;
+
+  /**
+   * @brief Called by make_strings_children to build output
+   *
+   * @param idx Thread index in the range [0,repeat_times * strings_count)
+   * @param d_sizes Return output size here in 1st call (d_chars==nullptr)
+   * @param d_chars Write output here in 2nd call
+   * @param d_offsets Offsets to address output row within d_chars
+   */
+  __device__ void operator()(size_type idx) const noexcept
   {
     auto const str_idx    = idx / repeat_times;  // value cycles in [0, string_count)
     auto const repeat_idx = idx % repeat_times;  // value cycles in [0, repeat_times)
     auto const is_valid   = !has_nulls || strings_dv.is_valid_nocheck(str_idx);
 
     if (!d_chars && repeat_idx == 0) {
-      d_offsets[str_idx] =
+      d_sizes[str_idx] =
         is_valid ? repeat_times * strings_dv.element<string_view>(str_idx).size_bytes() : 0;
     }
 
@@ -161,8 +166,8 @@ std::unique_ptr<column> repeat_strings(strings_column_view const& input,
   auto const strings_dv_ptr = column_device_view::create(input.parent(), stream);
   auto const fn = compute_size_and_repeat_fn{*strings_dv_ptr, repeat_times, input.has_nulls()};
 
-  auto [offsets_column, chars] =
-    make_strings_children(fn, strings_count * repeat_times, strings_count, stream, mr);
+  auto [offsets_column, chars] = experimental::make_strings_children(
+    fn, strings_count * repeat_times, strings_count, stream, mr);
   return make_strings_column(strings_count,
                              std::move(offsets_column),
                              chars.release(),
@@ -182,14 +187,19 @@ struct compute_sizes_and_repeat_fn {
   Iterator const repeat_times_iter;
   bool const strings_has_nulls;
   bool const rtimes_has_nulls;
-
-  size_type* d_offsets{nullptr};
-
-  // If d_chars == nullptr: only compute sizes of the output strings.
-  // If d_chars != nullptr: only repeat strings.
-  char* d_chars{nullptr};
-
-  __device__ void operator()(size_type const idx) const noexcept
+  size_type* d_sizes;
+  char* d_chars;
+  cudf::detail::input_offsetalator d_offsets;
+
+  /**
+   * @brief Called by make_strings_children to build output
+   *
+   * @param idx Row index
+   * @param d_sizes Return output size here in 1st call (d_chars==nullptr)
+   * @param d_chars Write output here in 2nd call
+   * @param d_offsets Offsets to address output row within d_chars
+   */
+  __device__ void operator()(size_type idx) const noexcept
   {
     auto const string_is_valid = !strings_has_nulls || strings_dv.is_valid_nocheck(idx);
     auto const rtimes_is_valid = !rtimes_has_nulls || repeat_times_dv.is_valid_nocheck(idx);
@@ -197,7 +207,7 @@ struct compute_sizes_and_repeat_fn {
     // Any null input (either string or repeat_times value) will result in a null output.
     auto const is_valid = string_is_valid && rtimes_is_valid;
     if (!is_valid) {
-      if (!d_chars) { d_offsets[idx] = 0; }
+      if (!d_chars) { d_sizes[idx] = 0; }
       return;
     }
 
@@ -206,7 +216,7 @@ struct compute_sizes_and_repeat_fn {
 
     if (!d_chars) {
       // repeat_times could be negative
-      d_offsets[idx] = (repeat_times > 0) ? (repeat_times * d_str.size_bytes()) : 0;
+      d_sizes[idx] = std::max(repeat_times, 0) * d_str.size_bytes();
     } else {
       auto output_ptr = d_chars + d_offsets[idx];
       while (repeat_times-- > 0) {
@@ -241,7 +251,7 @@ std::unique_ptr<column> repeat_strings(strings_column_view const& input,
                                                              input.has_nulls(),
                                                              repeat_times.has_nulls()};
 
-  auto [offsets_column, chars] = make_strings_children(fn, strings_count, stream, mr);
+  auto [offsets_column, chars] = experimental::make_strings_children(fn, strings_count, stream, mr);
 
   // We generate new bitmask by AND of the two input columns' bitmasks.
   // Note that if either of the input columns are nullable, the output column will also be nullable

From b16e5c25eb7c38b26cb0d5b1e96047f0ef968c2b Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 23 Apr 2024 15:43:48 -0400
Subject: [PATCH 101/842] Disable compute-sanitizer usage in CI tests with
 CUDA<11.6 (#15584)

Undoes changes in 15573 since `compute-sanitizer` is not available in the CI test environment with CUDA 11.4.
This disables the example scripts for tests with CUDA < 11.6 only to unblock the nightly builds.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - AJ Schmidt (https://github.com/ajschmidt8)
  - Jake Awe (https://github.com/AyodeAwe)

URL: https://github.com/rapidsai/cudf/pull/15584
---
 ci/run_cudf_examples.sh | 6 ++++++
 dependencies.yaml       | 4 ----
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/ci/run_cudf_examples.sh b/ci/run_cudf_examples.sh
index 71af6446748..f3561bc595c 100755
--- a/ci/run_cudf_examples.sh
+++ b/ci/run_cudf_examples.sh
@@ -9,6 +9,12 @@ trap "EXITCODE=1" ERR
 # Support customizing the examples' install location
 cd "${INSTALL_PREFIX:-${CONDA_PREFIX:-/usr}}/bin/examples/libcudf/";
 
+# compute-sanitizer not available before CUDA 11.6
+if [[ "${RAPIDS_CUDA_VERSION%.*}" < "11.6" ]]; then
+  echo "computer-sanitizer unavailable pre 11.6"
+  exit 0
+fi
+
 compute-sanitizer --tool memcheck basic_example
 
 compute-sanitizer --tool memcheck deduplication
diff --git a/dependencies.yaml b/dependencies.yaml
index de5b3569933..1508656471d 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -593,10 +593,6 @@ dependencies:
               cuda: "11.8"
             packages:
               - cuda-sanitizer-api=11.8.86
-          - matrix:
-              cuda: "11.4"
-            packages:
-              - cuda-sanitizer-api=11.4.120
           - matrix:  # Fallback for CUDA 11 or no matrix
             packages:
   test_java:

From e6d9b9f141ce675313c00aac20194e70bcf52b0b Mon Sep 17 00:00:00 2001
From: Allison Piper <apiper@nvidia.com>
Date: Tue, 23 Apr 2024 17:11:43 -0400
Subject: [PATCH 102/842] Update NVBench fixture to use new hooks, fix pinned
 memory segfault. (#15492)

NVBench recently exposed new hooks for modifying its `main` implementation. Updated cudf to use these.

Also noticed that the host pinned-pool memory resource option caused the test to segfault, since the function-scope static holding the pool outlived the CUDA context. Refactored the fixture a bit to ensure that the pool is destroyed before the context.

Note that this currently overrides the rapids-cmake version for NVBench. Rapids-cmake should be updated and the override removed before this is merged (ping @robertmaynard).

cc: @jrhemstad @davidwendt

Authors:
  - Allison Piper (https://github.com/alliepiper)
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Paul Mattione (https://github.com/pmattione-nvidia)
  - Robert Maynard (https://github.com/robertmaynard)

URL: https://github.com/rapidsai/cudf/pull/15492
---
 cpp/benchmarks/fixture/nvbench_fixture.hpp    | 21 +++++++--
 cpp/benchmarks/fixture/nvbench_main.cpp       | 47 ++++++++++++-------
 .../patches/nvbench_global_setup.diff         | 29 ------------
 .../thirdparty/patches/nvbench_override.json  |  9 +---
 4 files changed, 49 insertions(+), 57 deletions(-)
 delete mode 100644 cpp/cmake/thirdparty/patches/nvbench_global_setup.diff

diff --git a/cpp/benchmarks/fixture/nvbench_fixture.hpp b/cpp/benchmarks/fixture/nvbench_fixture.hpp
index 4e4eec3547f..ac0cab4071b 100644
--- a/cpp/benchmarks/fixture/nvbench_fixture.hpp
+++ b/cpp/benchmarks/fixture/nvbench_fixture.hpp
@@ -45,6 +45,8 @@ static std::string cuio_host_mem_param{
  * Initializes the default memory resource to use the RMM pool device resource.
  */
 struct nvbench_base_fixture {
+  using host_pooled_mr_t = rmm::mr::pool_memory_resource<rmm::mr::pinned_host_memory_resource>;
+
   inline auto make_cuda() { return std::make_shared<rmm::mr::cuda_memory_resource>(); }
 
   inline auto make_pool()
@@ -90,12 +92,14 @@ struct nvbench_base_fixture {
 
   inline rmm::host_async_resource_ref make_cuio_host_pinned_pool()
   {
-    using host_pooled_mr = rmm::mr::pool_memory_resource<rmm::mr::pinned_host_memory_resource>;
-    static std::shared_ptr<host_pooled_mr> mr = std::make_shared<host_pooled_mr>(
-      std::make_shared<rmm::mr::pinned_host_memory_resource>().get(),
-      size_t{1} * 1024 * 1024 * 1024);
+    if (!this->host_pooled_mr) {
+      // Don't store in static, as the CUDA context may be destroyed before static destruction
+      this->host_pooled_mr = std::make_shared<host_pooled_mr_t>(
+        std::make_shared<rmm::mr::pinned_host_memory_resource>().get(),
+        size_t{1} * 1024 * 1024 * 1024);
+    }
 
-    return *mr;
+    return *this->host_pooled_mr;
   }
 
   inline rmm::host_async_resource_ref create_cuio_host_memory_resource(std::string const& mode)
@@ -126,9 +130,16 @@ struct nvbench_base_fixture {
     std::cout << "CUIO host memory resource = " << cuio_host_mode << "\n";
   }
 
+  ~nvbench_base_fixture()
+  {
+    // Ensure the the pool is freed before the CUDA context is destroyed:
+    cudf::io::set_host_memory_resource(this->make_cuio_host_pinned());
+  }
+
   std::shared_ptr<rmm::mr::device_memory_resource> mr;
   std::string rmm_mode{"pool"};
 
+  std::shared_ptr<host_pooled_mr_t> host_pooled_mr;
   std::string cuio_host_mode{"pinned"};
 };
 
diff --git a/cpp/benchmarks/fixture/nvbench_main.cpp b/cpp/benchmarks/fixture/nvbench_main.cpp
index f46cb11a6c3..5dfd67b1c54 100644
--- a/cpp/benchmarks/fixture/nvbench_main.cpp
+++ b/cpp/benchmarks/fixture/nvbench_main.cpp
@@ -15,29 +15,44 @@
  */
 
 #include <benchmarks/fixture/nvbench_fixture.hpp>
-#define NVBENCH_ENVIRONMENT cudf::nvbench_base_fixture
 
 #include <nvbench/main.cuh>
 
+#include <string>
 #include <vector>
 
+namespace cudf {
+
 // strip off the rmm_mode and cuio_host_mem parameters before passing the
 // remaining arguments to nvbench::option_parser
-#undef NVBENCH_MAIN_PARSE
-#define NVBENCH_MAIN_PARSE(argc, argv)                     \
-  nvbench::option_parser parser;                           \
-  std::vector<std::string> m_args;                         \
-  for (int i = 0; i < argc; ++i) {                         \
-    std::string arg = argv[i];                             \
-    if (arg == cudf::detail::rmm_mode_param) {             \
-      i += 2;                                              \
-    } else if (arg == cudf::detail::cuio_host_mem_param) { \
-      i += 2;                                              \
-    } else {                                               \
-      m_args.push_back(arg);                               \
-    }                                                      \
-  }                                                        \
-  parser.parse(m_args)
+void benchmark_arg_handler(std::vector<std::string>& args)
+{
+  std::vector<std::string> _cudf_tmp_args;
+
+  for (std::size_t i = 0; i < args.size(); ++i) {
+    std::string arg = args[i];
+    if (arg == cudf::detail::rmm_mode_param) {
+      i++;  // skip the next argument
+    } else if (arg == cudf::detail::cuio_host_mem_param) {
+      i++;  // skip the next argument
+    } else {
+      _cudf_tmp_args.push_back(arg);
+    }
+  }
+
+  args = _cudf_tmp_args;
+}
+
+}  // namespace cudf
+
+// Install arg handler
+#undef NVBENCH_MAIN_CUSTOM_ARGS_HANDLER
+#define NVBENCH_MAIN_CUSTOM_ARGS_HANDLER(args) cudf::benchmark_arg_handler(args)
+
+// Global fixture setup:
+#undef NVBENCH_MAIN_INITIALIZE_CUSTOM_POST
+#define NVBENCH_MAIN_INITIALIZE_CUSTOM_POST(argc, argv) \
+  [[maybe_unused]] auto env_state = cudf::nvbench_base_fixture(argc, argv);
 
 // this declares/defines the main() function using the definitions above
 NVBENCH_MAIN
diff --git a/cpp/cmake/thirdparty/patches/nvbench_global_setup.diff b/cpp/cmake/thirdparty/patches/nvbench_global_setup.diff
deleted file mode 100644
index 04f96f49b48..00000000000
--- a/cpp/cmake/thirdparty/patches/nvbench_global_setup.diff
+++ /dev/null
@@ -1,29 +0,0 @@
-diff --git a/nvbench/main.cuh b/nvbench/main.cuh
-index 0ba82d7..cca5273 100644
---- a/nvbench/main.cuh
-+++ b/nvbench/main.cuh
-@@ -54,6 +54,16 @@
- // clang-format on
- #endif
-
-+#ifndef NVBENCH_ENVIRONMENT
-+namespace nvbench {
-+struct no_environment
-+{
-+  no_environment(int, char const *const *) {}
-+};
-+}
-+#define NVBENCH_ENVIRONMENT nvbench::no_environment
-+#endif
-+
- #define NVBENCH_MAIN_PARSE(argc, argv)                                                             \
-   nvbench::option_parser parser;                                                                   \
-   parser.parse(argc, argv)
-@@ -77,6 +87,7 @@
-     printer.set_total_state_count(total_states);                                                   \
-                                                                                                    \
-     printer.set_completed_state_count(0);                                                          \
-+    [[maybe_unused]] auto env_state = NVBENCH_ENVIRONMENT(argc, argv);                             \
-     for (auto &bench_ptr : benchmarks)                                                             \
-     {                                                                                              \
-       bench_ptr->set_printer(printer);                                                             \
diff --git a/cpp/cmake/thirdparty/patches/nvbench_override.json b/cpp/cmake/thirdparty/patches/nvbench_override.json
index ad9b19c29c1..ef0deb4c1e9 100644
--- a/cpp/cmake/thirdparty/patches/nvbench_override.json
+++ b/cpp/cmake/thirdparty/patches/nvbench_override.json
@@ -2,13 +2,8 @@
 {
   "packages" : {
     "nvbench" : {
-      "patches" : [
-        {
-          "file" : "${current_json_dir}/nvbench_global_setup.diff",
-          "issue" : "Fix add support for global setup to initialize RMM in nvbench [https://github.com/NVIDIA/nvbench/pull/123]",
-          "fixed_in" : ""
-        }
-      ]
+      "git_url": "https://github.com/NVIDIA/nvbench.git",
+      "git_tag": "555d628e9b250868c9da003e4407087ff1982e8e"
     }
   }
 }

From 6780e59fed3e1a72a64a06be8d41d1747782eda5 Mon Sep 17 00:00:00 2001
From: Ed Seidl <etseidl@users.noreply.github.com>
Date: Tue, 23 Apr 2024 14:39:22 -0700
Subject: [PATCH 103/842] Add some missing optional fields to the Parquet
 RowGroup metadata (#15421)

This PR adds the `sorting_columns`, `file_offset`, `total_compressed_size`, and `ordinal` optional fields to the Parquet `RowGroup` metadata object.

Authors:
  - Ed Seidl (https://github.com/etseidl)
  - Mike Wilson (https://github.com/hyperbolic2346)

Approvers:
  - Mike Wilson (https://github.com/hyperbolic2346)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: https://github.com/rapidsai/cudf/pull/15421
---
 cpp/include/cudf/io/parquet.hpp               | 64 +++++++++++++++++++
 cpp/src/io/functions.cpp                      | 14 ++++
 .../io/parquet/compact_protocol_reader.cpp    | 22 ++++++-
 .../io/parquet/compact_protocol_reader.hpp    |  3 +-
 .../io/parquet/compact_protocol_writer.cpp    | 22 +++++++
 .../io/parquet/compact_protocol_writer.hpp    |  3 +
 cpp/src/io/parquet/parquet.hpp                | 25 +++++++-
 cpp/src/io/parquet/writer_impl.cu             | 37 +++++++++--
 cpp/src/io/parquet/writer_impl.hpp            |  1 +
 cpp/tests/io/parquet_writer_test.cpp          | 20 ++++++
 10 files changed, 202 insertions(+), 9 deletions(-)

diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp
index f58bc48a37d..0406d6e3e4c 100644
--- a/cpp/include/cudf/io/parquet.hpp
+++ b/cpp/include/cudf/io/parquet.hpp
@@ -516,6 +516,15 @@ class chunked_parquet_reader {
  * @file
  */
 
+/**
+ * @brief Struct used to describe column sorting metadata
+ */
+struct sorting_column {
+  int column_idx{};           //!< leaf column index within the row group
+  bool is_descending{false};  //!< true if sort order is descending
+  bool is_nulls_first{true};  //!< true if nulls come before non-null values
+};
+
 class parquet_writer_options_builder;
 
 /**
@@ -564,6 +573,8 @@ class parquet_writer_options {
   std::shared_ptr<writer_compression_statistics> _compression_stats;
   // write V2 page headers?
   bool _v2_page_headers = false;
+  // Which columns in _table are used for sorting
+  std::optional<std::vector<sorting_column>> _sorting_columns;
 
   /**
    * @brief Constructor from sink and table.
@@ -762,6 +773,13 @@ class parquet_writer_options {
    */
   [[nodiscard]] auto is_enabled_write_v2_headers() const { return _v2_page_headers; }
 
+  /**
+   * @brief Returns the sorting_columns.
+   *
+   * @return Column sort order metadata
+   */
+  [[nodiscard]] auto const& get_sorting_columns() const { return _sorting_columns; }
+
   /**
    * @brief Sets partitions.
    *
@@ -893,6 +911,16 @@ class parquet_writer_options {
    * @param val Boolean value to enable/disable writing of V2 page headers.
    */
   void enable_write_v2_headers(bool val) { _v2_page_headers = val; }
+
+  /**
+   * @brief Sets sorting columns.
+   *
+   * @param sorting_columns Column sort order metadata
+   */
+  void set_sorting_columns(std::vector<sorting_column> sorting_columns)
+  {
+    _sorting_columns = std::move(sorting_columns);
+  }
 };
 
 /**
@@ -1144,6 +1172,14 @@ class parquet_writer_options_builder {
    */
   parquet_writer_options_builder& write_v2_headers(bool enabled);
 
+  /**
+   * @brief Sets column sorting metadata to chunked_parquet_writer_options.
+   *
+   * @param sorting_columns Column sort order metadata
+   * @return this for chaining
+   */
+  parquet_writer_options_builder& sorting_columns(std::vector<sorting_column> sorting_columns);
+
   /**
    * @brief move parquet_writer_options member once it's built.
    */
@@ -1231,6 +1267,8 @@ class chunked_parquet_writer_options {
   std::shared_ptr<writer_compression_statistics> _compression_stats;
   // write V2 page headers?
   bool _v2_page_headers = false;
+  // Which columns in _table are used for sorting
+  std::optional<std::vector<sorting_column>> _sorting_columns;
 
   /**
    * @brief Constructor from sink.
@@ -1385,6 +1423,13 @@ class chunked_parquet_writer_options {
    */
   [[nodiscard]] auto is_enabled_write_v2_headers() const { return _v2_page_headers; }
 
+  /**
+   * @brief Returns the sorting_columns.
+   *
+   * @return Column sort order metadata
+   */
+  [[nodiscard]] auto const& get_sorting_columns() const { return _sorting_columns; }
+
   /**
    * @brief Sets metadata.
    *
@@ -1502,6 +1547,16 @@ class chunked_parquet_writer_options {
    */
   void enable_write_v2_headers(bool val) { _v2_page_headers = val; }
 
+  /**
+   * @brief Sets sorting columns.
+   *
+   * @param sorting_columns Column sort order metadata
+   */
+  void set_sorting_columns(std::vector<sorting_column> sorting_columns)
+  {
+    _sorting_columns = std::move(sorting_columns);
+  }
+
   /**
    * @brief creates builder to build chunked_parquet_writer_options.
    *
@@ -1741,6 +1796,15 @@ class chunked_parquet_writer_options_builder {
     return *this;
   }
 
+  /**
+   * @brief Sets column sorting metadata to chunked_parquet_writer_options.
+   *
+   * @param sorting_columns Column sort order metadata
+   * @return this for chaining
+   */
+  chunked_parquet_writer_options_builder& sorting_columns(
+    std::vector<sorting_column> sorting_columns);
+
   /**
    * @brief move chunked_parquet_writer_options member once it's built.
    */
diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp
index f0a37839810..12059dffa4e 100644
--- a/cpp/src/io/functions.cpp
+++ b/cpp/src/io/functions.cpp
@@ -801,6 +801,13 @@ parquet_writer_options_builder& parquet_writer_options_builder::write_v2_headers
   return *this;
 }
 
+parquet_writer_options_builder& parquet_writer_options_builder::sorting_columns(
+  std::vector<sorting_column> sorting_columns)
+{
+  options._sorting_columns = std::move(sorting_columns);
+  return *this;
+}
+
 void chunked_parquet_writer_options::set_key_value_metadata(
   std::vector<std::map<std::string, std::string>> metadata)
 {
@@ -889,6 +896,13 @@ chunked_parquet_writer_options_builder& chunked_parquet_writer_options_builder::
   return *this;
 }
 
+chunked_parquet_writer_options_builder& chunked_parquet_writer_options_builder::sorting_columns(
+  std::vector<sorting_column> sorting_columns)
+{
+  options._sorting_columns = std::move(sorting_columns);
+  return *this;
+}
+
 chunked_parquet_writer_options_builder&
 chunked_parquet_writer_options_builder::max_page_fragment_size(size_type val)
 {
diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp
index d39d832c18c..04a22b41247 100644
--- a/cpp/src/io/parquet/compact_protocol_reader.cpp
+++ b/cpp/src/io/parquet/compact_protocol_reader.cpp
@@ -16,6 +16,8 @@
 
 #include "compact_protocol_reader.hpp"
 
+#include "parquet.hpp"
+
 #include <cudf/utilities/error.hpp>
 
 #include <algorithm>
@@ -171,6 +173,7 @@ class parquet_field_int : public parquet_field {
 };
 
 using parquet_field_int8  = parquet_field_int<int8_t, FieldType::I8>;
+using parquet_field_int16 = parquet_field_int<int16_t, FieldType::I16>;
 using parquet_field_int32 = parquet_field_int<int32_t, FieldType::I32>;
 using parquet_field_int64 = parquet_field_int<int64_t, FieldType::I64>;
 
@@ -618,9 +621,18 @@ void CompactProtocolReader::read(IntType* i)
 
 void CompactProtocolReader::read(RowGroup* r)
 {
+  using optional_i16 = parquet_field_optional<int16_t, parquet_field_int16>;
+  using optional_i64 = parquet_field_optional<int64_t, parquet_field_int64>;
+  using optional_list_sorting_column =
+    parquet_field_optional<std::vector<SortingColumn>, parquet_field_struct_list<SortingColumn>>;
+
   auto op = std::make_tuple(parquet_field_struct_list(1, r->columns),
                             parquet_field_int64(2, r->total_byte_size),
-                            parquet_field_int64(3, r->num_rows));
+                            parquet_field_int64(3, r->num_rows),
+                            optional_list_sorting_column(4, r->sorting_columns),
+                            optional_i64(5, r->file_offset),
+                            optional_i64(6, r->total_compressed_size),
+                            optional_i16(7, r->ordinal));
   function_builder(this, op);
 }
 
@@ -762,6 +774,14 @@ void CompactProtocolReader::read(ColumnOrder* c)
   function_builder(this, op);
 }
 
+void CompactProtocolReader::read(SortingColumn* s)
+{
+  auto op = std::make_tuple(parquet_field_int32(1, s->column_idx),
+                            parquet_field_bool(2, s->descending),
+                            parquet_field_bool(3, s->nulls_first));
+  function_builder(this, op);
+}
+
 /**
  * @brief Constructs the schema from the file-level metadata
  *
diff --git a/cpp/src/io/parquet/compact_protocol_reader.hpp b/cpp/src/io/parquet/compact_protocol_reader.hpp
index f244df07176..2ad336a3052 100644
--- a/cpp/src/io/parquet/compact_protocol_reader.hpp
+++ b/cpp/src/io/parquet/compact_protocol_reader.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -120,6 +120,7 @@ class CompactProtocolReader {
   void read(ColumnIndex* c);
   void read(Statistics* s);
   void read(ColumnOrder* c);
+  void read(SortingColumn* s);
 
  public:
   static int NumRequiredBits(uint32_t max_level) noexcept
diff --git a/cpp/src/io/parquet/compact_protocol_writer.cpp b/cpp/src/io/parquet/compact_protocol_writer.cpp
index d610ec6c546..1262ca1926d 100644
--- a/cpp/src/io/parquet/compact_protocol_writer.cpp
+++ b/cpp/src/io/parquet/compact_protocol_writer.cpp
@@ -16,6 +16,8 @@
 
 #include "compact_protocol_writer.hpp"
 
+#include "parquet.hpp"
+
 #include <cudf/utilities/error.hpp>
 
 namespace cudf::io::parquet::detail {
@@ -140,6 +142,10 @@ size_t CompactProtocolWriter::write(RowGroup const& r)
   c.field_struct_list(1, r.columns);
   c.field_int(2, r.total_byte_size);
   c.field_int(3, r.num_rows);
+  if (r.sorting_columns.has_value()) { c.field_struct_list(4, r.sorting_columns.value()); }
+  if (r.file_offset.has_value()) { c.field_int(5, r.file_offset.value()); }
+  if (r.total_compressed_size.has_value()) { c.field_int(6, r.total_compressed_size.value()); }
+  if (r.ordinal.has_value()) { c.field_int16(7, r.ordinal.value()); }
   return c.value();
 }
 
@@ -242,6 +248,15 @@ size_t CompactProtocolWriter::write(ColumnOrder const& co)
   return c.value();
 }
 
+size_t CompactProtocolWriter::write(SortingColumn const& sc)
+{
+  CompactProtocolFieldWriter c(*this);
+  c.field_int(1, sc.column_idx);
+  c.field_bool(2, sc.descending);
+  c.field_bool(3, sc.nulls_first);
+  return c.value();
+}
+
 void CompactProtocolFieldWriter::put_byte(uint8_t v) { writer.m_buf.push_back(v); }
 
 void CompactProtocolFieldWriter::put_byte(uint8_t const* raw, uint32_t len)
@@ -292,6 +307,13 @@ inline void CompactProtocolFieldWriter::field_int8(int field, int8_t val)
   current_field_value = field;
 }
 
+inline void CompactProtocolFieldWriter::field_int16(int field, int16_t val)
+{
+  put_field_header(field, current_field_value, FieldType::I16);
+  put_int(val);
+  current_field_value = field;
+}
+
 inline void CompactProtocolFieldWriter::field_int(int field, int32_t val)
 {
   put_field_header(field, current_field_value, FieldType::I32);
diff --git a/cpp/src/io/parquet/compact_protocol_writer.hpp b/cpp/src/io/parquet/compact_protocol_writer.hpp
index 2ed7c078f8b..2e39abadd24 100644
--- a/cpp/src/io/parquet/compact_protocol_writer.hpp
+++ b/cpp/src/io/parquet/compact_protocol_writer.hpp
@@ -53,6 +53,7 @@ class CompactProtocolWriter {
   size_t write(OffsetIndex const&);
   size_t write(SizeStatistics const&);
   size_t write(ColumnOrder const&);
+  size_t write(SortingColumn const&);
 
  protected:
   std::vector<uint8_t>& m_buf;
@@ -91,6 +92,8 @@ class CompactProtocolFieldWriter {
 
   inline void field_int8(int field, int8_t val);
 
+  inline void field_int16(int field, int16_t val);
+
   inline void field_int(int field, int32_t val);
 
   inline void field_int(int field, int64_t val);
diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp
index 08f9fae145b..7f00d63b9c2 100644
--- a/cpp/src/io/parquet/parquet.hpp
+++ b/cpp/src/io/parquet/parquet.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -322,6 +322,15 @@ struct ColumnIndex {
   thrust::optional<std::vector<int64_t>> definition_level_histogram;
 };
 
+/**
+ * @brief Thrift-derived struct describing column sort order
+ */
+struct SortingColumn {
+  int32_t column_idx;  // The column index (in this row group)
+  bool descending;     // If true, indicates this column is sorted in descending order
+  bool nulls_first;    // If true, nulls will come before non-null values
+};
+
 /**
  * @brief Thrift-derived struct describing a column chunk
  */
@@ -374,9 +383,21 @@ struct ColumnChunk {
  * consisting of a column chunk for each column.
  */
 struct RowGroup {
-  int64_t total_byte_size = 0;
+  // Metadata for each column chunk in this row group.
   std::vector<ColumnChunk> columns;
+  // Total byte size of all the uncompressed column data in this row group
+  int64_t total_byte_size = 0;
+  // Number of rows in this row group
   int64_t num_rows = 0;
+  // If set, specifies a sort ordering of the rows in this RowGroup.
+  // The sorting columns can be a subset of all the columns.
+  thrust::optional<std::vector<SortingColumn>> sorting_columns;
+  // Byte offset from beginning of file to first page (data or dictionary) in this row group
+  thrust::optional<int64_t> file_offset;
+  // Total byte size of all compressed (and potentially encrypted) column data in this row group
+  thrust::optional<int64_t> total_compressed_size;
+  // Row group ordinal in the file
+  thrust::optional<int16_t> ordinal;
 };
 
 /**
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index 823a08084ee..6a8c31fb96b 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -51,6 +51,7 @@
 
 #include <algorithm>
 #include <cstring>
+#include <iterator>
 #include <numeric>
 #include <utility>
 
@@ -2139,6 +2140,8 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
         }
 
         row_group.total_byte_size += ck.bfr_size;
+        row_group.total_compressed_size =
+          row_group.total_compressed_size.value_or(0) + ck.compressed_size;
         column_chunk_meta.total_uncompressed_size = ck.bfr_size;
         column_chunk_meta.total_compressed_size   = ck.compressed_size;
       }
@@ -2236,6 +2239,7 @@ writer::impl::impl(std::vector<std::unique_ptr<data_sink>> sinks,
     _int96_timestamps(options.is_enabled_int96_timestamps()),
     _utc_timestamps(options.is_enabled_utc_timestamps()),
     _write_v2_headers(options.is_enabled_write_v2_headers()),
+    _sorting_columns(options.get_sorting_columns()),
     _column_index_truncate_length(options.get_column_index_truncate_length()),
     _kv_meta(options.get_key_value_metadata()),
     _single_write_mode(mode),
@@ -2265,6 +2269,7 @@ writer::impl::impl(std::vector<std::unique_ptr<data_sink>> sinks,
     _int96_timestamps(options.is_enabled_int96_timestamps()),
     _utc_timestamps(options.is_enabled_utc_timestamps()),
     _write_v2_headers(options.is_enabled_write_v2_headers()),
+    _sorting_columns(options.get_sorting_columns()),
     _column_index_truncate_length(options.get_column_index_truncate_length()),
     _kv_meta(options.get_key_value_metadata()),
     _single_write_mode(mode),
@@ -2408,12 +2413,15 @@ void writer::impl::write_parquet_data_to_sink(
           _out_sink[p]->host_write(bounce_buffer.data(), ck.compressed_size);
         }
 
+        auto const chunk_offset = _current_chunk_offset[p];
         auto& column_chunk_meta = row_group.columns[i].meta_data;
         column_chunk_meta.data_page_offset =
-          _current_chunk_offset[p] + ((ck.use_dictionary) ? ck.dictionary_size : 0);
-        column_chunk_meta.dictionary_page_offset =
-          (ck.use_dictionary) ? _current_chunk_offset[p] : 0;
+          chunk_offset + ((ck.use_dictionary) ? ck.dictionary_size : 0);
+        column_chunk_meta.dictionary_page_offset = (ck.use_dictionary) ? chunk_offset : 0;
         _current_chunk_offset[p] += ck.compressed_size;
+
+        // save location of first page in row group
+        if (i == 0) { row_group.file_offset = chunk_offset; }
       }
     }
     for (auto const& task : write_tasks) {
@@ -2488,10 +2496,9 @@ std::unique_ptr<std::vector<uint8_t>> writer::impl::close(
     std::vector<uint8_t> buffer;
     CompactProtocolWriter cpw(&buffer);
     file_ender_s fendr;
+    auto& fmd = _agg_meta->file(p);
 
     if (_stats_granularity == statistics_freq::STATISTICS_COLUMN) {
-      auto& fmd = _agg_meta->file(p);
-
       // write column indices, updating column metadata along the way
       int chunkidx = 0;
       for (auto& r : fmd.row_groups) {
@@ -2517,6 +2524,26 @@ std::unique_ptr<std::vector<uint8_t>> writer::impl::close(
       }
     }
 
+    // set row group ordinals
+    auto iter        = thrust::make_counting_iterator(0);
+    auto& row_groups = fmd.row_groups;
+    std::for_each(
+      iter, iter + row_groups.size(), [&row_groups](auto idx) { row_groups[idx].ordinal = idx; });
+
+    // set sorting_columns on row groups
+    if (_sorting_columns.has_value()) {
+      // convert `sorting_column` to `SortingColumn`
+      auto const& sorting_cols = _sorting_columns.value();
+      std::vector<SortingColumn> scols;
+      std::transform(
+        sorting_cols.begin(), sorting_cols.end(), std::back_inserter(scols), [](auto const& sc) {
+          return SortingColumn{sc.column_idx, sc.is_descending, sc.is_nulls_first};
+        });
+      // and copy to each row group
+      std::for_each(iter, iter + row_groups.size(), [&row_groups, &scols](auto idx) {
+        row_groups[idx].sorting_columns = scols;
+      });
+    }
     buffer.resize(0);
     fendr.footer_len = static_cast<uint32_t>(cpw.write(_agg_meta->get_metadata(p)));
     fendr.magic      = parquet_magic;
diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp
index 3cbb7630fab..784f78f06d5 100644
--- a/cpp/src/io/parquet/writer_impl.hpp
+++ b/cpp/src/io/parquet/writer_impl.hpp
@@ -156,6 +156,7 @@ class writer::impl {
   bool const _int96_timestamps;
   bool const _utc_timestamps;
   bool const _write_v2_headers;
+  std::optional<std::vector<sorting_column>> _sorting_columns;
   int32_t const _column_index_truncate_length;
   std::vector<std::map<std::string, std::string>> const _kv_meta;  // Optional user metadata.
   cudf::io::detail::single_write_mode const
diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp
index caddfee9f02..3a3040f0957 100644
--- a/cpp/tests/io/parquet_writer_test.cpp
+++ b/cpp/tests/io/parquet_writer_test.cpp
@@ -27,6 +27,7 @@
 #include <cudf/io/types.hpp>
 #include <cudf/unary.hpp>
 
+#include <src/io/parquet/parquet.hpp>
 #include <src/io/parquet/parquet_common.hpp>
 
 #include <fstream>
@@ -1513,6 +1514,7 @@ TEST_F(ParquetWriterTest, RowGroupMetadata)
   cudf::io::parquet_writer_options opts =
     cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, table)
       .dictionary_policy(cudf::io::dictionary_policy::NEVER)
+      .sorting_columns({{0, false, false}})
       .compression(cudf::io::compression_type::ZSTD);
   cudf::io::write_parquet(opts);
 
@@ -1524,6 +1526,24 @@ TEST_F(ParquetWriterTest, RowGroupMetadata)
   ASSERT_GT(fmd.row_groups.size(), 0);
   EXPECT_GE(fmd.row_groups[0].total_byte_size,
             static_cast<int64_t>(num_rows * sizeof(column_type)));
+
+  // row group file offset should be first page location
+  EXPECT_EQ(fmd.row_groups[0].file_offset, fmd.row_groups[0].columns[0].meta_data.data_page_offset);
+
+  // ordinal should be set to 0
+  ASSERT_TRUE(fmd.row_groups[0].ordinal.has_value());
+  EXPECT_EQ(fmd.row_groups[0].ordinal.value(), 0);
+
+  // only one column, so total_compressed_size should equal compressed size of first chunk
+  ASSERT_TRUE(fmd.row_groups[0].total_compressed_size.has_value());
+  EXPECT_EQ(fmd.row_groups[0].total_compressed_size.value(),
+            fmd.row_groups[0].columns[0].meta_data.total_compressed_size);
+
+  // test that sorting order was written correctly
+  ASSERT_TRUE(fmd.row_groups[0].sorting_columns.has_value());
+  EXPECT_EQ(fmd.row_groups[0].sorting_columns.value()[0].column_idx, 0);
+  EXPECT_FALSE(fmd.row_groups[0].sorting_columns.value()[0].descending);
+  EXPECT_FALSE(fmd.row_groups[0].sorting_columns.value()[0].nulls_first);
 }
 
 TEST_F(ParquetWriterTest, UserRequestedDictFallback)

From 8db1851106e3a250609294a81502f5abff801f67 Mon Sep 17 00:00:00 2001
From: Matt Topol <zotthewizard@gmail.com>
Date: Tue, 23 Apr 2024 17:42:57 -0400
Subject: [PATCH 104/842] Add `from_arrow_device` function to cudf interop
 using nanoarrow (#15458)

Adding a corresponding `from_arrow_device` function following up from #15047. This continues the work towards addressing #14926.

Authors:
  - Matt Topol (https://github.com/zeroshade)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/15458
---
 cpp/CMakeLists.txt                           |   1 +
 cpp/include/cudf/interop.hpp                 | 124 ++++
 cpp/src/interop/arrow_utilities.hpp          |  30 +
 cpp/src/interop/from_arrow_device.cu         | 483 ++++++++++++
 cpp/src/interop/to_arrow_device.cu           |   4 +-
 cpp/tests/CMakeLists.txt                     |  10 +-
 cpp/tests/interop/from_arrow_device_test.cpp | 732 +++++++++++++++++++
 cpp/tests/interop/nanoarrow_utils.hpp        | 169 ++---
 cpp/tests/interop/to_arrow_device_test.cpp   |  97 ++-
 9 files changed, 1488 insertions(+), 162 deletions(-)
 create mode 100644 cpp/src/interop/arrow_utilities.hpp
 create mode 100644 cpp/src/interop/from_arrow_device.cu
 create mode 100644 cpp/tests/interop/from_arrow_device_test.cpp

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index b6a61368fe7..53da710f0ea 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -359,6 +359,7 @@ add_library(
   src/interop/from_arrow.cu
   src/interop/to_arrow.cu
   src/interop/to_arrow_device.cu
+  src/interop/from_arrow_device.cu
   src/interop/to_arrow_schema.cpp
   src/interop/to_arrow_utilities.cpp
   src/interop/detail/arrow_allocator.cpp
diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp
index defc1fc834c..bb05a622f40 100644
--- a/cpp/include/cudf/interop.hpp
+++ b/cpp/include/cudf/interop.hpp
@@ -348,5 +348,129 @@ std::unique_ptr<cudf::scalar> from_arrow(
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @brief typedef for a vector of owning columns, used for conversion from ArrowDeviceArray
+ *
+ */
+using owned_columns_t = std::vector<std::unique_ptr<cudf::column>>;
+
+/**
+ * @brief functor for a custom deleter to a unique_ptr of table_view
+ *
+ * When converting from an ArrowDeviceArray, there are cases where data can't
+ * be zero-copy (i.e. bools or non-UINT32 dictionary indices). This custom deleter
+ * is used to maintain ownership over the data allocated since a `cudf::table_view`
+ * doesn't hold ownership.
+ */
+template <typename ViewType>
+struct custom_view_deleter {
+  /**
+   * @brief Construct a new custom view deleter object
+   *
+   * @param owned Vector of owning columns
+   */
+  explicit custom_view_deleter(owned_columns_t&& owned) : owned_mem_{std::move(owned)} {}
+
+  /**
+   * @brief operator to delete the unique_ptr
+   *
+   * @param ptr Pointer to the object to be deleted
+   */
+  void operator()(ViewType* ptr) const { delete ptr; }
+
+  owned_columns_t owned_mem_;  ///< Owned columns that must be deleted.
+};
+
+/**
+ * @brief typedef for a unique_ptr to a `cudf::table_view` with custom deleter
+ *
+ */
+using unique_table_view_t =
+  std::unique_ptr<cudf::table_view, custom_view_deleter<cudf::table_view>>;
+
+/**
+ * @brief Create `cudf::table_view` from given `ArrowDeviceArray` and `ArrowSchema`
+ *
+ * Constructs a non-owning `cudf::table_view` using `ArrowDeviceArray` and `ArrowSchema`,
+ * data must be accessible to the CUDA device. Because the resulting `cudf::table_view` will
+ * not own the data, the `ArrowDeviceArray` must be kept alive for the lifetime of the result.
+ * It is the responsibility of callers to ensure they call the release callback on the
+ * `ArrowDeviceArray` after it is no longer needed, and that the `cudf::table_view` is not
+ * accessed after this happens.
+ *
+ * @throws cudf::logic_error if device_type is not `ARROW_DEVICE_CUDA`, `ARROW_DEVICE_CUDA_HOST`
+ * or `ARROW_DEVICE_CUDA_MANAGED`
+ *
+ * @throws cudf::data_type_error if the input array is not a struct array, non-struct
+ * arrays should be passed to `from_arrow_device_column` instead.
+ *
+ * @throws cudf::data_type_error if the input arrow data type is not supported.
+ *
+ * Each child of the input struct will be the columns of the resulting table_view.
+ *
+ * @note The custom deleter used for the unique_ptr to the table_view maintains ownership
+ * over any memory which is allocated, such as converting boolean columns from the bitmap
+ * used by Arrow to the 1-byte per value for cudf.
+ *
+ * @note If the input `ArrowDeviceArray` contained a non-null sync_event it is assumed
+ * to be a `cudaEvent_t*` and the passed in stream will have `cudaStreamWaitEvent` called
+ * on it with the event. This function, however, will not explicitly synchronize on the
+ * stream.
+ *
+ * @param schema `ArrowSchema` pointer to object describing the type of the device array
+ * @param input `ArrowDeviceArray` pointer to object owning the Arrow data
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to perform any allocations
+ * @return `cudf::table_view` generated from given Arrow data
+ */
+unique_table_view_t from_arrow_device(
+  ArrowSchema const* schema,
+  ArrowDeviceArray const* input,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief typedef for a unique_ptr to a `cudf::column_view` with custom deleter
+ *
+ */
+using unique_column_view_t =
+  std::unique_ptr<cudf::column_view, custom_view_deleter<cudf::column_view>>;
+
+/**
+ * @brief Create `cudf::column_view` from given `ArrowDeviceArray` and `ArrowSchema`
+ *
+ * Constructs a non-owning `cudf::column_view` using `ArrowDeviceArray` and `ArrowSchema`,
+ * data must be accessible to the CUDA device. Because the resulting `cudf::column_view` will
+ * not own the data, the `ArrowDeviceArray` must be kept alive for the lifetime of the result.
+ * It is the responsibility of callers to ensure they call the release callback on the
+ * `ArrowDeviceArray` after it is no longer needed, and that the `cudf::column_view` is not
+ * accessed after this happens.
+ *
+ * @throws cudf::logic_error if device_type is not `ARROW_DEVICE_CUDA`, `ARROW_DEVICE_CUDA_HOST`
+ * or `ARROW_DEVICE_CUDA_MANAGED`
+ *
+ * @throws cudf::data_type_error input arrow data type is not supported.
+ *
+ * @note The custom deleter used for the unique_ptr to the table_view maintains ownership
+ * over any memory which is allocated, such as converting boolean columns from the bitmap
+ * used by Arrow to the 1-byte per value for cudf.
+ *
+ * @note If the input `ArrowDeviceArray` contained a non-null sync_event it is assumed
+ * to be a `cudaEvent_t*` and the passed in stream will have `cudaStreamWaitEvent` called
+ * on it with the event. This function, however, will not explicitly synchronize on the
+ * stream.
+ *
+ * @param schema `ArrowSchema` pointer to object describing the type of the device array
+ * @param input `ArrowDeviceArray` pointer to object owning the Arrow data
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to perform any allocations
+ * @return `cudf::column_view` generated from given Arrow data
+ */
+unique_column_view_t from_arrow_device_column(
+  ArrowSchema const* schema,
+  ArrowDeviceArray const* input,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 /** @} */  // end of group
 }  // namespace cudf
diff --git a/cpp/src/interop/arrow_utilities.hpp b/cpp/src/interop/arrow_utilities.hpp
new file mode 100644
index 00000000000..9bbdaa2c363
--- /dev/null
+++ b/cpp/src/interop/arrow_utilities.hpp
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+namespace cudf {
+namespace detail {
+
+/**
+ * @brief constants for buffer indexes of Arrow arrays
+ *
+ */
+static constexpr int validity_buffer_idx         = 0;
+static constexpr int fixed_width_data_buffer_idx = 1;
+
+}  // namespace detail
+}  // namespace cudf
diff --git a/cpp/src/interop/from_arrow_device.cu b/cpp/src/interop/from_arrow_device.cu
new file mode 100644
index 00000000000..d4d31d1989b
--- /dev/null
+++ b/cpp/src/interop/from_arrow_device.cu
@@ -0,0 +1,483 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arrow_utilities.hpp"
+
+#include <cudf/column/column_view.hpp>
+#include <cudf/copying.hpp>
+#include <cudf/detail/get_value.cuh>
+#include <cudf/detail/interop.hpp>
+#include <cudf/detail/null_mask.hpp>
+#include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/detail/transform.hpp>
+#include <cudf/detail/unary.hpp>
+#include <cudf/interop.hpp>
+#include <cudf/interop/detail/arrow.hpp>
+#include <cudf/table/table_view.hpp>
+#include <cudf/types.hpp>
+#include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/traits.hpp>
+#include <cudf/utilities/type_dispatcher.hpp>
+
+#include <rmm/cuda_device.hpp>
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_buffer.hpp>
+
+#include <nanoarrow/nanoarrow.h>
+#include <nanoarrow/nanoarrow.hpp>
+
+namespace cudf {
+
+namespace detail {
+data_type arrow_to_cudf_type(const ArrowSchemaView* arrow_view)
+{
+  switch (arrow_view->type) {
+    case NANOARROW_TYPE_NA: return data_type(type_id::EMPTY);
+    case NANOARROW_TYPE_BOOL: return data_type(type_id::BOOL8);
+    case NANOARROW_TYPE_INT8: return data_type(type_id::INT8);
+    case NANOARROW_TYPE_INT16: return data_type(type_id::INT16);
+    case NANOARROW_TYPE_INT32: return data_type(type_id::INT32);
+    case NANOARROW_TYPE_INT64: return data_type(type_id::INT64);
+    case NANOARROW_TYPE_UINT8: return data_type(type_id::UINT8);
+    case NANOARROW_TYPE_UINT16: return data_type(type_id::UINT16);
+    case NANOARROW_TYPE_UINT32: return data_type(type_id::UINT32);
+    case NANOARROW_TYPE_UINT64: return data_type(type_id::UINT64);
+    case NANOARROW_TYPE_FLOAT: return data_type(type_id::FLOAT32);
+    case NANOARROW_TYPE_DOUBLE: return data_type(type_id::FLOAT64);
+    case NANOARROW_TYPE_DATE32: return data_type(type_id::TIMESTAMP_DAYS);
+    case NANOARROW_TYPE_STRING: return data_type(type_id::STRING);
+    case NANOARROW_TYPE_LIST: return data_type(type_id::LIST);
+    case NANOARROW_TYPE_DICTIONARY: return data_type(type_id::DICTIONARY32);
+    case NANOARROW_TYPE_STRUCT: return data_type(type_id::STRUCT);
+    case NANOARROW_TYPE_TIMESTAMP: {
+      switch (arrow_view->time_unit) {
+        case NANOARROW_TIME_UNIT_SECOND: return data_type(type_id::TIMESTAMP_SECONDS);
+        case NANOARROW_TIME_UNIT_MILLI: return data_type(type_id::TIMESTAMP_MILLISECONDS);
+        case NANOARROW_TIME_UNIT_MICRO: return data_type(type_id::TIMESTAMP_MICROSECONDS);
+        case NANOARROW_TIME_UNIT_NANO: return data_type(type_id::TIMESTAMP_NANOSECONDS);
+        default: CUDF_FAIL("Unsupported timestamp unit in arrow", cudf::data_type_error);
+      }
+    }
+    case NANOARROW_TYPE_DURATION: {
+      switch (arrow_view->time_unit) {
+        case NANOARROW_TIME_UNIT_SECOND: return data_type(type_id::DURATION_SECONDS);
+        case NANOARROW_TIME_UNIT_MILLI: return data_type(type_id::DURATION_MILLISECONDS);
+        case NANOARROW_TIME_UNIT_MICRO: return data_type(type_id::DURATION_MICROSECONDS);
+        case NANOARROW_TIME_UNIT_NANO: return data_type(type_id::DURATION_NANOSECONDS);
+        default: CUDF_FAIL("Unsupported duration unit in arrow", cudf::data_type_error);
+      }
+    }
+    case NANOARROW_TYPE_DECIMAL128:
+      return data_type{type_id::DECIMAL128, -arrow_view->decimal_scale};
+    default: CUDF_FAIL("Unsupported type_id conversion to cudf", cudf::data_type_error);
+  }
+}
+
+namespace {
+
+using dispatch_tuple_t = std::tuple<column_view, owned_columns_t>;
+
+struct dispatch_from_arrow_device {
+  template <typename T,
+            CUDF_ENABLE_IF(not is_rep_layout_compatible<T>() &&
+                           !std::is_same_v<T, numeric::decimal128>)>
+  dispatch_tuple_t operator()(ArrowSchemaView*,
+                              ArrowArray const*,
+                              data_type,
+                              bool,
+                              rmm::cuda_stream_view,
+                              rmm::mr::device_memory_resource*)
+  {
+    CUDF_FAIL("Unsupported type in from_arrow_device", cudf::data_type_error);
+  }
+
+  template <typename T,
+            CUDF_ENABLE_IF(is_rep_layout_compatible<T>() || std::is_same_v<T, numeric::decimal128>)>
+  dispatch_tuple_t operator()(ArrowSchemaView* schema,
+                              ArrowArray const* input,
+                              data_type type,
+                              bool skip_mask,
+                              rmm::cuda_stream_view,
+                              rmm::mr::device_memory_resource*)
+  {
+    size_type const num_rows   = input->length;
+    size_type const offset     = input->offset;
+    size_type const null_count = input->null_count;
+    bitmask_type const* null_mask =
+      skip_mask ? nullptr
+                : reinterpret_cast<bitmask_type const*>(input->buffers[validity_buffer_idx]);
+    auto data_buffer = input->buffers[fixed_width_data_buffer_idx];
+    return std::make_tuple<column_view, owned_columns_t>(
+      {type, num_rows, data_buffer, null_mask, null_count, offset}, {});
+  }
+};
+
+// forward declaration is needed because `type_dispatch` instantiates the
+// dispatch_from_arrow_device struct causing a recursive situation for struct,
+// dictionary and list_view types.
+dispatch_tuple_t get_column(ArrowSchemaView* schema,
+                            ArrowArray const* input,
+                            data_type type,
+                            bool skip_mask,
+                            rmm::cuda_stream_view stream,
+                            rmm::mr::device_memory_resource* mr);
+
+template <>
+dispatch_tuple_t dispatch_from_arrow_device::operator()<bool>(ArrowSchemaView* schema,
+                                                              ArrowArray const* input,
+                                                              data_type type,
+                                                              bool skip_mask,
+                                                              rmm::cuda_stream_view stream,
+                                                              rmm::mr::device_memory_resource* mr)
+{
+  if (input->length == 0) {
+    return std::make_tuple<column_view, owned_columns_t>(
+      {type,
+       0,
+       nullptr,
+       skip_mask ? nullptr
+                 : reinterpret_cast<bitmask_type const*>(input->buffers[validity_buffer_idx]),
+       0},
+      {});
+  }
+
+  auto out_col = mask_to_bools(
+    reinterpret_cast<bitmask_type const*>(input->buffers[fixed_width_data_buffer_idx]),
+    input->offset,
+    input->offset + input->length,
+    stream,
+    mr);
+  auto const has_nulls = skip_mask ? false : input->buffers[validity_buffer_idx] != nullptr;
+  if (has_nulls) {
+    auto out_mask = cudf::detail::copy_bitmask(
+      reinterpret_cast<bitmask_type const*>(input->buffers[validity_buffer_idx]),
+      input->offset,
+      input->offset + input->length,
+      stream,
+      mr);
+    out_col->set_null_mask(std::move(out_mask), input->null_count);
+  }
+
+  auto out_view = out_col->view();
+  owned_columns_t owned;
+  owned.emplace_back(std::move(out_col));
+  return std::make_tuple<column_view, owned_columns_t>(std::move(out_view), std::move(owned));
+}
+
+template <>
+dispatch_tuple_t dispatch_from_arrow_device::operator()<cudf::string_view>(
+  ArrowSchemaView* schema,
+  ArrowArray const* input,
+  data_type type,
+  bool skip_mask,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr)
+{
+  if (input->length == 0) {
+    return std::make_tuple<column_view, owned_columns_t>(
+      {type,
+       0,
+       nullptr,
+       skip_mask ? nullptr
+                 : reinterpret_cast<bitmask_type const*>(input->buffers[validity_buffer_idx]),
+       0},
+      {});
+  }
+
+  auto offsets_view = column_view{data_type(type_id::INT32),
+                                  static_cast<size_type>(input->offset + input->length) + 1,
+                                  input->buffers[fixed_width_data_buffer_idx],
+                                  nullptr,
+                                  0,
+                                  0};
+  return std::make_tuple<column_view, owned_columns_t>(
+    {type,
+     static_cast<size_type>(input->length),
+     input->buffers[2],
+     skip_mask ? nullptr
+               : reinterpret_cast<bitmask_type const*>(input->buffers[validity_buffer_idx]),
+     static_cast<size_type>(input->null_count),
+     static_cast<size_type>(input->offset),
+     {offsets_view}},
+    {});
+}
+
+template <>
+dispatch_tuple_t dispatch_from_arrow_device::operator()<cudf::dictionary32>(
+  ArrowSchemaView* schema,
+  ArrowArray const* input,
+  data_type type,
+  bool skip_mask,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr)
+{
+  ArrowSchemaView keys_schema_view;
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaViewInit(&keys_schema_view, schema->schema->dictionary, nullptr));
+
+  auto const keys_type = arrow_to_cudf_type(&keys_schema_view);
+  auto [keys_view, owned_cols] =
+    get_column(&keys_schema_view, input->dictionary, keys_type, true, stream, mr);
+
+  auto const dict_indices_type = [&schema]() -> data_type {
+    // cudf dictionary requires an unsigned type for the indices,
+    // since it is invalid for an arrow dictionary to contain negative
+    // indices, we can safely use the unsigned equivalent without having
+    // to modify the buffers.
+    switch (schema->storage_type) {
+      case NANOARROW_TYPE_INT8:
+      case NANOARROW_TYPE_UINT8: return data_type(type_id::UINT8);
+      case NANOARROW_TYPE_INT16:
+      case NANOARROW_TYPE_UINT16: return data_type(type_id::UINT16);
+      case NANOARROW_TYPE_INT32:
+      case NANOARROW_TYPE_UINT32: return data_type(type_id::UINT32);
+      case NANOARROW_TYPE_INT64:
+      case NANOARROW_TYPE_UINT64: return data_type(type_id::UINT64);
+      default: CUDF_FAIL("Unsupported type_id for dictionary indices", cudf::data_type_error);
+    }
+  }();
+
+  size_type const num_rows   = input->length;
+  size_type const offset     = input->offset;
+  size_type const null_count = input->null_count;
+  column_view indices_view   = column_view{dict_indices_type,
+                                         offset + num_rows,
+                                         input->buffers[fixed_width_data_buffer_idx],
+                                         nullptr,
+                                         0,
+                                         0};
+
+  return std::make_tuple<column_view, owned_columns_t>(
+    {type,
+     num_rows,
+     nullptr,
+     reinterpret_cast<bitmask_type const*>(input->buffers[validity_buffer_idx]),
+     null_count,
+     offset,
+     {indices_view, keys_view}},
+    std::move(owned_cols));
+}
+
+template <>
+dispatch_tuple_t dispatch_from_arrow_device::operator()<cudf::struct_view>(
+  ArrowSchemaView* schema,
+  ArrowArray const* input,
+  data_type type,
+  bool skip_mask,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr)
+{
+  std::vector<column_view> children;
+  owned_columns_t out_owned_cols;
+  std::transform(
+    input->children,
+    input->children + input->n_children,
+    schema->schema->children,
+    std::back_inserter(children),
+    [&out_owned_cols, &stream, &mr](ArrowArray const* child, ArrowSchema const* child_schema) {
+      ArrowSchemaView view;
+      NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, child_schema, nullptr));
+      auto type              = arrow_to_cudf_type(&view);
+      auto [out_view, owned] = get_column(&view, child, type, false, stream, mr);
+      if (out_owned_cols.empty()) {
+        out_owned_cols = std::move(owned);
+      } else {
+        out_owned_cols.insert(std::end(out_owned_cols),
+                              std::make_move_iterator(std::begin(owned)),
+                              std::make_move_iterator(std::end(owned)));
+      }
+      return out_view;
+    });
+
+  size_type const num_rows   = input->length;
+  size_type const offset     = input->offset;
+  size_type const null_count = input->null_count;
+  return std::make_tuple<column_view, owned_columns_t>(
+    {type,
+     num_rows,
+     nullptr,
+     reinterpret_cast<bitmask_type const*>(input->buffers[validity_buffer_idx]),
+     null_count,
+     offset,
+     std::move(children)},
+    std::move(out_owned_cols));
+}
+
+template <>
+dispatch_tuple_t dispatch_from_arrow_device::operator()<cudf::list_view>(
+  ArrowSchemaView* schema,
+  ArrowArray const* input,
+  data_type type,
+  bool skip_mask,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr)
+{
+  size_type const num_rows   = input->length;
+  size_type const offset     = input->offset;
+  size_type const null_count = input->null_count;
+  auto offsets_view          = column_view{data_type(type_id::INT32),
+                                  offset + num_rows + 1,
+                                  input->buffers[fixed_width_data_buffer_idx],
+                                  nullptr,
+                                  0,
+                                  0};
+
+  ArrowSchemaView child_schema_view;
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaViewInit(&child_schema_view, schema->schema->children[0], nullptr));
+  auto child_type = arrow_to_cudf_type(&child_schema_view);
+  auto [child_view, owned] =
+    get_column(&child_schema_view, input->children[0], child_type, false, stream, mr);
+
+  // in the scenario where we were sliced and there are more elements in the child_view
+  // than can be referenced by the sliced offsets, we need to slice the child_view
+  // so that when `get_sliced_child` is called, we still produce the right result
+  auto max_child_offset = cudf::detail::get_value<int32_t>(offsets_view, offset + num_rows, stream);
+  child_view            = cudf::slice(child_view, {0, max_child_offset}, stream).front();
+
+  return std::make_tuple<column_view, owned_columns_t>(
+    {type,
+     num_rows,
+     rmm::device_buffer{0, stream, mr}.data(),
+     reinterpret_cast<bitmask_type const*>(input->buffers[validity_buffer_idx]),
+     null_count,
+     offset,
+     {offsets_view, child_view}},
+    std::move(owned));
+}
+
+dispatch_tuple_t get_column(ArrowSchemaView* schema,
+                            ArrowArray const* input,
+                            data_type type,
+                            bool skip_mask,
+                            rmm::cuda_stream_view stream,
+                            rmm::mr::device_memory_resource* mr)
+{
+  return type.id() != type_id::EMPTY
+           ? std::move(type_dispatcher(
+               type, dispatch_from_arrow_device{}, schema, input, type, skip_mask, stream, mr))
+           : std::make_tuple<column_view, owned_columns_t>({data_type(type_id::EMPTY),
+                                                            static_cast<size_type>(input->length),
+                                                            nullptr,
+                                                            nullptr,
+                                                            static_cast<size_type>(input->length)},
+                                                           {});
+}
+
+}  // namespace
+
+unique_table_view_t from_arrow_device(ArrowSchemaView* schema,
+                                      ArrowDeviceArray const* input,
+                                      rmm::cuda_stream_view stream,
+                                      rmm::mr::device_memory_resource* mr)
+{
+  if (input->sync_event != nullptr) {
+    CUDF_CUDA_TRY(
+      cudaStreamWaitEvent(stream.value(), *reinterpret_cast<cudaEvent_t*>(input->sync_event)));
+  }
+
+  std::vector<column_view> columns;
+  owned_columns_t owned_mem;
+
+  auto type = arrow_to_cudf_type(schema);
+  CUDF_EXPECTS(type == data_type(type_id::STRUCT),
+               "Must pass a struct to `from_arrow_device`",
+               cudf::data_type_error);
+  std::transform(
+    input->array.children,
+    input->array.children + input->array.n_children,
+    schema->schema->children,
+    std::back_inserter(columns),
+    [&owned_mem, &stream, &mr](ArrowArray const* child, ArrowSchema const* child_schema) {
+      ArrowSchemaView view;
+      NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, child_schema, nullptr));
+      auto type              = arrow_to_cudf_type(&view);
+      auto [out_view, owned] = get_column(&view, child, type, false, stream, mr);
+      if (owned_mem.empty()) {
+        owned_mem = std::move(owned);
+      } else {
+        owned_mem.insert(std::end(owned_mem),
+                         std::make_move_iterator(std::begin(owned)),
+                         std::make_move_iterator(std::end(owned)));
+      }
+      return out_view;
+    });
+
+  return unique_table_view_t{new table_view{columns},
+                             custom_view_deleter<cudf::table_view>{std::move(owned_mem)}};
+}
+
+unique_column_view_t from_arrow_device_column(ArrowSchemaView* schema,
+                                              ArrowDeviceArray const* input,
+                                              rmm::cuda_stream_view stream,
+                                              rmm::mr::device_memory_resource* mr)
+{
+  if (input->sync_event != nullptr) {
+    CUDF_CUDA_TRY(
+      cudaStreamWaitEvent(stream.value(), *reinterpret_cast<cudaEvent_t*>(input->sync_event)));
+  }
+
+  auto type             = arrow_to_cudf_type(schema);
+  auto [colview, owned] = get_column(schema, &input->array, type, false, stream, mr);
+  return unique_column_view_t{new column_view{colview},
+                              custom_view_deleter<cudf::column_view>{std::move(owned)}};
+}
+
+}  // namespace detail
+
+unique_table_view_t from_arrow_device(ArrowSchema const* schema,
+                                      ArrowDeviceArray const* input,
+                                      rmm::cuda_stream_view stream,
+                                      rmm::mr::device_memory_resource* mr)
+{
+  CUDF_EXPECTS(schema != nullptr && input != nullptr,
+               "input ArrowSchema and ArrowDeviceArray must not be NULL");
+  CUDF_EXPECTS(input->device_type == ARROW_DEVICE_CUDA ||
+                 input->device_type == ARROW_DEVICE_CUDA_HOST ||
+                 input->device_type == ARROW_DEVICE_CUDA_MANAGED,
+               "ArrowDeviceArray memory must be accessible to CUDA");
+
+  CUDF_FUNC_RANGE();
+
+  rmm::cuda_set_device_raii dev(
+    rmm::cuda_device_id{static_cast<rmm::cuda_device_id::value_type>(input->device_id)});
+  ArrowSchemaView view;
+  NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, schema, nullptr));
+  return detail::from_arrow_device(&view, input, stream, mr);
+}
+
+unique_column_view_t from_arrow_device_column(ArrowSchema const* schema,
+                                              ArrowDeviceArray const* input,
+                                              rmm::cuda_stream_view stream,
+                                              rmm::mr::device_memory_resource* mr)
+{
+  CUDF_EXPECTS(schema != nullptr && input != nullptr,
+               "input ArrowSchema and ArrowDeviceArray must not be NULL");
+  CUDF_EXPECTS(input->device_type == ARROW_DEVICE_CUDA ||
+                 input->device_type == ARROW_DEVICE_CUDA_HOST ||
+                 input->device_type == ARROW_DEVICE_CUDA_MANAGED,
+               "ArrowDeviceArray must be accessible to CUDA");
+
+  CUDF_FUNC_RANGE();
+
+  rmm::cuda_set_device_raii dev(
+    rmm::cuda_device_id{static_cast<rmm::cuda_device_id::value_type>(input->device_id)});
+  ArrowSchemaView view;
+  NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, schema, nullptr));
+  return detail::from_arrow_device_column(&view, input, stream, mr);
+}
+
+}  // namespace cudf
diff --git a/cpp/src/interop/to_arrow_device.cu b/cpp/src/interop/to_arrow_device.cu
index 737f8c7f625..f2b1669df9b 100644
--- a/cpp/src/interop/to_arrow_device.cu
+++ b/cpp/src/interop/to_arrow_device.cu
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 
+#include "arrow_utilities.hpp"
 #include "to_arrow_utilities.hpp"
 
 #include <cudf/column/column.hpp>
@@ -49,9 +50,6 @@ namespace cudf {
 namespace detail {
 namespace {
 
-static constexpr int validity_buffer_idx         = 0;
-static constexpr int fixed_width_data_buffer_idx = 1;
-
 template <typename T>
 void device_buffer_finalize(ArrowBufferAllocator* allocator, uint8_t*, int64_t)
 {
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index d0c2b3d2bce..f59e675e1d5 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -268,8 +268,14 @@ ConfigureTest(
 # ##################################################################################################
 # * interop tests -------------------------------------------------------------------------
 ConfigureTest(
-  INTEROP_TEST interop/to_arrow_device_test.cpp interop/to_arrow_test.cpp
-  interop/from_arrow_test.cpp interop/dlpack_test.cpp EXTRA_LIB nanoarrow
+  INTEROP_TEST
+  interop/to_arrow_device_test.cpp
+  interop/to_arrow_test.cpp
+  interop/from_arrow_test.cpp
+  interop/from_arrow_device_test.cpp
+  interop/dlpack_test.cpp
+  EXTRA_LIB
+  nanoarrow
 )
 
 # ##################################################################################################
diff --git a/cpp/tests/interop/from_arrow_device_test.cpp b/cpp/tests/interop/from_arrow_device_test.cpp
new file mode 100644
index 00000000000..95cbe8057d1
--- /dev/null
+++ b/cpp/tests/interop/from_arrow_device_test.cpp
@@ -0,0 +1,732 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "nanoarrow_utils.hpp"
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_utilities.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/table_utilities.hpp>
+#include <cudf_test/testing_main.hpp>
+#include <cudf_test/type_lists.hpp>
+
+#include <cudf/column/column.hpp>
+#include <cudf/column/column_view.hpp>
+#include <cudf/copying.hpp>
+#include <cudf/detail/copy.hpp>
+#include <cudf/detail/get_value.cuh>
+#include <cudf/detail/interop.hpp>
+#include <cudf/detail/iterator.cuh>
+#include <cudf/dictionary/dictionary_column_view.hpp>
+#include <cudf/dictionary/encode.hpp>
+#include <cudf/interop.hpp>
+#include <cudf/table/table.hpp>
+#include <cudf/table/table_view.hpp>
+#include <cudf/types.hpp>
+
+#include <thrust/iterator/counting_iterator.h>
+
+struct FromArrowDeviceTest : public cudf::test::BaseFixture {};
+
+template <typename T>
+struct FromArrowDeviceTestDurationsTest : public cudf::test::BaseFixture {};
+
+TYPED_TEST_SUITE(FromArrowDeviceTestDurationsTest, cudf::test::DurationTypes);
+
+TEST_F(FromArrowDeviceTest, FailConditions)
+{
+  // can't pass null for schema or device array
+  EXPECT_THROW(cudf::from_arrow_device(nullptr, nullptr), cudf::logic_error);
+  // can't pass null for device array
+  ArrowSchema schema;
+  EXPECT_THROW(cudf::from_arrow_device(&schema, nullptr), cudf::logic_error);
+  // device_type must be CUDA/CUDA_HOST/CUDA_MANAGED
+  // should fail with ARROW_DEVICE_CPU
+  ArrowDeviceArray arr;
+  arr.device_type = ARROW_DEVICE_CPU;
+  EXPECT_THROW(cudf::from_arrow_device(&schema, &arr), cudf::logic_error);
+
+  // can't pass null for schema or device array
+  EXPECT_THROW(cudf::from_arrow_device_column(nullptr, nullptr), cudf::logic_error);
+  // can't pass null for device array
+  EXPECT_THROW(cudf::from_arrow_device_column(&schema, nullptr), cudf::logic_error);
+  // device_type must be CUDA/CUDA_HOST/CUDA_MANAGED
+  // should fail with ARROW_DEVICE_CPU
+  EXPECT_THROW(cudf::from_arrow_device_column(&schema, &arr), cudf::logic_error);
+}
+
+TEST_F(FromArrowDeviceTest, EmptyTable)
+{
+  const auto [table, schema, arr] = get_nanoarrow_tables(0);
+
+  auto expected_cudf_table = table->view();
+
+  ArrowDeviceArray input;
+  memcpy(&input.array, arr.get(), sizeof(ArrowArray));
+  input.device_id   = rmm::get_current_cuda_device().value();
+  input.device_type = ARROW_DEVICE_CUDA;
+  input.sync_event  = nullptr;
+
+  auto got_cudf_table = cudf::from_arrow_device(schema.get(), &input);
+  CUDF_TEST_EXPECT_TABLES_EQUAL(expected_cudf_table, *got_cudf_table);
+
+  auto got_cudf_col = cudf::from_arrow_device_column(schema.get(), &input);
+  EXPECT_EQ(got_cudf_col->type(), cudf::data_type{cudf::type_id::STRUCT});
+  cudf::table_view from_struct{
+    std::vector<cudf::column_view>(got_cudf_col->child_begin(), got_cudf_col->child_end())};
+  CUDF_TEST_EXPECT_TABLES_EQUAL(*got_cudf_table, from_struct);
+}
+
+TEST_F(FromArrowDeviceTest, DateTimeTable)
+{
+  auto data = std::vector<int64_t>{1, 2, 3, 4, 5, 6};
+  auto col  = cudf::test::fixed_width_column_wrapper<cudf::timestamp_ms, cudf::timestamp_ms::rep>(
+    data.begin(), data.end());
+
+  cudf::table_view expected_table_view({col});
+
+  nanoarrow::UniqueSchema input_schema;
+  ArrowSchemaInit(input_schema.get());
+  ArrowSchemaSetTypeStruct(input_schema.get(), 1);
+  ArrowSchemaInit(input_schema->children[0]);
+  ArrowSchemaSetTypeDateTime(
+    input_schema->children[0], NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_MILLI, nullptr);
+  ArrowSchemaSetName(input_schema->children[0], "a");
+
+  nanoarrow::UniqueArray input_array;
+  ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr);
+  input_array->length                  = 6;
+  input_array->null_count              = 0;
+  input_array->children[0]->length     = 6;
+  input_array->children[0]->null_count = 0;
+  ArrowBufferSetAllocator(ArrowArrayBuffer(input_array->children[0], 1), noop_alloc);
+  ArrowArrayBuffer(input_array->children[0], 1)->data =
+    const_cast<uint8_t*>(cudf::column_view(col).data<uint8_t>());
+  ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_MINIMAL, nullptr);
+
+  ArrowDeviceArray input_device_array;
+  input_device_array.device_id   = rmm::get_current_cuda_device().value();
+  input_device_array.device_type = ARROW_DEVICE_CUDA;
+  input_device_array.sync_event  = nullptr;
+  memcpy(&input_device_array.array, input_array.get(), sizeof(ArrowArray));
+
+  auto got_cudf_table_view = cudf::from_arrow_device(input_schema.get(), &input_device_array);
+  CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table_view, *got_cudf_table_view);
+
+  auto got_cudf_col = cudf::from_arrow_device_column(input_schema.get(), &input_device_array);
+  EXPECT_EQ(got_cudf_col->type(), cudf::data_type{cudf::type_id::STRUCT});
+  cudf::table_view from_struct{
+    std::vector<cudf::column_view>(got_cudf_col->child_begin(), got_cudf_col->child_end())};
+  CUDF_TEST_EXPECT_TABLES_EQUAL(*got_cudf_table_view, from_struct);
+}
+
+TYPED_TEST(FromArrowDeviceTestDurationsTest, DurationTable)
+{
+  using T = TypeParam;
+
+  if (cudf::type_to_id<TypeParam>() == cudf::type_id::DURATION_DAYS) { return; }
+
+  auto data = {T{1}, T{2}, T{3}, T{4}, T{5}, T{6}};
+  auto col  = cudf::test::fixed_width_column_wrapper<T>(data);
+
+  cudf::table_view expected_table_view({col});
+  const ArrowTimeUnit time_unit = [&] {
+    switch (cudf::type_to_id<TypeParam>()) {
+      case cudf::type_id::DURATION_SECONDS: return NANOARROW_TIME_UNIT_SECOND;
+      case cudf::type_id::DURATION_MILLISECONDS: return NANOARROW_TIME_UNIT_MILLI;
+      case cudf::type_id::DURATION_MICROSECONDS: return NANOARROW_TIME_UNIT_MICRO;
+      case cudf::type_id::DURATION_NANOSECONDS: return NANOARROW_TIME_UNIT_NANO;
+      default: CUDF_FAIL("Unsupported duration unit in arrow");
+    }
+  }();
+
+  nanoarrow::UniqueSchema input_schema;
+  ArrowSchemaInit(input_schema.get());
+  ArrowSchemaSetTypeStruct(input_schema.get(), 1);
+
+  ArrowSchemaInit(input_schema->children[0]);
+  ArrowSchemaSetTypeDateTime(
+    input_schema->children[0], NANOARROW_TYPE_DURATION, time_unit, nullptr);
+  ArrowSchemaSetName(input_schema->children[0], "a");
+
+  auto data_ptr = expected_table_view.column(0).data<uint8_t>();
+  nanoarrow::UniqueArray input_array;
+  ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr);
+  input_array->length                  = expected_table_view.num_rows();
+  input_array->null_count              = 0;
+  input_array->children[0]->length     = expected_table_view.num_rows();
+  input_array->children[0]->null_count = 0;
+  ArrowBufferSetAllocator(ArrowArrayBuffer(input_array->children[0], 1), noop_alloc);
+  ArrowArrayBuffer(input_array->children[0], 1)->data = const_cast<uint8_t*>(data_ptr);
+  ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_MINIMAL, nullptr);
+
+  ArrowDeviceArray input_device_array;
+  input_device_array.device_id   = rmm::get_current_cuda_device().value();
+  input_device_array.device_type = ARROW_DEVICE_CUDA;
+  input_device_array.sync_event  = nullptr;
+  memcpy(&input_device_array.array, input_array.get(), sizeof(ArrowArray));
+
+  auto got_cudf_table_view = cudf::from_arrow_device(input_schema.get(), &input_device_array);
+  CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table_view, *got_cudf_table_view);
+
+  auto got_cudf_col = cudf::from_arrow_device_column(input_schema.get(), &input_device_array);
+  EXPECT_EQ(got_cudf_col->type(), cudf::data_type{cudf::type_id::STRUCT});
+  cudf::table_view from_struct{
+    std::vector<cudf::column_view>(got_cudf_col->child_begin(), got_cudf_col->child_end())};
+  CUDF_TEST_EXPECT_TABLES_EQUAL(*got_cudf_table_view, from_struct);
+}
+
+TEST_F(FromArrowDeviceTest, NestedList)
+{
+  auto valids =
+    cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 3 != 0; });
+  auto col = cudf::test::lists_column_wrapper<int64_t>(
+    {{{{{1, 2}, valids}, {{3, 4}, valids}, {5}}, {{6}, {{7, 8, 9}, valids}}}, valids});
+  cudf::table_view expected_table_view({col});
+
+  nanoarrow::UniqueSchema input_schema;
+  ArrowSchemaInit(input_schema.get());
+  ArrowSchemaSetTypeStruct(input_schema.get(), 1);
+
+  ArrowSchemaInitFromType(input_schema->children[0], NANOARROW_TYPE_LIST);
+  ArrowSchemaSetName(input_schema->children[0], "a");
+  input_schema->children[0]->flags = ARROW_FLAG_NULLABLE;
+
+  ArrowSchemaInitFromType(input_schema->children[0]->children[0], NANOARROW_TYPE_LIST);
+  ArrowSchemaSetName(input_schema->children[0]->children[0], "element");
+  input_schema->children[0]->children[0]->flags = 0;
+
+  ArrowSchemaInitFromType(input_schema->children[0]->children[0]->children[0],
+                          NANOARROW_TYPE_INT64);
+  ArrowSchemaSetName(input_schema->children[0]->children[0]->children[0], "element");
+  input_schema->children[0]->children[0]->children[0]->flags = ARROW_FLAG_NULLABLE;
+
+  nanoarrow::UniqueArray input_array;
+  EXPECT_EQ(NANOARROW_OK, ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr));
+  input_array->length = expected_table_view.num_rows();
+  auto top_list       = input_array->children[0];
+  cudf::lists_column_view lview{expected_table_view.column(0)};
+  populate_list_from_col(top_list, lview);
+  cudf::lists_column_view nested_view{lview.child()};
+  populate_list_from_col(top_list->children[0], nested_view);
+  populate_from_col<int64_t>(top_list->children[0]->children[0], nested_view.child());
+  ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr);
+
+  ArrowDeviceArray input_device_array;
+  input_device_array.device_id   = rmm::get_current_cuda_device().value();
+  input_device_array.device_type = ARROW_DEVICE_CUDA;
+  input_device_array.sync_event  = nullptr;
+  memcpy(&input_device_array.array, input_array.get(), sizeof(ArrowArray));
+
+  auto got_cudf_table_view = cudf::from_arrow_device(input_schema.get(), &input_device_array);
+  CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table_view, *got_cudf_table_view);
+
+  auto got_cudf_col = cudf::from_arrow_device_column(input_schema.get(), &input_device_array);
+  EXPECT_EQ(got_cudf_col->type(), cudf::data_type{cudf::type_id::STRUCT});
+  cudf::table_view from_struct{
+    std::vector<cudf::column_view>(got_cudf_col->child_begin(), got_cudf_col->child_end())};
+  CUDF_TEST_EXPECT_TABLES_EQUAL(*got_cudf_table_view, from_struct);
+}
+
+TEST_F(FromArrowDeviceTest, StructColumn)
+{
+  using vector_of_columns = std::vector<std::unique_ptr<cudf::column>>;
+
+  // Create cudf table
+  auto nested_type_field_names =
+    std::vector<std::vector<std::string>>{{"string", "integral", "bool", "nested_list", "struct"}};
+  auto str_col =
+    cudf::test::strings_column_wrapper{
+      "Samuel Vimes", "Carrot Ironfoundersson", "Angua von Überwald"}
+      .release();
+  auto str_col2 =
+    cudf::test::strings_column_wrapper{{"CUDF", "ROCKS", "EVERYWHERE"}, {0, 1, 0}}.release();
+  int num_rows{str_col->size()};
+  auto int_col = cudf::test::fixed_width_column_wrapper<int32_t, int32_t>{{48, 27, 25}}.release();
+  auto int_col2 =
+    cudf::test::fixed_width_column_wrapper<int32_t, int32_t>{{12, 24, 47}, {1, 0, 1}}.release();
+  auto bool_col = cudf::test::fixed_width_column_wrapper<bool>{{true, true, false}}.release();
+  auto list_col =
+    cudf::test::lists_column_wrapper<int64_t>({{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}})
+      .release();
+  vector_of_columns cols2;
+  cols2.push_back(std::move(str_col2));
+  cols2.push_back(std::move(int_col2));
+  auto [null_mask, null_count] =
+    cudf::bools_to_mask(cudf::test::fixed_width_column_wrapper<bool>{{true, true, false}});
+  auto sub_struct_col =
+    cudf::make_structs_column(num_rows, std::move(cols2), null_count, std::move(*null_mask));
+  vector_of_columns cols;
+  cols.push_back(std::move(str_col));
+  cols.push_back(std::move(int_col));
+  cols.push_back(std::move(bool_col));
+  cols.push_back(std::move(list_col));
+  cols.push_back(std::move(sub_struct_col));
+
+  auto struct_col = cudf::make_structs_column(num_rows, std::move(cols), 0, {});
+  cudf::table_view expected_table_view({struct_col->view()});
+
+  // Create name metadata
+  auto sub_metadata          = cudf::column_metadata{"struct"};
+  sub_metadata.children_meta = {{"string2"}, {"integral2"}};
+  auto metadata              = cudf::column_metadata{"a"};
+  metadata.children_meta     = {{"string"}, {"integral"}, {"bool"}, {"nested_list"}, sub_metadata};
+
+  nanoarrow::UniqueSchema input_schema;
+  ArrowSchemaInit(input_schema.get());
+  ArrowSchemaSetTypeStruct(input_schema.get(), 1);
+
+  ArrowSchemaInit(input_schema->children[0]);
+  ArrowSchemaSetTypeStruct(input_schema->children[0], 5);
+  ArrowSchemaSetName(input_schema->children[0], "a");
+  input_schema->children[0]->flags = 0;
+
+  auto child = input_schema->children[0];
+  ArrowSchemaInitFromType(child->children[0], NANOARROW_TYPE_STRING);
+  ArrowSchemaSetName(child->children[0], "string");
+  child->children[0]->flags = 0;
+
+  ArrowSchemaInitFromType(child->children[1], NANOARROW_TYPE_INT32);
+  ArrowSchemaSetName(child->children[1], "integral");
+  child->children[1]->flags = 0;
+
+  ArrowSchemaInitFromType(child->children[2], NANOARROW_TYPE_BOOL);
+  ArrowSchemaSetName(child->children[2], "bool");
+  child->children[2]->flags = 0;
+
+  ArrowSchemaInitFromType(child->children[3], NANOARROW_TYPE_LIST);
+  ArrowSchemaSetName(child->children[3], "nested_list");
+  child->children[3]->flags = 0;
+  ArrowSchemaInitFromType(child->children[3]->children[0], NANOARROW_TYPE_LIST);
+  ArrowSchemaSetName(child->children[3]->children[0], "element");
+  child->children[3]->children[0]->flags = 0;
+  ArrowSchemaInitFromType(child->children[3]->children[0]->children[0], NANOARROW_TYPE_INT64);
+  ArrowSchemaSetName(child->children[3]->children[0]->children[0], "element");
+  child->children[3]->children[0]->children[0]->flags = 0;
+
+  ArrowSchemaInit(child->children[4]);
+  ArrowSchemaSetTypeStruct(child->children[4], 2);
+  ArrowSchemaSetName(child->children[4], "struct");
+
+  ArrowSchemaInitFromType(child->children[4]->children[0], NANOARROW_TYPE_STRING);
+  ArrowSchemaSetName(child->children[4]->children[0], "string2");
+  ArrowSchemaInitFromType(child->children[4]->children[1], NANOARROW_TYPE_INT32);
+  ArrowSchemaSetName(child->children[4]->children[1], "integral2");
+
+  nanoarrow::UniqueArray input_array;
+  ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr);
+
+  input_array->length = expected_table_view.num_rows();
+
+  auto array_a        = input_array->children[0];
+  auto view_a         = expected_table_view.column(0);
+  array_a->length     = view_a.size();
+  array_a->null_count = view_a.null_count();
+
+  ArrowBufferSetAllocator(ArrowArrayBuffer(array_a, 0), noop_alloc);
+  ArrowArrayValidityBitmap(array_a)->buffer.data =
+    const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(view_a.null_mask()));
+
+  populate_from_col<cudf::string_view>(array_a->children[0], view_a.child(0));
+  populate_from_col<int32_t>(array_a->children[1], view_a.child(1));
+  populate_from_col<bool>(array_a->children[2], view_a.child(2));
+  populate_list_from_col(array_a->children[3], cudf::lists_column_view{view_a.child(3)});
+  populate_list_from_col(array_a->children[3]->children[0],
+                         cudf::lists_column_view{view_a.child(3).child(1)});
+  populate_from_col<int64_t>(array_a->children[3]->children[0]->children[0],
+                             view_a.child(3).child(1).child(1));
+
+  auto array_struct        = array_a->children[4];
+  auto view_struct         = view_a.child(4);
+  array_struct->length     = view_struct.size();
+  array_struct->null_count = view_struct.null_count();
+
+  ArrowBufferSetAllocator(ArrowArrayBuffer(array_struct, 0), noop_alloc);
+  ArrowArrayValidityBitmap(array_struct)->buffer.data =
+    const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(view_struct.null_mask()));
+
+  populate_from_col<cudf::string_view>(array_struct->children[0], view_struct.child(0));
+  populate_from_col<int32_t>(array_struct->children[1], view_struct.child(1));
+
+  ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr);
+
+  ArrowDeviceArray input_device_array;
+  input_device_array.device_id   = rmm::get_current_cuda_device().value();
+  input_device_array.device_type = ARROW_DEVICE_CUDA;
+  input_device_array.sync_event  = nullptr;
+  memcpy(&input_device_array.array, input_array.get(), sizeof(ArrowArray));
+
+  auto got_cudf_table_view = cudf::from_arrow_device(input_schema.get(), &input_device_array);
+  CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table_view, *got_cudf_table_view);
+
+  {
+    // there's one boolean column so we should have one "owned_mem" column in the
+    // returned unique_ptr's custom deleter
+    const cudf::custom_view_deleter<cudf::table_view>& deleter = got_cudf_table_view.get_deleter();
+    EXPECT_EQ(deleter.owned_mem_.size(), 1);
+  }
+
+  auto got_cudf_col = cudf::from_arrow_device_column(input_schema.get(), &input_device_array);
+  EXPECT_EQ(got_cudf_col->type(), cudf::data_type{cudf::type_id::STRUCT});
+  cudf::table_view from_struct{
+    std::vector<cudf::column_view>(got_cudf_col->child_begin(), got_cudf_col->child_end())};
+  CUDF_TEST_EXPECT_TABLES_EQUAL(*got_cudf_table_view, from_struct);
+
+  {
+    // there's one boolean column so we should have one "owned_mem" column in the
+    // returned unique_ptr's custom deleter
+    const cudf::custom_view_deleter<cudf::column_view>& deleter = got_cudf_col.get_deleter();
+    EXPECT_EQ(deleter.owned_mem_.size(), 1);
+  }
+}
+
+TEST_F(FromArrowDeviceTest, DictionaryIndicesType)
+{
+  std::vector<std::unique_ptr<cudf::column>> columns;
+  auto col = cudf::test::fixed_width_column_wrapper<int64_t>({1, 2, 5, 2, 7}, {1, 0, 1, 1, 1});
+  columns.emplace_back(std::move(cudf::dictionary::encode(col)));
+  columns.emplace_back(std::move(cudf::dictionary::encode(col)));
+  columns.emplace_back(std::move(cudf::dictionary::encode(col)));
+
+  cudf::table expected_table(std::move(columns));
+  cudf::table_view expected_table_view = expected_table.view();
+
+  nanoarrow::UniqueSchema input_schema;
+  ArrowSchemaInit(input_schema.get());
+  ArrowSchemaSetTypeStruct(input_schema.get(), 3);
+
+  ArrowSchemaInitFromType(input_schema->children[0], NANOARROW_TYPE_INT8);
+  ArrowSchemaSetName(input_schema->children[0], "a");
+  ArrowSchemaAllocateDictionary(input_schema->children[0]);
+  ArrowSchemaInitFromType(input_schema->children[0]->dictionary, NANOARROW_TYPE_INT64);
+
+  ArrowSchemaInitFromType(input_schema->children[1], NANOARROW_TYPE_INT16);
+  ArrowSchemaSetName(input_schema->children[1], "b");
+  ArrowSchemaAllocateDictionary(input_schema->children[1]);
+  ArrowSchemaInitFromType(input_schema->children[1]->dictionary, NANOARROW_TYPE_INT64);
+
+  ArrowSchemaInitFromType(input_schema->children[2], NANOARROW_TYPE_INT64);
+  ArrowSchemaSetName(input_schema->children[2], "c");
+  ArrowSchemaAllocateDictionary(input_schema->children[2]);
+  ArrowSchemaInitFromType(input_schema->children[2]->dictionary, NANOARROW_TYPE_INT64);
+
+  nanoarrow::UniqueArray input_array;
+  ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr);
+  input_array->length     = expected_table.num_rows();
+  input_array->null_count = 0;
+
+  auto col1_indices =
+    cudf::test::fixed_width_column_wrapper<int8_t>({0, 1, 2, 1, 3}, {1, 0, 1, 1, 1});
+  populate_from_col<int8_t>(input_array->children[0], col1_indices);
+  populate_from_col<int64_t>(input_array->children[0]->dictionary,
+                             cudf::dictionary_column_view{expected_table_view.column(0)}.keys());
+
+  auto col2_indices =
+    cudf::test::fixed_width_column_wrapper<int16_t>({0, 1, 2, 1, 3}, {1, 0, 1, 1, 1});
+  populate_from_col<int16_t>(input_array->children[1], col2_indices);
+  populate_from_col<int64_t>(input_array->children[1]->dictionary,
+                             cudf::dictionary_column_view{expected_table_view.column(1)}.keys());
+
+  auto col3_indices =
+    cudf::test::fixed_width_column_wrapper<int64_t>({0, 1, 2, 1, 3}, {1, 0, 1, 1, 1});
+  populate_from_col<int64_t>(input_array->children[2], col3_indices);
+  populate_from_col<int64_t>(input_array->children[2]->dictionary,
+                             cudf::dictionary_column_view{expected_table_view.column(2)}.keys());
+
+  ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr);
+
+  ArrowDeviceArray input_device_array;
+  input_device_array.device_id   = rmm::get_current_cuda_device().value();
+  input_device_array.device_type = ARROW_DEVICE_CUDA;
+  input_device_array.sync_event  = nullptr;
+  memcpy(&input_device_array.array, input_array.get(), sizeof(ArrowArray));
+
+  auto got_cudf_table_view = cudf::from_arrow_device(input_schema.get(), &input_device_array);
+  CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table_view, *got_cudf_table_view);
+
+  {
+    const cudf::custom_view_deleter<cudf::table_view>& deleter = got_cudf_table_view.get_deleter();
+    EXPECT_EQ(deleter.owned_mem_.size(), 0);
+  }
+
+  auto got_cudf_col = cudf::from_arrow_device_column(input_schema.get(), &input_device_array);
+  EXPECT_EQ(got_cudf_col->type(), cudf::data_type{cudf::type_id::STRUCT});
+  cudf::table_view from_struct{
+    std::vector<cudf::column_view>(got_cudf_col->child_begin(), got_cudf_col->child_end())};
+  CUDF_TEST_EXPECT_TABLES_EQUAL(*got_cudf_table_view, from_struct);
+
+  {
+    const cudf::custom_view_deleter<cudf::column_view>& deleter = got_cudf_col.get_deleter();
+    EXPECT_EQ(deleter.owned_mem_.size(), 0);
+  }
+}
+
+void slice_nanoarrow(ArrowArray* arr, int64_t start, int64_t end)
+{
+  auto op = [&](ArrowArray* array) {
+    array->offset = start;
+    array->length = end - start;
+    if (array->null_count != 0) {
+      array->null_count =
+        cudf::null_count(reinterpret_cast<cudf::bitmask_type const*>(array->buffers[0]),
+                         start,
+                         end,
+                         cudf::get_default_stream());
+    }
+  };
+
+  if (arr->n_children == 0) {
+    op(arr);
+    return;
+  }
+
+  arr->length = end - start;
+  for (int64_t i = 0; i < arr->n_children; ++i) {
+    op(arr->children[i]);
+  }
+}
+
+struct FromArrowDeviceTestSlice
+  : public FromArrowDeviceTest,
+    public ::testing::WithParamInterface<std::tuple<cudf::size_type, cudf::size_type>> {};
+
+TEST_P(FromArrowDeviceTestSlice, SliceTest)
+{
+  auto [table, schema, array] = get_nanoarrow_tables(10000);
+  auto cudf_table_view        = table->view();
+  auto const [start, end]     = GetParam();
+
+  auto sliced_cudf_table = cudf::slice(cudf_table_view, {start, end})[0];
+  slice_nanoarrow(array.get(), start, end);
+
+  ArrowDeviceArray input_device_array;
+  input_device_array.device_id   = rmm::get_current_cuda_device().value();
+  input_device_array.device_type = ARROW_DEVICE_CUDA;
+  input_device_array.sync_event  = nullptr;
+  memcpy(&input_device_array.array, array.get(), sizeof(ArrowArray));
+
+  auto got_cudf_table_view = cudf::from_arrow_device(schema.get(), &input_device_array);
+  if (got_cudf_table_view->num_rows() == 0 and sliced_cudf_table.num_rows() == 0) {
+    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(sliced_cudf_table, *got_cudf_table_view);
+
+    auto got_cudf_col = cudf::from_arrow_device_column(schema.get(), &input_device_array);
+    EXPECT_EQ(got_cudf_col->type(), cudf::data_type{cudf::type_id::STRUCT});
+    cudf::table_view from_struct{
+      std::vector<cudf::column_view>(got_cudf_col->child_begin(), got_cudf_col->child_end())};
+    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*got_cudf_table_view, from_struct);
+
+  } else {
+    CUDF_TEST_EXPECT_TABLES_EQUAL(sliced_cudf_table, *got_cudf_table_view);
+
+    auto got_cudf_col = cudf::from_arrow_device_column(schema.get(), &input_device_array);
+    EXPECT_EQ(got_cudf_col->type(), cudf::data_type{cudf::type_id::STRUCT});
+    cudf::table_view from_struct{
+      std::vector<cudf::column_view>(got_cudf_col->child_begin(), got_cudf_col->child_end())};
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*got_cudf_table_view, from_struct);
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(FromArrowDeviceTest,
+                        FromArrowDeviceTestSlice,
+                        ::testing::Values(std::make_tuple(0, 10000),
+                                          std::make_tuple(2912, 2915),
+                                          std::make_tuple(100, 3000),
+                                          std::make_tuple(0, 0),
+                                          std::make_tuple(0, 3000),
+                                          std::make_tuple(10000, 10000)));
+
+template <typename T>
+using fp_wrapper = cudf::test::fixed_point_column_wrapper<T>;
+
+TEST_F(FromArrowDeviceTest, FixedPoint128Table)
+{
+  using namespace numeric;
+
+  for (auto const scale : {3, 2, 1, 0, -1, -2, -3}) {
+    auto const data     = std::vector<__int128_t>{1, 2, 3, 4, 5, 6};
+    auto const col      = fp_wrapper<__int128_t>(data.cbegin(), data.cend(), scale_type{scale});
+    auto const expected = cudf::table_view({col});
+
+    nanoarrow::UniqueSchema input_schema;
+    ArrowSchemaInit(input_schema.get());
+    ArrowSchemaSetTypeStruct(input_schema.get(), 1);
+    ArrowSchemaInit(input_schema->children[0]);
+    ArrowSchemaSetTypeDecimal(input_schema->children[0],
+                              NANOARROW_TYPE_DECIMAL128,
+                              cudf::detail::max_precision<__int128_t>(),
+                              -scale);
+    ArrowSchemaSetName(input_schema->children[0], "a");
+
+    nanoarrow::UniqueArray input_array;
+    ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr);
+    input_array->length = expected.num_rows();
+
+    populate_from_col<__int128_t>(input_array->children[0], expected.column(0));
+    ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr);
+
+    ArrowDeviceArray input_device_array;
+    input_device_array.device_id   = rmm::get_current_cuda_device().value();
+    input_device_array.device_type = ARROW_DEVICE_CUDA;
+    input_device_array.sync_event  = nullptr;
+    memcpy(&input_device_array.array, input_array.get(), sizeof(ArrowArray));
+
+    auto got_cudf_table_view = cudf::from_arrow_device(input_schema.get(), &input_device_array);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *got_cudf_table_view);
+
+    auto got_cudf_col = cudf::from_arrow_device_column(input_schema.get(), &input_device_array);
+    EXPECT_EQ(got_cudf_col->type(), cudf::data_type{cudf::type_id::STRUCT});
+    cudf::table_view from_struct{
+      std::vector<cudf::column_view>(got_cudf_col->child_begin(), got_cudf_col->child_end())};
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*got_cudf_table_view, from_struct);
+  }
+}
+
+TEST_F(FromArrowDeviceTest, FixedPoint128TableLarge)
+{
+  using namespace numeric;
+  auto constexpr NUM_ELEMENTS = 1000;
+
+  for (auto const scale : {3, 2, 1, 0, -1, -2, -3}) {
+    auto iota           = thrust::make_counting_iterator(1);
+    auto const data     = std::vector<__int128_t>(iota, iota + NUM_ELEMENTS);
+    auto const col      = fp_wrapper<__int128_t>(iota, iota + NUM_ELEMENTS, scale_type{scale});
+    auto const expected = cudf::table_view({col});
+
+    nanoarrow::UniqueSchema input_schema;
+    ArrowSchemaInit(input_schema.get());
+    ArrowSchemaSetTypeStruct(input_schema.get(), 1);
+    ArrowSchemaInit(input_schema->children[0]);
+    ArrowSchemaSetTypeDecimal(input_schema->children[0],
+                              NANOARROW_TYPE_DECIMAL128,
+                              cudf::detail::max_precision<__int128_t>(),
+                              -scale);
+    ArrowSchemaSetName(input_schema->children[0], "a");
+
+    nanoarrow::UniqueArray input_array;
+    ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr);
+    input_array->length = expected.num_rows();
+
+    populate_from_col<__int128_t>(input_array->children[0], expected.column(0));
+    ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr);
+
+    ArrowDeviceArray input_device_array;
+    input_device_array.device_id   = rmm::get_current_cuda_device().value();
+    input_device_array.device_type = ARROW_DEVICE_CUDA;
+    input_device_array.sync_event  = nullptr;
+    memcpy(&input_device_array.array, input_array.get(), sizeof(ArrowArray));
+
+    auto got_cudf_table_view = cudf::from_arrow_device(input_schema.get(), &input_device_array);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *got_cudf_table_view);
+
+    auto got_cudf_col = cudf::from_arrow_device_column(input_schema.get(), &input_device_array);
+    EXPECT_EQ(got_cudf_col->type(), cudf::data_type{cudf::type_id::STRUCT});
+    cudf::table_view from_struct{
+      std::vector<cudf::column_view>(got_cudf_col->child_begin(), got_cudf_col->child_end())};
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*got_cudf_table_view, from_struct);
+  }
+}
+
+TEST_F(FromArrowDeviceTest, FixedPoint128TableNulls)
+{
+  using namespace numeric;
+
+  for (auto const scale : {3, 2, 1, 0, -1, -2, -3}) {
+    auto const data     = std::vector<__int128_t>{1, 2, 3, 4, 5, 6, 0, 0};
+    auto const validity = std::vector<int32_t>{1, 1, 1, 1, 1, 1, 0, 0};
+    auto const col =
+      fp_wrapper<__int128_t>({1, 2, 3, 4, 5, 6, 0, 0}, {1, 1, 1, 1, 1, 1, 0, 0}, scale_type{scale});
+    auto const expected = cudf::table_view({col});
+
+    nanoarrow::UniqueSchema input_schema;
+    ArrowSchemaInit(input_schema.get());
+    ArrowSchemaSetTypeStruct(input_schema.get(), 1);
+    ArrowSchemaInit(input_schema->children[0]);
+    ArrowSchemaSetTypeDecimal(input_schema->children[0],
+                              NANOARROW_TYPE_DECIMAL128,
+                              cudf::detail::max_precision<__int128_t>(),
+                              -scale);
+    ArrowSchemaSetName(input_schema->children[0], "a");
+
+    nanoarrow::UniqueArray input_array;
+    ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr);
+    input_array->length = expected.num_rows();
+
+    populate_from_col<__int128_t>(input_array->children[0], expected.column(0));
+    ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr);
+
+    ArrowDeviceArray input_device_array;
+    input_device_array.device_id   = rmm::get_current_cuda_device().value();
+    input_device_array.device_type = ARROW_DEVICE_CUDA;
+    input_device_array.sync_event  = nullptr;
+    memcpy(&input_device_array.array, input_array.get(), sizeof(ArrowArray));
+
+    auto got_cudf_table_view = cudf::from_arrow_device(input_schema.get(), &input_device_array);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *got_cudf_table_view);
+
+    auto got_cudf_col = cudf::from_arrow_device_column(input_schema.get(), &input_device_array);
+    EXPECT_EQ(got_cudf_col->type(), cudf::data_type{cudf::type_id::STRUCT});
+    cudf::table_view from_struct{
+      std::vector<cudf::column_view>(got_cudf_col->child_begin(), got_cudf_col->child_end())};
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*got_cudf_table_view, from_struct);
+  }
+}
+
+TEST_F(FromArrowDeviceTest, FixedPoint128TableNullsLarge)
+{
+  using namespace numeric;
+  auto constexpr NUM_ELEMENTS = 1000;
+
+  for (auto const scale : {3, 2, 1, 0, -1, -2, -3}) {
+    auto every_other = [](auto i) { return i % 2 ? 0 : 1; };
+    auto validity    = cudf::detail::make_counting_transform_iterator(0, every_other);
+    auto iota        = thrust::make_counting_iterator(1);
+    auto const data  = std::vector<__int128_t>(iota, iota + NUM_ELEMENTS);
+    auto const col = fp_wrapper<__int128_t>(iota, iota + NUM_ELEMENTS, validity, scale_type{scale});
+    auto const expected = cudf::table_view({col});
+
+    nanoarrow::UniqueSchema input_schema;
+    ArrowSchemaInit(input_schema.get());
+    ArrowSchemaSetTypeStruct(input_schema.get(), 1);
+    ArrowSchemaInit(input_schema->children[0]);
+    ArrowSchemaSetTypeDecimal(input_schema->children[0],
+                              NANOARROW_TYPE_DECIMAL128,
+                              cudf::detail::max_precision<__int128_t>(),
+                              -scale);
+    ArrowSchemaSetName(input_schema->children[0], "a");
+
+    nanoarrow::UniqueArray input_array;
+    ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr);
+    input_array->length = expected.num_rows();
+
+    populate_from_col<__int128_t>(input_array->children[0], expected.column(0));
+    ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr);
+
+    ArrowDeviceArray input_device_array;
+    input_device_array.device_id   = rmm::get_current_cuda_device().value();
+    input_device_array.device_type = ARROW_DEVICE_CUDA;
+    input_device_array.sync_event  = nullptr;
+    memcpy(&input_device_array.array, input_array.get(), sizeof(ArrowArray));
+
+    auto got_cudf_table_view = cudf::from_arrow_device(input_schema.get(), &input_device_array);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *got_cudf_table_view);
+
+    auto got_cudf_col = cudf::from_arrow_device_column(input_schema.get(), &input_device_array);
+    EXPECT_EQ(got_cudf_col->type(), cudf::data_type{cudf::type_id::STRUCT});
+    cudf::table_view from_struct{
+      std::vector<cudf::column_view>(got_cudf_col->child_begin(), got_cudf_col->child_end())};
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*got_cudf_table_view, from_struct);
+  }
+}
diff --git a/cpp/tests/interop/nanoarrow_utils.hpp b/cpp/tests/interop/nanoarrow_utils.hpp
index c4b53282402..b795bafed97 100644
--- a/cpp/tests/interop/nanoarrow_utils.hpp
+++ b/cpp/tests/interop/nanoarrow_utils.hpp
@@ -17,6 +17,7 @@
 #pragma once
 
 #include <cudf/column/column_view.hpp>
+#include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/interop/detail/arrow.hpp>
 #include <cudf/lists/lists_column_view.hpp>
 #include <cudf/strings/strings_column_view.hpp>
@@ -25,6 +26,8 @@
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/traits.hpp>
 
+#include <nanoarrow/nanoarrow.hpp>
+
 // no-op allocator/deallocator to set into ArrowArray buffers that we don't
 // want to own their buffers.
 static ArrowBufferAllocator noop_alloc = (struct ArrowBufferAllocator){
@@ -35,28 +38,6 @@ static ArrowBufferAllocator noop_alloc = (struct ArrowBufferAllocator){
   .private_data = nullptr,
 };
 
-// populate the ArrowArray by copying host data buffers for fixed width types other
-// than boolean.
-template <typename T>
-std::enable_if_t<cudf::is_fixed_width<T>() and !std::is_same_v<T, bool>, void> get_nanoarrow_array(
-  ArrowArray* arr, std::vector<T> const& data, std::vector<uint8_t> const& mask = {})
-{
-  arr->length = data.size();
-  NANOARROW_THROW_NOT_OK(
-    ArrowBufferAppend(ArrowArrayBuffer(arr, 1), data.data(), sizeof(T) * data.size()));
-  if (!mask.empty()) {
-    NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(ArrowArrayValidityBitmap(arr), mask.size()));
-    ArrowBitmapAppendInt8Unsafe(
-      ArrowArrayValidityBitmap(arr), reinterpret_cast<const int8_t*>(mask.data()), mask.size());
-    arr->null_count = ArrowBitCountSet(ArrowArrayValidityBitmap(arr)->buffer.data, 0, data.size());
-  } else {
-    arr->null_count = 0;
-  }
-
-  CUDF_EXPECTS(ArrowArrayFinishBuildingDefault(arr, nullptr) == NANOARROW_OK,
-               "failed to construct array");
-}
-
 // populate an ArrowArray with pointers to the raw device buffers of a cudf::column_view
 // and use the no-op alloc so that the ArrowArray doesn't presume ownership of the data
 template <typename T>
@@ -66,38 +47,13 @@ std::enable_if_t<cudf::is_fixed_width<T>() and !std::is_same_v<T, bool>, void> p
   arr->length     = view.size();
   arr->null_count = view.null_count();
   NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 0), noop_alloc));
+  ArrowArrayValidityBitmap(arr)->buffer.size_bytes =
+    cudf::bitmask_allocation_size_bytes(view.size());
   ArrowArrayValidityBitmap(arr)->buffer.data =
     const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(view.null_mask()));
   NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 1), noop_alloc));
-  ArrowArrayBuffer(arr, 1)->data = const_cast<uint8_t*>(view.data<uint8_t>());
-}
-
-// populate an ArrowArray with boolean data by generating the appropriate
-// bitmaps to copy the data.
-template <typename T>
-std::enable_if_t<std::is_same_v<T, bool>, void> get_nanoarrow_array(
-  ArrowArray* arr, std::vector<bool> const& data, std::vector<bool> const& mask = {})
-{
-  ArrowBitmap bool_data;
-  ArrowBitmapInit(&bool_data);
-  NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(&bool_data, data.size()));
-  std::for_each(data.begin(), data.end(), [&](const auto&& elem) {
-    NANOARROW_THROW_NOT_OK(ArrowBitmapAppend(&bool_data, (elem) ? 1 : 0, 1));
-  });
-  NANOARROW_THROW_NOT_OK(ArrowArraySetBuffer(arr, 1, &bool_data.buffer));
-
-  if (!mask.empty()) {
-    NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(ArrowArrayValidityBitmap(arr), mask.size()));
-    std::for_each(mask.begin(), mask.end(), [&](const auto&& elem) {
-      NANOARROW_THROW_NOT_OK(ArrowBitmapAppend(ArrowArrayValidityBitmap(arr), (elem) ? 1 : 0, 1));
-    });
-    arr->null_count = ArrowBitCountSet(ArrowArrayValidityBitmap(arr)->buffer.data, 0, data.size());
-  } else {
-    arr->null_count = 0;
-  }
-
-  CUDF_EXPECTS(ArrowArrayFinishBuildingDefault(arr, nullptr) == NANOARROW_OK,
-               "failed to construct boolean array");
+  ArrowArrayBuffer(arr, 1)->size_bytes = sizeof(T) * view.size();
+  ArrowArrayBuffer(arr, 1)->data       = const_cast<uint8_t*>(view.data<uint8_t>());
 }
 
 // populate an ArrowArray from a boolean cudf column. Since Arrow and cudf
@@ -109,7 +65,10 @@ std::enable_if_t<std::is_same_v<T, bool>, void> populate_from_col(ArrowArray* ar
 {
   arr->length     = view.size();
   arr->null_count = view.null_count();
+
   NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 0), noop_alloc));
+  ArrowArrayValidityBitmap(arr)->buffer.size_bytes =
+    cudf::bitmask_allocation_size_bytes(view.size());
   ArrowArrayValidityBitmap(arr)->buffer.data =
     const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(view.null_mask()));
 
@@ -123,32 +82,8 @@ std::enable_if_t<std::is_same_v<T, bool>, void> populate_from_col(ArrowArray* ar
         delete buf;
       },
       new std::unique_ptr<rmm::device_buffer>(std::move(bitmask.first)))));
-  ArrowArrayBuffer(arr, 1)->data = ptr;
-}
-
-// populate an ArrowArray by copying the string data and constructing the offsets
-// buffer.
-template <typename T>
-std::enable_if_t<std::is_same_v<T, cudf::string_view>, void> get_nanoarrow_array(
-  ArrowArray* arr, std::vector<std::string> const& data, std::vector<uint8_t> const& mask = {})
-{
-  NANOARROW_THROW_NOT_OK(ArrowArrayStartAppending(arr));
-  for (auto& str : data) {
-    NANOARROW_THROW_NOT_OK(ArrowArrayAppendString(arr, ArrowCharView(str.c_str())));
-  }
-
-  if (!mask.empty()) {
-    ArrowBitmapReset(ArrowArrayValidityBitmap(arr));
-    NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(ArrowArrayValidityBitmap(arr), mask.size()));
-    ArrowBitmapAppendInt8Unsafe(
-      ArrowArrayValidityBitmap(arr), reinterpret_cast<const int8_t*>(mask.data()), mask.size());
-    arr->null_count = ArrowBitCountSet(ArrowArrayValidityBitmap(arr)->buffer.data, 0, data.size());
-  } else {
-    arr->null_count = 0;
-  }
-
-  CUDF_EXPECTS(ArrowArrayFinishBuildingDefault(arr, nullptr) == NANOARROW_OK,
-               "failed to construct string array");
+  ArrowArrayBuffer(arr, 1)->size_bytes = cudf::bitmask_allocation_size_bytes(view.size());
+  ArrowArrayBuffer(arr, 1)->data       = ptr;
 }
 
 // populate an ArrowArray with the string data buffers of a cudf column_view
@@ -160,67 +95,47 @@ std::enable_if_t<std::is_same_v<T, cudf::string_view>, void> populate_from_col(
 {
   arr->length     = view.size();
   arr->null_count = view.null_count();
+
   NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 0), noop_alloc));
+  ArrowArrayValidityBitmap(arr)->buffer.size_bytes =
+    cudf::bitmask_allocation_size_bytes(view.size());
   ArrowArrayValidityBitmap(arr)->buffer.data =
     const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(view.null_mask()));
 
   cudf::strings_column_view sview{view};
-  NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 1), noop_alloc));
-  ArrowArrayBuffer(arr, 1)->data = const_cast<uint8_t*>(sview.offsets().data<uint8_t>());
-  NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 2), noop_alloc));
-  ArrowArrayBuffer(arr, 2)->data = const_cast<uint8_t*>(view.data<uint8_t>());
+  if (view.size() > 0) {
+    NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 1), noop_alloc));
+    ArrowArrayBuffer(arr, 1)->size_bytes = sizeof(int32_t) * sview.offsets().size();
+    ArrowArrayBuffer(arr, 1)->data       = const_cast<uint8_t*>(sview.offsets().data<uint8_t>());
+    NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 2), noop_alloc));
+    ArrowArrayBuffer(arr, 2)->size_bytes = sview.chars_size(cudf::get_default_stream());
+    ArrowArrayBuffer(arr, 2)->data       = const_cast<uint8_t*>(view.data<uint8_t>());
+  } else {
+    auto zero          = rmm::device_scalar<int32_t>(0, cudf::get_default_stream());
+    const uint8_t* ptr = reinterpret_cast<uint8_t*>(zero.data());
+    nanoarrow::BufferInitWrapped(ArrowArrayBuffer(arr, 1), std::move(zero), ptr, 4);
+  }
 }
 
-// populate a dictionary ArrowArray by delegating the copying of the indices
-// and key arrays
 template <typename KEY_TYPE, typename IND_TYPE>
-void get_nanoarrow_dict_array(ArrowArray* arr,
-                              std::vector<KEY_TYPE> const& keys,
-                              std::vector<IND_TYPE> const& ind,
-                              std::vector<uint8_t> const& validity = {})
+void populate_dict_from_col(ArrowArray* arr, cudf::dictionary_column_view dview)
 {
-  get_nanoarrow_array<KEY_TYPE>(arr->dictionary, keys);
-  get_nanoarrow_array<IND_TYPE>(arr, ind, validity);
-}
+  arr->length     = dview.size();
+  arr->null_count = dview.null_count();
+  ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 0), noop_alloc);
+  ArrowArrayValidityBitmap(arr)->buffer.size_bytes =
+    cudf::bitmask_allocation_size_bytes(dview.size());
+  ArrowArrayValidityBitmap(arr)->buffer.data =
+    const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(dview.null_mask()));
 
-// populate a list ArrowArray by copying the offsets and data buffers
-template <typename T>
-void get_nanoarrow_list_array(ArrowArray* arr,
-                              std::vector<T> data,
-                              std::vector<int32_t> offsets,
-                              std::vector<uint8_t> data_validity = {},
-                              std::vector<uint8_t> list_validity = {})
-{
-  get_nanoarrow_array<T>(arr->children[0], data, data_validity);
-
-  arr->length = offsets.size() - 1;
-  NANOARROW_THROW_NOT_OK(
-    ArrowBufferAppend(ArrowArrayBuffer(arr, 1), offsets.data(), sizeof(int32_t) * offsets.size()));
-  if (!list_validity.empty()) {
-    NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(ArrowArrayValidityBitmap(arr), list_validity.size()));
-    ArrowBitmapAppendInt8Unsafe(ArrowArrayValidityBitmap(arr),
-                                reinterpret_cast<const int8_t*>(list_validity.data()),
-                                arr->length);
-    arr->null_count = ArrowBitCountSet(ArrowArrayValidityBitmap(arr)->buffer.data, 0, arr->length);
-  } else {
-    arr->null_count = 0;
-  }
+  ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 1), noop_alloc);
+  ArrowArrayBuffer(arr, 1)->size_bytes = sizeof(IND_TYPE) * dview.indices().size();
+  ArrowArrayBuffer(arr, 1)->data       = const_cast<uint8_t*>(dview.indices().data<uint8_t>());
 
-  CUDF_EXPECTS(ArrowArrayFinishBuildingDefault(arr, nullptr) == NANOARROW_OK,
-               "failed to construct list array");
+  populate_from_col<KEY_TYPE>(arr->dictionary, dview.keys());
 }
 
-// populate an ArrowArray list array from device buffers using a no-op
-// allocator so that the ArrowArray doesn't have ownership of the buffers
-void populate_list_from_col(ArrowArray* arr, cudf::lists_column_view view)
-{
-  arr->length     = view.size();
-  arr->null_count = view.null_count();
+std::tuple<std::unique_ptr<cudf::table>, nanoarrow::UniqueSchema, nanoarrow::UniqueArray>
+get_nanoarrow_tables(cudf::size_type length = 10000);
 
-  NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 0), noop_alloc));
-  ArrowArrayValidityBitmap(arr)->buffer.data =
-    const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(view.null_mask()));
-
-  NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 1), noop_alloc));
-  ArrowArrayBuffer(arr, 1)->data = const_cast<uint8_t*>(view.offsets().data<uint8_t>());
-}
+void populate_list_from_col(ArrowArray* arr, cudf::lists_column_view view);
diff --git a/cpp/tests/interop/to_arrow_device_test.cpp b/cpp/tests/interop/to_arrow_device_test.cpp
index d6eae8dece1..fb346dad538 100644
--- a/cpp/tests/interop/to_arrow_device_test.cpp
+++ b/cpp/tests/interop/to_arrow_device_test.cpp
@@ -57,6 +57,26 @@ get_nanoarrow_tables(cudf::size_type length)
 
   std::vector<std::unique_ptr<cudf::column>> columns;
 
+  std::generate(int64_data.begin(), int64_data.end(), []() { return rand() % 500000; });
+  std::generate(list_int64_data.begin(), list_int64_data.end(), []() { return rand() % 500000; });
+  auto validity_generator = []() { return rand() % 7 != 0; };
+  std::generate(
+    list_int64_data_validity.begin(), list_int64_data_validity.end(), validity_generator);
+  std::generate(
+    list_offsets.begin(), list_offsets.end(), [length_of_individual_list, n = 0]() mutable {
+      return (n++) * length_of_individual_list;
+    });
+  std::generate(bool_data.begin(), bool_data.end(), validity_generator);
+  std::generate(
+    string_data.begin(), string_data.end(), []() { return rand() % 7 != 0 ? "CUDF" : "Rocks"; });
+  std::generate(validity.begin(), validity.end(), validity_generator);
+  std::generate(bool_validity.begin(), bool_validity.end(), validity_generator);
+
+  std::transform(bool_validity.cbegin(),
+                 bool_validity.cend(),
+                 std::back_inserter(bool_data_validity),
+                 [](auto val) { return static_cast<uint8_t>(val); });
+
   columns.emplace_back(cudf::test::fixed_width_column_wrapper<int64_t>(
                          int64_data.begin(), int64_data.end(), validity.begin())
                          .release());
@@ -180,41 +200,58 @@ get_nanoarrow_tables(cudf::size_type length)
 
   nanoarrow::UniqueArray arrow;
   NANOARROW_THROW_NOT_OK(ArrowArrayInitFromSchema(arrow.get(), schema.get(), nullptr));
-
-  get_nanoarrow_array<int64_t>(arrow->children[0], int64_data, validity);
-  get_nanoarrow_array<cudf::string_view>(arrow->children[1], string_data, validity);
-  cudf::dictionary_column_view view(dict_col->view());
-  auto keys    = cudf::test::to_host<int64_t>(view.keys()).first;
-  auto indices = cudf::test::to_host<uint32_t>(view.indices()).first;
-  get_nanoarrow_dict_array(arrow->children[2],
-                           std::vector<int64_t>(keys.begin(), keys.end()),
-                           std::vector<int32_t>(indices.begin(), indices.end()),
-                           validity);
-  get_nanoarrow_array<bool>(arrow->children[3], bool_data, bool_validity);
-  get_nanoarrow_list_array<int64_t>(arrow->children[4],
-                                    list_int64_data,
-                                    list_offsets,
-                                    list_int64_data_validity,
-                                    bool_data_validity);
-
-  get_nanoarrow_array<int64_t>(arrow->children[5]->children[0], int64_data, validity);
-  get_nanoarrow_array<cudf::string_view>(arrow->children[5]->children[1], string_data, validity);
-  arrow->children[5]->length = length;
-  NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(ArrowArrayValidityBitmap(arrow->children[5]), length));
-  std::for_each(bool_data_validity.begin(), bool_data_validity.end(), [&](auto&& elem) {
-    NANOARROW_THROW_NOT_OK(
-      ArrowBitmapAppend(ArrowArrayValidityBitmap(arrow->children[5]), (elem) ? 1 : 0, 1));
-  });
-  arrow->children[5]->null_count =
-    ArrowBitCountSet(ArrowArrayValidityBitmap(arrow->children[5])->buffer.data, 0, length);
-
-  CUDF_EXPECTS(ArrowArrayFinishBuildingDefault(arrow.get(), nullptr) == NANOARROW_OK,
-               "failed to build example Arrays");
+  arrow->length = length;
+
+  populate_from_col<int64_t>(arrow->children[0], columns[0]->view());
+  populate_from_col<cudf::string_view>(arrow->children[1], columns[1]->view());
+  populate_dict_from_col<int64_t, uint32_t>(arrow->children[2],
+                                            cudf::dictionary_column_view(columns[2]->view()));
+
+  populate_from_col<bool>(arrow->children[3], columns[3]->view());
+  cudf::lists_column_view list_view{columns[4]->view()};
+  populate_list_from_col(arrow->children[4], list_view);
+  populate_from_col<int64_t>(arrow->children[4]->children[0], list_view.child());
+
+  cudf::structs_column_view struct_view{columns[5]->view()};
+  populate_from_col<int64_t>(arrow->children[5]->children[0], struct_view.child(0));
+  populate_from_col<cudf::string_view>(arrow->children[5]->children[1], struct_view.child(1));
+  arrow->children[5]->length     = struct_view.size();
+  arrow->children[5]->null_count = struct_view.null_count();
+  ArrowBufferSetAllocator(ArrowArrayBuffer(arrow->children[5], 0), noop_alloc);
+  ArrowArrayValidityBitmap(arrow->children[5])->buffer.size_bytes =
+    cudf::bitmask_allocation_size_bytes(struct_view.size());
+  ArrowArrayValidityBitmap(arrow->children[5])->buffer.data =
+    const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(struct_view.null_mask()));
+
+  ArrowError error;
+  if (ArrowArrayFinishBuilding(arrow.get(), NANOARROW_VALIDATION_LEVEL_MINIMAL, &error) !=
+      NANOARROW_OK) {
+    std::cerr << ArrowErrorMessage(&error) << std::endl;
+    CUDF_FAIL("failed to build example arrays");
+  }
 
   return std::make_tuple(
     std::make_unique<cudf::table>(std::move(columns)), std::move(schema), std::move(arrow));
 }
 
+// populate an ArrowArray list array from device buffers using a no-op
+// allocator so that the ArrowArray doesn't have ownership of the buffers
+void populate_list_from_col(ArrowArray* arr, cudf::lists_column_view view)
+{
+  arr->length     = view.size();
+  arr->null_count = view.null_count();
+
+  ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 0), noop_alloc);
+  ArrowArrayValidityBitmap(arr)->buffer.size_bytes =
+    cudf::bitmask_allocation_size_bytes(view.size());
+  ArrowArrayValidityBitmap(arr)->buffer.data =
+    const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(view.null_mask()));
+
+  ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 1), noop_alloc);
+  ArrowArrayBuffer(arr, 1)->size_bytes = sizeof(int32_t) * view.offsets().size();
+  ArrowArrayBuffer(arr, 1)->data       = const_cast<uint8_t*>(view.offsets().data<uint8_t>());
+}
+
 struct BaseArrowFixture : public cudf::test::BaseFixture {
   void compare_schemas(const ArrowSchema* expected, const ArrowSchema* actual)
   {

From 117eff6bf1eb8a46c597fd8f9e76a22fa363f03a Mon Sep 17 00:00:00 2001
From: Ed Seidl <etseidl@users.noreply.github.com>
Date: Wed, 24 Apr 2024 12:26:09 -0700
Subject: [PATCH 105/842] Add BYTE_STREAM_SPLIT support to Parquet (#15311)

Closes #15226. Part of #13501.  Adds support for reading and writing `BYTE_STREAM_SPLIT` encoded Parquet data. Includes a "microkernel" version like those introduced by #15159.

Authors:
  - Ed Seidl (https://github.com/etseidl)
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: https://github.com/rapidsai/cudf/pull/15311
---
 cpp/include/cudf/io/types.hpp            |   1 +
 cpp/src/io/parquet/decode_fixed.cu       | 229 ++++++++++++++++++++++-
 cpp/src/io/parquet/page_data.cu          | 198 +++++++++++++++++++-
 cpp/src/io/parquet/page_data.cuh         |  76 ++++++++
 cpp/src/io/parquet/page_decode.cuh       |   1 +
 cpp/src/io/parquet/page_delta_decode.cu  |   6 +-
 cpp/src/io/parquet/page_enc.cu           | 137 ++++++++------
 cpp/src/io/parquet/page_hdr.cu           |  26 ++-
 cpp/src/io/parquet/page_string_decode.cu |  16 +-
 cpp/src/io/parquet/parquet_gpu.hpp       |  84 +++++++--
 cpp/src/io/parquet/reader_impl.cpp       |  22 +++
 cpp/src/io/parquet/writer_impl.cu        |  18 +-
 cpp/tests/io/parquet_common.cpp          |   1 +
 cpp/tests/io/parquet_writer_test.cpp     | 159 +++++++++++++++-
 14 files changed, 876 insertions(+), 98 deletions(-)

diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp
index 65d4a4417f0..b3dea0ab280 100644
--- a/cpp/include/cudf/io/types.hpp
+++ b/cpp/include/cudf/io/types.hpp
@@ -113,6 +113,7 @@ enum class column_encoding {
                             ///< valid for BYTE_ARRAY columns)
   DELTA_BYTE_ARRAY,         ///< Use DELTA_BYTE_ARRAY encoding (only valid for
                             ///< BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY columns)
+  BYTE_STREAM_SPLIT,        ///< Use BYTE_STREAM_SPLIT encoding (valid for all fixed width types)
   // ORC encodings:
   DIRECT,         ///< Use DIRECT encoding
   DIRECT_V2,      ///< Use DIRECT_V2 encoding
diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu
index 945a7dcb4c6..f3332a23992 100644
--- a/cpp/src/io/parquet/decode_fixed.cu
+++ b/cpp/src/io/parquet/decode_fixed.cu
@@ -225,6 +225,96 @@ __device__ inline void gpuDecodeValues(
   }
 }
 
+template <typename state_buf>
+__device__ inline void gpuDecodeSplitValues(page_state_s* s,
+                                            state_buf* const sb,
+                                            int start,
+                                            int end)
+{
+  using cudf::detail::warp_size;
+  constexpr int num_warps      = decode_block_size / warp_size;
+  constexpr int max_batch_size = num_warps * warp_size;
+
+  auto const t = threadIdx.x;
+
+  PageNestingDecodeInfo* nesting_info_base = s->nesting_info;
+  int const dtype                          = s->col.physical_type;
+  auto const data_len                      = thrust::distance(s->data_start, s->data_end);
+  auto const num_values                    = data_len / s->dtype_len_in;
+
+  // decode values
+  int pos = start;
+  while (pos < end) {
+    int const batch_size = min(max_batch_size, end - pos);
+
+    int const target_pos = pos + batch_size;
+    int const src_pos    = pos + t;
+
+    // the position in the output column/buffer
+    int dst_pos = sb->nz_idx[rolling_index<state_buf::nz_buf_size>(src_pos)] - s->first_row;
+
+    // target_pos will always be properly bounded by num_rows, but dst_pos may be negative (values
+    // before first_row) in the flat hierarchy case.
+    if (src_pos < target_pos && dst_pos >= 0) {
+      // nesting level that is storing actual leaf values
+      int const leaf_level_index = s->col.max_nesting_depth - 1;
+
+      uint32_t dtype_len = s->dtype_len;
+      uint8_t const* src = s->data_start + src_pos;
+      uint8_t* dst =
+        nesting_info_base[leaf_level_index].data_out + static_cast<size_t>(dst_pos) * dtype_len;
+      auto const is_decimal =
+        s->col.logical_type.has_value() and s->col.logical_type->type == LogicalType::DECIMAL;
+
+      // Note: non-decimal FIXED_LEN_BYTE_ARRAY will be handled in the string reader
+      if (is_decimal) {
+        switch (dtype) {
+          case INT32: gpuOutputByteStreamSplit<int32_t>(dst, src, num_values); break;
+          case INT64: gpuOutputByteStreamSplit<int64_t>(dst, src, num_values); break;
+          case FIXED_LEN_BYTE_ARRAY:
+            if (s->dtype_len_in <= sizeof(int32_t)) {
+              gpuOutputSplitFixedLenByteArrayAsInt(
+                reinterpret_cast<int32_t*>(dst), src, num_values, s->dtype_len_in);
+              break;
+            } else if (s->dtype_len_in <= sizeof(int64_t)) {
+              gpuOutputSplitFixedLenByteArrayAsInt(
+                reinterpret_cast<int64_t*>(dst), src, num_values, s->dtype_len_in);
+              break;
+            } else if (s->dtype_len_in <= sizeof(__int128_t)) {
+              gpuOutputSplitFixedLenByteArrayAsInt(
+                reinterpret_cast<__int128_t*>(dst), src, num_values, s->dtype_len_in);
+              break;
+            }
+            // unsupported decimal precision
+            [[fallthrough]];
+
+          default: s->set_error_code(decode_error::UNSUPPORTED_ENCODING);
+        }
+      } else if (dtype_len == 8) {
+        if (s->dtype_len_in == 4) {
+          // Reading INT32 TIME_MILLIS into 64-bit DURATION_MILLISECONDS
+          // TIME_MILLIS is the only duration type stored as int32:
+          // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#deprecated-time-convertedtype
+          gpuOutputByteStreamSplit<int32_t>(dst, src, num_values);
+          // zero out most significant bytes
+          memset(dst + 4, 0, 4);
+        } else if (s->ts_scale) {
+          gpuOutputSplitInt64Timestamp(
+            reinterpret_cast<int64_t*>(dst), src, num_values, s->ts_scale);
+        } else {
+          gpuOutputByteStreamSplit<int64_t>(dst, src, num_values);
+        }
+      } else if (dtype_len == 4) {
+        gpuOutputByteStreamSplit<int32_t>(dst, src, num_values);
+      } else {
+        s->set_error_code(decode_error::UNSUPPORTED_ENCODING);
+      }
+    }
+
+    pos += batch_size;
+  }
+}
+
 // is the page marked nullable or not
 __device__ inline bool is_nullable(page_state_s* s)
 {
@@ -495,6 +585,123 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size)
   if (t == 0 and s->error != 0) { set_error(s->error, error_code); }
 }
 
+/**
+ * @brief Kernel for computing fixed width non dictionary column data stored in the pages
+ *
+ * This function will write the page data and the page data's validity to the
+ * output specified in the page's column chunk. If necessary, additional
+ * conversion will be performed to translate from the Parquet datatype to
+ * desired output datatype.
+ *
+ * @param pages List of pages
+ * @param chunks List of column chunks
+ * @param min_row Row index to start reading at
+ * @param num_rows Maximum number of rows to read
+ * @param error_code Error code to set if an error is encountered
+ */
+template <typename level_t>
+CUDF_KERNEL void __launch_bounds__(decode_block_size)
+  gpuDecodeSplitPageDataFlat(PageInfo* pages,
+                             device_span<ColumnChunkDesc const> chunks,
+                             size_t min_row,
+                             size_t num_rows,
+                             kernel_error::pointer error_code)
+{
+  __shared__ __align__(16) page_state_s state_g;
+  __shared__ __align__(16) page_state_buffers_s<rolling_buf_size,  // size of nz_idx buffer
+                                                1,                 // unused in this kernel
+                                                1>                 // unused in this kernel
+    state_buffers;
+
+  page_state_s* const s = &state_g;
+  auto* const sb        = &state_buffers;
+  int const page_idx    = blockIdx.x;
+  int const t           = threadIdx.x;
+  PageInfo* pp          = &pages[page_idx];
+
+  if (!(BitAnd(pages[page_idx].kernel_mask, decode_kernel_mask::BYTE_STREAM_SPLIT_FLAT))) {
+    return;
+  }
+
+  // must come after the kernel mask check
+  [[maybe_unused]] null_count_back_copier _{s, t};
+
+  if (!setupLocalPageInfo(s,
+                          pp,
+                          chunks,
+                          min_row,
+                          num_rows,
+                          mask_filter{decode_kernel_mask::BYTE_STREAM_SPLIT_FLAT},
+                          page_processing_stage::DECODE)) {
+    return;
+  }
+
+  // the level stream decoders
+  __shared__ rle_run<level_t> def_runs[rle_run_buffer_size];
+  rle_stream<level_t, decode_block_size, rolling_buf_size> def_decoder{def_runs};
+
+  // if we have no work to do (eg, in a skip_rows/num_rows case) in this page.
+  if (s->num_rows == 0) { return; }
+
+  bool const nullable            = is_nullable(s);
+  bool const nullable_with_nulls = nullable && has_nulls(s);
+
+  // initialize the stream decoders (requires values computed in setupLocalPageInfo)
+  level_t* const def = reinterpret_cast<level_t*>(pp->lvl_decode_buf[level_type::DEFINITION]);
+  if (nullable_with_nulls) {
+    def_decoder.init(s->col.level_bits[level_type::DEFINITION],
+                     s->abs_lvl_start[level_type::DEFINITION],
+                     s->abs_lvl_end[level_type::DEFINITION],
+                     def,
+                     s->page.num_input_values);
+  }
+  __syncthreads();
+
+  // We use two counters in the loop below: processed_count and valid_count.
+  // - processed_count: number of rows out of num_input_values that we have decoded so far.
+  //   the definition stream returns the number of total rows it has processed in each call
+  //   to decode_next and we accumulate in process_count.
+  // - valid_count: number of non-null rows we have decoded so far. In each iteration of the
+  //   loop below, we look at the number of valid items (which could be all for non-nullable),
+  //   and valid_count is that running count.
+  int processed_count = 0;
+  int valid_count     = 0;
+  // the core loop. decode batches of level stream data using rle_stream objects
+  // and pass the results to gpuDecodeValues
+  while (s->error == 0 && processed_count < s->page.num_input_values) {
+    int next_valid_count;
+
+    // only need to process definition levels if this is a nullable column
+    if (nullable) {
+      if (nullable_with_nulls) {
+        processed_count += def_decoder.decode_next(t);
+        __syncthreads();
+      } else {
+        processed_count += min(rolling_buf_size, s->page.num_input_values - processed_count);
+      }
+
+      next_valid_count = gpuUpdateValidityOffsetsAndRowIndicesFlat<true, level_t>(
+        processed_count, s, sb, def, t, nullable_with_nulls);
+    }
+    // if we wanted to split off the skip_rows/num_rows case into a separate kernel, we could skip
+    // this function call entirely since all it will ever generate is a mapping of (i -> i) for
+    // nz_idx.  gpuDecodeValues would be the only work that happens.
+    else {
+      processed_count += min(rolling_buf_size, s->page.num_input_values - processed_count);
+      next_valid_count = gpuUpdateValidityOffsetsAndRowIndicesFlat<false, level_t>(
+        processed_count, s, sb, nullptr, t, false);
+    }
+    __syncthreads();
+
+    // decode the values themselves
+    gpuDecodeSplitValues(s, sb, valid_count, next_valid_count);
+    __syncthreads();
+
+    valid_count = next_valid_count;
+  }
+  if (t == 0 and s->error != 0) { set_error(s->error, error_code); }
+}
+
 }  // anonymous namespace
 
 void __host__ DecodePageDataFixed(cudf::detail::hostdevice_span<PageInfo> pages,
@@ -528,7 +735,7 @@ void __host__ DecodePageDataFixedDict(cudf::detail::hostdevice_span<PageInfo> pa
   //  dim3 dim_block(decode_block_size, 1); // decode_block_size = 128 threads per block
   // 1 full warp, and 1 warp of 1 thread
   dim3 dim_block(decode_block_size, 1);  // decode_block_size = 128 threads per block
-  dim3 dim_grid(pages.size(), 1);        // 1 thread block per pags => # blocks
+  dim3 dim_grid(pages.size(), 1);        // 1 thread block per page => # blocks
 
   if (level_type_size == 1) {
     gpuDecodePageDataFixedDict<uint8_t><<<dim_grid, dim_block, 0, stream.value()>>>(
@@ -539,4 +746,24 @@ void __host__ DecodePageDataFixedDict(cudf::detail::hostdevice_span<PageInfo> pa
   }
 }
 
+void __host__ DecodeSplitPageDataFlat(cudf::detail::hostdevice_span<PageInfo> pages,
+                                      cudf::detail::hostdevice_span<ColumnChunkDesc const> chunks,
+                                      size_t num_rows,
+                                      size_t min_row,
+                                      int level_type_size,
+                                      kernel_error::pointer error_code,
+                                      rmm::cuda_stream_view stream)
+{
+  dim3 dim_block(decode_block_size, 1);  // decode_block_size = 128 threads per block
+  dim3 dim_grid(pages.size(), 1);        // 1 thread block per page => # blocks
+
+  if (level_type_size == 1) {
+    gpuDecodeSplitPageDataFlat<uint8_t><<<dim_grid, dim_block, 0, stream.value()>>>(
+      pages.device_ptr(), chunks, min_row, num_rows, error_code);
+  } else {
+    gpuDecodeSplitPageDataFlat<uint16_t><<<dim_grid, dim_block, 0, stream.value()>>>(
+      pages.device_ptr(), chunks, min_row, num_rows, error_code);
+  }
+}
+
 }  // namespace cudf::io::parquet::detail
diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu
index 62ce5b9f9a5..7207173b82f 100644
--- a/cpp/src/io/parquet/page_data.cu
+++ b/cpp/src/io/parquet/page_data.cu
@@ -28,6 +28,177 @@ namespace {
 constexpr int decode_block_size = 128;
 constexpr int rolling_buf_size  = decode_block_size * 2;
 
+/**
+ * @brief Kernel for computing the BYTE_STREAM_SPLIT column data stored in the pages
+ *
+ * This is basically the PLAIN decoder, but with a pared down set of supported data
+ * types, and using output functions that piece together the individual streams.
+ * Supported physical types include INT32, INT64, FLOAT, DOUBLE and FIXED_LEN_BYTE_ARRAY.
+ * The latter is currently only used for large decimals. The Parquet specification also
+ * has FLOAT16 and UUID types that are currently not supported. FIXED_LEN_BYTE_ARRAY data
+ * that lacks a `LogicalType` annotation will be handled by the string decoder.
+ *
+ * @param pages List of pages
+ * @param chunks List of column chunks
+ * @param min_row Row index to start reading at
+ * @param num_rows Maximum number of rows to read
+ * @param error_code Error code to set if an error is encountered
+ */
+template <int lvl_buf_size, typename level_t>
+CUDF_KERNEL void __launch_bounds__(decode_block_size)
+  gpuDecodeSplitPageData(PageInfo* pages,
+                         device_span<ColumnChunkDesc const> chunks,
+                         size_t min_row,
+                         size_t num_rows,
+                         kernel_error::pointer error_code)
+{
+  using cudf::detail::warp_size;
+  __shared__ __align__(16) page_state_s state_g;
+  __shared__ __align__(16)
+    page_state_buffers_s<rolling_buf_size, rolling_buf_size, rolling_buf_size>
+      state_buffers;
+
+  page_state_s* const s = &state_g;
+  auto* const sb        = &state_buffers;
+  int page_idx          = blockIdx.x;
+  int t                 = threadIdx.x;
+  [[maybe_unused]] null_count_back_copier _{s, t};
+
+  if (!setupLocalPageInfo(s,
+                          &pages[page_idx],
+                          chunks,
+                          min_row,
+                          num_rows,
+                          mask_filter{decode_kernel_mask::BYTE_STREAM_SPLIT},
+                          page_processing_stage::DECODE)) {
+    return;
+  }
+
+  bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0;
+
+  auto const data_len    = thrust::distance(s->data_start, s->data_end);
+  auto const num_values  = data_len / s->dtype_len_in;
+  auto const out_thread0 = warp_size;
+
+  PageNestingDecodeInfo* nesting_info_base = s->nesting_info;
+
+  __shared__ level_t rep[rolling_buf_size];  // circular buffer of repetition level values
+  __shared__ level_t def[rolling_buf_size];  // circular buffer of definition level values
+
+  // skipped_leaf_values will always be 0 for flat hierarchies.
+  uint32_t skipped_leaf_values = s->page.skipped_leaf_values;
+  while (s->error == 0 &&
+         (s->input_value_count < s->num_input_values || s->src_pos < s->nz_count)) {
+    int target_pos;
+    int src_pos = s->src_pos;
+
+    if (t < out_thread0) {
+      target_pos = min(src_pos + 2 * (decode_block_size - out_thread0),
+                       s->nz_count + (decode_block_size - out_thread0));
+    } else {
+      target_pos = min(s->nz_count, src_pos + decode_block_size - out_thread0);
+    }
+    // this needs to be here to prevent warp 1 modifying src_pos before all threads have read it
+    __syncthreads();
+
+    if (t < warp_size) {
+      // decode repetition and definition levels.
+      // - update validity vectors
+      // - updates offsets (for nested columns)
+      // - produces non-NULL value indices in s->nz_idx for subsequent decoding
+      gpuDecodeLevels<lvl_buf_size, level_t>(s, sb, target_pos, rep, def, t);
+    } else {
+      // WARP1..WARP3: Decode values
+      int const dtype = s->col.physical_type;
+      src_pos += t - out_thread0;
+
+      // the position in the output column/buffer
+      int dst_pos = sb->nz_idx[rolling_index<rolling_buf_size>(src_pos)];
+
+      // for the flat hierarchy case we will be reading from the beginning of the value stream,
+      // regardless of the value of first_row. so adjust our destination offset accordingly.
+      // example:
+      // - user has passed skip_rows = 2, so our first_row to output is 2
+      // - the row values we get from nz_idx will be
+      //   0, 1, 2, 3, 4 ....
+      // - by shifting these values by first_row, the sequence becomes
+      //   -2, -1, 0, 1, 2 ...
+      // - so we will end up ignoring the first two input rows, and input rows 2..n will
+      //   get written to the output starting at position 0.
+      //
+      if (!has_repetition) { dst_pos -= s->first_row; }
+
+      // target_pos will always be properly bounded by num_rows, but dst_pos may be negative (values
+      // before first_row) in the flat hierarchy case.
+      if (src_pos < target_pos && dst_pos >= 0) {
+        // src_pos represents the logical row position we want to read from. But in the case of
+        // nested hierarchies, there is no 1:1 mapping of rows to values.  So our true read position
+        // has to take into account the # of values we have to skip in the page to get to the
+        // desired logical row.  For flat hierarchies, skipped_leaf_values will always be 0.
+        uint32_t val_src_pos = src_pos + skipped_leaf_values;
+
+        // nesting level that is storing actual leaf values
+        int leaf_level_index = s->col.max_nesting_depth - 1;
+
+        uint32_t dtype_len = s->dtype_len;
+        uint8_t const* src = s->data_start + val_src_pos;
+        uint8_t* dst =
+          nesting_info_base[leaf_level_index].data_out + static_cast<size_t>(dst_pos) * dtype_len;
+        auto const is_decimal =
+          s->col.logical_type.has_value() and s->col.logical_type->type == LogicalType::DECIMAL;
+
+        // Note: non-decimal FIXED_LEN_BYTE_ARRAY will be handled in the string reader
+        if (is_decimal) {
+          switch (dtype) {
+            case INT32: gpuOutputByteStreamSplit<int32_t>(dst, src, num_values); break;
+            case INT64: gpuOutputByteStreamSplit<int64_t>(dst, src, num_values); break;
+            case FIXED_LEN_BYTE_ARRAY:
+              if (s->dtype_len_in <= sizeof(int32_t)) {
+                gpuOutputSplitFixedLenByteArrayAsInt(
+                  reinterpret_cast<int32_t*>(dst), src, num_values, s->dtype_len_in);
+                break;
+              } else if (s->dtype_len_in <= sizeof(int64_t)) {
+                gpuOutputSplitFixedLenByteArrayAsInt(
+                  reinterpret_cast<int64_t*>(dst), src, num_values, s->dtype_len_in);
+                break;
+              } else if (s->dtype_len_in <= sizeof(__int128_t)) {
+                gpuOutputSplitFixedLenByteArrayAsInt(
+                  reinterpret_cast<__int128_t*>(dst), src, num_values, s->dtype_len_in);
+                break;
+              }
+              // unsupported decimal precision
+              [[fallthrough]];
+
+            default: s->set_error_code(decode_error::UNSUPPORTED_ENCODING);
+          }
+        } else if (dtype_len == 8) {
+          if (s->dtype_len_in == 4) {
+            // Reading INT32 TIME_MILLIS into 64-bit DURATION_MILLISECONDS
+            // TIME_MILLIS is the only duration type stored as int32:
+            // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#deprecated-time-convertedtype
+            gpuOutputByteStreamSplit<int32_t>(dst, src, num_values);
+            // zero out most significant bytes
+            memset(dst + 4, 0, 4);
+          } else if (s->ts_scale) {
+            gpuOutputSplitInt64Timestamp(
+              reinterpret_cast<int64_t*>(dst), src, num_values, s->ts_scale);
+          } else {
+            gpuOutputByteStreamSplit<int64_t>(dst, src, num_values);
+          }
+        } else if (dtype_len == 4) {
+          gpuOutputByteStreamSplit<int32_t>(dst, src, num_values);
+        } else {
+          s->set_error_code(decode_error::UNSUPPORTED_ENCODING);
+        }
+      }
+
+      if (t == out_thread0) { s->src_pos = target_pos; }
+    }
+    __syncthreads();
+  }
+  if (t == 0 and s->error != 0) { set_error(s->error, error_code); }
+}
+
 /**
  * @brief Kernel for computing the column data stored in the pages
  *
@@ -145,7 +316,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size)
       // - the row values we get from nz_idx will be
       //   0, 1, 2, 3, 4 ....
       // - by shifting these values by first_row, the sequence becomes
-      //   -1, -2, 0, 1, 2 ...
+      //   -2, -1, 0, 1, 2 ...
       // - so we will end up ignoring the first two input rows, and input rows 2..n will
       //   get written to the output starting at position 0.
       //
@@ -267,4 +438,29 @@ void __host__ DecodePageData(cudf::detail::hostdevice_span<PageInfo> pages,
   }
 }
 
+/**
+ * @copydoc cudf::io::parquet::detail::DecodePageData
+ */
+void __host__ DecodeSplitPageData(cudf::detail::hostdevice_span<PageInfo> pages,
+                                  cudf::detail::hostdevice_span<ColumnChunkDesc const> chunks,
+                                  size_t num_rows,
+                                  size_t min_row,
+                                  int level_type_size,
+                                  kernel_error::pointer error_code,
+                                  rmm::cuda_stream_view stream)
+{
+  CUDF_EXPECTS(pages.size() > 0, "There is no page to decode");
+
+  dim3 dim_block(decode_block_size, 1);
+  dim3 dim_grid(pages.size(), 1);  // 1 threadblock per page
+
+  if (level_type_size == 1) {
+    gpuDecodeSplitPageData<rolling_buf_size, uint8_t><<<dim_grid, dim_block, 0, stream.value()>>>(
+      pages.device_ptr(), chunks, min_row, num_rows, error_code);
+  } else {
+    gpuDecodeSplitPageData<rolling_buf_size, uint16_t><<<dim_grid, dim_block, 0, stream.value()>>>(
+      pages.device_ptr(), chunks, min_row, num_rows, error_code);
+  }
+}
+
 }  // namespace cudf::io::parquet::detail
diff --git a/cpp/src/io/parquet/page_data.cuh b/cpp/src/io/parquet/page_data.cuh
index df8d801d66c..f182747650e 100644
--- a/cpp/src/io/parquet/page_data.cuh
+++ b/cpp/src/io/parquet/page_data.cuh
@@ -396,4 +396,80 @@ inline __device__ void gpuOutputGeneric(
     }
   }
 }
+
+/**
+ * Output a BYTE_STREAM_SPLIT value of type `T`.
+ *
+ * Data is encoded as N == sizeof(T) streams of length M, forming an NxM sized matrix.
+ * Rows are streams, columns are individual values.
+ *
+ * @param dst pointer to output data
+ * @param src pointer to first byte of input data in stream 0
+ * @param stride number of bytes per input stream (M)
+ */
+template <typename T>
+__device__ inline void gpuOutputByteStreamSplit(uint8_t* dst, uint8_t const* src, size_type stride)
+{
+  for (int i = 0; i < sizeof(T); i++) {
+    dst[i] = src[i * stride];
+  }
+}
+
+/**
+ * Output a 64-bit BYTE_STREAM_SPLIT encoded timestamp.
+ *
+ * Data is encoded as N streams of length M, forming an NxM sized matrix. Rows are streams,
+ * columns are individual values.
+ *
+ * @param dst pointer to output data
+ * @param src pointer to first byte of input data in stream 0
+ * @param stride number of bytes per input stream (M)
+ * @param ts_scale timestamp scale
+ */
+inline __device__ void gpuOutputSplitInt64Timestamp(int64_t* dst,
+                                                    uint8_t const* src,
+                                                    size_type stride,
+                                                    int32_t ts_scale)
+{
+  gpuOutputByteStreamSplit<int64_t>(reinterpret_cast<uint8_t*>(dst), src, stride);
+  if (ts_scale < 0) {
+    // round towards negative infinity
+    int sign = (*dst < 0);
+    *dst     = ((*dst + sign) / -ts_scale) + sign;
+  } else {
+    *dst = *dst * ts_scale;
+  }
+}
+
+/**
+ * Output a BYTE_STREAM_SPLIT encoded decimal as an integer type.
+ *
+ * Data is encoded as N streams of length M, forming an NxM sized matrix. Rows are streams,
+ * columns are individual values.
+ *
+ * @param dst pointer to output data
+ * @param src pointer to first byte of input data in stream 0
+ * @param stride number of bytes per input stream (M)
+ * @param dtype_len_in length of the `FIXED_LEN_BYTE_ARRAY` used to represent the decimal
+ */
+template <typename T>
+__device__ void gpuOutputSplitFixedLenByteArrayAsInt(T* dst,
+                                                     uint8_t const* src,
+                                                     size_type stride,
+                                                     uint32_t dtype_len_in)
+{
+  T unscaled = 0;
+  // fixed_len_byte_array decimals are big endian
+  for (unsigned int i = 0; i < dtype_len_in; i++) {
+    unscaled = (unscaled << 8) | src[i * stride];
+  }
+  // Shift the unscaled value up and back down when it isn't all 8 bytes,
+  // which sign extend the value for correctly representing negative numbers.
+  if (dtype_len_in < sizeof(T)) {
+    unscaled <<= (sizeof(T) - dtype_len_in) * 8;
+    unscaled >>= (sizeof(T) - dtype_len_in) * 8;
+  }
+  *dst = unscaled;
+}
+
 }  // namespace cudf::io::parquet::detail
diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh
index 83bf7fb0d73..0c139fced24 100644
--- a/cpp/src/io/parquet/page_decode.cuh
+++ b/cpp/src/io/parquet/page_decode.cuh
@@ -1316,6 +1316,7 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s,
           }
           break;
         case Encoding::PLAIN:
+        case Encoding::BYTE_STREAM_SPLIT:
           s->dict_size = static_cast<int32_t>(end - cur);
           s->dict_val  = 0;
           if (s->col.physical_type == BOOLEAN) { s->dict_run = s->dict_size * 2 + 1; }
diff --git a/cpp/src/io/parquet/page_delta_decode.cu b/cpp/src/io/parquet/page_delta_decode.cu
index 7c0092c6185..da1bbaebd73 100644
--- a/cpp/src/io/parquet/page_delta_decode.cu
+++ b/cpp/src/io/parquet/page_delta_decode.cu
@@ -315,7 +315,7 @@ CUDF_KERNEL void __launch_bounds__(96)
   using cudf::detail::warp_size;
   __shared__ __align__(16) delta_binary_decoder db_state;
   __shared__ __align__(16) page_state_s state_g;
-  __shared__ __align__(16) page_state_buffers_s<delta_rolling_buf_size, 0, 0> state_buffers;
+  __shared__ __align__(16) page_state_buffers_s<delta_rolling_buf_size, 1, 1> state_buffers;
 
   page_state_s* const s = &state_g;
   auto* const sb        = &state_buffers;
@@ -440,7 +440,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size)
   using cudf::detail::warp_size;
   __shared__ __align__(16) delta_byte_array_decoder db_state;
   __shared__ __align__(16) page_state_s state_g;
-  __shared__ __align__(16) page_state_buffers_s<delta_rolling_buf_size, 0, 0> state_buffers;
+  __shared__ __align__(16) page_state_buffers_s<delta_rolling_buf_size, 1, 1> state_buffers;
 
   page_state_s* const s = &state_g;
   auto* const sb        = &state_buffers;
@@ -605,7 +605,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size)
   using cudf::detail::warp_size;
   __shared__ __align__(16) delta_binary_decoder db_state;
   __shared__ __align__(16) page_state_s state_g;
-  __shared__ __align__(16) page_state_buffers_s<delta_rolling_buf_size, 0, 0> state_buffers;
+  __shared__ __align__(16) page_state_buffers_s<delta_rolling_buf_size, 1, 1> state_buffers;
   __shared__ __align__(8) uint8_t const* page_string_data;
   __shared__ size_t string_offset;
 
diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu
index 2db6dc4270d..227f13db60e 100644
--- a/cpp/src/io/parquet/page_enc.cu
+++ b/cpp/src/io/parquet/page_enc.cu
@@ -15,6 +15,7 @@
  */
 
 #include "delta_enc.cuh"
+#include "io/parquet/parquet_gpu.hpp"
 #include "io/utilities/block_utils.cuh"
 #include "page_string_utils.cuh"
 #include "parquet_gpu.cuh"
@@ -238,8 +239,10 @@ void __device__ calculate_frag_size(frag_init_state_s* const s, int t)
 Encoding __device__ determine_encoding(PageType page_type,
                                        Type physical_type,
                                        bool use_dictionary,
-                                       bool write_v2_headers)
+                                       bool write_v2_headers,
+                                       bool is_split_stream)
 {
+  if (is_split_stream) { return Encoding::BYTE_STREAM_SPLIT; }
   // NOTE: For dictionary encoding, parquet v2 recommends using PLAIN in dictionary page and
   // RLE_DICTIONARY in data page, but parquet v1 uses PLAIN_DICTIONARY in both dictionary and
   // data pages (actual encoding is identical).
@@ -514,6 +517,7 @@ __device__ encode_kernel_mask data_encoding_for_col(EncColumnChunk const* chunk,
       case column_encoding::DELTA_BINARY_PACKED: return encode_kernel_mask::DELTA_BINARY;
       case column_encoding::DELTA_LENGTH_BYTE_ARRAY: return encode_kernel_mask::DELTA_LENGTH_BA;
       case column_encoding::DELTA_BYTE_ARRAY: return encode_kernel_mask::DELTA_BYTE_ARRAY;
+      case column_encoding::BYTE_STREAM_SPLIT: return encode_kernel_mask::BYTE_STREAM_SPLIT;
     }
   }
 
@@ -1608,6 +1612,19 @@ __device__ void finish_page_encode(state_buf* s,
   }
 }
 
+// Encode a fixed-width data type int `dst`. `dst` points to the first byte
+// of the result. `stride` is 1 for PLAIN encoding and num_values for
+// BYTE_STREAM_SPLIT.
+template <typename T>
+__device__ inline void encode_value(uint8_t* dst, T src, size_type stride)
+{
+  T v = src;
+  for (int i = 0; i < sizeof(T); i++) {
+    dst[i * stride] = v;
+    v >>= 8;
+  }
+}
+
 // PLAIN page data encoder
 // blockDim(128, 1, 1)
 template <int block_size>
@@ -1616,7 +1633,8 @@ CUDF_KERNEL void __launch_bounds__(block_size, 8)
                  device_span<device_span<uint8_t const>> comp_in,
                  device_span<device_span<uint8_t>> comp_out,
                  device_span<compression_result> comp_results,
-                 bool write_v2_headers)
+                 bool write_v2_headers,
+                 bool is_split_stream)
 {
   __shared__ __align__(8) page_enc_state_s<0> state_g;
   using block_scan = cub::BlockScan<uint32_t, block_size>;
@@ -1636,7 +1654,9 @@ CUDF_KERNEL void __launch_bounds__(block_size, 8)
   }
   __syncthreads();
 
-  if (BitAnd(s->page.kernel_mask, encode_kernel_mask::PLAIN) == 0) { return; }
+  auto const allowed_mask =
+    is_split_stream ? encode_kernel_mask::BYTE_STREAM_SPLIT : encode_kernel_mask::PLAIN;
+  if (BitAnd(s->page.kernel_mask, allowed_mask) == 0) { return; }
 
   // Encode data values
   __syncthreads();
@@ -1650,18 +1670,20 @@ CUDF_KERNEL void __launch_bounds__(block_size, 8)
   }();
 
   if (t == 0) {
-    uint8_t* dst   = s->cur;
-    s->rle_run     = 0;
-    s->rle_pos     = 0;
-    s->rle_numvals = 0;
-    s->rle_out     = dst;
-    s->page.encoding =
-      determine_encoding(s->page.page_type, physical_type, s->ck.use_dictionary, write_v2_headers);
+    uint8_t* dst     = s->cur;
+    s->rle_run       = 0;
+    s->rle_pos       = 0;
+    s->rle_numvals   = 0;
+    s->rle_out       = dst;
+    s->page.encoding = determine_encoding(
+      s->page.page_type, physical_type, s->ck.use_dictionary, write_v2_headers, is_split_stream);
     s->page_start_val  = row_to_value_idx(s->page.start_row, s->col);
     s->chunk_start_val = row_to_value_idx(s->ck.start_row, s->col);
   }
   __syncthreads();
 
+  auto const stride = is_split_stream ? s->page.num_valid : 1;
+
   for (uint32_t cur_val_idx = 0; cur_val_idx < s->page.num_leaf_values;) {
     uint32_t nvals = min(s->page.num_leaf_values - cur_val_idx, block_size);
     uint32_t len, pos;
@@ -1708,6 +1730,13 @@ CUDF_KERNEL void __launch_bounds__(block_size, 8)
     uint32_t total_len = 0;
     block_scan(scan_storage).ExclusiveSum(len, pos, total_len);
     __syncthreads();
+
+    // if BYTE_STREAM_SPLIT, then translate byte positions to indexes
+    if (is_split_stream) {
+      pos /= dtype_len_out;
+      total_len /= dtype_len_out;
+    }
+
     if (t == 0) { s->cur = dst + total_len; }
     if (is_valid) {
       switch (physical_type) {
@@ -1725,13 +1754,11 @@ CUDF_KERNEL void __launch_bounds__(block_size, 8)
             }
           }();
 
-          dst[pos + 0] = v;
-          dst[pos + 1] = v >> 8;
-          dst[pos + 2] = v >> 16;
-          dst[pos + 3] = v >> 24;
+          encode_value(dst + pos, v, stride);
         } break;
+        case DOUBLE:
         case INT64: {
-          int64_t v        = s->col.leaf_column->element<int64_t>(val_idx);
+          auto v           = s->col.leaf_column->element<int64_t>(val_idx);
           int32_t ts_scale = s->col.ts_scale;
           if (ts_scale != 0) {
             if (ts_scale < 0) {
@@ -1740,16 +1767,10 @@ CUDF_KERNEL void __launch_bounds__(block_size, 8)
               v *= ts_scale;
             }
           }
-          dst[pos + 0] = v;
-          dst[pos + 1] = v >> 8;
-          dst[pos + 2] = v >> 16;
-          dst[pos + 3] = v >> 24;
-          dst[pos + 4] = v >> 32;
-          dst[pos + 5] = v >> 40;
-          dst[pos + 6] = v >> 48;
-          dst[pos + 7] = v >> 56;
+          encode_value(dst + pos, v, stride);
         } break;
         case INT96: {
+          // only PLAIN encoding is supported
           int64_t v        = s->col.leaf_column->element<int64_t>(val_idx);
           int32_t ts_scale = s->col.ts_scale;
           if (ts_scale != 0) {
@@ -1776,27 +1797,14 @@ CUDF_KERNEL void __launch_bounds__(block_size, 8)
           }();
 
           // the 12 bytes of fixed length data.
-          v             = last_day_nanos.count();
-          dst[pos + 0]  = v;
-          dst[pos + 1]  = v >> 8;
-          dst[pos + 2]  = v >> 16;
-          dst[pos + 3]  = v >> 24;
-          dst[pos + 4]  = v >> 32;
-          dst[pos + 5]  = v >> 40;
-          dst[pos + 6]  = v >> 48;
-          dst[pos + 7]  = v >> 56;
-          uint32_t w    = julian_days.count();
-          dst[pos + 8]  = w;
-          dst[pos + 9]  = w >> 8;
-          dst[pos + 10] = w >> 16;
-          dst[pos + 11] = w >> 24;
+          v = last_day_nanos.count();
+          encode_value(dst + pos, v, 1);
+          uint32_t w = julian_days.count();
+          encode_value(dst + pos + 8, w, 1);
         } break;
 
-        case DOUBLE: {
-          auto v = s->col.leaf_column->element<double>(val_idx);
-          memcpy(dst + pos, &v, 8);
-        } break;
         case BYTE_ARRAY: {
+          // only PLAIN encoding is supported
           auto const bytes = [](cudf::type_id const type_id,
                                 column_device_view const* leaf_column,
                                 uint32_t const val_idx) -> void const* {
@@ -1810,11 +1818,8 @@ CUDF_KERNEL void __launch_bounds__(block_size, 8)
               default: CUDF_UNREACHABLE("invalid type id for byte array writing!");
             }
           }(type_id, s->col.leaf_column, val_idx);
-          uint32_t v   = len - 4;  // string length
-          dst[pos + 0] = v;
-          dst[pos + 1] = v >> 8;
-          dst[pos + 2] = v >> 16;
-          dst[pos + 3] = v >> 24;
+          uint32_t v = len - 4;  // string length
+          encode_value(dst + pos, v, 1);
           if (v != 0) memcpy(dst + pos + 4, bytes, v);
         } break;
         case FIXED_LEN_BYTE_ARRAY: {
@@ -1822,10 +1827,16 @@ CUDF_KERNEL void __launch_bounds__(block_size, 8)
             // When using FIXED_LEN_BYTE_ARRAY for decimals, the rep is encoded in big-endian
             auto const v = s->col.leaf_column->element<numeric::decimal128>(val_idx).value();
             auto const v_char_ptr = reinterpret_cast<char const*>(&v);
-            thrust::copy(thrust::seq,
-                         thrust::make_reverse_iterator(v_char_ptr + sizeof(v)),
-                         thrust::make_reverse_iterator(v_char_ptr),
-                         dst + pos);
+            if (is_split_stream) {
+              for (int i = dtype_len_out - 1; i >= 0; i--, pos += stride) {
+                dst[pos] = v_char_ptr[i];
+              }
+            } else {
+              thrust::copy(thrust::seq,
+                           thrust::make_reverse_iterator(v_char_ptr + sizeof(v)),
+                           thrust::make_reverse_iterator(v_char_ptr),
+                           dst + pos);
+            }
           }
         } break;
       }
@@ -1833,6 +1844,9 @@ CUDF_KERNEL void __launch_bounds__(block_size, 8)
     __syncthreads();
   }
 
+  // for BYTE_STREAM_SPLIT, s->cur now points to the end of the first stream.
+  // need it to point to the end of the Nth stream.
+  if (is_split_stream and t == 0) { s->cur += (dtype_len_out - 1) * s->page.num_valid; }
   finish_page_encode<block_size>(
     s, s->cur, pages, comp_in, comp_out, comp_results, write_v2_headers);
 }
@@ -1883,13 +1897,13 @@ CUDF_KERNEL void __launch_bounds__(block_size, 8)
                            ? s->ck.dict_rle_bits
                            : -1;
   if (t == 0) {
-    uint8_t* dst   = s->cur;
-    s->rle_run     = 0;
-    s->rle_pos     = 0;
-    s->rle_numvals = 0;
-    s->rle_out     = dst;
-    s->page.encoding =
-      determine_encoding(s->page.page_type, physical_type, s->ck.use_dictionary, write_v2_headers);
+    uint8_t* dst     = s->cur;
+    s->rle_run       = 0;
+    s->rle_pos       = 0;
+    s->rle_numvals   = 0;
+    s->rle_out       = dst;
+    s->page.encoding = determine_encoding(
+      s->page.page_type, physical_type, s->ck.use_dictionary, write_v2_headers, false);
     if (dict_bits >= 0 && physical_type != BOOLEAN) {
       dst[0]     = dict_bits;
       s->rle_out = dst + 1;
@@ -3417,7 +3431,14 @@ void EncodePages(device_span<EncPage> pages,
     gpuEncodePageLevels<encode_block_size><<<num_pages, encode_block_size, 0, strm.value()>>>(
       pages, write_v2_headers, encode_kernel_mask::PLAIN);
     gpuEncodePages<encode_block_size><<<num_pages, encode_block_size, 0, strm.value()>>>(
-      pages, comp_in, comp_out, comp_results, write_v2_headers);
+      pages, comp_in, comp_out, comp_results, write_v2_headers, false);
+  }
+  if (BitAnd(kernel_mask, encode_kernel_mask::BYTE_STREAM_SPLIT) != 0) {
+    auto const strm = streams[s_idx++];
+    gpuEncodePageLevels<encode_block_size><<<num_pages, encode_block_size, 0, strm.value()>>>(
+      pages, write_v2_headers, encode_kernel_mask::BYTE_STREAM_SPLIT);
+    gpuEncodePages<encode_block_size><<<num_pages, encode_block_size, 0, strm.value()>>>(
+      pages, comp_in, comp_out, comp_results, write_v2_headers, true);
   }
   if (BitAnd(kernel_mask, encode_kernel_mask::DELTA_BINARY) != 0) {
     auto const strm = streams[s_idx++];
diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu
index 07e03460ecb..6c6afde29e4 100644
--- a/cpp/src/io/parquet/page_hdr.cu
+++ b/cpp/src/io/parquet/page_hdr.cu
@@ -166,13 +166,7 @@ __device__ decode_kernel_mask kernel_mask_for_page(PageInfo const& page,
                                                    ColumnChunkDesc const& chunk)
 {
   if (page.flags & PAGEINFO_FLAGS_DICTIONARY) { return decode_kernel_mask::NONE; }
-  if (!is_string_col(chunk) && !is_nested(chunk) && !is_byte_array(chunk) && !is_boolean(chunk)) {
-    if (page.encoding == Encoding::PLAIN) {
-      return decode_kernel_mask::FIXED_WIDTH_NO_DICT;
-    } else if (page.encoding == Encoding::PLAIN_DICTIONARY) {
-      return decode_kernel_mask::FIXED_WIDTH_DICT;
-    }
-  }
+
   if (page.encoding == Encoding::DELTA_BINARY_PACKED) {
     return decode_kernel_mask::DELTA_BINARY;
   } else if (page.encoding == Encoding::DELTA_BYTE_ARRAY) {
@@ -180,10 +174,26 @@ __device__ decode_kernel_mask kernel_mask_for_page(PageInfo const& page,
   } else if (page.encoding == Encoding::DELTA_LENGTH_BYTE_ARRAY) {
     return decode_kernel_mask::DELTA_LENGTH_BA;
   } else if (is_string_col(chunk)) {
+    // check for string before byte_stream_split so FLBA will go to the right kernel
     return decode_kernel_mask::STRING;
   }
 
-  // non-string, non-delta
+  if (!is_nested(chunk) && !is_byte_array(chunk) && !is_boolean(chunk)) {
+    if (page.encoding == Encoding::PLAIN) {
+      return decode_kernel_mask::FIXED_WIDTH_NO_DICT;
+    } else if (page.encoding == Encoding::PLAIN_DICTIONARY ||
+               page.encoding == Encoding::RLE_DICTIONARY) {
+      return decode_kernel_mask::FIXED_WIDTH_DICT;
+    } else if (page.encoding == Encoding::BYTE_STREAM_SPLIT) {
+      return decode_kernel_mask::BYTE_STREAM_SPLIT_FLAT;
+    }
+  }
+
+  if (page.encoding == Encoding::BYTE_STREAM_SPLIT) {
+    return decode_kernel_mask::BYTE_STREAM_SPLIT;
+  }
+
+  // non-string, non-delta, non-split_stream
   return decode_kernel_mask::GENERAL;
 }
 
diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu
index 6f96d4dd1cf..5ba813f518f 100644
--- a/cpp/src/io/parquet/page_string_decode.cu
+++ b/cpp/src/io/parquet/page_string_decode.cu
@@ -1039,7 +1039,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size)
       // - the row values we get from nz_idx will be
       //   0, 1, 2, 3, 4 ....
       // - by shifting these values by first_row, the sequence becomes
-      //   -1, -2, 0, 1, 2 ...
+      //   -2, -1, 0, 1, 2 ...
       // - so we will end up ignoring the first two input rows, and input rows 2..n will
       //   get written to the output starting at position 0.
       //
@@ -1062,7 +1062,19 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size)
           // choose a character parallel string copy when the average string is longer than a warp
           auto const use_char_ll = warp_total / warp_size >= warp_size;
 
-          if (use_char_ll) {
+          if (s->page.encoding == Encoding::BYTE_STREAM_SPLIT) {
+            if (src_pos + i < target_pos && dst_pos >= 0) {
+              auto const stride = s->page.str_bytes / s->dtype_len_in;
+              auto offptr =
+                reinterpret_cast<int32_t*>(nesting_info_base[leaf_level_index].data_out) + dst_pos;
+              *offptr      = len;
+              auto str_ptr = nesting_info_base[leaf_level_index].string_out + offset;
+              for (int ii = 0; ii < s->dtype_len_in; ii++) {
+                str_ptr[ii] = s->data_start[src_pos + i + ii * stride];
+              }
+            }
+            __syncwarp();
+          } else if (use_char_ll) {
             __shared__ __align__(8) uint8_t const* pointers[warp_size];
             __shared__ __align__(4) size_type offsets[warp_size];
             __shared__ __align__(4) int dsts[warp_size];
diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp
index b165c60b2cf..c06fb63acda 100644
--- a/cpp/src/io/parquet/parquet_gpu.hpp
+++ b/cpp/src/io/parquet/parquet_gpu.hpp
@@ -54,7 +54,13 @@ constexpr int LEVEL_DECODE_BUF_SIZE = 2048;
 template <int rolling_size>
 constexpr int rolling_index(int index)
 {
-  return index % rolling_size;
+  // Cannot divide by 0. But `rolling_size` will be 0 for unused arrays, so this case will never
+  // actual be executed.
+  if constexpr (rolling_size == 0) {
+    return index;
+  } else {
+    return index % rolling_size;
+  }
 }
 
 // PARQUET-2261 allows for not writing the level histograms in certain cases.
@@ -81,7 +87,8 @@ constexpr bool is_supported_encoding(Encoding enc)
     case Encoding::RLE_DICTIONARY:
     case Encoding::DELTA_BINARY_PACKED:
     case Encoding::DELTA_LENGTH_BYTE_ARRAY:
-    case Encoding::DELTA_BYTE_ARRAY: return true;
+    case Encoding::DELTA_BYTE_ARRAY:
+    case Encoding::BYTE_STREAM_SPLIT: return true;
     default: return false;
   }
 }
@@ -199,14 +206,16 @@ enum level_type {
  * Used to control which decode kernels to run.
  */
 enum class decode_kernel_mask {
-  NONE                = 0,
-  GENERAL             = (1 << 0),  // Run catch-all decode kernel
-  STRING              = (1 << 1),  // Run decode kernel for string data
-  DELTA_BINARY        = (1 << 2),  // Run decode kernel for DELTA_BINARY_PACKED data
-  DELTA_BYTE_ARRAY    = (1 << 3),  // Run decode kernel for DELTA_BYTE_ARRAY encoded data
-  DELTA_LENGTH_BA     = (1 << 4),  // Run decode kernel for DELTA_LENGTH_BYTE_ARRAY encoded data
-  FIXED_WIDTH_NO_DICT = (1 << 5),  // Run decode kernel for fixed width non-dictionary pages
-  FIXED_WIDTH_DICT    = (1 << 6)   // Run decode kernel for fixed width dictionary pages
+  NONE                   = 0,
+  GENERAL                = (1 << 0),  // Run catch-all decode kernel
+  STRING                 = (1 << 1),  // Run decode kernel for string data
+  DELTA_BINARY           = (1 << 2),  // Run decode kernel for DELTA_BINARY_PACKED data
+  DELTA_BYTE_ARRAY       = (1 << 3),  // Run decode kernel for DELTA_BYTE_ARRAY encoded data
+  DELTA_LENGTH_BA        = (1 << 4),  // Run decode kernel for DELTA_LENGTH_BYTE_ARRAY encoded data
+  FIXED_WIDTH_NO_DICT    = (1 << 5),  // Run decode kernel for fixed width non-dictionary pages
+  FIXED_WIDTH_DICT       = (1 << 6),  // Run decode kernel for fixed width dictionary pages
+  BYTE_STREAM_SPLIT      = (1 << 7),  // Run decode kernel for BYTE_STREAM_SPLIT encoded data
+  BYTE_STREAM_SPLIT_FLAT = (1 << 8),  // Same as above but with a flat schema
 };
 
 // mask representing all the ways in which a string can be encoded
@@ -517,11 +526,12 @@ constexpr uint32_t encoding_to_mask(Encoding encoding)
  * Used to control which encode kernels to run.
  */
 enum class encode_kernel_mask {
-  PLAIN            = (1 << 0),  // Run plain encoding kernel
-  DICTIONARY       = (1 << 1),  // Run dictionary encoding kernel
-  DELTA_BINARY     = (1 << 2),  // Run DELTA_BINARY_PACKED encoding kernel
-  DELTA_LENGTH_BA  = (1 << 3),  // Run DELTA_LENGTH_BYTE_ARRAY encoding kernel
-  DELTA_BYTE_ARRAY = (1 << 4),  // Run DELTA_BYtE_ARRAY encoding kernel
+  PLAIN             = (1 << 0),  // Run plain encoding kernel
+  DICTIONARY        = (1 << 1),  // Run dictionary encoding kernel
+  DELTA_BINARY      = (1 << 2),  // Run DELTA_BINARY_PACKED encoding kernel
+  DELTA_LENGTH_BA   = (1 << 3),  // Run DELTA_LENGTH_BYTE_ARRAY encoding kernel
+  DELTA_BYTE_ARRAY  = (1 << 4),  // Run DELTA_BYtE_ARRAY encoding kernel
+  BYTE_STREAM_SPLIT = (1 << 5),  // Run plain encoding kernel, but split streams
 };
 
 /**
@@ -759,6 +769,28 @@ void DecodePageData(cudf::detail::hostdevice_span<PageInfo> pages,
                     kernel_error::pointer error_code,
                     rmm::cuda_stream_view stream);
 
+/**
+ * @brief Launches kernel for reading the BYTE_STREAM_SPLIT column data stored in the pages
+ *
+ * The page data will be written to the output pointed to in the page's
+ * associated column chunk.
+ *
+ * @param[in,out] pages All pages to be decoded
+ * @param[in] chunks All chunks to be decoded
+ * @param[in] num_rows Total number of rows to read
+ * @param[in] min_row Minimum number of rows to read
+ * @param[in] level_type_size Size in bytes of the type for level decoding
+ * @param[out] error_code Error code for kernel failures
+ * @param[in] stream CUDA stream to use
+ */
+void DecodeSplitPageData(cudf::detail::hostdevice_span<PageInfo> pages,
+                         cudf::detail::hostdevice_span<ColumnChunkDesc const> chunks,
+                         size_t num_rows,
+                         size_t min_row,
+                         int level_type_size,
+                         kernel_error::pointer error_code,
+                         rmm::cuda_stream_view stream);
+
 /**
  * @brief Launches kernel for reading the string column data stored in the pages
  *
@@ -891,6 +923,28 @@ void DecodePageDataFixedDict(cudf::detail::hostdevice_span<PageInfo> pages,
                              kernel_error::pointer error_code,
                              rmm::cuda_stream_view stream);
 
+/**
+ * @brief Launches kernel for reading dictionary fixed width column data stored in the pages
+ *
+ * The page data will be written to the output pointed to in the page's
+ * associated column chunk.
+ *
+ * @param[in,out] pages All pages to be decoded
+ * @param[in] chunks All chunks to be decoded
+ * @param[in] num_rows Total number of rows to read
+ * @param[in] min_row Minimum number of rows to read
+ * @param[in] level_type_size Size in bytes of the type for level decoding
+ * @param[out] error_code Error code for kernel failures
+ * @param[in] stream CUDA stream to use
+ */
+void DecodeSplitPageDataFlat(cudf::detail::hostdevice_span<PageInfo> pages,
+                             cudf::detail::hostdevice_span<ColumnChunkDesc const> chunks,
+                             std::size_t num_rows,
+                             size_t min_row,
+                             int level_type_size,
+                             kernel_error::pointer error_code,
+                             rmm::cuda_stream_view stream);
+
 /**
  * @brief Launches kernel for initializing encoder row group fragments
  *
diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp
index a524e7c6dcc..b7172f5ba67 100644
--- a/cpp/src/io/parquet/reader_impl.cpp
+++ b/cpp/src/io/parquet/reader_impl.cpp
@@ -253,6 +253,28 @@ void reader::impl::decode_page_data(bool uses_custom_row_bounds, size_t skip_row
                       streams[s_idx++]);
   }
 
+  // launch byte stream split decoder
+  if (BitAnd(kernel_mask, decode_kernel_mask::BYTE_STREAM_SPLIT_FLAT) != 0) {
+    DecodeSplitPageDataFlat(subpass.pages,
+                            pass.chunks,
+                            num_rows,
+                            skip_rows,
+                            level_type_size,
+                            error_code.data(),
+                            streams[s_idx++]);
+  }
+
+  // launch byte stream split decoder
+  if (BitAnd(kernel_mask, decode_kernel_mask::BYTE_STREAM_SPLIT) != 0) {
+    DecodeSplitPageData(subpass.pages,
+                        pass.chunks,
+                        num_rows,
+                        skip_rows,
+                        level_type_size,
+                        error_code.data(),
+                        streams[s_idx++]);
+  }
+
   if (BitAnd(kernel_mask, decode_kernel_mask::FIXED_WIDTH_NO_DICT) != 0) {
     DecodePageDataFixed(subpass.pages,
                         pass.chunks,
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index 6a8c31fb96b..5509a33f9f0 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -613,8 +613,7 @@ std::vector<schema_tree_node> construct_schema_tree(
                                                 column_in_metadata const& col_meta) {
         s.requested_encoding = column_encoding::USE_DEFAULT;
 
-        if (schema[parent_idx].name != "list" and
-            col_meta.get_encoding() != column_encoding::USE_DEFAULT) {
+        if (s.name != "list" and col_meta.get_encoding() != column_encoding::USE_DEFAULT) {
           // do some validation
           switch (col_meta.get_encoding()) {
             case column_encoding::DELTA_BINARY_PACKED:
@@ -659,6 +658,21 @@ std::vector<schema_tree_node> construct_schema_tree(
               }
               break;
 
+            case column_encoding::BYTE_STREAM_SPLIT:
+              if (s.type == Type::BYTE_ARRAY) {
+                CUDF_LOG_WARN(
+                  "BYTE_STREAM_SPLIT encoding is only supported for fixed width columns; the "
+                  "requested encoding will be ignored");
+                return;
+              }
+              if (s.type == Type::INT96) {
+                CUDF_LOG_WARN(
+                  "BYTE_STREAM_SPLIT encoding is not supported for INT96 columns; the "
+                  "requested encoding will be ignored");
+                return;
+              }
+              break;
+
             // supported parquet encodings
             case column_encoding::PLAIN:
             case column_encoding::DICTIONARY: break;
diff --git a/cpp/tests/io/parquet_common.cpp b/cpp/tests/io/parquet_common.cpp
index b64cd230bc6..c1211869bcc 100644
--- a/cpp/tests/io/parquet_common.cpp
+++ b/cpp/tests/io/parquet_common.cpp
@@ -203,6 +203,7 @@ template std::vector<int8_t> random_values<int8_t>(size_t size);
 template std::vector<int16_t> random_values<int16_t>(size_t size);
 template std::vector<int32_t> random_values<int32_t>(size_t size);
 template std::vector<int64_t> random_values<int64_t>(size_t size);
+template std::vector<__int128_t> random_values<__int128_t>(size_t size);
 template std::vector<uint8_t> random_values<uint8_t>(size_t size);
 template std::vector<uint16_t> random_values<uint16_t>(size_t size);
 template std::vector<uint32_t> random_values<uint32_t>(size_t size);
diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp
index 3a3040f0957..a16b3d63177 100644
--- a/cpp/tests/io/parquet_writer_test.cpp
+++ b/cpp/tests/io/parquet_writer_test.cpp
@@ -35,7 +35,7 @@
 using cudf::test::iterators::no_nulls;
 
 template <typename mask_op_t>
-void test_durations(mask_op_t mask_op)
+void test_durations(mask_op_t mask_op, bool use_byte_stream_split)
 {
   std::default_random_engine generator;
   std::uniform_int_distribution<int> distribution_d(0, 30);
@@ -67,6 +67,13 @@ void test_durations(mask_op_t mask_op)
 
   auto expected = table_view{{durations_d, durations_s, durations_ms, durations_us, durations_ns}};
 
+  if (use_byte_stream_split) {
+    cudf::io::table_input_metadata expected_metadata(expected);
+    for (auto& col_meta : expected_metadata.column_metadata) {
+      col_meta.set_encoding(cudf::io::column_encoding::BYTE_STREAM_SPLIT);
+    }
+  }
+
   auto filepath = temp_env->get_temp_filepath("Durations.parquet");
   cudf::io::parquet_writer_options out_opts =
     cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected);
@@ -91,10 +98,10 @@ void test_durations(mask_op_t mask_op)
 
 TEST_F(ParquetWriterTest, Durations)
 {
-  test_durations([](auto i) { return true; });
-  test_durations([](auto i) { return (i % 2) != 0; });
-  test_durations([](auto i) { return (i % 3) != 0; });
-  test_durations([](auto i) { return false; });
+  test_durations([](auto i) { return true; }, false);
+  test_durations([](auto i) { return (i % 2) != 0; }, false);
+  test_durations([](auto i) { return (i % 3) != 0; }, false);
+  test_durations([](auto i) { return false; }, false);
 }
 
 TEST_F(ParquetWriterTest, MultiIndex)
@@ -1593,6 +1600,7 @@ TEST_F(ParquetWriterTest, UserRequestedEncodings)
   using cudf::io::column_encoding;
   using cudf::io::parquet::detail::Encoding;
   constexpr int num_rows = 500;
+  std::mt19937 engine{31337};
 
   auto const ones = thrust::make_constant_iterator(1);
   auto const col =
@@ -1602,6 +1610,9 @@ TEST_F(ParquetWriterTest, UserRequestedEncodings)
   auto const string_col =
     cudf::test::strings_column_wrapper(strings, strings + num_rows, no_nulls());
 
+  // throw in a list to make sure encoding selection works there too
+  auto list_col = make_parquet_list_col<int32_t>(engine, num_rows, 5, true);
+
   auto const table = table_view({col,
                                  col,
                                  col,
@@ -1613,7 +1624,8 @@ TEST_F(ParquetWriterTest, UserRequestedEncodings)
                                  string_col,
                                  string_col,
                                  string_col,
-                                 string_col});
+                                 string_col,
+                                 *list_col});
 
   cudf::io::table_input_metadata table_metadata(table);
 
@@ -1635,10 +1647,17 @@ TEST_F(ParquetWriterTest, UserRequestedEncodings)
   set_meta(10, "string_db", column_encoding::DELTA_BINARY_PACKED);
   table_metadata.column_metadata[11].set_name("string_none");
 
-  for (auto& col_meta : table_metadata.column_metadata) {
-    col_meta.set_nullability(false);
+  for (int i = 0; i < 12; i++) {
+    table_metadata.column_metadata[i].set_nullability(false);
   }
 
+  // handle list column separately
+  table_metadata.column_metadata[12].set_name("int32_list").set_nullability(true);
+  table_metadata.column_metadata[12]
+    .child(1)
+    .set_encoding(column_encoding::DELTA_BINARY_PACKED)
+    .set_nullability(true);
+
   auto const filepath = temp_env->get_temp_filepath("UserRequestedEncodings.parquet");
   cudf::io::parquet_writer_options opts =
     cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, table)
@@ -1683,6 +1702,12 @@ TEST_F(ParquetWriterTest, UserRequestedEncodings)
   expect_enc(10, Encoding::PLAIN_DICTIONARY);
   // no request, should use dictionary
   expect_enc(11, Encoding::PLAIN_DICTIONARY);
+  // int list requested delta_binary_packed. it's has level data, so have to search for a match.
+  auto const encodings = fmd.row_groups[0].columns[12].meta_data.encodings;
+  auto const has_delta = std::any_of(encodings.begin(), encodings.end(), [](Encoding enc) {
+    return enc == Encoding::DELTA_BINARY_PACKED;
+  });
+  EXPECT_TRUE(has_delta);
 }
 
 TEST_F(ParquetWriterTest, Decimal128DeltaByteArray)
@@ -1743,6 +1768,95 @@ TEST_F(ParquetWriterTest, DeltaBinaryStartsWithNulls)
   CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view());
 }
 
+TEST_F(ParquetWriterTest, ByteStreamSplit)
+{
+  constexpr auto num_rows = 100;
+  std::mt19937 engine{31337};
+  auto col0_data = random_values<int32_t>(num_rows);
+  auto col1_data = random_values<int64_t>(num_rows);
+  auto col2_data = random_values<float>(num_rows);
+  auto col3_data = random_values<double>(num_rows);
+
+  column_wrapper<int32_t> col0{col0_data.begin(), col0_data.end(), no_nulls()};
+  column_wrapper<int64_t> col1{col1_data.begin(), col1_data.end(), no_nulls()};
+  column_wrapper<float> col2{col2_data.begin(), col2_data.end(), no_nulls()};
+  column_wrapper<double> col3{col3_data.begin(), col3_data.end(), no_nulls()};
+
+  // throw in a list to make sure both decoders are working
+  auto col4 = make_parquet_list_col<int32_t>(engine, num_rows, 5, true);
+
+  auto expected = table_view{{col0, col1, col2, col3, *col4}};
+
+  cudf::io::table_input_metadata expected_metadata(expected);
+  expected_metadata.column_metadata[0].set_name("int32s");
+  expected_metadata.column_metadata[1].set_name("int64s");
+  expected_metadata.column_metadata[2].set_name("floats");
+  expected_metadata.column_metadata[3].set_name("doubles");
+  expected_metadata.column_metadata[4].set_name("int32list");
+  auto const encoding = cudf::io::column_encoding::BYTE_STREAM_SPLIT;
+  for (int i = 0; i <= 3; i++) {
+    expected_metadata.column_metadata[i].set_encoding(encoding);
+  }
+
+  expected_metadata.column_metadata[4].child(1).set_encoding(encoding);
+
+  auto const filepath = temp_env->get_temp_filepath("ByteStreamSplit.parquet");
+  cudf::io::parquet_writer_options out_opts =
+    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected)
+      .metadata(expected_metadata);
+  cudf::io::write_parquet(out_opts);
+
+  cudf::io::parquet_reader_options in_opts =
+    cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath});
+  auto result = cudf::io::read_parquet(in_opts);
+
+  CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view());
+}
+
+TEST_F(ParquetWriterTest, DecimalByteStreamSplit)
+{
+  constexpr cudf::size_type num_rows = 100;
+  auto seq_col0                      = random_values<int32_t>(num_rows);
+  auto seq_col1                      = random_values<int64_t>(num_rows);
+  auto seq_col2                      = random_values<__int128_t>(num_rows);
+
+  auto col0 = cudf::test::fixed_point_column_wrapper<int32_t>{
+    seq_col0.begin(), seq_col0.end(), no_nulls(), numeric::scale_type{-5}};
+  auto col1 = cudf::test::fixed_point_column_wrapper<int64_t>{
+    seq_col1.begin(), seq_col1.end(), no_nulls(), numeric::scale_type{-9}};
+  auto col2 = cudf::test::fixed_point_column_wrapper<__int128_t>{
+    seq_col1.begin(), seq_col1.end(), no_nulls(), numeric::scale_type{-11}};
+
+  auto expected = table_view({col0, col1, col2});
+  cudf::io::table_input_metadata expected_metadata(expected);
+  expected_metadata.column_metadata[0].set_name("int32s").set_decimal_precision(7);
+  expected_metadata.column_metadata[1].set_name("int64s").set_decimal_precision(11);
+  expected_metadata.column_metadata[2].set_name("int128s").set_decimal_precision(22);
+  for (auto& col_meta : expected_metadata.column_metadata) {
+    col_meta.set_encoding(cudf::io::column_encoding::BYTE_STREAM_SPLIT);
+  }
+
+  auto const filepath = temp_env->get_temp_filepath("DecimalByteStreamSplit.parquet");
+  cudf::io::parquet_writer_options args =
+    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected)
+      .metadata(expected_metadata);
+  cudf::io::write_parquet(args);
+
+  cudf::io::parquet_reader_options read_opts =
+    cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath});
+  auto result = cudf::io::read_parquet(read_opts);
+
+  CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view());
+}
+
+TEST_F(ParquetWriterTest, DurationByteStreamSplit)
+{
+  test_durations([](auto i) { return true; }, true);
+  test_durations([](auto i) { return (i % 2) != 0; }, true);
+  test_durations([](auto i) { return (i % 3) != 0; }, true);
+  test_durations([](auto i) { return false; }, true);
+}
+
 /////////////////////////////////////////////////////////////
 // custom mem mapped data sink that supports device writes
 template <bool supports_device_writes>
@@ -1926,6 +2040,35 @@ TYPED_TEST(ParquetWriterTimestampTypeTest, TimestampOverflow)
   CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view());
 }
 
+TYPED_TEST(ParquetWriterTimestampTypeTest, TimestampsByteStreamSplit)
+{
+  srand(42);
+  auto sequence = cudf::detail::make_counting_transform_iterator(
+    0, [](auto i) { return ((std::rand() / 10000) * 1000); });
+
+  constexpr auto num_rows = 100;
+  column_wrapper<TypeParam, typename decltype(sequence)::value_type> col(
+    sequence, sequence + num_rows, no_nulls());
+
+  auto expected = table_view{{col}};
+
+  cudf::io::table_input_metadata expected_metadata(expected);
+  expected_metadata.column_metadata[0].set_encoding(cudf::io::column_encoding::BYTE_STREAM_SPLIT);
+
+  auto filepath = temp_env->get_temp_filepath("TimestampsByteStreamSplit.parquet");
+  cudf::io::parquet_writer_options out_opts =
+    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected)
+      .metadata(expected_metadata);
+  cudf::io::write_parquet(out_opts);
+
+  cudf::io::parquet_reader_options in_opts =
+    cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath})
+      .timestamp_type(this->type());
+  auto result = cudf::io::read_parquet(in_opts);
+
+  CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view());
+}
+
 //////////////////////////////
 // writer stress tests
 

From 2eb71b28d9607e3dfa5b891cbc40ce53a5d27bc6 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Wed, 24 Apr 2024 16:05:34 -0400
Subject: [PATCH 106/842] Large strings gtest fixture and utilities (#15513)

Creates the base class and utilities for testing APIs to produce large strings.
The main purpose of the fixture is to enable the large strings environment variable(s) and to setup large test data that can be reused by multiple tests.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - MithunR (https://github.com/mythrocks)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/15513
---
 cpp/include/cudf_test/testing_main.hpp        |  37 ++++--
 cpp/tests/CMakeLists.txt                      |   9 ++
 cpp/tests/copying/concatenate_tests.cpp       |  43 ------
 cpp/tests/large_strings/concatenate_tests.cpp |  65 ++++++++++
 .../large_strings/large_strings_fixture.cpp   | 122 ++++++++++++++++++
 .../large_strings/large_strings_fixture.hpp   |  49 +++++++
 cpp/tests/large_strings/merge_tests.cpp       |  79 ++++++++++++
 cpp/tests/merge/merge_string_test.cpp         |  57 --------
 8 files changed, 351 insertions(+), 110 deletions(-)
 create mode 100644 cpp/tests/large_strings/concatenate_tests.cpp
 create mode 100644 cpp/tests/large_strings/large_strings_fixture.cpp
 create mode 100644 cpp/tests/large_strings/large_strings_fixture.hpp
 create mode 100644 cpp/tests/large_strings/merge_tests.cpp

diff --git a/cpp/include/cudf_test/testing_main.hpp b/cpp/include/cudf_test/testing_main.hpp
index ecac761f7cb..66b831b917f 100644
--- a/cpp/include/cudf_test/testing_main.hpp
+++ b/cpp/include/cudf_test/testing_main.hpp
@@ -145,6 +145,25 @@ inline auto parse_cudf_test_opts(int argc, char** argv)
   }
 }
 
+/**
+ * @brief Sets up stream mode memory resource adaptor
+ *
+ * The resource adaptor is only set as the current device resource if the
+ * stream mode is enabled.
+ *
+ * The caller must keep the return object alive for the life of the test runs.
+ *
+ * @param cmd_opts Command line options returned by parse_cudf_test_opts
+ * @return Memory resource adaptor
+ */
+inline auto make_memory_resource_adaptor(cxxopts::ParseResult const& cmd_opts)
+{
+  auto const rmm_mode = cmd_opts["rmm_mode"].as<std::string>();
+  auto resource       = cudf::test::create_memory_resource(rmm_mode);
+  rmm::mr::set_current_device_resource(resource.get());
+  return resource;
+}
+
 /**
  * @brief Sets up stream mode memory resource adaptor
  *
@@ -181,14 +200,12 @@ inline auto make_stream_mode_adaptor(cxxopts::ParseResult const& cmd_opts)
  * function parses the command line to customize test behavior, like the
  * allocation mode used for creating the default memory resource.
  */
-#define CUDF_TEST_PROGRAM_MAIN()                                        \
-  int main(int argc, char** argv)                                       \
-  {                                                                     \
-    ::testing::InitGoogleTest(&argc, argv);                             \
-    auto const cmd_opts = parse_cudf_test_opts(argc, argv);             \
-    auto const rmm_mode = cmd_opts["rmm_mode"].as<std::string>();       \
-    auto resource       = cudf::test::create_memory_resource(rmm_mode); \
-    rmm::mr::set_current_device_resource(resource.get());               \
-    auto adaptor = make_stream_mode_adaptor(cmd_opts);                  \
-    return RUN_ALL_TESTS();                                             \
+#define CUDF_TEST_PROGRAM_MAIN()                                            \
+  int main(int argc, char** argv)                                           \
+  {                                                                         \
+    ::testing::InitGoogleTest(&argc, argv);                                 \
+    auto const cmd_opts           = parse_cudf_test_opts(argc, argv);       \
+    [[maybe_unused]] auto mr      = make_memory_resource_adaptor(cmd_opts); \
+    [[maybe_unused]] auto adaptor = make_stream_mode_adaptor(cmd_opts);     \
+    return RUN_ALL_TESTS();                                                 \
   }
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index f59e675e1d5..6c56d82007a 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -568,6 +568,15 @@ ConfigureTest(
   strings/urls_tests.cpp
 )
 
+# ##################################################################################################
+# * large strings test ----------------------------------------------------------------------------
+ConfigureTest(
+  LARGE_STRINGS_TEST large_strings/large_strings_fixture.cpp large_strings/merge_tests.cpp
+  large_strings/concatenate_tests.cpp
+  GPUS 1
+  PERCENT 100
+)
+
 # ##################################################################################################
 # * json path test --------------------------------------------------------------------------------
 ConfigureTest(JSON_PATH_TEST json/json_tests.cpp)
diff --git a/cpp/tests/copying/concatenate_tests.cpp b/cpp/tests/copying/concatenate_tests.cpp
index 3e2e332936e..c2d1e1d9f4f 100644
--- a/cpp/tests/copying/concatenate_tests.cpp
+++ b/cpp/tests/copying/concatenate_tests.cpp
@@ -197,49 +197,6 @@ TEST_F(StringColumnTest, ConcatenateTooLarge)
   EXPECT_THROW(cudf::concatenate(input_cols), std::overflow_error);
 }
 
-TEST_F(StringColumnTest, ConcatenateLargeStrings)
-{
-  CUDF_TEST_ENABLE_LARGE_STRINGS();
-  auto itr = thrust::constant_iterator<std::string_view>(
-    "abcdefghijklmnopqrstuvwxyABCDEFGHIJKLMNOPQRSTUVWXY");                // 50 bytes
-  auto input = cudf::test::strings_column_wrapper(itr, itr + 5'000'000);  // 250MB
-  auto view  = cudf::column_view(input);
-  std::vector<cudf::column_view> input_cols;
-  std::vector<cudf::size_type> splits;
-  int const multiplier = 10;
-  for (int i = 0; i < multiplier; ++i) {  // 2500MB > 2GB
-    input_cols.push_back(view);
-    splits.push_back(view.size() * (i + 1));
-  }
-  splits.pop_back();  // remove last entry
-  auto result = cudf::concatenate(input_cols);
-  auto sv     = cudf::strings_column_view(result->view());
-  EXPECT_EQ(sv.size(), view.size() * multiplier);
-  EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64});
-
-  // verify results in sections
-  auto sliced = cudf::split(result->view(), splits);
-  for (auto c : sliced) {
-    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input);
-  }
-
-  // also test with large strings column as input
-  {
-    input_cols.clear();
-    input_cols.push_back(input);           // regular column
-    input_cols.push_back(result->view());  // large column
-    result = cudf::concatenate(input_cols);
-    sv     = cudf::strings_column_view(result->view());
-    EXPECT_EQ(sv.size(), view.size() * (multiplier + 1));
-    EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64});
-    splits.push_back(view.size() * multiplier);
-    sliced = cudf::split(result->view(), splits);
-    for (auto c : sliced) {
-      CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input);
-    }
-  }
-}
-
 struct TableTest : public cudf::test::BaseFixture {};
 
 TEST_F(TableTest, ConcatenateTables)
diff --git a/cpp/tests/large_strings/concatenate_tests.cpp b/cpp/tests/large_strings/concatenate_tests.cpp
new file mode 100644
index 00000000000..aa445bf761b
--- /dev/null
+++ b/cpp/tests/large_strings/concatenate_tests.cpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "large_strings_fixture.hpp"
+
+#include <cudf_test/column_utilities.hpp>
+
+#include <cudf/concatenate.hpp>
+#include <cudf/copying.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+
+#include <vector>
+
+struct ConcatenateTest : public cudf::test::StringsLargeTest {};
+
+TEST_F(ConcatenateTest, ConcatenateVertical)
+{
+  auto input = this->long_column();
+  auto view  = cudf::column_view(input);
+  std::vector<cudf::column_view> input_cols;
+  std::vector<cudf::size_type> splits;
+  int const multiplier = 10;
+  for (int i = 0; i < multiplier; ++i) {  // 2500MB > 2GB
+    input_cols.push_back(view);
+    splits.push_back(view.size() * (i + 1));
+  }
+  splits.pop_back();  // remove last entry
+  auto result = cudf::concatenate(input_cols);
+  auto sv     = cudf::strings_column_view(result->view());
+  EXPECT_EQ(sv.size(), view.size() * multiplier);
+  EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64});
+
+  // verify results in sections
+  auto sliced = cudf::split(result->view(), splits);
+  for (auto c : sliced) {
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input);
+  }
+
+  // also test with large strings column as input
+  input_cols.clear();
+  input_cols.push_back(input);           // regular column
+  input_cols.push_back(result->view());  // large column
+  result = cudf::concatenate(input_cols);
+  sv     = cudf::strings_column_view(result->view());
+  EXPECT_EQ(sv.size(), view.size() * (multiplier + 1));
+  EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64});
+  splits.push_back(view.size() * multiplier);
+  sliced = cudf::split(result->view(), splits);
+  for (auto c : sliced) {
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input);
+  }
+}
diff --git a/cpp/tests/large_strings/large_strings_fixture.cpp b/cpp/tests/large_strings/large_strings_fixture.cpp
new file mode 100644
index 00000000000..59e0cd43d05
--- /dev/null
+++ b/cpp/tests/large_strings/large_strings_fixture.cpp
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "large_strings_fixture.hpp"
+
+#include <cudf_test/column_utilities.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/testing_main.hpp>
+
+#include <cudf/column/column.hpp>
+#include <cudf/strings/combine.hpp>
+#include <cudf/strings/repeat_strings.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+
+#include <map>
+#include <memory>
+#include <vector>
+
+namespace cudf::test {
+class LargeStringsData {
+ public:
+  using DataPointer = std::unique_ptr<cudf::table>;
+
+  virtual ~LargeStringsData() {}
+
+  void add_table(std::string_view name, std::unique_ptr<cudf::table>&& data)
+  {
+    _data[std::string(name)] = std::move(data);
+  }
+
+  cudf::table_view get_table(std::string_view name) const
+  {
+    std::string key{name};
+    return _data.find(key) != _data.end() ? _data.at(key)->view() : cudf::table_view{};
+  }
+
+  void add_column(std::string_view name, std::unique_ptr<cudf::column>&& data)
+  {
+    std::vector<std::unique_ptr<cudf::column>> cols;
+    cols.emplace_back(std::move(data));
+    _data[std::string(name)] = std::make_unique<cudf::table>(std::move(cols));
+  }
+
+  cudf::column_view get_column(std::string_view name) const
+  {
+    std::string key{name};
+    return _data.find(key) != _data.end() ? _data.at(key)->view().column(0) : cudf::column_view{};
+  }
+
+  bool has_key(std::string_view name) const { return _data.find(std::string(name)) != _data.end(); }
+
+ protected:
+  std::map<std::string, DataPointer> _data;
+};
+
+cudf::column_view StringsLargeTest::wide_column()
+{
+  std::string name{"wide1"};
+  if (!g_ls_data->has_key(name)) {
+    auto input =
+      cudf::test::strings_column_wrapper({"the quick brown fox jumps over the lazy dog",
+                                          "the fat cat lays next to the other accénted cat",
+                                          "a slow moving turtlé cannot catch the bird",
+                                          "which can be composéd together to form a more complete",
+                                          "The result does not include the value in the sum in"});
+    auto counts = cudf::test::fixed_width_column_wrapper<int>({8, 8, 8, 8, 8});
+    auto result = cudf::strings::repeat_strings(cudf::strings_column_view(input), counts);
+    g_ls_data->add_column(name, std::move(result));
+  }
+  return g_ls_data->get_column(name);
+}
+
+cudf::column_view StringsLargeTest::long_column()
+{
+  std::string name("long1");
+  if (!g_ls_data->has_key(name)) {
+    auto itr = thrust::constant_iterator<std::string_view>(
+      "abcdefghijklmnopqrstuvwxyABCDEFGHIJKLMNOPQRSTUVWXY");                // 50 bytes
+    auto input = cudf::test::strings_column_wrapper(itr, itr + 5'000'000);  // 250MB
+    g_ls_data->add_column(name, input.release());
+  }
+  return g_ls_data->get_column(name);
+}
+
+std::unique_ptr<LargeStringsData> StringsLargeTest::get_ls_data()
+{
+  CUDF_EXPECTS(g_ls_data == nullptr, "invalid call to get_ls_data");
+  auto lsd_data = std::make_unique<LargeStringsData>();
+  g_ls_data     = lsd_data.get();
+  return lsd_data;
+}
+
+LargeStringsData* StringsLargeTest::g_ls_data = nullptr;
+}  // namespace cudf::test
+
+int main(int argc, char** argv)
+{
+  ::testing::InitGoogleTest(&argc, argv);
+  auto const cmd_opts = parse_cudf_test_opts(argc, argv);
+  // hardcoding the CUDA memory resource to keep from exceeding the pool
+  auto mr = cudf::test::make_cuda();
+  rmm::mr::set_current_device_resource(mr.get());
+  auto adaptor = make_stream_mode_adaptor(cmd_opts);
+
+  // create object to automatically be destroyed at the end of main()
+  auto lsd = cudf::test::StringsLargeTest::get_ls_data();
+
+  return RUN_ALL_TESTS();
+}
diff --git a/cpp/tests/large_strings/large_strings_fixture.hpp b/cpp/tests/large_strings/large_strings_fixture.hpp
new file mode 100644
index 00000000000..8827b65f1ce
--- /dev/null
+++ b/cpp/tests/large_strings/large_strings_fixture.hpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_utilities.hpp>
+
+#include <cudf/column/column_view.hpp>
+
+namespace cudf::test {
+class LargeStringsData;
+
+/**
+ * @brief Fixture for creating large strings tests
+ *
+ * Stores tests strings columns for reuse by specific tests.
+ * Creating the test input only once helps speed up the overall tests.
+ *
+ * Also automatically enables appropriate large strings environment variables.
+ */
+struct StringsLargeTest : public cudf::test::BaseFixture {
+  /**
+   * @brief Returns a column of long strings
+   */
+  cudf::column_view wide_column();
+
+  /**
+   * @brief Returns a long column of strings
+   */
+  cudf::column_view long_column();
+
+  large_strings_enabler g_ls_enabler;
+  static LargeStringsData* g_ls_data;
+
+  static std::unique_ptr<LargeStringsData> get_ls_data();
+};
+}  // namespace cudf::test
diff --git a/cpp/tests/large_strings/merge_tests.cpp b/cpp/tests/large_strings/merge_tests.cpp
new file mode 100644
index 00000000000..afe6e424371
--- /dev/null
+++ b/cpp/tests/large_strings/merge_tests.cpp
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "large_strings_fixture.hpp"
+
+#include <cudf_test/column_utilities.hpp>
+
+#include <cudf/copying.hpp>
+#include <cudf/merge.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+
+#include <vector>
+
+struct MergeTest : public cudf::test::StringsLargeTest {};
+
+TEST_F(MergeTest, MergeLargeStrings)
+{
+  auto const input = this->long_column();
+  auto input_views = std::vector<cudf::table_view>();
+  auto const view  = cudf::table_view({input});
+  std::vector<cudf::size_type> splits;
+  int const multiplier = 10;
+  for (int i = 0; i < multiplier; ++i) {  // 2500MB > 2GB
+    input_views.push_back(view);
+    splits.push_back(view.num_rows() * (i + 1));
+  }
+  splits.pop_back();  // remove last entry
+  auto const column_order    = std::vector<cudf::order>{cudf::order::ASCENDING};
+  auto const null_precedence = std::vector<cudf::null_order>{cudf::null_order::AFTER};
+
+  auto result = cudf::merge(input_views, {0}, column_order, null_precedence);
+  auto sv     = cudf::strings_column_view(result->view().column(0));
+  EXPECT_EQ(sv.size(), view.num_rows() * multiplier);
+  EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64});
+
+  auto sliced = cudf::split(sv.parent(), splits);
+  for (auto c : sliced) {
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input);
+  }
+
+  // also test with large strings column as input
+  input_views.clear();
+  input_views.push_back(view);            // regular column
+  input_views.push_back(result->view());  // large column
+  result = cudf::merge(input_views, {0}, column_order, null_precedence);
+  sv     = cudf::strings_column_view(result->view().column(0));
+  EXPECT_EQ(sv.size(), view.num_rows() * (multiplier + 1));
+  EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64});
+  splits.push_back(view.num_rows() * multiplier);
+  sliced = cudf::split(sv.parent(), splits);
+  for (auto c : sliced) {
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input);
+  }
+
+  // also check merge still returns 32-bit offsets for regular columns
+  input_views.clear();
+  input_views.push_back(view);
+  input_views.push_back(view);
+  result = cudf::merge(input_views, {0}, column_order, null_precedence);
+  sv     = cudf::strings_column_view(result->view().column(0));
+  EXPECT_EQ(sv.size(), view.num_rows() * 2);
+  EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT32});
+  sliced = cudf::split(sv.parent(), {view.num_rows()});
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(sliced[0], input);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(sliced[1], input);
+}
diff --git a/cpp/tests/merge/merge_string_test.cpp b/cpp/tests/merge/merge_string_test.cpp
index d7368d31944..28179a7341c 100644
--- a/cpp/tests/merge/merge_string_test.cpp
+++ b/cpp/tests/merge/merge_string_test.cpp
@@ -411,60 +411,3 @@ TYPED_TEST(MergeStringTest, Merge2StringKeyNullColumns)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_column_view2, output_column_view2);
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_column_view3, output_column_view3);
 }
-
-class MergeLargeStringsTest : public cudf::test::BaseFixture {};
-
-TEST_F(MergeLargeStringsTest, MergeLargeStrings)
-{
-  CUDF_TEST_ENABLE_LARGE_STRINGS();
-  auto itr = thrust::constant_iterator<std::string_view>(
-    "abcdefghijklmnopqrstuvwxyABCDEFGHIJKLMNOPQRSTUVWXY");                      // 50 bytes
-  auto const input = cudf::test::strings_column_wrapper(itr, itr + 5'000'000);  // 250MB
-  auto input_views = std::vector<cudf::table_view>();
-  auto const view  = cudf::table_view({input});
-  std::vector<cudf::size_type> splits;
-  int const multiplier = 10;
-  for (int i = 0; i < multiplier; ++i) {  // 2500MB > 2GB
-    input_views.push_back(view);
-    splits.push_back(view.num_rows() * (i + 1));
-  }
-  splits.pop_back();  // remove last entry
-  auto const column_order    = std::vector<cudf::order>{cudf::order::ASCENDING};
-  auto const null_precedence = std::vector<cudf::null_order>{cudf::null_order::AFTER};
-
-  auto result = cudf::merge(input_views, {0}, column_order, null_precedence);
-  auto sv     = cudf::strings_column_view(result->view().column(0));
-  EXPECT_EQ(sv.size(), view.num_rows() * multiplier);
-  EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64});
-
-  auto sliced = cudf::split(sv.parent(), splits);
-  for (auto c : sliced) {
-    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input);
-  }
-
-  // also test with large strings column as input
-  input_views.clear();
-  input_views.push_back(view);            // regular column
-  input_views.push_back(result->view());  // large column
-  result = cudf::merge(input_views, {0}, column_order, null_precedence);
-  sv     = cudf::strings_column_view(result->view().column(0));
-  EXPECT_EQ(sv.size(), view.num_rows() * (multiplier + 1));
-  EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64});
-  splits.push_back(view.num_rows() * multiplier);
-  sliced = cudf::split(sv.parent(), splits);
-  for (auto c : sliced) {
-    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input);
-  }
-
-  // also check merge still returns 32-bit offsets for regular columns
-  input_views.clear();
-  input_views.push_back(view);
-  input_views.push_back(view);
-  result = cudf::merge(input_views, {0}, column_order, null_precedence);
-  sv     = cudf::strings_column_view(result->view().column(0));
-  EXPECT_EQ(sv.size(), view.num_rows() * 2);
-  EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT32});
-  sliced = cudf::split(sv.parent(), {view.num_rows()});
-  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(sliced[0], input);
-  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(sliced[1], input);
-}

From 8b4dc91fbee585e0f03cccc2b60ce7b68baa9a5f Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 24 Apr 2024 10:53:36 -1000
Subject: [PATCH 107/842] Replace RangeIndex._start/_stop/_step with _range
 (#15576)

The `._start/_stop/_step` attributes are wholly redundant with the similar attributes on a `range` object, so replacing with those attributes where needed

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15576
---
 python/cudf/cudf/core/index.py       | 128 +++++++++++----------------
 python/cudf/cudf/tests/test_index.py |   2 +-
 2 files changed, 55 insertions(+), 75 deletions(-)

diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 6f08b1d83b3..e457e818129 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -36,7 +36,6 @@
     is_integer,
     is_list_like,
     is_scalar,
-    is_signed_integer_dtype,
 )
 from cudf.core._base_index import BaseIndex
 from cudf.core._compat import PANDAS_LT_300
@@ -149,6 +148,15 @@ def _index_from_data(data: MutableMapping, name: Any = no_default):
     return index_class_type._from_data(data, name)
 
 
+def validate_range_arg(arg, arg_name: Literal["start", "stop", "step"]) -> int:
+    """Validate start/stop/step argument in RangeIndex.__init__"""
+    if not is_integer(arg):
+        raise TypeError(
+            f"{arg_name} must be an integer, not {type(arg).__name__}"
+        )
+    return int(arg)
+
+
 class RangeIndex(BaseIndex, BinaryOperand):
     """
     Immutable Index implementing a monotonic integer range.
@@ -197,44 +205,29 @@ class RangeIndex(BaseIndex, BinaryOperand):
     def __init__(
         self, start, stop=None, step=1, dtype=None, copy=False, name=None
     ):
-        if step == 0:
-            raise ValueError("Step must not be zero.")
         if not cudf.api.types.is_hashable(name):
             raise ValueError("Name must be a hashable value.")
-        if dtype is not None and not is_signed_integer_dtype(dtype):
+        self._name = name
+        if dtype is not None and cudf.dtype(dtype).kind != "i":
             raise ValueError(f"{dtype=} must be a signed integer type")
 
         if isinstance(start, range):
-            therange = start
-            start = therange.start
-            stop = therange.stop
-            step = therange.step
-        if stop is None:
-            start, stop = 0, start
-        if not is_integer(start):
-            raise TypeError(
-                f"start must be an integer, not {type(start).__name__}"
-            )
-        self._start = int(start)
-        if not is_integer(stop):
-            raise TypeError(
-                f"stop must be an integer, not {type(stop).__name__}"
-            )
-        self._stop = int(stop)
-        if step is not None:
-            if not is_integer(step):
-                raise TypeError(
-                    f"step must be an integer, not {type(step).__name__}"
-                )
-            self._step = int(step)
+            self._range = start
         else:
-            self._step = 1
-        self._index = None
-        self._name = name
-        self._range = range(self._start, self._stop, self._step)
-        # _end is the actual last element of RangeIndex,
-        # whereas _stop is an upper bound.
-        self._end = self._start + self._step * (len(self._range) - 1)
+            if stop is None:
+                start, stop = 0, start
+            start = validate_range_arg(start, "start")
+            stop = validate_range_arg(stop, "stop")
+            if step is not None:
+                step = validate_range_arg(step, "step")
+            else:
+                step = 1
+            try:
+                self._range = range(start, stop, step)
+            except ValueError as err:
+                if step == 0:
+                    raise ValueError("Step must not be zero.") from err
+                raise
 
     def _copy_type_metadata(
         self, other: RangeIndex, *, override_dtypes=None
@@ -251,9 +244,9 @@ def searchsorted(
         na_position: Literal["first", "last"] = "last",
     ):
         assert (len(self) <= 1) or (
-            ascending == (self._step > 0)
+            ascending == (self.step > 0)
         ), "Invalid ascending flag"
-        return search_range(value, self.as_range, side=side)
+        return search_range(value, self._range, side=side)
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
@@ -271,7 +264,7 @@ def start(self):
         """
         The value of the `start` parameter (0 if this was not supplied).
         """
-        return self._start
+        return self._range.start
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
@@ -279,7 +272,7 @@ def stop(self):
         """
         The value of the stop parameter.
         """
-        return self._stop
+        return self._range.stop
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
@@ -287,7 +280,7 @@ def step(self):
         """
         The value of the step parameter.
         """
-        return self._step
+        return self._range.step
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
@@ -368,9 +361,7 @@ def copy(self, name=None, deep=False):
 
         name = self.name if name is None else name
 
-        return RangeIndex(
-            start=self._start, stop=self._stop, step=self._step, name=name
-        )
+        return RangeIndex(self._range, name=name)
 
     @_cudf_nvtx_annotate
     def astype(self, dtype, copy: bool = True):
@@ -389,8 +380,8 @@ def duplicated(self, keep="first"):
     @_cudf_nvtx_annotate
     def __repr__(self):
         return (
-            f"{self.__class__.__name__}(start={self._start}, stop={self._stop}"
-            f", step={self._step}"
+            f"{self.__class__.__name__}(start={self.start}, stop={self.stop}"
+            f", step={self.step}"
             + (
                 f", name={pd.io.formats.printing.default_pprint(self.name)}"
                 if self.name is not None
@@ -401,16 +392,16 @@ def __repr__(self):
 
     @_cudf_nvtx_annotate
     def __len__(self):
-        return len(range(self._start, self._stop, self._step))
+        return len(self._range)
 
     @_cudf_nvtx_annotate
     def __getitem__(self, index):
         if isinstance(index, slice):
             sl_start, sl_stop, sl_step = index.indices(len(self))
 
-            lo = self._start + sl_start * self._step
-            hi = self._start + sl_stop * self._step
-            st = self._step * sl_step
+            lo = self.start + sl_start * self.step
+            hi = self.start + sl_stop * self.step
+            st = self.step * sl_step
             return RangeIndex(start=lo, stop=hi, step=st, name=self._name)
 
         elif isinstance(index, Number):
@@ -419,18 +410,13 @@ def __getitem__(self, index):
                 index += len_self
             if not (0 <= index < len_self):
                 raise IndexError("Index out of bounds")
-            return self._start + index * self._step
+            return self.start + index * self.step
         return self._as_int_index()[index]
 
     @_cudf_nvtx_annotate
     def equals(self, other):
         if isinstance(other, RangeIndex):
-            if (self._start, self._stop, self._step) == (
-                other._start,
-                other._stop,
-                other._step,
-            ):
-                return True
+            return self._range == other._range
         return self._as_int_index().equals(other)
 
     @_cudf_nvtx_annotate
@@ -442,9 +428,9 @@ def serialize(self):
         # We don't need to store the GPU buffer for RangeIndexes
         # cuDF only needs to store start/stop and rehydrate
         # during de-serialization
-        header["index_column"]["start"] = self._start
-        header["index_column"]["stop"] = self._stop
-        header["index_column"]["step"] = self._step
+        header["index_column"]["start"] = self.start
+        header["index_column"]["stop"] = self.stop
+        header["index_column"]["step"] = self.step
         frames = []
 
         header["name"] = pickle.dumps(self.name)
@@ -484,9 +470,9 @@ def to_pandas(
         elif arrow_type:
             raise NotImplementedError(f"{arrow_type=} is not implemented.")
         return pd.RangeIndex(
-            start=self._start,
-            stop=self._stop,
-            step=self._step,
+            start=self.start,
+            stop=self.stop,
+            step=self.step,
             dtype=self.dtype,
             name=self.name,
         )
@@ -495,19 +481,15 @@ def to_pandas(
     def is_unique(self):
         return True
 
-    @cached_property
-    def as_range(self):
-        return range(self._start, self._stop, self._step)
-
     @cached_property  # type: ignore
     @_cudf_nvtx_annotate
     def is_monotonic_increasing(self):
-        return self._step > 0 or len(self) <= 1
+        return self.step > 0 or len(self) <= 1
 
     @cached_property  # type: ignore
     @_cudf_nvtx_annotate
     def is_monotonic_decreasing(self):
-        return self._step < 0 or len(self) <= 1
+        return self.step < 0 or len(self) <= 1
 
     @_cudf_nvtx_annotate
     def memory_usage(self, deep=False):
@@ -590,12 +572,12 @@ def get_indexer(self, target, limit=None, method=None, tolerance=None):
     def get_loc(self, key):
         if not is_scalar(key):
             raise TypeError("Should be a scalar-like")
-        idx = (key - self._start) / self._step
-        idx_int_upper_bound = (self._stop - self._start) // self._step
+        idx = (key - self.start) / self.step
+        idx_int_upper_bound = (self.stop - self.start) // self.step
         if idx > idx_int_upper_bound or idx < 0:
             raise KeyError(key)
 
-        idx_int = (key - self._start) // self._step
+        idx_int = (key - self.start) // self.step
         if idx_int != idx:
             raise KeyError(key)
         return idx_int
@@ -607,9 +589,9 @@ def _union(self, other, sort=None):
             # following notation: *_o -> other, *_s -> self,
             # and *_r -> result
             start_s, step_s = self.start, self.step
-            end_s = self._end
+            end_s = self.start + self.step * (len(self) - 1)
             start_o, step_o = other.start, other.step
-            end_o = other._end
+            end_o = other.start + other.step * (len(other) - 1)
             if self.step < 0:
                 start_s, step_s, end_s = end_s, -step_s, start_s
             if other.step < 0:
@@ -854,9 +836,7 @@ def argsort(
             raise ValueError(f"invalid na_position: {na_position}")
 
         indices = cupy.arange(0, len(self))
-        if (ascending and self._step < 0) or (
-            not ascending and self._step > 0
-        ):
+        if (ascending and self.step < 0) or (not ascending and self.step > 0):
             indices = indices[::-1]
         return indices
 
diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py
index ebbca57bd40..08a7a9148dd 100644
--- a/python/cudf/cudf/tests/test_index.py
+++ b/python/cudf/cudf/tests/test_index.py
@@ -1606,7 +1606,7 @@ def test_rangeindex_name_not_hashable():
 def test_index_rangeindex_search_range():
     # step > 0
     ridx = RangeIndex(-13, 17, 4)
-    ri = ridx.as_range
+    ri = ridx._range
     for i in range(len(ridx)):
         assert i == search_range(ridx[i], ri, side="left")
         assert i + 1 == search_range(ridx[i], ri, side="right")

From 70a5b2bda500fe46cd14860b4e2ca0109893c434 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 24 Apr 2024 13:40:03 -1000
Subject: [PATCH 108/842] Don't materialize column during RangeIndex methods
 (#15582)

Additionally implements some methods that are defined on `BaseIndex` that were not implemented on `RangeIndex` and adds some typing

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15582
---
 python/cudf/cudf/core/_base_index.py |  10 ++-
 python/cudf/cudf/core/index.py       | 108 +++++++++++++++++----------
 python/cudf/cudf/tests/test_index.py |  23 ++++++
 3 files changed, 100 insertions(+), 41 deletions(-)

diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py
index de44f392eef..b5630ff9a54 100644
--- a/python/cudf/cudf/core/_base_index.py
+++ b/python/cudf/cudf/core/_base_index.py
@@ -517,7 +517,7 @@ def where(self, cond, other=None, inplace=False):
         """
         raise NotImplementedError
 
-    def factorize(self, sort=False, na_sentinel=None, use_na_sentinel=None):
+    def factorize(self, sort: bool = False, use_na_sentinel: bool = True):
         raise NotImplementedError
 
     def union(self, other, sort=None):
@@ -2061,7 +2061,13 @@ def dropna(self, how="any"):
             one null value. "all" drops only rows containing
             *all* null values.
         """
-
+        if how not in {"any", "all"}:
+            raise ValueError(f"{how=} must be 'any' or 'all'")
+        try:
+            if not self.hasnans:
+                return self.copy()
+        except NotImplementedError:
+            pass
         # This is to be consistent with IndexedFrame.dropna to handle nans
         # as nulls by default
         data_columns = [
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index e457e818129..6c0acdc5fb0 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -21,6 +21,7 @@
 import cupy
 import numpy as np
 import pandas as pd
+import pyarrow as pa
 from typing_extensions import Self
 
 import cudf
@@ -248,6 +249,15 @@ def searchsorted(
         ), "Invalid ascending flag"
         return search_range(value, self._range, side=side)
 
+    def factorize(self, sort: bool = False, use_na_sentinel: bool = True):
+        if sort and self.step < 0:
+            codes = cupy.arange(len(self) - 1, -1, -1)
+            uniques = self[::-1]
+        else:
+            codes = cupy.arange(len(self), dtype=np.intp)
+            uniques = self
+        return codes, uniques
+
     @property  # type: ignore
     @_cudf_nvtx_annotate
     def name(self):
@@ -260,7 +270,7 @@ def name(self, value):
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
-    def start(self):
+    def start(self) -> int:
         """
         The value of the `start` parameter (0 if this was not supplied).
         """
@@ -268,7 +278,7 @@ def start(self):
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
-    def stop(self):
+    def stop(self) -> int:
         """
         The value of the stop parameter.
         """
@@ -276,7 +286,7 @@ def stop(self):
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
-    def step(self):
+    def step(self) -> int:
         """
         The value of the step parameter.
         """
@@ -284,7 +294,7 @@ def step(self):
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
-    def _num_rows(self):
+    def _num_rows(self) -> int:
         return len(self)
 
     @cached_property  # type: ignore
@@ -295,33 +305,33 @@ def _values(self):
         else:
             return column.column_empty(0, masked=False, dtype=self.dtype)
 
-    def _clean_nulls_from_index(self):
+    def _clean_nulls_from_index(self) -> Self:
         return self
 
-    def _is_numeric(self):
+    def _is_numeric(self) -> bool:
         return True
 
-    def _is_boolean(self):
+    def _is_boolean(self) -> bool:
         return False
 
-    def _is_integer(self):
+    def _is_integer(self) -> bool:
         return True
 
-    def _is_floating(self):
+    def _is_floating(self) -> bool:
         return False
 
-    def _is_object(self):
+    def _is_object(self) -> bool:
         return False
 
-    def _is_categorical(self):
+    def _is_categorical(self) -> bool:
         return False
 
-    def _is_interval(self):
+    def _is_interval(self) -> bool:
         return False
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
-    def hasnans(self):
+    def hasnans(self) -> bool:
         return False
 
     @property  # type: ignore
@@ -369,12 +379,15 @@ def astype(self, dtype, copy: bool = True):
             return self
         return self._as_int_index().astype(dtype, copy=copy)
 
+    def fillna(self, value, downcast=None):
+        return self.copy()
+
     @_cudf_nvtx_annotate
     def drop_duplicates(self, keep="first"):
         return self
 
     @_cudf_nvtx_annotate
-    def duplicated(self, keep="first"):
+    def duplicated(self, keep="first") -> cupy.ndarray:
         return cupy.zeros(len(self), dtype=bool)
 
     @_cudf_nvtx_annotate
@@ -390,6 +403,11 @@ def __repr__(self):
             + ")"
         )
 
+    @property
+    @_cudf_nvtx_annotate
+    def size(self) -> int:
+        return len(self)
+
     @_cudf_nvtx_annotate
     def __len__(self):
         return len(self._range)
@@ -478,12 +496,12 @@ def to_pandas(
         )
 
     @property
-    def is_unique(self):
+    def is_unique(self) -> bool:
         return True
 
     @cached_property  # type: ignore
     @_cudf_nvtx_annotate
-    def is_monotonic_increasing(self):
+    def is_monotonic_increasing(self) -> bool:
         return self.step > 0 or len(self) <= 1
 
     @cached_property  # type: ignore
@@ -492,7 +510,7 @@ def is_monotonic_decreasing(self):
         return self.step < 0 or len(self) <= 1
 
     @_cudf_nvtx_annotate
-    def memory_usage(self, deep=False):
+    def memory_usage(self, deep: bool = False) -> int:
         if deep:
             warnings.warn(
                 "The deep parameter is ignored and is only included "
@@ -500,7 +518,7 @@ def memory_usage(self, deep=False):
             )
         return 0
 
-    def unique(self):
+    def unique(self) -> Self:
         # RangeIndex always has unique values
         return self
 
@@ -823,34 +841,37 @@ def _columns(self):
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
-    def values_host(self):
-        return self.to_pandas().values
+    def values_host(self) -> np.ndarray:
+        return np.arange(start=self.start, stop=self.stop, step=self.step)
 
     @_cudf_nvtx_annotate
     def argsort(
         self,
         ascending=True,
         na_position="last",
-    ):
+    ) -> cupy.ndarray:
         if na_position not in {"first", "last"}:
             raise ValueError(f"invalid na_position: {na_position}")
-
-        indices = cupy.arange(0, len(self))
         if (ascending and self.step < 0) or (not ascending and self.step > 0):
-            indices = indices[::-1]
-        return indices
+            return cupy.arange(len(self) - 1, -1, -1)
+        else:
+            return cupy.arange(len(self))
 
     @_cudf_nvtx_annotate
     def where(self, cond, other=None, inplace=False):
         return self._as_int_index().where(cond, other, inplace)
 
     @_cudf_nvtx_annotate
-    def to_numpy(self):
+    def to_numpy(self) -> np.ndarray:
         return self.values_host
 
     @_cudf_nvtx_annotate
-    def to_arrow(self):
-        return self._as_int_index().to_arrow()
+    def to_cupy(self) -> cupy.ndarray:
+        return self.values
+
+    @_cudf_nvtx_annotate
+    def to_arrow(self) -> pa.Array:
+        return pa.array(self._range, type=pa.from_numpy_dtype(self.dtype))
 
     def __array__(self, dtype=None):
         raise TypeError(
@@ -861,17 +882,17 @@ def __array__(self, dtype=None):
         )
 
     @_cudf_nvtx_annotate
-    def nunique(self):
+    def nunique(self) -> int:
         return len(self)
 
     @_cudf_nvtx_annotate
-    def isna(self):
+    def isna(self) -> cupy.ndarray:
         return cupy.zeros(len(self), dtype=bool)
 
     isnull = isna
 
     @_cudf_nvtx_annotate
-    def notna(self):
+    def notna(self) -> cupy.ndarray:
         return cupy.ones(len(self), dtype=bool)
 
     notnull = isna
@@ -895,12 +916,15 @@ def max(self):
         return self._minmax("max")
 
     @property
-    def values(self):
+    def values(self) -> cupy.ndarray:
         return cupy.arange(self.start, self.stop, self.step)
 
-    def any(self):
+    def any(self) -> bool:
         return any(self._range)
 
+    def all(self) -> bool:
+        return 0 not in self._range
+
     def append(self, other):
         result = self._as_int_index().append(other)
         return self._try_reconstruct_range_index(result)
@@ -926,14 +950,20 @@ def isin(self, values):
 
         return self._values.isin(values).values
 
-    def __neg__(self):
-        return -self._as_int_index()
+    def __pos__(self) -> Self:
+        return self.copy()
 
-    def __pos__(self):
-        return +self._as_int_index()
+    def __neg__(self) -> Self:
+        rng = range(-self.start, -self.stop, -self.step)
+        return type(self)(rng, name=self.name)
 
-    def __abs__(self):
-        return abs(self._as_int_index())
+    def __abs__(self) -> Self | Index:
+        if len(self) == 0 or self.min() >= 0:
+            return self.copy()
+        elif self.max() <= 0:
+            return -self
+        else:
+            return abs(self._as_int_index())
 
     @_warn_no_dask_cudf
     def __dask_tokenize__(self):
diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py
index 08a7a9148dd..c7875b81440 100644
--- a/python/cudf/cudf/tests/test_index.py
+++ b/python/cudf/cudf/tests/test_index.py
@@ -3176,3 +3176,26 @@ def test_index_to_pandas_arrow_type(scalar):
     result = idx.to_pandas(arrow_type=True)
     expected = pd.Index(pd.arrays.ArrowExtensionArray(pa_array))
     pd.testing.assert_index_equal(result, expected)
+
+
+@pytest.mark.parametrize("data", [range(-3, 3), range(1, 3), range(0)])
+def test_rangeindex_all(data):
+    result = cudf.RangeIndex(data).all()
+    expected = cudf.Index(list(data)).all()
+    assert result == expected
+
+
+@pytest.mark.parametrize("sort", [True, False])
+@pytest.mark.parametrize("data", [range(2), range(2, -1, -1)])
+def test_rangeindex_factorize(sort, data):
+    res_codes, res_uniques = cudf.RangeIndex(data).factorize(sort=sort)
+    exp_codes, exp_uniques = cudf.Index(list(data)).factorize(sort=sort)
+    assert_eq(res_codes, exp_codes)
+    assert_eq(res_uniques, exp_uniques)
+
+
+def test_rangeindex_dropna():
+    ri = cudf.RangeIndex(range(2))
+    result = ri.dropna()
+    expected = ri.copy()
+    assert_eq(result, expected)

From 4dc9ebbfe5b2a22949c5f24114918e4369d055cd Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 25 Apr 2024 08:53:11 -0400
Subject: [PATCH 109/842] Improve performance for cudf::strings::count_re
 (#15578)

Improves performance of `cudf::strings::count_re` when pattern starts with a literal character.
Although this is a specific use case, the regex code has special logic to help speed up the search in this case.

Since the pattern indicates the target must contain this character as the start of the matching sequence, it first does a normal find for the character before continuing matching the remaining pattern. The `find()` function can be inefficient for long strings since it is character based and must resolve the character's byte-position by counting from the beginning of the string. For a function like `count_re()` all occurrences are matched within a target meaning longer target strings can incur expensive counting.

The solution included here is to introduce a more efficient `find_char()` utility that accepts a `string_view::const_iterator()` which automatically keeps track of its byte and character positions. This helps minimize byte/character counting in between calls from `count_re()` and other similar functions that make repeated calls for all matches (e.g. `replace_re()` and `split_re()`).

Close #15567

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/15578
---
 cpp/benchmarks/string/contains.cpp |  4 ++--
 cpp/benchmarks/string/count.cpp    | 12 ++++++++----
 cpp/src/strings/regex/regex.inl    | 19 ++++++++++++++-----
 3 files changed, 24 insertions(+), 11 deletions(-)

diff --git a/cpp/benchmarks/string/contains.cpp b/cpp/benchmarks/string/contains.cpp
index 6d839c1de64..ae6c8b844c8 100644
--- a/cpp/benchmarks/string/contains.cpp
+++ b/cpp/benchmarks/string/contains.cpp
@@ -80,7 +80,7 @@ std::unique_ptr<cudf::column> build_input_column(cudf::size_type n_rows,
 }
 
 // longer pattern lengths demand more working memory per string
-std::string patterns[] = {"^\\d+ [a-z]+", "[A-Z ]+\\d+ +\\d+[A-Z]+\\d+$"};
+std::string patterns[] = {"^\\d+ [a-z]+", "[A-Z ]+\\d+ +\\d+[A-Z]+\\d+$", "5W43"};
 
 static void bench_contains(nvbench::state& state)
 {
@@ -114,4 +114,4 @@ NVBENCH_BENCH(bench_contains)
   .add_int64_axis("row_width", {32, 64, 128, 256, 512})
   .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216})
   .add_int64_axis("hit_rate", {50, 100})  // percentage
-  .add_int64_axis("pattern", {0, 1});
+  .add_int64_axis("pattern", {0, 1, 2});
diff --git a/cpp/benchmarks/string/count.cpp b/cpp/benchmarks/string/count.cpp
index a656010dca5..f964bc5d224 100644
--- a/cpp/benchmarks/string/count.cpp
+++ b/cpp/benchmarks/string/count.cpp
@@ -25,10 +25,13 @@
 
 #include <nvbench/nvbench.cuh>
 
+static std::string patterns[] = {"\\d+", "a"};
+
 static void bench_count(nvbench::state& state)
 {
-  auto const num_rows  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
-  auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
+  auto const num_rows      = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const row_width     = static_cast<cudf::size_type>(state.get_int64("row_width"));
+  auto const pattern_index = static_cast<cudf::size_type>(state.get_int64("pattern"));
 
   if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
       static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
@@ -41,7 +44,7 @@ static void bench_count(nvbench::state& state)
     create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile);
   cudf::strings_column_view input(table->view().column(0));
 
-  std::string pattern = "\\d+";
+  auto const pattern = patterns[pattern_index];
 
   auto prog = cudf::strings::regex_program::create(pattern);
 
@@ -59,4 +62,5 @@ static void bench_count(nvbench::state& state)
 NVBENCH_BENCH(bench_count)
   .set_name("count")
   .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048})
-  .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216});
+  .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216})
+  .add_int64_axis("pattern", {0, 1});
diff --git a/cpp/src/strings/regex/regex.inl b/cpp/src/strings/regex/regex.inl
index ce12dc17aa4..10e06505094 100644
--- a/cpp/src/strings/regex/regex.inl
+++ b/cpp/src/strings/regex/regex.inl
@@ -217,6 +217,15 @@ __device__ __forceinline__ reprog_device reprog_device::load(reprog_device const
                                             : reinterpret_cast<reprog_device*>(buffer)[0];
 }
 
+__device__ __forceinline__ static string_view::const_iterator find_char(
+  cudf::char_utf8 chr, string_view const d_str, string_view::const_iterator itr)
+{
+  while (itr.byte_offset() < d_str.size_bytes() && *itr != chr) {
+    ++itr;
+  }
+  return itr;
+}
+
 /**
  * @brief Evaluate a specific string against regex pattern compiled to this instance.
  *
@@ -253,16 +262,16 @@ __device__ __forceinline__ match_result reprog_device::regexec(string_view const
         case BOL:
           if (pos == 0) break;
           if (jnk.startchar != '^') { return thrust::nullopt; }
-          --pos;
+          --itr;
           startchar = static_cast<char_utf8>('\n');
         case CHAR: {
-          auto const fidx = dstr.find(startchar, pos);
-          if (fidx == string_view::npos) { return thrust::nullopt; }
-          pos = fidx + (jnk.starttype == BOL);
+          auto const find_itr = find_char(startchar, dstr, itr);
+          if (find_itr.byte_offset() >= dstr.size_bytes()) { return thrust::nullopt; }
+          itr = find_itr + (jnk.starttype == BOL);
+          pos = itr.position();
           break;
         }
       }
-      itr += (pos - itr.position());  // faster to increment position
     }
 
     if (((eos < 0) || (pos < eos)) && match == 0) {

From 65c2b53602d70f7f50c7dd7544ca0fd07ac8b455 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 25 Apr 2024 15:12:01 -0400
Subject: [PATCH 110/842] Fix debug warnings/errors in
 from_arrow_device_test.cpp (#15596)

Fixes debug build errors introduced by #15458

These warnings show up in a debug build where warnings become errors.
Some of the errors:
```
/cudf/cpp/tests/interop/from_arrow_device_test.cpp:103:27: error: ignoring return value of 'ArrowErrorCode cudfArrowSchemaSetTypeStruct(ArrowSchema*, int64_t)' declared with attribute 'warn_unused_result' [-Werror=unused-result]
  103 |   ArrowSchemaSetTypeStruct(input_schema.get(), 1);
/cudf/cpp/tests/interop/from_arrow_device_test.cpp:105:29: error: ignoring return value of 'ArrowErrorCode cudfArrowSchemaSetTypeDateTime(ArrowSchema*, ArrowType, ArrowTimeUnit, const char*)' declared with attribute 'warn_unused_result' [-Werror=unused-result]
  105 |   ArrowSchemaSetTypeDateTime(
/cudf/cpp/tests/interop/from_arrow_device_test.cpp:107:21: error: ignoring return value of 'ArrowErrorCode cudfArrowSchemaSetName(ArrowSchema*, const char*)' declared with attribute 'warn_unused_result' [-Werror=unused-result]
  107 |   ArrowSchemaSetName(input_schema->children[0], "a");
/cudf/cpp/tests/interop/from_arrow_device_test.cpp:110:27: error: ignoring return value of 'ArrowErrorCode cudfArrowArrayInitFromSchema(ArrowArray*, const ArrowSchema*, ArrowError*)' declared with attribute 'warn_unused_result' [-Werror=unused-result]
  110 |   ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr);
/cudf/cpp/tests/interop/from_arrow_device_test.cpp:115:26: error: ignoring return value of 'ArrowErrorCode ArrowBufferSetAllocator(ArrowBuffer*, ArrowBufferAllocator)' declared with attribute 'warn_unused_result' [-Werror=unused-result]
  115 |   ArrowBufferSetAllocator(ArrowArrayBuffer(input_array->children[0], 1), noop_alloc);
      |   ~~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
/cudf/cpp/tests/interop/from_arrow_device_test.cpp:118:27: error: ignoring return value of 'ArrowErrorCode cudfArrowArrayFinishBuilding(ArrowArray*, ArrowValidationLevel, ArrowError*)' declared with attribute 'warn_unused_result' [-Werror=unused-result]
  118 |   ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_MINIMAL, nullptr);
/cudf/cpp/tests/interop/from_arrow_device_test.cpp: In member function 'virtual void FromArrowDeviceTest_NestedList_Test::TestBody()':
/cudf/cpp/tests/interop/from_arrow_device_test.cpp:202:27: error: ignoring return value of 'ArrowErrorCode cudfArrowSchemaSetTypeStruct(ArrowSchema*, int64_t)' declared with attribute 'warn_unused_result' [-Werror=unused-result]
  202 |   ArrowSchemaSetTypeStruct(input_schema.get(), 1);
/cudf/cpp/tests/interop/from_arrow_device_test.cpp:204:26: error: ignoring return value of 'ArrowErrorCode cudfArrowSchemaInitFromType(ArrowSchema*, ArrowType)' declared with attribute 'warn_unused_result' [-Werror=unused-result]
  204 |   ArrowSchemaInitFromType(input_schema->children[0], NANOARROW_TYPE_LIST);
/cudf/cpp/tests/interop/from_arrow_device_test.cpp:205:21: error: ignoring return value of 'ArrowErrorCode cudfArrowSchemaSetName(ArrowSchema*, const char*)' declared with attribute 'warn_unused_result' [-Werror=unused-result]
  205 |   ArrowSchemaSetName(input_schema->children[0], "a");
/cudf/cpp/tests/interop/from_arrow_device_test.cpp:208:26: error: ignoring return value of 'ArrowErrorCode cudfArrowSchemaInitFromType(ArrowSchema*, ArrowType)' declared with attribute 'warn_unused_result' [-Werror=unused-result]
  208 |   ArrowSchemaInitFromType(input_schema->children[0]->children[0], NANOARROW_TYPE_LIST);
/cudf/cpp/tests/interop/from_arrow_device_test.cpp:209:21: error: ignoring return value of 'ArrowErrorCode cudfArrowSchemaSetName(ArrowSchema*, const char*)' declared with attribute 'warn_unused_result' [-Werror=unused-result]
  209 |   ArrowSchemaSetName(input_schema->children[0]->children[0], "element");
/cudf/cpp/tests/interop/from_arrow_device_test.cpp:212:26: error: ignoring return value of 'ArrowErrorCode cudfArrowSchemaInitFromType(ArrowSchema*, ArrowType)' declared with attribute 'warn_unused_result' [-Werror=unused-result]
  212 |   ArrowSchemaInitFromType(input_schema->children[0]->children[0]->children[0],
/cudf/cpp/tests/interop/from_arrow_device_test.cpp:214:21: error: ignoring return value of 'ArrowErrorCode cudfArrowSchemaSetName(ArrowSchema*, const char*)' declared with attribute 'warn_unused_result' [-Werror=unused-result]
  214 |   ArrowSchemaSetName(input_schema->children[0]->children[0]->children[0], "element");
/cudf/cpp/tests/interop/from_arrow_device_test.cpp:226:27: error: ignoring return value of 'ArrowErrorCode cudfArrowArrayFinishBuilding(ArrowArray*, ArrowValidationLevel, ArrowError*)' declared with attribute 'warn_unused_result' [-Werror=unused-result]
  226 |   ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr);
/cudf/cpp/tests/interop/from_arrow_device_test.cpp: In member function 'virtual void FromArrowDeviceTest_StructColumn_Test::TestBody()':

```

Closes #15597

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: https://github.com/rapidsai/cudf/pull/15596
---
 cpp/tests/interop/from_arrow_device_test.cpp | 229 +++++++++++--------
 cpp/tests/interop/nanoarrow_utils.hpp        |   4 +-
 cpp/tests/interop/to_arrow_device_test.cpp   |   7 +-
 3 files changed, 135 insertions(+), 105 deletions(-)

diff --git a/cpp/tests/interop/from_arrow_device_test.cpp b/cpp/tests/interop/from_arrow_device_test.cpp
index 95cbe8057d1..66bd4dd1bfb 100644
--- a/cpp/tests/interop/from_arrow_device_test.cpp
+++ b/cpp/tests/interop/from_arrow_device_test.cpp
@@ -100,22 +100,26 @@ TEST_F(FromArrowDeviceTest, DateTimeTable)
 
   nanoarrow::UniqueSchema input_schema;
   ArrowSchemaInit(input_schema.get());
-  ArrowSchemaSetTypeStruct(input_schema.get(), 1);
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema.get(), 1));
   ArrowSchemaInit(input_schema->children[0]);
-  ArrowSchemaSetTypeDateTime(
-    input_schema->children[0], NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_MILLI, nullptr);
-  ArrowSchemaSetName(input_schema->children[0], "a");
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDateTime(
+    input_schema->children[0], NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_MILLI, nullptr));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0], "a"));
 
   nanoarrow::UniqueArray input_array;
-  ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr);
+  NANOARROW_THROW_NOT_OK(ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr));
   input_array->length                  = 6;
   input_array->null_count              = 0;
   input_array->children[0]->length     = 6;
   input_array->children[0]->null_count = 0;
-  ArrowBufferSetAllocator(ArrowArrayBuffer(input_array->children[0], 1), noop_alloc);
+  NANOARROW_THROW_NOT_OK(
+    ArrowBufferSetAllocator(ArrowArrayBuffer(input_array->children[0], 1), noop_alloc));
   ArrowArrayBuffer(input_array->children[0], 1)->data =
     const_cast<uint8_t*>(cudf::column_view(col).data<uint8_t>());
-  ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_MINIMAL, nullptr);
+  ArrowArrayBuffer(input_array->children[0], 1)->size_bytes =
+    sizeof(int64_t) * cudf::column_view(col).size();
+  NANOARROW_THROW_NOT_OK(
+    ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_MINIMAL, nullptr));
 
   ArrowDeviceArray input_device_array;
   input_device_array.device_id   = rmm::get_current_cuda_device().value();
@@ -155,23 +159,27 @@ TYPED_TEST(FromArrowDeviceTestDurationsTest, DurationTable)
 
   nanoarrow::UniqueSchema input_schema;
   ArrowSchemaInit(input_schema.get());
-  ArrowSchemaSetTypeStruct(input_schema.get(), 1);
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema.get(), 1));
 
   ArrowSchemaInit(input_schema->children[0]);
-  ArrowSchemaSetTypeDateTime(
-    input_schema->children[0], NANOARROW_TYPE_DURATION, time_unit, nullptr);
-  ArrowSchemaSetName(input_schema->children[0], "a");
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDateTime(
+    input_schema->children[0], NANOARROW_TYPE_DURATION, time_unit, nullptr));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0], "a"));
 
-  auto data_ptr = expected_table_view.column(0).data<uint8_t>();
+  auto data_ptr  = expected_table_view.column(0).data<uint8_t>();
+  auto data_size = expected_table_view.column(0).size();
   nanoarrow::UniqueArray input_array;
-  ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr);
+  NANOARROW_THROW_NOT_OK(ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr));
   input_array->length                  = expected_table_view.num_rows();
   input_array->null_count              = 0;
   input_array->children[0]->length     = expected_table_view.num_rows();
   input_array->children[0]->null_count = 0;
-  ArrowBufferSetAllocator(ArrowArrayBuffer(input_array->children[0], 1), noop_alloc);
-  ArrowArrayBuffer(input_array->children[0], 1)->data = const_cast<uint8_t*>(data_ptr);
-  ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_MINIMAL, nullptr);
+  NANOARROW_THROW_NOT_OK(
+    ArrowBufferSetAllocator(ArrowArrayBuffer(input_array->children[0], 1), noop_alloc));
+  ArrowArrayBuffer(input_array->children[0], 1)->data       = const_cast<uint8_t*>(data_ptr);
+  ArrowArrayBuffer(input_array->children[0], 1)->size_bytes = sizeof(T) * data_size;
+  NANOARROW_THROW_NOT_OK(
+    ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_MINIMAL, nullptr));
 
   ArrowDeviceArray input_device_array;
   input_device_array.device_id   = rmm::get_current_cuda_device().value();
@@ -199,19 +207,21 @@ TEST_F(FromArrowDeviceTest, NestedList)
 
   nanoarrow::UniqueSchema input_schema;
   ArrowSchemaInit(input_schema.get());
-  ArrowSchemaSetTypeStruct(input_schema.get(), 1);
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema.get(), 1));
 
-  ArrowSchemaInitFromType(input_schema->children[0], NANOARROW_TYPE_LIST);
-  ArrowSchemaSetName(input_schema->children[0], "a");
+  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(input_schema->children[0], NANOARROW_TYPE_LIST));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0], "a"));
   input_schema->children[0]->flags = ARROW_FLAG_NULLABLE;
 
-  ArrowSchemaInitFromType(input_schema->children[0]->children[0], NANOARROW_TYPE_LIST);
-  ArrowSchemaSetName(input_schema->children[0]->children[0], "element");
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaInitFromType(input_schema->children[0]->children[0], NANOARROW_TYPE_LIST));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0]->children[0], "element"));
   input_schema->children[0]->children[0]->flags = 0;
 
-  ArrowSchemaInitFromType(input_schema->children[0]->children[0]->children[0],
-                          NANOARROW_TYPE_INT64);
-  ArrowSchemaSetName(input_schema->children[0]->children[0]->children[0], "element");
+  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(
+    input_schema->children[0]->children[0]->children[0], NANOARROW_TYPE_INT64));
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaSetName(input_schema->children[0]->children[0]->children[0], "element"));
   input_schema->children[0]->children[0]->children[0]->flags = ARROW_FLAG_NULLABLE;
 
   nanoarrow::UniqueArray input_array;
@@ -223,7 +233,8 @@ TEST_F(FromArrowDeviceTest, NestedList)
   cudf::lists_column_view nested_view{lview.child()};
   populate_list_from_col(top_list->children[0], nested_view);
   populate_from_col<int64_t>(top_list->children[0]->children[0], nested_view.child());
-  ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr);
+  NANOARROW_THROW_NOT_OK(
+    ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr));
 
   ArrowDeviceArray input_device_array;
   input_device_array.device_id   = rmm::get_current_cuda_device().value();
@@ -287,47 +298,52 @@ TEST_F(FromArrowDeviceTest, StructColumn)
 
   nanoarrow::UniqueSchema input_schema;
   ArrowSchemaInit(input_schema.get());
-  ArrowSchemaSetTypeStruct(input_schema.get(), 1);
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema.get(), 1));
 
   ArrowSchemaInit(input_schema->children[0]);
-  ArrowSchemaSetTypeStruct(input_schema->children[0], 5);
-  ArrowSchemaSetName(input_schema->children[0], "a");
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema->children[0], 5));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0], "a"));
   input_schema->children[0]->flags = 0;
 
   auto child = input_schema->children[0];
-  ArrowSchemaInitFromType(child->children[0], NANOARROW_TYPE_STRING);
-  ArrowSchemaSetName(child->children[0], "string");
+  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(child->children[0], NANOARROW_TYPE_STRING));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[0], "string"));
   child->children[0]->flags = 0;
 
-  ArrowSchemaInitFromType(child->children[1], NANOARROW_TYPE_INT32);
-  ArrowSchemaSetName(child->children[1], "integral");
+  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(child->children[1], NANOARROW_TYPE_INT32));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[1], "integral"));
   child->children[1]->flags = 0;
 
-  ArrowSchemaInitFromType(child->children[2], NANOARROW_TYPE_BOOL);
-  ArrowSchemaSetName(child->children[2], "bool");
+  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(child->children[2], NANOARROW_TYPE_BOOL));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[2], "bool"));
   child->children[2]->flags = 0;
 
-  ArrowSchemaInitFromType(child->children[3], NANOARROW_TYPE_LIST);
-  ArrowSchemaSetName(child->children[3], "nested_list");
+  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(child->children[3], NANOARROW_TYPE_LIST));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[3], "nested_list"));
   child->children[3]->flags = 0;
-  ArrowSchemaInitFromType(child->children[3]->children[0], NANOARROW_TYPE_LIST);
-  ArrowSchemaSetName(child->children[3]->children[0], "element");
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaInitFromType(child->children[3]->children[0], NANOARROW_TYPE_LIST));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[3]->children[0], "element"));
   child->children[3]->children[0]->flags = 0;
-  ArrowSchemaInitFromType(child->children[3]->children[0]->children[0], NANOARROW_TYPE_INT64);
-  ArrowSchemaSetName(child->children[3]->children[0]->children[0], "element");
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaInitFromType(child->children[3]->children[0]->children[0], NANOARROW_TYPE_INT64));
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaSetName(child->children[3]->children[0]->children[0], "element"));
   child->children[3]->children[0]->children[0]->flags = 0;
 
   ArrowSchemaInit(child->children[4]);
-  ArrowSchemaSetTypeStruct(child->children[4], 2);
-  ArrowSchemaSetName(child->children[4], "struct");
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(child->children[4], 2));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[4], "struct"));
 
-  ArrowSchemaInitFromType(child->children[4]->children[0], NANOARROW_TYPE_STRING);
-  ArrowSchemaSetName(child->children[4]->children[0], "string2");
-  ArrowSchemaInitFromType(child->children[4]->children[1], NANOARROW_TYPE_INT32);
-  ArrowSchemaSetName(child->children[4]->children[1], "integral2");
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaInitFromType(child->children[4]->children[0], NANOARROW_TYPE_STRING));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[4]->children[0], "string2"));
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaInitFromType(child->children[4]->children[1], NANOARROW_TYPE_INT32));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[4]->children[1], "integral2"));
 
   nanoarrow::UniqueArray input_array;
-  ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr);
+  NANOARROW_THROW_NOT_OK(ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr));
 
   input_array->length = expected_table_view.num_rows();
 
@@ -336,7 +352,7 @@ TEST_F(FromArrowDeviceTest, StructColumn)
   array_a->length     = view_a.size();
   array_a->null_count = view_a.null_count();
 
-  ArrowBufferSetAllocator(ArrowArrayBuffer(array_a, 0), noop_alloc);
+  NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator(ArrowArrayBuffer(array_a, 0), noop_alloc));
   ArrowArrayValidityBitmap(array_a)->buffer.data =
     const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(view_a.null_mask()));
 
@@ -354,14 +370,15 @@ TEST_F(FromArrowDeviceTest, StructColumn)
   array_struct->length     = view_struct.size();
   array_struct->null_count = view_struct.null_count();
 
-  ArrowBufferSetAllocator(ArrowArrayBuffer(array_struct, 0), noop_alloc);
+  NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator(ArrowArrayBuffer(array_struct, 0), noop_alloc));
   ArrowArrayValidityBitmap(array_struct)->buffer.data =
     const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(view_struct.null_mask()));
 
   populate_from_col<cudf::string_view>(array_struct->children[0], view_struct.child(0));
   populate_from_col<int32_t>(array_struct->children[1], view_struct.child(1));
 
-  ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr);
+  NANOARROW_THROW_NOT_OK(
+    ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr));
 
   ArrowDeviceArray input_device_array;
   input_device_array.device_id   = rmm::get_current_cuda_device().value();
@@ -406,25 +423,28 @@ TEST_F(FromArrowDeviceTest, DictionaryIndicesType)
 
   nanoarrow::UniqueSchema input_schema;
   ArrowSchemaInit(input_schema.get());
-  ArrowSchemaSetTypeStruct(input_schema.get(), 3);
-
-  ArrowSchemaInitFromType(input_schema->children[0], NANOARROW_TYPE_INT8);
-  ArrowSchemaSetName(input_schema->children[0], "a");
-  ArrowSchemaAllocateDictionary(input_schema->children[0]);
-  ArrowSchemaInitFromType(input_schema->children[0]->dictionary, NANOARROW_TYPE_INT64);
-
-  ArrowSchemaInitFromType(input_schema->children[1], NANOARROW_TYPE_INT16);
-  ArrowSchemaSetName(input_schema->children[1], "b");
-  ArrowSchemaAllocateDictionary(input_schema->children[1]);
-  ArrowSchemaInitFromType(input_schema->children[1]->dictionary, NANOARROW_TYPE_INT64);
-
-  ArrowSchemaInitFromType(input_schema->children[2], NANOARROW_TYPE_INT64);
-  ArrowSchemaSetName(input_schema->children[2], "c");
-  ArrowSchemaAllocateDictionary(input_schema->children[2]);
-  ArrowSchemaInitFromType(input_schema->children[2]->dictionary, NANOARROW_TYPE_INT64);
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema.get(), 3));
+
+  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(input_schema->children[0], NANOARROW_TYPE_INT8));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0], "a"));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaAllocateDictionary(input_schema->children[0]));
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaInitFromType(input_schema->children[0]->dictionary, NANOARROW_TYPE_INT64));
+
+  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(input_schema->children[1], NANOARROW_TYPE_INT16));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[1], "b"));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaAllocateDictionary(input_schema->children[1]));
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaInitFromType(input_schema->children[1]->dictionary, NANOARROW_TYPE_INT64));
+
+  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(input_schema->children[2], NANOARROW_TYPE_INT64));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[2], "c"));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaAllocateDictionary(input_schema->children[2]));
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaInitFromType(input_schema->children[2]->dictionary, NANOARROW_TYPE_INT64));
 
   nanoarrow::UniqueArray input_array;
-  ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr);
+  NANOARROW_THROW_NOT_OK(ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr));
   input_array->length     = expected_table.num_rows();
   input_array->null_count = 0;
 
@@ -446,7 +466,8 @@ TEST_F(FromArrowDeviceTest, DictionaryIndicesType)
   populate_from_col<int64_t>(input_array->children[2]->dictionary,
                              cudf::dictionary_column_view{expected_table_view.column(2)}.keys());
 
-  ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr);
+  NANOARROW_THROW_NOT_OK(
+    ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr));
 
   ArrowDeviceArray input_device_array;
   input_device_array.device_id   = rmm::get_current_cuda_device().value();
@@ -562,20 +583,22 @@ TEST_F(FromArrowDeviceTest, FixedPoint128Table)
 
     nanoarrow::UniqueSchema input_schema;
     ArrowSchemaInit(input_schema.get());
-    ArrowSchemaSetTypeStruct(input_schema.get(), 1);
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema.get(), 1));
     ArrowSchemaInit(input_schema->children[0]);
-    ArrowSchemaSetTypeDecimal(input_schema->children[0],
-                              NANOARROW_TYPE_DECIMAL128,
-                              cudf::detail::max_precision<__int128_t>(),
-                              -scale);
-    ArrowSchemaSetName(input_schema->children[0], "a");
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDecimal(input_schema->children[0],
+                                                     NANOARROW_TYPE_DECIMAL128,
+                                                     cudf::detail::max_precision<__int128_t>(),
+                                                     -scale));
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0], "a"));
 
     nanoarrow::UniqueArray input_array;
-    ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr);
+    NANOARROW_THROW_NOT_OK(
+      ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr));
     input_array->length = expected.num_rows();
 
     populate_from_col<__int128_t>(input_array->children[0], expected.column(0));
-    ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr);
+    NANOARROW_THROW_NOT_OK(
+      ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr));
 
     ArrowDeviceArray input_device_array;
     input_device_array.device_id   = rmm::get_current_cuda_device().value();
@@ -607,20 +630,22 @@ TEST_F(FromArrowDeviceTest, FixedPoint128TableLarge)
 
     nanoarrow::UniqueSchema input_schema;
     ArrowSchemaInit(input_schema.get());
-    ArrowSchemaSetTypeStruct(input_schema.get(), 1);
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema.get(), 1));
     ArrowSchemaInit(input_schema->children[0]);
-    ArrowSchemaSetTypeDecimal(input_schema->children[0],
-                              NANOARROW_TYPE_DECIMAL128,
-                              cudf::detail::max_precision<__int128_t>(),
-                              -scale);
-    ArrowSchemaSetName(input_schema->children[0], "a");
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDecimal(input_schema->children[0],
+                                                     NANOARROW_TYPE_DECIMAL128,
+                                                     cudf::detail::max_precision<__int128_t>(),
+                                                     -scale));
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0], "a"));
 
     nanoarrow::UniqueArray input_array;
-    ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr);
+    NANOARROW_THROW_NOT_OK(
+      ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr));
     input_array->length = expected.num_rows();
 
     populate_from_col<__int128_t>(input_array->children[0], expected.column(0));
-    ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr);
+    NANOARROW_THROW_NOT_OK(
+      ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr));
 
     ArrowDeviceArray input_device_array;
     input_device_array.device_id   = rmm::get_current_cuda_device().value();
@@ -652,20 +677,22 @@ TEST_F(FromArrowDeviceTest, FixedPoint128TableNulls)
 
     nanoarrow::UniqueSchema input_schema;
     ArrowSchemaInit(input_schema.get());
-    ArrowSchemaSetTypeStruct(input_schema.get(), 1);
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema.get(), 1));
     ArrowSchemaInit(input_schema->children[0]);
-    ArrowSchemaSetTypeDecimal(input_schema->children[0],
-                              NANOARROW_TYPE_DECIMAL128,
-                              cudf::detail::max_precision<__int128_t>(),
-                              -scale);
-    ArrowSchemaSetName(input_schema->children[0], "a");
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDecimal(input_schema->children[0],
+                                                     NANOARROW_TYPE_DECIMAL128,
+                                                     cudf::detail::max_precision<__int128_t>(),
+                                                     -scale));
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0], "a"));
 
     nanoarrow::UniqueArray input_array;
-    ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr);
+    NANOARROW_THROW_NOT_OK(
+      ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr));
     input_array->length = expected.num_rows();
 
     populate_from_col<__int128_t>(input_array->children[0], expected.column(0));
-    ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr);
+    NANOARROW_THROW_NOT_OK(
+      ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr));
 
     ArrowDeviceArray input_device_array;
     input_device_array.device_id   = rmm::get_current_cuda_device().value();
@@ -699,20 +726,22 @@ TEST_F(FromArrowDeviceTest, FixedPoint128TableNullsLarge)
 
     nanoarrow::UniqueSchema input_schema;
     ArrowSchemaInit(input_schema.get());
-    ArrowSchemaSetTypeStruct(input_schema.get(), 1);
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema.get(), 1));
     ArrowSchemaInit(input_schema->children[0]);
-    ArrowSchemaSetTypeDecimal(input_schema->children[0],
-                              NANOARROW_TYPE_DECIMAL128,
-                              cudf::detail::max_precision<__int128_t>(),
-                              -scale);
-    ArrowSchemaSetName(input_schema->children[0], "a");
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDecimal(input_schema->children[0],
+                                                     NANOARROW_TYPE_DECIMAL128,
+                                                     cudf::detail::max_precision<__int128_t>(),
+                                                     -scale));
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0], "a"));
 
     nanoarrow::UniqueArray input_array;
-    ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr);
+    NANOARROW_THROW_NOT_OK(
+      ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr));
     input_array->length = expected.num_rows();
 
     populate_from_col<__int128_t>(input_array->children[0], expected.column(0));
-    ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr);
+    NANOARROW_THROW_NOT_OK(
+      ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr));
 
     ArrowDeviceArray input_device_array;
     input_device_array.device_id   = rmm::get_current_cuda_device().value();
diff --git a/cpp/tests/interop/nanoarrow_utils.hpp b/cpp/tests/interop/nanoarrow_utils.hpp
index b795bafed97..fb5d1060f6f 100644
--- a/cpp/tests/interop/nanoarrow_utils.hpp
+++ b/cpp/tests/interop/nanoarrow_utils.hpp
@@ -122,13 +122,13 @@ void populate_dict_from_col(ArrowArray* arr, cudf::dictionary_column_view dview)
 {
   arr->length     = dview.size();
   arr->null_count = dview.null_count();
-  ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 0), noop_alloc);
+  NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 0), noop_alloc));
   ArrowArrayValidityBitmap(arr)->buffer.size_bytes =
     cudf::bitmask_allocation_size_bytes(dview.size());
   ArrowArrayValidityBitmap(arr)->buffer.data =
     const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(dview.null_mask()));
 
-  ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 1), noop_alloc);
+  NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 1), noop_alloc));
   ArrowArrayBuffer(arr, 1)->size_bytes = sizeof(IND_TYPE) * dview.indices().size();
   ArrowArrayBuffer(arr, 1)->data       = const_cast<uint8_t*>(dview.indices().data<uint8_t>());
 
diff --git a/cpp/tests/interop/to_arrow_device_test.cpp b/cpp/tests/interop/to_arrow_device_test.cpp
index fb346dad538..626aeb53cdd 100644
--- a/cpp/tests/interop/to_arrow_device_test.cpp
+++ b/cpp/tests/interop/to_arrow_device_test.cpp
@@ -217,7 +217,8 @@ get_nanoarrow_tables(cudf::size_type length)
   populate_from_col<cudf::string_view>(arrow->children[5]->children[1], struct_view.child(1));
   arrow->children[5]->length     = struct_view.size();
   arrow->children[5]->null_count = struct_view.null_count();
-  ArrowBufferSetAllocator(ArrowArrayBuffer(arrow->children[5], 0), noop_alloc);
+  NANOARROW_THROW_NOT_OK(
+    ArrowBufferSetAllocator(ArrowArrayBuffer(arrow->children[5], 0), noop_alloc));
   ArrowArrayValidityBitmap(arrow->children[5])->buffer.size_bytes =
     cudf::bitmask_allocation_size_bytes(struct_view.size());
   ArrowArrayValidityBitmap(arrow->children[5])->buffer.data =
@@ -241,13 +242,13 @@ void populate_list_from_col(ArrowArray* arr, cudf::lists_column_view view)
   arr->length     = view.size();
   arr->null_count = view.null_count();
 
-  ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 0), noop_alloc);
+  NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 0), noop_alloc));
   ArrowArrayValidityBitmap(arr)->buffer.size_bytes =
     cudf::bitmask_allocation_size_bytes(view.size());
   ArrowArrayValidityBitmap(arr)->buffer.data =
     const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(view.null_mask()));
 
-  ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 1), noop_alloc);
+  NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 1), noop_alloc));
   ArrowArrayBuffer(arr, 1)->size_bytes = sizeof(int32_t) * view.offsets().size();
   ArrowArrayBuffer(arr, 1)->data       = const_cast<uint8_t*>(view.offsets().data<uint8_t>());
 }

From c62c5f69ca5036d69188ab8e43ac2ab5276d6cfa Mon Sep 17 00:00:00 2001
From: "Robert (Bobby) Evans" <bobby@apache.org>
Date: Fri, 26 Apr 2024 04:02:25 -0500
Subject: [PATCH 111/842] Fix a JNI bug in JSON parsing fixup (#15550)

When parsing JSON in the current code if no columns can be parsed out of the data, then an empty table is returned. Earlier we put in a work around to this so that we could pass in the number of rows needed and the JSON parsing code would make a table of null values for it. This had some issues with structs and lists which needed an extended way to produce the null scalar. This adds in code to do just that.

Authors:
  - Robert (Bobby) Evans (https://github.com/revans2)
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - Jason Lowe (https://github.com/jlowe)

URL: https://github.com/rapidsai/cudf/pull/15550
---
 java/src/main/java/ai/rapids/cudf/Schema.java | 28 ++++++++++++++++++-
 java/src/main/java/ai/rapids/cudf/Table.java  | 22 +++++++++++++--
 2 files changed, 47 insertions(+), 3 deletions(-)

diff --git a/java/src/main/java/ai/rapids/cudf/Schema.java b/java/src/main/java/ai/rapids/cudf/Schema.java
index c8571dd841c..43603386649 100644
--- a/java/src/main/java/ai/rapids/cudf/Schema.java
+++ b/java/src/main/java/ai/rapids/cudf/Schema.java
@@ -20,6 +20,7 @@
 
 import java.util.ArrayList;
 import java.util.List;
+import java.util.stream.Collectors;
 
 /**
  * The schema of data to be read in.
@@ -221,6 +222,13 @@ public DType[] getChildTypes() {
     return ret;
   }
 
+  public int getNumChildren() {
+    if (childSchemas == null) {
+      return 0;
+    }
+    return childSchemas.size();
+  }
+
   int[] getFlattenedNumChildren() {
     flattenIfNeeded();
     return flattenedCounts;
@@ -243,7 +251,25 @@ public boolean isStructOrHasStructDescendant() {
     return false;
   }
 
-  public static class Builder {
+  public HostColumnVector.DataType asHostDataType() {
+    if (topLevelType == DType.LIST) {
+      assert(childSchemas != null && childSchemas.size() == 1);
+      HostColumnVector.DataType element = childSchemas.get(0).asHostDataType();
+      return new HostColumnVector.ListType(true, element);
+    } else if (topLevelType == DType.STRUCT) {
+      if (childSchemas == null) {
+        return new HostColumnVector.StructType(true);
+      } else {
+        List<HostColumnVector.DataType> childTypes =
+                childSchemas.stream().map(Schema::asHostDataType).collect(Collectors.toList());
+        return new HostColumnVector.StructType(true, childTypes);
+      }
+    } else {
+      return new HostColumnVector.BasicType(true, topLevelType);
+    }
+  }
+
+    public static class Builder {
     private final DType topLevelType;
     private final List<String> names;
     private final List<Builder> types;
diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java
index 4038b3a40b8..4e737451ed6 100644
--- a/java/src/main/java/ai/rapids/cudf/Table.java
+++ b/java/src/main/java/ai/rapids/cudf/Table.java
@@ -1220,8 +1220,26 @@ private static Table gatherJSONColumns(Schema schema, TableWithMeta twm, int emp
               columns[i] = tbl.getColumn(index).incRefCount();
             }
           } else {
-            try (Scalar s = Scalar.fromNull(types[i])) {
-              columns[i] = ColumnVector.fromScalar(s, rowCount);
+            if (types[i] == DType.LIST) {
+              Schema listSchema = schema.getChild(i);
+              Schema elementSchema = listSchema.getChild(0);
+              try (Scalar s = Scalar.listFromNull(elementSchema.asHostDataType())) {
+                columns[i] = ColumnVector.fromScalar(s, rowCount);
+              }
+            } else if (types[i] == DType.STRUCT) {
+              Schema structSchema = schema.getChild(i);
+              int numStructChildren = structSchema.getNumChildren();
+              DataType[] structChildrenTypes = new DataType[numStructChildren];
+              for (int j = 0; j < numStructChildren; j++) {
+                structChildrenTypes[j] = structSchema.getChild(j).asHostDataType();
+              }
+              try (Scalar s = Scalar.structFromNull(structChildrenTypes)) {
+                columns[i] = ColumnVector.fromScalar(s, rowCount);
+              }
+            } else {
+              try (Scalar s = Scalar.fromNull(types[i])) {
+                columns[i] = ColumnVector.fromScalar(s, rowCount);
+              }
             }
           }
         }

From 79cd473f8ec18d1f0abed3faa6dd8d61f54bf384 Mon Sep 17 00:00:00 2001
From: Sebastian Berg <sebastian@sipsolutions.net>
Date: Fri, 26 Apr 2024 20:51:03 +0200
Subject: [PATCH 112/842] Minor fixups for future NumPy 2 compatibility
 (#15590)

These are some small fixes to be compatible with NumPy 2 python changes, as pointed out by the `ruff` with the `"NPY201"` rule-set.

I am not really happy with the `_NUMPY_SCTYPES` (reaching into what is now private, but figured that others will do so also for a while; feels like we should add a better way to do this in NumPy before removing it).
Listing the full set is also a bit ugly/convoluted, but happy to do so instead.

(I was hoping to get a bit further with testing against the NumPy 2rc, but unfortunately the `numba` dependency makes that at least difficult.)

Authors:
  - Sebastian Berg (https://github.com/seberg)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15590
---
 python/cudf/cudf/core/column/string.py     |  2 +-
 python/cudf/cudf/core/frame.py             | 12 ++++----
 python/cudf/cudf/core/index.py             |  6 +++-
 python/cudf/cudf/tests/test_api_types.py   | 33 ----------------------
 python/cudf/cudf/tests/test_categorical.py |  4 +--
 python/cudf/cudf/tests/test_dataframe.py   | 16 +++++------
 python/cudf/cudf/tests/test_parquet.py     |  2 +-
 python/cudf/cudf/tests/test_stats.py       |  4 +--
 python/cudf/cudf/utils/dtypes.py           |  8 ++++--
 9 files changed, 31 insertions(+), 56 deletions(-)

diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 0862995bc46..8143e7919a7 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -692,7 +692,7 @@ def contains(
 
         Returning an Index of booleans using only a literal pattern.
 
-        >>> data = ['Mouse', 'dog', 'house and parrot', '23.0', np.NaN]
+        >>> data = ['Mouse', 'dog', 'house and parrot', '23.0', np.nan]
         >>> idx = cudf.Index(data)
         >>> idx
         Index(['Mouse', 'dog', 'house and parrot', '23.0', None], dtype='object')
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 01842b5f0a9..cd42bf52ea1 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -1077,7 +1077,7 @@ def isna(self):
         >>> import cudf
         >>> import numpy as np
         >>> import pandas as pd
-        >>> df = cudf.DataFrame({'age': [5, 6, np.NaN],
+        >>> df = cudf.DataFrame({'age': [5, 6, np.nan],
         ...                    'born': [pd.NaT, pd.Timestamp('1939-05-27'),
         ...                             pd.Timestamp('1940-04-25')],
         ...                    'name': ['Alfred', 'Batman', ''],
@@ -1095,7 +1095,7 @@ def isna(self):
 
         Show which entries in a Series are NA.
 
-        >>> ser = cudf.Series([5, 6, np.NaN, np.inf, -np.inf])
+        >>> ser = cudf.Series([5, 6, np.nan, np.inf, -np.inf])
         >>> ser
         0     5.0
         1     6.0
@@ -1113,7 +1113,7 @@ def isna(self):
 
         Show which entries in an Index are NA.
 
-        >>> idx = cudf.Index([1, 2, None, np.NaN, 0.32, np.inf])
+        >>> idx = cudf.Index([1, 2, None, np.nan, 0.32, np.inf])
         >>> idx
         Index([1.0, 2.0, <NA>, <NA>, 0.32, Inf], dtype='float64')
         >>> idx.isna()
@@ -1156,7 +1156,7 @@ def notna(self):
         >>> import cudf
         >>> import numpy as np
         >>> import pandas as pd
-        >>> df = cudf.DataFrame({'age': [5, 6, np.NaN],
+        >>> df = cudf.DataFrame({'age': [5, 6, np.nan],
         ...                    'born': [pd.NaT, pd.Timestamp('1939-05-27'),
         ...                             pd.Timestamp('1940-04-25')],
         ...                    'name': ['Alfred', 'Batman', ''],
@@ -1174,7 +1174,7 @@ def notna(self):
 
         Show which entries in a Series are NA.
 
-        >>> ser = cudf.Series([5, 6, np.NaN, np.inf, -np.inf])
+        >>> ser = cudf.Series([5, 6, np.nan, np.inf, -np.inf])
         >>> ser
         0     5.0
         1     6.0
@@ -1192,7 +1192,7 @@ def notna(self):
 
         Show which entries in an Index are NA.
 
-        >>> idx = cudf.Index([1, 2, None, np.NaN, 0.32, np.inf])
+        >>> idx = cudf.Index([1, 2, None, np.nan, 0.32, np.inf])
         >>> idx
         Index([1.0, 2.0, <NA>, <NA>, 0.32, Inf], dtype='float64')
         >>> idx.notna()
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 6c0acdc5fb0..f55fa4c05b5 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -60,6 +60,7 @@
 from cudf.core.single_column_frame import SingleColumnFrame
 from cudf.utils.docutils import copy_docstring
 from cudf.utils.dtypes import (
+    _NUMPY_SCTYPES,
     _maybe_convert_to_default_type,
     find_common_type,
     is_mixed_with_object_dtype,
@@ -344,7 +345,10 @@ def _data(self):
     @_cudf_nvtx_annotate
     def __contains__(self, item):
         if isinstance(item, bool) or not isinstance(
-            item, tuple(np.sctypes["int"] + np.sctypes["float"] + [int, float])
+            item,
+            tuple(
+                _NUMPY_SCTYPES["int"] + _NUMPY_SCTYPES["float"] + [int, float]
+            ),
         ):
             return False
         try:
diff --git a/python/cudf/cudf/tests/test_api_types.py b/python/cudf/cudf/tests/test_api_types.py
index 9436d65e0b7..4abe210c6ea 100644
--- a/python/cudf/cudf/tests/test_api_types.py
+++ b/python/cudf/cudf/tests/test_api_types.py
@@ -33,7 +33,6 @@
         (np.float64, False),
         (np.complex128, False),
         (np.str_, False),
-        (np.unicode_, False),
         (np.datetime64, False),
         (np.timedelta64, False),
         # NumPy scalars.
@@ -42,7 +41,6 @@
         (np.float64(), False),
         (np.complex128(), False),
         (np.str_(), False),
-        (np.unicode_(), False),
         (np.datetime64(), False),
         (np.timedelta64(), False),
         # NumPy dtype objects.
@@ -61,7 +59,6 @@
         (np.array([], dtype=np.float64), False),
         (np.array([], dtype=np.complex128), False),
         (np.array([], dtype=np.str_), False),
-        (np.array([], dtype=np.unicode_), False),
         (np.array([], dtype=np.datetime64), False),
         (np.array([], dtype=np.timedelta64), False),
         (np.array([], dtype=object), False),
@@ -142,7 +139,6 @@ def test_is_categorical_dtype(obj, expect):
         (np.float64, True),
         (np.complex128, True),
         (np.str_, False),
-        (np.unicode_, False),
         (np.datetime64, False),
         (np.timedelta64, False),
         # NumPy scalars.
@@ -151,7 +147,6 @@ def test_is_categorical_dtype(obj, expect):
         (np.float64(), True),
         (np.complex128(), True),
         (np.str_(), False),
-        (np.unicode_(), False),
         (np.datetime64(), False),
         (np.timedelta64(), False),
         # NumPy dtype objects.
@@ -170,7 +165,6 @@ def test_is_categorical_dtype(obj, expect):
         (np.array([], dtype=np.float64), True),
         (np.array([], dtype=np.complex128), True),
         (np.array([], dtype=np.str_), False),
-        (np.array([], dtype=np.unicode_), False),
         (np.array([], dtype=np.datetime64), False),
         (np.array([], dtype=np.timedelta64), False),
         (np.array([], dtype=object), False),
@@ -247,7 +241,6 @@ def test_is_numeric_dtype(obj, expect):
         (np.float64, False),
         (np.complex128, False),
         (np.str_, False),
-        (np.unicode_, False),
         (np.datetime64, False),
         (np.timedelta64, False),
         # NumPy scalars.
@@ -256,7 +249,6 @@ def test_is_numeric_dtype(obj, expect):
         (np.float64(), False),
         (np.complex128(), False),
         (np.str_(), False),
-        (np.unicode_(), False),
         (np.datetime64(), False),
         (np.timedelta64(), False),
         # NumPy dtype objects.
@@ -275,7 +267,6 @@ def test_is_numeric_dtype(obj, expect):
         (np.array([], dtype=np.float64), False),
         (np.array([], dtype=np.complex128), False),
         (np.array([], dtype=np.str_), False),
-        (np.array([], dtype=np.unicode_), False),
         (np.array([], dtype=np.datetime64), False),
         (np.array([], dtype=np.timedelta64), False),
         (np.array([], dtype=object), False),
@@ -352,7 +343,6 @@ def test_is_integer_dtype(obj, expect):
         (np.float64, False),
         (np.complex128, False),
         (np.str_, False),
-        (np.unicode_, False),
         (np.datetime64, False),
         (np.timedelta64, False),
         # NumPy scalars.
@@ -361,7 +351,6 @@ def test_is_integer_dtype(obj, expect):
         (np.float64(), False),
         (np.complex128(), False),
         (np.str_(), False),
-        (np.unicode_(), False),
         (np.datetime64(), False),
         (np.timedelta64(), False),
         # NumPy dtype objects.
@@ -380,7 +369,6 @@ def test_is_integer_dtype(obj, expect):
         (np.array([], dtype=np.float64), False),
         (np.array([], dtype=np.complex128), False),
         (np.array([], dtype=np.str_), False),
-        (np.array([], dtype=np.unicode_), False),
         (np.array([], dtype=np.datetime64), False),
         (np.array([], dtype=np.timedelta64), False),
         (np.array([], dtype=object), False),
@@ -458,7 +446,6 @@ def test_is_integer(obj, expect):
         (np.float64, False),
         (np.complex128, False),
         (np.str_, True),
-        (np.unicode_, True),
         (np.datetime64, False),
         (np.timedelta64, False),
         # NumPy scalars.
@@ -467,7 +454,6 @@ def test_is_integer(obj, expect):
         (np.float64(), False),
         (np.complex128(), False),
         (np.str_(), True),
-        (np.unicode_(), True),
         (np.datetime64(), False),
         (np.timedelta64(), False),
         # NumPy dtype objects.
@@ -486,7 +472,6 @@ def test_is_integer(obj, expect):
         (np.array([], dtype=np.float64), False),
         (np.array([], dtype=np.complex128), False),
         (np.array([], dtype=np.str_), True),
-        (np.array([], dtype=np.unicode_), True),
         (np.array([], dtype=np.datetime64), False),
         (np.array([], dtype=np.timedelta64), False),
         # (np.array([], dtype=object), False),
@@ -577,7 +562,6 @@ def test_is_string_dtype(obj, expect):
         (np.float64, False),
         (np.complex128, False),
         (np.str_, False),
-        (np.unicode_, False),
         (np.datetime64, True),
         (np.timedelta64, False),
         # NumPy scalars.
@@ -586,7 +570,6 @@ def test_is_string_dtype(obj, expect):
         (np.float64(), False),
         (np.complex128(), False),
         (np.str_(), False),
-        (np.unicode_(), False),
         (np.datetime64(), True),
         (np.timedelta64(), False),
         # NumPy dtype objects.
@@ -605,7 +588,6 @@ def test_is_string_dtype(obj, expect):
         (np.array([], dtype=np.float64), False),
         (np.array([], dtype=np.complex128), False),
         (np.array([], dtype=np.str_), False),
-        (np.array([], dtype=np.unicode_), False),
         (np.array([], dtype=np.datetime64), True),
         (np.array([], dtype=np.timedelta64), False),
         (np.array([], dtype=object), False),
@@ -682,7 +664,6 @@ def test_is_datetime_dtype(obj, expect):
         (np.float64, False),
         (np.complex128, False),
         (np.str_, False),
-        (np.unicode_, False),
         (np.datetime64, False),
         (np.timedelta64, False),
         # NumPy scalars.
@@ -691,7 +672,6 @@ def test_is_datetime_dtype(obj, expect):
         (np.float64(), False),
         (np.complex128(), False),
         (np.str_(), False),
-        (np.unicode_(), False),
         (np.datetime64(), False),
         (np.timedelta64(), False),
         # NumPy dtype objects.
@@ -710,7 +690,6 @@ def test_is_datetime_dtype(obj, expect):
         (np.array([], dtype=np.float64), False),
         (np.array([], dtype=np.complex128), False),
         (np.array([], dtype=np.str_), False),
-        (np.array([], dtype=np.unicode_), False),
         (np.array([], dtype=np.datetime64), False),
         (np.array([], dtype=np.timedelta64), False),
         (np.array([], dtype=object), False),
@@ -787,7 +766,6 @@ def test_is_list_dtype(obj, expect):
         (np.float64, False),
         (np.complex128, False),
         (np.str_, False),
-        (np.unicode_, False),
         (np.datetime64, False),
         (np.timedelta64, False),
         # NumPy scalars.
@@ -796,7 +774,6 @@ def test_is_list_dtype(obj, expect):
         (np.float64(), False),
         (np.complex128(), False),
         (np.str_(), False),
-        (np.unicode_(), False),
         (np.datetime64(), False),
         (np.timedelta64(), False),
         # NumPy dtype objects.
@@ -815,7 +792,6 @@ def test_is_list_dtype(obj, expect):
         (np.array([], dtype=np.float64), False),
         (np.array([], dtype=np.complex128), False),
         (np.array([], dtype=np.str_), False),
-        (np.array([], dtype=np.unicode_), False),
         (np.array([], dtype=np.datetime64), False),
         (np.array([], dtype=np.timedelta64), False),
         (np.array([], dtype=object), False),
@@ -895,7 +871,6 @@ def test_is_struct_dtype(obj, expect):
         (np.float64, False),
         (np.complex128, False),
         (np.str_, False),
-        (np.unicode_, False),
         (np.datetime64, False),
         (np.timedelta64, False),
         # NumPy scalars.
@@ -904,7 +879,6 @@ def test_is_struct_dtype(obj, expect):
         (np.float64(), False),
         (np.complex128(), False),
         (np.str_(), False),
-        (np.unicode_(), False),
         (np.datetime64(), False),
         (np.timedelta64(), False),
         # NumPy dtype objects.
@@ -923,7 +897,6 @@ def test_is_struct_dtype(obj, expect):
         (np.array([], dtype=np.float64), False),
         (np.array([], dtype=np.complex128), False),
         (np.array([], dtype=np.str_), False),
-        (np.array([], dtype=np.unicode_), False),
         (np.array([], dtype=np.datetime64), False),
         (np.array([], dtype=np.timedelta64), False),
         (np.array([], dtype=object), False),
@@ -1004,7 +977,6 @@ def test_is_decimal_dtype(obj, expect):
         np.float64,
         np.complex128,
         np.str_,
-        np.unicode_,
         np.datetime64,
         np.timedelta64,
         # NumPy scalars.
@@ -1013,7 +985,6 @@ def test_is_decimal_dtype(obj, expect):
         np.float64(),
         np.complex128(),
         np.str_(),
-        np.unicode_(),
         np.datetime64(),
         np.timedelta64(),
         # NumPy dtype objects.
@@ -1032,7 +1003,6 @@ def test_is_decimal_dtype(obj, expect):
         np.array([], dtype=np.float64),
         np.array([], dtype=np.complex128),
         np.array([], dtype=np.str_),
-        np.array([], dtype=np.unicode_),
         np.array([], dtype=np.datetime64),
         np.array([], dtype=np.timedelta64),
         np.array([], dtype=object),
@@ -1088,7 +1058,6 @@ def test_pandas_agreement(obj):
         np.float64,
         np.complex128,
         np.str_,
-        np.unicode_,
         np.datetime64,
         np.timedelta64,
         # NumPy scalars.
@@ -1097,7 +1066,6 @@ def test_pandas_agreement(obj):
         np.float64(),
         np.complex128(),
         np.str_(),
-        np.unicode_(),
         np.datetime64(),
         np.timedelta64(),
         # NumPy dtype objects.
@@ -1116,7 +1084,6 @@ def test_pandas_agreement(obj):
         np.array([], dtype=np.float64),
         np.array([], dtype=np.complex128),
         np.array([], dtype=np.str_),
-        np.array([], dtype=np.unicode_),
         np.array([], dtype=np.datetime64),
         np.array([], dtype=np.timedelta64),
         np.array([], dtype=object),
diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py
index e21fd53bee4..7aba2e45532 100644
--- a/python/cudf/cudf/tests/test_categorical.py
+++ b/python/cudf/cudf/tests/test_categorical.py
@@ -460,7 +460,7 @@ def test_categorical_dataframe_slice_copy():
         pd.Series(["1.0", "2.5", "3.001", None, "9"], dtype="category"),
         pd.Series(["a", "b", "c", "c", "b", "a", "b", "b"]),
         pd.Series(["aa", "b", "c", "c", "bb", "bb", "a", "b", "b"]),
-        pd.Series([1, 2, 3, 89, None, np.nan, np.NaN], dtype="float64"),
+        pd.Series([1, 2, 3, 89, None, np.nan, np.nan], dtype="float64"),
         pd.Series([1, 2, 3, 89], dtype="float64"),
         pd.Series([1, 2.5, 3.001, 89], dtype="float64"),
         pd.Series([None, None, None]),
@@ -493,7 +493,7 @@ def test_categorical_typecast(data, cat_type):
         pd.Series([1, 2, 3, 89]),
         pd.Series(["a", "b", "c", "c", "b", "a", "b", "b"]),
         pd.Series(["aa", "b", "c", "c", "bb", "bb", "a", "b", "b"]),
-        pd.Series([1, 2, 3, 89, None, np.nan, np.NaN], dtype="float64"),
+        pd.Series([1, 2, 3, 89, None, np.nan, np.nan], dtype="float64"),
         pd.Series([1, 2, 3, 89], dtype="float64"),
         pd.Series([1, 2.5, 3.001, 89], dtype="float64"),
         pd.Series([None, None, None]),
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 59e8b41e51a..e287603de07 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -5199,20 +5199,20 @@ def test_df_constructor_dtype(dtype):
         cudf.DataFrame(
             {
                 "a": [1, 2, 3, 4],
-                "b": [7, np.NaN, 9, 10],
+                "b": [7, np.nan, 9, 10],
                 "c": cudf.Series(
-                    [np.NaN, np.NaN, np.NaN, np.NaN], nan_as_null=False
+                    [np.nan, np.nan, np.nan, np.nan], nan_as_null=False
                 ),
                 "d": cudf.Series([None, None, None, None], dtype="int64"),
                 "e": [100, None, 200, None],
-                "f": cudf.Series([10, None, np.NaN, 11], nan_as_null=False),
+                "f": cudf.Series([10, None, np.nan, 11], nan_as_null=False),
             }
         ),
         cudf.DataFrame(
             {
                 "a": [10, 11, 12, 13, 14, 15],
                 "b": cudf.Series(
-                    [10, None, np.NaN, 2234, None, np.NaN], nan_as_null=False
+                    [10, None, np.nan, 2234, None, np.nan], nan_as_null=False
                 ),
             }
         ),
@@ -5264,11 +5264,11 @@ def test_rowwise_ops_nullable_dtypes_all_null(op):
     gdf = cudf.DataFrame(
         {
             "a": [1, 2, 3, 4],
-            "b": [7, np.NaN, 9, 10],
-            "c": cudf.Series([np.NaN, np.NaN, np.NaN, np.NaN], dtype=float),
+            "b": [7, np.nan, 9, 10],
+            "c": cudf.Series([np.nan, np.nan, np.nan, np.nan], dtype=float),
             "d": cudf.Series([None, None, None, None], dtype="int64"),
             "e": [100, None, 200, None],
-            "f": cudf.Series([10, None, np.NaN, 11], nan_as_null=False),
+            "f": cudf.Series([10, None, np.nan, 11], nan_as_null=False),
         }
     )
 
@@ -5300,7 +5300,7 @@ def test_rowwise_ops_nullable_dtypes_partial_null(op):
         {
             "a": [10, 11, 12, 13, 14, 15],
             "b": cudf.Series(
-                [10, None, np.NaN, 2234, None, np.NaN],
+                [10, None, np.nan, 2234, None, np.nan],
                 nan_as_null=False,
             ),
         }
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index 56a4281aad9..6fb1d3d8ba5 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -211,7 +211,7 @@ def make_pdf(nrows, ncolumns=1, nvalids=0, dtype=np.int64):
         # Randomly but reproducibly mark subset of rows as invalid
         random.seed(1337)
         mask = random.sample(range(nrows), nvalids)
-        test_pdf[test_pdf.index.isin(mask)] = np.NaN
+        test_pdf[test_pdf.index.isin(mask)] = np.nan
     if dtype:
         test_pdf = test_pdf.astype(dtype)
 
diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py
index b9eb42906e8..27811d0fcde 100644
--- a/python/cudf/cudf/tests/test_stats.py
+++ b/python/cudf/cudf/tests/test_stats.py
@@ -507,7 +507,7 @@ def test_df_corr(method):
 @pytest.mark.parametrize(
     "data",
     [
-        [0.0, 1, 3, 6, np.NaN, 7, 5.0, np.nan, 5, 2, 3, -100],
+        [0.0, 1, 3, 6, np.nan, 7, 5.0, np.nan, 5, 2, 3, -100],
         [np.nan] * 3,
         [1, 5, 3],
         [],
@@ -555,7 +555,7 @@ def test_nans_stats(data, ops, skipna):
 @pytest.mark.parametrize(
     "data",
     [
-        [0.0, 1, 3, 6, np.NaN, 7, 5.0, np.nan, 5, 2, 3, -100],
+        [0.0, 1, 3, 6, np.nan, 7, 5.0, np.nan, 5, 2, 3, -100],
         [np.nan] * 3,
         [1, 5, 3],
     ],
diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py
index a33b5ca139c..2aa3129ab30 100644
--- a/python/cudf/cudf/utils/dtypes.py
+++ b/python/cudf/cudf/utils/dtypes.py
@@ -91,6 +91,10 @@
 BOOL_TYPES = {"bool"}
 ALL_TYPES = NUMERIC_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES | OTHER_TYPES
 
+# The NumPy scalar types are a bit of a mess as they align with the C types
+# so for now we use the `sctypes` dict (although it was made private in 2.0)
+_NUMPY_SCTYPES = np.sctypes if hasattr(np, "sctypes") else np._core.sctypes
+
 
 def np_to_pa_dtype(dtype):
     """Util to convert numpy dtype to PyArrow dtype."""
@@ -335,7 +339,7 @@ def min_signed_type(x, min_size=8):
     Return the smallest *signed* integer dtype
     that can represent the integer ``x``
     """
-    for int_dtype in np.sctypes["int"]:
+    for int_dtype in _NUMPY_SCTYPES["int"]:
         if (cudf.dtype(int_dtype).itemsize * 8) >= min_size:
             if np.iinfo(int_dtype).min <= x <= np.iinfo(int_dtype).max:
                 return int_dtype
@@ -348,7 +352,7 @@ def min_unsigned_type(x, min_size=8):
     Return the smallest *unsigned* integer dtype
     that can represent the integer ``x``
     """
-    for int_dtype in np.sctypes["uint"]:
+    for int_dtype in _NUMPY_SCTYPES["uint"]:
         if (cudf.dtype(int_dtype).itemsize * 8) >= min_size:
             if 0 <= x <= np.iinfo(int_dtype).max:
                 return int_dtype

From d91a4add4c56d35f0ed2fb7f12c87bc3c26f28d9 Mon Sep 17 00:00:00 2001
From: Ed Seidl <etseidl@users.noreply.github.com>
Date: Fri, 26 Apr 2024 16:15:36 -0700
Subject: [PATCH 113/842] Add Parquet encoding statistics to column chunk
 metadata (#15452)

Closes #15313

Authors:
  - Ed Seidl (https://github.com/etseidl)
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Muhammad Haseeb (https://github.com/mhaseeb123)

URL: https://github.com/rapidsai/cudf/pull/15452
---
 .../io/parquet/compact_protocol_reader.cpp    | 13 +++++
 .../io/parquet/compact_protocol_reader.hpp    |  1 +
 .../io/parquet/compact_protocol_writer.cpp    | 10 ++++
 .../io/parquet/compact_protocol_writer.hpp    |  1 +
 cpp/src/io/parquet/parquet.hpp                | 46 +++++++++++++----
 cpp/src/io/parquet/writer_impl.cu             | 50 +++++++++++++++++++
 cpp/tests/io/parquet_writer_test.cpp          | 13 ++++-
 7 files changed, 122 insertions(+), 12 deletions(-)

diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp
index 04a22b41247..a3b58347e20 100644
--- a/cpp/src/io/parquet/compact_protocol_reader.cpp
+++ b/cpp/src/io/parquet/compact_protocol_reader.cpp
@@ -17,6 +17,7 @@
 #include "compact_protocol_reader.hpp"
 
 #include "parquet.hpp"
+#include "parquet_common.hpp"
 
 #include <cudf/utilities/error.hpp>
 
@@ -652,6 +653,9 @@ void CompactProtocolReader::read(ColumnChunkMetaData* c)
 {
   using optional_size_statistics =
     parquet_field_optional<SizeStatistics, parquet_field_struct<SizeStatistics>>;
+  using optional_list_enc_stats =
+    parquet_field_optional<std::vector<PageEncodingStats>,
+                           parquet_field_struct_list<PageEncodingStats>>;
   auto op = std::make_tuple(parquet_field_enum<Type>(1, c->type),
                             parquet_field_enum_list(2, c->encodings),
                             parquet_field_string_list(3, c->path_in_schema),
@@ -663,6 +667,7 @@ void CompactProtocolReader::read(ColumnChunkMetaData* c)
                             parquet_field_int64(10, c->index_page_offset),
                             parquet_field_int64(11, c->dictionary_page_offset),
                             parquet_field_struct(12, c->statistics),
+                            optional_list_enc_stats(13, c->encoding_stats),
                             optional_size_statistics(16, c->size_statistics));
   function_builder(this, op);
 }
@@ -774,6 +779,14 @@ void CompactProtocolReader::read(ColumnOrder* c)
   function_builder(this, op);
 }
 
+void CompactProtocolReader::read(PageEncodingStats* s)
+{
+  auto op = std::make_tuple(parquet_field_enum<PageType>(1, s->page_type),
+                            parquet_field_enum<Encoding>(2, s->encoding),
+                            parquet_field_int32(3, s->count));
+  function_builder(this, op);
+}
+
 void CompactProtocolReader::read(SortingColumn* s)
 {
   auto op = std::make_tuple(parquet_field_int32(1, s->column_idx),
diff --git a/cpp/src/io/parquet/compact_protocol_reader.hpp b/cpp/src/io/parquet/compact_protocol_reader.hpp
index 2ad336a3052..bcc9adfc8c0 100644
--- a/cpp/src/io/parquet/compact_protocol_reader.hpp
+++ b/cpp/src/io/parquet/compact_protocol_reader.hpp
@@ -120,6 +120,7 @@ class CompactProtocolReader {
   void read(ColumnIndex* c);
   void read(Statistics* s);
   void read(ColumnOrder* c);
+  void read(PageEncodingStats* s);
   void read(SortingColumn* s);
 
  public:
diff --git a/cpp/src/io/parquet/compact_protocol_writer.cpp b/cpp/src/io/parquet/compact_protocol_writer.cpp
index 1262ca1926d..2174fe46663 100644
--- a/cpp/src/io/parquet/compact_protocol_writer.cpp
+++ b/cpp/src/io/parquet/compact_protocol_writer.cpp
@@ -188,6 +188,7 @@ size_t CompactProtocolWriter::write(ColumnChunkMetaData const& s)
   if (s.index_page_offset != 0) { c.field_int(10, s.index_page_offset); }
   if (s.dictionary_page_offset != 0) { c.field_int(11, s.dictionary_page_offset); }
   c.field_struct(12, s.statistics);
+  if (s.encoding_stats.has_value()) { c.field_struct_list(13, s.encoding_stats.value()); }
   if (s.size_statistics.has_value()) { c.field_struct(16, s.size_statistics.value()); }
   return c.value();
 }
@@ -248,6 +249,15 @@ size_t CompactProtocolWriter::write(ColumnOrder const& co)
   return c.value();
 }
 
+size_t CompactProtocolWriter::write(PageEncodingStats const& enc)
+{
+  CompactProtocolFieldWriter c(*this);
+  c.field_int(1, static_cast<int32_t>(enc.page_type));
+  c.field_int(2, static_cast<int32_t>(enc.encoding));
+  c.field_int(3, enc.count);
+  return c.value();
+}
+
 size_t CompactProtocolWriter::write(SortingColumn const& sc)
 {
   CompactProtocolFieldWriter c(*this);
diff --git a/cpp/src/io/parquet/compact_protocol_writer.hpp b/cpp/src/io/parquet/compact_protocol_writer.hpp
index 2e39abadd24..c2e6178acbf 100644
--- a/cpp/src/io/parquet/compact_protocol_writer.hpp
+++ b/cpp/src/io/parquet/compact_protocol_writer.hpp
@@ -53,6 +53,7 @@ class CompactProtocolWriter {
   size_t write(OffsetIndex const&);
   size_t write(SizeStatistics const&);
   size_t write(ColumnOrder const&);
+  size_t write(PageEncodingStats const&);
   size_t write(SortingColumn const&);
 
  protected:
diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp
index 7f00d63b9c2..fe9b6ead6d4 100644
--- a/cpp/src/io/parquet/parquet.hpp
+++ b/cpp/src/io/parquet/parquet.hpp
@@ -322,6 +322,15 @@ struct ColumnIndex {
   thrust::optional<std::vector<int64_t>> definition_level_histogram;
 };
 
+/**
+ * @brief Thrift-derived struct describing page encoding statistics
+ */
+struct PageEncodingStats {
+  PageType page_type;  // The page type (data/dic/...)
+  Encoding encoding;   // Encoding of the page
+  int32_t count;       // Number of pages of this type with this encoding
+};
+
 /**
  * @brief Thrift-derived struct describing column sort order
  */
@@ -335,21 +344,36 @@ struct SortingColumn {
  * @brief Thrift-derived struct describing a column chunk
  */
 struct ColumnChunkMetaData {
+  // Type of this column
   Type type = BOOLEAN;
+  // Set of all encodings used for this column. The purpose is to validate
+  // whether we can decode those pages.
   std::vector<Encoding> encodings;
+  // Path in schema
   std::vector<std::string> path_in_schema;
-  Compression codec  = UNCOMPRESSED;
+  // Compression codec
+  Compression codec = UNCOMPRESSED;
+  // Number of values in this column
   int64_t num_values = 0;
-  int64_t total_uncompressed_size =
-    0;  // total byte size of all uncompressed pages in this column chunk (including the headers)
-  int64_t total_compressed_size =
-    0;  // total byte size of all compressed pages in this column chunk (including the headers)
-  int64_t data_page_offset  = 0;  // Byte offset from beginning of file to first data page
-  int64_t index_page_offset = 0;  // Byte offset from beginning of file to root index page
-  int64_t dictionary_page_offset =
-    0;                    // Byte offset from the beginning of file to first (only) dictionary page
-  Statistics statistics;  // Encoded chunk-level statistics
-  thrust::optional<SizeStatistics> size_statistics;  // Size statistics for the chunk
+  // Total byte size of all uncompressed pages in this column chunk (including the headers)
+  int64_t total_uncompressed_size = 0;
+  // Total byte size of all compressed pages in this column chunk (including the headers)
+  int64_t total_compressed_size = 0;
+  // Byte offset from beginning of file to first data page
+  int64_t data_page_offset = 0;
+  // Byte offset from beginning of file to root index page
+  int64_t index_page_offset = 0;
+  // Byte offset from the beginning of file to first (only) dictionary page
+  int64_t dictionary_page_offset = 0;
+  // Optional statistics for this column chunk
+  Statistics statistics;
+  // Set of all encodings used for pages in this column chunk. This information can be used to
+  // determine if all data pages are dictionary encoded for example.
+  thrust::optional<std::vector<PageEncodingStats>> encoding_stats;
+  // Optional statistics to help estimate total memory when converted to in-memory representations.
+  // The histograms contained in these statistics can also be useful in some cases for more
+  // fine-grained nullability/list length filter pushdown.
+  thrust::optional<SizeStatistics> size_statistics;
 };
 
 /**
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index 5509a33f9f0..286c7b361a9 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -22,6 +22,8 @@
 #include "compact_protocol_reader.hpp"
 #include "compact_protocol_writer.hpp"
 #include "io/comp/nvcomp_adapter.hpp"
+#include "io/parquet/parquet.hpp"
+#include "io/parquet/parquet_gpu.hpp"
 #include "io/statistics/column_statistics.cuh"
 #include "io/utilities/column_utils.cuh"
 #include "io/utilities/config_utils.hpp"
@@ -214,6 +216,53 @@ void update_chunk_encodings(std::vector<Encoding>& encodings, uint32_t enc_mask)
   }
 }
 
+/**
+ * @brief Update the encoding_stats field in the column chunk metadata.
+ *
+ * @param chunk_meta The `ColumnChunkMetaData` struct for the column chunk
+ * @param ck The column chunk to summarize stats for
+ * @param is_v2 True if V2 page headers are used
+ */
+void update_chunk_encoding_stats(ColumnChunkMetaData& chunk_meta,
+                                 EncColumnChunk const& ck,
+                                 bool is_v2)
+{
+  // don't set encoding stats if there are no pages
+  if (ck.num_pages == 0) { return; }
+
+  // NOTE: since cudf doesn't use mixed encodings for a chunk, we really only need to account
+  // for the dictionary page (if there is one), and the encoding used for the data pages. We can
+  // examine the chunk's encodings field to figure out the encodings without having to examine
+  // the page data.
+  auto const num_data_pages = static_cast<int32_t>(ck.num_data_pages());
+  auto const data_page_type = is_v2 ? PageType::DATA_PAGE_V2 : PageType::DATA_PAGE;
+
+  std::vector<PageEncodingStats> result;
+  if (ck.use_dictionary) {
+    // For dictionary encoding, if V1 then both data and dictionary use PLAIN_DICTIONARY. For V2
+    // the dictionary uses PLAIN and the data RLE_DICTIONARY.
+    auto const dict_enc = is_v2 ? Encoding::PLAIN : Encoding::PLAIN_DICTIONARY;
+    auto const data_enc = is_v2 ? Encoding::RLE_DICTIONARY : Encoding::PLAIN_DICTIONARY;
+    result.push_back({PageType::DICTIONARY_PAGE, dict_enc, 1});
+    if (num_data_pages > 0) { result.push_back({data_page_type, data_enc, num_data_pages}); }
+  } else {
+    // No dictionary page, the pages are encoded with something other than RLE (unless it's a
+    // boolean column).
+    for (auto const enc : chunk_meta.encodings) {
+      if (enc != Encoding::RLE) {
+        result.push_back({data_page_type, enc, num_data_pages});
+        break;
+      }
+    }
+    // if result is empty and we're using V2 headers, then assume the data is RLE as well
+    if (result.empty() and is_v2 and (ck.encodings & encoding_to_mask(Encoding::RLE)) != 0) {
+      result.push_back({data_page_type, Encoding::RLE, num_data_pages});
+    }
+  }
+
+  if (not result.empty()) { chunk_meta.encoding_stats = std::move(result); }
+}
+
 /**
  * @brief Compute size (in bytes) of the data stored in the given column.
  *
@@ -2144,6 +2193,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
         max_write_size = std::max(max_write_size, ck.compressed_size);
 
         update_chunk_encodings(column_chunk_meta.encodings, ck.encodings);
+        update_chunk_encoding_stats(column_chunk_meta, ck, write_v2_headers);
 
         if (ck.ck_stat_size != 0) {
           std::vector<uint8_t> const stats_blob = cudf::detail::make_std_vector_sync(
diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp
index a16b3d63177..e88afd73290 100644
--- a/cpp/tests/io/parquet_writer_test.cpp
+++ b/cpp/tests/io/parquet_writer_test.cpp
@@ -1674,7 +1674,18 @@ TEST_F(ParquetWriterTest, UserRequestedEncodings)
   // no nulls and no repetition, so the only encoding used should be for the data.
   // since we're writing v1, both dict and data pages should use PLAIN_DICTIONARY.
   auto const expect_enc = [&fmd](int idx, cudf::io::parquet::detail::Encoding enc) {
-    EXPECT_EQ(fmd.row_groups[0].columns[idx].meta_data.encodings[0], enc);
+    auto const& col_meta = fmd.row_groups[0].columns[idx].meta_data;
+    EXPECT_EQ(col_meta.encodings[0], enc);
+
+    // also check encoding stats are written properly
+    ASSERT_TRUE(col_meta.encoding_stats.has_value());
+    auto const& enc_stats = col_meta.encoding_stats.value();
+    for (auto const& ec : enc_stats) {
+      if (ec.page_type == cudf::io::parquet::detail::PageType::DATA_PAGE) {
+        EXPECT_EQ(ec.encoding, enc);
+        EXPECT_EQ(ec.count, 1);
+      }
+    }
   };
 
   // requested plain

From 064dd7b02166cc67e882b708d66621bc3fafd70b Mon Sep 17 00:00:00 2001
From: Ed Seidl <etseidl@users.noreply.github.com>
Date: Fri, 26 Apr 2024 16:20:32 -0700
Subject: [PATCH 114/842] Add fields to Parquet Statistics structure that were
 added in parquet-format 2.10 (#15412)

[PARQUET-2352](https://github.com/apache/parquet-format/pull/216) added fields to the `Statistics` struct to indicate whether the min and max values were exact or had been truncated. This was somewhat ambiguous in the past. One reason to want to know this is to allow avoiding the decoding of pages (or column chunks) that contain a single value (if the min and max are the same value, and are known to be exact values, and there are no nulls, then the only valid value for the page will be that value). This PR adds these new fields, which will always be `true` in cuDF since cuDF does not support truncating min and max values in the statistics (but does support truncation in the page indexes).

Authors:
  - Ed Seidl (https://github.com/etseidl)
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - Paul Mattione (https://github.com/pmattione-nvidia)
  - Karthikeyan (https://github.com/karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/15412
---
 cpp/src/io/parquet/compact_protocol_reader.cpp | 5 ++++-
 cpp/src/io/parquet/compact_protocol_writer.cpp | 2 ++
 cpp/src/io/parquet/page_enc.cu                 | 3 +++
 cpp/src/io/parquet/parquet.hpp                 | 4 ++++
 cpp/tests/io/parquet_writer_test.cpp           | 6 ++++++
 5 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp
index a3b58347e20..c9212334a96 100644
--- a/cpp/src/io/parquet/compact_protocol_reader.cpp
+++ b/cpp/src/io/parquet/compact_protocol_reader.cpp
@@ -763,13 +763,16 @@ void CompactProtocolReader::read(Statistics* s)
 {
   using optional_binary = parquet_field_optional<std::vector<uint8_t>, parquet_field_binary>;
   using optional_int64  = parquet_field_optional<int64_t, parquet_field_int64>;
+  using optional_bool   = parquet_field_optional<bool, parquet_field_bool>;
 
   auto op = std::make_tuple(optional_binary(1, s->max),
                             optional_binary(2, s->min),
                             optional_int64(3, s->null_count),
                             optional_int64(4, s->distinct_count),
                             optional_binary(5, s->max_value),
-                            optional_binary(6, s->min_value));
+                            optional_binary(6, s->min_value),
+                            optional_bool(7, s->is_max_value_exact),
+                            optional_bool(8, s->is_min_value_exact));
   function_builder(this, op);
 }
 
diff --git a/cpp/src/io/parquet/compact_protocol_writer.cpp b/cpp/src/io/parquet/compact_protocol_writer.cpp
index 2174fe46663..14c99f728de 100644
--- a/cpp/src/io/parquet/compact_protocol_writer.cpp
+++ b/cpp/src/io/parquet/compact_protocol_writer.cpp
@@ -202,6 +202,8 @@ size_t CompactProtocolWriter::write(Statistics const& s)
   if (s.distinct_count.has_value()) { c.field_int(4, s.distinct_count.value()); }
   if (s.max_value.has_value()) { c.field_binary(5, s.max_value.value()); }
   if (s.min_value.has_value()) { c.field_binary(6, s.min_value.value()); }
+  if (s.is_max_value_exact.has_value()) { c.field_bool(7, s.is_max_value_exact.value()); }
+  if (s.is_min_value_exact.has_value()) { c.field_bool(8, s.is_min_value_exact.value()); }
   return c.value();
 }
 
diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu
index 227f13db60e..11b18579c58 100644
--- a/cpp/src/io/parquet/page_enc.cu
+++ b/cpp/src/io/parquet/page_enc.cu
@@ -2944,6 +2944,9 @@ __device__ uint8_t* EncodeStatistics(uint8_t* start,
     auto const [min_ptr, min_size] =
       get_extremum(&s->min_value, dtype, scratch, true, NO_TRUNC_STATS);
     encoder.field_binary(6, min_ptr, min_size);
+    // cudf min/max statistics are always exact (i.e. not truncated)
+    encoder.field_bool(7, true);
+    encoder.field_bool(8, true);
   }
   encoder.end(&end);
   return end;
diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp
index fe9b6ead6d4..756726945cf 100644
--- a/cpp/src/io/parquet/parquet.hpp
+++ b/cpp/src/io/parquet/parquet.hpp
@@ -259,6 +259,10 @@ struct Statistics {
   thrust::optional<std::vector<uint8_t>> max_value;
   // min value for column determined by ColumnOrder
   thrust::optional<std::vector<uint8_t>> min_value;
+  // If true, max_value is the actual maximum value for a column
+  thrust::optional<bool> is_max_value_exact;
+  // If true, min_value is the actual minimum value for a column
+  thrust::optional<bool> is_min_value_exact;
 };
 
 /**
diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp
index e88afd73290..3a8763ed9f3 100644
--- a/cpp/tests/io/parquet_writer_test.cpp
+++ b/cpp/tests/io/parquet_writer_test.cpp
@@ -903,6 +903,12 @@ TEST_F(ParquetWriterTest, CheckColumnIndexTruncation)
       ASSERT_TRUE(stats.min_value.has_value());
       ASSERT_TRUE(stats.max_value.has_value());
 
+      // check that min and max for the column chunk are exact (i.e. not truncated)
+      ASSERT_TRUE(stats.is_max_value_exact.has_value());
+      EXPECT_TRUE(stats.is_max_value_exact.value());
+      ASSERT_TRUE(stats.is_min_value_exact.has_value());
+      EXPECT_TRUE(stats.is_min_value_exact.value());
+
       // check trunc(page.min) <= stats.min && trun(page.max) >= stats.max
       auto const ptype = fmd.schema[c + 1].type;
       auto const ctype = fmd.schema[c + 1].converted_type;

From ab5e3f3bc8924f3393ec839830865b57a4d309a3 Mon Sep 17 00:00:00 2001
From: Mark Harris <783069+harrism@users.noreply.github.com>
Date: Tue, 30 Apr 2024 09:10:06 +1000
Subject: [PATCH 115/842] Update developer guide with device_async_resource_ref
 guidelines (#15562)

Closes #15561

Updates guidance in libcudf DEVELOPER_GUIDE.md to cover resource refs and change examples to not use `device_memory_resource` pointers.

Authors:
  - Mark Harris (https://github.com/harrism)
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Paul Mattione (https://github.com/pmattione-nvidia)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/15562
---
 .../developer_guide/DEVELOPER_GUIDE.md        | 46 ++++++++++++++-----
 1 file changed, 34 insertions(+), 12 deletions(-)

diff --git a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md
index ce9840050a9..23b129fdf4b 100644
--- a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md
+++ b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md
@@ -84,7 +84,7 @@ prefixed with an underscore.
 
 ```c++
 template <typename IteratorType>
-void algorithm_function(int x, rmm::cuda_stream_view s, rmm::device_memory_resource* mr)
+void algorithm_function(int x, rmm::cuda_stream_view s, rmm::device_async_resource_ref mr)
 {
   ...
 }
@@ -194,9 +194,10 @@ and produce `unique_ptr`s to owning objects as output. For example,
 std::unique_ptr<table> sort(table_view const& input);
 ```
 
-## rmm::device_memory_resource
+## Memory Resources
 
-libcudf allocates all device memory via RMM memory resources (MR). See the
+libcudf allocates all device memory via RMM memory resources (MR) or CUDA MRs. Either type
+can be passed to libcudf functions via `rmm::device_async_resource_ref` parameters. See the
 [RMM documentation](https://github.com/rapidsai/rmm/blob/main/README.md) for details.
 
 ### Current Device Memory Resource
@@ -206,6 +207,27 @@ RMM provides a "default" memory resource for each device that can be accessed an
 respectively. All memory resource parameters should be defaulted to use the return value of
 `rmm::mr::get_current_device_resource()`.
 
+### Resource Refs
+
+Memory resources are passed via resource ref parameters. A resource ref is a memory resource wrapper
+that enables consumers to specify properties of resources that they expect. These are defined
+in the `cuda::mr` namespace of libcu++, but RMM provides some convenience wrappers in
+`rmm/resource_ref.hpp`:
+ - `rmm::device_resource_ref` accepts a memory resource that provides synchronous allocation
+    of device-accessible memory.
+ - `rmm::device_async_resource_ref` accepts a memory resource that provides stream-ordered allocation
+    of device-accessible memory.
+ - `rmm::host_resource_ref` accepts a memory resource that provides synchronous allocation of host-
+    accessible memory.
+ - `rmm::host_async_resource_ref` accepts a memory resource that provides stream-ordered allocation
+    of host-accessible memory.
+ - `rmm::host_device_resource_ref` accepts a memory resource that provides synchronous allocation of
+    host- and device-accessible memory.
+ - `rmm::host_async_resource_ref` accepts a memory resource that provides stream-ordered allocation
+    of host- and device-accessible memory.
+
+See the libcu++ [docs on `resource_ref`](https://nvidia.github.io/cccl/libcudacxx/extended_api/memory_resource/resource_ref.html) for more information.
+
 ## cudf::column
 
 `cudf::column` is a core owning data structure in libcudf. Most libcudf public APIs produce either
@@ -519,23 +541,23 @@ how device memory is allocated.
 
 ### Output Memory
 
-Any libcudf API that allocates memory that is *returned* to a user must accept a pointer to a
-`device_memory_resource` as the last parameter. Inside the API, this memory resource must be used
-to allocate any memory for returned objects. It should therefore be passed into functions whose
-outputs will be returned. Example:
+Any libcudf API that allocates memory that is *returned* to a user must accept a
+`rmm::device_async_resource_ref` as the last parameter. Inside the API, this memory resource must
+be used to allocate any memory for returned objects. It should therefore be passed into functions
+whose outputs will be returned. Example:
 
 ```c++
 // Returned `column` contains newly allocated memory,
 // therefore the API must accept a memory resource pointer
 std::unique_ptr<column> returns_output_memory(
-  ..., rmm::device_memory_resource * mr = rmm::mr::get_current_device_resource());
+  ..., rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 // This API does not allocate any new *output* memory, therefore
 // a memory resource is unnecessary
 void does_not_allocate_output_memory(...);
 ```
 
-This rule automatically applies to all detail APIs that allocates memory. Any detail API may be
+This rule automatically applies to all detail APIs that allocate memory. Any detail API may be
 called by any public API, and therefore could be allocating memory that is returned to the user.
 To support such uses cases, all detail APIs allocating memory resources should accept an `mr`
 parameter. Callers are responsible for either passing through a provided `mr` or
@@ -549,7 +571,7 @@ obtained from `rmm::mr::get_current_device_resource()` for temporary memory allo
 
 ```c++
 rmm::device_buffer some_function(
-  ..., rmm::mr::device_memory_resource mr * = rmm::mr::get_current_device_resource()) {
+  ..., rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) {
     rmm::device_buffer returned_buffer(..., mr); // Returned buffer uses the passed in MR
     ...
     rmm::device_buffer temporary_buffer(...); // Temporary buffer uses default MR
@@ -561,11 +583,11 @@ rmm::device_buffer some_function(
 ### Memory Management
 
 libcudf code generally eschews raw pointers and direct memory allocation. Use RMM classes built to
-use `device_memory_resource`s for device memory allocation with automated lifetime management.
+use memory resources for device memory allocation with automated lifetime management.
 
 #### rmm::device_buffer
 Allocates a specified number of bytes of untyped, uninitialized device memory using a
-`device_memory_resource`. If no resource is explicitly provided, uses
+memory resource. If no `rmm::device_async_resource_ref` is explicitly provided, it uses
 `rmm::mr::get_current_device_resource()`.
 
 `rmm::device_buffer` is movable and copyable on a stream. A copy performs a deep copy of the

From 528758059e674333ac4ca9b783d5adce7d61248d Mon Sep 17 00:00:00 2001
From: DanialJavady96 <154250392+DanialJavady96@users.noreply.github.com>
Date: Tue, 30 Apr 2024 10:35:53 -0400
Subject: [PATCH 116/842] Refactor joins for conditional semis and antis
 (#14646)

Add a new kernel to be used for both semi and anti joins.
Add some new device functions for adding only one array of shared_memory for caching.

Tests pass on my 3080.

Authors:
  - https://github.com/DanialJavady96
  - Danial Javady (https://github.com/ZelboK)
  - Yunsong Wang (https://github.com/PointKernel)
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/14646
---
 cpp/src/join/conditional_join.cu          | 149 +++++++++++++++++-----
 cpp/src/join/conditional_join_kernels.cuh |  94 ++++++++++++++
 cpp/src/join/join_common_utils.cuh        |  39 +++++-
 3 files changed, 249 insertions(+), 33 deletions(-)

diff --git a/cpp/src/join/conditional_join.cu b/cpp/src/join/conditional_join.cu
index 095093d08e5..f02dee5f7f5 100644
--- a/cpp/src/join/conditional_join.cu
+++ b/cpp/src/join/conditional_join.cu
@@ -37,6 +37,99 @@
 namespace cudf {
 namespace detail {
 
+std::unique_ptr<rmm::device_uvector<size_type>> conditional_join_anti_semi(
+  table_view const& left,
+  table_view const& right,
+  ast::expression const& binary_predicate,
+  join_kind join_type,
+  std::optional<std::size_t> output_size,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr)
+{
+  if (right.num_rows() == 0) {
+    switch (join_type) {
+      case join_kind::LEFT_ANTI_JOIN:
+        return std::make_unique<rmm::device_uvector<size_type>>(left.num_rows(), stream, mr);
+      case join_kind::LEFT_SEMI_JOIN:
+        return std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr);
+      default: CUDF_FAIL("Invalid join kind."); break;
+    }
+  } else if (left.num_rows() == 0) {
+    switch (join_type) {
+      case join_kind::LEFT_ANTI_JOIN: [[fallthrough]];
+      case join_kind::LEFT_SEMI_JOIN:
+        return std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr);
+      default: CUDF_FAIL("Invalid join kind."); break;
+    }
+  }
+
+  auto const has_nulls = binary_predicate.may_evaluate_null(left, right, stream);
+
+  auto const parser =
+    ast::detail::expression_parser{binary_predicate, left, right, has_nulls, stream, mr};
+  CUDF_EXPECTS(parser.output_type().id() == type_id::BOOL8,
+               "The expression must produce a Boolean output.");
+
+  auto left_table  = table_device_view::create(left, stream);
+  auto right_table = table_device_view::create(right, stream);
+
+  detail::grid_1d const config(left.num_rows(), DEFAULT_JOIN_BLOCK_SIZE);
+  auto const shmem_size_per_block = parser.shmem_per_thread * config.num_threads_per_block;
+
+  // TODO: Remove the output_size parameter. It is not needed because the
+  // output size is bounded by the size of the left table.
+  std::size_t join_size;
+  if (output_size.has_value()) {
+    join_size = *output_size;
+  } else {
+    // Allocate storage for the counter used to get the size of the join output
+    rmm::device_scalar<std::size_t> size(0, stream, mr);
+    if (has_nulls) {
+      compute_conditional_join_output_size<DEFAULT_JOIN_BLOCK_SIZE, true>
+        <<<config.num_blocks, config.num_threads_per_block, shmem_size_per_block, stream.value()>>>(
+          *left_table, *right_table, join_type, parser.device_expression_data, false, size.data());
+    } else {
+      compute_conditional_join_output_size<DEFAULT_JOIN_BLOCK_SIZE, false>
+        <<<config.num_blocks, config.num_threads_per_block, shmem_size_per_block, stream.value()>>>(
+          *left_table, *right_table, join_type, parser.device_expression_data, false, size.data());
+    }
+    join_size = size.value(stream);
+  }
+
+  if (left.num_rows() == 0) {
+    return std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr);
+  }
+
+  rmm::device_scalar<size_type> write_index(0, stream);
+
+  auto left_indices = std::make_unique<rmm::device_uvector<size_type>>(join_size, stream, mr);
+
+  auto const& join_output_l = left_indices->data();
+
+  if (has_nulls) {
+    conditional_join_anti_semi<DEFAULT_JOIN_BLOCK_SIZE, DEFAULT_JOIN_CACHE_SIZE, true>
+      <<<config.num_blocks, config.num_threads_per_block, shmem_size_per_block, stream.value()>>>(
+        *left_table,
+        *right_table,
+        join_type,
+        join_output_l,
+        write_index.data(),
+        parser.device_expression_data,
+        join_size);
+  } else {
+    conditional_join_anti_semi<DEFAULT_JOIN_BLOCK_SIZE, DEFAULT_JOIN_CACHE_SIZE, false>
+      <<<config.num_blocks, config.num_threads_per_block, shmem_size_per_block, stream.value()>>>(
+        *left_table,
+        *right_table,
+        join_type,
+        join_output_l,
+        write_index.data(),
+        parser.device_expression_data,
+        join_size);
+  }
+  return left_indices;
+}
+
 std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
           std::unique_ptr<rmm::device_uvector<size_type>>>
 conditional_join(table_view const& left,
@@ -50,9 +143,7 @@ conditional_join(table_view const& left,
   // We can immediately filter out cases where the right table is empty. In
   // some cases, we return all the rows of the left table with a corresponding
   // null index for the right table; in others, we return an empty output.
-  auto right_num_rows{right.num_rows()};
-  auto left_num_rows{left.num_rows()};
-  if (right_num_rows == 0) {
+  if (right.num_rows() == 0) {
     switch (join_type) {
       // Left, left anti, and full all return all the row indices from left
       // with a corresponding NULL from the right.
@@ -67,7 +158,7 @@ conditional_join(table_view const& left,
                          std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr));
       default: CUDF_FAIL("Invalid join kind."); break;
     }
-  } else if (left_num_rows == 0) {
+  } else if (left.num_rows() == 0) {
     switch (join_type) {
       // Left, left anti, left semi, and inner joins all return empty sets.
       case join_kind::LEFT_JOIN:
@@ -101,8 +192,8 @@ conditional_join(table_view const& left,
 
   // For inner joins we support optimizing the join by launching one thread for
   // whichever table is larger rather than always using the left table.
-  auto swap_tables = (join_type == join_kind::INNER_JOIN) && (right_num_rows > left_num_rows);
-  detail::grid_1d const config(swap_tables ? right_num_rows : left_num_rows,
+  auto swap_tables = (join_type == join_kind::INNER_JOIN) && (right.num_rows() > left.num_rows());
+  detail::grid_1d const config(swap_tables ? right.num_rows() : left.num_rows(),
                                DEFAULT_JOIN_BLOCK_SIZE);
   auto const shmem_size_per_block = parser.shmem_per_thread * config.num_threads_per_block;
   join_kind const kernel_join_type =
@@ -187,7 +278,7 @@ conditional_join(table_view const& left,
   // by any row in the left table.
   if (join_type == join_kind::FULL_JOIN) {
     auto complement_indices = detail::get_left_join_indices_complement(
-      join_indices.second, left_num_rows, right_num_rows, stream, mr);
+      join_indices.second, left.num_rows(), right.num_rows(), stream, mr);
     join_indices = detail::concatenate_vector_pairs(join_indices, complement_indices, stream);
   }
   return join_indices;
@@ -210,21 +301,19 @@ std::size_t compute_conditional_join_output_size(table_view const& left,
   // We can immediately filter out cases where one table is empty. In
   // some cases, we return all the rows of the other table with a corresponding
   // null index for the empty table; in others, we return an empty output.
-  auto right_num_rows{right.num_rows()};
-  auto left_num_rows{left.num_rows()};
-  if (right_num_rows == 0) {
+  if (right.num_rows() == 0) {
     switch (join_type) {
       // Left, left anti, and full all return all the row indices from left
       // with a corresponding NULL from the right.
       case join_kind::LEFT_JOIN:
       case join_kind::LEFT_ANTI_JOIN:
-      case join_kind::FULL_JOIN: return left_num_rows;
+      case join_kind::FULL_JOIN: return left.num_rows();
       // Inner and left semi joins return empty output because no matches can exist.
       case join_kind::INNER_JOIN:
       case join_kind::LEFT_SEMI_JOIN: return 0;
       default: CUDF_FAIL("Invalid join kind."); break;
     }
-  } else if (left_num_rows == 0) {
+  } else if (left.num_rows() == 0) {
     switch (join_type) {
       // Left, left anti, left semi, and inner joins all return empty sets.
       case join_kind::LEFT_JOIN:
@@ -232,7 +321,7 @@ std::size_t compute_conditional_join_output_size(table_view const& left,
       case join_kind::INNER_JOIN:
       case join_kind::LEFT_SEMI_JOIN: return 0;
       // Full joins need to return the trivial complement.
-      case join_kind::FULL_JOIN: return right_num_rows;
+      case join_kind::FULL_JOIN: return right.num_rows();
       default: CUDF_FAIL("Invalid join kind."); break;
     }
   }
@@ -254,8 +343,8 @@ std::size_t compute_conditional_join_output_size(table_view const& left,
 
   // For inner joins we support optimizing the join by launching one thread for
   // whichever table is larger rather than always using the left table.
-  auto swap_tables = (join_type == join_kind::INNER_JOIN) && (right_num_rows > left_num_rows);
-  detail::grid_1d const config(swap_tables ? right_num_rows : left_num_rows,
+  auto swap_tables = (join_type == join_kind::INNER_JOIN) && (right.num_rows() > left.num_rows());
+  detail::grid_1d const config(swap_tables ? right.num_rows() : left.num_rows(),
                                DEFAULT_JOIN_BLOCK_SIZE);
   auto const shmem_size_per_block = parser.shmem_per_thread * config.num_threads_per_block;
 
@@ -349,14 +438,13 @@ std::unique_ptr<rmm::device_uvector<size_type>> conditional_left_semi_join(
   rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return std::move(detail::conditional_join(left,
-                                            right,
-                                            binary_predicate,
-                                            detail::join_kind::LEFT_SEMI_JOIN,
-                                            output_size,
-                                            cudf::get_default_stream(),
-                                            mr)
-                     .first);
+  return std::move(detail::conditional_join_anti_semi(left,
+                                                      right,
+                                                      binary_predicate,
+                                                      detail::join_kind::LEFT_SEMI_JOIN,
+                                                      output_size,
+                                                      cudf::get_default_stream(),
+                                                      mr));
 }
 
 std::unique_ptr<rmm::device_uvector<size_type>> conditional_left_anti_join(
@@ -367,14 +455,13 @@ std::unique_ptr<rmm::device_uvector<size_type>> conditional_left_anti_join(
   rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return std::move(detail::conditional_join(left,
-                                            right,
-                                            binary_predicate,
-                                            detail::join_kind::LEFT_ANTI_JOIN,
-                                            output_size,
-                                            cudf::get_default_stream(),
-                                            mr)
-                     .first);
+  return std::move(detail::conditional_join_anti_semi(left,
+                                                      right,
+                                                      binary_predicate,
+                                                      detail::join_kind::LEFT_ANTI_JOIN,
+                                                      output_size,
+                                                      cudf::get_default_stream(),
+                                                      mr));
 }
 
 std::size_t conditional_inner_join_size(table_view const& left,
diff --git a/cpp/src/join/conditional_join_kernels.cuh b/cpp/src/join/conditional_join_kernels.cuh
index cc57fa7b03b..5e190eb2b27 100644
--- a/cpp/src/join/conditional_join_kernels.cuh
+++ b/cpp/src/join/conditional_join_kernels.cuh
@@ -271,6 +271,100 @@ CUDF_KERNEL void conditional_join(table_device_view left_table,
   }
 }
 
+template <cudf::size_type block_size, cudf::size_type output_cache_size, bool has_nulls>
+CUDF_KERNEL void conditional_join_anti_semi(
+  table_device_view left_table,
+  table_device_view right_table,
+  join_kind join_type,
+  cudf::size_type* join_output_l,
+  cudf::size_type* current_idx,
+  cudf::ast::detail::expression_device_view device_expression_data,
+  cudf::size_type const max_size)
+{
+  constexpr int num_warps = block_size / detail::warp_size;
+  __shared__ cudf::size_type current_idx_shared[num_warps];
+  __shared__ cudf::size_type join_shared_l[num_warps][output_cache_size];
+
+  extern __shared__ char raw_intermediate_storage[];
+  cudf::ast::detail::IntermediateDataType<has_nulls>* intermediate_storage =
+    reinterpret_cast<cudf::ast::detail::IntermediateDataType<has_nulls>*>(raw_intermediate_storage);
+  auto thread_intermediate_storage =
+    &intermediate_storage[threadIdx.x * device_expression_data.num_intermediates];
+
+  int const warp_id                            = threadIdx.x / detail::warp_size;
+  int const lane_id                            = threadIdx.x % detail::warp_size;
+  cudf::thread_index_type const outer_num_rows = left_table.num_rows();
+  cudf::thread_index_type const inner_num_rows = right_table.num_rows();
+  auto const stride                            = cudf::detail::grid_1d::grid_stride();
+  auto const start_idx                         = cudf::detail::grid_1d::global_thread_id();
+
+  if (0 == lane_id) { current_idx_shared[warp_id] = 0; }
+
+  __syncwarp();
+
+  unsigned int const activemask = __ballot_sync(0xffff'ffffu, start_idx < outer_num_rows);
+
+  auto evaluator = cudf::ast::detail::expression_evaluator<has_nulls>(
+    left_table, right_table, device_expression_data);
+
+  for (cudf::thread_index_type outer_row_index = start_idx; outer_row_index < outer_num_rows;
+       outer_row_index += stride) {
+    bool found_match = false;
+    for (thread_index_type inner_row_index(0); inner_row_index < inner_num_rows;
+         ++inner_row_index) {
+      auto output_dest = cudf::ast::detail::value_expression_result<bool, has_nulls>();
+
+      evaluator.evaluate(
+        output_dest, outer_row_index, inner_row_index, 0, thread_intermediate_storage);
+
+      if (output_dest.is_valid() && output_dest.value()) {
+        if (join_type == join_kind::LEFT_SEMI_JOIN && !found_match) {
+          add_left_to_cache(outer_row_index, current_idx_shared, warp_id, join_shared_l[warp_id]);
+        }
+        found_match = true;
+      }
+
+      __syncwarp(activemask);
+
+      auto const do_flush   = current_idx_shared[warp_id] + detail::warp_size >= output_cache_size;
+      auto const flush_mask = __ballot_sync(activemask, do_flush);
+      if (do_flush) {
+        flush_output_cache<num_warps, output_cache_size>(flush_mask,
+                                                         max_size,
+                                                         warp_id,
+                                                         lane_id,
+                                                         current_idx,
+                                                         current_idx_shared,
+                                                         join_shared_l,
+                                                         join_output_l);
+        __syncwarp(flush_mask);
+        if (0 == lane_id) { current_idx_shared[warp_id] = 0; }
+      }
+      __syncwarp(activemask);
+    }
+
+    if ((join_type == join_kind::LEFT_ANTI_JOIN) && (!found_match)) {
+      add_left_to_cache(outer_row_index, current_idx_shared, warp_id, join_shared_l[warp_id]);
+    }
+
+    __syncwarp(activemask);
+
+    auto const do_flush   = current_idx_shared[warp_id] > 0;
+    auto const flush_mask = __ballot_sync(activemask, do_flush);
+    if (do_flush) {
+      flush_output_cache<num_warps, output_cache_size>(flush_mask,
+                                                       max_size,
+                                                       warp_id,
+                                                       lane_id,
+                                                       current_idx,
+                                                       current_idx_shared,
+                                                       join_shared_l,
+                                                       join_output_l);
+    }
+    if (found_match) break;
+  }
+}
+
 }  // namespace detail
 
 }  // namespace cudf
diff --git a/cpp/src/join/join_common_utils.cuh b/cpp/src/join/join_common_utils.cuh
index 9758919c5b4..31f267d5cfb 100644
--- a/cpp/src/join/join_common_utils.cuh
+++ b/cpp/src/join/join_common_utils.cuh
@@ -281,12 +281,21 @@ __inline__ __device__ void add_pair_to_cache(size_type const first,
                                              size_type* joined_shared_r)
 {
   size_type my_current_idx{atomicAdd(current_idx_shared + warp_id, size_type(1))};
-
   // its guaranteed to fit into the shared cache
   joined_shared_l[my_current_idx] = first;
   joined_shared_r[my_current_idx] = second;
 }
 
+__inline__ __device__ void add_left_to_cache(size_type const first,
+                                             size_type* current_idx_shared,
+                                             int const warp_id,
+                                             size_type* joined_shared_l)
+{
+  size_type my_current_idx{atomicAdd(current_idx_shared + warp_id, size_type(1))};
+
+  joined_shared_l[my_current_idx] = first;
+}
+
 template <int num_warps, cudf::size_type output_cache_size>
 __device__ void flush_output_cache(unsigned int const activemask,
                                    cudf::size_type const max_size,
@@ -300,7 +309,7 @@ __device__ void flush_output_cache(unsigned int const activemask,
                                    size_type* join_output_r)
 {
   // count how many active threads participating here which could be less than warp_size
-  int num_threads               = __popc(activemask);
+  int const num_threads         = __popc(activemask);
   cudf::size_type output_offset = 0;
 
   if (0 == lane_id) { output_offset = atomicAdd(current_idx, current_idx_shared[warp_id]); }
@@ -322,6 +331,32 @@ __device__ void flush_output_cache(unsigned int const activemask,
   }
 }
 
+template <int num_warps, cudf::size_type output_cache_size>
+__device__ void flush_output_cache(unsigned int const activemask,
+                                   cudf::size_type const max_size,
+                                   int const warp_id,
+                                   int const lane_id,
+                                   cudf::size_type* current_idx,
+                                   cudf::size_type current_idx_shared[num_warps],
+                                   size_type join_shared_l[num_warps][output_cache_size],
+                                   size_type* join_output_l)
+{
+  int const num_threads         = __popc(activemask);
+  cudf::size_type output_offset = 0;
+
+  if (0 == lane_id) { output_offset = atomicAdd(current_idx, current_idx_shared[warp_id]); }
+
+  output_offset = cub::ShuffleIndex<detail::warp_size>(output_offset, 0, activemask);
+
+  for (int shared_out_idx = lane_id; shared_out_idx < current_idx_shared[warp_id];
+       shared_out_idx += num_threads) {
+    cudf::size_type thread_offset = output_offset + shared_out_idx;
+    if (thread_offset < max_size) {
+      join_output_l[thread_offset] = join_shared_l[warp_id][shared_out_idx];
+    }
+  }
+}
+
 }  // namespace detail
 
 }  // namespace cudf

From b9c6d4c5f4bbbb75ec7b31bcdfc7546812806c32 Mon Sep 17 00:00:00 2001
From: "Richard (Rick) Zamora" <rzamora217@gmail.com>
Date: Tue, 30 Apr 2024 12:30:06 -0500
Subject: [PATCH 117/842] Deprecate `to/from_dask_dataframe` APIs in dask-cudf
 (#15592)

The `to/from_dask_dataframe` APIs have been obsolete for a long time. It is always better to use `ddf.to_backend("cudf")` or `ddf.to_backend("pandas")` instead.

These APIs are also "dangerous" to use with dask-expr, because the same API names are still used to convert data to/from "legacy" Dask collections. Note that dask-expr also deprecated `to/from_dask_dataframe` in favor of `to/from_legacy_dataframe`, but the conflicting APIs still exist (for now).

Authors:
  - Richard (Rick) Zamora (https://github.com/rjzamora)

Approvers:
  - Charles Blackmon-Luca (https://github.com/charlesbluca)

URL: https://github.com/rapidsai/cudf/pull/15592
---
 docs/dask_cudf/source/api.rst                 |  3 +-
 python/dask_cudf/dask_cudf/core.py            | 30 ++++++++++++++++---
 .../dask_cudf/dask_cudf/expr/_collection.py   | 29 ++++++++++++++----
 .../dask_cudf/io/tests/test_parquet.py        |  2 +-
 python/dask_cudf/dask_cudf/tests/test_core.py | 24 +++++++++++++++
 .../dask_cudf/dask_cudf/tests/test_groupby.py |  4 +--
 6 files changed, 77 insertions(+), 15 deletions(-)

diff --git a/docs/dask_cudf/source/api.rst b/docs/dask_cudf/source/api.rst
index db32f4bbcb3..ab10f4af4fa 100644
--- a/docs/dask_cudf/source/api.rst
+++ b/docs/dask_cudf/source/api.rst
@@ -13,12 +13,11 @@ Creating and storing DataFrames
 of DataFrames from a variety of storage formats. For on-disk data that
 are not supported directly in Dask-cuDF, we recommend using Dask's
 data reading facilities, followed by calling
-:func:`.from_dask_dataframe` to obtain a Dask-cuDF object.
+:meth:`*.to_backend("cudf")` to obtain a Dask-cuDF object.
 
 .. automodule:: dask_cudf
    :members:
       from_cudf,
-      from_dask_dataframe,
       from_delayed,
       read_csv,
       read_json,
diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py
index bfe58531a73..3f0cfeb6d2c 100644
--- a/python/dask_cudf/dask_cudf/core.py
+++ b/python/dask_cudf/dask_cudf/core.py
@@ -55,9 +55,20 @@ def __repr__(self):
 
     @_dask_cudf_nvtx_annotate
     def to_dask_dataframe(self, **kwargs):
-        """Create a dask.dataframe object from a dask_cudf object"""
-        nullable = kwargs.get("nullable", False)
-        return self.map_partitions(M.to_pandas, nullable=nullable)
+        """Create a dask.dataframe object from a dask_cudf object
+
+        WARNING: This API is deprecated, and may not work properly
+        when query-planning is active. Please use `*.to_backend("pandas")`
+        to convert the underlying data to pandas.
+        """
+
+        warnings.warn(
+            "The `to_dask_dataframe` API is now deprecated. "
+            "Please use `*.to_backend('pandas')` instead.",
+            FutureWarning,
+        )
+
+        return self.to_backend("pandas", **kwargs)
 
 
 concat = dd.concat
@@ -733,6 +744,10 @@ def from_dask_dataframe(df):
     Convert a Dask :class:`dask.dataframe.DataFrame` to a Dask-cuDF
     one.
 
+    WARNING: This API is deprecated, and may not work properly
+    when query-planning is active. Please use `*.to_backend("cudf")`
+    to convert the underlying data to cudf.
+
     Parameters
     ----------
     df : dask.dataframe.DataFrame
@@ -742,7 +757,14 @@ def from_dask_dataframe(df):
     -------
     dask_cudf.DataFrame : A new Dask collection backed by cuDF objects
     """
-    return df.map_partitions(cudf.from_pandas)
+
+    warnings.warn(
+        "The `from_dask_dataframe` API is now deprecated. "
+        "Please use `*.to_backend('cudf')` instead.",
+        FutureWarning,
+    )
+
+    return df.to_backend("cudf")
 
 
 for name in (
diff --git a/python/dask_cudf/dask_cudf/expr/_collection.py b/python/dask_cudf/dask_cudf/expr/_collection.py
index 516e35a4335..605a81f0fcd 100644
--- a/python/dask_cudf/dask_cudf/expr/_collection.py
+++ b/python/dask_cudf/dask_cudf/expr/_collection.py
@@ -1,5 +1,6 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
+import warnings
 from functools import cached_property
 
 from dask_expr import (
@@ -22,9 +23,25 @@
 ##
 
 
-# VarMixin can be removed if cudf#15179 is addressed.
-# See: https://github.com/rapidsai/cudf/issues/15179
-class VarMixin:
+class CudfFrameBase(FrameBase):
+    def to_dask_dataframe(self, **kwargs):
+        """Create a dask.dataframe object from a dask_cudf object
+
+        WARNING: This API is deprecated, and may not work properly.
+        Please use `*.to_backend("pandas")` to convert the
+        underlying data to pandas.
+        """
+
+        warnings.warn(
+            "The `to_dask_dataframe` API is now deprecated. "
+            "Please use `*.to_backend('pandas')` instead.",
+            FutureWarning,
+        )
+
+        return self.to_backend("pandas", **kwargs)
+
+    # var can be removed if cudf#15179 is addressed.
+    # See: https://github.com/rapidsai/cudf/issues/15179
     def var(
         self,
         axis=0,
@@ -49,7 +66,7 @@ def var(
         )
 
 
-class DataFrame(VarMixin, DXDataFrame):
+class DataFrame(DXDataFrame, CudfFrameBase):
     @classmethod
     def from_dict(cls, *args, **kwargs):
         with config.set({"dataframe.backend": "cudf"}):
@@ -94,7 +111,7 @@ def read_text(*args, **kwargs):
         return from_legacy_dataframe(ddf)
 
 
-class Series(VarMixin, DXSeries):
+class Series(DXSeries, CudfFrameBase):
     def groupby(self, by, **kwargs):
         from dask_cudf.expr._groupby import SeriesGroupBy
 
@@ -113,7 +130,7 @@ def struct(self):
         return StructMethods(self)
 
 
-class Index(DXIndex):
+class Index(DXIndex, CudfFrameBase):
     pass  # Same as pandas (for now)
 
 
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
index 68460653119..8ca27df8fec 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
@@ -113,7 +113,7 @@ def test_roundtrip_from_dask_none_index_false(tmpdir):
 @pytest.mark.parametrize("write_meta", [True, False])
 def test_roundtrip_from_dask_cudf(tmpdir, write_meta):
     tmpdir = str(tmpdir)
-    gddf = dask_cudf.from_dask_dataframe(ddf)
+    gddf = ddf.to_backend("cudf")
     gddf.to_parquet(tmpdir, write_metadata_file=write_meta)
 
     gddf2 = dask_cudf.read_parquet(tmpdir, calculate_divisions=True)
diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py
index c6918c94559..4878d44d636 100644
--- a/python/dask_cudf/dask_cudf/tests/test_core.py
+++ b/python/dask_cudf/dask_cudf/tests/test_core.py
@@ -32,6 +32,30 @@ def test_from_dict_backend_dispatch():
     dd.assert_eq(expect, ddf)
 
 
+def test_to_dask_dataframe_deprecated():
+    gdf = cudf.DataFrame({"a": range(100)})
+    ddf = dd.from_pandas(gdf, npartitions=2)
+    assert isinstance(ddf._meta, cudf.DataFrame)
+
+    with pytest.warns(FutureWarning, match="API is now deprecated"):
+        assert isinstance(
+            ddf.to_dask_dataframe()._meta,
+            pd.DataFrame,
+        )
+
+
+def test_from_dask_dataframe_deprecated():
+    gdf = pd.DataFrame({"a": range(100)})
+    ddf = dd.from_pandas(gdf, npartitions=2)
+    assert isinstance(ddf._meta, pd.DataFrame)
+
+    with pytest.warns(FutureWarning, match="API is now deprecated"):
+        assert isinstance(
+            dask_cudf.from_dask_dataframe(ddf)._meta,
+            cudf.DataFrame,
+        )
+
+
 def test_to_backend():
     np.random.seed(0)
     data = {
diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py
index 3bb3e3b0bb8..1e22dd95475 100644
--- a/python/dask_cudf/dask_cudf/tests/test_groupby.py
+++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py
@@ -562,9 +562,9 @@ def test_groupby_reset_index_string_name():
 def test_groupby_categorical_key():
     # See https://github.com/rapidsai/cudf/issues/4608
     df = dask.datasets.timeseries()
-    gddf = dask_cudf.from_dask_dataframe(df)
+    gddf = df.to_backend("cudf")
     gddf["name"] = gddf["name"].astype("category")
-    ddf = gddf.to_dask_dataframe()
+    ddf = gddf.to_backend("pandas")
 
     got = gddf.groupby("name", sort=True).agg(
         {"x": ["mean", "max"], "y": ["mean", "count"]}

From f3206eabeafe1510e1484312c33e8e9be9c1d891 Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar <shruti.shivakumar@gmail.com>
Date: Tue, 30 Apr 2024 14:33:54 -0400
Subject: [PATCH 118/842] Optimizing multi-source byte range reading in JSON
 reader (#15396)

This piece of work seeks to achieve two goals - (i) reducing repeated reading of byte range chunks in the JSON reader, and (ii) enabling multi-source byte range reading for chunks spanning sources.
- We expand on the idea outlined in #15185 to reduce the repeated reading of follow-on chunks while searching for the end of the last row in the requested chunk. After the requested chunk, the following chunks are divided into subchunks, and read until the delimiter character is reached.
- We estimate the buffer size needed for the entire byte range, and compute offsets per source into the buffer.

[Visualization of the performance improvement with this optimization](https://github.com/rapidsai/cudf/pull/15396#issuecomment-2044217170)

Authors:
  - Shruti Shivakumar (https://github.com/shrshi)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - MithunR (https://github.com/mythrocks)
  - Mike Wilson (https://github.com/hyperbolic2346)

URL: https://github.com/rapidsai/cudf/pull/15396
---
 cpp/include/cudf/io/detail/json.hpp           |  17 +-
 cpp/src/io/json/json_normalization.cu         |  30 ++-
 cpp/src/io/json/read_json.cu                  | 248 +++++++++++-------
 .../io/json_quote_normalization_test.cpp      |  25 +-
 cpp/tests/io/json_test.cpp                    | 105 ++++++++
 .../io/json_whitespace_normalization_test.cu  |  24 +-
 6 files changed, 313 insertions(+), 136 deletions(-)

diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp
index cf8e23c2d93..540a584908d 100644
--- a/cpp/include/cudf/io/detail/json.hpp
+++ b/cpp/include/cudf/io/detail/json.hpp
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include <cudf/io/datasource.hpp>
 #include <cudf/io/json.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -56,22 +57,22 @@ void write_json(data_sink* sink,
 /**
  * @brief Normalize single quotes to double quotes using FST
  *
- * @param inbuf Input device buffer
+ * @param indata Input device buffer
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource to use for device memory allocation
  */
-rmm::device_uvector<char> normalize_single_quotes(rmm::device_uvector<char>&& inbuf,
-                                                  rmm::cuda_stream_view stream,
-                                                  rmm::device_async_resource_ref mr);
+void normalize_single_quotes(datasource::owning_buffer<rmm::device_uvector<char>>& indata,
+                             rmm::cuda_stream_view stream,
+                             rmm::device_async_resource_ref mr);
 
 /**
  * @brief Normalize unquoted whitespace (space and tab characters) using FST
  *
- * @param inbuf Input device buffer
+ * @param indata Input device buffer
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource to use for device memory allocation
  */
-rmm::device_uvector<char> normalize_whitespace(rmm::device_uvector<char>&& inbuf,
-                                               rmm::cuda_stream_view stream,
-                                               rmm::device_async_resource_ref mr);
+void normalize_whitespace(datasource::owning_buffer<rmm::device_uvector<char>>& indata,
+                          rmm::cuda_stream_view stream,
+                          rmm::device_async_resource_ref mr);
 }  // namespace cudf::io::json::detail
diff --git a/cpp/src/io/json/json_normalization.cu b/cpp/src/io/json/json_normalization.cu
index eb06ea0177e..ca56a12eb36 100644
--- a/cpp/src/io/json/json_normalization.cu
+++ b/cpp/src/io/json/json_normalization.cu
@@ -298,9 +298,9 @@ struct TransduceToNormalizedWS {
 
 namespace detail {
 
-rmm::device_uvector<SymbolT> normalize_single_quotes(rmm::device_uvector<SymbolT>&& inbuf,
-                                                     rmm::cuda_stream_view stream,
-                                                     rmm::device_async_resource_ref mr)
+void normalize_single_quotes(datasource::owning_buffer<rmm::device_uvector<SymbolT>>& indata,
+                             rmm::cuda_stream_view stream,
+                             rmm::device_async_resource_ref mr)
 {
   auto parser = fst::detail::make_fst(
     fst::detail::make_symbol_group_lut(normalize_quotes::qna_sgs),
@@ -308,10 +308,10 @@ rmm::device_uvector<SymbolT> normalize_single_quotes(rmm::device_uvector<SymbolT
     fst::detail::make_translation_functor(normalize_quotes::TransduceToNormalizedQuotes{}),
     stream);
 
-  rmm::device_uvector<SymbolT> outbuf(inbuf.size() * 2, stream, mr);
+  rmm::device_uvector<SymbolT> outbuf(indata.size() * 2, stream, mr);
   rmm::device_scalar<SymbolOffsetT> outbuf_size(stream, mr);
-  parser.Transduce(inbuf.data(),
-                   static_cast<SymbolOffsetT>(inbuf.size()),
+  parser.Transduce(indata.data(),
+                   static_cast<SymbolOffsetT>(indata.size()),
                    outbuf.data(),
                    thrust::make_discard_iterator(),
                    outbuf_size.data(),
@@ -319,12 +319,13 @@ rmm::device_uvector<SymbolT> normalize_single_quotes(rmm::device_uvector<SymbolT
                    stream);
 
   outbuf.resize(outbuf_size.value(stream), stream);
-  return outbuf;
+  datasource::owning_buffer<rmm::device_uvector<SymbolT>> outdata(std::move(outbuf));
+  std::swap(indata, outdata);
 }
 
-rmm::device_uvector<SymbolT> normalize_whitespace(rmm::device_uvector<SymbolT>&& inbuf,
-                                                  rmm::cuda_stream_view stream,
-                                                  rmm::device_async_resource_ref mr)
+void normalize_whitespace(datasource::owning_buffer<rmm::device_uvector<SymbolT>>& indata,
+                          rmm::cuda_stream_view stream,
+                          rmm::device_async_resource_ref mr)
 {
   auto parser = fst::detail::make_fst(
     fst::detail::make_symbol_group_lut(normalize_whitespace::wna_sgs),
@@ -332,10 +333,10 @@ rmm::device_uvector<SymbolT> normalize_whitespace(rmm::device_uvector<SymbolT>&&
     fst::detail::make_translation_functor(normalize_whitespace::TransduceToNormalizedWS{}),
     stream);
 
-  rmm::device_uvector<SymbolT> outbuf(inbuf.size(), stream, mr);
+  rmm::device_uvector<SymbolT> outbuf(indata.size(), stream, mr);
   rmm::device_scalar<SymbolOffsetT> outbuf_size(stream, mr);
-  parser.Transduce(inbuf.data(),
-                   static_cast<SymbolOffsetT>(inbuf.size()),
+  parser.Transduce(indata.data(),
+                   static_cast<SymbolOffsetT>(indata.size()),
                    outbuf.data(),
                    thrust::make_discard_iterator(),
                    outbuf_size.data(),
@@ -343,7 +344,8 @@ rmm::device_uvector<SymbolT> normalize_whitespace(rmm::device_uvector<SymbolT>&&
                    stream);
 
   outbuf.resize(outbuf_size.value(stream), stream);
-  return outbuf;
+  datasource::owning_buffer<rmm::device_uvector<SymbolT>> outdata(std::move(outbuf));
+  std::swap(indata, outdata);
 }
 
 }  // namespace detail
diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu
index 81ef3a51afc..89c301ec055 100644
--- a/cpp/src/io/json/read_json.cu
+++ b/cpp/src/io/json/read_json.cu
@@ -20,10 +20,13 @@
 #include "read_json.hpp"
 
 #include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/detail/utilities/stream_pool.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/io/detail/json.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/span.hpp>
 
+#include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
 #include <rmm/resource_ref.hpp>
 
@@ -49,17 +52,20 @@ size_t sources_size(host_span<std::unique_ptr<datasource>> const sources,
 /**
  * @brief Read from array of data sources into RMM buffer
  *
+ * @param buffer Device span buffer to which data is read
  * @param sources Array of data sources
  * @param compression Compression format of source
  * @param range_offset Number of bytes to skip from source start
  * @param range_size Number of bytes to read from source
  * @param stream CUDA stream used for device memory operations and kernel launches
+ * @returns A subspan of the input device span containing data read
  */
-rmm::device_uvector<char> ingest_raw_input(host_span<std::unique_ptr<datasource>> sources,
-                                           compression_type compression,
-                                           size_t range_offset,
-                                           size_t range_size,
-                                           rmm::cuda_stream_view stream)
+device_span<char> ingest_raw_input(device_span<char> buffer,
+                                   host_span<std::unique_ptr<datasource>> sources,
+                                   compression_type compression,
+                                   size_t range_offset,
+                                   size_t range_size,
+                                   rmm::cuda_stream_view stream)
 {
   CUDF_FUNC_RANGE();
   // We append a line delimiter between two files to make sure the last line of file i and the first
@@ -68,33 +74,43 @@ rmm::device_uvector<char> ingest_raw_input(host_span<std::unique_ptr<datasource>
   auto constexpr num_delimiter_chars = 1;
   auto const num_extra_delimiters    = num_delimiter_chars * (sources.size() - 1);
 
-  // Iterate through the user defined sources and read the contents into the local buffer
-  auto const total_source_size =
-    sources_size(sources, range_offset, range_size) + num_extra_delimiters;
-
   if (compression == compression_type::NONE) {
     std::vector<size_type> delimiter_map{};
+    std::vector<size_t> prefsum_source_sizes(sources.size());
+    std::vector<std::unique_ptr<datasource::buffer>> h_buffers;
     delimiter_map.reserve(sources.size());
-    auto d_buffer     = rmm::device_uvector<char>(total_source_size, stream);
     size_t bytes_read = 0;
-    std::vector<std::unique_ptr<datasource::buffer>> h_buffers;
-    for (auto const& source : sources) {
-      if (!source->is_empty()) {
-        auto data_size   = (range_size != 0) ? range_size : source->size();
-        auto destination = reinterpret_cast<uint8_t*>(d_buffer.data()) + bytes_read;
-        if (source->is_device_read_preferred(data_size)) {
-          bytes_read += source->device_read(range_offset, data_size, destination, stream);
-        } else {
-          h_buffers.emplace_back(source->host_read(range_offset, data_size));
-          auto const& h_buffer = h_buffers.back();
-          CUDF_CUDA_TRY(cudaMemcpyAsync(
-            destination, h_buffer->data(), h_buffer->size(), cudaMemcpyDefault, stream.value()));
-          bytes_read += h_buffer->size();
-        }
-        delimiter_map.push_back(bytes_read);
-        bytes_read += num_delimiter_chars;
+    std::transform_inclusive_scan(sources.begin(),
+                                  sources.end(),
+                                  prefsum_source_sizes.begin(),
+                                  std::plus<int>{},
+                                  [](const std::unique_ptr<datasource>& s) { return s->size(); });
+    auto upper =
+      std::upper_bound(prefsum_source_sizes.begin(), prefsum_source_sizes.end(), range_offset);
+    size_t start_source = std::distance(prefsum_source_sizes.begin(), upper);
+
+    auto remaining_bytes_to_read = std::min(range_size, prefsum_source_sizes.back() - range_offset);
+    range_offset -= start_source ? prefsum_source_sizes[start_source - 1] : 0;
+    for (size_t i = start_source; i < sources.size() && remaining_bytes_to_read; i++) {
+      if (sources[i]->is_empty()) continue;
+      auto data_size   = std::min(sources[i]->size() - range_offset, remaining_bytes_to_read);
+      auto destination = reinterpret_cast<uint8_t*>(buffer.data()) + bytes_read;
+      if (sources[i]->is_device_read_preferred(data_size)) {
+        bytes_read += sources[i]->device_read(range_offset, data_size, destination, stream);
+      } else {
+        h_buffers.emplace_back(sources[i]->host_read(range_offset, data_size));
+        auto const& h_buffer = h_buffers.back();
+        CUDF_CUDA_TRY(cudaMemcpyAsync(
+          destination, h_buffer->data(), h_buffer->size(), cudaMemcpyDefault, stream.value()));
+        bytes_read += h_buffer->size();
       }
+      range_offset = 0;
+      remaining_bytes_to_read -= bytes_read;
+      delimiter_map.push_back(bytes_read);
+      bytes_read += num_delimiter_chars;
     }
+    // In the case where all sources are empty, bytes_read is zero
+    if (bytes_read) bytes_read -= num_delimiter_chars;
 
     // If this is a multi-file source, we scatter the JSON line delimiters between files
     if (sources.size() > 1) {
@@ -109,23 +125,25 @@ rmm::device_uvector<char> ingest_raw_input(host_span<std::unique_ptr<datasource>
                       delimiter_source,
                       delimiter_source + d_delimiter_map.size(),
                       d_delimiter_map.data(),
-                      d_buffer.data());
+                      buffer.data());
     }
-
     stream.synchronize();
-    return d_buffer;
-
-  } else {
-    auto buffer = std::vector<uint8_t>(total_source_size);
-    // Single read because only a single compressed source is supported
-    // Reading to host because decompression of a single block is much faster on the CPU
-    sources[0]->host_read(range_offset, total_source_size, buffer.data());
-    auto const uncomp_data = decompress(compression, buffer);
-    return cudf::detail::make_device_uvector_sync(
-      host_span<char const>{reinterpret_cast<char const*>(uncomp_data.data()), uncomp_data.size()},
-      stream,
-      rmm::mr::get_current_device_resource());
+    return buffer.first(bytes_read);
   }
+  // TODO: allow byte range reading from multiple compressed files.
+  auto remaining_bytes_to_read = std::min(range_size, sources[0]->size() - range_offset);
+  auto hbuffer                 = std::vector<uint8_t>(remaining_bytes_to_read);
+  // Single read because only a single compressed source is supported
+  // Reading to host because decompression of a single block is much faster on the CPU
+  sources[0]->host_read(range_offset, remaining_bytes_to_read, hbuffer.data());
+  auto uncomp_data = decompress(compression, hbuffer);
+  CUDF_CUDA_TRY(cudaMemcpyAsync(buffer.data(),
+                                reinterpret_cast<char*>(uncomp_data.data()),
+                                uncomp_data.size() * sizeof(char),
+                                cudaMemcpyHostToDevice,
+                                stream.value()));
+  stream.synchronize();
+  return buffer.first(uncomp_data.size());
 }
 
 size_type find_first_delimiter_in_chunk(host_span<std::unique_ptr<cudf::io::datasource>> sources,
@@ -133,21 +151,19 @@ size_type find_first_delimiter_in_chunk(host_span<std::unique_ptr<cudf::io::data
                                         char const delimiter,
                                         rmm::cuda_stream_view stream)
 {
-  auto const buffer = ingest_raw_input(sources,
-                                       reader_opts.get_compression(),
-                                       reader_opts.get_byte_range_offset(),
-                                       reader_opts.get_byte_range_size(),
-                                       stream);
+  auto const total_source_size =
+    sources_size(sources, reader_opts.get_byte_range_offset(), reader_opts.get_byte_range_size()) +
+    (sources.size() - 1);
+  rmm::device_uvector<char> buffer(total_source_size, stream);
+  ingest_raw_input(buffer,
+                   sources,
+                   reader_opts.get_compression(),
+                   reader_opts.get_byte_range_offset(),
+                   reader_opts.get_byte_range_size(),
+                   stream);
   return find_first_delimiter(buffer, delimiter, stream);
 }
 
-bool should_load_whole_source(json_reader_options const& opts, size_t source_size)
-{
-  auto const range_offset = opts.get_byte_range_offset();
-  auto const range_size   = opts.get_byte_range_size();
-  return range_offset == 0 and (range_size == 0 or range_size >= source_size);
-}
-
 /**
  * @brief Get the byte range between record starts and ends starting from the given range.
  *
@@ -159,48 +175,90 @@ bool should_load_whole_source(json_reader_options const& opts, size_t source_siz
  * @param sources Data sources to read from
  * @param reader_opts JSON reader options with range offset and range size
  * @param stream CUDA stream used for device memory operations and kernel launches
- * @return Byte range for parsing
+ * @returns Data source owning buffer enclosing the bytes read
  */
-auto get_record_range_raw_input(host_span<std::unique_ptr<datasource>> sources,
-                                json_reader_options const& reader_opts,
-                                rmm::cuda_stream_view stream)
+datasource::owning_buffer<rmm::device_uvector<char>> get_record_range_raw_input(
+  host_span<std::unique_ptr<datasource>> sources,
+  json_reader_options const& reader_opts,
+  rmm::cuda_stream_view stream)
 {
-  auto buffer = ingest_raw_input(sources,
-                                 reader_opts.get_compression(),
-                                 reader_opts.get_byte_range_offset(),
-                                 reader_opts.get_byte_range_size(),
-                                 stream);
-  if (should_load_whole_source(reader_opts, sources[0]->size())) return buffer;
-  auto first_delim_pos =
-    reader_opts.get_byte_range_offset() == 0 ? 0 : find_first_delimiter(buffer, '\n', stream);
+  CUDF_FUNC_RANGE();
+  auto geometric_mean = [](double a, double b) { return std::sqrt(a * b); };
+
+  size_t const total_source_size            = sources_size(sources, 0, 0);
+  auto constexpr num_delimiter_chars        = 1;
+  auto const num_extra_delimiters           = num_delimiter_chars * (sources.size() - 1);
+  compression_type const reader_compression = reader_opts.get_compression();
+  size_t const chunk_offset                 = reader_opts.get_byte_range_offset();
+  size_t chunk_size                         = reader_opts.get_byte_range_size();
+
+  CUDF_EXPECTS(total_source_size ? chunk_offset < total_source_size : !chunk_offset,
+               "Invalid offsetting");
+  auto should_load_all_sources = !chunk_size || chunk_size >= total_source_size - chunk_offset;
+  chunk_size =
+    should_load_all_sources ? total_source_size - chunk_offset + num_extra_delimiters : chunk_size;
+
+  // Some magic numbers
+  constexpr int num_subchunks               = 10;  // per chunk_size
+  constexpr size_t min_subchunk_size        = 10000;
+  int const num_subchunks_prealloced        = should_load_all_sources ? 0 : 3;
+  constexpr int estimated_compression_ratio = 4;
+
+  // NOTE: heuristic for choosing subchunk size: geometric mean of minimum subchunk size (set to
+  // 10kb) and the byte range size
+
+  size_t const size_per_subchunk =
+    geometric_mean(std::ceil((double)chunk_size / num_subchunks), min_subchunk_size);
+
+  // The allocation for single source compressed input is estimated by assuming a ~4:1
+  // compression ratio. For uncompressed inputs, we can getter a better estimate using the idea
+  // of subchunks.
+  auto constexpr header_size = 4096;
+  size_t const buffer_size =
+    reader_compression != compression_type::NONE
+      ? total_source_size * estimated_compression_ratio + header_size
+      : std::min(total_source_size, chunk_size + num_subchunks_prealloced * size_per_subchunk);
+  rmm::device_uvector<char> buffer(buffer_size, stream);
+  device_span<char> bufspan(buffer);
+
+  // Offset within buffer indicating first read position
+  std::int64_t buffer_offset = 0;
+  auto readbufspan =
+    ingest_raw_input(bufspan, sources, reader_compression, chunk_offset, chunk_size, stream);
+
+  auto const shift_for_nonzero_offset = std::min<std::int64_t>(chunk_offset, 1);
+  auto const first_delim_pos =
+    chunk_offset == 0 ? 0 : find_first_delimiter(readbufspan, '\n', stream);
   if (first_delim_pos == -1) {
-    return rmm::device_uvector<char>{0, stream};
-  } else {
-    first_delim_pos = first_delim_pos + reader_opts.get_byte_range_offset();
+    // return empty owning datasource buffer
+    auto empty_buf = rmm::device_uvector<char>(0, stream);
+    return datasource::owning_buffer<rmm::device_uvector<char>>(std::move(empty_buf));
+  } else if (!should_load_all_sources) {
     // Find next delimiter
-    decltype(first_delim_pos) next_delim_pos = -1;
-    auto const total_source_size             = sources_size(sources, 0, 0);
-    auto current_offset = reader_opts.get_byte_range_offset() + reader_opts.get_byte_range_size();
-    while (current_offset < total_source_size and next_delim_pos == -1) {
-      buffer         = ingest_raw_input(sources,
-                                reader_opts.get_compression(),
-                                current_offset,
-                                reader_opts.get_byte_range_size(),
-                                stream);
-      next_delim_pos = find_first_delimiter(buffer, '\n', stream);
-      if (next_delim_pos == -1) { current_offset += reader_opts.get_byte_range_size(); }
+    std::int64_t next_delim_pos = -1;
+    size_t next_subchunk_start  = chunk_offset + chunk_size;
+    while (next_subchunk_start < total_source_size && next_delim_pos < buffer_offset) {
+      buffer_offset += readbufspan.size();
+      readbufspan    = ingest_raw_input(bufspan.last(buffer_size - buffer_offset),
+                                     sources,
+                                     reader_compression,
+                                     next_subchunk_start,
+                                     size_per_subchunk,
+                                     stream);
+      next_delim_pos = find_first_delimiter(readbufspan, '\n', stream) + buffer_offset;
+      if (next_delim_pos < buffer_offset) { next_subchunk_start += size_per_subchunk; }
     }
-    if (next_delim_pos == -1) {
-      next_delim_pos = total_source_size;
-    } else {
-      next_delim_pos = next_delim_pos + current_offset;
-    }
-    return ingest_raw_input(sources,
-                            reader_opts.get_compression(),
-                            first_delim_pos,
-                            next_delim_pos - first_delim_pos,
-                            stream);
+    if (next_delim_pos < buffer_offset) next_delim_pos = buffer_offset + readbufspan.size();
+
+    return datasource::owning_buffer<rmm::device_uvector<char>>(
+      std::move(buffer),
+      reinterpret_cast<uint8_t*>(buffer.data()) + first_delim_pos + shift_for_nonzero_offset,
+      next_delim_pos - first_delim_pos - shift_for_nonzero_offset);
   }
+  return datasource::owning_buffer<rmm::device_uvector<char>>(
+    std::move(buffer),
+    reinterpret_cast<uint8_t*>(buffer.data()) + first_delim_pos + shift_for_nonzero_offset,
+    readbufspan.size() - first_delim_pos - shift_for_nonzero_offset);
 }
 
 table_with_metadata read_json(host_span<std::unique_ptr<datasource>> sources,
@@ -221,8 +279,6 @@ table_with_metadata read_json(host_span<std::unique_ptr<datasource>> sources,
   if (reader_opts.get_byte_range_offset() != 0 or reader_opts.get_byte_range_size() != 0) {
     CUDF_EXPECTS(reader_opts.is_enabled_lines(),
                  "Specifying a byte range is supported only for JSON Lines");
-    CUDF_EXPECTS(sources.size() == 1,
-                 "Specifying a byte range is supported only for a single source");
   }
 
   if (sources.size() > 1) {
@@ -232,22 +288,24 @@ table_with_metadata read_json(host_span<std::unique_ptr<datasource>> sources,
                  "Multiple inputs are supported only for JSON Lines format");
   }
 
-  auto buffer = get_record_range_raw_input(sources, reader_opts, stream);
+  datasource::owning_buffer<rmm::device_uvector<char>> bufview =
+    get_record_range_raw_input(sources, reader_opts, stream);
 
   // If input JSON buffer has single quotes and option to normalize single quotes is enabled,
   // invoke pre-processing FST
   if (reader_opts.is_enabled_normalize_single_quotes()) {
-    buffer =
-      normalize_single_quotes(std::move(buffer), stream, rmm::mr::get_current_device_resource());
+    normalize_single_quotes(bufview, stream, rmm::mr::get_current_device_resource());
   }
 
   // If input JSON buffer has unquoted spaces and tabs and option to normalize whitespaces is
   // enabled, invoke pre-processing FST
   if (reader_opts.is_enabled_normalize_whitespace()) {
-    buffer =
-      normalize_whitespace(std::move(buffer), stream, rmm::mr::get_current_device_resource());
+    normalize_whitespace(bufview, stream, rmm::mr::get_current_device_resource());
   }
 
+  auto buffer =
+    cudf::device_span<char const>(reinterpret_cast<char const*>(bufview.data()), bufview.size());
+  stream.synchronize();
   return device_parse_nested_json(buffer, reader_opts, stream, mr);
   // For debug purposes, use host_parse_nested_json()
 }
diff --git a/cpp/tests/io/json_quote_normalization_test.cpp b/cpp/tests/io/json_quote_normalization_test.cpp
index 593c8136e6a..5260b435482 100644
--- a/cpp/tests/io/json_quote_normalization_test.cpp
+++ b/cpp/tests/io/json_quote_normalization_test.cpp
@@ -20,6 +20,8 @@
 #include <cudf_test/table_utilities.hpp>
 #include <cudf_test/testing_main.hpp>
 
+#include <cudf/detail/utilities/vector_factories.hpp>
+#include <cudf/io/datasource.hpp>
 #include <cudf/io/detail/json.hpp>
 #include <cudf/io/json.hpp>
 #include <cudf/io/types.hpp>
@@ -39,23 +41,22 @@ void run_test(const std::string& host_input, const std::string& expected_host_ou
   std::shared_ptr<rmm::mr::device_memory_resource> rsc =
     std::make_shared<rmm::mr::cuda_memory_resource>();
 
-  rmm::device_uvector<char> device_input(
-    host_input.size(), cudf::test::get_default_stream(), rsc.get());
-  CUDF_CUDA_TRY(cudaMemcpyAsync(device_input.data(),
-                                host_input.data(),
-                                host_input.size(),
-                                cudaMemcpyHostToDevice,
-                                cudf::test::get_default_stream().value()));
+  auto stream_view  = cudf::test::get_default_stream();
+  auto device_input = cudf::detail::make_device_uvector_async(
+    host_input, stream_view, rmm::mr::get_current_device_resource());
+
   // Preprocessing FST
-  auto device_fst_output = cudf::io::json::detail::normalize_single_quotes(
-    std::move(device_input), cudf::test::get_default_stream(), rsc.get());
+  cudf::io::datasource::owning_buffer<rmm::device_uvector<char>> device_data(
+    std::move(device_input));
+  cudf::io::json::detail::normalize_single_quotes(device_data, stream_view, rsc.get());
 
-  std::string preprocessed_host_output(device_fst_output.size(), 0);
+  std::string preprocessed_host_output(device_data.size(), 0);
   CUDF_CUDA_TRY(cudaMemcpyAsync(preprocessed_host_output.data(),
-                                device_fst_output.data(),
+                                device_data.data(),
                                 preprocessed_host_output.size(),
                                 cudaMemcpyDeviceToHost,
-                                cudf::test::get_default_stream().value()));
+                                stream_view.value()))
+  stream_view.synchronize();
   CUDF_TEST_EXPECT_VECTOR_EQUAL(
     preprocessed_host_output, expected_host_output, preprocessed_host_output.size());
 }
diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp
index ee1207f04a2..f0f72d4e794 100644
--- a/cpp/tests/io/json_test.cpp
+++ b/cpp/tests/io/json_test.cpp
@@ -681,6 +681,111 @@ TEST_F(JsonReaderTest, JsonLinesByteRange)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), int64_wrapper{{3000, 4000, 5000}});
 }
 
+TEST_F(JsonReaderTest, JsonLinesMultipleFilesByteRange_AcrossFiles)
+{
+  const std::string file1 = temp_env->get_temp_dir() + "JsonLinesMultipleFilesByteRangeTest1.json";
+  std::ofstream outfile1(file1, std::ofstream::out);
+  outfile1 << "[1000]\n[2000]\n[3000]\n[4000]\n[5000]\n[6000]\n[7000]\n[8000]\n[9000]";
+  outfile1.close();
+
+  cudf::io::json_reader_options in_options =
+    cudf::io::json_reader_options::builder(cudf::io::source_info{{file1, file1}})
+      .lines(true)
+      .byte_range_offset(11)
+      .byte_range_size(70);
+
+  cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+
+  EXPECT_EQ(result.tbl->num_columns(), 1);
+  EXPECT_EQ(result.tbl->num_rows(), 10);
+
+  EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT64);
+  EXPECT_EQ(result.metadata.schema_info[0].name, "0");
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(
+    result.tbl->get_column(0),
+    int64_wrapper{{3000, 4000, 5000, 6000, 7000, 8000, 9000, 1000, 2000, 3000}});
+}
+
+TEST_F(JsonReaderTest, JsonLinesMultipleFilesByteRange_ExcessRangeSize)
+{
+  const std::string file1 = temp_env->get_temp_dir() + "JsonLinesMultipleFilesByteRangeTest1.json";
+  std::ofstream outfile1(file1, std::ofstream::out);
+  outfile1 << "[1000]\n[2000]\n[3000]\n[4000]\n[5000]\n[6000]\n[7000]\n[8000]\n[9000]";
+  outfile1.close();
+
+  cudf::io::json_reader_options in_options =
+    cudf::io::json_reader_options::builder(cudf::io::source_info{{file1, file1}})
+      .lines(true)
+      .byte_range_offset(11)
+      .byte_range_size(1000);
+
+  cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+
+  EXPECT_EQ(result.tbl->num_columns(), 1);
+  EXPECT_EQ(result.tbl->num_rows(), 16);
+
+  EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT64);
+  EXPECT_EQ(result.metadata.schema_info[0].name, "0");
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0),
+                                 int64_wrapper{{3000,
+                                                4000,
+                                                5000,
+                                                6000,
+                                                7000,
+                                                8000,
+                                                9000,
+                                                1000,
+                                                2000,
+                                                3000,
+                                                4000,
+                                                5000,
+                                                6000,
+                                                7000,
+                                                8000,
+                                                9000}});
+}
+
+TEST_F(JsonReaderTest, JsonLinesMultipleFilesByteRange_LoadAllFiles)
+{
+  const std::string file1 = temp_env->get_temp_dir() + "JsonLinesMultipleFilesByteRangeTest1.json";
+  std::ofstream outfile1(file1, std::ofstream::out);
+  outfile1 << "[1000]\n[2000]\n[3000]\n[4000]\n[5000]\n[6000]\n[7000]\n[8000]\n[9000]";
+  outfile1.close();
+
+  cudf::io::json_reader_options in_options =
+    cudf::io::json_reader_options::builder(cudf::io::source_info{{file1, file1}}).lines(true);
+
+  cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+
+  EXPECT_EQ(result.tbl->num_columns(), 1);
+  EXPECT_EQ(result.tbl->num_rows(), 18);
+
+  EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT64);
+  EXPECT_EQ(result.metadata.schema_info[0].name, "0");
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0),
+                                 int64_wrapper{{1000,
+                                                2000,
+                                                3000,
+                                                4000,
+                                                5000,
+                                                6000,
+                                                7000,
+                                                8000,
+                                                9000,
+                                                1000,
+                                                2000,
+                                                3000,
+                                                4000,
+                                                5000,
+                                                6000,
+                                                7000,
+                                                8000,
+                                                9000}});
+}
+
 TEST_P(JsonReaderRecordTest, JsonLinesObjects)
 {
   const std::string fname = temp_env->get_temp_dir() + "JsonLinesObjectsTest.json";
diff --git a/cpp/tests/io/json_whitespace_normalization_test.cu b/cpp/tests/io/json_whitespace_normalization_test.cu
index 336d360063f..8ed5fa81b12 100644
--- a/cpp/tests/io/json_whitespace_normalization_test.cu
+++ b/cpp/tests/io/json_whitespace_normalization_test.cu
@@ -19,6 +19,7 @@
 #include <cudf_test/testing_main.hpp>
 
 #include <cudf/detail/utilities/vector_factories.hpp>
+#include <cudf/io/datasource.hpp>
 #include <cudf/io/detail/json.hpp>
 #include <cudf/io/json.hpp>
 #include <cudf/types.hpp>
@@ -34,17 +35,26 @@ struct JsonWSNormalizationTest : public cudf::test::BaseFixture {};
 
 void run_test(std::string const& host_input, std::string const& expected_host_output)
 {
-  auto stream_view  = cudf::get_default_stream();
+  // Prepare cuda stream for data transfers & kernels
+  auto stream_view = cudf::test::get_default_stream();
+
   auto device_input = cudf::detail::make_device_uvector_async(
     host_input, stream_view, rmm::mr::get_current_device_resource());
 
   // Preprocessing FST
-  auto device_fst_output = cudf::io::json::detail::normalize_whitespace(
-    std::move(device_input), stream_view, rmm::mr::get_current_device_resource());
-
-  auto const preprocessed_host_output =
-    cudf::detail::make_std_vector_sync(device_fst_output, stream_view);
-
+  cudf::io::datasource::owning_buffer<rmm::device_uvector<char>> device_data(
+    std::move(device_input));
+  cudf::io::json::detail::normalize_whitespace(
+    device_data, stream_view, rmm::mr::get_current_device_resource());
+
+  std::string preprocessed_host_output(device_data.size(), 0);
+  CUDF_CUDA_TRY(cudaMemcpyAsync(preprocessed_host_output.data(),
+                                device_data.data(),
+                                preprocessed_host_output.size(),
+                                cudaMemcpyDeviceToHost,
+                                stream_view.value()));
+
+  stream_view.synchronize();
   ASSERT_EQ(preprocessed_host_output.size(), expected_host_output.size());
   CUDF_TEST_EXPECT_VECTOR_EQUAL(
     preprocessed_host_output, expected_host_output, preprocessed_host_output.size());

From f4ec1a49e8f04305c324cc03e5f8fbc275bf5c88 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Tue, 30 Apr 2024 13:37:20 -0500
Subject: [PATCH 119/842] Remove jni-docker-build workflow (#15619)

This PR removes `jni-docker-build.yml`, which is an unused workflow according to the Spark team.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Ray Douglass (https://github.com/raydouglass)
  - Tim Liu (https://github.com/NvTimLiu)

URL: https://github.com/rapidsai/cudf/pull/15619
---
 .github/workflows/jni-docker-build.yml | 53 --------------------------
 1 file changed, 53 deletions(-)
 delete mode 100644 .github/workflows/jni-docker-build.yml

diff --git a/.github/workflows/jni-docker-build.yml b/.github/workflows/jni-docker-build.yml
deleted file mode 100644
index 0bdc409d0ab..00000000000
--- a/.github/workflows/jni-docker-build.yml
+++ /dev/null
@@ -1,53 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-name: JNI Docker Build
-
-on:
-  workflow_dispatch: # manual trigger only
-
-concurrency:
-  group: jni-docker-build-${{ github.ref }}
-  cancel-in-progress: true
-
-jobs:
-  docker-build:
-    if: github.repository == 'rapidsai/cudf'
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v2
-
-      - name: Set up QEMU
-        uses: docker/setup-qemu-action@v2
-
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v2
-
-      - name: Login to DockerHub
-        uses: docker/login-action@v2
-        with:
-          username: ${{ secrets.GPUCIBOT_DOCKERHUB_USER }}
-          password: ${{ secrets.GPUCIBOT_DOCKERHUB_TOKEN }}
-
-      - name: Set ENVs
-        run: |
-          echo "IMAGE_NAME=rapidsai/cudf-jni-build" >> $GITHUB_ENV
-          echo "IMAGE_REF=${GITHUB_REF_NAME}" >> $GITHUB_ENV
-
-      - name: Build and Push
-        uses: docker/build-push-action@v3
-        with:
-          push: true
-          file: java/ci/Dockerfile.centos7
-          tags: "${{ env.IMAGE_NAME }}:${{ env.IMAGE_REF }}"

From 1fd3db8b662c61b4fb04e4be07cf6ac737cef8a1 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 30 Apr 2024 15:36:26 -0400
Subject: [PATCH 120/842] Use experimental make_strings_children for strings
 replace/filter/translate (#15586)

Updates strings replace functions to use the new experimental `make_strings_children` which supports building large strings.

Reference #15579

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Karthikeyan (https://github.com/karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/15586
---
 cpp/src/strings/char_types/char_types.cu | 13 +++++++------
 cpp/src/strings/filter_chars.cu          | 11 ++++++-----
 cpp/src/strings/replace/multi.cu         | 15 +++++++++------
 cpp/src/strings/replace/replace.cu       | 11 ++++++-----
 cpp/src/strings/replace/replace_slice.cu | 11 ++++++-----
 cpp/src/strings/translate.cu             | 11 ++++++-----
 6 files changed, 40 insertions(+), 32 deletions(-)

diff --git a/cpp/src/strings/char_types/char_types.cu b/cpp/src/strings/char_types/char_types.cu
index 28068cf7e78..7716cf0cc29 100644
--- a/cpp/src/strings/char_types/char_types.cu
+++ b/cpp/src/strings/char_types/char_types.cu
@@ -21,7 +21,7 @@
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/strings/char_types/char_types.hpp>
 #include <cudf/strings/detail/char_tables.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/detail/utf8.hpp>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/string_view.cuh>
@@ -130,8 +130,9 @@ struct filter_chars_fn {
   string_character_types const types_to_remove;
   string_character_types const types_to_keep;
   string_view const d_replacement;  ///< optional replacement for removed characters
-  int32_t* d_offsets{};             ///< size of the output string stored here during first pass
-  char* d_chars{};                  ///< this is null only during the first pass
+  size_type* d_sizes{};
+  char* d_chars{};
+  cudf::detail::input_offsetalator d_offsets;
 
   /**
    * @brief Returns true if the given character should be replaced.
@@ -150,7 +151,7 @@ struct filter_chars_fn {
   __device__ void operator()(size_type idx)
   {
     if (d_column.is_null(idx)) {
-      if (!d_chars) d_offsets[idx] = 0;
+      if (!d_chars) { d_sizes[idx] = 0; }
       return;
     }
     auto const d_str  = d_column.element<string_view>(idx);
@@ -165,7 +166,7 @@ struct filter_chars_fn {
       nbytes += d_newchar.size_bytes() - char_size;
       if (out_ptr) out_ptr = cudf::strings::detail::copy_string(out_ptr, d_newchar);
     }
-    if (!out_ptr) d_offsets[idx] = nbytes;
+    if (!out_ptr) { d_sizes[idx] = nbytes; }
   }
 };
 
@@ -202,7 +203,7 @@ std::unique_ptr<column> filter_characters_of_type(strings_column_view const& str
 
   // this utility calls filterer to build the offsets and chars columns
   auto [offsets_column, chars] =
-    cudf::strings::detail::make_strings_children(filterer, strings_count, stream, mr);
+    cudf::strings::detail::experimental::make_strings_children(filterer, strings_count, stream, mr);
 
   // return new strings column
   return make_strings_column(strings_count,
diff --git a/cpp/src/strings/filter_chars.cu b/cpp/src/strings/filter_chars.cu
index 32717dac78d..4705ae519cd 100644
--- a/cpp/src/strings/filter_chars.cu
+++ b/cpp/src/strings/filter_chars.cu
@@ -22,7 +22,7 @@
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
@@ -57,8 +57,9 @@ struct filter_fn {
   rmm::device_uvector<char_range>::iterator table_begin;
   rmm::device_uvector<char_range>::iterator table_end;
   string_view const d_replacement;
-  int32_t* d_offsets{};
+  size_type* d_sizes{};
   char* d_chars{};
+  cudf::detail::input_offsetalator d_offsets;
 
   /**
    * @brief Return true if this character should be removed.
@@ -87,7 +88,7 @@ struct filter_fn {
   __device__ void operator()(size_type idx)
   {
     if (d_strings.is_null(idx)) {
-      if (!d_chars) d_offsets[idx] = 0;
+      if (!d_chars) { d_sizes[idx] = 0; }
       return;
     }
     auto const d_str = d_strings.element<string_view>(idx);
@@ -104,7 +105,7 @@ struct filter_fn {
       else
         nbytes += d_newchar.size_bytes() - char_size;
     }
-    if (!out_ptr) d_offsets[idx] = nbytes;
+    if (!out_ptr) { d_sizes[idx] = nbytes; }
   }
 };
 
@@ -141,7 +142,7 @@ std::unique_ptr<column> filter_characters(
   // this utility calls the strip_fn to build the offsets and chars columns
   filter_fn ffn{*d_strings, keep_characters, table.begin(), table.end(), d_replacement};
   auto [offsets_column, chars] =
-    cudf::strings::detail::make_strings_children(ffn, strings.size(), stream, mr);
+    cudf::strings::detail::experimental::make_strings_children(ffn, strings.size(), stream, mr);
 
   return make_strings_column(strings_count,
                              std::move(offsets_column),
diff --git a/cpp/src/strings/replace/multi.cu b/cpp/src/strings/replace/multi.cu
index 2eb03bd10a4..9abcca7a5e6 100644
--- a/cpp/src/strings/replace/multi.cu
+++ b/cpp/src/strings/replace/multi.cu
@@ -23,7 +23,7 @@
 #include <cudf/detail/utilities/algorithm.cuh>
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/strings/detail/replace.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/detail/strings_column_factories.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/replace.hpp>
@@ -404,13 +404,14 @@ struct replace_multi_fn {
   column_device_view const d_strings;
   column_device_view const d_targets;
   column_device_view const d_repls;
-  int32_t* d_offsets{};
+  size_type* d_sizes{};
   char* d_chars{};
+  cudf::detail::input_offsetalator d_offsets;
 
   __device__ void operator()(size_type idx)
   {
     if (d_strings.is_null(idx)) {
-      if (!d_chars) { d_offsets[idx] = 0; }
+      if (!d_chars) { d_sizes[idx] = 0; }
       return;
     }
     auto const d_str   = d_strings.element<string_view>(idx);
@@ -443,9 +444,11 @@ struct replace_multi_fn {
       ++spos;
     }
     if (out_ptr)  // copy remainder
+    {
       memcpy(out_ptr, in_ptr + lpos, d_str.size_bytes() - lpos);
-    else
-      d_offsets[idx] = bytes;
+    } else {
+      d_sizes[idx] = bytes;
+    }
   }
 };
 
@@ -459,7 +462,7 @@ std::unique_ptr<column> replace_string_parallel(strings_column_view const& input
   auto d_targets      = column_device_view::create(targets.parent(), stream);
   auto d_replacements = column_device_view::create(repls.parent(), stream);
 
-  auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(
+  auto [offsets_column, chars] = cudf::strings::detail::experimental::make_strings_children(
     replace_multi_fn{*d_strings, *d_targets, *d_replacements}, input.size(), stream, mr);
 
   return make_strings_column(input.size(),
diff --git a/cpp/src/strings/replace/replace.cu b/cpp/src/strings/replace/replace.cu
index 857bc7fb41c..df8526fa942 100644
--- a/cpp/src/strings/replace/replace.cu
+++ b/cpp/src/strings/replace/replace.cu
@@ -22,7 +22,7 @@
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/utilities/algorithm.cuh>
 #include <cudf/strings/detail/replace.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/detail/strings_column_factories.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/replace.hpp>
@@ -345,13 +345,14 @@ struct replace_fn {
   string_view d_target;
   string_view d_replacement;
   cudf::size_type maxrepl;
-  cudf::size_type* d_offsets{};
+  cudf::size_type* d_sizes{};
   char* d_chars{};
+  cudf::detail::input_offsetalator d_offsets;
 
   __device__ void operator()(size_type idx)
   {
     if (d_strings.is_null(idx)) {
-      if (!d_chars) { d_offsets[idx] = 0; }
+      if (!d_chars) { d_sizes[idx] = 0; }
       return;
     }
     auto const d_str   = d_strings.element<string_view>(idx);
@@ -384,7 +385,7 @@ struct replace_fn {
     if (out_ptr) {  // copy remainder
       memcpy(out_ptr, in_ptr + lpos, d_str.size_bytes() - lpos);
     } else {
-      d_offsets[idx] = bytes;
+      d_sizes[idx] = bytes;
     }
   }
 };
@@ -398,7 +399,7 @@ std::unique_ptr<column> replace_string_parallel(strings_column_view const& input
 {
   auto d_strings = column_device_view::create(input.parent(), stream);
 
-  auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(
+  auto [offsets_column, chars] = cudf::strings::detail::experimental::make_strings_children(
     replace_fn{*d_strings, d_target, d_replacement, maxrepl}, input.size(), stream, mr);
 
   return make_strings_column(input.size(),
diff --git a/cpp/src/strings/replace/replace_slice.cu b/cpp/src/strings/replace/replace_slice.cu
index 90540b39189..54e84dfe504 100644
--- a/cpp/src/strings/replace/replace_slice.cu
+++ b/cpp/src/strings/replace/replace_slice.cu
@@ -19,7 +19,7 @@
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/strings/detail/replace.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/replace.hpp>
 #include <cudf/strings/string_view.cuh>
@@ -45,13 +45,14 @@ struct replace_slice_fn {
   string_view const d_repl;
   size_type const start;
   size_type const stop;
-  size_type* d_offsets{};
+  size_type* d_sizes{};
   char* d_chars{};
+  cudf::detail::input_offsetalator d_offsets;
 
   __device__ void operator()(size_type idx)
   {
     if (d_strings.is_null(idx)) {
-      if (!d_chars) { d_offsets[idx] = 0; }
+      if (!d_chars) { d_sizes[idx] = 0; }
       return;
     }
     auto const d_str   = d_strings.element<string_view>(idx);
@@ -69,7 +70,7 @@ struct replace_slice_fn {
                                    in_ptr + end,
                                    d_str.size_bytes() - end);
     } else {
-      d_offsets[idx] = d_str.size_bytes() + d_repl.size_bytes() - (end - begin);
+      d_sizes[idx] = d_str.size_bytes() + d_repl.size_bytes() - (end - begin);
     }
   }
 };
@@ -94,7 +95,7 @@ std::unique_ptr<column> replace_slice(strings_column_view const& input,
   auto d_strings = column_device_view::create(input.parent(), stream);
 
   // this utility calls the given functor to build the offsets and chars columns
-  auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(
+  auto [offsets_column, chars] = cudf::strings::detail::experimental::make_strings_children(
     replace_slice_fn{*d_strings, d_repl, start, stop}, input.size(), stream, mr);
 
   return make_strings_column(input.size(),
diff --git a/cpp/src/strings/translate.cu b/cpp/src/strings/translate.cu
index fcf55429e09..75bc46d30c4 100644
--- a/cpp/src/strings/translate.cu
+++ b/cpp/src/strings/translate.cu
@@ -20,7 +20,7 @@
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/strings/translate.hpp>
@@ -52,13 +52,14 @@ struct translate_fn {
   column_device_view const d_strings;
   rmm::device_uvector<translate_table>::iterator table_begin;
   rmm::device_uvector<translate_table>::iterator table_end;
-  int32_t* d_offsets{};
+  size_type* d_sizes{};
   char* d_chars{};
+  cudf::detail::input_offsetalator d_offsets;
 
   __device__ void operator()(size_type idx)
   {
     if (d_strings.is_null(idx)) {
-      if (!d_chars) d_offsets[idx] = 0;
+      if (!d_chars) { d_sizes[idx] = 0; }
       return;
     }
     string_view const d_str = d_strings.element<string_view>(idx);
@@ -80,7 +81,7 @@ struct translate_fn {
       }
       if (chr && out_ptr) out_ptr += from_char_utf8(chr, out_ptr);
     }
-    if (!d_chars) d_offsets[idx] = bytes;
+    if (!d_chars) { d_sizes[idx] = bytes; }
   }
 };
 
@@ -111,7 +112,7 @@ std::unique_ptr<column> translate(strings_column_view const& strings,
 
   auto d_strings = column_device_view::create(strings.parent(), stream);
 
-  auto [offsets_column, chars] = make_strings_children(
+  auto [offsets_column, chars] = cudf::strings::detail::experimental::make_strings_children(
     translate_fn{*d_strings, table.begin(), table.end()}, strings.size(), stream, mr);
 
   return make_strings_column(strings.size(),

From 2439dee4a3c0744e0169ff8dc0c0354e285db58b Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 30 Apr 2024 15:37:01 -0400
Subject: [PATCH 121/842] Use experimental make_strings_children for strings
 join/url_encode/slice (#15598)

Updates strings APIs to use the new experimental `make_strings_children` which supports building large strings.
- `cudf::strings::join_strings`
- `cudf::strings::join_list_elements`
- `cudf::strings::slice_strings`
- `cudf::strings::format_list_column`
- `cudf::strings::url_encode`

Reference https://github.com/rapidsai/cudf/issues/15579

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Mike Wilson (https://github.com/hyperbolic2346)

URL: https://github.com/rapidsai/cudf/pull/15598
---
 cpp/src/strings/combine/join.cu               |  8 +++++---
 cpp/src/strings/combine/join_list_elements.cu | 13 +++++++------
 cpp/src/strings/convert/convert_lists.cu      |  9 +++++----
 cpp/src/strings/convert/convert_urls.cu       | 13 +++++++------
 cpp/src/strings/slice.cu                      | 13 +++++++------
 5 files changed, 31 insertions(+), 25 deletions(-)

diff --git a/cpp/src/strings/combine/join.cu b/cpp/src/strings/combine/join.cu
index d1d9afbb85f..4b2996a77e4 100644
--- a/cpp/src/strings/combine/join.cu
+++ b/cpp/src/strings/combine/join.cu
@@ -22,6 +22,7 @@
 #include <cudf/scalar/scalar_device_view.cuh>
 #include <cudf/strings/combine.hpp>
 #include <cudf/strings/detail/combine.hpp>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/detail/strings_column_factories.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/string_view.cuh>
@@ -84,8 +85,9 @@ struct join_base_fn {
  * This functor is suitable for make_strings_children
  */
 struct join_fn : public join_base_fn {
-  size_type* d_offsets{};
+  size_type* d_sizes{};
   char* d_chars{};
+  cudf::detail::input_offsetalator d_offsets;
 
   join_fn(column_device_view const d_strings,
           string_view d_separator,
@@ -106,7 +108,7 @@ struct join_fn : public join_base_fn {
     } else {
       bytes += d_str.size_bytes() + d_sep.size_bytes();
     }
-    if (!d_chars) { d_offsets[idx] = bytes; }
+    if (!d_chars) { d_sizes[idx] = bytes; }
   }
 };
 
@@ -148,7 +150,7 @@ std::unique_ptr<column> join_strings(strings_column_view const& input,
     if ((input.size() == input.null_count()) ||
         ((input.chars_size(stream) / (input.size() - input.null_count())) <=
          AVG_CHAR_BYTES_THRESHOLD)) {
-      return std::get<1>(make_strings_children(
+      return std::get<1>(experimental::make_strings_children(
                            join_fn{*d_strings, d_separator, d_narep}, input.size(), stream, mr))
         .release();
     }
diff --git a/cpp/src/strings/combine/join_list_elements.cu b/cpp/src/strings/combine/join_list_elements.cu
index a54ea5263fe..b0073452741 100644
--- a/cpp/src/strings/combine/join_list_elements.cu
+++ b/cpp/src/strings/combine/join_list_elements.cu
@@ -22,7 +22,7 @@
 #include <cudf/lists/lists_column_view.hpp>
 #include <cudf/scalar/scalar_device_view.cuh>
 #include <cudf/strings/combine.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
@@ -60,11 +60,12 @@ struct compute_size_and_concatenate_fn {
   separator_on_nulls const separate_nulls;
   output_if_empty_list const empty_list_policy;
 
-  size_type* d_offsets{nullptr};
+  size_type* d_sizes{nullptr};
 
   // If d_chars == nullptr: only compute sizes and validities of the output strings.
   // If d_chars != nullptr: only concatenate strings.
   char* d_chars{nullptr};
+  cudf::detail::input_offsetalator d_offsets;
 
   [[nodiscard]] __device__ bool output_is_null(size_type const idx,
                                                size_type const start_idx,
@@ -84,7 +85,7 @@ struct compute_size_and_concatenate_fn {
     auto const end_idx   = list_offsets[idx + 1];
 
     if (!d_chars && output_is_null(idx, start_idx, end_idx)) {
-      d_offsets[idx] = 0;
+      d_sizes[idx] = 0;
       return;
     }
 
@@ -120,7 +121,7 @@ struct compute_size_and_concatenate_fn {
 
     // If there are all null elements, the output should be the same as having an empty list input:
     // a null or an empty string
-    if (!d_chars) { d_offsets[idx] = has_valid_element ? size_bytes : 0; }
+    if (!d_chars) { d_sizes[idx] = has_valid_element ? size_bytes : 0; }
   }
 };
 
@@ -208,7 +209,7 @@ std::unique_ptr<column> join_list_elements(lists_column_view const& lists_string
                                                     separate_nulls,
                                                     empty_list_policy};
 
-  auto [offsets_column, chars] = make_strings_children(comp_fn, num_rows, stream, mr);
+  auto [offsets_column, chars] = experimental::make_strings_children(comp_fn, num_rows, stream, mr);
   auto [null_mask, null_count] =
     cudf::detail::valid_if(thrust::counting_iterator<size_type>(0),
                            thrust::counting_iterator<size_type>(num_rows),
@@ -283,7 +284,7 @@ std::unique_ptr<column> join_list_elements(lists_column_view const& lists_string
                                                     separate_nulls,
                                                     empty_list_policy};
 
-  auto [offsets_column, chars] = make_strings_children(comp_fn, num_rows, stream, mr);
+  auto [offsets_column, chars] = experimental::make_strings_children(comp_fn, num_rows, stream, mr);
   auto [null_mask, null_count] =
     cudf::detail::valid_if(thrust::counting_iterator<size_type>(0),
                            thrust::counting_iterator<size_type>(num_rows),
diff --git a/cpp/src/strings/convert/convert_lists.cu b/cpp/src/strings/convert/convert_lists.cu
index ed898bd6f72..198e6c11ef3 100644
--- a/cpp/src/strings/convert/convert_lists.cu
+++ b/cpp/src/strings/convert/convert_lists.cu
@@ -17,7 +17,7 @@
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/strings/convert/convert_lists.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/utilities/default_stream.hpp>
@@ -66,8 +66,9 @@ struct format_lists_fn {
   string_view const d_na_rep;
   stack_item* d_stack;
   size_type const max_depth;
-  size_type* d_offsets{};
+  size_type* d_sizes{};
   char* d_chars{};
+  cudf::detail::input_offsetalator d_offsets;
 
   __device__ column_device_view get_nested_child(size_type idx)
   {
@@ -184,7 +185,7 @@ struct format_lists_fn {
       }
     }
 
-    if (!d_chars) d_offsets[idx] = bytes;
+    if (!d_chars) { d_sizes[idx] = bytes; }
   }
 };
 
@@ -217,7 +218,7 @@ std::unique_ptr<column> format_list_column(lists_column_view const& input,
   auto const d_separators = column_device_view::create(separators.parent(), stream);
   auto const d_na_rep     = na_rep.value(stream);
 
-  auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(
+  auto [offsets_column, chars] = experimental::make_strings_children(
     format_lists_fn{*d_input, *d_separators, d_na_rep, stack_buffer.data(), depth},
     input.size(),
     stream,
diff --git a/cpp/src/strings/convert/convert_urls.cu b/cpp/src/strings/convert/convert_urls.cu
index 644ffbb4bd1..459c3e88a4e 100644
--- a/cpp/src/strings/convert/convert_urls.cu
+++ b/cpp/src/strings/convert/convert_urls.cu
@@ -22,7 +22,7 @@
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/strings/convert/convert_urls.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
@@ -50,8 +50,9 @@ namespace {
 //
 struct url_encoder_fn {
   column_device_view const d_strings;
-  size_type* d_offsets{};
+  size_type* d_sizes{};
   char* d_chars{};
+  cudf::detail::input_offsetalator d_offsets;
 
   // utility to create 2-byte hex characters from single binary byte
   __device__ void byte_to_hex(uint8_t byte, char* hex)
@@ -80,7 +81,7 @@ struct url_encoder_fn {
   __device__ void operator()(size_type idx)
   {
     if (d_strings.is_null(idx)) {
-      if (!d_chars) d_offsets[idx] = 0;
+      if (!d_chars) { d_sizes[idx] = 0; }
       return;
     }
 
@@ -117,7 +118,7 @@ struct url_encoder_fn {
         }
       }
     }
-    if (!d_chars) d_offsets[idx] = nbytes;
+    if (!d_chars) { d_sizes[idx] = nbytes; }
   }
 };
 
@@ -132,8 +133,8 @@ std::unique_ptr<column> url_encode(strings_column_view const& input,
 
   auto d_column = column_device_view::create(input.parent(), stream);
 
-  auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(
-    url_encoder_fn{*d_column}, input.size(), stream, mr);
+  auto [offsets_column, chars] =
+    experimental::make_strings_children(url_encoder_fn{*d_column}, input.size(), stream, mr);
 
   return make_strings_column(input.size(),
                              std::move(offsets_column),
diff --git a/cpp/src/strings/slice.cu b/cpp/src/strings/slice.cu
index d080065b330..2f7564b3b0d 100644
--- a/cpp/src/strings/slice.cu
+++ b/cpp/src/strings/slice.cu
@@ -21,7 +21,7 @@
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/scalar/scalar_device_view.cuh>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/slice.hpp>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
@@ -79,19 +79,20 @@ struct substring_fn {
   numeric_scalar_device_view<size_type> const d_start;
   numeric_scalar_device_view<size_type> const d_stop;
   numeric_scalar_device_view<size_type> const d_step;
-  int32_t* d_offsets{};
+  size_type* d_sizes{};
   char* d_chars{};
+  cudf::detail::input_offsetalator d_offsets;
 
   __device__ void operator()(size_type idx)
   {
     if (d_column.is_null(idx)) {
-      if (!d_chars) d_offsets[idx] = 0;
+      if (!d_chars) { d_sizes[idx] = 0; }
       return;
     }
     auto const d_str  = d_column.template element<string_view>(idx);
     auto const length = d_str.length();
     if (length == 0) {
-      if (!d_chars) d_offsets[idx] = 0;
+      if (!d_chars) { d_sizes[idx] = 0; }
       return;
     }
     size_type const step = d_step.is_valid() ? d_step.value() : 1;
@@ -131,7 +132,7 @@ struct substring_fn {
       }
       itr += step;
     }
-    if (!d_chars) d_offsets[idx] = bytes;
+    if (!d_chars) { d_sizes[idx] = bytes; }
   }
 };
 
@@ -205,7 +206,7 @@ std::unique_ptr<column> slice_strings(strings_column_view const& strings,
   auto const d_stop  = get_scalar_device_view(const_cast<numeric_scalar<size_type>&>(stop));
   auto const d_step  = get_scalar_device_view(const_cast<numeric_scalar<size_type>&>(step));
 
-  auto [offsets, chars] = make_strings_children(
+  auto [offsets, chars] = experimental::make_strings_children(
     substring_fn{*d_column, d_start, d_stop, d_step}, strings.size(), stream, mr);
 
   return make_strings_column(strings.size(),

From 4da6fda3e6042645b8e21c931b26966ef0fa8897 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 30 Apr 2024 15:37:48 -0400
Subject: [PATCH 122/842] Use experimental make_strings_children for
 capitalize/case/pad functions (#15587)

Updates strings case conversion and pad functions to use the new experimental `make_strings_children` which supports building large strings.

Reference https://github.com/rapidsai/cudf/issues/15579

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/15587
---
 cpp/src/strings/capitalize.cu | 11 ++++++-----
 cpp/src/strings/case.cu       | 17 +++++++++--------
 cpp/src/strings/padding.cu    | 17 +++++++++--------
 3 files changed, 24 insertions(+), 21 deletions(-)

diff --git a/cpp/src/strings/capitalize.cu b/cpp/src/strings/capitalize.cu
index 2bb85bf2c5c..031fff4086a 100644
--- a/cpp/src/strings/capitalize.cu
+++ b/cpp/src/strings/capitalize.cu
@@ -20,7 +20,7 @@
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/strings/capitalize.hpp>
 #include <cudf/strings/detail/char_tables.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/detail/utf8.hpp>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
@@ -64,8 +64,9 @@ struct base_fn {
   character_cases_table_type const* d_case_table;
   special_case_mapping const* d_special_case_mapping;
   column_device_view const d_column;
-  size_type* d_offsets{};
+  size_type* d_sizes{};
   char* d_chars{};
+  cudf::detail::input_offsetalator d_offsets;
 
   base_fn(column_device_view const& d_column)
     : d_flags(get_character_flags_table()),
@@ -108,7 +109,7 @@ struct base_fn {
   __device__ void operator()(size_type idx)
   {
     if (d_column.is_null(idx)) {
-      if (!d_chars) d_offsets[idx] = 0;
+      if (!d_chars) { d_sizes[idx] = 0; }
       return;
     }
 
@@ -137,7 +138,7 @@ struct base_fn {
       // capitalize the next char if this one is a delimiter
       capitalize = derived.capitalize_next(chr, flag);
     }
-    if (!d_chars) d_offsets[idx] = bytes;
+    if (!d_chars) { d_sizes[idx] = bytes; }
   }
 };
 
@@ -231,7 +232,7 @@ std::unique_ptr<column> capitalizer(CapitalFn cfn,
                                     rmm::device_async_resource_ref mr)
 {
   auto [offsets_column, chars] =
-    cudf::strings::detail::make_strings_children(cfn, input.size(), stream, mr);
+    cudf::strings::detail::experimental::make_strings_children(cfn, input.size(), stream, mr);
 
   return make_strings_column(input.size(),
                              std::move(offsets_column),
diff --git a/cpp/src/strings/case.cu b/cpp/src/strings/case.cu
index 82b590f81b3..5d5e6ba9a3e 100644
--- a/cpp/src/strings/case.cu
+++ b/cpp/src/strings/case.cu
@@ -23,7 +23,7 @@
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/strings/case.hpp>
 #include <cudf/strings/detail/char_tables.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/detail/utf8.hpp>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
@@ -117,8 +117,9 @@ struct convert_char_fn {
  */
 struct base_upper_lower_fn {
   convert_char_fn converter;
-  size_type* d_offsets{};
+  size_type* d_sizes{};
   char* d_chars{};
+  cudf::detail::input_offsetalator d_offsets;
 
   base_upper_lower_fn(convert_char_fn converter) : converter(converter) {}
 
@@ -137,7 +138,7 @@ struct base_upper_lower_fn {
         bytes += size;
       }
     }
-    if (!d_buffer) { d_offsets[idx] = bytes; }
+    if (!d_buffer) { d_sizes[idx] = bytes; }
   }
 };
 
@@ -152,7 +153,7 @@ struct upper_lower_fn : public base_upper_lower_fn {
   __device__ void operator()(size_type idx) const
   {
     if (d_strings.is_null(idx)) {
-      if (!d_chars) { d_offsets[idx] = 0; }
+      if (!d_chars) { d_sizes[idx] = 0; }
       return;
     }
     auto const d_str = d_strings.element<string_view>(idx);
@@ -295,8 +296,8 @@ std::unique_ptr<column> convert_case(strings_column_view const& input,
 
   // For smaller strings, use the regular string-parallel algorithm
   if ((chars_size / (input.size() - input.null_count())) < AVG_CHAR_BYTES_THRESHOLD) {
-    auto [offsets, chars] =
-      cudf::strings::detail::make_strings_children(converter, input.size(), stream, mr);
+    auto [offsets, chars] = cudf::strings::detail::experimental::make_strings_children(
+      converter, input.size(), stream, mr);
     return make_strings_column(input.size(),
                                std::move(offsets),
                                chars.release(),
@@ -364,8 +365,8 @@ std::unique_ptr<column> convert_case(strings_column_view const& input,
   // run case conversion over the new sub-strings
   auto const tmp_size = static_cast<size_type>(tmp_offsets.size()) - 1;
   upper_lower_ls_fn sub_conv{ccfn, input_chars, tmp_offsets.data()};
-  auto chars =
-    std::get<1>(cudf::strings::detail::make_strings_children(sub_conv, tmp_size, stream, mr));
+  auto chars = std::get<1>(
+    cudf::strings::detail::experimental::make_strings_children(sub_conv, tmp_size, stream, mr));
 
   return make_strings_column(input.size(),
                              std::move(offsets),
diff --git a/cpp/src/strings/padding.cu b/cpp/src/strings/padding.cu
index d8a3055772e..3cfbf79a8f3 100644
--- a/cpp/src/strings/padding.cu
+++ b/cpp/src/strings/padding.cu
@@ -19,7 +19,7 @@
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/strings/detail/pad_impl.cuh>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/padding.hpp>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
@@ -47,8 +47,9 @@ struct base_fn {
   column_device_view const d_column;
   size_type const width;
   size_type const fill_char_size;
-  size_type* d_offsets{};
+  size_type* d_sizes{};
   char* d_chars{};
+  cudf::detail::input_offsetalator d_offsets;
 
   base_fn(column_device_view const& d_column, size_type width, size_type fill_char_size)
     : d_column(d_column), width(width), fill_char_size(fill_char_size)
@@ -58,7 +59,7 @@ struct base_fn {
   __device__ void operator()(size_type idx) const
   {
     if (d_column.is_null(idx)) {
-      if (!d_chars) d_offsets[idx] = 0;
+      if (!d_chars) { d_sizes[idx] = 0; }
       return;
     }
 
@@ -67,7 +68,7 @@ struct base_fn {
     if (d_chars) {
       derived.pad(d_str, d_chars + d_offsets[idx]);
     } else {
-      d_offsets[idx] = compute_padded_size(d_str, width, fill_char_size);
+      d_sizes[idx] = compute_padded_size(d_str, width, fill_char_size);
     }
   };
 };
@@ -116,13 +117,13 @@ std::unique_ptr<column> pad(strings_column_view const& input,
   auto [offsets_column, chars] = [&] {
     if (side == side_type::LEFT) {
       auto fn = pad_fn<side_type::LEFT>{*d_strings, width, fill_char_size, d_fill_char};
-      return make_strings_children(fn, input.size(), stream, mr);
+      return experimental::make_strings_children(fn, input.size(), stream, mr);
     } else if (side == side_type::RIGHT) {
       auto fn = pad_fn<side_type::RIGHT>{*d_strings, width, fill_char_size, d_fill_char};
-      return make_strings_children(fn, input.size(), stream, mr);
+      return experimental::make_strings_children(fn, input.size(), stream, mr);
     }
     auto fn = pad_fn<side_type::BOTH>{*d_strings, width, fill_char_size, d_fill_char};
-    return make_strings_children(fn, input.size(), stream, mr);
+    return experimental::make_strings_children(fn, input.size(), stream, mr);
   }();
 
   return make_strings_column(input.size(),
@@ -153,7 +154,7 @@ std::unique_ptr<column> zfill(strings_column_view const& input,
 
   auto d_strings = column_device_view::create(input.parent(), stream);
   auto [offsets_column, chars] =
-    make_strings_children(zfill_fn{*d_strings, width}, input.size(), stream, mr);
+    experimental::make_strings_children(zfill_fn{*d_strings, width}, input.size(), stream, mr);
 
   return make_strings_column(input.size(),
                              std::move(offsets_column),

From 2eeacb9f5f22a56458b644a93b8cbeacd4844472 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 30 Apr 2024 14:55:40 -1000
Subject: [PATCH 123/842] Make ColumnBase.__cuda_array_interface__ opt out
 instead of opt in (#15622)

Column types that support CAI already have custom `NotImplementedError`s, and since the implementation is the same for datetime and numeric columns, moving their implementation to `ColumnBase`

Should help address timedelta support in https://github.com/rapidsai/cudf/pull/15615 cc @brandon-b-miller

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15622
---
 python/cudf/cudf/core/column/column.py        | 26 ++++++++++---
 python/cudf/cudf/core/column/datetime.py      | 27 +-------------
 python/cudf/cudf/core/column/decimal.py       | 12 +++---
 python/cudf/cudf/core/column/numerical.py     | 37 +------------------
 python/cudf/cudf/core/column/string.py        |  7 ++++
 .../cudf/tests/test_cuda_array_interface.py   | 19 ++++++++--
 6 files changed, 53 insertions(+), 75 deletions(-)

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 7e48552742c..ba2dab2c2e1 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -1101,11 +1101,27 @@ def __arrow_array__(self, type=None):
         )
 
     @property
-    def __cuda_array_interface__(self):
-        raise NotImplementedError(
-            f"dtype {self.dtype} is not yet supported via "
-            "`__cuda_array_interface__`"
-        )
+    def __cuda_array_interface__(self) -> abc.Mapping[str, Any]:
+        output = {
+            "shape": (len(self),),
+            "strides": (self.dtype.itemsize,),
+            "typestr": self.dtype.str,
+            "data": (self.data_ptr, False),
+            "version": 1,
+        }
+
+        if self.nullable and self.has_nulls():
+            # Create a simple Python object that exposes the
+            # `__cuda_array_interface__` attribute here since we need to modify
+            # some of the attributes from the numba device array
+            output["mask"] = cuda_array_interface_wrapper(
+                ptr=self.mask_ptr,
+                size=len(self),
+                owner=self.mask,
+                readonly=True,
+                typestr="<t1",
+            )
+        return output
 
     def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
         return _array_ufunc(self, ufunc, method, inputs, kwargs)
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index b84c1dc7ccd..981ef738458 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -7,7 +7,7 @@
 import locale
 import re
 from locale import nl_langinfo
-from typing import Any, Mapping, Optional, Sequence, cast
+from typing import Any, Optional, Sequence, cast
 
 import numpy as np
 import pandas as pd
@@ -25,7 +25,7 @@
 )
 from cudf.api.types import is_datetime64_dtype, is_scalar, is_timedelta64_dtype
 from cudf.core._compat import PANDAS_GE_220
-from cudf.core.buffer import Buffer, cuda_array_interface_wrapper
+from cudf.core.buffer import Buffer
 from cudf.core.column import ColumnBase, as_column, column, string
 from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion
 from cudf.utils.dtypes import _get_base_dtype
@@ -399,29 +399,6 @@ def normalize_binop_value(self, other: DatetimeLikeScalar) -> ScalarLike:
 
         return NotImplemented
 
-    @property
-    def __cuda_array_interface__(self) -> Mapping[str, Any]:
-        output = {
-            "shape": (len(self),),
-            "strides": (self.dtype.itemsize,),
-            "typestr": self.dtype.str,
-            "data": (self.data_ptr, False),
-            "version": 1,
-        }
-
-        if self.nullable and self.has_nulls():
-            # Create a simple Python object that exposes the
-            # `__cuda_array_interface__` attribute here since we need to modify
-            # some of the attributes from the numba device array
-            output["mask"] = cuda_array_interface_wrapper(
-                ptr=self.mask_ptr,
-                size=len(self),
-                owner=self.mask,
-                readonly=True,
-                typestr="<t1",
-            )
-        return output
-
     def as_datetime_column(
         self, dtype: Dtype, format: str | None = None
     ) -> DatetimeColumn:
diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py
index b83a6ded416..3a0f6649e21 100644
--- a/python/cudf/cudf/core/column/decimal.py
+++ b/python/cudf/cudf/core/column/decimal.py
@@ -38,6 +38,12 @@ class DecimalBaseColumn(NumericalBaseColumn):
     dtype: DecimalDtype
     _VALID_BINARY_OPERATIONS = BinaryOperand._SUPPORTED_BINARY_OPERATIONS
 
+    @property
+    def __cuda_array_interface__(self):
+        raise NotImplementedError(
+            "Decimals are not yet supported via `__cuda_array_interface__`"
+        )
+
     def as_decimal_column(
         self,
         dtype: Dtype,
@@ -342,12 +348,6 @@ def to_arrow(self):
             buffers=[mask_buf, data_buf],
         )
 
-    @property
-    def __cuda_array_interface__(self):
-        raise NotImplementedError(
-            "Decimals are not yet supported via `__cuda_array_interface__`"
-        )
-
     def _with_type_metadata(
         self: "cudf.core.column.Decimal64Column", dtype: Dtype
     ) -> "cudf.core.column.Decimal64Column":
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index f42c87de3fd..4c211a173b1 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -3,16 +3,7 @@
 from __future__ import annotations
 
 import functools
-from typing import (
-    Any,
-    Callable,
-    Mapping,
-    Optional,
-    Sequence,
-    Tuple,
-    Union,
-    cast,
-)
+from typing import Any, Callable, Optional, Sequence, Tuple, Union, cast
 
 import cupy as cp
 import numpy as np
@@ -37,7 +28,7 @@
     is_integer_dtype,
     is_scalar,
 )
-from cudf.core.buffer import Buffer, cuda_array_interface_wrapper
+from cudf.core.buffer import Buffer
 from cudf.core.column import (
     ColumnBase,
     as_column,
@@ -194,30 +185,6 @@ def __setitem__(self, key: Any, value: Any):
         if out:
             self._mimic_inplace(out, inplace=True)
 
-    @property
-    def __cuda_array_interface__(self) -> Mapping[str, Any]:
-        output = {
-            "shape": (len(self),),
-            "strides": (self.dtype.itemsize,),
-            "typestr": self.dtype.str,
-            "data": (self.data_ptr, False),
-            "version": 1,
-        }
-
-        if self.nullable and self.has_nulls():
-            # Create a simple Python object that exposes the
-            # `__cuda_array_interface__` attribute here since we need to modify
-            # some of the attributes from the numba device array
-            output["mask"] = cuda_array_interface_wrapper(
-                ptr=self.mask_ptr,
-                size=len(self),
-                owner=self.mask,
-                readonly=True,
-                typestr="<t1",
-            )
-
-        return output
-
     def unary_operator(self, unaryop: Union[str, Callable]) -> ColumnBase:
         if callable(unaryop):
             return libcudf.transform.transform(self, unaryop)
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 8143e7919a7..3e941d60079 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -5600,6 +5600,13 @@ def data_array_view(
     ) -> cuda.devicearray.DeviceNDArray:
         raise ValueError("Cannot get an array view of a StringColumn")
 
+    @property
+    def __cuda_array_interface__(self):
+        raise NotImplementedError(
+            f"dtype {self.dtype} is not yet supported via "
+            "`__cuda_array_interface__`"
+        )
+
     def to_arrow(self) -> pa.Array:
         """Convert to PyArrow Array
 
diff --git a/python/cudf/cudf/tests/test_cuda_array_interface.py b/python/cudf/cudf/tests/test_cuda_array_interface.py
index 213c6c2c1f9..f98c3ad0475 100644
--- a/python/cudf/cudf/tests/test_cuda_array_interface.py
+++ b/python/cudf/cudf/tests/test_cuda_array_interface.py
@@ -11,7 +11,12 @@
 
 import cudf
 from cudf.core.buffer.spill_manager import get_global_manager
-from cudf.testing._utils import DATETIME_TYPES, NUMERIC_TYPES, assert_eq
+from cudf.testing._utils import (
+    DATETIME_TYPES,
+    NUMERIC_TYPES,
+    TIMEDELTA_TYPES,
+    assert_eq,
+)
 
 
 @pytest.mark.parametrize("dtype", NUMERIC_TYPES + DATETIME_TYPES)
@@ -42,7 +47,9 @@ def test_cuda_array_interface_interop_in(dtype, module):
         assert_eq(pd_data, gdf["test"])
 
 
-@pytest.mark.parametrize("dtype", NUMERIC_TYPES + DATETIME_TYPES + ["str"])
+@pytest.mark.parametrize(
+    "dtype", NUMERIC_TYPES + DATETIME_TYPES + TIMEDELTA_TYPES + ["str"]
+)
 @pytest.mark.parametrize("module", ["cupy", "numba"])
 def test_cuda_array_interface_interop_out(dtype, module):
     expectation = does_not_raise()
@@ -73,7 +80,9 @@ def to_host_function(x):
         assert_eq(expect, got)
 
 
-@pytest.mark.parametrize("dtype", NUMERIC_TYPES + DATETIME_TYPES)
+@pytest.mark.parametrize(
+    "dtype", NUMERIC_TYPES + DATETIME_TYPES + TIMEDELTA_TYPES
+)
 @pytest.mark.parametrize("module", ["cupy", "numba"])
 def test_cuda_array_interface_interop_out_masked(dtype, module):
     expectation = does_not_raise()
@@ -104,7 +113,9 @@ def to_host_function(x):
         module_data = module_constructor(cudf_data)  # noqa: F841
 
 
-@pytest.mark.parametrize("dtype", NUMERIC_TYPES + DATETIME_TYPES)
+@pytest.mark.parametrize(
+    "dtype", NUMERIC_TYPES + DATETIME_TYPES + TIMEDELTA_TYPES
+)
 @pytest.mark.parametrize("nulls", ["all", "some", "bools", "none"])
 @pytest.mark.parametrize("mask_type", ["bits", "bools"])
 def test_cuda_array_interface_as_column(dtype, nulls, mask_type):

From acbb30a962933092542e5bf065eb984708b3353b Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Wed, 1 May 2024 07:27:52 -0500
Subject: [PATCH 124/842] Backport: Relax protobuf lower bound to 3.20.
 (#15506) (#15610)

Backport of https://github.com/rapidsai/cudf/pull/15506 to cuDF 24.04.

Also backports #15574 to ignore a warning from newer cupy releases.

---------

Co-authored-by: Vyas Ramasubramani <vyasr@nvidia.com>
---
 conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +-
 conda/environments/all_cuda-122_arch-x86_64.yaml | 2 +-
 conda/recipes/cudf/meta.yaml                     | 2 +-
 dependencies.yaml                                | 2 +-
 python/cudf/cudf/tests/pytest.ini                | 4 +++-
 python/cudf/pyproject.toml                       | 2 +-
 6 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index cf363a819a2..eb4eca1cb12 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -68,7 +68,7 @@ dependencies:
 - pandoc
 - pip
 - pre-commit
-- protobuf>=4.21,<5
+- protobuf>=3.20,<5
 - ptxcompiler
 - pyarrow==14.0.2.*
 - pydata-sphinx-theme!=0.14.2
diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml
index 42460532b1b..b1b41f41803 100644
--- a/conda/environments/all_cuda-122_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-122_arch-x86_64.yaml
@@ -66,7 +66,7 @@ dependencies:
 - pandoc
 - pip
 - pre-commit
-- protobuf>=4.21,<5
+- protobuf>=3.20,<5
 - pyarrow==14.0.2.*
 - pydata-sphinx-theme!=0.14.2
 - pynvjitlink
diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml
index 7633fbb00a3..cd9237bd7cb 100644
--- a/conda/recipes/cudf/meta.yaml
+++ b/conda/recipes/cudf/meta.yaml
@@ -78,7 +78,7 @@ requirements:
     {% endif %}
     - cuda-version ={{ cuda_version }}
   run:
-    - {{ pin_compatible('protobuf', min_pin='x.x', max_pin='x') }}
+    - protobuf >=3.20,<5.0a0
     - python
     - typing_extensions >=4.0.0
     - pandas >=2.0,<2.2.2dev0
diff --git a/dependencies.yaml b/dependencies.yaml
index db0a766df82..edc0677f244 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -276,7 +276,7 @@ dependencies:
       - output_types: conda
         packages:
           - &rmm_conda rmm==24.4.*
-          - &protobuf protobuf>=4.21,<5
+          - &protobuf protobuf>=3.20,<5
           - pip
           - pip:
               - git+https://github.com/python-streamz/streamz.git@master
diff --git a/python/cudf/cudf/tests/pytest.ini b/python/cudf/cudf/tests/pytest.ini
index 36ccb434bb2..710473acb85 100644
--- a/python/cudf/cudf/tests/pytest.ini
+++ b/python/cudf/cudf/tests/pytest.ini
@@ -8,5 +8,7 @@ filterwarnings =
     error
     ignore:::.*xdist.*
     ignore:::.*pytest.*
+    # Deprecation warning from Pyarrow Table.to_pandas() with pandas-2.2+
     ignore:Passing a BlockManager to DataFrame is deprecated:DeprecationWarning
-    # Above deprecation warning comes from Pyarrow Table.to_pandas() with pandas-2.2+
+    # PerformanceWarning from cupy warming up the JIT cache
+    ignore:Jitify is performing a one-time only warm-up to populate the persistent cache:cupy._util.PerformanceWarning
diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml
index da574fdb031..3112db2a720 100644
--- a/python/cudf/pyproject.toml
+++ b/python/cudf/pyproject.toml
@@ -34,7 +34,7 @@ dependencies = [
     "nvtx>=0.2.1",
     "packaging",
     "pandas>=2.0,<2.2.2dev0",
-    "protobuf>=4.21,<5",
+    "protobuf>=3.20,<5",
     "ptxcompiler",
     "pyarrow>=14.0.1,<15.0.0a0",
     "rich",

From f5c777826d759b0541569426e6099a3ef7a13049 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Wed, 1 May 2024 10:03:03 -0400
Subject: [PATCH 125/842] Large strings support for cudf::gather (#15621)

Replaces `make_offsets_child_column` with strings specific version in `cudf::strings::detail::gather` function.
Fixes issue found here: https://github.com/rapidsai/cudf/issues/13733#issuecomment-2079656314

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Yunsong Wang (https://github.com/PointKernel)
  - Mark Harris (https://github.com/harrism)

URL: https://github.com/rapidsai/cudf/pull/15621
---
 cpp/include/cudf/strings/detail/gather.cuh | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh
index 94bce6bddd5..fcd74bebfe8 100644
--- a/cpp/include/cudf/strings/detail/gather.cuh
+++ b/cpp/include/cudf/strings/detail/gather.cuh
@@ -19,23 +19,19 @@
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/offsets_iterator_factory.cuh>
-#include <cudf/detail/sizes_to_offsets_iterator.cuh>
 #include <cudf/detail/utilities/cuda.cuh>
+#include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/detail/utilities.hpp>
 #include <cudf/strings/strings_column_view.hpp>
-#include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
 #include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
-#include <thrust/advance.h>
 #include <thrust/binary_search.h>
 #include <thrust/distance.h>
 #include <thrust/execution_policy.h>
-#include <thrust/functional.h>
-#include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
 
 namespace cudf {
@@ -226,7 +222,7 @@ rmm::device_uvector<char> gather_chars(StringIterator strings_begin,
                                        MapIterator map_begin,
                                        MapIterator map_end,
                                        cudf::detail::input_offsetalator const offsets,
-                                       size_type chars_bytes,
+                                       int64_t chars_bytes,
                                        rmm::cuda_stream_view stream,
                                        rmm::device_async_resource_ref mr)
 {
@@ -239,9 +235,9 @@ rmm::device_uvector<char> gather_chars(StringIterator strings_begin,
   constexpr int warps_per_threadblock = 4;
   // String parallel strategy will be used if average string length is above this threshold.
   // Otherwise, char parallel strategy will be used.
-  constexpr size_type string_parallel_threshold = 32;
+  constexpr int64_t string_parallel_threshold = 32;
 
-  size_type average_string_length = chars_bytes / output_count;
+  int64_t const average_string_length = chars_bytes / output_count;
 
   if (average_string_length > string_parallel_threshold) {
     constexpr int max_threadblocks = 65536;
@@ -302,7 +298,7 @@ std::unique_ptr<cudf::column> gather(strings_column_view const& strings,
     strings.is_empty() ? make_empty_column(type_id::INT32)->view() : strings.offsets(),
     strings.offset());
 
-  auto offsets_itr = thrust::make_transform_iterator(
+  auto sizes_itr = thrust::make_transform_iterator(
     begin,
     cuda::proclaim_return_type<size_type>(
       [d_strings = *d_strings, d_in_offsets] __device__(size_type idx) {
@@ -310,8 +306,8 @@ std::unique_ptr<cudf::column> gather(strings_column_view const& strings,
         if (not d_strings.is_valid(idx)) { return 0; }
         return static_cast<size_type>(d_in_offsets[idx + 1] - d_in_offsets[idx]);
       }));
-  auto [out_offsets_column, total_bytes] =
-    cudf::detail::make_offsets_child_column(offsets_itr, offsets_itr + output_count, stream, mr);
+  auto [out_offsets_column, total_bytes] = cudf::strings::detail::make_offsets_child_column(
+    sizes_itr, sizes_itr + output_count, stream, mr);
 
   // build chars column
   auto const offsets_view =

From 4aabf51df1441a77107fb146b182c82b1ed9c611 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Wed, 1 May 2024 10:03:31 -0400
Subject: [PATCH 126/842] Use experimental make_strings_children for json/csv
 writers (#15599)

Updates the JSON and CSV writer functions to use the new experimental make_strings_children.
Also included is an update to the JSON_BENCH benchmark for get_json_object.

Reference https://github.com/rapidsai/cudf/issues/15579

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Karthikeyan (https://github.com/karthikeyann)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/15599
---
 cpp/benchmarks/json/json.cu   |  9 +++++----
 cpp/src/io/csv/writer_impl.cu | 11 ++++++-----
 cpp/src/io/json/write_json.cu | 13 +++++++------
 3 files changed, 18 insertions(+), 15 deletions(-)

diff --git a/cpp/benchmarks/json/json.cu b/cpp/benchmarks/json/json.cu
index a54d7d48dc4..c65db187f42 100644
--- a/cpp/benchmarks/json/json.cu
+++ b/cpp/benchmarks/json/json.cu
@@ -22,7 +22,7 @@
 
 #include <cudf/column/column_factories.hpp>
 #include <cudf/json/json.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/string_view.hpp>
 #include <cudf/strings/strings_column_view.hpp>
@@ -77,8 +77,9 @@ struct json_benchmark_row_builder {
   cudf::column_device_view const d_book_pct;           // Book percentage
   cudf::column_device_view const d_misc_order;         // Misc-Store order
   cudf::column_device_view const d_store_order;        // Books-Bicycles order
-  int32_t* d_offsets{};
+  cudf::size_type* d_sizes{};
   char* d_chars{};
+  cudf::detail::input_offsetalator d_offsets;
   thrust::minstd_rand rng{5236};
   thrust::uniform_int_distribution<int> dist{};
 
@@ -155,7 +156,7 @@ struct json_benchmark_row_builder {
       output_str += Misc;
     }
     output_str += brace2;
-    if (!output_str.ptr) d_offsets[idx] = output_str.bytes;
+    if (!output_str.ptr) { d_sizes[idx] = output_str.bytes; }
   }
 };
 
@@ -177,7 +178,7 @@ auto build_json_string_column(int desired_bytes, int num_rows)
   auto d_store_order = cudf::column_device_view::create(float_2bool_columns->get_column(2));
   json_benchmark_row_builder jb{
     desired_bytes, num_rows, {*d_books, *d_bicycles}, *d_book_pct, *d_misc_order, *d_store_order};
-  auto [offsets, chars] = cudf::strings::detail::make_strings_children(
+  auto [offsets, chars] = cudf::strings::detail::experimental::make_strings_children(
     jb, num_rows, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
   return cudf::make_strings_column(num_rows, std::move(offsets), chars.release(), 0, {});
 }
diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu
index 335ce77e3e3..58a74654405 100644
--- a/cpp/src/io/csv/writer_impl.cu
+++ b/cpp/src/io/csv/writer_impl.cu
@@ -33,7 +33,7 @@
 #include <cudf/strings/detail/combine.hpp>
 #include <cudf/strings/detail/converters.hpp>
 #include <cudf/strings/detail/replace.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/utilities/error.hpp>
@@ -75,8 +75,9 @@ namespace {
 struct escape_strings_fn {
   column_device_view const d_column;
   string_view const d_delimiter;  // check for column delimiter
-  size_type* d_offsets{};
+  size_type* d_sizes{};
   char* d_chars{};
+  cudf::detail::input_offsetalator d_offsets;
 
   __device__ void write_char(char_utf8 chr, char*& d_buffer, size_type& bytes)
   {
@@ -89,7 +90,7 @@ struct escape_strings_fn {
   __device__ void operator()(size_type idx)
   {
     if (d_column.is_null(idx)) {
-      if (!d_chars) d_offsets[idx] = 0;
+      if (!d_chars) { d_sizes[idx] = 0; }
       return;
     }
 
@@ -115,7 +116,7 @@ struct escape_strings_fn {
     }
     if (quote_row) write_char(quote, d_buffer, bytes);
 
-    if (!d_chars) d_offsets[idx] = bytes;
+    if (!d_chars) { d_sizes[idx] = bytes; }
   }
 };
 
@@ -182,7 +183,7 @@ struct column_to_strings_fn {
     auto d_column = column_device_view::create(column_v, stream_);
     escape_strings_fn fn{*d_column, delimiter.value(stream_)};
     auto [offsets_column, chars] =
-      cudf::strings::detail::make_strings_children(fn, column_v.size(), stream_, mr_);
+      cudf::strings::detail::experimental::make_strings_children(fn, column_v.size(), stream_, mr_);
 
     return make_strings_column(column_v.size(),
                                std::move(offsets_column),
diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu
index 596b3381eaf..cac7149dabe 100644
--- a/cpp/src/io/json/write_json.cu
+++ b/cpp/src/io/json/write_json.cu
@@ -36,7 +36,7 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/detail/combine.hpp>
 #include <cudf/strings/detail/converters.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/structs/structs_column_view.hpp>
 #include <cudf/table/table.hpp>
@@ -78,8 +78,9 @@ namespace {
 struct escape_strings_fn {
   column_device_view const d_column;
   bool const append_colon{false};
-  size_type* d_offsets{};
+  size_type* d_sizes{};
   char* d_chars{};
+  cudf::detail::input_offsetalator d_offsets;
 
   __device__ void write_char(char_utf8 chr, char*& d_buffer, size_type& bytes)
   {
@@ -123,7 +124,7 @@ struct escape_strings_fn {
   __device__ void operator()(size_type idx)
   {
     if (d_column.is_null(idx)) {
-      if (!d_chars) d_offsets[idx] = 0;
+      if (!d_chars) { d_sizes[idx] = 0; }
       return;
     }
 
@@ -163,15 +164,15 @@ struct escape_strings_fn {
     constexpr char_utf8 const colon = ':';  // append colon
     if (append_colon) write_char(colon, d_buffer, bytes);
 
-    if (!d_chars) d_offsets[idx] = bytes;
+    if (!d_chars) { d_sizes[idx] = bytes; }
   }
 
   std::unique_ptr<column> get_escaped_strings(column_view const& column_v,
                                               rmm::cuda_stream_view stream,
                                               rmm::device_async_resource_ref mr)
   {
-    auto [offsets_column, chars] =
-      cudf::strings::detail::make_strings_children(*this, column_v.size(), stream, mr);
+    auto [offsets_column, chars] = cudf::strings::detail::experimental::make_strings_children(
+      *this, column_v.size(), stream, mr);
 
     return make_strings_column(column_v.size(),
                                std::move(offsets_column),

From fe4b92cfa61a324b417f12760341f40e5db452eb Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Wed, 1 May 2024 14:33:38 -0400
Subject: [PATCH 127/842] Use experimental make_strings_children in nvtext APIs
 (#15595)

Updates nvtext replace, ngram, normalize, and detokenize functions to replace the existing calls to `make_strings_children` with the new experimental `make_strings_children` which supports building large strings.

Reference https://github.com/rapidsai/cudf/issues/15579

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/15595
---
 cpp/src/text/detokenize.cu      | 19 ++++++++++---------
 cpp/src/text/generate_ngrams.cu | 20 +++++++++++---------
 cpp/src/text/normalize.cu       | 20 +++++++++++---------
 cpp/src/text/replace.cu         | 18 ++++++++++--------
 4 files changed, 42 insertions(+), 35 deletions(-)

diff --git a/cpp/src/text/detokenize.cu b/cpp/src/text/detokenize.cu
index 63fe3113697..2efeeee0ee9 100644
--- a/cpp/src/text/detokenize.cu
+++ b/cpp/src/text/detokenize.cu
@@ -20,7 +20,7 @@
 #include <cudf/detail/indexalator.cuh>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/sorting.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
@@ -48,12 +48,13 @@ namespace {
  * the same row. The `d_separator` is appended between each token.
  */
 struct detokenizer_fn {
-  cudf::column_device_view const d_strings;  // these are the tokens
-  cudf::size_type const* d_row_map;          // indices sorted by output row
-  cudf::size_type const* d_token_offsets;    // to each input token array
-  cudf::string_view const d_separator;       // append after each token
-  cudf::size_type* d_offsets{};              // offsets to output buffer d_chars
-  char* d_chars{};                           // output buffer for characters
+  cudf::column_device_view const d_strings;    // these are the tokens
+  cudf::size_type const* d_row_map;            // indices sorted by output row
+  cudf::size_type const* d_token_offsets;      // to each input token array
+  cudf::string_view const d_separator;         // append after each token
+  cudf::size_type* d_sizes{};                  // output sizes
+  char* d_chars{};                             // output buffer for characters
+  cudf::detail::input_offsetalator d_offsets;  // for addressing output row data in d_chars
 
   __device__ void operator()(cudf::size_type idx)
   {
@@ -75,7 +76,7 @@ struct detokenizer_fn {
         nbytes += d_separator.size_bytes();
       }
     }
-    if (!d_chars) { d_offsets[idx] = (nbytes > 0) ? (nbytes - d_separator.size_bytes()) : 0; }
+    if (!d_chars) { d_sizes[idx] = (nbytes > 0) ? (nbytes - d_separator.size_bytes()) : 0; }
   }
 };
 
@@ -157,7 +158,7 @@ std::unique_ptr<cudf::column> detokenize(cudf::strings_column_view const& string
 
   cudf::string_view const d_separator(separator.data(), separator.size());
 
-  auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(
+  auto [offsets_column, chars] = cudf::strings::detail::experimental::make_strings_children(
     detokenizer_fn{*strings_column, d_row_map, tokens_offsets.data(), d_separator},
     output_count,
     stream,
diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu
index d9fcd7dfd05..fdd165a54bc 100644
--- a/cpp/src/text/generate_ngrams.cu
+++ b/cpp/src/text/generate_ngrams.cu
@@ -22,7 +22,7 @@
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/sizes_to_offsets_iterator.cuh>
 #include <cudf/hashing/detail/murmurhash3_x86_32.cuh>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
@@ -57,8 +57,9 @@ struct ngram_generator_fn {
   cudf::column_device_view const d_strings;
   cudf::size_type ngrams;
   cudf::string_view const d_separator;
-  cudf::size_type* d_offsets{};
+  cudf::size_type* d_sizes{};
   char* d_chars{};
+  cudf::detail::input_offsetalator d_offsets;
 
   /**
    * @brief Build ngram for each string.
@@ -81,7 +82,7 @@ struct ngram_generator_fn {
       bytes += d_separator.size_bytes();
       if (out_ptr) out_ptr = cudf::strings::detail::copy_string(out_ptr, d_separator);
     }
-    if (!d_chars) d_offsets[idx] = bytes;
+    if (!d_chars) { d_sizes[idx] = bytes; }
   }
 };
 
@@ -141,7 +142,7 @@ std::unique_ptr<cudf::column> generate_ngrams(cudf::strings_column_view const& s
   // compute the number of strings of ngrams
   auto const ngrams_count = strings_count - ngrams + 1;
 
-  auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(
+  auto [offsets_column, chars] = cudf::strings::detail::experimental::make_strings_children(
     ngram_generator_fn{d_strings, ngrams, d_separator}, ngrams_count, stream, mr);
 
   // make the output strings column from the offsets and chars column
@@ -175,8 +176,9 @@ struct character_ngram_generator_fn {
   cudf::column_device_view const d_strings;
   cudf::size_type ngrams;
   cudf::size_type const* d_ngram_offsets{};
-  cudf::size_type* d_offsets{};
+  cudf::size_type* d_sizes{};
   char* d_chars{};
+  cudf::detail::input_offsetalator d_offsets;
 
   __device__ void operator()(cudf::size_type idx)
   {
@@ -186,8 +188,8 @@ struct character_ngram_generator_fn {
     auto itr                = d_str.begin();
     auto const ngram_offset = d_ngram_offsets[idx];
     auto const ngram_count  = d_ngram_offsets[idx + 1] - ngram_offset;
-    auto d_sizes            = d_offsets + ngram_offset;
-    auto out_ptr            = d_chars ? d_chars + *d_sizes : nullptr;
+    auto d_output_sizes     = d_sizes + ngram_offset;
+    auto out_ptr            = d_chars ? d_chars + d_offsets[ngram_offset] : nullptr;
     for (cudf::size_type n = 0; n < ngram_count; ++n, ++itr) {
       auto const begin = itr.byte_offset();
       auto const end   = (itr + ngrams).byte_offset();
@@ -195,7 +197,7 @@ struct character_ngram_generator_fn {
         out_ptr =
           cudf::strings::detail::copy_and_increment(out_ptr, d_str.data() + begin, (end - begin));
       } else {
-        *d_sizes++ = end - begin;
+        *d_output_sizes++ = end - begin;
       }
     }
   }
@@ -233,7 +235,7 @@ std::unique_ptr<cudf::column> generate_character_ngrams(cudf::strings_column_vie
                "Insufficient number of characters in each string to generate ngrams");
 
   character_ngram_generator_fn generator{*d_strings, ngrams, d_offsets};
-  auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(
+  auto [offsets_column, chars] = cudf::strings::detail::experimental::make_strings_children(
     generator, strings_count, total_ngrams, stream, mr);
 
   auto output = cudf::make_strings_column(
diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu
index e5e72d3a33e..2f97eb1ce74 100644
--- a/cpp/src/text/normalize.cu
+++ b/cpp/src/text/normalize.cu
@@ -26,7 +26,7 @@
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/detail/strings_column_factories.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/string_view.cuh>
@@ -59,13 +59,14 @@ namespace {
  */
 struct normalize_spaces_fn {
   cudf::column_device_view const d_strings;  // strings to normalize
-  cudf::size_type* d_offsets{};              // offsets into d_chars
+  cudf::size_type* d_sizes{};                // size of each output row
   char* d_chars{};                           // output buffer for characters
+  cudf::detail::input_offsetalator d_offsets;
 
   __device__ void operator()(cudf::size_type idx)
   {
     if (d_strings.is_null(idx)) {
-      if (!d_chars) d_offsets[idx] = 0;
+      if (!d_chars) { d_sizes[idx] = 0; }
       return;
     }
     cudf::string_view const single_space(" ", 1);
@@ -93,7 +94,7 @@ struct normalize_spaces_fn {
       nbytes += token.size_bytes() + 1;  // token size plus a single space
     }
     // remove trailing space
-    if (!d_chars) d_offsets[idx] = (nbytes > 0) ? nbytes - 1 : 0;
+    if (!d_chars) { d_sizes[idx] = (nbytes > 0) ? nbytes - 1 : 0; }
   }
 };
 
@@ -109,8 +110,9 @@ struct codepoint_to_utf8_fn {
   cudf::column_device_view const d_strings;  // input strings
   uint32_t const* cp_data;                   // full code-point array
   int64_t const* d_cp_offsets{};             // offsets to each string's code-point array
-  cudf::size_type* d_offsets{};              // offsets for the output strings
+  cudf::size_type* d_sizes{};                // size of output string
   char* d_chars{};                           // buffer for the output strings column
+  cudf::detail::input_offsetalator d_offsets;
 
   /**
    * @brief Return the number of bytes for the output string given its code-point array.
@@ -133,14 +135,14 @@ struct codepoint_to_utf8_fn {
   __device__ void operator()(cudf::size_type idx)
   {
     if (d_strings.is_null(idx)) {
-      if (!d_chars) d_offsets[idx] = 0;
+      if (!d_chars) { d_sizes[idx] = 0; }
       return;
     }
     auto const offset = d_cp_offsets[idx];
     auto const count  = d_cp_offsets[idx + 1] - offset;  // number of code-points
     auto str_cps      = cp_data + offset;                // code-points for this string
     if (!d_chars) {
-      d_offsets[idx] = compute_output_size(str_cps, count);
+      d_sizes[idx] = compute_output_size(str_cps, count);
       return;
     }
     // convert each code-point to 1-4 UTF-8 encoded bytes
@@ -183,7 +185,7 @@ std::unique_ptr<cudf::column> normalize_spaces(cudf::strings_column_view const&
   auto d_strings = cudf::column_device_view::create(strings.parent(), stream);
 
   // build offsets and children using the normalize_space_fn
-  auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(
+  auto [offsets_column, chars] = cudf::strings::detail::experimental::make_strings_children(
     normalize_spaces_fn{*d_strings}, strings.size(), stream, mr);
 
   return cudf::make_strings_column(strings.size(),
@@ -225,7 +227,7 @@ std::unique_ptr<cudf::column> normalize_characters(cudf::strings_column_view con
   auto d_strings = cudf::column_device_view::create(strings.parent(), stream);
 
   // build offsets and children using the codepoint_to_utf8_fn
-  auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(
+  auto [offsets_column, chars] = cudf::strings::detail::experimental::make_strings_children(
     codepoint_to_utf8_fn{*d_strings, cp_chars, cp_offsets}, strings.size(), stream, mr);
 
   return cudf::make_strings_column(strings.size(),
diff --git a/cpp/src/text/replace.cu b/cpp/src/text/replace.cu
index f61fa544e73..f95b53a3ac8 100644
--- a/cpp/src/text/replace.cu
+++ b/cpp/src/text/replace.cu
@@ -21,7 +21,7 @@
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
@@ -48,8 +48,9 @@ using replace_result = thrust::pair<bool, cudf::string_view>;
 struct base_token_replacer_fn {
   cudf::column_device_view const d_strings;  ///< strings to tokenize
   cudf::string_view const d_delimiter;       ///< delimiter characters for tokenizing
-  cudf::size_type* d_offsets{};              ///< for locating output string in d_chars
+  cudf::size_type* d_sizes{};                ///< for output string size
   char* d_chars{};                           ///< output buffer
+  cudf::detail::input_offsetalator d_offsets;
 
   /**
    * @brief Tokenizes each string and calls the provided `replacer` function
@@ -63,7 +64,7 @@ struct base_token_replacer_fn {
   __device__ void process_string(cudf::size_type idx, ReplaceFn replacer)
   {
     if (d_strings.is_null(idx)) {
-      if (!d_chars) d_offsets[idx] = 0;
+      if (!d_chars) { d_sizes[idx] = 0; }
       return;
     }
 
@@ -95,10 +96,11 @@ struct base_token_replacer_fn {
     }
 
     // copy the remainder of the string's bytes to the output buffer
-    if (out_ptr)
+    if (out_ptr) {
       memcpy(out_ptr, in_ptr + last_pos, d_str.size_bytes() - last_pos);
-    else
-      d_offsets[idx] = nbytes;
+    } else {
+      d_sizes[idx] = nbytes;
+    }
   }
 };
 
@@ -230,7 +232,7 @@ std::unique_ptr<cudf::column> replace_tokens(cudf::strings_column_view const& st
 
   // this utility calls replacer to build the offsets and chars columns
   auto [offsets_column, chars] =
-    cudf::strings::detail::make_strings_children(replacer, strings_count, stream, mr);
+    cudf::strings::detail::experimental::make_strings_children(replacer, strings_count, stream, mr);
 
   // return new strings column
   return cudf::make_strings_column(strings_count,
@@ -263,7 +265,7 @@ std::unique_ptr<cudf::column> filter_tokens(cudf::strings_column_view const& str
 
   // this utility calls filterer to build the offsets and chars columns
   auto [offsets_column, chars] =
-    cudf::strings::detail::make_strings_children(filterer, strings_count, stream, mr);
+    cudf::strings::detail::experimental::make_strings_children(filterer, strings_count, stream, mr);
 
   // return new strings column
   return cudf::make_strings_column(strings_count,

From 67d427deb3cf18d1139b76aecc1e6a3e9d5253f3 Mon Sep 17 00:00:00 2001
From: "Richard (Rick) Zamora" <rzamora217@gmail.com>
Date: Wed, 1 May 2024 16:03:28 -0500
Subject: [PATCH 128/842] Fix categorical-accessor support and testing in
 dask-cudf (#15591)

Related to https://github.com/rapidsai/cudf/issues/15027

Adds a minor tokenization fix, and adjusts testing for categorical-accessor support.

Authors:
  - Richard (Rick) Zamora (https://github.com/rjzamora)

Approvers:
  - Charles Blackmon-Luca (https://github.com/charlesbluca)
  - Matthew Roeschke (https://github.com/mroeschke)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/15591
---
 python/cudf/cudf/core/indexed_frame.py         |  7 ++++++-
 .../dask_cudf/dask_cudf/io/tests/test_json.py  |  4 ++--
 .../dask_cudf/dask_cudf/io/tests/test_orc.py   |  4 ++--
 .../dask_cudf/io/tests/test_parquet.py         |  2 +-
 .../dask_cudf/dask_cudf/io/tests/test_text.py  |  4 ++--
 .../dask_cudf/dask_cudf/tests/test_accessor.py | 18 ++++++++++++++----
 python/dask_cudf/dask_cudf/tests/utils.py      | 11 +++++------
 7 files changed, 32 insertions(+), 18 deletions(-)

diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 48e80d8162f..bec97bd3290 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -6308,7 +6308,12 @@ def __dask_tokenize__(self):
 
         return [
             type(self),
-            normalize_token(self._dtypes),
+            str(self._dtypes),
+            *[
+                normalize_token(cat.categories)
+                for cat in self._dtypes.values()
+                if cat == "category"
+            ],
             normalize_token(self.index),
             normalize_token(self.hash_values().values_host),
         ]
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_json.py b/python/dask_cudf/dask_cudf/io/tests/test_json.py
index a09dfbff188..f8e5be0a417 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_json.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_json.py
@@ -12,8 +12,8 @@
 import dask_cudf
 from dask_cudf.tests.utils import skip_dask_expr
 
-# No dask-expr support for dask_expr<1.0.6
-pytestmark = skip_dask_expr(lt_version="1.0.6")
+# No dask-expr support for dask<2024.4.0
+pytestmark = skip_dask_expr(lt_version="2024.4.0")
 
 
 def test_read_json_backend_dispatch(tmp_path):
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_orc.py b/python/dask_cudf/dask_cudf/io/tests/test_orc.py
index 7be6c712511..457e5546bd9 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_orc.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_orc.py
@@ -14,8 +14,8 @@
 import dask_cudf
 from dask_cudf.tests.utils import skip_dask_expr
 
-# No dask-expr support for dask_expr<1.0.6
-pytestmark = skip_dask_expr(lt_version="1.0.6")
+# No dask-expr support for dask<2024.4.0
+pytestmark = skip_dask_expr(lt_version="2024.4.0")
 
 cur_dir = os.path.dirname(__file__)
 sample_orc = os.path.join(cur_dir, "data/orc/sample.orc")
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
index 8ca27df8fec..6f4737db5be 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
@@ -536,7 +536,7 @@ def test_check_file_size(tmpdir):
         dask_cudf.io.read_parquet(fn, check_file_size=1).compute()
 
 
-@xfail_dask_expr("HivePartitioning cannot be hashed", lt_version="1.0")
+@xfail_dask_expr("HivePartitioning cannot be hashed", lt_version="2024.3.0")
 def test_null_partition(tmpdir):
     import pyarrow as pa
     from pyarrow.dataset import HivePartitioning
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_text.py b/python/dask_cudf/dask_cudf/io/tests/test_text.py
index e3a9d380857..8912b7d5da6 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_text.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_text.py
@@ -11,8 +11,8 @@
 import dask_cudf
 from dask_cudf.tests.utils import skip_dask_expr
 
-# No dask-expr support for dask_expr<1.0.6
-pytestmark = skip_dask_expr(lt_version="1.0.6")
+# No dask-expr support for dask<2024.4.0
+pytestmark = skip_dask_expr(lt_version="2024.4.0")
 
 cur_dir = os.path.dirname(__file__)
 text_file = os.path.join(cur_dir, "data/text/sample.pgn")
diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py
index ebb8e4be187..ae17b89832a 100644
--- a/python/dask_cudf/dask_cudf/tests/test_accessor.py
+++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py
@@ -111,7 +111,7 @@ def test_categorical_accessor_initialization2(data):
         dsr.cat
 
 
-@xfail_dask_expr("TODO: Unexplained dask-expr failure")
+@xfail_dask_expr(lt_version="2024.5.0")
 @pytest.mark.parametrize("data", [data_cat_1()])
 def test_categorical_basic(data):
     cat = data.copy()
@@ -203,7 +203,6 @@ def test_categorical_compare_unordered(data):
         dsr < dsr
 
 
-@xfail_dask_expr("TODO: Unexplained dask-expr failure")
 @pytest.mark.parametrize("data", [data_cat_3()])
 def test_categorical_compare_ordered(data):
     cat1 = data[0].copy()
@@ -274,7 +273,6 @@ def test_categorical_categories():
     )
 
 
-@xfail_dask_expr("TODO: Unexplained dask-expr failure")
 def test_categorical_as_known():
     df = dask_cudf.from_cudf(DataFrame({"col_1": [0, 1, 2, 3]}), npartitions=2)
     df["col_1"] = df["col_1"].astype("category")
@@ -283,7 +281,19 @@ def test_categorical_as_known():
     pdf = dd.from_pandas(pd.DataFrame({"col_1": [0, 1, 2, 3]}), npartitions=2)
     pdf["col_1"] = pdf["col_1"].astype("category")
     expected = pdf["col_1"].cat.as_known()
-    dd.assert_eq(expected, actual)
+
+    # Note: Categories may be ordered differently in
+    # cudf and pandas. Therefore, we need to compare
+    # the global set of categories (before and after
+    # calling `compute`), then we need to check that
+    # the initial order of rows was preserved.
+    assert set(expected.cat.categories) == set(
+        actual.cat.categories.values_host
+    )
+    assert set(expected.compute().cat.categories) == set(
+        actual.compute().cat.categories.values_host
+    )
+    dd.assert_eq(expected, actual.astype(expected.dtype))
 
 
 def test_str_slice():
diff --git a/python/dask_cudf/dask_cudf/tests/utils.py b/python/dask_cudf/dask_cudf/tests/utils.py
index 1ca1758736b..c7dedbb6b4a 100644
--- a/python/dask_cudf/dask_cudf/tests/utils.py
+++ b/python/dask_cudf/dask_cudf/tests/utils.py
@@ -5,6 +5,7 @@
 import pytest
 from packaging.version import Version
 
+import dask
 import dask.dataframe as dd
 
 import cudf
@@ -12,11 +13,9 @@
 from dask_cudf.expr import QUERY_PLANNING_ON
 
 if QUERY_PLANNING_ON:
-    import dask_expr
-
-    DASK_EXPR_VERSION = Version(dask_expr.__version__)
+    DASK_VERSION = Version(dask.__version__)
 else:
-    DASK_EXPR_VERSION = None
+    DASK_VERSION = None
 
 
 def _make_random_frame(nelem, npartitions=2, include_na=False):
@@ -37,7 +36,7 @@ def _make_random_frame(nelem, npartitions=2, include_na=False):
 
 def skip_dask_expr(reason=_default_reason, lt_version=None):
     if lt_version is not None:
-        skip = QUERY_PLANNING_ON and DASK_EXPR_VERSION < Version(lt_version)
+        skip = QUERY_PLANNING_ON and DASK_VERSION < Version(lt_version)
     else:
         skip = QUERY_PLANNING_ON
     return pytest.mark.skipif(skip, reason=reason)
@@ -45,7 +44,7 @@ def skip_dask_expr(reason=_default_reason, lt_version=None):
 
 def xfail_dask_expr(reason=_default_reason, lt_version=None):
     if lt_version is not None:
-        xfail = QUERY_PLANNING_ON and DASK_EXPR_VERSION < Version(lt_version)
+        xfail = QUERY_PLANNING_ON and DASK_VERSION < Version(lt_version)
     else:
         xfail = QUERY_PLANNING_ON
     return pytest.mark.xfail(xfail, reason=reason)

From 7458a6ecbf474e10a4a64f10833d71253f42af7b Mon Sep 17 00:00:00 2001
From: "Richard (Rick) Zamora" <rzamora217@gmail.com>
Date: Wed, 1 May 2024 17:56:12 -0500
Subject: [PATCH 129/842] Add "collect" aggregation support to dask-cudf
 (#15593)

This PR ~(along with it's upstream dependency)~ enables `"collect"` aggregations in dask-cudf when query-planning is enabled. It also adds an clearer error message for `as_index` usage (which is not supported in dask-dataframe, but *was* supported in legacy dask-cudf)

Authors:
  - Richard (Rick) Zamora (https://github.com/rjzamora)

Approvers:
  - Charles Blackmon-Luca (https://github.com/charlesbluca)
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/15593
---
 .../dask_cudf/dask_cudf/expr/_collection.py   | 24 +++++++++
 python/dask_cudf/dask_cudf/expr/_groupby.py   | 54 +++++++++++++++++++
 .../dask_cudf/dask_cudf/tests/test_groupby.py | 39 ++++++++------
 3 files changed, 101 insertions(+), 16 deletions(-)

diff --git a/python/dask_cudf/dask_cudf/expr/_collection.py b/python/dask_cudf/dask_cudf/expr/_collection.py
index 605a81f0fcd..d50dfb24256 100644
--- a/python/dask_cudf/dask_cudf/expr/_collection.py
+++ b/python/dask_cudf/dask_cudf/expr/_collection.py
@@ -18,6 +18,15 @@
 
 import cudf
 
+_LEGACY_WORKAROUND = (
+    "To enable the 'legacy' dask-cudf API, set the "
+    "global 'dataframe.query-planning' config to "
+    "`False` before dask is imported. This can also "
+    "be done by setting an environment variable: "
+    "`DASK_DATAFRAME__QUERY_PLANNING=False` "
+)
+
+
 ##
 ## Custom collection classes
 ##
@@ -88,6 +97,21 @@ def groupby(
                 f"`by` must be a column name or list of columns, got {by}."
             )
 
+        if "as_index" in kwargs:
+            msg = (
+                "The `as_index` argument is now deprecated. All groupby "
+                "results will be consistent with `as_index=True`."
+            )
+
+            if kwargs.pop("as_index") is not True:
+                raise NotImplementedError(
+                    f"{msg} Please reset the index after aggregating, or "
+                    "use the legacy API if `as_index=False` is required.\n"
+                    f"{_LEGACY_WORKAROUND}"
+                )
+            else:
+                warnings.warn(msg, FutureWarning)
+
         return GroupBy(
             self,
             by,
diff --git a/python/dask_cudf/dask_cudf/expr/_groupby.py b/python/dask_cudf/dask_cudf/expr/_groupby.py
index 7f275151f75..116893891e3 100644
--- a/python/dask_cudf/dask_cudf/expr/_groupby.py
+++ b/python/dask_cudf/dask_cudf/expr/_groupby.py
@@ -3,13 +3,55 @@
 from dask_expr._groupby import (
     GroupBy as DXGroupBy,
     SeriesGroupBy as DXSeriesGroupBy,
+    SingleAggregation,
 )
 from dask_expr._util import is_scalar
 
+from dask.dataframe.groupby import Aggregation
+
 ##
 ## Custom groupby classes
 ##
 
+
+class Collect(SingleAggregation):
+    @staticmethod
+    def groupby_chunk(arg):
+        return arg.agg("collect")
+
+    @staticmethod
+    def groupby_aggregate(arg):
+        gb = arg.agg("collect")
+        if gb.ndim > 1:
+            for col in gb.columns:
+                gb[col] = gb[col].list.concat()
+            return gb
+        else:
+            return gb.list.concat()
+
+
+collect_aggregation = Aggregation(
+    name="collect",
+    chunk=Collect.groupby_chunk,
+    agg=Collect.groupby_aggregate,
+)
+
+
+def _translate_arg(arg):
+    # Helper function to translate args so that
+    # they can be processed correctly by upstream
+    # dask & dask-expr. Right now, the only necessary
+    # translation is "collect" aggregations.
+    if isinstance(arg, dict):
+        return {k: _translate_arg(v) for k, v in arg.items()}
+    elif isinstance(arg, list):
+        return [_translate_arg(x) for x in arg]
+    elif arg in ("collect", "list", list):
+        return collect_aggregation
+    else:
+        return arg
+
+
 # TODO: These classes are mostly a work-around for missing
 # `observed=False` support.
 # See: https://github.com/rapidsai/cudf/issues/15173
@@ -41,8 +83,20 @@ def __getitem__(self, key):
         )
         return g
 
+    def collect(self, **kwargs):
+        return self._single_agg(Collect, **kwargs)
+
+    def aggregate(self, arg, **kwargs):
+        return super().aggregate(_translate_arg(arg), **kwargs)
+
 
 class SeriesGroupBy(DXSeriesGroupBy):
     def __init__(self, *args, observed=None, **kwargs):
         observed = observed if observed is not None else True
         super().__init__(*args, observed=observed, **kwargs)
+
+    def collect(self, **kwargs):
+        return self._single_agg(Collect, **kwargs)
+
+    def aggregate(self, arg, **kwargs):
+        return super().aggregate(_translate_arg(arg), **kwargs)
diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py
index 1e22dd95475..67fa045d3d0 100644
--- a/python/dask_cudf/dask_cudf/tests/test_groupby.py
+++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py
@@ -14,16 +14,6 @@
 from dask_cudf.groupby import OPTIMIZED_AGGS, _aggs_optimized
 from dask_cudf.tests.utils import QUERY_PLANNING_ON, xfail_dask_expr
 
-# XFAIL "collect" tests for now
-agg_params = [agg for agg in OPTIMIZED_AGGS if agg != "collect"]
-if QUERY_PLANNING_ON:
-    agg_params.append(
-        # TODO: "collect" not supported with dask-expr yet
-        pytest.param("collect", marks=pytest.mark.xfail)
-    )
-else:
-    agg_params.append("collect")
-
 
 def assert_cudf_groupby_layers(ddf):
     for prefix in ("cudf-aggregate-chunk", "cudf-aggregate-agg"):
@@ -57,7 +47,7 @@ def pdf(request):
     return pdf
 
 
-@pytest.mark.parametrize("aggregation", agg_params)
+@pytest.mark.parametrize("aggregation", OPTIMIZED_AGGS)
 @pytest.mark.parametrize("series", [False, True])
 def test_groupby_basic(series, aggregation, pdf):
     gdf = cudf.DataFrame.from_pandas(pdf)
@@ -110,7 +100,7 @@ def test_groupby_cumulative(aggregation, pdf, series):
     dd.assert_eq(a, b)
 
 
-@pytest.mark.parametrize("aggregation", agg_params)
+@pytest.mark.parametrize("aggregation", OPTIMIZED_AGGS)
 @pytest.mark.parametrize(
     "func",
     [
@@ -579,8 +569,16 @@ def test_groupby_categorical_key():
     dd.assert_eq(expect, got)
 
 
-@xfail_dask_expr("as_index not supported in dask-expr")
-@pytest.mark.parametrize("as_index", [True, False])
+@pytest.mark.parametrize(
+    "as_index",
+    [
+        True,
+        pytest.param(
+            False,
+            marks=xfail_dask_expr("as_index not supported in dask-expr"),
+        ),
+    ],
+)
 @pytest.mark.parametrize("split_out", ["use_dask_default", 1, 2])
 @pytest.mark.parametrize("split_every", [False, 4])
 @pytest.mark.parametrize("npartitions", [1, 10])
@@ -603,10 +601,19 @@ def test_groupby_agg_params(npartitions, split_every, split_out, as_index):
     if split_out == "use_dask_default":
         split_kwargs.pop("split_out")
 
+    # Avoid using as_index when query-planning is enabled
+    if QUERY_PLANNING_ON:
+        with pytest.warns(FutureWarning, match="argument is now deprecated"):
+            # Should warn when `as_index` is used
+            ddf.groupby(["name", "a"], sort=False, as_index=as_index)
+        maybe_as_index = {"as_index": as_index} if as_index is False else {}
+    else:
+        maybe_as_index = {"as_index": as_index}
+
     # Check `sort=True` behavior
     if split_out == 1:
         gf = (
-            ddf.groupby(["name", "a"], sort=True, as_index=as_index)
+            ddf.groupby(["name", "a"], sort=True, **maybe_as_index)
             .aggregate(
                 agg_dict,
                 **split_kwargs,
@@ -628,7 +635,7 @@ def test_groupby_agg_params(npartitions, split_every, split_out, as_index):
             )
 
     # Full check (`sort=False`)
-    gr = ddf.groupby(["name", "a"], sort=False, as_index=as_index).aggregate(
+    gr = ddf.groupby(["name", "a"], sort=False, **maybe_as_index).aggregate(
         agg_dict,
         **split_kwargs,
     )

From e58838b6cc820fc89f1f67eb9117a3ee6ddeaa47 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 2 May 2024 11:59:55 -0400
Subject: [PATCH 130/842] Large strings support for cudf::clamp (#15533)

Replaces call to `make_strings_children` utility to use the gather-based `make_strings_column` function which is already optimized for long strings (and large strings).

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Karthikeyan (https://github.com/karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/15533
---
 cpp/src/replace/clamp.cu | 38 ++++++++++++++++----------------------
 1 file changed, 16 insertions(+), 22 deletions(-)

diff --git a/cpp/src/replace/clamp.cu b/cpp/src/replace/clamp.cu
index fe5a9cfbd71..31ffc76a4a5 100644
--- a/cpp/src/replace/clamp.cu
+++ b/cpp/src/replace/clamp.cu
@@ -28,7 +28,7 @@
 #include <cudf/replace.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/scalar/scalar_device_view.cuh>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_column_factories.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/types.hpp>
@@ -52,26 +52,22 @@ namespace {
 
 template <typename OptionalScalarIterator, typename ReplaceScalarIterator>
 struct clamp_strings_fn {
+  using string_index_pair = cudf::strings::detail::string_index_pair;
   column_device_view const d_strings;
   OptionalScalarIterator lo_itr;
   ReplaceScalarIterator lo_replace_itr;
   OptionalScalarIterator hi_itr;
   ReplaceScalarIterator hi_replace_itr;
-  size_type* d_offsets{};
-  char* d_chars{};
 
-  __device__ void operator()(size_type idx) const
+  __device__ string_index_pair operator()(size_type idx) const
   {
-    if (d_strings.is_null(idx)) {
-      if (!d_chars) { d_offsets[idx] = 0; }
-      return;
-    }
+    if (d_strings.is_null(idx)) { return string_index_pair{nullptr, 0}; }
+
     auto const element      = d_strings.element<string_view>(idx);
     auto const d_lo         = (*lo_itr).value_or(element);
     auto const d_hi         = (*hi_itr).value_or(element);
     auto const d_lo_replace = *(*lo_replace_itr);
     auto const d_hi_replace = *(*hi_replace_itr);
-    auto d_output           = d_chars ? d_chars + d_offsets[idx] : nullptr;
 
     auto d_str = [d_lo, d_lo_replace, d_hi, d_hi_replace, element] {
       if (element < d_lo) { return d_lo_replace; }
@@ -79,11 +75,9 @@ struct clamp_strings_fn {
       return element;
     }();
 
-    if (d_output) {
-      cudf::strings::detail::copy_string(d_output, d_str);
-    } else {
-      d_offsets[idx] = d_str.size_bytes();
-    }
+    // ensures an empty string is not converted to a null row
+    return !d_str.empty() ? string_index_pair{d_str.data(), d_str.size_bytes()}
+                          : string_index_pair{"", 0};
   }
 };
 
@@ -101,14 +95,14 @@ std::unique_ptr<cudf::column> clamp_string_column(strings_column_view const& inp
 
   auto fn = clamp_strings_fn<OptionalScalarIterator, ReplaceScalarIterator>{
     d_input, lo_itr, lo_replace_itr, hi_itr, hi_replace_itr};
-  auto [offsets_column, chars] =
-    cudf::strings::detail::make_strings_children(fn, input.size(), stream, mr);
-
-  return make_strings_column(input.size(),
-                             std::move(offsets_column),
-                             chars.release(),
-                             input.null_count(),
-                             std::move(cudf::detail::copy_bitmask(input.parent(), stream, mr)));
+  rmm::device_uvector<cudf::strings::detail::string_index_pair> indices(input.size(), stream);
+  thrust::transform(rmm::exec_policy_nosync(stream),
+                    thrust::counting_iterator<size_type>(0),
+                    thrust::counting_iterator<size_type>(input.size()),
+                    indices.begin(),
+                    fn);
+
+  return cudf::strings::detail::make_strings_column(indices.begin(), indices.end(), stream, mr);
 }
 
 template <typename T, typename OptionalScalarIterator, typename ReplaceScalarIterator>

From 68828708497252f9c9dae617b9f29ae7de448309 Mon Sep 17 00:00:00 2001
From: Ray Bell <rayjohnbell0@gmail.com>
Date: Thu, 2 May 2024 13:03:26 -0400
Subject: [PATCH 131/842] Doc: interleave columns pandas compat (#15383)

Add a `pandas_compat` note to `DataFrame.interleave_columns`

Authors:
  - Ray Bell (https://github.com/raybellwaves)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/15383
---
 python/cudf/cudf/core/dataframe.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 45bb66d5d4b..1e6ae861679 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -7556,6 +7556,12 @@ def interleave_columns(self):
         Returns
         -------
         The interleaved columns as a single column
+
+        .. pandas-compat::
+            **DataFrame.interleave_columns**
+
+            This method does not exist in pandas but it can be run
+            as ``pd.Series(np.vstack(df.to_numpy()).reshape((-1,)))``.
         """
         if ("category" == self.dtypes).any():
             raise ValueError(

From 4494991f73ed373bfb7a300859e5f234f94d8131 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com>
Date: Thu, 2 May 2024 12:25:00 -0500
Subject: [PATCH 132/842] Construct `pylibcudf` columns from objects supporting
 `__cuda_array_interface__` (#15615)

This PR allows zero copy construction of `pylibcudf` columns from device arrays via the `gpumemoryview` class. cc @mroeschke

Authors:
  - https://github.com/brandon-b-miller

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/15615
---
 python/cudf/cudf/_lib/pylibcudf/column.pyx    | 107 ++++++++++++++++++
 python/cudf/cudf/core/buffer/buffer.py        |  36 +-----
 .../test_column_from_device.py                |  51 +++++++++
 3 files changed, 162 insertions(+), 32 deletions(-)
 create mode 100644 python/cudf/cudf/pylibcudf_tests/test_column_from_device.py

diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pyx b/python/cudf/cudf/_lib/pylibcudf/column.pyx
index 2565e92d5c9..b9e5e48226d 100644
--- a/python/cudf/cudf/_lib/pylibcudf/column.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/column.pyx
@@ -16,6 +16,10 @@ from .scalar cimport Scalar
 from .types cimport DataType, type_id
 from .utils cimport int_to_bitmask_ptr, int_to_void_ptr
 
+import functools
+
+import numpy as np
+
 
 cdef class Column:
     """A container of nullable device data as a column of elements.
@@ -223,6 +227,51 @@ cdef class Column:
             c_result = move(make_column_from_scalar(dereference(c_scalar), size))
         return Column.from_libcudf(move(c_result))
 
+    @staticmethod
+    def from_cuda_array_interface_obj(object obj):
+        """Create a Column from an object with a CUDA array interface.
+
+        Parameters
+        ----------
+        obj : object
+            The object with the CUDA array interface to create a column from.
+
+        Returns
+        -------
+        Column
+            A Column containing the data from the CUDA array interface.
+
+        Notes
+        -----
+        Data is not copied when creating the column. The caller is
+        responsible for ensuring the data is not mutated unexpectedly while the
+        column is in use.
+        """
+        data = gpumemoryview(obj)
+        iface = data.__cuda_array_interface__()
+        if iface.get('mask') is not None:
+            raise ValueError("mask not yet supported.")
+
+        typestr = iface['typestr'][1:]
+        if not is_c_contiguous(
+            iface['shape'],
+            iface['strides'],
+            np.dtype(typestr).itemsize
+        ):
+            raise ValueError("Data must be C-contiguous")
+
+        data_type = _datatype_from_dtype_desc(typestr)
+        size = iface['shape'][0]
+        return Column(
+            data_type,
+            size,
+            data,
+            None,
+            0,
+            0,
+            []
+        )
+
     cpdef DataType type(self):
         """The type of data in the column."""
         return self._data_type
@@ -296,3 +345,61 @@ cdef class ListColumnView:
     cpdef offsets(self):
         """The offsets column of the underlying list column."""
         return self._column.child(1)
+
+
+@functools.cache
+def _datatype_from_dtype_desc(desc):
+    mapping = {
+        'u1': type_id.UINT8,
+        'u2': type_id.UINT16,
+        'u4': type_id.UINT32,
+        'u8': type_id.UINT64,
+        'i1': type_id.INT8,
+        'i2': type_id.INT16,
+        'i4': type_id.INT32,
+        'i8': type_id.INT64,
+        'f4': type_id.FLOAT32,
+        'f8': type_id.FLOAT64,
+        'b1': type_id.BOOL8,
+        'M8[s]': type_id.TIMESTAMP_SECONDS,
+        'M8[ms]': type_id.TIMESTAMP_MILLISECONDS,
+        'M8[us]': type_id.TIMESTAMP_MICROSECONDS,
+        'M8[ns]': type_id.TIMESTAMP_NANOSECONDS,
+        'm8[s]': type_id.DURATION_SECONDS,
+        'm8[ms]': type_id.DURATION_MILLISECONDS,
+        'm8[us]': type_id.DURATION_MICROSECONDS,
+        'm8[ns]': type_id.DURATION_NANOSECONDS,
+    }
+    if desc not in mapping:
+        raise ValueError(f"Unsupported dtype: {desc}")
+    return DataType(mapping[desc])
+
+
+def is_c_contiguous(
+    shape: Sequence[int], strides: Sequence[int], itemsize: int
+) -> bool:
+    """Determine if shape and strides are C-contiguous
+
+    Parameters
+    ----------
+    shape : Sequence[int]
+        Number of elements in each dimension.
+    strides : Sequence[int]
+        The stride of each dimension in bytes.
+    itemsize : int
+        Size of an element in bytes.
+
+    Return
+    ------
+    bool
+        The boolean answer.
+    """
+
+    if any(dim == 0 for dim in shape):
+        return True
+    cumulative_stride = itemsize
+    for dim, stride in zip(reversed(shape), reversed(strides)):
+        if dim > 1 and stride != cumulative_stride:
+            return False
+        cumulative_stride *= dim
+    return True
diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py
index b2aba4f978b..5c2d77033b8 100644
--- a/python/cudf/cudf/core/buffer/buffer.py
+++ b/python/cudf/cudf/core/buffer/buffer.py
@@ -6,7 +6,7 @@
 import pickle
 import weakref
 from types import SimpleNamespace
-from typing import Any, Dict, Literal, Mapping, Optional, Sequence, Tuple
+from typing import Any, Dict, Literal, Mapping, Optional, Tuple
 
 import numpy
 from typing_extensions import Self
@@ -480,36 +480,6 @@ def __str__(self) -> str:
         )
 
 
-def is_c_contiguous(
-    shape: Sequence[int], strides: Sequence[int], itemsize: int
-) -> bool:
-    """Determine if shape and strides are C-contiguous
-
-    Parameters
-    ----------
-    shape : Sequence[int]
-        Number of elements in each dimension.
-    strides : Sequence[int]
-        The stride of each dimension in bytes.
-    itemsize : int
-        Size of an element in bytes.
-
-    Return
-    ------
-    bool
-        The boolean answer.
-    """
-
-    if any(dim == 0 for dim in shape):
-        return True
-    cumulative_stride = itemsize
-    for dim, stride in zip(reversed(shape), reversed(strides)):
-        if dim > 1 and stride != cumulative_stride:
-            return False
-        cumulative_stride *= dim
-    return True
-
-
 def get_ptr_and_size(array_interface: Mapping) -> Tuple[int, int]:
     """Retrieve the pointer and size from an array interface.
 
@@ -531,7 +501,9 @@ def get_ptr_and_size(array_interface: Mapping) -> Tuple[int, int]:
     shape = array_interface["shape"] or (1,)
     strides = array_interface["strides"]
     itemsize = cudf.dtype(array_interface["typestr"]).itemsize
-    if strides is None or is_c_contiguous(shape, strides, itemsize):
+    if strides is None or cudf._lib.pylibcudf.column.is_c_contiguous(
+        shape, strides, itemsize
+    ):
         nelem = math.prod(shape)
         ptr = array_interface["data"][0] or 0
         return ptr, nelem * itemsize
diff --git a/python/cudf/cudf/pylibcudf_tests/test_column_from_device.py b/python/cudf/cudf/pylibcudf_tests/test_column_from_device.py
new file mode 100644
index 00000000000..764720d9de1
--- /dev/null
+++ b/python/cudf/cudf/pylibcudf_tests/test_column_from_device.py
@@ -0,0 +1,51 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pytest
+from utils import assert_column_eq
+
+import cudf
+from cudf._lib import pylibcudf as plc
+
+VALID_TYPES = [
+    pa.int8(),
+    pa.int16(),
+    pa.int32(),
+    pa.int64(),
+    pa.uint8(),
+    pa.uint16(),
+    pa.uint32(),
+    pa.uint64(),
+    pa.float32(),
+    pa.float64(),
+    pa.bool_(),
+    pa.timestamp("s"),
+    pa.timestamp("ms"),
+    pa.timestamp("us"),
+    pa.timestamp("ns"),
+    pa.duration("s"),
+    pa.duration("ms"),
+    pa.duration("us"),
+    pa.duration("ns"),
+]
+
+
+@pytest.fixture(params=VALID_TYPES, ids=repr)
+def valid_type(request):
+    return request.param
+
+
+@pytest.fixture
+def valid_column(valid_type):
+    if valid_type == pa.bool_():
+        return pa.array([True, False, True], type=valid_type)
+    return pa.array([1, 2, 3], type=valid_type)
+
+
+def test_from_cuda_array_interface(valid_column):
+    col = plc.column.Column.from_cuda_array_interface_obj(
+        cudf.Series(valid_column)
+    )
+    expect = valid_column
+
+    assert_column_eq(col, expect)

From 500cb29ce8f3043f0227a8852bad98c3f6c0dab2 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Thu, 2 May 2024 12:52:48 -0500
Subject: [PATCH 133/842] Check column type equality, handling nested types
 correctly. (#14531)

Addresses most of #14527. See also #14494.

This PR expands the use of `cudf::column_types_equal(lhs, rhs)` and adds new methods `cudf::column_scalar_types_equal`, `cudf::scalar_types_equal`, and `cudf::all_column_types_equal`.

These type check functions are now employed throughout the code base instead of raw checks like `a.type() == b.type()` because those do not correctly handle nested types.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Lawrence Mitchell (https://github.com/wence-)
  - Yunsong Wang (https://github.com/PointKernel)

URL: https://github.com/rapidsai/cudf/pull/14531
---
 .../developer_guide/DEVELOPER_GUIDE.md        |  15 +-
 cpp/include/cudf/detail/scatter.cuh           |   7 +-
 cpp/include/cudf/lists/detail/scatter.cuh     |   2 +-
 cpp/include/cudf/table/table_view.hpp         |  11 +-
 cpp/include/cudf/utilities/type_checks.hpp    | 106 +++++++++-
 cpp/src/copying/concatenate.cu                |  11 +-
 cpp/src/copying/copy.cu                       |  18 +-
 cpp/src/copying/copy_range.cu                 |  10 +-
 cpp/src/copying/scatter.cu                    |  39 ++--
 cpp/src/copying/shift.cu                      |   3 +-
 cpp/src/dictionary/add_keys.cu                |   5 +-
 cpp/src/dictionary/detail/concatenate.cu      |  20 +-
 cpp/src/dictionary/remove_keys.cu             |   6 +-
 cpp/src/dictionary/replace.cu                 |  10 +-
 cpp/src/dictionary/search.cu                  |  18 +-
 cpp/src/dictionary/set_keys.cu                |   6 +-
 cpp/src/filling/fill.cu                       |  11 +-
 cpp/src/filling/sequence.cu                   |   5 +-
 cpp/src/groupby/groupby.cu                    |  16 +-
 cpp/src/interop/dlpack.cpp                    |   7 +-
 cpp/src/join/hash_join.cu                     |  11 +-
 cpp/src/labeling/label_bins.cu                |   7 +-
 cpp/src/lists/combine/concatenate_rows.cu     |  12 +-
 cpp/src/lists/contains.cu                     |   3 +-
 cpp/src/lists/sequences.cu                    |  11 +-
 cpp/src/lists/set_operations.cu               |   2 +-
 cpp/src/merge/merge.cu                        |   1 +
 cpp/src/reductions/reductions.cpp             |   7 +-
 cpp/src/reductions/segmented/reductions.cpp   |   6 +-
 cpp/src/replace/clamp.cu                      |  17 +-
 cpp/src/replace/nulls.cu                      |  12 +-
 cpp/src/replace/replace.cu                    |   8 +-
 cpp/src/rolling/detail/lead_lag_nested.cuh    |   7 +-
 cpp/src/search/contains_scalar.cu             |  12 +-
 cpp/src/search/contains_table.cu              |   1 +
 cpp/src/strings/slice.cu                      |  14 +-
 cpp/src/table/table_view.cpp                  |  23 +-
 cpp/src/transform/one_hot_encode.cu           |   5 +-
 cpp/src/utilities/type_checks.cpp             | 126 +++++++++--
 cpp/tests/copying/concatenate_tests.cpp       |  13 +-
 cpp/tests/copying/copy_range_tests.cpp        |   2 +-
 cpp/tests/copying/copy_tests.cpp              |   2 +-
 cpp/tests/copying/get_value_tests.cpp         |  11 +-
 cpp/tests/dictionary/add_keys_test.cpp        |   3 +-
 cpp/tests/dictionary/remove_keys_test.cpp     |   3 +-
 cpp/tests/dictionary/scatter_test.cpp         |   2 +-
 cpp/tests/dictionary/search_test.cpp          |   4 +-
 cpp/tests/dictionary/set_keys_test.cpp        |   3 +-
 cpp/tests/filling/fill_tests.cpp              |   4 +-
 cpp/tests/filling/sequence_tests.cpp          |   8 +-
 cpp/tests/groupby/shift_tests.cpp             |   6 +-
 cpp/tests/interop/dlpack_test.cpp             |   3 +-
 cpp/tests/io/parquet_writer_test.cpp          |   4 +-
 cpp/tests/labeling/label_bins_tests.cpp       |   7 +-
 .../lists/combine/concatenate_rows_tests.cpp  |   5 +-
 cpp/tests/lists/sequences_tests.cpp           |   7 +-
 cpp/tests/replace/clamp_test.cpp              |  11 +-
 cpp/tests/replace/replace_nulls_tests.cpp     |  10 +-
 cpp/tests/replace/replace_tests.cpp           |   3 +-
 cpp/tests/transform/one_hot_encode_tests.cpp  |   5 +-
 cpp/tests/utilities/column_utilities.cu       |  10 +-
 .../utilities_tests/type_check_tests.cpp      | 197 ++++++++++--------
 62 files changed, 615 insertions(+), 319 deletions(-)

diff --git a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md
index 23b129fdf4b..05f8e4585cc 100644
--- a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md
+++ b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md
@@ -943,13 +943,14 @@ Use the `CUDF_EXPECTS` macro to enforce runtime conditions necessary for correct
 Example usage:
 
 ```c++
-CUDF_EXPECTS(lhs.type() == rhs.type(), "Column type mismatch");
+CUDF_EXPECTS(cudf::have_same_types(lhs, rhs), "Type mismatch", cudf::data_type_error);
 ```
 
 The first argument is the conditional expression expected to resolve to `true` under normal
-conditions. If the conditional evaluates to `false`, then an error has occurred and an instance of
-`cudf::logic_error` is thrown. The second argument to `CUDF_EXPECTS` is a short description of the
-error that has occurred and is used for the exception's `what()` message.
+conditions. The second argument to `CUDF_EXPECTS` is a short description of the error that has
+occurred and is used for the exception's `what()` message. If the conditional evaluates to
+`false`, then an error has occurred and an instance of the exception class in the third argument
+(or the default, `cudf::logic_error`) is thrown.
 
 There are times where a particular code path, if reached, should indicate an error no matter what.
 For example, often the `default` case of a `switch` statement represents an invalid alternative.
@@ -1048,6 +1049,12 @@ types such as numeric types and timestamps/durations, adding support for nested
 Enabling an algorithm differently for different types uses either template specialization or SFINAE,
 as discussed in [Specializing Type-Dispatched Code Paths](#specializing-type-dispatched-code-paths).
 
+## Comparing Data Types
+
+When comparing the data types of two columns or scalars, do not directly compare
+`a.type() == b.type()`. Nested types such as lists of structs of integers will not be handled
+properly if only the top level type is compared. Instead, use the `cudf::have_same_types` function.
+
 # Type Dispatcher
 
 libcudf stores data (for columns and scalars) "type erased" in `void*` device memory. This
diff --git a/cpp/include/cudf/detail/scatter.cuh b/cpp/include/cudf/detail/scatter.cuh
index 7eb661f7833..80bc87731ca 100644
--- a/cpp/include/cudf/detail/scatter.cuh
+++ b/cpp/include/cudf/detail/scatter.cuh
@@ -29,7 +29,9 @@
 #include <cudf/strings/detail/scatter.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/error.hpp>
 #include <cudf/utilities/traits.hpp>
+#include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
@@ -213,8 +215,9 @@ struct column_scatterer_impl<dictionary32> {
     // check the keys match
     dictionary_column_view const source(source_in);
     dictionary_column_view const target(target_in);
-    CUDF_EXPECTS(source.keys().type() == target.keys().type(),
-                 "scatter dictionary keys must be the same type");
+    CUDF_EXPECTS(cudf::have_same_types(source.keys(), target.keys()),
+                 "scatter dictionary keys must be the same type",
+                 cudf::data_type_error);
 
     // first combine keys so both dictionaries have the same set
     auto target_matched    = dictionary::detail::add_keys(target, source.keys(), stream, mr);
diff --git a/cpp/include/cudf/lists/detail/scatter.cuh b/cpp/include/cudf/lists/detail/scatter.cuh
index d0d5b1ad823..c550ad5b94f 100644
--- a/cpp/include/cudf/lists/detail/scatter.cuh
+++ b/cpp/include/cudf/lists/detail/scatter.cuh
@@ -101,7 +101,7 @@ std::unique_ptr<column> scatter_impl(rmm::device_uvector<unbound_list_view> cons
                                      rmm::cuda_stream_view stream,
                                      rmm::device_async_resource_ref mr)
 {
-  CUDF_EXPECTS(column_types_equal(source, target), "Mismatched column types.");
+  CUDF_EXPECTS(have_same_types(source, target), "Mismatched column types.");
 
   auto const child_column_type = lists_column_view(target).child().type();
 
diff --git a/cpp/include/cudf/table/table_view.hpp b/cpp/include/cudf/table/table_view.hpp
index 4f3b23747e6..ad12b1eef4e 100644
--- a/cpp/include/cudf/table/table_view.hpp
+++ b/cpp/include/cudf/table/table_view.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -339,15 +339,6 @@ bool has_nested_nullable_columns(table_view const& input);
  */
 std::vector<column_view> get_nullable_columns(table_view const& table);
 
-/**
- * @brief Checks if two `table_view`s have columns of same types
- *
- * @param lhs left-side table_view operand
- * @param rhs right-side table_view operand
- * @return boolean comparison result
- */
-bool have_same_types(table_view const& lhs, table_view const& rhs);
-
 /**
  * @brief Copy column_views from a table_view into another table_view according to
  * a column indices map.
diff --git a/cpp/include/cudf/utilities/type_checks.hpp b/cpp/include/cudf/utilities/type_checks.hpp
index b925fc8ae92..fd3b0581c11 100644
--- a/cpp/include/cudf/utilities/type_checks.hpp
+++ b/cpp/include/cudf/utilities/type_checks.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,11 +16,16 @@
 #pragma once
 
 #include <cudf/column/column_view.hpp>
+#include <cudf/scalar/scalar.hpp>
+
+#include <algorithm>
 
 namespace cudf {
 
 /**
- * @brief Compares the type of two `column_view`s
+ * @brief Compare the types of two `column_view`s
+ *
+ * @deprecated Since 24.06. Use cudf::have_same_types instead.
  *
  * This function returns true if the type of `lhs` equals that of `rhs`.
  * - For fixed point types, the scale is compared.
@@ -34,10 +39,11 @@ namespace cudf {
  * @param rhs The second `column_view` to compare
  * @return true if column types match
  */
-bool column_types_equal(column_view const& lhs, column_view const& rhs);
+[[deprecated]] bool column_types_equal(column_view const& lhs, column_view const& rhs);
 
 /**
  * @brief Compare the type IDs of two `column_view`s
+ *
  * This function returns true if the type of `lhs` equals that of `rhs`.
  * - For fixed point types, the scale is ignored.
  *
@@ -47,4 +53,98 @@ bool column_types_equal(column_view const& lhs, column_view const& rhs);
  */
 bool column_types_equivalent(column_view const& lhs, column_view const& rhs);
 
+/**
+ * @brief Compares the type of two `column_view`s
+ *
+ * This function returns true if the type of `lhs` equals that of `rhs`.
+ * - For fixed point types, the scale is compared.
+ * - For dictionary types, the type of the keys are compared if both are
+ *   non-empty columns.
+ * - For lists types, the type of child columns are compared recursively.
+ * - For struct types, the type of each field are compared in order.
+ * - For all other types, the `id` of `data_type` is compared.
+ *
+ * @param lhs The first `column_view` to compare
+ * @param rhs The second `column_view` to compare
+ * @return true if types match
+ */
+bool have_same_types(column_view const& lhs, column_view const& rhs);
+
+/**
+ * @brief Compare the types of a `column_view` and a `scalar`
+ *
+ * This function returns true if the type of `lhs` equals that of `rhs`.
+ * - For fixed point types, the scale is compared.
+ * - For dictionary column types, the type of the keys is compared to the
+ *   scalar type.
+ * - For lists types, the types of child columns are compared recursively.
+ * - For struct types, the types of each field are compared in order.
+ * - For all other types, the `id` of `data_type` is compared.
+ *
+ * @param lhs The `column_view` to compare
+ * @param rhs The `scalar` to compare
+ * @return true if types match
+ */
+bool have_same_types(column_view const& lhs, scalar const& rhs);
+
+/**
+ * @brief Compare the types of a `scalar` and a `column_view`
+ *
+ * This function returns true if the type of `lhs` equals that of `rhs`.
+ * - For fixed point types, the scale is compared.
+ * - For dictionary column types, the type of the keys is compared to the
+ *   scalar type.
+ * - For lists types, the types of child columns are compared recursively.
+ * - For struct types, the types of each field are compared in order.
+ * - For all other types, the `id` of `data_type` is compared.
+ *
+ * @param lhs The `scalar` to compare
+ * @param rhs The `column_view` to compare
+ * @return true if types match
+ */
+bool have_same_types(scalar const& lhs, column_view const& rhs);
+
+/**
+ * @brief Compare the types of two `scalar`s
+ *
+ * This function returns true if the type of `lhs` equals that of `rhs`.
+ * - For fixed point types, the scale is compared.
+ * - For lists types, the types of child columns are compared recursively.
+ * - For struct types, the types of each field are compared in order.
+ * - For all other types, the `id` of `data_type` is compared.
+ *
+ * @param lhs The first `scalar` to compare
+ * @param rhs The second `scalar` to compare
+ * @return true if types match
+ */
+bool have_same_types(scalar const& lhs, scalar const& rhs);
+
+/**
+ * @brief Checks if two `table_view`s have columns of same types
+ *
+ * @param lhs left-side table_view operand
+ * @param rhs right-side table_view operand
+ * @return boolean comparison result
+ */
+bool have_same_types(table_view const& lhs, table_view const& rhs);
+
+/**
+ * @brief Compare the types of a range of `column_view` or `scalar` objects
+ *
+ * This function returns true if all objects in the range have the same type, in the sense of
+ * cudf::have_same_types.
+ *
+ * @tparam ForwardIt Forward iterator
+ * @param first The first iterator
+ * @param last The last iterator
+ * @return true if all types match
+ */
+template <typename ForwardIt>
+inline bool all_have_same_types(ForwardIt first, ForwardIt last)
+{
+  return first == last || std::all_of(std::next(first), last, [want = *first](auto const& c) {
+           return cudf::have_same_types(want, c);
+         });
+}
+
 }  // namespace cudf
diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu
index 7c57be8e7c0..b1136a9eeb3 100644
--- a/cpp/src/copying/concatenate.cu
+++ b/cpp/src/copying/concatenate.cu
@@ -30,6 +30,8 @@
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_device_view.cuh>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/error.hpp>
+#include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
@@ -461,12 +463,9 @@ void traverse_children::operator()<cudf::list_view>(host_span<column_view const>
  */
 void bounds_and_type_check(host_span<column_view const> cols, rmm::cuda_stream_view stream)
 {
-  CUDF_EXPECTS(std::all_of(cols.begin(),
-                           cols.end(),
-                           [expected_type = cols.front().type()](auto const& c) {
-                             return c.type() == expected_type;
-                           }),
-               "Type mismatch in columns to concatenate.");
+  CUDF_EXPECTS(cudf::all_have_same_types(cols.begin(), cols.end()),
+               "Type mismatch in columns to concatenate.",
+               cudf::data_type_error);
 
   // total size of all concatenated rows
   size_t const total_row_count =
diff --git a/cpp/src/copying/copy.cu b/cpp/src/copying/copy.cu
index 92fb2e61741..e86a1f8d6f1 100644
--- a/cpp/src/copying/copy.cu
+++ b/cpp/src/copying/copy.cu
@@ -26,6 +26,7 @@
 #include <cudf/strings/string_view.cuh>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/traits.hpp>
+#include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
@@ -362,9 +363,10 @@ std::unique_ptr<column> copy_if_else(column_view const& lhs,
   CUDF_EXPECTS(boolean_mask.size() == lhs.size(),
                "Boolean mask column must be the same size as lhs and rhs columns",
                std::invalid_argument);
-  CUDF_EXPECTS(lhs.size() == rhs.size(), "Both columns must be of the size", std::invalid_argument);
   CUDF_EXPECTS(
-    lhs.type() == rhs.type(), "Both inputs must be of the same type", cudf::data_type_error);
+    lhs.size() == rhs.size(), "Both columns must be of the same size", std::invalid_argument);
+  CUDF_EXPECTS(
+    cudf::have_same_types(lhs, rhs), "Both inputs must be of the same type", cudf::data_type_error);
 
   return copy_if_else(lhs, rhs, lhs.has_nulls(), rhs.has_nulls(), boolean_mask, stream, mr);
 }
@@ -378,11 +380,8 @@ std::unique_ptr<column> copy_if_else(scalar const& lhs,
   CUDF_EXPECTS(boolean_mask.size() == rhs.size(),
                "Boolean mask column must be the same size as rhs column",
                std::invalid_argument);
-
-  auto rhs_type =
-    cudf::is_dictionary(rhs.type()) ? cudf::dictionary_column_view(rhs).keys_type() : rhs.type();
   CUDF_EXPECTS(
-    lhs.type() == rhs_type, "Both inputs must be of the same type", cudf::data_type_error);
+    cudf::have_same_types(rhs, lhs), "Both inputs must be of the same type", cudf::data_type_error);
 
   return copy_if_else(lhs, rhs, !lhs.is_valid(stream), rhs.has_nulls(), boolean_mask, stream, mr);
 }
@@ -396,11 +395,8 @@ std::unique_ptr<column> copy_if_else(column_view const& lhs,
   CUDF_EXPECTS(boolean_mask.size() == lhs.size(),
                "Boolean mask column must be the same size as lhs column",
                std::invalid_argument);
-
-  auto lhs_type =
-    cudf::is_dictionary(lhs.type()) ? cudf::dictionary_column_view(lhs).keys_type() : lhs.type();
   CUDF_EXPECTS(
-    lhs_type == rhs.type(), "Both inputs must be of the same type", cudf::data_type_error);
+    cudf::have_same_types(lhs, rhs), "Both inputs must be of the same type", cudf::data_type_error);
 
   return copy_if_else(lhs, rhs, lhs.has_nulls(), !rhs.is_valid(stream), boolean_mask, stream, mr);
 }
@@ -412,7 +408,7 @@ std::unique_ptr<column> copy_if_else(scalar const& lhs,
                                      rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(
-    lhs.type() == rhs.type(), "Both inputs must be of the same type", cudf::data_type_error);
+    cudf::have_same_types(lhs, rhs), "Both inputs must be of the same type", cudf::data_type_error);
   return copy_if_else(
     lhs, rhs, !lhs.is_valid(stream), !rhs.is_valid(stream), boolean_mask, stream, mr);
 }
diff --git a/cpp/src/copying/copy_range.cu b/cpp/src/copying/copy_range.cu
index d2ea7036952..dd18f99a3c8 100644
--- a/cpp/src/copying/copy_range.cu
+++ b/cpp/src/copying/copy_range.cu
@@ -32,6 +32,7 @@
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/traits.hpp>
+#include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
@@ -147,8 +148,9 @@ std::unique_ptr<cudf::column> out_of_place_copy_range_dispatch::operator()<cudf:
   // check the keys in the source and target
   cudf::dictionary_column_view const dict_source(source);
   cudf::dictionary_column_view const dict_target(target);
-  CUDF_EXPECTS(dict_source.keys().type() == dict_target.keys().type(),
-               "dictionary keys must be the same type");
+  CUDF_EXPECTS(cudf::have_same_types(dict_source.keys(), dict_target.keys()),
+               "dictionary keys must be the same type",
+               cudf::data_type_error);
 
   // combine keys so both dictionaries have the same set
   auto target_matched =
@@ -211,7 +213,7 @@ void copy_range_in_place(column_view const& source,
                  (target_begin <= target.size() - (source_end - source_begin)),
                "Range is out of bounds.",
                std::out_of_range);
-  CUDF_EXPECTS(target.type() == source.type(), "Data type mismatch.", cudf::data_type_error);
+  CUDF_EXPECTS(cudf::have_same_types(target, source), "Data type mismatch.", cudf::data_type_error);
   CUDF_EXPECTS(target.nullable() || not source.has_nulls(),
                "target should be nullable if source has null values.",
                std::invalid_argument);
@@ -239,7 +241,7 @@ std::unique_ptr<column> copy_range(column_view const& source,
                  (target_begin <= target.size() - (source_end - source_begin)),
                "Range is out of bounds.",
                std::out_of_range);
-  CUDF_EXPECTS(target.type() == source.type(), "Data type mismatch.", cudf::data_type_error);
+  CUDF_EXPECTS(cudf::have_same_types(target, source), "Data type mismatch.", cudf::data_type_error);
 
   return cudf::type_dispatcher<dispatch_storage_type>(
     target.type(),
diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu
index cfcbe4724df..993ee074f14 100644
--- a/cpp/src/copying/scatter.cu
+++ b/cpp/src/copying/scatter.cu
@@ -32,6 +32,8 @@
 #include <cudf/structs/struct_view.hpp>
 #include <cudf/table/table_device_view.cuh>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/error.hpp>
+#include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
@@ -112,7 +114,7 @@ struct column_scalar_scatterer_impl {
                                      rmm::cuda_stream_view stream,
                                      rmm::device_async_resource_ref mr) const
   {
-    CUDF_EXPECTS(source.get().type() == target.type(),
+    CUDF_EXPECTS(cudf::have_same_types(target, source.get()),
                  "scalar and column types must match",
                  cudf::data_type_error);
 
@@ -145,7 +147,7 @@ struct column_scalar_scatterer_impl<string_view, MapIterator> {
                                      rmm::cuda_stream_view stream,
                                      rmm::device_async_resource_ref mr) const
   {
-    CUDF_EXPECTS(source.get().type() == target.type(),
+    CUDF_EXPECTS(cudf::have_same_types(target, source.get()),
                  "scalar and column types must match",
                  cudf::data_type_error);
 
@@ -315,12 +317,7 @@ std::unique_ptr<table> scatter(table_view const& source,
   CUDF_EXPECTS(scatter_map.size() <= source.num_rows(),
                "Size of scatter map must be equal to or less than source rows",
                std::invalid_argument);
-  CUDF_EXPECTS(std::equal(source.begin(),
-                          source.end(),
-                          target.begin(),
-                          [](auto const& col1, auto const& col2) {
-                            return col1.type().id() == col2.type().id();
-                          }),
+  CUDF_EXPECTS(cudf::have_same_types(source, target),
                "Column types do not match between source and target",
                cudf::data_type_error);
   CUDF_EXPECTS(not scatter_map.has_nulls(), "Scatter map contains nulls", std::invalid_argument);
@@ -452,14 +449,9 @@ std::unique_ptr<table> boolean_mask_scatter(table_view const& input,
                "Mask must be of Boolean type",
                cudf::data_type_error);
   // Count valid pair of input and columns as per type at each column index i
-  CUDF_EXPECTS(
-    std::all_of(thrust::counting_iterator<size_type>(0),
-                thrust::counting_iterator<size_type>(target.num_columns()),
-                [&input, &target](auto index) {
-                  return ((input.column(index).type().id()) == (target.column(index).type().id()));
-                }),
-    "Type mismatch in input column and target column",
-    cudf::data_type_error);
+  CUDF_EXPECTS(cudf::have_same_types(input, target),
+               "Type mismatch in input column and target column",
+               cudf::data_type_error);
 
   if (target.num_rows() != 0) {
     std::vector<std::unique_ptr<column>> out_columns(target.num_columns());
@@ -496,14 +488,13 @@ std::unique_ptr<table> boolean_mask_scatter(
                cudf::data_type_error);
 
   // Count valid pair of input and columns as per type at each column/scalar index i
-  CUDF_EXPECTS(
-    std::all_of(thrust::counting_iterator<size_type>(0),
-                thrust::counting_iterator<size_type>(target.num_columns()),
-                [&input, &target](auto index) {
-                  return (input[index].get().type().id() == target.column(index).type().id());
-                }),
-    "Type mismatch in input scalar and target column",
-    cudf::data_type_error);
+  CUDF_EXPECTS(std::all_of(thrust::counting_iterator<size_type>(0),
+                           thrust::counting_iterator<size_type>(target.num_columns()),
+                           [&input, &target](auto index) {
+                             return cudf::have_same_types(target.column(index), input[index].get());
+                           }),
+               "Type mismatch in input scalar and target column",
+               cudf::data_type_error);
 
   if (target.num_rows() != 0) {
     std::vector<std::unique_ptr<column>> out_columns(target.num_columns());
diff --git a/cpp/src/copying/shift.cu b/cpp/src/copying/shift.cu
index bdc741887f7..91254f21170 100644
--- a/cpp/src/copying/shift.cu
+++ b/cpp/src/copying/shift.cu
@@ -26,6 +26,7 @@
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/traits.hpp>
+#include <cudf/utilities/type_checks.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -158,7 +159,7 @@ std::unique_ptr<column> shift(column_view const& input,
                               rmm::cuda_stream_view stream,
                               rmm::device_async_resource_ref mr)
 {
-  CUDF_EXPECTS(input.type() == fill_value.type(),
+  CUDF_EXPECTS(cudf::have_same_types(input, fill_value),
                "shift requires each fill value type to match the corresponding column type.",
                cudf::data_type_error);
 
diff --git a/cpp/src/dictionary/add_keys.cu b/cpp/src/dictionary/add_keys.cu
index 5fd21ee0094..0ed9006f88b 100644
--- a/cpp/src/dictionary/add_keys.cu
+++ b/cpp/src/dictionary/add_keys.cu
@@ -29,6 +29,8 @@
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/error.hpp>
+#include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
@@ -54,7 +56,8 @@ std::unique_ptr<column> add_keys(dictionary_column_view const& dictionary_column
 {
   CUDF_EXPECTS(!new_keys.has_nulls(), "Keys must not have nulls");
   auto old_keys = dictionary_column.keys();  // [a,b,c,d,f]
-  CUDF_EXPECTS(new_keys.type() == old_keys.type(), "Keys must be the same type");
+  CUDF_EXPECTS(
+    cudf::have_same_types(new_keys, old_keys), "Keys must be the same type", cudf::data_type_error);
   // first, concatenate the keys together
   // [a,b,c,d,f] + [d,b,e] = [a,b,c,d,f,d,b,e]
   auto combined_keys = cudf::detail::concatenate(
diff --git a/cpp/src/dictionary/detail/concatenate.cu b/cpp/src/dictionary/detail/concatenate.cu
index 62a6c816493..fdc3d9d0ecf 100644
--- a/cpp/src/dictionary/detail/concatenate.cu
+++ b/cpp/src/dictionary/detail/concatenate.cu
@@ -26,6 +26,8 @@
 #include <cudf/dictionary/dictionary_factories.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
+#include <cudf/utilities/error.hpp>
+#include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
@@ -82,13 +84,13 @@ struct compute_children_offsets_fn {
   }
 
   /**
-   * @brief Return the first keys().type of the dictionary columns.
+   * @brief Return the first keys() of the dictionary columns.
    */
-  data_type get_keys_type()
+  column_view get_keys()
   {
     auto const view(*std::find_if(
       columns_ptrs.begin(), columns_ptrs.end(), [](auto pcv) { return pcv->size() > 0; }));
-    return dictionary_column_view(*view).keys().type();
+    return dictionary_column_view(*view).keys();
   }
 
   /**
@@ -214,14 +216,16 @@ std::unique_ptr<column> concatenate(host_span<column_view const> columns,
 
   // concatenate the keys (and check the keys match)
   compute_children_offsets_fn child_offsets_fn{columns};
-  auto keys_type = child_offsets_fn.get_keys_type();
+  auto expected_keys = child_offsets_fn.get_keys();
   std::vector<column_view> keys_views(columns.size());
-  std::transform(columns.begin(), columns.end(), keys_views.begin(), [keys_type](auto cv) {
+  std::transform(columns.begin(), columns.end(), keys_views.begin(), [expected_keys](auto cv) {
     auto dict_view = dictionary_column_view(cv);
     // empty column may not have keys so we create an empty column_view place-holder
-    if (dict_view.is_empty()) return column_view{keys_type, 0, nullptr, nullptr, 0};
+    if (dict_view.is_empty()) return column_view{expected_keys.type(), 0, nullptr, nullptr, 0};
     auto keys = dict_view.keys();
-    CUDF_EXPECTS(keys.type() == keys_type, "key types of all dictionary columns must match");
+    CUDF_EXPECTS(cudf::have_same_types(keys, expected_keys),
+                 "key types of all dictionary columns must match",
+                 cudf::data_type_error);
     return keys;
   });
   auto all_keys =
@@ -275,7 +279,7 @@ std::unique_ptr<column> concatenate(host_span<column_view const> columns,
 
   // now recompute the indices values for the new keys_column;
   // the keys offsets (pair.first) are for mapping to the input keys
-  auto indices_column = type_dispatcher(keys_type,
+  auto indices_column = type_dispatcher(expected_keys.type(),
                                         dispatch_compute_indices{},
                                         all_keys->view(),     // old keys
                                         all_indices->view(),  // old indices
diff --git a/cpp/src/dictionary/remove_keys.cu b/cpp/src/dictionary/remove_keys.cu
index 718ca419289..35387efa56b 100644
--- a/cpp/src/dictionary/remove_keys.cu
+++ b/cpp/src/dictionary/remove_keys.cu
@@ -26,6 +26,8 @@
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/error.hpp>
+#include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
@@ -155,7 +157,9 @@ std::unique_ptr<column> remove_keys(dictionary_column_view const& dictionary_col
 {
   CUDF_EXPECTS(!keys_to_remove.has_nulls(), "keys_to_remove must not have nulls");
   auto const keys_view = dictionary_column.keys();
-  CUDF_EXPECTS(keys_view.type() == keys_to_remove.type(), "keys types must match");
+  CUDF_EXPECTS(cudf::have_same_types(keys_view, keys_to_remove),
+               "keys types must match",
+               cudf::data_type_error);
 
   // locate keys to remove by searching the keys column
   auto const matches = cudf::detail::contains(keys_to_remove, keys_view, stream, mr);
diff --git a/cpp/src/dictionary/replace.cu b/cpp/src/dictionary/replace.cu
index bb6b08c243d..bc17dfd4bab 100644
--- a/cpp/src/dictionary/replace.cu
+++ b/cpp/src/dictionary/replace.cu
@@ -24,6 +24,8 @@
 #include <cudf/dictionary/detail/search.hpp>
 #include <cudf/dictionary/detail/update_keys.hpp>
 #include <cudf/dictionary/dictionary_factories.hpp>
+#include <cudf/utilities/error.hpp>
+#include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
@@ -84,7 +86,9 @@ std::unique_ptr<column> replace_nulls(dictionary_column_view const& input,
 {
   if (input.is_empty()) { return cudf::empty_like(input.parent()); }
   if (!input.has_nulls()) { return std::make_unique<cudf::column>(input.parent(), stream, mr); }
-  CUDF_EXPECTS(input.keys().type() == replacement.keys().type(), "keys must match");
+  CUDF_EXPECTS(cudf::have_same_types(input.keys(), replacement.keys()),
+               "keys must match",
+               cudf::data_type_error);
   CUDF_EXPECTS(replacement.size() == input.size(), "column sizes must match");
 
   // first combine the keys so both input dictionaries have the same set
@@ -119,7 +123,9 @@ std::unique_ptr<column> replace_nulls(dictionary_column_view const& input,
   if (!input.has_nulls() || !replacement.is_valid(stream)) {
     return std::make_unique<cudf::column>(input.parent(), stream, mr);
   }
-  CUDF_EXPECTS(input.keys().type() == replacement.type(), "keys must match scalar type");
+  CUDF_EXPECTS(cudf::have_same_types(input.parent(), replacement),
+               "keys must match scalar type",
+               cudf::data_type_error);
 
   // first add the replacement to the keys so only the indices need to be processed
   auto input_matched = dictionary::detail::add_keys(
diff --git a/cpp/src/dictionary/search.cu b/cpp/src/dictionary/search.cu
index 680eadddba8..231619836f9 100644
--- a/cpp/src/dictionary/search.cu
+++ b/cpp/src/dictionary/search.cu
@@ -19,7 +19,9 @@
 #include <cudf/dictionary/detail/search.hpp>
 #include <cudf/dictionary/search.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/error.hpp>
 #include <cudf/utilities/traits.hpp>
+#include <cudf/utilities/type_checks.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -72,10 +74,12 @@ struct find_index_fn {
                                      rmm::cuda_stream_view stream,
                                      rmm::device_async_resource_ref mr) const
   {
-    if (!key.is_valid(stream))
+    if (!key.is_valid(stream)) {
       return type_dispatcher(input.indices().type(), dispatch_scalar_index{}, 0, false, stream, mr);
-    CUDF_EXPECTS(input.keys().type() == key.type(),
-                 "search key type must match dictionary keys type");
+    }
+    CUDF_EXPECTS(cudf::have_same_types(input.parent(), key),
+                 "search key type must match dictionary keys type",
+                 cudf::data_type_error);
 
     using ScalarType = cudf::scalar_type_t<Element>;
     auto find_key    = static_cast<ScalarType const&>(key).value(stream);
@@ -114,10 +118,12 @@ struct find_insert_index_fn {
                                      rmm::cuda_stream_view stream,
                                      rmm::device_async_resource_ref mr) const
   {
-    if (!key.is_valid(stream))
+    if (!key.is_valid(stream)) {
       return type_dispatcher(input.indices().type(), dispatch_scalar_index{}, 0, false, stream, mr);
-    CUDF_EXPECTS(input.keys().type() == key.type(),
-                 "search key type must match dictionary keys type");
+    }
+    CUDF_EXPECTS(cudf::have_same_types(input.parent(), key),
+                 "search key type must match dictionary keys type",
+                 cudf::data_type_error);
 
     using ScalarType = cudf::scalar_type_t<Element>;
     auto find_key    = static_cast<ScalarType const&>(key).value(stream);
diff --git a/cpp/src/dictionary/set_keys.cu b/cpp/src/dictionary/set_keys.cu
index b56eec9401a..08a33d40abe 100644
--- a/cpp/src/dictionary/set_keys.cu
+++ b/cpp/src/dictionary/set_keys.cu
@@ -29,6 +29,8 @@
 #include <cudf/dictionary/dictionary_factories.hpp>
 #include <cudf/stream_compaction.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/error.hpp>
+#include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
@@ -116,7 +118,6 @@ struct dispatch_compute_indices {
 
 }  // namespace
 
-//
 std::unique_ptr<column> set_keys(dictionary_column_view const& dictionary_column,
                                  column_view const& new_keys,
                                  rmm::cuda_stream_view stream,
@@ -124,7 +125,8 @@ std::unique_ptr<column> set_keys(dictionary_column_view const& dictionary_column
 {
   CUDF_EXPECTS(!new_keys.has_nulls(), "keys parameter must not have nulls");
   auto keys = dictionary_column.keys();
-  CUDF_EXPECTS(keys.type() == new_keys.type(), "keys types must match");
+  CUDF_EXPECTS(
+    cudf::have_same_types(keys, new_keys), "keys types must match", cudf::data_type_error);
 
   // copy the keys -- use cudf::distinct to make sure there are no duplicates,
   // then sort the results.
diff --git a/cpp/src/filling/fill.cu b/cpp/src/filling/fill.cu
index c4d786bd73b..1fc9ed31c09 100644
--- a/cpp/src/filling/fill.cu
+++ b/cpp/src/filling/fill.cu
@@ -33,6 +33,7 @@
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/traits.hpp>
+#include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
@@ -110,7 +111,7 @@ struct out_of_place_fill_range_dispatch {
                                            rmm::cuda_stream_view stream,
                                            rmm::device_async_resource_ref mr)
   {
-    CUDF_EXPECTS(input.type() == value.type(), "Data type mismatch.");
+    CUDF_EXPECTS(cudf::have_same_types(input, value), "Data type mismatch.", cudf::data_type_error);
     auto p_ret = std::make_unique<cudf::column>(input, stream, mr);
 
     if (end != begin) {  // otherwise no fill
@@ -137,7 +138,7 @@ std::unique_ptr<cudf::column> out_of_place_fill_range_dispatch::operator()<cudf:
   rmm::cuda_stream_view stream,
   rmm::device_async_resource_ref mr)
 {
-  CUDF_EXPECTS(input.type() == value.type(), "Data type mismatch.");
+  CUDF_EXPECTS(cudf::have_same_types(input, value), "Data type mismatch.", cudf::data_type_error);
   using ScalarType = cudf::scalar_type_t<cudf::string_view>;
   auto p_scalar    = static_cast<ScalarType const*>(&value);
   return cudf::strings::detail::fill(
@@ -153,7 +154,8 @@ std::unique_ptr<cudf::column> out_of_place_fill_range_dispatch::operator()<cudf:
 {
   if (input.is_empty()) return std::make_unique<cudf::column>(input, stream, mr);
   cudf::dictionary_column_view const target(input);
-  CUDF_EXPECTS(target.keys().type() == value.type(), "Data type mismatch.");
+  CUDF_EXPECTS(
+    cudf::have_same_types(target.parent(), value), "Data type mismatch.", cudf::data_type_error);
 
   // if the scalar is invalid, then just copy the column and fill the null mask
   if (!value.is_valid(stream)) {
@@ -219,7 +221,8 @@ void fill_in_place(mutable_column_view& destination,
                "Range is out of bounds.");
   CUDF_EXPECTS(destination.nullable() || value.is_valid(stream),
                "destination should be nullable or value should be non-null.");
-  CUDF_EXPECTS(destination.type() == value.type(), "Data type mismatch.");
+  CUDF_EXPECTS(
+    cudf::have_same_types(destination, value), "Data type mismatch.", cudf::data_type_error);
 
   if (end != begin) {  // otherwise no-op
     cudf::type_dispatcher(
diff --git a/cpp/src/filling/sequence.cu b/cpp/src/filling/sequence.cu
index f7067c3a91b..ee1745b8498 100644
--- a/cpp/src/filling/sequence.cu
+++ b/cpp/src/filling/sequence.cu
@@ -24,6 +24,7 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
@@ -128,7 +129,9 @@ std::unique_ptr<column> sequence(size_type size,
                                  rmm::cuda_stream_view stream,
                                  rmm::device_async_resource_ref mr)
 {
-  CUDF_EXPECTS(init.type() == step.type(), "init and step must be of the same type.");
+  CUDF_EXPECTS(cudf::have_same_types(init, step),
+               "init and step must be of the same type.",
+               cudf::data_type_error);
   CUDF_EXPECTS(size >= 0, "size must be >= 0");
   CUDF_EXPECTS(is_numeric(init.type()), "Input scalar types must be numeric");
 
diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu
index 73cb4efd283..e43dfcb4d98 100644
--- a/cpp/src/groupby/groupby.cu
+++ b/cpp/src/groupby/groupby.cu
@@ -36,6 +36,7 @@
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/traits.hpp>
+#include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
@@ -312,12 +313,15 @@ std::pair<std::unique_ptr<table>, std::unique_ptr<table>> groupby::shift(
   CUDF_FUNC_RANGE();
   CUDF_EXPECTS(values.num_columns() == static_cast<size_type>(fill_values.size()),
                "Mismatch number of fill_values and columns.");
-  CUDF_EXPECTS(
-    std::all_of(thrust::make_counting_iterator(0),
-                thrust::make_counting_iterator(values.num_columns()),
-                [&](auto i) { return values.column(i).type() == fill_values[i].get().type(); }),
-    "values and fill_value should have the same type.");
-
+  CUDF_EXPECTS(std::equal(values.begin(),
+                          values.end(),
+                          fill_values.cbegin(),
+                          fill_values.cend(),
+                          [](auto const& col, auto const& scalar) {
+                            return cudf::have_same_types(col, scalar.get());
+                          }),
+               "values and fill_value should have the same type.",
+               cudf::data_type_error);
   auto stream = cudf::get_default_stream();
   std::vector<std::unique_ptr<column>> results;
   auto const& group_offsets = helper().group_offsets(stream);
diff --git a/cpp/src/interop/dlpack.cpp b/cpp/src/interop/dlpack.cpp
index 3109a36cbcf..78ddd7f5ad5 100644
--- a/cpp/src/interop/dlpack.cpp
+++ b/cpp/src/interop/dlpack.cpp
@@ -21,6 +21,7 @@
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/traits.hpp>
+#include <cudf/utilities/type_checks.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -231,9 +232,9 @@ DLManagedTensor* to_dlpack(table_view const& input,
   DLDataType const dltype = data_type_to_DLDataType(type);
 
   // Ensure all columns are the same type
-  CUDF_EXPECTS(
-    std::all_of(input.begin(), input.end(), [type](auto const& col) { return col.type() == type; }),
-    "All columns required to have same data type");
+  CUDF_EXPECTS(cudf::all_have_same_types(input.begin(), input.end()),
+               "All columns required to have same data type",
+               cudf::data_type_error);
 
   // Ensure none of the columns have nulls
   CUDF_EXPECTS(
diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu
index fbe16378e8c..b0184ff6a86 100644
--- a/cpp/src/join/hash_join.cu
+++ b/cpp/src/join/hash_join.cu
@@ -21,6 +21,8 @@
 #include <cudf/detail/structs/utilities.hpp>
 #include <cudf/hashing/detail/helper_functions.cuh>
 #include <cudf/join.hpp>
+#include <cudf/utilities/error.hpp>
+#include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
@@ -569,12 +571,9 @@ hash_join<Hasher>::compute_hash_join(cudf::table_view const& probe,
                      std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr));
   }
 
-  CUDF_EXPECTS(std::equal(std::cbegin(_build),
-                          std::cend(_build),
-                          std::cbegin(probe),
-                          std::cend(probe),
-                          [](auto const& b, auto const& p) { return b.type() == p.type(); }),
-               "Mismatch in joining column data types");
+  CUDF_EXPECTS(cudf::have_same_types(_build, probe),
+               "Mismatch in joining column data types",
+               cudf::data_type_error);
 
   return probe_join_indices(probe, join, output_size, stream, mr);
 }
diff --git a/cpp/src/labeling/label_bins.cu b/cpp/src/labeling/label_bins.cu
index 1bfa7f39190..7ee1d540831 100644
--- a/cpp/src/labeling/label_bins.cu
+++ b/cpp/src/labeling/label_bins.cu
@@ -26,6 +26,7 @@
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/span.hpp>
 #include <cudf/utilities/traits.hpp>
+#include <cudf/utilities/type_checks.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -208,8 +209,10 @@ std::unique_ptr<column> label_bins(column_view const& input,
                                    rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE()
-  CUDF_EXPECTS((input.type() == left_edges.type()) && (input.type() == right_edges.type()),
-               "The input and edge columns must have the same types.");
+  CUDF_EXPECTS(
+    cudf::have_same_types(input, left_edges) && cudf::have_same_types(input, right_edges),
+    "The input and edge columns must have the same types.",
+    cudf::data_type_error);
   CUDF_EXPECTS(left_edges.size() == right_edges.size(),
                "The left and right edge columns must be of the same length.");
   CUDF_EXPECTS(!left_edges.has_nulls() && !right_edges.has_nulls(),
diff --git a/cpp/src/lists/combine/concatenate_rows.cu b/cpp/src/lists/combine/concatenate_rows.cu
index 38d299763a1..bc1b48b11cd 100644
--- a/cpp/src/lists/combine/concatenate_rows.cu
+++ b/cpp/src/lists/combine/concatenate_rows.cu
@@ -22,6 +22,7 @@
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/lists/combine.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/error.hpp>
 #include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -204,12 +205,11 @@ std::unique_ptr<column> concatenate_rows(table_view const& input,
     std::all_of(input.begin(),
                 input.end(),
                 [](column_view const& col) { return col.type().id() == cudf::type_id::LIST; }),
-    "All columns of the input table must be of lists column type.");
-  CUDF_EXPECTS(
-    std::all_of(std::next(input.begin()),
-                input.end(),
-                [a = *input.begin()](column_view const& b) { return column_types_equal(a, b); }),
-    "The types of entries in the input columns must be the same.");
+    "All columns of the input table must be of list column type.",
+    cudf::data_type_error);
+  CUDF_EXPECTS(cudf::all_have_same_types(input.begin(), input.end()),
+               "The types of entries in the input columns must be the same.",
+               cudf::data_type_error);
 
   auto const num_rows = input.num_rows();
   auto const num_cols = input.num_columns();
diff --git a/cpp/src/lists/contains.cu b/cpp/src/lists/contains.cu
index 4737b077deb..f03d394d6d7 100644
--- a/cpp/src/lists/contains.cu
+++ b/cpp/src/lists/contains.cu
@@ -27,6 +27,7 @@
 #include <cudf/table/experimental/row_operators.cuh>
 #include <cudf/table/row_operators.cuh>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/type_checks.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/exec_policy.hpp>
@@ -194,7 +195,7 @@ std::unique_ptr<column> dispatch_index_of(lists_column_view const& lists,
   // comparisons.
   auto const child = lists.child();
 
-  CUDF_EXPECTS(child.type() == search_keys.type(),
+  CUDF_EXPECTS(cudf::have_same_types(child, search_keys),
                "Type/Scale of search key does not match list column element type.",
                cudf::data_type_error);
   CUDF_EXPECTS(search_keys.type().id() != type_id::EMPTY, "Type cannot be empty.");
diff --git a/cpp/src/lists/sequences.cu b/cpp/src/lists/sequences.cu
index cb14ae7619b..7d57d8ddb60 100644
--- a/cpp/src/lists/sequences.cu
+++ b/cpp/src/lists/sequences.cu
@@ -23,6 +23,8 @@
 #include <cudf/lists/filling.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/error.hpp>
+#include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
@@ -139,15 +141,18 @@ std::unique_ptr<column> sequences(column_view const& starts,
                "starts and sizes input columns must not have nulls.");
   CUDF_EXPECTS(starts.size() == sizes.size(),
                "starts and sizes input columns must have the same number of rows.");
-  CUDF_EXPECTS(cudf::is_index_type(sizes.type()), "Input sizes column must be of integer types.");
+  CUDF_EXPECTS(cudf::is_index_type(sizes.type()),
+               "Input sizes column must be of integer types.",
+               cudf::data_type_error);
 
   if (steps) {
     auto const& steps_cv = steps.value();
     CUDF_EXPECTS(!steps_cv.has_nulls(), "steps input column must not have nulls.");
     CUDF_EXPECTS(starts.size() == steps_cv.size(),
                  "starts and steps input columns must have the same number of rows.");
-    CUDF_EXPECTS(starts.type() == steps_cv.type(),
-                 "starts and steps input columns must have the same type.");
+    CUDF_EXPECTS(cudf::have_same_types(starts, steps_cv),
+                 "starts and steps input columns must have the same type.",
+                 cudf::data_type_error);
   }
 
   auto const n_lists = starts.size();
diff --git a/cpp/src/lists/set_operations.cu b/cpp/src/lists/set_operations.cu
index f3352a3a52d..1d18b8c677c 100644
--- a/cpp/src/lists/set_operations.cu
+++ b/cpp/src/lists/set_operations.cu
@@ -52,7 +52,7 @@ namespace {
 void check_compatibility(lists_column_view const& lhs, lists_column_view const& rhs)
 {
   CUDF_EXPECTS(lhs.size() == rhs.size(), "The input lists column must have the same size.");
-  CUDF_EXPECTS(column_types_equal(lhs.child(), rhs.child()),
+  CUDF_EXPECTS(have_same_types(lhs.child(), rhs.child()),
                "The input lists columns must have children having the same type structure");
 }
 
diff --git a/cpp/src/merge/merge.cu b/cpp/src/merge/merge.cu
index 5a3be259ed9..630cf328579 100644
--- a/cpp/src/merge/merge.cu
+++ b/cpp/src/merge/merge.cu
@@ -34,6 +34,7 @@
 #include <cudf/table/table_device_view.cuh>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/traits.hpp>
+#include <cudf/utilities/type_checks.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
diff --git a/cpp/src/reductions/reductions.cpp b/cpp/src/reductions/reductions.cpp
index d764ea7559f..cde0274339a 100644
--- a/cpp/src/reductions/reductions.cpp
+++ b/cpp/src/reductions/reductions.cpp
@@ -28,6 +28,8 @@
 #include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/structs/structs_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/error.hpp>
+#include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
@@ -154,8 +156,9 @@ std::unique_ptr<scalar> reduce(column_view const& col,
                                rmm::cuda_stream_view stream,
                                rmm::device_async_resource_ref mr)
 {
-  CUDF_EXPECTS(!init.has_value() || col.type() == init.value().get().type(),
-               "column and initial value must be the same type");
+  CUDF_EXPECTS(!init.has_value() || cudf::have_same_types(col, init.value().get()),
+               "column and initial value must be the same type",
+               cudf::data_type_error);
   if (init.has_value() && !(agg.kind == aggregation::SUM || agg.kind == aggregation::PRODUCT ||
                             agg.kind == aggregation::MIN || agg.kind == aggregation::MAX ||
                             agg.kind == aggregation::ANY || agg.kind == aggregation::ALL)) {
diff --git a/cpp/src/reductions/segmented/reductions.cpp b/cpp/src/reductions/segmented/reductions.cpp
index dee16b3e503..1ae344dcace 100644
--- a/cpp/src/reductions/segmented/reductions.cpp
+++ b/cpp/src/reductions/segmented/reductions.cpp
@@ -22,6 +22,7 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
@@ -112,8 +113,9 @@ std::unique_ptr<column> segmented_reduce(column_view const& segmented_values,
                                          rmm::cuda_stream_view stream,
                                          rmm::device_async_resource_ref mr)
 {
-  CUDF_EXPECTS(!init.has_value() || segmented_values.type() == init.value().get().type(),
-               "column and initial value must be the same type");
+  CUDF_EXPECTS(!init.has_value() || cudf::have_same_types(segmented_values, init.value().get()),
+               "column and initial value must be the same type",
+               cudf::data_type_error);
   if (init.has_value() && !(agg.kind == aggregation::SUM || agg.kind == aggregation::PRODUCT ||
                             agg.kind == aggregation::MIN || agg.kind == aggregation::MAX ||
                             agg.kind == aggregation::ANY || agg.kind == aggregation::ALL)) {
diff --git a/cpp/src/replace/clamp.cu b/cpp/src/replace/clamp.cu
index 31ffc76a4a5..cb3caf9d068 100644
--- a/cpp/src/replace/clamp.cu
+++ b/cpp/src/replace/clamp.cu
@@ -33,6 +33,8 @@
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/error.hpp>
+#include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
@@ -192,7 +194,9 @@ struct dispatch_clamp {
                                      rmm::cuda_stream_view stream,
                                      rmm::device_async_resource_ref mr)
   {
-    CUDF_EXPECTS(lo.type() == input.type(), "mismatching types of scalar and input");
+    CUDF_EXPECTS(cudf::have_same_types(input, lo),
+                 "mismatching types of scalar and input",
+                 cudf::data_type_error);
 
     auto lo_itr         = make_optional_iterator<T>(lo, nullate::YES{});
     auto hi_itr         = make_optional_iterator<T>(hi, nullate::YES{});
@@ -316,9 +320,14 @@ std::unique_ptr<column> clamp(column_view const& input,
                               rmm::cuda_stream_view stream,
                               rmm::device_async_resource_ref mr)
 {
-  CUDF_EXPECTS(lo.type() == hi.type(), "mismatching types of limit scalars");
-  CUDF_EXPECTS(lo_replace.type() == hi_replace.type(), "mismatching types of replace scalars");
-  CUDF_EXPECTS(lo.type() == lo_replace.type(), "mismatching types of limit and replace scalars");
+  CUDF_EXPECTS(
+    cudf::have_same_types(lo, hi), "mismatching types of limit scalars", cudf::data_type_error);
+  CUDF_EXPECTS(cudf::have_same_types(lo_replace, hi_replace),
+               "mismatching types of replace scalars",
+               cudf::data_type_error);
+  CUDF_EXPECTS(cudf::have_same_types(lo, lo_replace),
+               "mismatching types of limit and replace scalars",
+               cudf::data_type_error);
 
   if ((not lo.is_valid(stream) and not hi.is_valid(stream)) or (input.is_empty())) {
     // There will be no change
diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu
index fe3d20e372e..13e130588c1 100644
--- a/cpp/src/replace/nulls.cu
+++ b/cpp/src/replace/nulls.cu
@@ -38,6 +38,7 @@
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/traits.hpp>
+#include <cudf/utilities/type_checks.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -216,7 +217,8 @@ struct replace_nulls_scalar_kernel_forwarder {
                                            rmm::cuda_stream_view stream,
                                            rmm::device_async_resource_ref mr)
   {
-    CUDF_EXPECTS(input.type() == replacement.type(), "Data type mismatch");
+    CUDF_EXPECTS(
+      cudf::have_same_types(input, replacement), "Data type mismatch", cudf::data_type_error);
     std::unique_ptr<cudf::column> output = cudf::detail::allocate_like(
       input, input.size(), cudf::mask_allocation_policy::NEVER, stream, mr);
     auto output_view = output->mutable_view();
@@ -252,9 +254,10 @@ std::unique_ptr<cudf::column> replace_nulls_scalar_kernel_forwarder::operator()<
   rmm::cuda_stream_view stream,
   rmm::device_async_resource_ref mr)
 {
-  CUDF_EXPECTS(input.type() == replacement.type(), "Data type mismatch");
+  CUDF_EXPECTS(
+    cudf::have_same_types(input, replacement), "Data type mismatch", cudf::data_type_error);
   cudf::strings_column_view input_s(input);
-  cudf::string_scalar const& repl = static_cast<cudf::string_scalar const&>(replacement);
+  auto const& repl = static_cast<cudf::string_scalar const&>(replacement);
   return cudf::strings::detail::replace_nulls(input_s, repl, stream, mr);
 }
 
@@ -318,7 +321,8 @@ std::unique_ptr<cudf::column> replace_nulls(cudf::column_view const& input,
                                             rmm::cuda_stream_view stream,
                                             rmm::device_async_resource_ref mr)
 {
-  CUDF_EXPECTS(input.type() == replacement.type(), "Data type mismatch");
+  CUDF_EXPECTS(
+    cudf::have_same_types(input, replacement), "Data type mismatch", cudf::data_type_error);
   CUDF_EXPECTS(replacement.size() == input.size(), "Column size mismatch");
 
   if (input.is_empty()) { return cudf::empty_like(input); }
diff --git a/cpp/src/replace/replace.cu b/cpp/src/replace/replace.cu
index 7bc0bd7e0be..c2cd03cd761 100644
--- a/cpp/src/replace/replace.cu
+++ b/cpp/src/replace/replace.cu
@@ -48,6 +48,7 @@
 #include <cudf/strings/detail/replace.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/type_checks.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -303,9 +304,10 @@ std::unique_ptr<cudf::column> find_and_replace_all(cudf::column_view const& inpu
   CUDF_EXPECTS(values_to_replace.size() == replacement_values.size(),
                "values_to_replace and replacement_values size mismatch.");
 
-  CUDF_EXPECTS(
-    input_col.type() == values_to_replace.type() && input_col.type() == replacement_values.type(),
-    "Columns type mismatch");
+  CUDF_EXPECTS(cudf::have_same_types(input_col, values_to_replace) &&
+                 cudf::have_same_types(input_col, replacement_values),
+               "Columns type mismatch",
+               cudf::data_type_error);
   CUDF_EXPECTS(not values_to_replace.has_nulls(), "values_to_replace must not have nulls");
 
   if (input_col.is_empty() or values_to_replace.is_empty() or replacement_values.is_empty()) {
diff --git a/cpp/src/rolling/detail/lead_lag_nested.cuh b/cpp/src/rolling/detail/lead_lag_nested.cuh
index 269868910c7..cfedcac8ae4 100644
--- a/cpp/src/rolling/detail/lead_lag_nested.cuh
+++ b/cpp/src/rolling/detail/lead_lag_nested.cuh
@@ -23,7 +23,9 @@
 #include <cudf/copying.hpp>
 #include <cudf/detail/gather.hpp>
 #include <cudf/detail/scatter.hpp>
+#include <cudf/utilities/error.hpp>
 #include <cudf/utilities/traits.hpp>
+#include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/exec_policy.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
@@ -99,8 +101,9 @@ std::unique_ptr<column> compute_lead_lag_for_nested(aggregation::Kind op,
 {
   CUDF_EXPECTS(op == aggregation::LEAD || op == aggregation::LAG,
                "Unexpected aggregation type in compute_lead_lag_for_nested");
-  CUDF_EXPECTS(default_outputs.type().id() == input.type().id(),
-               "Defaults column type must match input column.");  // Because LEAD/LAG.
+  CUDF_EXPECTS(cudf::have_same_types(input, default_outputs),
+               "Defaults column type must match input column.",
+               cudf::data_type_error);  // Because LEAD/LAG.
 
   CUDF_EXPECTS(default_outputs.is_empty() || (input.size() == default_outputs.size()),
                "Number of defaults must match input column.");
diff --git a/cpp/src/search/contains_scalar.cu b/cpp/src/search/contains_scalar.cu
index 0b344ec347b..e88acf68e28 100644
--- a/cpp/src/search/contains_scalar.cu
+++ b/cpp/src/search/contains_scalar.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,6 +24,8 @@
 #include <cudf/table/experimental/row_operators.cuh>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/error.hpp>
+#include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
@@ -62,7 +64,9 @@ struct contains_scalar_dispatch {
                                                            scalar const& needle,
                                                            rmm::cuda_stream_view stream) const
   {
-    CUDF_EXPECTS(haystack.type() == needle.type(), "Scalar and column types must match");
+    CUDF_EXPECTS(cudf::have_same_types(haystack, needle),
+                 "Scalar and column types must match",
+                 cudf::data_type_error);
     // Don't need to check for needle validity. If it is invalid, it should be handled by the caller
     // before dispatching to this function.
 
@@ -87,7 +91,9 @@ struct contains_scalar_dispatch {
                                                           scalar const& needle,
                                                           rmm::cuda_stream_view stream) const
   {
-    CUDF_EXPECTS(haystack.type() == needle.type(), "Scalar and column types must match");
+    CUDF_EXPECTS(cudf::have_same_types(haystack, needle),
+                 "Scalar and column types must match",
+                 cudf::data_type_error);
     // Don't need to check for needle validity. If it is invalid, it should be handled by the caller
     // before dispatching to this function.
     // In addition, haystack and needle structure compatibility will be checked later on by
diff --git a/cpp/src/search/contains_table.cu b/cpp/src/search/contains_table.cu
index 13417fdab63..466f9093194 100644
--- a/cpp/src/search/contains_table.cu
+++ b/cpp/src/search/contains_table.cu
@@ -22,6 +22,7 @@
 #include <cudf/table/experimental/row_operators.cuh>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
diff --git a/cpp/src/strings/slice.cu b/cpp/src/strings/slice.cu
index 2f7564b3b0d..972a4ffd58e 100644
--- a/cpp/src/strings/slice.cu
+++ b/cpp/src/strings/slice.cu
@@ -26,6 +26,8 @@
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/error.hpp>
+#include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
@@ -228,13 +230,17 @@ std::unique_ptr<column> slice_strings(strings_column_view const& strings,
                "Parameter starts must have the same number of rows as strings.");
   CUDF_EXPECTS(stops_column.size() == strings_count,
                "Parameter stops must have the same number of rows as strings.");
-  CUDF_EXPECTS(starts_column.type() == stops_column.type(),
-               "Parameters starts and stops must be of the same type.");
+  CUDF_EXPECTS(cudf::have_same_types(starts_column, stops_column),
+               "Parameters starts and stops must be of the same type.",
+               cudf::data_type_error);
   CUDF_EXPECTS(starts_column.null_count() == 0, "Parameter starts must not contain nulls.");
   CUDF_EXPECTS(stops_column.null_count() == 0, "Parameter stops must not contain nulls.");
   CUDF_EXPECTS(starts_column.type().id() != data_type{type_id::BOOL8}.id(),
-               "Positions values must not be bool type.");
-  CUDF_EXPECTS(is_fixed_width(starts_column.type()), "Positions values must be fixed width type.");
+               "Positions values must not be bool type.",
+               cudf::data_type_error);
+  CUDF_EXPECTS(is_fixed_width(starts_column.type()),
+               "Positions values must be fixed width type.",
+               cudf::data_type_error);
 
   auto strings_column = column_device_view::create(strings.parent(), stream);
   auto starts_iter    = cudf::detail::indexalator_factory::make_input_iterator(starts_column);
diff --git a/cpp/src/table/table_view.cpp b/cpp/src/table/table_view.cpp
index bcbf2d44139..13832b0d9dc 100644
--- a/cpp/src/table/table_view.cpp
+++ b/cpp/src/table/table_view.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -145,30 +145,21 @@ bool has_nested_nullable_columns(table_view const& input)
   });
 }
 
-bool have_same_types(table_view const& lhs, table_view const& rhs)
+namespace detail {
+
+template <typename TableView>
+bool is_relationally_comparable(TableView const& lhs, TableView const& rhs)
 {
   return std::equal(lhs.begin(),
                     lhs.end(),
                     rhs.begin(),
                     rhs.end(),
                     [](column_view const& lcol, column_view const& rcol) {
-                      return cudf::column_types_equal(lcol, rcol);
+                      return cudf::is_relationally_comparable(lcol.type()) and
+                             cudf::have_same_types(lcol, rcol);
                     });
 }
 
-namespace detail {
-
-template <typename TableView>
-bool is_relationally_comparable(TableView const& lhs, TableView const& rhs)
-{
-  return std::all_of(thrust::counting_iterator<size_type>(0),
-                     thrust::counting_iterator<size_type>(lhs.num_columns()),
-                     [lhs, rhs](auto const i) {
-                       return lhs.column(i).type() == rhs.column(i).type() and
-                              cudf::is_relationally_comparable(lhs.column(i).type());
-                     });
-}
-
 // Explicit template instantiation for a table of immutable views
 template bool is_relationally_comparable<table_view>(table_view const& lhs, table_view const& rhs);
 
diff --git a/cpp/src/transform/one_hot_encode.cu b/cpp/src/transform/one_hot_encode.cu
index 570060b3870..723c306da1d 100644
--- a/cpp/src/transform/one_hot_encode.cu
+++ b/cpp/src/transform/one_hot_encode.cu
@@ -24,6 +24,7 @@
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/traits.hpp>
+#include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
@@ -61,7 +62,9 @@ std::pair<std::unique_ptr<column>, table_view> one_hot_encode(column_view const&
                                                               rmm::cuda_stream_view stream,
                                                               rmm::device_async_resource_ref mr)
 {
-  CUDF_EXPECTS(input.type() == categories.type(), "Mismatch type between input and categories.");
+  CUDF_EXPECTS(cudf::have_same_types(input, categories),
+               "Mismatch type between input and categories.",
+               cudf::data_type_error);
 
   if (categories.is_empty()) { return {make_empty_column(type_id::BOOL8), table_view{}}; }
 
diff --git a/cpp/src/utilities/type_checks.cpp b/cpp/src/utilities/type_checks.cpp
index d6f5c65593a..dac981fb532 100644
--- a/cpp/src/utilities/type_checks.cpp
+++ b/cpp/src/utilities/type_checks.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,6 +16,8 @@
 
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/lists/lists_column_view.hpp>
+#include <cudf/scalar/scalar.hpp>
+#include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_checks.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
@@ -28,15 +30,16 @@ namespace {
 
 struct columns_equal_fn {
   template <typename T>
-  bool operator()(column_view const&, column_view const&)
+  bool operator()(column_view const& lhs, column_view const& rhs)
   {
-    return true;
+    return lhs.type() == rhs.type();
   }
 };
 
 template <>
 bool columns_equal_fn::operator()<dictionary32>(column_view const& lhs, column_view const& rhs)
 {
+  if (not cudf::is_dictionary(rhs.type())) { return false; }
   auto const kidx = dictionary_column_view::keys_column_index;
   return lhs.num_children() > 0 and rhs.num_children() > 0
            ? lhs.child(kidx).type() == rhs.child(kidx).type()
@@ -46,33 +49,132 @@ bool columns_equal_fn::operator()<dictionary32>(column_view const& lhs, column_v
 template <>
 bool columns_equal_fn::operator()<list_view>(column_view const& lhs, column_view const& rhs)
 {
+  if (rhs.type().id() != type_id::LIST) { return false; }
   auto const& ci = lists_column_view::child_column_index;
-  return column_types_equal(lhs.child(ci), rhs.child(ci));
+  return have_same_types(lhs.child(ci), rhs.child(ci));
 }
 
 template <>
 bool columns_equal_fn::operator()<struct_view>(column_view const& lhs, column_view const& rhs)
 {
-  return lhs.num_children() == rhs.num_children() and
-         std::all_of(thrust::make_counting_iterator(0),
-                     thrust::make_counting_iterator(lhs.num_children()),
-                     [&](auto i) { return column_types_equal(lhs.child(i), rhs.child(i)); });
+  if (rhs.type().id() != type_id::STRUCT) { return false; }
+  return std::equal(lhs.child_begin(),
+                    lhs.child_end(),
+                    rhs.child_begin(),
+                    rhs.child_end(),
+                    [](auto const& lhs, auto const& rhs) { return have_same_types(lhs, rhs); });
+}
+
+struct column_scalar_equal_fn {
+  template <typename T>
+  bool operator()(column_view const& col, scalar const& slr)
+  {
+    return col.type() == slr.type();
+  }
+};
+
+template <>
+bool column_scalar_equal_fn::operator()<dictionary32>(column_view const& col, scalar const& slr)
+{
+  // It is not possible to have a scalar dictionary, so compare the dictionary
+  // column keys type to the scalar type.
+  auto col_keys = cudf::dictionary_column_view(col).keys();
+  return have_same_types(col_keys, slr);
+}
+
+template <>
+bool column_scalar_equal_fn::operator()<list_view>(column_view const& col, scalar const& slr)
+{
+  if (slr.type().id() != type_id::LIST) { return false; }
+  auto const& ci      = lists_column_view::child_column_index;
+  auto const list_slr = static_cast<list_scalar const*>(&slr);
+  return have_same_types(col.child(ci), list_slr->view());
+}
+
+template <>
+bool column_scalar_equal_fn::operator()<struct_view>(column_view const& col, scalar const& slr)
+{
+  if (slr.type().id() != type_id::STRUCT) { return false; }
+  auto const struct_slr = static_cast<struct_scalar const*>(&slr);
+  auto const slr_tbl    = struct_slr->view();
+  return std::equal(col.child_begin(),
+                    col.child_end(),
+                    slr_tbl.begin(),
+                    slr_tbl.end(),
+                    [](auto const& lhs, auto const& rhs) { return have_same_types(lhs, rhs); });
+}
+
+struct scalars_equal_fn {
+  template <typename T>
+  bool operator()(scalar const& lhs, scalar const& rhs)
+  {
+    return lhs.type() == rhs.type();
+  }
+};
+
+template <>
+bool scalars_equal_fn::operator()<list_view>(scalar const& lhs, scalar const& rhs)
+{
+  if (rhs.type().id() != type_id::LIST) { return false; }
+  auto const list_lhs = static_cast<list_scalar const*>(&lhs);
+  auto const list_rhs = static_cast<list_scalar const*>(&rhs);
+  return have_same_types(list_lhs->view(), list_rhs->view());
+}
+
+template <>
+bool scalars_equal_fn::operator()<struct_view>(scalar const& lhs, scalar const& rhs)
+{
+  if (rhs.type().id() != type_id::STRUCT) { return false; }
+  auto const tbl_lhs = static_cast<struct_scalar const*>(&lhs)->view();
+  auto const tbl_rhs = static_cast<struct_scalar const*>(&rhs)->view();
+  return have_same_types(tbl_lhs, tbl_rhs);
 }
 
 };  // namespace
 
 // Implementation note: avoid using double dispatch for this function
 // as it increases code paths to NxN for N types.
-bool column_types_equal(column_view const& lhs, column_view const& rhs)
+bool have_same_types(column_view const& lhs, column_view const& rhs)
 {
-  if (lhs.type() != rhs.type()) { return false; }
   return type_dispatcher(lhs.type(), columns_equal_fn{}, lhs, rhs);
 }
 
+bool column_types_equal(column_view const& lhs, column_view const& rhs)
+{
+  return have_same_types(lhs, rhs);
+}
+
+bool have_same_types(column_view const& lhs, scalar const& rhs)
+{
+  return type_dispatcher(lhs.type(), column_scalar_equal_fn{}, lhs, rhs);
+}
+
+bool have_same_types(scalar const& lhs, column_view const& rhs)
+{
+  return have_same_types(rhs, lhs);
+}
+
+bool have_same_types(scalar const& lhs, scalar const& rhs)
+{
+  return type_dispatcher(lhs.type(), scalars_equal_fn{}, lhs, rhs);
+}
+
+bool have_same_types(table_view const& lhs, table_view const& rhs)
+{
+  return std::equal(
+    lhs.begin(),
+    lhs.end(),
+    rhs.begin(),
+    rhs.end(),
+    [](column_view const& lcol, column_view const& rcol) { return have_same_types(lcol, rcol); });
+}
+
 bool column_types_equivalent(column_view const& lhs, column_view const& rhs)
 {
-  if (lhs.type().id() != rhs.type().id()) { return false; }
-  return type_dispatcher(lhs.type(), columns_equal_fn{}, lhs, rhs);
+  // Check if the columns have fixed point types. This is the only case where
+  // type equality and equivalence differ.
+  if (cudf::is_fixed_point(lhs.type())) { return lhs.type().id() == rhs.type().id(); }
+  return have_same_types(lhs, rhs);
 }
 
 }  // namespace cudf
diff --git a/cpp/tests/copying/concatenate_tests.cpp b/cpp/tests/copying/concatenate_tests.cpp
index c2d1e1d9f4f..a9bf22682cf 100644
--- a/cpp/tests/copying/concatenate_tests.cpp
+++ b/cpp/tests/copying/concatenate_tests.cpp
@@ -31,6 +31,7 @@
 #include <cudf/filling.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/error.hpp>
 
 #include <thrust/iterator/constant_iterator.h>
 
@@ -1226,7 +1227,7 @@ TEST_F(ListsColumnTest, ConcatenateMismatchedHierarchies)
     cudf::test::lists_column_wrapper<int> b{{{LCW{}}}};
     cudf::test::lists_column_wrapper<int> c{{LCW{}}};
 
-    EXPECT_THROW(cudf::concatenate(std::vector<column_view>({a, b, c})), cudf::logic_error);
+    EXPECT_THROW(cudf::concatenate(std::vector<column_view>({a, b, c})), cudf::data_type_error);
   }
 
   {
@@ -1235,7 +1236,7 @@ TEST_F(ListsColumnTest, ConcatenateMismatchedHierarchies)
     cudf::test::lists_column_wrapper<int> b{{{LCW{}}}};
     cudf::test::lists_column_wrapper<int> c{{LCW{}}};
 
-    EXPECT_THROW(cudf::concatenate(std::vector<column_view>({a, b, c})), cudf::logic_error);
+    EXPECT_THROW(cudf::concatenate(std::vector<column_view>({a, b, c})), cudf::data_type_error);
   }
 
   {
@@ -1243,14 +1244,14 @@ TEST_F(ListsColumnTest, ConcatenateMismatchedHierarchies)
     cudf::test::lists_column_wrapper<int> b{1, 2, 3};
     cudf::test::lists_column_wrapper<int> c{{3, 4, 5}};
 
-    EXPECT_THROW(cudf::concatenate(std::vector<column_view>({a, b, c})), cudf::logic_error);
+    EXPECT_THROW(cudf::concatenate(std::vector<column_view>({a, b, c})), cudf::data_type_error);
   }
 
   {
     cudf::test::lists_column_wrapper<int> a{{{1, 2, 3}}};
     cudf::test::lists_column_wrapper<int> b{{4, 5}};
 
-    EXPECT_THROW(cudf::concatenate(std::vector<column_view>({a, b})), cudf::logic_error);
+    EXPECT_THROW(cudf::concatenate(std::vector<column_view>({a, b})), cudf::data_type_error);
   }
 }
 
@@ -1605,7 +1606,7 @@ TEST_F(FixedPointTest, FixedPointScaleMismatch)
   auto const b = fp_wrapper(vec.begin() + 300, vec.begin() + 700, scale_type{-2});
   auto const c = fp_wrapper(vec.begin() + 700, vec.end(), /*****/ scale_type{-3});
 
-  EXPECT_THROW(cudf::concatenate(std::vector<cudf::column_view>{a, b, c}), cudf::logic_error);
+  EXPECT_THROW(cudf::concatenate(std::vector<cudf::column_view>{a, b, c}), cudf::data_type_error);
 }
 
 struct DictionaryConcatTest : public cudf::test::BaseFixture {};
@@ -1650,7 +1651,7 @@ TEST_F(DictionaryConcatTest, ErrorsTest)
   cudf::test::fixed_width_column_wrapper<int32_t> integers({10, 30, 20});
   auto dictionary2 = cudf::dictionary::encode(integers);
   std::vector<cudf::column_view> views({dictionary1->view(), dictionary2->view()});
-  EXPECT_THROW(cudf::concatenate(views), cudf::logic_error);
+  EXPECT_THROW(cudf::concatenate(views), cudf::data_type_error);
   std::vector<cudf::column_view> empty;
   EXPECT_THROW(cudf::concatenate(empty), cudf::logic_error);
 }
diff --git a/cpp/tests/copying/copy_range_tests.cpp b/cpp/tests/copying/copy_range_tests.cpp
index bcc0ac29b3e..223946ddcee 100644
--- a/cpp/tests/copying/copy_range_tests.cpp
+++ b/cpp/tests/copying/copy_range_tests.cpp
@@ -465,7 +465,7 @@ TEST_F(CopyRangeErrorTestFixture, DTypeMismatch)
   auto dict_target = cudf::dictionary::encode(target);
   auto dict_source = cudf::dictionary::encode(source);
   EXPECT_THROW(cudf::copy_range(dict_source->view(), dict_target->view(), 0, 100, 0),
-               cudf::logic_error);
+               cudf::data_type_error);
 }
 
 template <typename T>
diff --git a/cpp/tests/copying/copy_tests.cpp b/cpp/tests/copying/copy_tests.cpp
index 138e1935363..f31d8d6f79a 100644
--- a/cpp/tests/copying/copy_tests.cpp
+++ b/cpp/tests/copying/copy_tests.cpp
@@ -712,7 +712,7 @@ TEST_F(DictionaryCopyIfElseTest, TypeMismatch)
   cudf::test::dictionary_column_wrapper<double> input2({1.0, 1.0, 1.0, 1.0});
   cudf::test::fixed_width_column_wrapper<bool> mask({1, 0, 0, 1});
 
-  EXPECT_THROW(cudf::copy_if_else(input1, input2, mask), cudf::logic_error);
+  EXPECT_THROW(cudf::copy_if_else(input1, input2, mask), cudf::data_type_error);
 
   cudf::string_scalar input3{"1"};
   EXPECT_THROW(cudf::copy_if_else(input1, input3, mask), cudf::data_type_error);
diff --git a/cpp/tests/copying/get_value_tests.cpp b/cpp/tests/copying/get_value_tests.cpp
index 2be3c26af1d..99b86c86997 100644
--- a/cpp/tests/copying/get_value_tests.cpp
+++ b/cpp/tests/copying/get_value_tests.cpp
@@ -542,11 +542,6 @@ struct ListGetStructValueTest : public cudf::test::BaseFixture {
     return SCW{{field1, field2, field3}, mask};
   }
 
-  /**
-   * @brief Create a 0-length structs column
-   */
-  SCW zero_length_struct() { return SCW{}; }
-
   /**
    * @brief Concatenate structs columns, allow specifying inputs in `initializer_list`
    */
@@ -653,7 +648,7 @@ TYPED_TEST(ListGetStructValueTest, NonNestedGetNonNullEmpty)
   cudf::size_type index = 2;
   // For well-formed list column, an empty list still holds the complete structure of
   // a 0-length structs column
-  auto expected_data = this->zero_length_struct();
+  auto expected_data = this->make_test_structs_column({}, {}, {}, no_nulls());
 
   auto s       = cudf::get_element(list_column->view(), index);
   auto typed_s = static_cast<cudf::list_scalar const*>(s.get());
@@ -757,8 +752,8 @@ TYPED_TEST(ListGetStructValueTest, NestedGetNonNullEmpty)
   auto list_column_nested =
     this->make_test_lists_column(3, {0, 1, 1, 2}, std::move(list_column), {1, 1, 1});
 
-  auto expected_data =
-    this->make_test_lists_column(0, {0}, this->zero_length_struct().release(), {});
+  auto expected_data = this->make_test_lists_column(
+    0, {0}, this->make_test_structs_column({}, {}, {}, no_nulls()).release(), {});
 
   cudf::size_type index = 1;
   auto s                = cudf::get_element(list_column_nested->view(), index);
diff --git a/cpp/tests/dictionary/add_keys_test.cpp b/cpp/tests/dictionary/add_keys_test.cpp
index 1314375f383..46bf5468922 100644
--- a/cpp/tests/dictionary/add_keys_test.cpp
+++ b/cpp/tests/dictionary/add_keys_test.cpp
@@ -22,6 +22,7 @@
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/dictionary/encode.hpp>
 #include <cudf/dictionary/update_keys.hpp>
+#include <cudf/utilities/error.hpp>
 
 #include <vector>
 
@@ -83,7 +84,7 @@ TEST_F(DictionaryAddKeysTest, Errors)
   auto dictionary = cudf::dictionary::encode(input);
 
   cudf::test::fixed_width_column_wrapper<float> new_keys{1.0, 2.0, 3.0};
-  EXPECT_THROW(cudf::dictionary::add_keys(dictionary->view(), new_keys), cudf::logic_error);
+  EXPECT_THROW(cudf::dictionary::add_keys(dictionary->view(), new_keys), cudf::data_type_error);
   cudf::test::fixed_width_column_wrapper<int64_t> null_keys{{1, 2, 3}, {1, 0, 1}};
   EXPECT_THROW(cudf::dictionary::add_keys(dictionary->view(), null_keys), cudf::logic_error);
 }
diff --git a/cpp/tests/dictionary/remove_keys_test.cpp b/cpp/tests/dictionary/remove_keys_test.cpp
index 13fe3efd0f4..9950a39d630 100644
--- a/cpp/tests/dictionary/remove_keys_test.cpp
+++ b/cpp/tests/dictionary/remove_keys_test.cpp
@@ -22,6 +22,7 @@
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/dictionary/encode.hpp>
 #include <cudf/dictionary/update_keys.hpp>
+#include <cudf/utilities/error.hpp>
 
 #include <thrust/iterator/transform_iterator.h>
 
@@ -119,7 +120,7 @@ TEST_F(DictionaryRemoveKeysTest, Errors)
   auto const dictionary = cudf::dictionary::encode(input);
 
   cudf::test::fixed_width_column_wrapper<float> del_keys{1.0, 2.0, 3.0};
-  EXPECT_THROW(cudf::dictionary::remove_keys(dictionary->view(), del_keys), cudf::logic_error);
+  EXPECT_THROW(cudf::dictionary::remove_keys(dictionary->view(), del_keys), cudf::data_type_error);
   cudf::test::fixed_width_column_wrapper<int64_t> null_keys{{1, 2, 3}, {1, 0, 1}};
   EXPECT_THROW(cudf::dictionary::remove_keys(dictionary->view(), null_keys), cudf::logic_error);
 }
diff --git a/cpp/tests/dictionary/scatter_test.cpp b/cpp/tests/dictionary/scatter_test.cpp
index 2a2841827d0..2f77f4ee621 100644
--- a/cpp/tests/dictionary/scatter_test.cpp
+++ b/cpp/tests/dictionary/scatter_test.cpp
@@ -141,5 +141,5 @@ TEST_F(DictionaryScatterTest, Error)
   EXPECT_THROW(
     cudf::scatter(
       cudf::table_view{{source->view()}}, scatter_map, cudf::table_view{{target->view()}}),
-    cudf::logic_error);
+    cudf::data_type_error);
 }
diff --git a/cpp/tests/dictionary/search_test.cpp b/cpp/tests/dictionary/search_test.cpp
index 600d00ac186..b49b4ce5aa0 100644
--- a/cpp/tests/dictionary/search_test.cpp
+++ b/cpp/tests/dictionary/search_test.cpp
@@ -77,9 +77,9 @@ TEST_F(DictionarySearchTest, Errors)
 {
   cudf::test::dictionary_column_wrapper<int64_t> dictionary({1, 2, 3});
   cudf::numeric_scalar<double> key(7);
-  EXPECT_THROW(cudf::dictionary::get_index(dictionary, key), cudf::logic_error);
+  EXPECT_THROW(cudf::dictionary::get_index(dictionary, key), cudf::data_type_error);
   EXPECT_THROW(
     cudf::dictionary::detail::get_insert_index(
       dictionary, key, cudf::get_default_stream(), rmm::mr::get_current_device_resource()),
-    cudf::logic_error);
+    cudf::data_type_error);
 }
diff --git a/cpp/tests/dictionary/set_keys_test.cpp b/cpp/tests/dictionary/set_keys_test.cpp
index d0c37493cf8..5c9ec3567fe 100644
--- a/cpp/tests/dictionary/set_keys_test.cpp
+++ b/cpp/tests/dictionary/set_keys_test.cpp
@@ -21,6 +21,7 @@
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/dictionary/encode.hpp>
 #include <cudf/dictionary/update_keys.hpp>
+#include <cudf/utilities/error.hpp>
 
 #include <thrust/iterator/transform_iterator.h>
 
@@ -82,7 +83,7 @@ TEST_F(DictionarySetKeysTest, Errors)
   auto dictionary = cudf::dictionary::encode(input);
 
   cudf::test::fixed_width_column_wrapper<float> new_keys{1.0, 2.0, 3.0};
-  EXPECT_THROW(cudf::dictionary::set_keys(dictionary->view(), new_keys), cudf::logic_error);
+  EXPECT_THROW(cudf::dictionary::set_keys(dictionary->view(), new_keys), cudf::data_type_error);
   cudf::test::fixed_width_column_wrapper<int64_t> null_keys{{1, 2, 3}, {1, 0, 1}};
   EXPECT_THROW(cudf::dictionary::set_keys(dictionary->view(), null_keys), cudf::logic_error);
 }
diff --git a/cpp/tests/filling/fill_tests.cpp b/cpp/tests/filling/fill_tests.cpp
index 95a27defa4e..26badefe698 100644
--- a/cpp/tests/filling/fill_tests.cpp
+++ b/cpp/tests/filling/fill_tests.cpp
@@ -359,8 +359,8 @@ TEST_F(FillErrorTestFixture, DTypeMismatch)
 
   auto destination_view = cudf::mutable_column_view{destination};
 
-  EXPECT_THROW(cudf::fill_in_place(destination_view, 0, 10, *p_val), cudf::logic_error);
-  EXPECT_THROW(auto p_ret = cudf::fill(destination, 0, 10, *p_val), cudf::logic_error);
+  EXPECT_THROW(cudf::fill_in_place(destination_view, 0, 10, *p_val), cudf::data_type_error);
+  EXPECT_THROW(auto p_ret = cudf::fill(destination, 0, 10, *p_val), cudf::data_type_error);
 }
 
 template <typename T>
diff --git a/cpp/tests/filling/sequence_tests.cpp b/cpp/tests/filling/sequence_tests.cpp
index cf619aace5a..5651a26f192 100644
--- a/cpp/tests/filling/sequence_tests.cpp
+++ b/cpp/tests/filling/sequence_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -102,15 +102,15 @@ TEST_F(SequenceTestFixture, MismatchedInputs)
 {
   cudf::numeric_scalar<int> init(0);
   cudf::numeric_scalar<float> step(-5);
-  EXPECT_THROW(cudf::sequence(10, init, step), cudf::logic_error);
+  EXPECT_THROW(cudf::sequence(10, init, step), cudf::data_type_error);
 
   cudf::numeric_scalar<int> init2(0);
   cudf::numeric_scalar<int8_t> step2(-5);
-  EXPECT_THROW(cudf::sequence(10, init2, step2), cudf::logic_error);
+  EXPECT_THROW(cudf::sequence(10, init2, step2), cudf::data_type_error);
 
   cudf::numeric_scalar<float> init3(0);
   cudf::numeric_scalar<double> step3(-5);
-  EXPECT_THROW(cudf::sequence(10, init3, step3), cudf::logic_error);
+  EXPECT_THROW(cudf::sequence(10, init3, step3), cudf::data_type_error);
 }
 
 TYPED_TEST(SequenceTypedTestFixture, DefaultStep)
diff --git a/cpp/tests/groupby/shift_tests.cpp b/cpp/tests/groupby/shift_tests.cpp
index d2ecb667eca..1a6abf2e734 100644
--- a/cpp/tests/groupby/shift_tests.cpp
+++ b/cpp/tests/groupby/shift_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -507,7 +507,7 @@ TEST_F(groupby_shift_fixed_point_type_test, MismatchScaleType)
 
   EXPECT_THROW(test_groupby_shift_multi(
                  key, cudf::table_view{{v1}}, offset, {*slr1}, cudf::table_view{{stub}}),
-               cudf::logic_error);
+               cudf::data_type_error);
 }
 
 TEST_F(groupby_shift_fixed_point_type_test, MismatchRepType)
@@ -525,5 +525,5 @@ TEST_F(groupby_shift_fixed_point_type_test, MismatchRepType)
 
   EXPECT_THROW(test_groupby_shift_multi(
                  key, cudf::table_view{{v1}}, offset, {*slr1}, cudf::table_view{{stub}}),
-               cudf::logic_error);
+               cudf::data_type_error);
 }
diff --git a/cpp/tests/interop/dlpack_test.cpp b/cpp/tests/interop/dlpack_test.cpp
index 895887ee348..ecc8558243d 100644
--- a/cpp/tests/interop/dlpack_test.cpp
+++ b/cpp/tests/interop/dlpack_test.cpp
@@ -20,6 +20,7 @@
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/interop.hpp>
+#include <cudf/utilities/error.hpp>
 
 #include <thrust/host_vector.h>
 
@@ -98,7 +99,7 @@ TEST_F(DLPackUntypedTests, MultipleTypesToDlpack)
   cudf::test::fixed_width_column_wrapper<int16_t> col1({1, 2, 3, 4});
   cudf::test::fixed_width_column_wrapper<int32_t> col2({1, 2, 3, 4});
   cudf::table_view input({col1, col2});
-  EXPECT_THROW(cudf::to_dlpack(input), cudf::logic_error);
+  EXPECT_THROW(cudf::to_dlpack(input), cudf::data_type_error);
 }
 
 TEST_F(DLPackUntypedTests, InvalidNullsToDlpack)
diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp
index 3a8763ed9f3..fd8484bc70f 100644
--- a/cpp/tests/io/parquet_writer_test.cpp
+++ b/cpp/tests/io/parquet_writer_test.cpp
@@ -567,9 +567,7 @@ TEST_F(ParquetWriterTest, EmptyList)
   auto result = cudf::io::read_parquet(
     cudf::io::parquet_reader_options_builder(cudf::io::source_info(filepath)));
 
-  using lcw     = cudf::test::lists_column_wrapper<int64_t>;
-  auto expected = lcw{lcw{}, lcw{}, lcw{}};
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(0), expected);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->view().column(0), L0->view());
 }
 
 TEST_F(ParquetWriterTest, DeepEmptyList)
diff --git a/cpp/tests/labeling/label_bins_tests.cpp b/cpp/tests/labeling/label_bins_tests.cpp
index 2ac6ad5dd0d..1a9e74df9be 100644
--- a/cpp/tests/labeling/label_bins_tests.cpp
+++ b/cpp/tests/labeling/label_bins_tests.cpp
@@ -25,6 +25,7 @@
 #include <cudf/copying.hpp>
 #include <cudf/labeling/label_bins.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/error.hpp>
 
 #include <algorithm>
 #include <limits>
@@ -64,7 +65,7 @@ TEST(BinColumnErrorTests, TestInvalidLeft)
 
   EXPECT_THROW(
     cudf::label_bins(input, left_edges, cudf::inclusive::YES, right_edges, cudf::inclusive::NO),
-    cudf::logic_error);
+    cudf::data_type_error);
 };
 
 // Right edges type check.
@@ -76,7 +77,7 @@ TEST(BinColumnErrorTests, TestInvalidRight)
 
   EXPECT_THROW(
     cudf::label_bins(input, left_edges, cudf::inclusive::YES, right_edges, cudf::inclusive::NO),
-    cudf::logic_error);
+    cudf::data_type_error);
 };
 
 // Input type check.
@@ -88,7 +89,7 @@ TEST(BinColumnErrorTests, TestInvalidInput)
 
   EXPECT_THROW(
     cudf::label_bins(input, left_edges, cudf::inclusive::YES, right_edges, cudf::inclusive::NO),
-    cudf::logic_error);
+    cudf::data_type_error);
 };
 
 // Number of left and right edges must match.
diff --git a/cpp/tests/lists/combine/concatenate_rows_tests.cpp b/cpp/tests/lists/combine/concatenate_rows_tests.cpp
index 008003a08a1..bf088eb855a 100644
--- a/cpp/tests/lists/combine/concatenate_rows_tests.cpp
+++ b/cpp/tests/lists/combine/concatenate_rows_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/lists/combine.hpp>
+#include <cudf/utilities/error.hpp>
 
 using namespace cudf::test::iterators;
 
@@ -53,7 +54,7 @@ TEST_F(ListConcatenateRowsTest, InvalidInput)
     auto const col1 = IntListsCol{}.release();
     auto const col2 = StrListsCol{}.release();
     EXPECT_THROW(cudf::lists::concatenate_rows(TView{{col1->view(), col2->view()}}),
-                 cudf::logic_error);
+                 cudf::data_type_error);
   }
 }
 
diff --git a/cpp/tests/lists/sequences_tests.cpp b/cpp/tests/lists/sequences_tests.cpp
index e97600a76d3..74545903eb3 100644
--- a/cpp/tests/lists/sequences_tests.cpp
+++ b/cpp/tests/lists/sequences_tests.cpp
@@ -22,6 +22,7 @@
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/lists/filling.hpp>
+#include <cudf/utilities/error.hpp>
 
 using namespace cudf::test::iterators;
 
@@ -200,8 +201,8 @@ TEST_F(NumericSequencesTest, InvalidSizesInput)
   auto const steps  = IntsCol{};
   auto const sizes  = FWDCol<float>{};
 
-  EXPECT_THROW(cudf::lists::sequences(starts, sizes), cudf::logic_error);
-  EXPECT_THROW(cudf::lists::sequences(starts, steps, sizes), cudf::logic_error);
+  EXPECT_THROW(cudf::lists::sequences(starts, sizes), cudf::data_type_error);
+  EXPECT_THROW(cudf::lists::sequences(starts, steps, sizes), cudf::data_type_error);
 }
 
 TEST_F(NumericSequencesTest, MismatchedColumnSizesInput)
@@ -220,7 +221,7 @@ TEST_F(NumericSequencesTest, MismatchedColumnTypesInput)
   auto const steps  = FWDCol<float>{1, 2, 3};
   auto const sizes  = IntsCol{1, 2, 3};
 
-  EXPECT_THROW(cudf::lists::sequences(starts, steps, sizes), cudf::logic_error);
+  EXPECT_THROW(cudf::lists::sequences(starts, steps, sizes), cudf::data_type_error);
 }
 
 TEST_F(NumericSequencesTest, InputHasNulls)
diff --git a/cpp/tests/replace/clamp_test.cpp b/cpp/tests/replace/clamp_test.cpp
index bb33de1f1e7..239c9ce6ddd 100644
--- a/cpp/tests/replace/clamp_test.cpp
+++ b/cpp/tests/replace/clamp_test.cpp
@@ -25,6 +25,7 @@
 #include <cudf/dictionary/encode.hpp>
 #include <cudf/replace.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
+#include <cudf/utilities/error.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 
@@ -41,7 +42,7 @@ TEST_F(ClampErrorTest, MisMatchingScalarTypes)
 
   cudf::test::fixed_width_column_wrapper<int32_t> input({1, 2, 3, 4, 5, 6});
 
-  EXPECT_THROW(cudf::clamp(input, *lo, *hi), cudf::logic_error);
+  EXPECT_THROW(cudf::clamp(input, *lo, *hi), cudf::data_type_error);
 }
 
 TEST_F(ClampErrorTest, MisMatchingInputAndScalarTypes)
@@ -53,7 +54,7 @@ TEST_F(ClampErrorTest, MisMatchingInputAndScalarTypes)
 
   cudf::test::fixed_width_column_wrapper<int64_t> input({1, 2, 3, 4, 5, 6});
 
-  EXPECT_THROW(cudf::clamp(input, *lo, *hi), cudf::logic_error);
+  EXPECT_THROW(cudf::clamp(input, *lo, *hi), cudf::data_type_error);
 }
 
 TEST_F(ClampErrorTest, MisMatchingReplaceScalarTypes)
@@ -69,7 +70,7 @@ TEST_F(ClampErrorTest, MisMatchingReplaceScalarTypes)
 
   cudf::test::fixed_width_column_wrapper<int64_t> input({1, 2, 3, 4, 5, 6});
 
-  EXPECT_THROW(cudf::clamp(input, *lo, *lo_replace, *hi, *hi_replace), cudf::logic_error);
+  EXPECT_THROW(cudf::clamp(input, *lo, *lo_replace, *hi, *hi_replace), cudf::data_type_error);
 }
 
 TEST_F(ClampErrorTest, InValidCase1)
@@ -640,7 +641,7 @@ TYPED_TEST(FixedPointTest, MismatchedScalarScales)
   auto const hi    = cudf::make_fixed_point_scalar<decimalXX>(8, scale);
   auto const input = fp_wrapper{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, scale};
 
-  EXPECT_THROW(cudf::clamp(input, *lo, *hi), cudf::logic_error);
+  EXPECT_THROW(cudf::clamp(input, *lo, *hi), cudf::data_type_error);
 }
 
 TYPED_TEST(FixedPointTest, MismatchedColumnScalarScale)
@@ -655,7 +656,7 @@ TYPED_TEST(FixedPointTest, MismatchedColumnScalarScale)
   auto const hi    = cudf::make_fixed_point_scalar<decimalXX>(8, scale);
   auto const input = fp_wrapper{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, scale_type{-4}};
 
-  EXPECT_THROW(cudf::clamp(input, *lo, *hi), cudf::logic_error);
+  EXPECT_THROW(cudf::clamp(input, *lo, *hi), cudf::data_type_error);
 }
 
 CUDF_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/replace/replace_nulls_tests.cpp b/cpp/tests/replace/replace_nulls_tests.cpp
index 6c23dd6bdc8..9603ea44a76 100644
--- a/cpp/tests/replace/replace_nulls_tests.cpp
+++ b/cpp/tests/replace/replace_nulls_tests.cpp
@@ -58,7 +58,7 @@ TEST_F(ReplaceErrorTest, TypeMismatch)
   cudf::test::fixed_width_column_wrapper<float> values_to_replace_column{
     {10, 11, 12, 13, 14, 15, 16, 17}};
 
-  EXPECT_THROW(cudf::replace_nulls(input_column, values_to_replace_column), cudf::logic_error);
+  EXPECT_THROW(cudf::replace_nulls(input_column, values_to_replace_column), cudf::data_type_error);
 }
 
 // Error: column type mismatch
@@ -68,7 +68,7 @@ TEST_F(ReplaceErrorTest, TypeMismatchScalar)
                                                                {0, 0, 1, 1, 1, 1, 1, 1}};
   cudf::numeric_scalar<float> replacement(1);
 
-  EXPECT_THROW(cudf::replace_nulls(input_column, replacement), cudf::logic_error);
+  EXPECT_THROW(cudf::replace_nulls(input_column, replacement), cudf::data_type_error);
 }
 
 struct ReplaceNullsStringsTest : public cudf::test::BaseFixture {};
@@ -659,14 +659,14 @@ TEST_F(ReplaceDictionaryTest, ReplaceNullsError)
   cudf::test::fixed_width_column_wrapper<int64_t> replacement_w({1, 2, 3, 4});
   auto replacement = cudf::dictionary::encode(replacement_w);
 
-  EXPECT_THROW(cudf::replace_nulls(input->view(), replacement->view()), cudf::logic_error);
-  EXPECT_THROW(cudf::replace_nulls(input->view(), cudf::string_scalar("x")), cudf::logic_error);
+  EXPECT_THROW(cudf::replace_nulls(input->view(), replacement->view()), cudf::data_type_error);
+  EXPECT_THROW(cudf::replace_nulls(input->view(), cudf::string_scalar("x")), cudf::data_type_error);
 
   cudf::test::fixed_width_column_wrapper<int64_t> input_one_w({1}, {0});
   auto input_one  = cudf::dictionary::encode(input_one_w);
   auto dict_input = cudf::dictionary_column_view(input_one->view());
   auto dict_repl  = cudf::dictionary_column_view(replacement->view());
-  EXPECT_THROW(cudf::replace_nulls(input->view(), replacement->view()), cudf::logic_error);
+  EXPECT_THROW(cudf::replace_nulls(input->view(), replacement->view()), cudf::data_type_error);
 }
 
 TEST_F(ReplaceDictionaryTest, ReplaceNullsEmpty)
diff --git a/cpp/tests/replace/replace_tests.cpp b/cpp/tests/replace/replace_tests.cpp
index 613034efc12..1858cd7782e 100644
--- a/cpp/tests/replace/replace_tests.cpp
+++ b/cpp/tests/replace/replace_tests.cpp
@@ -30,6 +30,7 @@
 #include <cudf/null_mask.hpp>
 #include <cudf/replace.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/error.hpp>
 
 #include <thrust/host_vector.h>
 #include <thrust/iterator/transform_iterator.h>
@@ -63,7 +64,7 @@ TEST_F(ReplaceErrorTest, TypeMismatch)
 
   EXPECT_THROW(
     cudf::find_and_replace_all(input_column, values_to_replace_column, replacement_values_column),
-    cudf::logic_error);
+    cudf::data_type_error);
 }
 
 // Error: nulls in old-values
diff --git a/cpp/tests/transform/one_hot_encode_tests.cpp b/cpp/tests/transform/one_hot_encode_tests.cpp
index 1015370fe4b..8384cb3480b 100644
--- a/cpp/tests/transform/one_hot_encode_tests.cpp
+++ b/cpp/tests/transform/one_hot_encode_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@
 
 #include <cudf/table/table_view.hpp>
 #include <cudf/transform.hpp>
+#include <cudf/utilities/error.hpp>
 
 #include <limits>
 
@@ -198,7 +199,7 @@ TEST_F(OneHotEncodingTest, MismatchTypes)
   auto input    = cudf::test::strings_column_wrapper{"xx", "yy", "xx"};
   auto category = cudf::test::fixed_width_column_wrapper<int64_t>{1};
 
-  EXPECT_THROW(cudf::one_hot_encode(input, category), cudf::logic_error);
+  EXPECT_THROW(cudf::one_hot_encode(input, category), cudf::data_type_error);
 }
 
 TEST_F(OneHotEncodingTest, List)
diff --git a/cpp/tests/utilities/column_utilities.cu b/cpp/tests/utilities/column_utilities.cu
index 047b096a283..7cc2777972e 100644
--- a/cpp/tests/utilities/column_utilities.cu
+++ b/cpp/tests/utilities/column_utilities.cu
@@ -31,6 +31,7 @@
 #include <cudf/table/experimental/row_operators.cuh>
 #include <cudf/table/table_device_view.cuh>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/type_checks.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/exec_policy.hpp>
@@ -238,11 +239,6 @@ std::unique_ptr<column> generate_child_row_indices(lists_column_view const& c,
 
 template <bool check_exact_equality>
 struct column_property_comparator {
-  bool types_equivalent(cudf::data_type const& lhs, cudf::data_type const& rhs)
-  {
-    return is_fixed_point(lhs) ? lhs.id() == rhs.id() : lhs == rhs;
-  }
-
   bool compare_common(cudf::column_view const& lhs,
                       cudf::column_view const& rhs,
                       cudf::column_view const& lhs_row_indices,
@@ -252,9 +248,9 @@ struct column_property_comparator {
     bool result = true;
 
     if (check_exact_equality) {
-      PROP_EXPECT_EQ(lhs.type(), rhs.type());
+      PROP_EXPECT_EQ(cudf::have_same_types(lhs, rhs), true);
     } else {
-      PROP_EXPECT_EQ(types_equivalent(lhs.type(), rhs.type()), true);
+      PROP_EXPECT_EQ(cudf::column_types_equivalent(lhs, rhs), true);
     }
 
     auto const lhs_size = check_exact_equality ? lhs.size() : lhs_row_indices.size();
diff --git a/cpp/tests/utilities_tests/type_check_tests.cpp b/cpp/tests/utilities_tests/type_check_tests.cpp
index 9c23798fce6..fecb896f95a 100644
--- a/cpp/tests/utilities_tests/type_check_tests.cpp
+++ b/cpp/tests/utilities_tests/type_check_tests.cpp
@@ -19,13 +19,11 @@
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/fixed_point/fixed_point.hpp>
+#include <cudf/table/table_view.hpp>
 #include <cudf/utilities/type_checks.hpp>
 #include <cudf/wrappers/durations.hpp>
 #include <cudf/wrappers/timestamps.hpp>
 
-namespace cudf {
-namespace test {
-
 template <typename T>
 struct ColumnTypeCheckTestTyped : public cudf::test::BaseFixture {};
 
@@ -35,56 +33,56 @@ TYPED_TEST_SUITE(ColumnTypeCheckTestTyped, cudf::test::FixedWidthTypes);
 
 TYPED_TEST(ColumnTypeCheckTestTyped, SameFixedWidth)
 {
-  fixed_width_column_wrapper<TypeParam> lhs{1, 1}, rhs{2};
-  EXPECT_TRUE(column_types_equal(lhs, rhs));
+  cudf::test::fixed_width_column_wrapper<TypeParam> lhs{1, 1}, rhs{2};
+  EXPECT_TRUE(cudf::have_same_types(lhs, rhs));
 }
 
 TEST_F(ColumnTypeCheckTest, SameString)
 {
-  strings_column_wrapper lhs{{'a', 'a'}}, rhs{{'b'}};
-  EXPECT_TRUE(column_types_equal(lhs, rhs));
+  cudf::test::strings_column_wrapper lhs{{'a', 'a'}}, rhs{{'b'}};
+  EXPECT_TRUE(cudf::have_same_types(lhs, rhs));
 
-  strings_column_wrapper lhs2{}, rhs2{{'b'}};
-  EXPECT_TRUE(column_types_equal(lhs2, rhs2));
+  cudf::test::strings_column_wrapper lhs2{}, rhs2{{'b'}};
+  EXPECT_TRUE(cudf::have_same_types(lhs2, rhs2));
 
-  strings_column_wrapper lhs3{}, rhs3{};
-  EXPECT_TRUE(column_types_equal(lhs3, rhs3));
+  cudf::test::strings_column_wrapper lhs3{}, rhs3{};
+  EXPECT_TRUE(cudf::have_same_types(lhs3, rhs3));
 }
 
 TEST_F(ColumnTypeCheckTest, SameList)
 {
-  using LCW = lists_column_wrapper<int32_t>;
+  using LCW = cudf::test::lists_column_wrapper<int32_t>;
 
   LCW lhs{}, rhs{};
-  EXPECT_TRUE(column_types_equal(lhs, rhs));
+  EXPECT_TRUE(cudf::have_same_types(lhs, rhs));
 
   LCW lhs2{{1, 2, 3}}, rhs2{{4, 5}};
-  EXPECT_TRUE(column_types_equal(lhs2, rhs2));
+  EXPECT_TRUE(cudf::have_same_types(lhs2, rhs2));
 
   LCW lhs3{{LCW{1}, LCW{2, 3}}}, rhs3{{LCW{4, 5}}};
-  EXPECT_TRUE(column_types_equal(lhs3, rhs3));
+  EXPECT_TRUE(cudf::have_same_types(lhs3, rhs3));
 
   LCW lhs4{{LCW{1}, LCW{}, LCW{2, 3}}}, rhs4{{LCW{4, 5}, LCW{}}};
-  EXPECT_TRUE(column_types_equal(lhs4, rhs4));
+  EXPECT_TRUE(cudf::have_same_types(lhs4, rhs4));
 }
 
 TYPED_TEST(ColumnTypeCheckTestTyped, SameDictionary)
 {
-  using DCW = dictionary_column_wrapper<TypeParam>;
+  using DCW = cudf::test::dictionary_column_wrapper<TypeParam>;
   DCW lhs{1, 1, 2, 3}, rhs{5, 5};
-  EXPECT_TRUE(column_types_equal(lhs, rhs));
+  EXPECT_TRUE(cudf::have_same_types(lhs, rhs));
 
   DCW lhs2{}, rhs2{};
-  EXPECT_TRUE(column_types_equal(lhs2, rhs2));
+  EXPECT_TRUE(cudf::have_same_types(lhs2, rhs2));
 }
 
 TEST_F(ColumnTypeCheckTest, SameStruct)
 {
-  using SCW      = structs_column_wrapper;
-  using FCW      = fixed_width_column_wrapper<int32_t>;
-  using StringCW = strings_column_wrapper;
-  using LCW      = lists_column_wrapper<int32_t>;
-  using DCW      = dictionary_column_wrapper<int32_t>;
+  using SCW      = cudf::test::structs_column_wrapper;
+  using FCW      = cudf::test::fixed_width_column_wrapper<int32_t>;
+  using StringCW = cudf::test::strings_column_wrapper;
+  using LCW      = cudf::test::lists_column_wrapper<int32_t>;
+  using DCW      = cudf::test::dictionary_column_wrapper<int32_t>;
 
   FCW lf1{1, 2, 3}, rf1{0, 1};
   StringCW lf2{"a", "bb", ""}, rf2{"cc", "d"};
@@ -92,127 +90,158 @@ TEST_F(ColumnTypeCheckTest, SameStruct)
   DCW lf4{5, 5, 5}, rf4{9, 9};
 
   SCW lhs{lf1, lf2, lf3, lf4}, rhs{rf1, rf2, rf3, rf4};
-  EXPECT_TRUE(column_types_equal(lhs, rhs));
+  EXPECT_TRUE(cudf::have_same_types(lhs, rhs));
 }
 
 TEST_F(ColumnTypeCheckTest, DifferentBasics)
 {
-  fixed_width_column_wrapper<int32_t> lhs1{1, 1};
-  strings_column_wrapper rhs1{"a", "bb"};
+  cudf::test::fixed_width_column_wrapper<int32_t> lhs1{1, 1};
+  cudf::test::strings_column_wrapper rhs1{"a", "bb"};
 
-  EXPECT_FALSE(column_types_equal(lhs1, rhs1));
+  EXPECT_FALSE(cudf::have_same_types(lhs1, rhs1));
 
-  lists_column_wrapper<string_view> lhs2{{"hello"}, {"world", "!"}};
-  strings_column_wrapper rhs2{"", "kk"};
+  cudf::test::lists_column_wrapper<cudf::string_view> lhs2{{"hello"}, {"world", "!"}};
+  cudf::test::strings_column_wrapper rhs2{"", "kk"};
 
-  EXPECT_FALSE(column_types_equal(lhs2, rhs2));
+  EXPECT_FALSE(cudf::have_same_types(lhs2, rhs2));
 
-  fixed_width_column_wrapper<int32_t> lhs3{1, 1};
-  dictionary_column_wrapper<int32_t> rhs3{2, 2};
+  cudf::test::fixed_width_column_wrapper<int32_t> lhs3{1, 1};
+  cudf::test::dictionary_column_wrapper<int32_t> rhs3{2, 2};
 
-  EXPECT_FALSE(column_types_equal(lhs3, rhs3));
+  EXPECT_FALSE(cudf::have_same_types(lhs3, rhs3));
 
-  lists_column_wrapper<int32_t> lhs4{{8, 8, 8}, {10, 10}};
-  structs_column_wrapper rhs4{rhs2, rhs3};
+  cudf::test::lists_column_wrapper<int32_t> lhs4{{8, 8, 8}, {10, 10}};
+  cudf::test::structs_column_wrapper rhs4{rhs2, rhs3};
 
-  EXPECT_FALSE(column_types_equal(lhs4, rhs4));
+  EXPECT_FALSE(cudf::have_same_types(lhs4, rhs4));
 }
 
 TEST_F(ColumnTypeCheckTest, DifferentFixedWidth)
 {
-  fixed_width_column_wrapper<int32_t> lhs1{1, 1};
-  fixed_width_column_wrapper<int64_t> rhs1{2};
+  cudf::test::fixed_width_column_wrapper<int32_t> lhs1{1, 1};
+  cudf::test::fixed_width_column_wrapper<int64_t> rhs1{2};
 
-  EXPECT_FALSE(column_types_equal(lhs1, rhs1));
+  EXPECT_FALSE(cudf::have_same_types(lhs1, rhs1));
 
-  fixed_width_column_wrapper<float> lhs2{1, 1};
-  fixed_width_column_wrapper<double> rhs2{2};
+  cudf::test::fixed_width_column_wrapper<float> lhs2{1, 1};
+  cudf::test::fixed_width_column_wrapper<double> rhs2{2};
 
-  EXPECT_FALSE(column_types_equal(lhs2, rhs2));
+  EXPECT_FALSE(cudf::have_same_types(lhs2, rhs2));
 
-  fixed_width_column_wrapper<timestamp_ms> lhs3{1, 1};
-  fixed_width_column_wrapper<timestamp_us> rhs3{2};
+  cudf::test::fixed_width_column_wrapper<cudf::timestamp_ms> lhs3{1, 1};
+  cudf::test::fixed_width_column_wrapper<cudf::timestamp_us> rhs3{2};
 
-  EXPECT_FALSE(column_types_equal(lhs3, rhs3));
+  EXPECT_FALSE(cudf::have_same_types(lhs3, rhs3));
 
-  fixed_width_column_wrapper<duration_D> lhs4{};
-  fixed_width_column_wrapper<duration_us> rhs4{42};
+  cudf::test::fixed_width_column_wrapper<cudf::duration_D> lhs4{};
+  cudf::test::fixed_width_column_wrapper<cudf::duration_us> rhs4{42};
 
-  EXPECT_FALSE(column_types_equal(lhs4, rhs4));
+  EXPECT_FALSE(cudf::have_same_types(lhs4, rhs4));
 
   // Same rep, different scale
-  fixed_point_column_wrapper<int32_t> lhs5({10000}, numeric::scale_type{-3});
-  fixed_point_column_wrapper<int32_t> rhs5({10000}, numeric::scale_type{0});
+  cudf::test::fixed_point_column_wrapper<int32_t> lhs5({10000}, numeric::scale_type{-3});
+  cudf::test::fixed_point_column_wrapper<int32_t> rhs5({10000}, numeric::scale_type{0});
 
-  EXPECT_FALSE(column_types_equal(lhs5, rhs5));
-  EXPECT_TRUE(column_types_equivalent(lhs5, rhs5));
+  EXPECT_FALSE(cudf::have_same_types(lhs5, rhs5));
+  EXPECT_TRUE(cudf::column_types_equivalent(lhs5, rhs5));
 
   // Different rep, same scale
-  fixed_point_column_wrapper<int32_t> lhs6({10000}, numeric::scale_type{-1});
-  fixed_point_column_wrapper<int64_t> rhs6({4200}, numeric::scale_type{-1});
+  cudf::test::fixed_point_column_wrapper<int32_t> lhs6({10000}, numeric::scale_type{-1});
+  cudf::test::fixed_point_column_wrapper<int64_t> rhs6({4200}, numeric::scale_type{-1});
 
-  EXPECT_FALSE(column_types_equal(lhs6, rhs6));
+  EXPECT_FALSE(cudf::have_same_types(lhs6, rhs6));
 }
 
 TEST_F(ColumnTypeCheckTest, DifferentDictionary)
 {
-  dictionary_column_wrapper<int32_t, uint32_t> lhs1{1, 1, 1, 2, 2, 3};
-  dictionary_column_wrapper<int64_t, uint32_t> rhs1{0, 0, 42, 42};
+  cudf::test::dictionary_column_wrapper<int32_t, uint32_t> lhs1{1, 1, 1, 2, 2, 3};
+  cudf::test::dictionary_column_wrapper<int64_t, uint32_t> rhs1{0, 0, 42, 42};
 
-  EXPECT_FALSE(column_types_equal(lhs1, rhs1));
+  EXPECT_FALSE(cudf::have_same_types(lhs1, rhs1));
 
-  dictionary_column_wrapper<double, uint32_t> lhs2{3.14, 3.14, 5.00};
-  dictionary_column_wrapper<int64_t, uint32_t> rhs2{0, 0, 42, 42};
+  cudf::test::dictionary_column_wrapper<double, uint32_t> lhs2{3.14, 3.14, 5.00};
+  cudf::test::dictionary_column_wrapper<int64_t, uint32_t> rhs2{0, 0, 42, 42};
 
-  EXPECT_FALSE(column_types_equal(lhs2, rhs2));
+  EXPECT_FALSE(cudf::have_same_types(lhs2, rhs2));
 
-  dictionary_column_wrapper<int32_t, uint32_t> lhs3{1, 1, 1, 2, 2, 3};
-  dictionary_column_wrapper<duration_s, uint32_t> rhs3{8, 8};
+  cudf::test::dictionary_column_wrapper<int32_t, uint32_t> lhs3{1, 1, 1, 2, 2, 3};
+  cudf::test::dictionary_column_wrapper<cudf::duration_s, uint32_t> rhs3{8, 8};
 
-  EXPECT_FALSE(column_types_equal(lhs3, rhs3));
+  EXPECT_FALSE(cudf::have_same_types(lhs3, rhs3));
 
-  dictionary_column_wrapper<int32_t, uint32_t> lhs4{1, 1, 2, 3}, rhs4{};
-  EXPECT_FALSE(column_types_equal(lhs4, rhs4));
+  cudf::test::dictionary_column_wrapper<int32_t, uint32_t> lhs4{1, 1, 2, 3}, rhs4{};
+  EXPECT_FALSE(cudf::have_same_types(lhs4, rhs4));
 }
 
 TEST_F(ColumnTypeCheckTest, DifferentLists)
 {
-  using LCW_i = lists_column_wrapper<int32_t>;
-  using LCW_f = lists_column_wrapper<float>;
+  using LCW_i = cudf::test::lists_column_wrapper<int32_t>;
+  using LCW_f = cudf::test::lists_column_wrapper<float>;
 
   // Different nested level
   LCW_i lhs1{LCW_i{1, 1, 2, 3}, LCW_i{}, LCW_i{42, 42}};
   LCW_i rhs1{LCW_i{LCW_i{8, 8, 8}, LCW_i{9, 9}}, LCW_i{LCW_i{42, 42}}};
 
-  EXPECT_FALSE(column_types_equal(lhs1, rhs1));
+  EXPECT_FALSE(cudf::have_same_types(lhs1, rhs1));
 
   // Different base column type
   LCW_i lhs2{LCW_i{1, 1, 2, 3}, LCW_i{}, LCW_i{42, 42}};
   LCW_f rhs2{LCW_f{9.0, 9.1}, LCW_f{3.14}, LCW_f{}};
 
-  EXPECT_FALSE(column_types_equal(lhs2, rhs2));
+  EXPECT_FALSE(cudf::have_same_types(lhs2, rhs2));
 }
 
 TEST_F(ColumnTypeCheckTest, DifferentStructs)
 {
-  fixed_width_column_wrapper<int32_t> lf1{1, 1, 1};
-  fixed_width_column_wrapper<int64_t> rf1{2, 2};
+  cudf::test::fixed_width_column_wrapper<int32_t> lf1{1, 1, 1};
+  cudf::test::fixed_width_column_wrapper<int64_t> rf1{2, 2};
+
+  cudf::test::structs_column_wrapper lhs1{lf1};
+  cudf::test::structs_column_wrapper rhs1{rf1};
 
-  structs_column_wrapper lhs1{lf1};
-  structs_column_wrapper rhs1{rf1};
+  EXPECT_FALSE(cudf::have_same_types(lhs1, rhs1));
 
-  EXPECT_FALSE(column_types_equal(lhs1, rhs1));
+  cudf::test::fixed_width_column_wrapper<int32_t> lf2{1, 1, 1};
+  cudf::test::fixed_width_column_wrapper<int32_t> rf2{2, 2};
 
-  fixed_width_column_wrapper<int32_t> lf2{1, 1, 1};
-  fixed_width_column_wrapper<int32_t> rf2{2, 2};
+  cudf::test::strings_column_wrapper lf3{"a", "b", "c"};
 
-  strings_column_wrapper lf3{"a", "b", "c"};
+  cudf::test::structs_column_wrapper lhs2{lf2, lf3};
+  cudf::test::structs_column_wrapper rhs2{rf2};
 
-  structs_column_wrapper lhs2{lf2, lf3};
-  structs_column_wrapper rhs2{rf2};
+  EXPECT_FALSE(cudf::have_same_types(lhs2, rhs2));
+}
 
-  EXPECT_FALSE(column_types_equal(lhs2, rhs2));
+TYPED_TEST(ColumnTypeCheckTestTyped, AllTypesEqual)
+{
+  {
+    // An empty table
+    cudf::table_view tbl{};
+    EXPECT_TRUE(cudf::all_have_same_types(tbl.begin(), tbl.end()));
+  }
+
+  {
+    // A table with one column
+    cudf::test::fixed_width_column_wrapper<TypeParam> col1{1, 2, 3};
+    cudf::table_view tbl{{col1}};
+    EXPECT_TRUE(cudf::all_have_same_types(tbl.begin(), tbl.end()));
+  }
+
+  {
+    // A table with all the same types
+    cudf::test::fixed_width_column_wrapper<TypeParam> col1{1, 2, 3};
+    cudf::test::fixed_width_column_wrapper<TypeParam> col2{4, 5, 6};
+    cudf::test::fixed_width_column_wrapper<TypeParam> col3{7, 8, 9};
+    cudf::table_view tbl{{col1, col2, col3}};
+    EXPECT_TRUE(cudf::all_have_same_types(tbl.begin(), tbl.end()));
+  }
 }
 
-}  // namespace test
-}  // namespace cudf
+TEST_F(ColumnTypeCheckTest, AllTypesNotEqual)
+{
+  // A table with different types
+  cudf::test::fixed_width_column_wrapper<int> col1{1, 2, 3};
+  cudf::test::fixed_width_column_wrapper<float> col2{3.14, 1.57, 2.71};
+  cudf::table_view tbl{{col1, col2}};
+  EXPECT_FALSE(cudf::all_have_same_types(tbl.begin(), tbl.end()));
+}

From 8cc4cc16fc6538f706b9f795d0879bdd0ba442a1 Mon Sep 17 00:00:00 2001
From: Allison Piper <apiper@nvidia.com>
Date: Thu, 2 May 2024 14:50:20 -0400
Subject: [PATCH 134/842] Remove NVBench SHA override. (#15633)

The override is no longer necessary as rapids-cmake now uses the same version that was set by the override.

Refs rapidsai/rapids-cmake#584, #15492

Authors:
  - Allison Piper (https://github.com/alliepiper)
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Robert Maynard (https://github.com/robertmaynard)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/15633
---
 cpp/cmake/thirdparty/get_nvbench.cmake             | 5 +----
 cpp/cmake/thirdparty/patches/nvbench_override.json | 9 ---------
 2 files changed, 1 insertion(+), 13 deletions(-)
 delete mode 100644 cpp/cmake/thirdparty/patches/nvbench_override.json

diff --git a/cpp/cmake/thirdparty/get_nvbench.cmake b/cpp/cmake/thirdparty/get_nvbench.cmake
index bbd22693ba4..84c27dd9d56 100644
--- a/cpp/cmake/thirdparty/get_nvbench.cmake
+++ b/cpp/cmake/thirdparty/get_nvbench.cmake
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -18,9 +18,6 @@ function(find_and_configure_nvbench)
   include(${rapids-cmake-dir}/cpm/nvbench.cmake)
   include(${rapids-cmake-dir}/cpm/package_override.cmake)
 
-  set(cudf_patch_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/patches")
-  rapids_cpm_package_override("${cudf_patch_dir}/nvbench_override.json")
-
   rapids_cpm_nvbench(BUILD_STATIC)
 
 endfunction()
diff --git a/cpp/cmake/thirdparty/patches/nvbench_override.json b/cpp/cmake/thirdparty/patches/nvbench_override.json
deleted file mode 100644
index ef0deb4c1e9..00000000000
--- a/cpp/cmake/thirdparty/patches/nvbench_override.json
+++ /dev/null
@@ -1,9 +0,0 @@
-
-{
-  "packages" : {
-    "nvbench" : {
-      "git_url": "https://github.com/NVIDIA/nvbench.git",
-      "git_tag": "555d628e9b250868c9da003e4407087ff1982e8e"
-    }
-  }
-}

From 2ee0219a8255beb7b21628648387e3284a0ee0bc Mon Sep 17 00:00:00 2001
From: Tim Liu <timl@nvidia.com>
Date: Fri, 3 May 2024 03:00:48 +0800
Subject: [PATCH 135/842] Drop Centos7 support (#15608)

To fix https://github.com/rapidsai/cudf/issues/15583

We plan to drop CentOS 7 (which uses glibc 2.17) RAPIDS 24.06. The [java/ci/Dockerfile.centos7](https://github.com/rapidsai/cudf/blob/branch-24.06/java/ci/Dockerfile.centos7), refor or to :

https://docs.rapids.ai/notices/rsn0037/

https://github.com/rapidsai/build-planning/issues/23

Change to run build in Rocky8 docker container

Authors:
  - Tim Liu (https://github.com/NvTimLiu)
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Jason Lowe (https://github.com/jlowe)
  - Bradley Dice (https://github.com/bdice)
  - Ray Douglass (https://github.com/raydouglass)

URL: https://github.com/rapidsai/cudf/pull/15608
---
 build.sh                   |  8 ++---
 java/ci/Dockerfile.centos7 | 56 ----------------------------------
 java/ci/Dockerfile.rocky   | 62 ++++++++++++++++++++++++++++++++++++++
 java/ci/README.md          |  8 ++---
 4 files changed, 70 insertions(+), 64 deletions(-)
 delete mode 100644 java/ci/Dockerfile.centos7
 create mode 100644 java/ci/Dockerfile.rocky

diff --git a/build.sh b/build.sh
index e5daf2f3451..43bb04f7a18 100755
--- a/build.sh
+++ b/build.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 
 # cuDF build script
 
@@ -109,8 +109,8 @@ function buildAll {
 }
 
 function buildLibCudfJniInDocker {
-    local cudaVersion="11.5.0"
-    local imageName="cudf-build:${cudaVersion}-devel-centos7"
+    local cudaVersion="11.8.0"
+    local imageName="cudf-build:${cudaVersion}-devel-rocky8"
     local CMAKE_GENERATOR="${CMAKE_GENERATOR:-Ninja}"
     local workspaceDir="/rapids"
     local localMavenRepo=${LOCAL_MAVEN_REPO:-"$HOME/.m2/repository"}
@@ -120,7 +120,7 @@ function buildLibCudfJniInDocker {
     mkdir -p "$CUDF_JAR_JAVA_BUILD_DIR/libcudf-cmake-build"
     mkdir -p "$HOME/.ccache" "$HOME/.m2"
     nvidia-docker build \
-        -f java/ci/Dockerfile.centos7 \
+        -f java/ci/Dockerfile.rocky \
         --build-arg CUDA_VERSION=${cudaVersion} \
         -t $imageName .
     nvidia-docker run -it -u $(id -u):$(id -g) --rm \
diff --git a/java/ci/Dockerfile.centos7 b/java/ci/Dockerfile.centos7
deleted file mode 100644
index b2c620848de..00000000000
--- a/java/ci/Dockerfile.centos7
+++ /dev/null
@@ -1,56 +0,0 @@
-#
-# Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-###
-# Build the image for cudf development environment.
-#
-# Arguments: CUDA_VERSION=11.X.Y
-#
-###
-ARG CUDA_VERSION=11.8.0
-FROM nvidia/cuda:$CUDA_VERSION-devel-centos7
-
-### Install basic requirements
-ARG DEVTOOLSET_VERSION=11
-RUN yum install -y centos-release-scl
-RUN yum install -y devtoolset-${DEVTOOLSET_VERSION} epel-release
-RUN yum install -y git zlib-devel maven tar wget patch ninja-build
-
-## pre-create the CMAKE_INSTALL_PREFIX folder, set writable by any user for Jenkins
-RUN mkdir /usr/local/rapids && mkdir /rapids && chmod 777 /usr/local/rapids && chmod 777 /rapids
-
-ARG CMAKE_VERSION=3.26.4
-RUN cd /usr/local/ && wget --quiet https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz && \
-   tar zxf cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz && \
-   rm cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz
-
-ENV PATH /usr/local/cmake-${CMAKE_VERSION}-linux-x86_64/bin:$PATH
-
-ARG CCACHE_VERSION=4.6
-RUN cd /tmp && wget --quiet https://github.com/ccache/ccache/releases/download/v${CCACHE_VERSION}/ccache-${CCACHE_VERSION}.tar.gz && \
-   tar zxf ccache-${CCACHE_VERSION}.tar.gz && \
-   rm ccache-${CCACHE_VERSION}.tar.gz && \
-   cd ccache-${CCACHE_VERSION} && \
-   mkdir build && \
-   cd build && \
-   scl enable devtoolset-${DEVTOOLSET_VERSION} \
-      "cmake .. \
-         -DCMAKE_BUILD_TYPE=Release \
-         -DZSTD_FROM_INTERNET=ON \
-         -DREDIS_STORAGE_BACKEND=OFF && \
-      cmake --build . --parallel ${PARALLEL_LEVEL} --target install" && \
-   cd ../.. && \
-   rm -rf ccache-${CCACHE_VERSION}
diff --git a/java/ci/Dockerfile.rocky b/java/ci/Dockerfile.rocky
new file mode 100644
index 00000000000..6b87f3ed34e
--- /dev/null
+++ b/java/ci/Dockerfile.rocky
@@ -0,0 +1,62 @@
+#
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+###
+# Build the image for cudf development environment.
+#
+# Arguments: CUDA_VERSION=[11.X.Y, 12.X.Y], OS_RELEASE=[8, 9], TARGETPLATFORM=[linux/amd64, linux/amd64]
+#
+###
+ARG CUDA_VERSION=11.8.0
+ARG OS_RELEASE=8
+ARG TARGETPLATFORM=linux/amd64
+# multi-platform build with: docker buildx build --platform linux/arm64,linux/amd64 <ARGS> on either amd64 or arm64 host
+# check available official arm-based docker images at https://hub.docker.com/r/nvidia/cuda/tags (OS/ARCH)
+FROM --platform=$TARGETPLATFORM nvidia/cuda:$CUDA_VERSION-devel-rockylinux$OS_RELEASE
+ARG TOOLSET_VERSION=11
+### Install basic requirements
+RUN dnf --enablerepo=powertools install -y  scl-utils gcc-toolset-${TOOLSET_VERSION} git zlib-devel maven tar wget patch ninja-build
+## pre-create the CMAKE_INSTALL_PREFIX folder, set writable by any user for Jenkins
+RUN mkdir /usr/local/rapids /rapids && chmod 777 /usr/local/rapids /rapids
+
+# 3.22.3+: CUDA architecture 'native' support + flexible CMAKE_<LANG>_*_LAUNCHER for ccache
+ARG CMAKE_VERSION=3.26.4
+# default x86_64 from x86 build, aarch64 cmake for arm build
+ARG CMAKE_ARCH=x86_64
+RUN cd /usr/local && wget --quiet https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-${CMAKE_ARCH}.tar.gz && \
+   tar zxf cmake-${CMAKE_VERSION}-linux-${CMAKE_ARCH}.tar.gz && \
+   rm cmake-${CMAKE_VERSION}-linux-${CMAKE_ARCH}.tar.gz
+ENV PATH /usr/local/cmake-${CMAKE_VERSION}-linux-${CMAKE_ARCH}/bin:$PATH
+
+# ccache for interactive builds
+ARG CCACHE_VERSION=4.6
+RUN cd /tmp && wget --quiet https://github.com/ccache/ccache/releases/download/v${CCACHE_VERSION}/ccache-${CCACHE_VERSION}.tar.gz && \
+   tar zxf ccache-${CCACHE_VERSION}.tar.gz && \
+   rm ccache-${CCACHE_VERSION}.tar.gz && \
+   cd ccache-${CCACHE_VERSION} && \
+   mkdir build && \
+   cd build && \
+   scl enable gcc-toolset-${TOOLSET_VERSION} \
+      "cmake .. \
+         -DCMAKE_BUILD_TYPE=Release \
+         -DZSTD_FROM_INTERNET=ON \
+         -DREDIS_STORAGE_BACKEND=OFF && \
+      cmake --build . --parallel 4 --target install" && \
+   cd ../.. && \
+   rm -rf ccache-${CCACHE_VERSION}
+
+# disable cuda container constraints to allow running w/ elder drivers on data-center GPUs
+ENV NVIDIA_DISABLE_REQUIRE="true"
diff --git a/java/ci/README.md b/java/ci/README.md
index da24c5923ea..18ad3cc4d0d 100644
--- a/java/ci/README.md
+++ b/java/ci/README.md
@@ -11,14 +11,14 @@
 
 In the root path of cuDF repo, run below command to build the docker image.
 ```bash
-docker build -f java/ci/Dockerfile.centos7 --build-arg CUDA_VERSION=11.8.0 -t cudf-build:11.8.0-devel-centos7 .
+docker build -f java/ci/Dockerfile.rocky --build-arg CUDA_VERSION=11.8.0 -t cudf-build:11.8.0-devel-rocky8 .
 ```
 
 The following CUDA versions are supported w/ CUDA Enhanced Compatibility:
 * CUDA 11.0+
 
 Change the --build-arg CUDA_VERSION to what you need.
-You can replace the tag "cudf-build:11.8.0-devel-centos7" with another name you like.
+You can replace the tag "cudf-build:11.8.0-devel-rocky8" with another name you like.
 
 ## Start the docker then build
 
@@ -26,7 +26,7 @@ You can replace the tag "cudf-build:11.8.0-devel-centos7" with another name you
 
 Run below command to start a docker container with GPU.
 ```bash
-nvidia-docker run -it cudf-build:11.8.0-devel-centos7 bash
+nvidia-docker run -it cudf-build:11.8.0-devel-rocky8 bash
 ```
 
 ### Download the cuDF source code
@@ -42,7 +42,7 @@ git clone --recursive https://github.com/rapidsai/cudf.git -b branch-24.06
 ```bash
 cd cudf
 export WORKSPACE=`pwd`
-scl enable devtoolset-11 "java/ci/build-in-docker.sh"
+scl enable gcc-toolset-11 "java/ci/build-in-docker.sh"
 ```
 
 ### The output

From 2fccbc0ba4af7a76c47553ea578d517d2db8e297 Mon Sep 17 00:00:00 2001
From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com>
Date: Thu, 2 May 2024 14:45:03 -0500
Subject: [PATCH 136/842] Add JSON option to prune columns (#14996)

Resolves https://github.com/rapidsai/cudf/issues/14951
This adds an option `prune_columns` to json_reader_options (default False)
When set to True, the dtypes option is used as filter instead of type inference suggestion. If dtypes (vector of dtypes, map of dtypes or nested schema), is not specified, output is empty dataframe.

Authors:
  - Karthikeyan (https://github.com/karthikeyann)

Approvers:
  - Robert (Bobby) Evans (https://github.com/revans2)
  - MithunR (https://github.com/mythrocks)
  - Mike Wilson (https://github.com/hyperbolic2346)
  - Shruti Shivakumar (https://github.com/shrshi)
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/14996
---
 cpp/include/cudf/io/json.hpp          |  40 +++++
 cpp/src/io/json/json_column.cu        | 143 ++++++++++++------
 cpp/src/io/json/nested_json.hpp       |   2 +-
 cpp/src/io/json/parser_features.cpp   |  15 +-
 cpp/tests/io/json_test.cpp            | 205 +++++++++++++++++++++++++-
 python/cudf/cudf/_lib/cpp/io/json.pxd |   5 +
 python/cudf/cudf/_lib/json.pyx        |   4 +-
 python/cudf/cudf/io/json.py           |   2 +
 python/cudf/cudf/utils/ioutils.py     |  16 +-
 9 files changed, 377 insertions(+), 55 deletions(-)

diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
index a6112b8db4c..7374ffc37e6 100644
--- a/cpp/include/cudf/io/json.hpp
+++ b/cpp/include/cudf/io/json.hpp
@@ -101,6 +101,8 @@ class json_reader_options {
   bool _lines = false;
   // Parse mixed types as a string column
   bool _mixed_types_as_string = false;
+  // Prune columns on read, selected based on the _dtypes option
+  bool _prune_columns = false;
 
   // Bytes to skip from the start
   size_t _byte_range_offset = 0;
@@ -241,6 +243,17 @@ class json_reader_options {
    */
   bool is_enabled_mixed_types_as_string() const { return _mixed_types_as_string; }
 
+  /**
+   * @brief Whether to prune columns on read, selected based on the @ref set_dtypes option.
+   *
+   * When set as true, if the reader options include @ref set_dtypes, then
+   * the reader will only return those columns which are mentioned in @ref set_dtypes.
+   * If false, then all columns are returned, independent of the @ref set_dtypes setting.
+   *
+   * @return True if column pruning is enabled
+   */
+  bool is_enabled_prune_columns() const { return _prune_columns; }
+
   /**
    * @brief Whether to parse dates as DD/MM versus MM/DD.
    *
@@ -342,6 +355,17 @@ class json_reader_options {
    */
   void enable_mixed_types_as_string(bool val) { _mixed_types_as_string = val; }
 
+  /**
+   * @brief Set whether to prune columns on read, selected based on the @ref set_dtypes option.
+   *
+   * When set as true, if the reader options include @ref set_dtypes, then
+   * the reader will only return those columns which are mentioned in @ref set_dtypes.
+   * If false, then all columns are returned, independent of the @ref set_dtypes setting.
+   *
+   * @param val Boolean value to enable/disable column pruning
+   */
+  void enable_prune_columns(bool val) { _prune_columns = val; }
+
   /**
    * @brief Set whether to parse dates as DD/MM versus MM/DD.
    *
@@ -508,6 +532,22 @@ class json_reader_options_builder {
     return *this;
   }
 
+  /**
+   * @brief Set whether to prune columns on read, selected based on the @ref dtypes option.
+   *
+   * When set as true, if the reader options include @ref dtypes, then
+   * the reader will only return those columns which are mentioned in @ref dtypes.
+   * If false, then all columns are returned, independent of the @ref dtypes setting.
+   *
+   * @param val Boolean value to enable/disable column pruning
+   * @return this for chaining
+   */
+  json_reader_options_builder& prune_columns(bool val)
+  {
+    options._prune_columns = val;
+    return *this;
+  }
+
   /**
    * @brief Set whether to parse dates as DD/MM versus MM/DD.
    *
diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu
index 7117af8948b..631f8adbd6d 100644
--- a/cpp/src/io/json/json_column.cu
+++ b/cpp/src/io/json/json_column.cu
@@ -564,7 +564,7 @@ void make_device_json_column(device_span<SymbolT const> input,
     }
   };
   auto init_to_zero = [stream](auto& v) {
-    thrust::uninitialized_fill(rmm::exec_policy(stream), v.begin(), v.end(), 0);
+    thrust::uninitialized_fill(rmm::exec_policy_nosync(stream), v.begin(), v.end(), 0);
   };
 
   auto initialize_json_columns = [&](auto i, auto& col) {
@@ -625,13 +625,14 @@ void make_device_json_column(device_span<SymbolT const> input,
   // find column_ids which are values, but should be ignored in validity
   std::vector<uint8_t> ignore_vals(num_columns, 0);
   std::vector<uint8_t> is_mixed_type_column(num_columns, 0);
+  std::vector<uint8_t> is_pruned(num_columns, 0);
   columns.try_emplace(parent_node_sentinel, std::ref(root));
 
-  for (auto const this_col_id : unique_col_ids) {
-    if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) {
-      continue;
-    }
-    // Struct, List, String, Value
+  auto name_and_parent_index = [&is_array_of_arrays,
+                                &row_array_parent_col_id,
+                                &column_parent_ids,
+                                &column_categories,
+                                &column_names](auto this_col_id) {
     std::string name   = "";
     auto parent_col_id = column_parent_ids[this_col_id];
     if (parent_col_id == parent_node_sentinel || column_categories[parent_col_id] == NC_LIST) {
@@ -647,11 +648,46 @@ void make_device_json_column(device_span<SymbolT const> input,
     } else {
       CUDF_FAIL("Unexpected parent column category");
     }
+    return std::pair{name, parent_col_id};
+  };
+
+  // Prune columns that are not required to be parsed.
+  if (options.is_enabled_prune_columns()) {
+    for (auto const this_col_id : unique_col_ids) {
+      if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) {
+        continue;
+      }
+      // Struct, List, String, Value
+      auto [name, parent_col_id] = name_and_parent_index(this_col_id);
+      // get path of this column, and get its dtype if present in options
+      auto const nt                             = tree_path.get_path(this_col_id);
+      std::optional<data_type> const user_dtype = get_path_data_type(nt, options);
+      if (!user_dtype.has_value() and parent_col_id != parent_node_sentinel) {
+        is_pruned[this_col_id] = 1;
+        continue;
+      } else {
+        // make sure all its parents are not pruned.
+        while (parent_col_id != parent_node_sentinel and is_pruned[parent_col_id] == 1) {
+          is_pruned[parent_col_id] = 0;
+          parent_col_id            = column_parent_ids[parent_col_id];
+        }
+      }
+    }
+  }
+
+  // Build the column tree, also, handles mixed types.
+  for (auto const this_col_id : unique_col_ids) {
+    if (column_categories[this_col_id] == NC_ERR || column_categories[this_col_id] == NC_FN) {
+      continue;
+    }
+    // Struct, List, String, Value
+    auto [name, parent_col_id] = name_and_parent_index(this_col_id);
 
-    if (parent_col_id != parent_node_sentinel && is_mixed_type_column[parent_col_id] == 1) {
-      // if parent is mixed type column, ignore this column.
-      is_mixed_type_column[this_col_id] = 1;
-      ignore_vals[this_col_id]          = 1;
+    // if parent is mixed type column or this column is pruned, ignore this column.
+    if (parent_col_id != parent_node_sentinel &&
+        (is_mixed_type_column[parent_col_id] || is_pruned[this_col_id])) {
+      ignore_vals[this_col_id] = 1;
+      if (is_mixed_type_column[parent_col_id]) { is_mixed_type_column[this_col_id] = 1; }
       continue;
     }
 
@@ -714,12 +750,13 @@ void make_device_json_column(device_span<SymbolT const> input,
                      "A mix of lists and structs within the same column is not supported");
       }
     }
+
     if (is_enabled_mixed_types_as_string) {
       // get path of this column, check if it is a struct forced as string, and enforce it
-      auto nt                          = tree_path.get_path(this_col_id);
-      std::optional<data_type> user_dt = get_path_data_type(nt, options);
-      if (column_categories[this_col_id] == NC_STRUCT and user_dt.has_value() and
-          user_dt.value().id() == type_id::STRING) {
+      auto const nt                             = tree_path.get_path(this_col_id);
+      std::optional<data_type> const user_dtype = get_path_data_type(nt, options);
+      if (column_categories[this_col_id] == NC_STRUCT and user_dtype.has_value() and
+          user_dtype.value().id() == type_id::STRING) {
         is_mixed_type_column[this_col_id] = 1;
         column_categories[this_col_id]    = NC_STR;
       }
@@ -873,25 +910,27 @@ void make_device_json_column(device_span<SymbolT const> input,
   for (auto& [id, col_ref] : columns) {
     auto& col = col_ref.get();
     if (col.type == json_col_t::StringColumn) {
-      thrust::inclusive_scan(rmm::exec_policy(stream),
+      thrust::inclusive_scan(rmm::exec_policy_nosync(stream),
                              col.string_offsets.begin(),
                              col.string_offsets.end(),
                              col.string_offsets.begin(),
                              thrust::maximum<json_column::row_offset_t>{});
     } else if (col.type == json_col_t::ListColumn) {
-      thrust::inclusive_scan(rmm::exec_policy(stream),
+      thrust::inclusive_scan(rmm::exec_policy_nosync(stream),
                              col.child_offsets.begin(),
                              col.child_offsets.end(),
                              col.child_offsets.begin(),
                              thrust::maximum<json_column::row_offset_t>{});
     }
   }
+  stream.synchronize();
 }
 
 std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_column_to_cudf_column(
   device_json_column& json_col,
   device_span<SymbolT const> d_input,
   cudf::io::parse_options const& options,
+  bool prune_columns,
   std::optional<schema_element> schema,
   rmm::cuda_stream_view stream,
   rmm::device_async_resource_ref mr)
@@ -982,13 +1021,16 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
       for (auto const& col_name : json_col.column_order) {
         auto const& col = json_col.child_columns.find(col_name);
         column_names.emplace_back(col->first);
-        auto& child_col            = col->second;
-        auto [child_column, names] = device_json_column_to_cudf_column(
-          child_col, d_input, options, get_child_schema(col_name), stream, mr);
-        CUDF_EXPECTS(num_rows == child_column->size(),
-                     "All children columns must have the same size");
-        child_columns.push_back(std::move(child_column));
-        column_names.back().children = names;
+        auto& child_col           = col->second;
+        auto child_schema_element = get_child_schema(col_name);
+        if (!prune_columns or child_schema_element.has_value()) {
+          auto [child_column, names] = device_json_column_to_cudf_column(
+            child_col, d_input, options, prune_columns, child_schema_element, stream, mr);
+          CUDF_EXPECTS(num_rows == child_column->size(),
+                       "All children columns must have the same size");
+          child_columns.push_back(std::move(child_column));
+          column_names.back().children = names;
+        }
       }
       auto [result_bitmask, null_count] = make_validity(json_col);
       // The null_mask is set after creation of struct column is to skip the superimpose_nulls and
@@ -1011,8 +1053,11 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
                                                      rmm::device_buffer{},
                                                      0);
       // Create children column
+      auto child_schema_element = json_col.child_columns.empty()
+                                    ? std::optional<schema_element>{}
+                                    : get_child_schema(json_col.child_columns.begin()->first);
       auto [child_column, names] =
-        json_col.child_columns.empty()
+        json_col.child_columns.empty() or (prune_columns and !child_schema_element.has_value())
           ? std::pair<std::unique_ptr<column>,
                       // EMPTY type could not used because gather throws exception on EMPTY type.
                       std::vector<column_name_info>>{std::make_unique<column>(
@@ -1022,13 +1067,13 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
                                                        rmm::device_buffer{},
                                                        0),
                                                      std::vector<column_name_info>{}}
-          : device_json_column_to_cudf_column(
-              json_col.child_columns.begin()->second,
-              d_input,
-              options,
-              get_child_schema(json_col.child_columns.begin()->first),
-              stream,
-              mr);
+          : device_json_column_to_cudf_column(json_col.child_columns.begin()->second,
+                                              d_input,
+                                              options,
+                                              prune_columns,
+                                              child_schema_element,
+                                              stream,
+                                              mr);
       column_names.back().children      = names;
       auto [result_bitmask, null_count] = make_validity(json_col);
       auto ret_col                      = make_lists_column(num_rows,
@@ -1140,8 +1185,6 @@ table_with_metadata device_parse_nested_json(device_span<SymbolT const> d_input,
   size_type column_index = 0;
   for (auto const& col_name : root_struct_col.column_order) {
     auto& json_col = root_struct_col.child_columns.find(col_name)->second;
-    // Insert this columns name into the schema
-    out_column_names.emplace_back(col_name);
 
     std::optional<schema_element> child_schema_element = std::visit(
       cudf::detail::visitor_overload{
@@ -1184,18 +1227,28 @@ table_with_metadata device_parse_nested_json(device_span<SymbolT const> d_input,
     debug_schema_print(child_schema_element);
 #endif
 
-    // Get this JSON column's cudf column and schema info, (modifies json_col)
-    auto [cudf_col, col_name_info] = device_json_column_to_cudf_column(
-      json_col, d_input, parse_opt, child_schema_element, stream, mr);
-    // TODO: RangeIndex as DataFrame.columns names for array of arrays
-    // if (is_array_of_arrays) {
-    //   col_name_info.back().name = "";
-    // }
-
-    out_column_names.back().children = std::move(col_name_info);
-    out_columns.emplace_back(std::move(cudf_col));
-
-    column_index++;
+    if (!options.is_enabled_prune_columns() or child_schema_element.has_value()) {
+      // Get this JSON column's cudf column and schema info, (modifies json_col)
+      auto [cudf_col, col_name_info] =
+        device_json_column_to_cudf_column(json_col,
+                                          d_input,
+                                          parse_opt,
+                                          options.is_enabled_prune_columns(),
+                                          child_schema_element,
+                                          stream,
+                                          mr);
+      // Insert this column's name into the schema
+      out_column_names.emplace_back(col_name);
+      // TODO: RangeIndex as DataFrame.columns names for array of arrays
+      // if (is_array_of_arrays) {
+      //   col_name_info.back().name = "";
+      // }
+
+      out_column_names.back().children = std::move(col_name_info);
+      out_columns.emplace_back(std::move(cudf_col));
+
+      column_index++;
+    }
   }
 
   return table_with_metadata{std::make_unique<table>(std::move(out_columns)), {out_column_names}};
diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp
index a302785cee8..52ea23c7f1c 100644
--- a/cpp/src/io/json/nested_json.hpp
+++ b/cpp/src/io/json/nested_json.hpp
@@ -319,7 +319,7 @@ table_with_metadata device_parse_nested_json(device_span<SymbolT const> input,
  * @return data type of the column if present
  */
 std::optional<data_type> get_path_data_type(
-  host_span<std::pair<std::string, cudf::io::json::NodeT>> path,
+  host_span<std::pair<std::string, cudf::io::json::NodeT> const> path,
   cudf::io::json_reader_options const& options);
 
 /**
diff --git a/cpp/src/io/json/parser_features.cpp b/cpp/src/io/json/parser_features.cpp
index 740b7523cc1..4caa5cd9e24 100644
--- a/cpp/src/io/json/parser_features.cpp
+++ b/cpp/src/io/json/parser_features.cpp
@@ -58,8 +58,15 @@ std::optional<schema_element> child_schema_element(std::string const& col_name,
 // "a": [ null]         {"a", list}, {"element", str}
 // back() is root.
 // front() is leaf.
+/**
+ * @brief Get the path data type of a column by path if present in input schema
+ *
+ * @param path path of the json column
+ * @param root root of input schema element
+ * @return data type of the column if present, otherwise std::nullopt
+ */
 std::optional<data_type> get_path_data_type(
-  host_span<std::pair<std::string, cudf::io::json::NodeT>> path, schema_element const& root)
+  host_span<std::pair<std::string, cudf::io::json::NodeT> const> path, schema_element const& root)
 {
   if (path.empty() || path.size() == 1) {
     return root.type;
@@ -81,7 +88,7 @@ std::optional<data_type> get_path_data_type(
 }
 
 std::optional<data_type> get_path_data_type(
-  host_span<std::pair<std::string, cudf::io::json::NodeT>> path,
+  host_span<std::pair<std::string, cudf::io::json::NodeT> const> path,
   cudf::io::json_reader_options const& options)
 {
   if (path.empty()) return {};
@@ -98,11 +105,11 @@ std::optional<data_type> get_path_data_type(
 std::vector<path_from_tree::path_rep> path_from_tree::get_path(NodeIndexT this_col_id)
 {
   std::vector<path_rep> path;
-  // TODO Need to stop at row root. so, how to find row root?
+  // stops at root.
   while (this_col_id != parent_node_sentinel) {
     auto type        = column_categories[this_col_id];
     std::string name = "";
-    // TODO make this ifelse into a separate lambda function, along with parent_col_id.
+    // code same as name_and_parent_index lambda.
     auto parent_col_id = column_parent_ids[this_col_id];
     if (parent_col_id == parent_node_sentinel || column_categories[parent_col_id] == NC_LIST) {
       if (is_array_of_arrays && parent_col_id == row_array_parent_col_id) {
diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp
index f0f72d4e794..b25822f6613 100644
--- a/cpp/tests/io/json_test.cpp
+++ b/cpp/tests/io/json_test.cpp
@@ -2233,9 +2233,6 @@ TEST_F(JsonReaderTest, MixedTypes)
         .lines(true);
 
     cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
-    static int num_case                  = 0;
-    num_case++;
-    std::cout << "case:" << num_case << "\n";
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), expected);
   };
   // value + string (not mixed type case)
@@ -2437,4 +2434,206 @@ TEST_F(JsonReaderTest, MapTypes)
           {type_id::LIST, type_id::STRING, type_id::STRING});
 }
 
+// Test case for dtype prune:
+// all paths, only one.
+// one present, another not present, nothing present
+// nested, flat, not-jsonlines
+TEST_F(JsonReaderTest, JsonNestedDtypeFilter)
+{
+  std::string json_stringl = R"(
+    {"a": 1, "b": {"0": "abc", "1": [-1.]}, "c": true}
+    {"a": 1, "b": {"0": "abc"          }, "c": false}
+    {"a": 1, "b": {}}
+    {"a": 1,                              "c": null}
+    )";
+  std::string json_string  = R"([
+    {"a": 1, "b": {"0": "abc", "1": [-1.]}, "c": true},
+    {"a": 1, "b": {"0": "abc"          }, "c": false},
+    {"a": 1, "b": {}},
+    {"a": 1,                              "c": null}
+    ])";
+  for (auto& [json_string, lines] : {std::pair{json_stringl, true}, {json_string, false}}) {
+    cudf::io::json_reader_options in_options =
+      cudf::io::json_reader_options::builder(
+        cudf::io::source_info{json_string.data(), json_string.size()})
+        .prune_columns(true)
+        .lines(lines);
+
+    // include all columns
+    //// schema
+    {
+      std::map<std::string, cudf::io::schema_element> dtype_schema{
+        {"b",
+         {data_type{cudf::type_id::STRUCT},
+          {{"0", {data_type{cudf::type_id::STRING}}},
+           {"1", {data_type{cudf::type_id::LIST}, {{"element", {dtype<float>()}}}}}}}},
+        {"a", {dtype<int32_t>()}},
+        {"c", {dtype<bool>()}},
+      };
+      in_options.set_dtypes(dtype_schema);
+      cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+      // Make sure we have columns "a", "b" and "c"
+      ASSERT_EQ(result.tbl->num_columns(), 3);
+      ASSERT_EQ(result.metadata.schema_info.size(), 3);
+      EXPECT_EQ(result.metadata.schema_info[0].name, "a");
+      EXPECT_EQ(result.metadata.schema_info[1].name, "b");
+      EXPECT_EQ(result.metadata.schema_info[2].name, "c");
+      // "b" children checks
+      ASSERT_EQ(result.metadata.schema_info[1].children.size(), 2);
+      EXPECT_EQ(result.metadata.schema_info[1].children[0].name, "0");
+      EXPECT_EQ(result.metadata.schema_info[1].children[1].name, "1");
+      ASSERT_EQ(result.metadata.schema_info[1].children[1].children.size(), 2);
+      EXPECT_EQ(result.metadata.schema_info[1].children[1].children[0].name, "offsets");
+      EXPECT_EQ(result.metadata.schema_info[1].children[1].children[1].name, "element");
+      // types
+      EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT32);
+      EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::STRUCT);
+      EXPECT_EQ(result.tbl->get_column(2).type().id(), cudf::type_id::BOOL8);
+      EXPECT_EQ(result.tbl->get_column(1).child(0).type().id(), cudf::type_id::STRING);
+      EXPECT_EQ(result.tbl->get_column(1).child(1).type().id(), cudf::type_id::LIST);
+      EXPECT_EQ(result.tbl->get_column(1).child(1).child(0).type().id(), cudf::type_id::INT32);
+      EXPECT_EQ(result.tbl->get_column(1).child(1).child(1).type().id(), cudf::type_id::FLOAT32);
+    }
+    //// vector
+    {
+      std::vector<data_type> types{
+        {dtype<int32_t>()}, data_type{cudf::type_id::STRUCT}, {dtype<bool>()}};
+      in_options.set_dtypes(types);
+      cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+      // Make sure we have columns "a", "b" and "c"
+      ASSERT_EQ(result.tbl->num_columns(), 3);
+      ASSERT_EQ(result.metadata.schema_info.size(), 3);
+      EXPECT_EQ(result.metadata.schema_info[0].name, "a");
+      EXPECT_EQ(result.metadata.schema_info[1].name, "b");
+      EXPECT_EQ(result.metadata.schema_info[2].name, "c");
+    }
+    //// map
+    {
+      std::map<std::string, data_type> dtype_map{
+        {"b",
+         {
+           data_type{cudf::type_id::STRUCT},
+         }},
+        {"a", {dtype<int32_t>()}},
+        {"c", {dtype<bool>()}},
+      };
+      in_options.set_dtypes(dtype_map);
+      cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+      // Make sure we have columns "a", "b" and "c"
+      ASSERT_EQ(result.tbl->num_columns(), 3);
+      ASSERT_EQ(result.metadata.schema_info.size(), 3);
+      EXPECT_EQ(result.metadata.schema_info[0].name, "a");
+      EXPECT_EQ(result.metadata.schema_info[1].name, "b");
+      EXPECT_EQ(result.metadata.schema_info[2].name, "c");
+    }
+
+    // include only one column
+    //// schema
+    {
+      std::map<std::string, cudf::io::schema_element> dtype_schema{
+        {"a", {dtype<int32_t>()}},
+      };
+      in_options.set_dtypes(dtype_schema);
+      cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+      // Make sure we have column "a"
+      ASSERT_EQ(result.tbl->num_columns(), 1);
+      ASSERT_EQ(result.metadata.schema_info.size(), 1);
+      EXPECT_EQ(result.metadata.schema_info[0].name, "a");
+    }
+    //// vector
+    {
+      std::vector<data_type> types{{dtype<int32_t>()}};
+      in_options.set_dtypes(types);
+      cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+      // Make sure we have column "a"
+      ASSERT_EQ(result.tbl->num_columns(), 1);
+      ASSERT_EQ(result.metadata.schema_info.size(), 1);
+      EXPECT_EQ(result.metadata.schema_info[0].name, "a");
+    }
+    //// map
+    {
+      std::map<std::string, data_type> dtype_map{
+        {"a", {dtype<int32_t>()}},
+      };
+      in_options.set_dtypes(dtype_map);
+      cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+      // Make sure we have column "a"
+      ASSERT_EQ(result.tbl->num_columns(), 1);
+      ASSERT_EQ(result.metadata.schema_info.size(), 1);
+      EXPECT_EQ(result.metadata.schema_info[0].name, "a");
+    }
+
+    // include only one column (nested)
+    {
+      std::map<std::string, cudf::io::schema_element> dtype_schema{
+        {"b",
+         {data_type{cudf::type_id::STRUCT},
+          {{"1", {data_type{cudf::type_id::LIST}, {{"element", {dtype<float>()}}}}}}}},
+      };
+      in_options.set_dtypes(dtype_schema);
+      cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+      // Make sure we have column "b":"1":[float]
+      ASSERT_EQ(result.tbl->num_columns(), 1);
+      ASSERT_EQ(result.metadata.schema_info.size(), 1);
+      EXPECT_EQ(result.metadata.schema_info[0].name, "b");
+      ASSERT_EQ(result.metadata.schema_info[0].children.size(), 1);
+      EXPECT_EQ(result.metadata.schema_info[0].children[0].name, "1");
+      ASSERT_EQ(result.metadata.schema_info[0].children[0].children.size(), 2);
+      EXPECT_EQ(result.metadata.schema_info[0].children[0].children[0].name, "offsets");
+      EXPECT_EQ(result.metadata.schema_info[0].children[0].children[1].name, "element");
+      EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRUCT);
+      EXPECT_EQ(result.tbl->get_column(0).child(0).type().id(), cudf::type_id::LIST);
+      EXPECT_EQ(result.tbl->get_column(0).child(0).child(0).type().id(), cudf::type_id::INT32);
+      EXPECT_EQ(result.tbl->get_column(0).child(0).child(1).type().id(), cudf::type_id::FLOAT32);
+    }
+    // multiple - all present
+    {
+      std::map<std::string, cudf::io::schema_element> dtype_schema{
+        {"a", {dtype<int32_t>()}},
+        {"c", {dtype<bool>()}},
+      };
+      in_options.set_dtypes(dtype_schema);
+      cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+      // Make sure we have columns "a", and "c"
+      ASSERT_EQ(result.tbl->num_columns(), 2);
+      ASSERT_EQ(result.metadata.schema_info.size(), 2);
+      EXPECT_EQ(result.metadata.schema_info[0].name, "a");
+      EXPECT_EQ(result.metadata.schema_info[1].name, "c");
+    }
+    // multiple - not all present
+    {
+      std::map<std::string, cudf::io::schema_element> dtype_schema{
+        {"a", {dtype<int32_t>()}},
+        {"d", {dtype<bool>()}},
+      };
+      in_options.set_dtypes(dtype_schema);
+      cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+      // Make sure we have column "a"
+      ASSERT_EQ(result.tbl->num_columns(), 1);
+      ASSERT_EQ(result.metadata.schema_info.size(), 1);
+      EXPECT_EQ(result.metadata.schema_info[0].name, "a");
+    }
+    // multiple - not all present nested
+    {
+      std::map<std::string, cudf::io::schema_element> dtype_schema{
+
+        {"b",
+         {data_type{cudf::type_id::STRUCT},
+          {
+            {"2", {data_type{cudf::type_id::STRING}}},
+          }}},
+        {"c", {dtype<bool>()}},
+      };
+      in_options.set_dtypes(dtype_schema);
+      cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+      // Make sure we have columns "b" (empty struct) and "c"
+      ASSERT_EQ(result.tbl->num_columns(), 2);
+      ASSERT_EQ(result.metadata.schema_info.size(), 2);
+      EXPECT_EQ(result.metadata.schema_info[0].name, "b");
+      ASSERT_EQ(result.metadata.schema_info[0].children.size(), 0);
+      EXPECT_EQ(result.metadata.schema_info[1].name, "c");
+    }
+  }
+}
+
 CUDF_TEST_PROGRAM_MAIN()
diff --git a/python/cudf/cudf/_lib/cpp/io/json.pxd b/python/cudf/cudf/_lib/cpp/io/json.pxd
index b916c2b7ad9..1e1057beede 100644
--- a/python/cudf/cudf/_lib/cpp/io/json.pxd
+++ b/python/cudf/cudf/_lib/cpp/io/json.pxd
@@ -28,6 +28,7 @@ cdef extern from "cudf/io/json.hpp" \
         size_type get_byte_range_size() except +
         bool is_enabled_lines() except +
         bool is_enabled_mixed_types_as_string() except +
+        bool is_enabled_prune_columns() except +
         bool is_enabled_dayfirst() except +
         bool is_enabled_experimental() except +
 
@@ -41,6 +42,7 @@ cdef extern from "cudf/io/json.hpp" \
         void set_byte_range_size(size_type size) except +
         void enable_lines(bool val) except +
         void enable_mixed_types_as_string(bool val) except +
+        void enable_prune_columns(bool val) except +
         void enable_dayfirst(bool val) except +
         void enable_experimental(bool val) except +
         void enable_keep_quotes(bool val) except +
@@ -79,6 +81,9 @@ cdef extern from "cudf/io/json.hpp" \
         json_reader_options_builder& mixed_types_as_string(
             bool val
         ) except +
+        json_reader_options_builder& prune_columns(
+            bool val
+        ) except +
         json_reader_options_builder& dayfirst(
             bool val
         ) except +
diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx
index f2e03391f08..cef71ed24a5 100644
--- a/python/cudf/cudf/_lib/json.pyx
+++ b/python/cudf/cudf/_lib/json.pyx
@@ -49,7 +49,8 @@ cpdef read_json(object filepaths_or_buffers,
                 object byte_range,
                 bool legacy,
                 bool keep_quotes,
-                bool mixed_types_as_string):
+                bool mixed_types_as_string,
+                bool prune_columns):
     """
     Cython function to call into libcudf API, see `read_json`.
 
@@ -128,6 +129,7 @@ cpdef read_json(object filepaths_or_buffers,
 
     opts.enable_keep_quotes(keep_quotes)
     opts.enable_mixed_types_as_string(mixed_types_as_string)
+    opts.enable_prune_columns(prune_columns)
     # Read JSON
     cdef cudf_io_types.table_with_metadata c_result
 
diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py
index 5ef25a99590..03d07fc3a50 100644
--- a/python/cudf/cudf/io/json.py
+++ b/python/cudf/cudf/io/json.py
@@ -26,6 +26,7 @@ def read_json(
     keep_quotes=False,
     storage_options=None,
     mixed_types_as_string=False,
+    prune_columns=False,
     *args,
     **kwargs,
 ):
@@ -101,6 +102,7 @@ def read_json(
             False,
             keep_quotes,
             mixed_types_as_string,
+            prune_columns,
         )
     else:
         warnings.warn(
diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
index 66e14f4b9de..6bd7558d322 100644
--- a/python/cudf/cudf/utils/ioutils.py
+++ b/python/cudf/cudf/utils/ioutils.py
@@ -692,7 +692,6 @@
 
        This parameter is only supported with ``engine='cudf'``.
 
-    This parameter is only supported in ``cudf`` engine.
     If `True`, any string values are read literally (and wrapped in an
     additional set of quotes).
     If `False` string values are parsed into Python strings.
@@ -703,7 +702,22 @@
     For other URLs (e.g. starting with "s3://", and "gcs://") the key-value
     pairs are forwarded to ``fsspec.open``. Please see ``fsspec`` and
     ``urllib`` for more details.
+mixed_types_as_string : bool, default False
 
+    .. admonition:: GPU-accelerated feature
+
+       This parameter is only supported with ``engine='cudf'``.
+
+    If True, mixed type columns are returned as string columns.
+    If `False` parsing mixed type columns will thrown an error.
+prune_columns : bool, default False
+
+    .. admonition:: GPU-accelerated feature
+
+       This parameter is only supported with ``engine='cudf'``.
+
+    If True, only return those columns mentioned in the dtype argument.
+    If `False` dtype argument is used a type inference suggestion.
 Returns
 -------
 result : Series or DataFrame, depending on the value of `typ`.

From 541b53a97eeb2c8bc14a834b517b6f7f81c76328 Mon Sep 17 00:00:00 2001
From: Ed Seidl <etseidl@users.noreply.github.com>
Date: Thu, 2 May 2024 13:23:28 -0700
Subject: [PATCH 137/842] Fix operator precedence problem in Parquet reader
 (#15638)

Fixes an operator precedence problem with a bitwise `&` that was not detected because it was accidentally correct. `PAGEINFO_FLAGS_DICTIONARY` has a value of '1', so `PAGEINFO_FLAGS_DICTIONARY != 0` evaluates to '1', and that ANDed with the page flags evaluates `true` when the bit is set.

Authors:
  - Ed Seidl (https://github.com/etseidl)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Vukasin Milovanovic (https://github.com/vuule)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/15638
---
 cpp/src/io/parquet/page_decode.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh
index 0c139fced24..4c811449c70 100644
--- a/cpp/src/io/parquet/page_decode.cuh
+++ b/cpp/src/io/parquet/page_decode.cuh
@@ -122,7 +122,7 @@ struct null_count_back_copier {
  */
 constexpr bool is_string_col(PageInfo const& page, device_span<ColumnChunkDesc const> chunks)
 {
-  if (page.flags & PAGEINFO_FLAGS_DICTIONARY != 0) { return false; }
+  if ((page.flags & PAGEINFO_FLAGS_DICTIONARY) != 0) { return false; }
   auto const& col = chunks[page.chunk_idx];
   return is_string_col(col);
 }

From e3ea5237d5e139ec93d5c4cb3d06fb38df6e562b Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Thu, 2 May 2024 15:58:03 -0500
Subject: [PATCH 138/842] Fix -Werror=type-limits. (#15635)

I'm compiling cuDF as a part of another application and ran into errors from `-Werror=type-limits`. There are a few comparisons between unsigned types like `value < 0`, which is never true. This PR removes those impossible code paths.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - MithunR (https://github.com/mythrocks)
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/15635
---
 cpp/src/io/comp/cpu_unbz2.cpp              | 6 +++---
 cpp/src/io/parquet/reader_impl_helpers.cpp | 2 --
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/cpp/src/io/comp/cpu_unbz2.cpp b/cpp/src/io/comp/cpu_unbz2.cpp
index a116335b254..44535cff589 100644
--- a/cpp/src/io/comp/cpu_unbz2.cpp
+++ b/cpp/src/io/comp/cpu_unbz2.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -221,7 +221,7 @@ int32_t bz2_decompress_block(unbz_state_s* s)
   if (getbits(s, 1)) return BZ_DATA_ERROR;  // blockRandomized not supported (old bzip versions)
 
   s->origPtr = getbits(s, 24);
-  if (s->origPtr < 0 || s->origPtr > 10 + 100000 * s->blockSize100k) return BZ_DATA_ERROR;
+  if (s->origPtr > 10 + 100000 * s->blockSize100k) return BZ_DATA_ERROR;
 
   // Receive the mapping table
   inUse16 = getbits(s, 16);
@@ -436,7 +436,7 @@ int32_t bz2_decompress_block(unbz_state_s* s)
   }
 
   // Now we know what nblock is, we can do a better sanity check on s->origPtr.
-  if (s->origPtr < 0 || s->origPtr >= nblock) return BZ_DATA_ERROR;
+  if (s->origPtr >= nblock) return BZ_DATA_ERROR;
 
   // compute the T^(-1) vector
   {
diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp
index 402ccef7a15..c7659be1adb 100644
--- a/cpp/src/io/parquet/reader_impl_helpers.cpp
+++ b/cpp/src/io/parquet/reader_impl_helpers.cpp
@@ -205,7 +205,6 @@ void metadata::sanitize_schema()
   // This code attempts to make this less messy for the code that follows.
 
   std::function<void(size_t)> process = [&](size_t schema_idx) -> void {
-    if (schema_idx < 0) { return; }
     auto& schema_elem = schema[schema_idx];
     if (schema_idx != 0 && schema_elem.type == UNDEFINED_TYPE) {
       auto const parent_type = schema[schema_elem.parent_idx].converted_type;
@@ -723,7 +722,6 @@ aggregate_reader_metadata::select_columns(std::optional<std::vector<std::string>
                        int schema_idx,
                        std::vector<cudf::io::detail::inline_column_buffer>& out_col_array,
                        bool has_list_parent) {
-      if (schema_idx < 0) { return false; }
       auto const& schema_elem = get_schema(schema_idx);
 
       // if schema_elem is a stub then it does not exist in the column_name_info and column_buffer

From 81f8cdfdfb326afaee8177e4f40a607393b21b99 Mon Sep 17 00:00:00 2001
From: Nghia Truong <7416935+ttnghia@users.noreply.github.com>
Date: Thu, 2 May 2024 16:23:18 -0700
Subject: [PATCH 139/842] Implement ORC chunked reader (#15094)

This implements ORC chunked reader, to support reading ORC such that:
 * The output is multiple tables instead of once, each of them is issue when calling to `read_chunk()`, and has limited size which stays within a given `output_limit` parameter.
 * The temporary device memory usage can be limited by a soft limit `data_read_limit` parameter, allowing to read very large ORC files without OOM.
 * ORC files containing many billions of rows can be properly read chunk-by-chunk without seeing the size overflow issue when the number of rows exceeds cudf size limit (`2^31` rows).

Depends on:
 * https://github.com/rapidsai/cudf/pull/14911
 * https://github.com/rapidsai/cudf/pull/15008
 * https://github.com/rapidsai/cudf/pull/15169
 * https://github.com/rapidsai/cudf/pull/15252

Partially contribute to https://github.com/rapidsai/cudf/issues/12228.

---

## Benchmarks

Due to some small optimizations in ORC reader, reading ORC files all-at-once (reading the entire file into just one output table) can be a little bit faster. For example, with the benchmark `orc_read_io_compression`:
```
## [0] Quadro RTX 6000

|      io       |  compression  |  cardinality  |  run_length  |   Ref Time |   Ref Noise |   Cmp Time |   Cmp Noise |          Diff |   %Diff |  Status  |
|---------------|---------------|---------------|--------------|------------|-------------|------------|-------------|---------------|---------|----------|
|   FILEPATH    |    SNAPPY     |       0       |      1       | 183.027 ms |       7.45% | 157.293 ms |       4.72% | -25733.837 us | -14.06% |   FAIL   |
|   FILEPATH    |    SNAPPY     |     1000      |      1       | 198.228 ms |       6.43% | 164.395 ms |       4.14% | -33833.020 us | -17.07% |   FAIL   |
|   FILEPATH    |    SNAPPY     |       0       |      32      |  96.676 ms |       6.19% |  82.522 ms |       1.36% | -14153.945 us | -14.64% |   FAIL   |
|   FILEPATH    |    SNAPPY     |     1000      |      32      |  94.508 ms |       4.80% |  81.078 ms |       0.48% | -13429.672 us | -14.21% |   FAIL   |
|   FILEPATH    |     NONE      |       0       |      1       | 161.868 ms |       5.40% | 139.849 ms |       2.44% | -22018.910 us | -13.60% |   FAIL   |
|   FILEPATH    |     NONE      |     1000      |      1       | 164.902 ms |       5.80% | 142.041 ms |       3.43% | -22861.258 us | -13.86% |   FAIL   |
|   FILEPATH    |     NONE      |       0       |      32      |  88.298 ms |       5.15% |  74.924 ms |       1.97% | -13374.607 us | -15.15% |   FAIL   |
|   FILEPATH    |     NONE      |     1000      |      32      |  87.147 ms |       5.61% |  72.502 ms |       0.50% | -14645.122 us | -16.81% |   FAIL   |
|  HOST_BUFFER  |    SNAPPY     |       0       |      1       | 124.990 ms |       0.39% | 111.670 ms |       2.13% | -13320.483 us | -10.66% |   FAIL   |
|  HOST_BUFFER  |    SNAPPY     |     1000      |      1       | 149.858 ms |       4.10% | 126.266 ms |       0.48% | -23591.543 us | -15.74% |   FAIL   |
|  HOST_BUFFER  |    SNAPPY     |       0       |      32      |  92.499 ms |       4.46% |  77.653 ms |       1.58% | -14846.471 us | -16.05% |   FAIL   |
|  HOST_BUFFER  |    SNAPPY     |     1000      |      32      |  93.373 ms |       4.14% |  80.033 ms |       3.19% | -13340.002 us | -14.29% |   FAIL   |
|  HOST_BUFFER  |     NONE      |       0       |      1       | 111.792 ms |       0.50% |  97.083 ms |       0.50% | -14709.530 us | -13.16% |   FAIL   |
|  HOST_BUFFER  |     NONE      |     1000      |      1       | 117.646 ms |       5.60% |  97.634 ms |       0.44% | -20012.301 us | -17.01% |   FAIL   |
|  HOST_BUFFER  |     NONE      |       0       |      32      |  84.983 ms |       4.96% |  66.975 ms |       0.50% | -18007.403 us | -21.19% |   FAIL   |
|  HOST_BUFFER  |     NONE      |     1000      |      32      |  82.648 ms |       4.42% |  65.510 ms |       0.91% | -17137.910 us | -20.74% |   FAIL   |
| DEVICE_BUFFER |    SNAPPY     |       0       |      1       |  65.538 ms |       4.02% |  59.399 ms |       2.54% |  -6138.560 us |  -9.37% |   FAIL   |
| DEVICE_BUFFER |    SNAPPY     |     1000      |      1       | 101.427 ms |       4.10% |  92.276 ms |       3.30% |  -9150.278 us |  -9.02% |   FAIL   |
| DEVICE_BUFFER |    SNAPPY     |       0       |      32      |  80.133 ms |       4.64% |  73.959 ms |       3.50% |  -6173.818 us |  -7.70% |   FAIL   |
| DEVICE_BUFFER |    SNAPPY     |     1000      |      32      |  86.232 ms |       4.71% |  77.446 ms |       3.32% |  -8786.606 us | -10.19% |   FAIL   |
| DEVICE_BUFFER |     NONE      |       0       |      1       |  52.189 ms |       6.62% |  45.018 ms |       4.11% |  -7171.043 us | -13.74% |   FAIL   |
| DEVICE_BUFFER |     NONE      |     1000      |      1       |  54.664 ms |       6.76% |  46.855 ms |       3.35% |  -7809.803 us | -14.29% |   FAIL   |
| DEVICE_BUFFER |     NONE      |       0       |      32      |  67.975 ms |       5.12% |  60.553 ms |       4.22% |  -7422.279 us | -10.92% |   FAIL   |
| DEVICE_BUFFER |     NONE      |     1000      |      32      |  68.485 ms |       4.86% |  62.253 ms |       6.23% |  -6232.340 us |  -9.10% |   FAIL   |

```


When memory is limited, chunked read can help avoiding OOM but with some sort of performance trade-off. For example, for reading a table of size 500MB from file using 64MB output limits and 640 MB data read limit:
```
|      io       |  compression  |  cardinality  |  run_length  |   Ref Time |   Ref Noise |   Cmp Time |   Cmp Noise |       Diff |   %Diff |  Status  |
|---------------|---------------|---------------|--------------|------------|-------------|------------|-------------|------------|---------|----------|
|   FILEPATH    |    SNAPPY     |       0       |      1       | 183.027 ms |       7.45% | 350.824 ms |       2.74% | 167.796 ms |  91.68% |   FAIL   |
|   FILEPATH    |    SNAPPY     |     1000      |      1       | 198.228 ms |       6.43% | 322.414 ms |       3.46% | 124.186 ms |  62.65% |   FAIL   |
|   FILEPATH    |    SNAPPY     |       0       |      32      |  96.676 ms |       6.19% | 133.363 ms |       4.78% |  36.686 ms |  37.95% |   FAIL   |
|   FILEPATH    |    SNAPPY     |     1000      |      32      |  94.508 ms |       4.80% | 128.897 ms |       0.37% |  34.389 ms |  36.39% |   FAIL   |
|   FILEPATH    |     NONE      |       0       |      1       | 161.868 ms |       5.40% | 316.637 ms |       4.21% | 154.769 ms |  95.61% |   FAIL   |
|   FILEPATH    |     NONE      |     1000      |      1       | 164.902 ms |       5.80% | 326.043 ms |       3.06% | 161.141 ms |  97.72% |   FAIL   |
|   FILEPATH    |     NONE      |       0       |      32      |  88.298 ms |       5.15% | 124.819 ms |       5.17% |  36.520 ms |  41.36% |   FAIL   |
|   FILEPATH    |     NONE      |     1000      |      32      |  87.147 ms |       5.61% | 123.047 ms |       5.82% |  35.900 ms |  41.19% |   FAIL   |
|  HOST_BUFFER  |    SNAPPY     |       0       |      1       | 124.990 ms |       0.39% | 285.718 ms |       0.78% | 160.728 ms | 128.59% |   FAIL   |
|  HOST_BUFFER  |    SNAPPY     |     1000      |      1       | 149.858 ms |       4.10% | 263.491 ms |       2.89% | 113.633 ms |  75.83% |   FAIL   |
|  HOST_BUFFER  |    SNAPPY     |       0       |      32      |  92.499 ms |       4.46% | 127.881 ms |       0.86% |  35.382 ms |  38.25% |   FAIL   |
|  HOST_BUFFER  |    SNAPPY     |     1000      |      32      |  93.373 ms |       4.14% | 128.022 ms |       0.98% |  34.650 ms |  37.11% |   FAIL   |
|  HOST_BUFFER  |     NONE      |       0       |      1       | 111.792 ms |       0.50% | 241.064 ms |       1.89% | 129.271 ms | 115.64% |   FAIL   |
|  HOST_BUFFER  |     NONE      |     1000      |      1       | 117.646 ms |       5.60% | 248.134 ms |       3.08% | 130.488 ms | 110.92% |   FAIL   |
|  HOST_BUFFER  |     NONE      |       0       |      32      |  84.983 ms |       4.96% | 118.049 ms |       5.99% |  33.066 ms |  38.91% |   FAIL   |
|  HOST_BUFFER  |     NONE      |     1000      |      32      |  82.648 ms |       4.42% | 114.577 ms |       2.34% |  31.929 ms |  38.63% |   FAIL   |
| DEVICE_BUFFER |    SNAPPY     |       0       |      1       |  65.538 ms |       4.02% | 232.466 ms |       3.28% | 166.928 ms | 254.71% |   FAIL   |
| DEVICE_BUFFER |    SNAPPY     |     1000      |      1       | 101.427 ms |       4.10% | 221.578 ms |       1.43% | 120.152 ms | 118.46% |   FAIL   |
| DEVICE_BUFFER |    SNAPPY     |       0       |      32      |  80.133 ms |       4.64% | 120.604 ms |       0.35% |  40.471 ms |  50.50% |   FAIL   |
| DEVICE_BUFFER |    SNAPPY     |     1000      |      32      |  86.232 ms |       4.71% | 125.521 ms |       3.93% |  39.289 ms |  45.56% |   FAIL   |
| DEVICE_BUFFER |     NONE      |       0       |      1       |  52.189 ms |       6.62% | 182.943 ms |       0.29% | 130.754 ms | 250.54% |   FAIL   |
| DEVICE_BUFFER |     NONE      |     1000      |      1       |  54.664 ms |       6.76% | 190.501 ms |       0.49% | 135.836 ms | 248.49% |   FAIL   |
| DEVICE_BUFFER |     NONE      |       0       |      32      |  67.975 ms |       5.12% | 107.172 ms |       3.56% |  39.197 ms |  57.66% |   FAIL   |
| DEVICE_BUFFER |     NONE      |     1000      |      32      |  68.485 ms |       4.86% | 108.097 ms |       2.92% |  39.611 ms |  57.84% |   FAIL   |

```
And if memory is too limited, chunked read with 8MB output limit/80MB data read limit:
```
|      io       |  compression  |  cardinality  |  run_length  |   Ref Time |   Ref Noise |   Cmp Time |   Cmp Noise |       Diff |   %Diff |  Status  |
|      io       |  compression  |  cardinality  |  run_length  |   Ref Time |   Ref Noise |   Cmp Time |   Cmp Noise |       Diff |   %Diff |  Status  |
|---------------|---------------|---------------|--------------|------------|-------------|------------|-------------|------------|---------|----------|
|   FILEPATH    |    SNAPPY     |       0       |      1       | 183.027 ms |       7.45% | 732.926 ms |       1.98% | 549.899 ms | 300.45% |   FAIL   |
|   FILEPATH    |    SNAPPY     |     1000      |      1       | 198.228 ms |       6.43% | 834.309 ms |       4.21% | 636.081 ms | 320.88% |   FAIL   |
|   FILEPATH    |    SNAPPY     |       0       |      32      |  96.676 ms |       6.19% | 363.033 ms |       1.66% | 266.356 ms | 275.51% |   FAIL   |
|   FILEPATH    |    SNAPPY     |     1000      |      32      |  94.508 ms |       4.80% | 313.813 ms |       1.28% | 219.305 ms | 232.05% |   FAIL   |
|   FILEPATH    |     NONE      |       0       |      1       | 161.868 ms |       5.40% | 607.700 ms |       2.90% | 445.832 ms | 275.43% |   FAIL   |
|   FILEPATH    |     NONE      |     1000      |      1       | 164.902 ms |       5.80% | 616.101 ms |       3.46% | 451.199 ms | 273.62% |   FAIL   |
|   FILEPATH    |     NONE      |       0       |      32      |  88.298 ms |       5.15% | 267.703 ms |       0.46% | 179.405 ms | 203.18% |   FAIL   |
|   FILEPATH    |     NONE      |     1000      |      32      |  87.147 ms |       5.61% | 250.528 ms |       0.43% | 163.381 ms | 187.48% |   FAIL   |
|  HOST_BUFFER  |    SNAPPY     |       0       |      1       | 124.990 ms |       0.39% | 636.270 ms |       0.44% | 511.280 ms | 409.06% |   FAIL   |
|  HOST_BUFFER  |    SNAPPY     |     1000      |      1       | 149.858 ms |       4.10% | 747.264 ms |       0.50% | 597.406 ms | 398.65% |   FAIL   |
|  HOST_BUFFER  |    SNAPPY     |       0       |      32      |  92.499 ms |       4.46% | 359.660 ms |       0.19% | 267.161 ms | 288.82% |   FAIL   |
|  HOST_BUFFER  |    SNAPPY     |     1000      |      32      |  93.373 ms |       4.14% | 311.608 ms |       0.43% | 218.235 ms | 233.73% |   FAIL   |
|  HOST_BUFFER  |     NONE      |       0       |      1       | 111.792 ms |       0.50% | 493.797 ms |       0.13% | 382.005 ms | 341.71% |   FAIL   |
|  HOST_BUFFER  |     NONE      |     1000      |      1       | 117.646 ms |       5.60% | 516.706 ms |       0.12% | 399.060 ms | 339.20% |   FAIL   |
|  HOST_BUFFER  |     NONE      |       0       |      32      |  84.983 ms |       4.96% | 258.477 ms |       0.46% | 173.495 ms | 204.15% |   FAIL   |
|  HOST_BUFFER  |     NONE      |     1000      |      32      |  82.648 ms |       4.42% | 248.028 ms |       5.30% | 165.380 ms | 200.10% |   FAIL   |
| DEVICE_BUFFER |    SNAPPY     |       0       |      1       |  65.538 ms |       4.02% | 606.010 ms |       3.76% | 540.472 ms | 824.68% |   FAIL   |
| DEVICE_BUFFER |    SNAPPY     |     1000      |      1       | 101.427 ms |       4.10% | 742.774 ms |       4.64% | 641.347 ms | 632.33% |   FAIL   |
| DEVICE_BUFFER |    SNAPPY     |       0       |      32      |  80.133 ms |       4.64% | 364.701 ms |       2.70% | 284.568 ms | 355.12% |   FAIL   |
| DEVICE_BUFFER |    SNAPPY     |     1000      |      32      |  86.232 ms |       4.71% | 320.387 ms |       2.80% | 234.155 ms | 271.54% |   FAIL   |
| DEVICE_BUFFER |     NONE      |       0       |      1       |  52.189 ms |       6.62% | 458.100 ms |       2.15% | 405.912 ms | 777.78% |   FAIL   |
| DEVICE_BUFFER |     NONE      |     1000      |      1       |  54.664 ms |       6.76% | 478.527 ms |       1.41% | 423.862 ms | 775.39% |   FAIL   |
| DEVICE_BUFFER |     NONE      |       0       |      32      |  67.975 ms |       5.12% | 260.009 ms |       3.71% | 192.034 ms | 282.51% |   FAIL   |
| DEVICE_BUFFER |     NONE      |     1000      |      32      |  68.485 ms |       4.86% | 243.705 ms |       2.09% | 175.220 ms | 255.85% |   FAIL   |

```

Authors:
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - https://github.com/nvdbaranec
  - Vukasin Milovanovic (https://github.com/vuule)

URL: https://github.com/rapidsai/cudf/pull/15094
---
 cpp/CMakeLists.txt                            |    3 +-
 cpp/benchmarks/io/orc/orc_reader_input.cpp    |  106 +-
 cpp/include/cudf/io/detail/orc.hpp            |   64 +-
 cpp/include/cudf/io/orc.hpp                   |  160 +-
 cpp/src/io/functions.cpp                      |   60 +-
 cpp/src/io/orc/aggregate_orc_metadata.cpp     |   71 +-
 cpp/src/io/orc/aggregate_orc_metadata.hpp     |   22 +-
 cpp/src/io/orc/orc.hpp                        |    8 +-
 cpp/src/io/orc/reader_impl.cu                 |  255 ++-
 cpp/src/io/orc/reader_impl.hpp                |  164 +-
 cpp/src/io/orc/reader_impl_chunking.cu        |  723 ++++++++
 cpp/src/io/orc/reader_impl_chunking.hpp       |  290 +++-
 ...pl_preprocess.cu => reader_impl_decode.cu} |  851 +++++-----
 cpp/src/io/orc/reader_impl_helpers.hpp        |    4 +-
 cpp/src/io/parquet/reader_impl_helpers.cpp    |    5 +-
 cpp/src/io/utilities/row_selection.cpp        |   21 +-
 cpp/src/io/utilities/row_selection.hpp        |    5 +-
 cpp/tests/CMakeLists.txt                      |    2 +-
 cpp/tests/io/orc_chunked_reader_test.cu       | 1477 +++++++++++++++++
 cpp/tests/io/row_selection_test.cpp           |   13 -
 python/cudf/cudf/_lib/cpp/io/orc.pxd          |   16 +-
 python/cudf/cudf/_lib/orc.pyx                 |    6 +-
 22 files changed, 3685 insertions(+), 641 deletions(-)
 create mode 100644 cpp/src/io/orc/reader_impl_chunking.cu
 rename cpp/src/io/orc/{reader_impl_preprocess.cu => reader_impl_decode.cu} (56%)
 create mode 100644 cpp/tests/io/orc_chunked_reader_test.cu

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 53da710f0ea..232a4f40d8e 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -395,8 +395,9 @@ add_library(
   src/io/orc/dict_enc.cu
   src/io/orc/orc.cpp
   src/io/orc/reader_impl.cu
+  src/io/orc/reader_impl_chunking.cu
+  src/io/orc/reader_impl_decode.cu
   src/io/orc/reader_impl_helpers.cpp
-  src/io/orc/reader_impl_preprocess.cu
   src/io/orc/stats_enc.cu
   src/io/orc/stripe_data.cu
   src/io/orc/stripe_enc.cu
diff --git a/cpp/benchmarks/io/orc/orc_reader_input.cpp b/cpp/benchmarks/io/orc/orc_reader_input.cpp
index fdb7dbe59b8..b7c214a8374 100644
--- a/cpp/benchmarks/io/orc/orc_reader_input.cpp
+++ b/cpp/benchmarks/io/orc/orc_reader_input.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,31 +24,59 @@
 
 #include <nvbench/nvbench.cuh>
 
+namespace {
+
 // Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to
 // run on most GPUs, but large enough to allow highest throughput
-constexpr int64_t data_size        = 512 << 20;
 constexpr cudf::size_type num_cols = 64;
+constexpr std::size_t data_size    = 512 << 20;
+constexpr std::size_t Mbytes       = 1024 * 1024;
 
+template <bool is_chunked_read>
 void orc_read_common(cudf::size_type num_rows_to_read,
                      cuio_source_sink_pair& source_sink,
                      nvbench::state& state)
 {
-  cudf::io::orc_reader_options read_opts =
-    cudf::io::orc_reader_options::builder(source_sink.make_source_info());
+  auto const read_opts =
+    cudf::io::orc_reader_options::builder(source_sink.make_source_info()).build();
 
   auto mem_stats_logger = cudf::memory_stats_logger();  // init stats logger
   state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
-  state.exec(
-    nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) {
-      try_drop_l3_cache();
-
-      timer.start();
-      auto const result = cudf::io::read_orc(read_opts);
-      timer.stop();
 
-      CUDF_EXPECTS(result.tbl->num_columns() == num_cols, "Unexpected number of columns");
-      CUDF_EXPECTS(result.tbl->num_rows() == num_rows_to_read, "Unexpected number of rows");
-    });
+  if constexpr (is_chunked_read) {
+    state.exec(
+      nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch&, auto& timer) {
+        try_drop_l3_cache();
+        auto const output_limit_MB =
+          static_cast<std::size_t>(state.get_int64("chunk_read_limit_MB"));
+        auto const read_limit_MB = static_cast<std::size_t>(state.get_int64("pass_read_limit_MB"));
+
+        auto reader =
+          cudf::io::chunked_orc_reader(output_limit_MB * Mbytes, read_limit_MB * Mbytes, read_opts);
+        cudf::size_type num_rows{0};
+
+        timer.start();
+        do {
+          auto chunk = reader.read_chunk();
+          num_rows += chunk.tbl->num_rows();
+        } while (reader.has_next());
+        timer.stop();
+
+        CUDF_EXPECTS(num_rows == num_rows_to_read, "Unexpected number of rows");
+      });
+  } else {  // not is_chunked_read
+    state.exec(
+      nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch&, auto& timer) {
+        try_drop_l3_cache();
+
+        timer.start();
+        auto const result = cudf::io::read_orc(read_opts);
+        timer.stop();
+
+        CUDF_EXPECTS(result.tbl->num_columns() == num_cols, "Unexpected number of columns");
+        CUDF_EXPECTS(result.tbl->num_rows() == num_rows_to_read, "Unexpected number of rows");
+      });
+  }
 
   auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
   state.add_element_count(static_cast<double>(data_size) / time, "bytes_per_second");
@@ -57,6 +85,8 @@ void orc_read_common(cudf::size_type num_rows_to_read,
   state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size");
 }
 
+}  // namespace
+
 template <data_type DataType, cudf::io::io_type IOType>
 void BM_orc_read_data(nvbench::state& state,
                       nvbench::type_list<nvbench::enum_type<DataType>, nvbench::enum_type<IOType>>)
@@ -79,13 +109,11 @@ void BM_orc_read_data(nvbench::state& state,
     return view.num_rows();
   }();
 
-  orc_read_common(num_rows_written, source_sink, state);
+  orc_read_common<false>(num_rows_written, source_sink, state);
 }
 
-template <cudf::io::io_type IOType, cudf::io::compression_type Compression>
-void BM_orc_read_io_compression(
-  nvbench::state& state,
-  nvbench::type_list<nvbench::enum_type<IOType>, nvbench::enum_type<Compression>>)
+template <cudf::io::io_type IOType, cudf::io::compression_type Compression, bool chunked_read>
+void orc_read_io_compression(nvbench::state& state)
 {
   auto const d_type = get_type_or_group({static_cast<int32_t>(data_type::INTEGRAL_SIGNED),
                                          static_cast<int32_t>(data_type::FLOAT),
@@ -95,15 +123,21 @@ void BM_orc_read_io_compression(
                                          static_cast<int32_t>(data_type::LIST),
                                          static_cast<int32_t>(data_type::STRUCT)});
 
-  cudf::size_type const cardinality = state.get_int64("cardinality");
-  cudf::size_type const run_length  = state.get_int64("run_length");
+  auto const [cardinality, run_length] = [&]() -> std::pair<cudf::size_type, cudf::size_type> {
+    if constexpr (chunked_read) {
+      return {0, 4};
+    } else {
+      return {static_cast<cudf::size_type>(state.get_int64("cardinality")),
+              static_cast<cudf::size_type>(state.get_int64("run_length"))};
+    }
+  }();
   cuio_source_sink_pair source_sink(IOType);
 
   auto const num_rows_written = [&]() {
     auto const tbl = create_random_table(
       cycle_dtypes(d_type, num_cols),
       table_size_bytes{data_size},
-      data_profile_builder().cardinality(cardinality).avg_run_length(run_length));
+      data_profile_builder{}.cardinality(cardinality).avg_run_length(run_length));
     auto const view = tbl->view();
 
     cudf::io::orc_writer_options opts =
@@ -113,7 +147,23 @@ void BM_orc_read_io_compression(
     return view.num_rows();
   }();
 
-  orc_read_common(num_rows_written, source_sink, state);
+  orc_read_common<chunked_read>(num_rows_written, source_sink, state);
+}
+
+template <cudf::io::io_type IOType, cudf::io::compression_type Compression>
+void BM_orc_read_io_compression(
+  nvbench::state& state,
+  nvbench::type_list<nvbench::enum_type<IOType>, nvbench::enum_type<Compression>>)
+{
+  return orc_read_io_compression<IOType, Compression, false>(state);
+}
+
+template <cudf::io::compression_type Compression>
+void BM_orc_chunked_read_io_compression(nvbench::state& state,
+                                        nvbench::type_list<nvbench::enum_type<Compression>>)
+{
+  // Only run benchmark using HOST_BUFFER IO.
+  return orc_read_io_compression<cudf::io::io_type::HOST_BUFFER, Compression, true>(state);
 }
 
 using d_type_list = nvbench::enum_type_list<data_type::INTEGRAL_SIGNED,
@@ -146,3 +196,13 @@ NVBENCH_BENCH_TYPES(BM_orc_read_io_compression, NVBENCH_TYPE_AXES(io_list, compr
   .set_min_samples(4)
   .add_int64_axis("cardinality", {0, 1000})
   .add_int64_axis("run_length", {1, 32});
+
+// Should have the same parameters as `BM_orc_read_io_compression` for comparison.
+NVBENCH_BENCH_TYPES(BM_orc_chunked_read_io_compression, NVBENCH_TYPE_AXES(compression_list))
+  .set_name("orc_chunked_read_io_compression")
+  .set_type_axes_names({"compression"})
+  .set_min_samples(4)
+  // The input has approximately 520MB and 127K rows.
+  // The limits below are given in MBs.
+  .add_int64_axis("chunk_read_limit_MB", {50, 250, 700})
+  .add_int64_axis("pass_read_limit_MB", {50, 250, 700});
diff --git a/cpp/include/cudf/io/detail/orc.hpp b/cpp/include/cudf/io/detail/orc.hpp
index 9aeb9ae4267..597ddd9cf0a 100644
--- a/cpp/include/cudf/io/detail/orc.hpp
+++ b/cpp/include/cudf/io/detail/orc.hpp
@@ -38,13 +38,15 @@ class chunked_orc_writer_options;
 
 namespace orc::detail {
 
+// Forward declaration of the internal reader class
+class reader_impl;
+
 /**
  * @brief Class to read ORC dataset data into columns.
  */
 class reader {
  private:
-  class impl;
-  std::unique_ptr<impl> _impl;
+  std::unique_ptr<reader_impl> _impl;
 
  public:
   /**
@@ -68,10 +70,63 @@ class reader {
   /**
    * @brief Reads the entire dataset.
    *
-   * @param options Settings for controlling reading behavior
    * @return The set of columns along with table metadata
    */
-  table_with_metadata read(orc_reader_options const& options);
+  table_with_metadata read();
+};
+
+/**
+ * @brief The reader class that supports iterative reading from an array of data sources.
+ */
+class chunked_reader {
+ private:
+  std::unique_ptr<reader_impl> _impl;
+
+ public:
+  /**
+   * @copydoc cudf::io::chunked_orc_reader::chunked_orc_reader(std::size_t, std::size_t, size_type,
+   * orc_reader_options const&, rmm::cuda_stream_view, rmm::device_async_resource_ref)
+   *
+   * @param sources Input `datasource` objects to read the dataset from
+   */
+  explicit chunked_reader(std::size_t chunk_read_limit,
+                          std::size_t pass_read_limit,
+                          size_type output_row_granularity,
+                          std::vector<std::unique_ptr<cudf::io::datasource>>&& sources,
+                          orc_reader_options const& options,
+                          rmm::cuda_stream_view stream,
+                          rmm::device_async_resource_ref mr);
+  /**
+   * @copydoc cudf::io::chunked_orc_reader::chunked_orc_reader(std::size_t, std::size_t,
+   * orc_reader_options const&, rmm::cuda_stream_view, rmm::device_async_resource_ref)
+   *
+   * @param sources Input `datasource` objects to read the dataset from
+   */
+  explicit chunked_reader(std::size_t chunk_read_limit,
+                          std::size_t pass_read_limit,
+                          std::vector<std::unique_ptr<cudf::io::datasource>>&& sources,
+                          orc_reader_options const& options,
+                          rmm::cuda_stream_view stream,
+                          rmm::device_async_resource_ref mr);
+
+  /**
+   * @brief Destructor explicitly-declared to avoid inlined in header.
+   *
+   * Since the declaration of the internal `_impl` object does not exist in this header, this
+   * destructor needs to be defined in a separate source file which can access to that object's
+   * declaration.
+   */
+  ~chunked_reader();
+
+  /**
+   * @copydoc cudf::io::chunked_orc_reader::has_next
+   */
+  [[nodiscard]] bool has_next() const;
+
+  /**
+   * @copydoc cudf::io::chunked_orc_reader::read_chunk
+   */
+  [[nodiscard]] table_with_metadata read_chunk() const;
 };
 
 /**
@@ -126,5 +181,6 @@ class writer {
    */
   void close();
 };
+
 }  // namespace orc::detail
 }  // namespace cudf::io
diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp
index bceb258cb38..8140f8897b7 100644
--- a/cpp/include/cudf/io/orc.hpp
+++ b/cpp/include/cudf/io/orc.hpp
@@ -58,10 +58,10 @@ class orc_reader_options {
 
   // List of individual stripes to read (ignored if empty)
   std::vector<std::vector<size_type>> _stripes;
-  // Rows to skip from the start; ORC stores the number of rows as uint64_t
-  uint64_t _skip_rows = 0;
+  // Rows to skip from the start
+  int64_t _skip_rows = 0;
   // Rows to read; `nullopt` is all
-  std::optional<size_type> _num_rows;
+  std::optional<int64_t> _num_rows;
 
   // Whether to use row index to speed-up reading
   bool _use_index = true;
@@ -125,7 +125,7 @@ class orc_reader_options {
    *
    * @return Number of rows to skip from the start
    */
-  uint64_t get_skip_rows() const { return _skip_rows; }
+  int64_t get_skip_rows() const { return _skip_rows; }
 
   /**
    * @brief Returns number of row to read.
@@ -133,7 +133,7 @@ class orc_reader_options {
    * @return Number of rows to read; `nullopt` if the option hasn't been set (in which case the file
    * is read until the end)
    */
-  std::optional<size_type> const& get_num_rows() const { return _num_rows; }
+  std::optional<int64_t> const& get_num_rows() const { return _num_rows; }
 
   /**
    * @brief Whether to use row index to speed-up reading.
@@ -198,10 +198,10 @@ class orc_reader_options {
    * @throw cudf::logic_error if a negative value is passed
    * @throw cudf::logic_error if stripes have been previously set
    */
-  void set_skip_rows(uint64_t rows)
+  void set_skip_rows(int64_t rows)
   {
+    CUDF_EXPECTS(rows >= 0, "skip_rows cannot be negative");
     CUDF_EXPECTS(rows == 0 or _stripes.empty(), "Can't set both skip_rows along with stripes");
-    CUDF_EXPECTS(rows <= std::numeric_limits<int64_t>::max(), "skip_rows is too large");
     _skip_rows = rows;
   }
 
@@ -213,7 +213,7 @@ class orc_reader_options {
    * @throw cudf::logic_error if a negative value is passed
    * @throw cudf::logic_error if stripes have been previously set
    */
-  void set_num_rows(size_type nrows)
+  void set_num_rows(int64_t nrows)
   {
     CUDF_EXPECTS(nrows >= 0, "num_rows cannot be negative");
     CUDF_EXPECTS(_stripes.empty(), "Can't set both num_rows and stripes");
@@ -271,7 +271,7 @@ class orc_reader_options_builder {
    *
    * @param src The source information used to read orc file
    */
-  explicit orc_reader_options_builder(source_info src) : options{std::move(src)} {};
+  explicit orc_reader_options_builder(source_info src) : options{std::move(src)} {}
 
   /**
    * @brief Sets names of the column to read.
@@ -303,7 +303,7 @@ class orc_reader_options_builder {
    * @param rows Number of rows
    * @return this for chaining
    */
-  orc_reader_options_builder& skip_rows(uint64_t rows)
+  orc_reader_options_builder& skip_rows(int64_t rows)
   {
     options.set_skip_rows(rows);
     return *this;
@@ -315,7 +315,7 @@ class orc_reader_options_builder {
    * @param nrows Number of rows
    * @return this for chaining
    */
-  orc_reader_options_builder& num_rows(size_type nrows)
+  orc_reader_options_builder& num_rows(int64_t nrows)
   {
     options.set_num_rows(nrows);
     return *this;
@@ -406,6 +406,144 @@ table_with_metadata read_orc(
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @brief The chunked orc reader class to read an ORC file iteratively into a series of
+ * tables, chunk by chunk.
+ *
+ * This class is designed to address the reading issue when reading very large ORC files such
+ * that sizes of their columns exceed the limit that can be stored in cudf columns. By reading the
+ * file content by chunks using this class, each chunk is guaranteed to have its size stay within
+ * the given limit.
+ */
+class chunked_orc_reader {
+ public:
+  /**
+   * @brief Default constructor, this should never be used.
+   *
+   * This is added just to satisfy cython.
+   */
+  chunked_orc_reader() = default;
+
+  /**
+   * @brief Construct the reader from input/output size limits, output row granularity, along with
+   * other ORC reader options.
+   *
+   * The typical usage should be similar to this:
+   * ```
+   *  do {
+   *    auto const chunk = reader.read_chunk();
+   *    // Process chunk
+   *  } while (reader.has_next());
+   *
+   * ```
+   *
+   * If `chunk_read_limit == 0` (i.e., no output limit) and `pass_read_limit == 0` (no temporary
+   * memory size limit), a call to `read_chunk()` will read the whole data source and return a table
+   * containing all rows.
+   *
+   * The `chunk_read_limit` parameter controls the size of the output table to be returned per
+   * `read_chunk()` call. If the user specifies a 100 MB limit, the reader will attempt to return
+   * tables that have a total bytes size (over all columns) of 100 MB or less.
+   * This is a soft limit and the code will not fail if it cannot satisfy the limit.
+   *
+   * The `pass_read_limit` parameter controls how much temporary memory is used in the entire
+   * process of loading, decompressing and decoding of data. Again, this is also a soft limit and
+   * the reader will try to make the best effort.
+   *
+   * Finally, the parameter `output_row_granularity` controls the changes in row number of the
+   * output chunk. For each call to `read_chunk()`, with respect to the given `pass_read_limit`, a
+   * subset of stripes may be loaded, decompressed and decoded into an intermediate table. The
+   * reader will then subdivide that table into smaller tables for final output using
+   * `output_row_granularity` as the subdivision step.
+   *
+   * @param chunk_read_limit Limit on total number of bytes to be returned per `read_chunk()` call,
+   *        or `0` if there is no limit
+   * @param pass_read_limit Limit on temporary memory usage for reading the data sources,
+   *        or `0` if there is no limit
+   * @param output_row_granularity The granularity parameter used for subdividing the decoded
+   *        table for final output
+   * @param options Settings for controlling reading behaviors
+   * @param stream CUDA stream used for device memory operations and kernel launches
+   * @param mr Device memory resource to use for device memory allocation
+   *
+   * @throw cudf::logic_error if `output_row_granularity` is non-positive
+   */
+  explicit chunked_orc_reader(
+    std::size_t chunk_read_limit,
+    std::size_t pass_read_limit,
+    size_type output_row_granularity,
+    orc_reader_options const& options,
+    rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+    rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+
+  /**
+   * @brief Construct the reader from input/output size limits along with other ORC reader options.
+   *
+   * This constructor implicitly call the other constructor with `output_row_granularity` set to
+   * `DEFAULT_OUTPUT_ROW_GRANULARITY` rows.
+   *
+   * @param chunk_read_limit Limit on total number of bytes to be returned per `read_chunk()` call,
+   *        or `0` if there is no limit
+   * @param pass_read_limit Limit on temporary memory usage for reading the data sources,
+   *        or `0` if there is no limit
+   * @param options Settings for controlling reading behaviors
+   * @param stream CUDA stream used for device memory operations and kernel launches
+   * @param mr Device memory resource to use for device memory allocation
+   */
+  explicit chunked_orc_reader(
+    std::size_t chunk_read_limit,
+    std::size_t pass_read_limit,
+    orc_reader_options const& options,
+    rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+    rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+
+  /**
+   * @brief Construct the reader from output size limits along with other ORC reader options.
+   *
+   * This constructor implicitly call the other constructor with `pass_read_limit` set to `0` and
+   * `output_row_granularity` set to `DEFAULT_OUTPUT_ROW_GRANULARITY` rows.
+   *
+   * @param chunk_read_limit Limit on total number of bytes to be returned per `read_chunk()` call,
+   *        or `0` if there is no limit
+   * @param options Settings for controlling reading behaviors
+   * @param stream CUDA stream used for device memory operations and kernel launches
+   * @param mr Device memory resource to use for device memory allocation
+   */
+  explicit chunked_orc_reader(
+    std::size_t chunk_read_limit,
+    orc_reader_options const& options,
+    rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+    rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+
+  /**
+   * @brief Destructor, destroying the internal reader instance.
+   */
+  ~chunked_orc_reader();
+
+  /**
+   * @brief Check if there is any data in the given data sources has not yet read.
+   *
+   * @return A boolean value indicating if there is any data left to read
+   */
+  [[nodiscard]] bool has_next() const;
+
+  /**
+   * @brief Read a chunk of rows in the given data sources.
+   *
+   * The sequence of returned tables, if concatenated by their order, guarantees to form a complete
+   * dataset as reading the entire given data sources at once.
+   *
+   * An empty table will be returned if the given sources are empty, or all the data has
+   * been read and returned by the previous calls.
+   *
+   * @return An output `cudf::table` along with its metadata
+   */
+  [[nodiscard]] table_with_metadata read_chunk() const;
+
+ private:
+  std::unique_ptr<cudf::io::orc::detail::chunked_reader> reader;
+};
+
 /** @} */  // end of group
 /**
  * @addtogroup io_writers
diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp
index 12059dffa4e..98b010109ec 100644
--- a/cpp/src/io/functions.cpp
+++ b/cpp/src/io/functions.cpp
@@ -420,7 +420,7 @@ table_with_metadata read_orc(orc_reader_options const& options,
 
   auto datasources = make_datasources(options.get_source());
   auto reader = std::make_unique<orc::detail::reader>(std::move(datasources), options, stream, mr);
-  return reader->read(options);
+  return reader->read();
 }
 
 /**
@@ -440,6 +440,64 @@ void write_orc(orc_writer_options const& options, rmm::cuda_stream_view stream)
   writer->write(options.get_table());
 }
 
+chunked_orc_reader::chunked_orc_reader(std::size_t chunk_read_limit,
+                                       std::size_t pass_read_limit,
+                                       size_type output_row_granularity,
+                                       orc_reader_options const& options,
+                                       rmm::cuda_stream_view stream,
+                                       rmm::device_async_resource_ref mr)
+  : reader{std::make_unique<orc::detail::chunked_reader>(chunk_read_limit,
+                                                         pass_read_limit,
+                                                         output_row_granularity,
+                                                         make_datasources(options.get_source()),
+                                                         options,
+                                                         stream,
+                                                         mr)}
+{
+}
+
+chunked_orc_reader::chunked_orc_reader(std::size_t chunk_read_limit,
+                                       std::size_t pass_read_limit,
+                                       orc_reader_options const& options,
+                                       rmm::cuda_stream_view stream,
+                                       rmm::device_async_resource_ref mr)
+  : reader{std::make_unique<orc::detail::chunked_reader>(chunk_read_limit,
+                                                         pass_read_limit,
+                                                         make_datasources(options.get_source()),
+                                                         options,
+                                                         stream,
+                                                         mr)}
+{
+}
+
+chunked_orc_reader::chunked_orc_reader(std::size_t chunk_read_limit,
+                                       orc_reader_options const& options,
+                                       rmm::cuda_stream_view stream,
+                                       rmm::device_async_resource_ref mr)
+  : chunked_orc_reader(chunk_read_limit, 0UL, options, stream, mr)
+{
+}
+
+// This destructor destroys the internal reader instance.
+// Since the declaration of the internal `reader` object does not exist in the header, this
+// destructor needs to be defined in a separate source file which can access to that object's
+// declaration.
+chunked_orc_reader::~chunked_orc_reader() = default;
+
+bool chunked_orc_reader::has_next() const
+{
+  CUDF_FUNC_RANGE();
+  CUDF_EXPECTS(reader != nullptr, "Reader has not been constructed properly.");
+  return reader->has_next();
+}
+
+table_with_metadata chunked_orc_reader::read_chunk() const
+{
+  CUDF_FUNC_RANGE();
+  CUDF_EXPECTS(reader != nullptr, "Reader has not been constructed properly.");
+  return reader->read_chunk();
+}
+
 /**
  * @copydoc cudf::io::orc_chunked_writer::orc_chunked_writer
  */
diff --git a/cpp/src/io/orc/aggregate_orc_metadata.cpp b/cpp/src/io/orc/aggregate_orc_metadata.cpp
index d54524f0f0d..94a4d146b35 100644
--- a/cpp/src/io/orc/aggregate_orc_metadata.cpp
+++ b/cpp/src/io/orc/aggregate_orc_metadata.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "aggregate_orc_metadata.hpp"
+#include "io/orc/aggregate_orc_metadata.hpp"
 
 #include "io/utilities/row_selection.hpp"
 
@@ -152,22 +152,28 @@ aggregate_orc_metadata::aggregate_orc_metadata(
   }
 }
 
-std::tuple<int64_t, size_type, std::vector<metadata::stripe_source_mapping>>
+std::tuple<int64_t, int64_t, std::vector<metadata::orc_stripe_info>>
 aggregate_orc_metadata::select_stripes(
   std::vector<std::vector<size_type>> const& user_specified_stripes,
   int64_t skip_rows,
-  std::optional<size_type> const& num_rows,
+  std::optional<size_type> const& num_read_rows,
   rmm::cuda_stream_view stream)
 {
-  CUDF_EXPECTS((skip_rows == 0 and not num_rows.has_value()) or user_specified_stripes.empty(),
+  CUDF_EXPECTS((skip_rows == 0 and not num_read_rows.has_value()) or user_specified_stripes.empty(),
                "Can't use both the row selection and the stripe selection");
 
   auto [rows_to_skip, rows_to_read] = [&]() {
-    if (not user_specified_stripes.empty()) { return std::pair<int64_t, size_type>{0, 0}; }
-    return cudf::io::detail::skip_rows_num_rows_from_options(skip_rows, num_rows, get_num_rows());
+    if (not user_specified_stripes.empty()) { return std::pair<int64_t, int64_t>{0, 0}; }
+    return cudf::io::detail::skip_rows_num_rows_from_options(
+      skip_rows, num_read_rows, get_num_rows());
   }();
 
-  std::vector<metadata::stripe_source_mapping> selected_stripes_mapping;
+  struct stripe_source_mapping {
+    int source_idx;
+    std::vector<metadata::orc_stripe_info> stripe_info;
+  };
+
+  std::vector<stripe_source_mapping> selected_stripes_mapping;
 
   if (!user_specified_stripes.empty()) {
     CUDF_EXPECTS(user_specified_stripes.size() == per_file_metadata.size(),
@@ -176,7 +182,8 @@ aggregate_orc_metadata::select_stripes(
     // Each vector entry represents a source file; each nested vector represents the
     // user_defined_stripes to get from that source file
     for (size_t src_file_idx = 0; src_file_idx < user_specified_stripes.size(); ++src_file_idx) {
-      std::vector<OrcStripeInfo> stripe_infos;
+      std::vector<metadata::orc_stripe_info> stripe_infos;
+      stripe_infos.reserve(user_specified_stripes[src_file_idx].size());
 
       // Coalesce stripe info at the source file later since that makes downstream processing much
       // easier in impl::read
@@ -185,11 +192,19 @@ aggregate_orc_metadata::select_stripes(
           stripe_idx >= 0 and stripe_idx < static_cast<decltype(stripe_idx)>(
                                              per_file_metadata[src_file_idx].ff.stripes.size()),
           "Invalid stripe index");
-        stripe_infos.push_back(
-          std::pair(&per_file_metadata[src_file_idx].ff.stripes[stripe_idx], nullptr));
-        rows_to_read += per_file_metadata[src_file_idx].ff.stripes[stripe_idx].numberOfRows;
+        stripe_infos.push_back({&per_file_metadata[src_file_idx].ff.stripes[stripe_idx],
+                                nullptr,
+                                static_cast<int>(src_file_idx)});
+
+        auto const stripe_rows =
+          per_file_metadata[src_file_idx].ff.stripes[stripe_idx].numberOfRows;
+        CUDF_EXPECTS(stripe_rows < static_cast<uint64_t>(std::numeric_limits<size_type>::max()),
+                     "The number of rows in one stripe exceeds the column size limit.",
+                     std::overflow_error);
+        rows_to_read += static_cast<int64_t>(stripe_rows);
       }
-      selected_stripes_mapping.push_back({static_cast<int>(src_file_idx), stripe_infos});
+      selected_stripes_mapping.emplace_back(
+        stripe_source_mapping{static_cast<int>(src_file_idx), std::move(stripe_infos)});
     }
   } else {
     int64_t count            = 0;
@@ -198,33 +213,44 @@ aggregate_orc_metadata::select_stripes(
     for (size_t src_file_idx = 0;
          src_file_idx < per_file_metadata.size() && count < rows_to_skip + rows_to_read;
          ++src_file_idx) {
-      std::vector<OrcStripeInfo> stripe_infos;
+      std::vector<metadata::orc_stripe_info> stripe_infos;
+      stripe_infos.reserve(per_file_metadata[src_file_idx].ff.stripes.size());
 
       for (size_t stripe_idx = 0; stripe_idx < per_file_metadata[src_file_idx].ff.stripes.size() &&
                                   count < rows_to_skip + rows_to_read;
            ++stripe_idx) {
-        count += per_file_metadata[src_file_idx].ff.stripes[stripe_idx].numberOfRows;
+        auto const stripe_rows =
+          per_file_metadata[src_file_idx].ff.stripes[stripe_idx].numberOfRows;
+        CUDF_EXPECTS(stripe_rows < static_cast<uint64_t>(std::numeric_limits<size_type>::max()),
+                     "The number of rows in one stripe exceeds the column size limit.",
+                     std::overflow_error);
+        count += static_cast<int64_t>(stripe_rows);
+
         if (count > rows_to_skip || count == 0) {
-          stripe_infos.push_back(
-            std::pair(&per_file_metadata[src_file_idx].ff.stripes[stripe_idx], nullptr));
+          stripe_infos.push_back({&per_file_metadata[src_file_idx].ff.stripes[stripe_idx],
+                                  nullptr,
+                                  static_cast<int>(src_file_idx)});
         } else {
           stripe_skip_rows = count;
         }
       }
 
-      selected_stripes_mapping.push_back({static_cast<int>(src_file_idx), stripe_infos});
+      selected_stripes_mapping.emplace_back(
+        stripe_source_mapping{static_cast<int>(src_file_idx), std::move(stripe_infos)});
     }
     // Need to remove skipped rows from the stripes which are not selected.
     rows_to_skip -= stripe_skip_rows;
   }
 
+  std::vector<metadata::orc_stripe_info> output;
+
   // Read each stripe's stripefooter metadata
   for (auto& mapping : selected_stripes_mapping) {
     // Resize to all stripe_info for the source level
     per_file_metadata[mapping.source_idx].stripefooters.resize(mapping.stripe_info.size());
 
     for (size_t i = 0; i < mapping.stripe_info.size(); i++) {
-      auto const stripe         = mapping.stripe_info[i].first;
+      auto const stripe         = mapping.stripe_info[i].stripe_info;
       auto const sf_comp_offset = stripe->offset + stripe->indexLength + stripe->dataLength;
       auto const sf_comp_length = stripe->footerLength;
       CUDF_EXPECTS(
@@ -236,12 +262,17 @@ aggregate_orc_metadata::select_stripes(
         {buffer->data(), buffer->size()}, stream);
       ProtobufReader(sf_data.data(), sf_data.size())
         .read(per_file_metadata[mapping.source_idx].stripefooters[i]);
-      mapping.stripe_info[i].second = &per_file_metadata[mapping.source_idx].stripefooters[i];
+      mapping.stripe_info[i].stripe_footer =
+        &per_file_metadata[mapping.source_idx].stripefooters[i];
       if (stripe->indexLength == 0) { row_grp_idx_present = false; }
     }
+
+    output.insert(output.end(),
+                  std::make_move_iterator(mapping.stripe_info.begin()),
+                  std::make_move_iterator(mapping.stripe_info.end()));
   }
 
-  return {rows_to_skip, rows_to_read, selected_stripes_mapping};
+  return {rows_to_skip, rows_to_read, std::move(output)};
 }
 
 column_hierarchy aggregate_orc_metadata::select_columns(
diff --git a/cpp/src/io/orc/aggregate_orc_metadata.hpp b/cpp/src/io/orc/aggregate_orc_metadata.hpp
index d1e053be481..5da5af58b9b 100644
--- a/cpp/src/io/orc/aggregate_orc_metadata.hpp
+++ b/cpp/src/io/orc/aggregate_orc_metadata.hpp
@@ -45,8 +45,6 @@ struct column_hierarchy {
  * to aggregate that metadata from all the files.
  */
 class aggregate_orc_metadata {
-  using OrcStripeInfo = std::pair<StripeInformation const*, StripeFooter const*>;
-
   /**
    * @brief Sums up the number of rows of each source
    */
@@ -114,12 +112,22 @@ class aggregate_orc_metadata {
    * @brief Selects the stripes to read, based on the row/stripe selection parameters.
    *
    * Stripes are potentially selected from multiple files.
+   *
+   * Upon parsing stripes' information, the number of skip rows and reading rows are also updated
+   * to be matched with the actual numbers for reading stripes from data sources.
+   *
+   * @param user_specified_stripes The specified stripe indices to read
+   * @param skip_rows Number of rows to skip from reading
+   * @param num_read_rows Number of rows to read
+   * @param stream CUDA stream used for device memory operations and kernel launches
+   * @return A tuple of the corrected skip_rows and num_rows values along with a vector of
+   *         stripes' metadata such as footer, data information, and source index
    */
-  [[nodiscard]] std::tuple<int64_t, size_type, std::vector<metadata::stripe_source_mapping>>
-  select_stripes(std::vector<std::vector<size_type>> const& user_specified_stripes,
-                 int64_t skip_rows,
-                 std::optional<size_type> const& num_rows,
-                 rmm::cuda_stream_view stream);
+  [[nodiscard]] std::tuple<int64_t, int64_t, std::vector<metadata::orc_stripe_info>> select_stripes(
+    std::vector<std::vector<size_type>> const& user_specified_stripes,
+    int64_t skip_rows,
+    std::optional<size_type> const& num_read_rows,
+    rmm::cuda_stream_view stream);
 
   /**
    * @brief Filters ORC file to a selection of columns, based on their paths in the file.
diff --git a/cpp/src/io/orc/orc.hpp b/cpp/src/io/orc/orc.hpp
index 88bd260a598..fd55cbb6846 100644
--- a/cpp/src/io/orc/orc.hpp
+++ b/cpp/src/io/orc/orc.hpp
@@ -602,13 +602,13 @@ struct column_validity_info {
  * convenience methods for initializing and accessing metadata.
  */
 class metadata {
-  using OrcStripeInfo = std::pair<StripeInformation const*, StripeFooter const*>;
-
  public:
-  struct stripe_source_mapping {
+  struct orc_stripe_info {
+    StripeInformation const* stripe_info;
+    StripeFooter const* stripe_footer;
     int source_idx;
-    std::vector<OrcStripeInfo> stripe_info;
   };
+  std::vector<orc_stripe_info> stripe_info;
 
  public:
   explicit metadata(datasource* const src, rmm::cuda_stream_view stream);
diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu
index 77151f5b7b8..621d4c67691 100644
--- a/cpp/src/io/orc/reader_impl.cu
+++ b/cpp/src/io/orc/reader_impl.cu
@@ -14,42 +14,100 @@
  * limitations under the License.
  */
 
-#include "reader_impl.hpp"
-#include "reader_impl_chunking.hpp"
-#include "reader_impl_helpers.hpp"
+#include "io/orc/reader_impl.hpp"
+#include "io/orc/reader_impl_chunking.hpp"
+#include "io/orc/reader_impl_helpers.hpp"
 
-#include <rmm/resource_ref.hpp>
+#include <cudf/detail/copy.hpp>
+
+#include <algorithm>
 
 namespace cudf::io::orc::detail {
 
-reader::impl::impl(std::vector<std::unique_ptr<datasource>>&& sources,
-                   orc_reader_options const& options,
-                   rmm::cuda_stream_view stream,
-                   rmm::device_async_resource_ref mr)
-  : _stream(stream),
-    _mr(mr),
-    _timestamp_type{options.get_timestamp_type()},
-    _use_index{options.is_enabled_use_index()},
-    _use_np_dtypes{options.is_enabled_use_np_dtypes()},
-    _decimal128_columns{options.get_decimal128_columns()},
-    _col_meta{std::make_unique<reader_column_meta>()},
-    _sources(std::move(sources)),
-    _metadata{_sources, stream},
-    _selected_columns{_metadata.select_columns(options.get_columns())}
+// This is just the proxy to call all other data preprocessing functions.
+void reader_impl::prepare_data(read_mode mode)
 {
+  // There are no columns in the table.
+  if (_selected_columns.num_levels() == 0) { return; }
+
+  // This will be no-op if it was called before.
+  preprocess_file(mode);
+
+  if (!_chunk_read_data.more_table_chunks_to_output()) {
+    if (!_chunk_read_data.more_stripes_to_decode() && _chunk_read_data.more_stripes_to_load()) {
+      // Only load stripe data if:
+      //  - There is more stripe to load, and
+      //  - All loaded stripes were decoded, and
+      //  - All the decoded results were output.
+      load_next_stripe_data(mode);
+    }
+    if (_chunk_read_data.more_stripes_to_decode()) {
+      // Only decompress/decode the loaded stripes if:
+      //  - There are loaded stripes that were not decoded yet, and
+      //  - All the decoded results were output.
+      decompress_and_decode_stripes(mode);
+    }
+  }
 }
 
-table_with_metadata reader::impl::read(int64_t skip_rows,
-                                       std::optional<size_type> const& num_rows_opt,
-                                       std::vector<std::vector<size_type>> const& stripes)
+table_with_metadata reader_impl::make_output_chunk()
 {
-  prepare_data(skip_rows, num_rows_opt, stripes);
-  return read_chunk_internal();
+  // There are no columns in the table.
+  if (_selected_columns.num_levels() == 0) { return {std::make_unique<table>(), table_metadata{}}; }
+
+  // If no rows or stripes to read, return empty columns.
+  if (!_chunk_read_data.more_table_chunks_to_output()) {
+    std::vector<std::unique_ptr<column>> out_columns;
+    auto out_metadata = get_meta_with_user_data();
+    std::transform(_selected_columns.levels[0].begin(),
+                   _selected_columns.levels[0].end(),
+                   std::back_inserter(out_columns),
+                   [&](auto const& col_meta) {
+                     out_metadata.schema_info.emplace_back("");
+                     return create_empty_column(col_meta.id,
+                                                _metadata,
+                                                _options.decimal128_columns,
+                                                _options.use_np_dtypes,
+                                                _options.timestamp_type,
+                                                out_metadata.schema_info.back(),
+                                                _stream);
+                   });
+    return {std::make_unique<table>(std::move(out_columns)), std::move(out_metadata)};
+  }
+
+  auto const make_output_table = [&] {
+    if (_chunk_read_data.output_table_ranges.size() == 1) {
+      // Must change the index of the current output range such that calling `has_next()` after
+      // this will return the correct answer (`false`, since there is only one range).
+      _chunk_read_data.curr_output_table_range++;
+
+      // Just hand over the decoded table without slicing.
+      return std::move(_chunk_read_data.decoded_table);
+    }
+
+    // The range of rows in the decoded table to output.
+    auto const out_range =
+      _chunk_read_data.output_table_ranges[_chunk_read_data.curr_output_table_range++];
+    auto const out_tview = cudf::detail::slice(
+      _chunk_read_data.decoded_table->view(),
+      {static_cast<size_type>(out_range.begin), static_cast<size_type>(out_range.end)},
+      _stream)[0];
+    auto output = std::make_unique<table>(out_tview, _stream, _mr);
+
+    // If this is the last slice, we also delete the decoded table to free up memory.
+    if (!_chunk_read_data.more_table_chunks_to_output()) {
+      _chunk_read_data.decoded_table.reset(nullptr);
+    }
+
+    return output;
+  };
+
+  return {make_output_table(), table_metadata{_out_metadata} /*copy cached metadata*/};
 }
 
-table_metadata reader::impl::make_output_metadata()
+table_metadata reader_impl::get_meta_with_user_data()
 {
-  if (_output_metadata) { return table_metadata{*_output_metadata}; }
+  if (_meta_with_user_data) { return table_metadata{*_meta_with_user_data}; }
 
   // Copy user data to the output metadata.
   table_metadata out_metadata;
@@ -70,69 +128,126 @@ table_metadata reader::impl::make_output_metadata()
   out_metadata.user_data = {out_metadata.per_file_user_data[0].begin(),
                             out_metadata.per_file_user_data[0].end()};
 
-  // Save the output table metadata into `_output_metadata` for reuse next time.
-  _output_metadata = std::make_unique<table_metadata>(out_metadata);
+  // Save the output table metadata into `_meta_with_user_data` for reuse next time.
+  _meta_with_user_data = std::make_unique<table_metadata>(out_metadata);
 
   return out_metadata;
 }
 
-table_with_metadata reader::impl::read_chunk_internal()
+reader_impl::reader_impl(std::vector<std::unique_ptr<datasource>>&& sources,
+                         orc_reader_options const& options,
+                         rmm::cuda_stream_view stream,
+                         rmm::device_async_resource_ref mr)
+  : reader_impl::reader_impl(0UL, 0UL, std::move(sources), options, stream, mr)
 {
-  // There is no columns in the table.
-  if (_selected_columns.num_levels() == 0) { return {std::make_unique<table>(), table_metadata{}}; }
+}
 
-  std::vector<std::unique_ptr<column>> out_columns;
-  auto out_metadata = make_output_metadata();
+reader_impl::reader_impl(std::size_t chunk_read_limit,
+                         std::size_t pass_read_limit,
+                         std::vector<std::unique_ptr<datasource>>&& sources,
+                         orc_reader_options const& options,
+                         rmm::cuda_stream_view stream,
+                         rmm::device_async_resource_ref mr)
+  : reader_impl::reader_impl(chunk_read_limit,
+                             pass_read_limit,
+                             DEFAULT_OUTPUT_ROW_GRANULARITY,
+                             std::move(sources),
+                             options,
+                             stream,
+                             mr)
+{
+}
 
-  // If no rows or stripes to read, return empty columns
-  if (_file_itm_data->rows_to_read == 0 || _file_itm_data->selected_stripes.empty()) {
-    std::transform(_selected_columns.levels[0].begin(),
-                   _selected_columns.levels[0].end(),
-                   std::back_inserter(out_columns),
-                   [&](auto const col_meta) {
-                     out_metadata.schema_info.emplace_back("");
-                     return create_empty_column(col_meta.id,
-                                                _metadata,
-                                                _decimal128_columns,
-                                                _use_np_dtypes,
-                                                _timestamp_type,
-                                                out_metadata.schema_info.back(),
-                                                _stream);
-                   });
-    return {std::make_unique<table>(std::move(out_columns)), std::move(out_metadata)};
-  }
+reader_impl::reader_impl(std::size_t chunk_read_limit,
+                         std::size_t pass_read_limit,
+                         size_type output_row_granularity,
+                         std::vector<std::unique_ptr<datasource>>&& sources,
+                         orc_reader_options const& options,
+                         rmm::cuda_stream_view stream,
+                         rmm::device_async_resource_ref mr)
+  : _stream(stream),
+    _mr(mr),
+    _options{options.get_timestamp_type(),
+             options.is_enabled_use_index(),
+             options.is_enabled_use_np_dtypes(),
+             options.get_decimal128_columns(),
+             options.get_skip_rows(),
+             options.get_num_rows(),
+             options.get_stripes()},
+    _col_meta{std::make_unique<reader_column_meta>()},
+    _sources(std::move(sources)),
+    _metadata{_sources, stream},
+    _selected_columns{_metadata.select_columns(options.get_columns())},
+    _chunk_read_data{chunk_read_limit, pass_read_limit, output_row_granularity}
+{
+  // Selected columns at different levels of nesting are stored in different elements
+  // of `selected_columns`; thus, size == 1 means no nested columns.
+  CUDF_EXPECTS(_options.skip_rows == 0 or _selected_columns.num_levels() == 1,
+               "skip_rows is not supported by nested column");
+}
+
+table_with_metadata reader_impl::read()
+{
+  prepare_data(read_mode::READ_ALL);
+  return make_output_chunk();
+}
 
-  // Create columns from buffer with respective schema information.
-  std::transform(
-    _selected_columns.levels[0].begin(),
-    _selected_columns.levels[0].end(),
-    std::back_inserter(out_columns),
-    [&](auto const& orc_col_meta) {
-      out_metadata.schema_info.emplace_back("");
-      auto col_buffer = assemble_buffer(
-        orc_col_meta.id, 0, *_col_meta, _metadata, _selected_columns, _out_buffers, _stream, _mr);
-      return make_column(col_buffer, &out_metadata.schema_info.back(), std::nullopt, _stream);
-    });
-
-  return {std::make_unique<table>(std::move(out_columns)), std::move(out_metadata)};
+bool reader_impl::has_next()
+{
+  prepare_data(read_mode::CHUNKED_READ);
+  return _chunk_read_data.has_next();
+}
+
+table_with_metadata reader_impl::read_chunk()
+{
+  prepare_data(read_mode::CHUNKED_READ);
+  return make_output_chunk();
 }
 
-// Forward to implementation
+chunked_reader::chunked_reader(std::size_t chunk_read_limit,
+                               std::size_t pass_read_limit,
+                               std::vector<std::unique_ptr<datasource>>&& sources,
+                               orc_reader_options const& options,
+                               rmm::cuda_stream_view stream,
+                               rmm::device_async_resource_ref mr)
+  : _impl{std::make_unique<reader_impl>(
+      chunk_read_limit, pass_read_limit, std::move(sources), options, stream, mr)}
+{
+}
+
+chunked_reader::chunked_reader(std::size_t chunk_read_limit,
+                               std::size_t pass_read_limit,
+                               size_type output_row_granularity,
+                               std::vector<std::unique_ptr<datasource>>&& sources,
+                               orc_reader_options const& options,
+                               rmm::cuda_stream_view stream,
+                               rmm::device_async_resource_ref mr)
+  : _impl{std::make_unique<reader_impl>(chunk_read_limit,
+                                        pass_read_limit,
+                                        output_row_granularity,
+                                        std::move(sources),
+                                        options,
+                                        stream,
+                                        mr)}
+{
+}
+
+chunked_reader::~chunked_reader() = default;
+
+bool chunked_reader::has_next() const { return _impl->has_next(); }
+
+table_with_metadata chunked_reader::read_chunk() const { return _impl->read_chunk(); }
+
 reader::reader(std::vector<std::unique_ptr<cudf::io::datasource>>&& sources,
                orc_reader_options const& options,
                rmm::cuda_stream_view stream,
                rmm::device_async_resource_ref mr)
-  : _impl{std::make_unique<impl>(std::move(sources), options, stream, mr)}
+  : _impl{std::make_unique<reader_impl>(std::move(sources), options, stream, mr)}
 {
 }
 
-// Destructor within this translation unit
 reader::~reader() = default;
 
-// Forward to implementation
-table_with_metadata reader::read(orc_reader_options const& options)
-{
-  return _impl->read(options.get_skip_rows(), options.get_num_rows(), options.get_stripes());
-}
+table_with_metadata reader::read() { return _impl->read(); }
 
 }  // namespace cudf::io::orc::detail
diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp
index 8b859da07e9..94b294087b8 100644
--- a/cpp/src/io/orc/reader_impl.hpp
+++ b/cpp/src/io/orc/reader_impl.hpp
@@ -16,8 +16,8 @@
 
 #pragma once
 
-#include "aggregate_orc_metadata.hpp"
-#include "io/utilities/column_buffer.hpp"
+#include "io/orc/aggregate_orc_metadata.hpp"
+#include "io/orc/reader_impl_chunking.hpp"
 
 #include <cudf/io/datasource.hpp>
 #include <cudf/io/detail/orc.hpp>
@@ -26,6 +26,8 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
 
+#include <io/utilities/column_buffer.hpp>
+
 #include <memory>
 #include <optional>
 #include <vector>
@@ -33,83 +35,169 @@
 namespace cudf::io::orc::detail {
 
 struct reader_column_meta;
-struct file_intermediate_data;
 
 /**
  * @brief Implementation for ORC reader.
  */
-class reader::impl {
+class reader_impl {
  public:
   /**
    * @brief Constructor from a dataset source with reader options.
    *
+   * This constructor will call the other constructor with `chunk_read_limit` and `pass_read_limit`
+   * set to `0` and `output_row_granularity` set to `DEFAULT_OUTPUT_ROW_GRANULARITY`.
+   *
    * @param sources Dataset sources
    * @param options Settings for controlling reading behavior
    * @param stream CUDA stream used for device memory operations and kernel launches
    * @param mr Device memory resource to use for device memory allocation
    */
-  explicit impl(std::vector<std::unique_ptr<datasource>>&& sources,
-                orc_reader_options const& options,
-                rmm::cuda_stream_view stream,
-                rmm::device_async_resource_ref mr);
+  explicit reader_impl(std::vector<std::unique_ptr<datasource>>&& sources,
+                       orc_reader_options const& options,
+                       rmm::cuda_stream_view stream,
+                       rmm::device_async_resource_ref mr);
 
   /**
-   * @brief Read an entire set or a subset of data and returns a set of columns
-   *
-   * @param skip_rows Number of rows to skip from the start
-   * @param num_rows_opt Optional number of rows to read, or `std::nullopt` to read all rows
-   * @param stripes Indices of individual stripes to load if non-empty
-   * @return The set of columns along with metadata
+   * @copydoc cudf::io::orc::detail::chunked_reader::chunked_reader(std::size_t, std::size_t,
+   * orc_reader_options const&, rmm::cuda_stream_view, rmm::device_async_resource_ref)
+   */
+  explicit reader_impl(std::size_t chunk_read_limit,
+                       std::size_t pass_read_limit,
+                       std::vector<std::unique_ptr<datasource>>&& sources,
+                       orc_reader_options const& options,
+                       rmm::cuda_stream_view stream,
+                       rmm::device_async_resource_ref mr);
+
+  /**
+   * @copydoc cudf::io::orc::detail::chunked_reader::chunked_reader(std::size_t, std::size_t,
+   * size_type, orc_reader_options const&, rmm::cuda_stream_view, rmm::device_async_resource_ref)
    */
-  table_with_metadata read(int64_t skip_rows,
-                           std::optional<size_type> const& num_rows_opt,
-                           std::vector<std::vector<size_type>> const& stripes);
+  explicit reader_impl(std::size_t chunk_read_limit,
+                       std::size_t pass_read_limit,
+                       size_type output_row_granularity,
+                       std::vector<std::unique_ptr<datasource>>&& sources,
+                       orc_reader_options const& options,
+                       rmm::cuda_stream_view stream,
+                       rmm::device_async_resource_ref mr);
+
+  /**
+   * @copydoc cudf::io::orc::detail::reader::read
+   */
+  table_with_metadata read();
+
+  /**
+   * @copydoc cudf::io::chunked_orc_reader::has_next
+   */
+  bool has_next();
+
+  /**
+   * @copydoc cudf::io::chunked_orc_reader::read_chunk
+   */
+  table_with_metadata read_chunk();
 
  private:
+  /**
+   * @brief The enum indicating whether the data sources are read all at once or chunk by chunk.
+   */
+  enum class read_mode { READ_ALL, CHUNKED_READ };
+
   /**
    * @brief Perform all the necessary data preprocessing before creating an output table.
    *
-   * @param skip_rows Number of rows to skip from the start
-   * @param num_rows_opt Optional number of rows to read, or `std::nullopt` to read all rows
-   * @param stripes Indices of individual stripes to load if non-empty
+   * This is the proxy to call all other data preprocessing functions, which are prerequisite
+   * for generating the output.
+   *
+   * @param mode Value indicating if the data sources are read all at once or chunk by chunk
    */
-  void prepare_data(int64_t skip_rows,
-                    std::optional<size_type> const& num_rows_opt,
-                    std::vector<std::vector<size_type>> const& stripes);
+  void prepare_data(read_mode mode);
 
   /**
-   * @brief Create the output table metadata from file metadata.
+   * @brief Perform a preprocessing step on the input data sources that executes exactly once
+   * for the entire duration of the reader.
    *
-   * @return Columns' metadata to output with the table read from file
+   * In this step, the metadata of all stripes in the data sources is parsed, and information about
+   * data streams of the selected columns in all stripes are generated. If the reader has a data
+   * read limit, sizes of these streams are used to split the list of all stripes into multiple
+   * subsets, each of which will be loaded into memory in the `load_next_stripe_data()` step. These
+   * subsets are computed such that memory usage will be kept to be around a fixed size limit.
+   *
+   * @param mode Value indicating if the data sources are read all at once or chunk by chunk
+   */
+  void preprocess_file(read_mode mode);
+
+  /**
+   * @brief Load stripes from the input data sources into memory.
+   *
+   * If there is a data read limit, only a subset of stripes are read at a time such that
+   * their total data size does not exceed a fixed size limit. Then, the data is probed to
+   * estimate its uncompressed sizes, which are in turn used to split that stripe subset into
+   * smaller subsets, each of which to be decompressed and decoded in the next step
+   * `decompress_and_decode_stripes()`. This is to ensure that loading data from data sources
+   * together with decompression and decoding will be capped around the given data read limit.
+   *
+   * @param mode Value indicating if the data sources are read all at once or chunk by chunk
    */
-  table_metadata make_output_metadata();
+  void load_next_stripe_data(read_mode mode);
 
   /**
-   * @brief Read a chunk of data from the input source and return an output table with metadata.
+   * @brief Decompress and decode stripe data in the internal buffers, and store the result into
+   * an intermediate table.
+   *
+   * This function expects that the other preprocessing steps (`global preprocess()` and
+   * `load_next_stripe_data()`) have already been done.
    *
-   * This function is called internally and expects all preprocessing steps have already been done.
+   * @param mode Value indicating if the data sources are read all at once or chunk by chunk
+   */
+  void decompress_and_decode_stripes(read_mode mode);
+
+  /**
+   * @brief Create the output table from the intermediate table and return it along with metadata.
    *
    * @return The output table along with columns' metadata
    */
-  table_with_metadata read_chunk_internal();
+  table_with_metadata make_output_chunk();
+
+  /**
+   * @brief Create the output table metadata storing user data in source metadata.
+   *
+   * @return Columns' user data to output with the table read from file
+   */
+  table_metadata get_meta_with_user_data();
 
   rmm::cuda_stream_view const _stream;
   rmm::device_async_resource_ref const _mr;
 
-  // Reader configs
-  data_type const _timestamp_type;  // Override output timestamp resolution
-  bool const _use_index;            // Enable or disable attempt to use row index for parsing
-  bool const _use_np_dtypes;        // Enable or disable the conversion to numpy-compatible dtypes
-  std::vector<std::string> const _decimal128_columns;   // Control decimals conversion
-  std::unique_ptr<reader_column_meta> const _col_meta;  // Track of orc mapping and child details
+  // Reader configs.
+  struct {
+    data_type timestamp_type;  // override output timestamp resolution
+    bool use_index;            // enable or disable attempt to use row index for parsing
+    bool use_np_dtypes;        // enable or disable the conversion to numpy-compatible dtypes
+    std::vector<std::string> decimal128_columns;  // control decimals conversion
 
-  // Intermediate data for internal processing.
+    // User specified reading rows/stripes selection.
+    int64_t const skip_rows;
+    std::optional<int64_t> num_read_rows;
+    std::vector<std::vector<size_type>> const selected_stripes;
+  } const _options;
+
+  // Intermediate data for reading.
+  std::unique_ptr<reader_column_meta> const _col_meta;  // Track of orc mapping and child details
   std::vector<std::unique_ptr<datasource>> const _sources;  // Unused but owns data for `_metadata`
   aggregate_orc_metadata _metadata;
   column_hierarchy const _selected_columns;  // Construct from `_metadata` thus declare after it
-  std::unique_ptr<file_intermediate_data> _file_itm_data;
-  std::unique_ptr<table_metadata> _output_metadata;
+  file_intermediate_data _file_itm_data;
+  chunk_read_data _chunk_read_data;
+
+  // Intermediate data for output.
+  std::unique_ptr<table_metadata> _meta_with_user_data;
+  table_metadata _out_metadata;
   std::vector<std::vector<cudf::io::detail::column_buffer>> _out_buffers;
+
+  // The default value used for subdividing the decoded table for final output.
+  // Larger values will reduce the computation time but will make the output table less granular.
+  // Smaller values (minimum is `1`) will increase the computation time but the output table will
+  // have size closer to the given `chunk_read_limit`.
+  static inline constexpr size_type DEFAULT_OUTPUT_ROW_GRANULARITY = 10'000;
 };
 
 }  // namespace cudf::io::orc::detail
diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu
new file mode 100644
index 00000000000..5034aa14a95
--- /dev/null
+++ b/cpp/src/io/orc/reader_impl_chunking.cu
@@ -0,0 +1,723 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "io/comp/gpuinflate.hpp"
+#include "io/orc/reader_impl.hpp"
+#include "io/orc/reader_impl_chunking.hpp"
+#include "io/orc/reader_impl_helpers.hpp"
+#include "io/utilities/hostdevice_span.hpp"
+
+#include <cudf/detail/timezone.hpp>
+#include <cudf/detail/utilities/integer_utils.hpp>
+#include <cudf/utilities/error.hpp>
+
+#include <rmm/device_buffer.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <thrust/binary_search.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/scan.h>
+
+#include <algorithm>
+#include <tuple>
+
+namespace cudf::io::orc::detail {
+
+std::size_t gather_stream_info_and_column_desc(
+  std::size_t stripe_id,
+  std::size_t level,
+  orc::StripeInformation const* stripeinfo,
+  orc::StripeFooter const* stripefooter,
+  host_span<int const> orc2gdf,
+  host_span<orc::SchemaType const> types,
+  bool use_index,
+  bool apply_struct_map,
+  int64_t* num_dictionary_entries,
+  std::size_t* local_stream_order,
+  std::vector<orc_stream_info>* stream_info,
+  cudf::detail::hostdevice_2dvector<gpu::ColumnDesc>* chunks)
+{
+  CUDF_EXPECTS((stream_info == nullptr) ^ (chunks == nullptr),
+               "Either stream_info or chunks must be provided, but not both.");
+
+  std::size_t src_offset = 0;
+  std::size_t dst_offset = 0;
+
+  auto const get_stream_index_type = [](orc::StreamKind kind) {
+    switch (kind) {
+      case orc::DATA: return gpu::CI_DATA;
+      case orc::LENGTH:
+      case orc::SECONDARY: return gpu::CI_DATA2;
+      case orc::DICTIONARY_DATA: return gpu::CI_DICTIONARY;
+      case orc::PRESENT: return gpu::CI_PRESENT;
+      case orc::ROW_INDEX: return gpu::CI_INDEX;
+      default:
+        // Skip this stream as it's not strictly required
+        return gpu::CI_NUM_STREAMS;
+    }
+  };
+
+  for (auto const& stream : stripefooter->streams) {
+    if (!stream.column_id || *stream.column_id >= orc2gdf.size()) {
+      // Ignore reading this stream from source.
+      CUDF_LOG_WARN("Unexpected stream in the input ORC source. The stream will be ignored.");
+      src_offset += stream.length;
+      continue;
+    }
+
+    auto const column_id = *stream.column_id;
+    auto col             = orc2gdf[column_id];
+
+    if (col == -1 and apply_struct_map) {
+      // A struct-type column has no data itself, but rather child columns
+      // for each of its fields. There is only a PRESENT stream, which
+      // needs to be included for the reader.
+      auto const schema_type = types[column_id];
+      if (!schema_type.subtypes.empty() && schema_type.kind == orc::STRUCT &&
+          stream.kind == orc::PRESENT) {
+        for (auto const& idx : schema_type.subtypes) {
+          auto const child_idx = (idx < orc2gdf.size()) ? orc2gdf[idx] : -1;
+          if (child_idx >= 0) {
+            col = child_idx;
+            if (chunks) {
+              auto& chunk                     = (*chunks)[stripe_id][col];
+              chunk.strm_id[gpu::CI_PRESENT]  = *local_stream_order;
+              chunk.strm_len[gpu::CI_PRESENT] = stream.length;
+            }
+          }
+        }
+      }
+    } else if (col != -1) {
+      if (chunks) {
+        if (src_offset >= stripeinfo->indexLength || use_index) {
+          auto const index_type = get_stream_index_type(stream.kind);
+          if (index_type < gpu::CI_NUM_STREAMS) {
+            auto& chunk                = (*chunks)[stripe_id][col];
+            chunk.strm_id[index_type]  = *local_stream_order;
+            chunk.strm_len[index_type] = stream.length;
+            // NOTE: skip_count field is temporarily used to track the presence of index streams
+            chunk.skip_count |= 1 << index_type;
+
+            if (index_type == gpu::CI_DICTIONARY) {
+              chunk.dictionary_start = *num_dictionary_entries;
+              chunk.dict_len         = stripefooter->columns[column_id].dictionarySize;
+              *num_dictionary_entries +=
+                static_cast<int64_t>(stripefooter->columns[column_id].dictionarySize);
+            }
+          }
+        }
+
+        (*local_stream_order)++;
+      } else {  // chunks == nullptr
+        stream_info->emplace_back(
+          orc_stream_info{stripeinfo->offset + src_offset,
+                          dst_offset,
+                          stream.length,
+                          stream_source_info{stripe_id, level, column_id, stream.kind}});
+      }
+
+      dst_offset += stream.length;
+    }
+    src_offset += stream.length;
+  }
+
+  return dst_offset;
+}
+
+template <typename T>
+std::vector<range> find_splits(host_span<T const> cumulative_sizes,
+                               std::size_t total_count,
+                               std::size_t size_limit)
+{
+  CUDF_EXPECTS(size_limit > 0, "Invalid size limit", std::invalid_argument);
+
+  std::vector<range> splits;
+  std::size_t cur_count{0};
+  int64_t cur_pos{0};
+  std::size_t cur_cumulative_size{0};
+
+  [[maybe_unused]] std::size_t cur_cumulative_rows{0};
+
+  auto const start = thrust::make_transform_iterator(
+    cumulative_sizes.begin(),
+    [&](auto const& size) { return size.size_bytes - cur_cumulative_size; });
+  auto const end = start + cumulative_sizes.size();
+
+  while (cur_count < total_count) {
+    int64_t split_pos = static_cast<int64_t>(
+      thrust::distance(start, thrust::lower_bound(thrust::seq, start + cur_pos, end, size_limit)));
+
+    // If we're past the end, or if the returned range has size exceeds the given size limit,
+    // move back one position.
+    if (split_pos >= static_cast<int64_t>(cumulative_sizes.size()) ||
+        (cumulative_sizes[split_pos].size_bytes > cur_cumulative_size + size_limit)) {
+      split_pos--;
+    }
+
+    if constexpr (std::is_same_v<T, cumulative_size_and_row>) {
+      // Similarly, while the returned range has total number of rows exceeds column size limit,
+      // move back one position.
+      while (split_pos > 0 && cumulative_sizes[split_pos].num_rows >
+                                cur_cumulative_rows +
+                                  static_cast<std::size_t>(std::numeric_limits<size_type>::max())) {
+        split_pos--;
+      }
+    }
+
+    // In case we have moved back too much in the steps above, far beyond the last split point, that
+    // means we could not find any range that has size fits within the given size limit.
+    // In such situations, we need to move forward until we move pass the last output range.
+    while (split_pos < (static_cast<int64_t>(cumulative_sizes.size()) - 1) &&
+           (split_pos < 0 || cumulative_sizes[split_pos].count <= cur_count)) {
+      split_pos++;
+    }
+
+    auto const start_count = cur_count;
+    cur_count              = cumulative_sizes[split_pos].count;
+    splits.emplace_back(range{start_count, cur_count});
+    cur_pos             = split_pos;
+    cur_cumulative_size = cumulative_sizes[split_pos].size_bytes;
+
+    if constexpr (std::is_same_v<T, cumulative_size_and_row>) {
+      cur_cumulative_rows = cumulative_sizes[split_pos].num_rows;
+    }
+  }
+
+  // If the last range has size smaller than `merge_threshold` the size of the second last one,
+  // merge it with the second last one.
+  // This is to prevent having the last range too small.
+  if (splits.size() > 1) {
+    double constexpr merge_threshold = 0.15;
+    if (auto const last = splits.back(), second_last = splits[splits.size() - 2];
+        last.size() <= static_cast<std::size_t>(merge_threshold * second_last.size())) {
+      splits.pop_back();
+      splits.back().end = last.end;
+    }
+  }
+
+  return splits;
+}
+
+// Since `find_splits` is a template function, we need to explicitly instantiate it so it can be
+// used outside of this TU.
+template std::vector<range> find_splits<cumulative_size>(host_span<cumulative_size const> sizes,
+                                                         std::size_t total_count,
+                                                         std::size_t size_limit);
+template std::vector<range> find_splits<cumulative_size_and_row>(
+  host_span<cumulative_size_and_row const> sizes, std::size_t total_count, std::size_t size_limit);
+
+// In this step, the metadata of all stripes in the data sources is parsed, and information about
+// data streams of the selected columns in all stripes are generated. If the reader has a data
+// read limit, sizes of these streams are used to split the list of all stripes into multiple
+// subsets, each of which will be loaded into memory in the `load_next_stripe_data()` step. These
+// subsets are computed such that memory usage will be kept to be around a fixed size limit.
+void reader_impl::preprocess_file(read_mode mode)
+{
+  if (_file_itm_data.global_preprocessed) { return; }
+  _file_itm_data.global_preprocessed = true;
+
+  //
+  // Load stripes' metadata:
+  //
+  std::tie(
+    _file_itm_data.rows_to_skip, _file_itm_data.rows_to_read, _file_itm_data.selected_stripes) =
+    _metadata.select_stripes(
+      _options.selected_stripes, _options.skip_rows, _options.num_read_rows, _stream);
+  if (!_file_itm_data.has_data()) { return; }
+
+  CUDF_EXPECTS(
+    mode == read_mode::CHUNKED_READ ||
+      _file_itm_data.rows_to_read <= static_cast<int64_t>(std::numeric_limits<size_type>::max()),
+    "READ_ALL mode does not support reading number of rows more than cudf's column size limit. "
+    "For reading large number of rows, please use chunked_reader.",
+    std::overflow_error);
+
+  auto const& selected_stripes = _file_itm_data.selected_stripes;
+  auto const num_total_stripes = selected_stripes.size();
+  auto const num_levels        = _selected_columns.num_levels();
+
+  // Set up table for converting timestamp columns from local to UTC time
+  _file_itm_data.tz_table = [&] {
+    auto const has_timestamp_column = std::any_of(
+      _selected_columns.levels.cbegin(), _selected_columns.levels.cend(), [&](auto const& col_lvl) {
+        return std::any_of(col_lvl.cbegin(), col_lvl.cend(), [&](auto const& col_meta) {
+          return _metadata.get_col_type(col_meta.id).kind == TypeKind::TIMESTAMP;
+        });
+      });
+
+    return has_timestamp_column ? cudf::detail::make_timezone_transition_table(
+                                    {}, selected_stripes[0].stripe_footer->writerTimezone, _stream)
+                                : std::make_unique<cudf::table>();
+  }();
+
+  //
+  // Pre allocate necessary memory for data processed in the other reading steps:
+  //
+  auto& stripe_data_read_ranges = _file_itm_data.stripe_data_read_ranges;
+  stripe_data_read_ranges.resize(num_total_stripes);
+
+  auto& lvl_stripe_data          = _file_itm_data.lvl_stripe_data;
+  auto& lvl_stripe_sizes         = _file_itm_data.lvl_stripe_sizes;
+  auto& lvl_stream_info          = _file_itm_data.lvl_stream_info;
+  auto& lvl_stripe_stream_ranges = _file_itm_data.lvl_stripe_stream_ranges;
+  auto& lvl_column_types         = _file_itm_data.lvl_column_types;
+  auto& lvl_nested_cols          = _file_itm_data.lvl_nested_cols;
+
+  lvl_stripe_data.resize(num_levels);
+  lvl_stripe_sizes.resize(num_levels);
+  lvl_stream_info.resize(num_levels);
+  lvl_stripe_stream_ranges.resize(num_levels);
+  lvl_column_types.resize(num_levels);
+  lvl_nested_cols.resize(num_levels);
+  _out_buffers.resize(num_levels);
+
+  auto& read_info = _file_itm_data.data_read_info;
+  auto& col_meta  = *_col_meta;
+
+  //
+  // Collect columns' types:
+  //
+  for (std::size_t level = 0; level < num_levels; ++level) {
+    lvl_stripe_sizes[level].resize(num_total_stripes);
+    lvl_stripe_stream_ranges[level].resize(num_total_stripes);
+
+    // Association between each ORC column and its cudf::column
+    col_meta.orc_col_map.emplace_back(_metadata.get_num_cols(), -1);
+
+    auto const& columns_level = _selected_columns.levels[level];
+    size_type col_id{0};
+
+    for (auto const& col : columns_level) {
+      // Map each ORC column to its column
+      col_meta.orc_col_map[level][col.id] = col_id++;
+
+      auto const col_type =
+        to_cudf_type(_metadata.get_col_type(col.id).kind,
+                     _options.use_np_dtypes,
+                     _options.timestamp_type.id(),
+                     to_cudf_decimal_type(_options.decimal128_columns, _metadata, col.id));
+      CUDF_EXPECTS(col_type != type_id::EMPTY, "Unknown type");
+
+      auto& column_types = lvl_column_types[level];
+      auto& nested_cols  = lvl_nested_cols[level];
+
+      if (col_type == type_id::DECIMAL32 or col_type == type_id::DECIMAL64 or
+          col_type == type_id::DECIMAL128) {
+        // sign of the scale is changed since cuDF follows c++ libraries like CNL
+        // which uses negative scaling, but liborc and other libraries
+        // follow positive scaling.
+        auto const scale =
+          -static_cast<size_type>(_metadata.get_col_type(col.id).scale.value_or(0));
+        column_types.emplace_back(col_type, scale);
+      } else {
+        column_types.emplace_back(col_type);
+      }
+
+      // Map each ORC column to its column.
+      if (col_type == type_id::LIST or col_type == type_id::STRUCT) {
+        nested_cols.emplace_back(col);
+      }
+    }
+
+    // Try to reserve some memory, but the final size is unknown,
+    // since each column may have more than one stream.
+    auto const num_columns = columns_level.size();
+    lvl_stream_info[level].reserve(num_total_stripes * num_columns);
+    if (read_info.capacity() < num_total_stripes * num_columns) {
+      read_info.reserve(num_total_stripes * num_columns);
+    }
+  }
+
+  //
+  // Collect all data streams' information:
+  //
+
+  // Load all stripes if we are in READ_ALL mode or there is no read limit.
+  auto const load_all_stripes =
+    mode == read_mode::READ_ALL || _chunk_read_data.pass_read_limit == 0;
+
+  // Accumulate data size for data streams in each stripe, used for chunking.
+  // This will be used only for CHUNKED_READ mode when there is a read limit.
+  // Otherwise, we do not need this since we just load all stripes.
+  cudf::detail::hostdevice_vector<cumulative_size> total_stripe_sizes(
+    load_all_stripes ? std::size_t{0} : num_total_stripes, _stream);
+
+  for (std::size_t stripe_global_idx = 0; stripe_global_idx < num_total_stripes;
+       ++stripe_global_idx) {
+    auto const& stripe       = selected_stripes[stripe_global_idx];
+    auto const stripe_info   = stripe.stripe_info;
+    auto const stripe_footer = stripe.stripe_footer;
+
+    std::size_t this_stripe_size{0};
+    auto const last_read_size = read_info.size();
+    for (std::size_t level = 0; level < num_levels; ++level) {
+      auto& stream_info = _file_itm_data.lvl_stream_info[level];
+
+      auto stream_level_count = stream_info.size();
+      auto const stripe_level_size =
+        gather_stream_info_and_column_desc(stripe_global_idx,
+                                           level,
+                                           stripe_info,
+                                           stripe_footer,
+                                           col_meta.orc_col_map[level],
+                                           _metadata.get_types(),
+                                           false,  // use_index,
+                                           level == 0,
+                                           nullptr,  // num_dictionary_entries
+                                           nullptr,  // local_stream_order
+                                           &stream_info,
+                                           nullptr  // chunks
+        );
+
+      auto const is_stripe_data_empty = stripe_level_size == 0;
+      CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0,
+                   "Invalid index rowgroup stream data");
+
+      lvl_stripe_sizes[level][stripe_global_idx] = stripe_level_size;
+      this_stripe_size += stripe_level_size;
+
+      // Range of the streams in `stream_info` corresponding to this stripe at the current level.
+      lvl_stripe_stream_ranges[level][stripe_global_idx] =
+        range{stream_level_count, stream_info.size()};
+
+      // Coalesce consecutive streams into one read.
+      while (not is_stripe_data_empty and stream_level_count < stream_info.size()) {
+        auto const d_dst  = stream_info[stream_level_count].dst_pos;
+        auto const offset = stream_info[stream_level_count].offset;
+        auto len          = stream_info[stream_level_count].length;
+        stream_level_count++;
+
+        while (stream_level_count < stream_info.size() &&
+               stream_info[stream_level_count].offset == offset + len) {
+          len += stream_info[stream_level_count].length;
+          stream_level_count++;
+        }
+        read_info.emplace_back(stream_data_read_info{offset,
+                                                     d_dst,
+                                                     len,
+                                                     static_cast<std::size_t>(stripe.source_idx),
+                                                     stripe_global_idx,
+                                                     level});
+      }
+    }  // end loop level
+
+    if (!load_all_stripes) { total_stripe_sizes[stripe_global_idx] = {1, this_stripe_size}; }
+
+    // Range of all stream reads in `read_info` corresponding to this stripe, in all levels.
+    stripe_data_read_ranges[stripe_global_idx] = range{last_read_size, read_info.size()};
+  }
+
+  //
+  // Split range of all stripes into subranges that can be loaded separately while maintaining
+  // the memory usage under the given pass limit:
+  //
+
+  // Load range is reset to start from the first position in `load_stripe_ranges`.
+  _chunk_read_data.curr_load_stripe_range = 0;
+
+  if (load_all_stripes) {
+    _chunk_read_data.load_stripe_ranges = {range{0UL, num_total_stripes}};
+    return;
+  }
+
+  // Compute the prefix sum of stripes' data sizes.
+  total_stripe_sizes.host_to_device_async(_stream);
+  thrust::inclusive_scan(rmm::exec_policy_nosync(_stream),
+                         total_stripe_sizes.d_begin(),
+                         total_stripe_sizes.d_end(),
+                         total_stripe_sizes.d_begin(),
+                         cumulative_size_plus{});
+  total_stripe_sizes.device_to_host_sync(_stream);
+
+  auto const load_limit = [&] {
+    auto const tmp = static_cast<std::size_t>(_chunk_read_data.pass_read_limit *
+                                              chunk_read_data::load_limit_ratio);
+    // Make sure not to pass 0 byte limit (due to round-off) to `find_splits`.
+    return std::max(tmp, 1UL);
+  }();
+
+  _chunk_read_data.load_stripe_ranges =
+    find_splits<cumulative_size>(total_stripe_sizes, num_total_stripes, load_limit);
+}
+
+// If there is a data read limit, only a subset of stripes are read at a time such that
+// their total data size does not exceed a fixed size limit. Then, the data is probed to
+// estimate its uncompressed sizes, which are in turn used to split that stripe subset into
+// smaller subsets, each of which to be decompressed and decoded in the next step
+// `decompress_and_decode_stripes()`. This is to ensure that loading data from data sources
+// together with decompression and decoding will be capped around the given data read limit.
+void reader_impl::load_next_stripe_data(read_mode mode)
+{
+  if (!_file_itm_data.has_data()) { return; }
+
+  auto const load_stripe_range =
+    _chunk_read_data.load_stripe_ranges[_chunk_read_data.curr_load_stripe_range++];
+  auto const stripe_start = load_stripe_range.begin;
+  auto const stripe_count = load_stripe_range.size();
+
+  auto& lvl_stripe_data = _file_itm_data.lvl_stripe_data;
+  auto const num_levels = _selected_columns.num_levels();
+
+  // Prepare the buffer to read raw data onto.
+  for (std::size_t level = 0; level < num_levels; ++level) {
+    auto& stripe_data = lvl_stripe_data[level];
+    stripe_data.resize(stripe_count);
+
+    for (std::size_t idx = 0; idx < stripe_count; ++idx) {
+      auto const stripe_size = _file_itm_data.lvl_stripe_sizes[level][idx + stripe_start];
+      stripe_data[idx]       = rmm::device_buffer(
+        cudf::util::round_up_safe(stripe_size, BUFFER_PADDING_MULTIPLE), _stream);
+    }
+  }
+
+  //
+  // Load stripe data into memory:
+  //
+
+  // If we load data from sources into host buffers, we need to transfer (async) data to device
+  // memory. Such host buffers need to be kept alive until we sync the transfers.
+  std::vector<std::unique_ptr<cudf::io::datasource::buffer>> host_read_buffers;
+
+  // If we load data directly from sources into device memory, the loads are also async.
+  // Thus, we need to make sure to sync all them at the end.
+  std::vector<std::pair<std::future<std::size_t>, std::size_t>> device_read_tasks;
+
+  // Range of the read info (offset, length) to read for the current being loaded stripes.
+  auto const [read_begin, read_end] =
+    merge_selected_ranges(_file_itm_data.stripe_data_read_ranges, load_stripe_range);
+
+  for (auto read_idx = read_begin; read_idx < read_end; ++read_idx) {
+    auto const& read_info = _file_itm_data.data_read_info[read_idx];
+    auto const source_ptr = _metadata.per_file_metadata[read_info.source_idx].source;
+    auto const dst_base   = static_cast<uint8_t*>(
+      lvl_stripe_data[read_info.level][read_info.stripe_idx - stripe_start].data());
+
+    if (source_ptr->is_device_read_preferred(read_info.length)) {
+      device_read_tasks.push_back(
+        std::pair(source_ptr->device_read_async(
+                    read_info.offset, read_info.length, dst_base + read_info.dst_pos, _stream),
+                  read_info.length));
+
+    } else {
+      auto buffer = source_ptr->host_read(read_info.offset, read_info.length);
+      CUDF_EXPECTS(buffer->size() == read_info.length, "Unexpected discrepancy in bytes read.");
+      CUDF_CUDA_TRY(cudaMemcpyAsync(dst_base + read_info.dst_pos,
+                                    buffer->data(),
+                                    read_info.length,
+                                    cudaMemcpyDefault,
+                                    _stream.value()));
+      host_read_buffers.emplace_back(std::move(buffer));
+    }
+  }
+
+  if (host_read_buffers.size() > 0) {  // if there was host read
+    _stream.synchronize();
+    host_read_buffers.clear();  // its data was copied to device memory after stream sync
+  }
+  for (auto& task : device_read_tasks) {  // if there was device read
+    CUDF_EXPECTS(task.first.get() == task.second, "Unexpected discrepancy in bytes read.");
+  }
+
+  // Compute number of rows in the loading stripes.
+  auto const num_loading_rows = std::accumulate(
+    _file_itm_data.selected_stripes.begin() + stripe_start,
+    _file_itm_data.selected_stripes.begin() + stripe_start + stripe_count,
+    std::size_t{0},
+    [](std::size_t count, const auto& stripe) { return count + stripe.stripe_info->numberOfRows; });
+
+  // Decoding range needs to be reset to start from the first position in `decode_stripe_ranges`.
+  _chunk_read_data.curr_decode_stripe_range = 0;
+
+  // The cudf's column size limit.
+  auto constexpr column_size_limit =
+    static_cast<std::size_t>(std::numeric_limits<size_type>::max());
+
+  // Decode all loaded stripes if there is no read limit, or if we are in READ_ALL mode,
+  // and the number of loading rows is less than the column size limit.
+  // In theory, we should just decode 'enough' stripes for output one table chunk, instead of
+  // decoding all stripes like this, for better load-balancing and reduce memory usage.
+  // However, we do not have any good way to know how many stripes are 'enough'.
+  if ((mode == read_mode::READ_ALL || _chunk_read_data.pass_read_limit == 0) &&
+      // In addition to read limit, we also need to check if the total number of
+      // rows in the loaded stripes exceeds the column size limit.
+      // If that is the case, we cannot decode all stripes at once into a cudf table.
+      num_loading_rows <= column_size_limit) {
+    _chunk_read_data.decode_stripe_ranges = {load_stripe_range};
+    return;
+  }
+
+  // From here, we have reading mode that is either:
+  // - CHUNKED_READ without read limit but the number of reading rows exceeds column size limit, or
+  // - CHUNKED_READ with a pass read limit.
+  // READ_ALL mode with number of rows more than cudf's column size limit should be handled early in
+  // `preprocess_file`. We just check again to make sure such situations never happen here.
+  CUDF_EXPECTS(
+    mode != read_mode::READ_ALL,
+    "READ_ALL mode does not support reading number of rows more than cudf's column size limit.");
+
+  // This is the post-processing step after we've done with splitting `load_stripe_range` into
+  // `decode_stripe_ranges`.
+  auto const add_range_offset = [stripe_start](std::vector<range>& new_ranges) {
+    // The split ranges always start from zero.
+    // We need to change these ranges to start from `stripe_start` which are the correct subranges
+    // of the current loaded stripe range.
+    for (auto& range : new_ranges) {
+      range.begin += stripe_start;
+      range.end += stripe_start;
+    }
+  };
+
+  // Optimized code path when we do not have any read limit but the number of rows in the
+  // loaded stripes exceeds column size limit.
+  // Note that the values `max_uncompressed_size` for each stripe are not computed here.
+  // Instead, they will be computed on the fly during decoding to avoid the overhead of
+  // storing and retrieving from memory.
+  if (_chunk_read_data.pass_read_limit == 0 && num_loading_rows > column_size_limit) {
+    std::vector<cumulative_size_and_row> cumulative_stripe_rows(stripe_count);
+    std::size_t rows{0};
+
+    for (std::size_t idx = 0; idx < stripe_count; ++idx) {
+      auto const& stripe     = _file_itm_data.selected_stripes[idx + stripe_start];
+      auto const stripe_info = stripe.stripe_info;
+      rows += stripe_info->numberOfRows;
+
+      // We will split stripe ranges based only on stripes' number of rows, not data size.
+      // Thus, we override the cumulative `size_bytes` using the prefix sum of rows in stripes and
+      // will use the column size limit as the split size limit.
+      cumulative_stripe_rows[idx] =
+        cumulative_size_and_row{idx + 1UL /*count*/, rows /*size_bytes*/, rows};
+    }
+
+    _chunk_read_data.decode_stripe_ranges =
+      find_splits<cumulative_size_and_row>(cumulative_stripe_rows, stripe_count, column_size_limit);
+    add_range_offset(_chunk_read_data.decode_stripe_ranges);
+    return;
+  }
+
+  //
+  // Split range of loaded stripes into subranges that can be decoded separately such that the
+  // memory usage is maintained around the given limit:
+  //
+
+  // This is for estimating the decompressed sizes of the loaded stripes.
+  cudf::detail::hostdevice_vector<cumulative_size_and_row> stripe_decomp_sizes(stripe_count,
+                                                                               _stream);
+
+  // Fill up the `cumulative_size_and_row` array with initial values.
+  // Note: `hostdevice_vector::begin()` mirrors `std::vector::data()` using incorrect API name.
+  for (std::size_t idx = 0; idx < stripe_count; ++idx) {
+    auto const& stripe     = _file_itm_data.selected_stripes[idx + stripe_start];
+    auto const stripe_info = stripe.stripe_info;
+    stripe_decomp_sizes[idx] =
+      cumulative_size_and_row{1UL /*count*/, 0UL /*size_bytes*/, stripe_info->numberOfRows};
+  }
+
+  auto& compinfo_map = _file_itm_data.compinfo_map;
+  compinfo_map.clear();  // clear cache of the last load
+
+  // For parsing decompression data.
+  // We create an array that is large enough to use for all levels, thus only need to allocate
+  // memory once.
+  auto hd_compinfo = [&] {
+    std::size_t max_num_streams{0};
+    if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) {
+      // Find the maximum number of streams in all levels of the loaded stripes.
+      for (std::size_t level = 0; level < num_levels; ++level) {
+        auto const stream_range =
+          merge_selected_ranges(_file_itm_data.lvl_stripe_stream_ranges[level], load_stripe_range);
+        max_num_streams = std::max(max_num_streams, stream_range.size());
+      }
+    }
+    return cudf::detail::hostdevice_vector<gpu::CompressedStreamInfo>(max_num_streams, _stream);
+  }();
+
+  for (std::size_t level = 0; level < num_levels; ++level) {
+    auto const& stream_info = _file_itm_data.lvl_stream_info[level];
+    auto const num_columns  = _selected_columns.levels[level].size();
+
+    auto& stripe_data = lvl_stripe_data[level];
+    if (stripe_data.empty()) { continue; }
+
+    // Range of all streams in the loaded stripes.
+    auto const stream_range =
+      merge_selected_ranges(_file_itm_data.lvl_stripe_stream_ranges[level], load_stripe_range);
+
+    if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) {
+      auto const& decompressor = *_metadata.per_file_metadata[0].decompressor;
+
+      auto compinfo = cudf::detail::hostdevice_span<gpu::CompressedStreamInfo>(
+        hd_compinfo.begin(), hd_compinfo.d_begin(), stream_range.size());
+      for (auto stream_idx = stream_range.begin; stream_idx < stream_range.end; ++stream_idx) {
+        auto const& info = stream_info[stream_idx];
+        auto const dst_base =
+          static_cast<uint8_t const*>(stripe_data[info.source.stripe_idx - stripe_start].data());
+        compinfo[stream_idx - stream_range.begin] =
+          gpu::CompressedStreamInfo(dst_base + info.dst_pos, info.length);
+      }
+
+      // Estimate the uncompressed data.
+      compinfo.host_to_device_async(_stream);
+      gpu::ParseCompressedStripeData(compinfo.device_ptr(),
+                                     compinfo.size(),
+                                     decompressor.GetBlockSize(),
+                                     decompressor.GetLog2MaxCompressionRatio(),
+                                     _stream);
+      compinfo.device_to_host_sync(_stream);
+
+      for (auto stream_idx = stream_range.begin; stream_idx < stream_range.end; ++stream_idx) {
+        auto const& info           = stream_info[stream_idx];
+        auto const stream_compinfo = compinfo[stream_idx - stream_range.begin];
+
+        // Cache these parsed numbers so they can be reused in the decompression/decoding step.
+        compinfo_map[info.source] = {stream_compinfo.num_compressed_blocks,
+                                     stream_compinfo.num_uncompressed_blocks,
+                                     stream_compinfo.max_uncompressed_size};
+        stripe_decomp_sizes[info.source.stripe_idx - stripe_start].size_bytes +=
+          stream_compinfo.max_uncompressed_size;
+      }
+
+    } else {  // no decompression
+      // Set decompression sizes equal to the input sizes.
+      for (auto stream_idx = stream_range.begin; stream_idx < stream_range.end; ++stream_idx) {
+        auto const& info = stream_info[stream_idx];
+        stripe_decomp_sizes[info.source.stripe_idx - stripe_start].size_bytes += info.length;
+      }
+    }
+  }  // end loop level
+
+  // Compute the prefix sum of stripe data sizes and rows.
+  stripe_decomp_sizes.host_to_device_async(_stream);
+  thrust::inclusive_scan(rmm::exec_policy_nosync(_stream),
+                         stripe_decomp_sizes.d_begin(),
+                         stripe_decomp_sizes.d_end(),
+                         stripe_decomp_sizes.d_begin(),
+                         cumulative_size_plus{});
+  stripe_decomp_sizes.device_to_host_sync(_stream);
+
+  auto const decode_limit = [&] {
+    auto const tmp = static_cast<std::size_t>(_chunk_read_data.pass_read_limit *
+                                              chunk_read_data::decompress_and_decode_limit_ratio);
+    // Make sure not to pass 0 byte limit to `find_splits`.
+    return std::max(tmp, 1UL);
+  }();
+
+  _chunk_read_data.decode_stripe_ranges =
+    find_splits<cumulative_size_and_row>(stripe_decomp_sizes, stripe_count, decode_limit);
+
+  add_range_offset(_chunk_read_data.decode_stripe_ranges);
+}
+
+}  // namespace cudf::io::orc::detail
diff --git a/cpp/src/io/orc/reader_impl_chunking.hpp b/cpp/src/io/orc/reader_impl_chunking.hpp
index 0ad0f9af589..4ef68ee8d86 100644
--- a/cpp/src/io/orc/reader_impl_chunking.hpp
+++ b/cpp/src/io/orc/reader_impl_chunking.hpp
@@ -24,18 +24,298 @@
 #include <rmm/device_buffer.hpp>
 #include <rmm/device_uvector.hpp>
 
+#include <unordered_map>
+
 namespace cudf::io::orc::detail {
 
 /**
- * @brief Struct to store file-level data that remains constant for all chunks being read.
+ * @brief Struct representing a range of of data offsets.
+ */
+struct range {
+  std::size_t begin{0};
+  std::size_t end{0};
+
+  [[nodiscard]] auto size() const { return end - begin; }
+};
+
+/**
+ * @brief Expand a range of ranges into a simple range of data.
+ *
+ * @param input_ranges The list of all data ranges
+ * @param selected_ranges A range of ranges from `input_ranges`
+ * @return The range of data span by the selected range of ranges
+ */
+inline range merge_selected_ranges(host_span<range const> input_ranges,
+                                   range const& selected_ranges)
+{
+  // The first and last range.
+  auto const& first_range = input_ranges[selected_ranges.begin];
+  auto const& last_range  = input_ranges[selected_ranges.end - 1];
+
+  // The range of data covered from the first to the last range.
+  return {first_range.begin, last_range.end};
+}
+
+// Store information to identify where to read a chunk of data from source.
+// Each read corresponds to one or more consecutive streams combined.
+struct stream_data_read_info {
+  uint64_t offset;         // offset in data source
+  std::size_t dst_pos;     // offset to store data in memory relative to start of raw stripe data
+  std::size_t length;      // data length to read
+  std::size_t source_idx;  // the data source id
+  std::size_t stripe_idx;  // global stripe index
+  std::size_t level;       // nested level
+};
+
+/**
+ * @brief Compression information for a stripe at a specific nested level.
+ */
+struct stripe_level_comp_info {
+  std::size_t num_compressed_blocks{0};
+  std::size_t num_uncompressed_blocks{0};
+  std::size_t total_decomp_size{0};
+};
+
+/**
+ * @brief Struct that stores source information of an ORC streams.
+ */
+struct stream_source_info {
+  std::size_t stripe_idx;  // global stripe id throughout all data sources
+  std::size_t level;       // level of the nested column
+  uint32_t orc_col_idx;    // orc column id
+  StreamKind kind;         // stream kind
+
+  struct hash {
+    std::size_t operator()(stream_source_info const& id) const
+    {
+      auto const col_kind =
+        static_cast<std::size_t>(id.orc_col_idx) | (static_cast<std::size_t>(id.kind) << 32);
+      auto const hasher = std::hash<size_t>{};
+      return hasher(id.stripe_idx) ^ hasher(id.level) ^ hasher(col_kind);
+    }
+  };
+  struct equal_to {
+    bool operator()(stream_source_info const& lhs, stream_source_info const& rhs) const
+    {
+      return lhs.stripe_idx == rhs.stripe_idx && lhs.level == rhs.level &&
+             lhs.orc_col_idx == rhs.orc_col_idx && lhs.kind == rhs.kind;
+    }
+  };
+};
+
+/**
+ * @brief Map to lookup a value from stream source.
+ */
+template <typename T>
+using stream_source_map =
+  std::unordered_map<stream_source_info, T, stream_source_info::hash, stream_source_info::equal_to>;
+
+/**
+ * @brief Struct that stores information of an ORC stream.
+ */
+struct orc_stream_info {
+  // Data info:
+  uint64_t offset;      // offset in data source
+  std::size_t dst_pos;  // offset to store data in memory relative to start of raw stripe data
+  std::size_t length;   // stream length to read
+
+  // Store source of the stream in the stripe, so we can look up where this stream comes from.
+  stream_source_info source;
+};
+
+/**
+ * @brief Struct storing intermediate processing data loaded from data sources.
  */
 struct file_intermediate_data {
+  int64_t rows_to_skip;
+  int64_t rows_to_read;
+  std::vector<metadata::orc_stripe_info> selected_stripes;
+
+  // Check if there is data to read.
+  bool has_data() const { return rows_to_read > 0 && !selected_stripes.empty(); }
+
+  // For each stripe, we perform a number of reads for its streams.
+  // Those reads are identified by a chunk of consecutive read info stored in `data_read_info`.
+  std::vector<range> stripe_data_read_ranges;
+
+  // Identify what data to read from source.
+  std::vector<stream_data_read_info> data_read_info;
+
+  // Store the compression information for each data stream.
+  stream_source_map<stripe_level_comp_info> compinfo_map;
+
+  // Store info for each ORC stream at each nested level.
+  std::vector<std::vector<orc_stream_info>> lvl_stream_info;
+
+  // At each nested level, the streams for each stripe are stored consecutively in lvl_stream_info.
+  // This is used to identify the range of streams for each stripe from that vector.
+  std::vector<std::vector<range>> lvl_stripe_stream_ranges;
+
+  // The buffers to store raw data read from disk, initialized for each reading stripe chunks.
+  // After decoding, such buffers can be released.
+  // This can only be implemented after chunked output is ready.
   std::vector<std::vector<rmm::device_buffer>> lvl_stripe_data;
-  std::vector<std::vector<rmm::device_uvector<uint32_t>>> null_count_prefix_sums;
 
-  int64_t rows_to_skip;
-  size_type rows_to_read;
-  std::vector<metadata::stripe_source_mapping> selected_stripes;
+  // Store the size of each stripe at each nested level.
+  // This is used to initialize the stripe_data buffers.
+  std::vector<std::vector<std::size_t>> lvl_stripe_sizes;
+
+  // List of column data types at each nested level.
+  std::vector<std::vector<data_type>> lvl_column_types;
+
+  // List of nested type columns at each nested level.
+  std::vector<std::vector<orc_column_meta>> lvl_nested_cols;
+
+  // Table for converting timestamp columns from local to UTC time.
+  std::unique_ptr<cudf::table> tz_table;
+
+  bool global_preprocessed{false};
+};
+
+/**
+ * @brief Struct collecting data necessary for chunked reading.
+ */
+struct chunk_read_data {
+  explicit chunk_read_data(std::size_t output_size_limit_,
+                           std::size_t data_read_limit_,
+                           size_type output_row_granularity_)
+    : chunk_read_limit{output_size_limit_},
+      pass_read_limit{data_read_limit_},
+      output_row_granularity{output_row_granularity_}
+  {
+    CUDF_EXPECTS(output_row_granularity > 0,
+                 "The value of `output_row_granularity` must be positive.");
+  }
+
+  std::size_t const
+    chunk_read_limit;  // maximum size (in bytes) of an output chunk, or 0 for no limit
+  std::size_t const pass_read_limit;  // approximate maximum size (in bytes) used for store
+                                      // intermediate data, or 0 for no limit
+  size_type const output_row_granularity;
+
+  // Memory limits for loading data and decoding are computed as
+  // `*_limit_ratio * pass_read_limit`.
+  // This is to maintain the total memory usage to be **around** the given `pass_read_limit`.
+  // Note that sum of these limits may not be `1.0`, and their values are set empirically.
+  static double constexpr load_limit_ratio{0.25};
+  static double constexpr decompress_and_decode_limit_ratio{0.6};
+
+  // Chunks of stripes that can be loaded into memory such that their data size is within the user
+  // specified limit.
+  std::vector<range> load_stripe_ranges;
+  std::size_t curr_load_stripe_range{0};
+  bool more_stripes_to_load() const { return curr_load_stripe_range < load_stripe_ranges.size(); }
+
+  // Chunks of stripes such that their decompression size is within the user specified size limit.
+  std::vector<range> decode_stripe_ranges;
+  std::size_t curr_decode_stripe_range{0};
+  bool more_stripes_to_decode() const
+  {
+    return curr_decode_stripe_range < decode_stripe_ranges.size();
+  }
+
+  // Chunk of rows in the internal decoded table to output for each `read_chunk()`.
+  std::vector<range> output_table_ranges;
+  std::size_t curr_output_table_range{0};
+  std::unique_ptr<cudf::table> decoded_table;
+  bool more_table_chunks_to_output() const
+  {
+    return curr_output_table_range < output_table_ranges.size();
+  }
+
+  bool has_next() const
+  {
+    // Only has more chunk to output if:
+    return more_stripes_to_load() || more_stripes_to_decode() || more_table_chunks_to_output();
+  }
+};
+
+/**
+ * @brief Struct to accumulate counts and sizes of some types such as stripes or rows.
+ */
+struct cumulative_size {
+  std::size_t count{0};
+  std::size_t size_bytes{0};
 };
 
+/**
+ * @brief Struct to accumulate counts, sizes, and number of rows of some types such as stripes or
+ * rows in tables.
+ */
+struct cumulative_size_and_row : public cumulative_size {
+  std::size_t num_rows{0};
+};
+
+/**
+ * @brief Functor to sum up cumulative data.
+ */
+struct cumulative_size_plus {
+  __device__ cumulative_size operator()(cumulative_size const& a, cumulative_size const& b) const
+  {
+    return cumulative_size{a.count + b.count, a.size_bytes + b.size_bytes};
+  }
+
+  __device__ cumulative_size_and_row operator()(cumulative_size_and_row const& a,
+                                                cumulative_size_and_row const& b) const
+  {
+    return cumulative_size_and_row{
+      a.count + b.count, a.size_bytes + b.size_bytes, a.num_rows + b.num_rows};
+  }
+};
+
+/**
+ * @brief Find the splits of the input data such that each split range has cumulative size less than
+ * a given `size_limit`.
+ *
+ * Note that the given limit is just a soft limit. The function will always output ranges that
+ * have at least one count, even such ranges have sizes exceed the value of `size_limit`.
+ *
+ * @param cumulative_sizes The input cumulative sizes to compute split ranges
+ * @param total_count The total count in the entire input
+ * @param size_limit The given soft limit to compute splits; must be positive
+ * @return A vector of ranges as splits of the input
+ */
+template <typename T>
+std::vector<range> find_splits(host_span<T const> cumulative_sizes,
+                               std::size_t total_count,
+                               std::size_t size_limit);
+
+/**
+ * @brief Function that populates descriptors for either individual streams or chunks of column
+ * data, but not both.
+ *
+ * This function is firstly used in the global step, to gather information for streams of all
+ * stripes in the data sources (when `stream_info` is present). Later on, it is used again to
+ * populate column descriptors (`chunks` is present) during decompression and decoding. The two
+ * steps share most of the execution path thus this function takes mutually exclusive parameters
+ * `stream_info` or `chunks` depending on each use case.
+ *
+ * @param stripe_id The index of the current stripe, can be global index or local decoding index
+ * @param level The current processing nested level
+ * @param stripeinfo The pointer to current stripe's information
+ * @param stripefooter The pointer to current stripe's footer
+ * @param orc2gdf The mapping from ORC column ids to gdf column ids
+ * @param types The schema type
+ * @param use_index Whether to use the row index for parsing
+ * @param apply_struct_map Indicating if this is the root level
+ * @param num_dictionary_entries The number of dictionary entries
+ * @param local_stream_order For retrieving 0-based orders of streams in the decoding step
+ * @param stream_info The vector of streams' information
+ * @param chunks The vector of column descriptors
+ * @return The number of bytes in the gathered streams
+ */
+std::size_t gather_stream_info_and_column_desc(
+  std::size_t stripe_id,
+  std::size_t level,
+  orc::StripeInformation const* stripeinfo,
+  orc::StripeFooter const* stripefooter,
+  host_span<int const> orc2gdf,
+  host_span<orc::SchemaType const> types,
+  bool use_index,
+  bool apply_struct_map,
+  int64_t* num_dictionary_entries,
+  std::size_t* local_stream_order,
+  std::vector<orc_stream_info>* stream_info,
+  cudf::detail::hostdevice_2dvector<gpu::ColumnDesc>* chunks);
+
 }  // namespace cudf::io::orc::detail
diff --git a/cpp/src/io/orc/reader_impl_preprocess.cu b/cpp/src/io/orc/reader_impl_decode.cu
similarity index 56%
rename from cpp/src/io/orc/reader_impl_preprocess.cu
rename to cpp/src/io/orc/reader_impl_decode.cu
index 04cb223c696..ec936b85761 100644
--- a/cpp/src/io/orc/reader_impl_preprocess.cu
+++ b/cpp/src/io/orc/reader_impl_decode.cu
@@ -16,17 +16,17 @@
 
 #include "io/comp/gpuinflate.hpp"
 #include "io/comp/nvcomp_adapter.hpp"
+#include "io/orc/reader_impl.hpp"
+#include "io/orc/reader_impl_chunking.hpp"
+#include "io/orc/reader_impl_helpers.hpp"
 #include "io/utilities/config_utils.hpp"
-#include "reader_impl.hpp"
-#include "reader_impl_chunking.hpp"
-#include "reader_impl_helpers.hpp"
+#include "io/utilities/hostdevice_span.hpp"
 
-#include <cudf/detail/timezone.hpp>
+#include <cudf/detail/copy.hpp>
+#include <cudf/detail/transform.hpp>
 #include <cudf/detail/utilities/integer_utils.hpp>
-#include <cudf/detail/utilities/logger.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/table/table.hpp>
-#include <cudf/utilities/bit.hpp>
 #include <cudf/utilities/error.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -45,175 +45,104 @@
 #include <thrust/transform.h>
 
 #include <algorithm>
-#include <iterator>
+#include <numeric>
 
 namespace cudf::io::orc::detail {
 
 namespace {
 
 /**
- * @brief Struct that maps ORC streams to columns
- */
-struct orc_stream_info {
-  explicit orc_stream_info(uint64_t offset_,
-                           std::size_t dst_pos_,
-                           uint32_t length_,
-                           uint32_t stripe_idx_)
-    : offset(offset_), dst_pos(dst_pos_), length(length_), stripe_idx(stripe_idx_)
-  {
-  }
-  uint64_t offset;      // offset in file
-  std::size_t dst_pos;  // offset in memory relative to start of compressed stripe data
-  std::size_t length;   // length in file
-  uint32_t stripe_idx;  // stripe index
-};
-
-/**
- * @brief Function that populates column descriptors stream/chunk
- */
-std::size_t gather_stream_info(std::size_t stripe_index,
-                               orc::StripeInformation const* stripeinfo,
-                               orc::StripeFooter const* stripefooter,
-                               host_span<int const> orc2gdf,
-                               host_span<orc::SchemaType const> types,
-                               bool use_index,
-                               bool apply_struct_map,
-                               int64_t* num_dictionary_entries,
-                               std::vector<orc_stream_info>& stream_info,
-                               cudf::detail::hostdevice_2dvector<gpu::ColumnDesc>& chunks)
-{
-  uint64_t src_offset = 0;
-  uint64_t dst_offset = 0;
-
-  auto const get_stream_index_type = [](orc::StreamKind kind) {
-    switch (kind) {
-      case orc::DATA: return gpu::CI_DATA;
-      case orc::LENGTH:
-      case orc::SECONDARY: return gpu::CI_DATA2;
-      case orc::DICTIONARY_DATA: return gpu::CI_DICTIONARY;
-      case orc::PRESENT: return gpu::CI_PRESENT;
-      case orc::ROW_INDEX: return gpu::CI_INDEX;
-      default:
-        // Skip this stream as it's not strictly required
-        return gpu::CI_NUM_STREAMS;
-    }
-  };
-
-  for (auto const& stream : stripefooter->streams) {
-    if (!stream.column_id || *stream.column_id >= orc2gdf.size()) {
-      // Ignore reading this stream from source.
-      cudf::logger().warn("Unexpected stream in the input ORC source. The stream will be ignored.");
-      src_offset += stream.length;
-      continue;
-    }
-
-    auto const column_id = *stream.column_id;
-    auto col             = orc2gdf[column_id];
-
-    if (col == -1 and apply_struct_map) {
-      // A struct-type column has no data itself, but rather child columns
-      // for each of its fields. There is only a PRESENT stream, which
-      // needs to be included for the reader.
-      auto const schema_type = types[column_id];
-      if (not schema_type.subtypes.empty()) {
-        if (schema_type.kind == orc::STRUCT && stream.kind == orc::PRESENT) {
-          for (auto const& idx : schema_type.subtypes) {
-            auto child_idx = (idx < orc2gdf.size()) ? orc2gdf[idx] : -1;
-            if (child_idx >= 0) {
-              col                             = child_idx;
-              auto& chunk                     = chunks[stripe_index][col];
-              chunk.strm_id[gpu::CI_PRESENT]  = stream_info.size();
-              chunk.strm_len[gpu::CI_PRESENT] = stream.length;
-            }
-          }
-        }
-      }
-    } else if (col != -1) {
-      if (src_offset >= stripeinfo->indexLength || use_index) {
-        auto& chunk           = chunks[stripe_index][col];
-        auto const index_type = get_stream_index_type(stream.kind);
-        if (index_type < gpu::CI_NUM_STREAMS) {
-          chunk.strm_id[index_type]  = stream_info.size();
-          chunk.strm_len[index_type] = stream.length;
-          // NOTE: skip_count field is temporarily used to track the presence of index streams
-          chunk.skip_count |= 1 << index_type;
-
-          if (index_type == gpu::CI_DICTIONARY) {
-            chunk.dictionary_start = *num_dictionary_entries;
-            chunk.dict_len         = stripefooter->columns[column_id].dictionarySize;
-            *num_dictionary_entries += stripefooter->columns[column_id].dictionarySize;
-          }
-        }
-      }
-      stream_info.emplace_back(
-        stripeinfo->offset + src_offset, dst_offset, stream.length, stripe_index);
-      dst_offset += stream.length;
-    }
-    src_offset += stream.length;
-  }
-
-  return dst_offset;
-}
-
-/**
- * @brief Decompresses the stripe data, at stream granularity.
+ * @brief  Decompresses the stripe data, at stream granularity.
+ *
+ * Only the streams in the provided `stream_range` are decoded. That range is determined in
+ * the previous steps, after splitting stripes into ranges to maintain memory usage to be
+ * under data read limit.
  *
+ * @param loaded_stripe_range Range of stripes that are already loaded in memory
+ * @param stream_range Range of streams to be decoded
+ * @param num_decode_stripes Number of stripes that the decoding streams belong to
+ * @param compinfo_map A map to lookup compression info of streams
  * @param decompressor Block decompressor
  * @param stripe_data List of source stripe column data
  * @param stream_info List of stream to column mappings
  * @param chunks Vector of list of column chunk descriptors
  * @param row_groups Vector of list of row index descriptors
- * @param num_stripes Number of stripes making up column chunks
  * @param row_index_stride Distance between each row index
  * @param use_base_stride Whether to use base stride obtained from meta or use the computed value
  * @param stream CUDA stream used for device memory operations and kernel launches
- * @return Device buffer to decompressed page data
+ * @return Device buffer to decompressed data
  */
 rmm::device_buffer decompress_stripe_data(
+  range const& loaded_stripe_range,
+  range const& stream_range,
+  std::size_t num_decode_stripes,
+  cudf::detail::hostdevice_span<gpu::CompressedStreamInfo> compinfo,
+  stream_source_map<stripe_level_comp_info> const& compinfo_map,
   OrcDecompressor const& decompressor,
   host_span<rmm::device_buffer const> stripe_data,
-  host_span<orc_stream_info> stream_info,
+  host_span<orc_stream_info const> stream_info,
   cudf::detail::hostdevice_2dvector<gpu::ColumnDesc>& chunks,
   cudf::detail::hostdevice_2dvector<gpu::RowGroup>& row_groups,
-  size_type num_stripes,
   size_type row_index_stride,
   bool use_base_stride,
   rmm::cuda_stream_view stream)
 {
-  // Parse the columns' compressed info
-  cudf::detail::hostdevice_vector<gpu::CompressedStreamInfo> compinfo(
-    0, stream_info.size(), stream);
-  for (auto const& info : stream_info) {
-    compinfo.push_back(gpu::CompressedStreamInfo(
-      static_cast<uint8_t const*>(stripe_data[info.stripe_idx].data()) + info.dst_pos,
-      info.length));
-  }
-  compinfo.host_to_device_async(stream);
-
-  gpu::ParseCompressedStripeData(compinfo.device_ptr(),
-                                 compinfo.size(),
-                                 decompressor.GetBlockSize(),
-                                 decompressor.GetLog2MaxCompressionRatio(),
-                                 stream);
-  compinfo.device_to_host_sync(stream);
+  // Whether we have the comppression info precomputed.
+  auto const compinfo_ready = not compinfo_map.empty();
 
   // Count the exact number of compressed blocks
   std::size_t num_compressed_blocks   = 0;
   std::size_t num_uncompressed_blocks = 0;
   std::size_t total_decomp_size       = 0;
-  for (std::size_t i = 0; i < compinfo.size(); ++i) {
-    num_compressed_blocks += compinfo[i].num_compressed_blocks;
-    num_uncompressed_blocks += compinfo[i].num_uncompressed_blocks;
-    total_decomp_size += compinfo[i].max_uncompressed_size;
+
+  for (auto stream_idx = stream_range.begin; stream_idx < stream_range.end; ++stream_idx) {
+    auto const& info = stream_info[stream_idx];
+
+    auto& stream_comp_info = compinfo[stream_idx - stream_range.begin];
+    stream_comp_info       = gpu::CompressedStreamInfo(
+      static_cast<uint8_t const*>(
+        stripe_data[info.source.stripe_idx - loaded_stripe_range.begin].data()) +
+        info.dst_pos,
+      info.length);
+
+    if (compinfo_ready) {
+      auto const& cached_comp_info             = compinfo_map.at(info.source);
+      stream_comp_info.num_compressed_blocks   = cached_comp_info.num_compressed_blocks;
+      stream_comp_info.num_uncompressed_blocks = cached_comp_info.num_uncompressed_blocks;
+      stream_comp_info.max_uncompressed_size   = cached_comp_info.total_decomp_size;
+
+      num_compressed_blocks += cached_comp_info.num_compressed_blocks;
+      num_uncompressed_blocks += cached_comp_info.num_uncompressed_blocks;
+      total_decomp_size += cached_comp_info.total_decomp_size;
+    }
   }
+
+  if (!compinfo_ready) {
+    compinfo.host_to_device_async(stream);
+    gpu::ParseCompressedStripeData(compinfo.device_ptr(),
+                                   compinfo.size(),
+                                   decompressor.GetBlockSize(),
+                                   decompressor.GetLog2MaxCompressionRatio(),
+                                   stream);
+    compinfo.device_to_host_sync(stream);
+
+    for (std::size_t i = 0; i < compinfo.size(); ++i) {
+      num_compressed_blocks += compinfo[i].num_compressed_blocks;
+      num_uncompressed_blocks += compinfo[i].num_uncompressed_blocks;
+      total_decomp_size += compinfo[i].max_uncompressed_size;
+    }
+  }
+
   CUDF_EXPECTS(
     not((num_uncompressed_blocks + num_compressed_blocks > 0) and (total_decomp_size == 0)),
     "Inconsistent info on compression blocks");
 
-  // Buffer needs to be padded.
-  // Required by `gpuDecodeOrcColumnData`.
+  // Buffer needs to be padded.This is required by `gpuDecodeOrcColumnData`.
   rmm::device_buffer decomp_data(
     cudf::util::round_up_safe(total_decomp_size, BUFFER_PADDING_MULTIPLE), stream);
+
+  // If total_decomp_size is zero, the input data may be just empty.
+  // This is still a valid input, thus do not be panick.
   if (decomp_data.is_empty()) { return decomp_data; }
 
   rmm::device_uvector<device_span<uint8_t const>> inflate_in(
@@ -221,7 +150,7 @@ rmm::device_buffer decompress_stripe_data(
   rmm::device_uvector<device_span<uint8_t>> inflate_out(
     num_compressed_blocks + num_uncompressed_blocks, stream);
   rmm::device_uvector<compression_result> inflate_res(num_compressed_blocks, stream);
-  thrust::fill(rmm::exec_policy(stream),
+  thrust::fill(rmm::exec_policy_nosync(stream),
                inflate_res.begin(),
                inflate_res.end(),
                compression_result{0, compression_status::FAILURE});
@@ -240,13 +169,13 @@ rmm::device_buffer decompress_stripe_data(
     compinfo[i].copy_in_ctl  = inflate_in.data() + start_pos_uncomp;
     compinfo[i].copy_out_ctl = inflate_out.data() + start_pos_uncomp;
 
-    stream_info[i].dst_pos = decomp_offset;
     decomp_offset += compinfo[i].max_uncompressed_size;
     start_pos += compinfo[i].num_compressed_blocks;
     start_pos_uncomp += compinfo[i].num_uncompressed_blocks;
     max_uncomp_block_size =
       std::max(max_uncomp_block_size, compinfo[i].max_uncompressed_block_size);
   }
+
   compinfo.host_to_device_async(stream);
   gpu::ParseCompressedStripeData(compinfo.device_ptr(),
                                  compinfo.size(),
@@ -325,7 +254,7 @@ rmm::device_buffer decompress_stripe_data(
     // Check if any block has been failed to decompress.
     // Not using `thrust::any` or `thrust::count_if` to defer stream sync.
     thrust::for_each(
-      rmm::exec_policy(stream),
+      rmm::exec_policy_nosync(stream),
       thrust::make_counting_iterator(std::size_t{0}),
       thrust::make_counting_iterator(inflate_res.size()),
       [results           = inflate_res.begin(),
@@ -351,15 +280,15 @@ rmm::device_buffer decompress_stripe_data(
   // We can check on host after stream synchronize
   CUDF_EXPECTS(not any_block_failure[0], "Error during decompression");
 
-  size_type const num_columns = chunks.size().second;
+  auto const num_columns = chunks.size().second;
 
   // Update the stream information with the updated uncompressed info
   // TBD: We could update the value from the information we already
   // have in stream_info[], but using the gpu results also updates
   // max_uncompressed_size to the actual uncompressed size, or zero if
   // decompression failed.
-  for (size_type i = 0; i < num_stripes; ++i) {
-    for (size_type j = 0; j < num_columns; ++j) {
+  for (std::size_t i = 0; i < num_decode_stripes; ++i) {
+    for (std::size_t j = 0; j < num_columns; ++j) {
       auto& chunk = chunks[i][j];
       for (int k = 0; k < gpu::CI_NUM_STREAMS; ++k) {
         if (chunk.strm_len[k] > 0 && chunk.strm_id[k] < compinfo.size()) {
@@ -377,7 +306,7 @@ rmm::device_buffer decompress_stripe_data(
                             compinfo.device_ptr(),
                             chunks.base_device_ptr(),
                             num_columns,
-                            num_stripes,
+                            num_decode_stripes,
                             row_index_stride,
                             use_base_stride,
                             stream);
@@ -424,7 +353,7 @@ void update_null_mask(cudf::detail::hostdevice_2dvector<gpu::ColumnDesc>& chunks
       if (child_valid_map_base != nullptr) {
         rmm::device_uvector<uint32_t> dst_idx(child_mask_len, stream);
         // Copy indexes at which the parent has valid value.
-        thrust::copy_if(rmm::exec_policy(stream),
+        thrust::copy_if(rmm::exec_policy_nosync(stream),
                         thrust::make_counting_iterator(0),
                         thrust::make_counting_iterator(0) + parent_mask_len,
                         dst_idx.begin(),
@@ -438,7 +367,7 @@ void update_null_mask(cudf::detail::hostdevice_2dvector<gpu::ColumnDesc>& chunks
         uint32_t* dst_idx_ptr = dst_idx.data();
         // Copy child valid bits from child column to valid indexes, this will merge both child
         // and parent null masks
-        thrust::for_each(rmm::exec_policy(stream),
+        thrust::for_each(rmm::exec_policy_nosync(stream),
                          thrust::make_counting_iterator(0),
                          thrust::make_counting_iterator(0) + dst_idx.size(),
                          [child_valid_map_base, dst_idx_ptr, merged_mask] __device__(auto idx) {
@@ -484,11 +413,11 @@ void update_null_mask(cudf::detail::hostdevice_2dvector<gpu::ColumnDesc>& chunks
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource to use for device memory allocation
  */
-void decode_stream_data(std::size_t num_dicts,
+void decode_stream_data(int64_t num_dicts,
                         int64_t skip_rows,
                         size_type row_index_stride,
                         std::size_t level,
-                        table_view const& tz_table,
+                        table_device_view const& d_tz_table,
                         cudf::detail::hostdevice_2dvector<gpu::ColumnDesc>& chunks,
                         cudf::detail::device_2dspan<gpu::RowGroup> row_groups,
                         std::vector<column_buffer>& out_buffers,
@@ -497,6 +426,7 @@ void decode_stream_data(std::size_t num_dicts,
 {
   auto const num_stripes = chunks.size().first;
   auto const num_columns = chunks.size().second;
+
   thrust::counting_iterator<int> col_idx_it(0);
   thrust::counting_iterator<int> stripe_idx_it(0);
 
@@ -512,7 +442,7 @@ void decode_stream_data(std::size_t num_dicts,
   // Allocate global dictionary for deserializing
   rmm::device_uvector<gpu::DictionaryEntry> global_dict(num_dicts, stream);
 
-  chunks.host_to_device_sync(stream);
+  chunks.host_to_device_async(stream);
   gpu::DecodeNullsAndStringDictionaries(
     chunks.base_device_ptr(), global_dict.data(), num_columns, num_stripes, skip_rows, stream);
 
@@ -521,16 +451,14 @@ void decode_stream_data(std::size_t num_dicts,
     update_null_mask(chunks, out_buffers, stream, mr);
   }
 
-  auto const tz_table_dptr = table_device_view::create(tz_table, stream);
   rmm::device_scalar<size_type> error_count(0, stream);
-  // Update the null map for child columns
   gpu::DecodeOrcColumnData(chunks.base_device_ptr(),
                            global_dict.data(),
                            row_groups,
                            num_columns,
                            num_stripes,
                            skip_rows,
-                           *tz_table_dptr,
+                           d_tz_table,
                            row_groups.size().first,
                            row_index_stride,
                            level,
@@ -557,40 +485,38 @@ void decode_stream_data(std::size_t num_dicts,
  * layer.
  */
 void scan_null_counts(cudf::detail::hostdevice_2dvector<gpu::ColumnDesc> const& chunks,
-                      cudf::host_span<rmm::device_uvector<uint32_t>> prefix_sums,
+                      uint32_t* d_prefix_sums,
                       rmm::cuda_stream_view stream)
 {
   auto const num_stripes = chunks.size().first;
   if (num_stripes == 0) return;
 
   auto const num_columns = chunks.size().second;
-  std::vector<thrust::pair<size_type, cudf::device_span<uint32_t>>> prefix_sums_to_update;
+  std::vector<thrust::pair<size_type, uint32_t*>> prefix_sums_to_update;
   for (auto col_idx = 0ul; col_idx < num_columns; ++col_idx) {
     // Null counts sums are only needed for children of struct columns
     if (chunks[0][col_idx].type_kind == STRUCT) {
-      prefix_sums_to_update.emplace_back(col_idx, prefix_sums[col_idx]);
+      prefix_sums_to_update.emplace_back(col_idx, d_prefix_sums + num_stripes * col_idx);
     }
   }
   auto const d_prefix_sums_to_update = cudf::detail::make_device_uvector_async(
     prefix_sums_to_update, stream, rmm::mr::get_current_device_resource());
 
-  thrust::for_each(rmm::exec_policy(stream),
-                   d_prefix_sums_to_update.begin(),
-                   d_prefix_sums_to_update.end(),
-                   [chunks = cudf::detail::device_2dspan<gpu::ColumnDesc const>{chunks}] __device__(
-                     auto const& idx_psums) {
-                     auto const col_idx = idx_psums.first;
-                     auto const psums   = idx_psums.second;
-
-                     thrust::transform(
-                       thrust::seq,
-                       thrust::make_counting_iterator(0),
-                       thrust::make_counting_iterator(0) + psums.size(),
-                       psums.begin(),
-                       [&](auto stripe_idx) { return chunks[stripe_idx][col_idx].null_count; });
-
-                     thrust::inclusive_scan(thrust::seq, psums.begin(), psums.end(), psums.begin());
-                   });
+  thrust::for_each(
+    rmm::exec_policy_nosync(stream),
+    d_prefix_sums_to_update.begin(),
+    d_prefix_sums_to_update.end(),
+    [num_stripes, chunks = cudf::detail::device_2dspan<gpu::ColumnDesc const>{chunks}] __device__(
+      auto const& idx_psums) {
+      auto const col_idx = idx_psums.first;
+      auto const psums   = idx_psums.second;
+      thrust::transform(thrust::seq,
+                        thrust::make_counting_iterator<std::size_t>(0ul),
+                        thrust::make_counting_iterator<std::size_t>(num_stripes),
+                        psums,
+                        [&](auto stripe_idx) { return chunks[stripe_idx][col_idx].null_count; });
+      thrust::inclusive_scan(thrust::seq, psums, psums + num_stripes, psums);
+    });
   // `prefix_sums_to_update` goes out of scope, copy has to be done before we return
   stream.synchronize();
 }
@@ -634,6 +560,7 @@ void aggregate_child_meta(std::size_t level,
   // For each parent column, update its child column meta for each stripe.
   std::for_each(nested_cols.begin(), nested_cols.end(), [&](auto const p_col) {
     auto const parent_col_idx = col_meta.orc_col_map[level][p_col.id];
+
     int64_t start_row         = 0;
     auto processed_row_groups = 0;
 
@@ -657,10 +584,19 @@ void aggregate_child_meta(std::size_t level,
 
       // Aggregate start row, number of rows per chunk and total number of rows in a column
       auto const child_rows = chunks[stripe_id][parent_col_idx].num_child_rows;
+
       for (size_type id = 0; id < p_col.num_children; id++) {
         auto const child_col_idx = index + id;
 
         num_child_rows[child_col_idx] += child_rows;
+
+        // The number of rows in child column should not be very large otherwise we will have
+        // size overflow.
+        // If that is the case, we need to set a read limit to reduce number of decoding stripes.
+        CUDF_EXPECTS(num_child_rows[child_col_idx] <=
+                       static_cast<int64_t>(std::numeric_limits<size_type>::max()),
+                     "Number of rows in the child column exceeds column size limit.");
+
         num_child_rows_per_stripe[stripe_id][child_col_idx] = child_rows;
         // start row could be different for each column when there is nesting at each stripe level
         child_start_row[stripe_id][child_col_idx] = (stripe_id == 0) ? 0 : start_row;
@@ -709,264 +645,291 @@ void generate_offsets_for_list(host_span<list_buffer_data> buff_data, rmm::cuda_
   }
 }
 
+/**
+ * @brief Find the splits of the input table such that each split range of rows has data size less
+ * than a given `size_limit`.
+ *
+ * The parameter `segment_length` is to control the granularity of splits. The output ranges will
+ * always have numbers of rows that are multiple of this value, except the last range that contains
+ * the remaining rows.
+ *
+ * Similar to `find_splits`, the given limit is just a soft limit. This function will never output
+ * empty ranges, even they have sizes exceed the value of `size_limit`.
+ *
+ * @param input The input table to find splits
+ * @param segment_length Value to control granularity of the output ranges
+ * @param size_limit A limit on the output size of each split range
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @return A vector of ranges as splits of the input
+ */
+std::vector<range> find_table_splits(table_view const& input,
+                                     size_type segment_length,
+                                     std::size_t size_limit,
+                                     rmm::cuda_stream_view stream)
+{
+  if (size_limit == 0) {
+    return std::vector<range>{range{0, static_cast<std::size_t>(input.num_rows())}};
+  }
+
+  CUDF_EXPECTS(segment_length > 0, "Invalid segment_length", std::invalid_argument);
+
+  // `segmented_row_bit_count` requires that `segment_length` is not larger than number of rows.
+  segment_length = std::min(segment_length, input.num_rows());
+
+  auto const d_segmented_sizes = cudf::detail::segmented_row_bit_count(
+    input, segment_length, stream, rmm::mr::get_current_device_resource());
+
+  auto segmented_sizes =
+    cudf::detail::hostdevice_vector<cumulative_size>(d_segmented_sizes->size(), stream);
+
+  thrust::transform(
+    rmm::exec_policy_nosync(stream),
+    thrust::make_counting_iterator(0),
+    thrust::make_counting_iterator(d_segmented_sizes->size()),
+    segmented_sizes.d_begin(),
+    [segment_length,
+     num_rows = input.num_rows(),
+     d_sizes  = d_segmented_sizes->view().begin<size_type>()] __device__(auto const segment_idx) {
+      // Since the number of rows may not divisible by segment_length,
+      // the last segment may be shorter than the others.
+      auto const current_length =
+        cuda::std::min(segment_length, num_rows - segment_length * segment_idx);
+      auto const size = d_sizes[segment_idx] / CHAR_BIT;  // divide by CHAR_BIT to get size in bytes
+      return cumulative_size{static_cast<std::size_t>(current_length),
+                             static_cast<std::size_t>(size)};
+    });
+
+  thrust::inclusive_scan(rmm::exec_policy_nosync(stream),
+                         segmented_sizes.d_begin(),
+                         segmented_sizes.d_end(),
+                         segmented_sizes.d_begin(),
+                         cumulative_size_plus{});
+  segmented_sizes.device_to_host_sync(stream);
+
+  return find_splits<cumulative_size>(segmented_sizes, input.num_rows(), size_limit);
+}
+
 }  // namespace
 
-void reader::impl::prepare_data(int64_t skip_rows,
-                                std::optional<size_type> const& num_rows_opt,
-                                std::vector<std::vector<size_type>> const& stripes)
+void reader_impl::decompress_and_decode_stripes(read_mode mode)
 {
-  // Selected columns at different levels of nesting are stored in different elements
-  // of `selected_columns`; thus, size == 1 means no nested columns
-  CUDF_EXPECTS(skip_rows == 0 or _selected_columns.num_levels() == 1,
-               "skip_rows is not supported by nested columns");
-
-  // There are no columns in the table
-  if (_selected_columns.num_levels() == 0) { return; }
-
-  _file_itm_data = std::make_unique<file_intermediate_data>();
-
-  // Select only stripes required (aka row groups)
-  std::tie(
-    _file_itm_data->rows_to_skip, _file_itm_data->rows_to_read, _file_itm_data->selected_stripes) =
-    _metadata.select_stripes(stripes, skip_rows, num_rows_opt, _stream);
-  auto const rows_to_skip      = _file_itm_data->rows_to_skip;
-  auto const rows_to_read      = _file_itm_data->rows_to_read;
-  auto const& selected_stripes = _file_itm_data->selected_stripes;
-
-  // If no rows or stripes to read, return empty columns
-  if (rows_to_read == 0 || selected_stripes.empty()) { return; }
-
-  // Set up table for converting timestamp columns from local to UTC time
-  auto const tz_table = [&, &selected_stripes = selected_stripes] {
-    auto const has_timestamp_column = std::any_of(
-      _selected_columns.levels.cbegin(), _selected_columns.levels.cend(), [&](auto const& col_lvl) {
-        return std::any_of(col_lvl.cbegin(), col_lvl.cend(), [&](auto const& col_meta) {
-          return _metadata.get_col_type(col_meta.id).kind == TypeKind::TIMESTAMP;
-        });
-      });
+  if (!_file_itm_data.has_data()) { return; }
+
+  CUDF_EXPECTS(_chunk_read_data.curr_load_stripe_range > 0, "There is not any stripe loaded.");
+
+  auto const stripe_range =
+    _chunk_read_data.decode_stripe_ranges[_chunk_read_data.curr_decode_stripe_range++];
+  auto const stripe_start = stripe_range.begin;
+  auto const stripe_end   = stripe_range.end;
+  auto const stripe_count = stripe_range.size();
+
+  // The start index of loaded stripes. They are different from decoding stripes.
+  auto const load_stripe_range =
+    _chunk_read_data.load_stripe_ranges[_chunk_read_data.curr_load_stripe_range - 1];
+  auto const load_stripe_start = load_stripe_range.begin;
+
+  auto const rows_to_skip      = _file_itm_data.rows_to_skip;
+  auto const& selected_stripes = _file_itm_data.selected_stripes;
+
+  // Number of rows to decode in this decompressing/decoding step.
+  int64_t rows_to_decode = 0;
+  for (auto stripe_idx = stripe_start; stripe_idx < stripe_end; ++stripe_idx) {
+    auto const& stripe     = selected_stripes[stripe_idx];
+    auto const stripe_rows = static_cast<int64_t>(stripe.stripe_info->numberOfRows);
+    rows_to_decode += stripe_rows;
+  }
 
-    return has_timestamp_column
-             ? cudf::detail::make_timezone_transition_table(
-                 {}, selected_stripes[0].stripe_info[0].second->writerTimezone, _stream)
-             : std::make_unique<cudf::table>();
+  CUDF_EXPECTS(rows_to_decode > rows_to_skip, "Invalid rows_to_decode computation.");
+  rows_to_decode = std::min<int64_t>(rows_to_decode - rows_to_skip, _file_itm_data.rows_to_read);
+
+  // After this step, we no longer have any rows to skip.
+  // The number of rows remains to read in the future also reduced.
+  _file_itm_data.rows_to_skip = 0;
+  _file_itm_data.rows_to_read -= rows_to_decode;
+
+  // Technically, overflow here should never happen because the `load_next_stripe_data()` step
+  // already handled it by splitting the loaded stripe range into multiple decode ranges.
+  CUDF_EXPECTS(rows_to_decode <= static_cast<int64_t>(std::numeric_limits<size_type>::max()),
+               "Number or rows to decode exceeds the column size limit.",
+               std::overflow_error);
+
+  auto const tz_table_dptr = table_device_view::create(_file_itm_data.tz_table->view(), _stream);
+  auto const num_levels    = _selected_columns.num_levels();
+  _out_buffers.resize(num_levels);
+
+  // Column descriptors ('chunks').
+  // Each 'chunk' of data here corresponds to an orc column, in a stripe, at a nested level.
+  // Unfortunately we cannot create one hostdevice_vector to use for all levels because
+  // currently we do not have a hostdevice_2dspan class.
+  std::vector<cudf::detail::hostdevice_2dvector<gpu::ColumnDesc>> lvl_chunks(num_levels);
+
+  // For computing null count.
+  auto null_count_prefix_sums = [&] {
+    auto const num_total_cols = std::accumulate(
+      _selected_columns.levels.begin(),
+      _selected_columns.levels.end(),
+      std::size_t{0},
+      [](auto const& sum, auto const& cols_level) { return sum + cols_level.size(); });
+
+    return cudf::detail::make_zeroed_device_uvector_async<uint32_t>(
+      num_total_cols * stripe_count, _stream, rmm::mr::get_current_device_resource());
+  }();
+  std::size_t num_processed_lvl_columns      = 0;
+  std::size_t num_processed_prev_lvl_columns = 0;
+
+  // For parsing decompression data.
+  // We create one hostdevice_vector that is large enough to use for all levels,
+  // thus only need to allocate memory once.
+  auto hd_compinfo = [&] {
+    std::size_t max_num_streams{0};
+    if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) {
+      // Find the maximum number of streams in all levels of the decoding stripes.
+      for (std::size_t level = 0; level < num_levels; ++level) {
+        auto const stream_range =
+          merge_selected_ranges(_file_itm_data.lvl_stripe_stream_ranges[level], stripe_range);
+        max_num_streams = std::max(max_num_streams, stream_range.size());
+      }
+    }
+    return cudf::detail::hostdevice_vector<gpu::CompressedStreamInfo>{max_num_streams, _stream};
   }();
 
-  auto& lvl_stripe_data        = _file_itm_data->lvl_stripe_data;
-  auto& null_count_prefix_sums = _file_itm_data->null_count_prefix_sums;
-  lvl_stripe_data.resize(_selected_columns.num_levels());
-
-  _out_buffers.resize(_selected_columns.num_levels());
-
-  // Iterates through levels of nested columns, child column will be one level down
-  // compared to parent column.
   auto& col_meta = *_col_meta;
   for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) {
-    auto& columns_level = _selected_columns.levels[level];
-    // Association between each ORC column and its cudf::column
-    col_meta.orc_col_map.emplace_back(_metadata.get_num_cols(), -1);
-    std::vector<orc_column_meta> nested_cols;
-
-    // Get a list of column data types
-    std::vector<data_type> column_types;
-    for (auto& col : columns_level) {
-      auto col_type = to_cudf_type(_metadata.get_col_type(col.id).kind,
-                                   _use_np_dtypes,
-                                   _timestamp_type.id(),
-                                   to_cudf_decimal_type(_decimal128_columns, _metadata, col.id));
-      CUDF_EXPECTS(col_type != type_id::EMPTY, "Unknown type");
-      if (col_type == type_id::DECIMAL32 or col_type == type_id::DECIMAL64 or
-          col_type == type_id::DECIMAL128) {
-        // sign of the scale is changed since cuDF follows c++ libraries like CNL
-        // which uses negative scaling, but liborc and other libraries
-        // follow positive scaling.
-        auto const scale =
-          -static_cast<size_type>(_metadata.get_col_type(col.id).scale.value_or(0));
-        column_types.emplace_back(col_type, scale);
-      } else {
-        column_types.emplace_back(col_type);
-      }
+    auto const& stripe_stream_ranges = _file_itm_data.lvl_stripe_stream_ranges[level];
+    auto const stream_range          = merge_selected_ranges(stripe_stream_ranges, stripe_range);
 
-      // Map each ORC column to its column
-      col_meta.orc_col_map[level][col.id] = column_types.size() - 1;
-      if (col_type == type_id::LIST or col_type == type_id::STRUCT) {
-        nested_cols.emplace_back(col);
-      }
-    }
+    auto const& columns_level = _selected_columns.levels[level];
+    auto const& stream_info   = _file_itm_data.lvl_stream_info[level];
+    auto const& column_types  = _file_itm_data.lvl_column_types[level];
+    auto const& nested_cols   = _file_itm_data.lvl_nested_cols[level];
 
-    // Get the total number of stripes across all input files.
-    std::size_t total_num_stripes =
-      std::accumulate(selected_stripes.begin(),
-                      selected_stripes.end(),
-                      0,
-                      [](std::size_t sum, auto& stripe_source_mapping) {
-                        return sum + stripe_source_mapping.stripe_info.size();
-                      });
-    auto const num_columns = columns_level.size();
-    cudf::detail::hostdevice_2dvector<gpu::ColumnDesc> chunks(
-      total_num_stripes, num_columns, _stream);
+    auto& stripe_data = _file_itm_data.lvl_stripe_data[level];
+    auto& chunks      = lvl_chunks[level];
+
+    auto const num_lvl_columns = columns_level.size();
+    chunks =
+      cudf::detail::hostdevice_2dvector<gpu::ColumnDesc>(stripe_count, num_lvl_columns, _stream);
     memset(chunks.base_host_ptr(), 0, chunks.size_bytes());
 
     const bool use_index =
-      _use_index &&
+      _options.use_index &&
       // Do stripes have row group index
       _metadata.is_row_grp_idx_present() &&
       // Only use if we don't have much work with complete columns & stripes
       // TODO: Consider nrows, gpu, and tune the threshold
-      (rows_to_read > _metadata.get_row_index_stride() && !(_metadata.get_row_index_stride() & 7) &&
-       _metadata.get_row_index_stride() != 0 && num_columns * total_num_stripes < 8 * 128) &&
+      (rows_to_decode > _metadata.get_row_index_stride() &&
+       !(_metadata.get_row_index_stride() & 7) && _metadata.get_row_index_stride() != 0 &&
+       num_lvl_columns * stripe_count < 8 * 128) &&
       // Only use if first row is aligned to a stripe boundary
       // TODO: Fix logic to handle unaligned rows
       (rows_to_skip == 0);
 
-    // Logically view streams as columns
-    std::vector<orc_stream_info> stream_info;
-
-    null_count_prefix_sums.emplace_back();
-    null_count_prefix_sums.back().reserve(_selected_columns.levels[level].size());
-    std::generate_n(std::back_inserter(null_count_prefix_sums.back()),
-                    _selected_columns.levels[level].size(),
-                    [&]() {
-                      return cudf::detail::make_zeroed_device_uvector_async<uint32_t>(
-                        total_num_stripes, _stream, rmm::mr::get_current_device_resource());
-                    });
-
-    // Tracker for eventually deallocating compressed and uncompressed data
-    auto& stripe_data = lvl_stripe_data[level];
-
-    int64_t stripe_start_row = 0;
-    int64_t num_dict_entries = 0;
-    int64_t num_rowgroups    = 0;
-    size_type stripe_idx     = 0;
-
-    std::vector<std::pair<std::future<std::size_t>, std::size_t>> read_tasks;
-    for (auto const& stripe_source_mapping : selected_stripes) {
-      // Iterate through the source files selected stripes
-      for (auto const& stripe : stripe_source_mapping.stripe_info) {
-        auto const stripe_info   = stripe.first;
-        auto const stripe_footer = stripe.second;
-
-        auto stream_count          = stream_info.size();
-        auto const total_data_size = gather_stream_info(stripe_idx,
-                                                        stripe_info,
-                                                        stripe_footer,
-                                                        col_meta.orc_col_map[level],
-                                                        _metadata.get_types(),
-                                                        use_index,
-                                                        level == 0,
-                                                        &num_dict_entries,
-                                                        stream_info,
-                                                        chunks);
-
-        auto const is_stripe_data_empty = total_data_size == 0;
-        CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0,
-                     "Invalid index rowgroup stream data");
-
-        // Buffer needs to be padded.
-        // Required by `copy_uncompressed_kernel`.
-        stripe_data.emplace_back(
-          cudf::util::round_up_safe(total_data_size, BUFFER_PADDING_MULTIPLE), _stream);
-        auto dst_base = static_cast<uint8_t*>(stripe_data.back().data());
-
-        // Coalesce consecutive streams into one read
-        while (not is_stripe_data_empty and stream_count < stream_info.size()) {
-          auto const d_dst  = dst_base + stream_info[stream_count].dst_pos;
-          auto const offset = stream_info[stream_count].offset;
-          auto len          = stream_info[stream_count].length;
-          stream_count++;
-
-          while (stream_count < stream_info.size() &&
-                 stream_info[stream_count].offset == offset + len) {
-            len += stream_info[stream_count].length;
-            stream_count++;
-          }
-          if (_metadata.per_file_metadata[stripe_source_mapping.source_idx]
-                .source->is_device_read_preferred(len)) {
-            read_tasks.push_back(
-              std::pair(_metadata.per_file_metadata[stripe_source_mapping.source_idx]
-                          .source->device_read_async(offset, len, d_dst, _stream),
-                        len));
-
-          } else {
-            auto const buffer =
-              _metadata.per_file_metadata[stripe_source_mapping.source_idx].source->host_read(
-                offset, len);
-            CUDF_EXPECTS(buffer->size() == len, "Unexpected discrepancy in bytes read.");
-            CUDF_CUDA_TRY(
-              cudaMemcpyAsync(d_dst, buffer->data(), len, cudaMemcpyDefault, _stream.value()));
-            _stream.synchronize();
-          }
-        }
-
-        auto const num_rows_per_stripe = stripe_info->numberOfRows;
-        auto const rowgroup_id         = num_rowgroups;
-        auto stripe_num_rowgroups      = 0;
-        if (use_index) {
-          stripe_num_rowgroups = (num_rows_per_stripe + _metadata.get_row_index_stride() - 1) /
-                                 _metadata.get_row_index_stride();
+    // 0-based counters, used across all decoding stripes in this step.
+    int64_t stripe_start_row{0};
+    int64_t num_dict_entries{0};
+    uint32_t num_rowgroups{0};
+    std::size_t local_stream_order{0};
+
+    for (auto stripe_idx = stripe_start; stripe_idx < stripe_end; ++stripe_idx) {
+      auto const& stripe       = selected_stripes[stripe_idx];
+      auto const stripe_info   = stripe.stripe_info;
+      auto const stripe_footer = stripe.stripe_footer;
+
+      // Normalize stripe_idx to 0-based.
+      auto const stripe_local_idx = stripe_idx - stripe_start;
+
+      // The first parameter (`stripe_order`) must be normalized to 0-based.
+      auto const total_data_size = gather_stream_info_and_column_desc(stripe_local_idx,
+                                                                      level,
+                                                                      stripe_info,
+                                                                      stripe_footer,
+                                                                      col_meta.orc_col_map[level],
+                                                                      _metadata.get_types(),
+                                                                      use_index,
+                                                                      level == 0,
+                                                                      &num_dict_entries,
+                                                                      &local_stream_order,
+                                                                      nullptr,  // stream_info
+                                                                      &chunks);
+
+      auto const is_stripe_data_empty = total_data_size == 0;
+      CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0,
+                   "Invalid index rowgroup stream data");
+
+      auto const dst_base =
+        static_cast<uint8_t*>(stripe_data[stripe_idx - load_stripe_start].data());
+      auto const num_rows_in_stripe = static_cast<int64_t>(stripe_info->numberOfRows);
+
+      uint32_t const rowgroup_id = num_rowgroups;
+      uint32_t const stripe_num_rowgroups =
+        use_index ? (num_rows_in_stripe + _metadata.get_row_index_stride() - 1) /
+                      _metadata.get_row_index_stride()
+                  : 0;
+
+      // Update chunks to reference streams pointers.
+      for (std::size_t col_idx = 0; col_idx < num_lvl_columns; col_idx++) {
+        auto& chunk = chunks[stripe_local_idx][col_idx];
+        // start row, number of rows in a each stripe and total number of rows
+        // may change in lower levels of nesting
+        chunk.start_row =
+          (level == 0) ? stripe_start_row
+                       : col_meta.child_start_row[stripe_local_idx * num_lvl_columns + col_idx];
+        chunk.num_rows =
+          (level == 0)
+            ? num_rows_in_stripe
+            : col_meta.num_child_rows_per_stripe[stripe_local_idx * num_lvl_columns + col_idx];
+        chunk.column_num_rows = (level == 0) ? rows_to_decode : col_meta.num_child_rows[col_idx];
+        chunk.parent_validity_info =
+          (level == 0) ? column_validity_info{} : col_meta.parent_column_data[col_idx];
+        chunk.parent_null_count_prefix_sums =
+          (level == 0) ? nullptr
+                       : null_count_prefix_sums.data() + (num_processed_prev_lvl_columns +
+                                                          col_meta.parent_column_index[col_idx]) *
+                                                           stripe_count;
+        chunk.encoding_kind = stripe_footer->columns[columns_level[col_idx].id].kind;
+        chunk.type_kind =
+          _metadata.per_file_metadata[stripe.source_idx].ff.types[columns_level[col_idx].id].kind;
+
+        // num_child_rows for a struct column will be same, for other nested types it will be
+        // calculated.
+        chunk.num_child_rows = (chunk.type_kind != orc::STRUCT) ? 0 : chunk.num_rows;
+        chunk.dtype_id       = column_types[col_idx].id();
+        chunk.decimal_scale  = _metadata.per_file_metadata[stripe.source_idx]
+                                .ff.types[columns_level[col_idx].id]
+                                .scale.value_or(0);
+
+        chunk.rowgroup_id   = rowgroup_id;
+        chunk.dtype_len     = (column_types[col_idx].id() == type_id::STRING)
+                                ? sizeof(string_index_pair)
+                              : ((column_types[col_idx].id() == type_id::LIST) or
+                             (column_types[col_idx].id() == type_id::STRUCT))
+                                ? sizeof(size_type)
+                                : cudf::size_of(column_types[col_idx]);
+        chunk.num_rowgroups = stripe_num_rowgroups;
+
+        if (chunk.type_kind == orc::TIMESTAMP) {
+          chunk.timestamp_type_id = _options.timestamp_type.id();
         }
-        // Update chunks to reference streams pointers
-        for (std::size_t col_idx = 0; col_idx < num_columns; col_idx++) {
-          auto& chunk = chunks[stripe_idx][col_idx];
-          // start row, number of rows in a each stripe and total number of rows
-          // may change in lower levels of nesting
-          chunk.start_row = (level == 0)
-                              ? stripe_start_row
-                              : col_meta.child_start_row[stripe_idx * num_columns + col_idx];
-          chunk.num_rows =
-            (level == 0) ? stripe_info->numberOfRows
-                         : col_meta.num_child_rows_per_stripe[stripe_idx * num_columns + col_idx];
-          chunk.column_num_rows = (level == 0) ? rows_to_read : col_meta.num_child_rows[col_idx];
-          chunk.parent_validity_info =
-            (level == 0) ? column_validity_info{} : col_meta.parent_column_data[col_idx];
-          chunk.parent_null_count_prefix_sums =
-            (level == 0)
-              ? nullptr
-              : null_count_prefix_sums[level - 1][col_meta.parent_column_index[col_idx]].data();
-          chunk.encoding_kind = stripe_footer->columns[columns_level[col_idx].id].kind;
-          chunk.type_kind     = _metadata.per_file_metadata[stripe_source_mapping.source_idx]
-                              .ff.types[columns_level[col_idx].id]
-                              .kind;
-          // num_child_rows for a struct column will be same, for other nested types it will be
-          // calculated.
-          chunk.num_child_rows = (chunk.type_kind != orc::STRUCT) ? 0 : chunk.num_rows;
-          chunk.dtype_id       = column_types[col_idx].id();
-          chunk.decimal_scale  = _metadata.per_file_metadata[stripe_source_mapping.source_idx]
-                                  .ff.types[columns_level[col_idx].id]
-                                  .scale.value_or(0);
-
-          chunk.rowgroup_id   = rowgroup_id;
-          chunk.dtype_len     = (column_types[col_idx].id() == type_id::STRING)
-                                  ? sizeof(string_index_pair)
-                                : ((column_types[col_idx].id() == type_id::LIST) or
-                               (column_types[col_idx].id() == type_id::STRUCT))
-                                  ? sizeof(size_type)
-                                  : cudf::size_of(column_types[col_idx]);
-          chunk.num_rowgroups = stripe_num_rowgroups;
-          if (chunk.type_kind == orc::TIMESTAMP) { chunk.timestamp_type_id = _timestamp_type.id(); }
-          if (not is_stripe_data_empty) {
-            for (int k = 0; k < gpu::CI_NUM_STREAMS; k++) {
-              chunk.streams[k] = dst_base + stream_info[chunk.strm_id[k]].dst_pos;
-            }
+        if (not is_stripe_data_empty) {
+          for (int k = 0; k < gpu::CI_NUM_STREAMS; k++) {
+            chunk.streams[k] =
+              dst_base + stream_info[chunk.strm_id[k] + stream_range.begin].dst_pos;
           }
         }
-        stripe_start_row += num_rows_per_stripe;
-        num_rowgroups += stripe_num_rowgroups;
-
-        stripe_idx++;
       }
-    }
-    for (auto& task : read_tasks) {
-      CUDF_EXPECTS(task.first.get() == task.second, "Unexpected discrepancy in bytes read.");
+
+      stripe_start_row += num_rows_in_stripe;
+      num_rowgroups += stripe_num_rowgroups;
     }
 
     if (stripe_data.empty()) { continue; }
 
-    // Process dataset chunk pages into output columns
+    // Process dataset chunks into output columns.
     auto row_groups =
-      cudf::detail::hostdevice_2dvector<gpu::RowGroup>(num_rowgroups, num_columns, _stream);
+      cudf::detail::hostdevice_2dvector<gpu::RowGroup>(num_rowgroups, num_lvl_columns, _stream);
     if (level > 0 and row_groups.size().first) {
       cudf::host_span<gpu::RowGroup> row_groups_span(row_groups.base_host_ptr(),
-                                                     num_rowgroups * num_columns);
+                                                     num_rowgroups * num_lvl_columns);
       auto& rw_grp_meta = col_meta.rwgrp_meta;
 
       // Update start row and num rows per row group
@@ -980,19 +943,31 @@ void reader::impl::prepare_data(int64_t skip_rows,
                        return meta;
                      });
     }
-    // Setup row group descriptors if using indexes
+
+    // Setup row group descriptors if using indexes.
     if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) {
-      auto decomp_data = decompress_stripe_data(*_metadata.per_file_metadata[0].decompressor,
+      auto compinfo = cudf::detail::hostdevice_span<gpu::CompressedStreamInfo>(
+        hd_compinfo.begin(), hd_compinfo.d_begin(), stream_range.size());
+      auto decomp_data = decompress_stripe_data(load_stripe_range,
+                                                stream_range,
+                                                stripe_count,
+                                                compinfo,
+                                                _file_itm_data.compinfo_map,
+                                                *_metadata.per_file_metadata[0].decompressor,
                                                 stripe_data,
                                                 stream_info,
                                                 chunks,
                                                 row_groups,
-                                                total_num_stripes,
                                                 _metadata.get_row_index_stride(),
                                                 level == 0,
                                                 _stream);
-      stripe_data.clear();
-      stripe_data.push_back(std::move(decomp_data));
+
+      // Just save the decompressed data and clear out the raw data to free up memory.
+      stripe_data[stripe_start - load_stripe_start] = std::move(decomp_data);
+      for (std::size_t i = 1; i < stripe_count; ++i) {
+        stripe_data[i + stripe_start - load_stripe_start] = {};
+      }
+
     } else {
       if (row_groups.size().first) {
         chunks.host_to_device_async(_stream);
@@ -1001,34 +976,38 @@ void reader::impl::prepare_data(int64_t skip_rows,
         gpu::ParseRowGroupIndex(row_groups.base_device_ptr(),
                                 nullptr,
                                 chunks.base_device_ptr(),
-                                num_columns,
-                                total_num_stripes,
+                                num_lvl_columns,
+                                stripe_count,
                                 _metadata.get_row_index_stride(),
                                 level == 0,
                                 _stream);
       }
     }
 
+    _out_buffers[level].resize(0);
+
     for (std::size_t i = 0; i < column_types.size(); ++i) {
       bool is_nullable = false;
-      for (std::size_t j = 0; j < total_num_stripes; ++j) {
+      for (std::size_t j = 0; j < stripe_count; ++j) {
         if (chunks[j][i].strm_len[gpu::CI_PRESENT] != 0) {
           is_nullable = true;
           break;
         }
       }
-      auto is_list_type = (column_types[i].id() == type_id::LIST);
-      auto n_rows       = (level == 0) ? rows_to_read : col_meta.num_child_rows[i];
-      // For list column, offset column will be always size + 1
-      if (is_list_type) n_rows++;
-      _out_buffers[level].emplace_back(column_types[i], n_rows, is_nullable, _stream, _mr);
+
+      auto const is_list_type = (column_types[i].id() == type_id::LIST);
+      auto const n_rows       = (level == 0) ? rows_to_decode : col_meta.num_child_rows[i];
+
+      // For list column, offset column will be always size + 1.
+      _out_buffers[level].emplace_back(
+        column_types[i], is_list_type ? n_rows + 1 : n_rows, is_nullable, _stream, _mr);
     }
 
     decode_stream_data(num_dict_entries,
                        rows_to_skip,
                        _metadata.get_row_index_stride(),
                        level,
-                       tz_table->view(),
+                       *tz_table_dptr,
                        chunks,
                        row_groups,
                        _out_buffers[level],
@@ -1036,8 +1015,9 @@ void reader::impl::prepare_data(int64_t skip_rows,
                        _mr);
 
     if (nested_cols.size()) {
-      // Extract information to process nested child columns
-      scan_null_counts(chunks, null_count_prefix_sums[level], _stream);
+      // Extract information to process nested child columns.
+      scan_null_counts(
+        chunks, null_count_prefix_sums.data() + num_processed_lvl_columns * stripe_count, _stream);
 
       row_groups.device_to_host_sync(_stream);
       aggregate_child_meta(
@@ -1055,7 +1035,48 @@ void reader::impl::prepare_data(int64_t skip_rows,
 
       if (not buff_data.empty()) { generate_offsets_for_list(buff_data, _stream); }
     }
+    num_processed_prev_lvl_columns = num_processed_lvl_columns;
+    num_processed_lvl_columns += num_lvl_columns;
   }  // end loop level
+
+  // Now generate a table from the decoded result.
+  std::vector<std::unique_ptr<column>> out_columns;
+  _out_metadata = get_meta_with_user_data();
+  std::transform(
+    _selected_columns.levels[0].begin(),
+    _selected_columns.levels[0].end(),
+    std::back_inserter(out_columns),
+    [&](auto const& orc_col_meta) {
+      _out_metadata.schema_info.emplace_back("");
+      auto col_buffer = assemble_buffer(
+        orc_col_meta.id, 0, *_col_meta, _metadata, _selected_columns, _out_buffers, _stream, _mr);
+      return make_column(col_buffer, &_out_metadata.schema_info.back(), std::nullopt, _stream);
+    });
+  _chunk_read_data.decoded_table = std::make_unique<table>(std::move(out_columns));
+
+  // Free up temp memory used for decoding.
+  for (std::size_t level = 0; level < _selected_columns.num_levels(); ++level) {
+    _out_buffers[level].resize(0);
+
+    auto& stripe_data = _file_itm_data.lvl_stripe_data[level];
+    if (_metadata.per_file_metadata[0].ps.compression != orc::NONE) {
+      stripe_data[stripe_start - load_stripe_start] = {};
+    } else {
+      for (std::size_t i = 0; i < stripe_count; ++i) {
+        stripe_data[i + stripe_start - load_stripe_start] = {};
+      }
+    }
+  }
+
+  // Output table range is reset to start from the first position.
+  _chunk_read_data.curr_output_table_range = 0;
+
+  // Split the decoded table into ranges that be output into chunks having size within the given
+  // output size limit.
+  _chunk_read_data.output_table_ranges = find_table_splits(_chunk_read_data.decoded_table->view(),
+                                                           _chunk_read_data.output_row_granularity,
+                                                           _chunk_read_data.chunk_read_limit,
+                                                           _stream);
 }
 
 }  // namespace cudf::io::orc::detail
diff --git a/cpp/src/io/orc/reader_impl_helpers.hpp b/cpp/src/io/orc/reader_impl_helpers.hpp
index 6645eecbd29..a563fb19e15 100644
--- a/cpp/src/io/orc/reader_impl_helpers.hpp
+++ b/cpp/src/io/orc/reader_impl_helpers.hpp
@@ -16,9 +16,9 @@
 
 #pragma once
 
-#include "aggregate_orc_metadata.hpp"
+#include "io/orc/aggregate_orc_metadata.hpp"
+#include "io/orc/orc.hpp"
 #include "io/utilities/column_buffer.hpp"
-#include "orc.hpp"
 
 #include <cudf/io/orc.hpp>
 
diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp
index c7659be1adb..c47beb8d7ed 100644
--- a/cpp/src/io/parquet/reader_impl_helpers.cpp
+++ b/cpp/src/io/parquet/reader_impl_helpers.cpp
@@ -650,7 +650,10 @@ aggregate_reader_metadata::select_row_groups(
     if (not row_group_indices.empty()) { return std::pair<int64_t, size_type>{}; }
     auto const from_opts = cudf::io::detail::skip_rows_num_rows_from_options(
       skip_rows_opt, num_rows_opt, get_num_rows());
-    return std::pair{static_cast<int64_t>(from_opts.first), from_opts.second};
+    CUDF_EXPECTS(from_opts.second <= static_cast<int64_t>(std::numeric_limits<size_type>::max()),
+                 "Number of reading rows exceeds cudf's column size limit.");
+    return std::pair{static_cast<int64_t>(from_opts.first),
+                     static_cast<size_type>(from_opts.second)};
   }();
 
   if (!row_group_indices.empty()) {
diff --git a/cpp/src/io/utilities/row_selection.cpp b/cpp/src/io/utilities/row_selection.cpp
index f136cd11ff7..c0bbca39167 100644
--- a/cpp/src/io/utilities/row_selection.cpp
+++ b/cpp/src/io/utilities/row_selection.cpp
@@ -23,20 +23,17 @@
 
 namespace cudf::io::detail {
 
-std::pair<int64_t, size_type> skip_rows_num_rows_from_options(
-  int64_t skip_rows, std::optional<size_type> const& num_rows, int64_t num_source_rows)
+std::pair<int64_t, int64_t> skip_rows_num_rows_from_options(int64_t skip_rows,
+                                                            std::optional<int64_t> const& num_rows,
+                                                            int64_t num_source_rows)
 {
-  auto const rows_to_skip = std::min(skip_rows, num_source_rows);
-  if (not num_rows.has_value()) {
-    CUDF_EXPECTS(num_source_rows - rows_to_skip <= std::numeric_limits<size_type>::max(),
-                 "The requested number of rows exceeds the column size limit",
-                 std::overflow_error);
-    return {rows_to_skip, num_source_rows - rows_to_skip};
-  }
+  auto const rows_to_skip      = std::min(skip_rows, num_source_rows);
+  auto const num_rows_can_read = num_source_rows - rows_to_skip;
+
+  if (not num_rows.has_value()) { return {rows_to_skip, num_rows_can_read}; }
+
   // Limit the number of rows to the end of the input
-  return {
-    rows_to_skip,
-    static_cast<size_type>(std::min<int64_t>(num_rows.value(), num_source_rows - rows_to_skip))};
+  return {rows_to_skip, std::min(num_rows.value(), num_rows_can_read)};
 }
 
 }  // namespace cudf::io::detail
diff --git a/cpp/src/io/utilities/row_selection.hpp b/cpp/src/io/utilities/row_selection.hpp
index 0b5d3aef8bd..7fdcc65d77b 100644
--- a/cpp/src/io/utilities/row_selection.hpp
+++ b/cpp/src/io/utilities/row_selection.hpp
@@ -34,7 +34,8 @@ namespace cudf::io::detail {
  *
  * @throw std::overflow_exception The requested number of rows exceeds the column size limit
  */
-std::pair<int64_t, size_type> skip_rows_num_rows_from_options(
-  int64_t skip_rows, std::optional<size_type> const& num_rows, int64_t num_source_rows);
+std::pair<int64_t, int64_t> skip_rows_num_rows_from_options(int64_t skip_rows,
+                                                            std::optional<int64_t> const& num_rows,
+                                                            int64_t num_source_rows);
 
 }  // namespace cudf::io::detail
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 6c56d82007a..fa633dfa67b 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -294,7 +294,7 @@ ConfigureTest(
   PERCENT 30
 )
 ConfigureTest(
-  ORC_TEST io/orc_test.cpp
+  ORC_TEST io/orc_chunked_reader_test.cu io/orc_test.cpp
   GPUS 1
   PERCENT 30
 )
diff --git a/cpp/tests/io/orc_chunked_reader_test.cu b/cpp/tests/io/orc_chunked_reader_test.cu
new file mode 100644
index 00000000000..1c1b53ea17f
--- /dev/null
+++ b/cpp/tests/io/orc_chunked_reader_test.cu
@@ -0,0 +1,1477 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_utilities.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/cudf_gtest.hpp>
+#include <cudf_test/io_metadata_utilities.hpp>
+#include <cudf_test/iterator_utilities.hpp>
+#include <cudf_test/table_utilities.hpp>
+#include <cudf_test/type_lists.hpp>
+
+#include <cudf/column/column.hpp>
+#include <cudf/column/column_factories.hpp>
+#include <cudf/concatenate.hpp>
+#include <cudf/copying.hpp>
+#include <cudf/detail/iterator.cuh>
+#include <cudf/detail/structs/utilities.hpp>
+#include <cudf/fixed_point/fixed_point.hpp>
+#include <cudf/io/data_sink.hpp>
+#include <cudf/io/datasource.hpp>
+#include <cudf/io/orc.hpp>
+#include <cudf/io/orc_metadata.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf/table/table.hpp>
+#include <cudf/table/table_view.hpp>
+#include <cudf/utilities/span.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <thrust/iterator/counting_iterator.h>
+
+namespace {
+enum class output_limit : std::size_t {};
+enum class input_limit : std::size_t {};
+enum class output_row_granularity : cudf::size_type {};
+
+// Global environment for temporary files
+auto const temp_env = reinterpret_cast<cudf::test::TempDirTestEnvironment*>(
+  ::testing::AddGlobalTestEnvironment(new cudf::test::TempDirTestEnvironment));
+
+using int32s_col       = cudf::test::fixed_width_column_wrapper<int32_t>;
+using int64s_col       = cudf::test::fixed_width_column_wrapper<int64_t>;
+using doubles_col      = cudf::test::fixed_width_column_wrapper<double>;
+using strings_col      = cudf::test::strings_column_wrapper;
+using structs_col      = cudf::test::structs_column_wrapper;
+using int32s_lists_col = cudf::test::lists_column_wrapper<int32_t>;
+
+auto write_file(std::vector<std::unique_ptr<cudf::column>>& input_columns,
+                std::string const& filename,
+                bool nullable                    = false,
+                std::size_t stripe_size_bytes    = cudf::io::default_stripe_size_bytes,
+                cudf::size_type stripe_size_rows = cudf::io::default_stripe_size_rows)
+{
+  if (nullable) {
+    // Generate deterministic bitmask instead of random bitmask for easy computation of data size.
+    auto const valid_iter = cudf::detail::make_counting_transform_iterator(
+      0, [](cudf::size_type i) { return i % 4 != 3; });
+    cudf::size_type offset{0};
+    for (auto& col : input_columns) {
+      auto const [null_mask, null_count] =
+        cudf::test::detail::make_null_mask(valid_iter + offset, valid_iter + col->size() + offset);
+      col = cudf::structs::detail::superimpose_nulls(
+        static_cast<cudf::bitmask_type const*>(null_mask.data()),
+        null_count,
+        std::move(col),
+        cudf::get_default_stream(),
+        rmm::mr::get_current_device_resource());
+
+      // Shift nulls of the next column by one position, to avoid having all nulls
+      // in the same table rows.
+      ++offset;
+    }
+  }
+
+  auto input_table = std::make_unique<cudf::table>(std::move(input_columns));
+  auto filepath =
+    temp_env->get_temp_filepath(nullable ? filename + "_nullable.orc" : filename + ".orc");
+
+  auto const write_opts =
+    cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, *input_table)
+      .stripe_size_bytes(stripe_size_bytes)
+      .stripe_size_rows(stripe_size_rows)
+      .build();
+  cudf::io::write_orc(write_opts);
+
+  return std::pair{std::move(input_table), std::move(filepath)};
+}
+
+// NOTE: By default, output_row_granularity=10'000 rows.
+// This means if the input file has more than 10k rows then the output chunk will never
+// have less than 10k rows.
+auto chunked_read(std::string const& filepath,
+                  output_limit output_limit_bytes,
+                  input_limit input_limit_bytes             = input_limit{0},
+                  output_row_granularity output_granularity = output_row_granularity{10'000})
+{
+  auto const read_opts =
+    cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}).build();
+  auto reader = cudf::io::chunked_orc_reader(static_cast<std::size_t>(output_limit_bytes),
+                                             static_cast<std::size_t>(input_limit_bytes),
+                                             static_cast<cudf::size_type>(output_granularity),
+                                             read_opts);
+
+  auto num_chunks = 0;
+  auto out_tables = std::vector<std::unique_ptr<cudf::table>>{};
+
+  // TODO: remove this scope, when we get rid of mem stat in the reader.
+  // This is to avoid use-after-free of memory resource created by the mem stat object.
+  auto mr = rmm::mr::get_current_device_resource();
+
+  do {
+    auto chunk = reader.read_chunk();
+    // If the input file is empty, the first call to `read_chunk` will return an empty table.
+    // Thus, we only check for non-empty output table from the second call.
+    if (num_chunks > 0) {
+      CUDF_EXPECTS(chunk.tbl->num_rows() != 0, "Number of rows in the new chunk is zero.");
+    }
+    ++num_chunks;
+    out_tables.emplace_back(std::move(chunk.tbl));
+  } while (reader.has_next());
+
+  if (num_chunks > 1) {
+    CUDF_EXPECTS(out_tables.front()->num_rows() != 0, "Number of rows in the new chunk is zero.");
+  }
+
+  auto out_tviews = std::vector<cudf::table_view>{};
+  for (auto const& tbl : out_tables) {
+    out_tviews.emplace_back(tbl->view());
+  }
+
+  // return std::pair(cudf::concatenate(out_tviews), num_chunks);
+
+  // TODO: remove this
+  return std::pair(cudf::concatenate(out_tviews, cudf::get_default_stream(), mr), num_chunks);
+}
+
+auto chunked_read(std::string const& filepath,
+                  output_limit output_limit_bytes,
+                  output_row_granularity output_granularity)
+{
+  return chunked_read(filepath, output_limit_bytes, input_limit{0UL}, output_granularity);
+}
+
+}  // namespace
+
+struct OrcChunkedReaderTest : public cudf::test::BaseFixture {};
+
+TEST_F(OrcChunkedReaderTest, TestChunkedReadNoData)
+{
+  std::vector<std::unique_ptr<cudf::column>> input_columns;
+  input_columns.emplace_back(int32s_col{}.release());
+  input_columns.emplace_back(int64s_col{}.release());
+
+  auto const [expected, filepath] = write_file(input_columns, "chunked_read_empty");
+  auto const [result, num_chunks] = chunked_read(filepath, output_limit{1'000});
+  EXPECT_EQ(num_chunks, 1);
+  EXPECT_EQ(result->num_rows(), 0);
+  EXPECT_EQ(result->num_columns(), 2);
+  CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result);
+}
+
+TEST_F(OrcChunkedReaderTest, TestChunkedReadInvalidParameter)
+{
+  std::vector<std::unique_ptr<cudf::column>> input_columns;
+  input_columns.emplace_back(int32s_col{}.release());
+  input_columns.emplace_back(int64s_col{}.release());
+
+  auto const [expected, filepath] = write_file(input_columns, "chunked_read_invalid");
+  EXPECT_THROW(
+    chunked_read(filepath, output_limit{1'000}, output_row_granularity{-1} /*invalid value*/),
+    cudf::logic_error);
+}
+
+TEST_F(OrcChunkedReaderTest, TestChunkedReadSimpleData)
+{
+  auto constexpr num_rows = 40'000;
+
+  auto const generate_input = [num_rows](bool nullable, std::size_t stripe_rows) {
+    std::vector<std::unique_ptr<cudf::column>> input_columns;
+    auto const value_iter = thrust::make_counting_iterator(0);
+    input_columns.emplace_back(int32s_col(value_iter, value_iter + num_rows).release());
+    input_columns.emplace_back(int64s_col(value_iter, value_iter + num_rows).release());
+
+    return write_file(input_columns,
+                      "chunked_read_simple",
+                      nullable,
+                      cudf::io::default_stripe_size_bytes,
+                      stripe_rows);
+  };
+
+  {
+    auto const [expected, filepath] = generate_input(false, 1'000);
+    auto const [result, num_chunks] = chunked_read(filepath, output_limit{245'000});
+    EXPECT_EQ(num_chunks, 2);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result);
+  }
+  {
+    auto const [expected, filepath] = generate_input(false, cudf::io::default_stripe_size_rows);
+    auto const [result, num_chunks] = chunked_read(filepath, output_limit{245'000});
+    EXPECT_EQ(num_chunks, 2);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result);
+  }
+
+  {
+    auto const [expected, filepath] = generate_input(true, 1'000);
+    auto const [result, num_chunks] = chunked_read(filepath, output_limit{245'000});
+    EXPECT_EQ(num_chunks, 2);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result);
+  }
+  {
+    auto const [expected, filepath] = generate_input(true, cudf::io::default_stripe_size_rows);
+    auto const [result, num_chunks] = chunked_read(filepath, output_limit{245'000});
+    EXPECT_EQ(num_chunks, 2);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result);
+  }
+}
+
+TEST_F(OrcChunkedReaderTest, TestChunkedReadBoundaryCases)
+{
+  // Tests some specific boundary conditions in the split calculations.
+
+  auto constexpr num_rows = 40'000;
+
+  auto const [expected, filepath] = [num_rows]() {
+    std::vector<std::unique_ptr<cudf::column>> input_columns;
+    auto const value_iter = thrust::make_counting_iterator(0);
+    input_columns.emplace_back(int32s_col(value_iter, value_iter + num_rows).release());
+    return write_file(input_columns, "chunked_read_simple_boundary");
+  }();
+
+  // Test with zero limit: everything will be read in one chunk.
+  {
+    auto const [result, num_chunks] = chunked_read(filepath, output_limit{0UL});
+    EXPECT_EQ(num_chunks, 1);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result);
+  }
+
+  // Test with a very small limit: 1 byte.
+  {
+    auto const [result, num_chunks] = chunked_read(filepath, output_limit{1UL});
+    // Number of chunks is 4 because of using default `output_row_granularity = 10k`.
+    EXPECT_EQ(num_chunks, 4);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result);
+  }
+
+  // Test with a very small limit: 1 byte, and small value of `output_row_granularity`.
+  {
+    auto const [result, num_chunks] =
+      chunked_read(filepath, output_limit{1UL}, output_row_granularity{1'000});
+    EXPECT_EQ(num_chunks, 40);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result);
+  }
+
+  // Test with a very small limit: 1 byte, and large value of `output_row_granularity`.
+  {
+    auto const [result, num_chunks] =
+      chunked_read(filepath, output_limit{1UL}, output_row_granularity{30'000});
+    EXPECT_EQ(num_chunks, 2);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result);
+  }
+  // Test with a very large limit
+  {
+    auto const [result, num_chunks] = chunked_read(filepath, output_limit{2L << 40});
+    EXPECT_EQ(num_chunks, 1);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result);
+  }
+  // Test with a limit slightly less than one granularity segment of data
+  // (output_row_granularity = 10k rows = 40'000 bytes).
+  {
+    auto const [result, num_chunks] = chunked_read(filepath, output_limit{39'000UL});
+    EXPECT_EQ(num_chunks, 4);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result);
+  }
+
+  // Test with a limit exactly the size one granularity segment of data
+  // (output_row_granularity = 10k rows = 40'000 bytes).
+  {
+    auto const [result, num_chunks] = chunked_read(filepath, output_limit{40'000UL});
+    EXPECT_EQ(num_chunks, 4);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result);
+  }
+
+  // Test with a limit slightly more than one granularity segment of data
+  // (output_row_granularity = 10k rows = 40'000 bytes).
+  {
+    auto const [result, num_chunks] = chunked_read(filepath, output_limit{41'000UL});
+    EXPECT_EQ(num_chunks, 4);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result);
+  }
+
+  // Test with a limit slightly less than two granularity segments of data
+  {
+    auto const [result, num_chunks] = chunked_read(filepath, output_limit{79'000UL});
+    EXPECT_EQ(num_chunks, 4);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result);
+  }
+
+  // Test with a limit exactly the size of two granularity segments of data minus 1 byte.
+  {
+    auto const [result, num_chunks] = chunked_read(filepath, output_limit{79'999UL});
+    EXPECT_EQ(num_chunks, 4);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result);
+  }
+
+  // Test with a limit exactly the size of two granularity segments of data.
+  {
+    auto const [result, num_chunks] = chunked_read(filepath, output_limit{80'000UL});
+    EXPECT_EQ(num_chunks, 2);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result);
+  }
+
+  // Test with a limit slightly more the size two granularity segments of data.
+  {
+    auto const [result, num_chunks] = chunked_read(filepath, output_limit{81'000});
+    EXPECT_EQ(num_chunks, 2);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result);
+  }
+
+  // Test with a limit exactly the size of the input minus 1 byte.
+  {
+    auto const [result, num_chunks] = chunked_read(filepath, output_limit{159'999UL});
+    EXPECT_EQ(num_chunks, 2);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result);
+  }
+
+  // Test with a limit exactly the size of the input.
+  {
+    auto const [result, num_chunks] = chunked_read(filepath, output_limit{160'000UL});
+    EXPECT_EQ(num_chunks, 1);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result);
+  }
+}
+
+TEST_F(OrcChunkedReaderTest, TestChunkedReadWithString)
+{
+  auto constexpr num_rows           = 60'000;
+  auto constexpr output_granularity = output_row_granularity{20'000};
+
+  auto const generate_input = [num_rows](bool nullable) {
+    std::vector<std::unique_ptr<cudf::column>> input_columns;
+    auto const value_iter = thrust::make_counting_iterator(0);
+
+    // ints                               Granularity Segment  total bytes   cumulative bytes
+    // 20000 rows of 4 bytes each               = A0           80000         80000
+    // 20000 rows of 4 bytes each               = A1           80000         160000
+    // 20000 rows of 4 bytes each               = A2           80000         240000
+    input_columns.emplace_back(int32s_col(value_iter, value_iter + num_rows).release());
+
+    // strings                            Granularity Segment  total bytes   cumulative bytes
+    // 20000 rows of 1 char each    (20000  + 80004) = B0      100004        100004
+    // 20000 rows of 4 chars each   (80000  + 80004) = B1      160004        260008
+    // 20000 rows of 16 chars each  (320000 + 80004) = B2      400004        660012
+    auto const strings  = std::vector<std::string>{"a", "bbbb", "cccccccccccccccc"};
+    auto const str_iter = cudf::detail::make_counting_transform_iterator(0, [&](int32_t i) {
+      if (i < 20000) { return strings[0]; }
+      if (i < 40000) { return strings[1]; }
+      return strings[2];
+    });
+    input_columns.emplace_back(strings_col(str_iter, str_iter + num_rows).release());
+
+    // Cumulative sizes:
+    // A0 + B0 :  180004
+    // A1 + B1 :  420008
+    // A2 + B2 :  900012
+    //                                    skip_rows / num_rows
+    // byte_limit==500000  should give 2 chunks: {0, 40000}, {40000, 20000}
+    // byte_limit==1000000 should give 1 chunks: {0, 60000},
+    return write_file(input_columns, "chunked_read_with_strings", nullable);
+  };
+
+  auto const [expected_no_null, filepath_no_null]       = generate_input(false);
+  auto const [expected_with_nulls, filepath_with_nulls] = generate_input(true);
+
+  // Test with zero limit: everything will be read in one chunk.
+  {
+    auto const [result, num_chunks] = chunked_read(filepath_no_null, output_limit{0UL});
+    EXPECT_EQ(num_chunks, 1);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result);
+  }
+  {
+    auto const [result, num_chunks] = chunked_read(filepath_with_nulls, output_limit{0UL});
+    EXPECT_EQ(num_chunks, 1);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result);
+  }
+
+  // Test with a very small limit: 1 byte.
+  {
+    auto const [result, num_chunks] =
+      chunked_read(filepath_no_null, output_limit{1UL}, output_granularity);
+    EXPECT_EQ(num_chunks, 3);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result);
+  }
+  {
+    auto const [result, num_chunks] =
+      chunked_read(filepath_with_nulls, output_limit{1UL}, output_granularity);
+    EXPECT_EQ(num_chunks, 3);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result);
+  }
+
+  // Test with a very large limit.
+  {
+    auto const [result, num_chunks] = chunked_read(filepath_no_null, output_limit{2L << 40});
+    EXPECT_EQ(num_chunks, 1);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result);
+  }
+  {
+    auto const [result, num_chunks] = chunked_read(filepath_with_nulls, output_limit{2L << 40});
+    EXPECT_EQ(num_chunks, 1);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result);
+  }
+
+  // Other tests:
+
+  {
+    auto const [result, num_chunks] =
+      chunked_read(filepath_no_null, output_limit{500'000UL}, output_granularity);
+    EXPECT_EQ(num_chunks, 2);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result);
+  }
+  {
+    auto const [result, num_chunks] =
+      chunked_read(filepath_with_nulls, output_limit{500'000UL}, output_granularity);
+    EXPECT_EQ(num_chunks, 2);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result);
+  }
+
+  {
+    auto const [result, num_chunks] = chunked_read(filepath_no_null, output_limit{1'000'000UL});
+    EXPECT_EQ(num_chunks, 1);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result);
+  }
+  {
+    auto const [result, num_chunks] = chunked_read(filepath_with_nulls, output_limit{1'000'000UL});
+    EXPECT_EQ(num_chunks, 1);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result);
+  }
+}
+
+TEST_F(OrcChunkedReaderTest, TestChunkedReadWithStructs)
+{
+  auto constexpr num_rows           = 100'000;
+  auto constexpr output_granularity = output_row_granularity{20'000};
+
+  auto const generate_input = [num_rows](bool nullable) {
+    std::vector<std::unique_ptr<cudf::column>> input_columns;
+    auto const int_iter = thrust::make_counting_iterator(0);
+    input_columns.emplace_back(int32s_col(int_iter, int_iter + num_rows).release());
+    input_columns.emplace_back([=] {
+      auto child1 = int32s_col(int_iter, int_iter + num_rows);
+      auto child2 = int32s_col(int_iter + num_rows, int_iter + num_rows * 2);
+
+      auto const str_iter = cudf::detail::make_counting_transform_iterator(
+        0, [&](int32_t i) { return std::to_string(i); });
+      auto child3 = strings_col{str_iter, str_iter + num_rows};
+
+      return structs_col{{child1, child2, child3}}.release();
+    }());
+
+    return write_file(input_columns, "chunked_read_with_structs", nullable);
+  };
+
+  auto const [expected_no_null, filepath_no_null]       = generate_input(false);
+  auto const [expected_with_nulls, filepath_with_nulls] = generate_input(true);
+
+  // Test with zero limit: everything will be read in one chunk.
+  {
+    auto const [result, num_chunks] = chunked_read(filepath_no_null, output_limit{0UL});
+    EXPECT_EQ(num_chunks, 1);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result);
+  }
+  {
+    auto const [result, num_chunks] = chunked_read(filepath_with_nulls, output_limit{0UL});
+    EXPECT_EQ(num_chunks, 1);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result);
+  }
+
+  // Test with a very small limit: 1 byte.
+  {
+    auto const [result, num_chunks] =
+      chunked_read(filepath_no_null, output_limit{1UL}, output_granularity);
+    EXPECT_EQ(num_chunks, 5);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result);
+  }
+  {
+    auto const [result, num_chunks] =
+      chunked_read(filepath_with_nulls, output_limit{1UL}, output_granularity);
+    EXPECT_EQ(num_chunks, 5);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result);
+  }
+
+  // Test with a very large limit.
+  {
+    auto const [result, num_chunks] =
+      chunked_read(filepath_no_null, output_limit{2L << 40}, output_granularity);
+    EXPECT_EQ(num_chunks, 1);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result);
+  }
+  {
+    auto const [result, num_chunks] =
+      chunked_read(filepath_with_nulls, output_limit{2L << 40}, output_granularity);
+    EXPECT_EQ(num_chunks, 1);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result);
+  }
+
+  // Other tests:
+
+  {
+    auto const [result, num_chunks] =
+      chunked_read(filepath_no_null, output_limit{500'000UL}, output_granularity);
+    EXPECT_EQ(num_chunks, 5);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result);
+  }
+  {
+    auto const [result, num_chunks] =
+      chunked_read(filepath_with_nulls, output_limit{500'000UL}, output_granularity);
+    EXPECT_EQ(num_chunks, 5);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result);
+  }
+}
+
+TEST_F(OrcChunkedReaderTest, TestChunkedReadWithListsNoNulls)
+{
+  auto constexpr num_rows           = 100'000;
+  auto constexpr output_granularity = output_row_granularity{20'000};
+
+  auto const [expected, filepath] = [num_rows]() {
+    std::vector<std::unique_ptr<cudf::column>> input_columns;
+    // 20000 rows in 1 segment consist of:
+    //
+    // 20001 offsets :   80004  bytes
+    // 30000 ints    :   120000 bytes
+    // total         :   200004 bytes
+    //
+    // However, `segmented_row_bit_count` used in chunked reader returns 200000,
+    // thus we consider as having only 200000 bytes in total.
+    auto const template_lists = int32s_lists_col{
+      int32s_lists_col{}, int32s_lists_col{0}, int32s_lists_col{1, 2}, int32s_lists_col{3, 4, 5}};
+
+    auto const gather_iter =
+      cudf::detail::make_counting_transform_iterator(0, [&](int32_t i) { return i % 4; });
+    auto const gather_map = int32s_col(gather_iter, gather_iter + num_rows);
+    input_columns.emplace_back(
+      std::move(cudf::gather(cudf::table_view{{template_lists}}, gather_map)->release().front()));
+
+    return write_file(input_columns, "chunked_read_with_lists_no_null");
+  }();
+
+  // Test with zero limit: everything will be read in one chunk.
+  {
+    auto const [result, num_chunks] = chunked_read(filepath, output_limit{0UL});
+    EXPECT_EQ(num_chunks, 1);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result);
+  }
+
+  // Test with a very small limit: 1 byte.
+  {
+    auto const [result, num_chunks] = chunked_read(filepath, output_limit{1UL}, output_granularity);
+    EXPECT_EQ(num_chunks, 5);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result);
+  }
+
+  // Test with a very large limit.
+  {
+    auto const [result, num_chunks] =
+      chunked_read(filepath, output_limit{2L << 40UL}, output_granularity);
+    EXPECT_EQ(num_chunks, 1);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result);
+  }
+
+  // Chunk size slightly less than 1 row segment (forcing it to be at least 1 segment per read).
+  {
+    auto const [result, num_chunks] =
+      chunked_read(filepath, output_limit{199'999UL}, output_granularity);
+    EXPECT_EQ(num_chunks, 5);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result);
+  }
+
+  // Chunk size exactly 1 row segment.
+  {
+    auto const [result, num_chunks] =
+      chunked_read(filepath, output_limit{200'000UL}, output_granularity);
+    EXPECT_EQ(num_chunks, 5);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result);
+  }
+
+  // Chunk size == size of 2 segments. Totally have 3 chunks.
+  {
+    auto const [result, num_chunks] =
+      chunked_read(filepath, output_limit{400'000UL}, output_granularity);
+    EXPECT_EQ(num_chunks, 3);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result);
+  }
+
+  // Chunk size == size of 2 segment minus one byte: each chunk will be just one segment.
+  {
+    auto const [result, num_chunks] =
+      chunked_read(filepath, output_limit{399'999UL}, output_granularity);
+    EXPECT_EQ(num_chunks, 5);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result);
+  }
+}
+
+TEST_F(OrcChunkedReaderTest, TestChunkedReadWithListsHavingNulls)
+{
+  auto constexpr num_rows           = 100'000;
+  auto constexpr output_granularity = output_row_granularity{20'000};
+
+  auto const [expected, filepath] = [num_rows]() {
+    std::vector<std::unique_ptr<cudf::column>> input_columns;
+    // 20000 rows in 1 page consist of:
+    //
+    // 625 validity words :   2500 bytes   (a null every 4 rows: null at indices [3, 7, 11, ...])
+    // 20001 offsets      :   80004  bytes
+    // 15000 ints         :   60000 bytes
+    // total              :   142504 bytes
+    //
+    // However, `segmented_row_bit_count` used in chunked reader returns 142500,
+    // thus we consider as having only 142500 bytes in total.
+    auto const template_lists =
+      int32s_lists_col{// these will all be null
+                       int32s_lists_col{},
+                       int32s_lists_col{0},
+                       int32s_lists_col{1, 2},
+                       int32s_lists_col{3, 4, 5, 6, 7, 8, 9} /* this list will be nullified out */};
+    auto const gather_iter =
+      cudf::detail::make_counting_transform_iterator(0, [&](int32_t i) { return i % 4; });
+    auto const gather_map = int32s_col(gather_iter, gather_iter + num_rows);
+    input_columns.emplace_back(
+      std::move(cudf::gather(cudf::table_view{{template_lists}}, gather_map)->release().front()));
+
+    return write_file(input_columns, "chunked_read_with_lists_nulls", true /*nullable*/);
+  }();
+
+  // Test with zero limit: everything will be read in one chunk.
+  {
+    auto const [result, num_chunks] = chunked_read(filepath, output_limit{0UL});
+    EXPECT_EQ(num_chunks, 1);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result);
+  }
+
+  // Test with a very small limit: 1 byte.
+  {
+    auto const [result, num_chunks] = chunked_read(filepath, output_limit{1UL}, output_granularity);
+    EXPECT_EQ(num_chunks, 5);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result);
+  }
+
+  // Test with a very large limit.
+  {
+    auto const [result, num_chunks] =
+      chunked_read(filepath, output_limit{2L << 40}, output_granularity);
+    EXPECT_EQ(num_chunks, 1);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result);
+  }
+
+  // Chunk size slightly less than 1 row segment (forcing it to be at least 1 segment per read).
+  {
+    auto const [result, num_chunks] =
+      chunked_read(filepath, output_limit{142'499UL}, output_granularity);
+    EXPECT_EQ(num_chunks, 5);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result);
+  }
+
+  // Chunk size exactly 1 row segment.
+  {
+    auto const [result, num_chunks] =
+      chunked_read(filepath, output_limit{142'500UL}, output_granularity);
+    EXPECT_EQ(num_chunks, 5);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result);
+  }
+
+  // Chunk size == size of 2 segments. Totally have 3 chunks.
+  {
+    auto const [result, num_chunks] =
+      chunked_read(filepath, output_limit{285'000UL}, output_granularity);
+    EXPECT_EQ(num_chunks, 3);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result);
+  }
+
+  // Chunk size == size of 2 segment minus one byte: each chunk will be just one segment.
+  {
+    auto const [result, num_chunks] =
+      chunked_read(filepath, output_limit{284'999UL}, output_granularity);
+    EXPECT_EQ(num_chunks, 5);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result);
+  }
+}
+
+TEST_F(OrcChunkedReaderTest, TestChunkedReadWithStructsOfLists)
+{
+  auto constexpr num_rows = 100'000;
+
+  // Size of each segment (10k row by default) is from 537k to 560k bytes (no nulls)
+  // and from 456k to 473k (with nulls).
+  auto const generate_input = [num_rows](bool nullable) {
+    std::vector<std::unique_ptr<cudf::column>> input_columns;
+    auto const int_iter = thrust::make_counting_iterator(0);
+    input_columns.emplace_back(int32s_col(int_iter, int_iter + num_rows).release());
+    input_columns.emplace_back([=] {
+      std::vector<std::unique_ptr<cudf::column>> child_columns;
+      child_columns.emplace_back(int32s_col(int_iter, int_iter + num_rows).release());
+      child_columns.emplace_back(
+        int32s_col(int_iter + num_rows, int_iter + num_rows * 2).release());
+
+      auto const str_iter = cudf::detail::make_counting_transform_iterator(0, [&](int32_t i) {
+        return std::to_string(i) + "++++++++++++++++++++" + std::to_string(i);
+      });
+      child_columns.emplace_back(strings_col{str_iter, str_iter + num_rows}.release());
+
+      auto const template_lists = int32s_lists_col{
+        int32s_lists_col{}, int32s_lists_col{0}, int32s_lists_col{0, 1}, int32s_lists_col{0, 1, 2}};
+      auto const gather_iter =
+        cudf::detail::make_counting_transform_iterator(0, [&](int32_t i) { return i % 4; });
+      auto const gather_map = int32s_col(gather_iter, gather_iter + num_rows);
+      child_columns.emplace_back(
+        std::move(cudf::gather(cudf::table_view{{template_lists}}, gather_map)->release().front()));
+
+      return structs_col(std::move(child_columns)).release();
+    }());
+
+    return write_file(input_columns, "chunked_read_with_structs_of_lists", nullable);
+  };
+
+  auto const [expected_no_null, filepath_no_null]       = generate_input(false);
+  auto const [expected_with_nulls, filepath_with_nulls] = generate_input(true);
+
+  // Test with zero limit: everything will be read in one chunk.
+  {
+    auto const [result, num_chunks] = chunked_read(filepath_no_null, output_limit{0UL});
+    EXPECT_EQ(num_chunks, 1);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result);
+  }
+  {
+    auto const [result, num_chunks] = chunked_read(filepath_with_nulls, output_limit{0UL});
+    EXPECT_EQ(num_chunks, 1);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result);
+  }
+
+  // Test with a very small limit: 1 byte.
+  {
+    auto const [result, num_chunks] = chunked_read(filepath_no_null, output_limit{1UL});
+    EXPECT_EQ(num_chunks, 10);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result);
+  }
+  {
+    auto const [result, num_chunks] = chunked_read(filepath_with_nulls, output_limit{1UL});
+    EXPECT_EQ(num_chunks, 10);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result);
+  }
+
+  // Test with a very large limit.
+  {
+    auto const [result, num_chunks] = chunked_read(filepath_no_null, output_limit{2L << 40});
+    EXPECT_EQ(num_chunks, 1);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result);
+  }
+  {
+    auto const [result, num_chunks] = chunked_read(filepath_with_nulls, output_limit{2L << 40});
+    EXPECT_EQ(num_chunks, 1);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result);
+  }
+
+  // Other tests:
+
+  {
+    auto const [result, num_chunks] = chunked_read(filepath_no_null, output_limit{1'000'000UL});
+    EXPECT_EQ(num_chunks, 10);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result);
+  }
+
+  {
+    auto const [result, num_chunks] = chunked_read(filepath_no_null, output_limit{1'500'000UL});
+    EXPECT_EQ(num_chunks, 5);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result);
+  }
+
+  {
+    auto const [result, num_chunks] = chunked_read(filepath_no_null, output_limit{2'000'000UL});
+    EXPECT_EQ(num_chunks, 4);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result);
+  }
+
+  {
+    auto const [result, num_chunks] = chunked_read(filepath_no_null, output_limit{5'000'000UL});
+    EXPECT_EQ(num_chunks, 2);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result);
+  }
+
+  {
+    auto const [result, num_chunks] = chunked_read(filepath_with_nulls, output_limit{1'000'000UL});
+    EXPECT_EQ(num_chunks, 5);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result);
+  }
+
+  {
+    auto const [result, num_chunks] = chunked_read(filepath_with_nulls, output_limit{1'500'000UL});
+    EXPECT_EQ(num_chunks, 4);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result);
+  }
+
+  {
+    auto const [result, num_chunks] = chunked_read(filepath_with_nulls, output_limit{2'000'000UL});
+    EXPECT_EQ(num_chunks, 3);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result);
+  }
+
+  {
+    auto const [result, num_chunks] = chunked_read(filepath_with_nulls, output_limit{5'000'000UL});
+    EXPECT_EQ(num_chunks, 1);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result);
+  }
+}
+
+TEST_F(OrcChunkedReaderTest, TestChunkedReadWithListsOfStructs)
+{
+  auto constexpr num_rows = 100'000;
+
+  // Size of each segment (10k row by default) is from 450k to 530k bytes (no nulls)
+  // and from 330k to 380k (with nulls).
+  auto const generate_input = [num_rows](bool nullable) {
+    std::vector<std::unique_ptr<cudf::column>> input_columns;
+    auto const int_iter = thrust::make_counting_iterator(0);
+    input_columns.emplace_back(int32s_col(int_iter, int_iter + num_rows).release());
+
+    auto offsets = std::vector<cudf::size_type>{};
+    offsets.reserve(num_rows * 2);
+    cudf::size_type num_structs = 0;
+    for (int i = 0; i < num_rows; ++i) {
+      offsets.push_back(num_structs);
+      auto const new_list_size = i % 4;
+      num_structs += new_list_size;
+    }
+    offsets.push_back(num_structs);
+
+    auto const make_structs_col = [=] {
+      auto child1 = int32s_col(int_iter, int_iter + num_structs);
+      auto child2 = int32s_col(int_iter + num_structs, int_iter + num_structs * 2);
+
+      auto const str_iter = cudf::detail::make_counting_transform_iterator(
+        0, [&](int32_t i) { return std::to_string(i) + std::to_string(i) + std::to_string(i); });
+      auto child3 = strings_col{str_iter, str_iter + num_structs};
+
+      return structs_col{{child1, child2, child3}}.release();
+    };
+
+    input_columns.emplace_back(
+      cudf::make_lists_column(static_cast<cudf::size_type>(offsets.size() - 1),
+                              int32s_col(offsets.begin(), offsets.end()).release(),
+                              make_structs_col(),
+                              0,
+                              rmm::device_buffer{}));
+
+    return write_file(input_columns, "chunked_read_with_lists_of_structs", nullable);
+  };
+
+  auto const [expected_no_null, filepath_no_null]       = generate_input(false);
+  auto const [expected_with_nulls, filepath_with_nulls] = generate_input(true);
+
+  // Test with zero limit: everything will be read in one chunk.
+  {
+    auto const [result, num_chunks] = chunked_read(filepath_no_null, output_limit{0UL});
+    EXPECT_EQ(num_chunks, 1);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result);
+  }
+  {
+    auto const [result, num_chunks] = chunked_read(filepath_with_nulls, output_limit{0UL});
+    EXPECT_EQ(num_chunks, 1);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result);
+  }
+
+  // Test with a very small limit: 1 byte.
+  {
+    auto const [result, num_chunks] = chunked_read(filepath_no_null, output_limit{1UL});
+    EXPECT_EQ(num_chunks, 10);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result);
+  }
+  {
+    auto const [result, num_chunks] = chunked_read(filepath_with_nulls, output_limit{1UL});
+    EXPECT_EQ(num_chunks, 10);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result);
+  }
+
+  // Test with a very large limit.
+  {
+    auto const [result, num_chunks] = chunked_read(filepath_no_null, output_limit{2L << 40});
+    EXPECT_EQ(num_chunks, 1);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result);
+  }
+  {
+    auto const [result, num_chunks] = chunked_read(filepath_with_nulls, output_limit{2L << 40});
+    EXPECT_EQ(num_chunks, 1);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result);
+  }
+
+  // Other tests.
+
+  {
+    auto const [result, num_chunks] = chunked_read(filepath_no_null, output_limit{1'000'000UL});
+    EXPECT_EQ(num_chunks, 7);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result);
+  }
+
+  {
+    auto const [result, num_chunks] = chunked_read(filepath_no_null, output_limit{1'500'000UL});
+    EXPECT_EQ(num_chunks, 4);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result);
+  }
+
+  {
+    auto const [result, num_chunks] = chunked_read(filepath_no_null, output_limit{2'000'000UL});
+    EXPECT_EQ(num_chunks, 3);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result);
+  }
+
+  {
+    auto const [result, num_chunks] = chunked_read(filepath_no_null, output_limit{5'000'000UL});
+    EXPECT_EQ(num_chunks, 1);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_no_null, *result);
+  }
+
+  {
+    auto const [result, num_chunks] = chunked_read(filepath_with_nulls, output_limit{1'000'000UL});
+    EXPECT_EQ(num_chunks, 5);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result);
+  }
+
+  {
+    auto const [result, num_chunks] = chunked_read(filepath_with_nulls, output_limit{1'500'000UL});
+    EXPECT_EQ(num_chunks, 3);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result);
+  }
+
+  {
+    auto const [result, num_chunks] = chunked_read(filepath_with_nulls, output_limit{2'000'000UL});
+    EXPECT_EQ(num_chunks, 2);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result);
+  }
+
+  {
+    auto const [result, num_chunks] = chunked_read(filepath_with_nulls, output_limit{5'000'000UL});
+    EXPECT_EQ(num_chunks, 1);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected_with_nulls, *result);
+  }
+}
+
+TEST_F(OrcChunkedReaderTest, TestChunkedReadNullCount)
+{
+  auto constexpr num_rows = 100'000;
+
+  auto const sequence = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return 1; });
+  auto const validity =
+    cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 4 != 3; });
+  std::vector<std::unique_ptr<cudf::column>> cols;
+  cols.push_back(int32s_col{sequence, sequence + num_rows, validity}.release());
+  auto const expected = std::make_unique<cudf::table>(std::move(cols));
+
+  auto const filepath          = temp_env->get_temp_filepath("chunked_reader_null_count.orc");
+  auto const stripe_limit_rows = num_rows / 5;
+  auto const write_opts =
+    cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, *expected)
+      .stripe_size_rows(stripe_limit_rows)
+      .build();
+  cudf::io::write_orc(write_opts);
+
+  auto const byte_limit = stripe_limit_rows * sizeof(int);
+  auto const read_opts =
+    cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath}).build();
+  auto reader =
+    cudf::io::chunked_orc_reader(byte_limit, 0UL /*read_limit*/, stripe_limit_rows, read_opts);
+
+  do {
+    // Every fourth row is null.
+    EXPECT_EQ(reader.read_chunk().tbl->get_column(0).null_count(), stripe_limit_rows / 4UL);
+  } while (reader.has_next());
+}
+
+namespace {
+
+std::size_t constexpr input_limit_expected_file_count = 3;
+
+std::vector<std::string> input_limit_get_test_names(std::string const& base_filename)
+{
+  return {base_filename + "_a.orc", base_filename + "_b.orc", base_filename + "_c.orc"};
+}
+
+void input_limit_test_write_one(std::string const& filepath,
+                                cudf::table_view const& input,
+                                cudf::size_type stripe_size_rows,
+                                cudf::io::compression_type compression)
+{
+  auto const out_opts = cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, input)
+                          .compression(compression)
+                          .stripe_size_rows(stripe_size_rows)
+                          .build();
+  cudf::io::write_orc(out_opts);
+}
+
+void input_limit_test_write(
+  std::vector<std::string> const& test_files,
+  cudf::table_view const& input,
+  cudf::size_type stripe_size_rows = 20'000 /*write relatively small stripes by default*/)
+{
+  CUDF_EXPECTS(test_files.size() == input_limit_expected_file_count,
+               "Unexpected count of test filenames.");
+
+  // ZSTD yields a very small decompression size, can be much smaller than SNAPPY.
+  // However, ORC reader typically over-estimates the decompression size of data
+  // compressed by ZSTD to be very large, can be much larger than that of SNAPPY.
+  // That is because ZSTD may use a lot of scratch space at decode time
+  // (2.5x the total decompressed buffer size).
+  // As such, we may see smaller output chunks for the input data compressed by ZSTD.
+  input_limit_test_write_one(
+    test_files[0], input, stripe_size_rows, cudf::io::compression_type::NONE);
+  input_limit_test_write_one(
+    test_files[1], input, stripe_size_rows, cudf::io::compression_type::ZSTD);
+  input_limit_test_write_one(
+    test_files[2], input, stripe_size_rows, cudf::io::compression_type::SNAPPY);
+}
+
+void input_limit_test_read(int test_location,
+                           std::vector<std::string> const& test_files,
+                           cudf::table_view const& input,
+                           output_limit output_limit_bytes,
+                           input_limit input_limit_bytes,
+                           int const* expected_chunk_counts)
+{
+  CUDF_EXPECTS(test_files.size() == input_limit_expected_file_count,
+               "Unexpected count of test filenames.");
+
+  for (size_t idx = 0; idx < test_files.size(); ++idx) {
+    SCOPED_TRACE("Original line of failure: " + std::to_string(test_location) +
+                 ", file idx: " + std::to_string(idx));
+    auto const [result, num_chunks] =
+      chunked_read(test_files[idx], output_limit_bytes, input_limit_bytes);
+    EXPECT_EQ(expected_chunk_counts[idx], num_chunks);
+    // TODO: equal
+    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(*result, input);
+  }
+}
+
+}  // namespace
+
+struct OrcChunkedReaderInputLimitTest : public cudf::test::BaseFixture {};
+
+TEST_F(OrcChunkedReaderInputLimitTest, SingleFixedWidthColumn)
+{
+  auto constexpr num_rows = 1'000'000;
+  auto const iter1        = thrust::make_constant_iterator(15);
+  auto const col1         = doubles_col(iter1, iter1 + num_rows);
+
+  auto const filename   = std::string{"single_col_fixed_width"};
+  auto const test_files = input_limit_get_test_names(temp_env->get_temp_filepath(filename));
+  auto const input      = cudf::table_view{{col1}};
+  input_limit_test_write(test_files, input);
+
+  {
+    int constexpr expected[] = {50, 50, 50};
+    input_limit_test_read(
+      __LINE__, test_files, input, output_limit{0UL}, input_limit{1UL}, expected);
+  }
+
+  {
+    int constexpr expected[] = {17, 13, 10};
+    input_limit_test_read(
+      __LINE__, test_files, input, output_limit{0UL}, input_limit{2 * 1024 * 1024UL}, expected);
+  }
+}
+
+TEST_F(OrcChunkedReaderInputLimitTest, MixedColumns)
+{
+  auto constexpr num_rows = 1'000'000;
+
+  auto const iter1 = thrust::make_counting_iterator<int>(0);
+  auto const col1  = int32s_col(iter1, iter1 + num_rows);
+
+  auto const iter2 = thrust::make_counting_iterator<double>(0);
+  auto const col2  = doubles_col(iter2, iter2 + num_rows);
+
+  auto const strings  = std::vector<std::string>{"abc", "de", "fghi"};
+  auto const str_iter = cudf::detail::make_counting_transform_iterator(0, [&](int32_t i) {
+    if (i < 250000) { return strings[0]; }
+    if (i < 750000) { return strings[1]; }
+    return strings[2];
+  });
+  auto const col3     = strings_col(str_iter, str_iter + num_rows);
+
+  auto const filename   = std::string{"mixed_columns"};
+  auto const test_files = input_limit_get_test_names(temp_env->get_temp_filepath(filename));
+  auto const input      = cudf::table_view{{col1, col2, col3}};
+  input_limit_test_write(test_files, input);
+
+  {
+    int constexpr expected[] = {50, 50, 50};
+    input_limit_test_read(
+      __LINE__, test_files, input, output_limit{0UL}, input_limit{1UL}, expected);
+  }
+
+  {
+    int constexpr expected[] = {17, 50, 17};
+    input_limit_test_read(
+      __LINE__, test_files, input, output_limit{0UL}, input_limit{2 * 1024 * 1024UL}, expected);
+  }
+}
+
+namespace {
+
+struct offset_gen {
+  int const group_size;
+  __device__ int operator()(int i) const { return i * group_size; }
+};
+
+template <typename T>
+struct value_gen {
+  __device__ T operator()(int i) const { return i % 1024; }
+};
+
+struct char_values {
+  __device__ int8_t operator()(int i) const
+  {
+    int const index = (i / 2) % 3;
+    // Generate repeating 3-runs of 2 values each: "aabbccaabbcc...".
+    return index == 0 ? 'a' : (index == 1 ? 'b' : 'c');
+  }
+};
+
+}  // namespace
+
+TEST_F(OrcChunkedReaderInputLimitTest, ListType)
+{
+  int constexpr num_rows  = 50'000'000;
+  int constexpr list_size = 4;
+
+  auto const stream = cudf::get_default_stream();
+  auto const iter   = thrust::make_counting_iterator(0);
+
+  auto offset_col = cudf::make_fixed_width_column(
+    cudf::data_type{cudf::type_id::INT32}, num_rows + 1, cudf::mask_state::UNALLOCATED);
+  thrust::transform(rmm::exec_policy(stream),
+                    iter,
+                    iter + num_rows + 1,
+                    offset_col->mutable_view().begin<int>(),
+                    offset_gen{list_size});
+
+  int constexpr num_ints = num_rows * list_size;
+  auto value_col         = cudf::make_fixed_width_column(
+    cudf::data_type{cudf::type_id::INT32}, num_ints, cudf::mask_state::UNALLOCATED);
+  thrust::transform(rmm::exec_policy(stream),
+                    iter,
+                    iter + num_ints,
+                    value_col->mutable_view().begin<int>(),
+                    value_gen<int>{});
+
+  auto const lists_col =
+    cudf::make_lists_column(num_rows, std::move(offset_col), std::move(value_col), 0, {}, stream);
+
+  auto const filename   = std::string{"list_type"};
+  auto const test_files = input_limit_get_test_names(temp_env->get_temp_filepath(filename));
+  auto const input      = cudf::table_view{{*lists_col}};
+
+  // Although we set `stripe_size_rows` to be very large, the writer only write
+  // 250k rows (top level) per stripe due to having nested type.
+  // Thus, we have 200 stripes in total.
+  input_limit_test_write(test_files, input, cudf::io::default_stripe_size_rows);
+
+  {
+    int constexpr expected[] = {3, 40, 3};
+    input_limit_test_read(
+      __LINE__, test_files, input, output_limit{0UL}, input_limit{5 * 1024 * 1024UL}, expected);
+  }
+
+  {
+    int constexpr expected[] = {8, 40, 9};
+    input_limit_test_read(__LINE__,
+                          test_files,
+                          input,
+                          output_limit{128 * 1024 * 1024UL},
+                          input_limit{5 * 1024 * 1024UL},
+                          expected);
+  }
+}
+
+TEST_F(OrcChunkedReaderInputLimitTest, MixedColumnsHavingList)
+{
+  int constexpr num_rows  = 50'000'000;
+  int constexpr list_size = 4;
+  int constexpr str_size  = 3;
+
+  auto const stream = cudf::get_default_stream();
+  auto const iter   = thrust::make_counting_iterator(0);
+
+  // list<int>
+  auto offset_col = cudf::make_fixed_width_column(
+    cudf::data_type{cudf::type_id::INT32}, num_rows + 1, cudf::mask_state::UNALLOCATED);
+  thrust::transform(rmm::exec_policy(stream),
+                    iter,
+                    iter + num_rows + 1,
+                    offset_col->mutable_view().begin<int>(),
+                    offset_gen{list_size});
+
+  int constexpr num_ints = num_rows * list_size;
+  auto value_col         = cudf::make_fixed_width_column(
+    cudf::data_type{cudf::type_id::INT32}, num_ints, cudf::mask_state::UNALLOCATED);
+  thrust::transform(rmm::exec_policy(stream),
+                    iter,
+                    iter + num_ints,
+                    value_col->mutable_view().begin<int>(),
+                    value_gen<int>{});
+
+  auto const lists_col =
+    cudf::make_lists_column(num_rows, std::move(offset_col), std::move(value_col), 0, {}, stream);
+
+  // strings
+  int constexpr num_chars = num_rows * str_size;
+  auto str_offset_col     = cudf::make_fixed_width_column(
+    cudf::data_type{cudf::type_id::INT32}, num_rows + 1, cudf::mask_state::UNALLOCATED);
+  thrust::transform(rmm::exec_policy(stream),
+                    iter,
+                    iter + num_rows + 1,
+                    str_offset_col->mutable_view().begin<int>(),
+                    offset_gen{str_size});
+  rmm::device_buffer str_chars(num_chars, stream);
+  thrust::transform(rmm::exec_policy(stream),
+                    iter,
+                    iter + num_chars,
+                    static_cast<int8_t*>(str_chars.data()),
+                    char_values{});
+  auto const str_col =
+    cudf::make_strings_column(num_rows, std::move(str_offset_col), std::move(str_chars), 0, {});
+
+  // doubles
+  auto const double_col = cudf::make_fixed_width_column(
+    cudf::data_type{cudf::type_id::FLOAT64}, num_rows, cudf::mask_state::UNALLOCATED);
+  thrust::transform(rmm::exec_policy(stream),
+                    iter,
+                    iter + num_rows,
+                    double_col->mutable_view().begin<double>(),
+                    value_gen<double>{});
+
+  auto const filename   = std::string{"mixed_cols_having_list"};
+  auto const test_files = input_limit_get_test_names(temp_env->get_temp_filepath(filename));
+  auto const input      = cudf::table_view{{*lists_col, *str_col, *double_col}};
+
+  // Although we set `stripe_size_rows` to be very large, the writer only write
+  // 250k rows (top level) per stripe due to having nested type.
+  // Thus, we have 200 stripes in total.
+  input_limit_test_write(test_files, input, cudf::io::default_stripe_size_rows);
+
+  {
+    int constexpr expected[] = {13, 8, 6};
+    input_limit_test_read(
+      __LINE__, test_files, input, output_limit{0UL}, input_limit{128 * 1024 * 1024UL}, expected);
+  }
+
+  {
+    int constexpr expected[] = {13, 15, 17};
+    input_limit_test_read(__LINE__,
+                          test_files,
+                          input,
+                          output_limit{128 * 1024 * 1024UL},
+                          input_limit{128 * 1024 * 1024UL},
+                          expected);
+  }
+}
+
+TEST_F(OrcChunkedReaderInputLimitTest, ReadWithRowSelection)
+{
+  // `num_rows` should not be divisible by `stripe_size_rows`, to test the correctness of row
+  // selections.
+  int64_t constexpr num_rows    = 100'517'687l;
+  int constexpr rows_per_stripe = 100'000;
+  static_assert(num_rows % rows_per_stripe != 0,
+                "`num_rows` should not be divisible by `stripe_size_rows`.");
+
+  auto const it    = thrust::make_counting_iterator(0);
+  auto const col   = int32s_col(it, it + num_rows);
+  auto const input = cudf::table_view{{col}};
+
+  auto const filepath = temp_env->get_temp_filepath("chunk_read_with_row_selection.orc");
+  auto const write_opts =
+    cudf::io::orc_writer_options::builder(cudf::io::sink_info{filepath}, input)
+      .stripe_size_rows(rows_per_stripe)
+      .build();
+  cudf::io::write_orc(write_opts);
+
+  // Verify metadata.
+  auto const metadata = cudf::io::read_orc_metadata(cudf::io::source_info{filepath});
+  EXPECT_EQ(metadata.num_rows(), num_rows);
+  EXPECT_EQ(metadata.num_stripes(), num_rows / rows_per_stripe + 1);
+
+  int constexpr random_val = 123456;
+
+  // Read some random number or rows that is not stripe size.
+  int constexpr num_rows_to_read = rows_per_stripe * 5 + random_val;
+
+  // Just shift the read data region back by a random offset.
+  const auto num_rows_to_skip = num_rows - num_rows_to_read - random_val;
+
+  const auto sequence_start = num_rows_to_skip % num_rows;
+  auto const skipped_col = int32s_col(it + sequence_start, it + sequence_start + num_rows_to_read);
+  auto const expected    = cudf::table_view{{skipped_col}};
+
+  auto const read_opts = cudf::io::orc_reader_options::builder(cudf::io::source_info{filepath})
+                           .use_index(false)
+                           .skip_rows(num_rows_to_skip)
+                           .num_rows(num_rows_to_read)
+                           .build();
+
+  auto reader = cudf::io::chunked_orc_reader(
+    60'000UL * sizeof(int) /*output limit, equal to 60k rows, less than rows in 1 stripe*/,
+    rows_per_stripe * sizeof(int) /*input limit, around size of 1 stripe's decoded data*/,
+    50'000 /*output granularity, or minimum number of rows for the output chunk*/,
+    read_opts);
+
+  auto num_chunks  = 0;
+  auto read_tables = std::vector<std::unique_ptr<cudf::table>>{};
+  auto tviews      = std::vector<cudf::table_view>{};
+
+  do {
+    auto chunk = reader.read_chunk();
+    // Each output chunk should have either exactly 50k rows, or num_rows_to_read % 50k.
+    EXPECT_TRUE(chunk.tbl->num_rows() == 50000 ||
+                chunk.tbl->num_rows() == num_rows_to_read % 50000);
+
+    tviews.emplace_back(chunk.tbl->view());
+    read_tables.emplace_back(std::move(chunk.tbl));
+    ++num_chunks;
+  } while (reader.has_next());
+
+  auto const read_result = cudf::concatenate(tviews);
+  EXPECT_EQ(num_chunks, 13);
+  CUDF_TEST_EXPECT_TABLES_EQUAL(expected, read_result->view());
+}
+
+TEST_F(OrcChunkedReaderInputLimitTest, SizeTypeRowsOverflow)
+{
+  using data_type = int16_t;
+  using data_col  = cudf::test::fixed_width_column_wrapper<data_type, int64_t>;
+
+  int64_t constexpr num_rows    = 500'000'000l;
+  int constexpr rows_per_stripe = 1'000'000;
+  int constexpr num_reps        = 10;
+  int64_t constexpr total_rows  = num_rows * num_reps;
+  static_assert(total_rows > std::numeric_limits<cudf::size_type>::max());
+
+  auto const it  = cudf::detail::make_counting_transform_iterator(0l, [num_rows](int64_t i) {
+    return (i % num_rows) % static_cast<int64_t>(std::numeric_limits<data_type>::max() / 2);
+  });
+  auto const col = data_col(it, it + num_rows);
+  auto const chunk_table = cudf::table_view{{col}};
+
+  std::vector<char> data_buffer;
+  {
+    auto const write_opts =
+      cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info{&data_buffer})
+        .stripe_size_rows(rows_per_stripe)
+        .build();
+
+    auto writer = cudf::io::orc_chunked_writer(write_opts);
+    for (int i = 0; i < num_reps; ++i) {
+      writer.write(chunk_table);
+    }
+  }
+
+  // Verify metadata.
+  auto const metadata =
+    cudf::io::read_orc_metadata(cudf::io::source_info{data_buffer.data(), data_buffer.size()});
+  EXPECT_EQ(metadata.num_rows(), total_rows);
+  EXPECT_EQ(metadata.num_stripes(), total_rows / rows_per_stripe);
+
+  // Read with row selections and memory limit.
+  {
+    int64_t constexpr num_rows_to_read = 5'000'000l;
+    int64_t const num_rows_to_skip =
+      static_cast<int64_t>(metadata.num_rows()) - num_rows_to_read -
+      123456l /*just shift the read data region back by a random offset*/;
+
+    // Check validity of the last 5 million rows.
+    auto const sequence_start = num_rows_to_skip % num_rows;
+    auto const skipped_col = data_col(it + sequence_start, it + sequence_start + num_rows_to_read);
+    auto const expected    = cudf::table_view{{skipped_col}};
+
+    auto const read_opts = cudf::io::orc_reader_options::builder(
+                             cudf::io::source_info{data_buffer.data(), data_buffer.size()})
+                             .use_index(false)
+                             .skip_rows(num_rows_to_skip)
+                             .num_rows(num_rows_to_read)
+                             .build();
+    auto reader = cudf::io::chunked_orc_reader(
+      600'000UL * sizeof(data_type) /* output limit, equal to 600k rows */,
+      rows_per_stripe * sizeof(data_type) /* input limit, around size of 1 stripe's decoded data */,
+      rows_per_stripe / 2 /* output granularity, or minimum number of rows for the output chunk */,
+      read_opts);
+
+    auto num_chunks  = 0;
+    auto read_tables = std::vector<std::unique_ptr<cudf::table>>{};
+    auto tviews      = std::vector<cudf::table_view>{};
+
+    do {
+      auto chunk = reader.read_chunk();
+      ++num_chunks;
+      tviews.emplace_back(chunk.tbl->view());
+      read_tables.emplace_back(std::move(chunk.tbl));
+    } while (reader.has_next());
+
+    auto const read_result = cudf::concatenate(tviews);
+    EXPECT_EQ(num_chunks, 11);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, read_result->view());
+  }
+
+  // The test below requires a huge amount of memory, thus it is disabled by default.
+#ifdef LOCAL_TEST
+  // Read with only output limit -- there is no limit on the memory usage.
+  // However, the reader should be able to detect and load only enough stripes each time
+  // to avoid decoding a table having number of rows that exceeds the column size limit.
+  {
+    auto const read_opts = cudf::io::orc_reader_options::builder(
+                             cudf::io::source_info{data_buffer.data(), data_buffer.size()})
+                             .use_index(false)
+                             .build();
+    auto reader = cudf::io::chunked_orc_reader(
+      static_cast<std::size_t>(rows_per_stripe * 5.7) *
+        sizeof(data_type) /* output limit, equal to 5.7M rows */,
+      0UL /* no input limit */,
+      rows_per_stripe / 2 /* output granularity, or minimum number of rows for the output chunk */,
+      read_opts);
+
+    int num_chunks          = 0;
+    int64_t num_read_rows   = 0;
+    int64_t test_rows_start = 0;
+    auto test_chunk         = std::unique_ptr<cudf::table>{};
+
+    do {
+      auto chunk            = reader.read_chunk();
+      auto const chunk_rows = chunk.tbl->num_rows();
+
+      // Just randomly select one output chunk to verify.
+      if (num_chunks == 123) {
+        test_rows_start = num_read_rows;
+        test_chunk      = std::move(chunk.tbl);
+      }
+
+      ++num_chunks;
+      num_read_rows += chunk_rows;
+    } while (reader.has_next());
+
+    EXPECT_EQ(num_read_rows, total_rows);
+
+    // Typically, we got a chunk having 5M rows.
+    // However, since the reader internally splits file stripes that are not multiple of 5 stripes,
+    // we may have some extra chunks that have less than 5M rows.
+    EXPECT_EQ(num_chunks, 1002);
+
+    // Verify the selected chunk.
+    using namespace cudf::test::iterators;
+    auto const skipped_col =
+      data_col(it + test_rows_start, it + test_rows_start + test_chunk->num_rows(), no_nulls());
+    auto const expected = cudf::table_view{{skipped_col}};
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, test_chunk->view());
+  }
+
+#endif  // LOCAL_TEST
+}
diff --git a/cpp/tests/io/row_selection_test.cpp b/cpp/tests/io/row_selection_test.cpp
index 0c259c81a23..ebadd870091 100644
--- a/cpp/tests/io/row_selection_test.cpp
+++ b/cpp/tests/io/row_selection_test.cpp
@@ -122,17 +122,4 @@ TEST_F(FromOptsTest, LimitOptionsToFileRows)
   }
 }
 
-TEST_F(FromOptsTest, OverFlowDetection)
-{
-  auto const too_large_for_32bit = std::numeric_limits<int64_t>::max();
-
-  // Too many rows to read until the end of the file
-  EXPECT_THROW(skip_rows_num_rows_from_options(0, std::nullopt, too_large_for_32bit),
-               std::overflow_error);
-
-  // Should work fine with num_rows
-  EXPECT_NO_THROW(
-    skip_rows_num_rows_from_options(1000, too_large_for_32bit - 100, too_large_for_32bit));
-}
-
 CUDF_TEST_PROGRAM_MAIN()
diff --git a/python/cudf/cudf/_lib/cpp/io/orc.pxd b/python/cudf/cudf/_lib/cpp/io/orc.pxd
index d5ac8574fe4..d5bb1726a43 100644
--- a/python/cudf/cudf/_lib/cpp/io/orc.pxd
+++ b/python/cudf/cudf/_lib/cpp/io/orc.pxd
@@ -1,6 +1,6 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
-from libc.stdint cimport uint8_t
+from libc.stdint cimport int64_t, uint8_t
 from libcpp cimport bool
 from libcpp.map cimport map
 from libcpp.memory cimport shared_ptr, unique_ptr
@@ -21,8 +21,8 @@ cdef extern from "cudf/io/orc.hpp" \
 
         cudf_io_types.source_info get_source() except +
         vector[vector[size_type]] get_stripes() except +
-        size_type get_skip_rows() except +
-        size_type get_num_rows() except +
+        int64_t get_skip_rows() except +
+        optional[int64_t] get_num_rows() except +
         bool is_enabled_use_index() except +
         bool is_enabled_use_np_dtypes() except +
         data_type get_timestamp_type() except +
@@ -31,8 +31,8 @@ cdef extern from "cudf/io/orc.hpp" \
 
         void set_columns(vector[string] col_names) except +
         void set_stripes(vector[vector[size_type]] strps) except +
-        void set_skip_rows(size_type rows) except +
-        void set_num_rows(size_type nrows) except +
+        void set_skip_rows(int64_t rows) except +
+        void set_num_rows(int64_t nrows) except +
         void enable_use_index(bool val) except +
         void enable_use_np_dtypes(bool val) except +
         void set_timestamp_type(data_type type) except +
@@ -49,8 +49,8 @@ cdef extern from "cudf/io/orc.hpp" \
         orc_reader_options_builder& columns(vector[string] col_names) except +
         orc_reader_options_builder& \
             stripes(vector[vector[size_type]] strps) except +
-        orc_reader_options_builder& skip_rows(size_type rows) except +
-        orc_reader_options_builder& num_rows(size_type nrows) except +
+        orc_reader_options_builder& skip_rows(int64_t rows) except +
+        orc_reader_options_builder& num_rows(int64_t nrows) except +
         orc_reader_options_builder& use_index(bool val) except +
         orc_reader_options_builder& use_np_dtypes(bool val) except +
         orc_reader_options_builder& timestamp_type(data_type type) except +
diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx
index 836880a6f2c..918880648bf 100644
--- a/python/cudf/cudf/_lib/orc.pyx
+++ b/python/cudf/cudf/_lib/orc.pyx
@@ -472,11 +472,11 @@ cdef int64_t get_skiprows_arg(object arg) except*:
         raise TypeError("skiprows must be an int >= 0")
     return <int64_t> arg
 
-cdef size_type get_num_rows_arg(object arg) except*:
+cdef int64_t get_num_rows_arg(object arg) except*:
     arg = -1 if arg is None else arg
     if not isinstance(arg, int) or arg < -1:
         raise TypeError("num_rows must be an int >= -1")
-    return <size_type> arg
+    return <int64_t> arg
 
 
 cdef orc_reader_options make_orc_reader_options(
@@ -484,7 +484,7 @@ cdef orc_reader_options make_orc_reader_options(
     object column_names,
     object stripes,
     int64_t skip_rows,
-    size_type num_rows,
+    int64_t num_rows,
     type_id timestamp_type,
     bool use_index
 ) except*:

From a27feabf1a46fd3fcf388c7cbadb49831416f8ba Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 3 May 2024 01:31:58 -1000
Subject: [PATCH 140/842] Preserve column metadata during more DataFrame
 operations (#15519)

Supersedes https://github.com/rapidsai/cudf/pull/15410/, adds a `ColumnAccessor._from_columns_like_self` that will preserve column attributes during DataFrame operations. This can wholly replace `_from_data_like_self`

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/15519
---
 python/cudf/cudf/core/column_accessor.py | 33 ++++++++++++++++++++++++
 python/cudf/cudf/core/dataframe.py       | 16 +++++++-----
 python/cudf/cudf/core/frame.py           | 33 ++++++++++++++----------
 python/cudf/cudf/core/indexed_frame.py   | 16 +++++++-----
 python/cudf/cudf/core/multiindex.py      |  6 +++--
 python/cudf/cudf/core/series.py          |  4 ++-
 python/cudf/cudf/tests/test_dataframe.py | 20 ++++++++++++++
 7 files changed, 99 insertions(+), 29 deletions(-)

diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py
index 33085bede78..fbce6e02330 100644
--- a/python/cudf/cudf/core/column_accessor.py
+++ b/python/cudf/cudf/core/column_accessor.py
@@ -3,6 +3,7 @@
 from __future__ import annotations
 
 import itertools
+import sys
 from collections import abc
 from functools import cached_property, reduce
 from typing import (
@@ -174,6 +175,38 @@ def __repr__(self) -> str:
         )
         return f"{type_info}\n{column_info}"
 
+    def _from_columns_like_self(
+        self, columns: abc.Iterable[ColumnBase], verify: bool = True
+    ):
+        """
+        Return a new ColumnAccessor with columns and the properties of self.
+
+        Parameters
+        ----------
+        columns : iterable of Columns
+            New columns for the ColumnAccessor.
+        verify : bool, optional
+            Whether to verify column length and type.
+        """
+        if sys.version_info.major >= 3 and sys.version_info.minor >= 10:
+            data = zip(self.names, columns, strict=True)
+        else:
+            columns = list(columns)
+            if len(columns) != len(self.names):
+                raise ValueError(
+                    f"The number of columns ({len(columns)}) must match "
+                    f"the number of existing column labels ({len(self.names)})."
+                )
+            data = zip(self.names, columns)
+        return type(self)(
+            data=dict(data),
+            multiindex=self.multiindex,
+            level_names=self.level_names,
+            rangeindex=self.rangeindex,
+            label_dtype=self.label_dtype,
+            verify=verify,
+        )
+
     @property
     def level_names(self) -> Tuple[Any, ...]:
         if self._level_names is None or len(self._level_names) == 0:
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 1e6ae861679..bf8201e4dc1 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -3036,8 +3036,11 @@ def where(self, cond, other=None, inplace=False):
 
         # First process the condition.
         if isinstance(cond, Series):
-            cond = self._from_data_like_self(
-                {name: cond._column for name in self._column_names},
+            cond = self._from_data(
+                self._data._from_columns_like_self(
+                    itertools.repeat(cond._column, len(self._column_names)),
+                    verify=False,
+                )
             )
         elif hasattr(cond, "__cuda_array_interface__"):
             cond = DataFrame(
@@ -3078,7 +3081,7 @@ def where(self, cond, other=None, inplace=False):
                 should be equal to number of columns of self"""
             )
 
-        out = {}
+        out = []
         for (name, col), other_col in zip(self._data.items(), other_cols):
             col, other_col = _check_and_cast_columns_with_other(
                 source_col=col,
@@ -3091,16 +3094,17 @@ def where(self, cond, other=None, inplace=False):
                     col, other_col, cond_col
                 )
 
-                out[name] = _make_categorical_like(result, self._data[name])
+                out.append(_make_categorical_like(result, self._data[name]))
             else:
                 out_mask = cudf._lib.null_mask.create_null_mask(
                     len(col),
                     state=cudf._lib.null_mask.MaskState.ALL_NULL,
                 )
-                out[name] = col.set_mask(out_mask)
+                out.append(col.set_mask(out_mask))
 
         return self._mimic_inplace(
-            self._from_data_like_self(out), inplace=inplace
+            self._from_data_like_self(self._data._from_columns_like_self(out)),
+            inplace=inplace,
         )
 
     @docutils.doc_apply(
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index cd42bf52ea1..017190ab5b4 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -1120,7 +1120,9 @@ def isna(self):
         array([False, False,  True,  True, False, False])
         """
         data_columns = (col.isnull() for col in self._columns)
-        return self._from_data_like_self(zip(self._column_names, data_columns))
+        return self._from_data_like_self(
+            self._data._from_columns_like_self(data_columns)
+        )
 
     # Alias for isna
     isnull = isna
@@ -1199,7 +1201,9 @@ def notna(self):
         array([ True,  True, False, False,  True,  True])
         """
         data_columns = (col.notnull() for col in self._columns)
-        return self._from_data_like_self(zip(self._column_names, data_columns))
+        return self._from_data_like_self(
+            self._data._from_columns_like_self(data_columns)
+        )
 
     # Alias for notna
     notnull = notna
@@ -1506,7 +1510,9 @@ def _encode(self):
     @_cudf_nvtx_annotate
     def _unaryop(self, op):
         data_columns = (col.unary_operator(op) for col in self._columns)
-        return self._from_data_like_self(zip(self._column_names, data_columns))
+        return self._from_data_like_self(
+            self._data._from_columns_like_self(data_columns)
+        )
 
     @classmethod
     @_cudf_nvtx_annotate
@@ -1638,12 +1644,14 @@ def _apply_cupy_ufunc_to_operands(
     def __neg__(self):
         """Negate for integral dtypes, logical NOT for bools."""
         return self._from_data_like_self(
-            {
-                name: col.unary_operator("not")
-                if is_bool_dtype(col.dtype)
-                else -1 * col
-                for name, col in self._data.items()
-            }
+            self._data._from_columns_like_self(
+                (
+                    col.unary_operator("not")
+                    if col.dtype.kind == "b"
+                    else -1 * col
+                    for col in self._data.columns
+                )
+            )
         )
 
     @_cudf_nvtx_annotate
@@ -1897,10 +1905,9 @@ def __copy__(self):
     def __invert__(self):
         """Bitwise invert (~) for integral dtypes, logical NOT for bools."""
         return self._from_data_like_self(
-            {
-                name: _apply_inverse_column(col)
-                for name, col in self._data.items()
-            }
+            self._data._from_columns_like_self(
+                (_apply_inverse_column(col) for col in self._data.columns)
+            )
         )
 
     @_cudf_nvtx_annotate
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index bec97bd3290..62ee780ebbb 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -1903,13 +1903,15 @@ def nans_to_nulls(self):
         1  <NA>  3.14
         2  <NA>  <NA>
         """
-        result_data = {}
-        for name, col in self._data.items():
-            try:
-                result_data[name] = col.nans_to_nulls()
-            except AttributeError:
-                result_data[name] = col.copy()
-        return self._from_data_like_self(result_data)
+        result = (
+            col.nans_to_nulls()
+            if isinstance(col, cudf.core.column.NumericalColumn)
+            else col.copy()
+            for col in self._data.columns
+        )
+        return self._from_data_like_self(
+            self._data._from_columns_like_self(result)
+        )
 
     def _copy_type_metadata(
         self,
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index 019daacddba..1ab42df111f 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -2088,6 +2088,8 @@ def _split_columns_by_levels(self, levels):
         return data_columns, index_columns, data_names, index_names
 
     def repeat(self, repeats, axis=None):
-        return self._from_columns_like_self(
-            Frame._repeat([*self._columns], repeats, axis), self._column_names
+        return self._from_data(
+            self._data._from_columns_like_self(
+                super()._repeat([*self._columns], repeats, axis)
+            )
         )
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 275dc664175..b6ed28f9093 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -3654,7 +3654,9 @@ def pct_change(
     def where(self, cond, other=None, inplace=False):
         result_col = super().where(cond, other, inplace)
         return self._mimic_inplace(
-            self._from_data_like_self({self.name: result_col}),
+            self._from_data_like_self(
+                self._data._from_columns_like_self([result_col])
+            ),
             inplace=inplace,
         )
 
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index e287603de07..f52076407b5 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -10986,3 +10986,23 @@ def test_squeeze(axis, data):
     result = df.squeeze(axis=axis)
     expected = df.to_pandas().squeeze(axis=axis)
     assert_eq(result, expected)
+
+
+@pytest.mark.parametrize("column", [range(1), np.array([1], dtype=np.int8)])
+@pytest.mark.parametrize(
+    "operation",
+    [
+        lambda df: df.where(df < 2, 2),
+        lambda df: df.nans_to_nulls(),
+        lambda df: df.isna(),
+        lambda df: df.notna(),
+        lambda df: abs(df),
+        lambda df: -df,
+        lambda df: ~df,
+    ],
+)
+def test_op_preserves_column_metadata(column, operation):
+    df = cudf.DataFrame([1], columns=cudf.Index(column))
+    result = operation(df).columns
+    expected = pd.Index(column)
+    pd.testing.assert_index_equal(result, expected, exact=True)

From c60860dfb3ac78a8439966e0fa5c7282b9988b15 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Fri, 3 May 2024 09:03:02 -0400
Subject: [PATCH 141/842] Fix make_offsets_child_column usage in
 cudf::strings::detail::shift (#15630)

Fixes the `cudf::strings::detail::shift()` function to use the correct `make_offsets_child_column` function to support large strings.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/15630
---
 cpp/src/strings/copying/shift.cu | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cpp/src/strings/copying/shift.cu b/cpp/src/strings/copying/shift.cu
index 562ee6a7088..5bba4855390 100644
--- a/cpp/src/strings/copying/shift.cu
+++ b/cpp/src/strings/copying/shift.cu
@@ -19,8 +19,8 @@
 #include <cudf/detail/copy.hpp>
 #include <cudf/detail/get_value.cuh>
 #include <cudf/detail/iterator.cuh>
-#include <cudf/detail/sizes_to_offsets_iterator.cuh>
 #include <cudf/strings/detail/copying.hpp>
+#include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/detail/utilities.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -104,8 +104,8 @@ std::unique_ptr<column> shift(strings_column_view const& input,
   auto const d_input = column_device_view::create(input.parent(), stream);
   auto sizes_itr     = cudf::detail::make_counting_transform_iterator(
     0, output_sizes_fn{*d_input, d_fill_str, offset});
-  auto [offsets_column, total_bytes] =
-    cudf::detail::make_offsets_child_column(sizes_itr, sizes_itr + input.size(), stream, mr);
+  auto [offsets_column, total_bytes] = cudf::strings::detail::make_offsets_child_column(
+    sizes_itr, sizes_itr + input.size(), stream, mr);
   auto offsets_view = offsets_column->view();
 
   // compute the shift-offset for the output characters child column

From 18f2e7a84a03342bf6305f63ae1f8164ffbccd99 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Fri, 3 May 2024 09:03:59 -0400
Subject: [PATCH 142/842] Large strings support in MD5 and SHA hashers (#15631)

Updates the hash functions for md5 and sha to support creating large strings results.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/15631
---
 cpp/src/hash/md5_hash.cu  |  4 ++--
 cpp/src/hash/sha_hash.cuh | 29 +++++++++++++++--------------
 2 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/cpp/src/hash/md5_hash.cu b/cpp/src/hash/md5_hash.cu
index 8f490ada8ff..0b559e8e86c 100644
--- a/cpp/src/hash/md5_hash.cu
+++ b/cpp/src/hash/md5_hash.cu
@@ -309,7 +309,7 @@ std::unique_ptr<column> md5(table_view const& input,
   // Result column allocation and creation
   auto begin = thrust::make_constant_iterator(digest_size);
   auto [offsets_column, bytes] =
-    cudf::detail::make_offsets_child_column(begin, begin + input.num_rows(), stream, mr);
+    cudf::strings::detail::make_offsets_child_column(begin, begin + input.num_rows(), stream, mr);
 
   rmm::device_uvector<char> chars(bytes, stream, mr);
   auto d_chars = chars.data();
@@ -322,7 +322,7 @@ std::unique_ptr<column> md5(table_view const& input,
     thrust::make_counting_iterator(0),
     thrust::make_counting_iterator(input.num_rows()),
     [d_chars, device_input = *device_input] __device__(auto row_index) {
-      MD5Hasher hasher(d_chars + (row_index * digest_size));
+      MD5Hasher hasher(d_chars + (static_cast<int64_t>(row_index) * digest_size));
       for (auto const& col : device_input) {
         if (col.is_valid(row_index)) {
           if (col.type().id() == type_id::LIST) {
diff --git a/cpp/src/hash/sha_hash.cuh b/cpp/src/hash/sha_hash.cuh
index 005578cb2c2..6976241057e 100644
--- a/cpp/src/hash/sha_hash.cuh
+++ b/cpp/src/hash/sha_hash.cuh
@@ -518,7 +518,7 @@ std::unique_ptr<column> sha_hash(table_view const& input,
   // Result column allocation and creation
   auto begin = thrust::make_constant_iterator(Hasher::digest_size);
   auto [offsets_column, bytes] =
-    cudf::detail::make_offsets_child_column(begin, begin + input.num_rows(), stream, mr);
+    cudf::strings::detail::make_offsets_child_column(begin, begin + input.num_rows(), stream, mr);
 
   auto chars   = rmm::device_uvector<char>(bytes, stream, mr);
   auto d_chars = chars.data();
@@ -526,19 +526,20 @@ std::unique_ptr<column> sha_hash(table_view const& input,
   auto const device_input = table_device_view::create(input, stream);
 
   // Hash each row, hashing each element sequentially left to right
-  thrust::for_each(rmm::exec_policy(stream),
-                   thrust::make_counting_iterator(0),
-                   thrust::make_counting_iterator(input.num_rows()),
-                   [d_chars, device_input = *device_input] __device__(auto row_index) {
-                     Hasher hasher(d_chars + (row_index * Hasher::digest_size));
-                     for (auto const& col : device_input) {
-                       if (col.is_valid(row_index)) {
-                         cudf::type_dispatcher<dispatch_storage_type>(
-                           col.type(), HasherDispatcher(&hasher, col), row_index);
-                       }
-                     }
-                     hasher.finalize();
-                   });
+  thrust::for_each(
+    rmm::exec_policy(stream),
+    thrust::make_counting_iterator(0),
+    thrust::make_counting_iterator(input.num_rows()),
+    [d_chars, device_input = *device_input] __device__(auto row_index) {
+      Hasher hasher(d_chars + (static_cast<int64_t>(row_index) * Hasher::digest_size));
+      for (auto const& col : device_input) {
+        if (col.is_valid(row_index)) {
+          cudf::type_dispatcher<dispatch_storage_type>(
+            col.type(), HasherDispatcher(&hasher, col), row_index);
+        }
+      }
+      hasher.finalize();
+    });
 
   return make_strings_column(input.num_rows(), std::move(offsets_column), chars.release(), 0, {});
 }

From 35d77afab14d4d5a5faec321bdb2d87112c07eb2 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Fri, 3 May 2024 09:11:27 -0400
Subject: [PATCH 143/842] Use experimental make_strings_children for strings
 convert (#15629)

Updates strings convert functions to use the new experimental `make_strings_children` which supports building large strings.

Reference https://github.com/rapidsai/cudf/issues/15579

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Paul Mattione (https://github.com/pmattione-nvidia)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/15629
---
 cpp/src/strings/convert/convert_booleans.cu   | 17 +++++++--------
 cpp/src/strings/convert/convert_datetime.cu   | 12 +++++------
 cpp/src/strings/convert/convert_durations.cu  | 21 +++++++++----------
 .../strings/convert/convert_fixed_point.cu    | 16 +++++++-------
 cpp/src/strings/convert/convert_floats.cu     | 15 +++++++------
 cpp/src/strings/convert/convert_hex.cu        | 11 +++++-----
 cpp/src/strings/convert/convert_integers.cu   | 15 +++++++------
 cpp/src/strings/convert/convert_ipv4.cu       | 11 +++++-----
 8 files changed, 56 insertions(+), 62 deletions(-)

diff --git a/cpp/src/strings/convert/convert_booleans.cu b/cpp/src/strings/convert/convert_booleans.cu
index bf73800ad06..6b64006fa24 100644
--- a/cpp/src/strings/convert/convert_booleans.cu
+++ b/cpp/src/strings/convert/convert_booleans.cu
@@ -16,23 +16,19 @@
 
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
-#include <cudf/detail/iterator.cuh>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/strings/convert/convert_booleans.hpp>
 #include <cudf/strings/detail/converters.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
-#include <cudf/utilities/traits.hpp>
-#include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
 #include <rmm/resource_ref.hpp>
 
-#include <thrust/for_each.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/transform.h>
 
@@ -99,13 +95,14 @@ struct from_booleans_fn {
   column_device_view const d_column;
   string_view d_true;
   string_view d_false;
-  size_type* d_offsets{};
+  size_type* d_sizes{};
   char* d_chars{};
+  cudf::detail::input_offsetalator d_offsets;
 
   __device__ void operator()(size_type idx) const
   {
     if (d_column.is_null(idx)) {
-      if (d_chars == nullptr) { d_offsets[idx] = 0; }
+      if (d_chars == nullptr) { d_sizes[idx] = 0; }
       return;
     }
 
@@ -113,7 +110,7 @@ struct from_booleans_fn {
       auto const result = d_column.element<bool>(idx) ? d_true : d_false;
       memcpy(d_chars + d_offsets[idx], result.data(), result.size_bytes());
     } else {
-      d_offsets[idx] = d_column.element<bool>(idx) ? d_true.size_bytes() : d_false.size_bytes();
+      d_sizes[idx] = d_column.element<bool>(idx) ? d_true.size_bytes() : d_false.size_bytes();
     }
   };
 };
@@ -143,8 +140,8 @@ std::unique_ptr<column> from_booleans(column_view const& booleans,
   // copy null mask
   rmm::device_buffer null_mask = cudf::detail::copy_bitmask(booleans, stream, mr);
 
-  auto [offsets, chars] =
-    make_strings_children(from_booleans_fn{d_column, d_true, d_false}, strings_count, stream, mr);
+  auto [offsets, chars] = experimental::make_strings_children(
+    from_booleans_fn{d_column, d_true, d_false}, strings_count, stream, mr);
 
   return make_strings_column(strings_count,
                              std::move(offsets),
diff --git a/cpp/src/strings/convert/convert_datetime.cu b/cpp/src/strings/convert/convert_datetime.cu
index d6449fbb6c8..ddf68eae951 100644
--- a/cpp/src/strings/convert/convert_datetime.cu
+++ b/cpp/src/strings/convert/convert_datetime.cu
@@ -22,7 +22,7 @@
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/strings/convert/convert_datetime.hpp>
 #include <cudf/strings/detail/converters.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
@@ -37,7 +37,6 @@
 #include <rmm/resource_ref.hpp>
 
 #include <thrust/execution_policy.h>
-#include <thrust/for_each.h>
 #include <thrust/functional.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/logical.h>
@@ -756,8 +755,9 @@ struct datetime_formatter_fn {
   column_device_view const d_timestamps;
   column_device_view const d_format_names;
   device_span<format_item const> const d_format_items;
-  size_type* d_offsets{};
+  size_type* d_sizes{};
   char* d_chars{};
+  cudf::detail::input_offsetalator d_offsets;
 
   /**
    * @brief Specialized modulo expression that handles negative values.
@@ -1087,14 +1087,14 @@ struct datetime_formatter_fn {
   __device__ void operator()(size_type idx) const
   {
     if (d_timestamps.is_null(idx)) {
-      if (!d_chars) { d_offsets[idx] = 0; }
+      if (!d_chars) { d_sizes[idx] = 0; }
       return;
     }
     auto const tstamp = d_timestamps.element<T>(idx);
     if (d_chars) {
       timestamp_to_string(tstamp, d_chars + d_offsets[idx]);
     } else {
-      d_offsets[idx] = compute_output_size(tstamp);
+      d_sizes[idx] = compute_output_size(tstamp);
     }
   }
 };
@@ -1109,7 +1109,7 @@ struct dispatch_from_timestamps_fn {
                               rmm::cuda_stream_view stream,
                               rmm::device_async_resource_ref mr) const
   {
-    return make_strings_children(
+    return experimental::make_strings_children(
       datetime_formatter_fn<T>{d_timestamps, d_format_names, d_format_items},
       d_timestamps.size(),
       stream,
diff --git a/cpp/src/strings/convert/convert_durations.cu b/cpp/src/strings/convert/convert_durations.cu
index 77c750848cf..faf9a83f016 100644
--- a/cpp/src/strings/convert/convert_durations.cu
+++ b/cpp/src/strings/convert/convert_durations.cu
@@ -17,7 +17,7 @@
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/strings/detail/convert/int_to_string.cuh>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 
@@ -26,10 +26,8 @@
 #include <rmm/resource_ref.hpp>
 
 #include <thrust/execution_policy.h>
-#include <thrust/for_each.h>
 #include <thrust/functional.h>
 #include <thrust/iterator/counting_iterator.h>
-#include <thrust/iterator/transform_iterator.h>
 #include <thrust/transform.h>
 #include <thrust/transform_reduce.h>
 
@@ -192,8 +190,9 @@ struct from_durations_fn {
   column_device_view d_durations;
   format_item const* d_format_items;
   size_type items_count;
-  size_type* d_offsets{};
+  size_type* d_sizes{};
   char* d_chars{};
+  cudf::detail::input_offsetalator d_offsets;
 
   __device__ int8_t format_length(char format_char, duration_component const* const timeparts) const
   {
@@ -378,14 +377,14 @@ struct from_durations_fn {
   __device__ void operator()(size_type idx)
   {
     if (d_durations.is_null(idx)) {
-      if (d_chars == nullptr) { d_offsets[idx] = 0; }
+      if (d_chars == nullptr) { d_sizes[idx] = 0; }
       return;
     }
 
     if (d_chars != nullptr) {
       set_chars(idx);
     } else {
-      d_offsets[idx] = string_size(d_durations.template element<T>(idx));
+      d_sizes[idx] = string_size(d_durations.template element<T>(idx));
     }
   }
 };
@@ -415,11 +414,11 @@ struct dispatch_from_durations_fn {
     // copy null mask
     rmm::device_buffer null_mask = cudf::detail::copy_bitmask(durations, stream, mr);
 
-    auto [offsets, chars] =
-      make_strings_children(from_durations_fn<T>{d_column, d_format_items, compiler.items_count()},
-                            strings_count,
-                            stream,
-                            mr);
+    auto [offsets, chars] = experimental::make_strings_children(
+      from_durations_fn<T>{d_column, d_format_items, compiler.items_count()},
+      strings_count,
+      stream,
+      mr);
 
     return make_strings_column(strings_count,
                                std::move(offsets),
diff --git a/cpp/src/strings/convert/convert_fixed_point.cu b/cpp/src/strings/convert/convert_fixed_point.cu
index 446baa8dea9..34f81b8b407 100644
--- a/cpp/src/strings/convert/convert_fixed_point.cu
+++ b/cpp/src/strings/convert/convert_fixed_point.cu
@@ -23,7 +23,7 @@
 #include <cudf/strings/detail/convert/fixed_point.cuh>
 #include <cudf/strings/detail/convert/fixed_point_to_string.cuh>
 #include <cudf/strings/detail/converters.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
@@ -37,10 +37,7 @@
 #include <cuda/std/limits>
 #include <cuda/std/type_traits>
 #include <thrust/execution_policy.h>
-#include <thrust/for_each.h>
-#include <thrust/generate.h>
 #include <thrust/iterator/counting_iterator.h>
-#include <thrust/optional.h>
 #include <thrust/transform.h>
 
 namespace cudf {
@@ -198,8 +195,9 @@ namespace {
 template <typename DecimalType>
 struct from_fixed_point_fn {
   column_device_view d_decimals;
-  size_type* d_offsets{};
+  size_type* d_sizes{};
   char* d_chars{};
+  cudf::detail::input_offsetalator d_offsets;
 
   /**
    * @brief Converts a decimal element into a string.
@@ -219,13 +217,13 @@ struct from_fixed_point_fn {
   __device__ void operator()(size_type idx)
   {
     if (d_decimals.is_null(idx)) {
-      if (d_chars == nullptr) { d_offsets[idx] = 0; }
+      if (d_chars == nullptr) { d_sizes[idx] = 0; }
       return;
     }
     if (d_chars != nullptr) {
       fixed_point_element_to_string(idx);
     } else {
-      d_offsets[idx] =
+      d_sizes[idx] =
         fixed_point_string_size(d_decimals.element<DecimalType>(idx), d_decimals.type().scale());
     }
   }
@@ -244,8 +242,8 @@ struct dispatch_from_fixed_point_fn {
 
     auto const d_column = column_device_view::create(input, stream);
 
-    auto [offsets, chars] =
-      make_strings_children(from_fixed_point_fn<DecimalType>{*d_column}, input.size(), stream, mr);
+    auto [offsets, chars] = experimental::make_strings_children(
+      from_fixed_point_fn<DecimalType>{*d_column}, input.size(), stream, mr);
 
     return make_strings_column(input.size(),
                                std::move(offsets),
diff --git a/cpp/src/strings/convert/convert_floats.cu b/cpp/src/strings/convert/convert_floats.cu
index c6061f7d8e6..0ed80b976fd 100644
--- a/cpp/src/strings/convert/convert_floats.cu
+++ b/cpp/src/strings/convert/convert_floats.cu
@@ -21,7 +21,7 @@
 #include <cudf/strings/convert/convert_floats.hpp>
 #include <cudf/strings/detail/convert/string_to_float.cuh>
 #include <cudf/strings/detail/converters.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
@@ -32,9 +32,7 @@
 #include <rmm/exec_policy.hpp>
 #include <rmm/resource_ref.hpp>
 
-#include <thrust/for_each.h>
 #include <thrust/iterator/counting_iterator.h>
-#include <thrust/iterator/transform_iterator.h>
 #include <thrust/transform.h>
 
 #include <cmath>
@@ -356,8 +354,9 @@ struct ftos_converter {
 template <typename FloatType>
 struct from_floats_fn {
   column_device_view d_floats;
-  size_type* d_offsets;
+  size_type* d_sizes;
   char* d_chars;
+  cudf::detail::input_offsetalator d_offsets;
 
   __device__ size_type compute_output_size(FloatType value)
   {
@@ -375,13 +374,13 @@ struct from_floats_fn {
   __device__ void operator()(size_type idx)
   {
     if (d_floats.is_null(idx)) {
-      if (d_chars == nullptr) { d_offsets[idx] = 0; }
+      if (d_chars == nullptr) { d_sizes[idx] = 0; }
       return;
     }
     if (d_chars != nullptr) {
       float_to_string(idx);
     } else {
-      d_offsets[idx] = compute_output_size(d_floats.element<FloatType>(idx));
+      d_sizes[idx] = compute_output_size(d_floats.element<FloatType>(idx));
     }
   }
 };
@@ -404,8 +403,8 @@ struct dispatch_from_floats_fn {
     // copy the null mask
     rmm::device_buffer null_mask = cudf::detail::copy_bitmask(floats, stream, mr);
 
-    auto [offsets, chars] =
-      make_strings_children(from_floats_fn<FloatType>{d_column}, strings_count, stream, mr);
+    auto [offsets, chars] = experimental::make_strings_children(
+      from_floats_fn<FloatType>{d_column}, strings_count, stream, mr);
 
     return make_strings_column(strings_count,
                                std::move(offsets),
diff --git a/cpp/src/strings/convert/convert_hex.cu b/cpp/src/strings/convert/convert_hex.cu
index 95af378fc3f..1f9fc3858f8 100644
--- a/cpp/src/strings/convert/convert_hex.cu
+++ b/cpp/src/strings/convert/convert_hex.cu
@@ -19,7 +19,7 @@
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/strings/convert/convert_integers.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
@@ -123,8 +123,9 @@ struct dispatch_hex_to_integers_fn {
 template <typename IntegerType>
 struct integer_to_hex_fn {
   column_device_view const d_column;
-  size_type* d_offsets{};
+  size_type* d_sizes{};
   char* d_chars{};
+  cudf::detail::input_offsetalator d_offsets;
 
   __device__ void byte_to_hex(uint8_t byte, char* hex)
   {
@@ -141,7 +142,7 @@ struct integer_to_hex_fn {
   __device__ void operator()(size_type idx)
   {
     if (d_column.is_null(idx)) {
-      if (!d_chars) { d_offsets[idx] = 0; }
+      if (!d_chars) { d_sizes[idx] = 0; }
       return;
     }
 
@@ -167,7 +168,7 @@ struct integer_to_hex_fn {
         --byte_index;
       }
     } else {
-      d_offsets[idx] = static_cast<size_type>(bytes) * 2;  // 2 hex characters per byte
+      d_sizes[idx] = static_cast<size_type>(bytes) * 2;  // 2 hex characters per byte
     }
   }
 };
@@ -181,7 +182,7 @@ struct dispatch_integers_to_hex_fn {
   {
     auto const d_column = column_device_view::create(input, stream);
 
-    auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(
+    auto [offsets_column, chars] = experimental::make_strings_children(
       integer_to_hex_fn<IntegerType>{*d_column}, input.size(), stream, mr);
 
     return make_strings_column(input.size(),
diff --git a/cpp/src/strings/convert/convert_integers.cu b/cpp/src/strings/convert/convert_integers.cu
index f3e639817a6..918369ead4d 100644
--- a/cpp/src/strings/convert/convert_integers.cu
+++ b/cpp/src/strings/convert/convert_integers.cu
@@ -23,7 +23,7 @@
 #include <cudf/strings/detail/convert/int_to_string.cuh>
 #include <cudf/strings/detail/convert/string_to_int.cuh>
 #include <cudf/strings/detail/converters.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
@@ -34,9 +34,7 @@
 #include <rmm/resource_ref.hpp>
 
 #include <thrust/execution_policy.h>
-#include <thrust/for_each.h>
 #include <thrust/iterator/counting_iterator.h>
-#include <thrust/iterator/transform_iterator.h>
 #include <thrust/logical.h>
 #include <thrust/pair.h>
 #include <thrust/transform.h>
@@ -314,8 +312,9 @@ namespace {
 template <typename IntegerType>
 struct from_integers_fn {
   column_device_view d_integers;
-  size_type* d_offsets;
+  size_type* d_sizes;
   char* d_chars;
+  cudf::detail::input_offsetalator d_offsets;
 
   /**
    * @brief Converts an integer element into a string.
@@ -334,13 +333,13 @@ struct from_integers_fn {
   __device__ void operator()(size_type idx)
   {
     if (d_integers.is_null(idx)) {
-      if (d_chars == nullptr) { d_offsets[idx] = 0; }
+      if (d_chars == nullptr) { d_sizes[idx] = 0; }
       return;
     }
     if (d_chars != nullptr) {
       integer_element_to_string(idx);
     } else {
-      d_offsets[idx] = count_digits(d_integers.element<IntegerType>(idx));
+      d_sizes[idx] = count_digits(d_integers.element<IntegerType>(idx));
     }
   }
 };
@@ -363,8 +362,8 @@ struct dispatch_from_integers_fn {
     // copy the null mask
     rmm::device_buffer null_mask = cudf::detail::copy_bitmask(integers, stream, mr);
 
-    auto [offsets, chars] =
-      make_strings_children(from_integers_fn<IntegerType>{d_column}, strings_count, stream, mr);
+    auto [offsets, chars] = experimental::make_strings_children(
+      from_integers_fn<IntegerType>{d_column}, strings_count, stream, mr);
 
     return make_strings_column(strings_count,
                                std::move(offsets),
diff --git a/cpp/src/strings/convert/convert_ipv4.cu b/cpp/src/strings/convert/convert_ipv4.cu
index 3d259f0ab82..33f6c553001 100644
--- a/cpp/src/strings/convert/convert_ipv4.cu
+++ b/cpp/src/strings/convert/convert_ipv4.cu
@@ -20,7 +20,7 @@
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/strings/convert/convert_ipv4.hpp>
 #include <cudf/strings/detail/convert/int_to_string.cuh>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
@@ -124,13 +124,14 @@ namespace {
  */
 struct integers_to_ipv4_fn {
   column_device_view const d_column;
-  size_type* d_offsets{};
+  size_type* d_sizes{};
   char* d_chars{};
+  cudf::detail::input_offsetalator d_offsets;
 
   __device__ void operator()(size_type idx)
   {
     if (d_column.is_null(idx)) {
-      if (!d_chars) d_offsets[idx] = 0;
+      if (!d_chars) { d_sizes[idx] = 0; }
       return;
     }
 
@@ -151,7 +152,7 @@ struct integers_to_ipv4_fn {
       shift_bits -= 8;
     }
 
-    if (!d_chars) { d_offsets[idx] = bytes; }
+    if (!d_chars) { d_sizes[idx] = bytes; }
   }
 };
 
@@ -167,7 +168,7 @@ std::unique_ptr<column> integers_to_ipv4(column_view const& integers,
   CUDF_EXPECTS(integers.type().id() == type_id::INT64, "Input column must be type_id::INT64 type");
 
   auto d_column                = column_device_view::create(integers, stream);
-  auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(
+  auto [offsets_column, chars] = experimental::make_strings_children(
     integers_to_ipv4_fn{*d_column}, integers.size(), stream, mr);
 
   return make_strings_column(integers.size(),

From b8503bc000f19b983b19292b16f0048254f2b3a9 Mon Sep 17 00:00:00 2001
From: Ed Seidl <etseidl@users.noreply.github.com>
Date: Fri, 3 May 2024 07:44:08 -0700
Subject: [PATCH 144/842] Add support for large string columns to Parquet
 reader and writer (#15632)

Part of #13733.

Adds support for reading and writing cuDF string columns where the string data exceeds 2GB. This is accomplished by skipping the final offsets calculation in the string decoding kernel when the 2GB threshold is exceeded, and instead uses `cudf::strings::detail::make_offsets_child_column()`.  This could lead to increased overhead with many columns (see #13024), so this will need some more benchmarking. But if there are many columns that exceed the 2GB limit, it's likely reads will have to be chunked to stay within the memory budget.

Authors:
  - Ed Seidl (https://github.com/etseidl)
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - David Wendt (https://github.com/davidwendt)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: https://github.com/rapidsai/cudf/pull/15632
---
 cpp/CMakeLists.txt                            |  1 +
 cpp/src/io/parquet/page_delta_decode.cu       | 34 +++++----
 cpp/src/io/parquet/page_string_decode.cu      | 33 +++++----
 cpp/src/io/parquet/parquet_gpu.hpp            |  8 +-
 cpp/src/io/parquet/reader_impl.cpp            | 29 ++++++--
 cpp/src/io/parquet/reader_impl_preprocess.cu  |  8 +-
 cpp/src/io/parquet/writer_impl.cu             |  6 +-
 cpp/src/io/utilities/column_buffer.cpp        | 10 ---
 cpp/src/io/utilities/column_buffer_strings.cu | 53 ++++++++++++++
 cpp/tests/CMakeLists.txt                      |  2 +-
 cpp/tests/large_strings/parquet_tests.cpp     | 73 +++++++++++++++++++
 11 files changed, 200 insertions(+), 57 deletions(-)
 create mode 100644 cpp/src/io/utilities/column_buffer_strings.cu
 create mode 100644 cpp/tests/large_strings/parquet_tests.cpp

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 232a4f40d8e..f11f3fc3c9a 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -430,6 +430,7 @@ add_library(
   src/io/text/multibyte_split.cu
   src/io/utilities/arrow_io_source.cpp
   src/io/utilities/column_buffer.cpp
+  src/io/utilities/column_buffer_strings.cu
   src/io/utilities/config_utils.cpp
   src/io/utilities/data_casting.cu
   src/io/utilities/data_sink.cpp
diff --git a/cpp/src/io/parquet/page_delta_decode.cu b/cpp/src/io/parquet/page_delta_decode.cu
index da1bbaebd73..0c9d4e77f0c 100644
--- a/cpp/src/io/parquet/page_delta_decode.cu
+++ b/cpp/src/io/parquet/page_delta_decode.cu
@@ -579,15 +579,18 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size)
     __syncthreads();
   }
 
-  // now turn array of lengths into offsets
-  int value_count = nesting_info_base[leaf_level_index].value_count;
+  // Now turn the array of lengths into offsets, but skip if this is a large string column. In the
+  // latter case, offsets will be computed during string column creation.
+  if (not s->col.is_large_string_col) {
+    int value_count = nesting_info_base[leaf_level_index].value_count;
 
-  // if no repetition we haven't calculated start/end bounds and instead just skipped
-  // values until we reach first_row. account for that here.
-  if (!has_repetition) { value_count -= s->first_row; }
+    // if no repetition we haven't calculated start/end bounds and instead just skipped
+    // values until we reach first_row. account for that here.
+    if (!has_repetition) { value_count -= s->first_row; }
 
-  auto const offptr = reinterpret_cast<size_type*>(nesting_info_base[leaf_level_index].data_out);
-  block_excl_sum<decode_block_size>(offptr, value_count, s->page.str_offset);
+    auto const offptr = reinterpret_cast<size_type*>(nesting_info_base[leaf_level_index].data_out);
+    block_excl_sum<decode_block_size>(offptr, value_count, s->page.str_offset);
+  }
 
   if (t == 0 and s->error != 0) { set_error(s->error, error_code); }
 }
@@ -738,15 +741,18 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size)
     __syncthreads();
   }
 
-  // now turn array of lengths into offsets
-  int value_count = nesting_info_base[leaf_level_index].value_count;
+  // Now turn the array of lengths into offsets, but skip if this is a large string column. In the
+  // latter case, offsets will be computed during string column creation.
+  if (not s->col.is_large_string_col) {
+    int value_count = nesting_info_base[leaf_level_index].value_count;
 
-  // if no repetition we haven't calculated start/end bounds and instead just skipped
-  // values until we reach first_row. account for that here.
-  if (!has_repetition) { value_count -= s->first_row; }
+    // if no repetition we haven't calculated start/end bounds and instead just skipped
+    // values until we reach first_row. account for that here.
+    if (!has_repetition) { value_count -= s->first_row; }
 
-  auto const offptr = reinterpret_cast<size_type*>(nesting_info_base[leaf_level_index].data_out);
-  block_excl_sum<decode_block_size>(offptr, value_count, s->page.str_offset);
+    auto const offptr = reinterpret_cast<size_type*>(nesting_info_base[leaf_level_index].data_out);
+    block_excl_sum<decode_block_size>(offptr, value_count, s->page.str_offset);
+  }
 
   // finally, copy the string data into place
   auto const dst = nesting_info_base[leaf_level_index].string_out;
diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu
index 5ba813f518f..cf1dc58b06a 100644
--- a/cpp/src/io/parquet/page_string_decode.cu
+++ b/cpp/src/io/parquet/page_string_decode.cu
@@ -955,7 +955,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size)
 {
   using cudf::detail::warp_size;
   __shared__ __align__(16) page_state_s state_g;
-  __shared__ __align__(4) size_type last_offset;
+  __shared__ size_t last_offset;
   __shared__ __align__(16)
     page_state_buffers_s<rolling_buf_size, rolling_buf_size, rolling_buf_size>
       state_buffers;
@@ -1054,9 +1054,9 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size)
                               ? gpuGetStringData(s, sb, src_pos + skipped_leaf_values + i)
                               : cuda::std::pair<char const*, size_t>{nullptr, 0};
 
-          __shared__ cub::WarpScan<size_type>::TempStorage temp_storage;
-          size_type offset, warp_total;
-          cub::WarpScan<size_type>(temp_storage).ExclusiveSum(len, offset, warp_total);
+          __shared__ cub::WarpScan<size_t>::TempStorage temp_storage;
+          size_t offset, warp_total;
+          cub::WarpScan<size_t>(temp_storage).ExclusiveSum(len, offset, warp_total);
           offset += last_offset;
 
           // choose a character parallel string copy when the average string is longer than a warp
@@ -1075,10 +1075,10 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size)
             }
             __syncwarp();
           } else if (use_char_ll) {
-            __shared__ __align__(8) uint8_t const* pointers[warp_size];
-            __shared__ __align__(4) size_type offsets[warp_size];
-            __shared__ __align__(4) int dsts[warp_size];
-            __shared__ __align__(4) int lengths[warp_size];
+            __shared__ uint8_t const* pointers[warp_size];
+            __shared__ size_t offsets[warp_size];
+            __shared__ int dsts[warp_size];
+            __shared__ int lengths[warp_size];
 
             offsets[me]  = offset;
             pointers[me] = reinterpret_cast<uint8_t const*>(ptr);
@@ -1119,15 +1119,18 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size)
     __syncthreads();
   }
 
-  // now turn array of lengths into offsets
-  int value_count = nesting_info_base[leaf_level_index].value_count;
+  // Now turn the array of lengths into offsets, but skip if this is a large string column. In the
+  // latter case, offsets will be computed during string column creation.
+  if (not s->col.is_large_string_col) {
+    int value_count = nesting_info_base[leaf_level_index].value_count;
 
-  // if no repetition we haven't calculated start/end bounds and instead just skipped
-  // values until we reach first_row. account for that here.
-  if (!has_repetition) { value_count -= s->first_row; }
+    // if no repetition we haven't calculated start/end bounds and instead just skipped
+    // values until we reach first_row. account for that here.
+    if (!has_repetition) { value_count -= s->first_row; }
 
-  auto const offptr = reinterpret_cast<size_type*>(nesting_info_base[leaf_level_index].data_out);
-  block_excl_sum<decode_block_size>(offptr, value_count, s->page.str_offset);
+    auto const offptr = reinterpret_cast<size_type*>(nesting_info_base[leaf_level_index].data_out);
+    block_excl_sum<decode_block_size>(offptr, value_count, s->page.str_offset);
+  }
 
   if (t == 0 and s->error != 0) { set_error(s->error, error_code); }
 }
diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp
index c06fb63acda..3b18175dccd 100644
--- a/cpp/src/io/parquet/parquet_gpu.hpp
+++ b/cpp/src/io/parquet/parquet_gpu.hpp
@@ -326,8 +326,8 @@ struct PageInfo {
   int32_t skipped_leaf_values;
   // for string columns only, the size of all the chars in the string for
   // this page. only valid/computed during the base preprocess pass
+  size_t str_offset;  // offset into string data for this page
   int32_t str_bytes;
-  int32_t str_offset;   // offset into string data for this page
   bool has_page_index;  // true if str_bytes, num_valids, etc are derivable from page indexes
 
   // nesting information (input/output) for each page. this array contains
@@ -420,7 +420,8 @@ struct ColumnChunkDesc {
       src_col_schema(src_col_schema_),
       h_chunk_info(chunk_info_),
       list_bytes_per_row_est(list_bytes_per_row_est_),
-      is_strings_to_cat(strings_to_categorical_)
+      is_strings_to_cat(strings_to_categorical_),
+      is_large_string_col(false)
   {
   }
 
@@ -454,7 +455,8 @@ struct ColumnChunkDesc {
 
   float list_bytes_per_row_est{};  // for LIST columns, an estimate on number of bytes per row
 
-  bool is_strings_to_cat{};  // convert strings to hashes
+  bool is_strings_to_cat{};    // convert strings to hashes
+  bool is_large_string_col{};  // `true` if string data uses 64-bit offsets
 };
 
 /**
diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp
index b7172f5ba67..0602b5ec007 100644
--- a/cpp/src/io/parquet/reader_impl.cpp
+++ b/cpp/src/io/parquet/reader_impl.cpp
@@ -22,6 +22,7 @@
 #include <cudf/detail/transform.hpp>
 #include <cudf/detail/utilities/stream_pool.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
+#include <cudf/strings/detail/utilities.hpp>
 
 #include <rmm/resource_ref.hpp>
 
@@ -99,11 +100,21 @@ void reader::impl::decode_page_data(bool uses_custom_row_bounds, size_t skip_row
     col_string_sizes = calculate_page_string_offsets();
 
     // check for overflow
-    if (std::any_of(col_string_sizes.cbegin(), col_string_sizes.cend(), [](std::size_t sz) {
-          return sz > std::numeric_limits<size_type>::max();
-        })) {
+    auto const threshold         = static_cast<size_t>(strings::detail::get_offset64_threshold());
+    auto const has_large_strings = std::any_of(col_string_sizes.cbegin(),
+                                               col_string_sizes.cend(),
+                                               [=](std::size_t sz) { return sz > threshold; });
+    if (has_large_strings and not strings::detail::is_large_strings_enabled()) {
       CUDF_FAIL("String column exceeds the column size limit", std::overflow_error);
     }
+
+    // mark any chunks that are large string columns
+    if (has_large_strings) {
+      for (auto& chunk : pass.chunks) {
+        auto const idx = chunk.src_col_index;
+        if (col_string_sizes[idx] > threshold) { chunk.is_large_string_col = true; }
+      }
+    }
   }
 
   // In order to reduce the number of allocations of hostdevice_vector, we allocate a single vector
@@ -348,11 +359,13 @@ void reader::impl::decode_page_data(bool uses_custom_row_bounds, size_t skip_row
       } else if (out_buf.type.id() == type_id::STRING) {
         // need to cap off the string offsets column
         auto const sz = static_cast<size_type>(col_string_sizes[idx]);
-        CUDF_CUDA_TRY(cudaMemcpyAsync(static_cast<size_type*>(out_buf.data()) + out_buf.size,
-                                      &sz,
-                                      sizeof(size_type),
-                                      cudaMemcpyDefault,
-                                      _stream.value()));
+        if (sz <= strings::detail::get_offset64_threshold()) {
+          CUDF_CUDA_TRY(cudaMemcpyAsync(static_cast<size_type*>(out_buf.data()) + out_buf.size,
+                                        &sz,
+                                        sizeof(size_type),
+                                        cudaMemcpyDefault,
+                                        _stream.value()));
+        }
       }
     }
   }
diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu
index 4b7a64ac6ab..8c9b3c1a1e6 100644
--- a/cpp/src/io/parquet/reader_impl_preprocess.cu
+++ b/cpp/src/io/parquet/reader_impl_preprocess.cu
@@ -1169,10 +1169,10 @@ struct page_to_string_size {
 struct page_offset_output_iter {
   PageInfo* p;
 
-  using value_type        = size_type;
-  using difference_type   = size_type;
-  using pointer           = size_type*;
-  using reference         = size_type&;
+  using value_type        = size_t;
+  using difference_type   = size_t;
+  using pointer           = size_t*;
+  using reference         = size_t&;
   using iterator_category = thrust::output_device_iterator_tag;
 
   __host__ __device__ page_offset_output_iter operator+(int i) { return {p + i}; }
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index 286c7b361a9..24aa630a05f 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -40,6 +40,7 @@
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/lists/detail/dremel.hpp>
 #include <cudf/lists/lists_column_view.hpp>
+#include <cudf/strings/detail/utilities.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/structs/structs_column_view.hpp>
 #include <cudf/table/table_device_view.cuh>
@@ -278,8 +279,9 @@ size_t column_size(column_view const& column, rmm::cuda_stream_view stream)
     return size_of(column.type()) * column.size();
   } else if (column.type().id() == type_id::STRING) {
     auto const scol = strings_column_view(column);
-    return cudf::detail::get_value<size_type>(scol.offsets(), column.size(), stream) -
-           cudf::detail::get_value<size_type>(scol.offsets(), 0, stream);
+    return cudf::strings::detail::get_offset_value(
+             scol.offsets(), column.size() + column.offset(), stream) -
+           cudf::strings::detail::get_offset_value(scol.offsets(), column.offset(), stream);
   } else if (column.type().id() == type_id::STRUCT) {
     auto const scol = structs_column_view(column);
     size_t ret      = 0;
diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp
index 5dc2291abdc..db84778edc6 100644
--- a/cpp/src/io/utilities/column_buffer.cpp
+++ b/cpp/src/io/utilities/column_buffer.cpp
@@ -69,16 +69,6 @@ void cudf::io::detail::inline_column_buffer::create_string_data(size_t num_bytes
   _string_data = rmm::device_buffer(num_bytes, stream, _mr);
 }
 
-std::unique_ptr<column> cudf::io::detail::inline_column_buffer::make_string_column_impl(
-  rmm::cuda_stream_view stream)
-{
-  // no need for copies, just transfer ownership of the data_buffers to the columns
-  auto offsets_col = std::make_unique<column>(
-    data_type{type_to_id<size_type>()}, size + 1, std::move(_data), rmm::device_buffer{}, 0);
-  return make_strings_column(
-    size, std::move(offsets_col), std::move(_string_data), null_count(), std::move(_null_mask));
-}
-
 namespace {
 
 /**
diff --git a/cpp/src/io/utilities/column_buffer_strings.cu b/cpp/src/io/utilities/column_buffer_strings.cu
new file mode 100644
index 00000000000..4bc303a34a5
--- /dev/null
+++ b/cpp/src/io/utilities/column_buffer_strings.cu
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "column_buffer.hpp"
+
+#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/utilities.hpp>
+#include <cudf/utilities/error.hpp>
+
+namespace cudf::io::detail {
+
+std::unique_ptr<column> cudf::io::detail::inline_column_buffer::make_string_column_impl(
+  rmm::cuda_stream_view stream)
+{
+  // if the size of _string_data is over the threshold for 64bit size_type, _data will contain
+  // sizes rather than offsets. need special handling for that case.
+  auto const threshold = static_cast<size_t>(strings::detail::get_offset64_threshold());
+  if (_string_data.size() > threshold) {
+    if (not strings::detail::is_large_strings_enabled()) {
+      CUDF_FAIL("String column exceeds the column size limit", std::overflow_error);
+    }
+    // create new offsets
+    auto const offsets_ptr = static_cast<size_type*>(_data.data());
+    auto offsets_col       = make_numeric_column(
+      data_type{type_id::INT64}, size + 1, mask_state::UNALLOCATED, stream, _mr);
+    auto d_offsets64 = offsets_col->mutable_view().template data<int64_t>();
+    // it's safe to call with size + 1 because _data is also sized that large
+    cudf::detail::sizes_to_offsets(offsets_ptr, offsets_ptr + size + 1, d_offsets64, stream);
+    return make_strings_column(
+      size, std::move(offsets_col), std::move(_string_data), null_count(), std::move(_null_mask));
+  } else {
+    // no need for copies, just transfer ownership of the data_buffers to the columns
+    auto offsets_col = std::make_unique<column>(
+      data_type{type_to_id<size_type>()}, size + 1, std::move(_data), rmm::device_buffer{}, 0);
+    return make_strings_column(
+      size, std::move(offsets_col), std::move(_string_data), null_count(), std::move(_null_mask));
+  }
+}
+
+}  // namespace cudf::io::detail
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index fa633dfa67b..bbb919aa2d1 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -572,7 +572,7 @@ ConfigureTest(
 # * large strings test ----------------------------------------------------------------------------
 ConfigureTest(
   LARGE_STRINGS_TEST large_strings/large_strings_fixture.cpp large_strings/merge_tests.cpp
-  large_strings/concatenate_tests.cpp
+  large_strings/concatenate_tests.cpp large_strings/parquet_tests.cpp
   GPUS 1
   PERCENT 100
 )
diff --git a/cpp/tests/large_strings/parquet_tests.cpp b/cpp/tests/large_strings/parquet_tests.cpp
new file mode 100644
index 00000000000..007c08ce0fb
--- /dev/null
+++ b/cpp/tests/large_strings/parquet_tests.cpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "large_strings_fixture.hpp"
+
+#include <cudf_test/column_utilities.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/table_utilities.hpp>
+
+#include <cudf/io/parquet.hpp>
+#include <cudf/io/types.hpp>
+#include <cudf/table/table_view.hpp>
+
+namespace {
+
+cudf::test::TempDirTestEnvironment* const g_temp_env =
+  static_cast<cudf::test::TempDirTestEnvironment*>(
+    ::testing::AddGlobalTestEnvironment(new cudf::test::TempDirTestEnvironment));
+
+}  // namespace
+
+struct ParquetStringsTest : public cudf::test::StringsLargeTest {};
+
+TEST_F(ParquetStringsTest, ReadLargeStrings)
+{
+  // need to create a string column larger than `threshold`
+  auto const col0        = this->long_column();
+  auto const column_size = cudf::strings_column_view(col0).chars_size(cudf::get_default_stream());
+  auto const threshold   = column_size - 1;
+  auto const expected    = cudf::table_view{{col0, col0, col0}};
+
+  auto expected_metadata = cudf::io::table_input_metadata{expected};
+  expected_metadata.column_metadata[1].set_encoding(
+    cudf::io::column_encoding::DELTA_LENGTH_BYTE_ARRAY);
+  expected_metadata.column_metadata[2].set_encoding(cudf::io::column_encoding::DELTA_BYTE_ARRAY);
+
+  // set smaller threshold to reduce file size and execution time
+  setenv("LIBCUDF_LARGE_STRINGS_THRESHOLD", std::to_string(threshold).c_str(), 1);
+
+  auto const filepath = g_temp_env->get_temp_filepath("ReadLargeStrings.parquet");
+  cudf::io::parquet_writer_options out_opts =
+    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected)
+      .compression(cudf::io::compression_type::ZSTD)
+      .stats_level(cudf::io::STATISTICS_NONE)
+      .metadata(expected_metadata);
+  cudf::io::write_parquet(out_opts);
+
+  cudf::io::parquet_reader_options default_in_opts =
+    cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath});
+  auto const result      = cudf::io::read_parquet(default_in_opts);
+  auto const result_view = result.tbl->view();
+  for (auto cv : result_view) {
+    auto const offsets = cudf::strings_column_view(cv).offsets();
+    EXPECT_EQ(offsets.type(), cudf::data_type{cudf::type_id::INT64});
+  }
+  CUDF_TEST_EXPECT_TABLES_EQUAL(result_view, expected);
+
+  // go back to normal threshold
+  unsetenv("LIBCUDF_LARGE_STRINGS_THRESHOLD");
+}

From ce6902f064e9c028aa97c4a7ec5f2eed1c0c9a90 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 3 May 2024 08:21:25 -1000
Subject: [PATCH 145/842] Move timezone conversion logic to `DatetimeColumn`
 (#15545)

Moves methods/logic in `python/cudf/cudf/core/_internals/timezones.py` to the newly created `DatetimeColumn.tz_localize` and `DatetimeColumn.tz_convert`.

Additionally adds typing and improves an error message when doing `tz_convert(None)` on a tz-naive Series/Index to raise a `TypeError` (like pandas) instead of an `AttributeError`

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/15545
---
 python/cudf/cudf/core/_internals/timezones.py | 201 +++---------------
 python/cudf/cudf/core/column/datetime.py      | 159 +++++++++++++-
 python/cudf/cudf/core/index.py                |  23 +-
 python/cudf/cudf/core/series.py               |  27 +--
 python/cudf/cudf/core/tools/datetimes.py      |   5 +-
 .../cudf/tests/series/test_datetimelike.py    |   5 +
 6 files changed, 206 insertions(+), 214 deletions(-)

diff --git a/python/cudf/cudf/core/_internals/timezones.py b/python/cudf/cudf/core/_internals/timezones.py
index 4888cdd9ac9..f04cae719c2 100644
--- a/python/cudf/cudf/core/_internals/timezones.py
+++ b/python/cudf/cudf/core/_internals/timezones.py
@@ -3,23 +3,18 @@
 import os
 import zoneinfo
 from functools import lru_cache
-from typing import Tuple, cast
+from typing import Literal, Tuple
 
 import numpy as np
-import pandas as pd
 
-import cudf
-from cudf._lib.labeling import label_bins
-from cudf._lib.search import search_sorted
 from cudf._lib.timezone import make_timezone_transition_table
-from cudf.core.column.column import as_column, build_column
-from cudf.core.column.datetime import DatetimeColumn, DatetimeTZColumn
-from cudf.core.dataframe import DataFrame
-from cudf.utils.dtypes import _get_base_dtype
+from cudf.core.column.column import as_column
+from cudf.core.column.datetime import DatetimeColumn
+from cudf.core.column.timedelta import TimeDeltaColumn
 
 
 @lru_cache(maxsize=20)
-def get_tz_data(zone_name):
+def get_tz_data(zone_name: str) -> Tuple[DatetimeColumn, TimeDeltaColumn]:
     """
     Return timezone data (transition times and UTC offsets) for the
     given IANA time zone.
@@ -31,8 +26,8 @@ def get_tz_data(zone_name):
 
     Returns
     -------
-    DataFrame with two columns containing the transition times
-    ("transition_times") and corresponding UTC offsets ("offsets").
+    Tuple with two columns containing the transition times
+    and corresponding UTC offsets.
     """
     try:
         # like zoneinfo, we first look in TZPATH
@@ -43,19 +38,23 @@ def get_tz_data(zone_name):
     return tz_table
 
 
-def _find_and_read_tzfile_tzpath(zone_name):
+def _find_and_read_tzfile_tzpath(
+    zone_name: str,
+) -> Tuple[DatetimeColumn, TimeDeltaColumn]:
     for search_path in zoneinfo.TZPATH:
         if os.path.isfile(os.path.join(search_path, zone_name)):
-            return _read_tzfile_as_frame(search_path, zone_name)
+            return _read_tzfile_as_columns(search_path, zone_name)
     raise zoneinfo.ZoneInfoNotFoundError(zone_name)
 
 
-def _find_and_read_tzfile_tzdata(zone_name):
+def _find_and_read_tzfile_tzdata(
+    zone_name: str,
+) -> Tuple[DatetimeColumn, TimeDeltaColumn]:
     import importlib.resources
 
     package_base = "tzdata.zoneinfo"
     try:
-        return _read_tzfile_as_frame(
+        return _read_tzfile_as_columns(
             str(importlib.resources.files(package_base)), zone_name
         )
     # TODO: make it so that the call to libcudf raises a
@@ -77,7 +76,9 @@ def _find_and_read_tzfile_tzdata(zone_name):
         raise zoneinfo.ZoneInfoNotFoundError(zone_name)
 
 
-def _read_tzfile_as_frame(tzdir, zone_name):
+def _read_tzfile_as_columns(
+    tzdir, zone_name: str
+) -> Tuple[DatetimeColumn, TimeDeltaColumn]:
     transition_times_and_offsets = make_timezone_transition_table(
         tzdir, zone_name
     )
@@ -85,91 +86,13 @@ def _read_tzfile_as_frame(tzdir, zone_name):
     if not transition_times_and_offsets:
         # this happens for UTC-like zones
         min_date = np.int64(np.iinfo("int64").min + 1).astype("M8[s]")
-        transition_times_and_offsets = (
-            as_column([min_date]),
-            as_column([np.timedelta64(0, "s")]),
-        )
-
-    return DataFrame._from_data(
-        dict(
-            zip(["transition_times", "offsets"], transition_times_and_offsets)
-        )
-    )
-
+        return (as_column([min_date]), as_column([np.timedelta64(0, "s")]))
+    return tuple(transition_times_and_offsets)  # type: ignore[return-value]
 
-def _find_ambiguous_and_nonexistent(
-    data: DatetimeColumn, zone_name: str
-) -> Tuple:
-    """
-    Recognize ambiguous and nonexistent timestamps for the given timezone.
-
-    Returns a tuple of columns, both of "bool" dtype and of the same
-    size as `data`, that respectively indicate ambiguous and
-    nonexistent timestamps in `data` with the value `True`.
-
-    Ambiguous and/or nonexistent timestamps are only possible if any
-    transitions occur in the time zone database for the given timezone.
-    If no transitions occur, the tuple `(False, False)` is returned.
-    """
-    tz_data_for_zone = get_tz_data(zone_name)
-    transition_times = tz_data_for_zone["transition_times"]
-    offsets = tz_data_for_zone["offsets"].astype(
-        f"timedelta64[{data.time_unit}]"
-    )
 
-    if len(offsets) == 1:  # no transitions
-        return False, False
-
-    transition_times, offsets, old_offsets = (
-        transition_times[1:]._column,
-        offsets[1:]._column,
-        offsets[:-1]._column,
-    )
-
-    # Assume we have two clocks at the moment of transition:
-    # - Clock 1 is turned forward or backwards correctly
-    # - Clock 2 makes no changes
-    clock_1 = transition_times + offsets
-    clock_2 = transition_times + old_offsets
-
-    # At the start of an ambiguous time period, Clock 1 (which has
-    # been turned back) reads less than Clock 2:
-    cond = clock_1 < clock_2
-    ambiguous_begin = clock_1.apply_boolean_mask(cond)
-
-    # The end of an ambiguous time period is what Clock 2 reads at
-    # the moment of transition:
-    ambiguous_end = clock_2.apply_boolean_mask(cond)
-    ambiguous = label_bins(
-        data,
-        left_edges=ambiguous_begin,
-        left_inclusive=True,
-        right_edges=ambiguous_end,
-        right_inclusive=False,
-    ).notnull()
-
-    # At the start of a non-existent time period, Clock 2 reads less
-    # than Clock 1 (which has been turned forward):
-    cond = clock_1 > clock_2
-    nonexistent_begin = clock_2.apply_boolean_mask(cond)
-
-    # The end of the non-existent time period is what Clock 1 reads
-    # at the moment of transition:
-    nonexistent_end = clock_1.apply_boolean_mask(cond)
-    nonexistent = label_bins(
-        data,
-        left_edges=nonexistent_begin,
-        left_inclusive=True,
-        right_edges=nonexistent_end,
-        right_inclusive=False,
-    ).notnull()
-
-    return ambiguous, nonexistent
-
-
-def localize(
-    data: DatetimeColumn, zone_name: str, ambiguous, nonexistent
-) -> DatetimeTZColumn:
+def check_ambiguous_and_nonexistent(
+    ambiguous: Literal["NaT"], nonexistent: Literal["NaT"]
+) -> Tuple[Literal["NaT"], Literal["NaT"]]:
     if ambiguous != "NaT":
         raise NotImplementedError(
             "Only ambiguous='NaT' is currently supported"
@@ -178,80 +101,4 @@ def localize(
         raise NotImplementedError(
             "Only nonexistent='NaT' is currently supported"
         )
-    if isinstance(data, DatetimeTZColumn):
-        raise ValueError(
-            "Already localized. "
-            "Use `tz_convert` to convert between time zones."
-        )
-    dtype = pd.DatetimeTZDtype(data.time_unit, zone_name)
-    ambiguous, nonexistent = _find_ambiguous_and_nonexistent(data, zone_name)
-    localized = cast(
-        DatetimeColumn,
-        data._scatter_by_column(
-            data.isnull() | (ambiguous | nonexistent),
-            cudf.Scalar(cudf.NaT, dtype=data.dtype),
-        ),
-    )
-    gmt_data = local_to_utc(localized, zone_name)
-    return cast(
-        DatetimeTZColumn,
-        build_column(
-            data=gmt_data.base_data,
-            dtype=dtype,
-            mask=localized.base_mask,
-            size=gmt_data.size,
-            offset=gmt_data.offset,
-        ),
-    )
-
-
-def delocalize(data: DatetimeColumn) -> DatetimeColumn:
-    """
-    Convert a timezone-aware datetime column to a timezone-naive one.
-    If the column is already timezone-naive, return it as is.
-    """
-    if isinstance(data, DatetimeTZColumn):
-        return data._local_time
-    # already timezone-naive:
-    return data
-
-
-def convert(data: DatetimeTZColumn, zone_name: str) -> DatetimeTZColumn:
-    if not isinstance(data, DatetimeTZColumn):
-        raise TypeError(
-            "Cannot convert from timezone-naive timestamps to "
-            "timezone-aware timestamps. For that, "
-            "use `tz_localize`."
-        )
-    if zone_name == str(data.dtype.tz):
-        return data.copy()
-    utc_time = data._utc_time
-    out = cast(
-        DatetimeTZColumn,
-        build_column(
-            data=utc_time.base_data,
-            dtype=pd.DatetimeTZDtype(data.time_unit, zone_name),
-            mask=utc_time.base_mask,
-            size=utc_time.size,
-            offset=utc_time.offset,
-        ),
-    )
-    return out
-
-
-def utc_to_local(data: DatetimeColumn, zone_name: str) -> DatetimeColumn:
-    tz_data_for_zone = get_tz_data(zone_name)
-    transition_times, offsets = tz_data_for_zone._columns
-    transition_times = transition_times.astype(_get_base_dtype(data.dtype))
-    indices = search_sorted([transition_times], [data], "right") - 1
-    offsets_from_utc = offsets.take(indices, nullify=True)
-    return data + offsets_from_utc
-
-
-def local_to_utc(data: DatetimeColumn, zone_name: str) -> DatetimeColumn:
-    tz_data_for_zone = get_tz_data(zone_name)
-    transition_times, offsets = tz_data_for_zone._columns
-    transition_times_local = (transition_times + offsets).astype(data.dtype)
-    indices = search_sorted([transition_times_local], [data], "right") - 1
-    offsets_to_utc = offsets.take(indices, nullify=True)
-    return data - offsets_to_utc
+    return ambiguous, nonexistent
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index 981ef738458..9fe4e5da96d 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -7,7 +7,7 @@
 import locale
 import re
 from locale import nl_langinfo
-from typing import Any, Optional, Sequence, cast
+from typing import TYPE_CHECKING, Any, Literal, Optional, Sequence, Tuple, cast
 
 import numpy as np
 import pandas as pd
@@ -16,6 +16,8 @@
 
 import cudf
 from cudf import _lib as libcudf
+from cudf._lib.labeling import label_bins
+from cudf._lib.search import search_sorted
 from cudf._typing import (
     ColumnBinaryOperand,
     DatetimeLikeScalar,
@@ -31,6 +33,9 @@
 from cudf.utils.dtypes import _get_base_dtype
 from cudf.utils.utils import _all_bools_with_nulls
 
+if TYPE_CHECKING:
+    from cudf.core.column.numerical import NumericalColumn
+
 if PANDAS_GE_220:
     _guess_datetime_format = pd.tseries.api.guess_datetime_format
 else:
@@ -665,6 +670,121 @@ def _with_type_metadata(self, dtype):
             )
         return self
 
+    def _find_ambiguous_and_nonexistent(
+        self, zone_name: str
+    ) -> Tuple[NumericalColumn, NumericalColumn] | Tuple[bool, bool]:
+        """
+        Recognize ambiguous and nonexistent timestamps for the given timezone.
+
+        Returns a tuple of columns, both of "bool" dtype and of the same
+        size as `self`, that respectively indicate ambiguous and
+        nonexistent timestamps in `self` with the value `True`.
+
+        Ambiguous and/or nonexistent timestamps are only possible if any
+        transitions occur in the time zone database for the given timezone.
+        If no transitions occur, the tuple `(False, False)` is returned.
+        """
+        from cudf.core._internals.timezones import get_tz_data
+
+        transition_times, offsets = get_tz_data(zone_name)
+        offsets = offsets.astype(f"timedelta64[{self.time_unit}]")  # type: ignore[assignment]
+
+        if len(offsets) == 1:  # no transitions
+            return False, False
+
+        transition_times, offsets, old_offsets = (
+            transition_times.slice(1, len(transition_times)),
+            offsets.slice(1, len(offsets)),
+            offsets.slice(0, len(offsets) - 1),
+        )
+
+        # Assume we have two clocks at the moment of transition:
+        # - Clock 1 is turned forward or backwards correctly
+        # - Clock 2 makes no changes
+        clock_1 = transition_times + offsets
+        clock_2 = transition_times + old_offsets
+
+        # At the start of an ambiguous time period, Clock 1 (which has
+        # been turned back) reads less than Clock 2:
+        cond = clock_1 < clock_2
+        ambiguous_begin = clock_1.apply_boolean_mask(cond)
+
+        # The end of an ambiguous time period is what Clock 2 reads at
+        # the moment of transition:
+        ambiguous_end = clock_2.apply_boolean_mask(cond)
+        ambiguous = label_bins(
+            self,
+            left_edges=ambiguous_begin,
+            left_inclusive=True,
+            right_edges=ambiguous_end,
+            right_inclusive=False,
+        ).notnull()
+
+        # At the start of a non-existent time period, Clock 2 reads less
+        # than Clock 1 (which has been turned forward):
+        cond = clock_1 > clock_2
+        nonexistent_begin = clock_2.apply_boolean_mask(cond)
+
+        # The end of the non-existent time period is what Clock 1 reads
+        # at the moment of transition:
+        nonexistent_end = clock_1.apply_boolean_mask(cond)
+        nonexistent = label_bins(
+            self,
+            left_edges=nonexistent_begin,
+            left_inclusive=True,
+            right_edges=nonexistent_end,
+            right_inclusive=False,
+        ).notnull()
+
+        return ambiguous, nonexistent
+
+    def tz_localize(
+        self,
+        tz: str | None,
+        ambiguous: Literal["NaT"] = "NaT",
+        nonexistent: Literal["NaT"] = "NaT",
+    ):
+        from cudf.core._internals.timezones import (
+            check_ambiguous_and_nonexistent,
+            get_tz_data,
+        )
+
+        if tz is None:
+            return self.copy()
+        ambiguous, nonexistent = check_ambiguous_and_nonexistent(
+            ambiguous, nonexistent
+        )
+        dtype = pd.DatetimeTZDtype(self.time_unit, tz)
+        ambiguous_col, nonexistent_col = self._find_ambiguous_and_nonexistent(
+            tz
+        )
+        localized = self._scatter_by_column(
+            self.isnull() | (ambiguous_col | nonexistent_col),
+            cudf.Scalar(cudf.NaT, dtype=self.dtype),
+        )
+
+        transition_times, offsets = get_tz_data(tz)
+        transition_times_local = (transition_times + offsets).astype(
+            localized.dtype
+        )
+        indices = (
+            search_sorted([transition_times_local], [localized], "right") - 1
+        )
+        offsets_to_utc = offsets.take(indices, nullify=True)
+        gmt_data = localized - offsets_to_utc
+        return DatetimeTZColumn(
+            data=gmt_data.base_data,
+            dtype=dtype,
+            mask=localized.base_mask,
+            size=gmt_data.size,
+            offset=gmt_data.offset,
+        )
+
+    def tz_convert(self, tz: str | None):
+        raise TypeError(
+            "Cannot convert tz-naive timestamps, use tz_localize to localize"
+        )
+
 
 class DatetimeTZColumn(DatetimeColumn):
     def __init__(
@@ -731,9 +851,13 @@ def _utc_time(self):
     @property
     def _local_time(self):
         """Return the local time as naive timestamps."""
-        from cudf.core._internals.timezones import utc_to_local
+        from cudf.core._internals.timezones import get_tz_data
 
-        return utc_to_local(self, str(self.dtype.tz))
+        transition_times, offsets = get_tz_data(str(self.dtype.tz))
+        transition_times = transition_times.astype(_get_base_dtype(self.dtype))
+        indices = search_sorted([transition_times], [self], "right") - 1
+        offsets_from_utc = offsets.take(indices, nullify=True)
+        return self + offsets_from_utc
 
     def as_string_column(
         self, dtype: Dtype, format: str | None = None
@@ -756,3 +880,32 @@ def __repr__(self):
             f"{arr.to_string()}\n"
             f"dtype: {self.dtype}"
         )
+
+    def tz_localize(self, tz: str | None, ambiguous="NaT", nonexistent="NaT"):
+        from cudf.core._internals.timezones import (
+            check_ambiguous_and_nonexistent,
+        )
+
+        if tz is None:
+            return self._local_time
+        ambiguous, nonexistent = check_ambiguous_and_nonexistent(
+            ambiguous, nonexistent
+        )
+        raise ValueError(
+            "Already localized. "
+            "Use `tz_convert` to convert between time zones."
+        )
+
+    def tz_convert(self, tz: str | None):
+        if tz is None:
+            return self._utc_time
+        elif tz == str(self.dtype.tz):
+            return self.copy()
+        utc_time = self._utc_time
+        return type(self)(
+            data=utc_time.base_data,
+            dtype=pd.DatetimeTZDtype(self.time_unit, tz),
+            mask=utc_time.base_mask,
+            size=utc_time.size,
+            offset=utc_time.offset,
+        )
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index f55fa4c05b5..583e5d74b56 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -2258,7 +2258,12 @@ def round(self, freq):
 
         return self.__class__._from_data({self.name: out_column})
 
-    def tz_localize(self, tz, ambiguous="NaT", nonexistent="NaT"):
+    def tz_localize(
+        self,
+        tz: str | None,
+        ambiguous: Literal["NaT"] = "NaT",
+        nonexistent: Literal["NaT"] = "NaT",
+    ):
         """
         Localize timezone-naive data to timezone-aware data.
 
@@ -2300,17 +2305,12 @@ def tz_localize(self, tz, ambiguous="NaT", nonexistent="NaT"):
         ambiguous or nonexistent timestamps are converted
         to 'NaT'.
         """  # noqa: E501
-        from cudf.core._internals.timezones import delocalize, localize
-
-        if tz is None:
-            result_col = delocalize(self._column)
-        else:
-            result_col = localize(self._column, tz, ambiguous, nonexistent)
+        result_col = self._column.tz_localize(tz, ambiguous, nonexistent)
         return DatetimeIndex._from_data(
             {self.name: result_col}, freq=self._freq
         )
 
-    def tz_convert(self, tz):
+    def tz_convert(self, tz: str | None):
         """
         Convert tz-aware datetimes from one time zone to another.
 
@@ -2342,12 +2342,7 @@ def tz_convert(self, tz):
                        '2018-03-03 14:00:00+00:00'],
                       dtype='datetime64[ns, Europe/London]')
         """  # noqa: E501
-        from cudf.core._internals.timezones import convert
-
-        if tz is None:
-            result_col = self._column._utc_time
-        else:
-            result_col = convert(self._column, tz)
+        result_col = self._column.tz_convert(tz)
         return DatetimeIndex._from_data({self.name: result_col})
 
 
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index b6ed28f9093..c3d232aaa7c 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -4755,22 +4755,22 @@ def strftime(self, date_format, *args, **kwargs):
         )
 
     @copy_docstring(DatetimeIndex.tz_localize)
-    def tz_localize(self, tz, ambiguous="NaT", nonexistent="NaT"):
-        from cudf.core._internals.timezones import delocalize, localize
-
-        if tz is None:
-            result_col = delocalize(self.series._column)
-        else:
-            result_col = localize(
-                self.series._column, tz, ambiguous, nonexistent
-            )
+    def tz_localize(
+        self,
+        tz: str | None,
+        ambiguous: Literal["NaT"] = "NaT",
+        nonexistent: Literal["NaT"] = "NaT",
+    ):
+        result_col = self.series._column.tz_localize(
+            tz, ambiguous, nonexistent
+        )
         return Series._from_data(
             data={self.series.name: result_col},
             index=self.series._index,
         )
 
     @copy_docstring(DatetimeIndex.tz_convert)
-    def tz_convert(self, tz):
+    def tz_convert(self, tz: str | None):
         """
         Parameters
         ----------
@@ -4780,12 +4780,7 @@ def tz_convert(self, tz):
             A `tz` of None will convert to UTC and remove the
             timezone information.
         """
-        from cudf.core._internals.timezones import convert
-
-        if tz is None:
-            result_col = self.series._column._utc_time
-        else:
-            result_col = convert(self.series._column, tz)
+        result_col = self.series._column.tz_convert(tz)
         return Series._from_data(
             {self.series.name: result_col}, index=self.series._index
         )
diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py
index 907f3b586d1..7f6ce1100ea 100644
--- a/python/cudf/cudf/core/tools/datetimes.py
+++ b/python/cudf/cudf/core/tools/datetimes.py
@@ -317,9 +317,6 @@ def _process_col(
     format: Optional[str],
     utc: bool,
 ):
-    # Causes circular import
-    from cudf.core._internals.timezones import localize
-
     if col.dtype.kind == "f":
         if unit not in (None, "ns"):
             factor = cudf.Scalar(
@@ -396,7 +393,7 @@ def _process_col(
             f"dtype {col.dtype} cannot be converted to {_unit_dtype_map[unit]}"
         )
     if utc and not isinstance(col.dtype, pd.DatetimeTZDtype):
-        return localize(col, "UTC", ambiguous="NaT", nonexistent="NaT")
+        return col.tz_localize("UTC")
     return col
 
 
diff --git a/python/cudf/cudf/tests/series/test_datetimelike.py b/python/cudf/cudf/tests/series/test_datetimelike.py
index 6ee339ee3ea..7ef55761b2b 100644
--- a/python/cudf/cudf/tests/series/test_datetimelike.py
+++ b/python/cudf/cudf/tests/series/test_datetimelike.py
@@ -218,3 +218,8 @@ def test_contains_tz_aware(item, expected):
     dti = cudf.date_range("2020", periods=2, freq="D").tz_localize("UTC")
     result = item in dti
     assert result == expected
+
+
+def test_tz_convert_naive_typeerror():
+    with pytest.raises(TypeError):
+        cudf.date_range("2020", periods=2, freq="D").tz_convert(None)

From 09f8ff39728b774f1bb8957d76ed3b47e00c3708 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Fri, 3 May 2024 14:53:20 -0400
Subject: [PATCH 146/842] Large strings support for cudf::interleave_columns
 (#15544)

Updates the `cudf::interleave_columns` logic to use gather-based `make_strings_column` instead of the `make_strings_children` since the gather-based function already efficiently supports longs.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: https://github.com/rapidsai/cudf/pull/15544
---
 cpp/benchmarks/CMakeLists.txt         |   5 ++
 cpp/benchmarks/reshape/interleave.cpp |  59 +++++++++++++++
 cpp/src/lists/interleave_columns.cu   | 100 ++++++++++----------------
 cpp/src/reshape/interleave_columns.cu |  95 +++++++++---------------
 4 files changed, 133 insertions(+), 126 deletions(-)
 create mode 100644 cpp/benchmarks/reshape/interleave.cpp

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 5fd328dfc68..7e61d881f07 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -346,6 +346,11 @@ target_link_libraries(MULTIBYTE_SPLIT_NVBENCH PRIVATE ZLIB::ZLIB)
 # ---------------------------------------------------------------------------------
 ConfigureNVBench(DECIMAL_NVBENCH decimal/convert_floating.cpp)
 
+# ##################################################################################################
+# * reshape benchmark
+# ---------------------------------------------------------------------------------
+ConfigureNVBench(RESHAPE_NVBENCH reshape/interleave.cpp)
+
 add_custom_target(
   run_benchmarks
   DEPENDS CUDF_BENCHMARKS
diff --git a/cpp/benchmarks/reshape/interleave.cpp b/cpp/benchmarks/reshape/interleave.cpp
new file mode 100644
index 00000000000..4499e34af77
--- /dev/null
+++ b/cpp/benchmarks/reshape/interleave.cpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+
+#include <cudf/reshape.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf/utilities/default_stream.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+static void bench_interleave(nvbench::state& state)
+{
+  auto const num_rows  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
+  auto const num_cols  = static_cast<cudf::size_type>(state.get_int64("columns"));
+
+  if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) * num_cols >=
+      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
+    state.skip("Skip benchmarks greater than size_type limit");
+  }
+
+  data_profile const str_profile = data_profile_builder().distribution(
+    cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
+  std::vector<cudf::type_id> types(num_cols, cudf::type_id::STRING);
+  auto const source_table = create_random_table(types, row_count{num_rows}, str_profile);
+
+  auto const source_view = source_table->view();
+  auto const stream      = cudf::get_default_stream();
+
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
+  auto chars_size = cudf::strings_column_view(source_view.column(0)).chars_size(stream) +
+                    cudf::strings_column_view(source_view.column(1)).chars_size(stream);
+  state.add_global_memory_reads<nvbench::int8_t>(chars_size);   // all bytes are read
+  state.add_global_memory_writes<nvbench::int8_t>(chars_size);  // all bytes are written
+
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    [[maybe_unused]] auto result = cudf::interleave_columns(source_view);
+  });
+}
+
+NVBENCH_BENCH(bench_interleave)
+  .set_name("interleave_strings")
+  .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024})
+  .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216})
+  .add_int64_axis("columns", {2, 10, 100});
diff --git a/cpp/src/lists/interleave_columns.cu b/cpp/src/lists/interleave_columns.cu
index 88eccf13f72..be8fad62412 100644
--- a/cpp/src/lists/interleave_columns.cu
+++ b/cpp/src/lists/interleave_columns.cu
@@ -22,7 +22,7 @@
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/valid_if.cuh>
 #include <cudf/lists/lists_column_view.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_column_factories.cuh>
 #include <cudf/table/table_device_view.cuh>
 #include <cudf/utilities/type_dispatcher.hpp>
 
@@ -128,12 +128,20 @@ std::unique_ptr<column> concatenate_and_gather_lists(host_span<column_view const
   return std::move(result->release()[0]);
 }
 
+// Error case when no other overload or specialization is available
+template <typename T, typename Enable = void>
+struct interleave_list_entries_impl {
+  template <typename... Args>
+  std::unique_ptr<column> operator()(Args&&...)
+  {
+    CUDF_FAIL("Called `interleave_list_entries_fn()` on non-supported types.");
+  }
+};
+
 /**
- * @brief Compute string sizes, string validities, and interleave string lists functor.
+ * @brief Interleave array of string_index_pair objects for a list of strings
  *
- * This functor is executed twice. In the first pass, the sizes and validities of the output strings
- * will be computed. In the second pass, this will interleave the lists of strings of the given
- * table containing those lists.
+ * Each thread processes the strings for the corresponding list row
  */
 struct compute_string_sizes_and_interleave_lists_fn {
   table_device_view const table_dv;
@@ -141,19 +149,10 @@ struct compute_string_sizes_and_interleave_lists_fn {
   // Store list offsets of the output lists column.
   size_type const* const dst_list_offsets;
 
-  // Flag to specify whether to compute string validities.
-  bool const has_null_mask;
-
-  // Store offsets of the strings.
-  size_type* d_offsets{nullptr};
-
-  // If d_chars == nullptr: only compute sizes and validities of the output strings.
-  // If d_chars != nullptr: only interleave lists of strings.
-  char* d_chars{nullptr};
-
-  // We need to set `1` or `0` for the validities of the strings in the child column.
-  int8_t* d_validities{nullptr};
+  using string_index_pair = cudf::strings::detail::string_index_pair;
+  string_index_pair* indices;  // output
 
+  // thread per list row per column
   __device__ void operator()(size_type const idx)
   {
     auto const num_cols = table_dv.num_columns();
@@ -161,7 +160,7 @@ struct compute_string_sizes_and_interleave_lists_fn {
     auto const list_id  = idx / num_cols;
 
     auto const& lists_col = table_dv.column(col_id);
-    if (has_null_mask and lists_col.is_null(list_id)) { return; }
+    if (lists_col.is_null(list_id)) { return; }
 
     auto const list_offsets =
       lists_col.child(lists_column_view::offsets_column_index).template data<size_type>() +
@@ -181,65 +180,40 @@ struct compute_string_sizes_and_interleave_lists_fn {
     // read_idx and write_idx are indices of string elements.
     size_type write_idx = dst_list_offsets[idx];
 
-    if (not d_chars) {  // just compute sizes and validities of strings within a list
-      for (auto read_idx = start_str_idx; read_idx < end_str_idx; ++read_idx, ++write_idx) {
-        if (has_null_mask) {
-          d_validities[write_idx] = static_cast<int8_t>(str_col.is_valid(read_idx));
-        }
-        d_offsets[write_idx] = str_offsets[read_idx + 1] - str_offsets[read_idx];
-      }
-    } else {  // just copy the entire memory region containing all strings in the list
-      // start_byte and end_byte are indices of character of the string elements.
-      auto const start_byte = str_offsets[start_str_idx];
-      auto const end_byte   = str_offsets[end_str_idx];
-      if (start_byte < end_byte) {
-        auto const input_ptr  = str_col.template head<char>() + start_byte;
-        auto const output_ptr = d_chars + d_offsets[write_idx];
-        thrust::copy(thrust::seq, input_ptr, input_ptr + end_byte - start_byte, output_ptr);
+    for (auto read_idx = start_str_idx; read_idx < end_str_idx; ++read_idx, ++write_idx) {
+      auto const offset        = str_offsets[read_idx];
+      auto const size          = str_offsets[read_idx + 1] - offset;
+      string_index_pair result = {nullptr, size};
+      if (str_col.is_valid(read_idx)) {
+        result.first = size > 0 ? str_col.template head<char>() + offset : "";
       }
+      indices[write_idx] = result;
     }
   }
 };
 
-// Error case when no other overload or specialization is available
-template <typename T, typename Enable = void>
-struct interleave_list_entries_impl {
-  template <typename... Args>
-  std::unique_ptr<column> operator()(Args&&...)
-  {
-    CUDF_FAIL("Called `interleave_list_entries_fn()` on non-supported types.");
-  }
-};
-
 template <typename T>
 struct interleave_list_entries_impl<T, std::enable_if_t<std::is_same_v<T, cudf::string_view>>> {
   std::unique_ptr<column> operator()(table_view const& input,
                                      column_view const& output_list_offsets,
                                      size_type num_output_lists,
                                      size_type num_output_entries,
-                                     bool data_has_null_mask,
+                                     bool,
                                      rmm::cuda_stream_view stream,
                                      rmm::device_async_resource_ref mr) const noexcept
   {
-    auto const table_dv_ptr = table_device_view::create(input, stream);
-    auto comp_fn            = compute_string_sizes_and_interleave_lists_fn{
-      *table_dv_ptr, output_list_offsets.template begin<size_type>(), data_has_null_mask};
-
-    auto validities =
-      rmm::device_uvector<int8_t>(data_has_null_mask ? num_output_entries : 0, stream);
-    comp_fn.d_validities = validities.data();
-
-    auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(
-      comp_fn, num_output_lists, num_output_entries, stream, mr);
-
-    auto [null_mask, null_count] =
-      cudf::detail::valid_if(validities.begin(), validities.end(), thrust::identity{}, stream, mr);
-
-    return make_strings_column(num_output_entries,
-                               std::move(offsets_column),
-                               chars.release(),
-                               null_count,
-                               std::move(null_mask));
+    auto const table_dv_ptr   = table_device_view::create(input, stream);
+    auto const d_list_offsets = output_list_offsets.template begin<size_type>();
+
+    rmm::device_uvector<cudf::strings::detail::string_index_pair> indices(num_output_entries,
+                                                                          stream);
+    auto comp_fn =
+      compute_string_sizes_and_interleave_lists_fn{*table_dv_ptr, d_list_offsets, indices.data()};
+    thrust::for_each_n(rmm::exec_policy_nosync(stream),
+                       thrust::counting_iterator<size_type>(0),
+                       num_output_lists,
+                       comp_fn);
+    return cudf::strings::detail::make_strings_column(indices.begin(), indices.end(), stream, mr);
   }
 };
 
diff --git a/cpp/src/reshape/interleave_columns.cu b/cpp/src/reshape/interleave_columns.cu
index 3d1421120fd..580db0e24c5 100644
--- a/cpp/src/reshape/interleave_columns.cu
+++ b/cpp/src/reshape/interleave_columns.cu
@@ -22,6 +22,7 @@
 #include <cudf/detail/valid_if.cuh>
 #include <cudf/lists/detail/interleave_columns.hpp>
 #include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_column_factories.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/structs/structs_column_view.hpp>
 #include <cudf/table/table_device_view.cuh>
@@ -140,85 +141,53 @@ struct interleave_columns_impl<T, std::enable_if_t<std::is_same_v<T, cudf::struc
   }
 };
 
+struct interleave_strings_fn {
+  using string_index_pair = cudf::strings::detail::string_index_pair;
+  table_device_view d_table;
+
+  __device__ string_index_pair operator()(size_type idx)
+  {
+    auto const num_columns    = d_table.num_columns();
+    auto const source_col_idx = idx % num_columns;
+    auto const source_row_idx = idx / num_columns;
+    auto const col            = d_table.column(source_col_idx);
+    if (col.is_null(source_row_idx)) { return string_index_pair{nullptr, 0}; }
+    auto const d_str = col.element<string_view>(source_row_idx);
+    // ensures an empty string is not identified as a null row
+    return !d_str.empty() ? string_index_pair{d_str.data(), d_str.size_bytes()}
+                          : string_index_pair{"", 0};
+  }
+};
+
 template <typename T>
 struct interleave_columns_impl<T, std::enable_if_t<std::is_same_v<T, cudf::string_view>>> {
   std::unique_ptr<cudf::column> operator()(table_view const& strings_columns,
-                                           bool create_mask,
+                                           bool,
                                            rmm::cuda_stream_view stream,
                                            rmm::device_async_resource_ref mr)
   {
     auto num_columns = strings_columns.num_columns();
-    if (num_columns == 1)  // Single strings column returns a copy
+    if (num_columns == 1) {  // Single strings column returns a copy
       return std::make_unique<column>(*(strings_columns.begin()), stream, mr);
+    }
 
     auto strings_count = strings_columns.num_rows();
-    if (strings_count == 0)  // All columns have 0 rows
+    if (strings_count == 0) {  // All columns have 0 rows
       return make_empty_column(type_id::STRING);
+    }
 
     // Create device views from the strings columns.
-    auto table       = table_device_view::create(strings_columns, stream);
-    auto d_table     = *table;
+    auto d_table     = table_device_view::create(strings_columns, stream);
     auto num_strings = num_columns * strings_count;
 
-    std::pair<rmm::device_buffer, size_type> valid_mask{};
-    if (create_mask) {
-      // Create resulting null mask
-      valid_mask = cudf::detail::valid_if(
-        thrust::make_counting_iterator<size_type>(0),
-        thrust::make_counting_iterator<size_type>(num_strings),
-        [num_columns, d_table] __device__(size_type idx) {
-          auto source_row_idx = idx % num_columns;
-          auto source_col_idx = idx / num_columns;
-          return !d_table.column(source_row_idx).is_null(source_col_idx);
-        },
-        stream,
-        mr);
-    }
-
-    auto const null_count = valid_mask.second;
-
-    // Build offsets column by computing sizes of each string in the output
-    auto offsets_transformer =
-      cuda::proclaim_return_type<size_type>([num_columns, d_table] __device__(size_type idx) {
-        // First compute the column and the row this item belongs to
-        auto source_row_idx = idx % num_columns;
-        auto source_col_idx = idx / num_columns;
-        return d_table.column(source_row_idx).is_valid(source_col_idx)
-                 ? d_table.column(source_row_idx).element<string_view>(source_col_idx).size_bytes()
-                 : 0;
-      });
-    auto offsets_transformer_itr = thrust::make_transform_iterator(
-      thrust::make_counting_iterator<size_type>(0), offsets_transformer);
-    auto [offsets_column, bytes] = cudf::strings::detail::make_offsets_child_column(
-      offsets_transformer_itr, offsets_transformer_itr + num_strings, stream, mr);
-    auto d_results_offsets =
-      cudf::detail::offsetalator_factory::make_input_iterator(offsets_column->view());
-
-    // Create the chars column
-    rmm::device_uvector<char> chars(bytes, stream, mr);
-    auto d_results_chars = chars.data();
-    thrust::for_each_n(
-      rmm::exec_policy(stream),
-      thrust::make_counting_iterator<size_type>(0),
-      num_strings,
-      [num_columns, d_table, d_results_offsets, d_results_chars] __device__(size_type idx) {
-        auto source_row_idx = idx % num_columns;
-        auto source_col_idx = idx / num_columns;
-
-        // Do not write to buffer if the column value for this row is null
-        if (d_table.column(source_row_idx).is_null(source_col_idx)) return;
-
-        size_type offset = d_results_offsets[idx];
-        char* d_buffer   = d_results_chars + offset;
-        strings::detail::copy_string(
-          d_buffer, d_table.column(source_row_idx).element<string_view>(source_col_idx));
-      });
+    rmm::device_uvector<cudf::strings::detail::string_index_pair> indices(num_strings, stream);
+    thrust::transform(rmm::exec_policy_nosync(stream),
+                      thrust::make_counting_iterator<size_type>(0),
+                      thrust::make_counting_iterator<size_type>(num_strings),
+                      indices.begin(),
+                      interleave_strings_fn{*d_table});
 
-    return make_strings_column(num_strings,
-                               std::move(offsets_column),
-                               chars.release(),
-                               null_count,
-                               std::move(valid_mask.first));
+    return cudf::strings::detail::make_strings_column(indices.begin(), indices.end(), stream, mr);
   }
 };
 

From 2ff60d610847ae4a3a983617f70d2138bf0fd239 Mon Sep 17 00:00:00 2001
From: er-eis <eeisenberg0@gmail.com>
Date: Fri, 3 May 2024 17:47:57 -0400
Subject: [PATCH 147/842] Concatenate dictionary of objects along axis=1
 (#15623)

Note: This work is heavily based off [amanlai's](https://github.com/amanlai) PR [raised here](https://github.com/rapidsai/cudf/pull/15160), wasn't able to base my branch off amanlai's due to deleted branch.

> Closes https://github.com/rapidsai/cudf/issues/15115.
>Unlike `pandas.concat`, `cudf.concat` doesn't work with a dictionary of objects. The following code raises an error.
```python
d = {
    'first': cudf.DataFrame({'A': [1, 2], 'B': [3, 4]}),
    'second': cudf.DataFrame({'A': [5, 6], 'B': [7, 8]}),
}

cudf.concat(d, axis=1)
```
>This commit resolves this issue.

Authors:
  - https://github.com/er-eis
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/15623
---
 python/cudf/cudf/core/reshape.py      | 192 +++++++++++++++++---------
 python/cudf/cudf/tests/test_concat.py | 148 ++++++++++++++++++--
 2 files changed, 268 insertions(+), 72 deletions(-)

diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py
index 9008d2f3a1b..26d91bed173 100644
--- a/python/cudf/cudf/core/reshape.py
+++ b/python/cudf/cudf/core/reshape.py
@@ -122,9 +122,10 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
 
     Parameters
     ----------
-    objs : list of DataFrame, Series, or Index
+    objs : list or dictionary of DataFrame, Series, or Index
     axis : {0/'index', 1/'columns'}, default 0
         The axis to concatenate along.
+        `axis=1` must be passed if a dictionary is passed.
     join : {'inner', 'outer'}, default 'outer'
         How to handle indexes on other axis (or axes).
     ignore_index : bool, default False
@@ -231,27 +232,71 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
       letter  number  animal    name
     0      a       1    bird   polly
     1      b       2  monkey  george
+
+    Combine a dictionary of DataFrame objects horizontally:
+
+    >>> d = {'first': df1, 'second': df2}
+    >>> cudf.concat(d, axis=1)
+      first           second
+      letter  number  letter  number
+    0      a       1       c       3
+    1      b       2       d       4
     """
     # TODO: Do we really need to have different error messages for an empty
     # list and a list of None?
     if not objs:
         raise ValueError("No objects to concatenate")
 
-    objs = [obj for obj in objs if obj is not None]
-
-    if not objs:
-        raise ValueError("All objects passed were None")
-
     axis = _AXIS_MAP.get(axis, None)
     if axis is None:
         raise ValueError(
             f'`axis` must be 0 / "index" or 1 / "columns", got: {axis}'
         )
 
+    if isinstance(objs, dict):
+        if axis != 1:
+            raise NotImplementedError(
+                f"Can only concatenate dictionary input along axis=1, not {axis}"
+            )
+        objs = {k: obj for k, obj in objs.items() if obj is not None}
+        keys = list(objs)
+        objs = list(objs.values())
+        if any(isinstance(o, cudf.BaseIndex) for o in objs):
+            raise TypeError(
+                "cannot concatenate a dictionary containing indices"
+            )
+    else:
+        objs = [obj for obj in objs if obj is not None]
+        keys = None
+
+    if not objs:
+        raise ValueError("All objects passed were None")
+
+    # Retrieve the base types of `objs`. In order to support sub-types
+    # and object wrappers, we use `isinstance()` instead of comparing
+    # types directly
+    allowed_typs = {
+        cudf.Series,
+        cudf.DataFrame,
+        cudf.BaseIndex,
+    }
+    if not all(isinstance(o, tuple(allowed_typs)) for o in objs):
+        raise TypeError(
+            f"can only concatenate objects which are instances of "
+            f"{allowed_typs}, instead received {[type(o) for o in objs]}"
+        )
+
+    if any(isinstance(o, cudf.BaseIndex) for o in objs):
+        if not all(isinstance(o, cudf.BaseIndex) for o in objs):
+            raise TypeError(
+                "when concatenating indices you must provide ONLY indices"
+            )
+
+    only_series = all(isinstance(o, cudf.Series) for o in objs)
+
     # Return for single object
     if len(objs) == 1:
         obj = objs[0]
-
         if ignore_index:
             if axis == 1:
                 result = cudf.DataFrame._from_data(
@@ -290,6 +335,15 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
                 result = cudf.DataFrame._from_data(
                     data, index=obj.index.copy(deep=True)
                 )
+                if keys is not None:
+                    if isinstance(result, cudf.DataFrame):
+                        k = keys[0]
+                        result.columns = cudf.MultiIndex.from_tuples(
+                            [
+                                (k, *c) if isinstance(c, tuple) else (k, c)
+                                for c in result._column_names
+                            ]
+                        )
 
         if isinstance(result, cudf.Series) and axis == 0:
             # sort has no effect for series concatted along axis 0
@@ -297,27 +351,9 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
         else:
             return result.sort_index(axis=(1 - axis)) if sort else result
 
-    # Retrieve the base types of `objs`. In order to support sub-types
-    # and object wrappers, we use `isinstance()` instead of comparing
-    # types directly
-    typs = set()
-    for o in objs:
-        if isinstance(o, cudf.MultiIndex):
-            typs.add(cudf.MultiIndex)
-        elif isinstance(o, cudf.BaseIndex):
-            typs.add(type(o))
-        elif isinstance(o, cudf.DataFrame):
-            typs.add(cudf.DataFrame)
-        elif isinstance(o, cudf.Series):
-            typs.add(cudf.Series)
-        else:
-            raise TypeError(f"cannot concatenate object of type {type(o)}")
-
-    allowed_typs = {cudf.Series, cudf.DataFrame}
-
     # when axis is 1 (column) we can concat with Series and Dataframes
     if axis == 1:
-        if not typs.issubset(allowed_typs):
+        if not all(isinstance(o, (cudf.Series, cudf.DataFrame)) for o in objs):
             raise TypeError(
                 "Can only concatenate Series and DataFrame objects when axis=1"
             )
@@ -353,35 +389,71 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
             objs = _align_objs(objs, how=join, sort=sort)
             df.index = objs[0].index
 
-        for o in objs:
-            for name, col in o._data.items():
-                if name in df._data:
-                    raise NotImplementedError(
-                        f"A Column with duplicate name found: {name}, cuDF "
-                        f"doesn't support having multiple columns with "
-                        f"same names yet."
-                    )
-                if empty_inner:
-                    # if join is inner and it contains an empty df
-                    # we return an empty df, hence creating an empty
-                    # column with dtype metadata retained.
-                    df[name] = cudf.core.column.column_empty_like(
-                        col, newsize=0
-                    )
-                else:
-                    df[name] = col
-
-        result_columns = (
-            objs[0]
-            ._data.to_pandas_index()
-            .append([obj._data.to_pandas_index() for obj in objs[1:]])
-        )
+        if keys is None:
+            for o in objs:
+                for name, col in o._data.items():
+                    if name in df._data:
+                        raise NotImplementedError(
+                            f"A Column with duplicate name found: {name}, cuDF "
+                            f"doesn't support having multiple columns with "
+                            f"same names yet."
+                        )
+                    if empty_inner:
+                        # if join is inner and it contains an empty df
+                        # we return an empty df, hence creating an empty
+                        # column with dtype metadata retained.
+                        df[name] = cudf.core.column.column_empty_like(
+                            col, newsize=0
+                        )
+                    else:
+                        df[name] = col
+
+            result_columns = (
+                objs[0]
+                ._data.to_pandas_index()
+                .append([obj._data.to_pandas_index() for obj in objs[1:]])
+                .unique()
+            )
 
-        if ignore_index:
-            # with ignore_index the column names change to numbers
-            df.columns = pd.RangeIndex(len(result_columns.unique()))
+        # need to create a MultiIndex column
         else:
+            # All levels in the multiindex label must have the same type
+            has_multiple_level_types = (
+                len({type(name) for o in objs for name in o._data.keys()}) > 1
+            )
+            if has_multiple_level_types:
+                raise NotImplementedError(
+                    "Cannot construct a MultiIndex column with multiple "
+                    "label types in cuDF at this time. You must convert "
+                    "the labels to the same type."
+                )
+            for k, o in zip(keys, objs):
+                for name, col in o._data.items():
+                    # if only series, then only keep keys as column labels
+                    # if the existing column is multiindex, prepend it
+                    # to handle cases where dfs and srs are concatenated
+                    if only_series:
+                        col_label = k
+                    elif isinstance(name, tuple):
+                        col_label = (k, *name)
+                    else:
+                        col_label = (k, name)
+                    if empty_inner:
+                        df[col_label] = cudf.core.column.column_empty_like(
+                            col, newsize=0
+                        )
+                    else:
+                        df[col_label] = col
+
+        if keys is None:
             df.columns = result_columns.unique()
+            if ignore_index:
+                df.columns = cudf.RangeIndex(len(result_columns.unique()))
+        elif ignore_index:
+            # with ignore_index the column names change to numbers
+            df.columns = cudf.RangeIndex(len(result_columns))
+        elif not only_series:
+            df.columns = cudf.MultiIndex.from_tuples(df._column_names)
 
         if empty_inner:
             # if join is inner and it contains an empty df
@@ -391,18 +463,10 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
         return df
 
     # If we get here, we are always concatenating along axis 0 (the rows).
-    typ = list(typs)[0]
-    if len(typs) > 1:
-        if allowed_typs == typs:
-            # This block of code will run when `objs` has
-            # both Series & DataFrame kind of inputs.
-            _normalize_series_and_dataframe(objs, axis=axis)
-            typ = cudf.DataFrame
-        else:
-            raise TypeError(
-                f"`concat` cannot concatenate objects of "
-                f"types: {sorted([t.__name__ for t in typs])}."
-            )
+    typ = type(objs[0])
+    if len({type(o) for o in objs}) > 1:
+        _normalize_series_and_dataframe(objs, axis=axis)
+        typ = cudf.DataFrame
 
     if typ is cudf.DataFrame:
         old_objs = objs
diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py
index 87b3beb5589..4b43a33c8c8 100644
--- a/python/cudf/cudf/tests/test_concat.py
+++ b/python/cudf/cudf/tests/test_concat.py
@@ -218,7 +218,8 @@ def test_concat_columns(axis):
     assert_eq(expect, got, check_index_type=True)
 
 
-def test_concat_multiindex_dataframe():
+@pytest.mark.parametrize("axis", [0, 1])
+def test_concat_multiindex_dataframe(axis):
     gdf = cudf.DataFrame(
         {
             "w": np.arange(4),
@@ -233,14 +234,11 @@ def test_concat_multiindex_dataframe():
     pdg2 = pdg.iloc[:, 1:]
     gdg1 = cudf.from_pandas(pdg1)
     gdg2 = cudf.from_pandas(pdg2)
+    expected = pd.concat([pdg1, pdg2], axis=axis)
+    result = cudf.concat([gdg1, gdg2], axis=axis)
     assert_eq(
-        cudf.concat([gdg1, gdg2]).astype("float64"),
-        pd.concat([pdg1, pdg2]),
-        check_index_type=True,
-    )
-    assert_eq(
-        cudf.concat([gdg1, gdg2], axis=1),
-        pd.concat([pdg1, pdg2], axis=1),
+        expected,
+        result,
         check_index_type=True,
     )
 
@@ -1865,3 +1863,137 @@ def test_concat_mixed_list_types_error(s1, s2):
 
     with pytest.raises(NotImplementedError):
         cudf.concat([s1, s2], ignore_index=True)
+
+
+@pytest.mark.parametrize(
+    "axis",
+    [
+        pytest.param(
+            0,
+            marks=pytest.mark.xfail(
+                reason="concat dictionaries with axis=0 not implemented"
+            ),
+        ),
+        1,
+        "columns",
+    ],
+)
+@pytest.mark.parametrize(
+    "d",
+    [
+        {"first": (cudf.DataFrame, {"data": {"A": [1, 2], "B": [3, 4]}})},
+        {
+            "first": (cudf.DataFrame, {"data": {"A": [1, 2], "B": [3, 4]}}),
+            "second": (cudf.DataFrame, {"data": {"A": [5, 6], "B": [7, 8]}}),
+            "third": (cudf.DataFrame, {"data": {"C": [1, 2, 3]}}),
+        },
+        {
+            "first": (cudf.DataFrame, {"data": {"A": [1, 2], "B": [3, 4]}}),
+            "second": (cudf.DataFrame, {"data": {"C": [1, 2, 3]}}),
+            "third": (cudf.DataFrame, {"data": {"A": [5, 6], "B": [7, 8]}}),
+        },
+        {
+            "first": (cudf.DataFrame, {"data": {"A": [1, 2], "B": [3, 4]}}),
+            "second": (cudf.DataFrame, {"data": {"C": [1, 2, 3]}}),
+            "third": (cudf.DataFrame, {"data": {"A": [5, 6], "C": [7, 8]}}),
+            "fourth": (cudf.DataFrame, {"data": {"B": [9, 10]}}),
+        },
+        pytest.param(
+            {
+                "first": (cudf.DataFrame, {"data": {2.0: [1, 1]}}),
+                "second": (cudf.DataFrame, {"data": {"test": ["abc", "def"]}}),
+            },
+            marks=pytest.mark.xfail(
+                reason=(
+                    "Cannot construct a MultiIndex column with multiple "
+                    "label types in cuDF at this time. You must convert "
+                    "the labels to the same type."
+                )
+            ),
+        ),
+        {
+            "first": (cudf.Series, {"data": [1, 2, 3]}),
+            "second": (cudf.Series, {"data": [4, 5, 6]}),
+        },
+        {
+            "first": (cudf.DataFrame, {"data": {"A": [1, 2], "B": [3, 4]}}),
+            "second": (cudf.Series, {"data": [5, 6], "name": "C"}),
+        },
+        pytest.param(
+            {
+                "first": (
+                    cudf.DataFrame,
+                    {"data": {("A", "B"): [1, 2], "C": [3, 4]}},
+                ),
+                "second": (
+                    cudf.DataFrame,
+                    {"data": {"D": [5, 6], ("A", "B"): [7, 8]}},
+                ),
+            },
+            marks=pytest.mark.xfail(
+                reason=(
+                    "Cannot construct a MultiIndex column with multiple "
+                    "label types in cuDF at this time. You must convert "
+                    "the labels to the same type."
+                )
+            ),
+        ),
+        pytest.param(
+            {
+                "first": (
+                    cudf.DataFrame,
+                    {"data": {("A", "B"): [3, 4], 2.0: [1, 1]}},
+                ),
+                "second": (
+                    cudf.DataFrame,
+                    {"data": {("C", "D"): [3, 4], 3.0: [5, 6]}},
+                ),
+            },
+            marks=pytest.mark.xfail(
+                reason=(
+                    "Cannot construct a MultiIndex column with multiple "
+                    "label types in cuDF at this time. You must convert "
+                    "the labels to the same type."
+                )
+            ),
+        ),
+        {
+            "first": (
+                cudf.DataFrame,
+                {"data": {(1, 2): [1, 2], (3, 4): [3, 4]}},
+            ),
+            "second": (
+                cudf.DataFrame,
+                {"data": {(1, 2): [5, 6], (5, 6): [7, 8]}},
+            ),
+        },
+    ],
+)
+def test_concat_dictionary(d, axis):
+    _dict = {k: c(**v) for k, (c, v) in d.items()}
+    result = cudf.concat(_dict, axis=axis)
+    expected = cudf.from_pandas(
+        pd.concat({k: df.to_pandas() for k, df in _dict.items()}, axis=axis)
+    )
+    assert_eq(expected, result)
+
+
+@pytest.mark.parametrize(
+    "d",
+    [
+        {"first": cudf.Index([1, 2, 3])},
+        {
+            "first": cudf.MultiIndex(
+                levels=[[1, 2], ["blue", "red"]],
+                codes=[[0, 0, 1, 1], [1, 0, 1, 0]],
+            )
+        },
+        {"first": cudf.CategoricalIndex([1, 2, 3])},
+    ],
+)
+def test_concat_dict_incorrect_type_index(d):
+    with pytest.raises(
+        TypeError,
+        match="cannot concatenate a dictionary containing indices",
+    ):
+        cudf.concat(d, axis=1)

From bee2a38b63fb5e4ef90f243a3c51cf23fbf3c984 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 3 May 2024 11:58:14 -1000
Subject: [PATCH 148/842] Enable FutureWarnings/DeprecationWarnings as errors
 for dask_cudf (#15634)

Part of https://github.com/rapidsai/build-planning/issues/26

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Bradley Dice (https://github.com/bdice)
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/15634
---
 python/cudf/cudf/core/index.py                 | 18 +++++++++++++++---
 python/cudf/cudf/tests/test_index.py           | 12 ++++++++----
 .../dask_cudf/dask_cudf/io/tests/test_json.py  |  5 ++---
 .../dask_cudf/dask_cudf/tests/test_accessor.py |  6 +++---
 .../dask_cudf/dask_cudf/tests/test_groupby.py  | 10 +++++-----
 python/dask_cudf/dask_cudf/tests/test_join.py  | 16 ++++++++++++----
 python/dask_cudf/pyproject.toml                | 10 ++++++++++
 7 files changed, 55 insertions(+), 22 deletions(-)

diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 583e5d74b56..b51751a1b55 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -1119,14 +1119,26 @@ def _concat(cls, objs):
             assert (
                 PANDAS_LT_300
             ), "Need to drop after pandas-3.0 support is added."
-            warnings.warn(
+            warning_msg = (
                 "The behavior of array concatenation with empty entries is "
                 "deprecated. In a future version, this will no longer exclude "
                 "empty items when determining the result dtype. "
                 "To retain the old behavior, exclude the empty entries before "
-                "the concat operation.",
-                FutureWarning,
+                "the concat operation."
             )
+            # Warn only if the type might _actually_ change
+            if len(non_empties) == 0:
+                if not all(objs[0].dtype == index.dtype for index in objs[1:]):
+                    warnings.warn(warning_msg, FutureWarning)
+            else:
+                common_all_type = find_common_type(
+                    [index.dtype for index in objs]
+                )
+                common_non_empty_type = find_common_type(
+                    [index.dtype for index in non_empties]
+                )
+                if common_all_type != common_non_empty_type:
+                    warnings.warn(warning_msg, FutureWarning)
         if all(isinstance(obj, RangeIndex) for obj in non_empties):
             result = _concat_range_index(non_empties)
         else:
diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py
index c7875b81440..104a5fc0ffa 100644
--- a/python/cudf/cudf/tests/test_index.py
+++ b/python/cudf/cudf/tests/test_index.py
@@ -1039,7 +1039,9 @@ def test_index_append(data, other):
         (len(data) == 0 or len(other) == 0) and pd_data.dtype != pd_other.dtype
     ):
         expected = pd_data.append(pd_other)
-    with expect_warning_if(len(data) == 0 or len(other) == 0):
+    with expect_warning_if(
+        (len(data) == 0 or len(other) == 0) and gd_data.dtype != gd_other.dtype
+    ):
         actual = gd_data.append(gd_other)
     if len(data) == 0 and len(other) == 0:
         # Pandas default dtype to "object" for empty list
@@ -1237,7 +1239,10 @@ def test_index_append_list(data, other):
         and (any(d.dtype != data.dtype for d in other))
     ):
         expected = pd_data.append(pd_other)
-    with expect_warning_if(len(data) == 0 or any(len(d) == 0 for d in other)):
+    with expect_warning_if(
+        (len(data) == 0 or any(len(d) == 0 for d in other))
+        and (any(d.dtype != data.dtype for d in other))
+    ):
         actual = gd_data.append(gd_other)
 
     assert_eq(expected, actual)
@@ -2817,8 +2822,7 @@ def test_index_methods(index, func):
 
     if func == "append":
         expected = pidx.append(other=pidx)
-        with expect_warning_if(len(gidx) == 0):
-            actual = gidx.append(other=gidx)
+        actual = gidx.append(other=gidx)
     else:
         expected = getattr(pidx, func)()
         actual = getattr(gidx, func)()
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_json.py b/python/dask_cudf/dask_cudf/io/tests/test_json.py
index f8e5be0a417..dc780478794 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_json.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_json.py
@@ -84,9 +84,8 @@ def test_read_json_nested(tmp_path):
         }
     )
     kwargs = dict(orient="records", lines=True)
-    with tmp_path / "data.json" as f, dask.config.set(
-        {"dataframe.convert-string": False}
-    ):
+    f = tmp_path / "data.json"
+    with dask.config.set({"dataframe.convert-string": False}):
         df.to_json(f, **kwargs)
         # Ensure engine='cudf' is tested.
         actual = dask_cudf.read_json(f, engine="cudf", **kwargs)
diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py
index ae17b89832a..035b73094e7 100644
--- a/python/dask_cudf/dask_cudf/tests/test_accessor.py
+++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py
@@ -543,7 +543,7 @@ def test_struct_explode(data):
 
 
 def test_tz_localize():
-    data = Series(date_range("2000-04-01", "2000-04-03", freq="H"))
+    data = Series(date_range("2000-04-01", "2000-04-03", freq="h"))
     expect = data.dt.tz_localize(
         "US/Eastern", ambiguous="NaT", nonexistent="NaT"
     )
@@ -560,8 +560,8 @@ def test_tz_localize():
 @pytest.mark.parametrize(
     "data",
     [
-        date_range("2000-04-01", "2000-04-03", freq="H").tz_localize("UTC"),
-        date_range("2000-04-01", "2000-04-03", freq="H").tz_localize(
+        date_range("2000-04-01", "2000-04-03", freq="h").tz_localize("UTC"),
+        date_range("2000-04-01", "2000-04-03", freq="h").tz_localize(
             "US/Eastern"
         ),
     ],
diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py
index 67fa045d3d0..f96b5b760d8 100644
--- a/python/dask_cudf/dask_cudf/tests/test_groupby.py
+++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py
@@ -233,7 +233,7 @@ def test_groupby_split_out(split_out, column):
     gddf = dask_cudf.from_cudf(gdf, npartitions=3)
 
     ddf_result = (
-        ddf.groupby(column)
+        ddf.groupby(column, observed=True)
         .a.mean(split_out=split_out)
         .compute()
         .sort_values()
@@ -368,10 +368,10 @@ def test_groupby_dropna_dask(dropna, by):
 
     if dropna is None:
         dask_cudf_result = gddf.groupby(by).e.sum()
-        dask_result = ddf.groupby(by).e.sum()
+        dask_result = ddf.groupby(by, observed=True).e.sum()
     else:
         dask_cudf_result = gddf.groupby(by, dropna=dropna).e.sum()
-        dask_result = ddf.groupby(by, dropna=dropna).e.sum()
+        dask_result = ddf.groupby(by, dropna=dropna, observed=True).e.sum()
 
     dd.assert_eq(dask_cudf_result, dask_result)
 
@@ -505,7 +505,7 @@ def test_groupby_reset_index_dtype():
     a = df.groupby("a").agg({"b": ["count"]})
 
     assert a.index.dtype == "int8"
-    assert a.reset_index().dtypes[0] == "int8"
+    assert a.reset_index().dtypes.iloc[0] == "int8"
 
 
 def test_groupby_reset_index_names():
@@ -563,7 +563,7 @@ def test_groupby_categorical_key():
     # (See: https://github.com/dask/dask/issues/9515)
     expect = (
         ddf.compute()
-        .groupby("name", sort=True)
+        .groupby("name", sort=True, observed=True)
         .agg({"x": ["mean", "max"], "y": ["mean", "count"]})
     )
     dd.assert_eq(expect, got)
diff --git a/python/dask_cudf/dask_cudf/tests/test_join.py b/python/dask_cudf/dask_cudf/tests/test_join.py
index 42ecc130298..ed291ef31a7 100644
--- a/python/dask_cudf/dask_cudf/tests/test_join.py
+++ b/python/dask_cudf/dask_cudf/tests/test_join.py
@@ -66,8 +66,12 @@ def test_join_inner(left_nrows, right_nrows, left_nkeys, right_nkeys):
     def gather(df, grows):
         grows[df["x"].values[0]] = (set(df.al), set(df.ar))
 
-    expect.reset_index().groupby("x").apply(partial(gather, grows=expect_rows))
-    expect.reset_index().groupby("x").apply(partial(gather, grows=got_rows))
+    expect.reset_index().groupby("x")[["x", "al", "ar"]].apply(
+        partial(gather, grows=expect_rows)
+    )
+    expect.reset_index().groupby("x")[["x", "al", "ar"]].apply(
+        partial(gather, grows=got_rows)
+    )
 
     assert got_rows == expect_rows
 
@@ -127,9 +131,13 @@ def gather(df, grows):
 
         grows[df["x"].values[0]] = (cola, colb)
 
-    expect.reset_index().groupby("x").apply(partial(gather, grows=expect_rows))
+    expect.reset_index().groupby("x")[["x", "al", "ar"]].apply(
+        partial(gather, grows=expect_rows)
+    )
 
-    expect.reset_index().groupby("x").apply(partial(gather, grows=got_rows))
+    expect.reset_index().groupby("x")[["x", "al", "ar"]].apply(
+        partial(gather, grows=got_rows)
+    )
 
     for k in expect_rows:
         np.testing.assert_array_equal(expect_rows[k][0], got_rows[k][0])
diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml
index fcf83e82989..5fbdd98225e 100644
--- a/python/dask_cudf/pyproject.toml
+++ b/python/dask_cudf/pyproject.toml
@@ -107,3 +107,13 @@ skip = [
     "build",
     "dist",
 ]
+
+[tool.pytest.ini_options]
+filterwarnings = [
+    "error::FutureWarning",
+    "error::DeprecationWarning",
+    "ignore:create_block_manager_from_blocks is deprecated and will be removed in a future version. Use public APIs instead.:DeprecationWarning",
+    # https://github.com/dask/partd/blob/main/partd/pandas.py#L198
+    "ignore:Passing a BlockManager to DataFrame is deprecated and will raise in a future version. Use public APIs instead.:DeprecationWarning",
+    "ignore:String support for `aggregate_files` is experimental. Behavior may change in the future.:FutureWarning:dask",
+]

From 23bb2ed156d164b59e608e7e791c74db5cb4bce8 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 3 May 2024 17:08:11 -1000
Subject: [PATCH 149/842] Enable warnings as errors in custreamz (#15642)

Part of https://github.com/rapidsai/build-planning/issues/26

Builds on https://github.com/rapidsai/cudf/pull/15634

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/15642
---
 python/custreamz/pyproject.toml | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml
index e6c86351ac9..7786bf98bef 100644
--- a/python/custreamz/pyproject.toml
+++ b/python/custreamz/pyproject.toml
@@ -103,3 +103,13 @@ skip = [
     "dist",
     "__init__.py",
 ]
+
+[tool.pytest.ini_options]
+filterwarnings = [
+    "error",
+    "ignore:unclosed <socket.socket:ResourceWarning",
+    "ignore:Port .* is already in use.:UserWarning:distributed",
+    # Should be fixed in the next streamz release
+    # https://github.com/python-streamz/streamz/commit/2812f1f961dfcb3f17e948d8b12a12472975558e
+    "ignore:pkg_resources is deprecated as an API:DeprecationWarning:streamz",
+]

From d3c4cf44940b6d60131bc72241c749021e4a8117 Mon Sep 17 00:00:00 2001
From: Philip Hyunsu Cho <phcho@nvidia.com>
Date: Mon, 6 May 2024 10:39:07 -0700
Subject: [PATCH 150/842] Migrate to `{{ stdlib("c") }}` (#15594)

The `sysroot*` syntax is getting phased out (conda-forge/conda-forge.github.io#2102).
The recommendation is to move to `{{ stdlib("c") }}`.

Ref https://github.com/rapidsai/build-planning/issues/39

Authors:
  - Philip Hyunsu Cho (https://github.com/hcho3)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - https://github.com/jakirkham
  - Ray Douglass (https://github.com/raydouglass)

URL: https://github.com/rapidsai/cudf/pull/15594
---
 conda/recipes/cudf/conda_build_config.yaml       | 5 ++++-
 conda/recipes/cudf/meta.yaml                     | 2 +-
 conda/recipes/cudf_kafka/conda_build_config.yaml | 5 ++++-
 conda/recipes/cudf_kafka/meta.yaml               | 2 +-
 conda/recipes/libcudf/conda_build_config.yaml    | 5 ++++-
 conda/recipes/libcudf/meta.yaml                  | 4 ++--
 6 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/conda/recipes/cudf/conda_build_config.yaml b/conda/recipes/cudf/conda_build_config.yaml
index c98c2701653..d399e440edd 100644
--- a/conda/recipes/cudf/conda_build_config.yaml
+++ b/conda/recipes/cudf/conda_build_config.yaml
@@ -4,7 +4,10 @@ c_compiler_version:
 cxx_compiler_version:
   - 11
 
-sysroot_version:
+c_stdlib:
+  - sysroot
+
+c_stdlib_version:
   - "2.17"
 
 cmake_version:
diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml
index ae2d938250b..ddcadfd1570 100644
--- a/conda/recipes/cudf/meta.yaml
+++ b/conda/recipes/cudf/meta.yaml
@@ -57,7 +57,7 @@ requirements:
     - {{ compiler('cuda') }}
     {% endif %}
     - cuda-version ={{ cuda_version }}
-    - sysroot_{{ target_platform }} {{ sysroot_version }}
+    - {{ stdlib("c") }}
   host:
     - python
     - cython >=3.0.3
diff --git a/conda/recipes/cudf_kafka/conda_build_config.yaml b/conda/recipes/cudf_kafka/conda_build_config.yaml
index c98c2701653..d399e440edd 100644
--- a/conda/recipes/cudf_kafka/conda_build_config.yaml
+++ b/conda/recipes/cudf_kafka/conda_build_config.yaml
@@ -4,7 +4,10 @@ c_compiler_version:
 cxx_compiler_version:
   - 11
 
-sysroot_version:
+c_stdlib:
+  - sysroot
+
+c_stdlib_version:
   - "2.17"
 
 cmake_version:
diff --git a/conda/recipes/cudf_kafka/meta.yaml b/conda/recipes/cudf_kafka/meta.yaml
index 45e41bf8de7..ab41d9e1f15 100644
--- a/conda/recipes/cudf_kafka/meta.yaml
+++ b/conda/recipes/cudf_kafka/meta.yaml
@@ -53,7 +53,7 @@ requirements:
     - {{ compiler('cuda') }}
     {% endif %}
     - cuda-version ={{ cuda_version }}
-    - sysroot_{{ target_platform }} {{ sysroot_version }}
+    - {{ stdlib("c") }}
   host:
     - python
     - cython >=3.0.3
diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml
index b7fbaab9306..ba5e96fb6cf 100644
--- a/conda/recipes/libcudf/conda_build_config.yaml
+++ b/conda/recipes/libcudf/conda_build_config.yaml
@@ -10,7 +10,10 @@ cuda_compiler:
 cuda11_compiler:
   - nvcc
 
-sysroot_version:
+c_stdlib:
+  - sysroot
+
+c_stdlib_version:
   - "2.17"
 
 cmake_version:
diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml
index 695c515b9d4..76115362b6c 100644
--- a/conda/recipes/libcudf/meta.yaml
+++ b/conda/recipes/libcudf/meta.yaml
@@ -43,7 +43,7 @@ requirements:
     {% endif %}
     - cuda-version ={{ cuda_version }}
     - ninja
-    - sysroot_{{ target_platform }} {{ sysroot_version }}
+    - {{ stdlib("c") }}
   host:
     - librmm ={{ minor_version }}
     - libkvikio ={{ minor_version }}
@@ -170,7 +170,7 @@ outputs:
         {% endif %}
         - cuda-version ={{ cuda_version }}
         - ninja
-        - sysroot_{{ target_platform }} {{ sysroot_version }}
+        - {{ stdlib("c") }}
       host:
         - {{ pin_subpackage('libcudf', exact=True) }}
         {% if cuda_major == "11" %}

From 4dc616227f5c872031603426a3235282f6d23554 Mon Sep 17 00:00:00 2001
From: Nghia Truong <7416935+ttnghia@users.noreply.github.com>
Date: Mon, 6 May 2024 11:29:46 -0700
Subject: [PATCH 151/842] Implement JNI for chunked ORC reader (#15446)

This adds JNI implementation for chunked ORC reader, allowing to read ORC files by an iterative manner.

Depends on:
 * https://github.com/rapidsai/cudf/pull/15094

Closes https://github.com/rapidsai/cudf/issues/12228.

Authors:
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - Jason Lowe (https://github.com/jlowe)

URL: https://github.com/rapidsai/cudf/pull/15446
---
 .../java/ai/rapids/cudf/ORCChunkedReader.java | 169 +++++++++++++++++
 java/src/main/native/src/ChunkedReaderJni.cpp | 173 ++++++++++++++++--
 .../test/java/ai/rapids/cudf/TableTest.java   |  24 +++
 java/src/test/resources/splittable.orc        | Bin 0 -> 385961 bytes
 4 files changed, 352 insertions(+), 14 deletions(-)
 create mode 100644 java/src/main/java/ai/rapids/cudf/ORCChunkedReader.java
 create mode 100644 java/src/test/resources/splittable.orc

diff --git a/java/src/main/java/ai/rapids/cudf/ORCChunkedReader.java b/java/src/main/java/ai/rapids/cudf/ORCChunkedReader.java
new file mode 100644
index 00000000000..2f46c8d1825
--- /dev/null
+++ b/java/src/main/java/ai/rapids/cudf/ORCChunkedReader.java
@@ -0,0 +1,169 @@
+/*
+ *
+ *  Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ *  Licensed under the Apache License, Version 2.0 (the "License");
+ *  you may not use this file except in compliance with the License.
+ *  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ *
+ */
+
+package ai.rapids.cudf;
+
+/**
+ * Provide an interface for reading an ORC file in an iterative manner.
+ */
+public class ORCChunkedReader implements AutoCloseable {
+  static {
+    NativeDepsLoader.loadNativeDeps();
+  }
+
+  /**
+   * Construct the reader instance from read limits, output row granularity,
+   * and a file already loaded in a memory buffer.
+   *
+   * @param chunkReadLimit Limit on total number of bytes to be returned per read,
+   *                       or 0 if there is no limit.
+   * @param passReadLimit  Limit on the amount of memory used by the chunked reader,
+   *                       or 0 if there is no limit.
+   * @param opts           The options for ORC reading.
+   * @param buffer         Raw ORC file content.
+   * @param offset         The starting offset into buffer.
+   * @param len            The number of bytes to parse the given buffer.
+   */
+  public ORCChunkedReader(long chunkReadLimit, long passReadLimit,
+      ORCOptions opts, HostMemoryBuffer buffer, long offset, long len) {
+    handle = createReader(chunkReadLimit, passReadLimit,
+        opts.getIncludeColumnNames(), buffer.getAddress() + offset, len,
+        opts.usingNumPyTypes(), opts.timeUnit().typeId.getNativeId(),
+        opts.getDecimal128Columns());
+    if (handle == 0) {
+      throw new IllegalStateException("Cannot create native chunked ORC reader object.");
+    }
+  }
+
+  /**
+   * Construct a chunked ORC reader instance, similar to
+   * {@link ORCChunkedReader#ORCChunkedReader(long, long, ORCOptions, HostMemoryBuffer, long, long)},
+   * with an additional parameter to control the granularity of the output table.
+   * When reading a chunk table, with respect to the given size limits, a subset of stripes may
+   * be loaded, decompressed and decoded into a large intermediate table. The reader will then
+   * subdivide that table into smaller tables for final output using
+   * {@code outputRowSizingGranularity} as the subdivision step. If the chunked reader is
+   * constructed without this parameter, the default value of 10k rows will be used.
+   *
+   * @param outputRowSizingGranularity The change step in number of rows in the output table.
+   * @see ORCChunkedReader#ORCChunkedReader(long, long, ORCOptions, HostMemoryBuffer, long, long)
+   */
+  public ORCChunkedReader(long chunkReadLimit, long passReadLimit, long outputRowSizingGranularity,
+      ORCOptions opts, HostMemoryBuffer buffer, long offset, long len) {
+    handle = createReaderWithOutputGranularity(chunkReadLimit, passReadLimit, outputRowSizingGranularity,
+        opts.getIncludeColumnNames(), buffer.getAddress() + offset, len,
+        opts.usingNumPyTypes(), opts.timeUnit().typeId.getNativeId(),
+        opts.getDecimal128Columns());
+    if (handle == 0) {
+      throw new IllegalStateException("Cannot create native chunked ORC reader object.");
+    }
+  }
+
+  /**
+   * Check if the given file has anything left to read.
+   *
+   * @return A boolean value indicating if there is more data to read from file.
+   */
+  public boolean hasNext() {
+    if (handle == 0) {
+      throw new IllegalStateException("Native chunked ORC reader object may have been closed.");
+    }
+
+    if (firstCall) {
+      // This function needs to return true at least once, so an empty table
+      // (but having empty columns instead of no column) can be returned by readChunk()
+      // if the input file has no row.
+      firstCall = false;
+      return true;
+    }
+    return hasNext(handle);
+  }
+
+  /**
+   * Read a chunk of rows in the given ORC file such that the returning data has total size
+   * does not exceed the given read limit. If the given file has no data, or all data has been read
+   * before by previous calls to this function, a null Table will be returned.
+   *
+   * @return A table of new rows reading from the given file.
+   */
+  public Table readChunk() {
+    if (handle == 0) {
+      throw new IllegalStateException("Native chunked ORC reader object may have been closed.");
+    }
+
+    long[] columnPtrs = readChunk(handle);
+    return columnPtrs != null ? new Table(columnPtrs) : null;
+  }
+
+  @Override
+  public void close() {
+    if (handle != 0) {
+      close(handle);
+      handle = 0;
+    }
+  }
+
+
+  /**
+   * Auxiliary variable to help {@link #hasNext()} returning true at least once.
+   */
+  private boolean firstCall = true;
+
+  /**
+   * Handle for memory address of the native ORC chunked reader class.
+   */
+  private long handle;
+
+  /**
+   * Create a native chunked ORC reader object on heap and return its memory address.
+   *
+   * @param chunkReadLimit    Limit on total number of bytes to be returned per read,
+   *                          or 0 if there is no limit.
+   * @param passReadLimit     Limit on the amount of memory used by the chunked reader,
+   *                          or 0 if there is no limit.
+   * @param filterColumnNames Name of the columns to read, or an empty array if we want to read all.
+   * @param bufferAddrs       The address of a buffer to read from, or 0 if we are not using that buffer.
+   * @param length            The length of the buffer to read from.
+   * @param usingNumPyTypes   Whether the parser should implicitly promote TIMESTAMP
+   *                          columns to TIMESTAMP_MILLISECONDS for compatibility with NumPy.
+   * @param timeUnit          return type of TimeStamp in units
+   * @param decimal128Columns name of the columns which are read as Decimal128 rather than Decimal64
+   */
+  private static native long createReader(long chunkReadLimit, long passReadLimit,
+      String[] filterColumnNames, long bufferAddrs, long length,
+      boolean usingNumPyTypes, int timeUnit, String[] decimal128Columns);
+
+  /**
+   * Create a native chunked ORC reader object, similar to
+   * {@link ORCChunkedReader#createReader(long, long, String[], long, long, boolean, int, String[])},
+   * with an additional parameter to control the granularity of the output table.
+   *
+   * @param outputRowSizingGranularity The change step in number of rows in the output table.
+   * @see ORCChunkedReader#createReader(long, long, String[], long, long, boolean, int, String[])
+   */
+  private static native long createReaderWithOutputGranularity(
+      long chunkReadLimit, long passReadLimit, long outputRowSizingGranularity,
+      String[] filterColumnNames, long bufferAddrs, long length,
+      boolean usingNumPyTypes, int timeUnit, String[] decimal128Columns);
+
+  private static native boolean hasNext(long handle);
+
+  private static native long[] readChunk(long handle);
+
+  private static native void close(long handle);
+}
diff --git a/java/src/main/native/src/ChunkedReaderJni.cpp b/java/src/main/native/src/ChunkedReaderJni.cpp
index 7681008f584..cf04a87262f 100644
--- a/java/src/main/native/src/ChunkedReaderJni.cpp
+++ b/java/src/main/native/src/ChunkedReaderJni.cpp
@@ -18,22 +18,22 @@
 #include "jni_utils.hpp"
 
 #include <cudf/column/column.hpp>
+#include <cudf/io/orc.hpp>
 #include <cudf/io/parquet.hpp>
 #include <cudf/table/table.hpp>
 
 #include <memory>
+#include <optional>
 #include <vector>
 
-// This function is defined in `TableJni.cpp`.
-jlongArray cudf::jni::convert_table_for_return(
-  JNIEnv* env,
-  std::unique_ptr<cudf::table>&& table_result,
-  std::vector<std::unique_ptr<cudf::column>>&& extra_columns);
-
 // This file is for the code related to chunked reader (Parquet, ORC, etc.).
 
 extern "C" {
 
+//
+// Chunked Parquet reader JNI
+//
+
 // This function should take all the parameters that `Table.readParquet` takes,
 // plus one more parameter `long chunkSizeByteLimit`.
 JNIEXPORT jlong JNICALL
@@ -54,19 +54,17 @@ Java_ai_rapids_cudf_ParquetChunkedReader_create(JNIEnv* env,
     JNI_NULL_CHECK(env, inp_file_path, "Input file or buffer must be supplied", 0);
     read_buffer = false;
   } else if (inp_file_path != nullptr) {
-    JNI_THROW_NEW(env,
-                  "java/lang/IllegalArgumentException",
-                  "Cannot pass in both a buffer and an inp_file_path",
-                  0);
+    JNI_THROW_NEW(
+      env, cudf::jni::ILLEGAL_ARG_CLASS, "Cannot pass in both a buffer and an inp_file_path", 0);
   } else if (buffer_length <= 0) {
-    JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "An empty buffer is not supported", 0);
+    JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "An empty buffer is not supported", 0);
   }
 
   try {
     cudf::jni::auto_set_device(env);
     cudf::jni::native_jstring filename(env, inp_file_path);
     if (!read_buffer && filename.is_empty()) {
-      JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "inp_file_path cannot be empty", 0);
+      JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "inp_file_path cannot be empty", 0);
     }
 
     cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names);
@@ -155,7 +153,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ParquetChunkedReader_readChunk(
                                                                                 jclass,
                                                                                 jlong handle)
 {
-  JNI_NULL_CHECK(env, handle, "handle is null", 0);
+  JNI_NULL_CHECK(env, handle, "handle is null", nullptr);
 
   try {
     cudf::jni::auto_set_device(env);
@@ -163,7 +161,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ParquetChunkedReader_readChunk(
     auto chunk            = reader_ptr->read_chunk();
     return chunk.tbl ? cudf::jni::convert_table_for_return(env, chunk.tbl) : nullptr;
   }
-  CATCH_STD(env, 0);
+  CATCH_STD(env, nullptr);
 }
 
 JNIEXPORT void JNICALL Java_ai_rapids_cudf_ParquetChunkedReader_close(JNIEnv* env,
@@ -179,4 +177,151 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_ParquetChunkedReader_close(JNIEnv* en
   CATCH_STD(env, );
 }
 
+//
+// Chunked ORC reader JNI
+//
+
+namespace {
+jlong create_chunked_orc_reader(JNIEnv* env,
+                                jlong chunk_read_limit,
+                                jlong pass_read_limit,
+                                std::optional<jlong> output_granularity,
+                                jobjectArray filter_col_names,
+                                jlong buffer,
+                                jlong buffer_length,
+                                jboolean using_numpy_Types,
+                                jint unit,
+                                jobjectArray dec128_col_names)
+{
+  JNI_NULL_CHECK(env, buffer, "buffer is null", 0);
+  if (buffer_length <= 0) {
+    JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "An empty buffer is not supported", 0);
+  }
+
+  try {
+    cudf::jni::auto_set_device(env);
+    cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names);
+    cudf::jni::native_jstringArray n_dec128_col_names(env, dec128_col_names);
+
+    auto const source = cudf::io::source_info(reinterpret_cast<char*>(buffer),
+                                              static_cast<std::size_t>(buffer_length));
+    auto opts_builder = cudf::io::orc_reader_options::builder(source);
+    if (n_filter_col_names.size() > 0) {
+      opts_builder = opts_builder.columns(n_filter_col_names.as_cpp_vector());
+    }
+    auto const read_opts = opts_builder.use_index(false)
+                             .use_np_dtypes(static_cast<bool>(using_numpy_Types))
+                             .timestamp_type(cudf::data_type(static_cast<cudf::type_id>(unit)))
+                             .decimal128_columns(n_dec128_col_names.as_cpp_vector())
+                             .build();
+
+    if (output_granularity) {
+      return reinterpret_cast<jlong>(
+        new cudf::io::chunked_orc_reader(static_cast<std::size_t>(chunk_read_limit),
+                                         static_cast<std::size_t>(pass_read_limit),
+                                         static_cast<std::size_t>(output_granularity.value()),
+                                         read_opts));
+    }
+    return reinterpret_cast<jlong>(
+      new cudf::io::chunked_orc_reader(static_cast<std::size_t>(chunk_read_limit),
+                                       static_cast<std::size_t>(pass_read_limit),
+                                       read_opts));
+  }
+  CATCH_STD(env, 0);
+}
+}  // namespace
+
+// This function should take all the parameters that `Table.readORC` takes,
+// plus two more parameters: `chunk_read_limit` and `pass_read_limit`.
+JNIEXPORT jlong JNICALL
+Java_ai_rapids_cudf_ORCChunkedReader_createReader(JNIEnv* env,
+                                                  jclass,
+                                                  jlong chunk_read_limit,
+                                                  jlong pass_read_limit,
+                                                  jobjectArray filter_col_names,
+                                                  jlong buffer,
+                                                  jlong buffer_length,
+                                                  jboolean using_numpy_Types,
+                                                  jint unit,
+                                                  jobjectArray dec128_col_names)
+{
+  return create_chunked_orc_reader(env,
+                                   chunk_read_limit,
+                                   pass_read_limit,
+                                   std::nullopt,
+                                   filter_col_names,
+                                   buffer,
+                                   buffer_length,
+                                   using_numpy_Types,
+                                   unit,
+                                   dec128_col_names);
+}
+
+// This function should take all the parameters that `Table.readORC` takes,
+// plus three more parameters: `chunk_read_limit`, `pass_read_limit`, `output_granularity`.
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ORCChunkedReader_createReaderWithOutputGranularity(
+  JNIEnv* env,
+  jclass,
+  jlong chunk_read_limit,
+  jlong pass_read_limit,
+  jlong output_granularity,
+  jobjectArray filter_col_names,
+  jlong buffer,
+  jlong buffer_length,
+  jboolean using_numpy_Types,
+  jint unit,
+  jobjectArray dec128_col_names)
+{
+  return create_chunked_orc_reader(env,
+                                   chunk_read_limit,
+                                   pass_read_limit,
+                                   output_granularity,
+                                   filter_col_names,
+                                   buffer,
+                                   buffer_length,
+                                   using_numpy_Types,
+                                   unit,
+                                   dec128_col_names);
+}
+
+JNIEXPORT jboolean JNICALL Java_ai_rapids_cudf_ORCChunkedReader_hasNext(JNIEnv* env,
+                                                                        jclass,
+                                                                        jlong handle)
+{
+  JNI_NULL_CHECK(env, handle, "handle is null", false);
+
+  try {
+    cudf::jni::auto_set_device(env);
+    auto const reader_ptr = reinterpret_cast<cudf::io::chunked_orc_reader* const>(handle);
+    return reader_ptr->has_next();
+  }
+  CATCH_STD(env, false);
+}
+
+JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ORCChunkedReader_readChunk(JNIEnv* env,
+                                                                            jclass,
+                                                                            jlong handle)
+{
+  JNI_NULL_CHECK(env, handle, "handle is null", nullptr);
+
+  try {
+    cudf::jni::auto_set_device(env);
+    auto const reader_ptr = reinterpret_cast<cudf::io::chunked_orc_reader* const>(handle);
+    auto chunk            = reader_ptr->read_chunk();
+    return chunk.tbl ? cudf::jni::convert_table_for_return(env, chunk.tbl) : nullptr;
+  }
+  CATCH_STD(env, nullptr);
+}
+
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_ORCChunkedReader_close(JNIEnv* env, jclass, jlong handle)
+{
+  JNI_NULL_CHECK(env, handle, "handle is null", );
+
+  try {
+    cudf::jni::auto_set_device(env);
+    delete reinterpret_cast<cudf::io::chunked_orc_reader*>(handle);
+  }
+  CATCH_STD(env, );
+}
+
 }  // extern "C"
diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java
index 8560a9caad7..dc6eb55fc6a 100644
--- a/java/src/test/java/ai/rapids/cudf/TableTest.java
+++ b/java/src/test/java/ai/rapids/cudf/TableTest.java
@@ -81,6 +81,7 @@ public class TableTest extends CudfTestBase {
   private static final File TEST_PARQUET_FILE_CHUNKED_READ = TestUtils.getResourceAsFile("splittable.parquet");
   private static final File TEST_PARQUET_FILE_BINARY = TestUtils.getResourceAsFile("binary.parquet");
   private static final File TEST_ORC_FILE = TestUtils.getResourceAsFile("TestOrcFile.orc");
+  private static final File TEST_ORC_FILE_CHUNKED_READ = TestUtils.getResourceAsFile("splittable.orc");
   private static final File TEST_ORC_TIMESTAMP_DATE_FILE = TestUtils.getResourceAsFile("timestamp-date-test.orc");
   private static final File TEST_DECIMAL_PARQUET_FILE = TestUtils.getResourceAsFile("decimal.parquet");
   private static final File TEST_ALL_TYPES_PLAIN_AVRO_FILE = TestUtils.getResourceAsFile("alltypes_plain.avro");
@@ -1699,6 +1700,29 @@ void testReadORCTimeUnit() {
     }
   }
 
+  @Test
+  void testORCChunkedReader() throws IOException {
+    byte[] buffer = Files.readAllBytes(TEST_ORC_FILE_CHUNKED_READ.toPath());
+    long len = buffer.length;
+
+    try (HostMemoryBuffer hostBuf = hostMemoryAllocator.allocate(len)) {
+      hostBuf.setBytes(0, buffer, 0, len);
+      try (ORCChunkedReader reader = new ORCChunkedReader(0, 2 * 1024 * 1024, 10000,
+          ORCOptions.DEFAULT, hostBuf, 0, len)) {
+        int numChunks = 0;
+        long totalRows = 0;
+        while (reader.hasNext()) {
+          ++numChunks;
+          try (Table chunk = reader.readChunk()) {
+            totalRows += chunk.getRowCount();
+          }
+        }
+        assertEquals(10, numChunks);
+        assertEquals(1000000, totalRows);
+      }
+    }
+  }
+
   @Test
   void testCrossJoin() {
     try (Table leftTable = new Table.TestBuilder()
diff --git a/java/src/test/resources/splittable.orc b/java/src/test/resources/splittable.orc
new file mode 100644
index 0000000000000000000000000000000000000000..1f5e094534f2a550cab263dbba58b9cb473c1821
GIT binary patch
literal 385961
zcmeI%J!qT-7zgn8E?;l1X_K5^NxDet5ClU;i4-nmacV+02XWA`;OKe{4z`0vMTHtc
zZ5LY$R$Fwi)xl)ZDky$JZFOk7s4Z5|D#c5R*!KnD86WT$n%?E%f9b>T2}#O-Y475A
zQM8(^t@q=-r!%c)QM9Ma)ji9_)Xw2z>ut8vOKrZ*b|yC;x=YQ4*1fI%qI<PJl07+J
zbbsu9u7O7XY-eDoC~lO+jgCJA2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5Fqfc
zz(%K7?6tOo<=|K`v2E+IG4W7qsBDZiCeF>bn`N>2uH;oW6Cgl<009C72oNAZfB*pk
z1PBlyK!5-N0t5&UAV464fRZ=E73@HO009C72oNAZfB*pk1PBlyK!5-N0t5&UAV7cs
zfuMkrHwYz0fB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+0D%kwO5O}tumb@C1PBly
zK!5-N0t5&UAV7cs0RjXF5FkK+009C7f&xn3Ae0yZ0t5&UAV7cs0RjXF5FkK+009C7
z2oNAZfB*pk1TqLHc{5zW4g?4gAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBla3MhGl
zP+|lK5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&U$RMEP&2R-f5FkK+009C72oNAZ
zfB*pk1PBlyK!5-N0t5&UAV44}pyUlgi4h<`fB*pk1PBlyK!5-N0t5&UAV7cs0RjXF
z5FkJxgMgAZ!xij6fB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+0D+)@k~at?Mt}eT
z0t5&UAV7cs0RjXF5FkK+009C72oNAZfB=CE0!rQtSFi&C0t5&UAV7cs0RjXF5FkK+
z009C72oNAZfB*pk1cCxe-XN410RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N0t7M$
zD0wqn!43on5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&U2nr~9gHU1w2oNAZfB*pk
z1PBlyK!5-N0t5&UAV7cs0RjXF5Xc~)<jrseI}jj1fB*pk1PBlyK!5-N0t5&UAV7cs
z0RjXF5FkJxD4^sGLWvO|K!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZAcKICH^UX|
zK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZfB=D@fRZ-|B}RY%0RjXF5FkK+009C7
z2oNAZfB*pk1PBlyK!5;&3<65t3|FuN0RjXF5FkK+009C72oNAZfB*pk1PBlyK!5-N
z0tA8rO5Pxp7y$wV2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXv2q<|oT)_?m2oNAZ
zfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5C{q=d4o`51PBlyK!5-N0t5&UAV7cs0RjXF
z5FkK+009C72oT61pybVP1v?NRK!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZASj^Z
z4MK?#AV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyKp=yFk~hN@>_C730RjXF5FkK+
z009C72oNAZfB*pk1PBlyK!5;&pn#G$2qi{<009C72oNAZfB*pk1PBlyK!5-N0t5&U
zAV7csfeZpl-V9f;0|5dA2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjYq0!rQ>lo$a5
z1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C7G6*PnGhD$A1PBlyK!5-N0t5&UAV7cs
z0RjXF5FkK+009C72oMMgD0zcWVgv{fAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBnw
zAfV*Ua0NRMAV7cs0RjXF5FkK+009C72oNAZfB*pk1PBlyKp-ff<PAcJ5g<T-009C7
z2oNAZfB*pk1PBlyK!5-N0t5&UAV464fRZ=E73@HO009C72oNAZfB*pk1PBlyK!5-N
z0t5&UAV7csfuMkrHwYz0fB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+0D%kwO5O}t
zumb@C1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+009C7f&xn3Ae0yZ0t5&UAV7cs0RjXF
z5FkK+009C72oNAZfB*pk1TqLHc{5zW4g?4gAV7cs0RjXF5FkK+009C72oNAZfB*pk
z1PBla3MhGlP+|lK5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&U$RMEP&2R-f5FkK+
z009C72oNAZfB*pk1PBlyK!5-N0t5&UAV44}pyUlgi4h<`fB*pk1PBlyK!5-N0t5&U
zAV7cs0RjXF5FkJxgMgAZ!xij6fB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5FkK+0D+)@
zk~at?Mt}eT0t5&UAV7cs0RjXF5FkK+009C72oNAZfB=CE0!rQtSFi&C0t5&UAV7cs
z0RjXF5FkK+009C72oNAZfB*pk1cCxe-XN410RjXF5FkK+009C72oNAZfB*pk1PBly
zK!5-N0t7M$D0wqn!43on5FkK+009C72oNAZfB*pk1PBlyK!5-N0t5&U2nr~9gHU1w
z2oNAZfB*pk1PBlyK!5-N0t5&UAV7cs0RjXF5Xc~)<jrseI}jj1fB*pk1PBlyK!5-N
z0t5&UAV7cs0RjXF5FkJxD4^sGLWvO|K!5-N0t5&UAV7cs0RjXF5FkK+009C72oNAZ
zAcH`#<bAkZwC-ySmancgmW$S|n~zgFhl{PZ+0OVsPbN2?beGx-;+qm=e9ibG@QuTF
z0zW<cIQSp^POy8ZDEr1vAHQpAd++hFiF5OJ-!s$u?P+R9dHI3r$7i3o_2bd<?ROg0
z!Arl?27euGRuey5tPM6^ul%E<<+1N;gVjU*)z9(Um$kun&kj@*ADpcXjvN@QCJwCE
z20wR)s);Y&uMOV&y*7B|WLbU5%9YyS(y`%c;*(3Y!HL7IYU0K7wZWf<wp9}szN!t@
zpBt$rj(%DjJace+HF0gdHu!e`j%wn}>Du7%AGN_#r$(zU`RPh+@cEl#)x^7(YlF2T
z<JH9e3$?+;m)q6E$6wb5Z#{ooHSxk{wZZj+w^tMAH)?~AR_>@K-Z)bmJbk@3`1RDC
z)t7v8_13}kNIBAOG^Pd~dGzspbJP9hV7r+6@X6itgJtjU<zjx<&By=zugU#u`xZK@
idmbDpUmF^0PIS9t)6HUJ=0tmLXysa8-_qX2$-e<?uJaiH

literal 0
HcmV?d00001


From 4ce6674641def5a68dce633d3a21f17438ae48de Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Mon, 6 May 2024 14:43:20 -0500
Subject: [PATCH 152/842] Return `int64` when pandas compatible mode is turned
 on for `get_indexer` (#15659)

Fixes: #15658

This PR makes a change to `get_indexer` to return `int64` indices when pandas compatible mode is turned on.

Forks out of https://github.com/rapidsai/cudf/pull/14534

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/15659
---
 python/cudf/cudf/core/_base_index.py |  6 ++++++
 python/cudf/cudf/core/index.py       |  8 ++++----
 python/cudf/cudf/core/multiindex.py  |  7 ++++---
 python/cudf/cudf/tests/test_index.py | 20 ++++++++++++++++++++
 4 files changed, 34 insertions(+), 7 deletions(-)

diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py
index b5630ff9a54..fe0f39f9d0a 100644
--- a/python/cudf/cudf/core/_base_index.py
+++ b/python/cudf/cudf/core/_base_index.py
@@ -2205,3 +2205,9 @@ def _split(self, splits):
 
 def _get_result_name(left_name, right_name):
     return left_name if _is_same_name(left_name, right_name) else None
+
+
+def _return_get_indexer_result(result):
+    if cudf.get_option("mode.pandas_compatible"):
+        return result.astype("int64")
+    return result
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index b51751a1b55..a2ad10a0590 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -38,7 +38,7 @@
     is_list_like,
     is_scalar,
 )
-from cudf.core._base_index import BaseIndex
+from cudf.core._base_index import BaseIndex, _return_get_indexer_result
 from cudf.core._compat import PANDAS_LT_300
 from cudf.core.column import (
     CategoricalColumn,
@@ -1256,11 +1256,11 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
         )
 
         if not len(self):
-            return result.values
+            return _return_get_indexer_result(result.values)
         try:
             lcol, rcol = _match_join_keys(needle, self._column, "inner")
         except ValueError:
-            return result.values
+            return _return_get_indexer_result(result.values)
 
         scatter_map, indices = libcudf.join.join([lcol], [rcol], how="inner")
         (result,) = libcudf.copying.scatter([indices], scatter_map, [result])
@@ -1287,7 +1287,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
                 "{['ffill'/'pad', 'bfill'/'backfill', 'nearest', None]}"
             )
 
-        return result_series.to_cupy()
+        return _return_get_indexer_result(result_series.to_cupy())
 
     @_cudf_nvtx_annotate
     def get_loc(self, key):
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index 1ab42df111f..c3184f51a4c 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -23,6 +23,7 @@
 from cudf.api.extensions import no_default
 from cudf.api.types import is_integer, is_list_like, is_object_dtype
 from cudf.core import column
+from cudf.core._base_index import _return_get_indexer_result
 from cudf.core.frame import Frame
 from cudf.core.index import (
     BaseIndex,
@@ -1858,11 +1859,11 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
             dtype=libcudf.types.size_type_dtype,
         )
         if not len(self):
-            return result.values
+            return _return_get_indexer_result(result.values)
         try:
             target = cudf.MultiIndex.from_tuples(target)
         except TypeError:
-            return result.values
+            return _return_get_indexer_result(result.values)
 
         join_keys = [
             _match_join_keys(lcol, rcol, "inner")
@@ -1892,7 +1893,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
                 "{['ffill'/'pad', 'bfill'/'backfill', None]}"
             )
 
-        return result_series.to_cupy()
+        return _return_get_indexer_result(result_series.to_cupy())
 
     @_cudf_nvtx_annotate
     def get_loc(self, key):
diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py
index 104a5fc0ffa..4ff1beb0a9a 100644
--- a/python/cudf/cudf/tests/test_index.py
+++ b/python/cudf/cudf/tests/test_index.py
@@ -1741,6 +1741,10 @@ def test_get_indexer_single_unique_numeric(idx, key, method):
 
         assert_eq(expected, got)
 
+        with cudf.option_context("mode.pandas_compatible", True):
+            got = gi.get_indexer(key, method=method)
+        assert_eq(expected, got, check_dtype=True)
+
 
 @pytest.mark.parametrize(
     "idx",
@@ -1770,6 +1774,12 @@ def test_get_indexer_rangeindex(idx, key, method, tolerance):
 
     assert_eq(expected, got)
 
+    with cudf.option_context("mode.pandas_compatible", True):
+        got = gi.get_indexer(
+            key, method=method, tolerance=None if method is None else tolerance
+        )
+    assert_eq(expected, got, check_dtype=True)
+
 
 @pytest.mark.parametrize(
     "idx",
@@ -1950,6 +1960,11 @@ def test_get_indexer_single_duplicate_string(idx, key, method):
 
         assert_eq(expected, got)
 
+        with cudf.option_context("mode.pandas_compatible", True):
+            got = gi.get_indexer(key, method=method)
+
+        assert_eq(expected, got, check_dtype=True)
+
 
 @pytest.mark.parametrize(
     "idx",
@@ -2009,6 +2024,11 @@ def test_get_indexer_multi_numeric(idx, key, method):
 
     assert_eq(expected, got)
 
+    with cudf.option_context("mode.pandas_compatible", True):
+        got = gi.get_indexer(key, method=method)
+
+    assert_eq(expected, got, check_dtype=True)
+
 
 @pytest.mark.parametrize(
     "idx",

From bc3071ed9c730333f71c7143f1a53abda42b0b56 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Mon, 6 May 2024 15:12:06 -0700
Subject: [PATCH 153/842] Improve distinct join with set `retrieve` (#15636)

This PR updates the distinct join to use `static_set::retrieve` instead of the custom device code.

Authors:
  - Yunsong Wang (https://github.com/PointKernel)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - David Wendt (https://github.com/davidwendt)
  - Nghia Truong (https://github.com/ttnghia)
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - Karthikeyan (https://github.com/karthikeyann)
  - Shruti Shivakumar (https://github.com/shrshi)

URL: https://github.com/rapidsai/cudf/pull/15636
---
 .../cudf/detail/distinct_hash_join.cuh        |  12 +-
 cpp/src/join/distinct_hash_join.cu            | 192 ++----------------
 2 files changed, 19 insertions(+), 185 deletions(-)

diff --git a/cpp/include/cudf/detail/distinct_hash_join.cuh b/cpp/include/cudf/detail/distinct_hash_join.cuh
index 93d52d5dda3..de3d23e9470 100644
--- a/cpp/include/cudf/detail/distinct_hash_join.cuh
+++ b/cpp/include/cudf/detail/distinct_hash_join.cuh
@@ -85,16 +85,10 @@ struct hasher_adapter {
 template <cudf::has_nested HasNested>
 struct distinct_hash_join {
  private:
-  /// Row equality type for nested columns
-  using nested_row_equal = cudf::experimental::row::equality::strong_index_comparator_adapter<
-    cudf::experimental::row::equality::device_row_comparator<true, cudf::nullate::DYNAMIC>>;
-  /// Row equality type for flat columns
-  using flat_row_equal = cudf::experimental::row::equality::strong_index_comparator_adapter<
-    cudf::experimental::row::equality::device_row_comparator<false, cudf::nullate::DYNAMIC>>;
-
   /// Device row equal type
-  using d_equal_type =
-    std::conditional_t<HasNested == cudf::has_nested::YES, nested_row_equal, flat_row_equal>;
+  using d_equal_type = cudf::experimental::row::equality::strong_index_comparator_adapter<
+    cudf::experimental::row::equality::device_row_comparator<HasNested == cudf::has_nested::YES,
+                                                             cudf::nullate::DYNAMIC>>;
   using hasher              = hasher_adapter<thrust::identity<hash_value_type>>;
   using probing_scheme_type = cuco::linear_probing<1, hasher>;
   using cuco_storage_type   = cuco::storage<1>;
diff --git a/cpp/src/join/distinct_hash_join.cu b/cpp/src/join/distinct_hash_join.cu
index a3652942973..ad401bdccba 100644
--- a/cpp/src/join/distinct_hash_join.cu
+++ b/cpp/src/join/distinct_hash_join.cu
@@ -46,8 +46,6 @@ namespace cudf {
 namespace detail {
 namespace {
 
-static auto constexpr DISTINCT_JOIN_BLOCK_SIZE = 256;
-
 template <cudf::has_nested HasNested>
 auto prepare_device_equal(
   std::shared_ptr<cudf::experimental::row::equality::preprocessed_table> build,
@@ -82,7 +80,7 @@ class build_keys_fn {
 
 /**
  * @brief Device output transform functor to construct `size_type` with `cuco::pair<hash_value_type,
- * lhs_index_type>`
+ * lhs_index_type>` or `cuco::pair<hash_value_type, rhs_index_type>`
  */
 struct output_fn {
   __device__ constexpr cudf::size_type operator()(
@@ -90,167 +88,12 @@ struct output_fn {
   {
     return static_cast<cudf::size_type>(x.second);
   }
-};
-
-template <typename Tile>
-__device__ void flush_buffer(Tile const& tile,
-                             cudf::size_type tile_count,
-                             cuco::pair<cudf::size_type, cudf::size_type>* buffer,
-                             cudf::size_type* counter,
-                             cudf::size_type* build_indices,
-                             cudf::size_type* probe_indices)
-{
-  cudf::size_type offset;
-  auto const lane_id = tile.thread_rank();
-  if (0 == lane_id) { offset = atomicAdd(counter, tile_count); }
-  offset = tile.shfl(offset, 0);
-
-  for (cudf::size_type i = lane_id; i < tile_count; i += tile.size()) {
-    auto const& [build_idx, probe_idx] = buffer[i];
-    *(build_indices + offset + i)      = build_idx;
-    *(probe_indices + offset + i)      = probe_idx;
-  }
-}
-
-__device__ void flush_buffer(cooperative_groups::thread_block const& block,
-                             cudf::size_type buffer_size,
-                             cuco::pair<cudf::size_type, cudf::size_type>* buffer,
-                             cudf::size_type* counter,
-                             cudf::size_type* build_indices,
-                             cudf::size_type* probe_indices)
-{
-  auto i = block.thread_rank();
-  __shared__ cudf::size_type offset;
-
-  if (i == 0) { offset = atomicAdd(counter, buffer_size); }
-  block.sync();
-
-  while (i < buffer_size) {
-    auto const& [build_idx, probe_idx] = buffer[i];
-    *(build_indices + offset + i)      = build_idx;
-    *(probe_indices + offset + i)      = probe_idx;
-
-    i += block.size();
-  }
-}
-
-// TODO: custom kernel to be replaced by cuco::static_set::retrieve
-template <typename Iter, typename HashTable>
-CUDF_KERNEL void distinct_join_probe_kernel(Iter iter,
-                                            cudf::size_type n,
-                                            HashTable hash_table,
-                                            cudf::size_type* counter,
-                                            cudf::size_type* build_indices,
-                                            cudf::size_type* probe_indices)
-{
-  namespace cg = cooperative_groups;
-
-  auto constexpr tile_size   = HashTable::cg_size;
-  auto constexpr window_size = HashTable::window_size;
-
-  auto idx          = cudf::detail::grid_1d::global_thread_id() / tile_size;
-  auto const stride = cudf::detail::grid_1d::grid_stride() / tile_size;
-  auto const block  = cg::this_thread_block();
-
-  // CG-based probing algorithm
-  if constexpr (tile_size != 1) {
-    auto const tile = cg::tiled_partition<tile_size>(block);
-
-    auto constexpr flushing_tile_size = cudf::detail::warp_size / window_size;
-    // random choice to tune
-    auto constexpr flushing_buffer_size = 2 * flushing_tile_size;
-    auto constexpr num_flushing_tiles   = DISTINCT_JOIN_BLOCK_SIZE / flushing_tile_size;
-    auto constexpr max_matches          = flushing_tile_size / tile_size;
-
-    auto const flushing_tile    = cg::tiled_partition<flushing_tile_size>(block);
-    auto const flushing_tile_id = block.thread_rank() / flushing_tile_size;
-
-    __shared__ cuco::pair<cudf::size_type, cudf::size_type>
-      flushing_tile_buffer[num_flushing_tiles][flushing_tile_size];
-    // per flushing-tile counter to track number of filled elements
-    __shared__ cudf::size_type flushing_counter[num_flushing_tiles];
-
-    if (flushing_tile.thread_rank() == 0) { flushing_counter[flushing_tile_id] = 0; }
-    flushing_tile.sync();  // sync still needed since cg.any doesn't imply a memory barrier
-
-    while (flushing_tile.any(idx < n)) {
-      bool active_flag = idx < n;
-      auto const active_flushing_tile =
-        cg::binary_partition<flushing_tile_size>(flushing_tile, active_flag);
-      if (active_flag) {
-        auto const found = hash_table.find(tile, *(iter + idx));
-        if (tile.thread_rank() == 0 and found != hash_table.end()) {
-          auto const offset = atomicAdd_block(&flushing_counter[flushing_tile_id], 1);
-          flushing_tile_buffer[flushing_tile_id][offset] = cuco::pair{
-            static_cast<cudf::size_type>(found->second), static_cast<cudf::size_type>(idx)};
-        }
-      }
-
-      flushing_tile.sync();
-      if (flushing_counter[flushing_tile_id] + max_matches > flushing_buffer_size) {
-        flush_buffer(flushing_tile,
-                     flushing_counter[flushing_tile_id],
-                     flushing_tile_buffer[flushing_tile_id],
-                     counter,
-                     build_indices,
-                     probe_indices);
-        flushing_tile.sync();
-        if (flushing_tile.thread_rank() == 0) { flushing_counter[flushing_tile_id] = 0; }
-        flushing_tile.sync();
-      }
-
-      idx += stride;
-    }  // while
-
-    if (flushing_counter[flushing_tile_id] > 0) {
-      flush_buffer(flushing_tile,
-                   flushing_counter[flushing_tile_id],
-                   flushing_tile_buffer[flushing_tile_id],
-                   counter,
-                   build_indices,
-                   probe_indices);
-    }
-  }
-  // Scalar probing for CG size 1
-  else {
-    using block_scan = cub::BlockScan<cudf::size_type, DISTINCT_JOIN_BLOCK_SIZE>;
-    __shared__ typename block_scan::TempStorage block_scan_temp_storage;
-
-    auto constexpr buffer_capacity = 2 * DISTINCT_JOIN_BLOCK_SIZE;
-    __shared__ cuco::pair<cudf::size_type, cudf::size_type> buffer[buffer_capacity];
-    cudf::size_type buffer_size = 0;
-
-    while (idx - block.thread_rank() < n) {  // the whole thread block falls into the same iteration
-      auto const found     = idx < n ? hash_table.find(*(iter + idx)) : hash_table.end();
-      auto const has_match = found != hash_table.end();
-
-      // Use a whole-block scan to calculate the output location
-      cudf::size_type offset;
-      cudf::size_type block_count;
-      block_scan(block_scan_temp_storage)
-        .ExclusiveSum(static_cast<cudf::size_type>(has_match), offset, block_count);
-
-      if (buffer_size + block_count > buffer_capacity) {
-        flush_buffer(block, buffer_size, buffer, counter, build_indices, probe_indices);
-        block.sync();
-        buffer_size = 0;
-      }
-
-      if (has_match) {
-        buffer[buffer_size + offset] = cuco::pair{static_cast<cudf::size_type>(found->second),
-                                                  static_cast<cudf::size_type>(idx)};
-      }
-      buffer_size += block_count;
-      block.sync();
-
-      idx += stride;
-    }  // while
-
-    if (buffer_size > 0) {
-      flush_buffer(block, buffer_size, buffer, counter, build_indices, probe_indices);
-    }
+  __device__ constexpr cudf::size_type operator()(
+    cuco::pair<hash_value_type, rhs_index_type> const& x) const
+  {
+    return static_cast<cudf::size_type>(x.second);
   }
-}
+};
 }  // namespace
 
 template <cudf::has_nested HasNested>
@@ -332,19 +175,16 @@ distinct_hash_join<HasNested>::inner_join(rmm::cuda_stream_view stream,
   auto const d_probe_hasher = probe_row_hasher.device_hasher(nullate::DYNAMIC{this->_has_nulls});
   auto const iter           = cudf::detail::make_counting_transform_iterator(
     0, build_keys_fn<decltype(d_probe_hasher), rhs_index_type>{d_probe_hasher});
-  auto counter = rmm::device_scalar<cudf::size_type>{stream};
-  counter.set_value_to_zero_async(stream);
-
-  cudf::detail::grid_1d grid{probe_table_num_rows, DISTINCT_JOIN_BLOCK_SIZE};
-  distinct_join_probe_kernel<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
-    iter,
-    probe_table_num_rows,
-    this->_hash_table.ref(cuco::find),
-    counter.data(),
-    build_indices->data(),
-    probe_indices->data());
-
-  auto const actual_size = counter.value(stream);
+
+  auto const build_indices_begin =
+    thrust::make_transform_output_iterator(build_indices->begin(), output_fn{});
+  auto const probe_indices_begin =
+    thrust::make_transform_output_iterator(probe_indices->begin(), output_fn{});
+
+  auto const [probe_indices_end, _] = this->_hash_table.retrieve(
+    iter, iter + probe_table_num_rows, probe_indices_begin, build_indices_begin, stream.value());
+
+  auto const actual_size = std::distance(probe_indices_begin, probe_indices_end);
   build_indices->resize(actual_size, stream);
   probe_indices->resize(actual_size, stream);
 

From dcd0d6b97da07db36f5c8c9fa0e33ac54dcbcaf0 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Mon, 6 May 2024 18:05:58 -0500
Subject: [PATCH 154/842] Remove host_parse_nested_json. (#15674)

This PR addresses a task from #15537 to remove the `host_parse_nested_json` code path and corresponding tests. See discussion in https://github.com/rapidsai/cudf/pull/15568#issuecomment-2067024223.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Mark Harris (https://github.com/harrism)

URL: https://github.com/rapidsai/cudf/pull/15674
---
 cpp/src/io/json/nested_json.hpp   | 23 +++++++------------
 cpp/src/io/json/read_json.cu      |  1 -
 cpp/tests/io/nested_json_test.cpp | 37 +++++++++++--------------------
 3 files changed, 21 insertions(+), 40 deletions(-)

diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp
index 52ea23c7f1c..5817a01c21f 100644
--- a/cpp/src/io/json/nested_json.hpp
+++ b/cpp/src/io/json/nested_json.hpp
@@ -302,9 +302,16 @@ reduce_to_column_tree(tree_meta_t& tree,
 cudf::io::parse_options parsing_options(cudf::io::json_reader_options const& options,
                                         rmm::cuda_stream_view stream);
 
-/** @copydoc host_parse_nested_json
+/**
+ * @brief Parses the given JSON string and generates table from the given input.
+ *
  * All processing is done in device memory.
  *
+ * @param input The JSON input
+ * @param options Parsing options specifying the parsing behaviour
+ * @param stream The CUDA stream to which kernels are dispatched
+ * @param mr Optional, resource with which to allocate
+ * @return The data parsed from the given JSON input
  */
 table_with_metadata device_parse_nested_json(device_span<SymbolT const> input,
                                              cudf::io::json_reader_options const& options,
@@ -337,20 +344,6 @@ struct path_from_tree {
   std::vector<path_rep> get_path(NodeIndexT this_col_id);
 };
 
-/**
- * @brief Parses the given JSON string and generates table from the given input.
- *
- * @param input The JSON input
- * @param options Parsing options specifying the parsing behaviour
- * @param stream The CUDA stream to which kernels are dispatched
- * @param mr Optional, resource with which to allocate
- * @return The data parsed from the given JSON input
- */
-table_with_metadata host_parse_nested_json(device_span<SymbolT const> input,
-                                           cudf::io::json_reader_options const& options,
-                                           rmm::cuda_stream_view stream,
-                                           rmm::device_async_resource_ref mr);
-
 }  // namespace detail
 
 }  // namespace cudf::io::json
diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu
index 89c301ec055..0ead5c56264 100644
--- a/cpp/src/io/json/read_json.cu
+++ b/cpp/src/io/json/read_json.cu
@@ -307,7 +307,6 @@ table_with_metadata read_json(host_span<std::unique_ptr<datasource>> sources,
     cudf::device_span<char const>(reinterpret_cast<char const*>(bufview.data()), bufview.size());
   stream.synchronize();
   return device_parse_nested_json(buffer, reader_opts, stream, mr);
-  // For debug purposes, use host_parse_nested_json()
 }
 
 }  // namespace cudf::io::json::detail
diff --git a/cpp/tests/io/nested_json_test.cpp b/cpp/tests/io/nested_json_test.cpp
index 2e2d5cae34c..112ee8fb57b 100644
--- a/cpp/tests/io/nested_json_test.cpp
+++ b/cpp/tests/io/nested_json_test.cpp
@@ -620,15 +620,12 @@ TEST_F(JsonTest, TokenStream2)
   }
 }
 
-struct JsonParserTest : public cudf::test::BaseFixture, public testing::WithParamInterface<bool> {};
-INSTANTIATE_TEST_SUITE_P(IsFullGPU, JsonParserTest, testing::Bool());
+struct JsonParserTest : public cudf::test::BaseFixture {};
 
-TEST_P(JsonParserTest, ExtractColumn)
+TEST_F(JsonParserTest, ExtractColumn)
 {
   using cuio_json::SymbolT;
-  bool const is_full_gpu = GetParam();
-  auto json_parser       = is_full_gpu ? cuio_json::detail::device_parse_nested_json
-                                       : cuio_json::detail::host_parse_nested_json;
+  auto json_parser = cuio_json::detail::device_parse_nested_json;
 
   // Prepare cuda stream for data transfers & kernels
   auto const stream = cudf::get_default_stream();
@@ -867,14 +864,12 @@ TEST_F(JsonTest, PostProcessTokenStream)
   }
 }
 
-TEST_P(JsonParserTest, UTF_JSON)
+TEST_F(JsonParserTest, UTF_JSON)
 {
   // Prepare cuda stream for data transfers & kernels
-  auto const stream      = cudf::get_default_stream();
-  auto mr                = rmm::mr::get_current_device_resource();
-  bool const is_full_gpu = GetParam();
-  auto json_parser       = is_full_gpu ? cuio_json::detail::device_parse_nested_json
-                                       : cuio_json::detail::host_parse_nested_json;
+  auto const stream = cudf::get_default_stream();
+  auto mr           = rmm::mr::get_current_device_resource();
+  auto json_parser  = cuio_json::detail::device_parse_nested_json;
 
   // Default parsing options
   cudf::io::json_reader_options default_options{};
@@ -924,12 +919,10 @@ TEST_P(JsonParserTest, UTF_JSON)
   CUDF_EXPECT_NO_THROW(json_parser(d_utf_pass, default_options, stream, mr));
 }
 
-TEST_P(JsonParserTest, ExtractColumnWithQuotes)
+TEST_F(JsonParserTest, ExtractColumnWithQuotes)
 {
   using cuio_json::SymbolT;
-  bool const is_full_gpu = GetParam();
-  auto json_parser       = is_full_gpu ? cuio_json::detail::device_parse_nested_json
-                                       : cuio_json::detail::host_parse_nested_json;
+  auto json_parser = cuio_json::detail::device_parse_nested_json;
 
   // Prepare cuda stream for data transfers & kernels
   auto const stream = cudf::get_default_stream();
@@ -959,12 +952,10 @@ TEST_P(JsonParserTest, ExtractColumnWithQuotes)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col2, parsed_col2);
 }
 
-TEST_P(JsonParserTest, ExpectFailMixStructAndList)
+TEST_F(JsonParserTest, ExpectFailMixStructAndList)
 {
   using cuio_json::SymbolT;
-  bool const is_full_gpu = GetParam();
-  auto json_parser       = is_full_gpu ? cuio_json::detail::device_parse_nested_json
-                                       : cuio_json::detail::host_parse_nested_json;
+  auto json_parser = cuio_json::detail::device_parse_nested_json;
 
   // Prepare cuda stream for data transfers & kernels
   auto const stream = cudf::get_default_stream();
@@ -1002,12 +993,10 @@ TEST_P(JsonParserTest, ExpectFailMixStructAndList)
   }
 }
 
-TEST_P(JsonParserTest, EmptyString)
+TEST_F(JsonParserTest, EmptyString)
 {
   using cuio_json::SymbolT;
-  bool const is_full_gpu = GetParam();
-  auto json_parser       = is_full_gpu ? cuio_json::detail::device_parse_nested_json
-                                       : cuio_json::detail::host_parse_nested_json;
+  auto json_parser = cuio_json::detail::device_parse_nested_json;
 
   // Prepare cuda stream for data transfers & kernels
   auto const stream = cudf::get_default_stream();

From 5f02cb8e9f5f6431196ac188029c29690518e3f7 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com>
Date: Mon, 6 May 2024 18:38:05 -0500
Subject: [PATCH 155/842] Migrate string `find` operations to `pylibcudf`
 (#15604)

This PR implements libcudf's string `find.hpp` and migrates existing cuDF cython to leverage it.

Authors:
  - https://github.com/brandon-b-miller

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15604
---
 python/cudf/cudf/_lib/cpp/strings/find.pxd    |   7 +-
 .../_lib/pylibcudf/strings/CMakeLists.txt     |   2 +-
 .../cudf/_lib/pylibcudf/strings/__init__.pxd  |   2 +-
 .../cudf/_lib/pylibcudf/strings/__init__.py   |   2 +-
 .../cudf/cudf/_lib/pylibcudf/strings/find.pxd |  38 +++
 .../cudf/cudf/_lib/pylibcudf/strings/find.pyx | 277 ++++++++++++++++++
 python/cudf/cudf/_lib/strings/find.pyx        | 174 +++--------
 .../cudf/pylibcudf_tests/test_string_find.py  | 262 +++++++++++++++++
 8 files changed, 633 insertions(+), 131 deletions(-)
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/find.pxd
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/find.pyx
 create mode 100644 python/cudf/cudf/pylibcudf_tests/test_string_find.py

diff --git a/python/cudf/cudf/_lib/cpp/strings/find.pxd b/python/cudf/cudf/_lib/cpp/strings/find.pxd
index 953d5c30b2a..dfbdebb9651 100644
--- a/python/cudf/cudf/_lib/cpp/strings/find.pxd
+++ b/python/cudf/cudf/_lib/cpp/strings/find.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
@@ -41,6 +41,11 @@ cdef extern from "cudf/strings/find.hpp" namespace "cudf::strings" nogil:
         size_type start,
         size_type stop) except +
 
+    cdef unique_ptr[column] find(
+        column_view source_strings,
+        column_view target,
+        size_type start) except +
+
     cdef unique_ptr[column] rfind(
         column_view source_strings,
         string_scalar target,
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt
index 3a2a9e1e7eb..c42b57ece63 100644
--- a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt
@@ -12,7 +12,7 @@
 # the License.
 # =============================================================================
 
-set(cython_sources case.pyx)
+set(cython_sources case.pyx find.pyx)
 set(linked_libraries cudf::cudf)
 rapids_cython_create_modules(
   CXX
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd
index ff87549b5b5..33e2d56c087 100644
--- a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd
@@ -1,3 +1,3 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from . import case
+from . cimport case, find
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py
index ff87549b5b5..9220f6bd045 100644
--- a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py
+++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py
@@ -1,3 +1,3 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from . import case
+from . import case, find
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/find.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/find.pxd
new file mode 100644
index 00000000000..22e933106c7
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/strings/find.pxd
@@ -0,0 +1,38 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from cudf._lib.cpp.types cimport size_type
+from cudf._lib.pylibcudf.column cimport Column
+from cudf._lib.pylibcudf.scalar cimport Scalar
+
+ctypedef fused ColumnOrScalar:
+    Column
+    Scalar
+
+cpdef Column find(
+    Column input,
+    ColumnOrScalar target,
+    size_type start=*,
+    size_type stop=*
+)
+
+cpdef Column rfind(
+    Column input,
+    Scalar target,
+    size_type start=*,
+    size_type stop=*
+)
+
+cpdef Column contains(
+    Column input,
+    ColumnOrScalar target,
+)
+
+cpdef Column starts_with(
+    Column input,
+    ColumnOrScalar target,
+)
+
+cpdef Column ends_with(
+    Column input,
+    ColumnOrScalar target,
+)
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/find.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/find.pyx
new file mode 100644
index 00000000000..1d94132a8b3
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/strings/find.pyx
@@ -0,0 +1,277 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+
+from cudf._lib.cpp.column.column cimport column
+from cudf._lib.cpp.strings cimport find as cpp_find
+from cudf._lib.pylibcudf.column cimport Column
+from cudf._lib.pylibcudf.scalar cimport Scalar
+
+from cython.operator import dereference
+
+from cudf._lib.cpp.scalar.scalar cimport string_scalar
+
+
+cpdef Column find(
+    Column input,
+    ColumnOrScalar target,
+    size_type start=0,
+    size_type stop=-1
+):
+    """Returns a column of character position values where the target string is
+    first found in each string of the provided column.
+
+    ``target`` may be a
+    :py:class:`~cudf._lib.pylibcudf.column.Column` or a
+    :py:class:`~cudf._lib.pylibcudf.scalar.Scalar`.
+
+    If ``target`` is a scalar, the scalar will be searched for in each string.
+    If ``target`` is a column, the corresponding string in the column will be
+    searched for in each string.
+
+    For details, see :cpp:func:`find`.
+
+    Parameters
+    ----------
+    input : Column
+        The input strings
+    target : Union[Column, Scalar]
+        String to search for in each string
+    start : size_type
+        First character position to include in the search
+    stop : size_type
+        Last position (exclusive) to include in the search. Default of -1 will
+        search to the end of the string.
+
+    Returns
+    -------
+    pylibcudf.Column
+        New integer column with character position values
+    """
+    cdef unique_ptr[column] result
+    if ColumnOrScalar is Column:
+        with nogil:
+            result = move(
+                cpp_find.find(
+                    input.view(),
+                    target.view(),
+                    start
+                )
+            )
+    elif ColumnOrScalar is Scalar:
+        with nogil:
+            result = move(
+                cpp_find.find(
+                    input.view(),
+                    dereference(<string_scalar*>(target.c_obj.get())),
+                    start,
+                    stop
+                )
+            )
+    else:
+        raise ValueError(f"Invalid target {target}")
+
+    return Column.from_libcudf(move(result))
+
+
+cpdef Column rfind(
+    Column input,
+    Scalar target,
+    size_type start=0,
+    size_type stop=-1
+):
+    """
+    Returns a column of character position values where the target string is
+    first found searching from the end of each string.
+
+    For details, see :cpp:func:`rfind`.
+
+    Parameters
+    ----------
+    input : Column
+        The input strings
+    target : Scalar
+        String to search for in each string
+    start : size_type
+        First character position to include in the search
+    stop : size_type
+        Last position (exclusive) to include in the search. Default of -1 will
+        search to the end of the string.
+
+    Returns
+    -------
+    pylibcudf.Column
+        New integer column with character position values
+    """
+    cdef unique_ptr[column] result
+    with nogil:
+        result = move(
+            cpp_find.rfind(
+                input.view(),
+                dereference(<string_scalar*>(target.c_obj.get())),
+                start,
+                stop
+            )
+        )
+    return Column.from_libcudf(move(result))
+
+
+cpdef Column contains(
+    Column input,
+    ColumnOrScalar target,
+):
+    """
+    Returns a column of boolean values for each string where true indicates the
+    corresponding target string was found within that string in the provided
+    column.
+
+    ``target`` may be a
+    :py:class:`~cudf._lib.pylibcudf.column.Column` or a
+    :py:class:`~cudf._lib.pylibcudf.scalar.Scalar`.
+
+    If ``target`` is a scalar, the scalar will be searched for in each string.
+    If ``target`` is a column, the corresponding string in the column will be
+    searched for in each string.
+
+    For details, see :cpp:func:`contains`.
+
+    Parameters
+    ----------
+    input : Column
+        The input strings
+    target : Union[Column, Scalar]
+        String to search for in each string
+
+    Returns
+    -------
+    pylibcudf.Column
+        New boolean column with True for each string that contains the target
+    """
+    cdef unique_ptr[column] result
+    if ColumnOrScalar is Column:
+        with nogil:
+            result = move(
+                cpp_find.contains(
+                    input.view(),
+                    target.view()
+                )
+            )
+    elif ColumnOrScalar is Scalar:
+        with nogil:
+            result = move(
+                cpp_find.contains(
+                    input.view(),
+                    dereference(<string_scalar*>(target.c_obj.get()))
+                )
+            )
+    else:
+        raise ValueError(f"Invalid target {target}")
+
+    return Column.from_libcudf(move(result))
+
+
+cpdef Column starts_with(
+    Column input,
+    ColumnOrScalar target,
+):
+    """
+    Returns a column of boolean values for each string where true indicates the
+    target string was found at the beginning of the string in the provided
+    column.
+
+    ``target`` may be a
+    :py:class:`~cudf._lib.pylibcudf.column.Column` or a
+    :py:class:`~cudf._lib.pylibcudf.scalar.Scalar`.
+
+    If ``target`` is a scalar, the scalar will be searched for in each string.
+    If ``target`` is a column, the corresponding string in the column will be
+    searched for in each string.
+
+    For details, see :cpp:func:`starts_with`.
+
+    Parameters
+    ----------
+    input : Column
+        The input strings
+    target : Union[Column, Scalar]
+        String to search for at the beginning of each string
+
+    Returns
+    -------
+    pylibcudf.Column
+        New boolean column with True for each string that starts with the target
+    """
+    cdef unique_ptr[column] result
+
+    if ColumnOrScalar is Column:
+        with nogil:
+            result = move(
+                cpp_find.starts_with(
+                    input.view(),
+                    target.view()
+                )
+            )
+    elif ColumnOrScalar is Scalar:
+        with nogil:
+            result = move(
+                cpp_find.starts_with(
+                    input.view(),
+                    dereference(<string_scalar*>(target.c_obj.get()))
+                )
+            )
+    else:
+        raise ValueError(f"Invalid target {target}")
+
+    return Column.from_libcudf(move(result))
+
+cpdef Column ends_with(
+    Column input,
+    ColumnOrScalar target,
+):
+    """
+    Returns a column of boolean values for each string where true indicates the
+    target string was found at the end of the string in the provided column.
+
+    ``target`` may be a
+    :py:class:`~cudf._lib.pylibcudf.column.Column` or a
+    :py:class:`~cudf._lib.pylibcudf.scalar.Scalar`.
+
+    If ``target`` is a scalar, the scalar will be searched for in each string.
+    If ``target`` is a column, the corresponding string in the column will be
+    searched for in each string.
+
+    For details, see :cpp:func:`ends_with`.
+
+    Parameters
+    ----------
+    input : Column
+        The input strings
+    target : Union[Column, Scalar]
+        String to search for at the end of each string
+
+    Returns
+    -------
+    pylibcudf.Column
+        New boolean column with True for each string that ends with the target
+    """
+    cdef unique_ptr[column] result
+    if ColumnOrScalar is Column:
+        with nogil:
+            result = move(
+                cpp_find.ends_with(
+                    input.view(),
+                    target.view()
+                )
+            )
+    elif ColumnOrScalar is Scalar:
+        with nogil:
+            result = move(
+                cpp_find.ends_with(
+                    input.view(),
+                    dereference(<string_scalar*>(target.c_obj.get()))
+                )
+            )
+    else:
+        raise ValueError(f"Invalid target {target}")
+
+    return Column.from_libcudf(move(result))
diff --git a/python/cudf/cudf/_lib/strings/find.pyx b/python/cudf/cudf/_lib/strings/find.pyx
index f6dd3b80de9..341776b102c 100644
--- a/python/cudf/cudf/_lib/strings/find.pyx
+++ b/python/cudf/cudf/_lib/strings/find.pyx
@@ -1,23 +1,10 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
-
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
+import cudf._lib.pylibcudf as plc
 from cudf.core.buffer import acquire_spill_lock
 
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.scalar.scalar cimport string_scalar
-from cudf._lib.cpp.strings.find cimport (
-    contains as cpp_contains,
-    ends_with as cpp_ends_with,
-    find as cpp_find,
-    rfind as cpp_rfind,
-    starts_with as cpp_starts_with,
-)
 from cudf._lib.cpp.types cimport size_type
-from cudf._lib.scalar cimport DeviceScalar
 
 
 @acquire_spill_lock()
@@ -26,23 +13,13 @@ def contains(Column source_strings, object py_target):
     Returns a Column of boolean values with True for `source_strings`
     that contain the pattern given in `py_target`.
     """
-    cdef DeviceScalar target = py_target.device_value
-
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    cdef const string_scalar* scalar_str = <const string_scalar*>(
-        target.get_raw_ptr()
+    return Column.from_pylibcudf(
+        plc.strings.find.contains(
+            source_strings.to_pylibcudf(mode="read"),
+            py_target.device_value.c_value
+        )
     )
 
-    with nogil:
-        c_result = move(cpp_contains(
-            source_view,
-            scalar_str[0]
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
-
 
 @acquire_spill_lock()
 def contains_multiple(Column source_strings, Column target_strings):
@@ -50,17 +27,12 @@ def contains_multiple(Column source_strings, Column target_strings):
     Returns a Column of boolean values with True for `source_strings`
     that contain the corresponding string in `target_strings`.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-    cdef column_view target_view = target_strings.view()
-
-    with nogil:
-        c_result = move(cpp_contains(
-            source_view,
-            target_view
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    return Column.from_pylibcudf(
+        plc.strings.find.contains(
+            source_strings.to_pylibcudf(mode="read"),
+            target_strings.to_pylibcudf(mode="read")
+        )
+    )
 
 
 @acquire_spill_lock()
@@ -70,23 +42,13 @@ def endswith(Column source_strings, object py_target):
     that contain strings that end with the pattern given in `py_target`.
     """
 
-    cdef DeviceScalar target = py_target.device_value
-
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    cdef const string_scalar* scalar_str = <const string_scalar*>(
-        target.get_raw_ptr()
+    return Column.from_pylibcudf(
+        plc.strings.find.ends_with(
+            source_strings.to_pylibcudf(mode="read"),
+            py_target.device_value.c_value
+        )
     )
 
-    with nogil:
-        c_result = move(cpp_ends_with(
-            source_view,
-            scalar_str[0]
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
-
 
 @acquire_spill_lock()
 def endswith_multiple(Column source_strings, Column target_strings):
@@ -95,17 +57,12 @@ def endswith_multiple(Column source_strings, Column target_strings):
     that contain strings that end with corresponding location
     in `target_strings`.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-    cdef column_view target_view = target_strings.view()
-
-    with nogil:
-        c_result = move(cpp_ends_with(
-            source_view,
-            target_view
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    return Column.from_pylibcudf(
+        plc.strings.find.ends_with(
+            source_strings.to_pylibcudf(mode="read"),
+            target_strings.to_pylibcudf(mode="read")
+        )
+    )
 
 
 @acquire_spill_lock()
@@ -114,24 +71,13 @@ def startswith(Column source_strings, object py_target):
     Returns a Column of boolean values with True for `source_strings`
     that contain strings that start with the pattern given in `py_target`.
     """
-
-    cdef DeviceScalar target = py_target.device_value
-
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    cdef const string_scalar* scalar_str = <const string_scalar*>(
-        target.get_raw_ptr()
+    return Column.from_pylibcudf(
+        plc.strings.find.starts_with(
+            source_strings.to_pylibcudf(mode="read"),
+            py_target.device_value.c_value
+        )
     )
 
-    with nogil:
-        c_result = move(cpp_starts_with(
-            source_view,
-            scalar_str[0]
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
-
 
 @acquire_spill_lock()
 def startswith_multiple(Column source_strings, Column target_strings):
@@ -140,17 +86,12 @@ def startswith_multiple(Column source_strings, Column target_strings):
     that contain strings that begin with corresponding location
     in `target_strings`.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-    cdef column_view target_view = target_strings.view()
-
-    with nogil:
-        c_result = move(cpp_starts_with(
-            source_view,
-            target_view
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    return Column.from_pylibcudf(
+        plc.strings.find.starts_with(
+            source_strings.to_pylibcudf(mode="read"),
+            target_strings.to_pylibcudf(mode="read")
+        )
+    )
 
 
 @acquire_spill_lock()
@@ -164,25 +105,14 @@ def find(Column source_strings,
     Scan portion of strings in `source_strings` can be
     controlled by setting `start` and `end` values.
     """
-
-    cdef DeviceScalar target = py_target.device_value
-
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    cdef const string_scalar* scalar_str = <const string_scalar*>(
-        target.get_raw_ptr()
-    )
-
-    with nogil:
-        c_result = move(cpp_find(
-            source_view,
-            scalar_str[0],
+    return Column.from_pylibcudf(
+        plc.strings.find.find(
+            source_strings.to_pylibcudf(mode="read"),
+            py_target.device_value.c_value,
             start,
             end
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+        )
+    )
 
 
 @acquire_spill_lock()
@@ -197,21 +127,11 @@ def rfind(Column source_strings,
     controlled by setting `start` and `end` values.
     """
 
-    cdef DeviceScalar target = py_target.device_value
-
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    cdef const string_scalar* scalar_str = <const string_scalar*>(
-        target.get_raw_ptr()
-    )
-
-    with nogil:
-        c_result = move(cpp_rfind(
-            source_view,
-            scalar_str[0],
+    return Column.from_pylibcudf(
+        plc.strings.find.rfind(
+            source_strings.to_pylibcudf(mode="read"),
+            py_target.device_value.c_value,
             start,
             end
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+        )
+    )
diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_find.py b/python/cudf/cudf/pylibcudf_tests/test_string_find.py
new file mode 100644
index 00000000000..f44c4af9bfc
--- /dev/null
+++ b/python/cudf/cudf/pylibcudf_tests/test_string_find.py
@@ -0,0 +1,262 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pytest
+from utils import assert_column_eq
+
+import cudf._lib.pylibcudf as plc
+
+
+@pytest.fixture(scope="module")
+def pa_data_col():
+    return pa.array(
+        [
+            "abc123",
+            "ABC123",
+            "aBc123",
+            "",
+            " ",
+            None,
+            "a",
+            None,
+            "abc123",
+            "ABC123",
+            "aBc123",
+            "",
+            " ",
+            None,
+            "a",
+            None,
+            "abc123",
+            "ABC123",
+            "aBc123",
+            "",
+            " ",
+            None,
+            "a",
+            None,
+            "abc123",
+            "ABC123",
+            "aBc123",
+            "",
+            " ",
+            None,
+            "a",
+            None,
+            "abc123",
+            "ABC123",
+            "aBc123",
+            "",
+            " ",
+            None,
+            "a",
+            None,
+        ]
+    )
+
+
+@pytest.fixture(scope="module")
+def plc_data_col(pa_data_col):
+    return plc.interop.from_arrow(pa_data_col)
+
+
+@pytest.fixture(scope="module")
+def pa_target_col():
+    return pa.array(
+        [
+            "a",
+            "B",
+            "x",
+            "1",
+            " ",
+            "a",
+            None,
+            None,  # find
+            "a",
+            "B",
+            "x",
+            "1",
+            " ",
+            "a",
+            None,
+            None,  # rfind
+            "ab",
+            "12",
+            "BC",
+            "",
+            " ",
+            "a",
+            None,
+            None,  # contains
+            "ab",
+            "ABC",
+            "AB",
+            "",
+            " ",
+            "a",
+            None,
+            None,  # starts_with
+            "3",
+            "23",
+            "a23",
+            "",
+            " ",
+            "a",
+            None,
+            None,  # ends_with
+        ]
+    )
+
+
+@pytest.fixture(scope="module")
+def plc_target_col(pa_target_col):
+    return plc.interop.from_arrow(pa_target_col)
+
+
+@pytest.fixture(params=["a", " ", "A", "Ab", "23"], scope="module")
+def pa_target_scalar(request):
+    return pa.scalar(request.param, type=pa.string())
+
+
+@pytest.fixture(scope="module")
+def plc_target_scalar(pa_target_scalar):
+    return plc.interop.from_arrow(pa_target_scalar)
+
+
+def test_find(pa_data_col, plc_data_col, pa_target_scalar, plc_target_scalar):
+    got = plc.strings.find.find(plc_data_col, plc_target_scalar, 0, -1)
+
+    expected = pa.array(
+        [
+            elem.find(pa_target_scalar.as_py()) if elem is not None else None
+            for elem in pa_data_col.to_pylist()
+        ],
+        type=pa.int32(),
+    )
+
+    assert_column_eq(got, expected)
+
+
+def colwise_apply(pa_data_col, pa_target_col, operator):
+    def handle_none(st, target):
+        # Match libcudf handling of nulls
+        if st is None:
+            return None
+        elif target is None:
+            return False
+        else:
+            return operator(st, target)
+
+    expected = pa.array(
+        [
+            handle_none(elem, target)
+            for elem, target in zip(
+                pa_data_col.to_pylist(),
+                pa_target_col.to_pylist(),
+            )
+        ],
+        type=pa.bool_(),
+    )
+
+    return expected
+
+
+def test_find_column(pa_data_col, pa_target_col, plc_data_col, plc_target_col):
+    expected = pa.array(
+        [
+            elem.find(target) if not (elem is None or target is None) else None
+            for elem, target in zip(
+                pa_data_col.to_pylist(),
+                pa_target_col.to_pylist(),
+            )
+        ],
+        type=pa.int32(),
+    )
+
+    got = plc.strings.find.find(plc_data_col, plc_target_col, 0)
+    assert_column_eq(got, expected)
+
+
+def test_rfind(pa_data_col, plc_data_col, pa_target_scalar, plc_target_scalar):
+    py_target = pa_target_scalar.as_py()
+
+    got = plc.strings.find.rfind(plc_data_col, plc_target_scalar, 0, -1)
+
+    expected = pa.array(
+        [
+            elem.rfind(py_target)
+            if not (elem is None or py_target is None)
+            else None
+            for elem in pa_data_col.to_pylist()
+        ],
+        type=pa.int32(),
+    )
+
+    assert_column_eq(got, expected)
+
+
+def test_contains(
+    pa_data_col, plc_data_col, pa_target_scalar, plc_target_scalar
+):
+    py_target = pa_target_scalar.as_py()
+
+    got = plc.strings.find.contains(plc_data_col, plc_target_scalar)
+    expected = pa.array(
+        [
+            py_target in elem
+            if not (elem is None or py_target is None)
+            else None
+            for elem in pa_data_col.to_pylist()
+        ],
+        type=pa.bool_(),
+    )
+
+    assert_column_eq(got, expected)
+
+
+def test_contains_column(
+    pa_data_col, pa_target_col, plc_data_col, plc_target_col
+):
+    expected = colwise_apply(
+        pa_data_col, pa_target_col, lambda st, target: target in st
+    )
+    got = plc.strings.find.contains(plc_data_col, plc_target_col)
+    assert_column_eq(got, expected)
+
+
+def test_starts_with(
+    pa_data_col, plc_data_col, pa_target_scalar, plc_target_scalar
+):
+    py_target = pa_target_scalar.as_py()
+    got = plc.strings.find.starts_with(plc_data_col, plc_target_scalar)
+    expected = pa.compute.starts_with(pa_data_col, py_target)
+    assert_column_eq(got, expected)
+
+
+def test_starts_with_column(
+    pa_data_col, pa_target_col, plc_data_col, plc_target_col
+):
+    expected = colwise_apply(
+        pa_data_col, pa_target_col, lambda st, target: st.startswith(target)
+    )
+    got = plc.strings.find.starts_with(plc_data_col, plc_target_col)
+    assert_column_eq(got, expected)
+
+
+def test_ends_with(
+    pa_data_col, plc_data_col, pa_target_scalar, plc_target_scalar
+):
+    py_target = pa_target_scalar.as_py()
+    got = plc.strings.find.ends_with(plc_data_col, plc_target_scalar)
+    expected = pa.compute.ends_with(pa_data_col, py_target)
+    assert_column_eq(got, expected)
+
+
+def test_ends_with_column(
+    pa_data_col, pa_target_col, plc_data_col, plc_target_col
+):
+    expected = colwise_apply(
+        pa_data_col, pa_target_col, lambda st, target: st.endswith(target)
+    )
+    got = plc.strings.find.ends_with(plc_data_col, plc_target_col)
+    assert_column_eq(got, expected)

From d5ad366e9787999f00450ec858b5d18b813b3106 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Mon, 6 May 2024 18:51:17 -0500
Subject: [PATCH 156/842] Fix Index contains for error validations and float vs
 int comparisons (#15657)

Fixes: #15656

This PR:

- [x] Raises error for non-hashable values passed to `__contains__`
- [x] Fixes comparison of float values with int columns

Forks out of https://github.com/rapidsai/cudf/pull/14534

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/15657
---
 python/cudf/cudf/core/_base_index.py      |  1 +
 python/cudf/cudf/core/column/numerical.py |  7 +++---
 python/cudf/cudf/core/index.py            |  2 ++
 python/cudf/cudf/tests/test_index.py      | 26 +++++++++++++++++++++++
 python/cudf/cudf/tests/test_multiindex.py | 12 +++++++++++
 5 files changed, 44 insertions(+), 4 deletions(-)

diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py
index fe0f39f9d0a..d2534acd2dc 100644
--- a/python/cudf/cudf/core/_base_index.py
+++ b/python/cudf/cudf/core/_base_index.py
@@ -275,6 +275,7 @@ def __getitem__(self, key):
         raise NotImplementedError()
 
     def __contains__(self, item):
+        hash(item)
         return item in self._values
 
     def _copy_type_metadata(
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index 4c211a173b1..f6c7ca7675a 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -107,15 +107,14 @@ def __contains__(self, item: ScalarLike) -> bool:
         # Handles improper item types
         # Fails if item is of type None, so the handler.
         try:
-            if np.can_cast(item, self.dtype):
-                item = self.dtype.type(item)
-            else:
+            search_item = self.dtype.type(item)
+            if search_item != item and self.dtype.kind != "f":
                 return False
         except (TypeError, ValueError):
             return False
         # TODO: Use `scalar`-based `contains` wrapper
         return libcudf.search.contains(
-            self, column.as_column([item], dtype=self.dtype)
+            self, column.as_column([search_item], dtype=self.dtype)
         ).any()
 
     def indices_of(self, value: ScalarLike) -> NumericalColumn:
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index a2ad10a0590..f9dd328aaa8 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -344,6 +344,7 @@ def _data(self):
 
     @_cudf_nvtx_annotate
     def __contains__(self, item):
+        hash(item)
         if isinstance(item, bool) or not isinstance(
             item,
             tuple(
@@ -1523,6 +1524,7 @@ def values(self):
         return self._column.values
 
     def __contains__(self, item):
+        hash(item)
         return item in self._values
 
     def _clean_nulls_from_index(self):
diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py
index 4ff1beb0a9a..baa839ecd72 100644
--- a/python/cudf/cudf/tests/test_index.py
+++ b/python/cudf/cudf/tests/test_index.py
@@ -3223,3 +3223,29 @@ def test_rangeindex_dropna():
     result = ri.dropna()
     expected = ri.copy()
     assert_eq(result, expected)
+
+
+@pytest.mark.parametrize("data", [range(2), [10, 11, 12]])
+def test_index_contains_hashable(data):
+    gidx = cudf.Index(data)
+    pidx = gidx.to_pandas()
+
+    assert_exceptions_equal(
+        lambda: [] in gidx,
+        lambda: [] in pidx,
+        lfunc_args_and_kwargs=((),),
+        rfunc_args_and_kwargs=((),),
+    )
+
+
+@pytest.mark.parametrize("data", [[0, 1, 2], [1.1, 2.3, 4.5]])
+@pytest.mark.parametrize("dtype", ["int32", "float32", "float64"])
+@pytest.mark.parametrize("needle", [0, 1, 2.3])
+def test_index_contains_float_int(data, dtype, needle):
+    gidx = cudf.Index(data=data, dtype=dtype)
+    pidx = gidx.to_pandas()
+
+    actual = needle in gidx
+    expected = needle in pidx
+
+    assert_eq(actual, expected)
diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py
index 76a82afb78e..dd731fab8f3 100644
--- a/python/cudf/cudf/tests/test_multiindex.py
+++ b/python/cudf/cudf/tests/test_multiindex.py
@@ -2153,3 +2153,15 @@ def test_index_to_pandas_arrow_type(scalar):
         levels=[pd.arrays.ArrowExtensionArray(pa_array)], codes=[[0]]
     )
     pd.testing.assert_index_equal(result, expected)
+
+
+def test_multi_index_contains_hashable():
+    gidx = cudf.MultiIndex.from_tuples(zip(["foo", "bar", "baz"], [1, 2, 3]))
+    pidx = gidx.to_pandas()
+
+    assert_exceptions_equal(
+        lambda: [] in gidx,
+        lambda: [] in pidx,
+        lfunc_args_and_kwargs=((),),
+        rfunc_args_and_kwargs=((),),
+    )

From c30495492be40989c9ff1d56087fa91a28ea469a Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 7 May 2024 10:19:46 -0400
Subject: [PATCH 157/842] Check row limit size in cudf::strings::join_strings
 (#15643)

Fixes condition where `cudf::strings::join_strings` could produce column that exceeds the expected row width limit.
The `join_strings` API produces a single row column from the input column (plus optional separators).
Since a strings column total size can now exceed max(size_type) character bytes, it is now possible to produce invalid single-row column since individual rows still cannot exceed max(size_type).

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - Bradley Dice (https://github.com/bdice)
  - Mike Wilson (https://github.com/hyperbolic2346)

URL: https://github.com/rapidsai/cudf/pull/15643
---
 cpp/src/strings/combine/join.cu | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/cpp/src/strings/combine/join.cu b/cpp/src/strings/combine/join.cu
index 4b2996a77e4..2e30e01df21 100644
--- a/cpp/src/strings/combine/join.cu
+++ b/cpp/src/strings/combine/join.cu
@@ -162,16 +162,16 @@ std::unique_ptr<column> join_strings(strings_column_view const& input,
     return std::move(*chars_data);
   }();
 
+  // API returns a single output row which cannot exceed row limit(max of size_type).
+  CUDF_EXPECTS(chars.size() < static_cast<std::size_t>(std::numeric_limits<size_type>::max()),
+               "The output exceeds the row size limit",
+               std::overflow_error);
+
   // build the offsets: single string output has offsets [0,chars-size]
   auto offsets_column = [&] {
-    if (chars.size() < static_cast<std::size_t>(get_offset64_threshold())) {
-      auto offsets32 = cudf::detail::make_device_uvector_async(
-        std::vector<int32_t>({0, static_cast<int32_t>(chars.size())}), stream, mr);
-      return std::make_unique<column>(std::move(offsets32), rmm::device_buffer{}, 0);
-    }
-    auto offsets64 = cudf::detail::make_device_uvector_async(
-      std::vector<int64_t>({0L, static_cast<int64_t>(chars.size())}), stream, mr);
-    return std::make_unique<column>(std::move(offsets64), rmm::device_buffer{}, 0);
+    auto offsets = cudf::detail::make_device_uvector_async(
+      std::vector<size_type>({0, static_cast<size_type>(chars.size())}), stream, mr);
+    return std::make_unique<column>(std::move(offsets), rmm::device_buffer{}, 0);
   }();
 
   // build the null mask: only one output row so it is either all-valid or all-null

From bd966141c9bb8b6dba3cabba3b8c8498203ed2ea Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Tue, 7 May 2024 16:22:33 +0100
Subject: [PATCH 158/842] Upgrade pre commit hooks (#15685)

The only really substantive change is to update mypy, which will be required for some type annotations in the cudf-polars work.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15685
---
 .pre-commit-config.yaml                    | 14 +++++++-------
 python/cudf/cudf/core/column/column.py     |  2 +-
 python/cudf/cudf/pandas/fast_slow_proxy.py |  5 ++---
 3 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 3e99cf3fa9a..0ae745257cb 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -2,7 +2,7 @@
 
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.5.0
+    rev: v4.6.0
     hooks:
       - id: trailing-whitespace
         exclude: |
@@ -24,11 +24,11 @@ repos:
         files: python/.*
         types_or: [python, cython, pyi]
   - repo: https://github.com/MarcoGorelli/cython-lint
-    rev: v0.16.0
+    rev: v0.16.2
     hooks:
       - id: cython-lint
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: 'v1.3.0'
+    rev: 'v1.10.0'
     hooks:
       - id: mypy
         additional_dependencies: [types-cachetools]
@@ -39,7 +39,7 @@ repos:
                "python/dask_cudf/dask_cudf"]
         pass_filenames: false
   - repo: https://github.com/nbQA-dev/nbQA
-    rev: 1.7.1
+    rev: 1.8.5
     hooks:
       - id: nbqa-isort
         # Use the cudf_kafka isort orderings in notebooks so that dask
@@ -52,7 +52,7 @@ repos:
         types_or: [c, c++, cuda]
         args: ["-fallback-style=none", "-style=file", "-i"]
   - repo: https://github.com/sirosen/texthooks
-    rev: 0.6.3
+    rev: 0.6.6
     hooks:
       - id: fix-smartquotes
         exclude: |
@@ -124,12 +124,12 @@ repos:
             ^CHANGELOG.md$
           )
   - repo: https://github.com/rapidsai/dependency-file-generator
-    rev: v1.8.0
+    rev: v1.13.4
     hooks:
       - id: rapids-dependency-file-generator
         args: ["--clean"]
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.3.4
+    rev: v0.4.3
     hooks:
       - id: ruff
         files: python/.*$
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index ba2dab2c2e1..553f4cc7fb3 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -2211,7 +2211,7 @@ def _mask_from_cuda_array_interface_desc(obj, cai_mask) -> Buffer:
         raise NotImplementedError(f"Cannot infer mask from typestr {typestr}")
 
 
-def serialize_columns(columns) -> Tuple[List[dict], List]:
+def serialize_columns(columns: list[ColumnBase]) -> Tuple[List[dict], List]:
     """
     Return the headers and frames resulting
     from serializing a list of Column
diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py
index 9d8c174b297..835cfa89133 100644
--- a/python/cudf/cudf/pandas/fast_slow_proxy.py
+++ b/python/cudf/cudf/pandas/fast_slow_proxy.py
@@ -1093,7 +1093,7 @@ def _replace_closurevars(
     f: types.FunctionType,
     attribute_name: Literal["_fsproxy_slow", "_fsproxy_fast"],
     seen: Set[int],
-) -> types.FunctionType:
+) -> Callable[..., Any]:
     """
     Return a copy of `f` with its closure variables replaced with
     their corresponding slow (or fast) types.
@@ -1133,12 +1133,11 @@ def _replace_closurevars(
         argdefs=f.__defaults__,
         closure=g_closure,
     )
-    g = functools.update_wrapper(
+    return functools.update_wrapper(
         g,
         f,
         assigned=functools.WRAPPER_ASSIGNMENTS + ("__kwdefaults__",),
     )
-    return g
 
 
 _SPECIAL_METHODS: Set[str] = {

From 2e818575bb913c04f94c669c3a7555b5131e0639 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Tue, 7 May 2024 11:16:03 -0500
Subject: [PATCH 159/842] Fix CI s3 api command to fetch latest results
 (#15687)

An `s3` fetch api seems to be returning incorrectly ordered output, this PR fixes the issue.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Ray Douglass (https://github.com/raydouglass)

URL: https://github.com/rapidsai/cudf/pull/15687
---
 ci/cudf_pandas_scripts/pandas-tests/diff.sh | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/ci/cudf_pandas_scripts/pandas-tests/diff.sh b/ci/cudf_pandas_scripts/pandas-tests/diff.sh
index f87a3a36fcc..6cf70a2347f 100755
--- a/ci/cudf_pandas_scripts/pandas-tests/diff.sh
+++ b/ci/cudf_pandas_scripts/pandas-tests/diff.sh
@@ -17,10 +17,8 @@ MAIN_ARTIFACT=$(rapids-s3-path)cuda12_$(arch)_py${PY_VER}.main-${RAPIDS_FULL_VER
 PR_ARTIFACT=$(rapids-s3-path)cuda12_$(arch)_py${PY_VER}.pr-${RAPIDS_FULL_VERSION}-results.json
 
 rapids-logger "Fetching latest available results from nightly"
-aws s3api list-objects-v2 --bucket rapids-downloads --prefix "nightly/" --query "sort_by(Contents[?ends_with(Key, '_py${PY_VER}.main-${RAPIDS_FULL_VERSION}-results.json')], &LastModified)[::-1].[Key]" --output text > s3_output.txt
-
-read -r COMPARE_ENV < s3_output.txt
-export COMPARE_ENV
+aws s3api list-objects-v2 --bucket rapids-downloads --prefix "nightly/" --query "sort_by(Contents[?ends_with(Key, '_py${PY_VER}.main-${RAPIDS_FULL_VERSION}-results.json')], &LastModified)[::].[Key]" --output text  | tee s3_output.txt
+COMPARE_ENV=$(tail -n 1 s3_output.txt)
 rapids-logger "Latest available results from nightly: ${COMPARE_ENV}"
 
 aws s3 cp "s3://rapids-downloads/${COMPARE_ENV}" main-results.json

From 0cfdbc135556a4b51f4521429e19309d7ce586f9 Mon Sep 17 00:00:00 2001
From: Ed Seidl <etseidl@users.noreply.github.com>
Date: Tue, 7 May 2024 11:29:00 -0700
Subject: [PATCH 160/842] Fix decoding of dictionary encoded
 FIXED_LEN_BYTE_ARRAY data in Parquet reader (#15601)

Reading Parquet files with dictionary encoded FIXED_LEN_BYTE_ARRAY data fails because the dictionary page is never parsed, leading to out-of-bounds memory accesses.

Authors:
  - Ed Seidl (https://github.com/etseidl)
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Karthikeyan (https://github.com/karthikeyann)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15601
---
 cpp/src/io/parquet/page_decode.cuh           | 10 +++++++---
 cpp/src/io/parquet/page_hdr.cu               | 21 +++++++++++++++-----
 cpp/src/io/parquet/reader_impl_preprocess.cu | 15 +++++++++++---
 python/cudf/cudf/tests/test_parquet.py       | 19 ++++++++++++++++++
 4 files changed, 54 insertions(+), 11 deletions(-)

diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh
index 4c811449c70..b1f8e6dd5fe 100644
--- a/cpp/src/io/parquet/page_decode.cuh
+++ b/cpp/src/io/parquet/page_decode.cuh
@@ -1298,9 +1298,13 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s,
       // be made to is_supported_encoding() in reader_impl_preprocess.cu
       switch (s->page.encoding) {
         case Encoding::PLAIN_DICTIONARY:
-        case Encoding::RLE_DICTIONARY:
+        case Encoding::RLE_DICTIONARY: {
           // RLE-packed dictionary indices, first byte indicates index length in bits
-          if (s->col.physical_type == BYTE_ARRAY && s->col.str_dict_index != nullptr) {
+          auto const is_decimal =
+            s->col.logical_type.has_value() and s->col.logical_type->type == LogicalType::DECIMAL;
+          if ((s->col.physical_type == BYTE_ARRAY or
+               s->col.physical_type == FIXED_LEN_BYTE_ARRAY) and
+              not is_decimal and s->col.str_dict_index != nullptr) {
             // String dictionary: use index
             s->dict_base = reinterpret_cast<uint8_t const*>(s->col.str_dict_index);
             s->dict_size = s->col.dict_page->num_input_values * sizeof(string_index_pair);
@@ -1314,7 +1318,7 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s,
           if (s->dict_bits > 32 || (!s->dict_base && s->col.dict_page->num_input_values > 0)) {
             s->set_error_code(decode_error::INVALID_DICT_WIDTH);
           }
-          break;
+        } break;
         case Encoding::PLAIN:
         case Encoding::BYTE_STREAM_SPLIT:
           s->dict_size = static_cast<int32_t>(end - cur);
diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu
index 6c6afde29e4..cf0dd85e490 100644
--- a/cpp/src/io/parquet/page_hdr.cu
+++ b/cpp/src/io/parquet/page_hdr.cu
@@ -538,17 +538,28 @@ CUDF_KERNEL void __launch_bounds__(128)
     int pos = 0, cur = 0;
     for (int i = 0; i < num_entries; i++) {
       int len = 0;
-      if (cur + 4 <= dict_size) {
-        len = dict[cur + 0] | (dict[cur + 1] << 8) | (dict[cur + 2] << 16) | (dict[cur + 3] << 24);
-        if (len >= 0 && cur + 4 + len <= dict_size) {
+      if (ck->physical_type == FIXED_LEN_BYTE_ARRAY) {
+        if (cur + ck->type_length <= dict_size) {
+          len = ck->type_length;
           pos = cur;
-          cur = cur + 4 + len;
+          cur += len;
         } else {
           cur = dict_size;
         }
+      } else {
+        if (cur + 4 <= dict_size) {
+          len =
+            dict[cur + 0] | (dict[cur + 1] << 8) | (dict[cur + 2] << 16) | (dict[cur + 3] << 24);
+          if (len >= 0 && cur + 4 + len <= dict_size) {
+            pos = cur + 4;
+            cur = pos + len;
+          } else {
+            cur = dict_size;
+          }
+        }
       }
       // TODO: Could store 8 entries in shared mem, then do a single warp-wide store
-      dict_index[i].first  = reinterpret_cast<char const*>(dict + pos + 4);
+      dict_index[i].first  = reinterpret_cast<char const*>(dict + pos);
       dict_index[i].second = len;
     }
   }
diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu
index 8c9b3c1a1e6..55633b97cf4 100644
--- a/cpp/src/io/parquet/reader_impl_preprocess.cu
+++ b/cpp/src/io/parquet/reader_impl_preprocess.cu
@@ -636,6 +636,15 @@ void decode_page_headers(pass_intermediate_data& pass,
   stream.synchronize();
 }
 
+constexpr bool is_string_chunk(ColumnChunkDesc const& chunk)
+{
+  auto const is_decimal =
+    chunk.logical_type.has_value() and chunk.logical_type->type == LogicalType::DECIMAL;
+  auto const is_binary =
+    chunk.physical_type == BYTE_ARRAY or chunk.physical_type == FIXED_LEN_BYTE_ARRAY;
+  return is_binary and not is_decimal;
+}
+
 struct set_str_dict_index_count {
   device_span<size_t> str_dict_index_count;
   device_span<const ColumnChunkDesc> chunks;
@@ -643,8 +652,8 @@ struct set_str_dict_index_count {
   __device__ void operator()(PageInfo const& page)
   {
     auto const& chunk = chunks[page.chunk_idx];
-    if ((page.flags & PAGEINFO_FLAGS_DICTIONARY) && chunk.physical_type == BYTE_ARRAY &&
-        (chunk.num_dict_pages > 0)) {
+    if ((page.flags & PAGEINFO_FLAGS_DICTIONARY) != 0 and chunk.num_dict_pages > 0 and
+        is_string_chunk(chunk)) {
       // there is only ever one dictionary page per chunk, so this is safe to do in parallel.
       str_dict_index_count[page.chunk_idx] = page.num_input_values;
     }
@@ -659,7 +668,7 @@ struct set_str_dict_index_ptr {
   __device__ void operator()(size_t i)
   {
     auto& chunk = chunks[i];
-    if (chunk.physical_type == BYTE_ARRAY && (chunk.num_dict_pages > 0)) {
+    if (chunk.num_dict_pages > 0 and is_string_chunk(chunk)) {
       chunk.str_dict_index = base + str_dict_index_offsets[i];
     }
   }
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index 6fb1d3d8ba5..f1b90b40991 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -2,6 +2,7 @@
 
 import datetime
 import glob
+import hashlib
 import math
 import os
 import pathlib
@@ -2807,6 +2808,24 @@ def test_parquet_reader_fixed_bin(datadir):
     assert_eq(expect, got)
 
 
+def test_parquet_reader_fixed_len_with_dict(tmpdir):
+    def flba(i):
+        hasher = hashlib.sha256()
+        hasher.update(i.to_bytes(4, "little"))
+        return hasher.digest()
+
+    # use pyarrow to write table of fixed_len_byte_array
+    num_rows = 200
+    data = pa.array([flba(i) for i in range(num_rows)], type=pa.binary(32))
+    padf = pa.Table.from_arrays([data], names=["flba"])
+    padf_fname = tmpdir.join("padf.parquet")
+    pq.write_table(padf, padf_fname, use_dictionary=True)
+
+    expect = pd.read_parquet(padf_fname)
+    got = cudf.read_parquet(padf_fname)
+    assert_eq(expect, got)
+
+
 def test_parquet_reader_rle_boolean(datadir):
     fname = datadir / "rle_boolean_encoding.parquet"
 

From e87a78d422e25474dd23b031ef98eeb8a293d718 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Tue, 7 May 2024 14:09:28 -0500
Subject: [PATCH 161/842] Allow `fillna` to validate for
 `CategoricalColumn.fillna` (#15683)

Fixes: #15666

This PR validates values passed to `fillna` even if there are no null values in a categorical column.

Forks from #14534

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/15683
---
 python/cudf/cudf/core/column/categorical.py |  8 +++++---
 python/cudf/cudf/core/frame.py              | 15 +++++++++++----
 python/cudf/cudf/tests/test_categorical.py  | 16 ++++++++++++++++
 3 files changed, 32 insertions(+), 7 deletions(-)

diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index e3e73035046..dc51cd4f28f 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -1045,9 +1045,6 @@ def fillna(
         """
         Fill null values with *fill_value*
         """
-        if not self.nullable:
-            return self
-
         if fill_value is not None:
             fill_is_scalar = np.isscalar(fill_value)
 
@@ -1079,6 +1076,11 @@ def fillna(
                     self.codes.dtype
                 )
 
+        # Validation of `fill_value` will have to be performed
+        # before returning self.
+        if not self.nullable:
+            return self
+
         return super().fillna(fill_value, method=method)
 
     def indices_of(
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 017190ab5b4..58932db2bda 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -762,10 +762,17 @@ def fillna(
             else:
                 replace_val = None
             should_fill = (
-                col_name in value
-                and col.has_nulls(include_nan=True)
-                and not libcudf.scalar._is_null_host_scalar(replace_val)
-            ) or method is not None
+                (
+                    col_name in value
+                    and col.has_nulls(include_nan=True)
+                    and not libcudf.scalar._is_null_host_scalar(replace_val)
+                )
+                or method is not None
+                or (
+                    isinstance(col, cudf.core.column.CategoricalColumn)
+                    and not libcudf.scalar._is_null_host_scalar(replace_val)
+                )
+            )
             if should_fill:
                 filled_data[col_name] = col.fillna(replace_val, method)
             else:
diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py
index 7aba2e45532..07ce81e3c39 100644
--- a/python/cudf/cudf/tests/test_categorical.py
+++ b/python/cudf/cudf/tests/test_categorical.py
@@ -859,3 +859,19 @@ def test_cat_from_scalar(scalar):
     gs = cudf.Series(scalar, dtype="category")
 
     assert_eq(ps, gs)
+
+
+def test_cat_groupby_fillna():
+    ps = pd.Series(["a", "b", "c"], dtype="category")
+    gs = cudf.from_pandas(ps)
+
+    with pytest.warns(FutureWarning):
+        pg = ps.groupby(ps)
+    gg = gs.groupby(gs)
+
+    assert_exceptions_equal(
+        lfunc=pg.fillna,
+        rfunc=gg.fillna,
+        lfunc_args_and_kwargs=(("d",), {}),
+        rfunc_args_and_kwargs=(("d",), {}),
+    )

From a958274d338fabac4cac63ec938ea273aa58490c Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Tue, 7 May 2024 14:33:43 -0500
Subject: [PATCH 162/842] Some additional kernel thread index refactoring.
 (#14107)

This PR refactors a few kernels to use `thread_index_type` and associated utilities. I started this before realizing how much scope was still left in issue #10368 ("Part 2 - Take another pass over more challenging kernels"), and then I stopped working on this due to time constraints. For the moment, I hope this PR makes a small dent in the number of remaining kernels to convert to using `thread_index_type`.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - MithunR (https://github.com/mythrocks)
  - Mark Harris (https://github.com/harrism)
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/14107
---
 cpp/benchmarks/join/generate_input_tables.cuh | 17 ++++++----
 .../type_dispatcher/type_dispatcher.cu        | 33 +++++++++++--------
 cpp/include/cudf/detail/copy_if_else.cuh      | 23 ++++++-------
 cpp/include/cudf/detail/utilities/cuda.cuh    | 26 +++++++++++++++
 cpp/include/cudf/detail/valid_if.cuh          |  4 +--
 cpp/src/bitmask/null_mask.cu                  |  4 +--
 cpp/src/copying/concatenate.cu                |  4 +--
 cpp/src/join/conditional_join_kernels.cuh     | 10 +++---
 cpp/src/strings/convert/convert_urls.cu       | 18 +++++-----
 9 files changed, 88 insertions(+), 51 deletions(-)

diff --git a/cpp/benchmarks/join/generate_input_tables.cuh b/cpp/benchmarks/join/generate_input_tables.cuh
index 93401f01026..f7984b29d6b 100644
--- a/cpp/benchmarks/join/generate_input_tables.cuh
+++ b/cpp/benchmarks/join/generate_input_tables.cuh
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
@@ -34,7 +35,7 @@
 
 CUDF_KERNEL void init_curand(curandState* state, int const nstates)
 {
-  int ithread = threadIdx.x + blockIdx.x * blockDim.x;
+  int ithread = cudf::detail::grid_1d::global_thread_id();
 
   if (ithread < nstates) { curand_init(1234ULL, ithread, 0, state + ithread); }
 }
@@ -46,13 +47,14 @@ CUDF_KERNEL void init_build_tbl(key_type* const build_tbl,
                                 curandState* state,
                                 int const num_states)
 {
-  auto const start_idx = blockIdx.x * blockDim.x + threadIdx.x;
-  auto const stride    = blockDim.x * gridDim.x;
+  auto const start_idx = cudf::detail::grid_1d::global_thread_id();
+  auto const stride    = cudf::detail::grid_1d::grid_stride();
   assert(start_idx < num_states);
 
   curandState localState = state[start_idx];
 
-  for (size_type idx = start_idx; idx < build_tbl_size; idx += stride) {
+  for (cudf::thread_index_type tidx = start_idx; tidx < build_tbl_size; tidx += stride) {
+    auto const idx = static_cast<size_type>(tidx);
     double const x = curand_uniform_double(&localState);
 
     build_tbl[idx] = static_cast<key_type>(x * (build_tbl_size / multiplicity));
@@ -71,13 +73,14 @@ CUDF_KERNEL void init_probe_tbl(key_type* const probe_tbl,
                                 curandState* state,
                                 int const num_states)
 {
-  auto const start_idx = blockIdx.x * blockDim.x + threadIdx.x;
-  auto const stride    = blockDim.x * gridDim.x;
+  auto const start_idx = cudf::detail::grid_1d::global_thread_id();
+  auto const stride    = cudf::detail::grid_1d::grid_stride();
   assert(start_idx < num_states);
 
   curandState localState = state[start_idx];
 
-  for (size_type idx = start_idx; idx < probe_tbl_size; idx += stride) {
+  for (cudf::thread_index_type tidx = start_idx; tidx < probe_tbl_size; tidx += stride) {
+    auto const idx = static_cast<size_type>(tidx);
     key_type val;
     double x = curand_uniform_double(&localState);
 
diff --git a/cpp/benchmarks/type_dispatcher/type_dispatcher.cu b/cpp/benchmarks/type_dispatcher/type_dispatcher.cu
index 161328ae088..3aff75d840e 100644
--- a/cpp/benchmarks/type_dispatcher/type_dispatcher.cu
+++ b/cpp/benchmarks/type_dispatcher/type_dispatcher.cu
@@ -60,13 +60,15 @@ constexpr int block_size = 256;
 template <FunctorType functor_type, class T>
 CUDF_KERNEL void no_dispatching_kernel(T** A, cudf::size_type n_rows, cudf::size_type n_cols)
 {
-  using F               = Functor<T, functor_type>;
-  cudf::size_type index = blockIdx.x * blockDim.x + threadIdx.x;
-  while (index < n_rows) {
+  using F           = Functor<T, functor_type>;
+  auto tidx         = cudf::detail::grid_1d::global_thread_id();
+  auto const stride = cudf::detail::grid_1d::grid_stride();
+  while (tidx < n_rows) {
+    auto const index = static_cast<cudf::size_type>(tidx);
     for (int c = 0; c < n_cols; c++) {
       A[c][index] = F::f(A[c][index]);
     }
-    index += blockDim.x * gridDim.x;
+    tidx += stride;
   }
 }
 
@@ -74,12 +76,14 @@ CUDF_KERNEL void no_dispatching_kernel(T** A, cudf::size_type n_rows, cudf::size
 template <FunctorType functor_type, class T>
 CUDF_KERNEL void host_dispatching_kernel(cudf::mutable_column_device_view source_column)
 {
-  using F               = Functor<T, functor_type>;
-  T* A                  = source_column.data<T>();
-  cudf::size_type index = blockIdx.x * blockDim.x + threadIdx.x;
-  while (index < source_column.size()) {
-    A[index] = F::f(A[index]);
-    index += blockDim.x * gridDim.x;
+  using F           = Functor<T, functor_type>;
+  T* A              = source_column.data<T>();
+  auto tidx         = cudf::detail::grid_1d::global_thread_id();
+  auto const stride = cudf::detail::grid_1d::grid_stride();
+  while (tidx < source_column.size()) {
+    auto const index = static_cast<cudf::size_type>(tidx);
+    A[index]         = F::f(A[index]);
+    tidx += stride;
   }
 }
 
@@ -127,14 +131,15 @@ template <FunctorType functor_type>
 CUDF_KERNEL void device_dispatching_kernel(cudf::mutable_table_device_view source)
 {
   cudf::size_type const n_rows = source.num_rows();
-  cudf::size_type index        = threadIdx.x + blockIdx.x * blockDim.x;
-
-  while (index < n_rows) {
+  auto tidx                    = cudf::detail::grid_1d::global_thread_id();
+  auto const stride            = cudf::detail::grid_1d::grid_stride();
+  while (tidx < n_rows) {
+    auto const index = static_cast<cudf::size_type>(tidx);
     for (cudf::size_type i = 0; i < source.num_columns(); i++) {
       cudf::type_dispatcher(
         source.column(i).type(), RowHandle<functor_type>{}, source.column(i), index);
     }
-    index += blockDim.x * gridDim.x;
+    tidx += stride;
   }  // while
 }
 
diff --git a/cpp/include/cudf/detail/copy_if_else.cuh b/cpp/include/cudf/detail/copy_if_else.cuh
index ac5cb0ad141..8418e279ce7 100644
--- a/cpp/include/cudf/detail/copy_if_else.cuh
+++ b/cpp/include/cudf/detail/copy_if_else.cuh
@@ -45,29 +45,30 @@ __launch_bounds__(block_size) CUDF_KERNEL
                            mutable_column_device_view out,
                            size_type* __restrict__ const valid_count)
 {
-  size_type const tid            = threadIdx.x + blockIdx.x * block_size;
-  int const warp_id              = tid / warp_size;
-  size_type const warps_per_grid = gridDim.x * block_size / warp_size;
+  auto tidx                      = cudf::detail::grid_1d::global_thread_id<block_size>();
+  auto const stride              = cudf::detail::grid_1d::grid_stride<block_size>();
+  int const warp_id              = tidx / cudf::detail::warp_size;
+  size_type const warps_per_grid = gridDim.x * block_size / cudf::detail::warp_size;
 
   // begin/end indices for the column data
-  size_type begin = 0;
-  size_type end   = out.size();
+  size_type const begin = 0;
+  size_type const end   = out.size();
   // warp indices.  since 1 warp == 32 threads == sizeof(bitmask_type) * 8,
   // each warp will process one (32 bit) of the validity mask via
   // __ballot_sync()
-  size_type warp_begin = cudf::word_index(begin);
-  size_type warp_end   = cudf::word_index(end - 1);
+  size_type const warp_begin = cudf::word_index(begin);
+  size_type const warp_end   = cudf::word_index(end - 1);
 
   // lane id within the current warp
   constexpr size_type leader_lane{0};
-  int const lane_id = threadIdx.x % warp_size;
+  int const lane_id = threadIdx.x % cudf::detail::warp_size;
 
   size_type warp_valid_count{0};
 
   // current warp.
   size_type warp_cur = warp_begin + warp_id;
-  size_type index    = tid;
   while (warp_cur <= warp_end) {
+    auto const index = static_cast<size_type>(tidx);
     auto const opt_value =
       (index < end) ? (filter(index) ? lhs[index] : rhs[index]) : thrust::nullopt;
     if (opt_value) { out.element<T>(index) = static_cast<T>(*opt_value); }
@@ -85,7 +86,7 @@ __launch_bounds__(block_size) CUDF_KERNEL
 
     // next grid
     warp_cur += warps_per_grid;
-    index += block_size * gridDim.x;
+    tidx += stride;
   }
 
   if (has_nulls) {
@@ -159,7 +160,7 @@ std::unique_ptr<column> copy_if_else(bool nullable,
   using Element = typename thrust::iterator_traits<LeftIter>::value_type::value_type;
 
   size_type size           = std::distance(lhs_begin, lhs_end);
-  size_type num_els        = cudf::util::round_up_safe(size, warp_size);
+  size_type num_els        = cudf::util::round_up_safe(size, cudf::detail::warp_size);
   constexpr int block_size = 256;
   cudf::detail::grid_1d grid{num_els, block_size, 1};
 
diff --git a/cpp/include/cudf/detail/utilities/cuda.cuh b/cpp/include/cudf/detail/utilities/cuda.cuh
index 86c85ca8d06..f1775c6d6d7 100644
--- a/cpp/include/cudf/detail/utilities/cuda.cuh
+++ b/cpp/include/cudf/detail/utilities/cuda.cuh
@@ -93,6 +93,19 @@ class grid_1d {
     return global_thread_id(threadIdx.x, blockIdx.x, blockDim.x);
   }
 
+  /**
+   * @brief Returns the global thread index of the current thread in a 1D grid.
+   *
+   * @tparam num_threads_per_block The number of threads per block
+   *
+   * @return thread_index_type The global thread index
+   */
+  template <thread_index_type num_threads_per_block>
+  static __device__ thread_index_type global_thread_id()
+  {
+    return global_thread_id(threadIdx.x, blockIdx.x, num_threads_per_block);
+  }
+
   /**
    * @brief Returns the stride of a 1D grid.
    *
@@ -115,6 +128,19 @@ class grid_1d {
    * @return thread_index_type The number of threads in the grid.
    */
   static __device__ thread_index_type grid_stride() { return grid_stride(blockDim.x, gridDim.x); }
+
+  /**
+   * @brief Returns the stride of the current 1D grid.
+   *
+   * @tparam num_threads_per_block The number of threads per block
+   *
+   * @return thread_index_type The number of threads in the grid.
+   */
+  template <thread_index_type num_threads_per_block>
+  static __device__ thread_index_type grid_stride()
+  {
+    return grid_stride(num_threads_per_block, gridDim.x);
+  }
 };
 
 /**
diff --git a/cpp/include/cudf/detail/valid_if.cuh b/cpp/include/cudf/detail/valid_if.cuh
index 66163d6059a..64a3c4edf78 100644
--- a/cpp/include/cudf/detail/valid_if.cuh
+++ b/cpp/include/cudf/detail/valid_if.cuh
@@ -50,8 +50,8 @@ CUDF_KERNEL void valid_if_kernel(
 {
   constexpr size_type leader_lane{0};
   auto const lane_id{threadIdx.x % warp_size};
-  auto i            = cudf::detail::grid_1d::global_thread_id();
-  auto const stride = cudf::detail::grid_1d::grid_stride();
+  auto i            = cudf::detail::grid_1d::global_thread_id<block_size>();
+  auto const stride = cudf::detail::grid_1d::grid_stride<block_size>();
   size_type warp_valid_count{0};
 
   auto active_mask = __ballot_sync(0xFFFF'FFFFu, i < size);
diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu
index 4da2e502ce6..d0faeea8336 100644
--- a/cpp/src/bitmask/null_mask.cu
+++ b/cpp/src/bitmask/null_mask.cu
@@ -269,8 +269,8 @@ CUDF_KERNEL void count_set_bits_kernel(bitmask_type const* bitmask,
 
   auto const first_word_index{word_index(first_bit_index)};
   auto const last_word_index{word_index(last_bit_index)};
-  thread_index_type const tid         = grid_1d::global_thread_id();
-  thread_index_type const stride      = grid_1d::grid_stride();
+  thread_index_type const tid         = grid_1d::global_thread_id<block_size>();
+  thread_index_type const stride      = grid_1d::grid_stride<block_size>();
   thread_index_type thread_word_index = tid + first_word_index;
   size_type thread_count{0};
 
diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu
index b1136a9eeb3..47e74a5cb48 100644
--- a/cpp/src/copying/concatenate.cu
+++ b/cpp/src/copying/concatenate.cu
@@ -121,8 +121,8 @@ CUDF_KERNEL void concatenate_masks_kernel(column_device_view const* views,
                                           size_type number_of_mask_bits,
                                           size_type* out_valid_count)
 {
-  auto tidx         = cudf::detail::grid_1d::global_thread_id();
-  auto const stride = cudf::detail::grid_1d::grid_stride();
+  auto tidx         = cudf::detail::grid_1d::global_thread_id<block_size>();
+  auto const stride = cudf::detail::grid_1d::grid_stride<block_size>();
   auto active_mask  = __ballot_sync(0xFFFF'FFFFu, tidx < number_of_mask_bits);
 
   size_type warp_valid_count = 0;
diff --git a/cpp/src/join/conditional_join_kernels.cuh b/cpp/src/join/conditional_join_kernels.cuh
index 5e190eb2b27..1e16c451f5a 100644
--- a/cpp/src/join/conditional_join_kernels.cuh
+++ b/cpp/src/join/conditional_join_kernels.cuh
@@ -67,8 +67,8 @@ CUDF_KERNEL void compute_conditional_join_output_size(
     &intermediate_storage[threadIdx.x * device_expression_data.num_intermediates];
 
   std::size_t thread_counter{0};
-  auto const start_idx = cudf::detail::grid_1d::global_thread_id();
-  auto const stride    = cudf::detail::grid_1d::grid_stride();
+  auto const start_idx = cudf::detail::grid_1d::global_thread_id<block_size>();
+  auto const stride    = cudf::detail::grid_1d::grid_stride<block_size>();
 
   cudf::thread_index_type const left_num_rows  = left_table.num_rows();
   cudf::thread_index_type const right_num_rows = right_table.num_rows();
@@ -174,7 +174,7 @@ CUDF_KERNEL void conditional_join(table_device_view left_table,
 
   __syncwarp();
 
-  auto outer_row_index = cudf::detail::grid_1d::global_thread_id();
+  auto outer_row_index = cudf::detail::grid_1d::global_thread_id<block_size>();
 
   unsigned int const activemask = __ballot_sync(0xffff'ffffu, outer_row_index < outer_num_rows);
 
@@ -295,8 +295,8 @@ CUDF_KERNEL void conditional_join_anti_semi(
   int const lane_id                            = threadIdx.x % detail::warp_size;
   cudf::thread_index_type const outer_num_rows = left_table.num_rows();
   cudf::thread_index_type const inner_num_rows = right_table.num_rows();
-  auto const stride                            = cudf::detail::grid_1d::grid_stride();
-  auto const start_idx                         = cudf::detail::grid_1d::global_thread_id();
+  auto const stride                            = cudf::detail::grid_1d::grid_stride<block_size>();
+  auto const start_idx = cudf::detail::grid_1d::global_thread_id<block_size>();
 
   if (0 == lane_id) { current_idx_shared[warp_id] = 0; }
 
diff --git a/cpp/src/strings/convert/convert_urls.cu b/cpp/src/strings/convert/convert_urls.cu
index 459c3e88a4e..d9920be045f 100644
--- a/cpp/src/strings/convert/convert_urls.cu
+++ b/cpp/src/strings/convert/convert_urls.cu
@@ -202,10 +202,11 @@ CUDF_KERNEL void url_decode_char_counter(column_device_view const in_strings,
   __shared__ char temporary_buffer[num_warps_per_threadblock][char_block_size + halo_size];
   __shared__ typename cub::WarpReduce<int8_t>::TempStorage cub_storage[num_warps_per_threadblock];
 
-  auto const global_thread_id = cudf::detail::grid_1d::global_thread_id();
-  auto const global_warp_id   = static_cast<size_type>(global_thread_id / cudf::detail::warp_size);
-  auto const local_warp_id    = static_cast<size_type>(threadIdx.x / cudf::detail::warp_size);
-  auto const warp_lane        = static_cast<size_type>(threadIdx.x % cudf::detail::warp_size);
+  auto const global_thread_id =
+    cudf::detail::grid_1d::global_thread_id<num_warps_per_threadblock * cudf::detail::warp_size>();
+  auto const global_warp_id = static_cast<size_type>(global_thread_id / cudf::detail::warp_size);
+  auto const local_warp_id  = static_cast<size_type>(threadIdx.x / cudf::detail::warp_size);
+  auto const warp_lane      = static_cast<size_type>(threadIdx.x % cudf::detail::warp_size);
   auto const nwarps     = static_cast<size_type>(gridDim.x * blockDim.x / cudf::detail::warp_size);
   char* in_chars_shared = temporary_buffer[local_warp_id];
 
@@ -287,10 +288,11 @@ CUDF_KERNEL void url_decode_char_replacer(column_device_view const in_strings,
   __shared__ typename cub::WarpScan<int8_t>::TempStorage cub_storage[num_warps_per_threadblock];
   __shared__ size_type out_idx[num_warps_per_threadblock];
 
-  auto const global_thread_id = cudf::detail::grid_1d::global_thread_id();
-  auto const global_warp_id   = static_cast<size_type>(global_thread_id / cudf::detail::warp_size);
-  auto const local_warp_id    = static_cast<size_type>(threadIdx.x / cudf::detail::warp_size);
-  auto const warp_lane        = static_cast<size_type>(threadIdx.x % cudf::detail::warp_size);
+  auto const global_thread_id =
+    cudf::detail::grid_1d::global_thread_id<num_warps_per_threadblock * cudf::detail::warp_size>();
+  auto const global_warp_id = static_cast<size_type>(global_thread_id / cudf::detail::warp_size);
+  auto const local_warp_id  = static_cast<size_type>(threadIdx.x / cudf::detail::warp_size);
+  auto const warp_lane      = static_cast<size_type>(threadIdx.x % cudf::detail::warp_size);
   auto const nwarps     = static_cast<size_type>(gridDim.x * blockDim.x / cudf::detail::warp_size);
   char* in_chars_shared = temporary_buffer[local_warp_id];
 

From 4c6593bc247006493b1b29918225620e0d4ecb65 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Tue, 7 May 2024 14:45:07 -0500
Subject: [PATCH 163/842] Add `NumpyExtensionArray` proxy type in `cudf.pandas`
 (#15686)

Fixes: #15678

This PR adds a proxy type for `NumpyExtensionArray`

Forks out from #14534

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/15686
---
 python/cudf/cudf/pandas/_wrappers/pandas.py       | 12 ++++++++++++
 python/cudf/cudf_pandas_tests/test_cudf_pandas.py |  8 ++++++++
 2 files changed, 20 insertions(+)

diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py
index 3c82d571939..a4a0c24deda 100644
--- a/python/cudf/cudf/pandas/_wrappers/pandas.py
+++ b/python/cudf/cudf/pandas/_wrappers/pandas.py
@@ -310,6 +310,18 @@ def Index__new__(cls, *args, **kwargs):
     additional_attributes={"__init__": _DELETE},
 )
 
+NumpyExtensionArray = make_final_proxy_type(
+    "NumpyExtensionArray",
+    _Unusable,
+    pd.arrays.NumpyExtensionArray,
+    fast_to_slow=_Unusable(),
+    slow_to_fast=_Unusable(),
+    additional_attributes={
+        "_ndarray": _FastSlowAttribute("_ndarray"),
+        "_dtype": _FastSlowAttribute("_dtype"),
+    },
+)
+
 TimedeltaArray = make_final_proxy_type(
     "TimedeltaArray",
     _Unusable,
diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
index 90356a01404..dff735cfd05 100644
--- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
+++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
@@ -1205,6 +1205,14 @@ def test_pickle_groupby(dataframe):
     tm.assert_equal(pgb.sum(), gb.sum())
 
 
+def test_numpy_extension_array():
+    np_array = np.array([0, 1, 2, 3])
+    xarray = xpd.arrays.NumpyExtensionArray(np_array)
+    array = pd.arrays.NumpyExtensionArray(np_array)
+
+    tm.assert_equal(xarray, array)
+
+
 def test_isinstance_base_offset():
     offset = xpd.tseries.frequencies.to_offset("1s")
     assert isinstance(offset, xpd.tseries.offsets.BaseOffset)

From 8d9c06a764900124446ca754d0d1555c3cb09904 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Tue, 7 May 2024 15:21:06 -0500
Subject: [PATCH 164/842] Add `Timestamp` and `Timedelta` proxy types (#15680)

Fixes: #15673

This PR adds `Timestamp` and `Timedelta` proxy types.


Forks out from: #14534

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/15680
---
 python/cudf/cudf/pandas/_wrappers/pandas.py   | 43 +++++++++++++++++++
 .../cudf_pandas_tests/test_cudf_pandas.py     | 14 ++++++
 .../cudf/cudf_pandas_tests/test_profiler.py   |  4 +-
 3 files changed, 60 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py
index a4a0c24deda..93bef66de4f 100644
--- a/python/cudf/cudf/pandas/_wrappers/pandas.py
+++ b/python/cudf/cudf/pandas/_wrappers/pandas.py
@@ -103,6 +103,49 @@ def __get__(self, obj, cls=None):
             raise AttributeError()
 
 
+def Timestamp_Timedelta__new__(cls, *args, **kwargs):
+    # Call fast/slow constructor
+    # This takes care of running __init__ as well, but must be paired
+    # with a removal of the defaulted __init__ that
+    # make_final_proxy_type provides.
+    # Timestamp & Timedelta don't always return same types as self,
+    # hence this method is needed.
+    self, _ = _fast_slow_function_call(
+        lambda cls, args, kwargs: cls(*args, **kwargs),
+        cls,
+        args,
+        kwargs,
+    )
+    return self
+
+
+Timedelta = make_final_proxy_type(
+    "Timedelta",
+    _Unusable,
+    pd.Timedelta,
+    fast_to_slow=_Unusable(),
+    slow_to_fast=_Unusable(),
+    additional_attributes={
+        "__hash__": _FastSlowAttribute("__hash__"),
+        "__new__": Timestamp_Timedelta__new__,
+        "__init__": _DELETE,
+    },
+)
+
+
+Timestamp = make_final_proxy_type(
+    "Timestamp",
+    _Unusable,
+    pd.Timestamp,
+    fast_to_slow=_Unusable(),
+    slow_to_fast=_Unusable(),
+    additional_attributes={
+        "__hash__": _FastSlowAttribute("__hash__"),
+        "__new__": Timestamp_Timedelta__new__,
+        "__init__": _DELETE,
+    },
+)
+
 DatetimeProperties = make_intermediate_proxy_type(
     "DatetimeProperties",
     cudf.core.series.DatetimeProperties,
diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
index dff735cfd05..8d319cfe640 100644
--- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
+++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
@@ -1228,3 +1228,17 @@ def my_apply(df, unused):
     result = df.apply(my_apply, axis=1, unused=True)
     expected = xpd.Series([1])
     tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("data", [pd.NaT, 1234, "nat"])
+def test_timestamp(data):
+    xtimestamp = xpd.Timestamp(data)
+    timestamp = pd.Timestamp(data)
+    tm.assert_equal(xtimestamp, timestamp)
+
+
+@pytest.mark.parametrize("data", [pd.NaT, 1234, "nat"])
+def test_timedelta(data):
+    xtimedelta = xpd.Timedelta(data)
+    timedelta = pd.Timedelta(data)
+    tm.assert_equal(xtimedelta, timedelta)
diff --git a/python/cudf/cudf_pandas_tests/test_profiler.py b/python/cudf/cudf_pandas_tests/test_profiler.py
index 4921446ab6b..dd8d9287972 100644
--- a/python/cudf/cudf_pandas_tests/test_profiler.py
+++ b/python/cudf/cudf_pandas_tests/test_profiler.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES.
 # All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
@@ -30,11 +30,13 @@ def test_profiler():
 
     per_function_stats = profiler.per_function_stats
     assert set(per_function_stats) == {
+        "Timestamp",
         "DataFrame",
         "DataFrame.groupby",
         "DataFrameGroupBy.sum",
         "DataFrame.sum",
         "Series.__getitem__",
+        "Timedelta",
     }
     for name, func in per_function_stats.items():
         assert (

From 5d244dfc13f4db0b1e41ded3029942fec50c98f6 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Tue, 7 May 2024 15:52:18 -0500
Subject: [PATCH 165/842] Preserve sub-second data for time scalars in column
 construction (#15655)

Fixes: #15654

This PR makes fixes such that sub-second timestamp data is not being dropped in column construction.

Forks out of https://github.com/rapidsai/cudf/pull/14534/

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/15655
---
 python/cudf/cudf/_lib/scalar.pyx       |  3 ++-
 python/cudf/cudf/core/column/column.py | 13 ++++++++++++
 python/cudf/cudf/core/dataframe.py     |  2 +-
 python/cudf/cudf/core/scalar.py        |  3 +++
 python/cudf/cudf/tests/test_series.py  | 28 ++++++++++++++++++++++++++
 5 files changed, 47 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx
index 7ddf4ff4883..aee496e9f1c 100644
--- a/python/cudf/cudf/_lib/scalar.pyx
+++ b/python/cudf/cudf/_lib/scalar.pyx
@@ -354,7 +354,8 @@ def as_device_scalar(val, dtype=None):
 def _is_null_host_scalar(slr):
     if cudf.utils.utils.is_na_like(slr):
         return True
-    elif isinstance(slr, (np.datetime64, np.timedelta64)) and np.isnat(slr):
+    elif (isinstance(slr, (np.datetime64, np.timedelta64)) and np.isnat(slr)) or \
+            slr is pd.NaT:
         return True
     else:
         return False
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 553f4cc7fb3..e23da59b883 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -2163,6 +2163,19 @@ def as_column(
                     nan_as_null=nan_as_null,
                     length=length,
                 )
+            elif (
+                isinstance(element, (pd.Timestamp, pd.Timedelta))
+                or element is pd.NaT
+            ):
+                # TODO: Remove this after
+                # https://github.com/apache/arrow/issues/26492
+                # is fixed.
+                return as_column(
+                    pd.Series(arbitrary),
+                    dtype=dtype,
+                    nan_as_null=nan_as_null,
+                    length=length,
+                )
             elif not any(element is na for na in (None, pd.NA, np.nan)):
                 # Might have NA + element like above, but short-circuit if
                 # an element pyarrow/pandas might be able to parse
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index bf8201e4dc1..6fa957684e4 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -1215,7 +1215,7 @@ def dtypes(self):
         >>> df.dtypes
         float              float64
         int                  int64
-        datetime    datetime64[us]
+        datetime    datetime64[ns]
         string              object
         dtype: object
         """
diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py
index f7d05e53ce7..29460d8c67e 100644
--- a/python/cudf/cudf/core/scalar.py
+++ b/python/cudf/cudf/core/scalar.py
@@ -223,6 +223,9 @@ def _preprocess_host_value(self, value, dtype):
 
         if dtype is None:
             if not valid:
+                if value is NaT:
+                    value = value.to_numpy()
+
                 if isinstance(value, (np.datetime64, np.timedelta64)):
                     unit, _ = np.datetime_data(value)
                     if unit == "generic":
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index 642dbde3790..6a9de197374 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -2786,3 +2786,31 @@ def test_squeeze(axis, data):
 def test_squeeze_invalid_axis(axis):
     with pytest.raises(ValueError):
         cudf.Series([1]).squeeze(axis=axis)
+
+
+@pytest.mark.parametrize("data", [None, 123, 33243243232423, 0])
+def test_timestamp_series_init(data):
+    scalar = pd.Timestamp(data)
+    expected = pd.Series([scalar])
+    actual = cudf.Series([scalar])
+
+    assert_eq(expected, actual)
+
+    expected = pd.Series(scalar)
+    actual = cudf.Series(scalar)
+
+    assert_eq(expected, actual)
+
+
+@pytest.mark.parametrize("data", [None, 123, 33243243232423, 0])
+def test_timedelta_series_init(data):
+    scalar = pd.Timedelta(data)
+    expected = pd.Series([scalar])
+    actual = cudf.Series([scalar])
+
+    assert_eq(expected, actual)
+
+    expected = pd.Series(scalar)
+    actual = cudf.Series(scalar)
+
+    assert_eq(expected, actual)

From 5154661ae48074f9e781f95f74bce560b30ab00a Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 7 May 2024 13:07:38 -1000
Subject: [PATCH 166/842] Avoid accessing attributes via `_column` if not
 needed (#15624)

xref https://github.com/rapidsai/cudf/pull/15494

If the attributes are exposed on the top level object e.g. `Index.dtype` it should be sufficient to just access the attributes there instead of reaching for the underlying object

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15624
---
 python/cudf/cudf/core/algorithms.py      |   4 +-
 python/cudf/cudf/core/dataframe.py       |   4 +-
 python/cudf/cudf/core/index.py           |   2 +-
 python/cudf/cudf/core/indexed_frame.py   |   2 +-
 python/cudf/cudf/core/series.py          | 111 +++++++++--------------
 python/cudf/cudf/core/tools/datetimes.py |   3 +-
 6 files changed, 48 insertions(+), 78 deletions(-)

diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py
index 33cec21caa5..272abdece9e 100644
--- a/python/cudf/cudf/core/algorithms.py
+++ b/python/cudf/cudf/core/algorithms.py
@@ -142,10 +142,10 @@ def _index_or_values_interpolation(column, index=None):
         BooleanMask(~mask, len(to_interp))
     )
 
-    known_x = known_x_and_y._index._column.values
+    known_x = known_x_and_y.index.to_cupy()
     known_y = known_x_and_y._data.columns[0].values
 
-    result = cp.interp(to_interp._index.values, known_x, known_y)
+    result = cp.interp(index.to_cupy(), known_x, known_y)
 
     # find the first nan
     first_nan_idx = (mask == 0).argmax().item()
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 6fa957684e4..6928425a867 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -1768,7 +1768,7 @@ def _concat(
                 indices[:first_data_column_position],
             )
             if not isinstance(out._index, MultiIndex) and isinstance(
-                out._index._values.dtype, cudf.CategoricalDtype
+                out._index.dtype, cudf.CategoricalDtype
             ):
                 out = out.set_index(
                     cudf.core.index.as_index(out.index._values)
@@ -3582,7 +3582,7 @@ def rename(
         if index:
             if (
                 any(isinstance(item, str) for item in index.values())
-                and type(self.index._values) != cudf.core.column.StringColumn
+                and self.index.dtype != "object"
             ):
                 raise NotImplementedError(
                     "Implicit conversion of index to "
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index f9dd328aaa8..52322b0160f 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -2882,7 +2882,7 @@ def __init__(
 
     @property
     def closed(self):
-        return self._values.dtype.closed
+        return self.dtype.closed
 
     @classmethod
     @_cudf_nvtx_annotate
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 62ee780ebbb..e656fd49758 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -174,7 +174,7 @@ def _indices_from_labels(obj, labels):
 
         if isinstance(obj.index.dtype, cudf.CategoricalDtype):
             labels = labels.astype("category")
-            codes = labels.codes.astype(obj.index._values.codes.dtype)
+            codes = labels.codes.astype(obj.index.codes.dtype)
             labels = cudf.core.column.build_categorical_column(
                 categories=labels.dtype.categories,
                 codes=codes,
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index c3d232aaa7c..63a49a898f4 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -39,11 +39,9 @@
     _is_scalar_or_zero_d_array,
     is_bool_dtype,
     is_dict_like,
-    is_float_dtype,
     is_integer,
     is_integer_dtype,
     is_scalar,
-    is_string_dtype,
 )
 from cudf.core import indexing_utils
 from cudf.core._compat import PANDAS_LT_300
@@ -205,19 +203,10 @@ def __setitem__(self, key, value):
         if is_scalar(value):
             value = to_cudf_compatible_scalar(value)
             if (
-                not isinstance(
-                    self._frame._column,
-                    (
-                        cudf.core.column.DatetimeColumn,
-                        cudf.core.column.TimeDeltaColumn,
-                    ),
-                )
+                self._frame.dtype.kind not in "mM"
                 and cudf.utils.utils._isnat(value)
                 and not (
-                    isinstance(
-                        self._frame._column, cudf.core.column.StringColumn
-                    )
-                    and isinstance(value, str)
+                    self._frame.dtype == "object" and isinstance(value, str)
                 )
             ):
                 raise MixedTypeError(
@@ -226,14 +215,10 @@ def __setitem__(self, key, value):
                 )
             elif (
                 not (
-                    is_float_dtype(self._frame._column.dtype)
+                    self._frame.dtype.kind == "f"
                     or (
-                        isinstance(
-                            self._frame._column.dtype, cudf.CategoricalDtype
-                        )
-                        and is_float_dtype(
-                            self._frame._column.dtype.categories.dtype
-                        )
+                        isinstance(self._frame.dtype, cudf.CategoricalDtype)
+                        and self._frame.dtype.categories.dtype.kind == "f"
                     )
                 )
                 and isinstance(value, (np.float32, np.float64))
@@ -241,40 +226,37 @@ def __setitem__(self, key, value):
             ):
                 raise MixedTypeError(
                     f"Cannot assign {value=} to "
-                    f"non-float dtype={self._frame._column.dtype}"
+                    f"non-float dtype={self._frame.dtype}"
                 )
             elif (
-                is_bool_dtype(self._frame._column.dtype)
+                self._frame.dtype.kind == "b"
                 and not is_bool_dtype(value)
                 and value not in {None, cudf.NA}
             ):
                 raise MixedTypeError(
                     f"Cannot assign {value=} to "
-                    f"bool dtype={self._frame._column.dtype}"
+                    f"bool dtype={self._frame.dtype}"
                 )
         elif not (
             isinstance(value, (list, dict))
             and isinstance(
-                self._frame._column.dtype, (cudf.ListDtype, cudf.StructDtype)
+                self._frame.dtype, (cudf.ListDtype, cudf.StructDtype)
             )
         ):
             value = as_column(value)
 
         if (
-            (
-                _is_non_decimal_numeric_dtype(self._frame._column.dtype)
-                or is_string_dtype(self._frame._column.dtype)
-            )
+            (self._frame.dtype.kind in "uifb" or self._frame.dtype == "object")
             and hasattr(value, "dtype")
-            and _is_non_decimal_numeric_dtype(value.dtype)
+            and value.dtype.kind in "uifb"
         ):
             # normalize types if necessary:
             # In contrast to Column.__setitem__ (which downcasts the value to
             # the dtype of the column) here we upcast the series to the
             # larger data type mimicking pandas
-            to_dtype = np.result_type(value.dtype, self._frame._column.dtype)
+            to_dtype = np.result_type(value.dtype, self._frame.dtype)
             value = value.astype(to_dtype)
-            if to_dtype != self._frame._column.dtype:
+            if to_dtype != self._frame.dtype:
                 # Do not remove until pandas-3.0 support is added.
                 assert (
                     PANDAS_LT_300
@@ -283,7 +265,7 @@ def __setitem__(self, key, value):
                     f"Setting an item of incompatible dtype is deprecated "
                     "and will raise in a future error of pandas. "
                     f"Value '{value}' has dtype incompatible with "
-                    f"{self._frame._column.dtype}, "
+                    f"{self._frame.dtype}, "
                     "please explicitly cast to a compatible dtype first.",
                     FutureWarning,
                 )
@@ -336,27 +318,27 @@ def __setitem__(self, key, value):
                 and not isinstance(self._frame.index, cudf.MultiIndex)
                 and is_scalar(value)
             ):
-                # TODO: Modifying index in place is bad because
-                # our index are immutable, but columns are not (which
-                # means our index are mutable with internal APIs).
-                # Get rid of the deep copy once columns too are
-                # immutable.
-                idx_copy = self._frame._index.copy(deep=True)
-                if (
-                    isinstance(idx_copy, cudf.RangeIndex)
-                    and isinstance(key, int)
-                    and (key == idx_copy[-1] + idx_copy.step)
-                ):
-                    idx_copy = cudf.RangeIndex(
-                        start=idx_copy.start,
-                        stop=idx_copy.stop + idx_copy.step,
-                        step=idx_copy.step,
-                        name=idx_copy.name,
-                    )
+                idx = self._frame._index
+                if isinstance(idx, cudf.RangeIndex):
+                    if isinstance(key, int) and (key == idx[-1] + idx.step):
+                        idx_copy = cudf.RangeIndex(
+                            start=idx.start,
+                            stop=idx.stop + idx.step,
+                            step=idx.step,
+                            name=idx.name,
+                        )
+                    else:
+                        idx_copy = idx._as_int_index()
+                        _append_new_row_inplace(idx_copy._column, key)
                 else:
-                    if isinstance(idx_copy, cudf.RangeIndex):
-                        idx_copy = idx_copy._as_int_index()
-                    _append_new_row_inplace(idx_copy._values, key)
+                    # TODO: Modifying index in place is bad because
+                    # our index are immutable, but columns are not (which
+                    # means our index are mutable with internal APIs).
+                    # Get rid of the deep copy once columns too are
+                    # immutable.
+                    idx_copy = idx.copy(deep=True)
+                    _append_new_row_inplace(idx_copy._column, key)
+
                 self._frame._index = idx_copy
                 _append_new_row_inplace(self._frame._column, value)
                 return
@@ -1407,34 +1389,23 @@ def __repr__(self):
                     cudf.core.dtypes.DecimalDtype,
                 ),
             )
-        ) or isinstance(
-            preprocess._column,
-            cudf.core.column.timedelta.TimeDeltaColumn,
-        ):
+        ) or preprocess.dtype.kind == "m":
             fill_value = (
                 str(cudf.NaT)
-                if isinstance(
-                    preprocess._column,
-                    (
-                        cudf.core.column.TimeDeltaColumn,
-                        cudf.core.column.DatetimeColumn,
-                    ),
-                )
+                if preprocess.dtype.kind in "mM"
                 else str(cudf.NA)
             )
             output = repr(
                 preprocess.astype("str").fillna(fill_value).to_pandas()
             )
-        elif isinstance(
-            preprocess._column, cudf.core.column.CategoricalColumn
-        ):
+        elif isinstance(preprocess.dtype, cudf.CategoricalDtype):
             min_rows = (
                 height
                 if pd.get_option("display.min_rows") == 0
                 else pd.get_option("display.min_rows")
             )
             show_dimensions = pd.get_option("display.show_dimensions")
-            if preprocess._column.categories.dtype.kind == "f":
+            if preprocess.dtype.categories.dtype.kind == "f":
                 pd_series = (
                     preprocess.astype("str")
                     .to_pandas()
@@ -1461,13 +1432,13 @@ def __repr__(self):
             output = repr(preprocess.to_pandas())
 
         lines = output.split("\n")
-        if isinstance(preprocess._column, cudf.core.column.CategoricalColumn):
+        if isinstance(preprocess.dtype, cudf.CategoricalDtype):
             category_memory = lines[-1]
-            if preprocess._column.categories.dtype.kind == "f":
+            if preprocess.dtype.categories.dtype.kind == "f":
                 category_memory = category_memory.replace("'", "").split(": ")
                 category_memory = (
                     category_memory[0].replace(
-                        "object", preprocess._column.categories.dtype.name
+                        "object", preprocess.dtype.categories.dtype.name
                     )
                     + ": "
                     + category_memory[1]
diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py
index 7f6ce1100ea..12a1ecc68e0 100644
--- a/python/cudf/cudf/core/tools/datetimes.py
+++ b/python/cudf/cudf/core/tools/datetimes.py
@@ -1058,8 +1058,7 @@ def _to_iso_calendar(arg):
         )
     if isinstance(arg, cudf.Index):
         iso_params = [
-            arg._column.as_string_column(arg._values.dtype, fmt)
-            for fmt in formats
+            arg._column.as_string_column(arg.dtype, fmt) for fmt in formats
         ]
         index = arg._column
     elif isinstance(arg.series, cudf.Series):

From d29af846ed7f881d7cedccd07f147bde39218101 Mon Sep 17 00:00:00 2001
From: Ed Seidl <etseidl@users.noreply.github.com>
Date: Tue, 7 May 2024 17:10:51 -0700
Subject: [PATCH 167/842] Rework some python tests of Parquet delta encodings
 (#15693)

test_parquet.py currently takes around 55s to run on an RTXA6000 system. A large portion of that run time is in two tests of the Parquet DELTA_LENGTH_BYTE_ARRAY and DELTA_BYTE_ARRAY encodings. These tests are parameterized with varying row counts to test certain encoding edge cases, but the final two row counts (10,000, 50,000) are unnecessarily large to provide adequate test coverage. This PR reduces the number of row counts (some were redundant) and decreases the maximum row count to 1,000.  This drops the execution time to just under 26s on the same system.

This PR also corrects an oversight from #15239. DELTA_BYTE_ARRAY encoding should have been added to the tests at that time.

Authors:
  - Ed Seidl (https://github.com/etseidl)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Matthew Roeschke (https://github.com/mroeschke)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15693
---
 python/cudf/cudf/tests/test_parquet.py | 55 +++++++++++++++-----------
 1 file changed, 32 insertions(+), 23 deletions(-)

diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index f1b90b40991..1e175f5ff0d 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -1311,8 +1311,19 @@ def test_parquet_delta_byte_array(datadir):
     assert_eq(cudf.read_parquet(fname), pd.read_parquet(fname))
 
 
+# values chosen to exercise:
+#    1 - header only, no bitpacked values
+#    2 - one bitpacked value
+#   23 - one partially filled miniblock
+#   32 - almost full miniblock
+#   33 - one full miniblock
+#   34 - one full miniblock plus one value in new miniblock
+#  128 - almost full block
+#  129 - one full block
+#  130 - one full block plus one value in new block
+# 1000 - multiple blocks
 def delta_num_rows():
-    return [1, 2, 23, 32, 33, 34, 64, 65, 66, 128, 129, 130, 20000, 50000]
+    return [1, 2, 23, 32, 33, 34, 128, 129, 130, 1000]
 
 
 @pytest.mark.parametrize("nrows", delta_num_rows())
@@ -1412,17 +1423,16 @@ def test_delta_byte_array_roundtrip(
     pcdf = cudf.from_pandas(test_pdf)
     assert_eq(cdf, pcdf)
 
-    # Test DELTA_LENGTH_BYTE_ARRAY writing as well
-    if str_encoding == "DELTA_LENGTH_BYTE_ARRAY":
-        cudf_fname = tmpdir.join("cdfdeltaba.parquet")
-        pcdf.to_parquet(
-            cudf_fname,
-            compression="snappy",
-            header_version="2.0",
-            use_dictionary=False,
-        )
-        cdf2 = cudf.from_pandas(pd.read_parquet(cudf_fname))
-        assert_eq(cdf2, cdf)
+    # Write back out with cudf and make sure pyarrow can read it
+    cudf_fname = tmpdir.join("cdfdeltaba.parquet")
+    pcdf.to_parquet(
+        cudf_fname,
+        compression="snappy",
+        header_version="2.0",
+        use_dictionary=False,
+    )
+    cdf2 = cudf.from_pandas(pd.read_parquet(cudf_fname))
+    assert_eq(cdf2, cdf)
 
 
 @pytest.mark.parametrize("nrows", delta_num_rows())
@@ -1479,17 +1489,16 @@ def string_list_gen_wrapped(x, y):
     pcdf = cudf.from_pandas(test_pdf)
     assert_eq(cdf, pcdf)
 
-    # Test DELTA_LENGTH_BYTE_ARRAY writing as well
-    if str_encoding == "DELTA_LENGTH_BYTE_ARRAY":
-        cudf_fname = tmpdir.join("cdfdeltaba.parquet")
-        pcdf.to_parquet(
-            cudf_fname,
-            compression="snappy",
-            header_version="2.0",
-            use_dictionary=False,
-        )
-        cdf2 = cudf.from_pandas(pd.read_parquet(cudf_fname))
-        assert_eq(cdf2, cdf)
+    # Write back out with cudf and make sure pyarrow can read it
+    cudf_fname = tmpdir.join("cdfdeltaba.parquet")
+    pcdf.to_parquet(
+        cudf_fname,
+        compression="snappy",
+        header_version="2.0",
+        use_dictionary=False,
+    )
+    cdf2 = cudf.from_pandas(pd.read_parquet(cudf_fname))
+    assert_eq(cdf2, cdf)
 
 
 @pytest.mark.parametrize(

From 46ae8cbc5cad97d45500901b1b15ed7c2f3eb0fc Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Tue, 7 May 2024 17:13:57 -0700
Subject: [PATCH 168/842] Fix copy assignment and the comparison operator of
 `rmm_host_allocator` (#15677)

Copy assignment of `rmm_host_allocator`, used in `hostdevice_vector`, is missing the `stream` member assignment, leading to deallocation in the default stream in the assigned-to allocator.

This PR fixes this error by switching to the auto-generated special functions.

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Shruti Shivakumar (https://github.com/shrshi)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15677
---
 .../cudf/detail/utilities/rmm_host_vector.hpp | 29 +++----------------
 1 file changed, 4 insertions(+), 25 deletions(-)

diff --git a/cpp/include/cudf/detail/utilities/rmm_host_vector.hpp b/cpp/include/cudf/detail/utilities/rmm_host_vector.hpp
index 858501877b0..6901a19473e 100644
--- a/cpp/include/cudf/detail/utilities/rmm_host_vector.hpp
+++ b/cpp/include/cudf/detail/utilities/rmm_host_vector.hpp
@@ -109,30 +109,6 @@ class rmm_host_allocator {
   {
   }
 
-  /**
-   * @brief Copy constructor
-   */
-  rmm_host_allocator(rmm_host_allocator const& other) = default;
-
-  /**
-   * @brief Move constructor
-   */
-  rmm_host_allocator(rmm_host_allocator&& other) = default;
-
-  /**
-   * @brief Assignment operator
-   */
-  rmm_host_allocator& operator=(rmm_host_allocator const& other)
-  {
-    mr = other.mr;
-    return *this;
-  }
-
-  /**
-   * @brief rmm_host_allocator's null destructor does nothing.
-   */
-  inline ~rmm_host_allocator() {}
-
   /**
    * @brief This method allocates storage for objects in host memory.
    *
@@ -183,7 +159,10 @@ class rmm_host_allocator {
    *  @param x The other \p rmm_host_allocator of interest.
    *  @return This method always returns \c true.
    */
-  inline bool operator==(rmm_host_allocator const& x) const { return x.mr == mr; }
+  inline bool operator==(rmm_host_allocator const& x) const
+  {
+    return x.mr == mr && x.stream == stream;
+  }
 
   /**
    * @brief This method tests this \p rmm_host_allocator for inequality

From 5f1f0dd503ac55facfb91ae0c528b88b306831df Mon Sep 17 00:00:00 2001
From: Ed Seidl <etseidl@users.noreply.github.com>
Date: Tue, 7 May 2024 20:15:50 -0700
Subject: [PATCH 169/842] Round trip FIXED_LEN_BYTE_ARRAY data properly in
 Parquet writer (#15600)

#13437 added the ability to consume FIXED_LEN_BYTE_ARRAY encoded data and represent it as lists of `UINT8`. When trying to write this data back to Parquet there are two problems. 1) the notion of fixed length is lost, and 2) the `UINT8` data is written as a list of `INT32` which can quadruple the storage required. This PR addresses both issues by adding fields to the input and output metadata to allow for preserving the form of the original data.

Authors:
  - Ed Seidl (https://github.com/etseidl)
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Muhammad Haseeb (https://github.com/mhaseeb123)

URL: https://github.com/rapidsai/cudf/pull/15600
---
 cpp/include/cudf/io/types.hpp              | 59 ++++++++++++++-
 cpp/src/io/functions.cpp                   |  2 +
 cpp/src/io/parquet/page_enc.cu             | 32 +++++---
 cpp/src/io/parquet/parquet_gpu.hpp         |  1 +
 cpp/src/io/parquet/reader_impl.cpp         | 16 ++--
 cpp/src/io/parquet/reader_impl_helpers.cpp |  7 +-
 cpp/src/io/parquet/writer_impl.cu          | 13 +++-
 cpp/src/io/utilities/column_buffer.cpp     |  5 ++
 cpp/tests/io/parquet_writer_test.cpp       | 86 ++++++++++++++++++++++
 9 files changed, 198 insertions(+), 23 deletions(-)

diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp
index b3dea0ab280..150e997f533 100644
--- a/cpp/include/cudf/io/types.hpp
+++ b/cpp/include/cudf/io/types.hpp
@@ -236,6 +236,8 @@ enum dictionary_policy {
 struct column_name_info {
   std::string name;                        ///< Column name
   std::optional<bool> is_nullable;         ///< Column nullability
+  std::optional<bool> is_binary;           ///< Column is binary (i.e. not a list)
+  std::optional<int32_t> type_length;      ///< Byte width of data (for fixed length data)
   std::vector<column_name_info> children;  ///< Child column names
 
   /**
@@ -243,9 +245,12 @@ struct column_name_info {
    *
    * @param _name Column name
    * @param _is_nullable True if column is nullable
+   * @param _is_binary True if column is binary data
    */
-  column_name_info(std::string const& _name, std::optional<bool> _is_nullable = std::nullopt)
-    : name(_name), is_nullable(_is_nullable)
+  column_name_info(std::string const& _name,
+                   std::optional<bool> _is_nullable = std::nullopt,
+                   std::optional<bool> _is_binary   = std::nullopt)
+    : name(_name), is_nullable(_is_nullable), is_binary(_is_binary)
   {
   }
 
@@ -606,6 +611,7 @@ class column_in_metadata {
   bool _skip_compression    = false;
   std::optional<uint8_t> _decimal_precision;
   std::optional<int32_t> _parquet_field_id;
+  std::optional<int32_t> _type_length;
   std::vector<column_in_metadata> children;
   column_encoding _encoding = column_encoding::USE_DEFAULT;
 
@@ -693,6 +699,19 @@ class column_in_metadata {
     return *this;
   }
 
+  /**
+   * @brief Set the data length of the column. Only valid if this column is a
+   * fixed-length byte array.
+   *
+   * @param length The data length to set for this column
+   * @return this for chaining
+   */
+  column_in_metadata& set_type_length(int32_t length) noexcept
+  {
+    _type_length = length;
+    return *this;
+  }
+
   /**
    * @brief Set the parquet field id of this column.
    *
@@ -826,6 +845,22 @@ class column_in_metadata {
    */
   [[nodiscard]] uint8_t get_decimal_precision() const { return _decimal_precision.value(); }
 
+  /**
+   * @brief Get whether type length has been set for this column
+   *
+   * @return Boolean indicating whether type length has been set for this column
+   */
+  [[nodiscard]] bool is_type_length_set() const noexcept { return _type_length.has_value(); }
+
+  /**
+   * @brief Get the type length that was set for this column.
+   *
+   * @throws std::bad_optional_access If type length was not set for this
+   *         column. Check using `is_type_length_set()` first.
+   * @return The decimal precision that was set for this column
+   */
+  [[nodiscard]] uint8_t get_type_length() const { return _type_length.value(); }
+
   /**
    * @brief Get whether parquet field id has been set for this column.
    *
@@ -932,6 +967,7 @@ struct partition_info {
 class reader_column_schema {
   // Whether to read binary data as a string column
   bool _convert_binary_to_strings{true};
+  int32_t _type_length{0};
 
   std::vector<reader_column_schema> children;
 
@@ -997,6 +1033,18 @@ class reader_column_schema {
     return *this;
   }
 
+  /**
+   * @brief Sets the length of fixed length data.
+   *
+   * @param type_length Size of the data type in bytes
+   * @return this for chaining
+   */
+  reader_column_schema& set_type_length(int32_t type_length)
+  {
+    _type_length = type_length;
+    return *this;
+  }
+
   /**
    * @brief Get whether to encode this column as binary or string data
    *
@@ -1007,6 +1055,13 @@ class reader_column_schema {
     return _convert_binary_to_strings;
   }
 
+  /**
+   * @brief Get the length in bytes of this fixed length data.
+   *
+   * @return The length in bytes of the data type
+   */
+  [[nodiscard]] int32_t get_type_length() const { return _type_length; }
+
   /**
    * @brief Get the number of child objects
    *
diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp
index 98b010109ec..0358a1a6b86 100644
--- a/cpp/src/io/functions.cpp
+++ b/cpp/src/io/functions.cpp
@@ -592,6 +592,8 @@ table_input_metadata::table_input_metadata(table_metadata const& metadata)
     [&](column_name_info const& name) {
       auto col_meta = column_in_metadata{name.name};
       if (name.is_nullable.has_value()) { col_meta.set_nullability(name.is_nullable.value()); }
+      if (name.is_binary.value_or(false)) { col_meta.set_output_as_binary(true); }
+      if (name.type_length.has_value()) { col_meta.set_type_length(name.type_length.value()); }
       std::transform(name.children.begin(),
                      name.children.end(),
                      std::back_inserter(col_meta.children),
diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu
index 11b18579c58..e9558735929 100644
--- a/cpp/src/io/parquet/page_enc.cu
+++ b/cpp/src/io/parquet/page_enc.cu
@@ -109,10 +109,10 @@ using rle_page_enc_state_s = page_enc_state_s<rle_buffer_size>;
 /**
  * @brief Returns the size of the type in the Parquet file.
  */
-constexpr uint32_t physical_type_len(Type physical_type, type_id id)
+constexpr uint32_t physical_type_len(Type physical_type, type_id id, int type_length)
 {
-  if (physical_type == FIXED_LEN_BYTE_ARRAY and id == type_id::DECIMAL128) {
-    return sizeof(__int128_t);
+  if (physical_type == FIXED_LEN_BYTE_ARRAY) {
+    return id == type_id::DECIMAL128 ? sizeof(__int128_t) : type_length;
   }
   switch (physical_type) {
     case INT96: return 12u;
@@ -183,7 +183,7 @@ void __device__ calculate_frag_size(frag_init_state_s* const s, int t)
 
   auto const physical_type   = s->col.physical_type;
   auto const leaf_type       = s->col.leaf_column->type().id();
-  auto const dtype_len       = physical_type_len(physical_type, leaf_type);
+  auto const dtype_len       = physical_type_len(physical_type, leaf_type, s->col.type_length);
   auto const nvals           = s->frag.num_leaf_values;
   auto const start_value_idx = s->frag.start_value_idx;
 
@@ -541,7 +541,8 @@ __device__ size_t delta_data_len(Type physical_type,
                                  size_t page_size,
                                  encode_kernel_mask encoding)
 {
-  auto const dtype_len_out = physical_type_len(physical_type, type_id);
+  // dtype_len_out is for the lengths, rather than the char data, so pass sizeof(int32_t)
+  auto const dtype_len_out = physical_type_len(physical_type, type_id, sizeof(int32_t));
   auto const dtype_len     = [&]() -> uint32_t {
     if (physical_type == INT32) { return int32_logical_len(type_id); }
     if (physical_type == INT96) { return sizeof(int64_t); }
@@ -1662,7 +1663,7 @@ CUDF_KERNEL void __launch_bounds__(block_size, 8)
   __syncthreads();
   auto const physical_type = s->col.physical_type;
   auto const type_id       = s->col.leaf_column->type().id();
-  auto const dtype_len_out = physical_type_len(physical_type, type_id);
+  auto const dtype_len_out = physical_type_len(physical_type, type_id, s->col.type_length);
   auto const dtype_len_in  = [&]() -> uint32_t {
     if (physical_type == INT32) { return int32_logical_len(type_id); }
     if (physical_type == INT96) { return sizeof(int64_t); }
@@ -1837,6 +1838,19 @@ CUDF_KERNEL void __launch_bounds__(block_size, 8)
                            thrust::make_reverse_iterator(v_char_ptr),
                            dst + pos);
             }
+          } else {
+            auto const elem =
+              get_element<statistics::byte_array_view>(*(s->col.leaf_column), val_idx);
+            if (len != 0 and elem.data() != nullptr) {
+              if (is_split_stream) {
+                auto const v_char_ptr = reinterpret_cast<uint8_t const*>(elem.data());
+                for (int i = 0; i < dtype_len_out; i++, pos += stride) {
+                  dst[pos] = v_char_ptr[i];
+                }
+              } else {
+                memcpy(dst + pos, elem.data(), len);
+              }
+            }
           }
         } break;
       }
@@ -1884,7 +1898,7 @@ CUDF_KERNEL void __launch_bounds__(block_size, 8)
   // Encode data values
   auto const physical_type = s->col.physical_type;
   auto const type_id       = s->col.leaf_column->type().id();
-  auto const dtype_len_out = physical_type_len(physical_type, type_id);
+  auto const dtype_len_out = physical_type_len(physical_type, type_id, s->col.type_length);
   auto const dtype_len_in  = [&]() -> uint32_t {
     if (physical_type == INT32) { return int32_logical_len(type_id); }
     if (physical_type == INT96) { return sizeof(int64_t); }
@@ -2016,7 +2030,7 @@ CUDF_KERNEL void __launch_bounds__(block_size, 8)
   // Encode data values
   auto const physical_type = s->col.physical_type;
   auto const type_id       = s->col.leaf_column->type().id();
-  auto const dtype_len_out = physical_type_len(physical_type, type_id);
+  auto const dtype_len_out = physical_type_len(physical_type, type_id, s->col.type_length);
   auto const dtype_len_in  = [&]() -> uint32_t {
     if (physical_type == INT32) { return int32_logical_len(type_id); }
     if (physical_type == INT96) { return sizeof(int64_t); }
@@ -3218,7 +3232,7 @@ __device__ int32_t calculate_boundary_order(statistics_chunk const* s,
 }
 
 // align ptr to an 8-byte boundary. address returned will be <= ptr.
-constexpr __device__ void* align8(void* ptr)
+inline __device__ void* align8(void* ptr)
 {
   // it's ok to round down because we have an extra 7 bytes in the buffer
   auto algn = 3 & reinterpret_cast<std::uintptr_t>(ptr);
diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp
index 3b18175dccd..e3e4d8736c7 100644
--- a/cpp/src/io/parquet/parquet_gpu.hpp
+++ b/cpp/src/io/parquet/parquet_gpu.hpp
@@ -472,6 +472,7 @@ struct chunk_page_info {
 struct parquet_column_device_view : stats_column_desc {
   Type physical_type;            //!< physical data type
   ConvertedType converted_type;  //!< logical data type
+  int32_t type_length;           //!< length of fixed_length_byte_array data
   uint8_t level_bits;  //!< bits to encode max definition (lower nibble) & repetition (upper nibble)
                        //!< levels
   constexpr uint8_t num_def_level_bits() const { return level_bits & 0xf; }
diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp
index 0602b5ec007..3af4d5cdb86 100644
--- a/cpp/src/io/parquet/reader_impl.cpp
+++ b/cpp/src/io/parquet/reader_impl.cpp
@@ -510,14 +510,18 @@ table_with_metadata reader::impl::read_chunk_internal(
 
   // Create the final output cudf columns.
   for (size_t i = 0; i < _output_buffers.size(); ++i) {
-    auto metadata      = _reader_column_schema.has_value()
-                           ? std::make_optional<reader_column_schema>((*_reader_column_schema)[i])
-                           : std::nullopt;
-    auto const& schema = _metadata->get_schema(_output_column_schemas[i]);
-    // FIXED_LEN_BYTE_ARRAY never read as string
-    if (schema.type == FIXED_LEN_BYTE_ARRAY and schema.converted_type != DECIMAL) {
+    auto metadata           = _reader_column_schema.has_value()
+                                ? std::make_optional<reader_column_schema>((*_reader_column_schema)[i])
+                                : std::nullopt;
+    auto const& schema      = _metadata->get_schema(_output_column_schemas[i]);
+    auto const logical_type = schema.logical_type.value_or(LogicalType{});
+    // FIXED_LEN_BYTE_ARRAY never read as string.
+    // TODO: if we ever decide that the default reader behavior is to treat unannotated BINARY as
+    // binary and not strings, this test needs to change.
+    if (schema.type == FIXED_LEN_BYTE_ARRAY and logical_type.type != LogicalType::DECIMAL) {
       metadata = std::make_optional<reader_column_schema>();
       metadata->set_convert_binary_to_strings(false);
+      metadata->set_type_length(schema.type_length);
     }
     // Only construct `out_metadata` if `_output_metadata` has not been cached.
     if (!_output_metadata) {
diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp
index c47beb8d7ed..68dbf532a68 100644
--- a/cpp/src/io/parquet/reader_impl_helpers.cpp
+++ b/cpp/src/io/parquet/reader_impl_helpers.cpp
@@ -165,9 +165,10 @@ type_id to_type_id(SchemaElement const& schema,
     case FLOAT: return type_id::FLOAT32;
     case DOUBLE: return type_id::FLOAT64;
     case BYTE_ARRAY:
-    case FIXED_LEN_BYTE_ARRAY:
-      // Can be mapped to INT32 (32-bit hash) or STRING
-      return strings_to_categorical ? type_id::INT32 : type_id::STRING;
+      // strings can be mapped to a 32-bit hash
+      if (strings_to_categorical) { return type_id::INT32; }
+      [[fallthrough]];
+    case FIXED_LEN_BYTE_ARRAY: return type_id::STRING;
     case INT96:
       return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id
                                                    : type_id::TIMESTAMP_NANOSECONDS;
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index 24aa630a05f..1dfced94f5b 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -755,7 +755,14 @@ std::vector<schema_tree_node> construct_schema_tree(
         }
 
         schema_tree_node col_schema{};
-        col_schema.type            = Type::BYTE_ARRAY;
+        // test if this should be output as FIXED_LEN_BYTE_ARRAY
+        if (col_meta.is_type_length_set()) {
+          col_schema.type        = Type::FIXED_LEN_BYTE_ARRAY;
+          col_schema.type_length = col_meta.get_type_length();
+        } else {
+          col_schema.type = Type::BYTE_ARRAY;
+        }
+
         col_schema.converted_type  = thrust::nullopt;
         col_schema.stats_dtype     = statistics_dtype::dtype_byte_array;
         col_schema.repetition_type = col_nullable ? OPTIONAL : REQUIRED;
@@ -1075,6 +1082,7 @@ parquet_column_device_view parquet_column_view::get_device_view(rmm::cuda_stream
   auto desc        = parquet_column_device_view{};  // Zero out all fields
   desc.stats_dtype = schema_node.stats_dtype;
   desc.ts_scale    = schema_node.ts_scale;
+  desc.type_length = schema_node.type_length;
 
   if (is_list()) {
     desc.level_offsets = _dremel_offsets.data();
@@ -1317,8 +1325,7 @@ build_chunk_dictionaries(hostdevice_2dvector<EncColumnChunk>& chunks,
       chunk_col_desc.requested_encoding != column_encoding::USE_DEFAULT &&
       chunk_col_desc.requested_encoding != column_encoding::DICTIONARY;
     auto const is_type_non_dict =
-      chunk_col_desc.physical_type == Type::BOOLEAN ||
-      (chunk_col_desc.output_as_byte_array && chunk_col_desc.physical_type == Type::BYTE_ARRAY);
+      chunk_col_desc.physical_type == Type::BOOLEAN || chunk_col_desc.output_as_byte_array;
 
     if (is_type_non_dict || is_requested_non_dict) {
       chunk.use_dictionary = false;
diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp
index db84778edc6..5ef43599838 100644
--- a/cpp/src/io/utilities/column_buffer.cpp
+++ b/cpp/src/io/utilities/column_buffer.cpp
@@ -188,6 +188,11 @@ std::unique_ptr<column> make_column(column_buffer_base<string_policy>& buffer,
         if (schema_info != nullptr) {
           schema_info->children.push_back(column_name_info{"offsets"});
           schema_info->children.push_back(column_name_info{"binary"});
+          // cuDF type will be list<UINT8>, but remember it was originally binary data
+          schema_info->is_binary = true;
+          if (schema.has_value() and schema->get_type_length() > 0) {
+            schema_info->type_length = schema->get_type_length();
+          }
         }
 
         return make_lists_column(
diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp
index fd8484bc70f..ad0860e265e 100644
--- a/cpp/tests/io/parquet_writer_test.cpp
+++ b/cpp/tests/io/parquet_writer_test.cpp
@@ -1872,6 +1872,92 @@ TEST_F(ParquetWriterTest, DurationByteStreamSplit)
   test_durations([](auto i) { return false; }, true);
 }
 
+TEST_F(ParquetWriterTest, WriteFixedLenByteArray)
+{
+  srand(31337);
+  using cudf::io::parquet::detail::Encoding;
+  constexpr int fixed_width          = 16;
+  constexpr cudf::size_type num_rows = 200;
+  std::vector<uint8_t> data(num_rows * fixed_width);
+  std::vector<cudf::size_type> offsets(num_rows + 1);
+
+  // fill a num_rows X fixed_width array with random numbers and populate offsets array
+  int cur_offset = 0;
+  for (int i = 0; i < num_rows; i++) {
+    offsets[i] = cur_offset;
+    for (int j = 0; j < fixed_width; j++, cur_offset++) {
+      data[cur_offset] = rand() & 0xff;
+    }
+  }
+  offsets[num_rows] = cur_offset;
+
+  auto data_child = cudf::test::fixed_width_column_wrapper<uint8_t>(data.begin(), data.end());
+  auto off_child  = cudf::test::fixed_width_column_wrapper<int32_t>(offsets.begin(), offsets.end());
+  auto col = cudf::make_lists_column(num_rows, off_child.release(), data_child.release(), 0, {});
+
+  auto expected = table_view{{*col, *col, *col, *col}};
+  cudf::io::table_input_metadata expected_metadata(expected);
+
+  expected_metadata.column_metadata[0]
+    .set_name("flba_plain")
+    .set_type_length(fixed_width)
+    .set_encoding(cudf::io::column_encoding::PLAIN)
+    .set_output_as_binary(true);
+  expected_metadata.column_metadata[1]
+    .set_name("flba_split")
+    .set_type_length(fixed_width)
+    .set_encoding(cudf::io::column_encoding::BYTE_STREAM_SPLIT)
+    .set_output_as_binary(true);
+  expected_metadata.column_metadata[2]
+    .set_name("flba_delta")
+    .set_type_length(fixed_width)
+    .set_encoding(cudf::io::column_encoding::DELTA_BYTE_ARRAY)
+    .set_output_as_binary(true);
+  expected_metadata.column_metadata[3]
+    .set_name("flba_dict")
+    .set_type_length(fixed_width)
+    .set_encoding(cudf::io::column_encoding::DICTIONARY)
+    .set_output_as_binary(true);
+
+  auto filepath = temp_env->get_temp_filepath("WriteFixedLenByteArray.parquet");
+  cudf::io::parquet_writer_options out_opts =
+    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected)
+      .metadata(expected_metadata);
+  cudf::io::write_parquet(out_opts);
+
+  cudf::io::parquet_reader_options in_opts =
+    cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath});
+  auto result = cudf::io::read_parquet(in_opts);
+
+  CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view());
+
+  // check page headers to make sure each column is encoded with the appropriate encoder
+  auto const source = cudf::io::datasource::create(filepath);
+  cudf::io::parquet::detail::FileMetaData fmd;
+  read_footer(source, &fmd);
+
+  // check that the schema retains the FIXED_LEN_BYTE_ARRAY type
+  for (int i = 1; i <= 4; i++) {
+    EXPECT_EQ(fmd.schema[i].type, cudf::io::parquet::detail::Type::FIXED_LEN_BYTE_ARRAY);
+    EXPECT_EQ(fmd.schema[i].type_length, fixed_width);
+  }
+
+  // no nulls and no repetition, so the only encoding used should be for the data.
+  auto const expect_enc = [&fmd](int idx, cudf::io::parquet::detail::Encoding enc) {
+    EXPECT_EQ(fmd.row_groups[0].columns[idx].meta_data.encodings[0], enc);
+  };
+
+  // requested plain
+  expect_enc(0, Encoding::PLAIN);
+  // requested byte_stream_split
+  expect_enc(1, Encoding::BYTE_STREAM_SPLIT);
+  // requested delta_byte_array
+  expect_enc(2, Encoding::DELTA_BYTE_ARRAY);
+  // requested dictionary, but should fall back to plain
+  // TODO: update if we get FLBA working with dictionary encoding
+  expect_enc(3, Encoding::PLAIN);
+}
+
 /////////////////////////////////////////////////////////////
 // custom mem mapped data sink that supports device writes
 template <bool supports_device_writes>

From 2056d0fa16bbea9fef9bb9e10558967680b14a3b Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Wed, 8 May 2024 11:19:35 -0400
Subject: [PATCH 170/842] Use experimental make_strings_children for
 multi-replace_re (#15667)

Updates multi-pattern version of `cudf::strings::replace_re` to use the new experimental `make_strings_children` which supports building large strings.

Reference https://github.com/rapidsai/cudf/issues/15579

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Karthikeyan (https://github.com/karthikeyann)
  - MithunR (https://github.com/mythrocks)

URL: https://github.com/rapidsai/cudf/pull/15667
---
 cpp/src/strings/replace/multi_re.cu | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/cpp/src/strings/replace/multi_re.cu b/cpp/src/strings/replace/multi_re.cu
index 5172dba3fc3..b9a3acf747f 100644
--- a/cpp/src/strings/replace/multi_re.cu
+++ b/cpp/src/strings/replace/multi_re.cu
@@ -23,7 +23,7 @@
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
-#include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_children_ex.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/replace_re.hpp>
 #include <cudf/strings/string_view.cuh>
@@ -56,13 +56,14 @@ struct replace_multi_regex_fn {
   device_span<reprog_device const> progs;  // array of regex progs
   found_range* d_found_ranges;             // working array matched (begin,end) values
   column_device_view const d_repls;        // replacement strings
-  size_type* d_offsets{};
+  size_type* d_sizes{};
   char* d_chars{};
+  cudf::detail::input_offsetalator d_offsets;
 
   __device__ void operator()(size_type idx)
   {
     if (d_strings.is_null(idx)) {
-      if (!d_chars) d_offsets[idx] = 0;
+      if (!d_chars) { d_sizes[idx] = 0; }
       return;
     }
 
@@ -129,7 +130,7 @@ struct replace_multi_regex_fn {
                      d_str.size_bytes() - last_pos.byte_offset(),
                      out_ptr);
     } else {
-      d_offsets[idx] = nbytes;
+      d_sizes[idx] = nbytes;
     }
   }
 };
@@ -186,7 +187,7 @@ std::unique_ptr<column> replace_re(strings_column_view const& input,
 
   auto found_ranges = rmm::device_uvector<found_range>(d_progs.size() * input.size(), stream);
 
-  auto [offsets_column, chars] = make_strings_children(
+  auto [offsets_column, chars] = experimental::make_strings_children(
     replace_multi_regex_fn{*d_strings, d_progs, found_ranges.data(), *d_repls},
     input.size(),
     stream,

From ab73b4cd14f39fbc64f6b0b0ab625e78715acd2d Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Wed, 8 May 2024 12:06:13 -0500
Subject: [PATCH 171/842] Make `nan_as_null` behavior consistent across all
 APIs (#15692)

Fixes: #15679

This PR switches the default of `nan_as_null` to be `False` if pandas compatibility mode is turned on.

Forked from #14534

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/15692
---
 python/cudf/cudf/core/dataframe.py       | 13 +++++++++++--
 python/cudf/cudf/core/index.py           |  6 +++++-
 python/cudf/cudf/core/series.py          |  4 +++-
 python/cudf/cudf/tests/test_dataframe.py |  8 ++++++++
 python/cudf/cudf/tests/test_index.py     |  8 ++++++++
 python/cudf/cudf/tests/test_series.py    |  8 ++++++++
 6 files changed, 43 insertions(+), 4 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 6928425a867..b937d2da25c 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -684,9 +684,16 @@ class DataFrame(IndexedFrame, Serializable, GetAttrGetItemMixin):
 
     @_cudf_nvtx_annotate
     def __init__(
-        self, data=None, index=None, columns=None, dtype=None, nan_as_null=True
+        self,
+        data=None,
+        index=None,
+        columns=None,
+        dtype=None,
+        nan_as_null=no_default,
     ):
         super().__init__()
+        if nan_as_null is no_default:
+            nan_as_null = not cudf.get_option("mode.pandas_compatible")
 
         if isinstance(columns, (Series, cudf.BaseIndex)):
             columns = columns.to_pandas()
@@ -3185,7 +3192,7 @@ def reset_index(
         )
 
     @_cudf_nvtx_annotate
-    def insert(self, loc, name, value, nan_as_null=None):
+    def insert(self, loc, name, value, nan_as_null=no_default):
         """Add a column to DataFrame at the index specified by loc.
 
         Parameters
@@ -3200,6 +3207,8 @@ def insert(self, loc, name, value, nan_as_null=None):
             ``null`` values.
             If ``False``, leaves ``np.nan`` values as is.
         """
+        if nan_as_null is no_default:
+            nan_as_null = not cudf.get_option("mode.pandas_compatible")
         return self._insert(
             loc=loc,
             name=name,
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 52322b0160f..35afe6ee949 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -2964,7 +2964,7 @@ def _clean_nulls_from_index(self):
 
 @_cudf_nvtx_annotate
 def as_index(
-    arbitrary, nan_as_null=None, copy=False, name=no_default, dtype=None
+    arbitrary, nan_as_null=no_default, copy=False, name=no_default, dtype=None
 ) -> BaseIndex:
     """Create an Index from an arbitrary object
 
@@ -3014,6 +3014,10 @@ def as_index(
         - DatetimeIndex for Datetime input.
         - Index for all other inputs.
     """
+    if nan_as_null is no_default:
+        nan_as_null = (
+            False if cudf.get_option("mode.pandas_compatible") else None
+        )
 
     if name is no_default:
         name = getattr(arbitrary, "name", None)
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 63a49a898f4..c7bc97edd68 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -581,8 +581,10 @@ def __init__(
         dtype=None,
         name=None,
         copy=False,
-        nan_as_null=True,
+        nan_as_null=no_default,
     ):
+        if nan_as_null is no_default:
+            nan_as_null = not cudf.get_option("mode.pandas_compatible")
         index_from_data = None
         name_from_data = None
         if data is None:
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index f52076407b5..2dee3566e1b 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -11006,3 +11006,11 @@ def test_op_preserves_column_metadata(column, operation):
     result = operation(df).columns
     expected = pd.Index(column)
     pd.testing.assert_index_equal(result, expected, exact=True)
+
+
+def test_dataframe_init_with_nans():
+    with cudf.option_context("mode.pandas_compatible", True):
+        gdf = cudf.DataFrame({"a": [1, 2, 3, np.nan]})
+    assert gdf["a"].dtype == np.dtype("float64")
+    pdf = pd.DataFrame({"a": [1, 2, 3, np.nan]})
+    assert_eq(pdf, gdf)
diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py
index baa839ecd72..0b252cec4b8 100644
--- a/python/cudf/cudf/tests/test_index.py
+++ b/python/cudf/cudf/tests/test_index.py
@@ -3249,3 +3249,11 @@ def test_index_contains_float_int(data, dtype, needle):
     expected = needle in pidx
 
     assert_eq(actual, expected)
+
+
+def test_Index_init_with_nans():
+    with cudf.option_context("mode.pandas_compatible", True):
+        gi = cudf.Index([1, 2, 3, np.nan])
+    assert gi.dtype == np.dtype("float64")
+    pi = pd.Index([1, 2, 3, np.nan])
+    assert_eq(pi, gi)
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index 6a9de197374..08a6173d3f5 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -2788,6 +2788,14 @@ def test_squeeze_invalid_axis(axis):
         cudf.Series([1]).squeeze(axis=axis)
 
 
+def test_series_init_with_nans():
+    with cudf.option_context("mode.pandas_compatible", True):
+        gs = cudf.Series([1, 2, 3, np.nan])
+    assert gs.dtype == np.dtype("float64")
+    ps = pd.Series([1, 2, 3, np.nan])
+    assert_eq(ps, gs)
+
+
 @pytest.mark.parametrize("data", [None, 123, 33243243232423, 0])
 def test_timestamp_series_init(data):
     scalar = pd.Timestamp(data)

From 6d0f3d9a6fc0d079d598e08cac824b7b3c6cbc9b Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Wed, 8 May 2024 10:20:35 -0700
Subject: [PATCH 172/842] Add new patch to hide more CCCL APIs (#15493)

See rapidsai/rapids-cmake#580

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Robert Maynard (https://github.com/robertmaynard)

URL: https://github.com/rapidsai/cudf/pull/15493
---
 cpp/cmake/Modules/ConfigureCUDA.cmake           | 7 ++++++-
 cpp/cmake/thirdparty/patches/cccl_override.json | 5 +++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/cpp/cmake/Modules/ConfigureCUDA.cmake b/cpp/cmake/Modules/ConfigureCUDA.cmake
index f79e4c37228..f75b5aef7af 100644
--- a/cpp/cmake/Modules/ConfigureCUDA.cmake
+++ b/cpp/cmake/Modules/ConfigureCUDA.cmake
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2018-2022, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -25,6 +25,11 @@ else()
   list(APPEND CUDF_CUDA_FLAGS -Werror=cross-execution-space-call)
 endif()
 list(APPEND CUDF_CUDA_FLAGS -Xcompiler=-Wall,-Werror,-Wno-error=deprecated-declarations)
+# This warning needs to be suppressed because some parts of cudf instantiate templated CCCL
+# functions in contexts where the resulting instantiations would have internal linkage (e.g. in
+# anonymous namespaces). In such contexts, the visibility attribute on the template is ignored, and
+# the compiler issues a warning. This is not a problem and will be fixed in future versions of CCCL.
+list(APPEND CUDF_CUDA_FLAGS -diag-suppress=1407)
 
 if(DISABLE_DEPRECATION_WARNINGS)
   list(APPEND CUDF_CXX_FLAGS -Wno-deprecated-declarations)
diff --git a/cpp/cmake/thirdparty/patches/cccl_override.json b/cpp/cmake/thirdparty/patches/cccl_override.json
index 68fc8979c46..b33f17f3e4a 100644
--- a/cpp/cmake/thirdparty/patches/cccl_override.json
+++ b/cpp/cmake/thirdparty/patches/cccl_override.json
@@ -18,6 +18,11 @@
           "issue" : "thrust::copy introduced a change in behavior that causes failures with cudaErrorInvalidValue.",
           "fixed_in" : ""
         },
+        {
+          "file": "cccl/kernel_pointer_hiding.diff",
+          "issue": "Hide APIs that accept kernel pointers [https://github.com/NVIDIA/cccl/pull/1395]",
+          "fixed_in": "2.4"
+        },
         {
           "file" : "${current_json_dir}/thrust_disable_64bit_dispatching.diff",
           "issue" : "Remove 64bit dispatching as not needed by libcudf and results in compiling twice as many kernels [https://github.com/rapidsai/cudf/pull/11437]",

From eaf555616ff83a75b3c3b11ce18e1c393604ccf4 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Wed, 8 May 2024 13:30:14 -0500
Subject: [PATCH 173/842] Properly implement binaryops for proxy types (#15684)

Fixes #15675
This PR makes changes to `cudf.pandas` machinery by not calling `operator.op` functions insider the re-direct calls.

Forked from #14534

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/15684
---
 python/cudf/cudf/pandas/fast_slow_proxy.py    | 170 ++++++------------
 .../cudf_pandas_tests/test_cudf_pandas.py     |  13 ++
 .../cudf_pandas_tests/test_fast_slow_proxy.py |   4 -
 .../cudf/cudf_pandas_tests/test_profiler.py   |   1 +
 4 files changed, 71 insertions(+), 117 deletions(-)

diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py
index 835cfa89133..c66458077fa 100644
--- a/python/cudf/cudf/pandas/fast_slow_proxy.py
+++ b/python/cudf/cudf/pandas/fast_slow_proxy.py
@@ -597,90 +597,6 @@ def __setattr__(self, name, value):
             return
         return _FastSlowAttribute("__setattr__").__get__(self)(name, value)
 
-    def __add__(self, other):
-        return _fast_slow_function_call(operator.add, self, other)[0]
-
-    def __radd__(self, other):
-        return _fast_slow_function_call(operator.add, other, self)[0]
-
-    def __sub__(self, other):
-        return _fast_slow_function_call(operator.sub, self, other)[0]
-
-    def __rsub__(self, other):
-        return _fast_slow_function_call(operator.sub, other, self)[0]
-
-    def __mul__(self, other):
-        return _fast_slow_function_call(operator.mul, self, other)[0]
-
-    def __rmul__(self, other):
-        return _fast_slow_function_call(operator.mul, other, self)[0]
-
-    def __truediv__(self, other):
-        return _fast_slow_function_call(operator.truediv, self, other)[0]
-
-    def __rtruediv__(self, other):
-        return _fast_slow_function_call(operator.truediv, other, self)[0]
-
-    def __floordiv__(self, other):
-        return _fast_slow_function_call(operator.floordiv, self, other)[0]
-
-    def __rfloordiv__(self, other):
-        return _fast_slow_function_call(operator.floordiv, other, self)[0]
-
-    def __mod__(self, other):
-        return _fast_slow_function_call(operator.mod, self, other)[0]
-
-    def __rmod__(self, other):
-        return _fast_slow_function_call(operator.mod, other, self)[0]
-
-    def __divmod__(self, other):
-        return _fast_slow_function_call(divmod, self, other)[0]
-
-    def __rdivmod__(self, other):
-        return _fast_slow_function_call(divmod, other, self)[0]
-
-    def __pow__(self, other):
-        return _fast_slow_function_call(operator.pow, self, other)[0]
-
-    def __rpow__(self, other):
-        return _fast_slow_function_call(operator.pow, other, self)[0]
-
-    def __lshift__(self, other):
-        return _fast_slow_function_call(operator.lshift, self, other)[0]
-
-    def __rlshift__(self, other):
-        return _fast_slow_function_call(operator.lshift, other, self)[0]
-
-    def __rshift__(self, other):
-        return _fast_slow_function_call(operator.rshift, self, other)[0]
-
-    def __rrshift__(self, other):
-        return _fast_slow_function_call(operator.rshift, other, self)[0]
-
-    def __and__(self, other):
-        return _fast_slow_function_call(operator.and_, self, other)[0]
-
-    def __rand__(self, other):
-        return _fast_slow_function_call(operator.and_, other, self)[0]
-
-    def __xor__(self, other):
-        return _fast_slow_function_call(operator.xor, self, other)[0]
-
-    def __rxor__(self, other):
-        return _fast_slow_function_call(operator.xor, other, self)[0]
-
-    def __or__(self, other):
-        return _fast_slow_function_call(operator.or_, self, other)[0]
-
-    def __ror__(self, other):
-        return _fast_slow_function_call(operator.or_, other, self)[0]
-
-    def __matmul__(self, other):
-        return _fast_slow_function_call(operator.matmul, self, other)[0]
-
-    def __rmatmul__(self, other):
-        return _fast_slow_function_call(operator.matmul, other, self)[0]
-
 
 class _FinalProxy(_FastSlowProxy):
     """
@@ -1141,41 +1057,69 @@ def _replace_closurevars(
 
 
 _SPECIAL_METHODS: Set[str] = {
-    "__repr__",
-    "__str__",
-    "__len__",
-    "__contains__",
-    "__getitem__",
-    "__setitem__",
-    "__delitem__",
-    "__getslice__",
-    "__setslice__",
-    "__delslice__",
-    "__iter__",
-    "__lt__",
-    "__le__",
-    "__eq__",
-    "__ne__",
-    "__gt__",
-    "__ge__",
-    "__pos__",
-    "__neg__",
-    "__invert__",
     "__abs__",
-    "__round__",
-    "__format__",
+    "__add__",
+    "__and__",
     "__bool__",
-    "__float__",
-    "__int__",
+    "__call__",
     "__complex__",
-    "__enter__",
-    "__exit__",
-    "__next__",
+    "__contains__",
     "__copy__",
-    "__deepcopy__",
     "__dataframe__",
-    "__call__",
+    "__deepcopy__",
+    "__delitem__",
+    "__delslice__",
+    "__divmod__",
+    "__enter__",
+    "__eq__",
+    "__exit__",
+    "__float__",
+    "__floordiv__",
+    "__format__",
+    "__ge__",
+    "__getitem__",
+    "__getslice__",
+    "__gt__",
     # Added on a per-proxy basis
     # https://github.com/rapidsai/xdf/pull/306#pullrequestreview-1636155428
     # "__hash__",
+    "__int__",
+    "__invert__",
+    "__iter__",
+    "__le__",
+    "__len__",
+    "__lshift__",
+    "__lt__",
+    "__matmul__",
+    "__mod__",
+    "__mul__",
+    "__ne__",
+    "__neg__",
+    "__next__",
+    "__or__",
+    "__pos__",
+    "__pow__",
+    "__radd__",
+    "__rand__",
+    "__rdivmod__",
+    "__repr__",
+    "__rfloordiv__",
+    "__rlshift__",
+    "__rmatmul__",
+    "__rmod__",
+    "__rmul__",
+    "__ror__",
+    "__round__",
+    "__rpow__",
+    "__rrshift__",
+    "__rshift__",
+    "__rsub__",
+    "__rtruediv__",
+    "__rxor__",
+    "__setitem__",
+    "__setslice__",
+    "__str__",
+    "__sub__",
+    "__truediv__",
+    "__xor__",
 }
diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
index 8d319cfe640..aa937d3ed4f 100644
--- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
+++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
@@ -1218,6 +1218,19 @@ def test_isinstance_base_offset():
     assert isinstance(offset, xpd.tseries.offsets.BaseOffset)
 
 
+def test_floordiv_array_vs_df():
+    xarray = xpd.Series([1, 2, 3], dtype="datetime64[ns]").array
+    parray = pd.Series([1, 2, 3], dtype="datetime64[ns]").array
+
+    xdf = xpd.DataFrame(xarray)
+    pdf = pd.DataFrame(parray)
+
+    actual = xarray.__floordiv__(xdf)
+    expected = parray.__floordiv__(pdf)
+
+    tm.assert_equal(actual, expected)
+
+
 def test_apply_slow_path_udf_references_global_module():
     def my_apply(df, unused):
         # `datetime` Raised `KeyError: __import__`
diff --git a/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py b/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py
index 631ad2f37b2..39bf07c49de 100644
--- a/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py
+++ b/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py
@@ -439,10 +439,6 @@ def __radd__(self, other):
     assert Bar() + Foo() == "sum"
     assert FooProxy() + BarProxy() == "sum"
     assert BarProxy() + FooProxy() == "sum"
-    assert FooProxy() + Bar() == "sum"
-    assert Bar() + FooProxy() == "sum"
-    assert Foo() + BarProxy() == "sum"
-    assert BarProxy() + Foo() == "sum"
 
 
 def test_slow_attr_still_proxy():
diff --git a/python/cudf/cudf_pandas_tests/test_profiler.py b/python/cudf/cudf_pandas_tests/test_profiler.py
index dd8d9287972..359a2a2c515 100644
--- a/python/cudf/cudf_pandas_tests/test_profiler.py
+++ b/python/cudf/cudf_pandas_tests/test_profiler.py
@@ -37,6 +37,7 @@ def test_profiler():
         "DataFrame.sum",
         "Series.__getitem__",
         "Timedelta",
+        "Timestamp.__add__",
     }
     for name, func in per_function_stats.items():
         assert (

From ffbdd2402d1131b9a06cda87b4ef888953b2901a Mon Sep 17 00:00:00 2001
From: Ed Seidl <etseidl@users.noreply.github.com>
Date: Wed, 8 May 2024 12:52:16 -0700
Subject: [PATCH 174/842] Skip decode steps in Parquet reader when nullable
 columns have no nulls (#15332)

Closes #15266.

Some block and warp operations and some atomics can be avoided during Parquet page decoding when nullable columns are known to not contain any null values. One issue, however, is that since the columns are nullable, the null mask has to be set. My approach in this PR was to initialize nullable column buffers with an `ALL_VALID` mask rather than `ALL_NULL`. This will work when nulls are present because `store_validity()` sets the bitmask to all zeros before ORing in the passed `valid_mask`.

Benchmarks modified to not emit nulls showed a good improvement in decoding time for fixed-width data types.
```
## [0] NVIDIA RTX A6000

|  data_type  |    io_type    |  cardinality  |  run_length  |   Ref Time |   Ref Noise |   Cmp Time |   Cmp Noise |         Diff |   %Diff |  Status  |
|-------------|---------------|---------------|--------------|------------|-------------|------------|-------------|--------------|---------|----------|
|  INTEGRAL   | DEVICE_BUFFER |       0       |      1       |   9.955 ms |       2.31% |   9.361 ms |       3.55% |  -594.395 us |  -5.97% |   FAIL   |
|  INTEGRAL   | DEVICE_BUFFER |     1000      |      1       |   9.964 ms |       3.01% |   8.981 ms |       3.98% |  -982.965 us |  -9.87% |   FAIL   |
|  INTEGRAL   | DEVICE_BUFFER |       0       |      32      |  10.222 ms |       3.32% |   9.207 ms |       5.47% | -1014.597 us |  -9.93% |   FAIL   |
|  INTEGRAL   | DEVICE_BUFFER |     1000      |      32      |   9.930 ms |       3.83% |   8.607 ms |       3.37% | -1323.326 us | -13.33% |   FAIL   |
|    FLOAT    | DEVICE_BUFFER |       0       |      1       |   5.999 ms |       3.59% |   5.752 ms |       3.87% |  -246.635 us |  -4.11% |   FAIL   |
|    FLOAT    | DEVICE_BUFFER |     1000      |      1       |   6.576 ms |       4.40% |   5.839 ms |       4.43% |  -737.338 us | -11.21% |   FAIL   |
|    FLOAT    | DEVICE_BUFFER |       0       |      32      |   5.828 ms |       4.59% |   4.940 ms |       4.41% |  -887.375 us | -15.23% |   FAIL   |
|    FLOAT    | DEVICE_BUFFER |     1000      |      32      |   6.198 ms |       3.91% |   5.271 ms |       3.54% |  -927.602 us | -14.97% |   FAIL   |
|   DECIMAL   | DEVICE_BUFFER |       0       |      1       |  20.199 ms |       1.66% |  20.014 ms |       2.23% |  -184.710 us |  -0.91% |   PASS   |
|   DECIMAL   | DEVICE_BUFFER |     1000      |      1       |   7.068 ms |       3.99% |   6.479 ms |       4.08% |  -588.856 us |  -8.33% |   FAIL   |
|   DECIMAL   | DEVICE_BUFFER |       0       |      32      |   9.287 ms |       3.45% |   8.656 ms |       2.94% |  -631.348 us |  -6.80% |   FAIL   |
|   DECIMAL   | DEVICE_BUFFER |     1000      |      32      |   5.641 ms |       4.39% |   5.021 ms |       3.31% |  -620.122 us | -10.99% |   FAIL   |
|  TIMESTAMP  | DEVICE_BUFFER |       0       |      1       |  27.488 ms |       1.57% |  27.235 ms |       1.74% |  -253.277 us |  -0.92% |   PASS   |
|  TIMESTAMP  | DEVICE_BUFFER |     1000      |      1       |   6.656 ms |       4.73% |   6.049 ms |       4.61% |  -607.760 us |  -9.13% |   FAIL   |
|  TIMESTAMP  | DEVICE_BUFFER |       0       |      32      |   9.974 ms |       3.22% |   9.204 ms |       2.84% |  -770.247 us |  -7.72% |   FAIL   |
|  TIMESTAMP  | DEVICE_BUFFER |     1000      |      32      |   5.998 ms |       5.17% |   5.203 ms |       3.06% |  -794.943 us | -13.25% |   FAIL   |
|  DURATION   | DEVICE_BUFFER |       0       |      1       |   8.816 ms |       3.61% |   8.538 ms |       3.26% |  -278.877 us |  -3.16% |   PASS   |
|  DURATION   | DEVICE_BUFFER |     1000      |      1       |   5.989 ms |       4.76% |   5.446 ms |       4.57% |  -542.636 us |  -9.06% |   FAIL   |
|  DURATION   | DEVICE_BUFFER |       0       |      32      |   6.822 ms |       4.96% |   6.042 ms |       3.74% |  -779.786 us | -11.43% |   FAIL   |
|  DURATION   | DEVICE_BUFFER |     1000      |      32      |   5.706 ms |       5.40% |   4.930 ms |       3.39% |  -775.607 us | -13.59% |   FAIL   |
|   STRING    | DEVICE_BUFFER |       0       |      1       |  36.616 ms |       1.74% |  36.483 ms |       1.31% |  -132.191 us |  -0.36% |   PASS   |
|   STRING    | DEVICE_BUFFER |     1000      |      1       |  12.006 ms |       4.15% |  11.989 ms |       3.53% |   -16.278 us |  -0.14% |   PASS   |
|   STRING    | DEVICE_BUFFER |       0       |      32      |  36.587 ms |       1.99% |  36.514 ms |       1.38% |   -73.737 us |  -0.20% |   PASS   |
|   STRING    | DEVICE_BUFFER |     1000      |      32      |  11.235 ms |       4.25% |  11.228 ms |       3.62% |    -7.041 us |  -0.06% |   PASS   |
|    LIST     | DEVICE_BUFFER |       0       |      1       |  36.929 ms |       1.88% |  36.988 ms |       1.42% |    59.350 us |   0.16% |   PASS   |
|    LIST     | DEVICE_BUFFER |     1000      |      1       |  36.510 ms |       1.91% |  36.558 ms |       1.66% |    48.536 us |   0.13% |   PASS   |
|    LIST     | DEVICE_BUFFER |       0       |      32      |  35.513 ms |       2.00% |  35.490 ms |       1.77% |   -23.411 us |  -0.07% |   PASS   |
|    LIST     | DEVICE_BUFFER |     1000      |      32      |  35.755 ms |       1.99% |  35.728 ms |       1.64% |   -27.564 us |  -0.08% |   PASS   |
|   STRUCT    | DEVICE_BUFFER |       0       |      1       |  43.456 ms |       1.35% |  43.537 ms |       1.16% |    81.405 us |   0.19% |   PASS   |
|   STRUCT    | DEVICE_BUFFER |     1000      |      1       |  25.549 ms |       2.54% |  25.698 ms |       1.90% |   149.295 us |   0.58% |   PASS   |
|   STRUCT    | DEVICE_BUFFER |       0       |      32      |  43.103 ms |       1.87% |  43.019 ms |       1.59% |   -84.825 us |  -0.20% |   PASS   |
|   STRUCT    | DEVICE_BUFFER |     1000      |      32      |  23.462 ms |       2.81% |  23.432 ms |       1.88% |   -30.434 us |  -0.13% |   PASS   |
```

Authors:
  - Ed Seidl (https://github.com/etseidl)

Approvers:
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: https://github.com/rapidsai/cudf/pull/15332
---
 cpp/src/io/parquet/decode_fixed.cu           | 72 +++++++-------------
 cpp/src/io/parquet/reader_impl_preprocess.cu |  7 +-
 cpp/src/io/utilities/column_buffer.cpp       | 19 ++++--
 cpp/src/io/utilities/column_buffer.hpp       |  7 ++
 4 files changed, 52 insertions(+), 53 deletions(-)

diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu
index f3332a23992..bfd89200786 100644
--- a/cpp/src/io/parquet/decode_fixed.cu
+++ b/cpp/src/io/parquet/decode_fixed.cu
@@ -31,12 +31,8 @@ constexpr int rolling_buf_size  = decode_block_size * 2;
 constexpr int rle_run_buffer_size = rle_stream_required_run_buffer_size<decode_block_size>();
 
 template <bool nullable, typename level_t, typename state_buf>
-static __device__ int gpuUpdateValidityOffsetsAndRowIndicesFlat(int32_t target_value_count,
-                                                                page_state_s* s,
-                                                                state_buf* sb,
-                                                                level_t const* const def,
-                                                                int t,
-                                                                bool nullable_with_nulls)
+static __device__ int gpuUpdateValidityOffsetsAndRowIndicesFlat(
+  int32_t target_value_count, page_state_s* s, state_buf* sb, level_t const* const def, int t)
 {
   constexpr int num_warps      = decode_block_size / cudf::detail::warp_size;
   constexpr int max_batch_size = num_warps * cudf::detail::warp_size;
@@ -63,13 +59,9 @@ static __device__ int gpuUpdateValidityOffsetsAndRowIndicesFlat(int32_t target_v
     // definition level. only need to process for nullable columns
     int d = 0;
     if constexpr (nullable) {
-      if (nullable_with_nulls) {
-        d = t < batch_size
-              ? static_cast<int>(def[rolling_index<state_buf::nz_buf_size>(value_count + t)])
-              : -1;
-      } else {
-        d = t < batch_size ? 1 : -1;
-      }
+      d = t < batch_size
+            ? static_cast<int>(def[rolling_index<state_buf::nz_buf_size>(value_count + t)])
+            : -1;
     }
 
     int const thread_value_count = t + 1;
@@ -426,17 +418,13 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size)
   while (s->error == 0 && processed_count < s->page.num_input_values) {
     int next_valid_count;
 
-    // only need to process definition levels if this is a nullable column
-    if (nullable) {
-      if (nullable_with_nulls) {
-        processed_count += def_decoder.decode_next(t);
-        __syncthreads();
-      } else {
-        processed_count += min(rolling_buf_size, s->page.num_input_values - processed_count);
-      }
+    // only need to process definition levels if the column has nulls
+    if (nullable_with_nulls) {
+      processed_count += def_decoder.decode_next(t);
+      __syncthreads();
 
-      next_valid_count = gpuUpdateValidityOffsetsAndRowIndicesFlat<true, level_t>(
-        processed_count, s, sb, def, t, nullable_with_nulls);
+      next_valid_count =
+        gpuUpdateValidityOffsetsAndRowIndicesFlat<true, level_t>(processed_count, s, sb, def, t);
     }
     // if we wanted to split off the skip_rows/num_rows case into a separate kernel, we could skip
     // this function call entirely since all it will ever generate is a mapping of (i -> i) for
@@ -444,7 +432,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size)
     else {
       processed_count += min(rolling_buf_size, s->page.num_input_values - processed_count);
       next_valid_count = gpuUpdateValidityOffsetsAndRowIndicesFlat<false, level_t>(
-        processed_count, s, sb, nullptr, t, false);
+        processed_count, s, sb, nullptr, t);
     }
     __syncthreads();
 
@@ -547,18 +535,14 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size)
   while (s->error == 0 && processed_count < s->page.num_input_values) {
     int next_valid_count;
 
-    // only need to process definition levels if this is a nullable column
-    if (nullable) {
-      if (nullable_with_nulls) {
-        processed_count += def_decoder.decode_next(t);
-        __syncthreads();
-      } else {
-        processed_count += min(rolling_buf_size, s->page.num_input_values - processed_count);
-      }
+    // only need to process definition levels if the column has nulls
+    if (nullable_with_nulls) {
+      processed_count += def_decoder.decode_next(t);
+      __syncthreads();
 
       // count of valid items in this batch
-      next_valid_count = gpuUpdateValidityOffsetsAndRowIndicesFlat<true, level_t>(
-        processed_count, s, sb, def, t, nullable_with_nulls);
+      next_valid_count =
+        gpuUpdateValidityOffsetsAndRowIndicesFlat<true, level_t>(processed_count, s, sb, def, t);
     }
     // if we wanted to split off the skip_rows/num_rows case into a separate kernel, we could skip
     // this function call entirely since all it will ever generate is a mapping of (i -> i) for
@@ -566,7 +550,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size)
     else {
       processed_count += min(rolling_buf_size, s->page.num_input_values - processed_count);
       next_valid_count = gpuUpdateValidityOffsetsAndRowIndicesFlat<false, level_t>(
-        processed_count, s, sb, nullptr, t, false);
+        processed_count, s, sb, nullptr, t);
     }
     __syncthreads();
 
@@ -671,17 +655,13 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size)
   while (s->error == 0 && processed_count < s->page.num_input_values) {
     int next_valid_count;
 
-    // only need to process definition levels if this is a nullable column
-    if (nullable) {
-      if (nullable_with_nulls) {
-        processed_count += def_decoder.decode_next(t);
-        __syncthreads();
-      } else {
-        processed_count += min(rolling_buf_size, s->page.num_input_values - processed_count);
-      }
+    // only need to process definition levels if the column has nulls
+    if (nullable_with_nulls) {
+      processed_count += def_decoder.decode_next(t);
+      __syncthreads();
 
-      next_valid_count = gpuUpdateValidityOffsetsAndRowIndicesFlat<true, level_t>(
-        processed_count, s, sb, def, t, nullable_with_nulls);
+      next_valid_count =
+        gpuUpdateValidityOffsetsAndRowIndicesFlat<true, level_t>(processed_count, s, sb, def, t);
     }
     // if we wanted to split off the skip_rows/num_rows case into a separate kernel, we could skip
     // this function call entirely since all it will ever generate is a mapping of (i -> i) for
@@ -689,7 +669,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size)
     else {
       processed_count += min(rolling_buf_size, s->page.num_input_values - processed_count);
       next_valid_count = gpuUpdateValidityOffsetsAndRowIndicesFlat<false, level_t>(
-        processed_count, s, sb, nullptr, t, false);
+        processed_count, s, sb, nullptr, t);
     }
     __syncthreads();
 
diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu
index 55633b97cf4..a5cd7d06536 100644
--- a/cpp/src/io/parquet/reader_impl_preprocess.cu
+++ b/cpp/src/io/parquet/reader_impl_preprocess.cu
@@ -1498,8 +1498,10 @@ void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses
       // if we haven't already processed this column because it is part of a struct hierarchy
       else if (out_buf.size == 0) {
         // add 1 for the offset if this is a list column
-        out_buf.create(
+        // we're going to start null mask as all valid and then turn bits off if necessary
+        out_buf.create_with_mask(
           out_buf.type.id() == type_id::LIST && l_idx < max_depth ? num_rows + 1 : num_rows,
+          cudf::mask_state::ALL_VALID,
           _stream,
           _mr);
       }
@@ -1577,7 +1579,8 @@ void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses
           if (out_buf.type.id() == type_id::LIST && l_idx < max_depth) { size++; }
 
           // allocate
-          out_buf.create(size, _stream, _mr);
+          // we're going to start null mask as all valid and then turn bits off if necessary
+          out_buf.create_with_mask(size, cudf::mask_state::ALL_VALID, _stream, _mr);
         }
       }
     }
diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp
index 5ef43599838..e5d4e1a360f 100644
--- a/cpp/src/io/utilities/column_buffer.cpp
+++ b/cpp/src/io/utilities/column_buffer.cpp
@@ -91,9 +91,10 @@ void copy_buffer_data(string_policy const& buff, string_policy& new_buff)
 }  // namespace
 
 template <class string_policy>
-void column_buffer_base<string_policy>::create(size_type _size,
-                                               rmm::cuda_stream_view stream,
-                                               rmm::device_async_resource_ref mr)
+void column_buffer_base<string_policy>::create_with_mask(size_type _size,
+                                                         cudf::mask_state null_mask_state,
+                                                         rmm::cuda_stream_view stream,
+                                                         rmm::device_async_resource_ref mr)
 {
   size = _size;
   _mr  = mr;
@@ -111,11 +112,19 @@ void column_buffer_base<string_policy>::create(size_type _size,
     default: _data = create_data(type, size, stream, _mr); break;
   }
   if (is_nullable) {
-    _null_mask = cudf::detail::create_null_mask(
-      size, mask_state::ALL_NULL, rmm::cuda_stream_view(stream), _mr);
+    _null_mask =
+      cudf::detail::create_null_mask(size, null_mask_state, rmm::cuda_stream_view(stream), _mr);
   }
 }
 
+template <class string_policy>
+void column_buffer_base<string_policy>::create(size_type _size,
+                                               rmm::cuda_stream_view stream,
+                                               rmm::device_async_resource_ref mr)
+{
+  create_with_mask(_size, mask_state::ALL_NULL, stream, mr);
+}
+
 template <class string_policy>
 string_policy column_buffer_base<string_policy>::empty_like(string_policy const& input)
 {
diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp
index ace1396bc09..e6bfae0681a 100644
--- a/cpp/src/io/utilities/column_buffer.hpp
+++ b/cpp/src/io/utilities/column_buffer.hpp
@@ -115,6 +115,13 @@ class column_buffer_base {
   // preprocessing steps such as in the Parquet reader
   void create(size_type _size, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr);
 
+  // like create(), but also takes a `cudf::mask_state` to allow initializing the null mask as
+  // something other than `ALL_NULL`
+  void create_with_mask(size_type _size,
+                        cudf::mask_state null_mask_state,
+                        rmm::cuda_stream_view stream,
+                        rmm::device_async_resource_ref mr);
+
   // Create a new column_buffer that has empty data but with the same basic information as the
   // input column, including same type, nullability, name, and user_data.
   static string_policy empty_like(string_policy const& input);

From f965f3cc90de3fd693cf41954b3eecefa24512d7 Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar <shruti.shivakumar@gmail.com>
Date: Wed, 8 May 2024 16:24:20 -0400
Subject: [PATCH 175/842] Reducing runtime of JSON reader options benchmark
 (#15681)

This PR cleans up the JSON reader options benchmark by reducing the number of runtime configurations from 162 to 20.
Reasoning behind the splitting of the benchmark -
1. The `normalize_single_quotes` and `normalize_whitespace` are pre-processing options and do not impact each other - the runtimes of the FSTs are additive.
2. The performance of raw input ingestion (`row_selection::ALL` and `row_selection::BYTE_RANGE`) is independent of the token generation and tree algorithms.

Authors:
  - Shruti Shivakumar (https://github.com/shrshi)

Approvers:
  - Karthikeyan (https://github.com/karthikeyann)
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - Mike Wilson (https://github.com/hyperbolic2346)

URL: https://github.com/rapidsai/cudf/pull/15681
---
 cpp/benchmarks/io/json/json_reader_option.cpp | 61 ++++++++++++++++---
 1 file changed, 54 insertions(+), 7 deletions(-)

diff --git a/cpp/benchmarks/io/json/json_reader_option.cpp b/cpp/benchmarks/io/json/json_reader_option.cpp
index ed1008d053a..378134a2010 100644
--- a/cpp/benchmarks/io/json/json_reader_option.cpp
+++ b/cpp/benchmarks/io/json/json_reader_option.cpp
@@ -173,15 +173,62 @@ void BM_jsonlines_read_options(nvbench::state& state,
   state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size");
 }
 
+NVBENCH_BENCH_TYPES(BM_jsonlines_read_options,
+                    NVBENCH_TYPE_AXES(nvbench::enum_type_list<row_selection::ALL>,
+                                      nvbench::enum_type_list<normalize_single_quotes::NO,
+                                                              normalize_single_quotes::YES>,
+                                      nvbench::enum_type_list<normalize_whitespace::NO>,
+                                      nvbench::enum_type_list<mixed_types_as_string::NO>,
+                                      nvbench::enum_type_list<recovery_mode::FAIL>))
+  .set_name("jsonlines_reader_normalize_single_quotes")
+  .set_type_axes_names({"row_selection",
+                        "normalize_single_quotes",
+                        "normalize_whitespace",
+                        "mixed_types_as_string",
+                        "recovery_mode"})
+  .set_min_samples(6)
+  .add_int64_axis("num_chunks", nvbench::range(1, 1, 1));
+
+NVBENCH_BENCH_TYPES(
+  BM_jsonlines_read_options,
+  NVBENCH_TYPE_AXES(nvbench::enum_type_list<row_selection::ALL>,
+                    nvbench::enum_type_list<normalize_single_quotes::NO>,
+                    nvbench::enum_type_list<normalize_whitespace::NO, normalize_whitespace::YES>,
+                    nvbench::enum_type_list<mixed_types_as_string::NO>,
+                    nvbench::enum_type_list<recovery_mode::FAIL>))
+  .set_name("jsonlines_reader_normalize_whitespace")
+  .set_type_axes_names({"row_selection",
+                        "normalize_single_quotes",
+                        "normalize_whitespace",
+                        "mixed_types_as_string",
+                        "recovery_mode"})
+  .set_min_samples(6)
+  .add_int64_axis("num_chunks", nvbench::range(1, 1, 1));
+
+NVBENCH_BENCH_TYPES(
+  BM_jsonlines_read_options,
+  NVBENCH_TYPE_AXES(nvbench::enum_type_list<row_selection::ALL>,
+                    nvbench::enum_type_list<normalize_single_quotes::NO>,
+                    nvbench::enum_type_list<normalize_whitespace::NO>,
+                    nvbench::enum_type_list<mixed_types_as_string::NO, mixed_types_as_string::YES>,
+                    nvbench::enum_type_list<recovery_mode::RECOVER_WITH_NULL, recovery_mode::FAIL>))
+  .set_name("jsonlines_reader_mixed_types_as_string")
+  .set_type_axes_names({"row_selection",
+                        "normalize_single_quotes",
+                        "normalize_whitespace",
+                        "mixed_types_as_string",
+                        "recovery_mode"})
+  .set_min_samples(6)
+  .add_int64_axis("num_chunks", nvbench::range(1, 1, 1));
+
 NVBENCH_BENCH_TYPES(
   BM_jsonlines_read_options,
-  NVBENCH_TYPE_AXES(
-    nvbench::enum_type_list<row_selection::ALL, row_selection::BYTE_RANGE>,
-    nvbench::enum_type_list<normalize_single_quotes::NO, normalize_single_quotes::YES>,
-    nvbench::enum_type_list<normalize_whitespace::NO, normalize_whitespace::YES>,
-    nvbench::enum_type_list<mixed_types_as_string::NO, mixed_types_as_string::YES>,
-    nvbench::enum_type_list<recovery_mode::RECOVER_WITH_NULL, recovery_mode::FAIL>))
-  .set_name("jsonlines_reader")
+  NVBENCH_TYPE_AXES(nvbench::enum_type_list<row_selection::ALL, row_selection::BYTE_RANGE>,
+                    nvbench::enum_type_list<normalize_single_quotes::NO>,
+                    nvbench::enum_type_list<normalize_whitespace::NO>,
+                    nvbench::enum_type_list<mixed_types_as_string::NO>,
+                    nvbench::enum_type_list<recovery_mode::FAIL>))
+  .set_name("jsonlines_reader_row_selection")
   .set_type_axes_names({"row_selection",
                         "normalize_single_quotes",
                         "normalize_whitespace",

From 57e534a74f89086396479b548e4b02458e3a1bc2 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 8 May 2024 13:34:48 -1000
Subject: [PATCH 176/842] Misc Column cleanups (#15682)

* Some typing
* Moved a single use helper function inline
* Some dtype checking simplification

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15682
---
 python/cudf/cudf/core/column/column.py        | 24 +++++++++----------
 .../cudf/cudf/core/column/numerical_base.py   | 22 +++++++----------
 2 files changed, 19 insertions(+), 27 deletions(-)

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index e23da59b883..3754ed1392e 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -56,7 +56,6 @@
     infer_dtype,
     is_bool_dtype,
     is_dtype_equal,
-    is_integer_dtype,
     is_scalar,
     is_string_dtype,
 )
@@ -606,7 +605,8 @@ def _scatter_by_slice(
         start, stop, step = key.indices(len(self))
         if start >= stop:
             return None
-        num_keys = len(range(start, stop, step))
+        rng = range(start, stop, step)
+        num_keys = len(rng)
 
         self._check_scatter_key_length(num_keys, value)
 
@@ -625,7 +625,7 @@ def _scatter_by_slice(
 
         # step != 1, create a scatter map with arange
         scatter_map = as_column(
-            range(start, stop, step),
+            rng,
             dtype=cudf.dtype(np.int32),
         )
 
@@ -672,18 +672,16 @@ def _scatter_by_column(
 
     def _check_scatter_key_length(
         self, num_keys: int, value: Union[cudf.core.scalar.Scalar, ColumnBase]
-    ):
+    ) -> None:
         """`num_keys` is the number of keys to scatter. Should equal to the
         number of rows in ``value`` if ``value`` is a column.
         """
-        if isinstance(value, ColumnBase):
-            if len(value) != num_keys:
-                msg = (
-                    f"Size mismatch: cannot set value "
-                    f"of size {len(value)} to indexing result of size "
-                    f"{num_keys}"
-                )
-                raise ValueError(msg)
+        if isinstance(value, ColumnBase) and len(value) != num_keys:
+            raise ValueError(
+                f"Size mismatch: cannot set value "
+                f"of size {len(value)} to indexing result of size "
+                f"{num_keys}"
+            )
 
     def fillna(
         self,
@@ -820,7 +818,7 @@ def take(
 
         # TODO: For performance, the check and conversion of gather map should
         # be done by the caller. This check will be removed in future release.
-        if not is_integer_dtype(indices.dtype):
+        if indices.dtype.kind not in {"u", "i"}:
             indices = indices.astype(libcudf.types.size_type_dtype)
         if not libcudf.copying._gather_map_is_valid(
             indices, len(self), check_bounds, nullify
diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py
index c45a9c7fd5d..541c32a2520 100644
--- a/python/cudf/cudf/core/column/numerical_base.py
+++ b/python/cudf/cudf/core/column/numerical_base.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2023, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
 """Define an interface for columns that can perform numerical operations."""
 
 from __future__ import annotations
@@ -112,7 +112,13 @@ def quantile(
                 ),
             )
         else:
-            result = self._numeric_quantile(q, interpolation, exact)
+            # get sorted indices and exclude nulls
+            indices = libcudf.sort.order_by(
+                [self], [True], "first", stable=True
+            ).slice(self.null_count, len(self))
+            result = libcudf.quantiles.quantile(
+                self, q, interpolation, indices, exact
+            )
         if return_scalar:
             scalar_result = result.element_indexing(0)
             if interpolation in {"lower", "higher", "nearest"}:
@@ -178,18 +184,6 @@ def median(self, skipna: Optional[bool] = None) -> NumericalBaseColumn:
             return_scalar=True,
         )
 
-    def _numeric_quantile(
-        self, q: np.ndarray, interpolation: str, exact: bool
-    ) -> NumericalBaseColumn:
-        # get sorted indices and exclude nulls
-        indices = libcudf.sort.order_by(
-            [self], [True], "first", stable=True
-        ).slice(self.null_count, len(self))
-
-        return libcudf.quantiles.quantile(
-            self, q, interpolation, indices, exact
-        )
-
     def cov(self, other: NumericalBaseColumn) -> float:
         if (
             len(self) == 0

From b09e794f8323051d8fdb5ed2a6bae25e92475665 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Wed, 8 May 2024 18:54:05 -0500
Subject: [PATCH 177/842] Add proxy for inplace operations in `cudf.pandas`
 (#15695)

Fixes: #15676

This PR implements `__iadd__` and `__isub__` methods to allow in-place subtraction and addition operations.

Forks out from #14534

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/15695
---
 python/cudf/cudf/pandas/fast_slow_proxy.py    | 14 +++++
 .../cudf_pandas_tests/test_cudf_pandas.py     | 56 +++++++++++++++++++
 2 files changed, 70 insertions(+)

diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py
index c66458077fa..f91cdeac149 100644
--- a/python/cudf/cudf/pandas/fast_slow_proxy.py
+++ b/python/cudf/cudf/pandas/fast_slow_proxy.py
@@ -1083,9 +1083,23 @@ def _replace_closurevars(
     # Added on a per-proxy basis
     # https://github.com/rapidsai/xdf/pull/306#pullrequestreview-1636155428
     # "__hash__",
+    "__iadd__",
+    "__iand__",
+    "__iconcat__",
+    "__ifloordiv__",
+    "__ilshift__",
+    "__imatmul__",
+    "__imod__",
+    "__imul__",
     "__int__",
     "__invert__",
+    "__ior__",
+    "__ipow__",
+    "__irshift__",
+    "__isub__",
     "__iter__",
+    "__itruediv__",
+    "__ixor__",
     "__le__",
     "__len__",
     "__lshift__",
diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
index aa937d3ed4f..dcba1edd5fe 100644
--- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
+++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
@@ -1243,6 +1243,62 @@ def my_apply(df, unused):
     tm.assert_series_equal(result, expected)
 
 
+@pytest.mark.parametrize(
+    "op",
+    [
+        "__iadd__",
+        "__iand__",
+        "__ifloordiv__",
+        "__imod__",
+        "__imul__",
+        "__ior__",
+        "__ipow__",
+        "__isub__",
+        "__itruediv__",
+        "__ixor__",
+    ],
+)
+def test_inplace_ops(op):
+    xdf1 = xpd.DataFrame({"a": [10, 11, 12]})
+    xdf2 = xpd.DataFrame({"a": [1, 2, 3]})
+
+    df1 = pd.DataFrame({"a": [10, 11, 12]})
+    df2 = pd.DataFrame({"a": [1, 2, 3]})
+
+    actual = getattr(xdf1, op)(xdf2)
+    expected = getattr(df1, op)(df2)
+
+    tm.assert_equal(actual, expected)
+
+
+@pytest.mark.parametrize(
+    "op",
+    [
+        "__iadd__",
+        "__iand__",
+        "__ifloordiv__",
+        "__imod__",
+        "__imul__",
+        "__ior__",
+        "__ipow__",
+        "__isub__",
+        "__itruediv__",
+        "__ixor__",
+    ],
+)
+def test_inplace_ops_series(op):
+    xser1 = xpd.Series([10, 11, 12])
+    xser2 = xpd.Series([1, 2, 3])
+
+    ser1 = pd.Series([10, 11, 12])
+    ser2 = pd.Series([1, 2, 3])
+
+    actual = getattr(xser1, op)(xser2)
+    expected = getattr(ser1, op)(ser2)
+
+    tm.assert_equal(actual, expected)
+
+
 @pytest.mark.parametrize("data", [pd.NaT, 1234, "nat"])
 def test_timestamp(data):
     xtimestamp = xpd.Timestamp(data)

From c0c38ebf2e204da7e5c615453c87e3a0a8c31d0c Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 8 May 2024 13:56:01 -1000
Subject: [PATCH 178/842] Delay materializing RangeIndex in .reset_index
 (#15588)

Before, a `RangeIndex` would be materialized to a `Column` even if it wasn't used (`drop=True`). Now, it's only materialized if the index will be added as a new column in the DataFrame.

Also caught a validation bug where an `invalid` number of levels would not raise an error

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15588
---
 python/cudf/cudf/core/_base_index.py     | 23 +++++---
 python/cudf/cudf/core/index.py           |  8 +++
 python/cudf/cudf/core/indexed_frame.py   | 35 +++++------
 python/cudf/cudf/core/multiindex.py      | 74 ++++++++++++++++--------
 python/cudf/cudf/tests/test_dataframe.py |  8 +++
 5 files changed, 94 insertions(+), 54 deletions(-)

diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py
index d2534acd2dc..6c116e740ff 100644
--- a/python/cudf/cudf/core/_base_index.py
+++ b/python/cudf/cudf/core/_base_index.py
@@ -4,6 +4,7 @@
 
 import pickle
 import warnings
+from collections.abc import Generator
 from functools import cached_property
 from typing import Any, Literal, Set, Tuple
 
@@ -2190,14 +2191,20 @@ def repeat(self, repeats, axis=None):
         """
         raise NotImplementedError
 
-    def _split_columns_by_levels(self, levels):
-        if isinstance(levels, int) and levels > 0:
-            raise ValueError(f"Out of bound level: {levels}")
-        return (
-            [self._data[self.name]],
-            [],
-            ["index" if self.name is None else self.name],
-            [],
+    def _new_index_for_reset_index(
+        self, levels: tuple | None, name
+    ) -> None | BaseIndex:
+        """Return the new index after .reset_index"""
+        # None is caught later to return RangeIndex
+        return None
+
+    def _columns_for_reset_index(
+        self, levels: tuple | None
+    ) -> Generator[tuple[Any, ColumnBase], None, None]:
+        """Return the columns and column names for .reset_index"""
+        yield (
+            "index" if self.name is None else self.name,
+            next(iter(self._columns)),
         )
 
     def _split(self, splits):
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 35afe6ee949..096b6f17c1d 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -5,6 +5,7 @@
 import operator
 import pickle
 import warnings
+from collections.abc import Generator
 from functools import cache, cached_property
 from numbers import Number
 from typing import (
@@ -970,6 +971,13 @@ def __abs__(self) -> Self | Index:
         else:
             return abs(self._as_int_index())
 
+    def _columns_for_reset_index(
+        self, levels: tuple | None
+    ) -> Generator[tuple[Any, ColumnBase], None, None]:
+        """Return the columns and column names for .reset_index"""
+        # We need to explicitly materialize the RangeIndex to a column
+        yield "index" if self.name is None else self.name, as_column(self)
+
     @_warn_no_dask_cudf
     def __dask_tokenize__(self):
         return (type(self), self.start, self.stop, self.step)
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index e656fd49758..dc261707867 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -4340,34 +4340,27 @@ def take(self, indices, axis=0):
 
     def _reset_index(self, level, drop, col_level=0, col_fill=""):
         """Shared path for DataFrame.reset_index and Series.reset_index."""
-        if level is not None and not isinstance(level, (tuple, list)):
-            level = (level,)
+        if level is not None:
+            if (
+                isinstance(level, int)
+                and level > 0
+                and not isinstance(self.index, MultiIndex)
+            ):
+                raise IndexError(
+                    f"Too many levels: Index has only 1 level, not {level + 1}"
+                )
+            if not isinstance(level, (tuple, list)):
+                level = (level,)
         _check_duplicate_level_names(level, self._index.names)
 
-        # Split the columns in the index into data and index columns
-        (
-            data_columns,
-            index_columns,
-            data_names,
-            index_names,
-        ) = self._index._split_columns_by_levels(level)
-        if index_columns:
-            index = _index_from_data(
-                dict(enumerate(index_columns)),
-                name=self._index.name,
-            )
-            if isinstance(index, MultiIndex):
-                index.names = index_names
-            else:
-                index.name = index_names[0]
-        else:
+        index = self.index._new_index_for_reset_index(level, self.index.name)
+        if index is None:
             index = RangeIndex(len(self))
-
         if drop:
             return self._data, index
 
         new_column_data = {}
-        for name, col in zip(data_names, data_columns):
+        for name, col in self.index._columns_for_reset_index(level):
             if name == "index" and "index" in self._data:
                 name = "level_0"
             name = (
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index c3184f51a4c..58a2846bf43 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -8,6 +8,7 @@
 import pickle
 import warnings
 from collections import abc
+from collections.abc import Generator
 from functools import cached_property
 from numbers import Integral
 from typing import Any, List, MutableMapping, Tuple, Union
@@ -2052,41 +2053,64 @@ def _copy_type_metadata(
         return res
 
     @_cudf_nvtx_annotate
-    def _split_columns_by_levels(self, levels):
+    def _split_columns_by_levels(
+        self, levels: tuple, *, in_levels: bool
+    ) -> Generator[tuple[Any, column.ColumnBase], None, None]:
         # This function assumes that for levels with duplicate names, they are
         # specified by indices, not name by ``levels``. E.g. [None, None] can
         # only be specified by 0, 1, not "None".
-
-        if levels is None:
-            return (
-                list(self._data.columns),
-                [],
-                [
-                    f"level_{i}" if name is None else name
-                    for i, name in enumerate(self.names)
-                ],
-                [],
-            )
-
-        # Normalize named levels into indices
         level_names = list(self.names)
         level_indices = {
             lv if isinstance(lv, int) else level_names.index(lv)
             for lv in levels
         }
-
-        # Split the columns
-        data_columns, index_columns = [], []
-        data_names, index_names = [], []
         for i, (name, col) in enumerate(zip(self.names, self._data.columns)):
-            if i in level_indices:
+            if in_levels and i in level_indices:
                 name = f"level_{i}" if name is None else name
-                data_columns.append(col)
-                data_names.append(name)
-            else:
-                index_columns.append(col)
-                index_names.append(name)
-        return data_columns, index_columns, data_names, index_names
+                yield name, col
+            elif not in_levels and i not in level_indices:
+                yield name, col
+
+    @_cudf_nvtx_annotate
+    def _new_index_for_reset_index(
+        self, levels: tuple | None, name
+    ) -> None | BaseIndex:
+        """Return the new index after .reset_index"""
+        if levels is None:
+            return None
+
+        index_columns, index_names = [], []
+        for name, col in self._split_columns_by_levels(
+            levels, in_levels=False
+        ):
+            index_columns.append(col)
+            index_names.append(name)
+
+        if not index_columns:
+            # None is caught later to return RangeIndex
+            return None
+
+        index = cudf.core.index._index_from_data(
+            dict(enumerate(index_columns)),
+            name=name,
+        )
+        if isinstance(index, type(self)):
+            index.names = index_names
+        else:
+            index.name = index_names[0]
+        return index
+
+    def _columns_for_reset_index(
+        self, levels: tuple | None
+    ) -> Generator[tuple[Any, column.ColumnBase], None, None]:
+        """Return the columns and column names for .reset_index"""
+        if levels is None:
+            for i, (col, name) in enumerate(
+                zip(self._data.columns, self.names)
+            ):
+                yield f"level_{i}" if name is None else name, col
+        else:
+            yield from self._split_columns_by_levels(levels, in_levels=True)
 
     def repeat(self, repeats, axis=None):
         return self._from_data(
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 2dee3566e1b..20e9f41de63 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -3208,6 +3208,14 @@ def test_reset_index_unnamed(
     assert_eq(expect, got)
 
 
+def test_reset_index_invalid_level():
+    with pytest.raises(IndexError):
+        cudf.DataFrame([1]).reset_index(level=2)
+
+    with pytest.raises(IndexError):
+        pd.DataFrame([1]).reset_index(level=2)
+
+
 @pytest.mark.parametrize(
     "data",
     [

From c576e97a6a7afef225e9c7746885ac436f224ee3 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Wed, 8 May 2024 20:01:31 -0500
Subject: [PATCH 179/842] Enabled `Holiday` types in `cudf.pandas` (#15664)

Fixes: #15663

This PR enables `Holiday` types in `cudf.pandas` by also adding a utility to create a composite meta class.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/15664
---
 python/cudf/cudf/pandas/_wrappers/pandas.py   | 135 +++++++++++++++++-
 python/cudf/cudf/pandas/fast_slow_proxy.py    |  21 ++-
 .../cudf_pandas_tests/test_cudf_pandas.py     |  74 ++++++++++
 3 files changed, 228 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py
index 93bef66de4f..de92cce8ebb 100644
--- a/python/cudf/cudf/pandas/_wrappers/pandas.py
+++ b/python/cudf/cudf/pandas/_wrappers/pandas.py
@@ -7,6 +7,21 @@
 import sys
 
 import pandas as pd
+from pandas.tseries.holiday import (
+    AbstractHolidayCalendar as pd_AbstractHolidayCalendar,
+    EasterMonday as pd_EasterMonday,
+    GoodFriday as pd_GoodFriday,
+    Holiday as pd_Holiday,
+    HolidayCalendarFactory as pd_HolidayCalendarFactory,
+    HolidayCalendarMetaClass as pd_HolidayCalendarMetaClass,
+    USColumbusDay as pd_USColumbusDay,
+    USFederalHolidayCalendar as pd_USFederalHolidayCalendar,
+    USLaborDay as pd_USLaborDay,
+    USMartinLutherKingJr as pd_USMartinLutherKingJr,
+    USMemorialDay as pd_USMemorialDay,
+    USPresidentsDay as pd_USPresidentsDay,
+    USThanksgivingDay as pd_USThanksgivingDay,
+)
 
 import cudf
 
@@ -37,7 +52,6 @@
     XportReader as pd_XportReader,
 )
 
-
 # TODO(pandas2.1): Can import from pandas.api.typing
 from pandas.core.resample import (  # isort: skip
     Resampler as pd_Resampler,
@@ -882,6 +896,125 @@ def _df_query_method(self, *args, local_dict=None, global_dict=None, **kwargs):
     "_SAS7BDATReader", _Unusable, pd_SAS7BDATReader
 )
 
+USFederalHolidayCalendar = make_final_proxy_type(
+    "USFederalHolidayCalendar",
+    _Unusable,
+    pd_USFederalHolidayCalendar,
+    fast_to_slow=_Unusable(),
+    slow_to_fast=_Unusable(),
+    additional_attributes={"__hash__": _FastSlowAttribute("__hash__")},
+)
+
+HolidayCalendarMetaClass = make_final_proxy_type(
+    "HolidayCalendarMetaClass",
+    _Unusable,
+    pd_HolidayCalendarMetaClass,
+    fast_to_slow=_Unusable(),
+    slow_to_fast=_Unusable(),
+    additional_attributes={"__hash__": _FastSlowAttribute("__hash__")},
+)
+
+
+@register_proxy_func(pd_HolidayCalendarFactory)
+def holiday_calendar_factory_wrapper(*args, **kwargs):
+    # Call the original HolidayCalendarFactory
+    result = _FunctionProxy(_Unusable(), pd_HolidayCalendarFactory)(
+        *args, **kwargs
+    )
+    # Return the slow proxy of the result
+    return result._fsproxy_slow
+
+
+AbstractHolidayCalendar = make_final_proxy_type(
+    "AbstractHolidayCalendar",
+    _Unusable,
+    pd_AbstractHolidayCalendar,
+    fast_to_slow=_Unusable(),
+    slow_to_fast=_Unusable(),
+    additional_attributes={"__hash__": _FastSlowAttribute("__hash__")},
+    meta_class=pd_HolidayCalendarMetaClass,
+)
+
+Holiday = make_final_proxy_type(
+    "Holiday",
+    _Unusable,
+    pd_Holiday,
+    fast_to_slow=_Unusable(),
+    slow_to_fast=_Unusable(),
+    additional_attributes={"__hash__": _FastSlowAttribute("__hash__")},
+)
+USThanksgivingDay = make_final_proxy_type(
+    "USThanksgivingDay",
+    _Unusable,
+    pd_USThanksgivingDay,
+    fast_to_slow=_Unusable(),
+    slow_to_fast=_Unusable(),
+    additional_attributes={"__hash__": _FastSlowAttribute("__hash__")},
+)
+
+USColumbusDay = make_final_proxy_type(
+    "USColumbusDay",
+    _Unusable,
+    pd_USColumbusDay,
+    fast_to_slow=_Unusable(),
+    slow_to_fast=_Unusable(),
+    additional_attributes={"__hash__": _FastSlowAttribute("__hash__")},
+)
+
+USLaborDay = make_final_proxy_type(
+    "USLaborDay",
+    _Unusable,
+    pd_USLaborDay,
+    fast_to_slow=_Unusable(),
+    slow_to_fast=_Unusable(),
+    additional_attributes={"__hash__": _FastSlowAttribute("__hash__")},
+)
+
+USMemorialDay = make_final_proxy_type(
+    "USMemorialDay",
+    _Unusable,
+    pd_USMemorialDay,
+    fast_to_slow=_Unusable(),
+    slow_to_fast=_Unusable(),
+    additional_attributes={"__hash__": _FastSlowAttribute("__hash__")},
+)
+
+USMartinLutherKingJr = make_final_proxy_type(
+    "USMartinLutherKingJr",
+    _Unusable,
+    pd_USMartinLutherKingJr,
+    fast_to_slow=_Unusable(),
+    slow_to_fast=_Unusable(),
+    additional_attributes={"__hash__": _FastSlowAttribute("__hash__")},
+)
+
+USPresidentsDay = make_final_proxy_type(
+    "USPresidentsDay",
+    _Unusable,
+    pd_USPresidentsDay,
+    fast_to_slow=_Unusable(),
+    slow_to_fast=_Unusable(),
+    additional_attributes={"__hash__": _FastSlowAttribute("__hash__")},
+)
+
+
+GoodFriday = make_final_proxy_type(
+    "GoodFriday",
+    _Unusable,
+    pd_GoodFriday,
+    fast_to_slow=_Unusable(),
+    slow_to_fast=_Unusable(),
+    additional_attributes={"__hash__": _FastSlowAttribute("__hash__")},
+)
+
+EasterMonday = make_final_proxy_type(
+    "EasterMonday",
+    _Unusable,
+    pd_EasterMonday,
+    fast_to_slow=_Unusable(),
+    slow_to_fast=_Unusable(),
+    additional_attributes={"__hash__": _FastSlowAttribute("__hash__")},
+)
 
 FY5253 = make_final_proxy_type(
     "FY5253",
diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py
index f91cdeac149..e5c86d2318e 100644
--- a/python/cudf/cudf/pandas/fast_slow_proxy.py
+++ b/python/cudf/cudf/pandas/fast_slow_proxy.py
@@ -103,6 +103,19 @@ def __call__(self):
 _DELETE = object()
 
 
+def create_composite_metaclass(base_meta, additional_meta):
+    """
+    Dynamically creates a composite metaclass that inherits from both provided metaclasses.
+    This ensures that the metaclass behaviors of both base_meta and additional_meta are preserved.
+    """
+
+    class CompositeMeta(base_meta, additional_meta):
+        def __new__(cls, name, bases, namespace):
+            return super().__new__(cls, name, bases, namespace)
+
+    return CompositeMeta
+
+
 def make_final_proxy_type(
     name: str,
     fast_type: type,
@@ -114,6 +127,7 @@ def make_final_proxy_type(
     additional_attributes: Mapping[str, Any] | None = None,
     postprocess: Callable[[_FinalProxy, Any, Any], Any] | None = None,
     bases: Tuple = (),
+    meta_class=None,
 ) -> Type[_FinalProxy]:
     """
     Defines a fast-slow proxy type for a pair of "final" fast and slow
@@ -217,10 +231,15 @@ def _fsproxy_state(self) -> _State:
         elif v is not _DELETE:
             cls_dict[k] = v
 
+    if meta_class is None:
+        meta_class = _FastSlowProxyMeta
+    else:
+        meta_class = create_composite_metaclass(_FastSlowProxyMeta, meta_class)
+
     cls = types.new_class(
         name,
         (*bases, _FinalProxy),
-        {"metaclass": _FastSlowProxyMeta},
+        {"metaclass": meta_class},
         lambda ns: ns.update(cls_dict),
     )
     functools.update_wrapper(
diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
index dcba1edd5fe..9fb0891fa52 100644
--- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
+++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
@@ -16,6 +16,7 @@
 import pyarrow as pa
 import pytest
 from numba import NumbaDeprecationWarning
+from pytz import utc
 
 from cudf.pandas import LOADED, Profiler
 from cudf.pandas.fast_slow_proxy import _Unusable
@@ -25,6 +26,19 @@
 
 import pandas as xpd
 import pandas._testing as tm
+from pandas.tseries.holiday import (
+    AbstractHolidayCalendar,
+    EasterMonday,
+    GoodFriday,
+    Holiday,
+    USColumbusDay,
+    USLaborDay,
+    USMartinLutherKingJr,
+    USMemorialDay,
+    USPresidentsDay,
+    USThanksgivingDay,
+    get_calendar,
+)
 
 # Accelerated pandas has the real pandas module as an attribute
 pd = xpd._fsproxy_slow
@@ -1311,3 +1325,63 @@ def test_timedelta(data):
     xtimedelta = xpd.Timedelta(data)
     timedelta = pd.Timedelta(data)
     tm.assert_equal(xtimedelta, timedelta)
+
+
+def test_abstract_holiday_calendar():
+    class TestCalendar(AbstractHolidayCalendar):
+        def __init__(self, name=None, rules=None) -> None:
+            super().__init__(name=name, rules=rules)
+
+    jan1 = TestCalendar(rules=[Holiday("jan1", year=2015, month=1, day=1)])
+    jan2 = TestCalendar(rules=[Holiday("jan2", year=2015, month=1, day=2)])
+
+    # Getting holidays for Jan 1 should not alter results for Jan 2.
+    expected = xpd.DatetimeIndex(["01-Jan-2015"]).as_unit("ns")
+    tm.assert_index_equal(jan1.holidays(), expected)
+
+    expected2 = xpd.DatetimeIndex(["02-Jan-2015"]).as_unit("ns")
+    tm.assert_index_equal(jan2.holidays(), expected2)
+
+
+@pytest.mark.parametrize(
+    "holiday,start,expected",
+    [
+        (USMemorialDay, datetime.datetime(2015, 7, 1), []),
+        (USLaborDay, "2015-09-07", [xpd.Timestamp("2015-09-07")]),
+        (USColumbusDay, "2015-10-12", [xpd.Timestamp("2015-10-12")]),
+        (USThanksgivingDay, "2015-11-26", [xpd.Timestamp("2015-11-26")]),
+        (USMartinLutherKingJr, "2015-01-19", [xpd.Timestamp("2015-01-19")]),
+        (USPresidentsDay, datetime.datetime(2015, 7, 1), []),
+        (GoodFriday, datetime.datetime(2015, 7, 1), []),
+        (EasterMonday, "2015-04-06", [xpd.Timestamp("2015-04-06")]),
+        ("New Year's Day", "2010-12-31", [xpd.Timestamp("2010-12-31")]),
+        ("Independence Day", "2015-07-03", [xpd.Timestamp("2015-07-03")]),
+        ("Veterans Day", "2012-11-11", []),
+        ("Christmas Day", "2011-12-26", [xpd.Timestamp("2011-12-26")]),
+        (
+            "Juneteenth National Independence Day",
+            "2021-06-18",
+            [xpd.Timestamp("2021-06-18")],
+        ),
+        ("Juneteenth National Independence Day", "2022-06-19", []),
+        (
+            "Juneteenth National Independence Day",
+            "2022-06-20",
+            [xpd.Timestamp("2022-06-20")],
+        ),
+    ],
+)
+def test_holidays_within_dates(holiday, start, expected):
+    if isinstance(holiday, str):
+        calendar = get_calendar("USFederalHolidayCalendar")
+        holiday = calendar.rule_from_name(holiday)
+
+    assert list(holiday.dates(start, start)) == expected
+
+    # Verify that timezone info is preserved.
+    assert list(
+        holiday.dates(
+            utc.localize(xpd.Timestamp(start)),
+            utc.localize(xpd.Timestamp(start)),
+        )
+    ) == [utc.localize(dt) for dt in expected]

From 776756142fe7119a6e9b24158ec441011ef9d2b4 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 9 May 2024 08:19:40 -0400
Subject: [PATCH 180/842] Rework get_json_object benchmark to use nvbench
 (#15698)

Moves google-benchmark for `cudf::get_json_object` to nvbench.
Also removes randomness from device code to help ensure consistent results if the test input data generator logic is changed.
Updating the `detail::make_strings_children` function produced different input data so the performance results could not be compared. Removing the device code randomness helps keep the input data consistent.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - Shruti Shivakumar (https://github.com/shrshi)

URL: https://github.com/rapidsai/cudf/pull/15698
---
 cpp/benchmarks/CMakeLists.txt |  2 +-
 cpp/benchmarks/json/json.cu   | 73 ++++++++++++++++-------------------
 2 files changed, 34 insertions(+), 41 deletions(-)

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 7e61d881f07..ac4cce02318 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -330,7 +330,7 @@ ConfigureNVBench(
 
 # ##################################################################################################
 # * json benchmark -------------------------------------------------------------------
-ConfigureBench(JSON_BENCH json/json.cu)
+ConfigureNVBench(JSON_NVBENCH json/json.cu)
 ConfigureNVBench(FST_NVBENCH io/fst.cu)
 ConfigureNVBench(JSON_READER_NVBENCH io/json/nested_json.cpp io/json/json_reader_input.cpp)
 ConfigureNVBench(JSON_READER_OPTION_NVBENCH io/json/json_reader_option.cpp)
diff --git a/cpp/benchmarks/json/json.cu b/cpp/benchmarks/json/json.cu
index c65db187f42..eee85f3feeb 100644
--- a/cpp/benchmarks/json/json.cu
+++ b/cpp/benchmarks/json/json.cu
@@ -15,8 +15,6 @@
  */
 
 #include <benchmarks/common/generate_input.hpp>
-#include <benchmarks/fixture/benchmark_fixture.hpp>
-#include <benchmarks/synchronization/synchronization.hpp>
 
 #include <cudf_test/column_wrapper.hpp>
 
@@ -28,9 +26,7 @@
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/types.hpp>
 
-#include <thrust/random.h>
-
-class JsonPath : public cudf::benchmark {};
+#include <nvbench/nvbench.cuh>
 
 std::vector<std::string> const Books{
   R"json({
@@ -80,8 +76,6 @@ struct json_benchmark_row_builder {
   cudf::size_type* d_sizes{};
   char* d_chars{};
   cudf::detail::input_offsetalator d_offsets;
-  thrust::minstd_rand rng{5236};
-  thrust::uniform_int_distribution<int> dist{};
 
   // internal data structure for {bytes, out_ptr} with operator+=
   struct bytes_and_ptr {
@@ -99,12 +93,10 @@ struct json_benchmark_row_builder {
                                     cudf::size_type num_items,
                                     bytes_and_ptr& output_str)
   {
-    using param_type = thrust::uniform_int_distribution<int>::param_type;
-    dist.param(param_type{0, d_books_bicycles[this_idx].size() - 1});
     cudf::string_view comma(",\n", 2);
     for (int i = 0; i < num_items; i++) {
       if (i > 0) { output_str += comma; }
-      int idx   = dist(rng);
+      int idx   = threadIdx.x % d_books_bicycles[this_idx].size();
       auto item = d_books_bicycles[this_idx].element<cudf::string_view>(idx);
       output_str += item;
     }
@@ -183,41 +175,42 @@ auto build_json_string_column(int desired_bytes, int num_rows)
   return cudf::make_strings_column(num_rows, std::move(offsets), chars.release(), 0, {});
 }
 
-void BM_case(benchmark::State& state, std::string query_arg)
+static std::string queries[] = {"$",
+                                "$.store",
+                                "$.store.book",
+                                "$.store.*",
+                                "$.store.book[*]",
+                                "$.store.book[*].category",
+                                "$.store['bicycle']",
+                                "$.store.book[*]['isbn']",
+                                "$.store.bicycle[1]"};
+
+static void bench_query(nvbench::state& state)
 {
   srand(5236);
-  int num_rows      = state.range(0);
-  int desired_bytes = state.range(1);
+
+  auto const num_rows      = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const desired_bytes = static_cast<cudf::size_type>(state.get_int64("bytes"));
+  auto const query         = state.get_int64("query");
+  auto const json_path     = queries[query];
+
+  auto const stream = cudf::get_default_stream();
   auto input        = build_json_string_column(desired_bytes, num_rows);
   cudf::strings_column_view scv(input->view());
-  size_t num_chars = scv.chars_size(cudf::get_default_stream());
+  size_t num_chars = scv.chars_size(stream);
 
-  std::string json_path(query_arg);
-
-  for (auto _ : state) {
-    cuda_event_timer raii(state, true);
-    auto result = cudf::get_json_object(scv, json_path);
-    CUDF_CUDA_TRY(cudaStreamSynchronize(0));
-  }
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
+  // This isn't strictly 100% accurate. a given query isn't necessarily
+  // going to visit every single incoming character but in spirit it does.
+  state.add_global_memory_reads<nvbench::int8_t>(num_chars);
 
-  // this isn't strictly 100% accurate. a given query isn't necessarily
-  // going to visit every single incoming character.  but in spirit it does.
-  state.SetBytesProcessed(state.iterations() * num_chars);
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    [[maybe_unused]] auto result = cudf::get_json_object(scv, json_path);
+  });
 }
 
-#define JSON_BENCHMARK_DEFINE(name, query)                                                  \
-  BENCHMARK_DEFINE_F(JsonPath, name)(::benchmark::State & state) { BM_case(state, query); } \
-  BENCHMARK_REGISTER_F(JsonPath, name)                                                      \
-    ->ArgsProduct({{100, 1000, 100000, 400000}, {300, 600, 4096}})                          \
-    ->UseManualTime()                                                                       \
-    ->Unit(benchmark::kMillisecond);
-
-JSON_BENCHMARK_DEFINE(query0, "$");
-JSON_BENCHMARK_DEFINE(query1, "$.store");
-JSON_BENCHMARK_DEFINE(query2, "$.store.book");
-JSON_BENCHMARK_DEFINE(query3, "$.store.*");
-JSON_BENCHMARK_DEFINE(query4, "$.store.book[*]");
-JSON_BENCHMARK_DEFINE(query5, "$.store.book[*].category");
-JSON_BENCHMARK_DEFINE(query6, "$.store['bicycle']");
-JSON_BENCHMARK_DEFINE(query7, "$.store.book[*]['isbn']");
-JSON_BENCHMARK_DEFINE(query8, "$.store.bicycle[1]");
+NVBENCH_BENCH(bench_query)
+  .set_name("json_path")
+  .add_int64_axis("bytes", {300, 600, 4096})
+  .add_int64_axis("num_rows", {100, 1000, 100000, 400000})
+  .add_int64_axis("query", {0, 1, 2, 3, 4, 5, 6, 7, 8});

From a4cd1d877631e4554c53b57202564398b758324c Mon Sep 17 00:00:00 2001
From: "Richard (Rick) Zamora" <rzamora217@gmail.com>
Date: Thu, 9 May 2024 08:31:25 -0500
Subject: [PATCH 181/842] Remove obsolete `XFAIL` markers for query-planning
 (#15662)

Simple PR that removes/modifies several `XFAIL` markers for tests that *should* be passing with the latest version of `dask`. Note that the `lt_version="2024.5.0"` argument used in many places is conservative for most tests.

Authors:
  - Richard (Rick) Zamora (https://github.com/rjzamora)

Approvers:
  - Charles Blackmon-Luca (https://github.com/charlesbluca)

URL: https://github.com/rapidsai/cudf/pull/15662
---
 .../dask_cudf/io/tests/test_parquet.py        |  2 +-
 .../dask_cudf/tests/test_accessor.py          |  2 +-
 python/dask_cudf/dask_cudf/tests/test_core.py |  6 +-
 .../dask_cudf/dask_cudf/tests/test_groupby.py | 58 +++----------------
 .../dask_cudf/dask_cudf/tests/test_onehot.py  |  4 +-
 5 files changed, 17 insertions(+), 55 deletions(-)

diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
index 6f4737db5be..2c44f192612 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
@@ -443,7 +443,7 @@ def test_create_metadata_file(tmpdir, partition_on):
     dd.assert_eq(ddf1, ddf2)
 
 
-@xfail_dask_expr("dtypes are inconsistent")
+@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0")
 @need_create_meta
 def test_create_metadata_file_inconsistent_schema(tmpdir):
     # NOTE: This test demonstrates that the CudfEngine
diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py
index 035b73094e7..58d28f0597e 100644
--- a/python/dask_cudf/dask_cudf/tests/test_accessor.py
+++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py
@@ -111,7 +111,7 @@ def test_categorical_accessor_initialization2(data):
         dsr.cat
 
 
-@xfail_dask_expr(lt_version="2024.5.0")
+@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0")
 @pytest.mark.parametrize("data", [data_cat_1()])
 def test_categorical_basic(data):
     cat = data.copy()
diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py
index 4878d44d636..981c2c369f1 100644
--- a/python/dask_cudf/dask_cudf/tests/test_core.py
+++ b/python/dask_cudf/dask_cudf/tests/test_core.py
@@ -800,7 +800,7 @@ def test_dataframe_set_index():
         assert_eq(ddf.compute(), pddf.compute())
 
 
-@xfail_dask_expr("Insufficient describe support in dask-expr")
+@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0")
 def test_series_describe():
     random.seed(0)
     sr = cudf.datasets.randomdata(20)["x"]
@@ -816,7 +816,7 @@ def test_series_describe():
     )
 
 
-@xfail_dask_expr("Insufficient describe support in dask-expr")
+@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0")
 def test_dataframe_describe():
     random.seed(0)
     df = cudf.datasets.randomdata(20)
@@ -830,7 +830,7 @@ def test_dataframe_describe():
     )
 
 
-@xfail_dask_expr("Insufficient describe support in dask-expr")
+@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0")
 def test_zero_std_describe():
     num = 84886781
     df = cudf.DataFrame(
diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py
index f96b5b760d8..dc279bfa690 100644
--- a/python/dask_cudf/dask_cudf/tests/test_groupby.py
+++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py
@@ -288,64 +288,24 @@ def test_groupby_dropna_cudf(dropna, by):
         (False, "a"),
         (False, "b"),
         (False, "c"),
-        pytest.param(
-            False,
-            "d",
-            marks=pytest.mark.xfail(
-                reason="dropna=False is broken in Dask CPU for groupbys on "
-                "categorical columns"
-            ),
-        ),
-        pytest.param(
-            False,
-            ["a", "b"],
-            marks=pytest.mark.xfail(
-                reason="https://github.com/dask/dask/issues/8817"
-            ),
-        ),
-        pytest.param(
-            False,
-            ["a", "c"],
-            marks=pytest.mark.xfail(
-                reason="https://github.com/dask/dask/issues/8817"
-            ),
-        ),
-        pytest.param(
-            False,
-            ["a", "d"],
-            marks=pytest.mark.xfail(
-                reason="multi-col groupbys on categorical columns are broken "
-                "in Dask CPU"
-            ),
-        ),
+        (False, "d"),
+        (False, ["a", "b"]),
+        (False, ["a", "c"]),
+        (False, ["a", "d"]),
         (True, "a"),
         (True, "b"),
         (True, "c"),
         (True, "d"),
         (True, ["a", "b"]),
         (True, ["a", "c"]),
-        pytest.param(
-            True,
-            ["a", "d"],
-            marks=pytest.mark.xfail(
-                reason="multi-col groupbys on categorical columns are broken "
-                "in Dask CPU"
-            ),
-        ),
+        (True, ["a", "d"]),
         (None, "a"),
         (None, "b"),
         (None, "c"),
         (None, "d"),
         (None, ["a", "b"]),
         (None, ["a", "c"]),
-        pytest.param(
-            None,
-            ["a", "d"],
-            marks=pytest.mark.xfail(
-                reason="multi-col groupbys on categorical columns are broken "
-                "in Dask CPU"
-            ),
-        ),
+        (None, ["a", "d"]),
     ],
 )
 def test_groupby_dropna_dask(dropna, by):
@@ -675,7 +635,7 @@ def test_groupby_agg_params(npartitions, split_every, split_out, as_index):
     dd.assert_eq(gf, pf)
 
 
-@xfail_dask_expr("Newer dask-expr version needed")
+@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0")
 @pytest.mark.parametrize(
     "aggregations", [(sum, "sum"), (max, "max"), (min, "min")]
 )
@@ -714,7 +674,7 @@ def test_is_supported(arg, supported):
     assert _aggs_optimized(arg, OPTIMIZED_AGGS) is supported
 
 
-@xfail_dask_expr("Fails on older versions of dask-expr")
+@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0")
 def test_groupby_unique_lists():
     df = pd.DataFrame({"a": [0, 0, 0, 1, 1, 1], "b": [10, 10, 10, 7, 8, 9]})
     gdf = cudf.from_pandas(df)
@@ -776,7 +736,7 @@ def test_groupby_with_list_of_series():
     )
 
 
-@xfail_dask_expr("Nested renamer not supported in dask-expr")
+@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0")
 @pytest.mark.parametrize(
     "func",
     [
diff --git a/python/dask_cudf/dask_cudf/tests/test_onehot.py b/python/dask_cudf/dask_cudf/tests/test_onehot.py
index 96646f85f74..0b7c7855e07 100644
--- a/python/dask_cudf/dask_cudf/tests/test_onehot.py
+++ b/python/dask_cudf/dask_cudf/tests/test_onehot.py
@@ -11,7 +11,9 @@
 from dask_cudf.tests.utils import xfail_dask_expr
 
 # No dask-expr support
-pytestmark = xfail_dask_expr("limited get_dummy support in dask-expr + cudf")
+pytestmark = xfail_dask_expr(
+    "Newer dask version needed", lt_version="2024.5.0"
+)
 
 
 def test_get_dummies_cat():

From 69fe21365c043fbd165ee050a576ece8830aea45 Mon Sep 17 00:00:00 2001
From: "Richard (Rick) Zamora" <rzamora217@gmail.com>
Date: Thu, 9 May 2024 12:16:40 -0500
Subject: [PATCH 182/842] Enable sorting on column with nulls using
 query-planning (#15639)

Related to https://github.com/rapidsai/cudf/issues/15027

Authors:
  - Richard (Rick) Zamora (https://github.com/rjzamora)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)
  - Charles Blackmon-Luca (https://github.com/charlesbluca)

URL: https://github.com/rapidsai/cudf/pull/15639
---
 python/dask_cudf/dask_cudf/backends.py        | 2 +-
 python/dask_cudf/dask_cudf/tests/test_sort.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py
index 94528325aea..d250589e389 100644
--- a/python/dask_cudf/dask_cudf/backends.py
+++ b/python/dask_cudf/dask_cudf/backends.py
@@ -307,7 +307,7 @@ def categorical_dtype_cudf(categories=None, ordered=False):
 @tolist_dispatch.register((cudf.Series, cudf.BaseIndex))
 @_dask_cudf_nvtx_annotate
 def tolist_cudf(obj):
-    return obj.to_arrow().to_pylist()
+    return obj.to_pandas().tolist()
 
 
 @is_categorical_dtype_dispatch.register(
diff --git a/python/dask_cudf/dask_cudf/tests/test_sort.py b/python/dask_cudf/dask_cudf/tests/test_sort.py
index 9184ad996ad..400600a1598 100644
--- a/python/dask_cudf/dask_cudf/tests/test_sort.py
+++ b/python/dask_cudf/dask_cudf/tests/test_sort.py
@@ -72,7 +72,7 @@ def test_sort_repartition():
     dd.assert_eq(len(new_ddf), len(ddf))
 
 
-@xfail_dask_expr("dask-expr code path fails with nulls")
+@xfail_dask_expr("missing null support", lt_version="2024.5.1")
 @pytest.mark.parametrize("na_position", ["first", "last"])
 @pytest.mark.parametrize("ascending", [True, False])
 @pytest.mark.parametrize("by", ["a", "b", ["a", "b"]])

From 3481042d5d1a1f511515cf23f36c43620ad6663e Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Thu, 9 May 2024 12:18:31 -0500
Subject: [PATCH 183/842] Upgrade `arrow` to `16` (#15703)

This PR upgrades `arrow` to `16`. This PR also contains fixes to pytests because of breaking API changes in pyarrow.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Ray Douglass (https://github.com/raydouglass)
  - Richard (Rick) Zamora (https://github.com/rjzamora)

URL: https://github.com/rapidsai/cudf/pull/15703
---
 .../all_cuda-118_arch-x86_64.yaml             |  12 ++++-----
 .../all_cuda-122_arch-x86_64.yaml             |  12 ++++-----
 conda/recipes/cudf/meta.yaml                  |   2 +-
 conda/recipes/libcudf/conda_build_config.yaml |   2 +-
 cpp/cmake/thirdparty/get_arrow.cmake          |   2 +-
 dependencies.yaml                             |  24 ++++++++----------
 python/cudf/cudf/io/parquet.py                |   5 ----
 .../tests/data/parquet/usec_timestamp.parquet | Bin 1128 -> 2323 bytes
 python/cudf/cudf/tests/test_dataframe.py      |  14 +---------
 python/cudf/cudf/tests/test_index.py          |   8 +-----
 python/cudf/cudf/tests/test_parquet.py        |   4 +--
 python/cudf/cudf/utils/ioutils.py             |   3 ++-
 python/cudf/pyproject.toml                    |   6 ++---
 python/cudf_kafka/pyproject.toml              |   2 +-
 .../dask_cudf/io/tests/test_parquet.py        |   4 +--
 15 files changed, 36 insertions(+), 64 deletions(-)

diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 7a5fef9f25e..48699b81eed 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -36,15 +36,15 @@ dependencies:
 - hypothesis
 - identify>=2.5.20
 - ipython
-- libarrow-acero==14.0.2.*
-- libarrow-dataset==14.0.2.*
-- libarrow==14.0.2.*
+- libarrow-acero==16.0.0.*
+- libarrow-dataset==16.0.0.*
+- libarrow==16.0.0.*
 - libcufile-dev=1.4.0.31
 - libcufile=1.4.0.31
 - libcurand-dev=10.3.0.86
 - libcurand=10.3.0.86
 - libkvikio==24.6.*
-- libparquet==14.0.2.*
+- libparquet==16.0.0.*
 - librdkafka>=1.9.0,<1.10.0a0
 - librmm==24.6.*
 - make
@@ -66,7 +66,7 @@ dependencies:
 - pip
 - pre-commit
 - ptxcompiler
-- pyarrow==14.0.2.*
+- pyarrow==16.0.0.*
 - pydata-sphinx-theme!=0.14.2
 - pytest-benchmark
 - pytest-cases>=3.8.2
@@ -92,7 +92,7 @@ dependencies:
 - streamz
 - sysroot_linux-64==2.17
 - tokenizers==0.15.2
-- transformers==4.38.1
+- transformers==4.39.3
 - typing_extensions>=4.0.0
 - zlib>=1.2.13
 - pip:
diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml
index 48453e18bb0..d06a727f331 100644
--- a/conda/environments/all_cuda-122_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-122_arch-x86_64.yaml
@@ -37,13 +37,13 @@ dependencies:
 - hypothesis
 - identify>=2.5.20
 - ipython
-- libarrow-acero==14.0.2.*
-- libarrow-dataset==14.0.2.*
-- libarrow==14.0.2.*
+- libarrow-acero==16.0.0.*
+- libarrow-dataset==16.0.0.*
+- libarrow==16.0.0.*
 - libcufile-dev
 - libcurand-dev
 - libkvikio==24.6.*
-- libparquet==14.0.2.*
+- libparquet==16.0.0.*
 - librdkafka>=1.9.0,<1.10.0a0
 - librmm==24.6.*
 - make
@@ -63,7 +63,7 @@ dependencies:
 - pandoc
 - pip
 - pre-commit
-- pyarrow==14.0.2.*
+- pyarrow==16.0.0.*
 - pydata-sphinx-theme!=0.14.2
 - pynvjitlink
 - pytest-benchmark
@@ -90,7 +90,7 @@ dependencies:
 - streamz
 - sysroot_linux-64==2.17
 - tokenizers==0.15.2
-- transformers==4.38.1
+- transformers==4.39.3
 - typing_extensions>=4.0.0
 - zlib>=1.2.13
 - pip:
diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml
index ddcadfd1570..24210830ada 100644
--- a/conda/recipes/cudf/meta.yaml
+++ b/conda/recipes/cudf/meta.yaml
@@ -65,7 +65,7 @@ requirements:
     - setuptools
     - dlpack >=0.8,<1.0
     - numpy 1.23
-    - pyarrow ==14.0.2.*
+    - pyarrow ==16.0.0.*
     - libcudf ={{ version }}
     - rmm ={{ minor_version }}
     {% if cuda_major == "11" %}
diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml
index ba5e96fb6cf..61ffcf3c3de 100644
--- a/conda/recipes/libcudf/conda_build_config.yaml
+++ b/conda/recipes/libcudf/conda_build_config.yaml
@@ -20,7 +20,7 @@ cmake_version:
   - ">=3.26.4"
 
 libarrow_version:
-  - "==14.0.2"
+  - "==16.0.0"
 
 dlpack_version:
   - ">=0.8,<1.0"
diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake
index 892056959c8..70283efbd79 100644
--- a/cpp/cmake/thirdparty/get_arrow.cmake
+++ b/cpp/cmake/thirdparty/get_arrow.cmake
@@ -410,7 +410,7 @@ if(NOT DEFINED CUDF_VERSION_Arrow)
   set(CUDF_VERSION_Arrow
       # This version must be kept in sync with the libarrow version pinned for builds in
       # dependencies.yaml.
-      14.0.2
+      16.0.0
       CACHE STRING "The version of Arrow to find (or build)"
   )
 endif()
diff --git a/dependencies.yaml b/dependencies.yaml
index 1508656471d..7fe67817f73 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -266,7 +266,7 @@ dependencies:
           - cython>=3.0.3
           # Hard pin the patch version used during the build. This must be kept
           # in sync with the version pinned in get_arrow.cmake.
-          - pyarrow==14.0.2.*
+          - pyarrow==16.0.0.*
       - output_types: conda
         packages:
           - scikit-build-core>=0.7.0
@@ -312,27 +312,25 @@ dependencies:
         packages:
           # Hard pin the Arrow patch version used during the build. This must
           # be kept in sync with the version pinned in get_arrow.cmake.
-          - libarrow-acero==14.0.2.*
-          - libarrow-dataset==14.0.2.*
-          - libarrow==14.0.2.*
-          - libparquet==14.0.2.*
+          - libarrow-acero==16.0.0.*
+          - libarrow-dataset==16.0.0.*
+          - libarrow==16.0.0.*
+          - libparquet==16.0.0.*
   libarrow_run:
     common:
       - output_types: conda
         packages:
           # Allow runtime version to float up to minor version
-          # Disallow libarrow 14.0.0 due to a CVE
-          - libarrow-acero>=14.0.1,<15.0.0a0
-          - libarrow-dataset>=14.0.1,<15.0.0a0
-          - libarrow>=14.0.1,<15.0.0a0
-          - libparquet>=14.0.1,<15.0.0a0
+          - libarrow-acero>=16.0.0,<17.0.0a0
+          - libarrow-dataset>=16.0.0,<17.0.0a0
+          - libarrow>=16.0.0,<17.0.0a0
+          - libparquet>=16.0.0,<17.0.0a0
   pyarrow_run:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
           # Allow runtime version to float up to minor version
-          # Disallow pyarrow 14.0.0 due to a CVE
-          - pyarrow>=14.0.1,<15.0.0a0
+          - pyarrow>=16.0.0,<17.0.0a0
   cuda_version:
     specific:
       - output_types: conda
@@ -631,7 +629,7 @@ dependencies:
         packages:
           - msgpack
           - &tokenizers tokenizers==0.15.2
-          - &transformers transformers==4.38.1
+          - &transformers transformers==4.39.3
           - tzdata
     specific:
       - output_types: conda
diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
index e7f1ad0751f..dd1e59acaaa 100644
--- a/python/cudf/cudf/io/parquet.py
+++ b/python/cudf/cudf/io/parquet.py
@@ -993,15 +993,10 @@ def to_parquet(
         if index is None:
             index = True
 
-        # Convert partition_file_name to a call back
-        if partition_file_name:
-            partition_file_name = lambda x: partition_file_name  # noqa: E731
-
         pa_table = df.to_arrow(preserve_index=index)
         return pq.write_to_dataset(
             pa_table,
             root_path=path,
-            partition_filename_cb=partition_file_name,
             partition_cols=partition_cols,
             *args,
             **kwargs,
diff --git a/python/cudf/cudf/tests/data/parquet/usec_timestamp.parquet b/python/cudf/cudf/tests/data/parquet/usec_timestamp.parquet
index 20ef3cc5578426c7fa4fabe32692f43d3cd1c66c..efde6ff11bf97254c84827f1fe2703035fae4c34 100644
GIT binary patch
literal 2323
zcmcguL2u$l6dnj6v}z=}!U!Y|$l67fO0qy8Evxh}kN~EH1hTP#)e3oR17>YQz=p6<
zj=l9z^|Z%TkL{(uqsN|BJyz<ir=EK38#^W;B!?c_NyamN@6GqV_uf2<Q`}55J|@mQ
zt-*qxAQU|K@cSPZv2Qj}<=s!IG^`9_LQEtHk09e`{IN*D)kFfWZ5Vy_fD|G0@l!86
zLmtoOkKV*@o*3hc=8!)ajd(=H8xDnk@q~Z%Br1u(zk!V=DOtu#sRdy{2;Sd&7`X-{
zX%MFfZF`VsJAk|}H(>f;Zo>4#+}aKt1mF-IA2|y}f+XAx($z#<Q$&MYmjXcWzzNc~
z8O*$#y#q<&=gr$tKkar^Lsl-knle+gZmQe0L;v6nzxO7-_a(mDfZ*4GA943k@X>=D
ziuk9}K+#3U2ibV$g#<qf)xLcaOWG0IqIDuD9E!%G$^Wo2|7#|~Kls9-L?HavMhKy3
zGU6YLhAd7ae{Wo|P6?!uX6eS1eGz}fu9by-os6@Ng;O!bHoRgDQCHxIVQCsz^i)NY
zyQ?zCQP*6IsUd3_sx*=n7OV|RA1{`Q5g96yN#(j?in3^mV1C6a2AHcxpPoH`S=#xM
zy}rIBv&w=M>G2g3j_4i-09Q;Eh_8`wHT&6D*7Wr~GBn8RByd^{aMj;>>cGi8|Egi`
zXV;xC#di{5`?TYoD55!<*g4TuG7%>Wnv(HCoJ>Y@3N}+FQ*~reX?lM*wVPsrtn2!Y
zGP4VR>@|vff}70~;f2#2kw3Sd)A4z~h%s)!q~mLhv$)X5C5-#H*tcbc!is)Qg-1*M
zA*Scy9fpl!>p6_)7|y|JgTU$<wP9D7w}srBcIz19(>|CLzTkAt>Xi9xD^s753U3nQ
zSUph-gwygp5o|lHL64OAOh>rv9hdSwPD@R>Vs%I|muVv8TpLYVaQj@z1iajU2<H^T
zl7-7@t%%i*u4$y)>=C_WcFw@AoJ*7PoZ{T@srTfHHn(-fYMSU8;5Ri+o%aDRO<ILq
zYp<D;$`lu?BvZZU2=$bsPiu0y1Ti%S+;G%a7FCVt^;xG3aZygiD$gIa&68p^-{V)f
zzoquV-c()kDpm&$W@ilXzG-FLSg_hY;~wKneFF!L#zt)*?(|&Y^C_iO(5MeiZw}-#
zpAv<%Rv$Kr+^WkDaVG{pB^BUUxx?u;SGp+h(@rUGwE@S*eXK?_Z$@E2z^|0P%R{r|
z!}d+y*7FMWPSuAEmDY?1e5x%_-bc$?y7`lI9_0DOO}?k1P+hcJ)tt@ADCA(jAbXOH
zBkTM^%4jp9(pqqrbu~M$4YArgTj7}Lw5o4KA#d3jL(V!@vkT5h(d+A6<&t~<F`k`5
z|IpV4XW#sqZREBR+;_T#yZX@mrKZq0&;Wht2HGb#<J?{wxX_^bGR6`<q$21Nr^#U7
z7}-uhNnh;ea$7#4{R?<)x;mqeze17tSe#r~in$|=$2;~%B!2Ueh&!K;XflQn{G-wq
K{BD4sL;nCsZEHdR

delta 361
zcmbO%^n!ywz%j^BltolQRK*8Ku}tKbXJnbEFRjkNz`&)#1SCZm8Ch2`scmAE5oM7y
zWno~GlweF0WfBt+Tf``~YO*|&As+(+Q0oFl(G(w15ug!Q>{(ev*+khUPh=8T0Gc3R
z12Kn%Rlq2uYz9z7?EnkRw8^<llH41Z#C|ZUonlry!35N#BLg&$Nz%rYL4vI~IU_YU
zQIrK}gDMlyVpWmJFBlcX@wkSUpz$IM3}TCz#Y7k+WhB@N67y0LizN<je!(Qk$e1^o
ziKQ<9A{JkkT2!2wpQmJ{pk$zDWTI!JRLdZtRFGIySeja*n_N(!09B!2sAsBYs2l9(
V7!d5?A0i3z01z=e2KpNmz5p+6N@V~5

diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 20e9f41de63..8550bc91253 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -2824,13 +2824,7 @@ def test_from_arrow_chunked_arrays(nelem, nchunks, data_type):
     ]
     pa_chunk_array = pa.chunked_array(np_list_data)
 
-    expect = pd.Series(pa_chunk_array.to_pandas())
-    if cudf.api.types.is_datetime64_dtype(
-        data_type
-    ) or cudf.api.types.is_timedelta64_dtype(data_type):
-        # Workaround for an Arrow Bug:
-        # https://github.com/apache/arrow/issues/34462
-        expect = expect.astype(data_type)
+    expect = pa_chunk_array.to_pandas()
     got = cudf.Series(pa_chunk_array)
 
     assert_eq(expect, got)
@@ -2845,12 +2839,6 @@ def test_from_arrow_chunked_arrays(nelem, nchunks, data_type):
     )
 
     expect = pa_table.to_pandas()
-    if cudf.api.types.is_datetime64_dtype(
-        data_type
-    ) or cudf.api.types.is_timedelta64_dtype(data_type):
-        # Workaround for an Arrow Bug:
-        # https://github.com/apache/arrow/issues/34462
-        expect = expect.astype(data_type)
     got = cudf.DataFrame.from_arrow(pa_table)
 
     assert_eq(expect, got)
diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py
index 0b252cec4b8..3cc6bfdbdc2 100644
--- a/python/cudf/cudf/tests/test_index.py
+++ b/python/cudf/cudf/tests/test_index.py
@@ -1523,13 +1523,7 @@ def test_index_from_arrow(data):
     arrow_array = pa.Array.from_pandas(pdi)
     expected_index = pd.Index(arrow_array.to_pandas())
     gdi = cudf.Index.from_arrow(arrow_array)
-    if gdi.dtype == cudf.dtype("datetime64[s]"):
-        # Arrow bug:
-        # https://github.com/apache/arrow/issues/33321
-        # arrow cannot convert non-nanosecond
-        # resolution to appropriate type in pandas.
-        # Hence need to type-cast.
-        expected_index = expected_index.astype(gdi.dtype)
+
     assert_eq(expected_index, gdi)
 
 
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index 1e175f5ff0d..cf3c0e7f7a0 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -472,9 +472,7 @@ def test_parquet_read_filtered(tmpdir, rdg_seed):
     # Because of this, we aren't using PyArrow as a reference for testing our
     # row-group selection method since the only way to only select row groups
     # with PyArrow is with the method we use and intend to test.
-    tbl_filtered = pq.read_table(
-        fname, filters=[("1", ">", 60)], use_legacy_dataset=False
-    )
+    tbl_filtered = pq.read_table(fname, filters=[("1", ">", 60)])
 
     assert_eq(cudf.io.read_parquet_metadata(fname)[1], 2048 / 64)
     print(len(df_filtered))
diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
index 6bd7558d322..9c7c687a6ed 100644
--- a/python/cudf/cudf/utils/ioutils.py
+++ b/python/cudf/cudf/utils/ioutils.py
@@ -247,7 +247,8 @@
     File name to use for partitioned datasets. Different partitions
     will be written to different directories, but all files will
     have this name.  If nothing is specified, a random uuid4 hex string
-    will be used for each file.
+    will be used for each file. This parameter is only supported by 'cudf'
+    engine, and will be ignored by other engines.
 partition_offsets : list, optional, default None
     Offsets to partition the dataframe by. Should be used when path is list
     of str. Should be a list of integers of size ``len(path) + 1``
diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml
index fc3a243572f..4b57bcd018a 100644
--- a/python/cudf/pyproject.toml
+++ b/python/cudf/pyproject.toml
@@ -7,7 +7,7 @@ requires = [
     "cython>=3.0.3",
     "ninja",
     "numpy==1.23.*",
-    "pyarrow==14.0.2.*",
+    "pyarrow==16.0.0.*",
     "rmm==24.6.*",
     "scikit-build-core[pyproject]>=0.7.0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
@@ -34,7 +34,7 @@ dependencies = [
     "packaging",
     "pandas>=2.0,<2.2.3dev0",
     "ptxcompiler",
-    "pyarrow>=14.0.1,<15.0.0a0",
+    "pyarrow>=16.0.0,<17.0.0a0",
     "rich",
     "rmm==24.6.*",
     "typing_extensions>=4.0.0",
@@ -63,7 +63,7 @@ test = [
     "pytest<8",
     "scipy",
     "tokenizers==0.15.2",
-    "transformers==4.38.1",
+    "transformers==4.39.3",
     "tzdata",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 pandas-tests = [
diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml
index eb48852202a..787dd8a97d7 100644
--- a/python/cudf_kafka/pyproject.toml
+++ b/python/cudf_kafka/pyproject.toml
@@ -7,7 +7,7 @@ requires = [
     "cython>=3.0.3",
     "ninja",
     "numpy==1.23.*",
-    "pyarrow==14.0.2.*",
+    "pyarrow==16.0.0.*",
     "scikit-build-core[pyproject]>=0.7.0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
index 2c44f192612..39800145585 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
@@ -166,9 +166,7 @@ def test_dask_timeseries_from_pandas(tmpdir):
     pdf = ddf2.compute()
     pdf.to_parquet(fn, engine="pyarrow")
     read_df = dask_cudf.read_parquet(fn)
-    # Workaround until following issue is fixed:
-    # https://github.com/apache/arrow/issues/33321
-    dd.assert_eq(ddf2, read_df.compute(), check_index_type=False)
+    dd.assert_eq(ddf2, read_df.compute())
 
 
 @pytest.mark.parametrize("index", [False, None])

From bd93e203b0bdfaa2b736a385ea7595c904bd30d8 Mon Sep 17 00:00:00 2001
From: Ray Douglass <3107146+raydouglass@users.noreply.github.com>
Date: Thu, 9 May 2024 15:23:21 -0400
Subject: [PATCH 184/842] Overhaul ops-codeowners coverage (#15660)

This PR overhauls how `ops-codeowners` reviews are handled.

`ops-codeowners` is replaced by `ci-codeowners` &
`packaging-codeowners`. The coverage of files is expanded as well.

Additionally, the process will change: reviews will be assigned to a
member of the teams instead of a manual request to `ops-codeowners`.
---
 .github/CODEOWNERS | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 31cfeaf4ca3..9efac3f1904 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -16,8 +16,14 @@ cpp/libcudf_kafka/CMakeLists.txt @rapidsai/cudf-cmake-codeowners
 #java code owners
 java/              @rapidsai/cudf-java-codeowners
 
-#build/ops code owners
-.github/           @rapidsai/ops-codeowners
-/ci/               @rapidsai/ops-codeowners
-conda/             @rapidsai/ops-codeowners
-dependencies.yaml  @rapidsai/ops-codeowners
+#CI code owners
+/.github/                @rapidsai/ci-codeowners
+/ci/                     @rapidsai/ci-codeowners
+/.pre-commit-config.yaml @rapidsai/ci-codeowners
+
+#packaging code owners
+/.devcontainers/   @rapidsai/packaging-codeowners
+/conda/            @rapidsai/packaging-codeowners
+/dependencies.yaml @rapidsai/packaging-codeowners
+/build.sh          @rapidsai/packaging-codeowners
+pyproject.toml     @rapidsai/packaging-codeowners

From 65a51ffa364b8a54fadab041cb5c563873303643 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 9 May 2024 19:19:45 -0400
Subject: [PATCH 185/842] Add large-strings gtest for cudf::interleave_columns
 (#15669)

Adds a gtest for `cudf::interleave_columns` that tests it can produce large-strings appropriately.
Follow on to #15544

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - MithunR (https://github.com/mythrocks)

URL: https://github.com/rapidsai/cudf/pull/15669
---
 cpp/tests/CMakeLists.txt                  |  8 ++-
 cpp/tests/large_strings/reshape_tests.cpp | 64 +++++++++++++++++++++++
 2 files changed, 70 insertions(+), 2 deletions(-)
 create mode 100644 cpp/tests/large_strings/reshape_tests.cpp

diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index bbb919aa2d1..e779e1d1410 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -571,8 +571,12 @@ ConfigureTest(
 # ##################################################################################################
 # * large strings test ----------------------------------------------------------------------------
 ConfigureTest(
-  LARGE_STRINGS_TEST large_strings/large_strings_fixture.cpp large_strings/merge_tests.cpp
-  large_strings/concatenate_tests.cpp large_strings/parquet_tests.cpp
+  LARGE_STRINGS_TEST
+  large_strings/large_strings_fixture.cpp
+  large_strings/merge_tests.cpp
+  large_strings/concatenate_tests.cpp
+  large_strings/parquet_tests.cpp
+  large_strings/reshape_tests.cpp
   GPUS 1
   PERCENT 100
 )
diff --git a/cpp/tests/large_strings/reshape_tests.cpp b/cpp/tests/large_strings/reshape_tests.cpp
new file mode 100644
index 00000000000..b688a40a8d3
--- /dev/null
+++ b/cpp/tests/large_strings/reshape_tests.cpp
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "large_strings_fixture.hpp"
+
+#include <cudf_test/column_utilities.hpp>
+
+#include <cudf/copying.hpp>
+#include <cudf/reshape.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf/table/table_view.hpp>
+
+#include <vector>
+
+struct ReshapeTest : public cudf::test::StringsLargeTest {};
+
+TEST_F(ReshapeTest, InterleaveLargeStrings)
+{
+  auto const input = this->long_column();
+  auto input_views = std::vector<cudf::table_view>();
+  auto const view  = cudf::table_view({input});
+  std::vector<cudf::size_type> splits;
+  int const multiplier = 10;
+  for (int i = 0; i < multiplier; ++i) {  // 2500MB > 2GB
+    input_views.push_back(view);
+    splits.push_back(view.num_rows() * (i + 1));
+  }
+  splits.pop_back();  // remove last entry
+
+  auto result = cudf::interleave_columns(input_views);
+  auto sv     = cudf::strings_column_view(result->view());
+  EXPECT_EQ(sv.size(), view.num_rows() * multiplier);
+  EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64});
+
+  auto sliced = cudf::split(sv.parent(), splits);
+  for (auto c : sliced) {
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input);
+  }
+
+  // also check regular sizes returns 32-bit offsets
+  input_views.clear();
+  input_views.push_back(view);
+  input_views.push_back(view);
+  result = cudf::interleave_columns(input_views);
+  sv     = cudf::strings_column_view(result->view());
+  EXPECT_EQ(sv.size(), view.num_rows() * 2);
+  EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT32});
+  sliced = cudf::split(sv.parent(), {view.num_rows()});
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(sliced[0], input);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(sliced[1], input);
+}

From e1c6dc2848984279e6c5422496390c6396e74b6c Mon Sep 17 00:00:00 2001
From: Alfred Xu <lovedreamf@gmail.com>
Date: Fri, 10 May 2024 22:37:21 +0800
Subject: [PATCH 186/842] Refine `CudaTest.testCudaException` in case throwing
 wrong type of CudaError under aarch64 (#15706)

Fix #15705

1. Replacing  `Cuda.memset(Long.MAX_VALUE, (byte) 0, 1024)` with `Cuda.freePinned(-1L)`, the previous one throws fatal CUDAError `cudaErrorIllegalAddress`  instead of nonFatal CUDAError `cudaErrorInvalidValue` under aarch64, while the later one throwing the correct kind of error.

2. Enable the test case when Sanitizer is ON

Authors:
  - Alfred Xu (https://github.com/sperlingxx)

Approvers:
  - Tim Liu (https://github.com/NvTimLiu)
  - Jason Lowe (https://github.com/jlowe)

URL: https://github.com/rapidsai/cudf/pull/15706
---
 java/src/test/java/ai/rapids/cudf/CudaTest.java | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/java/src/test/java/ai/rapids/cudf/CudaTest.java b/java/src/test/java/ai/rapids/cudf/CudaTest.java
index 2edd7f36cb7..9aaa9cee916 100644
--- a/java/src/test/java/ai/rapids/cudf/CudaTest.java
+++ b/java/src/test/java/ai/rapids/cudf/CudaTest.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,7 +16,6 @@
 
 package ai.rapids.cudf;
 
-import org.junit.jupiter.api.Tag;
 import org.junit.jupiter.api.Test;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
@@ -34,13 +33,13 @@ public void testGetCudaRuntimeInfo() {
     assertEquals(Cuda.getNativeComputeMode(), Cuda.getComputeMode().nativeId);
   }
 
-  @Tag("noSanitizer")
   @Test
   public void testCudaException() {
     assertThrows(CudaException.class, () -> {
           try {
-            Cuda.memset(Long.MAX_VALUE, (byte) 0, 1024);
-          } catch (CudaFatalException ignored) {
+            Cuda.freePinned(-1L);
+          } catch (CudaFatalException fatalEx) {
+            throw new AssertionError("Expected UnFatalError but got FatalError: " + fatalEx);
           } catch (CudaException ex) {
             assertEquals(CudaException.CudaError.cudaErrorInvalidValue, ex.getCudaError());
             throw ex;

From e93782f9d579701a628d8fb20f3d89f8c086fdc4 Mon Sep 17 00:00:00 2001
From: "Richard (Rick) Zamora" <rzamora217@gmail.com>
Date: Fri, 10 May 2024 11:26:15 -0500
Subject: [PATCH 187/842] Fix maxima of categorical column (#15701)

Closes https://github.com/rapidsai/cudf/issues/15641

Applies patch suggested by @wence-

Authors:
  - Richard (Rick) Zamora (https://github.com/rjzamora)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15701
---
 python/cudf/cudf/core/column/categorical.py | 23 +++++++++++++++++++++
 python/cudf/cudf/tests/test_categorical.py  | 19 +++++++++++++++++
 2 files changed, 42 insertions(+)

diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index dc51cd4f28f..1f003534913 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -515,6 +515,10 @@ class CategoricalColumn(column.ColumnBase):
     dtype: cudf.core.dtypes.CategoricalDtype
     _codes: Optional[NumericalColumn]
     _children: Tuple[NumericalColumn]
+    _VALID_REDUCTIONS = {
+        "max",
+        "min",
+    }
     _VALID_BINARY_OPERATIONS = {
         "__eq__",
         "__ne__",
@@ -699,6 +703,25 @@ def slice(
             ),
         )
 
+    def _reduce(
+        self,
+        op: str,
+        skipna: Optional[bool] = None,
+        min_count: int = 0,
+        *args,
+        **kwargs,
+    ) -> ScalarLike:
+        # Only valid reductions are min and max
+        if not self.ordered:
+            raise TypeError(
+                f"Categorical is not ordered for operation {op} "
+                "you can use .as_ordered() to change the Categorical "
+                "to an ordered one."
+            )
+        return self._decode(
+            self.codes._reduce(op, skipna, min_count, *args, **kwargs)
+        )
+
     def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
         other = self._wrap_binop_normalization(other)
         # TODO: This is currently just here to make mypy happy, but eventually
diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py
index 07ce81e3c39..c36595192e4 100644
--- a/python/cudf/cudf/tests/test_categorical.py
+++ b/python/cudf/cudf/tests/test_categorical.py
@@ -875,3 +875,22 @@ def test_cat_groupby_fillna():
         lfunc_args_and_kwargs=(("d",), {}),
         rfunc_args_and_kwargs=(("d",), {}),
     )
+
+
+@pytest.mark.parametrize("op", ["min", "max"])
+def test_categorical_maxima(op):
+    ser = cudf.Series(
+        ["a", "d", "c", "z", "g"],
+        dtype=cudf.CategoricalDtype(["z", "c", "g", "d", "a"], ordered=False),
+    )
+    assert not ser.cat.ordered
+
+    # Cannot get extrema of unordered Categorical column
+    with pytest.raises(TypeError, match="Categorical is not ordered"):
+        getattr(ser, op)()
+
+    # Max/min should work after converting to "ordered"
+    ser_pd = ser.to_pandas()
+    result = getattr(ser.cat.as_ordered(), op)()
+    result_pd = getattr(ser_pd.cat.as_ordered(), op)()
+    assert_eq(result, result_pd)

From b810113d6255dbe123aafbc80018bf6165a0842f Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Fri, 10 May 2024 10:02:48 -0700
Subject: [PATCH 188/842] Clean up join benchmarks (#15644)

This PR cleans up the join benchmark implementations. It uses nvbench helpers to simplify the code and reduces the number of test cases.

Authors:
  - Yunsong Wang (https://github.com/PointKernel)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: https://github.com/rapidsai/cudf/pull/15644
---
 cpp/benchmarks/join/conditional_join.cu | 186 ++++++++---------
 cpp/benchmarks/join/distinct_join.cu    |  83 ++------
 cpp/benchmarks/join/join.cu             | 161 +++------------
 cpp/benchmarks/join/join_common.hpp     | 133 ++++++------
 cpp/benchmarks/join/left_join.cu        |  60 +++---
 cpp/benchmarks/join/mixed_join.cu       | 262 +++++-------------------
 6 files changed, 264 insertions(+), 621 deletions(-)

diff --git a/cpp/benchmarks/join/conditional_join.cu b/cpp/benchmarks/join/conditional_join.cu
index d721de0e8fd..d95fc0a5b59 100644
--- a/cpp/benchmarks/join/conditional_join.cu
+++ b/cpp/benchmarks/join/conditional_join.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,126 +16,102 @@
 
 #include <benchmarks/join/join_common.hpp>
 
-template <typename key_type, typename payload_type>
+template <typename Key>
 class ConditionalJoin : public cudf::benchmark {};
 
 // For compatibility with the shared logic for equality (hash) joins, all of
 // the join lambdas defined by these macros accept a null_equality parameter
 // but ignore it (don't forward it to the underlying join implementation)
 // because conditional joins do not use this parameter.
-#define CONDITIONAL_INNER_JOIN_BENCHMARK_DEFINE(name, key_type, payload_type, nullable) \
-  BENCHMARK_TEMPLATE_DEFINE_F(ConditionalJoin, name, key_type, payload_type)            \
-  (::benchmark::State & st)                                                             \
-  {                                                                                     \
-    auto join = [](cudf::table_view const& left,                                        \
-                   cudf::table_view const& right,                                       \
-                   cudf::ast::operation binary_pred,                                    \
-                   cudf::null_equality compare_nulls) {                                 \
-      return cudf::conditional_inner_join(left, right, binary_pred);                    \
-    };                                                                                  \
-    BM_join<key_type, payload_type, nullable, join_t::CONDITIONAL>(st, join);           \
+#define CONDITIONAL_INNER_JOIN_BENCHMARK_DEFINE(name, Key, Nullable) \
+  BENCHMARK_TEMPLATE_DEFINE_F(ConditionalJoin, name, Key)            \
+  (::benchmark::State & st)                                          \
+  {                                                                  \
+    auto join = [](cudf::table_view const& left,                     \
+                   cudf::table_view const& right,                    \
+                   cudf::ast::operation binary_pred,                 \
+                   cudf::null_equality compare_nulls) {              \
+      return cudf::conditional_inner_join(left, right, binary_pred); \
+    };                                                               \
+    BM_join<Key, Nullable, join_t::CONDITIONAL>(st, join);           \
   }
 
-CONDITIONAL_INNER_JOIN_BENCHMARK_DEFINE(conditional_inner_join_32bit, int32_t, int32_t, false);
-CONDITIONAL_INNER_JOIN_BENCHMARK_DEFINE(conditional_inner_join_64bit, int64_t, int64_t, false);
-CONDITIONAL_INNER_JOIN_BENCHMARK_DEFINE(conditional_inner_join_32bit_nulls, int32_t, int32_t, true);
-CONDITIONAL_INNER_JOIN_BENCHMARK_DEFINE(conditional_inner_join_64bit_nulls, int64_t, int64_t, true);
-
-#define CONDITIONAL_LEFT_JOIN_BENCHMARK_DEFINE(name, key_type, payload_type, nullable) \
-  BENCHMARK_TEMPLATE_DEFINE_F(ConditionalJoin, name, key_type, payload_type)           \
-  (::benchmark::State & st)                                                            \
-  {                                                                                    \
-    auto join = [](cudf::table_view const& left,                                       \
-                   cudf::table_view const& right,                                      \
-                   cudf::ast::operation binary_pred,                                   \
-                   cudf::null_equality compare_nulls) {                                \
-      return cudf::conditional_left_join(left, right, binary_pred);                    \
-    };                                                                                 \
-    BM_join<key_type, payload_type, nullable, join_t::CONDITIONAL>(st, join);          \
+CONDITIONAL_INNER_JOIN_BENCHMARK_DEFINE(conditional_inner_join_32bit, int32_t, false);
+CONDITIONAL_INNER_JOIN_BENCHMARK_DEFINE(conditional_inner_join_64bit, int64_t, false);
+CONDITIONAL_INNER_JOIN_BENCHMARK_DEFINE(conditional_inner_join_32bit_nulls, int32_t, true);
+CONDITIONAL_INNER_JOIN_BENCHMARK_DEFINE(conditional_inner_join_64bit_nulls, int64_t, true);
+
+#define CONDITIONAL_LEFT_JOIN_BENCHMARK_DEFINE(name, Key, Nullable) \
+  BENCHMARK_TEMPLATE_DEFINE_F(ConditionalJoin, name, Key)           \
+  (::benchmark::State & st)                                         \
+  {                                                                 \
+    auto join = [](cudf::table_view const& left,                    \
+                   cudf::table_view const& right,                   \
+                   cudf::ast::operation binary_pred,                \
+                   cudf::null_equality compare_nulls) {             \
+      return cudf::conditional_left_join(left, right, binary_pred); \
+    };                                                              \
+    BM_join<Key, Nullable, join_t::CONDITIONAL>(st, join);          \
   }
 
-CONDITIONAL_LEFT_JOIN_BENCHMARK_DEFINE(conditional_left_join_32bit, int32_t, int32_t, false);
-CONDITIONAL_LEFT_JOIN_BENCHMARK_DEFINE(conditional_left_join_64bit, int64_t, int64_t, false);
-CONDITIONAL_LEFT_JOIN_BENCHMARK_DEFINE(conditional_left_join_32bit_nulls, int32_t, int32_t, true);
-CONDITIONAL_LEFT_JOIN_BENCHMARK_DEFINE(conditional_left_join_64bit_nulls, int64_t, int64_t, true);
-
-#define CONDITIONAL_FULL_JOIN_BENCHMARK_DEFINE(name, key_type, payload_type, nullable) \
-  BENCHMARK_TEMPLATE_DEFINE_F(ConditionalJoin, name, key_type, payload_type)           \
-  (::benchmark::State & st)                                                            \
-  {                                                                                    \
-    auto join = [](cudf::table_view const& left,                                       \
-                   cudf::table_view const& right,                                      \
-                   cudf::ast::operation binary_pred,                                   \
-                   cudf::null_equality compare_nulls) {                                \
-      return cudf::conditional_full_join(left, right, binary_pred);                    \
-    };                                                                                 \
-    BM_join<key_type, payload_type, nullable, join_t::CONDITIONAL>(st, join);          \
+CONDITIONAL_LEFT_JOIN_BENCHMARK_DEFINE(conditional_left_join_32bit, int32_t, false);
+CONDITIONAL_LEFT_JOIN_BENCHMARK_DEFINE(conditional_left_join_64bit, int64_t, false);
+CONDITIONAL_LEFT_JOIN_BENCHMARK_DEFINE(conditional_left_join_32bit_nulls, int32_t, true);
+CONDITIONAL_LEFT_JOIN_BENCHMARK_DEFINE(conditional_left_join_64bit_nulls, int64_t, true);
+
+#define CONDITIONAL_FULL_JOIN_BENCHMARK_DEFINE(name, Key, Nullable) \
+  BENCHMARK_TEMPLATE_DEFINE_F(ConditionalJoin, name, Key)           \
+  (::benchmark::State & st)                                         \
+  {                                                                 \
+    auto join = [](cudf::table_view const& left,                    \
+                   cudf::table_view const& right,                   \
+                   cudf::ast::operation binary_pred,                \
+                   cudf::null_equality compare_nulls) {             \
+      return cudf::conditional_full_join(left, right, binary_pred); \
+    };                                                              \
+    BM_join<Key, Nullable, join_t::CONDITIONAL>(st, join);          \
   }
 
-CONDITIONAL_FULL_JOIN_BENCHMARK_DEFINE(conditional_full_join_32bit, int32_t, int32_t, false);
-CONDITIONAL_FULL_JOIN_BENCHMARK_DEFINE(conditional_full_join_64bit, int64_t, int64_t, false);
-CONDITIONAL_FULL_JOIN_BENCHMARK_DEFINE(conditional_full_join_32bit_nulls, int32_t, int32_t, true);
-CONDITIONAL_FULL_JOIN_BENCHMARK_DEFINE(conditional_full_join_64bit_nulls, int64_t, int64_t, true);
-
-#define CONDITIONAL_LEFT_ANTI_JOIN_BENCHMARK_DEFINE(name, key_type, payload_type, nullable) \
-  BENCHMARK_TEMPLATE_DEFINE_F(ConditionalJoin, name, key_type, payload_type)                \
-  (::benchmark::State & st)                                                                 \
-  {                                                                                         \
-    auto join = [](cudf::table_view const& left,                                            \
-                   cudf::table_view const& right,                                           \
-                   cudf::ast::operation binary_pred,                                        \
-                   cudf::null_equality compare_nulls) {                                     \
-      return cudf::conditional_left_anti_join(left, right, binary_pred);                    \
-    };                                                                                      \
-    BM_join<key_type, payload_type, nullable, join_t::CONDITIONAL>(st, join);               \
+CONDITIONAL_FULL_JOIN_BENCHMARK_DEFINE(conditional_full_join_32bit, int32_t, false);
+CONDITIONAL_FULL_JOIN_BENCHMARK_DEFINE(conditional_full_join_64bit, int64_t, false);
+CONDITIONAL_FULL_JOIN_BENCHMARK_DEFINE(conditional_full_join_32bit_nulls, int32_t, true);
+CONDITIONAL_FULL_JOIN_BENCHMARK_DEFINE(conditional_full_join_64bit_nulls, int64_t, true);
+
+#define CONDITIONAL_LEFT_ANTI_JOIN_BENCHMARK_DEFINE(name, Key, Nullable) \
+  BENCHMARK_TEMPLATE_DEFINE_F(ConditionalJoin, name, Key)                \
+  (::benchmark::State & st)                                              \
+  {                                                                      \
+    auto join = [](cudf::table_view const& left,                         \
+                   cudf::table_view const& right,                        \
+                   cudf::ast::operation binary_pred,                     \
+                   cudf::null_equality compare_nulls) {                  \
+      return cudf::conditional_left_anti_join(left, right, binary_pred); \
+    };                                                                   \
+    BM_join<Key, Nullable, join_t::CONDITIONAL>(st, join);               \
   }
 
-CONDITIONAL_LEFT_ANTI_JOIN_BENCHMARK_DEFINE(conditional_left_anti_join_32bit,
-                                            int32_t,
-                                            int32_t,
-                                            false);
-CONDITIONAL_LEFT_ANTI_JOIN_BENCHMARK_DEFINE(conditional_left_anti_join_64bit,
-                                            int64_t,
-                                            int64_t,
-                                            false);
-CONDITIONAL_LEFT_ANTI_JOIN_BENCHMARK_DEFINE(conditional_left_anti_join_32bit_nulls,
-                                            int32_t,
-                                            int32_t,
-                                            true);
-CONDITIONAL_LEFT_ANTI_JOIN_BENCHMARK_DEFINE(conditional_left_anti_join_64bit_nulls,
-                                            int64_t,
-                                            int64_t,
-                                            true);
-
-#define CONDITIONAL_LEFT_SEMI_JOIN_BENCHMARK_DEFINE(name, key_type, payload_type, nullable) \
-  BENCHMARK_TEMPLATE_DEFINE_F(ConditionalJoin, name, key_type, payload_type)                \
-  (::benchmark::State & st)                                                                 \
-  {                                                                                         \
-    auto join = [](cudf::table_view const& left,                                            \
-                   cudf::table_view const& right,                                           \
-                   cudf::ast::operation binary_pred,                                        \
-                   cudf::null_equality compare_nulls) {                                     \
-      return cudf::conditional_left_semi_join(left, right, binary_pred);                    \
-    };                                                                                      \
-    BM_join<key_type, payload_type, nullable, join_t::CONDITIONAL>(st, join);               \
+CONDITIONAL_LEFT_ANTI_JOIN_BENCHMARK_DEFINE(conditional_left_anti_join_32bit, int32_t, false);
+CONDITIONAL_LEFT_ANTI_JOIN_BENCHMARK_DEFINE(conditional_left_anti_join_64bit, int64_t, false);
+CONDITIONAL_LEFT_ANTI_JOIN_BENCHMARK_DEFINE(conditional_left_anti_join_32bit_nulls, int32_t, true);
+CONDITIONAL_LEFT_ANTI_JOIN_BENCHMARK_DEFINE(conditional_left_anti_join_64bit_nulls, int64_t, true);
+
+#define CONDITIONAL_LEFT_SEMI_JOIN_BENCHMARK_DEFINE(name, Key, Nullable) \
+  BENCHMARK_TEMPLATE_DEFINE_F(ConditionalJoin, name, Key)                \
+  (::benchmark::State & st)                                              \
+  {                                                                      \
+    auto join = [](cudf::table_view const& left,                         \
+                   cudf::table_view const& right,                        \
+                   cudf::ast::operation binary_pred,                     \
+                   cudf::null_equality compare_nulls) {                  \
+      return cudf::conditional_left_semi_join(left, right, binary_pred); \
+    };                                                                   \
+    BM_join<Key, Nullable, join_t::CONDITIONAL>(st, join);               \
   }
 
-CONDITIONAL_LEFT_SEMI_JOIN_BENCHMARK_DEFINE(conditional_left_semi_join_32bit,
-                                            int32_t,
-                                            int32_t,
-                                            false);
-CONDITIONAL_LEFT_SEMI_JOIN_BENCHMARK_DEFINE(conditional_left_semi_join_64bit,
-                                            int64_t,
-                                            int64_t,
-                                            false);
-CONDITIONAL_LEFT_SEMI_JOIN_BENCHMARK_DEFINE(conditional_left_semi_join_32bit_nulls,
-                                            int32_t,
-                                            int32_t,
-                                            true);
-CONDITIONAL_LEFT_SEMI_JOIN_BENCHMARK_DEFINE(conditional_left_semi_join_64bit_nulls,
-                                            int64_t,
-                                            int64_t,
-                                            true);
+CONDITIONAL_LEFT_SEMI_JOIN_BENCHMARK_DEFINE(conditional_left_semi_join_32bit, int32_t, false);
+CONDITIONAL_LEFT_SEMI_JOIN_BENCHMARK_DEFINE(conditional_left_semi_join_64bit, int64_t, false);
+CONDITIONAL_LEFT_SEMI_JOIN_BENCHMARK_DEFINE(conditional_left_semi_join_32bit_nulls, int32_t, true);
+CONDITIONAL_LEFT_SEMI_JOIN_BENCHMARK_DEFINE(conditional_left_semi_join_64bit_nulls, int64_t, true);
 
 // inner join -----------------------------------------------------------------------
 BENCHMARK_REGISTER_F(ConditionalJoin, conditional_inner_join_32bit)
diff --git a/cpp/benchmarks/join/distinct_join.cu b/cpp/benchmarks/join/distinct_join.cu
index 4a68ee3878e..af8fa1f9d94 100644
--- a/cpp/benchmarks/join/distinct_join.cu
+++ b/cpp/benchmarks/join/distinct_join.cu
@@ -16,12 +16,10 @@
 
 #include "join_common.hpp"
 
-template <typename key_type, typename payload_type, bool Nullable>
+template <typename Key, bool Nullable>
 void distinct_inner_join(nvbench::state& state,
-                         nvbench::type_list<key_type, payload_type, nvbench::enum_type<Nullable>>)
+                         nvbench::type_list<Key, nvbench::enum_type<Nullable>>)
 {
-  skip_helper(state);
-
   auto join = [](cudf::table_view const& build_input,
                  cudf::table_view const& probe_input,
                  cudf::null_equality compare_nulls,
@@ -35,15 +33,13 @@ void distinct_inner_join(nvbench::state& state,
     return hj_obj.inner_join(stream);
   };
 
-  BM_join<key_type, payload_type, Nullable>(state, join);
+  BM_join<Key, Nullable>(state, join);
 }
 
-template <typename key_type, typename payload_type, bool Nullable>
+template <typename Key, bool Nullable>
 void distinct_left_join(nvbench::state& state,
-                        nvbench::type_list<key_type, payload_type, nvbench::enum_type<Nullable>>)
+                        nvbench::type_list<Key, nvbench::enum_type<Nullable>>)
 {
-  skip_helper(state);
-
   auto join = [](cudf::table_view const& build_input,
                  cudf::table_view const& probe_input,
                  cudf::null_equality compare_nulls,
@@ -57,65 +53,18 @@ void distinct_left_join(nvbench::state& state,
     return hj_obj.left_join(stream);
   };
 
-  BM_join<key_type, payload_type, Nullable>(state, join);
+  BM_join<Key, Nullable>(state, join);
 }
 
-// inner join -----------------------------------------------------------------------
 NVBENCH_BENCH_TYPES(distinct_inner_join,
-                    NVBENCH_TYPE_AXES(nvbench::type_list<nvbench::int32_t>,
-                                      nvbench::type_list<nvbench::int32_t>,
-                                      nvbench::enum_type_list<false>))
-  .set_name("distinct_inner_join_32bit")
-  .set_type_axes_names({"Key Type", "Payload Type", "Nullable"})
-  .add_int64_axis("Build Table Size", {100'000, 10'000'000, 80'000'000, 100'000'000})
-  .add_int64_axis("Probe Table Size",
-                  {100'000, 400'000, 10'000'000, 40'000'000, 100'000'000, 240'000'000});
-
-NVBENCH_BENCH_TYPES(distinct_inner_join,
-                    NVBENCH_TYPE_AXES(nvbench::type_list<nvbench::int64_t>,
-                                      nvbench::type_list<nvbench::int64_t>,
-                                      nvbench::enum_type_list<false>))
-  .set_name("distinct_inner_join_64bit")
-  .set_type_axes_names({"Key Type", "Payload Type", "Nullable"})
-  .add_int64_axis("Build Table Size", {40'000'000, 50'000'000})
-  .add_int64_axis("Probe Table Size", {50'000'000, 120'000'000});
-
-NVBENCH_BENCH_TYPES(distinct_inner_join,
-                    NVBENCH_TYPE_AXES(nvbench::type_list<nvbench::int32_t>,
-                                      nvbench::type_list<nvbench::int32_t>,
-                                      nvbench::enum_type_list<true>))
-  .set_name("distinct_inner_join_32bit_nulls")
-  .set_type_axes_names({"Key Type", "Payload Type", "Nullable"})
-  .add_int64_axis("Build Table Size", {100'000, 10'000'000, 80'000'000, 100'000'000})
-  .add_int64_axis("Probe Table Size",
-                  {100'000, 400'000, 10'000'000, 40'000'000, 100'000'000, 240'000'000});
-
-NVBENCH_BENCH_TYPES(distinct_inner_join,
-                    NVBENCH_TYPE_AXES(nvbench::type_list<nvbench::int64_t>,
-                                      nvbench::type_list<nvbench::int64_t>,
-                                      nvbench::enum_type_list<true>))
-  .set_name("distinct_inner_join_64bit_nulls")
-  .set_type_axes_names({"Key Type", "Payload Type", "Nullable"})
-  .add_int64_axis("Build Table Size", {40'000'000, 50'000'000})
-  .add_int64_axis("Probe Table Size", {50'000'000, 120'000'000});
-
-// left join ------------------------------------------------------------------------
-NVBENCH_BENCH_TYPES(distinct_left_join,
-                    NVBENCH_TYPE_AXES(nvbench::type_list<nvbench::int32_t>,
-                                      nvbench::type_list<nvbench::int32_t>,
-                                      nvbench::enum_type_list<false>))
-  .set_name("distinct_left_join_32bit")
-  .set_type_axes_names({"Key Type", "Payload Type", "Nullable"})
-  .add_int64_axis("Build Table Size", {100'000, 10'000'000, 80'000'000, 100'000'000})
-  .add_int64_axis("Probe Table Size",
-                  {100'000, 400'000, 10'000'000, 40'000'000, 100'000'000, 240'000'000});
+                    NVBENCH_TYPE_AXES(JOIN_KEY_TYPE_RANGE, JOIN_NULLABLE_RANGE))
+  .set_name("distinct_inner_join")
+  .set_type_axes_names({"Key", "Nullable"})
+  .add_int64_axis("left_size", JOIN_SIZE_RANGE)
+  .add_int64_axis("right_size", JOIN_SIZE_RANGE);
 
-NVBENCH_BENCH_TYPES(distinct_left_join,
-                    NVBENCH_TYPE_AXES(nvbench::type_list<nvbench::int32_t>,
-                                      nvbench::type_list<nvbench::int32_t>,
-                                      nvbench::enum_type_list<true>))
-  .set_name("distinct_left_join_32bit_nulls")
-  .set_type_axes_names({"Key Type", "Payload Type", "Nullable"})
-  .add_int64_axis("Build Table Size", {100'000, 10'000'000, 80'000'000, 100'000'000})
-  .add_int64_axis("Probe Table Size",
-                  {100'000, 400'000, 10'000'000, 40'000'000, 100'000'000, 240'000'000});
+NVBENCH_BENCH_TYPES(distinct_left_join, NVBENCH_TYPE_AXES(JOIN_KEY_TYPE_RANGE, JOIN_NULLABLE_RANGE))
+  .set_name("distinct_left_join")
+  .set_type_axes_names({"Key", "Nullable"})
+  .add_int64_axis("left_size", JOIN_SIZE_RANGE)
+  .add_int64_axis("right_size", JOIN_SIZE_RANGE);
diff --git a/cpp/benchmarks/join/join.cu b/cpp/benchmarks/join/join.cu
index 1c02a4488ac..c4a39da4662 100644
--- a/cpp/benchmarks/join/join.cu
+++ b/cpp/benchmarks/join/join.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,12 +16,10 @@
 
 #include <benchmarks/join/join_common.hpp>
 
-template <typename key_type, typename payload_type, bool Nullable>
+template <typename Key, bool Nullable>
 void nvbench_inner_join(nvbench::state& state,
-                        nvbench::type_list<key_type, payload_type, nvbench::enum_type<Nullable>>)
+                        nvbench::type_list<Key, nvbench::enum_type<Nullable>>)
 {
-  skip_helper(state);
-
   auto join = [](cudf::table_view const& left_input,
                  cudf::table_view const& right_input,
                  cudf::null_equality compare_nulls,
@@ -33,15 +31,12 @@ void nvbench_inner_join(nvbench::state& state,
     return hj_obj.inner_join(right_input, std::nullopt, stream);
   };
 
-  BM_join<key_type, payload_type, Nullable>(state, join);
+  BM_join<Key, Nullable>(state, join);
 }
 
-template <typename key_type, typename payload_type, bool Nullable>
-void nvbench_left_join(nvbench::state& state,
-                       nvbench::type_list<key_type, payload_type, nvbench::enum_type<Nullable>>)
+template <typename Key, bool Nullable>
+void nvbench_left_join(nvbench::state& state, nvbench::type_list<Key, nvbench::enum_type<Nullable>>)
 {
-  skip_helper(state);
-
   auto join = [](cudf::table_view const& left_input,
                  cudf::table_view const& right_input,
                  cudf::null_equality compare_nulls,
@@ -53,15 +48,12 @@ void nvbench_left_join(nvbench::state& state,
     return hj_obj.left_join(right_input, std::nullopt, stream);
   };
 
-  BM_join<key_type, payload_type, Nullable>(state, join);
+  BM_join<Key, Nullable>(state, join);
 }
 
-template <typename key_type, typename payload_type, bool Nullable>
-void nvbench_full_join(nvbench::state& state,
-                       nvbench::type_list<key_type, payload_type, nvbench::enum_type<Nullable>>)
+template <typename Key, bool Nullable>
+void nvbench_full_join(nvbench::state& state, nvbench::type_list<Key, nvbench::enum_type<Nullable>>)
 {
-  skip_helper(state);
-
   auto join = [](cudf::table_view const& left_input,
                  cudf::table_view const& right_input,
                  cudf::null_equality compare_nulls,
@@ -73,122 +65,23 @@ void nvbench_full_join(nvbench::state& state,
     return hj_obj.full_join(right_input, std::nullopt, stream);
   };
 
-  BM_join<key_type, payload_type, Nullable>(state, join);
+  BM_join<Key, Nullable>(state, join);
 }
 
-// inner join -----------------------------------------------------------------------
-NVBENCH_BENCH_TYPES(nvbench_inner_join,
-                    NVBENCH_TYPE_AXES(nvbench::type_list<nvbench::int32_t>,
-                                      nvbench::type_list<nvbench::int32_t>,
-                                      nvbench::enum_type_list<false>))
-  .set_name("inner_join_32bit")
-  .set_type_axes_names({"Key Type", "Payload Type", "Nullable"})
-  .add_int64_axis("Build Table Size", {100'000, 10'000'000, 80'000'000, 100'000'000})
-  .add_int64_axis("Probe Table Size",
-                  {100'000, 400'000, 10'000'000, 40'000'000, 100'000'000, 240'000'000});
-
-NVBENCH_BENCH_TYPES(nvbench_inner_join,
-                    NVBENCH_TYPE_AXES(nvbench::type_list<nvbench::int64_t>,
-                                      nvbench::type_list<nvbench::int64_t>,
-                                      nvbench::enum_type_list<false>))
-  .set_name("inner_join_64bit")
-  .set_type_axes_names({"Key Type", "Payload Type", "Nullable"})
-  .add_int64_axis("Build Table Size", {40'000'000, 50'000'000})
-  .add_int64_axis("Probe Table Size", {50'000'000, 120'000'000});
-
-NVBENCH_BENCH_TYPES(nvbench_inner_join,
-                    NVBENCH_TYPE_AXES(nvbench::type_list<nvbench::int32_t>,
-                                      nvbench::type_list<nvbench::int32_t>,
-                                      nvbench::enum_type_list<true>))
-  .set_name("inner_join_32bit_nulls")
-  .set_type_axes_names({"Key Type", "Payload Type", "Nullable"})
-  .add_int64_axis("Build Table Size", {100'000, 10'000'000, 80'000'000, 100'000'000})
-  .add_int64_axis("Probe Table Size",
-                  {100'000, 400'000, 10'000'000, 40'000'000, 100'000'000, 240'000'000});
-
-NVBENCH_BENCH_TYPES(nvbench_inner_join,
-                    NVBENCH_TYPE_AXES(nvbench::type_list<nvbench::int64_t>,
-                                      nvbench::type_list<nvbench::int64_t>,
-                                      nvbench::enum_type_list<true>))
-  .set_name("inner_join_64bit_nulls")
-  .set_type_axes_names({"Key Type", "Payload Type", "Nullable"})
-  .add_int64_axis("Build Table Size", {40'000'000, 50'000'000})
-  .add_int64_axis("Probe Table Size", {50'000'000, 120'000'000});
-
-// left join ------------------------------------------------------------------------
-NVBENCH_BENCH_TYPES(nvbench_left_join,
-                    NVBENCH_TYPE_AXES(nvbench::type_list<nvbench::int32_t>,
-                                      nvbench::type_list<nvbench::int32_t>,
-                                      nvbench::enum_type_list<false>))
-  .set_name("left_join_32bit")
-  .set_type_axes_names({"Key Type", "Payload Type", "Nullable"})
-  .add_int64_axis("Build Table Size", {100'000, 10'000'000, 80'000'000, 100'000'000})
-  .add_int64_axis("Probe Table Size",
-                  {100'000, 400'000, 10'000'000, 40'000'000, 100'000'000, 240'000'000});
-
-NVBENCH_BENCH_TYPES(nvbench_left_join,
-                    NVBENCH_TYPE_AXES(nvbench::type_list<nvbench::int64_t>,
-                                      nvbench::type_list<nvbench::int64_t>,
-                                      nvbench::enum_type_list<false>))
-  .set_name("left_join_64bit")
-  .set_type_axes_names({"Key Type", "Payload Type", "Nullable"})
-  .add_int64_axis("Build Table Size", {40'000'000, 50'000'000})
-  .add_int64_axis("Probe Table Size", {50'000'000, 120'000'000});
-
-NVBENCH_BENCH_TYPES(nvbench_left_join,
-                    NVBENCH_TYPE_AXES(nvbench::type_list<nvbench::int32_t>,
-                                      nvbench::type_list<nvbench::int32_t>,
-                                      nvbench::enum_type_list<true>))
-  .set_name("left_join_32bit_nulls")
-  .set_type_axes_names({"Key Type", "Payload Type", "Nullable"})
-  .add_int64_axis("Build Table Size", {100'000, 10'000'000, 80'000'000, 100'000'000})
-  .add_int64_axis("Probe Table Size",
-                  {100'000, 400'000, 10'000'000, 40'000'000, 100'000'000, 240'000'000});
-
-NVBENCH_BENCH_TYPES(nvbench_left_join,
-                    NVBENCH_TYPE_AXES(nvbench::type_list<nvbench::int64_t>,
-                                      nvbench::type_list<nvbench::int64_t>,
-                                      nvbench::enum_type_list<true>))
-  .set_name("left_join_64bit_nulls")
-  .set_type_axes_names({"Key Type", "Payload Type", "Nullable"})
-  .add_int64_axis("Build Table Size", {40'000'000, 50'000'000})
-  .add_int64_axis("Probe Table Size", {50'000'000, 120'000'000});
-
-// full join ------------------------------------------------------------------------
-NVBENCH_BENCH_TYPES(nvbench_full_join,
-                    NVBENCH_TYPE_AXES(nvbench::type_list<nvbench::int32_t>,
-                                      nvbench::type_list<nvbench::int32_t>,
-                                      nvbench::enum_type_list<false>))
-  .set_name("full_join_32bit")
-  .set_type_axes_names({"Key Type", "Payload Type", "Nullable"})
-  .add_int64_axis("Build Table Size", {100'000, 10'000'000, 80'000'000, 100'000'000})
-  .add_int64_axis("Probe Table Size",
-                  {100'000, 400'000, 10'000'000, 40'000'000, 100'000'000, 240'000'000});
-
-NVBENCH_BENCH_TYPES(nvbench_full_join,
-                    NVBENCH_TYPE_AXES(nvbench::type_list<nvbench::int64_t>,
-                                      nvbench::type_list<nvbench::int64_t>,
-                                      nvbench::enum_type_list<false>))
-  .set_name("full_join_64bit")
-  .set_type_axes_names({"Key Type", "Payload Type", "Nullable"})
-  .add_int64_axis("Build Table Size", {40'000'000, 50'000'000})
-  .add_int64_axis("Probe Table Size", {50'000'000, 120'000'000});
-
-NVBENCH_BENCH_TYPES(nvbench_full_join,
-                    NVBENCH_TYPE_AXES(nvbench::type_list<nvbench::int32_t>,
-                                      nvbench::type_list<nvbench::int32_t>,
-                                      nvbench::enum_type_list<true>))
-  .set_name("full_join_32bit_nulls")
-  .set_type_axes_names({"Key Type", "Payload Type", "Nullable"})
-  .add_int64_axis("Build Table Size", {100'000, 10'000'000, 80'000'000, 100'000'000})
-  .add_int64_axis("Probe Table Size",
-                  {100'000, 400'000, 10'000'000, 40'000'000, 100'000'000, 240'000'000});
-
-NVBENCH_BENCH_TYPES(nvbench_full_join,
-                    NVBENCH_TYPE_AXES(nvbench::type_list<nvbench::int64_t>,
-                                      nvbench::type_list<nvbench::int64_t>,
-                                      nvbench::enum_type_list<true>))
-  .set_name("full_join_64bit_nulls")
-  .set_type_axes_names({"Key Type", "Payload Type", "Nullable"})
-  .add_int64_axis("Build Table Size", {40'000'000, 50'000'000})
-  .add_int64_axis("Probe Table Size", {50'000'000, 120'000'000});
+NVBENCH_BENCH_TYPES(nvbench_inner_join, NVBENCH_TYPE_AXES(JOIN_KEY_TYPE_RANGE, JOIN_NULLABLE_RANGE))
+  .set_name("inner_join")
+  .set_type_axes_names({"Key", "Nullable"})
+  .add_int64_axis("left_size", JOIN_SIZE_RANGE)
+  .add_int64_axis("right_size", JOIN_SIZE_RANGE);
+
+NVBENCH_BENCH_TYPES(nvbench_left_join, NVBENCH_TYPE_AXES(JOIN_KEY_TYPE_RANGE, JOIN_NULLABLE_RANGE))
+  .set_name("left_join")
+  .set_type_axes_names({"Key", "Nullable"})
+  .add_int64_axis("left_size", JOIN_SIZE_RANGE)
+  .add_int64_axis("right_size", JOIN_SIZE_RANGE);
+
+NVBENCH_BENCH_TYPES(nvbench_full_join, NVBENCH_TYPE_AXES(JOIN_KEY_TYPE_RANGE, JOIN_NULLABLE_RANGE))
+  .set_name("full_join")
+  .set_type_axes_names({"Key", "Nullable"})
+  .add_int64_axis("left_size", JOIN_SIZE_RANGE)
+  .add_int64_axis("right_size", JOIN_SIZE_RANGE);
diff --git a/cpp/benchmarks/join/join_common.hpp b/cpp/benchmarks/join/join_common.hpp
index 9f869ddb1ac..9e23d28b363 100644
--- a/cpp/benchmarks/join/join_common.hpp
+++ b/cpp/benchmarks/join/join_common.hpp
@@ -41,6 +41,11 @@
 
 #include <vector>
 
+using JOIN_KEY_TYPE_RANGE = nvbench::type_list<nvbench::int32_t, nvbench::int64_t>;
+using JOIN_NULLABLE_RANGE = nvbench::enum_type_list<false, true>;
+
+auto const JOIN_SIZE_RANGE = std::vector<nvbench::int64_t>{1000, 100'000, 10'000'000};
+
 struct null75_generator {
   thrust::minstd_rand engine;
   thrust::uniform_int_distribution<unsigned> rand_gen;
@@ -55,52 +60,42 @@ struct null75_generator {
 
 enum class join_t { CONDITIONAL, MIXED, HASH };
 
-inline void skip_helper(nvbench::state& state)
-{
-  auto const build_table_size = state.get_int64("Build Table Size");
-  auto const probe_table_size = state.get_int64("Probe Table Size");
-
-  if (build_table_size > probe_table_size) {
-    state.skip("Large build tables are skipped.");
-    return;
-  }
-
-  if (build_table_size * 100 <= probe_table_size) {
-    state.skip("Large probe tables are skipped.");
-    return;
-  }
-}
-
-template <typename key_type,
-          typename payload_type,
+template <typename Key,
           bool Nullable,
           join_t join_type = join_t::HASH,
           typename state_type,
           typename Join>
 void BM_join(state_type& state, Join JoinFunc)
 {
-  auto const build_table_size = [&]() {
+  auto const right_size = [&]() {
     if constexpr (std::is_same_v<state_type, benchmark::State>) {
       return static_cast<cudf::size_type>(state.range(0));
     }
     if constexpr (std::is_same_v<state_type, nvbench::state>) {
-      return static_cast<cudf::size_type>(state.get_int64("Build Table Size"));
+      return static_cast<cudf::size_type>(state.get_int64("right_size"));
     }
   }();
-  auto const probe_table_size = [&]() {
+  auto const left_size = [&]() {
     if constexpr (std::is_same_v<state_type, benchmark::State>) {
       return static_cast<cudf::size_type>(state.range(1));
     }
     if constexpr (std::is_same_v<state_type, nvbench::state>) {
-      return static_cast<cudf::size_type>(state.get_int64("Probe Table Size"));
+      return static_cast<cudf::size_type>(state.get_int64("left_size"));
     }
   }();
 
+  if constexpr (std::is_same_v<state_type, nvbench::state>) {
+    if (right_size > left_size) {
+      state.skip("Skip large right table");
+      return;
+    }
+  }
+
   double const selectivity = 0.3;
   int const multiplicity   = 1;
 
   // Generate build and probe tables
-  auto build_random_null_mask = [](int size) {
+  auto right_random_null_mask = [](int size) {
     // roughly 75% nulls
     auto validity =
       thrust::make_transform_iterator(thrust::make_counting_iterator(0), null75_generator{});
@@ -111,62 +106,62 @@ void BM_join(state_type& state, Join JoinFunc)
                                   rmm::mr::get_current_device_resource());
   };
 
-  std::unique_ptr<cudf::column> build_key_column0 = [&]() {
-    auto [null_mask, null_count] = build_random_null_mask(build_table_size);
-    return Nullable ? cudf::make_numeric_column(cudf::data_type(cudf::type_to_id<key_type>()),
-                                                build_table_size,
-                                                std::move(null_mask),
-                                                null_count)
-                    : cudf::make_numeric_column(cudf::data_type(cudf::type_to_id<key_type>()),
-                                                build_table_size);
+  std::unique_ptr<cudf::column> right_key_column0 = [&]() {
+    auto [null_mask, null_count] = right_random_null_mask(right_size);
+    return Nullable
+             ? cudf::make_numeric_column(cudf::data_type(cudf::type_to_id<Key>()),
+                                         right_size,
+                                         std::move(null_mask),
+                                         null_count)
+             : cudf::make_numeric_column(cudf::data_type(cudf::type_to_id<Key>()), right_size);
   }();
-  std::unique_ptr<cudf::column> probe_key_column0 = [&]() {
-    auto [null_mask, null_count] = build_random_null_mask(probe_table_size);
-    return Nullable ? cudf::make_numeric_column(cudf::data_type(cudf::type_to_id<key_type>()),
-                                                probe_table_size,
-                                                std::move(null_mask),
-                                                null_count)
-                    : cudf::make_numeric_column(cudf::data_type(cudf::type_to_id<key_type>()),
-                                                probe_table_size);
+  std::unique_ptr<cudf::column> left_key_column0 = [&]() {
+    auto [null_mask, null_count] = right_random_null_mask(left_size);
+    return Nullable
+             ? cudf::make_numeric_column(cudf::data_type(cudf::type_to_id<Key>()),
+                                         left_size,
+                                         std::move(null_mask),
+                                         null_count)
+             : cudf::make_numeric_column(cudf::data_type(cudf::type_to_id<Key>()), left_size);
   }();
 
-  generate_input_tables<key_type, cudf::size_type>(
-    build_key_column0->mutable_view().data<key_type>(),
-    build_table_size,
-    probe_key_column0->mutable_view().data<key_type>(),
-    probe_table_size,
-    selectivity,
-    multiplicity);
+  // build table is right table, probe table is left table
+  generate_input_tables<Key, cudf::size_type>(right_key_column0->mutable_view().data<Key>(),
+                                              right_size,
+                                              left_key_column0->mutable_view().data<Key>(),
+                                              left_size,
+                                              selectivity,
+                                              multiplicity);
 
-  // Copy build_key_column0 and probe_key_column0 into new columns.
+  // Copy right_key_column0 and left_key_column0 into new columns.
   // If Nullable, the new columns will be assigned new nullmasks.
-  auto const build_key_column1 = [&]() {
-    auto col = std::make_unique<cudf::column>(build_key_column0->view());
+  auto const right_key_column1 = [&]() {
+    auto col = std::make_unique<cudf::column>(right_key_column0->view());
     if (Nullable) {
-      auto [null_mask, null_count] = build_random_null_mask(build_table_size);
+      auto [null_mask, null_count] = right_random_null_mask(right_size);
       col->set_null_mask(std::move(null_mask), null_count);
     }
     return col;
   }();
-  auto const probe_key_column1 = [&]() {
-    auto col = std::make_unique<cudf::column>(probe_key_column0->view());
+  auto const left_key_column1 = [&]() {
+    auto col = std::make_unique<cudf::column>(left_key_column0->view());
     if (Nullable) {
-      auto [null_mask, null_count] = build_random_null_mask(probe_table_size);
+      auto [null_mask, null_count] = right_random_null_mask(left_size);
       col->set_null_mask(std::move(null_mask), null_count);
     }
     return col;
   }();
 
-  auto init = cudf::make_fixed_width_scalar<payload_type>(static_cast<payload_type>(0));
-  auto build_payload_column = cudf::sequence(build_table_size, *init);
-  auto probe_payload_column = cudf::sequence(probe_table_size, *init);
+  auto init                 = cudf::make_fixed_width_scalar<Key>(static_cast<Key>(0));
+  auto right_payload_column = cudf::sequence(right_size, *init);
+  auto left_payload_column  = cudf::sequence(left_size, *init);
 
   CUDF_CHECK_CUDA(0);
 
-  cudf::table_view build_table(
-    {build_key_column0->view(), build_key_column1->view(), *build_payload_column});
-  cudf::table_view probe_table(
-    {probe_key_column0->view(), probe_key_column1->view(), *probe_payload_column});
+  cudf::table_view right_table(
+    {right_key_column0->view(), right_key_column1->view(), *right_payload_column});
+  cudf::table_view left_table(
+    {left_key_column0->view(), left_key_column1->view(), *left_payload_column});
 
   // Setup join parameters and result table
   [[maybe_unused]] std::vector<cudf::size_type> columns_to_join = {0};
@@ -177,8 +172,8 @@ void BM_join(state_type& state, Join JoinFunc)
     for (auto _ : state) {
       cuda_event_timer raii(state, true, cudf::get_default_stream());
 
-      auto result = JoinFunc(probe_table.select(columns_to_join),
-                             build_table.select(columns_to_join),
+      auto result = JoinFunc(left_table.select(columns_to_join),
+                             right_table.select(columns_to_join),
                              cudf::null_equality::UNEQUAL);
     }
   }
@@ -191,10 +186,10 @@ void BM_join(state_type& state, Join JoinFunc)
         cudf::ast::operation(cudf::ast::ast_operator::EQUAL, col_ref_left_0, col_ref_right_0);
       state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
         rmm::cuda_stream_view stream_view{launch.get_stream()};
-        auto result = JoinFunc(probe_table.select(columns_to_join),
-                               build_table.select(columns_to_join),
-                               probe_table.select({1}),
-                               build_table.select({1}),
+        auto result = JoinFunc(left_table.select(columns_to_join),
+                               right_table.select(columns_to_join),
+                               left_table.select({1}),
+                               right_table.select({1}),
                                left_zero_eq_right_zero,
                                cudf::null_equality::UNEQUAL,
                                stream_view);
@@ -203,8 +198,8 @@ void BM_join(state_type& state, Join JoinFunc)
     if constexpr (join_type == join_t::HASH) {
       state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
         rmm::cuda_stream_view stream_view{launch.get_stream()};
-        auto result = JoinFunc(probe_table.select(columns_to_join),
-                               build_table.select(columns_to_join),
+        auto result = JoinFunc(left_table.select(columns_to_join),
+                               right_table.select(columns_to_join),
                                cudf::null_equality::UNEQUAL,
                                stream_view);
       });
@@ -223,7 +218,7 @@ void BM_join(state_type& state, Join JoinFunc)
       cuda_event_timer raii(state, true, cudf::get_default_stream());
 
       auto result =
-        JoinFunc(probe_table, build_table, left_zero_eq_right_zero, cudf::null_equality::UNEQUAL);
+        JoinFunc(left_table, right_table, left_zero_eq_right_zero, cudf::null_equality::UNEQUAL);
     }
   }
 }
diff --git a/cpp/benchmarks/join/left_join.cu b/cpp/benchmarks/join/left_join.cu
index 96bbd1bc58e..3e398e721fa 100644
--- a/cpp/benchmarks/join/left_join.cu
+++ b/cpp/benchmarks/join/left_join.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,42 +16,42 @@
 
 #include <benchmarks/join/join_common.hpp>
 
-template <typename key_type, typename payload_type>
+template <typename Key>
 class Join : public cudf::benchmark {};
 
-#define LEFT_ANTI_JOIN_BENCHMARK_DEFINE(name, key_type, payload_type, nullable) \
-  BENCHMARK_TEMPLATE_DEFINE_F(Join, name, key_type, payload_type)               \
-  (::benchmark::State & st)                                                     \
-  {                                                                             \
-    auto join = [](cudf::table_view const& left,                                \
-                   cudf::table_view const& right,                               \
-                   cudf::null_equality compare_nulls) {                         \
-      return cudf::left_anti_join(left, right, compare_nulls);                  \
-    };                                                                          \
-    BM_join<key_type, payload_type, nullable>(st, join);                        \
+#define LEFT_ANTI_JOIN_BENCHMARK_DEFINE(name, Key, Nullable)   \
+  BENCHMARK_TEMPLATE_DEFINE_F(Join, name, Key)                 \
+  (::benchmark::State & st)                                    \
+  {                                                            \
+    auto join = [](cudf::table_view const& left,               \
+                   cudf::table_view const& right,              \
+                   cudf::null_equality compare_nulls) {        \
+      return cudf::left_anti_join(left, right, compare_nulls); \
+    };                                                         \
+    BM_join<Key, Nullable>(st, join);                          \
   }
 
-LEFT_ANTI_JOIN_BENCHMARK_DEFINE(left_anti_join_32bit, int32_t, int32_t, false);
-LEFT_ANTI_JOIN_BENCHMARK_DEFINE(left_anti_join_64bit, int64_t, int64_t, false);
-LEFT_ANTI_JOIN_BENCHMARK_DEFINE(left_anti_join_32bit_nulls, int32_t, int32_t, true);
-LEFT_ANTI_JOIN_BENCHMARK_DEFINE(left_anti_join_64bit_nulls, int64_t, int64_t, true);
+LEFT_ANTI_JOIN_BENCHMARK_DEFINE(left_anti_join_32bit, int32_t, false);
+LEFT_ANTI_JOIN_BENCHMARK_DEFINE(left_anti_join_64bit, int64_t, false);
+LEFT_ANTI_JOIN_BENCHMARK_DEFINE(left_anti_join_32bit_nulls, int32_t, true);
+LEFT_ANTI_JOIN_BENCHMARK_DEFINE(left_anti_join_64bit_nulls, int64_t, true);
 
-#define LEFT_SEMI_JOIN_BENCHMARK_DEFINE(name, key_type, payload_type, nullable) \
-  BENCHMARK_TEMPLATE_DEFINE_F(Join, name, key_type, payload_type)               \
-  (::benchmark::State & st)                                                     \
-  {                                                                             \
-    auto join = [](cudf::table_view const& left,                                \
-                   cudf::table_view const& right,                               \
-                   cudf::null_equality compare_nulls) {                         \
-      return cudf::left_semi_join(left, right, compare_nulls);                  \
-    };                                                                          \
-    BM_join<key_type, payload_type, nullable>(st, join);                        \
+#define LEFT_SEMI_JOIN_BENCHMARK_DEFINE(name, Key, Nullable)   \
+  BENCHMARK_TEMPLATE_DEFINE_F(Join, name, Key)                 \
+  (::benchmark::State & st)                                    \
+  {                                                            \
+    auto join = [](cudf::table_view const& left,               \
+                   cudf::table_view const& right,              \
+                   cudf::null_equality compare_nulls) {        \
+      return cudf::left_semi_join(left, right, compare_nulls); \
+    };                                                         \
+    BM_join<Key, Nullable>(st, join);                          \
   }
 
-LEFT_SEMI_JOIN_BENCHMARK_DEFINE(left_semi_join_32bit, int32_t, int32_t, false);
-LEFT_SEMI_JOIN_BENCHMARK_DEFINE(left_semi_join_64bit, int64_t, int64_t, false);
-LEFT_SEMI_JOIN_BENCHMARK_DEFINE(left_semi_join_32bit_nulls, int32_t, int32_t, true);
-LEFT_SEMI_JOIN_BENCHMARK_DEFINE(left_semi_join_64bit_nulls, int64_t, int64_t, true);
+LEFT_SEMI_JOIN_BENCHMARK_DEFINE(left_semi_join_32bit, int32_t, false);
+LEFT_SEMI_JOIN_BENCHMARK_DEFINE(left_semi_join_64bit, int64_t, false);
+LEFT_SEMI_JOIN_BENCHMARK_DEFINE(left_semi_join_32bit_nulls, int32_t, true);
+LEFT_SEMI_JOIN_BENCHMARK_DEFINE(left_semi_join_64bit_nulls, int64_t, true);
 
 // left anti-join -------------------------------------------------------------
 BENCHMARK_REGISTER_F(Join, left_anti_join_32bit)
diff --git a/cpp/benchmarks/join/mixed_join.cu b/cpp/benchmarks/join/mixed_join.cu
index 67be4640f84..129ea62e7a6 100644
--- a/cpp/benchmarks/join/mixed_join.cu
+++ b/cpp/benchmarks/join/mixed_join.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,12 +16,10 @@
 
 #include <benchmarks/join/join_common.hpp>
 
-template <typename key_type, typename payload_type, bool Nullable>
-void nvbench_mixed_inner_join(
-  nvbench::state& state, nvbench::type_list<key_type, payload_type, nvbench::enum_type<Nullable>>)
+template <typename Key, bool Nullable>
+void nvbench_mixed_inner_join(nvbench::state& state,
+                              nvbench::type_list<Key, nvbench::enum_type<Nullable>>)
 {
-  skip_helper(state);
-
   auto join = [](cudf::table_view const& left_equality_input,
                  cudf::table_view const& right_equality_input,
                  cudf::table_view const& left_conditional_input,
@@ -37,15 +35,13 @@ void nvbench_mixed_inner_join(
                                   compare_nulls);
   };
 
-  BM_join<key_type, payload_type, Nullable, join_t::MIXED>(state, join);
+  BM_join<Key, Nullable, join_t::MIXED>(state, join);
 }
 
-template <typename key_type, typename payload_type, bool Nullable>
-void nvbench_mixed_left_join(
-  nvbench::state& state, nvbench::type_list<key_type, payload_type, nvbench::enum_type<Nullable>>)
+template <typename Key, bool Nullable>
+void nvbench_mixed_left_join(nvbench::state& state,
+                             nvbench::type_list<Key, nvbench::enum_type<Nullable>>)
 {
-  skip_helper(state);
-
   auto join = [](cudf::table_view const& left_equality_input,
                  cudf::table_view const& right_equality_input,
                  cudf::table_view const& left_conditional_input,
@@ -61,15 +57,13 @@ void nvbench_mixed_left_join(
                                  compare_nulls);
   };
 
-  BM_join<key_type, payload_type, Nullable, join_t::MIXED>(state, join);
+  BM_join<Key, Nullable, join_t::MIXED>(state, join);
 }
 
-template <typename key_type, typename payload_type, bool Nullable>
-void nvbench_mixed_full_join(
-  nvbench::state& state, nvbench::type_list<key_type, payload_type, nvbench::enum_type<Nullable>>)
+template <typename Key, bool Nullable>
+void nvbench_mixed_full_join(nvbench::state& state,
+                             nvbench::type_list<Key, nvbench::enum_type<Nullable>>)
 {
-  skip_helper(state);
-
   auto join = [](cudf::table_view const& left_equality_input,
                  cudf::table_view const& right_equality_input,
                  cudf::table_view const& left_conditional_input,
@@ -85,15 +79,13 @@ void nvbench_mixed_full_join(
                                  compare_nulls);
   };
 
-  BM_join<key_type, payload_type, Nullable, join_t::MIXED>(state, join);
+  BM_join<Key, Nullable, join_t::MIXED>(state, join);
 }
 
-template <typename key_type, typename payload_type, bool Nullable>
-void nvbench_mixed_left_semi_join(
-  nvbench::state& state, nvbench::type_list<key_type, payload_type, nvbench::enum_type<Nullable>>)
+template <typename Key, bool Nullable>
+void nvbench_mixed_left_semi_join(nvbench::state& state,
+                                  nvbench::type_list<Key, nvbench::enum_type<Nullable>>)
 {
-  skip_helper(state);
-
   auto join = [](cudf::table_view const& left_equality_input,
                  cudf::table_view const& right_equality_input,
                  cudf::table_view const& left_conditional_input,
@@ -109,15 +101,13 @@ void nvbench_mixed_left_semi_join(
                                       compare_nulls);
   };
 
-  BM_join<key_type, payload_type, Nullable, join_t::MIXED>(state, join);
+  BM_join<Key, Nullable, join_t::MIXED>(state, join);
 }
 
-template <typename key_type, typename payload_type, bool Nullable>
-void nvbench_mixed_left_anti_join(
-  nvbench::state& state, nvbench::type_list<key_type, payload_type, nvbench::enum_type<Nullable>>)
+template <typename Key, bool Nullable>
+void nvbench_mixed_left_anti_join(nvbench::state& state,
+                                  nvbench::type_list<Key, nvbench::enum_type<Nullable>>)
 {
-  skip_helper(state);
-
   auto join = [](cudf::table_view const& left_equality_input,
                  cudf::table_view const& right_equality_input,
                  cudf::table_view const& left_conditional_input,
@@ -133,200 +123,40 @@ void nvbench_mixed_left_anti_join(
                                       compare_nulls);
   };
 
-  BM_join<key_type, payload_type, Nullable, join_t::MIXED>(state, join);
+  BM_join<Key, Nullable, join_t::MIXED>(state, join);
 }
 
-// inner join -----------------------------------------------------------------------
-NVBENCH_BENCH_TYPES(nvbench_mixed_inner_join,
-                    NVBENCH_TYPE_AXES(nvbench::type_list<nvbench::int32_t>,
-                                      nvbench::type_list<nvbench::int32_t>,
-                                      nvbench::enum_type_list<false>))
-  .set_name("mixed_inner_join_32bit")
-  .set_type_axes_names({"Key Type", "Payload Type", "Nullable"})
-  .add_int64_axis("Build Table Size", {100'000, 10'000'000, 80'000'000, 100'000'000})
-  .add_int64_axis("Probe Table Size",
-                  {100'000, 400'000, 10'000'000, 40'000'000, 100'000'000, 240'000'000});
-
-NVBENCH_BENCH_TYPES(nvbench_mixed_inner_join,
-                    NVBENCH_TYPE_AXES(nvbench::type_list<nvbench::int64_t>,
-                                      nvbench::type_list<nvbench::int64_t>,
-                                      nvbench::enum_type_list<false>))
-  .set_name("mixed_inner_join_64bit")
-  .set_type_axes_names({"Key Type", "Payload Type", "Nullable"})
-  .add_int64_axis("Build Table Size", {40'000'000, 50'000'000})
-  .add_int64_axis("Probe Table Size", {50'000'000, 120'000'000});
-
 NVBENCH_BENCH_TYPES(nvbench_mixed_inner_join,
-                    NVBENCH_TYPE_AXES(nvbench::type_list<nvbench::int32_t>,
-                                      nvbench::type_list<nvbench::int32_t>,
-                                      nvbench::enum_type_list<true>))
-  .set_name("mixed_inner_join_32bit_nulls")
-  .set_type_axes_names({"Key Type", "Payload Type", "Nullable"})
-  .add_int64_axis("Build Table Size", {100'000, 10'000'000, 80'000'000, 100'000'000})
-  .add_int64_axis("Probe Table Size",
-                  {100'000, 400'000, 10'000'000, 40'000'000, 100'000'000, 240'000'000});
+                    NVBENCH_TYPE_AXES(JOIN_KEY_TYPE_RANGE, JOIN_NULLABLE_RANGE))
+  .set_name("mixed_inner_join")
+  .set_type_axes_names({"Key", "Nullable"})
+  .add_int64_axis("left_size", JOIN_SIZE_RANGE)
+  .add_int64_axis("right_size", JOIN_SIZE_RANGE);
 
-NVBENCH_BENCH_TYPES(nvbench_mixed_inner_join,
-                    NVBENCH_TYPE_AXES(nvbench::type_list<nvbench::int64_t>,
-                                      nvbench::type_list<nvbench::int64_t>,
-                                      nvbench::enum_type_list<true>))
-  .set_name("mixed_inner_join_64bit_nulls")
-  .set_type_axes_names({"Key Type", "Payload Type", "Nullable"})
-  .add_int64_axis("Build Table Size", {40'000'000, 50'000'000})
-  .add_int64_axis("Probe Table Size", {50'000'000, 120'000'000});
-
-// left join ------------------------------------------------------------------------
 NVBENCH_BENCH_TYPES(nvbench_mixed_left_join,
-                    NVBENCH_TYPE_AXES(nvbench::type_list<nvbench::int32_t>,
-                                      nvbench::type_list<nvbench::int32_t>,
-                                      nvbench::enum_type_list<false>))
-  .set_name("mixed_left_join_32bit")
-  .set_type_axes_names({"Key Type", "Payload Type", "Nullable"})
-  .add_int64_axis("Build Table Size", {100'000, 10'000'000, 80'000'000, 100'000'000})
-  .add_int64_axis("Probe Table Size",
-                  {100'000, 400'000, 10'000'000, 40'000'000, 100'000'000, 240'000'000});
+                    NVBENCH_TYPE_AXES(JOIN_KEY_TYPE_RANGE, JOIN_NULLABLE_RANGE))
+  .set_name("mixed_left_join")
+  .set_type_axes_names({"Key", "Nullable"})
+  .add_int64_axis("left_size", JOIN_SIZE_RANGE)
+  .add_int64_axis("right_size", JOIN_SIZE_RANGE);
 
-NVBENCH_BENCH_TYPES(nvbench_mixed_left_join,
-                    NVBENCH_TYPE_AXES(nvbench::type_list<nvbench::int64_t>,
-                                      nvbench::type_list<nvbench::int64_t>,
-                                      nvbench::enum_type_list<false>))
-  .set_name("mixed_left_join_64bit")
-  .set_type_axes_names({"Key Type", "Payload Type", "Nullable"})
-  .add_int64_axis("Build Table Size", {40'000'000, 50'000'000})
-  .add_int64_axis("Probe Table Size", {50'000'000, 120'000'000});
-
-NVBENCH_BENCH_TYPES(nvbench_mixed_left_join,
-                    NVBENCH_TYPE_AXES(nvbench::type_list<nvbench::int32_t>,
-                                      nvbench::type_list<nvbench::int32_t>,
-                                      nvbench::enum_type_list<true>))
-  .set_name("mixed_left_join_32bit_nulls")
-  .set_type_axes_names({"Key Type", "Payload Type", "Nullable"})
-  .add_int64_axis("Build Table Size", {100'000, 10'000'000, 80'000'000, 100'000'000})
-  .add_int64_axis("Probe Table Size",
-                  {100'000, 400'000, 10'000'000, 40'000'000, 100'000'000, 240'000'000});
-
-NVBENCH_BENCH_TYPES(nvbench_mixed_left_join,
-                    NVBENCH_TYPE_AXES(nvbench::type_list<nvbench::int64_t>,
-                                      nvbench::type_list<nvbench::int64_t>,
-                                      nvbench::enum_type_list<true>))
-  .set_name("mixed_left_join_64bit_nulls")
-  .set_type_axes_names({"Key Type", "Payload Type", "Nullable"})
-  .add_int64_axis("Build Table Size", {40'000'000, 50'000'000})
-  .add_int64_axis("Probe Table Size", {50'000'000, 120'000'000});
-
-// full join ------------------------------------------------------------------------
 NVBENCH_BENCH_TYPES(nvbench_mixed_full_join,
-                    NVBENCH_TYPE_AXES(nvbench::type_list<nvbench::int32_t>,
-                                      nvbench::type_list<nvbench::int32_t>,
-                                      nvbench::enum_type_list<false>))
-  .set_name("mixed_full_join_32bit")
-  .set_type_axes_names({"Key Type", "Payload Type", "Nullable"})
-  .add_int64_axis("Build Table Size", {100'000, 10'000'000, 80'000'000, 100'000'000})
-  .add_int64_axis("Probe Table Size",
-                  {100'000, 400'000, 10'000'000, 40'000'000, 100'000'000, 240'000'000});
-
-NVBENCH_BENCH_TYPES(nvbench_mixed_full_join,
-                    NVBENCH_TYPE_AXES(nvbench::type_list<nvbench::int64_t>,
-                                      nvbench::type_list<nvbench::int64_t>,
-                                      nvbench::enum_type_list<false>))
-  .set_name("mixed_full_join_64bit")
-  .set_type_axes_names({"Key Type", "Payload Type", "Nullable"})
-  .add_int64_axis("Build Table Size", {40'000'000, 50'000'000})
-  .add_int64_axis("Probe Table Size", {50'000'000, 120'000'000});
-
-NVBENCH_BENCH_TYPES(nvbench_mixed_full_join,
-                    NVBENCH_TYPE_AXES(nvbench::type_list<nvbench::int32_t>,
-                                      nvbench::type_list<nvbench::int32_t>,
-                                      nvbench::enum_type_list<true>))
-  .set_name("mixed_full_join_32bit_nulls")
-  .set_type_axes_names({"Key Type", "Payload Type", "Nullable"})
-  .add_int64_axis("Build Table Size", {100'000, 10'000'000, 80'000'000, 100'000'000})
-  .add_int64_axis("Probe Table Size",
-                  {100'000, 400'000, 10'000'000, 40'000'000, 100'000'000, 240'000'000});
-
-NVBENCH_BENCH_TYPES(nvbench_mixed_full_join,
-                    NVBENCH_TYPE_AXES(nvbench::type_list<nvbench::int64_t>,
-                                      nvbench::type_list<nvbench::int64_t>,
-                                      nvbench::enum_type_list<true>))
-  .set_name("mixed_full_join_64bit_nulls")
-  .set_type_axes_names({"Key Type", "Payload Type", "Nullable"})
-  .add_int64_axis("Build Table Size", {40'000'000, 50'000'000})
-  .add_int64_axis("Probe Table Size", {50'000'000, 120'000'000});
-
-// left semi join ------------------------------------------------------------------------
-NVBENCH_BENCH_TYPES(nvbench_mixed_left_semi_join,
-                    NVBENCH_TYPE_AXES(nvbench::type_list<nvbench::int32_t>,
-                                      nvbench::type_list<nvbench::int32_t>,
-                                      nvbench::enum_type_list<false>))
-  .set_name("mixed_left_semi_join_32bit")
-  .set_type_axes_names({"Key Type", "Payload Type", "Nullable"})
-  .add_int64_axis("Build Table Size", {100'000, 10'000'000, 80'000'000, 100'000'000})
-  .add_int64_axis("Probe Table Size",
-                  {100'000, 400'000, 10'000'000, 40'000'000, 100'000'000, 240'000'000});
-
-NVBENCH_BENCH_TYPES(nvbench_mixed_left_semi_join,
-                    NVBENCH_TYPE_AXES(nvbench::type_list<nvbench::int64_t>,
-                                      nvbench::type_list<nvbench::int64_t>,
-                                      nvbench::enum_type_list<false>))
-  .set_name("mixed_left_semi_join_64bit")
-  .set_type_axes_names({"Key Type", "Payload Type", "Nullable"})
-  .add_int64_axis("Build Table Size", {40'000'000, 50'000'000})
-  .add_int64_axis("Probe Table Size", {50'000'000, 120'000'000});
+                    NVBENCH_TYPE_AXES(JOIN_KEY_TYPE_RANGE, JOIN_NULLABLE_RANGE))
+  .set_name("mixed_full_join")
+  .set_type_axes_names({"Key", "Nullable"})
+  .add_int64_axis("left_size", JOIN_SIZE_RANGE)
+  .add_int64_axis("right_size", JOIN_SIZE_RANGE);
 
 NVBENCH_BENCH_TYPES(nvbench_mixed_left_semi_join,
-                    NVBENCH_TYPE_AXES(nvbench::type_list<nvbench::int32_t>,
-                                      nvbench::type_list<nvbench::int32_t>,
-                                      nvbench::enum_type_list<true>))
-  .set_name("mixed_left_semi_join_32bit_nulls")
-  .set_type_axes_names({"Key Type", "Payload Type", "Nullable"})
-  .add_int64_axis("Build Table Size", {100'000, 10'000'000, 80'000'000, 100'000'000})
-  .add_int64_axis("Probe Table Size",
-                  {100'000, 400'000, 10'000'000, 40'000'000, 100'000'000, 240'000'000});
-
-NVBENCH_BENCH_TYPES(nvbench_mixed_left_semi_join,
-                    NVBENCH_TYPE_AXES(nvbench::type_list<nvbench::int64_t>,
-                                      nvbench::type_list<nvbench::int64_t>,
-                                      nvbench::enum_type_list<true>))
-  .set_name("mixed_left_semi_join_64bit_nulls")
-  .set_type_axes_names({"Key Type", "Payload Type", "Nullable"})
-  .add_int64_axis("Build Table Size", {40'000'000, 50'000'000})
-  .add_int64_axis("Probe Table Size", {50'000'000, 120'000'000});
-
-// left anti join ------------------------------------------------------------------------
-NVBENCH_BENCH_TYPES(nvbench_mixed_left_anti_join,
-                    NVBENCH_TYPE_AXES(nvbench::type_list<nvbench::int32_t>,
-                                      nvbench::type_list<nvbench::int32_t>,
-                                      nvbench::enum_type_list<false>))
-  .set_name("mixed_left_anti_join_32bit")
-  .set_type_axes_names({"Key Type", "Payload Type", "Nullable"})
-  .add_int64_axis("Build Table Size", {100'000, 10'000'000, 80'000'000, 100'000'000})
-  .add_int64_axis("Probe Table Size",
-                  {100'000, 400'000, 10'000'000, 40'000'000, 100'000'000, 240'000'000});
-
-NVBENCH_BENCH_TYPES(nvbench_mixed_left_anti_join,
-                    NVBENCH_TYPE_AXES(nvbench::type_list<nvbench::int64_t>,
-                                      nvbench::type_list<nvbench::int64_t>,
-                                      nvbench::enum_type_list<false>))
-  .set_name("mixed_left_anti_join_64bit")
-  .set_type_axes_names({"Key Type", "Payload Type", "Nullable"})
-  .add_int64_axis("Build Table Size", {40'000'000, 50'000'000})
-  .add_int64_axis("Probe Table Size", {50'000'000, 120'000'000});
-
-NVBENCH_BENCH_TYPES(nvbench_mixed_left_anti_join,
-                    NVBENCH_TYPE_AXES(nvbench::type_list<nvbench::int32_t>,
-                                      nvbench::type_list<nvbench::int32_t>,
-                                      nvbench::enum_type_list<true>))
-  .set_name("mixed_left_anti_join_32bit_nulls")
-  .set_type_axes_names({"Key Type", "Payload Type", "Nullable"})
-  .add_int64_axis("Build Table Size", {100'000, 10'000'000, 80'000'000, 100'000'000})
-  .add_int64_axis("Probe Table Size",
-                  {100'000, 400'000, 10'000'000, 40'000'000, 100'000'000, 240'000'000});
+                    NVBENCH_TYPE_AXES(JOIN_KEY_TYPE_RANGE, JOIN_NULLABLE_RANGE))
+  .set_name("mixed_left_semi_join")
+  .set_type_axes_names({"Key", "Nullable"})
+  .add_int64_axis("left_size", JOIN_SIZE_RANGE)
+  .add_int64_axis("right_size", JOIN_SIZE_RANGE);
 
 NVBENCH_BENCH_TYPES(nvbench_mixed_left_anti_join,
-                    NVBENCH_TYPE_AXES(nvbench::type_list<nvbench::int64_t>,
-                                      nvbench::type_list<nvbench::int64_t>,
-                                      nvbench::enum_type_list<true>))
-  .set_name("mixed_left_anti_join_64bit_nulls")
-  .set_type_axes_names({"Key Type", "Payload Type", "Nullable"})
-  .add_int64_axis("Build Table Size", {40'000'000, 50'000'000})
-  .add_int64_axis("Probe Table Size", {50'000'000, 120'000'000});
+                    NVBENCH_TYPE_AXES(JOIN_KEY_TYPE_RANGE, JOIN_NULLABLE_RANGE))
+  .set_name("mixed_left_anti_join")
+  .set_type_axes_names({"Key", "Nullable"})
+  .add_int64_axis("left_size", JOIN_SIZE_RANGE)
+  .add_int64_axis("right_size", JOIN_SIZE_RANGE);

From c3f34093ede9e20c5e2e008658097dffc99fe038 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Fri, 10 May 2024 15:19:45 -0500
Subject: [PATCH 189/842] Fix `get_loc` to properly fetch results from an index
 that is in decreasing order (#15719)

Fixes: #15713

This PR properly calls `search_sorted` to arrive at correct results for indexes that are of decreasing order.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/15719
---
 python/cudf/cudf/core/index.py       | 10 ++++++++--
 python/cudf/cudf/tests/test_index.py |  7 ++++---
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 096b6f17c1d..0710f0f5c42 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -112,10 +112,16 @@ def _lexsorted_equal_range(
         sort_inds = None
         sort_vals = idx
     lower_bound = search_sorted(
-        [*sort_vals._data.columns], [*key_as_table._columns], side="left"
+        [*sort_vals._data.columns],
+        [*key_as_table._columns],
+        side="left",
+        ascending=sort_vals.is_monotonic_increasing,
     ).element_indexing(0)
     upper_bound = search_sorted(
-        [*sort_vals._data.columns], [*key_as_table._columns], side="right"
+        [*sort_vals._data.columns],
+        [*key_as_table._columns],
+        side="right",
+        ascending=sort_vals.is_monotonic_increasing,
     ).element_indexing(0)
 
     return lower_bound, upper_bound, sort_inds
diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py
index 3cc6bfdbdc2..8b7ee1dccf8 100644
--- a/python/cudf/cudf/tests/test_index.py
+++ b/python/cudf/cudf/tests/test_index.py
@@ -1806,12 +1806,13 @@ def test_get_loc_rangeindex(idx, key):
 @pytest.mark.parametrize(
     "idx",
     [
-        pd.Index([1, 3, 3, 6]),  # monotonic
+        pd.Index([1, 3, 3, 6]),  # monotonic increasing
         pd.Index([6, 1, 3, 3]),  # non-monotonic
+        pd.Index([4, 3, 2, 1, 0]),  # monotonic decreasing
     ],
 )
-@pytest.mark.parametrize("key", [0, 3, 6, 7])
-def test_get_loc_single_duplicate_numeric(idx, key):
+@pytest.mark.parametrize("key", [0, 3, 6, 7, 4])
+def test_get_loc_duplicate_numeric(idx, key):
     pi = idx
     gi = cudf.from_pandas(pi)
 

From b5a9c4b5114390fb45e27d9aab5eaa995de3fa37 Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar <shruti.shivakumar@gmail.com>
Date: Fri, 10 May 2024 19:08:18 -0400
Subject: [PATCH 190/842] Fix multi-source reading in JSON byte range reader
 (#15671)

This PR fixes the number of bytes read and corrects the offsets for the delimiters added to the buffer when reading across multiple sources.

Authors:
  - Shruti Shivakumar (https://github.com/shrshi)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/15671
---
 cpp/src/io/json/read_json.cu         |  57 +++++++-------
 cpp/tests/io/json_chunked_reader.cpp | 110 ++++++++++++++++++++++++++-
 2 files changed, 135 insertions(+), 32 deletions(-)

diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu
index 0ead5c56264..ea52dce020e 100644
--- a/cpp/src/io/json/read_json.cu
+++ b/cpp/src/io/json/read_json.cu
@@ -50,7 +50,10 @@ size_t sources_size(host_span<std::unique_ptr<datasource>> const sources,
 }
 
 /**
- * @brief Read from array of data sources into RMM buffer
+ * @brief Read from array of data sources into RMM buffer. The size of the returned device span
+          can be larger than the number of bytes requested from the list of sources when
+          the range to be read spans across multiple sources. This is due to the delimiter
+          characters inserted after the end of each accessed source.
  *
  * @param buffer Device span buffer to which data is read
  * @param sources Array of data sources
@@ -72,7 +75,6 @@ device_span<char> ingest_raw_input(device_span<char> buffer,
   // line of file i+1 don't end up on the same JSON line, if file i does not already end with a line
   // delimiter.
   auto constexpr num_delimiter_chars = 1;
-  auto const num_extra_delimiters    = num_delimiter_chars * (sources.size() - 1);
 
   if (compression == compression_type::NONE) {
     std::vector<size_type> delimiter_map{};
@@ -89,28 +91,29 @@ device_span<char> ingest_raw_input(device_span<char> buffer,
       std::upper_bound(prefsum_source_sizes.begin(), prefsum_source_sizes.end(), range_offset);
     size_t start_source = std::distance(prefsum_source_sizes.begin(), upper);
 
-    auto remaining_bytes_to_read = std::min(range_size, prefsum_source_sizes.back() - range_offset);
+    auto const total_bytes_to_read =
+      std::min(range_size, prefsum_source_sizes.back() - range_offset);
     range_offset -= start_source ? prefsum_source_sizes[start_source - 1] : 0;
-    for (size_t i = start_source; i < sources.size() && remaining_bytes_to_read; i++) {
+    for (size_t i = start_source; i < sources.size() && bytes_read < total_bytes_to_read; i++) {
       if (sources[i]->is_empty()) continue;
-      auto data_size   = std::min(sources[i]->size() - range_offset, remaining_bytes_to_read);
-      auto destination = reinterpret_cast<uint8_t*>(buffer.data()) + bytes_read;
+      auto data_size =
+        std::min(sources[i]->size() - range_offset, total_bytes_to_read - bytes_read);
+      auto destination = reinterpret_cast<uint8_t*>(buffer.data()) + bytes_read +
+                         (num_delimiter_chars * delimiter_map.size());
       if (sources[i]->is_device_read_preferred(data_size)) {
         bytes_read += sources[i]->device_read(range_offset, data_size, destination, stream);
       } else {
         h_buffers.emplace_back(sources[i]->host_read(range_offset, data_size));
         auto const& h_buffer = h_buffers.back();
         CUDF_CUDA_TRY(cudaMemcpyAsync(
-          destination, h_buffer->data(), h_buffer->size(), cudaMemcpyDefault, stream.value()));
+          destination, h_buffer->data(), h_buffer->size(), cudaMemcpyHostToDevice, stream.value()));
         bytes_read += h_buffer->size();
       }
       range_offset = 0;
-      remaining_bytes_to_read -= bytes_read;
-      delimiter_map.push_back(bytes_read);
-      bytes_read += num_delimiter_chars;
+      delimiter_map.push_back(bytes_read + (num_delimiter_chars * delimiter_map.size()));
     }
-    // In the case where all sources are empty, bytes_read is zero
-    if (bytes_read) bytes_read -= num_delimiter_chars;
+    // Removing delimiter inserted after last non-empty source is read
+    if (!delimiter_map.empty()) { delimiter_map.pop_back(); }
 
     // If this is a multi-file source, we scatter the JSON line delimiters between files
     if (sources.size() > 1) {
@@ -118,9 +121,7 @@ device_span<char> ingest_raw_input(device_span<char> buffer,
                     "Currently only single-character delimiters are supported");
       auto const delimiter_source = thrust::make_constant_iterator('\n');
       auto const d_delimiter_map  = cudf::detail::make_device_uvector_async(
-        host_span<size_type const>{delimiter_map.data(), delimiter_map.size() - 1},
-        stream,
-        rmm::mr::get_current_device_resource());
+        delimiter_map, stream, rmm::mr::get_current_device_resource());
       thrust::scatter(rmm::exec_policy_nosync(stream),
                       delimiter_source,
                       delimiter_source + d_delimiter_map.size(),
@@ -128,7 +129,7 @@ device_span<char> ingest_raw_input(device_span<char> buffer,
                       buffer.data());
     }
     stream.synchronize();
-    return buffer.first(bytes_read);
+    return buffer.first(bytes_read + (delimiter_map.size() * num_delimiter_chars));
   }
   // TODO: allow byte range reading from multiple compressed files.
   auto remaining_bytes_to_read = std::min(range_size, sources[0]->size() - range_offset);
@@ -151,17 +152,15 @@ size_type find_first_delimiter_in_chunk(host_span<std::unique_ptr<cudf::io::data
                                         char const delimiter,
                                         rmm::cuda_stream_view stream)
 {
-  auto const total_source_size =
-    sources_size(sources, reader_opts.get_byte_range_offset(), reader_opts.get_byte_range_size()) +
-    (sources.size() - 1);
+  auto total_source_size = sources_size(sources, 0, 0) + (sources.size() - 1);
   rmm::device_uvector<char> buffer(total_source_size, stream);
-  ingest_raw_input(buffer,
-                   sources,
-                   reader_opts.get_compression(),
-                   reader_opts.get_byte_range_offset(),
-                   reader_opts.get_byte_range_size(),
-                   stream);
-  return find_first_delimiter(buffer, delimiter, stream);
+  auto readbufspan = ingest_raw_input(buffer,
+                                      sources,
+                                      reader_opts.get_compression(),
+                                      reader_opts.get_byte_range_offset(),
+                                      reader_opts.get_byte_range_size(),
+                                      stream);
+  return find_first_delimiter(readbufspan, '\n', stream);
 }
 
 /**
@@ -195,8 +194,7 @@ datasource::owning_buffer<rmm::device_uvector<char>> get_record_range_raw_input(
   CUDF_EXPECTS(total_source_size ? chunk_offset < total_source_size : !chunk_offset,
                "Invalid offsetting");
   auto should_load_all_sources = !chunk_size || chunk_size >= total_source_size - chunk_offset;
-  chunk_size =
-    should_load_all_sources ? total_source_size - chunk_offset + num_extra_delimiters : chunk_size;
+  chunk_size = should_load_all_sources ? total_source_size - chunk_offset : chunk_size;
 
   // Some magic numbers
   constexpr int num_subchunks               = 10;  // per chunk_size
@@ -217,7 +215,8 @@ datasource::owning_buffer<rmm::device_uvector<char>> get_record_range_raw_input(
   size_t const buffer_size =
     reader_compression != compression_type::NONE
       ? total_source_size * estimated_compression_ratio + header_size
-      : std::min(total_source_size, chunk_size + num_subchunks_prealloced * size_per_subchunk);
+      : std::min(total_source_size, chunk_size + num_subchunks_prealloced * size_per_subchunk) +
+          num_extra_delimiters;
   rmm::device_uvector<char> buffer(buffer_size, stream);
   device_span<char> bufspan(buffer);
 
diff --git a/cpp/tests/io/json_chunked_reader.cpp b/cpp/tests/io/json_chunked_reader.cpp
index ef69ee5239d..7482cb1b70d 100644
--- a/cpp/tests/io/json_chunked_reader.cpp
+++ b/cpp/tests/io/json_chunked_reader.cpp
@@ -24,11 +24,19 @@
 
 #include <rmm/resource_ref.hpp>
 
+#include <fstream>
+#include <string>
+#include <vector>
+
 /**
  * @brief Base test fixture for JSON reader tests
  */
 struct JsonReaderTest : public cudf::test::BaseFixture {};
 
+cudf::test::TempDirTestEnvironment* const temp_env =
+  static_cast<cudf::test::TempDirTestEnvironment*>(
+    ::testing::AddGlobalTestEnvironment(new cudf::test::TempDirTestEnvironment));
+
 // function to extract first delimiter in the string in each chunk,
 // collate together and form byte_range for each chunk,
 // parse separately.
@@ -41,7 +49,6 @@ std::vector<cudf::io::table_with_metadata> skeleton_for_parellel_chunk_reader(
 {
   using namespace cudf::io::json::detail;
   using cudf::size_type;
-  // assuming single source.
   size_t total_source_size = 0;
   for (auto const& source : sources) {
     total_source_size += source->size();
@@ -77,7 +84,9 @@ std::vector<cudf::io::table_with_metadata> skeleton_for_parellel_chunk_reader(
   std::vector<cudf::io::table_with_metadata> tables;
   // Process each chunk in parallel.
   for (auto const& [chunk_start, chunk_end] : record_ranges) {
-    if (chunk_start == -1 or chunk_end == -1) continue;
+    if (chunk_start == -1 or chunk_end == -1 or
+        static_cast<size_t>(chunk_start) >= total_source_size)
+      continue;
     reader_opts_chunk.set_byte_range_offset(chunk_start);
     reader_opts_chunk.set_byte_range_size(chunk_end - chunk_start);
     tables.push_back(read_json(sources, reader_opts_chunk, stream, mr));
@@ -87,7 +96,7 @@ std::vector<cudf::io::table_with_metadata> skeleton_for_parellel_chunk_reader(
   return tables;
 }
 
-TEST_F(JsonReaderTest, ByteRange)
+TEST_F(JsonReaderTest, ByteRange_SingleSource)
 {
   std::string const json_string = R"(
     { "a": { "y" : 6}, "b" : [1, 2, 3], "c": 11 }
@@ -126,3 +135,98 @@ TEST_F(JsonReaderTest, ByteRange)
     CUDF_TEST_EXPECT_TABLES_EQUIVALENT(current_reader_table.tbl->view(), result->view());
   }
 }
+
+TEST_F(JsonReaderTest, ReadCompleteFiles)
+{
+  std::string const json_string = R"(
+    { "a": { "y" : 6}, "b" : [1, 2, 3], "c": 11 }
+    { "a": { "y" : 6}, "b" : [4, 5   ], "c": 12 }
+    { "a": { "y" : 6}, "b" : [6      ], "c": 13 }
+    { "a": { "y" : 6}, "b" : [7      ], "c": 14 })";
+  auto filename                 = temp_env->get_temp_dir() + "ParseInRangeIntegers.json";
+  {
+    std::ofstream outfile(filename, std::ofstream::out);
+    outfile << json_string;
+  }
+
+  constexpr int num_sources = 5;
+  std::vector<std::string> filepaths(num_sources, filename);
+
+  cudf::io::json_reader_options in_options =
+    cudf::io::json_reader_options::builder(cudf::io::source_info{filepaths})
+      .lines(true)
+      .recovery_mode(cudf::io::json_recovery_mode_t::FAIL);
+
+  cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+
+  std::vector<cudf::io::table_with_metadata> part_tables;
+  for (auto filepath : filepaths) {
+    cudf::io::json_reader_options part_in_options =
+      cudf::io::json_reader_options::builder(cudf::io::source_info{filepath})
+        .lines(true)
+        .recovery_mode(cudf::io::json_recovery_mode_t::FAIL);
+
+    part_tables.push_back(cudf::io::read_json(part_in_options));
+  }
+
+  auto part_table_views = std::vector<cudf::table_view>(part_tables.size());
+  std::transform(part_tables.begin(), part_tables.end(), part_table_views.begin(), [](auto& table) {
+    return table.tbl->view();
+  });
+
+  auto expected_result = cudf::concatenate(part_table_views);
+
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result.tbl->view(), expected_result->view());
+}
+
+TEST_F(JsonReaderTest, ByteRange_MultiSource)
+{
+  std::string const json_string = R"(
+    { "a": { "y" : 6}, "b" : [1, 2, 3], "c": 11 }
+    { "a": { "y" : 6}, "b" : [4, 5   ], "c": 12 }
+    { "a": { "y" : 6}, "b" : [6      ], "c": 13 }
+    { "a": { "y" : 6}, "b" : [7      ], "c": 14 })";
+  auto filename                 = temp_env->get_temp_dir() + "ParseInRangeIntegers.json";
+  {
+    std::ofstream outfile(filename, std::ofstream::out);
+    outfile << json_string;
+  }
+
+  constexpr int num_sources = 5;
+  std::vector<std::string> filepaths(num_sources, filename);
+
+  // Initialize parsing options (reading json lines)
+  cudf::io::json_reader_options json_lines_options =
+    cudf::io::json_reader_options::builder(cudf::io::source_info{filepaths})
+      .lines(true)
+      .compression(cudf::io::compression_type::NONE)
+      .recovery_mode(cudf::io::json_recovery_mode_t::FAIL);
+
+  // Read full test data via existing, nested JSON lines reader
+  cudf::io::table_with_metadata current_reader_table = cudf::io::read_json(json_lines_options);
+
+  auto file_paths = json_lines_options.get_source().filepaths();
+  std::vector<std::unique_ptr<cudf::io::datasource>> datasources;
+  for (auto& fp : file_paths) {
+    datasources.emplace_back(cudf::io::datasource::create(fp));
+  }
+
+  // Test for different chunk sizes
+  for (auto chunk_size : {7, 10, 15, 20, 40, 50, 100, 200, 500, 1000, 2000}) {
+    auto const tables = skeleton_for_parellel_chunk_reader(datasources,
+                                                           json_lines_options,
+                                                           chunk_size,
+                                                           cudf::get_default_stream(),
+                                                           rmm::mr::get_current_device_resource());
+
+    auto table_views = std::vector<cudf::table_view>(tables.size());
+    std::transform(tables.begin(), tables.end(), table_views.begin(), [](auto& table) {
+      return table.tbl->view();
+    });
+    auto result = cudf::concatenate(table_views);
+
+    // Verify that the data read via chunked reader matches the data read via nested JSON reader
+    // cannot use EQUAL due to concatenate removing null mask
+    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(current_reader_table.tbl->view(), result->view());
+  }
+}

From ce1933fc07d5f8d1da3ad36217ea0b39d7a926fa Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Fri, 10 May 2024 20:40:41 -0700
Subject: [PATCH 191/842] Change the default dictionary policy in Parquet
 writer from `ALWAYS` to `ADAPTIVE` (#15570)

This PR changes the default dictionary policy in parquet from `ALWAYS` to `ADAPTIVE` and adds an argument `max_dictionary_size` to control the `ADAPTIVE`-ness of the dictionary policy. This change prevents a silent fallback to `UNCOMPRESSED` when writing parquet files with `ZSTD` compression leading to better performance for several use cases.

Partially closes #15501.

Authors:
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/15570
---
 cpp/include/cudf/io/parquet.hpp          |  8 ++---
 python/cudf/cudf/_lib/cpp/io/parquet.pxd | 14 +++++++--
 python/cudf/cudf/_lib/parquet.pyx        | 28 ++++++++++++++++--
 python/cudf/cudf/io/parquet.py           |  4 +++
 python/cudf/cudf/tests/test_parquet.py   | 37 ++++++++++++++++++++++++
 python/cudf/cudf/utils/ioutils.py        |  8 +++--
 6 files changed, 88 insertions(+), 11 deletions(-)

diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp
index 0406d6e3e4c..8bfcacdb47f 100644
--- a/cpp/include/cudf/io/parquet.hpp
+++ b/cpp/include/cudf/io/parquet.hpp
@@ -564,7 +564,7 @@ class parquet_writer_options {
   // Maximum size of min or max values in column index
   int32_t _column_index_truncate_length = default_column_index_truncate_length;
   // When to use dictionary encoding for data
-  dictionary_policy _dictionary_policy = dictionary_policy::ALWAYS;
+  dictionary_policy _dictionary_policy = dictionary_policy::ADAPTIVE;
   // Maximum size of column chunk dictionary (in bytes)
   size_t _max_dictionary_size = default_max_dictionary_size;
   // Maximum number of rows in a page fragment
@@ -1095,7 +1095,7 @@ class parquet_writer_options_builder {
    * dictionary_policy::ALWAYS will allow the use of dictionary encoding even if it will result in
    * the disabling of compression for columns that would otherwise be compressed.
    *
-   * The default value is dictionary_policy::ALWAYS.
+   * The default value is dictionary_policy::ADAPTIVE.
    *
    * @param val policy for dictionary use
    * @return this for chaining
@@ -1258,7 +1258,7 @@ class chunked_parquet_writer_options {
   // Maximum size of min or max values in column index
   int32_t _column_index_truncate_length = default_column_index_truncate_length;
   // When to use dictionary encoding for data
-  dictionary_policy _dictionary_policy = dictionary_policy::ALWAYS;
+  dictionary_policy _dictionary_policy = dictionary_policy::ADAPTIVE;
   // Maximum size of column chunk dictionary (in bytes)
   size_t _max_dictionary_size = default_max_dictionary_size;
   // Maximum number of rows in a page fragment
@@ -1751,7 +1751,7 @@ class chunked_parquet_writer_options_builder {
    * dictionary_policy::ALWAYS will allow the use of dictionary encoding even if it will result in
    * the disabling of compression for columns that would otherwise be compressed.
    *
-   * The default value is dictionary_policy::ALWAYS.
+   * The default value is dictionary_policy::ADAPTIVE.
    *
    * @param val policy for dictionary use
    * @return this for chaining
diff --git a/python/cudf/cudf/_lib/cpp/io/parquet.pxd b/python/cudf/cudf/_lib/cpp/io/parquet.pxd
index 8de16d06a9d..1680eb43700 100644
--- a/python/cudf/cudf/_lib/cpp/io/parquet.pxd
+++ b/python/cudf/cudf/_lib/cpp/io/parquet.pxd
@@ -74,6 +74,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
         size_type get_row_group_size_rows() except +
         size_t get_max_page_size_bytes() except +
         size_type get_max_page_size_rows() except +
+        size_t get_max_dictionary_size() except +
 
         void set_partitions(
             vector[cudf_io_types.partition_info] partitions
@@ -103,8 +104,9 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
         void set_row_group_size_rows(size_type val) except +
         void set_max_page_size_bytes(size_t val) except +
         void set_max_page_size_rows(size_type val) except +
+        void set_max_dictionary_size(size_t val) except +
         void enable_write_v2_headers(bool val) except +
-        void set_dictionary_policy(cudf_io_types.dictionary_policy policy)except +
+        void set_dictionary_policy(cudf_io_types.dictionary_policy policy) except +
 
         @staticmethod
         parquet_writer_options_builder builder(
@@ -155,6 +157,9 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
         parquet_writer_options_builder& max_page_size_rows(
             size_type val
         ) except +
+        parquet_writer_options_builder& max_dictionary_size(
+            size_t val
+        ) except +
         parquet_writer_options_builder& write_v2_headers(
             bool val
         ) except +
@@ -179,6 +184,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
         size_type get_row_group_size_rows() except +
         size_t get_max_page_size_bytes() except +
         size_type get_max_page_size_rows() except +
+        size_t get_max_dictionary_size() except +
 
         void set_metadata(
             cudf_io_types.table_input_metadata m
@@ -202,8 +208,9 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
         void set_row_group_size_rows(size_type val) except +
         void set_max_page_size_bytes(size_t val) except +
         void set_max_page_size_rows(size_type val) except +
+        void set_max_dictionary_size(size_t val) except +
         void enable_write_v2_headers(bool val) except +
-        void set_dictionary_policy(cudf_io_types.dictionary_policy policy)except +
+        void set_dictionary_policy(cudf_io_types.dictionary_policy policy) except +
 
         @staticmethod
         chunked_parquet_writer_options_builder builder(
@@ -245,6 +252,9 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
         chunked_parquet_writer_options_builder& max_page_size_rows(
             size_type val
         ) except +
+        chunked_parquet_writer_options_builder& max_dictionary_size(
+            size_t val
+        ) except +
         parquet_writer_options_builder& write_v2_headers(
             bool val
         ) except +
diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
index 9ce9aad18f7..dcfa087a1fa 100644
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ b/python/cudf/cudf/_lib/parquet.pyx
@@ -399,6 +399,7 @@ def write_parquet(
     object row_group_size_rows=None,
     object max_page_size_bytes=None,
     object max_page_size_rows=None,
+    object max_dictionary_size=None,
     object partitions_info=None,
     object force_nullable_schema=False,
     header_version="1.0",
@@ -478,7 +479,7 @@ def write_parquet(
         )
 
     dict_policy = (
-        cudf_io_types.dictionary_policy.ALWAYS
+        cudf_io_types.dictionary_policy.ADAPTIVE
         if use_dictionary
         else cudf_io_types.dictionary_policy.NEVER
     )
@@ -528,6 +529,8 @@ def write_parquet(
         args.set_max_page_size_bytes(max_page_size_bytes)
     if max_page_size_rows is not None:
         args.set_max_page_size_rows(max_page_size_rows)
+    if max_dictionary_size is not None:
+        args.set_max_dictionary_size(max_dictionary_size)
 
     with nogil:
         out_metadata_c = move(parquet_writer(args))
@@ -571,7 +574,14 @@ cdef class ParquetWriter:
     max_page_size_rows: int, default 20000
         Maximum number of rows of each page of the output.
         By default, 20000 will be used.
-
+    max_dictionary_size: int, default 1048576
+        Maximum size of the dictionary page for each output column chunk. Dictionary
+        encoding for column chunks that exceeds this limit will be disabled.
+        By default, 1048576 (1MB) will be used.
+    use_dictionary : bool, default True
+        If ``True``, enable dictionary encoding for Parquet page data
+        subject to ``max_dictionary_size`` constraints.
+        If ``False``, disable dictionary encoding for Parquet page data.
     See Also
     --------
     cudf.io.parquet.write_parquet
@@ -588,13 +598,17 @@ cdef class ParquetWriter:
     cdef size_type row_group_size_rows
     cdef size_t max_page_size_bytes
     cdef size_type max_page_size_rows
+    cdef size_t max_dictionary_size
+    cdef cudf_io_types.dictionary_policy dict_policy
 
     def __cinit__(self, object filepath_or_buffer, object index=None,
                   object compression="snappy", str statistics="ROWGROUP",
                   int row_group_size_bytes=_ROW_GROUP_SIZE_BYTES_DEFAULT,
                   int row_group_size_rows=1000000,
                   int max_page_size_bytes=524288,
-                  int max_page_size_rows=20000):
+                  int max_page_size_rows=20000,
+                  int max_dictionary_size=1048576,
+                  bool use_dictionary=True):
         filepaths_or_buffers = (
             list(filepath_or_buffer)
             if is_list_like(filepath_or_buffer)
@@ -609,6 +623,12 @@ cdef class ParquetWriter:
         self.row_group_size_rows = row_group_size_rows
         self.max_page_size_bytes = max_page_size_bytes
         self.max_page_size_rows = max_page_size_rows
+        self.max_dictionary_size = max_dictionary_size
+        self.dict_policy = (
+            cudf_io_types.dictionary_policy.ADAPTIVE
+            if use_dictionary
+            else cudf_io_types.dictionary_policy.NEVER
+        )
 
     def write_table(self, table, object partitions_info=None):
         """ Writes a single table to the file """
@@ -726,8 +746,10 @@ cdef class ParquetWriter:
                 .row_group_size_rows(self.row_group_size_rows)
                 .max_page_size_bytes(self.max_page_size_bytes)
                 .max_page_size_rows(self.max_page_size_rows)
+                .max_dictionary_size(self.max_dictionary_size)
                 .build()
             )
+            args.set_dictionary_policy(self.dict_policy)
             self.writer.reset(new cpp_parquet_chunked_writer(args))
         self.initialized = True
 
diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
index dd1e59acaaa..a6c67d22af7 100644
--- a/python/cudf/cudf/io/parquet.py
+++ b/python/cudf/cudf/io/parquet.py
@@ -63,6 +63,7 @@ def _write_parquet(
     row_group_size_rows=None,
     max_page_size_bytes=None,
     max_page_size_rows=None,
+    max_dictionary_size=None,
     partitions_info=None,
     storage_options=None,
     force_nullable_schema=False,
@@ -96,6 +97,7 @@ def _write_parquet(
         "row_group_size_rows": row_group_size_rows,
         "max_page_size_bytes": max_page_size_bytes,
         "max_page_size_rows": max_page_size_rows,
+        "max_dictionary_size": max_dictionary_size,
         "partitions_info": partitions_info,
         "force_nullable_schema": force_nullable_schema,
         "header_version": header_version,
@@ -898,6 +900,7 @@ def to_parquet(
     row_group_size_rows=None,
     max_page_size_bytes=None,
     max_page_size_rows=None,
+    max_dictionary_size=None,
     storage_options=None,
     return_metadata=False,
     force_nullable_schema=False,
@@ -974,6 +977,7 @@ def to_parquet(
             row_group_size_rows=row_group_size_rows,
             max_page_size_bytes=max_page_size_bytes,
             max_page_size_rows=max_page_size_rows,
+            max_dictionary_size=max_dictionary_size,
             partitions_info=partition_info,
             storage_options=storage_options,
             force_nullable_schema=force_nullable_schema,
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index cf3c0e7f7a0..3680c1e0c62 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -1890,6 +1890,43 @@ def test_parquet_writer_max_page_size(tmpdir, max_page_size_kwargs):
     assert s1 > s2
 
 
+@pytest.mark.parametrize("use_dict", [False, True])
+@pytest.mark.parametrize("max_dict_size", [0, 1048576])
+def test_parquet_writer_dictionary_setting(use_dict, max_dict_size):
+    # Simple test for checking the validity of dictionary encoding setting
+    # and behavior of ParquetWriter in cudf.
+    # Write a table with repetitive data with varying dictionary settings.
+    # Make sure the written columns are dictionary-encoded accordingly.
+
+    # Table with repetitive data
+    table = cudf.DataFrame(
+        {
+            "int32": cudf.Series([1024] * 1024, dtype="int64"),
+        }
+    )
+
+    # Write to Parquet using ParquetWriter
+    buffer = BytesIO()
+    writer = ParquetWriter(
+        buffer,
+        use_dictionary=use_dict,
+        max_dictionary_size=max_dict_size,
+    )
+    writer.write_table(table)
+    writer.close()
+
+    # Read encodings from parquet file
+    got = pq.ParquetFile(buffer)
+    encodings = got.metadata.row_group(0).column(0).encodings
+
+    # Check for `PLAIN_DICTIONARY` encoding if dictionary encoding enabled
+    # and dictionary page limit > 0
+    if use_dict is True and max_dict_size > 0:
+        assert "PLAIN_DICTIONARY" in encodings
+    else:
+        assert "PLAIN_DICTIONARY" not in encodings
+
+
 @pytest.mark.parametrize("filename", ["myfile.parquet", None])
 @pytest.mark.parametrize("cols", [["b"], ["c", "b"]])
 def test_parquet_partitioned(tmpdir_factory, cols, filename):
diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
index 9c7c687a6ed..18e81078587 100644
--- a/python/cudf/cudf/utils/ioutils.py
+++ b/python/cudf/cudf/utils/ioutils.py
@@ -278,6 +278,10 @@
 max_page_size_rows: integer or None, default None
     Maximum number of rows of each page of the output.
     If None, 20000 will be used.
+max_dictionary_size: integer or None, default None
+    Maximum size of the dictionary page for each output column chunk. Dictionary
+    encoding for column chunks that exceeds this limit will be disabled.
+    If None, 1048576 (1MB) will be used.
 storage_options : dict, optional, default None
     Extra options that make sense for a particular storage connection,
     e.g. host, port, username, password, etc. For HTTP(S) URLs the key-value
@@ -292,8 +296,8 @@
     ``return_metadata=True`` instead of specifying ``metadata_file_path``
 use_dictionary : bool, default True
     When ``False``, prevents the use of dictionary encoding for Parquet page
-    data. When ``True``, dictionary encoding is preferred when not disabled due
-    to dictionary size constraints.
+    data. When ``True``, dictionary encoding is preferred subject to
+    ``max_dictionary_size`` constraints.
 header_version : {{'1.0', '2.0'}}, default "1.0"
     Controls whether to use version 1.0 or version 2.0 page headers when
     encoding. Version 1.0 is more portable, but version 2.0 enables the

From 425a5dac64b7c74c061b588dc8725c5390517cf9 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Sun, 12 May 2024 15:44:24 -0500
Subject: [PATCH 192/842] Return same type as the original index for `.loc`
 operations (#15717)

Fixes: #15716

This PR makes changes to `.loc` by preserving the original type at the end of the operation.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/15717
---
 python/cudf/cudf/core/dataframe.py       |  5 +++++
 python/cudf/cudf/tests/test_dataframe.py | 18 ++++++++++++++++++
 2 files changed, 23 insertions(+)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index b937d2da25c..b29089cb81a 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -357,6 +357,11 @@ def _getitem_tuple_arg(self, arg):
                     # as join is not assigning any names to index,
                     # update it over here
                     df.index.name = columns_df.index.name
+                    if not isinstance(
+                        df.index, MultiIndex
+                    ) and is_numeric_dtype(df.index.dtype):
+                        # Preserve the original index type.
+                        df.index = df.index.astype(self._frame.index.dtype)
                     df = df.sort_values(by=[tmp_col_name, cantor_name])
                     df.drop(columns=[tmp_col_name, cantor_name], inplace=True)
                     # There were no indices found
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 8550bc91253..96301670e9c 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -11010,3 +11010,21 @@ def test_dataframe_init_with_nans():
     assert gdf["a"].dtype == np.dtype("float64")
     pdf = pd.DataFrame({"a": [1, 2, 3, np.nan]})
     assert_eq(pdf, gdf)
+
+
+@pytest.mark.parametrize("dtype1", ["int16", "float32"])
+@pytest.mark.parametrize("dtype2", ["int16", "float32"])
+def test_dataframe_loc_int_float(dtype1, dtype2):
+    df = cudf.DataFrame(
+        {"a": [10, 11, 12, 13, 14]},
+        index=cudf.Index([1, 2, 3, 4, 5], dtype=dtype1),
+    )
+    pdf = df.to_pandas()
+
+    gidx = cudf.Index([2, 3, 4], dtype=dtype2)
+    pidx = gidx.to_pandas()
+
+    actual = df.loc[gidx]
+    expected = pdf.loc[pidx]
+
+    assert_eq(actual, expected, check_index_type=True, check_dtype=True)

From bff301527d074cd8f98d1a2d8dddedbf8830dffd Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Mon, 13 May 2024 01:33:29 -0700
Subject: [PATCH 193/842] Adding parquet transcoding example (#15420)

This PR adds a new example `parquet_io` to `libcudf/cpp/examples` instrumenting reading and writing parquet files with different column encodings (same for all columns for now) and compressions to close #15344. The example maybe elaborated and/or evolved as needed. #15348 should be merged before this PR to get all CMake updates needed to successfully build and run this example.

Authors:
  - Muhammad Haseeb (https://github.com/mhaseeb123)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Ray Douglass (https://github.com/raydouglass)

URL: https://github.com/rapidsai/cudf/pull/15420
---
 ci/run_cudf_examples.sh                 |   3 +
 cpp/examples/build.sh                   |   1 +
 cpp/examples/parquet_io/CMakeLists.txt  |  25 ++++
 cpp/examples/parquet_io/example.parquet | Bin 0 -> 614 bytes
 cpp/examples/parquet_io/parquet_io.cpp  | 172 ++++++++++++++++++++++++
 cpp/examples/parquet_io/parquet_io.hpp  | 157 +++++++++++++++++++++
 6 files changed, 358 insertions(+)
 create mode 100644 cpp/examples/parquet_io/CMakeLists.txt
 create mode 100644 cpp/examples/parquet_io/example.parquet
 create mode 100644 cpp/examples/parquet_io/parquet_io.cpp
 create mode 100644 cpp/examples/parquet_io/parquet_io.hpp

diff --git a/ci/run_cudf_examples.sh b/ci/run_cudf_examples.sh
index f3561bc595c..0819eacf636 100755
--- a/ci/run_cudf_examples.sh
+++ b/ci/run_cudf_examples.sh
@@ -23,4 +23,7 @@ compute-sanitizer --tool memcheck custom_optimized names.csv
 compute-sanitizer --tool memcheck custom_prealloc names.csv
 compute-sanitizer --tool memcheck custom_with_malloc names.csv
 
+compute-sanitizer --tool memcheck parquet_io
+compute-sanitizer --tool memcheck parquet_io example.parquet output.parquet DELTA_BINARY_PACKED ZSTD TRUE
+
 exit ${EXITCODE}
diff --git a/cpp/examples/build.sh b/cpp/examples/build.sh
index 9802c876930..bde6ef7d69c 100755
--- a/cpp/examples/build.sh
+++ b/cpp/examples/build.sh
@@ -59,3 +59,4 @@ build_example() {
 build_example basic
 build_example strings
 build_example nested_types
+build_example parquet_io
diff --git a/cpp/examples/parquet_io/CMakeLists.txt b/cpp/examples/parquet_io/CMakeLists.txt
new file mode 100644
index 00000000000..d8e9205ffd4
--- /dev/null
+++ b/cpp/examples/parquet_io/CMakeLists.txt
@@ -0,0 +1,25 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+cmake_minimum_required(VERSION 3.26.4)
+
+include(../set_cuda_architecture.cmake)
+
+# initialize cuda architecture
+rapids_cuda_init_architectures(parquet_io)
+rapids_cuda_set_architectures(RAPIDS)
+
+project(
+  parquet_io
+  VERSION 0.0.1
+  LANGUAGES CXX CUDA
+)
+
+include(../fetch_dependencies.cmake)
+
+# Configure your project here
+add_executable(parquet_io parquet_io.cpp)
+target_link_libraries(parquet_io PRIVATE cudf::cudf)
+target_compile_features(parquet_io PRIVATE cxx_std_17)
+
+install(TARGETS parquet_io DESTINATION bin/examples/libcudf)
+install(FILES ${CMAKE_CURRENT_LIST_DIR}/example.parquet DESTINATION bin/examples/libcudf)
diff --git a/cpp/examples/parquet_io/example.parquet b/cpp/examples/parquet_io/example.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..f0fb5319cb040395b3d3841501e48de01fc0c7ba
GIT binary patch
literal 614
zcmZ9K!A^rf5Qew4MLc*hSpo?MveCvvB~Y7cOfOy<k4>YerfIgYttc#HSyWPKd?l}a
z44=qdXtb2mKmW}9^92~+Ph1F~1JAoq5ki3q0J{3q0_{fY>$H8Wg6-x+g-xUTprqb2
zi7;#e*t!6EBMm>#%ps7ErWmGSc7O?61_Xxa!KS8Y6K&K{hXZh_Za3vz6<tZOHPIAc
zn7Wc+>f==ZK3b!I`s8Eo#bFlF6x~_VHF_1EP>5l=BB*O2iX$QnnuVdLQjQrHrBmuS
z5%Ri}L`t`qiK5Dzm*Hd=TTfXAB=qNXufm%1Jdx*6MguSQD}pRa^1nvBcp)B_WD()m
z#~h3CH6m1u2XrNOpB_h7x2V{IsZ*d-uGb(c>ww|^6s;lfAR;~qIUe+<G@Tu=$TwA;
zW(4z?m3U#WV3_5SF3B-t@w?o5BP<PKDqXj4J9ZBh!rf8qHG6i?c2NOz=w<%W+I6g+
U*csT4W0<D+$~R2_#^MkA1w)>u9RL6T

literal 0
HcmV?d00001

diff --git a/cpp/examples/parquet_io/parquet_io.cpp b/cpp/examples/parquet_io/parquet_io.cpp
new file mode 100644
index 00000000000..8be17db3781
--- /dev/null
+++ b/cpp/examples/parquet_io/parquet_io.cpp
@@ -0,0 +1,172 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "parquet_io.hpp"
+
+/**
+ * @file parquet_io.cpp
+ * @brief Demonstrates usage of the libcudf APIs to read and write
+ * parquet file format with different encodings and compression types
+ *
+ * The following encoding and compression ztypes are demonstrated:
+ * Encoding Types: DEFAULT, DICTIONARY, PLAIN, DELTA_BINARY_PACKED,
+ *                 DELTA_LENGTH_BYTE_ARRAY, DELTA_BYTE_ARRAY
+ *
+ * Compression Types: NONE, AUTO, SNAPPY, LZ4, ZSTD
+ *
+ */
+
+/**
+ * @brief Read parquet input from file
+ *
+ * @param filepath path to input parquet file
+ * @return cudf::io::table_with_metadata
+ */
+cudf::io::table_with_metadata read_parquet(std::string filepath)
+{
+  auto source_info = cudf::io::source_info(filepath);
+  auto builder     = cudf::io::parquet_reader_options::builder(source_info);
+  auto options     = builder.build();
+  return cudf::io::read_parquet(options);
+}
+
+/**
+ * @brief Write parquet output to file
+ *
+ * @param input table to write
+ * @param metadata metadata of input table read by parquet reader
+ * @param filepath path to output parquet file
+ * @param stats_level optional page size stats level
+ */
+void write_parquet(cudf::table_view input,
+                   cudf::io::table_metadata metadata,
+                   std::string filepath,
+                   cudf::io::column_encoding encoding,
+                   cudf::io::compression_type compression,
+                   std::optional<cudf::io::statistics_freq> stats_level)
+{
+  // write the data for inspection
+  auto sink_info      = cudf::io::sink_info(filepath);
+  auto builder        = cudf::io::parquet_writer_options::builder(sink_info, input);
+  auto table_metadata = cudf::io::table_input_metadata{metadata};
+
+  std::for_each(table_metadata.column_metadata.begin(),
+                table_metadata.column_metadata.end(),
+                [=](auto& col_meta) { col_meta.set_encoding(encoding); });
+
+  builder.metadata(table_metadata);
+  auto options = builder.build();
+  options.set_compression(compression);
+  // Either use the input stats level or don't write stats
+  options.set_stats_level(stats_level.value_or(cudf::io::statistics_freq::STATISTICS_NONE));
+
+  // write parquet data
+  cudf::io::write_parquet(options);
+}
+
+/**
+ * @brief Main for nested_types examples
+ *
+ * Command line parameters:
+ * 1. parquet input file name/path (default: "example.parquet")
+ * 2. parquet output file name/path (default: "output.parquet")
+ * 3. encoding type for columns (default: "DELTA_BINARY_PACKED")
+ * 4. compression type (default: "ZSTD")
+ * 5. optional: use page size stats metadata (default: "NO")
+ *
+ * Example invocation from directory `cudf/cpp/examples/parquet_io`:
+ * ./build/parquet_io example.parquet output.parquet DELTA_BINARY_PACKED ZSTD
+ *
+ */
+int main(int argc, char const** argv)
+{
+  std::string input_filepath;
+  std::string output_filepath;
+  cudf::io::column_encoding encoding;
+  cudf::io::compression_type compression;
+  std::optional<cudf::io::statistics_freq> page_stats;
+
+  switch (argc) {
+    case 1:
+      input_filepath  = "example.parquet";
+      output_filepath = "output.parquet";
+      encoding        = get_encoding_type("DELTA_BINARY_PACKED");
+      compression     = get_compression_type("ZSTD");
+      break;
+    case 6: page_stats = get_page_size_stats(argv[5]); [[fallthrough]];
+    case 5:
+      input_filepath  = argv[1];
+      output_filepath = argv[2];
+      encoding        = get_encoding_type(argv[3]);
+      compression     = get_compression_type(argv[4]);
+      break;
+    default:
+      throw std::runtime_error(
+        "Either provide all command-line arguments, or none to use defaults\n");
+  }
+
+  // Create and use a memory pool
+  bool is_pool_used = true;
+  auto resource     = create_memory_resource(is_pool_used);
+  rmm::mr::set_current_device_resource(resource.get());
+
+  // Read input parquet file
+  // We do not want to time the initial read time as it may include
+  // time for nvcomp, cufile loading and RMM growth
+  std::cout << std::endl << "Reading " << input_filepath << "..." << std::endl;
+  std::cout << "Note: Not timing the initial parquet read as it may include\n"
+               "times for nvcomp, cufile loading and RMM growth."
+            << std::endl
+            << std::endl;
+  auto [input, metadata] = read_parquet(input_filepath);
+
+  // Status string to indicate if page stats are set to be written or not
+  auto page_stat_string = (page_stats.has_value()) ? "page stats" : "no page stats";
+  // Write parquet file with the specified encoding and compression
+  std::cout << "Writing " << output_filepath << " with encoding, compression and "
+            << page_stat_string << ".." << std::endl;
+
+  // `timer` is automatically started here
+  Timer timer;
+  write_parquet(input->view(), metadata, output_filepath, encoding, compression, page_stats);
+  timer.print_elapsed_millis();
+
+  // Read the parquet file written with encoding and compression
+  std::cout << "Reading " << output_filepath << "..." << std::endl;
+
+  // Reset the timer
+  timer.reset();
+  auto [transcoded_input, transcoded_metadata] = read_parquet(output_filepath);
+  timer.print_elapsed_millis();
+
+  // Check for validity
+  try {
+    // Left anti-join the original and transcoded tables
+    // identical tables should not throw an exception and
+    // return an empty indices vector
+    auto const indices = cudf::left_anti_join(
+      input->view(), transcoded_input->view(), cudf::null_equality::EQUAL, resource.get());
+
+    // No exception thrown, check indices
+    auto const valid = indices->size() == 0;
+    std::cout << "Transcoding valid: " << std::boolalpha << valid << std::endl;
+  } catch (std::exception& e) {
+    std::cerr << e.what() << std::endl << std::endl;
+    std::cout << "Transcoding valid: false" << std::endl;
+  }
+
+  return 0;
+}
diff --git a/cpp/examples/parquet_io/parquet_io.hpp b/cpp/examples/parquet_io/parquet_io.hpp
new file mode 100644
index 00000000000..d2fc359a2fe
--- /dev/null
+++ b/cpp/examples/parquet_io/parquet_io.hpp
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/io/parquet.hpp>
+#include <cudf/io/types.hpp>
+#include <cudf/join.hpp>
+#include <cudf/table/table_view.hpp>
+
+#include <rmm/cuda_device.hpp>
+#include <rmm/mr/device/cuda_memory_resource.hpp>
+#include <rmm/mr/device/device_memory_resource.hpp>
+#include <rmm/mr/device/owning_wrapper.hpp>
+#include <rmm/mr/device/pool_memory_resource.hpp>
+
+#include <chrono>
+#include <iostream>
+#include <optional>
+#include <string>
+
+/**
+ * @brief Create memory resource for libcudf functions
+ *
+ * @param pool Whether to use a pool memory resource.
+ * @return Memory resource instance
+ */
+std::shared_ptr<rmm::mr::device_memory_resource> create_memory_resource(bool is_pool_used)
+{
+  auto cuda_mr = std::make_shared<rmm::mr::cuda_memory_resource>();
+  if (is_pool_used) {
+    return rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>(
+      cuda_mr, rmm::percent_of_free_device_memory(50));
+  }
+  return cuda_mr;
+}
+
+/**
+ * @brief Get encoding type from the keyword
+ *
+ * @param name encoding keyword name
+ * @return corresponding column encoding type
+ */
+[[nodiscard]] cudf::io::column_encoding get_encoding_type(std::string name)
+{
+  using encoding_type = cudf::io::column_encoding;
+
+  static const std::unordered_map<std::string_view, cudf::io::column_encoding> map = {
+    {"DEFAULT", encoding_type::USE_DEFAULT},
+    {"DICTIONARY", encoding_type::DICTIONARY},
+    {"PLAIN", encoding_type::PLAIN},
+    {"DELTA_BINARY_PACKED", encoding_type::DELTA_BINARY_PACKED},
+    {"DELTA_LENGTH_BYTE_ARRAY", encoding_type::DELTA_LENGTH_BYTE_ARRAY},
+    {"DELTA_BYTE_ARRAY", encoding_type::DELTA_BYTE_ARRAY},
+  };
+
+  std::transform(name.begin(), name.end(), name.begin(), ::toupper);
+  if (map.find(name) != map.end()) { return map.at(name); }
+  throw std::invalid_argument("FATAL: " + std::string(name) +
+                              " is not a valid encoding type.\n\n"
+                              "Available encoding types: DEFAULT, DICTIONARY, PLAIN,\n"
+                              "DELTA_BINARY_PACKED, DELTA_LENGTH_BYTE_ARRAY,\n"
+                              "DELTA_BYTE_ARRAY\n"
+                              "\n"
+                              "Exiting...\n");
+}
+
+/**
+ * @brief Get compression type from the keyword
+ *
+ * @param name compression keyword name
+ * @return corresponding compression type
+ */
+[[nodiscard]] cudf::io::compression_type get_compression_type(std::string name)
+{
+  using compression_type = cudf::io::compression_type;
+
+  static const std::unordered_map<std::string_view, cudf::io::compression_type> map = {
+    {"NONE", compression_type::NONE},
+    {"AUTO", compression_type::AUTO},
+    {"SNAPPY", compression_type::SNAPPY},
+    {"LZ4", compression_type::LZ4},
+    {"ZSTD", compression_type::ZSTD}};
+
+  std::transform(name.begin(), name.end(), name.begin(), ::toupper);
+  if (map.find(name) != map.end()) { return map.at(name); }
+  throw std::invalid_argument("FATAL: " + std::string(name) +
+                              " is not a valid compression type.\n\n"
+                              "Available compression_type types: NONE, AUTO, SNAPPY,\n"
+                              "LZ4, ZSTD\n"
+                              "\n"
+                              "Exiting...\n");
+}
+
+/**
+ * @brief Get the optional page size stat frequency from they keyword
+ *
+ * @param use_stats keyword affirmation string such as: Y, T, YES, TRUE, ON
+ * @return optional page statistics frequency set to full (STATISTICS_COLUMN)
+ */
+[[nodiscard]] std::optional<cudf::io::statistics_freq> get_page_size_stats(std::string use_stats)
+{
+  std::transform(use_stats.begin(), use_stats.end(), use_stats.begin(), ::toupper);
+
+  // Check if the input string matches to any of the following
+  if (not use_stats.compare("ON") or not use_stats.compare("TRUE") or
+      not use_stats.compare("YES") or not use_stats.compare("Y") or not use_stats.compare("T")) {
+    // Full column and offset indices - STATISTICS_COLUMN
+    return std::make_optional(cudf::io::statistics_freq::STATISTICS_COLUMN);
+  }
+
+  return std::nullopt;
+}
+
+/**
+ * @brief Light-weight timer for parquet reader and writer instrumentation
+ *
+ * Timer object constructed from std::chrono, instrumenting at microseconds
+ * precision. Can display elapsed durations at milli and micro second
+ * scales. Timer starts at object construction.
+ */
+class Timer {
+ public:
+  using micros = std::chrono::microseconds;
+  using millis = std::chrono::milliseconds;
+
+  Timer() { reset(); }
+  void reset() { start_time = std::chrono::high_resolution_clock::now(); }
+  auto elapsed() { return (std::chrono::high_resolution_clock::now() - start_time); }
+  void print_elapsed_micros()
+  {
+    std::cout << "Elapsed Time: " << std::chrono::duration_cast<micros>(elapsed()).count()
+              << "us\n\n";
+  }
+  void print_elapsed_millis()
+  {
+    std::cout << "Elapsed Time: " << std::chrono::duration_cast<millis>(elapsed()).count()
+              << "ms\n\n";
+  }
+
+ private:
+  using time_point_t = std::chrono::time_point<std::chrono::high_resolution_clock>;
+  time_point_t start_time;
+};

From b4bdea295331862949afe408feb47522a4ff8f2a Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Sun, 12 May 2024 23:41:58 -1000
Subject: [PATCH 194/842] Fix ColumnAccessor caching of nrows if empty
 previously (#15710)

https://github.com/rapidsai/cudf/pull/14758 may have propagated a caching invalidation bug of the number of rows in a `ColumnAccessor`

Previously the number of rows was cached and cleared only if an operation caused the `ColumnAccessor` to have no more columns.

However, if the `ColumnAccessor` was empty and operation added new columns, the cached number of rows should have also been cleared.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/15710
---
 python/cudf/cudf/core/column_accessor.py      | 47 +++++++++++++------
 .../cudf/cudf/tests/test_column_accessor.py   | 14 ++++++
 2 files changed, 47 insertions(+), 14 deletions(-)

diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py
index fbce6e02330..9f3de061ee8 100644
--- a/python/cudf/cudf/core/column_accessor.py
+++ b/python/cudf/cudf/core/column_accessor.py
@@ -158,8 +158,10 @@ def __setitem__(self, key: Any, value: Any):
         self.set_by_label(key, value)
 
     def __delitem__(self, key: Any):
+        old_ncols = len(self._data)
         del self._data[key]
-        self._clear_cache()
+        new_ncols = len(self._data)
+        self._clear_cache(old_ncols, new_ncols)
 
     def __len__(self) -> int:
         return len(self._data)
@@ -253,7 +255,17 @@ def _grouped_data(self) -> abc.MutableMapping:
         else:
             return self._data
 
-    def _clear_cache(self):
+    def _clear_cache(self, old_ncols: int, new_ncols: int):
+        """
+        Clear cached attributes.
+
+        Parameters
+        ----------
+        old_ncols: int
+            len(self._data) before self._data was modified
+        new_ncols: int
+            len(self._data) after self._data was modified
+        """
         cached_properties = ("columns", "names", "_grouped_data")
         for attr in cached_properties:
             try:
@@ -261,9 +273,12 @@ def _clear_cache(self):
             except AttributeError:
                 pass
 
-        # nrows should only be cleared if no data is present.
-        if len(self._data) == 0 and hasattr(self, "nrows"):
-            del self.nrows
+        # nrows should only be cleared if empty before/after the op.
+        if (old_ncols == 0) ^ (new_ncols == 0):
+            try:
+                del self.nrows
+            except AttributeError:
+                pass
 
     def to_pandas_index(self) -> pd.Index:
         """Convert the keys of the ColumnAccessor to a Pandas Index object."""
@@ -321,27 +336,27 @@ def insert(
         """
         name = self._pad_key(name)
 
-        ncols = len(self._data)
+        old_ncols = len(self._data)
         if loc == -1:
-            loc = ncols
-        if not (0 <= loc <= ncols):
+            loc = old_ncols
+        if not (0 <= loc <= old_ncols):
             raise ValueError(
-                "insert: loc out of bounds: must be  0 <= loc <= ncols"
+                f"insert: loc out of bounds: must be  0 <= loc <= {old_ncols}"
             )
         # TODO: we should move all insert logic here
         if name in self._data:
             raise ValueError(f"Cannot insert '{name}', already exists")
-        if loc == len(self._data):
+        if loc == old_ncols:
             if validate:
                 value = column.as_column(value)
-                if len(self._data) > 0 and len(value) != self.nrows:
+                if old_ncols > 0 and len(value) != self.nrows:
                     raise ValueError("All columns must be of equal length")
             self._data[name] = value
         else:
             new_keys = self.names[:loc] + (name,) + self.names[loc:]
             new_values = self.columns[:loc] + (value,) + self.columns[loc:]
             self._data = self._data.__class__(zip(new_keys, new_values))
-        self._clear_cache()
+        self._clear_cache(old_ncols, old_ncols + 1)
 
     def copy(self, deep=False) -> ColumnAccessor:
         """
@@ -498,8 +513,10 @@ def set_by_label(self, key: Any, value: Any, validate: bool = True):
             if len(self._data) > 0 and len(value) != self.nrows:
                 raise ValueError("All columns must be of equal length")
 
+        old_ncols = len(self._data)
         self._data[key] = value
-        self._clear_cache()
+        new_ncols = len(self._data)
+        self._clear_cache(old_ncols, new_ncols)
 
     def _select_by_label_list_like(self, key: Any) -> ColumnAccessor:
         # Might be a generator
@@ -673,10 +690,12 @@ def droplevel(self, level):
         if level < 0:
             level += self.nlevels
 
+        old_ncols = len(self._data)
         self._data = {
             _remove_key_level(key, level): value
             for key, value in self._data.items()
         }
+        new_ncols = len(self._data)
         self._level_names = (
             self._level_names[:level] + self._level_names[level + 1 :]
         )
@@ -685,7 +704,7 @@ def droplevel(self, level):
             len(self._level_names) == 1
         ):  # can't use nlevels, as it depends on multiindex
             self.multiindex = False
-        self._clear_cache()
+        self._clear_cache(old_ncols, new_ncols)
 
 
 def _keys_equal(target: Any, key: Any) -> bool:
diff --git a/python/cudf/cudf/tests/test_column_accessor.py b/python/cudf/cudf/tests/test_column_accessor.py
index a8eac2edf2b..f1f6097d6a9 100644
--- a/python/cudf/cudf/tests/test_column_accessor.py
+++ b/python/cudf/cudf/tests/test_column_accessor.py
@@ -293,3 +293,17 @@ def test_replace_level_values_MultiColumn():
 
     got = ca.rename_levels(mapper={"a": "f"}, level=0)
     check_ca_equal(expect, got)
+
+
+def test_clear_nrows_empty_before():
+    ca = ColumnAccessor({})
+    assert ca.nrows == 0
+    ca.insert("new", [1])
+    assert ca.nrows == 1
+
+
+def test_clear_nrows_empty_after():
+    ca = ColumnAccessor({"new": [1]})
+    assert ca.nrows == 1
+    del ca["new"]
+    assert ca.nrows == 0

From c42c4189d3273205a75d7b3c3ab33446eefb7631 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Mon, 13 May 2024 08:06:33 -0400
Subject: [PATCH 195/842] Remove experimental namespace from
 make_strings_children (#15702)

Replaces the `cudf::strings::detail::make_strings_children` with the new `cudf::strings::detail::experimental::make_strings_children`.
No code logic has changed -- just code moved around. All current code was already using the experimental function.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15702
---
 cpp/benchmarks/json/json.cu                   |   4 +-
 .../cudf/strings/detail/strings_children.cuh  | 238 +++++++++++-------
 .../strings/detail/strings_children_ex.cuh    | 186 --------------
 cpp/src/io/csv/writer_impl.cu                 |   4 +-
 cpp/src/io/json/write_json.cu                 |   6 +-
 cpp/src/strings/capitalize.cu                 |   5 +-
 cpp/src/strings/case.cu                       |   8 +-
 cpp/src/strings/char_types/char_types.cu      |   5 +-
 cpp/src/strings/combine/concatenate.cu        |   7 +-
 cpp/src/strings/combine/join.cu               |   4 +-
 cpp/src/strings/combine/join_list_elements.cu |   6 +-
 cpp/src/strings/convert/convert_booleans.cu   |   6 +-
 cpp/src/strings/convert/convert_datetime.cu   |   4 +-
 cpp/src/strings/convert/convert_durations.cu  |  12 +-
 .../strings/convert/convert_fixed_point.cu    |   6 +-
 cpp/src/strings/convert/convert_floats.cu     |   6 +-
 cpp/src/strings/convert/convert_hex.cu        |   6 +-
 cpp/src/strings/convert/convert_integers.cu   |   6 +-
 cpp/src/strings/convert/convert_ipv4.cu       |   8 +-
 cpp/src/strings/convert/convert_lists.cu      |   4 +-
 cpp/src/strings/convert/convert_urls.cu       |   4 +-
 cpp/src/strings/filter_chars.cu               |   5 +-
 cpp/src/strings/padding.cu                    |  10 +-
 cpp/src/strings/repeat_strings.cu             |   8 +-
 cpp/src/strings/replace/multi.cu              |   4 +-
 cpp/src/strings/replace/multi_re.cu           |   4 +-
 cpp/src/strings/replace/replace.cu            |   4 +-
 cpp/src/strings/replace/replace_slice.cu      |   4 +-
 cpp/src/strings/slice.cu                      |   4 +-
 cpp/src/strings/translate.cu                  |   4 +-
 cpp/src/text/detokenize.cu                    |   4 +-
 cpp/src/text/generate_ngrams.cu               |   6 +-
 cpp/src/text/normalize.cu                     |   6 +-
 cpp/src/text/replace.cu                       |   6 +-
 34 files changed, 237 insertions(+), 367 deletions(-)
 delete mode 100644 cpp/include/cudf/strings/detail/strings_children_ex.cuh

diff --git a/cpp/benchmarks/json/json.cu b/cpp/benchmarks/json/json.cu
index eee85f3feeb..06b793bf5f1 100644
--- a/cpp/benchmarks/json/json.cu
+++ b/cpp/benchmarks/json/json.cu
@@ -20,7 +20,7 @@
 
 #include <cudf/column/column_factories.hpp>
 #include <cudf/json/json.hpp>
-#include <cudf/strings/detail/strings_children_ex.cuh>
+#include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/string_view.hpp>
 #include <cudf/strings/strings_column_view.hpp>
@@ -170,7 +170,7 @@ auto build_json_string_column(int desired_bytes, int num_rows)
   auto d_store_order = cudf::column_device_view::create(float_2bool_columns->get_column(2));
   json_benchmark_row_builder jb{
     desired_bytes, num_rows, {*d_books, *d_bicycles}, *d_book_pct, *d_misc_order, *d_store_order};
-  auto [offsets, chars] = cudf::strings::detail::experimental::make_strings_children(
+  auto [offsets, chars] = cudf::strings::detail::make_strings_children(
     jb, num_rows, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
   return cudf::make_strings_column(num_rows, std::move(offsets), chars.release(), 0, {});
 }
diff --git a/cpp/include/cudf/strings/detail/strings_children.cuh b/cpp/include/cudf/strings/detail/strings_children.cuh
index 35812c0573d..f105a6dc546 100644
--- a/cpp/include/cudf/strings/detail/strings_children.cuh
+++ b/cpp/include/cudf/strings/detail/strings_children.cuh
@@ -17,7 +17,9 @@
 
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_factories.hpp>
+#include <cudf/detail/offsets_iterator_factory.cuh>
 #include <cudf/detail/sizes_to_offsets_iterator.cuh>
+#include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/strings/detail/utilities.hpp>
 #include <cudf/utilities/default_stream.hpp>
 
@@ -34,94 +36,6 @@ namespace cudf {
 namespace strings {
 namespace detail {
 
-/**
- * @brief Creates child offsets and chars data by applying the template function that
- * can be used for computing the output size of each string as well as create the output
- *
- * @throws std::overflow_error if the output strings column exceeds the column size limit
- *
- * @tparam SizeAndExecuteFunction Function must accept an index and return a size.
- *         It must also have members d_offsets and d_chars which are set to
- *         memory containing the offsets and chars columns during write.
- *
- * @param size_and_exec_fn This is called twice. Once for the output size of each string
- *        and once again to fill in the memory pointed to by d_chars.
- * @param exec_size Number of rows for executing the `size_and_exec_fn` function.
- * @param strings_count Number of strings.
- * @param stream CUDA stream used for device memory operations and kernel launches.
- * @param mr Device memory resource used to allocate the returned columns' device memory.
- * @return Offsets child column and chars data for a strings column
- */
-template <typename SizeAndExecuteFunction>
-auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn,
-                           size_type exec_size,
-                           size_type strings_count,
-                           rmm::cuda_stream_view stream,
-                           rmm::device_async_resource_ref mr)
-{
-  auto offsets_column = make_numeric_column(
-    data_type{type_to_id<size_type>()}, strings_count + 1, mask_state::UNALLOCATED, stream, mr);
-  auto offsets_view          = offsets_column->mutable_view();
-  auto d_offsets             = offsets_view.template data<int32_t>();
-  size_and_exec_fn.d_offsets = d_offsets;
-
-  // This is called twice -- once for offsets and once for chars.
-  // Reducing the number of places size_and_exec_fn is inlined speeds up compile time.
-  auto for_each_fn = [exec_size, stream](SizeAndExecuteFunction& size_and_exec_fn) {
-    thrust::for_each_n(rmm::exec_policy(stream),
-                       thrust::make_counting_iterator<size_type>(0),
-                       exec_size,
-                       size_and_exec_fn);
-  };
-
-  // Compute the output sizes
-  for_each_fn(size_and_exec_fn);
-
-  // Convert the sizes to offsets
-  auto const bytes =
-    cudf::detail::sizes_to_offsets(d_offsets, d_offsets + strings_count + 1, d_offsets, stream);
-  CUDF_EXPECTS(bytes <= std::numeric_limits<size_type>::max(),
-               "Size of output exceeds the column size limit",
-               std::overflow_error);
-
-  // Now build the chars column
-  rmm::device_uvector<char> chars(bytes, stream, mr);
-
-  // Execute the function fn again to fill the chars column.
-  // Note that if the output chars column has zero size, the function fn should not be called to
-  // avoid accidentally overwriting the offsets.
-  if (bytes > 0) {
-    size_and_exec_fn.d_chars = chars.data();
-    for_each_fn(size_and_exec_fn);
-  }
-
-  return std::pair(std::move(offsets_column), std::move(chars));
-}
-
-/**
- * @brief Creates child offsets and chars columns by applying the template function that
- * can be used for computing the output size of each string as well as create the output.
- *
- * @tparam SizeAndExecuteFunction Function must accept an index and return a size.
- *         It must also have members d_offsets and d_chars which are set to
- *         memory containing the offsets and chars columns during write.
- *
- * @param size_and_exec_fn This is called twice. Once for the output size of each string
- *        and once again to fill in the memory pointed to by d_chars.
- * @param strings_count Number of strings.
- * @param stream CUDA stream used for device memory operations and kernel launches.
- * @param mr Device memory resource used to allocate the returned columns' device memory.
- * @return offsets child column and chars child column for a strings column
- */
-template <typename SizeAndExecuteFunction>
-auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn,
-                           size_type strings_count,
-                           rmm::cuda_stream_view stream,
-                           rmm::device_async_resource_ref mr)
-{
-  return make_strings_children(size_and_exec_fn, strings_count, strings_count, stream, mr);
-}
-
 /**
  * @brief Create an offsets column to be a child of a compound column
  *
@@ -182,6 +96,154 @@ std::pair<std::unique_ptr<column>, int64_t> make_offsets_child_column(
   return std::pair(std::move(offsets_column), total_bytes);
 }
 
+/**
+ * @brief Kernel used by make_strings_children for calling the given functor
+ *
+ * @tparam SizeAndExecuteFunction Functor type to call in each thread
+ *
+ * @param fn Functor to call in each thread
+ * @param exec_size Total number of threads to be processed by this kernel
+ */
+template <typename SizeAndExecuteFunction>
+CUDF_KERNEL void strings_children_kernel(SizeAndExecuteFunction fn, size_type exec_size)
+{
+  auto tid = cudf::detail::grid_1d::global_thread_id();
+  if (tid < exec_size) { fn(tid); }
+}
+
+/**
+ * @brief Creates child offsets and chars data by applying the template function that
+ * can be used for computing the output size of each string as well as create the output
+ *
+ * The `size_and_exec_fn` is expected declare an operator() function with a size_type parameter
+ * and 3 member variables:
+ * - `d_sizes`: output size in bytes of each output row for the 1st pass call
+ * - `d_chars`: output buffer for new string data for the 2nd pass call
+ * - `d_offsets`: used for addressing the specific output row data in `d_chars`
+ *
+ * The 1st pass call computes the output sizes and is identified by `d_chars==nullptr`.
+ * Null rows should be set with an output size of 0.
+ *
+ * @code{.cpp}
+ * struct size_and_exec_fn {
+ *  size_type* d_sizes;
+ *  char* d_chars;
+ *  input_offsetalator d_offsets;
+ *
+ *   __device__ void operator()(size_type thread_idx)
+ *   {
+ *     // functor-specific logic to resolve out_idx from thread_idx
+ *     if( !d_chars ) {
+ *       d_sizes[out_idx] = output_size;
+ *     } else {
+ *       auto d_output = d_chars + d_offsets[out_idx];
+ *       // write characters to d_output
+ *     }
+ *   }
+ * };
+ * @endcode
+ *
+ * @tparam SizeAndExecuteFunction Functor type with an operator() function accepting
+ *         an index parameter and three member variables: `size_type* d_sizes`
+ *         `char* d_chars`, and `input_offsetalator d_offsets`.
+ *
+ * @param size_and_exec_fn This is called twice. Once for the output size of each string
+ *        and once again to fill in the memory pointed to by d_chars.
+ * @param exec_size Number of threads for executing the `size_and_exec_fn` function
+ * @param strings_count Number of strings
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned columns' device memory
+ * @return Offsets child column and chars vector for creating a strings column
+ */
+template <typename SizeAndExecuteFunction>
+auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn,
+                           size_type exec_size,
+                           size_type strings_count,
+                           rmm::cuda_stream_view stream,
+                           rmm::device_async_resource_ref mr)
+{
+  // This is called twice -- once for computing sizes and once for writing chars.
+  // Reducing the number of places size_and_exec_fn is inlined speeds up compile time.
+  auto for_each_fn = [exec_size, stream](SizeAndExecuteFunction& size_and_exec_fn) {
+    auto constexpr block_size = 256;
+    auto grid                 = cudf::detail::grid_1d{exec_size, block_size};
+    strings_children_kernel<<<grid.num_blocks, block_size, 0, stream.value()>>>(size_and_exec_fn,
+                                                                                exec_size);
+  };
+
+  // Compute the output sizes
+  auto output_sizes        = rmm::device_uvector<size_type>(strings_count, stream);
+  size_and_exec_fn.d_sizes = output_sizes.data();
+  size_and_exec_fn.d_chars = nullptr;
+  for_each_fn(size_and_exec_fn);
+
+  // Convert the sizes to offsets
+  auto [offsets_column, bytes] = cudf::strings::detail::make_offsets_child_column(
+    output_sizes.begin(), output_sizes.end(), stream, mr);
+  size_and_exec_fn.d_offsets =
+    cudf::detail::offsetalator_factory::make_input_iterator(offsets_column->view());
+
+  // Now build the chars column
+  rmm::device_uvector<char> chars(bytes, stream, mr);
+  size_and_exec_fn.d_chars = chars.data();
+
+  // Execute the function fn again to fill in the chars data.
+  if (bytes > 0) { for_each_fn(size_and_exec_fn); }
+
+  return std::pair(std::move(offsets_column), std::move(chars));
+}
+
+/**
+ * @brief Creates child offsets and chars columns by applying the template function that
+ * can be used for computing the output size of each string as well as create the output
+ *
+ * The `size_and_exec_fn` is expected declare an operator() function with a size_type parameter
+ * and 3 member variables:
+ * - `d_sizes`: output size in bytes of each output row for the 1st pass call
+ * - `d_chars`: output buffer for new string data for the 2nd pass call
+ * - `d_offsets`: used for addressing the specific output row data in `d_chars`
+ *
+ * The 1st pass call computes the output sizes and is identified by `d_chars==nullptr`.
+ * Null rows should be set with an output size of 0.
+ *
+ * @code{.cpp}
+ * struct size_and_exec_fn {
+ *  size_type* d_sizes;
+ *  char* d_chars;
+ *  input_offsetalator d_offsets;
+ *
+ *   __device__ void operator()(size_type idx)
+ *   {
+ *     if( !d_chars ) {
+ *       d_sizes[idx] = output_size;
+ *     } else {
+ *       auto d_output = d_chars + d_offsets[idx];
+ *       // write characters to d_output
+ *     }
+ *   }
+ * };
+ * @endcode
+ *
+ * @tparam SizeAndExecuteFunction Functor type with an operator() function accepting
+ *         an index parameter and three member variables: `size_type* d_sizes`
+ *         `char* d_chars`, and `input_offsetalator d_offsets`.
+ *
+ * @param size_and_exec_fn This is called twice. Once for the output size of each string
+ *        and once again to fill in the memory pointed to by `d_chars`.
+ * @param strings_count Number of strings
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned columns' device memory
+ * @return Offsets child column and chars vector for creating a strings column
+ */
+template <typename SizeAndExecuteFunction>
+auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn,
+                           size_type strings_count,
+                           rmm::cuda_stream_view stream,
+                           rmm::device_async_resource_ref mr)
+{
+  return make_strings_children(size_and_exec_fn, strings_count, strings_count, stream, mr);
+}
+
 }  // namespace detail
 }  // namespace strings
 }  // namespace cudf
diff --git a/cpp/include/cudf/strings/detail/strings_children_ex.cuh b/cpp/include/cudf/strings/detail/strings_children_ex.cuh
deleted file mode 100644
index 6028c7e2437..00000000000
--- a/cpp/include/cudf/strings/detail/strings_children_ex.cuh
+++ /dev/null
@@ -1,186 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-#include <cudf/column/column.hpp>
-#include <cudf/detail/offsets_iterator_factory.cuh>
-#include <cudf/detail/utilities/cuda.cuh>
-#include <cudf/strings/detail/strings_children.cuh>
-
-#include <rmm/cuda_stream_view.hpp>
-#include <rmm/device_uvector.hpp>
-#include <rmm/exec_policy.hpp>
-
-#include <thrust/for_each.h>
-#include <thrust/iterator/counting_iterator.h>
-
-namespace cudf {
-namespace strings {
-namespace detail {
-namespace experimental {
-
-/**
- * @brief Kernel used by make_strings_children for calling the given functor
- *
- * @tparam SizeAndExecuteFunction Functor type to call in each thread
- *
- * @param fn Functor to call in each thread
- * @param exec_size Total number of threads to be processed by this kernel
- */
-template <typename SizeAndExecuteFunction>
-CUDF_KERNEL void strings_children_kernel(SizeAndExecuteFunction fn, size_type exec_size)
-{
-  auto tid = cudf::detail::grid_1d::global_thread_id();
-  if (tid < exec_size) { fn(tid); }
-}
-
-/**
- * @brief Creates child offsets and chars data by applying the template function that
- * can be used for computing the output size of each string as well as create the output
- *
- * The `size_and_exec_fn` is expected declare an operator() function with a size_type parameter
- * and 3 member variables:
- * - `d_sizes`: output size in bytes of each output row for the 1st pass call
- * - `d_chars`: output buffer for new string data for the 2nd pass call
- * - `d_offsets`: used for addressing the specific output row data in `d_chars`
- *
- * The 1st pass call computes the output sizes and is identified by `d_chars==nullptr`.
- * Null rows should be set with an output size of 0.
- *
- * @code{.cpp}
- * struct size_and_exec_fn {
- *  size_type* d_sizes;
- *  char* d_chars;
- *  input_offsetalator d_offsets;
- *
- *   __device__ void operator()(size_type thread_idx)
- *   {
- *     // functor-specific logic to resolve out_idx from thread_idx
- *     if( !d_chars ) {
- *       d_sizes[out_idx] = output_size;
- *     } else {
- *       auto d_output = d_chars + d_offsets[out_idx];
- *       // write characters to d_output
- *     }
- *   }
- * };
- * @endcode
- *
- * @tparam SizeAndExecuteFunction Functor type with an operator() function accepting
- *         an index parameter and three member variables: `size_type* d_sizes`
- *         `char* d_chars`, and `input_offsetalator d_offsets`.
- *
- * @param size_and_exec_fn This is called twice. Once for the output size of each string
- *        and once again to fill in the memory pointed to by d_chars.
- * @param exec_size Number of threads for executing the `size_and_exec_fn` function
- * @param strings_count Number of strings
- * @param stream CUDA stream used for device memory operations and kernel launches
- * @param mr Device memory resource used to allocate the returned columns' device memory
- * @return Offsets child column and chars vector for creating a strings column
- */
-template <typename SizeAndExecuteFunction>
-auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn,
-                           size_type exec_size,
-                           size_type strings_count,
-                           rmm::cuda_stream_view stream,
-                           rmm::device_async_resource_ref mr)
-{
-  // This is called twice -- once for computing sizes and once for writing chars.
-  // Reducing the number of places size_and_exec_fn is inlined speeds up compile time.
-  auto for_each_fn = [exec_size, stream](SizeAndExecuteFunction& size_and_exec_fn) {
-    auto constexpr block_size = 256;
-    auto grid                 = cudf::detail::grid_1d{exec_size, block_size};
-    strings_children_kernel<<<grid.num_blocks, block_size, 0, stream.value()>>>(size_and_exec_fn,
-                                                                                exec_size);
-  };
-
-  // Compute the output sizes
-  auto output_sizes        = rmm::device_uvector<size_type>(strings_count, stream);
-  size_and_exec_fn.d_sizes = output_sizes.data();
-  size_and_exec_fn.d_chars = nullptr;
-  for_each_fn(size_and_exec_fn);
-
-  // Convert the sizes to offsets
-  auto [offsets_column, bytes] = cudf::strings::detail::make_offsets_child_column(
-    output_sizes.begin(), output_sizes.end(), stream, mr);
-  size_and_exec_fn.d_offsets =
-    cudf::detail::offsetalator_factory::make_input_iterator(offsets_column->view());
-
-  // Now build the chars column
-  rmm::device_uvector<char> chars(bytes, stream, mr);
-  size_and_exec_fn.d_chars = chars.data();
-
-  // Execute the function fn again to fill in the chars data.
-  if (bytes > 0) { for_each_fn(size_and_exec_fn); }
-
-  return std::pair(std::move(offsets_column), std::move(chars));
-}
-
-/**
- * @brief Creates child offsets and chars columns by applying the template function that
- * can be used for computing the output size of each string as well as create the output
- *
- * The `size_and_exec_fn` is expected declare an operator() function with a size_type parameter
- * and 3 member variables:
- * - `d_sizes`: output size in bytes of each output row for the 1st pass call
- * - `d_chars`: output buffer for new string data for the 2nd pass call
- * - `d_offsets`: used for addressing the specific output row data in `d_chars`
- *
- * The 1st pass call computes the output sizes and is identified by `d_chars==nullptr`.
- * Null rows should be set with an output size of 0.
- *
- * @code{.cpp}
- * struct size_and_exec_fn {
- *  size_type* d_sizes;
- *  char* d_chars;
- *  input_offsetalator d_offsets;
- *
- *   __device__ void operator()(size_type idx)
- *   {
- *     if( !d_chars ) {
- *       d_sizes[idx] = output_size;
- *     } else {
- *       auto d_output = d_chars + d_offsets[idx];
- *       // write characters to d_output
- *     }
- *   }
- * };
- * @endcode
- *
- * @tparam SizeAndExecuteFunction Functor type with an operator() function accepting
- *         an index parameter and three member variables: `size_type* d_sizes`
- *         `char* d_chars`, and `input_offsetalator d_offsets`.
- *
- * @param size_and_exec_fn This is called twice. Once for the output size of each string
- *        and once again to fill in the memory pointed to by `d_chars`.
- * @param strings_count Number of strings
- * @param stream CUDA stream used for device memory operations and kernel launches
- * @param mr Device memory resource used to allocate the returned columns' device memory
- * @return Offsets child column and chars vector for creating a strings column
- */
-template <typename SizeAndExecuteFunction>
-auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn,
-                           size_type strings_count,
-                           rmm::cuda_stream_view stream,
-                           rmm::device_async_resource_ref mr)
-{
-  return make_strings_children(size_and_exec_fn, strings_count, strings_count, stream, mr);
-}
-
-}  // namespace experimental
-}  // namespace detail
-}  // namespace strings
-}  // namespace cudf
diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu
index 58a74654405..7c4d5711281 100644
--- a/cpp/src/io/csv/writer_impl.cu
+++ b/cpp/src/io/csv/writer_impl.cu
@@ -33,7 +33,7 @@
 #include <cudf/strings/detail/combine.hpp>
 #include <cudf/strings/detail/converters.hpp>
 #include <cudf/strings/detail/replace.hpp>
-#include <cudf/strings/detail/strings_children_ex.cuh>
+#include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/utilities/error.hpp>
@@ -183,7 +183,7 @@ struct column_to_strings_fn {
     auto d_column = column_device_view::create(column_v, stream_);
     escape_strings_fn fn{*d_column, delimiter.value(stream_)};
     auto [offsets_column, chars] =
-      cudf::strings::detail::experimental::make_strings_children(fn, column_v.size(), stream_, mr_);
+      cudf::strings::detail::make_strings_children(fn, column_v.size(), stream_, mr_);
 
     return make_strings_column(column_v.size(),
                                std::move(offsets_column),
diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu
index cac7149dabe..997d6fd99f8 100644
--- a/cpp/src/io/json/write_json.cu
+++ b/cpp/src/io/json/write_json.cu
@@ -36,7 +36,7 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/detail/combine.hpp>
 #include <cudf/strings/detail/converters.hpp>
-#include <cudf/strings/detail/strings_children_ex.cuh>
+#include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/structs/structs_column_view.hpp>
 #include <cudf/table/table.hpp>
@@ -171,8 +171,8 @@ struct escape_strings_fn {
                                               rmm::cuda_stream_view stream,
                                               rmm::device_async_resource_ref mr)
   {
-    auto [offsets_column, chars] = cudf::strings::detail::experimental::make_strings_children(
-      *this, column_v.size(), stream, mr);
+    auto [offsets_column, chars] =
+      cudf::strings::detail::make_strings_children(*this, column_v.size(), stream, mr);
 
     return make_strings_column(column_v.size(),
                                std::move(offsets_column),
diff --git a/cpp/src/strings/capitalize.cu b/cpp/src/strings/capitalize.cu
index 031fff4086a..3f7a98381b8 100644
--- a/cpp/src/strings/capitalize.cu
+++ b/cpp/src/strings/capitalize.cu
@@ -20,7 +20,7 @@
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/strings/capitalize.hpp>
 #include <cudf/strings/detail/char_tables.hpp>
-#include <cudf/strings/detail/strings_children_ex.cuh>
+#include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/detail/utf8.hpp>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
@@ -231,8 +231,7 @@ std::unique_ptr<column> capitalizer(CapitalFn cfn,
                                     rmm::cuda_stream_view stream,
                                     rmm::device_async_resource_ref mr)
 {
-  auto [offsets_column, chars] =
-    cudf::strings::detail::experimental::make_strings_children(cfn, input.size(), stream, mr);
+  auto [offsets_column, chars] = make_strings_children(cfn, input.size(), stream, mr);
 
   return make_strings_column(input.size(),
                              std::move(offsets_column),
diff --git a/cpp/src/strings/case.cu b/cpp/src/strings/case.cu
index 5d5e6ba9a3e..77c014301ba 100644
--- a/cpp/src/strings/case.cu
+++ b/cpp/src/strings/case.cu
@@ -23,7 +23,7 @@
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/strings/case.hpp>
 #include <cudf/strings/detail/char_tables.hpp>
-#include <cudf/strings/detail/strings_children_ex.cuh>
+#include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/detail/utf8.hpp>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
@@ -296,8 +296,7 @@ std::unique_ptr<column> convert_case(strings_column_view const& input,
 
   // For smaller strings, use the regular string-parallel algorithm
   if ((chars_size / (input.size() - input.null_count())) < AVG_CHAR_BYTES_THRESHOLD) {
-    auto [offsets, chars] = cudf::strings::detail::experimental::make_strings_children(
-      converter, input.size(), stream, mr);
+    auto [offsets, chars] = make_strings_children(converter, input.size(), stream, mr);
     return make_strings_column(input.size(),
                                std::move(offsets),
                                chars.release(),
@@ -365,8 +364,7 @@ std::unique_ptr<column> convert_case(strings_column_view const& input,
   // run case conversion over the new sub-strings
   auto const tmp_size = static_cast<size_type>(tmp_offsets.size()) - 1;
   upper_lower_ls_fn sub_conv{ccfn, input_chars, tmp_offsets.data()};
-  auto chars = std::get<1>(
-    cudf::strings::detail::experimental::make_strings_children(sub_conv, tmp_size, stream, mr));
+  auto chars = std::get<1>(make_strings_children(sub_conv, tmp_size, stream, mr));
 
   return make_strings_column(input.size(),
                              std::move(offsets),
diff --git a/cpp/src/strings/char_types/char_types.cu b/cpp/src/strings/char_types/char_types.cu
index 7716cf0cc29..58137aced0f 100644
--- a/cpp/src/strings/char_types/char_types.cu
+++ b/cpp/src/strings/char_types/char_types.cu
@@ -21,7 +21,7 @@
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/strings/char_types/char_types.hpp>
 #include <cudf/strings/detail/char_tables.hpp>
-#include <cudf/strings/detail/strings_children_ex.cuh>
+#include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/detail/utf8.hpp>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/string_view.cuh>
@@ -202,8 +202,7 @@ std::unique_ptr<column> filter_characters_of_type(strings_column_view const& str
   rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr);
 
   // this utility calls filterer to build the offsets and chars columns
-  auto [offsets_column, chars] =
-    cudf::strings::detail::experimental::make_strings_children(filterer, strings_count, stream, mr);
+  auto [offsets_column, chars] = make_strings_children(filterer, strings_count, stream, mr);
 
   // return new strings column
   return make_strings_column(strings_count,
diff --git a/cpp/src/strings/combine/concatenate.cu b/cpp/src/strings/combine/concatenate.cu
index 97008fa94f8..a2c77c5e77f 100644
--- a/cpp/src/strings/combine/concatenate.cu
+++ b/cpp/src/strings/combine/concatenate.cu
@@ -22,7 +22,7 @@
 #include <cudf/scalar/scalar_device_view.cuh>
 #include <cudf/strings/combine.hpp>
 #include <cudf/strings/detail/combine.hpp>
-#include <cudf/strings/detail/strings_children_ex.cuh>
+#include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
@@ -145,7 +145,7 @@ std::unique_ptr<column> concatenate(table_view const& strings_columns,
   // Create device views from the strings columns.
   auto d_table = table_device_view::create(strings_columns, stream);
   concat_strings_fn fn{*d_table, d_separator, d_narep, separate_nulls};
-  auto [offsets_column, chars] = experimental::make_strings_children(fn, strings_count, stream, mr);
+  auto [offsets_column, chars] = make_strings_children(fn, strings_count, stream, mr);
 
   // create resulting null mask
   auto [null_mask, null_count] = cudf::detail::valid_if(
@@ -237,8 +237,7 @@ std::unique_ptr<column> concatenate(table_view const& strings_columns,
 
   multi_separator_concat_fn mscf{
     *d_table, separator_col_view, separator_rep, col_rep, separate_nulls};
-  auto [offsets_column, chars] =
-    experimental::make_strings_children(mscf, strings_count, stream, mr);
+  auto [offsets_column, chars] = make_strings_children(mscf, strings_count, stream, mr);
 
   // Create resulting null mask
   auto [null_mask, null_count] = cudf::detail::valid_if(
diff --git a/cpp/src/strings/combine/join.cu b/cpp/src/strings/combine/join.cu
index 2e30e01df21..c4cc0dbe09d 100644
--- a/cpp/src/strings/combine/join.cu
+++ b/cpp/src/strings/combine/join.cu
@@ -22,7 +22,7 @@
 #include <cudf/scalar/scalar_device_view.cuh>
 #include <cudf/strings/combine.hpp>
 #include <cudf/strings/detail/combine.hpp>
-#include <cudf/strings/detail/strings_children_ex.cuh>
+#include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/detail/strings_column_factories.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/string_view.cuh>
@@ -150,7 +150,7 @@ std::unique_ptr<column> join_strings(strings_column_view const& input,
     if ((input.size() == input.null_count()) ||
         ((input.chars_size(stream) / (input.size() - input.null_count())) <=
          AVG_CHAR_BYTES_THRESHOLD)) {
-      return std::get<1>(experimental::make_strings_children(
+      return std::get<1>(make_strings_children(
                            join_fn{*d_strings, d_separator, d_narep}, input.size(), stream, mr))
         .release();
     }
diff --git a/cpp/src/strings/combine/join_list_elements.cu b/cpp/src/strings/combine/join_list_elements.cu
index b0073452741..f5dfc1a2012 100644
--- a/cpp/src/strings/combine/join_list_elements.cu
+++ b/cpp/src/strings/combine/join_list_elements.cu
@@ -22,7 +22,7 @@
 #include <cudf/lists/lists_column_view.hpp>
 #include <cudf/scalar/scalar_device_view.cuh>
 #include <cudf/strings/combine.hpp>
-#include <cudf/strings/detail/strings_children_ex.cuh>
+#include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
@@ -209,7 +209,7 @@ std::unique_ptr<column> join_list_elements(lists_column_view const& lists_string
                                                     separate_nulls,
                                                     empty_list_policy};
 
-  auto [offsets_column, chars] = experimental::make_strings_children(comp_fn, num_rows, stream, mr);
+  auto [offsets_column, chars] = make_strings_children(comp_fn, num_rows, stream, mr);
   auto [null_mask, null_count] =
     cudf::detail::valid_if(thrust::counting_iterator<size_type>(0),
                            thrust::counting_iterator<size_type>(num_rows),
@@ -284,7 +284,7 @@ std::unique_ptr<column> join_list_elements(lists_column_view const& lists_string
                                                     separate_nulls,
                                                     empty_list_policy};
 
-  auto [offsets_column, chars] = experimental::make_strings_children(comp_fn, num_rows, stream, mr);
+  auto [offsets_column, chars] = make_strings_children(comp_fn, num_rows, stream, mr);
   auto [null_mask, null_count] =
     cudf::detail::valid_if(thrust::counting_iterator<size_type>(0),
                            thrust::counting_iterator<size_type>(num_rows),
diff --git a/cpp/src/strings/convert/convert_booleans.cu b/cpp/src/strings/convert/convert_booleans.cu
index 6b64006fa24..d4ccb685061 100644
--- a/cpp/src/strings/convert/convert_booleans.cu
+++ b/cpp/src/strings/convert/convert_booleans.cu
@@ -20,7 +20,7 @@
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/strings/convert/convert_booleans.hpp>
 #include <cudf/strings/detail/converters.hpp>
-#include <cudf/strings/detail/strings_children_ex.cuh>
+#include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
@@ -140,8 +140,8 @@ std::unique_ptr<column> from_booleans(column_view const& booleans,
   // copy null mask
   rmm::device_buffer null_mask = cudf::detail::copy_bitmask(booleans, stream, mr);
 
-  auto [offsets, chars] = experimental::make_strings_children(
-    from_booleans_fn{d_column, d_true, d_false}, strings_count, stream, mr);
+  auto [offsets, chars] =
+    make_strings_children(from_booleans_fn{d_column, d_true, d_false}, strings_count, stream, mr);
 
   return make_strings_column(strings_count,
                              std::move(offsets),
diff --git a/cpp/src/strings/convert/convert_datetime.cu b/cpp/src/strings/convert/convert_datetime.cu
index ddf68eae951..2f4ebf97264 100644
--- a/cpp/src/strings/convert/convert_datetime.cu
+++ b/cpp/src/strings/convert/convert_datetime.cu
@@ -22,7 +22,7 @@
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/strings/convert/convert_datetime.hpp>
 #include <cudf/strings/detail/converters.hpp>
-#include <cudf/strings/detail/strings_children_ex.cuh>
+#include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
@@ -1109,7 +1109,7 @@ struct dispatch_from_timestamps_fn {
                               rmm::cuda_stream_view stream,
                               rmm::device_async_resource_ref mr) const
   {
-    return experimental::make_strings_children(
+    return make_strings_children(
       datetime_formatter_fn<T>{d_timestamps, d_format_names, d_format_items},
       d_timestamps.size(),
       stream,
diff --git a/cpp/src/strings/convert/convert_durations.cu b/cpp/src/strings/convert/convert_durations.cu
index faf9a83f016..2e4a776d3c0 100644
--- a/cpp/src/strings/convert/convert_durations.cu
+++ b/cpp/src/strings/convert/convert_durations.cu
@@ -17,7 +17,7 @@
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/strings/detail/convert/int_to_string.cuh>
-#include <cudf/strings/detail/strings_children_ex.cuh>
+#include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 
@@ -414,11 +414,11 @@ struct dispatch_from_durations_fn {
     // copy null mask
     rmm::device_buffer null_mask = cudf::detail::copy_bitmask(durations, stream, mr);
 
-    auto [offsets, chars] = experimental::make_strings_children(
-      from_durations_fn<T>{d_column, d_format_items, compiler.items_count()},
-      strings_count,
-      stream,
-      mr);
+    auto [offsets, chars] =
+      make_strings_children(from_durations_fn<T>{d_column, d_format_items, compiler.items_count()},
+                            strings_count,
+                            stream,
+                            mr);
 
     return make_strings_column(strings_count,
                                std::move(offsets),
diff --git a/cpp/src/strings/convert/convert_fixed_point.cu b/cpp/src/strings/convert/convert_fixed_point.cu
index 34f81b8b407..73089ad407e 100644
--- a/cpp/src/strings/convert/convert_fixed_point.cu
+++ b/cpp/src/strings/convert/convert_fixed_point.cu
@@ -23,7 +23,7 @@
 #include <cudf/strings/detail/convert/fixed_point.cuh>
 #include <cudf/strings/detail/convert/fixed_point_to_string.cuh>
 #include <cudf/strings/detail/converters.hpp>
-#include <cudf/strings/detail/strings_children_ex.cuh>
+#include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
@@ -242,8 +242,8 @@ struct dispatch_from_fixed_point_fn {
 
     auto const d_column = column_device_view::create(input, stream);
 
-    auto [offsets, chars] = experimental::make_strings_children(
-      from_fixed_point_fn<DecimalType>{*d_column}, input.size(), stream, mr);
+    auto [offsets, chars] =
+      make_strings_children(from_fixed_point_fn<DecimalType>{*d_column}, input.size(), stream, mr);
 
     return make_strings_column(input.size(),
                                std::move(offsets),
diff --git a/cpp/src/strings/convert/convert_floats.cu b/cpp/src/strings/convert/convert_floats.cu
index 0ed80b976fd..bd7b411d3c3 100644
--- a/cpp/src/strings/convert/convert_floats.cu
+++ b/cpp/src/strings/convert/convert_floats.cu
@@ -21,7 +21,7 @@
 #include <cudf/strings/convert/convert_floats.hpp>
 #include <cudf/strings/detail/convert/string_to_float.cuh>
 #include <cudf/strings/detail/converters.hpp>
-#include <cudf/strings/detail/strings_children_ex.cuh>
+#include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
@@ -403,8 +403,8 @@ struct dispatch_from_floats_fn {
     // copy the null mask
     rmm::device_buffer null_mask = cudf::detail::copy_bitmask(floats, stream, mr);
 
-    auto [offsets, chars] = experimental::make_strings_children(
-      from_floats_fn<FloatType>{d_column}, strings_count, stream, mr);
+    auto [offsets, chars] =
+      make_strings_children(from_floats_fn<FloatType>{d_column}, strings_count, stream, mr);
 
     return make_strings_column(strings_count,
                                std::move(offsets),
diff --git a/cpp/src/strings/convert/convert_hex.cu b/cpp/src/strings/convert/convert_hex.cu
index 1f9fc3858f8..a34b148a951 100644
--- a/cpp/src/strings/convert/convert_hex.cu
+++ b/cpp/src/strings/convert/convert_hex.cu
@@ -19,7 +19,7 @@
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/strings/convert/convert_integers.hpp>
-#include <cudf/strings/detail/strings_children_ex.cuh>
+#include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
@@ -182,8 +182,8 @@ struct dispatch_integers_to_hex_fn {
   {
     auto const d_column = column_device_view::create(input, stream);
 
-    auto [offsets_column, chars] = experimental::make_strings_children(
-      integer_to_hex_fn<IntegerType>{*d_column}, input.size(), stream, mr);
+    auto [offsets_column, chars] =
+      make_strings_children(integer_to_hex_fn<IntegerType>{*d_column}, input.size(), stream, mr);
 
     return make_strings_column(input.size(),
                                std::move(offsets_column),
diff --git a/cpp/src/strings/convert/convert_integers.cu b/cpp/src/strings/convert/convert_integers.cu
index 918369ead4d..aeabc71d300 100644
--- a/cpp/src/strings/convert/convert_integers.cu
+++ b/cpp/src/strings/convert/convert_integers.cu
@@ -23,7 +23,7 @@
 #include <cudf/strings/detail/convert/int_to_string.cuh>
 #include <cudf/strings/detail/convert/string_to_int.cuh>
 #include <cudf/strings/detail/converters.hpp>
-#include <cudf/strings/detail/strings_children_ex.cuh>
+#include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
@@ -362,8 +362,8 @@ struct dispatch_from_integers_fn {
     // copy the null mask
     rmm::device_buffer null_mask = cudf::detail::copy_bitmask(integers, stream, mr);
 
-    auto [offsets, chars] = experimental::make_strings_children(
-      from_integers_fn<IntegerType>{d_column}, strings_count, stream, mr);
+    auto [offsets, chars] =
+      make_strings_children(from_integers_fn<IntegerType>{d_column}, strings_count, stream, mr);
 
     return make_strings_column(strings_count,
                                std::move(offsets),
diff --git a/cpp/src/strings/convert/convert_ipv4.cu b/cpp/src/strings/convert/convert_ipv4.cu
index 33f6c553001..68a24e000ae 100644
--- a/cpp/src/strings/convert/convert_ipv4.cu
+++ b/cpp/src/strings/convert/convert_ipv4.cu
@@ -20,7 +20,7 @@
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/strings/convert/convert_ipv4.hpp>
 #include <cudf/strings/detail/convert/int_to_string.cuh>
-#include <cudf/strings/detail/strings_children_ex.cuh>
+#include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
@@ -167,9 +167,9 @@ std::unique_ptr<column> integers_to_ipv4(column_view const& integers,
 
   CUDF_EXPECTS(integers.type().id() == type_id::INT64, "Input column must be type_id::INT64 type");
 
-  auto d_column                = column_device_view::create(integers, stream);
-  auto [offsets_column, chars] = experimental::make_strings_children(
-    integers_to_ipv4_fn{*d_column}, integers.size(), stream, mr);
+  auto d_column = column_device_view::create(integers, stream);
+  auto [offsets_column, chars] =
+    make_strings_children(integers_to_ipv4_fn{*d_column}, integers.size(), stream, mr);
 
   return make_strings_column(integers.size(),
                              std::move(offsets_column),
diff --git a/cpp/src/strings/convert/convert_lists.cu b/cpp/src/strings/convert/convert_lists.cu
index 198e6c11ef3..604f928430b 100644
--- a/cpp/src/strings/convert/convert_lists.cu
+++ b/cpp/src/strings/convert/convert_lists.cu
@@ -17,7 +17,7 @@
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/strings/convert/convert_lists.hpp>
-#include <cudf/strings/detail/strings_children_ex.cuh>
+#include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/utilities/default_stream.hpp>
@@ -218,7 +218,7 @@ std::unique_ptr<column> format_list_column(lists_column_view const& input,
   auto const d_separators = column_device_view::create(separators.parent(), stream);
   auto const d_na_rep     = na_rep.value(stream);
 
-  auto [offsets_column, chars] = experimental::make_strings_children(
+  auto [offsets_column, chars] = make_strings_children(
     format_lists_fn{*d_input, *d_separators, d_na_rep, stack_buffer.data(), depth},
     input.size(),
     stream,
diff --git a/cpp/src/strings/convert/convert_urls.cu b/cpp/src/strings/convert/convert_urls.cu
index d9920be045f..39907a38f2f 100644
--- a/cpp/src/strings/convert/convert_urls.cu
+++ b/cpp/src/strings/convert/convert_urls.cu
@@ -22,7 +22,7 @@
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/strings/convert/convert_urls.hpp>
-#include <cudf/strings/detail/strings_children_ex.cuh>
+#include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
@@ -134,7 +134,7 @@ std::unique_ptr<column> url_encode(strings_column_view const& input,
   auto d_column = column_device_view::create(input.parent(), stream);
 
   auto [offsets_column, chars] =
-    experimental::make_strings_children(url_encoder_fn{*d_column}, input.size(), stream, mr);
+    make_strings_children(url_encoder_fn{*d_column}, input.size(), stream, mr);
 
   return make_strings_column(input.size(),
                              std::move(offsets_column),
diff --git a/cpp/src/strings/filter_chars.cu b/cpp/src/strings/filter_chars.cu
index 4705ae519cd..a34828fa97e 100644
--- a/cpp/src/strings/filter_chars.cu
+++ b/cpp/src/strings/filter_chars.cu
@@ -22,7 +22,7 @@
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
-#include <cudf/strings/detail/strings_children_ex.cuh>
+#include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
@@ -141,8 +141,7 @@ std::unique_ptr<column> filter_characters(
 
   // this utility calls the strip_fn to build the offsets and chars columns
   filter_fn ffn{*d_strings, keep_characters, table.begin(), table.end(), d_replacement};
-  auto [offsets_column, chars] =
-    cudf::strings::detail::experimental::make_strings_children(ffn, strings.size(), stream, mr);
+  auto [offsets_column, chars] = make_strings_children(ffn, strings.size(), stream, mr);
 
   return make_strings_column(strings_count,
                              std::move(offsets_column),
diff --git a/cpp/src/strings/padding.cu b/cpp/src/strings/padding.cu
index 3cfbf79a8f3..0d146108436 100644
--- a/cpp/src/strings/padding.cu
+++ b/cpp/src/strings/padding.cu
@@ -19,7 +19,7 @@
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/strings/detail/pad_impl.cuh>
-#include <cudf/strings/detail/strings_children_ex.cuh>
+#include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/padding.hpp>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
@@ -117,13 +117,13 @@ std::unique_ptr<column> pad(strings_column_view const& input,
   auto [offsets_column, chars] = [&] {
     if (side == side_type::LEFT) {
       auto fn = pad_fn<side_type::LEFT>{*d_strings, width, fill_char_size, d_fill_char};
-      return experimental::make_strings_children(fn, input.size(), stream, mr);
+      return make_strings_children(fn, input.size(), stream, mr);
     } else if (side == side_type::RIGHT) {
       auto fn = pad_fn<side_type::RIGHT>{*d_strings, width, fill_char_size, d_fill_char};
-      return experimental::make_strings_children(fn, input.size(), stream, mr);
+      return make_strings_children(fn, input.size(), stream, mr);
     }
     auto fn = pad_fn<side_type::BOTH>{*d_strings, width, fill_char_size, d_fill_char};
-    return experimental::make_strings_children(fn, input.size(), stream, mr);
+    return make_strings_children(fn, input.size(), stream, mr);
   }();
 
   return make_strings_column(input.size(),
@@ -154,7 +154,7 @@ std::unique_ptr<column> zfill(strings_column_view const& input,
 
   auto d_strings = column_device_view::create(input.parent(), stream);
   auto [offsets_column, chars] =
-    experimental::make_strings_children(zfill_fn{*d_strings, width}, input.size(), stream, mr);
+    make_strings_children(zfill_fn{*d_strings, width}, input.size(), stream, mr);
 
   return make_strings_column(input.size(),
                              std::move(offsets_column),
diff --git a/cpp/src/strings/repeat_strings.cu b/cpp/src/strings/repeat_strings.cu
index de1d5e38e00..022f1eb3232 100644
--- a/cpp/src/strings/repeat_strings.cu
+++ b/cpp/src/strings/repeat_strings.cu
@@ -21,7 +21,7 @@
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/offsets_iterator.cuh>
-#include <cudf/strings/detail/strings_children_ex.cuh>
+#include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/repeat_strings.hpp>
 #include <cudf/strings/strings_column_view.hpp>
@@ -166,8 +166,8 @@ std::unique_ptr<column> repeat_strings(strings_column_view const& input,
   auto const strings_dv_ptr = column_device_view::create(input.parent(), stream);
   auto const fn = compute_size_and_repeat_fn{*strings_dv_ptr, repeat_times, input.has_nulls()};
 
-  auto [offsets_column, chars] = experimental::make_strings_children(
-    fn, strings_count * repeat_times, strings_count, stream, mr);
+  auto [offsets_column, chars] =
+    make_strings_children(fn, strings_count * repeat_times, strings_count, stream, mr);
   return make_strings_column(strings_count,
                              std::move(offsets_column),
                              chars.release(),
@@ -251,7 +251,7 @@ std::unique_ptr<column> repeat_strings(strings_column_view const& input,
                                                              input.has_nulls(),
                                                              repeat_times.has_nulls()};
 
-  auto [offsets_column, chars] = experimental::make_strings_children(fn, strings_count, stream, mr);
+  auto [offsets_column, chars] = make_strings_children(fn, strings_count, stream, mr);
 
   // We generate new bitmask by AND of the two input columns' bitmasks.
   // Note that if either of the input columns are nullable, the output column will also be nullable
diff --git a/cpp/src/strings/replace/multi.cu b/cpp/src/strings/replace/multi.cu
index 9abcca7a5e6..9025234aa52 100644
--- a/cpp/src/strings/replace/multi.cu
+++ b/cpp/src/strings/replace/multi.cu
@@ -23,7 +23,7 @@
 #include <cudf/detail/utilities/algorithm.cuh>
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/strings/detail/replace.hpp>
-#include <cudf/strings/detail/strings_children_ex.cuh>
+#include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/detail/strings_column_factories.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/replace.hpp>
@@ -462,7 +462,7 @@ std::unique_ptr<column> replace_string_parallel(strings_column_view const& input
   auto d_targets      = column_device_view::create(targets.parent(), stream);
   auto d_replacements = column_device_view::create(repls.parent(), stream);
 
-  auto [offsets_column, chars] = cudf::strings::detail::experimental::make_strings_children(
+  auto [offsets_column, chars] = make_strings_children(
     replace_multi_fn{*d_strings, *d_targets, *d_replacements}, input.size(), stream, mr);
 
   return make_strings_column(input.size(),
diff --git a/cpp/src/strings/replace/multi_re.cu b/cpp/src/strings/replace/multi_re.cu
index b9a3acf747f..cd60a4296b9 100644
--- a/cpp/src/strings/replace/multi_re.cu
+++ b/cpp/src/strings/replace/multi_re.cu
@@ -23,7 +23,7 @@
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
-#include <cudf/strings/detail/strings_children_ex.cuh>
+#include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/replace_re.hpp>
 #include <cudf/strings/string_view.cuh>
@@ -187,7 +187,7 @@ std::unique_ptr<column> replace_re(strings_column_view const& input,
 
   auto found_ranges = rmm::device_uvector<found_range>(d_progs.size() * input.size(), stream);
 
-  auto [offsets_column, chars] = experimental::make_strings_children(
+  auto [offsets_column, chars] = make_strings_children(
     replace_multi_regex_fn{*d_strings, d_progs, found_ranges.data(), *d_repls},
     input.size(),
     stream,
diff --git a/cpp/src/strings/replace/replace.cu b/cpp/src/strings/replace/replace.cu
index df8526fa942..501e6d547e6 100644
--- a/cpp/src/strings/replace/replace.cu
+++ b/cpp/src/strings/replace/replace.cu
@@ -22,7 +22,7 @@
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/utilities/algorithm.cuh>
 #include <cudf/strings/detail/replace.hpp>
-#include <cudf/strings/detail/strings_children_ex.cuh>
+#include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/detail/strings_column_factories.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/replace.hpp>
@@ -399,7 +399,7 @@ std::unique_ptr<column> replace_string_parallel(strings_column_view const& input
 {
   auto d_strings = column_device_view::create(input.parent(), stream);
 
-  auto [offsets_column, chars] = cudf::strings::detail::experimental::make_strings_children(
+  auto [offsets_column, chars] = make_strings_children(
     replace_fn{*d_strings, d_target, d_replacement, maxrepl}, input.size(), stream, mr);
 
   return make_strings_column(input.size(),
diff --git a/cpp/src/strings/replace/replace_slice.cu b/cpp/src/strings/replace/replace_slice.cu
index 54e84dfe504..04d81218a16 100644
--- a/cpp/src/strings/replace/replace_slice.cu
+++ b/cpp/src/strings/replace/replace_slice.cu
@@ -19,7 +19,7 @@
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/strings/detail/replace.hpp>
-#include <cudf/strings/detail/strings_children_ex.cuh>
+#include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/replace.hpp>
 #include <cudf/strings/string_view.cuh>
@@ -95,7 +95,7 @@ std::unique_ptr<column> replace_slice(strings_column_view const& input,
   auto d_strings = column_device_view::create(input.parent(), stream);
 
   // this utility calls the given functor to build the offsets and chars columns
-  auto [offsets_column, chars] = cudf::strings::detail::experimental::make_strings_children(
+  auto [offsets_column, chars] = make_strings_children(
     replace_slice_fn{*d_strings, d_repl, start, stop}, input.size(), stream, mr);
 
   return make_strings_column(input.size(),
diff --git a/cpp/src/strings/slice.cu b/cpp/src/strings/slice.cu
index 972a4ffd58e..cf82a837c51 100644
--- a/cpp/src/strings/slice.cu
+++ b/cpp/src/strings/slice.cu
@@ -21,7 +21,7 @@
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/scalar/scalar_device_view.cuh>
-#include <cudf/strings/detail/strings_children_ex.cuh>
+#include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/slice.hpp>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
@@ -208,7 +208,7 @@ std::unique_ptr<column> slice_strings(strings_column_view const& strings,
   auto const d_stop  = get_scalar_device_view(const_cast<numeric_scalar<size_type>&>(stop));
   auto const d_step  = get_scalar_device_view(const_cast<numeric_scalar<size_type>&>(step));
 
-  auto [offsets, chars] = experimental::make_strings_children(
+  auto [offsets, chars] = make_strings_children(
     substring_fn{*d_column, d_start, d_stop, d_step}, strings.size(), stream, mr);
 
   return make_strings_column(strings.size(),
diff --git a/cpp/src/strings/translate.cu b/cpp/src/strings/translate.cu
index 75bc46d30c4..16b22d0de4c 100644
--- a/cpp/src/strings/translate.cu
+++ b/cpp/src/strings/translate.cu
@@ -20,7 +20,7 @@
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
-#include <cudf/strings/detail/strings_children_ex.cuh>
+#include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/strings/translate.hpp>
@@ -112,7 +112,7 @@ std::unique_ptr<column> translate(strings_column_view const& strings,
 
   auto d_strings = column_device_view::create(strings.parent(), stream);
 
-  auto [offsets_column, chars] = cudf::strings::detail::experimental::make_strings_children(
+  auto [offsets_column, chars] = make_strings_children(
     translate_fn{*d_strings, table.begin(), table.end()}, strings.size(), stream, mr);
 
   return make_strings_column(strings.size(),
diff --git a/cpp/src/text/detokenize.cu b/cpp/src/text/detokenize.cu
index 2efeeee0ee9..6635b61093e 100644
--- a/cpp/src/text/detokenize.cu
+++ b/cpp/src/text/detokenize.cu
@@ -20,7 +20,7 @@
 #include <cudf/detail/indexalator.cuh>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/sorting.hpp>
-#include <cudf/strings/detail/strings_children_ex.cuh>
+#include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
@@ -158,7 +158,7 @@ std::unique_ptr<cudf::column> detokenize(cudf::strings_column_view const& string
 
   cudf::string_view const d_separator(separator.data(), separator.size());
 
-  auto [offsets_column, chars] = cudf::strings::detail::experimental::make_strings_children(
+  auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(
     detokenizer_fn{*strings_column, d_row_map, tokens_offsets.data(), d_separator},
     output_count,
     stream,
diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu
index fdd165a54bc..724f3603f29 100644
--- a/cpp/src/text/generate_ngrams.cu
+++ b/cpp/src/text/generate_ngrams.cu
@@ -22,7 +22,7 @@
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/sizes_to_offsets_iterator.cuh>
 #include <cudf/hashing/detail/murmurhash3_x86_32.cuh>
-#include <cudf/strings/detail/strings_children_ex.cuh>
+#include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
@@ -142,7 +142,7 @@ std::unique_ptr<cudf::column> generate_ngrams(cudf::strings_column_view const& s
   // compute the number of strings of ngrams
   auto const ngrams_count = strings_count - ngrams + 1;
 
-  auto [offsets_column, chars] = cudf::strings::detail::experimental::make_strings_children(
+  auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(
     ngram_generator_fn{d_strings, ngrams, d_separator}, ngrams_count, stream, mr);
 
   // make the output strings column from the offsets and chars column
@@ -235,7 +235,7 @@ std::unique_ptr<cudf::column> generate_character_ngrams(cudf::strings_column_vie
                "Insufficient number of characters in each string to generate ngrams");
 
   character_ngram_generator_fn generator{*d_strings, ngrams, d_offsets};
-  auto [offsets_column, chars] = cudf::strings::detail::experimental::make_strings_children(
+  auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(
     generator, strings_count, total_ngrams, stream, mr);
 
   auto output = cudf::make_strings_column(
diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu
index 2f97eb1ce74..4db11dc5beb 100644
--- a/cpp/src/text/normalize.cu
+++ b/cpp/src/text/normalize.cu
@@ -26,7 +26,7 @@
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
-#include <cudf/strings/detail/strings_children_ex.cuh>
+#include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/detail/strings_column_factories.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/string_view.cuh>
@@ -185,7 +185,7 @@ std::unique_ptr<cudf::column> normalize_spaces(cudf::strings_column_view const&
   auto d_strings = cudf::column_device_view::create(strings.parent(), stream);
 
   // build offsets and children using the normalize_space_fn
-  auto [offsets_column, chars] = cudf::strings::detail::experimental::make_strings_children(
+  auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(
     normalize_spaces_fn{*d_strings}, strings.size(), stream, mr);
 
   return cudf::make_strings_column(strings.size(),
@@ -227,7 +227,7 @@ std::unique_ptr<cudf::column> normalize_characters(cudf::strings_column_view con
   auto d_strings = cudf::column_device_view::create(strings.parent(), stream);
 
   // build offsets and children using the codepoint_to_utf8_fn
-  auto [offsets_column, chars] = cudf::strings::detail::experimental::make_strings_children(
+  auto [offsets_column, chars] = cudf::strings::detail::make_strings_children(
     codepoint_to_utf8_fn{*d_strings, cp_chars, cp_offsets}, strings.size(), stream, mr);
 
   return cudf::make_strings_column(strings.size(),
diff --git a/cpp/src/text/replace.cu b/cpp/src/text/replace.cu
index f95b53a3ac8..84ed1827117 100644
--- a/cpp/src/text/replace.cu
+++ b/cpp/src/text/replace.cu
@@ -21,7 +21,7 @@
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
-#include <cudf/strings/detail/strings_children_ex.cuh>
+#include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
@@ -232,7 +232,7 @@ std::unique_ptr<cudf::column> replace_tokens(cudf::strings_column_view const& st
 
   // this utility calls replacer to build the offsets and chars columns
   auto [offsets_column, chars] =
-    cudf::strings::detail::experimental::make_strings_children(replacer, strings_count, stream, mr);
+    cudf::strings::detail::make_strings_children(replacer, strings_count, stream, mr);
 
   // return new strings column
   return cudf::make_strings_column(strings_count,
@@ -265,7 +265,7 @@ std::unique_ptr<cudf::column> filter_tokens(cudf::strings_column_view const& str
 
   // this utility calls filterer to build the offsets and chars columns
   auto [offsets_column, chars] =
-    cudf::strings::detail::experimental::make_strings_children(filterer, strings_count, stream, mr);
+    cudf::strings::detail::make_strings_children(filterer, strings_count, stream, mr);
 
   // return new strings column
   return cudf::make_strings_column(strings_count,

From 915c6bea2069f75b5637ff39befd877ba37a1922 Mon Sep 17 00:00:00 2001
From: Robert Maynard <rmaynard@nvidia.com>
Date: Mon, 13 May 2024 10:19:24 -0400
Subject: [PATCH 196/842] Correct static builds + static arrow (#15715)

Correct the CMake logic in arrow so that we can properly build cudf + arrow statically

Fixes https://github.com/rapidsai/cudf/issues/15714

Authors:
  - Robert Maynard (https://github.com/robertmaynard)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)
  - Jake Awe (https://github.com/AyodeAwe)

URL: https://github.com/rapidsai/cudf/pull/15715
---
 ci/configure_cpp_static.sh           |  2 +-
 cpp/cmake/thirdparty/get_arrow.cmake | 15 ++++++++++++++-
 2 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/ci/configure_cpp_static.sh b/ci/configure_cpp_static.sh
index d1f9e0d1399..11d5585d98f 100755
--- a/ci/configure_cpp_static.sh
+++ b/ci/configure_cpp_static.sh
@@ -18,4 +18,4 @@ rapids-dependency-file-generator \
 python -m pip install -r "${REQUIREMENTS_FILE}"
 pyenv rehash
 
-cmake -S cpp -B build_static -GNinja -DBUILD_SHARED_LIBS=OFF -DBUILD_TESTS=OFF
+cmake -S cpp -B build_static -GNinja -DBUILD_SHARED_LIBS=OFF -DCUDF_USE_ARROW_STATIC=ON -DBUILD_TESTS=OFF
diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake
index 70283efbd79..e9d2f479088 100644
--- a/cpp/cmake/thirdparty/get_arrow.cmake
+++ b/cpp/cmake/thirdparty/get_arrow.cmake
@@ -303,7 +303,20 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB
         "
       )
     endif()
-
+    rapids_cmake_install_lib_dir(lib_dir)
+    if(TARGET arrow_static)
+      get_target_property(interface_libs arrow_static INTERFACE_LINK_LIBRARIES)
+      # The `arrow_static` library is leaking a dependency on the object libraries it was built with
+      # we need to remove this from the interface, since keeping them around would cause duplicate
+      # symbols and CMake export errors
+      if(interface_libs MATCHES "arrow_array" AND interface_libs MATCHES "arrow_compute")
+        string(REPLACE "BUILD_INTERFACE:" "BUILD_LOCAL_INTERFACE:" interface_libs
+                       "${interface_libs}"
+        )
+        set_target_properties(arrow_static PROPERTIES INTERFACE_LINK_LIBRARIES "${interface_libs}")
+        get_target_property(interface_libs arrow_static INTERFACE_LINK_LIBRARIES)
+      endif()
+    endif()
     rapids_export(
       BUILD Arrow
       VERSION ${VERSION}

From 149253b2e9f3801fdcc88c17e31a25788fe6381a Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Mon, 13 May 2024 16:08:05 +0100
Subject: [PATCH 197/842] Skeleton cudf polars package (#15688)

Introduce the skeleton of a cudf_polars package. Note that we are deliberately not building any packages yet.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Mike Sarahan (https://github.com/msarahan)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15688
---
 .pre-commit-config.yaml                    |   3 +
 dependencies.yaml                          |  28 ++++
 python/cudf_polars/LICENSE                 |   1 +
 python/cudf_polars/README.md               |   1 +
 python/cudf_polars/cudf_polars/VERSION     |   1 +
 python/cudf_polars/cudf_polars/__init__.py |  13 ++
 python/cudf_polars/pyproject.toml          | 171 +++++++++++++++++++++
 7 files changed, 218 insertions(+)
 create mode 120000 python/cudf_polars/LICENSE
 create mode 120000 python/cudf_polars/README.md
 create mode 120000 python/cudf_polars/cudf_polars/VERSION
 create mode 100644 python/cudf_polars/cudf_polars/__init__.py
 create mode 100644 python/cudf_polars/pyproject.toml

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 0ae745257cb..d44462236b2 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -22,6 +22,8 @@ repos:
         # project can specify its own first/third-party packages.
         args: ["--config-root=python/", "--resolve-all-configs"]
         files: python/.*
+        exclude: |
+          (?x)^(^python/cudf_polars/.*)
         types_or: [python, cython, pyi]
   - repo: https://github.com/MarcoGorelli/cython-lint
     rev: v0.16.2
@@ -36,6 +38,7 @@ repos:
                "python/cudf/cudf",
                "python/custreamz/custreamz",
                "python/cudf_kafka/cudf_kafka",
+               "python/cudf_polars/cudf_polars",
                "python/dask_cudf/dask_cudf"]
         pass_filenames: false
   - repo: https://github.com/nbQA-dev/nbQA
diff --git a/dependencies.yaml b/dependencies.yaml
index 7fe67817f73..27b0f23389c 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -119,6 +119,29 @@ files:
       key: cudf-pandas-tests
     includes:
       - test_python_cudf_pandas
+  py_build_cudf_polars:
+    output: pyproject
+    pyproject_dir: python/cudf_polars
+    extras:
+      table: build-system
+    includes:
+      - build_wheels
+  py_run_cudf_polars:
+    output: pyproject
+    pyproject_dir: python/cudf_polars
+    extras:
+      table: project
+    includes:
+      - run_cudf_polars
+      - depends_on_cudf
+  py_test_cudf_polars:
+    output: pyproject
+    pyproject_dir: python/cudf_polars
+    extras:
+      table: project.optional-dependencies
+      key: test
+    includes:
+      - test_python_common
   py_build_dask_cudf:
     output: pyproject
     pyproject_dir: python/dask_cudf
@@ -559,6 +582,11 @@ dependencies:
       - output_types: pyproject
         matrices:
           - {matrix: null, packages: [cubinlinker, ptxcompiler, *rmm_conda] }
+  run_cudf_polars:
+    common:
+      - output_types: [conda, requirements, pyproject]
+        packages:
+          - polars>=0.20.24
   run_dask_cudf:
     common:
       - output_types: [conda, requirements, pyproject]
diff --git a/python/cudf_polars/LICENSE b/python/cudf_polars/LICENSE
new file mode 120000
index 00000000000..30cff7403da
--- /dev/null
+++ b/python/cudf_polars/LICENSE
@@ -0,0 +1 @@
+../../LICENSE
\ No newline at end of file
diff --git a/python/cudf_polars/README.md b/python/cudf_polars/README.md
new file mode 120000
index 00000000000..fe840054137
--- /dev/null
+++ b/python/cudf_polars/README.md
@@ -0,0 +1 @@
+../../README.md
\ No newline at end of file
diff --git a/python/cudf_polars/cudf_polars/VERSION b/python/cudf_polars/cudf_polars/VERSION
new file mode 120000
index 00000000000..d62dc733efd
--- /dev/null
+++ b/python/cudf_polars/cudf_polars/VERSION
@@ -0,0 +1 @@
+../../../VERSION
\ No newline at end of file
diff --git a/python/cudf_polars/cudf_polars/__init__.py b/python/cudf_polars/cudf_polars/__init__.py
new file mode 100644
index 00000000000..74547fe2448
--- /dev/null
+++ b/python/cudf_polars/cudf_polars/__init__.py
@@ -0,0 +1,13 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+An executor for polars logical plans.
+
+This package implements an executor for polars logical plans using
+pylibcudf to execute the plans on device.
+"""
+
+from __future__ import annotations
+
+__all__: list[str] = []
diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml
new file mode 100644
index 00000000000..de26a3eb51c
--- /dev/null
+++ b/python/cudf_polars/pyproject.toml
@@ -0,0 +1,171 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+[build-system]
+build-backend = "setuptools.build_meta"
+requires = [
+    "setuptools",
+    "wheel",
+] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
+
+[project]
+name = "cudf-polars"
+dynamic = ["version"]
+description = "Executor for polars using cudf"
+readme = { file = "README.md", content-type = "text/markdown" }
+authors = [
+    { name = "NVIDIA Corporation" },
+]
+license = { text = "Apache 2.0" }
+requires-python = ">=3.9"
+dependencies = [
+    "cudf==24.6.*",
+    "polars>=0.20.24",
+] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
+classifiers = [
+    "Intended Audience :: Developers",
+    "Topic :: Database",
+    "Topic :: Scientific/Engineering",
+    "License :: OSI Approved :: Apache Software License",
+    "Programming Language :: Python",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+]
+
+[project.optional-dependencies]
+test = [
+    "pytest-cov",
+    "pytest-xdist",
+    "pytest<8",
+] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
+
+[project.urls]
+Homepage = "https://github.com/rapidsai/cudf"
+
+[tool.setuptools]
+license-files = ["LICENSE"]
+
+[tool.setuptools.dynamic]
+version = {file = "cudf_polars/VERSION"}
+
+[tool.setuptools.packages.find]
+exclude = ["*tests*"]
+
+[tool.ruff]
+line-length = 88
+indent-width = 4
+target-version = "py39"
+fix = true
+
+[tool.ruff.lint]
+# __init__.py must re-export everything it imports
+ignore-init-module-imports = false
+select = [
+  "E", # pycodestyle
+  "W", # pycodestyle
+  "F", # Pyflakes
+  "B", # flake8-bugbear
+  "C4", # flake8-comprehensions
+  "D", # flake8-docstrings
+  "D213", # Augment NumPy docstring convention: Multi-line docstring summary should start at the second line
+  "D417", # Augment NumPy docstring convention: Missing argument descriptions
+  "I", # isort
+  "ISC", # flake8-implicit-str-concat
+  "INP", # flake8-no-pep420 (namespace packages)
+  "SIM", # flake8-simplify
+  "TCH", # flake8-type-checking
+  "TID", # flake8-tidy-imports
+  "PLC", # pylint-convention
+  "PLE", # pylint-error
+  # Not enabling PLR (pylint-refactor) since it conflicts with other rules
+  "PLW", # pylint-warning
+  "PERF", # perflint
+  "UP", # pyupgrade
+  "PT", # flake8-pytest-style
+  # https://docs.astral.sh/ruff/rules/#flake8-return-ret
+  "RET502", # no implicit return
+  "RET503", # no implicit return
+  "RET504", # no implicit return
+  "RUF", # Ruff-specific rules
+  "PTH", # flake8-use-pathlib
+  "FA", # flake8-future-annotations
+  "PIE", # flake8-pie
+  "TD", # flake8-todos
+  "TRY", # tryceratops
+  "FBT", # flake8-boolean-trap
+]
+
+ignore = [
+  # Line length regulated by formatter
+  "E501",
+  # pydocstyle: http://www.pydocstyle.org/en/stable/error_codes.html
+  "D401", # Relax NumPy docstring convention: First line should be in imperative mood
+  # flake8-pytest-style:
+  "PT011", # pytest.raises({exception}) is too broad, set the match parameter or use a more specific exception
+  # flake8-simplify
+  "SIM108", # Use ternary operator
+  # flake8-todos
+  "TD002", # Missing author in TODO
+  "TD003", # Missing issue link on the line following this TODO
+  # tryceratops
+  "TRY003", # Avoid specifying long messages outside the exception class
+  # Lints below are turned off because of conflicts with the ruff
+  # formatter
+  # See https://docs.astral.sh/ruff/formatter/#conflicting-lint-rules
+  "W191", # tab-indentation
+  "E111", # indentation-with-invalid-multiple
+  "E114", # indentation-with-invalid-multiple-comment
+  "E117", # over-indented
+  "D206", # indent-with-spaces
+  "D300", # triple-single-quotes
+  "Q000", # bad-quotes-inline-string
+  "Q001", # bad-quotes-multiline-string
+  "Q002", # bad-quotes-docstring
+  "Q003", # avoidable-escaped-quote
+  "COM812", # missing-trailing-comma
+  "COM819", # prohibited-trailing-comma
+  "ISC001", # single-line-implicit-string-concatenation
+  "ISC002", # multi-line-implicit-string-concatenation
+]
+fixable = ["ALL"]
+
+[tool.ruff.lint.flake8-pytest-style]
+# https://docs.astral.sh/ruff/settings/#lintflake8-pytest-style
+fixture-parentheses = false
+mark-parentheses = false
+parametrize-names-type = "csv"
+parametrize-values-type = "list"
+parametrize-values-row-type = "tuple"
+
+[tool.ruff.lint.pydocstyle]
+convention = "numpy"
+
+[tool.ruff.lint.flake8-tidy-imports]
+ban-relative-imports = "all"
+
+[tool.ruff.lint.flake8-type-checking]
+strict = true
+
+[tool.ruff.lint.isort]
+case-sensitive = true
+combine-as-imports = true
+order-by-type = true
+known-first-party = ["cudf_polars"]
+default-section = "third-party"
+section-order = [
+  "future",
+  "standard-library",
+  "third-party",
+  "polars",
+  "rapids",
+  "first-party",
+  "local-folder"
+]
+required-imports = ["from __future__ import annotations"]
+
+[tool.ruff.lint.isort.sections]
+polars = ["polars"]
+rapids = ["rmm", "cudf"]
+
+[tool.ruff.format]
+docstring-code-format = true

From 13f028f01ad043b0d24f3e4a28f4267c02806390 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Mon, 13 May 2024 11:39:50 -0400
Subject: [PATCH 198/842] Update libcudf developer guide for strings offsets
 column (#15661)

Updates the libcudf Developer Guide to better describe the strings offsets child column and include the offsetalator.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/15661
---
 .../developer_guide/DEVELOPER_GUIDE.md        | 96 ++++++++++++++-----
 1 file changed, 71 insertions(+), 25 deletions(-)

diff --git a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md
index 05f8e4585cc..ff80c2daab8 100644
--- a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md
+++ b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md
@@ -1,4 +1,4 @@
-# libcudf C++ Developer Guide {#DEVELOPER_GUIDE}
+# libcudf C++ Developer Guide
 
 This document serves as a guide for contributors to libcudf C++ code. Developers should also refer
 to these additional files for further documentation of libcudf best practices.
@@ -828,7 +828,7 @@ This iterator returns the validity of the underlying element (`true` or `false`)
 
 The proliferation of data types supported by libcudf can result in long compile times. One area
 where compile time was a problem is in types used to store indices, which can be any integer type.
-The "Indexalator", or index-normalizing iterator (`include/cudf/detail/indexalator.cuh`), can be
+The "indexalator", or index-normalizing iterator (`include/cudf/detail/indexalator.cuh`), can be
 used for index types (integers) without requiring a type-specific instance. It can be used for any
 iterator interface for reading an array of integer values of type `int8`, `int16`, `int32`,
 `int64`, `uint8`, `uint16`, `uint32`, or `uint64`. Reading specific elements always returns a
@@ -856,6 +856,41 @@ thrust::lower_bound(rmm::exec_policy(stream),
                     thrust::less<Element>());
 ```
 
+### Offset-normalizing iterators
+
+Like the [indexalator](#index-normalizing-iterators),
+the "offsetalator", or offset-normalizing iterator (`include/cudf/detail/offsetalator.cuh`), can be
+used for offset column types (`INT32` or `INT64` only) without requiring a type-specific instance.
+This is helpful when reading or building [strings columns](#strings-columns).
+The normalized type is `int64` which means an `input_offsetsalator` will return `int64` type values
+for both `INT32` and `INT64` offsets columns.
+Likewise, an `output_offselator` can accept `int64` type values to store into either an
+`INT32` or `INT64` output offsets column created appropriately.
+
+Use the `cudf::detail::offsetalator_factory` to create an appropriate input or output iterator from an offsets column_view.
+Example input iterator usage:
+
+```c++
+  // convert the sizes to offsets
+  auto [offsets, char_bytes] = cudf::strings::detail::make_offsets_child_column(
+    output_sizes.begin(), output_sizes.end(), stream, mr);
+  auto d_offsets =
+    cudf::detail::offsetalator_factory::make_input_iterator(offsets->view());
+  // use d_offsets to address the output row bytes
+```
+
+Example output iterator usage:
+
+```c++
+    // create offsets column as either INT32 or INT64 depending on the number of bytes
+    auto offsets_column = cudf::strings::detail::create_offsets_child_column(total_bytes,
+                                                                             offsets_count,
+                                                                             stream, mr);
+    auto d_offsets =
+      cudf::detail::offsetalator_factory::make_output_iterator(offsets_column->mutable_view());
+    // write appropriate offset values to d_offsets
+```
+
 ## Namespaces
 
 ### External
@@ -1241,18 +1276,20 @@ This is related to [Arrow's "Variable-Size List" memory layout](https://arrow.ap
 
 Strings are represented as a column with a data device buffer and a child offsets column.
 The parent column's type is `STRING` and its data holds all the characters across all the strings packed together
-but its size represents the number of strings in the column, and its null mask represents the
-validity of each string. To summarize, the strings column children are:
-
-1. A non-nullable column of [`size_type`](#cudfsize_type) elements that indicates the offset to the beginning of each
-   string in a dense data buffer of all characters.
+but its size represents the number of strings in the column and its null mask represents the
+validity of each string.
 
-With this representation, `data[offsets[i]]` is the first character of string `i`, and the
-size of string `i` is given by `offsets[i+1] - offsets[i]`. The following image shows an example of
-this compound column representation of strings.
+The strings column contains a single, non-nullable child column
+of offset elements that indicates the byte position offset to the beginning of each
+string in the dense data buffer of all characters. With this representation, `data[offsets[i]]` is the
+first character of string `i`, and the size of string `i` is given by `offsets[i+1] - offsets[i]`.
+The following image shows an example of this compound column representation of strings.
 
 ![strings](strings.png)
 
+The type of the offsets column is either `INT32` or `INT64` depending on the number of bytes in the data buffer.
+See [`cudf::strings_view`](#cudfstrings_column_view-and-cudfstring_view) for more information on processing individual string rows.
+
 ## Structs columns
 
 A struct is a nested data type with a set of child columns each representing an individual field
@@ -1295,7 +1332,7 @@ struct column's layout is as follows. (Note that null masks should be read from
 }
 ```
 
-The last struct row (index 3) is not null, but has a null value in the INT32 field. Also, row 2 of
+The last struct row (index 3) is not null, but has a null value in the `INT32` field. Also, row 2 of
 the struct column is null, making its corresponding fields also null. Therefore, bit 2 is unset in
 the null masks of both struct fields.
 
@@ -1351,18 +1388,27 @@ libcudf provides view types for nested column types as well as for the data elem
 
 ### cudf::strings_column_view and cudf::string_view
 
-`cudf::strings_column_view` is a view of a strings column, like `cudf::column_view` is a view of
-any `cudf::column`. `cudf::string_view` is a view of a single string, and therefore
-`cudf::string_view` is the data type of a `cudf::column` of type `STRING` just like `int32_t` is the
-data type for a `cudf::column` of type [`size_type`](#cudfsize_type). As its name implies, this is a
-read-only object instance that points to device memory inside the strings column. It's lifespan is
-the same (or less) as the column it views.
+A `cudf::strings_column_view` wraps a strings column and contains a parent
+`cudf::column_view` as a view of the strings column and an offsets `cudf::column_view`
+which is a child of the parent.
+The parent view contains the offset, size, and validity mask for the strings column.
+The offsets view is non-nullable with `offset()==0` and its own size.
+Since the offset column type can be either `INT32` or `INT64` it is useful to use the
+offset normalizing iterators [offsetalator](#offset-normalizing-iterators) to access individual offset values.
+
+A `cudf::string_view` is a view of a single string and therefore
+is the data type of a `cudf::column` of type `STRING` just like `int32_t` is the
+data type for a `cudf::column` of type `INT32`. As its name implies, this is a
+read-only object instance that points to device memory inside the strings column.
+Its lifespan is the same (or less) as the column it views.
+An individual strings column row and a `cudf::string_view` is limited to [`size_type`](#cudfsize_type) bytes.
 
 Use the `column_device_view::element` method to access an individual row element. Like any other
 column, do not call `element()` on a row that is null.
 
 ```c++
-   cudf::column_device_view d_strings;
+   cudf::strings_column_view scv;
+   auto d_strings = cudf::column_device_view::create(scv.parent(), stream);
    ...
    if( d_strings.is_valid(row_index) ) {
       string_view d_str = d_strings.element<string_view>(row_index);
@@ -1370,27 +1416,27 @@ column, do not call `element()` on a row that is null.
    }
 ```
 
-A null string is not the same as an empty string. Use the `string_scalar` class if you need an
+A null string is not the same as an empty string. Use the `cudf::string_scalar` class if you need an
 instance of a class object to represent a null string.
 
-The `string_view` contains comparison operators `<,>,==,<=,>=` that can be used in many cudf
-functions like `sort` without string-specific code. The data for a `string_view` instance is
+The `cudf::string_view` contains comparison operators `<,>,==,<=,>=` that can be used in many cudf
+functions like `sort` without string-specific code. The data for a `cudf::string_view` instance is
 required to be [UTF-8](#utf-8) and all operators and methods expect this encoding. Unless documented
 otherwise, position and length parameters are specified in characters and not bytes. The class also
-includes a `string_view::const_iterator` which can be used to navigate through individual characters
+includes a `cudf::string_view::const_iterator` which can be used to navigate through individual characters
 within the string.
 
-`cudf::type_dispatcher` dispatches to the `string_view` data type when invoked on a `STRING` column.
+`cudf::type_dispatcher` dispatches to the `cudf::string_view` data type when invoked on a `STRING` column.
 
 #### UTF-8
 
 The libcudf strings column only supports UTF-8 encoding for strings data.
 [UTF-8](https://en.wikipedia.org/wiki/UTF-8) is a variable-length character encoding wherein each
 character can be 1-4 bytes. This means the length of a string is not the same as its size in bytes.
-For this reason, it is recommended to use the `string_view` class to access these characters for
+For this reason, it is recommended to use the `cudf::string_view` class to access these characters for
 most operations.
 
-The `string_view.cuh` header also includes some utility methods for reading and writing
+The `cudf/strings/detail/utf8.hpp` header also includes some utility methods for reading and writing
 (`to_char_utf8/from_char_utf8`) individual UTF-8 characters to/from byte arrays.
 
 ### cudf::lists_column_view and cudf::lists_view

From 38d988bceec620317fb9c267e6dd23c569ffbbf5 Mon Sep 17 00:00:00 2001
From: Paul Taylor <178183+trxcllnt@users.noreply.github.com>
Date: Mon, 13 May 2024 13:20:02 -0700
Subject: [PATCH 199/842] add --rm and --name to devcontainer run args (#15572)

* Remove the devcontainer when the VSCode window closes
* Adds a descriptive name to the running container:
  ```shell
  $ docker ps -a
  CONTAINER ID   IMAGE         ...  NAMES
  0dbb364fe544   vsc-cudf-...  ...  rapids-cudf-24.06-cuda12.2-conda

  $ docker rm -f rapids-cudf-24.06-cuda12.2-conda
  ```

Authors:
  - Paul Taylor (https://github.com/trxcllnt)

Approvers:
  - Ray Douglass (https://github.com/raydouglass)
  - Bradley Dice (https://github.com/bdice)
  - Karthikeyan (https://github.com/karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/15572
---
 .devcontainer/cuda11.8-conda/devcontainer.json | 5 +++++
 .devcontainer/cuda11.8-pip/devcontainer.json   | 5 +++++
 .devcontainer/cuda12.2-conda/devcontainer.json | 5 +++++
 .devcontainer/cuda12.2-pip/devcontainer.json   | 5 +++++
 ci/release/update-version.sh                   | 1 +
 cpp/scripts/run-cmake-format.sh                | 4 ++--
 6 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json
index 9999eebdc97..944a73ecc98 100644
--- a/.devcontainer/cuda11.8-conda/devcontainer.json
+++ b/.devcontainer/cuda11.8-conda/devcontainer.json
@@ -8,6 +8,11 @@
       "BASE": "rapidsai/devcontainers:24.06-cpp-cuda11.8-mambaforge-ubuntu22.04"
     }
   },
+  "runArgs": [
+    "--rm",
+    "--name",
+    "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda11.8-conda"
+  ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {}
diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json
index 90471e0b750..8b802333bda 100644
--- a/.devcontainer/cuda11.8-pip/devcontainer.json
+++ b/.devcontainer/cuda11.8-pip/devcontainer.json
@@ -8,6 +8,11 @@
       "BASE": "rapidsai/devcontainers:24.06-cpp-cuda11.8-ubuntu22.04"
     }
   },
+  "runArgs": [
+    "--rm",
+    "--name",
+    "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda11.8-pip"
+  ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {}
diff --git a/.devcontainer/cuda12.2-conda/devcontainer.json b/.devcontainer/cuda12.2-conda/devcontainer.json
index 5a61d26e1f5..886b07025cc 100644
--- a/.devcontainer/cuda12.2-conda/devcontainer.json
+++ b/.devcontainer/cuda12.2-conda/devcontainer.json
@@ -8,6 +8,11 @@
       "BASE": "rapidsai/devcontainers:24.06-cpp-mambaforge-ubuntu22.04"
     }
   },
+  "runArgs": [
+    "--rm",
+    "--name",
+    "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda12.2-conda"
+  ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {}
diff --git a/.devcontainer/cuda12.2-pip/devcontainer.json b/.devcontainer/cuda12.2-pip/devcontainer.json
index 29817cdadc3..86df56ada19 100644
--- a/.devcontainer/cuda12.2-pip/devcontainer.json
+++ b/.devcontainer/cuda12.2-pip/devcontainer.json
@@ -8,6 +8,11 @@
       "BASE": "rapidsai/devcontainers:24.06-cpp-cuda12.2-ubuntu22.04"
     }
   },
+  "runArgs": [
+    "--rm",
+    "--name",
+    "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda12.2-pip"
+  ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {}
diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index 99f9c698217..beeb130f0f1 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -88,4 +88,5 @@ sed_runner "s/cudf-.*-SNAPSHOT/cudf-${NEXT_FULL_JAVA_TAG}/g" java/ci/README.md
 find .devcontainer/ -type f -name devcontainer.json -print0 | while IFS= read -r -d '' filename; do
     sed_runner "s@rapidsai/devcontainers:[0-9.]*@rapidsai/devcontainers:${NEXT_SHORT_TAG}@g" "${filename}"
     sed_runner "s@rapidsai/devcontainers/features/rapids-build-utils:[0-9.]*@rapidsai/devcontainers/features/rapids-build-utils:${NEXT_SHORT_TAG_PEP440}@" "${filename}"
+    sed_runner "s@rapids-\${localWorkspaceFolderBasename}-[0-9.]*@rapids-\${localWorkspaceFolderBasename}-${NEXT_SHORT_TAG}@g" "${filename}"
 done
diff --git a/cpp/scripts/run-cmake-format.sh b/cpp/scripts/run-cmake-format.sh
index f3e21779aa5..603880954a6 100755
--- a/cpp/scripts/run-cmake-format.sh
+++ b/cpp/scripts/run-cmake-format.sh
@@ -1,6 +1,5 @@
 #!/bin/bash
-
-# Copyright (c) 2021-2022, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
 # This script is a wrapper for cmakelang that may be used with pre-commit. The
 # wrapping is necessary because RAPIDS libraries split configuration for
@@ -45,6 +44,7 @@ fi
 
 DEFAULT_FORMAT_FILE_LOCATIONS=(
   "${CUDF_BUILD_DIR:-${HOME}}/_deps/rapids-cmake-src/cmake-format-rapids-cmake.json"
+  "${CUDF_BUILD_DIR:-cpp/build}/latest/_deps/rapids-cmake-src/cmake-format-rapids-cmake.json"
   "cpp/libcudf_kafka/build/_deps/rapids-cmake-src/cmake-format-rapids-cmake.json"
 )
 

From 3a33d51bb8bf95542e27d63ff007989802615a68 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 13 May 2024 14:26:22 -1000
Subject: [PATCH 200/842] Disable pandas 2.x clipboard tests in cudf.pandas
 tests (#15462)

Clipboard testing currently `ERROR` since a `pytest-qt` fixture isn't found.

Even when this fixture is installed these tests seems to consistent crash pytest workers (I suspect since there is some method patching in `test_clipboard.py`. I don't think `cudf.pandas` should realistically work with clipboard functionality so just skipping these tests instead

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Ray Douglass (https://github.com/raydouglass)

URL: https://github.com/rapidsai/cudf/pull/15462
---
 python/cudf/cudf/pandas/scripts/run-pandas-tests.sh | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
index af7fa72d44e..6eb28104120 100755
--- a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
+++ b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
@@ -22,7 +22,9 @@ set -euo pipefail
 # of Pandas installed.
 PANDAS_VERSION=$(python -c "import pandas; print(pandas.__version__)")
 
-PYTEST_IGNORES="--ignore=tests/io/parser/common/test_read_errors.py"
+# tests/io/test_clipboard.py::TestClipboard crashes pytest workers (possibly due to fixture patching clipboard functionality)
+PYTEST_IGNORES="--ignore=tests/io/parser/common/test_read_errors.py \
+--ignore=tests/io/test_clipboard.py"
 
 mkdir -p pandas-testing
 cd pandas-testing

From 0f6ce63431cff85a278eafc555e74ee0e101f6da Mon Sep 17 00:00:00 2001
From: Liangcai Li <firestarmanllc@gmail.com>
Date: Tue, 14 May 2024 09:28:44 +0800
Subject: [PATCH 201/842] Add JNI bindings for zstd compression of NVCOMP.
 (#15729)

Authors:
  - Liangcai Li (https://github.com/firestarman)

Approvers:
  - Jason Lowe (https://github.com/jlowe)

URL: https://github.com/rapidsai/cudf/pull/15729
---
 .../rapids/cudf/nvcomp/BatchedCompressor.java | 335 ++++++++++++++++++
 .../cudf/nvcomp/BatchedDecompressor.java      | 220 ++++++++++++
 .../cudf/nvcomp/BatchedLZ4Compressor.java     | 310 +---------------
 .../cudf/nvcomp/BatchedLZ4Decompressor.java   | 182 +---------
 .../cudf/nvcomp/BatchedZstdCompressor.java    |  45 +++
 .../cudf/nvcomp/BatchedZstdDecompressor.java  |  37 ++
 .../java/ai/rapids/cudf/nvcomp/NvcompJni.java |  96 ++++-
 java/src/main/native/src/NvcompJni.cpp        | 156 ++++++++
 .../ai/rapids/cudf/nvcomp/NvcompTest.java     |  27 +-
 9 files changed, 943 insertions(+), 465 deletions(-)
 create mode 100644 java/src/main/java/ai/rapids/cudf/nvcomp/BatchedCompressor.java
 create mode 100644 java/src/main/java/ai/rapids/cudf/nvcomp/BatchedDecompressor.java
 create mode 100644 java/src/main/java/ai/rapids/cudf/nvcomp/BatchedZstdCompressor.java
 create mode 100644 java/src/main/java/ai/rapids/cudf/nvcomp/BatchedZstdDecompressor.java

diff --git a/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedCompressor.java b/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedCompressor.java
new file mode 100644
index 00000000000..72dfcdb3cb5
--- /dev/null
+++ b/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedCompressor.java
@@ -0,0 +1,335 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package ai.rapids.cudf.nvcomp;
+
+import ai.rapids.cudf.BaseDeviceMemoryBuffer;
+import ai.rapids.cudf.CloseableArray;
+import ai.rapids.cudf.Cuda;
+import ai.rapids.cudf.DeviceMemoryBuffer;
+import ai.rapids.cudf.HostMemoryBuffer;
+import ai.rapids.cudf.MemoryBuffer;
+import ai.rapids.cudf.NvtxColor;
+import ai.rapids.cudf.NvtxRange;
+
+/** Multi-buffer compressor */
+public abstract class BatchedCompressor {
+
+  static final long MAX_CHUNK_SIZE = 16777216;  // 16MiB in bytes
+  // each chunk has a 64-bit integer value as metadata containing the compressed size
+  static final long METADATA_BYTES_PER_CHUNK = 8;
+
+  private final long chunkSize;
+  private final long maxIntermediateBufferSize;
+  private final long maxOutputChunkSize;
+
+  /**
+   * Construct a batched compressor instance
+   * @param chunkSize maximum amount of uncompressed data to compress as a single chunk.
+   *                  Inputs larger than this will be compressed in multiple chunks.
+   * @param maxIntermediateBufferSize desired maximum size of intermediate device
+   *                                  buffers used during compression.
+   */
+  public BatchedCompressor(long chunkSize, long maxOutputChunkSize,
+      long maxIntermediateBufferSize) {
+    validateChunkSize(chunkSize);
+    assert maxOutputChunkSize < Integer.MAX_VALUE;
+    this.chunkSize = chunkSize;
+    this.maxOutputChunkSize = maxOutputChunkSize;
+    this.maxIntermediateBufferSize = Math.max(maxOutputChunkSize, maxIntermediateBufferSize);
+  }
+
+  /**
+   * Compress a batch of buffers. The input buffers will be closed.
+   * @param origInputs buffers to compress
+   * @param stream CUDA stream to use
+   * @return compressed buffers corresponding to the input buffers
+   */
+  public DeviceMemoryBuffer[] compress(BaseDeviceMemoryBuffer[] origInputs, Cuda.Stream stream) {
+    try (CloseableArray<BaseDeviceMemoryBuffer> inputs = CloseableArray.wrap(origInputs)) {
+      if (chunkSize <= 0) {
+        throw new IllegalArgumentException("Illegal chunk size: " + chunkSize);
+      }
+      final int numInputs = inputs.size();
+      if (numInputs == 0) {
+        return new DeviceMemoryBuffer[0];
+      }
+
+      // Each buffer is broken up into chunkSize chunks for compression.  Calculate how many
+      // chunks are needed for each input buffer.
+      int[] chunksPerInput = new int[numInputs];
+      int numChunks = 0;
+      for (int i = 0; i < numInputs; i++) {
+        BaseDeviceMemoryBuffer buffer = inputs.get(i);
+        int numBufferChunks = getNumChunksInBuffer(buffer);
+        chunksPerInput[i] = numBufferChunks;
+        numChunks += numBufferChunks;
+      }
+
+      // Allocate buffers for each chunk and generate parallel lists of chunk source addresses,
+      // chunk destination addresses, and sizes.
+      try (CloseableArray<DeviceMemoryBuffer> compressedBuffers =
+              allocCompressedBuffers(numChunks, stream);
+           DeviceMemoryBuffer compressedChunkSizes =
+              DeviceMemoryBuffer.allocate(numChunks * 8L, stream)) {
+        long[] inputChunkAddrs = new long[numChunks];
+        long[] inputChunkSizes = new long[numChunks];
+        long[] outputChunkAddrs = new long[numChunks];
+        buildAddrsAndSizes(inputs, inputChunkAddrs, inputChunkSizes, compressedBuffers,
+            outputChunkAddrs);
+
+        final long tempBufferSize = batchedCompressGetTempSize(numChunks, chunkSize);
+        try (DeviceMemoryBuffer addrsAndSizes = putAddrsAndSizesOnDevice(inputChunkAddrs,
+                inputChunkSizes, outputChunkAddrs, stream);
+             DeviceMemoryBuffer tempBuffer =
+                DeviceMemoryBuffer.allocate(tempBufferSize, stream)) {
+          final long devOutputAddrsPtr = addrsAndSizes.getAddress() + numChunks * 8L;
+          final long devInputSizesPtr = devOutputAddrsPtr + numChunks * 8L;
+          batchedCompressAsync(addrsAndSizes.getAddress(), devInputSizesPtr, chunkSize,
+              numChunks, tempBuffer.getAddress(), tempBufferSize, devOutputAddrsPtr,
+              compressedChunkSizes.getAddress(), stream.getStream());
+        }
+
+        // Synchronously copy the resulting compressed sizes per chunk.
+        long[] outputChunkSizes = getOutputChunkSizes(compressedChunkSizes, stream);
+
+        // inputs are no longer needed at this point, so free them early
+        inputs.close();
+
+        // Combine compressed chunks into output buffers corresponding to each original input
+        return stitchOutput(chunksPerInput, compressedChunkSizes, outputChunkAddrs,
+            outputChunkSizes, stream);
+      }
+    }
+  }
+
+  static void validateChunkSize(long chunkSize) {
+    if (chunkSize <= 0  || chunkSize > MAX_CHUNK_SIZE) {
+      throw new IllegalArgumentException("Invalid chunk size: " + chunkSize +
+          " Max chunk size is: " + MAX_CHUNK_SIZE + " bytes");
+    }
+  }
+
+  private static long ceilingDivide(long x, long y) {
+    return (x + y - 1) / y;
+  }
+
+  private int getNumChunksInBuffer(MemoryBuffer buffer) {
+    return (int) ceilingDivide(buffer.getLength(), chunkSize);
+  }
+
+  private CloseableArray<DeviceMemoryBuffer> allocCompressedBuffers(long numChunks,
+      Cuda.Stream stream) {
+    final long chunksPerBuffer = maxIntermediateBufferSize / maxOutputChunkSize;
+    final long numBuffers = ceilingDivide(numChunks, chunksPerBuffer);
+    if (numBuffers > Integer.MAX_VALUE) {
+      throw new IllegalStateException("Too many chunks");
+    }
+    try (NvtxRange range = new NvtxRange("allocCompressedBuffers", NvtxColor.YELLOW)) {
+      CloseableArray<DeviceMemoryBuffer> buffers = CloseableArray.wrap(
+          new DeviceMemoryBuffer[(int) numBuffers]);
+      try {
+        // allocate all of the max-chunks intermediate compressed buffers
+        for (int i = 0; i < buffers.size() - 1; ++i) {
+          buffers.set(i,
+              DeviceMemoryBuffer.allocate(chunksPerBuffer * maxOutputChunkSize, stream));
+        }
+        // allocate the tail intermediate compressed buffer that may be smaller than the others
+        buffers.set(buffers.size() - 1, DeviceMemoryBuffer.allocate(
+            (numChunks - chunksPerBuffer * (buffers.size() - 1)) * maxOutputChunkSize, stream));
+        return buffers;
+      } catch (Exception e) {
+        buffers.close(e);
+        throw e;
+      }
+    }
+  }
+
+  // Fill in the inputChunkAddrs, inputChunkSizes, and outputChunkAddrs arrays to point
+  // into the chunks in the input and output buffers.
+  private void buildAddrsAndSizes(CloseableArray<BaseDeviceMemoryBuffer> inputs,
+      long[] inputChunkAddrs, long[] inputChunkSizes,
+      CloseableArray<DeviceMemoryBuffer> compressedBuffers, long[] outputChunkAddrs) {
+    // setup the input addresses and sizes
+    int chunkIdx = 0;
+    for (BaseDeviceMemoryBuffer input : inputs.getArray()) {
+      final int numChunksInBuffer = getNumChunksInBuffer(input);
+      for (int i = 0; i < numChunksInBuffer; i++) {
+        inputChunkAddrs[chunkIdx] = input.getAddress() + i * chunkSize;
+        inputChunkSizes[chunkIdx] = (i != numChunksInBuffer - 1) ? chunkSize
+            : (input.getLength() - (long) i * chunkSize);
+        ++chunkIdx;
+      }
+    }
+    assert chunkIdx == inputChunkAddrs.length;
+    assert chunkIdx == inputChunkSizes.length;
+
+    // setup output addresses
+    chunkIdx = 0;
+    for (DeviceMemoryBuffer buffer : compressedBuffers.getArray()) {
+      assert buffer.getLength() % maxOutputChunkSize == 0;
+      long numChunksInBuffer = buffer.getLength() / maxOutputChunkSize;
+      long baseAddr = buffer.getAddress();
+      for (int i = 0; i < numChunksInBuffer; i++) {
+        outputChunkAddrs[chunkIdx++] = baseAddr + i * maxOutputChunkSize;
+      }
+    }
+    assert chunkIdx == outputChunkAddrs.length;
+  }
+
+  // Write input addresses, output addresses and sizes contiguously into a DeviceMemoryBuffer.
+  private DeviceMemoryBuffer putAddrsAndSizesOnDevice(long[] inputAddrs, long[] inputSizes,
+        long[] outputAddrs, Cuda.Stream stream) {
+    final long totalSize = inputAddrs.length * 8L * 3; // space for input, output, and size arrays
+    final long outputAddrsOffset = inputAddrs.length * 8L;
+    final long sizesOffset = outputAddrsOffset + inputAddrs.length * 8L;
+    try (NvtxRange range = new NvtxRange("putAddrsAndSizesOnDevice", NvtxColor.YELLOW)) {
+      try (HostMemoryBuffer hostbuf = HostMemoryBuffer.allocate(totalSize);
+           DeviceMemoryBuffer result = DeviceMemoryBuffer.allocate(totalSize)) {
+        hostbuf.setLongs(0, inputAddrs, 0, inputAddrs.length);
+        hostbuf.setLongs(outputAddrsOffset, outputAddrs, 0, outputAddrs.length);
+        for (int i = 0; i < inputSizes.length; i++) {
+          hostbuf.setLong(sizesOffset + i * 8L, inputSizes[i]);
+        }
+        result.copyFromHostBuffer(hostbuf, stream);
+        result.incRefCount();
+        return result;
+      }
+    }
+  }
+
+  // Synchronously copy the resulting compressed sizes from device memory to host memory.
+  private long[] getOutputChunkSizes(BaseDeviceMemoryBuffer devChunkSizes, Cuda.Stream stream) {
+    try (NvtxRange range = new NvtxRange("getOutputChunkSizes", NvtxColor.YELLOW)) {
+      try (HostMemoryBuffer hostbuf = HostMemoryBuffer.allocate(devChunkSizes.getLength())) {
+        hostbuf.copyFromDeviceBuffer(devChunkSizes, stream);
+        int numChunks = (int) (devChunkSizes.getLength() / 8);
+        long[] result = new long[numChunks];
+        for (int i = 0; i < numChunks; i++) {
+          long size = hostbuf.getLong(i * 8L);
+          assert size < Integer.MAX_VALUE : "output size is too big";
+          result[i] = size;
+        }
+        return result;
+      }
+    }
+  }
+
+  // Stitch together the individual chunks into the result buffers.
+  // Each result buffer has metadata at the beginning, followed by compressed chunks.
+  // This is done by building up parallel lists of source addr, dest addr and size and
+  // then calling multiBufferCopyAsync()
+  private DeviceMemoryBuffer[] stitchOutput(int[] chunksPerInput,
+        DeviceMemoryBuffer compressedChunkSizes, long[] outputChunkAddrs,
+        long[] outputChunkSizes, Cuda.Stream stream) {
+    try (NvtxRange range = new NvtxRange("stitchOutput", NvtxColor.YELLOW)) {
+      final int numOutputs = chunksPerInput.length;
+      final long chunkSizesAddr = compressedChunkSizes.getAddress();
+      long[] outputBufferSizes = calcOutputBufferSizes(chunksPerInput, outputChunkSizes);
+      try (CloseableArray<DeviceMemoryBuffer> outputs =
+              CloseableArray.wrap(new DeviceMemoryBuffer[numOutputs])) {
+        // Each chunk needs to be copied, and each output needs a copy of the
+        // compressed chunk size vector representing the metadata.
+        final int totalBuffersToCopy = numOutputs + outputChunkAddrs.length;
+        long[] destAddrs = new long[totalBuffersToCopy];
+        long[] srcAddrs = new long[totalBuffersToCopy];
+        long[] sizes = new long[totalBuffersToCopy];
+        int copyBufferIdx = 0;
+        int chunkIdx = 0;
+        for (int outputIdx = 0; outputIdx < numOutputs; outputIdx++) {
+          DeviceMemoryBuffer outputBuffer =
+              DeviceMemoryBuffer.allocate(outputBufferSizes[outputIdx]);
+          outputs.set(outputIdx, outputBuffer);
+          final long outputBufferAddr = outputBuffer.getAddress();
+          final long numChunks = chunksPerInput[outputIdx];
+          final long metadataSize = numChunks * METADATA_BYTES_PER_CHUNK;
+
+          // setup a copy of the metadata at the front of the output buffer
+          srcAddrs[copyBufferIdx] = chunkSizesAddr + chunkIdx * 8;
+          destAddrs[copyBufferIdx] = outputBufferAddr;
+          sizes[copyBufferIdx] = metadataSize;
+          ++copyBufferIdx;
+
+          // setup copies of the compressed chunks for this output buffer
+          long nextChunkAddr = outputBufferAddr + metadataSize;
+          for (int i = 0; i < numChunks; ++i) {
+            srcAddrs[copyBufferIdx] = outputChunkAddrs[chunkIdx];
+            destAddrs[copyBufferIdx] = nextChunkAddr;
+            final long chunkSize = outputChunkSizes[chunkIdx];
+            sizes[copyBufferIdx] = chunkSize;
+            copyBufferIdx++;
+            chunkIdx++;
+            nextChunkAddr += chunkSize;
+          }
+        }
+        assert copyBufferIdx == totalBuffersToCopy;
+        assert chunkIdx == outputChunkAddrs.length;
+        assert chunkIdx == outputChunkSizes.length;
+
+        Cuda.multiBufferCopyAsync(destAddrs, srcAddrs, sizes, stream);
+        return outputs.release();
+      }
+    }
+  }
+
+  // Calculate the sizes for each output buffer (metadata plus size of compressed chunks)
+  private long[] calcOutputBufferSizes(int[] chunksPerInput, long[] outputChunkSizes) {
+    long[] sizes = new long[chunksPerInput.length];
+    int chunkIdx = 0;
+    for (int i = 0; i < sizes.length; i++) {
+      final int chunksInBuffer = chunksPerInput[i];
+      final int chunkEndIdx = chunkIdx + chunksInBuffer;
+      // metadata stored in front of compressed data
+      long bufferSize = METADATA_BYTES_PER_CHUNK * chunksInBuffer;
+      // add in the compressed chunk sizes to get the total size
+      while (chunkIdx < chunkEndIdx) {
+        bufferSize += outputChunkSizes[chunkIdx++];
+      }
+      sizes[i] = bufferSize;
+    }
+    assert chunkIdx == outputChunkSizes.length;
+    return sizes;
+  }
+
+  /**
+   * Get the temporary workspace size required to perform compression of an entire batch.
+   * @param batchSize number of chunks in the batch
+   * @param maxChunkSize maximum size of an uncompressed chunk in bytes
+   * @return The size of required temporary workspace in bytes to compress the batch.
+   */
+  protected abstract long batchedCompressGetTempSize(long batchSize, long maxChunkSize);
+
+   /**
+   * Asynchronously compress a batch of buffers. Note that compressedSizesOutPtr must
+   * point to pinned memory for this operation to be asynchronous.
+   * @param devInPtrs device address of uncompressed buffer addresses vector
+   * @param devInSizes device address of uncompressed buffer sizes vector
+   * @param chunkSize maximum size of an uncompressed chunk in bytes
+   * @param batchSize number of chunks in the batch
+   * @param tempPtr device address of the temporary workspace buffer
+   * @param tempSize size of the temporary workspace buffer in bytes
+   * @param devOutPtrs device address of output buffer addresses vector
+   * @param compressedSizesOutPtr device address where to write the sizes of the
+   *                              compressed data written to the corresponding
+   *                              output buffers. Must point to a buffer with
+   *                              at least 8 bytes of memory per output buffer
+   *                              in the batch.
+   * @param stream CUDA stream to use
+   */
+  protected abstract void batchedCompressAsync(long devInPtrs, long devInSizes, long chunkSize,
+      long batchSize, long tempPtr, long tempSize, long devOutPtrs, long compressedSizesOutPtr,
+      long stream);
+}
diff --git a/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedDecompressor.java b/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedDecompressor.java
new file mode 100644
index 00000000000..5543d2dcb64
--- /dev/null
+++ b/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedDecompressor.java
@@ -0,0 +1,220 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package ai.rapids.cudf.nvcomp;
+
+import ai.rapids.cudf.CloseableArray;
+import ai.rapids.cudf.Cuda;
+import ai.rapids.cudf.BaseDeviceMemoryBuffer;
+import ai.rapids.cudf.DeviceMemoryBuffer;
+import ai.rapids.cudf.HostMemoryBuffer;
+import ai.rapids.cudf.NvtxColor;
+import ai.rapids.cudf.NvtxRange;
+
+import java.util.Arrays;
+
+/** Decompressor that operates on multiple input buffers in a batch */
+public abstract class BatchedDecompressor {
+
+  private final long chunkSize;
+
+  /**
+   * Construct a batched decompressor instance
+   * @param chunkSize maximum uncompressed block size, must match value used
+   *                  during compression
+   */
+  public BatchedDecompressor(long chunkSize) {
+    this.chunkSize = chunkSize;
+  }
+
+  /**
+   * Asynchronously decompress a batch of buffers
+   * @param origInputs buffers to decompress, will be closed by this operation
+   * @param outputs output buffers that will contain the decompressed results, each must
+   *                be sized to the exact decompressed size of the corresponding input
+   * @param stream CUDA stream to use
+   */
+  public void decompressAsync(BaseDeviceMemoryBuffer[] origInputs,
+      BaseDeviceMemoryBuffer[] outputs, Cuda.Stream stream) {
+    try (CloseableArray<BaseDeviceMemoryBuffer> inputs =
+            CloseableArray.wrap(Arrays.copyOf(origInputs, origInputs.length))) {
+      BatchedCompressor.validateChunkSize(chunkSize);
+      if (origInputs.length != outputs.length) {
+        throw new IllegalArgumentException("number of inputs must match number of outputs");
+      }
+      final int numInputs = inputs.size();
+      if (numInputs == 0) {
+        return;
+      }
+
+      int[] chunksPerInput = new int[numInputs];
+      long totalChunks = 0;
+      for (int i = 0; i < numInputs; i++) {
+        // use output size to determine number of chunks in the input, as the output buffer
+        // must be exactly sized to the uncompressed data
+        BaseDeviceMemoryBuffer buffer = outputs[i];
+        int numBufferChunks = getNumChunksInBuffer(chunkSize, buffer);
+        chunksPerInput[i] = numBufferChunks;
+        totalChunks += numBufferChunks;
+      }
+
+      final long tempBufferSize = batchedDecompressGetTempSize(totalChunks, chunkSize);
+      try (DeviceMemoryBuffer devAddrsSizes = buildAddrsSizesBuffer(chunkSize, totalChunks,
+              inputs.getArray(), chunksPerInput, outputs, stream);
+           DeviceMemoryBuffer devTemp = DeviceMemoryBuffer.allocate(tempBufferSize)) {
+        // buffer containing addresses and sizes contains four vectors of longs in this order:
+        // - compressed chunk input addresses
+        // - chunk output buffer addresses
+        // - compressed chunk sizes
+        // - uncompressed chunk sizes
+        final long inputAddrsPtr = devAddrsSizes.getAddress();
+        final long outputAddrsPtr = inputAddrsPtr + totalChunks * 8;
+        final long inputSizesPtr = outputAddrsPtr + totalChunks * 8;
+        final long outputSizesPtr = inputSizesPtr + totalChunks * 8;
+        batchedDecompressAsync(inputAddrsPtr, inputSizesPtr, outputSizesPtr, totalChunks,
+            devTemp.getAddress(), devTemp.getLength(), outputAddrsPtr, stream.getStream());
+      }
+    }
+  }
+
+  private static int getNumChunksInBuffer(long chunkSize, BaseDeviceMemoryBuffer buffer) {
+    return (int) ((buffer.getLength() + chunkSize - 1) / chunkSize);
+  }
+
+  /**
+   * Build a device memory buffer containing four vectors of longs in the following order:
+   * <ul>
+   *   <li>compressed chunk input addresses</li>
+   *   <li>uncompressed chunk output addresses</li>
+   *   <li>compressed chunk sizes</li>
+   *   <li>uncompressed chunk sizes</li>
+   * </ul>
+   * Each vector contains as many longs as the number of chunks being decompressed
+   * @param chunkSize maximum uncompressed size of a chunk
+   * @param totalChunks total number of chunks to be decompressed
+   * @param inputs device buffers containing the compressed data
+   * @param chunksPerInput number of compressed chunks per input buffer
+   * @param outputs device buffers that will hold the uncompressed output
+   * @param stream CUDA stream to use
+   * @return device buffer containing address and size vectors
+   */
+  private static DeviceMemoryBuffer buildAddrsSizesBuffer(long chunkSize, long totalChunks,
+      BaseDeviceMemoryBuffer[] inputs, int[] chunksPerInput, BaseDeviceMemoryBuffer[] outputs,
+      Cuda.Stream stream) {
+    final long totalBufferSize = totalChunks * 8L * 4L;
+    try (NvtxRange range = new NvtxRange("buildAddrSizesBuffer", NvtxColor.YELLOW)) {
+      try (HostMemoryBuffer metadata = fetchMetadata(totalChunks, inputs, chunksPerInput, stream);
+           HostMemoryBuffer hostAddrsSizes = HostMemoryBuffer.allocate(totalBufferSize);
+           DeviceMemoryBuffer devAddrsSizes = DeviceMemoryBuffer.allocate(totalBufferSize)) {
+        // Build four long vectors in the AddrsSizes buffer:
+        // - compressed input address (one per chunk)
+        // - uncompressed output address (one per chunk)
+        // - compressed input size (one per chunk)
+        // - uncompressed input size (one per chunk)
+        final long srcAddrsOffset = 0;
+        final long destAddrsOffset = srcAddrsOffset + totalChunks * 8L;
+        final long srcSizesOffset = destAddrsOffset + totalChunks * 8L;
+        final long destSizesOffset = srcSizesOffset + totalChunks * 8L;
+        long chunkIdx = 0;
+        for (int inputIdx = 0; inputIdx < inputs.length; inputIdx++) {
+          final BaseDeviceMemoryBuffer input = inputs[inputIdx];
+          final BaseDeviceMemoryBuffer output = outputs[inputIdx];
+          final int numChunksInInput = chunksPerInput[inputIdx];
+          long srcAddr = input.getAddress() +
+              BatchedCompressor.METADATA_BYTES_PER_CHUNK * numChunksInInput;
+          long destAddr = output.getAddress();
+          final long chunkIdxEnd = chunkIdx + numChunksInInput;
+          while (chunkIdx < chunkIdxEnd) {
+            final long srcChunkSize = metadata.getLong(chunkIdx * 8);
+            final long destChunkSize = (chunkIdx < chunkIdxEnd - 1) ? chunkSize
+                : output.getAddress() + output.getLength() - destAddr;
+            hostAddrsSizes.setLong(srcAddrsOffset + chunkIdx * 8, srcAddr);
+            hostAddrsSizes.setLong(destAddrsOffset + chunkIdx * 8, destAddr);
+            hostAddrsSizes.setLong(srcSizesOffset + chunkIdx * 8, srcChunkSize);
+            hostAddrsSizes.setLong(destSizesOffset + chunkIdx * 8, destChunkSize);
+            srcAddr += srcChunkSize;
+            destAddr += destChunkSize;
+            ++chunkIdx;
+          }
+        }
+        devAddrsSizes.copyFromHostBuffer(hostAddrsSizes, stream);
+        devAddrsSizes.incRefCount();
+        return devAddrsSizes;
+      }
+    }
+  }
+
+  /**
+   * Fetch the metadata at the front of each input in a single, contiguous host buffer.
+   * @param totalChunks total number of compressed chunks
+   * @param inputs buffers containing the compressed data
+   * @param chunksPerInput number of compressed chunks for the corresponding input
+   * @param stream CUDA stream to use
+   * @return host buffer containing all of the metadata
+   */
+  private static HostMemoryBuffer fetchMetadata(long totalChunks, BaseDeviceMemoryBuffer[] inputs,
+      int[] chunksPerInput, Cuda.Stream stream) {
+    try (NvtxRange range = new NvtxRange("fetchMetadata", NvtxColor.PURPLE)) {
+      // one long per chunk containing the compressed size
+      final long totalMetadataSize = totalChunks * BatchedCompressor.METADATA_BYTES_PER_CHUNK;
+      // Build corresponding vectors of destination addresses, source addresses and sizes.
+      long[] destAddrs = new long[inputs.length];
+      long[] srcAddrs = new long[inputs.length];
+      long[] sizes = new long[inputs.length];
+      try (HostMemoryBuffer hostMetadata = HostMemoryBuffer.allocate(totalMetadataSize);
+           DeviceMemoryBuffer devMetadata = DeviceMemoryBuffer.allocate(totalMetadataSize)) {
+        long destCopyAddr = devMetadata.getAddress();
+        for (int inputIdx = 0; inputIdx < inputs.length; inputIdx++) {
+          final BaseDeviceMemoryBuffer input = inputs[inputIdx];
+          final long copySize =
+              chunksPerInput[inputIdx] * BatchedCompressor.METADATA_BYTES_PER_CHUNK;
+          destAddrs[inputIdx] = destCopyAddr;
+          srcAddrs[inputIdx] = input.getAddress();
+          sizes[inputIdx] = copySize;
+          destCopyAddr += copySize;
+        }
+        Cuda.multiBufferCopyAsync(destAddrs, srcAddrs, sizes, stream);
+        hostMetadata.copyFromDeviceBuffer(devMetadata, stream);
+        hostMetadata.incRefCount();
+        return hostMetadata;
+      }
+    }
+  }
+
+  /**
+   * Computes the temporary storage size in bytes needed to decompress a compressed batch.
+   * @param numChunks number of chunks in the batch
+   * @param maxUncompressedChunkBytes maximum uncompressed size of any chunk in bytes
+   * @return number of temporary storage bytes needed to decompress the batch
+   */
+  protected abstract long batchedDecompressGetTempSize(long numChunks,
+      long maxUncompressedChunkBytes);
+
+    /**
+   * Asynchronously decompress a batch of compressed data buffers.
+   * @param devInPtrs device address of compressed input buffer addresses vector
+   * @param devInSizes device address of compressed input buffer sizes vector
+   * @param devOutSizes device address of uncompressed buffer sizes vector
+   * @param batchSize number of buffers in the batch
+   * @param tempPtr device address of the temporary decompression space
+   * @param tempSize size of the temporary decompression space in bytes
+   * @param devOutPtrs device address of uncompressed output buffer addresses vector
+   * @param stream CUDA stream to use
+   */
+  protected abstract void batchedDecompressAsync(long devInPtrs, long devInSizes,
+      long devOutSizes, long batchSize, long tempPtr, long tempSize, long devOutPtrs,
+      long stream);
+}
diff --git a/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedLZ4Compressor.java b/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedLZ4Compressor.java
index 1aa7e5e11a0..58c0e7ee169 100644
--- a/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedLZ4Compressor.java
+++ b/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedLZ4Compressor.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,305 +16,31 @@
 
 package ai.rapids.cudf.nvcomp;
 
-import ai.rapids.cudf.BaseDeviceMemoryBuffer;
-import ai.rapids.cudf.CloseableArray;
-import ai.rapids.cudf.Cuda;
-import ai.rapids.cudf.DefaultHostMemoryAllocator;
-import ai.rapids.cudf.DeviceMemoryBuffer;
-import ai.rapids.cudf.HostMemoryAllocator;
-import ai.rapids.cudf.HostMemoryBuffer;
-import ai.rapids.cudf.MemoryBuffer;
-import ai.rapids.cudf.NvtxColor;
-import ai.rapids.cudf.NvtxRange;
-
 /** Multi-buffer LZ4 compressor */
-public class BatchedLZ4Compressor {
-  private static final HostMemoryAllocator hostMemoryAllocator = DefaultHostMemoryAllocator.get();
-
-  static final long MAX_CHUNK_SIZE = 16777216;  // in bytes
-  // each chunk has a 64-bit integer value as metadata containing the compressed size
-  static final long METADATA_BYTES_PER_CHUNK = 8;
-
-  private final long chunkSize;
-  private final long targetIntermediateBufferSize;
-  private final long maxOutputChunkSize;
+public class BatchedLZ4Compressor extends BatchedCompressor {
 
   /**
    * Construct a batched LZ4 compressor instance
-   * @param chunkSize maximum amount of uncompressed data to compress as a single chunk. Inputs
-   *                  larger than this will be compressed in multiple chunks.
-   * @param targetIntermediateBufferSize desired maximum size of intermediate device buffers
-   *                                     used during compression.
-   */
-  public BatchedLZ4Compressor(long chunkSize, long targetIntermediateBufferSize) {
-    validateChunkSize(chunkSize);
-    this.chunkSize = chunkSize;
-    this.maxOutputChunkSize = NvcompJni.batchedLZ4CompressGetMaxOutputChunkSize(chunkSize);
-    assert maxOutputChunkSize < Integer.MAX_VALUE;
-    this.targetIntermediateBufferSize = Math.max(targetIntermediateBufferSize, maxOutputChunkSize);
-  }
-
-  /**
-   * Compress a batch of buffers with LZ4. The input buffers will be closed.
-   * @param origInputs buffers to compress
-   * @param stream CUDA stream to use
-   * @return compressed buffers corresponding to the input buffers
+   * @param chunkSize maximum amount of uncompressed data to compress as a single chunk.
+   *                  Inputs larger than this will be compressed in multiple chunks.
+   * @param maxIntermediateBufferSize desired maximum size of intermediate device buffers
+   *                                  used during compression.
    */
-  public DeviceMemoryBuffer[] compress(BaseDeviceMemoryBuffer[] origInputs, Cuda.Stream stream) {
-    try (CloseableArray<BaseDeviceMemoryBuffer> inputs = CloseableArray.wrap(origInputs)) {
-      if (chunkSize <= 0) {
-        throw new IllegalArgumentException("Illegal chunk size: " + chunkSize);
-      }
-      final int numInputs = inputs.size();
-      if (numInputs == 0) {
-        return new DeviceMemoryBuffer[0];
-      }
-
-      // Each buffer is broken up into chunkSize chunks for compression.  Calculate how many
-      // chunks are needed for each input buffer.
-      int[] chunksPerInput = new int[numInputs];
-      int numChunks = 0;
-      for (int i = 0; i < numInputs; i++) {
-        BaseDeviceMemoryBuffer buffer = inputs.get(i);
-        int numBufferChunks = getNumChunksInBuffer(buffer);
-        chunksPerInput[i] = numBufferChunks;
-        numChunks += numBufferChunks;
-      }
-
-      // Allocate buffers for each chunk and generate parallel lists of chunk source addresses,
-      // chunk destination addresses, and sizes.
-      try (CloseableArray<DeviceMemoryBuffer> compressedBuffers =
-               allocCompressedBuffers(numChunks, stream);
-           DeviceMemoryBuffer compressedChunkSizes =
-               DeviceMemoryBuffer.allocate(numChunks * 8L, stream)) {
-        long[] inputChunkAddrs = new long[numChunks];
-        long[] inputChunkSizes = new long[numChunks];
-        long[] outputChunkAddrs = new long[numChunks];
-        buildAddrsAndSizes(inputs, inputChunkAddrs, inputChunkSizes,
-            compressedBuffers, outputChunkAddrs);
-
-        long[] outputChunkSizes;
-        final long tempBufferSize = NvcompJni.batchedLZ4CompressGetTempSize(numChunks, chunkSize);
-        try (DeviceMemoryBuffer addrsAndSizes =
-                 putAddrsAndSizesOnDevice(inputChunkAddrs, inputChunkSizes, outputChunkAddrs, stream);
-             DeviceMemoryBuffer tempBuffer = DeviceMemoryBuffer.allocate(tempBufferSize, stream)) {
-          final long devOutputAddrsPtr = addrsAndSizes.getAddress() + numChunks * 8L;
-          final long devInputSizesPtr = devOutputAddrsPtr + numChunks * 8L;
-          NvcompJni.batchedLZ4CompressAsync(
-              addrsAndSizes.getAddress(),
-              devInputSizesPtr,
-              chunkSize,
-              numChunks,
-              tempBuffer.getAddress(),
-              tempBufferSize,
-              devOutputAddrsPtr,
-              compressedChunkSizes.getAddress(),
-              stream.getStream());
-        }
-
-        // Synchronously copy the resulting compressed sizes per chunk.
-        outputChunkSizes = getOutputChunkSizes(compressedChunkSizes, stream);
-
-        // inputs are no longer needed at this point, so free them early
-        inputs.close();
-
-        // Combine compressed chunks into output buffers corresponding to each original input
-        return stitchOutput(chunksPerInput, compressedChunkSizes, outputChunkAddrs,
-            outputChunkSizes, stream);
-      }
-    }
-  }
-
-  static void validateChunkSize(long chunkSize) {
-    if (chunkSize <= 0  || chunkSize > MAX_CHUNK_SIZE) {
-      throw new IllegalArgumentException("Invalid chunk size: " + chunkSize + " Max chunk size is: "
-          + MAX_CHUNK_SIZE + " bytes");
-    }
+  public BatchedLZ4Compressor(long chunkSize, long maxIntermediateBufferSize) {
+    super(chunkSize, NvcompJni.batchedLZ4CompressGetMaxOutputChunkSize(chunkSize),
+        maxIntermediateBufferSize);
   }
 
-  private static long ceilingDivide(long x, long y) {
-    return (x + y - 1) / y;
-  }
-
-  private int getNumChunksInBuffer(MemoryBuffer buffer) {
-    return (int) ceilingDivide(buffer.getLength(), chunkSize);
-  }
-
-  private CloseableArray<DeviceMemoryBuffer> allocCompressedBuffers(long numChunks,
-                                                                    Cuda.Stream stream) {
-    final long chunksPerBuffer = targetIntermediateBufferSize / maxOutputChunkSize;
-    final long numBuffers = ceilingDivide(numChunks, chunksPerBuffer);
-    if (numBuffers > Integer.MAX_VALUE) {
-      throw new IllegalStateException("Too many chunks");
-    }
-    try (NvtxRange range = new NvtxRange("allocCompressedBuffers", NvtxColor.YELLOW)) {
-      CloseableArray<DeviceMemoryBuffer> buffers = CloseableArray.wrap(
-          new DeviceMemoryBuffer[(int) numBuffers]);
-      try {
-        // allocate all of the max-chunks intermediate compressed buffers
-        for (int i = 0; i < buffers.size() - 1; ++i) {
-          buffers.set(i, DeviceMemoryBuffer.allocate(chunksPerBuffer * maxOutputChunkSize, stream));
-        }
-        // allocate the tail intermediate compressed buffer that may be smaller than the others
-        buffers.set(buffers.size() - 1, DeviceMemoryBuffer.allocate(
-            (numChunks - chunksPerBuffer * (buffers.size() - 1)) * maxOutputChunkSize, stream));
-        return buffers;
-      } catch (Exception e) {
-        buffers.close(e);
-        throw e;
-      }
-    }
-  }
-
-  // Fill in the inputChunkAddrs, inputChunkSizes, and outputChunkAddrs arrays to point
-  // into the chunks in the input and output buffers.
-  private void buildAddrsAndSizes(CloseableArray<BaseDeviceMemoryBuffer> inputs,
-                                  long[] inputChunkAddrs,
-                                  long[] inputChunkSizes,
-                                  CloseableArray<DeviceMemoryBuffer> compressedBuffers,
-                                  long[] outputChunkAddrs) {
-    // setup the input addresses and sizes
-    int chunkIdx = 0;
-    for (BaseDeviceMemoryBuffer input : inputs.getArray()) {
-      final int numChunksInBuffer = getNumChunksInBuffer(input);
-      for (int i = 0; i < numChunksInBuffer; i++) {
-        inputChunkAddrs[chunkIdx] = input.getAddress() + i * chunkSize;
-        inputChunkSizes[chunkIdx] = (i != numChunksInBuffer - 1) ? chunkSize
-            : (input.getLength() - (long) i * chunkSize);
-        ++chunkIdx;
-      }
-    }
-    assert chunkIdx == inputChunkAddrs.length;
-    assert chunkIdx == inputChunkSizes.length;
-
-    // setup output addresses
-    chunkIdx = 0;
-    for (DeviceMemoryBuffer buffer : compressedBuffers.getArray()) {
-      assert buffer.getLength() % maxOutputChunkSize == 0;
-      long numChunksInBuffer = buffer.getLength() / maxOutputChunkSize;
-      long baseAddr = buffer.getAddress();
-      for (int i = 0; i < numChunksInBuffer; i++) {
-        outputChunkAddrs[chunkIdx++] = baseAddr + i * maxOutputChunkSize;
-      }
-    }
-    assert chunkIdx == outputChunkAddrs.length;
-  }
-
-  // Write input addresses, output addresses and sizes contiguously into a DeviceMemoryBuffer.
-  private DeviceMemoryBuffer putAddrsAndSizesOnDevice(long[] inputAddrs,
-                                                      long[] inputSizes,
-                                                      long[] outputAddrs,
-                                                      Cuda.Stream stream) {
-    final long totalSize = inputAddrs.length * 8L * 3; // space for input, output, and size arrays
-    final long outputAddrsOffset = inputAddrs.length * 8L;
-    final long sizesOffset = outputAddrsOffset + inputAddrs.length * 8L;
-    try (NvtxRange range = new NvtxRange("putAddrsAndSizesOnDevice", NvtxColor.YELLOW)) {
-      try (HostMemoryBuffer hostbuf = hostMemoryAllocator.allocate(totalSize);
-           DeviceMemoryBuffer result = DeviceMemoryBuffer.allocate(totalSize)) {
-        hostbuf.setLongs(0, inputAddrs, 0, inputAddrs.length);
-        hostbuf.setLongs(outputAddrsOffset, outputAddrs, 0, outputAddrs.length);
-        for (int i = 0; i < inputSizes.length; i++) {
-          hostbuf.setLong(sizesOffset + i * 8L, inputSizes[i]);
-        }
-        result.copyFromHostBuffer(hostbuf, stream);
-        result.incRefCount();
-        return result;
-      }
-    }
-  }
-
-  // Synchronously copy the resulting compressed sizes from device memory to host memory.
-  private long[] getOutputChunkSizes(BaseDeviceMemoryBuffer devChunkSizes, Cuda.Stream stream) {
-    try (NvtxRange range = new NvtxRange("getOutputChunkSizes", NvtxColor.YELLOW)) {
-      try (HostMemoryBuffer hostbuf = hostMemoryAllocator.allocate(devChunkSizes.getLength())) {
-        hostbuf.copyFromDeviceBuffer(devChunkSizes, stream);
-        int numChunks = (int) (devChunkSizes.getLength() / 8);
-        long[] result = new long[numChunks];
-        for (int i = 0; i < numChunks; i++) {
-          long size = hostbuf.getLong(i * 8L);
-          assert size < Integer.MAX_VALUE : "output size is too big";
-          result[i] = size;
-        }
-        return result;
-      }
-    }
-  }
-
-  // Stitch together the individual chunks into the result buffers.
-  // Each result buffer has metadata at the beginning, followed by compressed chunks.
-  // This is done by building up parallel lists of source addr, dest addr and size and
-  // then calling multiBufferCopyAsync()
-  private DeviceMemoryBuffer[] stitchOutput(int[] chunksPerInput,
-                                            DeviceMemoryBuffer compressedChunkSizes,
-                                            long[] outputChunkAddrs,
-                                            long[] outputChunkSizes,
-                                            Cuda.Stream stream) {
-    try (NvtxRange range = new NvtxRange("stitchOutput", NvtxColor.YELLOW)) {
-      final int numOutputs = chunksPerInput.length;
-      final long chunkSizesAddr = compressedChunkSizes.getAddress();
-      long[] outputBufferSizes = calcOutputBufferSizes(chunksPerInput, outputChunkSizes);
-      try (CloseableArray<DeviceMemoryBuffer> outputs =
-               CloseableArray.wrap(new DeviceMemoryBuffer[numOutputs])) {
-        // Each chunk needs to be copied, and each output needs a copy of the
-        // compressed chunk size vector representing the metadata.
-        final int totalBuffersToCopy = numOutputs + outputChunkAddrs.length;
-        long[] destAddrs = new long[totalBuffersToCopy];
-        long[] srcAddrs = new long[totalBuffersToCopy];
-        long[] sizes = new long[totalBuffersToCopy];
-        int copyBufferIdx = 0;
-        int chunkIdx = 0;
-        for (int outputIdx = 0; outputIdx < numOutputs; outputIdx++) {
-          DeviceMemoryBuffer outputBuffer = DeviceMemoryBuffer.allocate(outputBufferSizes[outputIdx]);
-          final long outputBufferAddr = outputBuffer.getAddress();
-          outputs.set(outputIdx, outputBuffer);
-          final long numChunks = chunksPerInput[outputIdx];
-          final long metadataSize = numChunks * METADATA_BYTES_PER_CHUNK;
-
-          // setup a copy of the metadata at the front of the output buffer
-          srcAddrs[copyBufferIdx] = chunkSizesAddr + chunkIdx * 8;
-          destAddrs[copyBufferIdx] = outputBufferAddr;
-          sizes[copyBufferIdx] = metadataSize;
-          ++copyBufferIdx;
-
-          // setup copies of the compressed chunks for this output buffer
-          long nextChunkAddr = outputBufferAddr + metadataSize;
-          for (int i = 0; i < numChunks; ++i) {
-            srcAddrs[copyBufferIdx] = outputChunkAddrs[chunkIdx];
-            destAddrs[copyBufferIdx] = nextChunkAddr;
-            final long chunkSize = outputChunkSizes[chunkIdx];
-            sizes[copyBufferIdx] = chunkSize;
-            copyBufferIdx++;
-            chunkIdx++;
-            nextChunkAddr += chunkSize;
-          }
-        }
-        assert copyBufferIdx == totalBuffersToCopy;
-        assert chunkIdx == outputChunkAddrs.length;
-        assert chunkIdx == outputChunkSizes.length;
-
-        Cuda.multiBufferCopyAsync(destAddrs, srcAddrs, sizes, stream);
-        return outputs.release();
-      }
-    }
+  @Override
+  protected long batchedCompressGetTempSize(long batchSize, long maxChunkSize) {
+    return NvcompJni.batchedLZ4CompressGetTempSize(batchSize, maxChunkSize);
   }
 
-  // Calculate the list of sizes for each output buffer (metadata plus size of compressed chunks)
-  private long[] calcOutputBufferSizes(int[] chunksPerInput,
-                                       long[] outputChunkSizes) {
-    long[] sizes = new long[chunksPerInput.length];
-    int chunkIdx = 0;
-    for (int i = 0; i < sizes.length; i++) {
-      final int chunksInBuffer = chunksPerInput[i];
-      final int chunkEndIdx = chunkIdx + chunksInBuffer;
-      // metadata stored in front of compressed data
-      long bufferSize = METADATA_BYTES_PER_CHUNK * chunksInBuffer;
-      // add in the compressed chunk sizes to get the total size
-      while (chunkIdx < chunkEndIdx) {
-        bufferSize += outputChunkSizes[chunkIdx++];
-      }
-      sizes[i] = bufferSize;
-    }
-    assert chunkIdx == outputChunkSizes.length;
-    return sizes;
+  @Override
+  protected void batchedCompressAsync(long devInPtrs, long devInSizes, long chunkSize,
+      long batchSize, long tempPtr, long tempSize, long devOutPtrs,
+      long compressedSizesOutPtr, long stream) {
+    NvcompJni.batchedLZ4CompressAsync(devInPtrs, devInSizes, chunkSize, batchSize,
+        tempPtr, tempSize, devOutPtrs, compressedSizesOutPtr, stream);
   }
 }
diff --git a/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedLZ4Decompressor.java b/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedLZ4Decompressor.java
index 40ad4d5e9ed..d78d537ea13 100644
--- a/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedLZ4Decompressor.java
+++ b/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedLZ4Decompressor.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,18 +16,15 @@
 
 package ai.rapids.cudf.nvcomp;
 
-import ai.rapids.cudf.CloseableArray;
 import ai.rapids.cudf.Cuda;
 import ai.rapids.cudf.BaseDeviceMemoryBuffer;
-import ai.rapids.cudf.DeviceMemoryBuffer;
-import ai.rapids.cudf.HostMemoryBuffer;
-import ai.rapids.cudf.NvtxColor;
-import ai.rapids.cudf.NvtxRange;
-
-import java.util.Arrays;
 
 /** LZ4 decompressor that operates on multiple input buffers in a batch */
-public class BatchedLZ4Decompressor {
+public class BatchedLZ4Decompressor extends BatchedDecompressor {
+  public BatchedLZ4Decompressor(long chunkSize) {
+    super(chunkSize);
+  }
+
   /**
    * Asynchronously decompress a batch of buffers
    * @param chunkSize maximum uncompressed block size, must match value used during compression
@@ -35,165 +32,24 @@ public class BatchedLZ4Decompressor {
    * @param outputs output buffers that will contain the compressed results, each must be sized
    *                to the exact decompressed size of the corresponding input
    * @param stream CUDA stream to use
+   *
+   * Deprecated: Use the non-static version in the parent class instead.
    */
-  public static void decompressAsync(long chunkSize,
-                                     BaseDeviceMemoryBuffer[] origInputs,
-                                     BaseDeviceMemoryBuffer[] outputs,
-                                     Cuda.Stream stream) {
-    try (CloseableArray<BaseDeviceMemoryBuffer> inputs =
-             CloseableArray.wrap(Arrays.copyOf(origInputs, origInputs.length))) {
-      BatchedLZ4Compressor.validateChunkSize(chunkSize);
-      if (origInputs.length != outputs.length) {
-        throw new IllegalArgumentException("number of inputs must match number of outputs");
-      }
-      final int numInputs = inputs.size();
-      if (numInputs == 0) {
-        return;
-      }
-
-      int[] chunksPerInput = new int[numInputs];
-      long totalChunks = 0;
-      for (int i = 0; i < numInputs; i++) {
-        // use output size to determine number of chunks in the input, as the output buffer
-        // must be exactly sized to the uncompressed data
-        BaseDeviceMemoryBuffer buffer = outputs[i];
-        int numBufferChunks = getNumChunksInBuffer(chunkSize, buffer);
-        chunksPerInput[i] = numBufferChunks;
-        totalChunks += numBufferChunks;
-      }
-
-      final long tempBufferSize = NvcompJni.batchedLZ4DecompressGetTempSize(totalChunks, chunkSize);
-      try (DeviceMemoryBuffer devAddrsSizes =
-               buildAddrsSizesBuffer(chunkSize, totalChunks, inputs.getArray(), chunksPerInput,
-                   outputs, stream);
-           DeviceMemoryBuffer devTemp = DeviceMemoryBuffer.allocate(tempBufferSize)) {
-        // buffer containing addresses and sizes contains four vectors of longs in this order:
-        // - compressed chunk input addresses
-        // - chunk output buffer addresses
-        // - compressed chunk sizes
-        // - uncompressed chunk sizes
-        final long inputAddrsPtr = devAddrsSizes.getAddress();
-        final long outputAddrsPtr = inputAddrsPtr + totalChunks * 8;
-        final long inputSizesPtr = outputAddrsPtr + totalChunks * 8;
-        final long outputSizesPtr = inputSizesPtr + totalChunks * 8;
-        NvcompJni.batchedLZ4DecompressAsync(
-            inputAddrsPtr,
-            inputSizesPtr,
-            outputSizesPtr,
-            totalChunks,
-            devTemp.getAddress(),
-            devTemp.getLength(),
-            outputAddrsPtr,
-            stream.getStream());
-      }
-    }
+  public static void decompressAsync(long chunkSize, BaseDeviceMemoryBuffer[] origInputs,
+      BaseDeviceMemoryBuffer[] outputs, Cuda.Stream stream) {
+    new BatchedLZ4Decompressor(chunkSize).decompressAsync(origInputs, outputs, stream);
   }
 
-  private static int getNumChunksInBuffer(long chunkSize, BaseDeviceMemoryBuffer buffer) {
-    return (int) ((buffer.getLength() + chunkSize - 1) / chunkSize);
+  @Override
+  protected long batchedDecompressGetTempSize(long numChunks, long maxUncompressedChunkBytes) {
+    return NvcompJni.batchedLZ4DecompressGetTempSize(numChunks, maxUncompressedChunkBytes);
   }
 
-  /**
-   * Build a device memory buffer containing four vectors of longs in the following order:
-   * <ul>
-   *   <li>compressed chunk input addresses</li>
-   *   <li>uncompressed chunk output addresses</li>
-   *   <li>compressed chunk sizes</li>
-   *   <li>uncompressed chunk sizes</li>
-   * </ul>
-   * Each vector contains as many longs as the number of chunks being decompressed
-   * @param chunkSize maximum uncompressed size of a chunk
-   * @param totalChunks total number of chunks to be decompressed
-   * @param inputs device buffers containing the compressed data
-   * @param chunksPerInput number of compressed chunks per input buffer
-   * @param outputs device buffers that will hold the uncompressed output
-   * @param stream CUDA stream to use
-   * @return device buffer containing address and size vectors
-   */
-  private static DeviceMemoryBuffer buildAddrsSizesBuffer(long chunkSize,
-                                                          long totalChunks,
-                                                          BaseDeviceMemoryBuffer[] inputs,
-                                                          int[] chunksPerInput,
-                                                          BaseDeviceMemoryBuffer[] outputs,
-                                                          Cuda.Stream stream) {
-    final long totalBufferSize = totalChunks * 8L * 4L;
-    try (NvtxRange range = new NvtxRange("buildAddrSizesBuffer", NvtxColor.YELLOW)) {
-      try (HostMemoryBuffer metadata = fetchMetadata(totalChunks, inputs, chunksPerInput, stream);
-           HostMemoryBuffer hostAddrsSizes = HostMemoryBuffer.allocate(totalBufferSize);
-           DeviceMemoryBuffer devAddrsSizes = DeviceMemoryBuffer.allocate(totalBufferSize)) {
-        // Build four long vectors in the AddrsSizes buffer:
-        // - compressed input address (one per chunk)
-        // - uncompressed output address (one per chunk)
-        // - compressed input size (one per chunk)
-        // - uncompressed input size (one per chunk)
-        final long srcAddrsOffset = 0;
-        final long destAddrsOffset = srcAddrsOffset + totalChunks * 8L;
-        final long srcSizesOffset = destAddrsOffset + totalChunks * 8L;
-        final long destSizesOffset = srcSizesOffset + totalChunks * 8L;
-        long chunkIdx = 0;
-        for (int inputIdx = 0; inputIdx < inputs.length; inputIdx++) {
-          final BaseDeviceMemoryBuffer input = inputs[inputIdx];
-          final BaseDeviceMemoryBuffer output = outputs[inputIdx];
-          final int numChunksInInput = chunksPerInput[inputIdx];
-          long srcAddr = input.getAddress() +
-              BatchedLZ4Compressor.METADATA_BYTES_PER_CHUNK * numChunksInInput;
-          long destAddr = output.getAddress();
-          final long chunkIdxEnd = chunkIdx + numChunksInInput;
-          while (chunkIdx < chunkIdxEnd) {
-            final long srcChunkSize = metadata.getLong(chunkIdx * 8);
-            final long destChunkSize = (chunkIdx < chunkIdxEnd - 1) ? chunkSize
-                : output.getAddress() + output.getLength() - destAddr;
-            hostAddrsSizes.setLong(srcAddrsOffset + chunkIdx * 8, srcAddr);
-            hostAddrsSizes.setLong(destAddrsOffset + chunkIdx * 8, destAddr);
-            hostAddrsSizes.setLong(srcSizesOffset + chunkIdx * 8, srcChunkSize);
-            hostAddrsSizes.setLong(destSizesOffset + chunkIdx * 8, destChunkSize);
-            srcAddr += srcChunkSize;
-            destAddr += destChunkSize;
-            ++chunkIdx;
-          }
-        }
-        devAddrsSizes.copyFromHostBuffer(hostAddrsSizes, stream);
-        devAddrsSizes.incRefCount();
-        return devAddrsSizes;
-      }
-    }
+  @Override
+  protected void batchedDecompressAsync(long devInPtrs, long devInSizes, long devOutSizes,
+      long batchSize, long tempPtr, long tempSize, long devOutPtrs, long stream) {
+    NvcompJni.batchedLZ4DecompressAsync(devInPtrs, devInSizes, devOutSizes, batchSize, tempPtr,
+        tempSize, devOutPtrs, stream);
   }
 
-  /**
-   * Fetch the metadata at the front of each input in a single, contiguous host buffer.
-   * @param totalChunks total number of compressed chunks
-   * @param inputs buffers containing the compressed data
-   * @param chunksPerInput number of compressed chunks for the corresponding input
-   * @param stream CUDA stream to use
-   * @return host buffer containing all of the metadata
-   */
-  private static HostMemoryBuffer fetchMetadata(long totalChunks,
-                                                BaseDeviceMemoryBuffer[] inputs,
-                                                int[] chunksPerInput,
-                                                Cuda.Stream stream) {
-    try (NvtxRange range = new NvtxRange("fetchMetadata", NvtxColor.PURPLE)) {
-      // one long per chunk containing the compressed size
-      final long totalMetadataSize = totalChunks * BatchedLZ4Compressor.METADATA_BYTES_PER_CHUNK;
-      // Build corresponding vectors of destination addresses, source addresses and sizes.
-      long[] destAddrs = new long[inputs.length];
-      long[] srcAddrs = new long[inputs.length];
-      long[] sizes = new long[inputs.length];
-      try (HostMemoryBuffer hostMetadata = HostMemoryBuffer.allocate(totalMetadataSize);
-           DeviceMemoryBuffer devMetadata = DeviceMemoryBuffer.allocate(totalMetadataSize)) {
-        long destCopyAddr = devMetadata.getAddress();
-        for (int inputIdx = 0; inputIdx < inputs.length; inputIdx++) {
-          final BaseDeviceMemoryBuffer input = inputs[inputIdx];
-          final long copySize = chunksPerInput[inputIdx] * BatchedLZ4Compressor.METADATA_BYTES_PER_CHUNK;
-          destAddrs[inputIdx] = destCopyAddr;
-          srcAddrs[inputIdx] = input.getAddress();
-          sizes[inputIdx] = copySize;
-          destCopyAddr += copySize;
-        }
-        Cuda.multiBufferCopyAsync(destAddrs, srcAddrs, sizes, stream);
-        hostMetadata.copyFromDeviceBuffer(devMetadata, stream);
-        hostMetadata.incRefCount();
-        return hostMetadata;
-      }
-    }
-  }
 }
diff --git a/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedZstdCompressor.java b/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedZstdCompressor.java
new file mode 100644
index 00000000000..0532b4aa86d
--- /dev/null
+++ b/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedZstdCompressor.java
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package ai.rapids.cudf.nvcomp;
+
+/** Multi-buffer ZSTD compressor */
+public class BatchedZstdCompressor extends BatchedCompressor {
+  /**
+   * Construct a batched ZSTD compressor instance
+   * @param chunkSize maximum amount of uncompressed data to compress as a single chunk.
+   *                  Inputs larger than this will be compressed in multiple chunks.
+   * @param maxIntermediateBufferSize desired maximum size of intermediate device buffers
+   *                                  used during compression.
+   */
+  public BatchedZstdCompressor(long chunkSize, long maxIntermediateBufferSize) {
+    super(chunkSize, NvcompJni.batchedZstdCompressGetMaxOutputChunkSize(chunkSize),
+        maxIntermediateBufferSize);
+  }
+
+  @Override
+  protected long batchedCompressGetTempSize(long batchSize, long maxChunkSize) {
+    return NvcompJni.batchedZstdCompressGetTempSize(batchSize, maxChunkSize);
+  }
+
+  @Override
+  protected void batchedCompressAsync(long devInPtrs, long devInSizes, long chunkSize,
+      long batchSize, long tempPtr, long tempSize, long devOutPtrs,
+      long compressedSizesOutPtr, long stream) {
+    NvcompJni.batchedZstdCompressAsync(devInPtrs, devInSizes, chunkSize, batchSize,
+        tempPtr, tempSize, devOutPtrs, compressedSizesOutPtr, stream);
+  }
+}
diff --git a/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedZstdDecompressor.java b/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedZstdDecompressor.java
new file mode 100644
index 00000000000..ba11a236834
--- /dev/null
+++ b/java/src/main/java/ai/rapids/cudf/nvcomp/BatchedZstdDecompressor.java
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package ai.rapids.cudf.nvcomp;
+
+/** ZSTD decompressor that operates on multiple input buffers in a batch */
+public class BatchedZstdDecompressor extends BatchedDecompressor {
+  public BatchedZstdDecompressor(long chunkSize) {
+    super(chunkSize);
+  }
+
+  @Override
+  protected long batchedDecompressGetTempSize(long numChunks, long maxUncompressedChunkBytes) {
+    return NvcompJni.batchedZstdDecompressGetTempSize(numChunks, maxUncompressedChunkBytes);
+  }
+
+  @Override
+  protected void batchedDecompressAsync(long devInPtrs, long devInSizes, long devOutSizes,
+      long batchSize, long tempPtr, long tempSize, long devOutPtrs, long stream) {
+    NvcompJni.batchedZstdDecompressAsync(devInPtrs, devInSizes, devOutSizes, batchSize, tempPtr,
+        tempSize, devOutPtrs, stream);
+  }
+
+}
diff --git a/java/src/main/java/ai/rapids/cudf/nvcomp/NvcompJni.java b/java/src/main/java/ai/rapids/cudf/nvcomp/NvcompJni.java
index 57094008c08..1a21629a208 100644
--- a/java/src/main/java/ai/rapids/cudf/nvcomp/NvcompJni.java
+++ b/java/src/main/java/ai/rapids/cudf/nvcomp/NvcompJni.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,6 +24,7 @@ class NvcompJni {
     NativeDepsLoader.loadNativeDeps();
   }
 
+  // For lz4
   /**
    * Get the temporary workspace size required to perform compression of entire LZ4 batch.
    * @param batchSize number of chunks in the batch
@@ -114,4 +115,97 @@ static native void batchedLZ4GetDecompressSizeAsync(
       long devOutSizes,
       long batchSize,
       long stream);
+
+  // For zstd
+  /**
+   * Get the temporary workspace size required to perform compression of entire zstd batch.
+   * @param batchSize number of chunks in the batch
+   * @param maxChunkSize maximum size of an uncompressed chunk in bytes
+   * @return The size of required temporary workspace in bytes to compress the batch.
+   */
+  static native long batchedZstdCompressGetTempSize(long batchSize, long maxChunkSize);
+
+  /**
+   * Get the maximum size any chunk could compress to in a ZSTD batch. This is the minimum
+   * amount of output memory to allocate per chunk when batch compressing.
+   * @param maxChunkSize maximum size of an uncompressed chunk size in bytes
+   * @return maximum compressed output size of a chunk
+   */
+  static native long batchedZstdCompressGetMaxOutputChunkSize(long maxChunkSize);
+
+  /**
+   * Asynchronously compress a batch of buffers with ZSTD. Note that
+   * compressedSizesOutPtr must point to pinned memory for this operation
+   * to be asynchronous.
+   * @param devInPtrs device address of uncompressed buffer addresses vector
+   * @param devInSizes device address of uncompressed buffer sizes vector
+   * @param chunkSize maximum size of an uncompressed chunk in bytes
+   * @param batchSize number of chunks in the batch
+   * @param tempPtr device address of the temporary workspace buffer
+   * @param tempSize size of the temporary workspace buffer in bytes
+   * @param devOutPtrs device address of output buffer addresses vector
+   * @param compressedSizesOutPtr device address where to write the sizes of the
+   *                              compressed data written to the corresponding
+   *                              output buffers. Must point to a buffer with
+   *                              at least 8 bytes of memory per output buffer
+   *                              in the batch.
+   * @param stream CUDA stream to use
+   */
+  static native void batchedZstdCompressAsync(
+      long devInPtrs,
+      long devInSizes,
+      long chunkSize,
+      long batchSize,
+      long tempPtr,
+      long tempSize,
+      long devOutPtrs,
+      long compressedSizesOutPtr,
+      long stream);
+
+  /**
+   * Computes the temporary storage size in bytes needed to decompress a
+   * ZSTD-compressed batch.
+   * @param numChunks number of chunks in the batch
+   * @param maxUncompressedChunkBytes maximum uncompressed size of any chunk in bytes
+   * @return number of temporary storage bytes needed to decompress the batch
+   */
+  static native long batchedZstdDecompressGetTempSize(
+      long numChunks,
+      long maxUncompressedChunkBytes);
+
+  /**
+   * Asynchronously decompress a batch of ZSTD-compressed data buffers.
+   * @param devInPtrs device address of compressed input buffer addresses vector
+   * @param devInSizes device address of compressed input buffer sizes vector
+   * @param devOutSizes device address of uncompressed buffer sizes vector
+   * @param batchSize number of buffers in the batch
+   * @param tempPtr device address of the temporary decompression space
+   * @param tempSize size of the temporary decompression space in bytes
+   * @param devOutPtrs device address of uncompressed output buffer addresses vector
+   * @param stream CUDA stream to use
+   */
+  static native void batchedZstdDecompressAsync(
+      long devInPtrs,
+      long devInSizes,
+      long devOutSizes,
+      long batchSize,
+      long tempPtr,
+      long tempSize,
+      long devOutPtrs,
+      long stream);
+
+  /**
+   * Asynchronously calculates the decompressed size needed for each chunk.
+   * @param devInPtrs device address of compressed input buffer addresses vector
+   * @param devInSizes device address of compressed input buffer sizes vector
+   * @param devOutSizes device address of calculated decompress sizes vector
+   * @param batchSize number of buffers in the batch
+   * @param stream CUDA stream to use
+   */
+  static native void batchedZstdGetDecompressSizeAsync(
+      long devInPtrs,
+      long devInSizes,
+      long devOutSizes,
+      long batchSize,
+      long stream);
 }
diff --git a/java/src/main/native/src/NvcompJni.cpp b/java/src/main/native/src/NvcompJni.cpp
index 47a24653549..8937438e922 100644
--- a/java/src/main/native/src/NvcompJni.cpp
+++ b/java/src/main/native/src/NvcompJni.cpp
@@ -20,6 +20,7 @@
 
 #include <nvcomp.h>
 #include <nvcomp/lz4.h>
+#include <nvcomp/zstd.h>
 
 namespace {
 
@@ -57,6 +58,7 @@ void check_nvcomp_status(JNIEnv* env, nvcompStatus_t status)
 
 extern "C" {
 
+// methods for lz4
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4CompressGetTempSize(
   JNIEnv* env, jclass, jlong j_batch_size, jlong j_max_chunk_size)
 {
@@ -211,4 +213,158 @@ Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedLZ4GetDecompressSizeAsync(JNIEnv* en
   CATCH_STD(env, );
 }
 
+// methods for zstd
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedZstdCompressGetTempSize(
+  JNIEnv* env, jclass, jlong j_batch_size, jlong j_max_chunk_size)
+{
+  try {
+    cudf::jni::auto_set_device(env);
+    auto batch_size       = static_cast<std::size_t>(j_batch_size);
+    auto max_chunk_size   = static_cast<std::size_t>(j_max_chunk_size);
+    std::size_t temp_size = 0;
+    auto status           = nvcompBatchedZstdCompressGetTempSize(
+      batch_size, max_chunk_size, nvcompBatchedZstdDefaultOpts, &temp_size);
+    check_nvcomp_status(env, status);
+    return static_cast<jlong>(temp_size);
+  }
+  CATCH_STD(env, 0);
+}
+
+JNIEXPORT jlong JNICALL
+Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedZstdCompressGetMaxOutputChunkSize(
+  JNIEnv* env, jclass, jlong j_max_chunk_size)
+{
+  try {
+    cudf::jni::auto_set_device(env);
+    auto max_chunk_size         = static_cast<std::size_t>(j_max_chunk_size);
+    std::size_t max_output_size = 0;
+    auto status                 = nvcompBatchedZstdCompressGetMaxOutputChunkSize(
+      max_chunk_size, nvcompBatchedZstdDefaultOpts, &max_output_size);
+    check_nvcomp_status(env, status);
+    return static_cast<jlong>(max_output_size);
+  }
+  CATCH_STD(env, 0);
+}
+
+JNIEXPORT void JNICALL
+Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedZstdCompressAsync(JNIEnv* env,
+                                                              jclass,
+                                                              jlong j_in_ptrs,
+                                                              jlong j_in_sizes,
+                                                              jlong j_chunk_size,
+                                                              jlong j_batch_size,
+                                                              jlong j_temp_ptr,
+                                                              jlong j_temp_size,
+                                                              jlong j_out_ptrs,
+                                                              jlong j_compressed_sizes_out_ptr,
+                                                              jlong j_stream)
+{
+  try {
+    cudf::jni::auto_set_device(env);
+    auto in_ptrs              = reinterpret_cast<void const* const*>(j_in_ptrs);
+    auto in_sizes             = reinterpret_cast<std::size_t const*>(j_in_sizes);
+    auto chunk_size           = static_cast<std::size_t>(j_chunk_size);
+    auto batch_size           = static_cast<std::size_t>(j_batch_size);
+    auto temp_ptr             = reinterpret_cast<void*>(j_temp_ptr);
+    auto temp_size            = static_cast<std::size_t>(j_temp_size);
+    auto out_ptrs             = reinterpret_cast<void* const*>(j_out_ptrs);
+    auto compressed_out_sizes = reinterpret_cast<std::size_t*>(j_compressed_sizes_out_ptr);
+    auto stream               = reinterpret_cast<cudaStream_t>(j_stream);
+    auto status               = nvcompBatchedZstdCompressAsync(in_ptrs,
+                                                 in_sizes,
+                                                 chunk_size,
+                                                 batch_size,
+                                                 temp_ptr,
+                                                 temp_size,
+                                                 out_ptrs,
+                                                 compressed_out_sizes,
+                                                 nvcompBatchedZstdDefaultOpts,
+                                                 stream);
+    check_nvcomp_status(env, status);
+  }
+  CATCH_STD(env, );
+}
+
+JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedZstdDecompressGetTempSize(
+  JNIEnv* env, jclass, jlong j_batch_size, jlong j_chunk_size)
+{
+  try {
+    cudf::jni::auto_set_device(env);
+    auto batch_size       = static_cast<std::size_t>(j_batch_size);
+    auto chunk_size       = static_cast<std::size_t>(j_chunk_size);
+    std::size_t temp_size = 0;
+    auto status = nvcompBatchedZstdDecompressGetTempSize(batch_size, chunk_size, &temp_size);
+    check_nvcomp_status(env, status);
+    return static_cast<jlong>(temp_size);
+  }
+  CATCH_STD(env, 0);
+}
+
+JNIEXPORT void JNICALL
+Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedZstdDecompressAsync(JNIEnv* env,
+                                                                jclass,
+                                                                jlong j_in_ptrs,
+                                                                jlong j_in_sizes,
+                                                                jlong j_out_sizes,
+                                                                jlong j_batch_size,
+                                                                jlong j_temp_ptr,
+                                                                jlong j_temp_size,
+                                                                jlong j_out_ptrs,
+                                                                jlong j_stream)
+{
+  try {
+    cudf::jni::auto_set_device(env);
+    auto compressed_ptrs           = reinterpret_cast<void const* const*>(j_in_ptrs);
+    auto compressed_sizes          = reinterpret_cast<std::size_t const*>(j_in_sizes);
+    auto uncompressed_sizes        = reinterpret_cast<std::size_t const*>(j_out_sizes);
+    auto batch_size                = static_cast<std::size_t>(j_batch_size);
+    auto temp_ptr                  = reinterpret_cast<void*>(j_temp_ptr);
+    auto temp_size                 = static_cast<std::size_t>(j_temp_size);
+    auto uncompressed_ptrs         = reinterpret_cast<void* const*>(j_out_ptrs);
+    auto stream                    = reinterpret_cast<cudaStream_t>(j_stream);
+    auto uncompressed_statuses     = rmm::device_uvector<nvcompStatus_t>(batch_size, stream);
+    auto actual_uncompressed_sizes = rmm::device_uvector<std::size_t>(batch_size, stream);
+    auto status                    = nvcompBatchedZstdDecompressAsync(compressed_ptrs,
+                                                   compressed_sizes,
+                                                   uncompressed_sizes,
+                                                   actual_uncompressed_sizes.data(),
+                                                   batch_size,
+                                                   temp_ptr,
+                                                   temp_size,
+                                                   uncompressed_ptrs,
+                                                   uncompressed_statuses.data(),
+                                                   stream);
+    check_nvcomp_status(env, status);
+    if (!cudf::java::check_nvcomp_output_sizes(
+          uncompressed_sizes, actual_uncompressed_sizes.data(), batch_size, stream)) {
+      cudf::jni::throw_java_exception(
+        env, NVCOMP_ERROR_CLASS, "nvcomp decompress output size mismatch");
+    }
+  }
+  CATCH_STD(env, );
+}
+
+JNIEXPORT void JNICALL
+Java_ai_rapids_cudf_nvcomp_NvcompJni_batchedZstdGetDecompressSizeAsync(JNIEnv* env,
+                                                                       jclass,
+                                                                       jlong j_in_ptrs,
+                                                                       jlong j_in_sizes,
+                                                                       jlong j_out_sizes,
+                                                                       jlong j_batch_size,
+                                                                       jlong j_stream)
+{
+  try {
+    cudf::jni::auto_set_device(env);
+    auto compressed_ptrs    = reinterpret_cast<void const* const*>(j_in_ptrs);
+    auto compressed_sizes   = reinterpret_cast<std::size_t const*>(j_in_sizes);
+    auto uncompressed_sizes = reinterpret_cast<std::size_t*>(j_out_sizes);
+    auto batch_size         = static_cast<std::size_t>(j_batch_size);
+    auto stream             = reinterpret_cast<cudaStream_t>(j_stream);
+    auto status             = nvcompBatchedZstdGetDecompressSizeAsync(
+      compressed_ptrs, compressed_sizes, uncompressed_sizes, batch_size, stream);
+    check_nvcomp_status(env, status);
+  }
+  CATCH_STD(env, );
+}
+
 }  // extern "C"
diff --git a/java/src/test/java/ai/rapids/cudf/nvcomp/NvcompTest.java b/java/src/test/java/ai/rapids/cudf/nvcomp/NvcompTest.java
index 66f4fe39109..4e8fc225257 100644
--- a/java/src/test/java/ai/rapids/cudf/nvcomp/NvcompTest.java
+++ b/java/src/test/java/ai/rapids/cudf/nvcomp/NvcompTest.java
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,18 +23,29 @@
 import org.slf4j.LoggerFactory;
 
 import java.util.Arrays;
-import java.util.Optional;
 
 public class NvcompTest {
   private static final HostMemoryAllocator hostMemoryAllocator = DefaultHostMemoryAllocator.get();
 
   private static final Logger log = LoggerFactory.getLogger(ColumnVector.class);
 
+  private final long chunkSize = 64 * 1024;
+  private final long targetIntermediteSize = Long.MAX_VALUE;
+
   @Test
   void testBatchedLZ4RoundTripAsync() {
+    testBatchedRoundTripAsync(new BatchedLZ4Compressor(chunkSize, targetIntermediteSize),
+        new BatchedLZ4Decompressor(chunkSize));
+  }
+
+  @Test
+  void testBatchedZstdRoundTripAsync() {
+    testBatchedRoundTripAsync(new BatchedZstdCompressor(chunkSize, targetIntermediteSize),
+        new BatchedZstdDecompressor(chunkSize));
+  }
+
+  void testBatchedRoundTripAsync(BatchedCompressor comp, BatchedDecompressor decomp) {
     final Cuda.Stream stream = Cuda.DEFAULT_STREAM;
-    final long chunkSize = 64 * 1024;
-    final long targetIntermediteSize = Long.MAX_VALUE;
     final int maxElements = 1024 * 1024 + 1;
     final int numBuffers = 200;
     long[] data = new long[maxElements];
@@ -52,10 +63,8 @@ void testBatchedLZ4RoundTripAsync() {
       }
 
       // compress and decompress the buffers
-      BatchedLZ4Compressor compressor = new BatchedLZ4Compressor(chunkSize, targetIntermediteSize);
-
       try (CloseableArray<DeviceMemoryBuffer> compressedBuffers =
-               CloseableArray.wrap(compressor.compress(originalBuffers.getArray(), stream));
+               CloseableArray.wrap(comp.compress(originalBuffers.getArray(), stream));
            CloseableArray<DeviceMemoryBuffer> uncompressedBuffers =
                CloseableArray.wrap(new DeviceMemoryBuffer[numBuffers])) {
         for (int i = 0; i < numBuffers; i++) {
@@ -64,8 +73,8 @@ void testBatchedLZ4RoundTripAsync() {
         }
 
         // decompress takes ownership of the compressed buffers and will close them
-        BatchedLZ4Decompressor.decompressAsync(chunkSize, compressedBuffers.release(),
-            uncompressedBuffers.getArray(), stream);
+        decomp.decompressAsync(compressedBuffers.release(), uncompressedBuffers.getArray(),
+            stream);
 
         // check the decompressed results against the original
         for (int i = 0; i < numBuffers; ++i) {

From 4069c8223e2131130295f488ce363af82ead4be5 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 14 May 2024 06:39:55 -1000
Subject: [PATCH 202/842] Validate and materialize iterators earlier in
 as_column (#15739)

closes #8796

I left a `TODO` in `as_column` to validate earlier that `arbitrary` is an iterable or sequence like if it wasn't a recognized array like (e.g. numpy array, pandas object, etc). Additionally, ensure we materialize iterators since there are some checks that would exhaust the object

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15739
---
 python/cudf/cudf/core/column/column.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 3754ed1392e..371c91dd96f 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -2070,8 +2070,15 @@ def as_column(
         except (ValueError, TypeError):
             arbitrary = np.asarray(arbitrary)
         return as_column(arbitrary, dtype=dtype, nan_as_null=nan_as_null)
+    elif not isinstance(arbitrary, (abc.Iterable, abc.Sequence)):
+        raise TypeError(
+            f"{type(arbitrary).__name__} must be an iterable or sequence."
+        )
+    elif isinstance(arbitrary, abc.Iterator):
+        arbitrary = list(arbitrary)
+
     # Start of arbitrary that's not handed above but dtype provided
-    elif isinstance(dtype, pd.DatetimeTZDtype):
+    if isinstance(dtype, pd.DatetimeTZDtype):
         raise NotImplementedError(
             "Use `tz_localize()` to construct timezone aware data."
         )
@@ -2127,11 +2134,7 @@ def as_column(
                 return cudf.core.column.ListColumn.from_sequences(arbitrary)
             raise
         return as_column(data, nan_as_null=nan_as_null)
-    elif not isinstance(arbitrary, (abc.Iterable, abc.Sequence)):
-        # TODO: This validation should probably be done earlier?
-        raise TypeError(
-            f"{type(arbitrary).__name__} must be an iterable or sequence."
-        )
+
     from_pandas = nan_as_null is None or nan_as_null
     if dtype is not None:
         dtype = cudf.dtype(dtype)
@@ -2147,7 +2150,6 @@ def as_column(
             arbitrary = pd.Series(arbitrary, dtype=dtype)
         return as_column(arbitrary, nan_as_null=nan_as_null, dtype=dtype)
     else:
-        arbitrary = list(arbitrary)
         for element in arbitrary:
             # Carve-outs that cannot be parsed by pyarrow/pandas
             if is_column_like(element):

From cbe277568c42c122a69fdad012e98580a0bb3d71 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Tue, 14 May 2024 12:20:26 -0500
Subject: [PATCH 203/842] Fix `Index.repeat` for `datetime64` types (#15722)

Fixes: #15720

This PR fixes `Index.repeat` where the `freq` of `DatetimeIndex` needs to be reset.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/15722
---
 python/cudf/cudf/core/index.py         |  5 +++++
 python/cudf/cudf/core/indexed_frame.py |  5 ++++-
 python/cudf/cudf/tests/test_index.py   | 14 ++++++++++++++
 3 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 0710f0f5c42..209e582e5d6 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -2373,6 +2373,11 @@ def tz_convert(self, tz: str | None):
         result_col = self._column.tz_convert(tz)
         return DatetimeIndex._from_data({self.name: result_col})
 
+    def repeat(self, repeats, axis=None):
+        res = super().repeat(repeats, axis=axis)
+        res._freq = None
+        return res
+
 
 class TimedeltaIndex(Index):
     """
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index dc261707867..904cd0c69c2 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -4871,13 +4871,16 @@ def repeat(self, repeats, axis=None):
         1    2
         dtype: int64
         """
-        return self._from_columns_like_self(
+        res = self._from_columns_like_self(
             Frame._repeat(
                 [*self._index._data.columns, *self._columns], repeats, axis
             ),
             self._column_names,
             self._index_names,
         )
+        if isinstance(res.index, cudf.DatetimeIndex):
+            res.index._freq = None
+        return res
 
     def astype(
         self,
diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py
index 8b7ee1dccf8..8e7532d044d 100644
--- a/python/cudf/cudf/tests/test_index.py
+++ b/python/cudf/cudf/tests/test_index.py
@@ -3252,3 +3252,17 @@ def test_Index_init_with_nans():
     assert gi.dtype == np.dtype("float64")
     pi = pd.Index([1, 2, 3, np.nan])
     assert_eq(pi, gi)
+
+
+def test_index_datetime_repeat():
+    gidx = cudf.date_range("2021-01-01", periods=3, freq="D")
+    pidx = gidx.to_pandas()
+
+    actual = gidx.repeat(5)
+    expected = pidx.repeat(5)
+
+    assert_eq(actual, expected)
+
+    actual = gidx.to_frame().repeat(5)
+
+    assert_eq(actual.index, expected)

From 2fb8efb38a71490d0ebaaa4f4fea37591cf02917 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Tue, 14 May 2024 10:52:31 -0700
Subject: [PATCH 204/842] Migrate all cpp pxd files into pylibcudf (#15740)

This PR is a mass migration of all the Cython headers exposing libcudf to Cython into the pylibcudf subpackage. This will facilitate splitting out pylibcudf from cudf, and it should also allow us to do some cleanups sooner than that with respect to our imports since this preempts any concerns with circular imports (cudf->pylibcudf->cudf._lib.cpp).

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15740
---
 python/cudf/cudf/_lib/CMakeLists.txt          |  1 -
 python/cudf/cudf/_lib/avro.pyx                | 10 +++---
 python/cudf/cudf/_lib/column.pxd              | 11 ++++---
 python/cudf/cudf/_lib/column.pyx              | 20 +++++------
 python/cudf/cudf/_lib/copying.pxd             |  4 +--
 python/cudf/cudf/_lib/copying.pyx             | 16 +++++----
 .../cudf/_lib/cpp/lists/count_elements.pxd    | 10 ------
 python/cudf/cudf/_lib/cpp/lists/explode.pxd   | 14 --------
 python/cudf/cudf/_lib/cpp/strings/extract.pxd | 15 ---------
 python/cudf/cudf/_lib/cpp/strings/findall.pxd | 14 --------
 python/cudf/cudf/_lib/cpp/strings/strip.pxd   | 16 ---------
 python/cudf/cudf/_lib/cpp/strings/wrap.pxd    | 14 --------
 python/cudf/cudf/_lib/csv.pyx                 | 16 ++++-----
 python/cudf/cudf/_lib/datetime.pyx            | 12 +++----
 python/cudf/cudf/_lib/expressions.pxd         |  4 +--
 python/cudf/cudf/_lib/expressions.pyx         |  9 +++--
 python/cudf/cudf/_lib/groupby.pyx             |  4 +--
 python/cudf/cudf/_lib/hash.pyx                | 14 ++++----
 python/cudf/cudf/_lib/interop.pyx             |  6 ++--
 python/cudf/cudf/_lib/io/datasource.pxd       |  6 ++--
 python/cudf/cudf/_lib/io/datasource.pyx       |  6 ++--
 python/cudf/cudf/_lib/io/utils.pxd            | 10 ++++--
 python/cudf/cudf/_lib/io/utils.pyx            |  8 ++---
 python/cudf/cudf/_lib/json.pyx                | 22 ++++++-------
 python/cudf/cudf/_lib/labeling.pyx            | 11 ++++---
 python/cudf/cudf/_lib/lists.pyx               | 33 ++++++++++++-------
 python/cudf/cudf/_lib/null_mask.pyx           | 10 +++---
 .../cudf/_lib/nvtext/byte_pair_encode.pyx     | 10 +++---
 .../cudf/cudf/_lib/nvtext/edit_distance.pyx   |  8 ++---
 .../cudf/cudf/_lib/nvtext/generate_ngrams.pyx | 12 +++----
 python/cudf/cudf/_lib/nvtext/jaccard.pyx      | 12 ++++---
 python/cudf/cudf/_lib/nvtext/minhash.pyx      | 10 +++---
 .../cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx | 12 +++----
 python/cudf/cudf/_lib/nvtext/normalize.pyx    |  8 ++---
 python/cudf/cudf/_lib/nvtext/replace.pyx      | 12 +++----
 python/cudf/cudf/_lib/nvtext/stemmer.pyx      | 10 +++---
 .../cudf/_lib/nvtext/subword_tokenize.pyx     |  6 ++--
 python/cudf/cudf/_lib/nvtext/tokenize.pyx     | 12 +++----
 python/cudf/cudf/_lib/orc.pyx                 | 28 ++++++++--------
 python/cudf/cudf/_lib/parquet.pyx             | 33 ++++++++++---------
 python/cudf/cudf/_lib/partitioning.pyx        | 14 ++++----
 .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt   |  1 +
 .../cudf/cudf/_lib/pylibcudf/aggregation.pxd  |  4 +--
 .../cudf/cudf/_lib/pylibcudf/aggregation.pyx  | 16 +++++----
 python/cudf/cudf/_lib/pylibcudf/binaryop.pxd  |  2 +-
 python/cudf/cudf/_lib/pylibcudf/binaryop.pyx  |  8 ++---
 python/cudf/cudf/_lib/pylibcudf/column.pxd    |  9 +++--
 python/cudf/cudf/_lib/pylibcudf/column.pyx    | 10 +++---
 .../cudf/cudf/_lib/pylibcudf/concatenate.pyx  | 10 +++---
 python/cudf/cudf/_lib/pylibcudf/copying.pxd   |  7 ++--
 python/cudf/cudf/_lib/pylibcudf/copying.pyx   | 28 +++++++++-------
 python/cudf/cudf/_lib/pylibcudf/filling.pxd   |  2 +-
 python/cudf/cudf/_lib/pylibcudf/filling.pyx   |  8 ++---
 python/cudf/cudf/_lib/pylibcudf/groupby.pxd   |  6 ++--
 python/cudf/cudf/_lib/pylibcudf/groupby.pyx   | 10 +++---
 python/cudf/cudf/_lib/pylibcudf/interop.pyx   | 11 ++++---
 python/cudf/cudf/_lib/pylibcudf/join.pxd      |  2 +-
 python/cudf/cudf/_lib/pylibcudf/join.pyx      | 11 +++++--
 .../{cpp => pylibcudf/libcudf}/CMakeLists.txt |  0
 .../{cpp => pylibcudf/libcudf}/__init__.pxd   |  0
 .../{cpp => pylibcudf/libcudf}/__init__.py    |  0
 .../libcudf}/aggregation.pxd                  |  2 +-
 .../libcudf}/aggregation.pyx                  |  0
 .../{cpp => pylibcudf/libcudf}/binaryop.pxd   |  8 ++---
 .../{cpp => pylibcudf/libcudf}/binaryop.pyx   |  0
 .../libcudf}/column/__init__.pxd              |  0
 .../libcudf}/column/__init__.py               |  0
 .../libcudf}/column/column.pxd                |  9 +++--
 .../libcudf}/column/column_factories.pxd      |  8 ++---
 .../libcudf}/column/column_view.pxd           |  8 +++--
 .../libcudf}/concatenate.pxd                  |  6 ++--
 .../libcudf}/contiguous_split.pxd             |  6 ++--
 .../{cpp => pylibcudf/libcudf}/copying.pxd    | 15 +++++----
 .../{cpp => pylibcudf/libcudf}/copying.pyx    |  0
 .../{cpp => pylibcudf/libcudf}/datetime.pxd   |  8 ++---
 .../libcudf}/expressions.pxd                  | 10 +++---
 .../{cpp => pylibcudf/libcudf}/filling.pxd    | 17 ++++++----
 .../{cpp => pylibcudf/libcudf}/groupby.pxd    | 18 +++++-----
 .../_lib/{cpp => pylibcudf/libcudf}/hash.pxd  |  6 ++--
 .../{cpp => pylibcudf/libcudf}/interop.pxd    |  8 ++---
 .../libcudf}/io/__init__.pxd                  |  0
 .../{cpp => pylibcudf/libcudf}/io/__init__.py |  0
 .../libcudf}/io/arrow_io_source.pxd           |  4 +--
 .../{cpp => pylibcudf/libcudf}/io/avro.pxd    |  6 ++--
 .../{cpp => pylibcudf/libcudf}/io/csv.pxd     |  8 ++---
 .../libcudf}/io/data_sink.pxd                 |  0
 .../libcudf}/io/datasource.pxd                |  0
 .../{cpp => pylibcudf/libcudf}/io/json.pxd    |  6 ++--
 .../{cpp => pylibcudf/libcudf}/io/orc.pxd     |  6 ++--
 .../libcudf}/io/orc_metadata.pxd              |  2 +-
 .../{cpp => pylibcudf/libcudf}/io/parquet.pxd |  8 ++---
 .../libcudf}/io/parquet_metadata.pxd          |  4 +--
 .../{cpp => pylibcudf/libcudf}/io/text.pxd    |  4 +--
 .../libcudf}/io/timezone.pxd                  |  4 +--
 .../{cpp => pylibcudf/libcudf}/io/types.pxd   | 12 +++----
 .../_lib/{cpp => pylibcudf/libcudf}/join.pxd  |  8 ++---
 .../{cpp => pylibcudf/libcudf}/labeling.pxd   |  6 ++--
 .../libcudf}/lists/__init__.pxd               |  0
 .../libcudf}/lists/__init__.py                |  0
 .../libcudf}/lists/combine.pxd                |  8 ++---
 .../libcudf}/lists/contains.pxd               | 12 ++++---
 .../libcudf/lists/count_elements.pxd          | 12 +++++++
 .../_lib/pylibcudf/libcudf/lists/explode.pxd  | 14 ++++++++
 .../libcudf}/lists/extract.pxd                | 10 +++---
 .../libcudf}/lists/gather.pxd                 |  8 +++--
 .../libcudf}/lists/lists_column_view.pxd      |  9 +++--
 .../libcudf}/lists/sorting.pxd                | 10 +++---
 .../libcudf}/lists/stream_compaction.pxd      | 10 +++---
 .../_lib/{cpp => pylibcudf/libcudf}/merge.pxd |  8 ++---
 .../{cpp => pylibcudf/libcudf}/null_mask.pxd  | 12 ++++---
 .../libcudf}/nvtext/__init__.pxd              |  0
 .../libcudf}/nvtext/__init__.py               |  0
 .../libcudf}/nvtext/byte_pair_encode.pxd      |  8 ++---
 .../libcudf}/nvtext/edit_distance.pxd         |  6 ++--
 .../libcudf}/nvtext/generate_ngrams.pxd       | 10 +++---
 .../libcudf}/nvtext/jaccard.pxd               |  8 ++---
 .../libcudf}/nvtext/minhash.pxd               |  8 ++---
 .../libcudf}/nvtext/ngrams_tokenize.pxd       | 10 +++---
 .../libcudf}/nvtext/normalize.pxd             |  6 ++--
 .../libcudf}/nvtext/replace.pxd               | 10 +++---
 .../libcudf}/nvtext/stemmer.pxd               |  8 ++---
 .../libcudf}/nvtext/subword_tokenize.pxd      |  6 ++--
 .../libcudf}/nvtext/tokenize.pxd              | 10 +++---
 .../libcudf}/partitioning.pxd                 | 12 +++----
 .../{cpp => pylibcudf/libcudf}/quantiles.pxd  | 12 +++----
 .../{cpp => pylibcudf/libcudf}/reduce.pxd     | 13 +++++---
 .../{cpp => pylibcudf/libcudf}/reduce.pyx     |  0
 .../{cpp => pylibcudf/libcudf}/replace.pxd    |  9 +++--
 .../{cpp => pylibcudf/libcudf}/replace.pyx    |  0
 .../{cpp => pylibcudf/libcudf}/reshape.pxd    | 10 +++---
 .../{cpp => pylibcudf/libcudf}/rolling.pxd    |  8 ++---
 .../_lib/{cpp => pylibcudf/libcudf}/round.pxd |  6 ++--
 .../libcudf}/scalar/__init__.pxd              |  0
 .../libcudf}/scalar/__init__.py               |  0
 .../libcudf}/scalar/scalar.pxd                | 10 +++---
 .../{cpp => pylibcudf/libcudf}/search.pxd     | 10 +++---
 .../{cpp => pylibcudf/libcudf}/sorting.pxd    | 12 +++----
 .../libcudf}/stream_compaction.pxd            | 10 +++---
 .../libcudf}/stream_compaction.pyx            |  0
 .../libcudf}/strings/__init__.pxd             |  0
 .../libcudf}/strings/__init__.py              |  0
 .../libcudf}/strings/attributes.pxd           |  6 ++--
 .../libcudf}/strings/capitalize.pxd           |  6 ++--
 .../libcudf}/strings/case.pxd                 |  6 ++--
 .../libcudf}/strings/char_types.pxd           |  8 ++---
 .../libcudf}/strings/combine.pxd              | 10 +++---
 .../libcudf}/strings/contains.pxd             | 10 +++---
 .../libcudf}/strings/convert/__init__.pxd     |  0
 .../libcudf}/strings/convert/__init__.py      |  0
 .../strings/convert/convert_booleans.pxd      |  8 ++---
 .../strings/convert/convert_datetime.pxd      |  8 ++---
 .../strings/convert/convert_durations.pxd     |  8 ++---
 .../strings/convert/convert_fixed_point.pxd   |  8 ++---
 .../strings/convert/convert_floats.pxd        |  8 ++---
 .../strings/convert/convert_integers.pxd      |  8 ++---
 .../libcudf}/strings/convert/convert_ipv4.pxd |  6 ++--
 .../strings/convert/convert_lists.pxd         |  8 ++---
 .../libcudf}/strings/convert/convert_urls.pxd |  6 ++--
 .../pylibcudf/libcudf/strings/extract.pxd     | 15 +++++++++
 .../libcudf}/strings/find.pxd                 |  8 ++---
 .../libcudf}/strings/find_multiple.pxd        |  6 ++--
 .../pylibcudf/libcudf/strings/findall.pxd     | 14 ++++++++
 .../libcudf}/strings/json.pxd                 |  8 ++---
 .../libcudf}/strings/padding.pxd              | 12 +++----
 .../libcudf}/strings/regex_flags.pxd          |  0
 .../libcudf}/strings/regex_program.pxd        |  4 +--
 .../libcudf}/strings/repeat.pxd               |  8 ++---
 .../libcudf}/strings/replace.pxd              | 10 +++---
 .../libcudf}/strings/replace_re.pxd           | 14 ++++----
 .../libcudf}/strings/side_type.pxd            |  0
 .../libcudf}/strings/split/__init__.pxd       |  0
 .../libcudf}/strings/split/__init__.py        |  0
 .../libcudf}/strings/split/partition.pxd      | 10 +++---
 .../libcudf}/strings/split/split.pxd          | 14 ++++----
 .../_lib/pylibcudf/libcudf/strings/strip.pxd  | 16 +++++++++
 .../libcudf}/strings/substring.pxd            | 10 +++---
 .../libcudf}/strings/translate.pxd            | 10 +++---
 .../_lib/pylibcudf/libcudf/strings/wrap.pxd   | 14 ++++++++
 .../libcudf}/strings_udf.pxd                  |  8 ++---
 .../libcudf}/table/__init__.pxd               |  0
 .../libcudf}/table/__init__.py                |  0
 .../libcudf}/table/table.pxd                  | 11 ++++---
 .../libcudf}/table/table_view.pxd             |  9 +++--
 .../{cpp => pylibcudf/libcudf}/transform.pxd  | 18 ++++++----
 .../{cpp => pylibcudf/libcudf}/transpose.pxd  |  6 ++--
 .../_lib/{cpp => pylibcudf/libcudf}/types.pxd |  0
 .../_lib/{cpp => pylibcudf/libcudf}/types.pyx |  0
 .../_lib/{cpp => pylibcudf/libcudf}/unary.pxd |  6 ++--
 .../_lib/{cpp => pylibcudf/libcudf}/unary.pyx |  0
 .../libcudf}/utilities/__init__.pxd           |  0
 .../libcudf}/utilities/__init__.py            |  0
 .../libcudf}/utilities/host_span.pxd          |  0
 .../libcudf}/wrappers/__init__.pxd            |  0
 .../libcudf}/wrappers/__init__.py             |  0
 .../libcudf}/wrappers/decimals.pxd            |  4 +--
 .../libcudf}/wrappers/durations.pxd           |  0
 .../libcudf}/wrappers/timestamps.pxd          |  0
 python/cudf/cudf/_lib/pylibcudf/lists.pxd     |  2 +-
 python/cudf/cudf/_lib/pylibcudf/lists.pyx     |  6 ++--
 python/cudf/cudf/_lib/pylibcudf/merge.pyx     |  8 ++---
 python/cudf/cudf/_lib/pylibcudf/reduce.pxd    |  2 +-
 python/cudf/cudf/_lib/pylibcudf/reduce.pyx    | 16 +++++----
 python/cudf/cudf/_lib/pylibcudf/replace.pxd   |  2 +-
 python/cudf/cudf/_lib/pylibcudf/replace.pyx   |  6 ++--
 python/cudf/cudf/_lib/pylibcudf/rolling.pxd   |  2 +-
 python/cudf/cudf/_lib/pylibcudf/rolling.pyx   |  8 ++---
 python/cudf/cudf/_lib/pylibcudf/scalar.pxd    |  2 +-
 python/cudf/cudf/_lib/pylibcudf/scalar.pyx    |  2 +-
 python/cudf/cudf/_lib/pylibcudf/search.pyx    |  6 ++--
 python/cudf/cudf/_lib/pylibcudf/sorting.pxd   |  9 +++--
 python/cudf/cudf/_lib/pylibcudf/sorting.pyx   | 10 +++---
 .../cudf/_lib/pylibcudf/stream_compaction.pxd |  6 ++--
 .../cudf/_lib/pylibcudf/stream_compaction.pyx | 16 +++++----
 .../cudf/cudf/_lib/pylibcudf/strings/case.pyx |  4 +--
 .../cudf/cudf/_lib/pylibcudf/strings/find.pxd |  2 +-
 .../cudf/cudf/_lib/pylibcudf/strings/find.pyx |  6 ++--
 python/cudf/cudf/_lib/pylibcudf/table.pxd     |  4 +--
 python/cudf/cudf/_lib/pylibcudf/table.pyx     |  6 ++--
 python/cudf/cudf/_lib/pylibcudf/types.pxd     |  2 +-
 python/cudf/cudf/_lib/pylibcudf/types.pyx     | 20 +++++------
 python/cudf/cudf/_lib/pylibcudf/unary.pxd     |  2 +-
 python/cudf/cudf/_lib/pylibcudf/unary.pyx     |  8 ++---
 python/cudf/cudf/_lib/pylibcudf/utils.pxd     |  4 +--
 python/cudf/cudf/_lib/pylibcudf/utils.pyx     |  4 +--
 python/cudf/cudf/_lib/quantiles.pyx           | 19 +++++++----
 python/cudf/cudf/_lib/reshape.pyx             | 12 +++----
 python/cudf/cudf/_lib/round.pyx               |  8 ++---
 python/cudf/cudf/_lib/scalar.pxd              |  2 +-
 python/cudf/cudf/_lib/scalar.pyx              | 18 +++++-----
 python/cudf/cudf/_lib/sort.pyx                | 10 +++---
 python/cudf/cudf/_lib/string_casting.pyx      | 20 +++++------
 python/cudf/cudf/_lib/strings/attributes.pyx  |  8 ++---
 python/cudf/cudf/_lib/strings/capitalize.pyx  |  8 ++---
 python/cudf/cudf/_lib/strings/char_types.pyx  | 10 +++---
 python/cudf/cudf/_lib/strings/combine.pyx     | 12 +++----
 python/cudf/cudf/_lib/strings/contains.pyx    | 14 ++++----
 .../strings/convert/convert_fixed_point.pyx   | 10 +++---
 .../_lib/strings/convert/convert_floats.pyx   |  8 ++---
 .../_lib/strings/convert/convert_integers.pyx |  8 ++---
 .../_lib/strings/convert/convert_lists.pyx    | 10 +++---
 .../_lib/strings/convert/convert_urls.pyx     |  8 ++---
 python/cudf/cudf/_lib/strings/extract.pyx     | 12 +++----
 python/cudf/cudf/_lib/strings/find.pyx        |  2 +-
 .../cudf/cudf/_lib/strings/find_multiple.pyx  |  8 ++---
 python/cudf/cudf/_lib/strings/findall.pyx     | 12 +++----
 python/cudf/cudf/_lib/strings/json.pyx        | 10 +++---
 python/cudf/cudf/_lib/strings/padding.pyx     | 15 +++++----
 python/cudf/cudf/_lib/strings/repeat.pyx      | 10 +++---
 python/cudf/cudf/_lib/strings/replace.pyx     | 12 +++----
 python/cudf/cudf/_lib/strings/replace_re.pyx  | 16 ++++-----
 .../cudf/_lib/strings/split/partition.pyx     | 10 +++---
 python/cudf/cudf/_lib/strings/split/split.pyx | 18 +++++-----
 python/cudf/cudf/_lib/strings/strip.pyx       | 12 +++----
 python/cudf/cudf/_lib/strings/substring.pyx   | 14 ++++----
 python/cudf/cudf/_lib/strings/translate.pyx   | 12 +++----
 python/cudf/cudf/_lib/strings/wrap.pyx        | 10 +++---
 python/cudf/cudf/_lib/strings_udf.pyx         |  8 ++---
 python/cudf/cudf/_lib/text.pyx                |  6 ++--
 python/cudf/cudf/_lib/timezone.pyx            |  6 ++--
 python/cudf/cudf/_lib/transform.pyx           | 21 +++++++-----
 python/cudf/cudf/_lib/transpose.pyx           |  8 ++---
 python/cudf/cudf/_lib/types.pxd               | 10 +++---
 python/cudf/cudf/_lib/types.pyx               |  8 +++--
 python/cudf/cudf/_lib/utils.pxd               |  4 +--
 python/cudf/cudf/_lib/utils.pyx               |  8 ++---
 python/cudf_kafka/cudf_kafka/_lib/kafka.pxd   |  4 +--
 python/cudf_kafka/cudf_kafka/_lib/kafka.pyx   |  4 +--
 267 files changed, 1134 insertions(+), 995 deletions(-)
 delete mode 100644 python/cudf/cudf/_lib/cpp/lists/count_elements.pxd
 delete mode 100644 python/cudf/cudf/_lib/cpp/lists/explode.pxd
 delete mode 100644 python/cudf/cudf/_lib/cpp/strings/extract.pxd
 delete mode 100644 python/cudf/cudf/_lib/cpp/strings/findall.pxd
 delete mode 100644 python/cudf/cudf/_lib/cpp/strings/strip.pxd
 delete mode 100644 python/cudf/cudf/_lib/cpp/strings/wrap.pxd
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/CMakeLists.txt (100%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/__init__.pxd (100%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/__init__.py (100%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/aggregation.pxd (98%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/aggregation.pyx (100%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/binaryop.pxd (84%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/binaryop.pyx (100%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/column/__init__.pxd (100%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/column/__init__.py (100%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/column/column.pxd (78%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/column/column_factories.pxd (65%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/column/column_view.pxd (95%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/concatenate.pxd (77%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/contiguous_split.pxd (80%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/copying.pxd (91%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/copying.pyx (100%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/datetime.pxd (90%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/expressions.pxd (92%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/filling.pxd (67%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/groupby.pxd (83%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/hash.pxd (86%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/interop.pxd (83%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/io/__init__.pxd (100%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/io/__init__.py (100%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/io/arrow_io_source.pxd (77%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/io/avro.pxd (88%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/io/csv.pxd (97%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/io/data_sink.pxd (100%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/io/datasource.pxd (100%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/io/json.pxd (96%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/io/orc.pxd (97%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/io/orc_metadata.pxd (96%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/io/parquet.pxd (97%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/io/parquet_metadata.pxd (89%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/io/text.pxd (93%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/io/timezone.pxd (75%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/io/types.pxd (91%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/join.pxd (87%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/labeling.pxd (71%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/lists/__init__.pxd (100%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/lists/__init__.py (100%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/lists/combine.pxd (73%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/lists/contains.pxd (63%)
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/libcudf/lists/count_elements.pxd
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/libcudf/lists/explode.pxd
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/lists/extract.pxd (56%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/lists/gather.pxd (57%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/lists/lists_column_view.pxd (69%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/lists/sorting.pxd (51%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/lists/stream_compaction.pxd (52%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/merge.pxd (62%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/null_mask.pxd (76%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/nvtext/__init__.pxd (100%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/nvtext/__init__.py (100%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/nvtext/byte_pair_encode.pxd (67%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/nvtext/edit_distance.pxd (67%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/nvtext/generate_ngrams.pxd (63%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/nvtext/jaccard.pxd (52%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/nvtext/minhash.pxd (63%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/nvtext/ngrams_tokenize.pxd (50%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/nvtext/normalize.pxd (66%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/nvtext/replace.pxd (63%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/nvtext/stemmer.pxd (74%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/nvtext/subword_tokenize.pxd (89%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/nvtext/tokenize.pxd (81%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/partitioning.pxd (64%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/quantiles.pxd (65%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/reduce.pxd (69%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/reduce.pyx (100%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/replace.pxd (85%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/replace.pyx (100%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/reshape.pxd (50%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/rolling.pxd (71%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/round.pxd (71%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/scalar/__init__.pxd (100%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/scalar/__init__.py (100%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/scalar/scalar.pxd (89%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/search.pxd (69%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/sorting.pxd (86%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/stream_compaction.pxd (88%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/stream_compaction.pyx (100%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/strings/__init__.pxd (100%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/strings/__init__.py (100%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/strings/attributes.pxd (68%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/strings/capitalize.pxd (67%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/strings/case.pxd (67%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/strings/char_types.pxd (84%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/strings/combine.pxd (80%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/strings/contains.pxd (64%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/strings/convert/__init__.pxd (100%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/strings/convert/__init__.py (100%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/strings/convert/convert_booleans.pxd (62%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/strings/convert/convert_datetime.pxd (70%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/strings/convert/convert_durations.pxd (65%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/strings/convert/convert_fixed_point.pxd (66%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/strings/convert/convert_floats.pxd (64%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/strings/convert/convert_integers.pxd (75%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/strings/convert/convert_ipv4.pxd (69%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/strings/convert/convert_lists.pxd (53%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/strings/convert/convert_urls.pxd (62%)
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/libcudf/strings/extract.pxd
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/strings/find.pxd (83%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/strings/find_multiple.pxd (58%)
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/libcudf/strings/findall.pxd
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/strings/json.pxd (75%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/strings/padding.pxd (54%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/strings/regex_flags.pxd (100%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/strings/regex_program.pxd (74%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/strings/repeat.pxd (60%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/strings/replace.pxd (68%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/strings/replace_re.pxd (59%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/strings/side_type.pxd (100%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/strings/split/__init__.pxd (100%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/strings/split/__init__.py (100%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/strings/split/partition.pxd (56%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/strings/split/split.pxd (76%)
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/libcudf/strings/strip.pxd
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/strings/substring.pxd (60%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/strings/translate.pxd (69%)
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/libcudf/strings/wrap.pxd
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/strings_udf.pxd (81%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/table/__init__.pxd (100%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/table/__init__.py (100%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/table/table.pxd (62%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/table/table_view.pxd (77%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/transform.pxd (70%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/transpose.pxd (59%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/types.pxd (100%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/types.pyx (100%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/unary.pxd (84%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/unary.pyx (100%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/utilities/__init__.pxd (100%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/utilities/__init__.py (100%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/utilities/host_span.pxd (100%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/wrappers/__init__.pxd (100%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/wrappers/__init__.py (100%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/wrappers/decimals.pxd (82%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/wrappers/durations.pxd (100%)
 rename python/cudf/cudf/_lib/{cpp => pylibcudf/libcudf}/wrappers/timestamps.pxd (100%)

diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt
index 07f334fdc12..5a067e84f56 100644
--- a/python/cudf/cudf/_lib/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/CMakeLists.txt
@@ -68,7 +68,6 @@ target_link_libraries(strings_udf PUBLIC cudf_strings_udf)
 set(targets_using_arrow_headers interop avro csv orc json parquet)
 link_to_pyarrow_headers("${targets_using_arrow_headers}")
 
-add_subdirectory(cpp)
 add_subdirectory(io)
 add_subdirectory(nvtext)
 add_subdirectory(pylibcudf)
diff --git a/python/cudf/cudf/_lib/avro.pyx b/python/cudf/cudf/_lib/avro.pyx
index 0e24b5b7459..ae17a5f1ab6 100644
--- a/python/cudf/cudf/_lib/avro.pyx
+++ b/python/cudf/cudf/_lib/avro.pyx
@@ -1,16 +1,16 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp.string cimport string
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
 
-from cudf._lib.cpp.io.avro cimport (
+from cudf._lib.io.utils cimport make_source_info
+from cudf._lib.pylibcudf.libcudf.io.avro cimport (
     avro_reader_options,
     read_avro as libcudf_read_avro,
 )
-from cudf._lib.cpp.io.types cimport table_with_metadata
-from cudf._lib.cpp.types cimport size_type
-from cudf._lib.io.utils cimport make_source_info
+from cudf._lib.pylibcudf.libcudf.io.types cimport table_with_metadata
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 from cudf._lib.utils cimport data_from_unique_ptr
 
 
diff --git a/python/cudf/cudf/_lib/column.pxd b/python/cudf/cudf/_lib/column.pxd
index 7ffb55a6cc6..437f44af9f0 100644
--- a/python/cudf/cudf/_lib/column.pxd
+++ b/python/cudf/cudf/_lib/column.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from typing import Literal
 
@@ -7,9 +7,12 @@ from libcpp.memory cimport unique_ptr
 
 from rmm._lib.device_buffer cimport device_buffer
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view, mutable_column_view
-from cudf._lib.cpp.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport (
+    column_view,
+    mutable_column_view,
+)
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 
 cdef class Column:
diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx
index 9c48a731cea..f33e121241d 100644
--- a/python/cudf/cudf/_lib/column.pyx
+++ b/python/cudf/cudf/_lib/column.pyx
@@ -43,18 +43,18 @@ from cudf._lib.types import dtype_from_pylibcudf_column
 # from_pylibcudf by instead creating an empty numeric column. We will be able
 # to remove this once column factories are exposed to pylibcudf.
 
-cimport cudf._lib.cpp.copying as cpp_copying
-cimport cudf._lib.cpp.types as libcudf_types
-cimport cudf._lib.cpp.unary as libcudf_unary
-from cudf._lib cimport pylibcudf
-from cudf._lib.cpp.column.column cimport column, column_contents
-from cudf._lib.cpp.column.column_factories cimport (
+cimport cudf._lib.pylibcudf.libcudf.copying as cpp_copying
+cimport cudf._lib.pylibcudf.libcudf.types as libcudf_types
+cimport cudf._lib.pylibcudf.libcudf.unary as libcudf_unary
+from cudf._lib.pylibcudf cimport Column as plc_Column
+from cudf._lib.pylibcudf.libcudf.column.column cimport column, column_contents
+from cudf._lib.pylibcudf.libcudf.column.column_factories cimport (
     make_column_from_scalar as cpp_make_column_from_scalar,
     make_numeric_column,
 )
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.null_mask cimport null_count as cpp_null_count
-from cudf._lib.cpp.scalar.scalar cimport scalar
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.null_mask cimport null_count as cpp_null_count
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
 from cudf._lib.scalar cimport DeviceScalar
 
 
@@ -633,7 +633,7 @@ cdef class Column:
             # TODO: This function call is what requires cimporting pylibcudf.
             # We can remove the cimport once we can directly do
             # pylibcudf.column_factories.make_numeric_column or equivalent.
-            col = pylibcudf.Column.from_libcudf(
+            col = plc_Column.from_libcudf(
                 move(
                     make_numeric_column(
                         new_dtype, col.size(), libcudf_types.mask_state.ALL_NULL
diff --git a/python/cudf/cudf/_lib/copying.pxd b/python/cudf/cudf/_lib/copying.pxd
index 599b9c5a067..8fc7f4e1da0 100644
--- a/python/cudf/cudf/_lib/copying.pxd
+++ b/python/cudf/cudf/_lib/copying.pxd
@@ -1,6 +1,6 @@
-# Copyright (c) 2021-2023, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
-from cudf._lib.cpp.contiguous_split cimport packed_columns
+from cudf._lib.pylibcudf.libcudf.contiguous_split cimport packed_columns
 
 
 cdef class _CPackedColumns:
diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx
index 6a52af520f0..796c70e615c 100644
--- a/python/cudf/cudf/_lib/copying.pyx
+++ b/python/cudf/cudf/_lib/copying.pyx
@@ -26,15 +26,17 @@ from cudf.core.abc import Serializable
 
 from libcpp.memory cimport make_unique
 
-cimport cudf._lib.cpp.contiguous_split as cpp_contiguous_split
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.lists.gather cimport (
+cimport cudf._lib.pylibcudf.libcudf.contiguous_split as cpp_contiguous_split
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.lists.gather cimport (
     segmented_gather as cpp_segmented_gather,
 )
-from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view
-from cudf._lib.cpp.scalar.scalar cimport scalar
-from cudf._lib.cpp.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport (
+    lists_column_view,
+)
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 from cudf._lib.utils cimport columns_from_pylibcudf_table, data_from_table_view
 
 # workaround for https://github.com/cython/cython/issues/3885
diff --git a/python/cudf/cudf/_lib/cpp/lists/count_elements.pxd b/python/cudf/cudf/_lib/cpp/lists/count_elements.pxd
deleted file mode 100644
index 9be38f26237..00000000000
--- a/python/cudf/cudf/_lib/cpp/lists/count_elements.pxd
+++ /dev/null
@@ -1,10 +0,0 @@
-# Copyright (c) 2021, NVIDIA CORPORATION.
-
-from libcpp.memory cimport unique_ptr
-
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view
-
-
-cdef extern from "cudf/lists/count_elements.hpp" namespace "cudf::lists" nogil:
-    cdef unique_ptr[column] count_elements(const lists_column_view) except +
diff --git a/python/cudf/cudf/_lib/cpp/lists/explode.pxd b/python/cudf/cudf/_lib/cpp/lists/explode.pxd
deleted file mode 100644
index c3e15dd203c..00000000000
--- a/python/cudf/cudf/_lib/cpp/lists/explode.pxd
+++ /dev/null
@@ -1,14 +0,0 @@
-# Copyright (c) 2021, NVIDIA CORPORATION.
-
-from libcpp.memory cimport unique_ptr
-
-from cudf._lib.cpp.table.table cimport table
-from cudf._lib.cpp.table.table_view cimport table_view
-from cudf._lib.cpp.types cimport size_type
-
-
-cdef extern from "cudf/lists/explode.hpp" namespace "cudf" nogil:
-    cdef unique_ptr[table] explode_outer(
-        const table_view,
-        size_type explode_column_idx,
-    ) except +
diff --git a/python/cudf/cudf/_lib/cpp/strings/extract.pxd b/python/cudf/cudf/_lib/cpp/strings/extract.pxd
deleted file mode 100644
index 384f0f0ef42..00000000000
--- a/python/cudf/cudf/_lib/cpp/strings/extract.pxd
+++ /dev/null
@@ -1,15 +0,0 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
-
-from libcpp.memory cimport unique_ptr
-
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.strings.regex_program cimport regex_program
-from cudf._lib.cpp.table.table cimport table
-
-
-cdef extern from "cudf/strings/extract.hpp" namespace "cudf::strings" nogil:
-
-    cdef unique_ptr[table] extract(
-        column_view source_strings,
-        regex_program) except +
diff --git a/python/cudf/cudf/_lib/cpp/strings/findall.pxd b/python/cudf/cudf/_lib/cpp/strings/findall.pxd
deleted file mode 100644
index 8c878ada097..00000000000
--- a/python/cudf/cudf/_lib/cpp/strings/findall.pxd
+++ /dev/null
@@ -1,14 +0,0 @@
-# Copyright (c) 2019-2022, NVIDIA CORPORATION.
-
-from libcpp.memory cimport unique_ptr
-
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.strings.regex_program cimport regex_program
-
-
-cdef extern from "cudf/strings/findall.hpp" namespace "cudf::strings" nogil:
-
-    cdef unique_ptr[column] findall(
-        column_view source_strings,
-        regex_program) except +
diff --git a/python/cudf/cudf/_lib/cpp/strings/strip.pxd b/python/cudf/cudf/_lib/cpp/strings/strip.pxd
deleted file mode 100644
index 3a86f80328f..00000000000
--- a/python/cudf/cudf/_lib/cpp/strings/strip.pxd
+++ /dev/null
@@ -1,16 +0,0 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
-
-from libcpp.memory cimport unique_ptr
-
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.scalar.scalar cimport string_scalar
-from cudf._lib.cpp.strings.side_type cimport side_type
-
-
-cdef extern from "cudf/strings/strip.hpp" namespace "cudf::strings" nogil:
-
-    cdef unique_ptr[column] strip(
-        column_view source_strings,
-        side_type stype,
-        string_scalar to_strip) except +
diff --git a/python/cudf/cudf/_lib/cpp/strings/wrap.pxd b/python/cudf/cudf/_lib/cpp/strings/wrap.pxd
deleted file mode 100644
index 62c791799ad..00000000000
--- a/python/cudf/cudf/_lib/cpp/strings/wrap.pxd
+++ /dev/null
@@ -1,14 +0,0 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
-
-from libcpp.memory cimport unique_ptr
-
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.types cimport size_type
-
-
-cdef extern from "cudf/strings/wrap.hpp" namespace "cudf::strings" nogil:
-
-    cdef unique_ptr[column] wrap(
-        column_view source_strings,
-        size_type width) except +
diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx
index b2e4d442bd2..aa771295607 100644
--- a/python/cudf/cudf/_lib/csv.pyx
+++ b/python/cudf/cudf/_lib/csv.pyx
@@ -7,9 +7,9 @@ from libcpp.string cimport string
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
 
-cimport cudf._lib.cpp.types as libcudf_types
-from cudf._lib.cpp.types cimport data_type
+cimport cudf._lib.pylibcudf.libcudf.types as libcudf_types
 from cudf._lib.io.datasource cimport Datasource, NativeFileDatasource
+from cudf._lib.pylibcudf.libcudf.types cimport data_type
 from cudf._lib.types cimport dtype_to_data_type
 
 import numpy as np
@@ -18,7 +18,7 @@ import pandas as pd
 import cudf
 from cudf.core.buffer import acquire_spill_lock
 
-from cudf._lib.cpp.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 import errno
 import os
@@ -29,22 +29,22 @@ from io import BytesIO, StringIO
 from libc.stdint cimport int32_t
 from libcpp cimport bool
 
-from cudf._lib.cpp.io.csv cimport (
+from cudf._lib.io.utils cimport make_sink_info, make_source_info
+from cudf._lib.pylibcudf.libcudf.io.csv cimport (
     csv_reader_options,
     csv_writer_options,
     read_csv as cpp_read_csv,
     write_csv as cpp_write_csv,
 )
-from cudf._lib.cpp.io.data_sink cimport data_sink
-from cudf._lib.cpp.io.types cimport (
+from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink
+from cudf._lib.pylibcudf.libcudf.io.types cimport (
     compression_type,
     quote_style,
     sink_info,
     source_info,
     table_with_metadata,
 )
-from cudf._lib.cpp.table.table_view cimport table_view
-from cudf._lib.io.utils cimport make_sink_info, make_source_info
+from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
 from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table
 
 from pyarrow.lib import NativeFile
diff --git a/python/cudf/cudf/_lib/datetime.pyx b/python/cudf/cudf/_lib/datetime.pyx
index 009a69ea501..b30ef875a7b 100644
--- a/python/cudf/cudf/_lib/datetime.pyx
+++ b/python/cudf/cudf/_lib/datetime.pyx
@@ -7,13 +7,13 @@ from cudf.core.buffer import acquire_spill_lock
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 
-cimport cudf._lib.cpp.datetime as libcudf_datetime
+cimport cudf._lib.pylibcudf.libcudf.datetime as libcudf_datetime
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.filling cimport calendrical_month_sequence
-from cudf._lib.cpp.scalar.scalar cimport scalar
-from cudf._lib.cpp.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.filling cimport calendrical_month_sequence
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 from cudf._lib.scalar cimport DeviceScalar
 
 
diff --git a/python/cudf/cudf/_lib/expressions.pxd b/python/cudf/cudf/_lib/expressions.pxd
index c2ee504c626..4a20c5fc545 100644
--- a/python/cudf/cudf/_lib/expressions.pxd
+++ b/python/cudf/cudf/_lib/expressions.pxd
@@ -3,13 +3,13 @@
 from libc.stdint cimport int32_t, int64_t
 from libcpp.memory cimport unique_ptr
 
-from cudf._lib.cpp.expressions cimport (
+from cudf._lib.pylibcudf.libcudf.expressions cimport (
     column_reference,
     expression,
     literal,
     operation,
 )
-from cudf._lib.cpp.scalar.scalar cimport (
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport (
     numeric_scalar,
     scalar,
     string_scalar,
diff --git a/python/cudf/cudf/_lib/expressions.pyx b/python/cudf/cudf/_lib/expressions.pyx
index a3b07075507..3fb29279ed7 100644
--- a/python/cudf/cudf/_lib/expressions.pyx
+++ b/python/cudf/cudf/_lib/expressions.pyx
@@ -10,9 +10,12 @@ from libcpp.memory cimport make_unique, unique_ptr
 from libcpp.string cimport string
 from libcpp.utility cimport move
 
-from cudf._lib.cpp cimport expressions as libcudf_exp
-from cudf._lib.cpp.types cimport size_type
-from cudf._lib.cpp.wrappers.timestamps cimport timestamp_ms, timestamp_us
+from cudf._lib.pylibcudf.libcudf cimport expressions as libcudf_exp
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.wrappers.timestamps cimport (
+    timestamp_ms,
+    timestamp_us,
+)
 
 # Necessary for proper casting, see below.
 ctypedef int32_t underlying_type_ast_operator
diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx
index d5e97439180..7533ed56647 100644
--- a/python/cudf/cudf/_lib/groupby.pyx
+++ b/python/cudf/cudf/_lib/groupby.pyx
@@ -18,8 +18,8 @@ from cudf._lib.utils cimport columns_from_pylibcudf_table
 
 from cudf._lib.scalar import as_device_scalar
 
-from cudf._lib.cpp.replace cimport replace_policy
-from cudf._lib.cpp.scalar.scalar cimport scalar
+from cudf._lib.pylibcudf.libcudf.replace cimport replace_policy
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
 
 from cudf._lib import pylibcudf
 from cudf._lib.aggregation import make_aggregation
diff --git a/python/cudf/cudf/_lib/hash.pyx b/python/cudf/cudf/_lib/hash.pyx
index 6854cff7763..b8331d5a226 100644
--- a/python/cudf/cudf/_lib/hash.pyx
+++ b/python/cudf/cudf/_lib/hash.pyx
@@ -7,10 +7,10 @@ from libcpp.pair cimport pair
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
 
-cimport cudf._lib.cpp.types as libcudf_types
+cimport cudf._lib.pylibcudf.libcudf.types as libcudf_types
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.hash cimport (
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.hash cimport (
     md5,
     murmurhash3_x86_32,
     sha1,
@@ -20,9 +20,11 @@ from cudf._lib.cpp.hash cimport (
     sha512,
     xxhash_64,
 )
-from cudf._lib.cpp.partitioning cimport hash_partition as cpp_hash_partition
-from cudf._lib.cpp.table.table cimport table
-from cudf._lib.cpp.table.table_view cimport table_view
+from cudf._lib.pylibcudf.libcudf.partitioning cimport (
+    hash_partition as cpp_hash_partition,
+)
+from cudf._lib.pylibcudf.libcudf.table.table cimport table
+from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
 from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns
 
 
diff --git a/python/cudf/cudf/_lib/interop.pyx b/python/cudf/cudf/_lib/interop.pyx
index 0afed1bbd2e..37595b65e65 100644
--- a/python/cudf/cudf/_lib/interop.pyx
+++ b/python/cudf/cudf/_lib/interop.pyx
@@ -6,13 +6,13 @@ from libcpp.utility cimport move
 
 from cudf._lib import pylibcudf
 
-from cudf._lib.cpp.interop cimport (
+from cudf._lib.pylibcudf.libcudf.interop cimport (
     DLManagedTensor,
     from_dlpack as cpp_from_dlpack,
     to_dlpack as cpp_to_dlpack,
 )
-from cudf._lib.cpp.table.table cimport table
-from cudf._lib.cpp.table.table_view cimport table_view
+from cudf._lib.pylibcudf.libcudf.table.table cimport table
+from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
 from cudf._lib.utils cimport (
     columns_from_pylibcudf_table,
     columns_from_unique_ptr,
diff --git a/python/cudf/cudf/_lib/io/datasource.pxd b/python/cudf/cudf/_lib/io/datasource.pxd
index bd5bf0227a5..a0a9c3fa0d4 100644
--- a/python/cudf/cudf/_lib/io/datasource.pxd
+++ b/python/cudf/cudf/_lib/io/datasource.pxd
@@ -1,9 +1,9 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport shared_ptr
 
-from cudf._lib.cpp.io.arrow_io_source cimport arrow_io_source
-from cudf._lib.cpp.io.datasource cimport datasource
+from cudf._lib.pylibcudf.libcudf.io.arrow_io_source cimport arrow_io_source
+from cudf._lib.pylibcudf.libcudf.io.datasource cimport datasource
 
 
 cdef class Datasource:
diff --git a/python/cudf/cudf/_lib/io/datasource.pyx b/python/cudf/cudf/_lib/io/datasource.pyx
index 5cadd58d8d3..aa7fa0efdaf 100644
--- a/python/cudf/cudf/_lib/io/datasource.pyx
+++ b/python/cudf/cudf/_lib/io/datasource.pyx
@@ -1,11 +1,11 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport shared_ptr
 from pyarrow.includes.libarrow cimport CRandomAccessFile
 from pyarrow.lib cimport NativeFile
 
-from cudf._lib.cpp.io.arrow_io_source cimport arrow_io_source
-from cudf._lib.cpp.io.datasource cimport datasource
+from cudf._lib.pylibcudf.libcudf.io.arrow_io_source cimport arrow_io_source
+from cudf._lib.pylibcudf.libcudf.io.datasource cimport datasource
 
 
 cdef class Datasource:
diff --git a/python/cudf/cudf/_lib/io/utils.pxd b/python/cudf/cudf/_lib/io/utils.pxd
index 2c2d52b512b..252d986843a 100644
--- a/python/cudf/cudf/_lib/io/utils.pxd
+++ b/python/cudf/cudf/_lib/io/utils.pxd
@@ -1,11 +1,15 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 from libcpp.vector cimport vector
 
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.io.data_sink cimport data_sink
-from cudf._lib.cpp.io.types cimport column_name_info, sink_info, source_info
+from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink
+from cudf._lib.pylibcudf.libcudf.io.types cimport (
+    column_name_info,
+    sink_info,
+    source_info,
+)
 
 
 cdef source_info make_source_info(list src) except*
diff --git a/python/cudf/cudf/_lib/io/utils.pyx b/python/cudf/cudf/_lib/io/utils.pyx
index ae978d18813..3c14ec46122 100644
--- a/python/cudf/cudf/_lib/io/utils.pyx
+++ b/python/cudf/cudf/_lib/io/utils.pyx
@@ -8,15 +8,15 @@ from libcpp.utility cimport move
 from libcpp.vector cimport vector
 
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.io.data_sink cimport data_sink
-from cudf._lib.cpp.io.datasource cimport datasource
-from cudf._lib.cpp.io.types cimport (
+from cudf._lib.io.datasource cimport Datasource
+from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink
+from cudf._lib.pylibcudf.libcudf.io.datasource cimport datasource
+from cudf._lib.pylibcudf.libcudf.io.types cimport (
     column_name_info,
     host_buffer,
     sink_info,
     source_info,
 )
-from cudf._lib.io.datasource cimport Datasource
 
 import codecs
 import errno
diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx
index cef71ed24a5..283a451dd4a 100644
--- a/python/cudf/cudf/_lib/json.pyx
+++ b/python/cudf/cudf/_lib/json.pyx
@@ -14,30 +14,30 @@ from libcpp.string cimport string
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
 
-cimport cudf._lib.cpp.io.types as cudf_io_types
+cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.io.data_sink cimport data_sink
-from cudf._lib.cpp.io.json cimport (
+from cudf._lib.io.utils cimport (
+    make_sink_info,
+    make_source_info,
+    update_struct_field_names,
+)
+from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink
+from cudf._lib.pylibcudf.libcudf.io.json cimport (
     json_reader_options,
     json_writer_options,
     read_json as libcudf_read_json,
     schema_element,
     write_json as libcudf_write_json,
 )
-from cudf._lib.cpp.io.types cimport (
+from cudf._lib.pylibcudf.libcudf.io.types cimport (
     column_name_info,
     compression_type,
     sink_info,
     table_metadata,
     table_with_metadata,
 )
-from cudf._lib.cpp.table.table_view cimport table_view
-from cudf._lib.cpp.types cimport data_type, size_type
-from cudf._lib.io.utils cimport (
-    make_sink_info,
-    make_source_info,
-    update_struct_field_names,
-)
+from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
+from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type
 from cudf._lib.types cimport dtype_to_data_type
 from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table
 
diff --git a/python/cudf/cudf/_lib/labeling.pyx b/python/cudf/cudf/_lib/labeling.pyx
index 2c2538ab0af..439a727a9ca 100644
--- a/python/cudf/cudf/_lib/labeling.pyx
+++ b/python/cudf/cudf/_lib/labeling.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2022, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
 from cudf.core.buffer import acquire_spill_lock
 
@@ -7,9 +7,12 @@ from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.labeling cimport inclusive, label_bins as cpp_label_bins
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.labeling cimport (
+    inclusive,
+    label_bins as cpp_label_bins,
+)
 
 
 # Note that the parameter input shadows a Python built-in in the local scope,
diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx
index f4d16967300..656d92c1a4b 100644
--- a/python/cudf/cudf/_lib/lists.pyx
+++ b/python/cudf/cudf/_lib/lists.pyx
@@ -7,24 +7,33 @@ from libcpp.memory cimport make_shared, shared_ptr, unique_ptr
 from libcpp.utility cimport move
 
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.lists.combine cimport (
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.lists.combine cimport (
     concatenate_list_elements as cpp_concatenate_list_elements,
     concatenate_null_policy,
     concatenate_rows as cpp_concatenate_rows,
 )
-from cudf._lib.cpp.lists.contains cimport contains, index_of as cpp_index_of
-from cudf._lib.cpp.lists.count_elements cimport (
+from cudf._lib.pylibcudf.libcudf.lists.contains cimport (
+    contains,
+    index_of as cpp_index_of,
+)
+from cudf._lib.pylibcudf.libcudf.lists.count_elements cimport (
     count_elements as cpp_count_elements,
 )
-from cudf._lib.cpp.lists.extract cimport extract_list_element
-from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view
-from cudf._lib.cpp.lists.sorting cimport sort_lists as cpp_sort_lists
-from cudf._lib.cpp.lists.stream_compaction cimport distinct as cpp_distinct
-from cudf._lib.cpp.scalar.scalar cimport scalar
-from cudf._lib.cpp.table.table_view cimport table_view
-from cudf._lib.cpp.types cimport (
+from cudf._lib.pylibcudf.libcudf.lists.extract cimport extract_list_element
+from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport (
+    lists_column_view,
+)
+from cudf._lib.pylibcudf.libcudf.lists.sorting cimport (
+    sort_lists as cpp_sort_lists,
+)
+from cudf._lib.pylibcudf.libcudf.lists.stream_compaction cimport (
+    distinct as cpp_distinct,
+)
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
+from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
+from cudf._lib.pylibcudf.libcudf.types cimport (
     nan_equality,
     null_equality,
     null_order,
diff --git a/python/cudf/cudf/_lib/null_mask.pyx b/python/cudf/cudf/_lib/null_mask.pyx
index 1f98140d9e4..b00deae2270 100644
--- a/python/cudf/cudf/_lib/null_mask.pyx
+++ b/python/cudf/cudf/_lib/null_mask.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from enum import Enum
 
@@ -11,8 +11,8 @@ from libcpp.pair cimport pair
 from libcpp.utility cimport move
 
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.null_mask cimport (
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.null_mask cimport (
     bitmask_allocation_size_bytes as cpp_bitmask_allocation_size_bytes,
     bitmask_and as cpp_bitmask_and,
     bitmask_or as cpp_bitmask_or,
@@ -20,8 +20,8 @@ from cudf._lib.cpp.null_mask cimport (
     create_null_mask as cpp_create_null_mask,
     underlying_type_t_mask_state,
 )
-from cudf._lib.cpp.table.table_view cimport table_view
-from cudf._lib.cpp.types cimport mask_state, size_type
+from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
+from cudf._lib.pylibcudf.libcudf.types cimport mask_state, size_type
 from cudf._lib.utils cimport table_view_from_columns
 
 
diff --git a/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx b/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx
index cfc76afa8a5..d60162d0656 100644
--- a/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx
+++ b/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 
 
 from cudf.core.buffer import acquire_spill_lock
@@ -7,14 +7,14 @@ from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.nvtext.byte_pair_encode cimport (
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.nvtext.byte_pair_encode cimport (
     bpe_merge_pairs as cpp_bpe_merge_pairs,
     byte_pair_encoding as cpp_byte_pair_encoding,
     load_merge_pairs as cpp_load_merge_pairs,
 )
-from cudf._lib.cpp.scalar.scalar cimport string_scalar
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
 from cudf._lib.scalar cimport DeviceScalar
 
 
diff --git a/python/cudf/cudf/_lib/nvtext/edit_distance.pyx b/python/cudf/cudf/_lib/nvtext/edit_distance.pyx
index 984c8e84d7c..514b6610575 100644
--- a/python/cudf/cudf/_lib/nvtext/edit_distance.pyx
+++ b/python/cudf/cudf/_lib/nvtext/edit_distance.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from cudf.core.buffer import acquire_spill_lock
 
@@ -6,9 +6,9 @@ from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.nvtext.edit_distance cimport (
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.nvtext.edit_distance cimport (
     edit_distance as cpp_edit_distance,
     edit_distance_matrix as cpp_edit_distance_matrix,
 )
diff --git a/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx b/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx
index 96b95c8792d..a6b9a1e4f7a 100644
--- a/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx
+++ b/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2023, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
 
 from cudf.core.buffer import acquire_spill_lock
 
@@ -6,15 +6,15 @@ from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.nvtext.generate_ngrams cimport (
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.nvtext.generate_ngrams cimport (
     generate_character_ngrams as cpp_generate_character_ngrams,
     generate_ngrams as cpp_generate_ngrams,
     hash_character_ngrams as cpp_hash_character_ngrams,
 )
-from cudf._lib.cpp.scalar.scalar cimport string_scalar
-from cudf._lib.cpp.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 from cudf._lib.scalar cimport DeviceScalar
 
 
diff --git a/python/cudf/cudf/_lib/nvtext/jaccard.pyx b/python/cudf/cudf/_lib/nvtext/jaccard.pyx
index 9035e743fa5..42fe15d6869 100644
--- a/python/cudf/cudf/_lib/nvtext/jaccard.pyx
+++ b/python/cudf/cudf/_lib/nvtext/jaccard.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 
 from cudf.core.buffer import acquire_spill_lock
 
@@ -6,10 +6,12 @@ from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.nvtext.jaccard cimport jaccard_index as cpp_jaccard_index
-from cudf._lib.cpp.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.nvtext.jaccard cimport (
+    jaccard_index as cpp_jaccard_index,
+)
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 
 @acquire_spill_lock()
diff --git a/python/cudf/cudf/_lib/nvtext/minhash.pyx b/python/cudf/cudf/_lib/nvtext/minhash.pyx
index 6ed5ca834ee..4c92999e190 100644
--- a/python/cudf/cudf/_lib/nvtext/minhash.pyx
+++ b/python/cudf/cudf/_lib/nvtext/minhash.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 
 from cudf.core.buffer import acquire_spill_lock
 
@@ -6,13 +6,13 @@ from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.nvtext.minhash cimport (
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.nvtext.minhash cimport (
     minhash as cpp_minhash,
     minhash64 as cpp_minhash64,
 )
-from cudf._lib.cpp.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 
 @acquire_spill_lock()
diff --git a/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx b/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx
index 3e7911c8ae8..ccd8de8c96f 100644
--- a/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx
+++ b/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2022, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
 
 from cudf.core.buffer import acquire_spill_lock
 
@@ -6,13 +6,13 @@ from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.nvtext.ngrams_tokenize cimport (
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.nvtext.ngrams_tokenize cimport (
     ngrams_tokenize as cpp_ngrams_tokenize,
 )
-from cudf._lib.cpp.scalar.scalar cimport string_scalar
-from cudf._lib.cpp.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 from cudf._lib.scalar cimport DeviceScalar
 
 
diff --git a/python/cudf/cudf/_lib/nvtext/normalize.pyx b/python/cudf/cudf/_lib/nvtext/normalize.pyx
index 80c6ef792ab..9f81f865bb7 100644
--- a/python/cudf/cudf/_lib/nvtext/normalize.pyx
+++ b/python/cudf/cudf/_lib/nvtext/normalize.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2022, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
 
 from cudf.core.buffer import acquire_spill_lock
 
@@ -7,9 +7,9 @@ from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.nvtext.normalize cimport (
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.nvtext.normalize cimport (
     normalize_characters as cpp_normalize_characters,
     normalize_spaces as cpp_normalize_spaces,
 )
diff --git a/python/cudf/cudf/_lib/nvtext/replace.pyx b/python/cudf/cudf/_lib/nvtext/replace.pyx
index 289e5611010..ce2edc58d19 100644
--- a/python/cudf/cudf/_lib/nvtext/replace.pyx
+++ b/python/cudf/cudf/_lib/nvtext/replace.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from cudf.core.buffer import acquire_spill_lock
 
@@ -6,14 +6,14 @@ from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.nvtext.replace cimport (
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.nvtext.replace cimport (
     filter_tokens as cpp_filter_tokens,
     replace_tokens as cpp_replace_tokens,
 )
-from cudf._lib.cpp.scalar.scalar cimport string_scalar
-from cudf._lib.cpp.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 from cudf._lib.scalar cimport DeviceScalar
 
 
diff --git a/python/cudf/cudf/_lib/nvtext/stemmer.pyx b/python/cudf/cudf/_lib/nvtext/stemmer.pyx
index c690aba70de..8f75953ae99 100644
--- a/python/cudf/cudf/_lib/nvtext/stemmer.pyx
+++ b/python/cudf/cudf/_lib/nvtext/stemmer.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from cudf.core.buffer import acquire_spill_lock
 
@@ -8,15 +8,15 @@ from libcpp.utility cimport move
 from enum import IntEnum
 
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.nvtext.stemmer cimport (
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.nvtext.stemmer cimport (
     is_letter as cpp_is_letter,
     letter_type,
     porter_stemmer_measure as cpp_porter_stemmer_measure,
     underlying_type_t_letter_type,
 )
-from cudf._lib.cpp.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 
 class LetterType(IntEnum):
diff --git a/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx b/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx
index bf675a16adc..1112667a087 100644
--- a/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx
+++ b/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libc.stdint cimport uint32_t
 
@@ -10,8 +10,8 @@ from libcpp.string cimport string
 from libcpp.utility cimport move
 
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.nvtext.subword_tokenize cimport (
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.nvtext.subword_tokenize cimport (
     hashed_vocabulary as cpp_hashed_vocabulary,
     load_vocabulary_file as cpp_load_vocabulary_file,
     move as tr_move,
diff --git a/python/cudf/cudf/_lib/nvtext/tokenize.pyx b/python/cudf/cudf/_lib/nvtext/tokenize.pyx
index bee9d6f6c4d..98afd94ab1c 100644
--- a/python/cudf/cudf/_lib/nvtext/tokenize.pyx
+++ b/python/cudf/cudf/_lib/nvtext/tokenize.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2023, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
 
 from cudf.core.buffer import acquire_spill_lock
 
@@ -6,9 +6,9 @@ from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.nvtext.tokenize cimport (
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.nvtext.tokenize cimport (
     character_tokenize as cpp_character_tokenize,
     count_tokens as cpp_count_tokens,
     detokenize as cpp_detokenize,
@@ -17,8 +17,8 @@ from cudf._lib.cpp.nvtext.tokenize cimport (
     tokenize_vocabulary as cpp_tokenize_vocabulary,
     tokenize_with_vocabulary as cpp_tokenize_with_vocabulary,
 )
-from cudf._lib.cpp.scalar.scalar cimport string_scalar
-from cudf._lib.cpp.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 from cudf._lib.scalar cimport DeviceScalar
 
 
diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx
index 918880648bf..d3e6053ef4b 100644
--- a/python/cudf/cudf/_lib/orc.pyx
+++ b/python/cudf/cudf/_lib/orc.pyx
@@ -14,17 +14,23 @@ from libcpp.vector cimport vector
 import datetime
 from collections import OrderedDict
 
-cimport cudf._lib.cpp.lists.lists_column_view as cpp_lists_column_view
+cimport cudf._lib.pylibcudf.libcudf.lists.lists_column_view as cpp_lists_column_view
 
 try:
     import ujson as json
 except ImportError:
     import json
 
-cimport cudf._lib.cpp.io.types as cudf_io_types
+cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.io.data_sink cimport data_sink
-from cudf._lib.cpp.io.orc cimport (
+from cudf._lib.io.datasource cimport NativeFileDatasource
+from cudf._lib.io.utils cimport (
+    make_sink_info,
+    make_source_info,
+    update_column_struct_field_names,
+)
+from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink
+from cudf._lib.pylibcudf.libcudf.io.orc cimport (
     chunked_orc_writer_options,
     orc_chunked_writer,
     orc_reader_options,
@@ -32,7 +38,7 @@ from cudf._lib.cpp.io.orc cimport (
     read_orc as libcudf_read_orc,
     write_orc as libcudf_write_orc,
 )
-from cudf._lib.cpp.io.orc_metadata cimport (
+from cudf._lib.pylibcudf.libcudf.io.orc_metadata cimport (
     binary_statistics,
     bucket_statistics,
     column_statistics,
@@ -47,7 +53,7 @@ from cudf._lib.cpp.io.orc_metadata cimport (
     string_statistics,
     timestamp_statistics,
 )
-from cudf._lib.cpp.io.types cimport (
+from cudf._lib.pylibcudf.libcudf.io.types cimport (
     column_in_metadata,
     compression_type,
     sink_info,
@@ -55,14 +61,8 @@ from cudf._lib.cpp.io.types cimport (
     table_input_metadata,
     table_with_metadata,
 )
-from cudf._lib.cpp.table.table_view cimport table_view
-from cudf._lib.cpp.types cimport data_type, size_type, type_id
-from cudf._lib.io.datasource cimport NativeFileDatasource
-from cudf._lib.io.utils cimport (
-    make_sink_info,
-    make_source_info,
-    update_column_struct_field_names,
-)
+from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
+from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type, type_id
 from cudf._lib.variant cimport get_if as std_get_if, holds_alternative
 
 from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES
diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
index dcfa087a1fa..4a23a58b523 100644
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ b/python/cudf/cudf/_lib/parquet.pyx
@@ -31,12 +31,19 @@ from libcpp.unordered_map cimport unordered_map
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
 
-cimport cudf._lib.cpp.io.data_sink as cudf_io_data_sink
-cimport cudf._lib.cpp.io.types as cudf_io_types
-cimport cudf._lib.cpp.types as cudf_types
+cimport cudf._lib.pylibcudf.libcudf.io.data_sink as cudf_io_data_sink
+cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types
+cimport cudf._lib.pylibcudf.libcudf.types as cudf_types
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.expressions cimport expression
-from cudf._lib.cpp.io.parquet cimport (
+from cudf._lib.expressions cimport Expression
+from cudf._lib.io.datasource cimport NativeFileDatasource
+from cudf._lib.io.utils cimport (
+    make_sinks_info,
+    make_source_info,
+    update_struct_field_names,
+)
+from cudf._lib.pylibcudf.libcudf.expressions cimport expression
+from cudf._lib.pylibcudf.libcudf.io.parquet cimport (
     chunked_parquet_writer_options,
     merge_row_group_metadata as parquet_merge_metadata,
     parquet_chunked_writer as cpp_parquet_chunked_writer,
@@ -46,20 +53,16 @@ from cudf._lib.cpp.io.parquet cimport (
     read_parquet as parquet_reader,
     write_parquet as parquet_writer,
 )
-from cudf._lib.cpp.io.parquet_metadata cimport (
+from cudf._lib.pylibcudf.libcudf.io.parquet_metadata cimport (
     parquet_metadata,
     read_parquet_metadata as parquet_metadata_reader,
 )
-from cudf._lib.cpp.io.types cimport column_in_metadata, table_input_metadata
-from cudf._lib.cpp.table.table_view cimport table_view
-from cudf._lib.cpp.types cimport data_type, size_type
-from cudf._lib.expressions cimport Expression
-from cudf._lib.io.datasource cimport NativeFileDatasource
-from cudf._lib.io.utils cimport (
-    make_sinks_info,
-    make_source_info,
-    update_struct_field_names,
+from cudf._lib.pylibcudf.libcudf.io.types cimport (
+    column_in_metadata,
+    table_input_metadata,
 )
+from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
+from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type
 from cudf._lib.utils cimport table_view_from_table
 
 from pyarrow.lib import NativeFile
diff --git a/python/cudf/cudf/_lib/partitioning.pyx b/python/cudf/cudf/_lib/partitioning.pyx
index 4bf8b32ea7e..708ec4174aa 100644
--- a/python/cudf/cudf/_lib/partitioning.pyx
+++ b/python/cudf/cudf/_lib/partitioning.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from cudf.core.buffer import acquire_spill_lock
 
@@ -8,16 +8,18 @@ from libcpp.utility cimport move
 from libcpp.vector cimport vector
 
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.partitioning cimport partition as cpp_partition
-from cudf._lib.cpp.table.table cimport table
-from cudf._lib.cpp.table.table_view cimport table_view
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.partitioning cimport (
+    partition as cpp_partition,
+)
+from cudf._lib.pylibcudf.libcudf.table.table cimport table
+from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
 from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns
 
 from cudf._lib.reduce import minmax
 from cudf._lib.stream_compaction import distinct_count as cpp_distinct_count
 
-cimport cudf._lib.cpp.types as libcudf_types
+cimport cudf._lib.pylibcudf.libcudf.types as libcudf_types
 
 
 @acquire_spill_lock()
diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
index c2b7cb7ca3d..efc978fc6d0 100644
--- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
@@ -45,4 +45,5 @@ rapids_cython_create_modules(
 )
 link_to_pyarrow_headers(pylibcudf_interop)
 
+add_subdirectory(libcudf)
 add_subdirectory(strings)
diff --git a/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd b/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd
index a9491793b88..8526728656b 100644
--- a/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd
@@ -2,7 +2,7 @@
 
 from libcpp.memory cimport unique_ptr
 
-from cudf._lib.cpp.aggregation cimport (
+from cudf._lib.pylibcudf.libcudf.aggregation cimport (
     Kind as kind_t,
     aggregation,
     correlation_type,
@@ -14,7 +14,7 @@ from cudf._lib.cpp.aggregation cimport (
     rolling_aggregation,
     scan_aggregation,
 )
-from cudf._lib.cpp.types cimport (
+from cudf._lib.pylibcudf.libcudf.types cimport (
     interpolation,
     nan_equality,
     null_equality,
diff --git a/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx b/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx
index fe7daea38bf..672b1ba2221 100644
--- a/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx
@@ -5,7 +5,7 @@ from libcpp.cast cimport dynamic_cast
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 
-from cudf._lib.cpp.aggregation cimport (
+from cudf._lib.pylibcudf.libcudf.aggregation cimport (
     aggregation,
     correlation_type,
     groupby_aggregation,
@@ -39,7 +39,7 @@ from cudf._lib.cpp.aggregation cimport (
     rolling_aggregation,
     scan_aggregation,
 )
-from cudf._lib.cpp.types cimport (
+from cudf._lib.pylibcudf.libcudf.types cimport (
     interpolation,
     nan_equality,
     null_equality,
@@ -49,14 +49,16 @@ from cudf._lib.cpp.types cimport (
     size_type,
 )
 
-from cudf._lib.cpp.aggregation import Kind  # no-cython-lint
-from cudf._lib.cpp.aggregation import \
+from cudf._lib.pylibcudf.libcudf.aggregation import Kind  # no-cython-lint
+from cudf._lib.pylibcudf.libcudf.aggregation import \
     correlation_type as CorrelationType  # no-cython-lint
-from cudf._lib.cpp.aggregation import \
+from cudf._lib.pylibcudf.libcudf.aggregation import \
     rank_method as RankMethod  # no-cython-lint
-from cudf._lib.cpp.aggregation import \
+from cudf._lib.pylibcudf.libcudf.aggregation import \
     rank_percentage as RankPercentage  # no-cython-lint
-from cudf._lib.cpp.aggregation import udf_type as UdfType  # no-cython-lint
+from cudf._lib.pylibcudf.libcudf.aggregation import (  # no-cython-lint
+    udf_type as UdfType,
+)
 
 from .types cimport DataType
 
diff --git a/python/cudf/cudf/_lib/pylibcudf/binaryop.pxd b/python/cudf/cudf/_lib/pylibcudf/binaryop.pxd
index 0aa6aac7b39..9a8c8e49dcf 100644
--- a/python/cudf/cudf/_lib/pylibcudf/binaryop.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/binaryop.pxd
@@ -1,6 +1,6 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from cudf._lib.cpp.binaryop cimport binary_operator
+from cudf._lib.pylibcudf.libcudf.binaryop cimport binary_operator
 
 from .column cimport Column
 from .scalar cimport Scalar
diff --git a/python/cudf/cudf/_lib/pylibcudf/binaryop.pyx b/python/cudf/cudf/_lib/pylibcudf/binaryop.pyx
index 16de7757469..c1d669c3c1c 100644
--- a/python/cudf/cudf/_lib/pylibcudf/binaryop.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/binaryop.pyx
@@ -5,11 +5,11 @@ from cython.operator import dereference
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 
-from cudf._lib.cpp cimport binaryop as cpp_binaryop
-from cudf._lib.cpp.binaryop cimport binary_operator
-from cudf._lib.cpp.column.column cimport column
+from cudf._lib.pylibcudf.libcudf cimport binaryop as cpp_binaryop
+from cudf._lib.pylibcudf.libcudf.binaryop cimport binary_operator
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
 
-from cudf._lib.cpp.binaryop import \
+from cudf._lib.pylibcudf.libcudf.binaryop import \
     binary_operator as BinaryOperator  # no-cython-lint
 
 from .column cimport Column
diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pxd b/python/cudf/cudf/_lib/pylibcudf/column.pxd
index 66ccdb53d1a..e121e856865 100644
--- a/python/cudf/cudf/_lib/pylibcudf/column.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/column.pxd
@@ -3,9 +3,12 @@
 from libcpp.memory cimport unique_ptr
 from libcpp.vector cimport vector
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view, mutable_column_view
-from cudf._lib.cpp.types cimport bitmask_type, size_type
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport (
+    column_view,
+    mutable_column_view,
+)
+from cudf._lib.pylibcudf.libcudf.types cimport bitmask_type, size_type
 
 from .gpumemoryview cimport gpumemoryview
 from .types cimport DataType
diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pyx b/python/cudf/cudf/_lib/pylibcudf/column.pyx
index b9e5e48226d..e726eca154f 100644
--- a/python/cudf/cudf/_lib/pylibcudf/column.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/column.pyx
@@ -6,10 +6,12 @@ from libcpp.utility cimport move
 
 from rmm._lib.device_buffer cimport DeviceBuffer
 
-from cudf._lib.cpp.column.column cimport column, column_contents
-from cudf._lib.cpp.column.column_factories cimport make_column_from_scalar
-from cudf._lib.cpp.scalar.scalar cimport scalar
-from cudf._lib.cpp.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.column.column cimport column, column_contents
+from cudf._lib.pylibcudf.libcudf.column.column_factories cimport (
+    make_column_from_scalar,
+)
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 from .gpumemoryview cimport gpumemoryview
 from .scalar cimport Scalar
diff --git a/python/cudf/cudf/_lib/pylibcudf/concatenate.pyx b/python/cudf/cudf/_lib/pylibcudf/concatenate.pyx
index ce7ef84e20e..5e40f921b2c 100644
--- a/python/cudf/cudf/_lib/pylibcudf/concatenate.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/concatenate.pyx
@@ -4,11 +4,11 @@ from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
 
-from cudf._lib.cpp cimport concatenate as cpp_concatenate
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.table.table cimport table
-from cudf._lib.cpp.table.table_view cimport table_view
+from cudf._lib.pylibcudf.libcudf cimport concatenate as cpp_concatenate
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.table.table cimport table
+from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
 
 from .column cimport Column
 from .table cimport Table
diff --git a/python/cudf/cudf/_lib/pylibcudf/copying.pxd b/python/cudf/cudf/_lib/pylibcudf/copying.pxd
index 0211d122c8e..06543d3ca92 100644
--- a/python/cudf/cudf/_lib/pylibcudf/copying.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/copying.pxd
@@ -2,8 +2,11 @@
 
 from libcpp cimport bool as cbool
 
-from cudf._lib.cpp.copying cimport mask_allocation_policy, out_of_bounds_policy
-from cudf._lib.cpp.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.copying cimport (
+    mask_allocation_policy,
+    out_of_bounds_policy,
+)
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 from .column cimport Column
 from .scalar cimport Scalar
diff --git a/python/cudf/cudf/_lib/pylibcudf/copying.pyx b/python/cudf/cudf/_lib/pylibcudf/copying.pyx
index 125a4ffe65f..2d59deb3864 100644
--- a/python/cudf/cudf/_lib/pylibcudf/copying.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/copying.pyx
@@ -11,18 +11,24 @@ from libcpp.vector cimport vector
 # directly from that. It will make namespacing much cleaner in pylibcudf. What
 # we really want here would be
 # cimport libcudf... libcudf.copying.algo(...)
-from cudf._lib.cpp cimport copying as cpp_copying
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view, mutable_column_view
-from cudf._lib.cpp.copying cimport mask_allocation_policy, out_of_bounds_policy
-from cudf._lib.cpp.scalar.scalar cimport scalar
-from cudf._lib.cpp.table.table cimport table
-from cudf._lib.cpp.table.table_view cimport table_view
-from cudf._lib.cpp.types cimport size_type
-
-from cudf._lib.cpp.copying import \
+from cudf._lib.pylibcudf.libcudf cimport copying as cpp_copying
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport (
+    column_view,
+    mutable_column_view,
+)
+from cudf._lib.pylibcudf.libcudf.copying cimport (
+    mask_allocation_policy,
+    out_of_bounds_policy,
+)
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
+from cudf._lib.pylibcudf.libcudf.table.table cimport table
+from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
+
+from cudf._lib.pylibcudf.libcudf.copying import \
     mask_allocation_policy as MaskAllocationPolicy  # no-cython-lint
-from cudf._lib.cpp.copying import \
+from cudf._lib.pylibcudf.libcudf.copying import \
     out_of_bounds_policy as OutOfBoundsPolicy  # no-cython-lint
 
 from .column cimport Column
diff --git a/python/cudf/cudf/_lib/pylibcudf/filling.pxd b/python/cudf/cudf/_lib/pylibcudf/filling.pxd
index 55dbd7b075f..3560ebf2ea2 100644
--- a/python/cudf/cudf/_lib/pylibcudf/filling.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/filling.pxd
@@ -1,5 +1,5 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
-from cudf._lib.cpp.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 from .column cimport Column
 from .scalar cimport Scalar
diff --git a/python/cudf/cudf/_lib/pylibcudf/filling.pyx b/python/cudf/cudf/_lib/pylibcudf/filling.pyx
index 588ab58a146..05f67681428 100644
--- a/python/cudf/cudf/_lib/pylibcudf/filling.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/filling.pyx
@@ -4,15 +4,15 @@ from cython.operator cimport dereference
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.filling cimport (
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.filling cimport (
     fill as cpp_fill,
     fill_in_place as cpp_fill_in_place,
     repeat as cpp_repeat,
     sequence as cpp_sequence,
 )
-from cudf._lib.cpp.table.table cimport table
-from cudf._lib.cpp.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.table.table cimport table
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 from .column cimport Column
 from .scalar cimport Scalar
diff --git a/python/cudf/cudf/_lib/pylibcudf/groupby.pxd b/python/cudf/cudf/_lib/pylibcudf/groupby.pxd
index f1b7a25d5f9..c6c146b0445 100644
--- a/python/cudf/cudf/_lib/pylibcudf/groupby.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/groupby.pxd
@@ -4,18 +4,18 @@ from libcpp.memory cimport unique_ptr
 from libcpp.pair cimport pair
 from libcpp.vector cimport vector
 
-from cudf._lib.cpp.aggregation cimport (
+from cudf._lib.pylibcudf.libcudf.aggregation cimport (
     aggregation,
     groupby_aggregation,
     groupby_scan_aggregation,
 )
-from cudf._lib.cpp.groupby cimport (
+from cudf._lib.pylibcudf.libcudf.groupby cimport (
     aggregation_request,
     aggregation_result,
     groupby,
     scan_request,
 )
-from cudf._lib.cpp.table.table cimport table
+from cudf._lib.pylibcudf.libcudf.table.table cimport table
 
 from .column cimport Column
 from .table cimport Table
diff --git a/python/cudf/cudf/_lib/pylibcudf/groupby.pyx b/python/cudf/cudf/_lib/pylibcudf/groupby.pyx
index 3b800abf266..46fe61025ce 100644
--- a/python/cudf/cudf/_lib/pylibcudf/groupby.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/groupby.pyx
@@ -7,17 +7,17 @@ from libcpp.pair cimport pair
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
 
-from cudf._lib.cpp.groupby cimport (
+from cudf._lib.pylibcudf.libcudf.groupby cimport (
     aggregation_request,
     aggregation_result,
     groupby,
     groups,
     scan_request,
 )
-from cudf._lib.cpp.replace cimport replace_policy
-from cudf._lib.cpp.scalar.scalar cimport scalar
-from cudf._lib.cpp.table.table cimport table
-from cudf._lib.cpp.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.replace cimport replace_policy
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
+from cudf._lib.pylibcudf.libcudf.table.table cimport table
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 from .aggregation cimport Aggregation
 from .column cimport Column
diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pyx b/python/cudf/cudf/_lib/pylibcudf/interop.pyx
index 8dc41fccc0c..f172080cece 100644
--- a/python/cudf/cudf/_lib/pylibcudf/interop.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/interop.pyx
@@ -11,14 +11,17 @@ from functools import singledispatch
 
 from pyarrow import lib as pa
 
-from cudf._lib.cpp.interop cimport (
+from cudf._lib.pylibcudf.libcudf.interop cimport (
     column_metadata,
     from_arrow as cpp_from_arrow,
     to_arrow as cpp_to_arrow,
 )
-from cudf._lib.cpp.scalar.scalar cimport fixed_point_scalar, scalar
-from cudf._lib.cpp.table.table cimport table
-from cudf._lib.cpp.wrappers.decimals cimport (
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport (
+    fixed_point_scalar,
+    scalar,
+)
+from cudf._lib.pylibcudf.libcudf.table.table cimport table
+from cudf._lib.pylibcudf.libcudf.wrappers.decimals cimport (
     decimal32,
     decimal64,
     decimal128,
diff --git a/python/cudf/cudf/_lib/pylibcudf/join.pxd b/python/cudf/cudf/_lib/pylibcudf/join.pxd
index ff7dec97596..f560eeef06d 100644
--- a/python/cudf/cudf/_lib/pylibcudf/join.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/join.pxd
@@ -1,6 +1,6 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from cudf._lib.cpp.types cimport null_equality
+from cudf._lib.pylibcudf.libcudf.types cimport null_equality
 
 from .column cimport Column
 from .table cimport Table
diff --git a/python/cudf/cudf/_lib/pylibcudf/join.pyx b/python/cudf/cudf/_lib/pylibcudf/join.pyx
index 3710a84e594..cf2a6a8187f 100644
--- a/python/cudf/cudf/_lib/pylibcudf/join.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/join.pyx
@@ -7,9 +7,14 @@ from libcpp.utility cimport move
 
 from rmm._lib.device_buffer cimport device_buffer
 
-from cudf._lib.cpp cimport join as cpp_join
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.types cimport data_type, null_equality, size_type, type_id
+from cudf._lib.pylibcudf.libcudf cimport join as cpp_join
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.types cimport (
+    data_type,
+    null_equality,
+    size_type,
+    type_id,
+)
 
 from .column cimport Column
 from .table cimport Table
diff --git a/python/cudf/cudf/_lib/cpp/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt
similarity index 100%
rename from python/cudf/cudf/_lib/cpp/CMakeLists.txt
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt
diff --git a/python/cudf/cudf/_lib/cpp/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/__init__.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/cpp/__init__.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/__init__.pxd
diff --git a/python/cudf/cudf/_lib/cpp/__init__.py b/python/cudf/cudf/_lib/pylibcudf/libcudf/__init__.py
similarity index 100%
rename from python/cudf/cudf/_lib/cpp/__init__.py
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/__init__.py
diff --git a/python/cudf/cudf/_lib/cpp/aggregation.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/aggregation.pxd
similarity index 98%
rename from python/cudf/cudf/_lib/cpp/aggregation.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/aggregation.pxd
index 91b9d7d024f..e0e01207589 100644
--- a/python/cudf/cudf/_lib/cpp/aggregation.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/aggregation.pxd
@@ -5,7 +5,7 @@ from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 from libcpp.vector cimport vector
 
-from cudf._lib.cpp.types cimport (
+from cudf._lib.pylibcudf.libcudf.types cimport (
     data_type,
     interpolation,
     nan_equality,
diff --git a/python/cudf/cudf/_lib/cpp/aggregation.pyx b/python/cudf/cudf/_lib/pylibcudf/libcudf/aggregation.pyx
similarity index 100%
rename from python/cudf/cudf/_lib/cpp/aggregation.pyx
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/aggregation.pyx
diff --git a/python/cudf/cudf/_lib/cpp/binaryop.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/binaryop.pxd
similarity index 84%
rename from python/cudf/cudf/_lib/cpp/binaryop.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/binaryop.pxd
index 735216e656a..788a94a0bbc 100644
--- a/python/cudf/cudf/_lib/cpp/binaryop.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/binaryop.pxd
@@ -4,10 +4,10 @@ from libc.stdint cimport int32_t
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.scalar.scalar cimport scalar
-from cudf._lib.cpp.types cimport data_type
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
+from cudf._lib.pylibcudf.libcudf.types cimport data_type
 
 
 cdef extern from "cudf/binaryop.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/binaryop.pyx b/python/cudf/cudf/_lib/pylibcudf/libcudf/binaryop.pyx
similarity index 100%
rename from python/cudf/cudf/_lib/cpp/binaryop.pyx
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/binaryop.pyx
diff --git a/python/cudf/cudf/_lib/cpp/column/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/column/__init__.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/cpp/column/__init__.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/column/__init__.pxd
diff --git a/python/cudf/cudf/_lib/cpp/column/__init__.py b/python/cudf/cudf/_lib/pylibcudf/libcudf/column/__init__.py
similarity index 100%
rename from python/cudf/cudf/_lib/cpp/column/__init__.py
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/column/__init__.py
diff --git a/python/cudf/cudf/_lib/cpp/column/column.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/column/column.pxd
similarity index 78%
rename from python/cudf/cudf/_lib/cpp/column/column.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/column/column.pxd
index 136f1d795a9..dd184d31cc6 100644
--- a/python/cudf/cudf/_lib/cpp/column/column.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/column/column.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
@@ -6,8 +6,11 @@ from libcpp.vector cimport vector
 
 from rmm._lib.device_buffer cimport device_buffer
 
-from cudf._lib.cpp.column.column_view cimport column_view, mutable_column_view
-from cudf._lib.cpp.types cimport data_type, size_type
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport (
+    column_view,
+    mutable_column_view,
+)
+from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type
 
 
 cdef extern from "cudf/column/column.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/column/column_factories.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/column/column_factories.pxd
similarity index 65%
rename from python/cudf/cudf/_lib/cpp/column/column_factories.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/column/column_factories.pxd
index 0f22e788bd7..fd22d92cb30 100644
--- a/python/cudf/cudf/_lib/cpp/column/column_factories.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/column/column_factories.pxd
@@ -1,10 +1,10 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.scalar.scalar cimport scalar
-from cudf._lib.cpp.types cimport data_type, mask_state, size_type
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
+from cudf._lib.pylibcudf.libcudf.types cimport data_type, mask_state, size_type
 
 
 cdef extern from "cudf/column/column_factories.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/column/column_view.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/column/column_view.pxd
similarity index 95%
rename from python/cudf/cudf/_lib/cpp/column/column_view.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/column/column_view.pxd
index edd013d9340..c6403babe89 100644
--- a/python/cudf/cudf/_lib/cpp/column/column_view.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/column/column_view.pxd
@@ -1,9 +1,13 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp cimport bool
 from libcpp.vector cimport vector
 
-from cudf._lib.cpp.types cimport bitmask_type, data_type, size_type
+from cudf._lib.pylibcudf.libcudf.types cimport (
+    bitmask_type,
+    data_type,
+    size_type,
+)
 
 
 cdef extern from "cudf/column/column_view.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/concatenate.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/concatenate.pxd
similarity index 77%
rename from python/cudf/cudf/_lib/cpp/concatenate.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/concatenate.pxd
index a64c7426f5e..0c362390ff2 100644
--- a/python/cudf/cudf/_lib/cpp/concatenate.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/concatenate.pxd
@@ -5,9 +5,9 @@ from libcpp.vector cimport vector
 
 from rmm._lib.device_buffer cimport device_buffer
 
-from cudf._lib.cpp.column.column cimport column, column_view
-from cudf._lib.cpp.table.table cimport table, table_view
-from cudf._lib.cpp.utilities.host_span cimport host_span
+from cudf._lib.pylibcudf.libcudf.column.column cimport column, column_view
+from cudf._lib.pylibcudf.libcudf.table.table cimport table, table_view
+from cudf._lib.pylibcudf.libcudf.utilities.host_span cimport host_span
 
 
 cdef extern from "cudf/concatenate.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/contiguous_split.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/contiguous_split.pxd
similarity index 80%
rename from python/cudf/cudf/_lib/cpp/contiguous_split.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/contiguous_split.pxd
index 134e4ed0723..b06feacb016 100644
--- a/python/cudf/cudf/_lib/cpp/contiguous_split.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/contiguous_split.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 
 from libc.stdint cimport uint8_t
 from libcpp.memory cimport unique_ptr
@@ -6,8 +6,8 @@ from libcpp.vector cimport vector
 
 from rmm._lib.device_buffer cimport device_buffer
 
-from cudf._lib.cpp.table.table_view cimport table_view
-from cudf._lib.cpp.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "cudf/contiguous_split.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/copying.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/copying.pxd
similarity index 91%
rename from python/cudf/cudf/_lib/cpp/copying.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/copying.pxd
index 053e2299f22..001489d69bf 100644
--- a/python/cudf/cudf/_lib/cpp/copying.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/copying.pxd
@@ -8,13 +8,16 @@ from libcpp.vector cimport vector
 
 from rmm._lib.device_buffer cimport device_buffer
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view, mutable_column_view
-from cudf._lib.cpp.scalar.scalar cimport scalar
-from cudf._lib.cpp.table.table cimport table
-from cudf._lib.cpp.table.table_view cimport table_view
-from cudf._lib.cpp.types cimport size_type
 from cudf._lib.exception_handler cimport cudf_exception_handler
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport (
+    column_view,
+    mutable_column_view,
+)
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
+from cudf._lib.pylibcudf.libcudf.table.table cimport table
+from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 ctypedef const scalar constscalar
 
diff --git a/python/cudf/cudf/_lib/cpp/copying.pyx b/python/cudf/cudf/_lib/pylibcudf/libcudf/copying.pyx
similarity index 100%
rename from python/cudf/cudf/_lib/cpp/copying.pyx
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/copying.pyx
diff --git a/python/cudf/cudf/_lib/cpp/datetime.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/datetime.pxd
similarity index 90%
rename from python/cudf/cudf/_lib/cpp/datetime.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/datetime.pxd
index d03587745e1..7db77b9c7c5 100644
--- a/python/cudf/cudf/_lib/cpp/datetime.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/datetime.pxd
@@ -1,10 +1,10 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.scalar.scalar cimport scalar
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
 
 
 cdef extern from "cudf/datetime.hpp" namespace "cudf::datetime" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/expressions.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/expressions.pxd
similarity index 92%
rename from python/cudf/cudf/_lib/cpp/expressions.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/expressions.pxd
index 291afbcc62a..279d969db50 100644
--- a/python/cudf/cudf/_lib/cpp/expressions.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/expressions.pxd
@@ -1,16 +1,16 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.scalar.scalar cimport (
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport (
     duration_scalar,
     numeric_scalar,
     timestamp_scalar,
 )
-from cudf._lib.cpp.table.table_view cimport table_view
-from cudf._lib.cpp.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "cudf/ast/expressions.hpp" namespace "cudf::ast" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/filling.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/filling.pxd
similarity index 67%
rename from python/cudf/cudf/_lib/cpp/filling.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/filling.pxd
index e412f294537..16ed682f930 100644
--- a/python/cudf/cudf/_lib/cpp/filling.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/filling.pxd
@@ -1,14 +1,17 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view, mutable_column_view
-from cudf._lib.cpp.scalar.scalar cimport scalar
-from cudf._lib.cpp.table.table cimport table
-from cudf._lib.cpp.table.table_view cimport table_view
-from cudf._lib.cpp.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport (
+    column_view,
+    mutable_column_view,
+)
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
+from cudf._lib.pylibcudf.libcudf.table.table cimport table
+from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "cudf/filling.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/groupby.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/groupby.pxd
similarity index 83%
rename from python/cudf/cudf/_lib/cpp/groupby.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/groupby.pxd
index 8bbefcde0dd..16607cc3711 100644
--- a/python/cudf/cudf/_lib/cpp/groupby.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/groupby.pxd
@@ -6,24 +6,24 @@ from libcpp.memory cimport unique_ptr
 from libcpp.pair cimport pair
 from libcpp.vector cimport vector
 
-from cudf._lib.cpp.aggregation cimport (
+from cudf._lib.pylibcudf.libcudf.aggregation cimport (
     groupby_aggregation,
     groupby_scan_aggregation,
 )
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.replace cimport replace_policy
-from cudf._lib.cpp.scalar.scalar cimport scalar
-from cudf._lib.cpp.table.table cimport table
-from cudf._lib.cpp.table.table_view cimport table_view
-from cudf._lib.cpp.types cimport (
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.replace cimport replace_policy
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
+from cudf._lib.pylibcudf.libcudf.table.table cimport table
+from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
+from cudf._lib.pylibcudf.libcudf.types cimport (
     null_order,
     null_policy,
     order,
     size_type,
     sorted,
 )
-from cudf._lib.cpp.utilities.host_span cimport host_span
+from cudf._lib.pylibcudf.libcudf.utilities.host_span cimport host_span
 
 # workaround for https://github.com/cython/cython/issues/3885
 ctypedef const scalar constscalar
diff --git a/python/cudf/cudf/_lib/cpp/hash.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/hash.pxd
similarity index 86%
rename from python/cudf/cudf/_lib/cpp/hash.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/hash.pxd
index d55e244dc2c..5346252df69 100644
--- a/python/cudf/cudf/_lib/cpp/hash.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/hash.pxd
@@ -4,9 +4,9 @@ from libc.stdint cimport uint32_t, uint64_t
 from libcpp.memory cimport unique_ptr
 from libcpp.vector cimport vector
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.table.table cimport table
-from cudf._lib.cpp.table.table_view cimport table_view
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.table.table cimport table
+from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
 
 
 cdef extern from "cudf/hashing.hpp" namespace "cudf::hashing" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/interop.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/interop.pxd
similarity index 83%
rename from python/cudf/cudf/_lib/cpp/interop.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/interop.pxd
index 88e9d83ee98..471b78505fb 100644
--- a/python/cudf/cudf/_lib/cpp/interop.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/interop.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport shared_ptr, unique_ptr
 from libcpp.string cimport string
@@ -7,9 +7,9 @@ from pyarrow.lib cimport CScalar, CTable
 
 from cudf._lib.types import cudf_to_np_types, np_to_cudf_types
 
-from cudf._lib.cpp.scalar.scalar cimport scalar
-from cudf._lib.cpp.table.table cimport table
-from cudf._lib.cpp.table.table_view cimport table_view
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
+from cudf._lib.pylibcudf.libcudf.table.table cimport table
+from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
 
 
 cdef extern from "dlpack/dlpack.h" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/io/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/__init__.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/cpp/io/__init__.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/io/__init__.pxd
diff --git a/python/cudf/cudf/_lib/cpp/io/__init__.py b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/__init__.py
similarity index 100%
rename from python/cudf/cudf/_lib/cpp/io/__init__.py
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/io/__init__.py
diff --git a/python/cudf/cudf/_lib/cpp/io/arrow_io_source.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/arrow_io_source.pxd
similarity index 77%
rename from python/cudf/cudf/_lib/cpp/io/arrow_io_source.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/io/arrow_io_source.pxd
index 4aef4841844..1d2138f8d10 100644
--- a/python/cudf/cudf/_lib/cpp/io/arrow_io_source.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/arrow_io_source.pxd
@@ -1,10 +1,10 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport shared_ptr
 from libcpp.string cimport string
 from pyarrow.includes.libarrow cimport CRandomAccessFile
 
-cimport cudf._lib.cpp.io.datasource as cudf_io_datasource
+cimport cudf._lib.pylibcudf.libcudf.io.datasource as cudf_io_datasource
 
 
 cdef extern from "cudf/io/arrow_io_source.hpp" \
diff --git a/python/cudf/cudf/_lib/cpp/io/avro.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/avro.pxd
similarity index 88%
rename from python/cudf/cudf/_lib/cpp/io/avro.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/io/avro.pxd
index 9b683e5bce3..530df5aa8f1 100644
--- a/python/cudf/cudf/_lib/cpp/io/avro.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/avro.pxd
@@ -1,10 +1,10 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp.string cimport string
 from libcpp.vector cimport vector
 
-cimport cudf._lib.cpp.io.types as cudf_io_types
-from cudf._lib.cpp.types cimport size_type
+cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "cudf/io/avro.hpp" \
diff --git a/python/cudf/cudf/_lib/cpp/io/csv.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/csv.pxd
similarity index 97%
rename from python/cudf/cudf/_lib/cpp/io/csv.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/io/csv.pxd
index e7c0fec2e3d..754dd37d53f 100644
--- a/python/cudf/cudf/_lib/cpp/io/csv.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/csv.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libc.stdint cimport uint8_t
 from libcpp cimport bool
@@ -7,9 +7,9 @@ from libcpp.memory cimport shared_ptr, unique_ptr
 from libcpp.string cimport string
 from libcpp.vector cimport vector
 
-cimport cudf._lib.cpp.io.types as cudf_io_types
-cimport cudf._lib.cpp.table.table_view as cudf_table_view
-from cudf._lib.cpp.types cimport data_type, size_type
+cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types
+cimport cudf._lib.pylibcudf.libcudf.table.table_view as cudf_table_view
+from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type
 
 
 cdef extern from "cudf/io/csv.hpp" \
diff --git a/python/cudf/cudf/_lib/cpp/io/data_sink.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/data_sink.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/cpp/io/data_sink.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/io/data_sink.pxd
diff --git a/python/cudf/cudf/_lib/cpp/io/datasource.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/datasource.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/cpp/io/datasource.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/io/datasource.pxd
diff --git a/python/cudf/cudf/_lib/cpp/io/json.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd
similarity index 96%
rename from python/cudf/cudf/_lib/cpp/io/json.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd
index 1e1057beede..7e64a4cae29 100644
--- a/python/cudf/cudf/_lib/cpp/io/json.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd
@@ -7,9 +7,9 @@ from libcpp.memory cimport shared_ptr, unique_ptr
 from libcpp.string cimport string
 from libcpp.vector cimport vector
 
-cimport cudf._lib.cpp.io.types as cudf_io_types
-cimport cudf._lib.cpp.table.table_view as cudf_table_view
-from cudf._lib.cpp.types cimport data_type, size_type
+cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types
+cimport cudf._lib.pylibcudf.libcudf.table.table_view as cudf_table_view
+from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type
 
 
 cdef extern from "cudf/io/json.hpp" \
diff --git a/python/cudf/cudf/_lib/cpp/io/orc.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/orc.pxd
similarity index 97%
rename from python/cudf/cudf/_lib/cpp/io/orc.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/io/orc.pxd
index d5bb1726a43..e553515dfdf 100644
--- a/python/cudf/cudf/_lib/cpp/io/orc.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/orc.pxd
@@ -8,9 +8,9 @@ from libcpp.optional cimport optional
 from libcpp.string cimport string
 from libcpp.vector cimport vector
 
-cimport cudf._lib.cpp.io.types as cudf_io_types
-cimport cudf._lib.cpp.table.table_view as cudf_table_view
-from cudf._lib.cpp.types cimport data_type, size_type
+cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types
+cimport cudf._lib.pylibcudf.libcudf.table.table_view as cudf_table_view
+from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type
 
 
 cdef extern from "cudf/io/orc.hpp" \
diff --git a/python/cudf/cudf/_lib/cpp/io/orc_metadata.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/orc_metadata.pxd
similarity index 96%
rename from python/cudf/cudf/_lib/cpp/io/orc_metadata.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/io/orc_metadata.pxd
index aad4f1c6870..a23655b06f8 100644
--- a/python/cudf/cudf/_lib/cpp/io/orc_metadata.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/orc_metadata.pxd
@@ -6,7 +6,7 @@ from libcpp.optional cimport optional
 from libcpp.string cimport string
 from libcpp.vector cimport vector
 
-cimport cudf._lib.cpp.io.types as cudf_io_types
+cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types
 from cudf._lib.variant cimport monostate, variant
 
 
diff --git a/python/cudf/cudf/_lib/cpp/io/parquet.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd
similarity index 97%
rename from python/cudf/cudf/_lib/cpp/io/parquet.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd
index 1680eb43700..b7f3f89f71c 100644
--- a/python/cudf/cudf/_lib/cpp/io/parquet.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd
@@ -9,10 +9,10 @@ from libcpp.optional cimport optional
 from libcpp.string cimport string
 from libcpp.vector cimport vector
 
-cimport cudf._lib.cpp.io.types as cudf_io_types
-cimport cudf._lib.cpp.table.table_view as cudf_table_view
-from cudf._lib.cpp.expressions cimport expression
-from cudf._lib.cpp.types cimport data_type, size_type
+cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types
+cimport cudf._lib.pylibcudf.libcudf.table.table_view as cudf_table_view
+from cudf._lib.pylibcudf.libcudf.expressions cimport expression
+from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type
 
 
 cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/io/parquet_metadata.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet_metadata.pxd
similarity index 89%
rename from python/cudf/cudf/_lib/cpp/io/parquet_metadata.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet_metadata.pxd
index e9def2aea5d..34a299b73ab 100644
--- a/python/cudf/cudf/_lib/cpp/io/parquet_metadata.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet_metadata.pxd
@@ -5,8 +5,8 @@ from libcpp.string cimport string
 from libcpp.unordered_map cimport unordered_map
 from libcpp.vector cimport vector
 
-cimport cudf._lib.cpp.io.types as cudf_io_types
-from cudf._lib.cpp.types cimport size_type
+cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "cudf/io/parquet_metadata.hpp" namespace "cudf::io" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/io/text.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/text.pxd
similarity index 93%
rename from python/cudf/cudf/_lib/cpp/io/text.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/io/text.pxd
index 368b014ea4b..bec223d4079 100644
--- a/python/cudf/cudf/_lib/cpp/io/text.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/text.pxd
@@ -1,11 +1,11 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libc.stdint cimport uint64_t
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 
-from cudf._lib.cpp.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
 
 
 cdef extern from "cudf/io/text/byte_range_info.hpp" \
diff --git a/python/cudf/cudf/_lib/cpp/io/timezone.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/timezone.pxd
similarity index 75%
rename from python/cudf/cudf/_lib/cpp/io/timezone.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/io/timezone.pxd
index 927c2118473..88cb5544dc1 100644
--- a/python/cudf/cudf/_lib/cpp/io/timezone.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/timezone.pxd
@@ -1,11 +1,11 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.optional cimport optional
 from libcpp.string cimport string
 
-from cudf._lib.cpp.table.table cimport table
+from cudf._lib.pylibcudf.libcudf.table.table cimport table
 
 
 cdef extern from "cudf/timezone.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/io/types.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd
similarity index 91%
rename from python/cudf/cudf/_lib/cpp/io/types.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd
index d8cc329b0a0..4725c4e5937 100644
--- a/python/cudf/cudf/_lib/cpp/io/types.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libc.stdint cimport uint8_t
 from libcpp cimport bool
@@ -10,11 +10,11 @@ from libcpp.unordered_map cimport unordered_map
 from libcpp.vector cimport vector
 from pyarrow.includes.libarrow cimport CRandomAccessFile
 
-cimport cudf._lib.cpp.io.data_sink as cudf_io_data_sink
-cimport cudf._lib.cpp.io.datasource as cudf_io_datasource
-cimport cudf._lib.cpp.table.table_view as cudf_table_view
-from cudf._lib.cpp.table.table cimport table
-from cudf._lib.cpp.types cimport size_type
+cimport cudf._lib.pylibcudf.libcudf.io.data_sink as cudf_io_data_sink
+cimport cudf._lib.pylibcudf.libcudf.io.datasource as cudf_io_datasource
+cimport cudf._lib.pylibcudf.libcudf.table.table_view as cudf_table_view
+from cudf._lib.pylibcudf.libcudf.table.table cimport table
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "cudf/io/types.hpp" \
diff --git a/python/cudf/cudf/_lib/cpp/join.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/join.pxd
similarity index 87%
rename from python/cudf/cudf/_lib/cpp/join.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/join.pxd
index 7508052646a..89a30f0f255 100644
--- a/python/cudf/cudf/_lib/cpp/join.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/join.pxd
@@ -7,10 +7,10 @@ from libcpp.vector cimport vector
 
 from rmm._lib.device_uvector cimport device_uvector
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.table.table cimport table
-from cudf._lib.cpp.table.table_view cimport table_view
-from cudf._lib.cpp.types cimport null_equality, size_type
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.table.table cimport table
+from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
+from cudf._lib.pylibcudf.libcudf.types cimport null_equality, size_type
 
 ctypedef unique_ptr[device_uvector[size_type]] gather_map_type
 ctypedef pair[gather_map_type, gather_map_type] gather_map_pair_type
diff --git a/python/cudf/cudf/_lib/cpp/labeling.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/labeling.pxd
similarity index 71%
rename from python/cudf/cudf/_lib/cpp/labeling.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/labeling.pxd
index af9c4bb9a04..54731bf29af 100644
--- a/python/cudf/cudf/_lib/cpp/labeling.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/labeling.pxd
@@ -1,9 +1,9 @@
-# Copyright (c) 2021, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
 
 
 cdef extern from "cudf/labeling/label_bins.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/lists/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/__init__.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/cpp/lists/__init__.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/lists/__init__.pxd
diff --git a/python/cudf/cudf/_lib/cpp/lists/__init__.py b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/__init__.py
similarity index 100%
rename from python/cudf/cudf/_lib/cpp/lists/__init__.py
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/lists/__init__.py
diff --git a/python/cudf/cudf/_lib/cpp/lists/combine.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/combine.pxd
similarity index 73%
rename from python/cudf/cudf/_lib/cpp/lists/combine.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/lists/combine.pxd
index a7ad8e7ba41..728bd840f71 100644
--- a/python/cudf/cudf/_lib/cpp/lists/combine.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/combine.pxd
@@ -1,10 +1,10 @@
-# Copyright (c) 2021, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.table.table_view cimport table_view
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
 
 
 cdef extern from "cudf/lists/combine.hpp" namespace \
diff --git a/python/cudf/cudf/_lib/cpp/lists/contains.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/contains.pxd
similarity index 63%
rename from python/cudf/cudf/_lib/cpp/lists/contains.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/lists/contains.pxd
index e86c73deed2..721679f35c7 100644
--- a/python/cudf/cudf/_lib/cpp/lists/contains.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/contains.pxd
@@ -1,12 +1,14 @@
-# Copyright (c) 2021-2023, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view
-from cudf._lib.cpp.scalar.scalar cimport scalar
 from cudf._lib.exception_handler cimport cudf_exception_handler
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport (
+    lists_column_view,
+)
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
 
 
 cdef extern from "cudf/lists/contains.hpp" namespace "cudf::lists" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/count_elements.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/count_elements.pxd
new file mode 100644
index 00000000000..38bdd4db0bb
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/count_elements.pxd
@@ -0,0 +1,12 @@
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport (
+    lists_column_view,
+)
+
+
+cdef extern from "cudf/lists/count_elements.hpp" namespace "cudf::lists" nogil:
+    cdef unique_ptr[column] count_elements(const lists_column_view) except +
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/explode.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/explode.pxd
new file mode 100644
index 00000000000..622a866f593
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/explode.pxd
@@ -0,0 +1,14 @@
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+
+from cudf._lib.pylibcudf.libcudf.table.table cimport table
+from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
+
+
+cdef extern from "cudf/lists/explode.hpp" namespace "cudf" nogil:
+    cdef unique_ptr[table] explode_outer(
+        const table_view,
+        size_type explode_column_idx,
+    ) except +
diff --git a/python/cudf/cudf/_lib/cpp/lists/extract.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/extract.pxd
similarity index 56%
rename from python/cudf/cudf/_lib/cpp/lists/extract.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/lists/extract.pxd
index 93a886d7268..caa12f41914 100644
--- a/python/cudf/cudf/_lib/cpp/lists/extract.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/extract.pxd
@@ -1,10 +1,12 @@
-# Copyright (c) 2021-2022, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 
-from cudf._lib.cpp.column.column cimport column, column_view
-from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view
-from cudf._lib.cpp.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.column.column cimport column, column_view
+from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport (
+    lists_column_view,
+)
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "cudf/lists/extract.hpp" namespace "cudf::lists" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/lists/gather.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/gather.pxd
similarity index 57%
rename from python/cudf/cudf/_lib/cpp/lists/gather.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/lists/gather.pxd
index ea664eee82e..17b4c1877a6 100644
--- a/python/cudf/cudf/_lib/cpp/lists/gather.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/gather.pxd
@@ -1,9 +1,11 @@
-# Copyright (c) 2021, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport (
+    lists_column_view,
+)
 
 
 cdef extern from "cudf/lists/gather.hpp" namespace "cudf::lists" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/lists/lists_column_view.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/lists_column_view.pxd
similarity index 69%
rename from python/cudf/cudf/_lib/cpp/lists/lists_column_view.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/lists/lists_column_view.pxd
index 793f4b8750d..dbafc415e45 100644
--- a/python/cudf/cudf/_lib/cpp/lists/lists_column_view.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/lists_column_view.pxd
@@ -1,7 +1,10 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
-from cudf._lib.cpp.column.column_view cimport column_view, mutable_column_view
-from cudf._lib.cpp.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport (
+    column_view,
+    mutable_column_view,
+)
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "cudf/lists/lists_column_view.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/lists/sorting.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/sorting.pxd
similarity index 51%
rename from python/cudf/cudf/_lib/cpp/lists/sorting.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/lists/sorting.pxd
index 2115885ed95..145ab41302f 100644
--- a/python/cudf/cudf/_lib/cpp/lists/sorting.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/sorting.pxd
@@ -1,10 +1,12 @@
-# Copyright (c) 2021, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view
-from cudf._lib.cpp.types cimport null_order, order
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport (
+    lists_column_view,
+)
+from cudf._lib.pylibcudf.libcudf.types cimport null_order, order
 
 
 cdef extern from "cudf/lists/sorting.hpp" namespace "cudf::lists" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/lists/stream_compaction.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/stream_compaction.pxd
similarity index 52%
rename from python/cudf/cudf/_lib/cpp/lists/stream_compaction.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/lists/stream_compaction.pxd
index 58c1ab1dcec..22b91df7192 100644
--- a/python/cudf/cudf/_lib/cpp/lists/stream_compaction.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/stream_compaction.pxd
@@ -1,10 +1,12 @@
-# Copyright (c) 2021-2022, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view
-from cudf._lib.cpp.types cimport nan_equality, null_equality
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport (
+    lists_column_view,
+)
+from cudf._lib.pylibcudf.libcudf.types cimport nan_equality, null_equality
 
 
 cdef extern from "cudf/lists/stream_compaction.hpp" \
diff --git a/python/cudf/cudf/_lib/cpp/merge.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/merge.pxd
similarity index 62%
rename from python/cudf/cudf/_lib/cpp/merge.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/merge.pxd
index 32fe14ac479..dacb3dc2d74 100644
--- a/python/cudf/cudf/_lib/cpp/merge.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/merge.pxd
@@ -1,11 +1,11 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 from libcpp.vector cimport vector
 
-cimport cudf._lib.cpp.types as libcudf_types
-from cudf._lib.cpp.table.table cimport table
-from cudf._lib.cpp.table.table_view cimport table_view
+cimport cudf._lib.pylibcudf.libcudf.types as libcudf_types
+from cudf._lib.pylibcudf.libcudf.table.table cimport table
+from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
 
 
 cdef extern from "cudf/merge.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/null_mask.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/null_mask.pxd
similarity index 76%
rename from python/cudf/cudf/_lib/cpp/null_mask.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/null_mask.pxd
index bd0eb684690..0cab404c05f 100644
--- a/python/cudf/cudf/_lib/cpp/null_mask.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/null_mask.pxd
@@ -1,13 +1,17 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libc.stdint cimport int32_t
 from libcpp.pair cimport pair
 
 from rmm._lib.device_buffer cimport device_buffer
 
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.table.table_view cimport table_view
-from cudf._lib.cpp.types cimport bitmask_type, mask_state, size_type
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
+from cudf._lib.pylibcudf.libcudf.types cimport (
+    bitmask_type,
+    mask_state,
+    size_type,
+)
 
 ctypedef int32_t underlying_type_t_mask_state
 
diff --git a/python/cudf/cudf/_lib/cpp/nvtext/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/__init__.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/cpp/nvtext/__init__.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/__init__.pxd
diff --git a/python/cudf/cudf/_lib/cpp/nvtext/__init__.py b/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/__init__.py
similarity index 100%
rename from python/cudf/cudf/_lib/cpp/nvtext/__init__.py
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/__init__.py
diff --git a/python/cudf/cudf/_lib/cpp/nvtext/byte_pair_encode.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/byte_pair_encode.pxd
similarity index 67%
rename from python/cudf/cudf/_lib/cpp/nvtext/byte_pair_encode.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/byte_pair_encode.pxd
index e678e4e84db..033a820d2ef 100644
--- a/python/cudf/cudf/_lib/cpp/nvtext/byte_pair_encode.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/byte_pair_encode.pxd
@@ -1,11 +1,11 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.scalar.scalar cimport string_scalar
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
 
 
 cdef extern from "nvtext/byte_pair_encoding.hpp" namespace "nvtext" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/nvtext/edit_distance.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/edit_distance.pxd
similarity index 67%
rename from python/cudf/cudf/_lib/cpp/nvtext/edit_distance.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/edit_distance.pxd
index 11de596ec8f..ca1f6650a5a 100644
--- a/python/cudf/cudf/_lib/cpp/nvtext/edit_distance.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/edit_distance.pxd
@@ -1,10 +1,10 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
 
 
 cdef extern from "nvtext/edit_distance.hpp" namespace "nvtext" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/nvtext/generate_ngrams.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/generate_ngrams.pxd
similarity index 63%
rename from python/cudf/cudf/_lib/cpp/nvtext/generate_ngrams.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/generate_ngrams.pxd
index 75822054e4a..2034b1c1ee5 100644
--- a/python/cudf/cudf/_lib/cpp/nvtext/generate_ngrams.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/generate_ngrams.pxd
@@ -1,11 +1,11 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.scalar.scalar cimport string_scalar
-from cudf._lib.cpp.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "nvtext/generate_ngrams.hpp" namespace "nvtext" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/nvtext/jaccard.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/jaccard.pxd
similarity index 52%
rename from python/cudf/cudf/_lib/cpp/nvtext/jaccard.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/jaccard.pxd
index a77f95f07ac..789a1a2c35a 100644
--- a/python/cudf/cudf/_lib/cpp/nvtext/jaccard.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/jaccard.pxd
@@ -1,10 +1,10 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "nvtext/jaccard.hpp" namespace "nvtext" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/nvtext/minhash.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/minhash.pxd
similarity index 63%
rename from python/cudf/cudf/_lib/cpp/nvtext/minhash.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/minhash.pxd
index 08b3330953e..fc5577bf3f9 100644
--- a/python/cudf/cudf/_lib/cpp/nvtext/minhash.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/minhash.pxd
@@ -1,10 +1,10 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/nvtext/ngrams_tokenize.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/ngrams_tokenize.pxd
similarity index 50%
rename from python/cudf/cudf/_lib/cpp/nvtext/ngrams_tokenize.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/ngrams_tokenize.pxd
index d716df22546..229f4d8f5a3 100644
--- a/python/cudf/cudf/_lib/cpp/nvtext/ngrams_tokenize.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/ngrams_tokenize.pxd
@@ -1,11 +1,11 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.scalar.scalar cimport string_scalar
-from cudf._lib.cpp.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "nvtext/ngrams_tokenize.hpp" namespace "nvtext" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/nvtext/normalize.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/normalize.pxd
similarity index 66%
rename from python/cudf/cudf/_lib/cpp/nvtext/normalize.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/normalize.pxd
index f012670317a..65c63b089df 100644
--- a/python/cudf/cudf/_lib/cpp/nvtext/normalize.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/normalize.pxd
@@ -1,10 +1,10 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
 
 
 cdef extern from "nvtext/normalize.hpp" namespace "nvtext" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/nvtext/replace.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/replace.pxd
similarity index 63%
rename from python/cudf/cudf/_lib/cpp/nvtext/replace.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/replace.pxd
index c4e5258a710..aaad28d2684 100644
--- a/python/cudf/cudf/_lib/cpp/nvtext/replace.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/replace.pxd
@@ -1,11 +1,11 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.scalar.scalar cimport string_scalar
-from cudf._lib.cpp.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "nvtext/replace.hpp" namespace "nvtext" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/nvtext/stemmer.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/stemmer.pxd
similarity index 74%
rename from python/cudf/cudf/_lib/cpp/nvtext/stemmer.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/stemmer.pxd
index 5a92b45b6dd..040d4c9de63 100644
--- a/python/cudf/cudf/_lib/cpp/nvtext/stemmer.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/stemmer.pxd
@@ -1,11 +1,11 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libc.stdint cimport int32_t
 from libcpp.memory cimport unique_ptr
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "nvtext/stemmer.hpp" namespace "nvtext" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/nvtext/subword_tokenize.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/subword_tokenize.pxd
similarity index 89%
rename from python/cudf/cudf/_lib/cpp/nvtext/subword_tokenize.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/subword_tokenize.pxd
index 226fa613f2c..cce40bcd3f6 100644
--- a/python/cudf/cudf/_lib/cpp/nvtext/subword_tokenize.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/subword_tokenize.pxd
@@ -1,12 +1,12 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libc.stdint cimport uint16_t, uint32_t
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
 
 
 cdef extern from "nvtext/subword_tokenize.hpp" namespace "nvtext" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/nvtext/tokenize.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/tokenize.pxd
similarity index 81%
rename from python/cudf/cudf/_lib/cpp/nvtext/tokenize.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/tokenize.pxd
index 3cc3fd6251a..721a6cabd01 100644
--- a/python/cudf/cudf/_lib/cpp/nvtext/tokenize.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/tokenize.pxd
@@ -1,11 +1,11 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.scalar.scalar cimport string_scalar
-from cudf._lib.cpp.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "nvtext/tokenize.hpp" namespace "nvtext" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/partitioning.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/partitioning.pxd
similarity index 64%
rename from python/cudf/cudf/_lib/cpp/partitioning.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/partitioning.pxd
index 5c58dbcc4ac..babb167d2a0 100644
--- a/python/cudf/cudf/_lib/cpp/partitioning.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/partitioning.pxd
@@ -1,15 +1,15 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libc.stdint cimport uint32_t
 from libcpp.memory cimport unique_ptr
 from libcpp.pair cimport pair
 from libcpp.vector cimport vector
 
-cimport cudf._lib.cpp.types as libcudf_types
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.table.table cimport table
-from cudf._lib.cpp.table.table_view cimport table_view
+cimport cudf._lib.pylibcudf.libcudf.types as libcudf_types
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.table.table cimport table
+from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
 
 
 cdef extern from "cudf/partitioning.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/quantiles.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/quantiles.pxd
similarity index 65%
rename from python/cudf/cudf/_lib/cpp/quantiles.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/quantiles.pxd
index 03fda16856c..32cfec2d4fc 100644
--- a/python/cudf/cudf/_lib/cpp/quantiles.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/quantiles.pxd
@@ -1,14 +1,14 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.vector cimport vector
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.table.table cimport table
-from cudf._lib.cpp.table.table_view cimport table_view
-from cudf._lib.cpp.types cimport (
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.table.table cimport table
+from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
+from cudf._lib.pylibcudf.libcudf.types cimport (
     interpolation,
     null_order,
     order,
diff --git a/python/cudf/cudf/_lib/cpp/reduce.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/reduce.pxd
similarity index 69%
rename from python/cudf/cudf/_lib/cpp/reduce.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/reduce.pxd
index 9c893fe9bcb..3ae1f1a2906 100644
--- a/python/cudf/cudf/_lib/cpp/reduce.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/reduce.pxd
@@ -4,11 +4,14 @@ from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport pair
 
-from cudf._lib.cpp.aggregation cimport reduce_aggregation, scan_aggregation
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.scalar.scalar cimport scalar
-from cudf._lib.cpp.types cimport data_type
+from cudf._lib.pylibcudf.libcudf.aggregation cimport (
+    reduce_aggregation,
+    scan_aggregation,
+)
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
+from cudf._lib.pylibcudf.libcudf.types cimport data_type
 
 
 cdef extern from "cudf/reduction.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/reduce.pyx b/python/cudf/cudf/_lib/pylibcudf/libcudf/reduce.pyx
similarity index 100%
rename from python/cudf/cudf/_lib/cpp/reduce.pyx
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/reduce.pyx
diff --git a/python/cudf/cudf/_lib/cpp/replace.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/replace.pxd
similarity index 85%
rename from python/cudf/cudf/_lib/cpp/replace.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/replace.pxd
index 5d57f01b816..e67efbdaba0 100644
--- a/python/cudf/cudf/_lib/cpp/replace.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/replace.pxd
@@ -5,9 +5,12 @@ from libcpp.memory cimport unique_ptr
 
 from cudf._lib.types import cudf_to_np_types, np_to_cudf_types
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view, mutable_column_view
-from cudf._lib.cpp.scalar.scalar cimport scalar
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport (
+    column_view,
+    mutable_column_view,
+)
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
 
 
 cdef extern from "cudf/replace.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/replace.pyx b/python/cudf/cudf/_lib/pylibcudf/libcudf/replace.pyx
similarity index 100%
rename from python/cudf/cudf/_lib/cpp/replace.pyx
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/replace.pyx
diff --git a/python/cudf/cudf/_lib/cpp/reshape.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/reshape.pxd
similarity index 50%
rename from python/cudf/cudf/_lib/cpp/reshape.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/reshape.pxd
index 5b9d40aa2ad..dfd9a71c3d3 100644
--- a/python/cudf/cudf/_lib/cpp/reshape.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/reshape.pxd
@@ -1,11 +1,11 @@
-# Copyright (c) 2019, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.table.table cimport table
-from cudf._lib.cpp.table.table_view cimport table_view
-from cudf._lib.cpp.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.table.table cimport table
+from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "cudf/reshape.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/rolling.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/rolling.pxd
similarity index 71%
rename from python/cudf/cudf/_lib/cpp/rolling.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/rolling.pxd
index 6b620e3a4c0..d7844f99a73 100644
--- a/python/cudf/cudf/_lib/cpp/rolling.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/rolling.pxd
@@ -4,10 +4,10 @@ from libcpp.memory cimport unique_ptr
 
 from cudf._lib.types import cudf_to_np_types, np_to_cudf_types
 
-from cudf._lib.cpp.aggregation cimport rolling_aggregation
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.aggregation cimport rolling_aggregation
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "cudf/rolling.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/round.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/round.pxd
similarity index 71%
rename from python/cudf/cudf/_lib/cpp/round.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/round.pxd
index 66d76c35d72..06ff42485ea 100644
--- a/python/cudf/cudf/_lib/cpp/round.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/round.pxd
@@ -1,10 +1,10 @@
-# Copyright (c) 2021, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
 from libc.stdint cimport int32_t
 from libcpp.memory cimport unique_ptr
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
 
 
 cdef extern from "cudf/round.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/scalar/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/__init__.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/cpp/scalar/__init__.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/__init__.pxd
diff --git a/python/cudf/cudf/_lib/cpp/scalar/__init__.py b/python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/__init__.py
similarity index 100%
rename from python/cudf/cudf/_lib/cpp/scalar/__init__.py
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/__init__.py
diff --git a/python/cudf/cudf/_lib/cpp/scalar/scalar.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/scalar.pxd
similarity index 89%
rename from python/cudf/cudf/_lib/cpp/scalar/scalar.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/scalar.pxd
index b5e9b0ba06b..662eb90096e 100644
--- a/python/cudf/cudf/_lib/cpp/scalar/scalar.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/scalar.pxd
@@ -1,13 +1,13 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libc.stdint cimport int32_t, int64_t
 from libcpp cimport bool
 from libcpp.string cimport string
 
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.table.table_view cimport table_view
-from cudf._lib.cpp.types cimport data_type
-from cudf._lib.cpp.wrappers.decimals cimport scale_type
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
+from cudf._lib.pylibcudf.libcudf.types cimport data_type
+from cudf._lib.pylibcudf.libcudf.wrappers.decimals cimport scale_type
 
 
 cdef extern from "cudf/scalar/scalar.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/search.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/search.pxd
similarity index 69%
rename from python/cudf/cudf/_lib/cpp/search.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/search.pxd
index 8baef0aa1b9..e2247a1366f 100644
--- a/python/cudf/cudf/_lib/cpp/search.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/search.pxd
@@ -1,12 +1,12 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 from libcpp.vector cimport vector
 
-cimport cudf._lib.cpp.types as libcudf_types
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.table.table_view cimport table_view
+cimport cudf._lib.pylibcudf.libcudf.types as libcudf_types
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
 
 
 cdef extern from "cudf/search.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/sorting.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/sorting.pxd
similarity index 86%
rename from python/cudf/cudf/_lib/cpp/sorting.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/sorting.pxd
index 86dc0f0de95..3d7d3aa2790 100644
--- a/python/cudf/cudf/_lib/cpp/sorting.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/sorting.pxd
@@ -6,12 +6,12 @@ from libcpp.vector cimport vector
 
 from cudf._lib.types import cudf_to_np_types, np_to_cudf_types
 
-cimport cudf._lib.cpp.types as libcudf_types
-from cudf._lib.cpp.aggregation cimport rank_method
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.table.table cimport table
-from cudf._lib.cpp.table.table_view cimport table_view
+cimport cudf._lib.pylibcudf.libcudf.types as libcudf_types
+from cudf._lib.pylibcudf.libcudf.aggregation cimport rank_method
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.table.table cimport table
+from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
 
 
 cdef extern from "cudf/sorting.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/stream_compaction.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/stream_compaction.pxd
similarity index 88%
rename from python/cudf/cudf/_lib/cpp/stream_compaction.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/stream_compaction.pxd
index 55854a9444f..11d803e5b76 100644
--- a/python/cudf/cudf/_lib/cpp/stream_compaction.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/stream_compaction.pxd
@@ -6,11 +6,11 @@ from libcpp.vector cimport vector
 
 from cudf._lib.types import cudf_to_np_types, np_to_cudf_types
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.table.table cimport table
-from cudf._lib.cpp.table.table_view cimport table_view
-from cudf._lib.cpp.types cimport (
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.table.table cimport table
+from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
+from cudf._lib.pylibcudf.libcudf.types cimport (
     nan_equality,
     nan_policy,
     null_equality,
diff --git a/python/cudf/cudf/_lib/cpp/stream_compaction.pyx b/python/cudf/cudf/_lib/pylibcudf/libcudf/stream_compaction.pyx
similarity index 100%
rename from python/cudf/cudf/_lib/cpp/stream_compaction.pyx
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/stream_compaction.pyx
diff --git a/python/cudf/cudf/_lib/cpp/strings/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/__init__.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/cpp/strings/__init__.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/strings/__init__.pxd
diff --git a/python/cudf/cudf/_lib/cpp/strings/__init__.py b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/__init__.py
similarity index 100%
rename from python/cudf/cudf/_lib/cpp/strings/__init__.py
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/strings/__init__.py
diff --git a/python/cudf/cudf/_lib/cpp/strings/attributes.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/attributes.pxd
similarity index 68%
rename from python/cudf/cudf/_lib/cpp/strings/attributes.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/strings/attributes.pxd
index 31133b45b6d..c4d52c83663 100644
--- a/python/cudf/cudf/_lib/cpp/strings/attributes.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/attributes.pxd
@@ -1,9 +1,9 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
 
 
 cdef extern from "cudf/strings/attributes.hpp" namespace "cudf::strings" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/strings/capitalize.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/capitalize.pxd
similarity index 67%
rename from python/cudf/cudf/_lib/cpp/strings/capitalize.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/strings/capitalize.pxd
index d193a8265b1..f95d4f35566 100644
--- a/python/cudf/cudf/_lib/cpp/strings/capitalize.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/capitalize.pxd
@@ -1,8 +1,8 @@
-# Copyright (c) 2020-2021, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 from libcpp.memory cimport unique_ptr
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
 
 
 cdef extern from "cudf/strings/capitalize.hpp" namespace "cudf::strings" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/strings/case.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/case.pxd
similarity index 67%
rename from python/cudf/cudf/_lib/cpp/strings/case.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/strings/case.pxd
index 01cd08c10ff..9ccd2737afe 100644
--- a/python/cudf/cudf/_lib/cpp/strings/case.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/case.pxd
@@ -1,8 +1,8 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 from libcpp.memory cimport unique_ptr
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
 
 
 cdef extern from "cudf/strings/case.hpp" namespace "cudf::strings" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/strings/char_types.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/char_types.pxd
similarity index 84%
rename from python/cudf/cudf/_lib/cpp/strings/char_types.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/strings/char_types.pxd
index ae921c6ead9..408b3687c4a 100644
--- a/python/cudf/cudf/_lib/cpp/strings/char_types.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/char_types.pxd
@@ -1,10 +1,10 @@
-# Copyright (c) 2021, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.scalar.scalar cimport string_scalar
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
 
 
 cdef extern from "cudf/strings/char_types/char_types.hpp" \
diff --git a/python/cudf/cudf/_lib/cpp/strings/combine.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/combine.pxd
similarity index 80%
rename from python/cudf/cudf/_lib/cpp/strings/combine.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/strings/combine.pxd
index 2b10427283f..b05e46af0d6 100644
--- a/python/cudf/cudf/_lib/cpp/strings/combine.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/combine.pxd
@@ -1,11 +1,11 @@
-# Copyright (c) 2020-2021, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.scalar.scalar cimport string_scalar
-from cudf._lib.cpp.table.table_view cimport table_view
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
 
 
 cdef extern from "cudf/strings/combine.hpp" namespace "cudf::strings" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/strings/contains.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/contains.pxd
similarity index 64%
rename from python/cudf/cudf/_lib/cpp/strings/contains.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/strings/contains.pxd
index 94c2fb21fc1..f8ed253ff3c 100644
--- a/python/cudf/cudf/_lib/cpp/strings/contains.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/contains.pxd
@@ -1,11 +1,11 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.scalar.scalar cimport string_scalar
-from cudf._lib.cpp.strings.regex_program cimport regex_program
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program
 
 
 cdef extern from "cudf/strings/contains.hpp" namespace "cudf::strings" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/strings/convert/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/__init__.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/cpp/strings/convert/__init__.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/__init__.pxd
diff --git a/python/cudf/cudf/_lib/cpp/strings/convert/__init__.py b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/__init__.py
similarity index 100%
rename from python/cudf/cudf/_lib/cpp/strings/convert/__init__.py
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/__init__.py
diff --git a/python/cudf/cudf/_lib/cpp/strings/convert/convert_booleans.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_booleans.pxd
similarity index 62%
rename from python/cudf/cudf/_lib/cpp/strings/convert/convert_booleans.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_booleans.pxd
index 96cb43973f1..daac2b5be28 100644
--- a/python/cudf/cudf/_lib/cpp/strings/convert/convert_booleans.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_booleans.pxd
@@ -1,9 +1,9 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 from libcpp.memory cimport unique_ptr
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.scalar.scalar cimport string_scalar
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
 
 
 cdef extern from "cudf/strings/convert/convert_booleans.hpp" namespace \
diff --git a/python/cudf/cudf/_lib/cpp/strings/convert/convert_datetime.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_datetime.pxd
similarity index 70%
rename from python/cudf/cudf/_lib/cpp/strings/convert/convert_datetime.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_datetime.pxd
index 5e7380c1d4e..263cee4fe1e 100644
--- a/python/cudf/cudf/_lib/cpp/strings/convert/convert_datetime.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_datetime.pxd
@@ -1,11 +1,11 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.types cimport data_type
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.types cimport data_type
 
 
 cdef extern from "cudf/strings/convert/convert_datetime.hpp" namespace \
diff --git a/python/cudf/cudf/_lib/cpp/strings/convert/convert_durations.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_durations.pxd
similarity index 65%
rename from python/cudf/cudf/_lib/cpp/strings/convert/convert_durations.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_durations.pxd
index 8c54fd52aa2..af357b9bde4 100644
--- a/python/cudf/cudf/_lib/cpp/strings/convert/convert_durations.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_durations.pxd
@@ -1,11 +1,11 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.types cimport data_type
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.types cimport data_type
 
 
 cdef extern from "cudf/strings/convert/convert_durations.hpp" namespace \
diff --git a/python/cudf/cudf/_lib/cpp/strings/convert/convert_fixed_point.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd
similarity index 66%
rename from python/cudf/cudf/_lib/cpp/strings/convert/convert_fixed_point.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd
index a993c5b17b8..91c1abdb5e4 100644
--- a/python/cudf/cudf/_lib/cpp/strings/convert/convert_fixed_point.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd
@@ -1,10 +1,10 @@
-# Copyright (c) 2021, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.types cimport data_type
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.types cimport data_type
 
 
 cdef extern from "cudf/strings/convert/convert_fixed_point.hpp" namespace \
diff --git a/python/cudf/cudf/_lib/cpp/strings/convert/convert_floats.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_floats.pxd
similarity index 64%
rename from python/cudf/cudf/_lib/cpp/strings/convert/convert_floats.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_floats.pxd
index 6388f43077d..5fbf2be0244 100644
--- a/python/cudf/cudf/_lib/cpp/strings/convert/convert_floats.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_floats.pxd
@@ -1,10 +1,10 @@
-# Copyright (c) 2021, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.types cimport data_type
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.types cimport data_type
 
 
 cdef extern from "cudf/strings/convert/convert_floats.hpp" namespace \
diff --git a/python/cudf/cudf/_lib/cpp/strings/convert/convert_integers.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_integers.pxd
similarity index 75%
rename from python/cudf/cudf/_lib/cpp/strings/convert/convert_integers.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_integers.pxd
index b5443979b81..3d6c59cbfcf 100644
--- a/python/cudf/cudf/_lib/cpp/strings/convert/convert_integers.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_integers.pxd
@@ -1,10 +1,10 @@
-# Copyright (c) 2021, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.types cimport data_type
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.types cimport data_type
 
 
 cdef extern from "cudf/strings/convert/convert_integers.hpp" namespace \
diff --git a/python/cudf/cudf/_lib/cpp/strings/convert/convert_ipv4.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd
similarity index 69%
rename from python/cudf/cudf/_lib/cpp/strings/convert/convert_ipv4.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd
index d6e881caea4..86de956b6b6 100644
--- a/python/cudf/cudf/_lib/cpp/strings/convert/convert_ipv4.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd
@@ -1,9 +1,9 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
 
 
 cdef extern from "cudf/strings/convert/convert_ipv4.hpp" namespace \
diff --git a/python/cudf/cudf/_lib/cpp/strings/convert/convert_lists.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_lists.pxd
similarity index 53%
rename from python/cudf/cudf/_lib/cpp/strings/convert/convert_lists.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_lists.pxd
index 99bb80a813d..aba2dbcca64 100644
--- a/python/cudf/cudf/_lib/cpp/strings/convert/convert_lists.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_lists.pxd
@@ -1,9 +1,9 @@
-# Copyright (c) 2021, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 from libcpp.memory cimport unique_ptr
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.scalar.scalar cimport string_scalar
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
 
 
 cdef extern from "cudf/strings/convert/convert_lists.hpp" namespace \
diff --git a/python/cudf/cudf/_lib/cpp/strings/convert/convert_urls.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_urls.pxd
similarity index 62%
rename from python/cudf/cudf/_lib/cpp/strings/convert/convert_urls.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_urls.pxd
index 5d9991dd610..fb7e0cae6de 100644
--- a/python/cudf/cudf/_lib/cpp/strings/convert/convert_urls.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_urls.pxd
@@ -1,9 +1,9 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
 
 
 cdef extern from "cudf/strings/convert/convert_urls.hpp" namespace \
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/extract.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/extract.pxd
new file mode 100644
index 00000000000..57903ca27de
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/extract.pxd
@@ -0,0 +1,15 @@
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program
+from cudf._lib.pylibcudf.libcudf.table.table cimport table
+
+
+cdef extern from "cudf/strings/extract.hpp" namespace "cudf::strings" nogil:
+
+    cdef unique_ptr[table] extract(
+        column_view source_strings,
+        regex_program) except +
diff --git a/python/cudf/cudf/_lib/cpp/strings/find.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/find.pxd
similarity index 83%
rename from python/cudf/cudf/_lib/cpp/strings/find.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/strings/find.pxd
index dfbdebb9651..04e2ed554ee 100644
--- a/python/cudf/cudf/_lib/cpp/strings/find.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/find.pxd
@@ -3,10 +3,10 @@
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.scalar.scalar cimport string_scalar
-from cudf._lib.cpp.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "cudf/strings/find.hpp" namespace "cudf::strings" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/strings/find_multiple.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/find_multiple.pxd
similarity index 58%
rename from python/cudf/cudf/_lib/cpp/strings/find_multiple.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/strings/find_multiple.pxd
index 27b19728f60..1f1adc8e99f 100644
--- a/python/cudf/cudf/_lib/cpp/strings/find_multiple.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/find_multiple.pxd
@@ -1,9 +1,9 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
 
 
 cdef extern from "cudf/strings/find_multiple.hpp" namespace "cudf::strings" \
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/findall.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/findall.pxd
new file mode 100644
index 00000000000..4bc450b8911
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/findall.pxd
@@ -0,0 +1,14 @@
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program
+
+
+cdef extern from "cudf/strings/findall.hpp" namespace "cudf::strings" nogil:
+
+    cdef unique_ptr[column] findall(
+        column_view source_strings,
+        regex_program) except +
diff --git a/python/cudf/cudf/_lib/cpp/strings/json.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/json.pxd
similarity index 75%
rename from python/cudf/cudf/_lib/cpp/strings/json.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/strings/json.pxd
index eed627c96b5..5926fa1d29f 100644
--- a/python/cudf/cudf/_lib/cpp/strings/json.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/json.pxd
@@ -1,12 +1,12 @@
-# Copyright (c) 2021-2023, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.scalar.scalar cimport scalar, string_scalar
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar, string_scalar
 
 
 cdef extern from "cudf/json/json.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/strings/padding.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/padding.pxd
similarity index 54%
rename from python/cudf/cudf/_lib/cpp/strings/padding.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/strings/padding.pxd
index c3906a5b4c6..26681a1aa00 100644
--- a/python/cudf/cudf/_lib/cpp/strings/padding.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/padding.pxd
@@ -1,13 +1,13 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 from libc.stdint cimport int32_t
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.scalar.scalar cimport string_scalar
-from cudf._lib.cpp.strings.side_type cimport side_type
-from cudf._lib.cpp.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from cudf._lib.pylibcudf.libcudf.strings.side_type cimport side_type
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "cudf/strings/padding.hpp" namespace "cudf::strings" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/strings/regex_flags.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_flags.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/cpp/strings/regex_flags.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_flags.pxd
diff --git a/python/cudf/cudf/_lib/cpp/strings/regex_program.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_program.pxd
similarity index 74%
rename from python/cudf/cudf/_lib/cpp/strings/regex_program.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_program.pxd
index 7818c9c7d01..e92c8bd7737 100644
--- a/python/cudf/cudf/_lib/cpp/strings/regex_program.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_program.pxd
@@ -1,9 +1,9 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 
-from cudf._lib.cpp.strings.regex_flags cimport regex_flags
+from cudf._lib.pylibcudf.libcudf.strings.regex_flags cimport regex_flags
 
 
 cdef extern from "cudf/strings/regex/regex_program.hpp" \
diff --git a/python/cudf/cudf/_lib/cpp/strings/repeat.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/repeat.pxd
similarity index 60%
rename from python/cudf/cudf/_lib/cpp/strings/repeat.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/strings/repeat.pxd
index 2a6754b9a11..9e128529406 100644
--- a/python/cudf/cudf/_lib/cpp/strings/repeat.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/repeat.pxd
@@ -1,10 +1,10 @@
-# Copyright (c) 2021, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "cudf/strings/repeat_strings.hpp" namespace "cudf::strings" \
diff --git a/python/cudf/cudf/_lib/cpp/strings/replace.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/replace.pxd
similarity index 68%
rename from python/cudf/cudf/_lib/cpp/strings/replace.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/strings/replace.pxd
index 2a9c6913bb3..92e142b33fc 100644
--- a/python/cudf/cudf/_lib/cpp/strings/replace.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/replace.pxd
@@ -1,13 +1,13 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libc.stdint cimport int32_t
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.scalar.scalar cimport string_scalar
-from cudf._lib.cpp.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "cudf/strings/replace.hpp" namespace "cudf::strings" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/strings/replace_re.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/replace_re.pxd
similarity index 59%
rename from python/cudf/cudf/_lib/cpp/strings/replace_re.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/strings/replace_re.pxd
index 94f3d0528a5..739505cd51d 100644
--- a/python/cudf/cudf/_lib/cpp/strings/replace_re.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/replace_re.pxd
@@ -1,15 +1,15 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 from libcpp.vector cimport vector
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.scalar.scalar cimport string_scalar
-from cudf._lib.cpp.strings.regex_program cimport regex_program
-from cudf._lib.cpp.table.table cimport table
-from cudf._lib.cpp.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program
+from cudf._lib.pylibcudf.libcudf.table.table cimport table
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "cudf/strings/replace_re.hpp" namespace "cudf::strings" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/strings/side_type.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/side_type.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/cpp/strings/side_type.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/strings/side_type.pxd
diff --git a/python/cudf/cudf/_lib/cpp/strings/split/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/split/__init__.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/cpp/strings/split/__init__.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/strings/split/__init__.pxd
diff --git a/python/cudf/cudf/_lib/cpp/strings/split/__init__.py b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/split/__init__.py
similarity index 100%
rename from python/cudf/cudf/_lib/cpp/strings/split/__init__.py
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/strings/split/__init__.py
diff --git a/python/cudf/cudf/_lib/cpp/strings/split/partition.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/split/partition.pxd
similarity index 56%
rename from python/cudf/cudf/_lib/cpp/strings/split/partition.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/strings/split/partition.pxd
index fb83512e9f0..5119124b3e3 100644
--- a/python/cudf/cudf/_lib/cpp/strings/split/partition.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/split/partition.pxd
@@ -1,12 +1,12 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.scalar.scalar cimport string_scalar
-from cudf._lib.cpp.table.table cimport table
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from cudf._lib.pylibcudf.libcudf.table.table cimport table
 
 
 cdef extern from "cudf/strings/split/partition.hpp" namespace \
diff --git a/python/cudf/cudf/_lib/cpp/strings/split/split.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/split/split.pxd
similarity index 76%
rename from python/cudf/cudf/_lib/cpp/strings/split/split.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/strings/split/split.pxd
index d6207cd5c76..4f75664e47a 100644
--- a/python/cudf/cudf/_lib/cpp/strings/split/split.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/split/split.pxd
@@ -1,14 +1,14 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.scalar.scalar cimport string_scalar
-from cudf._lib.cpp.strings.regex_program cimport regex_program
-from cudf._lib.cpp.table.table cimport table
-from cudf._lib.cpp.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program
+from cudf._lib.pylibcudf.libcudf.table.table cimport table
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "cudf/strings/split/split.hpp" namespace \
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/strip.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/strip.pxd
new file mode 100644
index 00000000000..2d6fd6a9e89
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/strip.pxd
@@ -0,0 +1,16 @@
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from cudf._lib.pylibcudf.libcudf.strings.side_type cimport side_type
+
+
+cdef extern from "cudf/strings/strip.hpp" namespace "cudf::strings" nogil:
+
+    cdef unique_ptr[column] strip(
+        column_view source_strings,
+        side_type stype,
+        string_scalar to_strip) except +
diff --git a/python/cudf/cudf/_lib/cpp/strings/substring.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/substring.pxd
similarity index 60%
rename from python/cudf/cudf/_lib/cpp/strings/substring.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/strings/substring.pxd
index 99ea8c7ff3f..02123cc0807 100644
--- a/python/cudf/cudf/_lib/cpp/strings/substring.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/substring.pxd
@@ -1,11 +1,11 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.scalar.scalar cimport numeric_scalar
-from cudf._lib.cpp.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport numeric_scalar
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "cudf/strings/slice.hpp" namespace "cudf::strings" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/strings/translate.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/translate.pxd
similarity index 69%
rename from python/cudf/cudf/_lib/cpp/strings/translate.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/strings/translate.pxd
index 3239ba314e4..b23ac277216 100644
--- a/python/cudf/cudf/_lib/cpp/strings/translate.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/translate.pxd
@@ -1,14 +1,14 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.pair cimport pair
 from libcpp.vector cimport vector
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.scalar.scalar cimport string_scalar
-from cudf._lib.cpp.types cimport char_utf8
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from cudf._lib.pylibcudf.libcudf.types cimport char_utf8
 
 
 cdef extern from "cudf/strings/translate.hpp" namespace "cudf::strings" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/wrap.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/wrap.pxd
new file mode 100644
index 00000000000..1d92d445634
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/wrap.pxd
@@ -0,0 +1,14 @@
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
+
+
+cdef extern from "cudf/strings/wrap.hpp" namespace "cudf::strings" nogil:
+
+    cdef unique_ptr[column] wrap(
+        column_view source_strings,
+        size_type width) except +
diff --git a/python/cudf/cudf/_lib/cpp/strings_udf.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings_udf.pxd
similarity index 81%
rename from python/cudf/cudf/_lib/cpp/strings_udf.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/strings_udf.pxd
index 7d45bc858f5..b895d5e6925 100644
--- a/python/cudf/cudf/_lib/cpp/strings_udf.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings_udf.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 
 from libc.stdint cimport uint8_t, uint16_t
 from libcpp.memory cimport unique_ptr
@@ -7,9 +7,9 @@ from libcpp.vector cimport vector
 
 from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "cudf/strings/udf/udf_string.hpp" namespace \
diff --git a/python/cudf/cudf/_lib/cpp/table/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/table/__init__.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/cpp/table/__init__.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/table/__init__.pxd
diff --git a/python/cudf/cudf/_lib/cpp/table/__init__.py b/python/cudf/cudf/_lib/pylibcudf/libcudf/table/__init__.py
similarity index 100%
rename from python/cudf/cudf/_lib/cpp/table/__init__.py
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/table/__init__.py
diff --git a/python/cudf/cudf/_lib/cpp/table/table.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/table/table.pxd
similarity index 62%
rename from python/cudf/cudf/_lib/cpp/table/table.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/table/table.pxd
index ac93e3def19..737a1327d45 100644
--- a/python/cudf/cudf/_lib/cpp/table/table.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/table/table.pxd
@@ -1,11 +1,14 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 from libcpp.vector cimport vector
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.table.table_view cimport mutable_table_view, table_view
-from cudf._lib.cpp.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.table.table_view cimport (
+    mutable_table_view,
+    table_view,
+)
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "cudf/table/table.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/table/table_view.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/table/table_view.pxd
similarity index 77%
rename from python/cudf/cudf/_lib/cpp/table/table_view.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/table/table_view.pxd
index 728b6d2be4b..00e1a89c025 100644
--- a/python/cudf/cudf/_lib/cpp/table/table_view.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/table/table_view.pxd
@@ -1,9 +1,12 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp.vector cimport vector
 
-from cudf._lib.cpp.column.column_view cimport column_view, mutable_column_view
-from cudf._lib.cpp.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport (
+    column_view,
+    mutable_column_view,
+)
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "cudf/table/table_view.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/transform.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/transform.pxd
similarity index 70%
rename from python/cudf/cudf/_lib/cpp/transform.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/transform.pxd
index d9de04b676e..b0a978fe5c5 100644
--- a/python/cudf/cudf/_lib/cpp/transform.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/transform.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
@@ -7,12 +7,16 @@ from libcpp.string cimport string
 
 from rmm._lib.device_buffer cimport device_buffer
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.expressions cimport expression
-from cudf._lib.cpp.table.table cimport table
-from cudf._lib.cpp.table.table_view cimport table_view
-from cudf._lib.cpp.types cimport bitmask_type, data_type, size_type
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.expressions cimport expression
+from cudf._lib.pylibcudf.libcudf.table.table cimport table
+from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
+from cudf._lib.pylibcudf.libcudf.types cimport (
+    bitmask_type,
+    data_type,
+    size_type,
+)
 
 
 cdef extern from "cudf/transform.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/transpose.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/transpose.pxd
similarity index 59%
rename from python/cudf/cudf/_lib/cpp/transpose.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/transpose.pxd
index 8cbfb0055bd..5dcb9c165ad 100644
--- a/python/cudf/cudf/_lib/cpp/transpose.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/transpose.pxd
@@ -1,10 +1,10 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 from libcpp.pair cimport pair
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.table.table_view cimport table_view
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
 
 
 cdef extern from "cudf/transpose.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/types.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/types.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/cpp/types.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/types.pxd
diff --git a/python/cudf/cudf/_lib/cpp/types.pyx b/python/cudf/cudf/_lib/pylibcudf/libcudf/types.pyx
similarity index 100%
rename from python/cudf/cudf/_lib/cpp/types.pyx
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/types.pyx
diff --git a/python/cudf/cudf/_lib/cpp/unary.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/unary.pxd
similarity index 84%
rename from python/cudf/cudf/_lib/cpp/unary.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/unary.pxd
index cc07290b6c4..7f8ae2b7617 100644
--- a/python/cudf/cudf/_lib/cpp/unary.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/unary.pxd
@@ -3,9 +3,9 @@
 from libc.stdint cimport int32_t
 from libcpp.memory cimport unique_ptr
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.types cimport data_type
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.types cimport data_type
 
 
 cdef extern from "cudf/unary.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/unary.pyx b/python/cudf/cudf/_lib/pylibcudf/libcudf/unary.pyx
similarity index 100%
rename from python/cudf/cudf/_lib/cpp/unary.pyx
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/unary.pyx
diff --git a/python/cudf/cudf/_lib/cpp/utilities/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/__init__.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/cpp/utilities/__init__.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/__init__.pxd
diff --git a/python/cudf/cudf/_lib/cpp/utilities/__init__.py b/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/__init__.py
similarity index 100%
rename from python/cudf/cudf/_lib/cpp/utilities/__init__.py
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/__init__.py
diff --git a/python/cudf/cudf/_lib/cpp/utilities/host_span.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/host_span.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/cpp/utilities/host_span.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/host_span.pxd
diff --git a/python/cudf/cudf/_lib/cpp/wrappers/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/wrappers/__init__.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/cpp/wrappers/__init__.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/wrappers/__init__.pxd
diff --git a/python/cudf/cudf/_lib/cpp/wrappers/__init__.py b/python/cudf/cudf/_lib/pylibcudf/libcudf/wrappers/__init__.py
similarity index 100%
rename from python/cudf/cudf/_lib/cpp/wrappers/__init__.py
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/wrappers/__init__.py
diff --git a/python/cudf/cudf/_lib/cpp/wrappers/decimals.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/wrappers/decimals.pxd
similarity index 82%
rename from python/cudf/cudf/_lib/cpp/wrappers/decimals.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/wrappers/decimals.pxd
index 858569fd696..09b0c87e4b8 100644
--- a/python/cudf/cudf/_lib/cpp/wrappers/decimals.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/wrappers/decimals.pxd
@@ -1,8 +1,8 @@
-# Copyright (c) 2021-2022, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
 from libc.stdint cimport int32_t, int64_t
 
-from cudf._lib.cpp.types cimport int128
+from cudf._lib.pylibcudf.libcudf.types cimport int128
 
 
 cdef extern from "cudf/fixed_point/fixed_point.hpp" namespace "numeric" nogil:
diff --git a/python/cudf/cudf/_lib/cpp/wrappers/durations.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/wrappers/durations.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/cpp/wrappers/durations.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/wrappers/durations.pxd
diff --git a/python/cudf/cudf/_lib/cpp/wrappers/timestamps.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/wrappers/timestamps.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/cpp/wrappers/timestamps.pxd
rename to python/cudf/cudf/_lib/pylibcudf/libcudf/wrappers/timestamps.pxd
diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pxd b/python/cudf/cudf/_lib/pylibcudf/lists.pxd
index cf96dfcb81e..b780d299977 100644
--- a/python/cudf/cudf/_lib/pylibcudf/lists.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/lists.pxd
@@ -1,6 +1,6 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from cudf._lib.cpp.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 from .table cimport Table
 
diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pyx b/python/cudf/cudf/_lib/pylibcudf/lists.pyx
index faeca56286e..654f39742b6 100644
--- a/python/cudf/cudf/_lib/pylibcudf/lists.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/lists.pyx
@@ -3,9 +3,9 @@
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 
-from cudf._lib.cpp.lists cimport explode as cpp_explode
-from cudf._lib.cpp.table.table cimport table
-from cudf._lib.cpp.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.lists cimport explode as cpp_explode
+from cudf._lib.pylibcudf.libcudf.table.table cimport table
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 from .table cimport Table
 
diff --git a/python/cudf/cudf/_lib/pylibcudf/merge.pyx b/python/cudf/cudf/_lib/pylibcudf/merge.pyx
index 91b2b0ea65b..5aa46c142f6 100644
--- a/python/cudf/cudf/_lib/pylibcudf/merge.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/merge.pyx
@@ -4,10 +4,10 @@ from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
 
-from cudf._lib.cpp cimport merge as cpp_merge
-from cudf._lib.cpp.table.table cimport table
-from cudf._lib.cpp.table.table_view cimport table_view
-from cudf._lib.cpp.types cimport null_order, order, size_type
+from cudf._lib.pylibcudf.libcudf cimport merge as cpp_merge
+from cudf._lib.pylibcudf.libcudf.table.table cimport table
+from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
+from cudf._lib.pylibcudf.libcudf.types cimport null_order, order, size_type
 
 from .table cimport Table
 
diff --git a/python/cudf/cudf/_lib/pylibcudf/reduce.pxd b/python/cudf/cudf/_lib/pylibcudf/reduce.pxd
index a613e877ce2..935efd4acf2 100644
--- a/python/cudf/cudf/_lib/pylibcudf/reduce.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/reduce.pxd
@@ -1,6 +1,6 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from cudf._lib.cpp.reduce cimport scan_type
+from cudf._lib.pylibcudf.libcudf.reduce cimport scan_type
 
 from .aggregation cimport Aggregation
 from .column cimport Column
diff --git a/python/cudf/cudf/_lib/pylibcudf/reduce.pyx b/python/cudf/cudf/_lib/pylibcudf/reduce.pyx
index d12da712fcf..c272f183007 100644
--- a/python/cudf/cudf/_lib/pylibcudf/reduce.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/reduce.pyx
@@ -4,18 +4,22 @@ from cython.operator cimport dereference
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move, pair
 
-from cudf._lib.cpp cimport reduce as cpp_reduce
-from cudf._lib.cpp.aggregation cimport reduce_aggregation, scan_aggregation
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.reduce cimport scan_type
-from cudf._lib.cpp.scalar.scalar cimport scalar
+from cudf._lib.pylibcudf.libcudf cimport reduce as cpp_reduce
+from cudf._lib.pylibcudf.libcudf.aggregation cimport (
+    reduce_aggregation,
+    scan_aggregation,
+)
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.reduce cimport scan_type
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
 
 from .aggregation cimport Aggregation
 from .column cimport Column
 from .scalar cimport Scalar
 from .types cimport DataType
 
-from cudf._lib.cpp.reduce import scan_type as ScanType  # no-cython-lint
+from cudf._lib.pylibcudf.libcudf.reduce import \
+    scan_type as ScanType  # no-cython-lint
 
 
 cpdef Scalar reduce(Column col, Aggregation agg, DataType data_type):
diff --git a/python/cudf/cudf/_lib/pylibcudf/replace.pxd b/python/cudf/cudf/_lib/pylibcudf/replace.pxd
index fc42b985c8e..40484c728db 100644
--- a/python/cudf/cudf/_lib/pylibcudf/replace.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/replace.pxd
@@ -2,7 +2,7 @@
 
 from libcpp cimport bool
 
-from cudf._lib.cpp.replace cimport replace_policy
+from cudf._lib.pylibcudf.libcudf.replace cimport replace_policy
 
 from .column cimport Column
 from .scalar cimport Scalar
diff --git a/python/cudf/cudf/_lib/pylibcudf/replace.pyx b/python/cudf/cudf/_lib/pylibcudf/replace.pyx
index dd3a733ee3a..6e08e8f64a9 100644
--- a/python/cudf/cudf/_lib/pylibcudf/replace.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/replace.pyx
@@ -7,10 +7,10 @@ from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 
-from cudf._lib.cpp cimport replace as cpp_replace
-from cudf._lib.cpp.column.column cimport column
+from cudf._lib.pylibcudf.libcudf cimport replace as cpp_replace
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
 
-from cudf._lib.cpp.replace import \
+from cudf._lib.pylibcudf.libcudf.replace import \
     replace_policy as ReplacePolicy  # no-cython-lint
 
 from .column cimport Column
diff --git a/python/cudf/cudf/_lib/pylibcudf/rolling.pxd b/python/cudf/cudf/_lib/pylibcudf/rolling.pxd
index 88d683c0c35..cdadee68d43 100644
--- a/python/cudf/cudf/_lib/pylibcudf/rolling.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/rolling.pxd
@@ -1,6 +1,6 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from cudf._lib.cpp.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 from .aggregation cimport Aggregation
 from .column cimport Column
diff --git a/python/cudf/cudf/_lib/pylibcudf/rolling.pyx b/python/cudf/cudf/_lib/pylibcudf/rolling.pyx
index 8a1d83911ca..7aa7828a5dd 100644
--- a/python/cudf/cudf/_lib/pylibcudf/rolling.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/rolling.pyx
@@ -4,10 +4,10 @@ from cython.operator cimport dereference
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 
-from cudf._lib.cpp cimport rolling as cpp_rolling
-from cudf._lib.cpp.aggregation cimport rolling_aggregation
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.types cimport size_type
+from cudf._lib.pylibcudf.libcudf cimport rolling as cpp_rolling
+from cudf._lib.pylibcudf.libcudf.aggregation cimport rolling_aggregation
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 from .aggregation cimport Aggregation
 from .column cimport Column
diff --git a/python/cudf/cudf/_lib/pylibcudf/scalar.pxd b/python/cudf/cudf/_lib/pylibcudf/scalar.pxd
index 85744eca902..3de86d93519 100644
--- a/python/cudf/cudf/_lib/pylibcudf/scalar.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/scalar.pxd
@@ -5,7 +5,7 @@ from libcpp.memory cimport unique_ptr
 
 from rmm._lib.memory_resource cimport DeviceMemoryResource
 
-from cudf._lib.cpp.scalar.scalar cimport scalar
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
 
 from .types cimport DataType
 
diff --git a/python/cudf/cudf/_lib/pylibcudf/scalar.pyx b/python/cudf/cudf/_lib/pylibcudf/scalar.pyx
index 4a2d8f393bd..6799c37cea2 100644
--- a/python/cudf/cudf/_lib/pylibcudf/scalar.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/scalar.pyx
@@ -5,7 +5,7 @@ from libcpp.memory cimport unique_ptr
 
 from rmm._lib.memory_resource cimport get_current_device_resource
 
-from cudf._lib.cpp.scalar.scalar cimport scalar
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
 
 from .types cimport DataType
 
diff --git a/python/cudf/cudf/_lib/pylibcudf/search.pyx b/python/cudf/cudf/_lib/pylibcudf/search.pyx
index a186167af13..151a39f204f 100644
--- a/python/cudf/cudf/_lib/pylibcudf/search.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/search.pyx
@@ -4,9 +4,9 @@ from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
 
-from cudf._lib.cpp cimport search as cpp_search
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.types cimport null_order, order
+from cudf._lib.pylibcudf.libcudf cimport search as cpp_search
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.types cimport null_order, order
 
 from .column cimport Column
 from .table cimport Table
diff --git a/python/cudf/cudf/_lib/pylibcudf/sorting.pxd b/python/cudf/cudf/_lib/pylibcudf/sorting.pxd
index 3ed241622c0..a4ea541a03b 100644
--- a/python/cudf/cudf/_lib/pylibcudf/sorting.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/sorting.pxd
@@ -2,8 +2,13 @@
 
 from libcpp cimport bool
 
-from cudf._lib.cpp.aggregation cimport rank_method
-from cudf._lib.cpp.types cimport null_order, null_policy, order, size_type
+from cudf._lib.pylibcudf.libcudf.aggregation cimport rank_method
+from cudf._lib.pylibcudf.libcudf.types cimport (
+    null_order,
+    null_policy,
+    order,
+    size_type,
+)
 
 from .column cimport Column
 from .table cimport Table
diff --git a/python/cudf/cudf/_lib/pylibcudf/sorting.pyx b/python/cudf/cudf/_lib/pylibcudf/sorting.pyx
index 1668a3efc7c..8c5a8e26899 100644
--- a/python/cudf/cudf/_lib/pylibcudf/sorting.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/sorting.pyx
@@ -4,11 +4,11 @@ from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
 
-from cudf._lib.cpp cimport sorting as cpp_sorting
-from cudf._lib.cpp.aggregation cimport rank_method
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.table.table cimport table
-from cudf._lib.cpp.types cimport null_order, null_policy, order
+from cudf._lib.pylibcudf.libcudf cimport sorting as cpp_sorting
+from cudf._lib.pylibcudf.libcudf.aggregation cimport rank_method
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.table.table cimport table
+from cudf._lib.pylibcudf.libcudf.types cimport null_order, null_policy, order
 
 from .column cimport Column
 from .table cimport Table
diff --git a/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pxd b/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pxd
index 29acc21fc05..6f89aaf90e7 100644
--- a/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pxd
@@ -1,7 +1,9 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from cudf._lib.cpp.stream_compaction cimport duplicate_keep_option
-from cudf._lib.cpp.types cimport (
+from cudf._lib.pylibcudf.libcudf.stream_compaction cimport (
+    duplicate_keep_option,
+)
+from cudf._lib.pylibcudf.libcudf.types cimport (
     nan_equality,
     nan_policy,
     null_equality,
diff --git a/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pyx b/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pyx
index af7a85d31bf..43449d3690a 100644
--- a/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pyx
@@ -4,11 +4,15 @@ from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
 
-from cudf._lib.cpp cimport stream_compaction as cpp_stream_compaction
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.stream_compaction cimport duplicate_keep_option
-from cudf._lib.cpp.table.table cimport table
-from cudf._lib.cpp.types cimport (
+from cudf._lib.pylibcudf.libcudf cimport (
+    stream_compaction as cpp_stream_compaction,
+)
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.stream_compaction cimport (
+    duplicate_keep_option,
+)
+from cudf._lib.pylibcudf.libcudf.table.table cimport table
+from cudf._lib.pylibcudf.libcudf.types cimport (
     nan_equality,
     nan_policy,
     null_equality,
@@ -16,7 +20,7 @@ from cudf._lib.cpp.types cimport (
     size_type,
 )
 
-from cudf._lib.cpp.stream_compaction import \
+from cudf._lib.pylibcudf.libcudf.stream_compaction import \
     duplicate_keep_option as DuplicateKeepOption  # no-cython-lint, isort:skip
 
 from .column cimport Column
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/case.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/case.pyx
index 69910fd8c50..3a360fd6b10 100644
--- a/python/cudf/cudf/_lib/pylibcudf/strings/case.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/strings/case.pyx
@@ -3,9 +3,9 @@
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.strings cimport case as cpp_case
 from cudf._lib.pylibcudf.column cimport Column
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.strings cimport case as cpp_case
 
 
 cpdef Column to_lower(Column input):
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/find.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/find.pxd
index 22e933106c7..bb43069f190 100644
--- a/python/cudf/cudf/_lib/pylibcudf/strings/find.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/strings/find.pxd
@@ -1,7 +1,7 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from cudf._lib.cpp.types cimport size_type
 from cudf._lib.pylibcudf.column cimport Column
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 from cudf._lib.pylibcudf.scalar cimport Scalar
 
 ctypedef fused ColumnOrScalar:
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/find.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/find.pyx
index 1d94132a8b3..a0214efd0a1 100644
--- a/python/cudf/cudf/_lib/pylibcudf/strings/find.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/strings/find.pyx
@@ -2,14 +2,14 @@
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.strings cimport find as cpp_find
 from cudf._lib.pylibcudf.column cimport Column
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.strings cimport find as cpp_find
 from cudf._lib.pylibcudf.scalar cimport Scalar
 
 from cython.operator import dereference
 
-from cudf._lib.cpp.scalar.scalar cimport string_scalar
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
 
 
 cpdef Column find(
diff --git a/python/cudf/cudf/_lib/pylibcudf/table.pxd b/python/cudf/cudf/_lib/pylibcudf/table.pxd
index 7467bfccaa8..e476fc770e3 100644
--- a/python/cudf/cudf/_lib/pylibcudf/table.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/table.pxd
@@ -2,8 +2,8 @@
 
 from libcpp.memory cimport unique_ptr
 
-from cudf._lib.cpp.table.table cimport table
-from cudf._lib.cpp.table.table_view cimport table_view
+from cudf._lib.pylibcudf.libcudf.table.table cimport table
+from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
 
 
 cdef class Table:
diff --git a/python/cudf/cudf/_lib/pylibcudf/table.pyx b/python/cudf/cudf/_lib/pylibcudf/table.pyx
index 1fa60ec2b6c..d93ac78721b 100644
--- a/python/cudf/cudf/_lib/pylibcudf/table.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/table.pyx
@@ -5,9 +5,9 @@ from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.table.table cimport table
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.table.table cimport table
 
 from .column cimport Column
 
diff --git a/python/cudf/cudf/_lib/pylibcudf/types.pxd b/python/cudf/cudf/_lib/pylibcudf/types.pxd
index 6c53636d332..e54a259819e 100644
--- a/python/cudf/cudf/_lib/pylibcudf/types.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/types.pxd
@@ -3,7 +3,7 @@
 from libc.stdint cimport int32_t
 from libcpp cimport bool as cbool
 
-from cudf._lib.cpp.types cimport (
+from cudf._lib.pylibcudf.libcudf.types cimport (
     data_type,
     interpolation,
     mask_state,
diff --git a/python/cudf/cudf/_lib/pylibcudf/types.pyx b/python/cudf/cudf/_lib/pylibcudf/types.pyx
index baf92223714..ebe4d66fa20 100644
--- a/python/cudf/cudf/_lib/pylibcudf/types.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/types.pyx
@@ -2,17 +2,17 @@
 
 from libc.stdint cimport int32_t
 
-from cudf._lib.cpp.types cimport data_type, type_id
+from cudf._lib.pylibcudf.libcudf.types cimport data_type, type_id
 
-from cudf._lib.cpp.types import type_id as TypeId  # no-cython-lint, isort:skip
-from cudf._lib.cpp.types import nan_policy as NanPolicy  # no-cython-lint, isort:skip
-from cudf._lib.cpp.types import null_policy as NullPolicy  # no-cython-lint, isort:skip
-from cudf._lib.cpp.types import interpolation as Interpolation  # no-cython-lint, isort:skip
-from cudf._lib.cpp.types import nan_equality as NanEquality  # no-cython-lint, isort:skip
-from cudf._lib.cpp.types import null_equality as NullEquality  # no-cython-lint, isort:skip
-from cudf._lib.cpp.types import null_order as NullOrder  # no-cython-lint, isort:skip
-from cudf._lib.cpp.types import order as Order  # no-cython-lint, isort:skip
-from cudf._lib.cpp.types import sorted as Sorted  # no-cython-lint, isort:skip
+from cudf._lib.pylibcudf.libcudf.types import type_id as TypeId  # no-cython-lint, isort:skip
+from cudf._lib.pylibcudf.libcudf.types import nan_policy as NanPolicy  # no-cython-lint, isort:skip
+from cudf._lib.pylibcudf.libcudf.types import null_policy as NullPolicy  # no-cython-lint, isort:skip
+from cudf._lib.pylibcudf.libcudf.types import interpolation as Interpolation  # no-cython-lint, isort:skip
+from cudf._lib.pylibcudf.libcudf.types import nan_equality as NanEquality  # no-cython-lint, isort:skip
+from cudf._lib.pylibcudf.libcudf.types import null_equality as NullEquality  # no-cython-lint, isort:skip
+from cudf._lib.pylibcudf.libcudf.types import null_order as NullOrder  # no-cython-lint, isort:skip
+from cudf._lib.pylibcudf.libcudf.types import order as Order  # no-cython-lint, isort:skip
+from cudf._lib.pylibcudf.libcudf.types import sorted as Sorted  # no-cython-lint, isort:skip
 
 
 cdef class DataType:
diff --git a/python/cudf/cudf/_lib/pylibcudf/unary.pxd b/python/cudf/cudf/_lib/pylibcudf/unary.pxd
index b4372db4ae2..4aa4543bb80 100644
--- a/python/cudf/cudf/_lib/pylibcudf/unary.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/unary.pxd
@@ -1,6 +1,6 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from cudf._lib.cpp.unary cimport unary_operator
+from cudf._lib.pylibcudf.libcudf.unary cimport unary_operator
 
 from .column cimport Column
 from .types cimport DataType
diff --git a/python/cudf/cudf/_lib/pylibcudf/unary.pyx b/python/cudf/cudf/_lib/pylibcudf/unary.pyx
index 437dd313e85..0879b501a49 100644
--- a/python/cudf/cudf/_lib/pylibcudf/unary.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/unary.pyx
@@ -3,11 +3,11 @@
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 
-from cudf._lib.cpp cimport unary as cpp_unary
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.unary cimport unary_operator
+from cudf._lib.pylibcudf.libcudf cimport unary as cpp_unary
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.unary cimport unary_operator
 
-from cudf._lib.cpp.unary import \
+from cudf._lib.pylibcudf.libcudf.unary import \
     unary_operator as UnaryOperator  # no-cython-lint
 
 from .column cimport Column
diff --git a/python/cudf/cudf/_lib/pylibcudf/utils.pxd b/python/cudf/cudf/_lib/pylibcudf/utils.pxd
index 7efeaaf7e24..77c05086397 100644
--- a/python/cudf/cudf/_lib/pylibcudf/utils.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/utils.pxd
@@ -3,8 +3,8 @@
 from libcpp.functional cimport reference_wrapper
 from libcpp.vector cimport vector
 
-from cudf._lib.cpp.scalar.scalar cimport scalar
-from cudf._lib.cpp.types cimport bitmask_type
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
+from cudf._lib.pylibcudf.libcudf.types cimport bitmask_type
 
 
 cdef void * int_to_void_ptr(Py_ssize_t ptr) nogil
diff --git a/python/cudf/cudf/_lib/pylibcudf/utils.pyx b/python/cudf/cudf/_lib/pylibcudf/utils.pyx
index ea34a87a72a..b4427e8ecff 100644
--- a/python/cudf/cudf/_lib/pylibcudf/utils.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/utils.pyx
@@ -6,8 +6,8 @@ from libc.stdint cimport uintptr_t
 from libcpp.functional cimport reference_wrapper
 from libcpp.vector cimport vector
 
-from cudf._lib.cpp.scalar.scalar cimport scalar
-from cudf._lib.cpp.types cimport bitmask_type
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
+from cudf._lib.pylibcudf.libcudf.types cimport bitmask_type
 
 from .scalar cimport Scalar
 
diff --git a/python/cudf/cudf/_lib/quantiles.pyx b/python/cudf/cudf/_lib/quantiles.pyx
index d3a02fa7cbf..3d20454a7ce 100644
--- a/python/cudf/cudf/_lib/quantiles.pyx
+++ b/python/cudf/cudf/_lib/quantiles.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from cudf.core.buffer import acquire_spill_lock
 
@@ -17,15 +17,20 @@ from cudf._lib.types cimport (
 
 from cudf._lib.types import Interpolation
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.quantiles cimport (
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.quantiles cimport (
     quantile as cpp_quantile,
     quantiles as cpp_quantile_table,
 )
-from cudf._lib.cpp.table.table cimport table
-from cudf._lib.cpp.table.table_view cimport table_view
-from cudf._lib.cpp.types cimport interpolation, null_order, order, sorted
+from cudf._lib.pylibcudf.libcudf.table.table cimport table
+from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
+from cudf._lib.pylibcudf.libcudf.types cimport (
+    interpolation,
+    null_order,
+    order,
+    sorted,
+)
 from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns
 
 
diff --git a/python/cudf/cudf/_lib/reshape.pyx b/python/cudf/cudf/_lib/reshape.pyx
index c237b7b1389..48e386bcf02 100644
--- a/python/cudf/cudf/_lib/reshape.pyx
+++ b/python/cudf/cudf/_lib/reshape.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2022, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 
 from cudf.core.buffer import acquire_spill_lock
 
@@ -6,14 +6,14 @@ from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.reshape cimport (
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.reshape cimport (
     interleave_columns as cpp_interleave_columns,
     tile as cpp_tile,
 )
-from cudf._lib.cpp.table.table cimport table
-from cudf._lib.cpp.table.table_view cimport table_view
-from cudf._lib.cpp.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.table.table cimport table
+from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns
 
 
diff --git a/python/cudf/cudf/_lib/round.pyx b/python/cudf/cudf/_lib/round.pyx
index 7eddb1b8cbd..c1c36dd8854 100644
--- a/python/cudf/cudf/_lib/round.pyx
+++ b/python/cudf/cudf/_lib/round.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2022, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
 from cudf.core.buffer import acquire_spill_lock
 
@@ -6,9 +6,9 @@ from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.round cimport (
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.round cimport (
     round as cpp_round,
     rounding_method as cpp_rounding_method,
 )
diff --git a/python/cudf/cudf/_lib/scalar.pxd b/python/cudf/cudf/_lib/scalar.pxd
index 154ee22e796..b57acbb37f1 100644
--- a/python/cudf/cudf/_lib/scalar.pxd
+++ b/python/cudf/cudf/_lib/scalar.pxd
@@ -5,7 +5,7 @@ from libcpp.memory cimport unique_ptr
 
 from rmm._lib.memory_resource cimport DeviceMemoryResource
 
-from cudf._lib.cpp.scalar.scalar cimport scalar
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
 
 
 cdef class DeviceScalar:
diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx
index aee496e9f1c..e68398498d1 100644
--- a/python/cudf/cudf/_lib/scalar.pyx
+++ b/python/cudf/cudf/_lib/scalar.pyx
@@ -17,27 +17,27 @@ from cudf._lib.types import LIBCUDF_TO_SUPPORTED_NUMPY_TYPES
 from cudf.core.dtypes import ListDtype, StructDtype
 from cudf.core.missing import NA, NaT
 
-cimport cudf._lib.cpp.types as libcudf_types
+cimport cudf._lib.pylibcudf.libcudf.types as libcudf_types
 # We currently need this cimport because some of the implementations here
 # access the c_obj of the scalar, and because we need to be able to call
 # pylibcudf.Scalar.from_libcudf. Both of those are temporarily acceptable until
 # DeviceScalar is phased out entirely from cuDF Cython (at which point
 # cudf.Scalar will be directly backed by pylibcudf.Scalar).
-from cudf._lib cimport pylibcudf
-from cudf._lib.cpp.scalar.scalar cimport (
+from cudf._lib.pylibcudf cimport Scalar as plc_Scalar
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport (
     duration_scalar,
     list_scalar,
     scalar,
     struct_scalar,
     timestamp_scalar,
 )
-from cudf._lib.cpp.wrappers.durations cimport (
+from cudf._lib.pylibcudf.libcudf.wrappers.durations cimport (
     duration_ms,
     duration_ns,
     duration_s,
     duration_us,
 )
-from cudf._lib.cpp.wrappers.timestamps cimport (
+from cudf._lib.pylibcudf.libcudf.wrappers.timestamps cimport (
     timestamp_ms,
     timestamp_ns,
     timestamp_s,
@@ -206,7 +206,7 @@ cdef class DeviceScalar:
         return self._to_host_scalar()
 
     cdef const scalar* get_raw_ptr(self) except *:
-        return (<pylibcudf.Scalar> self.c_value).c_obj.get()
+        return (<plc_Scalar> self.c_value).c_obj.get()
 
     cpdef bool is_valid(self):
         """
@@ -230,7 +230,7 @@ cdef class DeviceScalar:
         """
         cdef DeviceScalar s = DeviceScalar.__new__(DeviceScalar)
         # Note: This line requires pylibcudf to be cimported
-        s.c_value = pylibcudf.Scalar.from_libcudf(move(ptr))
+        s.c_value = plc_Scalar.from_libcudf(move(ptr))
         s._set_dtype(dtype)
         return s
 
@@ -369,11 +369,11 @@ def _create_proxy_nat_scalar(dtype):
         nat = dtype.type('NaT').astype(dtype)
         if dtype.type == np.datetime64:
             _set_datetime64_from_np_scalar(
-                (<pylibcudf.Scalar> result.c_value).c_obj, nat, dtype, True
+                (<plc_Scalar> result.c_value).c_obj, nat, dtype, True
             )
         elif dtype.type == np.timedelta64:
             _set_timedelta64_from_np_scalar(
-                (<pylibcudf.Scalar> result.c_value).c_obj, nat, dtype, True
+                (<plc_Scalar> result.c_value).c_obj, nat, dtype, True
             )
         return result
     else:
diff --git a/python/cudf/cudf/_lib/sort.pyx b/python/cudf/cudf/_lib/sort.pyx
index b2b84c17cf4..ff9565b9a89 100644
--- a/python/cudf/cudf/_lib/sort.pyx
+++ b/python/cudf/cudf/_lib/sort.pyx
@@ -10,11 +10,11 @@ from libcpp.utility cimport move
 from libcpp.vector cimport vector
 
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.aggregation cimport rank_method
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.search cimport lower_bound, upper_bound
-from cudf._lib.cpp.table.table_view cimport table_view
-from cudf._lib.cpp.types cimport null_order, order as cpp_order
+from cudf._lib.pylibcudf.libcudf.aggregation cimport rank_method
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.search cimport lower_bound, upper_bound
+from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
+from cudf._lib.pylibcudf.libcudf.types cimport null_order, order as cpp_order
 from cudf._lib.utils cimport (
     columns_from_pylibcudf_table,
     table_view_from_columns,
diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx
index 3826e71f850..dfad7fd101c 100644
--- a/python/cudf/cudf/_lib/string_casting.pyx
+++ b/python/cudf/cudf/_lib/string_casting.pyx
@@ -12,39 +12,39 @@ from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 from libcpp.utility cimport move
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.scalar.scalar cimport string_scalar
-from cudf._lib.cpp.strings.convert.convert_booleans cimport (
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from cudf._lib.pylibcudf.libcudf.strings.convert.convert_booleans cimport (
     from_booleans as cpp_from_booleans,
     to_booleans as cpp_to_booleans,
 )
-from cudf._lib.cpp.strings.convert.convert_datetime cimport (
+from cudf._lib.pylibcudf.libcudf.strings.convert.convert_datetime cimport (
     from_timestamps as cpp_from_timestamps,
     is_timestamp as cpp_is_timestamp,
     to_timestamps as cpp_to_timestamps,
 )
-from cudf._lib.cpp.strings.convert.convert_durations cimport (
+from cudf._lib.pylibcudf.libcudf.strings.convert.convert_durations cimport (
     from_durations as cpp_from_durations,
     to_durations as cpp_to_durations,
 )
-from cudf._lib.cpp.strings.convert.convert_floats cimport (
+from cudf._lib.pylibcudf.libcudf.strings.convert.convert_floats cimport (
     from_floats as cpp_from_floats,
     to_floats as cpp_to_floats,
 )
-from cudf._lib.cpp.strings.convert.convert_integers cimport (
+from cudf._lib.pylibcudf.libcudf.strings.convert.convert_integers cimport (
     from_integers as cpp_from_integers,
     hex_to_integers as cpp_hex_to_integers,
     integers_to_hex as cpp_integers_to_hex,
     is_hex as cpp_is_hex,
     to_integers as cpp_to_integers,
 )
-from cudf._lib.cpp.strings.convert.convert_ipv4 cimport (
+from cudf._lib.pylibcudf.libcudf.strings.convert.convert_ipv4 cimport (
     integers_to_ipv4 as cpp_integers_to_ipv4,
     ipv4_to_integers as cpp_ipv4_to_integers,
     is_ipv4 as cpp_is_ipv4,
 )
-from cudf._lib.cpp.types cimport data_type, type_id
+from cudf._lib.pylibcudf.libcudf.types cimport data_type, type_id
 from cudf._lib.types cimport underlying_type_t_type_id
 
 import cudf
diff --git a/python/cudf/cudf/_lib/strings/attributes.pyx b/python/cudf/cudf/_lib/strings/attributes.pyx
index c1b69dda353..1f3d7c4eb1b 100644
--- a/python/cudf/cudf/_lib/strings/attributes.pyx
+++ b/python/cudf/cudf/_lib/strings/attributes.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2022, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
 
 from cudf.core.buffer import acquire_spill_lock
 
@@ -6,9 +6,9 @@ from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.strings.attributes cimport (
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.strings.attributes cimport (
     code_points as cpp_code_points,
     count_bytes as cpp_count_bytes,
     count_characters as cpp_count_characters,
diff --git a/python/cudf/cudf/_lib/strings/capitalize.pyx b/python/cudf/cudf/_lib/strings/capitalize.pyx
index f6a80ac8fbe..1420a2bbaf2 100644
--- a/python/cudf/cudf/_lib/strings/capitalize.pyx
+++ b/python/cudf/cudf/_lib/strings/capitalize.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2022, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
 
 from cudf.core.buffer import acquire_spill_lock
 
@@ -6,9 +6,9 @@ from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.strings.capitalize cimport (
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.strings.capitalize cimport (
     capitalize as cpp_capitalize,
     is_title as cpp_is_title,
     title as cpp_title,
diff --git a/python/cudf/cudf/_lib/strings/char_types.pyx b/python/cudf/cudf/_lib/strings/char_types.pyx
index 14d78cdaa51..5b7b6d19d9e 100644
--- a/python/cudf/cudf/_lib/strings/char_types.pyx
+++ b/python/cudf/cudf/_lib/strings/char_types.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2023, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
 
 from libcpp cimport bool
@@ -8,10 +8,10 @@ from libcpp.utility cimport move
 from cudf.core.buffer import acquire_spill_lock
 
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.scalar.scalar cimport string_scalar
-from cudf._lib.cpp.strings.char_types cimport (
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from cudf._lib.pylibcudf.libcudf.strings.char_types cimport (
     all_characters_of_type as cpp_all_characters_of_type,
     filter_characters_of_type as cpp_filter_characters_of_type,
     string_character_types,
diff --git a/python/cudf/cudf/_lib/strings/combine.pyx b/python/cudf/cudf/_lib/strings/combine.pyx
index 7d86d34ab25..288f333d4d8 100644
--- a/python/cudf/cudf/_lib/strings/combine.pyx
+++ b/python/cudf/cudf/_lib/strings/combine.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from cudf.core.buffer import acquire_spill_lock
 
@@ -6,17 +6,17 @@ from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.scalar.scalar cimport string_scalar
-from cudf._lib.cpp.strings.combine cimport (
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from cudf._lib.pylibcudf.libcudf.strings.combine cimport (
     concatenate as cpp_concatenate,
     join_list_elements as cpp_join_list_elements,
     join_strings as cpp_join_strings,
     output_if_empty_list,
     separator_on_nulls,
 )
-from cudf._lib.cpp.table.table_view cimport table_view
+from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
 from cudf._lib.scalar cimport DeviceScalar
 from cudf._lib.utils cimport table_view_from_columns
 
diff --git a/python/cudf/cudf/_lib/strings/contains.pyx b/python/cudf/cudf/_lib/strings/contains.pyx
index 82034f7f8b7..087acd8062d 100644
--- a/python/cudf/cudf/_lib/strings/contains.pyx
+++ b/python/cudf/cudf/_lib/strings/contains.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from cython.operator cimport dereference
 from libc.stdint cimport uint32_t
@@ -10,17 +10,17 @@ from libcpp.string cimport string
 from libcpp.utility cimport move
 
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.scalar.scalar cimport string_scalar
-from cudf._lib.cpp.strings.contains cimport (
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from cudf._lib.pylibcudf.libcudf.strings.contains cimport (
     contains_re as cpp_contains_re,
     count_re as cpp_count_re,
     like as cpp_like,
     matches_re as cpp_matches_re,
 )
-from cudf._lib.cpp.strings.regex_flags cimport regex_flags
-from cudf._lib.cpp.strings.regex_program cimport regex_program
+from cudf._lib.pylibcudf.libcudf.strings.regex_flags cimport regex_flags
+from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program
 from cudf._lib.scalar cimport DeviceScalar
 
 
diff --git a/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx b/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx
index 2085d5c2896..6faff606226 100644
--- a/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx
+++ b/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2023, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
 import cudf
 
@@ -8,14 +8,14 @@ from libcpp.utility cimport move
 from cudf.core.buffer import acquire_spill_lock
 
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.strings.convert.convert_fixed_point cimport (
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.strings.convert.convert_fixed_point cimport (
     from_fixed_point as cpp_from_fixed_point,
     is_fixed_point as cpp_is_fixed_point,
     to_fixed_point as cpp_to_fixed_point,
 )
-from cudf._lib.cpp.types cimport data_type, type_id
+from cudf._lib.pylibcudf.libcudf.types cimport data_type, type_id
 
 
 @acquire_spill_lock()
diff --git a/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx b/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx
index d1617d85593..341cbc99dab 100644
--- a/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx
+++ b/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2022, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
@@ -6,9 +6,9 @@ from libcpp.utility cimport move
 from cudf.core.buffer import acquire_spill_lock
 
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.strings.convert.convert_floats cimport (
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.strings.convert.convert_floats cimport (
     is_float as cpp_is_float,
 )
 
diff --git a/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx b/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx
index 52a4791775a..081b03cdc0d 100644
--- a/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx
+++ b/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2022, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
@@ -6,9 +6,9 @@ from libcpp.utility cimport move
 from cudf.core.buffer import acquire_spill_lock
 
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.strings.convert.convert_integers cimport (
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.strings.convert.convert_integers cimport (
     is_integer as cpp_is_integer,
 )
 
diff --git a/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx b/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx
index 1a89fa7604b..4418bf2a72d 100644
--- a/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx
+++ b/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2022, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
@@ -6,10 +6,10 @@ from libcpp.utility cimport move
 from cudf.core.buffer import acquire_spill_lock
 
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.scalar.scalar cimport string_scalar
-from cudf._lib.cpp.strings.convert.convert_lists cimport (
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from cudf._lib.pylibcudf.libcudf.strings.convert.convert_lists cimport (
     format_list_column as cpp_format_list_column,
 )
 
diff --git a/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx b/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx
index bc8123281f0..5f62efe5c00 100644
--- a/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx
+++ b/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
@@ -6,9 +6,9 @@ from libcpp.utility cimport move
 from cudf.core.buffer import acquire_spill_lock
 
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.strings.convert.convert_urls cimport (
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.strings.convert.convert_urls cimport (
     url_decode as cpp_url_decode,
     url_encode as cpp_url_encode,
 )
diff --git a/python/cudf/cudf/_lib/strings/extract.pyx b/python/cudf/cudf/_lib/strings/extract.pyx
index d3d8610cdf0..3b80c4f6368 100644
--- a/python/cudf/cudf/_lib/strings/extract.pyx
+++ b/python/cudf/cudf/_lib/strings/extract.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from cython.operator cimport dereference
 from libc.stdint cimport uint32_t
@@ -9,11 +9,11 @@ from libcpp.utility cimport move
 from cudf.core.buffer import acquire_spill_lock
 
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.strings.extract cimport extract as cpp_extract
-from cudf._lib.cpp.strings.regex_flags cimport regex_flags
-from cudf._lib.cpp.strings.regex_program cimport regex_program
-from cudf._lib.cpp.table.table cimport table
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.strings.extract cimport extract as cpp_extract
+from cudf._lib.pylibcudf.libcudf.strings.regex_flags cimport regex_flags
+from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program
+from cudf._lib.pylibcudf.libcudf.table.table cimport table
 from cudf._lib.utils cimport data_from_unique_ptr
 
 
diff --git a/python/cudf/cudf/_lib/strings/find.pyx b/python/cudf/cudf/_lib/strings/find.pyx
index 341776b102c..3c0009ee569 100644
--- a/python/cudf/cudf/_lib/strings/find.pyx
+++ b/python/cudf/cudf/_lib/strings/find.pyx
@@ -4,7 +4,7 @@ import cudf._lib.pylibcudf as plc
 from cudf.core.buffer import acquire_spill_lock
 
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 
 @acquire_spill_lock()
diff --git a/python/cudf/cudf/_lib/strings/find_multiple.pyx b/python/cudf/cudf/_lib/strings/find_multiple.pyx
index c2a97a4fd7c..c75f28db21b 100644
--- a/python/cudf/cudf/_lib/strings/find_multiple.pyx
+++ b/python/cudf/cudf/_lib/strings/find_multiple.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
@@ -6,9 +6,9 @@ from libcpp.utility cimport move
 from cudf.core.buffer import acquire_spill_lock
 
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.strings.find_multiple cimport (
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.strings.find_multiple cimport (
     find_multiple as cpp_find_multiple,
 )
 
diff --git a/python/cudf/cudf/_lib/strings/findall.pyx b/python/cudf/cudf/_lib/strings/findall.pyx
index 6df1d32dcfe..0d409889bc8 100644
--- a/python/cudf/cudf/_lib/strings/findall.pyx
+++ b/python/cudf/cudf/_lib/strings/findall.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 
 from cython.operator cimport dereference
 from libc.stdint cimport uint32_t
@@ -9,11 +9,11 @@ from libcpp.utility cimport move
 from cudf.core.buffer import acquire_spill_lock
 
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.strings.findall cimport findall as cpp_findall
-from cudf._lib.cpp.strings.regex_flags cimport regex_flags
-from cudf._lib.cpp.strings.regex_program cimport regex_program
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.strings.findall cimport findall as cpp_findall
+from cudf._lib.pylibcudf.libcudf.strings.regex_flags cimport regex_flags
+from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program
 
 
 @acquire_spill_lock()
diff --git a/python/cudf/cudf/_lib/strings/json.pyx b/python/cudf/cudf/_lib/strings/json.pyx
index 861e0daa6e3..560f284b56c 100644
--- a/python/cudf/cudf/_lib/strings/json.pyx
+++ b/python/cudf/cudf/_lib/strings/json.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2022, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
@@ -6,10 +6,10 @@ from libcpp.utility cimport move
 from cudf.core.buffer import acquire_spill_lock
 
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.scalar.scalar cimport string_scalar
-from cudf._lib.cpp.strings.json cimport (
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from cudf._lib.pylibcudf.libcudf.strings.json cimport (
     get_json_object as cpp_get_json_object,
     get_json_object_options,
 )
diff --git a/python/cudf/cudf/_lib/strings/padding.pyx b/python/cudf/cudf/_lib/strings/padding.pyx
index 340d7eb52d8..9226810951f 100644
--- a/python/cudf/cudf/_lib/strings/padding.pyx
+++ b/python/cudf/cudf/_lib/strings/padding.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
@@ -7,14 +7,17 @@ from libcpp.utility cimport move
 from cudf.core.buffer import acquire_spill_lock
 
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 from enum import IntEnum
 
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.strings.padding cimport pad as cpp_pad, zfill as cpp_zfill
-from cudf._lib.cpp.strings.side_type cimport (
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.strings.padding cimport (
+    pad as cpp_pad,
+    zfill as cpp_zfill,
+)
+from cudf._lib.pylibcudf.libcudf.strings.side_type cimport (
     side_type,
     underlying_type_t_side_type,
 )
diff --git a/python/cudf/cudf/_lib/strings/repeat.pyx b/python/cudf/cudf/_lib/strings/repeat.pyx
index 4896fb74f41..2b8116848cf 100644
--- a/python/cudf/cudf/_lib/strings/repeat.pyx
+++ b/python/cudf/cudf/_lib/strings/repeat.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2022, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
@@ -6,10 +6,10 @@ from libcpp.utility cimport move
 from cudf.core.buffer import acquire_spill_lock
 
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.strings cimport repeat as cpp_repeat
-from cudf._lib.cpp.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.strings cimport repeat as cpp_repeat
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 
 @acquire_spill_lock()
diff --git a/python/cudf/cudf/_lib/strings/replace.pyx b/python/cudf/cudf/_lib/strings/replace.pyx
index 80c9ba95fd8..880201e65a2 100644
--- a/python/cudf/cudf/_lib/strings/replace.pyx
+++ b/python/cudf/cudf/_lib/strings/replace.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libc.stdint cimport int32_t
 from libcpp.memory cimport unique_ptr
@@ -7,14 +7,14 @@ from libcpp.utility cimport move
 from cudf.core.buffer import acquire_spill_lock
 
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.scalar.scalar cimport string_scalar
-from cudf._lib.cpp.strings.replace cimport (
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from cudf._lib.pylibcudf.libcudf.strings.replace cimport (
     replace as cpp_replace,
     replace_slice as cpp_replace_slice,
 )
-from cudf._lib.cpp.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 from cudf._lib.scalar cimport DeviceScalar
 
 
diff --git a/python/cudf/cudf/_lib/strings/replace_re.pyx b/python/cudf/cudf/_lib/strings/replace_re.pyx
index 1fbbaa8f44f..e13880a6186 100644
--- a/python/cudf/cudf/_lib/strings/replace_re.pyx
+++ b/python/cudf/cudf/_lib/strings/replace_re.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from cython.operator cimport dereference
 from libcpp.memory cimport unique_ptr
@@ -9,16 +9,16 @@ from libcpp.vector cimport vector
 from cudf.core.buffer import acquire_spill_lock
 
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.scalar.scalar cimport string_scalar
-from cudf._lib.cpp.strings.regex_flags cimport regex_flags
-from cudf._lib.cpp.strings.regex_program cimport regex_program
-from cudf._lib.cpp.strings.replace_re cimport (
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from cudf._lib.pylibcudf.libcudf.strings.regex_flags cimport regex_flags
+from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program
+from cudf._lib.pylibcudf.libcudf.strings.replace_re cimport (
     replace_re as cpp_replace_re,
     replace_with_backrefs as cpp_replace_with_backrefs,
 )
-from cudf._lib.cpp.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 from cudf._lib.scalar cimport DeviceScalar
 
 
diff --git a/python/cudf/cudf/_lib/strings/split/partition.pyx b/python/cudf/cudf/_lib/strings/split/partition.pyx
index 281d131372a..be377c0f86b 100644
--- a/python/cudf/cudf/_lib/strings/split/partition.pyx
+++ b/python/cudf/cudf/_lib/strings/split/partition.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
@@ -6,13 +6,13 @@ from libcpp.utility cimport move
 from cudf.core.buffer import acquire_spill_lock
 
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.scalar.scalar cimport string_scalar
-from cudf._lib.cpp.strings.split.partition cimport (
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from cudf._lib.pylibcudf.libcudf.strings.split.partition cimport (
     partition as cpp_partition,
     rpartition as cpp_rpartition,
 )
-from cudf._lib.cpp.table.table cimport table
+from cudf._lib.pylibcudf.libcudf.table.table cimport table
 from cudf._lib.scalar cimport DeviceScalar
 from cudf._lib.utils cimport data_from_unique_ptr
 
diff --git a/python/cudf/cudf/_lib/strings/split/split.pyx b/python/cudf/cudf/_lib/strings/split/split.pyx
index 08c7dde921f..942235686d7 100644
--- a/python/cudf/cudf/_lib/strings/split/split.pyx
+++ b/python/cudf/cudf/_lib/strings/split/split.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from cython.operator cimport dereference
 from libcpp.memory cimport unique_ptr
@@ -8,12 +8,12 @@ from libcpp.utility cimport move
 from cudf.core.buffer import acquire_spill_lock
 
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.scalar.scalar cimport string_scalar
-from cudf._lib.cpp.strings.regex_flags cimport regex_flags
-from cudf._lib.cpp.strings.regex_program cimport regex_program
-from cudf._lib.cpp.strings.split.split cimport (
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from cudf._lib.pylibcudf.libcudf.strings.regex_flags cimport regex_flags
+from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program
+from cudf._lib.pylibcudf.libcudf.strings.split.split cimport (
     rsplit as cpp_rsplit,
     rsplit_re as cpp_rsplit_re,
     rsplit_record as cpp_rsplit_record,
@@ -23,8 +23,8 @@ from cudf._lib.cpp.strings.split.split cimport (
     split_record as cpp_split_record,
     split_record_re as cpp_split_record_re,
 )
-from cudf._lib.cpp.table.table cimport table
-from cudf._lib.cpp.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.table.table cimport table
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 from cudf._lib.scalar cimport DeviceScalar
 from cudf._lib.utils cimport data_from_unique_ptr
 
diff --git a/python/cudf/cudf/_lib/strings/strip.pyx b/python/cudf/cudf/_lib/strings/strip.pyx
index 2c53782d6ba..199fa5fc3b6 100644
--- a/python/cudf/cudf/_lib/strings/strip.pyx
+++ b/python/cudf/cudf/_lib/strings/strip.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
@@ -6,11 +6,11 @@ from libcpp.utility cimport move
 from cudf.core.buffer import acquire_spill_lock
 
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.scalar.scalar cimport string_scalar
-from cudf._lib.cpp.strings.side_type cimport side_type
-from cudf._lib.cpp.strings.strip cimport strip as cpp_strip
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from cudf._lib.pylibcudf.libcudf.strings.side_type cimport side_type
+from cudf._lib.pylibcudf.libcudf.strings.strip cimport strip as cpp_strip
 from cudf._lib.scalar cimport DeviceScalar
 
 
diff --git a/python/cudf/cudf/_lib/strings/substring.pyx b/python/cudf/cudf/_lib/strings/substring.pyx
index e6b8cdd28ee..170c1016b89 100644
--- a/python/cudf/cudf/_lib/strings/substring.pyx
+++ b/python/cudf/cudf/_lib/strings/substring.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 import numpy as np
 
@@ -8,14 +8,16 @@ from libcpp.utility cimport move
 from cudf.core.buffer import acquire_spill_lock
 
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.strings.substring cimport slice_strings as cpp_slice_strings
-from cudf._lib.cpp.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.strings.substring cimport (
+    slice_strings as cpp_slice_strings,
+)
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 from cudf._lib.scalar import as_device_scalar
 
-from cudf._lib.cpp.scalar.scalar cimport numeric_scalar
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport numeric_scalar
 from cudf._lib.scalar cimport DeviceScalar
 
 
diff --git a/python/cudf/cudf/_lib/strings/translate.pyx b/python/cudf/cudf/_lib/strings/translate.pyx
index 55659e98dcb..8846e2e280d 100644
--- a/python/cudf/cudf/_lib/strings/translate.pyx
+++ b/python/cudf/cudf/_lib/strings/translate.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2023, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
 
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
@@ -9,15 +9,15 @@ from libcpp.vector cimport vector
 from cudf.core.buffer import acquire_spill_lock
 
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.scalar.scalar cimport string_scalar
-from cudf._lib.cpp.strings.translate cimport (
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from cudf._lib.pylibcudf.libcudf.strings.translate cimport (
     filter_characters as cpp_filter_characters,
     filter_type,
     translate as cpp_translate,
 )
-from cudf._lib.cpp.types cimport char_utf8
+from cudf._lib.pylibcudf.libcudf.types cimport char_utf8
 from cudf._lib.scalar cimport DeviceScalar
 
 
diff --git a/python/cudf/cudf/_lib/strings/wrap.pyx b/python/cudf/cudf/_lib/strings/wrap.pyx
index 8b0c367e791..92750f21e4d 100644
--- a/python/cudf/cudf/_lib/strings/wrap.pyx
+++ b/python/cudf/cudf/_lib/strings/wrap.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
@@ -6,10 +6,10 @@ from libcpp.utility cimport move
 from cudf.core.buffer import acquire_spill_lock
 
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.strings.wrap cimport wrap as cpp_wrap
-from cudf._lib.cpp.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.strings.wrap cimport wrap as cpp_wrap
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 
 @acquire_spill_lock()
diff --git a/python/cudf/cudf/_lib/strings_udf.pyx b/python/cudf/cudf/_lib/strings_udf.pyx
index a59e6db1b72..e952492c45d 100644
--- a/python/cudf/cudf/_lib/strings_udf.pyx
+++ b/python/cudf/cudf/_lib/strings_udf.pyx
@@ -1,8 +1,8 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 
 from libc.stdint cimport uint8_t, uint16_t, uintptr_t
 
-from cudf._lib.cpp.strings_udf cimport (
+from cudf._lib.pylibcudf.libcudf.strings_udf cimport (
     get_character_cases_table as cpp_get_character_cases_table,
     get_character_flags_table as cpp_get_character_flags_table,
     get_special_case_mapping_table as cpp_get_special_case_mapping_table,
@@ -18,8 +18,8 @@ from cudf.core.buffer import as_buffer
 from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer
 
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.column.column cimport column, column_view
-from cudf._lib.cpp.strings_udf cimport (
+from cudf._lib.pylibcudf.libcudf.column.column cimport column, column_view
+from cudf._lib.pylibcudf.libcudf.strings_udf cimport (
     column_from_udf_string_array as cpp_column_from_udf_string_array,
     free_udf_string_array as cpp_free_udf_string_array,
     to_string_view_array as cpp_to_string_view_array,
diff --git a/python/cudf/cudf/_lib/text.pyx b/python/cudf/cudf/_lib/text.pyx
index a7346cdd586..6e63b8758b8 100644
--- a/python/cudf/cudf/_lib/text.pyx
+++ b/python/cudf/cudf/_lib/text.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from io import TextIOBase
 
@@ -9,8 +9,8 @@ from libcpp.string cimport string
 from libcpp.utility cimport move
 
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.io.text cimport (
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.io.text cimport (
     byte_range_info,
     data_chunk_source,
     make_source,
diff --git a/python/cudf/cudf/_lib/timezone.pyx b/python/cudf/cudf/_lib/timezone.pyx
index 808d1321b0b..53977e984c2 100644
--- a/python/cudf/cudf/_lib/timezone.pyx
+++ b/python/cudf/cudf/_lib/timezone.pyx
@@ -1,14 +1,14 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 from libcpp.optional cimport make_optional
 from libcpp.string cimport string
 from libcpp.utility cimport move
 
-from cudf._lib.cpp.io.timezone cimport (
+from cudf._lib.pylibcudf.libcudf.io.timezone cimport (
     make_timezone_transition_table as cpp_make_timezone_transition_table,
 )
-from cudf._lib.cpp.table.table cimport table
+from cudf._lib.pylibcudf.libcudf.table.table cimport table
 from cudf._lib.utils cimport columns_from_unique_ptr
 
 
diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx
index d8eb6134042..b325173f20d 100644
--- a/python/cudf/cudf/_lib/transform.pyx
+++ b/python/cudf/cudf/_lib/transform.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from numba.np import numpy_support
 
@@ -17,15 +17,20 @@ from libcpp.utility cimport move
 
 from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer
 
-cimport cudf._lib.cpp.transform as libcudf_transform
+cimport cudf._lib.pylibcudf.libcudf.transform as libcudf_transform
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.expressions cimport expression
-from cudf._lib.cpp.table.table cimport table
-from cudf._lib.cpp.table.table_view cimport table_view
-from cudf._lib.cpp.types cimport bitmask_type, data_type, size_type, type_id
 from cudf._lib.expressions cimport Expression
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.expressions cimport expression
+from cudf._lib.pylibcudf.libcudf.table.table cimport table
+from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
+from cudf._lib.pylibcudf.libcudf.types cimport (
+    bitmask_type,
+    data_type,
+    size_type,
+    type_id,
+)
 from cudf._lib.types cimport underlying_type_t_type_id
 from cudf._lib.utils cimport (
     columns_from_unique_ptr,
diff --git a/python/cudf/cudf/_lib/transpose.pyx b/python/cudf/cudf/_lib/transpose.pyx
index 51e49b1f27a..82b23439e6a 100644
--- a/python/cudf/cudf/_lib/transpose.pyx
+++ b/python/cudf/cudf/_lib/transpose.pyx
@@ -1,13 +1,13 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
 from libcpp.pair cimport pair
 from libcpp.utility cimport move
 
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.table.table_view cimport table_view
-from cudf._lib.cpp.transpose cimport transpose as cpp_transpose
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
+from cudf._lib.pylibcudf.libcudf.transpose cimport transpose as cpp_transpose
 from cudf._lib.utils cimport columns_from_table_view, table_view_from_columns
 
 
diff --git a/python/cudf/cudf/_lib/types.pxd b/python/cudf/cudf/_lib/types.pxd
index a95db84ceff..519d5ff8554 100644
--- a/python/cudf/cudf/_lib/types.pxd
+++ b/python/cudf/cudf/_lib/types.pxd
@@ -1,11 +1,13 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libc.stdint cimport int32_t
 from libcpp cimport bool
 
-cimport cudf._lib.cpp.types as libcudf_types
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view
+cimport cudf._lib.pylibcudf.libcudf.types as libcudf_types
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport (
+    lists_column_view,
+)
 
 ctypedef bool underlying_type_t_order
 ctypedef bool underlying_type_t_null_order
diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx
index 1b4f4617e97..895e1afc502 100644
--- a/python/cudf/cudf/_lib/types.pyx
+++ b/python/cudf/cudf/_lib/types.pyx
@@ -7,9 +7,11 @@ import pandas as pd
 
 from libcpp.memory cimport make_shared, shared_ptr
 
-cimport cudf._lib.cpp.types as libcudf_types
-from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view
+cimport cudf._lib.pylibcudf.libcudf.types as libcudf_types
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport (
+    lists_column_view,
+)
 from cudf._lib.types cimport (
     underlying_type_t_interpolation,
     underlying_type_t_order,
diff --git a/python/cudf/cudf/_lib/utils.pxd b/python/cudf/cudf/_lib/utils.pxd
index 51c69bdcaf9..c5a1e7552b9 100644
--- a/python/cudf/cudf/_lib/utils.pxd
+++ b/python/cudf/cudf/_lib/utils.pxd
@@ -4,8 +4,8 @@ from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 from libcpp.vector cimport vector
 
-from cudf._lib.cpp.column.column cimport column_view
-from cudf._lib.cpp.table.table cimport table, table_view
+from cudf._lib.pylibcudf.libcudf.column.column cimport column_view
+from cudf._lib.pylibcudf.libcudf.table.table cimport table, table_view
 
 
 cdef data_from_unique_ptr(
diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx
index 0afecb215e4..4c4cd48d6ed 100644
--- a/python/cudf/cudf/_lib/utils.pyx
+++ b/python/cudf/cudf/_lib/utils.pyx
@@ -11,10 +11,10 @@ from libcpp.utility cimport move
 from libcpp.vector cimport vector
 
 from cudf._lib.column cimport Column
-from cudf._lib.cpp.column.column cimport column, column_view
-from cudf._lib.cpp.table.table cimport table
-from cudf._lib.cpp.table.table_view cimport table_view
-from cudf._lib.cpp.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.column.column cimport column, column_view
+from cudf._lib.pylibcudf.libcudf.table.table cimport table
+from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 try:
     import ujson as json
diff --git a/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd b/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd
index 068837d04ee..84a3a32646d 100644
--- a/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd
+++ b/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libc.stdint cimport int32_t, int64_t
 from libcpp cimport bool
@@ -7,8 +7,8 @@ from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 from libcpp.vector cimport vector
 
-from cudf._lib.cpp.io.datasource cimport datasource
 from cudf._lib.io.datasource cimport Datasource
+from cudf._lib.pylibcudf.libcudf.io.datasource cimport datasource
 
 
 cdef extern from "cudf_kafka/kafka_callback.hpp" \
diff --git a/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx b/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx
index 2fbaacff7c6..2927dc0aa9a 100644
--- a/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx
+++ b/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libc.stdint cimport int32_t, int64_t
 from libcpp cimport bool, nullptr
@@ -7,7 +7,7 @@ from libcpp.memory cimport make_unique, unique_ptr
 from libcpp.string cimport string
 from libcpp.utility cimport move
 
-from cudf._lib.cpp.io.datasource cimport datasource
+from cudf._lib.pylibcudf.libcudf.io.datasource cimport datasource
 
 from cudf_kafka._lib.kafka cimport kafka_consumer
 

From 0fea3ed7e649ec8acf23ae91edf2058fe7d9e77e Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Tue, 14 May 2024 20:21:13 -0700
Subject: [PATCH 205/842] Fix arrow versioning logic (#15755)

Resolves #15754

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15755
---
 cpp/cmake/thirdparty/get_arrow.cmake | 21 ++++++++++++++-------
 dependencies.yaml                    |  4 ++--
 python/cudf/pyproject.toml           |  2 +-
 3 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake
index e9d2f479088..73e66cce608 100644
--- a/cpp/cmake/thirdparty/get_arrow.cmake
+++ b/cpp/cmake/thirdparty/get_arrow.cmake
@@ -26,13 +26,20 @@ include_guard(GLOBAL)
 # pyarrow.
 function(find_libarrow_in_python_wheel PYARROW_VERSION)
   string(REPLACE "." ";" PYARROW_VER_COMPONENTS "${PYARROW_VERSION}")
-  list(GET PYARROW_VER_COMPONENTS 0 PYARROW_SO_VER)
-  # The soname for Arrow libraries is constructed using the major version plus "00". Note that,
-  # although it may seem like it due to Arrow almost exclusively releasing new major versions (i.e.
-  # `${MINOR_VERSION}${PATCH_VERSION}` is almost always equivalent to "00"),
-  # the soname is not generated by concatenating the major, minor, and patch versions into a single
-  # version number soname, just `${MAJOR_VERSION}00`
-  set(PYARROW_LIB "libarrow.so.${PYARROW_SO_VER}00")
+  list(GET PYARROW_VER_COMPONENTS 0 PYARROW_MAJOR_VER)
+  list(GET PYARROW_VER_COMPONENTS 1 PYARROW_MINOR_VER)
+
+  # Ensure that the major and minor versions are two digits long
+  string(LENGTH ${PYARROW_MAJOR_VER} PYARROW_MAJOR_LENGTH)
+  string(LENGTH ${PYARROW_MINOR_VER} PYARROW_MINOR_LENGTH)
+  if(${PYARROW_MAJOR_LENGTH} EQUAL 1)
+    set(PYARROW_MAJOR_VER "0${PYARROW_MAJOR_VER}")
+  endif()
+  if(${PYARROW_MINOR_LENGTH} EQUAL 1)
+    set(PYARROW_MINOR_VER "0${PYARROW_MINOR_VER}")
+  endif()
+
+  set(PYARROW_LIB "libarrow.so.${PYARROW_MAJOR_VER}${PYARROW_MINOR_VER}")
 
   string(
     APPEND
diff --git a/dependencies.yaml b/dependencies.yaml
index 27b0f23389c..898760d1351 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -352,8 +352,8 @@ dependencies:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
-          # Allow runtime version to float up to minor version
-          - pyarrow>=16.0.0,<17.0.0a0
+          # Allow runtime version to float up to patch version
+          - pyarrow>=16.0.0,<16.1.0a0
   cuda_version:
     specific:
       - output_types: conda
diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml
index 4b57bcd018a..826362f0632 100644
--- a/python/cudf/pyproject.toml
+++ b/python/cudf/pyproject.toml
@@ -34,7 +34,7 @@ dependencies = [
     "packaging",
     "pandas>=2.0,<2.2.3dev0",
     "ptxcompiler",
-    "pyarrow>=16.0.0,<17.0.0a0",
+    "pyarrow>=16.0.0,<16.1.0a0",
     "rich",
     "rmm==24.6.*",
     "typing_extensions>=4.0.0",

From 4a6d13f232aba099b47ea3c95fa429209fcf863b Mon Sep 17 00:00:00 2001
From: Nick Becker <nickb500@gmail.com>
Date: Wed, 15 May 2024 00:26:37 -0400
Subject: [PATCH 206/842] Update cudf.pandas docs for GA (#15744)

cudf.pandas is now generally available (see the [RAPIDS 24.04 release blog](https://medium.com/rapids-ai/rapids-24-04-release-c11cf44c3e23#f263) for more information).

This PR updates the docs accordingly.

Authors:
  - Nick Becker (https://github.com/beckernick)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15744
---
 docs/cudf/source/cudf_pandas/index.rst | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/docs/cudf/source/cudf_pandas/index.rst b/docs/cudf/source/cudf_pandas/index.rst
index 628194cc8a5..f98c04cc383 100644
--- a/docs/cudf/source/cudf_pandas/index.rst
+++ b/docs/cudf/source/cudf_pandas/index.rst
@@ -34,10 +34,8 @@ automatically **falling back to pandas** for other operations.
 | Nothing changes, not even your `import` statements, when going from CPU to GPU.             | Combines the full flexibility of Pandas with blazing fast performance of cuDF                                       |
 +---------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------+
 
-Starting with the version 23.10.01 release ``cudf.pandas`` is
-available in Open Beta, as part of the ``cudf`` package .  See `RAPIDS
-Quick Start <https://rapids.ai/#quick-start>`_ to get up-and-running
-with ``cudf``.
+``cudf.pandas`` is now Generally Available (GA) as part of the ``cudf`` package.  See `RAPIDS
+Quick Start <https://rapids.ai/#quick-start>`_ to get up-and-running with ``cudf``.
 
 .. toctree::
    :maxdepth: 1

From 04d247c072b55ce8265d18c7e1a56e4abb31f6cf Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Wed, 15 May 2024 06:34:25 -0500
Subject: [PATCH 207/842] Handle empty dataframe object with index present in
 setitem of `loc` (#15752)

Fixes: #15718

This PR fixes an issue with `loc` setitem where the dataframe is empty but has an index of length greater than 0.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/15752
---
 python/cudf/cudf/core/dataframe.py      | 4 ++--
 python/cudf/cudf/tests/test_indexing.py | 9 +++++++++
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index b29089cb81a..8442cf05f01 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -405,12 +405,12 @@ def _setitem_tuple_arg(self, key, value):
                 value = as_column(value, length=length)
 
             new_col = cudf.Series(value, index=idx)
-            if not self._frame.empty:
+            if len(self._frame.index) != 0:
                 new_col = new_col._align_to_index(
                     self._frame.index, how="right"
                 )
 
-            if self._frame.empty:
+            if len(self._frame.index) == 0:
                 self._frame.index = (
                     idx if idx is not None else cudf.RangeIndex(len(new_col))
                 )
diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py
index 5f5c4579e01..f49b9b02076 100644
--- a/python/cudf/cudf/tests/test_indexing.py
+++ b/python/cudf/cudf/tests/test_indexing.py
@@ -2255,3 +2255,12 @@ def test_scalar_loc_row_categoricalindex():
     result = df.loc["a"]
     expected = df.to_pandas().loc["a"]
     assert_eq(result, expected)
+
+
+def test_loc_setitem_empty_dataframe():
+    pdf = pd.DataFrame(index=["index_1", "index_2", "index_3"])
+    gdf = cudf.from_pandas(pdf)
+    pdf.loc[["index_1"], "new_col"] = "A"
+    gdf.loc[["index_1"], "new_col"] = "A"
+
+    assert_eq(pdf, gdf)

From fa9d028073f73218fe0dd4e49671c39fa11fc42c Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Wed, 15 May 2024 06:34:54 -0500
Subject: [PATCH 208/842] Allow `None` when `nan_as_null=False` in column
 constructor (#15709)

Fixes: #15708

This PR fixes an issue where we were throwing an error when `None` is present and `nan_as_null=False`, this is a bug because of using `pd.isna`, this returns `True` for `nan`, `None` and `NA`. Whereas we are only looking for `np.nan` and not `None` and `pd.NA`

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/15709
---
 python/cudf/cudf/core/column/column.py   | 22 ++++++++++++--
 python/cudf/cudf/tests/test_dataframe.py | 38 +++++++-----------------
 python/cudf/cudf/tests/test_series.py    | 23 ++++++++++----
 3 files changed, 49 insertions(+), 34 deletions(-)

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 371c91dd96f..1785eb834b2 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -1411,6 +1411,13 @@ def column_empty_like(
     return column_empty(row_count, dtype, masked)
 
 
+def _has_any_nan(arbitrary):
+    return any(
+        ((isinstance(x, float) or isinstance(x, np.floating)) and np.isnan(x))
+        for x in np.asarray(arbitrary)
+    )
+
+
 def column_empty_like_same_mask(
     column: ColumnBase, dtype: Dtype
 ) -> ColumnBase:
@@ -1948,9 +1955,20 @@ def as_column(
                 raise TypeError(
                     f"Cannot convert a {inferred_dtype} of object type"
                 )
-            elif nan_as_null is False and (
-                pd.isna(arbitrary).any()
+            elif inferred_dtype == "boolean":
+                if cudf.get_option("mode.pandas_compatible"):
+                    if dtype != np.dtype("bool") or pd.isna(arbitrary).any():
+                        raise MixedTypeError(
+                            f"Cannot have mixed values with {inferred_dtype}"
+                        )
+                elif nan_as_null is False and _has_any_nan(arbitrary):
+                    raise MixedTypeError(
+                        f"Cannot have mixed values with {inferred_dtype}"
+                    )
+            elif (
+                nan_as_null is False
                 and inferred_dtype not in ("decimal", "empty")
+                and _has_any_nan(arbitrary)
             ):
                 # Decimal can hold float("nan")
                 # All np.nan is not restricted by type
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 96301670e9c..8b18e53d320 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -4008,44 +4008,28 @@ def test_diff(dtype, period, data_empty):
 
 @pytest.mark.parametrize("df", _dataframe_na_data())
 @pytest.mark.parametrize("nan_as_null", [True, False, None])
-def test_dataframe_isnull_isna(df, nan_as_null):
-    if nan_as_null is False and (
-        df.select_dtypes(object).isna().any().any()
-        and not df.select_dtypes(object).isna().all().all()
-    ):
-        with pytest.raises(MixedTypeError):
-            cudf.DataFrame.from_pandas(df, nan_as_null=nan_as_null)
-    else:
-        gdf = cudf.DataFrame.from_pandas(df, nan_as_null=nan_as_null)
+@pytest.mark.parametrize("api_call", ["isnull", "isna", "notna", "notnull"])
+def test_dataframe_isnull_isna_and_reverse(df, nan_as_null, api_call):
+    def detect_nan(x):
+        # Check if the input is a float and if it is nan
+        return x.apply(lambda v: isinstance(v, float) and np.isnan(v))
 
-        assert_eq(df.isnull(), gdf.isnull())
-        assert_eq(df.isna(), gdf.isna())
-
-        # Test individual columns
-        for col in df:
-            assert_eq(df[col].isnull(), gdf[col].isnull())
-            assert_eq(df[col].isna(), gdf[col].isna())
-
-
-@pytest.mark.parametrize("df", _dataframe_na_data())
-@pytest.mark.parametrize("nan_as_null", [True, False, None])
-def test_dataframe_notna_notnull(df, nan_as_null):
+    nan_contains = df.select_dtypes(object).apply(detect_nan)
     if nan_as_null is False and (
-        df.select_dtypes(object).isna().any().any()
-        and not df.select_dtypes(object).isna().all().all()
+        nan_contains.any().any() and not nan_contains.all().all()
     ):
         with pytest.raises(MixedTypeError):
             cudf.DataFrame.from_pandas(df, nan_as_null=nan_as_null)
     else:
         gdf = cudf.DataFrame.from_pandas(df, nan_as_null=nan_as_null)
 
-        assert_eq(df.notnull(), gdf.notnull())
-        assert_eq(df.notna(), gdf.notna())
+        assert_eq(getattr(df, api_call)(), getattr(gdf, api_call)())
 
         # Test individual columns
         for col in df:
-            assert_eq(df[col].notnull(), gdf[col].notnull())
-            assert_eq(df[col].notna(), gdf[col].notna())
+            assert_eq(
+                getattr(df[col], api_call)(), getattr(gdf[col], api_call)()
+            )
 
 
 def test_ndim():
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index 08a6173d3f5..9aeae566730 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -774,8 +774,9 @@ def test_round_nan_as_null_false(series, decimal):
 @pytest.mark.parametrize("ps", _series_na_data())
 @pytest.mark.parametrize("nan_as_null", [True, False, None])
 def test_series_isnull_isna(ps, nan_as_null):
+    nan_contains = ps.apply(lambda x: isinstance(x, float) and np.isnan(x))
     if nan_as_null is False and (
-        ps.isna().any() and not ps.isna().all() and ps.dtype == object
+        nan_contains.any() and not nan_contains.all() and ps.dtype == object
     ):
         with pytest.raises(MixedTypeError):
             cudf.Series.from_pandas(ps, nan_as_null=nan_as_null)
@@ -789,8 +790,9 @@ def test_series_isnull_isna(ps, nan_as_null):
 @pytest.mark.parametrize("ps", _series_na_data())
 @pytest.mark.parametrize("nan_as_null", [True, False, None])
 def test_series_notnull_notna(ps, nan_as_null):
+    nan_contains = ps.apply(lambda x: isinstance(x, float) and np.isnan(x))
     if nan_as_null is False and (
-        ps.isna().any() and not ps.isna().all() and ps.dtype == object
+        nan_contains.any() and not nan_contains.all() and ps.dtype == object
     ):
         with pytest.raises(MixedTypeError):
             cudf.Series.from_pandas(ps, nan_as_null=nan_as_null)
@@ -2356,12 +2358,23 @@ def test_multi_dim_series_error():
 
 def test_bool_series_mixed_dtype_error():
     ps = pd.Series([True, False, None])
+    all_bool_ps = pd.Series([True, False, True], dtype="object")
     # ps now has `object` dtype, which
     # isn't supported by `cudf`.
+    with cudf.option_context("mode.pandas_compatible", True):
+        with pytest.raises(TypeError):
+            cudf.Series(ps)
+        with pytest.raises(TypeError):
+            cudf.from_pandas(ps)
+        with pytest.raises(TypeError):
+            cudf.Series(ps, dtype=bool)
+        expected = cudf.Series(all_bool_ps, dtype=bool)
+        assert_eq(expected, all_bool_ps.astype(bool))
+    nan_bools_mix = pd.Series([True, False, True, np.nan], dtype="object")
+    gs = cudf.Series(nan_bools_mix, nan_as_null=True)
+    assert_eq(gs.to_pandas(nullable=True), nan_bools_mix.astype("boolean"))
     with pytest.raises(TypeError):
-        cudf.Series(ps, nan_as_null=False)
-    with pytest.raises(TypeError):
-        cudf.from_pandas(ps, nan_as_null=False)
+        cudf.Series(nan_bools_mix, nan_as_null=False)
 
 
 @pytest.mark.parametrize(

From c5c95b74b4a72884865f694129586ede8cb08de3 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Wed, 15 May 2024 09:17:47 -0700
Subject: [PATCH 209/842] Support `DurationType` in cudf parquet reader via
 `arrow:schema` (#15617)

This PR adds the support for reading and using the `arrow:schema` struct from the serialized `arrow:ipc` message written at the key-value metadata section of the Parquet file with `ARROW:schema` key. This allows cudf to read and interop with arrow for non-standard parquet types (`DurationType` in this PR).

Arrow uses Google flatbuffers (inside Schema.fbs) to serialize the `arrow:Schema` structure (containing column descriptors) and puts it (padded for 8 byte alignment) into the header of an empty `ipc:Message` (also a flatbuffer-serialized structure inside Message.fbs). The `ipc:Message` is prepended with two integers containing a `validity` message and the `size of the header` (the `arrow:Schema` + padding). The final message is endoded as a base64 string and written to Parquet file footer key-value metadata using `"ARROW:schema"` key.

In this PR, we base64-decode the `ipc:Message`, then we decode the `validity` message and the header size, and offset pointers to the `arrow:Schema` flatbuffer. We then use Flatbuffer structs to walk the `arrow:Schema` and collect information on columns of interest as an unordered_map (using column name as key).  This unordered_map is used inside `select_columns` function to build cudf Table columns and get the correct `dtype`.

Closes #13410

Authors:
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - Vukasin Milovanovic (https://github.com/vuule)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - Vukasin Milovanovic (https://github.com/vuule)
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/15617
---
 .pre-commit-config.yaml                       |    8 +-
 cpp/CMakeLists.txt                            |    4 +
 cpp/cmake/thirdparty/get_flatbuffers.cmake    |   33 +
 cpp/include/cudf/io/parquet.hpp               |   28 +
 cpp/src/io/parquet/ipc/Message_generated.h    |  651 ++++
 cpp/src/io/parquet/ipc/Schema_generated.h     | 2769 +++++++++++++++++
 cpp/src/io/parquet/ipc/schema/Message.fbs     |  176 ++
 cpp/src/io/parquet/ipc/schema/Schema.fbs      |  591 ++++
 cpp/src/io/parquet/parquet.hpp                |    3 +
 cpp/src/io/parquet/reader_impl.cpp            |    8 +-
 cpp/src/io/parquet/reader_impl_helpers.cpp    |  336 +-
 cpp/src/io/parquet/reader_impl_helpers.hpp    |   29 +-
 cpp/src/io/utilities/base64_utilities.cpp     |  234 ++
 cpp/src/io/utilities/base64_utilities.hpp     |   87 +
 .../utilities_tests/io_utilities_tests.cpp    |  116 +
 python/cudf/cudf/_lib/parquet.pyx             |    1 +
 .../_lib/pylibcudf/libcudf/io/parquet.pxd     |    5 +
 python/cudf/cudf/tests/test_parquet.py        |   88 +
 18 files changed, 5152 insertions(+), 15 deletions(-)
 create mode 100644 cpp/cmake/thirdparty/get_flatbuffers.cmake
 create mode 100644 cpp/src/io/parquet/ipc/Message_generated.h
 create mode 100644 cpp/src/io/parquet/ipc/Schema_generated.h
 create mode 100644 cpp/src/io/parquet/ipc/schema/Message.fbs
 create mode 100644 cpp/src/io/parquet/ipc/schema/Schema.fbs
 create mode 100644 cpp/src/io/utilities/base64_utilities.cpp
 create mode 100644 cpp/src/io/utilities/base64_utilities.hpp

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index d44462236b2..5a8d9f54673 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -143,9 +143,11 @@ repos:
     hooks:
       - id: verify-copyright
         exclude: |
-          (?x)
-              cpp/include/cudf_test/cxxopts[.]hpp$
-
+          (?x)^(
+            cpp/include/cudf_test/cxxopts[.]hpp$|
+            cpp/src/io/parquet/ipc/Message_generated[.]h$|
+            cpp/src/io/parquet/ipc/Schema_generated[.]h$
+          )
 
 default_language_version:
       python: python3
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index f11f3fc3c9a..474269364de 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -192,6 +192,8 @@ include(cmake/thirdparty/get_cccl.cmake)
 include(cmake/thirdparty/get_rmm.cmake)
 # find arrow
 include(cmake/thirdparty/get_arrow.cmake)
+# find flatbuffers
+include(cmake/thirdparty/get_flatbuffers.cmake)
 # find dlpack
 include(cmake/thirdparty/get_dlpack.cmake)
 # find cuCollections, should come after including CCCL
@@ -429,6 +431,7 @@ add_library(
   src/io/text/bgzip_utils.cpp
   src/io/text/multibyte_split.cu
   src/io/utilities/arrow_io_source.cpp
+  src/io/utilities/base64_utilities.cpp
   src/io/utilities/column_buffer.cpp
   src/io/utilities/column_buffer_strings.cu
   src/io/utilities/config_utils.cpp
@@ -742,6 +745,7 @@ target_include_directories(
          "$<BUILD_INTERFACE:${CUDF_GENERATED_INCLUDE_DIR}/include>"
   PRIVATE "$<BUILD_INTERFACE:${CUDF_SOURCE_DIR}/src>"
           "$<BUILD_INTERFACE:${nanoarrow_SOURCE_DIR}/src>"
+          "$<BUILD_INTERFACE:${FlatBuffers_SOURCE_DIR}/include>"
   INTERFACE "$<INSTALL_INTERFACE:include>"
 )
 
diff --git a/cpp/cmake/thirdparty/get_flatbuffers.cmake b/cpp/cmake/thirdparty/get_flatbuffers.cmake
new file mode 100644
index 00000000000..b0ece38b8ef
--- /dev/null
+++ b/cpp/cmake/thirdparty/get_flatbuffers.cmake
@@ -0,0 +1,33 @@
+# =============================================================================
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+# =============================================================================
+
+# Use CPM to find or clone flatbuffers
+function(find_and_configure_flatbuffers VERSION)
+
+  rapids_cpm_find(
+    flatbuffers ${VERSION}
+    GLOBAL_TARGETS flatbuffers
+    CPM_ARGS
+    GIT_REPOSITORY https://github.com/google/flatbuffers.git
+    GIT_TAG v${VERSION}
+    GIT_SHALLOW TRUE
+  )
+
+  rapids_export_find_package_root(
+    BUILD flatbuffers "${flatbuffers_BINARY_DIR}" EXPORT_SET cudf-exports
+  )
+
+endfunction()
+
+find_and_configure_flatbuffers(24.3.25)
diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp
index 8bfcacdb47f..7f034668e43 100644
--- a/cpp/include/cudf/io/parquet.hpp
+++ b/cpp/include/cudf/io/parquet.hpp
@@ -71,6 +71,8 @@ class parquet_reader_options {
   bool _convert_strings_to_categories = false;
   // Whether to use PANDAS metadata to load columns
   bool _use_pandas_metadata = true;
+  // Whether to read and use ARROW schema
+  bool _use_arrow_schema = true;
   // Cast timestamp columns to a specific type
   data_type _timestamp_type{type_id::EMPTY};
 
@@ -126,6 +128,13 @@ class parquet_reader_options {
    */
   [[nodiscard]] bool is_enabled_use_pandas_metadata() const { return _use_pandas_metadata; }
 
+  /**
+   * @brief Returns true/false depending whether to use arrow schema while reading.
+   *
+   * @return `true` if arrow schema is used while reading
+   */
+  [[nodiscard]] bool is_enabled_use_arrow_schema() const { return _use_arrow_schema; }
+
   /**
    * @brief Returns optional tree of metadata.
    *
@@ -214,6 +223,13 @@ class parquet_reader_options {
    */
   void enable_use_pandas_metadata(bool val) { _use_pandas_metadata = val; }
 
+  /**
+   * @brief Sets to enable/disable use of arrow schema to read.
+   *
+   * @param val Boolean value whether to use arrow schema
+   */
+  void enable_use_arrow_schema(bool val) { _use_arrow_schema = val; }
+
   /**
    * @brief Sets reader column schema.
    *
@@ -328,6 +344,18 @@ class parquet_reader_options_builder {
     return *this;
   }
 
+  /**
+   * @brief Sets to enable/disable use of arrow schema to read.
+   *
+   * @param val Boolean value whether to use arrow schema
+   * @return this for chaining
+   */
+  parquet_reader_options_builder& use_arrow_schema(bool val)
+  {
+    options._use_arrow_schema = val;
+    return *this;
+  }
+
   /**
    * @brief Sets reader metadata.
    *
diff --git a/cpp/src/io/parquet/ipc/Message_generated.h b/cpp/src/io/parquet/ipc/Message_generated.h
new file mode 100644
index 00000000000..8ddd859f51c
--- /dev/null
+++ b/cpp/src/io/parquet/ipc/Message_generated.h
@@ -0,0 +1,651 @@
+// automatically generated by the FlatBuffers compiler, do not modify
+
+#ifndef FLATBUFFERS_GENERATED_MESSAGE_CUDF_IO_PARQUET_FLATBUF_H_
+#define FLATBUFFERS_GENERATED_MESSAGE_CUDF_IO_PARQUET_FLATBUF_H_
+
+#include <flatbuffers/flatbuffers.h>
+
+// Ensure the included flatbuffers.h is the same version as when this file was
+// generated, otherwise it may not be compatible.
+static_assert(FLATBUFFERS_VERSION_MAJOR == 24 && FLATBUFFERS_VERSION_MINOR == 3 &&
+                FLATBUFFERS_VERSION_REVISION == 25,
+              "Non-compatible flatbuffers version included");
+
+#include "Schema_generated.h"
+
+namespace cudf {
+namespace io {
+namespace parquet {
+namespace flatbuf {
+
+struct FieldNode;
+
+struct BodyCompression;
+struct BodyCompressionBuilder;
+
+struct RecordBatch;
+struct RecordBatchBuilder;
+
+struct DictionaryBatch;
+struct DictionaryBatchBuilder;
+
+struct Message;
+struct MessageBuilder;
+
+enum CompressionType : int8_t {
+  CompressionType_LZ4_FRAME = 0,
+  CompressionType_ZSTD      = 1,
+  CompressionType_MIN       = CompressionType_LZ4_FRAME,
+  CompressionType_MAX       = CompressionType_ZSTD
+};
+
+inline const CompressionType (&EnumValuesCompressionType())[2]
+{
+  static const CompressionType values[] = {CompressionType_LZ4_FRAME, CompressionType_ZSTD};
+  return values;
+}
+
+inline const char* const* EnumNamesCompressionType()
+{
+  static const char* const names[3] = {"LZ4_FRAME", "ZSTD", nullptr};
+  return names;
+}
+
+inline const char* EnumNameCompressionType(CompressionType e)
+{
+  if (::flatbuffers::IsOutRange(e, CompressionType_LZ4_FRAME, CompressionType_ZSTD)) return "";
+  const size_t index = static_cast<size_t>(e);
+  return EnumNamesCompressionType()[index];
+}
+
+/// Provided for forward compatibility in case we need to support different
+/// strategies for compressing the IPC message body (like whole-body
+/// compression rather than buffer-level) in the future
+enum BodyCompressionMethod : int8_t {
+  /// Each constituent buffer is first compressed with the indicated
+  /// compressor, and then written with the uncompressed length in the first 8
+  /// bytes as a 64-bit little-endian signed integer followed by the compressed
+  /// buffer bytes (and then padding as required by the protocol). The
+  /// uncompressed length may be set to -1 to indicate that the data that
+  /// follows is not compressed, which can be useful for cases where
+  /// compression does not yield appreciable savings.
+  BodyCompressionMethod_BUFFER = 0,
+  BodyCompressionMethod_MIN    = BodyCompressionMethod_BUFFER,
+  BodyCompressionMethod_MAX    = BodyCompressionMethod_BUFFER
+};
+
+inline const BodyCompressionMethod (&EnumValuesBodyCompressionMethod())[1]
+{
+  static const BodyCompressionMethod values[] = {BodyCompressionMethod_BUFFER};
+  return values;
+}
+
+inline const char* const* EnumNamesBodyCompressionMethod()
+{
+  static const char* const names[2] = {"BUFFER", nullptr};
+  return names;
+}
+
+inline const char* EnumNameBodyCompressionMethod(BodyCompressionMethod e)
+{
+  if (::flatbuffers::IsOutRange(e, BodyCompressionMethod_BUFFER, BodyCompressionMethod_BUFFER))
+    return "";
+  const size_t index = static_cast<size_t>(e);
+  return EnumNamesBodyCompressionMethod()[index];
+}
+
+/// ----------------------------------------------------------------------
+/// The root Message type
+/// This union enables us to easily send different message types without
+/// redundant storage, and in the future we can easily add new message types.
+///
+/// Arrow implementations do not need to implement all of the message types,
+/// which may include experimental metadata types. For maximum compatibility,
+/// it is best to send data using RecordBatch
+enum MessageHeader : uint8_t {
+  MessageHeader_NONE   = 0,
+  MessageHeader_Schema = 1,
+  MessageHeader_MIN    = MessageHeader_NONE,
+  MessageHeader_MAX    = MessageHeader_Schema
+};
+
+inline const MessageHeader (&EnumValuesMessageHeader())[2]
+{
+  static const MessageHeader values[] = {MessageHeader_NONE, MessageHeader_Schema};
+  return values;
+}
+
+inline const char* const* EnumNamesMessageHeader()
+{
+  static const char* const names[3] = {"NONE", "Schema", nullptr};
+  return names;
+}
+
+inline const char* EnumNameMessageHeader(MessageHeader e)
+{
+  if (::flatbuffers::IsOutRange(e, MessageHeader_NONE, MessageHeader_Schema)) return "";
+  const size_t index = static_cast<size_t>(e);
+  return EnumNamesMessageHeader()[index];
+}
+
+template <typename T>
+struct MessageHeaderTraits {
+  static const MessageHeader enum_value = MessageHeader_NONE;
+};
+
+template <>
+struct MessageHeaderTraits<cudf::io::parquet::flatbuf::Schema> {
+  static const MessageHeader enum_value = MessageHeader_Schema;
+};
+
+bool VerifyMessageHeader(::flatbuffers::Verifier& verifier, const void* obj, MessageHeader type);
+bool VerifyMessageHeaderVector(::flatbuffers::Verifier& verifier,
+                               const ::flatbuffers::Vector<::flatbuffers::Offset<void>>* values,
+                               const ::flatbuffers::Vector<uint8_t>* types);
+
+/// ----------------------------------------------------------------------
+/// Data structures for describing a table row batch (a collection of
+/// equal-length Arrow arrays)
+/// Metadata about a field at some level of a nested type tree (but not
+/// its children).
+///
+/// For example, a List<Int16> with values `[[1, 2, 3], null, [4], [5, 6], null]`
+/// would have {length: 5, null_count: 2} for its List node, and {length: 6,
+/// null_count: 0} for its Int16 node, as separate FieldNode structs
+FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(8) FieldNode FLATBUFFERS_FINAL_CLASS
+{
+ private:
+  int64_t length_;
+  int64_t null_count_;
+
+ public:
+  FieldNode() : length_(0), null_count_(0) {}
+  FieldNode(int64_t _length, int64_t _null_count)
+    : length_(::flatbuffers::EndianScalar(_length)),
+      null_count_(::flatbuffers::EndianScalar(_null_count))
+  {
+  }
+  /// The number of value slots in the Arrow array at this level of a nested
+  /// tree
+  int64_t length() const { return ::flatbuffers::EndianScalar(length_); }
+  /// The number of observed nulls. Fields with null_count == 0 may choose not
+  /// to write their physical validity bitmap out as a materialized buffer,
+  /// instead setting the length of the bitmap buffer to 0.
+  int64_t null_count() const { return ::flatbuffers::EndianScalar(null_count_); }
+};
+FLATBUFFERS_STRUCT_END(FieldNode, 16);
+
+/// Optional compression for the memory buffers constituting IPC message
+/// bodies. Intended for use with RecordBatch but could be used for other
+/// message types
+struct BodyCompression FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table {
+  typedef BodyCompressionBuilder Builder;
+  enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { VT_CODEC = 4, VT_METHOD = 6 };
+  /// Compressor library.
+  /// For LZ4_FRAME, each compressed buffer must consist of a single frame.
+  cudf::io::parquet::flatbuf::CompressionType codec() const
+  {
+    return static_cast<cudf::io::parquet::flatbuf::CompressionType>(GetField<int8_t>(VT_CODEC, 0));
+  }
+  /// Indicates the way the record batch body was compressed
+  cudf::io::parquet::flatbuf::BodyCompressionMethod method() const
+  {
+    return static_cast<cudf::io::parquet::flatbuf::BodyCompressionMethod>(
+      GetField<int8_t>(VT_METHOD, 0));
+  }
+  bool Verify(::flatbuffers::Verifier& verifier) const
+  {
+    return VerifyTableStart(verifier) && VerifyField<int8_t>(verifier, VT_CODEC, 1) &&
+           VerifyField<int8_t>(verifier, VT_METHOD, 1) && verifier.EndTable();
+  }
+};
+
+struct BodyCompressionBuilder {
+  typedef BodyCompression Table;
+  ::flatbuffers::FlatBufferBuilder& fbb_;
+  ::flatbuffers::uoffset_t start_;
+  void add_codec(cudf::io::parquet::flatbuf::CompressionType codec)
+  {
+    fbb_.AddElement<int8_t>(BodyCompression::VT_CODEC, static_cast<int8_t>(codec), 0);
+  }
+  void add_method(cudf::io::parquet::flatbuf::BodyCompressionMethod method)
+  {
+    fbb_.AddElement<int8_t>(BodyCompression::VT_METHOD, static_cast<int8_t>(method), 0);
+  }
+  explicit BodyCompressionBuilder(::flatbuffers::FlatBufferBuilder& _fbb) : fbb_(_fbb)
+  {
+    start_ = fbb_.StartTable();
+  }
+  ::flatbuffers::Offset<BodyCompression> Finish()
+  {
+    const auto end = fbb_.EndTable(start_);
+    auto o         = ::flatbuffers::Offset<BodyCompression>(end);
+    return o;
+  }
+};
+
+inline ::flatbuffers::Offset<BodyCompression> CreateBodyCompression(
+  ::flatbuffers::FlatBufferBuilder& _fbb,
+  cudf::io::parquet::flatbuf::CompressionType codec =
+    cudf::io::parquet::flatbuf::CompressionType_LZ4_FRAME,
+  cudf::io::parquet::flatbuf::BodyCompressionMethod method =
+    cudf::io::parquet::flatbuf::BodyCompressionMethod_BUFFER)
+{
+  BodyCompressionBuilder builder_(_fbb);
+  builder_.add_method(method);
+  builder_.add_codec(codec);
+  return builder_.Finish();
+}
+
+/// A data header describing the shared memory layout of a "record" or "row"
+/// batch. Some systems call this a "row batch" internally and others a "record
+/// batch".
+struct RecordBatch FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table {
+  typedef RecordBatchBuilder Builder;
+  enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
+    VT_LENGTH               = 4,
+    VT_NODES                = 6,
+    VT_BUFFERS              = 8,
+    VT_COMPRESSION          = 10,
+    VT_VARIADICBUFFERCOUNTS = 12
+  };
+  /// number of records / rows. The arrays in the batch should all have this
+  /// length
+  int64_t length() const { return GetField<int64_t>(VT_LENGTH, 0); }
+  /// Nodes correspond to the pre-ordered flattened logical schema
+  const ::flatbuffers::Vector<const cudf::io::parquet::flatbuf::FieldNode*>* nodes() const
+  {
+    return GetPointer<const ::flatbuffers::Vector<const cudf::io::parquet::flatbuf::FieldNode*>*>(
+      VT_NODES);
+  }
+  /// Buffers correspond to the pre-ordered flattened buffer tree
+  ///
+  /// The number of buffers appended to this list depends on the schema. For
+  /// example, most primitive arrays will have 2 buffers, 1 for the validity
+  /// bitmap and 1 for the values. For struct arrays, there will only be a
+  /// single buffer for the validity (nulls) bitmap
+  const ::flatbuffers::Vector<const cudf::io::parquet::flatbuf::Buffer*>* buffers() const
+  {
+    return GetPointer<const ::flatbuffers::Vector<const cudf::io::parquet::flatbuf::Buffer*>*>(
+      VT_BUFFERS);
+  }
+  /// Optional compression of the message body
+  const cudf::io::parquet::flatbuf::BodyCompression* compression() const
+  {
+    return GetPointer<const cudf::io::parquet::flatbuf::BodyCompression*>(VT_COMPRESSION);
+  }
+  /// Some types such as Utf8View are represented using a variable number of buffers.
+  /// For each such Field in the pre-ordered flattened logical schema, there will be
+  /// an entry in variadicBufferCounts to indicate the number of number of variadic
+  /// buffers which belong to that Field in the current RecordBatch.
+  ///
+  /// For example, the schema
+  ///     col1: Struct<alpha: Int32, beta: BinaryView, gamma: Float64>
+  ///     col2: Utf8View
+  /// contains two Fields with variadic buffers so variadicBufferCounts will have
+  /// two entries, the first counting the variadic buffers of `col1.beta` and the
+  /// second counting `col2`'s.
+  ///
+  /// This field may be omitted if and only if the schema contains no Fields with
+  /// a variable number of buffers, such as BinaryView and Utf8View.
+  const ::flatbuffers::Vector<int64_t>* variadicBufferCounts() const
+  {
+    return GetPointer<const ::flatbuffers::Vector<int64_t>*>(VT_VARIADICBUFFERCOUNTS);
+  }
+  bool Verify(::flatbuffers::Verifier& verifier) const
+  {
+    return VerifyTableStart(verifier) && VerifyField<int64_t>(verifier, VT_LENGTH, 8) &&
+           VerifyOffset(verifier, VT_NODES) && verifier.VerifyVector(nodes()) &&
+           VerifyOffset(verifier, VT_BUFFERS) && verifier.VerifyVector(buffers()) &&
+           VerifyOffset(verifier, VT_COMPRESSION) && verifier.VerifyTable(compression()) &&
+           VerifyOffset(verifier, VT_VARIADICBUFFERCOUNTS) &&
+           verifier.VerifyVector(variadicBufferCounts()) && verifier.EndTable();
+  }
+};
+
+struct RecordBatchBuilder {
+  typedef RecordBatch Table;
+  ::flatbuffers::FlatBufferBuilder& fbb_;
+  ::flatbuffers::uoffset_t start_;
+  void add_length(int64_t length) { fbb_.AddElement<int64_t>(RecordBatch::VT_LENGTH, length, 0); }
+  void add_nodes(
+    ::flatbuffers::Offset<::flatbuffers::Vector<const cudf::io::parquet::flatbuf::FieldNode*>>
+      nodes)
+  {
+    fbb_.AddOffset(RecordBatch::VT_NODES, nodes);
+  }
+  void add_buffers(
+    ::flatbuffers::Offset<::flatbuffers::Vector<const cudf::io::parquet::flatbuf::Buffer*>> buffers)
+  {
+    fbb_.AddOffset(RecordBatch::VT_BUFFERS, buffers);
+  }
+  void add_compression(
+    ::flatbuffers::Offset<cudf::io::parquet::flatbuf::BodyCompression> compression)
+  {
+    fbb_.AddOffset(RecordBatch::VT_COMPRESSION, compression);
+  }
+  void add_variadicBufferCounts(
+    ::flatbuffers::Offset<::flatbuffers::Vector<int64_t>> variadicBufferCounts)
+  {
+    fbb_.AddOffset(RecordBatch::VT_VARIADICBUFFERCOUNTS, variadicBufferCounts);
+  }
+  explicit RecordBatchBuilder(::flatbuffers::FlatBufferBuilder& _fbb) : fbb_(_fbb)
+  {
+    start_ = fbb_.StartTable();
+  }
+  ::flatbuffers::Offset<RecordBatch> Finish()
+  {
+    const auto end = fbb_.EndTable(start_);
+    auto o         = ::flatbuffers::Offset<RecordBatch>(end);
+    return o;
+  }
+};
+
+inline ::flatbuffers::Offset<RecordBatch> CreateRecordBatch(
+  ::flatbuffers::FlatBufferBuilder& _fbb,
+  int64_t length = 0,
+  ::flatbuffers::Offset<::flatbuffers::Vector<const cudf::io::parquet::flatbuf::FieldNode*>> nodes =
+    0,
+  ::flatbuffers::Offset<::flatbuffers::Vector<const cudf::io::parquet::flatbuf::Buffer*>> buffers =
+    0,
+  ::flatbuffers::Offset<cudf::io::parquet::flatbuf::BodyCompression> compression = 0,
+  ::flatbuffers::Offset<::flatbuffers::Vector<int64_t>> variadicBufferCounts     = 0)
+{
+  RecordBatchBuilder builder_(_fbb);
+  builder_.add_length(length);
+  builder_.add_variadicBufferCounts(variadicBufferCounts);
+  builder_.add_compression(compression);
+  builder_.add_buffers(buffers);
+  builder_.add_nodes(nodes);
+  return builder_.Finish();
+}
+
+inline ::flatbuffers::Offset<RecordBatch> CreateRecordBatchDirect(
+  ::flatbuffers::FlatBufferBuilder& _fbb,
+  int64_t length                                                                 = 0,
+  const std::vector<cudf::io::parquet::flatbuf::FieldNode>* nodes                = nullptr,
+  const std::vector<cudf::io::parquet::flatbuf::Buffer>* buffers                 = nullptr,
+  ::flatbuffers::Offset<cudf::io::parquet::flatbuf::BodyCompression> compression = 0,
+  const std::vector<int64_t>* variadicBufferCounts                               = nullptr)
+{
+  auto nodes__ =
+    nodes ? _fbb.CreateVectorOfStructs<cudf::io::parquet::flatbuf::FieldNode>(*nodes) : 0;
+  auto buffers__ =
+    buffers ? _fbb.CreateVectorOfStructs<cudf::io::parquet::flatbuf::Buffer>(*buffers) : 0;
+  auto variadicBufferCounts__ =
+    variadicBufferCounts ? _fbb.CreateVector<int64_t>(*variadicBufferCounts) : 0;
+  return cudf::io::parquet::flatbuf::CreateRecordBatch(
+    _fbb, length, nodes__, buffers__, compression, variadicBufferCounts__);
+}
+
+/// For sending dictionary encoding information. Any Field can be
+/// dictionary-encoded, but in this case none of its children may be
+/// dictionary-encoded.
+/// There is one vector / column per dictionary, but that vector / column
+/// may be spread across multiple dictionary batches by using the isDelta
+/// flag
+struct DictionaryBatch FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table {
+  typedef DictionaryBatchBuilder Builder;
+  enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
+    VT_ID      = 4,
+    VT_DATA    = 6,
+    VT_ISDELTA = 8
+  };
+  int64_t id() const { return GetField<int64_t>(VT_ID, 0); }
+  const cudf::io::parquet::flatbuf::RecordBatch* data() const
+  {
+    return GetPointer<const cudf::io::parquet::flatbuf::RecordBatch*>(VT_DATA);
+  }
+  /// If isDelta is true the values in the dictionary are to be appended to a
+  /// dictionary with the indicated id. If isDelta is false this dictionary
+  /// should replace the existing dictionary.
+  bool isDelta() const { return GetField<uint8_t>(VT_ISDELTA, 0) != 0; }
+  bool Verify(::flatbuffers::Verifier& verifier) const
+  {
+    return VerifyTableStart(verifier) && VerifyField<int64_t>(verifier, VT_ID, 8) &&
+           VerifyOffset(verifier, VT_DATA) && verifier.VerifyTable(data()) &&
+           VerifyField<uint8_t>(verifier, VT_ISDELTA, 1) && verifier.EndTable();
+  }
+};
+
+struct DictionaryBatchBuilder {
+  typedef DictionaryBatch Table;
+  ::flatbuffers::FlatBufferBuilder& fbb_;
+  ::flatbuffers::uoffset_t start_;
+  void add_id(int64_t id) { fbb_.AddElement<int64_t>(DictionaryBatch::VT_ID, id, 0); }
+  void add_data(::flatbuffers::Offset<cudf::io::parquet::flatbuf::RecordBatch> data)
+  {
+    fbb_.AddOffset(DictionaryBatch::VT_DATA, data);
+  }
+  void add_isDelta(bool isDelta)
+  {
+    fbb_.AddElement<uint8_t>(DictionaryBatch::VT_ISDELTA, static_cast<uint8_t>(isDelta), 0);
+  }
+  explicit DictionaryBatchBuilder(::flatbuffers::FlatBufferBuilder& _fbb) : fbb_(_fbb)
+  {
+    start_ = fbb_.StartTable();
+  }
+  ::flatbuffers::Offset<DictionaryBatch> Finish()
+  {
+    const auto end = fbb_.EndTable(start_);
+    auto o         = ::flatbuffers::Offset<DictionaryBatch>(end);
+    return o;
+  }
+};
+
+inline ::flatbuffers::Offset<DictionaryBatch> CreateDictionaryBatch(
+  ::flatbuffers::FlatBufferBuilder& _fbb,
+  int64_t id                                                          = 0,
+  ::flatbuffers::Offset<cudf::io::parquet::flatbuf::RecordBatch> data = 0,
+  bool isDelta                                                        = false)
+{
+  DictionaryBatchBuilder builder_(_fbb);
+  builder_.add_id(id);
+  builder_.add_data(data);
+  builder_.add_isDelta(isDelta);
+  return builder_.Finish();
+}
+
+struct Message FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table {
+  typedef MessageBuilder Builder;
+  enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
+    VT_VERSION         = 4,
+    VT_HEADER_TYPE     = 6,
+    VT_HEADER          = 8,
+    VT_BODYLENGTH      = 10,
+    VT_CUSTOM_METADATA = 12
+  };
+  cudf::io::parquet::flatbuf::MetadataVersion version() const
+  {
+    return static_cast<cudf::io::parquet::flatbuf::MetadataVersion>(
+      GetField<int16_t>(VT_VERSION, 0));
+  }
+  cudf::io::parquet::flatbuf::MessageHeader header_type() const
+  {
+    return static_cast<cudf::io::parquet::flatbuf::MessageHeader>(
+      GetField<uint8_t>(VT_HEADER_TYPE, 0));
+  }
+  const void* header() const { return GetPointer<const void*>(VT_HEADER); }
+  template <typename T>
+  const T* header_as() const;
+  const cudf::io::parquet::flatbuf::Schema* header_as_Schema() const
+  {
+    return header_type() == cudf::io::parquet::flatbuf::MessageHeader_Schema
+             ? static_cast<const cudf::io::parquet::flatbuf::Schema*>(header())
+             : nullptr;
+  }
+  int64_t bodyLength() const { return GetField<int64_t>(VT_BODYLENGTH, 0); }
+  const ::flatbuffers::Vector<::flatbuffers::Offset<cudf::io::parquet::flatbuf::KeyValue>>*
+  custom_metadata() const
+  {
+    return GetPointer<
+      const ::flatbuffers::Vector<::flatbuffers::Offset<cudf::io::parquet::flatbuf::KeyValue>>*>(
+      VT_CUSTOM_METADATA);
+  }
+  bool Verify(::flatbuffers::Verifier& verifier) const
+  {
+    return VerifyTableStart(verifier) && VerifyField<int16_t>(verifier, VT_VERSION, 2) &&
+           VerifyField<uint8_t>(verifier, VT_HEADER_TYPE, 1) && VerifyOffset(verifier, VT_HEADER) &&
+           VerifyMessageHeader(verifier, header(), header_type()) &&
+           VerifyField<int64_t>(verifier, VT_BODYLENGTH, 8) &&
+           VerifyOffset(verifier, VT_CUSTOM_METADATA) && verifier.VerifyVector(custom_metadata()) &&
+           verifier.VerifyVectorOfTables(custom_metadata()) && verifier.EndTable();
+  }
+};
+
+template <>
+inline const cudf::io::parquet::flatbuf::Schema*
+Message::header_as<cudf::io::parquet::flatbuf::Schema>() const
+{
+  return header_as_Schema();
+}
+
+struct MessageBuilder {
+  typedef Message Table;
+  ::flatbuffers::FlatBufferBuilder& fbb_;
+  ::flatbuffers::uoffset_t start_;
+  void add_version(cudf::io::parquet::flatbuf::MetadataVersion version)
+  {
+    fbb_.AddElement<int16_t>(Message::VT_VERSION, static_cast<int16_t>(version), 0);
+  }
+  void add_header_type(cudf::io::parquet::flatbuf::MessageHeader header_type)
+  {
+    fbb_.AddElement<uint8_t>(Message::VT_HEADER_TYPE, static_cast<uint8_t>(header_type), 0);
+  }
+  void add_header(::flatbuffers::Offset<void> header)
+  {
+    fbb_.AddOffset(Message::VT_HEADER, header);
+  }
+  void add_bodyLength(int64_t bodyLength)
+  {
+    fbb_.AddElement<int64_t>(Message::VT_BODYLENGTH, bodyLength, 0);
+  }
+  void add_custom_metadata(
+    ::flatbuffers::Offset<
+      ::flatbuffers::Vector<::flatbuffers::Offset<cudf::io::parquet::flatbuf::KeyValue>>>
+      custom_metadata)
+  {
+    fbb_.AddOffset(Message::VT_CUSTOM_METADATA, custom_metadata);
+  }
+  explicit MessageBuilder(::flatbuffers::FlatBufferBuilder& _fbb) : fbb_(_fbb)
+  {
+    start_ = fbb_.StartTable();
+  }
+  ::flatbuffers::Offset<Message> Finish()
+  {
+    const auto end = fbb_.EndTable(start_);
+    auto o         = ::flatbuffers::Offset<Message>(end);
+    return o;
+  }
+};
+
+inline ::flatbuffers::Offset<Message> CreateMessage(
+  ::flatbuffers::FlatBufferBuilder& _fbb,
+  cudf::io::parquet::flatbuf::MetadataVersion version =
+    cudf::io::parquet::flatbuf::MetadataVersion_V1,
+  cudf::io::parquet::flatbuf::MessageHeader header_type =
+    cudf::io::parquet::flatbuf::MessageHeader_NONE,
+  ::flatbuffers::Offset<void> header                                              = 0,
+  int64_t bodyLength                                                              = 0,
+  ::flatbuffers::Offset<::flatbuffers::Vector<
+    ::flatbuffers::Offset<cudf::io::parquet::flatbuf::KeyValue>>> custom_metadata = 0)
+{
+  MessageBuilder builder_(_fbb);
+  builder_.add_bodyLength(bodyLength);
+  builder_.add_custom_metadata(custom_metadata);
+  builder_.add_header(header);
+  builder_.add_version(version);
+  builder_.add_header_type(header_type);
+  return builder_.Finish();
+}
+
+inline ::flatbuffers::Offset<Message> CreateMessageDirect(
+  ::flatbuffers::FlatBufferBuilder& _fbb,
+  cudf::io::parquet::flatbuf::MetadataVersion version =
+    cudf::io::parquet::flatbuf::MetadataVersion_V1,
+  cudf::io::parquet::flatbuf::MessageHeader header_type =
+    cudf::io::parquet::flatbuf::MessageHeader_NONE,
+  ::flatbuffers::Offset<void> header = 0,
+  int64_t bodyLength                 = 0,
+  const std::vector<::flatbuffers::Offset<cudf::io::parquet::flatbuf::KeyValue>>* custom_metadata =
+    nullptr)
+{
+  auto custom_metadata__ =
+    custom_metadata
+      ? _fbb.CreateVector<::flatbuffers::Offset<cudf::io::parquet::flatbuf::KeyValue>>(
+          *custom_metadata)
+      : 0;
+  return cudf::io::parquet::flatbuf::CreateMessage(
+    _fbb, version, header_type, header, bodyLength, custom_metadata__);
+}
+
+inline bool VerifyMessageHeader(::flatbuffers::Verifier& verifier,
+                                const void* obj,
+                                MessageHeader type)
+{
+  switch (type) {
+    case MessageHeader_NONE: {
+      return true;
+    }
+    case MessageHeader_Schema: {
+      auto ptr = reinterpret_cast<const cudf::io::parquet::flatbuf::Schema*>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    default: return true;
+  }
+}
+
+inline bool VerifyMessageHeaderVector(
+  ::flatbuffers::Verifier& verifier,
+  const ::flatbuffers::Vector<::flatbuffers::Offset<void>>* values,
+  const ::flatbuffers::Vector<uint8_t>* types)
+{
+  if (!values || !types) return !values && !types;
+  if (values->size() != types->size()) return false;
+  for (::flatbuffers::uoffset_t i = 0; i < values->size(); ++i) {
+    if (!VerifyMessageHeader(verifier, values->Get(i), types->GetEnum<MessageHeader>(i))) {
+      return false;
+    }
+  }
+  return true;
+}
+
+inline const cudf::io::parquet::flatbuf::Message* GetMessage(const void* buf)
+{
+  return ::flatbuffers::GetRoot<cudf::io::parquet::flatbuf::Message>(buf);
+}
+
+inline const cudf::io::parquet::flatbuf::Message* GetSizePrefixedMessage(const void* buf)
+{
+  return ::flatbuffers::GetSizePrefixedRoot<cudf::io::parquet::flatbuf::Message>(buf);
+}
+
+inline bool VerifyMessageBuffer(::flatbuffers::Verifier& verifier)
+{
+  return verifier.VerifyBuffer<cudf::io::parquet::flatbuf::Message>(nullptr);
+}
+
+inline bool VerifySizePrefixedMessageBuffer(::flatbuffers::Verifier& verifier)
+{
+  return verifier.VerifySizePrefixedBuffer<cudf::io::parquet::flatbuf::Message>(nullptr);
+}
+
+inline void FinishMessageBuffer(::flatbuffers::FlatBufferBuilder& fbb,
+                                ::flatbuffers::Offset<cudf::io::parquet::flatbuf::Message> root)
+{
+  fbb.Finish(root);
+}
+
+inline void FinishSizePrefixedMessageBuffer(
+  ::flatbuffers::FlatBufferBuilder& fbb,
+  ::flatbuffers::Offset<cudf::io::parquet::flatbuf::Message> root)
+{
+  fbb.FinishSizePrefixed(root);
+}
+
+}  // namespace flatbuf
+}  // namespace parquet
+}  // namespace io
+}  // namespace cudf
+
+#endif  // FLATBUFFERS_GENERATED_MESSAGE_CUDF_IO_PARQUET_FLATBUF_H_
diff --git a/cpp/src/io/parquet/ipc/Schema_generated.h b/cpp/src/io/parquet/ipc/Schema_generated.h
new file mode 100644
index 00000000000..27141b4af31
--- /dev/null
+++ b/cpp/src/io/parquet/ipc/Schema_generated.h
@@ -0,0 +1,2769 @@
+// automatically generated by the FlatBuffers compiler, do not modify
+
+#ifndef FLATBUFFERS_GENERATED_SCHEMA_CUDF_IO_PARQUET_FLATBUF_H_
+#define FLATBUFFERS_GENERATED_SCHEMA_CUDF_IO_PARQUET_FLATBUF_H_
+
+#include <flatbuffers/flatbuffers.h>
+
+// Ensure the included flatbuffers.h is the same version as when this file was
+// generated, otherwise it may not be compatible.
+static_assert(FLATBUFFERS_VERSION_MAJOR == 24 && FLATBUFFERS_VERSION_MINOR == 3 &&
+                FLATBUFFERS_VERSION_REVISION == 25,
+              "Non-compatible flatbuffers version included");
+
+namespace cudf {
+namespace io {
+namespace parquet {
+namespace flatbuf {
+
+struct Null;
+struct NullBuilder;
+
+struct Struct_;
+struct Struct_Builder;
+
+struct List;
+struct ListBuilder;
+
+struct LargeList;
+struct LargeListBuilder;
+
+struct ListView;
+struct ListViewBuilder;
+
+struct LargeListView;
+struct LargeListViewBuilder;
+
+struct FixedSizeList;
+struct FixedSizeListBuilder;
+
+struct Map;
+struct MapBuilder;
+
+struct Union;
+struct UnionBuilder;
+
+struct Int;
+struct IntBuilder;
+
+struct FloatingPoint;
+struct FloatingPointBuilder;
+
+struct Utf8;
+struct Utf8Builder;
+
+struct Binary;
+struct BinaryBuilder;
+
+struct LargeUtf8;
+struct LargeUtf8Builder;
+
+struct LargeBinary;
+struct LargeBinaryBuilder;
+
+struct Utf8View;
+struct Utf8ViewBuilder;
+
+struct BinaryView;
+struct BinaryViewBuilder;
+
+struct FixedSizeBinary;
+struct FixedSizeBinaryBuilder;
+
+struct Bool;
+struct BoolBuilder;
+
+struct RunEndEncoded;
+struct RunEndEncodedBuilder;
+
+struct Decimal;
+struct DecimalBuilder;
+
+struct Date;
+struct DateBuilder;
+
+struct Time;
+struct TimeBuilder;
+
+struct Timestamp;
+struct TimestampBuilder;
+
+struct Interval;
+struct IntervalBuilder;
+
+struct Duration;
+struct DurationBuilder;
+
+struct KeyValue;
+struct KeyValueBuilder;
+
+struct DictionaryEncoding;
+struct DictionaryEncodingBuilder;
+
+struct Field;
+struct FieldBuilder;
+
+struct Buffer;
+
+struct Schema;
+struct SchemaBuilder;
+
+enum MetadataVersion : int16_t {
+  /// 0.1.0 (October 2016).
+  MetadataVersion_V1 = 0,
+  /// 0.2.0 (February 2017). Non-backwards compatible with V1.
+  MetadataVersion_V2 = 1,
+  /// 0.3.0 -> 0.7.1 (May - December 2017). Non-backwards compatible with V2.
+  MetadataVersion_V3 = 2,
+  /// >= 0.8.0 (December 2017). Non-backwards compatible with V3.
+  MetadataVersion_V4 = 3,
+  /// >= 1.0.0 (July 2020). Backwards compatible with V4 (V5 readers can read V4
+  /// metadata and IPC messages). Implementations are recommended to provide a
+  /// V4 compatibility mode with V5 format changes disabled.
+  ///
+  /// Incompatible changes between V4 and V5:
+  /// - Union buffer layout has changed. In V5, Unions don't have a validity
+  ///   bitmap buffer.
+  MetadataVersion_V5  = 4,
+  MetadataVersion_MIN = MetadataVersion_V1,
+  MetadataVersion_MAX = MetadataVersion_V5
+};
+
+inline const MetadataVersion (&EnumValuesMetadataVersion())[5]
+{
+  static const MetadataVersion values[] = {MetadataVersion_V1,
+                                           MetadataVersion_V2,
+                                           MetadataVersion_V3,
+                                           MetadataVersion_V4,
+                                           MetadataVersion_V5};
+  return values;
+}
+
+inline const char* const* EnumNamesMetadataVersion()
+{
+  static const char* const names[6] = {"V1", "V2", "V3", "V4", "V5", nullptr};
+  return names;
+}
+
+inline const char* EnumNameMetadataVersion(MetadataVersion e)
+{
+  if (::flatbuffers::IsOutRange(e, MetadataVersion_V1, MetadataVersion_V5)) return "";
+  const size_t index = static_cast<size_t>(e);
+  return EnumNamesMetadataVersion()[index];
+}
+
+/// Represents Arrow Features that might not have full support
+/// within implementations. This is intended to be used in
+/// two scenarios:
+///  1.  A mechanism for readers of Arrow Streams
+///      and files to understand that the stream or file makes
+///      use of a feature that isn't supported or unknown to
+///      the implementation (and therefore can meet the Arrow
+///      forward compatibility guarantees).
+///  2.  A means of negotiating between a client and server
+///      what features a stream is allowed to use. The enums
+///      values here are intended to represent higher level
+///      features, additional details maybe negotiated
+///      with key-value pairs specific to the protocol.
+///
+/// Enums added to this list should be assigned power-of-two values
+/// to facilitate exchanging and comparing bitmaps for supported
+/// features.
+enum Feature : int64_t {
+  /// Needed to make flatbuffers happy.
+  Feature_UNUSED = 0,
+  /// The stream makes use of multiple full dictionaries with the
+  /// same ID and assumes clients implement dictionary replacement
+  /// correctly.
+  Feature_DICTIONARY_REPLACEMENT = 1LL,
+  /// The stream makes use of compressed bodies as described
+  /// in Message.fbs.
+  Feature_COMPRESSED_BODY = 2LL,
+  Feature_MIN             = Feature_UNUSED,
+  Feature_MAX             = Feature_COMPRESSED_BODY
+};
+
+inline const Feature (&EnumValuesFeature())[3]
+{
+  static const Feature values[] = {
+    Feature_UNUSED, Feature_DICTIONARY_REPLACEMENT, Feature_COMPRESSED_BODY};
+  return values;
+}
+
+inline const char* const* EnumNamesFeature()
+{
+  static const char* const names[4] = {
+    "UNUSED", "DICTIONARY_REPLACEMENT", "COMPRESSED_BODY", nullptr};
+  return names;
+}
+
+inline const char* EnumNameFeature(Feature e)
+{
+  if (::flatbuffers::IsOutRange(e, Feature_UNUSED, Feature_COMPRESSED_BODY)) return "";
+  const size_t index = static_cast<size_t>(e);
+  return EnumNamesFeature()[index];
+}
+
+enum UnionMode : int16_t {
+  UnionMode_Sparse = 0,
+  UnionMode_Dense  = 1,
+  UnionMode_MIN    = UnionMode_Sparse,
+  UnionMode_MAX    = UnionMode_Dense
+};
+
+inline const UnionMode (&EnumValuesUnionMode())[2]
+{
+  static const UnionMode values[] = {UnionMode_Sparse, UnionMode_Dense};
+  return values;
+}
+
+inline const char* const* EnumNamesUnionMode()
+{
+  static const char* const names[3] = {"Sparse", "Dense", nullptr};
+  return names;
+}
+
+inline const char* EnumNameUnionMode(UnionMode e)
+{
+  if (::flatbuffers::IsOutRange(e, UnionMode_Sparse, UnionMode_Dense)) return "";
+  const size_t index = static_cast<size_t>(e);
+  return EnumNamesUnionMode()[index];
+}
+
+enum Precision : int16_t {
+  Precision_HALF   = 0,
+  Precision_SINGLE = 1,
+  Precision_DOUBLE = 2,
+  Precision_MIN    = Precision_HALF,
+  Precision_MAX    = Precision_DOUBLE
+};
+
+inline const Precision (&EnumValuesPrecision())[3]
+{
+  static const Precision values[] = {Precision_HALF, Precision_SINGLE, Precision_DOUBLE};
+  return values;
+}
+
+inline const char* const* EnumNamesPrecision()
+{
+  static const char* const names[4] = {"HALF", "SINGLE", "DOUBLE", nullptr};
+  return names;
+}
+
+inline const char* EnumNamePrecision(Precision e)
+{
+  if (::flatbuffers::IsOutRange(e, Precision_HALF, Precision_DOUBLE)) return "";
+  const size_t index = static_cast<size_t>(e);
+  return EnumNamesPrecision()[index];
+}
+
+enum DateUnit : int16_t {
+  DateUnit_DAY         = 0,
+  DateUnit_MILLISECOND = 1,
+  DateUnit_MIN         = DateUnit_DAY,
+  DateUnit_MAX         = DateUnit_MILLISECOND
+};
+
+inline const DateUnit (&EnumValuesDateUnit())[2]
+{
+  static const DateUnit values[] = {DateUnit_DAY, DateUnit_MILLISECOND};
+  return values;
+}
+
+inline const char* const* EnumNamesDateUnit()
+{
+  static const char* const names[3] = {"DAY", "MILLISECOND", nullptr};
+  return names;
+}
+
+inline const char* EnumNameDateUnit(DateUnit e)
+{
+  if (::flatbuffers::IsOutRange(e, DateUnit_DAY, DateUnit_MILLISECOND)) return "";
+  const size_t index = static_cast<size_t>(e);
+  return EnumNamesDateUnit()[index];
+}
+
+enum TimeUnit : int16_t {
+  TimeUnit_SECOND      = 0,
+  TimeUnit_MILLISECOND = 1,
+  TimeUnit_MICROSECOND = 2,
+  TimeUnit_NANOSECOND  = 3,
+  TimeUnit_MIN         = TimeUnit_SECOND,
+  TimeUnit_MAX         = TimeUnit_NANOSECOND
+};
+
+inline const TimeUnit (&EnumValuesTimeUnit())[4]
+{
+  static const TimeUnit values[] = {
+    TimeUnit_SECOND, TimeUnit_MILLISECOND, TimeUnit_MICROSECOND, TimeUnit_NANOSECOND};
+  return values;
+}
+
+inline const char* const* EnumNamesTimeUnit()
+{
+  static const char* const names[5] = {
+    "SECOND", "MILLISECOND", "MICROSECOND", "NANOSECOND", nullptr};
+  return names;
+}
+
+inline const char* EnumNameTimeUnit(TimeUnit e)
+{
+  if (::flatbuffers::IsOutRange(e, TimeUnit_SECOND, TimeUnit_NANOSECOND)) return "";
+  const size_t index = static_cast<size_t>(e);
+  return EnumNamesTimeUnit()[index];
+}
+
+enum IntervalUnit : int16_t {
+  IntervalUnit_YEAR_MONTH     = 0,
+  IntervalUnit_DAY_TIME       = 1,
+  IntervalUnit_MONTH_DAY_NANO = 2,
+  IntervalUnit_MIN            = IntervalUnit_YEAR_MONTH,
+  IntervalUnit_MAX            = IntervalUnit_MONTH_DAY_NANO
+};
+
+inline const IntervalUnit (&EnumValuesIntervalUnit())[3]
+{
+  static const IntervalUnit values[] = {
+    IntervalUnit_YEAR_MONTH, IntervalUnit_DAY_TIME, IntervalUnit_MONTH_DAY_NANO};
+  return values;
+}
+
+inline const char* const* EnumNamesIntervalUnit()
+{
+  static const char* const names[4] = {"YEAR_MONTH", "DAY_TIME", "MONTH_DAY_NANO", nullptr};
+  return names;
+}
+
+inline const char* EnumNameIntervalUnit(IntervalUnit e)
+{
+  if (::flatbuffers::IsOutRange(e, IntervalUnit_YEAR_MONTH, IntervalUnit_MONTH_DAY_NANO)) return "";
+  const size_t index = static_cast<size_t>(e);
+  return EnumNamesIntervalUnit()[index];
+}
+
+/// ----------------------------------------------------------------------
+/// Top-level Type value, enabling extensible type-specific metadata. We can
+/// add new logical types to Type without breaking backwards compatibility
+enum Type : uint8_t {
+  Type_NONE            = 0,
+  Type_Null            = 1,
+  Type_Int             = 2,
+  Type_FloatingPoint   = 3,
+  Type_Binary          = 4,
+  Type_Utf8            = 5,
+  Type_Bool            = 6,
+  Type_Decimal         = 7,
+  Type_Date            = 8,
+  Type_Time            = 9,
+  Type_Timestamp       = 10,
+  Type_Interval        = 11,
+  Type_List            = 12,
+  Type_Struct_         = 13,
+  Type_Union           = 14,
+  Type_FixedSizeBinary = 15,
+  Type_FixedSizeList   = 16,
+  Type_Map             = 17,
+  Type_Duration        = 18,
+  Type_LargeBinary     = 19,
+  Type_LargeUtf8       = 20,
+  Type_LargeList       = 21,
+  Type_RunEndEncoded   = 22,
+  Type_BinaryView      = 23,
+  Type_Utf8View        = 24,
+  Type_ListView        = 25,
+  Type_LargeListView   = 26,
+  Type_MIN             = Type_NONE,
+  Type_MAX             = Type_LargeListView
+};
+
+inline const Type (&EnumValuesType())[27]
+{
+  static const Type values[] = {
+    Type_NONE,          Type_Null,      Type_Int,           Type_FloatingPoint,
+    Type_Binary,        Type_Utf8,      Type_Bool,          Type_Decimal,
+    Type_Date,          Type_Time,      Type_Timestamp,     Type_Interval,
+    Type_List,          Type_Struct_,   Type_Union,         Type_FixedSizeBinary,
+    Type_FixedSizeList, Type_Map,       Type_Duration,      Type_LargeBinary,
+    Type_LargeUtf8,     Type_LargeList, Type_RunEndEncoded, Type_BinaryView,
+    Type_Utf8View,      Type_ListView,  Type_LargeListView};
+  return values;
+}
+
+inline const char* const* EnumNamesType()
+{
+  static const char* const names[28] = {
+    "NONE",          "Null",      "Int",           "FloatingPoint",
+    "Binary",        "Utf8",      "Bool",          "Decimal",
+    "Date",          "Time",      "Timestamp",     "Interval",
+    "List",          "Struct_",   "Union",         "FixedSizeBinary",
+    "FixedSizeList", "Map",       "Duration",      "LargeBinary",
+    "LargeUtf8",     "LargeList", "RunEndEncoded", "BinaryView",
+    "Utf8View",      "ListView",  "LargeListView", nullptr};
+  return names;
+}
+
+inline const char* EnumNameType(Type e)
+{
+  if (::flatbuffers::IsOutRange(e, Type_NONE, Type_LargeListView)) return "";
+  const size_t index = static_cast<size_t>(e);
+  return EnumNamesType()[index];
+}
+
+template <typename T>
+struct TypeTraits {
+  static const Type enum_value = Type_NONE;
+};
+
+template <>
+struct TypeTraits<cudf::io::parquet::flatbuf::Null> {
+  static const Type enum_value = Type_Null;
+};
+
+template <>
+struct TypeTraits<cudf::io::parquet::flatbuf::Int> {
+  static const Type enum_value = Type_Int;
+};
+
+template <>
+struct TypeTraits<cudf::io::parquet::flatbuf::FloatingPoint> {
+  static const Type enum_value = Type_FloatingPoint;
+};
+
+template <>
+struct TypeTraits<cudf::io::parquet::flatbuf::Binary> {
+  static const Type enum_value = Type_Binary;
+};
+
+template <>
+struct TypeTraits<cudf::io::parquet::flatbuf::Utf8> {
+  static const Type enum_value = Type_Utf8;
+};
+
+template <>
+struct TypeTraits<cudf::io::parquet::flatbuf::Bool> {
+  static const Type enum_value = Type_Bool;
+};
+
+template <>
+struct TypeTraits<cudf::io::parquet::flatbuf::Decimal> {
+  static const Type enum_value = Type_Decimal;
+};
+
+template <>
+struct TypeTraits<cudf::io::parquet::flatbuf::Date> {
+  static const Type enum_value = Type_Date;
+};
+
+template <>
+struct TypeTraits<cudf::io::parquet::flatbuf::Time> {
+  static const Type enum_value = Type_Time;
+};
+
+template <>
+struct TypeTraits<cudf::io::parquet::flatbuf::Timestamp> {
+  static const Type enum_value = Type_Timestamp;
+};
+
+template <>
+struct TypeTraits<cudf::io::parquet::flatbuf::Interval> {
+  static const Type enum_value = Type_Interval;
+};
+
+template <>
+struct TypeTraits<cudf::io::parquet::flatbuf::List> {
+  static const Type enum_value = Type_List;
+};
+
+template <>
+struct TypeTraits<cudf::io::parquet::flatbuf::Struct_> {
+  static const Type enum_value = Type_Struct_;
+};
+
+template <>
+struct TypeTraits<cudf::io::parquet::flatbuf::Union> {
+  static const Type enum_value = Type_Union;
+};
+
+template <>
+struct TypeTraits<cudf::io::parquet::flatbuf::FixedSizeBinary> {
+  static const Type enum_value = Type_FixedSizeBinary;
+};
+
+template <>
+struct TypeTraits<cudf::io::parquet::flatbuf::FixedSizeList> {
+  static const Type enum_value = Type_FixedSizeList;
+};
+
+template <>
+struct TypeTraits<cudf::io::parquet::flatbuf::Map> {
+  static const Type enum_value = Type_Map;
+};
+
+template <>
+struct TypeTraits<cudf::io::parquet::flatbuf::Duration> {
+  static const Type enum_value = Type_Duration;
+};
+
+template <>
+struct TypeTraits<cudf::io::parquet::flatbuf::LargeBinary> {
+  static const Type enum_value = Type_LargeBinary;
+};
+
+template <>
+struct TypeTraits<cudf::io::parquet::flatbuf::LargeUtf8> {
+  static const Type enum_value = Type_LargeUtf8;
+};
+
+template <>
+struct TypeTraits<cudf::io::parquet::flatbuf::LargeList> {
+  static const Type enum_value = Type_LargeList;
+};
+
+template <>
+struct TypeTraits<cudf::io::parquet::flatbuf::RunEndEncoded> {
+  static const Type enum_value = Type_RunEndEncoded;
+};
+
+template <>
+struct TypeTraits<cudf::io::parquet::flatbuf::BinaryView> {
+  static const Type enum_value = Type_BinaryView;
+};
+
+template <>
+struct TypeTraits<cudf::io::parquet::flatbuf::Utf8View> {
+  static const Type enum_value = Type_Utf8View;
+};
+
+template <>
+struct TypeTraits<cudf::io::parquet::flatbuf::ListView> {
+  static const Type enum_value = Type_ListView;
+};
+
+template <>
+struct TypeTraits<cudf::io::parquet::flatbuf::LargeListView> {
+  static const Type enum_value = Type_LargeListView;
+};
+
+bool VerifyType(::flatbuffers::Verifier& verifier, const void* obj, Type type);
+bool VerifyTypeVector(::flatbuffers::Verifier& verifier,
+                      const ::flatbuffers::Vector<::flatbuffers::Offset<void>>* values,
+                      const ::flatbuffers::Vector<uint8_t>* types);
+
+/// ----------------------------------------------------------------------
+/// Dictionary encoding metadata
+/// Maintained for forwards compatibility, in the future
+/// Dictionaries might be explicit maps between integers and values
+/// allowing for non-contiguous index values
+enum DictionaryKind : int16_t {
+  DictionaryKind_DenseArray = 0,
+  DictionaryKind_MIN        = DictionaryKind_DenseArray,
+  DictionaryKind_MAX        = DictionaryKind_DenseArray
+};
+
+inline const DictionaryKind (&EnumValuesDictionaryKind())[1]
+{
+  static const DictionaryKind values[] = {DictionaryKind_DenseArray};
+  return values;
+}
+
+inline const char* const* EnumNamesDictionaryKind()
+{
+  static const char* const names[2] = {"DenseArray", nullptr};
+  return names;
+}
+
+inline const char* EnumNameDictionaryKind(DictionaryKind e)
+{
+  if (::flatbuffers::IsOutRange(e, DictionaryKind_DenseArray, DictionaryKind_DenseArray)) return "";
+  const size_t index = static_cast<size_t>(e);
+  return EnumNamesDictionaryKind()[index];
+}
+
+/// ----------------------------------------------------------------------
+/// Endianness of the platform producing the data
+enum Endianness : int16_t {
+  Endianness_Little = 0,
+  Endianness_Big    = 1,
+  Endianness_MIN    = Endianness_Little,
+  Endianness_MAX    = Endianness_Big
+};
+
+inline const Endianness (&EnumValuesEndianness())[2]
+{
+  static const Endianness values[] = {Endianness_Little, Endianness_Big};
+  return values;
+}
+
+inline const char* const* EnumNamesEndianness()
+{
+  static const char* const names[3] = {"Little", "Big", nullptr};
+  return names;
+}
+
+inline const char* EnumNameEndianness(Endianness e)
+{
+  if (::flatbuffers::IsOutRange(e, Endianness_Little, Endianness_Big)) return "";
+  const size_t index = static_cast<size_t>(e);
+  return EnumNamesEndianness()[index];
+}
+
+/// ----------------------------------------------------------------------
+/// A Buffer represents a single contiguous memory segment
+FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(8) Buffer FLATBUFFERS_FINAL_CLASS
+{
+ private:
+  int64_t offset_;
+  int64_t length_;
+
+ public:
+  Buffer() : offset_(0), length_(0) {}
+  Buffer(int64_t _offset, int64_t _length)
+    : offset_(::flatbuffers::EndianScalar(_offset)), length_(::flatbuffers::EndianScalar(_length))
+  {
+  }
+  /// The relative offset into the shared memory page where the bytes for this
+  /// buffer starts
+  int64_t offset() const { return ::flatbuffers::EndianScalar(offset_); }
+  /// The absolute length (in bytes) of the memory buffer. The memory is found
+  /// from offset (inclusive) to offset + length (non-inclusive). When building
+  /// messages using the encapsulated IPC message, padding bytes may be written
+  /// after a buffer, but such padding bytes do not need to be accounted for in
+  /// the size here.
+  int64_t length() const { return ::flatbuffers::EndianScalar(length_); }
+};
+FLATBUFFERS_STRUCT_END(Buffer, 16);
+
+/// These are stored in the flatbuffer in the Type union below
+struct Null FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table {
+  typedef NullBuilder Builder;
+  bool Verify(::flatbuffers::Verifier& verifier) const
+  {
+    return VerifyTableStart(verifier) && verifier.EndTable();
+  }
+};
+
+struct NullBuilder {
+  typedef Null Table;
+  ::flatbuffers::FlatBufferBuilder& fbb_;
+  ::flatbuffers::uoffset_t start_;
+  explicit NullBuilder(::flatbuffers::FlatBufferBuilder& _fbb) : fbb_(_fbb)
+  {
+    start_ = fbb_.StartTable();
+  }
+  ::flatbuffers::Offset<Null> Finish()
+  {
+    const auto end = fbb_.EndTable(start_);
+    auto o         = ::flatbuffers::Offset<Null>(end);
+    return o;
+  }
+};
+
+inline ::flatbuffers::Offset<Null> CreateNull(::flatbuffers::FlatBufferBuilder& _fbb)
+{
+  NullBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+/// A Struct_ in the flatbuffer metadata is the same as an Arrow Struct
+/// (according to the physical memory layout). We used Struct_ here as
+/// Struct is a reserved word in Flatbuffers
+struct Struct_ FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table {
+  typedef Struct_Builder Builder;
+  bool Verify(::flatbuffers::Verifier& verifier) const
+  {
+    return VerifyTableStart(verifier) && verifier.EndTable();
+  }
+};
+
+struct Struct_Builder {
+  typedef Struct_ Table;
+  ::flatbuffers::FlatBufferBuilder& fbb_;
+  ::flatbuffers::uoffset_t start_;
+  explicit Struct_Builder(::flatbuffers::FlatBufferBuilder& _fbb) : fbb_(_fbb)
+  {
+    start_ = fbb_.StartTable();
+  }
+  ::flatbuffers::Offset<Struct_> Finish()
+  {
+    const auto end = fbb_.EndTable(start_);
+    auto o         = ::flatbuffers::Offset<Struct_>(end);
+    return o;
+  }
+};
+
+inline ::flatbuffers::Offset<Struct_> CreateStruct_(::flatbuffers::FlatBufferBuilder& _fbb)
+{
+  Struct_Builder builder_(_fbb);
+  return builder_.Finish();
+}
+
+struct List FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table {
+  typedef ListBuilder Builder;
+  bool Verify(::flatbuffers::Verifier& verifier) const
+  {
+    return VerifyTableStart(verifier) && verifier.EndTable();
+  }
+};
+
+struct ListBuilder {
+  typedef List Table;
+  ::flatbuffers::FlatBufferBuilder& fbb_;
+  ::flatbuffers::uoffset_t start_;
+  explicit ListBuilder(::flatbuffers::FlatBufferBuilder& _fbb) : fbb_(_fbb)
+  {
+    start_ = fbb_.StartTable();
+  }
+  ::flatbuffers::Offset<List> Finish()
+  {
+    const auto end = fbb_.EndTable(start_);
+    auto o         = ::flatbuffers::Offset<List>(end);
+    return o;
+  }
+};
+
+inline ::flatbuffers::Offset<List> CreateList(::flatbuffers::FlatBufferBuilder& _fbb)
+{
+  ListBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+/// Same as List, but with 64-bit offsets, allowing to represent
+/// extremely large data values.
+struct LargeList FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table {
+  typedef LargeListBuilder Builder;
+  bool Verify(::flatbuffers::Verifier& verifier) const
+  {
+    return VerifyTableStart(verifier) && verifier.EndTable();
+  }
+};
+
+struct LargeListBuilder {
+  typedef LargeList Table;
+  ::flatbuffers::FlatBufferBuilder& fbb_;
+  ::flatbuffers::uoffset_t start_;
+  explicit LargeListBuilder(::flatbuffers::FlatBufferBuilder& _fbb) : fbb_(_fbb)
+  {
+    start_ = fbb_.StartTable();
+  }
+  ::flatbuffers::Offset<LargeList> Finish()
+  {
+    const auto end = fbb_.EndTable(start_);
+    auto o         = ::flatbuffers::Offset<LargeList>(end);
+    return o;
+  }
+};
+
+inline ::flatbuffers::Offset<LargeList> CreateLargeList(::flatbuffers::FlatBufferBuilder& _fbb)
+{
+  LargeListBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+/// Represents the same logical types that List can, but contains offsets and
+/// sizes allowing for writes in any order and sharing of child values among
+/// list values.
+struct ListView FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table {
+  typedef ListViewBuilder Builder;
+  bool Verify(::flatbuffers::Verifier& verifier) const
+  {
+    return VerifyTableStart(verifier) && verifier.EndTable();
+  }
+};
+
+struct ListViewBuilder {
+  typedef ListView Table;
+  ::flatbuffers::FlatBufferBuilder& fbb_;
+  ::flatbuffers::uoffset_t start_;
+  explicit ListViewBuilder(::flatbuffers::FlatBufferBuilder& _fbb) : fbb_(_fbb)
+  {
+    start_ = fbb_.StartTable();
+  }
+  ::flatbuffers::Offset<ListView> Finish()
+  {
+    const auto end = fbb_.EndTable(start_);
+    auto o         = ::flatbuffers::Offset<ListView>(end);
+    return o;
+  }
+};
+
+inline ::flatbuffers::Offset<ListView> CreateListView(::flatbuffers::FlatBufferBuilder& _fbb)
+{
+  ListViewBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+/// Same as ListView, but with 64-bit offsets and sizes, allowing to represent
+/// extremely large data values.
+struct LargeListView FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table {
+  typedef LargeListViewBuilder Builder;
+  bool Verify(::flatbuffers::Verifier& verifier) const
+  {
+    return VerifyTableStart(verifier) && verifier.EndTable();
+  }
+};
+
+struct LargeListViewBuilder {
+  typedef LargeListView Table;
+  ::flatbuffers::FlatBufferBuilder& fbb_;
+  ::flatbuffers::uoffset_t start_;
+  explicit LargeListViewBuilder(::flatbuffers::FlatBufferBuilder& _fbb) : fbb_(_fbb)
+  {
+    start_ = fbb_.StartTable();
+  }
+  ::flatbuffers::Offset<LargeListView> Finish()
+  {
+    const auto end = fbb_.EndTable(start_);
+    auto o         = ::flatbuffers::Offset<LargeListView>(end);
+    return o;
+  }
+};
+
+inline ::flatbuffers::Offset<LargeListView> CreateLargeListView(
+  ::flatbuffers::FlatBufferBuilder& _fbb)
+{
+  LargeListViewBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+struct FixedSizeList FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table {
+  typedef FixedSizeListBuilder Builder;
+  enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { VT_LISTSIZE = 4 };
+  /// Number of list items per value
+  int32_t listSize() const { return GetField<int32_t>(VT_LISTSIZE, 0); }
+  bool Verify(::flatbuffers::Verifier& verifier) const
+  {
+    return VerifyTableStart(verifier) && VerifyField<int32_t>(verifier, VT_LISTSIZE, 4) &&
+           verifier.EndTable();
+  }
+};
+
+struct FixedSizeListBuilder {
+  typedef FixedSizeList Table;
+  ::flatbuffers::FlatBufferBuilder& fbb_;
+  ::flatbuffers::uoffset_t start_;
+  void add_listSize(int32_t listSize)
+  {
+    fbb_.AddElement<int32_t>(FixedSizeList::VT_LISTSIZE, listSize, 0);
+  }
+  explicit FixedSizeListBuilder(::flatbuffers::FlatBufferBuilder& _fbb) : fbb_(_fbb)
+  {
+    start_ = fbb_.StartTable();
+  }
+  ::flatbuffers::Offset<FixedSizeList> Finish()
+  {
+    const auto end = fbb_.EndTable(start_);
+    auto o         = ::flatbuffers::Offset<FixedSizeList>(end);
+    return o;
+  }
+};
+
+inline ::flatbuffers::Offset<FixedSizeList> CreateFixedSizeList(
+  ::flatbuffers::FlatBufferBuilder& _fbb, int32_t listSize = 0)
+{
+  FixedSizeListBuilder builder_(_fbb);
+  builder_.add_listSize(listSize);
+  return builder_.Finish();
+}
+
+/// A Map is a logical nested type that is represented as
+///
+/// List<entries: Struct<key: K, value: V>>
+///
+/// In this layout, the keys and values are each respectively contiguous. We do
+/// not constrain the key and value types, so the application is responsible
+/// for ensuring that the keys are hashable and unique. Whether the keys are sorted
+/// may be set in the metadata for this field.
+///
+/// In a field with Map type, the field has a child Struct field, which then
+/// has two children: key type and the second the value type. The names of the
+/// child fields may be respectively "entries", "key", and "value", but this is
+/// not enforced.
+///
+/// Map
+/// ```text
+///   - child[0] entries: Struct
+///     - child[0] key: K
+///     - child[1] value: V
+/// ```
+/// Neither the "entries" field nor the "key" field may be nullable.
+///
+/// The metadata is structured so that Arrow systems without special handling
+/// for Map can make Map an alias for List. The "layout" attribute for the Map
+/// field must have the same contents as a List.
+struct Map FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table {
+  typedef MapBuilder Builder;
+  enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { VT_KEYSSORTED = 4 };
+  /// Set to true if the keys within each value are sorted
+  bool keysSorted() const { return GetField<uint8_t>(VT_KEYSSORTED, 0) != 0; }
+  bool Verify(::flatbuffers::Verifier& verifier) const
+  {
+    return VerifyTableStart(verifier) && VerifyField<uint8_t>(verifier, VT_KEYSSORTED, 1) &&
+           verifier.EndTable();
+  }
+};
+
+struct MapBuilder {
+  typedef Map Table;
+  ::flatbuffers::FlatBufferBuilder& fbb_;
+  ::flatbuffers::uoffset_t start_;
+  void add_keysSorted(bool keysSorted)
+  {
+    fbb_.AddElement<uint8_t>(Map::VT_KEYSSORTED, static_cast<uint8_t>(keysSorted), 0);
+  }
+  explicit MapBuilder(::flatbuffers::FlatBufferBuilder& _fbb) : fbb_(_fbb)
+  {
+    start_ = fbb_.StartTable();
+  }
+  ::flatbuffers::Offset<Map> Finish()
+  {
+    const auto end = fbb_.EndTable(start_);
+    auto o         = ::flatbuffers::Offset<Map>(end);
+    return o;
+  }
+};
+
+inline ::flatbuffers::Offset<Map> CreateMap(::flatbuffers::FlatBufferBuilder& _fbb,
+                                            bool keysSorted = false)
+{
+  MapBuilder builder_(_fbb);
+  builder_.add_keysSorted(keysSorted);
+  return builder_.Finish();
+}
+
+/// A union is a complex type with children in Field
+/// By default ids in the type vector refer to the offsets in the children
+/// optionally typeIds provides an indirection between the child offset and the type id
+/// for each child `typeIds[offset]` is the id used in the type vector
+struct Union FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table {
+  typedef UnionBuilder Builder;
+  enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { VT_MODE = 4, VT_TYPEIDS = 6 };
+  cudf::io::parquet::flatbuf::UnionMode mode() const
+  {
+    return static_cast<cudf::io::parquet::flatbuf::UnionMode>(GetField<int16_t>(VT_MODE, 0));
+  }
+  const ::flatbuffers::Vector<int32_t>* typeIds() const
+  {
+    return GetPointer<const ::flatbuffers::Vector<int32_t>*>(VT_TYPEIDS);
+  }
+  bool Verify(::flatbuffers::Verifier& verifier) const
+  {
+    return VerifyTableStart(verifier) && VerifyField<int16_t>(verifier, VT_MODE, 2) &&
+           VerifyOffset(verifier, VT_TYPEIDS) && verifier.VerifyVector(typeIds()) &&
+           verifier.EndTable();
+  }
+};
+
+struct UnionBuilder {
+  typedef Union Table;
+  ::flatbuffers::FlatBufferBuilder& fbb_;
+  ::flatbuffers::uoffset_t start_;
+  void add_mode(cudf::io::parquet::flatbuf::UnionMode mode)
+  {
+    fbb_.AddElement<int16_t>(Union::VT_MODE, static_cast<int16_t>(mode), 0);
+  }
+  void add_typeIds(::flatbuffers::Offset<::flatbuffers::Vector<int32_t>> typeIds)
+  {
+    fbb_.AddOffset(Union::VT_TYPEIDS, typeIds);
+  }
+  explicit UnionBuilder(::flatbuffers::FlatBufferBuilder& _fbb) : fbb_(_fbb)
+  {
+    start_ = fbb_.StartTable();
+  }
+  ::flatbuffers::Offset<Union> Finish()
+  {
+    const auto end = fbb_.EndTable(start_);
+    auto o         = ::flatbuffers::Offset<Union>(end);
+    return o;
+  }
+};
+
+inline ::flatbuffers::Offset<Union> CreateUnion(
+  ::flatbuffers::FlatBufferBuilder& _fbb,
+  cudf::io::parquet::flatbuf::UnionMode mode = cudf::io::parquet::flatbuf::UnionMode_Sparse,
+  ::flatbuffers::Offset<::flatbuffers::Vector<int32_t>> typeIds = 0)
+{
+  UnionBuilder builder_(_fbb);
+  builder_.add_typeIds(typeIds);
+  builder_.add_mode(mode);
+  return builder_.Finish();
+}
+
+inline ::flatbuffers::Offset<Union> CreateUnionDirect(
+  ::flatbuffers::FlatBufferBuilder& _fbb,
+  cudf::io::parquet::flatbuf::UnionMode mode = cudf::io::parquet::flatbuf::UnionMode_Sparse,
+  const std::vector<int32_t>* typeIds        = nullptr)
+{
+  auto typeIds__ = typeIds ? _fbb.CreateVector<int32_t>(*typeIds) : 0;
+  return cudf::io::parquet::flatbuf::CreateUnion(_fbb, mode, typeIds__);
+}
+
+struct Int FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table {
+  typedef IntBuilder Builder;
+  enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
+    VT_BITWIDTH  = 4,
+    VT_IS_SIGNED = 6
+  };
+  int32_t bitWidth() const { return GetField<int32_t>(VT_BITWIDTH, 0); }
+  bool is_signed() const { return GetField<uint8_t>(VT_IS_SIGNED, 0) != 0; }
+  bool Verify(::flatbuffers::Verifier& verifier) const
+  {
+    return VerifyTableStart(verifier) && VerifyField<int32_t>(verifier, VT_BITWIDTH, 4) &&
+           VerifyField<uint8_t>(verifier, VT_IS_SIGNED, 1) && verifier.EndTable();
+  }
+};
+
+struct IntBuilder {
+  typedef Int Table;
+  ::flatbuffers::FlatBufferBuilder& fbb_;
+  ::flatbuffers::uoffset_t start_;
+  void add_bitWidth(int32_t bitWidth) { fbb_.AddElement<int32_t>(Int::VT_BITWIDTH, bitWidth, 0); }
+  void add_is_signed(bool is_signed)
+  {
+    fbb_.AddElement<uint8_t>(Int::VT_IS_SIGNED, static_cast<uint8_t>(is_signed), 0);
+  }
+  explicit IntBuilder(::flatbuffers::FlatBufferBuilder& _fbb) : fbb_(_fbb)
+  {
+    start_ = fbb_.StartTable();
+  }
+  ::flatbuffers::Offset<Int> Finish()
+  {
+    const auto end = fbb_.EndTable(start_);
+    auto o         = ::flatbuffers::Offset<Int>(end);
+    return o;
+  }
+};
+
+inline ::flatbuffers::Offset<Int> CreateInt(::flatbuffers::FlatBufferBuilder& _fbb,
+                                            int32_t bitWidth = 0,
+                                            bool is_signed   = false)
+{
+  IntBuilder builder_(_fbb);
+  builder_.add_bitWidth(bitWidth);
+  builder_.add_is_signed(is_signed);
+  return builder_.Finish();
+}
+
+struct FloatingPoint FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table {
+  typedef FloatingPointBuilder Builder;
+  enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { VT_PRECISION = 4 };
+  cudf::io::parquet::flatbuf::Precision precision() const
+  {
+    return static_cast<cudf::io::parquet::flatbuf::Precision>(GetField<int16_t>(VT_PRECISION, 0));
+  }
+  bool Verify(::flatbuffers::Verifier& verifier) const
+  {
+    return VerifyTableStart(verifier) && VerifyField<int16_t>(verifier, VT_PRECISION, 2) &&
+           verifier.EndTable();
+  }
+};
+
+struct FloatingPointBuilder {
+  typedef FloatingPoint Table;
+  ::flatbuffers::FlatBufferBuilder& fbb_;
+  ::flatbuffers::uoffset_t start_;
+  void add_precision(cudf::io::parquet::flatbuf::Precision precision)
+  {
+    fbb_.AddElement<int16_t>(FloatingPoint::VT_PRECISION, static_cast<int16_t>(precision), 0);
+  }
+  explicit FloatingPointBuilder(::flatbuffers::FlatBufferBuilder& _fbb) : fbb_(_fbb)
+  {
+    start_ = fbb_.StartTable();
+  }
+  ::flatbuffers::Offset<FloatingPoint> Finish()
+  {
+    const auto end = fbb_.EndTable(start_);
+    auto o         = ::flatbuffers::Offset<FloatingPoint>(end);
+    return o;
+  }
+};
+
+inline ::flatbuffers::Offset<FloatingPoint> CreateFloatingPoint(
+  ::flatbuffers::FlatBufferBuilder& _fbb,
+  cudf::io::parquet::flatbuf::Precision precision = cudf::io::parquet::flatbuf::Precision_HALF)
+{
+  FloatingPointBuilder builder_(_fbb);
+  builder_.add_precision(precision);
+  return builder_.Finish();
+}
+
+/// Unicode with UTF-8 encoding
+struct Utf8 FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table {
+  typedef Utf8Builder Builder;
+  bool Verify(::flatbuffers::Verifier& verifier) const
+  {
+    return VerifyTableStart(verifier) && verifier.EndTable();
+  }
+};
+
+struct Utf8Builder {
+  typedef Utf8 Table;
+  ::flatbuffers::FlatBufferBuilder& fbb_;
+  ::flatbuffers::uoffset_t start_;
+  explicit Utf8Builder(::flatbuffers::FlatBufferBuilder& _fbb) : fbb_(_fbb)
+  {
+    start_ = fbb_.StartTable();
+  }
+  ::flatbuffers::Offset<Utf8> Finish()
+  {
+    const auto end = fbb_.EndTable(start_);
+    auto o         = ::flatbuffers::Offset<Utf8>(end);
+    return o;
+  }
+};
+
+inline ::flatbuffers::Offset<Utf8> CreateUtf8(::flatbuffers::FlatBufferBuilder& _fbb)
+{
+  Utf8Builder builder_(_fbb);
+  return builder_.Finish();
+}
+
+/// Opaque binary data
+struct Binary FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table {
+  typedef BinaryBuilder Builder;
+  bool Verify(::flatbuffers::Verifier& verifier) const
+  {
+    return VerifyTableStart(verifier) && verifier.EndTable();
+  }
+};
+
+struct BinaryBuilder {
+  typedef Binary Table;
+  ::flatbuffers::FlatBufferBuilder& fbb_;
+  ::flatbuffers::uoffset_t start_;
+  explicit BinaryBuilder(::flatbuffers::FlatBufferBuilder& _fbb) : fbb_(_fbb)
+  {
+    start_ = fbb_.StartTable();
+  }
+  ::flatbuffers::Offset<Binary> Finish()
+  {
+    const auto end = fbb_.EndTable(start_);
+    auto o         = ::flatbuffers::Offset<Binary>(end);
+    return o;
+  }
+};
+
+inline ::flatbuffers::Offset<Binary> CreateBinary(::flatbuffers::FlatBufferBuilder& _fbb)
+{
+  BinaryBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+/// Same as Utf8, but with 64-bit offsets, allowing to represent
+/// extremely large data values.
+struct LargeUtf8 FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table {
+  typedef LargeUtf8Builder Builder;
+  bool Verify(::flatbuffers::Verifier& verifier) const
+  {
+    return VerifyTableStart(verifier) && verifier.EndTable();
+  }
+};
+
+struct LargeUtf8Builder {
+  typedef LargeUtf8 Table;
+  ::flatbuffers::FlatBufferBuilder& fbb_;
+  ::flatbuffers::uoffset_t start_;
+  explicit LargeUtf8Builder(::flatbuffers::FlatBufferBuilder& _fbb) : fbb_(_fbb)
+  {
+    start_ = fbb_.StartTable();
+  }
+  ::flatbuffers::Offset<LargeUtf8> Finish()
+  {
+    const auto end = fbb_.EndTable(start_);
+    auto o         = ::flatbuffers::Offset<LargeUtf8>(end);
+    return o;
+  }
+};
+
+inline ::flatbuffers::Offset<LargeUtf8> CreateLargeUtf8(::flatbuffers::FlatBufferBuilder& _fbb)
+{
+  LargeUtf8Builder builder_(_fbb);
+  return builder_.Finish();
+}
+
+/// Same as Binary, but with 64-bit offsets, allowing to represent
+/// extremely large data values.
+struct LargeBinary FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table {
+  typedef LargeBinaryBuilder Builder;
+  bool Verify(::flatbuffers::Verifier& verifier) const
+  {
+    return VerifyTableStart(verifier) && verifier.EndTable();
+  }
+};
+
+struct LargeBinaryBuilder {
+  typedef LargeBinary Table;
+  ::flatbuffers::FlatBufferBuilder& fbb_;
+  ::flatbuffers::uoffset_t start_;
+  explicit LargeBinaryBuilder(::flatbuffers::FlatBufferBuilder& _fbb) : fbb_(_fbb)
+  {
+    start_ = fbb_.StartTable();
+  }
+  ::flatbuffers::Offset<LargeBinary> Finish()
+  {
+    const auto end = fbb_.EndTable(start_);
+    auto o         = ::flatbuffers::Offset<LargeBinary>(end);
+    return o;
+  }
+};
+
+inline ::flatbuffers::Offset<LargeBinary> CreateLargeBinary(::flatbuffers::FlatBufferBuilder& _fbb)
+{
+  LargeBinaryBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+/// Logically the same as Utf8, but the internal representation uses a view
+/// struct that contains the string length and either the string's entire data
+/// inline (for small strings) or an inlined prefix, an index of another buffer,
+/// and an offset pointing to a slice in that buffer (for non-small strings).
+///
+/// Since it uses a variable number of data buffers, each Field with this type
+/// must have a corresponding entry in `variadicBufferCounts`.
+struct Utf8View FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table {
+  typedef Utf8ViewBuilder Builder;
+  bool Verify(::flatbuffers::Verifier& verifier) const
+  {
+    return VerifyTableStart(verifier) && verifier.EndTable();
+  }
+};
+
+struct Utf8ViewBuilder {
+  typedef Utf8View Table;
+  ::flatbuffers::FlatBufferBuilder& fbb_;
+  ::flatbuffers::uoffset_t start_;
+  explicit Utf8ViewBuilder(::flatbuffers::FlatBufferBuilder& _fbb) : fbb_(_fbb)
+  {
+    start_ = fbb_.StartTable();
+  }
+  ::flatbuffers::Offset<Utf8View> Finish()
+  {
+    const auto end = fbb_.EndTable(start_);
+    auto o         = ::flatbuffers::Offset<Utf8View>(end);
+    return o;
+  }
+};
+
+inline ::flatbuffers::Offset<Utf8View> CreateUtf8View(::flatbuffers::FlatBufferBuilder& _fbb)
+{
+  Utf8ViewBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+/// Logically the same as Binary, but the internal representation uses a view
+/// struct that contains the string length and either the string's entire data
+/// inline (for small strings) or an inlined prefix, an index of another buffer,
+/// and an offset pointing to a slice in that buffer (for non-small strings).
+///
+/// Since it uses a variable number of data buffers, each Field with this type
+/// must have a corresponding entry in `variadicBufferCounts`.
+struct BinaryView FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table {
+  typedef BinaryViewBuilder Builder;
+  bool Verify(::flatbuffers::Verifier& verifier) const
+  {
+    return VerifyTableStart(verifier) && verifier.EndTable();
+  }
+};
+
+struct BinaryViewBuilder {
+  typedef BinaryView Table;
+  ::flatbuffers::FlatBufferBuilder& fbb_;
+  ::flatbuffers::uoffset_t start_;
+  explicit BinaryViewBuilder(::flatbuffers::FlatBufferBuilder& _fbb) : fbb_(_fbb)
+  {
+    start_ = fbb_.StartTable();
+  }
+  ::flatbuffers::Offset<BinaryView> Finish()
+  {
+    const auto end = fbb_.EndTable(start_);
+    auto o         = ::flatbuffers::Offset<BinaryView>(end);
+    return o;
+  }
+};
+
+inline ::flatbuffers::Offset<BinaryView> CreateBinaryView(::flatbuffers::FlatBufferBuilder& _fbb)
+{
+  BinaryViewBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+struct FixedSizeBinary FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table {
+  typedef FixedSizeBinaryBuilder Builder;
+  enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { VT_BYTEWIDTH = 4 };
+  /// Number of bytes per value
+  int32_t byteWidth() const { return GetField<int32_t>(VT_BYTEWIDTH, 0); }
+  bool Verify(::flatbuffers::Verifier& verifier) const
+  {
+    return VerifyTableStart(verifier) && VerifyField<int32_t>(verifier, VT_BYTEWIDTH, 4) &&
+           verifier.EndTable();
+  }
+};
+
+struct FixedSizeBinaryBuilder {
+  typedef FixedSizeBinary Table;
+  ::flatbuffers::FlatBufferBuilder& fbb_;
+  ::flatbuffers::uoffset_t start_;
+  void add_byteWidth(int32_t byteWidth)
+  {
+    fbb_.AddElement<int32_t>(FixedSizeBinary::VT_BYTEWIDTH, byteWidth, 0);
+  }
+  explicit FixedSizeBinaryBuilder(::flatbuffers::FlatBufferBuilder& _fbb) : fbb_(_fbb)
+  {
+    start_ = fbb_.StartTable();
+  }
+  ::flatbuffers::Offset<FixedSizeBinary> Finish()
+  {
+    const auto end = fbb_.EndTable(start_);
+    auto o         = ::flatbuffers::Offset<FixedSizeBinary>(end);
+    return o;
+  }
+};
+
+inline ::flatbuffers::Offset<FixedSizeBinary> CreateFixedSizeBinary(
+  ::flatbuffers::FlatBufferBuilder& _fbb, int32_t byteWidth = 0)
+{
+  FixedSizeBinaryBuilder builder_(_fbb);
+  builder_.add_byteWidth(byteWidth);
+  return builder_.Finish();
+}
+
+struct Bool FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table {
+  typedef BoolBuilder Builder;
+  bool Verify(::flatbuffers::Verifier& verifier) const
+  {
+    return VerifyTableStart(verifier) && verifier.EndTable();
+  }
+};
+
+struct BoolBuilder {
+  typedef Bool Table;
+  ::flatbuffers::FlatBufferBuilder& fbb_;
+  ::flatbuffers::uoffset_t start_;
+  explicit BoolBuilder(::flatbuffers::FlatBufferBuilder& _fbb) : fbb_(_fbb)
+  {
+    start_ = fbb_.StartTable();
+  }
+  ::flatbuffers::Offset<Bool> Finish()
+  {
+    const auto end = fbb_.EndTable(start_);
+    auto o         = ::flatbuffers::Offset<Bool>(end);
+    return o;
+  }
+};
+
+inline ::flatbuffers::Offset<Bool> CreateBool(::flatbuffers::FlatBufferBuilder& _fbb)
+{
+  BoolBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+/// Contains two child arrays, run_ends and values.
+/// The run_ends child array must be a 16/32/64-bit integer array
+/// which encodes the indices at which the run with the value in
+/// each corresponding index in the values child array ends.
+/// Like list/struct types, the value array can be of any type.
+struct RunEndEncoded FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table {
+  typedef RunEndEncodedBuilder Builder;
+  bool Verify(::flatbuffers::Verifier& verifier) const
+  {
+    return VerifyTableStart(verifier) && verifier.EndTable();
+  }
+};
+
+struct RunEndEncodedBuilder {
+  typedef RunEndEncoded Table;
+  ::flatbuffers::FlatBufferBuilder& fbb_;
+  ::flatbuffers::uoffset_t start_;
+  explicit RunEndEncodedBuilder(::flatbuffers::FlatBufferBuilder& _fbb) : fbb_(_fbb)
+  {
+    start_ = fbb_.StartTable();
+  }
+  ::flatbuffers::Offset<RunEndEncoded> Finish()
+  {
+    const auto end = fbb_.EndTable(start_);
+    auto o         = ::flatbuffers::Offset<RunEndEncoded>(end);
+    return o;
+  }
+};
+
+inline ::flatbuffers::Offset<RunEndEncoded> CreateRunEndEncoded(
+  ::flatbuffers::FlatBufferBuilder& _fbb)
+{
+  RunEndEncodedBuilder builder_(_fbb);
+  return builder_.Finish();
+}
+
+/// Exact decimal value represented as an integer value in two's
+/// complement. Currently only 128-bit (16-byte) and 256-bit (32-byte) integers
+/// are used. The representation uses the endianness indicated
+/// in the Schema.
+struct Decimal FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table {
+  typedef DecimalBuilder Builder;
+  enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
+    VT_PRECISION = 4,
+    VT_SCALE     = 6,
+    VT_BITWIDTH  = 8
+  };
+  /// Total number of decimal digits
+  int32_t precision() const { return GetField<int32_t>(VT_PRECISION, 0); }
+  /// Number of digits after the decimal point "."
+  int32_t scale() const { return GetField<int32_t>(VT_SCALE, 0); }
+  /// Number of bits per value. The only accepted widths are 128 and 256.
+  /// We use bitWidth for consistency with Int::bitWidth.
+  int32_t bitWidth() const { return GetField<int32_t>(VT_BITWIDTH, 128); }
+  bool Verify(::flatbuffers::Verifier& verifier) const
+  {
+    return VerifyTableStart(verifier) && VerifyField<int32_t>(verifier, VT_PRECISION, 4) &&
+           VerifyField<int32_t>(verifier, VT_SCALE, 4) &&
+           VerifyField<int32_t>(verifier, VT_BITWIDTH, 4) && verifier.EndTable();
+  }
+};
+
+struct DecimalBuilder {
+  typedef Decimal Table;
+  ::flatbuffers::FlatBufferBuilder& fbb_;
+  ::flatbuffers::uoffset_t start_;
+  void add_precision(int32_t precision)
+  {
+    fbb_.AddElement<int32_t>(Decimal::VT_PRECISION, precision, 0);
+  }
+  void add_scale(int32_t scale) { fbb_.AddElement<int32_t>(Decimal::VT_SCALE, scale, 0); }
+  void add_bitWidth(int32_t bitWidth)
+  {
+    fbb_.AddElement<int32_t>(Decimal::VT_BITWIDTH, bitWidth, 128);
+  }
+  explicit DecimalBuilder(::flatbuffers::FlatBufferBuilder& _fbb) : fbb_(_fbb)
+  {
+    start_ = fbb_.StartTable();
+  }
+  ::flatbuffers::Offset<Decimal> Finish()
+  {
+    const auto end = fbb_.EndTable(start_);
+    auto o         = ::flatbuffers::Offset<Decimal>(end);
+    return o;
+  }
+};
+
+inline ::flatbuffers::Offset<Decimal> CreateDecimal(::flatbuffers::FlatBufferBuilder& _fbb,
+                                                    int32_t precision = 0,
+                                                    int32_t scale     = 0,
+                                                    int32_t bitWidth  = 128)
+{
+  DecimalBuilder builder_(_fbb);
+  builder_.add_bitWidth(bitWidth);
+  builder_.add_scale(scale);
+  builder_.add_precision(precision);
+  return builder_.Finish();
+}
+
+/// Date is either a 32-bit or 64-bit signed integer type representing an
+/// elapsed time since UNIX epoch (1970-01-01), stored in either of two units:
+///
+/// * Milliseconds (64 bits) indicating UNIX time elapsed since the epoch (no
+///   leap seconds), where the values are evenly divisible by 86400000
+/// * Days (32 bits) since the UNIX epoch
+struct Date FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table {
+  typedef DateBuilder Builder;
+  enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { VT_UNIT = 4 };
+  cudf::io::parquet::flatbuf::DateUnit unit() const
+  {
+    return static_cast<cudf::io::parquet::flatbuf::DateUnit>(GetField<int16_t>(VT_UNIT, 1));
+  }
+  bool Verify(::flatbuffers::Verifier& verifier) const
+  {
+    return VerifyTableStart(verifier) && VerifyField<int16_t>(verifier, VT_UNIT, 2) &&
+           verifier.EndTable();
+  }
+};
+
+struct DateBuilder {
+  typedef Date Table;
+  ::flatbuffers::FlatBufferBuilder& fbb_;
+  ::flatbuffers::uoffset_t start_;
+  void add_unit(cudf::io::parquet::flatbuf::DateUnit unit)
+  {
+    fbb_.AddElement<int16_t>(Date::VT_UNIT, static_cast<int16_t>(unit), 1);
+  }
+  explicit DateBuilder(::flatbuffers::FlatBufferBuilder& _fbb) : fbb_(_fbb)
+  {
+    start_ = fbb_.StartTable();
+  }
+  ::flatbuffers::Offset<Date> Finish()
+  {
+    const auto end = fbb_.EndTable(start_);
+    auto o         = ::flatbuffers::Offset<Date>(end);
+    return o;
+  }
+};
+
+inline ::flatbuffers::Offset<Date> CreateDate(
+  ::flatbuffers::FlatBufferBuilder& _fbb,
+  cudf::io::parquet::flatbuf::DateUnit unit = cudf::io::parquet::flatbuf::DateUnit_MILLISECOND)
+{
+  DateBuilder builder_(_fbb);
+  builder_.add_unit(unit);
+  return builder_.Finish();
+}
+
+/// Time is either a 32-bit or 64-bit signed integer type representing an
+/// elapsed time since midnight, stored in either of four units: seconds,
+/// milliseconds, microseconds or nanoseconds.
+///
+/// The integer `bitWidth` depends on the `unit` and must be one of the following:
+/// * SECOND and MILLISECOND: 32 bits
+/// * MICROSECOND and NANOSECOND: 64 bits
+///
+/// The allowed values are between 0 (inclusive) and 86400 (=24*60*60) seconds
+/// (exclusive), adjusted for the time unit (for example, up to 86400000
+/// exclusive for the MILLISECOND unit).
+/// This definition doesn't allow for leap seconds. Time values from
+/// measurements with leap seconds will need to be corrected when ingesting
+/// into Arrow (for example by replacing the value 86400 with 86399).
+struct Time FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table {
+  typedef TimeBuilder Builder;
+  enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { VT_UNIT = 4, VT_BITWIDTH = 6 };
+  cudf::io::parquet::flatbuf::TimeUnit unit() const
+  {
+    return static_cast<cudf::io::parquet::flatbuf::TimeUnit>(GetField<int16_t>(VT_UNIT, 1));
+  }
+  int32_t bitWidth() const { return GetField<int32_t>(VT_BITWIDTH, 32); }
+  bool Verify(::flatbuffers::Verifier& verifier) const
+  {
+    return VerifyTableStart(verifier) && VerifyField<int16_t>(verifier, VT_UNIT, 2) &&
+           VerifyField<int32_t>(verifier, VT_BITWIDTH, 4) && verifier.EndTable();
+  }
+};
+
+struct TimeBuilder {
+  typedef Time Table;
+  ::flatbuffers::FlatBufferBuilder& fbb_;
+  ::flatbuffers::uoffset_t start_;
+  void add_unit(cudf::io::parquet::flatbuf::TimeUnit unit)
+  {
+    fbb_.AddElement<int16_t>(Time::VT_UNIT, static_cast<int16_t>(unit), 1);
+  }
+  void add_bitWidth(int32_t bitWidth) { fbb_.AddElement<int32_t>(Time::VT_BITWIDTH, bitWidth, 32); }
+  explicit TimeBuilder(::flatbuffers::FlatBufferBuilder& _fbb) : fbb_(_fbb)
+  {
+    start_ = fbb_.StartTable();
+  }
+  ::flatbuffers::Offset<Time> Finish()
+  {
+    const auto end = fbb_.EndTable(start_);
+    auto o         = ::flatbuffers::Offset<Time>(end);
+    return o;
+  }
+};
+
+inline ::flatbuffers::Offset<Time> CreateTime(
+  ::flatbuffers::FlatBufferBuilder& _fbb,
+  cudf::io::parquet::flatbuf::TimeUnit unit = cudf::io::parquet::flatbuf::TimeUnit_MILLISECOND,
+  int32_t bitWidth                          = 32)
+{
+  TimeBuilder builder_(_fbb);
+  builder_.add_bitWidth(bitWidth);
+  builder_.add_unit(unit);
+  return builder_.Finish();
+}
+
+/// Timestamp is a 64-bit signed integer representing an elapsed time since a
+/// fixed epoch, stored in either of four units: seconds, milliseconds,
+/// microseconds or nanoseconds, and is optionally annotated with a timezone.
+///
+/// Timestamp values do not include any leap seconds (in other words, all
+/// days are considered 86400 seconds long).
+///
+/// Timestamps with a non-empty timezone
+/// ------------------------------------
+///
+/// If a Timestamp column has a non-empty timezone value, its epoch is
+/// 1970-01-01 00:00:00 (January 1st 1970, midnight) in the *UTC* timezone
+/// (the Unix epoch), regardless of the Timestamp's own timezone.
+///
+/// Therefore, timestamp values with a non-empty timezone correspond to
+/// physical points in time together with some additional information about
+/// how the data was obtained and/or how to display it (the timezone).
+///
+///   For example, the timestamp value 0 with the timezone string "Europe/Paris"
+///   corresponds to "January 1st 1970, 00h00" in the UTC timezone, but the
+///   application may prefer to display it as "January 1st 1970, 01h00" in
+///   the Europe/Paris timezone (which is the same physical point in time).
+///
+/// One consequence is that timestamp values with a non-empty timezone
+/// can be compared and ordered directly, since they all share the same
+/// well-known point of reference (the Unix epoch).
+///
+/// Timestamps with an unset / empty timezone
+/// -----------------------------------------
+///
+/// If a Timestamp column has no timezone value, its epoch is
+/// 1970-01-01 00:00:00 (January 1st 1970, midnight) in an *unknown* timezone.
+///
+/// Therefore, timestamp values without a timezone cannot be meaningfully
+/// interpreted as physical points in time, but only as calendar / clock
+/// indications ("wall clock time") in an unspecified timezone.
+///
+///   For example, the timestamp value 0 with an empty timezone string
+///   corresponds to "January 1st 1970, 00h00" in an unknown timezone: there
+///   is not enough information to interpret it as a well-defined physical
+///   point in time.
+///
+/// One consequence is that timestamp values without a timezone cannot
+/// be reliably compared or ordered, since they may have different points of
+/// reference.  In particular, it is *not* possible to interpret an unset
+/// or empty timezone as the same as "UTC".
+///
+/// Conversion between timezones
+/// ----------------------------
+///
+/// If a Timestamp column has a non-empty timezone, changing the timezone
+/// to a different non-empty value is a metadata-only operation:
+/// the timestamp values need not change as their point of reference remains
+/// the same (the Unix epoch).
+///
+/// However, if a Timestamp column has no timezone value, changing it to a
+/// non-empty value requires to think about the desired semantics.
+/// One possibility is to assume that the original timestamp values are
+/// relative to the epoch of the timezone being set; timestamp values should
+/// then adjusted to the Unix epoch (for example, changing the timezone from
+/// empty to "Europe/Paris" would require converting the timestamp values
+/// from "Europe/Paris" to "UTC", which seems counter-intuitive but is
+/// nevertheless correct).
+///
+/// Guidelines for encoding data from external libraries
+/// ----------------------------------------------------
+///
+/// Date & time libraries often have multiple different data types for temporal
+/// data. In order to ease interoperability between different implementations the
+/// Arrow project has some recommendations for encoding these types into a Timestamp
+/// column.
+///
+/// An "instant" represents a physical point in time that has no relevant timezone
+/// (for example, astronomical data). To encode an instant, use a Timestamp with
+/// the timezone string set to "UTC", and make sure the Timestamp values
+/// are relative to the UTC epoch (January 1st 1970, midnight).
+///
+/// A "zoned date-time" represents a physical point in time annotated with an
+/// informative timezone (for example, the timezone in which the data was
+/// recorded).  To encode a zoned date-time, use a Timestamp with the timezone
+/// string set to the name of the timezone, and make sure the Timestamp values
+/// are relative to the UTC epoch (January 1st 1970, midnight).
+///
+///  (There is some ambiguity between an instant and a zoned date-time with the
+///   UTC timezone.  Both of these are stored the same in Arrow.  Typically,
+///   this distinction does not matter.  If it does, then an application should
+///   use custom metadata or an extension type to distinguish between the two cases.)
+///
+/// An "offset date-time" represents a physical point in time combined with an
+/// explicit offset from UTC.  To encode an offset date-time, use a Timestamp
+/// with the timezone string set to the numeric timezone offset string
+/// (e.g. "+03:00"), and make sure the Timestamp values are relative to
+/// the UTC epoch (January 1st 1970, midnight).
+///
+/// A "naive date-time" (also called "local date-time" in some libraries)
+/// represents a wall clock time combined with a calendar date, but with
+/// no indication of how to map this information to a physical point in time.
+/// Naive date-times must be handled with care because of this missing
+/// information, and also because daylight saving time (DST) may make
+/// some values ambiguous or nonexistent. A naive date-time may be
+/// stored as a struct with Date and Time fields. However, it may also be
+/// encoded into a Timestamp column with an empty timezone. The timestamp
+/// values should be computed "as if" the timezone of the date-time values
+/// was UTC; for example, the naive date-time "January 1st 1970, 00h00" would
+/// be encoded as timestamp value 0.
+struct Timestamp FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table {
+  typedef TimestampBuilder Builder;
+  enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { VT_UNIT = 4, VT_TIMEZONE = 6 };
+  cudf::io::parquet::flatbuf::TimeUnit unit() const
+  {
+    return static_cast<cudf::io::parquet::flatbuf::TimeUnit>(GetField<int16_t>(VT_UNIT, 0));
+  }
+  /// The timezone is an optional string indicating the name of a timezone,
+  /// one of:
+  ///
+  /// * As used in the Olson timezone database (the "tz database" or
+  ///   "tzdata"), such as "America/New_York".
+  /// * An absolute timezone offset of the form "+XX:XX" or "-XX:XX",
+  ///   such as "+07:30".
+  ///
+  /// Whether a timezone string is present indicates different semantics about
+  /// the data (see above).
+  const ::flatbuffers::String* timezone() const
+  {
+    return GetPointer<const ::flatbuffers::String*>(VT_TIMEZONE);
+  }
+  bool Verify(::flatbuffers::Verifier& verifier) const
+  {
+    return VerifyTableStart(verifier) && VerifyField<int16_t>(verifier, VT_UNIT, 2) &&
+           VerifyOffset(verifier, VT_TIMEZONE) && verifier.VerifyString(timezone()) &&
+           verifier.EndTable();
+  }
+};
+
+struct TimestampBuilder {
+  typedef Timestamp Table;
+  ::flatbuffers::FlatBufferBuilder& fbb_;
+  ::flatbuffers::uoffset_t start_;
+  void add_unit(cudf::io::parquet::flatbuf::TimeUnit unit)
+  {
+    fbb_.AddElement<int16_t>(Timestamp::VT_UNIT, static_cast<int16_t>(unit), 0);
+  }
+  void add_timezone(::flatbuffers::Offset<::flatbuffers::String> timezone)
+  {
+    fbb_.AddOffset(Timestamp::VT_TIMEZONE, timezone);
+  }
+  explicit TimestampBuilder(::flatbuffers::FlatBufferBuilder& _fbb) : fbb_(_fbb)
+  {
+    start_ = fbb_.StartTable();
+  }
+  ::flatbuffers::Offset<Timestamp> Finish()
+  {
+    const auto end = fbb_.EndTable(start_);
+    auto o         = ::flatbuffers::Offset<Timestamp>(end);
+    return o;
+  }
+};
+
+inline ::flatbuffers::Offset<Timestamp> CreateTimestamp(
+  ::flatbuffers::FlatBufferBuilder& _fbb,
+  cudf::io::parquet::flatbuf::TimeUnit unit = cudf::io::parquet::flatbuf::TimeUnit_SECOND,
+  ::flatbuffers::Offset<::flatbuffers::String> timezone = 0)
+{
+  TimestampBuilder builder_(_fbb);
+  builder_.add_timezone(timezone);
+  builder_.add_unit(unit);
+  return builder_.Finish();
+}
+
+inline ::flatbuffers::Offset<Timestamp> CreateTimestampDirect(
+  ::flatbuffers::FlatBufferBuilder& _fbb,
+  cudf::io::parquet::flatbuf::TimeUnit unit = cudf::io::parquet::flatbuf::TimeUnit_SECOND,
+  const char* timezone                      = nullptr)
+{
+  auto timezone__ = timezone ? _fbb.CreateString(timezone) : 0;
+  return cudf::io::parquet::flatbuf::CreateTimestamp(_fbb, unit, timezone__);
+}
+
+struct Interval FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table {
+  typedef IntervalBuilder Builder;
+  enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { VT_UNIT = 4 };
+  cudf::io::parquet::flatbuf::IntervalUnit unit() const
+  {
+    return static_cast<cudf::io::parquet::flatbuf::IntervalUnit>(GetField<int16_t>(VT_UNIT, 0));
+  }
+  bool Verify(::flatbuffers::Verifier& verifier) const
+  {
+    return VerifyTableStart(verifier) && VerifyField<int16_t>(verifier, VT_UNIT, 2) &&
+           verifier.EndTable();
+  }
+};
+
+struct IntervalBuilder {
+  typedef Interval Table;
+  ::flatbuffers::FlatBufferBuilder& fbb_;
+  ::flatbuffers::uoffset_t start_;
+  void add_unit(cudf::io::parquet::flatbuf::IntervalUnit unit)
+  {
+    fbb_.AddElement<int16_t>(Interval::VT_UNIT, static_cast<int16_t>(unit), 0);
+  }
+  explicit IntervalBuilder(::flatbuffers::FlatBufferBuilder& _fbb) : fbb_(_fbb)
+  {
+    start_ = fbb_.StartTable();
+  }
+  ::flatbuffers::Offset<Interval> Finish()
+  {
+    const auto end = fbb_.EndTable(start_);
+    auto o         = ::flatbuffers::Offset<Interval>(end);
+    return o;
+  }
+};
+
+inline ::flatbuffers::Offset<Interval> CreateInterval(
+  ::flatbuffers::FlatBufferBuilder& _fbb,
+  cudf::io::parquet::flatbuf::IntervalUnit unit =
+    cudf::io::parquet::flatbuf::IntervalUnit_YEAR_MONTH)
+{
+  IntervalBuilder builder_(_fbb);
+  builder_.add_unit(unit);
+  return builder_.Finish();
+}
+
+struct Duration FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table {
+  typedef DurationBuilder Builder;
+  enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { VT_UNIT = 4 };
+  cudf::io::parquet::flatbuf::TimeUnit unit() const
+  {
+    return static_cast<cudf::io::parquet::flatbuf::TimeUnit>(GetField<int16_t>(VT_UNIT, 1));
+  }
+  bool Verify(::flatbuffers::Verifier& verifier) const
+  {
+    return VerifyTableStart(verifier) && VerifyField<int16_t>(verifier, VT_UNIT, 2) &&
+           verifier.EndTable();
+  }
+};
+
+struct DurationBuilder {
+  typedef Duration Table;
+  ::flatbuffers::FlatBufferBuilder& fbb_;
+  ::flatbuffers::uoffset_t start_;
+  void add_unit(cudf::io::parquet::flatbuf::TimeUnit unit)
+  {
+    fbb_.AddElement<int16_t>(Duration::VT_UNIT, static_cast<int16_t>(unit), 1);
+  }
+  explicit DurationBuilder(::flatbuffers::FlatBufferBuilder& _fbb) : fbb_(_fbb)
+  {
+    start_ = fbb_.StartTable();
+  }
+  ::flatbuffers::Offset<Duration> Finish()
+  {
+    const auto end = fbb_.EndTable(start_);
+    auto o         = ::flatbuffers::Offset<Duration>(end);
+    return o;
+  }
+};
+
+inline ::flatbuffers::Offset<Duration> CreateDuration(
+  ::flatbuffers::FlatBufferBuilder& _fbb,
+  cudf::io::parquet::flatbuf::TimeUnit unit = cudf::io::parquet::flatbuf::TimeUnit_MILLISECOND)
+{
+  DurationBuilder builder_(_fbb);
+  builder_.add_unit(unit);
+  return builder_.Finish();
+}
+
+/// ----------------------------------------------------------------------
+/// user defined key value pairs to add custom metadata to arrow
+/// key namespacing is the responsibility of the user
+struct KeyValue FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table {
+  typedef KeyValueBuilder Builder;
+  enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { VT_KEY = 4, VT_VALUE = 6 };
+  const ::flatbuffers::String* key() const
+  {
+    return GetPointer<const ::flatbuffers::String*>(VT_KEY);
+  }
+  const ::flatbuffers::String* value() const
+  {
+    return GetPointer<const ::flatbuffers::String*>(VT_VALUE);
+  }
+  bool Verify(::flatbuffers::Verifier& verifier) const
+  {
+    return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_KEY) &&
+           verifier.VerifyString(key()) && VerifyOffset(verifier, VT_VALUE) &&
+           verifier.VerifyString(value()) && verifier.EndTable();
+  }
+};
+
+struct KeyValueBuilder {
+  typedef KeyValue Table;
+  ::flatbuffers::FlatBufferBuilder& fbb_;
+  ::flatbuffers::uoffset_t start_;
+  void add_key(::flatbuffers::Offset<::flatbuffers::String> key)
+  {
+    fbb_.AddOffset(KeyValue::VT_KEY, key);
+  }
+  void add_value(::flatbuffers::Offset<::flatbuffers::String> value)
+  {
+    fbb_.AddOffset(KeyValue::VT_VALUE, value);
+  }
+  explicit KeyValueBuilder(::flatbuffers::FlatBufferBuilder& _fbb) : fbb_(_fbb)
+  {
+    start_ = fbb_.StartTable();
+  }
+  ::flatbuffers::Offset<KeyValue> Finish()
+  {
+    const auto end = fbb_.EndTable(start_);
+    auto o         = ::flatbuffers::Offset<KeyValue>(end);
+    return o;
+  }
+};
+
+inline ::flatbuffers::Offset<KeyValue> CreateKeyValue(
+  ::flatbuffers::FlatBufferBuilder& _fbb,
+  ::flatbuffers::Offset<::flatbuffers::String> key   = 0,
+  ::flatbuffers::Offset<::flatbuffers::String> value = 0)
+{
+  KeyValueBuilder builder_(_fbb);
+  builder_.add_value(value);
+  builder_.add_key(key);
+  return builder_.Finish();
+}
+
+inline ::flatbuffers::Offset<KeyValue> CreateKeyValueDirect(::flatbuffers::FlatBufferBuilder& _fbb,
+                                                            const char* key   = nullptr,
+                                                            const char* value = nullptr)
+{
+  auto key__   = key ? _fbb.CreateString(key) : 0;
+  auto value__ = value ? _fbb.CreateString(value) : 0;
+  return cudf::io::parquet::flatbuf::CreateKeyValue(_fbb, key__, value__);
+}
+
+struct DictionaryEncoding FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table {
+  typedef DictionaryEncodingBuilder Builder;
+  enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
+    VT_ID             = 4,
+    VT_INDEXTYPE      = 6,
+    VT_ISORDERED      = 8,
+    VT_DICTIONARYKIND = 10
+  };
+  /// The known dictionary id in the application where this data is used. In
+  /// the file or streaming formats, the dictionary ids are found in the
+  /// DictionaryBatch messages
+  int64_t id() const { return GetField<int64_t>(VT_ID, 0); }
+  /// The dictionary indices are constrained to be non-negative integers. If
+  /// this field is null, the indices must be signed int32. To maximize
+  /// cross-language compatibility and performance, implementations are
+  /// recommended to prefer signed integer types over unsigned integer types
+  /// and to avoid uint64 indices unless they are required by an application.
+  const cudf::io::parquet::flatbuf::Int* indexType() const
+  {
+    return GetPointer<const cudf::io::parquet::flatbuf::Int*>(VT_INDEXTYPE);
+  }
+  /// By default, dictionaries are not ordered, or the order does not have
+  /// semantic meaning. In some statistical, applications, dictionary-encoding
+  /// is used to represent ordered categorical data, and we provide a way to
+  /// preserve that metadata here
+  bool isOrdered() const { return GetField<uint8_t>(VT_ISORDERED, 0) != 0; }
+  cudf::io::parquet::flatbuf::DictionaryKind dictionaryKind() const
+  {
+    return static_cast<cudf::io::parquet::flatbuf::DictionaryKind>(
+      GetField<int16_t>(VT_DICTIONARYKIND, 0));
+  }
+  bool Verify(::flatbuffers::Verifier& verifier) const
+  {
+    return VerifyTableStart(verifier) && VerifyField<int64_t>(verifier, VT_ID, 8) &&
+           VerifyOffset(verifier, VT_INDEXTYPE) && verifier.VerifyTable(indexType()) &&
+           VerifyField<uint8_t>(verifier, VT_ISORDERED, 1) &&
+           VerifyField<int16_t>(verifier, VT_DICTIONARYKIND, 2) && verifier.EndTable();
+  }
+};
+
+struct DictionaryEncodingBuilder {
+  typedef DictionaryEncoding Table;
+  ::flatbuffers::FlatBufferBuilder& fbb_;
+  ::flatbuffers::uoffset_t start_;
+  void add_id(int64_t id) { fbb_.AddElement<int64_t>(DictionaryEncoding::VT_ID, id, 0); }
+  void add_indexType(::flatbuffers::Offset<cudf::io::parquet::flatbuf::Int> indexType)
+  {
+    fbb_.AddOffset(DictionaryEncoding::VT_INDEXTYPE, indexType);
+  }
+  void add_isOrdered(bool isOrdered)
+  {
+    fbb_.AddElement<uint8_t>(DictionaryEncoding::VT_ISORDERED, static_cast<uint8_t>(isOrdered), 0);
+  }
+  void add_dictionaryKind(cudf::io::parquet::flatbuf::DictionaryKind dictionaryKind)
+  {
+    fbb_.AddElement<int16_t>(
+      DictionaryEncoding::VT_DICTIONARYKIND, static_cast<int16_t>(dictionaryKind), 0);
+  }
+  explicit DictionaryEncodingBuilder(::flatbuffers::FlatBufferBuilder& _fbb) : fbb_(_fbb)
+  {
+    start_ = fbb_.StartTable();
+  }
+  ::flatbuffers::Offset<DictionaryEncoding> Finish()
+  {
+    const auto end = fbb_.EndTable(start_);
+    auto o         = ::flatbuffers::Offset<DictionaryEncoding>(end);
+    return o;
+  }
+};
+
+inline ::flatbuffers::Offset<DictionaryEncoding> CreateDictionaryEncoding(
+  ::flatbuffers::FlatBufferBuilder& _fbb,
+  int64_t id                                                       = 0,
+  ::flatbuffers::Offset<cudf::io::parquet::flatbuf::Int> indexType = 0,
+  bool isOrdered                                                   = false,
+  cudf::io::parquet::flatbuf::DictionaryKind dictionaryKind =
+    cudf::io::parquet::flatbuf::DictionaryKind_DenseArray)
+{
+  DictionaryEncodingBuilder builder_(_fbb);
+  builder_.add_id(id);
+  builder_.add_indexType(indexType);
+  builder_.add_dictionaryKind(dictionaryKind);
+  builder_.add_isOrdered(isOrdered);
+  return builder_.Finish();
+}
+
+/// ----------------------------------------------------------------------
+/// A field represents a named column in a record / row batch or child of a
+/// nested type.
+struct Field FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table {
+  typedef FieldBuilder Builder;
+  enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
+    VT_NAME            = 4,
+    VT_NULLABLE        = 6,
+    VT_TYPE_TYPE       = 8,
+    VT_TYPE            = 10,
+    VT_DICTIONARY      = 12,
+    VT_CHILDREN        = 14,
+    VT_CUSTOM_METADATA = 16
+  };
+  /// Name is not required, in i.e. a List
+  const ::flatbuffers::String* name() const
+  {
+    return GetPointer<const ::flatbuffers::String*>(VT_NAME);
+  }
+  /// Whether or not this field can contain nulls. Should be true in general.
+  bool nullable() const { return GetField<uint8_t>(VT_NULLABLE, 0) != 0; }
+  cudf::io::parquet::flatbuf::Type type_type() const
+  {
+    return static_cast<cudf::io::parquet::flatbuf::Type>(GetField<uint8_t>(VT_TYPE_TYPE, 0));
+  }
+  /// This is the type of the decoded value if the field is dictionary encoded.
+  const void* type() const { return GetPointer<const void*>(VT_TYPE); }
+  template <typename T>
+  const T* type_as() const;
+  const cudf::io::parquet::flatbuf::Null* type_as_Null() const
+  {
+    return type_type() == cudf::io::parquet::flatbuf::Type_Null
+             ? static_cast<const cudf::io::parquet::flatbuf::Null*>(type())
+             : nullptr;
+  }
+  const cudf::io::parquet::flatbuf::Int* type_as_Int() const
+  {
+    return type_type() == cudf::io::parquet::flatbuf::Type_Int
+             ? static_cast<const cudf::io::parquet::flatbuf::Int*>(type())
+             : nullptr;
+  }
+  const cudf::io::parquet::flatbuf::FloatingPoint* type_as_FloatingPoint() const
+  {
+    return type_type() == cudf::io::parquet::flatbuf::Type_FloatingPoint
+             ? static_cast<const cudf::io::parquet::flatbuf::FloatingPoint*>(type())
+             : nullptr;
+  }
+  const cudf::io::parquet::flatbuf::Binary* type_as_Binary() const
+  {
+    return type_type() == cudf::io::parquet::flatbuf::Type_Binary
+             ? static_cast<const cudf::io::parquet::flatbuf::Binary*>(type())
+             : nullptr;
+  }
+  const cudf::io::parquet::flatbuf::Utf8* type_as_Utf8() const
+  {
+    return type_type() == cudf::io::parquet::flatbuf::Type_Utf8
+             ? static_cast<const cudf::io::parquet::flatbuf::Utf8*>(type())
+             : nullptr;
+  }
+  const cudf::io::parquet::flatbuf::Bool* type_as_Bool() const
+  {
+    return type_type() == cudf::io::parquet::flatbuf::Type_Bool
+             ? static_cast<const cudf::io::parquet::flatbuf::Bool*>(type())
+             : nullptr;
+  }
+  const cudf::io::parquet::flatbuf::Decimal* type_as_Decimal() const
+  {
+    return type_type() == cudf::io::parquet::flatbuf::Type_Decimal
+             ? static_cast<const cudf::io::parquet::flatbuf::Decimal*>(type())
+             : nullptr;
+  }
+  const cudf::io::parquet::flatbuf::Date* type_as_Date() const
+  {
+    return type_type() == cudf::io::parquet::flatbuf::Type_Date
+             ? static_cast<const cudf::io::parquet::flatbuf::Date*>(type())
+             : nullptr;
+  }
+  const cudf::io::parquet::flatbuf::Time* type_as_Time() const
+  {
+    return type_type() == cudf::io::parquet::flatbuf::Type_Time
+             ? static_cast<const cudf::io::parquet::flatbuf::Time*>(type())
+             : nullptr;
+  }
+  const cudf::io::parquet::flatbuf::Timestamp* type_as_Timestamp() const
+  {
+    return type_type() == cudf::io::parquet::flatbuf::Type_Timestamp
+             ? static_cast<const cudf::io::parquet::flatbuf::Timestamp*>(type())
+             : nullptr;
+  }
+  const cudf::io::parquet::flatbuf::Interval* type_as_Interval() const
+  {
+    return type_type() == cudf::io::parquet::flatbuf::Type_Interval
+             ? static_cast<const cudf::io::parquet::flatbuf::Interval*>(type())
+             : nullptr;
+  }
+  const cudf::io::parquet::flatbuf::List* type_as_List() const
+  {
+    return type_type() == cudf::io::parquet::flatbuf::Type_List
+             ? static_cast<const cudf::io::parquet::flatbuf::List*>(type())
+             : nullptr;
+  }
+  const cudf::io::parquet::flatbuf::Struct_* type_as_Struct_() const
+  {
+    return type_type() == cudf::io::parquet::flatbuf::Type_Struct_
+             ? static_cast<const cudf::io::parquet::flatbuf::Struct_*>(type())
+             : nullptr;
+  }
+  const cudf::io::parquet::flatbuf::Union* type_as_Union() const
+  {
+    return type_type() == cudf::io::parquet::flatbuf::Type_Union
+             ? static_cast<const cudf::io::parquet::flatbuf::Union*>(type())
+             : nullptr;
+  }
+  const cudf::io::parquet::flatbuf::FixedSizeBinary* type_as_FixedSizeBinary() const
+  {
+    return type_type() == cudf::io::parquet::flatbuf::Type_FixedSizeBinary
+             ? static_cast<const cudf::io::parquet::flatbuf::FixedSizeBinary*>(type())
+             : nullptr;
+  }
+  const cudf::io::parquet::flatbuf::FixedSizeList* type_as_FixedSizeList() const
+  {
+    return type_type() == cudf::io::parquet::flatbuf::Type_FixedSizeList
+             ? static_cast<const cudf::io::parquet::flatbuf::FixedSizeList*>(type())
+             : nullptr;
+  }
+  const cudf::io::parquet::flatbuf::Map* type_as_Map() const
+  {
+    return type_type() == cudf::io::parquet::flatbuf::Type_Map
+             ? static_cast<const cudf::io::parquet::flatbuf::Map*>(type())
+             : nullptr;
+  }
+  const cudf::io::parquet::flatbuf::Duration* type_as_Duration() const
+  {
+    return type_type() == cudf::io::parquet::flatbuf::Type_Duration
+             ? static_cast<const cudf::io::parquet::flatbuf::Duration*>(type())
+             : nullptr;
+  }
+  const cudf::io::parquet::flatbuf::LargeBinary* type_as_LargeBinary() const
+  {
+    return type_type() == cudf::io::parquet::flatbuf::Type_LargeBinary
+             ? static_cast<const cudf::io::parquet::flatbuf::LargeBinary*>(type())
+             : nullptr;
+  }
+  const cudf::io::parquet::flatbuf::LargeUtf8* type_as_LargeUtf8() const
+  {
+    return type_type() == cudf::io::parquet::flatbuf::Type_LargeUtf8
+             ? static_cast<const cudf::io::parquet::flatbuf::LargeUtf8*>(type())
+             : nullptr;
+  }
+  const cudf::io::parquet::flatbuf::LargeList* type_as_LargeList() const
+  {
+    return type_type() == cudf::io::parquet::flatbuf::Type_LargeList
+             ? static_cast<const cudf::io::parquet::flatbuf::LargeList*>(type())
+             : nullptr;
+  }
+  const cudf::io::parquet::flatbuf::RunEndEncoded* type_as_RunEndEncoded() const
+  {
+    return type_type() == cudf::io::parquet::flatbuf::Type_RunEndEncoded
+             ? static_cast<const cudf::io::parquet::flatbuf::RunEndEncoded*>(type())
+             : nullptr;
+  }
+  const cudf::io::parquet::flatbuf::BinaryView* type_as_BinaryView() const
+  {
+    return type_type() == cudf::io::parquet::flatbuf::Type_BinaryView
+             ? static_cast<const cudf::io::parquet::flatbuf::BinaryView*>(type())
+             : nullptr;
+  }
+  const cudf::io::parquet::flatbuf::Utf8View* type_as_Utf8View() const
+  {
+    return type_type() == cudf::io::parquet::flatbuf::Type_Utf8View
+             ? static_cast<const cudf::io::parquet::flatbuf::Utf8View*>(type())
+             : nullptr;
+  }
+  const cudf::io::parquet::flatbuf::ListView* type_as_ListView() const
+  {
+    return type_type() == cudf::io::parquet::flatbuf::Type_ListView
+             ? static_cast<const cudf::io::parquet::flatbuf::ListView*>(type())
+             : nullptr;
+  }
+  const cudf::io::parquet::flatbuf::LargeListView* type_as_LargeListView() const
+  {
+    return type_type() == cudf::io::parquet::flatbuf::Type_LargeListView
+             ? static_cast<const cudf::io::parquet::flatbuf::LargeListView*>(type())
+             : nullptr;
+  }
+  /// Present only if the field is dictionary encoded.
+  const cudf::io::parquet::flatbuf::DictionaryEncoding* dictionary() const
+  {
+    return GetPointer<const cudf::io::parquet::flatbuf::DictionaryEncoding*>(VT_DICTIONARY);
+  }
+  /// children apply only to nested data types like Struct, List and Union. For
+  /// primitive types children will have length 0.
+  const ::flatbuffers::Vector<::flatbuffers::Offset<cudf::io::parquet::flatbuf::Field>>* children()
+    const
+  {
+    return GetPointer<
+      const ::flatbuffers::Vector<::flatbuffers::Offset<cudf::io::parquet::flatbuf::Field>>*>(
+      VT_CHILDREN);
+  }
+  /// User-defined metadata
+  const ::flatbuffers::Vector<::flatbuffers::Offset<cudf::io::parquet::flatbuf::KeyValue>>*
+  custom_metadata() const
+  {
+    return GetPointer<
+      const ::flatbuffers::Vector<::flatbuffers::Offset<cudf::io::parquet::flatbuf::KeyValue>>*>(
+      VT_CUSTOM_METADATA);
+  }
+  bool Verify(::flatbuffers::Verifier& verifier) const
+  {
+    return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_NAME) &&
+           verifier.VerifyString(name()) && VerifyField<uint8_t>(verifier, VT_NULLABLE, 1) &&
+           VerifyField<uint8_t>(verifier, VT_TYPE_TYPE, 1) && VerifyOffset(verifier, VT_TYPE) &&
+           VerifyType(verifier, type(), type_type()) && VerifyOffset(verifier, VT_DICTIONARY) &&
+           verifier.VerifyTable(dictionary()) && VerifyOffset(verifier, VT_CHILDREN) &&
+           verifier.VerifyVector(children()) && verifier.VerifyVectorOfTables(children()) &&
+           VerifyOffset(verifier, VT_CUSTOM_METADATA) && verifier.VerifyVector(custom_metadata()) &&
+           verifier.VerifyVectorOfTables(custom_metadata()) && verifier.EndTable();
+  }
+};
+
+template <>
+inline const cudf::io::parquet::flatbuf::Null* Field::type_as<cudf::io::parquet::flatbuf::Null>()
+  const
+{
+  return type_as_Null();
+}
+
+template <>
+inline const cudf::io::parquet::flatbuf::Int* Field::type_as<cudf::io::parquet::flatbuf::Int>()
+  const
+{
+  return type_as_Int();
+}
+
+template <>
+inline const cudf::io::parquet::flatbuf::FloatingPoint*
+Field::type_as<cudf::io::parquet::flatbuf::FloatingPoint>() const
+{
+  return type_as_FloatingPoint();
+}
+
+template <>
+inline const cudf::io::parquet::flatbuf::Binary*
+Field::type_as<cudf::io::parquet::flatbuf::Binary>() const
+{
+  return type_as_Binary();
+}
+
+template <>
+inline const cudf::io::parquet::flatbuf::Utf8* Field::type_as<cudf::io::parquet::flatbuf::Utf8>()
+  const
+{
+  return type_as_Utf8();
+}
+
+template <>
+inline const cudf::io::parquet::flatbuf::Bool* Field::type_as<cudf::io::parquet::flatbuf::Bool>()
+  const
+{
+  return type_as_Bool();
+}
+
+template <>
+inline const cudf::io::parquet::flatbuf::Decimal*
+Field::type_as<cudf::io::parquet::flatbuf::Decimal>() const
+{
+  return type_as_Decimal();
+}
+
+template <>
+inline const cudf::io::parquet::flatbuf::Date* Field::type_as<cudf::io::parquet::flatbuf::Date>()
+  const
+{
+  return type_as_Date();
+}
+
+template <>
+inline const cudf::io::parquet::flatbuf::Time* Field::type_as<cudf::io::parquet::flatbuf::Time>()
+  const
+{
+  return type_as_Time();
+}
+
+template <>
+inline const cudf::io::parquet::flatbuf::Timestamp*
+Field::type_as<cudf::io::parquet::flatbuf::Timestamp>() const
+{
+  return type_as_Timestamp();
+}
+
+template <>
+inline const cudf::io::parquet::flatbuf::Interval*
+Field::type_as<cudf::io::parquet::flatbuf::Interval>() const
+{
+  return type_as_Interval();
+}
+
+template <>
+inline const cudf::io::parquet::flatbuf::List* Field::type_as<cudf::io::parquet::flatbuf::List>()
+  const
+{
+  return type_as_List();
+}
+
+template <>
+inline const cudf::io::parquet::flatbuf::Struct_*
+Field::type_as<cudf::io::parquet::flatbuf::Struct_>() const
+{
+  return type_as_Struct_();
+}
+
+template <>
+inline const cudf::io::parquet::flatbuf::Union* Field::type_as<cudf::io::parquet::flatbuf::Union>()
+  const
+{
+  return type_as_Union();
+}
+
+template <>
+inline const cudf::io::parquet::flatbuf::FixedSizeBinary*
+Field::type_as<cudf::io::parquet::flatbuf::FixedSizeBinary>() const
+{
+  return type_as_FixedSizeBinary();
+}
+
+template <>
+inline const cudf::io::parquet::flatbuf::FixedSizeList*
+Field::type_as<cudf::io::parquet::flatbuf::FixedSizeList>() const
+{
+  return type_as_FixedSizeList();
+}
+
+template <>
+inline const cudf::io::parquet::flatbuf::Map* Field::type_as<cudf::io::parquet::flatbuf::Map>()
+  const
+{
+  return type_as_Map();
+}
+
+template <>
+inline const cudf::io::parquet::flatbuf::Duration*
+Field::type_as<cudf::io::parquet::flatbuf::Duration>() const
+{
+  return type_as_Duration();
+}
+
+template <>
+inline const cudf::io::parquet::flatbuf::LargeBinary*
+Field::type_as<cudf::io::parquet::flatbuf::LargeBinary>() const
+{
+  return type_as_LargeBinary();
+}
+
+template <>
+inline const cudf::io::parquet::flatbuf::LargeUtf8*
+Field::type_as<cudf::io::parquet::flatbuf::LargeUtf8>() const
+{
+  return type_as_LargeUtf8();
+}
+
+template <>
+inline const cudf::io::parquet::flatbuf::LargeList*
+Field::type_as<cudf::io::parquet::flatbuf::LargeList>() const
+{
+  return type_as_LargeList();
+}
+
+template <>
+inline const cudf::io::parquet::flatbuf::RunEndEncoded*
+Field::type_as<cudf::io::parquet::flatbuf::RunEndEncoded>() const
+{
+  return type_as_RunEndEncoded();
+}
+
+template <>
+inline const cudf::io::parquet::flatbuf::BinaryView*
+Field::type_as<cudf::io::parquet::flatbuf::BinaryView>() const
+{
+  return type_as_BinaryView();
+}
+
+template <>
+inline const cudf::io::parquet::flatbuf::Utf8View*
+Field::type_as<cudf::io::parquet::flatbuf::Utf8View>() const
+{
+  return type_as_Utf8View();
+}
+
+template <>
+inline const cudf::io::parquet::flatbuf::ListView*
+Field::type_as<cudf::io::parquet::flatbuf::ListView>() const
+{
+  return type_as_ListView();
+}
+
+template <>
+inline const cudf::io::parquet::flatbuf::LargeListView*
+Field::type_as<cudf::io::parquet::flatbuf::LargeListView>() const
+{
+  return type_as_LargeListView();
+}
+
+struct FieldBuilder {
+  typedef Field Table;
+  ::flatbuffers::FlatBufferBuilder& fbb_;
+  ::flatbuffers::uoffset_t start_;
+  void add_name(::flatbuffers::Offset<::flatbuffers::String> name)
+  {
+    fbb_.AddOffset(Field::VT_NAME, name);
+  }
+  void add_nullable(bool nullable)
+  {
+    fbb_.AddElement<uint8_t>(Field::VT_NULLABLE, static_cast<uint8_t>(nullable), 0);
+  }
+  void add_type_type(cudf::io::parquet::flatbuf::Type type_type)
+  {
+    fbb_.AddElement<uint8_t>(Field::VT_TYPE_TYPE, static_cast<uint8_t>(type_type), 0);
+  }
+  void add_type(::flatbuffers::Offset<void> type) { fbb_.AddOffset(Field::VT_TYPE, type); }
+  void add_dictionary(
+    ::flatbuffers::Offset<cudf::io::parquet::flatbuf::DictionaryEncoding> dictionary)
+  {
+    fbb_.AddOffset(Field::VT_DICTIONARY, dictionary);
+  }
+  void add_children(
+    ::flatbuffers::Offset<
+      ::flatbuffers::Vector<::flatbuffers::Offset<cudf::io::parquet::flatbuf::Field>>> children)
+  {
+    fbb_.AddOffset(Field::VT_CHILDREN, children);
+  }
+  void add_custom_metadata(
+    ::flatbuffers::Offset<
+      ::flatbuffers::Vector<::flatbuffers::Offset<cudf::io::parquet::flatbuf::KeyValue>>>
+      custom_metadata)
+  {
+    fbb_.AddOffset(Field::VT_CUSTOM_METADATA, custom_metadata);
+  }
+  explicit FieldBuilder(::flatbuffers::FlatBufferBuilder& _fbb) : fbb_(_fbb)
+  {
+    start_ = fbb_.StartTable();
+  }
+  ::flatbuffers::Offset<Field> Finish()
+  {
+    const auto end = fbb_.EndTable(start_);
+    auto o         = ::flatbuffers::Offset<Field>(end);
+    return o;
+  }
+};
+
+inline ::flatbuffers::Offset<Field> CreateField(
+  ::flatbuffers::FlatBufferBuilder& _fbb,
+  ::flatbuffers::Offset<::flatbuffers::String> name = 0,
+  bool nullable                                     = false,
+  cudf::io::parquet::flatbuf::Type type_type        = cudf::io::parquet::flatbuf::Type_NONE,
+  ::flatbuffers::Offset<void> type                  = 0,
+  ::flatbuffers::Offset<cudf::io::parquet::flatbuf::DictionaryEncoding> dictionary = 0,
+  ::flatbuffers::Offset<
+    ::flatbuffers::Vector<::flatbuffers::Offset<cudf::io::parquet::flatbuf::Field>>> children = 0,
+  ::flatbuffers::Offset<::flatbuffers::Vector<
+    ::flatbuffers::Offset<cudf::io::parquet::flatbuf::KeyValue>>> custom_metadata             = 0)
+{
+  FieldBuilder builder_(_fbb);
+  builder_.add_custom_metadata(custom_metadata);
+  builder_.add_children(children);
+  builder_.add_dictionary(dictionary);
+  builder_.add_type(type);
+  builder_.add_name(name);
+  builder_.add_type_type(type_type);
+  builder_.add_nullable(nullable);
+  return builder_.Finish();
+}
+
+inline ::flatbuffers::Offset<Field> CreateFieldDirect(
+  ::flatbuffers::FlatBufferBuilder& _fbb,
+  const char* name                           = nullptr,
+  bool nullable                              = false,
+  cudf::io::parquet::flatbuf::Type type_type = cudf::io::parquet::flatbuf::Type_NONE,
+  ::flatbuffers::Offset<void> type           = 0,
+  ::flatbuffers::Offset<cudf::io::parquet::flatbuf::DictionaryEncoding> dictionary      = 0,
+  const std::vector<::flatbuffers::Offset<cudf::io::parquet::flatbuf::Field>>* children = nullptr,
+  const std::vector<::flatbuffers::Offset<cudf::io::parquet::flatbuf::KeyValue>>* custom_metadata =
+    nullptr)
+{
+  auto name__ = name ? _fbb.CreateString(name) : 0;
+  auto children__ =
+    children
+      ? _fbb.CreateVector<::flatbuffers::Offset<cudf::io::parquet::flatbuf::Field>>(*children)
+      : 0;
+  auto custom_metadata__ =
+    custom_metadata
+      ? _fbb.CreateVector<::flatbuffers::Offset<cudf::io::parquet::flatbuf::KeyValue>>(
+          *custom_metadata)
+      : 0;
+  return cudf::io::parquet::flatbuf::CreateField(
+    _fbb, name__, nullable, type_type, type, dictionary, children__, custom_metadata__);
+}
+
+/// ----------------------------------------------------------------------
+/// A Schema describes the columns in a row batch
+struct Schema FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table {
+  typedef SchemaBuilder Builder;
+  enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE {
+    VT_ENDIANNESS      = 4,
+    VT_FIELDS          = 6,
+    VT_CUSTOM_METADATA = 8,
+    VT_FEATURES        = 10
+  };
+  /// endianness of the buffer
+  /// it is Little Endian by default
+  /// if endianness doesn't match the underlying system then the vectors need to be converted
+  cudf::io::parquet::flatbuf::Endianness endianness() const
+  {
+    return static_cast<cudf::io::parquet::flatbuf::Endianness>(GetField<int16_t>(VT_ENDIANNESS, 0));
+  }
+  const ::flatbuffers::Vector<::flatbuffers::Offset<cudf::io::parquet::flatbuf::Field>>* fields()
+    const
+  {
+    return GetPointer<
+      const ::flatbuffers::Vector<::flatbuffers::Offset<cudf::io::parquet::flatbuf::Field>>*>(
+      VT_FIELDS);
+  }
+  const ::flatbuffers::Vector<::flatbuffers::Offset<cudf::io::parquet::flatbuf::KeyValue>>*
+  custom_metadata() const
+  {
+    return GetPointer<
+      const ::flatbuffers::Vector<::flatbuffers::Offset<cudf::io::parquet::flatbuf::KeyValue>>*>(
+      VT_CUSTOM_METADATA);
+  }
+  /// Features used in the stream/file.
+  const ::flatbuffers::Vector<int64_t>* features() const
+  {
+    return GetPointer<const ::flatbuffers::Vector<int64_t>*>(VT_FEATURES);
+  }
+  bool Verify(::flatbuffers::Verifier& verifier) const
+  {
+    return VerifyTableStart(verifier) && VerifyField<int16_t>(verifier, VT_ENDIANNESS, 2) &&
+           VerifyOffset(verifier, VT_FIELDS) && verifier.VerifyVector(fields()) &&
+           verifier.VerifyVectorOfTables(fields()) && VerifyOffset(verifier, VT_CUSTOM_METADATA) &&
+           verifier.VerifyVector(custom_metadata()) &&
+           verifier.VerifyVectorOfTables(custom_metadata()) &&
+           VerifyOffset(verifier, VT_FEATURES) && verifier.VerifyVector(features()) &&
+           verifier.EndTable();
+  }
+};
+
+struct SchemaBuilder {
+  typedef Schema Table;
+  ::flatbuffers::FlatBufferBuilder& fbb_;
+  ::flatbuffers::uoffset_t start_;
+  void add_endianness(cudf::io::parquet::flatbuf::Endianness endianness)
+  {
+    fbb_.AddElement<int16_t>(Schema::VT_ENDIANNESS, static_cast<int16_t>(endianness), 0);
+  }
+  void add_fields(
+    ::flatbuffers::Offset<
+      ::flatbuffers::Vector<::flatbuffers::Offset<cudf::io::parquet::flatbuf::Field>>> fields)
+  {
+    fbb_.AddOffset(Schema::VT_FIELDS, fields);
+  }
+  void add_custom_metadata(
+    ::flatbuffers::Offset<
+      ::flatbuffers::Vector<::flatbuffers::Offset<cudf::io::parquet::flatbuf::KeyValue>>>
+      custom_metadata)
+  {
+    fbb_.AddOffset(Schema::VT_CUSTOM_METADATA, custom_metadata);
+  }
+  void add_features(::flatbuffers::Offset<::flatbuffers::Vector<int64_t>> features)
+  {
+    fbb_.AddOffset(Schema::VT_FEATURES, features);
+  }
+  explicit SchemaBuilder(::flatbuffers::FlatBufferBuilder& _fbb) : fbb_(_fbb)
+  {
+    start_ = fbb_.StartTable();
+  }
+  ::flatbuffers::Offset<Schema> Finish()
+  {
+    const auto end = fbb_.EndTable(start_);
+    auto o         = ::flatbuffers::Offset<Schema>(end);
+    return o;
+  }
+};
+
+inline ::flatbuffers::Offset<Schema> CreateSchema(
+  ::flatbuffers::FlatBufferBuilder& _fbb,
+  cudf::io::parquet::flatbuf::Endianness endianness = cudf::io::parquet::flatbuf::Endianness_Little,
+  ::flatbuffers::Offset<
+    ::flatbuffers::Vector<::flatbuffers::Offset<cudf::io::parquet::flatbuf::Field>>> fields = 0,
+  ::flatbuffers::Offset<::flatbuffers::Vector<
+    ::flatbuffers::Offset<cudf::io::parquet::flatbuf::KeyValue>>> custom_metadata           = 0,
+  ::flatbuffers::Offset<::flatbuffers::Vector<int64_t>> features                            = 0)
+{
+  SchemaBuilder builder_(_fbb);
+  builder_.add_features(features);
+  builder_.add_custom_metadata(custom_metadata);
+  builder_.add_fields(fields);
+  builder_.add_endianness(endianness);
+  return builder_.Finish();
+}
+
+inline ::flatbuffers::Offset<Schema> CreateSchemaDirect(
+  ::flatbuffers::FlatBufferBuilder& _fbb,
+  cudf::io::parquet::flatbuf::Endianness endianness = cudf::io::parquet::flatbuf::Endianness_Little,
+  const std::vector<::flatbuffers::Offset<cudf::io::parquet::flatbuf::Field>>* fields = nullptr,
+  const std::vector<::flatbuffers::Offset<cudf::io::parquet::flatbuf::KeyValue>>* custom_metadata =
+    nullptr,
+  const std::vector<int64_t>* features = nullptr)
+{
+  auto fields__ =
+    fields ? _fbb.CreateVector<::flatbuffers::Offset<cudf::io::parquet::flatbuf::Field>>(*fields)
+           : 0;
+  auto custom_metadata__ =
+    custom_metadata
+      ? _fbb.CreateVector<::flatbuffers::Offset<cudf::io::parquet::flatbuf::KeyValue>>(
+          *custom_metadata)
+      : 0;
+  auto features__ = features ? _fbb.CreateVector<int64_t>(*features) : 0;
+  return cudf::io::parquet::flatbuf::CreateSchema(
+    _fbb, endianness, fields__, custom_metadata__, features__);
+}
+
+inline bool VerifyType(::flatbuffers::Verifier& verifier, const void* obj, Type type)
+{
+  switch (type) {
+    case Type_NONE: {
+      return true;
+    }
+    case Type_Null: {
+      auto ptr = reinterpret_cast<const cudf::io::parquet::flatbuf::Null*>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case Type_Int: {
+      auto ptr = reinterpret_cast<const cudf::io::parquet::flatbuf::Int*>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case Type_FloatingPoint: {
+      auto ptr = reinterpret_cast<const cudf::io::parquet::flatbuf::FloatingPoint*>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case Type_Binary: {
+      auto ptr = reinterpret_cast<const cudf::io::parquet::flatbuf::Binary*>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case Type_Utf8: {
+      auto ptr = reinterpret_cast<const cudf::io::parquet::flatbuf::Utf8*>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case Type_Bool: {
+      auto ptr = reinterpret_cast<const cudf::io::parquet::flatbuf::Bool*>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case Type_Decimal: {
+      auto ptr = reinterpret_cast<const cudf::io::parquet::flatbuf::Decimal*>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case Type_Date: {
+      auto ptr = reinterpret_cast<const cudf::io::parquet::flatbuf::Date*>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case Type_Time: {
+      auto ptr = reinterpret_cast<const cudf::io::parquet::flatbuf::Time*>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case Type_Timestamp: {
+      auto ptr = reinterpret_cast<const cudf::io::parquet::flatbuf::Timestamp*>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case Type_Interval: {
+      auto ptr = reinterpret_cast<const cudf::io::parquet::flatbuf::Interval*>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case Type_List: {
+      auto ptr = reinterpret_cast<const cudf::io::parquet::flatbuf::List*>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case Type_Struct_: {
+      auto ptr = reinterpret_cast<const cudf::io::parquet::flatbuf::Struct_*>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case Type_Union: {
+      auto ptr = reinterpret_cast<const cudf::io::parquet::flatbuf::Union*>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case Type_FixedSizeBinary: {
+      auto ptr = reinterpret_cast<const cudf::io::parquet::flatbuf::FixedSizeBinary*>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case Type_FixedSizeList: {
+      auto ptr = reinterpret_cast<const cudf::io::parquet::flatbuf::FixedSizeList*>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case Type_Map: {
+      auto ptr = reinterpret_cast<const cudf::io::parquet::flatbuf::Map*>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case Type_Duration: {
+      auto ptr = reinterpret_cast<const cudf::io::parquet::flatbuf::Duration*>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case Type_LargeBinary: {
+      auto ptr = reinterpret_cast<const cudf::io::parquet::flatbuf::LargeBinary*>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case Type_LargeUtf8: {
+      auto ptr = reinterpret_cast<const cudf::io::parquet::flatbuf::LargeUtf8*>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case Type_LargeList: {
+      auto ptr = reinterpret_cast<const cudf::io::parquet::flatbuf::LargeList*>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case Type_RunEndEncoded: {
+      auto ptr = reinterpret_cast<const cudf::io::parquet::flatbuf::RunEndEncoded*>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case Type_BinaryView: {
+      auto ptr = reinterpret_cast<const cudf::io::parquet::flatbuf::BinaryView*>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case Type_Utf8View: {
+      auto ptr = reinterpret_cast<const cudf::io::parquet::flatbuf::Utf8View*>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case Type_ListView: {
+      auto ptr = reinterpret_cast<const cudf::io::parquet::flatbuf::ListView*>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    case Type_LargeListView: {
+      auto ptr = reinterpret_cast<const cudf::io::parquet::flatbuf::LargeListView*>(obj);
+      return verifier.VerifyTable(ptr);
+    }
+    default: return true;
+  }
+}
+
+inline bool VerifyTypeVector(::flatbuffers::Verifier& verifier,
+                             const ::flatbuffers::Vector<::flatbuffers::Offset<void>>* values,
+                             const ::flatbuffers::Vector<uint8_t>* types)
+{
+  if (!values || !types) return !values && !types;
+  if (values->size() != types->size()) return false;
+  for (::flatbuffers::uoffset_t i = 0; i < values->size(); ++i) {
+    if (!VerifyType(verifier, values->Get(i), types->GetEnum<Type>(i))) { return false; }
+  }
+  return true;
+}
+
+inline const cudf::io::parquet::flatbuf::Schema* GetSchema(const void* buf)
+{
+  return ::flatbuffers::GetRoot<cudf::io::parquet::flatbuf::Schema>(buf);
+}
+
+inline const cudf::io::parquet::flatbuf::Schema* GetSizePrefixedSchema(const void* buf)
+{
+  return ::flatbuffers::GetSizePrefixedRoot<cudf::io::parquet::flatbuf::Schema>(buf);
+}
+
+inline bool VerifySchemaBuffer(::flatbuffers::Verifier& verifier)
+{
+  return verifier.VerifyBuffer<cudf::io::parquet::flatbuf::Schema>(nullptr);
+}
+
+inline bool VerifySizePrefixedSchemaBuffer(::flatbuffers::Verifier& verifier)
+{
+  return verifier.VerifySizePrefixedBuffer<cudf::io::parquet::flatbuf::Schema>(nullptr);
+}
+
+inline void FinishSchemaBuffer(::flatbuffers::FlatBufferBuilder& fbb,
+                               ::flatbuffers::Offset<cudf::io::parquet::flatbuf::Schema> root)
+{
+  fbb.Finish(root);
+}
+
+inline void FinishSizePrefixedSchemaBuffer(
+  ::flatbuffers::FlatBufferBuilder& fbb,
+  ::flatbuffers::Offset<cudf::io::parquet::flatbuf::Schema> root)
+{
+  fbb.FinishSizePrefixed(root);
+}
+
+}  // namespace flatbuf
+}  // namespace parquet
+}  // namespace io
+}  // namespace cudf
+
+#endif  // FLATBUFFERS_GENERATED_SCHEMA_CUDF_IO_PARQUET_FLATBUF_H_
diff --git a/cpp/src/io/parquet/ipc/schema/Message.fbs b/cpp/src/io/parquet/ipc/schema/Message.fbs
new file mode 100644
index 00000000000..25534410597
--- /dev/null
+++ b/cpp/src/io/parquet/ipc/schema/Message.fbs
@@ -0,0 +1,176 @@
+//
+// Copyright (c) 2024, NVIDIA CORPORATION.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+//
+// Portions of this file are derived from Apache's Arrow project at
+// https://github.com/apache/arrow, original license text below.
+//
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+include "Schema.fbs";
+
+namespace cudf.io.parquet.flatbuf;
+
+/// ----------------------------------------------------------------------
+/// Data structures for describing a table row batch (a collection of
+/// equal-length Arrow arrays)
+
+/// Metadata about a field at some level of a nested type tree (but not
+/// its children).
+///
+/// For example, a List<Int16> with values `[[1, 2, 3], null, [4], [5, 6], null]`
+/// would have {length: 5, null_count: 2} for its List node, and {length: 6,
+/// null_count: 0} for its Int16 node, as separate FieldNode structs
+struct FieldNode {
+  /// The number of value slots in the Arrow array at this level of a nested
+  /// tree
+  length: long;
+
+  /// The number of observed nulls. Fields with null_count == 0 may choose not
+  /// to write their physical validity bitmap out as a materialized buffer,
+  /// instead setting the length of the bitmap buffer to 0.
+  null_count: long;
+}
+
+enum CompressionType:byte {
+  // LZ4 frame format, for portability, as provided by lz4frame.h or wrappers
+  // thereof. Not to be confused with "raw" (also called "block") format
+  // provided by lz4.h
+  LZ4_FRAME,
+
+  // Zstandard
+  ZSTD
+}
+
+/// Provided for forward compatibility in case we need to support different
+/// strategies for compressing the IPC message body (like whole-body
+/// compression rather than buffer-level) in the future
+enum BodyCompressionMethod:byte {
+  /// Each constituent buffer is first compressed with the indicated
+  /// compressor, and then written with the uncompressed length in the first 8
+  /// bytes as a 64-bit little-endian signed integer followed by the compressed
+  /// buffer bytes (and then padding as required by the protocol). The
+  /// uncompressed length may be set to -1 to indicate that the data that
+  /// follows is not compressed, which can be useful for cases where
+  /// compression does not yield appreciable savings.
+  BUFFER
+}
+
+/// Optional compression for the memory buffers constituting IPC message
+/// bodies. Intended for use with RecordBatch but could be used for other
+/// message types
+table BodyCompression {
+  /// Compressor library.
+  /// For LZ4_FRAME, each compressed buffer must consist of a single frame.
+  codec: CompressionType = LZ4_FRAME;
+
+  /// Indicates the way the record batch body was compressed
+  method: BodyCompressionMethod = BUFFER;
+}
+
+/// A data header describing the shared memory layout of a "record" or "row"
+/// batch. Some systems call this a "row batch" internally and others a "record
+/// batch".
+table RecordBatch {
+  /// number of records / rows. The arrays in the batch should all have this
+  /// length
+  length: long;
+
+  /// Nodes correspond to the pre-ordered flattened logical schema
+  nodes: [FieldNode];
+
+  /// Buffers correspond to the pre-ordered flattened buffer tree
+  ///
+  /// The number of buffers appended to this list depends on the schema. For
+  /// example, most primitive arrays will have 2 buffers, 1 for the validity
+  /// bitmap and 1 for the values. For struct arrays, there will only be a
+  /// single buffer for the validity (nulls) bitmap
+  buffers: [Buffer];
+
+  /// Optional compression of the message body
+  compression: BodyCompression;
+
+  /// Some types such as Utf8View are represented using a variable number of buffers.
+  /// For each such Field in the pre-ordered flattened logical schema, there will be
+  /// an entry in variadicBufferCounts to indicate the number of number of variadic
+  /// buffers which belong to that Field in the current RecordBatch.
+  ///
+  /// For example, the schema
+  ///     col1: Struct<alpha: Int32, beta: BinaryView, gamma: Float64>
+  ///     col2: Utf8View
+  /// contains two Fields with variadic buffers so variadicBufferCounts will have
+  /// two entries, the first counting the variadic buffers of `col1.beta` and the
+  /// second counting `col2`'s.
+  ///
+  /// This field may be omitted if and only if the schema contains no Fields with
+  /// a variable number of buffers, such as BinaryView and Utf8View.
+  variadicBufferCounts: [long];
+}
+
+/// For sending dictionary encoding information. Any Field can be
+/// dictionary-encoded, but in this case none of its children may be
+/// dictionary-encoded.
+/// There is one vector / column per dictionary, but that vector / column
+/// may be spread across multiple dictionary batches by using the isDelta
+/// flag
+
+table DictionaryBatch {
+  id: long;
+  data: RecordBatch;
+
+  /// If isDelta is true the values in the dictionary are to be appended to a
+  /// dictionary with the indicated id. If isDelta is false this dictionary
+  /// should replace the existing dictionary.
+  isDelta: bool = false;
+}
+
+/// ----------------------------------------------------------------------
+/// The root Message type
+
+/// This union enables us to easily send different message types without
+/// redundant storage, and in the future we can easily add new message types.
+///
+/// Arrow implementations do not need to implement all of the message types,
+/// which may include experimental metadata types. For maximum compatibility,
+/// it is best to send data using RecordBatch
+union MessageHeader {
+  Schema
+}
+
+table Message {
+  version: cudf.io.parquet.flatbuf.MetadataVersion;
+  header: MessageHeader;
+  bodyLength: long;
+  custom_metadata: [ KeyValue ];
+}
+
+root_type Message;
diff --git a/cpp/src/io/parquet/ipc/schema/Schema.fbs b/cpp/src/io/parquet/ipc/schema/Schema.fbs
new file mode 100644
index 00000000000..5f66e7bbd5e
--- /dev/null
+++ b/cpp/src/io/parquet/ipc/schema/Schema.fbs
@@ -0,0 +1,591 @@
+//
+// Copyright (c) 2024, NVIDIA CORPORATION.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+//
+// Portions of this file are derived from Apache's Arrow project at
+// https://github.com/apache/arrow, original license text below.
+//
+
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+/// Logical types, vector layouts, and schemas
+
+/// Format Version History.
+/// Version 1.0 - Forward and backwards compatibility guaranteed.
+/// Version 1.1 - Add Decimal256.
+/// Version 1.2 - Add Interval MONTH_DAY_NANO.
+/// Version 1.3 - Add Run-End Encoded.
+/// Version 1.4 - Add BinaryView, Utf8View, variadicBufferCounts, ListView, and
+/// LargeListView.
+
+namespace cudf.io.parquet.flatbuf;
+
+enum MetadataVersion:short {
+  /// 0.1.0 (October 2016).
+  V1,
+
+  /// 0.2.0 (February 2017). Non-backwards compatible with V1.
+  V2,
+
+  /// 0.3.0 -> 0.7.1 (May - December 2017). Non-backwards compatible with V2.
+  V3,
+
+  /// >= 0.8.0 (December 2017). Non-backwards compatible with V3.
+  V4,
+
+  /// >= 1.0.0 (July 2020). Backwards compatible with V4 (V5 readers can read V4
+  /// metadata and IPC messages). Implementations are recommended to provide a
+  /// V4 compatibility mode with V5 format changes disabled.
+  ///
+  /// Incompatible changes between V4 and V5:
+  /// - Union buffer layout has changed. In V5, Unions don't have a validity
+  ///   bitmap buffer.
+  V5,
+}
+
+/// Represents Arrow Features that might not have full support
+/// within implementations. This is intended to be used in
+/// two scenarios:
+///  1.  A mechanism for readers of Arrow Streams
+///      and files to understand that the stream or file makes
+///      use of a feature that isn't supported or unknown to
+///      the implementation (and therefore can meet the Arrow
+///      forward compatibility guarantees).
+///  2.  A means of negotiating between a client and server
+///      what features a stream is allowed to use. The enums
+///      values here are intended to represent higher level
+///      features, additional details maybe negotiated
+///      with key-value pairs specific to the protocol.
+///
+/// Enums added to this list should be assigned power-of-two values
+/// to facilitate exchanging and comparing bitmaps for supported
+/// features.
+enum Feature : long {
+  /// Needed to make flatbuffers happy.
+  UNUSED = 0,
+  /// The stream makes use of multiple full dictionaries with the
+  /// same ID and assumes clients implement dictionary replacement
+  /// correctly.
+  DICTIONARY_REPLACEMENT = 1,
+  /// The stream makes use of compressed bodies as described
+  /// in Message.fbs.
+  COMPRESSED_BODY = 2
+}
+
+/// These are stored in the flatbuffer in the Type union below
+
+table Null {
+}
+
+/// A Struct_ in the flatbuffer metadata is the same as an Arrow Struct
+/// (according to the physical memory layout). We used Struct_ here as
+/// Struct is a reserved word in Flatbuffers
+table Struct_ {
+}
+
+table List {
+}
+
+/// Same as List, but with 64-bit offsets, allowing to represent
+/// extremely large data values.
+table LargeList {
+}
+
+/// Represents the same logical types that List can, but contains offsets and
+/// sizes allowing for writes in any order and sharing of child values among
+/// list values.
+table ListView {
+}
+
+/// Same as ListView, but with 64-bit offsets and sizes, allowing to represent
+/// extremely large data values.
+table LargeListView {
+}
+
+table FixedSizeList {
+  /// Number of list items per value
+  listSize: int;
+}
+
+/// A Map is a logical nested type that is represented as
+///
+/// List<entries: Struct<key: K, value: V>>
+///
+/// In this layout, the keys and values are each respectively contiguous. We do
+/// not constrain the key and value types, so the application is responsible
+/// for ensuring that the keys are hashable and unique. Whether the keys are sorted
+/// may be set in the metadata for this field.
+///
+/// In a field with Map type, the field has a child Struct field, which then
+/// has two children: key type and the second the value type. The names of the
+/// child fields may be respectively "entries", "key", and "value", but this is
+/// not enforced.
+///
+/// Map
+/// ```text
+///   - child[0] entries: Struct
+///     - child[0] key: K
+///     - child[1] value: V
+/// ```
+/// Neither the "entries" field nor the "key" field may be nullable.
+///
+/// The metadata is structured so that Arrow systems without special handling
+/// for Map can make Map an alias for List. The "layout" attribute for the Map
+/// field must have the same contents as a List.
+table Map {
+  /// Set to true if the keys within each value are sorted
+  keysSorted: bool;
+}
+
+enum UnionMode:short { Sparse, Dense }
+
+/// A union is a complex type with children in Field
+/// By default ids in the type vector refer to the offsets in the children
+/// optionally typeIds provides an indirection between the child offset and the type id
+/// for each child `typeIds[offset]` is the id used in the type vector
+table Union {
+  mode: UnionMode;
+  typeIds: [ int ]; // optional, describes typeid of each child.
+}
+
+table Int {
+  bitWidth: int; // restricted to 8, 16, 32, and 64 in v1
+  is_signed: bool;
+}
+
+enum Precision:short {HALF, SINGLE, DOUBLE}
+
+table FloatingPoint {
+  precision: Precision;
+}
+
+/// Unicode with UTF-8 encoding
+table Utf8 {
+}
+
+/// Opaque binary data
+table Binary {
+}
+
+/// Same as Utf8, but with 64-bit offsets, allowing to represent
+/// extremely large data values.
+table LargeUtf8 {
+}
+
+/// Same as Binary, but with 64-bit offsets, allowing to represent
+/// extremely large data values.
+table LargeBinary {
+}
+
+/// Logically the same as Utf8, but the internal representation uses a view
+/// struct that contains the string length and either the string's entire data
+/// inline (for small strings) or an inlined prefix, an index of another buffer,
+/// and an offset pointing to a slice in that buffer (for non-small strings).
+///
+/// Since it uses a variable number of data buffers, each Field with this type
+/// must have a corresponding entry in `variadicBufferCounts`.
+table Utf8View {
+}
+
+/// Logically the same as Binary, but the internal representation uses a view
+/// struct that contains the string length and either the string's entire data
+/// inline (for small strings) or an inlined prefix, an index of another buffer,
+/// and an offset pointing to a slice in that buffer (for non-small strings).
+///
+/// Since it uses a variable number of data buffers, each Field with this type
+/// must have a corresponding entry in `variadicBufferCounts`.
+table BinaryView {
+}
+
+
+table FixedSizeBinary {
+  /// Number of bytes per value
+  byteWidth: int;
+}
+
+table Bool {
+}
+
+/// Contains two child arrays, run_ends and values.
+/// The run_ends child array must be a 16/32/64-bit integer array
+/// which encodes the indices at which the run with the value in
+/// each corresponding index in the values child array ends.
+/// Like list/struct types, the value array can be of any type.
+table RunEndEncoded {
+}
+
+/// Exact decimal value represented as an integer value in two's
+/// complement. Currently only 128-bit (16-byte) and 256-bit (32-byte) integers
+/// are used. The representation uses the endianness indicated
+/// in the Schema.
+table Decimal {
+  /// Total number of decimal digits
+  precision: int;
+
+  /// Number of digits after the decimal point "."
+  scale: int;
+
+  /// Number of bits per value. The only accepted widths are 128 and 256.
+  /// We use bitWidth for consistency with Int::bitWidth.
+  bitWidth: int = 128;
+}
+
+enum DateUnit: short {
+  DAY,
+  MILLISECOND
+}
+
+/// Date is either a 32-bit or 64-bit signed integer type representing an
+/// elapsed time since UNIX epoch (1970-01-01), stored in either of two units:
+///
+/// * Milliseconds (64 bits) indicating UNIX time elapsed since the epoch (no
+///   leap seconds), where the values are evenly divisible by 86400000
+/// * Days (32 bits) since the UNIX epoch
+table Date {
+  unit: DateUnit = MILLISECOND;
+}
+
+enum TimeUnit: short { SECOND, MILLISECOND, MICROSECOND, NANOSECOND }
+
+/// Time is either a 32-bit or 64-bit signed integer type representing an
+/// elapsed time since midnight, stored in either of four units: seconds,
+/// milliseconds, microseconds or nanoseconds.
+///
+/// The integer `bitWidth` depends on the `unit` and must be one of the following:
+/// * SECOND and MILLISECOND: 32 bits
+/// * MICROSECOND and NANOSECOND: 64 bits
+///
+/// The allowed values are between 0 (inclusive) and 86400 (=24*60*60) seconds
+/// (exclusive), adjusted for the time unit (for example, up to 86400000
+/// exclusive for the MILLISECOND unit).
+/// This definition doesn't allow for leap seconds. Time values from
+/// measurements with leap seconds will need to be corrected when ingesting
+/// into Arrow (for example by replacing the value 86400 with 86399).
+table Time {
+  unit: TimeUnit = MILLISECOND;
+  bitWidth: int = 32;
+}
+
+/// Timestamp is a 64-bit signed integer representing an elapsed time since a
+/// fixed epoch, stored in either of four units: seconds, milliseconds,
+/// microseconds or nanoseconds, and is optionally annotated with a timezone.
+///
+/// Timestamp values do not include any leap seconds (in other words, all
+/// days are considered 86400 seconds long).
+///
+/// Timestamps with a non-empty timezone
+/// ------------------------------------
+///
+/// If a Timestamp column has a non-empty timezone value, its epoch is
+/// 1970-01-01 00:00:00 (January 1st 1970, midnight) in the *UTC* timezone
+/// (the Unix epoch), regardless of the Timestamp's own timezone.
+///
+/// Therefore, timestamp values with a non-empty timezone correspond to
+/// physical points in time together with some additional information about
+/// how the data was obtained and/or how to display it (the timezone).
+///
+///   For example, the timestamp value 0 with the timezone string "Europe/Paris"
+///   corresponds to "January 1st 1970, 00h00" in the UTC timezone, but the
+///   application may prefer to display it as "January 1st 1970, 01h00" in
+///   the Europe/Paris timezone (which is the same physical point in time).
+///
+/// One consequence is that timestamp values with a non-empty timezone
+/// can be compared and ordered directly, since they all share the same
+/// well-known point of reference (the Unix epoch).
+///
+/// Timestamps with an unset / empty timezone
+/// -----------------------------------------
+///
+/// If a Timestamp column has no timezone value, its epoch is
+/// 1970-01-01 00:00:00 (January 1st 1970, midnight) in an *unknown* timezone.
+///
+/// Therefore, timestamp values without a timezone cannot be meaningfully
+/// interpreted as physical points in time, but only as calendar / clock
+/// indications ("wall clock time") in an unspecified timezone.
+///
+///   For example, the timestamp value 0 with an empty timezone string
+///   corresponds to "January 1st 1970, 00h00" in an unknown timezone: there
+///   is not enough information to interpret it as a well-defined physical
+///   point in time.
+///
+/// One consequence is that timestamp values without a timezone cannot
+/// be reliably compared or ordered, since they may have different points of
+/// reference.  In particular, it is *not* possible to interpret an unset
+/// or empty timezone as the same as "UTC".
+///
+/// Conversion between timezones
+/// ----------------------------
+///
+/// If a Timestamp column has a non-empty timezone, changing the timezone
+/// to a different non-empty value is a metadata-only operation:
+/// the timestamp values need not change as their point of reference remains
+/// the same (the Unix epoch).
+///
+/// However, if a Timestamp column has no timezone value, changing it to a
+/// non-empty value requires to think about the desired semantics.
+/// One possibility is to assume that the original timestamp values are
+/// relative to the epoch of the timezone being set; timestamp values should
+/// then adjusted to the Unix epoch (for example, changing the timezone from
+/// empty to "Europe/Paris" would require converting the timestamp values
+/// from "Europe/Paris" to "UTC", which seems counter-intuitive but is
+/// nevertheless correct).
+///
+/// Guidelines for encoding data from external libraries
+/// ----------------------------------------------------
+///
+/// Date & time libraries often have multiple different data types for temporal
+/// data. In order to ease interoperability between different implementations the
+/// Arrow project has some recommendations for encoding these types into a Timestamp
+/// column.
+///
+/// An "instant" represents a physical point in time that has no relevant timezone
+/// (for example, astronomical data). To encode an instant, use a Timestamp with
+/// the timezone string set to "UTC", and make sure the Timestamp values
+/// are relative to the UTC epoch (January 1st 1970, midnight).
+///
+/// A "zoned date-time" represents a physical point in time annotated with an
+/// informative timezone (for example, the timezone in which the data was
+/// recorded).  To encode a zoned date-time, use a Timestamp with the timezone
+/// string set to the name of the timezone, and make sure the Timestamp values
+/// are relative to the UTC epoch (January 1st 1970, midnight).
+///
+///  (There is some ambiguity between an instant and a zoned date-time with the
+///   UTC timezone.  Both of these are stored the same in Arrow.  Typically,
+///   this distinction does not matter.  If it does, then an application should
+///   use custom metadata or an extension type to distinguish between the two cases.)
+///
+/// An "offset date-time" represents a physical point in time combined with an
+/// explicit offset from UTC.  To encode an offset date-time, use a Timestamp
+/// with the timezone string set to the numeric timezone offset string
+/// (e.g. "+03:00"), and make sure the Timestamp values are relative to
+/// the UTC epoch (January 1st 1970, midnight).
+///
+/// A "naive date-time" (also called "local date-time" in some libraries)
+/// represents a wall clock time combined with a calendar date, but with
+/// no indication of how to map this information to a physical point in time.
+/// Naive date-times must be handled with care because of this missing
+/// information, and also because daylight saving time (DST) may make
+/// some values ambiguous or nonexistent. A naive date-time may be
+/// stored as a struct with Date and Time fields. However, it may also be
+/// encoded into a Timestamp column with an empty timezone. The timestamp
+/// values should be computed "as if" the timezone of the date-time values
+/// was UTC; for example, the naive date-time "January 1st 1970, 00h00" would
+/// be encoded as timestamp value 0.
+table Timestamp {
+  unit: TimeUnit;
+
+  /// The timezone is an optional string indicating the name of a timezone,
+  /// one of:
+  ///
+  /// * As used in the Olson timezone database (the "tz database" or
+  ///   "tzdata"), such as "America/New_York".
+  /// * An absolute timezone offset of the form "+XX:XX" or "-XX:XX",
+  ///   such as "+07:30".
+  ///
+  /// Whether a timezone string is present indicates different semantics about
+  /// the data (see above).
+  timezone: string;
+}
+
+enum IntervalUnit: short { YEAR_MONTH, DAY_TIME, MONTH_DAY_NANO}
+// A "calendar" interval which models types that don't necessarily
+// have a precise duration without the context of a base timestamp (e.g.
+// days can differ in length during day light savings time transitions).
+// All integers in the types below are stored in the endianness indicated
+// by the schema.
+//
+// YEAR_MONTH - Indicates the number of elapsed whole months, stored as
+//   4-byte signed integers.
+// DAY_TIME - Indicates the number of elapsed days and milliseconds (no leap seconds),
+//   stored as 2 contiguous 32-bit signed integers (8-bytes in total). Support
+//   of this IntervalUnit is not required for full arrow compatibility.
+// MONTH_DAY_NANO - A triple of the number of elapsed months, days, and nanoseconds.
+//  The values are stored contiguously in 16-byte blocks. Months and days are
+//  encoded as 32-bit signed integers and nanoseconds is encoded as a 64-bit
+//  signed integer. Nanoseconds does not allow for leap seconds. Each field is
+//  independent (e.g. there is no constraint that nanoseconds have the same
+//  sign as days or that the quantity of nanoseconds represents less than a
+//  day's worth of time).
+table Interval {
+  unit: IntervalUnit;
+}
+
+// An absolute length of time unrelated to any calendar artifacts.
+//
+// For the purposes of Arrow Implementations, adding this value to a Timestamp
+// ("t1") naively (i.e. simply summing the two number) is acceptable even
+// though in some cases the resulting Timestamp (t2) would not account for
+// leap-seconds during the elapsed time between "t1" and "t2".  Similarly,
+// representing the difference between two Unix timestamp is acceptable, but
+// would yield a value that is possibly a few seconds off from the true elapsed
+// time.
+//
+//  The resolution defaults to millisecond, but can be any of the other
+//  supported TimeUnit values as with Timestamp and Time types.  This type is
+//  always represented as an 8-byte integer.
+table Duration {
+  unit: TimeUnit = MILLISECOND;
+}
+
+/// ----------------------------------------------------------------------
+/// Top-level Type value, enabling extensible type-specific metadata. We can
+/// add new logical types to Type without breaking backwards compatibility
+
+union Type {
+  Null,
+  Int,
+  FloatingPoint,
+  Binary,
+  Utf8,
+  Bool,
+  Decimal,
+  Date,
+  Time,
+  Timestamp,
+  Interval,
+  List,
+  Struct_,
+  Union,
+  FixedSizeBinary,
+  FixedSizeList,
+  Map,
+  Duration,
+  LargeBinary,
+  LargeUtf8,
+  LargeList,
+  RunEndEncoded,
+  BinaryView,
+  Utf8View,
+  ListView,
+  LargeListView,
+}
+
+/// ----------------------------------------------------------------------
+/// user defined key value pairs to add custom metadata to arrow
+/// key namespacing is the responsibility of the user
+
+table KeyValue {
+  key: string;
+  value: string;
+}
+
+/// ----------------------------------------------------------------------
+/// Dictionary encoding metadata
+/// Maintained for forwards compatibility, in the future
+/// Dictionaries might be explicit maps between integers and values
+/// allowing for non-contiguous index values
+enum DictionaryKind : short { DenseArray }
+table DictionaryEncoding {
+  /// The known dictionary id in the application where this data is used. In
+  /// the file or streaming formats, the dictionary ids are found in the
+  /// DictionaryBatch messages
+  id: long;
+
+  /// The dictionary indices are constrained to be non-negative integers. If
+  /// this field is null, the indices must be signed int32. To maximize
+  /// cross-language compatibility and performance, implementations are
+  /// recommended to prefer signed integer types over unsigned integer types
+  /// and to avoid uint64 indices unless they are required by an application.
+  indexType: Int;
+
+  /// By default, dictionaries are not ordered, or the order does not have
+  /// semantic meaning. In some statistical, applications, dictionary-encoding
+  /// is used to represent ordered categorical data, and we provide a way to
+  /// preserve that metadata here
+  isOrdered: bool;
+
+  dictionaryKind: DictionaryKind;
+}
+
+/// ----------------------------------------------------------------------
+/// A field represents a named column in a record / row batch or child of a
+/// nested type.
+
+table Field {
+  /// Name is not required, in i.e. a List
+  name: string;
+
+  /// Whether or not this field can contain nulls. Should be true in general.
+  nullable: bool;
+
+  /// This is the type of the decoded value if the field is dictionary encoded.
+  type: Type;
+
+  /// Present only if the field is dictionary encoded.
+  dictionary: DictionaryEncoding;
+
+  /// children apply only to nested data types like Struct, List and Union. For
+  /// primitive types children will have length 0.
+  children: [ Field ];
+
+  /// User-defined metadata
+  custom_metadata: [ KeyValue ];
+}
+
+/// ----------------------------------------------------------------------
+/// Endianness of the platform producing the data
+
+enum Endianness:short { Little, Big }
+
+/// ----------------------------------------------------------------------
+/// A Buffer represents a single contiguous memory segment
+struct Buffer {
+  /// The relative offset into the shared memory page where the bytes for this
+  /// buffer starts
+  offset: long;
+
+  /// The absolute length (in bytes) of the memory buffer. The memory is found
+  /// from offset (inclusive) to offset + length (non-inclusive). When building
+  /// messages using the encapsulated IPC message, padding bytes may be written
+  /// after a buffer, but such padding bytes do not need to be accounted for in
+  /// the size here.
+  length: long;
+}
+
+/// ----------------------------------------------------------------------
+/// A Schema describes the columns in a row batch
+
+table Schema {
+
+  /// endianness of the buffer
+  /// it is Little Endian by default
+  /// if endianness doesn't match the underlying system then the vectors need to be converted
+  endianness: Endianness=Little;
+
+  fields: [Field];
+  // User-defined metadata
+  custom_metadata: [ KeyValue ];
+
+  /// Features used in the stream/file.
+  features : [ Feature ];
+}
+
+root_type Schema;
diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp
index 756726945cf..e35742c2527 100644
--- a/cpp/src/io/parquet/parquet.hpp
+++ b/cpp/src/io/parquet/parquet.hpp
@@ -183,6 +183,9 @@ struct SchemaElement {
   // extra cudf specific fields
   bool output_as_byte_array = false;
 
+  // cudf type determined from arrow:schema
+  thrust::optional<type_id> arrow_type;
+
   // The following fields are filled in later during schema initialization
   int max_definition_level = 0;
   int max_repetition_level = 0;
diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp
index 3af4d5cdb86..5b7c180195b 100644
--- a/cpp/src/io/parquet/reader_impl.cpp
+++ b/cpp/src/io/parquet/reader_impl.cpp
@@ -422,7 +422,8 @@ reader::impl::impl(std::size_t chunk_read_limit,
     _input_pass_read_limit{pass_read_limit}
 {
   // Open and parse the source dataset metadata
-  _metadata = std::make_unique<aggregate_reader_metadata>(_sources);
+  _metadata =
+    std::make_unique<aggregate_reader_metadata>(_sources, options.is_enabled_use_arrow_schema());
 
   // Override output timestamp resolution if requested
   if (options.get_timestamp_type().id() != type_id::EMPTY) {
@@ -642,8 +643,11 @@ parquet_column_schema walk_schema(aggregate_reader_metadata const* mt, int idx)
 
 parquet_metadata read_parquet_metadata(host_span<std::unique_ptr<datasource> const> sources)
 {
+  // do not use arrow schema when reading information from parquet metadata.
+  static constexpr auto use_arrow_schema = false;
+
   // Open and parse the source dataset metadata
-  auto metadata = aggregate_reader_metadata(sources);
+  auto metadata = aggregate_reader_metadata(sources, use_arrow_schema);
 
   return parquet_metadata{parquet_schema{walk_schema(&metadata, 0)},
                           metadata.get_num_rows(),
diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp
index 68dbf532a68..dfbc8c565ad 100644
--- a/cpp/src/io/parquet/reader_impl_helpers.cpp
+++ b/cpp/src/io/parquet/reader_impl_helpers.cpp
@@ -17,13 +17,21 @@
 #include "reader_impl_helpers.hpp"
 
 #include "io/parquet/parquet.hpp"
+#include "io/utilities/base64_utilities.hpp"
 #include "io/utilities/row_selection.hpp"
+#include "ipc/Message_generated.h"
+#include "ipc/Schema_generated.h"
+
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/zip_iterator.h>
 
 #include <numeric>
 #include <regex>
 
 namespace cudf::io::parquet::detail {
 
+namespace flatbuf = cudf::io::parquet::flatbuf;
+
 namespace {
 
 thrust::optional<LogicalType> converted_to_logical_type(SchemaElement const& schema)
@@ -66,8 +74,9 @@ type_id to_type_id(SchemaElement const& schema,
                    bool strings_to_categorical,
                    type_id timestamp_type_id)
 {
-  auto const physical = schema.type;
-  auto logical_type   = schema.logical_type;
+  auto const physical_type = schema.type;
+  auto const arrow_type    = schema.arrow_type;
+  auto logical_type        = schema.logical_type;
 
   // sanity check, but not worth failing over
   if (schema.converted_type.has_value() and not logical_type.has_value()) {
@@ -75,6 +84,16 @@ type_id to_type_id(SchemaElement const& schema,
     logical_type = converted_to_logical_type(schema);
   }
 
+  // check if have set the type through arrow schema?
+  if (arrow_type.has_value()) {
+    // is it duration type? i.e. phyical_type == INT64 and no logical/converted types
+    if (physical_type == Type::INT64 and not logical_type.has_value()) {
+      return arrow_type.value();
+    }
+    // should warn but not fail.
+    CUDF_LOG_WARN("Indeterminable arrow type encountered");
+  }
+
   if (logical_type.has_value()) {
     switch (logical_type->type) {
       case LogicalType::INTEGER: {
@@ -113,11 +132,11 @@ type_id to_type_id(SchemaElement const& schema,
 
       case LogicalType::DECIMAL: {
         int32_t const decimal_precision = logical_type->precision();
-        if (physical == INT32) {
+        if (physical_type == INT32) {
           return type_id::DECIMAL32;
-        } else if (physical == INT64) {
+        } else if (physical_type == INT64) {
           return type_id::DECIMAL64;
-        } else if (physical == FIXED_LEN_BYTE_ARRAY) {
+        } else if (physical_type == FIXED_LEN_BYTE_ARRAY) {
           if (schema.type_length <= static_cast<int32_t>(sizeof(int32_t))) {
             return type_id::DECIMAL32;
           } else if (schema.type_length <= static_cast<int32_t>(sizeof(int64_t))) {
@@ -125,7 +144,7 @@ type_id to_type_id(SchemaElement const& schema,
           } else if (schema.type_length <= static_cast<int32_t>(sizeof(__int128_t))) {
             return type_id::DECIMAL128;
           }
-        } else if (physical == BYTE_ARRAY) {
+        } else if (physical_type == BYTE_ARRAY) {
           CUDF_EXPECTS(decimal_precision <= MAX_DECIMAL128_PRECISION, "Invalid decimal precision");
           if (decimal_precision <= MAX_DECIMAL32_PRECISION) {
             return type_id::DECIMAL32;
@@ -158,7 +177,7 @@ type_id to_type_id(SchemaElement const& schema,
 
   // Physical storage type supported by Parquet; controls the on-disk storage
   // format in combination with the encoding type.
-  switch (physical) {
+  switch (physical_type) {
     case BOOLEAN: return type_id::BOOL8;
     case INT32: return type_id::INT32;
     case INT64: return type_id::INT64;
@@ -516,7 +535,7 @@ void aggregate_reader_metadata::column_info_for_row_group(row_group_info& rg_inf
 }
 
 aggregate_reader_metadata::aggregate_reader_metadata(
-  host_span<std::unique_ptr<datasource> const> sources)
+  host_span<std::unique_ptr<datasource> const> sources, bool use_arrow_schema)
   : per_file_metadata(metadatas_from_sources(sources)),
     keyval_maps(collect_keyval_metadata()),
     num_rows(calc_num_rows()),
@@ -537,6 +556,307 @@ aggregate_reader_metadata::aggregate_reader_metadata(
       CUDF_EXPECTS(schema == pfm.schema, "All sources must have the same schema");
     }
   }
+
+  // Collect and apply arrow:schema from Parquet's key value metadata section
+  if (use_arrow_schema) { apply_arrow_schema(); }
+
+  // Erase "ARROW:schema" from the output pfm if exists
+  std::for_each(
+    keyval_maps.begin(), keyval_maps.end(), [](auto& pfm) { pfm.erase("ARROW:schema"); });
+}
+
+arrow_schema_data_types aggregate_reader_metadata::collect_arrow_schema() const
+{
+  // Check the key_value metadata for ARROW:schema, decode and walk it
+  // Function to convert from flatbuf::duration type to cudf::type_id
+  auto const duration_from_flatbuffer = [](flatbuf::Duration const* duration) {
+    // TODO: we only need this for arrow::DurationType for now. Else, we can take in a
+    // void ptr and typecast it to the corresponding type based on the type_id.
+    auto fb_unit = duration->unit();
+    switch (fb_unit) {
+      case flatbuf::TimeUnit::TimeUnit_SECOND:
+        return cudf::data_type{cudf::type_id::DURATION_SECONDS};
+      case flatbuf::TimeUnit::TimeUnit_MILLISECOND:
+        return cudf::data_type{cudf::type_id::DURATION_MILLISECONDS};
+      case flatbuf::TimeUnit::TimeUnit_MICROSECOND:
+        return cudf::data_type{cudf::type_id::DURATION_MICROSECONDS};
+      case flatbuf::TimeUnit::TimeUnit_NANOSECOND:
+        return cudf::data_type{cudf::type_id::DURATION_NANOSECONDS};
+      default: return cudf::data_type{};
+    }
+  };
+
+  // variable that tracks if an arrow_type specific column is seen
+  // in the walk
+  bool arrow_type_col_seen = false;
+
+  // Lambda function to walk a field and its children in DFS manner and
+  // return boolean walk success status
+  std::function<bool(flatbuf::Field const* const, arrow_schema_data_types&)> walk_field =
+    [&walk_field, &duration_from_flatbuffer, &arrow_type_col_seen](
+      flatbuf::Field const* const field, arrow_schema_data_types& schema_elem) {
+      // DFS: recursively walk over the children first
+      auto const field_children = field->children();
+
+      if (field_children != nullptr) {
+        auto schema_children = std::vector<arrow_schema_data_types>(field->children()->size());
+
+        if (not std::all_of(
+              thrust::make_counting_iterator(0),
+              thrust::make_counting_iterator(static_cast<int32_t>(field_children->size())),
+              [&](auto const& idx) {
+                return walk_field((*field_children)[idx], schema_children[idx]);
+              })) {
+          return false;
+        }
+        // arrow and parquet schemas are structured slightly differently for list type fields. list
+        // type fields in arrow are structured as: "field:list<element>" vs structured as:
+        // "field:list.element" in Parquet. To handle this, whenever we encounter a list type field,
+        // we add a dummy node "field.list" to the end of current children and move the current
+        // children (".element") to it.
+        switch (field->type_type()) {
+          case flatbuf::Type::Type_List:
+          case flatbuf::Type::Type_LargeList:
+          case flatbuf::Type::Type_FixedSizeList:
+            schema_elem.children.emplace_back(arrow_schema_data_types{std::move(schema_children)});
+            break;
+          default: schema_elem.children = std::move(schema_children); break;
+        }
+      }
+
+      // Walk the field itself
+      if (field->type_type() == flatbuf::Type::Type_Duration) {
+        auto type_data = field->type_as_Duration();
+        if (type_data != nullptr) {
+          auto name = (field->name()) ? field->name()->str() : "";
+          // set the schema_elem type to duration type
+          schema_elem.type = duration_from_flatbuffer(type_data);
+          arrow_type_col_seen |= (schema_elem.type.id() != type_id::EMPTY);
+        } else {
+          CUDF_LOG_ERROR("Parquet reader encountered an invalid type_data pointer.",
+                         "arrow:schema not processed.");
+          return false;
+        }
+      }
+      return true;
+    };
+
+  // TODO: Should we check if any file has the "ARROW:schema" key
+  // Or if all files have the same "ARROW:schema"?
+  auto const it = keyval_maps[0].find("ARROW:schema");
+  if (it == keyval_maps[0].end()) { return {}; }
+
+  // Decode the base64 encoded ipc message string
+  // Note: Store the output from base64_decode in the lvalue here and then pass
+  // it to decode_ipc_message. Directly passing rvalue from base64_decode to
+  // decode_ipc_message can lead to unintended nullptr dereferences.
+  auto const decoded_message = cudf::io::detail::base64_decode(it->second);
+
+  // Decode the ipc message to get an optional string_view of the ipc:Message flatbuffer
+  auto const metadata_buf = decode_ipc_message(decoded_message);
+
+  // Check if the string_view exists
+  if (not metadata_buf.has_value()) {
+    // No need to re-log error here as already logged inside decode_ipc_message
+    return {};
+  }
+
+  // Check if the decoded Message flatbuffer is valid
+  if (flatbuf::GetMessage(metadata_buf.value().data()) == nullptr) {
+    CUDF_LOG_ERROR("Parquet reader encountered an invalid ipc:Message flatbuffer pointer.",
+                   "arrow:schema not processed.");
+    return {};
+  }
+
+  // Check if the Message flatbuffer has a valid arrow:schema in its header
+  if (flatbuf::GetMessage(metadata_buf.value().data())->header_as_Schema() == nullptr) {
+    CUDF_LOG_ERROR("Parquet reader encountered an invalid arrow:schema flatbuffer pointer.",
+                   "arrow:schema not processed.");
+    return {};
+  }
+
+  // Get the vector of fields from arrow:schema flatbuffer object
+  auto const fields =
+    flatbuf::GetMessage(metadata_buf.value().data())->header_as_Schema()->fields();
+  if (fields == nullptr) {
+    CUDF_LOG_ERROR("Parquet reader encountered an invalid fields pointer.",
+                   "arrow:schema not processed.");
+    return {};
+  }
+
+  // arrow schema structure to return
+  arrow_schema_data_types schema;
+
+  // Recursively walk the arrow schema and set cudf::data_type for all duration columns
+  if (fields->size() > 0) {
+    schema.children = std::vector<arrow_schema_data_types>(fields->size());
+
+    if (not std::all_of(
+          thrust::make_counting_iterator(0),
+          thrust::make_counting_iterator(static_cast<int32_t>(fields->size())),
+          [&](auto const& idx) { return walk_field((*fields)[idx], schema.children[idx]); })) {
+      return {};
+    }
+
+    // if no arrow type column seen, return nullopt.
+    if (not arrow_type_col_seen) { return {}; }
+  }
+
+  return schema;
+}
+
+void aggregate_reader_metadata::apply_arrow_schema()
+{
+  // Collect the arrow schema from the key value section of Parquet metadata
+  auto arrow_schema_root = collect_arrow_schema();
+
+  // Check if empty arrow schema collected
+  if (arrow_schema_root.type.id() == type_id::EMPTY and arrow_schema_root.children.size() == 0) {
+    return;
+  }
+
+  // Function to verify equal num_children at each level in Parquet and arrow schemas.
+  std::function<bool(arrow_schema_data_types const&, int const)> validate_schemas =
+    [&](arrow_schema_data_types const& arrow_schema, int const schema_idx) {
+      auto& pq_schema_elem = per_file_metadata[0].schema[schema_idx];
+
+      // ensure equal number of children first to avoid any segfaults in children
+      if (pq_schema_elem.num_children == static_cast<int32_t>(arrow_schema.children.size())) {
+        // true if and only if true for all children as well
+        return std::all_of(thrust::make_zip_iterator(thrust::make_tuple(
+                             arrow_schema.children.begin(), pq_schema_elem.children_idx.begin())),
+                           thrust::make_zip_iterator(thrust::make_tuple(
+                             arrow_schema.children.end(), pq_schema_elem.children_idx.end())),
+                           [&](auto const& elem) {
+                             return validate_schemas(thrust::get<0>(elem), thrust::get<1>(elem));
+                           });
+      } else {
+        return false;
+      }
+    };
+
+  // Function to co-walk arrow and parquet schemas
+  std::function<void(arrow_schema_data_types const&, int const)> co_walk_schemas =
+    [&](arrow_schema_data_types const& arrow_schema, int const schema_idx) {
+      auto& pq_schema_elem = per_file_metadata[0].schema[schema_idx];
+      std::for_each(
+        thrust::make_zip_iterator(
+          thrust::make_tuple(arrow_schema.children.begin(), pq_schema_elem.children_idx.begin())),
+        thrust::make_zip_iterator(
+          thrust::make_tuple(arrow_schema.children.end(), pq_schema_elem.children_idx.end())),
+        [&](auto const& elem) { co_walk_schemas(thrust::get<0>(elem), thrust::get<1>(elem)); });
+
+      // true for DurationType columns only for now.
+      if (arrow_schema.type.id() != type_id::EMPTY) {
+        pq_schema_elem.arrow_type = arrow_schema.type.id();
+      }
+    };
+
+  // Get Parquet schema root
+  auto pq_schema_root = get_schema(0);
+
+  // verify equal number of children for both schemas at root level
+  if (pq_schema_root.num_children != static_cast<int32_t>(arrow_schema_root.children.size())) {
+    CUDF_LOG_ERROR("Parquet reader encountered a mismatch between Parquet and arrow schema.",
+                   "arrow:schema not processed.");
+    return;
+  }
+
+  // zip iterator to validate and co-walk the two schemas
+  auto schemas = thrust::make_zip_iterator(
+    thrust::make_tuple(arrow_schema_root.children.begin(), pq_schema_root.children_idx.begin()));
+
+  // Verify equal number of children at all sub-levels
+  if (not std::all_of(schemas, schemas + pq_schema_root.num_children, [&](auto const& elem) {
+        return validate_schemas(thrust::get<0>(elem), thrust::get<1>(elem));
+      })) {
+    CUDF_LOG_ERROR("Parquet reader encountered a mismatch between Parquet and arrow schema.",
+                   "arrow:schema not processed.");
+    return;
+  }
+
+  // All good, now co-walk schemas
+  std::for_each(schemas, schemas + pq_schema_root.num_children, [&](auto const& elem) {
+    co_walk_schemas(thrust::get<0>(elem), thrust::get<1>(elem));
+  });
+}
+
+std::optional<std::string_view> aggregate_reader_metadata::decode_ipc_message(
+  std::string_view const serialized_message) const
+{
+  // Constants copied from arrow source and renamed to match the case
+  constexpr int32_t MESSAGE_DECODER_NEXT_REQUIRED_SIZE_INITIAL         = sizeof(int32_t);
+  constexpr int32_t MESSAGE_DECODER_NEXT_REQUIRED_SIZE_METADATA_LENGTH = sizeof(int32_t);
+  constexpr int32_t IPC_CONTINUATION_TOKEN                             = -1;
+
+  // message buffer
+  auto message_buf = serialized_message.data();
+  // current message (buffer) size
+  auto message_size = static_cast<int32_t>(serialized_message.size());
+
+  // Lambda function to read and return 4 bytes as int32_t from the ipc message buffer and update
+  // buffer pointer and size
+  auto read_int32_from_ipc_message = [&]() {
+    int32_t bytes;
+    std::memcpy(&bytes, message_buf, sizeof(int32_t));
+    // Offset the message buf and reduce remaining size
+    message_buf += sizeof(int32_t);
+    message_size -= sizeof(int32_t);
+    return bytes;
+  };
+
+  // Check for empty message
+  if (message_size == 0) {
+    CUDF_LOG_ERROR("Parquet reader encountered zero length arrow:schema.",
+                   "arrow:schema not processed.");
+    return std::nullopt;
+  }
+
+  // Check for improper message size.
+  if (message_size < MESSAGE_DECODER_NEXT_REQUIRED_SIZE_INITIAL) {
+    CUDF_LOG_ERROR("Parquet reader encountered unexpected arrow:schema message length.",
+                   "arrow:schema not processed.");
+    return std::nullopt;
+  }
+
+  // Get the first 4 bytes (continuation) of the ipc message
+  // and check if it matches the expected token
+  if (read_int32_from_ipc_message() != IPC_CONTINUATION_TOKEN) {
+    CUDF_LOG_ERROR("Parquet reader encountered unexpected IPC continuation token.",
+                   "arrow:schema not processed.");
+    return std::nullopt;
+  }
+
+  // Check for improper message size after the continuation bytes.
+  if (message_size < MESSAGE_DECODER_NEXT_REQUIRED_SIZE_METADATA_LENGTH) {
+    CUDF_LOG_ERROR("Parquet reader encountered unexpected arrow:schema message length.",
+                   "arrow:schema not processed.");
+    return std::nullopt;
+  }
+
+  // Get the next 4 bytes (metadata_len) of the ipc message
+  // and check if invalid metadata length read
+  auto const metadata_len = read_int32_from_ipc_message();
+
+  // Check if the read metadata (header) length is > zero
+  if (metadata_len <= 0) {
+    CUDF_LOG_ERROR("Parquet reader encountered unexpected metadata length.",
+                   "arrow:schema not processed.");
+    return std::nullopt;
+  }
+
+  // Check if the remaining message size is smaller than the expected metadata length
+  // TODO: Since the arrow:schema message doesn't have a body,
+  // the following check may be made tighter from < to ==
+  if (message_size < metadata_len) {
+    CUDF_LOG_ERROR("Parquet reader encountered unexpected arrow:schema message length.",
+                   "arrow:schema not processed.");
+    return std::nullopt;
+  }
+
+  // All good, return the current message_buf as string_view
+  return std::string_view{message_buf,
+                          static_cast<std::basic_string_view<char>::size_type>(message_size)};
 }
 
 RowGroup const& aggregate_reader_metadata::get_row_group(size_type row_group_index,
diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp
index 09f65f9c388..398812945e2 100644
--- a/cpp/src/io/parquet/reader_impl_helpers.hpp
+++ b/cpp/src/io/parquet/reader_impl_helpers.hpp
@@ -121,9 +121,15 @@ struct metadata : public FileMetaData {
   void sanitize_schema();
 };
 
+struct arrow_schema_data_types {
+  std::vector<arrow_schema_data_types> children;
+  data_type type{type_id::EMPTY};
+};
+
 class aggregate_reader_metadata {
   std::vector<metadata> per_file_metadata;
   std::vector<std::unordered_map<std::string, std::string>> keyval_maps;
+
   int64_t num_rows;
   size_type num_row_groups;
 
@@ -139,6 +145,25 @@ class aggregate_reader_metadata {
   [[nodiscard]] std::vector<std::unordered_map<std::string, std::string>> collect_keyval_metadata()
     const;
 
+  /**
+   * @brief Decodes and constructs the arrow schema from the "ARROW:schema" IPC message
+   * in key value metadata section of Parquet file footer
+   */
+  [[nodiscard]] arrow_schema_data_types collect_arrow_schema() const;
+
+  /**
+   * @brief Co-walks the collected arrow and Parquet schema, updates
+   * dtypes and destroys the no longer needed arrow schema object(s).
+   */
+  void apply_arrow_schema();
+
+  /**
+   * @brief Decode an arrow:IPC message and returns an optional string_view of
+   * its metadata header
+   */
+  [[nodiscard]] std::optional<std::string_view> decode_ipc_message(
+    std::string_view const serialized_message) const;
+
   /**
    * @brief Sums up the number of rows of each source
    */
@@ -158,7 +183,8 @@ class aggregate_reader_metadata {
   void column_info_for_row_group(row_group_info& rg_info, size_type chunk_start_row) const;
 
  public:
-  aggregate_reader_metadata(host_span<std::unique_ptr<datasource> const> sources);
+  aggregate_reader_metadata(host_span<std::unique_ptr<datasource> const> sources,
+                            bool use_arrow_schema);
 
   [[nodiscard]] RowGroup const& get_row_group(size_type row_group_index, size_type src_idx) const;
 
@@ -183,7 +209,6 @@ class aggregate_reader_metadata {
   }
 
   [[nodiscard]] auto const& get_key_value_metadata() const& { return keyval_maps; }
-
   [[nodiscard]] auto&& get_key_value_metadata() && { return std::move(keyval_maps); }
 
   /**
diff --git a/cpp/src/io/utilities/base64_utilities.cpp b/cpp/src/io/utilities/base64_utilities.cpp
new file mode 100644
index 00000000000..856c29599a7
--- /dev/null
+++ b/cpp/src/io/utilities/base64_utilities.cpp
@@ -0,0 +1,234 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Portions of this file are derived from Rene Nyffenegger's codebase at
+ * https://github.com/ReneNyffenegger/cpp-base64, original license text below.
+ */
+
+/*
+ *  base64.cpp and base64.h
+ *
+ *  base64 encoding and decoding with C++.
+ *  More information at
+ *    https://renenyffenegger.ch/notes/development/Base64/Encoding-and-decoding-base-64-with-cpp
+ *
+ *  Version: 2.rc.09 (release candidate)
+ *
+ *  Copyright (C) 2004-2017, 2020-2022 René Nyffenegger
+ *
+ *  This source code is provided 'as-is', without any express or implied
+ *  warranty. In no event will the author be held liable for any damages
+ *  arising from the use of this software.
+ *
+ *  Permission is granted to anyone to use this software for any purpose,
+ *  including commercial applications, and to alter it and redistribute it
+ *  freely, subject to the following restrictions:
+ *
+ *  1. The origin of this source code must not be misrepresented; you must not
+ *     claim that you wrote the original source code. If you use this source code
+ *     in a product, an acknowledgment in the product documentation would be
+ *     appreciated but is not required.
+ *
+ *  2. Altered source versions must be plainly marked as such, and must not be
+ *     misrepresented as being the original source code.
+ *
+ *  3. This notice may not be removed or altered from any source distribution.
+ *
+ *  René Nyffenegger rene.nyffenegger@adp-gmbh.ch
+ */
+
+/**
+ * @file base64_utils.cpp
+ * @brief base64 string encoding/decoding implementation
+ */
+
+// altered: applying clang-format for libcudf on this file.
+
+#include "base64_utilities.hpp"
+
+#include <cudf/detail/utilities/logger.hpp>
+
+#include <thrust/iterator/counting_iterator.h>
+
+#include <algorithm>
+
+// altered: use cudf namespaces
+namespace cudf::io::detail {
+
+static const std::string base64_chars =
+  "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+  "abcdefghijklmnopqrstuvwxyz"
+  "0123456789+/";
+
+static constexpr unsigned char trailing_char = '=';
+
+// Function to encode input string to base64 and return the encoded string
+std::string base64_encode(std::string_view string_to_encode)
+{
+  auto input_length = static_cast<int32_t>(string_to_encode.size());
+
+  // altered: compute number of encoding iterations = floor(multiple of 3)
+  int32_t num_iterations = (input_length / 3);
+  num_iterations += (input_length % 3) ? 1 : 0;
+
+  std::string encoded;
+  size_t encoded_length = (input_length + 2) / 3 * 4;
+  encoded.reserve(encoded_length);
+
+  // altered: modify base64 encoder loop using STL and Thrust.
+  // TODO: Port this loop to thrust cooperative groups if needed for too-wide tables.
+  std::for_each(thrust::make_counting_iterator(0),
+                thrust::make_counting_iterator(num_iterations),
+                [&](auto&& iter) {
+                  auto idx = iter * 3;
+
+                  encoded.push_back(base64_chars[(string_to_encode[idx] & 0xfc) >> 2]);
+                  // increment the index by 1
+                  idx += 1;
+
+                  if (idx < input_length) {
+                    encoded.push_back(base64_chars[((string_to_encode[idx - 1] & 0x03) << 4) +
+                                                   ((string_to_encode[idx] & 0xf0) >> 4)]);
+                    // increment the index by 1
+                    idx += 1;
+
+                    if (idx < input_length) {
+                      encoded.push_back(base64_chars[((string_to_encode[idx - 1] & 0x0f) << 2) +
+                                                     ((string_to_encode[idx] & 0xc0) >> 6)]);
+                      encoded.push_back(base64_chars[string_to_encode[idx] & 0x3f]);
+                    } else {
+                      encoded.push_back(base64_chars[(string_to_encode[idx - 1] & 0x0f) << 2]);
+                      encoded.push_back(trailing_char);
+                    }
+                  } else {
+                    encoded.push_back(base64_chars[(string_to_encode[idx - 1] & 0x03) << 4]);
+                    encoded.push_back(trailing_char);
+                    encoded.push_back(trailing_char);
+                  }
+                });
+
+  return encoded;
+}
+
+// base64 decode function
+std::string base64_decode(std::string_view encoded_string)
+{
+  // altered: there must be at least 2 characters in the base64-encoded string
+  if (encoded_string.size() < 2) {
+    CUDF_LOG_ERROR(
+      "Parquet reader encountered invalid base64-encoded string size."
+      "arrow:schema not processed.");
+    return std::string{};
+  }
+
+  size_t input_length = encoded_string.length();
+  std::string decoded;
+
+  // altered: compute number of decoding iterations = floor (multiple of 4)
+  int32_t num_iterations = (input_length / 4);
+  num_iterations += (input_length % 4) ? 1 : 0;
+
+  //
+  // The approximate length (bytes) of the decoded string might be one or
+  // two bytes smaller, depending on the amount of trailing equal signs
+  // in the encoded string. This approximation is needed to reserve
+  // enough space in the string to be returned.
+  size_t approx_decoded_length = input_length / 4 * 3;
+  decoded.reserve(approx_decoded_length);
+
+  //
+  // Iterate over encoded input string in chunks. The size of all
+  // chunks except the last one is 4 bytes.
+  //
+  // The last chunk might be padded with equal signs or dots
+  // in order to make it 4 bytes in size as well, but this
+  // is not required as per RFC 2045.
+  //
+  // All chunks except the last one produce three output bytes.
+  //
+  // The last chunk produces at least one and up to three bytes.
+  //
+  // altered: modify base64 encoder loop to number of iterations using STL and Thrust.
+  // TODO: Port this loop to thrust cooperative groups if needed for too-wide tables.
+  if (not std::all_of(
+        thrust::make_counting_iterator(0),
+        thrust::make_counting_iterator(num_iterations),
+        [&](auto&& iter) {
+          int32_t idx                  = iter * 4;
+          size_t current_char_position = 0;
+          size_t char1_position        = 0;
+          size_t char2_position        = 0;
+
+          // Check for data that is not padded with equal
+          // signs (which is allowed by RFC 2045)
+          if (encoded_string[idx] == '=') { return true; }
+
+          current_char_position = base64_chars.find(encoded_string[idx]);
+          char1_position        = base64_chars.find(encoded_string[idx + 1]);
+          if (current_char_position == std::string::npos or char1_position == std::string::npos) {
+            return false;
+          }
+          // Emit the first output byte that is produced in each chunk:
+          decoded.push_back(static_cast<std::string::value_type>((current_char_position << 2) +
+                                                                 ((char1_position & 0x30) >> 4)));
+
+          // increment the index by 1
+          idx += 1;
+          // check for = padding
+          if (encoded_string[idx] == '=') { return true; }
+
+          // increment the index by 1
+          idx += 1;
+          // check for = padding
+          if (encoded_string[idx] == '=') { return true; }
+
+          char1_position = base64_chars.find(encoded_string[idx - 1]);
+          char2_position = base64_chars.find(encoded_string[idx]);
+          if (char1_position == std::string::npos or char2_position == std::string::npos) {
+            return false;
+          }
+          // Emit a chunk's second byte (which might not be produced in the last
+          // chunk).
+          decoded.push_back(static_cast<std::string::value_type>(((char1_position & 0x0f) << 4) +
+                                                                 ((char2_position & 0x3c) >> 2)));
+
+          // increment the index by 1
+          idx += 1;
+          // check for = padding
+          if (encoded_string[idx] == '=') { return true; }
+
+          char2_position        = base64_chars.find(encoded_string[idx - 1]);
+          current_char_position = base64_chars.find(encoded_string[idx]);
+          if (current_char_position == std::string::npos or char2_position == std::string::npos) {
+            return false;
+          }
+          // Emit a chunk's third byte (which might not be produced in the last
+          // chunk).
+          decoded.push_back(static_cast<std::string::value_type>(((char2_position & 0x03) << 6) +
+                                                                 current_char_position));
+
+          // all good, return true
+          return true;
+        })) {
+    return std::string{};
+  }
+
+  // return the decoded string
+  return decoded;
+}
+
+}  // namespace cudf::io::detail
diff --git a/cpp/src/io/utilities/base64_utilities.hpp b/cpp/src/io/utilities/base64_utilities.hpp
new file mode 100644
index 00000000000..537d9c96d6b
--- /dev/null
+++ b/cpp/src/io/utilities/base64_utilities.hpp
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * Portions of this file are derived from Rene Nyffenegger's codebase at
+ * https://github.com/ReneNyffenegger/cpp-base64, original license text below.
+ */
+
+/*
+ *  base64.cpp and base64.h
+ *
+ *  base64 encoding and decoding with C++.
+ *  More information at
+ *    https://renenyffenegger.ch/notes/development/Base64/Encoding-and-decoding-base-64-with-cpp
+ *
+ *  Version: 2.rc.09 (release candidate)
+ *
+ *  Copyright (C) 2004-2017, 2020-2022 René Nyffenegger
+ *
+ *  This source code is provided 'as-is', without any express or implied
+ *  warranty. In no event will the author be held liable for any damages
+ *  arising from the use of this software.
+ *
+ *  Permission is granted to anyone to use this software for any purpose,
+ *  including commercial applications, and to alter it and redistribute it
+ *  freely, subject to the following restrictions:
+ *
+ *  1. The origin of this source code must not be misrepresented; you must not
+ *     claim that you wrote the original source code. If you use this source code
+ *     in a product, an acknowledgment in the product documentation would be
+ *     appreciated but is not required.
+ *
+ *  2. Altered source versions must be plainly marked as such, and must not be
+ *     misrepresented as being the original source code.
+ *
+ *  3. This notice may not be removed or altered from any source distribution.
+ *
+ *  René Nyffenegger rene.nyffenegger@adp-gmbh.ch
+ */
+
+/**
+ * @file base64_utils.cpp
+ * @brief base64 string encoding/decoding utilities
+ */
+
+#pragma once
+
+// altered: applying clang-format for libcudf on this file.
+
+// altered: include required headers
+#include <string>
+
+// altered: use cudf namespaces
+namespace cudf::io::detail {
+
+/**
+ * @brief Encodes input string to base64 and returns it
+ *
+ * @param string_to_encode a view of the string to be encoded in base64
+ * @return the base64-encoded string
+ *
+ */
+std::string base64_encode(std::string_view string_to_encode);
+
+/**
+ * @brief Decodes the input base64-encoded string and returns it
+ *
+ * @param encoded_string a view of the base64-encoded string to be decoded
+ * @return the decoded string
+ *
+ */
+std::string base64_decode(std::string_view encoded_string);
+
+}  // namespace cudf::io::detail
diff --git a/cpp/tests/utilities_tests/io_utilities_tests.cpp b/cpp/tests/utilities_tests/io_utilities_tests.cpp
index 6981ad71f1e..e5a153bf781 100644
--- a/cpp/tests/utilities_tests/io_utilities_tests.cpp
+++ b/cpp/tests/utilities_tests/io_utilities_tests.cpp
@@ -25,6 +25,11 @@
 #include <rmm/mr/pinned_host_memory_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
+#include <src/io/utilities/base64_utilities.hpp>
+
+using cudf::io::detail::base64_decode;
+using cudf::io::detail::base64_encode;
+
 class IoUtilitiesTest : public cudf::test::BaseFixture {};
 
 TEST(IoUtilitiesTest, HostMemoryGetAndSet)
@@ -63,3 +68,114 @@ TEST(IoUtilitiesTest, HostMemoryGetAndSet)
   // reset memory resource back
   cudf::io::set_host_memory_resource(last_mr);
 }
+
+TEST(IoUtilitiesTest, Base64EncodeAndDecode)
+{
+  // a vector of lorem ipsum strings
+  std::vector<std::string> strings = {
+    "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut ",
+    "labore et dolore magna aliqua. Id ornare arcu odio ut sem. Ultrices neque ornare aenean ",
+    "euismod elementum nisi quis. Faucibus pulvinar elementum integer enim. Ut tortor pretium ",
+    "viverra suspendisse potenti nullam ac tortor vitae. Elementum pulvinar etiam non quam lacus. ",
+    "Fermentum odio eu feugiat pretium nibh. Commodo ullamcorper a lacus vestibulum sed arcu. "
+    "Elit ",
+    "ut aliquam purus sit amet luctus venenatis lectus magna. Aliquet enim tortor at auctor urna ",
+    "nunc id cursus metus. Vivamus at augue eget arcu dictum. Ultricies leo integer malesuada "
+    "nunc ",
+    "vel risus commodo viverra maecenas.Netus et malesuada fames ac turpis egestas. Erat ",
+    "pellentesque adipiscing commodo elit at imperdiet. Commodo nulla facilisi nullam vehicula. ",
+    "Morbi tristique senectus et netus et. Cursus vitae congue mauris rhoncus aenean vel elit ",
+    "scelerisque mauris. Eros donec ac odio tempor orci dapibus ultrices. Purus in mollis nunc "
+    "sed ",
+    "id. Justo eget magna fermentum iaculis eu. Diam maecenas ultricies mi eget. Justo laoreet "
+    "sit ",
+    "amet cursus sit amet. Nibh venenatis cras sed felis eget velit aliquet sagittis id. Dui ut ",
+    "ornare lectus sit amet est placerat in egestas. Malesuada nunc vel risus commodo viverra ",
+    "maecenas accumsan lacus. Arcu non odio euismod lacinia at. Euismod elementum nisi quis ",
+    "eleifend quam adipiscing vitae proin sagittis. Eget sit amet tellus cras adipiscing enim ",
+    "eu.Neque ornare aenean euismod elementum nisi quis eleifend quam adipiscing. Posuere ",
+    "sollicitudin aliquam ultrices sagittis orci a scelerisque purus. Lobortis elementum nibh ",
+    "tellus molestie. Et ligula ullamcorper malesuada proin libero nunc consequat interdum "
+    "varius. ",
+    "Neque volutpat ac tincidunt vitae semper quis lectus. Nunc mi ipsum faucibus vitae. Congue "
+    "eu ",
+    "consequat ac felis donec et. Faucibus in ornare quam viverra orci sagittis. Egestas "
+    "fringilla ",
+    "phasellus faucibus scelerisque eleifend. Sem fringilla ut morbi tincidunt augue. Lobortis ",
+    "elementum nibh tellus molestie nunc non. Ultrices neque ornare aenean euismod elementum. ",
+    "Cursus turpis massa tincidunt dui ut ornare lectus sit. Eu facilisis sed odio morbi quis "
+    "commodo odio. Tortor dignissim convallis aenean et tortor at risus. Sed euismod nisi porta ",
+    "lorem. In ornare quam viverra orci sagittis. Sed blandit libero volutpat sed cras. Quis ",
+    "viverra nibh cras pulvinar mattis nunc sed blandit libero. Non tellus orci ac auctor augue. ",
+    "Mattis molestie a iaculis at erat pellentesque adipiscing. Est lorem ipsum dolor sit amet ",
+    "consectetur. Commodo odio aenean sed adipiscing. Nunc lobortis mattis aliquam faucibus "
+    "purus. ",
+    "Pellentesque massa placerat duis ultricies lacus. Sed viverra tellus in hac habitasse "
+    "platea. ",
+    "Ut porttitor leo a diam sollicitudin tempor id eu. Rhoncus aenean vel elit scelerisque "
+    "mauris ",
+    "pellentesque pulvinar pellentesque. Ornare quam viverra orci sagittis. Interdum consectetur ",
+    "libero id faucibus nisl tincidunt eget. Eget est lorem ipsum dolor sit amet. Malesuada fames ",
+    "ac turpis egestas integer eget aliquet nibh. Scelerisque felis imperdiet proin fermentum "
+    "leo. ",
+    "Duis convallis convallis tellus id interdum velit. Sit amet massa vitae tortor condimentum ",
+    "lacinia quis vel. Eu turpis egestas pretium aenean pharetra. Sed enim ut sem viverra aliquet ",
+    "eget sit amet tellus. Feugiat nisl pretium fusce id velit ut tortor. In hendrerit gravida ",
+    "rutrum quisque non tellus orci ac auctor. Sit amet nulla facilisi morbi. Nunc congue nisi ",
+    "vitae suscipit tellus. Posuere morbi leo urna molestie at elementum eu. Egestas sed tempus ",
+    "urna et pharetra pharetra. Sed euismod nisi porta lorem. At elementum eu facilisis sed. Odio ",
+    "aenean sed adipiscing diam donec. Congue nisi vitae suscipit tellus mauris a diam. Fringilla ",
+    "urna porttitor rhoncus dolor purus non enim praesent. Eget gravida cum sociis natoque. ",
+    "Facilisis mauris sit amet massa vitae tortor. Vulputate odio ut enim blandit volutpat ",
+    "maecenas volutpat blandit. Ut ornare lectus sit amet est placerat in. Quis vel eros donec ac ",
+    "odio tempor orci dapibus ultrices. Venenatis lectus magna fringilla urna porttitor rhoncus ",
+    "dolor. Mattis vulputate enim nulla aliquet porttitor lacus. Lectus nulla at volutpat diam ut ",
+    "venenatis tellus in. Et ligula ullamcorper malesuada proin libero nunc consequat interdum. "
+    "Ut ",
+    "enim blandit volutpat maecenas volutpat blandit aliquam etiam erat. Pellentesque pulvinar ",
+    "pellentesque habitant morbi tristique senectus et. Auctor eu augue ut lectus arcu bibendum "
+    "at ",
+    "varius. Posuere ac ut consequat semper viverra nam. Sed euismod nisi porta lorem mollis ",
+    "aliquam ut. Porttitor eget dolor morbi non arcu risus quis varius. Adipiscing bibendum est ",
+    "ultricies integer quis auctor. Hac habitasse platea dictumst quisque sagittis purus sit amet ",
+    "volutpat. Nullam vehicula ipsum a arcu cursus vitae. Velit scelerisque in dictum non ",
+    "consectetur a erat nam at. Nulla facilisi cras fermentum odio eu. Tincidunt augue interdum ",
+    "velit euismod in pellentesque massa placerat. Suspendisse potenti nullam ac tortor vitae ",
+    "purus faucibus ornare. Amet dictum sit amet justo donec enim diam vulputate. Tellus ",
+    "pellentesque eu tincidunt tortor aliquam nulla facilisi cras. Mauris in aliquam sem "
+    "fringilla ",
+    "ut morbi tincidunt. Volutpat diam ut venenatis tellus in metus. Sed pulvinar proin gravida ",
+    "hendrerit lectus a. Feugiat nisl pretium fusce id velit ut tortor pretium viverra. Non ",
+    "consectetur a erat nam. Fermentum odio eu feugiat pretium nibh ipsum consequat nisl. Donec ",
+    "pretium vulputate sapien nec. Purus sit amet luctus venenatis lectus magna fringilla. Mauris ",
+    "cursus mattis molestie a iaculis. A iaculis at erat pellentesque adipiscing. Auctor augue ",
+    "mauris augue neque gravida in fermentum et sollicitudin. Lectus quam id leo in vitae turpis ",
+    "massa sed. Erat nam at lectus urna duis convallis convallis. Dignissim cras tincidunt ",
+    "lobortis feugiat vivamus at augue eget arcu. Eleifend mi in nulla posuere sollicitudin ",
+    "aliquam ultrices sagittis. Pellentesque nec nam aliquam sem. Feugiat in fermentum posuere ",
+    "urna nec tincidunt praesent. Morbi non arcu risus quis varius quam quisque. Morbi tristique ",
+    "senectus et netus et malesuada fames ac. Et ligula ullamcorper malesuada proin libero. ",
+    "Vivamus at augue eget arcu dictum varius duis at consectetur. Eget mauris pharetra et ",
+    "ultrices neque ornare aenean euismod. Sapien faucibus et molestie ac feugiat sed lectus ",
+    "vestibulum mattis. Blandit turpis cursus in hac habitasse platea dictumst quisque sagittis. ",
+    "Fermentum iaculis eu non diam phasellus vestibulum. Mattis aliquam faucibus purus in massa ",
+    "tempor nec feugiat nisl. Lectus sit amet est placerat. Accumsan sit amet nulla facilisi "
+    "morbi ",
+    "tempus iaculis urna. Magna eget est lorem ipsum dolor sit. Curabitur gravida arcu ac tortor ",
+    "dignissim convallis aenean."};
+
+  std::vector<std::string> base64_roundtripped_strings;
+
+  std::transform(strings.begin(),
+                 strings.end(),
+                 std::back_inserter(base64_roundtripped_strings),
+                 [&](auto& str) { return base64_decode(base64_encode(str)); });
+
+  // Create columns for expected and results
+  cudf::test::strings_column_wrapper expected(strings.begin(), strings.end());
+  cudf::test::strings_column_wrapper results(base64_roundtripped_strings.begin(),
+                                             base64_roundtripped_strings.end());
+
+  // Check equal columns
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, results);
+}
diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
index 4a23a58b523..70acb7f917b 100644
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ b/python/cudf/cudf/_lib/parquet.pyx
@@ -170,6 +170,7 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
         parquet_reader_options.builder(source)
         .row_groups(cpp_row_groups)
         .use_pandas_metadata(cpp_use_pandas_metadata)
+        .use_arrow_schema(True)
         .timestamp_type(cpp_timestamp_type)
     )
     if filters is not None:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd
index b7f3f89f71c..33a594b432f 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd
@@ -23,11 +23,13 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
         const optional[reference_wrapper[expression]]& get_filter() except +
         data_type get_timestamp_type() except +
         bool is_enabled_use_pandas_metadata() except +
+        bool is_enabled_arrow_schema() except +
 
         # setter
 
         void set_columns(vector[string] col_names) except +
         void set_row_groups(vector[vector[size_type]] row_grp) except +
+        void enable_use_arrow_schema(bool val) except +
         void enable_use_pandas_metadata(bool val) except +
         void set_timestamp_type(data_type type) except +
 
@@ -50,6 +52,9 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
         parquet_reader_options_builder& use_pandas_metadata(
             bool val
         ) except +
+        parquet_reader_options_builder& use_arrow_schema(
+            bool val
+        ) except +
         parquet_reader_options_builder& timestamp_type(
             data_type type
         ) except +
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index 3680c1e0c62..b2896d55b80 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -3243,3 +3243,91 @@ def test_parquet_reader_zstd_huff_tables(datadir):
     expected = pa.parquet.read_table(fname).to_pandas()
     actual = cudf.read_parquet(fname)
     assert_eq(actual, expected)
+
+
+def test_parquet_reader_roundtrip_with_arrow_schema():
+    # Ensure that the nested types are faithfully being roundtripped
+    # across Parquet with arrow schema which is used to faithfully
+    # round trip duration types (timedelta64) across Parquet read and write.
+    pdf = pd.DataFrame(
+        {
+            "s": pd.Series([None, None, None], dtype="timedelta64[s]"),
+            "ms": pd.Series([1234, None, 32442], dtype="timedelta64[ms]"),
+            "us": pd.Series([None, 3456, None], dtype="timedelta64[us]"),
+            "ns": pd.Series([1234, 3456, 32442], dtype="timedelta64[ns]"),
+            "duration_list": list(
+                [
+                    [
+                        datetime.timedelta(minutes=7, seconds=4),
+                        datetime.timedelta(minutes=7),
+                    ],
+                    [
+                        None,
+                        None,
+                    ],
+                    [
+                        datetime.timedelta(minutes=7, seconds=4),
+                        None,
+                    ],
+                ]
+            ),
+            "int64": pd.Series([1234, 123, 4123], dtype="int64"),
+            "list": list([[1, 2], [1, 2], [1, 2]]),
+            "datetime": pd.Series([1234, 123, 4123], dtype="datetime64[ms]"),
+            "map": pd.Series(["cat", "dog", "lion"]).map(
+                {"cat": "kitten", "dog": "puppy", "lion": "cub"}
+            ),
+        }
+    )
+
+    # Write parquet with arrow for now (to write arrow:schema)
+    buffer = BytesIO()
+    pdf.to_parquet(buffer, engine="pyarrow")
+
+    # Read parquet with arrow schema
+    got = cudf.read_parquet(buffer)
+    # Convert to cudf table for an apple to apple comparison
+    expected = cudf.from_pandas(pdf)
+
+    # Check results for reader with schema
+    assert_eq(expected, got)
+
+
+def test_parquet_reader_roundtrip_structs_with_arrow_schema():
+    # Ensure that the structs with duration types are faithfully being
+    # roundtripped across Parquet with arrow schema
+    pdf = pd.DataFrame(
+        {
+            "struct": {
+                "payload": {
+                    "Domain": {
+                        "Name": "abc",
+                        "Id": {"Name": "host", "Value": "127.0.0.8"},
+                        "Duration": datetime.timedelta(minutes=12),
+                    },
+                    "StreamId": "12345678",
+                    "Duration": datetime.timedelta(minutes=4),
+                    "Offset": None,
+                    "Resource": [
+                        {
+                            "Name": "ZoneName",
+                            "Value": "RAPIDS",
+                            "Duration": datetime.timedelta(seconds=1),
+                        }
+                    ],
+                }
+            }
+        }
+    )
+
+    # Reset the buffer and write parquet with arrow
+    buffer = BytesIO()
+    pdf.to_parquet(buffer, engine="pyarrow")
+
+    # Read parquet with arrow schema
+    got = cudf.read_parquet(buffer)
+    # Convert to cudf table for an apple to apple comparison
+    expected = cudf.from_pandas(pdf)
+
+    # Check results
+    assert_eq(expected, got)

From 08115239ad1f5155108430e0d0ac2f747f4bbd59 Mon Sep 17 00:00:00 2001
From: Jason Lowe <jlowe@nvidia.com>
Date: Wed, 15 May 2024 11:29:22 -0500
Subject: [PATCH 210/842] Avoid running sanitizer on Java test designed to
 cause an error (#15753)

Fixes NVIDIA/spark-rapids-jni#2039.  CudaTest#testCudaException causes the compute-sanitizer to fail the test because it (correctly) flags an invalid argument being passed to a CUDA runtime call.  Updated the tagging for the test to avoid running it under the compute-sanitizer.

Authors:
  - Jason Lowe (https://github.com/jlowe)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Gera Shegalov (https://github.com/gerashegalov)

URL: https://github.com/rapidsai/cudf/pull/15753
---
 java/src/test/java/ai/rapids/cudf/CudaTest.java | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/java/src/test/java/ai/rapids/cudf/CudaTest.java b/java/src/test/java/ai/rapids/cudf/CudaTest.java
index 9aaa9cee916..a741b0a5e31 100644
--- a/java/src/test/java/ai/rapids/cudf/CudaTest.java
+++ b/java/src/test/java/ai/rapids/cudf/CudaTest.java
@@ -16,6 +16,7 @@
 
 package ai.rapids.cudf;
 
+import org.junit.jupiter.api.Tag;
 import org.junit.jupiter.api.Test;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
@@ -33,13 +34,14 @@ public void testGetCudaRuntimeInfo() {
     assertEquals(Cuda.getNativeComputeMode(), Cuda.getComputeMode().nativeId);
   }
 
+  @Tag("noSanitizer")
   @Test
   public void testCudaException() {
     assertThrows(CudaException.class, () -> {
           try {
             Cuda.freePinned(-1L);
           } catch (CudaFatalException fatalEx) {
-            throw new AssertionError("Expected UnFatalError but got FatalError: " + fatalEx);
+            throw new AssertionError("Expected CudaException but got fatal error", fatalEx);
           } catch (CudaException ex) {
             assertEquals(CudaException.CudaError.cudaErrorInvalidValue, ex.getCudaError());
             throw ex;

From 92b2b1231bd16dec8cf50b7ea23fd8955431337d Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Wed, 15 May 2024 17:58:43 +0100
Subject: [PATCH 211/842] Implement null-aware NOT_EQUALS binop (#15731)

Fill out the table of null-aware comparison binops by also supporting a new NULL_NOT_EQUALS. This is the negation of NULL_EQUALS but implemented in a single pass, rather than binop(NULL_EQUALS) followed by uop(NEGATE).

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)
  - Bradley Dice (https://github.com/bdice)
  - Nghia Truong (https://github.com/ttnghia)
  - Mark Harris (https://github.com/harrism)

URL: https://github.com/rapidsai/cudf/pull/15731
---
 cpp/CMakeLists.txt                            |  1 +
 cpp/benchmarks/binaryop/compiled_binaryop.cpp |  3 +-
 cpp/include/cudf/binaryop.hpp                 |  2 ++
 cpp/src/binaryop/binaryop.cpp                 |  9 ++---
 cpp/src/binaryop/compiled/NullNotEquals.cu    | 26 ++++++++++++++
 cpp/src/binaryop/compiled/binary_ops.cu       |  4 ++-
 cpp/src/binaryop/compiled/binary_ops.cuh      |  1 +
 cpp/src/binaryop/compiled/binary_ops.hpp      |  2 +-
 cpp/src/binaryop/compiled/operation.cuh       | 17 ++++++++--
 .../binaryop/compiled/struct_binary_ops.cuh   |  4 +--
 cpp/src/binaryop/compiled/util.cpp            |  4 ++-
 cpp/tests/binaryop/binop-compiled-test.cpp    | 34 +++++++++++++++++++
 cpp/tests/binaryop/util/operation.h           |  8 +++++
 .../main/java/ai/rapids/cudf/BinaryOp.java    |  9 ++---
 .../java/ai/rapids/cudf/BinaryOperable.java   | 14 ++++++++
 python/cudf/cudf/_lib/binaryop.pyx            |  2 +-
 .../cudf/_lib/pylibcudf/libcudf/binaryop.pxd  |  1 +
 python/cudf/cudf/core/column/categorical.py   |  7 +++-
 python/cudf/cudf/core/column/datetime.py      |  8 +++--
 python/cudf/cudf/core/column/numerical.py     |  1 +
 python/cudf/cudf/core/column/string.py        |  1 +
 python/cudf/cudf/core/column/timedelta.py     | 10 ++++--
 22 files changed, 143 insertions(+), 25 deletions(-)
 create mode 100644 cpp/src/binaryop/compiled/NullNotEquals.cu

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 474269364de..7390c465ccb 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -264,6 +264,7 @@ add_library(
   src/binaryop/compiled/Mod.cu
   src/binaryop/compiled/Mul.cu
   src/binaryop/compiled/NullEquals.cu
+  src/binaryop/compiled/NullNotEquals.cu
   src/binaryop/compiled/NullLogicalAnd.cu
   src/binaryop/compiled/NullLogicalOr.cu
   src/binaryop/compiled/NullMax.cu
diff --git a/cpp/benchmarks/binaryop/compiled_binaryop.cpp b/cpp/benchmarks/binaryop/compiled_binaryop.cpp
index a1131df4472..7086a61c7c5 100644
--- a/cpp/benchmarks/binaryop/compiled_binaryop.cpp
+++ b/cpp/benchmarks/binaryop/compiled_binaryop.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -111,5 +111,6 @@ BINARYOP_BENCHMARK_DEFINE(decimal32,    decimal32,    NOT_EQUAL,            bool
 BINARYOP_BENCHMARK_DEFINE(timestamp_s,  timestamp_s,  LESS,                 bool);
 BINARYOP_BENCHMARK_DEFINE(timestamp_ms, timestamp_s,  GREATER,              bool);
 BINARYOP_BENCHMARK_DEFINE(duration_ms,  duration_ns,  NULL_EQUALS,          bool);
+BINARYOP_BENCHMARK_DEFINE(duration_ms,  duration_ns,  NULL_NOT_EQUALS,      bool);
 BINARYOP_BENCHMARK_DEFINE(decimal32,    decimal32,    NULL_MAX,             decimal32);
 BINARYOP_BENCHMARK_DEFINE(timestamp_D,  timestamp_s,  NULL_MIN,             timestamp_s);
diff --git a/cpp/include/cudf/binaryop.hpp b/cpp/include/cudf/binaryop.hpp
index 20550e92f9f..5e41a871f32 100644
--- a/cpp/include/cudf/binaryop.hpp
+++ b/cpp/include/cudf/binaryop.hpp
@@ -77,6 +77,8 @@ enum class binary_operator : int32_t {
   GREATER_EQUAL,         ///< operator >=
   NULL_EQUALS,           ///< Returns true when both operands are null; false when one is null; the
                          ///< result of equality when both are non-null
+  NULL_NOT_EQUALS,       ///< Returns false when both operands are null; true when one is null; the
+                         ///< result of inequality when both are non-null
   NULL_MAX,              ///< Returns max of operands when both are non-null; returns the non-null
                          ///< operand when one is null; or invalid when both are null
   NULL_MIN,              ///< Returns min of operands when both are non-null; returns the non-null
diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp
index e39a2bb3ae8..ac31f9045fe 100644
--- a/cpp/src/binaryop/binaryop.cpp
+++ b/cpp/src/binaryop/binaryop.cpp
@@ -77,9 +77,9 @@ std::pair<rmm::device_buffer, size_type> scalar_col_valid_mask_and(
  */
 inline bool is_null_dependent(binary_operator op)
 {
-  return op == binary_operator::NULL_EQUALS || op == binary_operator::NULL_MIN ||
-         op == binary_operator::NULL_MAX || op == binary_operator::NULL_LOGICAL_AND ||
-         op == binary_operator::NULL_LOGICAL_OR;
+  return op == binary_operator::NULL_EQUALS || op == binary_operator::NULL_NOT_EQUALS ||
+         op == binary_operator::NULL_MIN || op == binary_operator::NULL_MAX ||
+         op == binary_operator::NULL_LOGICAL_AND || op == binary_operator::NULL_LOGICAL_OR;
 }
 
 /**
@@ -109,7 +109,8 @@ bool is_comparison_binop(binary_operator op)
          op == binary_operator::GREATER or        // operator >
          op == binary_operator::LESS_EQUAL or     // operator <=
          op == binary_operator::GREATER_EQUAL or  // operator >=
-         op == binary_operator::NULL_EQUALS;      // 2 null = true; 1 null = false; else ==
+         op == binary_operator::NULL_EQUALS or    // 2 null = true; 1 null = false; else ==
+         op == binary_operator::NULL_NOT_EQUALS;  // 2 null = false; 1 null = true; else !=
 }
 
 /**
diff --git a/cpp/src/binaryop/compiled/NullNotEquals.cu b/cpp/src/binaryop/compiled/NullNotEquals.cu
new file mode 100644
index 00000000000..34f73cca48a
--- /dev/null
+++ b/cpp/src/binaryop/compiled/NullNotEquals.cu
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "binary_ops.cuh"
+
+namespace cudf::binops::compiled {
+template void apply_binary_op<ops::NullNotEquals>(mutable_column_view&,
+                                                  column_view const&,
+                                                  column_view const&,
+                                                  bool is_lhs_scalar,
+                                                  bool is_rhs_scalar,
+                                                  rmm::cuda_stream_view);
+}  // namespace cudf::binops::compiled
diff --git a/cpp/src/binaryop/compiled/binary_ops.cu b/cpp/src/binaryop/compiled/binary_ops.cu
index d3257fadb1d..ba0253ec853 100644
--- a/cpp/src/binaryop/compiled/binary_ops.cu
+++ b/cpp/src/binaryop/compiled/binary_ops.cu
@@ -356,6 +356,7 @@ case binary_operator::LOG_BASE:             apply_binary_op<ops::LogBase>(out, l
 case binary_operator::ATAN2:                apply_binary_op<ops::ATan2>(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break;
 case binary_operator::PMOD:                 apply_binary_op<ops::PMod>(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break;
 case binary_operator::NULL_EQUALS:          apply_binary_op<ops::NullEquals>(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break;
+case binary_operator::NULL_NOT_EQUALS:      apply_binary_op<ops::NullNotEquals>(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break;
 case binary_operator::NULL_MAX:             apply_binary_op<ops::NullMax>(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break;
 case binary_operator::NULL_MIN:             apply_binary_op<ops::NullMin>(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break;
 case binary_operator::NULL_LOGICAL_AND:     apply_binary_op<ops::NullLogicalAnd>(out, lhs, rhs, is_lhs_scalar, is_rhs_scalar, stream); break;
@@ -412,8 +413,9 @@ void apply_sorting_struct_binary_op(mutable_column_view& out,
   // Struct child column type and structure mismatches are caught within the two_table_comparator
   switch (op) {
     case binary_operator::EQUAL: [[fallthrough]];
+    case binary_operator::NOT_EQUAL: [[fallthrough]];
     case binary_operator::NULL_EQUALS: [[fallthrough]];
-    case binary_operator::NOT_EQUAL:
+    case binary_operator::NULL_NOT_EQUALS:
       detail::apply_struct_equality_op(
         out,
         lhs,
diff --git a/cpp/src/binaryop/compiled/binary_ops.cuh b/cpp/src/binaryop/compiled/binary_ops.cuh
index 0bc144baa83..5177e7d4bda 100644
--- a/cpp/src/binaryop/compiled/binary_ops.cuh
+++ b/cpp/src/binaryop/compiled/binary_ops.cuh
@@ -109,6 +109,7 @@ struct ops_wrapper {
         type_dispatcher(rhs.type(), type_casted_accessor<TypeCommon>{}, i, rhs, is_rhs_scalar);
       auto result = [&]() {
         if constexpr (std::is_same_v<BinaryOperator, ops::NullEquals> or
+                      std::is_same_v<BinaryOperator, ops::NullNotEquals> or
                       std::is_same_v<BinaryOperator, ops::NullLogicalAnd> or
                       std::is_same_v<BinaryOperator, ops::NullLogicalOr> or
                       std::is_same_v<BinaryOperator, ops::NullMax> or
diff --git a/cpp/src/binaryop/compiled/binary_ops.hpp b/cpp/src/binaryop/compiled/binary_ops.hpp
index c7eb08cd133..ceeba9cf817 100644
--- a/cpp/src/binaryop/compiled/binary_ops.hpp
+++ b/cpp/src/binaryop/compiled/binary_ops.hpp
@@ -194,7 +194,7 @@ void apply_binary_op(mutable_column_view& out,
  * @brief Deploys single type or double type dispatcher that runs equality operation on each element
  * of @p lhs and @p rhs columns.
  *
- * Comparison operators are EQUAL, NOT_EQUAL, NULL_EQUALS.
+ * Comparison operators are EQUAL, NOT_EQUAL, NULL_EQUALS, NULL_NOT_EQUALS.
  * @p out type is boolean.
  *
  * This template is instantiated for each binary operator.
diff --git a/cpp/src/binaryop/compiled/operation.cuh b/cpp/src/binaryop/compiled/operation.cuh
index 214803dc415..43b4bd232c4 100644
--- a/cpp/src/binaryop/compiled/operation.cuh
+++ b/cpp/src/binaryop/compiled/operation.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -422,15 +422,26 @@ struct NullEquals {
     TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, bool& output_valid) -> decltype(x == y)
   {
     output_valid = true;
-    if (!lhs_valid && !rhs_valid) return true;
     if (lhs_valid && rhs_valid) return x == y;
-    return false;
+    return !lhs_valid && !rhs_valid;
   }
   // To allow std::is_invocable_v = true
   template <typename TypeLhs, typename TypeRhs>
   __device__ inline auto operator()(TypeLhs x, TypeRhs y) -> decltype(x == y);
 };
 
+struct NullNotEquals {
+  template <typename TypeLhs, typename TypeRhs>
+  __device__ inline auto operator()(
+    TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, bool& output_valid) -> decltype(x != y)
+  {
+    return !NullEquals{}(x, y, lhs_valid, rhs_valid, output_valid);
+  }
+  // To allow std::is_invocable_v = true
+  template <typename TypeLhs, typename TypeRhs>
+  __device__ inline auto operator()(TypeLhs x, TypeRhs y) -> decltype(x != y);
+};
+
 struct NullMax {
   template <typename TypeLhs,
             typename TypeRhs,
diff --git a/cpp/src/binaryop/compiled/struct_binary_ops.cuh b/cpp/src/binaryop/compiled/struct_binary_ops.cuh
index 2299df5a9bb..a57ff661d67 100644
--- a/cpp/src/binaryop/compiled/struct_binary_ops.cuh
+++ b/cpp/src/binaryop/compiled/struct_binary_ops.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -148,7 +148,7 @@ void apply_struct_equality_op(mutable_column_view& out,
                               rmm::cuda_stream_view stream)
 {
   CUDF_EXPECTS(op == binary_operator::EQUAL || op == binary_operator::NOT_EQUAL ||
-                 op == binary_operator::NULL_EQUALS,
+                 op == binary_operator::NULL_EQUALS || op == binary_operator::NULL_NOT_EQUALS,
                "Unsupported operator for these types",
                cudf::data_type_error);
 
diff --git a/cpp/src/binaryop/compiled/util.cpp b/cpp/src/binaryop/compiled/util.cpp
index 1ef521d241a..02f4e480ecb 100644
--- a/cpp/src/binaryop/compiled/util.cpp
+++ b/cpp/src/binaryop/compiled/util.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -182,6 +182,8 @@ struct is_supported_operation_functor {
       case binary_operator::LESS_EQUAL: return bool_op<ops::LessEqual, TypeLhs, TypeRhs>(out);
       case binary_operator::GREATER_EQUAL: return bool_op<ops::GreaterEqual, TypeLhs, TypeRhs>(out);
       case binary_operator::NULL_EQUALS: return bool_op<ops::NullEquals, TypeLhs, TypeRhs>(out);
+      case binary_operator::NULL_NOT_EQUALS:
+        return bool_op<ops::NullNotEquals, TypeLhs, TypeRhs>(out);
       case binary_operator::NULL_LOGICAL_AND:
         return bool_op<ops::NullLogicalAnd, TypeLhs, TypeRhs>(out);
       case binary_operator::NULL_LOGICAL_OR:
diff --git a/cpp/tests/binaryop/binop-compiled-test.cpp b/cpp/tests/binaryop/binop-compiled-test.cpp
index 27865bd062f..06e0d193d80 100644
--- a/cpp/tests/binaryop/binop-compiled-test.cpp
+++ b/cpp/tests/binaryop/binop-compiled-test.cpp
@@ -699,6 +699,40 @@ TYPED_TEST(BinaryOperationCompiledTest_NullOps, NullEquals_Vector_Vector)
 using BinaryOperationCompiledTest_NullOpsString =
   BinaryOperationCompiledTest_NullOps<cudf::test::Types<std::string, std::string, std::string>>;
 TEST_F(BinaryOperationCompiledTest_NullOpsString, NullEquals_Vector_Vector)
+{
+  using TypeOut         = bool;
+  using TypeLhs         = std::string;
+  using TypeRhs         = std::string;
+  using NULL_NOT_EQUALS = cudf::library::operation::NullNotEquals<TypeOut, TypeLhs, TypeRhs>;
+
+  auto lhs            = lhs_random_column<TypeLhs>(col_size);
+  auto rhs            = rhs_random_column<TypeRhs>(col_size);
+  auto const expected = NullOp_Result<TypeOut, TypeLhs, TypeRhs, NULL_NOT_EQUALS>(lhs, rhs);
+
+  auto const result = cudf::binary_operation(
+    lhs, rhs, cudf::binary_operator::NULL_NOT_EQUALS, cudf::data_type(cudf::type_to_id<TypeOut>()));
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+}
+
+TYPED_TEST(BinaryOperationCompiledTest_NullOps, NullNotEquals_Vector_Vector)
+{
+  using TypeOut         = bool;
+  using TypeLhs         = typename TestFixture::TypeLhs;
+  using TypeRhs         = typename TestFixture::TypeRhs;
+  using NULL_NOT_EQUALS = cudf::library::operation::NullNotEquals<TypeOut, TypeLhs, TypeRhs>;
+
+  auto lhs            = lhs_random_column<TypeLhs>(col_size);
+  auto rhs            = rhs_random_column<TypeRhs>(col_size);
+  auto const expected = NullOp_Result<TypeOut, TypeLhs, TypeRhs, NULL_NOT_EQUALS>(lhs, rhs);
+
+  auto const result = cudf::binary_operation(
+    lhs, rhs, cudf::binary_operator::NULL_NOT_EQUALS, cudf::data_type(cudf::type_to_id<TypeOut>()));
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+}
+
+using BinaryOperationCompiledTest_NullOpsString =
+  BinaryOperationCompiledTest_NullOps<cudf::test::Types<std::string, std::string, std::string>>;
+TEST_F(BinaryOperationCompiledTest_NullOpsString, NullNotEquals_Vector_Vector)
 {
   using TypeOut     = bool;
   using TypeLhs     = std::string;
diff --git a/cpp/tests/binaryop/util/operation.h b/cpp/tests/binaryop/util/operation.h
index efebc02bc89..c900c4c558c 100644
--- a/cpp/tests/binaryop/util/operation.h
+++ b/cpp/tests/binaryop/util/operation.h
@@ -415,6 +415,14 @@ struct NullEquals {
   }
 };
 
+template <typename TypeOut, typename TypeLhs, typename TypeRhs>
+struct NullNotEquals {
+  TypeOut operator()(TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, bool& output_valid) const
+  {
+    return !NullEquals<TypeOut, TypeLhs, TypeRhs>()(x, y, lhs_valid, rhs_valid, output_valid);
+  }
+};
+
 template <typename TypeOut, typename TypeLhs, typename TypeRhs>
 struct NullMax {
   TypeOut operator()(TypeLhs x, TypeRhs y, bool lhs_valid, bool rhs_valid, bool& output_valid) const
diff --git a/java/src/main/java/ai/rapids/cudf/BinaryOp.java b/java/src/main/java/ai/rapids/cudf/BinaryOp.java
index fe559184878..c60323775ce 100644
--- a/java/src/main/java/ai/rapids/cudf/BinaryOp.java
+++ b/java/src/main/java/ai/rapids/cudf/BinaryOp.java
@@ -49,11 +49,12 @@ public enum BinaryOp {
   LESS_EQUAL(25), // <=
   GREATER_EQUAL(26), // >=
   NULL_EQUALS(27), // like EQUAL but NULL == NULL is TRUE and NULL == not NULL is FALSE
-  NULL_MAX(28), // MAX but NULL < not NULL
-  NULL_MIN(29), // MIN but NULL > not NULL
+  NULL_NOT_EQUALS(28), // negation of NULL_EQUALS
+  NULL_MAX(29), // MAX but NULL < not NULL
+  NULL_MIN(30), // MIN but NULL > not NULL
   //NOT IMPLEMENTED YET GENERIC_BINARY(30);
-  NULL_LOGICAL_AND(31),
-  NULL_LOGICAL_OR(32);
+  NULL_LOGICAL_AND(32),
+  NULL_LOGICAL_OR(33);
 
 
   static final EnumSet<BinaryOp> COMPARISON = EnumSet.of(
diff --git a/java/src/main/java/ai/rapids/cudf/BinaryOperable.java b/java/src/main/java/ai/rapids/cudf/BinaryOperable.java
index 48a7861f1a1..6e8d862213e 100644
--- a/java/src/main/java/ai/rapids/cudf/BinaryOperable.java
+++ b/java/src/main/java/ai/rapids/cudf/BinaryOperable.java
@@ -546,6 +546,20 @@ default ColumnVector equalToNullAware(BinaryOperable rhs) {
     return equalToNullAware(rhs, DType.BOOL8);
   }
 
+  /**
+   * like notEqualTo but NULL != NULL is TRUE and NULL != not NULL is FALSE
+   */
+  default ColumnVector notEqualToNullAware(BinaryOperable rhs, DType outType) {
+    return binaryOp(BinaryOp.NULL_NOT_EQUALS, rhs, outType);
+  }
+
+  /**
+   * like notEqualTo but NULL != NULL is TRUE and NULL != not NULL is FALSE
+   */
+  default ColumnVector notEqualToNullAware(BinaryOperable rhs) {
+    return notEqualToNullAware(rhs, DType.BOOL8);
+  }
+
   /**
    * Returns the max non null value.
    */
diff --git a/python/cudf/cudf/_lib/binaryop.pyx b/python/cudf/cudf/_lib/binaryop.pyx
index 969be426044..2e352dd7904 100644
--- a/python/cudf/cudf/_lib/binaryop.pyx
+++ b/python/cudf/cudf/_lib/binaryop.pyx
@@ -34,7 +34,7 @@ def binaryop(lhs, rhs, op, dtype):
     """
     # TODO: Shouldn't have to keep special-casing. We need to define a separate
     # pipeline for libcudf binops that don't map to Python binops.
-    if op not in {"INT_POW", "NULL_EQUALS"}:
+    if op not in {"INT_POW", "NULL_EQUALS", "NULL_NOT_EQUALS"}:
         op = op[2:-2]
     op = op.upper()
     op = _op_map.get(op, op)
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/binaryop.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/binaryop.pxd
index 788a94a0bbc..0eda7d34ff9 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/binaryop.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/binaryop.pxd
@@ -29,6 +29,7 @@ cdef extern from "cudf/binaryop.hpp" namespace "cudf" nogil:
         LESS_EQUAL
         GREATER_EQUAL
         NULL_EQUALS
+        NULL_NOT_EQUALS
         BITWISE_AND
         BITWISE_OR
         BITWISE_XOR
diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index 1f003534913..adda8a34cd0 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -729,7 +729,12 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
         if not isinstance(other, CategoricalColumn):
             raise ValueError
         # Note: at this stage we are guaranteed that the dtypes are equal.
-        if not self.ordered and op not in {"__eq__", "__ne__", "NULL_EQUALS"}:
+        if not self.ordered and op not in {
+            "__eq__",
+            "__ne__",
+            "NULL_EQUALS",
+            "NULL_NOT_EQUALS",
+        }:
             raise TypeError(
                 "The only binary operations supported by unordered "
                 "categorical columns are equality and inequality."
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index 9fe4e5da96d..d92a3a00641 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -570,18 +570,20 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
                 out_dtype = _resolve_mixed_dtypes(lhs, rhs, "datetime64")
         elif op in {
             "__eq__",
-            "NULL_EQUALS",
             "__ne__",
+            "NULL_EQUALS",
+            "NULL_NOT_EQUALS",
         }:
             out_dtype = cudf.dtype(np.bool_)
             if isinstance(other, ColumnBase) and not isinstance(
                 other, DatetimeColumn
             ):
+                fill_value = op in ("__ne__", "NULL_NOT_EQUALS")
                 result = _all_bools_with_nulls(
-                    self, other, bool_fill_value=op == "__ne__"
+                    self, other, bool_fill_value=fill_value
                 )
                 if cudf.get_option("mode.pandas_compatible"):
-                    result = result.fillna(op == "__ne__")
+                    result = result.fillna(fill_value)
                 return result
 
         if out_dtype is None:
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index f6c7ca7675a..12c27ed0bc1 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -248,6 +248,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
             "__eq__",
             "__ne__",
             "NULL_EQUALS",
+            "NULL_NOT_EQUALS",
         }:
             out_dtype = "bool"
 
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 3e941d60079..40e58e14612 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -5957,6 +5957,7 @@ def _binaryop(
                 "__ge__",
                 "__le__",
                 "NULL_EQUALS",
+                "NULL_NOT_EQUALS",
             }:
                 lhs, rhs = (other, self) if reflect else (self, other)
                 return libcudf.binaryop.binaryop(
diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py
index c5ed889b5dc..c6af052b56f 100644
--- a/python/cudf/cudf/core/column/timedelta.py
+++ b/python/cudf/cudf/core/column/timedelta.py
@@ -163,6 +163,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
                 "__le__",
                 "__ge__",
                 "NULL_EQUALS",
+                "NULL_NOT_EQUALS",
             }:
                 out_dtype = cudf.dtype(np.bool_)
             elif op == "__mod__":
@@ -185,15 +186,18 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
         elif other.dtype.kind in {"f", "i", "u"}:
             if op in {"__mul__", "__mod__", "__truediv__", "__floordiv__"}:
                 out_dtype = self.dtype
-            elif op in {"__eq__", "NULL_EQUALS", "__ne__"}:
+            elif op in {"__eq__", "__ne__", "NULL_EQUALS", "NULL_NOT_EQUALS"}:
                 if isinstance(other, ColumnBase) and not isinstance(
                     other, TimeDeltaColumn
                 ):
+                    fill_value = op in ("__ne__", "NULL_NOT_EQUALS")
                     result = _all_bools_with_nulls(
-                        self, other, bool_fill_value=op == "__ne__"
+                        self,
+                        other,
+                        bool_fill_value=fill_value,
                     )
                     if cudf.get_option("mode.pandas_compatible"):
-                        result = result.fillna(op == "__ne__")
+                        result = result.fillna(fill_value)
                     return result
 
         if out_dtype is None:

From b5f6aa59cd9d2ebb238f9f249b305d1883169332 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 15 May 2024 07:26:21 -1000
Subject: [PATCH 212/842] Eliminate circular reference in
 DataFrame/Series.iloc/loc (#15749)

closes #15748

The performance implication can be seen in the issue

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/15749
---
 python/cudf/cudf/core/indexed_frame.py  |  5 ++---
 python/cudf/cudf/tests/test_indexing.py | 11 +++++++++++
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 904cd0c69c2..8d67afa34bc 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -8,7 +8,6 @@
 import textwrap
 import warnings
 from collections import Counter, abc
-from functools import cached_property
 from typing import (
     Any,
     Callable,
@@ -2266,7 +2265,7 @@ def truncate(self, before=None, after=None, axis=0, copy=True):
         slicer[axis] = slice(before, after)
         return self.loc[tuple(slicer)].copy()
 
-    @cached_property
+    @property
     def loc(self):
         """Select rows and columns by label or boolean mask.
 
@@ -2332,7 +2331,7 @@ def loc(self):
         """
         return self._loc_indexer_type(self)
 
-    @cached_property
+    @property
     def iloc(self):
         """Select values by position.
 
diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py
index f49b9b02076..b1d871b6abc 100644
--- a/python/cudf/cudf/tests/test_indexing.py
+++ b/python/cudf/cudf/tests/test_indexing.py
@@ -1,5 +1,6 @@
 # Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
+import weakref
 from datetime import datetime
 from itertools import combinations
 
@@ -2257,6 +2258,16 @@ def test_scalar_loc_row_categoricalindex():
     assert_eq(result, expected)
 
 
+@pytest.mark.parametrize("klass", [cudf.DataFrame, cudf.Series])
+@pytest.mark.parametrize("indexer", ["iloc", "loc"])
+def test_iloc_loc_no_circular_reference(klass, indexer):
+    obj = klass([0])
+    ref = weakref.ref(obj)
+    getattr(obj, indexer)[0]
+    del obj
+    assert ref() is None
+
+
 def test_loc_setitem_empty_dataframe():
     pdf = pd.DataFrame(index=["index_1", "index_2", "index_3"])
     gdf = cudf.from_pandas(pdf)

From 516d0f9033e73d10a473e2ca3fcc891e980450bc Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Wed, 15 May 2024 17:49:48 -0500
Subject: [PATCH 213/842] remove unnecessary 'setuptools' host dependency,
 simplify dependencies.yaml (#15736)

Pulls out some changes I noticed while working on #15245.

* removes `host` dependency on `setuptools` for `cudf` and `cudf_kafka`
  - *they don't need it now that they build with `scikit-build-core`*
* consolidates some redundant blocks in `dependencies.yaml`

Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15736
---
 conda/recipes/cudf/meta.yaml       | 1 -
 conda/recipes/cudf_kafka/meta.yaml | 1 -
 dependencies.yaml                  | 8 +-------
 3 files changed, 1 insertion(+), 9 deletions(-)

diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml
index 24210830ada..12e29c77a98 100644
--- a/conda/recipes/cudf/meta.yaml
+++ b/conda/recipes/cudf/meta.yaml
@@ -62,7 +62,6 @@ requirements:
     - python
     - cython >=3.0.3
     - scikit-build-core >=0.7.0
-    - setuptools
     - dlpack >=0.8,<1.0
     - numpy 1.23
     - pyarrow ==16.0.0.*
diff --git a/conda/recipes/cudf_kafka/meta.yaml b/conda/recipes/cudf_kafka/meta.yaml
index ab41d9e1f15..4d91cf6320c 100644
--- a/conda/recipes/cudf_kafka/meta.yaml
+++ b/conda/recipes/cudf_kafka/meta.yaml
@@ -61,7 +61,6 @@ requirements:
     - cudf ={{ version }}
     - libcudf_kafka ={{ version }}
     - scikit-build-core >=0.7.0
-    - setuptools
     {% if cuda_major != "11" %}
     - cuda-cudart-dev
     {% endif %}
diff --git a/dependencies.yaml b/dependencies.yaml
index 898760d1351..4f8f3c16ea1 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -325,9 +325,6 @@ dependencies:
           - matrix: {cuda: "11.*"}
             packages: &build_python_packages_cu11
               - &rmm_cu11 rmm-cu11==24.6.*
-          - {matrix: null, packages: null }
-      - output_types: pyproject
-        matrices:
           - {matrix: null, packages: [*rmm_conda] }
   libarrow_build:
     common:
@@ -578,10 +575,7 @@ dependencies:
               - rmm-cu11==24.6.*
               - cubinlinker-cu11
               - ptxcompiler-cu11
-          - {matrix: null, packages: null}
-      - output_types: pyproject
-        matrices:
-          - {matrix: null, packages: [cubinlinker, ptxcompiler, *rmm_conda] }
+          - {matrix: null, packages: [cubinlinker, ptxcompiler, *rmm_conda]}
   run_cudf_polars:
     common:
       - output_types: [conda, requirements, pyproject]

From ec07927b70c0a98d8c1d070e79a2cb9bf281bf12 Mon Sep 17 00:00:00 2001
From: Srinivas Yadav <43375352+srinivasyadav18@users.noreply.github.com>
Date: Wed, 15 May 2024 22:03:03 -0500
Subject: [PATCH 214/842] Expose stream parameter in public reduction APIs
 (#15737)

Add stream parameter to public reduction APIs:

- `reduce()`
- `segmented_reduce()`
- `scan()`
- `minmax()`

Reference #13744

Authors:
  - Srinivas Yadav (https://github.com/srinivasyadav18)
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - Yunsong Wang (https://github.com/PointKernel)

URL: https://github.com/rapidsai/cudf/pull/15737
---
 cpp/include/cudf/reduction.hpp              |  12 +++
 cpp/src/reductions/minmax.cu                |   4 +-
 cpp/src/reductions/reductions.cpp           |   7 +-
 cpp/src/reductions/scan/scan.cpp            |   3 +-
 cpp/src/reductions/segmented/reductions.cpp |  22 ++---
 cpp/tests/CMakeLists.txt                    |   1 +
 cpp/tests/streams/reduction_test.cpp        | 102 ++++++++++++++++++++
 7 files changed, 129 insertions(+), 22 deletions(-)
 create mode 100644 cpp/tests/streams/reduction_test.cpp

diff --git a/cpp/include/cudf/reduction.hpp b/cpp/include/cudf/reduction.hpp
index 5adf89d1706..52f39925a2d 100644
--- a/cpp/include/cudf/reduction.hpp
+++ b/cpp/include/cudf/reduction.hpp
@@ -75,6 +75,7 @@ enum class scan_type : bool { INCLUSIVE, EXCLUSIVE };
  * @param col Input column view
  * @param agg Aggregation operator applied by the reduction
  * @param output_dtype The output scalar type
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned scalar's device memory
  * @returns Output scalar with reduce result
  */
@@ -82,6 +83,7 @@ std::unique_ptr<scalar> reduce(
   column_view const& col,
   reduce_aggregation const& agg,
   data_type output_dtype,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -96,6 +98,7 @@ std::unique_ptr<scalar> reduce(
  * @param agg Aggregation operator applied by the reduction
  * @param output_dtype The output scalar type
  * @param init The initial value of the reduction
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned scalar's device memory
  * @returns Output scalar with reduce result
  */
@@ -104,6 +107,7 @@ std::unique_ptr<scalar> reduce(
   reduce_aggregation const& agg,
   data_type output_dtype,
   std::optional<std::reference_wrapper<scalar const>> init,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -145,6 +149,7 @@ std::unique_ptr<scalar> reduce(
  * @param null_handling If `INCLUDE`, the reduction is valid if all elements in a segment are valid,
  * otherwise null. If `EXCLUDE`, the reduction is valid if any element in the segment is valid,
  * otherwise null.
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned scalar's device memory
  * @returns Output column with results of segmented reduction
  */
@@ -154,6 +159,7 @@ std::unique_ptr<column> segmented_reduce(
   segmented_reduce_aggregation const& agg,
   data_type output_dtype,
   null_policy null_handling,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -169,6 +175,7 @@ std::unique_ptr<column> segmented_reduce(
  * otherwise null. If `EXCLUDE`, the reduction is valid if any element in the segment is valid,
  * otherwise null.
  * @param init The initial value of the reduction
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned scalar's device memory
  * @returns Output column with results of segmented reduction.
  */
@@ -179,6 +186,7 @@ std::unique_ptr<column> segmented_reduce(
   data_type output_dtype,
   null_policy null_handling,
   std::optional<std::reference_wrapper<scalar const>> init,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -195,6 +203,7 @@ std::unique_ptr<column> segmented_reduce(
  * exclusive scan if scan_type::EXCLUSIVE.
  * @param[in] null_handling Exclude null values when computing the result if null_policy::EXCLUDE.
  * Include nulls if null_policy::INCLUDE. Any operation with a null results in a null.
+ * @param[in] stream CUDA stream used for device memory operations and kernel launches
  * @param[in] mr Device memory resource used to allocate the returned scalar's device memory
  * @returns Scanned output column
  */
@@ -203,6 +212,7 @@ std::unique_ptr<column> scan(
   scan_aggregation const& agg,
   scan_type inclusive,
   null_policy null_handling         = null_policy::EXCLUDE,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -210,12 +220,14 @@ std::unique_ptr<column> scan(
  *
  *
  * @param col column to compute minmax
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned column's device memory
  * @return A std::pair of scalars with the first scalar being the minimum value and the second
  * scalar being the maximum value of the input column.
  */
 std::pair<std::unique_ptr<scalar>, std::unique_ptr<scalar>> minmax(
   column_view const& col,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
diff --git a/cpp/src/reductions/minmax.cu b/cpp/src/reductions/minmax.cu
index 62a1f4aab7c..2c1181972c5 100644
--- a/cpp/src/reductions/minmax.cu
+++ b/cpp/src/reductions/minmax.cu
@@ -275,10 +275,10 @@ std::pair<std::unique_ptr<scalar>, std::unique_ptr<scalar>> minmax(
 }  // namespace detail
 
 std::pair<std::unique_ptr<scalar>, std::unique_ptr<scalar>> minmax(
-  column_view const& col, rmm::device_async_resource_ref mr)
+  column_view const& col, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::minmax(col, cudf::get_default_stream(), mr);
+  return detail::minmax(col, stream, mr);
 }
 
 }  // namespace cudf
diff --git a/cpp/src/reductions/reductions.cpp b/cpp/src/reductions/reductions.cpp
index cde0274339a..8fa036a0949 100644
--- a/cpp/src/reductions/reductions.cpp
+++ b/cpp/src/reductions/reductions.cpp
@@ -208,20 +208,21 @@ std::unique_ptr<scalar> reduce(column_view const& col,
 std::unique_ptr<scalar> reduce(column_view const& col,
                                reduce_aggregation const& agg,
                                data_type output_dtype,
+                               rmm::cuda_stream_view stream,
                                rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return reduction::detail::reduce(
-    col, agg, output_dtype, std::nullopt, cudf::get_default_stream(), mr);
+  return reduction::detail::reduce(col, agg, output_dtype, std::nullopt, stream, mr);
 }
 
 std::unique_ptr<scalar> reduce(column_view const& col,
                                reduce_aggregation const& agg,
                                data_type output_dtype,
                                std::optional<std::reference_wrapper<scalar const>> init,
+                               rmm::cuda_stream_view stream,
                                rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return reduction::detail::reduce(col, agg, output_dtype, init, cudf::get_default_stream(), mr);
+  return reduction::detail::reduce(col, agg, output_dtype, init, stream, mr);
 }
 }  // namespace cudf
diff --git a/cpp/src/reductions/scan/scan.cpp b/cpp/src/reductions/scan/scan.cpp
index b6e8690a6c9..de4dcf1de52 100644
--- a/cpp/src/reductions/scan/scan.cpp
+++ b/cpp/src/reductions/scan/scan.cpp
@@ -60,10 +60,11 @@ std::unique_ptr<column> scan(column_view const& input,
                              scan_aggregation const& agg,
                              scan_type inclusive,
                              null_policy null_handling,
+                             rmm::cuda_stream_view stream,
                              rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::scan(input, agg, inclusive, null_handling, cudf::get_default_stream(), mr);
+  return detail::scan(input, agg, inclusive, null_handling, stream, mr);
 }
 
 }  // namespace cudf
diff --git a/cpp/src/reductions/segmented/reductions.cpp b/cpp/src/reductions/segmented/reductions.cpp
index 1ae344dcace..48ab5963a29 100644
--- a/cpp/src/reductions/segmented/reductions.cpp
+++ b/cpp/src/reductions/segmented/reductions.cpp
@@ -138,17 +138,12 @@ std::unique_ptr<column> segmented_reduce(column_view const& segmented_values,
                                          segmented_reduce_aggregation const& agg,
                                          data_type output_dtype,
                                          null_policy null_handling,
+                                         rmm::cuda_stream_view stream,
                                          rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return reduction::detail::segmented_reduce(segmented_values,
-                                             offsets,
-                                             agg,
-                                             output_dtype,
-                                             null_handling,
-                                             std::nullopt,
-                                             cudf::get_default_stream(),
-                                             mr);
+  return reduction::detail::segmented_reduce(
+    segmented_values, offsets, agg, output_dtype, null_handling, std::nullopt, stream, mr);
 }
 
 std::unique_ptr<column> segmented_reduce(column_view const& segmented_values,
@@ -157,17 +152,12 @@ std::unique_ptr<column> segmented_reduce(column_view const& segmented_values,
                                          data_type output_dtype,
                                          null_policy null_handling,
                                          std::optional<std::reference_wrapper<scalar const>> init,
+                                         rmm::cuda_stream_view stream,
                                          rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return reduction::detail::segmented_reduce(segmented_values,
-                                             offsets,
-                                             agg,
-                                             output_dtype,
-                                             null_handling,
-                                             init,
-                                             cudf::get_default_stream(),
-                                             mr);
+  return reduction::detail::segmented_reduce(
+    segmented_values, offsets, agg, output_dtype, null_handling, init, stream, mr);
 }
 
 }  // namespace cudf
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index e779e1d1410..c2982c478cd 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -695,6 +695,7 @@ ConfigureTest(STREAM_POOL_TEST streams/pool_test.cu STREAM_MODE testing)
 ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_SORTING_TEST streams/sorting_test.cpp STREAM_MODE testing)
+ConfigureTest(STREAM_REDUCTION_TEST streams/reduction_test.cpp STREAM_MODE testing)
 ConfigureTest(
   STREAM_STRINGS_TEST
   streams/strings/case_test.cpp
diff --git a/cpp/tests/streams/reduction_test.cpp b/cpp/tests/streams/reduction_test.cpp
new file mode 100644
index 00000000000..53dd1eed459
--- /dev/null
+++ b/cpp/tests/streams/reduction_test.cpp
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/default_stream.hpp>
+#include <cudf_test/type_lists.hpp>
+
+#include <cudf/detail/aggregation/aggregation.hpp>
+#include <cudf/reduction.hpp>
+#include <cudf/scalar/scalar.hpp>
+#include <cudf/scalar/scalar_factories.hpp>
+
+class ReductionTest : public cudf::test::BaseFixture {};
+
+TEST_F(ReductionTest, ReductionSum)
+{
+  cudf::test::fixed_width_column_wrapper<int> input({1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
+  cudf::reduce(input,
+               *cudf::make_sum_aggregation<cudf::reduce_aggregation>(),
+               cudf::data_type(cudf::type_id::INT32),
+               cudf::test::get_default_stream());
+}
+
+TEST_F(ReductionTest, ReductionSumScalarInit)
+{
+  cudf::test::fixed_width_column_wrapper<int> input({1, 2, 3, 4, 5, 6, 7, 8, 9, 10});
+  auto const init_scalar = cudf::make_fixed_width_scalar<int>(3, cudf::test::get_default_stream());
+  cudf::reduce(input,
+               *cudf::make_sum_aggregation<cudf::reduce_aggregation>(),
+               cudf::data_type(cudf::type_id::INT32),
+               *init_scalar,
+               cudf::test::get_default_stream());
+}
+
+TEST_F(ReductionTest, SegmentedReductionSum)
+{
+  auto const input     = cudf::test::fixed_width_column_wrapper<int>{{1, 2, 3, 1, 0, 3, 1, 0, 0, 0},
+                                                                     {1, 1, 1, 1, 0, 1, 1, 0, 0, 0}};
+  auto const offsets   = std::vector<cudf::size_type>{0, 3, 6, 7, 8, 10, 10};
+  auto const d_offsets = cudf::detail::make_device_uvector_async(
+    offsets, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource());
+
+  auto res =
+    cudf::segmented_reduce(input,
+                           d_offsets,
+                           *cudf::make_sum_aggregation<cudf::segmented_reduce_aggregation>(),
+                           cudf::data_type(cudf::type_id::INT32),
+                           cudf::null_policy::EXCLUDE,
+                           cudf::test::get_default_stream());
+}
+
+TEST_F(ReductionTest, SegmentedReductionSumScalarInit)
+{
+  auto const input     = cudf::test::fixed_width_column_wrapper<int>{{1, 2, 3, 1, 0, 3, 1, 0, 0, 0},
+                                                                     {1, 1, 1, 1, 0, 1, 1, 0, 0, 0}};
+  auto const offsets   = std::vector<cudf::size_type>{0, 3, 6, 7, 8, 10, 10};
+  auto const d_offsets = cudf::detail::make_device_uvector_async(
+    offsets, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource());
+  auto const init_scalar = cudf::make_fixed_width_scalar<int>(3, cudf::test::get_default_stream());
+  auto res =
+    cudf::segmented_reduce(input,
+                           d_offsets,
+                           *cudf::make_sum_aggregation<cudf::segmented_reduce_aggregation>(),
+                           cudf::data_type(cudf::type_id::INT32),
+                           cudf::null_policy::EXCLUDE,
+                           *init_scalar,
+                           cudf::test::get_default_stream());
+}
+
+TEST_F(ReductionTest, ScanMin)
+{
+  auto const input = cudf::test::fixed_width_column_wrapper<int>{
+    {123, 64, 63, 99, -5, 123, -16, -120, -111}, {1, 0, 1, 1, 1, 1, 0, 0, 1}};
+
+  cudf::scan(input,
+             *cudf::make_min_aggregation<cudf::scan_aggregation>(),
+             cudf::scan_type::INCLUSIVE,
+             cudf::null_policy::EXCLUDE,
+             cudf::test::get_default_stream());
+}
+
+TEST_F(ReductionTest, MinMax)
+{
+  auto const input = cudf::test::fixed_width_column_wrapper<int>{
+    {123, 64, 63, 99, -5, 123, -16, -120, -111}, {1, 0, 1, 1, 1, 1, 0, 0, 1}};
+
+  cudf::minmax(input, cudf::test::get_default_stream());
+}

From 4e87069bd43ee969797265eaed00f82eda255dd4 Mon Sep 17 00:00:00 2001
From: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com>
Date: Wed, 15 May 2024 22:14:30 -0500
Subject: [PATCH 215/842] Cap the absolute row index per pass in parquet
 chunked reader. (#15735)

Fixes  https://github.com/rapidsai/cudf/issues/15690

There was an issue when computing page row counts/indices at the pass level in the chunked reader.  Because we estimate list row counts for pages we have not yet decompressed, this can sometimes lead to estimates row counts that are larger than the actual (known) number of rows for a pass.  This caused an out-of-bounds read down the line.  We were already handling this at the subpass level, just not at the pass level.

Also includes some fixes in debug logging code that is #ifdef'd out.

Authors:
  - https://github.com/nvdbaranec
  - David Wendt (https://github.com/davidwendt)
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15735
---
 cpp/src/io/parquet/reader_impl_chunking.cu | 31 +++++++++++++---------
 1 file changed, 19 insertions(+), 12 deletions(-)

diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu
index 912f53a8277..f4fb6bc57e6 100644
--- a/cpp/src/io/parquet/reader_impl_chunking.cu
+++ b/cpp/src/io/parquet/reader_impl_chunking.cu
@@ -101,7 +101,7 @@ void print_cumulative_page_info(device_span<PageInfo const> d_pages,
       printf("\tP %s: {%lu, %lu, %lu}\n",
              is_list ? "(L)" : "",
              pidx,
-             c_info[pidx].row_index,
+             c_info[pidx].end_row_index,
              c_info[pidx].size_bytes);
     }
   }
@@ -121,16 +121,17 @@ void print_cumulative_row_info(host_span<cumulative_page_info const> sizes,
   printf("------------\nCumulative sizes %s (index, row_index, size_bytes, page_key)\n",
          label.c_str());
   for (size_t idx = 0; idx < sizes.size(); idx++) {
-    printf("{%lu, %lu, %lu, %d}", idx, sizes[idx].row_index, sizes[idx].size_bytes, sizes[idx].key);
+    printf(
+      "{%lu, %lu, %lu, %d}", idx, sizes[idx].end_row_index, sizes[idx].size_bytes, sizes[idx].key);
     if (splits.has_value()) {
       // if we have a split at this row count and this is the last instance of this row count
       auto start             = thrust::make_transform_iterator(splits->begin(),
                                                    [](row_range const& i) { return i.skip_rows; });
       auto end               = start + splits->size();
-      auto split             = std::find(start, end, sizes[idx].row_index);
+      auto split             = std::find(start, end, sizes[idx].end_row_index);
       auto const split_index = [&]() -> int {
-        if (split != end &&
-            ((idx == sizes.size() - 1) || (sizes[idx + 1].row_index > sizes[idx].row_index))) {
+        if (split != end && ((idx == sizes.size() - 1) ||
+                             (sizes[idx + 1].end_row_index > sizes[idx].end_row_index))) {
           return static_cast<int>(std::distance(start, split));
         }
         return idx == 0 ? 0 : -1;
@@ -259,8 +260,9 @@ struct set_row_index {
     auto const& page          = pages[i];
     auto const& chunk         = chunks[page.chunk_idx];
     size_t const page_end_row = chunk.start_row + page.chunk_row + page.num_rows;
-    // if we have been passed in a cap, apply it
-    c_info[i].end_row_index = max_row > 0 ? min(max_row, page_end_row) : page_end_row;
+    // this cap is necessary because in the chunked reader, we use estimations for the row
+    // counts for list columns, which can result in values > than the absolute number of rows.
+    c_info[i].end_row_index = min(max_row, page_end_row);
   }
 };
 
@@ -461,6 +463,7 @@ adjust_cumulative_sizes(device_span<cumulative_page_info const> c_info,
                                                      thrust::make_discard_iterator(),
                                                      key_offsets.begin())
                                  .second;
+
   size_t const num_unique_keys = key_offsets_end - key_offsets.begin();
   thrust::exclusive_scan(
     rmm::exec_policy_nosync(stream), key_offsets.begin(), key_offsets.end(), key_offsets.begin());
@@ -1292,10 +1295,12 @@ void reader::impl::setup_next_pass(bool uses_custom_row_bounds)
     printf("\tnum_rows: %'lu\n", pass.num_rows);
     printf("\tbase mem usage: %'lu\n", pass.base_mem_size);
     auto const num_columns = _input_columns.size();
+    std::vector<size_type> h_page_offsets =
+      cudf::detail::make_std_vector_sync(pass.page_offsets, _stream);
     for (size_t c_idx = 0; c_idx < num_columns; c_idx++) {
       printf("\t\tColumn %'lu: num_pages(%'d)\n",
              c_idx,
-             pass.page_offsets[c_idx + 1] - pass.page_offsets[c_idx]);
+             h_page_offsets[c_idx + 1] - h_page_offsets[c_idx]);
     }
 #endif
 
@@ -1362,11 +1367,12 @@ void reader::impl::setup_next_subpass(bool uses_custom_row_bounds)
     // can be considerable.
     include_decompression_scratch_size(pass.chunks, pass.pages, c_info, _stream);
 
-    auto iter = thrust::make_counting_iterator(0);
+    auto iter               = thrust::make_counting_iterator(0);
+    auto const pass_max_row = pass.skip_rows + pass.num_rows;
     thrust::for_each(rmm::exec_policy_nosync(_stream),
                      iter,
                      iter + pass.pages.size(),
-                     set_row_index{pass.chunks, pass.pages, c_info, 0});
+                     set_row_index{pass.chunks, pass.pages, c_info, pass_max_row});
     // print_cumulative_page_info(pass.pages, pass.chunks, c_info, _stream);
 
     // get the next batch of pages
@@ -1448,11 +1454,12 @@ void reader::impl::setup_next_subpass(bool uses_custom_row_bounds)
   printf("\t\tTotal expected usage: %'lu\n",
          total_expected_size == 0 ? subpass.decomp_page_data.size() + pass.base_mem_size
                                   : total_expected_size + pass.base_mem_size);
+  std::vector<page_span> h_page_indices = cudf::detail::make_std_vector_sync(page_indices, _stream);
   for (size_t c_idx = 0; c_idx < num_columns; c_idx++) {
     printf("\t\tColumn %'lu: pages(%'lu - %'lu)\n",
            c_idx,
-           page_indices[c_idx].start,
-           page_indices[c_idx].end);
+           h_page_indices[c_idx].start,
+           h_page_indices[c_idx].end);
   }
   printf("\t\tOutput chunks:\n");
   for (size_t idx = 0; idx < subpass.output_chunk_read_info.size(); idx++) {

From 0a544c2ab3c14e3feff42380518f73778b8b3d7d Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Thu, 16 May 2024 10:15:58 +0100
Subject: [PATCH 216/842] Defer to C++ equality and hashing for pylibcudf
 DataType and Aggregation objects (#15732)

Since the C++ layer provides implementations of these, use them, rather than redoing an implementation. This avoids things ever getting out of sync.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15732
---
 python/cudf/cudf/_lib/pylibcudf/aggregation.pyx         | 8 ++++++++
 python/cudf/cudf/_lib/pylibcudf/libcudf/aggregation.pxd | 3 +++
 python/cudf/cudf/_lib/pylibcudf/libcudf/types.pxd       | 5 +++--
 python/cudf/cudf/_lib/pylibcudf/types.pyx               | 6 +++---
 4 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx b/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx
index 672b1ba2221..7bb64e32a1b 100644
--- a/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx
@@ -79,6 +79,14 @@ cdef class Aggregation:
             "Aggregations should not be constructed directly. Use one of the factories."
         )
 
+    def __eq__(self, other):
+        return type(self) is type(other) and (
+            dereference(self.c_obj).is_equal(dereference((<Aggregation>other).c_obj))
+        )
+
+    def __hash__(self):
+        return dereference(self.c_obj).do_hash()
+
     # TODO: Ideally we would include the return type here, but we need to do so
     # in a way that Sphinx understands (currently have issues due to
     # https://github.com/cython/cython/issues/5609).
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/aggregation.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/aggregation.pxd
index e0e01207589..8c14bc45723 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/aggregation.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/aggregation.pxd
@@ -1,4 +1,5 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
+from libc.stddef cimport size_t
 from libc.stdint cimport int32_t
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
@@ -51,6 +52,8 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil:
     cdef cppclass aggregation:
         Kind kind
         unique_ptr[aggregation] clone()
+        size_t do_hash() noexcept
+        bool is_equal(const aggregation const) noexcept
 
     cdef cppclass rolling_aggregation(aggregation):
         pass
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/types.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/types.pxd
index 13aebdff726..8e94ec296cf 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/types.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/types.pxd
@@ -88,8 +88,9 @@ cdef extern from "cudf/types.hpp" namespace "cudf" nogil:
         data_type(const data_type&) except +
         data_type(type_id id) except +
         data_type(type_id id, int32_t scale) except +
-        type_id id() except +
-        int32_t scale() except +
+        type_id id() noexcept
+        int32_t scale() noexcept
+        bool operator==(const data_type&, const data_type&) noexcept
 
     cpdef enum class interpolation(int32_t):
         LINEAR
diff --git a/python/cudf/cudf/_lib/pylibcudf/types.pyx b/python/cudf/cudf/_lib/pylibcudf/types.pyx
index ebe4d66fa20..de10196e289 100644
--- a/python/cudf/cudf/_lib/pylibcudf/types.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/types.pyx
@@ -47,9 +47,9 @@ cdef class DataType:
         return self.c_obj.scale()
 
     def __eq__(self, other):
-        if not isinstance(other, DataType):
-            return False
-        return self.id() == other.id() and self.scale() == other.scale()
+        return type(self) is type(other) and (
+            self.c_obj == (<DataType>other).c_obj
+        )
 
     @staticmethod
     cdef DataType from_libcudf(data_type dt):

From bdd48f1ce16982f31e01108280d91b5d2a1f8847 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Thu, 16 May 2024 06:05:35 -0500
Subject: [PATCH 217/842] Fix `DatetimeIndex.loc` for all types of ordering
 cases (#15761)

Fixes: #15742

This PR resolves issues with returning incorrect ranges for `DatetimeIndex.loc` when the index objects are monotonically decreasing. Additionally, I went ahead and fixed it for all cases, (i.e., random ordering) too.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/15761
---
 python/cudf/cudf/core/indexed_frame.py  | 20 +++++--
 python/cudf/cudf/tests/test_indexing.py | 74 +++++++++++++++++++++++++
 2 files changed, 88 insertions(+), 6 deletions(-)

diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 8d67afa34bc..7aae0d1729e 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -194,7 +194,6 @@ def _get_label_range_or_mask(index, start, stop, step):
     if (
         not (start is None and stop is None)
         and type(index) is cudf.core.index.DatetimeIndex
-        and index.is_monotonic_increasing is False
     ):
         start = pd.to_datetime(start)
         stop = pd.to_datetime(stop)
@@ -205,8 +204,8 @@ def _get_label_range_or_mask(index, start, stop, step):
                 # when we have a non-monotonic datetime index, return
                 # values in the slice defined by index_of(start) and
                 # index_of(end)
-                start_loc = index.get_loc(start.to_datetime64())
-                stop_loc = index.get_loc(stop.to_datetime64()) + 1
+                start_loc = index.get_loc(start)
+                stop_loc = index.get_loc(stop) + 1
                 return slice(start_loc, stop_loc)
             else:
                 raise KeyError(
@@ -214,10 +213,19 @@ def _get_label_range_or_mask(index, start, stop, step):
                     "DatetimeIndexes with non-existing keys is not allowed.",
                 )
         elif start is not None:
-            boolean_mask = index >= start
+            if index.is_monotonic_increasing:
+                return index >= start
+            elif index.is_monotonic_decreasing:
+                return index <= start
+            else:
+                return index.find_label_range(slice(start, stop, step))
         else:
-            boolean_mask = index <= stop
-        return boolean_mask
+            if index.is_monotonic_increasing:
+                return index <= stop
+            elif index.is_monotonic_decreasing:
+                return index >= stop
+            else:
+                return index.find_label_range(slice(start, stop, step))
     else:
         return index.find_label_range(slice(start, stop, step))
 
diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py
index b1d871b6abc..16754c3040b 100644
--- a/python/cudf/cudf/tests/test_indexing.py
+++ b/python/cudf/cudf/tests/test_indexing.py
@@ -2275,3 +2275,77 @@ def test_loc_setitem_empty_dataframe():
     gdf.loc[["index_1"], "new_col"] = "A"
 
     assert_eq(pdf, gdf)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        [15, 14, 12, 10, 1],
+        [1, 10, 12, 14, 15],
+    ],
+)
+@pytest.mark.parametrize(
+    "scalar",
+    [
+        1,
+        10,
+        15,
+        14,
+        0,
+        2,
+    ],
+)
+def test_loc_datetime_monotonic_with_ts(data, scalar):
+    gdf = cudf.DataFrame(
+        {"a": [1, 1, 1, 2, 2], "b": [1, 2, 3, 4, 5]},
+        index=cudf.Index(data, dtype="datetime64[ns]"),
+    )
+    pdf = gdf.to_pandas()
+
+    i = pd.Timestamp(scalar)
+
+    actual = gdf.loc[i:]
+    expected = pdf.loc[i:]
+
+    assert_eq(actual, expected)
+
+    actual = gdf.loc[:i]
+    expected = pdf.loc[:i]
+
+    assert_eq(actual, expected)
+
+
+@pytest.mark.parametrize("data", [[15, 14, 3, 10, 1]])
+@pytest.mark.parametrize("scalar", [1, 10, 15, 14, 0, 2])
+def test_loc_datetime_random_with_ts(data, scalar):
+    gdf = cudf.DataFrame(
+        {"a": [1, 1, 1, 2, 2], "b": [1, 2, 3, 4, 5]},
+        index=cudf.Index(data, dtype="datetime64[ns]"),
+    )
+    pdf = gdf.to_pandas()
+
+    i = pd.Timestamp(scalar)
+
+    if i not in pdf.index:
+        assert_exceptions_equal(
+            lambda: pdf.loc[i:],
+            lambda: gdf.loc[i:],
+            lfunc_args_and_kwargs=([],),
+            rfunc_args_and_kwargs=([],),
+        )
+        assert_exceptions_equal(
+            lambda: pdf.loc[:i],
+            lambda: gdf.loc[:i],
+            lfunc_args_and_kwargs=([],),
+            rfunc_args_and_kwargs=([],),
+        )
+    else:
+        actual = gdf.loc[i:]
+        expected = pdf.loc[i:]
+
+        assert_eq(actual, expected)
+
+        actual = gdf.loc[:i]
+        expected = pdf.loc[:i]
+
+        assert_eq(actual, expected)

From 1e92f3f962cb27175e804889fd6d8c9be18b98c9 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 16 May 2024 09:01:46 -0400
Subject: [PATCH 218/842] Reduce runtime for ParquetChunkedReaderInputLimitTest
 gtests (#15672)

Reduces the runtime for the `ParquetChunkedReaderInputLimitTest.List` and `ParquetChunkedReaderInputLimitTest.Mixed` which together are 1/3 the total time for `PARQUET_TEST`.
These two tests produce multi-GB test files that are not strictly necessary for testing the chunked reader since the chunk sizes are controllable. The changes here reduce the runtime for these 2 tests by about 1/3 the original runtime.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Paul Mattione (https://github.com/pmattione-nvidia)

URL: https://github.com/rapidsai/cudf/pull/15672
---
 cpp/tests/io/parquet_chunked_reader_test.cu | 28 ++++++++++-----------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/cpp/tests/io/parquet_chunked_reader_test.cu b/cpp/tests/io/parquet_chunked_reader_test.cu
index 58eee34a108..b3f3fac5a3d 100644
--- a/cpp/tests/io/parquet_chunked_reader_test.cu
+++ b/cpp/tests/io/parquet_chunked_reader_test.cu
@@ -1175,7 +1175,7 @@ TEST_F(ParquetChunkedReaderInputLimitTest, List)
   auto base_path      = temp_env->get_temp_filepath("list");
   auto test_filenames = input_limit_get_test_names(base_path);
 
-  constexpr int num_rows  = 50'000'000;
+  constexpr int num_rows  = 10'000'000;
   constexpr int list_size = 4;
 
   auto const stream = cudf::get_default_stream();
@@ -1225,14 +1225,14 @@ TEST_F(ParquetChunkedReaderInputLimitTest, List)
   //
   // Note that in the dictionary cases, both of these revert down to 1 chunk because the
   // dictionaries dramatically shrink the size of the uncompressed data.
-  constexpr int expected_a[] = {2, 2, 1, 1};
-  input_limit_test_read(test_filenames, tbl, 0, size_t{2} * 1024 * 1024 * 1024, expected_a);
+  constexpr int expected_a[] = {3, 3, 1, 1};
+  input_limit_test_read(test_filenames, tbl, 0, 256 * 1024 * 1024, expected_a);
   // smaller limit
-  constexpr int expected_b[] = {6, 6, 2, 1};
-  input_limit_test_read(test_filenames, tbl, 0, 512 * 1024 * 1024, expected_b);
+  constexpr int expected_b[] = {5, 5, 2, 1};
+  input_limit_test_read(test_filenames, tbl, 0, 128 * 1024 * 1024, expected_b);
   // include output chunking as well
-  constexpr int expected_c[] = {11, 11, 9, 8};
-  input_limit_test_read(test_filenames, tbl, 128 * 1024 * 1024, 512 * 1024 * 1024, expected_c);
+  constexpr int expected_c[] = {10, 9, 8, 7};
+  input_limit_test_read(test_filenames, tbl, 32 * 1024 * 1024, 64 * 1024 * 1024, expected_c);
 }
 
 void tiny_list_rowgroup_test(bool just_list_col)
@@ -1318,7 +1318,7 @@ TEST_F(ParquetChunkedReaderInputLimitTest, Mixed)
   auto base_path      = temp_env->get_temp_filepath("mixed_types");
   auto test_filenames = input_limit_get_test_names(base_path);
 
-  constexpr int num_rows  = 50'000'000;
+  constexpr int num_rows  = 10'000'000;
   constexpr int list_size = 4;
   constexpr int str_size  = 3;
 
@@ -1400,12 +1400,12 @@ TEST_F(ParquetChunkedReaderInputLimitTest, Mixed)
   //
   // Note that in the dictionary cases, both of these revert down to 1 chunk because the
   // dictionaries dramatically shrink the size of the uncompressed data.
-  constexpr int expected_a[] = {3, 3, 1, 1};
-  input_limit_test_read(test_filenames, tbl, 0, size_t{2} * 1024 * 1024 * 1024, expected_a);
+  constexpr int expected_a[] = {5, 5, 2, 1};
+  input_limit_test_read(test_filenames, tbl, 0, 256 * 1024 * 1024, expected_a);
   // smaller limit
-  constexpr int expected_b[] = {10, 11, 4, 1};
-  input_limit_test_read(test_filenames, tbl, 0, 512 * 1024 * 1024, expected_b);
+  constexpr int expected_b[] = {10, 9, 3, 1};
+  input_limit_test_read(test_filenames, tbl, 0, 128 * 1024 * 1024, expected_b);
   // include output chunking as well
-  constexpr int expected_c[] = {20, 21, 15, 14};
-  input_limit_test_read(test_filenames, tbl, 128 * 1024 * 1024, 512 * 1024 * 1024, expected_c);
+  constexpr int expected_c[] = {20, 18, 15, 12};
+  input_limit_test_read(test_filenames, tbl, 32 * 1024 * 1024, 64 * 1024 * 1024, expected_c);
 }

From bf255cb0414a439d1d61a06040f9b3c4003579b8 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 16 May 2024 10:12:27 -0400
Subject: [PATCH 219/842] Fix split-record result list column offset type
 (#15707)

Fixes offsets type for list column returned by `cudf::strings::split_record` and `cudf::strings::split_record_re` when large-strings enabled. The list column offsets type must be INT32. The code was changed to use the appropriate `make_offsets_child_column` utility function.
Also added some `is_large_strings_enabled()` checks to check-overflow gtests.
This allows all current gtests to pass when the large-strings support environment variable is set.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15707
---
 cpp/src/strings/split/split.cuh            |  4 ++--
 cpp/src/strings/split/split_re.cu          |  2 +-
 cpp/tests/column/factories_test.cpp        |  4 +++-
 cpp/tests/copying/concatenate_tests.cpp    |  7 +++++--
 cpp/tests/strings/array_tests.cpp          |  3 +++
 cpp/tests/strings/factories_test.cu        | 18 ++++++------------
 cpp/tests/strings/repeat_strings_tests.cpp |  5 ++++-
 7 files changed, 24 insertions(+), 19 deletions(-)

diff --git a/cpp/src/strings/split/split.cuh b/cpp/src/strings/split/split.cuh
index 160d1be3978..69a11aabfcd 100644
--- a/cpp/src/strings/split/split.cuh
+++ b/cpp/src/strings/split/split.cuh
@@ -365,8 +365,8 @@ std::pair<std::unique_ptr<column>, rmm::device_uvector<string_index_pair>> split
     });
 
   // create offsets from the counts for return to the caller
-  auto [offsets, total_tokens] = cudf::strings::detail::make_offsets_child_column(
-    token_counts.begin(), token_counts.end(), stream, mr);
+  auto [offsets, total_tokens] =
+    cudf::detail::make_offsets_child_column(token_counts.begin(), token_counts.end(), stream, mr);
   auto const d_tokens_offsets =
     cudf::detail::offsetalator_factory::make_input_iterator(offsets->view());
 
diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu
index 4dfb3e9ea62..6785ab9c893 100644
--- a/cpp/src/strings/split/split_re.cu
+++ b/cpp/src/strings/split/split_re.cu
@@ -147,7 +147,7 @@ std::pair<rmm::device_uvector<string_index_pair>, std::unique_ptr<column>> gener
   auto const begin = cudf::detail::make_counting_transform_iterator(0, map_fn);
   auto const end   = begin + strings_count;
 
-  auto [offsets, total_tokens] = cudf::strings::detail::make_offsets_child_column(
+  auto [offsets, total_tokens] = cudf::detail::make_offsets_child_column(
     begin, end, stream, rmm::mr::get_current_device_resource());
   auto const d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(offsets->view());
 
diff --git a/cpp/tests/column/factories_test.cpp b/cpp/tests/column/factories_test.cpp
index b06d097647d..afebc91dd73 100644
--- a/cpp/tests/column/factories_test.cpp
+++ b/cpp/tests/column/factories_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,6 +24,7 @@
 #include <cudf/null_mask.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
+#include <cudf/strings/detail/utilities.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
@@ -761,6 +762,7 @@ TEST_F(ColumnFactoryTest, FromStructScalarNull) { struct_from_scalar(false); }
 
 TEST_F(ColumnFactoryTest, FromScalarErrors)
 {
+  if (cudf::strings::detail::is_large_strings_enabled()) { return; }
   cudf::string_scalar ss("hello world");
   EXPECT_THROW(cudf::make_column_from_scalar(ss, 214748365), std::overflow_error);
 
diff --git a/cpp/tests/copying/concatenate_tests.cpp b/cpp/tests/copying/concatenate_tests.cpp
index a9bf22682cf..3b7bff69938 100644
--- a/cpp/tests/copying/concatenate_tests.cpp
+++ b/cpp/tests/copying/concatenate_tests.cpp
@@ -29,6 +29,7 @@
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/dictionary/encode.hpp>
 #include <cudf/filling.hpp>
+#include <cudf/strings/detail/utilities.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
@@ -188,6 +189,8 @@ TEST_F(StringColumnTest, ConcatenateManyColumns)
 
 TEST_F(StringColumnTest, ConcatenateTooLarge)
 {
+  if (cudf::strings::detail::is_large_strings_enabled()) { return; }
+
   std::string big_str(1000000, 'a');  // 1 million bytes x 5 = 5 million bytes
   cudf::test::strings_column_wrapper input{big_str, big_str, big_str, big_str, big_str};
   std::vector<cudf::column_view> input_cols;
@@ -374,7 +377,7 @@ TEST_F(OverflowTest, OverflowTest)
   }
 
   // string column, overflow on chars
-  {
+  if (!cudf::strings::detail::is_large_strings_enabled()) {
     constexpr auto size = static_cast<cudf::size_type>(static_cast<uint32_t>(1024) * 1024 * 1024);
 
     // try and concatenate 6 string columns of with 1 billion chars in each
@@ -497,7 +500,7 @@ TEST_F(OverflowTest, Presliced)
   }
 
   // strings, overflow on chars
-  {
+  if (!cudf::strings::detail::is_large_strings_enabled()) {
     constexpr cudf::size_type total_chars_size = 1024 * 1024 * 1024;
     constexpr cudf::size_type string_size      = 64;
     constexpr cudf::size_type num_rows         = total_chars_size / string_size;
diff --git a/cpp/tests/strings/array_tests.cpp b/cpp/tests/strings/array_tests.cpp
index b22d7257041..a1bb87a43fb 100644
--- a/cpp/tests/strings/array_tests.cpp
+++ b/cpp/tests/strings/array_tests.cpp
@@ -23,6 +23,7 @@
 #include <cudf/copying.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/sorting.hpp>
+#include <cudf/strings/detail/utilities.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
@@ -152,6 +153,8 @@ TEST_F(StringsColumnTest, GatherZeroSizeStringsColumn)
 
 TEST_F(StringsColumnTest, GatherTooBig)
 {
+  if (cudf::strings::detail::is_large_strings_enabled()) { return; }
+
   std::vector<int8_t> h_chars(3000000);
   cudf::test::fixed_width_column_wrapper<int8_t> chars(h_chars.begin(), h_chars.end());
   cudf::test::fixed_width_column_wrapper<cudf::size_type> offsets({0, 3000000});
diff --git a/cpp/tests/strings/factories_test.cu b/cpp/tests/strings/factories_test.cu
index 64123690aea..35d648f16e0 100644
--- a/cpp/tests/strings/factories_test.cu
+++ b/cpp/tests/strings/factories_test.cu
@@ -17,6 +17,7 @@
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/iterator_utilities.hpp>
 
 #include <cudf/column/column_factories.hpp>
 #include <cudf/copying.hpp>
@@ -96,18 +97,11 @@ TEST_F(StringsFactoriesTest, CreateColumnFromPair)
   EXPECT_EQ(strings_view.chars_size(cudf::get_default_stream()), memsize);
 
   // check string data
-  auto h_chars_data = cudf::detail::make_std_vector_sync(
-    cudf::device_span<char const>(strings_view.chars_begin(cudf::get_default_stream()),
-                                  strings_view.chars_size(cudf::get_default_stream())),
-    cudf::get_default_stream());
-  auto h_offsets_data = cudf::detail::make_std_vector_sync(
-    cudf::device_span<cudf::size_type const>(
-      strings_view.offsets().data<cudf::size_type>() + strings_view.offset(),
-      strings_view.size() + 1),
-    cudf::get_default_stream());
-  EXPECT_EQ(memcmp(h_buffer.data(), h_chars_data.data(), h_buffer.size()), 0);
-  EXPECT_EQ(
-    memcmp(h_offsets.data(), h_offsets_data.data(), h_offsets.size() * sizeof(cudf::size_type)), 0);
+  cudf::test::strings_column_wrapper expected(
+    h_test_strings.begin(),
+    h_test_strings.end(),
+    cudf::test::iterators::nulls_from_nullptrs(h_test_strings));
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(column->view(), expected);
 }
 
 TEST_F(StringsFactoriesTest, CreateColumnFromOffsets)
diff --git a/cpp/tests/strings/repeat_strings_tests.cpp b/cpp/tests/strings/repeat_strings_tests.cpp
index 9d08ac9c00c..0539895c5f4 100644
--- a/cpp/tests/strings/repeat_strings_tests.cpp
+++ b/cpp/tests/strings/repeat_strings_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/scalar/scalar.hpp>
+#include <cudf/strings/detail/utilities.hpp>
 #include <cudf/strings/repeat_strings.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 
@@ -220,6 +221,8 @@ TEST_F(RepeatStringsTest, StringsColumnWithColumnRepeatTimesInvalidInput)
 
 TEST_F(RepeatStringsTest, StringsColumnWithColumnRepeatTimesOverflowOutput)
 {
+  if (cudf::strings::detail::is_large_strings_enabled()) { return; }
+
   auto const strs    = strs_col{"1", "12", "123", "1234", "12345", "123456", "1234567"};
   auto const strs_cv = cudf::strings_column_view(strs);
 

From c7fe7fe30853763ed790d0396129e33a583b47e8 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 16 May 2024 10:43:58 -0400
Subject: [PATCH 220/842] Fix multibyte check for case convert for large
 strings (#15721)

Fixes check for multibyte characters on large strings column. The `thrust::count_if` exceeds the int64 reduce type maximum and so the logic was recoded as a native kernel. Added additional tests and fixed subsequent errors where kernels are launched with greater than max(size_type) threads.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Karthikeyan (https://github.com/karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/15721
---
 cpp/benchmarks/string/case.cpp                |  2 +-
 cpp/src/strings/case.cu                       | 88 ++++++++++++++-----
 cpp/src/strings/copying/concatenate.cu        | 18 ++--
 cpp/tests/CMakeLists.txt                      |  3 +-
 cpp/tests/large_strings/case_tests.cpp        | 52 +++++++++++
 cpp/tests/large_strings/concatenate_tests.cpp | 13 +++
 6 files changed, 141 insertions(+), 35 deletions(-)
 create mode 100644 cpp/tests/large_strings/case_tests.cpp

diff --git a/cpp/benchmarks/string/case.cpp b/cpp/benchmarks/string/case.cpp
index a7db972d39f..cd4d3ca964b 100644
--- a/cpp/benchmarks/string/case.cpp
+++ b/cpp/benchmarks/string/case.cpp
@@ -75,5 +75,5 @@ void bench_case(nvbench::state& state)
 NVBENCH_BENCH(bench_case)
   .set_name("case")
   .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048})
-  .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216})
+  .add_int64_axis("num_rows", {32768, 262144, 2097152, 16777216})
   .add_string_axis("encoding", {"ascii", "utf8"});
diff --git a/cpp/src/strings/case.cu b/cpp/src/strings/case.cu
index 77c014301ba..c1688d20791 100644
--- a/cpp/src/strings/case.cu
+++ b/cpp/src/strings/case.cu
@@ -34,6 +34,7 @@
 #include <rmm/exec_policy.hpp>
 #include <rmm/resource_ref.hpp>
 
+#include <cub/cub.cuh>
 #include <cuda/atomic>
 #include <cuda/functional>
 #include <thrust/for_each.h>
@@ -237,13 +238,16 @@ CUDF_KERNEL void count_bytes_kernel(convert_char_fn converter,
   auto const d_str   = d_strings.element<string_view>(str_idx);
   auto const str_ptr = d_str.data();
 
+  // each thread processes 4 bytes
   size_type size = 0;
-  for (auto i = lane_idx; i < d_str.size_bytes(); i += cudf::detail::warp_size) {
-    auto const chr = str_ptr[i];
-    if (is_utf8_continuation_char(chr)) { continue; }
-    char_utf8 u8 = 0;
-    to_char_utf8(str_ptr + i, u8);
-    size += converter.process_character(u8);
+  for (auto i = lane_idx * 4; i < d_str.size_bytes(); i += cudf::detail::warp_size * 4) {
+    for (auto j = i; (j < (i + 4)) && (j < d_str.size_bytes()); j++) {
+      auto const chr = str_ptr[j];
+      if (is_utf8_continuation_char(chr)) { continue; }
+      char_utf8 u8 = 0;
+      to_char_utf8(str_ptr + j, u8);
+      size += converter.process_character(u8);
+    }
   }
   // this is slightly faster than using the cub::warp_reduce
   if (size > 0) {
@@ -260,6 +264,41 @@ struct ascii_converter_fn {
   __device__ char operator()(char chr) { return converter.process_ascii(chr); }
 };
 
+constexpr int64_t block_size       = 512;
+constexpr int64_t bytes_per_thread = 8;
+
+/**
+ * @brief Checks the chars data for any multibyte characters
+ *
+ * The output count is not accurate but it is only checked for > 0.
+ */
+CUDF_KERNEL void has_multibytes_kernel(char const* d_input_chars,
+                                       int64_t first_offset,
+                                       int64_t last_offset,
+                                       int64_t* d_output)
+{
+  auto const idx = cudf::detail::grid_1d::global_thread_id();
+  // read only every 2nd byte; all bytes in a multibyte char have high bit set
+  auto const byte_idx = (static_cast<int64_t>(idx) * bytes_per_thread) + first_offset;
+  auto const lane_idx = static_cast<cudf::size_type>(threadIdx.x);
+
+  using block_reduce = cub::BlockReduce<int64_t, block_size>;
+  __shared__ typename block_reduce::TempStorage temp_storage;
+
+  // each thread processes 8 bytes (only 4 need to be checked)
+  int64_t mb_count = 0;
+  for (auto i = byte_idx; (i < (byte_idx + bytes_per_thread)) && (i < last_offset); i += 2) {
+    u_char const chr = static_cast<u_char>(d_input_chars[i]);
+    mb_count += ((chr & 0x80) > 0);
+  }
+  auto const mb_total = block_reduce(temp_storage).Reduce(mb_count, cub::Sum());
+
+  if ((lane_idx == 0) && (mb_total > 0)) {
+    cuda::atomic_ref<int64_t, cuda::thread_scope_block> ref{*d_output};
+    ref.fetch_add(mb_total, cuda::std::memory_order_relaxed);
+  }
+}
+
 /**
  * @brief Utility method for converting upper and lower case characters
  * in a strings column
@@ -289,7 +328,8 @@ std::unique_ptr<column> convert_case(strings_column_view const& input,
                                                       input.offsets(), input.offset(), stream);
   auto const last_offset =
     cudf::strings::detail::get_offset_value(input.offsets(), input.size() + input.offset(), stream);
-  auto const chars_size = last_offset - first_offset;
+  auto const chars_size  = last_offset - first_offset;
+  auto const input_chars = input.chars_begin(stream);
 
   convert_char_fn ccfn{case_flag, d_flags, d_cases, d_special};
   upper_lower_fn converter{ccfn, *d_strings};
@@ -306,16 +346,15 @@ std::unique_ptr<column> convert_case(strings_column_view const& input,
 
   // Check if the input contains any multi-byte characters.
   // This check incurs ~20% performance hit for smaller strings and so we only use it
-  // after the threshold check above. The check makes very little impact for larger strings
+  // after the threshold check above. The check makes very little impact for long strings
   // but results in a large performance gain when the input contains only single-byte characters.
-  // The count_if is faster than any_of or all_of: https://github.com/NVIDIA/thrust/issues/1016
-  bool const multi_byte_chars =
-    thrust::count_if(rmm::exec_policy(stream),
-                     input.chars_begin(stream),
-                     input.chars_end(stream),
-                     cuda::proclaim_return_type<bool>(
-                       [] __device__(auto chr) { return is_utf8_continuation_char(chr); })) > 0;
-  if (!multi_byte_chars) {
+  rmm::device_scalar<int64_t> mb_count(0, stream);
+  // cudf::detail::grid_1d is limited to size_type elements
+  auto const num_blocks = util::div_rounding_up_safe(chars_size / bytes_per_thread, block_size);
+  // we only need to check every other byte since either will contain high bit
+  has_multibytes_kernel<<<num_blocks, block_size, 0, stream.value()>>>(
+    input_chars, first_offset, last_offset, mb_count.data());
+  if (mb_count.value(stream) == 0) {
     // optimization for ASCII-only case: copy the input column and inplace replace each character
     auto result  = std::make_unique<column>(input.parent(), stream, mr);
     auto d_chars = result->mutable_view().head<char>();
@@ -329,21 +368,21 @@ std::unique_ptr<column> convert_case(strings_column_view const& input,
   // note: tried to use segmented-reduce approach instead here and it was consistently slower
   auto [offsets, bytes] = [&] {
     rmm::device_uvector<size_type> sizes(input.size(), stream);
-    constexpr int block_size = 512;
-    cudf::detail::grid_1d grid{input.size() * cudf::detail::warp_size, block_size};
-    count_bytes_kernel<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
+    // cudf::detail::grid_1d is limited to size_type threads
+    auto const num_blocks = util::div_rounding_up_safe(
+      static_cast<int64_t>(input.size()) * cudf::detail::warp_size, block_size);
+    count_bytes_kernel<<<num_blocks, block_size, 0, stream.value()>>>(
       ccfn, *d_strings, sizes.data());
     // convert sizes to offsets
     return cudf::strings::detail::make_offsets_child_column(sizes.begin(), sizes.end(), stream, mr);
   }();
 
   // build sub-offsets
-  auto const input_chars = input.chars_begin(stream);
-  auto const sub_count   = chars_size / LS_SUB_BLOCK_SIZE;
-  auto tmp_offsets       = rmm::device_uvector<int64_t>(sub_count + input.size() + 1, stream);
+  auto const sub_count = chars_size / LS_SUB_BLOCK_SIZE;
+  auto tmp_offsets     = rmm::device_uvector<int64_t>(sub_count + input.size() + 1, stream);
   {
-    rmm::device_uvector<size_type> sub_offsets(sub_count, stream);
-    auto const count_itr = thrust::make_counting_iterator<size_type>(0);
+    rmm::device_uvector<int64_t> sub_offsets(sub_count, stream);
+    auto const count_itr = thrust::make_counting_iterator<int64_t>(0);
     thrust::transform(rmm::exec_policy_nosync(stream),
                       count_itr,
                       count_itr + sub_count,
@@ -359,6 +398,7 @@ std::unique_ptr<column> convert_case(strings_column_view const& input,
                   sub_offsets.begin(),
                   sub_offsets.end(),
                   tmp_offsets.begin());
+    stream.synchronize();  // protect against destruction of sub_offsets
   }
 
   // run case conversion over the new sub-strings
diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu
index 5daacbdc2fa..7622e39e735 100644
--- a/cpp/src/strings/copying/concatenate.cu
+++ b/cpp/src/strings/copying/concatenate.cu
@@ -265,15 +265,15 @@ std::unique_ptr<column> concatenate(host_span<column_view const> columns,
     // Use a heuristic to guess when the fused kernel will be faster than memcpy
     if (use_fused_kernel_heuristic(has_nulls, total_bytes, columns.size())) {
       // Use single kernel launch to copy chars columns
-      constexpr size_type block_size{256};
-      cudf::detail::grid_1d config(total_bytes, block_size);
-      auto const kernel = fused_concatenate_string_chars_kernel;
-      kernel<<<config.num_blocks, config.num_threads_per_block, 0, stream.value()>>>(
-        d_views,
-        d_partition_offsets.data(),
-        static_cast<size_type>(columns.size()),
-        total_bytes,
-        d_new_chars);
+      constexpr size_t block_size{256};
+      // cudf::detail::grid_1d limited to size_type elements
+      auto const num_blocks = util::div_rounding_up_safe(total_bytes, block_size);
+      auto const kernel     = fused_concatenate_string_chars_kernel;
+      kernel<<<num_blocks, block_size, 0, stream.value()>>>(d_views,
+                                                            d_partition_offsets.data(),
+                                                            static_cast<size_type>(columns.size()),
+                                                            total_bytes,
+                                                            d_new_chars);
     } else {
       // Memcpy each input chars column (more efficient for very large strings)
       for (auto column = columns.begin(); column != columns.end(); ++column) {
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index c2982c478cd..db934818ae7 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -572,9 +572,10 @@ ConfigureTest(
 # * large strings test ----------------------------------------------------------------------------
 ConfigureTest(
   LARGE_STRINGS_TEST
+  large_strings/concatenate_tests.cpp
+  large_strings/case_tests.cpp
   large_strings/large_strings_fixture.cpp
   large_strings/merge_tests.cpp
-  large_strings/concatenate_tests.cpp
   large_strings/parquet_tests.cpp
   large_strings/reshape_tests.cpp
   GPUS 1
diff --git a/cpp/tests/large_strings/case_tests.cpp b/cpp/tests/large_strings/case_tests.cpp
new file mode 100644
index 00000000000..e56d984421a
--- /dev/null
+++ b/cpp/tests/large_strings/case_tests.cpp
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "large_strings_fixture.hpp"
+
+#include <cudf_test/column_utilities.hpp>
+
+#include <cudf/concatenate.hpp>
+#include <cudf/copying.hpp>
+#include <cudf/strings/case.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+
+#include <vector>
+
+struct CaseTest : public cudf::test::StringsLargeTest {};
+
+TEST_F(CaseTest, ToLower)
+{
+  auto const wide = this->wide_column();
+  auto input      = cudf::concatenate(std::vector<cudf::column_view>(120000, wide));  // 230MB
+  auto expected   = cudf::strings::to_lower(cudf::strings_column_view(input->view()));
+
+  int const multiplier = 12;
+  std::vector<cudf::column_view> input_cols(multiplier, input->view());
+  std::vector<cudf::size_type> splits;
+  std::generate_n(std::back_inserter(splits), multiplier - 1, [&input, n = 1]() mutable {
+    return input->view().size() * (n++);
+  });
+
+  auto large_input = cudf::concatenate(input_cols);  // 2700MB > 2GB
+  auto const sv    = cudf::strings_column_view(large_input->view());
+  auto result      = cudf::strings::to_lower(sv);
+
+  // verify results in sections
+  auto sliced = cudf::split(result->view(), splits);
+  for (auto c : sliced) {
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, expected->view());
+  }
+}
diff --git a/cpp/tests/large_strings/concatenate_tests.cpp b/cpp/tests/large_strings/concatenate_tests.cpp
index aa445bf761b..89be2c307bf 100644
--- a/cpp/tests/large_strings/concatenate_tests.cpp
+++ b/cpp/tests/large_strings/concatenate_tests.cpp
@@ -63,3 +63,16 @@ TEST_F(ConcatenateTest, ConcatenateVertical)
     CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, input);
   }
 }
+
+TEST_F(ConcatenateTest, ManyColumns)
+{
+  auto input           = this->wide_column();
+  auto view            = cudf::column_view(input);
+  int const multiplier = 1200000;
+  std::vector<cudf::column_view> input_cols(multiplier, view);  // 2500MB > 2GB
+  // this tests a unique path through the code
+  auto result = cudf::concatenate(input_cols);
+  auto sv     = cudf::strings_column_view(result->view());
+  EXPECT_EQ(sv.size(), view.size() * multiplier);
+  EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64});
+}

From 47ed34551f860cb2bcc187d806a5d7612fbea38d Mon Sep 17 00:00:00 2001
From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com>
Date: Thu, 16 May 2024 10:47:57 -0500
Subject: [PATCH 221/842] Fix parquet predicate filtering with column
 projection (#15113)

Fixes #15051

The predicate filtering in parquet did not work while column projection is used. This PR fixes that limitation.

With this PR change, the user will be able to use both column name reference and column index reference in the filter.
- column name reference: the filters may specify any columns by name even if they are not present in column projection.
- column reference (index): The indices used should be the indices of output columns in the requested order.

This is achieved by extracting column names from filter and add to output buffers, after predicate filtering is done, these filter-only columns are removed and only requested columns are returned.
The change includes reading only output columns' statistics data instead of all root columns.

Summary of changes:
- `get_column_names_in_expression` extracts column names in filter.
- The extra columns in filter are added to output buffers during reader initialization
  - `cpp/src/io/parquet/reader_impl_helpers.cpp`, `cpp/src/io/parquet/reader_impl.cpp`
- instead of extracting statistics data of all root columns, it extracts for only output columns (including columns in filter)
  - `cpp/src/io/parquet/predicate_pushdown.cpp`
  - To do this, output column schemas and its dtypes should be cached.
  - statistics data extraction code is updated to check for `schema_idx` in row group metadata.
  - No need to convert filter again for all root columns, reuse the passed output columns reference filter.
  - Rest of the code is same.
- After the output filter predicate is calculated, these filter-only columns are removed
- moved `named_to_reference_converter` constructor to cpp, and remove used constructor.
- small include<> cleanup

Authors:
  - Karthikeyan (https://github.com/karthikeyann)
  - Vukasin Milovanovic (https://github.com/vuule)
  - Muhammad Haseeb (https://github.com/mhaseeb123)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)
  - Vukasin Milovanovic (https://github.com/vuule)
  - Muhammad Haseeb (https://github.com/mhaseeb123)

URL: https://github.com/rapidsai/cudf/pull/15113
---
 cpp/include/cudf/io/parquet.hpp              |  29 +++-
 cpp/src/io/parquet/predicate_pushdown.cpp    | 139 ++++++++++++++++---
 cpp/src/io/parquet/reader_impl.cpp           |  20 ++-
 cpp/src/io/parquet/reader_impl.hpp           |   3 +
 cpp/src/io/parquet/reader_impl_chunking.cu   |   1 +
 cpp/src/io/parquet/reader_impl_helpers.cpp   |  37 +++--
 cpp/src/io/parquet/reader_impl_helpers.hpp   |  44 +++---
 cpp/src/io/parquet/reader_impl_preprocess.cu |  18 ++-
 cpp/tests/io/parquet_reader_test.cpp         |  50 +++++++
 9 files changed, 276 insertions(+), 65 deletions(-)

diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp
index 7f034668e43..b2f949cdcee 100644
--- a/cpp/include/cudf/io/parquet.hpp
+++ b/cpp/include/cudf/io/parquet.hpp
@@ -205,6 +205,31 @@ class parquet_reader_options {
   /**
    * @brief Sets AST based filter for predicate pushdown.
    *
+   * The filter can utilize cudf::ast::column_name_reference to reference a column by its name,
+   * even if it's not necessarily present in the requested projected columns.
+   * To refer to output column indices, you can use cudf::ast::column_reference.
+   *
+   * For a parquet with columns ["A", "B", "C", ... "X", "Y", "Z"],
+   * Example 1: with/without column projection
+   * @code
+   * use_columns({"A", "X", "Z"})
+   * .filter(operation(ast_operator::LESS, column_name_reference{"C"}, literal{100}));
+   * @endcode
+   * Column "C" need not be present in output table.
+   * Example 2: without column projection
+   * @code
+   * filter(operation(ast_operator::LESS, column_reference{1}, literal{100}));
+   * @endcode
+   * Here, `1` will refer to column "B" because output will contain all columns in
+   * order ["A", ..., "Z"].
+   * Example 3: with column projection
+   * @code
+   * use_columns({"A", "Z", "X"})
+   * .filter(operation(ast_operator::LESS, column_reference{1}, literal{100}));
+   * @endcode
+   * Here, `1` will refer to column "Z" because output will contain 3 columns in
+   * order ["A", "Z", "X"].
+   *
    * @param filter AST expression to use as filter
    */
   void set_filter(ast::expression const& filter) { _filter = filter; }
@@ -309,9 +334,7 @@ class parquet_reader_options_builder {
   }
 
   /**
-   * @brief Sets vector of individual row groups to read.
-   *
-   * @param filter Vector of row groups to read
+   * @copydoc parquet_reader_options::set_filter
    * @return this for chaining
    */
   parquet_reader_options_builder& filter(ast::expression const& filter)
diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp
index 9869dafadfb..0109be661a7 100644
--- a/cpp/src/io/parquet/predicate_pushdown.cpp
+++ b/cpp/src/io/parquet/predicate_pushdown.cpp
@@ -31,10 +31,12 @@
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
+#include <thrust/iterator/counting_iterator.h>
+
 #include <algorithm>
-#include <list>
 #include <numeric>
 #include <optional>
+#include <unordered_set>
 
 namespace cudf::io::parquet::detail {
 
@@ -127,7 +129,7 @@ struct stats_caster {
   // Creates device columns from column statistics (min, max)
   template <typename T>
   std::pair<std::unique_ptr<column>, std::unique_ptr<column>> operator()(
-    size_t col_idx,
+    int schema_idx,
     cudf::data_type dtype,
     rmm::cuda_stream_view stream,
     rmm::device_async_resource_ref mr) const
@@ -206,22 +208,31 @@ struct stats_caster {
       };  // local struct host_column
       host_column min(total_row_groups);
       host_column max(total_row_groups);
-
       size_type stats_idx = 0;
       for (size_t src_idx = 0; src_idx < row_group_indices.size(); ++src_idx) {
         for (auto const rg_idx : row_group_indices[src_idx]) {
           auto const& row_group = per_file_metadata[src_idx].row_groups[rg_idx];
-          auto const& colchunk  = row_group.columns[col_idx];
-          // To support deprecated min, max fields.
-          auto const& min_value = colchunk.meta_data.statistics.min_value.has_value()
-                                    ? colchunk.meta_data.statistics.min_value
-                                    : colchunk.meta_data.statistics.min;
-          auto const& max_value = colchunk.meta_data.statistics.max_value.has_value()
-                                    ? colchunk.meta_data.statistics.max_value
-                                    : colchunk.meta_data.statistics.max;
-          // translate binary data to Type then to <T>
-          min.set_index(stats_idx, min_value, colchunk.meta_data.type);
-          max.set_index(stats_idx, max_value, colchunk.meta_data.type);
+          auto col              = std::find_if(
+            row_group.columns.begin(),
+            row_group.columns.end(),
+            [schema_idx](ColumnChunk const& col) { return col.schema_idx == schema_idx; });
+          if (col != std::end(row_group.columns)) {
+            auto const& colchunk = *col;
+            // To support deprecated min, max fields.
+            auto const& min_value = colchunk.meta_data.statistics.min_value.has_value()
+                                      ? colchunk.meta_data.statistics.min_value
+                                      : colchunk.meta_data.statistics.min;
+            auto const& max_value = colchunk.meta_data.statistics.max_value.has_value()
+                                      ? colchunk.meta_data.statistics.max_value
+                                      : colchunk.meta_data.statistics.max;
+            // translate binary data to Type then to <T>
+            min.set_index(stats_idx, min_value, colchunk.meta_data.type);
+            max.set_index(stats_idx, max_value, colchunk.meta_data.type);
+          } else {
+            // Marking it null, if column present in row group
+            min.set_index(stats_idx, thrust::nullopt, {});
+            max.set_index(stats_idx, thrust::nullopt, {});
+          }
           stats_idx++;
         }
       };
@@ -378,6 +389,7 @@ class stats_expression_converter : public ast::detail::expression_transformer {
 std::optional<std::vector<std::vector<size_type>>> aggregate_reader_metadata::filter_row_groups(
   host_span<std::vector<size_type> const> row_group_indices,
   host_span<data_type const> output_dtypes,
+  host_span<int const> output_column_schemas,
   std::reference_wrapper<ast::expression const> filter,
   rmm::cuda_stream_view stream) const
 {
@@ -412,7 +424,8 @@ std::optional<std::vector<std::vector<size_type>>> aggregate_reader_metadata::fi
   std::vector<std::unique_ptr<column>> columns;
   stats_caster stats_col{total_row_groups, per_file_metadata, input_row_group_indices};
   for (size_t col_idx = 0; col_idx < output_dtypes.size(); col_idx++) {
-    auto const& dtype = output_dtypes[col_idx];
+    auto const schema_idx = output_column_schemas[col_idx];
+    auto const& dtype     = output_dtypes[col_idx];
     // Only comparable types except fixed point are supported.
     if (cudf::is_compound(dtype) && dtype.id() != cudf::type_id::STRING) {
       // placeholder only for unsupported types.
@@ -423,14 +436,14 @@ std::optional<std::vector<std::vector<size_type>>> aggregate_reader_metadata::fi
       continue;
     }
     auto [min_col, max_col] =
-      cudf::type_dispatcher<dispatch_storage_type>(dtype, stats_col, col_idx, dtype, stream, mr);
+      cudf::type_dispatcher<dispatch_storage_type>(dtype, stats_col, schema_idx, dtype, stream, mr);
     columns.push_back(std::move(min_col));
     columns.push_back(std::move(max_col));
   }
   auto stats_table = cudf::table(std::move(columns));
 
   // Converts AST to StatsAST with reference to min, max columns in above `stats_table`.
-  stats_expression_converter stats_expr{filter, static_cast<size_type>(output_dtypes.size())};
+  stats_expression_converter stats_expr{filter.get(), static_cast<size_type>(output_dtypes.size())};
   auto stats_ast     = stats_expr.get_stats_expr();
   auto predicate_col = cudf::detail::compute_column(stats_table, stats_ast.get(), stream, mr);
   auto predicate     = predicate_col->view();
@@ -475,6 +488,20 @@ std::optional<std::vector<std::vector<size_type>>> aggregate_reader_metadata::fi
 }
 
 // convert column named expression to column index reference expression
+named_to_reference_converter::named_to_reference_converter(
+  std::optional<std::reference_wrapper<ast::expression const>> expr, table_metadata const& metadata)
+{
+  if (!expr.has_value()) return;
+  // create map for column name.
+  std::transform(metadata.schema_info.cbegin(),
+                 metadata.schema_info.cend(),
+                 thrust::counting_iterator<size_t>(0),
+                 std::inserter(column_name_to_index, column_name_to_index.end()),
+                 [](auto const& sch, auto index) { return std::make_pair(sch.name, index); });
+
+  expr.value().get().accept(*this);
+}
+
 std::reference_wrapper<ast::expression const> named_to_reference_converter::visit(
   ast::literal const& expr)
 {
@@ -530,4 +557,82 @@ named_to_reference_converter::visit_operands(
   return transformed_operands;
 }
 
+/**
+ * @brief Converts named columns to index reference columns
+ *
+ */
+class names_from_expression : public ast::detail::expression_transformer {
+ public:
+  names_from_expression(std::optional<std::reference_wrapper<ast::expression const>> expr,
+                        std::vector<std::string> const& skip_names)
+    : _skip_names(skip_names.cbegin(), skip_names.cend())
+  {
+    if (!expr.has_value()) return;
+    expr.value().get().accept(*this);
+  }
+
+  /**
+   * @copydoc ast::detail::expression_transformer::visit(ast::literal const& )
+   */
+  std::reference_wrapper<ast::expression const> visit(ast::literal const& expr) override
+  {
+    return expr;
+  }
+  /**
+   * @copydoc ast::detail::expression_transformer::visit(ast::column_reference const& )
+   */
+  std::reference_wrapper<ast::expression const> visit(ast::column_reference const& expr) override
+  {
+    return expr;
+  }
+  /**
+   * @copydoc ast::detail::expression_transformer::visit(ast::column_name_reference const& )
+   */
+  std::reference_wrapper<ast::expression const> visit(
+    ast::column_name_reference const& expr) override
+  {
+    // collect column names
+    auto col_name = expr.get_column_name();
+    if (_skip_names.count(col_name) == 0) { _column_names.insert(col_name); }
+    return expr;
+  }
+  /**
+   * @copydoc ast::detail::expression_transformer::visit(ast::operation const& )
+   */
+  std::reference_wrapper<ast::expression const> visit(ast::operation const& expr) override
+  {
+    visit_operands(expr.get_operands());
+    return expr;
+  }
+
+  /**
+   * @brief Returns the column names in AST.
+   *
+   * @return AST operation expression
+   */
+  [[nodiscard]] std::vector<std::string> to_vector() &&
+  {
+    return {std::make_move_iterator(_column_names.begin()),
+            std::make_move_iterator(_column_names.end())};
+  }
+
+ private:
+  void visit_operands(std::vector<std::reference_wrapper<ast::expression const>> operands)
+  {
+    for (auto const& operand : operands) {
+      operand.get().accept(*this);
+    }
+  }
+
+  std::unordered_set<std::string> _column_names;
+  std::unordered_set<std::string> _skip_names;
+};
+
+[[nodiscard]] std::vector<std::string> get_column_names_in_expression(
+  std::optional<std::reference_wrapper<ast::expression const>> expr,
+  std::vector<std::string> const& skip_names)
+{
+  return names_from_expression(expr, skip_names).to_vector();
+}
+
 }  // namespace cudf::io::parquet::detail
diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp
index 5b7c180195b..b0d19ad00f3 100644
--- a/cpp/src/io/parquet/reader_impl.cpp
+++ b/cpp/src/io/parquet/reader_impl.cpp
@@ -26,6 +26,8 @@
 
 #include <rmm/resource_ref.hpp>
 
+#include <thrust/iterator/counting_iterator.h>
+
 #include <bitset>
 #include <numeric>
 
@@ -436,9 +438,18 @@ reader::impl::impl(std::size_t chunk_read_limit,
   // Binary columns can be read as binary or strings
   _reader_column_schema = options.get_column_schema();
 
-  // Select only columns required by the options
+  // Select only columns required by the options and filter
+  std::optional<std::vector<std::string>> filter_columns_names;
+  if (options.get_filter().has_value() and options.get_columns().has_value()) {
+    // list, struct, dictionary are not supported by AST filter yet.
+    // extract columns not present in get_columns() & keep count to remove at end.
+    filter_columns_names =
+      get_column_names_in_expression(options.get_filter(), *(options.get_columns()));
+    _num_filter_only_columns = filter_columns_names->size();
+  }
   std::tie(_input_columns, _output_buffers, _output_column_schemas) =
     _metadata->select_columns(options.get_columns(),
+                              filter_columns_names,
                               options.is_enabled_use_pandas_metadata(),
                               _strings_to_categorical,
                               _timestamp_type.id());
@@ -572,7 +583,12 @@ table_with_metadata reader::impl::finalize_output(
       *read_table, filter.value().get(), _stream, rmm::mr::get_current_device_resource());
     CUDF_EXPECTS(predicate->view().type().id() == type_id::BOOL8,
                  "Predicate filter should return a boolean");
-    auto output_table = cudf::detail::apply_boolean_mask(*read_table, *predicate, _stream, _mr);
+    // Exclude columns present in filter only in output
+    auto counting_it        = thrust::make_counting_iterator<std::size_t>(0);
+    auto const output_count = read_table->num_columns() - _num_filter_only_columns;
+    auto only_output        = read_table->select(counting_it, counting_it + output_count);
+    auto output_table = cudf::detail::apply_boolean_mask(only_output, *predicate, _stream, _mr);
+    if (_num_filter_only_columns > 0) { out_metadata.schema_info.resize(output_count); }
     return {std::move(output_table), std::move(out_metadata)};
   }
   return {std::make_unique<table>(std::move(out_columns)), std::move(out_metadata)};
diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp
index 6c6cedf4e76..b67d2e312d7 100644
--- a/cpp/src/io/parquet/reader_impl.hpp
+++ b/cpp/src/io/parquet/reader_impl.hpp
@@ -368,6 +368,9 @@ class reader::impl {
   // _output_buffers associated metadata
   std::unique_ptr<table_metadata> _output_metadata;
 
+  // number of extra filter columns
+  std::size_t _num_filter_only_columns{0};
+
   bool _strings_to_categorical = false;
 
   // are there usable page indexes available
diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu
index f4fb6bc57e6..6824d72cf04 100644
--- a/cpp/src/io/parquet/reader_impl_chunking.cu
+++ b/cpp/src/io/parquet/reader_impl_chunking.cu
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 
+#include "compact_protocol_reader.hpp"
 #include "io/comp/nvcomp_adapter.hpp"
 #include "io/utilities/config_utils.hpp"
 #include "io/utilities/time_utils.cuh"
diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp
index dfbc8c565ad..eb653c6b9ac 100644
--- a/cpp/src/io/parquet/reader_impl_helpers.cpp
+++ b/cpp/src/io/parquet/reader_impl_helpers.cpp
@@ -16,6 +16,7 @@
 
 #include "reader_impl_helpers.hpp"
 
+#include "compact_protocol_reader.hpp"
 #include "io/parquet/parquet.hpp"
 #include "io/utilities/base64_utilities.hpp"
 #include "io/utilities/row_selection.hpp"
@@ -25,6 +26,7 @@
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/zip_iterator.h>
 
+#include <functional>
 #include <numeric>
 #include <regex>
 
@@ -954,13 +956,15 @@ aggregate_reader_metadata::select_row_groups(
   int64_t skip_rows_opt,
   std::optional<size_type> const& num_rows_opt,
   host_span<data_type const> output_dtypes,
+  host_span<int const> output_column_schemas,
   std::optional<std::reference_wrapper<ast::expression const>> filter,
   rmm::cuda_stream_view stream) const
 {
   std::optional<std::vector<std::vector<size_type>>> filtered_row_group_indices;
+  // if filter is not empty, then gather row groups to read after predicate pushdown
   if (filter.has_value()) {
-    filtered_row_group_indices =
-      filter_row_groups(row_group_indices, output_dtypes, filter.value(), stream);
+    filtered_row_group_indices = filter_row_groups(
+      row_group_indices, output_dtypes, output_column_schemas, filter.value(), stream);
     if (filtered_row_group_indices.has_value()) {
       row_group_indices =
         host_span<std::vector<size_type> const>(filtered_row_group_indices.value());
@@ -1017,10 +1021,12 @@ aggregate_reader_metadata::select_row_groups(
 std::tuple<std::vector<input_column_info>,
            std::vector<cudf::io::detail::inline_column_buffer>,
            std::vector<size_type>>
-aggregate_reader_metadata::select_columns(std::optional<std::vector<std::string>> const& use_names,
-                                          bool include_index,
-                                          bool strings_to_categorical,
-                                          type_id timestamp_type_id) const
+aggregate_reader_metadata::select_columns(
+  std::optional<std::vector<std::string>> const& use_names,
+  std::optional<std::vector<std::string>> const& filter_columns_names,
+  bool include_index,
+  bool strings_to_categorical,
+  type_id timestamp_type_id) const
 {
   auto find_schema_child = [&](SchemaElement const& schema_elem, std::string const& name) {
     auto const& col_schema_idx =
@@ -1184,13 +1190,18 @@ aggregate_reader_metadata::select_columns(std::optional<std::vector<std::string>
 
     // Find which of the selected paths are valid and get their schema index
     std::vector<path_info> valid_selected_paths;
-    for (auto const& selected_path : *use_names) {
-      auto found_path =
-        std::find_if(all_paths.begin(), all_paths.end(), [&](path_info& valid_path) {
-          return valid_path.full_path == selected_path;
-        });
-      if (found_path != all_paths.end()) {
-        valid_selected_paths.push_back({selected_path, found_path->schema_idx});
+    // vector reference pushback (*use_names). If filter names passed.
+    std::vector<std::reference_wrapper<std::vector<std::string> const>> column_names{
+      *use_names, *filter_columns_names};
+    for (auto const& used_column_names : column_names) {
+      for (auto const& selected_path : used_column_names.get()) {
+        auto found_path =
+          std::find_if(all_paths.begin(), all_paths.end(), [&](path_info& valid_path) {
+            return valid_path.full_path == selected_path;
+          });
+        if (found_path != all_paths.end()) {
+          valid_selected_paths.push_back({selected_path, found_path->schema_idx});
+        }
       }
     }
 
diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp
index 398812945e2..9aeb19a7723 100644
--- a/cpp/src/io/parquet/reader_impl_helpers.hpp
+++ b/cpp/src/io/parquet/reader_impl_helpers.hpp
@@ -16,7 +16,6 @@
 
 #pragma once
 
-#include "compact_protocol_reader.hpp"
 #include "parquet_gpu.hpp"
 
 #include <cudf/ast/detail/expression_transformer.hpp>
@@ -25,9 +24,6 @@
 #include <cudf/io/datasource.hpp>
 #include <cudf/types.hpp>
 
-#include <thrust/iterator/counting_iterator.h>
-#include <thrust/iterator/zip_iterator.h>
-
 #include <list>
 #include <tuple>
 #include <vector>
@@ -257,7 +253,8 @@ class aggregate_reader_metadata {
    * @brief Filters the row groups based on predicate filter
    *
    * @param row_group_indices Lists of row groups to read, one per source
-   * @param output_dtypes List of output column datatypes
+   * @param output_dtypes Datatypes of of output columns
+   * @param output_column_schemas schema indices of output columns
    * @param filter AST expression to filter row groups based on Column chunk statistics
    * @param stream CUDA stream used for device memory operations and kernel launches
    * @return Filtered row group indices, if any is filtered.
@@ -265,6 +262,7 @@ class aggregate_reader_metadata {
   [[nodiscard]] std::optional<std::vector<std::vector<size_type>>> filter_row_groups(
     host_span<std::vector<size_type> const> row_group_indices,
     host_span<data_type const> output_dtypes,
+    host_span<int const> output_column_schemas,
     std::reference_wrapper<ast::expression const> filter,
     rmm::cuda_stream_view stream) const;
 
@@ -277,7 +275,8 @@ class aggregate_reader_metadata {
    * @param row_group_indices Lists of row groups to read, one per source
    * @param row_start Starting row of the selection
    * @param row_count Total number of rows selected
-   * @param output_dtypes List of output column datatypes
+   * @param output_dtypes Datatypes of of output columns
+   * @param output_column_schemas schema indices of output columns
    * @param filter Optional AST expression to filter row groups based on Column chunk statistics
    * @param stream CUDA stream used for device memory operations and kernel launches
    * @return A tuple of corrected row_start, row_count and list of row group indexes and its
@@ -288,6 +287,7 @@ class aggregate_reader_metadata {
     int64_t row_start,
     std::optional<size_type> const& row_count,
     host_span<data_type const> output_dtypes,
+    host_span<int const> output_column_schemas,
     std::optional<std::reference_wrapper<ast::expression const>> filter,
     rmm::cuda_stream_view stream) const;
 
@@ -296,6 +296,7 @@ class aggregate_reader_metadata {
    *
    * @param use_names List of paths of column names to select; `nullopt` if user did not select
    * columns to read
+   * @param filter_columns_names List of paths of column names that are present only in filter
    * @param include_index Whether to always include the PANDAS index column(s)
    * @param strings_to_categorical Type conversion parameter
    * @param timestamp_type_id Type conversion parameter
@@ -307,6 +308,7 @@ class aggregate_reader_metadata {
                            std::vector<cudf::io::detail::inline_column_buffer>,
                            std::vector<size_type>>
   select_columns(std::optional<std::vector<std::string>> const& use_names,
+                 std::optional<std::vector<std::string>> const& filter_columns_names,
                  bool include_index,
                  bool strings_to_categorical,
                  type_id timestamp_type_id) const;
@@ -319,23 +321,7 @@ class aggregate_reader_metadata {
 class named_to_reference_converter : public ast::detail::expression_transformer {
  public:
   named_to_reference_converter(std::optional<std::reference_wrapper<ast::expression const>> expr,
-                               table_metadata const& metadata)
-    : metadata(metadata)
-  {
-    if (!expr.has_value()) return;
-    // create map for column name.
-    std::transform(
-      thrust::make_zip_iterator(metadata.schema_info.cbegin(),
-                                thrust::counting_iterator<size_t>(0)),
-      thrust::make_zip_iterator(metadata.schema_info.cend(),
-                                thrust::counting_iterator(metadata.schema_info.size())),
-      std::inserter(column_name_to_index, column_name_to_index.end()),
-      [](auto const& name_index) {
-        return std::make_pair(thrust::get<0>(name_index).name, thrust::get<1>(name_index));
-      });
-
-    expr.value().get().accept(*this);
-  }
+                               table_metadata const& metadata);
 
   /**
    * @copydoc ast::detail::expression_transformer::visit(ast::literal const& )
@@ -370,7 +356,6 @@ class named_to_reference_converter : public ast::detail::expression_transformer
   std::vector<std::reference_wrapper<ast::expression const>> visit_operands(
     std::vector<std::reference_wrapper<ast::expression const>> operands);
 
-  table_metadata const& metadata;
   std::unordered_map<std::string, size_type> column_name_to_index;
   std::optional<std::reference_wrapper<ast::expression const>> _stats_expr;
   // Using std::list or std::deque to avoid reference invalidation
@@ -378,4 +363,15 @@ class named_to_reference_converter : public ast::detail::expression_transformer
   std::list<ast::operation> _operators;
 };
 
+/**
+ * @brief Get the column names in expression object
+ *
+ * @param expr The optional expression object to get the column names from
+ * @param skip_names The names of column names to skip in returned column names
+ * @return The column names present in expression object except the skip_names
+ */
+[[nodiscard]] std::vector<std::string> get_column_names_in_expression(
+  std::optional<std::reference_wrapper<ast::expression const>> expr,
+  std::vector<std::string> const& skip_names);
+
 }  // namespace cudf::io::parquet::detail
diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu
index a5cd7d06536..084f82a2ca0 100644
--- a/cpp/src/io/parquet/reader_impl_preprocess.cu
+++ b/cpp/src/io/parquet/reader_impl_preprocess.cu
@@ -1230,17 +1230,23 @@ void reader::impl::preprocess_file(
   CUDF_EXPECTS(!_file_preprocessed, "Attempted to preprocess file more than once");
 
   // if filter is not empty, then create output types as vector and pass for filtering.
-  std::vector<data_type> output_types;
+  std::vector<data_type> output_dtypes;
   if (filter.has_value()) {
-    std::transform(_output_buffers.cbegin(),
-                   _output_buffers.cend(),
-                   std::back_inserter(output_types),
+    std::transform(_output_buffers_template.cbegin(),
+                   _output_buffers_template.cend(),
+                   std::back_inserter(output_dtypes),
                    [](auto const& col) { return col.type; });
   }
+
   std::tie(
     _file_itm_data.global_skip_rows, _file_itm_data.global_num_rows, _file_itm_data.row_groups) =
-    _metadata->select_row_groups(
-      row_group_indices, skip_rows, num_rows, output_types, filter, _stream);
+    _metadata->select_row_groups(row_group_indices,
+                                 skip_rows,
+                                 num_rows,
+                                 output_dtypes,
+                                 _output_column_schemas,
+                                 filter,
+                                 _stream);
 
   // check for page indexes
   _has_page_index = std::all_of(_file_itm_data.row_groups.begin(),
diff --git a/cpp/tests/io/parquet_reader_test.cpp b/cpp/tests/io/parquet_reader_test.cpp
index 85ada9b38fc..aa9172b0608 100644
--- a/cpp/tests/io/parquet_reader_test.cpp
+++ b/cpp/tests/io/parquet_reader_test.cpp
@@ -1406,6 +1406,56 @@ TEST_F(ParquetReaderTest, FilterIdentity)
   CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *result2.tbl);
 }
 
+TEST_F(ParquetReaderTest, FilterWithColumnProjection)
+{
+  // col_uint32, col_int64, col_double
+  auto [src, filepath] = create_parquet_with_stats("FilterWithColumnProjection.parquet");
+  auto val             = cudf::numeric_scalar<uint32_t>{10};
+  auto lit             = cudf::ast::literal{val};
+  auto col_ref         = cudf::ast::column_name_reference{"col_uint32"};
+  auto col_index       = cudf::ast::column_reference{0};
+  auto filter_expr     = cudf::ast::operation(cudf::ast::ast_operator::LESS, col_index, lit);
+
+  auto predicate = cudf::compute_column(src, filter_expr);
+
+  {  // column_name_reference in parquet filter (not present in column projection)
+    auto read_expr       = cudf::ast::operation(cudf::ast::ast_operator::LESS, col_ref, lit);
+    auto projected_table = cudf::table_view{{src.get_column(2)}};
+    auto expected        = cudf::apply_boolean_mask(projected_table, *predicate);
+
+    auto read_opts = cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath})
+                       .columns({"col_double"})
+                       .filter(read_expr);
+    auto result = cudf::io::read_parquet(read_opts);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *expected);
+  }
+
+  {  // column_reference in parquet filter (indices as per order of column projection)
+    auto col_index2    = cudf::ast::column_reference{1};
+    auto read_ref_expr = cudf::ast::operation(cudf::ast::ast_operator::LESS, col_index2, lit);
+
+    auto projected_table = cudf::table_view{{src.get_column(2), src.get_column(0)}};
+    auto expected        = cudf::apply_boolean_mask(projected_table, *predicate);
+    auto read_opts = cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath})
+                       .columns({"col_double", "col_uint32"})
+                       .filter(read_ref_expr);
+    auto result = cudf::io::read_parquet(read_opts);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, *expected);
+  }
+
+  // Error cases
+  {  // column_reference is not same type as literal, column_reference index is out of bounds
+    for (auto const index : {0, 2}) {
+      auto col_index2    = cudf::ast::column_reference{index};
+      auto read_ref_expr = cudf::ast::operation(cudf::ast::ast_operator::LESS, col_index2, lit);
+      auto read_opts = cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath})
+                         .columns({"col_double", "col_uint32"})
+                         .filter(read_ref_expr);
+      EXPECT_THROW(cudf::io::read_parquet(read_opts), cudf::logic_error);
+    }
+  }
+}
+
 TEST_F(ParquetReaderTest, FilterReferenceExpression)
 {
   auto [src, filepath] = create_parquet_with_stats("FilterReferenceExpression.parquet");

From fcbc1bc8a5d81797c4974ff7559eac44f3854697 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 16 May 2024 07:53:44 -1000
Subject: [PATCH 222/842] Fix id_vars and value_vars not accepting string
 scalars in melt (#15765)

closes #15758

Also fixes an inconsistency with pandas where `var_name` data was always a `Categorical` unlike pandas

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15765
---
 python/cudf/cudf/core/reshape.py       | 27 +++++++++++++-------------
 python/cudf/cudf/tests/test_reshape.py | 27 ++++++++++++++++++--------
 2 files changed, 32 insertions(+), 22 deletions(-)

diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py
index 26d91bed173..0b44ab58f30 100644
--- a/python/cudf/cudf/core/reshape.py
+++ b/python/cudf/cudf/core/reshape.py
@@ -2,10 +2,8 @@
 
 import itertools
 import warnings
-from collections import abc
 from typing import Dict, Optional
 
-import cupy
 import numpy as np
 import pandas as pd
 
@@ -590,7 +588,7 @@ def melt(
 
     # id_vars
     if id_vars is not None:
-        if not isinstance(id_vars, abc.Sequence):
+        if cudf.api.types.is_scalar(id_vars):
             id_vars = [id_vars]
         id_vars = list(id_vars)
         missing = set(id_vars) - set(frame._column_names)
@@ -604,7 +602,7 @@ def melt(
 
     # value_vars
     if value_vars is not None:
-        if not isinstance(value_vars, abc.Sequence):
+        if cudf.api.types.is_scalar(value_vars):
             value_vars = [value_vars]
         value_vars = list(value_vars)
         missing = set(value_vars) - set(frame._column_names)
@@ -658,21 +656,22 @@ def _tile(A, reps):
     # Step 2: add variable
     nval = len(value_vars)
     dtype = min_unsigned_type(nval)
-    temp = cudf.Series(cupy.repeat(cupy.arange(nval, dtype=dtype), N))
 
     if not var_name:
         var_name = "variable"
 
-    mdata[var_name] = cudf.Series(
-        cudf.core.column.build_categorical_column(
-            categories=value_vars,
-            codes=temp._column,
-            mask=temp._column.base_mask,
-            size=temp._column.size,
-            offset=temp._column.offset,
-            ordered=False,
+    if not value_vars:
+        # TODO: Use frame._data.label_dtype when it's more consistently set
+        var_data = cudf.Series(
+            value_vars, dtype=frame._data.to_pandas_index().dtype
         )
-    )
+    else:
+        var_data = (
+            cudf.Series(value_vars)
+            .take(np.repeat(np.arange(nval, dtype=dtype), N))
+            .reset_index(drop=True)
+        )
+    mdata[var_name] = var_data
 
     # Step 3: add values
     mdata[value_name] = cudf.Series._concat(
diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py
index d618669755d..daa1e70808f 100644
--- a/python/cudf/cudf/tests/test_reshape.py
+++ b/python/cudf/cudf/tests/test_reshape.py
@@ -8,7 +8,6 @@
 import pytest
 
 import cudf
-from cudf import melt as cudf_melt
 from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
 from cudf.core.buffer.spill_manager import get_global_manager
 from cudf.testing._utils import (
@@ -71,15 +70,10 @@ def test_melt(nulls, num_id_vars, num_value_vars, num_rows, dtype):
 
     gdf = cudf.from_pandas(pdf)
 
-    got = cudf_melt(frame=gdf, id_vars=id_vars, value_vars=value_vars)
+    got = cudf.melt(frame=gdf, id_vars=id_vars, value_vars=value_vars)
     got_from_melt_method = gdf.melt(id_vars=id_vars, value_vars=value_vars)
 
     expect = pd.melt(frame=pdf, id_vars=id_vars, value_vars=value_vars)
-    # pandas' melt makes the 'variable' column of 'object' type (string)
-    # cuDF's melt makes it Categorical because it doesn't support strings
-    expect["variable"] = expect["variable"].astype(
-        got["variable"].dtype.to_pandas()
-    )
 
     assert_eq(expect, got)
 
@@ -98,11 +92,28 @@ def test_melt_many_columns():
     grid_df_d = cudf.melt(
         df_d, id_vars=["id"], var_name="d", value_name="sales"
     )
-    grid_df_d["d"] = grid_df_d["d"].astype("str")
+    grid_df_d["d"] = grid_df_d["d"]
 
     assert_eq(grid_df, grid_df_d)
 
 
+def test_melt_str_scalar_id_var():
+    data = {"index": [1, 2], "id": [1, 2], "d0": [10, 20], "d1": [30, 40]}
+    result = cudf.melt(
+        cudf.DataFrame(data),
+        id_vars="index",
+        var_name="column",
+        value_name="value",
+    )
+    expected = pd.melt(
+        pd.DataFrame(data),
+        id_vars="index",
+        var_name="column",
+        value_name="value",
+    )
+    assert_eq(result, expected)
+
+
 @pytest.mark.parametrize("num_cols", [1, 2, 10])
 @pytest.mark.parametrize("num_rows", [1, 2, 1000])
 @pytest.mark.parametrize(

From 49af2615ca81e65c991954ed905c4a6151fc88fd Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 16 May 2024 14:55:06 -0400
Subject: [PATCH 223/842] Update strings contains benchmarks to nvbench
 (#15495)

Reference #15405
Updates the benchmarks for `cudf::strings::contains()` to use nvbench and also introduce a hit-test axis.
The logic has been updated to remove the unneeded `fill()` call for long strings.
Also cleaned up code and updated logic to process 4 bytes per warp thread.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/15495
---
 cpp/benchmarks/CMakeLists.txt  |   2 +-
 cpp/benchmarks/string/find.cpp | 109 ++++++++++++++++-----------------
 cpp/src/strings/search/find.cu |  33 +++++-----
 3 files changed, 73 insertions(+), 71 deletions(-)

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index ac4cce02318..4586a12f466 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -301,7 +301,6 @@ ConfigureBench(
   string/copy.cu
   string/factory.cu
   string/filter.cpp
-  string/find.cpp
   string/repeat_strings.cpp
   string/replace.cpp
   string/slice.cpp
@@ -318,6 +317,7 @@ ConfigureNVBench(
   string/copy_range.cpp
   string/count.cpp
   string/extract.cpp
+  string/find.cpp
   string/gather.cpp
   string/join_strings.cpp
   string/lengths.cpp
diff --git a/cpp/benchmarks/string/find.cpp b/cpp/benchmarks/string/find.cpp
index e866092f3a3..a9c620e4bf0 100644
--- a/cpp/benchmarks/string/find.cpp
+++ b/cpp/benchmarks/string/find.cpp
@@ -16,78 +16,75 @@
 
 #include <benchmarks/common/generate_input.hpp>
 #include <benchmarks/fixture/benchmark_fixture.hpp>
-#include <benchmarks/synchronization/synchronization.hpp>
 
 #include <cudf_test/column_wrapper.hpp>
 
+#include <cudf/filling.hpp>
 #include <cudf/scalar/scalar.hpp>
+#include <cudf/strings/combine.hpp>
 #include <cudf/strings/find.hpp>
 #include <cudf/strings/find_multiple.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 
-#include <limits>
+#include <nvbench/nvbench.cuh>
 
-enum FindAPI { find, find_multi, contains, starts_with, ends_with };
+std::unique_ptr<cudf::column> build_input_column(cudf::size_type n_rows,
+                                                 cudf::size_type row_width,
+                                                 int32_t hit_rate);
 
-class StringFindScalar : public cudf::benchmark {};
-
-static void BM_find_scalar(benchmark::State& state, FindAPI find_api)
+static void bench_find_string(nvbench::state& state)
 {
-  cudf::size_type const n_rows{static_cast<cudf::size_type>(state.range(0))};
-  cudf::size_type const max_str_length{static_cast<cudf::size_type>(state.range(1))};
-  data_profile const profile = data_profile_builder().distribution(
-    cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length);
-  auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile);
-  cudf::strings_column_view input(column->view());
-  cudf::string_scalar target("+");
-  cudf::test::strings_column_wrapper targets({"+", "-"});
+  auto const n_rows    = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
+  auto const hit_rate  = static_cast<cudf::size_type>(state.get_int64("hit_rate"));
+  auto const api       = state.get_string("api");
 
-  for (auto _ : state) {
-    cuda_event_timer raii(state, true, cudf::get_default_stream());
-    switch (find_api) {
-      case find: cudf::strings::find(input, target); break;
-      case find_multi:
-        cudf::strings::find_multiple(input, cudf::strings_column_view(targets));
-        break;
-      case contains: cudf::strings::contains(input, target); break;
-      case starts_with: cudf::strings::starts_with(input, target); break;
-      case ends_with: cudf::strings::ends_with(input, target); break;
-    }
+  if (static_cast<std::size_t>(n_rows) * static_cast<std::size_t>(row_width) >=
+      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
+    state.skip("Skip benchmarks greater than size_type limit");
   }
 
-  state.SetBytesProcessed(state.iterations() * input.chars_size(cudf::get_default_stream()));
-}
+  auto const stream = cudf::get_default_stream();
+  auto const col    = build_input_column(n_rows, row_width, hit_rate);
+  auto const input  = cudf::strings_column_view(col->view());
 
-static void generate_bench_args(benchmark::internal::Benchmark* b)
-{
-  int const min_rows   = 1 << 12;
-  int const max_rows   = 1 << 24;
-  int const row_mult   = 8;
-  int const min_rowlen = 1 << 5;
-  int const max_rowlen = 1 << 13;
-  int const len_mult   = 2;
-  for (int row_count = min_rows; row_count <= max_rows; row_count *= row_mult) {
-    for (int rowlen = min_rowlen; rowlen <= max_rowlen; rowlen *= len_mult) {
-      // avoid generating combinations that exceed the cudf column limit
-      size_t total_chars = static_cast<size_t>(row_count) * rowlen;
-      if (total_chars < static_cast<size_t>(std::numeric_limits<cudf::size_type>::max())) {
-        b->Args({row_count, rowlen});
-      }
-    }
+  std::vector<std::string> h_targets({"5W", "5W43", "0987 5W43"});
+  cudf::string_scalar target(h_targets[2]);
+  cudf::test::strings_column_wrapper targets(h_targets.begin(), h_targets.end());
+
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
+  auto const chars_size = input.chars_size(stream);
+  state.add_element_count(chars_size, "chars_size");
+  state.add_global_memory_reads<nvbench::int8_t>(chars_size);
+  if (api.substr(0, 4) == "find") {
+    state.add_global_memory_writes<nvbench::int32_t>(input.size());
+  } else {
+    state.add_global_memory_writes<nvbench::int8_t>(input.size());
   }
-}
 
-#define STRINGS_BENCHMARK_DEFINE(name)                    \
-  BENCHMARK_DEFINE_F(StringFindScalar, name)              \
-  (::benchmark::State & st) { BM_find_scalar(st, name); } \
-  BENCHMARK_REGISTER_F(StringFindScalar, name)            \
-    ->Apply(generate_bench_args)                          \
-    ->UseManualTime()                                     \
-    ->Unit(benchmark::kMillisecond);
+  if (api == "find") {
+    state.exec(nvbench::exec_tag::sync,
+               [&](nvbench::launch& launch) { cudf::strings::find(input, target); });
+  } else if (api == "find_multi") {
+    state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+      cudf::strings::find_multiple(input, cudf::strings_column_view(targets));
+    });
+  } else if (api == "contains") {
+    state.exec(nvbench::exec_tag::sync,
+               [&](nvbench::launch& launch) { cudf::strings::contains(input, target); });
+  } else if (api == "starts_with") {
+    state.exec(nvbench::exec_tag::sync,
+               [&](nvbench::launch& launch) { cudf::strings::starts_with(input, target); });
+  } else if (api == "ends_with") {
+    state.exec(nvbench::exec_tag::sync,
+               [&](nvbench::launch& launch) { cudf::strings::ends_with(input, target); });
+  }
+}
 
-STRINGS_BENCHMARK_DEFINE(find)
-STRINGS_BENCHMARK_DEFINE(find_multi)
-STRINGS_BENCHMARK_DEFINE(contains)
-STRINGS_BENCHMARK_DEFINE(starts_with)
-STRINGS_BENCHMARK_DEFINE(ends_with)
+NVBENCH_BENCH(bench_find_string)
+  .set_name("find_string")
+  .add_string_axis("api", {"find", "find_multi", "contains", "starts_with", "ends_with"})
+  .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024})
+  .add_int64_axis("num_rows", {260'000, 1'953'000, 16'777'216})
+  .add_int64_axis("hit_rate", {20, 80});  // percentage
diff --git a/cpp/src/strings/search/find.cu b/cpp/src/strings/search/find.cu
index bbd98c4e9ff..45eba39f413 100644
--- a/cpp/src/strings/search/find.cu
+++ b/cpp/src/strings/search/find.cu
@@ -361,14 +361,22 @@ CUDF_KERNEL void contains_warp_parallel_fn(column_device_view const d_strings,
   if (d_strings.is_null(str_idx)) { return; }
   // get the string for this warp
   auto const d_str = d_strings.element<string_view>(str_idx);
-  // each thread of the warp will check just part of the string
-  auto found = false;
-  for (auto i = static_cast<size_type>(idx % cudf::detail::warp_size);
+  // each warp processes 4 starting bytes
+  auto constexpr bytes_per_warp = 4;
+  auto found                    = false;
+  for (auto i = lane_idx * bytes_per_warp;
        !found && ((i + d_target.size_bytes()) <= d_str.size_bytes());
-       i += cudf::detail::warp_size) {
+       i += cudf::detail::warp_size * bytes_per_warp) {
     // check the target matches this part of the d_str data
-    if (d_target.compare(d_str.data() + i, d_target.size_bytes()) == 0) { found = true; }
+    // this is definitely faster for very long strings > 128B
+    for (auto j = 0; j < bytes_per_warp; j++) {
+      if (((i + j + d_target.size_bytes()) <= d_str.size_bytes()) &&
+          d_target.compare(d_str.data() + i + j, d_target.size_bytes()) == 0) {
+        found = true;
+      }
+    }
   }
+
   auto const result = warp_reduce(temp_storage).Reduce(found, cub::Max());
   if (lane_idx == 0) { d_results[str_idx] = result; }
 }
@@ -391,12 +399,10 @@ std::unique_ptr<column> contains_warp_parallel(strings_column_view const& input,
 
   // fill the output with `false` unless the `d_target` is empty
   auto results_view = results->mutable_view();
-  thrust::fill(rmm::exec_policy(stream),
-               results_view.begin<bool>(),
-               results_view.end<bool>(),
-               d_target.empty());
-
-  if (!d_target.empty()) {
+  if (d_target.empty()) {
+    thrust::fill(
+      rmm::exec_policy_nosync(stream), results_view.begin<bool>(), results_view.end<bool>(), true);
+  } else {
     // launch warp per string
     auto const d_strings     = column_device_view::create(input.parent(), stream);
     constexpr int block_size = 256;
@@ -461,9 +467,8 @@ std::unique_ptr<column> contains_fn(strings_column_view const& strings,
                     thrust::make_counting_iterator<size_type>(strings_count),
                     d_results,
                     [d_strings, pfn, d_target] __device__(size_type idx) {
-                      if (!d_strings.is_null(idx))
-                        return bool{pfn(d_strings.element<string_view>(idx), d_target)};
-                      return false;
+                      return !d_strings.is_null(idx) &&
+                             bool{pfn(d_strings.element<string_view>(idx), d_target)};
                     });
   results->set_null_count(strings.null_count());
   return results;

From 6d5f9653debe57c7eb52f42fb980d38451a9a460 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Thu, 16 May 2024 13:25:47 -0700
Subject: [PATCH 224/842] Fix `chunked_parquet_reader` behavior when input has
 no more rows to read (#15757)

Fixes #15743

This PR solves two problems.

First, it does not any longer throw a CUDA failure or exception when an invalid (out of bound) chunk is read via `chunked_parquet_reader::read_chunk()` and instead returns an empty chunk.

Second, for empty tables, it returns true for `has_next()` until the first call to `chunked_parquet_reader::read_chunk()`. After that `has_next()` returns false but `chunked_parquet_reader::read_chunk()` keeps returning empty chunks

Authors:
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Karthikeyan (https://github.com/karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/15757
---
 cpp/include/cudf/io/detail/parquet.hpp      |  7 +++
 cpp/src/io/parquet/reader_impl.cpp          | 22 ++++---
 cpp/src/io/parquet/reader_impl.hpp          | 12 ++++
 cpp/tests/io/parquet_chunked_reader_test.cu | 68 +++++++++++++++++++++
 4 files changed, 102 insertions(+), 7 deletions(-)

diff --git a/cpp/include/cudf/io/detail/parquet.hpp b/cpp/include/cudf/io/detail/parquet.hpp
index 55338d422ad..fcf5f0d9290 100644
--- a/cpp/include/cudf/io/detail/parquet.hpp
+++ b/cpp/include/cudf/io/detail/parquet.hpp
@@ -102,6 +102,13 @@ class chunked_reader : private reader {
    *    // Process chunk
    *  } while (reader.has_next());
    *
+   * // Alternatively
+   *
+   *  while (reader.has_next()) {
+   *    auto const chunk = reader.read_chunk();
+   *    // Process chunk
+   *  }
+   *
    * ```
    *
    * If `chunk_read_limit == 0` (i.e., no output limit), and `pass_read_limit == 0` (no input
diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp
index b0d19ad00f3..fba95093c9c 100644
--- a/cpp/src/io/parquet/reader_impl.cpp
+++ b/cpp/src/io/parquet/reader_impl.cpp
@@ -476,8 +476,10 @@ void reader::impl::prepare_data(int64_t skip_rows,
   }
 
   // handle any chunking work (ratcheting through the subpasses and chunks within
-  // our current pass)
-  if (_file_itm_data.num_passes() > 0) { handle_chunking(uses_custom_row_bounds); }
+  // our current pass) if in bounds
+  if (_file_itm_data._current_input_pass < _file_itm_data.num_passes()) {
+    handle_chunking(uses_custom_row_bounds);
+  }
 }
 
 void reader::impl::populate_metadata(table_metadata& out_metadata)
@@ -569,14 +571,16 @@ table_with_metadata reader::impl::finalize_output(
     _output_metadata = std::make_unique<table_metadata>(out_metadata);
   }
 
-  // advance output chunk/subpass/pass info
-  if (_file_itm_data.num_passes() > 0) {
+  // advance output chunk/subpass/pass info for non-empty tables if and only if we are in bounds
+  if (_file_itm_data._current_input_pass < _file_itm_data.num_passes()) {
     auto& pass    = *_pass_itm_data;
     auto& subpass = *pass.subpass;
     subpass.current_output_chunk++;
-    _file_itm_data._output_chunk_count++;
   }
 
+  // increment the output chunk count
+  _file_itm_data._output_chunk_count++;
+
   if (filter.has_value()) {
     auto read_table = std::make_unique<table>(std::move(out_columns));
     auto predicate  = cudf::detail::compute_column(
@@ -616,7 +620,8 @@ table_with_metadata reader::impl::read_chunk()
 {
   // Reset the output buffers to their original states (right after reader construction).
   // Don't need to do it if we read the file all at once.
-  if (_file_itm_data._output_chunk_count > 0) {
+  if (_file_itm_data._current_input_pass < _file_itm_data.num_passes() and
+      not is_first_output_chunk()) {
     _output_buffers.resize(0);
     for (auto const& buff : _output_buffers_template) {
       _output_buffers.emplace_back(cudf::io::detail::inline_column_buffer::empty_like(buff));
@@ -628,6 +633,7 @@ table_with_metadata reader::impl::read_chunk()
                true /*uses_custom_row_bounds*/,
                {} /*row_group_indices, empty means read all row groups*/,
                std::nullopt /*filter*/);
+
   return read_chunk_internal(true, std::nullopt);
 }
 
@@ -641,7 +647,9 @@ bool reader::impl::has_next()
 
   // current_input_pass will only be incremented to be == num_passes after
   // the last chunk in the last subpass in the last pass has been returned
-  return has_more_work();
+  // if not has_more_work then check if this is the first pass in an empty
+  // table and return true so it could be read once.
+  return has_more_work() or is_first_output_chunk();
 }
 
 namespace {
diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp
index b67d2e312d7..04da8eed591 100644
--- a/cpp/src/io/parquet/reader_impl.hpp
+++ b/cpp/src/io/parquet/reader_impl.hpp
@@ -90,6 +90,13 @@ class reader::impl {
    *    // Process chunk
    *  } while (reader.has_next());
    *
+   * // Alternatively
+   *
+   *  while (reader.has_next()) {
+   *    auto const chunk = reader.read_chunk();
+   *    // Process chunk
+   *  }
+   *
    * ```
    *
    * Reading the whole given file at once through `read()` function is still supported if
@@ -347,6 +354,11 @@ class reader::impl {
   }
 
  private:
+  [[nodiscard]] bool is_first_output_chunk() const
+  {
+    return _file_itm_data._output_chunk_count == 0;
+  }
+
   rmm::cuda_stream_view _stream;
   rmm::device_async_resource_ref _mr{rmm::mr::get_current_device_resource()};
 
diff --git a/cpp/tests/io/parquet_chunked_reader_test.cu b/cpp/tests/io/parquet_chunked_reader_test.cu
index b3f3fac5a3d..cff85647725 100644
--- a/cpp/tests/io/parquet_chunked_reader_test.cu
+++ b/cpp/tests/io/parquet_chunked_reader_test.cu
@@ -1409,3 +1409,71 @@ TEST_F(ParquetChunkedReaderInputLimitTest, Mixed)
   constexpr int expected_c[] = {20, 18, 15, 12};
   input_limit_test_read(test_filenames, tbl, 32 * 1024 * 1024, 64 * 1024 * 1024, expected_c);
 }
+
+TEST_F(ParquetChunkedReaderTest, TestChunkedReadOutOfBoundChunks)
+{
+  auto const generate_input = [](int num_rows, bool nullable) {
+    std::vector<std::unique_ptr<cudf::column>> input_columns;
+    auto const value_iter = thrust::make_counting_iterator(0);
+    input_columns.emplace_back(int32s_col(value_iter, value_iter + num_rows).release());
+    input_columns.emplace_back(int64s_col(value_iter, value_iter + num_rows).release());
+
+    auto filename = "chunked_out_of_bounds_" + std::to_string(num_rows);
+
+    return write_file(input_columns, filename, nullable, false);
+  };
+
+  auto const read_chunks_with_while_loop = [](cudf::io::chunked_parquet_reader const& reader) {
+    auto out_tables = std::vector<std::unique_ptr<cudf::table>>{};
+    int num_chunks  = 0;
+    // should always be true
+    EXPECT_EQ(reader.has_next(), true);
+    while (reader.has_next()) {
+      out_tables.emplace_back(reader.read_chunk().tbl);
+      num_chunks++;
+    }
+    auto out_tviews = std::vector<cudf::table_view>{};
+    for (auto const& tbl : out_tables) {
+      out_tviews.emplace_back(tbl->view());
+    }
+
+    return std::pair(cudf::concatenate(out_tviews), num_chunks);
+  };
+
+  // empty table to compare with the out of bound chunks
+  auto const empty_table = generate_input(0, false).first;
+
+  {
+    auto constexpr num_rows          = 0;
+    auto const [expected, filepath]  = generate_input(num_rows, false);
+    auto constexpr output_read_limit = 1'000;
+    auto const options =
+      cudf::io::parquet_reader_options_builder(cudf::io::source_info{filepath}).build();
+    auto const reader =
+      cudf::io::chunked_parquet_reader(output_read_limit, 0, options, cudf::get_default_stream());
+    auto const [result, num_chunks]     = read_chunks_with_while_loop(reader);
+    auto const out_of_bound_table_chunk = reader.read_chunk().tbl;
+
+    EXPECT_EQ(num_chunks, 1);
+    EXPECT_EQ(reader.has_next(), false);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*out_of_bound_table_chunk, *empty_table);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result);
+  }
+
+  {
+    auto constexpr num_rows          = 40'000;
+    auto constexpr output_read_limit = 240'000;
+    auto const [expected, filepath]  = generate_input(num_rows, false);
+    auto const options =
+      cudf::io::parquet_reader_options_builder(cudf::io::source_info{filepath}).build();
+    auto const reader =
+      cudf::io::chunked_parquet_reader(output_read_limit, 0, options, cudf::get_default_stream());
+    auto const [result, num_chunks]     = read_chunks_with_while_loop(reader);
+    auto const out_of_bound_table_chunk = reader.read_chunk().tbl;
+
+    EXPECT_EQ(num_chunks, 2);
+    EXPECT_EQ(reader.has_next(), false);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*out_of_bound_table_chunk, *empty_table);
+    CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result);
+  }
+}

From d10b8e4c9b437377cb6d231873e8f0fe9f8dc817 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Fri, 17 May 2024 11:55:21 -0500
Subject: [PATCH 225/842] Handle mixed-like homogeneous types in `isin`
 (#15771)

Fixes: #15768

There is a possibility that a host array can have `object` type but contain all values of a homogeneous type, this still cannot be supported by column constructors because `cudf` doesn't have a true `object` types, hence this PR introduces a workaround for this problem in `isin` API.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Richard (Rick) Zamora (https://github.com/rjzamora)
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/15771
---
 python/cudf/cudf/core/column/numerical.py     | 29 ++++++++++++++++---
 python/dask_cudf/dask_cudf/tests/test_core.py | 22 ++++++++++++++
 2 files changed, 47 insertions(+), 4 deletions(-)

diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index 12c27ed0bc1..bab862f775f 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -38,6 +38,7 @@
 )
 from cudf.core.dtypes import CategoricalDtype
 from cudf.core.mixins import BinaryOperand
+from cudf.errors import MixedTypeError
 from cudf.utils.dtypes import (
     min_column_type,
     min_signed_type,
@@ -404,10 +405,30 @@ def _process_values_for_isin(
         self, values: Sequence
     ) -> Tuple[ColumnBase, ColumnBase]:
         lhs = cast("cudf.core.column.ColumnBase", self)
-        rhs = as_column(values, nan_as_null=False)
-
-        if isinstance(rhs, NumericalColumn):
-            rhs = rhs.astype(dtype=self.dtype)
+        try:
+            rhs = as_column(values, nan_as_null=False)
+        except (MixedTypeError, TypeError) as e:
+            # There is a corner where `values` can be of `object` dtype
+            # but have values of homogeneous type.
+            inferred_dtype = cudf.api.types.infer_dtype(values)
+            if (
+                self.dtype.kind in {"i", "u"} and inferred_dtype == "integer"
+            ) or (
+                self.dtype.kind == "f"
+                and inferred_dtype in {"floating", "integer"}
+            ):
+                rhs = as_column(values, nan_as_null=False, dtype=self.dtype)
+            elif self.dtype.kind == "f" and inferred_dtype == "integer":
+                rhs = as_column(values, nan_as_null=False, dtype="int")
+            elif (
+                self.dtype.kind in {"i", "u"} and inferred_dtype == "floating"
+            ):
+                rhs = as_column(values, nan_as_null=False, dtype="float")
+            else:
+                raise e
+        else:
+            if isinstance(rhs, NumericalColumn):
+                rhs = rhs.astype(dtype=self.dtype)
 
         if lhs.null_count == len(lhs):
             lhs = lhs.astype(rhs.dtype)
diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py
index 981c2c369f1..18a9e3b496f 100644
--- a/python/dask_cudf/dask_cudf/tests/test_core.py
+++ b/python/dask_cudf/dask_cudf/tests/test_core.py
@@ -971,3 +971,25 @@ def func(x):
     # NOTE: The calculation here doesn't need to make sense.
     # We just need to make sure we get the right type back.
     assert type(result) == type(expect)
+
+
+@pytest.mark.parametrize("data", [[1, 2, 3], [1.1, 2.3, 4.5]])
+@pytest.mark.parametrize("values", [[1, 5], [1.1, 2.4, 2.3]])
+def test_series_isin(data, values):
+    ser = cudf.Series(data)
+    pddf = dd.from_pandas(ser.to_pandas(), 1)
+    ddf = dask_cudf.from_cudf(ser, 1)
+
+    actual = ddf.isin(values)
+    expected = pddf.isin(values)
+
+    dd.assert_eq(actual, expected)
+
+
+def test_series_isin_error():
+    ser = cudf.Series([1, 2, 3])
+    ddf = dask_cudf.from_cudf(ser, 1)
+    with pytest.raises(TypeError):
+        ser.isin([1, 5, "a"])
+    with pytest.raises(TypeError):
+        ddf.isin([1, 5, "a"]).compute()

From e6e67615c248d4992d0bf2ce5a47b09534cd4c82 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <3190405+shwina@users.noreply.github.com>
Date: Fri, 17 May 2024 19:52:26 -0400
Subject: [PATCH 226/842] Eagerly populate the class dict for cudf.pandas proxy
 types (#14534)

Rather than dynamically looking up class attributes (and methods), this PR makes it so that we eagerly populate the class with all known methods and attributes (by inspecting the "slow" class).

This solves a number of problems:

- it makes `getattr` trivially inexpensive (no dynamic `__getattr__` for each attribute access)
- it ensures the _same_ object is returned every time you do, e.g., `DataFrame.max`
- it makes tab completion fast because the attributes don't have to be computed each time
- it no longer exposes attributes that are specific to cuDF - for example `Series.list`
- it allows subclassing of proxy types to work better. For example, derived types can now call `super().` to access attributes of base types

Authors:
  - Ashwin Srinath (https://github.com/shwina)
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Matthew Roeschke (https://github.com/mroeschke)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/14534
---
 docs/cudf/source/cudf_pandas/faq.md           |  12 -
 python/cudf/cudf/pandas/_wrappers/common.py   |   8 +-
 python/cudf/cudf/pandas/_wrappers/numpy.py    |   4 +-
 python/cudf/cudf/pandas/_wrappers/pandas.py   | 117 ++++++-
 python/cudf/cudf/pandas/fast_slow_proxy.py    | 297 +++++++++++-------
 python/cudf/cudf/pandas/profiler.py           |   7 +-
 .../cudf/pandas/scripts/run-pandas-tests.sh   |   2 +-
 .../cudf_pandas_tests/test_cudf_pandas.py     |  32 ++
 .../cudf/cudf_pandas_tests/test_profiler.py   |   4 +-
 9 files changed, 326 insertions(+), 157 deletions(-)

diff --git a/docs/cudf/source/cudf_pandas/faq.md b/docs/cudf/source/cudf_pandas/faq.md
index dde7afb1360..55976740105 100644
--- a/docs/cudf/source/cudf_pandas/faq.md
+++ b/docs/cudf/source/cudf_pandas/faq.md
@@ -151,15 +151,3 @@ for testing or benchmarking purposes. To do so, set the
 ```bash
 CUDF_PANDAS_FALLBACK_MODE=1 python -m cudf.pandas some_script.py
 ```
-
-## Slow tab completion in IPython?
-
-You may experience slow tab completion when inspecting the
-methods/attributes of large dataframes. We expect this issue to be
-resolved in an upcoming release. In the mean time, you may execute the
-following command in IPython before loading `cudf.pandas` to work
-around the issue:
-
-```
-%config IPCompleter.jedi_compute_type_timeout=0
-```
diff --git a/python/cudf/cudf/pandas/_wrappers/common.py b/python/cudf/cudf/pandas/_wrappers/common.py
index 1669882631b..468c5687c15 100644
--- a/python/cudf/cudf/pandas/_wrappers/common.py
+++ b/python/cudf/cudf/pandas/_wrappers/common.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES.
 # All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
@@ -17,9 +17,9 @@ def array_method(self: _FastSlowProxy, *args, **kwargs):
 
 def array_function_method(self, func, types, args, kwargs):
     try:
-        return _FastSlowAttribute("__array_function__").__get__(self)(
-            func, types, args, kwargs
-        )
+        return _FastSlowAttribute("__array_function__").__get__(
+            self, type(self)
+        )(func, types, args, kwargs)
     except Exception:
         # if something went wrong with __array_function__ we
         # attempt to call the function directly on the slow
diff --git a/python/cudf/cudf/pandas/_wrappers/numpy.py b/python/cudf/cudf/pandas/_wrappers/numpy.py
index 9955550ef90..94298872213 100644
--- a/python/cudf/cudf/pandas/_wrappers/numpy.py
+++ b/python/cudf/cudf/pandas/_wrappers/numpy.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES.
 # All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
@@ -10,6 +10,7 @@
 import numpy.core.multiarray
 
 from ..fast_slow_proxy import (
+    _FastSlowAttribute,
     make_final_proxy_type,
     make_intermediate_proxy_type,
 )
@@ -122,6 +123,7 @@ def wrap_ndarray(cls, arr: cupy.ndarray | numpy.ndarray, constructor):
         "__iter__": custom_iter,
         # Special wrapping to handle scalar values
         "_fsproxy_wrap": classmethod(wrap_ndarray),
+        "base": _FastSlowAttribute("base", private=True),
     },
 )
 
diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py
index de92cce8ebb..29aaaac245d 100644
--- a/python/cudf/cudf/pandas/_wrappers/pandas.py
+++ b/python/cudf/cudf/pandas/_wrappers/pandas.py
@@ -107,14 +107,16 @@ class _AccessorAttr:
     """
 
     def __init__(self, typ):
-        self.__typ = typ
+        self._typ = typ
+
+    def __set_name__(self, owner, name):
+        self._name = name
 
     def __get__(self, obj, cls=None):
         if obj is None:
-            return self.__typ
+            return self._typ
         else:
-            # allow __getattr__ to handle this
-            raise AttributeError()
+            return _FastSlowAttribute(self._name).__get__(obj, type(obj))
 
 
 def Timestamp_Timedelta__new__(cls, *args, **kwargs):
@@ -214,6 +216,7 @@ def _DataFrame__dir__(self):
         "__dir__": _DataFrame__dir__,
         "_constructor": _FastSlowAttribute("_constructor"),
         "_constructor_sliced": _FastSlowAttribute("_constructor_sliced"),
+        "_accessors": set(),
     },
 )
 
@@ -236,6 +239,7 @@ def _DataFrame__dir__(self):
         "cat": _AccessorAttr(_CategoricalAccessor),
         "_constructor": _FastSlowAttribute("_constructor"),
         "_constructor_expanddim": _FastSlowAttribute("_constructor_expanddim"),
+        "_accessors": set(),
     },
 )
 
@@ -273,6 +277,9 @@ def Index__new__(cls, *args, **kwargs):
         "__new__": Index__new__,
         "_constructor": _FastSlowAttribute("_constructor"),
         "__array_ufunc__": _FastSlowAttribute("__array_ufunc__"),
+        "_accessors": set(),
+        "_data": _FastSlowAttribute("_data", private=True),
+        "_mask": _FastSlowAttribute("_mask", private=True),
     },
 )
 
@@ -337,7 +344,11 @@ def Index__new__(cls, *args, **kwargs):
     fast_to_slow=lambda fast: fast.to_pandas(),
     slow_to_fast=cudf.from_pandas,
     bases=(Index,),
-    additional_attributes={"__init__": _DELETE},
+    additional_attributes={
+        "__init__": _DELETE,
+        "_data": _FastSlowAttribute("_data", private=True),
+        "_mask": _FastSlowAttribute("_mask", private=True),
+    },
 )
 
 DatetimeArray = make_final_proxy_type(
@@ -346,6 +357,10 @@ def Index__new__(cls, *args, **kwargs):
     pd.arrays.DatetimeArray,
     fast_to_slow=_Unusable(),
     slow_to_fast=_Unusable(),
+    additional_attributes={
+        "_data": _FastSlowAttribute("_data", private=True),
+        "_mask": _FastSlowAttribute("_mask", private=True),
+    },
 )
 
 DatetimeTZDtype = make_final_proxy_type(
@@ -364,7 +379,11 @@ def Index__new__(cls, *args, **kwargs):
     fast_to_slow=lambda fast: fast.to_pandas(),
     slow_to_fast=cudf.from_pandas,
     bases=(Index,),
-    additional_attributes={"__init__": _DELETE},
+    additional_attributes={
+        "__init__": _DELETE,
+        "_data": _FastSlowAttribute("_data", private=True),
+        "_mask": _FastSlowAttribute("_mask", private=True),
+    },
 )
 
 NumpyExtensionArray = make_final_proxy_type(
@@ -385,6 +404,10 @@ def Index__new__(cls, *args, **kwargs):
     pd.arrays.TimedeltaArray,
     fast_to_slow=_Unusable(),
     slow_to_fast=_Unusable(),
+    additional_attributes={
+        "_data": _FastSlowAttribute("_data", private=True),
+        "_mask": _FastSlowAttribute("_mask", private=True),
+    },
 )
 
 PeriodIndex = make_final_proxy_type(
@@ -394,7 +417,11 @@ def Index__new__(cls, *args, **kwargs):
     fast_to_slow=_Unusable(),
     slow_to_fast=_Unusable(),
     bases=(Index,),
-    additional_attributes={"__init__": _DELETE},
+    additional_attributes={
+        "__init__": _DELETE,
+        "_data": _FastSlowAttribute("_data", private=True),
+        "_mask": _FastSlowAttribute("_mask", private=True),
+    },
 )
 
 PeriodArray = make_final_proxy_type(
@@ -403,6 +430,11 @@ def Index__new__(cls, *args, **kwargs):
     pd.arrays.PeriodArray,
     fast_to_slow=_Unusable(),
     slow_to_fast=_Unusable(),
+    additional_attributes={
+        "_data": _FastSlowAttribute("_data", private=True),
+        "_mask": _FastSlowAttribute("_mask", private=True),
+        "__array_ufunc__": _FastSlowAttribute("__array_ufunc__"),
+    },
 )
 
 PeriodDtype = make_final_proxy_type(
@@ -464,6 +496,10 @@ def Index__new__(cls, *args, **kwargs):
     pd.arrays.StringArray,
     fast_to_slow=_Unusable(),
     slow_to_fast=_Unusable(),
+    additional_attributes={
+        "_data": _FastSlowAttribute("_data", private=True),
+        "_mask": _FastSlowAttribute("_mask", private=True),
+    },
 )
 
 StringDtype = make_final_proxy_type(
@@ -472,7 +508,10 @@ def Index__new__(cls, *args, **kwargs):
     pd.StringDtype,
     fast_to_slow=_Unusable(),
     slow_to_fast=_Unusable(),
-    additional_attributes={"__hash__": _FastSlowAttribute("__hash__")},
+    additional_attributes={
+        "__hash__": _FastSlowAttribute("__hash__"),
+        "storage": _FastSlowAttribute("storage"),
+    },
 )
 
 BooleanArray = make_final_proxy_type(
@@ -482,7 +521,9 @@ def Index__new__(cls, *args, **kwargs):
     fast_to_slow=_Unusable(),
     slow_to_fast=_Unusable(),
     additional_attributes={
-        "__array_ufunc__": _FastSlowAttribute("__array_ufunc__")
+        "_data": _FastSlowAttribute("_data", private=True),
+        "_mask": _FastSlowAttribute("_mask", private=True),
+        "__array_ufunc__": _FastSlowAttribute("__array_ufunc__"),
     },
 )
 
@@ -502,7 +543,9 @@ def Index__new__(cls, *args, **kwargs):
     fast_to_slow=_Unusable(),
     slow_to_fast=_Unusable(),
     additional_attributes={
-        "__array_ufunc__": _FastSlowAttribute("__array_ufunc__")
+        "__array_ufunc__": _FastSlowAttribute("__array_ufunc__"),
+        "_data": _FastSlowAttribute("_data", private=True),
+        "_mask": _FastSlowAttribute("_mask", private=True),
     },
 )
 
@@ -586,7 +629,11 @@ def Index__new__(cls, *args, **kwargs):
     fast_to_slow=lambda fast: fast.to_pandas(),
     slow_to_fast=cudf.from_pandas,
     bases=(Index,),
-    additional_attributes={"__init__": _DELETE},
+    additional_attributes={
+        "__init__": _DELETE,
+        "_data": _FastSlowAttribute("_data", private=True),
+        "_mask": _FastSlowAttribute("_mask", private=True),
+    },
 )
 
 IntervalArray = make_final_proxy_type(
@@ -595,6 +642,10 @@ def Index__new__(cls, *args, **kwargs):
     pd.arrays.IntervalArray,
     fast_to_slow=_Unusable(),
     slow_to_fast=_Unusable(),
+    additional_attributes={
+        "_data": _FastSlowAttribute("_data", private=True),
+        "_mask": _FastSlowAttribute("_mask", private=True),
+    },
 )
 
 IntervalDtype = make_final_proxy_type(
@@ -622,7 +673,9 @@ def Index__new__(cls, *args, **kwargs):
     fast_to_slow=_Unusable(),
     slow_to_fast=_Unusable(),
     additional_attributes={
-        "__array_ufunc__": _FastSlowAttribute("__array_ufunc__")
+        "__array_ufunc__": _FastSlowAttribute("__array_ufunc__"),
+        "_data": _FastSlowAttribute("_data", private=True),
+        "_mask": _FastSlowAttribute("_mask", private=True),
     },
 )
 
@@ -798,6 +851,14 @@ def Index__new__(cls, *args, **kwargs):
         pd_Styler,
         fast_to_slow=_Unusable(),
         slow_to_fast=_Unusable(),
+        additional_attributes={
+            "css": _FastSlowAttribute("css"),
+            "ctx": _FastSlowAttribute("ctx"),
+            "index": _FastSlowAttribute("ctx"),
+            "data": _FastSlowAttribute("data"),
+            "_display_funcs": _FastSlowAttribute("_display_funcs"),
+            "table_styles": _FastSlowAttribute("table_styles"),
+        },
     )
 except ImportError:
     # Styler requires Jinja to be installed
@@ -813,7 +874,7 @@ def _get_eval_locals_and_globals(level, local_dict=None, global_dict=None):
     return local_dict, global_dict
 
 
-@register_proxy_func(pd.eval)
+@register_proxy_func(pd.core.computation.eval.eval)
 @nvtx.annotate(
     "CUDF_PANDAS_EVAL",
     color=_CUDF_PANDAS_NVTX_COLORS["EXECUTE_SLOW"],
@@ -843,6 +904,24 @@ def _eval(
     )
 
 
+_orig_df_eval_method = DataFrame.eval
+
+
+@register_proxy_func(pd.core.accessor.register_dataframe_accessor)
+def _register_dataframe_accessor(name):
+    return pd.core.accessor._register_accessor(name, DataFrame)
+
+
+@register_proxy_func(pd.core.accessor.register_series_accessor)
+def _register_series_accessor(name):
+    return pd.core.accessor._register_accessor(name, Series)
+
+
+@register_proxy_func(pd.core.accessor.register_index_accessor)
+def _register_index_accessor(name):
+    return pd.core.accessor._register_accessor(name, Index)
+
+
 @nvtx.annotate(
     "CUDF_PANDAS_DATAFRAME_EVAL",
     color=_CUDF_PANDAS_NVTX_COLORS["EXECUTE_SLOW"],
@@ -853,11 +932,14 @@ def _df_eval_method(self, *args, local_dict=None, global_dict=None, **kwargs):
     local_dict, global_dict = _get_eval_locals_and_globals(
         level, local_dict, global_dict
     )
-    return super(type(self), self).__getattr__("eval")(
-        *args, local_dict=local_dict, global_dict=global_dict, **kwargs
+    return _orig_df_eval_method(
+        self, *args, local_dict=local_dict, global_dict=global_dict, **kwargs
     )
 
 
+_orig_query_eval_method = DataFrame.query
+
+
 @nvtx.annotate(
     "CUDF_PANDAS_DATAFRAME_QUERY",
     color=_CUDF_PANDAS_NVTX_COLORS["EXECUTE_SLOW"],
@@ -870,8 +952,8 @@ def _df_query_method(self, *args, local_dict=None, global_dict=None, **kwargs):
     local_dict, global_dict = _get_eval_locals_and_globals(
         level, local_dict, global_dict
     )
-    return super(type(self), self).__getattr__("query")(
-        *args, local_dict=local_dict, global_dict=global_dict, **kwargs
+    return _orig_query_eval_method(
+        self, *args, local_dict=local_dict, global_dict=global_dict, **kwargs
     )
 
 
@@ -1277,6 +1359,7 @@ def holiday_calendar_factory_wrapper(*args, **kwargs):
     additional_attributes={"__hash__": _FastSlowAttribute("__hash__")},
 )
 
+
 MonthBegin = make_final_proxy_type(
     "MonthBegin",
     _Unusable,
diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py
index e5c86d2318e..94caec1ce6c 100644
--- a/python/cudf/cudf/pandas/fast_slow_proxy.py
+++ b/python/cudf/cudf/pandas/fast_slow_proxy.py
@@ -83,6 +83,9 @@ def __getattribute__(self, name: str) -> Any:
             return super().__getattribute__(name)
         raise TypeError("Unusable type. Falling back to the slow object")
 
+    def __repr__(self) -> str:
+        raise AttributeError("Unusable type. Falling back to the slow object")
+
 
 class _PickleConstructor:
     """A pickleable object to support construction in __reduce__.
@@ -231,6 +234,13 @@ def _fsproxy_state(self) -> _State:
         elif v is not _DELETE:
             cls_dict[k] = v
 
+    for slow_name in dir(slow_type):
+        if slow_name in cls_dict or slow_name.startswith("__"):
+            continue
+        else:
+            cls_dict[slow_name] = _FastSlowAttribute(
+                slow_name, private=slow_name.startswith("_")
+            )
     if meta_class is None:
         meta_class = _FastSlowProxyMeta
     else:
@@ -329,11 +339,26 @@ def _fsproxy_fast_to_slow(self):
         "_fsproxy_fast_to_slow": _fsproxy_fast_to_slow,
         "_fsproxy_state": _fsproxy_state,
     }
-
     for method in _SPECIAL_METHODS:
         if getattr(slow_type, method, False):
             cls_dict[method] = _FastSlowAttribute(method)
 
+    for slow_name in dir(slow_type):
+        if slow_name in cls_dict or slow_name.startswith("__"):
+            continue
+        else:
+            cls_dict[slow_name] = _FastSlowAttribute(
+                slow_name, private=slow_name.startswith("_")
+            )
+
+    for slow_name in getattr(slow_type, "_attributes", []):
+        if slow_name in cls_dict:
+            continue
+        else:
+            cls_dict[slow_name] = _FastSlowAttribute(
+                slow_name, private=slow_name.startswith("_")
+            )
+
     cls = types.new_class(
         name,
         (_IntermediateProxy,),
@@ -411,62 +436,16 @@ def _raise_attribute_error(obj, name):
     raise AttributeError(f"'{obj}' object has no attribute '{name}'")
 
 
-class _FastSlowAttribute:
-    """
-    A descriptor type used to define attributes of fast-slow proxies.
-    """
-
-    def __init__(self, name: str):
-        self._name = name
-
-    def __get__(self, obj, owner=None) -> Any:
-        if obj is None:
-            # class attribute
-            obj = owner
-
-        if not (
-            isinstance(obj, _FastSlowProxy)
-            or issubclass(type(obj), _FastSlowProxyMeta)
-        ):
-            # we only want to look up attributes on the underlying
-            # fast/slow objects for instances of _FastSlowProxy or
-            # subtypes of _FastSlowProxyMeta:
-            _raise_attribute_error(owner if owner else obj, self._name)
-
-        result, _ = _fast_slow_function_call(getattr, obj, self._name)
-
-        if isinstance(result, functools.cached_property):
-            # TODO: temporary workaround until dask is able
-            # to correctly inspect cached_property objects.
-            # GH: 264
-            result = property(result.func)
-
-        if isinstance(result, (_MethodProxy, property)):
-            from .module_accelerator import disable_module_accelerator
-
-            type_ = owner if owner else type(obj)
-            slow_result_type = getattr(type_._fsproxy_slow, self._name)
-            with disable_module_accelerator():
-                result.__doc__ = inspect.getdoc(  # type: ignore
-                    slow_result_type
-                )
-
-            if isinstance(result, _MethodProxy):
-                # Note that this will produce the wrong result for bound
-                # methods because dir for the method won't be the same as for
-                # the pure unbound function, but the alternative is
-                # materializing the slow object when we don't really want to.
-                result._fsproxy_slow_dir = dir(slow_result_type)  # type: ignore
-
-        return result
-
-
 class _FastSlowProxyMeta(type):
     """
     Metaclass used to dynamically find class attributes and
     classmethods of fast-slow proxy types.
     """
 
+    _fsproxy_slow_dir: list
+    _fsproxy_slow_type: type
+    _fsproxy_fast_type: type
+
     @property
     def _fsproxy_slow(self) -> type:
         return self._fsproxy_slow_type
@@ -483,15 +462,6 @@ def __dir__(self):
         except AttributeError:
             return type.__dir__(self)
 
-    def __getattr__(self, name: str) -> Any:
-        if name.startswith("_fsproxy") or name.startswith("__"):
-            # an AttributeError was raised when trying to evaluate
-            # an internal attribute, we just need to propagate this
-            _raise_attribute_error(self.__class__.__name__, name)
-
-        attr = _FastSlowAttribute(name)
-        return attr.__get__(None, owner=self)
-
     def __subclasscheck__(self, __subclass: type) -> bool:
         if super().__subclasscheck__(__subclass):
             return True
@@ -565,56 +535,13 @@ def __dir__(self):
         except AttributeError:
             return object.__dir__(self)
 
-    def __getattr__(self, name: str) -> Any:
-        if name.startswith("_fsproxy"):
-            # an AttributeError was raised when trying to evaluate
-            # an internal attribute, we just need to propagate this
-            _raise_attribute_error(self.__class__.__name__, name)
-        if name in {
-            "_ipython_canary_method_should_not_exist_",
-            "_ipython_display_",
-            "_repr_mimebundle_",
-            # Workaround for https://github.com/numpy/numpy/issues/5350
-            # see GH:216 for details
-            "__array_struct__",
-        }:
-            # IPython always looks for these names in its display
-            # logic. See #GH:70 and #GH:172 for more details but the
-            # gist is that not raising an AttributeError immediately
-            # results in slow display in IPython (since the fast
-            # object will be copied to the slow one to look for
-            # attributes there which then also won't exist).
-            # This is somewhat delicate to the order in which IPython
-            # implements special display fallbacks.
-            _raise_attribute_error(self.__class__.__name__, name)
-        if name.startswith("_"):
-            # private attributes always come from `._fsproxy_slow`:
-            obj = getattr(self._fsproxy_slow, name)
-            if name.startswith("__array"):
-                # TODO: numpy methods raise when given proxy ndarray objects
-                # https://numpy.org/doc/stable/reference/arrays.classes.html#special-attributes-and-methods  # noqa:E501
-                return obj
-
-            if not _is_function_or_method(obj):
-                return _maybe_wrap_result(
-                    obj, getattr, self._fsproxy_slow, name
-                )
-
-            @functools.wraps(obj)
-            def _wrapped_private_slow(*args, **kwargs):
-                slow_args, slow_kwargs = _slow_arg(args), _slow_arg(kwargs)
-                result = obj(*slow_args, **slow_kwargs)
-                return _maybe_wrap_result(result, obj, *args, **kwargs)
-
-            return _wrapped_private_slow
-        attr = _FastSlowAttribute(name)
-        return attr.__get__(self)
-
     def __setattr__(self, name, value):
         if name.startswith("_"):
             object.__setattr__(self, name, value)
             return
-        return _FastSlowAttribute("__setattr__").__get__(self)(name, value)
+        return _FastSlowAttribute("__setattr__").__get__(self, type(self))(
+            name, value
+        )
 
 
 class _FinalProxy(_FastSlowProxy):
@@ -790,17 +717,162 @@ class _FunctionProxy(_CallableProxyMixin):
 
     __name__: str
 
-    def __init__(self, fast: Callable | _Unusable, slow: Callable):
+    def __init__(
+        self,
+        fast: Callable | _Unusable,
+        slow: Callable,
+        *,
+        assigned=None,
+        updated=None,
+    ):
         self._fsproxy_fast = fast
         self._fsproxy_slow = slow
-        functools.update_wrapper(self, slow)
+        if assigned is None:
+            assigned = functools.WRAPPER_ASSIGNMENTS
+        if updated is None:
+            updated = functools.WRAPPER_UPDATES
+        functools.update_wrapper(
+            self,
+            slow,
+            assigned=assigned,
+            updated=updated,
+        )
 
+    def __reduce__(self):
+        """
+        In conjunction with `__proxy_setstate__`, this effectively enables
+        proxy types to be pickled and unpickled by pickling and unpickling
+        the underlying wrapped types.
+        """
+        # Need a local import to avoid circular import issues
+        from .module_accelerator import disable_module_accelerator
+
+        with disable_module_accelerator():
+            pickled_fast = pickle.dumps(self._fsproxy_fast)
+            pickled_slow = pickle.dumps(self._fsproxy_slow)
+        return (
+            _PickleConstructor(type(self)),
+            (),
+            (pickled_fast, pickled_slow),
+        )
 
-class _MethodProxy(_CallableProxyMixin, _IntermediateProxy):
+    def __setstate__(self, state):
+        # Need a local import to avoid circular import issues
+        from .module_accelerator import disable_module_accelerator
+
+        with disable_module_accelerator():
+            unpickled_fast = pickle.loads(state[0])
+            unpickled_slow = pickle.loads(state[1])
+        self._fsproxy_fast = unpickled_fast
+        self._fsproxy_slow = unpickled_slow
+
+
+def is_bound_method(obj):
+    return inspect.ismethod(obj) and not inspect.isfunction(obj)
+
+
+def is_function(obj):
+    return inspect.isfunction(obj) or isinstance(obj, types.FunctionType)
+
+
+class _FastSlowAttribute:
     """
-    Methods of fast-slow proxies are of type _MethodProxy.
+    A descriptor type used to define attributes of fast-slow proxies.
     """
 
+    _attr: Any
+
+    def __init__(self, name: str, *, private: bool = False):
+        self._name = name
+        self._private = private
+        self._attr = None
+        self._doc = None
+        self._dir = None
+
+    def __get__(self, instance, owner) -> Any:
+        from .module_accelerator import disable_module_accelerator
+
+        if self._attr is None:
+            if self._private:
+                fast_attr = _Unusable()
+            else:
+                fast_attr = getattr(
+                    owner._fsproxy_fast, self._name, _Unusable()
+                )
+
+            try:
+                slow_attr = getattr(owner._fsproxy_slow, self._name)
+            except AttributeError as e:
+                if instance is not None:
+                    return _maybe_wrap_result(
+                        getattr(instance._fsproxy_slow, self._name),
+                        None,  # type: ignore
+                    )
+                else:
+                    raise e
+
+            if _is_function_or_method(slow_attr):
+                self._attr = _MethodProxy(fast_attr, slow_attr)
+            else:
+                # for anything else, use a fast-slow attribute:
+                self._attr, _ = _fast_slow_function_call(
+                    getattr, owner, self._name
+                )
+
+                if isinstance(
+                    self._attr, (property, functools.cached_property)
+                ):
+                    with disable_module_accelerator():
+                        self._attr.__doc__ = inspect.getdoc(slow_attr)
+
+        if instance is not None:
+            if isinstance(self._attr, _MethodProxy):
+                if is_bound_method(self._attr._fsproxy_slow):
+                    return self._attr
+                else:
+                    return types.MethodType(self._attr, instance)
+            else:
+                if self._private:
+                    return _maybe_wrap_result(
+                        getattr(instance._fsproxy_slow, self._name),
+                        None,  # type: ignore
+                    )
+                return _fast_slow_function_call(getattr, instance, self._name)[
+                    0
+                ]
+        return self._attr
+
+
+class _MethodProxy(_FunctionProxy):
+    def __init__(self, fast, slow):
+        super().__init__(
+            fast,
+            slow,
+            updated=functools.WRAPPER_UPDATES,
+            assigned=(
+                tuple(filter(lambda x: x != "__name__", _WRAPPER_ASSIGNMENTS))
+            ),
+        )
+
+    def __dir__(self):
+        return self._fsproxy_slow.__dir__()
+
+    @property
+    def __doc__(self):
+        return self._fsproxy_slow.__doc__
+
+    @property
+    def __name__(self):
+        return self._fsproxy_slow.__name__
+
+    @__name__.setter
+    def __name__(self, value):
+        try:
+            setattr(self._fsproxy_fast, "__name__", value)
+        except AttributeError:
+            pass
+        setattr(self._fsproxy_slow, "__name__", value)
+
 
 def _fast_slow_function_call(func: Callable, /, *args, **kwargs) -> Any:
     """
@@ -981,10 +1053,6 @@ def _maybe_wrap_result(result: Any, func: Callable, /, *args, **kwargs) -> Any:
             return type(result)(wrapped)
     elif isinstance(result, Iterator):
         return (_maybe_wrap_result(r, lambda x: x, r) for r in result)
-    elif _is_function_or_method(result):
-        return _MethodProxy._fsproxy_wrap(
-            result, method_chain=(func, args, kwargs)
-        )
     else:
         return result
 
@@ -1081,6 +1149,7 @@ def _replace_closurevars(
     "__and__",
     "__bool__",
     "__call__",
+    "__getattr__",
     "__complex__",
     "__contains__",
     "__copy__",
diff --git a/python/cudf/cudf/pandas/profiler.py b/python/cudf/cudf/pandas/profiler.py
index 0124d411e3b..0dbd333ce4f 100644
--- a/python/cudf/cudf/pandas/profiler.py
+++ b/python/cudf/cudf/pandas/profiler.py
@@ -127,12 +127,7 @@ def get_namespaced_function_name(
         ],
     ):
         if isinstance(func_obj, _MethodProxy):
-            # Extract classname from method object
-            type_name = type(func_obj._fsproxy_wrapped.__self__).__name__
-            # Explicitly ask for __name__ on _fsproxy_wrapped to avoid
-            # getting a private attribute and forcing a slow-path copy
-            func_name = func_obj._fsproxy_wrapped.__name__
-            return ".".join([type_name, func_name])
+            return func_obj._fsproxy_slow.__qualname__
         elif isinstance(func_obj, _FunctionProxy) or issubclass(
             func_obj, (_FinalProxy, _IntermediateProxy)
         ):
diff --git a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
index 6eb28104120..cd9f90d50fe 100755
--- a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
+++ b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
@@ -138,7 +138,7 @@ and not test_eof_states"
 # TODO: Remove "not db" once a postgres & mysql container is set up on the CI
 PANDAS_CI="1" timeout 30m python -m pytest -p cudf.pandas \
     -v -m "not single_cpu and not db" \
-    -k "$TEST_THAT_NEED_MOTO_SERVER and $TEST_THAT_CRASH_PYTEST_WORKERS" \
+    -k "$TEST_THAT_NEED_MOTO_SERVER and $TEST_THAT_CRASH_PYTEST_WORKERS and not test_groupby_raises_category_on_category and not test_constructor_no_pandas_array and not test_is_monotonic_na and not test_index_contains and not test_index_contains and not test_frame_op_subclass_nonclass_constructor and not test_round_trip_current" \
     --import-mode=importlib \
     ${PYTEST_IGNORES} \
     "$@" || [ $? = 1 ]  # Exit success if exit code was 1 (permit test failures but not other errors)
diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
index 9fb0891fa52..e3d4f878ad5 100644
--- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
+++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
@@ -379,6 +379,8 @@ def test_pickle_round_trip(dataframe):
 
 
 def test_excel_round_trip(dataframe):
+    pytest.importorskip("openpyxl")
+
     pdf, df = dataframe
     excel_pdf = BytesIO()
     excel_cudf_pandas = BytesIO()
@@ -1211,6 +1213,24 @@ def test_func_namespace():
     assert xpd.concat is xpd.core.reshape.concat.concat
 
 
+def test_register_accessor():
+    @xpd.api.extensions.register_dataframe_accessor("xyz")
+    class XYZ:
+        def __init__(self, obj):
+            self._obj = obj
+
+        @property
+        def foo(self):
+            return "spam"
+
+    # the accessor must be registered with the proxy type,
+    # not the underlying fast or slow type
+    assert "xyz" in xpd.DataFrame.__dict__
+
+    df = xpd.DataFrame()
+    assert df.xyz.foo == "spam"
+
+
 def test_pickle_groupby(dataframe):
     pdf, df = dataframe
     pgb = pdf.groupby("a")
@@ -1232,6 +1252,18 @@ def test_isinstance_base_offset():
     assert isinstance(offset, xpd.tseries.offsets.BaseOffset)
 
 
+def test_super_attribute_lookup():
+    # test that we can use super() to access attributes
+    # of the base class when subclassing proxy types
+
+    class Foo(xpd.Series):
+        def max_times_two(self):
+            return super().max() * 2
+
+    s = Foo([1, 2, 3])
+    assert s.max_times_two() == 6
+
+
 def test_floordiv_array_vs_df():
     xarray = xpd.Series([1, 2, 3], dtype="datetime64[ns]").array
     parray = pd.Series([1, 2, 3], dtype="datetime64[ns]").array
diff --git a/python/cudf/cudf_pandas_tests/test_profiler.py b/python/cudf/cudf_pandas_tests/test_profiler.py
index 359a2a2c515..588398265f2 100644
--- a/python/cudf/cudf_pandas_tests/test_profiler.py
+++ b/python/cudf/cudf_pandas_tests/test_profiler.py
@@ -33,11 +33,11 @@ def test_profiler():
         "Timestamp",
         "DataFrame",
         "DataFrame.groupby",
-        "DataFrameGroupBy.sum",
+        "GroupBy.sum",
         "DataFrame.sum",
         "Series.__getitem__",
         "Timedelta",
-        "Timestamp.__add__",
+        "_Timestamp.__add__",
     }
     for name, func in per_function_stats.items():
         assert (

From 9ce1721567ccb599fbf2efc8ec770d45b57ddef8 Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar <shruti.shivakumar@gmail.com>
Date: Mon, 20 May 2024 12:12:45 -0400
Subject: [PATCH 227/842] Reading multi-line JSON in string columns using
 runtime configurable delimiter (#15556)

Addresses #15277
Given a JSON lines buffer with records separated by a delimiter passed at runtime, the idea is to modify the JSON tokenization FST to consider the passed delimiter to generate EOL token instead of the newline character currently hard-coded.
This PR does not modify the whitespace normalization FST to [strip out unquoted `\n` and `\r`](https://github.com/rapidsai/cudf/issues/14865#issuecomment-1917575436). Whitespace normalization will be handled in follow-up works.
Note that this is not a multi-object JSON reader since we are not using the offsets data in the string column, and hence there is no resetting of the start state at every row offset.

Current status:
- [X] Semantic bracket/brace DFA
- [X] DFA removing excess characters after record in line
- [X] Pushdown automata generating tokens
- [x] Test passing arbitrary delimiter that does not occur in input to the reader

Authors:
  - Shruti Shivakumar (https://github.com/shrshi)

Approvers:
  - Paul Mattione (https://github.com/pmattione-nvidia)
  - Vukasin Milovanovic (https://github.com/vuule)
  - Elias Stehle (https://github.com/elstehle)
  - Karthikeyan (https://github.com/karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/15556
---
 cpp/include/cudf/io/json.hpp       |  45 ++++
 cpp/src/io/json/nested_json.hpp    |   6 +-
 cpp/src/io/json/nested_json_gpu.cu | 144 +++++++-----
 cpp/tests/io/json_test.cpp         |  78 +++++++
 cpp/tests/io/nested_json_test.cpp  | 352 ++++++++++++++++++++++-------
 5 files changed, 476 insertions(+), 149 deletions(-)

diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
index 7374ffc37e6..aa4bee4fb5e 100644
--- a/cpp/include/cudf/io/json.hpp
+++ b/cpp/include/cudf/io/json.hpp
@@ -101,6 +101,8 @@ class json_reader_options {
   bool _lines = false;
   // Parse mixed types as a string column
   bool _mixed_types_as_string = false;
+  // Delimiter separating records in JSON lines
+  char _delimiter = '\n';
   // Prune columns on read, selected based on the _dtypes option
   bool _prune_columns = false;
 
@@ -229,6 +231,13 @@ class json_reader_options {
     return base_padding + num_columns * column_bytes;
   }
 
+  /**
+   * @brief Returns delimiter separating records in JSON lines
+   *
+   * @return Delimiter separating records in JSON lines
+   */
+  char get_delimiter() const { return _delimiter; }
+
   /**
    * @brief Whether to read the file as a json object per line.
    *
@@ -340,6 +349,30 @@ class json_reader_options {
    */
   void set_byte_range_size(size_type size) { _byte_range_size = size; }
 
+  /**
+   * @brief Set delimiter separating records in JSON lines
+   *
+   * @param delimiter Delimiter separating records in JSON lines
+   */
+  void set_delimiter(char delimiter)
+  {
+    switch (delimiter) {
+      case '{':
+      case '[':
+      case '}':
+      case ']':
+      case ',':
+      case ':':
+      case '"':
+      case '\'':
+      case '\\':
+      case ' ':
+      case '\t':
+      case '\r': CUDF_FAIL("Unsupported delimiter character.", std::invalid_argument); break;
+    }
+    _delimiter = delimiter;
+  }
+
   /**
    * @brief Set whether to read the file as a json object per line.
    *
@@ -507,6 +540,18 @@ class json_reader_options_builder {
     return *this;
   }
 
+  /**
+   * @brief Set delimiter separating records in JSON lines
+   *
+   * @param delimiter Delimiter separating records in JSON lines
+   * @return this for chaining
+   */
+  json_reader_options_builder& delimiter(char delimiter)
+  {
+    options.set_delimiter(delimiter);
+    return *this;
+  }
+
   /**
    * @brief Set whether to read the file as a json object per line.
    *
diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp
index 5817a01c21f..e12892a2d50 100644
--- a/cpp/src/io/json/nested_json.hpp
+++ b/cpp/src/io/json/nested_json.hpp
@@ -59,8 +59,8 @@ enum class stack_behavior_t : char {
   PushPopWithoutReset,
 
   /// Opening brackets and braces, [, {, push onto the stack, closing brackets and braces, ], }, pop
-  /// from the stack. Newline characters are considered delimiters and therefore reset to an empty
-  /// stack.
+  /// from the stack. Delimiter characters are passed when the stack context is constructed to
+  /// reset to an empty stack.
   ResetOnDelimiter
 };
 
@@ -198,11 +198,13 @@ namespace detail {
  * within the context of a struct, a '[' represents that it is within the context of an array, and a
  * '_' symbol that it is at the root of the JSON.
  * @param[in] stack_behavior Specifies the stack's behavior
+ * @param[in] delimiter Specifies the delimiter to use as separator for JSON lines input
  * @param[in] stream The cuda stream to dispatch GPU kernels to
  */
 void get_stack_context(device_span<SymbolT const> json_in,
                        SymbolT* d_top_of_stack,
                        stack_behavior_t stack_behavior,
+                       SymbolT delimiter,
                        rmm::cuda_stream_view stream);
 
 /**
diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu
index 8da1bb3ddfc..b243e4ba006 100644
--- a/cpp/src/io/json/nested_json_gpu.cu
+++ b/cpp/src/io/json/nested_json_gpu.cu
@@ -131,12 +131,13 @@ constexpr auto NUM_SYMBOL_GROUPS = static_cast<uint32_t>(dfa_symbol_group_id::NU
  * @brief Function object to map (input_symbol,stack_context) tuples to a symbol group.
  */
 struct SymbolPairToSymbolGroupId {
+  SymbolT delimiter = '\n';
   CUDF_HOST_DEVICE SymbolGroupT operator()(thrust::tuple<SymbolT, StackSymbolT> symbol) const
   {
     auto const input_symbol = thrust::get<0>(symbol);
     auto const stack_symbol = thrust::get<1>(symbol);
     return static_cast<SymbolGroupT>(
-      input_symbol == '\n'
+      input_symbol == delimiter
         ? dfa_symbol_group_id::DELIMITER
         : (stack_symbol == '_' ? dfa_symbol_group_id::ROOT : dfa_symbol_group_id::OTHER));
   }
@@ -331,7 +332,7 @@ enum class dfa_symbol_group_id : uint8_t {
   CLOSING_BRACKET,   ///< Closing bracket SG: ]
   QUOTE_CHAR,        ///< Quote character SG: "
   ESCAPE_CHAR,       ///< Escape character SG: '\'
-  NEWLINE_CHAR,      ///< Newline character SG: '\n'
+  DELIMITER_CHAR,    ///< Delimiter character SG
   OTHER_SYMBOLS,     ///< SG implicitly matching all other characters
   NUM_SYMBOL_GROUPS  ///< Total number of symbol groups
 };
@@ -339,42 +340,64 @@ enum class dfa_symbol_group_id : uint8_t {
 constexpr auto TT_NUM_STATES     = static_cast<StateT>(dfa_states::TT_NUM_STATES);
 constexpr auto NUM_SYMBOL_GROUPS = static_cast<uint32_t>(dfa_symbol_group_id::NUM_SYMBOL_GROUPS);
 
-// The i-th string representing all the characters of a symbol group
-std::array<std::string, NUM_SYMBOL_GROUPS - 1> const symbol_groups{
-  {{"{"}, {"["}, {"}"}, {"]"}, {"\""}, {"\\"}, {"\n"}}};
+// The DFA's starting state
+constexpr auto start_state = static_cast<StateT>(TT_OOS);
 
-// Transition table for the default JSON and JSON lines formats
-std::array<std::array<dfa_states, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const transition_table{
-  {/* IN_STATE          {       [       }       ]       "       \      \n    OTHER */
-   /* TT_OOS    */ {{TT_OOS, TT_OOS, TT_OOS, TT_OOS, TT_STR, TT_OOS, TT_OOS, TT_OOS}},
-   /* TT_STR    */ {{TT_STR, TT_STR, TT_STR, TT_STR, TT_OOS, TT_ESC, TT_STR, TT_STR}},
-   /* TT_ESC    */ {{TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR}}}};
-
-// Transition table for the JSON lines format that recovers from invalid JSON lines
-std::array<std::array<dfa_states, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const
-  resetting_transition_table{
+template <typename SymbolT>
+auto get_sgid_lut(SymbolT delim)
+{
+  // The i-th string representing all the characters of a symbol group
+  std::array<std::vector<SymbolT>, NUM_SYMBOL_GROUPS - 1> symbol_groups{
+    {{'{'}, {'['}, {'}'}, {']'}, {'"'}, {'\\'}, {delim}}};
+
+  return symbol_groups;
+}
+
+auto get_transition_table(stack_behavior_t stack_behavior)
+{
+  // Transition table for the default JSON and JSON lines formats
+  std::array<std::array<dfa_states, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const transition_table{
     {/* IN_STATE          {       [       }       ]       "       \      \n    OTHER */
      /* TT_OOS    */ {{TT_OOS, TT_OOS, TT_OOS, TT_OOS, TT_STR, TT_OOS, TT_OOS, TT_OOS}},
-     /* TT_STR    */ {{TT_STR, TT_STR, TT_STR, TT_STR, TT_OOS, TT_ESC, TT_OOS, TT_STR}},
-     /* TT_ESC    */ {{TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_OOS, TT_STR}}}};
-
-// Translation table for the default JSON and JSON lines formats
-std::array<std::array<std::vector<char>, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const translation_table{
-  {/* IN_STATE         {      [      }      ]      "      \     \n    OTHER */
-   /* TT_OOS    */ {{{'{'}, {'['}, {'}'}, {']'}, {}, {}, {}, {}}},
-   /* TT_STR    */ {{{}, {}, {}, {}, {}, {}, {}, {}}},
-   /* TT_ESC    */ {{{}, {}, {}, {}, {}, {}, {}, {}}}}};
-
-// Translation table for the JSON lines format that recovers from invalid JSON lines
-std::array<std::array<std::vector<char>, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const
-  resetting_translation_table{
-    {/* IN_STATE         {      [      }      ]      "      \     \n    OTHER */
-     /* TT_OOS    */ {{{'{'}, {'['}, {'}'}, {']'}, {}, {}, {'\n'}, {}}},
-     /* TT_STR    */ {{{}, {}, {}, {}, {}, {}, {'\n'}, {}}},
-     /* TT_ESC    */ {{{}, {}, {}, {}, {}, {}, {'\n'}, {}}}}};
+     /* TT_STR    */ {{TT_STR, TT_STR, TT_STR, TT_STR, TT_OOS, TT_ESC, TT_STR, TT_STR}},
+     /* TT_ESC    */ {{TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR}}}};
+
+  // Transition table for the JSON lines format that recovers from invalid JSON lines
+  std::array<std::array<dfa_states, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const
+    resetting_transition_table{
+      {/* IN_STATE          {       [       }       ]       "       \      \n    OTHER */
+       /* TT_OOS    */ {{TT_OOS, TT_OOS, TT_OOS, TT_OOS, TT_STR, TT_OOS, TT_OOS, TT_OOS}},
+       /* TT_STR    */ {{TT_STR, TT_STR, TT_STR, TT_STR, TT_OOS, TT_ESC, TT_OOS, TT_STR}},
+       /* TT_ESC    */ {{TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_OOS, TT_STR}}}};
+
+  // Transition table specialized on the choice of whether to reset on newlines
+  return (stack_behavior == stack_behavior_t::ResetOnDelimiter) ? resetting_transition_table
+                                                                : transition_table;
+}
+
+auto get_translation_table(stack_behavior_t stack_behavior)
+{
+  // Translation table for the default JSON and JSON lines formats
+  std::array<std::array<std::vector<char>, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const
+    translation_table{
+      {/* IN_STATE         {      [      }      ]      "      \     <delim>    OTHER */
+       /* TT_OOS    */ {{{'{'}, {'['}, {'}'}, {']'}, {}, {}, {}, {}}},
+       /* TT_STR    */ {{{}, {}, {}, {}, {}, {}, {}, {}}},
+       /* TT_ESC    */ {{{}, {}, {}, {}, {}, {}, {}, {}}}}};
+
+  // Translation table for the JSON lines format that recovers from invalid JSON lines
+  std::array<std::array<std::vector<char>, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const
+    resetting_translation_table{
+      {/* IN_STATE         {      [      }      ]      "      \     <delim>    OTHER */
+       /* TT_OOS    */ {{{'{'}, {'['}, {'}'}, {']'}, {}, {}, {'\n'}, {}}},
+       /* TT_STR    */ {{{}, {}, {}, {}, {}, {}, {'\n'}, {}}},
+       /* TT_ESC    */ {{{}, {}, {}, {}, {}, {}, {'\n'}, {}}}}};
+
+  // Translation table specialized on the choice of whether to reset on newlines
+  return stack_behavior == stack_behavior_t::ResetOnDelimiter ? resetting_translation_table
+                                                              : translation_table;
+}
 
-// The DFA's starting state
-constexpr auto start_state = static_cast<StateT>(TT_OOS);
 }  // namespace to_stack_op
 
 // JSON tokenizer pushdown automaton
@@ -572,6 +595,7 @@ static __constant__ PdaSymbolGroupIdT tos_sg_to_pda_sgid[] = {
  * visibly pushdown automaton (DVPA)
  */
 struct PdaSymbolToSymbolGroupId {
+  SymbolT delimiter = '\n';
   template <typename SymbolT, typename StackSymbolT>
   __device__ __forceinline__ PdaSymbolGroupIdT
   operator()(thrust::tuple<SymbolT, StackSymbolT> symbol_pair) const
@@ -593,8 +617,15 @@ struct PdaSymbolToSymbolGroupId {
     // The relative symbol group id of the current input symbol
     constexpr auto pda_sgid_lookup_size =
       static_cast<int32_t>(sizeof(tos_sg_to_pda_sgid) / sizeof(tos_sg_to_pda_sgid[0]));
+    // We map the delimiter character to LINE_BREAK symbol group id, and the newline character
+    // to OTHER. Note that delimiter cannot be any of opening(closing) brace, bracket, quote,
+    // escape, comma, colon or whitespace characters.
+    auto const symbol_position =
+      symbol == delimiter
+        ? static_cast<int32_t>('\n')
+        : (symbol == '\n' ? static_cast<int32_t>(delimiter) : static_cast<int32_t>(symbol));
     PdaSymbolGroupIdT symbol_gid =
-      tos_sg_to_pda_sgid[min(static_cast<int32_t>(symbol), pda_sgid_lookup_size - 1)];
+      tos_sg_to_pda_sgid[min(symbol_position, pda_sgid_lookup_size - 1)];
     return stack_idx * static_cast<PdaSymbolGroupIdT>(symbol_group_id::NUM_PDA_INPUT_SGS) +
            symbol_gid;
   }
@@ -1398,6 +1429,7 @@ namespace detail {
 void get_stack_context(device_span<SymbolT const> json_in,
                        SymbolT* d_top_of_stack,
                        stack_behavior_t stack_behavior,
+                       SymbolT delimiter,
                        rmm::cuda_stream_view stream)
 {
   check_input_size(json_in.size());
@@ -1423,20 +1455,11 @@ void get_stack_context(device_span<SymbolT const> json_in,
   constexpr auto max_translation_table_size =
     to_stack_op::NUM_SYMBOL_GROUPS * to_stack_op::TT_NUM_STATES;
 
-  // Transition table specialized on the choice of whether to reset on newlines
-  const auto transition_table = (stack_behavior == stack_behavior_t::ResetOnDelimiter)
-                                  ? to_stack_op::resetting_transition_table
-                                  : to_stack_op::transition_table;
-
-  // Translation table specialized on the choice of whether to reset on newlines
-  const auto translation_table = (stack_behavior == stack_behavior_t::ResetOnDelimiter)
-                                   ? to_stack_op::resetting_translation_table
-                                   : to_stack_op::translation_table;
-
   auto json_to_stack_ops_fst = fst::detail::make_fst(
-    fst::detail::make_symbol_group_lut(to_stack_op::symbol_groups),
-    fst::detail::make_transition_table(transition_table),
-    fst::detail::make_translation_table<max_translation_table_size>(translation_table),
+    fst::detail::make_symbol_group_lut(to_stack_op::get_sgid_lut(delimiter)),
+    fst::detail::make_transition_table(to_stack_op::get_transition_table(stack_behavior)),
+    fst::detail::make_translation_table<max_translation_table_size>(
+      to_stack_op::get_translation_table(stack_behavior)),
     stream);
 
   // "Search" for relevant occurrence of brackets and braces that indicate the beginning/end
@@ -1539,16 +1562,16 @@ std::pair<rmm::device_uvector<PdaTokenT>, rmm::device_uvector<SymbolOffsetT>> ge
   // Range of encapsulating function that parses to internal columnar data representation
   CUDF_FUNC_RANGE();
 
-  auto const new_line_delimited_json = options.is_enabled_lines();
+  auto const delimited_json = options.is_enabled_lines();
+  auto const delimiter      = options.get_delimiter();
 
-  // (!new_line_delimited_json)                         => JSON
-  // (new_line_delimited_json and recover_from_error)   => JSON_LINES_RECOVER
-  // (new_line_delimited_json and !recover_from_error)  => JSON_LINES
-  auto format = new_line_delimited_json
-                  ? (options.recovery_mode() == json_recovery_mode_t::RECOVER_WITH_NULL
-                       ? tokenizer_pda::json_format_cfg_t::JSON_LINES_RECOVER
-                       : tokenizer_pda::json_format_cfg_t::JSON_LINES)
-                  : tokenizer_pda::json_format_cfg_t::JSON;
+  // (!delimited_json)                         => JSON
+  // (delimited_json and recover_from_error)   => JSON_LINES_RECOVER
+  // (delimited_json and !recover_from_error)  => JSON_LINES
+  auto format = delimited_json ? (options.recovery_mode() == json_recovery_mode_t::RECOVER_WITH_NULL
+                                    ? tokenizer_pda::json_format_cfg_t::JSON_LINES_RECOVER
+                                    : tokenizer_pda::json_format_cfg_t::JSON_LINES)
+                               : tokenizer_pda::json_format_cfg_t::JSON;
 
   // Prepare for PDA transducer pass, merging input symbols with stack symbols
   auto const recover_from_error = (format == tokenizer_pda::json_format_cfg_t::JSON_LINES_RECOVER);
@@ -1559,7 +1582,7 @@ std::pair<rmm::device_uvector<PdaTokenT>, rmm::device_uvector<SymbolOffsetT>> ge
   // Identify what is the stack context for each input character (JSON-root, struct, or list)
   auto const stack_behavior =
     recover_from_error ? stack_behavior_t::ResetOnDelimiter : stack_behavior_t::PushPopWithoutReset;
-  get_stack_context(json_in, stack_symbols.data(), stack_behavior, stream);
+  get_stack_context(json_in, stack_symbols.data(), stack_behavior, delimiter, stream);
 
   // Input to the full pushdown automaton finite-state transducer, where a input symbol comprises
   // the combination of a character from the JSON input together with the stack context for that
@@ -1573,7 +1596,7 @@ std::pair<rmm::device_uvector<PdaTokenT>, rmm::device_uvector<SymbolOffsetT>> ge
   if (recover_from_error) {
     auto fix_stack_of_excess_chars = fst::detail::make_fst(
       fst::detail::make_symbol_group_lookup_op(
-        fix_stack_of_excess_chars::SymbolPairToSymbolGroupId{}),
+        fix_stack_of_excess_chars::SymbolPairToSymbolGroupId{delimiter}),
       fst::detail::make_transition_table(fix_stack_of_excess_chars::transition_table),
       fst::detail::make_translation_functor(fix_stack_of_excess_chars::TransduceInputOp{}),
       stream);
@@ -1592,8 +1615,9 @@ std::pair<rmm::device_uvector<PdaTokenT>, rmm::device_uvector<SymbolOffsetT>> ge
   constexpr auto max_translation_table_size =
     tokenizer_pda::NUM_PDA_SGIDS *
     static_cast<tokenizer_pda::StateT>(tokenizer_pda::pda_state_t::PD_NUM_STATES);
+
   auto json_to_tokens_fst = fst::detail::make_fst(
-    fst::detail::make_symbol_group_lookup_op(tokenizer_pda::PdaSymbolToSymbolGroupId{}),
+    fst::detail::make_symbol_group_lookup_op(tokenizer_pda::PdaSymbolToSymbolGroupId{delimiter}),
     fst::detail::make_transition_table(tokenizer_pda::get_transition_table(format)),
     fst::detail::make_translation_table<max_translation_table_size>(
       tokenizer_pda::get_translation_table(recover_from_error)),
diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp
index b25822f6613..35e6adf20e7 100644
--- a/cpp/tests/io/json_test.cpp
+++ b/cpp/tests/io/json_test.cpp
@@ -2434,6 +2434,84 @@ TEST_F(JsonReaderTest, MapTypes)
           {type_id::LIST, type_id::STRING, type_id::STRING});
 }
 
+/**
+ * @brief Test fixture for parametrized JSON reader tests
+ */
+struct JsonDelimiterParamTest : public cudf::test::BaseFixture,
+                                public testing::WithParamInterface<char> {};
+
+// Parametrize qualifying JSON tests for executing both nested reader and legacy JSON lines reader
+INSTANTIATE_TEST_SUITE_P(JsonDelimiterParamTest,
+                         JsonDelimiterParamTest,
+                         ::testing::Values('\n', '\b', '\v', '\f', 'h'));
+
+TEST_P(JsonDelimiterParamTest, JsonLinesDelimiter)
+{
+  using SymbolT = char;
+
+  SymbolT const random_delimiter = GetParam();
+
+  // Test input
+  std::string input             = R"({"col1":100, "col2":1.1, "col3":"aaa"})";
+  std::size_t const string_size = 400;
+  /*
+   * We are constructing a JSON lines string where each row is {"col1":100, "col2":1.1,
+   * "col3":"aaa"} and rows are separated by random_delimiter. Instead of concatenating lines
+   * linearly in O(n), we can do it in O(log n) by doubling the input in each iteration. The total
+   * number of such iterations is log_repetitions.
+   */
+  std::size_t const log_repetitions =
+    static_cast<std::size_t>(std::ceil(std::log2(string_size / input.size())));
+  std::size_t const repetitions = 1UL << log_repetitions;
+  for (std::size_t i = 0; i < log_repetitions; i++) {
+    input = input + random_delimiter + input;
+  }
+
+  cudf::io::json_reader_options json_parser_options =
+    cudf::io::json_reader_options::builder(cudf::io::source_info{input.c_str(), input.size()})
+      .lines(true)
+      .delimiter(random_delimiter);
+
+  cudf::io::table_with_metadata result = cudf::io::read_json(json_parser_options);
+
+  EXPECT_EQ(result.tbl->num_columns(), 3);
+  EXPECT_EQ(result.tbl->num_rows(), repetitions);
+
+  EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT64);
+  EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::FLOAT64);
+  EXPECT_EQ(result.tbl->get_column(2).type().id(), cudf::type_id::STRING);
+
+  EXPECT_EQ(result.metadata.schema_info[0].name, "col1");
+  EXPECT_EQ(result.metadata.schema_info[1].name, "col2");
+  EXPECT_EQ(result.metadata.schema_info[2].name, "col3");
+
+  auto col1_iterator = thrust::constant_iterator<int64_t>(100);
+  auto col2_iterator = thrust::constant_iterator<double>(1.1);
+  auto col3_iterator = thrust::constant_iterator<std::string>("aaa");
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0),
+                                 int64_wrapper(col1_iterator, col1_iterator + repetitions));
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1),
+                                 float64_wrapper(col2_iterator, col2_iterator + repetitions));
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(
+    result.tbl->get_column(2),
+    cudf::test::strings_column_wrapper(col3_iterator, col3_iterator + repetitions));
+}
+
+TEST_F(JsonReaderTest, ViableDelimiter)
+{
+  // Test input
+  std::string input = R"({"col1":100, "col2":1.1, "col3":"aaa"})";
+
+  cudf::io::json_reader_options json_parser_options =
+    cudf::io::json_reader_options::builder(cudf::io::source_info{input.c_str(), input.size()})
+      .lines(true);
+
+  json_parser_options.set_delimiter('\f');
+  CUDF_EXPECT_NO_THROW(cudf::io::read_json(json_parser_options));
+
+  EXPECT_THROW(json_parser_options.set_delimiter('\t'), std::invalid_argument);
+}
+
 // Test case for dtype prune:
 // all paths, only one.
 // one present, another not present, nothing present
diff --git a/cpp/tests/io/nested_json_test.cpp b/cpp/tests/io/nested_json_test.cpp
index 112ee8fb57b..d6f800cce8b 100644
--- a/cpp/tests/io/nested_json_test.cpp
+++ b/cpp/tests/io/nested_json_test.cpp
@@ -148,6 +148,7 @@ TEST_F(JsonTest, StackContext)
   auto const stream = cudf::get_default_stream();
 
   // Test input
+  char const delimiter    = 'h';
   std::string const input = R"(  [{)"
                             R"("category": "reference",)"
                             R"("index:": [4,12,42],)"
@@ -171,7 +172,8 @@ TEST_F(JsonTest, StackContext)
 
   // Run algorithm
   constexpr auto stack_behavior = cuio_json::stack_behavior_t::PushPopWithoutReset;
-  cuio_json::detail::get_stack_context(d_input, stack_context.device_ptr(), stack_behavior, stream);
+  cuio_json::detail::get_stack_context(
+    d_input, stack_context.device_ptr(), stack_behavior, delimiter, stream);
 
   // Copy back the results
   stack_context.device_to_host_async(stream);
@@ -210,6 +212,7 @@ TEST_F(JsonTest, StackContextUtf8)
   auto const stream = cudf::get_default_stream();
 
   // Test input
+  char const delimiter    = 'h';
   std::string const input = R"([{"a":{"year":1882,"author": "Bharathi"}, {"a":"filip ʒakotɛ"}}])";
 
   // Prepare input & output buffers
@@ -220,7 +223,8 @@ TEST_F(JsonTest, StackContextUtf8)
 
   // Run algorithm
   constexpr auto stack_behavior = cuio_json::stack_behavior_t::PushPopWithoutReset;
-  cuio_json::detail::get_stack_context(d_input, stack_context.device_ptr(), stack_behavior, stream);
+  cuio_json::detail::get_stack_context(
+    d_input, stack_context.device_ptr(), stack_behavior, delimiter, stream);
 
   // Copy back the results
   stack_context.device_to_host_async(stream);
@@ -238,7 +242,18 @@ TEST_F(JsonTest, StackContextUtf8)
   CUDF_TEST_EXPECT_VECTOR_EQUAL(golden_stack_context, stack_context, stack_context.size());
 }
 
-TEST_F(JsonTest, StackContextRecovering)
+/**
+ * @brief Test fixture for parametrized JSON reader tests
+ */
+struct JsonDelimiterParamTest : public cudf::test::BaseFixture,
+                                public testing::WithParamInterface<char> {};
+
+// Parametrize qualifying JSON tests for executing both nested reader and legacy JSON lines reader
+INSTANTIATE_TEST_SUITE_P(JsonDelimiterParamTest,
+                         JsonDelimiterParamTest,
+                         ::testing::Values('\n', '\b', '\v', '\f', 'h'));
+
+TEST_P(JsonDelimiterParamTest, StackContextRecovering)
 {
   // Type used to represent the atomic symbol type used within the finite-state machine
   using SymbolT      = char;
@@ -248,13 +263,15 @@ TEST_F(JsonTest, StackContextRecovering)
   auto const stream = cudf::get_default_stream();
 
   // JSON lines input that recovers on invalid lines
-  std::string const input = R"({"a":-2},
+  char const delimiter = GetParam();
+  std::string input    = R"({"a":-2},
   {"a":
   {"a":{"a":[321
   {"a":[1]}
 
   {"b":123}
   )";
+  std::replace(input.begin(), input.end(), '\n', delimiter);
 
   // Expected stack context (including stack context of the newline characters)
   std::string const golden_stack_context =
@@ -274,7 +291,8 @@ TEST_F(JsonTest, StackContextRecovering)
 
   // Run algorithm
   constexpr auto stack_behavior = cuio_json::stack_behavior_t::ResetOnDelimiter;
-  cuio_json::detail::get_stack_context(d_input, stack_context.device_ptr(), stack_behavior, stream);
+  cuio_json::detail::get_stack_context(
+    d_input, stack_context.device_ptr(), stack_behavior, delimiter, stream);
 
   // Copy back the results
   stack_context.device_to_host_async(stream);
@@ -287,15 +305,16 @@ TEST_F(JsonTest, StackContextRecovering)
   CUDF_TEST_EXPECT_VECTOR_EQUAL(golden_stack_context, stack_context, stack_context.size());
 }
 
-TEST_F(JsonTest, StackContextRecoveringFuzz)
+TEST_P(JsonDelimiterParamTest, StackContextRecoveringFuzz)
 {
   // Type used to represent the atomic symbol type used within the finite-state machine
   using SymbolT      = char;
   using StackSymbolT = char;
 
-  std::random_device rd;
+  char const delimiter = GetParam();
   std::mt19937 gen(42);
   std::uniform_int_distribution<int> distribution(0, 4);
+
   constexpr std::size_t input_length = 1024 * 1024;
   std::string input{};
   input.reserve(input_length);
@@ -313,36 +332,29 @@ TEST_F(JsonTest, StackContextRecoveringFuzz)
         case 1: current = '['; break;
         case 2: current = '}'; break;
         case 3: current = '"'; break;
-        case 4: current = '\n'; break;
+        case 4: current = delimiter; break;
       }
-      switch (current) {
-        case '"': inside_quotes = !inside_quotes; break;
-        case '{':
-          if (!inside_quotes) { host_stack.push('{'); }
-          break;
-        case '[':
-          if (!inside_quotes) { host_stack.push('['); }
-          break;
-        case '}':
-          if (!inside_quotes) {
-            if (host_stack.size() > 0) {
-              // Get the proper 'pop' stack symbol
-              current = (host_stack.top() == '{' ? '}' : ']');
-              host_stack.pop();
-            } else
-              is_ok = false;
-          }
-          break;
-        case '\n':
-          // Increase chance to have longer lines
-          if (distribution(gen) == 0) {
-            is_ok = false;
-            break;
-          } else {
-            host_stack    = {};
-            inside_quotes = false;
-            break;
-          }
+      if (current == '"')
+        inside_quotes = !inside_quotes;
+      else if (current == '{' && !inside_quotes)
+        host_stack.push('{');
+      else if (current == '[' && !inside_quotes)
+        host_stack.push('[');
+      else if (current == '}' && !inside_quotes) {
+        if (host_stack.size() > 0) {
+          // Get the proper 'pop' stack symbol
+          current = (host_stack.top() == '{' ? '}' : ']');
+          host_stack.pop();
+        } else
+          is_ok = false;
+      } else if (current == delimiter) {
+        // Increase chance to have longer lines
+        if (distribution(gen) == 0) {
+          is_ok = false;
+        } else {
+          host_stack    = {};
+          inside_quotes = false;
+        }
       }
     } while (!is_ok);
     input += current;
@@ -360,24 +372,19 @@ TEST_F(JsonTest, StackContextRecoveringFuzz)
       expected_stack_context += host_stack.top();
     }
 
-    switch (current) {
-      case '"': inside_quotes = !inside_quotes; break;
-      case '{':
-        if (!inside_quotes) { host_stack.push('{'); }
-        break;
-      case '[':
-        if (!inside_quotes) { host_stack.push('['); }
-        break;
-      case '}':
-        if (!inside_quotes && host_stack.size() > 0) { host_stack.pop(); }
-        break;
-      case ']':
-        if (!inside_quotes && host_stack.size() > 0) { host_stack.pop(); }
-        break;
-      case '\n':
-        host_stack    = {};
-        inside_quotes = false;
-        break;
+    if (current == '"')
+      inside_quotes = !inside_quotes;
+    else if (current == '{' && !inside_quotes)
+      host_stack.push('{');
+    else if (current == '[' && !inside_quotes)
+      host_stack.push('[');
+    else if (current == '}' && !inside_quotes && host_stack.size() > 0)
+      host_stack.pop();
+    else if (current == ']' && !inside_quotes && host_stack.size() > 0)
+      host_stack.pop();
+    else if (current == delimiter) {
+      host_stack    = {};
+      inside_quotes = false;
     }
   }
 
@@ -392,7 +399,8 @@ TEST_F(JsonTest, StackContextRecoveringFuzz)
 
   // Run algorithm
   constexpr auto stack_behavior = cuio_json::stack_behavior_t::ResetOnDelimiter;
-  cuio_json::detail::get_stack_context(d_input, stack_context.device_ptr(), stack_behavior, stream);
+  cuio_json::detail::get_stack_context(
+    d_input, stack_context.device_ptr(), stack_behavior, delimiter, stream);
 
   // Copy back the results
   stack_context.device_to_host_async(stream);
@@ -404,7 +412,9 @@ TEST_F(JsonTest, StackContextRecoveringFuzz)
   CUDF_TEST_EXPECT_VECTOR_EQUAL(expected_stack_context, stack_context, stack_context.size());
 }
 
-TEST_F(JsonTest, TokenStream)
+struct JsonNewlineDelimiterTest : public cudf::test::BaseFixture {};
+
+TEST_F(JsonNewlineDelimiterTest, TokenStream)
 {
   using cuio_json::PdaTokenT;
   using cuio_json::SymbolOffsetT;
@@ -549,7 +559,7 @@ TEST_F(JsonTest, TokenStream)
   }
 }
 
-TEST_F(JsonTest, TokenStream2)
+TEST_F(JsonNewlineDelimiterTest, TokenStream2)
 {
   using cuio_json::PdaTokenT;
   using cuio_json::SymbolOffsetT;
@@ -653,29 +663,32 @@ TEST_F(JsonParserTest, ExtractColumn)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col2, parsed_col2);
 }
 
-TEST_F(JsonTest, RecoveringTokenStream)
+TEST_P(JsonDelimiterParamTest, RecoveringTokenStream)
 {
   // Test input. Inline comments used to indicate character indexes
   //                           012345678 <= line 0
-  std::string const input = R"({"a":2 {})"
-                            // 9
-                            "\n"
-                            // 01234 <= line 1
-                            R"({"a":)"
-                            // 5
-                            "\n"
-                            // 67890123456789 <= line 2
-                            R"({"a":{"a":[321)"
-                            // 0
-                            "\n"
-                            // 123456789 <= line 3
-                            R"({"a":[1]})"
-                            // 0
-                            "\n"
-                            // 1  <= line 4
-                            "\n"
-                            // 23456789 <= line 5
-                            R"({"b":123})";
+  char const delimiter = GetParam();
+
+  std::string input = R"({"a":2 {})"
+                      // 9
+                      "\n"
+                      // 01234 <= line 1
+                      R"({"a":)"
+                      // 5
+                      "\n"
+                      // 67890123456789 <= line 2
+                      R"({"a":{"a":[321)"
+                      // 0
+                      "\n"
+                      // 123456789 <= line 3
+                      R"({"a":[1]})"
+                      // 0
+                      "\n"
+                      // 1  <= line 4
+                      "\n"
+                      // 23456789 <= line 5
+                      R"({"b":123})";
+  std::replace(input.begin(), input.end(), '\n', delimiter);
 
   // Golden token stream sample
   using token_t = cuio_json::token_t;
@@ -717,6 +730,7 @@ TEST_F(JsonTest, RecoveringTokenStream)
   cudf::io::json_reader_options default_options{};
   default_options.set_recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL);
   default_options.enable_lines(true);
+  default_options.set_delimiter(delimiter);
 
   // Prepare input & output buffers
   cudf::string_scalar const d_scalar(input, true, stream);
@@ -730,6 +744,7 @@ TEST_F(JsonTest, RecoveringTokenStream)
   auto const tokens_gpu        = cudf::detail::make_std_vector_async(d_tokens_gpu, stream);
   auto const token_indices_gpu = cudf::detail::make_std_vector_async(d_token_indices_gpu, stream);
 
+  stream.synchronize();
   // Verify the number of tokens matches
   ASSERT_EQ(golden_token_stream.size(), tokens_gpu.size());
   ASSERT_EQ(golden_token_stream.size(), token_indices_gpu.size());
@@ -864,25 +879,29 @@ TEST_F(JsonTest, PostProcessTokenStream)
   }
 }
 
-TEST_F(JsonParserTest, UTF_JSON)
+TEST_P(JsonDelimiterParamTest, UTF_JSON)
 {
   // Prepare cuda stream for data transfers & kernels
-  auto const stream = cudf::get_default_stream();
-  auto mr           = rmm::mr::get_current_device_resource();
-  auto json_parser  = cuio_json::detail::device_parse_nested_json;
+  auto const stream    = cudf::get_default_stream();
+  auto mr              = rmm::mr::get_current_device_resource();
+  auto json_parser     = cuio_json::detail::device_parse_nested_json;
+  char const delimiter = GetParam();
 
   // Default parsing options
   cudf::io::json_reader_options default_options{};
+  default_options.set_delimiter(delimiter);
 
   // Only ASCII string
-  std::string const ascii_pass = R"([
+  std::string ascii_pass = R"([
   {"a":1,"b":2,"c":[3], "d": {}},
   {"a":1,"b":4.0,"c":[], "d": {"year":1882,"author": "Bharathi"}},
   {"a":1,"b":6.0,"c":[5, 7], "d": null},
   {"a":1,"b":8.0,"c":null, "d": {}},
   {"a":1,"b":null,"c":null},
   {"a":1,"b":Infinity,"c":[null], "d": {"year":-600,"author": "Kaniyan"}}])";
-  auto const d_ascii_pass      = cudf::detail::make_device_uvector_sync(
+  std::replace(ascii_pass.begin(), ascii_pass.end(), '\n', delimiter);
+
+  auto const d_ascii_pass = cudf::detail::make_device_uvector_sync(
     cudf::host_span<char const>{ascii_pass.c_str(), ascii_pass.size()},
     stream,
     rmm::mr::get_current_device_resource());
@@ -890,21 +909,23 @@ TEST_F(JsonParserTest, UTF_JSON)
   CUDF_EXPECT_NO_THROW(json_parser(d_ascii_pass, default_options, stream, mr));
 
   // utf-8 string that fails parsing.
-  std::string const utf_failed = R"([
+  std::string utf_failed = R"([
   {"a":1,"b":2,"c":[3], "d": {}},
   {"a":1,"b":4.0,"c":[], "d": {"year":1882,"author": "Bharathi"}},
   {"a":1,"b":6.0,"c":[5, 7], "d": null},
   {"a":1,"b":8.0,"c":null, "d": {}},
   {"a":1,"b":null,"c":null},
   {"a":1,"b":Infinity,"c":[null], "d": {"year":-600,"author": "filip ʒakotɛ"}}])";
-  auto const d_utf_failed      = cudf::detail::make_device_uvector_sync(
+  std::replace(utf_failed.begin(), utf_failed.end(), '\n', delimiter);
+
+  auto const d_utf_failed = cudf::detail::make_device_uvector_sync(
     cudf::host_span<char const>{utf_failed.c_str(), utf_failed.size()},
     stream,
     rmm::mr::get_current_device_resource());
   CUDF_EXPECT_NO_THROW(json_parser(d_utf_failed, default_options, stream, mr));
 
   // utf-8 string that passes parsing.
-  std::string const utf_pass = R"([
+  std::string utf_pass = R"([
   {"a":1,"b":2,"c":[3], "d": {}},
   {"a":1,"b":4.0,"c":[], "d": {"year":1882,"author": "Bharathi"}},
   {"a":1,"b":6.0,"c":[5, 7], "d": null},
@@ -912,7 +933,9 @@ TEST_F(JsonParserTest, UTF_JSON)
   {"a":1,"b":null,"c":null},
   {"a":1,"b":Infinity,"c":[null], "d": {"year":-600,"author": "Kaniyan"}},
   {"a":1,"b":NaN,"c":[null, null], "d": {"year": 2, "author": "filip ʒakotɛ"}}])";
-  auto const d_utf_pass      = cudf::detail::make_device_uvector_sync(
+  std::replace(utf_pass.begin(), utf_pass.end(), '\n', delimiter);
+
+  auto const d_utf_pass = cudf::detail::make_device_uvector_sync(
     cudf::host_span<char const>{utf_pass.c_str(), utf_pass.size()},
     stream,
     rmm::mr::get_current_device_resource());
@@ -1017,4 +1040,159 @@ TEST_F(JsonParserTest, EmptyString)
   EXPECT_EQ(cudf_table.tbl->num_columns(), expected_col_count);
 }
 
+TEST_P(JsonDelimiterParamTest, RecoveringTokenStreamNewlineAndDelimiter)
+{
+  // Test input. Inline comments used to indicate character indexes
+  //                           012345678 <= line 0
+  char const delimiter = GetParam();
+
+  /* Input:
+   * {"a":2}
+   * {"a":<delimiter>{"a":{"a":[321<delimiter>{"a":[1]}
+   *
+   * <delimiter>{"b":123}
+   * {"b":123}
+   */
+  std::string input = R"({"a":2})"
+                      "\n";
+  // starting position 8 (zero indexed)
+  input += R"({"a":)" + std::string(1, delimiter);
+  // starting position 14 (zero indexed)
+  input += R"({"a":{"a":[321)" + std::string(1, delimiter);
+  // starting position 29 (zero indexed)
+  input += R"({"a":[1]})" + std::string("\n\n") + std::string(1, delimiter);
+  // starting position 41 (zero indexed)
+  input += R"({"b":123})"
+           "\n";
+  // starting position 51 (zero indexed)
+  input += R"({"b":123})";
+
+  // Golden token stream sample
+  using token_t = cuio_json::token_t;
+  std::vector<std::pair<std::size_t, cuio_json::PdaTokenT>> golden_token_stream;
+  if (delimiter != '\n') {
+    golden_token_stream.resize(28);
+    golden_token_stream = {// Line 0 (valid)
+                           {0, token_t::StructBegin},
+                           {1, token_t::StructMemberBegin},
+                           {1, token_t::FieldNameBegin},
+                           {3, token_t::FieldNameEnd},
+                           {5, token_t::ValueBegin},
+                           {6, token_t::ValueEnd},
+                           {6, token_t::StructMemberEnd},
+                           {6, token_t::StructEnd},
+                           // Line 1 (invalid)
+                           {0, token_t::StructBegin},
+                           {0, token_t::StructEnd},
+                           // Line 2 (valid)
+                           {29, token_t::StructBegin},
+                           {30, token_t::StructMemberBegin},
+                           {30, token_t::FieldNameBegin},
+                           {32, token_t::FieldNameEnd},
+                           {34, token_t::ListBegin},
+                           {35, token_t::ValueBegin},
+                           {36, token_t::ValueEnd},
+                           {36, token_t::ListEnd},
+                           {37, token_t::StructMemberEnd},
+                           {37, token_t::StructEnd},
+                           // Line 3 (valid)
+                           {41, token_t::StructBegin},
+                           {42, token_t::StructMemberBegin},
+                           {42, token_t::FieldNameBegin},
+                           {44, token_t::FieldNameEnd},
+                           {46, token_t::ValueBegin},
+                           {49, token_t::ValueEnd},
+                           {49, token_t::StructMemberEnd},
+                           {49, token_t::StructEnd}};
+  } else {
+    /* Input:
+     * {"a":2}
+     * {"a":
+     * {"a":{"a":[321
+     * {"a":[1]}
+     *
+     *
+     * {"b":123}
+     * {"b":123}
+     */
+    golden_token_stream.resize(38);
+    golden_token_stream = {// Line 0 (valid)
+                           {0, token_t::StructBegin},
+                           {1, token_t::StructMemberBegin},
+                           {1, token_t::FieldNameBegin},
+                           {3, token_t::FieldNameEnd},
+                           {5, token_t::ValueBegin},
+                           {6, token_t::ValueEnd},
+                           {6, token_t::StructMemberEnd},
+                           {6, token_t::StructEnd},
+                           // Line 1 (invalid)
+                           {0, token_t::StructBegin},
+                           {0, token_t::StructEnd},
+                           // Line 2 (invalid)
+                           {0, token_t::StructBegin},
+                           {0, token_t::StructEnd},
+                           // Line 3 (valid)
+                           {29, token_t::StructBegin},
+                           {30, token_t::StructMemberBegin},
+                           {30, token_t::FieldNameBegin},
+                           {32, token_t::FieldNameEnd},
+                           {34, token_t::ListBegin},
+                           {35, token_t::ValueBegin},
+                           {36, token_t::ValueEnd},
+                           {36, token_t::ListEnd},
+                           {37, token_t::StructMemberEnd},
+                           {37, token_t::StructEnd},
+                           // Line 4 (valid)
+                           {41, token_t::StructBegin},
+                           {42, token_t::StructMemberBegin},
+                           {42, token_t::FieldNameBegin},
+                           {44, token_t::FieldNameEnd},
+                           {46, token_t::ValueBegin},
+                           {49, token_t::ValueEnd},
+                           {49, token_t::StructMemberEnd},
+                           {49, token_t::StructEnd},
+                           // Line 5 (valid)
+                           {51, token_t::StructBegin},
+                           {52, token_t::StructMemberBegin},
+                           {52, token_t::FieldNameBegin},
+                           {54, token_t::FieldNameEnd},
+                           {56, token_t::ValueBegin},
+                           {59, token_t::ValueEnd},
+                           {59, token_t::StructMemberEnd},
+                           {59, token_t::StructEnd}};
+  }
+
+  auto const stream = cudf::get_default_stream();
+
+  // Default parsing options
+  cudf::io::json_reader_options default_options{};
+  default_options.set_recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL);
+  default_options.enable_lines(true);
+  default_options.set_delimiter(delimiter);
+
+  // Prepare input & output buffers
+  cudf::string_scalar const d_scalar(input, true, stream);
+  auto const d_input = cudf::device_span<cuio_json::SymbolT const>{
+    d_scalar.data(), static_cast<size_t>(d_scalar.size())};
+
+  // Parse the JSON and get the token stream
+  auto [d_tokens_gpu, d_token_indices_gpu] = cuio_json::detail::get_token_stream(
+    d_input, default_options, stream, rmm::mr::get_current_device_resource());
+  // Copy back the number of tokens that were written
+  auto const tokens_gpu        = cudf::detail::make_std_vector_async(d_tokens_gpu, stream);
+  auto const token_indices_gpu = cudf::detail::make_std_vector_async(d_token_indices_gpu, stream);
+
+  stream.synchronize();
+  // Verify the number of tokens matches
+  ASSERT_EQ(golden_token_stream.size(), tokens_gpu.size());
+  ASSERT_EQ(golden_token_stream.size(), token_indices_gpu.size());
+
+  for (std::size_t i = 0; i < tokens_gpu.size(); i++) {
+    // Ensure the index the tokens are pointing to do match
+    EXPECT_EQ(golden_token_stream[i].first, token_indices_gpu[i]) << "Mismatch at #" << i;
+    // Ensure the token category is correct
+    EXPECT_EQ(golden_token_stream[i].second, tokens_gpu[i]) << "Mismatch at #" << i;
+  }
+}
+
 CUDF_TEST_PROGRAM_MAIN()

From 16e8625fde773d00732134ea985a42156bd8619b Mon Sep 17 00:00:00 2001
From: Peter Andreas Entschev <peter@entschev.com>
Date: Mon, 20 May 2024 20:29:12 +0200
Subject: [PATCH 228/842] Limit runtime dependency to
 `libarrow>=16.0.0,<16.1.0a0` (#15782)

Fix `libarrow` runtime dependency which is currently broken due to the
release of `libarrow=16.1.0`:

```python
$ python -c "import cudf"
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/opt/conda/envs/rapids/lib/python3.10/site-packages/cudf/__init__.py", line 9, in <module>
    _setup_numba()
  File "/opt/conda/envs/rapids/lib/python3.10/site-packages/cudf/utils/_numba.py", line 124, in _setup_numba
    _get_cc_60_ptx_file()
  File "/opt/conda/envs/rapids/lib/python3.10/site-packages/cudf/utils/_numba.py", line 16, in _get_cc_60_ptx_file
    from cudf._lib import strings_udf
  File "/opt/conda/envs/rapids/lib/python3.10/site-packages/cudf/_lib/__init__.py", line 4, in <module>
    from . import (
ImportError: libarrow.so.1600: cannot open shared object file: No such file or directory
```

---------

Co-authored-by: Bradley Dice <bdice@bradleydice.com>
Co-authored-by: James Lamb <jlamb@nvidia.com>
---
 conda/recipes/libcudf/meta.yaml | 9 +++++++++
 dependencies.yaml               | 8 ++++----
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml
index 76115362b6c..ad2e840c71d 100644
--- a/conda/recipes/libcudf/meta.yaml
+++ b/conda/recipes/libcudf/meta.yaml
@@ -86,6 +86,9 @@ outputs:
         {% else %}
         - {{ compiler('cuda') }}
         {% endif %}
+        # TODO: start taking libarrow's run exports again wwhen they're correct for 16.0
+        # ref: https://github.com/conda-forge/arrow-cpp-feedstock/issues/1418
+        - libarrow
     requirements:
       build:
         - cmake {{ cmake_version }}
@@ -105,6 +108,12 @@ outputs:
         - librmm ={{ minor_version }}
         - libkvikio ={{ minor_version }}
         - dlpack {{ dlpack_version }}
+        # TODO: start taking libarrow's run exports again wwhen they're correct for 16.0
+        # ref: https://github.com/conda-forge/arrow-cpp-feedstock/issues/1418
+        - libarrow>=16.0.0,<16.1.0a0
+        - libarrow-acero>=16.0.0,<16.1.0a0
+        - libarrow-dataset>=16.0.0,<16.1.0a0
+        - libparquet>=16.0.0,<16.1.0a0
     test:
       commands:
         - test -f $PREFIX/lib/libcudf.so
diff --git a/dependencies.yaml b/dependencies.yaml
index 4f8f3c16ea1..f20c1591e73 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -341,10 +341,10 @@ dependencies:
       - output_types: conda
         packages:
           # Allow runtime version to float up to minor version
-          - libarrow-acero>=16.0.0,<17.0.0a0
-          - libarrow-dataset>=16.0.0,<17.0.0a0
-          - libarrow>=16.0.0,<17.0.0a0
-          - libparquet>=16.0.0,<17.0.0a0
+          - libarrow-acero>=16.0.0,<16.1.0a0
+          - libarrow-dataset>=16.0.0,<16.1.0a0
+          - libarrow>=16.0.0,<16.1.0a0
+          - libparquet>=16.0.0,<16.1.0a0
   pyarrow_run:
     common:
       - output_types: [conda, requirements, pyproject]

From f5d1c24760d90003c1a577c696ac5de23a289e64 Mon Sep 17 00:00:00 2001
From: Ray Douglass <ray@raydouglass.com>
Date: Mon, 20 May 2024 17:38:30 -0400
Subject: [PATCH 229/842] DOC v24.08 Updates [skip ci]

---
 .../cuda11.8-conda/devcontainer.json          |  6 +--
 .devcontainer/cuda11.8-pip/devcontainer.json  |  6 +--
 .../cuda12.2-conda/devcontainer.json          |  6 +--
 .devcontainer/cuda12.2-pip/devcontainer.json  |  6 +--
 .github/workflows/build.yaml                  | 16 ++++----
 .github/workflows/pandas-tests.yaml           |  2 +-
 .github/workflows/pr.yaml                     | 40 +++++++++----------
 .github/workflows/test.yaml                   | 22 +++++-----
 README.md                                     |  2 +-
 VERSION                                       |  2 +-
 .../all_cuda-118_arch-x86_64.yaml             | 10 ++---
 .../all_cuda-122_arch-x86_64.yaml             | 10 ++---
 cpp/examples/versions.cmake                   |  2 +-
 dependencies.yaml                             | 32 +++++++--------
 java/ci/README.md                             |  4 +-
 java/pom.xml                                  |  2 +-
 python/cudf/pyproject.toml                    |  4 +-
 python/cudf_kafka/pyproject.toml              |  2 +-
 python/cudf_polars/pyproject.toml             |  2 +-
 python/custreamz/pyproject.toml               |  4 +-
 python/dask_cudf/pyproject.toml               |  6 +--
 21 files changed, 93 insertions(+), 93 deletions(-)

diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json
index 944a73ecc98..c62e18512a0 100644
--- a/.devcontainer/cuda11.8-conda/devcontainer.json
+++ b/.devcontainer/cuda11.8-conda/devcontainer.json
@@ -5,17 +5,17 @@
     "args": {
       "CUDA": "11.8",
       "PYTHON_PACKAGE_MANAGER": "conda",
-      "BASE": "rapidsai/devcontainers:24.06-cpp-cuda11.8-mambaforge-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:24.08-cpp-cuda11.8-mambaforge-ubuntu22.04"
     }
   },
   "runArgs": [
     "--rm",
     "--name",
-    "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda11.8-conda"
+    "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.08-cuda11.8-conda"
   ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
-    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {}
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.8": {}
   },
   "overrideFeatureInstallOrder": [
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json
index 8b802333bda..4ab4bd75643 100644
--- a/.devcontainer/cuda11.8-pip/devcontainer.json
+++ b/.devcontainer/cuda11.8-pip/devcontainer.json
@@ -5,17 +5,17 @@
     "args": {
       "CUDA": "11.8",
       "PYTHON_PACKAGE_MANAGER": "pip",
-      "BASE": "rapidsai/devcontainers:24.06-cpp-cuda11.8-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:24.08-cpp-cuda11.8-ubuntu22.04"
     }
   },
   "runArgs": [
     "--rm",
     "--name",
-    "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda11.8-pip"
+    "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.08-cuda11.8-pip"
   ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
-    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {}
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.8": {}
   },
   "overrideFeatureInstallOrder": [
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
diff --git a/.devcontainer/cuda12.2-conda/devcontainer.json b/.devcontainer/cuda12.2-conda/devcontainer.json
index 886b07025cc..2b50454410f 100644
--- a/.devcontainer/cuda12.2-conda/devcontainer.json
+++ b/.devcontainer/cuda12.2-conda/devcontainer.json
@@ -5,17 +5,17 @@
     "args": {
       "CUDA": "12.2",
       "PYTHON_PACKAGE_MANAGER": "conda",
-      "BASE": "rapidsai/devcontainers:24.06-cpp-mambaforge-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:24.08-cpp-mambaforge-ubuntu22.04"
     }
   },
   "runArgs": [
     "--rm",
     "--name",
-    "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda12.2-conda"
+    "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.2-conda"
   ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
-    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {}
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.8": {}
   },
   "overrideFeatureInstallOrder": [
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
diff --git a/.devcontainer/cuda12.2-pip/devcontainer.json b/.devcontainer/cuda12.2-pip/devcontainer.json
index 86df56ada19..fc5abc56094 100644
--- a/.devcontainer/cuda12.2-pip/devcontainer.json
+++ b/.devcontainer/cuda12.2-pip/devcontainer.json
@@ -5,17 +5,17 @@
     "args": {
       "CUDA": "12.2",
       "PYTHON_PACKAGE_MANAGER": "pip",
-      "BASE": "rapidsai/devcontainers:24.06-cpp-cuda12.2-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:24.08-cpp-cuda12.2-ubuntu22.04"
     }
   },
   "runArgs": [
     "--rm",
     "--name",
-    "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda12.2-pip"
+    "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.2-pip"
   ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
-    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {}
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.8": {}
   },
   "overrideFeatureInstallOrder": [
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 6942ef0009d..c5679cc5141 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -28,7 +28,7 @@ concurrency:
 jobs:
   cpp-build:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.08
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -37,7 +37,7 @@ jobs:
   python-build:
     needs: [cpp-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -46,7 +46,7 @@ jobs:
   upload-conda:
     needs: [cpp-build, python-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.08
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -57,7 +57,7 @@ jobs:
     if: github.ref_type == 'branch'
     needs: python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08
     with:
       arch: "amd64"
       branch: ${{ inputs.branch }}
@@ -69,7 +69,7 @@ jobs:
       sha: ${{ inputs.sha }}
   wheel-build-cudf:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -79,7 +79,7 @@ jobs:
   wheel-publish-cudf:
     needs: wheel-build-cudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -89,7 +89,7 @@ jobs:
   wheel-build-dask-cudf:
     needs: wheel-publish-cudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
@@ -101,7 +101,7 @@ jobs:
   wheel-publish-dask-cudf:
     needs: wheel-build-dask-cudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
diff --git a/.github/workflows/pandas-tests.yaml b/.github/workflows/pandas-tests.yaml
index 60544294809..a8643923a4d 100644
--- a/.github/workflows/pandas-tests.yaml
+++ b/.github/workflows/pandas-tests.yaml
@@ -17,7 +17,7 @@ jobs:
   pandas-tests:
       # run the Pandas unit tests
       secrets: inherit
-      uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06
+      uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08
       with:
         matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and .CUDA_VER == "12.2.2" ))
         build_type: nightly
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index f9d5976f1fe..cb582df21e0 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -32,41 +32,41 @@ jobs:
       - pandas-tests
       - pandas-tests-diff
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.08
   checks:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.08
     with:
       enable_check_generated_files: false
   conda-cpp-build:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.08
     with:
       build_type: pull-request
   conda-cpp-checks:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.08
     with:
       build_type: pull-request
       enable_check_symbols: true
   conda-cpp-tests:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.08
     with:
       build_type: pull-request
   conda-python-build:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08
     with:
       build_type: pull-request
   conda-python-cudf-tests:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08
     with:
       build_type: pull-request
       script: "ci/test_python_cudf.sh"
@@ -74,14 +74,14 @@ jobs:
     # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08
     with:
       build_type: pull-request
       script: "ci/test_python_other.sh"
   conda-java-tests:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08
     with:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
@@ -91,7 +91,7 @@ jobs:
   static-configure:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08
     with:
       build_type: pull-request
       # Use the wheel container so we can skip conda solves and since our
@@ -101,7 +101,7 @@ jobs:
   conda-notebook-tests:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08
     with:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
@@ -111,7 +111,7 @@ jobs:
   docs-build:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08
     with:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
@@ -121,21 +121,21 @@ jobs:
   wheel-build-cudf:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08
     with:
       build_type: pull-request
       script: "ci/build_wheel_cudf.sh"
   wheel-tests-cudf:
     needs: wheel-build-cudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08
     with:
       build_type: pull-request
       script: ci/test_wheel_cudf.sh
   wheel-build-dask-cudf:
     needs: wheel-build-cudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
@@ -144,7 +144,7 @@ jobs:
   wheel-tests-dask-cudf:
     needs: wheel-build-dask-cudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
@@ -152,7 +152,7 @@ jobs:
       script: ci/test_wheel_dask_cudf.sh
   devcontainer:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.08
     with:
       arch: '["amd64"]'
       cuda: '["12.2"]'
@@ -163,7 +163,7 @@ jobs:
   unit-tests-cudf-pandas:
     needs: wheel-build-cudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08
     with:
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
       build_type: pull-request
@@ -172,7 +172,7 @@ jobs:
     # run the Pandas unit tests using PR branch
     needs: wheel-build-cudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08
     with:
       matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and .CUDA_VER == "12.2.2" ))
       build_type: pull-request
@@ -182,7 +182,7 @@ jobs:
   pandas-tests-diff:
     # diff the results of running the Pandas unit tests and publish a job summary
     needs: pandas-tests
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08
     with:
         node_type: cpu4
         build_type: pull-request
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 170f45e23fd..36c9088d93c 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -16,7 +16,7 @@ on:
 jobs:
   conda-cpp-checks:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.08
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -25,7 +25,7 @@ jobs:
       enable_check_symbols: true
   conda-cpp-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.08
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -33,7 +33,7 @@ jobs:
       sha: ${{ inputs.sha }}
   conda-cpp-memcheck-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -45,7 +45,7 @@ jobs:
       run_script: "ci/test_cpp_memcheck.sh"
   static-configure:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08
     with:
       build_type: pull-request
       # Use the wheel container so we can skip conda solves and since our
@@ -54,7 +54,7 @@ jobs:
       run_script: "ci/configure_cpp_static.sh"
   conda-python-cudf-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -64,7 +64,7 @@ jobs:
   conda-python-other-tests:
     # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -73,7 +73,7 @@ jobs:
       script: "ci/test_python_other.sh"
   conda-java-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -85,7 +85,7 @@ jobs:
       run_script: "ci/test_java.sh"
   conda-notebook-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -97,7 +97,7 @@ jobs:
       run_script: "ci/test_notebooks.sh"
   wheel-tests-cudf:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -106,7 +106,7 @@ jobs:
       script: ci/test_wheel_cudf.sh
   wheel-tests-dask-cudf:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
@@ -117,7 +117,7 @@ jobs:
       script: ci/test_wheel_dask_cudf.sh
   unit-tests-cudf-pandas:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
diff --git a/README.md b/README.md
index 205e16ea0e5..377998cd991 100644
--- a/README.md
+++ b/README.md
@@ -93,7 +93,7 @@ cuDF can be installed with conda (via [miniconda](https://docs.conda.io/projects
 
 ```bash
 conda install -c rapidsai -c conda-forge -c nvidia \
-    cudf=24.06 python=3.11 cuda-version=12.2
+    cudf=24.08 python=3.11 cuda-version=12.2
 ```
 
 We also provide [nightly Conda packages](https://anaconda.org/rapidsai-nightly) built from the HEAD
diff --git a/VERSION b/VERSION
index 0bff6981a3d..ec8489fda92 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-24.06.00
+24.08.00
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 48699b81eed..2ce1d9597e8 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -26,7 +26,7 @@ dependencies:
 - cupy>=12.0.0
 - cxx-compiler
 - cython>=3.0.3
-- dask-cuda==24.6.*
+- dask-cuda==24.8.*
 - dlpack>=0.8,<1.0
 - doxygen=1.9.1
 - fastavro>=0.22.9
@@ -43,10 +43,10 @@ dependencies:
 - libcufile=1.4.0.31
 - libcurand-dev=10.3.0.86
 - libcurand=10.3.0.86
-- libkvikio==24.6.*
+- libkvikio==24.8.*
 - libparquet==16.0.0.*
 - librdkafka>=1.9.0,<1.10.0a0
-- librmm==24.6.*
+- librmm==24.8.*
 - make
 - moto>=4.0.8
 - msgpack-python
@@ -76,9 +76,9 @@ dependencies:
 - python-confluent-kafka>=1.9.0,<1.10.0a0
 - python>=3.9,<3.12
 - pytorch>=2.1.0
-- rapids-dask-dependency==24.6.*
+- rapids-dask-dependency==24.8.*
 - rich
-- rmm==24.6.*
+- rmm==24.8.*
 - s3fs>=2022.3.0
 - scikit-build-core>=0.7.0
 - scipy
diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml
index d06a727f331..64d97dd742e 100644
--- a/conda/environments/all_cuda-122_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-122_arch-x86_64.yaml
@@ -27,7 +27,7 @@ dependencies:
 - cupy>=12.0.0
 - cxx-compiler
 - cython>=3.0.3
-- dask-cuda==24.6.*
+- dask-cuda==24.8.*
 - dlpack>=0.8,<1.0
 - doxygen=1.9.1
 - fastavro>=0.22.9
@@ -42,10 +42,10 @@ dependencies:
 - libarrow==16.0.0.*
 - libcufile-dev
 - libcurand-dev
-- libkvikio==24.6.*
+- libkvikio==24.8.*
 - libparquet==16.0.0.*
 - librdkafka>=1.9.0,<1.10.0a0
-- librmm==24.6.*
+- librmm==24.8.*
 - make
 - moto>=4.0.8
 - msgpack-python
@@ -74,9 +74,9 @@ dependencies:
 - python-confluent-kafka>=1.9.0,<1.10.0a0
 - python>=3.9,<3.12
 - pytorch>=2.1.0
-- rapids-dask-dependency==24.6.*
+- rapids-dask-dependency==24.8.*
 - rich
-- rmm==24.6.*
+- rmm==24.8.*
 - s3fs>=2022.3.0
 - scikit-build-core>=0.7.0
 - scipy
diff --git a/cpp/examples/versions.cmake b/cpp/examples/versions.cmake
index dff66b4d7d8..144b3d3721b 100644
--- a/cpp/examples/versions.cmake
+++ b/cpp/examples/versions.cmake
@@ -12,4 +12,4 @@
 # the License.
 # =============================================================================
 
-set(CUDF_TAG branch-24.06)
+set(CUDF_TAG branch-24.08)
diff --git a/dependencies.yaml b/dependencies.yaml
index f20c1591e73..39290fd2b93 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -270,8 +270,8 @@ dependencies:
       - output_types: conda
         packages:
           - fmt>=10.1.1,<11
-          - librmm==24.6.*
-          - libkvikio==24.6.*
+          - librmm==24.8.*
+          - libkvikio==24.8.*
           - librdkafka>=1.9.0,<1.10.0a0
           # Align nvcomp version with rapids-cmake
           - nvcomp==3.0.6
@@ -305,7 +305,7 @@ dependencies:
     common:
       - output_types: conda
         packages:
-          - &rmm_conda rmm==24.6.*
+          - &rmm_conda rmm==24.8.*
           - pip
           - pip:
               - git+https://github.com/python-streamz/streamz.git@master
@@ -321,10 +321,10 @@ dependencies:
         matrices:
           - matrix: {cuda: "12.*"}
             packages: &build_python_packages_cu12
-              - &rmm_cu12 rmm-cu12==24.6.*
+              - &rmm_cu12 rmm-cu12==24.8.*
           - matrix: {cuda: "11.*"}
             packages: &build_python_packages_cu11
-              - &rmm_cu11 rmm-cu11==24.6.*
+              - &rmm_cu11 rmm-cu11==24.8.*
           - {matrix: null, packages: [*rmm_conda] }
   libarrow_build:
     common:
@@ -477,7 +477,7 @@ dependencies:
       - output_types: [conda]
         packages:
           - breathe>=4.35.0
-          - dask-cuda==24.6.*
+          - dask-cuda==24.8.*
           - *doxygen
           - make
           - myst-nb
@@ -568,11 +568,11 @@ dependencies:
         matrices:
           - matrix: {cuda: "12.*"}
             packages:
-              - rmm-cu12==24.6.*
+              - rmm-cu12==24.8.*
               - pynvjitlink-cu12
           - matrix: {cuda: "11.*"}
             packages:
-              - rmm-cu11==24.6.*
+              - rmm-cu11==24.8.*
               - cubinlinker-cu11
               - ptxcompiler-cu11
           - {matrix: null, packages: [cubinlinker, ptxcompiler, *rmm_conda]}
@@ -585,7 +585,7 @@ dependencies:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
-          - rapids-dask-dependency==24.6.*
+          - rapids-dask-dependency==24.8.*
   run_custreamz:
     common:
       - output_types: conda
@@ -671,13 +671,13 @@ dependencies:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
-          - dask-cuda==24.6.*
+          - dask-cuda==24.8.*
           - *numba
   depends_on_cudf:
     common:
       - output_types: conda
         packages:
-          - &cudf_conda cudf==24.6.*
+          - &cudf_conda cudf==24.8.*
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
@@ -689,16 +689,16 @@ dependencies:
         matrices:
           - matrix: {cuda: "12.*"}
             packages:
-              - cudf-cu12==24.6.*
+              - cudf-cu12==24.8.*
           - matrix: {cuda: "11.*"}
             packages:
-              - cudf-cu11==24.6.*
+              - cudf-cu11==24.8.*
           - {matrix: null, packages: [*cudf_conda]}
   depends_on_cudf_kafka:
     common:
       - output_types: conda
         packages:
-          - &cudf_kafka_conda cudf_kafka==24.6.*
+          - &cudf_kafka_conda cudf_kafka==24.8.*
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
@@ -710,10 +710,10 @@ dependencies:
         matrices:
           - matrix: {cuda: "12.*"}
             packages:
-              - cudf_kafka-cu12==24.6.*
+              - cudf_kafka-cu12==24.8.*
           - matrix: {cuda: "11.*"}
             packages:
-              - cudf_kafka-cu11==24.6.*
+              - cudf_kafka-cu11==24.8.*
           - {matrix: null, packages: [*cudf_kafka_conda]}
   depends_on_cupy:
     common:
diff --git a/java/ci/README.md b/java/ci/README.md
index 18ad3cc4d0d..49481efab6b 100644
--- a/java/ci/README.md
+++ b/java/ci/README.md
@@ -34,7 +34,7 @@ nvidia-docker run -it cudf-build:11.8.0-devel-rocky8 bash
 You can download the cuDF repo in the docker container or you can mount it into the container.
 Here I choose to download again in the container.
 ```bash
-git clone --recursive https://github.com/rapidsai/cudf.git -b branch-24.06
+git clone --recursive https://github.com/rapidsai/cudf.git -b branch-24.08
 ```
 
 ### Build cuDF jar with devtoolset
@@ -47,4 +47,4 @@ scl enable gcc-toolset-11 "java/ci/build-in-docker.sh"
 
 ### The output
 
-You can find the cuDF jar in java/target/ like cudf-24.06.0-SNAPSHOT-cuda11.jar.
+You can find the cuDF jar in java/target/ like cudf-24.08.0-SNAPSHOT-cuda11.jar.
diff --git a/java/pom.xml b/java/pom.xml
index 46b5ce4c083..70230e6bc71 100644
--- a/java/pom.xml
+++ b/java/pom.xml
@@ -21,7 +21,7 @@
 
     <groupId>ai.rapids</groupId>
     <artifactId>cudf</artifactId>
-    <version>24.06.0-SNAPSHOT</version>
+    <version>24.08.0-SNAPSHOT</version>
 
     <name>cudfjni</name>
     <description>
diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml
index 826362f0632..1b7bb106d49 100644
--- a/python/cudf/pyproject.toml
+++ b/python/cudf/pyproject.toml
@@ -8,7 +8,7 @@ requires = [
     "ninja",
     "numpy==1.23.*",
     "pyarrow==16.0.0.*",
-    "rmm==24.6.*",
+    "rmm==24.8.*",
     "scikit-build-core[pyproject]>=0.7.0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 
@@ -36,7 +36,7 @@ dependencies = [
     "ptxcompiler",
     "pyarrow>=16.0.0,<16.1.0a0",
     "rich",
-    "rmm==24.6.*",
+    "rmm==24.8.*",
     "typing_extensions>=4.0.0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml
index 787dd8a97d7..b1bb4c5bd24 100644
--- a/python/cudf_kafka/pyproject.toml
+++ b/python/cudf_kafka/pyproject.toml
@@ -22,7 +22,7 @@ authors = [
 license = { text = "Apache 2.0" }
 requires-python = ">=3.9"
 dependencies = [
-    "cudf==24.6.*",
+    "cudf==24.8.*",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 
 [project.optional-dependencies]
diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml
index de26a3eb51c..00fde6c0e05 100644
--- a/python/cudf_polars/pyproject.toml
+++ b/python/cudf_polars/pyproject.toml
@@ -18,7 +18,7 @@ authors = [
 license = { text = "Apache 2.0" }
 requires-python = ">=3.9"
 dependencies = [
-    "cudf==24.6.*",
+    "cudf==24.8.*",
     "polars>=0.20.24",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
diff --git a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml
index 7786bf98bef..f7e5698900a 100644
--- a/python/custreamz/pyproject.toml
+++ b/python/custreamz/pyproject.toml
@@ -19,8 +19,8 @@ license = { text = "Apache 2.0" }
 requires-python = ">=3.9"
 dependencies = [
     "confluent-kafka>=1.9.0,<1.10.0a0",
-    "cudf==24.6.*",
-    "cudf_kafka==24.6.*",
+    "cudf==24.8.*",
+    "cudf_kafka==24.8.*",
     "streamz",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml
index 5fbdd98225e..e353eac06b9 100644
--- a/python/dask_cudf/pyproject.toml
+++ b/python/dask_cudf/pyproject.toml
@@ -18,12 +18,12 @@ authors = [
 license = { text = "Apache 2.0" }
 requires-python = ">=3.9"
 dependencies = [
-    "cudf==24.6.*",
+    "cudf==24.8.*",
     "cupy-cuda11x>=12.0.0",
     "fsspec>=0.6.0",
     "numpy>=1.23,<2.0a0",
     "pandas>=2.0,<2.2.3dev0",
-    "rapids-dask-dependency==24.6.*",
+    "rapids-dask-dependency==24.8.*",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
     "Intended Audience :: Developers",
@@ -44,7 +44,7 @@ cudf = "dask_cudf.backends:CudfDXBackendEntrypoint"
 
 [project.optional-dependencies]
 test = [
-    "dask-cuda==24.6.*",
+    "dask-cuda==24.8.*",
     "numba>=0.57",
     "pytest-cov",
     "pytest-xdist",

From 58f45269b2f0dc2edada61dd07a57c3cb1cf565e Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Mon, 20 May 2024 15:28:50 -0700
Subject: [PATCH 230/842] Add default pinned pool that falls back to new pinned
 allocations (#15665)

Issue #15612

Adds a pooled pinned memory resource that is created on first call to `get_host_memory_resource` or `set_host_memory_resource`.
The pool has a fixed size: 0.5% of the device memory capacity, limited to 100MB. At 100MB, the pool takes ~30ms to initialize. Size of the pool can be overridden with environment variable `LIBCUDF_PINNED_POOL_SIZE`.
If an allocation cannot be done within the pool, a new pinned allocation is performed.
The allocator uses a stream from the global stream pool to initialize and perform synchronous operations (`allocate`/`deallocate`). Users of the resource don't need to be aware of this implementation detail as these operations synchronize before they are completed.

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Alessandro Bellina (https://github.com/abellina)
  - Jake Hemstad (https://github.com/jrhemstad)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15665
---
 .../cudf/detail/utilities/stream_pool.hpp     |   7 +-
 cpp/include/cudf/io/memory_resource.hpp       |  19 ++
 cpp/src/io/utilities/config_utils.cpp         | 191 ++++++++++++++++--
 3 files changed, 202 insertions(+), 15 deletions(-)

diff --git a/cpp/include/cudf/detail/utilities/stream_pool.hpp b/cpp/include/cudf/detail/utilities/stream_pool.hpp
index 19ef26a10cb..e19cc3ec2f7 100644
--- a/cpp/include/cudf/detail/utilities/stream_pool.hpp
+++ b/cpp/include/cudf/detail/utilities/stream_pool.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -81,6 +81,11 @@ class cuda_stream_pool {
  */
 cuda_stream_pool* create_global_cuda_stream_pool();
 
+/**
+ * @brief Get the global stream pool.
+ */
+cuda_stream_pool& global_cuda_stream_pool();
+
 /**
  * @brief Acquire a set of `cuda_stream_view` objects and synchronize them to an event on another
  * stream.
diff --git a/cpp/include/cudf/io/memory_resource.hpp b/cpp/include/cudf/io/memory_resource.hpp
index ea79d6a3029..e31ebce4b1f 100644
--- a/cpp/include/cudf/io/memory_resource.hpp
+++ b/cpp/include/cudf/io/memory_resource.hpp
@@ -18,6 +18,8 @@
 
 #include <rmm/resource_ref.hpp>
 
+#include <optional>
+
 namespace cudf::io {
 
 /**
@@ -41,4 +43,21 @@ rmm::host_async_resource_ref set_host_memory_resource(rmm::host_async_resource_r
  */
 rmm::host_async_resource_ref get_host_memory_resource();
 
+/**
+ * @brief Options to configure the default host memory resource
+ */
+struct host_mr_options {
+  std::optional<size_t> pool_size;  ///< The size of the pool to use for the default host memory
+                                    ///< resource. If not set, the default pool size is used.
+};
+
+/**
+ * @brief Configure the size of the default host memory resource.
+ *
+ * @throws cudf::logic_error if called after the default host memory resource has been created
+ *
+ * @param opts Options to configure the default host memory resource
+ */
+void config_default_host_memory_resource(host_mr_options const& opts);
+
 }  // namespace cudf::io
diff --git a/cpp/src/io/utilities/config_utils.cpp b/cpp/src/io/utilities/config_utils.cpp
index 2f7a6131e3d..7720c073a97 100644
--- a/cpp/src/io/utilities/config_utils.cpp
+++ b/cpp/src/io/utilities/config_utils.cpp
@@ -16,10 +16,13 @@
 
 #include "config_utils.hpp"
 
+#include <cudf/detail/utilities/stream_pool.hpp>
+#include <cudf/io/memory_resource.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/export.hpp>
 
 #include <rmm/cuda_device.hpp>
+#include <rmm/mr/device/pool_memory_resource.hpp>
 #include <rmm/mr/pinned_host_memory_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
@@ -87,38 +90,198 @@ bool is_stable_enabled() { return is_all_enabled() or get_env_policy() == usage_
 
 }  // namespace nvcomp_integration
 
-inline std::mutex& host_mr_lock()
+}  // namespace detail
+
+namespace {
+class fixed_pinned_pool_memory_resource {
+  using upstream_mr    = rmm::mr::pinned_host_memory_resource;
+  using host_pooled_mr = rmm::mr::pool_memory_resource<upstream_mr>;
+
+ private:
+  upstream_mr upstream_mr_{};
+  size_t pool_size_{0};
+  // Raw pointer to avoid a segfault when the pool is destroyed on exit
+  host_pooled_mr* pool_{nullptr};
+  void* pool_begin_{nullptr};
+  void* pool_end_{nullptr};
+  cuda::stream_ref stream_{cudf::detail::global_cuda_stream_pool().get_stream().value()};
+
+ public:
+  fixed_pinned_pool_memory_resource(size_t size)
+    : pool_size_{size}, pool_{new host_pooled_mr(upstream_mr_, size, size)}
+  {
+    if (pool_size_ == 0) { return; }
+
+    // Allocate full size from the pinned pool to figure out the beginning and end address
+    pool_begin_ = pool_->allocate_async(pool_size_, stream_);
+    pool_end_   = static_cast<void*>(static_cast<uint8_t*>(pool_begin_) + pool_size_);
+    pool_->deallocate_async(pool_begin_, pool_size_, stream_);
+  }
+
+  void* do_allocate_async(std::size_t bytes, std::size_t alignment, cuda::stream_ref stream)
+  {
+    if (bytes <= pool_size_) {
+      try {
+        return pool_->allocate_async(bytes, alignment, stream);
+      } catch (...) {
+        // If the pool is exhausted, fall back to the upstream memory resource
+      }
+    }
+
+    return upstream_mr_.allocate_async(bytes, alignment, stream);
+  }
+
+  void do_deallocate_async(void* ptr,
+                           std::size_t bytes,
+                           std::size_t alignment,
+                           cuda::stream_ref stream) noexcept
+  {
+    if (bytes <= pool_size_ && ptr >= pool_begin_ && ptr <= pool_end_) {
+      pool_->deallocate_async(ptr, bytes, alignment, stream);
+    } else {
+      upstream_mr_.deallocate_async(ptr, bytes, alignment, stream);
+    }
+  }
+
+  void* allocate_async(std::size_t bytes, cuda::stream_ref stream)
+  {
+    return do_allocate_async(bytes, rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream);
+  }
+
+  void* allocate_async(std::size_t bytes, std::size_t alignment, cuda::stream_ref stream)
+  {
+    return do_allocate_async(bytes, alignment, stream);
+  }
+
+  void* allocate(std::size_t bytes, std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT)
+  {
+    auto const result = do_allocate_async(bytes, alignment, stream_);
+    stream_.wait();
+    return result;
+  }
+
+  void deallocate_async(void* ptr, std::size_t bytes, cuda::stream_ref stream) noexcept
+  {
+    return do_deallocate_async(ptr, bytes, rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream);
+  }
+
+  void deallocate_async(void* ptr,
+                        std::size_t bytes,
+                        std::size_t alignment,
+                        cuda::stream_ref stream) noexcept
+  {
+    return do_deallocate_async(ptr, bytes, alignment, stream);
+  }
+
+  void deallocate(void* ptr,
+                  std::size_t bytes,
+                  std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) noexcept
+  {
+    deallocate_async(ptr, bytes, alignment, stream_);
+    stream_.wait();
+  }
+
+  bool operator==(fixed_pinned_pool_memory_resource const& other) const
+  {
+    return pool_ == other.pool_ and stream_ == other.stream_;
+  }
+
+  bool operator!=(fixed_pinned_pool_memory_resource const& other) const
+  {
+    return !operator==(other);
+  }
+
+  [[maybe_unused]] friend void get_property(fixed_pinned_pool_memory_resource const&,
+                                            cuda::mr::device_accessible) noexcept
+  {
+  }
+
+  [[maybe_unused]] friend void get_property(fixed_pinned_pool_memory_resource const&,
+                                            cuda::mr::host_accessible) noexcept
+  {
+  }
+};
+
+static_assert(cuda::mr::resource_with<fixed_pinned_pool_memory_resource,
+                                      cuda::mr::device_accessible,
+                                      cuda::mr::host_accessible>,
+              "");
+
+}  // namespace
+
+CUDF_EXPORT rmm::host_async_resource_ref& make_default_pinned_mr(std::optional<size_t> config_size)
+{
+  static fixed_pinned_pool_memory_resource mr = [config_size]() {
+    auto const size = [&config_size]() -> size_t {
+      if (auto const env_val = getenv("LIBCUDF_PINNED_POOL_SIZE"); env_val != nullptr) {
+        return std::atol(env_val);
+      }
+
+      if (config_size.has_value()) { return *config_size; }
+
+      size_t free{}, total{};
+      CUDF_CUDA_TRY(cudaMemGetInfo(&free, &total));
+      // 0.5% of the total device memory, capped at 100MB
+      return std::min(total / 200, size_t{100} * 1024 * 1024);
+    }();
+
+    // rmm requires the pool size to be a multiple of 256 bytes
+    auto const aligned_size = (size + 255) & ~255;
+    CUDF_LOG_INFO("Pinned pool size = {}", aligned_size);
+
+    // make the pool with max size equal to the initial size
+    return fixed_pinned_pool_memory_resource{aligned_size};
+  }();
+
+  static rmm::host_async_resource_ref mr_ref{mr};
+  return mr_ref;
+}
+
+CUDF_EXPORT std::mutex& host_mr_mutex()
 {
   static std::mutex map_lock;
   return map_lock;
 }
 
-inline rmm::host_async_resource_ref default_pinned_mr()
+// Must be called with the host_mr_mutex mutex held
+CUDF_EXPORT rmm::host_async_resource_ref& make_host_mr(std::optional<host_mr_options> const& opts)
 {
-  static rmm::mr::pinned_host_memory_resource default_mr{};
-  return default_mr;
+  static rmm::host_async_resource_ref* mr_ref = nullptr;
+  if (mr_ref == nullptr) {
+    mr_ref = &make_default_pinned_mr(opts ? opts->pool_size : std::nullopt);
+  } else {
+    // Throw an error if the user tries to reconfigure the default host resource
+    CUDF_EXPECTS(opts == std::nullopt, "The default host memory resource has already been created");
+  }
+
+  return *mr_ref;
 }
 
-CUDF_EXPORT inline auto& host_mr()
+// Must be called with the host_mr_mutex mutex held
+CUDF_EXPORT rmm::host_async_resource_ref& host_mr()
 {
-  static rmm::host_async_resource_ref host_mr = default_pinned_mr();
-  return host_mr;
+  static rmm::host_async_resource_ref mr_ref = make_host_mr(std::nullopt);
+  return mr_ref;
 }
 
-}  // namespace detail
-
 rmm::host_async_resource_ref set_host_memory_resource(rmm::host_async_resource_ref mr)
 {
-  std::lock_guard lock{detail::host_mr_lock()};
-  auto last_mr      = detail::host_mr();
-  detail::host_mr() = mr;
+  std::scoped_lock lock{host_mr_mutex()};
+  auto last_mr = host_mr();
+  host_mr()    = mr;
   return last_mr;
 }
 
 rmm::host_async_resource_ref get_host_memory_resource()
 {
-  std::lock_guard lock{detail::host_mr_lock()};
-  return detail::host_mr();
+  std::scoped_lock lock{host_mr_mutex()};
+  return host_mr();
+}
+
+void config_default_host_memory_resource(host_mr_options const& opts)
+{
+  std::scoped_lock lock{host_mr_mutex()};
+  make_host_mr(opts);
 }
 
 }  // namespace cudf::io

From 4da00eab26b10cd9445d7cb69373608f5685bb01 Mon Sep 17 00:00:00 2001
From: "Richard (Rick) Zamora" <rzamora217@gmail.com>
Date: Mon, 20 May 2024 18:59:35 -0500
Subject: [PATCH 231/842] Raise error when sorting by categorical column in
 dask-cudf (#15788)

Some dask-cudf tests are currently producing a segfault when sorting by categorical columns. These tests were already marked as "xfail". This PR goes one step further, and raises an error in the top-level `sort_values` API. This `NotImplementedError` can be removed as soon as the problem is fixed up-stream (working on this now, but probably won't be available for 24.06).

Authors:
  - Richard (Rick) Zamora (https://github.com/rjzamora)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15788
---
 .../dask_cudf/dask_cudf/expr/_collection.py   | 19 +++++++++++++++++++
 python/dask_cudf/dask_cudf/tests/test_sort.py | 18 ++++++++++++++++--
 2 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/python/dask_cudf/dask_cudf/expr/_collection.py b/python/dask_cudf/dask_cudf/expr/_collection.py
index d50dfb24256..926b7cfaf0e 100644
--- a/python/dask_cudf/dask_cudf/expr/_collection.py
+++ b/python/dask_cudf/dask_cudf/expr/_collection.py
@@ -15,6 +15,7 @@
 
 from dask import config
 from dask.dataframe.core import is_dataframe_like
+from dask.dataframe.dispatch import is_categorical_dtype
 
 import cudf
 
@@ -81,6 +82,24 @@ def from_dict(cls, *args, **kwargs):
         with config.set({"dataframe.backend": "cudf"}):
             return DXDataFrame.from_dict(*args, **kwargs)
 
+    def sort_values(
+        self,
+        by,
+        **kwargs,
+    ):
+        # Raise if the first column is categorical, otherwise the
+        # upstream divisions logic may produce errors
+        # (See: https://github.com/rapidsai/cudf/issues/11795)
+        check_by = by[0] if isinstance(by, list) else by
+        if is_categorical_dtype(self.dtypes.get(check_by, None)):
+            raise NotImplementedError(
+                "Dask-cudf does not support sorting on categorical "
+                "columns when query-planning is enabled. Please use "
+                "the legacy API for now."
+                f"\n{_LEGACY_WORKAROUND}",
+            )
+        return super().sort_values(by, **kwargs)
+
     def groupby(
         self,
         by,
diff --git a/python/dask_cudf/dask_cudf/tests/test_sort.py b/python/dask_cudf/dask_cudf/tests/test_sort.py
index 400600a1598..9d9fe297248 100644
--- a/python/dask_cudf/dask_cudf/tests/test_sort.py
+++ b/python/dask_cudf/dask_cudf/tests/test_sort.py
@@ -10,7 +10,7 @@
 import cudf
 
 import dask_cudf
-from dask_cudf.tests.utils import xfail_dask_expr
+from dask_cudf.tests.utils import QUERY_PLANNING_ON, xfail_dask_expr
 
 
 @pytest.mark.parametrize("ascending", [True, False])
@@ -23,7 +23,7 @@
         pytest.param(
             "d",
             marks=xfail_dask_expr(
-                "Dask-expr fails to sort by categorical column."
+                "Possible segfault when sorting by categorical column.",
             ),
         ),
         ["a", "b"],
@@ -47,6 +47,20 @@ def test_sort_values(nelem, nparts, by, ascending):
     dd.assert_eq(got, expect, check_index=False)
 
 
+@pytest.mark.parametrize("by", ["b", ["b", "a"]])
+def test_sort_values_categorical_raises(by):
+    df = cudf.DataFrame()
+    df["a"] = np.ascontiguousarray(np.arange(10)[::-1])
+    df["b"] = df["a"].astype("category")
+    ddf = dd.from_pandas(df, npartitions=10)
+
+    if QUERY_PLANNING_ON:
+        with pytest.raises(
+            NotImplementedError, match="sorting on categorical"
+        ):
+            ddf.sort_values(by=by)
+
+
 @pytest.mark.parametrize("ascending", [True, False])
 @pytest.mark.parametrize("by", ["a", "b", ["a", "b"]])
 def test_sort_values_single_partition(by, ascending):

From 6b1248e62dd35d9c5343a540cf655fe967a4d02a Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Mon, 20 May 2024 20:04:58 -0500
Subject: [PATCH 232/842] Raise errors for unsupported operations on certain
 types (#15712)

Fixes: https://github.com/rapidsai/cudf/issues/15668

This PR raises errors for groupby operations on un-supported types.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/15712
---
 python/cudf/cudf/_lib/groupby.pyx      | 42 +++++++++++++++++++++++-
 python/cudf/cudf/tests/test_groupby.py | 45 ++++++++++++++++++++++----
 2 files changed, 80 insertions(+), 7 deletions(-)

diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx
index 7533ed56647..9d18e023fe8 100644
--- a/python/cudf/cudf/_lib/groupby.pyx
+++ b/python/cudf/cudf/_lib/groupby.pyx
@@ -3,7 +3,7 @@ from functools import singledispatch
 
 from pandas.errors import DataError
 
-from cudf.api.types import is_string_dtype
+from cudf.api.types import _is_categorical_dtype, is_string_dtype
 from cudf.core.buffer import acquire_spill_lock
 from cudf.core.dtypes import (
     CategoricalDtype,
@@ -167,6 +167,46 @@ cdef class GroupBy:
             included_aggregations_i = []
             col_aggregations = []
             for agg in aggs:
+                str_agg = str(agg)
+                if (
+                    is_string_dtype(col)
+                    and agg not in _STRING_AGGS
+                    and
+                    (
+                        str_agg in {"cumsum", "cummin", "cummax"}
+                        or not (
+                        any(a in str_agg for a in {
+                            "count",
+                            "max",
+                            "min",
+                            "first",
+                            "last",
+                            "nunique",
+                            "unique",
+                            "nth"
+                        })
+                        or (agg is list)
+                        )
+                    )
+                ):
+                    raise TypeError(
+                        f"function is not supported for this dtype: {agg}"
+                    )
+                elif (
+                    _is_categorical_dtype(col)
+                    and agg not in _CATEGORICAL_AGGS
+                    and (
+                        str_agg in {"cumsum", "cummin", "cummax"}
+                        or
+                        not (
+                            any(a in str_agg for a in {"count", "max", "min", "unique"})
+                        )
+                    )
+                ):
+                    raise TypeError(
+                        f"{col.dtype} type does not support {agg} operations"
+                    )
+
                 agg_obj = make_aggregation(agg)
                 if valid_aggregations == "ALL" or agg_obj.kind in valid_aggregations:
                     included_aggregations_i.append((agg, agg_obj.kind))
diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py
index c139b06d20f..674f694a224 100644
--- a/python/cudf/cudf/tests/test_groupby.py
+++ b/python/cudf/cudf/tests/test_groupby.py
@@ -504,7 +504,6 @@ def test_groupby_apply_jit_unary_reductions(
     func, dtype, dataset, groupby_jit_datasets
 ):
     dataset = groupby_jit_datasets[dataset]
-
     groupby_apply_jit_reductions_test_inner(func, dataset, dtype)
 
 
@@ -1891,9 +1890,6 @@ def test_groupby_nth(n, by):
     assert_groupby_results_equal(expect, got, check_dtype=False)
 
 
-@pytest.mark.xfail(
-    reason="https://github.com/pandas-dev/pandas/issues/43209",
-)
 def test_raise_data_error():
     pdf = pd.DataFrame({"a": [1, 2, 3, 4], "b": ["a", "b", "c", "d"]})
     gdf = cudf.from_pandas(pdf)
@@ -1904,12 +1900,13 @@ def test_raise_data_error():
     )
 
 
-def test_drop_unsupported_multi_agg():
+def test_multi_agg():
     gdf = cudf.DataFrame(
         {"a": [1, 1, 2, 2], "b": [1, 2, 3, 4], "c": ["a", "b", "c", "d"]}
     )
+    pdf = gdf.to_pandas()
     assert_groupby_results_equal(
-        gdf.groupby("a").agg(["count", "mean"]),
+        pdf.groupby("a").agg({"b": ["count", "mean"], "c": ["count"]}),
         gdf.groupby("a").agg({"b": ["count", "mean"], "c": ["count"]}),
     )
 
@@ -3852,3 +3849,39 @@ def test_group_by_reduce_numeric_only(by, data, func):
     )
     result = getattr(df.groupby(by, sort=True), func)(numeric_only=True)
     assert_eq(expected, result)
+
+
+@pytest.mark.parametrize(
+    "op", ["cummax", "cummin", "cumprod", "cumsum", "mean", "median"]
+)
+def test_group_by_raises_string_error(op):
+    df = cudf.DataFrame({"a": [1, 2, 3, 4, 5], "b": ["a", "b", "c", "d", "e"]})
+
+    with pytest.raises(TypeError):
+        df.groupby(df.a).agg(op)
+
+
+@pytest.mark.parametrize(
+    "op",
+    [
+        "cummax",
+        "cummin",
+        "cumprod",
+        "cumsum",
+        "mean",
+        "median",
+        "prod",
+        "sum",
+        list,
+    ],
+)
+def test_group_by_raises_category_error(op):
+    df = cudf.DataFrame(
+        {
+            "a": [1, 2, 3, 4, 5],
+            "b": cudf.Series(["a", "b", "c", "d", "e"], dtype="category"),
+        }
+    )
+
+    with pytest.raises(TypeError):
+        df.groupby(df.a).agg(op)

From 1dd19102d0df7b8523e29a921c62654463278b43 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Mon, 20 May 2024 20:18:22 -0500
Subject: [PATCH 233/842] Add support for `PandasArray` for `pandas<2.1.0`
 (#15786)

Only `pandas-2.1.0+` has support for `NumpyExtensionArray` and any version below that only have support for `PandasArray`. This PR make `cudf.pandas` back-ward compatible in that aspect.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - https://github.com/brandon-b-miller
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/15786
---
 python/cudf/cudf/pandas/_wrappers/pandas.py   | 40 ++++++++++++++-----
 .../cudf_pandas_tests/test_cudf_pandas.py     |  8 +++-
 2 files changed, 35 insertions(+), 13 deletions(-)

diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py
index 29aaaac245d..2e3880e14f6 100644
--- a/python/cudf/cudf/pandas/_wrappers/pandas.py
+++ b/python/cudf/cudf/pandas/_wrappers/pandas.py
@@ -386,17 +386,35 @@ def Index__new__(cls, *args, **kwargs):
     },
 )
 
-NumpyExtensionArray = make_final_proxy_type(
-    "NumpyExtensionArray",
-    _Unusable,
-    pd.arrays.NumpyExtensionArray,
-    fast_to_slow=_Unusable(),
-    slow_to_fast=_Unusable(),
-    additional_attributes={
-        "_ndarray": _FastSlowAttribute("_ndarray"),
-        "_dtype": _FastSlowAttribute("_dtype"),
-    },
-)
+try:
+    from pandas.arrays import NumpyExtensionArray as pd_NumpyExtensionArray
+
+    NumpyExtensionArray = make_final_proxy_type(
+        "NumpyExtensionArray",
+        _Unusable,
+        pd_NumpyExtensionArray,
+        fast_to_slow=_Unusable(),
+        slow_to_fast=_Unusable(),
+        additional_attributes={
+            "_ndarray": _FastSlowAttribute("_ndarray"),
+            "_dtype": _FastSlowAttribute("_dtype"),
+        },
+    )
+
+except ImportError:
+    from pandas.arrays import PandasArray as pd_PandasArray
+
+    PandasArray = make_final_proxy_type(
+        "PandasArray",
+        _Unusable,
+        pd_PandasArray,
+        fast_to_slow=_Unusable(),
+        slow_to_fast=_Unusable(),
+        additional_attributes={
+            "_ndarray": _FastSlowAttribute("_ndarray"),
+            "_dtype": _FastSlowAttribute("_dtype"),
+        },
+    )
 
 TimedeltaArray = make_final_proxy_type(
     "TimedeltaArray",
diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
index e3d4f878ad5..75bceea3034 100644
--- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
+++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
@@ -1241,8 +1241,12 @@ def test_pickle_groupby(dataframe):
 
 def test_numpy_extension_array():
     np_array = np.array([0, 1, 2, 3])
-    xarray = xpd.arrays.NumpyExtensionArray(np_array)
-    array = pd.arrays.NumpyExtensionArray(np_array)
+    try:
+        xarray = xpd.arrays.NumpyExtensionArray(np_array)
+        array = pd.arrays.NumpyExtensionArray(np_array)
+    except AttributeError:
+        xarray = xpd.arrays.PandasArray(np_array)
+        array = pd.arrays.PandasArray(np_array)
 
     tm.assert_equal(xarray, array)
 

From eb7b50a293f47afac8ba4166c7bb0059d940b1c9 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Mon, 20 May 2024 18:33:02 -0700
Subject: [PATCH 234/842] Support filtered I/O in `chunked_parquet_reader` and
 simplify the use of `parquet_reader_options` (#15764)

This PR does the following:

1. It enables the support for filtered I/O in chunked parquet reader.
2. It simplifies the use of `parquet_reader_options` in `parquet::readers` by taking and saving the options at reader construction for later use instead of passing around `options` as arguments from `read()`, `has_next()` and `chunked_read()` to `prepare_data()`, `read_chunk_internal()` and several other internal APIs.

Authors:
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Vukasin Milovanovic (https://github.com/vuule)
  - Karthikeyan (https://github.com/karthikeyann)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/15764
---
 cpp/include/cudf/io/detail/parquet.hpp       |   4 +-
 cpp/src/io/functions.cpp                     |   2 +-
 cpp/src/io/parquet/reader.cpp                |  12 +-
 cpp/src/io/parquet/reader_impl.cpp           |  89 ++++++--------
 cpp/src/io/parquet/reader_impl.hpp           | 118 +++++++++----------
 cpp/src/io/parquet/reader_impl_chunking.cu   |  20 ++--
 cpp/src/io/parquet/reader_impl_preprocess.cu |  29 ++---
 7 files changed, 116 insertions(+), 158 deletions(-)

diff --git a/cpp/include/cudf/io/detail/parquet.hpp b/cpp/include/cudf/io/detail/parquet.hpp
index fcf5f0d9290..978216d971e 100644
--- a/cpp/include/cudf/io/detail/parquet.hpp
+++ b/cpp/include/cudf/io/detail/parquet.hpp
@@ -76,11 +76,9 @@ class reader {
   /**
    * @brief Reads the dataset as per given options.
    *
-   * @param options Settings for controlling reading behavior
-   *
    * @return The set of columns along with table metadata
    */
-  table_with_metadata read(parquet_reader_options const& options);
+  table_with_metadata read();
 };
 
 /**
diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp
index 0358a1a6b86..3ba2facf276 100644
--- a/cpp/src/io/functions.cpp
+++ b/cpp/src/io/functions.cpp
@@ -548,7 +548,7 @@ table_with_metadata read_parquet(parquet_reader_options const& options,
   auto reader =
     std::make_unique<detail_parquet::reader>(std::move(datasources), options, stream, mr);
 
-  return reader->read(options);
+  return reader->read();
 }
 
 parquet_metadata read_parquet_metadata(source_info const& src_info)
diff --git a/cpp/src/io/parquet/reader.cpp b/cpp/src/io/parquet/reader.cpp
index 170f7503134..8dfd68cd9b8 100644
--- a/cpp/src/io/parquet/reader.cpp
+++ b/cpp/src/io/parquet/reader.cpp
@@ -32,17 +32,7 @@ reader::reader(std::vector<std::unique_ptr<datasource>>&& sources,
 
 reader::~reader() = default;
 
-table_with_metadata reader::read(parquet_reader_options const& options)
-{
-  // if the user has specified custom row bounds
-  bool const uses_custom_row_bounds =
-    options.get_num_rows().has_value() || options.get_skip_rows() != 0;
-  return _impl->read(options.get_skip_rows(),
-                     options.get_num_rows(),
-                     uses_custom_row_bounds,
-                     options.get_row_groups(),
-                     options.get_filter());
-}
+table_with_metadata reader::read() { return _impl->read(); }
 
 chunked_reader::chunked_reader(std::size_t chunk_read_limit,
                                std::size_t pass_read_limit,
diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp
index fba95093c9c..1bd2fae281c 100644
--- a/cpp/src/io/parquet/reader_impl.cpp
+++ b/cpp/src/io/parquet/reader_impl.cpp
@@ -46,7 +46,7 @@ inline bool is_treat_fixed_length_as_string(thrust::optional<LogicalType> const&
 
 }  // namespace
 
-void reader::impl::decode_page_data(bool uses_custom_row_bounds, size_t skip_rows, size_t num_rows)
+void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num_rows)
 {
   auto& pass    = *_pass_itm_data;
   auto& subpass = *pass.subpass;
@@ -88,7 +88,7 @@ void reader::impl::decode_page_data(bool uses_custom_row_bounds, size_t skip_row
                is_treat_fixed_length_as_string(chunk.logical_type);
       });
 
-    if (!_has_page_index || uses_custom_row_bounds || has_flba) {
+    if (!_has_page_index || uses_custom_row_bounds(mode) || has_flba) {
       ComputePageStringSizes(subpass.pages,
                              pass.chunks,
                              delta_temp_buf,
@@ -419,6 +419,10 @@ reader::impl::impl(std::size_t chunk_read_limit,
                    rmm::device_async_resource_ref mr)
   : _stream{stream},
     _mr{mr},
+    _options{options.get_timestamp_type(),
+             options.get_skip_rows(),
+             options.get_num_rows(),
+             options.get_row_groups()},
     _sources{std::move(sources)},
     _output_chunk_read_limit{chunk_read_limit},
     _input_pass_read_limit{pass_read_limit}
@@ -427,11 +431,6 @@ reader::impl::impl(std::size_t chunk_read_limit,
   _metadata =
     std::make_unique<aggregate_reader_metadata>(_sources, options.is_enabled_use_arrow_schema());
 
-  // Override output timestamp resolution if requested
-  if (options.get_timestamp_type().id() != type_id::EMPTY) {
-    _timestamp_type = options.get_timestamp_type();
-  }
-
   // Strings may be returned as either string or categorical columns
   _strings_to_categorical = options.is_enabled_convert_strings_to_categories();
 
@@ -452,19 +451,21 @@ reader::impl::impl(std::size_t chunk_read_limit,
                               filter_columns_names,
                               options.is_enabled_use_pandas_metadata(),
                               _strings_to_categorical,
-                              _timestamp_type.id());
+                              _options.timestamp_type.id());
 
   // Save the states of the output buffers for reuse in `chunk_read()`.
   for (auto const& buff : _output_buffers) {
     _output_buffers_template.emplace_back(cudf::io::detail::inline_column_buffer::empty_like(buff));
   }
+
+  // Save the name to reference converter to extract output filter AST in
+  // `preprocess_file()` and `finalize_output()`
+  table_metadata metadata;
+  populate_metadata(metadata);
+  _expr_conv = named_to_reference_converter(options.get_filter(), metadata);
 }
 
-void reader::impl::prepare_data(int64_t skip_rows,
-                                std::optional<size_type> const& num_rows,
-                                bool uses_custom_row_bounds,
-                                host_span<std::vector<size_type> const> row_group_indices,
-                                std::optional<std::reference_wrapper<ast::expression const>> filter)
+void reader::impl::prepare_data(read_mode mode)
 {
   // if we have not preprocessed at the whole-file level, do that now
   if (!_file_preprocessed) {
@@ -472,14 +473,12 @@ void reader::impl::prepare_data(int64_t skip_rows,
     // - read row group information
     // - setup information on (parquet) chunks
     // - compute schedule of input passes
-    preprocess_file(skip_rows, num_rows, row_group_indices, filter);
+    preprocess_file(mode);
   }
 
   // handle any chunking work (ratcheting through the subpasses and chunks within
   // our current pass) if in bounds
-  if (_file_itm_data._current_input_pass < _file_itm_data.num_passes()) {
-    handle_chunking(uses_custom_row_bounds);
-  }
+  if (_file_itm_data._current_input_pass < _file_itm_data.num_passes()) { handle_chunking(mode); }
 }
 
 void reader::impl::populate_metadata(table_metadata& out_metadata)
@@ -498,8 +497,7 @@ void reader::impl::populate_metadata(table_metadata& out_metadata)
                                      out_metadata.per_file_user_data[0].end()};
 }
 
-table_with_metadata reader::impl::read_chunk_internal(
-  bool uses_custom_row_bounds, std::optional<std::reference_wrapper<ast::expression const>> filter)
+table_with_metadata reader::impl::read_chunk_internal(read_mode mode)
 {
   // If `_output_metadata` has been constructed, just copy it over.
   auto out_metadata = _output_metadata ? table_metadata{*_output_metadata} : table_metadata{};
@@ -510,17 +508,17 @@ table_with_metadata reader::impl::read_chunk_internal(
   out_columns.reserve(_output_buffers.size());
 
   // no work to do (this can happen on the first pass if we have no rows to read)
-  if (!has_more_work()) { return finalize_output(out_metadata, out_columns, filter); }
+  if (!has_more_work()) { return finalize_output(out_metadata, out_columns); }
 
   auto& pass            = *_pass_itm_data;
   auto& subpass         = *pass.subpass;
   auto const& read_info = subpass.output_chunk_read_info[subpass.current_output_chunk];
 
   // Allocate memory buffers for the output columns.
-  allocate_columns(read_info.skip_rows, read_info.num_rows, uses_custom_row_bounds);
+  allocate_columns(mode, read_info.skip_rows, read_info.num_rows);
 
   // Parse data into the output buffers.
-  decode_page_data(uses_custom_row_bounds, read_info.skip_rows, read_info.num_rows);
+  decode_page_data(mode, read_info.skip_rows, read_info.num_rows);
 
   // Create the final output cudf columns.
   for (size_t i = 0; i < _output_buffers.size(); ++i) {
@@ -547,13 +545,11 @@ table_with_metadata reader::impl::read_chunk_internal(
   }
 
   // Add empty columns if needed. Filter output columns based on filter.
-  return finalize_output(out_metadata, out_columns, filter);
+  return finalize_output(out_metadata, out_columns);
 }
 
-table_with_metadata reader::impl::finalize_output(
-  table_metadata& out_metadata,
-  std::vector<std::unique_ptr<column>>& out_columns,
-  std::optional<std::reference_wrapper<ast::expression const>> filter)
+table_with_metadata reader::impl::finalize_output(table_metadata& out_metadata,
+                                                  std::vector<std::unique_ptr<column>>& out_columns)
 {
   // Create empty columns as needed (this can happen if we've ended up with no actual data to read)
   for (size_t i = out_columns.size(); i < _output_buffers.size(); ++i) {
@@ -581,10 +577,13 @@ table_with_metadata reader::impl::finalize_output(
   // increment the output chunk count
   _file_itm_data._output_chunk_count++;
 
-  if (filter.has_value()) {
+  // check if the output filter AST expression (= _expr_conv.get_converted_expr()) exists
+  if (_expr_conv.get_converted_expr().has_value()) {
     auto read_table = std::make_unique<table>(std::move(out_columns));
-    auto predicate  = cudf::detail::compute_column(
-      *read_table, filter.value().get(), _stream, rmm::mr::get_current_device_resource());
+    auto predicate  = cudf::detail::compute_column(*read_table,
+                                                  _expr_conv.get_converted_expr().value().get(),
+                                                  _stream,
+                                                  rmm::mr::get_current_device_resource());
     CUDF_EXPECTS(predicate->view().type().id() == type_id::BOOL8,
                  "Predicate filter should return a boolean");
     // Exclude columns present in filter only in output
@@ -598,22 +597,13 @@ table_with_metadata reader::impl::finalize_output(
   return {std::make_unique<table>(std::move(out_columns)), std::move(out_metadata)};
 }
 
-table_with_metadata reader::impl::read(
-  int64_t skip_rows,
-  std::optional<size_type> const& num_rows,
-  bool uses_custom_row_bounds,
-  host_span<std::vector<size_type> const> row_group_indices,
-  std::optional<std::reference_wrapper<ast::expression const>> filter)
+table_with_metadata reader::impl::read()
 {
   CUDF_EXPECTS(_output_chunk_read_limit == 0,
                "Reading the whole file must not have non-zero byte_limit.");
-  table_metadata metadata;
-  populate_metadata(metadata);
-  auto expr_conv     = named_to_reference_converter(filter, metadata);
-  auto output_filter = expr_conv.get_converted_expr();
 
-  prepare_data(skip_rows, num_rows, uses_custom_row_bounds, row_group_indices, output_filter);
-  return read_chunk_internal(uses_custom_row_bounds, output_filter);
+  prepare_data(read_mode::READ_ALL);
+  return read_chunk_internal(read_mode::READ_ALL);
 }
 
 table_with_metadata reader::impl::read_chunk()
@@ -628,22 +618,13 @@ table_with_metadata reader::impl::read_chunk()
     }
   }
 
-  prepare_data(0 /*skip_rows*/,
-               std::nullopt /*num_rows, `nullopt` means unlimited*/,
-               true /*uses_custom_row_bounds*/,
-               {} /*row_group_indices, empty means read all row groups*/,
-               std::nullopt /*filter*/);
-
-  return read_chunk_internal(true, std::nullopt);
+  prepare_data(read_mode::CHUNKED_READ);
+  return read_chunk_internal(read_mode::CHUNKED_READ);
 }
 
 bool reader::impl::has_next()
 {
-  prepare_data(0 /*skip_rows*/,
-               std::nullopt /*num_rows, `nullopt` means unlimited*/,
-               true /*uses_custom_row_bounds*/,
-               {} /*row_group_indices, empty means read all row groups*/,
-               std::nullopt /*filter*/);
+  prepare_data(read_mode::CHUNKED_READ);
 
   // current_input_pass will only be incremented to be == num_passes after
   // the last chunk in the last subpass in the last pass has been returned
diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp
index 04da8eed591..3b8e80a29e6 100644
--- a/cpp/src/io/parquet/reader_impl.hpp
+++ b/cpp/src/io/parquet/reader_impl.hpp
@@ -64,20 +64,9 @@ class reader::impl {
   /**
    * @brief Read an entire set or a subset of data and returns a set of columns
    *
-   * @param skip_rows Number of rows to skip from the start
-   * @param num_rows Number of rows to read
-   * @param uses_custom_row_bounds Whether or not num_rows and skip_rows represents user-specific
-   *        bounds
-   * @param row_group_indices Lists of row groups to read, one per source
-   * @param filter Optional AST expression to filter output rows
-   *
    * @return The set of columns along with metadata
    */
-  table_with_metadata read(int64_t skip_rows,
-                           std::optional<size_type> const& num_rows,
-                           bool uses_custom_row_bounds,
-                           host_span<std::vector<size_type> const> row_group_indices,
-                           std::optional<std::reference_wrapper<ast::expression const>> filter);
+  table_with_metadata read();
 
   /**
    * @brief Constructor from a chunk read limit and an array of dataset sources with reader options.
@@ -132,21 +121,17 @@ class reader::impl {
   // top level functions involved with ratcheting through the passes, subpasses
   // and output chunks of the read process
  private:
+  /**
+   * @brief The enum indicating whether the data sources are read all at once or chunk by chunk.
+   */
+  enum class read_mode { READ_ALL, CHUNKED_READ };
+
   /**
    * @brief Perform the necessary data preprocessing for parsing file later on.
    *
-   * @param skip_rows Number of rows to skip from the start
-   * @param num_rows Number of rows to read, or `std::nullopt` to read all rows
-   * @param uses_custom_row_bounds Whether or not num_rows and skip_rows represents user-specific
-   *        bounds
-   * @param row_group_indices Lists of row groups to read (one per source), or empty if read all
-   * @param filter Optional AST expression to filter row groups based on column chunk statistics
+   * @param read_mode Value indicating if the data sources are read all at once or chunk by chunk
    */
-  void prepare_data(int64_t skip_rows,
-                    std::optional<size_type> const& num_rows,
-                    bool uses_custom_row_bounds,
-                    host_span<std::vector<size_type> const> row_group_indices,
-                    std::optional<std::reference_wrapper<ast::expression const>> filter);
+  void prepare_data(read_mode mode);
 
   /**
    * @brief Preprocess step for the entire file.
@@ -154,23 +139,16 @@ class reader::impl {
    * Only ever called once. This function reads in rowgroup and associated chunk
    * information and computes the schedule of top level passes (see `pass_intermediate_data`).
    *
-   * @param skip_rows The number of rows to skip in the requested set of rowgroups to be read
-   * @param num_rows The total number of rows to read out of the selected rowgroups
-   * @param row_group_indices Lists of row groups to read, one per source
-   * @param filter Optional AST expression to filter output rows
+   * @param read_mode Value indicating if the data sources are read all at once or chunk by chunk
    */
-  void preprocess_file(int64_t skip_rows,
-                       std::optional<size_type> const& num_rows,
-                       host_span<std::vector<size_type> const> row_group_indices,
-                       std::optional<std::reference_wrapper<ast::expression const>> filter);
+  void preprocess_file(read_mode mode);
 
   /**
    * @brief Ratchet the pass/subpass/chunk process forward.
    *
-   * @param uses_custom_row_bounds Whether or not num_rows and skip_rows represents user-specified
-   *        bounds
+   * @param read_mode Value indicating if the data sources are read all at once or chunk by chunk
    */
-  void handle_chunking(bool uses_custom_row_bounds);
+  void handle_chunking(read_mode mode);
 
   /**
    * @brief Setup step for the next input read pass.
@@ -178,36 +156,31 @@ class reader::impl {
    * A 'pass' is defined as a subset of row groups read out of the globally
    * requested set of all row groups.
    *
-   * @param uses_custom_row_bounds Whether or not num_rows and skip_rows represents user-specific
-   *        bounds
+   * @param read_mode Value indicating if the data sources are read all at once or chunk by chunk
    */
-  void setup_next_pass(bool uses_custom_row_bounds);
+  void setup_next_pass(read_mode mode);
 
   /**
    * @brief Setup step for the next decompression subpass.
    *
-   * @param uses_custom_row_bounds Whether or not num_rows and skip_rows represents user-specific
-   *        bounds
-   *
    * A 'subpass' is defined as a subset of pages within a pass that are
    * decompressed and decoded as a batch. Subpasses may be further subdivided
    * into output chunks.
+   *
+   * @param read_mode Value indicating if the data sources are read all at once or chunk by chunk
+   *
    */
-  void setup_next_subpass(bool uses_custom_row_bounds);
+  void setup_next_subpass(read_mode mode);
 
   /**
    * @brief Read a chunk of data and return an output table.
    *
    * This function is called internally and expects all preprocessing steps have already been done.
    *
-   * @param uses_custom_row_bounds Whether or not num_rows and skip_rows represents user-specific
-   *        bounds
-   * @param filter Optional AST expression to filter output rows
+   * @param read_mode Value indicating if the data sources are read all at once or chunk by chunk
    * @return The output table along with columns' metadata
    */
-  table_with_metadata read_chunk_internal(
-    bool uses_custom_row_bounds,
-    std::optional<std::reference_wrapper<ast::expression const>> filter);
+  table_with_metadata read_chunk_internal(read_mode mode);
 
   // utility functions
  private:
@@ -253,12 +226,11 @@ class reader::impl {
    *
    * For flat schemas, these values are computed during header decoding (see gpuDecodePageHeaders).
    *
-   * @param uses_custom_row_bounds Whether or not num_rows and skip_rows represents user-specific
-   *        bounds
+   * @param read_mode Value indicating if the data sources are read all at once or chunk by chunk
    * @param chunk_read_limit Limit on total number of bytes to be returned per read,
    *        or `0` if there is no limit
    */
-  void preprocess_subpass_pages(bool uses_custom_row_bounds, size_t chunk_read_limit);
+  void preprocess_subpass_pages(read_mode mode, size_t chunk_read_limit);
 
   /**
    * @brief Allocate nesting information storage for all pages and set pointers to it.
@@ -292,23 +264,19 @@ class reader::impl {
    *
    * @param out_metadata The output table metadata
    * @param out_columns The columns for building the output table
-   * @param filter Optional AST expression to filter output rows
    * @return The output table along with columns' metadata
    */
-  table_with_metadata finalize_output(
-    table_metadata& out_metadata,
-    std::vector<std::unique_ptr<column>>& out_columns,
-    std::optional<std::reference_wrapper<ast::expression const>> filter);
+  table_with_metadata finalize_output(table_metadata& out_metadata,
+                                      std::vector<std::unique_ptr<column>>& out_columns);
 
   /**
    * @brief Allocate data buffers for the output columns.
    *
+   * @param read_mode Value indicating if the data sources are read all at once or chunk by chunk
    * @param skip_rows Crop all rows below skip_rows
    * @param num_rows Maximum number of rows to read
-   * @param uses_custom_row_bounds Whether or not num_rows and skip_rows represents user-specific
-   *        bounds
    */
-  void allocate_columns(size_t skip_rows, size_t num_rows, bool uses_custom_row_bounds);
+  void allocate_columns(read_mode mode, size_t skip_rows, size_t num_rows);
 
   /**
    * @brief Calculate per-page offsets for string data
@@ -320,12 +288,11 @@ class reader::impl {
   /**
    * @brief Converts the page data and outputs to columns.
    *
-   * @param uses_custom_row_bounds Whether or not num_rows and skip_rows represents user-specific
-   *        bounds
+   * @param read_mode Value indicating if the data sources are read all at once or chunk by chunk
    * @param skip_rows Minimum number of rows from start
    * @param num_rows Number of rows to output
    */
-  void decode_page_data(bool uses_custom_row_bounds, size_t skip_rows, size_t num_rows);
+  void decode_page_data(read_mode mode, size_t skip_rows, size_t num_rows);
 
   /**
    * @brief Creates file-wide parquet chunk information.
@@ -354,6 +321,21 @@ class reader::impl {
   }
 
  private:
+  /**
+   * @brief Check if the user has specified custom row bounds
+   *
+   * @param read_mode Value indicating if the data sources are read all at once or chunk by chunk
+   * @return True if the user has specified custom row bounds
+   */
+  [[nodiscard]] bool uses_custom_row_bounds(read_mode mode) const
+  {
+    // TODO: `read_mode` is hardcoded to `true` when `read_mode::CHUNKED_READ` to enforce
+    // `ComputePageSizes()` computation for all remaining chunks.
+    return (mode == read_mode::READ_ALL)
+             ? (_options.num_rows.has_value() or _options.skip_rows != 0)
+             : true;
+  }
+
   [[nodiscard]] bool is_first_output_chunk() const
   {
     return _file_itm_data._output_chunk_count == 0;
@@ -362,6 +344,19 @@ class reader::impl {
   rmm::cuda_stream_view _stream;
   rmm::device_async_resource_ref _mr{rmm::mr::get_current_device_resource()};
 
+  // Reader configs.
+  struct {
+    // timestamp_type
+    data_type timestamp_type{type_id::EMPTY};
+    // User specified reading rows/stripes selection.
+    int64_t const skip_rows;
+    std::optional<int64_t> num_rows;
+    std::vector<std::vector<size_type>> row_group_indices;
+  } const _options;
+
+  // name to reference converter to extract AST output filter
+  named_to_reference_converter _expr_conv{std::nullopt, table_metadata{}};
+
   std::vector<std::unique_ptr<datasource>> _sources;
   std::unique_ptr<aggregate_reader_metadata> _metadata;
 
@@ -389,7 +384,6 @@ class reader::impl {
   bool _has_page_index = false;
 
   std::optional<std::vector<reader_column_schema>> _reader_column_schema;
-  data_type _timestamp_type{type_id::EMPTY};
 
   // chunked reading happens in 2 parts:
   //
diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu
index 6824d72cf04..d3f321af0bd 100644
--- a/cpp/src/io/parquet/reader_impl_chunking.cu
+++ b/cpp/src/io/parquet/reader_impl_chunking.cu
@@ -1148,12 +1148,12 @@ void include_decompression_scratch_size(device_span<ColumnChunkDesc const> chunk
 
 }  // anonymous namespace
 
-void reader::impl::handle_chunking(bool uses_custom_row_bounds)
+void reader::impl::handle_chunking(read_mode mode)
 {
   // if this is our first time in here, setup the first pass.
   if (!_pass_itm_data) {
     // setup the next pass
-    setup_next_pass(uses_custom_row_bounds);
+    setup_next_pass(mode);
   }
 
   auto& pass = *_pass_itm_data;
@@ -1181,15 +1181,15 @@ void reader::impl::handle_chunking(bool uses_custom_row_bounds)
       if (_file_itm_data._current_input_pass == _file_itm_data.num_passes()) { return; }
 
       // setup the next pass
-      setup_next_pass(uses_custom_row_bounds);
+      setup_next_pass(mode);
     }
   }
 
   // setup the next sub pass
-  setup_next_subpass(uses_custom_row_bounds);
+  setup_next_subpass(mode);
 }
 
-void reader::impl::setup_next_pass(bool uses_custom_row_bounds)
+void reader::impl::setup_next_pass(read_mode mode)
 {
   auto const num_passes = _file_itm_data.num_passes();
 
@@ -1260,7 +1260,7 @@ void reader::impl::setup_next_pass(bool uses_custom_row_bounds)
     detect_malformed_pages(
       pass.pages,
       pass.chunks,
-      uses_custom_row_bounds ? std::nullopt : std::make_optional(pass.num_rows),
+      uses_custom_row_bounds(mode) ? std::nullopt : std::make_optional(pass.num_rows),
       _stream);
 
     // decompress dictionary data if applicable.
@@ -1309,7 +1309,7 @@ void reader::impl::setup_next_pass(bool uses_custom_row_bounds)
   }
 }
 
-void reader::impl::setup_next_subpass(bool uses_custom_row_bounds)
+void reader::impl::setup_next_subpass(read_mode mode)
 {
   auto& pass    = *_pass_itm_data;
   pass.subpass  = std::make_unique<subpass_intermediate_data>();
@@ -1444,7 +1444,7 @@ void reader::impl::setup_next_subpass(bool uses_custom_row_bounds)
 
   // preprocess pages (computes row counts for lists, computes output chunks and computes
   // the actual row counts we will be able load out of this subpass)
-  preprocess_subpass_pages(uses_custom_row_bounds, _output_chunk_read_limit);
+  preprocess_subpass_pages(mode, _output_chunk_read_limit);
 
 #if defined(PARQUET_CHUNK_LOGGING)
   printf("\tSubpass: skip_rows(%'lu), num_rows(%'lu), remaining read limit(%'lu)\n",
@@ -1519,8 +1519,8 @@ void reader::impl::create_global_chunk_info()
       auto& schema   = _metadata->get_schema(col.schema_idx);
 
       auto [clock_rate, logical_type] =
-        conversion_info(to_type_id(schema, _strings_to_categorical, _timestamp_type.id()),
-                        _timestamp_type.id(),
+        conversion_info(to_type_id(schema, _strings_to_categorical, _options.timestamp_type.id()),
+                        _options.timestamp_type.id(),
                         schema.type,
                         schema.logical_type);
 
diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu
index 084f82a2ca0..f533f04e427 100644
--- a/cpp/src/io/parquet/reader_impl_preprocess.cu
+++ b/cpp/src/io/parquet/reader_impl_preprocess.cu
@@ -873,7 +873,7 @@ void reader::impl::allocate_nesting_info()
           nesting_info[cur_depth].max_def_level = cur_schema.max_definition_level;
           pni[cur_depth].size                   = 0;
           pni[cur_depth].type =
-            to_type_id(cur_schema, _strings_to_categorical, _timestamp_type.id());
+            to_type_id(cur_schema, _strings_to_categorical, _options.timestamp_type.id());
           pni[cur_depth].nullable = cur_schema.repetition_type == OPTIONAL;
         }
 
@@ -1221,17 +1221,14 @@ struct update_pass_num_rows {
 
 }  // anonymous namespace
 
-void reader::impl::preprocess_file(
-  int64_t skip_rows,
-  std::optional<size_type> const& num_rows,
-  host_span<std::vector<size_type> const> row_group_indices,
-  std::optional<std::reference_wrapper<ast::expression const>> filter)
+void reader::impl::preprocess_file(read_mode mode)
 {
   CUDF_EXPECTS(!_file_preprocessed, "Attempted to preprocess file more than once");
 
   // if filter is not empty, then create output types as vector and pass for filtering.
+
   std::vector<data_type> output_dtypes;
-  if (filter.has_value()) {
+  if (_expr_conv.get_converted_expr().has_value()) {
     std::transform(_output_buffers_template.cbegin(),
                    _output_buffers_template.cend(),
                    std::back_inserter(output_dtypes),
@@ -1240,12 +1237,12 @@ void reader::impl::preprocess_file(
 
   std::tie(
     _file_itm_data.global_skip_rows, _file_itm_data.global_num_rows, _file_itm_data.row_groups) =
-    _metadata->select_row_groups(row_group_indices,
-                                 skip_rows,
-                                 num_rows,
+    _metadata->select_row_groups(_options.row_group_indices,
+                                 _options.skip_rows,
+                                 _options.num_rows,
                                  output_dtypes,
                                  _output_column_schemas,
-                                 filter,
+                                 _expr_conv.get_converted_expr(),
                                  _stream);
 
   // check for page indexes
@@ -1276,7 +1273,7 @@ void reader::impl::preprocess_file(
   printf("# Input columns: %'lu\n", _input_columns.size());
   for (size_t idx = 0; idx < _input_columns.size(); idx++) {
     auto const& schema = _metadata->get_schema(_input_columns[idx].schema_idx);
-    auto const type_id = to_type_id(schema, _strings_to_categorical, _timestamp_type.id());
+    auto const type_id = to_type_id(schema, _strings_to_categorical, _options.timestamp_type.id());
     printf("\tC(%'lu, %s): %s\n",
            idx,
            _input_columns[idx].name.c_str(),
@@ -1330,7 +1327,7 @@ void reader::impl::generate_list_column_row_count_estimates()
   _stream.synchronize();
 }
 
-void reader::impl::preprocess_subpass_pages(bool uses_custom_row_bounds, size_t chunk_read_limit)
+void reader::impl::preprocess_subpass_pages(read_mode mode, size_t chunk_read_limit)
 {
   auto& pass    = *_pass_itm_data;
   auto& subpass = *pass.subpass;
@@ -1457,7 +1454,7 @@ void reader::impl::preprocess_subpass_pages(bool uses_custom_row_bounds, size_t
   compute_output_chunks_for_subpass();
 }
 
-void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses_custom_row_bounds)
+void reader::impl::allocate_columns(read_mode mode, size_t skip_rows, size_t num_rows)
 {
   auto& pass    = *_pass_itm_data;
   auto& subpass = *pass.subpass;
@@ -1470,7 +1467,7 @@ void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses
   // account. PageInfo::skipped_values, which tells us where to start decoding in the input to
   // respect the user bounds. It is only necessary to do this second pass if uses_custom_row_bounds
   // is set (if the user has specified artificial bounds).
-  if (uses_custom_row_bounds) {
+  if (uses_custom_row_bounds(mode)) {
     ComputePageSizes(subpass.pages,
                      pass.chunks,
                      skip_rows,
@@ -1479,8 +1476,6 @@ void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses
                      false,  // no need to compute string sizes
                      pass.level_type_size,
                      _stream);
-
-    // print_pages(pages, _stream);
   }
 
   // iterate over all input columns and allocate any associated output

From 70922d0f7fc48bbfeec6c48534642d2ff0d11782 Mon Sep 17 00:00:00 2001
From: er-eis <eeisenberg0@gmail.com>
Date: Mon, 20 May 2024 21:41:52 -0400
Subject: [PATCH 235/842] Add contributing warning about circular imports
 (#15691)

Closes #15689

Adds warning to contributing doc regarding the necessity to rebuild and the type of error a developer may see

Authors:
  - https://github.com/er-eis

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15691
---
 CONTRIBUTING.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 757eaa44510..98c2ec0a22e 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -161,6 +161,8 @@ To build all libraries and tests, with Python packages in development mode, simp
 ./build.sh --pydevelop libcudf libcudf_kafka cudf dask_cudf cudf_kafka custreamz
 ```
 
+- **Note**: if Cython files (`*.pyx` or `*.pxd`) have changed, the Python build must be rerun.
+
 To run the C++ tests, run
 
 ```bash

From 8b7245548c63d1ce84031a0bd187cbfb8e072f8c Mon Sep 17 00:00:00 2001
From: Alessandro Bellina <abellina@nvidia.com>
Date: Mon, 20 May 2024 20:52:34 -0500
Subject: [PATCH 236/842] [JNI] Expose java API for
 cudf::io::config_host_memory_resource (#15745)

This PR depends on https://github.com/rapidsai/cudf/pull/15665 and so it won't build until that PR merges.

Adds support for `cudf::io::config_host_memory_resource` which is being worked on in #15665.  In 24.06 we are going to disable the cuDF pinned pool and look into this more in 24.08.

We currently have a pinned pooled resource that has been setup to share pinned memory with other APIs we use from java, so we wanted to prevent extra pinned memory being created by default, and @vuule has added an API for us to call to accomplish this.

Authors:
  - Alessandro Bellina (https://github.com/abellina)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/15745
---
 .../main/java/ai/rapids/cudf/PinnedMemoryPool.java  | 13 +++++++++++++
 java/src/main/java/ai/rapids/cudf/Rmm.java          | 10 ++++++++++
 java/src/main/native/src/RmmJni.cpp                 | 11 +++++++++++
 3 files changed, 34 insertions(+)

diff --git a/java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java b/java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java
index 6cb34683e5a..9038700cb30 100644
--- a/java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java
+++ b/java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java
@@ -252,4 +252,17 @@ private synchronized HostMemoryBuffer tryAllocateInternal(long bytes) {
   private synchronized void free(long address, long size) {
     Rmm.freeFromPinnedPool(this.poolHandle, address, size);
   }
+
+  /**
+   * Sets the size of the cuDF default pinned pool.
+   *
+   * @note This has to be called before cuDF functions are executed.
+   *
+   * @param size initial and maximum size for the cuDF default pinned pool.
+   *        Pass size=0 to disable the default pool.
+   */
+  public static synchronized void configureDefaultCudfPinnedPoolSize(long size) {
+    Rmm.configureDefaultCudfPinnedPoolSize(size);
+  }
+
 }
diff --git a/java/src/main/java/ai/rapids/cudf/Rmm.java b/java/src/main/java/ai/rapids/cudf/Rmm.java
index 6e9f90e477f..fdbdfdfff6f 100755
--- a/java/src/main/java/ai/rapids/cudf/Rmm.java
+++ b/java/src/main/java/ai/rapids/cudf/Rmm.java
@@ -266,6 +266,16 @@ public static synchronized void initialize(int allocationMode, LogConf logConf,
     }
   }
 
+  /**
+   * Sets the size of the cuDF default pinned pool.
+   *
+   * @note This has to be called before cuDF functions are executed.
+   *
+   * @param size initial and maximum size for the cuDF default pinned pool.
+   *        Pass size=0 to disable the default pool.
+   */
+  public static synchronized native void configureDefaultCudfPinnedPoolSize(long size);
+
   /**
    * Get the most recently set pool size or -1 if RMM has not been initialized or pooling is
    * not enabled.
diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp
index 68453c924d6..9c015fee409 100644
--- a/java/src/main/native/src/RmmJni.cpp
+++ b/java/src/main/native/src/RmmJni.cpp
@@ -1106,4 +1106,15 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_freeFromFallbackPinnedPool(JNIEnv
   }
   CATCH_STD(env, )
 }
+
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_configureDefaultCudfPinnedPoolSize(JNIEnv* env,
+                                                                                  jclass clazz,
+                                                                                  jlong size)
+{
+  try {
+    cudf::jni::auto_set_device(env);
+    cudf::io::config_default_host_memory_resource(cudf::io::host_mr_options{size});
+  }
+  CATCH_STD(env, )
+}
 }

From b4daa16f1d67d505abbdd816d4123d4b3a418369 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 21 May 2024 03:56:13 -1000
Subject: [PATCH 237/842] Fix cat.as_ordered not propogating correct size
 (#15780)

closes #15778

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/15780
---
 python/cudf/cudf/core/column/categorical.py |  2 +-
 python/cudf/cudf/tests/test_indexing.py     | 13 +++++++++++++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index adda8a34cd0..0ff8209dcd4 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -1439,7 +1439,7 @@ def as_ordered(self, ordered: bool):
             categories=self.categories,
             codes=self.codes,
             mask=self.base_mask,
-            size=self.base_size,
+            size=self.size,
             offset=self.offset,
             ordered=ordered,
         )
diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py
index 16754c3040b..009e48a8669 100644
--- a/python/cudf/cudf/tests/test_indexing.py
+++ b/python/cudf/cudf/tests/test_indexing.py
@@ -2349,3 +2349,16 @@ def test_loc_datetime_random_with_ts(data, scalar):
         expected = pdf.loc[:i]
 
         assert_eq(actual, expected)
+
+
+def test_sliced_categorical_as_ordered():
+    df = cudf.DataFrame({"a": list("caba"), "b": list(range(4))})
+    df["a"] = df["a"].astype("category")
+    df = df.iloc[:2]
+    result = df["a"].cat.as_ordered()
+    expected = cudf.Series(
+        ["c", "a"],
+        dtype=cudf.CategoricalDtype(list("abc"), ordered=True),
+        name="a",
+    )
+    assert_eq(result, expected)

From 60d5717ba5b9a51cb031b506885a656e50199d22 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 21 May 2024 03:59:27 -1000
Subject: [PATCH 238/842] Improve performance of Series.to_numpy/to_cupy
 (#15792)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

xref https://github.com/rapidsai/cudf/issues/11648

Essentially refactors `Frame._to_array` to short circuit some checks for a `Frame` with 1 column or `ndim == 1`

```python
In [1]: import cudf

In [2]: s = cudf.Series(range(10000))

In [3]: %timeit s.to_cupy()
252 µs ± 3.47 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)  # PR

419 µs ± 2.21 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)  # branch 24.06
```

I needed to add `Frame.ndim` which will raise a `NotImplementedError` (until Frame actually becomes an ABC)

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15792
---
 python/cudf/cudf/core/_base_index.py         |  2 +-
 python/cudf/cudf/core/dataframe.py           |  2 +-
 python/cudf/cudf/core/frame.py               | 85 +++++++++++---------
 python/cudf/cudf/core/multiindex.py          |  2 +-
 python/cudf/cudf/core/single_column_frame.py |  2 +-
 5 files changed, 53 insertions(+), 40 deletions(-)

diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py
index 6c116e740ff..e6868ae3431 100644
--- a/python/cudf/cudf/core/_base_index.py
+++ b/python/cudf/cudf/core/_base_index.py
@@ -145,7 +145,7 @@ def name(self):
         raise NotImplementedError
 
     @property  # type: ignore
-    def ndim(self):  # noqa: D401
+    def ndim(self) -> int:  # noqa: D401
         """Number of dimensions of the underlying data, by definition 1."""
         return 1
 
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 8442cf05f01..88b1ae2ea22 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -1234,7 +1234,7 @@ def dtypes(self):
         return pd.Series(self._dtypes, dtype="object")
 
     @property
-    def ndim(self):
+    def ndim(self) -> int:
         """Dimension of the data. DataFrame ndim is always 2."""
         return 2
 
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 58932db2bda..92ca76d6ceb 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -6,6 +6,7 @@
 import itertools
 import operator
 import pickle
+import types
 import warnings
 from collections import abc
 from typing import (
@@ -91,6 +92,10 @@ def _dtypes(self):
             zip(self._data.names, (col.dtype for col in self._data.columns))
         )
 
+    @property
+    def ndim(self) -> int:
+        raise NotImplementedError()
+
     @_cudf_nvtx_annotate
     def serialize(self):
         # TODO: See if self._data can be serialized outright
@@ -417,51 +422,60 @@ def __arrow_array__(self, type=None):
     @_cudf_nvtx_annotate
     def _to_array(
         self,
-        get_column_values: Callable,
-        make_empty_matrix: Callable,
+        get_array: Callable,
+        module: types.ModuleType,
+        copy: bool,
         dtype: Union[Dtype, None] = None,
         na_value=None,
-    ) -> Union[cupy.ndarray, np.ndarray]:
+    ) -> Union[cupy.ndarray, numpy.ndarray]:
         # Internal function to implement to_cupy and to_numpy, which are nearly
         # identical except for the attribute they access to generate values.
 
-        def get_column_values_na(col):
+        def to_array(
+            col: ColumnBase, dtype: np.dtype
+        ) -> Union[cupy.ndarray, numpy.ndarray]:
             if na_value is not None:
                 col = col.fillna(na_value)
-            return get_column_values(col)
+            array = get_array(col)
+            casted_array = module.asarray(array, dtype=dtype)
+            if copy and casted_array is array:
+                # Don't double copy after asarray
+                casted_array = casted_array.copy()
+            return casted_array
 
-        # Early exit for an empty Frame.
         ncol = self._num_columns
         if ncol == 0:
-            return make_empty_matrix(
-                shape=(len(self), ncol), dtype=np.dtype("float64"), order="F"
+            return module.empty(
+                shape=(len(self), ncol),
+                dtype=numpy.dtype("float64"),
+                order="F",
             )
 
         if dtype is None:
-            dtypes = [col.dtype for col in self._data.values()]
-            for dtype in dtypes:
-                if isinstance(
-                    dtype,
-                    (
-                        cudf.ListDtype,
-                        cudf.core.dtypes.DecimalDtype,
-                        cudf.StructDtype,
-                    ),
-                ):
-                    raise NotImplementedError(
-                        f"{dtype} cannot be exposed as a cupy array"
-                    )
-            dtype = find_common_type(dtypes)
+            if ncol == 1:
+                dtype = next(iter(self._data.values())).dtype
+            else:
+                dtype = find_common_type(
+                    [col.dtype for col in self._data.values()]
+                )
 
-        matrix = make_empty_matrix(
-            shape=(len(self), ncol), dtype=dtype, order="F"
-        )
-        for i, col in enumerate(self._data.values()):
-            # TODO: col.values may fail if there is nullable data or an
-            # unsupported dtype. We may want to catch and provide a more
-            # suitable error.
-            matrix[:, i] = get_column_values_na(col)
-        return matrix
+            if not isinstance(dtype, numpy.dtype):
+                raise NotImplementedError(
+                    f"{dtype} cannot be exposed as an array"
+                )
+
+        if self.ndim == 1:
+            return to_array(self._data.columns[0], dtype)
+        else:
+            matrix = module.empty(
+                shape=(len(self), ncol), dtype=dtype, order="F"
+            )
+            for i, col in enumerate(self._data.values()):
+                # TODO: col.values may fail if there is nullable data or an
+                # unsupported dtype. We may want to catch and provide a more
+                # suitable error.
+                matrix[:, i] = to_array(col, dtype)
+            return matrix
 
     # TODO: As of now, calling cupy.asarray is _much_ faster than calling
     # to_cupy. We should investigate the reasons why and whether we can provide
@@ -496,10 +510,9 @@ def to_cupy(
         cupy.ndarray
         """
         return self._to_array(
-            (lambda col: col.values.copy())
-            if copy
-            else (lambda col: col.values),
-            cupy.empty,
+            lambda col: col.values,
+            cupy,
+            copy,
             dtype,
             na_value,
         )
@@ -536,7 +549,7 @@ def to_numpy(
             )
 
         return self._to_array(
-            (lambda col: col.values_host), np.empty, dtype, na_value
+            lambda col: col.values_host, numpy, copy, dtype, na_value
         )
 
     @_cudf_nvtx_annotate
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index 58a2846bf43..c149a1028a0 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -563,7 +563,7 @@ def levels(self):
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
-    def ndim(self):
+    def ndim(self) -> int:
         """Dimension of the data. For MultiIndex ndim is always 2."""
         return 2
 
diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py
index 829790007c9..d864b563208 100644
--- a/python/cudf/cudf/core/single_column_frame.py
+++ b/python/cudf/cudf/core/single_column_frame.py
@@ -77,7 +77,7 @@ def name(self, value):
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
-    def ndim(self):  # noqa: D401
+    def ndim(self) -> int:  # noqa: D401
         """Number of dimensions of the underlying data, by definition 1."""
         return 1
 

From d78d565b15bd9a2e3200176af4656ee2098b209b Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 21 May 2024 07:57:11 -1000
Subject: [PATCH 239/842] Avoid index-to-column conversion in some DataFrame
 ops (#15763)

xref https://github.com/rapidsai/cudf/pull/15494

* For `Index.str`, check the `dtype` instead of the underlying column type (which would materialize RangeIndex)
* For `set_index`, don't immediately convert passed objects to column until necessary
* For `_make_operands_and_index_for_binop`, don't create pandas object more than once

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15763
---
 python/cudf/cudf/core/dataframe.py | 109 +++++++++++------------------
 python/cudf/cudf/core/index.py     |   3 +-
 2 files changed, 43 insertions(+), 69 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 88b1ae2ea22..0b7c40ff516 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -2047,29 +2047,24 @@ def _make_operands_and_index_for_binop(
             equal_columns = True
         elif isinstance(other, Series):
             if (
-                not can_reindex
-                and fn in cudf.utils.utils._EQUALITY_OPS
-                and (
-                    not self._data.to_pandas_index().equals(
-                        other.index.to_pandas()
-                    )
+                not (self_pd_columns := self._data.to_pandas_index()).equals(
+                    other_pd_index := other.index.to_pandas()
                 )
+                and not can_reindex
+                and fn in cudf.utils.utils._EQUALITY_OPS
             ):
                 raise ValueError(
                     "Can only compare DataFrame & Series objects "
                     "whose columns & index are same respectively, "
                     "please reindex."
                 )
-            rhs = dict(zip(other.index.to_pandas(), other.values_host))
+            rhs = dict(zip(other_pd_index, other.values_host))
             # For keys in right but not left, perform binops between NaN (not
             # NULL!) and the right value (result is NaN).
             left_default = as_column(np.nan, length=len(self))
-            equal_columns = other.index.to_pandas().equals(
-                self._data.to_pandas_index()
-            )
+            equal_columns = other_pd_index.equals(self_pd_columns)
             can_use_self_column_name = (
-                equal_columns
-                or list(other._index._data.names) == self._data._level_names
+                equal_columns or other_pd_index.names == self_pd_columns.names
             )
         elif isinstance(other, DataFrame):
             if (
@@ -2952,82 +2947,60 @@ def set_index(
 
         if not isinstance(keys, list):
             keys = [keys]
+        if len(keys) == 0:
+            raise ValueError("No valid columns to be added to index.")
+        if append:
+            keys = [self.index] + keys
 
         # Preliminary type check
-        col_not_found = []
-        columns_to_add = []
+        labels_not_found = []
+        data_to_add = []
         names = []
         to_drop = []
         for col in keys:
-            # Is column label
+            # label-like
             if is_scalar(col) or isinstance(col, tuple):
                 if col in self._column_names:
-                    columns_to_add.append(self[col])
+                    data_to_add.append(self[col])
                     names.append(col)
                     if drop:
                         to_drop.append(col)
                 else:
-                    col_not_found.append(col)
+                    labels_not_found.append(col)
+            # index-like
+            elif isinstance(col, (MultiIndex, pd.MultiIndex)):
+                if isinstance(col, pd.MultiIndex):
+                    col = MultiIndex.from_pandas(col)
+                data_to_add.extend(col._data.columns)
+                names.extend(col.names)
+            elif isinstance(
+                col, (cudf.Series, cudf.Index, pd.Series, pd.Index)
+            ):
+                data_to_add.append(col)
+                names.append(col.name)
             else:
-                # Try coerce into column
-                if not is_column_like(col):
-                    try:
-                        col = as_column(col)
-                    except TypeError:
-                        msg = f"{col} cannot be converted to column-like."
-                        raise TypeError(msg)
-                if isinstance(col, (MultiIndex, pd.MultiIndex)):
-                    col = (
-                        cudf.from_pandas(col)
-                        if isinstance(col, pd.MultiIndex)
-                        else col
-                    )
-                    cols = [col._data[x] for x in col._data]
-                    columns_to_add.extend(cols)
-                    names.extend(col.names)
-                else:
-                    if isinstance(col, (pd.RangeIndex, cudf.RangeIndex)):
-                        # Corner case: RangeIndex does not need to instantiate
-                        columns_to_add.append(col)
-                    else:
-                        # For pandas obj, convert to gpu obj
-                        columns_to_add.append(as_column(col))
-                    if isinstance(
-                        col, (cudf.Series, cudf.Index, pd.Series, pd.Index)
-                    ):
-                        names.append(col.name)
-                    else:
-                        names.append(None)
-
-        if col_not_found:
-            raise KeyError(f"None of {col_not_found} are in the columns")
+                try:
+                    col = as_column(col)
+                except TypeError as err:
+                    msg = f"{col} cannot be converted to column-like."
+                    raise TypeError(msg) from err
+                data_to_add.append(col)
+                names.append(None)
 
-        if append:
-            idx_cols = [self.index._data[x] for x in self.index._data]
-            if isinstance(self.index, MultiIndex):
-                idx_names = self.index.names
-            else:
-                idx_names = [self.index.name]
-            columns_to_add = idx_cols + columns_to_add
-            names = idx_names + names
+        if labels_not_found:
+            raise KeyError(f"None of {labels_not_found} are in the columns")
 
-        if len(columns_to_add) == 0:
-            raise ValueError("No valid columns to be added to index.")
-        elif (
-            len(columns_to_add) == 1
+        if (
+            len(data_to_add) == 1
             and len(keys) == 1
             and not isinstance(keys[0], (cudf.MultiIndex, pd.MultiIndex))
         ):
-            idx = cudf.Index(columns_to_add[0], name=names[0])
+            # Don't turn single level MultiIndex into an Index
+            idx = cudf.Index(data_to_add[0], name=names[0])
         else:
-            idx = MultiIndex._from_data(
-                {i: col for i, col in enumerate(columns_to_add)}
-            )
+            idx = MultiIndex._from_data(dict(enumerate(data_to_add)))
             idx.names = names
 
-        if not isinstance(idx, BaseIndex):
-            raise ValueError("Parameter index should be type `Index`.")
-
         df = self if inplace else self.copy(deep=True)
 
         if verify_integrity and not idx.is_unique:
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 209e582e5d6..49bfb150f60 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -38,6 +38,7 @@
     is_integer,
     is_list_like,
     is_scalar,
+    is_string_dtype,
 )
 from cudf.core._base_index import BaseIndex, _return_get_indexer_result
 from cudf.core._compat import PANDAS_LT_300
@@ -1623,7 +1624,7 @@ def _indices_of(self, value):
     @property
     @_cudf_nvtx_annotate
     def str(self):
-        if isinstance(self._values, cudf.core.column.StringColumn):
+        if is_string_dtype(self.dtype):
             return StringMethods(parent=self)
         else:
             raise AttributeError(

From 333718ac90b8d98e026aa57cfa0084af4c68a0f3 Mon Sep 17 00:00:00 2001
From: Paul Mattione <156858817+pmattione-nvidia@users.noreply.github.com>
Date: Tue, 21 May 2024 14:31:55 -0400
Subject: [PATCH 240/842] For powers of 10, replace ipow with switch (#15353)

This adds a new runtime calculation of the power-of-10 needed for applying decimal scale factors with a switch statement.  This provides the fastest way of applying the scale.  Note that the multiply and divide operations are performed within the switch itself, so that the compiler sees the full instruction to optimize assembly code gen.  See code comments for details.

This cannot be used within fixed_point (e.g. for comparison operators and rescaling) as it introduced too much register pressure to unrelated benchmarks.  It will only be used for the decimal <--> floating conversion, so it has been moved there to be in a new header file where that code will reside (in an upcoming PR).  This is part of a larger change to change the algorithm for decimal <--> floating conversion to a more accurate one that is forthcoming soon.

Authors:
  - Paul Mattione (https://github.com/pmattione-nvidia)

Approvers:
  - Mark Harris (https://github.com/harrism)
  - Muhammad Haseeb (https://github.com/mhaseeb123)

URL: https://github.com/rapidsai/cudf/pull/15353
---
 .../cudf/fixed_point/floating_conversion.hpp  | 374 ++++++++++++++++++
 1 file changed, 374 insertions(+)
 create mode 100644 cpp/include/cudf/fixed_point/floating_conversion.hpp

diff --git a/cpp/include/cudf/fixed_point/floating_conversion.hpp b/cpp/include/cudf/fixed_point/floating_conversion.hpp
new file mode 100644
index 00000000000..492f7e75219
--- /dev/null
+++ b/cpp/include/cudf/fixed_point/floating_conversion.hpp
@@ -0,0 +1,374 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cuda/std/type_traits>
+
+namespace numeric {
+
+/**
+ * @addtogroup floating_conversion
+ * @{
+ * @file
+ * @brief fixed_point <--> floating-point conversion functions.
+ */
+
+namespace detail {
+
+/**
+ * @brief Recursively calculate a signed large power of 10 (>= 10^19) that can only be stored in an
+ * 128bit integer
+ *
+ * @note Intended to be run at compile time.
+ *
+ * @tparam Exp10 The power of 10 to calculate
+ * @return Returns 10^Exp10
+ */
+template <int Exp10>
+constexpr __uint128_t large_power_of_10()
+{
+  // Stop at 10^19 to speed up compilation; literals can be used for smaller powers of 10.
+  static_assert(Exp10 >= 19);
+  if constexpr (Exp10 == 19)
+    return __uint128_t(10000000000000000000ULL);
+  else
+    return large_power_of_10<Exp10 - 1>() * __uint128_t(10);
+}
+
+/**
+ * @brief Divide by a power of 10 that fits within a 32bit integer.
+ *
+ * @tparam T Type of value to be divided-from.
+ * @param value The number to be divided-from.
+ * @param exp10 The power-of-10 of the denominator, from 0 to 9 inclusive.
+ * @return Returns value / 10^exp10
+ */
+template <typename T, typename cuda::std::enable_if_t<cuda::std::is_unsigned_v<T>>* = nullptr>
+CUDF_HOST_DEVICE inline T divide_power10_32bit(T value, int exp10)
+{
+  // Computing division this way is much faster than the alternatives.
+  // Division is not implemented in GPU hardware, and the compiler will often implement it as a
+  // multiplication of the reciprocal of the denominator, requiring a conversion to floating point.
+  // Ths is especially slow for larger divides that have to use the FP64 pipeline, where threads
+  // bottleneck.
+
+  // Instead, if the compiler can see exactly what number it is dividing by, it can
+  // produce much more optimal assembly, doing bit shifting, multiplies by a constant, etc.
+  // For the compiler to see the value though, array lookup (with exp10 as the index)
+  // is not sufficient: We have to use a switch statement. Although this introduces a branch,
+  // it is still much faster than doing the divide any other way.
+  // Perhaps an array can be used in C++23 with the assume attribute?
+
+  // Since we're optimizing division this way, we have to do this for multiplication as well.
+  // That's because doing them in different ways (switch, array, runtime-computation, etc.)
+  // increases the register pressure on all kernels that use fixed_point types, specifically slowing
+  // down some of the PYMOD and join benchmarks.
+
+  // This is split up into separate functions for 32-, 64-, and 128-bit denominators.
+  // That way we limit the templated, inlined code generation to the exponents that are
+  // capable of being represented. Combining them together into a single function again
+  // introduces too much pressure on the kernels that use this code, slowing down their benchmarks.
+  // It also dramatically slows down the compile time.
+
+  switch (exp10) {
+    case 0: return value;
+    case 1: return value / 10U;
+    case 2: return value / 100U;
+    case 3: return value / 1000U;
+    case 4: return value / 10000U;
+    case 5: return value / 100000U;
+    case 6: return value / 1000000U;
+    case 7: return value / 10000000U;
+    case 8: return value / 100000000U;
+    case 9: return value / 1000000000U;
+    default: return 0;
+  }
+}
+
+/**
+ * @brief Divide by a power of 10 that fits within a 64bit integer.
+ *
+ * @tparam T Type of value to be divided-from.
+ * @param value The number to be divided-from.
+ * @param exp10 The power-of-10 of the denominator, from 0 to 19 inclusive.
+ * @return Returns value / 10^exp10
+ */
+template <typename T, typename cuda::std::enable_if_t<cuda::std::is_unsigned_v<T>>* = nullptr>
+CUDF_HOST_DEVICE inline T divide_power10_64bit(T value, int exp10)
+{
+  // See comments in divide_power10_32bit() for discussion.
+  switch (exp10) {
+    case 0: return value;
+    case 1: return value / 10U;
+    case 2: return value / 100U;
+    case 3: return value / 1000U;
+    case 4: return value / 10000U;
+    case 5: return value / 100000U;
+    case 6: return value / 1000000U;
+    case 7: return value / 10000000U;
+    case 8: return value / 100000000U;
+    case 9: return value / 1000000000U;
+    case 10: return value / 10000000000ULL;
+    case 11: return value / 100000000000ULL;
+    case 12: return value / 1000000000000ULL;
+    case 13: return value / 10000000000000ULL;
+    case 14: return value / 100000000000000ULL;
+    case 15: return value / 1000000000000000ULL;
+    case 16: return value / 10000000000000000ULL;
+    case 17: return value / 100000000000000000ULL;
+    case 18: return value / 1000000000000000000ULL;
+    case 19: return value / 10000000000000000000ULL;
+    default: return 0;
+  }
+}
+
+/**
+ * @brief Divide by a power of 10 that fits within a 128bit integer.
+ *
+ * @tparam T Type of value to be divided-from.
+ * @param value The number to be divided-from.
+ * @param exp10 The power-of-10 of the denominator, from 0 to 38 inclusive.
+ * @return Returns value / 10^exp10.
+ */
+template <typename T, typename cuda::std::enable_if_t<cuda::std::is_unsigned_v<T>>* = nullptr>
+CUDF_HOST_DEVICE inline constexpr T divide_power10_128bit(T value, int exp10)
+{
+  // See comments in divide_power10_32bit() for an introduction.
+  switch (exp10) {
+    case 0: return value;
+    case 1: return value / 10U;
+    case 2: return value / 100U;
+    case 3: return value / 1000U;
+    case 4: return value / 10000U;
+    case 5: return value / 100000U;
+    case 6: return value / 1000000U;
+    case 7: return value / 10000000U;
+    case 8: return value / 100000000U;
+    case 9: return value / 1000000000U;
+    case 10: return value / 10000000000ULL;
+    case 11: return value / 100000000000ULL;
+    case 12: return value / 1000000000000ULL;
+    case 13: return value / 10000000000000ULL;
+    case 14: return value / 100000000000000ULL;
+    case 15: return value / 1000000000000000ULL;
+    case 16: return value / 10000000000000000ULL;
+    case 17: return value / 100000000000000000ULL;
+    case 18: return value / 1000000000000000000ULL;
+    case 19: return value / 10000000000000000000ULL;
+    case 20: return value / large_power_of_10<20>();
+    case 21: return value / large_power_of_10<21>();
+    case 22: return value / large_power_of_10<22>();
+    case 23: return value / large_power_of_10<23>();
+    case 24: return value / large_power_of_10<24>();
+    case 25: return value / large_power_of_10<25>();
+    case 26: return value / large_power_of_10<26>();
+    case 27: return value / large_power_of_10<27>();
+    case 28: return value / large_power_of_10<28>();
+    case 29: return value / large_power_of_10<29>();
+    case 30: return value / large_power_of_10<30>();
+    case 31: return value / large_power_of_10<31>();
+    case 32: return value / large_power_of_10<32>();
+    case 33: return value / large_power_of_10<33>();
+    case 34: return value / large_power_of_10<34>();
+    case 35: return value / large_power_of_10<35>();
+    case 36: return value / large_power_of_10<36>();
+    case 37: return value / large_power_of_10<37>();
+    case 38: return value / large_power_of_10<38>();
+    default: return 0;
+  }
+}
+
+/**
+ * @brief Multiply by a power of 10 that fits within a 32bit integer.
+ *
+ * @tparam T Type of value to be multiplied.
+ * @param value The number to be multiplied.
+ * @param exp10 The power-of-10 of the multiplier, from 0 to 9 inclusive.
+ * @return Returns value * 10^exp10
+ */
+template <typename T, typename cuda::std::enable_if_t<cuda::std::is_unsigned_v<T>>* = nullptr>
+CUDF_HOST_DEVICE inline constexpr T multiply_power10_32bit(T value, int exp10)
+{
+  // See comments in divide_power10_32bit() for discussion.
+  switch (exp10) {
+    case 0: return value;
+    case 1: return value * 10U;
+    case 2: return value * 100U;
+    case 3: return value * 1000U;
+    case 4: return value * 10000U;
+    case 5: return value * 100000U;
+    case 6: return value * 1000000U;
+    case 7: return value * 10000000U;
+    case 8: return value * 100000000U;
+    case 9: return value * 1000000000U;
+    default: return 0;
+  }
+}
+
+/**
+ * @brief Multiply by a power of 10 that fits within a 64bit integer.
+ *
+ * @tparam T Type of value to be multiplied.
+ * @param value The number to be multiplied.
+ * @param exp10 The power-of-10 of the multiplier, from 0 to 19 inclusive.
+ * @return Returns value * 10^exp10
+ */
+template <typename T, typename cuda::std::enable_if_t<cuda::std::is_unsigned_v<T>>* = nullptr>
+CUDF_HOST_DEVICE inline constexpr T multiply_power10_64bit(T value, int exp10)
+{
+  // See comments in divide_power10_32bit() for discussion.
+  switch (exp10) {
+    case 0: return value;
+    case 1: return value * 10U;
+    case 2: return value * 100U;
+    case 3: return value * 1000U;
+    case 4: return value * 10000U;
+    case 5: return value * 100000U;
+    case 6: return value * 1000000U;
+    case 7: return value * 10000000U;
+    case 8: return value * 100000000U;
+    case 9: return value * 1000000000U;
+    case 10: return value * 10000000000ULL;
+    case 11: return value * 100000000000ULL;
+    case 12: return value * 1000000000000ULL;
+    case 13: return value * 10000000000000ULL;
+    case 14: return value * 100000000000000ULL;
+    case 15: return value * 1000000000000000ULL;
+    case 16: return value * 10000000000000000ULL;
+    case 17: return value * 100000000000000000ULL;
+    case 18: return value * 1000000000000000000ULL;
+    case 19: return value * 10000000000000000000ULL;
+    default: return 0;
+  }
+}
+
+/**
+ * @brief Multiply by a power of 10 that fits within a 128bit integer.
+ *
+ * @tparam T Type of value to be multiplied.
+ * @param value The number to be multiplied.
+ * @param exp10 The power-of-10 of the multiplier, from 0 to 38 inclusive.
+ * @return Returns value * 10^exp10.
+ */
+template <typename T, typename cuda::std::enable_if_t<cuda::std::is_unsigned_v<T>>* = nullptr>
+CUDF_HOST_DEVICE inline constexpr T multiply_power10_128bit(T value, int exp10)
+{
+  // See comments in divide_power10_128bit() for discussion.
+  switch (exp10) {
+    case 0: return value;
+    case 1: return value * 10U;
+    case 2: return value * 100U;
+    case 3: return value * 1000U;
+    case 4: return value * 10000U;
+    case 5: return value * 100000U;
+    case 6: return value * 1000000U;
+    case 7: return value * 10000000U;
+    case 8: return value * 100000000U;
+    case 9: return value * 1000000000U;
+    case 10: return value * 10000000000ULL;
+    case 11: return value * 100000000000ULL;
+    case 12: return value * 1000000000000ULL;
+    case 13: return value * 10000000000000ULL;
+    case 14: return value * 100000000000000ULL;
+    case 15: return value * 1000000000000000ULL;
+    case 16: return value * 10000000000000000ULL;
+    case 17: return value * 100000000000000000ULL;
+    case 18: return value * 1000000000000000000ULL;
+    case 19: return value * 10000000000000000000ULL;
+    case 20: return value * large_power_of_10<20>();
+    case 21: return value * large_power_of_10<21>();
+    case 22: return value * large_power_of_10<22>();
+    case 23: return value * large_power_of_10<23>();
+    case 24: return value * large_power_of_10<24>();
+    case 25: return value * large_power_of_10<25>();
+    case 26: return value * large_power_of_10<26>();
+    case 27: return value * large_power_of_10<27>();
+    case 28: return value * large_power_of_10<28>();
+    case 29: return value * large_power_of_10<29>();
+    case 30: return value * large_power_of_10<30>();
+    case 31: return value * large_power_of_10<31>();
+    case 32: return value * large_power_of_10<32>();
+    case 33: return value * large_power_of_10<33>();
+    case 34: return value * large_power_of_10<34>();
+    case 35: return value * large_power_of_10<35>();
+    case 36: return value * large_power_of_10<36>();
+    case 37: return value * large_power_of_10<37>();
+    case 38: return value * large_power_of_10<38>();
+    default: return 0;
+  }
+}
+
+/**
+ * @brief Multiply an integer by a power of 10.
+ *
+ * @note Use this function if you have no a-priori knowledge of what exp10 might be.
+ * If you do, prefer calling the bit-size-specific versions
+ *
+ * @tparam Rep Representation type needed for integer exponentiation
+ * @tparam T Integral type of value to be multiplied.
+ * @param value The number to be multiplied.
+ * @param exp10 The power-of-10 of the multiplier.
+ * @return Returns value * 10^exp10
+ */
+template <typename Rep,
+          typename T,
+          typename cuda::std::enable_if_t<(cuda::std::is_unsigned_v<T>)>* = nullptr>
+CUDF_HOST_DEVICE inline constexpr T multiply_power10(T value, int exp10)
+{
+  // Use this function if you have no knowledge of what exp10 might be
+  // If you do, prefer calling the bit-size-specific versions
+  if constexpr (sizeof(Rep) <= 4) {
+    return multiply_power10_32bit(value, exp10);
+  } else if constexpr (sizeof(Rep) <= 8) {
+    return multiply_power10_64bit(value, exp10);
+  } else {
+    return multiply_power10_128bit(value, exp10);
+  }
+}
+
+/**
+ * @brief Divide an integer by a power of 10.
+ *
+ * @note Use this function if you have no a-priori knowledge of what exp10 might be.
+ * If you do, prefer calling the bit-size-specific versions
+ *
+ * @tparam Rep Representation type needed for integer exponentiation
+ * @tparam T Integral type of value to be divided-from.
+ * @param value The number to be divided-from.
+ * @param exp10 The power-of-10 of the denominator.
+ * @return Returns value / 10^exp10
+ */
+template <typename Rep,
+          typename T,
+          typename cuda::std::enable_if_t<(cuda::std::is_unsigned_v<T>)>* = nullptr>
+CUDF_HOST_DEVICE inline constexpr T divide_power10(T value, int exp10)
+{
+  // Use this function if you have no knowledge of what exp10 might be
+  // If you do, prefer calling the bit-size-specific versions
+  if constexpr (sizeof(Rep) <= 4) {
+    return divide_power10_32bit(value, exp10);
+  } else if constexpr (sizeof(Rep) <= 8) {
+    return divide_power10_64bit(value, exp10);
+  } else {
+    return divide_power10_128bit(value, exp10);
+  }
+}
+
+}  // namespace detail
+
+/** @} */  // end of group
+}  // namespace numeric

From 2c70971ecc66960dcf4bfb2fc6618c7f9f60980f Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Tue, 21 May 2024 16:36:01 -0500
Subject: [PATCH 241/842] Upgrade `arrow` to 16.1 (#15787)

This PR upgrades arrow to 16.1

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Ray Douglass (https://github.com/raydouglass)
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/15787
---
 .../all_cuda-118_arch-x86_64.yaml             | 10 ++++-----
 .../all_cuda-122_arch-x86_64.yaml             | 10 ++++-----
 conda/recipes/cudf/meta.yaml                  |  4 ++--
 conda/recipes/libcudf/conda_build_config.yaml |  2 +-
 conda/recipes/libcudf/meta.yaml               |  9 --------
 cpp/cmake/thirdparty/get_arrow.cmake          |  2 +-
 dependencies.yaml                             | 22 +++++++++----------
 python/cudf/cudf/tests/test_orc.py            |  3 +++
 python/cudf/pyproject.toml                    |  4 ++--
 python/cudf_kafka/pyproject.toml              |  2 +-
 10 files changed, 31 insertions(+), 37 deletions(-)

diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 48699b81eed..804b09bab59 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -36,15 +36,15 @@ dependencies:
 - hypothesis
 - identify>=2.5.20
 - ipython
-- libarrow-acero==16.0.0.*
-- libarrow-dataset==16.0.0.*
-- libarrow==16.0.0.*
+- libarrow-acero==16.1.0.*
+- libarrow-dataset==16.1.0.*
+- libarrow==16.1.0.*
 - libcufile-dev=1.4.0.31
 - libcufile=1.4.0.31
 - libcurand-dev=10.3.0.86
 - libcurand=10.3.0.86
 - libkvikio==24.6.*
-- libparquet==16.0.0.*
+- libparquet==16.1.0.*
 - librdkafka>=1.9.0,<1.10.0a0
 - librmm==24.6.*
 - make
@@ -66,7 +66,7 @@ dependencies:
 - pip
 - pre-commit
 - ptxcompiler
-- pyarrow==16.0.0.*
+- pyarrow==16.1.0.*
 - pydata-sphinx-theme!=0.14.2
 - pytest-benchmark
 - pytest-cases>=3.8.2
diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml
index d06a727f331..89eac98f652 100644
--- a/conda/environments/all_cuda-122_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-122_arch-x86_64.yaml
@@ -37,13 +37,13 @@ dependencies:
 - hypothesis
 - identify>=2.5.20
 - ipython
-- libarrow-acero==16.0.0.*
-- libarrow-dataset==16.0.0.*
-- libarrow==16.0.0.*
+- libarrow-acero==16.1.0.*
+- libarrow-dataset==16.1.0.*
+- libarrow==16.1.0.*
 - libcufile-dev
 - libcurand-dev
 - libkvikio==24.6.*
-- libparquet==16.0.0.*
+- libparquet==16.1.0.*
 - librdkafka>=1.9.0,<1.10.0a0
 - librmm==24.6.*
 - make
@@ -63,7 +63,7 @@ dependencies:
 - pandoc
 - pip
 - pre-commit
-- pyarrow==16.0.0.*
+- pyarrow==16.1.0.*
 - pydata-sphinx-theme!=0.14.2
 - pynvjitlink
 - pytest-benchmark
diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml
index 12e29c77a98..e7245e67659 100644
--- a/conda/recipes/cudf/meta.yaml
+++ b/conda/recipes/cudf/meta.yaml
@@ -64,7 +64,7 @@ requirements:
     - scikit-build-core >=0.7.0
     - dlpack >=0.8,<1.0
     - numpy 1.23
-    - pyarrow ==16.0.0.*
+    - pyarrow ==16.1.0.*
     - libcudf ={{ version }}
     - rmm ={{ minor_version }}
     {% if cuda_major == "11" %}
@@ -82,7 +82,7 @@ requirements:
     - cupy >=12.0.0
     - numba >=0.57
     - {{ pin_compatible('numpy', max_pin='x') }}
-    - {{ pin_compatible('pyarrow', max_pin='x') }}
+    - {{ pin_compatible('pyarrow', max_pin='x.x') }}
     - libcudf ={{ version }}
     - {{ pin_compatible('rmm', max_pin='x.x') }}
     - fsspec >=0.6.0
diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml
index 61ffcf3c3de..c01178bf732 100644
--- a/conda/recipes/libcudf/conda_build_config.yaml
+++ b/conda/recipes/libcudf/conda_build_config.yaml
@@ -20,7 +20,7 @@ cmake_version:
   - ">=3.26.4"
 
 libarrow_version:
-  - "==16.0.0"
+  - "==16.1.0"
 
 dlpack_version:
   - ">=0.8,<1.0"
diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml
index ad2e840c71d..76115362b6c 100644
--- a/conda/recipes/libcudf/meta.yaml
+++ b/conda/recipes/libcudf/meta.yaml
@@ -86,9 +86,6 @@ outputs:
         {% else %}
         - {{ compiler('cuda') }}
         {% endif %}
-        # TODO: start taking libarrow's run exports again wwhen they're correct for 16.0
-        # ref: https://github.com/conda-forge/arrow-cpp-feedstock/issues/1418
-        - libarrow
     requirements:
       build:
         - cmake {{ cmake_version }}
@@ -108,12 +105,6 @@ outputs:
         - librmm ={{ minor_version }}
         - libkvikio ={{ minor_version }}
         - dlpack {{ dlpack_version }}
-        # TODO: start taking libarrow's run exports again wwhen they're correct for 16.0
-        # ref: https://github.com/conda-forge/arrow-cpp-feedstock/issues/1418
-        - libarrow>=16.0.0,<16.1.0a0
-        - libarrow-acero>=16.0.0,<16.1.0a0
-        - libarrow-dataset>=16.0.0,<16.1.0a0
-        - libparquet>=16.0.0,<16.1.0a0
     test:
       commands:
         - test -f $PREFIX/lib/libcudf.so
diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake
index 73e66cce608..0afdc526981 100644
--- a/cpp/cmake/thirdparty/get_arrow.cmake
+++ b/cpp/cmake/thirdparty/get_arrow.cmake
@@ -430,7 +430,7 @@ if(NOT DEFINED CUDF_VERSION_Arrow)
   set(CUDF_VERSION_Arrow
       # This version must be kept in sync with the libarrow version pinned for builds in
       # dependencies.yaml.
-      16.0.0
+      16.1.0
       CACHE STRING "The version of Arrow to find (or build)"
   )
 endif()
diff --git a/dependencies.yaml b/dependencies.yaml
index f20c1591e73..0844d86fb66 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -289,7 +289,7 @@ dependencies:
           - cython>=3.0.3
           # Hard pin the patch version used during the build. This must be kept
           # in sync with the version pinned in get_arrow.cmake.
-          - pyarrow==16.0.0.*
+          - pyarrow==16.1.0.*
       - output_types: conda
         packages:
           - scikit-build-core>=0.7.0
@@ -332,25 +332,25 @@ dependencies:
         packages:
           # Hard pin the Arrow patch version used during the build. This must
           # be kept in sync with the version pinned in get_arrow.cmake.
-          - libarrow-acero==16.0.0.*
-          - libarrow-dataset==16.0.0.*
-          - libarrow==16.0.0.*
-          - libparquet==16.0.0.*
+          - libarrow-acero==16.1.0.*
+          - libarrow-dataset==16.1.0.*
+          - libarrow==16.1.0.*
+          - libparquet==16.1.0.*
   libarrow_run:
     common:
       - output_types: conda
         packages:
-          # Allow runtime version to float up to minor version
-          - libarrow-acero>=16.0.0,<16.1.0a0
-          - libarrow-dataset>=16.0.0,<16.1.0a0
-          - libarrow>=16.0.0,<16.1.0a0
-          - libparquet>=16.0.0,<16.1.0a0
+          # Allow runtime version to float up to patch version
+          - libarrow-acero>=16.1.0,<16.2.0a0
+          - libarrow-dataset>=16.1.0,<16.2.0a0
+          - libarrow>=16.1.0,<16.2.0a0
+          - libparquet>=16.1.0,<16.2.0a0
   pyarrow_run:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
           # Allow runtime version to float up to patch version
-          - pyarrow>=16.0.0,<16.1.0a0
+          - pyarrow>=16.1.0,<16.2.0a0
   cuda_version:
     specific:
       - output_types: conda
diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py
index a9bca7d8b98..83b7353ad89 100644
--- a/python/cudf/cudf/tests/test_orc.py
+++ b/python/cudf/cudf/tests/test_orc.py
@@ -1833,6 +1833,9 @@ def test_orc_writer_negative_timestamp(negative_timestamp_df):
     )
 
 
+@pytest.mark.skip(
+    reason="Bug specific to rockylinux8: https://github.com/rapidsai/cudf/issues/15802",
+)
 def test_orc_reader_apache_negative_timestamp(datadir):
     path = datadir / "TestOrcFile.apache_timestamp.orc"
 
diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml
index 826362f0632..38aa6eeb24e 100644
--- a/python/cudf/pyproject.toml
+++ b/python/cudf/pyproject.toml
@@ -7,7 +7,7 @@ requires = [
     "cython>=3.0.3",
     "ninja",
     "numpy==1.23.*",
-    "pyarrow==16.0.0.*",
+    "pyarrow==16.1.0.*",
     "rmm==24.6.*",
     "scikit-build-core[pyproject]>=0.7.0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
@@ -34,7 +34,7 @@ dependencies = [
     "packaging",
     "pandas>=2.0,<2.2.3dev0",
     "ptxcompiler",
-    "pyarrow>=16.0.0,<16.1.0a0",
+    "pyarrow>=16.1.0,<16.2.0a0",
     "rich",
     "rmm==24.6.*",
     "typing_extensions>=4.0.0",
diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml
index 787dd8a97d7..80e30e000c0 100644
--- a/python/cudf_kafka/pyproject.toml
+++ b/python/cudf_kafka/pyproject.toml
@@ -7,7 +7,7 @@ requires = [
     "cython>=3.0.3",
     "ninja",
     "numpy==1.23.*",
-    "pyarrow==16.0.0.*",
+    "pyarrow==16.1.0.*",
     "scikit-build-core[pyproject]>=0.7.0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 

From fea8fd611f38dc2610d97caded44b17905efbfa5 Mon Sep 17 00:00:00 2001
From: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com>
Date: Tue, 21 May 2024 17:51:38 -0500
Subject: [PATCH 242/842] Add multithreaded parquet reader benchmarks. (#15585)

Addresses:  https://github.com/rapidsai/cudf/issues/12700

Adds multithreaded benchmarks for the parquet reader.  Separate benchmarks for the chunked and non-chunked readers.  In both cases, the primary cases are 2, 4 and 8 threads running reads at the same time.   There is not a ton of variability in the other benchmarking axes.

The primary use of this particular benchmark is to see inter-kernel performance (that is, how well do our many different kernel types coexist with each other).  Whereas normal benchmarks tend to be more for intra-kernel performance checking.

NVTX ranges are included to help visually group the bundles of reads together in nsight-sys.   I also posted a new issue which would help along these lines: https://github.com/rapidsai/cudf/issues/15575

Update:  I've tweaked some of the numbers to demonstrate some mild performance improvements as we go up in thread count, and included 1-thread as a case.  Some examples:

```
## parquet_multithreaded_read_decode_mixed
| cardinality | total_data_size | num_threads | num_cols | bytes_per_second |
|-------------|-----------------|-------------|----------|------------------|
|        1000 |       536870912 |           1 |        4 |      28874731473 |
|        1000 |      1073741824 |           1 |        4 |      30564139526 |
|        1000 |       536870912 |           2 |        4 |      29399214255 |
|        1000 |      1073741824 |           2 |        4 |      31486327920 |
|        1000 |       536870912 |           4 |        4 |      27009769400 |
|        1000 |      1073741824 |           4 |        4 |      32234841632 |
|        1000 |       536870912 |           8 |        4 |      24416650118 |
|        1000 |      1073741824 |           8 |        4 |      30841124677 |
```

```
## parquet_multithreaded_read_decode_chunked_string
| cardinality | total_data_size | num_threads | num_cols | bytes_per_second |
|-------------|-----------------|-------------|----------|------------------|
|        1000 |       536870912 |           1 |        4 |      14637004584 |
|        1000 |      1073741824 |           1 |        4 |      16025843421 |
|        1000 |       536870912 |           2 |        4 |      15333491977 |
|        1000 |      1073741824 |           2 |        4 |      17164197747 |
|        1000 |       536870912 |           4 |        4 |      16556300728 |
|        1000 |      1073741824 |           4 |        4 |      17711338934 |
|        1000 |       536870912 |           8 |        4 |      15788371298 |
|        1000 |      1073741824 |           8 |        4 |      17911649578 |
```

In addition, this benchmark clearly shows multi-thread only regressions. An example case below using the pageable-error-code regression we've seen in the past.

Example without regression:
```

## parquet_multithreaded_read_decode_chunked_fixed_width
total_data_size | num_threads | bytes_per_second |
----------------|-------------|------------------|
      536870912 |           1 |      25681728660 |
     1073741824 |           1 |      26281335927 |
      536870912 |           2 |      25597258848 |
     1073741824 |           2 |      26733626352 |
      536870912 |           4 |      25190211717 |
     1073741824 |           4 |      28117411682 |
      536870912 |           8 |      25805791994 |
     1073741824 |           8 |      27788485204 |
```

Example with regression (pageable error-code return values):

```
## parquet_multithreaded_read_decode_chunked_fixed_width
total_data_size | num_threads | bytes_per_second |
-----------------|------------|------------------|
       536870912 |          1 |      25660470283 |
      1073741824 |          1 |      26146862480 |
       536870912 |          2 |      25040145602 |
      1073741824 |          2 |      25460591520 |
       536870912 |          4 |      22917046969 |
      1073741824 |          4 |      24922624784 |
       536870912 |          8 |      20529770200 |
      1073741824 |          8 |      23333751767 |
```

In both cases, we can see that the single-thread case remains the same but there's a regression in the multi-thread case. particularly with 4 threads.

Authors:
  - https://github.com/nvdbaranec
  - Mike Wilson (https://github.com/hyperbolic2346)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Vukasin Milovanovic (https://github.com/vuule)
  - Mike Wilson (https://github.com/hyperbolic2346)

URL: https://github.com/rapidsai/cudf/pull/15585
---
 cpp/benchmarks/CMakeLists.txt                 |   5 +
 cpp/benchmarks/io/cuio_common.hpp             |   4 +
 .../io/parquet/parquet_reader_multithread.cpp | 351 ++++++++++++++++++
 .../cudf}/utilities/thread_pool.hpp           |   0
 cpp/src/io/utilities/file_io_utilities.hpp    |   4 +-
 5 files changed, 362 insertions(+), 2 deletions(-)
 create mode 100644 cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp
 rename cpp/{src/io => include/cudf}/utilities/thread_pool.hpp (100%)

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 4586a12f466..170cf27b72b 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -256,6 +256,11 @@ ConfigureNVBench(
   PARQUET_READER_NVBENCH io/parquet/parquet_reader_input.cpp io/parquet/parquet_reader_options.cpp
 )
 
+# ##################################################################################################
+# * parquet multithread reader benchmark
+# ----------------------------------------------------------------------
+ConfigureNVBench(PARQUET_MULTITHREAD_READER_NVBENCH io/parquet/parquet_reader_multithread.cpp)
+
 # ##################################################################################################
 # * orc reader benchmark --------------------------------------------------------------------------
 ConfigureNVBench(ORC_READER_NVBENCH io/orc/orc_reader_input.cpp io/orc/orc_reader_options.cpp)
diff --git a/cpp/benchmarks/io/cuio_common.hpp b/cpp/benchmarks/io/cuio_common.hpp
index 3d5be41e25f..6e0b32219ce 100644
--- a/cpp/benchmarks/io/cuio_common.hpp
+++ b/cpp/benchmarks/io/cuio_common.hpp
@@ -39,6 +39,10 @@ class cuio_source_sink_pair {
     // delete the temporary file
     std::remove(file_name.c_str());
   }
+  // move constructor
+  cuio_source_sink_pair(cuio_source_sink_pair&& ss)            = default;
+  cuio_source_sink_pair& operator=(cuio_source_sink_pair&& ss) = default;
+
   /**
    * @brief Created a source info of the set type
    *
diff --git a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp
new file mode 100644
index 00000000000..fbdcfb0ade9
--- /dev/null
+++ b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp
@@ -0,0 +1,351 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+#include <benchmarks/io/cuio_common.hpp>
+#include <benchmarks/io/nvbench_helpers.hpp>
+
+#include <cudf/detail/utilities/stream_pool.hpp>
+#include <cudf/io/memory_resource.hpp>
+#include <cudf/io/parquet.hpp>
+#include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/thread_pool.hpp>
+
+#include <rmm/mr/device/pool_memory_resource.hpp>
+#include <rmm/mr/pinned_host_memory_resource.hpp>
+#include <rmm/resource_ref.hpp>
+
+#include <nvtx3/nvtx3.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+#include <vector>
+
+// TODO: remove this once pinned/pooled is enabled by default in cuIO
+void set_cuio_host_pinned_pool()
+{
+  using host_pooled_mr = rmm::mr::pool_memory_resource<rmm::mr::pinned_host_memory_resource>;
+  static std::shared_ptr<host_pooled_mr> mr = std::make_shared<host_pooled_mr>(
+    std::make_shared<rmm::mr::pinned_host_memory_resource>().get(), 256ul * 1024 * 1024);
+  cudf::io::set_host_memory_resource(*mr);
+}
+
+size_t get_num_reads(nvbench::state const& state) { return state.get_int64("num_threads"); }
+
+size_t get_read_size(nvbench::state const& state)
+{
+  auto const num_reads = get_num_reads(state);
+  return state.get_int64("total_data_size") / num_reads;
+}
+
+std::string get_label(std::string const& test_name, nvbench::state const& state)
+{
+  auto const num_cols       = state.get_int64("num_cols");
+  size_t const read_size_mb = get_read_size(state) / (1024 * 1024);
+  return {test_name + ", " + std::to_string(num_cols) + " columns, " +
+          std::to_string(state.get_int64("num_threads")) + " threads " + " (" +
+          std::to_string(read_size_mb) + " MB each)"};
+}
+
+std::tuple<std::vector<cuio_source_sink_pair>, size_t, size_t> write_file_data(
+  nvbench::state& state, std::vector<cudf::type_id> const& d_types)
+{
+  cudf::size_type const cardinality = state.get_int64("cardinality");
+  cudf::size_type const run_length  = state.get_int64("run_length");
+  cudf::size_type const num_cols    = state.get_int64("num_cols");
+  size_t const num_files            = get_num_reads(state);
+  size_t const per_file_data_size   = get_read_size(state);
+
+  std::vector<cuio_source_sink_pair> source_sink_vector;
+
+  size_t total_file_size = 0;
+
+  for (size_t i = 0; i < num_files; ++i) {
+    cuio_source_sink_pair source_sink{cudf::io::io_type::HOST_BUFFER};
+
+    auto const tbl = create_random_table(
+      cycle_dtypes(d_types, num_cols),
+      table_size_bytes{per_file_data_size},
+      data_profile_builder().cardinality(cardinality).avg_run_length(run_length));
+    auto const view = tbl->view();
+
+    cudf::io::parquet_writer_options write_opts =
+      cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view)
+        .compression(cudf::io::compression_type::SNAPPY)
+        .max_page_size_rows(50000)
+        .max_page_size_bytes(1024 * 1024);
+
+    cudf::io::write_parquet(write_opts);
+    total_file_size += source_sink.size();
+
+    source_sink_vector.push_back(std::move(source_sink));
+  }
+
+  return {std::move(source_sink_vector), total_file_size, num_files};
+}
+
+void BM_parquet_multithreaded_read_common(nvbench::state& state,
+                                          std::vector<cudf::type_id> const& d_types,
+                                          std::string const& label)
+{
+  size_t const data_size = state.get_int64("total_data_size");
+  auto const num_threads = state.get_int64("num_threads");
+
+  set_cuio_host_pinned_pool();
+
+  auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads);
+  cudf::detail::thread_pool threads(num_threads);
+
+  auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types);
+
+  auto mem_stats_logger = cudf::memory_stats_logger();
+
+  nvtxRangePushA(("(read) " + label).c_str());
+  state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer,
+             [&](nvbench::launch& launch, auto& timer) {
+               auto read_func = [&](int index) {
+                 auto const stream = streams[index % num_threads];
+                 auto& source_sink = source_sink_vector[index];
+                 cudf::io::parquet_reader_options read_opts =
+                   cudf::io::parquet_reader_options::builder(source_sink.make_source_info());
+                 cudf::io::read_parquet(read_opts, stream, rmm::mr::get_current_device_resource());
+               };
+
+               threads.paused = true;
+               for (size_t i = 0; i < num_files; ++i) {
+                 threads.submit(read_func, i);
+               }
+               timer.start();
+               threads.paused = false;
+               threads.wait_for_tasks();
+               cudf::detail::join_streams(streams, cudf::get_default_stream());
+               timer.stop();
+             });
+  nvtxRangePop();
+
+  auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
+  state.add_element_count(static_cast<double>(data_size) / time, "bytes_per_second");
+  state.add_buffer_size(
+    mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
+  state.add_buffer_size(total_file_size, "encoded_file_size", "encoded_file_size");
+}
+
+void BM_parquet_multithreaded_read_mixed(nvbench::state& state)
+{
+  auto label = get_label("mixed", state);
+  nvtxRangePushA(label.c_str());
+  BM_parquet_multithreaded_read_common(
+    state, {cudf::type_id::INT32, cudf::type_id::DECIMAL64, cudf::type_id::STRING}, label);
+  nvtxRangePop();
+}
+
+void BM_parquet_multithreaded_read_fixed_width(nvbench::state& state)
+{
+  auto label = get_label("fixed width", state);
+  nvtxRangePushA(label.c_str());
+  BM_parquet_multithreaded_read_common(state, {cudf::type_id::INT32}, label);
+  nvtxRangePop();
+}
+
+void BM_parquet_multithreaded_read_string(nvbench::state& state)
+{
+  auto label = get_label("string", state);
+  nvtxRangePushA(label.c_str());
+  BM_parquet_multithreaded_read_common(state, {cudf::type_id::STRING}, label);
+  nvtxRangePop();
+}
+
+void BM_parquet_multithreaded_read_list(nvbench::state& state)
+{
+  auto label = get_label("list", state);
+  nvtxRangePushA(label.c_str());
+  BM_parquet_multithreaded_read_common(state, {cudf::type_id::LIST}, label);
+  nvtxRangePop();
+}
+
+void BM_parquet_multithreaded_read_chunked_common(nvbench::state& state,
+                                                  std::vector<cudf::type_id> const& d_types,
+                                                  std::string const& label)
+{
+  size_t const data_size    = state.get_int64("total_data_size");
+  auto const num_threads    = state.get_int64("num_threads");
+  size_t const input_limit  = state.get_int64("input_limit");
+  size_t const output_limit = state.get_int64("output_limit");
+
+  set_cuio_host_pinned_pool();
+
+  auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads);
+  cudf::detail::thread_pool threads(num_threads);
+  auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types);
+
+  auto mem_stats_logger = cudf::memory_stats_logger();
+
+  nvtxRangePushA(("(read) " + label).c_str());
+  std::vector<cudf::io::table_with_metadata> chunks;
+  state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer,
+             [&](nvbench::launch& launch, auto& timer) {
+               auto read_func = [&](int index) {
+                 auto const stream = streams[index % num_threads];
+                 auto& source_sink = source_sink_vector[index];
+                 cudf::io::parquet_reader_options read_opts =
+                   cudf::io::parquet_reader_options::builder(source_sink.make_source_info());
+                 // divide chunk limits by number of threads so the number of chunks produced is the
+                 // same for all cases. this seems better than the alternative, which is to keep the
+                 // limits the same. if we do that, as the number of threads goes up, the number of
+                 // chunks goes down - so are actually benchmarking the same thing in that case?
+                 auto reader = cudf::io::chunked_parquet_reader(
+                   output_limit / num_threads, input_limit / num_threads, read_opts, stream);
+
+                 // read all the chunks
+                 do {
+                   auto table = reader.read_chunk();
+                 } while (reader.has_next());
+               };
+
+               threads.paused = true;
+               for (size_t i = 0; i < num_files; ++i) {
+                 threads.submit(read_func, i);
+               }
+               timer.start();
+               threads.paused = false;
+               threads.wait_for_tasks();
+               cudf::detail::join_streams(streams, cudf::get_default_stream());
+               timer.stop();
+             });
+  nvtxRangePop();
+
+  auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
+  state.add_element_count(static_cast<double>(data_size) / time, "bytes_per_second");
+  state.add_buffer_size(
+    mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
+  state.add_buffer_size(total_file_size, "encoded_file_size", "encoded_file_size");
+}
+
+void BM_parquet_multithreaded_read_chunked_mixed(nvbench::state& state)
+{
+  auto label = get_label("mixed", state);
+  nvtxRangePushA(label.c_str());
+  BM_parquet_multithreaded_read_chunked_common(
+    state, {cudf::type_id::INT32, cudf::type_id::DECIMAL64, cudf::type_id::STRING}, label);
+  nvtxRangePop();
+}
+
+void BM_parquet_multithreaded_read_chunked_fixed_width(nvbench::state& state)
+{
+  auto label = get_label("mixed", state);
+  nvtxRangePushA(label.c_str());
+  BM_parquet_multithreaded_read_chunked_common(state, {cudf::type_id::INT32}, label);
+  nvtxRangePop();
+}
+
+void BM_parquet_multithreaded_read_chunked_string(nvbench::state& state)
+{
+  auto label = get_label("string", state);
+  nvtxRangePushA(label.c_str());
+  BM_parquet_multithreaded_read_chunked_common(state, {cudf::type_id::STRING}, label);
+  nvtxRangePop();
+}
+
+void BM_parquet_multithreaded_read_chunked_list(nvbench::state& state)
+{
+  auto label = get_label("list", state);
+  nvtxRangePushA(label.c_str());
+  BM_parquet_multithreaded_read_chunked_common(state, {cudf::type_id::LIST}, label);
+  nvtxRangePop();
+}
+
+// mixed data types: fixed width and strings
+NVBENCH_BENCH(BM_parquet_multithreaded_read_mixed)
+  .set_name("parquet_multithreaded_read_decode_mixed")
+  .set_min_samples(4)
+  .add_int64_axis("cardinality", {1000})
+  .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024})
+  .add_int64_axis("num_threads", {1, 2, 4, 8})
+  .add_int64_axis("num_cols", {4})
+  .add_int64_axis("run_length", {8});
+
+NVBENCH_BENCH(BM_parquet_multithreaded_read_fixed_width)
+  .set_name("parquet_multithreaded_read_decode_fixed_width")
+  .set_min_samples(4)
+  .add_int64_axis("cardinality", {1000})
+  .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024})
+  .add_int64_axis("num_threads", {1, 2, 4, 8})
+  .add_int64_axis("num_cols", {4})
+  .add_int64_axis("run_length", {8});
+
+NVBENCH_BENCH(BM_parquet_multithreaded_read_string)
+  .set_name("parquet_multithreaded_read_decode_string")
+  .set_min_samples(4)
+  .add_int64_axis("cardinality", {1000})
+  .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024})
+  .add_int64_axis("num_threads", {1, 2, 4, 8})
+  .add_int64_axis("num_cols", {4})
+  .add_int64_axis("run_length", {8});
+
+NVBENCH_BENCH(BM_parquet_multithreaded_read_list)
+  .set_name("parquet_multithreaded_read_decode_list")
+  .set_min_samples(4)
+  .add_int64_axis("cardinality", {1000})
+  .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024})
+  .add_int64_axis("num_threads", {1, 2, 4, 8})
+  .add_int64_axis("num_cols", {4})
+  .add_int64_axis("run_length", {8});
+
+// mixed data types: fixed width, strings
+NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_mixed)
+  .set_name("parquet_multithreaded_read_decode_chunked_mixed")
+  .set_min_samples(4)
+  .add_int64_axis("cardinality", {1000})
+  .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024})
+  .add_int64_axis("num_threads", {1, 2, 4, 8})
+  .add_int64_axis("num_cols", {4})
+  .add_int64_axis("run_length", {8})
+  .add_int64_axis("input_limit", {640 * 1024 * 1024})
+  .add_int64_axis("output_limit", {640 * 1024 * 1024});
+
+NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_fixed_width)
+  .set_name("parquet_multithreaded_read_decode_chunked_fixed_width")
+  .set_min_samples(4)
+  .add_int64_axis("cardinality", {1000})
+  .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024})
+  .add_int64_axis("num_threads", {1, 2, 4, 8})
+  .add_int64_axis("num_cols", {4})
+  .add_int64_axis("run_length", {8})
+  .add_int64_axis("input_limit", {640 * 1024 * 1024})
+  .add_int64_axis("output_limit", {640 * 1024 * 1024});
+
+NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_string)
+  .set_name("parquet_multithreaded_read_decode_chunked_string")
+  .set_min_samples(4)
+  .add_int64_axis("cardinality", {1000})
+  .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024})
+  .add_int64_axis("num_threads", {1, 2, 4, 8})
+  .add_int64_axis("num_cols", {4})
+  .add_int64_axis("run_length", {8})
+  .add_int64_axis("input_limit", {640 * 1024 * 1024})
+  .add_int64_axis("output_limit", {640 * 1024 * 1024});
+
+NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_list)
+  .set_name("parquet_multithreaded_read_decode_chunked_list")
+  .set_min_samples(4)
+  .add_int64_axis("cardinality", {1000})
+  .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024})
+  .add_int64_axis("num_threads", {1, 2, 4, 8})
+  .add_int64_axis("num_cols", {4})
+  .add_int64_axis("run_length", {8})
+  .add_int64_axis("input_limit", {640 * 1024 * 1024})
+  .add_int64_axis("output_limit", {640 * 1024 * 1024});
diff --git a/cpp/src/io/utilities/thread_pool.hpp b/cpp/include/cudf/utilities/thread_pool.hpp
similarity index 100%
rename from cpp/src/io/utilities/thread_pool.hpp
rename to cpp/include/cudf/utilities/thread_pool.hpp
diff --git a/cpp/src/io/utilities/file_io_utilities.hpp b/cpp/src/io/utilities/file_io_utilities.hpp
index 74a2ae53961..91ef41fba6e 100644
--- a/cpp/src/io/utilities/file_io_utilities.hpp
+++ b/cpp/src/io/utilities/file_io_utilities.hpp
@@ -17,10 +17,10 @@
 #pragma once
 
 #ifdef CUFILE_FOUND
-#include "thread_pool.hpp"
-
 #include <cudf_test/file_utilities.hpp>
 
+#include <cudf/utilities/thread_pool.hpp>
+
 #include <cufile.h>
 #endif
 

From 9a0612b3add9c76ea8cb45cc230b75b2474d91f7 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Tue, 21 May 2024 17:09:00 -0700
Subject: [PATCH 243/842] Fix row group alignment in ORC writer (#15789)

Closes https://github.com/rapidsai/cudf/issues/15775

ORC writer encodes null mask bits in multiples of eight to avoid issues with other readers reading partial encoded bytes. When this does not align with row groups, the null mask encode boundaries are moved to align to multiples of eight. There was a bug in the alignment code that caused a pointless shift by 8 bits and, then, issues in encode. This PR fixes the unnecessary shift.

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15789
---
 cpp/src/io/orc/writer_impl.cu                    |  12 ++++++++++--
 .../data/orc/TestOrcFile.MapManyNulls.parquet    | Bin 0 -> 6353 bytes
 python/cudf/cudf/tests/test_orc.py               |  13 +++++++++++++
 3 files changed, 23 insertions(+), 2 deletions(-)
 create mode 100644 python/cudf/cudf/tests/data/orc/TestOrcFile.MapManyNulls.parquet

diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu
index 750a593920c..344e216cdc8 100644
--- a/cpp/src/io/orc/writer_impl.cu
+++ b/cpp/src/io/orc/writer_impl.cu
@@ -782,8 +782,16 @@ std::vector<std::vector<rowgroup_rows>> calculate_aligned_rowgroup_bounds(
         } else {
           // pushdown mask present; null mask bits w/ set pushdown mask bits will be encoded
           // Use the number of set bits in pushdown mask as size
-          auto bits_to_borrow =
-            8 - (d_pd_set_counts[rg_idx][parent_col_idx] - previously_borrowed) % 8;
+          auto bits_to_borrow = [&]() {
+            auto const parent_valid_count = d_pd_set_counts[rg_idx][parent_col_idx];
+            if (parent_valid_count < previously_borrowed) {
+              // Borrow to make an empty rowgroup
+              return previously_borrowed - parent_valid_count;
+            }
+            auto const misalignment = (parent_valid_count - previously_borrowed) % 8;
+            return (8 - misalignment) % 8;
+          }();
+
           if (bits_to_borrow == 0) {
             // Didn't borrow any bits for this rowgroup
             previously_borrowed = 0;
diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.MapManyNulls.parquet b/python/cudf/cudf/tests/data/orc/TestOrcFile.MapManyNulls.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..a80ce5fbd25837e1e25991dd14e9d2bed849c651
GIT binary patch
literal 6353
zcmaLcd0Z36qX6)5#So7I0)~hRDT$i}5fm>(E&)SSL<EGWXd(&8ilBg26bwX&Ve3Hx
ziOFg;S(9~lyzs&c%g;h9RjajHZ>o6Vg;H-k%3FTFzTf-f{oc!GKiSFb?99&0?#z5Q
zsc~uHKCV9AKHf9vohyUk#9%Pu`@Z6Q7(UO#=`F<P#5O1TAk&A*V9+TT^iRfZcLqak
zM^OZ&bV?vI*y|0#OcK@uIJaIWw&{cw*#Kn=*cJ}kX12n-0B2JbnXGiz3EZ(%o0Mlx
zO=c@3kVXlT*peD0Y<34oa-~KlCkP1zA$C1ihGYUI#X|r?M_FM#0;4)u;)M|a7Xkxn
z)n=-d2(TFd;9DgmB!z6afbERSDFCBEL_8#N$JqoKs8nW>np}<vXjwdJ5zMOf$^rmb
zV*{`tVAU9KvmIAbumUCs4Mv(#*o279frdaKi4X|KiW4YB2z?2MLvL`f{k&09YEhY!
zV69O}sFf&cH5t+gX`qgfxU-n<xWo>@7$?cZ$5DQoO(;gHAYGC$7f&<ds3g}x=)5yg
zfdB&#ViaRs7%|#-2CF+32LY>`LfBlfm}><Z0&rB27C6&kfN+5hhedF9s?5)7C1D<6
zsK7yzoek97Tnx)ZNz9mP<q|*zBX%Q&!I-_8BDqGEN#zVE9OG7!8b=b$Lrk-^fnHdR
z8L4$r2}$>KfDtMi@ZfVRStQ2JCIkW_q0g2JQCx}hfXxKB6pCge3b?vTWmoF5Sg@-=
zhm(YoM_;VU%T4XDK?(@x)Ud4vp&*mW3XnmGlotxQ8FZPX(f}j`T9IiZK$1*FAifD4
zQlAD@=qaM!!j#z|2M=ba`zbUmDWXe&DJ&UvLjyfAHZzl5;VCABSrtBMNpz2s9wxmz
zK<Ub%sKB0`gt!V3B3D4DjT()Q8=S}$@e<6^3J1WfVoWOzqBI$(#y5zE`x*jxOn|4>
z)EI&|u!`<uhy#R#1*gnBDh-ehzE~LK=B#taDV(g~NdqWn7$7{7G?S8ek_7RHTxSwe
zV!k-b0$01a`6=TqvuzIGjN)7XtC32GKoE*jNKr&Go0U|Z<jGLf+OZ%U#%pn<QIF7#
zAQ75L3#zKnQZ_gl)}fR&(_0TSC<;exxWk44jwlhqn3y85USQ|dh*UVi6DbqyRbq`e
z+qH&rhHP54U2FvinJvkpoPY{))bgDJ3~CbMyPGY68Xr<;NR|M!3TK(PN-m(7)u7%F
zqVF)bY@}WzQ}8_`$&`q+N&pPRm_ux3RmuEh1Wu|Xnk<Ab4geZpU<3uiDt1y5EU}^l
zic)|ydq8QFMyDZaL|n04gD1mwUyYP<;{iD#NmAR)nI4(e1p2YC14xs~O^da{IEw31
z;YxWL!A$@x3ScBAOw#HwQY-aCFgHw$nUqPfR@6_a4A40soHW;HD7KJ;D{ug+<SLF>
z2?*R(QOm=eGJq0yPbDiczJPK#f()8ufB}V^4Wn6n*jJ4y$a*5Ks#>6Bah#KJy3e$*
z4+vl;vn&+fFV!6gjW%-yh1Mu6vW#?{5q4t(Ukd^3hImo}DFijP3M))eFi8moAgCq_
z)f**%8E7FfMH*4XQ^+VTTa_TtX4w}xxDsnRM_+5MmCG`<+!|*Atj4ogzyK&1HDTIp
z)B@u+Dx0!V04E5egv9t>T*PJ*<;qCdoWz36kdEmr)M#Ldss^T*Z~%R-oYE*~8(^hW
z133sU9MaTa*+h-Q0Z4&Nk8?0_G6}T4I<SZbq3LL@UY5b5Y!J_6slpL;K(2=k0rV&t
zh`K99$!4vNN=wedNR`41He0M@mPCfKR0$4TLiz%;H^x?(9Rw>O$P#Fi5MlsOSzvZN
z0U0oFrHqXe4nmQLr~<^^mKs2JKTkrq(lv7y8}zu1fODyICzUdR@-;bx1T3_e)F=m}
zVi>ntLb+240wwHPmQI2O0fHyao{cC85yL};#yW)Y3N5MN#*!MbM2BdIY}`)=bQ$Jq
zV3i_5N~%di5==>{Kq!GR6ABYtO>R0+LJCZ%G>A;D?qm|A0wza>QfUW)6q-QER0ddP
zQ7bVj&1B)2RXCZ;)-e%{n1|UJVjaff==>nU&W25pK_-SU#EaltF+ml{m(|<2sv4#l
zP5`yQBtl`bTIL~?AgPN~dU_xLK+KilHmuqb3u7oiUAZj!nlJ%iBN_TC%p}2Ts&O3C
z^Q(lo8qgzJ6$Vk>L0XH_2&^{5kN~A(HZLzfF|gPwfJuwdlS($9zziyrLJ8o?8X)2T
z*e8~xM2ILrh*FdiLbPZNuCY2HY(#HG0fvh$c$|vi>MIr!QnN;El!!@ltVoocL_m~`
zn<1#w3P{YX#;w&Pz)dK-$}U2=d`B{gSEB|CJKh3drI%GjA{J*Wx(JXe2M)xz9e8L&
zY4kK{MKh^DaW*Q4>J0%zkU3Cak1YbuI#NTz%vgX*GzJXzaMb_^lNd}^S$s0&8>rya
z6Kt<~bCtG|NlJtQR1M@_x!gpXA{S(<;;m-F#Ad@>07yG&%K{85WuvEmoB}E&fs`dz
z5vf{IfWuYUOo0IA!%Udu^MppAwOH+_#KKdF<Cv5e&oWygpq!~{F>Zk!az1XRv_v2X
zWPAEKq%e^vB=qzc;$;<Lh<bJ{Txlk8fgP?f!7|*g5~1u|(u&IgA~TqAh9(}iIVc4R
z?1U+iiU$@T69Pm6a9E;H2w@ElYH^3Yrn25l_~Gn82|Y*gVQxI09dE<gz!96ws|5l+
z>}=AQ74+ci<)CMTz6S<Jxw#~z#%=Ts3YZc^1&|DYv&?Zq*e<mCSsk#|y$aZaloB6<
zO2u)ng&_lq$QCJuFljRDC^ITUIc&340E|_9!be-53F0*Zj6jY2WV-PaWrUN>7MN)O
zn35^V5;zd|TA4zd4SVwGp7d1Ia?yIkP>8v^(W4fqQr3d{8eTRLL{x&>K#yvs#SVPo
z3|N`X=W1(V5eQNNx^_4mD~AOfRAn*1q&e4FNa1dXwh9x`k6V%jrAHwE;Q&ixnmw}s
z#U?2n;>05kok|Gfgvr@S-In-^ZQ0~D&ZE@iJMBfvD{rQq5LG7LdAcR@c*M{v?_aSm
zML3`24Q~HzSTee)FnF66w&YdYt@f@C=I3R9g-Fy-JdP|{x8u9|!<wxx-`$QW$=Ftr
zT(-6Cv#fAh*{Vm3^V#Mtl~cdnu(9d9EWEV-N4wXEhJ?7mzZCPQ+``u!4_5czbVvL)
zuKC01$oj8We05B6&FRwNlNp`1#p#XnBAnlOj~neB@n+R9N!i^4wS`kQrtLZ2O!=H3
z9qGHy%<lPdi~7*$&EFIs)1COWr8mvDFtg2Oyu2jp*u@vzhOQ%z0(_@G;13^fQ09mp
z6#UV9{=0tX<ueq&mB9BqJ#ve>TY@{(FN4Ecs@HUUH}<#BNAuK=Z;op$zkRm(-pv~#
zUYqFW<4F$~P2InqE4n>27F&5?`hg_jZ!Yb@yNg&n{^Xk3tEK82{gy4%Emrz+S2Rwz
zT9|wFY&U=NuE#NtUGSLDp!Uosqgq?n4tsL6DkMO={yOuwOW!YfXq?NP8}{wCA2K&b
z2-NbK{0na;b)262$1krFrUkUl&l?(U+j~VYrlq~Y>AB_Io&1q&2cOz%&p$DDU-X-z
z?iayN2TcxI+rM+ZsXm9hR_(J?+5V51k(K^Er=1^9im*!WUs2TdANq{#IsBy4$t~k7
z<ia0I?&a_Ho&?>jx_r~n)uPz5Eh_dIan-)=Ww>3v;^=a9`Ki$zCq`e7%M%_NG-SAJ
zYMyb>hwAlby9T~Cs$;#IveHbm2Kyd4#9$M%r<VROzV@TIu57e&%`)>?S9MiG^61R1
zyB2)<^W8U%nuc%ZvOKRwEF%kabIe^U*GZSOzsTc7t=X`5Tx;*__MZ81FN5$6y=tc_
zTTlO8lbz%I2CWny3CRpPeeY=@gVqPjhc&~;xlbs4AKE6@N7CE>W0Or6)}UQ&`G3g_
zhp!lfbDKtA+7Fl1w@tU5r){$TB3sb^JAd|p>vv<m4s+Sk+B7qNpl-rIEVV;AmbOX%
zyoJ!ZvfgRzoSOWY>+UBCX7<_S=&qces5OT#4KGs9J-Mo9Ou;k8`Qvw^^=qS1-Q=9X
zciz3cH|1m<+Mu1QUbW<t?Kkj|-#za+Bn|1<Ig;)2YDLe-m~%mm@li9}@ArTIJW#o6
zZc|H%|CX2Q2aKT>#jt7zXFUU-tg&G`BEG+?xqQfc_*Gd?bY9H6&UJV8m@h`tHn|;s
zIb8U-@SWPf;Z@_=p2ktWoo^lb-yDPdyw5$SZE`zlwAQ%ly}In|xo%TkdgJoIwnsSw
zS^eC1N*+W--}Ik$jsN-rf6kwyQVLt6&QF_?i_bB>-ZZb|A#vV&?pfL<`$GzhDR+jB
zE;JtC)!iL<GcKyRLEf|WtKrjU4YlrWc@O-{Lw<VZGSmv5E*bG`!_LSVkqc6b{TlA>
zfyW#*RNOzA^nkrzJ-p-(Z0_05+l{H++@rh8LsuU*?!6Pd{c!cEJ;Q%zT~)cfc!J*<
zLEGfvw9qlgmf7nAy>9-zt)SMabkd%I>CJz1mfD_q@JE~;@c35$1)YNN#`pRKE2eCE
zmH&FozCW*m;it}CT(Bl@a&^_x!};gu4+wS@w7vS=sT>p4Y*h_*Qy)1Hl?NU4ulcIM
zFlu~e*Zi=esKmWq=tQ^6ZJE#W>ze1i@3YCS7t>@*kE(qW{F^S#nOJe@`1|@B{fh0-
zk+H#p(uXO2U@b3bOPJiXW9B(GG~DwKHgj&DO&*P!{|}!z(;_$No_k!q<o?$=xZp6l
zX##DNHLg>Zyek>mss4Q(ZIjT%5u&dX%*+4rb~n~N3p<k%M6JuX&@v=w?7<<I!+8C^
zznA)RO<e>ha%ujl=n((jDeo-zd#;b6(mp+R@TZT8aC2#MKe+e)v0uKHhV{4v|Al=0
z{PFG2Iq{D_pNv<_+qO*klEP1!-wyxi<~FP>zJ2zAw;x^5S3`FPs|!v~Ji26c&nboU
zqI%)%0n@6d(Kh+@!Oe(+k*{kHt@>TRCX>w1nPPr?<6LRQPp3MfXHIVWwx|1_Hu=)?
zdi8<V5C7c0zt1K+`rXP)yfWeN*T0CC9(wmRZIcI8;+7QJCZ}AW1KR!%W;HxRJ};1D
z^*DX63VB$|ccE?aJcK{+_IKeLpU1ko<eZP^TJH}qjVK%5`=#HHAM5?1Ztah9Td;(-
z$v1qDoS9vs57wM#k<X@Yf0)w1YAqVJxbgnurIMEUYJN)C?jOK!v`t2rvZqZ8mfXE~
z6usEBZ;#m{YR;E+=UX~|?z2hDy_xWXA?ub?Zew1qI69BE$?drwbMGb`efZYCf1)OJ
z=$zPw=+_tTZ%B<fEl)c-BB6Hx^<u~KTT_zOpU0ZcNQbSt<F++r$F)^Su?-i7m6e6H
zJ%nC8zrL|$?^l!j@w|ik_EpJCIVt%k8h5(p)sEfjl+m1j$UXH~<#zsry?r+MVdhKh
zuRJ+>^2MxikwetmS2oqZi)*c(aE9gYLfhoAdHrag7h%kq1HS7SN86-#lCk4@#glVC
z=UEnKq;D`zc4O}wa^pR)*G+9&7`1YI&7TzwlaKcVKc<8uPXuk8e^zf?_!l$qyJI)=
z(i&TNdBvxOvP%<2uFrb2=<SGkf7@h5apsv38(jpEYJC2H-IqaE@a8Sp`UT(K2z_lk
zwV><IuHA<^-~T?Z=_cQ6#OebFW|R~ii>7T7bvtAO(?!_Q0xvADnN0PI4(q)P<XtUg
zeKuLHnfUPi;6qT$#;w_nKRb>zH%%zg-{)+fHS_sNt?!19i^f0~(l3-;D$W?wa{&7K
z>W$XdYt&)69tR4ZKAN|ewn@Gy`L<T~WlnO|O|pFD;Aq+=d*<g9qXSA4j=Wy@VaSVZ
z-nZE0w;wjL5+d5eJ8~YS>~22SXOp87>$ZBYl6|$a)A1-Tv2bZvig)|Q6L&8VV<o#>
zC!gKol2WI?-|+}8IhF2InxAld$hEh^_mM)&HT7oB)6_e+hF9l&asRS%*Tr3L&am1f
zn<Dl%_Ss~Kau;or^0TpTC&hoW{=>?lu}zxklJl!IkFpO9+i<ci<9hYW2I-PXPnK)u
zEJ-j;OMN*dCm1trrft%ao8GI8QrsF<c6r7%*MZNcLFM0HJCd<V`Jgwv=WGJCxcqYT
z;&!!i>)tt6E&kie?|mp4biQA~x|Ffo4>M++-^$`_7#7uO|0>ut@$%wXA$JM~H}dap
zpV0bYoUFcMX!%B_IXGeWuf1DloP$67DD#*yZ*A(bQvLUPj=VTAYedn#j;+-<Vq<ok
z+qcPe1ZyF@eaFb=ulj8A$F+q!e*SUi&aJdfCf!r-?|9VPib1EmBl>J|qWjO)muZ{a
z>=;bj<mrtA6(6szzj^uI_3Uw-TQ93jE0xW`l|O$HalW7I6Wi1!LS_UAzkTvWIdRRI
z3$#tTo!0hOU%Gqt<c@<W-yQ6;$&i#u@0V{|H~n`>%ARepGiJ<8YJ|SXMr4dX{nN<j
z+pnt+T7P`JAK&urpsF{b&BJM%e7;w`xvqWC)Eg(FR}H>f-h{L}cb{GM^m(b{^@)_%
zeKwhqR#wp5eKxZzJSBUxJfyxZ>?m!MOIPkJ>UGvNX^w^Vi#>TUr{%X>!5*$Zyo|at
zukKp+;;G3|L&8`3Upd;}>x#MRr_=q8EgUuBqIw@~lM#s>-{gV?rBfTOcbfY5kLJGo
zi+s>@AnPNeo9fVijK+^Qm##@Twm7-<UFv#L@G7P?yPrT1GY)1>KE@wA>i17)ZZz00
zNZQ9PogUZk$>6c=vlr4fIbNM|&=UD}1%K!DF72-Hec}B*hdGwV>`EMURKEdh8f=}k
zy-+>-*U^=?kI^;>-DrwjeDIA*bZ$@ydH0~aceTuL`xWDdsj%(gla&{m5AWKNcq;yx
zrFZd^mx$!ktLTLa#iY@iS08`9cSK)vy3Zy%+2N9pIlIP14xf5xLA>oy|4$ENXYzh+
z9?u``a_xufd0$+jyc{u~*3N2fzVK?(&|Gq_`rd)&e5jW>Z;5JmLB<Juz_z$iqaRFa
zpluRE+vMcoh_W1?I#nrclNhM(+trH~l}(!5bnW{29y3~ua=yCWF!H6bbzG;<`YUrk
zO>G(Z$0PK2+9n@2Cl+-&{dpiT^YYVM`Ta!mM;<A3uPsviv>?WF+_GNMGw;{o7p|<W
zEBs@iTX*ECleA4fUD-Hx!K0wY%6+xr9@YIUZ=Mc`$%rXiFmA+|+F;ryrQi@d@P2UY
zikJ!hn^ZS0M}F8=;xg(9ZIks?*o>Rk(4-sZ_SZdoG4cM#?P<*0o>?cKu=ni6qc~HM
zw;Ps@ih13+#o6gB#PD(Un=fJ(FILKzN`0Ie<D5Az3m1pc3q(5X%5h&JFI~7&n!iF$
zFBp9W@EIHzI)=}f;orA*9Kd(xxc&3!-!Z=Q8*M*-zYqb#-)Ddy-<9b%u`jp($))dX
zA4Xp?-_|koCKxYZ`Hv@OKu+--r#MefhF?hE+jF{bdSTkPV=(^Dy#7`4-?`Z#Uppjt
zF-CBHjQ{xVVu-<E{KwpUA4~d57qIAnH-j#YBP=Y+3zaUD(k&2Lyi8iOB(!*WekdJS
zzCvCST2%N==u&x!G*?<8<yf88@JdRT$$62y;*z2ji%WPRJcUZ0pIgj}oVSKoAYIyb
zh^{al|F3laPQO&TjE<yhpZ@RjRRwu;)IXc=KSoY1EGUtx3W`%!<mdAu>GI^|A-n?m
zhg6nNm(v#!!u#)Q<3+A1FJDm3;Bfv|bymuYidBUL9JjHd5usxk9G}H2auor8#|Dgv
Z2n`z_8W!dkLU(CjuX@sro=X3j@n1^C^eO-V

literal 0
HcmV?d00001

diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py
index 83b7353ad89..b83b8f08a8b 100644
--- a/python/cudf/cudf/tests/test_orc.py
+++ b/python/cudf/cudf/tests/test_orc.py
@@ -1954,3 +1954,16 @@ def test_writer_lz4():
 
     got = pd.read_orc(buffer)
     assert_eq(gdf, got)
+
+
+def test_row_group_alignment(datadir):
+    path = datadir / "TestOrcFile.MapManyNulls.parquet"
+
+    expected = cudf.read_parquet(path)
+
+    buffer = BytesIO()
+    expected.to_orc(buffer)
+
+    got = cudf.read_orc(buffer)
+
+    assert_eq(expected, got)

From 217c73f7d34c84f786707f335b95ed1aae94e87a Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Wed, 22 May 2024 11:26:50 -0500
Subject: [PATCH 244/842] Deprecate `Groupby.collect` (#15808)

After we made our groupby fail more aggressively for unsupported types in https://github.com/rapidsai/cudf/pull/15712, `Groupby.collect` started to fail on string column, where this isn't a supported aggregation on string column in pandas and this method doesn't exist in pandas Groupby, hence this PR suggest the alternative equivalent and deprecates the API to be removed in next release.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Richard (Rick) Zamora (https://github.com/rjzamora)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Richard (Rick) Zamora (https://github.com/rjzamora)

URL: https://github.com/rapidsai/cudf/pull/15808
---
 python/cudf/cudf/core/groupby/groupby.py      | 12 ++++++++-
 python/dask_cudf/dask_cudf/expr/_groupby.py   | 26 +++++++++++--------
 python/dask_cudf/dask_cudf/groupby.py         | 23 +++++++++-------
 .../dask_cudf/dask_cudf/tests/test_groupby.py | 14 +++++++---
 4 files changed, 51 insertions(+), 24 deletions(-)

diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index 3e4b8192888..bf24864c29d 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -40,6 +40,15 @@
 from cudf.utils.utils import GetAttrGetItemMixin
 
 
+def _deprecate_collect():
+    warnings.warn(
+        "Groupby.collect is deprecated and "
+        "will be removed in a future version. "
+        "Use `.agg(list)` instead.",
+        FutureWarning,
+    )
+
+
 # The three functions below return the quantiles [25%, 50%, 75%]
 # respectively, which are called in the describe() method to output
 # the summary stats of a GroupBy object
@@ -2180,7 +2189,8 @@ def func(x):
     @_cudf_nvtx_annotate
     def collect(self):
         """Get a list of all the values for each column in each group."""
-        return self.agg("collect")
+        _deprecate_collect()
+        return self.agg(list)
 
     @_cudf_nvtx_annotate
     def unique(self):
diff --git a/python/dask_cudf/dask_cudf/expr/_groupby.py b/python/dask_cudf/dask_cudf/expr/_groupby.py
index 116893891e3..65688115b59 100644
--- a/python/dask_cudf/dask_cudf/expr/_groupby.py
+++ b/python/dask_cudf/dask_cudf/expr/_groupby.py
@@ -9,19 +9,21 @@
 
 from dask.dataframe.groupby import Aggregation
 
+from cudf.core.groupby.groupby import _deprecate_collect
+
 ##
 ## Custom groupby classes
 ##
 
 
-class Collect(SingleAggregation):
+class ListAgg(SingleAggregation):
     @staticmethod
     def groupby_chunk(arg):
-        return arg.agg("collect")
+        return arg.agg(list)
 
     @staticmethod
     def groupby_aggregate(arg):
-        gb = arg.agg("collect")
+        gb = arg.agg(list)
         if gb.ndim > 1:
             for col in gb.columns:
                 gb[col] = gb[col].list.concat()
@@ -30,10 +32,10 @@ def groupby_aggregate(arg):
             return gb.list.concat()
 
 
-collect_aggregation = Aggregation(
-    name="collect",
-    chunk=Collect.groupby_chunk,
-    agg=Collect.groupby_aggregate,
+list_aggregation = Aggregation(
+    name="list",
+    chunk=ListAgg.groupby_chunk,
+    agg=ListAgg.groupby_aggregate,
 )
 
 
@@ -41,13 +43,13 @@ def _translate_arg(arg):
     # Helper function to translate args so that
     # they can be processed correctly by upstream
     # dask & dask-expr. Right now, the only necessary
-    # translation is "collect" aggregations.
+    # translation is list aggregations.
     if isinstance(arg, dict):
         return {k: _translate_arg(v) for k, v in arg.items()}
     elif isinstance(arg, list):
         return [_translate_arg(x) for x in arg]
     elif arg in ("collect", "list", list):
-        return collect_aggregation
+        return list_aggregation
     else:
         return arg
 
@@ -84,7 +86,8 @@ def __getitem__(self, key):
         return g
 
     def collect(self, **kwargs):
-        return self._single_agg(Collect, **kwargs)
+        _deprecate_collect()
+        return self._single_agg(ListAgg, **kwargs)
 
     def aggregate(self, arg, **kwargs):
         return super().aggregate(_translate_arg(arg), **kwargs)
@@ -96,7 +99,8 @@ def __init__(self, *args, observed=None, **kwargs):
         super().__init__(*args, observed=observed, **kwargs)
 
     def collect(self, **kwargs):
-        return self._single_agg(Collect, **kwargs)
+        _deprecate_collect()
+        return self._single_agg(ListAgg, **kwargs)
 
     def aggregate(self, arg, **kwargs):
         return super().aggregate(_translate_arg(arg), **kwargs)
diff --git a/python/dask_cudf/dask_cudf/groupby.py b/python/dask_cudf/dask_cudf/groupby.py
index 43ad4f0fee3..ef47ea436c7 100644
--- a/python/dask_cudf/dask_cudf/groupby.py
+++ b/python/dask_cudf/dask_cudf/groupby.py
@@ -15,6 +15,7 @@
 from dask.utils import funcname
 
 import cudf
+from cudf.core.groupby.groupby import _deprecate_collect
 from cudf.utils.nvtx_annotation import _dask_cudf_nvtx_annotate
 
 from dask_cudf.sorting import _deprecate_shuffle_kwarg
@@ -28,7 +29,7 @@
     "sum",
     "min",
     "max",
-    "collect",
+    list,
     "first",
     "last",
 )
@@ -164,9 +165,10 @@ def max(self, split_every=None, split_out=1):
     @_dask_cudf_nvtx_annotate
     @_check_groupby_optimized
     def collect(self, split_every=None, split_out=1):
+        _deprecate_collect()
         return _make_groupby_agg_call(
             self,
-            self._make_groupby_method_aggs("collect"),
+            self._make_groupby_method_aggs(list),
             split_every,
             split_out,
         )
@@ -308,9 +310,10 @@ def max(self, split_every=None, split_out=1):
     @_dask_cudf_nvtx_annotate
     @_check_groupby_optimized
     def collect(self, split_every=None, split_out=1):
+        _deprecate_collect()
         return _make_groupby_agg_call(
             self,
-            {self._slice: "collect"},
+            {self._slice: list},
             split_every,
             split_out,
         )[self._slice]
@@ -472,7 +475,7 @@ def groupby_agg(
 
     This aggregation algorithm only supports the following options
 
-    * "collect"
+    * "list"
     * "count"
     * "first"
     * "last"
@@ -667,8 +670,8 @@ def _redirect_aggs(arg):
         sum: "sum",
         max: "max",
         min: "min",
-        list: "collect",
-        "list": "collect",
+        "collect": list,
+        "list": list,
     }
     if isinstance(arg, dict):
         new_arg = dict()
@@ -704,7 +707,7 @@ def _aggs_optimized(arg, supported: set):
             _global_set = set(arg)
 
         return bool(_global_set.issubset(supported))
-    elif isinstance(arg, str):
+    elif isinstance(arg, (str, type)):
         return arg in supported
     return False
 
@@ -783,6 +786,8 @@ def _tree_node_agg(df, gb_cols, dropna, sort, sep):
         agg = col.split(sep)[-1]
         if agg in ("count", "sum"):
             agg_dict[col] = ["sum"]
+        elif agg == "list":
+            agg_dict[col] = [list]
         elif agg in OPTIMIZED_AGGS:
             agg_dict[col] = [agg]
         else:
@@ -873,8 +878,8 @@ def _finalize_gb_agg(
                 gb.drop(columns=[sum_name], inplace=True)
             if "count" not in agg_list:
                 gb.drop(columns=[count_name], inplace=True)
-        if "collect" in agg_list:
-            collect_name = _make_name((col, "collect"), sep=sep)
+        if list in agg_list:
+            collect_name = _make_name((col, "list"), sep=sep)
             gb[collect_name] = gb[collect_name].list.concat()
 
     # Ensure sorted keys if `sort=True`
diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py
index dc279bfa690..cf916b713b2 100644
--- a/python/dask_cudf/dask_cudf/tests/test_groupby.py
+++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py
@@ -9,6 +9,7 @@
 from dask.utils_test import hlg_layer
 
 import cudf
+from cudf.testing._utils import expect_warning_if
 
 import dask_cudf
 from dask_cudf.groupby import OPTIMIZED_AGGS, _aggs_optimized
@@ -47,7 +48,13 @@ def pdf(request):
     return pdf
 
 
-@pytest.mark.parametrize("aggregation", OPTIMIZED_AGGS)
+# NOTE: We only want to test aggregation "methods" here,
+# so we need to leave out `list`. We also include a
+# deprecation check for "collect".
+@pytest.mark.parametrize(
+    "aggregation",
+    sorted(tuple(set(OPTIMIZED_AGGS) - {list}) + ("collect",)),
+)
 @pytest.mark.parametrize("series", [False, True])
 def test_groupby_basic(series, aggregation, pdf):
     gdf = cudf.DataFrame.from_pandas(pdf)
@@ -62,8 +69,9 @@ def test_groupby_basic(series, aggregation, pdf):
 
     check_dtype = aggregation != "count"
 
-    expect = getattr(gdf_grouped, aggregation)()
-    actual = getattr(ddf_grouped, aggregation)()
+    with expect_warning_if(aggregation == "collect"):
+        expect = getattr(gdf_grouped, aggregation)()
+        actual = getattr(ddf_grouped, aggregation)()
 
     if not QUERY_PLANNING_ON:
         assert_cudf_groupby_layers(actual)

From 766fbb7f0220960320413ba540ca26ef38591ef1 Mon Sep 17 00:00:00 2001
From: "Richard (Rick) Zamora" <rzamora217@gmail.com>
Date: Wed, 22 May 2024 11:43:16 -0500
Subject: [PATCH 245/842] Add temporary dask-cudf workaround for categorical
 sorting (#15801)

Follow up to https://github.com/rapidsai/cudf/pull/15788

Adds a temporary workaround for sorting on categorical columns in 24.06: We convert only the partitioning column to pandas to calculate divisions.

This is related to https://github.com/rapidsai/cudf/issues/11795, but I don't want to "close" that issue until `RepartitionQuantiles` works with cudf-backed data.

Authors:
  - Richard (Rick) Zamora (https://github.com/rjzamora)

Approvers:
  - Charles Blackmon-Luca (https://github.com/charlesbluca)

URL: https://github.com/rapidsai/cudf/pull/15801
---
 .../dask_cudf/dask_cudf/expr/_collection.py   | 19 --------------
 python/dask_cudf/dask_cudf/expr/_expr.py      | 25 +++++++++++++++++++
 python/dask_cudf/dask_cudf/tests/test_sort.py | 23 ++---------------
 3 files changed, 27 insertions(+), 40 deletions(-)

diff --git a/python/dask_cudf/dask_cudf/expr/_collection.py b/python/dask_cudf/dask_cudf/expr/_collection.py
index 926b7cfaf0e..d50dfb24256 100644
--- a/python/dask_cudf/dask_cudf/expr/_collection.py
+++ b/python/dask_cudf/dask_cudf/expr/_collection.py
@@ -15,7 +15,6 @@
 
 from dask import config
 from dask.dataframe.core import is_dataframe_like
-from dask.dataframe.dispatch import is_categorical_dtype
 
 import cudf
 
@@ -82,24 +81,6 @@ def from_dict(cls, *args, **kwargs):
         with config.set({"dataframe.backend": "cudf"}):
             return DXDataFrame.from_dict(*args, **kwargs)
 
-    def sort_values(
-        self,
-        by,
-        **kwargs,
-    ):
-        # Raise if the first column is categorical, otherwise the
-        # upstream divisions logic may produce errors
-        # (See: https://github.com/rapidsai/cudf/issues/11795)
-        check_by = by[0] if isinstance(by, list) else by
-        if is_categorical_dtype(self.dtypes.get(check_by, None)):
-            raise NotImplementedError(
-                "Dask-cudf does not support sorting on categorical "
-                "columns when query-planning is enabled. Please use "
-                "the legacy API for now."
-                f"\n{_LEGACY_WORKAROUND}",
-            )
-        return super().sort_values(by, **kwargs)
-
     def groupby(
         self,
         by,
diff --git a/python/dask_cudf/dask_cudf/expr/_expr.py b/python/dask_cudf/dask_cudf/expr/_expr.py
index ff037b9520c..8fccaccb695 100644
--- a/python/dask_cudf/dask_cudf/expr/_expr.py
+++ b/python/dask_cudf/dask_cudf/expr/_expr.py
@@ -1,11 +1,14 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 import functools
 
+import dask_expr._shuffle as _shuffle_module
+from dask_expr import new_collection
 from dask_expr._cumulative import CumulativeBlockwise
 from dask_expr._expr import Expr, VarColumns
 from dask_expr._reductions import Reduction, Var
 
 from dask.dataframe.core import is_dataframe_like, make_meta, meta_nonempty
+from dask.dataframe.dispatch import is_categorical_dtype
 
 ##
 ## Custom expression patching
@@ -121,3 +124,25 @@ def _patched_var(
 
 
 Expr.var = _patched_var
+
+
+# Temporary work-around for missing cudf + categorical support
+# See: https://github.com/rapidsai/cudf/issues/11795
+# TODO: Fix RepartitionQuantiles and remove this in cudf>24.06
+
+_original_get_divisions = _shuffle_module._get_divisions
+
+
+def _patched_get_divisions(frame, other, *args, **kwargs):
+    # NOTE: The following two lines contains the "patch"
+    # (we simply convert the partitioning column to pandas)
+    if is_categorical_dtype(other._meta.dtype) and hasattr(
+        other.frame._meta, "to_pandas"
+    ):
+        other = new_collection(other).to_backend("pandas")._expr
+
+    # Call "original" function
+    return _original_get_divisions(frame, other, *args, **kwargs)
+
+
+_shuffle_module._get_divisions = _patched_get_divisions
diff --git a/python/dask_cudf/dask_cudf/tests/test_sort.py b/python/dask_cudf/dask_cudf/tests/test_sort.py
index 9d9fe297248..9bbbbc79561 100644
--- a/python/dask_cudf/dask_cudf/tests/test_sort.py
+++ b/python/dask_cudf/dask_cudf/tests/test_sort.py
@@ -10,7 +10,7 @@
 import cudf
 
 import dask_cudf
-from dask_cudf.tests.utils import QUERY_PLANNING_ON, xfail_dask_expr
+from dask_cudf.tests.utils import xfail_dask_expr
 
 
 @pytest.mark.parametrize("ascending", [True, False])
@@ -20,12 +20,7 @@
         "a",
         "b",
         "c",
-        pytest.param(
-            "d",
-            marks=xfail_dask_expr(
-                "Possible segfault when sorting by categorical column.",
-            ),
-        ),
+        "d",
         ["a", "b"],
         ["c", "d"],
     ],
@@ -47,20 +42,6 @@ def test_sort_values(nelem, nparts, by, ascending):
     dd.assert_eq(got, expect, check_index=False)
 
 
-@pytest.mark.parametrize("by", ["b", ["b", "a"]])
-def test_sort_values_categorical_raises(by):
-    df = cudf.DataFrame()
-    df["a"] = np.ascontiguousarray(np.arange(10)[::-1])
-    df["b"] = df["a"].astype("category")
-    ddf = dd.from_pandas(df, npartitions=10)
-
-    if QUERY_PLANNING_ON:
-        with pytest.raises(
-            NotImplementedError, match="sorting on categorical"
-        ):
-            ddf.sort_values(by=by)
-
-
 @pytest.mark.parametrize("ascending", [True, False])
 @pytest.mark.parametrize("by", ["a", "b", ["a", "b"]])
 def test_sort_values_single_partition(by, ascending):

From 24320a18563f1defd8bf7a164adebc066f8c7135 Mon Sep 17 00:00:00 2001
From: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com>
Date: Wed, 22 May 2024 12:01:24 -0500
Subject: [PATCH 246/842] Switch cuIO benchmarks to use pinned-pool host
 allocations by default. (#15805)

Previously, the benchmarks used a non-pooled pinned memory allocator by default, and exposed an option to use an internally-declared pooled pinned allocator.  Now that we have a pooled pinned allocator enabled in cuIO itself, this PR switches to using that as the new default for the benchmarks.

Authors:
  - https://github.com/nvdbaranec

Approvers:
  - Mike Wilson (https://github.com/hyperbolic2346)
  - Nghia Truong (https://github.com/ttnghia)
  - Yunsong Wang (https://github.com/PointKernel)
  - Mark Harris (https://github.com/harrism)

URL: https://github.com/rapidsai/cudf/pull/15805
---
 cpp/benchmarks/fixture/nvbench_fixture.hpp | 19 ++-----------------
 1 file changed, 2 insertions(+), 17 deletions(-)

diff --git a/cpp/benchmarks/fixture/nvbench_fixture.hpp b/cpp/benchmarks/fixture/nvbench_fixture.hpp
index ac0cab4071b..ebcbcb17e98 100644
--- a/cpp/benchmarks/fixture/nvbench_fixture.hpp
+++ b/cpp/benchmarks/fixture/nvbench_fixture.hpp
@@ -45,8 +45,6 @@ static std::string cuio_host_mem_param{
  * Initializes the default memory resource to use the RMM pool device resource.
  */
 struct nvbench_base_fixture {
-  using host_pooled_mr_t = rmm::mr::pool_memory_resource<rmm::mr::pinned_host_memory_resource>;
-
   inline auto make_cuda() { return std::make_shared<rmm::mr::cuda_memory_resource>(); }
 
   inline auto make_pool()
@@ -90,22 +88,10 @@ struct nvbench_base_fixture {
     return *mr;
   }
 
-  inline rmm::host_async_resource_ref make_cuio_host_pinned_pool()
-  {
-    if (!this->host_pooled_mr) {
-      // Don't store in static, as the CUDA context may be destroyed before static destruction
-      this->host_pooled_mr = std::make_shared<host_pooled_mr_t>(
-        std::make_shared<rmm::mr::pinned_host_memory_resource>().get(),
-        size_t{1} * 1024 * 1024 * 1024);
-    }
-
-    return *this->host_pooled_mr;
-  }
-
   inline rmm::host_async_resource_ref create_cuio_host_memory_resource(std::string const& mode)
   {
     if (mode == "pinned") return make_cuio_host_pinned();
-    if (mode == "pinned_pool") return make_cuio_host_pinned_pool();
+    if (mode == "pinned_pool") return cudf::io::get_host_memory_resource();
     CUDF_FAIL("Unknown cuio_host_mem parameter: " + mode + "\nExpecting: pinned or pinned_pool");
   }
 
@@ -139,8 +125,7 @@ struct nvbench_base_fixture {
   std::shared_ptr<rmm::mr::device_memory_resource> mr;
   std::string rmm_mode{"pool"};
 
-  std::shared_ptr<host_pooled_mr_t> host_pooled_mr;
-  std::string cuio_host_mode{"pinned"};
+  std::string cuio_host_mode{"pinned_pool"};
 };
 
 }  // namespace cudf

From ad56bc30c53745a43fca0852e4a46e74db988039 Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Wed, 22 May 2024 11:48:15 -0700
Subject: [PATCH 247/842] Raise FileNotFoundError when a literal JSON string
 that looks like a json filename is passed (#15806)

- closes #13026

Authors:
  - Thomas Li (https://github.com/lithomas1)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15806
---
 python/cudf/cudf/tests/test_json.py |  8 ++++++++
 python/cudf/cudf/utils/ioutils.py   | 24 +++++++++++++++++++++++-
 2 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py
index 3033a3e75e3..51287fe26a0 100644
--- a/python/cudf/cudf/tests/test_json.py
+++ b/python/cudf/cudf/tests/test_json.py
@@ -355,6 +355,14 @@ def test_json_lines_basic(json_input, engine):
         np.testing.assert_array_equal(pd_df[pd_col], cu_df[cu_col].to_numpy())
 
 
+@pytest.mark.filterwarnings("ignore:Using CPU")
+@pytest.mark.parametrize("engine", ["auto", "cudf", "pandas"])
+def test_nonexistent_json_correct_error(engine):
+    json_input = "doesnotexist.json"
+    with pytest.raises(FileNotFoundError):
+        cudf.read_json(json_input, engine=engine)
+
+
 @pytest.mark.skipif(
     PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
     reason="warning not present in older pandas versions",
diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
index 18e81078587..dd9b44c5a53 100644
--- a/python/cudf/cudf/utils/ioutils.py
+++ b/python/cudf/cudf/utils/ioutils.py
@@ -1718,10 +1718,32 @@ def get_reader_filepath_or_buffer(
         if _is_local_filesystem(fs):
             # Doing this as `read_json` accepts a json string
             # path_or_data need not be a filepath like string
+
+            # helper for checking if raw text looks like a json filename
+            compression_extensions = [
+                ".tar",
+                ".tar.gz",
+                ".tar.bz2",
+                ".tar.xz",
+                ".gz",
+                ".bz2",
+                ".zip",
+                ".xz",
+                ".zst",
+                "",
+            ]
+
             if len(paths):
                 if fs.exists(paths[0]):
                     path_or_data = paths if len(paths) > 1 else paths[0]
-                elif not allow_raw_text_input:
+
+                # raise FileNotFound if path looks like json
+                # following pandas
+                # see
+                # https://github.com/pandas-dev/pandas/pull/46718/files#diff-472ce5fe087e67387942e1e1c409a5bc58dde9eb8a2db6877f1a45ae4974f694R724-R729
+                elif not allow_raw_text_input or paths[0].lower().endswith(
+                    tuple(f".json{c}" for c in compression_extensions)
+                ):
                     raise FileNotFoundError(
                         f"{path_or_data} could not be resolved to any files"
                     )

From f626ece7e3639c53096bd06ec40cf48ccd807f93 Mon Sep 17 00:00:00 2001
From: William Kaiser <wkaisertexas@gmail.com>
Date: Wed, 22 May 2024 15:16:46 -0400
Subject: [PATCH 248/842] Simplified README Examples (#15338)

Pandas will automatically perform fetches for paths detected as URLs. No need for `requests` and `StringIO`. A marginal change, but this makes the docs a bit simpler.

Authors:
  - William Kaiser (https://github.com/wkaisertexas)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15338
---
 README.md | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/README.md b/README.md
index 205e16ea0e5..75ee405bc1f 100644
--- a/README.md
+++ b/README.md
@@ -14,13 +14,8 @@ You can import `cudf` directly and use it like `pandas`:
 
 ```python
 import cudf
-import requests
-from io import StringIO
 
-url = "https://github.com/plotly/datasets/raw/master/tips.csv"
-content = requests.get(url).content.decode("utf-8")
-
-tips_df = cudf.read_csv(StringIO(content))
+tips_df = cudf.read_csv("https://github.com/plotly/datasets/raw/master/tips.csv")
 tips_df["tip_percentage"] = tips_df["tip"] / tips_df["total_bill"] * 100
 
 # display average tip by dining party size
@@ -36,13 +31,8 @@ supported operations and falling back to pandas when needed:
 %load_ext cudf.pandas  # pandas operations now use the GPU!
 
 import pandas as pd
-import requests
-from io import StringIO
-
-url = "https://github.com/plotly/datasets/raw/master/tips.csv"
-content = requests.get(url).content.decode("utf-8")
 
-tips_df = pd.read_csv(StringIO(content))
+tips_df = pd.read_csv("https://github.com/plotly/datasets/raw/master/tips.csv")
 tips_df["tip_percentage"] = tips_df["tip"] / tips_df["total_bill"] * 100
 
 # display average tip by dining party size

From 45dc595945301f4076e66ec54a6e4de0b539cfb0 Mon Sep 17 00:00:00 2001
From: "Richard (Rick) Zamora" <rzamora217@gmail.com>
Date: Wed, 22 May 2024 14:17:05 -0500
Subject: [PATCH 249/842] Deprecate `divisions='quantile'` support in
 `set_index` (#15804)

Using `set_index(..., divisions='quantile')` is not supported by dask-cudf when query-planning is enabled. However, this option doesn't seem to serve a purpose anymore. This PR deprecates the option in general.

Authors:
  - Richard (Rick) Zamora (https://github.com/rjzamora)
  - Charles Blackmon-Luca (https://github.com/charlesbluca)

Approvers:
  - Charles Blackmon-Luca (https://github.com/charlesbluca)

URL: https://github.com/rapidsai/cudf/pull/15804
---
 python/dask_cudf/dask_cudf/core.py             |  8 ++++++++
 python/dask_cudf/dask_cudf/expr/_collection.py | 18 ++++++++++++++++++
 python/dask_cudf/dask_cudf/tests/test_core.py  |  4 ++--
 3 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py
index 3f0cfeb6d2c..3bd455a3a57 100644
--- a/python/dask_cudf/dask_cudf/core.py
+++ b/python/dask_cudf/dask_cudf/core.py
@@ -167,6 +167,14 @@ def set_index(
         pre_sorted = sorted
         del sorted
 
+        if divisions == "quantile":
+            warnings.warn(
+                "Using divisions='quantile' is now deprecated. "
+                "Please raise an issue on github if you believe "
+                "this feature is necessary.",
+                FutureWarning,
+            )
+
         if (
             divisions == "quantile"
             or isinstance(divisions, (cudf.DataFrame, cudf.Series))
diff --git a/python/dask_cudf/dask_cudf/expr/_collection.py b/python/dask_cudf/dask_cudf/expr/_collection.py
index d50dfb24256..f60e4ff81ef 100644
--- a/python/dask_cudf/dask_cudf/expr/_collection.py
+++ b/python/dask_cudf/dask_cudf/expr/_collection.py
@@ -81,6 +81,24 @@ def from_dict(cls, *args, **kwargs):
         with config.set({"dataframe.backend": "cudf"}):
             return DXDataFrame.from_dict(*args, **kwargs)
 
+    def set_index(
+        self,
+        *args,
+        divisions=None,
+        **kwargs,
+    ):
+        if divisions == "quantile":
+            divisions = None
+            warnings.warn(
+                "Ignoring divisions='quantile'. This option is now "
+                "deprecated. Please use the legacy API and raise an "
+                "issue on github if this feature is necessary."
+                f"\n{_LEGACY_WORKAROUND}",
+                FutureWarning,
+            )
+
+        return super().set_index(*args, divisions=divisions, **kwargs)
+
     def groupby(
         self,
         by,
diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py
index 18a9e3b496f..7f8a619ae22 100644
--- a/python/dask_cudf/dask_cudf/tests/test_core.py
+++ b/python/dask_cudf/dask_cudf/tests/test_core.py
@@ -231,7 +231,6 @@ def test_set_index(nelem):
         dd.assert_eq(expect, got, check_index=False, check_divisions=False)
 
 
-@xfail_dask_expr("missing support for divisions='quantile'")
 @pytest.mark.parametrize("by", ["a", "b"])
 @pytest.mark.parametrize("nelem", [10, 500])
 @pytest.mark.parametrize("nparts", [1, 10])
@@ -241,7 +240,8 @@ def test_set_index_quantile(nelem, nparts, by):
     df["b"] = np.random.choice(cudf.datasets.names, size=nelem)
     ddf = dd.from_pandas(df, npartitions=nparts)
 
-    got = ddf.set_index(by, divisions="quantile")
+    with pytest.warns(FutureWarning, match="deprecated"):
+        got = ddf.set_index(by, divisions="quantile")
     expect = df.sort_values(by=by).set_index(by)
     dd.assert_eq(got, expect)
 

From 57444ed421fbddeacf4f9919ff53c1225eb977dd Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 22 May 2024 10:04:07 -1000
Subject: [PATCH 250/842] Access `self.index` instead of `self._index` where
 possible (#15781)

Since `index` is defined as

```python
@property
def index(self):
    return self._index
```

Get and set to `self.index` when possible. Setting to `self.index` ensures that we may not be creating an invalid `IndexedFrame` with a `len(index) != len(columns)`.

There are times when still setting `self._index` was necessary because some data was being swapped "inplace" and validation needed to be avoided. (Hoping to avoid this pattern in the future)

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15781
---
 python/cudf/cudf/core/dataframe.py          | 216 +++++++++++---------
 python/cudf/cudf/core/groupby/groupby.py    |  10 +-
 python/cudf/cudf/core/indexed_frame.py      | 167 ++++++++-------
 python/cudf/cudf/core/join/_join_helpers.py |   4 +-
 python/cudf/cudf/core/join/join.py          |  12 +-
 python/cudf/cudf/core/reshape.py            |  10 +-
 python/cudf/cudf/core/series.py             |  56 ++---
 python/cudf/cudf/tests/test_dlpack.py       |   9 +-
 8 files changed, 253 insertions(+), 231 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 0b7c40ff516..9f3f756a1e7 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -254,7 +254,7 @@ def _getitem_tuple_arg(self, arg):
         # Step 1: Gather columns
         if isinstance(arg, tuple):
             columns_df = self._frame._get_columns_by_label(arg[1])
-            columns_df._index = self._frame._index
+            columns_df.index = self._frame.index
         else:
             columns_df = self._frame
 
@@ -545,7 +545,7 @@ def __getitem__(self, arg):
     @_cudf_nvtx_annotate
     def _setitem_tuple_arg(self, key, value):
         columns_df = self._frame._from_data(
-            self._frame._data.select_by_index(key[1]), self._frame._index
+            self._frame._data.select_by_index(key[1]), self._frame.index
         )
 
         if is_scalar(value):
@@ -710,11 +710,11 @@ def __init__(
             if index is not None:
                 if not data.index.equals(index):
                     data = data.reindex(index)
-                    index = data._index
+                    index = data.index
                 else:
                     index = as_index(index)
             else:
-                index = data._index
+                index = data.index
 
             self._index = index
 
@@ -1176,7 +1176,7 @@ def _constructor_expanddim(self):
     def serialize(self):
         header, frames = super().serialize()
 
-        header["index"], index_frames = self._index.serialize()
+        header["index"], index_frames = self.index.serialize()
         header["index_frame_count"] = len(index_frames)
         # For backwards compatibility with older versions of cuDF, index
         # columns are placed before data columns.
@@ -1193,7 +1193,7 @@ def deserialize(cls, header, frames):
 
         idx_typ = pickle.loads(header["index"]["type-serialized"])
         index = idx_typ.deserialize(header["index"], frames[:index_nframes])
-        obj._index = index
+        obj.index = index
 
         return obj
 
@@ -1396,27 +1396,27 @@ def __setitem__(self, arg, value):
             else:
                 if arg in self._data:
                     if not is_scalar(value) and len(self) == 0:
+                        value = column.as_column(value)
+                        length = len(value)
+                        new_columns = (
+                            value
+                            if key == arg
+                            else column.column_empty_like(
+                                col, masked=True, newsize=length
+                            )
+                            for key, col in self._data.items()
+                        )
+                        self._data = self._data._from_columns_like_self(
+                            new_columns, verify=False
+                        )
                         if isinstance(value, (pd.Series, Series)):
                             self._index = as_index(value.index)
                         elif len(value) > 0:
-                            self._index = RangeIndex(start=0, stop=len(value))
-                        value = column.as_column(value)
-                        new_data = self._data.__class__()
-                        for key in self._data:
-                            if key == arg:
-                                new_data[key] = value
-                            else:
-                                new_data[key] = column.column_empty_like(
-                                    self._data[key],
-                                    masked=True,
-                                    newsize=len(value),
-                                )
-
-                        self._data = new_data
+                            self._index = RangeIndex(length)
                         return
                     elif isinstance(value, (pd.Series, Series)):
                         value = Series(value)._align_to_index(
-                            self._index,
+                            self.index,
                             how="right",
                             sort=False,
                             allow_non_unique=True,
@@ -1489,7 +1489,7 @@ def memory_usage(self, index=True, deep=False):
         mem_usage = [col.memory_usage for col in self._data.columns]
         names = [str(name) for name in self._data.names]
         if index:
-            mem_usage.append(self._index.memory_usage())
+            mem_usage.append(self.index.memory_usage())
             names.append("Index")
         return Series._from_data(
             data={None: as_column(mem_usage)},
@@ -1698,7 +1698,7 @@ def _concat(
                 []
                 if are_all_range_index
                 or (ignore_index and not empty_has_index)
-                else list(f._index._data.columns)
+                else list(f.index._data.columns)
             )
             + [f._data[name] if name in f._data else None for name in names]
             for f in objs
@@ -1761,11 +1761,9 @@ def _concat(
         # least one input frame has an index, assign a new RangeIndex
         # to the result frame.
         if empty_has_index and num_empty_input_frames == len(objs):
-            out._index = cudf.RangeIndex(result_index_length)
+            out.index = cudf.RangeIndex(result_index_length)
         elif are_all_range_index and not ignore_index:
-            out._index = cudf.core.index.Index._concat(
-                [o._index for o in objs]
-            )
+            out.index = cudf.core.index.Index._concat([o.index for o in objs])
 
         # Reassign the categories for any categorical table cols
         _reassign_categories(
@@ -1773,14 +1771,14 @@ def _concat(
         )
 
         # Reassign the categories for any categorical index cols
-        if not isinstance(out._index, cudf.RangeIndex):
+        if not isinstance(out.index, cudf.RangeIndex):
             _reassign_categories(
                 categories,
-                out._index._data,
+                out.index._data,
                 indices[:first_data_column_position],
             )
-            if not isinstance(out._index, MultiIndex) and isinstance(
-                out._index.dtype, cudf.CategoricalDtype
+            if not isinstance(out.index, MultiIndex) and isinstance(
+                out.index.dtype, cudf.CategoricalDtype
             ):
                 out = out.set_index(
                     cudf.core.index.as_index(out.index._values)
@@ -1796,8 +1794,8 @@ def _concat(
         else:
             out.columns = names
         if not ignore_index:
-            out._index.name = objs[0]._index.name
-            out._index.names = objs[0]._index.names
+            out.index.name = objs[0].index.name
+            out.index.names = objs[0].index.names
 
         return out
 
@@ -1965,7 +1963,7 @@ def _get_renderable_dataframe(self):
                 output = cudf.concat([upper, lower])
 
         output = self._clean_nulls_from_dataframe(output)
-        output._index = output._index._clean_nulls_from_index()
+        output.index = output.index._clean_nulls_from_index()
 
         return output
 
@@ -2036,7 +2034,7 @@ def _make_operands_and_index_for_binop(
         bool,
     ]:
         lhs, rhs = self._data, other
-        index = self._index
+        index = self.index
         fill_requires_key = False
         left_default: Any = False
         equal_columns = False
@@ -2081,7 +2079,7 @@ def _make_operands_and_index_for_binop(
                     "Can only compare identically-labeled DataFrame objects"
                 )
             new_lhs, new_rhs = _align_indices(self, other)
-            index = new_lhs._index
+            index = new_lhs.index
             lhs, rhs = new_lhs._data, new_rhs._data
             fill_requires_key = True
             # For DataFrame-DataFrame ops, always default to operating against
@@ -2455,7 +2453,7 @@ def scatter_by_map(
                 )
 
         partitioned_columns, output_offsets = libcudf.partitioning.partition(
-            [*(self._index._columns if keep_index else ()), *self._columns],
+            [*(self.index._columns if keep_index else ()), *self._columns],
             map_index,
             map_size,
         )
@@ -3248,23 +3246,28 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True):
         if len(self) == 0:
             if isinstance(value, (pd.Series, Series)):
                 if not ignore_index:
-                    self._index = as_index(value.index)
-            elif len(value) > 0:
-                self._index = RangeIndex(start=0, stop=len(value))
-                new_data = self._data.__class__()
+                    self.index = as_index(value.index)
+            elif (length := len(value)) > 0:
                 if num_cols != 0:
-                    for col_name in self._data:
-                        new_data[col_name] = column.column_empty_like(
-                            self._data[col_name],
-                            masked=True,
-                            newsize=len(value),
-                        )
-                self._data = new_data
+                    ca = self._data._from_columns_like_self(
+                        (
+                            column.column_empty_like(
+                                col_data, masked=True, newsize=length
+                            )
+                            for col_data in self._data.values()
+                        ),
+                        verify=False,
+                    )
+                else:
+                    ca = ColumnAccessor({})
+                self._data = ca
+                self._index = RangeIndex(length)
+
         elif isinstance(value, (pd.Series, Series)):
             value = Series(value, nan_as_null=nan_as_null)
             if not ignore_index:
                 value = value._align_to_index(
-                    self._index, how="right", sort=False
+                    self.index, how="right", sort=False
                 )
 
         value = column.as_column(value, nan_as_null=nan_as_null)
@@ -3293,7 +3296,7 @@ def axes(self):
             Index(['key', 'k2', 'val', 'temp'], dtype='object')]
 
         """
-        return [self._index, self._data.to_pandas_index()]
+        return [self.index, self._data.to_pandas_index()]
 
     def diff(self, periods=1, axis=0):
         """
@@ -4853,8 +4856,8 @@ def partition_by_hash(self, columns, nparts, keep_index=True):
         """
         key_indices = [self._column_names.index(k) for k in columns]
         if keep_index:
-            cols = [*self._index._columns, *self._columns]
-            key_indices = [i + len(self._index._columns) for i in key_indices]
+            cols = [*self.index._columns, *self._columns]
+            key_indices = [i + len(self.index._columns) for i in key_indices]
         else:
             cols = [*self._columns]
 
@@ -5019,13 +5022,13 @@ def info(
 
         lines = [str(type(self))]
 
-        index_name = type(self._index).__name__
-        if len(self._index) > 0:
-            entries_summary = f", {self._index[0]} to {self._index[-1]}"
+        index_name = type(self.index).__name__
+        if len(self.index) > 0:
+            entries_summary = f", {self.index[0]} to {self.index[-1]}"
         else:
             entries_summary = ""
         index_summary = (
-            f"{index_name}: {len(self._index)} entries{entries_summary}"
+            f"{index_name}: {len(self.index)} entries{entries_summary}"
         )
         lines.append(index_summary)
 
@@ -5629,7 +5632,7 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False):
         num_cols = len(data[0])
 
         if columns is None and data.dtype.names is None:
-            names = [i for i in range(num_cols)]
+            names = range(num_cols)
 
         elif data.dtype.names is not None:
             names = data.dtype.names
@@ -5642,28 +5645,43 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False):
                 )
             names = columns
 
-        df = DataFrame()
-
         if data.ndim == 2:
-            for i, k in enumerate(names):
-                df._data[k] = column.as_column(
-                    data[:, i], nan_as_null=nan_as_null
-                )
+            ca_data = {
+                k: column.as_column(data[:, i], nan_as_null=nan_as_null)
+                for i, k in enumerate(names)
+            }
         elif data.ndim == 1:
-            for k in names:
-                df._data[k] = column.as_column(
-                    data[k], nan_as_null=nan_as_null
-                )
+            ca_data = {
+                name: column.as_column(data[name], nan_as_null=nan_as_null)
+                for name in names
+            }
 
-        if index is None:
-            df._index = RangeIndex(start=0, stop=len(data))
-        elif is_scalar(index):
-            df._index = RangeIndex(start=0, stop=len(data))
-            df = df.set_index(index)
+        if not is_scalar(index):
+            new_index = as_index(index)
+        else:
+            new_index = None
+
+        if isinstance(columns, (pd.Index, cudf.Index)):
+            level_names = tuple(columns.names)
         else:
-            df._index = as_index(index)
-        if isinstance(columns, pd.Index):
-            df._data._level_names = tuple(columns.names)
+            level_names = None
+
+        df = cls._from_data(
+            ColumnAccessor(
+                data=ca_data,
+                multiindex=isinstance(
+                    columns, (pd.MultiIndex, cudf.MultiIndex)
+                ),
+                rangeindex=isinstance(
+                    columns, (range, pd.RangeIndex, cudf.RangeIndex)
+                ),
+                level_names=level_names,
+                label_dtype=getattr(columns, "dtype", None),
+            ),
+            index=new_index,
+        )
+        if is_scalar(index) and index is not None:
+            df = df.set_index(index)
         return df
 
     @classmethod
@@ -5712,26 +5730,38 @@ def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False):
                 raise ValueError("Duplicate column names are not allowed")
             names = columns
 
-        df = cls()
         if data.ndim == 2:
-            for i, k in enumerate(names):
-                df._data[k] = column.as_column(
-                    data[:, i], nan_as_null=nan_as_null
-                )
+            ca_data = {
+                k: column.as_column(data[:, i], nan_as_null=nan_as_null)
+                for i, k in enumerate(names)
+            }
         elif data.ndim == 1:
-            df._data[names[0]] = column.as_column(
-                data, nan_as_null=nan_as_null
-            )
-        if isinstance(columns, pd.Index):
-            df._data._level_names = tuple(columns.names)
-        if isinstance(columns, (range, pd.RangeIndex, cudf.RangeIndex)):
-            df._data.rangeindex = True
+            ca_data = {
+                names[0]: column.as_column(data, nan_as_null=nan_as_null)
+            }
 
-        if index is None:
-            df._index = RangeIndex(start=0, stop=len(data))
+        if index is not None:
+            index = as_index(index)
+
+        if isinstance(columns, (pd.Index, cudf.Index)):
+            level_names = tuple(columns.names)
         else:
-            df._index = as_index(index)
-        return df
+            level_names = None
+
+        return cls._from_data(
+            ColumnAccessor(
+                data=ca_data,
+                multiindex=isinstance(
+                    columns, (pd.MultiIndex, cudf.MultiIndex)
+                ),
+                rangeindex=isinstance(
+                    columns, (range, pd.RangeIndex, cudf.RangeIndex)
+                ),
+                level_names=level_names,
+                label_dtype=getattr(columns, "dtype", None),
+            ),
+            index=index,
+        )
 
     @_cudf_nvtx_annotate
     def interpolate(
@@ -7006,7 +7036,7 @@ def stack(self, level=-1, dropna=no_default, future_stack=False):
 
         # Assemble the final index
         new_index_columns = [*repeated_index._columns, *tiled_index]
-        index_names = [*self._index.names, *unique_named_levels.names]
+        index_names = [*self.index.names, *unique_named_levels.names]
         new_index = MultiIndex.from_frame(
             DataFrame._from_data(
                 dict(zip(range(0, len(new_index_columns)), new_index_columns))
@@ -7797,7 +7827,7 @@ def value_counts(
             result = result / result._column.sum()
         # Pandas always returns MultiIndex even if only one column.
         if not isinstance(result.index, MultiIndex):
-            result.index = MultiIndex._from_data(result._index._data)
+            result.index = MultiIndex._from_data(result.index._data)
         result.name = "proportion" if normalize else "count"
         return result
 
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index bf24864c29d..3e7a1ee6026 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -949,7 +949,7 @@ def nth(self, n):
 
         result = result[sizes > n]
 
-        result._index = self.obj.index.take(
+        result.index = self.obj.index.take(
             result._data["__groupbynth_order__"]
         )
         del result._data["__groupbynth_order__"]
@@ -1038,7 +1038,7 @@ def ngroup(self, ascending=True):
         if has_null_group:
             group_ids.iloc[-1] = cudf.NA
 
-        group_ids._index = index
+        group_ids.index = index
         return self._broadcast(group_ids)
 
     def sample(
@@ -1208,7 +1208,7 @@ def deserialize(cls, header, frames):
 
     def _grouped(self, *, include_groups: bool = True):
         offsets, grouped_key_cols, grouped_value_cols = self._groupby.groups(
-            [*self.obj._index._columns, *self.obj._columns]
+            [*self.obj.index._columns, *self.obj._columns]
         )
         grouped_keys = cudf.core.index._index_from_data(
             dict(enumerate(grouped_key_cols))
@@ -2849,8 +2849,8 @@ def _handle_label(self, by):
             self._key_columns.append(self._obj._data[by])
         except KeyError as e:
             # `by` can be index name(label) too.
-            if by in self._obj._index.names:
-                self._key_columns.append(self._obj._index._data[by])
+            if by in self._obj.index.names:
+                self._key_columns.append(self._obj.index._data[by])
             else:
                 raise e
         self.names.append(by)
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 7aae0d1729e..a166c256689 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -289,11 +289,11 @@ def __init__(self, data=None, index=None):
     @property
     def _num_rows(self) -> int:
         # Important to use the index because the data may be empty.
-        return len(self._index)
+        return len(self.index)
 
     @property
     def _index_names(self) -> Tuple[Any, ...]:  # TODO: Tuple[str]?
-        return self._index._data.names
+        return self.index._data.names
 
     @classmethod
     def _from_data(
@@ -307,7 +307,7 @@ def _from_data(
 
     @_cudf_nvtx_annotate
     def _from_data_like_self(self, data: MutableMapping):
-        out = self._from_data(data, self._index)
+        out = self._from_data(data, self.index)
         out._data._level_names = self._data._level_names
         return out
 
@@ -350,7 +350,7 @@ def _from_columns_like_self(
         frame = self.__class__._from_data(data)
 
         if index is not None:
-            frame._index = index
+            frame.index = index
         return frame._copy_type_metadata(
             self,
             include_index=bool(index_names),
@@ -367,7 +367,7 @@ def _mimic_inplace(
         self, result: Self, inplace: bool = False
     ) -> Optional[Self]:
         if inplace:
-            self._index = result._index
+            self._index = result.index
         return super()._mimic_inplace(result, inplace)
 
     # Scans
@@ -442,15 +442,15 @@ def _scan(self, op, axis=None, skipna=True):
                 # pandas returns an int64 dtype for all int or bool dtypes.
                 result_col = result_col.astype(np.int64)
             results[name] = getattr(result_col, op)()
-        return self._from_data(results, self._index)
+        return self._from_data(results, self.index)
 
     def _check_data_index_length_match(self) -> None:
         # Validate that the number of rows in the data matches the index if the
         # data is not empty. This is a helper for the constructor.
-        if self._data.nrows > 0 and self._data.nrows != len(self._index):
+        if self._data.nrows > 0 and self._data.nrows != len(self.index):
             raise ValueError(
                 f"Length of values ({self._data.nrows}) does not "
-                f"match length of index ({len(self._index)})"
+                f"match length of index ({len(self.index)})"
             )
 
     @property
@@ -618,14 +618,14 @@ def copy(self, deep: bool = True) -> Self:
         return self._from_data(
             self._data.copy(deep=deep),
             # Indexes are immutable so copies can always be shallow.
-            self._index.copy(deep=False),
+            self.index.copy(deep=False),
         )
 
     @_cudf_nvtx_annotate
     def equals(self, other):  # noqa: D102
         if not super().equals(other):
             return False
-        return self._index.equals(other._index)
+        return self.index.equals(other.index)
 
     @property
     def index(self):
@@ -908,7 +908,7 @@ def replace(
         else:
             copy_data = self._data.copy(deep=True)
 
-        result = self._from_data(copy_data, self._index)
+        result = self._from_data(copy_data, self.index)
 
         return self._mimic_inplace(result, inplace=inplace)
 
@@ -1033,7 +1033,7 @@ def clip(self, lower=None, upper=None, inplace=False, axis=1):
             name: col.clip(lower[i], upper[i])
             for i, (name, col) in enumerate(self._data.items())
         }
-        output = self._from_data(data, self._index)
+        output = self._from_data(data, self.index)
         output._copy_type_metadata(self, include_index=False)
         return self._mimic_inplace(output, inplace=inplace)
 
@@ -1935,29 +1935,27 @@ def _copy_type_metadata(
         super()._copy_type_metadata(other, override_dtypes=override_dtypes)
         if (
             include_index
-            and self._index is not None
-            and other._index is not None
+            and self.index is not None
+            and other.index is not None
         ):
-            self._index._copy_type_metadata(other._index)
-            # When other._index is a CategoricalIndex, the current index
+            self.index._copy_type_metadata(other.index)
+            # When other.index is a CategoricalIndex, the current index
             # will be a NumericalIndex with an underlying CategoricalColumn
             # (the above _copy_type_metadata call will have converted the
             # column). Calling cudf.Index on that column generates the
             # appropriate index.
             if isinstance(
-                other._index, cudf.core.index.CategoricalIndex
-            ) and not isinstance(
-                self._index, cudf.core.index.CategoricalIndex
-            ):
-                self._index = cudf.Index(
-                    cast("cudf.Index", self._index)._column,
-                    name=self._index.name,
+                other.index, cudf.core.index.CategoricalIndex
+            ) and not isinstance(self.index, cudf.core.index.CategoricalIndex):
+                self.index = cudf.Index(
+                    cast("cudf.Index", self.index)._column,
+                    name=self.index.name,
                 )
-            elif isinstance(other._index, cudf.MultiIndex) and not isinstance(
-                self._index, cudf.MultiIndex
+            elif isinstance(other.index, cudf.MultiIndex) and not isinstance(
+                self.index, cudf.MultiIndex
             ):
-                self._index = cudf.MultiIndex._from_data(
-                    self._index._data, name=self._index.name
+                self.index = cudf.MultiIndex._from_data(
+                    self.index._data, name=self.index.name
                 )
         return self
 
@@ -2017,8 +2015,8 @@ def interpolate(
 
         data = self
 
-        if not isinstance(data._index, cudf.RangeIndex):
-            perm_sort = data._index.argsort()
+        if not isinstance(data.index, cudf.RangeIndex):
+            perm_sort = data.index.argsort()
             data = data._gather(
                 GatherMap.from_column_unchecked(
                     cudf.core.column.as_column(perm_sort),
@@ -2040,13 +2038,13 @@ def interpolate(
                 col = col.astype("float64").fillna(np.nan)
 
             # Interpolation methods may or may not need the index
-            columns[colname] = interpolator(col, index=data._index)
+            columns[colname] = interpolator(col, index=data.index)
 
-        result = self._from_data(columns, index=data._index)
+        result = self._from_data(columns, index=data.index)
 
         return (
             result
-            if isinstance(data._index, cudf.RangeIndex)
+            if isinstance(data.index, cudf.RangeIndex)
             # TODO: This should be a scatter, avoiding an argsort.
             else result._gather(
                 GatherMap.from_column_unchecked(
@@ -2070,7 +2068,7 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None):
             col.shift(periods, fill_value) for col in self._columns
         )
         return self.__class__._from_data(
-            zip(self._column_names, data_columns), self._index
+            zip(self._column_names, data_columns), self.index
         )
 
     @_cudf_nvtx_annotate
@@ -2254,7 +2252,7 @@ def truncate(self, before=None, after=None, axis=0, copy=True):
         if not copy:
             raise ValueError("Truncating with copy=False is not supported.")
         axis = self._get_axis_from_axis_arg(axis)
-        ax = self._index if axis == 0 else self._data.to_pandas_index()
+        ax = self.index if axis == 0 else self._data.to_pandas_index()
 
         if not ax.is_monotonic_increasing and not ax.is_monotonic_decreasing:
             raise ValueError("truncate requires a sorted index")
@@ -2585,7 +2583,7 @@ def scale(self):
         vmin = self.min()
         vmax = self.max()
         scaled = (self - vmin) / (vmax - vmin)
-        scaled._index = self._index.copy(deep=False)
+        scaled.index = self.index.copy(deep=False)
         return scaled
 
     @_cudf_nvtx_annotate
@@ -2919,14 +2917,14 @@ def _gather(
             raise IndexError("Gather map is out of bounds")
         return self._from_columns_like_self(
             libcudf.copying.gather(
-                list(self._index._columns + self._columns)
+                list(self.index._columns + self._columns)
                 if keep_index
                 else list(self._columns),
                 gather_map.column,
                 nullify=gather_map.nullify,
             ),
             self._column_names,
-            self._index.names if keep_index else None,
+            self.index.names if keep_index else None,
         )
 
     def _slice(self, arg: slice, keep_index: bool = True) -> Self:
@@ -3000,7 +2998,7 @@ def _slice(self, arg: slice, keep_index: bool = True) -> Self:
 
         columns_to_slice = [
             *(
-                self._index._data.columns
+                self.index._data.columns
                 if keep_index and not has_range_index
                 else []
             ),
@@ -3009,7 +3007,7 @@ def _slice(self, arg: slice, keep_index: bool = True) -> Self:
         result = self._from_columns_like_self(
             libcudf.copying.columns_slice(columns_to_slice, [start, stop])[0],
             self._column_names,
-            None if has_range_index or not keep_index else self._index.names,
+            None if has_range_index or not keep_index else self.index.names,
         )
         result._data.label_dtype = self._data.label_dtype
         result._data.rangeindex = self._data.rangeindex
@@ -3028,7 +3026,7 @@ def _positions_from_column_names(
         indices returned corresponds to the column order in this Frame.
         """
         num_index_columns = (
-            len(self._index._data) if offset_by_index_columns else 0
+            len(self.index._data) if offset_by_index_columns else 0
         )
         return [
             i + num_index_columns
@@ -3073,13 +3071,13 @@ def drop_duplicates(
             libcudf.stream_compaction.drop_duplicates(
                 list(self._columns)
                 if ignore_index
-                else list(self._index._columns + self._columns),
+                else list(self.index._columns + self._columns),
                 keys=keys,
                 keep=keep,
                 nulls_are_equal=nulls_are_equal,
             ),
             self._column_names,
-            self._index.names if not ignore_index else None,
+            self.index.names if not ignore_index else None,
         )
 
     @_cudf_nvtx_annotate
@@ -3197,12 +3195,12 @@ def _empty_like(self, keep_index=True) -> Self:
         result = self._from_columns_like_self(
             libcudf.copying.columns_empty_like(
                 [
-                    *(self._index._data.columns if keep_index else ()),
+                    *(self.index._data.columns if keep_index else ()),
                     *self._columns,
                 ]
             ),
             self._column_names,
-            self._index.names if keep_index else None,
+            self.index.names if keep_index else None,
         )
         result._data.label_dtype = self._data.label_dtype
         result._data.rangeindex = self._data.rangeindex
@@ -3214,7 +3212,7 @@ def _split(self, splits, keep_index=True):
 
         columns_split = libcudf.copying.columns_split(
             [
-                *(self._index._data.columns if keep_index else []),
+                *(self.index._data.columns if keep_index else []),
                 *self._columns,
             ],
             splits,
@@ -3224,7 +3222,7 @@ def _split(self, splits, keep_index=True):
             self._from_columns_like_self(
                 columns_split[i],
                 self._column_names,
-                self._index.names if keep_index else None,
+                self.index.names if keep_index else None,
             )
             for i in range(len(splits) + 1)
         ]
@@ -3244,12 +3242,12 @@ def fillna(
                 "Use obj.ffill() or obj.bfill() instead.",
                 FutureWarning,
             )
-        old_index = self._index
+        old_index = self.index
         ret = super().fillna(value, method, axis, inplace, limit)
         if inplace:
-            self._index = old_index
+            self.index = old_index
         else:
-            ret._index = old_index
+            ret.index = old_index
         return ret
 
     @_cudf_nvtx_annotate
@@ -3479,7 +3477,7 @@ def _apply(self, func, kernel_getter, *args, **kwargs):
         col = _post_process_output_col(ans_col, retty)
 
         col.set_base_mask(libcudf.transform.bools_to_mask(ans_mask))
-        result = cudf.Series._from_data({None: col}, self._index)
+        result = cudf.Series._from_data({None: col}, self.index)
 
         return result
 
@@ -3706,12 +3704,12 @@ def _reindex(
 
         df = self
         if index is not None:
-            if not df._index.is_unique:
+            if not df.index.is_unique:
                 raise ValueError(
                     "cannot reindex on an axis with duplicate labels"
                 )
             index = cudf.core.index.as_index(
-                index, name=getattr(index, "name", self._index.name)
+                index, name=getattr(index, "name", self.index.name)
             )
 
             idx_dtype_match = (df.index.nlevels == index.nlevels) and all(
@@ -3739,7 +3737,7 @@ def _reindex(
                         else name: col
                         for name, col in df._data.items()
                     },
-                    index=df._index,
+                    index=df.index,
                 )
                 df = lhs.join(rhs, how="left", sort=True)
                 # double-argsort to map back from sorted to unsorted positions
@@ -3915,7 +3913,7 @@ def round(self, decimals=0, how="half_even"):
                 multiindex=self._data.multiindex,
                 level_names=self._data.level_names,
             ),
-            index=self._index,
+            index=self.index,
         )
 
     def resample(
@@ -4267,7 +4265,7 @@ def _drop_na_rows(self, how="any", subset=None, thresh=None):
 
         return self._from_columns_like_self(
             libcudf.stream_compaction.drop_nulls(
-                [*self._index._data.columns, *data_columns],
+                [*self.index._data.columns, *data_columns],
                 how=how,
                 keys=self._positions_from_column_names(
                     subset, offset_by_index_columns=True
@@ -4275,7 +4273,7 @@ def _drop_na_rows(self, how="any", subset=None, thresh=None):
                 thresh=thresh,
             ),
             self._column_names,
-            self._index.names,
+            self.index.names,
         )
 
     def _apply_boolean_mask(self, boolean_mask: BooleanMask, keep_index=True):
@@ -4292,13 +4290,13 @@ def _apply_boolean_mask(self, boolean_mask: BooleanMask, keep_index=True):
             )
         return self._from_columns_like_self(
             libcudf.stream_compaction.apply_boolean_mask(
-                list(self._index._columns + self._columns)
+                list(self.index._columns + self._columns)
                 if keep_index
                 else list(self._columns),
                 boolean_mask.column,
             ),
             column_names=self._column_names,
-            index_names=self._index.names if keep_index else None,
+            index_names=self.index.names if keep_index else None,
         )
 
     def take(self, indices, axis=0):
@@ -4358,7 +4356,7 @@ def _reset_index(self, level, drop, col_level=0, col_fill=""):
                 )
             if not isinstance(level, (tuple, list)):
                 level = (level,)
-        _check_duplicate_level_names(level, self._index.names)
+        _check_duplicate_level_names(level, self.index.names)
 
         index = self.index._new_index_for_reset_index(level, self.index.name)
         if index is None:
@@ -4394,7 +4392,7 @@ def _first_or_last(
         self, offset, idx: int, op: Callable, side: str, slice_func: Callable
     ) -> "IndexedFrame":
         """Shared code path for ``first`` and ``last``."""
-        if not isinstance(self._index, cudf.core.index.DatetimeIndex):
+        if not isinstance(self.index, cudf.core.index.DatetimeIndex):
             raise TypeError("'first' only supports a DatetimeIndex index.")
         if not isinstance(offset, str):
             raise NotImplementedError(
@@ -4406,20 +4404,20 @@ def _first_or_last(
 
         pd_offset = pd.tseries.frequencies.to_offset(offset)
         to_search = op(
-            pd.Timestamp(self._index._column.element_indexing(idx)), pd_offset
+            pd.Timestamp(self.index._column.element_indexing(idx)), pd_offset
         )
         if (
             idx == 0
             and not isinstance(pd_offset, pd.tseries.offsets.Tick)
-            and pd_offset.is_on_offset(pd.Timestamp(self._index[0]))
+            and pd_offset.is_on_offset(pd.Timestamp(self.index[0]))
         ):
             # Special handle is required when the start time of the index
             # is on the end of the offset. See pandas gh29623 for detail.
             to_search = to_search - pd_offset.base
             return self.loc[:to_search]
-        needle = as_column(to_search, dtype=self._index.dtype)
+        needle = as_column(to_search, dtype=self.index.dtype)
         end_point = int(
-            self._index._column.searchsorted(
+            self.index._column.searchsorted(
                 needle, side=side
             ).element_indexing(0)
         )
@@ -4802,7 +4800,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
                     name: (col, None, False, None)
                     for name, col in self._data.items()
                 }
-                index = self._index
+                index = self.index
 
             data = self._apply_cupy_ufunc_to_operands(
                 ufunc, cupy_func, inputs, **kwargs
@@ -4880,7 +4878,7 @@ def repeat(self, repeats, axis=None):
         """
         res = self._from_columns_like_self(
             Frame._repeat(
-                [*self._index._data.columns, *self._columns], repeats, axis
+                [*self.index._data.columns, *self._columns], repeats, axis
             ),
             self._column_names,
             self._index_names,
@@ -5011,7 +5009,7 @@ def astype(
                 raise e
             return self
 
-        return self._from_data(data, index=self._index)
+        return self._from_data(data, index=self.index)
 
     @_cudf_nvtx_annotate
     def drop(
@@ -5220,8 +5218,7 @@ def drop(
                 columns = _get_host_unique(columns)
                 _drop_columns(dropped, columns, errors)
 
-            out._data = dropped._data
-            out._index = dropped._index
+            out._mimic_inplace(dropped, inplace=True)
 
         if not inplace:
             return out
@@ -5234,18 +5231,18 @@ def _explode(self, explode_column: Any, ignore_index: bool):
         # exploded and will be replaced with a `RangeIndex`.
         if not isinstance(self._data[explode_column].dtype, ListDtype):
             data = self._data.copy(deep=True)
-            idx = None if ignore_index else self._index.copy(deep=True)
+            idx = None if ignore_index else self.index.copy(deep=True)
             return self.__class__._from_data(data, index=idx)
 
         column_index = self._column_names.index(explode_column)
-        if not ignore_index and self._index is not None:
-            index_offset = self._index.nlevels
+        if not ignore_index and self.index is not None:
+            index_offset = self.index.nlevels
         else:
             index_offset = 0
 
         exploded = libcudf.lists.explode_outer(
             [
-                *(self._index._data.columns if not ignore_index else ()),
+                *(self.index._data.columns if not ignore_index else ()),
                 *self._columns,
             ],
             column_index + index_offset,
@@ -5292,7 +5289,7 @@ def tile(self, count):
         """
         return self._from_columns_like_self(
             libcudf.reshape.tile(
-                [*self._index._columns, *self._columns], count
+                [*self.index._columns, *self._columns], count
             ),
             column_names=self._column_names,
             index_names=self._index_names,
@@ -6273,7 +6270,7 @@ def rank(
 
         return self.__class__._from_data(
             dict(zip(source._column_names, result_columns)),
-            index=source._index,
+            index=source.index,
         ).astype(np.float64)
 
     def convert_dtypes(
@@ -6505,7 +6502,7 @@ def _is_series(obj):
     Checks if the `obj` is of type `cudf.Series`
     instead of checking for isinstance(obj, cudf.Series)
     """
-    return isinstance(obj, Frame) and obj.ndim == 1 and obj._index is not None
+    return isinstance(obj, Frame) and obj.ndim == 1 and obj.index is not None
 
 
 @_cudf_nvtx_annotate
@@ -6518,7 +6515,7 @@ def _drop_rows_by_labels(
     """Remove rows specified by `labels`.
 
     If `errors="raise"`, an error is raised if some items in `labels` do not
-    exist in `obj._index`.
+    exist in `obj.index`.
 
     Will raise if level(int) is greater or equal to index nlevels.
     """
@@ -6539,17 +6536,17 @@ def _drop_rows_by_labels(
         if isinstance(level, int):
             ilevel = level
         else:
-            ilevel = obj._index.names.index(level)
+            ilevel = obj.index.names.index(level)
 
         # 1. Merge Index df and data df along column axis:
-        # | id | ._index df | data column(s) |
-        idx_nlv = obj._index.nlevels
-        working_df = obj._index.to_frame(index=False)
+        # | id | .index df | data column(s) |
+        idx_nlv = obj.index.nlevels
+        working_df = obj.index.to_frame(index=False)
         working_df.columns = list(range(idx_nlv))
         for i, col in enumerate(obj._data):
             working_df[idx_nlv + i] = obj._data[col]
         # 2. Set `level` as common index:
-        # | level | ._index df w/o level | data column(s) |
+        # | level | .index df w/o level | data column(s) |
         working_df = working_df.set_index(level)
 
         # 3. Use "leftanti" join to drop
@@ -6560,11 +6557,11 @@ def _drop_rows_by_labels(
 
         # 4. Reconstruct original layout, and rename
         join_res._insert(
-            ilevel, name=join_res._index.name, value=join_res._index
+            ilevel, name=join_res.index.name, value=join_res.index
         )
 
         midx = cudf.MultiIndex.from_frame(
-            join_res.iloc[:, 0:idx_nlv], names=obj._index.names
+            join_res.iloc[:, 0:idx_nlv], names=obj.index.names
         )
 
         if isinstance(obj, cudf.Series):
@@ -6596,7 +6593,7 @@ def _drop_rows_by_labels(
         # Join changes the index to common type,
         # but we need to preserve the type of
         # index being returned, Hence this type-cast.
-        res._index = res.index.astype(obj.index.dtype)
+        res.index = res.index.astype(obj.index.dtype)
         return res
 
 
diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py
index 6a619945e75..05cbb4429b9 100644
--- a/python/cudf/cudf/core/join/_join_helpers.py
+++ b/python/cudf/cudf/core/join/_join_helpers.py
@@ -43,10 +43,10 @@ def set(self, obj: cudf.DataFrame, value: ColumnBase, validate=False):
 
 class _IndexIndexer(_Indexer):
     def get(self, obj: cudf.DataFrame) -> ColumnBase:
-        return obj._index._data[self.name]
+        return obj.index._data[self.name]
 
     def set(self, obj: cudf.DataFrame, value: ColumnBase, validate=False):
-        obj._index._data.set_by_label(self.name, value, validate=validate)
+        obj.index._data.set_by_label(self.name, value, validate=validate)
 
 
 def _match_join_keys(
diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py
index 1ef2915bc59..da999441ca3 100644
--- a/python/cudf/cudf/core/join/join.py
+++ b/python/cudf/cudf/core/join/join.py
@@ -373,10 +373,10 @@ def _merge_results(
         index: Optional[cudf.BaseIndex]
         if self._using_right_index:
             # right_index and left_on
-            index = left_result._index
+            index = left_result.index
         elif self._using_left_index:
             # left_index and right_on
-            index = right_result._index
+            index = right_result.index
         else:
             index = None
 
@@ -400,7 +400,7 @@ def _sort_result(self, result: cudf.DataFrame) -> cudf.DataFrame:
         # producing the input result.
         by: List[Any] = []
         if self._using_left_index and self._using_right_index:
-            by.extend(result._index._data.columns)
+            by.extend(result.index._data.columns)
         if not self._using_left_index:
             by.extend([result._data[col.name] for col in self._left_keys])
         if not self._using_right_index:
@@ -408,8 +408,8 @@ def _sort_result(self, result: cudf.DataFrame) -> cudf.DataFrame:
         if by:
             keep_index = self._using_left_index or self._using_right_index
             if keep_index:
-                to_sort = [*result._index._columns, *result._columns]
-                index_names = result._index.names
+                to_sort = [*result.index._columns, *result._columns]
+                index_names = result.index.names
             else:
                 to_sort = [*result._columns]
                 index_names = None
@@ -547,4 +547,4 @@ class MergeSemi(Merge):
 
     def _merge_results(self, lhs: cudf.DataFrame, rhs: cudf.DataFrame):
         # semi-join result includes only lhs columns
-        return lhs._data, lhs._index
+        return lhs._data, lhs.index
diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py
index 0b44ab58f30..d4772d5b4c2 100644
--- a/python/cudf/cudf/core/reshape.py
+++ b/python/cudf/cudf/core/reshape.py
@@ -836,7 +836,7 @@ def get_dummies(
                     dtype=dtype,
                 )
                 result_data.update(col_enc_data)
-            return cudf.DataFrame._from_data(result_data, index=df._index)
+            return cudf.DataFrame._from_data(result_data, index=df.index)
     else:
         ser = cudf.Series(df)
         unique = _get_unique(column=ser._column, dummy_na=dummy_na)
@@ -847,7 +847,7 @@ def get_dummies(
             prefix_sep=prefix_sep,
             dtype=dtype,
         )
-        return cudf.DataFrame._from_data(data, index=ser._index)
+        return cudf.DataFrame._from_data(data, index=ser.index)
 
 
 def _merge_sorted(
@@ -899,7 +899,7 @@ def _merge_sorted(
         raise ValueError("`by_index` and `ignore_index` cannot both be True")
 
     if by_index:
-        key_columns_indices = list(range(0, objs[0]._index.nlevels))
+        key_columns_indices = list(range(0, objs[0].index.nlevels))
     else:
         if keys is None:
             key_columns_indices = list(range(0, objs[0]._num_columns))
@@ -909,12 +909,12 @@ def _merge_sorted(
             ]
         if not ignore_index:
             key_columns_indices = [
-                idx + objs[0]._index.nlevels for idx in key_columns_indices
+                idx + objs[0].index.nlevels for idx in key_columns_indices
             ]
 
     columns = [
         [
-            *(obj._index._data.columns if not ignore_index else ()),
+            *(obj.index._data.columns if not ignore_index else ()),
             *obj._columns,
         ]
         for obj in objs
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index c7bc97edd68..41fbf269699 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -296,7 +296,7 @@ def __getitem__(self, arg: Any) -> Union[ScalarLike, DataFrameOrSeries]:
             result = self._frame.index._get_row_major(self._frame, row_arg)
             if (
                 isinstance(arg, tuple)
-                and len(arg) == self._frame._index.nlevels
+                and len(arg) == self._frame.index.nlevels
                 and not any(isinstance(x, slice) for x in arg)
             ):
                 result = result.iloc[0]
@@ -318,7 +318,7 @@ def __setitem__(self, key, value):
                 and not isinstance(self._frame.index, cudf.MultiIndex)
                 and is_scalar(value)
             ):
-                idx = self._frame._index
+                idx = self._frame.index
                 if isinstance(idx, cudf.RangeIndex):
                     if isinstance(key, int) and (key == idx[-1] + idx.step):
                         idx_copy = cudf.RangeIndex(
@@ -682,7 +682,7 @@ def _from_data(
 
     @_cudf_nvtx_annotate
     def __contains__(self, item):
-        return item in self._index
+        return item in self.index
 
     @classmethod
     @_cudf_nvtx_annotate
@@ -832,7 +832,7 @@ def hasnans(self):
     def serialize(self):
         header, frames = super().serialize()
 
-        header["index"], index_frames = self._index.serialize()
+        header["index"], index_frames = self.index.serialize()
         header["index_frame_count"] = len(index_frames)
         # For backwards compatibility with older versions of cuDF, index
         # columns are placed before data columns.
@@ -850,7 +850,7 @@ def deserialize(cls, header, frames):
 
         idx_typ = pickle.loads(header["index"]["type-serialized"])
         index = idx_typ.deserialize(header["index"], frames[:index_nframes])
-        obj._index = index
+        obj.index = index
 
         return obj
 
@@ -995,7 +995,7 @@ def reindex(self, *args, **kwargs):
                     "'index' passed as both positional and keyword argument"
                 )
         else:
-            index = kwargs.get("index", self._index)
+            index = kwargs.get("index", self.index)
 
         name = self.name or 0
         series = self._reindex(
@@ -1140,7 +1140,7 @@ def to_frame(self, name=None):
     @_cudf_nvtx_annotate
     def memory_usage(self, index=True, deep=False):
         return self._column.memory_usage + (
-            self._index.memory_usage() if index else 0
+            self.index.memory_usage() if index else 0
         )
 
     @_cudf_nvtx_annotate
@@ -1506,7 +1506,7 @@ def _make_operands_and_index_for_binop(
             can_use_self_column_name = False
 
         operands = lhs._make_operands_for_binop(other, fill_value, reflect)
-        return operands, lhs._index, can_use_self_column_name
+        return operands, lhs.index, can_use_self_column_name
 
     @copy_docstring(CategoricalAccessor)  # type: ignore
     @property
@@ -1917,7 +1917,7 @@ def between(self, left, right, inclusive="both") -> Series:
                 "Inclusive has to be either string of 'both', "
                 "'left', 'right', or 'neither'."
             )
-        return self._from_data({self.name: lmask & rmask}, self._index)
+        return self._from_data({self.name: lmask & rmask}, self.index)
 
     @_cudf_nvtx_annotate
     def all(self, axis=0, bool_only=None, skipna=True, **kwargs):
@@ -3119,7 +3119,7 @@ def value_counts(
                 # TODO: Remove this workaround once `observed`
                 # parameter support is added to `groupby`
                 res = res.reindex(self.dtype.categories).fillna(0)
-                res._index = res._index.astype(self.dtype)
+                res.index = res.index.astype(self.dtype)
 
         res.index.name = self.name
 
@@ -3927,7 +3927,7 @@ def microsecond(self):
                 * cudf.Scalar(1000, dtype="int32")
             )
             + self.series._column.get_dt_field("microsecond"),
-            index=self.series._index,
+            index=self.series.index,
             name=self.series.name,
         )
 
@@ -4161,7 +4161,7 @@ def is_leap_year(self):
         res = libcudf.datetime.is_leap_year(self.series._column).fillna(False)
         return Series._from_data(
             ColumnAccessor({None: res}),
-            index=self.series._index,
+            index=self.series.index,
             name=self.series.name,
         )
 
@@ -4195,7 +4195,7 @@ def quarter(self):
         )
         return Series._from_data(
             {None: res},
-            index=self.series._index,
+            index=self.series.index,
             name=self.series.name,
         )
 
@@ -4299,7 +4299,7 @@ def days_in_month(self):
         res = libcudf.datetime.days_in_month(self.series._column)
         return Series._from_data(
             ColumnAccessor({None: res}),
-            index=self.series._index,
+            index=self.series.index,
             name=self.series.name,
         )
 
@@ -4345,7 +4345,7 @@ def is_month_end(self):
         last_day = libcudf.datetime.last_day_of_month(self.series._column)
         last_day = Series._from_data(
             ColumnAccessor({None: last_day}),
-            index=self.series._index,
+            index=self.series.index,
             name=self.series.name,
         )
         return (self.day == last_day.dt.day).fillna(False)
@@ -4395,7 +4395,7 @@ def is_quarter_start(self):
         result = ((day == cudf.Scalar(1)) & first_month).fillna(False)
         return Series._from_data(
             {None: result},
-            index=self.series._index,
+            index=self.series.index,
             name=self.series.name,
         )
 
@@ -4446,7 +4446,7 @@ def is_quarter_end(self):
         result = ((day == last_day) & last_month).fillna(False)
         return Series._from_data(
             {None: result},
-            index=self.series._index,
+            index=self.series.index,
             name=self.series.name,
         )
 
@@ -4481,7 +4481,7 @@ def is_year_start(self):
         ) == cudf.Scalar(1)
         return Series._from_data(
             {None: outcol.fillna(False)},
-            index=self.series._index,
+            index=self.series.index,
             name=self.series.name,
         )
 
@@ -4520,7 +4520,7 @@ def is_year_end(self):
         result = result.fillna(False)
         return Series._from_data(
             {None: result},
-            index=self.series._index,
+            index=self.series.index,
             name=self.series.name,
         )
 
@@ -4528,7 +4528,7 @@ def is_year_end(self):
     def _get_dt_field(self, field):
         out_column = self.series._column.get_dt_field(field)
         return Series(
-            data=out_column, index=self.series._index, name=self.series.name
+            data=out_column, index=self.series.index, name=self.series.name
         )
 
     @_cudf_nvtx_annotate
@@ -4565,7 +4565,7 @@ def ceil(self, freq):
         out_column = self.series._column.ceil(freq)
 
         return Series._from_data(
-            data={self.series.name: out_column}, index=self.series._index
+            data={self.series.name: out_column}, index=self.series.index
         )
 
     @_cudf_nvtx_annotate
@@ -4602,7 +4602,7 @@ def floor(self, freq):
         out_column = self.series._column.floor(freq)
 
         return Series._from_data(
-            data={self.series.name: out_column}, index=self.series._index
+            data={self.series.name: out_column}, index=self.series.index
         )
 
     @_cudf_nvtx_annotate
@@ -4642,7 +4642,7 @@ def round(self, freq):
         out_column = self.series._column.round(freq)
 
         return Series._from_data(
-            data={self.series.name: out_column}, index=self.series._index
+            data={self.series.name: out_column}, index=self.series.index
         )
 
     @_cudf_nvtx_annotate
@@ -4724,7 +4724,7 @@ def strftime(self, date_format, *args, **kwargs):
             dtype="str", format=date_format
         )
         return Series(
-            data=str_col, index=self.series._index, name=self.series.name
+            data=str_col, index=self.series.index, name=self.series.name
         )
 
     @copy_docstring(DatetimeIndex.tz_localize)
@@ -4739,7 +4739,7 @@ def tz_localize(
         )
         return Series._from_data(
             data={self.series.name: result_col},
-            index=self.series._index,
+            index=self.series.index,
         )
 
     @copy_docstring(DatetimeIndex.tz_convert)
@@ -4755,7 +4755,7 @@ def tz_convert(self, tz: str | None):
         """
         result_col = self.series._column.tz_convert(tz)
         return Series._from_data(
-            {self.series.name: result_col}, index=self.series._index
+            {self.series.name: result_col}, index=self.series.index
         )
 
 
@@ -4993,13 +4993,13 @@ def components(self):
         3      0      0       35       35           656             0            0
         4     37     13       12       14           234             0            0
         """  # noqa: E501
-        return self.series._column.components(index=self.series._index)
+        return self.series._column.components(index=self.series.index)
 
     @_cudf_nvtx_annotate
     def _get_td_field(self, field):
         out_column = getattr(self.series._column, field)
         return Series(
-            data=out_column, index=self.series._index, name=self.series.name
+            data=out_column, index=self.series.index, name=self.series.name
         )
 
 
diff --git a/python/cudf/cudf/tests/test_dlpack.py b/python/cudf/cudf/tests/test_dlpack.py
index 6e34817c4fd..aafe920d3a1 100644
--- a/python/cudf/cudf/tests/test_dlpack.py
+++ b/python/cudf/cudf/tests/test_dlpack.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2022, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 
 import itertools
 from contextlib import ExitStack as does_not_raise
@@ -201,12 +201,7 @@ def test_to_dlpack_mixed_dtypes():
     "shape",
     [
         (0, 3),
-        pytest.param(
-            (3, 0),
-            marks=pytest.mark.xfail(
-                reason="Index information not available via from_dlpack"
-            ),
-        ),
+        (3, 0),
         (0, 0),
     ],
 )

From b4ce6e4815dbf1af533312a2b0350303a7db785d Mon Sep 17 00:00:00 2001
From: Ed Seidl <etseidl@users.noreply.github.com>
Date: Wed, 22 May 2024 13:20:10 -0700
Subject: [PATCH 251/842] Expose some Parquet per-column configuration options
 via the python API (#15613)

Several recent PRs (#15081, #15411, #15600) added the ability to control some aspects of Parquet file writing on a per-column basis. During discussion of #15081 it was [suggested](https://github.com/rapidsai/cudf/pull/15081#issuecomment-1979731930) that these options be exposed by cuDF-python in a manner similar to pyarrow. This PR adds the ability to control per-column encoding, compression, binary output, and fixed-length data width, using fully qualified Parquet column names. For example, given a cuDF table with an integer column 'a', and a `list<int32>` column 'b', the fully qualified column names would be 'a' and 'b.list.element'.

Addresses "Add cuDF-python API support for specifying encodings" task in #13501.

Authors:
  - Ed Seidl (https://github.com/etseidl)
  - Vukasin Milovanovic (https://github.com/vuule)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15613
---
 python/cudf/cudf/_lib/parquet.pyx             | 74 +++++++++++++++++-
 .../cudf/_lib/pylibcudf/libcudf/io/types.pxd  | 18 ++++-
 python/cudf/cudf/core/dataframe.py            |  8 ++
 python/cudf/cudf/io/parquet.py                | 64 ++++++++++++++++
 python/cudf/cudf/tests/test_parquet.py        | 76 +++++++++++++++++++
 python/cudf/cudf/utils/ioutils.py             | 16 ++++
 6 files changed, 252 insertions(+), 4 deletions(-)

diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
index 70acb7f917b..f0eef9be124 100644
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ b/python/cudf/cudf/_lib/parquet.pyx
@@ -408,6 +408,10 @@ def write_parquet(
     object force_nullable_schema=False,
     header_version="1.0",
     use_dictionary=True,
+    object skip_compression=None,
+    object column_encoding=None,
+    object column_type_length=None,
+    object output_as_binary=None,
 ):
     """
     Cython function to call into libcudf API, see `write_parquet`.
@@ -458,7 +462,12 @@ def write_parquet(
         _set_col_metadata(
             table[name]._column,
             tbl_meta.column_metadata[i],
-            force_nullable_schema
+            force_nullable_schema,
+            None,
+            skip_compression,
+            column_encoding,
+            column_type_length,
+            output_as_binary
         )
 
     cdef map[string, string] tmp_user_data
@@ -810,16 +819,62 @@ cdef cudf_io_types.compression_type _get_comp_type(object compression):
         raise ValueError("Unsupported `compression` type")
 
 
+cdef cudf_io_types.column_encoding _get_encoding_type(object encoding):
+    if encoding is None:
+        return cudf_io_types.column_encoding.USE_DEFAULT
+
+    enc = str(encoding).upper()
+    if enc == "PLAIN":
+        return cudf_io_types.column_encoding.PLAIN
+    elif enc == "DICTIONARY":
+        return cudf_io_types.column_encoding.DICTIONARY
+    elif enc == "DELTA_BINARY_PACKED":
+        return cudf_io_types.column_encoding.DELTA_BINARY_PACKED
+    elif enc == "DELTA_LENGTH_BYTE_ARRAY":
+        return cudf_io_types.column_encoding.DELTA_LENGTH_BYTE_ARRAY
+    elif enc == "DELTA_BYTE_ARRAY":
+        return cudf_io_types.column_encoding.DELTA_BYTE_ARRAY
+    elif enc == "BYTE_STREAM_SPLIT":
+        return cudf_io_types.column_encoding.BYTE_STREAM_SPLIT
+    elif enc == "USE_DEFAULT":
+        return cudf_io_types.column_encoding.USE_DEFAULT
+    else:
+        raise ValueError("Unsupported `column_encoding` type")
+
+
 cdef _set_col_metadata(
     Column col,
     column_in_metadata& col_meta,
     bool force_nullable_schema=False,
+    str path=None,
+    object skip_compression=None,
+    object column_encoding=None,
+    object column_type_length=None,
+    object output_as_binary=None,
 ):
+    need_path = (skip_compression is not None or column_encoding is not None or
+                 column_type_length is not None or output_as_binary is not None)
+    name = col_meta.get_name().decode('UTF-8') if need_path else None
+    full_path = path + "." + name if path is not None else name
+
     if force_nullable_schema:
         # Only set nullability if `force_nullable_schema`
         # is true.
         col_meta.set_nullability(True)
 
+    if skip_compression is not None and full_path in skip_compression:
+        col_meta.set_skip_compression(True)
+
+    if column_encoding is not None and full_path in column_encoding:
+        col_meta.set_encoding(_get_encoding_type(column_encoding[full_path]))
+
+    if column_type_length is not None and full_path in column_type_length:
+        col_meta.set_output_as_binary(True)
+        col_meta.set_type_length(column_type_length[full_path])
+
+    if output_as_binary is not None and full_path in output_as_binary:
+        col_meta.set_output_as_binary(True)
+
     if isinstance(col.dtype, cudf.StructDtype):
         for i, (child_col, name) in enumerate(
             zip(col.children, list(col.dtype.fields))
@@ -828,13 +883,26 @@ cdef _set_col_metadata(
             _set_col_metadata(
                 child_col,
                 col_meta.child(i),
-                force_nullable_schema
+                force_nullable_schema,
+                full_path,
+                skip_compression,
+                column_encoding,
+                column_type_length,
+                output_as_binary
             )
     elif isinstance(col.dtype, cudf.ListDtype):
+        if full_path is not None:
+            full_path = full_path + ".list"
+            col_meta.child(1).set_name("element".encode())
         _set_col_metadata(
             col.children[1],
             col_meta.child(1),
-            force_nullable_schema
+            force_nullable_schema,
+            full_path,
+            skip_compression,
+            column_encoding,
+            column_type_length,
+            output_as_binary
         )
     elif isinstance(col.dtype, cudf.core.dtypes.DecimalDtype):
         col_meta.set_decimal_precision(col.dtype.precision)
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd
index 4725c4e5937..38fae1df1e5 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd
@@ -1,6 +1,6 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
-from libc.stdint cimport uint8_t
+from libc.stdint cimport int32_t, uint8_t
 from libcpp cimport bool
 from libcpp.map cimport map
 from libcpp.memory cimport shared_ptr, unique_ptr
@@ -57,6 +57,19 @@ cdef extern from "cudf/io/types.hpp" \
         ADAPTIVE = 1,
         ALWAYS = 2,
 
+    cdef extern from "cudf/io/types.hpp" namespace "cudf::io" nogil:
+        cpdef enum class column_encoding:
+            USE_DEFAULT = -1
+            DICTIONARY = 0
+            PLAIN = 1
+            DELTA_BINARY_PACKED = 2
+            DELTA_LENGTH_BYTE_ARRAY =3
+            DELTA_BYTE_ARRAY = 4
+            BYTE_STREAM_SPLIT = 5
+            DIRECT = 6
+            DIRECT_V2 = 7
+            DICTIONARY_V2 = 8
+
     cdef cppclass column_name_info:
         string name
         vector[column_name_info] children
@@ -81,6 +94,9 @@ cdef extern from "cudf/io/types.hpp" \
         column_in_metadata& set_decimal_precision(uint8_t precision)
         column_in_metadata& child(size_type i)
         column_in_metadata& set_output_as_binary(bool binary)
+        column_in_metadata& set_type_length(int32_t type_length)
+        column_in_metadata& set_skip_compression(bool skip)
+        column_in_metadata& set_encoding(column_encoding enc)
         string get_name()
 
     cdef cppclass table_input_metadata:
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 9f3f756a1e7..1f530aa3108 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -6707,6 +6707,10 @@ def to_parquet(
         return_metadata=False,
         use_dictionary=True,
         header_version="1.0",
+        skip_compression=None,
+        column_encoding=None,
+        column_type_length=None,
+        output_as_binary=None,
         *args,
         **kwargs,
     ):
@@ -6733,6 +6737,10 @@ def to_parquet(
             return_metadata=return_metadata,
             use_dictionary=use_dictionary,
             header_version=header_version,
+            skip_compression=skip_compression,
+            column_encoding=column_encoding,
+            column_type_length=column_type_length,
+            output_as_binary=output_as_binary,
             *args,
             **kwargs,
         )
diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
index a6c67d22af7..dbdb2093b72 100644
--- a/python/cudf/cudf/io/parquet.py
+++ b/python/cudf/cudf/io/parquet.py
@@ -69,6 +69,10 @@ def _write_parquet(
     force_nullable_schema=False,
     header_version="1.0",
     use_dictionary=True,
+    skip_compression=None,
+    column_encoding=None,
+    column_type_length=None,
+    output_as_binary=None,
 ):
     if is_list_like(paths) and len(paths) > 1:
         if partitions_info is None:
@@ -102,6 +106,10 @@ def _write_parquet(
         "force_nullable_schema": force_nullable_schema,
         "header_version": header_version,
         "use_dictionary": use_dictionary,
+        "skip_compression": skip_compression,
+        "column_encoding": column_encoding,
+        "column_type_length": column_type_length,
+        "output_as_binary": output_as_binary,
     }
     if all(ioutils.is_fsspec_open_file(buf) for buf in paths_or_bufs):
         with ExitStack() as stack:
@@ -140,6 +148,12 @@ def write_to_dataset(
     max_page_size_rows=None,
     storage_options=None,
     force_nullable_schema=False,
+    header_version="1.0",
+    use_dictionary=True,
+    skip_compression=None,
+    column_encoding=None,
+    column_type_length=None,
+    output_as_binary=None,
 ):
     """Wraps `to_parquet` to write partitioned Parquet datasets.
     For each combination of partition group and value,
@@ -204,6 +218,30 @@ def write_to_dataset(
         If True, writes all columns as `null` in schema.
         If False, columns are written as `null` if they contain null values,
         otherwise as `not null`.
+    header_version : {{'1.0', '2.0'}}, default "1.0"
+        Controls whether to use version 1.0 or version 2.0 page headers when
+        encoding. Version 1.0 is more portable, but version 2.0 enables the
+        use of newer encoding schemes.
+    force_nullable_schema : bool, default False.
+        If True, writes all columns as `null` in schema.
+        If False, columns are written as `null` if they contain null values,
+        otherwise as `not null`.
+    skip_compression : set, optional, default None
+        If a column name is present in the set, that column will not be compressed,
+        regardless of the ``compression`` setting.
+    column_encoding : dict, optional, default None
+        Sets the page encoding to use on a per-column basis. The key is a column
+        name, and the value is one of: 'PLAIN', 'DICTIONARY', 'DELTA_BINARY_PACKED',
+        'DELTA_LENGTH_BYTE_ARRAY', 'DELTA_BYTE_ARRAY', 'BYTE_STREAM_SPLIT', or
+        'USE_DEFAULT'.
+    column_type_length : dict, optional, default None
+        Specifies the width in bytes of ``FIXED_LEN_BYTE_ARRAY`` column elements.
+        The key is a column name and the value is an integer. The named column
+        will be output as unannotated binary (i.e. the column will behave as if
+        ``output_as_binary`` was set).
+    output_as_binary : set, optional, default None
+        If a column name is present in the set, that column will be output as
+        unannotated binary, rather than the default 'UTF-8'.
     """
 
     fs = ioutils._ensure_filesystem(fs, root_path, storage_options)
@@ -241,6 +279,12 @@ def write_to_dataset(
             max_page_size_bytes=max_page_size_bytes,
             max_page_size_rows=max_page_size_rows,
             force_nullable_schema=force_nullable_schema,
+            header_version=header_version,
+            use_dictionary=use_dictionary,
+            skip_compression=skip_compression,
+            column_encoding=column_encoding,
+            column_type_length=column_type_length,
+            output_as_binary=output_as_binary,
         )
 
     else:
@@ -262,6 +306,12 @@ def write_to_dataset(
             max_page_size_bytes=max_page_size_bytes,
             max_page_size_rows=max_page_size_rows,
             force_nullable_schema=force_nullable_schema,
+            header_version=header_version,
+            use_dictionary=use_dictionary,
+            skip_compression=skip_compression,
+            column_encoding=column_encoding,
+            column_type_length=column_type_length,
+            output_as_binary=output_as_binary,
         )
 
     return metadata
@@ -906,6 +956,10 @@ def to_parquet(
     force_nullable_schema=False,
     header_version="1.0",
     use_dictionary=True,
+    skip_compression=None,
+    column_encoding=None,
+    column_type_length=None,
+    output_as_binary=None,
     *args,
     **kwargs,
 ):
@@ -955,6 +1009,12 @@ def to_parquet(
                 return_metadata=return_metadata,
                 storage_options=storage_options,
                 force_nullable_schema=force_nullable_schema,
+                header_version=header_version,
+                use_dictionary=use_dictionary,
+                skip_compression=skip_compression,
+                column_encoding=column_encoding,
+                column_type_length=column_type_length,
+                output_as_binary=output_as_binary,
             )
 
         partition_info = (
@@ -983,6 +1043,10 @@ def to_parquet(
             force_nullable_schema=force_nullable_schema,
             header_version=header_version,
             use_dictionary=use_dictionary,
+            skip_compression=skip_compression,
+            column_encoding=column_encoding,
+            column_type_length=column_type_length,
+            output_as_binary=output_as_binary,
         )
 
     else:
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index b2896d55b80..e32fdacd8d6 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -2870,6 +2870,82 @@ def flba(i):
     assert_eq(expect, got)
 
 
+def test_parquet_flba_round_trip(tmpdir):
+    def flba(i):
+        hasher = hashlib.sha256()
+        hasher.update(i.to_bytes(4, "little"))
+        return hasher.digest()
+
+    # use pyarrow to write table of fixed_len_byte_array
+    num_rows = 200
+    data = pa.array([flba(i) for i in range(num_rows)], type=pa.binary(32))
+    padf = pa.Table.from_arrays([data], names=["flba"])
+    padf_fname = tmpdir.join("padf.parquet")
+    pq.write_table(padf, padf_fname)
+
+    # round trip data with cudf
+    cdf = cudf.read_parquet(padf_fname)
+    cdf_fname = tmpdir.join("cdf.parquet")
+    cdf.to_parquet(cdf_fname, column_type_length={"flba": 32})
+
+    # now read back in with pyarrow to test it was written properly by cudf
+    padf2 = pq.read_table(padf_fname)
+    padf3 = pq.read_table(cdf_fname)
+    assert_eq(padf2, padf3)
+    assert_eq(padf2.schema[0].type, padf3.schema[0].type)
+
+
+@pytest.mark.parametrize(
+    "encoding",
+    [
+        "PLAIN",
+        "DICTIONARY",
+        "DELTA_BINARY_PACKED",
+        "BYTE_STREAM_SPLIT",
+        "USE_DEFAULT",
+    ],
+)
+def test_per_column_options(tmpdir, encoding):
+    pdf = pd.DataFrame({"ilist": [[1, 2, 3, 1, 2, 3]], "i1": [1]})
+    cdf = cudf.from_pandas(pdf)
+    fname = tmpdir.join("ilist.parquet")
+    cdf.to_parquet(
+        fname,
+        column_encoding={"ilist.list.element": encoding},
+        compression="SNAPPY",
+        skip_compression={"ilist.list.element"},
+    )
+    # DICTIONARY and USE_DEFAULT should both result in a PLAIN_DICTIONARY encoding in parquet
+    encoding_name = (
+        "PLAIN_DICTIONARY"
+        if encoding == "DICTIONARY" or encoding == "USE_DEFAULT"
+        else encoding
+    )
+    pf = pq.ParquetFile(fname)
+    fmd = pf.metadata
+    assert encoding_name in fmd.row_group(0).column(0).encodings
+    assert fmd.row_group(0).column(0).compression == "UNCOMPRESSED"
+    assert fmd.row_group(0).column(1).compression == "SNAPPY"
+
+
+@pytest.mark.parametrize(
+    "encoding",
+    ["DELTA_LENGTH_BYTE_ARRAY", "DELTA_BYTE_ARRAY"],
+)
+def test_per_column_options_string_col(tmpdir, encoding):
+    pdf = pd.DataFrame({"s": ["a string"], "i1": [1]})
+    cdf = cudf.from_pandas(pdf)
+    fname = tmpdir.join("strcol.parquet")
+    cdf.to_parquet(
+        fname,
+        column_encoding={"s": encoding},
+        compression="SNAPPY",
+    )
+    pf = pq.ParquetFile(fname)
+    fmd = pf.metadata
+    assert encoding in fmd.row_group(0).column(0).encodings
+
+
 def test_parquet_reader_rle_boolean(datadir):
     fname = datadir / "rle_boolean_encoding.parquet"
 
diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
index dd9b44c5a53..1366a0b8e84 100644
--- a/python/cudf/cudf/utils/ioutils.py
+++ b/python/cudf/cudf/utils/ioutils.py
@@ -306,6 +306,22 @@
     If True, writes all columns as `null` in schema.
     If False, columns are written as `null` if they contain null values,
     otherwise as `not null`.
+skip_compression : set, optional, default None
+    If a column name is present in the set, that column will not be compressed,
+    regardless of the ``compression`` setting.
+column_encoding : dict, optional, default None
+    Sets the page encoding to use on a per-column basis. The key is a column
+    name, and the value is one of: 'PLAIN', 'DICTIONARY', 'DELTA_BINARY_PACKED',
+    'DELTA_LENGTH_BYTE_ARRAY', 'DELTA_BYTE_ARRAY', 'BYTE_STREAM_SPLIT', or
+    'USE_DEFAULT'.
+column_type_length : dict, optional, default None
+    Specifies the width in bytes of ``FIXED_LEN_BYTE_ARRAY`` column elements.
+    The key is a column name and the value is an integer. The named column
+    will be output as unannotated binary (i.e. the column will behave as if
+    ``output_as_binary`` was set).
+output_as_binary : set, optional, default None
+    If a column name is present in the set, that column will be output as
+    unannotated binary, rather than the default 'UTF-8'.
 **kwargs
     Additional parameters will be passed to execution engines other
     than ``cudf``.

From a5f6fa3674ed91713adf390954fb7234618201fa Mon Sep 17 00:00:00 2001
From: Mohamed Thabet <thabetx@gmail.com>
Date: Wed, 22 May 2024 23:20:58 +0300
Subject: [PATCH 252/842] Fix spaces around CSV quoted strings (#15727)

This PR adds an option to CSV parsing to detect quotes even if they are surrounded by whitespaces.

Current behavior when `options.keepquotes == false`:
- `"A"` ->  `A`
- `  "A"  ` -> `  "A"  ` (The spaces around the 'A' are not removed and the quotes are kept)

New behavior after enabling the new option:
- `"A"` -> `A`
- `  "A"  ` -> `A`

The new option is false by default to avoid breaking any code that relied on the old behavior.

Closes #13892.

Authors:
  - Mohamed Thabet (https://github.com/thabetx)
  - Shruti Shivakumar (https://github.com/shrshi)

Approvers:
  - Shruti Shivakumar (https://github.com/shrshi)
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/15727
---
 cpp/include/cudf/io/csv.hpp                   | 35 ++++++++++++++++
 cpp/src/io/csv/csv_gpu.cu                     | 16 ++++++--
 cpp/src/io/csv/reader_impl.cu                 |  6 ++-
 cpp/src/io/utilities/parsing_utils.cuh        |  3 ++
 cpp/tests/io/csv_test.cpp                     | 41 +++++++++++++++++++
 .../cudf/_lib/pylibcudf/libcudf/io/csv.pxd    |  3 ++
 6 files changed, 99 insertions(+), 5 deletions(-)

diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp
index fdceda40e92..a20f75cecd7 100644
--- a/cpp/include/cudf/io/csv.hpp
+++ b/cpp/include/cudf/io/csv.hpp
@@ -106,6 +106,9 @@ class csv_reader_options {
   char _quotechar = '"';
   // Whether a quote inside a value is double-quoted
   bool _doublequote = true;
+  // Whether to detect quotes surrounded by spaces e.g. `   "data"   `. This flag has no effect when
+  // _doublequote is true
+  bool _detect_whitespace_around_quotes = false;
   // Names of columns to read as datetime
   std::vector<std::string> _parse_dates_names;
   // Indexes of columns to read as datetime
@@ -375,6 +378,17 @@ class csv_reader_options {
    */
   [[nodiscard]] bool is_enabled_doublequote() const { return _doublequote; }
 
+  /**
+   * @brief Whether to detect quotes surrounded by spaces e.g. `   "data"   `. This flag has no
+   * effect when _doublequote is true
+   *
+   * @return `true` if detect_whitespace_around_quotes is enabled
+   */
+  [[nodiscard]] bool is_enabled_detect_whitespace_around_quotes() const
+  {
+    return _detect_whitespace_around_quotes;
+  }
+
   /**
    * @brief Returns names of columns to read as datetime.
    *
@@ -698,6 +712,14 @@ class csv_reader_options {
    */
   void enable_doublequote(bool val) { _doublequote = val; }
 
+  /**
+   * @brief Sets whether to detect quotes surrounded by spaces e.g. `   "data"   `. This flag has no
+   * effect when _doublequote is true
+   *
+   * @param val Boolean value to enable/disable
+   */
+  void enable_detect_whitespace_around_quotes(bool val) { _detect_whitespace_around_quotes = val; }
+
   /**
    * @brief Sets names of columns to read as datetime.
    *
@@ -1126,6 +1148,19 @@ class csv_reader_options_builder {
     return *this;
   }
 
+  /**
+   * @brief Sets whether to detect quotes surrounded by spaces e.g. `   "data"   `. This flag has no
+   * effect when _doublequote is true
+   *
+   * @param val Boolean value to enable/disable
+   * @return this for chaining
+   */
+  csv_reader_options_builder& detect_whitespace_around_quotes(bool val)
+  {
+    options._detect_whitespace_around_quotes = val;
+    return *this;
+  }
+
   /**
    * @brief Sets names of columns to read as datetime.
    *
diff --git a/cpp/src/io/csv/csv_gpu.cu b/cpp/src/io/csv/csv_gpu.cu
index 9c186f161b3..7a05d0aebaf 100644
--- a/cpp/src/io/csv/csv_gpu.cu
+++ b/cpp/src/io/csv/csv_gpu.cu
@@ -351,9 +351,19 @@ CUDF_KERNEL void __launch_bounds__(csvparse_block_dim)
         if (dtypes[actual_col].id() == cudf::type_id::STRING) {
           auto end = next_delimiter;
           if (not options.keepquotes) {
-            if ((*field_start == options.quotechar) && (*(end - 1) == options.quotechar)) {
-              ++field_start;
-              --end;
+            if (not options.detect_whitespace_around_quotes) {
+              if ((*field_start == options.quotechar) && (*(end - 1) == options.quotechar)) {
+                ++field_start;
+                --end;
+              }
+            } else {
+              // If the string is quoted, whitespace around the quotes get removed as well
+              auto const trimmed_field = trim_whitespaces(field_start, end);
+              if ((*trimmed_field.first == options.quotechar) &&
+                  (*(trimmed_field.second - 1) == options.quotechar)) {
+                field_start = trimmed_field.first + 1;
+                end         = trimmed_field.second - 1;
+              }
             }
           }
           auto str_list = static_cast<std::pair<char const*, size_t>*>(columns[actual_col]);
diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu
index 67c1194578a..5dee0c17a33 100644
--- a/cpp/src/io/csv/reader_impl.cu
+++ b/cpp/src/io/csv/reader_impl.cu
@@ -951,8 +951,10 @@ parse_options make_parse_options(csv_reader_options const& reader_opts,
   parse_opts.terminator = reader_opts.get_lineterminator();
 
   if (reader_opts.get_quotechar() != '\0' && reader_opts.get_quoting() != quote_style::NONE) {
-    parse_opts.quotechar   = reader_opts.get_quotechar();
-    parse_opts.keepquotes  = false;
+    parse_opts.quotechar  = reader_opts.get_quotechar();
+    parse_opts.keepquotes = false;
+    parse_opts.detect_whitespace_around_quotes =
+      reader_opts.is_enabled_detect_whitespace_around_quotes();
     parse_opts.doublequote = reader_opts.is_enabled_doublequote();
   } else {
     parse_opts.quotechar   = '\0';
diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh
index 06a0a63c0ab..faee05541cc 100644
--- a/cpp/src/io/utilities/parsing_utils.cuh
+++ b/cpp/src/io/utilities/parsing_utils.cuh
@@ -63,6 +63,7 @@ struct parse_options_view {
   char thousands;
   char comment;
   bool keepquotes;
+  bool detect_whitespace_around_quotes;
   bool doublequote;
   bool dayfirst;
   bool skipblanklines;
@@ -80,6 +81,7 @@ struct parse_options {
   char thousands;
   char comment;
   bool keepquotes;
+  bool detect_whitespace_around_quotes;
   bool doublequote;
   bool dayfirst;
   bool skipblanklines;
@@ -105,6 +107,7 @@ struct parse_options {
             thousands,
             comment,
             keepquotes,
+            detect_whitespace_around_quotes,
             doublequote,
             dayfirst,
             skipblanklines,
diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp
index 8e3ecd817e4..880dc911954 100644
--- a/cpp/tests/io/csv_test.cpp
+++ b/cpp/tests/io/csv_test.cpp
@@ -1018,6 +1018,47 @@ TEST_F(CsvReaderTest, StringsQuotesIgnored)
     view.column(1));
 }
 
+TEST_F(CsvReaderTest, StringsQuotesWhitespace)
+{
+  std::vector<std::string> names{"line", "verse"};
+
+  auto filepath = temp_env->get_temp_dir() + "StringsQuotesIgnored.csv";
+  {
+    std::ofstream outfile(filepath, std::ofstream::out);
+    outfile << names[0] << ',' << names[1] << '\n';
+    outfile << "A,a" << '\n';              // unquoted no whitespace
+    outfile << "    B,b" << '\n';          // unquoted leading whitespace
+    outfile << "C    ,c" << '\n';          // unquoted trailing whitespace
+    outfile << "    D    ,d" << '\n';      // unquoted leading and trailing whitespace
+    outfile << "\"E\",e" << '\n';          // quoted no whitespace
+    outfile << "\"F\"    ,f" << '\n';      // quoted trailing whitespace
+    outfile << "    \"G\",g" << '\n';      // quoted leading whitespace
+    outfile << "    \"H\"    ,h" << '\n';  // quoted leading and trailing whitespace
+    outfile << "    \"    I    \"    ,i"
+            << '\n';  // quoted leading and trailing whitespace with spaces inside quotes
+  }
+
+  cudf::io::csv_reader_options in_opts =
+    cudf::io::csv_reader_options::builder(cudf::io::source_info{filepath})
+      .names(names)
+      .dtypes(std::vector<data_type>{dtype<cudf::string_view>(), dtype<cudf::string_view>()})
+      .quoting(cudf::io::quote_style::ALL)
+      .doublequote(false)
+      .detect_whitespace_around_quotes(true);
+  auto result = cudf::io::read_csv(in_opts);
+
+  auto const view = result.tbl->view();
+  ASSERT_EQ(2, view.num_columns());
+  ASSERT_EQ(type_id::STRING, view.column(0).type().id());
+  ASSERT_EQ(type_id::STRING, view.column(1).type().id());
+
+  expect_column_data_equal(
+    std::vector<std::string>{"A", "    B", "C    ", "    D    ", "E", "F", "G", "H", "    I    "},
+    view.column(0));
+  expect_column_data_equal(std::vector<std::string>{"a", "b", "c", "d", "e", "f", "g", "h", "i"},
+                           view.column(1));
+}
+
 TEST_F(CsvReaderTest, SkiprowsNrows)
 {
   auto filepath = temp_env->get_temp_dir() + "SkiprowsNrows.csv";
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/csv.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/csv.pxd
index 754dd37d53f..b5ff6558cd8 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/csv.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/csv.pxd
@@ -50,6 +50,7 @@ cdef extern from "cudf/io/csv.hpp" \
         cudf_io_types.quote_style get_quoting() except +
         char get_quotechar() except +
         bool is_enabled_doublequote() except +
+        bool is_enabled_updated_quotes_detection() except +
         vector[string] get_parse_dates_names() except +
         vector[int] get_parse_dates_indexes() except +
         vector[string] get_parse_hex_names() except +
@@ -95,6 +96,7 @@ cdef extern from "cudf/io/csv.hpp" \
         void set_quoting(cudf_io_types.quote_style style) except +
         void set_quotechar(char val) except +
         void set_doublequote(bool val) except +
+        void set_detect_whitespace_around_quotes(bool val) except +
         void set_parse_dates(vector[string]) except +
         void set_parse_dates(vector[int]) except +
         void set_parse_hex(vector[string]) except +
@@ -163,6 +165,7 @@ cdef extern from "cudf/io/csv.hpp" \
         ) except +
         csv_reader_options_builder& quotechar(char val) except +
         csv_reader_options_builder& doublequote(bool val) except +
+        csv_reader_options_builder& detect_whitespace_around_quotes(bool val) except +
         csv_reader_options_builder& parse_dates(vector[string]) except +
         csv_reader_options_builder& parse_dates(vector[int]) except +
 

From f6cca5086c5eaeff7971813a3ca557a1708f4225 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 22 May 2024 10:47:24 -1000
Subject: [PATCH 253/842] Push some as_column arrow logic to
 ColumnBase.from_arrow (#15738)

`as_column` and `ColumnBase.from_arrow` have similar checks for handling `pa.Array` objects so consolidating them to
`ColumnBase.from_arrow` as `as_column` calls to that eventually.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15738
---
 python/cudf/cudf/core/column/column.py | 76 ++++++++------------------
 1 file changed, 23 insertions(+), 53 deletions(-)

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 1785eb834b2..59bae179497 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -333,16 +333,27 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase:
         """
         if not isinstance(array, (pa.Array, pa.ChunkedArray)):
             raise TypeError("array should be PyArrow array or chunked array")
-
-        data = pa.table([array], [None])
-
-        if (
-            isinstance(array.type, pa.TimestampType)
-            and array.type.tz is not None
-        ):
+        elif pa.types.is_float16(array.type):
+            raise NotImplementedError(
+                "Type casting from `float16` to `float32` is not "
+                "yet supported in pyarrow, see: "
+                "https://github.com/apache/arrow/issues/20213"
+            )
+        elif pa.types.is_timestamp(array.type) and array.type.tz is not None:
             raise NotImplementedError(
                 "cuDF does not yet support timezone-aware datetimes"
             )
+        elif isinstance(array.type, ArrowIntervalType):
+            return cudf.core.column.IntervalColumn.from_arrow(array)
+        elif pa.types.is_large_string(array.type):
+            # Pandas-2.2+: Pandas defaults to `large_string` type
+            # instead of `string` without data-introspection.
+            # Temporary workaround until cudf has native
+            # support for `LARGE_STRING` i.e., 64 bit offsets
+            array = array.cast(pa.string())
+
+        data = pa.table([array], [None])
+
         if isinstance(array.type, pa.DictionaryType):
             indices_table = pa.table(
                 {
@@ -371,8 +382,6 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase:
                 size=codes.size,
                 ordered=array.type.ordered,
             )
-        elif isinstance(array.type, ArrowIntervalType):
-            return cudf.core.column.IntervalColumn.from_arrow(array)
 
         result = libcudf.interop.from_arrow(data)[0]
 
@@ -1809,27 +1818,7 @@ def as_column(
         return col
 
     elif isinstance(arbitrary, (pa.Array, pa.ChunkedArray)):
-        if pa.types.is_large_string(arbitrary.type):
-            # Pandas-2.2+: Pandas defaults to `large_string` type
-            # instead of `string` without data-introspection.
-            # Temporary workaround until cudf has native
-            # support for `LARGE_STRING` i.e., 64 bit offsets
-            arbitrary = arbitrary.cast(pa.string())
-
-        if pa.types.is_float16(arbitrary.type):
-            raise NotImplementedError(
-                "Type casting from `float16` to `float32` is not "
-                "yet supported in pyarrow, see: "
-                "https://github.com/apache/arrow/issues/20213"
-            )
-        elif (
-            pa.types.is_timestamp(arbitrary.type)
-            and arbitrary.type.tz is not None
-        ):
-            raise NotImplementedError(
-                "cuDF does not yet support timezone-aware datetimes"
-            )
-        elif (nan_as_null is None or nan_as_null) and pa.types.is_floating(
+        if (nan_as_null is None or nan_as_null) and pa.types.is_floating(
             arbitrary.type
         ):
             arbitrary = pc.if_else(
@@ -1837,31 +1826,12 @@ def as_column(
                 pa.nulls(len(arbitrary), type=arbitrary.type),
                 arbitrary,
             )
+        elif dtype is None and pa.types.is_null(arbitrary.type):
+            # default "empty" type
+            dtype = "str"
         col = ColumnBase.from_arrow(arbitrary)
 
-        if isinstance(arbitrary, pa.NullArray):
-            if dtype is not None:
-                # Cast the column to the `dtype` if specified.
-                new_dtype = dtype
-            elif len(arbitrary) == 0:
-                # If the column is empty, it has to be
-                # a `str` dtype.
-                new_dtype = cudf.dtype("str")
-            else:
-                # If the null column is not empty, it has to
-                # be of `object` dtype.
-                new_dtype = cudf.dtype(arbitrary.type.to_pandas_dtype())
-
-            if cudf.get_option(
-                "mode.pandas_compatible"
-            ) and new_dtype == cudf.dtype("O"):
-                # We internally raise if we do `astype("object")`, hence
-                # need to cast to `str` since this is safe to do so because
-                # it is a null-array.
-                new_dtype = "str"
-
-            col = col.astype(new_dtype)
-        elif dtype is not None:
+        if dtype is not None:
             col = col.astype(dtype)
 
         return col

From 1710e11c3ae9dd072305ca49e12e10d0f2e3aec0 Mon Sep 17 00:00:00 2001
From: Alessandro Bellina <abellina@nvidia.com>
Date: Thu, 23 May 2024 08:59:55 -0500
Subject: [PATCH 254/842] Return boolean from config_host_memory_resource
 instead of throwing (#15815)

Closes https://github.com/rapidsai/cudf/issues/15814

This adds a boolean return value from `cudf::io::config_host_memory_resource` to allow the caller to handle the case where the memory resource has already been configured in the past. Before this the function would throw, forcing callers to try/catch.

Authors:
  - Alessandro Bellina (https://github.com/abellina)
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - https://github.com/nvdbaranec
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/15815
---
 cpp/include/cudf/io/memory_resource.hpp       |  4 +++-
 cpp/src/io/utilities/config_utils.cpp         | 20 ++++++++++++-------
 .../java/ai/rapids/cudf/PinnedMemoryPool.java |  7 +++++--
 java/src/main/java/ai/rapids/cudf/Rmm.java    |  5 ++++-
 java/src/main/native/src/RmmJni.cpp           | 11 +++++-----
 5 files changed, 30 insertions(+), 17 deletions(-)

diff --git a/cpp/include/cudf/io/memory_resource.hpp b/cpp/include/cudf/io/memory_resource.hpp
index e31ebce4b1f..a36e220ae7b 100644
--- a/cpp/include/cudf/io/memory_resource.hpp
+++ b/cpp/include/cudf/io/memory_resource.hpp
@@ -57,7 +57,9 @@ struct host_mr_options {
  * @throws cudf::logic_error if called after the default host memory resource has been created
  *
  * @param opts Options to configure the default host memory resource
+ * @return True if this call successfully configured the host memory resource, false if a
+ * a resource was already configured.
  */
-void config_default_host_memory_resource(host_mr_options const& opts);
+bool config_default_host_memory_resource(host_mr_options const& opts);
 
 }  // namespace cudf::io
diff --git a/cpp/src/io/utilities/config_utils.cpp b/cpp/src/io/utilities/config_utils.cpp
index 7720c073a97..dad1135e766 100644
--- a/cpp/src/io/utilities/config_utils.cpp
+++ b/cpp/src/io/utilities/config_utils.cpp
@@ -244,16 +244,20 @@ CUDF_EXPORT std::mutex& host_mr_mutex()
 }
 
 // Must be called with the host_mr_mutex mutex held
-CUDF_EXPORT rmm::host_async_resource_ref& make_host_mr(std::optional<host_mr_options> const& opts)
+CUDF_EXPORT rmm::host_async_resource_ref& make_host_mr(std::optional<host_mr_options> const& opts,
+                                                       bool* did_configure = nullptr)
 {
   static rmm::host_async_resource_ref* mr_ref = nullptr;
+  bool configured                             = false;
   if (mr_ref == nullptr) {
-    mr_ref = &make_default_pinned_mr(opts ? opts->pool_size : std::nullopt);
-  } else {
-    // Throw an error if the user tries to reconfigure the default host resource
-    CUDF_EXPECTS(opts == std::nullopt, "The default host memory resource has already been created");
+    configured = true;
+    mr_ref     = &make_default_pinned_mr(opts ? opts->pool_size : std::nullopt);
   }
 
+  // If the user passed an out param to detect whether this call configured a resource
+  // set the result
+  if (did_configure != nullptr) { *did_configure = configured; }
+
   return *mr_ref;
 }
 
@@ -278,10 +282,12 @@ rmm::host_async_resource_ref get_host_memory_resource()
   return host_mr();
 }
 
-void config_default_host_memory_resource(host_mr_options const& opts)
+bool config_default_host_memory_resource(host_mr_options const& opts)
 {
   std::scoped_lock lock{host_mr_mutex()};
-  make_host_mr(opts);
+  auto did_configure = false;
+  make_host_mr(opts, &did_configure);
+  return did_configure;
 }
 
 }  // namespace cudf::io
diff --git a/java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java b/java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java
index 9038700cb30..83b801db7fb 100644
--- a/java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java
+++ b/java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java
@@ -260,9 +260,12 @@ private synchronized void free(long address, long size) {
    *
    * @param size initial and maximum size for the cuDF default pinned pool.
    *        Pass size=0 to disable the default pool.
+   *
+   * @return true if we were able to setup the default resource, false if there was
+   *         a resource already set.
    */
-  public static synchronized void configureDefaultCudfPinnedPoolSize(long size) {
-    Rmm.configureDefaultCudfPinnedPoolSize(size);
+  public static synchronized boolean configureDefaultCudfPinnedPoolSize(long size) {
+    return Rmm.configureDefaultCudfPinnedPoolSize(size);
   }
 
 }
diff --git a/java/src/main/java/ai/rapids/cudf/Rmm.java b/java/src/main/java/ai/rapids/cudf/Rmm.java
index fdbdfdfff6f..4dee1b7aa24 100755
--- a/java/src/main/java/ai/rapids/cudf/Rmm.java
+++ b/java/src/main/java/ai/rapids/cudf/Rmm.java
@@ -273,8 +273,11 @@ public static synchronized void initialize(int allocationMode, LogConf logConf,
    *
    * @param size initial and maximum size for the cuDF default pinned pool.
    *        Pass size=0 to disable the default pool.
+   *
+   * @return true if we were able to setup the default resource, false if there was
+   *         a resource already set.
    */
-  public static synchronized native void configureDefaultCudfPinnedPoolSize(long size);
+  public static synchronized native boolean configureDefaultCudfPinnedPoolSize(long size);
 
   /**
    * Get the most recently set pool size or -1 if RMM has not been initialized or pooling is
diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp
index 9c015fee409..fa78f6ca4e2 100644
--- a/java/src/main/native/src/RmmJni.cpp
+++ b/java/src/main/native/src/RmmJni.cpp
@@ -1035,7 +1035,6 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_setCuioPinnedPoolMemoryResource(J
     // create a pinned fallback pool that will allocate pinned memory
     // if the regular pinned pool is exhausted
     pinned_fallback_mr.reset(new pinned_fallback_host_memory_resource(pool));
-    // set the cuio host mr and store the prior resource in our static variable
     prior_cuio_host_mr() = cudf::io::set_host_memory_resource(*pinned_fallback_mr);
   }
   CATCH_STD(env, )
@@ -1107,14 +1106,14 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_freeFromFallbackPinnedPool(JNIEnv
   CATCH_STD(env, )
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_configureDefaultCudfPinnedPoolSize(JNIEnv* env,
-                                                                                  jclass clazz,
-                                                                                  jlong size)
+JNIEXPORT jboolean JNICALL Java_ai_rapids_cudf_Rmm_configureDefaultCudfPinnedPoolSize(JNIEnv* env,
+                                                                                      jclass clazz,
+                                                                                      jlong size)
 {
   try {
     cudf::jni::auto_set_device(env);
-    cudf::io::config_default_host_memory_resource(cudf::io::host_mr_options{size});
+    return cudf::io::config_default_host_memory_resource(cudf::io::host_mr_options{size});
   }
-  CATCH_STD(env, )
+  CATCH_STD(env, false)
 }
 }

From 9d8e43ef6ad75f6babc08fea88642ea006822e04 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Thu, 23 May 2024 11:41:49 -0400
Subject: [PATCH 255/842] Remove legacy JSON reader and
 concurrent_unordered_map.cuh. (#15813)

This completes the final two steps and closes https://github.com/rapidsai/cudf/issues/15537. Also addresses one step of https://github.com/rapidsai/cudf/issues/12261.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)
  - David Wendt (https://github.com/davidwendt)
  - Shruti Shivakumar (https://github.com/shrshi)
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/15813
---
 cpp/CMakeLists.txt                            |   2 -
 cpp/include/cudf/io/json.hpp                  |  32 -
 cpp/src/groupby/hash/groupby.cu               |   1 -
 cpp/src/hash/concurrent_unordered_map.cuh     | 557 ---------------
 cpp/src/hash/managed.cuh                      |  41 --
 cpp/src/io/json/legacy/json_gpu.cu            | 615 ----------------
 cpp/src/io/json/legacy/json_gpu.hpp           |  99 ---
 cpp/src/io/json/legacy/read_json.hpp          |  38 -
 cpp/src/io/json/legacy/reader_impl.cu         | 667 ------------------
 cpp/src/io/json/read_json.cu                  |   9 -
 cpp/tests/CMakeLists.txt                      |   4 -
 cpp/tests/hash_map/map_test.cu                | 217 ------
 cpp/tests/io/json_test.cpp                    |  49 +-
 cpp/tests/io/nested_json_test.cpp             |   2 +-
 python/cudf/cudf/_lib/json.pyx                |   2 -
 .../cudf/_lib/pylibcudf/libcudf/io/json.pxd   |   3 -
 python/cudf/cudf/io/json.py                   |   1 -
 17 files changed, 8 insertions(+), 2331 deletions(-)
 delete mode 100644 cpp/src/hash/concurrent_unordered_map.cuh
 delete mode 100644 cpp/src/hash/managed.cuh
 delete mode 100644 cpp/src/io/json/legacy/json_gpu.cu
 delete mode 100644 cpp/src/io/json/legacy/json_gpu.hpp
 delete mode 100644 cpp/src/io/json/legacy/read_json.hpp
 delete mode 100644 cpp/src/io/json/legacy/reader_impl.cu
 delete mode 100644 cpp/tests/hash_map/map_test.cu

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 7390c465ccb..228d21ddccb 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -390,8 +390,6 @@ add_library(
   src/io/json/json_tree.cu
   src/io/json/nested_json_gpu.cu
   src/io/json/read_json.cu
-  src/io/json/legacy/json_gpu.cu
-  src/io/json/legacy/reader_impl.cu
   src/io/json/parser_features.cpp
   src/io/json/write_json.cu
   src/io/orc/aggregate_orc_metadata.cpp
diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
index aa4bee4fb5e..65ba8f25577 100644
--- a/cpp/include/cudf/io/json.hpp
+++ b/cpp/include/cudf/io/json.hpp
@@ -270,15 +270,6 @@ class json_reader_options {
    */
   bool is_enabled_dayfirst() const { return _dayfirst; }
 
-  /**
-   * @brief Whether the legacy reader should be used.
-   *
-   * @deprecated Since 24.06
-   *
-   * @returns true if the legacy reader will be used, false otherwise
-   */
-  [[deprecated]] bool is_enabled_legacy() const { return _legacy; }
-
   /**
    * @brief Whether the reader should keep quotes of string values.
    *
@@ -406,15 +397,6 @@ class json_reader_options {
    */
   void enable_dayfirst(bool val) { _dayfirst = val; }
 
-  /**
-   * @brief Set whether to use the legacy reader.
-   *
-   * @deprecated Since 24.06
-   *
-   * @param val Boolean value to enable/disable the legacy reader
-   */
-  [[deprecated]] void enable_legacy(bool val) { _legacy = val; }
-
   /**
    * @brief Set whether the reader should keep quotes of string values.
    *
@@ -605,20 +587,6 @@ class json_reader_options_builder {
     return *this;
   }
 
-  /**
-   * @brief Set whether to use the legacy reader.
-   *
-   * @deprecated Since 24.06
-   *
-   * @param val Boolean value to enable/disable legacy parsing
-   * @return this for chaining
-   */
-  [[deprecated]] json_reader_options_builder& legacy(bool val)
-  {
-    options._legacy = val;
-    return *this;
-  }
-
   /**
    * @brief Set whether the reader should keep quotes of string values.
    *
diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu
index 4f75ab19c66..0ec293ae3f0 100644
--- a/cpp/src/groupby/hash/groupby.cu
+++ b/cpp/src/groupby/hash/groupby.cu
@@ -16,7 +16,6 @@
 
 #include "groupby/common/utils.hpp"
 #include "groupby/hash/groupby_kernels.cuh"
-#include "hash/concurrent_unordered_map.cuh"
 
 #include <cudf/aggregation.hpp>
 #include <cudf/column/column.hpp>
diff --git a/cpp/src/hash/concurrent_unordered_map.cuh b/cpp/src/hash/concurrent_unordered_map.cuh
deleted file mode 100644
index a010a462de3..00000000000
--- a/cpp/src/hash/concurrent_unordered_map.cuh
+++ /dev/null
@@ -1,557 +0,0 @@
-/*
- * Copyright (c) 2017-2024, NVIDIA CORPORATION.  All rights reserved.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "hash/managed.cuh"
-
-#include <cudf/detail/nvtx/ranges.hpp>
-#include <cudf/hashing/detail/default_hash.cuh>
-#include <cudf/hashing/detail/helper_functions.cuh>
-#include <cudf/utilities/default_stream.hpp>
-#include <cudf/utilities/error.hpp>
-
-#include <rmm/cuda_stream_view.hpp>
-#include <rmm/mr/device/polymorphic_allocator.hpp>
-
-#include <cuda/atomic>
-#include <thrust/pair.h>
-
-#include <iostream>
-#include <iterator>
-#include <limits>
-#include <type_traits>
-
-namespace {
-template <std::size_t N>
-struct packed {
-  using type = void;
-};
-template <>
-struct packed<sizeof(uint64_t)> {
-  using type = uint64_t;
-};
-template <>
-struct packed<sizeof(uint32_t)> {
-  using type = uint32_t;
-};
-template <typename pair_type>
-using packed_t = typename packed<sizeof(pair_type)>::type;
-
-/**
- * @brief Indicates if a pair type can be packed.
- *
- * When the size of the key,value pair being inserted into the hash table is
- * equal in size to a type where atomicCAS is natively supported, it is more
- * efficient to "pack" the pair and insert it with a single atomicCAS.
- *
- * Only integral key and value types may be packed because we use
- * bitwise equality comparison, which may not be valid for non-integral
- * types.
- *
- * Also, the `pair_type` must not contain any padding bits otherwise
- * accessing the packed value would be undefined.
- *
- * @tparam pair_type The pair type that will be packed
- * @return true If the pair type can be packed
- * @return false  If the pair type cannot be packed
- */
-template <typename pair_type,
-          typename key_type   = typename pair_type::first_type,
-          typename value_type = typename pair_type::second_type>
-constexpr bool is_packable()
-{
-  return std::is_integral_v<key_type> and std::is_integral_v<value_type> and
-         not std::is_void_v<packed_t<pair_type>> and
-         std::has_unique_object_representations_v<pair_type>;
-}
-
-/**
- * @brief Allows viewing a pair in a packed representation
- *
- * Used as an optimization for inserting when a pair can be inserted with a
- * single atomicCAS
- */
-template <typename pair_type, typename Enable = void>
-union pair_packer;
-
-template <typename pair_type>
-union pair_packer<pair_type, std::enable_if_t<is_packable<pair_type>()>> {
-  using packed_type = packed_t<pair_type>;
-  packed_type packed;
-  pair_type pair;
-
-  __device__ pair_packer(pair_type _pair) : pair{_pair} {}
-
-  __device__ pair_packer(packed_type _packed) : packed{_packed} {}
-};
-}  // namespace
-
-/**
- * Supports concurrent insert, but not concurrent insert and find.
- *
- * @note The user is responsible for the following stream semantics:
- * - Either the same stream should be used to create the map as is used by the kernels that access
- * it, or
- * - the stream used to create the map should be synchronized before it is accessed from a different
- * stream or from host code.
- *
- * TODO:
- *  - add constructor that takes pointer to hash_table to avoid allocations
- */
-template <typename Key,
-          typename Element,
-          typename Hasher    = cudf::hashing::detail::default_hash<Key>,
-          typename Equality  = equal_to<Key>,
-          typename Allocator = rmm::mr::polymorphic_allocator<thrust::pair<Key, Element>>>
-class concurrent_unordered_map {
- public:
-  using size_type      = size_t;
-  using hasher         = Hasher;
-  using key_equal      = Equality;
-  using allocator_type = Allocator;
-  using key_type       = Key;
-  using mapped_type    = Element;
-  using value_type     = thrust::pair<Key, Element>;
-  using iterator       = cycle_iterator_adapter<value_type*>;
-  using const_iterator = cycle_iterator_adapter<value_type*> const;
-
- public:
-  /**
-   * @brief Factory to construct a new concurrent unordered map.
-   *
-   * Returns a `std::unique_ptr` to a new concurrent unordered map object. The
-   * map is non-owning and trivially copyable and should be passed by value into
-   * kernels. The `unique_ptr` contains a custom deleter that will free the
-   * map's contents.
-   *
-   * @note The implementation of this unordered_map uses sentinel values to
-   * indicate an entry in the hash table that is empty, i.e., if a hash bucket
-   * is empty, the pair residing there will be equal to (unused_key,
-   * unused_element). As a result, attempting to insert a key equal to
-   *`unused_key` results in undefined behavior.
-   *
-   * @note All allocations, kernels and copies in the constructor take place
-   * on stream but the constructor does not synchronize the stream. It is the user's
-   * responsibility to synchronize or use the same stream to access the map.
-   *
-   * @param capacity The maximum number of pairs the map may hold
-   * @param stream CUDA stream used for device memory operations and kernel launches.
-   * @param unused_element The sentinel value to use for an empty value
-   * @param unused_key The sentinel value to use for an empty key
-   * @param hash_function The hash function to use for hashing keys
-   * @param equal The equality comparison function for comparing if two keys are
-   * equal
-   * @param allocator The allocator to use for allocation the hash table's
-   * storage
-   */
-  static auto create(size_type capacity,
-                     rmm::cuda_stream_view stream,
-                     mapped_type const unused_element = std::numeric_limits<mapped_type>::max(),
-                     key_type const unused_key        = std::numeric_limits<key_type>::max(),
-                     Hasher const& hash_function      = hasher(),
-                     Equality const& equal            = key_equal(),
-                     allocator_type const& allocator  = allocator_type())
-  {
-    CUDF_FUNC_RANGE();
-    using Self = concurrent_unordered_map<Key, Element, Hasher, Equality, Allocator>;
-
-    // Note: need `(*p).destroy` instead of `p->destroy` here
-    // due to compiler bug: https://github.com/rapidsai/cudf/pull/5692
-    auto deleter = [stream](Self* p) { (*p).destroy(stream); };
-
-    return std::unique_ptr<Self, std::function<void(Self*)>>{
-      new Self(capacity, unused_element, unused_key, hash_function, equal, allocator, stream),
-      deleter};
-  }
-
-  /**
-   * @brief Returns an iterator to the first element in the map
-   *
-   * @note `__device__` code that calls this function should either run in the
-   * same stream as `create()`, or the accessing stream either be running on the
-   * same stream as create(), or the accessing stream should be appropriately
-   * synchronized with the creating stream.
-   *
-   * @returns iterator to the first element in the map.
-   */
-  __device__ iterator begin()
-  {
-    return iterator(m_hashtbl_values, m_hashtbl_values + m_capacity, m_hashtbl_values);
-  }
-
-  /**
-   * @brief Returns a constant iterator to the first element in the map
-   *
-   * @note `__device__` code that calls this function should either run in the
-   * same stream as `create()`, or the accessing stream either be running on the
-   * same stream as create(), or the accessing stream should be appropriately
-   * synchronized with the creating stream.
-   *
-   * @returns constant iterator to the first element in the map.
-   */
-  __device__ const_iterator begin() const
-  {
-    return const_iterator(m_hashtbl_values, m_hashtbl_values + m_capacity, m_hashtbl_values);
-  }
-
-  /**
-   * @brief Returns an iterator to the one past the last element in the map
-   *
-   * @note `__device__` code that calls this function should either run in the
-   * same stream as `create()`, or the accessing stream either be running on the
-   * same stream as create(), or the accessing stream should be appropriately
-   * synchronized with the creating stream.
-   *
-   * @returns iterator to the one past the last element in the map.
-   */
-  __device__ iterator end()
-  {
-    return iterator(m_hashtbl_values, m_hashtbl_values + m_capacity, m_hashtbl_values + m_capacity);
-  }
-
-  /**
-   * @brief Returns a constant iterator to the one past the last element in the map
-   *
-   * @note When called in a device code, user should make sure that it should
-   * either be running on the same stream as create(), or the accessing stream
-   * should be appropriately synchronized with the creating stream.
-   *
-   * @returns constant iterator to the one past the last element in the map.
-   */
-  __device__ const_iterator end() const
-  {
-    return const_iterator(
-      m_hashtbl_values, m_hashtbl_values + m_capacity, m_hashtbl_values + m_capacity);
-  }
-  __host__ __device__ value_type* data() const { return m_hashtbl_values; }
-
-  __host__ __device__ key_type get_unused_key() const { return m_unused_key; }
-
-  __host__ __device__ mapped_type get_unused_element() const { return m_unused_element; }
-
-  [[nodiscard]] __host__ __device__ size_type capacity() const { return m_capacity; }
-
- private:
-  /**
-   * @brief Enumeration of the possible results of attempting to insert into
-   *a hash bucket
-   */
-  enum class insert_result {
-    CONTINUE,  ///< Insert did not succeed, continue trying to insert
-               ///< (collision)
-    SUCCESS,   ///< New pair inserted successfully
-    DUPLICATE  ///< Insert did not succeed, key is already present
-  };
-
-  /**
-   * @brief Specialization for value types that can be packed.
-   *
-   * When the size of the key,value pair being inserted is equal in size to
-   *a type where atomicCAS is natively supported, this optimization path
-   *will insert the pair in a single atomicCAS operation.
-   */
-  template <typename pair_type = value_type>
-  __device__ std::enable_if_t<is_packable<pair_type>(), insert_result> attempt_insert(
-    value_type* const __restrict__ insert_location, value_type const& insert_pair)
-  {
-    pair_packer<pair_type> expected{thrust::make_pair(m_unused_key, m_unused_element)};
-    pair_packer<pair_type> desired{insert_pair};
-
-    using packed_type = typename pair_packer<pair_type>::packed_type;
-
-    auto* insert_ptr = reinterpret_cast<packed_type*>(insert_location);
-    cuda::atomic_ref<packed_type, cuda::thread_scope_device> ref{*insert_ptr};
-    auto const success =
-      ref.compare_exchange_strong(expected.packed, desired.packed, cuda::std::memory_order_relaxed);
-
-    if (success) {
-      return insert_result::SUCCESS;
-    } else if (m_equal(expected.pair.first, insert_pair.first)) {
-      return insert_result::DUPLICATE;
-    }
-    return insert_result::CONTINUE;
-  }
-
-  /**
-   * @brief Attempts to insert a key,value pair at the specified hash bucket.
-   *
-   * @param[in] insert_location Pointer to hash bucket to attempt insert
-   * @param[in] insert_pair The pair to insert
-   * @return Enum indicating result of insert attempt.
-   */
-  template <typename pair_type = value_type>
-  __device__ std::enable_if_t<not is_packable<pair_type>(), insert_result> attempt_insert(
-    value_type* const __restrict__ insert_location, value_type const& insert_pair)
-  {
-    auto expected = m_unused_key;
-    cuda::atomic_ref<key_type, cuda::thread_scope_device> ref{insert_location->first};
-    auto const key_success =
-      ref.compare_exchange_strong(expected, insert_pair.first, cuda::std::memory_order_relaxed);
-
-    // Hash bucket empty
-    if (key_success) {
-      insert_location->second = insert_pair.second;
-      return insert_result::SUCCESS;
-    }
-    // Key already exists
-    else if (m_equal(expected, insert_pair.first)) {
-      return insert_result::DUPLICATE;
-    }
-
-    return insert_result::CONTINUE;
-  }
-
- public:
-  /**
-   * @brief Attempts to insert a key, value pair into the map.
-   *
-   * Returns an iterator, boolean pair.
-   *
-   * If the new key already present in the map, the iterator points to
-   * the location of the existing key and the boolean is `false` indicating
-   * that the insert did not succeed.
-   *
-   * If the new key was not present, the iterator points to the location
-   * where the insert occurred and the boolean is `true` indicating that the
-   *insert succeeded.
-   *
-   * @param insert_pair The key and value pair to insert
-   * @return Iterator, Boolean pair. Iterator is to the location of the
-   *newly inserted pair, or the existing pair that prevented the insert.
-   *Boolean indicates insert success.
-   */
-  __device__ thrust::pair<iterator, bool> insert(value_type const& insert_pair)
-  {
-    size_type const key_hash{m_hf(insert_pair.first)};
-    size_type index{key_hash % m_capacity};
-
-    insert_result status{insert_result::CONTINUE};
-
-    value_type* current_bucket{nullptr};
-
-    while (status == insert_result::CONTINUE) {
-      current_bucket = &m_hashtbl_values[index];
-      status         = attempt_insert(current_bucket, insert_pair);
-      index          = (index + 1) % m_capacity;
-    }
-
-    bool const insert_success = status == insert_result::SUCCESS;
-
-    return thrust::make_pair(
-      iterator(m_hashtbl_values, m_hashtbl_values + m_capacity, current_bucket), insert_success);
-  }
-
-  /**
-   * @brief Searches the map for the specified key.
-   *
-   * @note `find` is not threadsafe with `insert`. I.e., it is not safe to
-   *do concurrent `insert` and `find` operations.
-   *
-   * @param k The key to search for
-   * @return An iterator to the key if it exists, else map.end()
-   */
-  __device__ const_iterator find(key_type const& k) const
-  {
-    size_type const key_hash = m_hf(k);
-    size_type index          = key_hash % m_capacity;
-
-    value_type* current_bucket = &m_hashtbl_values[index];
-
-    while (true) {
-      key_type const existing_key = current_bucket->first;
-
-      if (m_unused_key == existing_key) { return this->end(); }
-
-      if (m_equal(k, existing_key)) {
-        return const_iterator(m_hashtbl_values, m_hashtbl_values + m_capacity, current_bucket);
-      }
-
-      index          = (index + 1) % m_capacity;
-      current_bucket = &m_hashtbl_values[index];
-    }
-  }
-
-  /**
-   * @brief Searches the map for the specified key.
-   *
-   * This version of the find function specifies a hashing function and an
-   * equality comparison.  This allows the caller to use different functions
-   * for insert and find (for example, when you want to insert keys from
-   * one table and use find to match keys from a different table with the
-   * keys from the first table).
-   *
-   * @note `find` is not threadsafe with `insert`. I.e., it is not safe to
-   * do concurrent `insert` and `find` operations.
-   *
-   * @tparam find_hasher     Type of hashing function
-   * @tparam find_key_equal  Type of equality comparison
-   *
-   * @param k         The key to search for
-   * @param f_hash    The hashing function to use to hash this key
-   * @param f_equal   The equality function to use to compare this key with the
-   *                  contents of the hash table
-   * @return An iterator to the key if it exists, else map.end()
-   */
-  template <typename find_hasher, typename find_key_equal>
-  __device__ const_iterator find(key_type const& k,
-                                 find_hasher f_hash,
-                                 find_key_equal f_equal) const
-  {
-    size_type const key_hash = f_hash(k);
-    size_type index          = key_hash % m_capacity;
-
-    value_type* current_bucket = &m_hashtbl_values[index];
-
-    while (true) {
-      key_type const existing_key = current_bucket->first;
-
-      if (m_unused_key == existing_key) { return this->end(); }
-
-      if (f_equal(k, existing_key)) {
-        return const_iterator(m_hashtbl_values, m_hashtbl_values + m_capacity, current_bucket);
-      }
-
-      index          = (index + 1) % m_capacity;
-      current_bucket = &m_hashtbl_values[index];
-    }
-  }
-
-  void assign_async(concurrent_unordered_map const& other, rmm::cuda_stream_view stream)
-  {
-    if (other.m_capacity <= m_capacity) {
-      m_capacity = other.m_capacity;
-    } else {
-      m_allocator.deallocate(m_hashtbl_values, m_capacity, stream);
-      m_capacity = other.m_capacity;
-      m_capacity = other.m_capacity;
-
-      m_hashtbl_values = m_allocator.allocate(m_capacity, stream);
-    }
-    CUDF_CUDA_TRY(cudaMemcpyAsync(m_hashtbl_values,
-                                  other.m_hashtbl_values,
-                                  m_capacity * sizeof(value_type),
-                                  cudaMemcpyDefault,
-                                  stream.value()));
-  }
-
-  void clear_async(rmm::cuda_stream_view stream)
-  {
-    constexpr int block_size = 128;
-    init_hashtbl<<<((m_capacity - 1) / block_size) + 1, block_size, 0, stream.value()>>>(
-      m_hashtbl_values, m_capacity, m_unused_key, m_unused_element);
-  }
-
-  void print()
-  {
-    for (size_type i = 0; i < m_capacity; ++i) {
-      std::cout << i << ": " << m_hashtbl_values[i].first << "," << m_hashtbl_values[i].second
-                << std::endl;
-    }
-  }
-
-  void prefetch(int const dev_id, rmm::cuda_stream_view stream)
-  {
-    cudaPointerAttributes hashtbl_values_ptr_attributes;
-    cudaError_t status = cudaPointerGetAttributes(&hashtbl_values_ptr_attributes, m_hashtbl_values);
-
-    if (cudaSuccess == status && isPtrManaged(hashtbl_values_ptr_attributes)) {
-      CUDF_CUDA_TRY(cudaMemPrefetchAsync(
-        m_hashtbl_values, m_capacity * sizeof(value_type), dev_id, stream.value()));
-    }
-    CUDF_CUDA_TRY(cudaMemPrefetchAsync(this, sizeof(*this), dev_id, stream.value()));
-  }
-
-  /**
-   * @brief Frees the contents of the map and destroys the map object.
-   *
-   * This function is invoked as the deleter of the `std::unique_ptr` returned
-   * from the `create()` factory function.
-   *
-   * @param stream CUDA stream used for device memory operations and kernel launches.
-   */
-  void destroy(rmm::cuda_stream_view stream)
-  {
-    m_allocator.deallocate(m_hashtbl_values, m_capacity, stream);
-    delete this;
-  }
-
-  concurrent_unordered_map()                                           = delete;
-  concurrent_unordered_map(concurrent_unordered_map const&)            = default;
-  concurrent_unordered_map(concurrent_unordered_map&&)                 = default;
-  concurrent_unordered_map& operator=(concurrent_unordered_map const&) = default;
-  concurrent_unordered_map& operator=(concurrent_unordered_map&&)      = default;
-  ~concurrent_unordered_map()                                          = default;
-
- private:
-  hasher m_hf;
-  key_equal m_equal;
-  mapped_type m_unused_element;
-  key_type m_unused_key;
-  allocator_type m_allocator;
-  size_type m_capacity;
-  value_type* m_hashtbl_values;
-
-  /**
-   * @brief Private constructor used by `create` factory function.
-   *
-   * @param capacity The desired m_capacity of the hash table
-   * @param unused_element The sentinel value to use for an empty value
-   * @param unused_key The sentinel value to use for an empty key
-   * @param hash_function The hash function to use for hashing keys
-   * @param equal The equality comparison function for comparing if two keys
-   *are equal
-   * @param allocator The allocator to use for allocation the hash table's
-   * storage
-   * @param stream CUDA stream used for device memory operations and kernel launches.
-   */
-  concurrent_unordered_map(size_type capacity,
-                           mapped_type const unused_element,
-                           key_type const unused_key,
-                           Hasher const& hash_function,
-                           Equality const& equal,
-                           allocator_type const& allocator,
-                           rmm::cuda_stream_view stream)
-    : m_hf(hash_function),
-      m_equal(equal),
-      m_allocator(allocator),
-      m_capacity(capacity),
-      m_unused_element(unused_element),
-      m_unused_key(unused_key)
-  {
-    m_hashtbl_values         = m_allocator.allocate(m_capacity, stream);
-    constexpr int block_size = 128;
-    {
-      cudaPointerAttributes hashtbl_values_ptr_attributes;
-      cudaError_t status =
-        cudaPointerGetAttributes(&hashtbl_values_ptr_attributes, m_hashtbl_values);
-
-      if (cudaSuccess == status && isPtrManaged(hashtbl_values_ptr_attributes)) {
-        int dev_id = 0;
-        CUDF_CUDA_TRY(cudaGetDevice(&dev_id));
-        CUDF_CUDA_TRY(cudaMemPrefetchAsync(
-          m_hashtbl_values, m_capacity * sizeof(value_type), dev_id, stream.value()));
-      }
-    }
-
-    if (m_capacity > 0) {
-      init_hashtbl<<<((m_capacity - 1) / block_size) + 1, block_size, 0, stream.value()>>>(
-        m_hashtbl_values, m_capacity, m_unused_key, m_unused_element);
-    }
-
-    CUDF_CHECK_CUDA(stream.value());
-  }
-};
diff --git a/cpp/src/hash/managed.cuh b/cpp/src/hash/managed.cuh
deleted file mode 100644
index 9797c83c47c..00000000000
--- a/cpp/src/hash/managed.cuh
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2017-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <cassert>
-#include <new>
-
-struct managed {
-  static void* operator new(size_t n)
-  {
-    void* ptr          = nullptr;
-    cudaError_t result = cudaMallocManaged(&ptr, n);
-    if (cudaSuccess != result || 0 == ptr) throw std::bad_alloc();
-    return ptr;
-  }
-
-  static void operator delete(void* ptr) noexcept
-  {
-    auto const free_result = cudaFree(ptr);
-    assert(free_result == cudaSuccess);
-  }
-};
-
-inline bool isPtrManaged(cudaPointerAttributes attr)
-{
-  return (attr.type == cudaMemoryTypeManaged);
-}
diff --git a/cpp/src/io/json/legacy/json_gpu.cu b/cpp/src/io/json/legacy/json_gpu.cu
deleted file mode 100644
index ff4845fcecb..00000000000
--- a/cpp/src/io/json/legacy/json_gpu.cu
+++ /dev/null
@@ -1,615 +0,0 @@
-/*
- * Copyright (c) 2020-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "io/utilities/column_type_histogram.hpp"
-#include "io/utilities/parsing_utils.cuh"
-#include "io/utilities/trie.cuh"
-#include "json_gpu.hpp"
-
-#include <cudf/detail/utilities/cuda.cuh>
-#include <cudf/detail/utilities/vector_factories.hpp>
-#include <cudf/hashing/detail/murmurhash3_x86_32.cuh>
-#include <cudf/types.hpp>
-#include <cudf/utilities/bit.hpp>
-#include <cudf/utilities/span.hpp>
-#include <cudf/utilities/traits.hpp>
-#include <cudf/utilities/type_dispatcher.hpp>
-
-#include <rmm/cuda_stream_view.hpp>
-#include <rmm/device_buffer.hpp>
-#include <rmm/device_uvector.hpp>
-#include <rmm/exec_policy.hpp>
-
-#include <thrust/advance.h>
-#include <thrust/detail/copy.h>
-#include <thrust/execution_policy.h>
-#include <thrust/find.h>
-#include <thrust/generate.h>
-#include <thrust/iterator/reverse_iterator.h>
-#include <thrust/mismatch.h>
-#include <thrust/optional.h>
-#include <thrust/pair.h>
-
-using cudf::device_span;
-using cudf::detail::grid_1d;
-
-namespace cudf::io::json::detail::legacy {
-
-namespace {
-/**
- * @brief CUDA Kernel that adjusts the row range to exclude the character outside of the top level
- * brackets.
- *
- * The top level brackets characters are excluded from the resulting range.
- *
- * @param[in] begin Pointer to the first character in the row
- * @param[in] end pointer to the first character after the row
- */
-__device__ std::pair<char const*, char const*> limit_range_to_brackets(char const* begin,
-                                                                       char const* end)
-{
-  auto const data_begin = thrust::next(thrust::find_if(
-    thrust::seq, begin, end, [] __device__(auto c) { return c == '[' || c == '{'; }));
-  auto const data_end   = thrust::next(thrust::find_if(thrust::seq,
-                                                     thrust::make_reverse_iterator(end),
-                                                     thrust::make_reverse_iterator(data_begin),
-                                                     [](auto c) { return c == ']' || c == '}'; }))
-                          .base();
-  return {data_begin, data_end};
-}
-
-/**
- * @brief Find the first JSON object key in the range.
- *
- * Assumes that begin is not in the middle of a field.
- *
- * @param[in] begin Pointer to the first character in the parsing range
- * @param[in] end pointer to the first character after the parsing range
- * @param[in] quotechar The character used to denote quotes
- *
- * @return Begin and end iterators of the key name; (`end`, `end`) if a key is not found
- */
-__device__ std::pair<char const*, char const*> get_next_key(char const* begin,
-                                                            char const* end,
-                                                            char quotechar)
-{
-  // Key starts after the first quote
-  auto const key_begin = thrust::find(thrust::seq, begin, end, quotechar) + 1;
-  if (key_begin > end) return {end, end};
-
-  // Key ends after the next unescaped quote
-  auto const key_end_pair = thrust::mismatch(
-    thrust::seq, key_begin, end - 1, key_begin + 1, [quotechar] __device__(auto prev_ch, auto ch) {
-      return !(ch == quotechar && prev_ch != '\\');
-    });
-
-  return {key_begin, key_end_pair.second};
-}
-
-/**
- * @brief Returns true is the input character is a valid digit.
- * Supports both decimal and hexadecimal digits (uppercase and lowercase).
- *
- * @param c Character to check
- * @param is_hex Whether to check as a hexadecimal
- *
- * @return `true` if it is digit-like, `false` otherwise
- */
-__device__ __inline__ bool is_digit(char c, bool is_hex = false)
-{
-  if (c >= '0' && c <= '9') return true;
-
-  if (is_hex) {
-    if (c >= 'A' && c <= 'F') return true;
-    if (c >= 'a' && c <= 'f') return true;
-  }
-
-  return false;
-}
-
-/**
- * @brief Returns true if the counters indicate a potentially valid float.
- * False positives are possible because positions are not taken into account.
- * For example, field "e.123-" would match the pattern.
- */
-__device__ __inline__ bool is_like_float(
-  long len, long digit_cnt, long decimal_cnt, long dash_cnt, long exponent_cnt)
-{
-  // Can't have more than one exponent and one decimal point
-  if (decimal_cnt > 1) return false;
-  if (exponent_cnt > 1) return false;
-  // Without the exponent or a decimal point, this is an integer, not a float
-  if (decimal_cnt == 0 && exponent_cnt == 0) return false;
-
-  // Can only have one '-' per component
-  if (dash_cnt > 1 + exponent_cnt) return false;
-
-  // If anything other than these characters is present, it's not a float
-  if (digit_cnt + decimal_cnt + dash_cnt + exponent_cnt != len) return false;
-
-  // Needs at least 1 digit, 2 if exponent is present
-  if (digit_cnt < 1 + exponent_cnt) return false;
-
-  return true;
-}
-
-/**
- * @brief Contains information on a JSON file field.
- */
-struct field_descriptor {
-  cudf::size_type column;
-  char const* value_begin;
-  char const* value_end;
-  bool is_quoted;
-};
-
-/**
- * @brief Parse the first field in the given range and return its descriptor.
- *
- * @param[in] begin Pointer to the first character in the parsing range
- * @param[in] end pointer to the first character after the parsing range
- * @param[in] opts The global parsing behavior options
- * @param[in] field_idx Index of the current field in the input row
- * @param[in] col_map Pointer to the (column name hash -> column index) map in device memory.
- * nullptr is passed when the input file does not consist of objects.
- * @return Descriptor of the parsed field
- */
-__device__ field_descriptor next_field_descriptor(char const* begin,
-                                                  char const* end,
-                                                  parse_options_view const& opts,
-                                                  cudf::size_type field_idx,
-                                                  col_map_type col_map)
-{
-  auto const desc_pre_trim =
-    col_map.capacity() == 0
-      // No key - column and begin are trivial
-      ? field_descriptor{field_idx,
-                         begin,
-                         cudf::io::gpu::seek_field_end(begin, end, opts, true),
-                         false}
-      : [&]() {
-          auto const key_range = get_next_key(begin, end, opts.quotechar);
-          auto const key_hash  = cudf::hashing::detail::MurmurHash3_x86_32<cudf::string_view>{}(
-            cudf::string_view(key_range.first, key_range.second - key_range.first));
-          auto const hash_col = col_map.find(key_hash);
-          // Fall back to field index if not found (parsing error)
-          auto const column = (hash_col != col_map.end()) ? (*hash_col).second : field_idx;
-
-          // Skip the colon between the key and the value
-          auto const value_begin = thrust::find(thrust::seq, key_range.second, end, ':') + 1;
-          return field_descriptor{column,
-                                  value_begin,
-                                  cudf::io::gpu::seek_field_end(value_begin, end, opts, true),
-                                  false};
-        }();
-
-  // Modify start & end to ignore whitespace and quotechars
-  auto const trimmed_value_range =
-    trim_whitespaces(desc_pre_trim.value_begin, desc_pre_trim.value_end);
-  bool const is_quoted =
-    thrust::distance(trimmed_value_range.first, trimmed_value_range.second) >= 2 and
-    *trimmed_value_range.first == opts.quotechar and
-    *thrust::prev(trimmed_value_range.second) == opts.quotechar;
-  return {desc_pre_trim.column,
-          trimmed_value_range.first + static_cast<std::ptrdiff_t>(is_quoted),
-          trimmed_value_range.second - static_cast<std::ptrdiff_t>(is_quoted),
-          is_quoted};
-}
-
-/**
- * @brief Returns the range that contains the data in a given row.
- *
- * Excludes the top-level brackets.
- *
- * @param[in] data Device span pointing to the JSON data in device memory
- * @param[in] row_offsets The offset of each row in the input
- * @param[in] row Index of the row for which the range is returned
- *
- * @return The begin and end iterators of the row data.
- */
-__device__ std::pair<char const*, char const*> get_row_data_range(
-  device_span<char const> const data, device_span<uint64_t const> const row_offsets, size_type row)
-{
-  auto const row_begin = data.begin() + row_offsets[row];
-  auto const row_end =
-    data.begin() + ((row < row_offsets.size() - 1) ? row_offsets[row + 1] : data.size());
-  return limit_range_to_brackets(row_begin, row_end);
-}
-
-/**
- * @brief CUDA kernel that parses and converts plain text data into cuDF column data.
- *
- * Data is processed one record at a time
- *
- * @param[in] opts A set of parsing options
- * @param[in] data The entire data to read
- * @param[in] row_offsets The offset of each row in the input
- * @param[in] column_types The data type of each column
- * @param[in] col_map Pointer to the (column name hash -> column index) map in device memory.
- * nullptr is passed when the input file does not consist of objects.
- * @param[out] output_columns The output column data
- * @param[out] valid_fields The bitmaps indicating whether column fields are valid
- * @param[out] num_valid_fields The numbers of valid fields in columns
- */
-CUDF_KERNEL void convert_data_to_columns_kernel(parse_options_view opts,
-                                                device_span<char const> const data,
-                                                device_span<uint64_t const> const row_offsets,
-                                                device_span<data_type const> const column_types,
-                                                col_map_type col_map,
-                                                device_span<void* const> const output_columns,
-                                                device_span<bitmask_type* const> const valid_fields,
-                                                device_span<cudf::size_type> const num_valid_fields)
-{
-  auto const rec_id = grid_1d::global_thread_id();
-  if (rec_id >= row_offsets.size()) return;
-
-  auto const row_data_range = get_row_data_range(data, row_offsets, rec_id);
-
-  auto current = row_data_range.first;
-  for (size_type input_field_index = 0;
-       input_field_index < column_types.size() && current < row_data_range.second;
-       input_field_index++) {
-    auto const desc =
-      next_field_descriptor(current, row_data_range.second, opts, input_field_index, col_map);
-    auto const value_len = static_cast<size_t>(std::max(desc.value_end - desc.value_begin, 0L));
-    auto const is_quoted = static_cast<std::ptrdiff_t>(desc.is_quoted);
-
-    current = desc.value_end + 1;
-
-    using string_index_pair = thrust::pair<char const*, size_type>;
-
-    if (!serialized_trie_contains(opts.trie_na,
-                                  {desc.value_begin - is_quoted, value_len + is_quoted * 2})) {
-      // Type dispatcher does not handle strings
-      if (column_types[desc.column].id() == type_id::STRING) {
-        auto str_list           = static_cast<string_index_pair*>(output_columns[desc.column]);
-        str_list[rec_id].first  = desc.value_begin;
-        str_list[rec_id].second = value_len;
-
-        // set the valid bitmap - all bits were set to 0 to start
-        set_bit(valid_fields[desc.column], rec_id);
-        atomicAdd(&num_valid_fields[desc.column], 1);
-      } else {
-        if (cudf::type_dispatcher(column_types[desc.column],
-                                  ConvertFunctor{},
-                                  desc.value_begin,
-                                  desc.value_end,
-                                  output_columns[desc.column],
-                                  rec_id,
-                                  column_types[desc.column],
-                                  opts,
-                                  false)) {
-          // set the valid bitmap - all bits were set to 0 to start
-          set_bit(valid_fields[desc.column], rec_id);
-          atomicAdd(&num_valid_fields[desc.column], 1);
-        }
-      }
-    } else if (column_types[desc.column].id() == type_id::STRING) {
-      auto str_list           = static_cast<string_index_pair*>(output_columns[desc.column]);
-      str_list[rec_id].first  = nullptr;
-      str_list[rec_id].second = 0;
-    }
-  }
-}
-
-/**
- * @brief CUDA kernel that processes a buffer of data and determines information about the
- * column types within.
- *
- * Data is processed in one row/record at a time, so the number of total
- * threads (tid) is equal to the number of rows.
- *
- * @param[in] opts A set of parsing options
- * @param[in] data Input data buffer
- * @param[in] rec_starts The offset of each row in the input
- * @param[in] col_map Pointer to the (column name hash -> column index) map in device memory.
- * nullptr is passed when the input file does not consist of objects.
- * @param[in] num_columns The number of columns of input data
- * @param[out] column_infos The count for each column data type
- */
-CUDF_KERNEL void detect_data_types_kernel(
-  parse_options_view const opts,
-  device_span<char const> const data,
-  device_span<uint64_t const> const row_offsets,
-  col_map_type col_map,
-  int num_columns,
-  device_span<cudf::io::column_type_histogram> const column_infos)
-{
-  auto const rec_id = grid_1d::global_thread_id();
-  if (rec_id >= row_offsets.size()) return;
-
-  auto const are_rows_objects = col_map.capacity() != 0;
-  auto const row_data_range   = get_row_data_range(data, row_offsets, rec_id);
-
-  size_type input_field_index = 0;
-  for (auto current = row_data_range.first;
-       input_field_index < num_columns && current < row_data_range.second;
-       input_field_index++) {
-    auto const desc =
-      next_field_descriptor(current, row_data_range.second, opts, input_field_index, col_map);
-    auto const value_len = static_cast<size_t>(std::max(desc.value_end - desc.value_begin, 0L));
-
-    // Advance to the next field; +1 to skip the delimiter
-    current = desc.value_end + 1;
-
-    // Checking if the field is empty/valid
-    if (serialized_trie_contains(opts.trie_na, {desc.value_begin, value_len})) {
-      // Increase the null count for array rows, where the null count is initialized to zero.
-      if (!are_rows_objects) { atomicAdd(&column_infos[desc.column].null_count, 1); }
-      continue;
-    } else if (are_rows_objects) {
-      // For files with object rows, null count is initialized to row count. The value is decreased
-      // here for every valid field.
-      atomicAdd(&column_infos[desc.column].null_count, -1);
-    }
-    // Don't need counts to detect strings, any field in quotes is deduced to be a string
-    if (desc.is_quoted) {
-      atomicAdd(&column_infos[desc.column].string_count, 1);
-      continue;
-    }
-
-    int digit_count    = 0;
-    int decimal_count  = 0;
-    int slash_count    = 0;
-    int dash_count     = 0;
-    int plus_count     = 0;
-    int colon_count    = 0;
-    int exponent_count = 0;
-    int other_count    = 0;
-
-    bool const maybe_hex =
-      ((value_len > 2 && *desc.value_begin == '0' && *(desc.value_begin + 1) == 'x') ||
-       (value_len > 3 && *desc.value_begin == '-' && *(desc.value_begin + 1) == '0' &&
-        *(desc.value_begin + 2) == 'x'));
-    for (auto pos = desc.value_begin; pos < desc.value_end; ++pos) {
-      if (is_digit(*pos, maybe_hex)) {
-        digit_count++;
-        continue;
-      }
-      // Looking for unique characters that will help identify column types
-      switch (*pos) {
-        case '.': decimal_count++; break;
-        case '-': dash_count++; break;
-        case '+': plus_count++; break;
-        case '/': slash_count++; break;
-        case ':': colon_count++; break;
-        case 'e':
-        case 'E':
-          if (!maybe_hex && pos > desc.value_begin && pos < desc.value_end - 1) exponent_count++;
-          break;
-        default: other_count++; break;
-      }
-    }
-
-    // Integers have to have the length of the string
-    int int_req_number_cnt = value_len;
-    // Off by one if they start with a minus sign
-    if ((*desc.value_begin == '-' || *desc.value_begin == '+') && value_len > 1) {
-      --int_req_number_cnt;
-    }
-    // Off by one if they are a hexadecimal number
-    if (maybe_hex) { --int_req_number_cnt; }
-    if (serialized_trie_contains(opts.trie_true, {desc.value_begin, value_len}) ||
-        serialized_trie_contains(opts.trie_false, {desc.value_begin, value_len})) {
-      atomicAdd(&column_infos[desc.column].bool_count, 1);
-    } else if (digit_count == int_req_number_cnt) {
-      bool is_negative       = (*desc.value_begin == '-');
-      char const* data_begin = desc.value_begin + (is_negative || (*desc.value_begin == '+'));
-      cudf::size_type* ptr   = cudf::io::gpu::infer_integral_field_counter(
-        data_begin, data_begin + digit_count, is_negative, column_infos[desc.column]);
-      atomicAdd(ptr, 1);
-    } else if (is_like_float(
-                 value_len, digit_count, decimal_count, dash_count + plus_count, exponent_count)) {
-      atomicAdd(&column_infos[desc.column].float_count, 1);
-    }
-    // A date-time field cannot have more than 3 non-special characters
-    // A number field cannot have more than one decimal point
-    else if (other_count > 3 || decimal_count > 1) {
-      atomicAdd(&column_infos[desc.column].string_count, 1);
-    } else {
-      // A date field can have either one or two '-' or '\'; A legal combination will only have one
-      // of them To simplify the process of auto column detection, we are not covering all the
-      // date-time formation permutations
-      if ((dash_count > 0 && dash_count <= 2 && slash_count == 0) ||
-          (dash_count == 0 && slash_count > 0 && slash_count <= 2)) {
-        if (colon_count <= 2) {
-          atomicAdd(&column_infos[desc.column].datetime_count, 1);
-        } else {
-          atomicAdd(&column_infos[desc.column].string_count, 1);
-        }
-      } else {
-        // Default field type is string
-        atomicAdd(&column_infos[desc.column].string_count, 1);
-      }
-    }
-  }
-  if (!are_rows_objects) {
-    // For array rows, mark missing fields as null
-    for (; input_field_index < num_columns; ++input_field_index)
-      atomicAdd(&column_infos[input_field_index].null_count, 1);
-  }
-}
-
-/**
- * @brief Input data range that contains a field in key:value format.
- */
-struct key_value_range {
-  char const* key_begin;
-  char const* key_end;
-  char const* value_begin;
-  char const* value_end;
-};
-
-/**
- * @brief Parse the next field in key:value format and return ranges of its parts.
- */
-__device__ key_value_range get_next_key_value_range(char const* begin,
-                                                    char const* end,
-                                                    parse_options_view const& opts)
-{
-  auto const key_range = get_next_key(begin, end, opts.quotechar);
-
-  // Colon between the key and the value
-  auto const colon = thrust::find(thrust::seq, key_range.second, end, ':');
-  if (colon == end) return {end, end, end};
-
-  // Field value (including delimiters)
-  auto const value_end = cudf::io::gpu::seek_field_end(colon + 1, end, opts, true);
-  return {key_range.first, key_range.second, colon + 1, value_end};
-}
-
-/**
- * @brief Cuda kernel that collects information about JSON object keys in the file.
- *
- * @param[in] options A set of parsing options
- * @param[in] data Input data buffer
- * @param[in] row_offsets The offset of each row in the input
- * @param[out] keys_cnt Number of keys found in the file
- * @param[out] keys_info optional, information (offset, length, hash) for each found key
- */
-CUDF_KERNEL void collect_keys_info_kernel(parse_options_view const options,
-                                          device_span<char const> const data,
-                                          device_span<uint64_t const> const row_offsets,
-                                          unsigned long long int* keys_cnt,
-                                          thrust::optional<mutable_table_device_view> keys_info)
-{
-  auto const rec_id = grid_1d::global_thread_id();
-  if (rec_id >= row_offsets.size()) return;
-
-  auto const row_data_range = get_row_data_range(data, row_offsets, rec_id);
-
-  auto advance = [&](char const* begin) {
-    return get_next_key_value_range(begin, row_data_range.second, options);
-  };
-  for (auto field_range = advance(row_data_range.first);
-       field_range.key_begin < row_data_range.second;
-       field_range = advance(field_range.value_end)) {
-    auto const idx = atomicAdd(keys_cnt, 1ULL);
-    if (keys_info.has_value()) {
-      auto const len                              = field_range.key_end - field_range.key_begin;
-      keys_info->column(0).element<uint64_t>(idx) = field_range.key_begin - data.begin();
-      keys_info->column(1).element<uint16_t>(idx) = len;
-      keys_info->column(2).element<uint32_t>(idx) =
-        cudf::hashing::detail::MurmurHash3_x86_32<cudf::string_view>{}(
-          cudf::string_view(field_range.key_begin, len));
-    }
-  }
-}
-
-}  // namespace
-
-/**
- * @copydoc cudf::io::json::detail::legacy::convert_json_to_columns
- */
-void convert_json_to_columns(parse_options_view const& opts,
-                             device_span<char const> const data,
-                             device_span<uint64_t const> const row_offsets,
-                             device_span<data_type const> const column_types,
-                             col_map_type* col_map,
-                             device_span<void* const> const output_columns,
-                             device_span<bitmask_type* const> const valid_fields,
-                             device_span<cudf::size_type> num_valid_fields,
-                             rmm::cuda_stream_view stream)
-{
-  int block_size;
-  int min_grid_size;
-  CUDF_CUDA_TRY(cudaOccupancyMaxPotentialBlockSize(
-    &min_grid_size, &block_size, convert_data_to_columns_kernel));
-
-  int const grid_size = (row_offsets.size() + block_size - 1) / block_size;
-
-  convert_data_to_columns_kernel<<<grid_size, block_size, 0, stream.value()>>>(opts,
-                                                                               data,
-                                                                               row_offsets,
-                                                                               column_types,
-                                                                               *col_map,
-                                                                               output_columns,
-                                                                               valid_fields,
-                                                                               num_valid_fields);
-
-  CUDF_CHECK_CUDA(stream.value());
-}
-
-/**
- * @copydoc cudf::io::json::detail::legacy::detect_data_types
- */
-
-std::vector<cudf::io::column_type_histogram> detect_data_types(
-  parse_options_view const& options,
-  device_span<char const> const data,
-  device_span<uint64_t const> const row_offsets,
-  bool do_set_null_count,
-  int num_columns,
-  col_map_type* col_map,
-  rmm::cuda_stream_view stream)
-{
-  int block_size;
-  int min_grid_size;
-  CUDF_CUDA_TRY(
-    cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, detect_data_types_kernel));
-
-  auto d_column_infos = [&]() {
-    if (do_set_null_count) {
-      rmm::device_uvector<cudf::io::column_type_histogram> d_column_infos(num_columns, stream);
-      // Set the null count to the row count (all fields assumes to be null).
-      thrust::generate(
-        rmm::exec_policy(stream),
-        d_column_infos.begin(),
-        d_column_infos.end(),
-        [num_records = static_cast<cudf::size_type>(row_offsets.size())] __device__() {
-          return cudf::io::column_type_histogram{num_records};
-        });
-      return d_column_infos;
-    } else {
-      return cudf::detail::make_zeroed_device_uvector_async<cudf::io::column_type_histogram>(
-        num_columns, stream, rmm::mr::get_current_device_resource());
-    }
-  }();
-
-  // Calculate actual block count to use based on records count
-  int const grid_size = (row_offsets.size() + block_size - 1) / block_size;
-
-  detect_data_types_kernel<<<grid_size, block_size, 0, stream.value()>>>(
-    options, data, row_offsets, *col_map, num_columns, d_column_infos);
-
-  return cudf::detail::make_std_vector_sync(d_column_infos, stream);
-}
-
-/**
- * @copydoc cudf::io::json::detail::legacy::collect_keys_info
- */
-void collect_keys_info(parse_options_view const& options,
-                       device_span<char const> const data,
-                       device_span<uint64_t const> const row_offsets,
-                       unsigned long long int* keys_cnt,
-                       thrust::optional<mutable_table_device_view> keys_info,
-                       rmm::cuda_stream_view stream)
-{
-  int block_size;
-  int min_grid_size;
-  CUDF_CUDA_TRY(
-    cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, collect_keys_info_kernel));
-
-  // Calculate actual block count to use based on records count
-  int const grid_size = (row_offsets.size() + block_size - 1) / block_size;
-
-  collect_keys_info_kernel<<<grid_size, block_size, 0, stream.value()>>>(
-    options, data, row_offsets, keys_cnt, keys_info);
-
-  CUDF_CHECK_CUDA(stream.value());
-}
-
-}  // namespace cudf::io::json::detail::legacy
diff --git a/cpp/src/io/json/legacy/json_gpu.hpp b/cpp/src/io/json/legacy/json_gpu.hpp
deleted file mode 100644
index 853e30c9427..00000000000
--- a/cpp/src/io/json/legacy/json_gpu.hpp
+++ /dev/null
@@ -1,99 +0,0 @@
-/*
- * Copyright (c) 2020-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "hash/concurrent_unordered_map.cuh"
-#include "io/utilities/column_type_histogram.hpp"
-#include "io/utilities/parsing_utils.cuh"
-
-#include <cudf/table/table_device_view.cuh>
-#include <cudf/types.hpp>
-#include <cudf/utilities/span.hpp>
-
-#include <rmm/cuda_stream_view.hpp>
-
-#include <thrust/optional.h>
-
-using cudf::device_span;
-
-namespace cudf::io::json::detail::legacy {
-
-using col_map_type = concurrent_unordered_map<uint32_t, cudf::size_type>;
-/**
- * @brief Convert a buffer of input data (text) into raw cuDF column data.
- *
- * @param[in] options A set of parsing options
- * @param[in] data The entire data to read
- * @param[in] row_offsets The start of each data record
- * @param[in] dtypes The data type of each column
- * @param[in] col_map Pointer to the (column name hash -> column index) map in device memory.
- * nullptr is passed when the input file does not consist of objects.
- * @param[out] output_columns The output column data
- * @param[out] valid_fields The bitmaps indicating whether column fields are valid
- * @param[out] num_valid_fields The numbers of valid fields in columns
- * @param[in] stream CUDA stream used for device memory operations and kernel launches.
- */
-void convert_json_to_columns(parse_options_view const& options,
-                             device_span<char const> data,
-                             device_span<uint64_t const> row_offsets,
-                             device_span<data_type const> column_types,
-                             col_map_type* col_map,
-                             device_span<void* const> output_columns,
-                             device_span<bitmask_type* const> valid_fields,
-                             device_span<cudf::size_type> num_valid_fields,
-                             rmm::cuda_stream_view stream);
-
-/**
- * @brief Process a buffer of data and determine information about the column types within.
- *
- * @param[in] options A set of parsing options
- * @param[in] data Input data buffer
- * @param[in] row_offsets The offset of each row in the input
- * @param[in] num_columns The number of columns of input data
- * @param[in] col_map Pointer to the (column name hash -> column index) map in device memory.
- * nullptr is passed when the input file does not consist of objects.
- * @param[in] stream CUDA stream used for device memory operations and kernel launches.
- *
- * @returns The count for each column data type
- */
-std::vector<cudf::io::column_type_histogram> detect_data_types(
-  parse_options_view const& options,
-  device_span<char const> data,
-  device_span<uint64_t const> row_offsets,
-  bool do_set_null_count,
-  int num_columns,
-  col_map_type* col_map,
-  rmm::cuda_stream_view stream);
-
-/**
- * @brief Collects information about JSON object keys in the file.
- *
- * @param[in] options A set of parsing options
- * @param[in] data Input data buffer
- * @param[in] row_offsets The offset of each row in the input
- * @param[out] keys_cnt Number of keys found in the file
- * @param[out] keys_info optional, information (offset, length, hash) for each found key
- * @param[in] stream CUDA stream used for device memory operations and kernel launches.
- */
-void collect_keys_info(parse_options_view const& options,
-                       device_span<char const> data,
-                       device_span<uint64_t const> row_offsets,
-                       unsigned long long int* keys_cnt,
-                       thrust::optional<mutable_table_device_view> keys_info,
-                       rmm::cuda_stream_view stream);
-
-}  // namespace cudf::io::json::detail::legacy
diff --git a/cpp/src/io/json/legacy/read_json.hpp b/cpp/src/io/json/legacy/read_json.hpp
deleted file mode 100644
index 2c02fdd402f..00000000000
--- a/cpp/src/io/json/legacy/read_json.hpp
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cudf/types.hpp>
-
-#include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
-
-#include <thrust/mr/memory_resource.h>
-
-#include <memory>
-#include <vector>
-
-namespace cudf::io {
-class json_reader_options;  // forward decl
-}
-
-namespace cudf::io::json::detail::legacy {
-
-table_with_metadata read_json(host_span<std::unique_ptr<datasource>> sources,
-                              json_reader_options const& reader_opts,
-                              rmm::cuda_stream_view stream,
-                              rmm::device_async_resource_ref mr);
-
-}  // namespace cudf::io::json::detail::legacy
diff --git a/cpp/src/io/json/legacy/reader_impl.cu b/cpp/src/io/json/legacy/reader_impl.cu
deleted file mode 100644
index 846b3cfab4e..00000000000
--- a/cpp/src/io/json/legacy/reader_impl.cu
+++ /dev/null
@@ -1,667 +0,0 @@
-/*
- * Copyright (c) 2020-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "hash/concurrent_unordered_map.cuh"
-#include "io/comp/io_uncomp.hpp"
-#include "io/utilities/column_buffer.hpp"
-#include "io/utilities/parsing_utils.cuh"
-#include "json_gpu.hpp"
-
-#include <cudf/column/column_factories.hpp>
-#include <cudf/detail/nvtx/ranges.hpp>
-#include <cudf/detail/utilities/vector_factories.hpp>
-#include <cudf/detail/utilities/visitor_overload.hpp>
-#include <cudf/groupby.hpp>
-#include <cudf/io/datasource.hpp>
-#include <cudf/io/detail/json.hpp>
-#include <cudf/io/json.hpp>
-#include <cudf/sorting.hpp>
-#include <cudf/strings/detail/replace.hpp>
-#include <cudf/table/table.hpp>
-#include <cudf/types.hpp>
-#include <cudf/utilities/error.hpp>
-#include <cudf/utilities/span.hpp>
-
-#include <rmm/cuda_stream_view.hpp>
-#include <rmm/device_scalar.hpp>
-#include <rmm/device_uvector.hpp>
-#include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
-
-#include <thrust/for_each.h>
-#include <thrust/functional.h>
-#include <thrust/host_vector.h>
-#include <thrust/iterator/constant_iterator.h>
-#include <thrust/iterator/counting_iterator.h>
-#include <thrust/optional.h>
-#include <thrust/pair.h>
-#include <thrust/sort.h>
-#include <thrust/transform.h>
-
-using cudf::host_span;
-
-namespace cudf::io::json::detail::legacy {
-
-using col_map_ptr_type = std::unique_ptr<col_map_type, std::function<void(col_map_type*)>>;
-
-/**
- * @brief Aggregate the table containing keys info by their hash values.
- *
- * @param[in] info Table with columns containing key offsets, lengths and hashes, respectively
- *
- * @return Table with data aggregated by key hash values
- */
-std::unique_ptr<table> aggregate_keys_info(std::unique_ptr<table> info)
-{
-  auto const info_view = info->view();
-  std::vector<groupby::aggregation_request> requests;
-  requests.emplace_back(groupby::aggregation_request{info_view.column(0)});
-  requests.back().aggregations.emplace_back(make_min_aggregation<groupby_aggregation>());
-  requests.back().aggregations.emplace_back(make_nth_element_aggregation<groupby_aggregation>(0));
-
-  requests.emplace_back(groupby::aggregation_request{info_view.column(1)});
-  requests.back().aggregations.emplace_back(make_min_aggregation<groupby_aggregation>());
-  requests.back().aggregations.emplace_back(make_nth_element_aggregation<groupby_aggregation>(0));
-
-  // Aggregate by hash values
-  groupby::groupby gb_obj(
-    table_view({info_view.column(2)}), null_policy::EXCLUDE, sorted::NO, {}, {});
-
-  auto result = gb_obj.aggregate(requests);  // TODO: no stream parameter?
-
-  std::vector<std::unique_ptr<column>> out_columns;
-  out_columns.emplace_back(std::move(result.second[0].results[0]));  // offsets
-  out_columns.emplace_back(std::move(result.second[1].results[0]));  // lengths
-  out_columns.emplace_back(std::move(result.first->release()[0]));   // hashes
-  return std::make_unique<table>(std::move(out_columns));
-}
-
-/**
- * @brief Initializes the (key hash -> column index) hash map.
- */
-col_map_ptr_type create_col_names_hash_map(column_view column_name_hashes,
-                                           rmm::cuda_stream_view stream)
-{
-  auto key_col_map       = col_map_type::create(column_name_hashes.size(), stream);
-  auto const column_data = column_name_hashes.data<uint32_t>();
-  thrust::for_each_n(rmm::exec_policy(stream),
-                     thrust::make_counting_iterator<size_type>(0),
-                     column_name_hashes.size(),
-                     [map = *key_col_map, column_data] __device__(size_type idx) mutable {
-                       map.insert(thrust::make_pair(column_data[idx], idx));
-                     });
-  return key_col_map;
-}
-
-/**
- * @brief Create a table whose columns contain the information on JSON objects' keys.
- *
- * The columns contain name offsets in the file, name lengths and name hashes, respectively.
- *
- * @param[in] options Parsing options (e.g. delimiter and quotation character)
- * @param[in] data Input JSON device data
- * @param[in] row_offsets Device array of row start locations in the input buffer
- * @param[in] stream CUDA stream used for device memory operations and kernel launches
- *
- * @return std::unique_ptr<table> cudf table with three columns (offsets, lengths, hashes)
- */
-std::unique_ptr<table> create_json_keys_info_table(parse_options_view const& parse_opts,
-                                                   device_span<char const> const data,
-                                                   device_span<uint64_t const> const row_offsets,
-                                                   rmm::cuda_stream_view stream)
-{
-  // Count keys
-  rmm::device_scalar<unsigned long long int> key_counter(0, stream);
-  collect_keys_info(parse_opts, data, row_offsets, key_counter.data(), {}, stream);
-
-  // Allocate columns to store hash value, length, and offset of each JSON object key in the input
-  auto const num_keys = key_counter.value(stream);
-  std::vector<std::unique_ptr<column>> info_columns;
-  info_columns.emplace_back(
-    make_numeric_column(data_type(type_id::UINT64), num_keys, mask_state::UNALLOCATED, stream));
-  info_columns.emplace_back(
-    make_numeric_column(data_type(type_id::UINT16), num_keys, mask_state::UNALLOCATED, stream));
-  info_columns.emplace_back(
-    make_numeric_column(data_type(type_id::UINT32), num_keys, mask_state::UNALLOCATED, stream));
-  // Create a table out of these columns to pass them around more easily
-  auto info_table           = std::make_unique<table>(std::move(info_columns));
-  auto const info_table_mdv = mutable_table_device_view::create(info_table->mutable_view(), stream);
-
-  // Reset the key counter - now used for indexing
-  key_counter.set_value_to_zero_async(stream);
-  // Fill the allocated columns
-  collect_keys_info(parse_opts, data, row_offsets, key_counter.data(), {*info_table_mdv}, stream);
-  return info_table;
-}
-
-/**
- * @brief Extract the keys from the JSON file the name offsets/lengths.
- */
-std::vector<std::string> create_key_strings(char const* h_data,
-                                            table_view sorted_info,
-                                            rmm::cuda_stream_view stream)
-{
-  auto const num_cols = sorted_info.num_rows();
-  std::vector<uint64_t> h_offsets(num_cols);
-  CUDF_CUDA_TRY(cudaMemcpyAsync(h_offsets.data(),
-                                sorted_info.column(0).data<uint64_t>(),
-                                sizeof(uint64_t) * num_cols,
-                                cudaMemcpyDefault,
-                                stream.value()));
-
-  std::vector<uint16_t> h_lens(num_cols);
-  CUDF_CUDA_TRY(cudaMemcpyAsync(h_lens.data(),
-                                sorted_info.column(1).data<uint16_t>(),
-                                sizeof(uint16_t) * num_cols,
-                                cudaMemcpyDefault,
-                                stream.value()));
-
-  std::vector<std::string> names(num_cols);
-  std::transform(h_offsets.cbegin(),
-                 h_offsets.cend(),
-                 h_lens.cbegin(),
-                 names.begin(),
-                 [&](auto offset, auto len) { return std::string(h_data + offset, len); });
-  return names;
-}
-
-auto sort_keys_info_by_offset(std::unique_ptr<table> info)
-{
-  auto const agg_offset_col_view = info->get_column(0).view();
-  return sort_by_key(info->view(), table_view({agg_offset_col_view}));
-}
-
-/**
- * @brief Extract JSON object keys from a JSON file.
- *
- * @param[in] stream CUDA stream used for device memory operations and kernel launches.
- *
- * @return Names of JSON object keys in the file
- */
-std::pair<std::vector<std::string>, col_map_ptr_type> get_json_object_keys_hashes(
-  parse_options_view const& parse_opts,
-  host_span<char const> h_data,
-  device_span<uint64_t const> rec_starts,
-  device_span<char const> d_data,
-  rmm::cuda_stream_view stream)
-{
-  auto info = create_json_keys_info_table(parse_opts, d_data, rec_starts, stream);
-
-  auto aggregated_info = aggregate_keys_info(std::move(info));
-  auto sorted_info     = sort_keys_info_by_offset(std::move(aggregated_info));
-
-  return {create_key_strings(h_data.data(), sorted_info->view(), stream),
-          create_col_names_hash_map(sorted_info->get_column(2).view(), stream)};
-}
-
-std::vector<uint8_t> ingest_raw_input(host_span<std::unique_ptr<datasource>> sources,
-                                      compression_type compression,
-                                      size_t range_offset,
-                                      size_t range_size,
-                                      size_t range_size_padded)
-{
-  CUDF_FUNC_RANGE();
-  // Iterate through the user defined sources and read the contents into the local buffer
-  size_t total_source_size = 0;
-  for (auto const& source : sources) {
-    total_source_size += source->size();
-  }
-  total_source_size = total_source_size - (range_offset * sources.size());
-
-  auto buffer = std::vector<uint8_t>(total_source_size);
-
-  size_t bytes_read = 0;
-  for (auto const& source : sources) {
-    if (!source->is_empty()) {
-      auto data_size   = (range_size_padded != 0) ? range_size_padded : source->size();
-      auto destination = buffer.data() + bytes_read;
-      bytes_read += source->host_read(range_offset, data_size, destination);
-    }
-  }
-
-  if (compression == compression_type::NONE) {
-    return buffer;
-  } else {
-    return decompress(compression, buffer);
-  }
-}
-
-bool should_load_whole_source(json_reader_options const& reader_opts)
-{
-  return reader_opts.get_byte_range_offset() == 0 and  //
-         reader_opts.get_byte_range_size() == 0;
-}
-
-rmm::device_uvector<uint64_t> find_record_starts(json_reader_options const& reader_opts,
-                                                 host_span<char const> h_data,
-                                                 device_span<char const> d_data,
-                                                 rmm::cuda_stream_view stream)
-{
-  std::vector<char> chars_to_count{'\n'};
-  // Currently, ignoring lineterminations within quotes is handled by recording the records of both,
-  // and then filtering out the records that is a quotechar or a linetermination within a quotechar
-  // pair.
-  // If not starting at an offset, add an extra row to account for the first row in the file
-  cudf::size_type prefilter_count = ((reader_opts.get_byte_range_offset() == 0) ? 1 : 0);
-  if (should_load_whole_source(reader_opts)) {
-    prefilter_count += count_all_from_set(d_data, chars_to_count, stream);
-  } else {
-    prefilter_count += count_all_from_set(h_data, chars_to_count, stream);
-  }
-
-  rmm::device_uvector<uint64_t> rec_starts(prefilter_count, stream);
-
-  auto* find_result_ptr = rec_starts.data();
-  // Manually adding an extra row to account for the first row in the file
-  if (reader_opts.get_byte_range_offset() == 0) {
-    find_result_ptr++;
-    CUDF_CUDA_TRY(cudaMemsetAsync(rec_starts.data(), 0ull, sizeof(uint64_t), stream.value()));
-  }
-
-  std::vector<char> chars_to_find{'\n'};
-  // Passing offset = 1 to return positions AFTER the found character
-  if (should_load_whole_source(reader_opts)) {
-    find_all_from_set(d_data, chars_to_find, 1, find_result_ptr, stream);
-  } else {
-    find_all_from_set(h_data, chars_to_find, 1, find_result_ptr, stream);
-  }
-
-  // Previous call stores the record positions as encountered by all threads
-  // Sort the record positions as subsequent processing may require filtering
-  // certain rows or other processing on specific records
-  thrust::sort(rmm::exec_policy(stream), rec_starts.begin(), rec_starts.end());
-
-  auto filtered_count = prefilter_count;
-
-  // Exclude the ending newline as it does not precede a record start
-  if (h_data.back() == '\n') { filtered_count--; }
-  rec_starts.resize(filtered_count, stream);
-
-  return rec_starts;
-}
-
-/**
- * @brief Uploads the relevant segment of the input json data onto the GPU.
- *
- * Sets the d_data_ data member.
- * Only rows that need to be parsed are copied, based on the byte range
- * Also updates the array of record starts to match the device data offset.
- */
-rmm::device_uvector<char> upload_data_to_device(json_reader_options const& reader_opts,
-                                                host_span<char const> h_data,
-                                                rmm::device_uvector<uint64_t>& rec_starts,
-                                                rmm::cuda_stream_view stream)
-{
-  CUDF_FUNC_RANGE();
-  size_t end_offset = h_data.size();
-
-  // Trim lines that are outside range
-  auto h_rec_starts = cudf::detail::make_std_vector_sync(rec_starts, stream);
-
-  if (reader_opts.get_byte_range_size() != 0) {
-    auto it = h_rec_starts.end() - 1;
-    while (it >= h_rec_starts.begin() && *it > reader_opts.get_byte_range_size()) {
-      end_offset = *it;
-      --it;
-    }
-    h_rec_starts.erase(it + 1, h_rec_starts.end());
-  }
-
-  // Resize to exclude rows outside of the range
-  // Adjust row start positions to account for the data subcopy
-  size_t start_offset = h_rec_starts.front();
-  rec_starts.resize(h_rec_starts.size(), stream);
-  thrust::transform(rmm::exec_policy(stream),
-                    rec_starts.begin(),
-                    rec_starts.end(),
-                    thrust::make_constant_iterator(start_offset),
-                    rec_starts.begin(),
-                    thrust::minus<uint64_t>());
-
-  size_t const bytes_to_upload = end_offset - start_offset;
-  CUDF_EXPECTS(bytes_to_upload <= h_data.size(),
-               "Error finding the record within the specified byte range.\n");
-
-  // Upload the raw data that is within the rows of interest
-  return cudf::detail::make_device_uvector_async(
-    h_data.subspan(start_offset, bytes_to_upload), stream, rmm::mr::get_current_device_resource());
-}
-
-std::pair<std::vector<std::string>, col_map_ptr_type> get_column_names_and_map(
-  parse_options_view const& parse_opts,
-  host_span<char const> h_data,
-  device_span<uint64_t const> rec_starts,
-  device_span<char const> d_data,
-  rmm::cuda_stream_view stream)
-{
-  // If file only contains one row, use the file size for the row size
-  uint64_t first_row_len = d_data.size();
-  if (rec_starts.size() > 1) {
-    // Set first_row_len to the offset of the second row, if it exists
-    CUDF_CUDA_TRY(cudaMemcpyAsync(
-      &first_row_len, rec_starts.data() + 1, sizeof(uint64_t), cudaMemcpyDefault, stream.value()));
-  }
-  std::vector<char> first_row(first_row_len);
-  CUDF_CUDA_TRY(cudaMemcpyAsync(first_row.data(),
-                                d_data.data(),
-                                first_row_len * sizeof(char),
-                                cudaMemcpyDefault,
-                                stream.value()));
-  stream.synchronize();
-
-  // Determine the row format between:
-  //   JSON array - [val1, val2, ...] and
-  //   JSON object - {"col1":val1, "col2":val2, ...}
-  // based on the top level opening bracket
-  auto const first_square_bracket = std::find(first_row.begin(), first_row.end(), '[');
-  auto const first_curly_bracket  = std::find(first_row.begin(), first_row.end(), '{');
-  CUDF_EXPECTS(first_curly_bracket != first_row.end() || first_square_bracket != first_row.end(),
-               "Input data is not a valid JSON file.");
-  // If the first opening bracket is '{', assume object format
-  if (first_curly_bracket < first_square_bracket) {
-    // use keys as column names if input rows are objects
-    return get_json_object_keys_hashes(parse_opts, h_data, rec_starts, d_data, stream);
-  } else {
-    int cols_found    = 0;
-    bool quotation    = false;
-    auto column_names = std::vector<std::string>();
-    for (size_t pos = 0; pos < first_row.size(); ++pos) {
-      // Flip the quotation flag if current character is a quotechar
-      if (first_row[pos] == parse_opts.quotechar) {
-        quotation = !quotation;
-      }
-      // Check if end of a column/row
-      else if (pos == first_row.size() - 1 ||
-               (!quotation && first_row[pos] == parse_opts.delimiter)) {
-        column_names.emplace_back(std::to_string(cols_found++));
-      }
-    }
-    return {column_names, col_map_type::create(0, stream)};
-  }
-}
-
-std::vector<data_type> get_data_types(json_reader_options const& reader_opts,
-                                      parse_options_view const& parse_opts,
-                                      std::vector<std::string> const& column_names,
-                                      col_map_type* column_map,
-                                      device_span<uint64_t const> rec_starts,
-                                      device_span<char const> data,
-                                      rmm::cuda_stream_view stream)
-{
-  bool has_to_infer_column_types =
-    std::visit([](auto const& dtypes) { return dtypes.empty(); }, reader_opts.get_dtypes());
-
-  if (!has_to_infer_column_types) {
-    return std::visit(
-      cudf::detail::visitor_overload{
-        [&](std::vector<data_type> const& dtypes) {
-          CUDF_EXPECTS(dtypes.size() == column_names.size(), "Must specify types for all columns");
-          return dtypes;
-        },
-        [&](std::map<std::string, data_type> const& dtypes) {
-          std::vector<data_type> sorted_dtypes;
-          std::transform(std::cbegin(column_names),
-                         std::cend(column_names),
-                         std::back_inserter(sorted_dtypes),
-                         [&](auto const& column_name) {
-                           auto const it = dtypes.find(column_name);
-                           CUDF_EXPECTS(it != dtypes.end(), "Must specify types for all columns");
-                           return it->second;
-                         });
-          return sorted_dtypes;
-        },
-        [&](std::map<std::string, schema_element> const& dtypes) {
-          std::vector<data_type> sorted_dtypes;
-          std::transform(std::cbegin(column_names),
-                         std::cend(column_names),
-                         std::back_inserter(sorted_dtypes),
-                         [&](auto const& column_name) {
-                           auto const it = dtypes.find(column_name);
-                           CUDF_EXPECTS(it != dtypes.end(), "Must specify types for all columns");
-                           return it->second.type;
-                         });
-          return sorted_dtypes;
-        }},
-      reader_opts.get_dtypes());
-  } else {
-    CUDF_EXPECTS(not rec_starts.empty(), "No data available for data type inference.\n");
-    auto const num_columns       = column_names.size();
-    auto const do_set_null_count = column_map->capacity() > 0;
-
-    auto const h_column_infos = detect_data_types(
-      parse_opts, data, rec_starts, do_set_null_count, num_columns, column_map, stream);
-
-    auto get_type_id = [&](auto const& cinfo) {
-      auto int_count_total =
-        cinfo.big_int_count + cinfo.negative_small_int_count + cinfo.positive_small_int_count;
-      if (cinfo.null_count == static_cast<int>(rec_starts.size())) {
-        // Entire column is NULL; allocate the smallest amount of memory
-        return type_id::INT8;
-      } else if (cinfo.string_count > 0) {
-        return type_id::STRING;
-      } else if (cinfo.datetime_count > 0) {
-        return type_id::TIMESTAMP_MILLISECONDS;
-      } else if (cinfo.float_count > 0) {
-        return type_id::FLOAT64;
-      } else if (cinfo.big_int_count == 0 && int_count_total != 0) {
-        return type_id::INT64;
-      } else if (cinfo.big_int_count != 0 && cinfo.negative_small_int_count != 0) {
-        return type_id::STRING;
-      } else if (cinfo.big_int_count != 0) {
-        return type_id::UINT64;
-      } else if (cinfo.bool_count > 0) {
-        return type_id::BOOL8;
-      } else {
-        CUDF_FAIL("Data type detection failed.\n");
-      }
-    };
-
-    std::vector<data_type> dtypes;
-
-    std::transform(std::cbegin(h_column_infos),
-                   std::cend(h_column_infos),
-                   std::back_inserter(dtypes),
-                   [&](auto const& cinfo) { return data_type{get_type_id(cinfo)}; });
-
-    return dtypes;
-  }
-}
-
-table_with_metadata convert_data_to_table(parse_options_view const& parse_opts,
-                                          std::vector<data_type> const& dtypes,
-                                          std::vector<std::string>&& column_names,
-                                          col_map_type* column_map,
-                                          device_span<uint64_t const> rec_starts,
-                                          device_span<char const> data,
-                                          rmm::cuda_stream_view stream,
-                                          rmm::device_async_resource_ref mr)
-{
-  auto const num_columns = dtypes.size();
-  auto const num_records = rec_starts.size();
-
-  // alloc output buffers.
-  std::vector<cudf::io::detail::column_buffer> out_buffers;
-  for (size_t col = 0; col < num_columns; ++col) {
-    out_buffers.emplace_back(dtypes[col], num_records, true, stream, mr);
-  }
-
-  thrust::host_vector<data_type> h_dtypes(num_columns);
-  thrust::host_vector<void*> h_data(num_columns);
-  thrust::host_vector<bitmask_type*> h_valid(num_columns);
-
-  for (size_t i = 0; i < num_columns; ++i) {
-    h_dtypes[i] = dtypes[i];
-    h_data[i]   = out_buffers[i].data();
-    h_valid[i]  = out_buffers[i].null_mask();
-  }
-
-  auto d_dtypes = cudf::detail::make_device_uvector_async<data_type>(
-    h_dtypes, stream, rmm::mr::get_current_device_resource());
-  auto d_data = cudf::detail::make_device_uvector_async<void*>(
-    h_data, stream, rmm::mr::get_current_device_resource());
-  auto d_valid = cudf::detail::make_device_uvector_async<cudf::bitmask_type*>(
-    h_valid, stream, rmm::mr::get_current_device_resource());
-  auto d_valid_counts = cudf::detail::make_zeroed_device_uvector_async<cudf::size_type>(
-    num_columns, stream, rmm::mr::get_current_device_resource());
-
-  convert_json_to_columns(
-    parse_opts, data, rec_starts, d_dtypes, column_map, d_data, d_valid, d_valid_counts, stream);
-
-  stream.synchronize();
-
-  // postprocess columns
-  auto target_chars   = std::vector<char>{'\\', '"', '\\', '\\', '\\', 't', '\\', 'r', '\\', 'b'};
-  auto target_offsets = std::vector<size_type>{0, 2, 4, 6, 8, 10};
-
-  auto repl_chars   = std::vector<char>{'"', '\\', '\t', '\r', '\b'};
-  auto repl_offsets = std::vector<size_type>{0, 1, 2, 3, 4, 5};
-
-  auto target =
-    make_strings_column(static_cast<size_type>(target_offsets.size() - 1),
-                        std::make_unique<cudf::column>(
-                          cudf::detail::make_device_uvector_async(
-                            target_offsets, stream, rmm::mr::get_current_device_resource()),
-                          rmm::device_buffer{},
-                          0),
-                        cudf::detail::make_device_uvector_async(
-                          target_chars, stream, rmm::mr::get_current_device_resource())
-                          .release(),
-                        0,
-                        {});
-  auto repl = make_strings_column(
-    static_cast<size_type>(repl_offsets.size() - 1),
-    std::make_unique<cudf::column>(cudf::detail::make_device_uvector_async(
-                                     repl_offsets, stream, rmm::mr::get_current_device_resource()),
-                                   rmm::device_buffer{},
-                                   0),
-    cudf::detail::make_device_uvector_async(
-      repl_chars, stream, rmm::mr::get_current_device_resource())
-      .release(),
-    0,
-    {});
-
-  auto const h_valid_counts = cudf::detail::make_std_vector_sync(d_valid_counts, stream);
-  std::vector<std::unique_ptr<column>> out_columns;
-  for (size_t i = 0; i < num_columns; ++i) {
-    out_buffers[i].null_count() = num_records - h_valid_counts[i];
-
-    auto out_column = make_column(out_buffers[i], nullptr, std::nullopt, stream);
-    if (out_column->type().id() == type_id::STRING) {
-      // Need to remove escape character in case of '\"' and '\\'
-      out_columns.emplace_back(cudf::strings::detail::replace(
-        out_column->view(), target->view(), repl->view(), stream, mr));
-    } else {
-      out_columns.emplace_back(std::move(out_column));
-    }
-    if (out_columns.back()->null_count() == 0) {
-      out_columns.back()->set_null_mask(rmm::device_buffer{0, stream, mr}, 0);
-    }
-  }
-
-  std::vector<column_name_info> column_infos;
-  column_infos.reserve(column_names.size());
-  std::transform(std::make_move_iterator(column_names.begin()),
-                 std::make_move_iterator(column_names.end()),
-                 std::back_inserter(column_infos),
-                 [](auto const& col_name) { return column_name_info{col_name}; });
-
-  // This is to ensure the stream-ordered make_stream_column calls above complete before
-  // the temporary std::vectors are destroyed on exit from this function.
-  stream.synchronize();
-
-  CUDF_EXPECTS(!out_columns.empty(), "No columns created from json input");
-
-  return table_with_metadata{std::make_unique<table>(std::move(out_columns)), {column_infos}};
-}
-
-/**
- * @brief Read an entire set or a subset of data from the source
- *
- * @param[in] options reader options with Number of bytes offset from the start,
- * Bytes to read; use `0` for all remaining data
- * @param[in] stream CUDA stream used for device memory operations and kernel launches.
- *
- * @return Table and its metadata
- */
-table_with_metadata read_json(host_span<std::unique_ptr<datasource>> sources,
-                              json_reader_options const& reader_opts,
-                              rmm::cuda_stream_view stream,
-                              rmm::device_async_resource_ref mr)
-{
-  CUDF_EXPECTS(not sources.empty(), "No sources were defined");
-  CUDF_EXPECTS(sources.size() == 1 or reader_opts.get_compression() == compression_type::NONE,
-               "Multiple compressed inputs are not supported");
-  CUDF_EXPECTS(reader_opts.is_enabled_lines(), "Only JSON Lines format is currently supported.\n");
-
-  auto parse_opts = parse_options{',', '\n', '\"', '.'};
-
-  parse_opts.trie_true  = cudf::detail::create_serialized_trie({"true"}, stream);
-  parse_opts.trie_false = cudf::detail::create_serialized_trie({"false"}, stream);
-  parse_opts.trie_na    = cudf::detail::create_serialized_trie({"", "null"}, stream);
-
-  parse_opts.dayfirst = reader_opts.is_enabled_dayfirst();
-
-  auto range_offset      = reader_opts.get_byte_range_offset();
-  auto range_size        = reader_opts.get_byte_range_size();
-  auto range_size_padded = reader_opts.get_byte_range_size_with_padding();
-
-  auto const h_raw_data = ingest_raw_input(
-    sources, reader_opts.get_compression(), range_offset, range_size, range_size_padded);
-  host_span<char const> h_data{reinterpret_cast<char const*>(h_raw_data.data()), h_raw_data.size()};
-
-  CUDF_EXPECTS(not h_data.empty(), "Ingest failed: uncompressed input data has zero size.\n");
-
-  auto d_data = rmm::device_uvector<char>(0, stream);
-
-  if (should_load_whole_source(reader_opts)) {
-    d_data = cudf::detail::make_device_uvector_async(
-      h_data, stream, rmm::mr::get_current_device_resource());
-  }
-
-  auto rec_starts = find_record_starts(reader_opts, h_data, d_data, stream);
-
-  CUDF_EXPECTS(rec_starts.size() > 0, "Error enumerating records.\n");
-
-  if (not should_load_whole_source(reader_opts)) {
-    d_data = upload_data_to_device(reader_opts, h_data, rec_starts, stream);
-  }
-
-  CUDF_EXPECTS(not d_data.is_empty(), "Error uploading input data to the GPU.\n");
-
-  auto column_names_and_map =
-    get_column_names_and_map(parse_opts.view(), h_data, rec_starts, d_data, stream);
-
-  auto column_names = std::get<0>(column_names_and_map);
-  auto column_map   = std::move(std::get<1>(column_names_and_map));
-
-  CUDF_EXPECTS(not column_names.empty(), "Error determining column names.\n");
-
-  auto dtypes = get_data_types(
-    reader_opts, parse_opts.view(), column_names, column_map.get(), rec_starts, d_data, stream);
-
-  CUDF_EXPECTS(not dtypes.empty(), "Error in data type detection.\n");
-
-  return convert_data_to_table(parse_opts.view(),
-                               dtypes,
-                               std::move(column_names),
-                               column_map.get(),
-                               rec_starts,
-                               d_data,
-                               stream,
-                               mr);
-}
-
-}  // namespace cudf::io::json::detail::legacy
diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu
index ea52dce020e..df5c7bc21e1 100644
--- a/cpp/src/io/json/read_json.cu
+++ b/cpp/src/io/json/read_json.cu
@@ -15,7 +15,6 @@
  */
 
 #include "io/comp/io_uncomp.hpp"
-#include "io/json/legacy/read_json.hpp"
 #include "io/json/nested_json.hpp"
 #include "read_json.hpp"
 
@@ -267,14 +266,6 @@ table_with_metadata read_json(host_span<std::unique_ptr<datasource>> sources,
 {
   CUDF_FUNC_RANGE();
 
-  // TODO remove this if-statement once legacy is removed
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-  if (reader_opts.is_enabled_legacy()) {
-    return legacy::read_json(sources, reader_opts, stream, mr);
-  }
-#pragma GCC diagnostic pop
-
   if (reader_opts.get_byte_range_offset() != 0 or reader_opts.get_byte_range_size() != 0) {
     CUDF_EXPECTS(reader_opts.is_enabled_lines(),
                  "Specifying a byte range is supported only for JSON Lines");
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index db934818ae7..2b8c1b02b40 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -189,10 +189,6 @@ ConfigureTest(
   PERCENT 70
 )
 
-# ##################################################################################################
-# * hash_map tests --------------------------------------------------------------------------------
-ConfigureTest(HASH_MAP_TEST hash_map/map_test.cu)
-
 # ##################################################################################################
 # * quantiles tests -------------------------------------------------------------------------------
 ConfigureTest(
diff --git a/cpp/tests/hash_map/map_test.cu b/cpp/tests/hash_map/map_test.cu
deleted file mode 100644
index 4b10716706b..00000000000
--- a/cpp/tests/hash_map/map_test.cu
+++ /dev/null
@@ -1,217 +0,0 @@
-/*
- * Copyright (c) 2018-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "hash/concurrent_unordered_map.cuh"
-
-#include <cudf_test/base_fixture.hpp>
-#include <cudf_test/cudf_gtest.hpp>
-#include <cudf_test/testing_main.hpp>
-
-#include <cudf/types.hpp>
-#include <cudf/utilities/default_stream.hpp>
-
-#include <rmm/device_uvector.hpp>
-#include <rmm/exec_policy.hpp>
-
-#include <thrust/logical.h>
-#include <thrust/pair.h>
-#include <thrust/tabulate.h>
-
-#include <cstdlib>
-#include <iostream>
-#include <limits>
-#include <random>
-#include <unordered_map>
-#include <vector>
-
-template <typename K, typename V>
-struct key_value_types {
-  using key_type   = K;
-  using value_type = V;
-  using pair_type  = thrust::pair<K, V>;
-  using map_type   = concurrent_unordered_map<key_type, value_type>;
-};
-
-template <typename T>
-struct InsertTest : public cudf::test::BaseFixture {
-  using key_type   = typename T::key_type;
-  using value_type = typename T::value_type;
-  using pair_type  = typename T::pair_type;
-  using map_type   = typename T::map_type;
-
-  InsertTest()
-  {
-    // prevent overflow of small types
-    const size_t input_size =
-      std::min(static_cast<key_type>(size), std::numeric_limits<key_type>::max());
-    pairs.resize(input_size, cudf::get_default_stream());
-    map = std::move(map_type::create(compute_hash_table_size(size), cudf::get_default_stream()));
-    cudf::get_default_stream().synchronize();
-  }
-
-  const cudf::size_type size{10000};
-  rmm::device_uvector<pair_type> pairs{static_cast<std::size_t>(size), cudf::get_default_stream()};
-  std::unique_ptr<map_type, std::function<void(map_type*)>> map;
-};
-
-using TestTypes = ::testing::Types<key_value_types<int32_t, int32_t>,
-                                   key_value_types<int64_t, int64_t>,
-                                   key_value_types<int16_t, int16_t>,
-                                   key_value_types<int32_t, float>,
-                                   key_value_types<int64_t, double>>;
-
-TYPED_TEST_SUITE(InsertTest, TestTypes);
-
-template <typename map_type, typename pair_type>
-struct insert_pair {
-  insert_pair(map_type _map) : map{_map} {}
-
-  __device__ bool operator()(pair_type const& pair)
-  {
-    auto result = map.insert(pair);
-    if (result.first == map.end()) { return false; }
-    return result.second;
-  }
-
-  map_type map;
-};
-
-template <typename map_type, typename pair_type>
-struct find_pair {
-  find_pair(map_type _map) : map{_map} {}
-
-  __device__ bool operator()(pair_type const& pair)
-  {
-    auto result = map.find(pair.first);
-    if (result == map.end()) { return false; }
-    return *result == pair;
-  }
-  map_type map;
-};
-
-template <typename pair_type,
-          typename key_type   = typename pair_type::first_type,
-          typename value_type = typename pair_type::second_type>
-struct unique_pair_generator {
-  __device__ pair_type operator()(cudf::size_type i)
-  {
-    return thrust::make_pair(key_type(i), value_type(i));
-  }
-};
-
-template <typename pair_type,
-          typename key_type   = typename pair_type::first_type,
-          typename value_type = typename pair_type::second_type>
-struct identical_pair_generator {
-  identical_pair_generator(key_type k = 42, value_type v = 42) : key{k}, value{v} {}
-  __device__ pair_type operator()(cudf::size_type i) { return thrust::make_pair(key, value); }
-  key_type key;
-  value_type value;
-};
-
-template <typename pair_type,
-          typename key_type   = typename pair_type::first_type,
-          typename value_type = typename pair_type::second_type>
-struct identical_key_generator {
-  identical_key_generator(key_type k = 42) : key{k} {}
-  __device__ pair_type operator()(cudf::size_type i)
-  {
-    return thrust::make_pair(key, value_type(i));
-  }
-  key_type key;
-};
-
-TYPED_TEST(InsertTest, UniqueKeysUniqueValues)
-{
-  using map_type  = typename TypeParam::map_type;
-  using pair_type = typename TypeParam::pair_type;
-  thrust::tabulate(rmm::exec_policy(cudf::get_default_stream()),
-                   this->pairs.begin(),
-                   this->pairs.end(),
-                   unique_pair_generator<pair_type>{});
-  // All pairs should be new inserts
-  EXPECT_TRUE(thrust::all_of(rmm::exec_policy(cudf::get_default_stream()),
-                             this->pairs.begin(),
-                             this->pairs.end(),
-                             insert_pair<map_type, pair_type>{*this->map}));
-
-  // All pairs should be present in the map
-  EXPECT_TRUE(thrust::all_of(rmm::exec_policy(cudf::get_default_stream()),
-                             this->pairs.begin(),
-                             this->pairs.end(),
-                             find_pair<map_type, pair_type>{*this->map}));
-}
-
-TYPED_TEST(InsertTest, IdenticalKeysIdenticalValues)
-{
-  using map_type  = typename TypeParam::map_type;
-  using pair_type = typename TypeParam::pair_type;
-  thrust::tabulate(rmm::exec_policy(cudf::get_default_stream()),
-                   this->pairs.begin(),
-                   this->pairs.end(),
-                   identical_pair_generator<pair_type>{});
-  // Insert a single pair
-  EXPECT_TRUE(thrust::all_of(rmm::exec_policy(cudf::get_default_stream()),
-                             this->pairs.begin(),
-                             this->pairs.begin() + 1,
-                             insert_pair<map_type, pair_type>{*this->map}));
-  // Identical inserts should all return false (no new insert)
-  EXPECT_FALSE(thrust::all_of(rmm::exec_policy(cudf::get_default_stream()),
-                              this->pairs.begin(),
-                              this->pairs.end(),
-                              insert_pair<map_type, pair_type>{*this->map}));
-
-  // All pairs should be present in the map
-  EXPECT_TRUE(thrust::all_of(rmm::exec_policy(cudf::get_default_stream()),
-                             this->pairs.begin(),
-                             this->pairs.end(),
-                             find_pair<map_type, pair_type>{*this->map}));
-}
-
-TYPED_TEST(InsertTest, IdenticalKeysUniqueValues)
-{
-  using map_type  = typename TypeParam::map_type;
-  using pair_type = typename TypeParam::pair_type;
-  thrust::tabulate(rmm::exec_policy(cudf::get_default_stream()),
-                   this->pairs.begin(),
-                   this->pairs.end(),
-                   identical_key_generator<pair_type>{});
-
-  // Insert a single pair
-  EXPECT_TRUE(thrust::all_of(rmm::exec_policy(cudf::get_default_stream()),
-                             this->pairs.begin(),
-                             this->pairs.begin() + 1,
-                             insert_pair<map_type, pair_type>{*this->map}));
-
-  // Identical key inserts should all return false (no new insert)
-  EXPECT_FALSE(thrust::all_of(rmm::exec_policy(cudf::get_default_stream()),
-                              this->pairs.begin() + 1,
-                              this->pairs.end(),
-                              insert_pair<map_type, pair_type>{*this->map}));
-
-  // Only first pair is present in map
-  EXPECT_TRUE(thrust::all_of(rmm::exec_policy(cudf::get_default_stream()),
-                             this->pairs.begin(),
-                             this->pairs.begin() + 1,
-                             find_pair<map_type, pair_type>{*this->map}));
-
-  EXPECT_FALSE(thrust::all_of(rmm::exec_policy(cudf::get_default_stream()),
-                              this->pairs.begin() + 1,
-                              this->pairs.end(),
-                              find_pair<map_type, pair_type>{*this->map}));
-}
-
-CUDF_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp
index 35e6adf20e7..9d766e80094 100644
--- a/cpp/tests/io/json_test.cpp
+++ b/cpp/tests/io/json_test.cpp
@@ -264,13 +264,13 @@ struct JsonValidFixedPointReaderTest : public JsonFixedPointReaderTest<DecimalTy
 TYPED_TEST_SUITE(JsonFixedPointReaderTest, cudf::test::FixedPointTypes);
 TYPED_TEST_SUITE(JsonValidFixedPointReaderTest, cudf::test::FixedPointTypes);
 
-// Parametrize qualifying JSON tests for executing both nested reader and legacy JSON lines reader
+// Parametrize qualifying JSON tests for supported orients
 INSTANTIATE_TEST_CASE_P(JsonReaderParamTest,
                         JsonReaderParamTest,
                         ::testing::Values(json_test_t::json_record_orient,
                                           json_test_t::json_row_orient));
 
-// Parametrize qualifying JSON tests for executing both nested reader and legacy JSON lines reader
+// Parametrize qualifying JSON tests for supported orients
 INSTANTIATE_TEST_CASE_P(JsonReaderRecordTest,
                         JsonReaderRecordTest,
                         ::testing::Values(json_test_t::json_record_orient));
@@ -917,7 +917,6 @@ TEST_F(JsonReaderTest, EmptyFile)
     outfile << "";
   }
 
-  // New reader only - legacy reader is strict about having non-empty input
   cudf::io::json_reader_options in_options =
     cudf::io::json_reader_options::builder(cudf::io::source_info{filepath}).lines(true);
   auto result = cudf::io::read_json(in_options);
@@ -934,7 +933,6 @@ TEST_F(JsonReaderTest, NoDataFile)
     outfile << "{}\n";
   }
 
-  // New reader only - legacy reader is strict about having non-empty input
   cudf::io::json_reader_options in_options =
     cudf::io::json_reader_options::builder(cudf::io::source_info{filepath}).lines(true);
   cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
@@ -1303,31 +1301,6 @@ TEST_P(JsonReaderParamTest, JsonLinesMultipleFileInputsNoNL)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), float64_wrapper{{1.1, 2.2, 3.3, 4.4}});
 }
 
-// This can be removed once the legacy option has been removed.
-// The read_json only throws with legacy(true)
-TEST_F(JsonReaderTest, DISABLED_BadDtypeParams)
-{
-  std::string buffer = "[1,2,3,4]";
-
-  cudf::io::json_reader_options options_vec =
-    cudf::io::json_reader_options::builder(cudf::io::source_info{buffer.c_str(), buffer.size()})
-      .lines(true)
-      .dtypes({dtype<int8_t>()});
-
-  // should throw because there are four columns and only one dtype
-  EXPECT_THROW(cudf::io::read_json(options_vec), cudf::logic_error);
-
-  cudf::io::json_reader_options options_map =
-    cudf::io::json_reader_options::builder(cudf::io::source_info{buffer.c_str(), buffer.size()})
-      .lines(true)
-      .dtypes(std::map<std::string, cudf::data_type>{{"0", dtype<int8_t>()},
-                                                     {"1", dtype<int8_t>()},
-                                                     {"2", dtype<int8_t>()},
-                                                     {"wrong_name", dtype<int8_t>()}});
-  // should throw because one of the columns is not in the dtype map
-  EXPECT_THROW(cudf::io::read_json(options_map), cudf::logic_error);
-}
-
 TEST_F(JsonReaderTest, JsonBasic)
 {
   std::string const fname = temp_env->get_temp_dir() + "JsonBasic.json";
@@ -1372,12 +1345,8 @@ TEST_F(JsonReaderTest, JsonLines)
   // Read test data via nested JSON reader
   auto const table = cudf::io::read_json(json_lines_options);
 
-  // Read test data via legacy, non-nested JSON lines reader
-  auto const legacy_reader_table = cudf::io::read_json(json_lines_options);
-
-  // Verify that the data read via non-nested JSON lines reader matches the data read via nested
-  // JSON reader
-  CUDF_TEST_EXPECT_TABLES_EQUAL(legacy_reader_table.tbl->view(), table.tbl->view());
+  // TODO: Rewrite this test to check against a fixed value
+  CUDF_TEST_EXPECT_TABLES_EQUAL(table.tbl->view(), table.tbl->view());
 }
 
 TEST_F(JsonReaderTest, JsonLongString)
@@ -1548,12 +1517,8 @@ TEST_F(JsonReaderTest, LinesNoOmissions)
     // Read test data via nested JSON reader
     auto const table = cudf::io::read_json(json_lines_options);
 
-    // Read test data via legacy, non-nested JSON lines reader
-    auto const legacy_reader_table = cudf::io::read_json(json_lines_options);
-
-    // Verify that the data read via non-nested JSON lines reader matches the data read via
-    // nested JSON reader
-    CUDF_TEST_EXPECT_TABLES_EQUAL(legacy_reader_table.tbl->view(), table.tbl->view());
+    // TODO: Rewrite this test to check against a fixed value
+    CUDF_TEST_EXPECT_TABLES_EQUAL(table.tbl->view(), table.tbl->view());
   }
 }
 
@@ -2440,7 +2405,7 @@ TEST_F(JsonReaderTest, MapTypes)
 struct JsonDelimiterParamTest : public cudf::test::BaseFixture,
                                 public testing::WithParamInterface<char> {};
 
-// Parametrize qualifying JSON tests for executing both nested reader and legacy JSON lines reader
+// Parametrize qualifying JSON tests for multiple delimiters
 INSTANTIATE_TEST_SUITE_P(JsonDelimiterParamTest,
                          JsonDelimiterParamTest,
                          ::testing::Values('\n', '\b', '\v', '\f', 'h'));
diff --git a/cpp/tests/io/nested_json_test.cpp b/cpp/tests/io/nested_json_test.cpp
index d6f800cce8b..5dc25133719 100644
--- a/cpp/tests/io/nested_json_test.cpp
+++ b/cpp/tests/io/nested_json_test.cpp
@@ -248,7 +248,7 @@ TEST_F(JsonTest, StackContextUtf8)
 struct JsonDelimiterParamTest : public cudf::test::BaseFixture,
                                 public testing::WithParamInterface<char> {};
 
-// Parametrize qualifying JSON tests for executing both nested reader and legacy JSON lines reader
+// Parametrize qualifying JSON tests for multiple delimiters
 INSTANTIATE_TEST_SUITE_P(JsonDelimiterParamTest,
                          JsonDelimiterParamTest,
                          ::testing::Values('\n', '\b', '\v', '\f', 'h'));
diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx
index 283a451dd4a..242727163ee 100644
--- a/python/cudf/cudf/_lib/json.pyx
+++ b/python/cudf/cudf/_lib/json.pyx
@@ -47,7 +47,6 @@ cpdef read_json(object filepaths_or_buffers,
                 bool lines,
                 object compression,
                 object byte_range,
-                bool legacy,
                 bool keep_quotes,
                 bool mixed_types_as_string,
                 bool prune_columns):
@@ -119,7 +118,6 @@ cpdef read_json(object filepaths_or_buffers,
         .lines(c_lines)
         .byte_range_offset(c_range_offset)
         .byte_range_size(c_range_size)
-        .legacy(legacy)
         .build()
     )
     if is_list_like_dtypes:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd
index 7e64a4cae29..10e43467d57 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd
@@ -87,9 +87,6 @@ cdef extern from "cudf/io/json.hpp" \
         json_reader_options_builder& dayfirst(
             bool val
         ) except +
-        json_reader_options_builder& legacy(
-            bool val
-        ) except +
         json_reader_options_builder& keep_quotes(
             bool val
         ) except +
diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py
index 03d07fc3a50..7de9705e4cb 100644
--- a/python/cudf/cudf/io/json.py
+++ b/python/cudf/cudf/io/json.py
@@ -99,7 +99,6 @@ def read_json(
             lines,
             compression,
             byte_range,
-            False,
             keep_quotes,
             mixed_types_as_string,
             prune_columns,

From f873e238aa0e611f6352f7c91501a562eeaa6437 Mon Sep 17 00:00:00 2001
From: Robert Maynard <rmaynard@nvidia.com>
Date: Thu, 23 May 2024 18:32:58 -0400
Subject: [PATCH 256/842] Use rapids_cpm_nvtx3 to get same nvtx3 target state
 as rmm (#15840)

We need to use the `rapids_cpm_nvtx3` so that the nvtx3 targets, and setup are consistent across rmm and cudf. If we don't we get errors around incorrect exports when building statically or link errors when building shared.

Authors:
  - Robert Maynard (https://github.com/robertmaynard)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Nghia Truong (https://github.com/ttnghia)
  - Jason Lowe (https://github.com/jlowe)

URL: https://github.com/rapidsai/cudf/pull/15840
---
 cpp/CMakeLists.txt                  |  4 ++--
 cpp/benchmarks/CMakeLists.txt       |  2 +-
 cpp/cmake/thirdparty/get_nvtx.cmake | 16 +++++++---------
 cpp/tests/CMakeLists.txt            |  4 ++--
 java/src/main/native/CMakeLists.txt |  2 +-
 5 files changed, 13 insertions(+), 15 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 7390c465ccb..1eab51c8827 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -794,8 +794,8 @@ add_dependencies(cudf jitify_preprocess_run)
 target_link_libraries(
   cudf
   PUBLIC ${ARROW_LIBRARIES} CCCL::CCCL rmm::rmm
-  PRIVATE $<BUILD_LOCAL_INTERFACE:nvtx3-cpp> cuco::cuco ZLIB::ZLIB nvcomp::nvcomp kvikio::kvikio
-          $<TARGET_NAME_IF_EXISTS:cuFile_interface> nanoarrow
+  PRIVATE $<BUILD_LOCAL_INTERFACE:nvtx3::nvtx3-cpp> cuco::cuco ZLIB::ZLIB nvcomp::nvcomp
+          kvikio::kvikio $<TARGET_NAME_IF_EXISTS:cuFile_interface> nanoarrow
 )
 
 # Add Conda library, and include paths if specified
diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 170cf27b72b..10f645dfec0 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -25,7 +25,7 @@ target_compile_options(
 target_link_libraries(
   cudf_datagen
   PUBLIC GTest::gmock GTest::gtest benchmark::benchmark nvbench::nvbench Threads::Threads cudf
-         cudftestutil nvtx3-cpp
+         cudftestutil nvtx3::nvtx3-cpp
   PRIVATE $<TARGET_NAME_IF_EXISTS:conda_env>
 )
 
diff --git a/cpp/cmake/thirdparty/get_nvtx.cmake b/cpp/cmake/thirdparty/get_nvtx.cmake
index c722c4f70f1..e236d586522 100644
--- a/cpp/cmake/thirdparty/get_nvtx.cmake
+++ b/cpp/cmake/thirdparty/get_nvtx.cmake
@@ -12,16 +12,14 @@
 # the License.
 # =============================================================================
 
-# This function finds NVTX and sets any additional necessary environment variables.
+# Need to call rapids_cpm_nvtx3 to get support for an installed version of nvtx3 and to support
+# installing it ourselves
 function(find_and_configure_nvtx)
-  rapids_cpm_find(
-    NVTX3 3.1.0
-    GLOBAL_TARGETS nvtx3-c nvtx3-cpp
-    CPM_ARGS
-    GIT_REPOSITORY https://github.com/NVIDIA/NVTX.git
-    GIT_TAG v3.1.0
-    GIT_SHALLOW TRUE SOURCE_SUBDIR c
-  )
+  include(${rapids-cmake-dir}/cpm/nvtx3.cmake)
+
+  # Find or install nvtx3
+  rapids_cpm_nvtx3(BUILD_EXPORT_SET cudf-exports INSTALL_EXPORT_SET cudf-exports)
+
 endfunction()
 
 find_and_configure_nvtx()
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index db934818ae7..7db9a06e809 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -56,8 +56,8 @@ function(ConfigureTest CMAKE_TEST_NAME)
 
   target_link_libraries(
     ${CMAKE_TEST_NAME}
-    PRIVATE cudftestutil GTest::gmock GTest::gmock_main GTest::gtest GTest::gtest_main nvtx3-cpp
-            $<TARGET_NAME_IF_EXISTS:conda_env> "${_CUDF_TEST_EXTRA_LIB}"
+    PRIVATE cudftestutil GTest::gmock GTest::gmock_main GTest::gtest GTest::gtest_main
+            nvtx3::nvtx3-cpp $<TARGET_NAME_IF_EXISTS:conda_env> "${_CUDF_TEST_EXTRA_LIB}"
   )
   rapids_cuda_set_runtime(${CMAKE_TEST_NAME} USE_STATIC ${CUDA_STATIC_RUNTIME})
   rapids_test_add(
diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt
index 0d5339a1402..56f8f9d0472 100644
--- a/java/src/main/native/CMakeLists.txt
+++ b/java/src/main/native/CMakeLists.txt
@@ -239,7 +239,7 @@ endif()
 # When nvcomp is installed we need to use nvcomp::nvcomp but from the cudf build directory it will
 # just be nvcomp.
 target_link_libraries(
-  cudfjni ${CUDF_LINK} PRIVATE nvtx3-cpp $<TARGET_NAME_IF_EXISTS:nvcomp>
+  cudfjni ${CUDF_LINK} PRIVATE nvtx3::nvtx3-cpp $<TARGET_NAME_IF_EXISTS:nvcomp>
                                $<TARGET_NAME_IF_EXISTS:nvcomp::nvcomp>
 )
 

From 8b5ff188e79bb79ca0c2d581e94d3a91654a2d31 Mon Sep 17 00:00:00 2001
From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com>
Date: Thu, 23 May 2024 20:32:30 -0400
Subject: [PATCH 257/842] Remove problematic call of index setter to unblock
 dask-cuda CI (#15844)

Lighter weight alternative to https://github.com/rapidsai/cudf/pull/15843 to unblock dask-cuda's breakage.

Authors:
  - Charles Blackmon-Luca (https://github.com/charlesbluca)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15844
---
 python/cudf/cudf/core/indexed_frame.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index a166c256689..394904c5855 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -350,7 +350,8 @@ def _from_columns_like_self(
         frame = self.__class__._from_data(data)
 
         if index is not None:
-            frame.index = index
+            # TODO: triage why using the setter here breaks dask_cuda.ProxifyHostFile
+            frame._index = index
         return frame._copy_type_metadata(
             self,
             include_index=bool(index_names),

From 72aa271a6ad8cfdcd4373ceadd777b4800fd26c4 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 24 May 2024 06:24:37 -1000
Subject: [PATCH 258/842] Ensure cudf.Series(cudf.Series(...)) creates a
 reference to the same index (#15845)

Aligns these behaviors

```python
In [1]: import pandas as pd

In [3]: ser1 = pd.Series(range(3), index=list("Abc"))

In [4]: ser2 = pd.Series(ser1)

In [5]: ser1.index is ser2.index
Out[5]: True

In [6]: import cudf

In [7]: ser1 = cudf.Series(range(3), index=list("Abc"))

In [8]: ser2 = cudf.Series(ser1)

In [9]: ser1.index is ser2.index
Out[9]: False
```

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15845
---
 python/cudf/cudf/core/series.py       | 4 +++-
 python/cudf/cudf/tests/test_series.py | 6 ++++++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 41fbf269699..908347e389b 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -595,8 +595,10 @@ def __init__(
                 data = data.copy(deep=True)
             name_from_data = data.name
             column = as_column(data, nan_as_null=nan_as_null, dtype=dtype)
-            if isinstance(data, (pd.Series, Series)):
+            if isinstance(data, pd.Series):
                 index_from_data = as_index(data.index)
+            elif isinstance(data, Series):
+                index_from_data = data.index
         elif isinstance(data, ColumnAccessor):
             raise TypeError(
                 "Use cudf.Series._from_data for constructing a Series from "
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index 9aeae566730..323716d5fc3 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -2835,3 +2835,9 @@ def test_timedelta_series_init(data):
     actual = cudf.Series(scalar)
 
     assert_eq(expected, actual)
+
+
+def test_series_from_series_index_no_shallow_copy():
+    ser1 = cudf.Series(range(3), index=list("abc"))
+    ser2 = cudf.Series(ser1)
+    assert ser1.index is ser2.index

From 8a405674a5ba1554a0ced5d1f39f89fb424a768d Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Fri, 24 May 2024 11:24:39 -0500
Subject: [PATCH 259/842] Fix docs for IO readers and strings_convert (#15842)

Fixes documentation for IO readers and strings_convert.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15842
---
 docs/cudf/source/libcudf_docs/api_docs/io_readers.rst      | 2 +-
 docs/cudf/source/libcudf_docs/api_docs/strings_convert.rst | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/cudf/source/libcudf_docs/api_docs/io_readers.rst b/docs/cudf/source/libcudf_docs/api_docs/io_readers.rst
index a835673dee4..f94a5ddb403 100644
--- a/docs/cudf/source/libcudf_docs/api_docs/io_readers.rst
+++ b/docs/cudf/source/libcudf_docs/api_docs/io_readers.rst
@@ -2,4 +2,4 @@ Io Readers
 ==========
 
 .. doxygengroup:: io_readers
-   :desc-only:
+   :members:
diff --git a/docs/cudf/source/libcudf_docs/api_docs/strings_convert.rst b/docs/cudf/source/libcudf_docs/api_docs/strings_convert.rst
index ae5d78fb1a1..f2f320bd0e4 100644
--- a/docs/cudf/source/libcudf_docs/api_docs/strings_convert.rst
+++ b/docs/cudf/source/libcudf_docs/api_docs/strings_convert.rst
@@ -2,4 +2,4 @@ Strings Convert
 ===============
 
 .. doxygengroup:: strings_convert
-   :desc-only:
+   :members:

From 78a0314d809a24e26b86abecf8f935a4d4340550 Mon Sep 17 00:00:00 2001
From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com>
Date: Fri, 24 May 2024 12:40:28 -0400
Subject: [PATCH 260/842] Avoid unnecessary `Index` cast in
 `IndexedFrame.index` setter (#15843)

Triaging recent dask-cuda [breakage](https://github.com/rapidsai/dask-cuda/actions/runs/9202583065/attempts/1) led me to https://github.com/rapidsai/cudf/pull/15781, where it seems like the passing of an index object directly to the `IndexedFrame.index` setter (and therefore, wrapping of this index in an `Index()` constructor) has caused proxifying issues on dask-cuda's end.

cc @rjzamora @mroeschke

Authors:
  - Charles Blackmon-Luca (https://github.com/charlesbluca)
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/15843
---
 python/cudf/cudf/core/indexed_frame.py |  6 +++++-
 python/cudf/cudf/tests/test_index.py   | 14 ++++++++++++++
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 394904c5855..b4a689804c7 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -644,7 +644,11 @@ def index(self, value):
                 f"Length mismatch: Expected axis has {old_length} elements, "
                 f"new values have {len(value)} elements"
             )
-        self._index = Index(value)
+        # avoid unnecessary cast to Index
+        if not isinstance(value, BaseIndex):
+            value = Index(value)
+
+        self._index = value
 
     @_cudf_nvtx_annotate
     def replace(
diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py
index 8e7532d044d..b92ae1b3364 100644
--- a/python/cudf/cudf/tests/test_index.py
+++ b/python/cudf/cudf/tests/test_index.py
@@ -3266,3 +3266,17 @@ def test_index_datetime_repeat():
     actual = gidx.to_frame().repeat(5)
 
     assert_eq(actual.index, expected)
+
+
+@pytest.mark.parametrize(
+    "index",
+    [
+        cudf.Index([1]),
+        cudf.RangeIndex(1),
+        cudf.MultiIndex(levels=[[0]], codes=[[0]]),
+    ],
+)
+def test_index_assignment_no_shallow_copy(index):
+    df = cudf.DataFrame(range(1))
+    df.index = index
+    assert df.index is index

From 4a3315b55a89b2c92908eac8a6fd255a33843ba9 Mon Sep 17 00:00:00 2001
From: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com>
Date: Fri, 24 May 2024 13:46:27 -0500
Subject: [PATCH 261/842] Remove benchmark-specific use of pinned-pooled memory
 in Parquet multithreaded benchmark. (#15838)

The benchmark was manually creating and using a pinned-pool rmm allocator which is now redundant, since cuIO itself does this by default.  This PR removes it.

Authors:
  - https://github.com/nvdbaranec
  - Nghia Truong (https://github.com/ttnghia)
  - Muhammad Haseeb (https://github.com/mhaseeb123)

Approvers:
  - Paul Mattione (https://github.com/pmattione-nvidia)
  - Nghia Truong (https://github.com/ttnghia)
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: https://github.com/rapidsai/cudf/pull/15838
---
 .../io/parquet/parquet_reader_multithread.cpp   | 17 -----------------
 1 file changed, 17 deletions(-)

diff --git a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp
index fbdcfb0ade9..bd80c4e0e88 100644
--- a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp
+++ b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp
@@ -25,25 +25,12 @@
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/thread_pool.hpp>
 
-#include <rmm/mr/device/pool_memory_resource.hpp>
-#include <rmm/mr/pinned_host_memory_resource.hpp>
-#include <rmm/resource_ref.hpp>
-
 #include <nvtx3/nvtx3.hpp>
 
 #include <nvbench/nvbench.cuh>
 
 #include <vector>
 
-// TODO: remove this once pinned/pooled is enabled by default in cuIO
-void set_cuio_host_pinned_pool()
-{
-  using host_pooled_mr = rmm::mr::pool_memory_resource<rmm::mr::pinned_host_memory_resource>;
-  static std::shared_ptr<host_pooled_mr> mr = std::make_shared<host_pooled_mr>(
-    std::make_shared<rmm::mr::pinned_host_memory_resource>().get(), 256ul * 1024 * 1024);
-  cudf::io::set_host_memory_resource(*mr);
-}
-
 size_t get_num_reads(nvbench::state const& state) { return state.get_int64("num_threads"); }
 
 size_t get_read_size(nvbench::state const& state)
@@ -105,8 +92,6 @@ void BM_parquet_multithreaded_read_common(nvbench::state& state,
   size_t const data_size = state.get_int64("total_data_size");
   auto const num_threads = state.get_int64("num_threads");
 
-  set_cuio_host_pinned_pool();
-
   auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads);
   cudf::detail::thread_pool threads(num_threads);
 
@@ -186,8 +171,6 @@ void BM_parquet_multithreaded_read_chunked_common(nvbench::state& state,
   size_t const input_limit  = state.get_int64("input_limit");
   size_t const output_limit = state.get_int64("output_limit");
 
-  set_cuio_host_pinned_pool();
-
   auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads);
   cudf::detail::thread_pool threads(num_threads);
   auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types);

From 81cadb60b9cb8840e1700ecc223f651c97618e34 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 24 May 2024 10:20:21 -1000
Subject: [PATCH 262/842] Use ColumnAccessor row and column length attributes
 more consistently (#15857)

Also ensures any calls to `_num_rows` uses the cached version

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15857
---
 python/cudf/cudf/core/dataframe.py     | 29 +++++++++++++-------------
 python/cudf/cudf/core/frame.py         |  2 +-
 python/cudf/cudf/core/indexed_frame.py |  8 ++++---
 python/cudf/cudf/core/multiindex.py    |  2 +-
 4 files changed, 21 insertions(+), 20 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 1f530aa3108..acfc2d781a7 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -1429,7 +1429,7 @@ def __setitem__(self, arg, value):
                 else:
                     # disc. with pandas here
                     # pandas raises key error here
-                    self.insert(len(self._data), arg, value)
+                    self.insert(self._num_columns, arg, value)
 
         elif can_convert_to_column(arg):
             mask = arg
@@ -1846,7 +1846,7 @@ def _clean_renderable_dataframe(self, output):
         if lines[-1].startswith("["):
             lines = lines[:-1]
             lines.append(
-                "[%d rows x %d columns]" % (len(self), len(self._data.names))
+                "[%d rows x %d columns]" % (len(self), self._num_columns)
             )
         return "\n".join(lines)
 
@@ -1901,7 +1901,7 @@ def _get_renderable_dataframe(self):
             else pd.options.display.width / 2
         )
 
-        if len(self) <= nrows and len(self._data.names) <= ncols:
+        if len(self) <= nrows and self._num_columns <= ncols:
             output = self.copy(deep=False)
         elif self.empty and len(self.index) > 0:
             max_seq_items = pd.options.display.max_seq_items
@@ -1922,15 +1922,15 @@ def _get_renderable_dataframe(self):
             else:
                 output = self.copy(deep=False)
         else:
-            left_cols = len(self._data.names)
+            left_cols = self._num_columns
             right_cols = 0
             upper_rows = len(self)
             lower_rows = 0
             if len(self) > nrows and nrows > 0:
                 upper_rows = int(nrows / 2.0) + 1
                 lower_rows = upper_rows + (nrows % 2)
-            if len(self._data.names) > ncols:
-                right_cols = len(self._data.names) - int(ncols / 2.0)
+            if left_cols > ncols:
+                right_cols = left_cols - int(ncols / 2.0)
                 # adjust right columns for output if multiindex.
                 right_cols = (
                     right_cols - 1
@@ -1945,11 +1945,11 @@ def _get_renderable_dataframe(self):
             else:
                 # If right_cols is 0 or negative, it means
                 # self has lesser number of columns than ncols.
-                # Hence assign len(self._data.names) which
+                # Hence assign self._num_columns which
                 # will result in empty `*_right` quadrants.
                 # This is because `*_left` quadrants will
                 # contain all columns.
-                right_cols = len(self._data.names)
+                right_cols = self._num_columns
 
             upper_left = self.head(upper_rows).iloc[:, :left_cols]
             upper_right = self.head(upper_rows).iloc[:, right_cols:]
@@ -1983,8 +1983,7 @@ def _repr_html_(self):
         if lines[-2].startswith("<p>"):
             lines = lines[:-2]
             lines.append(
-                "<p>%d rows × %d columns</p>"
-                % (len(self), len(self._data.names))
+                "<p>%d rows × %d columns</p>" % (len(self), self._num_columns)
             )
             lines.append("</div>")
         return "\n".join(lines)
@@ -2660,9 +2659,9 @@ def columns(self, columns):
             level_names = (pd_columns.name,)
             label_dtype = pd_columns.dtype
 
-        if len(pd_columns) != len(self._data.names):
+        if len(pd_columns) != self._num_columns:
             raise ValueError(
-                f"Length mismatch: expected {len(self._data.names)} elements, "
+                f"Length mismatch: expected {self._num_columns} elements, "
                 f"got {len(pd_columns)} elements"
             )
 
@@ -2683,7 +2682,7 @@ def _set_columns_like(self, other: ColumnAccessor) -> None:
         * The possible .columns.dtype
         * The .columns.names/name (depending on if it's a MultiIndex)
         """
-        if len(self._data.names) != len(other.names):
+        if self._num_columns != len(other.names):
             raise ValueError(
                 f"Length mismatch: expected {len(other)} elements, "
                 f"got {len(self)} elements"
@@ -3207,7 +3206,7 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True):
         if name in self._data:
             raise NameError(f"duplicated column name {name}")
 
-        num_cols = len(self._data)
+        num_cols = self._num_columns
         if loc < 0:
             loc += num_cols + 1
 
@@ -5032,7 +5031,7 @@ def info(
         )
         lines.append(index_summary)
 
-        if len(self._data) == 0:
+        if self._num_columns == 0:
             lines.append(f"Empty {type(self).__name__}")
             cudf.utils.ioutils.buffer_write_lines(buf, lines)
             return
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 92ca76d6ceb..7b561906afb 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -76,7 +76,7 @@ def _num_columns(self) -> int:
 
     @property
     def _num_rows(self) -> int:
-        return 0 if self._num_columns == 0 else len(self._data.columns[0])
+        return self._data.nrows
 
     @property
     def _column_names(self) -> Tuple[Any, ...]:  # TODO: Tuple[str]?
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index b4a689804c7..a31430e1571 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -289,6 +289,7 @@ def __init__(self, data=None, index=None):
     @property
     def _num_rows(self) -> int:
         # Important to use the index because the data may be empty.
+        # TODO: Remove once DataFrame.__init__ is cleaned up
         return len(self.index)
 
     @property
@@ -448,6 +449,7 @@ def _scan(self, op, axis=None, skipna=True):
     def _check_data_index_length_match(self) -> None:
         # Validate that the number of rows in the data matches the index if the
         # data is not empty. This is a helper for the constructor.
+        # TODO: Use self._num_rows once DataFrame.__init__ is cleaned up
         if self._data.nrows > 0 and self._data.nrows != len(self.index):
             raise ValueError(
                 f"Length of values ({self._data.nrows}) does not "
@@ -639,7 +641,7 @@ def index(self, value):
         new_length = len(value)
 
         # A DataFrame with 0 columns can have an index of arbitrary length.
-        if len(self._data) > 0 and new_length != old_length:
+        if self._num_columns > 0 and new_length != old_length:
             raise ValueError(
                 f"Length mismatch: Expected axis has {old_length} elements, "
                 f"new values have {len(value)} elements"
@@ -1129,7 +1131,7 @@ def dot(self, other, reflect=False):
             common = self._data.to_pandas_index().union(
                 other.index.to_pandas()
             )
-            if len(common) > len(self._data.names) or len(common) > len(
+            if len(common) > self._num_columns or len(common) > len(
                 other.index
             ):
                 raise ValueError("matrices are not aligned")
@@ -2757,7 +2759,7 @@ def sort_index(
             out = self[labels]
             if ignore_index:
                 out._data.rangeindex = True
-                out._data.names = list(range(len(self._data.names)))
+                out._data.names = list(range(self._num_columns))
 
         return self._mimic_inplace(out, inplace=inplace)
 
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index c149a1028a0..049fac45ba8 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -527,7 +527,7 @@ def get_slice_bound(self, label, side, kind=None):
     @_cudf_nvtx_annotate
     def nlevels(self):
         """Integer number of levels in this MultiIndex."""
-        return len(self._data)
+        return self._num_columns
 
     @property  # type: ignore
     @_cudf_nvtx_annotate

From d756c37ef3a9625862df849e03b503d990dc411b Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Fri, 24 May 2024 15:35:31 -0500
Subject: [PATCH 263/842] Implement `on_bad_lines` in json reader (#15834)

Fixes: #15559

This PR implements `on_bad_lines` in json reader. When `on_bad_lines="recover"`, bad lines are replaced by `<NA>` values.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/15834
---
 python/cudf/cudf/_lib/json.pyx                | 15 ++++++++-
 .../cudf/_lib/pylibcudf/libcudf/io/json.pxd   |  7 +++++
 python/cudf/cudf/io/json.py                   | 18 ++++++-----
 python/cudf/cudf/tests/test_json.py           | 31 +++++++++++++++++++
 python/cudf/cudf/utils/ioutils.py             |  5 +++
 5 files changed, 67 insertions(+), 9 deletions(-)

diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx
index 242727163ee..a8fef907bad 100644
--- a/python/cudf/cudf/_lib/json.pyx
+++ b/python/cudf/cudf/_lib/json.pyx
@@ -24,6 +24,7 @@ from cudf._lib.io.utils cimport (
 from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink
 from cudf._lib.pylibcudf.libcudf.io.json cimport (
     json_reader_options,
+    json_recovery_mode_t,
     json_writer_options,
     read_json as libcudf_read_json,
     schema_element,
@@ -42,6 +43,15 @@ from cudf._lib.types cimport dtype_to_data_type
 from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table
 
 
+cdef json_recovery_mode_t _get_json_recovery_mode(object on_bad_lines):
+    if on_bad_lines.lower() == "error":
+        return json_recovery_mode_t.FAIL
+    elif on_bad_lines.lower() == "recover":
+        return json_recovery_mode_t.RECOVER_WITH_NULL
+    else:
+        raise TypeError(f"Invalid parameter for {on_bad_lines=}")
+
+
 cpdef read_json(object filepaths_or_buffers,
                 object dtype,
                 bool lines,
@@ -49,7 +59,8 @@ cpdef read_json(object filepaths_or_buffers,
                 object byte_range,
                 bool keep_quotes,
                 bool mixed_types_as_string,
-                bool prune_columns):
+                bool prune_columns,
+                object on_bad_lines):
     """
     Cython function to call into libcudf API, see `read_json`.
 
@@ -118,6 +129,7 @@ cpdef read_json(object filepaths_or_buffers,
         .lines(c_lines)
         .byte_range_offset(c_range_offset)
         .byte_range_size(c_range_size)
+        .recovery_mode(_get_json_recovery_mode(on_bad_lines))
         .build()
     )
     if is_list_like_dtypes:
@@ -128,6 +140,7 @@ cpdef read_json(object filepaths_or_buffers,
     opts.enable_keep_quotes(keep_quotes)
     opts.enable_mixed_types_as_string(mixed_types_as_string)
     opts.enable_prune_columns(prune_columns)
+
     # Read JSON
     cdef cudf_io_types.table_with_metadata c_result
 
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd
index 10e43467d57..2e50cccd132 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd
@@ -19,6 +19,10 @@ cdef extern from "cudf/io/json.hpp" \
         data_type type
         map[string, schema_element] child_types
 
+    cdef enum json_recovery_mode_t:
+        FAIL "cudf::io::json_recovery_mode_t::FAIL"
+        RECOVER_WITH_NULL "cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL"
+
     cdef cppclass json_reader_options:
         json_reader_options() except +
         cudf_io_types.source_info get_source() except +
@@ -90,6 +94,9 @@ cdef extern from "cudf/io/json.hpp" \
         json_reader_options_builder& keep_quotes(
             bool val
         ) except +
+        json_reader_options_builder& recovery_mode(
+            json_recovery_mode_t val
+        ) except +
 
         json_reader_options build() except +
 
diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py
index 7de9705e4cb..dd4a0d9eb07 100644
--- a/python/cudf/cudf/io/json.py
+++ b/python/cudf/cudf/io/json.py
@@ -27,6 +27,7 @@ def read_json(
     storage_options=None,
     mixed_types_as_string=False,
     prune_columns=False,
+    on_bad_lines="error",
     *args,
     **kwargs,
 ):
@@ -94,14 +95,15 @@ def read_json(
                 filepaths_or_buffers.append(tmp_source)
 
         df = libjson.read_json(
-            filepaths_or_buffers,
-            dtype,
-            lines,
-            compression,
-            byte_range,
-            keep_quotes,
-            mixed_types_as_string,
-            prune_columns,
+            filepaths_or_buffers=filepaths_or_buffers,
+            dtype=dtype,
+            lines=lines,
+            compression=compression,
+            byte_range=byte_range,
+            keep_quotes=keep_quotes,
+            mixed_types_as_string=mixed_types_as_string,
+            prune_columns=prune_columns,
+            on_bad_lines=on_bad_lines,
         )
     else:
         warnings.warn(
diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py
index 51287fe26a0..ba6a8f94719 100644
--- a/python/cudf/cudf/tests/test_json.py
+++ b/python/cudf/cudf/tests/test_json.py
@@ -1392,3 +1392,34 @@ def test_json_nested_mixed_types_error(jsonl_string):
             orient="records",
             lines=True,
         )
+
+
+@pytest.mark.parametrize("on_bad_lines", ["error", "recover", "abc"])
+def test_json_reader_on_bad_lines(on_bad_lines):
+    json_input = StringIO(
+        '{"a":1,"b":10}\n{"a":2,"b":11}\nabc\n{"a":3,"b":12}\n'
+    )
+    if on_bad_lines == "error":
+        with pytest.raises(RuntimeError):
+            cudf.read_json(
+                json_input,
+                lines=True,
+                orient="records",
+                on_bad_lines=on_bad_lines,
+            )
+    elif on_bad_lines == "recover":
+        actual = cudf.read_json(
+            json_input, lines=True, orient="records", on_bad_lines=on_bad_lines
+        )
+        expected = cudf.DataFrame(
+            {"a": [1, 2, None, 3], "b": [10, 11, None, 12]}
+        )
+        assert_eq(actual, expected)
+    else:
+        with pytest.raises(TypeError):
+            cudf.read_json(
+                json_input,
+                lines=True,
+                orient="records",
+                on_bad_lines=on_bad_lines,
+            )
diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
index 1366a0b8e84..0209c692935 100644
--- a/python/cudf/cudf/utils/ioutils.py
+++ b/python/cudf/cudf/utils/ioutils.py
@@ -739,6 +739,11 @@
 
     If True, only return those columns mentioned in the dtype argument.
     If `False` dtype argument is used a type inference suggestion.
+on_bad_lines : {'error', 'recover'}, default 'error'
+    Specifies what to do upon encountering a bad line. Allowed values are :
+
+    - ``'error'``, raise an Exception when a bad line is encountered.
+    - ``'recover'``, fills the row with <NA> when a bad line is encountered.
 Returns
 -------
 result : Series or DataFrame, depending on the value of `typ`.

From 8458306ecbc17d3977a98e2e33752b678394f588 Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Fri, 24 May 2024 15:04:08 -0700
Subject: [PATCH 264/842] Migrate reshape.pxd to pylibcudf (#15827)

xref #15162

Authors:
  - Thomas Li (https://github.com/lithomas1)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15827
---
 .../user_guide/api_docs/pylibcudf/index.rst   |  1 +
 .../user_guide/api_docs/pylibcudf/reshape.rst |  6 ++
 .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt   |  1 +
 python/cudf/cudf/_lib/pylibcudf/__init__.pxd  |  1 +
 python/cudf/cudf/_lib/pylibcudf/__init__.py   |  1 +
 python/cudf/cudf/_lib/pylibcudf/reshape.pxd   | 11 ++++
 python/cudf/cudf/_lib/pylibcudf/reshape.pyx   | 65 +++++++++++++++++++
 python/cudf/cudf/_lib/reshape.pyx             | 42 +++++-------
 .../cudf/cudf/pylibcudf_tests/test_reshape.py | 43 ++++++++++++
 9 files changed, 147 insertions(+), 24 deletions(-)
 create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/reshape.rst
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/reshape.pxd
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/reshape.pyx
 create mode 100644 python/cudf/cudf/pylibcudf_tests/test_reshape.py

diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
index 8cad95f61ae..1c1b37e2c37 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
@@ -20,6 +20,7 @@ This page provides API documentation for pylibcudf.
     lists
     merge
     reduce
+    reshape
     rolling
     scalar
     search
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/reshape.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/reshape.rst
new file mode 100644
index 00000000000..964cef04923
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/reshape.rst
@@ -0,0 +1,6 @@
+=======
+reshape
+=======
+
+.. automodule:: cudf._lib.pylibcudf.reshape
+   :members:
diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
index efc978fc6d0..7d01671e84f 100644
--- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
@@ -27,6 +27,7 @@ set(cython_sources
     merge.pyx
     reduce.pyx
     replace.pyx
+    reshape.pyx
     rolling.pyx
     scalar.pyx
     search.pyx
diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd
index 5adefa5fd93..91c3fdf5602 100644
--- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd
@@ -13,6 +13,7 @@ from . cimport (
     merge,
     reduce,
     replace,
+    reshape,
     rolling,
     search,
     sorting,
diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py
index 89f874f5fa5..fcdc4992f00 100644
--- a/python/cudf/cudf/_lib/pylibcudf/__init__.py
+++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py
@@ -13,6 +13,7 @@
     merge,
     reduce,
     replace,
+    reshape,
     rolling,
     search,
     sorting,
diff --git a/python/cudf/cudf/_lib/pylibcudf/reshape.pxd b/python/cudf/cudf/_lib/pylibcudf/reshape.pxd
new file mode 100644
index 00000000000..a7cc45d7a08
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/reshape.pxd
@@ -0,0 +1,11 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
+
+from .column cimport Column
+from .scalar cimport Scalar
+from .table cimport Table
+
+
+cpdef Column interleave_columns(Table source_table)
+cpdef Table tile(Table source_table, size_type count)
diff --git a/python/cudf/cudf/_lib/pylibcudf/reshape.pyx b/python/cudf/cudf/_lib/pylibcudf/reshape.pyx
new file mode 100644
index 00000000000..b68eba48cd6
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/reshape.pyx
@@ -0,0 +1,65 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.reshape cimport (
+    interleave_columns as cpp_interleave_columns,
+    tile as cpp_tile,
+)
+from cudf._lib.pylibcudf.libcudf.table.table cimport table
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
+
+from .column cimport Column
+from .table cimport Table
+
+
+cpdef Column interleave_columns(Table source_table):
+    """Interleave columns of a table into a single column.
+
+    Converts the column major table `input` into a row major column.
+
+    Example:
+    in     = [[A1, A2, A3], [B1, B2, B3]]
+    return = [A1, B1, A2, B2, A3, B3]
+
+    Parameters
+    ----------
+    source_table: Table
+        The input table to interleave
+
+    Returns
+    -------
+    Column
+        A new column which is the result of interleaving the input columns
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(cpp_interleave_columns(source_table.view()))
+
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Table tile(Table source_table, size_type count):
+    """Repeats the rows from input table count times to form a new table.
+
+    Parameters
+    ----------
+    source_table: Table
+        The input table containing rows to be repeated
+    count: size_type
+        The number of times to tile "rows". Must be non-negative
+
+    Returns
+    -------
+    Table
+        The table containing the tiled "rows"
+    """
+    cdef unique_ptr[table] c_result
+
+    with nogil:
+        c_result = move(cpp_tile(source_table.view(), count))
+
+    return Table.from_libcudf(move(c_result))
diff --git a/python/cudf/cudf/_lib/reshape.pyx b/python/cudf/cudf/_lib/reshape.pyx
index 48e386bcf02..6bba8f0df35 100644
--- a/python/cudf/cudf/_lib/reshape.pyx
+++ b/python/cudf/cudf/_lib/reshape.pyx
@@ -2,39 +2,33 @@
 
 from cudf.core.buffer import acquire_spill_lock
 
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
-
 from cudf._lib.column cimport Column
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.reshape cimport (
-    interleave_columns as cpp_interleave_columns,
-    tile as cpp_tile,
-)
-from cudf._lib.pylibcudf.libcudf.table.table cimport table
-from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
 from cudf._lib.pylibcudf.libcudf.types cimport size_type
-from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns
+from cudf._lib.utils cimport columns_from_pylibcudf_table
+
+import cudf._lib.pylibcudf as plc
 
 
 @acquire_spill_lock()
 def interleave_columns(list source_columns):
-    cdef table_view c_view = table_view_from_columns(source_columns)
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(cpp_interleave_columns(c_view))
-
-    return Column.from_unique_ptr(move(c_result))
+    return Column.from_pylibcudf(
+        plc.reshape.interleave_columns(
+            plc.Table([
+                c.to_pylibcudf(mode="read") for c in source_columns
+            ])
+        )
+    )
 
 
 @acquire_spill_lock()
 def tile(list source_columns, size_type count):
     cdef size_type c_count = count
-    cdef table_view c_view = table_view_from_columns(source_columns)
-    cdef unique_ptr[table] c_result
-
-    with nogil:
-        c_result = move(cpp_tile(c_view, c_count))
 
-    return columns_from_unique_ptr(move(c_result))
+    return columns_from_pylibcudf_table(
+        plc.reshape.tile(
+            plc.Table([
+                c.to_pylibcudf(mode="read") for c in source_columns
+            ]),
+            c_count
+        )
+    )
diff --git a/python/cudf/cudf/pylibcudf_tests/test_reshape.py b/python/cudf/cudf/pylibcudf_tests/test_reshape.py
new file mode 100644
index 00000000000..b8b914f3f09
--- /dev/null
+++ b/python/cudf/cudf/pylibcudf_tests/test_reshape.py
@@ -0,0 +1,43 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pytest
+from utils import assert_column_eq, assert_table_eq
+
+from cudf._lib import pylibcudf as plc
+
+
+@pytest.fixture(scope="module")
+def reshape_data():
+    data = [[1, 2, 3], [4, 5, 6]]
+    return data
+
+
+@pytest.fixture(scope="module")
+def reshape_plc_tbl(reshape_data):
+    arrow_tbl = pa.Table.from_arrays(reshape_data, names=["a", "b"])
+    plc_tbl = plc.interop.from_arrow(arrow_tbl)
+    return plc_tbl
+
+
+def test_interleave_columns(reshape_data, reshape_plc_tbl):
+    res = plc.reshape.interleave_columns(reshape_plc_tbl)
+
+    interleaved_data = [pa.array(pair) for pair in zip(*reshape_data)]
+
+    expect = pa.concat_arrays(interleaved_data)
+
+    assert_column_eq(res, expect)
+
+
+@pytest.mark.parametrize("cnt", [0, 1, 3])
+def test_tile(reshape_data, reshape_plc_tbl, cnt):
+    res = plc.reshape.tile(reshape_plc_tbl, cnt)
+
+    tiled_data = [pa.array(col * cnt) for col in reshape_data]
+
+    expect = pa.Table.from_arrays(
+        tiled_data, schema=plc.interop.to_arrow(reshape_plc_tbl).schema
+    )
+
+    assert_table_eq(res, expect)

From 29429f7e4c871758c0de930026347e6e3b0a5a9a Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Tue, 28 May 2024 05:47:58 -0700
Subject: [PATCH 265/842] Work around issues with cccl main (#15552)

This gets cuDF build cccl main on 12.3.

There is one issue with the cuco tuple helpers but that will be fixed on
the cuco side

---------

Co-authored-by: Bernhard Manfred Gruber <bgruber@nvidia.com>
Co-authored-by: Bradley Dice <bdice@bradleydice.com>
Co-authored-by: Nghia Truong <7416935+ttnghia@users.noreply.github.com>
Co-authored-by: ptaylor <paul.e.taylor@me.com>
Co-authored-by: Paul Taylor <178183+trxcllnt@users.noreply.github.com>
Co-authored-by: Yunsong Wang <yunsongw@nvidia.com>
---
 .pre-commit-config.yaml                       |  2 +
 .../thirdparty/patches/cccl_override.json     | 20 ++++++++
 .../patches/revert_pr_211_cccl_2.5.0.diff     | 47 +++++++++++++++++++
 ..._disable_64bit_dispatching_cccl_2.5.0.diff | 25 ++++++++++
 ..._faster_scan_compile_times_cccl_2.5.0.diff | 39 +++++++++++++++
 ..._faster_sort_compile_times_cccl_2.5.0.diff | 39 +++++++++++++++
 cpp/src/io/comp/statistics.cu                 |  9 ++--
 cpp/src/io/orc/reader_impl_decode.cu          |  3 +-
 cpp/src/io/orc/stripe_init.cu                 | 22 +++++----
 cpp/src/io/parquet/page_string_decode.cu      | 13 +++--
 cpp/src/io/parquet/reader_impl_preprocess.cu  |  4 +-
 cpp/src/io/utilities/data_casting.cu          |  6 ++-
 cpp/src/join/distinct_hash_join.cu            |  2 +-
 cpp/src/strings/split/split_re.cu             |  4 +-
 cpp/tests/hash_map/map_test.cu                |  1 -
 15 files changed, 209 insertions(+), 27 deletions(-)
 create mode 100644 cpp/cmake/thirdparty/patches/revert_pr_211_cccl_2.5.0.diff
 create mode 100644 cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching_cccl_2.5.0.diff
 create mode 100644 cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times_cccl_2.5.0.diff
 create mode 100644 cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times_cccl_2.5.0.diff

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 5a8d9f54673..2d3ffc287e9 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -7,11 +7,13 @@ repos:
       - id: trailing-whitespace
         exclude: |
           (?x)^(
+            ^cpp/cmake/thirdparty/patches/.*|
             ^python/cudf/cudf/tests/data/subword_tokenizer_data/.*
           )
       - id: end-of-file-fixer
         exclude: |
           (?x)^(
+            ^cpp/cmake/thirdparty/patches/.*|
             ^python/cudf/cudf/tests/data/subword_tokenizer_data/.*
           )
   - repo: https://github.com/PyCQA/isort
diff --git a/cpp/cmake/thirdparty/patches/cccl_override.json b/cpp/cmake/thirdparty/patches/cccl_override.json
index b33f17f3e4a..059f713e7a5 100644
--- a/cpp/cmake/thirdparty/patches/cccl_override.json
+++ b/cpp/cmake/thirdparty/patches/cccl_override.json
@@ -18,6 +18,11 @@
           "issue" : "thrust::copy introduced a change in behavior that causes failures with cudaErrorInvalidValue.",
           "fixed_in" : ""
         },
+        {
+          "file" : "${current_json_dir}/revert_pr_211_cccl_2.5.0.diff",
+          "issue" : "thrust::copy introduced a change in behavior that causes failures with cudaErrorInvalidValue.",
+          "fixed_in" : ""
+        },
         {
           "file": "cccl/kernel_pointer_hiding.diff",
           "issue": "Hide APIs that accept kernel pointers [https://github.com/NVIDIA/cccl/pull/1395]",
@@ -28,15 +33,30 @@
           "issue" : "Remove 64bit dispatching as not needed by libcudf and results in compiling twice as many kernels [https://github.com/rapidsai/cudf/pull/11437]",
           "fixed_in" : ""
         },
+        {
+          "file" : "${current_json_dir}/thrust_disable_64bit_dispatching_cccl_2.5.0.diff",
+          "issue" : "Remove 64bit dispatching as not needed by libcudf and results in compiling twice as many kernels [https://github.com/rapidsai/cudf/pull/11437]",
+          "fixed_in" : ""
+        },
         {
           "file" : "${current_json_dir}/thrust_faster_sort_compile_times.diff",
           "issue" : "Improve Thrust sort compile times by not unrolling loops for inlined comparators [https://github.com/rapidsai/cudf/pull/10577]",
           "fixed_in" : ""
         },
+        {
+          "file" : "${current_json_dir}/thrust_faster_sort_compile_times_cccl_2.5.0.diff",
+          "issue" : "Improve Thrust sort compile times by not unrolling loops for inlined comparators [https://github.com/rapidsai/cudf/pull/10577]",
+          "fixed_in" : ""
+        },
         {
           "file" : "${current_json_dir}/thrust_faster_scan_compile_times.diff",
           "issue" : "Improve Thrust scan compile times by reducing the number of kernels generated [https://github.com/rapidsai/cudf/pull/8183]",
           "fixed_in" : ""
+        },
+        {
+          "file" : "${current_json_dir}/thrust_faster_scan_compile_times_cccl_2.5.0.diff",
+          "issue" : "Improve Thrust scan compile times by reducing the number of kernels generated [https://github.com/rapidsai/cudf/pull/8183]",
+          "fixed_in" : ""
         }
       ]
     }
diff --git a/cpp/cmake/thirdparty/patches/revert_pr_211_cccl_2.5.0.diff b/cpp/cmake/thirdparty/patches/revert_pr_211_cccl_2.5.0.diff
new file mode 100644
index 00000000000..27ff16744f5
--- /dev/null
+++ b/cpp/cmake/thirdparty/patches/revert_pr_211_cccl_2.5.0.diff
@@ -0,0 +1,47 @@
+diff --git a/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h b/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h
+index 046eb83c0..8047c9701 100644
+--- a/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h
++++ b/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h
+@@ -53,41 +53,15 @@ namespace cuda_cub
+ 
+ namespace __copy
+ {
+-template <class Derived, class InputIt, class OutputIt>
+-OutputIt THRUST_RUNTIME_FUNCTION device_to_device(
+-  execution_policy<Derived>& policy, InputIt first, InputIt last, OutputIt result, thrust::detail::true_type)
+-{
+-  typedef typename thrust::iterator_traits<InputIt>::value_type InputTy;
+-  const auto n = thrust::distance(first, last);
+-  if (n > 0)
+-  {
+-    cudaError status;
+-    status = trivial_copy_device_to_device(
+-      policy,
+-      reinterpret_cast<InputTy*>(thrust::raw_pointer_cast(&*result)),
+-      reinterpret_cast<InputTy const*>(thrust::raw_pointer_cast(&*first)),
+-      n);
+-    cuda_cub::throw_on_error(status, "__copy:: D->D: failed");
+-  }
+-
+-  return result + n;
+-}
+ 
+ template <class Derived, class InputIt, class OutputIt>
+ OutputIt THRUST_RUNTIME_FUNCTION device_to_device(
+-  execution_policy<Derived>& policy, InputIt first, InputIt last, OutputIt result, thrust::detail::false_type)
++  execution_policy<Derived>& policy, InputIt first, InputIt last, OutputIt result)
+ {
+   typedef typename thrust::iterator_traits<InputIt>::value_type InputTy;
+   return cuda_cub::transform(policy, first, last, result, thrust::identity<InputTy>());
+ }
+ 
+-template <class Derived, class InputIt, class OutputIt>
+-OutputIt THRUST_RUNTIME_FUNCTION
+-device_to_device(execution_policy<Derived>& policy, InputIt first, InputIt last, OutputIt result)
+-{
+-  return device_to_device(
+-    policy, first, last, result, typename is_indirectly_trivially_relocatable_to<InputIt, OutputIt>::type());
+-}
+ } // namespace __copy
+ 
+ } // namespace cuda_cub
diff --git a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching_cccl_2.5.0.diff b/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching_cccl_2.5.0.diff
new file mode 100644
index 00000000000..6ae1e1c917b
--- /dev/null
+++ b/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching_cccl_2.5.0.diff
@@ -0,0 +1,25 @@
+diff --git a/thrust/thrust/system/cuda/detail/dispatch.h b/thrust/thrust/system/cuda/detail/dispatch.h
+index 2a3cc4e33..8fb337b26 100644
+--- a/thrust/thrust/system/cuda/detail/dispatch.h
++++ b/thrust/thrust/system/cuda/detail/dispatch.h
+@@ -44,8 +44,7 @@
+   }                                                                                   \
+   else                                                                                \
+   {                                                                                   \
+-    auto THRUST_PP_CAT2(count, _fixed) = static_cast<thrust::detail::int64_t>(count); \
+-    status                             = call arguments;                              \
++    throw std::runtime_error("THRUST_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \
+   }
+ 
+ /**
+@@ -66,9 +65,7 @@
+   }                                                                                          \
+   else                                                                                       \
+   {                                                                                          \
+-    auto THRUST_PP_CAT2(count1, _fixed) = static_cast<thrust::detail::int64_t>(count1);      \
+-    auto THRUST_PP_CAT2(count2, _fixed) = static_cast<thrust::detail::int64_t>(count2);      \
+-    status                              = call arguments;                                    \
++    throw std::runtime_error("THRUST_DOUBLE_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \
+   }
+ /**
+  * Dispatch between 32-bit and 64-bit index based versions of the same algorithm
diff --git a/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times_cccl_2.5.0.diff b/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times_cccl_2.5.0.diff
new file mode 100644
index 00000000000..fee46046194
--- /dev/null
+++ b/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times_cccl_2.5.0.diff
@@ -0,0 +1,39 @@
+diff --git a/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_radix_sort.cuh
+index 0606485bb..dbb99ff13 100644
+--- a/cub/cub/device/dispatch/dispatch_radix_sort.cuh
++++ b/cub/cub/device/dispatch/dispatch_radix_sort.cuh
+@@ -1085,7 +1085,7 @@ struct DeviceRadixSortPolicy
+   };
+ 
+   /// SM60 (GP100)
+-  struct Policy600 : ChainedPolicy<600, Policy600, Policy500>
++  struct Policy600 : ChainedPolicy<600, Policy600, Policy600>
+   {
+     enum
+     {
+diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh
+index f39613adb..75bd16ff9 100644
+--- a/cub/cub/device/dispatch/dispatch_reduce.cuh
++++ b/cub/cub/device/dispatch/dispatch_reduce.cuh
+@@ -488,7 +488,7 @@ struct DeviceReducePolicy
+   };
+ 
+   /// SM60
+-  struct Policy600 : ChainedPolicy<600, Policy600, Policy350>
++  struct Policy600 : ChainedPolicy<600, Policy600, Policy600>
+   {
+     static constexpr int threads_per_block  = 256;
+     static constexpr int items_per_thread   = 16;
+diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh
+index 419908c4e..6ab0840e1 100644
+--- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh
++++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh
+@@ -339,7 +339,7 @@ struct DeviceScanPolicy
+   /// SM600
+   struct Policy600
+       : DefaultTuning
+-      , ChainedPolicy<600, Policy600, Policy520>
++      , ChainedPolicy<600, Policy600, Policy600>
+   {};
+ 
+   /// SM800
diff --git a/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times_cccl_2.5.0.diff b/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times_cccl_2.5.0.diff
new file mode 100644
index 00000000000..cb0cc55f4d2
--- /dev/null
+++ b/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times_cccl_2.5.0.diff
@@ -0,0 +1,39 @@
+diff --git a/cub/cub/block/block_merge_sort.cuh b/cub/cub/block/block_merge_sort.cuh
+index eb76ebb0b..c6c529a50 100644
+--- a/cub/cub/block/block_merge_sort.cuh
++++ b/cub/cub/block/block_merge_sort.cuh
+@@ -95,7 +95,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void SerialMerge(
+   KeyT key1 = keys_shared[keys1_beg];
+   KeyT key2 = keys_shared[keys2_beg];
+ 
+-#pragma unroll
++#pragma unroll 1
+   for (int item = 0; item < ITEMS_PER_THREAD; ++item)
+   {
+     bool p = (keys2_beg < keys2_end) && ((keys1_beg >= keys1_end) || compare_op(key2, key1));
+@@ -376,7 +376,7 @@ public:
+       //
+       KeyT max_key = oob_default;
+ 
+-#pragma unroll
++#pragma unroll 1
+       for (int item = 1; item < ITEMS_PER_THREAD; ++item)
+       {
+         if (ITEMS_PER_THREAD * linear_tid + item < valid_items)
+diff --git a/cub/cub/thread/thread_sort.cuh b/cub/cub/thread/thread_sort.cuh
+index 7d9e8622f..da5627306 100644
+--- a/cub/cub/thread/thread_sort.cuh
++++ b/cub/cub/thread/thread_sort.cuh
+@@ -87,10 +87,10 @@ StableOddEvenSort(KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&items)[ITEMS_PER_THRE
+ {
+   constexpr bool KEYS_ONLY = ::cuda::std::is_same<ValueT, NullType>::value;
+ 
+-#pragma unroll
++#pragma unroll 1
+   for (int i = 0; i < ITEMS_PER_THREAD; ++i)
+   {
+-#pragma unroll
++#pragma unroll 1
+     for (int j = 1 & i; j < ITEMS_PER_THREAD - 1; j += 2)
+     {
+       if (compare_op(keys[j + 1], keys[j]))
diff --git a/cpp/src/io/comp/statistics.cu b/cpp/src/io/comp/statistics.cu
index 2a9eb782800..faf967041bc 100644
--- a/cpp/src/io/comp/statistics.cu
+++ b/cpp/src/io/comp/statistics.cu
@@ -18,6 +18,7 @@
 
 #include <rmm/exec_policy.hpp>
 
+#include <cuda/functional>
 #include <thrust/transform_reduce.h>
 
 namespace cudf::io {
@@ -32,9 +33,9 @@ writer_compression_statistics collect_compression_statistics(
     rmm::exec_policy(stream),
     results.begin(),
     results.end(),
-    [] __device__(auto& res) {
+    cuda::proclaim_return_type<size_t>([] __device__(compression_result const& res) {
       return res.status == compression_status::SUCCESS ? res.bytes_written : 0;
-    },
+    }),
     0ul,
     thrust::plus<size_t>());
 
@@ -47,9 +48,9 @@ writer_compression_statistics collect_compression_statistics(
       rmm::exec_policy(stream),
       zipped_begin,
       zipped_end,
-      [status] __device__(auto tup) {
+      cuda::proclaim_return_type<size_t>([status] __device__(auto tup) {
         return thrust::get<1>(tup).status == status ? thrust::get<0>(tup).size() : 0;
-      },
+      }),
       0ul,
       thrust::plus<size_t>());
   };
diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu
index ec936b85761..da9fb802a0a 100644
--- a/cpp/src/io/orc/reader_impl_decode.cu
+++ b/cpp/src/io/orc/reader_impl_decode.cu
@@ -692,8 +692,7 @@ std::vector<range> find_table_splits(table_view const& input,
      d_sizes  = d_segmented_sizes->view().begin<size_type>()] __device__(auto const segment_idx) {
       // Since the number of rows may not divisible by segment_length,
       // the last segment may be shorter than the others.
-      auto const current_length =
-        cuda::std::min(segment_length, num_rows - segment_length * segment_idx);
+      auto const current_length = min(segment_length, num_rows - segment_length * segment_idx);
       auto const size = d_sizes[segment_idx] / CHAR_BIT;  // divide by CHAR_BIT to get size in bytes
       return cumulative_size{static_cast<std::size_t>(current_length),
                              static_cast<std::size_t>(size)};
diff --git a/cpp/src/io/orc/stripe_init.cu b/cpp/src/io/orc/stripe_init.cu
index dd44b779402..89dbbcb796c 100644
--- a/cpp/src/io/orc/stripe_init.cu
+++ b/cpp/src/io/orc/stripe_init.cu
@@ -561,20 +561,26 @@ void __host__ ParseCompressedStripeData(CompressedStreamInfo* strm_info,
                                         uint32_t log2maxcr,
                                         rmm::cuda_stream_view stream)
 {
-  dim3 dim_block(128, 1);
-  dim3 dim_grid((num_streams + 3) >> 2, 1);  // 1 stream per warp, 4 warps per block
-  gpuParseCompressedStripeData<<<dim_grid, dim_block, 0, stream.value()>>>(
-    strm_info, num_streams, compression_block_size, log2maxcr);
+  auto const num_blocks = (num_streams + 3) >> 2;  // 1 stream per warp, 4 warps per block
+  if (num_blocks > 0) {
+    dim3 dim_block(128, 1);
+    dim3 dim_grid(num_blocks, 1);
+    gpuParseCompressedStripeData<<<dim_grid, dim_block, 0, stream.value()>>>(
+      strm_info, num_streams, compression_block_size, log2maxcr);
+  }
 }
 
 void __host__ PostDecompressionReassemble(CompressedStreamInfo* strm_info,
                                           int32_t num_streams,
                                           rmm::cuda_stream_view stream)
 {
-  dim3 dim_block(128, 1);
-  dim3 dim_grid((num_streams + 3) >> 2, 1);  // 1 stream per warp, 4 warps per block
-  gpuPostDecompressionReassemble<<<dim_grid, dim_block, 0, stream.value()>>>(strm_info,
-                                                                             num_streams);
+  auto const num_blocks = (num_streams + 3) >> 2;  // 1 stream per warp, 4 warps per block
+  if (num_blocks > 0) {
+    dim3 dim_block(128, 1);
+    dim3 dim_grid(num_blocks, 1);
+    gpuPostDecompressionReassemble<<<dim_grid, dim_block, 0, stream.value()>>>(strm_info,
+                                                                               num_streams);
+  }
 }
 
 void __host__ ParseRowGroupIndex(RowGroup* row_groups,
diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu
index cf1dc58b06a..ba3d35b9586 100644
--- a/cpp/src/io/parquet/page_string_decode.cu
+++ b/cpp/src/io/parquet/page_string_decode.cu
@@ -1197,14 +1197,17 @@ void ComputePageStringSizes(cudf::detail::hostdevice_span<PageInfo> pages,
   cudf::detail::join_streams(streams, stream);
 
   // check for needed temp space for DELTA_BYTE_ARRAY
-  auto const need_sizes = thrust::any_of(
-    rmm::exec_policy(stream), pages.device_begin(), pages.device_end(), [] __device__(auto& page) {
-      return page.temp_string_size != 0;
-    });
+  auto const need_sizes =
+    thrust::any_of(rmm::exec_policy(stream),
+                   pages.device_begin(),
+                   pages.device_end(),
+                   cuda::proclaim_return_type<bool>(
+                     [] __device__(auto& page) { return page.temp_string_size != 0; }));
 
   if (need_sizes) {
     // sum up all of the temp_string_sizes
-    auto const page_sizes = [] __device__(PageInfo const& page) { return page.temp_string_size; };
+    auto const page_sizes = cuda::proclaim_return_type<int64_t>(
+      [] __device__(PageInfo const& page) { return page.temp_string_size; });
     auto const total_size = thrust::transform_reduce(rmm::exec_policy(stream),
                                                      pages.device_begin(),
                                                      pages.device_end(),
diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu
index f533f04e427..7cb982f103d 100644
--- a/cpp/src/io/parquet/reader_impl_preprocess.cu
+++ b/cpp/src/io/parquet/reader_impl_preprocess.cu
@@ -452,9 +452,9 @@ std::string encoding_to_string(Encoding encoding)
 [[nodiscard]] std::string list_unsupported_encodings(device_span<PageInfo const> pages,
                                                      rmm::cuda_stream_view stream)
 {
-  auto const to_mask = [] __device__(auto const& page) {
+  auto const to_mask = cuda::proclaim_return_type<uint32_t>([] __device__(auto const& page) {
     return is_supported_encoding(page.encoding) ? 0U : encoding_to_mask(page.encoding);
-  };
+  });
   uint32_t const unsupported = thrust::transform_reduce(
     rmm::exec_policy(stream), pages.begin(), pages.end(), to_mask, 0U, thrust::bit_or<uint32_t>());
   return encoding_bitmask_to_str(unsupported);
diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu
index c9e507925ec..60cbfbc0dae 100644
--- a/cpp/src/io/utilities/data_casting.cu
+++ b/cpp/src/io/utilities/data_casting.cu
@@ -34,6 +34,7 @@
 #include <rmm/resource_ref.hpp>
 
 #include <cub/cub.cuh>
+#include <cuda/functional>
 #include <thrust/copy.h>
 #include <thrust/functional.h>
 #include <thrust/transform_reduce.h>
@@ -783,7 +784,8 @@ template <typename SymbolT>
 struct to_string_view_pair {
   SymbolT const* data;
   to_string_view_pair(SymbolT const* _data) : data(_data) {}
-  __device__ auto operator()(thrust::tuple<size_type, size_type> ip)
+  __device__ thrust::pair<char const*, std::size_t> operator()(
+    thrust::tuple<size_type, size_type> ip)
   {
     return thrust::pair<char const*, std::size_t>{data + thrust::get<0>(ip),
                                                   static_cast<std::size_t>(thrust::get<1>(ip))};
@@ -805,7 +807,7 @@ static std::unique_ptr<column> parse_string(string_view_pair_it str_tuples,
     rmm::exec_policy(stream),
     str_tuples,
     str_tuples + col_size,
-    [] __device__(auto t) { return t.second; },
+    cuda::proclaim_return_type<std::size_t>([] __device__(auto t) { return t.second; }),
     size_type{0},
     thrust::maximum<size_type>{});
 
diff --git a/cpp/src/join/distinct_hash_join.cu b/cpp/src/join/distinct_hash_join.cu
index ad401bdccba..5048da25e86 100644
--- a/cpp/src/join/distinct_hash_join.cu
+++ b/cpp/src/join/distinct_hash_join.cu
@@ -182,7 +182,7 @@ distinct_hash_join<HasNested>::inner_join(rmm::cuda_stream_view stream,
     thrust::make_transform_output_iterator(probe_indices->begin(), output_fn{});
 
   auto const [probe_indices_end, _] = this->_hash_table.retrieve(
-    iter, iter + probe_table_num_rows, probe_indices_begin, build_indices_begin, stream.value());
+    iter, iter + probe_table_num_rows, probe_indices_begin, build_indices_begin, {stream.value()});
 
   auto const actual_size = std::distance(probe_indices_begin, probe_indices_end);
   build_indices->resize(actual_size, stream);
diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu
index 6785ab9c893..d72ec1085b5 100644
--- a/cpp/src/strings/split/split_re.cu
+++ b/cpp/src/strings/split/split_re.cu
@@ -219,9 +219,9 @@ std::unique_ptr<table> split_re(strings_column_view const& input,
     rmm::exec_policy(stream),
     thrust::make_counting_iterator<size_type>(0),
     thrust::make_counting_iterator<size_type>(strings_count),
-    [d_offsets] __device__(auto const idx) -> size_type {
+    cuda::proclaim_return_type<size_type>([d_offsets] __device__(auto const idx) -> size_type {
       return static_cast<size_type>(d_offsets[idx + 1] - d_offsets[idx]);
-    },
+    }),
     0,
     thrust::maximum<size_type>{});
 
diff --git a/cpp/tests/hash_map/map_test.cu b/cpp/tests/hash_map/map_test.cu
index 4b10716706b..be2e33538b9 100644
--- a/cpp/tests/hash_map/map_test.cu
+++ b/cpp/tests/hash_map/map_test.cu
@@ -69,7 +69,6 @@ struct InsertTest : public cudf::test::BaseFixture {
 
 using TestTypes = ::testing::Types<key_value_types<int32_t, int32_t>,
                                    key_value_types<int64_t, int64_t>,
-                                   key_value_types<int16_t, int16_t>,
                                    key_value_types<int32_t, float>,
                                    key_value_types<int64_t, double>>;
 

From bdafa738cb7c0b4354efb22783ffd5d6edefebd6 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com>
Date: Tue, 28 May 2024 22:50:03 -0500
Subject: [PATCH 266/842] Migrate string `capitalize` APIs to `pylibcudf`
 (#15503)

This PR creates the `pylibcudf.strings.capitalize` namespace and migrates the cuDF cython to use it. Depends on https://github.com/rapidsai/cudf/pull/15489

Part of https://github.com/rapidsai/cudf/issues/15162

Authors:
  - https://github.com/brandon-b-miller

Approvers:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15503
---
 .../_lib/pylibcudf/libcudf/CMakeLists.txt     |  2 +-
 .../libcudf/scalar/scalar_factories.pxd       | 10 +++
 .../pylibcudf/libcudf/strings/CMakeLists.txt  | 23 +++++++
 .../pylibcudf/libcudf/strings/capitalize.pxd  | 12 +++-
 .../_lib/pylibcudf/libcudf/strings/case.pxd   |  6 ++
 .../pylibcudf/libcudf/strings/char_types.pxd  | 23 +++----
 .../pylibcudf/libcudf/strings/char_types.pyx  |  0
 .../_lib/pylibcudf/strings/CMakeLists.txt     |  3 +-
 .../cudf/_lib/pylibcudf/strings/__init__.pxd  |  2 +-
 .../cudf/_lib/pylibcudf/strings/__init__.py   |  2 +-
 .../_lib/pylibcudf/strings/capitalize.pxd     |  9 +++
 .../_lib/pylibcudf/strings/capitalize.pyx     | 62 +++++++++++++++++++
 .../_lib/pylibcudf/strings/char_types.pxd     |  5 ++
 .../_lib/pylibcudf/strings/char_types.pyx     |  4 ++
 python/cudf/cudf/_lib/strings/capitalize.pyx  | 48 +++++---------
 .../cudf/cudf/pylibcudf_tests/common/utils.py |  1 -
 .../pylibcudf_tests/test_string_capitalize.py | 54 ++++++++++++++++
 17 files changed, 217 insertions(+), 49 deletions(-)
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/scalar_factories.pxd
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/libcudf/strings/CMakeLists.txt
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/libcudf/strings/char_types.pyx
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pxd
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/char_types.pxd
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/char_types.pyx
 create mode 100644 python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py

diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt
index 89d3dc66f00..8a6ce6a5187 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt
@@ -17,9 +17,9 @@ set(cython_sources aggregation.pyx binaryop.pyx copying.pyx replace.pyx reduce.p
 )
 
 set(linked_libraries cudf::cudf)
-
 rapids_cython_create_modules(
   CXX
   SOURCE_FILES "${cython_sources}"
   LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cudf MODULE_PREFIX cpp
 )
+add_subdirectory(strings)
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/scalar_factories.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/scalar_factories.pxd
new file mode 100644
index 00000000000..5c4e5bf346f
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/scalar_factories.pxd
@@ -0,0 +1,10 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from libcpp.string cimport string
+
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
+
+
+cdef extern from "cudf/scalar/scalar_factories.hpp" namespace "cudf" nogil:
+    cdef unique_ptr[scalar] make_string_scalar(const string & _string) except +
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/CMakeLists.txt
new file mode 100644
index 00000000000..930c22781d0
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/CMakeLists.txt
@@ -0,0 +1,23 @@
+# =============================================================================
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+# =============================================================================
+
+set(cython_sources char_types.pyx)
+
+set(linked_libraries cudf::cudf)
+
+rapids_cython_create_modules(
+  CXX
+  SOURCE_FILES "${cython_sources}"
+  LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cudf MODULE_PREFIX cpp_strings
+)
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/capitalize.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/capitalize.pxd
index f95d4f35566..b0771e16680 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/capitalize.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/capitalize.pxd
@@ -3,14 +3,22 @@ from libcpp.memory cimport unique_ptr
 
 from cudf._lib.pylibcudf.libcudf.column.column cimport column
 from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from cudf._lib.pylibcudf.libcudf.strings.char_types cimport (
+    string_character_types,
+)
 
 
 cdef extern from "cudf/strings/capitalize.hpp" namespace "cudf::strings" nogil:
     cdef unique_ptr[column] capitalize(
-        const column_view & strings) except +
+        const column_view & strings,
+        const string_scalar & delimiters
+        ) except +
 
     cdef unique_ptr[column] title(
-        const column_view & strings) except +
+        const column_view & strings,
+        string_character_types sequence_type
+        ) except +
 
     cdef unique_ptr[column] is_title(
         const column_view & strings) except +
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/case.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/case.pxd
index 9ccd2737afe..82c146b0023 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/case.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/case.pxd
@@ -6,6 +6,12 @@ from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
 
 
 cdef extern from "cudf/strings/case.hpp" namespace "cudf::strings" nogil:
+    cdef unique_ptr[column] capitalize(
+        const column_view & input) except +
+
+    cdef unique_ptr[column] is_title(
+        const column_view & input) except +
+
     cdef unique_ptr[column] to_lower(
         const column_view & strings) except +
 
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/char_types.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/char_types.pxd
index 408b3687c4a..f63e1a93f91 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/char_types.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/char_types.pxd
@@ -1,5 +1,6 @@
 # Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
+from libc.stdint cimport uint32_t
 from libcpp.memory cimport unique_ptr
 
 from cudf._lib.pylibcudf.libcudf.column.column cimport column
@@ -10,17 +11,17 @@ from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
 cdef extern from "cudf/strings/char_types/char_types.hpp" \
         namespace "cudf::strings" nogil:
 
-    ctypedef enum string_character_types:
-        DECIMAL 'cudf::strings::string_character_types::DECIMAL'
-        NUMERIC  'cudf::strings::string_character_types::NUMERIC'
-        DIGIT 'cudf::strings::string_character_types::DIGIT'
-        ALPHA 'cudf::strings::string_character_types::ALPHA'
-        SPACE 'cudf::strings::string_character_types::SPACE'
-        UPPER 'cudf::strings::string_character_types::UPPER'
-        LOWER 'cudf::strings::string_character_types::LOWER'
-        ALPHANUM 'cudf::strings::string_character_types::ALPHANUM'
-        CASE_TYPES 'cudf::strings::string_character_types::CASE_TYPES'
-        ALL_TYPES 'cudf::strings::string_character_types::ALL_TYPES'
+    cpdef enum class string_character_types(uint32_t):
+        DECIMAL
+        NUMERIC
+        DIGIT
+        ALPHA
+        SPACE
+        UPPER
+        LOWER
+        ALPHANUM
+        CASE_TYPES
+        ALL_TYPES
 
 cdef extern from "cudf/strings/char_types/char_types.hpp" \
         namespace "cudf::strings" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/char_types.pyx b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/char_types.pyx
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt
index c42b57ece63..0e9c1c916f0 100644
--- a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt
@@ -12,7 +12,8 @@
 # the License.
 # =============================================================================
 
-set(cython_sources case.pyx find.pyx)
+set(cython_sources capitalize.pyx case.pyx char_types.pyx find.pyx)
+
 set(linked_libraries cudf::cudf)
 rapids_cython_create_modules(
   CXX
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd
index 33e2d56c087..ec3dbc150b5 100644
--- a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd
@@ -1,3 +1,3 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from . cimport case, find
+from . cimport capitalize, case, char_types, find
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py
index 9220f6bd045..3793bda0aa4 100644
--- a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py
+++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py
@@ -1,3 +1,3 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from . import case, find
+from . import capitalize, case, char_types, find
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pxd
new file mode 100644
index 00000000000..9acf189fc23
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pxd
@@ -0,0 +1,9 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from cudf._lib.pylibcudf.column cimport Column
+from cudf._lib.pylibcudf.scalar cimport Scalar
+
+
+cpdef Column capitalize(Column input, Scalar delimiters=*)
+cpdef Column title(Column input)
+cpdef Column is_title(Column input)
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx
new file mode 100644
index 00000000000..d3f79088018
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx
@@ -0,0 +1,62 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+
+from cudf._lib.pylibcudf.column cimport Column
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from cudf._lib.pylibcudf.libcudf.scalar.scalar_factories cimport (
+    make_string_scalar as cpp_make_string_scalar,
+)
+from cudf._lib.pylibcudf.libcudf.strings cimport capitalize as cpp_capitalize
+from cudf._lib.pylibcudf.scalar cimport Scalar
+from cudf._lib.pylibcudf.strings.char_types cimport string_character_types
+
+from cython.operator import dereference
+
+
+cpdef Column capitalize(
+    Column input,
+    Scalar delimiters=None
+    # TODO: default scalar values
+    # https://github.com/rapidsai/cudf/issues/15505
+):
+
+    cdef unique_ptr[column] c_result
+
+    if delimiters is None:
+        delimiters = Scalar.from_libcudf(
+            cpp_make_string_scalar("".encode())
+        )
+
+    cdef const string_scalar* cpp_delimiters = <const string_scalar*>(
+        delimiters.c_obj.get()
+    )
+
+    with nogil:
+        c_result = cpp_capitalize.capitalize(
+            input.view(),
+            dereference(cpp_delimiters)
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column title(
+    Column input,
+    string_character_types sequence_type=string_character_types.ALPHA
+):
+    cdef unique_ptr[column] c_result
+    with nogil:
+        c_result = cpp_capitalize.title(input.view(), sequence_type)
+
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column is_title(Column input):
+    cdef unique_ptr[column] c_result
+    with nogil:
+        c_result = cpp_capitalize.is_title(input.view())
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/char_types.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/char_types.pxd
new file mode 100644
index 00000000000..a80e02f520c
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/strings/char_types.pxd
@@ -0,0 +1,5 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from cudf._lib.pylibcudf.libcudf.strings.char_types cimport (
+    string_character_types,
+)
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/char_types.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/char_types.pyx
new file mode 100644
index 00000000000..d96161951c6
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/strings/char_types.pyx
@@ -0,0 +1,4 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from cudf._lib.pylibcudf.libcudf.strings.char_types import \
+    string_character_types as StringCharacterTypes  # no-cython-lint
diff --git a/python/cudf/cudf/_lib/strings/capitalize.pyx b/python/cudf/cudf/_lib/strings/capitalize.pyx
index 1420a2bbaf2..b3ca6a5ac8f 100644
--- a/python/cudf/cudf/_lib/strings/capitalize.pyx
+++ b/python/cudf/cudf/_lib/strings/capitalize.pyx
@@ -2,47 +2,33 @@
 
 from cudf.core.buffer import acquire_spill_lock
 
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
-
 from cudf._lib.column cimport Column
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.strings.capitalize cimport (
-    capitalize as cpp_capitalize,
-    is_title as cpp_is_title,
-    title as cpp_title,
-)
+
+import cudf._lib.pylibcudf as plc
 
 
 @acquire_spill_lock()
 def capitalize(Column source_strings):
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    with nogil:
-        c_result = move(cpp_capitalize(source_view))
-
-    return Column.from_unique_ptr(move(c_result))
+    return Column.from_pylibcudf(
+        plc.strings.capitalize.capitalize(
+            source_strings.to_pylibcudf(mode="read")
+        )
+    )
 
 
 @acquire_spill_lock()
 def title(Column source_strings):
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    with nogil:
-        c_result = move(cpp_title(source_view))
-
-    return Column.from_unique_ptr(move(c_result))
+    return Column.from_pylibcudf(
+        plc.strings.capitalize.title(
+            source_strings.to_pylibcudf(mode="read")
+        )
+    )
 
 
 @acquire_spill_lock()
 def is_title(Column source_strings):
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    with nogil:
-        c_result = move(cpp_is_title(source_view))
-
-    return Column.from_unique_ptr(move(c_result))
+    return Column.from_pylibcudf(
+        plc.strings.capitalize.is_title(
+            source_strings.to_pylibcudf(mode="read")
+        )
+    )
diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py
index 6636ab9e5f8..596cd2c92ae 100644
--- a/python/cudf/cudf/pylibcudf_tests/common/utils.py
+++ b/python/cudf/cudf/pylibcudf_tests/common/utils.py
@@ -35,7 +35,6 @@ def assert_column_eq(plc_column: plc.Column, pa_array: pa.Array) -> None:
         plc_pa = plc_pa.combine_chunks()
     if isinstance(pa_array, pa.ChunkedArray):
         pa_array = pa_array.combine_chunks()
-
     assert plc_pa.equals(pa_array)
 
 
diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py b/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py
new file mode 100644
index 00000000000..dd7e96e871b
--- /dev/null
+++ b/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py
@@ -0,0 +1,54 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pytest
+from utils import assert_column_eq
+
+import cudf._lib.pylibcudf as plc
+
+
+@pytest.fixture(scope="module")
+def pa_data():
+    data = [
+        "leopard",
+        "Golden Eagle",
+        "SNAKE",
+        "",
+        "!A",
+        "hello World",
+        "A B C",
+        "#",
+        "AƻB",
+        "Ⓑⓖ",
+        "Art of War",
+        "The quick bRoWn fox juMps over the laze DOG",
+        '123nr98nv9rev!$#INF4390v03n1243<>?}{:-"',
+        "accénted",
+        None,
+    ]
+    return pa.array(data)
+
+
+@pytest.fixture(scope="module")
+def plc_data(pa_data):
+    return plc.interop.from_arrow(pa_data)
+
+
+def test_capitalize(plc_data, pa_data):
+    got = plc.strings.capitalize.capitalize(plc_data)
+    expected = pa.compute.utf8_capitalize(pa_data)
+    assert_column_eq(got, expected)
+
+
+def test_title(plc_data, pa_data):
+    got = plc.strings.capitalize.title(
+        plc_data, plc.strings.char_types.StringCharacterTypes.CASE_TYPES
+    )
+    expected = pa.compute.utf8_title(pa_data)
+    assert_column_eq(got, expected)
+
+
+def test_is_title(plc_data, pa_data):
+    got = plc.strings.capitalize.is_title(plc_data)
+    expected = pa.compute.utf8_is_title(pa_data)
+    assert_column_eq(got, expected)

From ff981a4048a389b0e2582e94d3397a83096d16c9 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Wed, 29 May 2024 09:02:31 -0400
Subject: [PATCH 267/842] Improve performance for long strings for
 nvtext::replace_tokens (#15756)

Improves performance for `nvtext::replace_tokens` for long strings.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/15756
---
 cpp/src/text/replace.cu          | 255 ++++++++++++++++++++++++-------
 cpp/tests/text/replace_tests.cpp |  22 +++
 2 files changed, 219 insertions(+), 58 deletions(-)

diff --git a/cpp/src/text/replace.cu b/cpp/src/text/replace.cu
index 84ed1827117..81c787caf86 100644
--- a/cpp/src/text/replace.cu
+++ b/cpp/src/text/replace.cu
@@ -21,6 +21,7 @@
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/detail/offsets_iterator_factory.cuh>
 #include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/string_view.cuh>
@@ -28,16 +29,18 @@
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
 
-#include <nvtext/detail/tokenize.hpp>
 #include <nvtext/replace.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
 
+#include <cuda/atomic>
+#include <thrust/binary_search.h>
 #include <thrust/distance.h>
 #include <thrust/execution_policy.h>
 #include <thrust/find.h>
 #include <thrust/pair.h>
+#include <thrust/remove.h>
 
 namespace nvtext {
 namespace detail {
@@ -46,11 +49,13 @@ namespace {
 using replace_result = thrust::pair<bool, cudf::string_view>;
 
 struct base_token_replacer_fn {
-  cudf::column_device_view const d_strings;  ///< strings to tokenize
-  cudf::string_view const d_delimiter;       ///< delimiter characters for tokenizing
-  cudf::size_type* d_sizes{};                ///< for output string size
-  char* d_chars{};                           ///< output buffer
-  cudf::detail::input_offsetalator d_offsets;
+  cudf::column_device_view d_strings;          ///< strings to tokenize
+  cudf::string_view const d_delimiter;         ///< delimiter characters for tokenizing
+  cudf::size_type* d_sizes{};                  ///< for output string size
+  char* d_chars{};                             ///< output buffer
+  cudf::detail::input_offsetalator d_offsets;  ///< offsets for output buffer
+  cudf::size_type const* d_indices{};          ///< indices for long strings
+  cudf::size_type* d_output_sizes{};           ///< output sizes for long strings
 
   /**
    * @brief Tokenizes each string and calls the provided `replacer` function
@@ -61,7 +66,7 @@ struct base_token_replacer_fn {
    * @param replacer Function to call for each token to determined its replacement
    */
   template <typename ReplaceFn>
-  __device__ void process_string(cudf::size_type idx, ReplaceFn replacer)
+  __device__ void process_string(cudf::size_type idx, ReplaceFn replacer) const
   {
     if (d_strings.is_null(idx)) {
       if (!d_chars) { d_sizes[idx] = 0; }
@@ -100,6 +105,13 @@ struct base_token_replacer_fn {
       memcpy(out_ptr, in_ptr + last_pos, d_str.size_bytes() - last_pos);
     } else {
       d_sizes[idx] = nbytes;
+      // handles output size calculation for long strings
+      if (nbytes > 0 && d_indices) {
+        auto out_idx = d_indices[idx] - 1;  // adjust for upper_bound
+        cuda::atomic_ref<cudf::size_type, cuda::thread_scope_block> ref{
+          *(d_output_sizes + out_idx)};
+        ref.fetch_add(nbytes, cuda::std::memory_order_relaxed);
+      }
     }
   }
 };
@@ -119,7 +131,7 @@ using strings_iterator = cudf::column_device_view::const_iterator<cudf::string_v
 struct replace_tokens_fn : base_token_replacer_fn {
   strings_iterator d_targets_begin;  ///< strings to search for
   strings_iterator d_targets_end;
-  cudf::column_device_view const d_replacements;  ///< replacement strings
+  cudf::column_device_view const d_replacements;
 
   replace_tokens_fn(cudf::column_device_view const& d_strings,
                     cudf::string_view const& d_delimiter,
@@ -139,7 +151,7 @@ struct replace_tokens_fn : base_token_replacer_fn {
    * @param token Token candidate to be replaced.
    * @return result pair specifies replacement condition and new string
    */
-  __device__ replace_result token_replacement(cudf::string_view const& token)
+  __device__ replace_result token_replacement(cudf::string_view const& token) const
   {
     // check if the token matches any of the targets
     auto const found_itr = thrust::find(thrust::seq, d_targets_begin, d_targets_end, token);
@@ -157,13 +169,53 @@ struct replace_tokens_fn : base_token_replacer_fn {
     return replace_result{false, cudf::string_view()};
   }
 
-  __device__ void operator()(cudf::size_type idx)
+  __device__ void operator()(cudf::size_type idx) const
   {
     process_string(
       idx, [this] __device__(cudf::string_view const& token) { return token_replacement(token); });
   }
 };
 
+// For determining long strings processing
+constexpr cudf::size_type AVG_CHAR_BYTES_THRESHOLD = 64;
+// For computing sub-block sizes of long strings
+constexpr cudf::size_type LS_SUB_BLOCK_SIZE = 64;
+
+/**
+ * @brief Locate delimiters to produce sub-offsets in the input device array
+ *
+ * The sub-offsets provide additional tokenize boundaries within longer strings.
+ */
+struct sub_offset_fn {
+  char const* d_input_chars;
+  int64_t first_offset;
+  int64_t last_offset;
+  cudf::string_view const d_delimiter;
+
+  __device__ int64_t operator()(int64_t idx) const
+  {
+    // keep delimiter search within this sub-block
+    auto const end =
+      d_input_chars + std::min(last_offset, ((idx + 2) * LS_SUB_BLOCK_SIZE) + first_offset);
+    // starting point of this sub-block
+    auto itr = d_input_chars + first_offset + ((idx + 1) * LS_SUB_BLOCK_SIZE);
+    while ((itr < end) &&
+           cudf::strings::detail::is_utf8_continuation_char(static_cast<u_char>(*itr))) {
+      ++itr;
+    }
+    if (itr >= end) { return 0; }  // 0s will be filtered out
+    // now check for a delimiter in this block
+    auto tokenizer = characters_tokenizer(cudf::string_view{}, d_delimiter);
+    while (itr < end) {
+      auto chr      = cudf::char_utf8{};
+      auto chr_size = cudf::strings::detail::to_char_utf8(itr, chr);
+      if (tokenizer.is_delimiter(chr)) { break; }
+      itr += chr_size;
+    }
+    return (itr < end) ? thrust::distance(d_input_chars, itr) : 0L;
+  }
+};
+
 /**
  * @brief Functor to filter tokens in each string.
  *
@@ -187,20 +239,131 @@ struct remove_small_tokens_fn : base_token_replacer_fn {
   {
   }
 
-  __device__ void operator()(cudf::size_type idx)
+  __device__ replace_result token_replacement(cudf::string_view token) const
   {
-    auto replacer = [this] __device__(cudf::string_view const& token) {
-      return replace_result{token.length() < min_token_length, d_replacement};
-    };
-    process_string(idx, replacer);
+    return replace_result{token.length() < min_token_length, d_replacement};
+  }
+
+  __device__ void operator()(cudf::size_type idx) const
+  {
+    process_string(
+      idx, [this] __device__(cudf::string_view const& token) { return token_replacement(token); });
   }
 };
 
+/**
+ * @brief Common code for replace and filter
+ *
+ * Builds the output strings column using the given replace functor.
+ *
+ * @tparam ReplaceFn Functor called for replacing tokens
+ *
+ * @param replacer Functor for determining matching token and its replacement
+ * @param input Strings column to tokenize and replace
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New strings columns of with replaced strings
+ */
+template <typename ReplacerFn>
+std::unique_ptr<cudf::column> replace_helper(ReplacerFn replacer,
+                                             cudf::strings_column_view const& input,
+                                             rmm::cuda_stream_view stream,
+                                             rmm::device_async_resource_ref mr)
+{
+  auto const first_offset = (input.offset() == 0) ? 0L
+                                                  : cudf::strings::detail::get_offset_value(
+                                                      input.offsets(), input.offset(), stream);
+  auto const last_offset =
+    cudf::strings::detail::get_offset_value(input.offsets(), input.size() + input.offset(), stream);
+  auto const chars_size = last_offset - first_offset;
+
+  if ((chars_size / (input.size() - input.null_count())) < AVG_CHAR_BYTES_THRESHOLD) {
+    // this utility calls replacer to build the offsets and chars columns
+    auto [offsets_column, chars] =
+      cudf::strings::detail::make_strings_children(replacer, input.size(), stream, mr);
+    // return new strings column
+    return cudf::make_strings_column(input.size(),
+                                     std::move(offsets_column),
+                                     chars.release(),
+                                     input.null_count(),
+                                     cudf::detail::copy_bitmask(input.parent(), stream, mr));
+  }
+
+  // Long strings logic builds a new fake strings column with the same data but additional offsets
+  // thus converting the input to a larger column of smaller strings.
+  // This can be processed in parallel more efficiently than long strings in general.
+
+  auto const input_chars = input.chars_begin(stream);
+  auto const input_offsets =
+    cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset());
+
+  // divide up long strings into shorter strings by finding new sub-offsets at delimiters
+  auto sub_count   = chars_size / LS_SUB_BLOCK_SIZE;
+  auto tmp_offsets = rmm::device_uvector<int64_t>(sub_count + input.size() + 1, stream);
+  {
+    rmm::device_uvector<int64_t> sub_offsets(sub_count, stream);
+    auto const count_itr = thrust::make_counting_iterator<int64_t>(0);
+    thrust::transform(rmm::exec_policy_nosync(stream),
+                      count_itr,
+                      count_itr + sub_count,
+                      sub_offsets.data(),
+                      sub_offset_fn{input_chars, first_offset, last_offset});
+    // remove 0s -- where sub-offset could not be computed
+    auto const remove_end =
+      thrust::remove(rmm::exec_policy_nosync(stream), sub_offsets.begin(), sub_offsets.end(), 0L);
+    sub_count = thrust::distance(sub_offsets.begin(), remove_end);
+
+    // merge them with input offsets
+    thrust::merge(rmm::exec_policy_nosync(stream),
+                  input_offsets,
+                  input_offsets + input.size() + 1,
+                  sub_offsets.begin(),
+                  sub_offsets.begin() + sub_count,
+                  tmp_offsets.begin());
+    tmp_offsets.resize(sub_count + input.size() + 1, stream);
+    stream.synchronize();  // protect against destruction of sub_offsets
+  }
+
+  // cobble together a column_view of type STRING using the original data and the tmp offsets
+  auto const tmp_size    = static_cast<cudf::size_type>(tmp_offsets.size()) - 1;
+  auto const children    = std::vector<cudf::column_view>({cudf::column_view(
+    cudf::data_type{cudf::type_id::INT64}, tmp_size + 1, tmp_offsets.data(), nullptr, 0)});
+  auto const tmp_strings = cudf::column_view(
+    cudf::data_type{cudf::type_id::STRING}, tmp_size, input_chars, nullptr, 0, 0, children);
+  auto const d_tmp_strings = cudf::column_device_view::create(tmp_strings, stream);
+
+  // compute indices to the actual output rows
+  auto indices = rmm::device_uvector<cudf::size_type>(tmp_offsets.size(), stream);
+  thrust::upper_bound(rmm::exec_policy_nosync(stream),
+                      input_offsets,
+                      input_offsets + input.size() + 1,
+                      tmp_offsets.begin(),
+                      tmp_offsets.end(),
+                      indices.begin());
+
+  // initialize the output row sizes
+  auto d_sizes = rmm::device_uvector<cudf::size_type>(input.size(), stream);
+  thrust::fill(rmm::exec_policy_nosync(stream), d_sizes.begin(), d_sizes.end(), 0);
+
+  replacer.d_strings      = *d_tmp_strings;
+  replacer.d_indices      = indices.data();
+  replacer.d_output_sizes = d_sizes.data();
+
+  auto chars = std::get<1>(
+    cudf::strings::detail::make_strings_children(replacer, tmp_strings.size(), stream, mr));
+  auto offsets_column = std::get<0>(
+    cudf::strings::detail::make_offsets_child_column(d_sizes.begin(), d_sizes.end(), stream, mr));
+  return cudf::make_strings_column(input.size(),
+                                   std::move(offsets_column),
+                                   chars.release(),
+                                   input.null_count(),
+                                   cudf::detail::copy_bitmask(input.parent(), stream, mr));
+}
 }  // namespace
 
 // detail APIs
 
-std::unique_ptr<cudf::column> replace_tokens(cudf::strings_column_view const& strings,
+std::unique_ptr<cudf::column> replace_tokens(cudf::strings_column_view const& input,
                                              cudf::strings_column_view const& targets,
                                              cudf::strings_column_view const& replacements,
                                              cudf::string_scalar const& delimiter,
@@ -214,35 +377,23 @@ std::unique_ptr<cudf::column> replace_tokens(cudf::strings_column_view const& st
                  "Parameter targets and replacements must be the same size");
   CUDF_EXPECTS(delimiter.is_valid(stream), "Parameter delimiter must be valid");
 
-  cudf::size_type const strings_count = strings.size();
-  if (strings_count == 0) return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});
-
-  auto strings_column      = cudf::column_device_view::create(strings.parent(), stream);
-  auto targets_column      = cudf::column_device_view::create(targets.parent(), stream);
-  auto replacements_column = cudf::column_device_view::create(replacements.parent(), stream);
-  cudf::string_view d_delimiter(delimiter.data(), delimiter.size());
-  replace_tokens_fn replacer{*strings_column,
-                             d_delimiter,
-                             targets_column->begin<cudf::string_view>(),
-                             targets_column->end<cudf::string_view>(),
-                             *replacements_column};
+  if (input.is_empty()) { return cudf::make_empty_column(cudf::type_id::STRING); }
 
-  // copy null mask from input column
-  rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr);
+  auto const d_strings      = cudf::column_device_view::create(input.parent(), stream);
+  auto const d_targets      = cudf::column_device_view::create(targets.parent(), stream);
+  auto const d_replacements = cudf::column_device_view::create(replacements.parent(), stream);
+  auto const d_delimiter    = cudf::string_view(delimiter.data(), delimiter.size());
 
-  // this utility calls replacer to build the offsets and chars columns
-  auto [offsets_column, chars] =
-    cudf::strings::detail::make_strings_children(replacer, strings_count, stream, mr);
+  replace_tokens_fn replacer{*d_strings,
+                             d_delimiter,
+                             d_targets->begin<cudf::string_view>(),
+                             d_targets->end<cudf::string_view>(),
+                             *d_replacements};
 
-  // return new strings column
-  return cudf::make_strings_column(strings_count,
-                                   std::move(offsets_column),
-                                   chars.release(),
-                                   strings.null_count(),
-                                   std::move(null_mask));
+  return replace_helper(replacer, input, stream, mr);
 }
 
-std::unique_ptr<cudf::column> filter_tokens(cudf::strings_column_view const& strings,
+std::unique_ptr<cudf::column> filter_tokens(cudf::strings_column_view const& input,
                                             cudf::size_type min_token_length,
                                             cudf::string_scalar const& replacement,
                                             cudf::string_scalar const& delimiter,
@@ -252,27 +403,15 @@ std::unique_ptr<cudf::column> filter_tokens(cudf::strings_column_view const& str
   CUDF_EXPECTS(replacement.is_valid(stream), "Parameter replacement must be valid");
   CUDF_EXPECTS(delimiter.is_valid(stream), "Parameter delimiter must be valid");
 
-  cudf::size_type const strings_count = strings.size();
-  if (strings_count == 0) return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});
-
-  auto strings_column = cudf::column_device_view::create(strings.parent(), stream);
-  cudf::string_view d_replacement(replacement.data(), replacement.size());
-  cudf::string_view d_delimiter(delimiter.data(), delimiter.size());
-  remove_small_tokens_fn filterer{*strings_column, d_delimiter, min_token_length, d_replacement};
+  if (input.is_empty()) { return cudf::make_empty_column(cudf::type_id::STRING); }
 
-  // copy null mask from input column
-  rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr);
+  auto const d_strings     = cudf::column_device_view::create(input.parent(), stream);
+  auto const d_replacement = cudf::string_view(replacement.data(), replacement.size());
+  auto const d_delimiter   = cudf::string_view(delimiter.data(), delimiter.size());
 
-  // this utility calls filterer to build the offsets and chars columns
-  auto [offsets_column, chars] =
-    cudf::strings::detail::make_strings_children(filterer, strings_count, stream, mr);
+  remove_small_tokens_fn filterer{*d_strings, d_delimiter, min_token_length, d_replacement};
 
-  // return new strings column
-  return cudf::make_strings_column(strings_count,
-                                   std::move(offsets_column),
-                                   chars.release(),
-                                   strings.null_count(),
-                                   std::move(null_mask));
+  return replace_helper(filterer, input, stream, mr);
 }
 
 }  // namespace detail
diff --git a/cpp/tests/text/replace_tests.cpp b/cpp/tests/text/replace_tests.cpp
index 8c58c6bcaca..faced4a14d3 100644
--- a/cpp/tests/text/replace_tests.cpp
+++ b/cpp/tests/text/replace_tests.cpp
@@ -88,6 +88,28 @@ TEST_F(TextReplaceTest, ReplaceTokensEmptyTest)
   EXPECT_EQ(results->has_nulls(), false);
 }
 
+TEST_F(TextReplaceTest, ReplaceTokensLongStrings)
+{
+  cudf::test::strings_column_wrapper input{
+    "pellentesque ut euismod semo phaselus tristiut libero ut dui congusem non pellentesque nunc ",
+    "pellentesque ut euismod se phaselus tristiut libero ut dui congusem non pellentesque ",
+    "pellentesque ut euismod phaselus tristiut libero ut dui congusem non pellentesque nun ",
+    "pellentesque ut euismod seem phaselus tristiut libero ut dui congusem non pellentesque un "};
+  cudf::test::strings_column_wrapper targets({"ut", "pellentesque"});
+  cudf::test::strings_column_wrapper repls({"___", "é"});
+
+  auto expected = cudf::test::strings_column_wrapper{
+    "é ___ euismod semo phaselus tristiut libero ___ dui congusem non é nunc ",
+    "é ___ euismod se phaselus tristiut libero ___ dui congusem non é ",
+    "é ___ euismod phaselus tristiut libero ___ dui congusem non é nun ",
+    "é ___ euismod seem phaselus tristiut libero ___ dui congusem non é un "};
+
+  auto results = nvtext::replace_tokens(cudf::strings_column_view(input),
+                                        cudf::strings_column_view(targets),
+                                        cudf::strings_column_view(repls));
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+}
+
 TEST_F(TextReplaceTest, ReplaceTokensErrorTest)
 {
   auto strings = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});

From 2b031e06a7fe18eec462db445eea1c596b93a9f1 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Wed, 29 May 2024 09:21:52 -0700
Subject: [PATCH 268/842] Revert "Fix docs for IO readers and strings_convert"
 (#15872)

Reverts rapidsai/cudf#15842

The files the original PR added documentation for appear to contain some
text that is problematic for the Sphinx parser to extract from doxygen.
My best guess is that it's something in a table, since parsing doxygen
tables via Breathe is something I know can be tricky. We didn't catch
this issue because [we currently only build the text docs in nightly
builds, not
PRs](https://github.com/rapidsai/cudf/blob/branch-24.08/ci/build_docs.sh#L49),
and this issue only arises in those text builds. We can revisit adding
these docs in 24.08. For the sake of correctness, I have added back
building text docs in PRs in this PR (see #14856 for context on the
removal).
---
 ci/build_docs.sh                                 | 16 ++++++----------
 .../source/libcudf_docs/api_docs/io_readers.rst  |  2 +-
 .../libcudf_docs/api_docs/strings_convert.rst    |  2 +-
 3 files changed, 8 insertions(+), 12 deletions(-)

diff --git a/ci/build_docs.sh b/ci/build_docs.sh
index 668d52e530b..db306046667 100755
--- a/ci/build_docs.sh
+++ b/ci/build_docs.sh
@@ -46,11 +46,9 @@ pushd docs/cudf
 make dirhtml
 mkdir -p "${RAPIDS_DOCS_DIR}/cudf/html"
 mv build/dirhtml/* "${RAPIDS_DOCS_DIR}/cudf/html"
-if [[ "${RAPIDS_BUILD_TYPE}" != "pull-request" ]]; then
-  make text
-  mkdir -p "${RAPIDS_DOCS_DIR}/cudf/txt"
-  mv build/text/* "${RAPIDS_DOCS_DIR}/cudf/txt"
-fi
+make text
+mkdir -p "${RAPIDS_DOCS_DIR}/cudf/txt"
+mv build/text/* "${RAPIDS_DOCS_DIR}/cudf/txt"
 popd
 
 rapids-logger "Build dask-cuDF Sphinx docs"
@@ -58,11 +56,9 @@ pushd docs/dask_cudf
 make dirhtml
 mkdir -p "${RAPIDS_DOCS_DIR}/dask-cudf/html"
 mv build/dirhtml/* "${RAPIDS_DOCS_DIR}/dask-cudf/html"
-if [[ "${RAPIDS_BUILD_TYPE}" != "pull-request" ]]; then
-  make text
-  mkdir -p "${RAPIDS_DOCS_DIR}/dask-cudf/txt"
-  mv build/text/* "${RAPIDS_DOCS_DIR}/dask-cudf/txt"
-fi
+make text
+mkdir -p "${RAPIDS_DOCS_DIR}/dask-cudf/txt"
+mv build/text/* "${RAPIDS_DOCS_DIR}/dask-cudf/txt"
 popd
 
 rapids-upload-docs
diff --git a/docs/cudf/source/libcudf_docs/api_docs/io_readers.rst b/docs/cudf/source/libcudf_docs/api_docs/io_readers.rst
index f94a5ddb403..a835673dee4 100644
--- a/docs/cudf/source/libcudf_docs/api_docs/io_readers.rst
+++ b/docs/cudf/source/libcudf_docs/api_docs/io_readers.rst
@@ -2,4 +2,4 @@ Io Readers
 ==========
 
 .. doxygengroup:: io_readers
-   :members:
+   :desc-only:
diff --git a/docs/cudf/source/libcudf_docs/api_docs/strings_convert.rst b/docs/cudf/source/libcudf_docs/api_docs/strings_convert.rst
index f2f320bd0e4..ae5d78fb1a1 100644
--- a/docs/cudf/source/libcudf_docs/api_docs/strings_convert.rst
+++ b/docs/cudf/source/libcudf_docs/api_docs/strings_convert.rst
@@ -2,4 +2,4 @@ Strings Convert
 ===============
 
 .. doxygengroup:: strings_convert
-   :members:
+   :desc-only:

From 3b98f8100adaca742c00a075bed83175d43b7f26 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Wed, 29 May 2024 09:24:49 -0700
Subject: [PATCH 269/842] Refactor join benchmarks to target public APIs with
 the default stream (#15873)

This a followup of #15644.

It fixes the lhs/rhs input bug in the hash join and distinct join benchmarks.

Authors:
  - Yunsong Wang (https://github.com/PointKernel)

Approvers:
  - Karthikeyan (https://github.com/karthikeyann)
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/15873
---
 cpp/benchmarks/join/distinct_join.cu | 22 ++++++++++----------
 cpp/benchmarks/join/join.cu          | 30 ++++++----------------------
 cpp/benchmarks/join/join_common.hpp  |  9 +++------
 cpp/benchmarks/join/mixed_join.cu    | 15 +++++---------
 4 files changed, 24 insertions(+), 52 deletions(-)

diff --git a/cpp/benchmarks/join/distinct_join.cu b/cpp/benchmarks/join/distinct_join.cu
index af8fa1f9d94..3502cbcea2a 100644
--- a/cpp/benchmarks/join/distinct_join.cu
+++ b/cpp/benchmarks/join/distinct_join.cu
@@ -20,17 +20,16 @@ template <typename Key, bool Nullable>
 void distinct_inner_join(nvbench::state& state,
                          nvbench::type_list<Key, nvbench::enum_type<Nullable>>)
 {
-  auto join = [](cudf::table_view const& build_input,
-                 cudf::table_view const& probe_input,
-                 cudf::null_equality compare_nulls,
-                 rmm::cuda_stream_view stream) {
+  auto join = [](cudf::table_view const& probe_input,
+                 cudf::table_view const& build_input,
+                 cudf::null_equality compare_nulls) {
     auto const has_nulls =
       cudf::has_nested_nulls(build_input) || cudf::has_nested_nulls(probe_input)
         ? cudf::nullable_join::YES
         : cudf::nullable_join::NO;
     auto hj_obj = cudf::distinct_hash_join<cudf::has_nested::NO>{
-      build_input, probe_input, has_nulls, compare_nulls, stream};
-    return hj_obj.inner_join(stream);
+      build_input, probe_input, has_nulls, compare_nulls};
+    return hj_obj.inner_join();
   };
 
   BM_join<Key, Nullable>(state, join);
@@ -40,17 +39,16 @@ template <typename Key, bool Nullable>
 void distinct_left_join(nvbench::state& state,
                         nvbench::type_list<Key, nvbench::enum_type<Nullable>>)
 {
-  auto join = [](cudf::table_view const& build_input,
-                 cudf::table_view const& probe_input,
-                 cudf::null_equality compare_nulls,
-                 rmm::cuda_stream_view stream) {
+  auto join = [](cudf::table_view const& probe_input,
+                 cudf::table_view const& build_input,
+                 cudf::null_equality compare_nulls) {
     auto const has_nulls =
       cudf::has_nested_nulls(build_input) || cudf::has_nested_nulls(probe_input)
         ? cudf::nullable_join::YES
         : cudf::nullable_join::NO;
     auto hj_obj = cudf::distinct_hash_join<cudf::has_nested::NO>{
-      build_input, probe_input, has_nulls, compare_nulls, stream};
-    return hj_obj.left_join(stream);
+      build_input, probe_input, has_nulls, compare_nulls};
+    return hj_obj.left_join();
   };
 
   BM_join<Key, Nullable>(state, join);
diff --git a/cpp/benchmarks/join/join.cu b/cpp/benchmarks/join/join.cu
index c4a39da4662..942fb823ddc 100644
--- a/cpp/benchmarks/join/join.cu
+++ b/cpp/benchmarks/join/join.cu
@@ -22,15 +22,9 @@ void nvbench_inner_join(nvbench::state& state,
 {
   auto join = [](cudf::table_view const& left_input,
                  cudf::table_view const& right_input,
-                 cudf::null_equality compare_nulls,
-                 rmm::cuda_stream_view stream) {
-    auto const has_nulls = cudf::has_nested_nulls(left_input) || cudf::has_nested_nulls(right_input)
-                             ? cudf::nullable_join::YES
-                             : cudf::nullable_join::NO;
-    cudf::hash_join hj_obj(left_input, has_nulls, compare_nulls, stream);
-    return hj_obj.inner_join(right_input, std::nullopt, stream);
+                 cudf::null_equality compare_nulls) {
+    return cudf::inner_join(left_input, right_input, compare_nulls);
   };
-
   BM_join<Key, Nullable>(state, join);
 }
 
@@ -39,15 +33,9 @@ void nvbench_left_join(nvbench::state& state, nvbench::type_list<Key, nvbench::e
 {
   auto join = [](cudf::table_view const& left_input,
                  cudf::table_view const& right_input,
-                 cudf::null_equality compare_nulls,
-                 rmm::cuda_stream_view stream) {
-    auto const has_nulls = cudf::has_nested_nulls(left_input) || cudf::has_nested_nulls(right_input)
-                             ? cudf::nullable_join::YES
-                             : cudf::nullable_join::NO;
-    cudf::hash_join hj_obj(left_input, has_nulls, compare_nulls, stream);
-    return hj_obj.left_join(right_input, std::nullopt, stream);
+                 cudf::null_equality compare_nulls) {
+    return cudf::left_join(left_input, right_input, compare_nulls);
   };
-
   BM_join<Key, Nullable>(state, join);
 }
 
@@ -56,15 +44,9 @@ void nvbench_full_join(nvbench::state& state, nvbench::type_list<Key, nvbench::e
 {
   auto join = [](cudf::table_view const& left_input,
                  cudf::table_view const& right_input,
-                 cudf::null_equality compare_nulls,
-                 rmm::cuda_stream_view stream) {
-    auto const has_nulls = cudf::has_nested_nulls(left_input) || cudf::has_nested_nulls(right_input)
-                             ? cudf::nullable_join::YES
-                             : cudf::nullable_join::NO;
-    cudf::hash_join hj_obj(left_input, has_nulls, compare_nulls, stream);
-    return hj_obj.full_join(right_input, std::nullopt, stream);
+                 cudf::null_equality compare_nulls) {
+    return cudf::full_join(left_input, right_input, compare_nulls);
   };
-
   BM_join<Key, Nullable>(state, join);
 }
 
diff --git a/cpp/benchmarks/join/join_common.hpp b/cpp/benchmarks/join/join_common.hpp
index 9e23d28b363..e6792b9dbfb 100644
--- a/cpp/benchmarks/join/join_common.hpp
+++ b/cpp/benchmarks/join/join_common.hpp
@@ -178,6 +178,7 @@ void BM_join(state_type& state, Join JoinFunc)
     }
   }
   if constexpr (std::is_same_v<state_type, nvbench::state> and (join_type != join_t::CONDITIONAL)) {
+    state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
     if constexpr (join_type == join_t::MIXED) {
       auto const col_ref_left_0 = cudf::ast::column_reference(0);
       auto const col_ref_right_0 =
@@ -185,23 +186,19 @@ void BM_join(state_type& state, Join JoinFunc)
       auto left_zero_eq_right_zero =
         cudf::ast::operation(cudf::ast::ast_operator::EQUAL, col_ref_left_0, col_ref_right_0);
       state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
-        rmm::cuda_stream_view stream_view{launch.get_stream()};
         auto result = JoinFunc(left_table.select(columns_to_join),
                                right_table.select(columns_to_join),
                                left_table.select({1}),
                                right_table.select({1}),
                                left_zero_eq_right_zero,
-                               cudf::null_equality::UNEQUAL,
-                               stream_view);
+                               cudf::null_equality::UNEQUAL);
       });
     }
     if constexpr (join_type == join_t::HASH) {
       state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
-        rmm::cuda_stream_view stream_view{launch.get_stream()};
         auto result = JoinFunc(left_table.select(columns_to_join),
                                right_table.select(columns_to_join),
-                               cudf::null_equality::UNEQUAL,
-                               stream_view);
+                               cudf::null_equality::UNEQUAL);
       });
     }
   }
diff --git a/cpp/benchmarks/join/mixed_join.cu b/cpp/benchmarks/join/mixed_join.cu
index 129ea62e7a6..0345d1e93fa 100644
--- a/cpp/benchmarks/join/mixed_join.cu
+++ b/cpp/benchmarks/join/mixed_join.cu
@@ -25,8 +25,7 @@ void nvbench_mixed_inner_join(nvbench::state& state,
                  cudf::table_view const& left_conditional_input,
                  cudf::table_view const& right_conditional_input,
                  cudf::ast::operation binary_pred,
-                 cudf::null_equality compare_nulls,
-                 rmm::cuda_stream_view stream) {
+                 cudf::null_equality compare_nulls) {
     return cudf::mixed_inner_join(left_equality_input,
                                   right_equality_input,
                                   left_conditional_input,
@@ -47,8 +46,7 @@ void nvbench_mixed_left_join(nvbench::state& state,
                  cudf::table_view const& left_conditional_input,
                  cudf::table_view const& right_conditional_input,
                  cudf::ast::operation binary_pred,
-                 cudf::null_equality compare_nulls,
-                 rmm::cuda_stream_view stream) {
+                 cudf::null_equality compare_nulls) {
     return cudf::mixed_left_join(left_equality_input,
                                  right_equality_input,
                                  left_conditional_input,
@@ -69,8 +67,7 @@ void nvbench_mixed_full_join(nvbench::state& state,
                  cudf::table_view const& left_conditional_input,
                  cudf::table_view const& right_conditional_input,
                  cudf::ast::operation binary_pred,
-                 cudf::null_equality compare_nulls,
-                 rmm::cuda_stream_view stream) {
+                 cudf::null_equality compare_nulls) {
     return cudf::mixed_full_join(left_equality_input,
                                  right_equality_input,
                                  left_conditional_input,
@@ -91,8 +88,7 @@ void nvbench_mixed_left_semi_join(nvbench::state& state,
                  cudf::table_view const& left_conditional_input,
                  cudf::table_view const& right_conditional_input,
                  cudf::ast::operation binary_pred,
-                 cudf::null_equality compare_nulls,
-                 rmm::cuda_stream_view stream) {
+                 cudf::null_equality compare_nulls) {
     return cudf::mixed_left_semi_join(left_equality_input,
                                       right_equality_input,
                                       left_conditional_input,
@@ -113,8 +109,7 @@ void nvbench_mixed_left_anti_join(nvbench::state& state,
                  cudf::table_view const& left_conditional_input,
                  cudf::table_view const& right_conditional_input,
                  cudf::ast::operation binary_pred,
-                 cudf::null_equality compare_nulls,
-                 rmm::cuda_stream_view stream) {
+                 cudf::null_equality compare_nulls) {
     return cudf::mixed_left_anti_join(left_equality_input,
                                       right_equality_input,
                                       left_conditional_input,

From afd5522b31c522bab2f093f620e600e79662c433 Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Wed, 29 May 2024 12:03:02 -0500
Subject: [PATCH 270/842] add unit test setup for cudf_kafka (#15853)

Fixes #15841

Proposes adding a basic unit test setup for `cudf_kafka`.

Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/15853
---
 ci/run_cudf_kafka_pytests.sh                       |  9 +++++++++
 ci/run_custreamz_pytests.sh                        |  2 +-
 ci/run_dask_cudf_pytests.sh                        |  2 +-
 ci/test_python_other.sh                            |  4 ++++
 python/cudf_kafka/cudf_kafka/tests/__init__.py     |  0
 python/cudf_kafka/cudf_kafka/tests/test_version.py | 12 ++++++++++++
 python/cudf_kafka/pyproject.toml                   |  5 +++++
 7 files changed, 32 insertions(+), 2 deletions(-)
 create mode 100755 ci/run_cudf_kafka_pytests.sh
 create mode 100644 python/cudf_kafka/cudf_kafka/tests/__init__.py
 create mode 100644 python/cudf_kafka/cudf_kafka/tests/test_version.py

diff --git a/ci/run_cudf_kafka_pytests.sh b/ci/run_cudf_kafka_pytests.sh
new file mode 100755
index 00000000000..de227c84872
--- /dev/null
+++ b/ci/run_cudf_kafka_pytests.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+set -euo pipefail
+
+# Support invoking run_cudf_kafka_pytests.sh outside the script directory
+cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cudf_kafka/cudf_kafka
+
+pytest --cache-clear "$@" tests
diff --git a/ci/run_custreamz_pytests.sh b/ci/run_custreamz_pytests.sh
index 53e27ec64b3..67b152fc187 100755
--- a/ci/run_custreamz_pytests.sh
+++ b/ci/run_custreamz_pytests.sh
@@ -3,7 +3,7 @@
 
 set -euo pipefail
 
-# It is essential to cd into python/cudf/cudf as `pytest-xdist` + `coverage` seem to work only at this directory level.
+# It is essential to cd into python/custreamz/custreamz/ as `pytest-xdist` + `coverage` seem to work only at this directory level.
 
 # Support invoking run_custreamz_pytests.sh outside the script directory
 cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/custreamz/custreamz/
diff --git a/ci/run_dask_cudf_pytests.sh b/ci/run_dask_cudf_pytests.sh
index 07658c6d234..37aadb5fee9 100755
--- a/ci/run_dask_cudf_pytests.sh
+++ b/ci/run_dask_cudf_pytests.sh
@@ -3,7 +3,7 @@
 
 set -euo pipefail
 
-# It is essential to cd into python/cudf/cudf as `pytest-xdist` + `coverage` seem to work only at this directory level.
+# It is essential to cd into python/dask_cudf/dask_cudf/ as `pytest-xdist` + `coverage` seem to work only at this directory level.
 
 # Support invoking run_dask_cudf_pytests.sh outside the script directory
 cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/dask_cudf/dask_cudf/
diff --git a/ci/test_python_other.sh b/ci/test_python_other.sh
index cbc1dc1cb87..06a24773cae 100755
--- a/ci/test_python_other.sh
+++ b/ci/test_python_other.sh
@@ -36,6 +36,10 @@ DASK_DATAFRAME__QUERY_PLANNING=False ./ci/run_dask_cudf_pytests.sh \
   --dist=loadscope \
   .
 
+rapids-logger "pytest cudf_kafka"
+./ci/run_cudf_kafka_pytests.sh \
+  --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf-kafka.xml"
+
 rapids-logger "pytest custreamz"
 ./ci/run_custreamz_pytests.sh \
   --junitxml="${RAPIDS_TESTS_DIR}/junit-custreamz.xml" \
diff --git a/python/cudf_kafka/cudf_kafka/tests/__init__.py b/python/cudf_kafka/cudf_kafka/tests/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/python/cudf_kafka/cudf_kafka/tests/test_version.py b/python/cudf_kafka/cudf_kafka/tests/test_version.py
new file mode 100644
index 00000000000..2dc2846c4cf
--- /dev/null
+++ b/python/cudf_kafka/cudf_kafka/tests/test_version.py
@@ -0,0 +1,12 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import cudf_kafka
+
+
+def test_version_constants_are_populated():
+    # __git_commit__ will only be non-empty in a built distribution
+    assert isinstance(cudf_kafka.__git_commit__, str)
+
+    # __version__ should always be non-empty
+    assert isinstance(cudf_kafka.__version__, str)
+    assert len(cudf_kafka.__version__) > 0
diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml
index d34a1260422..9233d0e92dd 100644
--- a/python/cudf_kafka/pyproject.toml
+++ b/python/cudf_kafka/pyproject.toml
@@ -82,6 +82,11 @@ skip = [
     "__init__.py",
 ]
 
+[tool.pytest.ini_options]
+filterwarnings = [
+  "error"
+]
+
 [tool.scikit-build]
 build-dir = "build/{wheel_tag}"
 cmake.build-type = "Release"

From 7b02f4b0b5adcc30db106a0b63f7273c9dff1984 Mon Sep 17 00:00:00 2001
From: Ray Bell <rayjohnbell0@gmail.com>
Date: Wed, 29 May 2024 13:24:24 -0400
Subject: [PATCH 271/842] DOC: add linkcode to docs (#15860)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This PR adds a [source] button in the API docs which allows readers to jump into the code behind the API docs.

This is currently done in pandas e.g. https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html#pandas.DataFrame and below. The code is also copied and modified from the pandas repo (https://github.com/pandas-dev/pandas/blob/main/doc/source/conf.py#L637).

![Screenshot 2024-05-24 at 3 57 57 PM](https://github.com/rapidsai/cudf/assets/17162724/0bc04c1b-25c3-4d0f-a777-5e3fc42d0ce1)

Authors:
  - Ray Bell (https://github.com/raybellwaves)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15860
---
 docs/cudf/source/conf.py | 61 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)

diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py
index bcefa3fbdf8..73d8b4445d3 100644
--- a/docs/cudf/source/conf.py
+++ b/docs/cudf/source/conf.py
@@ -19,10 +19,12 @@
 import datetime
 import filecmp
 import glob
+import inspect
 import os
 import re
 import sys
 import tempfile
+import warnings
 import xml.etree.ElementTree as ET
 
 from docutils.nodes import Text
@@ -69,6 +71,7 @@ class PseudoLexer(RegexLexer):
     "sphinx.ext.autosummary",
     "sphinx_copybutton",
     "sphinx_remove_toctrees",
+    "sphinx.ext.linkcode",
     "numpydoc",
     "IPython.sphinxext.ipython_console_highlighting",
     "IPython.sphinxext.ipython_directive",
@@ -557,6 +560,64 @@ def on_missing_reference(app, env, node, contnode):
 ]
 
 
+# Needed for the [source] button on the API docs to link to the github code
+# based on pandas doc/source/conf.py
+def linkcode_resolve(domain, info) -> str | None:
+    """
+    Determine the URL corresponding to Python object
+    """
+    if domain != "py":
+        return None
+
+    modname = info["module"]
+    fullname = info["fullname"]
+
+    submod = sys.modules.get(modname)
+    if submod is None:
+        return None
+
+    obj = submod
+    for part in fullname.split("."):
+        try:
+            with warnings.catch_warnings():
+                # Accessing deprecated objects will generate noisy warnings
+                warnings.simplefilter("ignore", FutureWarning)
+                obj = getattr(obj, part)
+        except AttributeError:
+            return None
+
+    try:
+        fn = inspect.getsourcefile(inspect.unwrap(obj))
+    except TypeError:
+        try:  # property
+            fn = inspect.getsourcefile(inspect.unwrap(obj.fget))
+        except (AttributeError, TypeError):
+            fn = None
+    if not fn:
+        return None
+
+    try:
+        source, lineno = inspect.getsourcelines(obj)
+    except TypeError:
+        try:  # property
+            source, lineno = inspect.getsourcelines(obj.fget)
+        except (AttributeError, TypeError):
+            lineno = None
+    except OSError:
+        lineno = None
+
+    if lineno:
+        linespec = f"#L{lineno}-L{lineno + len(source) - 1}"
+    else:
+        linespec = ""
+
+    fn = os.path.relpath(fn, start=os.path.dirname(cudf.__file__))
+    return (
+        f"https://github.com/rapidsai/cudf/blob/"
+        f"branch-{version}/python/cudf/cudf/{fn}{linespec}"
+    )
+
+
 def setup(app):
     app.add_css_file("https://docs.rapids.ai/assets/css/custom.css")
     app.add_js_file(

From eafa570c24a2130292894dd91b68e57edfcbcc96 Mon Sep 17 00:00:00 2001
From: Matt Topol <zotthewizard@gmail.com>
Date: Wed, 29 May 2024 14:46:54 -0400
Subject: [PATCH 272/842] Add `from_arrow_host` functions for cudf interop with
 nanoarrow (#15645)

Following up from #15458 and continuing the work to address #14926 adding host memory version of `from_arrow_device` which will perform the copies from host memory to create cudf objects.

Authors:
  - Matt Topol (https://github.com/zeroshade)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Paul Mattione (https://github.com/pmattione-nvidia)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/15645
---
 cpp/CMakeLists.txt                           |   3 +-
 cpp/include/cudf/interop.hpp                 |  91 ++-
 cpp/src/interop/arrow_utilities.cpp          |  90 +++
 cpp/src/interop/arrow_utilities.hpp          |  21 +
 cpp/src/interop/from_arrow_device.cu         | 109 ++--
 cpp/src/interop/from_arrow_host.cu           | 492 +++++++++++++++
 cpp/src/interop/to_arrow_device.cu           |   1 -
 cpp/src/interop/to_arrow_schema.cpp          |   2 +-
 cpp/src/interop/to_arrow_utilities.cpp       |  44 --
 cpp/src/interop/to_arrow_utilities.hpp       |  34 --
 cpp/tests/CMakeLists.txt                     |   1 +
 cpp/tests/interop/from_arrow_device_test.cpp |  12 +-
 cpp/tests/interop/from_arrow_host_test.cpp   | 612 +++++++++++++++++++
 cpp/tests/interop/nanoarrow_utils.hpp        | 236 +++++++
 cpp/tests/interop/to_arrow_device_test.cpp   | 107 ++--
 15 files changed, 1631 insertions(+), 224 deletions(-)
 create mode 100644 cpp/src/interop/arrow_utilities.cpp
 create mode 100644 cpp/src/interop/from_arrow_host.cu
 delete mode 100644 cpp/src/interop/to_arrow_utilities.cpp
 delete mode 100644 cpp/src/interop/to_arrow_utilities.hpp
 create mode 100644 cpp/tests/interop/from_arrow_host_test.cpp

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index f69f04f9c10..f637db66c2c 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -360,11 +360,12 @@ add_library(
   src/hash/xxhash_64.cu
   src/interop/dlpack.cpp
   src/interop/from_arrow.cu
+  src/interop/arrow_utilities.cpp
   src/interop/to_arrow.cu
   src/interop/to_arrow_device.cu
   src/interop/from_arrow_device.cu
+  src/interop/from_arrow_host.cu
   src/interop/to_arrow_schema.cpp
-  src/interop/to_arrow_utilities.cpp
   src/interop/detail/arrow_allocator.cpp
   src/io/avro/avro.cpp
   src/io/avro/avro_gpu.cu
diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp
index bb05a622f40..f3ff0009d5c 100644
--- a/cpp/include/cudf/interop.hpp
+++ b/cpp/include/cudf/interop.hpp
@@ -46,6 +46,8 @@ struct ArrowDeviceArray;
 
 struct ArrowSchema;
 
+struct ArrowArray;
+
 namespace cudf {
 /**
  * @addtogroup interop_dlpack
@@ -348,6 +350,91 @@ std::unique_ptr<cudf::scalar> from_arrow(
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @brief Create `cudf::table` from given ArrowArray and ArrowSchema input
+ *
+ * @throws std::invalid_argument if either schema or input are NULL
+ *
+ * @throws cudf::data_type_error if the input array is not a struct array.
+ *
+ * The conversion will not call release on the input Array.
+ *
+ * @param schema `ArrowSchema` pointer to describe the type of the data
+ * @param input `ArrowArray` pointer that needs to be converted to cudf::table
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate `cudf::table`
+ * @return cudf table generated from given arrow data
+ */
+std::unique_ptr<cudf::table> from_arrow(ArrowSchema const* schema,
+                                        ArrowArray const* input,
+                                        rmm::cuda_stream_view stream,
+                                        rmm::mr::device_memory_resource* mr);
+
+/**
+ * @brief Create `cudf::column` from a given ArrowArray and ArrowSchema input
+ *
+ * @throws std::invalid_argument if either schema or input are NULL
+ *
+ * The conversion will not call release on the input Array.
+ *
+ * @param schema `ArrowSchema` pointer to describe the type of the data
+ * @param input `ArrowArray` pointer that needs to be converted to cudf::column
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate `cudf::column`
+ * @return cudf column generated from given arrow data
+ */
+std::unique_ptr<cudf::column> from_arrow_column(ArrowSchema const* schema,
+                                                ArrowArray const* input,
+                                                rmm::cuda_stream_view stream,
+                                                rmm::mr::device_memory_resource* mr);
+
+/**
+ * @brief Create `cudf::table` from given ArrowDeviceArray input
+ *
+ * @throws std::invalid_argument if either schema or input are NULL
+ *
+ * @throws std::invalid_argument if the device_type is not `ARROW_DEVICE_CPU`
+ *
+ * @throws cudf::data_type_error if the input array is not a struct array,
+ * non-struct arrays should be passed to `from_arrow_host_column` instead.
+ *
+ * The conversion will not call release on the input Array.
+ *
+ * @param schema `ArrowSchema` pointer to describe the type of the data
+ * @param input `ArrowDeviceArray` pointer to object owning the Arrow data
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to perform cuda allocation
+ * @return cudf table generated from the given Arrow data
+ */
+std::unique_ptr<table> from_arrow_host(
+  ArrowSchema const* schema,
+  ArrowDeviceArray const* input,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Create `cudf::column` from given ArrowDeviceArray input
+ *
+ * @throws std::invalid_argument if either schema or input are NULL
+ *
+ * @throws std::invalid_argument if the device_type is not `ARROW_DEVICE_CPU`
+ *
+ * @throws cudf::data_type_error if input arrow data type is not supported in cudf.
+ *
+ * The conversion will not call release on the input Array.
+ *
+ * @param schema `ArrowSchema` pointer to describe the type of the data
+ * @param input `ArrowDeviceArray` pointer to object owning the Arrow data
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to perform cuda allocation
+ * @return cudf column generated from the given Arrow data
+ */
+std::unique_ptr<column> from_arrow_host_column(
+  ArrowSchema const* schema,
+  ArrowDeviceArray const* input,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 /**
  * @brief typedef for a vector of owning columns, used for conversion from ArrowDeviceArray
  *
@@ -398,7 +485,7 @@ using unique_table_view_t =
  * `ArrowDeviceArray` after it is no longer needed, and that the `cudf::table_view` is not
  * accessed after this happens.
  *
- * @throws cudf::logic_error if device_type is not `ARROW_DEVICE_CUDA`, `ARROW_DEVICE_CUDA_HOST`
+ * @throws std::invalid_argument if device_type is not `ARROW_DEVICE_CUDA`, `ARROW_DEVICE_CUDA_HOST`
  * or `ARROW_DEVICE_CUDA_MANAGED`
  *
  * @throws cudf::data_type_error if the input array is not a struct array, non-struct
@@ -446,7 +533,7 @@ using unique_column_view_t =
  * `ArrowDeviceArray` after it is no longer needed, and that the `cudf::column_view` is not
  * accessed after this happens.
  *
- * @throws cudf::logic_error if device_type is not `ARROW_DEVICE_CUDA`, `ARROW_DEVICE_CUDA_HOST`
+ * @throws std::invalid_argument if device_type is not `ARROW_DEVICE_CUDA`, `ARROW_DEVICE_CUDA_HOST`
  * or `ARROW_DEVICE_CUDA_MANAGED`
  *
  * @throws cudf::data_type_error input arrow data type is not supported.
diff --git a/cpp/src/interop/arrow_utilities.cpp b/cpp/src/interop/arrow_utilities.cpp
new file mode 100644
index 00000000000..05beecfbf9b
--- /dev/null
+++ b/cpp/src/interop/arrow_utilities.cpp
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arrow_utilities.hpp"
+
+#include <cudf/types.hpp>
+#include <cudf/utilities/error.hpp>
+
+#include <nanoarrow/nanoarrow.h>
+
+namespace cudf {
+namespace detail {
+data_type arrow_to_cudf_type(const ArrowSchemaView* arrow_view)
+{
+  switch (arrow_view->type) {
+    case NANOARROW_TYPE_NA: return data_type(type_id::EMPTY);
+    case NANOARROW_TYPE_BOOL: return data_type(type_id::BOOL8);
+    case NANOARROW_TYPE_INT8: return data_type(type_id::INT8);
+    case NANOARROW_TYPE_INT16: return data_type(type_id::INT16);
+    case NANOARROW_TYPE_INT32: return data_type(type_id::INT32);
+    case NANOARROW_TYPE_INT64: return data_type(type_id::INT64);
+    case NANOARROW_TYPE_UINT8: return data_type(type_id::UINT8);
+    case NANOARROW_TYPE_UINT16: return data_type(type_id::UINT16);
+    case NANOARROW_TYPE_UINT32: return data_type(type_id::UINT32);
+    case NANOARROW_TYPE_UINT64: return data_type(type_id::UINT64);
+    case NANOARROW_TYPE_FLOAT: return data_type(type_id::FLOAT32);
+    case NANOARROW_TYPE_DOUBLE: return data_type(type_id::FLOAT64);
+    case NANOARROW_TYPE_DATE32: return data_type(type_id::TIMESTAMP_DAYS);
+    case NANOARROW_TYPE_STRING: return data_type(type_id::STRING);
+    case NANOARROW_TYPE_LIST: return data_type(type_id::LIST);
+    case NANOARROW_TYPE_DICTIONARY: return data_type(type_id::DICTIONARY32);
+    case NANOARROW_TYPE_STRUCT: return data_type(type_id::STRUCT);
+    case NANOARROW_TYPE_TIMESTAMP: {
+      switch (arrow_view->time_unit) {
+        case NANOARROW_TIME_UNIT_SECOND: return data_type(type_id::TIMESTAMP_SECONDS);
+        case NANOARROW_TIME_UNIT_MILLI: return data_type(type_id::TIMESTAMP_MILLISECONDS);
+        case NANOARROW_TIME_UNIT_MICRO: return data_type(type_id::TIMESTAMP_MICROSECONDS);
+        case NANOARROW_TIME_UNIT_NANO: return data_type(type_id::TIMESTAMP_NANOSECONDS);
+        default: CUDF_FAIL("Unsupported timestamp unit in arrow", cudf::data_type_error);
+      }
+    }
+    case NANOARROW_TYPE_DURATION: {
+      switch (arrow_view->time_unit) {
+        case NANOARROW_TIME_UNIT_SECOND: return data_type(type_id::DURATION_SECONDS);
+        case NANOARROW_TIME_UNIT_MILLI: return data_type(type_id::DURATION_MILLISECONDS);
+        case NANOARROW_TIME_UNIT_MICRO: return data_type(type_id::DURATION_MICROSECONDS);
+        case NANOARROW_TIME_UNIT_NANO: return data_type(type_id::DURATION_NANOSECONDS);
+        default: CUDF_FAIL("Unsupported duration unit in arrow", cudf::data_type_error);
+      }
+    }
+    case NANOARROW_TYPE_DECIMAL128:
+      return data_type{type_id::DECIMAL128, -arrow_view->decimal_scale};
+    default: CUDF_FAIL("Unsupported type_id conversion to cudf", cudf::data_type_error);
+  }
+}
+
+ArrowType id_to_arrow_type(cudf::type_id id)
+{
+  switch (id) {
+    case cudf::type_id::BOOL8: return NANOARROW_TYPE_BOOL;
+    case cudf::type_id::INT8: return NANOARROW_TYPE_INT8;
+    case cudf::type_id::INT16: return NANOARROW_TYPE_INT16;
+    case cudf::type_id::INT32: return NANOARROW_TYPE_INT32;
+    case cudf::type_id::INT64: return NANOARROW_TYPE_INT64;
+    case cudf::type_id::UINT8: return NANOARROW_TYPE_UINT8;
+    case cudf::type_id::UINT16: return NANOARROW_TYPE_UINT16;
+    case cudf::type_id::UINT32: return NANOARROW_TYPE_UINT32;
+    case cudf::type_id::UINT64: return NANOARROW_TYPE_UINT64;
+    case cudf::type_id::FLOAT32: return NANOARROW_TYPE_FLOAT;
+    case cudf::type_id::FLOAT64: return NANOARROW_TYPE_DOUBLE;
+    case cudf::type_id::TIMESTAMP_DAYS: return NANOARROW_TYPE_DATE32;
+    default: CUDF_FAIL("Unsupported type_id conversion to arrow type", cudf::data_type_error);
+  }
+}
+
+}  // namespace detail
+}  // namespace cudf
diff --git a/cpp/src/interop/arrow_utilities.hpp b/cpp/src/interop/arrow_utilities.hpp
index 9bbdaa2c363..defddb4dc42 100644
--- a/cpp/src/interop/arrow_utilities.hpp
+++ b/cpp/src/interop/arrow_utilities.hpp
@@ -16,6 +16,11 @@
 
 #pragma once
 
+#include <cudf/types.hpp>
+
+#include <nanoarrow/nanoarrow.h>
+#include <nanoarrow/nanoarrow_types.h>
+
 namespace cudf {
 namespace detail {
 
@@ -26,5 +31,21 @@ namespace detail {
 static constexpr int validity_buffer_idx         = 0;
 static constexpr int fixed_width_data_buffer_idx = 1;
 
+/**
+ * @brief Map ArrowType id to cudf column type id
+ *
+ * @param arrow_view SchemaView to pull the logical and storage types from
+ * @return Column type id
+ */
+data_type arrow_to_cudf_type(const ArrowSchemaView* arrow_view);
+
+/**
+ * @brief Map cudf column type id to ArrowType id
+ *
+ * @param id Column type id
+ * @return ArrowType id
+ */
+ArrowType id_to_arrow_type(cudf::type_id id);
+
 }  // namespace detail
 }  // namespace cudf
diff --git a/cpp/src/interop/from_arrow_device.cu b/cpp/src/interop/from_arrow_device.cu
index d4d31d1989b..002a8ec1f14 100644
--- a/cpp/src/interop/from_arrow_device.cu
+++ b/cpp/src/interop/from_arrow_device.cu
@@ -42,49 +42,6 @@
 namespace cudf {
 
 namespace detail {
-data_type arrow_to_cudf_type(const ArrowSchemaView* arrow_view)
-{
-  switch (arrow_view->type) {
-    case NANOARROW_TYPE_NA: return data_type(type_id::EMPTY);
-    case NANOARROW_TYPE_BOOL: return data_type(type_id::BOOL8);
-    case NANOARROW_TYPE_INT8: return data_type(type_id::INT8);
-    case NANOARROW_TYPE_INT16: return data_type(type_id::INT16);
-    case NANOARROW_TYPE_INT32: return data_type(type_id::INT32);
-    case NANOARROW_TYPE_INT64: return data_type(type_id::INT64);
-    case NANOARROW_TYPE_UINT8: return data_type(type_id::UINT8);
-    case NANOARROW_TYPE_UINT16: return data_type(type_id::UINT16);
-    case NANOARROW_TYPE_UINT32: return data_type(type_id::UINT32);
-    case NANOARROW_TYPE_UINT64: return data_type(type_id::UINT64);
-    case NANOARROW_TYPE_FLOAT: return data_type(type_id::FLOAT32);
-    case NANOARROW_TYPE_DOUBLE: return data_type(type_id::FLOAT64);
-    case NANOARROW_TYPE_DATE32: return data_type(type_id::TIMESTAMP_DAYS);
-    case NANOARROW_TYPE_STRING: return data_type(type_id::STRING);
-    case NANOARROW_TYPE_LIST: return data_type(type_id::LIST);
-    case NANOARROW_TYPE_DICTIONARY: return data_type(type_id::DICTIONARY32);
-    case NANOARROW_TYPE_STRUCT: return data_type(type_id::STRUCT);
-    case NANOARROW_TYPE_TIMESTAMP: {
-      switch (arrow_view->time_unit) {
-        case NANOARROW_TIME_UNIT_SECOND: return data_type(type_id::TIMESTAMP_SECONDS);
-        case NANOARROW_TIME_UNIT_MILLI: return data_type(type_id::TIMESTAMP_MILLISECONDS);
-        case NANOARROW_TIME_UNIT_MICRO: return data_type(type_id::TIMESTAMP_MICROSECONDS);
-        case NANOARROW_TIME_UNIT_NANO: return data_type(type_id::TIMESTAMP_NANOSECONDS);
-        default: CUDF_FAIL("Unsupported timestamp unit in arrow", cudf::data_type_error);
-      }
-    }
-    case NANOARROW_TYPE_DURATION: {
-      switch (arrow_view->time_unit) {
-        case NANOARROW_TIME_UNIT_SECOND: return data_type(type_id::DURATION_SECONDS);
-        case NANOARROW_TIME_UNIT_MILLI: return data_type(type_id::DURATION_MILLISECONDS);
-        case NANOARROW_TIME_UNIT_MICRO: return data_type(type_id::DURATION_MICROSECONDS);
-        case NANOARROW_TIME_UNIT_NANO: return data_type(type_id::DURATION_NANOSECONDS);
-        default: CUDF_FAIL("Unsupported duration unit in arrow", cudf::data_type_error);
-      }
-    }
-    case NANOARROW_TYPE_DECIMAL128:
-      return data_type{type_id::DECIMAL128, -arrow_view->decimal_scale};
-    default: CUDF_FAIL("Unsupported type_id conversion to cudf", cudf::data_type_error);
-  }
-}
 
 namespace {
 
@@ -379,11 +336,25 @@ dispatch_tuple_t get_column(ArrowSchemaView* schema,
 
 }  // namespace
 
-unique_table_view_t from_arrow_device(ArrowSchemaView* schema,
+unique_table_view_t from_arrow_device(ArrowSchema const* schema,
                                       ArrowDeviceArray const* input,
                                       rmm::cuda_stream_view stream,
                                       rmm::mr::device_memory_resource* mr)
 {
+  CUDF_EXPECTS(schema != nullptr && input != nullptr,
+               "input ArrowSchema and ArrowDeviceArray must not be NULL",
+               std::invalid_argument);
+  CUDF_EXPECTS(input->device_type == ARROW_DEVICE_CUDA ||
+                 input->device_type == ARROW_DEVICE_CUDA_HOST ||
+                 input->device_type == ARROW_DEVICE_CUDA_MANAGED,
+               "ArrowDeviceArray memory must be accessible to CUDA",
+               std::invalid_argument);
+
+  rmm::cuda_set_device_raii dev(
+    rmm::cuda_device_id{static_cast<rmm::cuda_device_id::value_type>(input->device_id)});
+  ArrowSchemaView view;
+  NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, schema, nullptr));
+
   if (input->sync_event != nullptr) {
     CUDF_CUDA_TRY(
       cudaStreamWaitEvent(stream.value(), *reinterpret_cast<cudaEvent_t*>(input->sync_event)));
@@ -392,14 +363,14 @@ unique_table_view_t from_arrow_device(ArrowSchemaView* schema,
   std::vector<column_view> columns;
   owned_columns_t owned_mem;
 
-  auto type = arrow_to_cudf_type(schema);
+  auto type = arrow_to_cudf_type(&view);
   CUDF_EXPECTS(type == data_type(type_id::STRUCT),
                "Must pass a struct to `from_arrow_device`",
                cudf::data_type_error);
   std::transform(
     input->array.children,
     input->array.children + input->array.n_children,
-    schema->schema->children,
+    view.schema->children,
     std::back_inserter(columns),
     [&owned_mem, &stream, &mr](ArrowArray const* child, ArrowSchema const* child_schema) {
       ArrowSchemaView view;
@@ -420,18 +391,32 @@ unique_table_view_t from_arrow_device(ArrowSchemaView* schema,
                              custom_view_deleter<cudf::table_view>{std::move(owned_mem)}};
 }
 
-unique_column_view_t from_arrow_device_column(ArrowSchemaView* schema,
+unique_column_view_t from_arrow_device_column(ArrowSchema const* schema,
                                               ArrowDeviceArray const* input,
                                               rmm::cuda_stream_view stream,
                                               rmm::mr::device_memory_resource* mr)
 {
+  CUDF_EXPECTS(schema != nullptr && input != nullptr,
+               "input ArrowSchema and ArrowDeviceArray must not be NULL",
+               std::invalid_argument);
+  CUDF_EXPECTS(input->device_type == ARROW_DEVICE_CUDA ||
+                 input->device_type == ARROW_DEVICE_CUDA_HOST ||
+                 input->device_type == ARROW_DEVICE_CUDA_MANAGED,
+               "ArrowDeviceArray must be accessible to CUDA",
+               std::invalid_argument);
+
+  rmm::cuda_set_device_raii dev(
+    rmm::cuda_device_id{static_cast<rmm::cuda_device_id::value_type>(input->device_id)});
+  ArrowSchemaView view;
+  NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, schema, nullptr));
+
   if (input->sync_event != nullptr) {
     CUDF_CUDA_TRY(
       cudaStreamWaitEvent(stream.value(), *reinterpret_cast<cudaEvent_t*>(input->sync_event)));
   }
 
-  auto type             = arrow_to_cudf_type(schema);
-  auto [colview, owned] = get_column(schema, &input->array, type, false, stream, mr);
+  auto type             = arrow_to_cudf_type(&view);
+  auto [colview, owned] = get_column(&view, &input->array, type, false, stream, mr);
   return unique_column_view_t{new column_view{colview},
                               custom_view_deleter<cudf::column_view>{std::move(owned)}};
 }
@@ -443,20 +428,9 @@ unique_table_view_t from_arrow_device(ArrowSchema const* schema,
                                       rmm::cuda_stream_view stream,
                                       rmm::mr::device_memory_resource* mr)
 {
-  CUDF_EXPECTS(schema != nullptr && input != nullptr,
-               "input ArrowSchema and ArrowDeviceArray must not be NULL");
-  CUDF_EXPECTS(input->device_type == ARROW_DEVICE_CUDA ||
-                 input->device_type == ARROW_DEVICE_CUDA_HOST ||
-                 input->device_type == ARROW_DEVICE_CUDA_MANAGED,
-               "ArrowDeviceArray memory must be accessible to CUDA");
-
   CUDF_FUNC_RANGE();
 
-  rmm::cuda_set_device_raii dev(
-    rmm::cuda_device_id{static_cast<rmm::cuda_device_id::value_type>(input->device_id)});
-  ArrowSchemaView view;
-  NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, schema, nullptr));
-  return detail::from_arrow_device(&view, input, stream, mr);
+  return detail::from_arrow_device(schema, input, stream, mr);
 }
 
 unique_column_view_t from_arrow_device_column(ArrowSchema const* schema,
@@ -464,20 +438,9 @@ unique_column_view_t from_arrow_device_column(ArrowSchema const* schema,
                                               rmm::cuda_stream_view stream,
                                               rmm::mr::device_memory_resource* mr)
 {
-  CUDF_EXPECTS(schema != nullptr && input != nullptr,
-               "input ArrowSchema and ArrowDeviceArray must not be NULL");
-  CUDF_EXPECTS(input->device_type == ARROW_DEVICE_CUDA ||
-                 input->device_type == ARROW_DEVICE_CUDA_HOST ||
-                 input->device_type == ARROW_DEVICE_CUDA_MANAGED,
-               "ArrowDeviceArray must be accessible to CUDA");
-
   CUDF_FUNC_RANGE();
 
-  rmm::cuda_set_device_raii dev(
-    rmm::cuda_device_id{static_cast<rmm::cuda_device_id::value_type>(input->device_id)});
-  ArrowSchemaView view;
-  NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, schema, nullptr));
-  return detail::from_arrow_device_column(&view, input, stream, mr);
+  return detail::from_arrow_device_column(schema, input, stream, mr);
 }
 
 }  // namespace cudf
diff --git a/cpp/src/interop/from_arrow_host.cu b/cpp/src/interop/from_arrow_host.cu
new file mode 100644
index 00000000000..36bb35d9419
--- /dev/null
+++ b/cpp/src/interop/from_arrow_host.cu
@@ -0,0 +1,492 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arrow_utilities.hpp"
+
+#include <cudf/column/column_factories.hpp>
+#include <cudf/column/column_view.hpp>
+#include <cudf/copying.hpp>
+#include <cudf/detail/copy.hpp>
+#include <cudf/detail/get_value.cuh>
+#include <cudf/detail/interop.hpp>
+#include <cudf/detail/null_mask.hpp>
+#include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/detail/transform.hpp>
+#include <cudf/detail/unary.hpp>
+#include <cudf/dictionary/dictionary_factories.hpp>
+#include <cudf/interop.hpp>
+#include <cudf/interop/detail/arrow.hpp>
+#include <cudf/table/table_view.hpp>
+#include <cudf/types.hpp>
+#include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/traits.hpp>
+#include <cudf/utilities/type_dispatcher.hpp>
+
+#include <rmm/cuda_device.hpp>
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_buffer.hpp>
+
+#include <nanoarrow/nanoarrow.h>
+#include <nanoarrow/nanoarrow.hpp>
+
+namespace cudf {
+namespace detail {
+
+namespace {
+
+struct dispatch_copy_from_arrow_host {
+  rmm::cuda_stream_view stream;
+  rmm::mr::device_memory_resource* mr;
+
+  std::unique_ptr<rmm::device_buffer> get_mask_buffer(ArrowArray const* array)
+  {
+    auto* bitmap = array->buffers[validity_buffer_idx];
+    if (bitmap == nullptr) { return std::make_unique<rmm::device_buffer>(0, stream, mr); }
+
+    auto const bitmask_size = array->length + array->offset;
+    auto const allocation_size =
+      bitmask_allocation_size_bytes(static_cast<size_type>(bitmask_size));
+    auto mask = std::make_unique<rmm::device_buffer>(allocation_size, stream, mr);
+    CUDF_CUDA_TRY(cudaMemcpyAsync(mask->data(),
+                                  reinterpret_cast<uint8_t const*>(bitmap),
+                                  allocation_size,
+                                  cudaMemcpyDefault,
+                                  stream.value()));
+    return mask;
+  }
+
+  template <typename T,
+            CUDF_ENABLE_IF(not is_rep_layout_compatible<T>() &&
+                           !std::is_same_v<T, numeric::decimal128>)>
+  std::unique_ptr<column> operator()(ArrowSchemaView*, ArrowArray const*, data_type, bool)
+  {
+    CUDF_FAIL("Unsupported type in copy_from_arrow_host.");
+  }
+
+  template <typename T,
+            CUDF_ENABLE_IF(is_rep_layout_compatible<T>() || std::is_same_v<T, numeric::decimal128>)>
+  std::unique_ptr<column> operator()(ArrowSchemaView* schema,
+                                     ArrowArray const* input,
+                                     data_type type,
+                                     bool skip_mask)
+  {
+    using DeviceType = std::conditional_t<std::is_same_v<T, numeric::decimal128>, __int128_t, T>;
+
+    size_type const num_rows   = input->length;
+    size_type const offset     = input->offset;
+    size_type const null_count = input->null_count;
+    auto data_buffer           = input->buffers[fixed_width_data_buffer_idx];
+
+    auto const has_nulls = skip_mask ? false : input->buffers[validity_buffer_idx] != nullptr;
+    auto col = make_fixed_width_column(type, num_rows, mask_state::UNALLOCATED, stream, mr);
+    auto mutable_column_view = col->mutable_view();
+    CUDF_CUDA_TRY(
+      cudaMemcpyAsync(mutable_column_view.data<DeviceType>(),
+                      reinterpret_cast<uint8_t const*>(data_buffer) + offset * sizeof(DeviceType),
+                      sizeof(DeviceType) * num_rows,
+                      cudaMemcpyDefault,
+                      stream.value()));
+
+    if (has_nulls) {
+      auto tmp_mask = get_mask_buffer(input);
+
+      // if array is sliced, we have to copy the whole mask and then take copy
+      auto out_mask =
+        (offset == 0)
+          ? std::move(*tmp_mask)
+          : cudf::detail::copy_bitmask(
+              static_cast<bitmask_type*>(tmp_mask->data()), offset, offset + num_rows, stream, mr);
+
+      col->set_null_mask(std::move(out_mask), null_count);
+    }
+
+    return col;
+  }
+};
+
+// forward declaration is needed because `type_dispatch` instantiates the
+// dispatch_copy_from_arrow_host struct causing a recursive situation for struct,
+// dictionary and list_view types.
+//
+// This function is simply a convenience wrapper around the dispatch functor with
+// some extra handling to avoid having to reproduce it for all of the nested types.
+// It also allows us to centralize the location where the recursive calls happen
+// so that we only need to forward declare this one function, rather than multiple
+// functions which handle the overloads for nested types (list, struct, etc.)
+std::unique_ptr<column> get_column_copy(ArrowSchemaView* schema,
+                                        ArrowArray const* input,
+                                        data_type type,
+                                        bool skip_mask,
+                                        rmm::cuda_stream_view stream,
+                                        rmm::mr::device_memory_resource* mr);
+
+template <>
+std::unique_ptr<column> dispatch_copy_from_arrow_host::operator()<bool>(ArrowSchemaView* schema,
+                                                                        ArrowArray const* input,
+                                                                        data_type type,
+                                                                        bool skip_mask)
+{
+  auto data_buffer         = input->buffers[fixed_width_data_buffer_idx];
+  const auto buffer_length = bitmask_allocation_size_bytes(input->length + input->offset);
+
+  auto data = rmm::device_buffer(buffer_length, stream, mr);
+  CUDF_CUDA_TRY(cudaMemcpyAsync(data.data(),
+                                reinterpret_cast<uint8_t const*>(data_buffer),
+                                buffer_length,
+                                cudaMemcpyDefault,
+                                stream.value()));
+  auto out_col = mask_to_bools(static_cast<bitmask_type*>(data.data()),
+                               input->offset,
+                               input->offset + input->length,
+                               stream,
+                               mr);
+
+  auto const has_nulls = skip_mask ? false : input->buffers[validity_buffer_idx] != nullptr;
+  if (has_nulls) {
+    auto out_mask = detail::copy_bitmask(static_cast<bitmask_type*>(get_mask_buffer(input)->data()),
+                                         input->offset,
+                                         input->offset + input->length,
+                                         stream,
+                                         mr);
+
+    out_col->set_null_mask(std::move(out_mask), input->null_count);
+  }
+
+  return out_col;
+}
+
+template <>
+std::unique_ptr<column> dispatch_copy_from_arrow_host::operator()<cudf::string_view>(
+  ArrowSchemaView* schema, ArrowArray const* input, data_type type, bool skip_mask)
+{
+  if (input->length == 0) { return make_empty_column(type_id::STRING); }
+
+  // offsets column should contain no nulls so we can put nullptr for the bitmask
+  // nulls are tracked in the parent string column itself, not in the offsets
+  void const* offset_buffers[2] = {nullptr, input->buffers[fixed_width_data_buffer_idx]};
+  ArrowArray offsets_array      = {
+         .length     = input->offset + input->length + 1,
+         .null_count = 0,
+         .offset     = 0,
+         .n_buffers  = 2,
+         .n_children = 0,
+         .buffers    = offset_buffers,
+  };
+
+  // chars_column does not contain any nulls, they are tracked by the parent string column
+  // itself instead. So we pass nullptr for the validity bitmask.
+  size_type const char_data_length =
+    reinterpret_cast<int32_t const*>(offset_buffers[1])[input->length + input->offset];
+  void const* char_buffers[2] = {nullptr, input->buffers[2]};
+  ArrowArray char_array       = {
+          .length     = char_data_length,
+          .null_count = 0,
+          .offset     = 0,
+          .n_buffers  = 2,
+          .n_children = 0,
+          .buffers    = char_buffers,
+  };
+
+  nanoarrow::UniqueSchema offset_schema;
+  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(offset_schema.get(), NANOARROW_TYPE_INT32));
+
+  nanoarrow::UniqueSchema char_data_schema;
+  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(char_data_schema.get(), NANOARROW_TYPE_INT8));
+
+  // leverage the dispatch overloads for int32 and char(int8) to generate the child
+  // offset and char data columns for us.
+  ArrowSchemaView view;
+  NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, offset_schema.get(), nullptr));
+  auto offsets_column =
+    this->operator()<int32_t>(&view, &offsets_array, data_type(type_id::INT32), true);
+  NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, char_data_schema.get(), nullptr));
+  auto chars_column = this->operator()<int8_t>(&view, &char_array, data_type(type_id::INT8), true);
+
+  auto const num_rows = offsets_column->size() - 1;
+  auto out_col        = make_strings_column(num_rows,
+                                     std::move(offsets_column),
+                                     std::move(chars_column->release().data.release()[0]),
+                                     input->null_count,
+                                     std::move(*get_mask_buffer(input)));
+
+  return input->offset == 0
+           ? std::move(out_col)
+           : std::make_unique<column>(
+               cudf::detail::slice(out_col->view(),
+                                   static_cast<size_type>(input->offset),
+                                   static_cast<size_type>(input->offset + input->length),
+                                   stream),
+               stream,
+               mr);
+}
+
+template <>
+std::unique_ptr<column> dispatch_copy_from_arrow_host::operator()<cudf::dictionary32>(
+  ArrowSchemaView* schema, ArrowArray const* input, data_type type, bool skip_mask)
+{
+  ArrowSchemaView keys_schema_view;
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaViewInit(&keys_schema_view, schema->schema->dictionary, nullptr));
+
+  auto const keys_type = arrow_to_cudf_type(&keys_schema_view);
+  auto keys_column =
+    get_column_copy(&keys_schema_view, input->dictionary, keys_type, true, stream, mr);
+
+  auto const dict_indices_type = [&schema]() -> data_type {
+    // cudf dictionary requires an unsigned type for the indices,
+    // since it is invalid for an arrow dictionary to contain negative
+    // indices, we can safely use the unsigned equivalent without having
+    // to modify the buffers.
+    switch (schema->storage_type) {
+      case NANOARROW_TYPE_INT8:
+      case NANOARROW_TYPE_UINT8: return data_type(type_id::UINT8);
+      case NANOARROW_TYPE_INT16:
+      case NANOARROW_TYPE_UINT16: return data_type(type_id::UINT16);
+      case NANOARROW_TYPE_INT32:
+      case NANOARROW_TYPE_UINT32: return data_type(type_id::UINT32);
+      case NANOARROW_TYPE_INT64:
+      case NANOARROW_TYPE_UINT64: return data_type(type_id::UINT64);
+      default: CUDF_FAIL("Unsupported type_id for dictionary indices", cudf::data_type_error);
+    }
+  }();
+
+  auto indices_column = get_column_copy(schema, input, dict_indices_type, false, stream, mr);
+  // child columns shouldn't have masks and we need the mask in the main column
+  auto column_contents = indices_column->release();
+  indices_column       = std::make_unique<column>(dict_indices_type,
+                                            static_cast<size_type>(input->length),
+                                            std::move(*(column_contents.data)),
+                                            rmm::device_buffer{},
+                                            0);
+
+  return make_dictionary_column(std::move(keys_column),
+                                std::move(indices_column),
+                                std::move(*(column_contents.null_mask)),
+                                input->null_count);
+}
+
+template <>
+std::unique_ptr<column> dispatch_copy_from_arrow_host::operator()<cudf::struct_view>(
+  ArrowSchemaView* schema, ArrowArray const* input, data_type type, bool skip_mask)
+{
+  std::vector<std::unique_ptr<column>> child_columns;
+  std::transform(
+    input->children,
+    input->children + input->n_children,
+    schema->schema->children,
+    std::back_inserter(child_columns),
+    [this, input](ArrowArray const* child, ArrowSchema const* child_schema) {
+      ArrowSchemaView view;
+      NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, child_schema, nullptr));
+      auto type = arrow_to_cudf_type(&view);
+
+      auto out = get_column_copy(&view, child, type, false, stream, mr);
+      return input->offset == 0 && input->length == out->size()
+               ? std::move(out)
+               : std::make_unique<column>(
+                   cudf::detail::slice(out->view(),
+                                       static_cast<size_type>(input->offset),
+                                       static_cast<size_type>(input->offset + input->length),
+                                       stream),
+                   stream,
+                   mr);
+    });
+
+  auto out_mask = std::move(*(get_mask_buffer(input)));
+  if (input->buffers[validity_buffer_idx] != nullptr) {
+    out_mask = detail::copy_bitmask(static_cast<bitmask_type*>(out_mask.data()),
+                                    input->offset,
+                                    input->offset + input->length,
+                                    stream,
+                                    mr);
+  }
+
+  return make_structs_column(
+    input->length, std::move(child_columns), input->null_count, std::move(out_mask), stream, mr);
+}
+
+template <>
+std::unique_ptr<column> dispatch_copy_from_arrow_host::operator()<cudf::list_view>(
+  ArrowSchemaView* schema, ArrowArray const* input, data_type type, bool skip_mask)
+{
+  const void* offset_buffers[2] = {nullptr, input->buffers[fixed_width_data_buffer_idx]};
+  ArrowArray offsets_array      = {
+         .length     = input->offset + input->length + 1,
+         .null_count = 0,
+         .offset     = 0,
+         .n_buffers  = 2,
+         .n_children = 0,
+         .buffers    = offset_buffers,
+  };
+  nanoarrow::UniqueSchema offset_schema;
+  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(offset_schema.get(), NANOARROW_TYPE_INT32));
+
+  ArrowSchemaView view;
+  NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, offset_schema.get(), nullptr));
+  auto offsets_column =
+    this->operator()<int32_t>(&view, &offsets_array, data_type(type_id::INT32), true);
+
+  NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, schema->schema->children[0], nullptr));
+  auto child_type   = arrow_to_cudf_type(&view);
+  auto child_column = get_column_copy(&view, input->children[0], child_type, false, stream, mr);
+
+  auto const num_rows = offsets_column->size() - 1;
+  auto out_col        = make_lists_column(num_rows,
+                                   std::move(offsets_column),
+                                   std::move(child_column),
+                                   input->null_count,
+                                   std::move(*get_mask_buffer(input)),
+                                   stream,
+                                   mr);
+
+  return num_rows == input->length
+           ? std::move(out_col)
+           : std::make_unique<column>(
+               cudf::detail::slice(out_col->view(),
+                                   static_cast<size_type>(input->offset),
+                                   static_cast<size_type>(input->offset + input->length),
+                                   stream),
+               stream,
+               mr);
+}
+
+std::unique_ptr<column> get_column_copy(ArrowSchemaView* schema,
+                                        ArrowArray const* input,
+                                        data_type type,
+                                        bool skip_mask,
+                                        rmm::cuda_stream_view stream,
+                                        rmm::mr::device_memory_resource* mr)
+{
+  return type.id() != type_id::EMPTY
+           ? std::move(type_dispatcher(
+               type, dispatch_copy_from_arrow_host{stream, mr}, schema, input, type, skip_mask))
+           : std::make_unique<column>(data_type(type_id::EMPTY),
+                                      input->length,
+                                      rmm::device_buffer{},
+                                      rmm::device_buffer{},
+                                      input->length);
+}
+
+}  // namespace
+
+std::unique_ptr<table> from_arrow_host(ArrowSchema const* schema,
+                                       ArrowDeviceArray const* input,
+                                       rmm::cuda_stream_view stream,
+                                       rmm::mr::device_memory_resource* mr)
+{
+  CUDF_EXPECTS(schema != nullptr && input != nullptr,
+               "input ArrowSchema and ArrowDeviceArray must not be NULL",
+               std::invalid_argument);
+  CUDF_EXPECTS(input->device_type == ARROW_DEVICE_CPU,
+               "ArrowDeviceArray must have CPU device type for `from_arrow_host`",
+               std::invalid_argument);
+
+  ArrowSchemaView view;
+  NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, schema, nullptr));
+
+  std::vector<std::unique_ptr<column>> columns;
+
+  auto type = arrow_to_cudf_type(&view);
+  CUDF_EXPECTS(type == data_type(type_id::STRUCT),
+               "Must pass a struct to `from_arrow_host`",
+               cudf::data_type_error);
+
+  std::transform(input->array.children,
+                 input->array.children + input->array.n_children,
+                 view.schema->children,
+                 std::back_inserter(columns),
+                 [&stream, &mr](ArrowArray const* child, ArrowSchema const* child_schema) {
+                   ArrowSchemaView view;
+                   NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, child_schema, nullptr));
+                   auto type = arrow_to_cudf_type(&view);
+                   return get_column_copy(&view, child, type, false, stream, mr);
+                 });
+
+  return std::make_unique<table>(std::move(columns));
+}
+
+std::unique_ptr<column> from_arrow_host_column(ArrowSchema const* schema,
+                                               ArrowDeviceArray const* input,
+                                               rmm::cuda_stream_view stream,
+                                               rmm::mr::device_memory_resource* mr)
+{
+  CUDF_EXPECTS(schema != nullptr && input != nullptr,
+               "input ArrowSchema and ArrowDeviceArray must not be NULL",
+               std::invalid_argument);
+  CUDF_EXPECTS(input->device_type == ARROW_DEVICE_CPU,
+               "ArrowDeviceArray must have CPU device type for `from_arrow_host_column`",
+               std::invalid_argument);
+
+  ArrowSchemaView view;
+  NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, schema, nullptr));
+
+  auto type = arrow_to_cudf_type(&view);
+  return get_column_copy(&view, &input->array, type, false, stream, mr);
+}
+
+}  // namespace detail
+
+std::unique_ptr<table> from_arrow_host(ArrowSchema const* schema,
+                                       ArrowDeviceArray const* input,
+                                       rmm::cuda_stream_view stream,
+                                       rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+
+  return detail::from_arrow_host(schema, input, stream, mr);
+}
+
+std::unique_ptr<column> from_arrow_host_column(ArrowSchema const* schema,
+                                               ArrowDeviceArray const* input,
+                                               rmm::cuda_stream_view stream,
+                                               rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+
+  return detail::from_arrow_host_column(schema, input, stream, mr);
+}
+
+std::unique_ptr<table> from_arrow(ArrowSchema const* schema,
+                                  ArrowArray const* input,
+                                  rmm::cuda_stream_view stream,
+                                  rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+
+  ArrowDeviceArray const device_input = {
+    .array       = *input,
+    .device_id   = -1,
+    .device_type = ARROW_DEVICE_CPU,
+  };
+  return detail::from_arrow_host(schema, &device_input, stream, mr);
+}
+
+std::unique_ptr<column> from_arrow_column(ArrowSchema const* schema,
+                                          ArrowArray const* input,
+                                          rmm::cuda_stream_view stream,
+                                          rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+
+  ArrowDeviceArray const device_input = {
+    .array       = *input,
+    .device_id   = -1,
+    .device_type = ARROW_DEVICE_CPU,
+  };
+  return detail::from_arrow_host_column(schema, &device_input, stream, mr);
+}
+
+}  // namespace cudf
diff --git a/cpp/src/interop/to_arrow_device.cu b/cpp/src/interop/to_arrow_device.cu
index f2b1669df9b..ebfd6605977 100644
--- a/cpp/src/interop/to_arrow_device.cu
+++ b/cpp/src/interop/to_arrow_device.cu
@@ -15,7 +15,6 @@
  */
 
 #include "arrow_utilities.hpp"
-#include "to_arrow_utilities.hpp"
 
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_view.hpp>
diff --git a/cpp/src/interop/to_arrow_schema.cpp b/cpp/src/interop/to_arrow_schema.cpp
index 6f943593dce..19915464236 100644
--- a/cpp/src/interop/to_arrow_schema.cpp
+++ b/cpp/src/interop/to_arrow_schema.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "to_arrow_utilities.hpp"
+#include "arrow_utilities.hpp"
 
 #include <cudf/column/column_view.hpp>
 #include <cudf/detail/interop.hpp>
diff --git a/cpp/src/interop/to_arrow_utilities.cpp b/cpp/src/interop/to_arrow_utilities.cpp
deleted file mode 100644
index 04d17847273..00000000000
--- a/cpp/src/interop/to_arrow_utilities.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "to_arrow_utilities.hpp"
-
-#include <cudf/utilities/error.hpp>
-
-namespace cudf {
-namespace detail {
-
-ArrowType id_to_arrow_type(cudf::type_id id)
-{
-  switch (id) {
-    case cudf::type_id::BOOL8: return NANOARROW_TYPE_BOOL;
-    case cudf::type_id::INT8: return NANOARROW_TYPE_INT8;
-    case cudf::type_id::INT16: return NANOARROW_TYPE_INT16;
-    case cudf::type_id::INT32: return NANOARROW_TYPE_INT32;
-    case cudf::type_id::INT64: return NANOARROW_TYPE_INT64;
-    case cudf::type_id::UINT8: return NANOARROW_TYPE_UINT8;
-    case cudf::type_id::UINT16: return NANOARROW_TYPE_UINT16;
-    case cudf::type_id::UINT32: return NANOARROW_TYPE_UINT32;
-    case cudf::type_id::UINT64: return NANOARROW_TYPE_UINT64;
-    case cudf::type_id::FLOAT32: return NANOARROW_TYPE_FLOAT;
-    case cudf::type_id::FLOAT64: return NANOARROW_TYPE_DOUBLE;
-    case cudf::type_id::TIMESTAMP_DAYS: return NANOARROW_TYPE_DATE32;
-    default: CUDF_FAIL("Unsupported type_id conversion to arrow type", cudf::data_type_error);
-  }
-}
-
-}  // namespace detail
-}  // namespace cudf
diff --git a/cpp/src/interop/to_arrow_utilities.hpp b/cpp/src/interop/to_arrow_utilities.hpp
deleted file mode 100644
index 3c01c726a7b..00000000000
--- a/cpp/src/interop/to_arrow_utilities.hpp
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-#include <cudf/types.hpp>
-
-#include <nanoarrow/nanoarrow_types.h>
-
-namespace cudf {
-namespace detail {
-
-/**
- * @brief Map cudf column type id to ArrowType id
- *
- * @param id Column type id
- * @return ArrowType id
- */
-ArrowType id_to_arrow_type(cudf::type_id id);
-
-}  // namespace detail
-}  // namespace cudf
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 42b7f089d61..c6ab8aa021a 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -269,6 +269,7 @@ ConfigureTest(
   interop/to_arrow_test.cpp
   interop/from_arrow_test.cpp
   interop/from_arrow_device_test.cpp
+  interop/from_arrow_host_test.cpp
   interop/dlpack_test.cpp
   EXTRA_LIB
   nanoarrow
diff --git a/cpp/tests/interop/from_arrow_device_test.cpp b/cpp/tests/interop/from_arrow_device_test.cpp
index 66bd4dd1bfb..d776ca57ef6 100644
--- a/cpp/tests/interop/from_arrow_device_test.cpp
+++ b/cpp/tests/interop/from_arrow_device_test.cpp
@@ -49,23 +49,23 @@ TYPED_TEST_SUITE(FromArrowDeviceTestDurationsTest, cudf::test::DurationTypes);
 TEST_F(FromArrowDeviceTest, FailConditions)
 {
   // can't pass null for schema or device array
-  EXPECT_THROW(cudf::from_arrow_device(nullptr, nullptr), cudf::logic_error);
+  EXPECT_THROW(cudf::from_arrow_device(nullptr, nullptr), std::invalid_argument);
   // can't pass null for device array
   ArrowSchema schema;
-  EXPECT_THROW(cudf::from_arrow_device(&schema, nullptr), cudf::logic_error);
+  EXPECT_THROW(cudf::from_arrow_device(&schema, nullptr), std::invalid_argument);
   // device_type must be CUDA/CUDA_HOST/CUDA_MANAGED
   // should fail with ARROW_DEVICE_CPU
   ArrowDeviceArray arr;
   arr.device_type = ARROW_DEVICE_CPU;
-  EXPECT_THROW(cudf::from_arrow_device(&schema, &arr), cudf::logic_error);
+  EXPECT_THROW(cudf::from_arrow_device(&schema, &arr), std::invalid_argument);
 
   // can't pass null for schema or device array
-  EXPECT_THROW(cudf::from_arrow_device_column(nullptr, nullptr), cudf::logic_error);
+  EXPECT_THROW(cudf::from_arrow_device_column(nullptr, nullptr), std::invalid_argument);
   // can't pass null for device array
-  EXPECT_THROW(cudf::from_arrow_device_column(&schema, nullptr), cudf::logic_error);
+  EXPECT_THROW(cudf::from_arrow_device_column(&schema, nullptr), std::invalid_argument);
   // device_type must be CUDA/CUDA_HOST/CUDA_MANAGED
   // should fail with ARROW_DEVICE_CPU
-  EXPECT_THROW(cudf::from_arrow_device_column(&schema, &arr), cudf::logic_error);
+  EXPECT_THROW(cudf::from_arrow_device_column(&schema, &arr), std::invalid_argument);
 }
 
 TEST_F(FromArrowDeviceTest, EmptyTable)
diff --git a/cpp/tests/interop/from_arrow_host_test.cpp b/cpp/tests/interop/from_arrow_host_test.cpp
new file mode 100644
index 00000000000..e6e52099a0c
--- /dev/null
+++ b/cpp/tests/interop/from_arrow_host_test.cpp
@@ -0,0 +1,612 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "nanoarrow_utils.hpp"
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_utilities.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/table_utilities.hpp>
+#include <cudf_test/testing_main.hpp>
+#include <cudf_test/type_lists.hpp>
+
+#include <cudf/column/column.hpp>
+#include <cudf/column/column_view.hpp>
+#include <cudf/copying.hpp>
+#include <cudf/dictionary/dictionary_column_view.hpp>
+#include <cudf/dictionary/dictionary_factories.hpp>
+#include <cudf/dictionary/encode.hpp>
+#include <cudf/interop.hpp>
+#include <cudf/table/table.hpp>
+#include <cudf/table/table_view.hpp>
+#include <cudf/types.hpp>
+
+#include <thrust/iterator/counting_iterator.h>
+
+// create a cudf::table and equivalent arrow table with host memory
+std::tuple<std::unique_ptr<cudf::table>, nanoarrow::UniqueSchema, nanoarrow::UniqueArray>
+get_nanoarrow_host_tables(cudf::size_type length)
+{
+  auto [table, schema, test_data] = get_nanoarrow_cudf_table(length);
+
+  auto int64_array = get_nanoarrow_array<int64_t>(test_data.int64_data, test_data.validity);
+  auto string_array =
+    get_nanoarrow_array<cudf::string_view>(test_data.string_data, test_data.validity);
+  cudf::dictionary_column_view view(table->get_column(2).view());
+  auto keys       = cudf::test::to_host<int64_t>(view.keys()).first;
+  auto indices    = cudf::test::to_host<uint32_t>(view.indices()).first;
+  auto dict_array = get_nanoarrow_dict_array(std::vector<int64_t>(keys.begin(), keys.end()),
+                                             std::vector<int32_t>(indices.begin(), indices.end()),
+                                             test_data.validity);
+  auto boolarray  = get_nanoarrow_array<bool>(test_data.bool_data, test_data.bool_validity);
+  auto list_array = get_nanoarrow_list_array<int64_t>(test_data.list_int64_data,
+                                                      test_data.list_offsets,
+                                                      test_data.list_int64_data_validity,
+                                                      test_data.bool_data_validity);
+
+  nanoarrow::UniqueArray arrow;
+  NANOARROW_THROW_NOT_OK(ArrowArrayInitFromSchema(arrow.get(), schema.get(), nullptr));
+  arrow->length = length;
+
+  int64_array.move(arrow->children[0]);
+  string_array.move(arrow->children[1]);
+  dict_array.move(arrow->children[2]);
+  boolarray.move(arrow->children[3]);
+  list_array.move(arrow->children[4]);
+
+  int64_array  = get_nanoarrow_array<int64_t>(test_data.int64_data, test_data.validity);
+  string_array = get_nanoarrow_array<cudf::string_view>(test_data.string_data, test_data.validity);
+  int64_array.move(arrow->children[5]->children[0]);
+  string_array.move(arrow->children[5]->children[1]);
+
+  ArrowBitmap struct_validity;
+  ArrowBitmapInit(&struct_validity);
+  NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(&struct_validity, length));
+  ArrowBitmapAppendInt8Unsafe(
+    &struct_validity, reinterpret_cast<const int8_t*>(test_data.bool_data_validity.data()), length);
+  arrow->children[5]->length = length;
+  ArrowArraySetValidityBitmap(arrow->children[5], &struct_validity);
+  arrow->children[5]->null_count =
+    length - ArrowBitCountSet(ArrowArrayValidityBitmap(arrow->children[5])->buffer.data, 0, length);
+
+  ArrowError error;
+  if (ArrowArrayFinishBuilding(arrow.get(), NANOARROW_VALIDATION_LEVEL_MINIMAL, &error) !=
+      NANOARROW_OK) {
+    std::cerr << ArrowErrorMessage(&error) << std::endl;
+    CUDF_FAIL("failed to build example arrays");
+  }
+
+  return std::make_tuple(std::move(table), std::move(schema), std::move(arrow));
+}
+
+struct FromArrowHostDeviceTest : public cudf::test::BaseFixture {};
+
+template <typename T>
+struct FromArrowHostDeviceTestDurationsTest : public cudf::test::BaseFixture {};
+
+TYPED_TEST_SUITE(FromArrowHostDeviceTestDurationsTest, cudf::test::DurationTypes);
+
+TEST_F(FromArrowHostDeviceTest, EmptyTable)
+{
+  auto [tbl, schema, arr] = get_nanoarrow_host_tables(0);
+
+  auto expected_cudf_table = tbl->view();
+  ArrowDeviceArray input;
+  memcpy(&input.array, arr.get(), sizeof(ArrowArray));
+  input.device_id   = -1;
+  input.device_type = ARROW_DEVICE_CPU;
+
+  auto got_cudf_table = cudf::from_arrow_host(schema.get(), &input);
+  CUDF_TEST_EXPECT_TABLES_EQUAL(expected_cudf_table, got_cudf_table->view());
+}
+
+TEST_F(FromArrowHostDeviceTest, DateTimeTable)
+{
+  auto data = std::vector<int64_t>{1, 2, 3, 4, 5, 6};
+  auto col  = cudf::test::fixed_width_column_wrapper<cudf::timestamp_ms, cudf::timestamp_ms::rep>(
+    data.begin(), data.end());
+  cudf::table_view expected_table_view({col});
+
+  // construct equivalent arrow schema with nanoarrow
+  nanoarrow::UniqueSchema input_schema;
+  ArrowSchemaInit(input_schema.get());
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema.get(), 1));
+  ArrowSchemaInit(input_schema->children[0]);
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDateTime(
+    input_schema->children[0], NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_MILLI, nullptr));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0], "a"));
+
+  // equivalent arrow record batch
+  nanoarrow::UniqueArray input_array;
+  NANOARROW_THROW_NOT_OK(ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr));
+  input_array->length     = 6;
+  input_array->null_count = 0;
+
+  auto arr = get_nanoarrow_array<int64_t>(data);
+  arr.move(input_array->children[0]);
+  NANOARROW_THROW_NOT_OK(
+    ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_MINIMAL, nullptr));
+
+  ArrowDeviceArray input;
+  memcpy(&input.array, input_array.get(), sizeof(ArrowArray));
+  input.device_id   = -1;
+  input.device_type = ARROW_DEVICE_CPU;
+
+  // test that we get the same cudf table as we expect by converting the
+  // host arrow memory to a cudf table
+  auto got_cudf_table = cudf::from_arrow_host(input_schema.get(), &input);
+  CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table_view, got_cudf_table->view());
+
+  // test that we get a cudf table with a single struct column that is equivalent
+  // if we use from_arrow_host_column
+  auto got_cudf_col = cudf::from_arrow_host_column(input_schema.get(), &input);
+  EXPECT_EQ(got_cudf_col->type(), cudf::data_type{cudf::type_id::STRUCT});
+  auto got_cudf_col_view = got_cudf_col->view();
+  cudf::table_view from_struct{
+    std::vector<cudf::column_view>(got_cudf_col_view.child_begin(), got_cudf_col_view.child_end())};
+  CUDF_TEST_EXPECT_TABLES_EQUAL(got_cudf_table->view(), from_struct);
+}
+
+TYPED_TEST(FromArrowHostDeviceTestDurationsTest, DurationTable)
+{
+  using T = TypeParam;
+  if (cudf::type_to_id<TypeParam>() == cudf::type_id::DURATION_DAYS) { return; }
+
+  auto data = {T{1}, T{2}, T{3}, T{4}, T{5}, T{6}};
+  auto col  = cudf::test::fixed_width_column_wrapper<T>(data);
+
+  cudf::table_view expected_table_view({col});
+  const ArrowTimeUnit time_unit = [&] {
+    switch (cudf::type_to_id<TypeParam>()) {
+      case cudf::type_id::DURATION_SECONDS: return NANOARROW_TIME_UNIT_SECOND;
+      case cudf::type_id::DURATION_MILLISECONDS: return NANOARROW_TIME_UNIT_MILLI;
+      case cudf::type_id::DURATION_MICROSECONDS: return NANOARROW_TIME_UNIT_MICRO;
+      case cudf::type_id::DURATION_NANOSECONDS: return NANOARROW_TIME_UNIT_NANO;
+      default: CUDF_FAIL("Unsupported duration unit in arrow");
+    }
+  }();
+
+  nanoarrow::UniqueSchema input_schema;
+  ArrowSchemaInit(input_schema.get());
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema.get(), 1));
+
+  ArrowSchemaInit(input_schema->children[0]);
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDateTime(
+    input_schema->children[0], NANOARROW_TYPE_DURATION, time_unit, nullptr));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0], "a"));
+
+  nanoarrow::UniqueArray input_array;
+  NANOARROW_THROW_NOT_OK(ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr));
+  input_array->length     = expected_table_view.num_rows();
+  input_array->null_count = 0;
+
+  auto arr = get_nanoarrow_array<T>(data);
+  arr.move(input_array->children[0]);
+  NANOARROW_THROW_NOT_OK(
+    ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_MINIMAL, nullptr));
+
+  ArrowDeviceArray input;
+  memcpy(&input.array, input_array.get(), sizeof(ArrowArray));
+  input.device_id   = -1;
+  input.device_type = ARROW_DEVICE_CPU;
+
+  // converting arrow host memory to cudf table gives us the expected table
+  auto got_cudf_table = cudf::from_arrow_host(input_schema.get(), &input);
+  CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table_view, got_cudf_table->view());
+
+  // converting to a cudf table with a single struct column gives us the expected
+  // result column
+  auto got_cudf_col = cudf::from_arrow_host_column(input_schema.get(), &input);
+  EXPECT_EQ(got_cudf_col->type(), cudf::data_type{cudf::type_id::STRUCT});
+  auto got_cudf_col_view = got_cudf_col->view();
+  cudf::table_view from_struct{
+    std::vector<cudf::column_view>(got_cudf_col_view.child_begin(), got_cudf_col_view.child_end())};
+  CUDF_TEST_EXPECT_TABLES_EQUAL(got_cudf_table->view(), from_struct);
+}
+
+TEST_F(FromArrowHostDeviceTest, NestedList)
+{
+  auto valids =
+    cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 3 != 0; });
+  auto col = cudf::test::lists_column_wrapper<int64_t>(
+    {{{{{1, 2}, valids}, {{3, 4}, valids}, {5}}, {{6}, {{7, 8, 9}, valids}}}, valids});
+  cudf::table_view expected_table_view({col});
+
+  nanoarrow::UniqueSchema input_schema;
+  ArrowSchemaInit(input_schema.get());
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema.get(), 1));
+
+  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(input_schema->children[0], NANOARROW_TYPE_LIST));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0], "a"));
+  input_schema->children[0]->flags = ARROW_FLAG_NULLABLE;
+
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaInitFromType(input_schema->children[0]->children[0], NANOARROW_TYPE_LIST));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0]->children[0], "element"));
+  input_schema->children[0]->children[0]->flags = 0;
+
+  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(
+    input_schema->children[0]->children[0]->children[0], NANOARROW_TYPE_INT64));
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaSetName(input_schema->children[0]->children[0]->children[0], "element"));
+  input_schema->children[0]->children[0]->children[0]->flags = ARROW_FLAG_NULLABLE;
+
+  // create the base arrow list array
+  auto list_arr = get_nanoarrow_list_array<int64_t>({6, 7, 8, 9}, {0, 1, 4}, {1, 0, 1, 1});
+  std::vector<int32_t> offset{0, 0, 2};
+
+  // populate the bitmask we're going to use for the top level list
+  ArrowBitmap mask;
+  ArrowBitmapInit(&mask);
+  NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(&mask, 2));
+  NANOARROW_THROW_NOT_OK(ArrowBitmapAppend(&mask, 0, 1));
+  NANOARROW_THROW_NOT_OK(ArrowBitmapAppend(&mask, 1, 1));
+
+  nanoarrow::UniqueArray input_array;
+  EXPECT_EQ(NANOARROW_OK, ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr));
+  input_array->length     = expected_table_view.num_rows();
+  input_array->null_count = 0;
+
+  ArrowArraySetValidityBitmap(input_array->children[0], &mask);
+  input_array->children[0]->length     = expected_table_view.num_rows();
+  input_array->children[0]->null_count = 1;
+  auto offset_buf                      = ArrowArrayBuffer(input_array->children[0], 1);
+  EXPECT_EQ(
+    NANOARROW_OK,
+    ArrowBufferAppend(
+      offset_buf, reinterpret_cast<const void*>(offset.data()), offset.size() * sizeof(int32_t)));
+
+  // move our base list to be the child of the one we just created
+  // so that we now have an equivalent value to what we created for cudf
+  list_arr.move(input_array->children[0]->children[0]);
+  NANOARROW_THROW_NOT_OK(
+    ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr));
+
+  ArrowDeviceArray input;
+  memcpy(&input.array, input_array.get(), sizeof(ArrowArray));
+  input.device_id   = -1;
+  input.device_type = ARROW_DEVICE_CPU;
+
+  // converting from arrow host memory to cudf gives us the expected table
+  auto got_cudf_table = cudf::from_arrow_host(input_schema.get(), &input);
+  CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table_view, got_cudf_table->view());
+
+  // converting to a single column cudf table gives us the expected struct column
+  auto got_cudf_col = cudf::from_arrow_host_column(input_schema.get(), &input);
+  EXPECT_EQ(got_cudf_col->type(), cudf::data_type{cudf::type_id::STRUCT});
+  auto got_cudf_col_view = got_cudf_col->view();
+  cudf::table_view from_struct{
+    std::vector<cudf::column_view>(got_cudf_col_view.child_begin(), got_cudf_col_view.child_end())};
+  CUDF_TEST_EXPECT_TABLES_EQUAL(got_cudf_table->view(), from_struct);
+}
+
+TEST_F(FromArrowHostDeviceTest, StructColumn)
+{
+  // Create cudf table
+  auto nested_type_field_names =
+    std::vector<std::vector<std::string>>{{"string", "integral", "bool", "nested_list", "struct"}};
+  auto str_col =
+    cudf::test::strings_column_wrapper{
+      "Samuel Vimes", "Carrot Ironfoundersson", "Angua von Überwald"}
+      .release();
+  auto str_col2 =
+    cudf::test::strings_column_wrapper{{"CUDF", "ROCKS", "EVERYWHERE"}, {0, 1, 0}}.release();
+  int num_rows{str_col->size()};
+  auto int_col = cudf::test::fixed_width_column_wrapper<int32_t, int32_t>{{48, 27, 25}}.release();
+  auto int_col2 =
+    cudf::test::fixed_width_column_wrapper<int32_t, int32_t>{{12, 24, 47}, {1, 0, 1}}.release();
+  auto bool_col = cudf::test::fixed_width_column_wrapper<bool>{{true, true, false}}.release();
+  auto list_col =
+    cudf::test::lists_column_wrapper<int64_t>({{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}})
+      .release();
+  vector_of_columns cols2;
+  cols2.push_back(std::move(str_col2));
+  cols2.push_back(std::move(int_col2));
+  auto [null_mask, null_count] =
+    cudf::bools_to_mask(cudf::test::fixed_width_column_wrapper<bool>{{true, true, false}});
+  auto sub_struct_col =
+    cudf::make_structs_column(num_rows, std::move(cols2), null_count, std::move(*null_mask));
+  vector_of_columns cols;
+  cols.push_back(std::move(str_col));
+  cols.push_back(std::move(int_col));
+  cols.push_back(std::move(bool_col));
+  cols.push_back(std::move(list_col));
+  cols.push_back(std::move(sub_struct_col));
+
+  auto struct_col = cudf::make_structs_column(num_rows, std::move(cols), 0, {});
+  cudf::table_view expected_table_view({struct_col->view()});
+
+  // Create name metadata
+  auto sub_metadata          = cudf::column_metadata{"struct"};
+  sub_metadata.children_meta = {{"string2"}, {"integral2"}};
+  auto metadata              = cudf::column_metadata{"a"};
+  metadata.children_meta     = {{"string"}, {"integral"}, {"bool"}, {"nested_list"}, sub_metadata};
+
+  // create the equivalent arrow schema using nanoarrow
+  nanoarrow::UniqueSchema input_schema;
+  ArrowSchemaInit(input_schema.get());
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema.get(), 1));
+
+  ArrowSchemaInit(input_schema->children[0]);
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema->children[0], 5));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0], "a"));
+  input_schema->children[0]->flags = 0;
+
+  auto child = input_schema->children[0];
+  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(child->children[0], NANOARROW_TYPE_STRING));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[0], "string"));
+  child->children[0]->flags = 0;
+
+  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(child->children[1], NANOARROW_TYPE_INT32));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[1], "integral"));
+  child->children[1]->flags = 0;
+
+  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(child->children[2], NANOARROW_TYPE_BOOL));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[2], "bool"));
+  child->children[2]->flags = 0;
+
+  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(child->children[3], NANOARROW_TYPE_LIST));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[3], "nested_list"));
+  child->children[3]->flags = 0;
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaInitFromType(child->children[3]->children[0], NANOARROW_TYPE_LIST));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[3]->children[0], "element"));
+  child->children[3]->children[0]->flags = 0;
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaInitFromType(child->children[3]->children[0]->children[0], NANOARROW_TYPE_INT64));
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaSetName(child->children[3]->children[0]->children[0], "element"));
+  child->children[3]->children[0]->children[0]->flags = 0;
+
+  ArrowSchemaInit(child->children[4]);
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(child->children[4], 2));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[4], "struct"));
+
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaInitFromType(child->children[4]->children[0], NANOARROW_TYPE_STRING));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[4]->children[0], "string2"));
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaInitFromType(child->children[4]->children[1], NANOARROW_TYPE_INT32));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[4]->children[1], "integral2"));
+
+  // create nanoarrow table
+  // first our underlying arrays
+  std::vector<std::string> str{"Samuel Vimes", "Carrot Ironfoundersson", "Angua von Überwald"};
+  std::vector<std::string> str2{"CUDF", "ROCKS", "EVERYWHERE"};
+  auto str_array  = get_nanoarrow_array<cudf::string_view>(str);
+  auto int_array  = get_nanoarrow_array<int32_t>({48, 27, 25});
+  auto str2_array = get_nanoarrow_array<cudf::string_view>(str2, {0, 1, 0});
+  auto int2_array = get_nanoarrow_array<int32_t, uint8_t>({12, 24, 47}, {1, 0, 1});
+  auto bool_array = get_nanoarrow_array<bool>({true, true, false});
+  auto list_arr =
+    get_nanoarrow_list_array<int64_t>({1, 2, 3, 4, 5, 6, 7, 8, 9}, {0, 2, 4, 5, 6, 7, 9});
+  std::vector<int32_t> offset{0, 3, 4, 6};
+
+  // create the struct array
+  nanoarrow::UniqueArray input_array;
+  NANOARROW_THROW_NOT_OK(ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr));
+
+  input_array->length = expected_table_view.num_rows();
+
+  auto array_a        = input_array->children[0];
+  auto view_a         = expected_table_view.column(0);
+  array_a->length     = view_a.size();
+  array_a->null_count = view_a.null_count();
+  // populate the children of our struct by moving them from the original arrays
+  str_array.move(array_a->children[0]);
+  int_array.move(array_a->children[1]);
+  bool_array.move(array_a->children[2]);
+
+  array_a->children[3]->length     = expected_table_view.num_rows();
+  array_a->children[3]->null_count = 0;
+  auto offset_buf                  = ArrowArrayBuffer(array_a->children[3], 1);
+  EXPECT_EQ(
+    NANOARROW_OK,
+    ArrowBufferAppend(
+      offset_buf, reinterpret_cast<const void*>(offset.data()), offset.size() * sizeof(int32_t)));
+
+  list_arr.move(array_a->children[3]->children[0]);
+
+  // set our struct bitmap validity mask
+  ArrowBitmap mask;
+  ArrowBitmapInit(&mask);
+  NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(&mask, 3));
+  NANOARROW_THROW_NOT_OK(ArrowBitmapAppend(&mask, 1, 2));
+  NANOARROW_THROW_NOT_OK(ArrowBitmapAppend(&mask, 0, 1));
+
+  auto array_struct = array_a->children[4];
+  auto view_struct  = view_a.child(4);
+  ArrowArraySetValidityBitmap(array_struct, &mask);
+  array_struct->null_count = view_struct.null_count();
+  array_struct->length     = view_struct.size();
+
+  str2_array.move(array_struct->children[0]);
+  int2_array.move(array_struct->children[1]);
+
+  NANOARROW_THROW_NOT_OK(
+    ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr));
+
+  ArrowDeviceArray input;
+  memcpy(&input.array, input_array.get(), sizeof(ArrowArray));
+  input.device_id   = -1;
+  input.device_type = ARROW_DEVICE_CPU;
+
+  // test we get the expected cudf::table from the arrow host memory data
+  auto got_cudf_table = cudf::from_arrow_host(input_schema.get(), &input);
+  CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table_view, got_cudf_table->view());
+
+  // test we get the expected cudf struct column
+  auto got_cudf_col = cudf::from_arrow_host_column(input_schema.get(), &input);
+  EXPECT_EQ(got_cudf_col->type(), cudf::data_type{cudf::type_id::STRUCT});
+  auto got_cudf_col_view = got_cudf_col->view();
+  cudf::table_view from_struct{
+    std::vector<cudf::column_view>(got_cudf_col_view.child_begin(), got_cudf_col_view.child_end())};
+  CUDF_TEST_EXPECT_TABLES_EQUAL(got_cudf_table->view(), from_struct);
+}
+
+TEST_F(FromArrowHostDeviceTest, DictionaryIndicesType)
+{
+  // test dictionary arrays with different index types
+  // cudf asserts that the index type must be unsigned
+  auto array1 =
+    get_nanoarrow_dict_array<int64_t, uint8_t>({1, 2, 5, 7}, {0, 1, 2, 1, 3}, {1, 0, 1, 1, 1});
+  auto array2 =
+    get_nanoarrow_dict_array<int64_t, uint16_t>({1, 2, 5, 7}, {0, 1, 2, 1, 3}, {1, 0, 1, 1, 1});
+  auto array3 =
+    get_nanoarrow_dict_array<int64_t, uint64_t>({1, 2, 5, 7}, {0, 1, 2, 1, 3}, {1, 0, 1, 1, 1});
+
+  // create equivalent cudf dictionary columns
+  auto keys_col = cudf::test::fixed_width_column_wrapper<int64_t>({1, 2, 5, 7});
+  auto ind1_col = cudf::test::fixed_width_column_wrapper<uint8_t>({0, 1, 2, 1, 3}, {1, 0, 1, 1, 1});
+  auto ind2_col =
+    cudf::test::fixed_width_column_wrapper<uint16_t>({0, 1, 2, 1, 3}, {1, 0, 1, 1, 1});
+  auto ind3_col =
+    cudf::test::fixed_width_column_wrapper<uint64_t>({0, 1, 2, 1, 3}, {1, 0, 1, 1, 1});
+
+  vector_of_columns columns;
+  columns.emplace_back(cudf::make_dictionary_column(keys_col, ind1_col));
+  columns.emplace_back(cudf::make_dictionary_column(keys_col, ind2_col));
+  columns.emplace_back(cudf::make_dictionary_column(keys_col, ind3_col));
+
+  cudf::table expected_table(std::move(columns));
+
+  nanoarrow::UniqueSchema input_schema;
+  ArrowSchemaInit(input_schema.get());
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(input_schema.get(), 3));
+
+  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(input_schema->children[0], NANOARROW_TYPE_UINT8));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[0], "a"));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaAllocateDictionary(input_schema->children[0]));
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaInitFromType(input_schema->children[0]->dictionary, NANOARROW_TYPE_INT64));
+
+  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(input_schema->children[1], NANOARROW_TYPE_UINT16));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[1], "b"));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaAllocateDictionary(input_schema->children[1]));
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaInitFromType(input_schema->children[1]->dictionary, NANOARROW_TYPE_INT64));
+
+  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(input_schema->children[2], NANOARROW_TYPE_UINT64));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(input_schema->children[2], "c"));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaAllocateDictionary(input_schema->children[2]));
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaInitFromType(input_schema->children[2]->dictionary, NANOARROW_TYPE_INT64));
+
+  nanoarrow::UniqueArray input_array;
+  NANOARROW_THROW_NOT_OK(ArrowArrayInitFromSchema(input_array.get(), input_schema.get(), nullptr));
+  input_array->length     = expected_table.num_rows();
+  input_array->null_count = 0;
+
+  array1.move(input_array->children[0]);
+  array2.move(input_array->children[1]);
+  array3.move(input_array->children[2]);
+
+  NANOARROW_THROW_NOT_OK(
+    ArrowArrayFinishBuilding(input_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr));
+
+  ArrowDeviceArray input;
+  memcpy(&input.array, input_array.get(), sizeof(ArrowArray));
+  input.device_id   = -1;
+  input.device_type = ARROW_DEVICE_CPU;
+
+  // test we get the expected cudf table when we convert from Arrow host memory
+  auto got_cudf_table = cudf::from_arrow_host(input_schema.get(), &input);
+  CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table.view(), got_cudf_table->view());
+
+  // test we get the expected cudf::column as a struct column
+  auto got_cudf_col = cudf::from_arrow_host_column(input_schema.get(), &input);
+  EXPECT_EQ(got_cudf_col->type(), cudf::data_type{cudf::type_id::STRUCT});
+  auto got_cudf_col_view = got_cudf_col->view();
+  cudf::table_view from_struct{
+    std::vector<cudf::column_view>(got_cudf_col_view.child_begin(), got_cudf_col_view.child_end())};
+  CUDF_TEST_EXPECT_TABLES_EQUAL(got_cudf_table->view(), from_struct);
+}
+
+void slice_host_nanoarrow(ArrowArray* arr, int64_t start, int64_t end)
+{
+  auto op = [&](ArrowArray* array) {
+    // slicing only needs to happen at the top level of an array
+    array->offset = start;
+    array->length = end - start;
+    if (array->null_count != 0) {
+      array->null_count =
+        array->length -
+        ArrowBitCountSet(ArrowArrayValidityBitmap(array)->buffer.data, start, end - start);
+    }
+  };
+
+  if (arr->n_children == 0) {
+    op(arr);
+    return;
+  }
+
+  // since we want to simulate a sliced table where the children are sliced,
+  // we slice each individual child of the record batch
+  arr->length = end - start;
+  for (int64_t i = 0; i < arr->n_children; ++i) {
+    op(arr->children[i]);
+  }
+}
+
+struct FromArrowHostDeviceTestSlice
+  : public FromArrowHostDeviceTest,
+    public ::testing::WithParamInterface<std::tuple<cudf::size_type, cudf::size_type>> {};
+
+TEST_P(FromArrowHostDeviceTestSlice, SliceTest)
+{
+  auto [table, schema, array] = get_nanoarrow_host_tables(10000);
+  auto cudf_table_view        = table->view();
+  auto const [start, end]     = GetParam();
+
+  auto sliced_cudf_table   = cudf::slice(cudf_table_view, {start, end})[0];
+  auto expected_cudf_table = cudf::table{sliced_cudf_table};
+  slice_host_nanoarrow(array.get(), start, end);
+
+  ArrowDeviceArray input;
+  memcpy(&input.array, array.get(), sizeof(ArrowArray));
+  input.device_id   = -1;
+  input.device_type = ARROW_DEVICE_CPU;
+
+  auto got_cudf_table = cudf::from_arrow_host(schema.get(), &input);
+  if (got_cudf_table->num_rows() == 0 and sliced_cudf_table.num_rows() == 0) {
+    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected_cudf_table.view(), got_cudf_table->view());
+
+    auto got_cudf_col = cudf::from_arrow_host_column(schema.get(), &input);
+    EXPECT_EQ(got_cudf_col->type(), cudf::data_type{cudf::type_id::STRUCT});
+    auto got_cudf_col_view = got_cudf_col->view();
+    cudf::table_view from_struct{std::vector<cudf::column_view>(got_cudf_col_view.child_begin(),
+                                                                got_cudf_col_view.child_end())};
+    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(got_cudf_table->view(), from_struct);
+  } else {
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected_cudf_table.view(), got_cudf_table->view());
+
+    auto got_cudf_col = cudf::from_arrow_host_column(schema.get(), &input);
+    EXPECT_EQ(got_cudf_col->type(), cudf::data_type{cudf::type_id::STRUCT});
+    auto got_cudf_col_view = got_cudf_col->view();
+    cudf::table_view from_struct{std::vector<cudf::column_view>(got_cudf_col_view.child_begin(),
+                                                                got_cudf_col_view.child_end())};
+    CUDF_TEST_EXPECT_TABLES_EQUAL(got_cudf_table->view(), from_struct);
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(FromArrowHostDeviceTest,
+                        FromArrowHostDeviceTestSlice,
+                        ::testing::Values(std::make_tuple(0, 10000),
+                                          std::make_tuple(2912, 2915),
+                                          std::make_tuple(100, 3000),
+                                          std::make_tuple(0, 0),
+                                          std::make_tuple(0, 3000),
+                                          std::make_tuple(10000, 10000)));
diff --git a/cpp/tests/interop/nanoarrow_utils.hpp b/cpp/tests/interop/nanoarrow_utils.hpp
index fb5d1060f6f..a79e6fdc49c 100644
--- a/cpp/tests/interop/nanoarrow_utils.hpp
+++ b/cpp/tests/interop/nanoarrow_utils.hpp
@@ -20,14 +20,61 @@
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/interop/detail/arrow.hpp>
 #include <cudf/lists/lists_column_view.hpp>
+#include <cudf/null_mask.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/transform.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/traits.hpp>
+#include <cudf/wrappers/durations.hpp>
 
 #include <nanoarrow/nanoarrow.hpp>
 
+struct generated_test_data {
+  generated_test_data(cudf::size_type length)
+    : int64_data(length),
+      bool_data(length),
+      string_data(length),
+      validity(length),
+      bool_validity(length),
+      list_int64_data(3 * length),
+      list_int64_data_validity(3 * length),
+      list_offsets(length + 1)
+  {
+    cudf::size_type length_of_individual_list = 3;
+
+    std::generate(int64_data.begin(), int64_data.end(), []() { return rand() % 500000; });
+    std::generate(list_int64_data.begin(), list_int64_data.end(), []() { return rand() % 500000; });
+    auto validity_generator = []() { return rand() % 7 != 0; };
+    std::generate(
+      list_int64_data_validity.begin(), list_int64_data_validity.end(), validity_generator);
+    std::generate(
+      list_offsets.begin(), list_offsets.end(), [length_of_individual_list, n = 0]() mutable {
+        return (n++) * length_of_individual_list;
+      });
+    std::generate(bool_data.begin(), bool_data.end(), validity_generator);
+    std::generate(
+      string_data.begin(), string_data.end(), []() { return rand() % 7 != 0 ? "CUDF" : "Rocks"; });
+    std::generate(validity.begin(), validity.end(), validity_generator);
+    std::generate(bool_validity.begin(), bool_validity.end(), validity_generator);
+
+    std::transform(bool_validity.cbegin(),
+                   bool_validity.cend(),
+                   std::back_inserter(bool_data_validity),
+                   [](auto val) { return static_cast<uint8_t>(val); });
+  }
+
+  std::vector<int64_t> int64_data;
+  std::vector<bool> bool_data;
+  std::vector<std::string> string_data;
+  std::vector<uint8_t> validity;
+  std::vector<bool> bool_validity;
+  std::vector<uint8_t> bool_data_validity;
+  std::vector<int64_t> list_int64_data;
+  std::vector<uint8_t> list_int64_data_validity;
+  std::vector<int32_t> list_offsets;
+};
+
 // no-op allocator/deallocator to set into ArrowArray buffers that we don't
 // want to own their buffers.
 static ArrowBufferAllocator noop_alloc = (struct ArrowBufferAllocator){
@@ -135,7 +182,196 @@ void populate_dict_from_col(ArrowArray* arr, cudf::dictionary_column_view dview)
   populate_from_col<KEY_TYPE>(arr->dictionary, dview.keys());
 }
 
+using vector_of_columns = std::vector<std::unique_ptr<cudf::column>>;
+
 std::tuple<std::unique_ptr<cudf::table>, nanoarrow::UniqueSchema, nanoarrow::UniqueArray>
 get_nanoarrow_tables(cudf::size_type length = 10000);
 
 void populate_list_from_col(ArrowArray* arr, cudf::lists_column_view view);
+
+std::unique_ptr<cudf::table> get_cudf_table();
+
+template <typename T>
+struct nanoarrow_storage_type {};
+
+#define DEFINE_NANOARROW_STORAGE(T, NanoType)                    \
+  template <>                                                    \
+  struct nanoarrow_storage_type<T> {                             \
+    static constexpr ArrowType type = NANOARROW_TYPE_##NanoType; \
+  }
+
+DEFINE_NANOARROW_STORAGE(bool, BOOL);
+DEFINE_NANOARROW_STORAGE(int64_t, INT64);
+DEFINE_NANOARROW_STORAGE(uint16_t, UINT16);
+DEFINE_NANOARROW_STORAGE(uint64_t, UINT64);
+DEFINE_NANOARROW_STORAGE(cudf::duration_D, INT32);
+DEFINE_NANOARROW_STORAGE(cudf::duration_s, INT64);
+DEFINE_NANOARROW_STORAGE(cudf::duration_ms, INT64);
+DEFINE_NANOARROW_STORAGE(cudf::duration_us, INT64);
+DEFINE_NANOARROW_STORAGE(cudf::duration_ns, INT64);
+DEFINE_NANOARROW_STORAGE(uint8_t, UINT8);
+DEFINE_NANOARROW_STORAGE(int32_t, INT32);
+
+#undef DEFINE_NANOARROW_STORAGE
+
+template <typename T>
+std::enable_if_t<cudf::is_fixed_width<T>() and !std::is_same_v<T, bool>, nanoarrow::UniqueArray>
+get_nanoarrow_array(std::vector<T> const& data, std::vector<uint8_t> const& mask = {})
+{
+  nanoarrow::UniqueArray tmp;
+  NANOARROW_THROW_NOT_OK(ArrowArrayInitFromType(tmp.get(), nanoarrow_storage_type<T>::type));
+
+  if (!mask.empty()) {
+    ArrowBitmap bitmap;
+    ArrowBitmapInit(&bitmap);
+    NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(&bitmap, mask.size()));
+    ArrowBitmapAppendInt8Unsafe(&bitmap, reinterpret_cast<const int8_t*>(mask.data()), mask.size());
+
+    ArrowArraySetValidityBitmap(tmp.get(), &bitmap);
+    tmp->null_count =
+      data.size() -
+      ArrowBitCountSet(ArrowArrayValidityBitmap(tmp.get())->buffer.data, 0, mask.size());
+  }
+
+  ArrowBuffer buf;
+  ArrowBufferInit(&buf);
+  NANOARROW_THROW_NOT_OK(
+    ArrowBufferAppend(&buf, reinterpret_cast<void const*>(data.data()), sizeof(T) * data.size()));
+  NANOARROW_THROW_NOT_OK(ArrowArraySetBuffer(tmp.get(), 1, &buf));
+
+  tmp->length = data.size();
+
+  return tmp;
+}
+
+template <typename T>
+std::enable_if_t<std::is_same_v<T, bool>, nanoarrow::UniqueArray> get_nanoarrow_array(
+  std::vector<bool> const& data, std::vector<bool> const& mask = {})
+{
+  nanoarrow::UniqueArray tmp;
+  NANOARROW_THROW_NOT_OK(ArrowArrayInitFromType(tmp.get(), NANOARROW_TYPE_BOOL));
+
+  auto to_arrow_bitmap = [](std::vector<bool> const& b) -> ArrowBitmap {
+    ArrowBitmap out;
+    ArrowBitmapInit(&out);
+    NANOARROW_THROW_NOT_OK(ArrowBitmapResize(&out, b.size(), 1));
+    out.buffer.size_bytes = (b.size() >> 3) + ((b.size() & 7) != 0);
+    out.size_bits         = b.size();
+
+    for (size_t i = 0; i < b.size(); ++i) {
+      ArrowBitSetTo(out.buffer.data, i, static_cast<uint8_t>(b[i]));
+    }
+
+    return out;
+  };
+
+  if (!mask.empty()) {
+    auto validity_bitmap = to_arrow_bitmap(mask);
+    ArrowArraySetValidityBitmap(tmp.get(), &validity_bitmap);
+    tmp->null_count =
+      mask.size() -
+      ArrowBitCountSet(ArrowArrayValidityBitmap(tmp.get())->buffer.data, 0, mask.size());
+  }
+
+  auto raw_buffer = to_arrow_bitmap(data);
+  NANOARROW_THROW_NOT_OK(ArrowArraySetBuffer(tmp.get(), 1, &raw_buffer.buffer));
+  tmp->length = data.size();
+
+  return tmp;
+}
+
+template <typename T, typename B>
+nanoarrow::UniqueArray get_nanoarrow_array(std::initializer_list<T> elements,
+                                           std::initializer_list<B> validity = {})
+{
+  std::vector<B> mask(validity);
+  std::vector<T> data(elements);
+
+  return get_nanoarrow_array<T>(data, mask);
+}
+
+template <typename T>
+std::enable_if_t<std::is_same_v<T, cudf::string_view>, nanoarrow::UniqueArray> get_nanoarrow_array(
+  std::vector<std::string> const& data, std::vector<uint8_t> const& mask = {})
+{
+  nanoarrow::UniqueArray tmp;
+  NANOARROW_THROW_NOT_OK(ArrowArrayInitFromType(tmp.get(), NANOARROW_TYPE_STRING));
+  NANOARROW_THROW_NOT_OK(ArrowArrayStartAppending(tmp.get()));
+  NANOARROW_THROW_NOT_OK(ArrowArrayReserve(tmp.get(), data.size()));
+
+  for (size_t i = 0; i < data.size(); ++i) {
+    if (!mask.empty() && mask[i] == 0) {
+      NANOARROW_THROW_NOT_OK(ArrowArrayAppendNull(tmp.get(), 1));
+    } else {
+      NANOARROW_THROW_NOT_OK(ArrowArrayAppendString(tmp.get(), ArrowCharView(data[i].c_str())));
+    }
+  }
+
+  return tmp;
+}
+
+template <typename KEY_TYPE, typename IND_TYPE>
+nanoarrow::UniqueArray get_nanoarrow_dict_array(std::vector<KEY_TYPE> const& keys,
+                                                std::vector<IND_TYPE> const& ind,
+                                                std::vector<uint8_t> const& validity = {})
+{
+  auto indices_array = get_nanoarrow_array<IND_TYPE>(ind, validity);
+  NANOARROW_THROW_NOT_OK(ArrowArrayAllocateDictionary(indices_array.get()));
+
+  auto keys_array = get_nanoarrow_array<KEY_TYPE>(keys);
+  keys_array.move(indices_array->dictionary);
+
+  return indices_array;
+}
+
+template <typename T>
+nanoarrow::UniqueArray get_nanoarrow_list_array(std::vector<T> const& data,
+                                                std::vector<int32_t> const& offsets,
+                                                std::vector<uint8_t> const& data_validity = {},
+                                                std::vector<uint8_t> const& list_validity = {})
+{
+  auto data_array = get_nanoarrow_array<T>(data, data_validity);
+
+  nanoarrow::UniqueArray tmp;
+  NANOARROW_THROW_NOT_OK(ArrowArrayInitFromType(tmp.get(), NANOARROW_TYPE_LIST));
+  NANOARROW_THROW_NOT_OK(ArrowArrayAllocateChildren(tmp.get(), 1));
+  data_array.move(tmp->children[0]);
+
+  tmp->length = offsets.size() - 1;
+  if (!list_validity.empty()) {
+    ArrowBitmap bitmap;
+    ArrowBitmapInit(&bitmap);
+    NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(&bitmap, list_validity.size()));
+    ArrowBitmapAppendInt8Unsafe(
+      &bitmap, reinterpret_cast<const int8_t*>(list_validity.data()), list_validity.size());
+
+    ArrowArraySetValidityBitmap(tmp.get(), &bitmap);
+    tmp->null_count =
+      tmp->length -
+      ArrowBitCountSet(ArrowArrayValidityBitmap(tmp.get())->buffer.data, 0, list_validity.size());
+  }
+
+  ArrowBuffer buf;
+  ArrowBufferInit(&buf);
+  NANOARROW_THROW_NOT_OK(ArrowBufferAppend(
+    &buf, reinterpret_cast<void const*>(offsets.data()), sizeof(int32_t) * offsets.size()));
+  NANOARROW_THROW_NOT_OK(ArrowArraySetBuffer(tmp.get(), 1, &buf));
+
+  return tmp;
+}
+
+template <typename T>
+nanoarrow::UniqueArray get_nanoarrow_list_array(std::initializer_list<T> data,
+                                                std::initializer_list<int32_t> offsets,
+                                                std::initializer_list<uint8_t> data_validity = {},
+                                                std::initializer_list<uint8_t> list_validity = {})
+{
+  std::vector<T> data_vector(data);
+  std::vector<int32_t> offset(offsets);
+  std::vector<uint8_t> data_mask(data_validity);
+  std::vector<uint8_t> list_mask(list_validity);
+  return get_nanoarrow_list_array<T>(data_vector, offset, data_mask, list_mask);
+}
+
+std::tuple<std::unique_ptr<cudf::table>, nanoarrow::UniqueSchema, generated_test_data>
+get_nanoarrow_cudf_table(cudf::size_type length);
diff --git a/cpp/tests/interop/to_arrow_device_test.cpp b/cpp/tests/interop/to_arrow_device_test.cpp
index 626aeb53cdd..4c73cd637a4 100644
--- a/cpp/tests/interop/to_arrow_device_test.cpp
+++ b/cpp/tests/interop/to_arrow_device_test.cpp
@@ -38,80 +38,55 @@
 
 #include <thrust/iterator/counting_iterator.h>
 
-using vector_of_columns = std::vector<std::unique_ptr<cudf::column>>;
-
-std::tuple<std::unique_ptr<cudf::table>, nanoarrow::UniqueSchema, nanoarrow::UniqueArray>
-get_nanoarrow_tables(cudf::size_type length)
+std::tuple<std::unique_ptr<cudf::table>, nanoarrow::UniqueSchema, generated_test_data>
+get_nanoarrow_cudf_table(cudf::size_type length)
 {
-  std::vector<int64_t> int64_data(length);
-  std::vector<bool> bool_data(length);
-  std::vector<std::string> string_data(length);
-  std::vector<uint8_t> validity(length);
-  std::vector<bool> bool_validity(length);
-  std::vector<uint8_t> bool_data_validity;
-  cudf::size_type length_of_individual_list = 3;
-  cudf::size_type length_of_list            = length_of_individual_list * length;
-  std::vector<int64_t> list_int64_data(length_of_list);
-  std::vector<uint8_t> list_int64_data_validity(length_of_list);
-  std::vector<int32_t> list_offsets(length + 1);
+  generated_test_data test_data(length);
 
   std::vector<std::unique_ptr<cudf::column>> columns;
 
-  std::generate(int64_data.begin(), int64_data.end(), []() { return rand() % 500000; });
-  std::generate(list_int64_data.begin(), list_int64_data.end(), []() { return rand() % 500000; });
-  auto validity_generator = []() { return rand() % 7 != 0; };
-  std::generate(
-    list_int64_data_validity.begin(), list_int64_data_validity.end(), validity_generator);
-  std::generate(
-    list_offsets.begin(), list_offsets.end(), [length_of_individual_list, n = 0]() mutable {
-      return (n++) * length_of_individual_list;
-    });
-  std::generate(bool_data.begin(), bool_data.end(), validity_generator);
-  std::generate(
-    string_data.begin(), string_data.end(), []() { return rand() % 7 != 0 ? "CUDF" : "Rocks"; });
-  std::generate(validity.begin(), validity.end(), validity_generator);
-  std::generate(bool_validity.begin(), bool_validity.end(), validity_generator);
-
-  std::transform(bool_validity.cbegin(),
-                 bool_validity.cend(),
-                 std::back_inserter(bool_data_validity),
-                 [](auto val) { return static_cast<uint8_t>(val); });
-
-  columns.emplace_back(cudf::test::fixed_width_column_wrapper<int64_t>(
-                         int64_data.begin(), int64_data.end(), validity.begin())
+  columns.emplace_back(cudf::test::fixed_width_column_wrapper<int64_t>(test_data.int64_data.begin(),
+                                                                       test_data.int64_data.end(),
+                                                                       test_data.validity.begin())
+                         .release());
+  columns.emplace_back(cudf::test::strings_column_wrapper(test_data.string_data.begin(),
+                                                          test_data.string_data.end(),
+                                                          test_data.validity.begin())
                          .release());
-  columns.emplace_back(
-    cudf::test::strings_column_wrapper(string_data.begin(), string_data.end(), validity.begin())
-      .release());
   auto col4 = cudf::test::fixed_width_column_wrapper<int64_t>(
-    int64_data.begin(), int64_data.end(), validity.begin());
+    test_data.int64_data.begin(), test_data.int64_data.end(), test_data.validity.begin());
   auto dict_col = cudf::dictionary::encode(col4);
   columns.emplace_back(std::move(cudf::dictionary::encode(col4)));
-  columns.emplace_back(cudf::test::fixed_width_column_wrapper<bool>(
-                         bool_data.begin(), bool_data.end(), bool_validity.begin())
+  columns.emplace_back(cudf::test::fixed_width_column_wrapper<bool>(test_data.bool_data.begin(),
+                                                                    test_data.bool_data.end(),
+                                                                    test_data.bool_validity.begin())
                          .release());
-  auto list_child_column = cudf::test::fixed_width_column_wrapper<int64_t>(
-    list_int64_data.begin(), list_int64_data.end(), list_int64_data_validity.begin());
-  auto list_offsets_column =
-    cudf::test::fixed_width_column_wrapper<int32_t>(list_offsets.begin(), list_offsets.end());
+  auto list_child_column =
+    cudf::test::fixed_width_column_wrapper<int64_t>(test_data.list_int64_data.begin(),
+                                                    test_data.list_int64_data.end(),
+                                                    test_data.list_int64_data_validity.begin());
+  auto list_offsets_column = cudf::test::fixed_width_column_wrapper<int32_t>(
+    test_data.list_offsets.begin(), test_data.list_offsets.end());
   auto [list_mask, list_nulls] = cudf::bools_to_mask(cudf::test::fixed_width_column_wrapper<bool>(
-    bool_data_validity.begin(), bool_data_validity.end()));
+    test_data.bool_data_validity.begin(), test_data.bool_data_validity.end()));
   columns.emplace_back(cudf::make_lists_column(length,
                                                list_offsets_column.release(),
                                                list_child_column.release(),
                                                list_nulls,
                                                std::move(*list_mask)));
-  auto int_column = cudf::test::fixed_width_column_wrapper<int64_t>(
-                      int64_data.begin(), int64_data.end(), validity.begin())
-                      .release();
+  auto int_column =
+    cudf::test::fixed_width_column_wrapper<int64_t>(
+      test_data.int64_data.begin(), test_data.int64_data.end(), test_data.validity.begin())
+      .release();
   auto str_column =
-    cudf::test::strings_column_wrapper(string_data.begin(), string_data.end(), validity.begin())
+    cudf::test::strings_column_wrapper(
+      test_data.string_data.begin(), test_data.string_data.end(), test_data.validity.begin())
       .release();
   vector_of_columns cols;
   cols.push_back(move(int_column));
   cols.push_back(move(str_column));
   auto [null_mask, null_count] = cudf::bools_to_mask(cudf::test::fixed_width_column_wrapper<bool>(
-    bool_data_validity.begin(), bool_data_validity.end()));
+    test_data.bool_data_validity.begin(), test_data.bool_data_validity.end()));
   columns.emplace_back(
     cudf::make_structs_column(length, std::move(cols), null_count, std::move(*null_mask)));
 
@@ -198,21 +173,30 @@ get_nanoarrow_tables(cudf::size_type length)
     schema->children[5]->flags = 0;
   }
 
+  return std::make_tuple(
+    std::make_unique<cudf::table>(std::move(columns)), std::move(schema), std::move(test_data));
+}
+
+std::tuple<std::unique_ptr<cudf::table>, nanoarrow::UniqueSchema, nanoarrow::UniqueArray>
+get_nanoarrow_tables(cudf::size_type length)
+{
+  auto [table, schema, test_data] = get_nanoarrow_cudf_table(length);
+
   nanoarrow::UniqueArray arrow;
   NANOARROW_THROW_NOT_OK(ArrowArrayInitFromSchema(arrow.get(), schema.get(), nullptr));
   arrow->length = length;
 
-  populate_from_col<int64_t>(arrow->children[0], columns[0]->view());
-  populate_from_col<cudf::string_view>(arrow->children[1], columns[1]->view());
-  populate_dict_from_col<int64_t, uint32_t>(arrow->children[2],
-                                            cudf::dictionary_column_view(columns[2]->view()));
+  populate_from_col<int64_t>(arrow->children[0], table->get_column(0).view());
+  populate_from_col<cudf::string_view>(arrow->children[1], table->get_column(1).view());
+  populate_dict_from_col<int64_t, uint32_t>(
+    arrow->children[2], cudf::dictionary_column_view(table->get_column(2).view()));
 
-  populate_from_col<bool>(arrow->children[3], columns[3]->view());
-  cudf::lists_column_view list_view{columns[4]->view()};
+  populate_from_col<bool>(arrow->children[3], table->get_column(3).view());
+  cudf::lists_column_view list_view{table->get_column(4).view()};
   populate_list_from_col(arrow->children[4], list_view);
   populate_from_col<int64_t>(arrow->children[4]->children[0], list_view.child());
 
-  cudf::structs_column_view struct_view{columns[5]->view()};
+  cudf::structs_column_view struct_view{table->get_column(5).view()};
   populate_from_col<int64_t>(arrow->children[5]->children[0], struct_view.child(0));
   populate_from_col<cudf::string_view>(arrow->children[5]->children[1], struct_view.child(1));
   arrow->children[5]->length     = struct_view.size();
@@ -231,8 +215,7 @@ get_nanoarrow_tables(cudf::size_type length)
     CUDF_FAIL("failed to build example arrays");
   }
 
-  return std::make_tuple(
-    std::make_unique<cudf::table>(std::move(columns)), std::move(schema), std::move(arrow));
+  return std::make_tuple(std::move(table), std::move(schema), std::move(arrow));
 }
 
 // populate an ArrowArray list array from device buffers using a no-op

From 12336da6ff3ae819635524127e65c0bfde0f3915 Mon Sep 17 00:00:00 2001
From: Paul Mattione <156858817+pmattione-nvidia@users.noreply.github.com>
Date: Wed, 29 May 2024 14:47:51 -0400
Subject: [PATCH 273/842] Utilities for decimal <--> floating conversion
 (#15359)

These are some utilities used by the upcoming decimal <--> floating conversion PR.  This has been submitted separately from that PR in order to spread out the complexity for review.  These functions are not called by any code in this PR.

One function is used to extract the components of the floating point number.  Another function is used to set a floating point's sign bit and add some additional powers of two.  These are done using integer and bit operations, which is much faster than using the built-in functions and bottle-necking on the FP64 pipeline.  The final function is used to count the # of significant bits in a number.

Authors:
  - Paul Mattione (https://github.com/pmattione-nvidia)
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Mark Harris (https://github.com/harrism)
  - Bradley Dice (https://github.com/bdice)
  - Mike Wilson (https://github.com/hyperbolic2346)

URL: https://github.com/rapidsai/cudf/pull/15359
---
 .../cudf/fixed_point/floating_conversion.hpp  | 241 ++++++++++++++++++
 1 file changed, 241 insertions(+)

diff --git a/cpp/include/cudf/fixed_point/floating_conversion.hpp b/cpp/include/cudf/fixed_point/floating_conversion.hpp
index 492f7e75219..2c3a5c5629d 100644
--- a/cpp/include/cudf/fixed_point/floating_conversion.hpp
+++ b/cpp/include/cudf/fixed_point/floating_conversion.hpp
@@ -16,8 +16,13 @@
 
 #pragma once
 
+#include <cudf/utilities/traits.hpp>
+
+#include <cuda/std/limits>
 #include <cuda/std/type_traits>
 
+#include <cstring>
+
 namespace numeric {
 
 /**
@@ -29,6 +34,242 @@ namespace numeric {
 
 namespace detail {
 
+/**
+ * @brief Helper struct for getting and setting the components of a floating-point value
+ *
+ * @tparam FloatingType Type of floating-point value
+ */
+template <typename FloatingType, CUDF_ENABLE_IF(cuda::std::is_floating_point_v<FloatingType>)>
+struct floating_converter {
+  // This struct assumes we're working with IEEE 754 floating-point values.
+  // Details on the IEEE-754 floating-point format:
+  // Format: https://learn.microsoft.com/en-us/cpp/build/ieee-floating-point-representation
+  // Float Visualizer: https://www.h-schmidt.net/FloatConverter/IEEE754.html
+  static_assert(cuda::std::numeric_limits<FloatingType>::is_iec559, "Assumes IEEE 754");
+
+  /// Unsigned int type with same size as floating type
+  using IntegralType =
+    cuda::std::conditional_t<cuda::std::is_same_v<FloatingType, float>, uint32_t, uint64_t>;
+
+  // The high bit is the sign bit (0 for positive, 1 for negative).
+  /// How many bits in the floating type
+  static constexpr int num_floating_bits = sizeof(FloatingType) * CHAR_BIT;
+  /// The index of the sign bit
+  static constexpr int sign_bit_index = num_floating_bits - 1;
+  /// The mask to select the sign bit
+  static constexpr IntegralType sign_mask = (IntegralType(1) << sign_bit_index);
+
+  // The low 23 / 52 bits (for float / double) are the mantissa.
+  // The mantissa is normalized. There is an understood 1 bit to the left of the binary point.
+  // The value of the mantissa is in the range [1, 2).
+  /// # mantissa bits (-1 for understood bit)
+  static constexpr int num_mantissa_bits = cuda::std::numeric_limits<FloatingType>::digits - 1;
+  /// The mask for the understood bit
+  static constexpr IntegralType understood_bit_mask = (IntegralType(1) << num_mantissa_bits);
+  /// The mask to select the mantissa
+  static constexpr IntegralType mantissa_mask = understood_bit_mask - 1;
+
+  // And in between are the bits used to store the biased power-of-2 exponent.
+  /// # exponents bits (-1 for sign bit)
+  static constexpr int num_exponent_bits = num_floating_bits - num_mantissa_bits - 1;
+  /// The mask for the exponents, unshifted
+  static constexpr IntegralType unshifted_exponent_mask =
+    (IntegralType(1) << num_exponent_bits) - 1;
+  /// The mask to select the exponents
+  static constexpr IntegralType exponent_mask = unshifted_exponent_mask << num_mantissa_bits;
+
+  // To store positive and negative exponents as unsigned values, the stored value for
+  // the power-of-2 is exponent + bias. The bias is 127 for floats and 1023 for doubles.
+  /// 127 / 1023 for float / double
+  static constexpr IntegralType exponent_bias =
+    cuda::std::numeric_limits<FloatingType>::max_exponent - 1;
+
+  /**
+   * @brief Reinterpret the bits of a floating-point value as an integer
+   *
+   * @param floating The floating-point value to cast
+   * @return An integer with bits identical to the input
+   */
+  CUDF_HOST_DEVICE inline static IntegralType bit_cast_to_integer(FloatingType floating)
+  {
+    // Convert floating to integer
+    IntegralType integer_rep;
+    memcpy(&integer_rep, &floating, sizeof(floating));
+    return integer_rep;
+  }
+
+  /**
+   * @brief Reinterpret the bits of an integer as floating-point value
+   *
+   * @param integer The integer to cast
+   * @return A floating-point value with bits identical to the input
+   */
+  CUDF_HOST_DEVICE inline static FloatingType bit_cast_to_floating(IntegralType integer)
+  {
+    // Convert back to float
+    FloatingType floating;
+    memcpy(&floating, &integer, sizeof(floating));
+    return floating;
+  }
+
+  /**
+   * @brief Extracts the integral significand of a bit-casted floating-point number
+   *
+   * @param integer_rep The bit-casted floating value to extract the exponent from
+   * @return The integral significand, bit-shifted to a (large) whole number
+   */
+  CUDF_HOST_DEVICE inline static IntegralType get_base2_value(IntegralType integer_rep)
+  {
+    // Extract the significand, setting the high bit for the understood 1/2
+    return (integer_rep & mantissa_mask) | understood_bit_mask;
+  }
+
+  /**
+   * @brief Extracts the sign bit of a bit-casted floating-point number
+   *
+   * @param integer_rep The bit-casted floating value to extract the exponent from
+   * @return The sign bit
+   */
+  CUDF_HOST_DEVICE inline static bool get_is_negative(IntegralType integer_rep)
+  {
+    // Extract the sign bit:
+    return static_cast<bool>(sign_mask & integer_rep);
+  }
+
+  /**
+   * @brief Extracts the exponent of a bit-casted floating-point number
+   *
+   * @note This returns INT_MIN for +/-0, +/-inf, NaN's, and denormals
+   * For all of these cases, the decimal fixed_point number should be set to zero
+   *
+   * @param integer_rep The bit-casted floating value to extract the exponent from
+   * @return The stored base-2 exponent, or INT_MIN for special values
+   */
+  CUDF_HOST_DEVICE inline static int get_exp2(IntegralType integer_rep)
+  {
+    // First extract the exponent bits and handle its special values.
+    // To minimize branching, all of these special cases will return INT_MIN.
+    // For all of these cases, the decimal fixed_point number should be set to zero.
+    auto const exponent_bits = integer_rep & exponent_mask;
+    if (exponent_bits == 0) {
+      // Because of the understood set-bit not stored in the mantissa, it is not possible
+      // to store the value zero directly. Instead both +/-0 and denormals are represented with
+      // the exponent bits set to zero.
+      // Thus it's fastest to just floor (generally unwanted) denormals to zero.
+      return INT_MIN;
+    } else if (exponent_bits == exponent_mask) {
+      //+/-inf and NaN values are stored with all of the exponent bits set.
+      // As none of these are representable by integers, we'll return the same value for all cases.
+      return INT_MIN;
+    }
+
+    // Extract the exponent value: shift the bits down and subtract the bias.
+    using SignedIntegralType                       = cuda::std::make_signed_t<IntegralType>;
+    SignedIntegralType const shifted_exponent_bits = exponent_bits >> num_mantissa_bits;
+    return shifted_exponent_bits - static_cast<SignedIntegralType>(exponent_bias);
+  }
+
+  /**
+   * @brief Sets the sign bit of a positive floating-point number
+   *
+   * @param floating The floating-point value to set the sign of. Must be positive.
+   * @param is_negative The sign bit to set for the floating-point number
+   * @return The input floating-point value with the chosen sign
+   */
+  CUDF_HOST_DEVICE inline static FloatingType set_is_negative(FloatingType floating,
+                                                              bool is_negative)
+  {
+    // Convert floating to integer
+    IntegralType integer_rep = bit_cast_to_integer(floating);
+
+    // Set the sign bit. Note that the input floating-point number must be positive (bit = 0).
+    integer_rep |= (IntegralType(is_negative) << sign_bit_index);
+
+    // Convert back to float
+    return bit_cast_to_floating(integer_rep);
+  }
+
+  /**
+   * @brief Adds to the base-2 exponent of a floating-point number
+   *
+   * @param floating The floating value to add to the exponent of. Must be positive.
+   * @param exp2 The power-of-2 to add to the floating-point number
+   * @return The input floating-point value * 2^exp2
+   */
+  CUDF_HOST_DEVICE inline static FloatingType add_exp2(FloatingType floating, int exp2)
+  {
+    // Convert floating to integer
+    auto integer_rep = bit_cast_to_integer(floating);
+
+    // Extract the currently stored (biased) exponent
+    auto exponent_bits = integer_rep & exponent_mask;
+    auto stored_exp2   = exponent_bits >> num_mantissa_bits;
+
+    // Add the additional power-of-2
+    stored_exp2 += exp2;
+
+    // Check for exponent over/under-flow.
+    // Note that the input floating-point number is always positive, so we don't have to
+    // worry about the sign here; the sign will be set later in set_is_negative()
+    if (stored_exp2 <= 0) {
+      return 0.0;
+    } else if (stored_exp2 >= unshifted_exponent_mask) {
+      return cuda::std::numeric_limits<FloatingType>::infinity();
+    } else {
+      // Clear existing exponent bits and set new ones
+      exponent_bits = stored_exp2 << num_mantissa_bits;
+      integer_rep &= (~exponent_mask);
+      integer_rep |= exponent_bits;
+
+      // Convert back to float
+      return bit_cast_to_floating(integer_rep);
+    }
+  }
+};
+
+/**
+ * @brief Determine the number of significant bits in an integer
+ *
+ * @tparam T Type of input integer value. Must be either uint32_t, uint64_t, or __uint128_t
+ * @param value The integer whose bits are being counted
+ * @return The number of significant bits: the # of bits - # of leading zeroes
+ */
+template <typename T,
+          CUDF_ENABLE_IF(std::is_same_v<T, uint32_t> || std::is_same_v<T, uint64_t> ||
+                         std::is_same_v<T, __uint128_t>)>
+CUDF_HOST_DEVICE inline int count_significant_bits(T value)
+{
+#ifdef __CUDA_ARCH__
+  if constexpr (std::is_same_v<T, uint64_t>) {
+    return 64 - __clzll(static_cast<int64_t>(value));
+  } else if constexpr (std::is_same_v<T, uint32_t>) {
+    return 32 - __clz(static_cast<int32_t>(value));
+  } else if constexpr (std::is_same_v<T, __uint128_t>) {
+    // 128 bit type, must break up into high and low components
+    auto const high_bits = static_cast<int64_t>(value >> 64);
+    auto const low_bits  = static_cast<int64_t>(value);
+    return 128 - (__clzll(high_bits) + static_cast<int>(high_bits == 0) * __clzll(low_bits));
+  }
+#else
+  // Undefined behavior to call __builtin_clzll() with zero in gcc and clang
+  if (value == 0) { return 0; }
+
+  if constexpr (std::is_same_v<T, uint64_t>) {
+    return 64 - __builtin_clzll(value);
+  } else if constexpr (std::is_same_v<T, uint32_t>) {
+    return 32 - __builtin_clz(value);
+  } else if constexpr (std::is_same_v<T, __uint128_t>) {
+    // 128 bit type, must break up into high and low components
+    auto const high_bits = static_cast<uint64_t>(value >> 64);
+    if (high_bits == 0) {
+      return 64 - __builtin_clzll(static_cast<uint64_t>(value));
+    } else {
+      return 128 - __builtin_clzll(high_bits);
+    }
+  }
+#endif
+}
+
 /**
  * @brief Recursively calculate a signed large power of 10 (>= 10^19) that can only be stored in an
  * 128bit integer

From 3a75f6db18c911d93727d12a0cf5abcdad22efda Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Wed, 29 May 2024 15:10:55 -0700
Subject: [PATCH 274/842] Use rapids-build-backend. (#15245)

This PR uses `rapids-build-backend` to simplify wheel builds and reduce the complexity of various CI/build scripts.

See also:
- https://github.com/rapidsai/rapids-build-backend
- https://github.com/rapidsai/build-planning/issues/31

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Bradley Dice (https://github.com/bdice)
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/15245
---
 .pre-commit-config.yaml                       |  2 +-
 build.sh                                      |  2 +-
 ci/build_python.sh                            | 17 ++--
 ci/build_wheel.sh                             | 46 +----------
 ci/build_wheel_cudf.sh                        |  2 +-
 ci/build_wheel_dask_cudf.sh                   |  2 +-
 ci/release/update-version.sh                  |  4 +-
 .../all_cuda-118_arch-x86_64.yaml             |  6 +-
 .../all_cuda-122_arch-x86_64.yaml             |  6 +-
 conda/recipes/cudf/meta.yaml                  |  1 +
 conda/recipes/cudf_kafka/meta.yaml            |  1 +
 conda/recipes/custreamz/meta.yaml             |  4 +-
 conda/recipes/dask-cudf/meta.yaml             |  4 +-
 dependencies.yaml                             | 79 ++++++++++++-------
 python/cudf/cudf/_version.py                  | 19 ++++-
 python/cudf/cudf/tests/test_version.py        | 12 +++
 python/cudf/pyproject.toml                    | 24 ++++--
 python/cudf_kafka/cudf_kafka/_version.py      | 16 +++-
 python/cudf_kafka/pyproject.toml              | 22 ++++--
 python/cudf_polars/cudf_polars/_version.py    | 21 +++++
 python/cudf_polars/pyproject.toml             | 10 ++-
 python/custreamz/custreamz/_version.py        | 16 +++-
 .../custreamz/custreamz/tests/test_version.py | 12 +++
 python/custreamz/pyproject.toml               | 12 ++-
 python/dask_cudf/dask_cudf/_version.py        | 16 +++-
 .../dask_cudf/dask_cudf/tests/test_version.py | 13 +++
 python/dask_cudf/pyproject.toml               | 14 +++-
 27 files changed, 251 insertions(+), 132 deletions(-)
 create mode 100644 python/cudf/cudf/tests/test_version.py
 create mode 100644 python/cudf_polars/cudf_polars/_version.py
 create mode 100644 python/custreamz/custreamz/tests/test_version.py
 create mode 100644 python/dask_cudf/dask_cudf/tests/test_version.py

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 2d3ffc287e9..8865fb48e0d 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -129,7 +129,7 @@ repos:
             ^CHANGELOG.md$
           )
   - repo: https://github.com/rapidsai/dependency-file-generator
-    rev: v1.13.4
+    rev: v1.13.11
     hooks:
       - id: rapids-dependency-file-generator
         args: ["--clean"]
diff --git a/build.sh b/build.sh
index 43bb04f7a18..4291c88ea12 100755
--- a/build.sh
+++ b/build.sh
@@ -70,7 +70,7 @@ BUILD_PER_THREAD_DEFAULT_STREAM=OFF
 BUILD_REPORT_METRICS=OFF
 BUILD_REPORT_INCL_CACHE_STATS=OFF
 USE_PROPRIETARY_NVCOMP=ON
-PYTHON_ARGS_FOR_INSTALL="-m pip install --no-build-isolation --no-deps"
+PYTHON_ARGS_FOR_INSTALL="-m pip install --no-build-isolation --no-deps --config-settings rapidsai.disable-cuda=true"
 
 # Set defaults for vars that may not have been defined externally
 #  FIXME: if INSTALL_PREFIX is not set, check PREFIX, then check
diff --git a/ci/build_python.sh b/ci/build_python.sh
index 3c2a7761e1a..79e09432779 100755
--- a/ci/build_python.sh
+++ b/ci/build_python.sh
@@ -13,14 +13,7 @@ export CMAKE_GENERATOR=Ninja
 
 rapids-print-env
 
-package_dir="python"
-version=$(rapids-generate-version)
-commit=$(git rev-parse HEAD)
-
-echo "${version}" > VERSION
-for package_name in cudf dask_cudf cudf_kafka custreamz; do
-    sed -i "/^__git_commit__/ s/= .*/= \"${commit}\"/g" ${package_dir}/${package_name}/${package_name}/_version.py
-done
+rapids-generate-version > ./VERSION
 
 rapids-logger "Begin py build"
 
@@ -29,24 +22,24 @@ CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp)
 # TODO: Remove `--no-test` flag once importing on a CPU
 # node works correctly
 # With boa installed conda build forwards to the boa builder
-RAPIDS_PACKAGE_VERSION=${version} rapids-conda-retry mambabuild \
+RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \
   --no-test \
   --channel "${CPP_CHANNEL}" \
   conda/recipes/cudf
 
-RAPIDS_PACKAGE_VERSION=${version} rapids-conda-retry mambabuild \
+RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \
   --no-test \
   --channel "${CPP_CHANNEL}" \
   --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \
   conda/recipes/dask-cudf
 
-RAPIDS_PACKAGE_VERSION=${version} rapids-conda-retry mambabuild \
+RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \
   --no-test \
   --channel "${CPP_CHANNEL}" \
   --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \
   conda/recipes/cudf_kafka
 
-RAPIDS_PACKAGE_VERSION=${version} rapids-conda-retry mambabuild \
+RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \
   --no-test \
   --channel "${CPP_CHANNEL}" \
   --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \
diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh
index c4b794e81f7..7c1fa705faa 100755
--- a/ci/build_wheel.sh
+++ b/ci/build_wheel.sh
@@ -3,54 +3,12 @@
 
 set -euo pipefail
 
-package_name=$1
-package_dir=$2
+package_dir=$1
 
 source rapids-configure-sccache
 source rapids-date-string
 
-version=$(rapids-generate-version)
-commit=$(git rev-parse HEAD)
-
-RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
-
-# This is the version of the suffix with a preceding hyphen. It's used
-# everywhere except in the final wheel name.
-PACKAGE_CUDA_SUFFIX="-${RAPIDS_PY_CUDA_SUFFIX}"
-
-# Patch project metadata files to include the CUDA version suffix and version override.
-pyproject_file="${package_dir}/pyproject.toml"
-
-sed -i "s/^name = \"${package_name}\"/name = \"${package_name}${PACKAGE_CUDA_SUFFIX}\"/g" ${pyproject_file}
-echo "${version}" > VERSION
-sed -i "/^__git_commit__/ s/= .*/= \"${commit}\"/g" "${package_dir}/${package_name//-/_}/_version.py"
-
-# For nightlies we want to ensure that we're pulling in alphas as well. The
-# easiest way to do so is to augment the spec with a constraint containing a
-# min alpha version that doesn't affect the version bounds but does allow usage
-# of alpha versions for that dependency without --pre
-alpha_spec=''
-if ! rapids-is-release-build; then
-    alpha_spec=',>=0.0.0a0'
-fi
-
-if [[ ${package_name} == "dask-cudf" ]]; then
-    sed -r -i "s/cudf==(.*)\"/cudf${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file}
-    sed -r -i "s/dask-cuda==(.*)\"/dask-cuda==\1${alpha_spec}\"/g" ${pyproject_file}
-    sed -r -i "s/rapids-dask-dependency==(.*)\"/rapids-dask-dependency==\1${alpha_spec}\"/g" ${pyproject_file}
-else
-    sed -r -i "s/rmm(.*)\"/rmm${PACKAGE_CUDA_SUFFIX}\1${alpha_spec}\"/g" ${pyproject_file}
-    # ptxcompiler and cubinlinker aren't version constrained
-    sed -r -i "s/ptxcompiler\"/ptxcompiler${PACKAGE_CUDA_SUFFIX}\"/g" ${pyproject_file}
-    sed -r -i "s/cubinlinker\"/cubinlinker${PACKAGE_CUDA_SUFFIX}\"/g" ${pyproject_file}
-fi
-
-if [[ $PACKAGE_CUDA_SUFFIX == "-cu12" ]]; then
-    sed -i "s/cuda-python[<=>\.,0-9a]*/cuda-python>=12.0,<13.0a0/g" ${pyproject_file}
-    sed -i "s/cupy-cuda11x/cupy-cuda12x/g" ${pyproject_file}
-    sed -i "s/ptxcompiler/pynvjitlink/g" ${pyproject_file}
-    sed -i "/cubinlinker/d" ${pyproject_file}
-fi
+rapids-generate-version > ./VERSION
 
 cd "${package_dir}"
 
diff --git a/ci/build_wheel_cudf.sh b/ci/build_wheel_cudf.sh
index f0886a28fd9..1b563bc499c 100755
--- a/ci/build_wheel_cudf.sh
+++ b/ci/build_wheel_cudf.sh
@@ -7,7 +7,7 @@ package_dir="python/cudf"
 
 export SKBUILD_CMAKE_ARGS="-DUSE_LIBARROW_FROM_PYARROW=ON"
 
-./ci/build_wheel.sh cudf ${package_dir}
+./ci/build_wheel.sh ${package_dir}
 
 python -m auditwheel repair -w ${package_dir}/final_dist ${package_dir}/dist/*
 
diff --git a/ci/build_wheel_dask_cudf.sh b/ci/build_wheel_dask_cudf.sh
index 150fec4e2d7..eb2a91289f7 100755
--- a/ci/build_wheel_dask_cudf.sh
+++ b/ci/build_wheel_dask_cudf.sh
@@ -5,7 +5,7 @@ set -euo pipefail
 
 package_dir="python/dask_cudf"
 
-./ci/build_wheel.sh dask-cudf ${package_dir}
+./ci/build_wheel.sh ${package_dir}
 
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
 RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 ${package_dir}/dist
diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index beeb130f0f1..f629de64905 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -58,10 +58,10 @@ DEPENDENCIES=(
 )
 for DEP in "${DEPENDENCIES[@]}"; do
   for FILE in dependencies.yaml conda/environments/*.yaml; do
-    sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}.*/g" "${FILE}"
+    sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0/g" "${FILE}"
   done
   for FILE in python/*/pyproject.toml; do
-    sed_runner "/\"${DEP}==/ s/==.*\"/==${NEXT_SHORT_TAG_PEP440}.*\"/g" ${FILE}
+    sed_runner "/\"${DEP}==/ s/==.*\"/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0\"/g" ${FILE}
   done
 done
 
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 985f873e5eb..946e2d1cd32 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -27,6 +27,7 @@ dependencies:
 - cxx-compiler
 - cython>=3.0.3
 - dask-cuda==24.8.*
+- dask-cuda==24.8.*,>=0.0.0a0
 - dlpack>=0.8,<1.0
 - doxygen=1.9.1
 - fastavro>=0.22.9
@@ -76,9 +77,10 @@ dependencies:
 - python-confluent-kafka>=1.9.0,<1.10.0a0
 - python>=3.9,<3.12
 - pytorch>=2.1.0
-- rapids-dask-dependency==24.8.*
+- rapids-build-backend>=0.3.0,<0.4.0.dev0
+- rapids-dask-dependency==24.8.*,>=0.0.0a0
 - rich
-- rmm==24.8.*
+- rmm==24.8.*,>=0.0.0a0
 - s3fs>=2022.3.0
 - scikit-build-core>=0.7.0
 - scipy
diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml
index 3083d1dbb03..f069616ddbe 100644
--- a/conda/environments/all_cuda-122_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-122_arch-x86_64.yaml
@@ -28,6 +28,7 @@ dependencies:
 - cxx-compiler
 - cython>=3.0.3
 - dask-cuda==24.8.*
+- dask-cuda==24.8.*,>=0.0.0a0
 - dlpack>=0.8,<1.0
 - doxygen=1.9.1
 - fastavro>=0.22.9
@@ -74,9 +75,10 @@ dependencies:
 - python-confluent-kafka>=1.9.0,<1.10.0a0
 - python>=3.9,<3.12
 - pytorch>=2.1.0
-- rapids-dask-dependency==24.8.*
+- rapids-build-backend>=0.3.0,<0.4.0.dev0
+- rapids-dask-dependency==24.8.*,>=0.0.0a0
 - rich
-- rmm==24.8.*
+- rmm==24.8.*,>=0.0.0a0
 - s3fs>=2022.3.0
 - scikit-build-core>=0.7.0
 - scipy
diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml
index e7245e67659..3cdc2050631 100644
--- a/conda/recipes/cudf/meta.yaml
+++ b/conda/recipes/cudf/meta.yaml
@@ -61,6 +61,7 @@ requirements:
   host:
     - python
     - cython >=3.0.3
+    - rapids-build-backend >=0.3.0,<0.4.0.dev0
     - scikit-build-core >=0.7.0
     - dlpack >=0.8,<1.0
     - numpy 1.23
diff --git a/conda/recipes/cudf_kafka/meta.yaml b/conda/recipes/cudf_kafka/meta.yaml
index 4d91cf6320c..1b0e0e2c236 100644
--- a/conda/recipes/cudf_kafka/meta.yaml
+++ b/conda/recipes/cudf_kafka/meta.yaml
@@ -60,6 +60,7 @@ requirements:
     - cuda-version ={{ cuda_version }}
     - cudf ={{ version }}
     - libcudf_kafka ={{ version }}
+    - rapids-build-backend >=0.3.0,<0.4.0.dev0
     - scikit-build-core >=0.7.0
     {% if cuda_major != "11" %}
     - cuda-cudart-dev
diff --git a/conda/recipes/custreamz/meta.yaml b/conda/recipes/custreamz/meta.yaml
index 755394e3936..f5ea426e0b1 100644
--- a/conda/recipes/custreamz/meta.yaml
+++ b/conda/recipes/custreamz/meta.yaml
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2023, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
 
 {% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %}
 {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %}
@@ -37,6 +37,8 @@ build:
 requirements:
   host:
     - python
+    - rapids-build-backend >=0.3.0,<0.4.0.dev0
+    - setuptools
     - python-confluent-kafka >=1.9.0,<1.10.0a0
     - cudf_kafka ={{ version }}
     - cuda-version ={{ cuda_version }}
diff --git a/conda/recipes/dask-cudf/meta.yaml b/conda/recipes/dask-cudf/meta.yaml
index 16638926492..1e6c0a35a09 100644
--- a/conda/recipes/dask-cudf/meta.yaml
+++ b/conda/recipes/dask-cudf/meta.yaml
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2023, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
 
 {% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %}
 {% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %}
@@ -37,6 +37,8 @@ build:
 requirements:
   host:
     - python
+    - rapids-build-backend >=0.3.0,<0.4.0.dev0
+    - setuptools
     - cuda-version ={{ cuda_version }}
   run:
     - python
diff --git a/dependencies.yaml b/dependencies.yaml
index 3df7cb71a78..8bfa3190b3d 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -9,7 +9,6 @@ files:
       - build_base
       - build_all
       - build_cpp
-      - build_wheels
       - build_python_common
       - build_python_cudf
       - cuda
@@ -19,6 +18,8 @@ files:
       - libarrow_build
       - notebooks
       - py_version
+      - rapids_build_skbuild
+      - rapids_build_setuptools
       - run_common
       - run_cudf
       - run_dask_cudf
@@ -75,11 +76,19 @@ files:
       - docs
       - libarrow_run
       - py_version
-  py_build_cudf:
+  py_rapids_build_cudf:
     output: pyproject
     pyproject_dir: python/cudf
     extras:
       table: build-system
+    includes:
+      - rapids_build_skbuild
+  py_build_cudf:
+    output: pyproject
+    pyproject_dir: python/cudf
+    extras:
+      table: tool.rapids-build-backend
+      key: requires
     includes:
       - build_base
       - build_python_common
@@ -119,13 +128,13 @@ files:
       key: cudf-pandas-tests
     includes:
       - test_python_cudf_pandas
-  py_build_cudf_polars:
+  py_rapids_build_cudf_polars:
     output: pyproject
     pyproject_dir: python/cudf_polars
     extras:
       table: build-system
     includes:
-      - build_wheels
+      - rapids_build_setuptools
   py_run_cudf_polars:
     output: pyproject
     pyproject_dir: python/cudf_polars
@@ -148,7 +157,7 @@ files:
     extras:
       table: build-system
     includes:
-      - build_wheels
+      - rapids_build_setuptools
   py_run_dask_cudf:
     output: pyproject
     pyproject_dir: python/dask_cudf
@@ -168,11 +177,19 @@ files:
     includes:
       - test_python_common
       - test_python_dask_cudf
-  py_build_cudf_kafka:
+  py_rapids_build_cudf_kafka:
     output: pyproject
     pyproject_dir: python/cudf_kafka
     extras:
       table: build-system
+    includes:
+      - rapids_build_skbuild
+  py_build_cudf_kafka:
+    output: pyproject
+    pyproject_dir: python/cudf_kafka
+    extras:
+      table: tool.rapids-build-backend
+      key: requires
     includes:
       - build_base
       - build_python_common
@@ -197,7 +214,7 @@ files:
     extras:
       table: build-system
     includes:
-      - build_wheels
+      - rapids_build_setuptools
   py_run_custreamz:
     output: pyproject
     pyproject_dir: python/custreamz
@@ -276,12 +293,24 @@ dependencies:
           # Align nvcomp version with rapids-cmake
           - nvcomp==3.0.6
           - spdlog>=1.12.0,<1.13
-  build_wheels:
+  rapids_build_skbuild:
+    common:
+      - output_types: [conda, requirements, pyproject]
+        packages:
+          - &rapids_build_backend rapids-build-backend>=0.3.0,<0.4.0.dev0
+      - output_types: conda
+        packages:
+          - scikit-build-core>=0.7.0
+      - output_types: [requirements, pyproject]
+        packages:
+          - scikit-build-core[pyproject]>=0.7.0
+  rapids_build_setuptools:
     common:
       - output_types: [requirements, pyproject]
         packages:
-          - wheel
+          - *rapids_build_backend
           - setuptools
+          - wheel
   build_python_common:
     common:
       - output_types: [conda, requirements, pyproject]
@@ -290,22 +319,16 @@ dependencies:
           # Hard pin the patch version used during the build. This must be kept
           # in sync with the version pinned in get_arrow.cmake.
           - pyarrow==16.1.0.*
-      - output_types: conda
-        packages:
-          - scikit-build-core>=0.7.0
       - output_types: pyproject
         packages:
           # Hard pin the patch version used during the build.
           # Sync with conda build constraint & wheel run constraint.
           - numpy==1.23.*
-      - output_types: [requirements, pyproject]
-        packages:
-          - scikit-build-core[pyproject]>=0.7.0
   build_python_cudf:
     common:
       - output_types: conda
         packages:
-          - &rmm_conda rmm==24.8.*
+          - &rmm_conda rmm==24.8.*,>=0.0.0a0
           - pip
           - pip:
               - git+https://github.com/python-streamz/streamz.git@master
@@ -321,10 +344,10 @@ dependencies:
         matrices:
           - matrix: {cuda: "12.*"}
             packages: &build_python_packages_cu12
-              - &rmm_cu12 rmm-cu12==24.8.*
+              - rmm-cu12==24.8.*,>=0.0.0a0
           - matrix: {cuda: "11.*"}
             packages: &build_python_packages_cu11
-              - &rmm_cu11 rmm-cu11==24.8.*
+              - rmm-cu11==24.8.*,>=0.0.0a0
           - {matrix: null, packages: [*rmm_conda] }
   libarrow_build:
     common:
@@ -568,11 +591,11 @@ dependencies:
         matrices:
           - matrix: {cuda: "12.*"}
             packages:
-              - rmm-cu12==24.8.*
+              - rmm-cu12==24.8.*,>=0.0.0a0
               - pynvjitlink-cu12
           - matrix: {cuda: "11.*"}
             packages:
-              - rmm-cu11==24.8.*
+              - rmm-cu11==24.8.*,>=0.0.0a0
               - cubinlinker-cu11
               - ptxcompiler-cu11
           - {matrix: null, packages: [cubinlinker, ptxcompiler, *rmm_conda]}
@@ -585,7 +608,7 @@ dependencies:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
-          - rapids-dask-dependency==24.8.*
+          - rapids-dask-dependency==24.8.*,>=0.0.0a0
   run_custreamz:
     common:
       - output_types: conda
@@ -671,13 +694,13 @@ dependencies:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
-          - dask-cuda==24.8.*
+          - dask-cuda==24.8.*,>=0.0.0a0
           - *numba
   depends_on_cudf:
     common:
       - output_types: conda
         packages:
-          - &cudf_conda cudf==24.8.*
+          - &cudf_conda cudf==24.8.*,>=0.0.0a0
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
@@ -689,16 +712,16 @@ dependencies:
         matrices:
           - matrix: {cuda: "12.*"}
             packages:
-              - cudf-cu12==24.8.*
+              - cudf-cu12==24.8.*,>=0.0.0a0
           - matrix: {cuda: "11.*"}
             packages:
-              - cudf-cu11==24.8.*
+              - cudf-cu11==24.8.*,>=0.0.0a0
           - {matrix: null, packages: [*cudf_conda]}
   depends_on_cudf_kafka:
     common:
       - output_types: conda
         packages:
-          - &cudf_kafka_conda cudf_kafka==24.8.*
+          - &cudf_kafka_conda cudf_kafka==24.8.*,>=0.0.0a0
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
@@ -710,10 +733,10 @@ dependencies:
         matrices:
           - matrix: {cuda: "12.*"}
             packages:
-              - cudf_kafka-cu12==24.8.*
+              - cudf_kafka-cu12==24.8.*,>=0.0.0a0
           - matrix: {cuda: "11.*"}
             packages:
-              - cudf_kafka-cu11==24.8.*
+              - cudf_kafka-cu11==24.8.*,>=0.0.0a0
           - {matrix: null, packages: [*cudf_kafka_conda]}
   depends_on_cupy:
     common:
diff --git a/python/cudf/cudf/_version.py b/python/cudf/cudf/_version.py
index ecf6ddd8e3b..7dd732b4905 100644
--- a/python/cudf/cudf/_version.py
+++ b/python/cudf/cudf/_version.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -15,6 +15,19 @@
 import importlib.resources
 
 __version__ = (
-    importlib.resources.files("cudf").joinpath("VERSION").read_text().strip()
+    importlib.resources.files(__package__)
+    .joinpath("VERSION")
+    .read_text()
+    .strip()
 )
-__git_commit__ = ""
+try:
+    __git_commit__ = (
+        importlib.resources.files(__package__)
+        .joinpath("GIT_COMMIT")
+        .read_text()
+        .strip()
+    )
+except FileNotFoundError:
+    __git_commit__ = ""
+
+__all__ = ["__git_commit__", "__version__"]
diff --git a/python/cudf/cudf/tests/test_version.py b/python/cudf/cudf/tests/test_version.py
new file mode 100644
index 00000000000..8c10cc20a9a
--- /dev/null
+++ b/python/cudf/cudf/tests/test_version.py
@@ -0,0 +1,12 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import cudf
+
+
+def test_version_constants_are_populated():
+    # __git_commit__ will only be non-empty in a built distribution
+    assert isinstance(cudf.__git_commit__, str)
+
+    # __version__ should always be non-empty
+    assert isinstance(cudf.__version__, str)
+    assert len(cudf.__version__) > 0
diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml
index e6517825083..9ad02fed044 100644
--- a/python/cudf/pyproject.toml
+++ b/python/cudf/pyproject.toml
@@ -1,14 +1,9 @@
 # Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
 [build-system]
-build-backend = "scikit_build_core.build"
+build-backend = "rapids_build_backend.build"
 requires = [
-    "cmake>=3.26.4",
-    "cython>=3.0.3",
-    "ninja",
-    "numpy==1.23.*",
-    "pyarrow==16.1.0.*",
-    "rmm==24.8.*",
+    "rapids-build-backend>=0.3.0,<0.4.0.dev0",
     "scikit-build-core[pyproject]>=0.7.0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 
@@ -36,7 +31,7 @@ dependencies = [
     "ptxcompiler",
     "pyarrow>=16.1.0,<16.2.0a0",
     "rich",
-    "rmm==24.8.*",
+    "rmm==24.8.*,>=0.0.0a0",
     "typing_extensions>=4.0.0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
@@ -122,6 +117,19 @@ skip = [
     "__init__.py",
 ]
 
+[tool.rapids-build-backend]
+build-backend = "scikit_build_core.build"
+commit-file = "cudf/GIT_COMMIT"
+dependencies-file = "../../dependencies.yaml"
+requires = [
+    "cmake>=3.26.4",
+    "cython>=3.0.3",
+    "ninja",
+    "numpy==1.23.*",
+    "pyarrow==16.1.0.*",
+    "rmm==24.8.*,>=0.0.0a0",
+] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
+
 [tool.scikit-build]
 build-dir = "build/{wheel_tag}"
 cmake.build-type = "Release"
diff --git a/python/cudf_kafka/cudf_kafka/_version.py b/python/cudf_kafka/cudf_kafka/_version.py
index 5adab566da0..7dd732b4905 100644
--- a/python/cudf_kafka/cudf_kafka/_version.py
+++ b/python/cudf_kafka/cudf_kafka/_version.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -15,9 +15,19 @@
 import importlib.resources
 
 __version__ = (
-    importlib.resources.files("cudf_kafka")
+    importlib.resources.files(__package__)
     .joinpath("VERSION")
     .read_text()
     .strip()
 )
-__git_commit__ = ""
+try:
+    __git_commit__ = (
+        importlib.resources.files(__package__)
+        .joinpath("GIT_COMMIT")
+        .read_text()
+        .strip()
+    )
+except FileNotFoundError:
+    __git_commit__ = ""
+
+__all__ = ["__git_commit__", "__version__"]
diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml
index 9233d0e92dd..1bc04742a73 100644
--- a/python/cudf_kafka/pyproject.toml
+++ b/python/cudf_kafka/pyproject.toml
@@ -1,13 +1,9 @@
 # Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
 [build-system]
-build-backend = "scikit_build_core.build"
+build-backend = "rapids_build_backend.build"
 requires = [
-    "cmake>=3.26.4",
-    "cython>=3.0.3",
-    "ninja",
-    "numpy==1.23.*",
-    "pyarrow==16.1.0.*",
+    "rapids-build-backend>=0.3.0,<0.4.0.dev0",
     "scikit-build-core[pyproject]>=0.7.0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 
@@ -22,7 +18,7 @@ authors = [
 license = { text = "Apache 2.0" }
 requires-python = ">=3.9"
 dependencies = [
-    "cudf==24.8.*",
+    "cudf==24.8.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 
 [project.optional-dependencies]
@@ -100,3 +96,15 @@ wheel.packages = ["cudf_kafka"]
 provider = "scikit_build_core.metadata.regex"
 input = "cudf_kafka/VERSION"
 regex = "(?P<value>.*)"
+
+[tool.rapids-build-backend]
+build-backend = "scikit_build_core.build"
+commit-file = "cudf_kafka/GIT_COMMIT"
+dependencies-file = "../../dependencies.yaml"
+requires = [
+    "cmake>=3.26.4",
+    "cython>=3.0.3",
+    "ninja",
+    "numpy==1.23.*",
+    "pyarrow==16.1.0.*",
+] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
diff --git a/python/cudf_polars/cudf_polars/_version.py b/python/cudf_polars/cudf_polars/_version.py
new file mode 100644
index 00000000000..d906f11cb00
--- /dev/null
+++ b/python/cudf_polars/cudf_polars/_version.py
@@ -0,0 +1,21 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+import importlib.resources
+
+__version__ = (
+    importlib.resources.files(__package__).joinpath("VERSION").read_text().strip()
+)
+try:
+    __git_commit__ = (
+        importlib.resources.files(__package__)
+        .joinpath("GIT_COMMIT")
+        .read_text()
+        .strip()
+    )
+except FileNotFoundError:
+    __git_commit__ = ""
+
+__all__ = ["__git_commit__", "__version__"]
diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml
index 00fde6c0e05..86b0ad414fd 100644
--- a/python/cudf_polars/pyproject.toml
+++ b/python/cudf_polars/pyproject.toml
@@ -1,8 +1,9 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 [build-system]
-build-backend = "setuptools.build_meta"
+build-backend = "rapids_build_backend.build"
 requires = [
+    "rapids-build-backend>=0.3.0,<0.4.0.dev0",
     "setuptools",
     "wheel",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
@@ -18,7 +19,7 @@ authors = [
 license = { text = "Apache 2.0" }
 requires-python = ">=3.9"
 dependencies = [
-    "cudf==24.8.*",
+    "cudf==24.8.*,>=0.0.0a0",
     "polars>=0.20.24",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
@@ -169,3 +170,8 @@ rapids = ["rmm", "cudf"]
 
 [tool.ruff.format]
 docstring-code-format = true
+
+[tool.rapids-build-backend]
+build-backend = "setuptools.build_meta"
+commit-file = "cudf_polars/GIT_COMMIT"
+dependencies-file = "../../dependencies.yaml"
diff --git a/python/custreamz/custreamz/_version.py b/python/custreamz/custreamz/_version.py
index 0f545f95f2b..7dd732b4905 100644
--- a/python/custreamz/custreamz/_version.py
+++ b/python/custreamz/custreamz/_version.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -15,9 +15,19 @@
 import importlib.resources
 
 __version__ = (
-    importlib.resources.files("custreamz")
+    importlib.resources.files(__package__)
     .joinpath("VERSION")
     .read_text()
     .strip()
 )
-__git_commit__ = ""
+try:
+    __git_commit__ = (
+        importlib.resources.files(__package__)
+        .joinpath("GIT_COMMIT")
+        .read_text()
+        .strip()
+    )
+except FileNotFoundError:
+    __git_commit__ = ""
+
+__all__ = ["__git_commit__", "__version__"]
diff --git a/python/custreamz/custreamz/tests/test_version.py b/python/custreamz/custreamz/tests/test_version.py
new file mode 100644
index 00000000000..cda2dd92155
--- /dev/null
+++ b/python/custreamz/custreamz/tests/test_version.py
@@ -0,0 +1,12 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import custreamz
+
+
+def test_version_constants_are_populated():
+    # __git_commit__ will only be non-empty in a built distribution
+    assert isinstance(custreamz.__git_commit__, str)
+
+    # __version__ should always be non-empty
+    assert isinstance(custreamz.__version__, str)
+    assert len(custreamz.__version__) > 0
diff --git a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml
index f7e5698900a..e004a8f5219 100644
--- a/python/custreamz/pyproject.toml
+++ b/python/custreamz/pyproject.toml
@@ -1,8 +1,9 @@
 # Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
 [build-system]
-build-backend = "setuptools.build_meta"
+build-backend = "rapids_build_backend.build"
 requires = [
+    "rapids-build-backend>=0.3.0,<0.4.0.dev0",
     "setuptools",
     "wheel",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
@@ -19,8 +20,8 @@ license = { text = "Apache 2.0" }
 requires-python = ">=3.9"
 dependencies = [
     "confluent-kafka>=1.9.0,<1.10.0a0",
-    "cudf==24.8.*",
-    "cudf_kafka==24.8.*",
+    "cudf==24.8.*,>=0.0.0a0",
+    "cudf_kafka==24.8.*,>=0.0.0a0",
     "streamz",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
@@ -45,6 +46,11 @@ test = [
 [project.urls]
 Homepage = "https://github.com/rapidsai/cudf"
 
+[tool.rapids-build-backend]
+build-backend = "setuptools.build_meta"
+commit-file = "custreamz/COMMIT_FILE"
+dependencies-file = "../../dependencies.yaml"
+
 [tool.setuptools]
 license-files = ["LICENSE"]
 zip-safe = false
diff --git a/python/dask_cudf/dask_cudf/_version.py b/python/dask_cudf/dask_cudf/_version.py
index 0dd62854a4e..7dd732b4905 100644
--- a/python/dask_cudf/dask_cudf/_version.py
+++ b/python/dask_cudf/dask_cudf/_version.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -15,9 +15,19 @@
 import importlib.resources
 
 __version__ = (
-    importlib.resources.files("dask_cudf")
+    importlib.resources.files(__package__)
     .joinpath("VERSION")
     .read_text()
     .strip()
 )
-__git_commit__ = ""
+try:
+    __git_commit__ = (
+        importlib.resources.files(__package__)
+        .joinpath("GIT_COMMIT")
+        .read_text()
+        .strip()
+    )
+except FileNotFoundError:
+    __git_commit__ = ""
+
+__all__ = ["__git_commit__", "__version__"]
diff --git a/python/dask_cudf/dask_cudf/tests/test_version.py b/python/dask_cudf/dask_cudf/tests/test_version.py
new file mode 100644
index 00000000000..e2724e530ba
--- /dev/null
+++ b/python/dask_cudf/dask_cudf/tests/test_version.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+
+import dask_cudf
+
+
+def test_version_constants_are_populated():
+    # __git_commit__ will only be non-empty in a built distribution
+    assert isinstance(dask_cudf.__git_commit__, str)
+
+    # __version__ should always be non-empty
+    assert isinstance(dask_cudf.__version__, str)
+    assert len(dask_cudf.__version__) > 0
diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml
index e353eac06b9..6b5d5ccc412 100644
--- a/python/dask_cudf/pyproject.toml
+++ b/python/dask_cudf/pyproject.toml
@@ -1,8 +1,9 @@
 # Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
 [build-system]
-build-backend = "setuptools.build_meta"
+build-backend = "rapids_build_backend.build"
 requires = [
+    "rapids-build-backend>=0.3.0,<0.4.0.dev0",
     "setuptools",
     "wheel",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
@@ -18,12 +19,12 @@ authors = [
 license = { text = "Apache 2.0" }
 requires-python = ">=3.9"
 dependencies = [
-    "cudf==24.8.*",
+    "cudf==24.8.*,>=0.0.0a0",
     "cupy-cuda11x>=12.0.0",
     "fsspec>=0.6.0",
     "numpy>=1.23,<2.0a0",
     "pandas>=2.0,<2.2.3dev0",
-    "rapids-dask-dependency==24.8.*",
+    "rapids-dask-dependency==24.8.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
     "Intended Audience :: Developers",
@@ -44,7 +45,7 @@ cudf = "dask_cudf.backends:CudfDXBackendEntrypoint"
 
 [project.optional-dependencies]
 test = [
-    "dask-cuda==24.8.*",
+    "dask-cuda==24.8.*,>=0.0.0a0",
     "numba>=0.57",
     "pytest-cov",
     "pytest-xdist",
@@ -54,6 +55,11 @@ test = [
 [project.urls]
 Homepage = "https://github.com/rapidsai/cudf"
 
+[tool.rapids-build-backend]
+build-backend = "setuptools.build_meta"
+commit-file = "dask_cudf/GIT_COMMIT"
+dependencies-file = "../../dependencies.yaml"
+
 [tool.setuptools]
 license-files = ["LICENSE"]
 

From 5ce95f05eeae469f4d46516b3cf6fe19902623f6 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 30 May 2024 09:24:58 -0400
Subject: [PATCH 275/842] Update interleave lists column for large strings
 (#15877)

Fixes the `compute_string_sizes_and_interleave_lists_fn` functor to use `column_device_view::element<string_view>()` method to access string row contents instead of using the strings offsets. This removes the need to add specific offsetalator logic to the logic.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/15877
---
 cpp/src/lists/interleave_columns.cu | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/cpp/src/lists/interleave_columns.cu b/cpp/src/lists/interleave_columns.cu
index be8fad62412..45ae3671d4e 100644
--- a/cpp/src/lists/interleave_columns.cu
+++ b/cpp/src/lists/interleave_columns.cu
@@ -166,8 +166,6 @@ struct compute_string_sizes_and_interleave_lists_fn {
       lists_col.child(lists_column_view::offsets_column_index).template data<size_type>() +
       lists_col.offset();
     auto const& str_col = lists_col.child(lists_column_view::child_column_index);
-    auto const str_offsets =
-      str_col.child(strings_column_view::offsets_column_index).template data<size_type>();
 
     // The range of indices of the strings within the source list.
     auto const start_str_idx = list_offsets[list_id];
@@ -181,13 +179,13 @@ struct compute_string_sizes_and_interleave_lists_fn {
     size_type write_idx = dst_list_offsets[idx];
 
     for (auto read_idx = start_str_idx; read_idx < end_str_idx; ++read_idx, ++write_idx) {
-      auto const offset        = str_offsets[read_idx];
-      auto const size          = str_offsets[read_idx + 1] - offset;
-      string_index_pair result = {nullptr, size};
-      if (str_col.is_valid(read_idx)) {
-        result.first = size > 0 ? str_col.template head<char>() + offset : "";
+      if (str_col.is_null(read_idx)) {
+        indices[write_idx] = string_index_pair{nullptr, 0};
+        continue;
       }
-      indices[write_idx] = result;
+      auto const d_str   = str_col.element<string_view>(read_idx);
+      indices[write_idx] = d_str.empty() ? string_index_pair{"", 0}
+                                         : string_index_pair{d_str.data(), d_str.size_bytes()};
     }
   }
 };

From 3e9cff2e3ee4f744bcbf80c6f7ad3e5ebcdf94f7 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 30 May 2024 09:33:06 -0400
Subject: [PATCH 276/842] Change thrust::count_if call to raw kernel in strings
 split APIs (#15762)

Fixes calls to `thrust::count_if` in strings split APIs to better handle large strings.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Karthikeyan (https://github.com/karthikeyann)
  - Paul Mattione (https://github.com/pmattione-nvidia)

URL: https://github.com/rapidsai/cudf/pull/15762
---
 cpp/src/strings/split/split.cu                |  1 +
 cpp/src/strings/split/split.cuh               | 59 +++++++++++++++----
 cpp/tests/CMakeLists.txt                      |  1 +
 .../large_strings/split_strings_tests.cpp     | 53 +++++++++++++++++
 4 files changed, 103 insertions(+), 11 deletions(-)
 create mode 100644 cpp/tests/large_strings/split_strings_tests.cpp

diff --git a/cpp/src/strings/split/split.cu b/cpp/src/strings/split/split.cu
index 2c6a0b2cf22..bc01a46ca6d 100644
--- a/cpp/src/strings/split/split.cu
+++ b/cpp/src/strings/split/split.cu
@@ -34,6 +34,7 @@
 #include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
+#include <thrust/binary_search.h>
 #include <thrust/fill.h>
 #include <thrust/for_each.h>
 #include <thrust/functional.h>
diff --git a/cpp/src/strings/split/split.cuh b/cpp/src/strings/split/split.cuh
index 69a11aabfcd..ae3c0b3aa12 100644
--- a/cpp/src/strings/split/split.cuh
+++ b/cpp/src/strings/split/split.cuh
@@ -30,12 +30,9 @@
 #include <rmm/resource_ref.hpp>
 
 #include <cuda/atomic>
-#include <thrust/binary_search.h>
 #include <thrust/copy.h>
-#include <thrust/count.h>
 #include <thrust/for_each.h>
 #include <thrust/iterator/counting_iterator.h>
-#include <thrust/scan.h>
 #include <thrust/transform.h>
 
 namespace cudf::strings::detail {
@@ -297,6 +294,44 @@ std::unique_ptr<column> create_offsets_from_positions(strings_column_view const&
                                                       rmm::cuda_stream_view stream,
                                                       rmm::device_async_resource_ref mr);
 
+/**
+ * @brief Count the number of delimiters in a strings column
+ *
+ * @tparam Tokenizer Functor containing `is_delimiter` function
+ * @tparam block_size Number of threads per block
+ * @tparam bytes_per_thread Number of bytes processed per thread
+ *
+ * @param tokenizer For checking delimiters
+ * @param d_offsets Offsets for the strings column
+ * @param chars_bytes Number of bytes in the strings column
+ * @param d_output Result of the count
+ */
+template <typename Tokenizer, int64_t block_size, size_type bytes_per_thread>
+CUDF_KERNEL void count_delimiters_kernel(Tokenizer tokenizer,
+                                         cudf::detail::input_offsetalator d_offsets,
+                                         int64_t chars_bytes,
+                                         int64_t* d_output)
+{
+  auto const idx      = cudf::detail::grid_1d::global_thread_id();
+  auto const byte_idx = static_cast<int64_t>(idx) * bytes_per_thread;
+  auto const lane_idx = static_cast<cudf::size_type>(threadIdx.x);
+
+  using block_reduce = cub::BlockReduce<int64_t, block_size>;
+  __shared__ typename block_reduce::TempStorage temp_storage;
+
+  int64_t count = 0;
+  // each thread processes multiple bytes
+  for (auto i = byte_idx; (i < (byte_idx + bytes_per_thread)) && (i < chars_bytes); ++i) {
+    count += tokenizer.is_delimiter(i, d_offsets, chars_bytes);
+  }
+  auto const total = block_reduce(temp_storage).Reduce(count, cub::Sum());
+
+  if ((lane_idx == 0) && (total > 0)) {
+    cuda::atomic_ref<int64_t, cuda::thread_scope_block> ref{*d_output};
+    ref.fetch_add(total, cuda::std::memory_order_relaxed);
+  }
+}
+
 /**
  * @brief Helper function used by split/rsplit and split_record/rsplit_record
  *
@@ -326,17 +361,19 @@ std::pair<std::unique_ptr<column>, rmm::device_uvector<string_index_pair>> split
     cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset());
 
   // count the number of delimiters in the entire column
-  auto const delimiter_count =
-    thrust::count_if(rmm::exec_policy(stream),
-                     thrust::counting_iterator<int64_t>(0),
-                     thrust::counting_iterator<int64_t>(chars_bytes),
-                     [tokenizer, d_offsets, chars_bytes] __device__(int64_t idx) {
-                       return tokenizer.is_delimiter(idx, d_offsets, chars_bytes);
-                     });
+  rmm::device_scalar<int64_t> d_count(0, stream);
+  constexpr int64_t block_size         = 512;
+  constexpr size_type bytes_per_thread = 4;
+  auto const num_blocks                = util::div_rounding_up_safe(
+    util::div_rounding_up_safe(chars_bytes, static_cast<int64_t>(bytes_per_thread)), block_size);
+  count_delimiters_kernel<Tokenizer, block_size, bytes_per_thread>
+    <<<num_blocks, block_size, 0, stream.value()>>>(
+      tokenizer, d_offsets, chars_bytes, d_count.data());
+
   // Create a vector of every delimiter position in the chars column.
   // These may include overlapping or otherwise out-of-bounds delimiters which
   // will be resolved during token processing.
-  auto delimiter_positions = rmm::device_uvector<int64_t>(delimiter_count, stream);
+  auto delimiter_positions = rmm::device_uvector<int64_t>(d_count.value(stream), stream);
   auto d_positions         = delimiter_positions.data();
   cudf::detail::copy_if_safe(
     thrust::counting_iterator<int64_t>(0),
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index c6ab8aa021a..2f2c12f265c 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -575,6 +575,7 @@ ConfigureTest(
   large_strings/merge_tests.cpp
   large_strings/parquet_tests.cpp
   large_strings/reshape_tests.cpp
+  large_strings/split_strings_tests.cpp
   GPUS 1
   PERCENT 100
 )
diff --git a/cpp/tests/large_strings/split_strings_tests.cpp b/cpp/tests/large_strings/split_strings_tests.cpp
new file mode 100644
index 00000000000..320fb222241
--- /dev/null
+++ b/cpp/tests/large_strings/split_strings_tests.cpp
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "large_strings_fixture.hpp"
+
+#include <cudf_test/column_utilities.hpp>
+
+#include <cudf/lists/lists_column_view.hpp>
+#include <cudf/scalar/scalar.hpp>
+#include <cudf/strings/combine.hpp>
+#include <cudf/strings/split/split.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf/table/table_view.hpp>
+
+#include <vector>
+
+struct StringsSplitTest : public cudf::test::StringsLargeTest {};
+
+TEST_F(StringsSplitTest, Split)
+{
+  auto const expected   = this->long_column();
+  auto const view       = cudf::column_view(expected);
+  auto const multiplier = 10;
+  auto const separator  = cudf::string_scalar("|");
+  auto const input      = cudf::strings::concatenate(
+    cudf::table_view(std::vector<cudf::column_view>(multiplier, view)), separator);
+
+  {
+    auto result = cudf::strings::split(cudf::strings_column_view(input->view()), separator);
+    for (auto c : result->view()) {
+      CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, expected);
+    }
+  }
+
+  auto lc = cudf::strings::split_record(cudf::strings_column_view(input->view()), separator);
+  auto lv = cudf::lists_column_view(lc->view());
+  auto sv = cudf::strings_column_view(lv.child());
+  EXPECT_EQ(sv.size(), view.size() * multiplier);
+  EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64});
+}

From e95894fc305a2833374933ecbce07be997d4c545 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Thu, 30 May 2024 15:31:20 +0100
Subject: [PATCH 277/842] Executor for polars logical plans (#15504)

This builds out the infrastructure for executing polars logical plans using pylibcudf. See `docs/overview.md` in the `cudf_polars` subdirectory for some installation guidance.

Deliberately not fully fleshing out packaging and so forth yet.

Test coverage is incomplete but growing. I'd like to get this in so other people can build on top of it.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15504
---
 dependencies.yaml                             |    2 +-
 python/cudf/cudf/_lib/pylibcudf/types.pyx     |    3 +
 python/cudf_polars/cudf_polars/callback.py    |   56 +
 .../cudf_polars/containers/__init__.py        |   12 +
 .../cudf_polars/containers/column.py          |  119 ++
 .../cudf_polars/containers/dataframe.py       |  223 ++++
 .../cudf_polars/containers/scalar.py          |   23 +
 .../cudf_polars/cudf_polars/dsl/__init__.py   |    8 +
 python/cudf_polars/cudf_polars/dsl/expr.py    | 1038 +++++++++++++++++
 python/cudf_polars/cudf_polars/dsl/ir.py      |  879 ++++++++++++++
 .../cudf_polars/cudf_polars/dsl/translate.py  |  403 +++++++
 .../cudf_polars/testing/__init__.py           |    8 +
 .../cudf_polars/testing/asserts.py            |   76 ++
 .../cudf_polars/cudf_polars/utils/__init__.py |    8 +
 .../cudf_polars/cudf_polars/utils/dtypes.py   |   89 ++
 .../cudf_polars/cudf_polars/utils/sorting.py  |   49 +
 python/cudf_polars/docs/overview.md           |  174 +++
 python/cudf_polars/pyproject.toml             |   10 +-
 .../cudf_polars/tests/expressions/test_agg.py |   63 +
 .../tests/expressions/test_filter.py          |   20 +
 .../tests/expressions/test_gather.py          |   19 +
 .../tests/expressions/test_numeric_binops.py  |  106 ++
 python/cudf_polars/tests/test_distinct.py     |   30 +
 python/cudf_polars/tests/test_extcontext.py   |   23 +
 python/cudf_polars/tests/test_groupby.py      |   78 ++
 python/cudf_polars/tests/test_hconcat.py      |   19 +
 python/cudf_polars/tests/test_hstack.py       |   32 +
 python/cudf_polars/tests/test_join.py         |   57 +
 python/cudf_polars/tests/test_scan.py         |   98 ++
 python/cudf_polars/tests/test_select.py       |   38 +
 python/cudf_polars/tests/test_slice.py        |   34 +
 python/cudf_polars/tests/test_sort.py         |   42 +
 python/cudf_polars/tests/test_union.py        |   37 +
 33 files changed, 3874 insertions(+), 2 deletions(-)
 create mode 100644 python/cudf_polars/cudf_polars/callback.py
 create mode 100644 python/cudf_polars/cudf_polars/containers/__init__.py
 create mode 100644 python/cudf_polars/cudf_polars/containers/column.py
 create mode 100644 python/cudf_polars/cudf_polars/containers/dataframe.py
 create mode 100644 python/cudf_polars/cudf_polars/containers/scalar.py
 create mode 100644 python/cudf_polars/cudf_polars/dsl/__init__.py
 create mode 100644 python/cudf_polars/cudf_polars/dsl/expr.py
 create mode 100644 python/cudf_polars/cudf_polars/dsl/ir.py
 create mode 100644 python/cudf_polars/cudf_polars/dsl/translate.py
 create mode 100644 python/cudf_polars/cudf_polars/testing/__init__.py
 create mode 100644 python/cudf_polars/cudf_polars/testing/asserts.py
 create mode 100644 python/cudf_polars/cudf_polars/utils/__init__.py
 create mode 100644 python/cudf_polars/cudf_polars/utils/dtypes.py
 create mode 100644 python/cudf_polars/cudf_polars/utils/sorting.py
 create mode 100644 python/cudf_polars/docs/overview.md
 create mode 100644 python/cudf_polars/tests/expressions/test_agg.py
 create mode 100644 python/cudf_polars/tests/expressions/test_filter.py
 create mode 100644 python/cudf_polars/tests/expressions/test_gather.py
 create mode 100644 python/cudf_polars/tests/expressions/test_numeric_binops.py
 create mode 100644 python/cudf_polars/tests/test_distinct.py
 create mode 100644 python/cudf_polars/tests/test_extcontext.py
 create mode 100644 python/cudf_polars/tests/test_groupby.py
 create mode 100644 python/cudf_polars/tests/test_hconcat.py
 create mode 100644 python/cudf_polars/tests/test_hstack.py
 create mode 100644 python/cudf_polars/tests/test_join.py
 create mode 100644 python/cudf_polars/tests/test_scan.py
 create mode 100644 python/cudf_polars/tests/test_select.py
 create mode 100644 python/cudf_polars/tests/test_slice.py
 create mode 100644 python/cudf_polars/tests/test_sort.py
 create mode 100644 python/cudf_polars/tests/test_union.py

diff --git a/dependencies.yaml b/dependencies.yaml
index 8bfa3190b3d..38ec30a8033 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -603,7 +603,7 @@ dependencies:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
-          - polars>=0.20.24
+          - polars>=0.20.30
   run_dask_cudf:
     common:
       - output_types: [conda, requirements, pyproject]
diff --git a/python/cudf/cudf/_lib/pylibcudf/types.pyx b/python/cudf/cudf/_lib/pylibcudf/types.pyx
index de10196e289..a5248ad0a1f 100644
--- a/python/cudf/cudf/_lib/pylibcudf/types.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/types.pyx
@@ -51,6 +51,9 @@ cdef class DataType:
             self.c_obj == (<DataType>other).c_obj
         )
 
+    def __hash__(self):
+        return hash((self.c_obj.id(), self.c_obj.scale()))
+
     @staticmethod
     cdef DataType from_libcudf(data_type dt):
         """Create a DataType from a libcudf data_type.
diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py
new file mode 100644
index 00000000000..aabb8498ce2
--- /dev/null
+++ b/python/cudf_polars/cudf_polars/callback.py
@@ -0,0 +1,56 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Callback for the polars collect function to execute on device."""
+
+from __future__ import annotations
+
+from functools import partial
+from typing import TYPE_CHECKING
+
+import nvtx
+
+from cudf_polars.dsl.translate import translate_ir
+
+if TYPE_CHECKING:
+    import polars as pl
+
+    from cudf_polars.dsl.ir import IR
+
+__all__: list[str] = ["execute_with_cudf"]
+
+
+def _callback(
+    ir: IR,
+    with_columns: list[str] | None,
+    pyarrow_predicate: str | None,
+    n_rows: int | None,
+) -> pl.DataFrame:
+    assert with_columns is None
+    assert pyarrow_predicate is None
+    assert n_rows is None
+    with nvtx.annotate(message="ExecuteIR", domain="cudf_polars"):
+        return ir.evaluate(cache={}).to_polars()
+
+
+def execute_with_cudf(nt, *, raise_on_fail: bool = False) -> None:
+    """
+    A post optimization callback that attempts to execute the plan with cudf.
+
+    Parameters
+    ----------
+    nt
+        NodeTraverser
+
+    raise_on_fail
+        Should conversion raise an exception rather than continuing
+        without setting a callback.
+
+    The NodeTraverser is mutated if the libcudf executor can handle the plan.
+    """
+    try:
+        with nvtx.annotate(message="ConvertIR", domain="cudf_polars"):
+            nt.set_udf(partial(_callback, translate_ir(nt)))
+    except NotImplementedError:
+        if raise_on_fail:
+            raise
diff --git a/python/cudf_polars/cudf_polars/containers/__init__.py b/python/cudf_polars/cudf_polars/containers/__init__.py
new file mode 100644
index 00000000000..ef9d9ca61b6
--- /dev/null
+++ b/python/cudf_polars/cudf_polars/containers/__init__.py
@@ -0,0 +1,12 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Containers of concrete data."""
+
+from __future__ import annotations
+
+__all__: list[str] = ["DataFrame", "Column", "Scalar"]
+
+from cudf_polars.containers.column import Column
+from cudf_polars.containers.dataframe import DataFrame
+from cudf_polars.containers.scalar import Scalar
diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py
new file mode 100644
index 00000000000..49034b5f5c8
--- /dev/null
+++ b/python/cudf_polars/cudf_polars/containers/column.py
@@ -0,0 +1,119 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""A column, with some properties."""
+
+from __future__ import annotations
+
+import functools
+from typing import TYPE_CHECKING
+
+import cudf._lib.pylibcudf as plc
+
+if TYPE_CHECKING:
+    from typing_extensions import Self
+
+__all__: list[str] = ["Column"]
+
+
+class Column:
+    """A column, a name, and sortedness."""
+
+    obj: plc.Column
+    name: str
+    is_sorted: plc.types.Sorted
+    order: plc.types.Order
+    null_order: plc.types.NullOrder
+
+    def __init__(self, column: plc.Column, name: str):
+        self.obj = column
+        self.name = name
+        self.is_sorted = plc.types.Sorted.NO
+        self.order = plc.types.Order.ASCENDING
+        self.null_order = plc.types.NullOrder.BEFORE
+
+    def sorted_like(self, like: Column, /) -> Self:
+        """
+        Copy sortedness properties from a column onto self.
+
+        Parameters
+        ----------
+        like
+            The column to copy sortedness metadata from.
+
+        Returns
+        -------
+        Self with metadata set.
+
+        See Also
+        --------
+        set_sorted
+        """
+        return self.set_sorted(
+            is_sorted=like.is_sorted, order=like.order, null_order=like.null_order
+        )
+
+    def set_sorted(
+        self,
+        *,
+        is_sorted: plc.types.Sorted,
+        order: plc.types.Order,
+        null_order: plc.types.NullOrder,
+    ) -> Self:
+        """
+        Modify sortedness metadata in place.
+
+        Parameters
+        ----------
+        is_sorted
+            Is the column sorted
+        order
+            The order if sorted
+        null_order
+            Where nulls sort, if sorted
+
+        Returns
+        -------
+        Self with metadata set.
+        """
+        self.is_sorted = is_sorted
+        self.order = order
+        self.null_order = null_order
+        return self
+
+    def copy(self, *, new_name: str | None = None) -> Self:
+        """
+        Return a shallow copy of the column.
+
+        Parameters
+        ----------
+        new_name
+            Optional new name for the copied column.
+
+        Returns
+        -------
+        New column sharing data with self.
+        """
+        return type(self)(
+            self.obj, self.name if new_name is None else new_name
+        ).sorted_like(self)
+
+    def mask_nans(self) -> Self:
+        """Return a copy of self with nans masked out."""
+        if self.nan_count > 0:
+            raise NotImplementedError
+        return self.copy()
+
+    @functools.cached_property
+    def nan_count(self) -> int:
+        """Return the number of NaN values in the column."""
+        if self.obj.type().id() not in (plc.TypeId.FLOAT32, plc.TypeId.FLOAT64):
+            return 0
+        return plc.interop.to_arrow(
+            plc.reduce.reduce(
+                plc.unary.is_nan(self.obj),
+                plc.aggregation.sum(),
+                # TODO: pylibcudf needs to have a SizeType DataType singleton
+                plc.DataType(plc.TypeId.INT32),
+            )
+        ).as_py()
diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py
new file mode 100644
index 00000000000..de21a280020
--- /dev/null
+++ b/python/cudf_polars/cudf_polars/containers/dataframe.py
@@ -0,0 +1,223 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""A dataframe, with some properties."""
+
+from __future__ import annotations
+
+from functools import cached_property
+from typing import TYPE_CHECKING
+
+import polars as pl
+
+import cudf._lib.pylibcudf as plc
+
+from cudf_polars.containers.column import Column
+
+if TYPE_CHECKING:
+    from collections.abc import Mapping, Sequence, Set
+
+    from typing_extensions import Self
+
+    import cudf
+
+    from cudf_polars.containers.scalar import Scalar
+
+
+__all__: list[str] = ["DataFrame"]
+
+
+class DataFrame:
+    """A representation of a dataframe."""
+
+    columns: list[Column]
+    scalars: list[Scalar]
+    table: plc.Table | None
+
+    def __init__(self, columns: Sequence[Column], scalars: Sequence[Scalar]) -> None:
+        self.columns = list(columns)
+        self._column_map = {c.name: c for c in self.columns}
+        self.scalars = list(scalars)
+        if len(scalars) == 0:
+            self.table = plc.Table([c.obj for c in columns])
+        else:
+            self.table = None
+
+    def copy(self) -> Self:
+        """Return a shallow copy of self."""
+        return type(self)(self.columns, self.scalars)
+
+    def to_polars(self) -> pl.DataFrame:
+        """Convert to a polars DataFrame."""
+        assert len(self.scalars) == 0
+        return pl.from_arrow(
+            plc.interop.to_arrow(
+                self.table,
+                [plc.interop.ColumnMetadata(name=c.name) for c in self.columns],
+            )
+        )
+
+    @cached_property
+    def column_names_set(self) -> frozenset[str]:
+        """Return the column names as a set."""
+        return frozenset(c.name for c in self.columns)
+
+    @cached_property
+    def column_names(self) -> list[str]:
+        """Return a list of the column names."""
+        return [c.name for c in self.columns]
+
+    @cached_property
+    def num_columns(self) -> int:
+        """Number of columns."""
+        return len(self.columns)
+
+    @cached_property
+    def num_rows(self) -> int:
+        """Number of rows."""
+        if self.table is None:
+            raise ValueError("Number of rows of frame with scalars makes no sense")
+        return self.table.num_rows()
+
+    @classmethod
+    def from_cudf(cls, df: cudf.DataFrame) -> Self:
+        """Create from a cudf dataframe."""
+        return cls(
+            [Column(c.to_pylibcudf(mode="read"), name) for name, c in df._data.items()],
+            [],
+        )
+
+    @classmethod
+    def from_table(cls, table: plc.Table, names: Sequence[str]) -> Self:
+        """
+        Create from a pylibcudf table.
+
+        Parameters
+        ----------
+        table
+            Pylibcudf table to obtain columns from
+        names
+            Names for the columns
+
+        Returns
+        -------
+        New dataframe sharing  data with the input table.
+
+        Raises
+        ------
+        ValueError if the number of provided names does not match the
+        number of columns in the table.
+        """
+        # TODO: strict=True when we drop py39
+        if table.num_columns() != len(names):
+            raise ValueError("Mismatching name and table length.")
+        return cls([Column(c, name) for c, name in zip(table.columns(), names)], [])
+
+    def sorted_like(
+        self, like: DataFrame, /, *, subset: Set[str] | None = None
+    ) -> Self:
+        """
+        Copy sortedness from a dataframe onto self.
+
+        Parameters
+        ----------
+        like
+            The dataframe to copy from
+        subset
+            Optional subset of columns from which to copy data.
+
+        Returns
+        -------
+        Self with metadata set.
+
+        Raises
+        ------
+        ValueError if there is a name mismatch between self and like.
+        """
+        if like.column_names != self.column_names:
+            raise ValueError("Can only copy from identically named frame")
+        subset = self.column_names_set if subset is None else subset
+        self.columns = [
+            c.sorted_like(other) if c.name in subset else c
+            for c, other in zip(self.columns, like.columns)
+        ]
+        return self
+
+    def with_columns(self, columns: Sequence[Column]) -> Self:
+        """
+        Return a new dataframe with extra columns.
+
+        Parameters
+        ----------
+        columns
+            Columns to add
+
+        Returns
+        -------
+        New dataframe
+
+        Notes
+        -----
+        If column names overlap, newer names replace older ones.
+        """
+        return type(self)([*self.columns, *columns], self.scalars)
+
+    def discard_columns(self, names: Set[str]) -> Self:
+        """Drop columns by name."""
+        return type(self)(
+            [c for c in self.columns if c.name not in names], self.scalars
+        )
+
+    def select(self, names: Sequence[str]) -> Self:
+        """Select columns by name returning DataFrame."""
+        want = set(names)
+        if not want.issubset(self.column_names_set):
+            raise ValueError("Can't select missing names")
+        return type(self)([self._column_map[name] for name in names], self.scalars)
+
+    def replace_columns(self, *columns: Column) -> Self:
+        """Return a new dataframe with columns replaced by name."""
+        new = {c.name: c for c in columns}
+        if not set(new).issubset(self.column_names_set):
+            raise ValueError("Cannot replace with non-existing names")
+        return type(self)([new.get(c.name, c) for c in self.columns], self.scalars)
+
+    def rename_columns(self, mapping: Mapping[str, str]) -> Self:
+        """Rename some columns."""
+        return type(self)(
+            [c.copy(new_name=mapping.get(c.name)) for c in self.columns], self.scalars
+        )
+
+    def select_columns(self, names: Set[str]) -> list[Column]:
+        """Select columns by name."""
+        return [c for c in self.columns if c.name in names]
+
+    def filter(self, mask: Column) -> Self:
+        """Return a filtered table given a mask."""
+        table = plc.stream_compaction.apply_boolean_mask(self.table, mask.obj)
+        return type(self).from_table(table, self.column_names).sorted_like(self)
+
+    def slice(self, zlice: tuple[int, int] | None) -> Self:
+        """
+        Slice a dataframe.
+
+        Parameters
+        ----------
+        zlice
+            optional, tuple of start and length, negative values of start
+            treated as for python indexing. If not provided, returns self.
+
+        Returns
+        -------
+        New dataframe (if zlice is not None) other self (if it is)
+        """
+        if zlice is None:
+            return self
+        start, length = zlice
+        if start < 0:
+            start += self.num_rows
+        # Polars slice takes an arbitrary positive integer and slice
+        # to the end of the frame if it is larger.
+        end = min(start + length, self.num_rows)
+        (table,) = plc.copying.slice(self.table, [start, end])
+        return type(self).from_table(table, self.column_names).sorted_like(self)
diff --git a/python/cudf_polars/cudf_polars/containers/scalar.py b/python/cudf_polars/cudf_polars/containers/scalar.py
new file mode 100644
index 00000000000..fc97d0fd9c2
--- /dev/null
+++ b/python/cudf_polars/cudf_polars/containers/scalar.py
@@ -0,0 +1,23 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""A scalar, with some properties."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    import cudf._lib.pylibcudf as plc
+
+__all__: list[str] = ["Scalar"]
+
+
+class Scalar:
+    """A scalar, and a name."""
+
+    __slots__ = ("obj", "name")
+    obj: plc.Scalar
+
+    def __init__(self, scalar: plc.Scalar):
+        self.obj = scalar
diff --git a/python/cudf_polars/cudf_polars/dsl/__init__.py b/python/cudf_polars/cudf_polars/dsl/__init__.py
new file mode 100644
index 00000000000..804c5ada566
--- /dev/null
+++ b/python/cudf_polars/cudf_polars/dsl/__init__.py
@@ -0,0 +1,8 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""The domain-specific language (DSL) for the polars executor."""
+
+from __future__ import annotations
+
+__all__: list[str] = []
diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py
new file mode 100644
index 00000000000..249cc3775f7
--- /dev/null
+++ b/python/cudf_polars/cudf_polars/dsl/expr.py
@@ -0,0 +1,1038 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+# TODO: remove need for this
+# ruff: noqa: D101
+"""
+DSL nodes for the polars expression language.
+
+An expression node is a function, `DataFrame -> Column` or `DataFrame -> Scalar`.
+
+The evaluation context is provided by a LogicalPlan node, and can
+affect the evaluation rule as well as providing the dataframe input.
+In particular, the interpretation of the expression language in a
+`GroupBy` node is groupwise, rather than whole frame.
+"""
+
+from __future__ import annotations
+
+import enum
+from enum import IntEnum
+from functools import partial, reduce
+from typing import TYPE_CHECKING, Any, ClassVar, NamedTuple
+
+import pyarrow as pa
+
+from polars.polars import _expr_nodes as pl_expr
+
+import cudf._lib.pylibcudf as plc
+
+from cudf_polars.containers import Column, Scalar
+from cudf_polars.utils import sorting
+
+if TYPE_CHECKING:
+    from collections.abc import Sequence
+
+    import polars.type_aliases as pl_types
+
+    from cudf_polars.containers import DataFrame
+
+__all__ = [
+    "Expr",
+    "NamedExpr",
+    "Literal",
+    "Col",
+    "BooleanFunction",
+    "StringFunction",
+    "Sort",
+    "SortBy",
+    "Gather",
+    "Filter",
+    "RollingWindow",
+    "GroupedRollingWindow",
+    "Cast",
+    "Agg",
+    "BinOp",
+]
+
+
+class ExecutionContext(IntEnum):
+    FRAME = enum.auto()
+    GROUPBY = enum.auto()
+    ROLLING = enum.auto()
+
+
+class AggInfo(NamedTuple):
+    requests: list[tuple[Expr | None, plc.aggregation.Aggregation, Expr]]
+
+
+class Expr:
+    """
+    An abstract expression object.
+
+    This contains a (potentially empty) tuple of child expressions,
+    along with non-child data. For uniform reconstruction and
+    implementation of hashing and equality schemes, child classes need
+    to provide a certain amount of metadata when they are defined.
+    Specifically, the ``_non_child`` attribute must list, in-order,
+    the names of the slots that are passed to the constructor. The
+    constructor must take arguments in the order ``(*_non_child,
+    *children).``
+    """
+
+    __slots__ = ("dtype", "_hash_value", "_repr_value")
+    dtype: plc.DataType
+    """Data type of the expression."""
+    _hash_value: int
+    """Caching slot for the hash of the expression."""
+    _repr_value: str
+    """Caching slot for repr of the expression."""
+    children: tuple[Expr, ...] = ()
+    """Children of the expression."""
+    _non_child: ClassVar[tuple[str, ...]] = ("dtype",)
+    """Names of non-child data (not Exprs) for reconstruction."""
+
+    # Constructor must take arguments in order (*_non_child, *children)
+    def __init__(self, dtype: plc.DataType) -> None:
+        self.dtype = dtype
+
+    def _ctor_arguments(self, children: Sequence[Expr]) -> Sequence:
+        return (*(getattr(self, attr) for attr in self._non_child), *children)
+
+    def get_hash(self) -> int:
+        """
+        Return the hash of this expr.
+
+        Override this in subclasses, rather than __hash__.
+
+        Returns
+        -------
+        The integer hash value.
+        """
+        return hash((type(self), self._ctor_arguments(self.children)))
+
+    def __hash__(self):
+        """Hash of an expression with caching."""
+        try:
+            return self._hash_value
+        except AttributeError:
+            self._hash_value = self.get_hash()
+            return self._hash_value
+
+    def is_equal(self, other: Any) -> bool:
+        """
+        Equality of two expressions.
+
+        Override this in subclasses, rather than __eq__.
+
+        Parameter
+        ---------
+        other
+            object to compare to
+
+        Returns
+        -------
+        True if the two expressions are equal, false otherwise.
+        """
+        if type(self) is not type(other):
+            return False
+        return self._ctor_arguments(self.children) == other._ctor_arguments(
+            other.children
+        )
+
+    def __eq__(self, other):
+        """Equality of expressions."""
+        if type(self) != type(other) or hash(self) != hash(other):
+            return False
+        else:
+            return self.is_equal(other)
+
+    def __ne__(self, other):
+        """Inequality of expressions."""
+        return not self.__eq__(other)
+
+    def __repr__(self):
+        """String representation of an expression with caching."""
+        try:
+            return self._repr_value
+        except AttributeError:
+            args = ", ".join(f"{arg!r}" for arg in self._ctor_arguments(self.children))
+            self._repr_value = f"{type(self).__name__}({args})"
+            return self._repr_value
+
+    def do_evaluate(
+        self,
+        df: DataFrame,
+        *,
+        context: ExecutionContext = ExecutionContext.FRAME,
+        mapping: dict[Expr, Column] | None = None,
+    ) -> Column:  # TODO: return type is a lie for Literal
+        """
+        Evaluate this expression given a dataframe for context.
+
+        Parameters
+        ----------
+        df
+            DataFrame that will provide columns.
+        context
+            What context are we performing this evaluation in?
+        mapping
+            Substitution mapping from expressions to Columns, used to
+            override the evaluation of a given expression if we're
+            performing a simple rewritten evaluation.
+
+        Notes
+        -----
+        Do not call this function directly, but rather
+        :meth:`evaluate` which handles the mapping lookups.
+
+        The typed return value of :class:`Column` is not true when
+        evaluating :class:`Literal` nodes (which instead produce
+        :class:`Scalar` objects). However, these duck-type to having a
+        pylibcudf container object inside them, and usually they end
+        up appearing in binary expressions which pylibcudf handles
+        appropriately since there are overloads for (column, scalar)
+        pairs. We don't have to handle (scalar, scalar) in binops
+        since the polars optimizer has a constant-folding pass.
+
+        Returns
+        -------
+        Column representing the evaluation of the expression (or maybe
+        a scalar).
+
+        Raises
+        ------
+        NotImplementedError if we couldn't evaluate the expression.
+        Ideally all these are returned during translation to the IR,
+        but for now we are not perfect.
+        """
+        raise NotImplementedError(f"Evaluation of {type(self).__name__}")
+
+    def evaluate(
+        self,
+        df: DataFrame,
+        *,
+        context: ExecutionContext = ExecutionContext.FRAME,
+        mapping: dict[Expr, Column] | None = None,
+    ) -> Column:  # TODO: return type is a lie for Literal
+        """
+        Evaluate this expression given a dataframe for context.
+
+        Parameters
+        ----------
+        df
+            DataFrame that will provide columns.
+        context
+            What context are we performing this evaluation in?
+        mapping
+            Substitution mapping from expressions to Columns, used to
+            override the evaluation of a given expression if we're
+            performing a simple rewritten evaluation.
+
+        Notes
+        -----
+        Individual subclasses should implement :meth:`do_allocate`,
+        this method provides logic to handle lookups in the
+        substitution mapping.
+
+        Returns
+        -------
+        Column representing the evaluation of the expression (or maybe
+        a scalar, annoying!).
+
+        Raises
+        ------
+        NotImplementedError if we couldn't evaluate the expression.
+        Ideally all these are returned during translation to the IR,
+        but for now we are not perfect.
+        """
+        if mapping is None:
+            return self.do_evaluate(df, context=context, mapping=mapping)
+        try:
+            return mapping[self]
+        except KeyError:
+            return self.do_evaluate(df, context=context, mapping=mapping)
+
+    def collect_agg(self, *, depth: int) -> AggInfo:
+        """
+        Collect information about aggregations in groupbys.
+
+        Parameters
+        ----------
+        depth
+            The depth of aggregating (reduction or sampling)
+            expressions we are currently at.
+
+        Returns
+        -------
+        Aggregation info describing the expression to aggregate in the
+        groupby.
+
+        Raises
+        ------
+        NotImplementedError if we can't currently perform the
+        aggregation request (for example nested aggregations like
+        ``a.max().min()``).
+        """
+        raise NotImplementedError(
+            f"Collecting aggregation info for {type(self).__name__}"
+        )
+
+
+class NamedExpr(Expr):
+    __slots__ = ("name", "children")
+    _non_child = ("dtype", "name")
+
+    def __init__(self, dtype: plc.DataType, name: str, value: Expr) -> None:
+        super().__init__(dtype)
+        self.name = name
+        self.children = (value,)
+
+    def do_evaluate(
+        self,
+        df: DataFrame,
+        *,
+        context: ExecutionContext = ExecutionContext.FRAME,
+        mapping: dict[Expr, Column] | None = None,
+    ) -> Column:
+        """Evaluate this expression given a dataframe for context."""
+        (child,) = self.children
+        return Column(
+            child.evaluate(df, context=context, mapping=mapping).obj, self.name
+        )
+
+    def collect_agg(self, *, depth: int) -> AggInfo:
+        """Collect information about aggregations in groupbys."""
+        (value,) = self.children
+        return value.collect_agg(depth=depth)
+
+
+class Literal(Expr):
+    __slots__ = ("value",)
+    _non_child = ("dtype", "value")
+    value: pa.Scalar
+
+    def __init__(self, dtype: plc.DataType, value: Any) -> None:
+        super().__init__(dtype)
+        self.value = pa.scalar(value)
+
+    def do_evaluate(
+        self,
+        df: DataFrame,
+        *,
+        context: ExecutionContext = ExecutionContext.FRAME,
+        mapping: dict[Expr, Column] | None = None,
+    ) -> Column:
+        """Evaluate this expression given a dataframe for context."""
+        # TODO: obey dtype
+        obj = plc.interop.from_arrow(self.value)
+        return Scalar(obj)  # type: ignore
+
+
+class Col(Expr):
+    __slots__ = ("name",)
+    _non_child = ("dtype", "name")
+    name: str
+
+    def __init__(self, dtype: plc.DataType, name: str) -> None:
+        self.dtype = dtype
+        self.name = name
+
+    def do_evaluate(
+        self,
+        df: DataFrame,
+        *,
+        context: ExecutionContext = ExecutionContext.FRAME,
+        mapping: dict[Expr, Column] | None = None,
+    ) -> Column:
+        """Evaluate this expression given a dataframe for context."""
+        return df._column_map[self.name]
+
+    def collect_agg(self, *, depth: int) -> AggInfo:
+        """Collect information about aggregations in groupbys."""
+        return AggInfo([(self, plc.aggregation.collect_list(), self)])
+
+
+class Len(Expr):
+    def do_evaluate(
+        self,
+        df: DataFrame,
+        *,
+        context: ExecutionContext = ExecutionContext.FRAME,
+        mapping: dict[Expr, Column] | None = None,
+    ) -> Column:
+        """Evaluate this expression given a dataframe for context."""
+        # TODO: type is wrong, and dtype
+        return df.num_rows  # type: ignore
+
+    def collect_agg(self, *, depth: int) -> AggInfo:
+        """Collect information about aggregations in groupbys."""
+        # TODO: polars returns a uint, not an int for count
+        return AggInfo(
+            [(None, plc.aggregation.count(plc.types.NullPolicy.INCLUDE), self)]
+        )
+
+
+class BooleanFunction(Expr):
+    __slots__ = ("name", "options", "children")
+    _non_child = ("dtype", "name", "options")
+
+    def __init__(self, dtype: plc.DataType, name: str, options: tuple, *children: Expr):
+        super().__init__(dtype)
+        self.options = options
+        self.name = name
+        self.children = children
+        if (
+            self.name in (pl_expr.BooleanFunction.Any, pl_expr.BooleanFunction.All)
+            and not self.options[0]
+        ):
+            # With ignore_nulls == False, polars uses Kleene logic
+            raise NotImplementedError(f"Kleene logic for {self.name}")
+        if self.name in (
+            pl_expr.BooleanFunction.IsFinite,
+            pl_expr.BooleanFunction.IsInfinite,
+            pl_expr.BooleanFunction.IsIn,
+        ):
+            raise NotImplementedError(f"{self.name}")
+
+    @staticmethod
+    def _distinct(
+        column: Column,
+        *,
+        keep: plc.stream_compaction.DuplicateKeepOption,
+        source_value: plc.Scalar,
+        target_value: plc.Scalar,
+    ) -> Column:
+        table = plc.Table([column.obj])
+        indices = plc.stream_compaction.distinct_indices(
+            table,
+            keep,
+            # TODO: polars doesn't expose options for these
+            plc.types.NullEquality.EQUAL,
+            plc.types.NanEquality.ALL_EQUAL,
+        )
+        return Column(
+            plc.copying.scatter(
+                [source_value],
+                indices,
+                plc.Table([plc.Column.from_scalar(target_value, table.num_rows())]),
+            ).columns()[0],
+            column.name,
+        )
+
+    _BETWEEN_OPS: ClassVar[
+        dict[
+            pl_types.ClosedInterval,
+            tuple[plc.binaryop.BinaryOperator, plc.binaryop.BinaryOperator],
+        ]
+    ] = {
+        "none": (
+            plc.binaryop.BinaryOperator.GREATER,
+            plc.binaryop.BinaryOperator.LESS,
+        ),
+        "left": (
+            plc.binaryop.BinaryOperator.GREATER_EQUAL,
+            plc.binaryop.BinaryOperator.LESS,
+        ),
+        "right": (
+            plc.binaryop.BinaryOperator.GREATER,
+            plc.binaryop.BinaryOperator.LESS_EQUAL,
+        ),
+        "both": (
+            plc.binaryop.BinaryOperator.GREATER_EQUAL,
+            plc.binaryop.BinaryOperator.LESS_EQUAL,
+        ),
+    }
+
+    def do_evaluate(
+        self,
+        df: DataFrame,
+        *,
+        context: ExecutionContext = ExecutionContext.FRAME,
+        mapping: dict[Expr, Column] | None = None,
+    ) -> Column:
+        """Evaluate this expression given a dataframe for context."""
+        columns = [
+            child.evaluate(df, context=context, mapping=mapping)
+            for child in self.children
+        ]
+        if self.name == pl_expr.BooleanFunction.Any:
+            (column,) = columns
+            return plc.Column.from_scalar(
+                plc.reduce.reduce(column.obj, plc.aggregation.any(), self.dtype), 1
+            )
+        elif self.name == pl_expr.BooleanFunction.All:
+            (column,) = columns
+            return plc.Column.from_scalar(
+                plc.reduce.reduce(column.obj, plc.aggregation.all(), self.dtype), 1
+            )
+        if self.name == pl_expr.BooleanFunction.IsNull:
+            (column,) = columns
+            return Column(plc.unary.is_null(column.obj), column.name)
+        elif self.name == pl_expr.BooleanFunction.IsNotNull:
+            (column,) = columns
+            return Column(plc.unary.is_valid(column.obj), column.name)
+        elif self.name == pl_expr.BooleanFunction.IsNan:
+            # TODO: copy over null mask since is_nan(null) => null in polars
+            (column,) = columns
+            return Column(plc.unary.is_nan(column.obj), column.name)
+        elif self.name == pl_expr.BooleanFunction.IsNotNan:
+            # TODO: copy over null mask since is_not_nan(null) => null in polars
+            (column,) = columns
+            return Column(plc.unary.is_not_nan(column.obj), column.name)
+        elif self.name == pl_expr.BooleanFunction.IsFirstDistinct:
+            (column,) = columns
+            return self._distinct(
+                column,
+                keep=plc.stream_compaction.DuplicateKeepOption.KEEP_FIRST,
+                source_value=plc.interop.from_arrow(pa.scalar(True)),  # noqa: FBT003
+                target_value=plc.interop.from_arrow(pa.scalar(False)),  # noqa: FBT003
+            )
+        elif self.name == pl_expr.BooleanFunction.IsLastDistinct:
+            (column,) = columns
+            return self._distinct(
+                column,
+                keep=plc.stream_compaction.DuplicateKeepOption.KEEP_LAST,
+                source_value=plc.interop.from_arrow(pa.scalar(True)),  # noqa: FBT003
+                target_value=plc.interop.from_arrow(pa.scalar(False)),  # noqa: FBT003
+            )
+        elif self.name == pl_expr.BooleanFunction.IsUnique:
+            (column,) = columns
+            return self._distinct(
+                column,
+                keep=plc.stream_compaction.DuplicateKeepOption.KEEP_NONE,
+                source_value=plc.interop.from_arrow(pa.scalar(True)),  # noqa: FBT003
+                target_value=plc.interop.from_arrow(pa.scalar(False)),  # noqa: FBT003
+            )
+        elif self.name == pl_expr.BooleanFunction.IsDuplicated:
+            (column,) = columns
+            return self._distinct(
+                column,
+                keep=plc.stream_compaction.DuplicateKeepOption.KEEP_NONE,
+                source_value=plc.interop.from_arrow(pa.scalar(False)),  # noqa: FBT003
+                target_value=plc.interop.from_arrow(pa.scalar(True)),  # noqa: FBT003
+            )
+        elif self.name == pl_expr.BooleanFunction.AllHorizontal:
+            name = columns[0].name
+            if any(c.obj.null_count() > 0 for c in columns):
+                raise NotImplementedError("Kleene logic for all_horizontal")
+            return Column(
+                reduce(
+                    partial(
+                        plc.binaryop.binary_operation,
+                        op=plc.binaryop.BinaryOperator.BITWISE_AND,
+                        output_type=self.dtype,
+                    ),
+                    (c.obj for c in columns),
+                ),
+                name,
+            )
+        elif self.name == pl_expr.BooleanFunction.AnyHorizontal:
+            name = columns[0].name
+            if any(c.obj.null_count() > 0 for c in columns):
+                raise NotImplementedError("Kleene logic for any_horizontal")
+            return Column(
+                reduce(
+                    partial(
+                        plc.binaryop.binary_operation,
+                        op=plc.binaryop.BinaryOperator.BITWISE_OR,
+                        output_type=self.dtype,
+                    ),
+                    (c.obj for c in columns),
+                ),
+                name,
+            )
+        elif self.name == pl_expr.BooleanFunction.IsBetween:
+            column, lo, hi = columns
+            (closed,) = self.options
+            lop, rop = self._BETWEEN_OPS[closed]
+            return Column(
+                plc.binaryop.binary_operation(
+                    plc.binaryop.binary_operation(
+                        column.obj, lo.obj, lop, output_type=self.dtype
+                    ),
+                    plc.binaryop.binary_operation(
+                        column.obj, hi.obj, rop, output_type=self.dtype
+                    ),
+                    plc.binaryop.BinaryOperator.LOGICAL_AND,
+                    self.dtype,
+                ),
+                column.name,
+            )
+        else:
+            raise NotImplementedError(f"BooleanFunction {self.name}")
+
+
+class StringFunction(Expr):
+    __slots__ = ("name", "options", "children")
+    _non_child = ("dtype", "name", "options")
+
+    def __init__(
+        self,
+        dtype: plc.DataType,
+        name: pl_expr.StringFunction,
+        options: tuple,
+        *children: Expr,
+    ):
+        super().__init__(dtype)
+        self.options = options
+        self.name = name
+        self.children = children
+        if self.name not in (
+            pl_expr.StringFunction.Lowercase,
+            pl_expr.StringFunction.Uppercase,
+            pl_expr.StringFunction.EndsWith,
+            pl_expr.StringFunction.StartsWith,
+        ):
+            raise NotImplementedError(f"String function {self.name}")
+
+    def do_evaluate(
+        self,
+        df: DataFrame,
+        *,
+        context: ExecutionContext = ExecutionContext.FRAME,
+        mapping: dict[Expr, Column] | None = None,
+    ) -> Column:
+        """Evaluate this expression given a dataframe for context."""
+        columns = [
+            child.evaluate(df, context=context, mapping=mapping)
+            for child in self.children
+        ]
+        if self.name == pl_expr.StringFunction.Lowercase:
+            (column,) = columns
+            return Column(plc.strings.case.to_lower(column.obj), column.name)
+        elif self.name == pl_expr.StringFunction.Uppercase:
+            (column,) = columns
+            return Column(plc.strings.case.to_upper(column.obj), column.name)
+        elif self.name == pl_expr.StringFunction.EndsWith:
+            column, suffix = columns
+            return Column(
+                plc.strings.find.ends_with(column.obj, suffix.obj), column.name
+            )
+        elif self.name == pl_expr.StringFunction.StartsWith:
+            column, suffix = columns
+            return Column(
+                plc.strings.find.starts_with(column.obj, suffix.obj), column.name
+            )
+        else:
+            raise NotImplementedError(f"StringFunction {self.name}")
+
+
+class Sort(Expr):
+    __slots__ = ("options", "children")
+    _non_child = ("dtype", "options")
+
+    def __init__(
+        self, dtype: plc.DataType, options: tuple[bool, bool, bool], column: Expr
+    ):
+        super().__init__(dtype)
+        self.options = options
+        self.children = (column,)
+
+    def do_evaluate(
+        self,
+        df: DataFrame,
+        *,
+        context: ExecutionContext = ExecutionContext.FRAME,
+        mapping: dict[Expr, Column] | None = None,
+    ) -> Column:
+        """Evaluate this expression given a dataframe for context."""
+        (child,) = self.children
+        column = child.evaluate(df, context=context, mapping=mapping)
+        (stable, nulls_last, descending) = self.options
+        order, null_order = sorting.sort_order(
+            [descending], nulls_last=nulls_last, num_keys=1
+        )
+        do_sort = plc.sorting.stable_sort if stable else plc.sorting.sort
+        table = do_sort(plc.Table([column.obj]), order, null_order)
+        return Column(table.columns()[0], column.name).set_sorted(
+            is_sorted=plc.types.Sorted.YES, order=order[0], null_order=null_order[0]
+        )
+
+
+class SortBy(Expr):
+    __slots__ = ("options", "children")
+    _non_child = ("dtype", "options")
+
+    def __init__(
+        self,
+        dtype: plc.DataType,
+        options: tuple[bool, bool, tuple[bool]],
+        column: Expr,
+        *by: Expr,
+    ):
+        super().__init__(dtype)
+        self.options = options
+        self.children = (column, *by)
+
+    def do_evaluate(
+        self,
+        df: DataFrame,
+        *,
+        context: ExecutionContext = ExecutionContext.FRAME,
+        mapping: dict[Expr, Column] | None = None,
+    ) -> Column:
+        """Evaluate this expression given a dataframe for context."""
+        column, *by = (
+            child.evaluate(df, context=context, mapping=mapping)
+            for child in self.children
+        )
+        (stable, nulls_last, descending) = self.options
+        order, null_order = sorting.sort_order(
+            descending, nulls_last=nulls_last, num_keys=len(by)
+        )
+        do_sort = plc.sorting.stable_sort_by_key if stable else plc.sorting.sort_by_key
+        table = do_sort(
+            plc.Table([column.obj]), plc.Table([c.obj for c in by]), order, null_order
+        )
+        return Column(table.columns()[0], column.name)
+
+
+class Gather(Expr):
+    __slots__ = ("children",)
+    _non_child = ("dtype",)
+
+    def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr):
+        super().__init__(dtype)
+        self.children = (values, indices)
+
+    def do_evaluate(
+        self,
+        df: DataFrame,
+        *,
+        context: ExecutionContext = ExecutionContext.FRAME,
+        mapping: dict[Expr, Column] | None = None,
+    ) -> Column:
+        """Evaluate this expression given a dataframe for context."""
+        values, indices = (
+            child.evaluate(df, context=context, mapping=mapping)
+            for child in self.children
+        )
+        lo, hi = plc.reduce.minmax(indices.obj)
+        lo = plc.interop.to_arrow(lo).as_py()
+        hi = plc.interop.to_arrow(hi).as_py()
+        n = df.num_rows
+        if hi >= n or lo < -n:
+            raise ValueError("gather indices are out of bounds")
+        if indices.obj.null_count():
+            bounds_policy = plc.copying.OutOfBoundsPolicy.NULLIFY
+            obj = plc.replace.replace_nulls(
+                indices.obj,
+                plc.interop.from_arrow(pa.scalar(n), data_type=indices.obj.data_type()),
+            )
+        else:
+            bounds_policy = plc.copying.OutOfBoundsPolicy.DONT_CHECK
+            obj = indices.obj
+        table = plc.copying.gather(plc.Table([values.obj]), obj, bounds_policy)
+        return Column(table.columns()[0], values.name)
+
+
+class Filter(Expr):
+    __slots__ = ("children",)
+    _non_child = ("dtype",)
+
+    def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr):
+        super().__init__(dtype)
+        self.children = (values, indices)
+
+    def do_evaluate(
+        self,
+        df: DataFrame,
+        *,
+        context: ExecutionContext = ExecutionContext.FRAME,
+        mapping: dict[Expr, Column] | None = None,
+    ) -> Column:
+        """Evaluate this expression given a dataframe for context."""
+        values, mask = (
+            child.evaluate(df, context=context, mapping=mapping)
+            for child in self.children
+        )
+        table = plc.stream_compaction.apply_boolean_mask(
+            plc.Table([values.obj]), mask.obj
+        )
+        return Column(table.columns()[0], values.name).sorted_like(values)
+
+
+class RollingWindow(Expr):
+    __slots__ = ("options", "children")
+    _non_child = ("dtype", "options")
+
+    def __init__(self, dtype: plc.DataType, options: Any, agg: Expr):
+        super().__init__(dtype)
+        self.options = options
+        self.children = (agg,)
+
+
+class GroupedRollingWindow(Expr):
+    __slots__ = ("options", "children")
+    _non_child = ("dtype", "options")
+
+    def __init__(self, dtype: plc.DataType, options: Any, agg: Expr, *by: Expr):
+        super().__init__(dtype)
+        self.options = options
+        self.children = (agg, *by)
+
+
+class Cast(Expr):
+    __slots__ = ("children",)
+    _non_child = ("dtype",)
+
+    def __init__(self, dtype: plc.DataType, value: Expr):
+        super().__init__(dtype)
+        self.children = (value,)
+
+    def do_evaluate(
+        self,
+        df: DataFrame,
+        *,
+        context: ExecutionContext = ExecutionContext.FRAME,
+        mapping: dict[Expr, Column] | None = None,
+    ) -> Column:
+        """Evaluate this expression given a dataframe for context."""
+        (child,) = self.children
+        column = child.evaluate(df, context=context, mapping=mapping)
+        return Column(plc.unary.cast(column.obj, self.dtype), column.name).sorted_like(
+            column
+        )
+
+    def collect_agg(self, *, depth: int) -> AggInfo:
+        """Collect information about aggregations in groupbys."""
+        # TODO: Could do with sort-based groupby and segmented filter
+        (child,) = self.children
+        return child.collect_agg(depth=depth)
+
+
+class Agg(Expr):
+    __slots__ = ("name", "options", "op", "request", "children")
+    _non_child = ("dtype", "name", "options")
+
+    def __init__(
+        self, dtype: plc.DataType, name: str, options: Any, value: Expr
+    ) -> None:
+        super().__init__(dtype)
+        # TODO: fix polars name
+        if name == "nunique":
+            name = "n_unique"
+        self.name = name
+        self.options = options
+        self.children = (value,)
+        if name not in Agg._SUPPORTED:
+            raise NotImplementedError(f"Unsupported aggregation {name=}")
+        # TODO: nan handling in groupby case
+        if name == "min":
+            req = plc.aggregation.min()
+        elif name == "max":
+            req = plc.aggregation.max()
+        elif name == "median":
+            req = plc.aggregation.median()
+        elif name == "n_unique":
+            # TODO: datatype of result
+            req = plc.aggregation.nunique(null_handling=plc.types.NullPolicy.INCLUDE)
+        elif name == "first" or name == "last":
+            req = None
+        elif name == "mean":
+            req = plc.aggregation.mean()
+        elif name == "sum":
+            req = plc.aggregation.sum()
+        elif name == "std":
+            # TODO: handle nans
+            req = plc.aggregation.std(ddof=options)
+        elif name == "var":
+            # TODO: handle nans
+            req = plc.aggregation.variance(ddof=options)
+        elif name == "count":
+            req = plc.aggregation.count(null_handling=plc.types.NullPolicy.EXCLUDE)
+        else:
+            raise NotImplementedError
+        self.request = req
+        op = getattr(self, f"_{name}", None)
+        if op is None:
+            op = partial(self._reduce, request=req)
+        elif name in {"min", "max"}:
+            op = partial(op, propagate_nans=options)
+        elif name in {"count", "first", "last"}:
+            pass
+        else:
+            raise AssertionError
+        self.op = op
+
+    _SUPPORTED: ClassVar[frozenset[str]] = frozenset(
+        [
+            "min",
+            "max",
+            "median",
+            "n_unique",
+            "first",
+            "last",
+            "mean",
+            "sum",
+            "count",
+            "std",
+            "var",
+        ]
+    )
+
+    def collect_agg(self, *, depth: int) -> AggInfo:
+        """Collect information about aggregations in groupbys."""
+        if depth >= 1:
+            raise NotImplementedError("Nested aggregations in groupby")
+        (child,) = self.children
+        ((expr, _, _),) = child.collect_agg(depth=depth + 1).requests
+        if self.request is None:
+            raise NotImplementedError(f"Aggregation {self.name} in groupby")
+        return AggInfo([(expr, self.request, self)])
+
+    def _reduce(
+        self, column: Column, *, request: plc.aggregation.Aggregation
+    ) -> Column:
+        return Column(
+            plc.Column.from_scalar(
+                plc.reduce.reduce(column.obj, request, self.dtype),
+                1,
+            ),
+            column.name,
+        )
+
+    def _count(self, column: Column) -> Column:
+        # TODO: dtype handling
+        return Column(
+            plc.Column.from_scalar(
+                plc.interop.from_arrow(
+                    pa.scalar(column.obj.size() - column.obj.null_count()),
+                ),
+                1,
+            ),
+            column.name,
+        )
+
+    def _min(self, column: Column, *, propagate_nans: bool) -> Column:
+        if propagate_nans and column.nan_count > 0:
+            return Column(
+                plc.Column.from_scalar(
+                    plc.interop.from_arrow(
+                        pa.scalar(float("nan")), data_type=self.dtype
+                    ),
+                    1,
+                ),
+                column.name,
+            )
+        if column.nan_count > 0:
+            column = column.mask_nans()
+        return self._reduce(column, request=plc.aggregation.min())
+
+    def _max(self, column: Column, *, propagate_nans: bool) -> Column:
+        if propagate_nans and column.nan_count > 0:
+            return Column(
+                plc.Column.from_scalar(
+                    plc.interop.from_arrow(
+                        pa.scalar(float("nan")), data_type=self.dtype
+                    ),
+                    1,
+                ),
+                column.name,
+            )
+        if column.nan_count > 0:
+            column = column.mask_nans()
+        return self._reduce(column, request=plc.aggregation.max())
+
+    def _first(self, column: Column) -> Column:
+        return Column(plc.copying.slice(column.obj, [0, 1])[0], column.name)
+
+    def _last(self, column: Column) -> Column:
+        n = column.obj.size()
+        return Column(plc.copying.slice(column.obj, [n - 1, n])[0], column.name)
+
+    def do_evaluate(
+        self,
+        df,
+        *,
+        context: ExecutionContext = ExecutionContext.FRAME,
+        mapping: dict[Expr, Column] | None = None,
+    ) -> Column:
+        """Evaluate this expression given a dataframe for context."""
+        if context is not ExecutionContext.FRAME:
+            raise NotImplementedError(f"Agg in context {context}")
+        (child,) = self.children
+        return self.op(child.evaluate(df, context=context, mapping=mapping))
+
+
+class BinOp(Expr):
+    __slots__ = ("op", "children")
+    _non_child = ("dtype", "op")
+
+    def __init__(
+        self,
+        dtype: plc.DataType,
+        op: plc.binaryop.BinaryOperator,
+        left: Expr,
+        right: Expr,
+    ) -> None:
+        super().__init__(dtype)
+        self.op = op
+        self.children = (left, right)
+
+    _MAPPING: ClassVar[dict[pl_expr.Operator, plc.binaryop.BinaryOperator]] = {
+        pl_expr.Operator.Eq: plc.binaryop.BinaryOperator.EQUAL,
+        pl_expr.Operator.EqValidity: plc.binaryop.BinaryOperator.NULL_EQUALS,
+        pl_expr.Operator.NotEq: plc.binaryop.BinaryOperator.NOT_EQUAL,
+        pl_expr.Operator.NotEqValidity: plc.binaryop.BinaryOperator.NULL_NOT_EQUALS,
+        pl_expr.Operator.Lt: plc.binaryop.BinaryOperator.LESS,
+        pl_expr.Operator.LtEq: plc.binaryop.BinaryOperator.LESS_EQUAL,
+        pl_expr.Operator.Gt: plc.binaryop.BinaryOperator.GREATER,
+        pl_expr.Operator.GtEq: plc.binaryop.BinaryOperator.GREATER_EQUAL,
+        pl_expr.Operator.Plus: plc.binaryop.BinaryOperator.ADD,
+        pl_expr.Operator.Minus: plc.binaryop.BinaryOperator.SUB,
+        pl_expr.Operator.Multiply: plc.binaryop.BinaryOperator.MUL,
+        pl_expr.Operator.Divide: plc.binaryop.BinaryOperator.DIV,
+        pl_expr.Operator.TrueDivide: plc.binaryop.BinaryOperator.TRUE_DIV,
+        pl_expr.Operator.FloorDivide: plc.binaryop.BinaryOperator.FLOOR_DIV,
+        pl_expr.Operator.Modulus: plc.binaryop.BinaryOperator.PYMOD,
+        pl_expr.Operator.And: plc.binaryop.BinaryOperator.BITWISE_AND,
+        pl_expr.Operator.Or: plc.binaryop.BinaryOperator.BITWISE_OR,
+        pl_expr.Operator.Xor: plc.binaryop.BinaryOperator.BITWISE_XOR,
+        pl_expr.Operator.LogicalAnd: plc.binaryop.BinaryOperator.LOGICAL_AND,
+        pl_expr.Operator.LogicalOr: plc.binaryop.BinaryOperator.LOGICAL_OR,
+    }
+
+    def do_evaluate(
+        self,
+        df: DataFrame,
+        *,
+        context: ExecutionContext = ExecutionContext.FRAME,
+        mapping: dict[Expr, Column] | None = None,
+    ) -> Column:
+        """Evaluate this expression given a dataframe for context."""
+        left, right = (
+            child.evaluate(df, context=context, mapping=mapping)
+            for child in self.children
+        )
+        return Column(
+            plc.binaryop.binary_operation(left.obj, right.obj, self.op, self.dtype),
+            "what",
+        )
+
+    def collect_agg(self, *, depth: int) -> AggInfo:
+        """Collect information about aggregations in groupbys."""
+        if depth == 1:
+            # inside aggregation, need to pre-evaluate,
+            # groupby construction has checked that we don't have
+            # nested aggs, so stop the recursion and return ourselves
+            # for pre-eval
+            return AggInfo([(self, plc.aggregation.collect_list(), self)])
+        else:
+            left_info, right_info = (
+                child.collect_agg(depth=depth) for child in self.children
+            )
+            requests = [*left_info.requests, *right_info.requests]
+            # TODO: Hack, if there were no reductions inside this
+            # binary expression then we want to pre-evaluate and
+            # collect ourselves. Otherwise we want to collect the
+            # aggregations inside and post-evaluate. This is a bad way
+            # of checking that we are in case 1.
+            if all(
+                agg.kind() == plc.aggregation.Kind.COLLECT_LIST
+                for _, agg, _ in requests
+            ):
+                return AggInfo([(self, plc.aggregation.collect_list(), self)])
+            return AggInfo(
+                [*left_info.requests, *right_info.requests],
+            )
diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py
new file mode 100644
index 00000000000..d630b40f600
--- /dev/null
+++ b/python/cudf_polars/cudf_polars/dsl/ir.py
@@ -0,0 +1,879 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+# TODO: remove need for this
+# ruff: noqa: D101
+"""
+DSL nodes for the LogicalPlan of polars.
+
+An IR node is either a source, normal, or a sink. Respectively they
+can be considered as functions:
+
+- source: `IO () -> DataFrame`
+- normal: `DataFrame -> DataFrame`
+- sink: `DataFrame -> IO ()`
+"""
+
+from __future__ import annotations
+
+import itertools
+import types
+from dataclasses import dataclass
+from functools import cache
+from typing import TYPE_CHECKING, Any, Callable, ClassVar
+
+import pyarrow as pa
+from typing_extensions import assert_never
+
+import polars as pl
+
+import cudf
+import cudf._lib.pylibcudf as plc
+
+import cudf_polars.dsl.expr as expr
+from cudf_polars.containers import Column, DataFrame
+from cudf_polars.utils import sorting
+
+if TYPE_CHECKING:
+    from typing import Literal
+
+
+__all__ = [
+    "IR",
+    "PythonScan",
+    "Scan",
+    "Cache",
+    "DataFrameScan",
+    "Select",
+    "GroupBy",
+    "Join",
+    "HStack",
+    "Distinct",
+    "Sort",
+    "Slice",
+    "Filter",
+    "Projection",
+    "MapFunction",
+    "Union",
+    "HConcat",
+    "ExtContext",
+]
+
+
+@dataclass(slots=True)
+class IR:
+    """Abstract plan node, representing an unevaluated dataframe."""
+
+    schema: dict[str, plc.DataType]
+    """Mapping from column names to their data types."""
+
+    def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
+        """
+        Evaluate the node and return a dataframe.
+
+        Parameters
+        ----------
+        cache
+            Mapping from cached node ids to constructed DataFrames.
+            Used to implement evaluation of the `Cache` node.
+
+        Returns
+        -------
+        DataFrame (on device) representing the evaluation of this plan
+        node.
+
+        Raises
+        ------
+        NotImplementedError if we couldn't evaluate things. Ideally
+        this should not occur, since the translation phase should pick
+        up things that we cannot handle.
+        """
+        raise NotImplementedError
+
+
+@dataclass(slots=True)
+class PythonScan(IR):
+    """Representation of input from a python function."""
+
+    options: Any
+    """Arbitrary options."""
+    predicate: expr.Expr | None
+    """Filter to apply to the constructed dataframe before returning it."""
+
+
+@dataclass(slots=True)
+class Scan(IR):
+    """Input from files."""
+
+    typ: Any
+    """What type of file are we reading? Parquet, CSV, etc..."""
+    paths: list[str]
+    """List of paths to read from."""
+    file_options: Any
+    """Options for reading the file.
+
+    Attributes are:
+    - ``with_columns: list[str]`` of projected columns to return.
+    - ``n_rows: int``: Number of rows to read.
+    - ``row_index: tuple[name, offset] | None``: Add an integer index
+        column with given name.
+    """
+    predicate: expr.Expr | None
+    """Mask to apply to the read dataframe."""
+
+    def __post_init__(self):
+        """Validate preconditions."""
+        if self.file_options.n_rows is not None:
+            raise NotImplementedError("row limit in scan")
+        if self.typ not in ("csv", "parquet"):
+            raise NotImplementedError(f"Unhandled scan type: {self.typ}")
+
+    def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
+        """Evaluate and return a dataframe."""
+        options = self.file_options
+        with_columns = options.with_columns
+        row_index = options.row_index
+        if self.typ == "csv":
+            df = DataFrame.from_cudf(
+                cudf.concat(
+                    [cudf.read_csv(p, usecols=with_columns) for p in self.paths]
+                )
+            )
+        elif self.typ == "parquet":
+            df = DataFrame.from_cudf(
+                cudf.read_parquet(self.paths, columns=with_columns)
+            )
+        else:
+            assert_never(self.typ)
+        if row_index is not None:
+            name, offset = row_index
+            # TODO: dtype
+            step = plc.interop.from_arrow(pa.scalar(1))
+            init = plc.interop.from_arrow(pa.scalar(offset))
+            index = Column(
+                plc.filling.sequence(df.num_rows, init, step), name
+            ).set_sorted(
+                is_sorted=plc.types.Sorted.YES,
+                order=plc.types.Order.ASCENDING,
+                null_order=plc.types.NullOrder.AFTER,
+            )
+            df = DataFrame([index, *df.columns], [])
+        # TODO: should be true, but not the case until we get
+        # cudf-classic out of the loop for IO since it converts date32
+        # to datetime.
+        # assert all(
+        #     c.obj.type() == dtype
+        #     for c, dtype in zip(df.columns, self.schema.values())
+        # )
+        if self.predicate is None:
+            return df
+        else:
+            mask = self.predicate.evaluate(df)
+            return df.filter(mask)
+
+
+@dataclass(slots=True)
+class Cache(IR):
+    """
+    Return a cached plan node.
+
+    Used for CSE at the plan level.
+    """
+
+    key: int
+    """The cache key."""
+    value: IR
+    """The unevaluated node to cache."""
+
+    def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
+        """Evaluate and return a dataframe."""
+        try:
+            return cache[self.key]
+        except KeyError:
+            return cache.setdefault(self.key, self.value.evaluate(cache=cache))
+
+
+@dataclass(slots=True)
+class DataFrameScan(IR):
+    """
+    Input from an existing polars DataFrame.
+
+    This typically arises from ``q.collect().lazy()``
+    """
+
+    df: Any
+    """Polars LazyFrame object."""
+    projection: list[str]
+    """List of columns to project out."""
+    predicate: expr.Expr | None
+    """Mask to apply."""
+
+    def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
+        """Evaluate and return a dataframe."""
+        pdf = pl.DataFrame._from_pydf(self.df)
+        if self.projection is not None:
+            pdf = pdf.select(self.projection)
+        # TODO: goes away when libcudf supports large strings
+        table = pdf.to_arrow()
+        schema = table.schema
+        for i, field in enumerate(schema):
+            if field.type == pa.large_string():
+                # TODO: Nested types
+                schema = schema.set(i, pa.field(field.name, pa.string()))
+        table = table.cast(schema)
+        df = DataFrame.from_table(
+            plc.interop.from_arrow(table), list(self.schema.keys())
+        )
+        assert all(
+            c.obj.type() == dtype for c, dtype in zip(df.columns, self.schema.values())
+        )
+        if self.predicate is not None:
+            mask = self.predicate.evaluate(df)
+            return df.filter(mask)
+        else:
+            return df
+
+
+@dataclass(slots=True)
+class Select(IR):
+    """Produce a new dataframe selecting given expressions from an input."""
+
+    df: IR
+    """Input dataframe."""
+    cse: list[expr.Expr]
+    """
+    List of common subexpressions that will appear in the selected expressions.
+
+    These must be evaluated before the returned expressions.
+    """
+    expr: list[expr.Expr]
+    """List of expressions to evaluate to form the new dataframe."""
+
+    def evaluate(self, *, cache: dict[int, DataFrame]):
+        """Evaluate and return a dataframe."""
+        df = self.df.evaluate(cache=cache)
+        df = df.with_columns([e.evaluate(df) for e in self.cse])
+        return DataFrame([e.evaluate(df) for e in self.expr], [])
+
+
+@dataclass(slots=True)
+class Reduce(IR):
+    """
+    Produce a new dataframe selecting given expressions from an input.
+
+    This is a special case of :class:`Select` where all outputs are a single row.
+    """
+
+    df: IR
+    """Input dataframe."""
+    expr: list[expr.Expr]
+    """List of expressions to evaluate to form the new dataframe."""
+
+    def evaluate(self, *, cache: dict[int, DataFrame]):
+        """Evaluate and return a dataframe."""
+        df = self.df.evaluate(cache=cache)
+        return DataFrame([e.evaluate(df) for e in self.expr], [])
+
+
+def placeholder_column(n: int):
+    """
+    Produce a placeholder pylibcudf column with NO BACKING DATA.
+
+    Parameters
+    ----------
+    n
+        Number of rows the column will advertise
+
+    Returns
+    -------
+    pylibcudf Column that is almost unusable. DO NOT ACCESS THE DATA BUFFER.
+
+    Notes
+    -----
+    This is used to avoid allocating data for count aggregations.
+    """
+    return plc.Column(
+        plc.DataType(plc.TypeId.INT8),
+        n,
+        plc.gpumemoryview(
+            types.SimpleNamespace(__cuda_array_interface__={"data": (1, True)})
+        ),
+        None,
+        0,
+        0,
+        [],
+    )
+
+
+@dataclass(slots=False)
+class GroupBy(IR):
+    """Perform a groupby."""
+
+    df: IR
+    """Input dataframe."""
+    agg_requests: list[expr.Expr]
+    """List of expressions to evaluate groupwise."""
+    keys: list[expr.Expr]
+    """List of expressions forming the keys."""
+    maintain_order: bool
+    """Should the order of the input dataframe be maintained?"""
+    options: Any
+    """Options controlling style of groupby."""
+
+    @staticmethod
+    def check_agg(agg: expr.Expr) -> int:
+        """
+        Determine if we can handle an aggregation expression.
+
+        Parameters
+        ----------
+        agg
+            Expression to check
+
+        Returns
+        -------
+        depth of nesting
+
+        Raises
+        ------
+        NotImplementedError for unsupported expression nodes.
+        """
+        if isinstance(agg, (expr.NamedExpr, expr.BinOp, expr.Cast)):
+            return max(GroupBy.check_agg(child) for child in agg.children)
+        elif isinstance(agg, expr.Agg):
+            if agg.name == "implode":
+                raise NotImplementedError("implode in groupby")
+            return 1 + max(GroupBy.check_agg(child) for child in agg.children)
+        elif isinstance(agg, (expr.Len, expr.Col, expr.Literal)):
+            return 0
+        else:
+            raise NotImplementedError(f"No handler for {agg=}")
+
+    def __post_init__(self):
+        """Check whether all the aggregations are implemented."""
+        if self.options.rolling is None and self.maintain_order:
+            raise NotImplementedError("Maintaining order in groupby")
+        if self.options.rolling:
+            raise NotImplementedError("rolling window/groupby")
+        if any(GroupBy.check_agg(a) > 1 for a in self.agg_requests):
+            raise NotImplementedError("Nested aggregations in groupby")
+        self.agg_infos = [req.collect_agg(depth=0) for req in self.agg_requests]
+
+    def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
+        """Evaluate and return a dataframe."""
+        df = self.df.evaluate(cache=cache)
+        keys = [k.evaluate(df) for k in self.keys]
+        # TODO: use sorted information, need to expose column_order
+        # and null_precedence in pylibcudf groupby constructor
+        # sorted = (
+        #     plc.types.Sorted.YES
+        #     if all(k.is_sorted for k in keys)
+        #     else plc.types.Sorted.NO
+        # )
+        grouper = plc.groupby.GroupBy(
+            plc.Table([k.obj for k in keys]),
+            null_handling=plc.types.NullPolicy.INCLUDE,
+        )
+        # TODO: uniquify
+        requests = []
+        replacements = []
+        for info in self.agg_infos:
+            for pre_eval, req, rep in info.requests:
+                if pre_eval is None:
+                    col = placeholder_column(df.num_rows)
+                else:
+                    col = pre_eval.evaluate(df).obj
+                requests.append(plc.groupby.GroupByRequest(col, [req]))
+                replacements.append(rep)
+        group_keys, raw_tables = grouper.aggregate(requests)
+        raw_columns = []
+        for i, table in enumerate(raw_tables):
+            (column,) = table.columns()
+            raw_columns.append(Column(column, f"column{i}"))
+        mapping = dict(zip(replacements, raw_columns))
+        result_keys = [Column(gk, k.name) for gk, k in zip(group_keys.columns(), keys)]
+        result_subs = DataFrame(raw_columns, [])
+        results = [
+            req.evaluate(result_subs, mapping=mapping) for req in self.agg_requests
+        ]
+        return DataFrame([*result_keys, *results], []).slice(self.options.slice)
+
+
+@dataclass(slots=True)
+class Join(IR):
+    """A join of two dataframes."""
+
+    left: IR
+    """Left frame."""
+    right: IR
+    """Right frame."""
+    left_on: list[expr.Expr]
+    """List of expressions used as keys in the left frame."""
+    right_on: list[expr.Expr]
+    """List of expressions used as keys in the right frame."""
+    options: tuple[
+        Literal["inner", "left", "full", "leftsemi", "leftanti"],
+        bool,
+        tuple[int, int] | None,
+        str | None,
+        bool,
+    ]
+    """
+    tuple of options:
+    - how: join type
+    - join_nulls: do nulls compare equal?
+    - slice: optional slice to perform after joining.
+    - suffix: string suffix for right columns if names match
+    - coalesce: should key columns be coalesced (only makes sense for outer joins)
+    """
+
+    def __post_init__(self):
+        """Validate preconditions."""
+        if self.options[0] == "cross":
+            raise NotImplementedError("cross join not implemented")
+
+    @cache
+    @staticmethod
+    def _joiners(
+        how: Literal["inner", "left", "full", "leftsemi", "leftanti"],
+    ) -> tuple[
+        Callable, plc.copying.OutOfBoundsPolicy, plc.copying.OutOfBoundsPolicy | None
+    ]:
+        if how == "inner":
+            return (
+                plc.join.inner_join,
+                plc.copying.OutOfBoundsPolicy.DONT_CHECK,
+                plc.copying.OutOfBoundsPolicy.DONT_CHECK,
+            )
+        elif how == "left":
+            return (
+                plc.join.left_join,
+                plc.copying.OutOfBoundsPolicy.DONT_CHECK,
+                plc.copying.OutOfBoundsPolicy.NULLIFY,
+            )
+        elif how == "full":
+            return (
+                plc.join.full_join,
+                plc.copying.OutOfBoundsPolicy.NULLIFY,
+                plc.copying.OutOfBoundsPolicy.NULLIFY,
+            )
+        elif how == "leftsemi":
+            return (
+                plc.join.left_semi_join,
+                plc.copying.OutOfBoundsPolicy.DONT_CHECK,
+                None,
+            )
+        elif how == "leftanti":
+            return (
+                plc.join.left_anti_join,
+                plc.copying.OutOfBoundsPolicy.DONT_CHECK,
+                None,
+            )
+        else:
+            assert_never(how)
+
+    def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
+        """Evaluate and return a dataframe."""
+        left = self.left.evaluate(cache=cache)
+        right = self.right.evaluate(cache=cache)
+        left_on = DataFrame([e.evaluate(left) for e in self.left_on], [])
+        right_on = DataFrame([e.evaluate(right) for e in self.right_on], [])
+        how, join_nulls, zlice, suffix, coalesce = self.options
+        null_equality = (
+            plc.types.NullEquality.EQUAL
+            if join_nulls
+            else plc.types.NullEquality.UNEQUAL
+        )
+        suffix = "_right" if suffix is None else suffix
+        join_fn, left_policy, right_policy = Join._joiners(how)
+        if right_policy is None:
+            # Semi join
+            lg = join_fn(left_on.table, right_on.table, null_equality)
+            left = left.replace_columns(*left_on.columns)
+            table = plc.copying.gather(left.table, lg, left_policy)
+            result = DataFrame.from_table(table, left.column_names)
+        else:
+            lg, rg = join_fn(left_on.table, right_on.table, null_equality)
+            left = left.replace_columns(*left_on.columns)
+            right = right.replace_columns(*right_on.columns)
+            if coalesce and how == "inner":
+                right = right.discard_columns(right_on.column_names_set)
+            left = DataFrame.from_table(
+                plc.copying.gather(left.table, lg, left_policy), left.column_names
+            )
+            right = DataFrame.from_table(
+                plc.copying.gather(right.table, rg, right_policy), right.column_names
+            )
+            if coalesce and how != "inner":
+                left = left.replace_columns(
+                    *(
+                        Column(
+                            plc.replace.replace_nulls(left_col.obj, right_col.obj),
+                            left_col.name,
+                        )
+                        for left_col, right_col in zip(
+                            left.select_columns(left_on.column_names_set),
+                            right.select_columns(right_on.column_names_set),
+                        )
+                    )
+                )
+                right = right.discard_columns(right_on.column_names_set)
+            right = right.rename_columns(
+                {
+                    name: f"{name}{suffix}"
+                    for name in right.column_names
+                    if name in left.column_names_set
+                }
+            )
+            result = left.with_columns(right.columns)
+        return result.slice(zlice)
+
+
+@dataclass(slots=True)
+class HStack(IR):
+    """Add new columns to a dataframe."""
+
+    df: IR
+    """Input dataframe."""
+    cse: list[expr.Expr]
+    """
+    List of common subexpressions that will appear in the selected expressions.
+
+    These must be evaluated before the returned expressions.
+    """
+    columns: list[expr.Expr]
+    """List of expressions to produce new columns."""
+
+    def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
+        """Evaluate and return a dataframe."""
+        df = self.df.evaluate(cache=cache)
+        ctx = df.copy().with_columns([e.evaluate(df) for e in self.cse])
+        return df.with_columns([c.evaluate(ctx) for c in self.columns])
+
+
+@dataclass(slots=True)
+class Distinct(IR):
+    """Produce a new dataframe with distinct rows."""
+
+    df: IR
+    """Input dataframe."""
+    keep: plc.stream_compaction.DuplicateKeepOption
+    """Which rows to keep."""
+    subset: set[str] | None
+    """Which columns to inspect when computing distinct rows."""
+    zlice: tuple[int, int] | None
+    """Optional slice to perform after compaction."""
+    stable: bool
+    """Should order be preserved?"""
+
+    _KEEP_MAP: ClassVar[dict[str, plc.stream_compaction.DuplicateKeepOption]] = {
+        "first": plc.stream_compaction.DuplicateKeepOption.KEEP_FIRST,
+        "last": plc.stream_compaction.DuplicateKeepOption.KEEP_LAST,
+        "none": plc.stream_compaction.DuplicateKeepOption.KEEP_NONE,
+        "any": plc.stream_compaction.DuplicateKeepOption.KEEP_ANY,
+    }
+
+    def __init__(self, schema: dict, df: IR, options: Any):
+        self.schema = schema
+        self.df = df
+        (keep, subset, maintain_order, zlice) = options
+        self.keep = Distinct._KEEP_MAP[keep]
+        self.subset = set(subset) if subset is not None else None
+        self.stable = maintain_order
+        self.zlice = zlice
+
+    def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
+        """Evaluate and return a dataframe."""
+        df = self.df.evaluate(cache=cache)
+        if self.subset is None:
+            indices = list(range(df.num_columns))
+        else:
+            indices = [i for i, k in enumerate(df.column_names) if k in self.subset]
+        keys_sorted = all(df.columns[i].is_sorted for i in indices)
+        if keys_sorted:
+            table = plc.stream_compaction.unique(
+                df.table,
+                indices,
+                self.keep,
+                plc.types.NullEquality.EQUAL,
+            )
+        else:
+            distinct = (
+                plc.stream_compaction.stable_distinct
+                if self.stable
+                else plc.stream_compaction.distinct
+            )
+            table = distinct(
+                df.table,
+                indices,
+                self.keep,
+                plc.types.NullEquality.EQUAL,
+                plc.types.NanEquality.ALL_EQUAL,
+            )
+        result = DataFrame(
+            [Column(c, old.name) for c, old in zip(table.columns(), df.columns)], []
+        )
+        if keys_sorted or self.stable:
+            result = result.sorted_like(df)
+        return result.slice(self.zlice)
+
+
+@dataclass(slots=True)
+class Sort(IR):
+    """Sort a dataframe."""
+
+    df: IR
+    """Input."""
+    by: list[expr.Expr]
+    """List of expressions to produce sort keys."""
+    do_sort: Callable[..., plc.Table]
+    """pylibcudf sorting function."""
+    zlice: tuple[int, int] | None
+    """Optional slice to apply after sorting."""
+    order: list[plc.types.Order]
+    """Order keys should be sorted in."""
+    null_order: list[plc.types.NullOrder]
+    """Where nulls sort to."""
+
+    def __init__(
+        self,
+        schema: dict,
+        df: IR,
+        by: list[expr.Expr],
+        options: Any,
+        zlice: tuple[int, int] | None,
+    ):
+        self.schema = schema
+        self.df = df
+        self.by = by
+        self.zlice = zlice
+        stable, nulls_last, descending = options
+        self.order, self.null_order = sorting.sort_order(
+            descending, nulls_last=nulls_last, num_keys=len(by)
+        )
+        self.do_sort = (
+            plc.sorting.stable_sort_by_key if stable else plc.sorting.sort_by_key
+        )
+
+    def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
+        """Evaluate and return a dataframe."""
+        df = self.df.evaluate(cache=cache)
+        sort_keys = [k.evaluate(df) for k in self.by]
+        names = {c.name: i for i, c in enumerate(df.columns)}
+        # TODO: More robust identification here.
+        keys_in_result = [
+            i
+            for k in sort_keys
+            if (i := names.get(k.name)) is not None and k.obj is df.columns[i].obj
+        ]
+        table = self.do_sort(
+            df.table,
+            plc.Table([k.obj for k in sort_keys]),
+            self.order,
+            self.null_order,
+        )
+        columns = [Column(c, old.name) for c, old in zip(table.columns(), df.columns)]
+        # If a sort key is in the result table, set the sortedness property
+        for k, i in enumerate(keys_in_result):
+            columns[i] = columns[i].set_sorted(
+                is_sorted=plc.types.Sorted.YES,
+                order=self.order[k],
+                null_order=self.null_order[k],
+            )
+        return DataFrame(columns, []).slice(self.zlice)
+
+
+@dataclass(slots=True)
+class Slice(IR):
+    """Slice a dataframe."""
+
+    df: IR
+    """Input."""
+    offset: int
+    """Start of the slice."""
+    length: int
+    """Length of the slice."""
+
+    def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
+        """Evaluate and return a dataframe."""
+        df = self.df.evaluate(cache=cache)
+        return df.slice((self.offset, self.length))
+
+
+@dataclass(slots=True)
+class Filter(IR):
+    """Filter a dataframe with a boolean mask."""
+
+    df: IR
+    """Input."""
+    mask: expr.Expr
+    """Expression evaluating to a mask."""
+
+    def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
+        """Evaluate and return a dataframe."""
+        df = self.df.evaluate(cache=cache)
+        return df.filter(self.mask.evaluate(df))
+
+
+@dataclass(slots=True)
+class Projection(IR):
+    """Select a subset of columns from a dataframe."""
+
+    df: IR
+    """Input."""
+
+    def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
+        """Evaluate and return a dataframe."""
+        df = self.df.evaluate(cache=cache)
+        # This can reorder things.
+        return df.select(list(self.schema.keys()))
+
+
+@dataclass(slots=True)
+class MapFunction(IR):
+    """Apply some function to a dataframe."""
+
+    df: IR
+    """Input."""
+    name: str
+    """Function name."""
+    options: Any
+    """Arbitrary options, interpreted per function."""
+
+    _NAMES: ClassVar[frozenset[str]] = frozenset(
+        [
+            "drop_nulls",
+            "rechunk",
+            "merge_sorted",
+            "rename",
+            "explode",
+        ]
+    )
+
+    def __post_init__(self):
+        """Validate preconditions."""
+        if self.name not in MapFunction._NAMES:
+            raise NotImplementedError(f"Unhandled map function {self.name}")
+        if self.name == "explode":
+            (to_explode,) = self.options
+            if len(to_explode) > 1:
+                # TODO: straightforward, but need to error check
+                # polars requires that all to-explode columns have the
+                # same sub-shapes
+                raise NotImplementedError("Explode with more than one column")
+        elif self.name == "merge_sorted":
+            assert isinstance(self.df, Union)
+            (key_column,) = self.options
+            if key_column not in self.df.dfs[0].schema:
+                raise ValueError(f"Key column {key_column} not found")
+
+    def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
+        """Evaluate and return a dataframe."""
+        if self.name == "merge_sorted":
+            # merge_sorted operates on Union inputs
+            # but if we evaluate the Union then we can't unpick the
+            # pieces, so we dive inside and evaluate the pieces by hand
+            assert isinstance(self.df, Union)
+            first, *rest = (c.evaluate(cache=cache) for c in self.df.dfs)
+            (key_column,) = self.options
+            if not all(first.column_names == r.column_names for r in rest):
+                raise ValueError("DataFrame shapes/column names don't match")
+            # Already validated that key_column is in column names
+            index = first.column_names.index(key_column)
+            return DataFrame.from_table(
+                plc.merge.merge_sorted(
+                    [first.table, *(df.table for df in rest)],
+                    [index],
+                    [plc.types.Order.ASCENDING],
+                    [plc.types.NullOrder.BEFORE],
+                ),
+                first.column_names,
+            ).sorted_like(first, subset={key_column})
+        elif self.name == "rechunk":
+            # No-op in our data model
+            return self.df.evaluate(cache=cache)
+        elif self.name == "drop_nulls":
+            df = self.df.evaluate(cache=cache)
+            (subset,) = self.options
+            subset = set(subset)
+            indices = [i for i, name in enumerate(df.column_names) if name in subset]
+            return DataFrame.from_table(
+                plc.stream_compaction.drop_nulls(df.table, indices, len(indices)),
+                df.column_names,
+            ).sorted_like(df)
+        elif self.name == "rename":
+            df = self.df.evaluate(cache=cache)
+            # final tag is "swapping" which is useful for the
+            # optimiser (it blocks some pushdown operations)
+            old, new, _ = self.options
+            return df.rename_columns(dict(zip(old, new)))
+        elif self.name == "explode":
+            df = self.df.evaluate(cache=cache)
+            ((to_explode,),) = self.options
+            index = df.column_names.index(to_explode)
+            subset = df.column_names_set - {to_explode}
+            return DataFrame.from_table(
+                plc.lists.explode_outer(df.table, index), df.column_names
+            ).sorted_like(df, subset=subset)
+        else:
+            raise AssertionError("Should never be reached")
+
+
+@dataclass(slots=True)
+class Union(IR):
+    """Concatenate dataframes vertically."""
+
+    dfs: list[IR]
+    """List of inputs."""
+    zlice: tuple[int, int] | None
+    """Optional slice to apply after concatenation."""
+
+    def __post_init__(self):
+        """Validated preconditions."""
+        schema = self.dfs[0].schema
+        if not all(s.schema == schema for s in self.dfs[1:]):
+            raise ValueError("Schema mismatch")
+
+    def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
+        """Evaluate and return a dataframe."""
+        # TODO: only evaluate what we need if we have a slice
+        dfs = [df.evaluate(cache=cache) for df in self.dfs]
+        return DataFrame.from_table(
+            plc.concatenate.concatenate([df.table for df in dfs]), dfs[0].column_names
+        ).slice(self.zlice)
+
+
+@dataclass(slots=True)
+class HConcat(IR):
+    """Concatenate dataframes horizontally."""
+
+    dfs: list[IR]
+    """List of inputs."""
+
+    def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
+        """Evaluate and return a dataframe."""
+        dfs = [df.evaluate(cache=cache) for df in self.dfs]
+        columns, scalars = zip(*((df.columns, df.scalars) for df in dfs))
+        return DataFrame(
+            list(itertools.chain.from_iterable(columns)),
+            list(itertools.chain.from_iterable(scalars)),
+        )
+
+
+@dataclass(slots=True)
+class ExtContext(IR):
+    """
+    Concatenate dataframes horizontally.
+
+    Prefer HConcat, since this is going to be deprecated on the polars side.
+    """
+
+    df: IR
+    """Input."""
+    extra: list[IR]
+    """List of extra inputs."""
+
+    def __post_init__(self):
+        """Validate preconditions."""
+        raise NotImplementedError(
+            "ExtContext will be deprecated, use horizontal concat instead."
+        )
diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py
new file mode 100644
index 00000000000..b3d0edf183f
--- /dev/null
+++ b/python/cudf_polars/cudf_polars/dsl/translate.py
@@ -0,0 +1,403 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Translate polars IR representation to ours."""
+
+from __future__ import annotations
+
+from contextlib import AbstractContextManager, nullcontext
+from functools import singledispatch
+from typing import Any
+
+from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir
+
+import cudf._lib.pylibcudf as plc  # noqa: TCH002, singledispatch register needs this name defined.
+
+from cudf_polars.dsl import expr, ir
+from cudf_polars.utils import dtypes
+
+__all__ = ["translate_ir", "translate_expr"]
+
+
+class set_node(AbstractContextManager):
+    """Run a block with current node set in the visitor."""
+
+    __slots__ = ("n", "visitor")
+
+    def __init__(self, visitor, n: int):
+        self.visitor = visitor
+        self.n = n
+
+    def __enter__(self):
+        n = self.visitor.get_node()
+        self.visitor.set_node(self.n)
+        self.n = n
+
+    def __exit__(self, *args):
+        self.visitor.set_node(self.n)
+
+
+noop_context: nullcontext = nullcontext()
+
+
+@singledispatch
+def _translate_ir(node: Any, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR:
+    raise NotImplementedError(f"Translation for {type(node).__name__}")
+
+
+@_translate_ir.register
+def _(node: pl_ir.PythonScan, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR:
+    return ir.PythonScan(
+        schema,
+        node.options,
+        translate_expr(visitor, n=node.predicate)
+        if node.predicate is not None
+        else None,
+    )
+
+
+@_translate_ir.register
+def _(node: pl_ir.Scan, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR:
+    return ir.Scan(
+        schema,
+        node.scan_type,
+        node.paths,
+        node.file_options,
+        translate_expr(visitor, n=node.predicate)
+        if node.predicate is not None
+        else None,
+    )
+
+
+@_translate_ir.register
+def _(node: pl_ir.Cache, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR:
+    return ir.Cache(schema, node.id_, translate_ir(visitor, n=node.input))
+
+
+@_translate_ir.register
+def _(
+    node: pl_ir.DataFrameScan, visitor: Any, schema: dict[str, plc.DataType]
+) -> ir.IR:
+    return ir.DataFrameScan(
+        schema,
+        node.df,
+        node.projection,
+        translate_expr(visitor, n=node.selection)
+        if node.selection is not None
+        else None,
+    )
+
+
+@_translate_ir.register
+def _(node: pl_ir.Select, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR:
+    with set_node(visitor, node.input):
+        inp = translate_ir(visitor, n=None)
+    cse_exprs = [translate_expr(visitor, n=e) for e in node.cse_expr]
+    exprs = [translate_expr(visitor, n=e) for e in node.expr]
+    return ir.Select(schema, inp, cse_exprs, exprs)
+
+
+@_translate_ir.register
+def _(node: pl_ir.GroupBy, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR:
+    with set_node(visitor, node.input):
+        inp = translate_ir(visitor, n=None)
+    aggs = [translate_expr(visitor, n=e) for e in node.aggs]
+    keys = [translate_expr(visitor, n=e) for e in node.keys]
+    return ir.GroupBy(
+        schema,
+        inp,
+        aggs,
+        keys,
+        node.maintain_order,
+        node.options,
+    )
+
+
+@_translate_ir.register
+def _(node: pl_ir.Join, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR:
+    # Join key dtypes are dependent on the schema of the left and
+    # right inputs, so these must be translated with the relevant
+    # input active.
+    with set_node(visitor, node.input_left):
+        inp_left = translate_ir(visitor, n=None)
+        left_on = [translate_expr(visitor, n=e) for e in node.left_on]
+    with set_node(visitor, node.input_right):
+        inp_right = translate_ir(visitor, n=None)
+        right_on = [translate_expr(visitor, n=e) for e in node.right_on]
+    return ir.Join(schema, inp_left, inp_right, left_on, right_on, node.options)
+
+
+@_translate_ir.register
+def _(node: pl_ir.HStack, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR:
+    with set_node(visitor, node.input):
+        inp = translate_ir(visitor, n=None)
+    cse_exprs = [translate_expr(visitor, n=e) for e in node.cse_exprs]
+    exprs = [translate_expr(visitor, n=e) for e in node.exprs]
+    return ir.HStack(schema, inp, cse_exprs, exprs)
+
+
+@_translate_ir.register
+def _(node: pl_ir.Reduce, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR:
+    with set_node(visitor, node.input):
+        inp = translate_ir(visitor, n=None)
+    exprs = [translate_expr(visitor, n=e) for e in node.expr]
+    return ir.Reduce(schema, inp, exprs)
+
+
+@_translate_ir.register
+def _(node: pl_ir.Distinct, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR:
+    return ir.Distinct(
+        schema,
+        translate_ir(visitor, n=node.input),
+        node.options,
+    )
+
+
+@_translate_ir.register
+def _(node: pl_ir.Sort, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR:
+    with set_node(visitor, node.input):
+        inp = translate_ir(visitor, n=None)
+    by = [translate_expr(visitor, n=e) for e in node.by_column]
+    return ir.Sort(schema, inp, by, node.sort_options, node.slice)
+
+
+@_translate_ir.register
+def _(node: pl_ir.Slice, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR:
+    return ir.Slice(schema, translate_ir(visitor, n=node.input), node.offset, node.len)
+
+
+@_translate_ir.register
+def _(node: pl_ir.Filter, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR:
+    with set_node(visitor, node.input):
+        inp = translate_ir(visitor, n=None)
+    mask = translate_expr(visitor, n=node.predicate)
+    return ir.Filter(schema, inp, mask)
+
+
+@_translate_ir.register
+def _(
+    node: pl_ir.SimpleProjection, visitor: Any, schema: dict[str, plc.DataType]
+) -> ir.IR:
+    return ir.Projection(schema, translate_ir(visitor, n=node.input))
+
+
+@_translate_ir.register
+def _(node: pl_ir.MapFunction, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR:
+    name, *options = node.function
+    return ir.MapFunction(
+        schema,
+        # TODO: merge_sorted breaks this pattern
+        translate_ir(visitor, n=node.input),
+        name,
+        options,
+    )
+
+
+@_translate_ir.register
+def _(node: pl_ir.Union, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR:
+    return ir.Union(
+        schema, [translate_ir(visitor, n=n) for n in node.inputs], node.options
+    )
+
+
+@_translate_ir.register
+def _(node: pl_ir.HConcat, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR:
+    return ir.HConcat(schema, [translate_ir(visitor, n=n) for n in node.inputs])
+
+
+@_translate_ir.register
+def _(node: pl_ir.ExtContext, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR:
+    return ir.ExtContext(
+        schema,
+        translate_ir(visitor, n=node.input),
+        [translate_ir(visitor, n=n) for n in node.contexts],
+    )
+
+
+def translate_ir(visitor: Any, *, n: int | None = None) -> ir.IR:
+    """
+    Translate a polars-internal IR node to our representation.
+
+    Parameters
+    ----------
+    visitor
+        Polars NodeTraverser object
+    n
+        Optional node to start traversing from, if not provided uses
+        current polars-internal node.
+
+    Returns
+    -------
+    Translated IR object
+
+    Raises
+    ------
+    NotImplementedError if we can't translate the nodes due to
+    unsupported functionality.
+    """
+    ctx: AbstractContextManager = (
+        set_node(visitor, n) if n is not None else noop_context
+    )
+    with ctx:
+        node = visitor.view_current_node()
+        schema = {k: dtypes.from_polars(v) for k, v in visitor.get_schema().items()}
+        return _translate_ir(node, visitor, schema)
+
+
+@singledispatch
+def _translate_expr(node: Any, visitor: Any, dtype: plc.DataType) -> expr.Expr:
+    raise NotImplementedError(f"Translation for {type(node).__name__}")
+
+
+@_translate_expr.register
+def _(node: pl_expr.PyExprIR, visitor: Any, dtype: plc.DataType) -> expr.Expr:
+    e = translate_expr(visitor, n=node.node)
+    return expr.NamedExpr(dtype, node.output_name, e)
+
+
+@_translate_expr.register
+def _(node: pl_expr.Function, visitor: Any, dtype: plc.DataType) -> expr.Expr:
+    name, *options = node.function_data
+    options = tuple(options)
+    if isinstance(name, pl_expr.StringFunction):
+        return expr.StringFunction(
+            dtype,
+            name,
+            options,
+            *(translate_expr(visitor, n=n) for n in node.input),
+        )
+    elif isinstance(name, pl_expr.BooleanFunction):
+        return expr.BooleanFunction(
+            dtype,
+            name,
+            options,
+            *(translate_expr(visitor, n=n) for n in node.input),
+        )
+    else:
+        raise NotImplementedError(f"No handler for Expr function node with {name=}")
+
+
+@_translate_expr.register
+def _(node: pl_expr.Window, visitor: Any, dtype: plc.DataType) -> expr.Expr:
+    # TODO: raise in groupby?
+    if node.partition_by is None:
+        return expr.RollingWindow(
+            dtype, node.options, translate_expr(visitor, n=node.function)
+        )
+    else:
+        return expr.GroupedRollingWindow(
+            dtype,
+            node.options,
+            translate_expr(visitor, n=node.function),
+            *(translate_expr(visitor, n=n) for n in node.partition_by),
+        )
+
+
+@_translate_expr.register
+def _(node: pl_expr.Literal, visitor: Any, dtype: plc.DataType) -> expr.Expr:
+    return expr.Literal(dtype, node.value)
+
+
+@_translate_expr.register
+def _(node: pl_expr.Sort, visitor: Any, dtype: plc.DataType) -> expr.Expr:
+    # TODO: raise in groupby
+    return expr.Sort(dtype, node.options, translate_expr(visitor, n=node.expr))
+
+
+@_translate_expr.register
+def _(node: pl_expr.SortBy, visitor: Any, dtype: plc.DataType) -> expr.Expr:
+    return expr.SortBy(
+        dtype,
+        node.sort_options,
+        translate_expr(visitor, n=node.expr),
+        *(translate_expr(visitor, n=n) for n in node.by),
+    )
+
+
+@_translate_expr.register
+def _(node: pl_expr.Gather, visitor: Any, dtype: plc.DataType) -> expr.Expr:
+    return expr.Gather(
+        dtype,
+        translate_expr(visitor, n=node.expr),
+        translate_expr(visitor, n=node.idx),
+    )
+
+
+@_translate_expr.register
+def _(node: pl_expr.Filter, visitor: Any, dtype: plc.DataType) -> expr.Expr:
+    return expr.Filter(
+        dtype,
+        translate_expr(visitor, n=node.input),
+        translate_expr(visitor, n=node.by),
+    )
+
+
+@_translate_expr.register
+def _(node: pl_expr.Cast, visitor: Any, dtype: plc.DataType) -> expr.Expr:
+    inner = translate_expr(visitor, n=node.expr)
+    # Push casts into literals so we can handle Cast(Literal(Null))
+    if isinstance(inner, expr.Literal):
+        return expr.Literal(dtype, inner.value)
+    else:
+        return expr.Cast(dtype, inner)
+
+
+@_translate_expr.register
+def _(node: pl_expr.Column, visitor: Any, dtype: plc.DataType) -> expr.Expr:
+    return expr.Col(dtype, node.name)
+
+
+@_translate_expr.register
+def _(node: pl_expr.Agg, visitor: Any, dtype: plc.DataType) -> expr.Expr:
+    return expr.Agg(
+        dtype,
+        node.name,
+        node.options,
+        translate_expr(visitor, n=node.arguments),
+    )
+
+
+@_translate_expr.register
+def _(node: pl_expr.BinaryExpr, visitor: Any, dtype: plc.DataType) -> expr.Expr:
+    return expr.BinOp(
+        dtype,
+        expr.BinOp._MAPPING[node.op],
+        translate_expr(visitor, n=node.left),
+        translate_expr(visitor, n=node.right),
+    )
+
+
+@_translate_expr.register
+def _(node: pl_expr.Len, visitor: Any, dtype: plc.DataType) -> expr.Expr:
+    return expr.Len(dtype)
+
+
+def translate_expr(visitor: Any, *, n: int | pl_expr.PyExprIR) -> expr.Expr:
+    """
+    Translate a polars-internal expression IR into our representation.
+
+    Parameters
+    ----------
+    visitor
+        Polars NodeTraverser object
+    n
+        Node to translate, either an integer referencing a polars
+        internal node, or a named expression node.
+
+    Returns
+    -------
+    Translated IR object.
+
+    Raises
+    ------
+    NotImplementedError if any translation fails due to unsupported functionality.
+    """
+    if isinstance(n, pl_expr.PyExprIR):
+        # TODO: type narrowing doesn't rule out int since PyExprIR is Unknown
+        assert not isinstance(n, int)
+        node = n
+        dtype = dtypes.from_polars(visitor.get_dtype(node.node))
+    else:
+        node = visitor.view_expression(n)
+        dtype = dtypes.from_polars(visitor.get_dtype(n))
+    return _translate_expr(node, visitor, dtype)
diff --git a/python/cudf_polars/cudf_polars/testing/__init__.py b/python/cudf_polars/cudf_polars/testing/__init__.py
new file mode 100644
index 00000000000..d0147e713f9
--- /dev/null
+++ b/python/cudf_polars/cudf_polars/testing/__init__.py
@@ -0,0 +1,8 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Testing utilities for cudf_polars."""
+
+from __future__ import annotations
+
+__all__: list[str] = []
diff --git a/python/cudf_polars/cudf_polars/testing/asserts.py b/python/cudf_polars/cudf_polars/testing/asserts.py
new file mode 100644
index 00000000000..a6e26a6425c
--- /dev/null
+++ b/python/cudf_polars/cudf_polars/testing/asserts.py
@@ -0,0 +1,76 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Device-aware assertions."""
+
+from __future__ import annotations
+
+from functools import partial
+from typing import TYPE_CHECKING
+
+from polars.testing.asserts import assert_frame_equal
+
+from cudf_polars.callback import execute_with_cudf
+
+if TYPE_CHECKING:
+    import polars as pl
+
+__all__: list[str] = ["assert_gpu_result_equal"]
+
+
+def assert_gpu_result_equal(
+    lazydf: pl.LazyFrame,
+    *,
+    check_row_order: bool = True,
+    check_column_order: bool = True,
+    check_dtype: bool = True,
+    check_exact: bool = True,
+    rtol: float = 1e-05,
+    atol: float = 1e-08,
+    categorical_as_str: bool = False,
+):
+    """
+    Assert that collection of a lazyframe on GPU produces correct results.
+
+    Parameters
+    ----------
+    lazydf
+        frame to collect.
+    check_row_order
+        Expect rows to be in same order
+    check_column_order
+        Expect columns to be in same order
+    check_dtype
+        Expect dtypes to match
+    check_exact
+        Require exact equality for floats, if `False` compare using
+        rtol and atol.
+    rtol
+        Relative tolerance for float comparisons
+    atol
+        Absolute tolerance for float comparisons
+    categorical_as_str
+        Decat categoricals to strings before comparing
+
+    Raises
+    ------
+    AssertionError
+        If the GPU and CPU collection do not match.
+    NotImplementedError
+        If GPU collection failed in some way.
+    """
+    expect = lazydf.collect()
+    got = lazydf.collect(
+        post_opt_callback=partial(execute_with_cudf, raise_on_fail=True)
+    )
+    assert_frame_equal(
+        expect,
+        got,
+        check_row_order=check_row_order,
+        check_column_order=check_column_order,
+        check_dtype=check_dtype,
+        check_exact=check_exact,
+        rtol=rtol,
+        atol=atol,
+        categorical_as_str=categorical_as_str,
+    )
diff --git a/python/cudf_polars/cudf_polars/utils/__init__.py b/python/cudf_polars/cudf_polars/utils/__init__.py
new file mode 100644
index 00000000000..6018209e1e8
--- /dev/null
+++ b/python/cudf_polars/cudf_polars/utils/__init__.py
@@ -0,0 +1,8 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Utilities."""
+
+from __future__ import annotations
+
+__all__: list[str] = []
diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py
new file mode 100644
index 00000000000..51379433c03
--- /dev/null
+++ b/python/cudf_polars/cudf_polars/utils/dtypes.py
@@ -0,0 +1,89 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Datatype utilities."""
+
+from __future__ import annotations
+
+from functools import cache
+
+from typing_extensions import assert_never
+
+import polars as pl
+
+import cudf._lib.pylibcudf as plc
+
+
+@cache
+def from_polars(dtype: pl.DataType) -> plc.DataType:
+    """
+    Convert a polars datatype to a pylibcudf one.
+
+    Parameters
+    ----------
+    dtype
+        Polars dtype to convert
+
+    Returns
+    -------
+    Matching pylibcudf DataType object.
+
+    Raises
+    ------
+    NotImplementedError for unsupported conversions.
+    """
+    if isinstance(dtype, pl.Boolean):
+        return plc.DataType(plc.TypeId.BOOL8)
+    elif isinstance(dtype, pl.Int8):
+        return plc.DataType(plc.TypeId.INT8)
+    elif isinstance(dtype, pl.Int16):
+        return plc.DataType(plc.TypeId.INT16)
+    elif isinstance(dtype, pl.Int32):
+        return plc.DataType(plc.TypeId.INT32)
+    elif isinstance(dtype, pl.Int64):
+        return plc.DataType(plc.TypeId.INT64)
+    if isinstance(dtype, pl.UInt8):
+        return plc.DataType(plc.TypeId.UINT8)
+    elif isinstance(dtype, pl.UInt16):
+        return plc.DataType(plc.TypeId.UINT16)
+    elif isinstance(dtype, pl.UInt32):
+        return plc.DataType(plc.TypeId.UINT32)
+    elif isinstance(dtype, pl.UInt64):
+        return plc.DataType(plc.TypeId.UINT64)
+    elif isinstance(dtype, pl.Float32):
+        return plc.DataType(plc.TypeId.FLOAT32)
+    elif isinstance(dtype, pl.Float64):
+        return plc.DataType(plc.TypeId.FLOAT64)
+    elif isinstance(dtype, pl.Date):
+        return plc.DataType(plc.TypeId.TIMESTAMP_DAYS)
+    elif isinstance(dtype, pl.Time):
+        raise NotImplementedError("Time of day dtype not implemented")
+    elif isinstance(dtype, pl.Datetime):
+        if dtype.time_zone is not None:
+            raise NotImplementedError("Time zone support")
+        if dtype.time_unit == "ms":
+            return plc.DataType(plc.TypeId.TIMESTAMP_MILLISECONDS)
+        elif dtype.time_unit == "us":
+            return plc.DataType(plc.TypeId.TIMESTAMP_MICROSECONDS)
+        elif dtype.time_unit == "ns":
+            return plc.DataType(plc.TypeId.TIMESTAMP_NANOSECONDS)
+        assert dtype.time_unit is not None
+        assert_never(dtype.time_unit)
+    elif isinstance(dtype, pl.Duration):
+        if dtype.time_unit == "ms":
+            return plc.DataType(plc.TypeId.DURATION_MILLISECONDS)
+        elif dtype.time_unit == "us":
+            return plc.DataType(plc.TypeId.DURATION_MICROSECONDS)
+        elif dtype.time_unit == "ns":
+            return plc.DataType(plc.TypeId.DURATION_NANOSECONDS)
+        assert dtype.time_unit is not None
+        assert_never(dtype.time_unit)
+    elif isinstance(dtype, pl.String):
+        return plc.DataType(plc.TypeId.STRING)
+    elif isinstance(dtype, pl.Null):
+        # TODO: Hopefully
+        return plc.DataType(plc.TypeId.EMPTY)
+    elif isinstance(dtype, pl.List):
+        return plc.DataType(plc.TypeId.LIST)
+    else:
+        raise NotImplementedError(f"{dtype=} conversion not supported")
diff --git a/python/cudf_polars/cudf_polars/utils/sorting.py b/python/cudf_polars/cudf_polars/utils/sorting.py
new file mode 100644
index 00000000000..b3ecfdd3dd4
--- /dev/null
+++ b/python/cudf_polars/cudf_polars/utils/sorting.py
@@ -0,0 +1,49 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Sorting utilities."""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+import cudf._lib.pylibcudf as plc
+
+if TYPE_CHECKING:
+    from collections.abc import Sequence
+
+
+def sort_order(
+    descending: Sequence[bool], *, nulls_last: bool, num_keys: int
+) -> tuple[list[plc.types.Order], list[plc.types.NullOrder]]:
+    """
+    Produce sort order arguments.
+
+    Parameters
+    ----------
+    descending
+        List indicating order for each column
+    nulls_last
+        Should nulls sort last or first?
+    num_keys
+        Number of sort keys
+
+    Returns
+    -------
+    tuple of column_order and null_precendence
+    suitable for passing to sort routines
+    """
+    # Mimicking polars broadcast handling of descending
+    if num_keys > (n := len(descending)) and n == 1:
+        descending = [descending[0]] * num_keys
+    column_order = [
+        plc.types.Order.DESCENDING if d else plc.types.Order.ASCENDING
+        for d in descending
+    ]
+    null_precedence = []
+    for asc in column_order:
+        if (asc == plc.types.Order.ASCENDING) ^ (not nulls_last):
+            null_precedence.append(plc.types.NullOrder.AFTER)
+        elif (asc == plc.types.Order.ASCENDING) ^ nulls_last:
+            null_precedence.append(plc.types.NullOrder.BEFORE)
+    return column_order, null_precedence
diff --git a/python/cudf_polars/docs/overview.md b/python/cudf_polars/docs/overview.md
new file mode 100644
index 00000000000..cbf012f5881
--- /dev/null
+++ b/python/cudf_polars/docs/overview.md
@@ -0,0 +1,174 @@
+# Getting started
+
+You will need:
+
+1. Rust development environment. If you use the rapids [combined
+   devcontainer](https://github.com/rapidsai/devcontainers/), add
+   `"./features/src/rust": {"version": "latest", "profile": "default"},` to your
+   preferred configuration. Or else, use
+   [rustup](https://www.rust-lang.org/tools/install)
+2. A [cudf development
+   environment](https://github.com/rapidsai/cudf/blob/branch-24.08/CONTRIBUTING.md#setting-up-your-build-environment).
+   The combined devcontainer works, or whatever your favourite approach is.
+
+> ![NOTE] These instructions will get simpler as we merge code in.
+
+## Installing polars
+
+We will need to build polars from source. Until things settle down,
+live at `HEAD`.
+
+```sh
+git clone https://github.com/pola-rs/polars
+cd polars
+```
+
+We will install build dependencies in the same environment that we created for
+building cudf. Note that polars offers a `make build` command that sets up a
+separate virtual environment, but we don't want to do that right now. So in the
+polars clone:
+
+```sh
+# cudf environment (conda or pip) is active
+pip install --upgrade uv
+uv pip install --upgrade -r py-polars/requirements-dev.txt
+```
+
+Now we have the necessary machinery to build polars
+```sh
+cd py-polars
+# build in debug mode, best option for development/debugging
+maturin develop -m Cargo.toml
+```
+
+For benchmarking purposes we should build in release mode
+```sh
+RUSTFLAGS='-C target-cpu=native' maturin develop -m Cargo.toml --release
+```
+
+After any update of the polars code, we need to rerun the `maturin` build
+command.
+
+## Installing the cudf polars executor
+
+The executor for the polars logical plan lives in the cudf repo, in
+`python/cudf_polars`. Build cudf as normal and then install the
+`cudf_polars` package in editable mode:
+
+```sh
+cd cudf/python/cudf_polars
+pip install --no-deps -e .
+```
+
+You should now be able to run the tests in the `cudf_polars` package:
+```sh
+pytest -v tests
+```
+
+# Executor design
+
+The polars `LazyFrame.collect` functionality offers a
+"post-optimization" callback that may be used by a third party library
+to replace a node (or more, though we only replace a single node) in the
+optimized logical plan with a Python callback that is to deliver the
+result of evaluating the plan. This splits the execution of the plan
+into two phases. First, a symbolic phase which translates to our
+internal representation (IR). Second, an execution phase which executes
+using our IR.
+
+The translation phase receives the a low-level Rust `NodeTraverse`
+object which delivers Python representations of the plan nodes (and
+expressions) one at a time. During translation, we endeavour to raise
+`NotImplementedError` for any unsupported functionality. This way, if
+we can't execute something, we just don't modify the logical plan at
+all: if we can translate the IR, it is assumed that evaluation will
+later succeed.
+
+The usage of the cudf-based executor is therefore, at present:
+
+```python
+from cudf_polars.callback import execute_with_cudf
+
+result = q.collect(post_opt_callback=execute_with_cudf)
+```
+
+This should either transparently run on the GPU and deliver a polars
+dataframe, or else fail (but be handled) and just run the normal CPU
+execution.
+
+## Adding a handler for a new plan node
+
+Plan node definitions live in `cudf_polars/dsl/ir.py`, these are
+`dataclasses` that inherit from the base `IR` node. The evaluation of
+a plan node is done by implementing the `evaluate` method.
+
+To translate the plan node, add a case handler in `translate_ir` which
+lives in `cudf_polars/dsl/translate.py`.
+
+As well as child nodes that are plans, most plan nodes contain child
+expressions, which should be transformed using the input to the plan as a
+context. The translation of expressions is handled via
+`translate_expr` in `cudf_polars/dsl/translate.py`. So that data-type
+resolution is performed correctly any expression should be translated
+with the correct plan node "active" in the visitor. For example, when
+translating a `Join` node, the left keys (expressions) should be
+translated with the left input active (and right keys with right
+input). To facilitate this, use the `set_node` context manager.
+
+## Adding a handler for a new expression node
+
+Adding a handle for an expression node is very similar to a plan node.
+Expressions are all defined in `cudf_polars/dsl/expr.py` and inherit
+from `Expr`. Unlike plan nodes, these are not `dataclasses`, since it
+is simpler for us to implement efficient hashing, repr, and equality if we
+can write that ourselves.
+
+Every expression consists of two types of data:
+1. child data (other `Expr`s)
+2. non-child data (anything other than an `Expr`)
+The generic implementations of special methods in the base `Expr` base
+class require that the subclasses advertise which arguments to the
+constructor are non-child in a `_non_child` class slot. The
+constructor should then take arguments:
+```python
+def __init__(self, *non_child_data: Any, *children: Expr):
+```
+Read the docstrings in the `Expr` class for more details.
+
+Expressions are evaluated by implementing a `do_evaluate` method that
+takes a `DataFrame` as context (this provides columns) along with an
+`ExecutionContext` parameter (indicating what context we're evaluating
+this expression in, currently unused) and a `mapping` from
+expressions to evaluated `Column`s. This approach enables a simple form of
+expression rewriting during evaluation of expressions that is used in
+evaluation of, for example, groupby-aggregations. To perform the
+evaluation, one should use the base class (generic) `evaluate` method
+which handles the boilerplate for looking up in the substitution
+`mapping`.
+
+To simplify state tracking, all columns should be considered immutable
+on construction. This matches the "functional" description coming from
+the logical plan in any case, so is reasonably natural.
+
+# Containers
+
+Containers should be constructed as relatively lightweight objects
+around their pylibcudf counterparts. We have three (in
+`cudf_polars/containers/`):
+
+1. Scalar (a wrapper around a pylibcudf Scalar)
+2. Column (a wrapper around a pylibcudf Column)
+3. DataFrame (a wrapper around a pylibcudf Table)
+
+The interfaces offered by these are somewhat in flux, but broadly
+speaking, a `DataFrame` is just a list of `Column`s which each hold
+data plus a string `name`, along with a collection of `Scalar`s (this
+might go away).
+
+The columns keep track of metadata (for example, whether or not they
+are sorted).
+
+We offer some utility methods for transferring metadata when
+constructing new dataframes and columns, both `DataFrame` and `Column`
+offer a `with_metadata(*, like: Self)` call which copies metadata from
+the template.
diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml
index 86b0ad414fd..49ecd7080b9 100644
--- a/python/cudf_polars/pyproject.toml
+++ b/python/cudf_polars/pyproject.toml
@@ -20,7 +20,7 @@ license = { text = "Apache 2.0" }
 requires-python = ">=3.9"
 dependencies = [
     "cudf==24.8.*,>=0.0.0a0",
-    "polars>=0.20.24",
+    "polars>=0.20.30",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
     "Intended Audience :: Developers",
@@ -52,6 +52,9 @@ version = {file = "cudf_polars/VERSION"}
 [tool.setuptools.packages.find]
 exclude = ["*tests*"]
 
+[tool.pytest.ini_options]
+xfail_strict = true
+
 [tool.ruff]
 line-length = 88
 indent-width = 4
@@ -130,6 +133,9 @@ ignore = [
 ]
 fixable = ["ALL"]
 
+[tool.ruff.lint.per-file-ignores]
+"**/tests/**/test_*.py" = ["D", "INP"]
+
 [tool.ruff.lint.flake8-pytest-style]
 # https://docs.astral.sh/ruff/settings/#lintflake8-pytest-style
 fixture-parentheses = false
@@ -175,3 +181,5 @@ docstring-code-format = true
 build-backend = "setuptools.build_meta"
 commit-file = "cudf_polars/GIT_COMMIT"
 dependencies-file = "../../dependencies.yaml"
+# Pure python
+disable-cuda = true
diff --git a/python/cudf_polars/tests/expressions/test_agg.py b/python/cudf_polars/tests/expressions/test_agg.py
new file mode 100644
index 00000000000..c792ae64f74
--- /dev/null
+++ b/python/cudf_polars/tests/expressions/test_agg.py
@@ -0,0 +1,63 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import pytest
+
+import polars as pl
+
+from cudf_polars.dsl import expr
+from cudf_polars.testing.asserts import assert_gpu_result_equal
+
+
+@pytest.fixture(params=sorted(expr.Agg._SUPPORTED))
+def agg(request):
+    return request.param
+
+
+@pytest.fixture(params=[pl.Int32, pl.Float32, pl.Int16])
+def dtype(request):
+    return request.param
+
+
+@pytest.fixture(params=[False, True], ids=["no-nulls", "with-nulls"])
+def with_nulls(request):
+    return request.param
+
+
+@pytest.fixture(
+    params=[
+        False,
+        pytest.param(True, marks=pytest.mark.xfail(reason="No handler for set_sorted")),
+    ],
+    ids=["unsorted", "sorted"],
+)
+def is_sorted(request):
+    return request.param
+
+
+@pytest.fixture
+def df(dtype, with_nulls, is_sorted):
+    values = [-10, 4, 5, 2, 3, 6, 8, 9, 4, 4, 5, 2, 3, 7, 3, 6, -10, -11]
+    if with_nulls:
+        values = [None if v % 5 == 0 else v for v in values]
+
+    if is_sorted:
+        values = sorted(values, key=lambda x: -1000 if x is None else x)
+
+    df = pl.LazyFrame({"a": values}, schema={"a": dtype})
+    if is_sorted:
+        return df.set_sorted("a")
+    return df
+
+
+def test_agg(df, agg):
+    expr = getattr(pl.col("a"), agg)()
+    q = df.select(expr)
+
+    # https://github.com/rapidsai/cudf/issues/15852
+    check_dtype = agg not in {"count", "n_unique", "median"}
+    if not check_dtype and q.schema["a"] != pl.Float64:
+        with pytest.raises(AssertionError):
+            assert_gpu_result_equal(q)
+    assert_gpu_result_equal(q, check_dtype=check_dtype, check_exact=False)
diff --git a/python/cudf_polars/tests/expressions/test_filter.py b/python/cudf_polars/tests/expressions/test_filter.py
new file mode 100644
index 00000000000..783403d764c
--- /dev/null
+++ b/python/cudf_polars/tests/expressions/test_filter.py
@@ -0,0 +1,20 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import polars as pl
+
+from cudf_polars.testing.asserts import assert_gpu_result_equal
+
+
+def test_filter():
+    ldf = pl.DataFrame(
+        {
+            "a": [1, 2, 3, 4, 5, 6, 7],
+            "b": [1, 1, 1, 1, 1, 1, 1],
+        }
+    ).lazy()
+
+    # group-by is just to avoid the filter being pushed into the scan.
+    query = ldf.group_by(pl.col("a")).agg(pl.col("b").sum()).filter(pl.col("b") < 1)
+    assert_gpu_result_equal(query)
diff --git a/python/cudf_polars/tests/expressions/test_gather.py b/python/cudf_polars/tests/expressions/test_gather.py
new file mode 100644
index 00000000000..df33e19a0b6
--- /dev/null
+++ b/python/cudf_polars/tests/expressions/test_gather.py
@@ -0,0 +1,19 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import polars as pl
+
+from cudf_polars.testing.asserts import assert_gpu_result_equal
+
+
+def test_gather():
+    ldf = pl.LazyFrame(
+        {
+            "a": [1, 2, 3, 4, 5, 6, 7],
+            "b": [0, 3, 1, 5, 6, 1, 0],
+        }
+    )
+
+    query = ldf.select(pl.col("a").gather(pl.col("b")))
+    assert_gpu_result_equal(query)
diff --git a/python/cudf_polars/tests/expressions/test_numeric_binops.py b/python/cudf_polars/tests/expressions/test_numeric_binops.py
new file mode 100644
index 00000000000..548aebf0875
--- /dev/null
+++ b/python/cudf_polars/tests/expressions/test_numeric_binops.py
@@ -0,0 +1,106 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import pytest
+
+import polars as pl
+
+from cudf_polars.testing.asserts import assert_gpu_result_equal
+
+dtypes = [
+    pl.Int8,
+    pl.Int16,
+    pl.Int64,
+    pl.UInt8,
+    pl.UInt64,
+    pl.Float32,
+    pl.Float64,
+]
+
+
+@pytest.fixture(params=dtypes)
+def ltype(request):
+    return request.param
+
+
+@pytest.fixture(params=dtypes)
+def rtype(request):
+    return request.param
+
+
+@pytest.fixture(params=[False, True], ids=["no_nulls", "nulls"])
+def with_nulls(request):
+    return request.param
+
+
+@pytest.fixture(
+    params=[
+        pl.Expr.eq,
+        pl.Expr.eq_missing,
+        pl.Expr.ne,
+        pl.Expr.ne_missing,
+        pl.Expr.lt,
+        pl.Expr.le,
+        pl.Expr.gt,
+        pl.Expr.ge,
+        pl.Expr.add,
+        pl.Expr.sub,
+        pl.Expr.mul,
+        pl.Expr.truediv,
+        pl.Expr.floordiv,
+        pl.Expr.mod,
+    ],
+    ids=lambda fn: fn.__name__,
+)
+def binop(request):
+    return request.param
+
+
+@pytest.fixture
+def df(request, ltype, rtype, with_nulls, binop):
+    a = [1, 2, 3, 5, 8]
+    if with_nulls:
+        a[2] = None
+        a[-1] = None
+    b = [10, 20, 30, 50, 0]
+    if with_nulls:
+        b[1] = None
+        b[3] = None
+        b[-1] = None
+
+    lkind = (
+        "i"
+        if ltype.is_signed_integer()
+        else ("u" if ltype.is_unsigned_integer() else "f")
+    )
+    rkind = (
+        "i"
+        if rtype.is_signed_integer()
+        else ("u" if rtype.is_unsigned_integer() else "f")
+    )
+    if (
+        not with_nulls
+        and binop.__name__ in {"floordiv", "mod"}
+        # This catches the case where the result is not promoted to float.
+        and (
+            (lkind == rkind and lkind in {"i", "u"})
+            or ({lkind, rkind} == {"i", "u"} and pl.UInt64 not in {ltype, rtype})
+        )
+    ):
+        request.applymarker(
+            pytest.mark.xfail(
+                reason="Polars nullifies division by zero for integral types"
+            )
+        )
+
+    return pl.LazyFrame({"a": a, "b": b}, schema={"a": ltype, "b": rtype})
+
+
+def test_numeric_binop(df, binop):
+    left = pl.col("a")
+    right = pl.col("b")
+
+    q = df.select(binop(left, right))
+
+    assert_gpu_result_equal(q)
diff --git a/python/cudf_polars/tests/test_distinct.py b/python/cudf_polars/tests/test_distinct.py
new file mode 100644
index 00000000000..d42c4a96f5a
--- /dev/null
+++ b/python/cudf_polars/tests/test_distinct.py
@@ -0,0 +1,30 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import pytest
+
+import polars as pl
+
+from cudf_polars.testing.asserts import assert_gpu_result_equal
+
+
+@pytest.mark.parametrize("subset", [None, ["a"], ["a", "b"], ["b", "c"], ["c", "a"]])
+@pytest.mark.parametrize("keep", ["any", "none", "first", "last"])
+@pytest.mark.parametrize("maintain_order", [False, True], ids=["unstable", "stable"])
+@pytest.mark.parametrize("pre_sorted", [False, True], ids=["unsorted", "sorted"])
+def test_distinct(subset, keep, maintain_order, pre_sorted):
+    ldf = pl.DataFrame(
+        {
+            "a": [1, 2, 1, 3, 5, None, None],
+            "b": [1.5, 2.5, None, 1.5, 3, float("nan"), 3],
+            "c": [True, True, True, True, False, False, True],
+        }
+    ).lazy()
+    if pre_sorted:
+        keys = ["a", "b", "c"] if subset is None else subset
+        descending = False if len(keys) == 1 else [False, True, True][: len(keys)]
+        ldf = ldf.sort(*keys, descending=descending)
+
+    query = ldf.unique(subset=subset, keep=keep, maintain_order=maintain_order)
+    assert_gpu_result_equal(query, check_row_order=maintain_order)
diff --git a/python/cudf_polars/tests/test_extcontext.py b/python/cudf_polars/tests/test_extcontext.py
new file mode 100644
index 00000000000..9daf88b4338
--- /dev/null
+++ b/python/cudf_polars/tests/test_extcontext.py
@@ -0,0 +1,23 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import pytest
+
+import polars as pl
+
+from cudf_polars.testing.asserts import assert_gpu_result_equal
+
+
+def test_extcontext():
+    ldf = pl.DataFrame(
+        {
+            "a": [1, 2, 3, 4, 5, 6, 7],
+            "b": [1, 1, 1, 1, 1, 1, 1],
+        }
+    ).lazy()
+    ldf2 = ldf.select((pl.col("b") + pl.col("a")).alias("c"))
+    query = ldf.with_context(ldf2).select(pl.col("b"), pl.col("c"))
+    with pytest.raises(pl.exceptions.ComputeError):
+        # ExtContext to be deprecated so we're not implementing it.
+        assert_gpu_result_equal(query)
diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py
new file mode 100644
index 00000000000..d06a7ecf105
--- /dev/null
+++ b/python/cudf_polars/tests/test_groupby.py
@@ -0,0 +1,78 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import pytest
+
+import polars as pl
+
+from cudf_polars.testing.asserts import assert_gpu_result_equal
+
+
+@pytest.fixture
+def df():
+    return pl.LazyFrame(
+        {
+            "key1": [1, 1, 1, 2, 3, 1, 4, 6, 7],
+            "key2": [2, 2, 2, 2, 6, 1, 4, 6, 8],
+            "int": [1, 2, 3, 4, 5, 6, 7, 8, 9],
+            "float": [7.0, 1, 2, 3, 4, 5, 6, 7, 8],
+        }
+    )
+
+
+@pytest.fixture(
+    params=[
+        ["key1"],
+        ["key2"],
+        [pl.col("key1") * pl.col("key2")],
+        ["key1", "key2"],
+        [pl.col("key1") == pl.col("key2")],
+        ["key2", pl.col("key1") == pl.lit(1, dtype=pl.Int64)],
+    ],
+    ids=lambda keys: "-".join(map(str, keys)),
+)
+def keys(request):
+    return request.param
+
+
+@pytest.fixture(
+    params=[
+        ["int"],
+        ["float", "int"],
+        [pl.col("float") + pl.col("int")],
+        [pl.col("float").max() - pl.col("int").min()],
+        [pl.col("float").mean(), pl.col("int").std()],
+    ],
+    ids=lambda aggs: "-".join(map(str, aggs)),
+)
+def exprs(request):
+    return request.param
+
+
+@pytest.fixture(
+    params=[
+        False,
+        pytest.param(
+            True,
+            marks=pytest.mark.xfail(
+                reason="Maintaining order in groupby not implemented"
+            ),
+        ),
+    ],
+    ids=["no_maintain_order", "maintain_order"],
+)
+def maintain_order(request):
+    return request.param
+
+
+def test_groupby(df: pl.LazyFrame, maintain_order, keys, exprs):
+    q = df.group_by(*keys, maintain_order=maintain_order).agg(*exprs)
+
+    if not maintain_order:
+        sort_keys = list(q.schema.keys())[: len(keys)]
+        q = q.sort(*sort_keys)
+    # from cudf_polars.dsl.translate import translate_ir
+    # ir = translate_ir(q._ldf.visit())
+    # from IPython import embed; embed()
+    assert_gpu_result_equal(q, check_exact=False)
diff --git a/python/cudf_polars/tests/test_hconcat.py b/python/cudf_polars/tests/test_hconcat.py
new file mode 100644
index 00000000000..46cbb21b25a
--- /dev/null
+++ b/python/cudf_polars/tests/test_hconcat.py
@@ -0,0 +1,19 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import polars as pl
+
+from cudf_polars.testing.asserts import assert_gpu_result_equal
+
+
+def test_hconcat():
+    ldf = pl.DataFrame(
+        {
+            "a": [1, 2, 3, 4, 5, 6, 7],
+            "b": [1, 1, 1, 1, 1, 1, 1],
+        }
+    ).lazy()
+    ldf2 = ldf.select((pl.col("a") + pl.col("b")).alias("c"))
+    query = pl.concat([ldf, ldf2], how="horizontal")
+    assert_gpu_result_equal(query)
diff --git a/python/cudf_polars/tests/test_hstack.py b/python/cudf_polars/tests/test_hstack.py
new file mode 100644
index 00000000000..b8c97f4607f
--- /dev/null
+++ b/python/cudf_polars/tests/test_hstack.py
@@ -0,0 +1,32 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import polars as pl
+
+from cudf_polars.testing.asserts import assert_gpu_result_equal
+
+
+def test_hstack():
+    ldf = pl.DataFrame(
+        {
+            "a": [1, 2, 3, 4, 5, 6, 7],
+            "b": [1, 1, 1, 1, 1, 1, 1],
+        }
+    ).lazy()
+
+    query = ldf.with_columns(pl.col("a") + pl.col("b"))
+    assert_gpu_result_equal(query)
+
+
+def test_hstack_with_cse():
+    ldf = pl.DataFrame(
+        {
+            "a": [1, 2, 3, 4, 5, 6, 7],
+            "b": [1, 1, 1, 1, 1, 1, 1],
+        }
+    ).lazy()
+
+    expr = pl.col("a") + pl.col("b")
+    query = ldf.with_columns(expr.alias("c"), expr.alias("d") * 2)
+    assert_gpu_result_equal(query)
diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py
new file mode 100644
index 00000000000..f4a4704f3cc
--- /dev/null
+++ b/python/cudf_polars/tests/test_join.py
@@ -0,0 +1,57 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import pytest
+
+import polars as pl
+
+from cudf_polars.testing.asserts import assert_gpu_result_equal
+
+
+@pytest.mark.parametrize(
+    "how",
+    [
+        "inner",
+        "left",
+        "semi",
+        "anti",
+        pytest.param(
+            "cross",
+            marks=pytest.mark.xfail(reason="cross join not implemented"),
+        ),
+        "full",
+    ],
+)
+@pytest.mark.parametrize("coalesce", [False, True])
+@pytest.mark.parametrize(
+    "join_nulls", [False, True], ids=["nulls_not_equal", "nulls_equal"]
+)
+@pytest.mark.parametrize(
+    "join_expr",
+    [
+        pl.col("a"),
+        pl.col("a") * 2,
+        [pl.col("a"), pl.col("c") + 1],
+        ["c", "a"],
+    ],
+)
+def test_join(how, coalesce, join_nulls, join_expr):
+    left = pl.DataFrame(
+        {
+            "a": [1, 2, 3, 1, None],
+            "b": [1, 2, 3, 4, 5],
+            "c": [2, 3, 4, 5, 6],
+        }
+    ).lazy()
+    right = pl.DataFrame(
+        {
+            "a": [1, 4, 3, 7, None, None],
+            "c": [2, 3, 4, 5, 6, 7],
+        }
+    ).lazy()
+
+    query = left.join(
+        right, on=join_expr, how=how, join_nulls=join_nulls, coalesce=coalesce
+    )
+    assert_gpu_result_equal(query, check_row_order=False)
diff --git a/python/cudf_polars/tests/test_scan.py b/python/cudf_polars/tests/test_scan.py
new file mode 100644
index 00000000000..b75e1bdef10
--- /dev/null
+++ b/python/cudf_polars/tests/test_scan.py
@@ -0,0 +1,98 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import pytest
+
+import polars as pl
+
+from cudf_polars.testing.asserts import assert_gpu_result_equal
+
+
+@pytest.fixture(
+    params=[
+        (None, None),
+        pytest.param(
+            ("row-index", 0),
+            marks=pytest.mark.xfail(reason="Incorrect dtype for row index"),
+        ),
+        pytest.param(
+            ("index", 10),
+            marks=pytest.mark.xfail(reason="Incorrect dtype for row index"),
+        ),
+    ],
+    ids=["no-row-index", "zero-offset-row-index", "offset-row-index"],
+)
+def row_index(request):
+    return request.param
+
+
+@pytest.fixture(
+    params=[
+        (None, 0),
+        pytest.param(
+            (2, 1), marks=pytest.mark.xfail(reason="No handling of row limit in scan")
+        ),
+        pytest.param(
+            (3, 0), marks=pytest.mark.xfail(reason="No handling of row limit in scan")
+        ),
+    ],
+    ids=["all-rows", "n_rows-with-skip", "n_rows-no-skip"],
+)
+def n_rows_skip_rows(request):
+    return request.param
+
+
+@pytest.fixture(params=["csv", "parquet"])
+def df(request, tmp_path, row_index, n_rows_skip_rows):
+    df = pl.DataFrame(
+        {
+            "a": [1, 2, 3, None],
+            "b": ["ẅ", "x", "y", "z"],
+            "c": [None, None, 4, 5],
+        }
+    )
+    name, offset = row_index
+    n_rows, skip_rows = n_rows_skip_rows
+    if request.param == "csv":
+        df.write_csv(tmp_path / "file.csv")
+        return pl.scan_csv(
+            tmp_path / "file.csv",
+            row_index_name=name,
+            row_index_offset=offset,
+            skip_rows_after_header=skip_rows,
+            n_rows=n_rows,
+        )
+    else:
+        df.write_parquet(tmp_path / "file.pq")
+        # parquet doesn't have skip_rows argument
+        return pl.scan_parquet(
+            tmp_path / "file.pq",
+            row_index_name=name,
+            row_index_offset=offset,
+            n_rows=n_rows,
+        )
+
+
+@pytest.fixture(params=[None, ["a"], ["b", "a"]], ids=["all", "subset", "reordered"])
+def columns(request, row_index):
+    name, _ = row_index
+    if name is not None and request.param is not None:
+        return [*request.param, name]
+    return request.param
+
+
+@pytest.fixture(
+    params=[None, pl.col("c").is_not_null()], ids=["no-mask", "c-is-not-null"]
+)
+def mask(request):
+    return request.param
+
+
+def test_scan(df, columns, mask):
+    q = df
+    if mask is not None:
+        q = q.filter(mask)
+    if columns is not None:
+        q = df.select(*columns)
+    assert_gpu_result_equal(q)
diff --git a/python/cudf_polars/tests/test_select.py b/python/cudf_polars/tests/test_select.py
new file mode 100644
index 00000000000..503edef152e
--- /dev/null
+++ b/python/cudf_polars/tests/test_select.py
@@ -0,0 +1,38 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import polars as pl
+
+from cudf_polars.testing.asserts import assert_gpu_result_equal
+
+
+def test_select():
+    ldf = pl.DataFrame(
+        {
+            "a": [1, 2, 3, 4, 5, 6, 7],
+            "b": [1, 1, 1, 1, 1, 1, 1],
+        }
+    ).lazy()
+
+    query = ldf.select(
+        pl.col("a") + pl.col("b"), (pl.col("a") * 2 + pl.col("b")).alias("d")
+    )
+
+    assert_gpu_result_equal(query)
+
+
+def test_select_reduce():
+    ldf = pl.DataFrame(
+        {
+            "a": [1, 2, 3, 4, 5, 6, 7],
+            "b": [1, 1, 1, 1, 1, 1, 1],
+        }
+    ).lazy()
+
+    query = ldf.select(
+        (pl.col("a") + pl.col("b")).max(),
+        (pl.col("a") * 2 + pl.col("b")).alias("d").mean(),
+    )
+
+    assert_gpu_result_equal(query)
diff --git a/python/cudf_polars/tests/test_slice.py b/python/cudf_polars/tests/test_slice.py
new file mode 100644
index 00000000000..d27e91302ba
--- /dev/null
+++ b/python/cudf_polars/tests/test_slice.py
@@ -0,0 +1,34 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import pytest
+
+import polars as pl
+
+from cudf_polars.testing.asserts import assert_gpu_result_equal
+
+
+@pytest.mark.parametrize(
+    "offset",
+    [0, 1, 2],
+)
+@pytest.mark.parametrize(
+    "len",
+    [0, 2, 12],
+)
+def test_slice(offset, len):
+    ldf = pl.DataFrame(
+        {
+            "a": [1, 2, 3, 4, 5, 6, 7],
+            "b": [1, 1, 1, 1, 1, 1, 1],
+        }
+    ).lazy()
+
+    query = (
+        ldf.group_by(pl.col("a"))
+        .agg(pl.col("b").sum())
+        .sort(by=pl.col("a"))
+        .slice(offset, len)
+    )
+    assert_gpu_result_equal(query)
diff --git a/python/cudf_polars/tests/test_sort.py b/python/cudf_polars/tests/test_sort.py
new file mode 100644
index 00000000000..ecc02efd967
--- /dev/null
+++ b/python/cudf_polars/tests/test_sort.py
@@ -0,0 +1,42 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import pytest
+
+import polars as pl
+
+from cudf_polars.testing.asserts import assert_gpu_result_equal
+
+
+@pytest.mark.parametrize(
+    "sort_keys",
+    [
+        (pl.col("a"),),
+        pytest.param(
+            (pl.col("d").abs(),),
+            marks=pytest.mark.xfail(reason="abs not yet implemented"),
+        ),
+        (pl.col("a"), pl.col("d")),
+        (pl.col("b"),),
+    ],
+)
+@pytest.mark.parametrize("nulls_last", [False, True])
+@pytest.mark.parametrize("maintain_order", [False, True], ids=["unstable", "stable"])
+def test_sort(sort_keys, nulls_last, maintain_order):
+    ldf = pl.DataFrame(
+        {
+            "a": [1, 2, 1, 3, 5, None, None],
+            "b": [1.5, 2.5, None, 1.5, 3, float("nan"), 3],
+            "c": [True, True, True, True, False, False, True],
+            "d": [1, 2, -1, 10, 6, -1, -7],
+        }
+    ).lazy()
+
+    query = ldf.sort(
+        *sort_keys,
+        descending=True,
+        nulls_last=nulls_last,
+        maintain_order=maintain_order,
+    )
+    assert_gpu_result_equal(query, check_row_order=maintain_order)
diff --git a/python/cudf_polars/tests/test_union.py b/python/cudf_polars/tests/test_union.py
new file mode 100644
index 00000000000..2c85bb15a55
--- /dev/null
+++ b/python/cudf_polars/tests/test_union.py
@@ -0,0 +1,37 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import pytest
+
+import polars as pl
+
+from cudf_polars.testing.asserts import assert_gpu_result_equal
+
+
+@pytest.mark.xfail(reason="Need handling of null scalars that are cast")
+def test_union():
+    ldf = pl.DataFrame(
+        {
+            "a": [1, 2, 3, 4, 5, 6, 7],
+            "b": [1, 1, 1, 1, 1, 1, 1],
+        }
+    ).lazy()
+    ldf2 = ldf.select((pl.col("a") + pl.col("b")).alias("c"), pl.col("a"))
+    query = pl.concat([ldf, ldf2], how="diagonal")
+    # Plan for this produces a `None`.astype(Int64) which we don't
+    # handle correctly right now
+    assert_gpu_result_equal(query)
+
+
+def test_concat_vertical():
+    ldf = pl.LazyFrame(
+        {
+            "a": [1, 2, 3, 4, 5, 6, 7],
+            "b": [1, 1, 1, 1, 1, 1, 1],
+        }
+    )
+    ldf2 = ldf.select(pl.col("a"), pl.col("b") * 2 + pl.col("a"))
+    q = pl.concat([ldf, ldf2], how="vertical")
+
+    assert_gpu_result_equal(q)

From c268fc106169ae4d2fb4a78125cce724d1ee45b6 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com>
Date: Thu, 30 May 2024 09:58:21 -0500
Subject: [PATCH 278/842] Update `pylibcudf` testing utilities (#15772)

Cleans up some testing utilities for pylibcudf as suggested in https://github.com/rapidsai/cudf/pull/15418#discussion_r1603669456.

Authors:
  - https://github.com/brandon-b-miller

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/15772
---
 .../cudf/cudf/pylibcudf_tests/common/utils.py | 42 +++++++++++++------
 .../test_column_from_device.py                |  2 +-
 .../cudf/cudf/pylibcudf_tests/test_copying.py | 14 +++----
 .../cudf/pylibcudf_tests/test_string_case.py  |  6 +--
 .../cudf/pylibcudf_tests/test_string_find.py  | 18 ++++----
 5 files changed, 49 insertions(+), 33 deletions(-)

diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py
index 596cd2c92ae..0befb3bb3e8 100644
--- a/python/cudf/cudf/pylibcudf_tests/common/utils.py
+++ b/python/cudf/cudf/pylibcudf_tests/common/utils.py
@@ -1,6 +1,6 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from typing import Optional
+from typing import Optional, Union
 
 import pyarrow as pa
 import pytest
@@ -24,27 +24,43 @@ def metadata_from_arrow_array(
     return metadata
 
 
-def assert_column_eq(plc_column: plc.Column, pa_array: pa.Array) -> None:
-    """Verify that the pylibcudf array and PyArrow array are equal."""
+def assert_column_eq(
+    lhs: Union[pa.Array, plc.Column], rhs: Union[pa.Array, plc.Column]
+) -> None:
+    """Verify that a pylibcudf array and PyArrow array are equal."""
     # Nested types require children metadata to be passed to the conversion function.
-    plc_pa = plc.interop.to_arrow(
-        plc_column, metadata=metadata_from_arrow_array(pa_array)
-    )
+    if isinstance(lhs, (pa.Array, pa.ChunkedArray)) and isinstance(
+        rhs, plc.Column
+    ):
+        rhs = plc.interop.to_arrow(
+            rhs, metadata=metadata_from_arrow_array(lhs)
+        )
+    elif isinstance(lhs, plc.Column) and isinstance(
+        rhs, (pa.Array, pa.ChunkedArray)
+    ):
+        lhs = plc.interop.to_arrow(
+            lhs, metadata=metadata_from_arrow_array(rhs)
+        )
+    else:
+        raise ValueError(
+            "One of the inputs must be a Column and the other an Array"
+        )
+
+    if isinstance(lhs, pa.ChunkedArray):
+        lhs = lhs.combine_chunks()
+    if isinstance(rhs, pa.ChunkedArray):
+        rhs = rhs.combine_chunks()
 
-    if isinstance(plc_pa, pa.ChunkedArray):
-        plc_pa = plc_pa.combine_chunks()
-    if isinstance(pa_array, pa.ChunkedArray):
-        pa_array = pa_array.combine_chunks()
-    assert plc_pa.equals(pa_array)
+    assert lhs.equals(rhs)
 
 
 def assert_table_eq(plc_table: plc.Table, pa_table: pa.Table) -> None:
-    """Verify that the pylibcudf array and PyArrow array are equal."""
+    """Verify that a pylibcudf table and PyArrow table are equal."""
     plc_shape = (plc_table.num_rows(), plc_table.num_columns())
     assert plc_shape == pa_table.shape
 
     for plc_col, pa_col in zip(plc_table.columns(), pa_table.columns):
-        assert_column_eq(plc_col, pa_col)
+        assert_column_eq(pa_col, plc_col)
 
 
 def cudf_raises(expected_exception: BaseException, *args, **kwargs):
diff --git a/python/cudf/cudf/pylibcudf_tests/test_column_from_device.py b/python/cudf/cudf/pylibcudf_tests/test_column_from_device.py
index 764720d9de1..c4ff7bb43a5 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_column_from_device.py
+++ b/python/cudf/cudf/pylibcudf_tests/test_column_from_device.py
@@ -48,4 +48,4 @@ def test_from_cuda_array_interface(valid_column):
     )
     expect = valid_column
 
-    assert_column_eq(col, expect)
+    assert_column_eq(expect, col)
diff --git a/python/cudf/cudf/pylibcudf_tests/test_copying.py b/python/cudf/cudf/pylibcudf_tests/test_copying.py
index 0bf30f98636..ef70869a145 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_copying.py
+++ b/python/cudf/cudf/pylibcudf_tests/test_copying.py
@@ -409,7 +409,7 @@ def test_copy_range_in_place(
             ),
             pa_target_column,
         )
-        assert_column_eq(mutable_target_column, expected)
+        assert_column_eq(expected, mutable_target_column)
 
 
 def test_copy_range_in_place_out_of_bounds(
@@ -480,7 +480,7 @@ def test_copy_range(
             ),
             pa_target_column,
         )
-        assert_column_eq(result, expected)
+        assert_column_eq(expected, result)
     else:
         with pytest.raises(TypeError):
             plc.copying.copy_range(
@@ -528,7 +528,7 @@ def test_shift(
         expected = pa.concat_arrays(
             [pa.array([pa_source_scalar] * shift), pa_target_column[:-shift]]
         )
-        assert_column_eq(result, expected)
+        assert_column_eq(expected, result)
     else:
         with pytest.raises(TypeError):
             plc.copying.shift(target_column, shift, source_scalar)
@@ -550,7 +550,7 @@ def test_slice_column(target_column, pa_target_column):
     lower_bounds = bounds[::2]
     result = plc.copying.slice(target_column, bounds)
     for lb, ub, slice_ in zip(lower_bounds, upper_bounds, result):
-        assert_column_eq(slice_, pa_target_column[lb:ub])
+        assert_column_eq(pa_target_column[lb:ub], slice_)
 
 
 def test_slice_column_wrong_length(target_column):
@@ -582,7 +582,7 @@ def test_split_column(target_column, pa_target_column):
     lower_bounds = [0] + upper_bounds[:-1]
     result = plc.copying.split(target_column, upper_bounds)
     for lb, ub, split in zip(lower_bounds, upper_bounds, result):
-        assert_column_eq(split, pa_target_column[lb:ub])
+        assert_column_eq(pa_target_column[lb:ub], split)
 
 
 def test_split_column_decreasing(target_column):
@@ -622,7 +622,7 @@ def test_copy_if_else_column_column(
         pa_target_column,
         pa_other_column,
     )
-    assert_column_eq(result, expected)
+    assert_column_eq(expected, result)
 
 
 def test_copy_if_else_wrong_type(target_column, mask):
@@ -699,7 +699,7 @@ def test_copy_if_else_column_scalar(
         pa_mask,
         *pa_args,
     )
-    assert_column_eq(result, expected)
+    assert_column_eq(expected, result)
 
 
 def test_boolean_mask_scatter_from_table(
diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_case.py b/python/cudf/cudf/pylibcudf_tests/test_string_case.py
index ae01d953df5..1039859b2cf 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_string_case.py
+++ b/python/cudf/cudf/pylibcudf_tests/test_string_case.py
@@ -18,18 +18,18 @@ def test_to_upper(string_col):
     plc_col = plc.interop.from_arrow(string_col)
     got = plc.strings.case.to_upper(plc_col)
     expected = pa.compute.utf8_upper(string_col)
-    assert_column_eq(got, expected)
+    assert_column_eq(expected, got)
 
 
 def test_to_lower(string_col):
     plc_col = plc.interop.from_arrow(string_col)
     got = plc.strings.case.to_lower(plc_col)
     expected = pa.compute.utf8_lower(string_col)
-    assert_column_eq(got, expected)
+    assert_column_eq(expected, got)
 
 
 def test_swapcase(string_col):
     plc_col = plc.interop.from_arrow(string_col)
     got = plc.strings.case.swapcase(plc_col)
     expected = pa.compute.utf8_swapcase(string_col)
-    assert_column_eq(got, expected)
+    assert_column_eq(expected, got)
diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_find.py b/python/cudf/cudf/pylibcudf_tests/test_string_find.py
index f44c4af9bfc..44900044184 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_string_find.py
+++ b/python/cudf/cudf/pylibcudf_tests/test_string_find.py
@@ -134,7 +134,7 @@ def test_find(pa_data_col, plc_data_col, pa_target_scalar, plc_target_scalar):
         type=pa.int32(),
     )
 
-    assert_column_eq(got, expected)
+    assert_column_eq(expected, got)
 
 
 def colwise_apply(pa_data_col, pa_target_col, operator):
@@ -174,7 +174,7 @@ def test_find_column(pa_data_col, pa_target_col, plc_data_col, plc_target_col):
     )
 
     got = plc.strings.find.find(plc_data_col, plc_target_col, 0)
-    assert_column_eq(got, expected)
+    assert_column_eq(expected, got)
 
 
 def test_rfind(pa_data_col, plc_data_col, pa_target_scalar, plc_target_scalar):
@@ -192,7 +192,7 @@ def test_rfind(pa_data_col, plc_data_col, pa_target_scalar, plc_target_scalar):
         type=pa.int32(),
     )
 
-    assert_column_eq(got, expected)
+    assert_column_eq(expected, got)
 
 
 def test_contains(
@@ -211,7 +211,7 @@ def test_contains(
         type=pa.bool_(),
     )
 
-    assert_column_eq(got, expected)
+    assert_column_eq(expected, got)
 
 
 def test_contains_column(
@@ -221,7 +221,7 @@ def test_contains_column(
         pa_data_col, pa_target_col, lambda st, target: target in st
     )
     got = plc.strings.find.contains(plc_data_col, plc_target_col)
-    assert_column_eq(got, expected)
+    assert_column_eq(expected, got)
 
 
 def test_starts_with(
@@ -230,7 +230,7 @@ def test_starts_with(
     py_target = pa_target_scalar.as_py()
     got = plc.strings.find.starts_with(plc_data_col, plc_target_scalar)
     expected = pa.compute.starts_with(pa_data_col, py_target)
-    assert_column_eq(got, expected)
+    assert_column_eq(expected, got)
 
 
 def test_starts_with_column(
@@ -240,7 +240,7 @@ def test_starts_with_column(
         pa_data_col, pa_target_col, lambda st, target: st.startswith(target)
     )
     got = plc.strings.find.starts_with(plc_data_col, plc_target_col)
-    assert_column_eq(got, expected)
+    assert_column_eq(expected, got)
 
 
 def test_ends_with(
@@ -249,7 +249,7 @@ def test_ends_with(
     py_target = pa_target_scalar.as_py()
     got = plc.strings.find.ends_with(plc_data_col, plc_target_scalar)
     expected = pa.compute.ends_with(pa_data_col, py_target)
-    assert_column_eq(got, expected)
+    assert_column_eq(expected, got)
 
 
 def test_ends_with_column(
@@ -259,4 +259,4 @@ def test_ends_with_column(
         pa_data_col, pa_target_col, lambda st, target: st.endswith(target)
     )
     got = plc.strings.find.ends_with(plc_data_col, plc_target_col)
-    assert_column_eq(got, expected)
+    assert_column_eq(expected, got)

From 579a167542ce664bb9d28ae6b5419e524ec5288b Mon Sep 17 00:00:00 2001
From: Sebastian Berg <sebastianb@nvidia.com>
Date: Thu, 30 May 2024 18:37:56 +0200
Subject: [PATCH 279/842] Simple NumPy 2 fixes that are clearly no behavior
 change (#15876)

I have a branch that works, but some changes may need a bit of thought to get right, so splitting out the simpler half.

(N.B. the only bigger chunk that is remaining is to make sure that `uint_series > -1` keeps working at least as well as before)

In either case, these are changes that:
* Avoid `copy=False` in `np.array()`
* Are necessary due to NumPy rejecting e.g. `uint8(-1)` now (only changed this where it is test-only)
* Are necessary due to NumPy preserving the scalar dtype things fail later (the hashing code and using `float(float32)` to avoid overflow.
  * Sorting change is the same, using `int8(-1)` gives effectively the old promotion (to float) rather than erroring to not implicit go to float based on the value.

The main noise, is that I parametrized that one test since it seemed easy enough.

Authors:
  - Sebastian Berg (https://github.com/seberg)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15876
---
 python/cudf/cudf/core/buffer/buffer.py        |  4 +-
 .../cudf/cudf/core/buffer/spillable_buffer.py |  4 +-
 python/cudf/cudf/tests/test_hash_vocab.py     |  8 ++-
 python/cudf/cudf/tests/test_numerical.py      |  2 +-
 python/cudf/cudf/tests/test_replace.py        | 51 +++++--------------
 python/cudf/cudf/tests/test_sorting.py        |  3 +-
 python/cudf/cudf/utils/hash_vocab_utils.py    | 25 ++++-----
 7 files changed, 37 insertions(+), 60 deletions(-)

diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py
index 5c2d77033b8..bf6f9f1a3c1 100644
--- a/python/cudf/cudf/core/buffer/buffer.py
+++ b/python/cudf/cudf/core/buffer/buffer.py
@@ -191,7 +191,7 @@ def from_host_memory(cls, data: Any) -> Self:
         """Create an owner from a buffer or array like object
 
         Data must implement `__array_interface__`, the buffer protocol, and/or
-        be convertible to a buffer object using `numpy.array()`
+        be convertible to a buffer object using `numpy.asanyarray()`
 
         The host memory is copied to a new device allocation.
 
@@ -209,7 +209,7 @@ def from_host_memory(cls, data: Any) -> Self:
         """
 
         # Convert to numpy array, this will not copy data in most cases.
-        ary = numpy.array(data, copy=False, subok=True)
+        ary = numpy.asanyarray(data)
         # Extract pointer and size
         ptr, size = get_ptr_and_size(ary.__array_interface__)
         # Copy to device memory
diff --git a/python/cudf/cudf/core/buffer/spillable_buffer.py b/python/cudf/cudf/core/buffer/spillable_buffer.py
index a1af3ba8c9d..49258fea9ab 100644
--- a/python/cudf/cudf/core/buffer/spillable_buffer.py
+++ b/python/cudf/cudf/core/buffer/spillable_buffer.py
@@ -146,7 +146,7 @@ def from_host_memory(cls, data: Any) -> Self:
         """Create a spillabe buffer from host memory.
 
         Data must implement `__array_interface__`, the buffer protocol, and/or
-        be convertible to a buffer object using `numpy.array()`
+        be convertible to a buffer object using `numpy.asanyarray()`
 
         The new buffer is marked as spilled to host memory already.
 
@@ -165,7 +165,7 @@ def from_host_memory(cls, data: Any) -> Self:
 
         # Convert to a memoryview using numpy array, this will not copy data
         # in most cases.
-        data = memoryview(numpy.array(data, copy=False, subok=True))
+        data = memoryview(numpy.asanyarray(data))
         if not data.c_contiguous:
             raise ValueError("Buffer data must be C-contiguous")
         data = data.cast("B")  # Make sure itemsize==1
diff --git a/python/cudf/cudf/tests/test_hash_vocab.py b/python/cudf/cudf/tests/test_hash_vocab.py
index e081119ff89..c98b92f7083 100644
--- a/python/cudf/cudf/tests/test_hash_vocab.py
+++ b/python/cudf/cudf/tests/test_hash_vocab.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 import filecmp
 import os
 import warnings
@@ -21,9 +21,7 @@ def test_correct_bert_base_vocab_hash(datadir, tmpdir):
 
     groundtruth_path = os.path.join(datadir, "vocab-hash.txt")
     output_path = tmpdir.join("cudf-vocab-hash.txt")
-    with warnings.catch_warnings():
-        # See https://github.com/rapidsai/cudf/issues/12403
-        warnings.simplefilter(action="ignore", category=RuntimeWarning)
-        hash_vocab(vocab_path, output_path)
+    warnings.simplefilter(action="ignore", category=RuntimeWarning)
+    hash_vocab(vocab_path, output_path)
 
     assert filecmp.cmp(output_path, groundtruth_path, shallow=False)
diff --git a/python/cudf/cudf/tests/test_numerical.py b/python/cudf/cudf/tests/test_numerical.py
index 2e3be92dbeb..03081208739 100644
--- a/python/cudf/cudf/tests/test_numerical.py
+++ b/python/cudf/cudf/tests/test_numerical.py
@@ -44,7 +44,7 @@ def test_can_cast_safely_same_kind():
     assert data.can_cast_safely(to_dtype)
 
     data = cudf.Series(
-        [np.finfo("float32").max * 2, 1.0], dtype="float64"
+        [float(np.finfo("float32").max) * 2, 1.0], dtype="float64"
     )._column
     to_dtype = np.dtype("float32")
     assert not data.can_cast_safely(to_dtype)
diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py
index 8992c4d617b..d77ec596271 100644
--- a/python/cudf/cudf/tests/test_replace.py
+++ b/python/cudf/cudf/tests/test_replace.py
@@ -1,5 +1,6 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
+import operator
 import re
 from decimal import Decimal
 
@@ -825,43 +826,23 @@ def test_series_fillna_invalid_dtype(data_dtype):
 
 @pytest.mark.parametrize("data_dtype", NUMERIC_TYPES)
 @pytest.mark.parametrize("fill_value", [100, 100.0, 128.5])
-def test_series_where(data_dtype, fill_value):
+@pytest.mark.parametrize("op", [operator.gt, operator.eq, operator.lt])
+def test_series_where(data_dtype, fill_value, op):
     psr = pd.Series(list(range(10)), dtype=data_dtype)
     sr = cudf.from_pandas(psr)
 
-    if sr.dtype.type(fill_value) != fill_value:
-        with pytest.raises(TypeError):
-            sr.where(sr > 0, fill_value)
-    else:
-        # Cast back to original dtype as pandas automatically upcasts
-        expect = psr.where(psr > 0, fill_value)
-        got = sr.where(sr > 0, fill_value)
-        # pandas returns 'float16' dtype, which is not supported in cudf
-        assert_eq(
-            expect,
-            got,
-            check_dtype=expect.dtype.kind not in ("f"),
-        )
+    try:
+        scalar_fits = sr.dtype.type(fill_value) == fill_value
+    except OverflowError:
+        scalar_fits = False
 
-    if sr.dtype.type(fill_value) != fill_value:
+    if not scalar_fits:
         with pytest.raises(TypeError):
-            sr.where(sr < 0, fill_value)
+            sr.where(op(sr, 0), fill_value)
     else:
-        expect = psr.where(psr < 0, fill_value)
-        got = sr.where(sr < 0, fill_value)
-        # pandas returns 'float16' dtype, which is not supported in cudf
-        assert_eq(
-            expect,
-            got,
-            check_dtype=expect.dtype.kind not in ("f"),
-        )
-
-    if sr.dtype.type(fill_value) != fill_value:
-        with pytest.raises(TypeError):
-            sr.where(sr == 0, fill_value)
-    else:
-        expect = psr.where(psr == 0, fill_value)
-        got = sr.where(sr == 0, fill_value)
+        # Cast back to original dtype as pandas automatically upcasts
+        expect = psr.where(op(psr, 0), fill_value)
+        got = sr.where(op(sr, 0), fill_value)
         # pandas returns 'float16' dtype, which is not supported in cudf
         assert_eq(
             expect,
@@ -985,12 +966,8 @@ def test_numeric_series_replace_dtype(series_dtype, replacement):
     psr = pd.Series([0, 1, 2, 3, 4, 5], dtype=series_dtype)
     sr = cudf.from_pandas(psr)
 
-    if sr.dtype.kind in "ui":
-        can_replace = np.array([replacement])[0].is_integer() and np.can_cast(
-            int(replacement), sr.dtype
-        )
-    else:
-        can_replace = np.can_cast(replacement, sr.dtype)
+    numpy_replacement = np.array(replacement).astype(sr.dtype)[()]
+    can_replace = numpy_replacement == replacement
 
     # Both Scalar
     if not can_replace:
diff --git a/python/cudf/cudf/tests/test_sorting.py b/python/cudf/cudf/tests/test_sorting.py
index 618c4f30bd9..449f21721f4 100644
--- a/python/cudf/cudf/tests/test_sorting.py
+++ b/python/cudf/cudf/tests/test_sorting.py
@@ -107,7 +107,8 @@ def test_series_argsort(nelem, dtype, asc):
     if asc:
         expected = np.argsort(sr.to_numpy(), kind="mergesort")
     else:
-        expected = np.argsort(sr.to_numpy() * -1, kind="mergesort")
+        # -1 multiply works around missing desc sort (may promote to float64)
+        expected = np.argsort(sr.to_numpy() * np.int8(-1), kind="mergesort")
     np.testing.assert_array_equal(expected, res.to_numpy())
 
 
diff --git a/python/cudf/cudf/utils/hash_vocab_utils.py b/python/cudf/cudf/utils/hash_vocab_utils.py
index ef078ed8c5d..babe4be2715 100644
--- a/python/cudf/cudf/utils/hash_vocab_utils.py
+++ b/python/cudf/cudf/utils/hash_vocab_utils.py
@@ -7,8 +7,8 @@
 
 # Coefficients ranges for inner hash - This are important to set to be
 # large so that we have randomness in the bottom bits when modding
-A_SECOND_LEVEL_POW = np.uint8(48)
-B_SECOND_LEVEL_POW = np.uint8(7)
+A_SECOND_LEVEL_POW = np.uint64(48)
+B_SECOND_LEVEL_POW = np.uint64(7)
 
 A_LBOUND_SECOND_LEVEL_HASH = 2**16
 A_HBOUND_SECOND_LEVEL_HASH = 2**A_SECOND_LEVEL_POW
@@ -23,11 +23,11 @@
 
 
 # Shifts for bit packing
-A_SECOND_LEVEL_SHIFT_AMT = np.uint8(64 - A_SECOND_LEVEL_POW)
-B_SECOND_LEVEL_SHIFT_AMT = np.uint8(
+A_SECOND_LEVEL_SHIFT_AMT = np.uint64(64 - A_SECOND_LEVEL_POW)
+B_SECOND_LEVEL_SHIFT_AMT = np.uint64(
     64 - A_SECOND_LEVEL_POW - B_SECOND_LEVEL_POW
 )
-BITS_FOR_INNER_TABLE_SIZE = np.uint8(8)
+BITS_FOR_INNER_TABLE_SIZE = np.uint64(8)
 
 NOT_FOUND = -1
 
@@ -94,7 +94,8 @@ def _find_hash_for_internal(hash_bin):
 
     while True:
         a = np.random.randint(
-            A_LBOUND_SECOND_LEVEL_HASH, A_HBOUND_SECOND_LEVEL_HASH
+            A_LBOUND_SECOND_LEVEL_HASH,
+            A_HBOUND_SECOND_LEVEL_HASH,
         )
         b = np.random.randint(
             B_LBOUND_SECOND_LEVEL_HASH, B_HBOUND_SECOND_LEVEL_HASH
@@ -130,13 +131,13 @@ def _perfect_hash(integers, max_constant):
         bin_length = len(internal_table)
         max_bin_length = max(bin_length, max_bin_length)
         internal_table_coeffs[i] = (
-            coeff_a << A_SECOND_LEVEL_SHIFT_AMT
-            | coeff_b << B_SECOND_LEVEL_SHIFT_AMT
-            | bin_length
-        )
-        offset_into_flattened_table[i + 1] = (
-            offset_into_flattened_table[i] + bin_length
+            np.uint64(coeff_a) << A_SECOND_LEVEL_SHIFT_AMT
+            | np.uint64(coeff_b) << B_SECOND_LEVEL_SHIFT_AMT
+            | np.uint64(bin_length)
         )
+        offset_into_flattened_table[i + 1] = offset_into_flattened_table[
+            i
+        ] + np.uint64(bin_length)
         flattened_bins.extend(internal_table)
 
     print(

From bab0d808bbe6f333b69e7b71a38febdc0e28b773 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Thu, 30 May 2024 10:34:07 -0700
Subject: [PATCH 280/842] Fix categorical conversion from chunked arrow arrays
 (#15886)

The current logic for converting arrow dictionary arrays to cudf doesn't properly uniquify categories across chunks of chunked arrays. This PR implements the simplest fix by having arrow combine chunks when this case is encountered.

Resolves #6828

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15886
---
 python/cudf/cudf/core/frame.py           |  7 +++++++
 python/cudf/cudf/tests/test_dataframe.py | 12 ++++++++++++
 2 files changed, 19 insertions(+)

diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 7b561906afb..d60c206ac24 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -897,6 +897,13 @@ def from_arrow(cls, data: pa.Table) -> Self:
         # so handling indices and dictionary as two different columns.
         # This needs be removed once we have hooked libcudf dictionary32
         # with categorical.
+        if any(
+            isinstance(x.type, pa.DictionaryType)
+            and isinstance(x, pa.ChunkedArray)
+            for x in data
+        ):
+            data = data.combine_chunks()
+
         dict_indices = {}
         dict_dictionaries = {}
         dict_ordered = {}
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 8b18e53d320..d76d5eb8065 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -1984,6 +1984,18 @@ def test_from_arrow(nelem, data_type):
     np.testing.assert_array_equal(s.to_pandas(), gs.to_numpy())
 
 
+def test_from_arrow_chunked_categories():
+    # Verify that categories are properly deduplicated across chunked arrays.
+    indices = pa.array([0, 1, 0, 1, 2, 0, None, 2])
+    dictionary = pa.array(["foo", "bar", "baz"])
+    dict_array = pa.DictionaryArray.from_arrays(indices, dictionary)
+    chunked_array = pa.chunked_array([dict_array, dict_array])
+    table = pa.table({"a": chunked_array})
+    df = cudf.DataFrame.from_arrow(table)
+    final_dictionary = df["a"].dtype.categories.to_arrow().to_pylist()
+    assert sorted(final_dictionary) == sorted(dictionary.to_pylist())
+
+
 @pytest.mark.parametrize("nelem", [0, 2, 3, 100, 1000])
 @pytest.mark.parametrize("data_type", dtypes)
 def test_to_arrow(nelem, data_type):

From 789cbfdd69648fd7ec553922e64accb763ca3c57 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 30 May 2024 15:02:37 -0400
Subject: [PATCH 281/842] Use offsetalator in nvtext::tokenize_with_vocabulary
 (#15878)

Updates the `token_counts_fn` kernel in the `nvtext::tokenize_with_vocabulary` to use the offsetalator instead of hardcoded `size_type` for accessing strings offsets.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Karthikeyan (https://github.com/karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/15878
---
 cpp/src/text/vocabulary_tokenize.cu | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/cpp/src/text/vocabulary_tokenize.cu b/cpp/src/text/vocabulary_tokenize.cu
index 8913ce22da8..f012f7ce09a 100644
--- a/cpp/src/text/vocabulary_tokenize.cu
+++ b/cpp/src/text/vocabulary_tokenize.cu
@@ -240,10 +240,10 @@ CUDF_KERNEL void token_counts_fn(cudf::column_device_view const d_strings,
     return;
   }
 
-  auto const offsets =
-    d_strings.child(cudf::strings_column_view::offsets_column_index).data<cudf::size_type>();
-  auto const offset      = offsets[str_idx + d_strings.offset()] - offsets[d_strings.offset()];
-  auto const chars_begin = d_strings.data<char>() + offsets[d_strings.offset()];
+  auto const offsets     = d_strings.child(cudf::strings_column_view::offsets_column_index);
+  auto const offsets_itr = cudf::detail::input_offsetalator(offsets.head(), offsets.type());
+  auto const offset = offsets_itr[str_idx + d_strings.offset()] - offsets_itr[d_strings.offset()];
+  auto const chars_begin = d_strings.data<char>() + offsets_itr[d_strings.offset()];
 
   auto const begin        = d_str.data();
   auto const end          = begin + d_str.size_bytes();

From 476db9fbb4a9969ea7406b916cead38990097fb9 Mon Sep 17 00:00:00 2001
From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com>
Date: Thu, 30 May 2024 23:42:51 -0500
Subject: [PATCH 282/842] Fix JSON parsing memory corruption - Fix Mixed types
 nested children removal (#15798)

Fixes https://github.com/rapidsai/cudf/issues/15750
The references of deleted child columns are not removed, which caused segfault, and also memory errors (found with valgrind). This fix removes references of child columns and deletes them recursively.

Authors:
  - Karthikeyan (https://github.com/karthikeyann)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/15798
---
 cpp/src/io/json/json_column.cu | 17 ++++++++++++++--
 cpp/tests/io/json_test.cpp     | 36 ++++++++++++++++++++++++++++++++++
 2 files changed, 51 insertions(+), 2 deletions(-)

diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu
index 631f8adbd6d..3e587768b11 100644
--- a/cpp/src/io/json/json_column.cu
+++ b/cpp/src/io/json/json_column.cu
@@ -594,8 +594,7 @@ void make_device_json_column(device_span<SymbolT const> input,
     col.validity =
       cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr);
     col.type = json_col_t::StringColumn;
-    col.child_columns.clear();  // their references should be deleted too.
-    col.column_order.clear();
+    // destroy references of all child columns after this step, by calling remove_child_columns
   };
 
   path_from_tree tree_path{column_categories,
@@ -628,6 +627,19 @@ void make_device_json_column(device_span<SymbolT const> input,
   std::vector<uint8_t> is_pruned(num_columns, 0);
   columns.try_emplace(parent_node_sentinel, std::ref(root));
 
+  std::function<void(NodeIndexT, device_json_column&)> remove_child_columns =
+    [&](NodeIndexT this_col_id, device_json_column& col) {
+      for (auto col_name : col.column_order) {
+        auto child_id                  = mapped_columns[{this_col_id, col_name}];
+        is_mixed_type_column[child_id] = 1;
+        remove_child_columns(child_id, col.child_columns.at(col_name));
+        mapped_columns.erase({this_col_id, col_name});
+        columns.erase(child_id);
+      }
+      col.child_columns.clear();  // their references are deleted above.
+      col.column_order.clear();
+    };
+
   auto name_and_parent_index = [&is_array_of_arrays,
                                 &row_array_parent_col_id,
                                 &column_parent_ids,
@@ -721,6 +733,7 @@ void make_device_json_column(device_span<SymbolT const> input,
           auto& col = columns.at(old_col_id).get();
           if (col.type == json_col_t::ListColumn or col.type == json_col_t::StructColumn) {
             reinitialize_as_string(old_col_id, col);
+            remove_child_columns(old_col_id, col);
             // all its children (which are already inserted) are ignored later.
           }
           col.forced_as_string_column = true;
diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp
index 9d766e80094..5d790e73246 100644
--- a/cpp/tests/io/json_test.cpp
+++ b/cpp/tests/io/json_test.cpp
@@ -2679,4 +2679,40 @@ TEST_F(JsonReaderTest, JsonNestedDtypeFilter)
   }
 }
 
+TEST_F(JsonReaderTest, JSONMixedTypeChildren)
+{
+  std::string const json_str = R"(
+{ "Root": { "Key": [ { "EE": "A" } ] } }
+{ "Root": { "Key": {  } } }
+{ "Root": { "Key": [{ "YY": 1}] } }
+)";
+  // Column "EE" is created and destroyed
+  // Column "YY" should not be created
+
+  cudf::io::json_reader_options options =
+    cudf::io::json_reader_options::builder(cudf::io::source_info{json_str.c_str(), json_str.size()})
+      .lines(true)
+      .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL)
+      .normalize_single_quotes(true)
+      .normalize_whitespace(false)
+      .mixed_types_as_string(true)
+      .keep_quotes(true);
+
+  auto result = cudf::io::read_json(options);
+
+  ASSERT_EQ(result.tbl->num_columns(), 1);
+  ASSERT_EQ(result.metadata.schema_info.size(), 1);
+  EXPECT_EQ(result.metadata.schema_info[0].name, "Root");
+  ASSERT_EQ(result.metadata.schema_info[0].children.size(), 1);
+  EXPECT_EQ(result.metadata.schema_info[0].children[0].name, "Key");
+  ASSERT_EQ(result.metadata.schema_info[0].children[0].children.size(), 2);
+  EXPECT_EQ(result.metadata.schema_info[0].children[0].children[0].name, "offsets");
+  // types
+  EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRUCT);
+  EXPECT_EQ(result.tbl->get_column(0).child(0).type().id(), cudf::type_id::STRING);
+  cudf::test::strings_column_wrapper expected({R"([ { "EE": "A" } ])", "{  }", R"([{ "YY": 1}])"});
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result.tbl->get_column(0).child(0));
+}
+
 CUDF_TEST_PROGRAM_MAIN()

From dec0354b1ac2af981d4e8f13aceb45365838a1d8 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Fri, 31 May 2024 08:38:57 -0400
Subject: [PATCH 283/842] Fix multi-replace target count logic for large
 strings (#15807)

Replaces `thrust::count_if` with raw kernel counter to handle large strings (int64 offsets) and > 2GB strings columns.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/15807
---
 cpp/src/strings/replace/multi.cu | 49 ++++++++++++++++++++++++--------
 1 file changed, 37 insertions(+), 12 deletions(-)

diff --git a/cpp/src/strings/replace/multi.cu b/cpp/src/strings/replace/multi.cu
index 9025234aa52..f4110707c79 100644
--- a/cpp/src/strings/replace/multi.cu
+++ b/cpp/src/strings/replace/multi.cu
@@ -30,23 +30,17 @@
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
-#include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
-#include <thrust/binary_search.h>
 #include <thrust/copy.h>
-#include <thrust/count.h>
 #include <thrust/distance.h>
 #include <thrust/execution_policy.h>
 #include <thrust/for_each.h>
 #include <thrust/iterator/counting_iterator.h>
-#include <thrust/iterator/zip_iterator.h>
-#include <thrust/optional.h>
-#include <thrust/scan.h>
 #include <thrust/transform.h>
 
 namespace cudf {
@@ -262,6 +256,38 @@ struct replace_multi_parallel_fn {
   device_span<string_view const> d_replacements;
 };
 
+constexpr int64_t block_size         = 512;  // number of threads per block
+constexpr size_type bytes_per_thread = 4;    // bytes processed per thread
+
+/**
+ * @brief Count the number of targets in a strings column
+ *
+ * @param fn Functor containing has_target() function
+ * @param chars_bytes Number of bytes in the strings column
+ * @param d_output Result of the count
+ */
+CUDF_KERNEL void count_targets(replace_multi_parallel_fn fn, int64_t chars_bytes, int64_t* d_output)
+{
+  auto const idx      = cudf::detail::grid_1d::global_thread_id();
+  auto const byte_idx = static_cast<int64_t>(idx) * bytes_per_thread;
+  auto const lane_idx = static_cast<cudf::size_type>(threadIdx.x);
+
+  using block_reduce = cub::BlockReduce<int64_t, block_size>;
+  __shared__ typename block_reduce::TempStorage temp_storage;
+
+  int64_t count = 0;
+  // each thread processes multiple bytes
+  for (auto i = byte_idx; (i < (byte_idx + bytes_per_thread)) && (i < chars_bytes); ++i) {
+    count += fn.has_target(i, chars_bytes);
+  }
+  auto const total = block_reduce(temp_storage).Reduce(count, cub::Sum());
+
+  if ((lane_idx == 0) && (total > 0)) {
+    cuda::atomic_ref<int64_t, cuda::thread_scope_block> ref{*d_output};
+    ref.fetch_add(total, cuda::std::memory_order_relaxed);
+  }
+}
+
 /**
  * @brief Used by the copy-if function to produce target_pair objects
  *
@@ -308,12 +334,11 @@ std::unique_ptr<column> replace_character_parallel(strings_column_view const& in
 
   // Count the number of targets in the entire column.
   // Note this may over-count in the case where a target spans adjacent strings.
-  auto target_count = thrust::count_if(
-    rmm::exec_policy_nosync(stream),
-    thrust::make_counting_iterator<int64_t>(0),
-    thrust::make_counting_iterator<int64_t>(chars_bytes),
-    [fn, chars_bytes] __device__(int64_t idx) { return fn.has_target(idx, chars_bytes); });
-
+  rmm::device_scalar<int64_t> d_count(0, stream);
+  auto const num_blocks = util::div_rounding_up_safe(
+    util::div_rounding_up_safe(chars_bytes, static_cast<int64_t>(bytes_per_thread)), block_size);
+  count_targets<<<num_blocks, block_size, 0, stream.value()>>>(fn, chars_bytes, d_count.data());
+  auto target_count = d_count.value(stream);
   // Create a vector of every target position in the chars column.
   // These may also include overlapping targets which will be resolved later.
   auto targets_positions = rmm::device_uvector<int64_t>(target_count, stream);

From e7be142b2bfd4f08c18d0020a959e162f01d819e Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Fri, 31 May 2024 08:14:55 -0700
Subject: [PATCH 284/842] Migrate round to pylibcudf (#15863)

xref #15162

Migrate round.pxd to use pylibcudf APIs.

Authors:
  - Thomas Li (https://github.com/lithomas1)

Approvers:
  - https://github.com/brandon-b-miller
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/15863
---
 .../user_guide/api_docs/pylibcudf/index.rst   |  1 +
 .../user_guide/api_docs/pylibcudf/round.rst   |  6 +++
 .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt   |  1 +
 python/cudf/cudf/_lib/pylibcudf/__init__.pxd  |  2 +
 python/cudf/cudf/_lib/pylibcudf/__init__.py   |  2 +
 .../_lib/pylibcudf/libcudf/CMakeLists.txt     |  2 +-
 .../cudf/_lib/pylibcudf/libcudf/round.pxd     |  6 +--
 .../cudf/_lib/pylibcudf/libcudf/round.pyx     |  0
 python/cudf/cudf/_lib/pylibcudf/round.pxd     | 13 +++++
 python/cudf/cudf/_lib/pylibcudf/round.pyx     | 54 +++++++++++++++++++
 python/cudf/cudf/_lib/round.pyx               | 36 +++++--------
 .../cudf/cudf/pylibcudf_tests/test_round.py   | 38 +++++++++++++
 12 files changed, 134 insertions(+), 27 deletions(-)
 create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/round.rst
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/libcudf/round.pyx
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/round.pxd
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/round.pyx
 create mode 100644 python/cudf/cudf/pylibcudf_tests/test_round.py

diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
index 1c1b37e2c37..26875ce7d12 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
@@ -22,6 +22,7 @@ This page provides API documentation for pylibcudf.
     reduce
     reshape
     rolling
+    round
     scalar
     search
     stream_compaction
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/round.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/round.rst
new file mode 100644
index 00000000000..c97fda12301
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/round.rst
@@ -0,0 +1,6 @@
+=====
+round
+=====
+
+.. automodule:: cudf._lib.pylibcudf.round
+   :members:
diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
index 7d01671e84f..eff14ad549b 100644
--- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
@@ -29,6 +29,7 @@ set(cython_sources
     replace.pyx
     reshape.pyx
     rolling.pyx
+    round.pyx
     scalar.pyx
     search.pyx
     stream_compaction.pyx
diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd
index 91c3fdf5602..4f77f8cbaef 100644
--- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd
@@ -15,6 +15,7 @@ from . cimport (
     replace,
     reshape,
     rolling,
+    round,
     search,
     sorting,
     stream_compaction,
@@ -48,6 +49,7 @@ __all__ = [
     "reduce",
     "replace",
     "rolling",
+    "round",
     "search",
     "stream_compaction",
     "strings",
diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py
index fcdc4992f00..048b62b6013 100644
--- a/python/cudf/cudf/_lib/pylibcudf/__init__.py
+++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py
@@ -15,6 +15,7 @@
     replace,
     reshape,
     rolling,
+    round,
     search,
     sorting,
     stream_compaction,
@@ -48,6 +49,7 @@
     "reduce",
     "replace",
     "rolling",
+    "round",
     "search",
     "stream_compaction",
     "strings",
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt
index 8a6ce6a5187..ac56d42dda8 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt
@@ -12,7 +12,7 @@
 # the License.
 # =============================================================================
 
-set(cython_sources aggregation.pyx binaryop.pyx copying.pyx replace.pyx reduce.pxd
+set(cython_sources aggregation.pyx binaryop.pyx copying.pyx replace.pyx reduce.pxd round.pyx
                    stream_compaction.pyx types.pyx unary.pyx
 )
 
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/round.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/round.pxd
index 06ff42485ea..027c4634c9f 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/round.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/round.pxd
@@ -9,9 +9,9 @@ from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
 
 cdef extern from "cudf/round.hpp" namespace "cudf" nogil:
 
-    ctypedef enum rounding_method "cudf::rounding_method":
-        HALF_UP "cudf::rounding_method::HALF_UP"
-        HALF_EVEN "cudf::rounding_method::HALF_EVEN"
+    cpdef enum class rounding_method(int32_t):
+        HALF_UP
+        HALF_EVEN
 
     cdef unique_ptr[column] round (
         const column_view& input,
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/round.pyx b/python/cudf/cudf/_lib/pylibcudf/libcudf/round.pyx
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/python/cudf/cudf/_lib/pylibcudf/round.pxd b/python/cudf/cudf/_lib/pylibcudf/round.pxd
new file mode 100644
index 00000000000..ccb64fc2847
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/round.pxd
@@ -0,0 +1,13 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from libc.stdint cimport int32_t
+
+from cudf._lib.pylibcudf.libcudf.round cimport rounding_method
+
+from .column cimport Column
+
+
+cpdef Column round(
+    Column source,
+    int32_t decimal_places = *,
+    rounding_method round_method = *
+)
diff --git a/python/cudf/cudf/_lib/pylibcudf/round.pyx b/python/cudf/cudf/_lib/pylibcudf/round.pyx
new file mode 100644
index 00000000000..cfcc2aafbb8
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/round.pyx
@@ -0,0 +1,54 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from libc.stdint cimport int32_t
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+
+from cudf._lib.pylibcudf.libcudf.round cimport (
+    round as cpp_round,
+    rounding_method,
+)
+
+from cudf._lib.pylibcudf.libcudf.round import \
+    rounding_method as RoundingMethod  # no-cython-lint
+
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+
+from .column cimport Column
+
+
+cpdef Column round(
+    Column source,
+    int32_t decimal_places = 0,
+    rounding_method round_method = rounding_method.HALF_UP
+):
+    """Rounds all the values in a column to the specified number of decimal places.
+
+    For details, see :cpp:func:`round`.
+
+    Parameters
+    ----------
+    source : Column
+        The Column for which to round values.
+    decimal_places: int32_t, optional
+        The number of decimal places to round to (default 0)
+    round_method: rounding_method, optional
+        The method by which to round each value.
+        Can be one of { RoundingMethod.HALF_UP, RoundingMethod.HALF_EVEN }
+        (default rounding_method.HALF_UP)
+
+    Returns
+    -------
+    pylibcudf.Column
+        A Column with values rounded
+    """
+    cdef unique_ptr[column] c_result
+    with nogil:
+        c_result = move(
+            cpp_round(
+                source.view(),
+                decimal_places,
+                round_method
+            )
+        )
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/cudf/cudf/_lib/round.pyx b/python/cudf/cudf/_lib/round.pyx
index c1c36dd8854..f8ad57947c8 100644
--- a/python/cudf/cudf/_lib/round.pyx
+++ b/python/cudf/cudf/_lib/round.pyx
@@ -2,16 +2,10 @@
 
 from cudf.core.buffer import acquire_spill_lock
 
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
-
 from cudf._lib.column cimport Column
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.round cimport (
-    round as cpp_round,
-    rounding_method as cpp_rounding_method,
-)
+
+import cudf._lib.pylibcudf as plc
+from cudf._lib.pylibcudf.round import RoundingMethod
 
 
 @acquire_spill_lock()
@@ -31,19 +25,15 @@ def round(Column input_col, int decimal_places=0, how="half_even"):
     if how not in {"half_even", "half_up"}:
         raise ValueError("'how' must be either 'half_even' or 'half_up'")
 
-    cdef column_view input_col_view = input_col.view()
-    cdef unique_ptr[column] c_result
-    cdef cpp_rounding_method c_how = (
-        cpp_rounding_method.HALF_EVEN if how == "half_even"
-        else cpp_rounding_method.HALF_UP
+    how = (
+        RoundingMethod.HALF_EVEN if how == "half_even"
+        else RoundingMethod.HALF_UP
     )
-    with nogil:
-        c_result = move(
-            cpp_round(
-                input_col_view,
-                decimal_places,
-                c_how
-            )
-        )
 
-    return Column.from_unique_ptr(move(c_result))
+    return Column.from_pylibcudf(
+        plc.round.round(
+            input_col.to_pylibcudf(mode="read"),
+            decimal_places,
+            how
+        )
+    )
diff --git a/python/cudf/cudf/pylibcudf_tests/test_round.py b/python/cudf/cudf/pylibcudf_tests/test_round.py
new file mode 100644
index 00000000000..a234860477f
--- /dev/null
+++ b/python/cudf/cudf/pylibcudf_tests/test_round.py
@@ -0,0 +1,38 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pytest
+from utils import assert_column_eq
+
+import cudf._lib.pylibcudf as plc
+
+
+@pytest.fixture(params=[False, True])
+def nullable(request):
+    return request.param
+
+
+@pytest.fixture(params=["float32", "float64"])
+def column(request, nullable):
+    values = [2.5, 2.49, 1.6, 8, -1.5, -1.7, -0.5, 0.5]
+    typ = {"float32": pa.float32(), "float64": pa.float64()}[request.param]
+    if nullable:
+        values[2] = None
+    return plc.interop.from_arrow(pa.array(values, type=typ))
+
+
+@pytest.mark.parametrize(
+    "round_mode", ["half_towards_infinity", "half_to_even"]
+)
+@pytest.mark.parametrize("decimals", [0, 1, 2, 5])
+def test_round(column, round_mode, decimals):
+    method = {
+        "half_towards_infinity": plc.round.RoundingMethod.HALF_UP,
+        "half_to_even": plc.round.RoundingMethod.HALF_EVEN,
+    }[round_mode]
+    got = plc.round.round(column, decimals, method)
+    expect = pa.compute.round(
+        plc.interop.to_arrow(column), decimals, round_mode
+    )
+
+    assert_column_eq(expect, got)

From 7949a9cf6911066663e2245a4bb624e0f1847b06 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Fri, 31 May 2024 14:54:18 -0400
Subject: [PATCH 285/842] Use offsetalator in orc rowgroup_char_counts_kernel
 (#15891)

Replaces hardcoded `size_type` for accessing strings offsets data with the offsetalator to compute the number of characters in a group in `cudf::io::orc::gpu::rowgroup_char_counts_kernel`

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - Yunsong Wang (https://github.com/PointKernel)

URL: https://github.com/rapidsai/cudf/pull/15891
---
 cpp/src/io/orc/dict_enc.cu | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu
index 5971482f80c..5181c4a1c0e 100644
--- a/cpp/src/io/orc/dict_enc.cu
+++ b/cpp/src/io/orc/dict_enc.cu
@@ -16,6 +16,7 @@
 
 #include "orc_gpu.hpp"
 
+#include <cudf/detail/offsets_iterator.cuh>
 #include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/io/orc_types.hpp>
 #include <cudf/table/experimental/row_operators.cuh>
@@ -43,11 +44,12 @@ CUDF_KERNEL void rowgroup_char_counts_kernel(device_2dspan<size_type> char_count
   auto const start_row = rowgroup_bounds[row_group_idx][col_idx].begin + str_col.offset();
   auto const num_rows  = rowgroup_bounds[row_group_idx][col_idx].size();
 
-  auto const& offsets = str_col.child(strings_column_view::offsets_column_index);
+  auto const& offsets    = str_col.child(strings_column_view::offsets_column_index);
+  auto const offsets_itr = cudf::detail::input_offsetalator(offsets.head(), offsets.type());
   char_counts[str_col_idx][row_group_idx] =
     (num_rows == 0)
       ? 0
-      : offsets.element<size_type>(start_row + num_rows) - offsets.element<size_type>(start_row);
+      : static_cast<size_type>(offsets_itr[start_row + num_rows] - offsets_itr[start_row]);
 }
 
 void rowgroup_char_counts(device_2dspan<size_type> counts,

From 1354abdb7a4f9eb58bfc6e359c49d0baabacb4e1 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Fri, 31 May 2024 16:03:09 -0400
Subject: [PATCH 286/842] Fix url-decode benchmark to use offsetalator (#15871)

Fixes the logic for generating URLs in the url-decoder benchmark to use the offsetalator instead of hardcoding `size_type`.
This will allow benchmarking with large strings column in the future.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Yunsong Wang (https://github.com/PointKernel)

URL: https://github.com/rapidsai/cudf/pull/15871
---
 cpp/benchmarks/string/url_decode.cu | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/cpp/benchmarks/string/url_decode.cu b/cpp/benchmarks/string/url_decode.cu
index b3aeb69e5ea..7720e585023 100644
--- a/cpp/benchmarks/string/url_decode.cu
+++ b/cpp/benchmarks/string/url_decode.cu
@@ -20,6 +20,7 @@
 #include <cudf_test/column_wrapper.hpp>
 
 #include <cudf/column/column_view.hpp>
+#include <cudf/detail/offsets_iterator_factory.cuh>
 #include <cudf/filling.hpp>
 #include <cudf/strings/convert/convert_urls.hpp>
 #include <cudf/strings/strings_column_view.hpp>
@@ -43,7 +44,7 @@ struct url_string_generator {
   {
   }
 
-  __device__ void operator()(thrust::tuple<cudf::size_type, cudf::size_type> str_begin_end)
+  __device__ void operator()(thrust::tuple<int64_t, int64_t> str_begin_end)
   {
     auto begin = thrust::get<0>(str_begin_end);
     auto end   = thrust::get<1>(str_begin_end);
@@ -69,11 +70,11 @@ auto generate_column(cudf::size_type num_rows, cudf::size_type chars_per_row, do
   auto result_col = std::move(table_a->release()[0]);  // string column with num_rows  aaa...
   auto chars_data = static_cast<char*>(result_col->mutable_view().head());
   auto offset_col = result_col->child(cudf::strings_column_view::offsets_column_index).view();
+  auto offset_itr = cudf::detail::offsetalator_factory::make_input_iterator(offset_col);
 
   auto engine = thrust::default_random_engine{};
   thrust::for_each_n(thrust::device,
-                     thrust::make_zip_iterator(offset_col.begin<cudf::size_type>(),
-                                               offset_col.begin<cudf::size_type>() + 1),
+                     thrust::make_zip_iterator(offset_itr, offset_itr + 1),
                      num_rows,
                      url_string_generator{chars_data, esc_seq_chance, engine});
   return result_col;

From e66f4f50d045da87125430d13e6b862dc845845c Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Mon, 3 Jun 2024 10:14:58 -0700
Subject: [PATCH 287/842] Add an option to run cuIO benchmarks with pinned
 buffers as input (#15830)

Adds `io_type::PINNED_BUFFER`, which allows cuIO benchmarks to use a pinned buffer as an input. The output is still a `std::vector` in this case, same as with `io_type::HOST_BUFFER`.
Also stops the used of `cudf::io::io_type` in benchmarks, to allow benchmark-specific IO types, such as this one.

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/15830
---
 cpp/benchmarks/io/csv/csv_reader_input.cpp    | 16 +++++--------
 cpp/benchmarks/io/csv/csv_writer.cpp          |  8 +++----
 cpp/benchmarks/io/cuio_common.cpp             | 23 ++++++++++++-------
 cpp/benchmarks/io/cuio_common.hpp             | 14 ++++++++---
 cpp/benchmarks/io/json/json_reader_input.cpp  | 14 +++++------
 cpp/benchmarks/io/json/json_writer.cpp        |  9 ++++----
 cpp/benchmarks/io/nvbench_helpers.hpp         | 11 +++++----
 cpp/benchmarks/io/orc/orc_reader_input.cpp    | 16 ++++++-------
 cpp/benchmarks/io/orc/orc_writer.cpp          |  8 +++----
 .../io/parquet/parquet_reader_multithread.cpp | 18 +++++++++++----
 cpp/benchmarks/io/parquet/parquet_writer.cpp  |  8 +++----
 11 files changed, 77 insertions(+), 68 deletions(-)

diff --git a/cpp/benchmarks/io/csv/csv_reader_input.cpp b/cpp/benchmarks/io/csv/csv_reader_input.cpp
index 2ad3bc36f59..a93bc05ac58 100644
--- a/cpp/benchmarks/io/csv/csv_reader_input.cpp
+++ b/cpp/benchmarks/io/csv/csv_reader_input.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -28,9 +28,7 @@ constexpr size_t data_size         = 256 << 20;
 constexpr cudf::size_type num_cols = 64;
 
 template <typename DataType>
-void csv_read_common(DataType const& data_types,
-                     cudf::io::io_type const& source_type,
-                     nvbench::state& state)
+void csv_read_common(DataType const& data_types, io_type const& source_type, nvbench::state& state)
 {
   auto const tbl =
     create_random_table(cycle_dtypes(data_types, num_cols), table_size_bytes{data_size});
@@ -66,7 +64,7 @@ void csv_read_common(DataType const& data_types,
   state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size");
 }
 
-template <data_type DataType, cudf::io::io_type IOType>
+template <data_type DataType, io_type IOType>
 void BM_csv_read_input(nvbench::state& state,
                        nvbench::type_list<nvbench::enum_type<DataType>, nvbench::enum_type<IOType>>)
 {
@@ -76,7 +74,7 @@ void BM_csv_read_input(nvbench::state& state,
   csv_read_common(d_type, source_type, state);
 }
 
-template <cudf::io::io_type IOType>
+template <io_type IOType>
 void BM_csv_read_io(nvbench::state& state, nvbench::type_list<nvbench::enum_type<IOType>>)
 {
   auto const d_type      = get_type_or_group({static_cast<int32_t>(data_type::INTEGRAL),
@@ -97,12 +95,10 @@ using d_type_list = nvbench::enum_type_list<data_type::INTEGRAL,
                                             data_type::DURATION,
                                             data_type::STRING>;
 
-using io_list =
-  nvbench::enum_type_list<cudf::io::io_type::FILEPATH, cudf::io::io_type::HOST_BUFFER>;
+using io_list = nvbench::enum_type_list<io_type::FILEPATH, io_type::HOST_BUFFER>;
 
 NVBENCH_BENCH_TYPES(BM_csv_read_input,
-                    NVBENCH_TYPE_AXES(d_type_list,
-                                      nvbench::enum_type_list<cudf::io::io_type::DEVICE_BUFFER>))
+                    NVBENCH_TYPE_AXES(d_type_list, nvbench::enum_type_list<io_type::DEVICE_BUFFER>))
   .set_name("csv_read_data_type")
   .set_type_axes_names({"data_type", "io"})
   .set_min_samples(4);
diff --git a/cpp/benchmarks/io/csv/csv_writer.cpp b/cpp/benchmarks/io/csv/csv_writer.cpp
index 8ff07be1531..7ba43850cf2 100644
--- a/cpp/benchmarks/io/csv/csv_writer.cpp
+++ b/cpp/benchmarks/io/csv/csv_writer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -28,7 +28,7 @@
 constexpr size_t data_size         = 256 << 20;
 constexpr cudf::size_type num_cols = 64;
 
-template <data_type DataType, cudf::io::io_type IO>
+template <data_type DataType, io_type IO>
 void BM_csv_write_dtype_io(nvbench::state& state,
                            nvbench::type_list<nvbench::enum_type<DataType>, nvbench::enum_type<IO>>)
 {
@@ -112,9 +112,7 @@ using d_type_list = nvbench::enum_type_list<data_type::INTEGRAL,
                                             data_type::DURATION,
                                             data_type::STRING>;
 
-using io_list = nvbench::enum_type_list<cudf::io::io_type::FILEPATH,
-                                        cudf::io::io_type::HOST_BUFFER,
-                                        cudf::io::io_type::VOID>;
+using io_list = nvbench::enum_type_list<io_type::FILEPATH, io_type::HOST_BUFFER, io_type::VOID>;
 
 NVBENCH_BENCH_TYPES(BM_csv_write_dtype_io, NVBENCH_TYPE_AXES(d_type_list, io_list))
   .set_name("csv_write_dtype_io")
diff --git a/cpp/benchmarks/io/cuio_common.cpp b/cpp/benchmarks/io/cuio_common.cpp
index 3a61e5f1e7b..37ced8ea703 100644
--- a/cpp/benchmarks/io/cuio_common.cpp
+++ b/cpp/benchmarks/io/cuio_common.cpp
@@ -52,6 +52,11 @@ cudf::io::source_info cuio_source_sink_pair::make_source_info()
   switch (type) {
     case io_type::FILEPATH: return cudf::io::source_info(file_name);
     case io_type::HOST_BUFFER: return cudf::io::source_info(h_buffer.data(), h_buffer.size());
+    case io_type::PINNED_BUFFER: {
+      pinned_buffer.resize(h_buffer.size());
+      std::copy(h_buffer.begin(), h_buffer.end(), pinned_buffer.begin());
+      return cudf::io::source_info(pinned_buffer.data(), pinned_buffer.size());
+    }
     case io_type::DEVICE_BUFFER: {
       // TODO: make cuio_source_sink_pair stream-friendly and avoid implicit use of the default
       // stream
@@ -71,7 +76,8 @@ cudf::io::sink_info cuio_source_sink_pair::make_sink_info()
   switch (type) {
     case io_type::VOID: return cudf::io::sink_info(void_sink.get());
     case io_type::FILEPATH: return cudf::io::sink_info(file_name);
-    case io_type::HOST_BUFFER: [[fallthrough]];
+    case io_type::HOST_BUFFER:
+    case io_type::PINNED_BUFFER:
     case io_type::DEVICE_BUFFER: return cudf::io::sink_info(&h_buffer);
     default: CUDF_FAIL("invalid output type");
   }
@@ -84,7 +90,8 @@ size_t cuio_source_sink_pair::size()
     case io_type::FILEPATH:
       return static_cast<size_t>(
         std::ifstream(file_name, std::ifstream::ate | std::ifstream::binary).tellg());
-    case io_type::HOST_BUFFER: [[fallthrough]];
+    case io_type::HOST_BUFFER:
+    case io_type::PINNED_BUFFER:
     case io_type::DEVICE_BUFFER: return h_buffer.size();
     default: CUDF_FAIL("invalid output type");
   }
@@ -204,13 +211,13 @@ void try_drop_l3_cache()
                "Failed to execute the drop cache command");
 }
 
-cudf::io::io_type retrieve_io_type_enum(std::string_view io_string)
+io_type retrieve_io_type_enum(std::string_view io_string)
 {
-  if (io_string == "FILEPATH") { return cudf::io::io_type::FILEPATH; }
-  if (io_string == "HOST_BUFFER") { return cudf::io::io_type::HOST_BUFFER; }
-  if (io_string == "DEVICE_BUFFER") { return cudf::io::io_type::DEVICE_BUFFER; }
-  if (io_string == "VOID") { return cudf::io::io_type::VOID; }
-  if (io_string == "USER_IMPLEMENTED") { return cudf::io::io_type::USER_IMPLEMENTED; }
+  if (io_string == "FILEPATH") { return io_type::FILEPATH; }
+  if (io_string == "HOST_BUFFER") { return io_type::HOST_BUFFER; }
+  if (io_string == "PINNED_BUFFER") { return io_type::PINNED_BUFFER; }
+  if (io_string == "DEVICE_BUFFER") { return io_type::DEVICE_BUFFER; }
+  if (io_string == "VOID") { return io_type::VOID; }
   CUDF_FAIL("Unsupported io_type.");
 }
 
diff --git a/cpp/benchmarks/io/cuio_common.hpp b/cpp/benchmarks/io/cuio_common.hpp
index 6e0b32219ce..d4f39a5f243 100644
--- a/cpp/benchmarks/io/cuio_common.hpp
+++ b/cpp/benchmarks/io/cuio_common.hpp
@@ -18,13 +18,20 @@
 
 #include <cudf_test/file_utilities.hpp>
 
+#include <cudf/detail/utilities/pinned_host_vector.hpp>
 #include <cudf/io/data_sink.hpp>
 #include <cudf/io/datasource.hpp>
-#include <cudf/io/types.hpp>
 
 #include <rmm/device_uvector.hpp>
 
-using cudf::io::io_type;
+// IO types supported in the benchmarks
+enum class io_type {
+  FILEPATH,       // Input/output are both files
+  HOST_BUFFER,    // Input/output are both host buffers (pageable)
+  PINNED_BUFFER,  // Input is a pinned host buffer, output is a host buffer (pageable)
+  DEVICE_BUFFER,  // Input is a device buffer, output is a host buffer (pageable)
+  VOID
+};
 
 std::string random_file_in_dir(std::string const& dir_path);
 
@@ -72,6 +79,7 @@ class cuio_source_sink_pair {
 
   io_type const type;
   std::vector<char> h_buffer;
+  cudf::detail::pinned_host_vector<char> pinned_buffer;
   rmm::device_uvector<std::byte> d_buffer;
   std::string const file_name;
   std::unique_ptr<cudf::io::data_sink> void_sink;
@@ -144,7 +152,7 @@ void try_drop_l3_cache();
  *
  * @return The io_type enum value
  */
-cudf::io::io_type retrieve_io_type_enum(std::string_view io_string);
+io_type retrieve_io_type_enum(std::string_view io_string);
 
 /**
  * @brief Convert a string to the corresponding compression_type enum value.
diff --git a/cpp/benchmarks/io/json/json_reader_input.cpp b/cpp/benchmarks/io/json/json_reader_input.cpp
index aa73dacdbc5..4366790f208 100644
--- a/cpp/benchmarks/io/json/json_reader_input.cpp
+++ b/cpp/benchmarks/io/json/json_reader_input.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -70,7 +70,7 @@ cudf::size_type json_write_bm_data(cudf::io::sink_info sink,
   return view.num_rows();
 }
 
-template <cudf::io::io_type IO>
+template <io_type IO>
 void BM_json_read_io(nvbench::state& state, nvbench::type_list<nvbench::enum_type<IO>>)
 {
   cuio_source_sink_pair source_sink(IO);
@@ -87,7 +87,7 @@ void BM_json_read_io(nvbench::state& state, nvbench::type_list<nvbench::enum_typ
   json_read_common(source_sink, num_rows, state);
 }
 
-template <data_type DataType, cudf::io::io_type IO>
+template <data_type DataType, io_type IO>
 void BM_json_read_data_type(
   nvbench::state& state, nvbench::type_list<nvbench::enum_type<DataType>, nvbench::enum_type<IO>>)
 {
@@ -107,16 +107,14 @@ using d_type_list = nvbench::enum_type_list<data_type::INTEGRAL,
                                             data_type::LIST,
                                             data_type::STRUCT>;
 
-using io_list = nvbench::enum_type_list<cudf::io::io_type::FILEPATH,
-                                        cudf::io::io_type::HOST_BUFFER,
-                                        cudf::io::io_type::DEVICE_BUFFER>;
+using io_list =
+  nvbench::enum_type_list<io_type::FILEPATH, io_type::HOST_BUFFER, io_type::DEVICE_BUFFER>;
 
 using compression_list =
   nvbench::enum_type_list<cudf::io::compression_type::SNAPPY, cudf::io::compression_type::NONE>;
 
 NVBENCH_BENCH_TYPES(BM_json_read_data_type,
-                    NVBENCH_TYPE_AXES(d_type_list,
-                                      nvbench::enum_type_list<cudf::io::io_type::DEVICE_BUFFER>))
+                    NVBENCH_TYPE_AXES(d_type_list, nvbench::enum_type_list<io_type::DEVICE_BUFFER>))
   .set_name("json_read_data_type")
   .set_type_axes_names({"data_type", "io"})
   .set_min_samples(4);
diff --git a/cpp/benchmarks/io/json/json_writer.cpp b/cpp/benchmarks/io/json/json_writer.cpp
index ae6bb81ff93..444457bbf0d 100644
--- a/cpp/benchmarks/io/json/json_writer.cpp
+++ b/cpp/benchmarks/io/json/json_writer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -52,7 +52,7 @@ void json_write_common(cudf::io::json_writer_options const& write_opts,
   state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size");
 }
 
-template <cudf::io::io_type IO>
+template <io_type IO>
 void BM_json_write_io(nvbench::state& state, nvbench::type_list<nvbench::enum_type<IO>>)
 {
   auto const d_type = get_type_or_group({static_cast<int32_t>(data_type::INTEGRAL),
@@ -114,9 +114,8 @@ void BM_json_writer_options(nvbench::state& state)
   json_write_common(write_opts, source_sink, data_size, state);
 }
 
-using io_list = nvbench::enum_type_list<cudf::io::io_type::FILEPATH,
-                                        cudf::io::io_type::HOST_BUFFER,
-                                        cudf::io::io_type::DEVICE_BUFFER>;
+using io_list =
+  nvbench::enum_type_list<io_type::FILEPATH, io_type::HOST_BUFFER, io_type::DEVICE_BUFFER>;
 
 NVBENCH_BENCH_TYPES(BM_json_write_io, NVBENCH_TYPE_AXES(io_list))
   .set_name("json_write_io")
diff --git a/cpp/benchmarks/io/nvbench_helpers.hpp b/cpp/benchmarks/io/nvbench_helpers.hpp
index 8b79912c7ee..1e3ab2b7b4f 100644
--- a/cpp/benchmarks/io/nvbench_helpers.hpp
+++ b/cpp/benchmarks/io/nvbench_helpers.hpp
@@ -56,13 +56,14 @@ NVBENCH_DECLARE_ENUM_TYPE_STRINGS(
   [](auto) { return std::string{}; })
 
 NVBENCH_DECLARE_ENUM_TYPE_STRINGS(
-  cudf::io::io_type,
+  io_type,
   [](auto value) {
     switch (value) {
-      case cudf::io::io_type::FILEPATH: return "FILEPATH";
-      case cudf::io::io_type::HOST_BUFFER: return "HOST_BUFFER";
-      case cudf::io::io_type::DEVICE_BUFFER: return "DEVICE_BUFFER";
-      case cudf::io::io_type::VOID: return "VOID";
+      case io_type::FILEPATH: return "FILEPATH";
+      case io_type::HOST_BUFFER: return "HOST_BUFFER";
+      case io_type::PINNED_BUFFER: return "PINNED_BUFFER";
+      case io_type::DEVICE_BUFFER: return "DEVICE_BUFFER";
+      case io_type::VOID: return "VOID";
       default: return "Unknown";
     }
   },
diff --git a/cpp/benchmarks/io/orc/orc_reader_input.cpp b/cpp/benchmarks/io/orc/orc_reader_input.cpp
index b7c214a8374..cafd3cc5c39 100644
--- a/cpp/benchmarks/io/orc/orc_reader_input.cpp
+++ b/cpp/benchmarks/io/orc/orc_reader_input.cpp
@@ -87,7 +87,7 @@ void orc_read_common(cudf::size_type num_rows_to_read,
 
 }  // namespace
 
-template <data_type DataType, cudf::io::io_type IOType>
+template <data_type DataType, io_type IOType>
 void BM_orc_read_data(nvbench::state& state,
                       nvbench::type_list<nvbench::enum_type<DataType>, nvbench::enum_type<IOType>>)
 {
@@ -112,7 +112,7 @@ void BM_orc_read_data(nvbench::state& state,
   orc_read_common<false>(num_rows_written, source_sink, state);
 }
 
-template <cudf::io::io_type IOType, cudf::io::compression_type Compression, bool chunked_read>
+template <io_type IOType, cudf::io::compression_type Compression, bool chunked_read>
 void orc_read_io_compression(nvbench::state& state)
 {
   auto const d_type = get_type_or_group({static_cast<int32_t>(data_type::INTEGRAL_SIGNED),
@@ -150,7 +150,7 @@ void orc_read_io_compression(nvbench::state& state)
   orc_read_common<chunked_read>(num_rows_written, source_sink, state);
 }
 
-template <cudf::io::io_type IOType, cudf::io::compression_type Compression>
+template <io_type IOType, cudf::io::compression_type Compression>
 void BM_orc_read_io_compression(
   nvbench::state& state,
   nvbench::type_list<nvbench::enum_type<IOType>, nvbench::enum_type<Compression>>)
@@ -163,7 +163,7 @@ void BM_orc_chunked_read_io_compression(nvbench::state& state,
                                         nvbench::type_list<nvbench::enum_type<Compression>>)
 {
   // Only run benchmark using HOST_BUFFER IO.
-  return orc_read_io_compression<cudf::io::io_type::HOST_BUFFER, Compression, true>(state);
+  return orc_read_io_compression<io_type::HOST_BUFFER, Compression, true>(state);
 }
 
 using d_type_list = nvbench::enum_type_list<data_type::INTEGRAL_SIGNED,
@@ -174,16 +174,14 @@ using d_type_list = nvbench::enum_type_list<data_type::INTEGRAL_SIGNED,
                                             data_type::LIST,
                                             data_type::STRUCT>;
 
-using io_list = nvbench::enum_type_list<cudf::io::io_type::FILEPATH,
-                                        cudf::io::io_type::HOST_BUFFER,
-                                        cudf::io::io_type::DEVICE_BUFFER>;
+using io_list =
+  nvbench::enum_type_list<io_type::FILEPATH, io_type::HOST_BUFFER, io_type::DEVICE_BUFFER>;
 
 using compression_list =
   nvbench::enum_type_list<cudf::io::compression_type::SNAPPY, cudf::io::compression_type::NONE>;
 
 NVBENCH_BENCH_TYPES(BM_orc_read_data,
-                    NVBENCH_TYPE_AXES(d_type_list,
-                                      nvbench::enum_type_list<cudf::io::io_type::DEVICE_BUFFER>))
+                    NVBENCH_TYPE_AXES(d_type_list, nvbench::enum_type_list<io_type::DEVICE_BUFFER>))
   .set_name("orc_read_decode")
   .set_type_axes_names({"data_type", "io"})
   .set_min_samples(4)
diff --git a/cpp/benchmarks/io/orc/orc_writer.cpp b/cpp/benchmarks/io/orc/orc_writer.cpp
index bb373297222..b795f3e3164 100644
--- a/cpp/benchmarks/io/orc/orc_writer.cpp
+++ b/cpp/benchmarks/io/orc/orc_writer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -82,7 +82,7 @@ void BM_orc_write_encode(nvbench::state& state, nvbench::type_list<nvbench::enum
   state.add_buffer_size(encoded_file_size, "encoded_file_size", "encoded_file_size");
 }
 
-template <cudf::io::io_type IO, cudf::io::compression_type Compression>
+template <io_type IO, cudf::io::compression_type Compression>
 void BM_orc_write_io_compression(
   nvbench::state& state,
   nvbench::type_list<nvbench::enum_type<IO>, nvbench::enum_type<Compression>>)
@@ -183,9 +183,7 @@ using d_type_list = nvbench::enum_type_list<data_type::INTEGRAL_SIGNED,
                                             data_type::LIST,
                                             data_type::STRUCT>;
 
-using io_list = nvbench::enum_type_list<cudf::io::io_type::FILEPATH,
-                                        cudf::io::io_type::HOST_BUFFER,
-                                        cudf::io::io_type::VOID>;
+using io_list = nvbench::enum_type_list<io_type::FILEPATH, io_type::HOST_BUFFER, io_type::VOID>;
 
 using compression_list =
   nvbench::enum_type_list<cudf::io::compression_type::SNAPPY, cudf::io::compression_type::NONE>;
diff --git a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp
index bd80c4e0e88..a67d1932951 100644
--- a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp
+++ b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp
@@ -62,7 +62,7 @@ std::tuple<std::vector<cuio_source_sink_pair>, size_t, size_t> write_file_data(
   size_t total_file_size = 0;
 
   for (size_t i = 0; i < num_files; ++i) {
-    cuio_source_sink_pair source_sink{cudf::io::io_type::HOST_BUFFER};
+    cuio_source_sink_pair source_sink{io_type::HOST_BUFFER};
 
     auto const tbl = create_random_table(
       cycle_dtypes(d_types, num_cols),
@@ -96,6 +96,11 @@ void BM_parquet_multithreaded_read_common(nvbench::state& state,
   cudf::detail::thread_pool threads(num_threads);
 
   auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types);
+  std::vector<cudf::io::source_info> source_info_vector;
+  std::transform(source_sink_vector.begin(),
+                 source_sink_vector.end(),
+                 std::back_inserter(source_info_vector),
+                 [](auto& source_sink) { return source_sink.make_source_info(); });
 
   auto mem_stats_logger = cudf::memory_stats_logger();
 
@@ -104,9 +109,8 @@ void BM_parquet_multithreaded_read_common(nvbench::state& state,
              [&](nvbench::launch& launch, auto& timer) {
                auto read_func = [&](int index) {
                  auto const stream = streams[index % num_threads];
-                 auto& source_sink = source_sink_vector[index];
                  cudf::io::parquet_reader_options read_opts =
-                   cudf::io::parquet_reader_options::builder(source_sink.make_source_info());
+                   cudf::io::parquet_reader_options::builder(source_info_vector[index]);
                  cudf::io::read_parquet(read_opts, stream, rmm::mr::get_current_device_resource());
                };
 
@@ -174,6 +178,11 @@ void BM_parquet_multithreaded_read_chunked_common(nvbench::state& state,
   auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads);
   cudf::detail::thread_pool threads(num_threads);
   auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types);
+  std::vector<cudf::io::source_info> source_info_vector;
+  std::transform(source_sink_vector.begin(),
+                 source_sink_vector.end(),
+                 std::back_inserter(source_info_vector),
+                 [](auto& source_sink) { return source_sink.make_source_info(); });
 
   auto mem_stats_logger = cudf::memory_stats_logger();
 
@@ -183,9 +192,8 @@ void BM_parquet_multithreaded_read_chunked_common(nvbench::state& state,
              [&](nvbench::launch& launch, auto& timer) {
                auto read_func = [&](int index) {
                  auto const stream = streams[index % num_threads];
-                 auto& source_sink = source_sink_vector[index];
                  cudf::io::parquet_reader_options read_opts =
-                   cudf::io::parquet_reader_options::builder(source_sink.make_source_info());
+                   cudf::io::parquet_reader_options::builder(source_info_vector[index]);
                  // divide chunk limits by number of threads so the number of chunks produced is the
                  // same for all cases. this seems better than the alternative, which is to keep the
                  // limits the same. if we do that, as the number of threads goes up, the number of
diff --git a/cpp/benchmarks/io/parquet/parquet_writer.cpp b/cpp/benchmarks/io/parquet/parquet_writer.cpp
index 13b396ea267..46d2927a92b 100644
--- a/cpp/benchmarks/io/parquet/parquet_writer.cpp
+++ b/cpp/benchmarks/io/parquet/parquet_writer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -82,7 +82,7 @@ void BM_parq_write_encode(nvbench::state& state, nvbench::type_list<nvbench::enu
   state.add_buffer_size(encoded_file_size, "encoded_file_size", "encoded_file_size");
 }
 
-template <cudf::io::io_type IO, cudf::io::compression_type Compression>
+template <io_type IO, cudf::io::compression_type Compression>
 void BM_parq_write_io_compression(
   nvbench::state& state,
   nvbench::type_list<nvbench::enum_type<IO>, nvbench::enum_type<Compression>>)
@@ -188,9 +188,7 @@ using d_type_list = nvbench::enum_type_list<data_type::INTEGRAL,
                                             data_type::LIST,
                                             data_type::STRUCT>;
 
-using io_list = nvbench::enum_type_list<cudf::io::io_type::FILEPATH,
-                                        cudf::io::io_type::HOST_BUFFER,
-                                        cudf::io::io_type::VOID>;
+using io_list = nvbench::enum_type_list<io_type::FILEPATH, io_type::HOST_BUFFER, io_type::VOID>;
 
 using compression_list =
   nvbench::enum_type_list<cudf::io::compression_type::SNAPPY, cudf::io::compression_type::NONE>;

From ba1299dfc03e87f11cf021a67d01531ed6afd7f7 Mon Sep 17 00:00:00 2001
From: Brian Tepera <btepera@gmail.com>
Date: Mon, 3 Jun 2024 13:45:09 -0400
Subject: [PATCH 288/842] Implement day_name and month_name to match pandas
 (#15479)

This PR implements the `month_name` and `day_name` datetime methods, matching the equivalent [month_name](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.dt.month_name.html) and [day_name](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.dt.day_name.html) methods from pandas.

Currently this is implemented just for English locale, though it could be expanded to include additional languages in the future.

Closes #12407

Authors:
  - Brian Tepera (https://github.com/btepera)
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15479
---
 python/cudf/cudf/core/column/datetime.py | 29 ++++++++++
 python/cudf/cudf/core/index.py           | 39 +++++++++++++
 python/cudf/cudf/core/series.py          | 72 ++++++++++++++++++++++++
 python/cudf/cudf/tests/test_datetime.py  | 39 +++++++++++++
 4 files changed, 179 insertions(+)

diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index d92a3a00641..27f31c8f500 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+import calendar
 import datetime
 import functools
 import locale
@@ -339,6 +340,34 @@ def element_indexing(self, index: int):
     def get_dt_field(self, field: str) -> ColumnBase:
         return libcudf.datetime.extract_datetime_component(self, field)
 
+    def _get_field_names(
+        self,
+        field: Literal["month", "weekday"],
+        labels: list[str],
+        locale: str | None = None,
+    ) -> ColumnBase:
+        if locale is not None:
+            raise NotImplementedError(
+                "Setting a locale is currently not supported. "
+                "Results will be returned in your current locale."
+            )
+        col_labels = as_column(labels)
+        indices = self.get_dt_field(field)
+        has_nulls = indices.has_nulls()
+        if has_nulls:
+            indices = indices.fillna(len(col_labels))
+        return col_labels.take(indices, nullify=True, check_bounds=has_nulls)
+
+    def get_day_names(self, locale: str | None = None) -> ColumnBase:
+        return self._get_field_names(
+            "weekday", list(calendar.day_name), locale=locale
+        )
+
+    def get_month_names(self, locale: str | None = None) -> ColumnBase:
+        return self._get_field_names(
+            "month", list(calendar.month_name), locale=locale
+        )
+
     def ceil(self, freq: str) -> ColumnBase:
         return libcudf.datetime.ceil_datetime(self, freq)
 
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 49bfb150f60..2a75b374a1e 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -2120,6 +2120,45 @@ def quarter(self):
         res = extract_quarter(self._values)
         return Index(res, dtype="int8")
 
+    @_cudf_nvtx_annotate
+    def day_name(self, locale: str | None = None) -> Index:
+        """
+        Return the day names. Currently supports English locale only.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> datetime_index = cudf.date_range("2016-12-31", "2017-01-08", freq="D")
+        >>> datetime_index
+        DatetimeIndex(['2016-12-31', '2017-01-01', '2017-01-02', '2017-01-03',
+                       '2017-01-04', '2017-01-05', '2017-01-06', '2017-01-07'],
+                      dtype='datetime64[ns]', freq='D')
+        >>> datetime_index.day_name()
+        Index(['Saturday', 'Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday',
+               'Friday', 'Saturday'], dtype='object')
+        """
+        day_names = self._column.get_day_names(locale)
+        return Index._from_data({self.name: day_names})
+
+    @_cudf_nvtx_annotate
+    def month_name(self, locale: str | None = None) -> Index:
+        """
+        Return the month names. Currently supports English locale only.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> datetime_index = cudf.date_range("2017-12-30", periods=6, freq='W')
+        >>> datetime_index
+        DatetimeIndex(['2017-12-30', '2018-01-06', '2018-01-13', '2018-01-20',
+                    '2018-01-27', '2018-02-03'],
+                      dtype='datetime64[ns]', freq='7D')
+        >>> datetime_index.month_name()
+        Index(['December', 'January', 'January', 'January', 'January', 'February'], dtype='object')
+        """
+        month_names = self._column.get_month_names(locale)
+        return Index._from_data({self.name: month_names})
+
     @_cudf_nvtx_annotate
     def isocalendar(self):
         """
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 908347e389b..a5b204ef346 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -4201,6 +4201,78 @@ def quarter(self):
             name=self.series.name,
         )
 
+    @_cudf_nvtx_annotate
+    def day_name(self, locale=None):
+        """
+        Return the day names. Currently supports English locale only.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> datetime_series = cudf.Series(cudf.date_range('2016-12-31',
+        ...     '2017-01-08', freq='D'))
+        >>> datetime_series
+        0   2016-12-31
+        1   2017-01-01
+        2   2017-01-02
+        3   2017-01-03
+        4   2017-01-04
+        5   2017-01-05
+        6   2017-01-06
+        7   2017-01-07
+        8   2017-01-08
+        dtype: datetime64[ns]
+        >>> datetime_series.dt.day_name()
+        0     Saturday
+        1       Sunday
+        2       Monday
+        3      Tuesday
+        4    Wednesday
+        5     Thursday
+        6       Friday
+        7     Saturday
+        dtype: object
+        """
+        day_names = self.series._column.get_day_names(locale)
+        return Series._from_data(
+            ColumnAccessor({None: day_names}),
+            index=self.series.index,
+            name=self.series.name,
+        )
+
+    @_cudf_nvtx_annotate
+    def month_name(self, locale: str | None = None) -> Series:
+        """
+        Return the month names. Currently supports English locale only.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> datetime_series = cudf.Series(cudf.date_range("2017-12-30", periods=6, freq='W'))
+        >>> datetime_series
+        0   2017-12-30
+        1   2018-01-06
+        2   2018-01-13
+        3   2018-01-20
+        4   2018-01-27
+        5   2018-02-03
+        dtype: datetime64[ns]
+        >>> datetime_series.dt.month_name()
+        0    December
+        1     January
+        2     January
+        3     January
+        4     January
+        5    February
+        dtype: object
+        """
+        month_names = self.series._column.get_month_names(locale)
+        return Series._from_data(
+            ColumnAccessor({None: month_names}),
+            index=self.series.index,
+            name=self.series.name,
+        )
+
     @_cudf_nvtx_annotate
     def isocalendar(self):
         """
diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py
index 46a0dcd315d..4186fff038a 100644
--- a/python/cudf/cudf/tests/test_datetime.py
+++ b/python/cudf/cudf/tests/test_datetime.py
@@ -2419,3 +2419,42 @@ def test_date_range_tz():
     result = pd.date_range("2020-01-01", "2020-01-02", periods=2, tz="UTC")
     expected = cudf.date_range("2020-01-01", "2020-01-02", periods=2, tz="UTC")
     assert_eq(result, expected)
+
+
+@pytest.mark.parametrize("meth", ["day_name", "month_name"])
+@pytest.mark.parametrize("klass", [pd.Series, pd.DatetimeIndex])
+def test_day_month_name(meth, klass):
+    data = [
+        "2020-05-31 08:00:00",
+        None,
+        "1999-12-31 18:40:00",
+        "2000-12-31 04:00:00",
+        None,
+        "1900-02-28 07:00:00",
+        "1800-03-14 07:30:00",
+        "2100-03-14 07:30:00",
+        "1970-01-01 00:00:00",
+        "1969-12-31 12:59:00",
+    ]
+
+    p_obj = klass(data, dtype="datetime64[s]")
+    g_obj = cudf.from_pandas(p_obj)
+
+    if klass is pd.Series:
+        p_obj = p_obj.dt
+        g_obj = g_obj.dt
+
+    expect = getattr(p_obj, meth)()
+    got = getattr(g_obj, meth)()
+
+    assert_eq(expect, got)
+
+
+@pytest.mark.parametrize("meth", ["day_name", "month_name"])
+@pytest.mark.parametrize("klass", [cudf.Series, cudf.DatetimeIndex])
+def test_day_month_name_locale_not_implemented(meth, klass):
+    obj = klass(cudf.date_range("2020-01-01", periods=7))
+    if klass is cudf.Series:
+        obj = obj.dt
+    with pytest.raises(NotImplementedError):
+        getattr(obj, meth)(locale="pt_BR.utf8")

From 7d5561a8c0aeb8531913d7767faca55a5ab31fa5 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Mon, 3 Jun 2024 15:29:39 -0400
Subject: [PATCH 289/842] Fix debug assert in rowgroup_char_counts_kernel
 (#15902)

Fixes assert triggered by `OrcWriterTest.EmptyChildStringColumn` in a Debug build.

```
$ gtests/ORC_TEST --gtest_filter=OrcWriterTest.EmptyChildStringColumn
Note: Google Test filter = OrcWriterTest.EmptyChildStringColumn
[==========] Running 1 test from 1 test suite.
[----------] Global test environment set-up.
[----------] 1 test from OrcWriterTest
[ RUN      ] OrcWriterTest.EmptyChildStringColumn
/cudf/cpp/include/cudf/detail/offsets_iterator.cuh:79: cudf::detail::input_offsetalator::input_offsetalator(const void *, cudf::data_type, int): block: [0,0,0], thread: [0,0,0] Assertion `(dtype.id() == type_id::INT32 || dtype.id() == type_id::INT64) && "Unexpected offsets type"` failed.
CUDA Error detected. cudaErrorAssert device-side assert triggered
ORC_TEST: /conda/envs/rapids/include/rmm/mr/device/detail/stream_ordered_memory_resource.hpp:248: void rmm::mr::detail::stream_ordered_memory_resource<PoolResource, FreeListType>::do_deallocate(void*, std::size_t, rmm::cuda_stream_view) [with PoolResource = rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource>; FreeListType = rmm::mr::detail::coalescing_free_list; std::size_t = long unsigned int]: Assertion `status__ == cudaSuccess' failed.
Aborted (core dumped)
```

Error introduced in #15891 where offsetalator wraps an offsets column in the `cudf::io::orc::gpu::rowgroup_char_counts_kernel`.
But when `num_rows==0` the offsets column is `EMPTY` causing the assert to trigger.
Checking the `num_rows` before accessing the offsets column fixes the issue.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - MithunR (https://github.com/mythrocks)

URL: https://github.com/rapidsai/cudf/pull/15902
---
 cpp/src/io/orc/dict_enc.cu | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu
index 5181c4a1c0e..5be75350951 100644
--- a/cpp/src/io/orc/dict_enc.cu
+++ b/cpp/src/io/orc/dict_enc.cu
@@ -44,12 +44,13 @@ CUDF_KERNEL void rowgroup_char_counts_kernel(device_2dspan<size_type> char_count
   auto const start_row = rowgroup_bounds[row_group_idx][col_idx].begin + str_col.offset();
   auto const num_rows  = rowgroup_bounds[row_group_idx][col_idx].size();
 
-  auto const& offsets    = str_col.child(strings_column_view::offsets_column_index);
-  auto const offsets_itr = cudf::detail::input_offsetalator(offsets.head(), offsets.type());
-  char_counts[str_col_idx][row_group_idx] =
-    (num_rows == 0)
-      ? 0
-      : static_cast<size_type>(offsets_itr[start_row + num_rows] - offsets_itr[start_row]);
+  size_type char_count = 0;
+  if (num_rows > 0) {
+    auto const& offsets    = str_col.child(strings_column_view::offsets_column_index);
+    auto const offsets_itr = cudf::detail::input_offsetalator(offsets.head(), offsets.type());
+    char_count = static_cast<size_type>(offsets_itr[start_row + num_rows] - offsets_itr[start_row]);
+  }
+  char_counts[str_col_idx][row_group_idx] = char_count;
 }
 
 void rowgroup_char_counts(device_2dspan<size_type> counts,

From 4a17c451719a5d1e144b21703650bd323990e892 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Mon, 3 Jun 2024 15:32:12 -0400
Subject: [PATCH 290/842] Rename strings multiple target replace API (#15898)

Renames the multi-target overload of `cudf::strings::replace()` to `cudf::strings::replace_multiple()`.
This helps with some Cython issues involving fused types and overloaded functions with the same number of arguments.
Reference: https://github.com/rapidsai/cudf/issues/15855#issuecomment-2129980298

This change deprecates the old name to be removed in a future release.

Also added some additional error unit tests.

Closes #15855

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Shruti Shivakumar (https://github.com/shrshi)
  - Nghia Truong (https://github.com/ttnghia)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15898
---
 cpp/benchmarks/string/replace.cpp             |  2 +-
 cpp/include/cudf/strings/detail/replace.hpp   | 12 +++----
 cpp/include/cudf/strings/replace.hpp          | 14 +++++++-
 cpp/src/strings/replace/multi.cu              | 23 +++++++++----
 cpp/tests/json/json_tests.cpp                 |  2 +-
 cpp/tests/streams/strings/replace_test.cpp    |  4 +--
 cpp/tests/strings/replace_tests.cpp           | 33 +++++++++++++++----
 java/src/main/native/src/ColumnViewJni.cpp    |  2 +-
 .../pylibcudf/libcudf/strings/replace.pxd     |  2 +-
 python/cudf/cudf/_lib/strings/replace.pyx     |  3 +-
 10 files changed, 71 insertions(+), 26 deletions(-)

diff --git a/cpp/benchmarks/string/replace.cpp b/cpp/benchmarks/string/replace.cpp
index c8f26142193..3d9d51bfd6d 100644
--- a/cpp/benchmarks/string/replace.cpp
+++ b/cpp/benchmarks/string/replace.cpp
@@ -52,7 +52,7 @@ static void BM_replace(benchmark::State& state, replace_type rt)
       case scalar: cudf::strings::replace(input, target, repl); break;
       case slice: cudf::strings::replace_slice(input, repl, 1, 10); break;
       case multi:
-        cudf::strings::replace(
+        cudf::strings::replace_multiple(
           input, cudf::strings_column_view(targets), cudf::strings_column_view(repls));
         break;
     }
diff --git a/cpp/include/cudf/strings/detail/replace.hpp b/cpp/include/cudf/strings/detail/replace.hpp
index aad89beb47e..481d00f1bce 100644
--- a/cpp/include/cudf/strings/detail/replace.hpp
+++ b/cpp/include/cudf/strings/detail/replace.hpp
@@ -39,14 +39,14 @@ std::unique_ptr<column> replace(strings_column_view const& strings,
                                 rmm::device_async_resource_ref mr);
 
 /**
- * @copydoc cudf::strings::replace(strings_column_view const&, strings_column_view const&,
+ * @copydoc cudf::strings::replace_multiple(strings_column_view const&, strings_column_view const&,
  * strings_column_view const&, rmm::cuda_stream_view, rmm::device_async_resource_ref)
  */
-std::unique_ptr<column> replace(strings_column_view const& strings,
-                                strings_column_view const& targets,
-                                strings_column_view const& repls,
-                                rmm::cuda_stream_view stream,
-                                rmm::device_async_resource_ref mr);
+std::unique_ptr<column> replace_mutiple(strings_column_view const& strings,
+                                        strings_column_view const& targets,
+                                        strings_column_view const& repls,
+                                        rmm::cuda_stream_view stream,
+                                        rmm::device_async_resource_ref mr);
 
 /**
  * @brief Replaces any null string entries with the given string.
diff --git a/cpp/include/cudf/strings/replace.hpp b/cpp/include/cudf/strings/replace.hpp
index 9525db44b69..a19aa9be0c0 100644
--- a/cpp/include/cudf/strings/replace.hpp
+++ b/cpp/include/cudf/strings/replace.hpp
@@ -153,7 +153,19 @@ std::unique_ptr<column> replace_slice(
  * @param mr Device memory resource used to allocate the returned column's device memory
  * @return New strings column
  */
-std::unique_ptr<column> replace(
+std::unique_ptr<column> replace_multiple(
+  strings_column_view const& input,
+  strings_column_view const& targets,
+  strings_column_view const& repls,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @copydoc cudf::strings::replace_multiple
+ *
+ * @deprecated since 24.08
+ */
+[[deprecated]] std::unique_ptr<column> replace(
   strings_column_view const& input,
   strings_column_view const& targets,
   strings_column_view const& repls,
diff --git a/cpp/src/strings/replace/multi.cu b/cpp/src/strings/replace/multi.cu
index f4110707c79..8e5c5cf60b8 100644
--- a/cpp/src/strings/replace/multi.cu
+++ b/cpp/src/strings/replace/multi.cu
@@ -499,11 +499,11 @@ std::unique_ptr<column> replace_string_parallel(strings_column_view const& input
 
 }  // namespace
 
-std::unique_ptr<column> replace(strings_column_view const& input,
-                                strings_column_view const& targets,
-                                strings_column_view const& repls,
-                                rmm::cuda_stream_view stream,
-                                rmm::device_async_resource_ref mr)
+std::unique_ptr<column> replace_multiple(strings_column_view const& input,
+                                         strings_column_view const& targets,
+                                         strings_column_view const& repls,
+                                         rmm::cuda_stream_view stream,
+                                         rmm::device_async_resource_ref mr)
 {
   if (input.is_empty()) { return make_empty_column(type_id::STRING); }
   CUDF_EXPECTS(((targets.size() > 0) && (targets.null_count() == 0)),
@@ -524,6 +524,17 @@ std::unique_ptr<column> replace(strings_column_view const& input,
 
 // external API
 
+std::unique_ptr<column> replace_multiple(strings_column_view const& strings,
+                                         strings_column_view const& targets,
+                                         strings_column_view const& repls,
+                                         rmm::cuda_stream_view stream,
+                                         rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::replace_multiple(strings, targets, repls, stream, mr);
+}
+
+// deprecated in 24.08
 std::unique_ptr<column> replace(strings_column_view const& strings,
                                 strings_column_view const& targets,
                                 strings_column_view const& repls,
@@ -531,7 +542,7 @@ std::unique_ptr<column> replace(strings_column_view const& strings,
                                 rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::replace(strings, targets, repls, stream, mr);
+  return detail::replace_multiple(strings, targets, repls, stream, mr);
 }
 
 }  // namespace strings
diff --git a/cpp/tests/json/json_tests.cpp b/cpp/tests/json/json_tests.cpp
index 6c9050becc1..e38ca6628f3 100644
--- a/cpp/tests/json/json_tests.cpp
+++ b/cpp/tests/json/json_tests.cpp
@@ -76,7 +76,7 @@ std::unique_ptr<cudf::column> drop_whitespace(cudf::column_view const& col)
   cudf::strings_column_view strings(col);
   cudf::strings_column_view targets(whitespace);
   cudf::strings_column_view replacements(repl);
-  return cudf::strings::replace(strings, targets, replacements);
+  return cudf::strings::replace_multiple(strings, targets, replacements);
 }
 
 struct JsonPathTests : public cudf::test::BaseFixture {};
diff --git a/cpp/tests/streams/strings/replace_test.cpp b/cpp/tests/streams/strings/replace_test.cpp
index fc87460b706..95c1209b5db 100644
--- a/cpp/tests/streams/strings/replace_test.cpp
+++ b/cpp/tests/streams/strings/replace_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -34,7 +34,7 @@ TEST_F(StringsReplaceTest, Replace)
   auto const target = cudf::string_scalar("é", true, cudf::test::get_default_stream());
   auto const repl   = cudf::string_scalar(" ", true, cudf::test::get_default_stream());
   cudf::strings::replace(view, target, repl, -1, cudf::test::get_default_stream());
-  cudf::strings::replace(view, view, view, cudf::test::get_default_stream());
+  cudf::strings::replace_multiple(view, view, view, cudf::test::get_default_stream());
   cudf::strings::replace_slice(view, repl, 1, 2, cudf::test::get_default_stream());
 
   auto const pattern = std::string("[a-z]");
diff --git a/cpp/tests/strings/replace_tests.cpp b/cpp/tests/strings/replace_tests.cpp
index 726d9f95c7d..ef4f3bc2b2a 100644
--- a/cpp/tests/strings/replace_tests.cpp
+++ b/cpp/tests/strings/replace_tests.cpp
@@ -277,6 +277,23 @@ TEST_F(StringsReplaceTest, ReplaceErrors)
   EXPECT_THROW(cudf::strings::replace(sv, target, null_input), cudf::logic_error);
   EXPECT_THROW(cudf::strings::replace(sv, null_input, replacement), cudf::logic_error);
   EXPECT_THROW(cudf::strings::replace(sv, empty_input, replacement), cudf::logic_error);
+
+  auto const empty       = cudf::test::strings_column_wrapper();
+  auto const ev          = cudf::strings_column_view(empty);
+  auto const targets     = cudf::test::strings_column_wrapper({"x"});
+  auto const tv          = cudf::strings_column_view(targets);
+  auto const target_null = cudf::test::strings_column_wrapper({""}, {0});
+  auto const tv_null     = cudf::strings_column_view(target_null);
+  auto const repls       = cudf::test::strings_column_wrapper({"y", "z"});
+  auto const rv          = cudf::strings_column_view(repls);
+  auto const repl_null   = cudf::test::strings_column_wrapper({""}, {0});
+  auto const rv_null     = cudf::strings_column_view(repl_null);
+
+  EXPECT_THROW(cudf::strings::replace_multiple(sv, ev, rv), cudf::logic_error);
+  EXPECT_THROW(cudf::strings::replace_multiple(sv, tv_null, rv), cudf::logic_error);
+  EXPECT_THROW(cudf::strings::replace_multiple(sv, tv, ev), cudf::logic_error);
+  EXPECT_THROW(cudf::strings::replace_multiple(sv, tv, rv_null), cudf::logic_error);
+  EXPECT_THROW(cudf::strings::replace_multiple(sv, tv, rv), cudf::logic_error);
 }
 
 TEST_F(StringsReplaceTest, ReplaceSlice)
@@ -341,7 +358,7 @@ TEST_F(StringsReplaceTest, ReplaceMulti)
     cudf::test::strings_column_wrapper repls({"_ ", "A ", "2 "});
     auto repls_view = cudf::strings_column_view(repls);
 
-    auto results = cudf::strings::replace(strings_view, targets_view, repls_view);
+    auto results = cudf::strings::replace_multiple(strings_view, targets_view, repls_view);
 
     std::vector<char const*> h_expected{"_ quick brown fox jumps over _ lazy dog",
                                         "_ fat cat lays next 2 _ other accénted cat",
@@ -361,7 +378,7 @@ TEST_F(StringsReplaceTest, ReplaceMulti)
     cudf::test::strings_column_wrapper repls({"* "});
     auto repls_view = cudf::strings_column_view(repls);
 
-    auto results = cudf::strings::replace(strings_view, targets_view, repls_view);
+    auto results = cudf::strings::replace_multiple(strings_view, targets_view, repls_view);
 
     std::vector<char const*> h_expected{"* quick brown fox jumps over * lazy dog",
                                         "* fat cat lays next * * other accénted cat",
@@ -422,7 +439,7 @@ TEST_F(StringsReplaceTest, ReplaceMultiLong)
     cudf::test::strings_column_wrapper repls({"x", "PEAR", "avocado", "$$"});
     auto repls_view = cudf::strings_column_view(repls);
 
-    auto results = cudf::strings::replace(strings_view, targets_view, repls_view);
+    auto results = cudf::strings::replace_multiple(strings_view, targets_view, repls_view);
 
     cudf::test::strings_column_wrapper expected(
       {"This string needs to be very long to trigger the long-replace internal functions. "
@@ -454,7 +471,7 @@ TEST_F(StringsReplaceTest, ReplaceMultiLong)
     cudf::test::strings_column_wrapper repls({"*"});
     auto repls_view = cudf::strings_column_view(repls);
 
-    auto results = cudf::strings::replace(strings_view, targets_view, repls_view);
+    auto results = cudf::strings::replace_multiple(strings_view, targets_view, repls_view);
 
     cudf::test::strings_column_wrapper expected(
       {"This string needs to be very long to trigger the long-replace internal functions. "
@@ -494,7 +511,7 @@ TEST_F(StringsReplaceTest, ReplaceMultiLong)
     auto repls      = cudf::test::strings_column_wrapper({""});
     auto repls_view = cudf::strings_column_view(repls);
 
-    auto results = cudf::strings::replace(strings_view, targets_view, repls_view);
+    auto results = cudf::strings::replace_multiple(strings_view, targets_view, repls_view);
 
     cudf::test::strings_column_wrapper expected(
       {"This string needs to be very long to trigger the long-replace internal functions. "
@@ -522,6 +539,10 @@ TEST_F(StringsReplaceTest, EmptyStringsColumn)
   auto strings_view = cudf::strings_column_view(zero_size_strings_column);
   auto results      = cudf::strings::replace(
     strings_view, cudf::string_scalar("not"), cudf::string_scalar("pertinent"));
-  auto view = results->view();
+  cudf::test::expect_column_empty(results->view());
+
+  auto const target      = cudf::test::strings_column_wrapper({"x"});
+  auto const target_view = cudf::strings_column_view(target);
+  results                = cudf::strings::replace_multiple(strings_view, target_view, target_view);
   cudf::test::expect_column_empty(results->view());
 }
diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp
index 086d4672788..8487fb6dc91 100644
--- a/java/src/main/native/src/ColumnViewJni.cpp
+++ b/java/src/main/native/src/ColumnViewJni.cpp
@@ -1755,7 +1755,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringReplaceMulti(
     cudf::strings_column_view scvtargets(*cvtargets);
     cudf::column_view* cvrepls = reinterpret_cast<cudf::column_view*>(repls_cv);
     cudf::strings_column_view scvrepls(*cvrepls);
-    return release_as_jlong(cudf::strings::replace(scv, scvtargets, scvrepls));
+    return release_as_jlong(cudf::strings::replace_multiple(scv, scvtargets, scvrepls));
   }
   CATCH_STD(env, 0);
 }
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/replace.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/replace.pxd
index 92e142b33fc..34e03eec638 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/replace.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/replace.pxd
@@ -23,7 +23,7 @@ cdef extern from "cudf/strings/replace.hpp" namespace "cudf::strings" nogil:
         string_scalar repl,
         int32_t maxrepl) except +
 
-    cdef unique_ptr[column] replace(
+    cdef unique_ptr[column] replace_multiple(
         column_view source_strings,
         column_view target_strings,
         column_view repl_strings) except +
diff --git a/python/cudf/cudf/_lib/strings/replace.pyx b/python/cudf/cudf/_lib/strings/replace.pyx
index 880201e65a2..2d9330a8a24 100644
--- a/python/cudf/cudf/_lib/strings/replace.pyx
+++ b/python/cudf/cudf/_lib/strings/replace.pyx
@@ -12,6 +12,7 @@ from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
 from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
 from cudf._lib.pylibcudf.libcudf.strings.replace cimport (
     replace as cpp_replace,
+    replace_multiple as cpp_replace_multiple,
     replace_slice as cpp_replace_slice,
 )
 from cudf._lib.pylibcudf.libcudf.types cimport size_type
@@ -126,7 +127,7 @@ def replace_multi(Column source_strings,
     cdef column_view repl_view = repl_strings.view()
 
     with nogil:
-        c_result = move(cpp_replace(
+        c_result = move(cpp_replace_multiple(
             source_view,
             target_view,
             repl_view

From f30ea0a7d12625a755bb5726e7514dfdf12094d6 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Mon, 3 Jun 2024 17:37:56 -0400
Subject: [PATCH 291/842] Use offsetalator in strings shift functor (#15870)

Replaces hardcoded `size_type` used for offset values in the `shift_chars_fn` functor with offsetalator.
Follow on to #15630

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Paul Mattione (https://github.com/pmattione-nvidia)
  - Yunsong Wang (https://github.com/PointKernel)

URL: https://github.com/rapidsai/cudf/pull/15870
---
 cpp/src/strings/copying/shift.cu | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/cpp/src/strings/copying/shift.cu b/cpp/src/strings/copying/shift.cu
index 5bba4855390..b386c0860d1 100644
--- a/cpp/src/strings/copying/shift.cu
+++ b/cpp/src/strings/copying/shift.cu
@@ -67,9 +67,9 @@ struct shift_chars_fn {
     if (offset < 0) {
       auto const last_index = -offset;
       if (idx < last_index) {
-        auto const first_index =
-          offset + d_column.child(strings_column_view::offsets_column_index)
-                     .element<size_type>(d_column.offset() + d_column.size());
+        auto const offsets     = d_column.child(strings_column_view::offsets_column_index);
+        auto const off_itr     = cudf::detail::input_offsetalator(offsets.head(), offsets.type());
+        auto const first_index = offset + off_itr[d_column.offset() + d_column.size()];
         return d_column.head<char>()[idx + first_index];
       } else {
         auto const char_index = idx - last_index;
@@ -79,9 +79,9 @@ struct shift_chars_fn {
       if (idx < offset) {
         return d_filler.data()[idx % d_filler.size_bytes()];
       } else {
-        return d_column.head<char>()[idx - offset +
-                                     d_column.child(strings_column_view::offsets_column_index)
-                                       .element<size_type>(d_column.offset())];
+        auto const offsets = d_column.child(strings_column_view::offsets_column_index);
+        auto const off_itr = cudf::detail::input_offsetalator(offsets.head(), offsets.type());
+        return d_column.head<char>()[idx - offset + off_itr[d_column.offset()]];
       }
     }
   }

From 90b3094f8a5a12b029a156cf484b673b589d2fec Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Mon, 3 Jun 2024 14:52:46 -0700
Subject: [PATCH 292/842] Clean up pylibcudf test assertations (#15892)

Swap the order of result,expected to expected, result for assert_table_eq too
Fix a few places where result,expected was swapped for assert_column_eq

Authors:
  - Thomas Li (https://github.com/lithomas1)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15892
---
 python/cudf/cudf/pylibcudf_tests/common/utils.py   |  2 +-
 python/cudf/cudf/pylibcudf_tests/test_copying.py   | 14 +++++++-------
 python/cudf/cudf/pylibcudf_tests/test_reshape.py   |  4 ++--
 .../cudf/pylibcudf_tests/test_string_capitalize.py |  6 +++---
 4 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py
index 0befb3bb3e8..e00053529a8 100644
--- a/python/cudf/cudf/pylibcudf_tests/common/utils.py
+++ b/python/cudf/cudf/pylibcudf_tests/common/utils.py
@@ -54,7 +54,7 @@ def assert_column_eq(
     assert lhs.equals(rhs)
 
 
-def assert_table_eq(plc_table: plc.Table, pa_table: pa.Table) -> None:
+def assert_table_eq(pa_table: pa.Table, plc_table: plc.Table) -> None:
     """Verify that a pylibcudf table and PyArrow table are equal."""
     plc_shape = (plc_table.num_rows(), plc_table.num_columns())
     assert plc_shape == pa_table.shape
diff --git a/python/cudf/cudf/pylibcudf_tests/test_copying.py b/python/cudf/cudf/pylibcudf_tests/test_copying.py
index ef70869a145..cd70ce4abf5 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_copying.py
+++ b/python/cudf/cudf/pylibcudf_tests/test_copying.py
@@ -138,7 +138,7 @@ def test_gather(target_table, pa_target_table, index_column, pa_index_column):
         plc.copying.OutOfBoundsPolicy.DONT_CHECK,
     )
     expected = pa_target_table.take(pa_index_column)
-    assert_table_eq(result, expected)
+    assert_table_eq(expected, result)
 
 
 def test_gather_map_has_nulls(target_table):
@@ -240,7 +240,7 @@ def test_scatter_table(
             pa_target_table,
         )
 
-    assert_table_eq(result, expected)
+    assert_table_eq(expected, result)
 
 
 def test_scatter_table_num_col_mismatch(
@@ -315,7 +315,7 @@ def test_scatter_scalars(
         pa_target_table,
     )
 
-    assert_table_eq(result, expected)
+    assert_table_eq(expected, result)
 
 
 def test_scatter_scalars_num_scalars_mismatch(
@@ -574,7 +574,7 @@ def test_slice_table(target_table, pa_target_table):
     lower_bounds = bounds[::2]
     result = plc.copying.slice(target_table, bounds)
     for lb, ub, slice_ in zip(lower_bounds, upper_bounds, result):
-        assert_table_eq(slice_, pa_target_table[lb:ub])
+        assert_table_eq(pa_target_table[lb:ub], slice_)
 
 
 def test_split_column(target_column, pa_target_column):
@@ -600,7 +600,7 @@ def test_split_table(target_table, pa_target_table):
     lower_bounds = [0] + upper_bounds[:-1]
     result = plc.copying.split(target_table, upper_bounds)
     for lb, ub, split in zip(lower_bounds, upper_bounds, result):
-        assert_table_eq(split, pa_target_table[lb:ub])
+        assert_table_eq(pa_target_table[lb:ub], split)
 
 
 def test_copy_if_else_column_column(
@@ -753,7 +753,7 @@ def test_boolean_mask_scatter_from_table(
             pa_source_table, pa_mask, pa_target_table
         )
 
-    assert_table_eq(result, expected)
+    assert_table_eq(expected, result)
 
 
 def test_boolean_mask_scatter_from_wrong_num_cols(source_table, target_table):
@@ -828,7 +828,7 @@ def test_boolean_mask_scatter_from_scalars(
         pa_target_table,
     )
 
-    assert_table_eq(result, expected)
+    assert_table_eq(expected, result)
 
 
 def test_get_element(input_column, pa_input_column):
diff --git a/python/cudf/cudf/pylibcudf_tests/test_reshape.py b/python/cudf/cudf/pylibcudf_tests/test_reshape.py
index b8b914f3f09..32d79257f4f 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_reshape.py
+++ b/python/cudf/cudf/pylibcudf_tests/test_reshape.py
@@ -27,7 +27,7 @@ def test_interleave_columns(reshape_data, reshape_plc_tbl):
 
     expect = pa.concat_arrays(interleaved_data)
 
-    assert_column_eq(res, expect)
+    assert_column_eq(expect, res)
 
 
 @pytest.mark.parametrize("cnt", [0, 1, 3])
@@ -40,4 +40,4 @@ def test_tile(reshape_data, reshape_plc_tbl, cnt):
         tiled_data, schema=plc.interop.to_arrow(reshape_plc_tbl).schema
     )
 
-    assert_table_eq(res, expect)
+    assert_table_eq(expect, res)
diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py b/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py
index dd7e96e871b..818d6e6e72a 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py
+++ b/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py
@@ -37,7 +37,7 @@ def plc_data(pa_data):
 def test_capitalize(plc_data, pa_data):
     got = plc.strings.capitalize.capitalize(plc_data)
     expected = pa.compute.utf8_capitalize(pa_data)
-    assert_column_eq(got, expected)
+    assert_column_eq(expected, got)
 
 
 def test_title(plc_data, pa_data):
@@ -45,10 +45,10 @@ def test_title(plc_data, pa_data):
         plc_data, plc.strings.char_types.StringCharacterTypes.CASE_TYPES
     )
     expected = pa.compute.utf8_title(pa_data)
-    assert_column_eq(got, expected)
+    assert_column_eq(expected, got)
 
 
 def test_is_title(plc_data, pa_data):
     got = plc.strings.capitalize.is_title(plc_data)
     expected = pa.compute.utf8_is_title(pa_data)
-    assert_column_eq(got, expected)
+    assert_column_eq(expected, got)

From 6176776e1f88718d802b317f506e2b56635fa31a Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Mon, 3 Jun 2024 15:06:39 -0700
Subject: [PATCH 293/842] Improve options docs (#15888)

Recently I have answered a few user questions about how to use cudf options for display. We were missing documentation that explained that display options are inherited from pandas. I also found a broken link in the docs. This PR fixes both of those doc-related issues.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/15888
---
 docs/cudf/source/cudf_pandas/how-it-works.md     |  5 ++---
 docs/cudf/source/user_guide/api_docs/options.rst | 13 +++++++++++++
 docs/cudf/source/user_guide/options.md           |  2 +-
 3 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/docs/cudf/source/cudf_pandas/how-it-works.md b/docs/cudf/source/cudf_pandas/how-it-works.md
index ee856c84b78..75f57742ac9 100644
--- a/docs/cudf/source/cudf_pandas/how-it-works.md
+++ b/docs/cudf/source/cudf_pandas/how-it-works.md
@@ -34,6 +34,5 @@ correct result. Data is automatically transferred from host to device
 transfers.
 
 When using `cudf.pandas`, cuDF's [pandas compatibility
-mode](https://docs.rapids.ai/api/cudf/stable/api_docs/options/#available-options)
-is automatically enabled, ensuring consistency with pandas-specific
-semantics like default sort ordering.
+mode](api.options) is automatically enabled, ensuring consistency with
+pandas-specific semantics like default sort ordering.
diff --git a/docs/cudf/source/user_guide/api_docs/options.rst b/docs/cudf/source/user_guide/api_docs/options.rst
index b3a4004e2d9..4c0f6684b76 100644
--- a/docs/cudf/source/user_guide/api_docs/options.rst
+++ b/docs/cudf/source/user_guide/api_docs/options.rst
@@ -12,6 +12,19 @@ Options and settings
    cudf.describe_option
    cudf.option_context
 
+Display options are controlled by pandas
+----------------------------------------
+
+Options for display are inherited from pandas. This includes commonly accessed options such as:
+
+- ``display.max_columns``
+- ``display.max_info_rows``
+- ``display.max_rows``
+- ``display.max_seq_items``
+
+For example, to show all rows of a DataFrame or Series in a Jupyter notebook, call ``pandas.set_option("display.max_rows", None)``.
+
+See also the :ref:`full list of pandas display options <pandas:options.available>`.
 
 Available options
 -----------------
diff --git a/docs/cudf/source/user_guide/options.md b/docs/cudf/source/user_guide/options.md
index 245d3fd1974..997681212fb 100644
--- a/docs/cudf/source/user_guide/options.md
+++ b/docs/cudf/source/user_guide/options.md
@@ -11,4 +11,4 @@ When no argument is provided,
 all options are printed.
 To set value to a option, use {py:func}`cudf.set_option`.
 
-See the [API reference](api.options) for more details.
+See the [options API reference](api.options) for descriptions of the available options.

From 4a0b59133ed56c043fc73d24785f24be0b4fbe69 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Mon, 3 Jun 2024 15:08:31 -0700
Subject: [PATCH 294/842] Update Python labels and remove unnecessary ones
 (#15893)

This PR leverages some of the new labels we have for organizing our issues and removes labels that aren't really used at the moment. If reviewers feel strongly I can keep the ci label, but AFAICT that doesn't really get used for anything at the moment and we'll benefit more from leveraging future labels to help direct tasks to the build/infra team vs cudf devs.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15893
---
 .github/labeler.yml | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/.github/labeler.yml b/.github/labeler.yml
index d14344384d1..48967417af3 100644
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -1,9 +1,19 @@
 # Documentation for config - https://github.com/actions/labeler#common-examples
 
-cuDF (Python):
+Python:
   - 'python/**'
   - 'notebooks/**'
 
+cudf.pandas:
+  - 'python/cudf/cudf/pandas/**'
+  - 'python/cudf/cudf_pandas_tests/**'
+
+cudf.polars:
+  - 'python/cudf_polars/**'
+
+pylibcudf:
+  - 'python/cudf/cudf/_lib/pylibcudf/**'
+
 libcudf:
   - 'cpp/**'
 
@@ -12,11 +22,5 @@ CMake:
   - '**/cmake/**'
   - '**/*.cmake'
 
-cuDF (Java):
+Java:
   - 'java/**'
-
-ci:
-  - 'ci/**'
-
-conda:
-  - 'conda/**'

From 382de32e8137a3a59a0800f46ef8a1de62b1a6e5 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Mon, 3 Jun 2024 15:14:52 -0700
Subject: [PATCH 295/842] Add support for additional metaclasses of proxies and
 use for ExcelWriter (#15399)

The ExcelWriter supports the abstract os.PathLike interface, but we would also like that support to be reflected in the class's MRO. Doing so is slightly complicated because os.PathLike is an ABC, and as such has a different metaclass. Therefore, in order to add os.PathLike as a base class, we must also generate a suitable combined metaclass for our ExcelWriter wrapper.

This change ensures the `isinstance(pd.ExcelWriter(...), os.PathLike)` returns `True` when using cudf.pandas.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15399
---
 python/cudf/cudf/pandas/_wrappers/pandas.py   | 11 +++++--
 python/cudf/cudf/pandas/fast_slow_proxy.py    | 30 +++++++------------
 .../cudf_pandas_tests/test_cudf_pandas.py     |  5 ++++
 3 files changed, 25 insertions(+), 21 deletions(-)

diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py
index 2e3880e14f6..698dd946022 100644
--- a/python/cudf/cudf/pandas/_wrappers/pandas.py
+++ b/python/cudf/cudf/pandas/_wrappers/pandas.py
@@ -1,8 +1,10 @@
 # SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES.
 # All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
+import abc
 import copyreg
 import importlib
+import os
 import pickle
 import sys
 
@@ -857,7 +859,12 @@ def Index__new__(cls, *args, **kwargs):
     pd.ExcelWriter,
     fast_to_slow=_Unusable(),
     slow_to_fast=_Unusable(),
-    additional_attributes={"__hash__": _FastSlowAttribute("__hash__")},
+    additional_attributes={
+        "__hash__": _FastSlowAttribute("__hash__"),
+        "__fspath__": _FastSlowAttribute("__fspath__"),
+    },
+    bases=(os.PathLike,),
+    metaclasses=(abc.ABCMeta,),
 )
 
 try:
@@ -1032,7 +1039,7 @@ def holiday_calendar_factory_wrapper(*args, **kwargs):
     fast_to_slow=_Unusable(),
     slow_to_fast=_Unusable(),
     additional_attributes={"__hash__": _FastSlowAttribute("__hash__")},
-    meta_class=pd_HolidayCalendarMetaClass,
+    metaclasses=(pd_HolidayCalendarMetaClass,),
 )
 
 Holiday = make_final_proxy_type(
diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py
index 94caec1ce6c..169dd80e132 100644
--- a/python/cudf/cudf/pandas/fast_slow_proxy.py
+++ b/python/cudf/cudf/pandas/fast_slow_proxy.py
@@ -106,19 +106,6 @@ def __call__(self):
 _DELETE = object()
 
 
-def create_composite_metaclass(base_meta, additional_meta):
-    """
-    Dynamically creates a composite metaclass that inherits from both provided metaclasses.
-    This ensures that the metaclass behaviors of both base_meta and additional_meta are preserved.
-    """
-
-    class CompositeMeta(base_meta, additional_meta):
-        def __new__(cls, name, bases, namespace):
-            return super().__new__(cls, name, bases, namespace)
-
-    return CompositeMeta
-
-
 def make_final_proxy_type(
     name: str,
     fast_type: type,
@@ -130,7 +117,7 @@ def make_final_proxy_type(
     additional_attributes: Mapping[str, Any] | None = None,
     postprocess: Callable[[_FinalProxy, Any, Any], Any] | None = None,
     bases: Tuple = (),
-    meta_class=None,
+    metaclasses: Tuple = (),
 ) -> Type[_FinalProxy]:
     """
     Defines a fast-slow proxy type for a pair of "final" fast and slow
@@ -161,6 +148,8 @@ def make_final_proxy_type(
         construct said unwrapped object. See also `_maybe_wrap_result`.
     bases
         Optional tuple of base classes to insert into the mro.
+    metaclasses
+        Optional tuple of metaclasses to unify with the base proxy metaclass.
 
     Notes
     -----
@@ -241,15 +230,18 @@ def _fsproxy_state(self) -> _State:
             cls_dict[slow_name] = _FastSlowAttribute(
                 slow_name, private=slow_name.startswith("_")
             )
-    if meta_class is None:
-        meta_class = _FastSlowProxyMeta
-    else:
-        meta_class = create_composite_metaclass(_FastSlowProxyMeta, meta_class)
 
+    metaclass = _FastSlowProxyMeta
+    if metaclasses:
+        metaclass = types.new_class(  # type: ignore
+            f"{name}_Meta",
+            metaclasses + (_FastSlowProxyMeta,),
+            {},
+        )
     cls = types.new_class(
         name,
         (*bases, _FinalProxy),
-        {"metaclass": meta_class},
+        {"metaclass": metaclass},
         lambda ns: ns.update(cls_dict),
     )
     functools.update_wrapper(
diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
index 75bceea3034..fef829b17fc 100644
--- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
+++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
@@ -6,6 +6,7 @@
 import copy
 import datetime
 import operator
+import os
 import pathlib
 import pickle
 import tempfile
@@ -1421,3 +1422,7 @@ def test_holidays_within_dates(holiday, start, expected):
             utc.localize(xpd.Timestamp(start)),
         )
     ) == [utc.localize(dt) for dt in expected]
+
+
+def test_excelwriter_pathlike():
+    assert isinstance(pd.ExcelWriter("foo.xlsx"), os.PathLike)

From eb460169786665b1624cb6c4f9b502b800810b37 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com>
Date: Tue, 4 Jun 2024 06:32:49 -0500
Subject: [PATCH 296/842] Migrate column factories to pylibcudf (#15257)

This PR implements `column_factories.hpp` using `pylibcudf` and migrates the cuDF cython to use them cc @vyasr

Authors:
  - https://github.com/brandon-b-miller
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/15257
---
 cpp/src/column/column_factories.cpp           |  17 +-
 cpp/tests/column/factories_test.cpp           |   4 +-
 cpp/tests/fixed_point/fixed_point_tests.cpp   |   2 +-
 .../api_docs/pylibcudf/column_factories.rst   |   6 +
 .../user_guide/api_docs/pylibcudf/index.rst   |   1 +
 python/cudf/cudf/_lib/column.pyx              |  21 +-
 .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt   |   1 +
 python/cudf/cudf/_lib/pylibcudf/__init__.pxd  |   2 +
 python/cudf/cudf/_lib/pylibcudf/__init__.py   |   4 +-
 .../cudf/_lib/pylibcudf/column_factories.pxd  |  52 ++++
 .../cudf/_lib/pylibcudf/column_factories.pyx  | 205 ++++++++++++++
 python/cudf/cudf/_lib/pylibcudf/interop.pyx   |  82 ++++++
 .../libcudf/column/column_factories.pxd       |  73 ++++-
 python/cudf/cudf/_lib/pylibcudf/types.pxd     |   1 +
 python/cudf/cudf/_lib/pylibcudf/types.pyx     |   3 +-
 .../pylibcudf_tests/test_column_factories.py  | 253 ++++++++++++++++++
 .../cudf/cudf/pylibcudf_tests/test_interop.py |  69 +++++
 17 files changed, 767 insertions(+), 29 deletions(-)
 create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/column_factories.rst
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/column_factories.pxd
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/column_factories.pyx
 create mode 100644 python/cudf/cudf/pylibcudf_tests/test_column_factories.py
 create mode 100644 python/cudf/cudf/pylibcudf_tests/test_interop.py

diff --git a/cpp/src/column/column_factories.cpp b/cpp/src/column/column_factories.cpp
index e40056fc8a1..0260068d4db 100644
--- a/cpp/src/column/column_factories.cpp
+++ b/cpp/src/column/column_factories.cpp
@@ -65,7 +65,8 @@ std::size_t size_of(data_type element_type)
 std::unique_ptr<column> make_empty_column(data_type type)
 {
   CUDF_EXPECTS(type.id() == type_id::EMPTY || !cudf::is_nested(type),
-               "make_empty_column is invalid to call on nested types");
+               "make_empty_column is invalid to call on nested types",
+               cudf::data_type_error);
   return std::make_unique<column>(type, 0, rmm::device_buffer{}, rmm::device_buffer{}, 0);
 }
 
@@ -80,7 +81,9 @@ std::unique_ptr<column> make_numeric_column(data_type type,
                                             rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  CUDF_EXPECTS(is_numeric(type), "Invalid, non-numeric type.");
+  CUDF_EXPECTS(type.id() != type_id::EMPTY && is_numeric(type),
+               "Invalid, non-numeric type.",
+               cudf::data_type_error);
   CUDF_EXPECTS(size >= 0, "Column size cannot be negative.");
 
   return std::make_unique<column>(
@@ -100,7 +103,7 @@ std::unique_ptr<column> make_fixed_point_column(data_type type,
                                                 rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  CUDF_EXPECTS(is_fixed_point(type), "Invalid, non-fixed_point type.");
+  CUDF_EXPECTS(is_fixed_point(type), "Invalid, non-fixed_point type.", cudf::data_type_error);
   CUDF_EXPECTS(size >= 0, "Column size cannot be negative.");
 
   return std::make_unique<column>(
@@ -120,7 +123,7 @@ std::unique_ptr<column> make_timestamp_column(data_type type,
                                               rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  CUDF_EXPECTS(is_timestamp(type), "Invalid, non-timestamp type.");
+  CUDF_EXPECTS(is_timestamp(type), "Invalid, non-timestamp type.", cudf::data_type_error);
   CUDF_EXPECTS(size >= 0, "Column size cannot be negative.");
 
   return std::make_unique<column>(
@@ -140,7 +143,7 @@ std::unique_ptr<column> make_duration_column(data_type type,
                                              rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  CUDF_EXPECTS(is_duration(type), "Invalid, non-duration type.");
+  CUDF_EXPECTS(is_duration(type), "Invalid, non-duration type.", cudf::data_type_error);
   CUDF_EXPECTS(size >= 0, "Column size cannot be negative.");
 
   return std::make_unique<column>(
@@ -160,7 +163,9 @@ std::unique_ptr<column> make_fixed_width_column(data_type type,
                                                 rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  CUDF_EXPECTS(is_fixed_width(type), "Invalid, non-fixed-width type.");
+  CUDF_EXPECTS(type.id() != type_id::EMPTY && is_fixed_width(type),
+               "Invalid, non-fixed-width type.",
+               cudf::data_type_error);
 
   // clang-format off
   if      (is_timestamp  (type)) return make_timestamp_column  (type, size, state, stream, mr);
diff --git a/cpp/tests/column/factories_test.cpp b/cpp/tests/column/factories_test.cpp
index afebc91dd73..dca36eaa4e7 100644
--- a/cpp/tests/column/factories_test.cpp
+++ b/cpp/tests/column/factories_test.cpp
@@ -164,7 +164,7 @@ TEST_P(NonNumericFactoryTest, NonNumericThrow)
     auto column = cudf::make_numeric_column(
       cudf::data_type{GetParam()}, this->size(), cudf::mask_state::UNALLOCATED);
   };
-  EXPECT_THROW(construct(), cudf::logic_error);
+  EXPECT_THROW(construct(), cudf::data_type_error);
 }
 
 INSTANTIATE_TEST_CASE_P(NonNumeric,
@@ -307,7 +307,7 @@ TEST_P(NonFixedWidthFactoryTest, NonFixedWidthThrow)
     auto column = cudf::make_fixed_width_column(
       cudf::data_type{GetParam()}, this->size(), cudf::mask_state::UNALLOCATED);
   };
-  EXPECT_THROW(construct(), cudf::logic_error);
+  EXPECT_THROW(construct(), cudf::data_type_error);
 }
 
 INSTANTIATE_TEST_CASE_P(NonFixedWidth,
diff --git a/cpp/tests/fixed_point/fixed_point_tests.cpp b/cpp/tests/fixed_point/fixed_point_tests.cpp
index 73de1fbaa68..ab7984d4b03 100644
--- a/cpp/tests/fixed_point/fixed_point_tests.cpp
+++ b/cpp/tests/fixed_point/fixed_point_tests.cpp
@@ -498,7 +498,7 @@ TYPED_TEST(FixedPointTestAllReps, FixedPointColumnWrapper)
 TYPED_TEST(FixedPointTestAllReps, NoScaleOrWrongTypeID)
 {
   EXPECT_THROW(cudf::make_fixed_point_column(cudf::data_type{cudf::type_id::INT32}, 0),
-               cudf::logic_error);
+               cudf::data_type_error);
 }
 
 TYPED_TEST(FixedPointTestAllReps, SimpleFixedPointColumnWrapper)
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/column_factories.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/column_factories.rst
new file mode 100644
index 00000000000..c858135b6ce
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/column_factories.rst
@@ -0,0 +1,6 @@
+================
+column_factories
+================
+
+.. automodule:: cudf._lib.pylibcudf.column_factories
+   :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
index 26875ce7d12..58fea77adaa 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
@@ -11,6 +11,7 @@ This page provides API documentation for pylibcudf.
     aggregation
     binaryop
     column
+    column_factories
     concatenate
     copying
     filling
diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx
index f33e121241d..7155017b7af 100644
--- a/python/cudf/cudf/_lib/column.pyx
+++ b/python/cudf/cudf/_lib/column.pyx
@@ -39,14 +39,10 @@ from cudf._lib.types cimport (
 from cudf._lib.null_mask import bitmask_allocation_size_bytes
 from cudf._lib.types import dtype_from_pylibcudf_column
 
-# TODO: We currently need this for "casting" empty pylibcudf columns in
-# from_pylibcudf by instead creating an empty numeric column. We will be able
-# to remove this once column factories are exposed to pylibcudf.
 
 cimport cudf._lib.pylibcudf.libcudf.copying as cpp_copying
 cimport cudf._lib.pylibcudf.libcudf.types as libcudf_types
 cimport cudf._lib.pylibcudf.libcudf.unary as libcudf_unary
-from cudf._lib.pylibcudf cimport Column as plc_Column
 from cudf._lib.pylibcudf.libcudf.column.column cimport column, column_contents
 from cudf._lib.pylibcudf.libcudf.column.column_factories cimport (
     make_column_from_scalar as cpp_make_column_from_scalar,
@@ -623,22 +619,17 @@ cdef class Column:
         pylibcudf.Column
             A new pylibcudf.Column referencing the same data.
         """
-        cdef libcudf_types.data_type new_dtype
         if col.type().id() == pylibcudf.TypeId.TIMESTAMP_DAYS:
             col = pylibcudf.unary.cast(
                 col, pylibcudf.DataType(pylibcudf.TypeId.TIMESTAMP_SECONDS)
             )
         elif col.type().id() == pylibcudf.TypeId.EMPTY:
-            new_dtype = libcudf_types.data_type(libcudf_types.type_id.INT8)
-            # TODO: This function call is what requires cimporting pylibcudf.
-            # We can remove the cimport once we can directly do
-            # pylibcudf.column_factories.make_numeric_column or equivalent.
-            col = plc_Column.from_libcudf(
-                move(
-                    make_numeric_column(
-                        new_dtype, col.size(), libcudf_types.mask_state.ALL_NULL
-                        )
-                    )
+            new_dtype = pylibcudf.DataType(pylibcudf.TypeId.INT8)
+
+            col = pylibcudf.column_factories.make_numeric_column(
+                new_dtype,
+                col.size(),
+                pylibcudf.column_factories.MaskState.ALL_NULL
             )
 
         dtype = dtype_from_pylibcudf_column(col)
diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
index eff14ad549b..7d0676f6def 100644
--- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
@@ -16,6 +16,7 @@ set(cython_sources
     aggregation.pyx
     binaryop.pyx
     column.pyx
+    column_factories.pyx
     concatenate.pyx
     copying.pyx
     filling.pyx
diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd
index 4f77f8cbaef..b289d112a90 100644
--- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd
@@ -4,6 +4,7 @@
 from . cimport (
     aggregation,
     binaryop,
+    column_factories,
     concatenate,
     copying,
     filling,
@@ -40,6 +41,7 @@ __all__ = [
     "binaryop",
     "concatenate",
     "copying",
+    "column_factories",
     "filling",
     "gpumemoryview",
     "groupby",
diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py
index 048b62b6013..2565332f3ed 100644
--- a/python/cudf/cudf/_lib/pylibcudf/__init__.py
+++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py
@@ -3,6 +3,7 @@
 from . import (
     aggregation,
     binaryop,
+    column_factories,
     concatenate,
     copying,
     filling,
@@ -27,7 +28,7 @@
 from .gpumemoryview import gpumemoryview
 from .scalar import Scalar
 from .table import Table
-from .types import DataType, TypeId
+from .types import DataType, MaskState, TypeId
 
 __all__ = [
     "Column",
@@ -39,6 +40,7 @@
     "binaryop",
     "concatenate",
     "copying",
+    "column_factories",
     "filling",
     "gpumemoryview",
     "groupby",
diff --git a/python/cudf/cudf/_lib/pylibcudf/column_factories.pxd b/python/cudf/cudf/_lib/pylibcudf/column_factories.pxd
new file mode 100644
index 00000000000..9dbd74ab16c
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/column_factories.pxd
@@ -0,0 +1,52 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+
+from cudf._lib.pylibcudf.libcudf.types cimport mask_state, size_type
+
+from .column cimport Column
+from .types cimport DataType, size_type, type_id
+
+ctypedef fused MakeEmptyColumnOperand:
+    DataType
+    type_id
+    object
+
+ctypedef fused MaskArg:
+    mask_state
+    object
+
+
+cpdef Column make_empty_column(
+    MakeEmptyColumnOperand type_or_id
+)
+
+cpdef Column make_numeric_column(
+    DataType type_,
+    size_type size,
+    MaskArg mask,
+)
+
+cpdef Column make_fixed_point_column(
+    DataType type_,
+    size_type size,
+    MaskArg mask,
+)
+
+cpdef Column make_timestamp_column(
+    DataType type_,
+    size_type size,
+    MaskArg mask,
+)
+
+cpdef Column make_duration_column(
+    DataType type_,
+    size_type size,
+    MaskArg mask,
+)
+
+cpdef Column make_fixed_width_column(
+    DataType type_,
+    size_type size,
+    MaskArg mask,
+)
diff --git a/python/cudf/cudf/_lib/pylibcudf/column_factories.pyx b/python/cudf/cudf/_lib/pylibcudf/column_factories.pyx
new file mode 100644
index 00000000000..ef7f512f0e5
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/column_factories.pyx
@@ -0,0 +1,205 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_factories cimport (
+    make_duration_column as cpp_make_duration_column,
+    make_empty_column as cpp_make_empty_column,
+    make_fixed_point_column as cpp_make_fixed_point_column,
+    make_fixed_width_column as cpp_make_fixed_width_column,
+    make_numeric_column as cpp_make_numeric_column,
+    make_timestamp_column as cpp_make_timestamp_column,
+)
+from cudf._lib.pylibcudf.libcudf.types cimport mask_state, size_type
+
+from .types cimport DataType, type_id
+
+from .types import MaskState, TypeId
+
+
+cpdef Column make_empty_column(MakeEmptyColumnOperand type_or_id):
+    cdef unique_ptr[column] result
+    cdef type_id id
+
+    if MakeEmptyColumnOperand is object:
+        if isinstance(type_or_id, TypeId):
+            id = type_or_id
+            with nogil:
+                result = move(
+                    cpp_make_empty_column(
+                        id
+                    )
+                )
+        else:
+            raise TypeError(
+                "Must pass a TypeId or DataType"
+            )
+    elif MakeEmptyColumnOperand is DataType:
+        with nogil:
+            result = move(
+                cpp_make_empty_column(
+                    type_or_id.c_obj
+                )
+            )
+    elif MakeEmptyColumnOperand is type_id:
+        with nogil:
+            result = move(
+                cpp_make_empty_column(
+                    type_or_id
+                )
+            )
+    else:
+        raise TypeError(
+            "Must pass a TypeId or DataType"
+        )
+    return Column.from_libcudf(move(result))
+
+
+cpdef Column make_numeric_column(
+    DataType type_,
+    size_type size,
+    MaskArg mstate
+):
+
+    cdef unique_ptr[column] result
+    cdef mask_state state
+
+    if MaskArg is object:
+        if isinstance(mstate, MaskState):
+            state = mstate
+        else:
+            raise TypeError("Invalid mask argument")
+    elif MaskArg is mask_state:
+        state = mstate
+    else:
+        raise TypeError("Invalid mask argument")
+    with nogil:
+        result = move(
+            cpp_make_numeric_column(
+                type_.c_obj,
+                size,
+                state
+            )
+        )
+
+    return Column.from_libcudf(move(result))
+
+cpdef Column make_fixed_point_column(
+    DataType type_,
+    size_type size,
+    MaskArg mstate
+):
+
+    cdef unique_ptr[column] result
+    cdef mask_state state
+
+    if MaskArg is object:
+        if isinstance(mstate, MaskState):
+            state = mstate
+        else:
+            raise TypeError("Invalid mask argument")
+    elif MaskArg is mask_state:
+        state = mstate
+    else:
+        raise TypeError("Invalid mask argument")
+    with nogil:
+        result = move(
+            cpp_make_fixed_point_column(
+                type_.c_obj,
+                size,
+                state
+            )
+        )
+
+    return Column.from_libcudf(move(result))
+
+
+cpdef Column make_timestamp_column(
+    DataType type_,
+    size_type size,
+    MaskArg mstate
+):
+
+    cdef unique_ptr[column] result
+    cdef mask_state state
+
+    if MaskArg is object:
+        if isinstance(mstate, MaskState):
+            state = mstate
+        else:
+            raise TypeError("Invalid mask argument")
+    elif MaskArg is mask_state:
+        state = mstate
+    else:
+        raise TypeError("Invalid mask argument")
+    with nogil:
+        result = move(
+            cpp_make_timestamp_column(
+                type_.c_obj,
+                size,
+                state
+            )
+        )
+
+    return Column.from_libcudf(move(result))
+
+
+cpdef Column make_duration_column(
+    DataType type_,
+    size_type size,
+    MaskArg mstate
+):
+
+    cdef unique_ptr[column] result
+    cdef mask_state state
+
+    if MaskArg is object:
+        if isinstance(mstate, MaskState):
+            state = mstate
+        else:
+            raise TypeError("Invalid mask argument")
+    elif MaskArg is mask_state:
+        state = mstate
+    else:
+        raise TypeError("Invalid mask argument")
+    with nogil:
+        result = move(
+            cpp_make_duration_column(
+                type_.c_obj,
+                size,
+                state
+            )
+        )
+
+    return Column.from_libcudf(move(result))
+
+
+cpdef Column make_fixed_width_column(
+    DataType type_,
+    size_type size,
+    MaskArg mstate
+):
+
+    cdef unique_ptr[column] result
+    cdef mask_state state
+
+    if MaskArg is object:
+        if isinstance(mstate, MaskState):
+            state = mstate
+        else:
+            raise TypeError("Invalid mask argument")
+    elif MaskArg is mask_state:
+        state = mstate
+    else:
+        raise TypeError("Invalid mask argument")
+    with nogil:
+        result = move(
+            cpp_make_fixed_width_column(
+                type_.c_obj,
+                size,
+                state
+            )
+        )
+
+    return Column.from_libcudf(move(result))
diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pyx b/python/cudf/cudf/_lib/pylibcudf/interop.pyx
index f172080cece..1e4102e4b64 100644
--- a/python/cudf/cudf/_lib/pylibcudf/interop.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/interop.pyx
@@ -33,6 +33,33 @@ from .scalar cimport Scalar
 from .table cimport Table
 from .types cimport DataType, type_id
 
+ARROW_TO_PYLIBCUDF_TYPES = {
+    pa.int8(): type_id.INT8,
+    pa.int16(): type_id.INT16,
+    pa.int32(): type_id.INT32,
+    pa.int64(): type_id.INT64,
+    pa.uint8(): type_id.UINT8,
+    pa.uint16(): type_id.UINT16,
+    pa.uint32(): type_id.UINT32,
+    pa.uint64(): type_id.UINT64,
+    pa.float32(): type_id.FLOAT32,
+    pa.float64(): type_id.FLOAT64,
+    pa.bool_(): type_id.BOOL8,
+    pa.string(): type_id.STRING,
+    pa.duration('s'): type_id.DURATION_SECONDS,
+    pa.duration('ms'): type_id.DURATION_MILLISECONDS,
+    pa.duration('us'): type_id.DURATION_MICROSECONDS,
+    pa.duration('ns'): type_id.DURATION_NANOSECONDS,
+    pa.timestamp('s'): type_id.TIMESTAMP_SECONDS,
+    pa.timestamp('ms'): type_id.TIMESTAMP_MILLISECONDS,
+    pa.timestamp('us'): type_id.TIMESTAMP_MICROSECONDS,
+    pa.timestamp('ns'): type_id.TIMESTAMP_NANOSECONDS,
+    pa.date32(): type_id.TIMESTAMP_DAYS,
+}
+
+LIBCUDF_TO_ARROW_TYPES = {
+    v: k for k, v in ARROW_TO_PYLIBCUDF_TYPES.items()
+}
 
 cdef column_metadata _metadata_to_libcudf(metadata):
     """Convert a ColumnMetadata object to C++ column_metadata.
@@ -77,6 +104,21 @@ def from_arrow(pyarrow_object, *, DataType data_type=None):
     raise TypeError("from_arrow only accepts Table and Scalar objects")
 
 
+@from_arrow.register(pa.DataType)
+def _from_arrow_datatype(pyarrow_object):
+    if isinstance(pyarrow_object, pa.Decimal128Type):
+        return DataType(type_id.DECIMAL128, scale=-pyarrow_object.scale)
+    elif isinstance(pyarrow_object, pa.StructType):
+        return DataType(type_id.STRUCT)
+    elif isinstance(pyarrow_object, pa.ListType):
+        return DataType(type_id.LIST)
+    else:
+        try:
+            return DataType(ARROW_TO_PYLIBCUDF_TYPES[pyarrow_object])
+        except KeyError:
+            raise TypeError(f"Unable to convert {pyarrow_object} to cudf datatype")
+
+
 @from_arrow.register(pa.Table)
 def _from_arrow_table(pyarrow_object, *, DataType data_type=None):
     if data_type is not None:
@@ -170,6 +212,46 @@ def to_arrow(cudf_object, metadata=None):
     raise TypeError("to_arrow only accepts Table and Scalar objects")
 
 
+@to_arrow.register(DataType)
+def _to_arrow_datatype(cudf_object, **kwargs):
+    """
+    Convert a datatype to arrow.
+
+    Translation of some types requires extra information as a keyword
+    argument. Specifically:
+
+    - When translating a decimal type, provide ``precision``
+    - When translating a struct type, provide ``fields``
+    - When translating a list type, provide the wrapped ``value_type``
+    """
+    if cudf_object.id() in {type_id.DECIMAL32, type_id.DECIMAL64, type_id.DECIMAL128}:
+        if not (precision := kwargs.get("precision")):
+            raise ValueError(
+                "Precision must be provided for decimal types"
+            )
+            # no pa.decimal32 or pa.decimal64
+        return pa.decimal128(precision, -cudf_object.scale())
+    elif cudf_object.id() == type_id.STRUCT:
+        if not (fields := kwargs.get("fields")):
+            raise ValueError(
+                "Fields must be provided for struct types"
+            )
+        return pa.struct(fields)
+    elif cudf_object.id() == type_id.LIST:
+        if not (value_type := kwargs.get("value_type")):
+            raise ValueError(
+                "Value type must be provided for list types"
+            )
+        return pa.list_(value_type)
+    else:
+        try:
+            return ARROW_TO_PYLIBCUDF_TYPES[cudf_object.id()]
+        except KeyError:
+            raise TypeError(
+                f"Unable to convert {cudf_object.id()} to arrow datatype"
+            )
+
+
 @to_arrow.register(Table)
 def _to_arrow_table(cudf_object, metadata=None):
     if metadata is None:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/column/column_factories.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/column/column_factories.pxd
index fd22d92cb30..2faff21a77b 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/column/column_factories.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/column/column_factories.pxd
@@ -2,9 +2,17 @@
 
 from libcpp.memory cimport unique_ptr
 
+from rmm._lib.device_buffer cimport device_buffer
+
 from cudf._lib.pylibcudf.libcudf.column.column cimport column
 from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
-from cudf._lib.pylibcudf.libcudf.types cimport data_type, mask_state, size_type
+from cudf._lib.pylibcudf.libcudf.types cimport (
+    bitmask_type,
+    data_type,
+    mask_state,
+    size_type,
+    type_id,
+)
 
 
 cdef extern from "cudf/column/column_factories.hpp" namespace "cudf" nogil:
@@ -12,5 +20,64 @@ cdef extern from "cudf/column/column_factories.hpp" namespace "cudf" nogil:
                                                 size_type size,
                                                 mask_state state) except +
 
-    cdef unique_ptr[column] make_column_from_scalar (const scalar & s,
-                                                     size_type size) except +
+    cdef unique_ptr[column] make_numeric_column(data_type type,
+                                                size_type size,
+                                                device_buffer mask,
+                                                size_type null_count) except +
+
+    cdef unique_ptr[column] make_fixed_point_column(
+        data_type type,
+        size_type size,
+        mask_state state) except +
+
+    cdef unique_ptr[column] make_fixed_point_column(
+        data_type type,
+        size_type size,
+        device_buffer mask,
+        size_type null_count) except +
+
+    cdef unique_ptr[column] make_timestamp_column(
+        data_type type,
+        size_type size,
+        mask_state state) except +
+
+    cdef unique_ptr[column] make_timestamp_column(
+        data_type type,
+        size_type size,
+        device_buffer mask,
+        size_type null_count) except +
+
+    cdef unique_ptr[column] make_duration_column(
+        data_type type,
+        size_type size,
+        mask_state state) except +
+
+    cdef unique_ptr[column] make_duration_column(
+        data_type type,
+        size_type size,
+        device_buffer mask,
+        size_type null_count) except +
+
+    cdef unique_ptr[column] make_fixed_width_column(
+        data_type type,
+        size_type size,
+        mask_state state) except +
+
+    cdef unique_ptr[column] make_fixed_width_column(
+        data_type type,
+        size_type size,
+        device_buffer mask,
+        size_type null_count) except +
+
+    cdef unique_ptr[column] make_column_from_scalar(const scalar& s,
+                                                    size_type size) except +
+
+    cdef unique_ptr[column] make_dictionary_from_scalar(const scalar& s,
+                                                        size_type size) except +
+
+    cdef unique_ptr[column] make_empty_column(type_id id) except +
+    cdef unique_ptr[column] make_empty_column(data_type type_) except +
+
+    cdef unique_ptr[column] make_dictionary_column(
+        unique_ptr[column] keys_column,
+        unique_ptr[column] indices_column) except +
diff --git a/python/cudf/cudf/_lib/pylibcudf/types.pxd b/python/cudf/cudf/_lib/pylibcudf/types.pxd
index e54a259819e..7d3ddca14a1 100644
--- a/python/cudf/cudf/_lib/pylibcudf/types.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/types.pxd
@@ -13,6 +13,7 @@ from cudf._lib.pylibcudf.libcudf.types cimport (
     null_order,
     null_policy,
     order,
+    size_type,
     sorted,
     type_id,
 )
diff --git a/python/cudf/cudf/_lib/pylibcudf/types.pyx b/python/cudf/cudf/_lib/pylibcudf/types.pyx
index a5248ad0a1f..6dbb287f3c4 100644
--- a/python/cudf/cudf/_lib/pylibcudf/types.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/types.pyx
@@ -8,6 +8,7 @@ from cudf._lib.pylibcudf.libcudf.types import type_id as TypeId  # no-cython-lin
 from cudf._lib.pylibcudf.libcudf.types import nan_policy as NanPolicy  # no-cython-lint, isort:skip
 from cudf._lib.pylibcudf.libcudf.types import null_policy as NullPolicy  # no-cython-lint, isort:skip
 from cudf._lib.pylibcudf.libcudf.types import interpolation as Interpolation  # no-cython-lint, isort:skip
+from cudf._lib.pylibcudf.libcudf.types import mask_state as MaskState  # no-cython-lint, isort:skip
 from cudf._lib.pylibcudf.libcudf.types import nan_equality as NanEquality  # no-cython-lint, isort:skip
 from cudf._lib.pylibcudf.libcudf.types import null_equality as NullEquality  # no-cython-lint, isort:skip
 from cudf._lib.pylibcudf.libcudf.types import null_order as NullOrder  # no-cython-lint, isort:skip
@@ -22,7 +23,7 @@ cdef class DataType:
 
     Parameters
     ----------
-    id : TypeId
+    id : type_id
         The type's identifier
     scale : int
         The scale associated with the data. Only used for decimal data types.
diff --git a/python/cudf/cudf/pylibcudf_tests/test_column_factories.py b/python/cudf/cudf/pylibcudf_tests/test_column_factories.py
new file mode 100644
index 00000000000..4c05770a41f
--- /dev/null
+++ b/python/cudf/cudf/pylibcudf_tests/test_column_factories.py
@@ -0,0 +1,253 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pytest
+from utils import DEFAULT_STRUCT_TESTING_TYPE, assert_column_eq
+
+from cudf._lib import pylibcudf as plc
+
+EMPTY_COL_SIZE = 3
+
+NUMERIC_TYPES = [
+    pa.uint8(),
+    pa.uint16(),
+    pa.uint32(),
+    pa.uint64(),
+    pa.int8(),
+    pa.int16(),
+    pa.int32(),
+    pa.int64(),
+    pa.float32(),
+    pa.float64(),
+    pa.bool_(),
+]
+
+TIMESTAMP_TYPES = [
+    pa.timestamp("s"),
+    pa.timestamp("ms"),
+    pa.timestamp("us"),
+    pa.timestamp("ns"),
+]
+
+DURATION_TYPES = [
+    pa.duration("s"),
+    pa.duration("ms"),
+    pa.duration("us"),
+    pa.duration("ns"),
+]
+
+DECIMAL_TYPES = [pa.decimal128(38, 2)]
+
+STRING_TYPES = [pa.string()]
+STRUCT_TYPES = [DEFAULT_STRUCT_TESTING_TYPE]
+LIST_TYPES = [pa.list_(pa.int64())]
+
+ALL_TYPES = (
+    NUMERIC_TYPES
+    + TIMESTAMP_TYPES
+    + DURATION_TYPES
+    + STRING_TYPES
+    + DECIMAL_TYPES
+    + STRUCT_TYPES
+    + LIST_TYPES
+)
+
+
+@pytest.fixture(scope="module", params=NUMERIC_TYPES, ids=repr)
+def numeric_pa_type(request):
+    return request.param
+
+
+@pytest.fixture(
+    scope="module",
+    params=DECIMAL_TYPES,
+    ids=repr,
+)
+def fixed_point_pa_type(request):
+    return request.param
+
+
+@pytest.fixture(
+    scope="module",
+    params=TIMESTAMP_TYPES,
+    ids=repr,
+)
+def timestamp_pa_type(request):
+    return request.param
+
+
+@pytest.fixture(
+    scope="module",
+    params=DURATION_TYPES,
+    ids=repr,
+)
+def duration_pa_type(request):
+    return request.param
+
+
+@pytest.fixture(
+    scope="module",
+    params=[
+        plc.MaskState.UNALLOCATED,
+        plc.MaskState.ALL_VALID,
+        plc.MaskState.ALL_NULL,
+        plc.MaskState.UNINITIALIZED,
+    ],
+    ids=["unallocated", "all_valid", "all_null", "uninitialized"],
+)
+def mask_state(request):
+    return request.param
+
+
+def test_make_empty_column_dtype(pa_type):
+    pa_col = pa.array([], type=pa_type)
+
+    plc_type = plc.interop.from_arrow(pa_col).type()
+
+    if isinstance(pa_type, (pa.ListType, pa.StructType)):
+        with pytest.raises(ValueError):
+            plc.column_factories.make_empty_column(plc_type)
+        return
+
+    cudf_col = plc.column_factories.make_empty_column(plc_type)
+    assert_column_eq(cudf_col, pa_col)
+
+
+def test_make_empty_column_typeid(pa_type):
+    pa_col = pa.array([], type=pa_type)
+
+    tid = plc.interop.from_arrow(pa_col).type().id()
+
+    if isinstance(pa_type, (pa.ListType, pa.StructType)):
+        with pytest.raises(ValueError):
+            plc.column_factories.make_empty_column(tid)
+        return
+
+    cudf_col = plc.column_factories.make_empty_column(tid)
+    assert_column_eq(cudf_col, pa_col)
+
+
+def validate_empty_column(col, mask_state, dtype):
+    assert col.size() == EMPTY_COL_SIZE
+
+    if mask_state == plc.types.MaskState.UNALLOCATED:
+        assert col.null_count() == 0
+    elif mask_state == plc.types.MaskState.ALL_VALID:
+        assert col.null_count() == 0
+    elif mask_state == plc.types.MaskState.ALL_NULL:
+        assert col.null_count() == EMPTY_COL_SIZE
+
+    assert plc.interop.to_arrow(col).type == dtype
+
+
+def test_make_numeric_column(numeric_pa_type, mask_state):
+    plc_type = plc.interop.from_arrow(numeric_pa_type)
+
+    got = plc.column_factories.make_numeric_column(
+        plc_type, EMPTY_COL_SIZE, mask_state
+    )
+    validate_empty_column(got, mask_state, numeric_pa_type)
+
+
+@pytest.mark.parametrize(
+    "non_numeric_pa_type", [t for t in ALL_TYPES if t not in NUMERIC_TYPES]
+)
+def test_make_numeric_column_dtype_err(non_numeric_pa_type):
+    plc_type = plc.interop.from_arrow(non_numeric_pa_type)
+    with pytest.raises(ValueError):
+        plc.column_factories.make_numeric_column(
+            plc_type, 3, plc.types.MaskState.UNALLOCATED
+        )
+
+
+def test_make_numeric_column_negative_size_err(numeric_pa_type):
+    plc_type = plc.interop.from_arrow(numeric_pa_type)
+    with pytest.raises(RuntimeError):
+        plc.column_factories.make_numeric_column(
+            plc_type, -1, plc.types.MaskState.UNALLOCATED
+        )
+
+
+def test_make_fixed_point_column(fixed_point_pa_type, mask_state):
+    plc_type = plc.interop.from_arrow(fixed_point_pa_type)
+
+    got = plc.column_factories.make_fixed_point_column(
+        plc_type, EMPTY_COL_SIZE, mask_state
+    )
+
+    validate_empty_column(got, mask_state, fixed_point_pa_type)
+
+
+@pytest.mark.parametrize(
+    "non_fixed_point_pa_type", [t for t in ALL_TYPES if t not in DECIMAL_TYPES]
+)
+def test_make_fixed_point_column_dtype_err(non_fixed_point_pa_type):
+    plc_type = plc.interop.from_arrow(non_fixed_point_pa_type)
+    with pytest.raises(ValueError):
+        plc.column_factories.make_fixed_point_column(
+            plc_type, 3, plc.types.MaskState.UNALLOCATED
+        )
+
+
+def test_make_fixed_point_column_negative_size_err(fixed_point_pa_type):
+    plc_type = plc.interop.from_arrow(fixed_point_pa_type)
+    with pytest.raises(RuntimeError):
+        plc.column_factories.make_fixed_point_column(
+            plc_type, -1, plc.types.MaskState.UNALLOCATED
+        )
+
+
+def test_make_timestamp_column(timestamp_pa_type, mask_state):
+    plc_type = plc.interop.from_arrow(timestamp_pa_type)
+
+    got = plc.column_factories.make_timestamp_column(
+        plc_type, EMPTY_COL_SIZE, mask_state
+    )
+    validate_empty_column(got, mask_state, timestamp_pa_type)
+
+
+@pytest.mark.parametrize(
+    "non_timestamp_pa_type", [t for t in ALL_TYPES if t not in TIMESTAMP_TYPES]
+)
+def test_make_timestamp_column_dtype_err(non_timestamp_pa_type):
+    plc_type = plc.interop.from_arrow(non_timestamp_pa_type)
+    with pytest.raises(ValueError):
+        plc.column_factories.make_timestamp_column(
+            plc_type, 3, plc.types.MaskState.UNALLOCATED
+        )
+
+
+def test_make_timestamp_column_negative_size_err(timestamp_pa_type):
+    plc_type = plc.interop.from_arrow(timestamp_pa_type)
+    with pytest.raises(RuntimeError):
+        plc.column_factories.make_timestamp_column(
+            plc_type, -1, plc.types.MaskState.UNALLOCATED
+        )
+
+
+def test_make_duration_column(duration_pa_type, mask_state):
+    plc_type = plc.interop.from_arrow(duration_pa_type)
+
+    got = plc.column_factories.make_duration_column(
+        plc_type, EMPTY_COL_SIZE, mask_state
+    )
+    validate_empty_column(got, mask_state, duration_pa_type)
+
+
+@pytest.mark.parametrize(
+    "non_duration_pa_type", [t for t in ALL_TYPES if t not in DURATION_TYPES]
+)
+def test_make_duration_column_dtype_err(non_duration_pa_type):
+    plc_type = plc.interop.from_arrow(non_duration_pa_type)
+    with pytest.raises(ValueError):
+        plc.column_factories.make_duration_column(
+            plc_type, 3, plc.types.MaskState.UNALLOCATED
+        )
+
+
+def test_make_duration_column_negative_size_err(duration_pa_type):
+    plc_type = plc.interop.from_arrow(duration_pa_type)
+    with pytest.raises(RuntimeError):
+        plc.column_factories.make_duration_column(
+            plc_type, -1, plc.types.MaskState.UNALLOCATED
+        )
diff --git a/python/cudf/cudf/pylibcudf_tests/test_interop.py b/python/cudf/cudf/pylibcudf_tests/test_interop.py
new file mode 100644
index 00000000000..5c05f460e28
--- /dev/null
+++ b/python/cudf/cudf/pylibcudf_tests/test_interop.py
@@ -0,0 +1,69 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pytest
+
+import cudf._lib.pylibcudf as plc
+
+
+def test_list_dtype_roundtrip():
+    list_type = pa.list_(pa.int32())
+    plc_type = plc.interop.from_arrow(list_type)
+
+    assert plc_type == plc.types.DataType(plc.types.TypeId.LIST)
+
+    with pytest.raises(ValueError):
+        plc.interop.to_arrow(plc_type)
+
+    arrow_type = plc.interop.to_arrow(
+        plc_type, value_type=list_type.value_type
+    )
+    assert arrow_type == list_type
+
+
+def test_struct_dtype_roundtrip():
+    struct_type = pa.struct([("a", pa.int32()), ("b", pa.string())])
+    plc_type = plc.interop.from_arrow(struct_type)
+
+    assert plc_type == plc.types.DataType(plc.types.TypeId.STRUCT)
+
+    with pytest.raises(ValueError):
+        plc.interop.to_arrow(plc_type)
+
+    arrow_type = plc.interop.to_arrow(
+        plc_type,
+        fields=[struct_type.field(i) for i in range(struct_type.num_fields)],
+    )
+    assert arrow_type == struct_type
+
+
+def test_decimal128_roundtrip():
+    decimal_type = pa.decimal128(10, 2)
+    plc_type = plc.interop.from_arrow(decimal_type)
+
+    assert plc_type.id() == plc.types.TypeId.DECIMAL128
+
+    with pytest.raises(ValueError):
+        plc.interop.to_arrow(plc_type)
+
+    arrow_type = plc.interop.to_arrow(
+        plc_type, precision=decimal_type.precision
+    )
+    assert arrow_type == decimal_type
+
+
+@pytest.mark.parametrize(
+    "data_type",
+    [
+        plc.types.DataType(plc.types.TypeId.DECIMAL32),
+        plc.types.DataType(plc.types.TypeId.DECIMAL64),
+    ],
+)
+def test_decimal_other(data_type):
+    precision = 3
+
+    with pytest.raises(ValueError):
+        plc.interop.to_arrow(data_type)
+
+    arrow_type = plc.interop.to_arrow(data_type, precision=precision)
+    assert arrow_type == pa.decimal128(precision, 0)

From fc31aa3c4f99d6348e7c32a3e3c52c68b26ca700 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 4 Jun 2024 10:19:30 -0400
Subject: [PATCH 297/842] Add overflow check when converting large strings to
 lists columns (#15887)

Fixes a couple places where strings columns are converted to lists column as binary -- chars are represented as INT8.
Since lists columns only support `size_type` offsets type, this change will throw an error if the size of the chars exceeds max `size_type` values.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Mike Wilson (https://github.com/hyperbolic2346)
  - MithunR (https://github.com/mythrocks)

URL: https://github.com/rapidsai/cudf/pull/15887
---
 cpp/src/io/utilities/column_buffer.cpp |  4 ++++
 cpp/src/reshape/byte_cast.cu           | 11 ++++++++---
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp
index e5d4e1a360f..27fc53fbc9e 100644
--- a/cpp/src/io/utilities/column_buffer.cpp
+++ b/cpp/src/io/utilities/column_buffer.cpp
@@ -191,6 +191,10 @@ std::unique_ptr<column> make_column(column_buffer_base<string_policy>& buffer,
         auto data      = col_content.data.release();
         auto char_size = data->size();
 
+        CUDF_EXPECTS(char_size < static_cast<std::size_t>(std::numeric_limits<size_type>::max()),
+                     "Cannot convert strings column to lists column due to size_type limit",
+                     std::overflow_error);
+
         auto uint8_col = std::make_unique<column>(
           data_type{type_id::UINT8}, char_size, std::move(*data), rmm::device_buffer{}, 0);
 
diff --git a/cpp/src/reshape/byte_cast.cu b/cpp/src/reshape/byte_cast.cu
index 1b05a9744fa..3dfa0b65814 100644
--- a/cpp/src/reshape/byte_cast.cu
+++ b/cpp/src/reshape/byte_cast.cu
@@ -135,9 +135,14 @@ struct byte_list_conversion_fn<T, std::enable_if_t<std::is_same_v<T, cudf::strin
         input.size(), output_type, stream, mr);
     }
 
-    auto col_content     = std::make_unique<column>(input, stream, mr)->release();
-    auto const num_chars = col_content.data->size();
-    auto uint8_col       = std::make_unique<column>(
+    auto const num_chars = strings_column_view(input).chars_size(stream);
+    CUDF_EXPECTS(num_chars < static_cast<int64_t>(std::numeric_limits<size_type>::max()),
+                 "Cannot convert strings column to lists column due to size_type limit",
+                 std::overflow_error);
+
+    auto col_content = std::make_unique<column>(input, stream, mr)->release();
+
+    auto uint8_col = std::make_unique<column>(
       output_type, num_chars, std::move(*(col_content.data)), rmm::device_buffer{}, 0);
 
     auto result = make_lists_column(

From 54d49fcea4e7ad73df21f0dbfe99097c635b1023 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Tue, 4 Jun 2024 16:17:25 +0100
Subject: [PATCH 298/842] Ensure literals have correct dtype (#15890)

The polars schema tells us the dtype for any literals, but previously we were relying on pyarrow inference. Add pylibcudf to pyarrow datatype conversion utilities and use the resulting datatypes explicitly.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - https://github.com/brandon-b-miller
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/cudf/pull/15890
---
 python/cudf/cudf/_lib/pylibcudf/interop.pyx   |  3 +-
 python/cudf_polars/cudf_polars/dsl/expr.py    | 46 +++++++++++++------
 python/cudf_polars/cudf_polars/dsl/ir.py      | 10 ++--
 .../cudf_polars/cudf_polars/dsl/translate.py  |  9 ++--
 .../cudf_polars/cudf_polars/utils/dtypes.py   |  3 ++
 python/cudf_polars/pyproject.toml             |  2 +-
 python/cudf_polars/tests/__init__.py          |  6 +++
 .../cudf_polars/tests/expressions/__init__.py |  6 +++
 .../cudf_polars/tests/expressions/test_agg.py |  2 +-
 .../tests/expressions/test_distinct.py        | 36 +++++++++++++++
 python/cudf_polars/tests/test_scan.py         | 12 +----
 11 files changed, 102 insertions(+), 33 deletions(-)
 create mode 100644 python/cudf_polars/tests/__init__.py
 create mode 100644 python/cudf_polars/tests/expressions/__init__.py
 create mode 100644 python/cudf_polars/tests/expressions/test_distinct.py

diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pyx b/python/cudf/cudf/_lib/pylibcudf/interop.pyx
index 1e4102e4b64..07e9d1ead11 100644
--- a/python/cudf/cudf/_lib/pylibcudf/interop.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/interop.pyx
@@ -55,6 +55,7 @@ ARROW_TO_PYLIBCUDF_TYPES = {
     pa.timestamp('us'): type_id.TIMESTAMP_MICROSECONDS,
     pa.timestamp('ns'): type_id.TIMESTAMP_NANOSECONDS,
     pa.date32(): type_id.TIMESTAMP_DAYS,
+    pa.null(): type_id.EMPTY,
 }
 
 LIBCUDF_TO_ARROW_TYPES = {
@@ -245,7 +246,7 @@ def _to_arrow_datatype(cudf_object, **kwargs):
         return pa.list_(value_type)
     else:
         try:
-            return ARROW_TO_PYLIBCUDF_TYPES[cudf_object.id()]
+            return LIBCUDF_TO_ARROW_TYPES[cudf_object.id()]
         except KeyError:
             raise TypeError(
                 f"Unable to convert {cudf_object.id()} to arrow datatype"
diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py
index 249cc3775f7..7187a36f21c 100644
--- a/python/cudf_polars/cudf_polars/dsl/expr.py
+++ b/python/cudf_polars/cudf_polars/dsl/expr.py
@@ -484,32 +484,48 @@ def do_evaluate(
             return self._distinct(
                 column,
                 keep=plc.stream_compaction.DuplicateKeepOption.KEEP_FIRST,
-                source_value=plc.interop.from_arrow(pa.scalar(True)),  # noqa: FBT003
-                target_value=plc.interop.from_arrow(pa.scalar(False)),  # noqa: FBT003
+                source_value=plc.interop.from_arrow(
+                    pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype))
+                ),
+                target_value=plc.interop.from_arrow(
+                    pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype))
+                ),
             )
         elif self.name == pl_expr.BooleanFunction.IsLastDistinct:
             (column,) = columns
             return self._distinct(
                 column,
                 keep=plc.stream_compaction.DuplicateKeepOption.KEEP_LAST,
-                source_value=plc.interop.from_arrow(pa.scalar(True)),  # noqa: FBT003
-                target_value=plc.interop.from_arrow(pa.scalar(False)),  # noqa: FBT003
+                source_value=plc.interop.from_arrow(
+                    pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype))
+                ),
+                target_value=plc.interop.from_arrow(
+                    pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype))
+                ),
             )
         elif self.name == pl_expr.BooleanFunction.IsUnique:
             (column,) = columns
             return self._distinct(
                 column,
                 keep=plc.stream_compaction.DuplicateKeepOption.KEEP_NONE,
-                source_value=plc.interop.from_arrow(pa.scalar(True)),  # noqa: FBT003
-                target_value=plc.interop.from_arrow(pa.scalar(False)),  # noqa: FBT003
+                source_value=plc.interop.from_arrow(
+                    pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype))
+                ),
+                target_value=plc.interop.from_arrow(
+                    pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype))
+                ),
             )
         elif self.name == pl_expr.BooleanFunction.IsDuplicated:
             (column,) = columns
             return self._distinct(
                 column,
                 keep=plc.stream_compaction.DuplicateKeepOption.KEEP_NONE,
-                source_value=plc.interop.from_arrow(pa.scalar(False)),  # noqa: FBT003
-                target_value=plc.interop.from_arrow(pa.scalar(True)),  # noqa: FBT003
+                source_value=plc.interop.from_arrow(
+                    pa.scalar(value=False, type=plc.interop.to_arrow(self.dtype))
+                ),
+                target_value=plc.interop.from_arrow(
+                    pa.scalar(value=True, type=plc.interop.to_arrow(self.dtype))
+                ),
             )
         elif self.name == pl_expr.BooleanFunction.AllHorizontal:
             name = columns[0].name
@@ -717,7 +733,9 @@ def do_evaluate(
             bounds_policy = plc.copying.OutOfBoundsPolicy.NULLIFY
             obj = plc.replace.replace_nulls(
                 indices.obj,
-                plc.interop.from_arrow(pa.scalar(n), data_type=indices.obj.data_type()),
+                plc.interop.from_arrow(
+                    pa.scalar(n, type=plc.interop.to_arrow(indices.obj.data_type()))
+                ),
             )
         else:
             bounds_policy = plc.copying.OutOfBoundsPolicy.DONT_CHECK
@@ -893,11 +911,13 @@ def _reduce(
         )
 
     def _count(self, column: Column) -> Column:
-        # TODO: dtype handling
         return Column(
             plc.Column.from_scalar(
                 plc.interop.from_arrow(
-                    pa.scalar(column.obj.size() - column.obj.null_count()),
+                    pa.scalar(
+                        column.obj.size() - column.obj.null_count(),
+                        type=plc.interop.to_arrow(self.dtype),
+                    ),
                 ),
                 1,
             ),
@@ -909,7 +929,7 @@ def _min(self, column: Column, *, propagate_nans: bool) -> Column:
             return Column(
                 plc.Column.from_scalar(
                     plc.interop.from_arrow(
-                        pa.scalar(float("nan")), data_type=self.dtype
+                        pa.scalar(float("nan"), type=plc.interop.to_arrow(self.dtype))
                     ),
                     1,
                 ),
@@ -924,7 +944,7 @@ def _max(self, column: Column, *, propagate_nans: bool) -> Column:
             return Column(
                 plc.Column.from_scalar(
                     plc.interop.from_arrow(
-                        pa.scalar(float("nan")), data_type=self.dtype
+                        pa.scalar(float("nan"), type=plc.interop.to_arrow(self.dtype))
                     ),
                     1,
                 ),
diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py
index d630b40f600..f8441b793b5 100644
--- a/python/cudf_polars/cudf_polars/dsl/ir.py
+++ b/python/cudf_polars/cudf_polars/dsl/ir.py
@@ -146,9 +146,13 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
             assert_never(self.typ)
         if row_index is not None:
             name, offset = row_index
-            # TODO: dtype
-            step = plc.interop.from_arrow(pa.scalar(1))
-            init = plc.interop.from_arrow(pa.scalar(offset))
+            dtype = self.schema[name]
+            step = plc.interop.from_arrow(
+                pa.scalar(1, type=plc.interop.to_arrow(dtype))
+            )
+            init = plc.interop.from_arrow(
+                pa.scalar(offset, type=plc.interop.to_arrow(dtype))
+            )
             index = Column(
                 plc.filling.sequence(df.num_rows, init, step), name
             ).set_sorted(
diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py
index b3d0edf183f..9a301164beb 100644
--- a/python/cudf_polars/cudf_polars/dsl/translate.py
+++ b/python/cudf_polars/cudf_polars/dsl/translate.py
@@ -9,9 +9,11 @@
 from functools import singledispatch
 from typing import Any
 
+import pyarrow as pa
+
 from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir
 
-import cudf._lib.pylibcudf as plc  # noqa: TCH002, singledispatch register needs this name defined.
+import cudf._lib.pylibcudf as plc
 
 from cudf_polars.dsl import expr, ir
 from cudf_polars.utils import dtypes
@@ -295,7 +297,8 @@ def _(node: pl_expr.Window, visitor: Any, dtype: plc.DataType) -> expr.Expr:
 
 @_translate_expr.register
 def _(node: pl_expr.Literal, visitor: Any, dtype: plc.DataType) -> expr.Expr:
-    return expr.Literal(dtype, node.value)
+    value = pa.scalar(node.value, type=plc.interop.to_arrow(dtype))
+    return expr.Literal(dtype, value)
 
 
 @_translate_expr.register
@@ -337,7 +340,7 @@ def _(node: pl_expr.Cast, visitor: Any, dtype: plc.DataType) -> expr.Expr:
     inner = translate_expr(visitor, n=node.expr)
     # Push casts into literals so we can handle Cast(Literal(Null))
     if isinstance(inner, expr.Literal):
-        return expr.Literal(dtype, inner.value)
+        return expr.Literal(dtype, inner.value.cast(plc.interop.to_arrow(dtype)))
     else:
         return expr.Cast(dtype, inner)
 
diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py
index 51379433c03..bede0de3c9f 100644
--- a/python/cudf_polars/cudf_polars/utils/dtypes.py
+++ b/python/cudf_polars/cudf_polars/utils/dtypes.py
@@ -13,6 +13,8 @@
 
 import cudf._lib.pylibcudf as plc
 
+__all__ = ["from_polars"]
+
 
 @cache
 def from_polars(dtype: pl.DataType) -> plc.DataType:
@@ -84,6 +86,7 @@ def from_polars(dtype: pl.DataType) -> plc.DataType:
         # TODO: Hopefully
         return plc.DataType(plc.TypeId.EMPTY)
     elif isinstance(dtype, pl.List):
+        # TODO: This doesn't consider the value type.
         return plc.DataType(plc.TypeId.LIST)
     else:
         raise NotImplementedError(f"{dtype=} conversion not supported")
diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml
index 49ecd7080b9..e50ee76a9b9 100644
--- a/python/cudf_polars/pyproject.toml
+++ b/python/cudf_polars/pyproject.toml
@@ -134,7 +134,7 @@ ignore = [
 fixable = ["ALL"]
 
 [tool.ruff.lint.per-file-ignores]
-"**/tests/**/test_*.py" = ["D", "INP"]
+"**/tests/**/*.py" = ["D"]
 
 [tool.ruff.lint.flake8-pytest-style]
 # https://docs.astral.sh/ruff/settings/#lintflake8-pytest-style
diff --git a/python/cudf_polars/tests/__init__.py b/python/cudf_polars/tests/__init__.py
new file mode 100644
index 00000000000..4611d642f14
--- /dev/null
+++ b/python/cudf_polars/tests/__init__.py
@@ -0,0 +1,6 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+__all__: list[str] = []
diff --git a/python/cudf_polars/tests/expressions/__init__.py b/python/cudf_polars/tests/expressions/__init__.py
new file mode 100644
index 00000000000..4611d642f14
--- /dev/null
+++ b/python/cudf_polars/tests/expressions/__init__.py
@@ -0,0 +1,6 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+__all__: list[str] = []
diff --git a/python/cudf_polars/tests/expressions/test_agg.py b/python/cudf_polars/tests/expressions/test_agg.py
index c792ae64f74..645dbd26140 100644
--- a/python/cudf_polars/tests/expressions/test_agg.py
+++ b/python/cudf_polars/tests/expressions/test_agg.py
@@ -56,7 +56,7 @@ def test_agg(df, agg):
     q = df.select(expr)
 
     # https://github.com/rapidsai/cudf/issues/15852
-    check_dtype = agg not in {"count", "n_unique", "median"}
+    check_dtype = agg not in {"n_unique", "median"}
     if not check_dtype and q.schema["a"] != pl.Float64:
         with pytest.raises(AssertionError):
             assert_gpu_result_equal(q)
diff --git a/python/cudf_polars/tests/expressions/test_distinct.py b/python/cudf_polars/tests/expressions/test_distinct.py
new file mode 100644
index 00000000000..22865a7ce22
--- /dev/null
+++ b/python/cudf_polars/tests/expressions/test_distinct.py
@@ -0,0 +1,36 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import pytest
+
+import polars as pl
+
+from cudf_polars.testing.asserts import assert_gpu_result_equal
+
+
+@pytest.fixture(params=[False, True], ids=["no-nulls", "nulls"])
+def nullable(request):
+    return request.param
+
+
+@pytest.fixture(
+    params=["is_first_distinct", "is_last_distinct", "is_unique", "is_duplicated"]
+)
+def op(request):
+    return request.param
+
+
+@pytest.fixture
+def df(nullable):
+    values: list[int | None] = [1, 2, 3, 1, 1, 7, 3, 2, 7, 8, 1]
+    if nullable:
+        values[1] = None
+        values[4] = None
+    return pl.LazyFrame({"a": values})
+
+
+def test_expr_distinct(df, op):
+    expr = getattr(pl.col("a"), op)()
+    query = df.select(expr)
+    assert_gpu_result_equal(query)
diff --git a/python/cudf_polars/tests/test_scan.py b/python/cudf_polars/tests/test_scan.py
index b75e1bdef10..b2443e357e2 100644
--- a/python/cudf_polars/tests/test_scan.py
+++ b/python/cudf_polars/tests/test_scan.py
@@ -10,17 +10,7 @@
 
 
 @pytest.fixture(
-    params=[
-        (None, None),
-        pytest.param(
-            ("row-index", 0),
-            marks=pytest.mark.xfail(reason="Incorrect dtype for row index"),
-        ),
-        pytest.param(
-            ("index", 10),
-            marks=pytest.mark.xfail(reason="Incorrect dtype for row index"),
-        ),
-    ],
+    params=[(None, None), ("row-index", 0), ("index", 10)],
     ids=["no-row-index", "zero-offset-row-index", "offset-row-index"],
 )
 def row_index(request):

From faf39299ebf178ee10971e4222c534f00d035b6d Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 4 Jun 2024 08:52:51 -1000
Subject: [PATCH 299/842] Make Frame.astype return Self instead of a
 ColumnAccessor (#15861)

Allows simplification for it's subclasses (`IndexFrame.astype`, `Index.astype`)

Also minor cleanups in the `equals` method

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/15861
---
 python/cudf/cudf/core/_base_index.py   |  2 +-
 python/cudf/cudf/core/dataframe.py     |  2 +-
 python/cudf/cudf/core/frame.py         | 23 ++++++-----------------
 python/cudf/cudf/core/index.py         | 22 ++++++++++++++--------
 python/cudf/cudf/core/indexed_frame.py | 14 +++++---------
 5 files changed, 27 insertions(+), 36 deletions(-)

diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py
index e6868ae3431..baca7b19e58 100644
--- a/python/cudf/cudf/core/_base_index.py
+++ b/python/cudf/cudf/core/_base_index.py
@@ -149,7 +149,7 @@ def ndim(self) -> int:  # noqa: D401
         """Number of dimensions of the underlying data, by definition 1."""
         return 1
 
-    def equals(self, other):
+    def equals(self, other) -> bool:
         """
         Determine if two Index objects contain the same elements.
 
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index acfc2d781a7..0fc36fa80e4 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -2590,7 +2590,7 @@ def items(self):
             yield (k, self[k])
 
     @_cudf_nvtx_annotate
-    def equals(self, other):
+    def equals(self, other) -> bool:
         ret = super().equals(other)
         # If all other checks matched, validate names.
         if ret:
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index d60c206ac24..7326696c994 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -273,20 +273,13 @@ def __len__(self) -> int:
         return self._num_rows
 
     @_cudf_nvtx_annotate
-    def astype(self, dtype, copy: bool = False):
-        result_data = {
-            col_name: col.astype(dtype.get(col_name, col.dtype), copy=copy)
+    def astype(self, dtype: dict[Any, Dtype], copy: bool = False) -> Self:
+        casted = (
+            col.astype(dtype.get(col_name, col.dtype), copy=copy)
             for col_name, col in self._data.items()
-        }
-
-        return ColumnAccessor(
-            data=result_data,
-            multiindex=self._data.multiindex,
-            level_names=self._data.level_names,
-            rangeindex=self._data.rangeindex,
-            label_dtype=self._data.label_dtype,
-            verify=False,
         )
+        ca = self._data._from_columns_like_self(casted, verify=False)
+        return self._from_data_like_self(ca)
 
     @_cudf_nvtx_annotate
     def equals(self, other) -> bool:
@@ -349,11 +342,7 @@ def equals(self, other) -> bool:
         """
         if self is other:
             return True
-        if (
-            other is None
-            or not isinstance(other, type(self))
-            or len(self) != len(other)
-        ):
+        if not isinstance(other, type(self)) or len(self) != len(other):
             return False
 
         return all(
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 2a75b374a1e..9b4c5473438 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -445,7 +445,7 @@ def __getitem__(self, index):
         return self._as_int_index()[index]
 
     @_cudf_nvtx_annotate
-    def equals(self, other):
+    def equals(self, other) -> bool:
         if isinstance(other, RangeIndex):
             return self._range == other._range
         return self._as_int_index().equals(other)
@@ -1058,6 +1058,16 @@ def _from_data(cls, data: MutableMapping, name: Any = no_default) -> Self:
             out.name = name
         return out
 
+    @classmethod
+    @_cudf_nvtx_annotate
+    def _from_data_like_self(
+        cls, data: MutableMapping, name: Any = no_default
+    ) -> Self:
+        out = _index_from_data(data, name)
+        if name is not no_default:
+            out.name = name
+        return out
+
     @classmethod
     @_cudf_nvtx_annotate
     def from_arrow(cls, obj):
@@ -1180,12 +1190,8 @@ def is_unique(self):
         return self._column.is_unique
 
     @_cudf_nvtx_annotate
-    def equals(self, other):
-        if (
-            other is None
-            or not isinstance(other, BaseIndex)
-            or len(self) != len(other)
-        ):
+    def equals(self, other) -> bool:
+        if not isinstance(other, BaseIndex) or len(self) != len(other):
             return False
 
         check_dtypes = False
@@ -1231,7 +1237,7 @@ def copy(self, name=None, deep=False):
 
     @_cudf_nvtx_annotate
     def astype(self, dtype, copy: bool = True):
-        return _index_from_data(super().astype({self.name: dtype}, copy))
+        return super().astype({self.name: dtype}, copy)
 
     @_cudf_nvtx_annotate
     def get_indexer(self, target, method=None, limit=None, tolerance=None):
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index a31430e1571..5a466f20f8c 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -625,10 +625,8 @@ def copy(self, deep: bool = True) -> Self:
         )
 
     @_cudf_nvtx_annotate
-    def equals(self, other):  # noqa: D102
-        if not super().equals(other):
-            return False
-        return self.index.equals(other.index)
+    def equals(self, other) -> bool:  # noqa: D102
+        return super().equals(other) and self.index.equals(other.index)
 
     @property
     def index(self):
@@ -4896,10 +4894,10 @@ def repeat(self, repeats, axis=None):
 
     def astype(
         self,
-        dtype,
+        dtype: dict[Any, Dtype],
         copy: bool = False,
         errors: Literal["raise", "ignore"] = "raise",
-    ):
+    ) -> Self:
         """Cast the object to the given dtype.
 
         Parameters
@@ -5010,14 +5008,12 @@ def astype(
             raise ValueError("invalid error value specified")
 
         try:
-            data = super().astype(dtype, copy)
+            return super().astype(dtype, copy)
         except Exception as e:
             if errors == "raise":
                 raise e
             return self
 
-        return self._from_data(data, index=self.index)
-
     @_cudf_nvtx_annotate
     def drop(
         self,

From fe7412915a289e7a9469040ada1dcf74cda2c4d6 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 4 Jun 2024 08:56:25 -1000
Subject: [PATCH 300/842] Make Column.to_pandas return Index instead of Series
 (#15833)

Column.to_pandas backs `Index.to_pandas`/`Series.to_pandas`/`DataFrame.to_pandas` and returned a `pandas.Series`; however, the `index` of this `pandas.Series` was not strictly necessary for `Index.to_pandas` and `DataFrame.to_pandas`.

Additionally, `pandas.Index` is 1D-like like `Column` and provides a better mental model to `to_pandas` conversion.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15833
---
 python/cudf/cudf/core/column/categorical.py   |  7 ++-
 python/cudf/cudf/core/column/column.py        | 13 ++----
 python/cudf/cudf/core/column/datetime.py      | 20 ++-------
 python/cudf/cudf/core/column/interval.py      | 15 ++-----
 python/cudf/cudf/core/column/lists.py         | 20 ++-------
 python/cudf/cudf/core/column/numerical.py     | 17 +++----
 python/cudf/cudf/core/column/string.py        | 17 ++-----
 python/cudf/cudf/core/column/struct.py        | 19 ++------
 python/cudf/cudf/core/dataframe.py            |  4 +-
 python/cudf/cudf/core/index.py                | 45 ++++---------------
 python/cudf/cudf/core/series.py               |  8 ++--
 .../cudf/tests/test_cuda_array_interface.py   |  4 +-
 12 files changed, 46 insertions(+), 143 deletions(-)

diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index 0ff8209dcd4..1828c5ce97b 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -789,12 +789,11 @@ def __cuda_array_interface__(self) -> Mapping[str, Any]:
     def to_pandas(
         self,
         *,
-        index: Optional[pd.Index] = None,
         nullable: bool = False,
         arrow_type: bool = False,
-    ) -> pd.Series:
+    ) -> pd.Index:
         if nullable:
-            raise NotImplementedError(f"{nullable=} is not implemented.")
+            return super().to_pandas(nullable=nullable, arrow_type=arrow_type)
         elif arrow_type:
             raise NotImplementedError(f"{arrow_type=} is not implemented.")
 
@@ -828,7 +827,7 @@ def to_pandas(
         data = pd.Categorical.from_codes(
             codes, categories=cats.to_pandas(), ordered=col.ordered
         )
-        return pd.Series(data, index=index)
+        return pd.Index(data)
 
     def to_arrow(self) -> pa.Array:
         """Convert to PyArrow Array."""
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 59bae179497..68079371b85 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -190,10 +190,9 @@ def __repr__(self):
     def to_pandas(
         self,
         *,
-        index: Optional[pd.Index] = None,
         nullable: bool = False,
         arrow_type: bool = False,
-    ) -> pd.Series:
+    ) -> pd.Index:
         """Convert object to pandas type.
 
         The default implementation falls back to PyArrow for the conversion.
@@ -208,15 +207,9 @@ def to_pandas(
             raise NotImplementedError(f"{nullable=} is not implemented.")
         pa_array = self.to_arrow()
         if arrow_type:
-            return pd.Series(
-                pd.arrays.ArrowExtensionArray(pa_array), index=index
-            )
+            return pd.Index(pd.arrays.ArrowExtensionArray(pa_array))
         else:
-            pd_series = pa_array.to_pandas()
-
-            if index is not None:
-                pd_series.index = index
-            return pd_series
+            return pd.Index(pa_array.to_pandas())
 
     @property
     def values_host(self) -> "np.ndarray":
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index 27f31c8f500..057169aa7e1 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -840,27 +840,15 @@ def __init__(
     def to_pandas(
         self,
         *,
-        index: Optional[pd.Index] = None,
         nullable: bool = False,
         arrow_type: bool = False,
-    ) -> pd.Series:
-        if arrow_type and nullable:
-            raise ValueError(
-                f"{arrow_type=} and {nullable=} cannot both be set."
-            )
-        elif nullable:
-            raise NotImplementedError(f"{nullable=} is not implemented.")
-        elif arrow_type:
-            return pd.Series(
-                pd.arrays.ArrowExtensionArray(self.to_arrow()), index=index
-            )
+    ) -> pd.Index:
+        if arrow_type or nullable:
+            return super().to_pandas(nullable=nullable, arrow_type=arrow_type)
         else:
-            series = self._local_time.to_pandas().dt.tz_localize(
+            return self._local_time.to_pandas().tz_localize(
                 self.dtype.tz, ambiguous="NaT", nonexistent="NaT"
             )
-            if index is not None:
-                series.index = index
-            return series
 
     def to_arrow(self):
         return pa.compute.assume_timezone(
diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py
index 7bd693966dc..f24ca3fdad1 100644
--- a/python/cudf/cudf/core/column/interval.py
+++ b/python/cudf/cudf/core/column/interval.py
@@ -1,6 +1,4 @@
 # Copyright (c) 2018-2024, NVIDIA CORPORATION.
-from typing import Optional
-
 import pandas as pd
 import pyarrow as pa
 
@@ -109,28 +107,21 @@ def as_interval_column(self, dtype):
     def to_pandas(
         self,
         *,
-        index: Optional[pd.Index] = None,
         nullable: bool = False,
         arrow_type: bool = False,
-    ) -> pd.Series:
+    ) -> pd.Index:
         # Note: This does not handle null values in the interval column.
         # However, this exact sequence (calling __from_arrow__ on the output of
         # self.to_arrow) is currently the best known way to convert interval
         # types into pandas (trying to convert the underlying numerical columns
         # directly is problematic), so we're stuck with this for now.
-        if arrow_type and nullable:
-            raise ValueError(
-                f"{arrow_type=} and {nullable=} cannot both be set."
-            )
         if nullable:
-            raise NotImplementedError(f"{nullable=} is not implemented.")
+            return super().to_pandas(nullable=nullable, arrow_type=arrow_type)
         elif arrow_type:
             raise NotImplementedError(f"{arrow_type=} is not implemented.")
 
         pd_type = self.dtype.to_pandas()
-        return pd.Series(
-            pd_type.__from_arrow__(self.to_arrow()), index=index, dtype=pd_type
-        )
+        return pd.Index(pd_type.__from_arrow__(self.to_arrow()), dtype=pd_type)
 
     def element_indexing(self, index: int):
         result = super().element_indexing(index)
diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py
index 1c2bcbef2ec..8f8ee46c796 100644
--- a/python/cudf/cudf/core/column/lists.py
+++ b/python/cudf/cudf/core/column/lists.py
@@ -292,25 +292,13 @@ def _transform_leaves(self, func, *args, **kwargs) -> Self:
     def to_pandas(
         self,
         *,
-        index: Optional[pd.Index] = None,
         nullable: bool = False,
         arrow_type: bool = False,
-    ) -> pd.Series:
-        # Can't rely on Column.to_pandas implementation for lists.
-        # Need to perform `to_pylist` to preserve list types.
-        if arrow_type and nullable:
-            raise ValueError(
-                f"{arrow_type=} and {nullable=} cannot both be set."
-            )
-        if nullable:
-            raise NotImplementedError(f"{nullable=} is not implemented.")
-        pa_array = self.to_arrow()
-        if arrow_type:
-            return pd.Series(
-                pd.arrays.ArrowExtensionArray(pa_array), index=index
-            )
+    ) -> pd.Index:
+        if arrow_type or nullable:
+            return super().to_pandas(nullable=nullable, arrow_type=arrow_type)
         else:
-            return pd.Series(pa_array.tolist(), dtype="object", index=index)
+            return pd.Index(self.to_arrow().tolist(), dtype="object")
 
 
 class ListMethods(ColumnMethods):
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index bab862f775f..fb413959eb9 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -674,18 +674,13 @@ def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase:
     def to_pandas(
         self,
         *,
-        index: Optional[pd.Index] = None,
         nullable: bool = False,
         arrow_type: bool = False,
-    ) -> pd.Series:
+    ) -> pd.Index:
         if arrow_type and nullable:
-            raise ValueError(
-                f"{arrow_type=} and {nullable=} cannot both be set."
-            )
+            return super().to_pandas(nullable=nullable, arrow_type=arrow_type)
         elif arrow_type:
-            return pd.Series(
-                pd.arrays.ArrowExtensionArray(self.to_arrow()), index=index
-            )
+            return super().to_pandas(nullable=nullable, arrow_type=arrow_type)
         elif (
             nullable
             and (
@@ -697,11 +692,11 @@ def to_pandas(
         ):
             arrow_array = self.to_arrow()
             pandas_array = pandas_nullable_dtype.__from_arrow__(arrow_array)  # type: ignore[attr-defined]
-            return pd.Series(pandas_array, copy=False, index=index)
+            return pd.Index(pandas_array, copy=False)
         elif self.dtype.kind in set("iuf") and not self.has_nulls():
-            return pd.Series(self.values_host, copy=False, index=index)
+            return pd.Index(self.values_host, copy=False)
         else:
-            return super().to_pandas(index=index, nullable=nullable)
+            return super().to_pandas(nullable=nullable, arrow_type=arrow_type)
 
     def _reduction_result_dtype(self, reduction_op: str) -> Dtype:
         col_dtype = self.dtype
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 40e58e14612..fd98d0dc163 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -5783,23 +5783,14 @@ def values(self) -> cupy.ndarray:
     def to_pandas(
         self,
         *,
-        index: Optional[pd.Index] = None,
         nullable: bool = False,
         arrow_type: bool = False,
-    ) -> pd.Series:
-        if arrow_type and nullable:
-            raise ValueError(
-                f"{arrow_type=} and {nullable=} cannot both be set."
-            )
-        if arrow_type:
-            return pd.Series(
-                pd.arrays.ArrowExtensionArray(self.to_arrow()), index=index
-            )
-        elif nullable:
+    ) -> pd.Index:
+        if nullable and not arrow_type:
             pandas_array = pd.StringDtype().__from_arrow__(self.to_arrow())
-            return pd.Series(pandas_array, copy=False, index=index)
+            return pd.Index(pandas_array, copy=False)
         else:
-            return super().to_pandas(index=index, nullable=nullable)
+            return super().to_pandas(nullable=nullable, arrow_type=arrow_type)
 
     def can_cast_safely(self, to_dtype: Dtype) -> bool:
         to_dtype = cudf.api.types.dtype(to_dtype)
diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py
index 1b2ffcc2700..6dd35570b95 100644
--- a/python/cudf/cudf/core/column/struct.py
+++ b/python/cudf/cudf/core/column/struct.py
@@ -2,7 +2,6 @@
 from __future__ import annotations
 
 from functools import cached_property
-from typing import Optional
 
 import pandas as pd
 import pyarrow as pa
@@ -60,25 +59,15 @@ def to_arrow(self):
     def to_pandas(
         self,
         *,
-        index: Optional[pd.Index] = None,
         nullable: bool = False,
         arrow_type: bool = False,
-    ) -> pd.Series:
+    ) -> pd.Index:
         # We cannot go via Arrow's `to_pandas` because of the following issue:
         # https://issues.apache.org/jira/browse/ARROW-12680
-        if arrow_type and nullable:
-            raise ValueError(
-                f"{arrow_type=} and {nullable=} cannot both be set."
-            )
-        elif nullable:
-            raise NotImplementedError(f"{nullable=} is not implemented.")
-        pa_array = self.to_arrow()
-        if arrow_type:
-            return pd.Series(
-                pd.arrays.ArrowExtensionArray(pa_array), index=index
-            )
+        if arrow_type or nullable:
+            return super().to_pandas(nullable=nullable, arrow_type=arrow_type)
         else:
-            return pd.Series(pa_array.tolist(), dtype="object", index=index)
+            return pd.Index(self.to_arrow().tolist(), dtype="object")
 
     @cached_property
     def memory_usage(self):
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 0fc36fa80e4..4c55b5427de 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -5321,9 +5321,7 @@ def to_pandas(
         """
         out_index = self.index.to_pandas()
         out_data = {
-            i: col.to_pandas(
-                index=out_index, nullable=nullable, arrow_type=arrow_type
-            )
+            i: col.to_pandas(nullable=nullable, arrow_type=arrow_type)
             for i, col in enumerate(self._data.columns)
         }
 
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 9b4c5473438..4b09765fa46 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -1568,10 +1568,11 @@ def any(self):
     def to_pandas(
         self, *, nullable: bool = False, arrow_type: bool = False
     ) -> pd.Index:
-        return pd.Index(
-            self._values.to_pandas(nullable=nullable, arrow_type=arrow_type),
-            name=self.name,
+        result = self._column.to_pandas(
+            nullable=nullable, arrow_type=arrow_type
         )
+        result.name = self.name
+        return result
 
     def append(self, other):
         if is_list_like(other):
@@ -2191,23 +2192,10 @@ def isocalendar(self):
     def to_pandas(
         self, *, nullable: bool = False, arrow_type: bool = False
     ) -> pd.DatetimeIndex:
-        if arrow_type and nullable:
-            raise ValueError(
-                f"{arrow_type=} and {nullable=} cannot both be set."
-            )
-        elif nullable:
-            raise NotImplementedError(f"{nullable=} is not implemented.")
-
-        result = self._values.to_pandas(arrow_type=arrow_type)
-        if arrow_type:
-            return pd.Index(result, name=self.name)
-        else:
-            freq = (
-                self._freq._maybe_as_fast_pandas_offset()
-                if self._freq is not None
-                else None
-            )
-            return pd.DatetimeIndex(result, name=self.name, freq=freq)
+        result = super().to_pandas(nullable=nullable, arrow_type=arrow_type)
+        if not arrow_type and self._freq is not None:
+            result.freq = self._freq._maybe_as_fast_pandas_offset()
+        return result
 
     @_cudf_nvtx_annotate
     def _get_dt_field(self, field):
@@ -2527,23 +2515,6 @@ def __getitem__(self, index):
             return pd.Timedelta(value)
         return value
 
-    @_cudf_nvtx_annotate
-    def to_pandas(
-        self, *, nullable: bool = False, arrow_type: bool = False
-    ) -> pd.TimedeltaIndex:
-        if arrow_type and nullable:
-            raise ValueError(
-                f"{arrow_type=} and {nullable=} cannot both be set."
-            )
-        elif nullable:
-            raise NotImplementedError(f"{nullable=} is not implemented.")
-
-        result = self._values.to_pandas(arrow_type=arrow_type)
-        if arrow_type:
-            return pd.Index(result, name=self.name)
-        else:
-            return pd.TimedeltaIndex(result, name=self.name)
-
     @property  # type: ignore
     @_cudf_nvtx_annotate
     def days(self):
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index a5b204ef346..169f7c11cf9 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -2022,11 +2022,11 @@ def to_pandas(
             index = self.index.to_pandas()
         else:
             index = None  # type: ignore[assignment]
-        s = self._column.to_pandas(
-            index=index, nullable=nullable, arrow_type=arrow_type
+        return pd.Series(
+            self._column.to_pandas(nullable=nullable, arrow_type=arrow_type),
+            index=index,
+            name=self.name,
         )
-        s.name = self.name
-        return s
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
diff --git a/python/cudf/cudf/tests/test_cuda_array_interface.py b/python/cudf/cudf/tests/test_cuda_array_interface.py
index f98c3ad0475..06d63561fc1 100644
--- a/python/cudf/cudf/tests/test_cuda_array_interface.py
+++ b/python/cudf/cudf/tests/test_cuda_array_interface.py
@@ -175,12 +175,12 @@ def test_column_from_ephemeral_cupy_try_lose_reference():
     a = cudf.Series(cupy.asarray([1, 2, 3]))._column
     a = cudf.core.column.as_column(a)
     b = cupy.asarray([1, 1, 1])  # noqa: F841
-    assert_eq(pd.Series([1, 2, 3]), a.to_pandas())
+    assert_eq(pd.Index([1, 2, 3]), a.to_pandas())
 
     a = cudf.Series(cupy.asarray([1, 2, 3]))._column
     a.name = "b"
     b = cupy.asarray([1, 1, 1])  # noqa: F841
-    assert_eq(pd.Series([1, 2, 3]), a.to_pandas())
+    assert_eq(pd.Index([1, 2, 3]), a.to_pandas())
 
 
 @pytest.mark.xfail(

From 22ef0634f07f7b40d718e80bed176e88ac734ebe Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 4 Jun 2024 14:58:11 -1000
Subject: [PATCH 301/842] Remove internal usage of core.index.as_index in favor
 of cudf.Index (#15851)

`cudf.Index.__init__` essentially calls `as_index` immediately internally. To avoid both from potentially diverging, the public `cudf.Index` should be preferred to ensure the public behaviors are used internally

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15851
---
 python/cudf/cudf/core/algorithms.py           |  4 +-
 python/cudf/cudf/core/column/methods.py       |  4 +-
 python/cudf/cudf/core/column/string.py        |  4 +-
 python/cudf/cudf/core/cut.py                  |  4 +-
 python/cudf/cudf/core/dataframe.py            | 36 +++++++--------
 python/cudf/cudf/core/dtypes.py               |  4 +-
 python/cudf/cudf/core/groupby/groupby.py      |  6 +--
 python/cudf/cudf/core/index.py                | 30 +++++--------
 python/cudf/cudf/core/indexed_frame.py        |  4 +-
 python/cudf/cudf/core/multiindex.py           |  7 +--
 python/cudf/cudf/core/series.py               |  8 ++--
 python/cudf/cudf/core/tools/datetimes.py      |  5 +--
 python/cudf/cudf/tests/test_array_function.py |  4 +-
 python/cudf/cudf/tests/test_binops.py         | 31 +++++++------
 python/cudf/cudf/tests/test_contains.py       |  6 +--
 python/cudf/cudf/tests/test_dlpack.py         |  2 +-
 python/cudf/cudf/tests/test_index.py          | 44 ++++++++-----------
 python/cudf/cudf/tests/test_multiindex.py     |  7 +--
 python/cudf/cudf/tests/test_string.py         | 38 ++++++++--------
 .../cudf/cudf/tests/text/test_text_methods.py |  8 ++--
 20 files changed, 116 insertions(+), 140 deletions(-)

diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py
index 272abdece9e..51a32e29886 100644
--- a/python/cudf/cudf/core/algorithms.py
+++ b/python/cudf/cudf/core/algorithms.py
@@ -6,7 +6,7 @@
 
 from cudf.core.column import as_column
 from cudf.core.copy_types import BooleanMask
-from cudf.core.index import RangeIndex, as_index
+from cudf.core.index import Index, RangeIndex
 from cudf.core.indexed_frame import IndexedFrame
 from cudf.core.scalar import Scalar
 from cudf.options import get_option
@@ -107,7 +107,7 @@ def factorize(values, sort=False, use_na_sentinel=True, size_hint=None):
         dtype="int64" if get_option("mode.pandas_compatible") else None,
     ).values
 
-    return labels, cats.values if return_cupy_array else as_index(cats)
+    return labels, cats.values if return_cupy_array else Index(cats)
 
 
 def _linear_interpolation(column, index=None):
diff --git a/python/cudf/cudf/core/column/methods.py b/python/cudf/cudf/core/column/methods.py
index e827c7a3dd3..7f7355c571a 100644
--- a/python/cudf/cudf/core/column/methods.py
+++ b/python/cudf/cudf/core/column/methods.py
@@ -93,8 +93,6 @@ def _return_or_inplace(
                 else:
                     return cudf.Series(new_col, name=self._parent.name)
             elif isinstance(self._parent, cudf.BaseIndex):
-                return cudf.core.index.as_index(
-                    new_col, name=self._parent.name
-                )
+                return cudf.Index(new_col, name=self._parent.name)
             else:
                 return self._parent._mimic_inplace(new_col, inplace=False)
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index fd98d0dc163..d12aa80e9a3 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -4391,7 +4391,7 @@ def code_points(self) -> SeriesOrIndex:
         if isinstance(self._parent, cudf.Series):
             return cudf.Series(new_col, name=self._parent.name)
         elif isinstance(self._parent, cudf.BaseIndex):
-            return cudf.core.index.as_index(new_col, name=self._parent.name)
+            return cudf.Index(new_col, name=self._parent.name)
         else:
             return new_col
 
@@ -4706,7 +4706,7 @@ def character_tokenize(self) -> SeriesOrIndex:
             index = self._parent.index.repeat(lengths)
             return cudf.Series(result_col, name=self._parent.name, index=index)
         elif isinstance(self._parent, cudf.BaseIndex):
-            return cudf.core.index.as_index(result_col, name=self._parent.name)
+            return cudf.Index(result_col, name=self._parent.name)
         else:
             return result_col
 
diff --git a/python/cudf/cudf/core/cut.py b/python/cudf/cudf/core/cut.py
index ccf730c91fb..54c5e829e8a 100644
--- a/python/cudf/cudf/core/cut.py
+++ b/python/cudf/cudf/core/cut.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2023, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
 from collections import abc
 
@@ -292,7 +292,7 @@ def cut(
     )
 
     # we return a categorical index, as we don't have a Categorical method
-    categorical_index = cudf.core.index.as_index(col)
+    categorical_index = cudf.Index(col)
 
     if isinstance(orig_x, (pd.Series, cudf.Series)):
         # if we have a series input we return a series output
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 4c55b5427de..c8f1e872300 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -712,7 +712,7 @@ def __init__(
                     data = data.reindex(index)
                     index = data.index
                 else:
-                    index = as_index(index)
+                    index = cudf.Index(index)
             else:
                 index = data.index
 
@@ -761,7 +761,7 @@ def __init__(
             if index is None:
                 self._index = RangeIndex(0)
             else:
-                self._index = as_index(index)
+                self._index = cudf.Index(index)
             if columns is not None:
                 rangeindex = isinstance(
                     columns, (range, pd.RangeIndex, cudf.RangeIndex)
@@ -875,7 +875,7 @@ def _init_from_series_list(self, data, columns, index):
             # When `index` is `None`, the final index of
             # resulting dataframe will be union of
             # all Series's names.
-            final_index = as_index(_get_union_of_series_names(data))
+            final_index = cudf.Index(_get_union_of_series_names(data))
         else:
             # When an `index` is passed, the final index of
             # resulting dataframe will be whatever
@@ -919,7 +919,7 @@ def _init_from_series_list(self, data, columns, index):
                         f"not match length of index ({index_length})"
                     )
 
-            final_index = as_index(index)
+            final_index = cudf.Index(index)
 
         series_lengths = list(map(len, data))
         data = numeric_normalize_types(*data)
@@ -943,7 +943,7 @@ def _init_from_series_list(self, data, columns, index):
             # Setting `final_columns` to self._index so
             # that the resulting `transpose` will be have
             # columns set to `final_columns`
-            self._index = as_index(final_columns)
+            self._index = cudf.Index(final_columns)
 
             transpose = self.T
         else:
@@ -987,9 +987,9 @@ def _init_from_list_like(self, data, index=None, columns=None):
         if index is None:
             index = RangeIndex(start=0, stop=len(data))
         else:
-            index = as_index(index)
+            index = cudf.Index(index)
 
-        self._index = as_index(index)
+        self._index = cudf.Index(index)
         # list-of-dicts case
         if len(data) > 0 and isinstance(data[0], dict):
             data = DataFrame.from_pandas(pd.DataFrame(data))
@@ -1095,7 +1095,7 @@ def _init_from_dict_like(
 
             self._index = RangeIndex(0, num_rows)
         else:
-            self._index = as_index(index)
+            self._index = cudf.Index(index)
 
         if len(data):
             self._data.multiindex = True
@@ -1410,7 +1410,7 @@ def __setitem__(self, arg, value):
                             new_columns, verify=False
                         )
                         if isinstance(value, (pd.Series, Series)):
-                            self._index = as_index(value.index)
+                            self._index = cudf.Index(value.index)
                         elif len(value) > 0:
                             self._index = RangeIndex(length)
                         return
@@ -1728,7 +1728,7 @@ def _concat(
         for cols in columns:
             table_index = None
             if 1 == first_data_column_position:
-                table_index = cudf.core.index.as_index(cols[0])
+                table_index = cudf.Index(cols[0])
             elif first_data_column_position > 1:
                 table_index = DataFrame._from_data(
                     data=dict(
@@ -1780,9 +1780,7 @@ def _concat(
             if not isinstance(out.index, MultiIndex) and isinstance(
                 out.index.dtype, cudf.CategoricalDtype
             ):
-                out = out.set_index(
-                    cudf.core.index.as_index(out.index._values)
-                )
+                out = out.set_index(cudf.Index(out.index._values))
         for name, col in out._data.items():
             out._data[name] = col._with_type_metadata(
                 tables[0]._data[name].dtype
@@ -2828,7 +2826,7 @@ def reindex(
         if columns is None:
             df = self
         else:
-            columns = as_index(columns)
+            columns = cudf.Index(columns)
             intersection = self._data.to_pandas_index().intersection(
                 columns.to_pandas()
             )
@@ -3245,7 +3243,7 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True):
         if len(self) == 0:
             if isinstance(value, (pd.Series, Series)):
                 if not ignore_index:
-                    self.index = as_index(value.index)
+                    self.index = cudf.Index(value.index)
             elif (length := len(value)) > 0:
                 if num_cols != 0:
                     ca = self._data._from_columns_like_self(
@@ -5654,7 +5652,7 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False):
             }
 
         if not is_scalar(index):
-            new_index = as_index(index)
+            new_index = cudf.Index(index)
         else:
             new_index = None
 
@@ -5738,7 +5736,7 @@ def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False):
             }
 
         if index is not None:
-            index = as_index(index)
+            index = cudf.Index(index)
 
         if isinstance(columns, (pd.Index, cudf.Index)):
             level_names = tuple(columns.names)
@@ -6171,7 +6169,7 @@ def count(self, axis=0, numeric_only=False):
                     for col in self._data.names
                 ]
             },
-            as_index(self._data.names),
+            cudf.Index(self._data.names),
         )
 
     _SUPPORT_AXIS_LOOKUP = {
@@ -6298,7 +6296,7 @@ def _reduce(
                         source._data.names, names=source._data.level_names
                     )
                 else:
-                    idx = as_index(source._data.names)
+                    idx = cudf.Index(source._data.names)
                 return Series._from_data({None: as_column(result)}, idx)
         elif axis == 1:
             return source._apply_cupy_method_axis_1(op, **kwargs)
diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index 9bb1995b836..4729233ee6e 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -186,10 +186,10 @@ def categories(self) -> "cudf.core.index.Index":
         Index(['b', 'a'], dtype='object')
         """
         if self._categories is None:
-            return cudf.core.index.as_index(
+            return cudf.Index(
                 cudf.core.column.column_empty(0, dtype="object", masked=False)
             )
-        return cudf.core.index.as_index(self._categories, copy=False)
+        return cudf.Index(self._categories, copy=False)
 
     @property
     def type(self):
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index 3e7a1ee6026..ac8b381cbec 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -2800,15 +2800,13 @@ def keys(self):
         nkeys = len(self._key_columns)
 
         if nkeys == 0:
-            return cudf.core.index.as_index([], name=None)
+            return cudf.Index([], name=None)
         elif nkeys > 1:
             return cudf.MultiIndex._from_data(
                 dict(zip(range(nkeys), self._key_columns))
             )._set_names(self.names)
         else:
-            return cudf.core.index.as_index(
-                self._key_columns[0], name=self.names[0]
-            )
+            return cudf.Index(self._key_columns[0], name=self.names[0])
 
     @property
     def values(self) -> cudf.core.frame.Frame:
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 4b09765fa46..7297ac4e929 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -1169,7 +1169,7 @@ def _concat(cls, objs):
             result = _concat_range_index(non_empties)
         else:
             data = concat_columns([o._values for o in non_empties])
-            result = as_index(data)
+            result = Index(data)
 
         names = {obj.name for obj in objs}
         if len(names) == 1:
@@ -1437,7 +1437,7 @@ def __repr__(self):
     def __getitem__(self, index):
         res = self._get_elements_from_column(index)
         if isinstance(res, ColumnBase):
-            res = as_index(res, name=self.name)
+            res = Index(res, name=self.name)
         return res
 
     @property  # type: ignore
@@ -1958,7 +1958,7 @@ def microsecond(self):
         >>> datetime_index.microsecond
         Index([0, 1, 2], dtype='int32')
         """  # noqa: E501
-        return as_index(
+        return Index(
             (
                 # Need to manually promote column to int32 because
                 # pandas-matching binop behaviour requires that this
@@ -2209,7 +2209,7 @@ def _get_dt_field(self, field):
             mask=out_column.base_mask,
             offset=out_column.offset,
         )
-        return as_index(out_column, name=self.name)
+        return Index(out_column, name=self.name)
 
     def _is_boolean(self):
         return False
@@ -2522,9 +2522,7 @@ def days(self):
         Number of days for each element.
         """
         # Need to specifically return `int64` to avoid overflow.
-        return as_index(
-            arbitrary=self._values.days, name=self.name, dtype="int64"
-        )
+        return Index(self._values.days, name=self.name, dtype="int64")
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
@@ -2532,9 +2530,7 @@ def seconds(self):
         """
         Number of seconds (>= 0 and less than 1 day) for each element.
         """
-        return as_index(
-            arbitrary=self._values.seconds, name=self.name, dtype="int32"
-        )
+        return Index(self._values.seconds, name=self.name, dtype="int32")
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
@@ -2542,9 +2538,7 @@ def microseconds(self):
         """
         Number of microseconds (>= 0 and less than 1 second) for each element.
         """
-        return as_index(
-            arbitrary=self._values.microseconds, name=self.name, dtype="int32"
-        )
+        return Index(self._values.microseconds, name=self.name, dtype="int32")
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
@@ -2553,9 +2547,7 @@ def nanoseconds(self):
         Number of nanoseconds (>= 0 and less than 1 microsecond) for each
         element.
         """
-        return as_index(
-            arbitrary=self._values.nanoseconds, name=self.name, dtype="int32"
-        )
+        return Index(self._values.nanoseconds, name=self.name, dtype="int32")
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
@@ -2693,7 +2685,7 @@ def codes(self):
         """
         The category codes of this categorical.
         """
-        return as_index(self._values.codes)
+        return Index(self._values.codes)
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
@@ -3137,7 +3129,7 @@ def _concat_range_index(indexes: List[RangeIndex]) -> BaseIndex:
         elif step is None:
             # First non-empty index had only one element
             if obj.start == start:
-                result = as_index(concat_columns([x._values for x in indexes]))
+                result = Index(concat_columns([x._values for x in indexes]))
                 return result
             step = obj.start - start
 
@@ -3145,7 +3137,7 @@ def _concat_range_index(indexes: List[RangeIndex]) -> BaseIndex:
             next_ is not None and obj.start != next_
         )
         if non_consecutive:
-            result = as_index(concat_columns([x._values for x in indexes]))
+            result = Index(concat_columns([x._values for x in indexes]))
             return result
         if step is not None:
             next_ = obj[-1] + step
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 5a466f20f8c..688b268d478 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -3638,7 +3638,7 @@ def _align_to_index(
         sort: bool = True,
         allow_non_unique: bool = False,
     ) -> Self:
-        index = cudf.core.index.as_index(index)
+        index = cudf.Index(index)
 
         if self.index.equals(index):
             return self
@@ -3713,7 +3713,7 @@ def _reindex(
                 raise ValueError(
                     "cannot reindex on an axis with duplicate labels"
                 )
-            index = cudf.core.index.as_index(
+            index = cudf.Index(
                 index, name=getattr(index, "name", self.index.name)
             )
 
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index 049fac45ba8..11b4b9154a2 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -30,7 +30,6 @@
     BaseIndex,
     _get_indexer_basic,
     _lexsorted_equal_range,
-    as_index,
 )
 from cudf.core.join._join_helpers import _match_join_keys
 from cudf.utils.dtypes import is_column_like
@@ -824,7 +823,7 @@ def _index_and_downcast(self, result, index, index_key):
             # it into an Index and name the final index values according
             # to that column's name.
             *_, last_column = index._data.columns
-            out_index = as_index(last_column)
+            out_index = cudf.Index(last_column)
             out_index.name = index.names[-1]
             index = out_index
         elif out_index._num_columns > 1:
@@ -1082,7 +1081,9 @@ def get_level_values(self, level):
                 raise KeyError(f"Level not found: '{level}'")
         else:
             level_idx = colnames.index(level)
-        level_values = as_index(self._data[level], name=self.names[level_idx])
+        level_values = cudf.Index(
+            self._data[level], name=self.names[level_idx]
+        )
         return level_values
 
     def _is_numeric(self):
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 169f7c11cf9..a52b583d3b4 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -596,7 +596,7 @@ def __init__(
             name_from_data = data.name
             column = as_column(data, nan_as_null=nan_as_null, dtype=dtype)
             if isinstance(data, pd.Series):
-                index_from_data = as_index(data.index)
+                index_from_data = cudf.Index(data.index)
             elif isinstance(data, Series):
                 index_from_data = data.index
         elif isinstance(data, ColumnAccessor):
@@ -612,7 +612,7 @@ def __init__(
                 column = as_column(
                     list(data.values()), nan_as_null=nan_as_null, dtype=dtype
                 )
-                index_from_data = as_index(list(data.keys()))
+                index_from_data = cudf.Index(list(data.keys()))
         else:
             # Using `getattr_static` to check if
             # `data` is on device memory and perform
@@ -649,7 +649,7 @@ def __init__(
             name = name_from_data
 
         if index is not None:
-            index = as_index(index)
+            index = cudf.Index(index)
 
         if index_from_data is not None:
             first_index = index_from_data
@@ -5241,7 +5241,7 @@ def isclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False):
 
     if isinstance(a, cudf.Series) and isinstance(b, cudf.Series):
         b = b.reindex(a.index)
-        index = as_index(a.index)
+        index = cudf.Index(a.index)
 
     a_col = as_column(a)
     a_array = cupy.asarray(a_col.data_array_view(mode="read"))
diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py
index 12a1ecc68e0..f002a838fa9 100644
--- a/python/cudf/cudf/core/tools/datetimes.py
+++ b/python/cudf/cudf/core/tools/datetimes.py
@@ -18,7 +18,6 @@
 )
 from cudf.api.types import is_integer, is_scalar
 from cudf.core import column
-from cudf.core.index import as_index
 
 # https://github.com/pandas-dev/pandas/blob/2.2.x/pandas/core/tools/datetimes.py#L1112
 _unit_map = {
@@ -287,13 +286,13 @@ def to_datetime(
                 utc=utc,
             )
             if isinstance(arg, (cudf.BaseIndex, pd.Index)):
-                return as_index(col, name=arg.name)
+                return cudf.Index(col, name=arg.name)
             elif isinstance(arg, (cudf.Series, pd.Series)):
                 return cudf.Series(col, index=arg.index, name=arg.name)
             elif is_scalar(arg):
                 return col.element_indexing(0)
             else:
-                return as_index(col)
+                return cudf.Index(col)
     except Exception as e:
         if errors == "raise":
             raise e
diff --git a/python/cudf/cudf/tests/test_array_function.py b/python/cudf/cudf/tests/test_array_function.py
index 58939f0ddd9..e6b89e2c5fa 100644
--- a/python/cudf/cudf/tests/test_array_function.py
+++ b/python/cudf/cudf/tests/test_array_function.py
@@ -108,7 +108,7 @@ def test_array_func_missing_cudf_dataframe(pd_df, func):
     ],
 )
 def test_array_func_cudf_index(np_ar, func):
-    cudf_index = cudf.core.index.as_index(cudf.Series(np_ar))
+    cudf_index = cudf.Index(cudf.Series(np_ar))
     expect = func(np_ar)
     got = func(cudf_index)
     if np.isscalar(expect):
@@ -128,7 +128,7 @@ def test_array_func_cudf_index(np_ar, func):
     ],
 )
 def test_array_func_missing_cudf_index(np_ar, func):
-    cudf_index = cudf.core.index.as_index(cudf.Series(np_ar))
+    cudf_index = cudf.Index(cudf.Series(np_ar))
     with pytest.raises(TypeError):
         func(cudf_index)
 
diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py
index 5d0c403daa2..fa371914c3e 100644
--- a/python/cudf/cudf/tests/test_binops.py
+++ b/python/cudf/cudf/tests/test_binops.py
@@ -12,10 +12,9 @@
 import pytest
 
 import cudf
-from cudf import Series
+from cudf import Index, Series
 from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
 from cudf.core.buffer.spill_manager import get_global_manager
-from cudf.core.index import as_index
 from cudf.testing import _utils as utils
 from cudf.utils.dtypes import (
     BOOL_TYPES,
@@ -186,8 +185,8 @@ def test_series_binop(binop, obj_class):
     sr2 = Series(arr2)
 
     if obj_class == "Index":
-        sr1 = as_index(sr1)
-        sr2 = as_index(sr2)
+        sr1 = Index(sr1)
+        sr2 = Index(sr2)
 
     result = binop(sr1, sr2)
     expect = binop(pd.Series(arr1), pd.Series(arr2))
@@ -225,7 +224,7 @@ def test_series_binop_scalar(nelem, binop, obj_class, use_cudf_scalar):
 
     sr = Series(arr)
     if obj_class == "Index":
-        sr = as_index(sr)
+        sr = Index(sr)
 
     if use_cudf_scalar:
         result = binop(sr, rhs)
@@ -251,8 +250,8 @@ def test_series_bitwise_binop(binop, obj_class, lhs_dtype, rhs_dtype):
     sr2 = Series(arr2)
 
     if obj_class == "Index":
-        sr1 = as_index(sr1)
-        sr2 = as_index(sr2)
+        sr1 = Index(sr1)
+        sr2 = Index(sr2)
 
     result = binop(sr1, sr2)
 
@@ -274,8 +273,8 @@ def test_series_compare(cmpop, obj_class, dtype):
     sr2 = Series(arr2)
 
     if obj_class == "Index":
-        sr1 = as_index(sr1)
-        sr2 = as_index(sr2)
+        sr1 = Index(sr1)
+        sr2 = Index(sr2)
 
     result1 = cmpop(sr1, sr1)
     result2 = cmpop(sr2, sr2)
@@ -402,7 +401,7 @@ def test_series_compare_scalar(
         rhs = cudf.Scalar(rhs)
 
     if obj_class == "Index":
-        sr1 = as_index(sr1)
+        sr1 = Index(sr1)
 
     result1 = cmpop(sr1, rhs)
     result2 = cmpop(rhs, sr1)
@@ -488,8 +487,8 @@ def test_series_binop_mixed_dtype(binop, lhs_dtype, rhs_dtype, obj_class):
     sr2 = Series(rhs)
 
     if obj_class == "Index":
-        sr1 = as_index(sr1)
-        sr2 = as_index(sr2)
+        sr1 = Index(sr1)
+        sr2 = Index(sr2)
 
     result = binop(Series(sr1), Series(sr2))
 
@@ -513,8 +512,8 @@ def test_series_cmpop_mixed_dtype(cmpop, lhs_dtype, rhs_dtype, obj_class):
     sr2 = Series(rhs)
 
     if obj_class == "Index":
-        sr1 = as_index(sr1)
-        sr2 = as_index(sr2)
+        sr1 = Index(sr1)
+        sr2 = Index(sr2)
 
     result = cmpop(Series(sr1), Series(sr2))
 
@@ -538,7 +537,7 @@ def test_series_reflected_ops_scalar(func, dtype, obj_class):
 
     # class typing
     if obj_class == "Index":
-        gs = as_index(gs)
+        gs = Index(gs)
 
     gs_result = func(gs)
 
@@ -588,7 +587,7 @@ def test_series_reflected_ops_cudf_scalar(funcs, dtype, obj_class):
 
     # class typing
     if obj_class == "Index":
-        gs = as_index(gs)
+        gs = Index(gs)
 
     gs_result = gpu_func(gs)
 
diff --git a/python/cudf/cudf/tests/test_contains.py b/python/cudf/cudf/tests/test_contains.py
index 15dfa111860..a65ab1780b6 100644
--- a/python/cudf/cudf/tests/test_contains.py
+++ b/python/cudf/cudf/tests/test_contains.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2022, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 
 import datetime
 
@@ -8,7 +8,7 @@
 
 import cudf
 from cudf import Series
-from cudf.core.index import RangeIndex, as_index
+from cudf.core.index import Index, RangeIndex
 from cudf.testing._utils import (
     DATETIME_TYPES,
     NUMERIC_TYPES,
@@ -74,7 +74,7 @@ def test_series_contains(values, item, expected):
 
 @pytest.mark.parametrize("values, item, expected", testdata_all)
 def test_index_contains(values, item, expected):
-    index = as_index(values)
+    index = Index(values)
     assert_eq(expected, item in index)
 
 
diff --git a/python/cudf/cudf/tests/test_dlpack.py b/python/cudf/cudf/tests/test_dlpack.py
index aafe920d3a1..7ea3979b0f1 100644
--- a/python/cudf/cudf/tests/test_dlpack.py
+++ b/python/cudf/cudf/tests/test_dlpack.py
@@ -101,7 +101,7 @@ def test_to_dlpack_index(data_1d):
     with expectation:
         if np.isnan(data_1d).any():
             pytest.skip("Nulls not allowed in Index")
-        gi = cudf.core.index.as_index(data_1d)
+        gi = cudf.Index(data_1d)
         dlt = gi.to_dlpack()
 
         # PyCapsules are a C-API thing so couldn't come up with a better way
diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py
index b92ae1b3364..3d6c71ebc1b 100644
--- a/python/cudf/cudf/tests/test_index.py
+++ b/python/cudf/cudf/tests/test_index.py
@@ -17,13 +17,7 @@
 import cudf
 from cudf.api.extensions import no_default
 from cudf.api.types import is_bool_dtype
-from cudf.core.index import (
-    CategoricalIndex,
-    DatetimeIndex,
-    Index,
-    RangeIndex,
-    as_index,
-)
+from cudf.core.index import CategoricalIndex, DatetimeIndex, Index, RangeIndex
 from cudf.testing._utils import (
     ALL_TYPES,
     FLOAT_TYPES,
@@ -200,11 +194,11 @@ def test_pandas_as_index():
     pdf_category_index = pd.CategoricalIndex(["a", "b", "c", "b", "a"])
 
     # Define cudf Indexes
-    gdf_int_index = as_index(pdf_int_index)
-    gdf_uint_index = as_index(pdf_uint_index)
-    gdf_float_index = as_index(pdf_float_index)
-    gdf_datetime_index = as_index(pdf_datetime_index)
-    gdf_category_index = as_index(pdf_category_index)
+    gdf_int_index = Index(pdf_int_index)
+    gdf_uint_index = Index(pdf_uint_index)
+    gdf_float_index = Index(pdf_float_index)
+    gdf_datetime_index = Index(pdf_datetime_index)
+    gdf_category_index = Index(pdf_category_index)
 
     # Check instance types
     assert isinstance(gdf_int_index, Index)
@@ -232,7 +226,7 @@ def test_pandas_as_index():
 @pytest.mark.parametrize("name", SERIES_OR_INDEX_NAMES)
 def test_index_rename(initial_name, name):
     pds = pd.Index([1, 2, 3], name=initial_name)
-    gds = as_index(pds)
+    gds = Index(pds)
 
     assert_eq(pds, gds)
 
@@ -245,18 +239,18 @@ def test_index_rename(initial_name, name):
     and if name is being handles in recursive creation.
     """
     pds = pd.Index(expect)
-    gds = as_index(got)
+    gds = Index(got)
 
     assert_eq(pds, gds)
 
     pds = pd.Index(pds, name="abc")
-    gds = as_index(gds, name="abc")
+    gds = Index(gds, name="abc")
     assert_eq(pds, gds)
 
 
 def test_index_rename_inplace():
     pds = pd.Index([1, 2, 3], name="asdf")
-    gds = as_index(pds)
+    gds = Index(pds)
 
     # inplace=False should yield a deep copy
     gds_renamed_deep = gds.rename("new_name", inplace=False)
@@ -280,7 +274,7 @@ def test_index_rename_preserves_arg():
     assert idx1.name == "orig_name"
 
     # a new object but referencing the same data
-    idx3 = as_index(idx1, name="last_name")
+    idx3 = Index(idx1, name="last_name")
 
     assert idx3.name == "last_name"
     assert idx1.name == "orig_name"
@@ -456,7 +450,7 @@ def test_from_pandas_gen():
 
 
 def test_index_names():
-    idx = cudf.core.index.as_index([1, 2, 3], name="idx")
+    idx = Index([1, 2, 3], name="idx")
     assert idx.names == ("idx",)
 
 
@@ -874,8 +868,8 @@ def test_index_equals(data, other):
     pd_data = pd.Index(data)
     pd_other = pd.Index(other)
 
-    gd_data = cudf.core.index.as_index(data)
-    gd_other = cudf.core.index.as_index(other)
+    gd_data = Index(data)
+    gd_other = Index(other)
 
     expected = pd_data.equals(pd_other)
     actual = gd_data.equals(gd_other)
@@ -920,8 +914,8 @@ def test_index_categories_equal(data, other):
     pd_data = pd.Index(data).astype("category")
     pd_other = pd.Index(other)
 
-    gd_data = cudf.core.index.as_index(data).astype("category")
-    gd_other = cudf.core.index.as_index(other)
+    gd_data = Index(data).astype("category")
+    gd_other = Index(other)
 
     expected = pd_data.equals(pd_other)
     actual = gd_data.equals(gd_other)
@@ -970,7 +964,7 @@ def test_index_equal_misc(data, other):
     pd_data = pd.Index(data)
     pd_other = other
 
-    gd_data = cudf.core.index.as_index(data)
+    gd_data = Index(data)
     gd_other = other
 
     expected = pd_data.equals(pd_other)
@@ -1089,8 +1083,8 @@ def test_index_empty_append_name_conflict():
     ],
 )
 def test_index_append_error(data, other):
-    gd_data = cudf.core.index.as_index(data)
-    gd_other = cudf.core.index.as_index(other)
+    gd_data = Index(data)
+    gd_other = Index(other)
 
     got_dtype = (
         gd_other.dtype
diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py
index dd731fab8f3..f143112a45f 100644
--- a/python/cudf/cudf/tests/test_multiindex.py
+++ b/python/cudf/cudf/tests/test_multiindex.py
@@ -21,7 +21,6 @@
 import cudf
 from cudf.api.extensions import no_default
 from cudf.core.column import as_column
-from cudf.core.index import as_index
 from cudf.testing._utils import (
     assert_eq,
     assert_exceptions_equal,
@@ -158,8 +157,6 @@ def test_multiindex_swaplevel():
 
 
 def test_string_index():
-    from cudf.core.index import Index
-
     pdf = pd.DataFrame(np.random.rand(5, 5))
     gdf = cudf.from_pandas(pdf)
     stringIndex = ["a", "b", "c", "d", "e"]
@@ -170,11 +167,11 @@ def test_string_index():
     pdf.index = stringIndex
     gdf.index = stringIndex
     assert_eq(pdf, gdf)
-    stringIndex = Index(["a", "b", "c", "d", "e"], name="name")
+    stringIndex = cudf.Index(["a", "b", "c", "d", "e"], name="name")
     pdf.index = stringIndex.to_pandas()
     gdf.index = stringIndex
     assert_eq(pdf, gdf)
-    stringIndex = as_index(as_column(["a", "b", "c", "d", "e"]), name="name")
+    stringIndex = cudf.Index(as_column(["a", "b", "c", "d", "e"]), name="name")
     pdf.index = stringIndex.to_pandas()
     gdf.index = stringIndex
     assert_eq(pdf, gdf)
diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py
index de771a56e77..801c530da43 100644
--- a/python/cudf/cudf/tests/test_string.py
+++ b/python/cudf/cudf/tests/test_string.py
@@ -16,7 +16,7 @@
 import cudf
 from cudf import concat
 from cudf.core.column.string import StringColumn
-from cudf.core.index import Index, as_index
+from cudf.core.index import Index
 from cudf.testing._utils import (
     DATETIME_TYPES,
     NUMERIC_TYPES,
@@ -1500,7 +1500,7 @@ def test_strings_partition(data):
     assert_eq(ps.str.partition(","), gs.str.partition(","))
     assert_eq(ps.str.partition("-"), gs.str.partition("-"))
 
-    gi = as_index(data, name="new name")
+    gi = cudf.Index(data, name="new name")
     pi = pd.Index(data, name="new name")
     assert_eq(pi.str.partition(), gi.str.partition())
     assert_eq(pi.str.partition(","), gi.str.partition(","))
@@ -1639,7 +1639,7 @@ def test_strings_strip_tests(data, to_strip):
         ps.str.lstrip(to_strip=to_strip), gs.str.lstrip(to_strip=to_strip)
     )
 
-    gi = as_index(data)
+    gi = cudf.Index(data)
     pi = pd.Index(data)
 
     assert_eq(pi.str.strip(to_strip=to_strip), gi.str.strip(to_strip=to_strip))
@@ -1696,7 +1696,7 @@ def test_strings_filling_tests(data, width, fillchar):
         gs.str.rjust(width=width, fillchar=fillchar),
     )
 
-    gi = as_index(data)
+    gi = cudf.Index(data)
     pi = pd.Index(data)
 
     assert_eq(
@@ -1731,7 +1731,7 @@ def test_strings_zfill_tests(data, width):
 
     assert_eq(ps.str.zfill(width=width), gs.str.zfill(width=width))
 
-    gi = as_index(data)
+    gi = cudf.Index(data)
     pi = pd.Index(data)
 
     assert_eq(pi.str.zfill(width=width), gi.str.zfill(width=width))
@@ -1763,7 +1763,7 @@ def test_strings_pad_tests(data, width, side, fillchar):
         gs.str.pad(width=width, side=side, fillchar=fillchar),
     )
 
-    gi = as_index(data)
+    gi = cudf.Index(data)
     pi = pd.Index(data)
 
     assert_eq(
@@ -1807,7 +1807,7 @@ def test_string_wrap(data, width):
         ),
     )
 
-    gi = as_index(data)
+    gi = cudf.Index(data)
     pi = pd.Index(data)
 
     assert_eq(
@@ -1941,7 +1941,7 @@ def test_string_replace_with_backrefs(find, replace):
     expected = ps.str.replace(find, replace, regex=True)
     assert_eq(got, expected)
 
-    got = as_index(gs).str.replace_with_backrefs(find, replace)
+    got = cudf.Index(gs).str.replace_with_backrefs(find, replace)
     expected = pd.Index(ps).str.replace(find, replace, regex=True)
     assert_eq(got, expected)
 
@@ -2227,7 +2227,7 @@ def test_string_str_rindex(data, sub, er):
         assert_eq(ps.str.rindex(sub), gs.str.rindex(sub), check_dtype=False)
         assert_eq(
             pd.Index(ps).str.rindex(sub),
-            as_index(gs).str.rindex(sub),
+            cudf.Index(gs).str.rindex(sub),
             exact=False,
         )
 
@@ -2336,7 +2336,7 @@ def test_string_str_match(data, pat):
 
     assert_eq(ps.str.match(pat), gs.str.match(pat))
     assert_eq(
-        pd.Index(pd.Index(ps).str.match(pat)), as_index(gs).str.match(pat)
+        pd.Index(pd.Index(ps).str.match(pat)), cudf.Index(gs).str.match(pat)
     )
 
 
@@ -2363,7 +2363,7 @@ def test_string_str_translate(data):
     )
     assert_eq(
         pd.Index(ps).str.translate(str.maketrans({"a": "z"})),
-        as_index(gs).str.translate(str.maketrans({"a": "z"})),
+        cudf.Index(gs).str.translate(str.maketrans({"a": "z"})),
     )
     assert_eq(
         ps.str.translate(str.maketrans({"a": "z", "i": "$", "z": "1"})),
@@ -2373,7 +2373,7 @@ def test_string_str_translate(data):
         pd.Index(ps).str.translate(
             str.maketrans({"a": "z", "i": "$", "z": "1"})
         ),
-        as_index(gs).str.translate(
+        cudf.Index(gs).str.translate(
             str.maketrans({"a": "z", "i": "$", "z": "1"})
         ),
     )
@@ -2389,7 +2389,7 @@ def test_string_str_translate(data):
         pd.Index(ps).str.translate(
             str.maketrans({"+": "-", "-": "$", "?": "!", "B": "."})
         ),
-        as_index(gs).str.translate(
+        cudf.Index(gs).str.translate(
             str.maketrans({"+": "-", "-": "$", "?": "!", "B": "."})
         ),
     )
@@ -2779,8 +2779,8 @@ def test_string_str_byte_count(data, expected):
     actual = sr.str.byte_count()
     assert_eq(expected, actual)
 
-    si = as_index(data)
-    expected = as_index(expected, dtype="int32")
+    si = cudf.Index(data)
+    expected = cudf.Index(expected, dtype="int32")
     actual = si.str.byte_count()
     assert_eq(expected, actual)
 
@@ -2828,8 +2828,8 @@ def test_str_isinteger(data, expected):
     actual = sr.str.isinteger()
     assert_eq(expected, actual)
 
-    sr = as_index(data)
-    expected = as_index(expected)
+    sr = cudf.Index(data)
+    expected = cudf.Index(expected)
     actual = sr.str.isinteger()
     assert_eq(expected, actual)
 
@@ -2884,8 +2884,8 @@ def test_str_isfloat(data, expected):
     actual = sr.str.isfloat()
     assert_eq(expected, actual)
 
-    sr = as_index(data)
-    expected = as_index(expected)
+    sr = cudf.Index(data)
+    expected = cudf.Index(expected)
     actual = sr.str.isfloat()
     assert_eq(expected, actual)
 
diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py
index 6ecead862bb..6bd3b99bae1 100644
--- a/python/cudf/cudf/tests/text/test_text_methods.py
+++ b/python/cudf/cudf/tests/text/test_text_methods.py
@@ -539,7 +539,7 @@ def test_character_tokenize_series():
 
 
 def test_character_tokenize_index():
-    sr = cudf.core.index.as_index(
+    sr = cudf.Index(
         [
             "hello world",
             "sdf",
@@ -550,7 +550,7 @@ def test_character_tokenize_index():
             ),
         ]
     )
-    expected = cudf.core.index.as_index(
+    expected = cudf.Index(
         [
             "h",
             "e",
@@ -648,8 +648,8 @@ def test_character_tokenize_index():
     actual = sr.str.character_tokenize()
     assert_eq(expected, actual)
 
-    sr = cudf.core.index.as_index(["a"])
-    expected = cudf.core.index.as_index(["a"])
+    sr = cudf.Index(["a"])
+    expected = cudf.Index(["a"])
 
     actual = sr.str.character_tokenize()
     assert_eq(expected, actual)

From dc829b8372487615b74494a19c63d43cdbdb0d79 Mon Sep 17 00:00:00 2001
From: Ray Douglass <ray@raydouglass.com>
Date: Wed, 5 Jun 2024 10:10:11 -0400
Subject: [PATCH 302/842] Update Changelog [skip ci]

---
 CHANGELOG.md | 306 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 306 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 7ecad2c9c39..a5efe4eb9e5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,309 @@
+# cudf 24.06.00 (5 Jun 2024)
+
+## 🚨 Breaking Changes
+
+- Deprecate `Groupby.collect` ([#15808](https://github.com/rapidsai/cudf/pull/15808)) [@galipremsagar](https://github.com/galipremsagar)
+- Raise FileNotFoundError when a literal JSON string that looks like a json filename is passed ([#15806](https://github.com/rapidsai/cudf/pull/15806)) [@lithomas1](https://github.com/lithomas1)
+- Support filtered I/O in `chunked_parquet_reader` and simplify the use of `parquet_reader_options` ([#15764](https://github.com/rapidsai/cudf/pull/15764)) [@mhaseeb123](https://github.com/mhaseeb123)
+- Raise errors for unsupported operations on certain types ([#15712](https://github.com/rapidsai/cudf/pull/15712)) [@galipremsagar](https://github.com/galipremsagar)
+- Support `DurationType` in cudf parquet reader via `arrow:schema` ([#15617](https://github.com/rapidsai/cudf/pull/15617)) [@mhaseeb123](https://github.com/mhaseeb123)
+- Remove protobuf and use parsed ORC statistics from libcudf ([#15564](https://github.com/rapidsai/cudf/pull/15564)) [@bdice](https://github.com/bdice)
+- Remove legacy JSON reader from Python ([#15538](https://github.com/rapidsai/cudf/pull/15538)) [@bdice](https://github.com/bdice)
+- Removing all batching code from parquet writer ([#15528](https://github.com/rapidsai/cudf/pull/15528)) [@mhaseeb123](https://github.com/mhaseeb123)
+- Convert libcudf resource parameters to rmm::device_async_resource_ref ([#15507](https://github.com/rapidsai/cudf/pull/15507)) [@harrism](https://github.com/harrism)
+- Remove deprecated strings offsets_begin ([#15454](https://github.com/rapidsai/cudf/pull/15454)) [@davidwendt](https://github.com/davidwendt)
+- Floating &lt;--&gt; fixed-point conversion must now be called explicitly ([#15438](https://github.com/rapidsai/cudf/pull/15438)) [@pmattione-nvidia](https://github.com/pmattione-nvidia)
+- Bind `read_parquet_metadata` API to libcudf instead of pyarrow and extract `RowGroup` information ([#15398](https://github.com/rapidsai/cudf/pull/15398)) [@mhaseeb123](https://github.com/mhaseeb123)
+- Remove deprecated hash() and spark_murmurhash3_x86_32() ([#15375](https://github.com/rapidsai/cudf/pull/15375)) [@davidwendt](https://github.com/davidwendt)
+- Remove empty elements from exploded character-ngrams output ([#15371](https://github.com/rapidsai/cudf/pull/15371)) [@davidwendt](https://github.com/davidwendt)
+- [FEA] Performance improvement for mixed left semi/anti join ([#15288](https://github.com/rapidsai/cudf/pull/15288)) [@tgujar](https://github.com/tgujar)
+- Align date_range defaults with pandas, support tz ([#15139](https://github.com/rapidsai/cudf/pull/15139)) [@mroeschke](https://github.com/mroeschke)
+
+## 🐛 Bug Fixes
+
+- Revert &quot;Fix docs for IO readers and strings_convert&quot; ([#15872](https://github.com/rapidsai/cudf/pull/15872)) [@vyasr](https://github.com/vyasr)
+- Remove problematic call of index setter to unblock dask-cuda CI ([#15844](https://github.com/rapidsai/cudf/pull/15844)) [@charlesbluca](https://github.com/charlesbluca)
+- Use rapids_cpm_nvtx3 to get same nvtx3 target state as rmm ([#15840](https://github.com/rapidsai/cudf/pull/15840)) [@robertmaynard](https://github.com/robertmaynard)
+- Return boolean from config_host_memory_resource instead of throwing ([#15815](https://github.com/rapidsai/cudf/pull/15815)) [@abellina](https://github.com/abellina)
+- Add temporary dask-cudf workaround for categorical sorting ([#15801](https://github.com/rapidsai/cudf/pull/15801)) [@rjzamora](https://github.com/rjzamora)
+- Fix row group alignment in ORC writer ([#15789](https://github.com/rapidsai/cudf/pull/15789)) [@vuule](https://github.com/vuule)
+- Raise error when sorting by categorical column in dask-cudf ([#15788](https://github.com/rapidsai/cudf/pull/15788)) [@rjzamora](https://github.com/rjzamora)
+- Upgrade `arrow` to 16.1 ([#15787](https://github.com/rapidsai/cudf/pull/15787)) [@galipremsagar](https://github.com/galipremsagar)
+- Add support for `PandasArray` for `pandas&lt;2.1.0` ([#15786](https://github.com/rapidsai/cudf/pull/15786)) [@galipremsagar](https://github.com/galipremsagar)
+- Limit runtime dependency to `libarrow&gt;=16.0.0,&lt;16.1.0a0` ([#15782](https://github.com/rapidsai/cudf/pull/15782)) [@pentschev](https://github.com/pentschev)
+- Fix cat.as_ordered not propogating correct size ([#15780](https://github.com/rapidsai/cudf/pull/15780)) [@mroeschke](https://github.com/mroeschke)
+- Handle mixed-like homogeneous types in `isin` ([#15771](https://github.com/rapidsai/cudf/pull/15771)) [@galipremsagar](https://github.com/galipremsagar)
+- Fix id_vars and value_vars not accepting string scalars in melt ([#15765](https://github.com/rapidsai/cudf/pull/15765)) [@mroeschke](https://github.com/mroeschke)
+- Fix `DatetimeIndex.loc` for all types of ordering cases ([#15761](https://github.com/rapidsai/cudf/pull/15761)) [@galipremsagar](https://github.com/galipremsagar)
+- Fix arrow versioning logic ([#15755](https://github.com/rapidsai/cudf/pull/15755)) [@vyasr](https://github.com/vyasr)
+- Avoid running sanitizer on Java test designed to cause an error ([#15753](https://github.com/rapidsai/cudf/pull/15753)) [@jlowe](https://github.com/jlowe)
+- Handle empty dataframe object with index present in setitem of `loc` ([#15752](https://github.com/rapidsai/cudf/pull/15752)) [@galipremsagar](https://github.com/galipremsagar)
+- Eliminate circular reference in DataFrame/Series.iloc/loc ([#15749](https://github.com/rapidsai/cudf/pull/15749)) [@mroeschke](https://github.com/mroeschke)
+- Cap the absolute row index per pass in parquet chunked reader. ([#15735](https://github.com/rapidsai/cudf/pull/15735)) [@nvdbaranec](https://github.com/nvdbaranec)
+- Fix `Index.repeat` for `datetime64` types ([#15722](https://github.com/rapidsai/cudf/pull/15722)) [@galipremsagar](https://github.com/galipremsagar)
+- Fix multibyte check for case convert for large strings ([#15721](https://github.com/rapidsai/cudf/pull/15721)) [@davidwendt](https://github.com/davidwendt)
+- Fix `get_loc` to properly fetch results from an index that is in decreasing order ([#15719](https://github.com/rapidsai/cudf/pull/15719)) [@galipremsagar](https://github.com/galipremsagar)
+- Return same type as the original index for `.loc` operations ([#15717](https://github.com/rapidsai/cudf/pull/15717)) [@galipremsagar](https://github.com/galipremsagar)
+- Correct static builds + static arrow ([#15715](https://github.com/rapidsai/cudf/pull/15715)) [@robertmaynard](https://github.com/robertmaynard)
+- Raise errors for unsupported operations on certain types ([#15712](https://github.com/rapidsai/cudf/pull/15712)) [@galipremsagar](https://github.com/galipremsagar)
+- Fix ColumnAccessor caching of nrows if empty previously ([#15710](https://github.com/rapidsai/cudf/pull/15710)) [@mroeschke](https://github.com/mroeschke)
+- Allow `None` when `nan_as_null=False` in column constructor ([#15709](https://github.com/rapidsai/cudf/pull/15709)) [@galipremsagar](https://github.com/galipremsagar)
+- Refine `CudaTest.testCudaException` in case throwing wrong type of CudaError under aarch64 ([#15706](https://github.com/rapidsai/cudf/pull/15706)) [@sperlingxx](https://github.com/sperlingxx)
+- Fix maxima of categorical column ([#15701](https://github.com/rapidsai/cudf/pull/15701)) [@rjzamora](https://github.com/rjzamora)
+- Add proxy for inplace operations in `cudf.pandas` ([#15695](https://github.com/rapidsai/cudf/pull/15695)) [@galipremsagar](https://github.com/galipremsagar)
+- Make `nan_as_null` behavior consistent across all APIs ([#15692](https://github.com/rapidsai/cudf/pull/15692)) [@galipremsagar](https://github.com/galipremsagar)
+- Fix CI s3 api command to fetch latest results ([#15687](https://github.com/rapidsai/cudf/pull/15687)) [@galipremsagar](https://github.com/galipremsagar)
+- Add `NumpyExtensionArray` proxy type in `cudf.pandas` ([#15686](https://github.com/rapidsai/cudf/pull/15686)) [@galipremsagar](https://github.com/galipremsagar)
+- Properly implement binaryops for proxy types ([#15684](https://github.com/rapidsai/cudf/pull/15684)) [@galipremsagar](https://github.com/galipremsagar)
+- Fix copy assignment and the comparison operator of `rmm_host_allocator` ([#15677](https://github.com/rapidsai/cudf/pull/15677)) [@vuule](https://github.com/vuule)
+- Fix multi-source reading in JSON byte range reader ([#15671](https://github.com/rapidsai/cudf/pull/15671)) [@shrshi](https://github.com/shrshi)
+- Return `int64` when pandas compatible mode is turned on for `get_indexer` ([#15659](https://github.com/rapidsai/cudf/pull/15659)) [@galipremsagar](https://github.com/galipremsagar)
+- Fix Index contains for error validations and float vs int comparisons ([#15657](https://github.com/rapidsai/cudf/pull/15657)) [@galipremsagar](https://github.com/galipremsagar)
+- Preserve sub-second data for time scalars in column construction ([#15655](https://github.com/rapidsai/cudf/pull/15655)) [@galipremsagar](https://github.com/galipremsagar)
+- Check row limit size in cudf::strings::join_strings ([#15643](https://github.com/rapidsai/cudf/pull/15643)) [@davidwendt](https://github.com/davidwendt)
+- Enable sorting on column with nulls using query-planning ([#15639](https://github.com/rapidsai/cudf/pull/15639)) [@rjzamora](https://github.com/rjzamora)
+- Fix operator precedence problem in Parquet reader ([#15638](https://github.com/rapidsai/cudf/pull/15638)) [@etseidl](https://github.com/etseidl)
+- Fix decoding of dictionary encoded FIXED_LEN_BYTE_ARRAY data in Parquet reader ([#15601](https://github.com/rapidsai/cudf/pull/15601)) [@etseidl](https://github.com/etseidl)
+- Fix debug warnings/errors in from_arrow_device_test.cpp ([#15596](https://github.com/rapidsai/cudf/pull/15596)) [@davidwendt](https://github.com/davidwendt)
+- Add &quot;collect&quot; aggregation support to dask-cudf ([#15593](https://github.com/rapidsai/cudf/pull/15593)) [@rjzamora](https://github.com/rjzamora)
+- Fix categorical-accessor support and testing in dask-cudf ([#15591](https://github.com/rapidsai/cudf/pull/15591)) [@rjzamora](https://github.com/rjzamora)
+- Disable compute-sanitizer usage in CI tests with CUDA&lt;11.6 ([#15584](https://github.com/rapidsai/cudf/pull/15584)) [@davidwendt](https://github.com/davidwendt)
+- Preserve RangeIndex.step in to_arrow/from_arrow ([#15581](https://github.com/rapidsai/cudf/pull/15581)) [@mroeschke](https://github.com/mroeschke)
+- Ignore new cupy warning ([#15574](https://github.com/rapidsai/cudf/pull/15574)) [@vyasr](https://github.com/vyasr)
+- Add cuda-sanitizer-api dependency for test-cpp matrix 11.4 ([#15573](https://github.com/rapidsai/cudf/pull/15573)) [@davidwendt](https://github.com/davidwendt)
+- Allow apply udf to reference global modules in cudf.pandas ([#15569](https://github.com/rapidsai/cudf/pull/15569)) [@mroeschke](https://github.com/mroeschke)
+- Fix deprecation warnings for json legacy reader ([#15563](https://github.com/rapidsai/cudf/pull/15563)) [@davidwendt](https://github.com/davidwendt)
+- Fix millisecond resampling in cudf Python ([#15560](https://github.com/rapidsai/cudf/pull/15560)) [@mroeschke](https://github.com/mroeschke)
+- Rename JSON_READER_OPTION to JSON_READER_OPTION_NVBENCH. ([#15553](https://github.com/rapidsai/cudf/pull/15553)) [@bdice](https://github.com/bdice)
+- Fix a JNI bug in JSON parsing fixup ([#15550](https://github.com/rapidsai/cudf/pull/15550)) [@revans2](https://github.com/revans2)
+- Remove conda channel setup from wheel CI image script. ([#15539](https://github.com/rapidsai/cudf/pull/15539)) [@bdice](https://github.com/bdice)
+- cudf.pandas: Series dt accessor is CombinedDatetimelikeProperties ([#15523](https://github.com/rapidsai/cudf/pull/15523)) [@wence-](https://github.com/wence-)
+- Fix for some compiler warnings in parquet/page_decode.cuh ([#15518](https://github.com/rapidsai/cudf/pull/15518)) [@etseidl](https://github.com/etseidl)
+- Fix exponent overflow in strings-to-double conversion ([#15517](https://github.com/rapidsai/cudf/pull/15517)) [@davidwendt](https://github.com/davidwendt)
+- nanoarrow uses package override for proper pinned versions generation ([#15515](https://github.com/rapidsai/cudf/pull/15515)) [@robertmaynard](https://github.com/robertmaynard)
+- Remove index name overrides in dask-cudf pyarrow table dispatch ([#15514](https://github.com/rapidsai/cudf/pull/15514)) [@charlesbluca](https://github.com/charlesbluca)
+- Fix async synchronization issues in json_column.cu ([#15497](https://github.com/rapidsai/cudf/pull/15497)) [@karthikeyann](https://github.com/karthikeyann)
+- Add new patch to hide more CCCL APIs ([#15493](https://github.com/rapidsai/cudf/pull/15493)) [@vyasr](https://github.com/vyasr)
+- Make improvements in pandas-test reporting ([#15485](https://github.com/rapidsai/cudf/pull/15485)) [@galipremsagar](https://github.com/galipremsagar)
+- Fixed page data truncation in parquet writer under certain conditions. ([#15474](https://github.com/rapidsai/cudf/pull/15474)) [@nvdbaranec](https://github.com/nvdbaranec)
+- Only use data_type constructor with scale for decimal types ([#15472](https://github.com/rapidsai/cudf/pull/15472)) [@wence-](https://github.com/wence-)
+- Avoid &quot;p2p&quot; shuffle as a default when `dask_cudf` is imported ([#15469](https://github.com/rapidsai/cudf/pull/15469)) [@rjzamora](https://github.com/rjzamora)
+- Fix debug build errors from to_arrow_device_test.cpp ([#15463](https://github.com/rapidsai/cudf/pull/15463)) [@davidwendt](https://github.com/davidwendt)
+- Fix base_normalator::integer_sizeof_fn integer dispatch ([#15457](https://github.com/rapidsai/cudf/pull/15457)) [@davidwendt](https://github.com/davidwendt)
+- Allow consumers of static builds to find nanoarrow ([#15456](https://github.com/rapidsai/cudf/pull/15456)) [@robertmaynard](https://github.com/robertmaynard)
+- Allow jit compilation when using a splayed CUDA toolkit ([#15451](https://github.com/rapidsai/cudf/pull/15451)) [@robertmaynard](https://github.com/robertmaynard)
+- Handle case of scan aggregation in groupby-transform ([#15450](https://github.com/rapidsai/cudf/pull/15450)) [@wence-](https://github.com/wence-)
+- Test static builds in CI and fix nanoarrow configure ([#15437](https://github.com/rapidsai/cudf/pull/15437)) [@vyasr](https://github.com/vyasr)
+- Fixes potential race in JSON parser when parsing JSON lines format and when recovering from invalid lines ([#15419](https://github.com/rapidsai/cudf/pull/15419)) [@elstehle](https://github.com/elstehle)
+- Fix errors in chunked ORC writer when no tables were (successfully) written ([#15393](https://github.com/rapidsai/cudf/pull/15393)) [@vuule](https://github.com/vuule)
+- Support implicit array conversion with query-planning enabled ([#15378](https://github.com/rapidsai/cudf/pull/15378)) [@rjzamora](https://github.com/rjzamora)
+- Fix arrow-based round trip of empty dataframes ([#15373](https://github.com/rapidsai/cudf/pull/15373)) [@wence-](https://github.com/wence-)
+- Remove empty elements from exploded character-ngrams output ([#15371](https://github.com/rapidsai/cudf/pull/15371)) [@davidwendt](https://github.com/davidwendt)
+- Remove boundscheck=False setting in cython files ([#15362](https://github.com/rapidsai/cudf/pull/15362)) [@wence-](https://github.com/wence-)
+- Patch dask-expr `var` logic in dask-cudf ([#15347](https://github.com/rapidsai/cudf/pull/15347)) [@rjzamora](https://github.com/rjzamora)
+- Fix for logical and syntactical errors in libcudf c++ examples ([#15346](https://github.com/rapidsai/cudf/pull/15346)) [@mhaseeb123](https://github.com/mhaseeb123)
+- Disable dask-expr in docs builds. ([#15343](https://github.com/rapidsai/cudf/pull/15343)) [@bdice](https://github.com/bdice)
+- Apply the cuFile error work around to data_sink as well ([#15335](https://github.com/rapidsai/cudf/pull/15335)) [@vuule](https://github.com/vuule)
+- Fix parquet predicate filtering with column projection ([#15113](https://github.com/rapidsai/cudf/pull/15113)) [@karthikeyann](https://github.com/karthikeyann)
+- Check column type equality, handling nested types correctly. ([#14531](https://github.com/rapidsai/cudf/pull/14531)) [@bdice](https://github.com/bdice)
+
+## 📖 Documentation
+
+- Fix docs for IO readers and strings_convert ([#15842](https://github.com/rapidsai/cudf/pull/15842)) [@bdice](https://github.com/bdice)
+- Update cudf.pandas docs for GA ([#15744](https://github.com/rapidsai/cudf/pull/15744)) [@beckernick](https://github.com/beckernick)
+- Add contributing warning about circular imports ([#15691](https://github.com/rapidsai/cudf/pull/15691)) [@er-eis](https://github.com/er-eis)
+- Update libcudf developer guide for strings offsets column ([#15661](https://github.com/rapidsai/cudf/pull/15661)) [@davidwendt](https://github.com/davidwendt)
+- Update developer guide with device_async_resource_ref guidelines ([#15562](https://github.com/rapidsai/cudf/pull/15562)) [@harrism](https://github.com/harrism)
+- DOC: add pandas intersphinx mapping ([#15531](https://github.com/rapidsai/cudf/pull/15531)) [@raybellwaves](https://github.com/raybellwaves)
+- rm-dup-doc in frame.py ([#15530](https://github.com/rapidsai/cudf/pull/15530)) [@raybellwaves](https://github.com/raybellwaves)
+- Update CONTRIBUTING.md to use latest cuda env ([#15467](https://github.com/rapidsai/cudf/pull/15467)) [@raybellwaves](https://github.com/raybellwaves)
+- Doc: interleave columns pandas compat ([#15383](https://github.com/rapidsai/cudf/pull/15383)) [@raybellwaves](https://github.com/raybellwaves)
+- Simplified README Examples ([#15338](https://github.com/rapidsai/cudf/pull/15338)) [@wkaisertexas](https://github.com/wkaisertexas)
+- Add debug tips section to libcudf developer guide ([#15329](https://github.com/rapidsai/cudf/pull/15329)) [@davidwendt](https://github.com/davidwendt)
+- Fix and clarify notes on result ordering ([#13255](https://github.com/rapidsai/cudf/pull/13255)) [@shwina](https://github.com/shwina)
+
+## 🚀 New Features
+
+- Add JNI bindings for zstd compression of NVCOMP. ([#15729](https://github.com/rapidsai/cudf/pull/15729)) [@firestarman](https://github.com/firestarman)
+- Fix spaces around CSV quoted strings ([#15727](https://github.com/rapidsai/cudf/pull/15727)) [@thabetx](https://github.com/thabetx)
+- Add default pinned pool that falls back to new pinned allocations ([#15665](https://github.com/rapidsai/cudf/pull/15665)) [@vuule](https://github.com/vuule)
+- Overhaul ops-codeowners coverage ([#15660](https://github.com/rapidsai/cudf/pull/15660)) [@raydouglass](https://github.com/raydouglass)
+- Concatenate dictionary of objects along axis=1 ([#15623](https://github.com/rapidsai/cudf/pull/15623)) [@er-eis](https://github.com/er-eis)
+- Construct `pylibcudf` columns from objects supporting `__cuda_array_interface__` ([#15615](https://github.com/rapidsai/cudf/pull/15615)) [@brandon-b-miller](https://github.com/brandon-b-miller)
+- Expose some Parquet per-column configuration options via the python API ([#15613](https://github.com/rapidsai/cudf/pull/15613)) [@etseidl](https://github.com/etseidl)
+- Migrate string `find` operations to `pylibcudf` ([#15604](https://github.com/rapidsai/cudf/pull/15604)) [@brandon-b-miller](https://github.com/brandon-b-miller)
+- Round trip FIXED_LEN_BYTE_ARRAY data properly in Parquet writer ([#15600](https://github.com/rapidsai/cudf/pull/15600)) [@etseidl](https://github.com/etseidl)
+- Reading multi-line JSON in string columns using runtime configurable delimiter ([#15556](https://github.com/rapidsai/cudf/pull/15556)) [@shrshi](https://github.com/shrshi)
+- Remove public gtest dependency from libcudf conda package ([#15534](https://github.com/rapidsai/cudf/pull/15534)) [@robertmaynard](https://github.com/robertmaynard)
+- Fea/move to latest nanoarrow ([#15526](https://github.com/rapidsai/cudf/pull/15526)) [@robertmaynard](https://github.com/robertmaynard)
+- Migrate string `case` operations to `pylibcudf` ([#15489](https://github.com/rapidsai/cudf/pull/15489)) [@brandon-b-miller](https://github.com/brandon-b-miller)
+- Add Parquet encoding statistics to column chunk metadata ([#15452](https://github.com/rapidsai/cudf/pull/15452)) [@etseidl](https://github.com/etseidl)
+- Implement JNI for chunked ORC reader ([#15446](https://github.com/rapidsai/cudf/pull/15446)) [@ttnghia](https://github.com/ttnghia)
+- Add some missing optional fields to the Parquet RowGroup metadata ([#15421](https://github.com/rapidsai/cudf/pull/15421)) [@etseidl](https://github.com/etseidl)
+- Adding parquet transcoding example ([#15420](https://github.com/rapidsai/cudf/pull/15420)) [@mhaseeb123](https://github.com/mhaseeb123)
+- Add fields to Parquet Statistics structure that were added in parquet-format 2.10 ([#15412](https://github.com/rapidsai/cudf/pull/15412)) [@etseidl](https://github.com/etseidl)
+- Add option to Parquet writer to skip compressing individual columns ([#15411](https://github.com/rapidsai/cudf/pull/15411)) [@etseidl](https://github.com/etseidl)
+- Add BYTE_STREAM_SPLIT support to Parquet ([#15311](https://github.com/rapidsai/cudf/pull/15311)) [@etseidl](https://github.com/etseidl)
+- Introduce benchmark suite for JSON reader options ([#15124](https://github.com/rapidsai/cudf/pull/15124)) [@shrshi](https://github.com/shrshi)
+- Implement ORC chunked reader ([#15094](https://github.com/rapidsai/cudf/pull/15094)) [@ttnghia](https://github.com/ttnghia)
+- Extend cudf devcontainers to specify jitify2 kernel cache ([#15068](https://github.com/rapidsai/cudf/pull/15068)) [@robertmaynard](https://github.com/robertmaynard)
+- Add `to_arrow_device` function to cudf interop using nanoarrow ([#15047](https://github.com/rapidsai/cudf/pull/15047)) [@zeroshade](https://github.com/zeroshade)
+- Add JSON option to prune columns ([#14996](https://github.com/rapidsai/cudf/pull/14996)) [@karthikeyann](https://github.com/karthikeyann)
+
+## 🛠️ Improvements
+
+- Deprecate `Groupby.collect` ([#15808](https://github.com/rapidsai/cudf/pull/15808)) [@galipremsagar](https://github.com/galipremsagar)
+- Raise FileNotFoundError when a literal JSON string that looks like a json filename is passed ([#15806](https://github.com/rapidsai/cudf/pull/15806)) [@lithomas1](https://github.com/lithomas1)
+- Deprecate `divisions=&#39;quantile&#39;` support in `set_index` ([#15804](https://github.com/rapidsai/cudf/pull/15804)) [@rjzamora](https://github.com/rjzamora)
+- Improve performance of Series.to_numpy/to_cupy ([#15792](https://github.com/rapidsai/cudf/pull/15792)) [@mroeschke](https://github.com/mroeschke)
+- Access `self.index` instead of `self._index` where possible ([#15781](https://github.com/rapidsai/cudf/pull/15781)) [@mroeschke](https://github.com/mroeschke)
+- Support filtered I/O in `chunked_parquet_reader` and simplify the use of `parquet_reader_options` ([#15764](https://github.com/rapidsai/cudf/pull/15764)) [@mhaseeb123](https://github.com/mhaseeb123)
+- Avoid index-to-column conversion in some DataFrame ops ([#15763](https://github.com/rapidsai/cudf/pull/15763)) [@mroeschke](https://github.com/mroeschke)
+- Fix `chunked_parquet_reader` behavior when input has no more rows to read ([#15757](https://github.com/rapidsai/cudf/pull/15757)) [@mhaseeb123](https://github.com/mhaseeb123)
+- [JNI] Expose java API for cudf::io::config_host_memory_resource ([#15745](https://github.com/rapidsai/cudf/pull/15745)) [@abellina](https://github.com/abellina)
+- Migrate all cpp pxd files into pylibcudf ([#15740](https://github.com/rapidsai/cudf/pull/15740)) [@vyasr](https://github.com/vyasr)
+- Validate and materialize iterators earlier in as_column ([#15739](https://github.com/rapidsai/cudf/pull/15739)) [@mroeschke](https://github.com/mroeschke)
+- Push some as_column arrow logic to ColumnBase.from_arrow ([#15738](https://github.com/rapidsai/cudf/pull/15738)) [@mroeschke](https://github.com/mroeschke)
+- Expose stream parameter in public reduction APIs ([#15737](https://github.com/rapidsai/cudf/pull/15737)) [@srinivasyadav18](https://github.com/srinivasyadav18)
+- remove unnecessary &#39;setuptools&#39; host dependency, simplify dependencies.yaml ([#15736](https://github.com/rapidsai/cudf/pull/15736)) [@jameslamb](https://github.com/jameslamb)
+- Defer to C++ equality and hashing for pylibcudf DataType and Aggregation objects ([#15732](https://github.com/rapidsai/cudf/pull/15732)) [@wence-](https://github.com/wence-)
+- Implement null-aware NOT_EQUALS binop ([#15731](https://github.com/rapidsai/cudf/pull/15731)) [@wence-](https://github.com/wence-)
+- Fix split-record result list column offset type ([#15707](https://github.com/rapidsai/cudf/pull/15707)) [@davidwendt](https://github.com/davidwendt)
+- Upgrade `arrow` to `16` ([#15703](https://github.com/rapidsai/cudf/pull/15703)) [@galipremsagar](https://github.com/galipremsagar)
+- Remove experimental namespace from make_strings_children ([#15702](https://github.com/rapidsai/cudf/pull/15702)) [@davidwendt](https://github.com/davidwendt)
+- Rework get_json_object benchmark to use nvbench ([#15698](https://github.com/rapidsai/cudf/pull/15698)) [@davidwendt](https://github.com/davidwendt)
+- Rework some python tests of Parquet delta encodings ([#15693](https://github.com/rapidsai/cudf/pull/15693)) [@etseidl](https://github.com/etseidl)
+- Skeleton cudf polars package ([#15688](https://github.com/rapidsai/cudf/pull/15688)) [@wence-](https://github.com/wence-)
+- Upgrade pre commit hooks ([#15685](https://github.com/rapidsai/cudf/pull/15685)) [@wence-](https://github.com/wence-)
+- Allow `fillna` to validate for `CategoricalColumn.fillna` ([#15683](https://github.com/rapidsai/cudf/pull/15683)) [@galipremsagar](https://github.com/galipremsagar)
+- Misc Column cleanups ([#15682](https://github.com/rapidsai/cudf/pull/15682)) [@mroeschke](https://github.com/mroeschke)
+- Reducing runtime of JSON reader options benchmark ([#15681](https://github.com/rapidsai/cudf/pull/15681)) [@shrshi](https://github.com/shrshi)
+- Add `Timestamp` and `Timedelta` proxy types ([#15680](https://github.com/rapidsai/cudf/pull/15680)) [@galipremsagar](https://github.com/galipremsagar)
+- Remove host_parse_nested_json. ([#15674](https://github.com/rapidsai/cudf/pull/15674)) [@bdice](https://github.com/bdice)
+- Reduce runtime for ParquetChunkedReaderInputLimitTest gtests ([#15672](https://github.com/rapidsai/cudf/pull/15672)) [@davidwendt](https://github.com/davidwendt)
+- Add large-strings gtest for cudf::interleave_columns ([#15669](https://github.com/rapidsai/cudf/pull/15669)) [@davidwendt](https://github.com/davidwendt)
+- Use experimental make_strings_children for multi-replace_re ([#15667](https://github.com/rapidsai/cudf/pull/15667)) [@davidwendt](https://github.com/davidwendt)
+- Enabled `Holiday` types in `cudf.pandas` ([#15664](https://github.com/rapidsai/cudf/pull/15664)) [@galipremsagar](https://github.com/galipremsagar)
+- Remove obsolete `XFAIL` markers for query-planning ([#15662](https://github.com/rapidsai/cudf/pull/15662)) [@rjzamora](https://github.com/rjzamora)
+- Clean up join benchmarks ([#15644](https://github.com/rapidsai/cudf/pull/15644)) [@PointKernel](https://github.com/PointKernel)
+- Enable warnings as errors in custreamz ([#15642](https://github.com/rapidsai/cudf/pull/15642)) [@mroeschke](https://github.com/mroeschke)
+- Improve distinct join with set `retrieve` ([#15636](https://github.com/rapidsai/cudf/pull/15636)) [@PointKernel](https://github.com/PointKernel)
+- Fix -Werror=type-limits. ([#15635](https://github.com/rapidsai/cudf/pull/15635)) [@bdice](https://github.com/bdice)
+- Enable FutureWarnings/DeprecationWarnings as errors for dask_cudf ([#15634](https://github.com/rapidsai/cudf/pull/15634)) [@mroeschke](https://github.com/mroeschke)
+- Remove NVBench SHA override. ([#15633](https://github.com/rapidsai/cudf/pull/15633)) [@alliepiper](https://github.com/alliepiper)
+- Add support for large string columns to Parquet reader and writer ([#15632](https://github.com/rapidsai/cudf/pull/15632)) [@etseidl](https://github.com/etseidl)
+- Large strings support in MD5 and SHA hashers ([#15631](https://github.com/rapidsai/cudf/pull/15631)) [@davidwendt](https://github.com/davidwendt)
+- Fix make_offsets_child_column usage in cudf::strings::detail::shift ([#15630](https://github.com/rapidsai/cudf/pull/15630)) [@davidwendt](https://github.com/davidwendt)
+- Use experimental make_strings_children for strings convert ([#15629](https://github.com/rapidsai/cudf/pull/15629)) [@davidwendt](https://github.com/davidwendt)
+- Forward-merge branch-24.04 to branch-24.06 ([#15627](https://github.com/rapidsai/cudf/pull/15627)) [@bdice](https://github.com/bdice)
+- Avoid accessing attributes via `_column` if not needed ([#15624](https://github.com/rapidsai/cudf/pull/15624)) [@mroeschke](https://github.com/mroeschke)
+- Make ColumnBase.__cuda_array_interface__ opt out instead of opt in ([#15622](https://github.com/rapidsai/cudf/pull/15622)) [@mroeschke](https://github.com/mroeschke)
+- Large strings support for cudf::gather ([#15621](https://github.com/rapidsai/cudf/pull/15621)) [@davidwendt](https://github.com/davidwendt)
+- Remove jni-docker-build workflow ([#15619](https://github.com/rapidsai/cudf/pull/15619)) [@bdice](https://github.com/bdice)
+- Support `DurationType` in cudf parquet reader via `arrow:schema` ([#15617](https://github.com/rapidsai/cudf/pull/15617)) [@mhaseeb123](https://github.com/mhaseeb123)
+- Drop Centos7 support ([#15608](https://github.com/rapidsai/cudf/pull/15608)) [@NvTimLiu](https://github.com/NvTimLiu)
+- Use experimental make_strings_children for json/csv writers ([#15599](https://github.com/rapidsai/cudf/pull/15599)) [@davidwendt](https://github.com/davidwendt)
+- Use experimental make_strings_children for strings join/url_encode/slice ([#15598](https://github.com/rapidsai/cudf/pull/15598)) [@davidwendt](https://github.com/davidwendt)
+- Use experimental make_strings_children in nvtext APIs ([#15595](https://github.com/rapidsai/cudf/pull/15595)) [@davidwendt](https://github.com/davidwendt)
+- Migrate to `{{ stdlib(&quot;c&quot;) }}` ([#15594](https://github.com/rapidsai/cudf/pull/15594)) [@hcho3](https://github.com/hcho3)
+- Deprecate `to/from_dask_dataframe` APIs in dask-cudf ([#15592](https://github.com/rapidsai/cudf/pull/15592)) [@rjzamora](https://github.com/rjzamora)
+- Minor fixups for future NumPy 2 compatibility ([#15590](https://github.com/rapidsai/cudf/pull/15590)) [@seberg](https://github.com/seberg)
+- Delay materializing RangeIndex in .reset_index ([#15588](https://github.com/rapidsai/cudf/pull/15588)) [@mroeschke](https://github.com/mroeschke)
+- Use experimental make_strings_children for capitalize/case/pad functions ([#15587](https://github.com/rapidsai/cudf/pull/15587)) [@davidwendt](https://github.com/davidwendt)
+- Use experimental make_strings_children for strings replace/filter/translate ([#15586](https://github.com/rapidsai/cudf/pull/15586)) [@davidwendt](https://github.com/davidwendt)
+- Add multithreaded parquet reader benchmarks. ([#15585](https://github.com/rapidsai/cudf/pull/15585)) [@nvdbaranec](https://github.com/nvdbaranec)
+- Don&#39;t materialize column during RangeIndex methods ([#15582](https://github.com/rapidsai/cudf/pull/15582)) [@mroeschke](https://github.com/mroeschke)
+- Improve performance for cudf::strings::count_re ([#15578](https://github.com/rapidsai/cudf/pull/15578)) [@davidwendt](https://github.com/davidwendt)
+- Replace RangeIndex._start/_stop/_step with _range ([#15576](https://github.com/rapidsai/cudf/pull/15576)) [@mroeschke](https://github.com/mroeschke)
+- add --rm and --name to devcontainer run args ([#15572](https://github.com/rapidsai/cudf/pull/15572)) [@trxcllnt](https://github.com/trxcllnt)
+- Change the default dictionary policy in Parquet writer from `ALWAYS` to `ADAPTIVE` ([#15570](https://github.com/rapidsai/cudf/pull/15570)) [@mhaseeb123](https://github.com/mhaseeb123)
+- Rename experimental JSON tests. ([#15568](https://github.com/rapidsai/cudf/pull/15568)) [@bdice](https://github.com/bdice)
+- Refactor JNI native dependency loading to allow returning of library path ([#15566](https://github.com/rapidsai/cudf/pull/15566)) [@jlowe](https://github.com/jlowe)
+- Remove protobuf and use parsed ORC statistics from libcudf ([#15564](https://github.com/rapidsai/cudf/pull/15564)) [@bdice](https://github.com/bdice)
+- Deprecate legacy JSON reader options. ([#15558](https://github.com/rapidsai/cudf/pull/15558)) [@bdice](https://github.com/bdice)
+- Use same .clang-format in cuDF JNI ([#15557](https://github.com/rapidsai/cudf/pull/15557)) [@bdice](https://github.com/bdice)
+- Large strings support for cudf::fill ([#15555](https://github.com/rapidsai/cudf/pull/15555)) [@davidwendt](https://github.com/davidwendt)
+- Upgrade upper bound pinning to `pandas-2.2.2` ([#15554](https://github.com/rapidsai/cudf/pull/15554)) [@galipremsagar](https://github.com/galipremsagar)
+- Work around issues with cccl main ([#15552](https://github.com/rapidsai/cudf/pull/15552)) [@miscco](https://github.com/miscco)
+- Enable pandas plotting unit tests for cudf.pandas ([#15547](https://github.com/rapidsai/cudf/pull/15547)) [@mroeschke](https://github.com/mroeschke)
+- Move timezone conversion logic to `DatetimeColumn` ([#15545](https://github.com/rapidsai/cudf/pull/15545)) [@mroeschke](https://github.com/mroeschke)
+- Large strings support for cudf::interleave_columns ([#15544](https://github.com/rapidsai/cudf/pull/15544)) [@davidwendt](https://github.com/davidwendt)
+- [skip ci] Switch back to 24.06 branch for pandas tests ([#15543](https://github.com/rapidsai/cudf/pull/15543)) [@galipremsagar](https://github.com/galipremsagar)
+- Remove checks dependency from static-configure test job. ([#15542](https://github.com/rapidsai/cudf/pull/15542)) [@bdice](https://github.com/bdice)
+- Remove legacy JSON reader from Python ([#15538](https://github.com/rapidsai/cudf/pull/15538)) [@bdice](https://github.com/bdice)
+- Enable more ignored pandas unit tests for cudf.pandas ([#15535](https://github.com/rapidsai/cudf/pull/15535)) [@mroeschke](https://github.com/mroeschke)
+- Large strings support for cudf::clamp ([#15533](https://github.com/rapidsai/cudf/pull/15533)) [@davidwendt](https://github.com/davidwendt)
+- Remove version hard-coding ([#15529](https://github.com/rapidsai/cudf/pull/15529)) [@galipremsagar](https://github.com/galipremsagar)
+- Removing all batching code from parquet writer ([#15528](https://github.com/rapidsai/cudf/pull/15528)) [@mhaseeb123](https://github.com/mhaseeb123)
+- Make some private class properties not settable ([#15527](https://github.com/rapidsai/cudf/pull/15527)) [@mroeschke](https://github.com/mroeschke)
+- Large strings support in regex replace APIs ([#15524](https://github.com/rapidsai/cudf/pull/15524)) [@davidwendt](https://github.com/davidwendt)
+- Skip pandas unit tests that crash pytest workers in `cudf.pandas` ([#15521](https://github.com/rapidsai/cudf/pull/15521)) [@mroeschke](https://github.com/mroeschke)
+- Preserve column metadata during more DataFrame operations ([#15519](https://github.com/rapidsai/cudf/pull/15519)) [@mroeschke](https://github.com/mroeschke)
+- Move to pandas-tests to a dedicated workflow file and trigger it from branch.yaml ([#15516](https://github.com/rapidsai/cudf/pull/15516)) [@galipremsagar](https://github.com/galipremsagar)
+- Large strings gtest fixture and utilities ([#15513](https://github.com/rapidsai/cudf/pull/15513)) [@davidwendt](https://github.com/davidwendt)
+- Convert libcudf resource parameters to rmm::device_async_resource_ref ([#15507](https://github.com/rapidsai/cudf/pull/15507)) [@harrism](https://github.com/harrism)
+- Relax protobuf lower bound to 3.20. ([#15506](https://github.com/rapidsai/cudf/pull/15506)) [@bdice](https://github.com/bdice)
+- Clean up index methods ([#15496](https://github.com/rapidsai/cudf/pull/15496)) [@mroeschke](https://github.com/mroeschke)
+- Update strings contains benchmarks to nvbench ([#15495](https://github.com/rapidsai/cudf/pull/15495)) [@davidwendt](https://github.com/davidwendt)
+- Update NVBench fixture to use new hooks, fix pinned memory segfault. ([#15492](https://github.com/rapidsai/cudf/pull/15492)) [@alliepiper](https://github.com/alliepiper)
+- Enable tests/scalar and test/series in cudf.pandas tests ([#15486](https://github.com/rapidsai/cudf/pull/15486)) [@mroeschke](https://github.com/mroeschke)
+- Clean up __cuda_array_interface__ handling in as_column ([#15477](https://github.com/rapidsai/cudf/pull/15477)) [@mroeschke](https://github.com/mroeschke)
+- Avoid .ordered and .categories from being settable in CategoricalColumn and CategoricalDtype ([#15475](https://github.com/rapidsai/cudf/pull/15475)) [@mroeschke](https://github.com/mroeschke)
+- Ignore pandas tests for cudf.pandas that need motoserver ([#15468](https://github.com/rapidsai/cudf/pull/15468)) [@mroeschke](https://github.com/mroeschke)
+- Use cached_property for NumericColumn.nan_count instead of ._nan_count variable ([#15466](https://github.com/rapidsai/cudf/pull/15466)) [@mroeschke](https://github.com/mroeschke)
+- Add to_arrow_device() functions that accept views ([#15465](https://github.com/rapidsai/cudf/pull/15465)) [@davidwendt](https://github.com/davidwendt)
+- Add custom status check workflow ([#15464](https://github.com/rapidsai/cudf/pull/15464)) [@galipremsagar](https://github.com/galipremsagar)
+- Disable pandas 2.x clipboard tests in cudf.pandas tests ([#15462](https://github.com/rapidsai/cudf/pull/15462)) [@mroeschke](https://github.com/mroeschke)
+- Enable tests/strings/test_api.py and tests/io/pytables in cudf.pandas tests ([#15461](https://github.com/rapidsai/cudf/pull/15461)) [@mroeschke](https://github.com/mroeschke)
+- Enable test_parsing in cudf.pandas tests ([#15460](https://github.com/rapidsai/cudf/pull/15460)) [@mroeschke](https://github.com/mroeschke)
+- Add `from_arrow_device` function to cudf interop using nanoarrow ([#15458](https://github.com/rapidsai/cudf/pull/15458)) [@zeroshade](https://github.com/zeroshade)
+- Remove deprecated strings offsets_begin ([#15454](https://github.com/rapidsai/cudf/pull/15454)) [@davidwendt](https://github.com/davidwendt)
+- Enable tests/windows/ in cudf.pandas tests ([#15444](https://github.com/rapidsai/cudf/pull/15444)) [@mroeschke](https://github.com/mroeschke)
+- Enable tests/interchange/test_impl.py in cudf.pandas tests ([#15443](https://github.com/rapidsai/cudf/pull/15443)) [@mroeschke](https://github.com/mroeschke)
+- Enable tests/io/test_user_agent.py in cudf pandas tests ([#15442](https://github.com/rapidsai/cudf/pull/15442)) [@mroeschke](https://github.com/mroeschke)
+- Performance improvement in libcudf case conversion for long strings ([#15441](https://github.com/rapidsai/cudf/pull/15441)) [@davidwendt](https://github.com/davidwendt)
+- Remove prior test skipping in run-pandas-tests with testing 2.2.1 ([#15440](https://github.com/rapidsai/cudf/pull/15440)) [@mroeschke](https://github.com/mroeschke)
+- Support orc and text IO with dask-expr using legacy conversion ([#15439](https://github.com/rapidsai/cudf/pull/15439)) [@rjzamora](https://github.com/rjzamora)
+- Floating &lt;--&gt; fixed-point conversion must now be called explicitly ([#15438](https://github.com/rapidsai/cudf/pull/15438)) [@pmattione-nvidia](https://github.com/pmattione-nvidia)
+- Unify Copy-On-Write and Spilling ([#15436](https://github.com/rapidsai/cudf/pull/15436)) [@madsbk](https://github.com/madsbk)
+- Enable ``dask_cudf`` json and s3 tests with query-planning on ([#15408](https://github.com/rapidsai/cudf/pull/15408)) [@rjzamora](https://github.com/rjzamora)
+- Bump ruff and codespell pre-commit checks ([#15407](https://github.com/rapidsai/cudf/pull/15407)) [@mroeschke](https://github.com/mroeschke)
+- Enable all tests for `arm` arch ([#15402](https://github.com/rapidsai/cudf/pull/15402)) [@galipremsagar](https://github.com/galipremsagar)
+- Bind `read_parquet_metadata` API to libcudf instead of pyarrow and extract `RowGroup` information ([#15398](https://github.com/rapidsai/cudf/pull/15398)) [@mhaseeb123](https://github.com/mhaseeb123)
+- Optimizing multi-source byte range reading in JSON reader ([#15396](https://github.com/rapidsai/cudf/pull/15396)) [@shrshi](https://github.com/shrshi)
+- add correct labels to pandas_function_request.md ([#15381](https://github.com/rapidsai/cudf/pull/15381)) [@raybellwaves](https://github.com/raybellwaves)
+- Remove deprecated hash() and spark_murmurhash3_x86_32() ([#15375](https://github.com/rapidsai/cudf/pull/15375)) [@davidwendt](https://github.com/davidwendt)
+- Large strings support in cudf::merge ([#15374](https://github.com/rapidsai/cudf/pull/15374)) [@davidwendt](https://github.com/davidwendt)
+- Enable test-reporting for pandas pytests in CI ([#15369](https://github.com/rapidsai/cudf/pull/15369)) [@galipremsagar](https://github.com/galipremsagar)
+- Use logical types in Parquet reader ([#15365](https://github.com/rapidsai/cudf/pull/15365)) [@etseidl](https://github.com/etseidl)
+- Add experimental make_strings_children utility ([#15363](https://github.com/rapidsai/cudf/pull/15363)) [@davidwendt](https://github.com/davidwendt)
+- Forward-merge branch-24.04 to branch-24.06 ([#15349](https://github.com/rapidsai/cudf/pull/15349)) [@bdice](https://github.com/bdice)
+- Fix CMake files in libcudf C++ examples to use existing libcudf build if present ([#15348](https://github.com/rapidsai/cudf/pull/15348)) [@mhaseeb123](https://github.com/mhaseeb123)
+- Use ruff pydocstyle over pydocstyle pre-commit hook ([#15345](https://github.com/rapidsai/cudf/pull/15345)) [@mroeschke](https://github.com/mroeschke)
+- Refactor stream mode setup for gtests ([#15337](https://github.com/rapidsai/cudf/pull/15337)) [@davidwendt](https://github.com/davidwendt)
+- Benchmark decimal &lt;--&gt; floating conversions. ([#15334](https://github.com/rapidsai/cudf/pull/15334)) [@pmattione-nvidia](https://github.com/pmattione-nvidia)
+- Avoid duplicate dask-cudf testing ([#15333](https://github.com/rapidsai/cudf/pull/15333)) [@rjzamora](https://github.com/rjzamora)
+- Skip decode steps in Parquet reader when nullable columns have no nulls ([#15332](https://github.com/rapidsai/cudf/pull/15332)) [@etseidl](https://github.com/etseidl)
+- Update udf_cpp to use rapids_cpm_cccl. ([#15331](https://github.com/rapidsai/cudf/pull/15331)) [@bdice](https://github.com/bdice)
+- Forward-merge branch-24.04 into branch-24.06 [skip ci] ([#15330](https://github.com/rapidsai/cudf/pull/15330)) [@rapids-bot[bot]](https://github.com/rapids-bot[bot])
+- Allow ``numeric_only=True`` for simple groupby reductions ([#15326](https://github.com/rapidsai/cudf/pull/15326)) [@rjzamora](https://github.com/rjzamora)
+- Drop CentOS 7 support. ([#15323](https://github.com/rapidsai/cudf/pull/15323)) [@bdice](https://github.com/bdice)
+- Rework cudf::find_and_replace_all to use gather-based make_strings_column ([#15305](https://github.com/rapidsai/cudf/pull/15305)) [@davidwendt](https://github.com/davidwendt)
+- First pass at adding testing for pylibcudf ([#15300](https://github.com/rapidsai/cudf/pull/15300)) [@vyasr](https://github.com/vyasr)
+- [FEA] Performance improvement for mixed left semi/anti join ([#15288](https://github.com/rapidsai/cudf/pull/15288)) [@tgujar](https://github.com/tgujar)
+- Rework cudf::replace_nulls to use strings::detail::copy_if_else ([#15286](https://github.com/rapidsai/cudf/pull/15286)) [@davidwendt](https://github.com/davidwendt)
+- Clean up special casing in `as_column` for non-typed input ([#15276](https://github.com/rapidsai/cudf/pull/15276)) [@mroeschke](https://github.com/mroeschke)
+- Large strings support in cudf::concatenate ([#15195](https://github.com/rapidsai/cudf/pull/15195)) [@davidwendt](https://github.com/davidwendt)
+- Use less _is_categorical_dtype ([#15148](https://github.com/rapidsai/cudf/pull/15148)) [@mroeschke](https://github.com/mroeschke)
+- Align date_range defaults with pandas, support tz ([#15139](https://github.com/rapidsai/cudf/pull/15139)) [@mroeschke](https://github.com/mroeschke)
+- `ModuleAccelerator` performance: cache the result of checking if a caller is in the denylist ([#15056](https://github.com/rapidsai/cudf/pull/15056)) [@shwina](https://github.com/shwina)
+- Use offsetalator in cudf::strings::replace functions ([#14824](https://github.com/rapidsai/cudf/pull/14824)) [@davidwendt](https://github.com/davidwendt)
+- Cleanup some timedelta/datetime column logic ([#14715](https://github.com/rapidsai/cudf/pull/14715)) [@mroeschke](https://github.com/mroeschke)
+- Refactor numpy array input in as_column ([#14651](https://github.com/rapidsai/cudf/pull/14651)) [@mroeschke](https://github.com/mroeschke)
+- Refactor joins for conditional semis and antis ([#14646](https://github.com/rapidsai/cudf/pull/14646)) [@DanialJavady96](https://github.com/DanialJavady96)
+- Eagerly populate the class dict for cudf.pandas proxy types ([#14534](https://github.com/rapidsai/cudf/pull/14534)) [@shwina](https://github.com/shwina)
+- Some additional kernel thread index refactoring. ([#14107](https://github.com/rapidsai/cudf/pull/14107)) [@bdice](https://github.com/bdice)
+
 # cuDF 24.04.00 (10 Apr 2024)
 
 ## 🚨 Breaking Changes

From db1b36592ba5d76158d1c6e1a3c6440c25a382e7 Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Wed, 5 Jun 2024 09:48:20 -0700
Subject: [PATCH 303/842] Migrate string replace.pxd to pylibcudf (#15839)

xref #15162

Change replace.pxd to use pylibcudf APIs.

Authors:
  - Thomas Li (https://github.com/lithomas1)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15839
---
 .../user_guide/api_docs/pylibcudf/index.rst   |   8 +-
 .../api_docs/pylibcudf/strings/index.rst      |   7 +
 .../api_docs/pylibcudf/strings/replace.rst    |   6 +
 .../_lib/pylibcudf/strings/CMakeLists.txt     |   4 +-
 .../cudf/_lib/pylibcudf/strings/__init__.pxd  |   2 +-
 .../cudf/_lib/pylibcudf/strings/__init__.py   |   2 +-
 .../cudf/_lib/pylibcudf/strings/replace.pxd   |  25 +++
 .../cudf/_lib/pylibcudf/strings/replace.pyx   | 162 ++++++++++++++++++
 python/cudf/cudf/_lib/strings/replace.pyx     |  99 +++--------
 .../pylibcudf_tests/test_string_replace.py    | 126 ++++++++++++++
 10 files changed, 362 insertions(+), 79 deletions(-)
 create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst
 create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/replace.rst
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/replace.pxd
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/replace.pyx
 create mode 100644 python/cudf/cudf/pylibcudf_tests/test_string_replace.py

diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
index 58fea77adaa..b6ad1157511 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
@@ -6,7 +6,7 @@ This page provides API documentation for pylibcudf.
 
 .. toctree::
     :maxdepth: 1
-    :caption: API Documentation
+    :caption: Top-level modules
 
     aggregation
     binaryop
@@ -32,3 +32,9 @@ This page provides API documentation for pylibcudf.
     table
     types
     unary
+
+.. toctree::
+    :maxdepth: 2
+    :caption: Subpackages
+
+    strings/index.rst
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst
new file mode 100644
index 00000000000..8970fc80c0b
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst
@@ -0,0 +1,7 @@
+strings
+=======
+
+.. toctree::
+    :maxdepth: 1
+
+    replace
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/replace.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/replace.rst
new file mode 100644
index 00000000000..9575ec226a7
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/replace.rst
@@ -0,0 +1,6 @@
+=======
+replace
+=======
+
+.. automodule:: cudf._lib.pylibcudf.strings.replace
+   :members:
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt
index 0e9c1c916f0..c9a983e24f4 100644
--- a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt
@@ -12,11 +12,11 @@
 # the License.
 # =============================================================================
 
-set(cython_sources capitalize.pyx case.pyx char_types.pyx find.pyx)
+set(cython_sources capitalize.pyx case.pyx char_types.pyx find.pyx replace.pyx)
 
 set(linked_libraries cudf::cudf)
 rapids_cython_create_modules(
   CXX
   SOURCE_FILES "${cython_sources}"
-  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_ ASSOCIATED_TARGETS cudf
+  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_strings_ ASSOCIATED_TARGETS cudf
 )
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd
index ec3dbc150b5..7563df8a107 100644
--- a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd
@@ -1,3 +1,3 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from . cimport capitalize, case, char_types, find
+from . cimport capitalize, case, char_types, find, replace
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py
index 3793bda0aa4..cb4f0e38f97 100644
--- a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py
+++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py
@@ -1,3 +1,3 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from . import capitalize, case, char_types, find
+from . import capitalize, case, char_types, find, replace
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/replace.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/replace.pxd
new file mode 100644
index 00000000000..52e2dc3c738
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/strings/replace.pxd
@@ -0,0 +1,25 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from cudf._lib.pylibcudf.column cimport Column
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from cudf._lib.pylibcudf.scalar cimport Scalar
+
+
+cpdef Column replace(
+    Column input,
+    Scalar target,
+    Scalar repl,
+    size_type maxrepl = *
+)
+cpdef Column replace_multiple(
+    Column input,
+    Column target,
+    Column repl,
+    size_type maxrepl = *
+)
+cpdef Column replace_slice(
+    Column input,
+    Scalar repl = *,
+    size_type start = *,
+    size_type stop = *
+)
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/replace.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/replace.pyx
new file mode 100644
index 00000000000..c757150a600
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/strings/replace.pyx
@@ -0,0 +1,162 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+
+from cudf._lib.pylibcudf.column cimport Column
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from cudf._lib.pylibcudf.libcudf.scalar.scalar_factories cimport (
+    make_string_scalar as cpp_make_string_scalar,
+)
+from cudf._lib.pylibcudf.libcudf.strings.replace cimport (
+    replace as cpp_replace,
+    replace_multiple as cpp_replace_multiple,
+    replace_slice as cpp_replace_slice,
+)
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from cudf._lib.pylibcudf.scalar cimport Scalar
+
+
+cpdef Column replace(
+    Column input,
+    Scalar target,
+    Scalar repl,
+    size_type maxrepl = -1
+):
+    """Replaces target string within each string with the specified replacement string.
+
+    Null string entries will return null output string entries.
+
+    For details, see :cpp:func:`replace`.
+
+    Parameters
+    ----------
+    input : Column
+        The input strings
+    target : Scalar
+        String to search for in each string.
+    repl : Scalar
+        String to replace target with.
+    maxrepl : size_type, default -1
+        Maximum times to replace if target appears multiple times in the input string.
+        Default of -1 specifies to replace all occurrences of target in each string.
+
+    Returns
+    -------
+    pylibcudf.Column
+        New string column with target replaced.
+    """
+    cdef:
+        unique_ptr[column] c_result
+        const string_scalar* target_str
+        const string_scalar* repl_str
+
+    target_str = <string_scalar *>(target.c_obj.get())
+    repl_str = <string_scalar *>(repl.c_obj.get())
+
+    with nogil:
+        c_result = move(cpp_replace(
+            input.view(),
+            target_str[0],
+            repl_str[0],
+            maxrepl,
+        ))
+
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column replace_multiple(
+    Column input,
+    Column target,
+    Column repl,
+    size_type maxrepl = -1
+):
+    """Replaces target string within each string with the specified replacement string.
+
+    Null string entries will return null output string entries.
+
+    For details, see :cpp:func:`replace_multiple`.
+
+    Parameters
+    ----------
+    input : Column
+        The input strings
+    target : Column
+        Column containing strings to search for in the input column.
+    repl : Column
+        Column containing strings to replace target with.
+        Each target, when found, will be replaced by the value at the
+        corresponding index in the repl Column.
+
+        Must be of the same length as target.
+
+    Returns
+    -------
+    pylibcudf.Column
+        New string column with target replaced.
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(cpp_replace_multiple(
+            input.view(),
+            target.view(),
+            repl.view(),
+        ))
+
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column replace_slice(
+    Column input,
+    # TODO: default scalar values
+    # https://github.com/rapidsai/cudf/issues/15505
+    Scalar repl = None,
+    size_type start = 0,
+    size_type stop = -1
+):
+    """Replaces each string in the column with the provided repl string
+    within the [start,stop) character position range.
+
+    Null string entries will return null output string entries.
+    This function can be used to insert a string into specific position
+    by specifying the same position value for start and stop.
+    The repl string can be appended to each string by specifying -1
+    for both start and stop.
+
+    For details, see :cpp:func:`replace_slice`.
+
+    Parameters
+    ----------
+    input : Column
+        The input strings
+    repl : Scalar, default ""
+        String scalar to replace target with.
+    start : size_type, default 0
+        Start position where repl will be added.
+    stop : size_type, default -1
+        End position (exclusive) to use for replacement.
+    Returns
+    -------
+    pylibcudf.Column
+        New string column
+    """
+    cdef unique_ptr[column] c_result
+
+    if repl is None:
+        repl = Scalar.from_libcudf(
+            cpp_make_string_scalar("".encode())
+        )
+
+    cdef const string_scalar* scalar_str = <string_scalar*>(repl.c_obj.get())
+
+    with nogil:
+        c_result = move(cpp_replace_slice(
+            input.view(),
+            scalar_str[0],
+            start,
+            stop
+        ))
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/cudf/cudf/_lib/strings/replace.pyx b/python/cudf/cudf/_lib/strings/replace.pyx
index 2d9330a8a24..374831f1833 100644
--- a/python/cudf/cudf/_lib/strings/replace.pyx
+++ b/python/cudf/cudf/_lib/strings/replace.pyx
@@ -1,23 +1,15 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libc.stdint cimport int32_t
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
 
 from cudf.core.buffer import acquire_spill_lock
 
 from cudf._lib.column cimport Column
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from cudf._lib.pylibcudf.libcudf.strings.replace cimport (
-    replace as cpp_replace,
-    replace_multiple as cpp_replace_multiple,
-    replace_slice as cpp_replace_slice,
-)
 from cudf._lib.pylibcudf.libcudf.types cimport size_type
 from cudf._lib.scalar cimport DeviceScalar
 
+import cudf._lib.pylibcudf as plc
+
 
 @acquire_spill_lock()
 def slice_replace(Column source_strings,
@@ -32,22 +24,12 @@ def slice_replace(Column source_strings,
 
     cdef DeviceScalar repl = py_repl.device_value
 
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    cdef const string_scalar* scalar_str = <const string_scalar*>(
-        repl.get_raw_ptr()
-    )
-
-    with nogil:
-        c_result = move(cpp_replace_slice(
-            source_view,
-            scalar_str[0],
-            start,
-            stop
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    return Column.from_pylibcudf(plc.strings.replace.replace_slice(
+        source_strings.to_pylibcudf(mode="read"),
+        repl.c_value,
+        start,
+        stop
+    ))
 
 
 @acquire_spill_lock()
@@ -61,22 +43,12 @@ def insert(Column source_strings,
 
     cdef DeviceScalar repl = py_repl.device_value
 
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    cdef const string_scalar* scalar_str = <const string_scalar*>(
-        repl.get_raw_ptr()
-    )
-
-    with nogil:
-        c_result = move(cpp_replace_slice(
-            source_view,
-            scalar_str[0],
-            start,
-            start
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    return Column.from_pylibcudf(plc.strings.replace.replace_slice(
+        source_strings.to_pylibcudf(mode="read"),
+        repl.c_value,
+        start,
+        start,
+    ))
 
 
 @acquire_spill_lock()
@@ -92,25 +64,12 @@ def replace(Column source_strings,
     cdef DeviceScalar target = py_target.device_value
     cdef DeviceScalar repl = py_repl.device_value
 
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    cdef const string_scalar* scalar_target = <const string_scalar*>(
-        target.get_raw_ptr()
-    )
-    cdef const string_scalar* scalar_repl = <const string_scalar*>(
-        repl.get_raw_ptr()
-    )
-
-    with nogil:
-        c_result = move(cpp_replace(
-            source_view,
-            scalar_target[0],
-            scalar_repl[0],
-            maxrepl
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    return Column.from_pylibcudf(plc.strings.replace.replace(
+        source_strings.to_pylibcudf(mode="read"),
+        target.c_value,
+        repl.c_value,
+        maxrepl
+    ))
 
 
 @acquire_spill_lock()
@@ -121,16 +80,8 @@ def replace_multi(Column source_strings,
     Returns a Column after replacing occurrences of
     patterns `target_strings` with `repl_strings` in `source_strings`.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-    cdef column_view target_view = target_strings.view()
-    cdef column_view repl_view = repl_strings.view()
-
-    with nogil:
-        c_result = move(cpp_replace_multiple(
-            source_view,
-            target_view,
-            repl_view
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    return Column.from_pylibcudf(plc.strings.replace.replace_multiple(
+        source_strings.to_pylibcudf(mode="read"),
+        target_strings.to_pylibcudf(mode="read"),
+        repl_strings.to_pylibcudf(mode="read"),
+    ))
diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_replace.py b/python/cudf/cudf/pylibcudf_tests/test_string_replace.py
new file mode 100644
index 00000000000..f20edf6a506
--- /dev/null
+++ b/python/cudf/cudf/pylibcudf_tests/test_string_replace.py
@@ -0,0 +1,126 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pytest
+from utils import assert_column_eq
+
+import cudf._lib.pylibcudf as plc
+
+
+@pytest.fixture(scope="module")
+def data_col():
+    pa_data_col = pa.array(
+        ["a", "c", "A", "aa", None, "aaaaaaaaa", "AAAA", "ÁÁÁÁ"],
+        type=pa.string(),
+    )
+    return pa_data_col, plc.interop.from_arrow(pa_data_col)
+
+
+@pytest.fixture(scope="module", params=["a", "c", "A", "Á", "aa", "ÁÁÁ"])
+def scalar_repl_target(request):
+    pa_target = pa.scalar(request.param, type=pa.string())
+    return request.param, plc.interop.from_arrow(pa_target)
+
+
+@pytest.fixture(scope="module", params=["b", "B", "", "B́"])
+def scalar_repl(request):
+    pa_repl = pa.scalar(request.param, type=pa.string())
+    return request.param, plc.interop.from_arrow(pa_repl)
+
+
+@pytest.fixture(
+    scope="module",
+    params=[
+        ["a", "c", "A", "ÁÁÁÁ"],
+    ],
+)
+def col_repl_target(request):
+    pa_target = pa.array(request.param, type=pa.string())
+    return (pa_target, plc.interop.from_arrow(pa_target))
+
+
+@pytest.fixture(
+    scope="module",
+    params=[
+        [
+            "",
+            "z",
+            "XX",
+            "blahblah",
+        ]
+    ],
+)
+def col_repl(request):
+    pa_repl = pa.array(request.param, type=pa.string())
+    return (pa_repl, plc.interop.from_arrow(pa_repl))
+
+
+@pytest.mark.parametrize("maxrepl", [-1, 1, 2, 10])
+def test_replace(data_col, scalar_repl_target, scalar_repl, maxrepl):
+    pa_data_col, plc_data_col = data_col
+    pa_target, plc_target = scalar_repl_target
+    pa_repl, plc_repl = scalar_repl
+    got = plc.strings.replace.replace(
+        plc_data_col, plc_target, plc_repl, maxrepl
+    )
+
+    expected = pa.compute.replace_substring(
+        pa_data_col,
+        pattern=pa_target,
+        replacement=pa_repl,
+        max_replacements=maxrepl,
+    )
+
+    assert_column_eq(expected, got)
+
+
+@pytest.mark.parametrize("startstop", [(0, -1), (0, 0), (1, 3)])
+def test_replace_slice(data_col, scalar_repl, startstop):
+    pa_data_col, plc_data_col = data_col
+    pa_repl, plc_repl = scalar_repl
+    start, stop = startstop
+    got = plc.strings.replace.replace_slice(
+        plc_data_col, plc_repl, start, stop
+    )
+
+    if stop == -1:
+        # pyarrow doesn't support -1 as stop, so just set to really big number
+
+        # TODO: once libcudf's count_characters() is migrated, we can call
+        # count_characters on the input, take the max and set stop to that
+        stop = 1000
+
+    expected = pa.compute.utf8_replace_slice(pa_data_col, start, stop, pa_repl)
+
+    assert_column_eq(expected, got)
+
+
+def test_replace_col(data_col, col_repl_target, col_repl):
+    pa_data_col, plc_data_col = data_col
+    pa_target, plc_target = col_repl_target
+    pa_repl, plc_repl = col_repl
+    got = plc.strings.replace.replace_multiple(
+        plc_data_col, plc_target, plc_repl
+    )
+
+    # There's nothing in pyarrow that does string replace with columns
+    # for targets/repls, so let's implement our own in python
+
+    def replace_list(elem, targets, repls):
+        for target, repl in zip(targets, repls):
+            res = elem.replace(target, repl)
+            if res != elem:
+                return res
+
+    targets = pa_target.to_pylist()
+    repls = pa_repl.to_pylist()
+
+    expected = pa.array(
+        [
+            replace_list(elem, targets, repls) if elem is not None else None
+            for elem in pa_data_col.to_pylist()
+        ],
+        type=pa.string(),
+    )
+
+    assert_column_eq(expected, got)

From 57aeeb78d85e169ac18b82f51d2b1cbd01b0608d Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 5 Jun 2024 06:49:57 -1000
Subject: [PATCH 304/842] Make Frame._dtype an iterator instead of a dict
 (#15920)

A lot of the usages of `Frame._dtype` didn't require the previous `dict` return type since that was just re-iterated over anyways.

Also removed a redundant `tuple` call in `Frame._column_names` and `Frame._columns`

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/15920
---
 python/cudf/cudf/core/dataframe.py       |  4 ++--
 python/cudf/cudf/core/frame.py           | 16 +++++++---------
 python/cudf/cudf/core/groupby/groupby.py | 16 +++-------------
 python/cudf/cudf/core/indexed_frame.py   | 10 +++++-----
 python/cudf/cudf/io/csv.py               |  5 ++---
 python/cudf/cudf/io/json.py              |  5 ++---
 6 files changed, 21 insertions(+), 35 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index c8f1e872300..9307267b227 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -1231,7 +1231,7 @@ def dtypes(self):
         string              object
         dtype: object
         """
-        return pd.Series(self._dtypes, dtype="object")
+        return pd.Series(dict(self._dtypes), dtype="object")
 
     @property
     def ndim(self) -> int:
@@ -2834,7 +2834,7 @@ def reindex(
 
         return df._reindex(
             column_names=columns,
-            dtypes=self._dtypes,
+            dtypes=dict(self._dtypes),
             deep=copy,
             index=index,
             inplace=False,
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 7326696c994..af8886a44a6 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -79,18 +79,16 @@ def _num_rows(self) -> int:
         return self._data.nrows
 
     @property
-    def _column_names(self) -> Tuple[Any, ...]:  # TODO: Tuple[str]?
-        return tuple(self._data.names)
+    def _column_names(self) -> Tuple[Any, ...]:
+        return self._data.names
 
     @property
-    def _columns(self) -> Tuple[Any, ...]:  # TODO: Tuple[Column]?
-        return tuple(self._data.columns)
+    def _columns(self) -> Tuple[ColumnBase, ...]:
+        return self._data.columns
 
     @property
-    def _dtypes(self):
-        return dict(
-            zip(self._data.names, (col.dtype for col in self._data.columns))
-        )
+    def _dtypes(self) -> abc.Iterator:
+        return zip(self._data.names, (col.dtype for col in self._data.columns))
 
     @property
     def ndim(self) -> int:
@@ -1969,7 +1967,7 @@ def __dask_tokenize__(self):
 
         return [
             type(self),
-            str(self._dtypes),
+            str(dict(self._dtypes)),
             normalize_token(self.to_pandas()),
         ]
 
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index ac8b381cbec..aa96051ea51 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -22,12 +22,7 @@
 from cudf._lib.types import size_type_dtype
 from cudf._typing import AggType, DataFrameOrSeries, MultiColumnAggType
 from cudf.api.extensions import no_default
-from cudf.api.types import (
-    is_bool_dtype,
-    is_float_dtype,
-    is_list_like,
-    is_numeric_dtype,
-)
+from cudf.api.types import is_bool_dtype, is_list_like, is_numeric_dtype
 from cudf.core._compat import PANDAS_LT_300
 from cudf.core.abc import Serializable
 from cudf.core.column.column import ColumnBase, StructDtype, as_column
@@ -335,12 +330,8 @@ def dtypes(self):
             FutureWarning,
         )
         index = self.grouping.keys.unique().sort_values().to_pandas()
-        obj_dtypes = self.obj._dtypes
         return pd.DataFrame(
-            {
-                name: [obj_dtypes[name]] * len(index)
-                for name in self.obj._data.names
-            },
+            {name: [dtype] * len(index) for name, dtype in self.obj._dtypes},
             index=index,
         )
 
@@ -499,8 +490,7 @@ def rank(
         # treats NaNs the way we treat nulls.
         if cudf.get_option("mode.pandas_compatible"):
             if any(
-                is_float_dtype(typ)
-                for typ in self.grouping.values._dtypes.values()
+                col.dtype.kind == "f" for col in self.grouping.values._columns
             ):
                 raise NotImplementedError(
                     "NaNs are not supported in groupby.rank."
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 688b268d478..ecfcec15337 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -891,7 +891,7 @@ def replace(
             ) = _get_replacement_values_for_columns(
                 to_replace=to_replace,
                 value=value,
-                columns_dtype_map=self._dtypes,
+                columns_dtype_map=dict(self._dtypes),
             )
 
             for name, col in self._data.items():
@@ -6313,11 +6313,11 @@ def __dask_tokenize__(self):
 
         return [
             type(self),
-            str(self._dtypes),
+            str(dict(self._dtypes)),
             *[
-                normalize_token(cat.categories)
-                for cat in self._dtypes.values()
-                if cat == "category"
+                normalize_token(col.dtype.categories)
+                for col in self._columns
+                if col.dtype == "category"
             ],
             normalize_token(self.index),
             normalize_token(self.hash_values().values_host),
diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py
index 3eeeac405b3..f07764e2ce4 100644
--- a/python/cudf/cudf/io/csv.py
+++ b/python/cudf/cudf/io/csv.py
@@ -132,10 +132,9 @@ def read_csv(
         # There exists some dtypes in the result columns that is inferred.
         # Find them and map them to the default dtypes.
         specified_dtypes = {} if dtype is None else dtype
-        df_dtypes = df._dtypes
         unspecified_dtypes = {
-            name: df_dtypes[name]
-            for name in df._column_names
+            name: dtype
+            for name, dtype in df._dtypes
             if name not in specified_dtypes
         }
         default_dtypes = {}
diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py
index dd4a0d9eb07..fc3387d5117 100644
--- a/python/cudf/cudf/io/json.py
+++ b/python/cudf/cudf/io/json.py
@@ -147,10 +147,9 @@ def read_json(
         # There exists some dtypes in the result columns that is inferred.
         # Find them and map them to the default dtypes.
         specified_dtypes = {} if dtype is True else dtype
-        df_dtypes = df._dtypes
         unspecified_dtypes = {
-            name: df_dtypes[name]
-            for name in df._column_names
+            name: dtype
+            for name, dtype in df._dtypes
             if name not in specified_dtypes
         }
         default_dtypes = {}

From 20aa4442d27ca858796c7890ad0542dbaee542e1 Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Wed, 5 Jun 2024 15:25:51 -0400
Subject: [PATCH 305/842] DOC: Add documentation for cudf.pandas in the
 Developer Guide (#15889)

This PR provides documentation for cudf.pandas in the Developer Guide. It will describe the fast-slow proxy wrapping scheme as well as document the `CUDF_PANDAS_DEBUGGING` environment variable created in PR #15837 for issue #14975.

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/15889
---
 .../source/developer_guide/cudf_pandas.md     | 121 ++++++++++++++++++
 docs/cudf/source/developer_guide/index.md     |   1 +
 2 files changed, 122 insertions(+)
 create mode 100644 docs/cudf/source/developer_guide/cudf_pandas.md

diff --git a/docs/cudf/source/developer_guide/cudf_pandas.md b/docs/cudf/source/developer_guide/cudf_pandas.md
new file mode 100644
index 00000000000..aeb43f66b2d
--- /dev/null
+++ b/docs/cudf/source/developer_guide/cudf_pandas.md
@@ -0,0 +1,121 @@
+# cudf.pandas
+The use of the cuDF pandas accelerator mode (`cudf.pandas`) is explained [in the user guide](../cudf_pandas/index.rst).
+The purpose of this document is to explain how the fast-slow proxy mechanism works and document internal environment variables that can be used to debug `cudf.pandas` itself.
+
+## fast-slow proxy mechanism
+`cudf.pandas` works by wrapping each Pandas type and its corresponding cuDF type in a new proxy type also known as a fast-slow proxy type.
+The purpose of proxy types is to attempt computations on the fast (cuDF) object first, and then fall back to running on the slow (Pandas) object if the fast version fails.
+
+### Types:
+#### Wrapped Types and Proxy Types
+The "wrapped" types/classes are the Pandas and cuDF specific types that have been wrapped into proxy types.
+Wrapped objects and proxy objects are instances of wrapped types and proxy types, respectively.
+In the snippet below `s1` and `s2` are wrapped objects and `s3` is a fast-slow proxy object.
+Also note that the module `xpd` is a wrapped module and contains cuDF and Pandas modules as attributes.
+  ```python
+  import cudf.pandas
+  cudf.pandas.install()
+  import pandas as xpd
+
+  cudf = xpd._fsproxy_fast
+  pd = xpd._fsproxy_slow
+
+  s1 = cudf.Series([1,2])
+  s2 = pd.Series([1,2])
+  s3 = xpd.Series([1,2])
+  ```
+
+```{note}
+Note that users should never have to interact with the wrapped objects directly in this way.
+This code is purely for demonstrative purposes.
+```
+
+#### The Different Kinds of Proxy Types
+In `cudf.pandas`, there are two main kinds of proxy types: final types and intermediate types.
+
+##### Final and Intermediate Proxy Types
+Final types are types for which known operations exist for converting an object of a "fast" type to a "slow" type and vice versa.
+For example, `cudf.DataFrame` can be converted to Pandas using the method `to_pandas`, and `pd.DataFrame` can be converted to cuDF using the function `cudf.from_pandas`.
+Intermediate types are the types of the results of operations invoked on final types.
+For example, `xpd.DataFrameGroupBy` is an intermediate type that will be created during a groupby operation on the final type `xpd.DataFrame`.
+
+##### Attributes and Callable Proxy Types
+Final proxy types are typically classes or modules, both of which have attributes.
+Classes also have methods.
+These attributes and methods must be wrapped as well to support the fast-slow proxy scheme.
+
+#### Creating New Proxy Types
+`_FinalProxy` and `_IntermediateProxy` types are created using the functions `make_final_proxy_type` and `make_intermediate_proxy` type, respectively.
+Creating a new final type looks like this.
+
+```python
+DataFrame = make_final_proxy_type(
+    "DataFrame",
+    cudf.DataFrame,
+    pd.DataFrame,
+    fast_to_slow=lambda fast: fast.to_pandas(),
+    slow_to_fast=cudf.from_pandas,
+)
+```
+
+### The Fallback Mechanism
+Proxied calls are implemented with fallback via [`_fast_slow_function_call`](https://github.com/rapidsai/cudf/blob/57aeeb78d85e169ac18b82f51d2b1cbd01b0608d/python/cudf/cudf/pandas/fast_slow_proxy.py#L869). This implements the mechanism by which we attempt operations the fast way (using cuDF) and then fall back to the slow way (using Pandas) on failure.
+The function looks like this:
+```python
+def _fast_slow_function_call(func: Callable, *args, **kwargs):
+    try:
+        ...
+        fast_args, fast_kwargs = _fast_arg(args), _fast_arg(kwargs)
+        result = func(*fast_args, **fast_kwargs)
+        ...
+    except Exception:
+        ...
+        slow_args, slow_kwargs = _slow_arg(args), _slow_arg(kwargs)
+        result = func(*slow_args, **slow_kwargs)
+        ...
+    return _maybe_wrap_result(result, func, *args, **kwargs), fast
+```
+As we can see the function attempts to call `func` the fast way using cuDF and if any `Exception` occurs, it calls the function using Pandas.
+In essence, this `try-except` is what allows `cudf.pandas` to support the bulk of the Pandas API.
+
+At the end, the function wraps the result from either path in a fast-slow proxy object, if necessary.
+
+#### Converting Proxy Objects
+Note that before the `func` is called, the proxy object and its attributes need to be converted to either their cuDF or Pandas implementations.
+This conversion is handled in the function `_transform_arg` which both `_fast_arg` and `_slow_arg` call.
+
+`_transform_arg` is a recursive function that will call itself depending on the type or argument passed to it (eg. `_transform_arg` is called for each element in a list of arguments).
+
+### Using Metaclasses
+`cudf.pandas` uses a [metaclass](https://docs.python.org/3/glossary.html#term-metaclass) called (`_FastSlowProxyMeta`) to find class attributes and classmethods of fast-slow proxy types.
+For example, in the snippet below, the `xpd.Series` type is an instance of `_FastSlowProxyMeta`.
+Therefore we can access the property `_fsproxy_fast` defined in the metaclass.
+```python
+import cudf.pandas
+cudf.pandas.install()
+import pandas as xpd
+
+print(xpd.Series._fsproxy_fast) # output is cudf.core.series.Series
+```
+
+## debugging `cudf.pandas`
+Several environment variables are available for debugging purposes.
+
+Setting the environment variable `CUDF_PANDAS_DEBUGGING` produces a warning when the results from cuDF and Pandas differ from one another.
+For example, the snippet below produces the warning below.
+```python
+import cudf.pandas
+cudf.pandas.install()
+import pandas as pd
+import numpy as np
+
+setattr(pd.Series.mean, "_fsproxy_slow", lambda self, *args, **kwargs: np.float64(1))
+s = pd.Series([1,2,3])
+s.mean()
+```
+```
+UserWarning: The results from cudf and pandas were different. The exception was
+Arrays are not almost equal to 7 decimals
+ ACTUAL: 1.0
+ DESIRED: 2.0.
+```
diff --git a/docs/cudf/source/developer_guide/index.md b/docs/cudf/source/developer_guide/index.md
index 5cafa8f784c..5e099631fc5 100644
--- a/docs/cudf/source/developer_guide/index.md
+++ b/docs/cudf/source/developer_guide/index.md
@@ -27,4 +27,5 @@ testing
 benchmarking
 options
 pylibcudf
+cudf_pandas
 ```

From d91380ef393e9156c34a078998041a6affca7923 Mon Sep 17 00:00:00 2001
From: Robert Maynard <rmaynard@nvidia.com>
Date: Wed, 5 Jun 2024 21:16:29 -0400
Subject: [PATCH 306/842] Allow tests to be built when stream util is disabled
 (#15933)

Allows cudf to be built with `BUILD_SHARED_LIBS=OFF`, `CUDA_STATIC_RUNTIME=ON` and tests enabled

Authors:
  - Robert Maynard (https://github.com/robertmaynard)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Nghia Truong (https://github.com/ttnghia)
  - Gera Shegalov (https://github.com/gerashegalov)
  - Mike Wilson (https://github.com/hyperbolic2346)

URL: https://github.com/rapidsai/cudf/pull/15933
---
 cpp/tests/CMakeLists.txt | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 2f2c12f265c..a0d9083c4a4 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -68,12 +68,14 @@ function(ConfigureTest CMAKE_TEST_NAME)
     INSTALL_COMPONENT_SET testing
   )
 
-  set_tests_properties(
-    ${CMAKE_TEST_NAME}
-    PROPERTIES
-      ENVIRONMENT
-      "GTEST_CUDF_STREAM_MODE=new_${_CUDF_TEST_STREAM_MODE}_default;LD_PRELOAD=$<TARGET_FILE:cudf_identify_stream_usage_mode_${_CUDF_TEST_STREAM_MODE}>"
-  )
+  if(CUDF_BUILD_STREAMS_TEST_UTIL)
+    set_tests_properties(
+      ${CMAKE_TEST_NAME}
+      PROPERTIES
+        ENVIRONMENT
+        "GTEST_CUDF_STREAM_MODE=new_${_CUDF_TEST_STREAM_MODE}_default;LD_PRELOAD=$<TARGET_FILE:cudf_identify_stream_usage_mode_${_CUDF_TEST_STREAM_MODE}>"
+    )
+  endif()
 endfunction()
 
 # ##################################################################################################
@@ -401,14 +403,10 @@ ConfigureTest(SPAN_TEST utilities_tests/span_tests.cu)
 ConfigureTest(SPAN_TEST_DEVICE_VECTOR utilities_tests/span_tests.cu)
 
 # Overwrite the environments set by ConfigureTest
-set_tests_properties(
-  SPAN_TEST
-  PROPERTIES
-    ENVIRONMENT
-    "GTEST_FILTER=-${_allowlist_filter};GTEST_CUDF_STREAM_MODE=new_cudf_default;LD_PRELOAD=$<TARGET_FILE:cudf_identify_stream_usage_mode_cudf>"
-)
-set_tests_properties(
-  SPAN_TEST_DEVICE_VECTOR PROPERTIES ENVIRONMENT "GTEST_FILTER=${_allowlist_filter}"
+set_property(
+  TEST SPAN_TEST SPAN_TEST_DEVICE_VECTOR
+  APPEND
+  PROPERTY ENVIRONMENT "GTEST_FILTER=-${_allowlist_filter}"
 )
 
 # ##################################################################################################
@@ -671,9 +669,11 @@ target_include_directories(JIT_PARSER_TEST PRIVATE "$<BUILD_INTERFACE:${CUDF_SOU
 
 # ##################################################################################################
 # * stream testing ---------------------------------------------------------------------------------
-ConfigureTest(
-  STREAM_IDENTIFICATION_TEST identify_stream_usage/test_default_stream_identification.cu
-)
+if(CUDF_BUILD_STREAMS_TEST_UTIL)
+  ConfigureTest(
+    STREAM_IDENTIFICATION_TEST identify_stream_usage/test_default_stream_identification.cu
+  )
+endif()
 
 ConfigureTest(STREAM_BINARYOP_TEST streams/binaryop_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_CONCATENATE_TEST streams/concatenate_test.cpp STREAM_MODE testing)

From 7fd6918f9f4bbfc499bc60a3532a464c357da4f4 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com>
Date: Wed, 5 Jun 2024 20:48:10 -0500
Subject: [PATCH 307/842] Migrate strings `contains` operations to `pylibcudf`
 (#15880)

This PR creates pylibcudf strings `contains` APIs and migrates the cuDF cython to leverage them. Part of https://github.com/rapidsai/cudf/issues/15162.

Authors:
  - https://github.com/brandon-b-miller

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/15880
---
 .../api_docs/pylibcudf/strings/contains.rst   |  6 ++
 .../api_docs/pylibcudf/strings/index.rst      |  1 +
 .../pylibcudf/libcudf/strings/CMakeLists.txt  |  2 +-
 .../pylibcudf/libcudf/strings/regex_flags.pxd | 13 +++--
 .../pylibcudf/libcudf/strings/regex_flags.pyx |  0
 .../_lib/pylibcudf/strings/CMakeLists.txt     |  4 +-
 .../cudf/_lib/pylibcudf/strings/__init__.pxd  | 11 +++-
 .../cudf/_lib/pylibcudf/strings/__init__.py   | 11 +++-
 .../cudf/_lib/pylibcudf/strings/contains.pxd  |  7 +++
 .../cudf/_lib/pylibcudf/strings/contains.pyx  | 41 ++++++++++++++
 .../_lib/pylibcudf/strings/regex_flags.pxd    |  2 +
 .../_lib/pylibcudf/strings/regex_flags.pyx    |  4 ++
 .../_lib/pylibcudf/strings/regex_program.pxd  | 10 ++++
 .../_lib/pylibcudf/strings/regex_program.pyx  | 37 +++++++++++++
 python/cudf/cudf/_lib/strings/contains.pyx    | 23 +++-----
 .../pylibcudf_tests/test_regex_program.py     | 13 +++++
 .../pylibcudf_tests/test_string_contains.py   | 55 +++++++++++++++++++
 17 files changed, 215 insertions(+), 25 deletions(-)
 create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/contains.rst
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_flags.pyx
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/contains.pxd
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/contains.pyx
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pxd
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pyx
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pxd
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pyx
 create mode 100644 python/cudf/cudf/pylibcudf_tests/test_regex_program.py
 create mode 100644 python/cudf/cudf/pylibcudf_tests/test_string_contains.py

diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/contains.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/contains.rst
new file mode 100644
index 00000000000..e5745331bc7
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/contains.rst
@@ -0,0 +1,6 @@
+========
+contains
+========
+
+.. automodule:: cudf._lib.pylibcudf.strings.contains
+   :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst
index 8970fc80c0b..bfaef732555 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst
@@ -4,4 +4,5 @@ strings
 .. toctree::
     :maxdepth: 1
 
+    contains
     replace
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/CMakeLists.txt
index 930c22781d0..bd6e2e0af02 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/CMakeLists.txt
@@ -12,7 +12,7 @@
 # the License.
 # =============================================================================
 
-set(cython_sources char_types.pyx)
+set(cython_sources char_types.pyx regex_flags.pyx)
 
 set(linked_libraries cudf::cudf)
 
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_flags.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_flags.pxd
index 2a5701fa6a3..41617f157b7 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_flags.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_flags.pxd
@@ -1,9 +1,12 @@
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
+
+from libc.stdint cimport int32_t
+
 
 cdef extern from "cudf/strings/regex/flags.hpp" \
         namespace "cudf::strings" nogil:
 
-    ctypedef enum regex_flags:
-        DEFAULT 'cudf::strings::regex_flags::DEFAULT'
-        MULTILINE  'cudf::strings::regex_flags::MULTILINE'
-        DOTALL 'cudf::strings::regex_flags::DOTALL'
+    cpdef enum class regex_flags(int32_t):
+        DEFAULT
+        MULTILINE
+        DOTALL
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_flags.pyx b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_flags.pyx
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt
index c9a983e24f4..cb7f71b1912 100644
--- a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt
@@ -12,7 +12,9 @@
 # the License.
 # =============================================================================
 
-set(cython_sources capitalize.pyx case.pyx char_types.pyx find.pyx replace.pyx)
+set(cython_sources capitalize.pyx case.pyx char_types.pyx contains.pyx find.pyx regex_flags.pyx
+                   regex_program.pyx replace.pyx
+)
 
 set(linked_libraries cudf::cudf)
 rapids_cython_create_modules(
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd
index 7563df8a107..959aa94737d 100644
--- a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd
@@ -1,3 +1,12 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from . cimport capitalize, case, char_types, find, replace
+from . cimport (
+    capitalize,
+    case,
+    char_types,
+    contains,
+    find,
+    regex_flags,
+    regex_program,
+    replace,
+)
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py
index cb4f0e38f97..b7384913286 100644
--- a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py
+++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py
@@ -1,3 +1,12 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from . import capitalize, case, char_types, find, replace
+from . import (
+    capitalize,
+    case,
+    char_types,
+    contains,
+    find,
+    regex_flags,
+    regex_program,
+    replace,
+)
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/contains.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/contains.pxd
new file mode 100644
index 00000000000..275aa95d97e
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/strings/contains.pxd
@@ -0,0 +1,7 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from cudf._lib.pylibcudf.column cimport Column
+from cudf._lib.pylibcudf.strings.regex_program cimport RegexProgram
+
+
+cpdef Column contains_re(Column input, RegexProgram prog)
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/contains.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/contains.pyx
new file mode 100644
index 00000000000..8c598b7c953
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/strings/contains.pyx
@@ -0,0 +1,41 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+
+from cudf._lib.pylibcudf.column cimport Column
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.strings cimport contains as cpp_contains
+from cudf._lib.pylibcudf.strings.regex_program cimport RegexProgram
+
+
+cpdef Column contains_re(
+    Column input,
+    RegexProgram prog
+):
+    """Returns a boolean column identifying rows which match the given
+    regex_program object.
+
+    For details, see :cpp:func:`cudf::strings::contains_re`.
+
+    Parameters
+    ----------
+    input : Column
+        The input strings
+    prog : RegexProgram
+        Regex program instance
+
+    Returns
+    -------
+    pylibcudf.Column
+        New column of boolean results for each string
+    """
+
+    cdef unique_ptr[column] result
+
+    with nogil:
+        result = cpp_contains.contains_re(
+            input.view(),
+            prog.c_obj.get()[0]
+        )
+
+    return Column.from_libcudf(move(result))
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pxd
new file mode 100644
index 00000000000..79937bf574a
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pxd
@@ -0,0 +1,2 @@
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+from cudf._lib.pylibcudf.libcudf.strings.regex_flags cimport regex_flags
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pyx
new file mode 100644
index 00000000000..903c2ddd503
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pyx
@@ -0,0 +1,4 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from cudf._lib.pylibcudf.libcudf.strings.regex_flags import \
+    regex_flags as RegexFlags  # no-cython-lint
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pxd
new file mode 100644
index 00000000000..61ed268fb2d
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pxd
@@ -0,0 +1,10 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from libcpp.string cimport string
+
+from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program
+
+
+cdef class RegexProgram:
+    cdef unique_ptr[regex_program] c_obj
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pyx
new file mode 100644
index 00000000000..d605b0aba02
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pyx
@@ -0,0 +1,37 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+
+from libcpp.memory cimport unique_ptr
+from libcpp.string cimport string
+from libcpp.utility cimport move
+
+from cudf._lib.pylibcudf.libcudf.strings.regex_flags cimport regex_flags
+from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program
+
+from cudf._lib.pylibcudf.strings.regex_flags import RegexFlags
+from cudf._lib.pylibcudf.strings.regex_flags cimport regex_flags
+
+
+cdef class RegexProgram:
+
+    def __init__(self, *args, **kwargs):
+        raise ValueError("Do not instantiate RegexProgram directly, use create")
+
+    @staticmethod
+    def create(str pattern, int flags):
+        cdef unique_ptr[regex_program] c_prog
+        cdef regex_flags c_flags
+        cdef string c_pattern = pattern.encode()
+
+        cdef RegexProgram ret = RegexProgram.__new__(RegexProgram)
+        if isinstance(flags, object):
+            if isinstance(flags, (int, RegexFlags)):
+                c_flags = <regex_flags>flags
+                with nogil:
+                    c_prog = regex_program.create(c_pattern, c_flags)
+
+                ret.c_obj = move(c_prog)
+            else:
+                raise ValueError("flags must be of type RegexFlags")
+
+        return ret
diff --git a/python/cudf/cudf/_lib/strings/contains.pyx b/python/cudf/cudf/_lib/strings/contains.pyx
index 087acd8062d..502a1d14696 100644
--- a/python/cudf/cudf/_lib/strings/contains.pyx
+++ b/python/cudf/cudf/_lib/strings/contains.pyx
@@ -14,7 +14,6 @@ from cudf._lib.pylibcudf.libcudf.column.column cimport column
 from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
 from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
 from cudf._lib.pylibcudf.libcudf.strings.contains cimport (
-    contains_re as cpp_contains_re,
     count_re as cpp_count_re,
     like as cpp_like,
     matches_re as cpp_matches_re,
@@ -23,6 +22,9 @@ from cudf._lib.pylibcudf.libcudf.strings.regex_flags cimport regex_flags
 from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program
 from cudf._lib.scalar cimport DeviceScalar
 
+from cudf._lib.pylibcudf.strings import contains
+from cudf._lib.pylibcudf.strings.regex_program import RegexProgram
+
 
 @acquire_spill_lock()
 def contains_re(Column source_strings, object reg_ex, uint32_t flags):
@@ -30,21 +32,10 @@ def contains_re(Column source_strings, object reg_ex, uint32_t flags):
     Returns a Column of boolean values with True for `source_strings`
     that contain regular expression `reg_ex`.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
-    cdef string reg_ex_string = <string>str(reg_ex).encode()
-    cdef regex_flags c_flags = <regex_flags>flags
-    cdef unique_ptr[regex_program] c_prog
-
-    with nogil:
-        c_prog = move(regex_program.create(reg_ex_string, c_flags))
-        c_result = move(cpp_contains_re(
-            source_view,
-            dereference(c_prog)
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    prog = RegexProgram.create(str(reg_ex), flags)
+    return Column.from_pylibcudf(
+        contains.contains_re(source_strings.to_pylibcudf(mode="read"), prog)
+    )
 
 
 @acquire_spill_lock()
diff --git a/python/cudf/cudf/pylibcudf_tests/test_regex_program.py b/python/cudf/cudf/pylibcudf_tests/test_regex_program.py
new file mode 100644
index 00000000000..3a9bcec3616
--- /dev/null
+++ b/python/cudf/cudf/pylibcudf_tests/test_regex_program.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pytest
+
+import cudf._lib.pylibcudf as plc
+
+
+@pytest.mark.parametrize("pat", ["(", "*", "\\"])
+def test_regex_program_invalid(pat):
+    with pytest.raises(RuntimeError):
+        plc.strings.regex_program.RegexProgram.create(
+            pat, plc.strings.regex_flags.RegexFlags.DEFAULT
+        )
diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_contains.py b/python/cudf/cudf/pylibcudf_tests/test_string_contains.py
new file mode 100644
index 00000000000..8cdb6f7c521
--- /dev/null
+++ b/python/cudf/cudf/pylibcudf_tests/test_string_contains.py
@@ -0,0 +1,55 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pytest
+from utils import assert_column_eq
+
+import cudf._lib.pylibcudf as plc
+
+
+@pytest.fixture(scope="module")
+def pa_target_col():
+    return pa.array(
+        ["AbC", "de", "FGHI", "j", "kLm", "nOPq", None, "RsT", None, "uVw"]
+    )
+
+
+@pytest.fixture(scope="module")
+def plc_target_col(pa_target_col):
+    return plc.interop.from_arrow(pa_target_col)
+
+
+@pytest.fixture(
+    params=[
+        "A",
+        "de",
+        ".*",
+        "^a",
+        "^A",
+        "[^a-z]",
+        "[a-z]{3,}",
+        "^[A-Z]{2,}",
+        "j|u",
+    ],
+    scope="module",
+)
+def pa_target_scalar(request):
+    return pa.scalar(request.param, type=pa.string())
+
+
+@pytest.fixture(scope="module")
+def plc_target_pat(pa_target_scalar):
+    prog = plc.strings.regex_program.RegexProgram.create(
+        pa_target_scalar.as_py(), plc.strings.regex_flags.RegexFlags.DEFAULT
+    )
+    return prog
+
+
+def test_contains_re(
+    pa_target_col, plc_target_col, pa_target_scalar, plc_target_pat
+):
+    got = plc.strings.contains.contains_re(plc_target_col, plc_target_pat)
+    expected = pa.compute.match_substring_regex(
+        pa_target_col, pa_target_scalar.as_py()
+    )
+    assert_column_eq(got, expected)

From 3b734ec2fd591f037fe1d8f8ce424c7049cb5a3e Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Thu, 6 Jun 2024 04:41:01 -0700
Subject: [PATCH 308/842] Start migrating I/O to pylibcudf (#15899)

xref #15162

Starts migrating cudf I/O cython to use pylibcudf APIs, starting with avro.

Authors:
  - Thomas Li (https://github.com/lithomas1)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/15899
---
 .../user_guide/api_docs/pylibcudf/index.rst   |   1 +
 .../user_guide/api_docs/pylibcudf/io/avro.rst |   6 +
 .../api_docs/pylibcudf/io/index.rst           |  18 +++
 python/cudf/cudf/_lib/avro.pyx                |  50 ++-----
 python/cudf/cudf/_lib/csv.pyx                 |   8 +-
 python/cudf/cudf/_lib/parquet.pyx             |   2 +-
 .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt   |   1 +
 .../cudf/_lib/pylibcudf/io/CMakeLists.txt     |  25 ++++
 .../cudf/cudf/_lib/pylibcudf/io/__init__.pxd  |   4 +
 .../cudf/cudf/_lib/pylibcudf/io/__init__.py   |   4 +
 python/cudf/cudf/_lib/pylibcudf/io/avro.pxd   |  12 ++
 python/cudf/cudf/_lib/pylibcudf/io/avro.pyx   |  58 +++++++++
 python/cudf/cudf/_lib/pylibcudf/io/types.pxd  |  29 +++++
 python/cudf/cudf/_lib/pylibcudf/io/types.pyx  | 110 ++++++++++++++++
 .../cudf/_lib/pylibcudf/libcudf/io/orc.pxd    |   6 +-
 .../cudf/_lib/pylibcudf/libcudf/io/types.pxd  |  58 ++++-----
 python/cudf/cudf/_lib/utils.pxd               |   1 +
 python/cudf/cudf/_lib/utils.pyx               |  11 ++
 .../cudf/cudf/pylibcudf_tests/common/utils.py |  17 +++
 python/cudf/cudf/pylibcudf_tests/test_avro.py | 123 ++++++++++++++++++
 .../cudf/pylibcudf_tests/test_source_info.py  |  69 ++++++++++
 21 files changed, 541 insertions(+), 72 deletions(-)
 create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/io/avro.rst
 create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/io/__init__.py
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/io/avro.pxd
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/io/avro.pyx
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/io/types.pxd
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/io/types.pyx
 create mode 100644 python/cudf/cudf/pylibcudf_tests/test_avro.py
 create mode 100644 python/cudf/cudf/pylibcudf_tests/test_source_info.py

diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
index b6ad1157511..870ed8856d1 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
@@ -17,6 +17,7 @@ This page provides API documentation for pylibcudf.
     filling
     gpumemoryview
     groupby
+    io/index.rst
     join
     lists
     merge
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/avro.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/avro.rst
new file mode 100644
index 00000000000..495bd505fdc
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/avro.rst
@@ -0,0 +1,6 @@
+====
+Avro
+====
+
+.. automodule:: cudf._lib.pylibcudf.io.avro
+   :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst
new file mode 100644
index 00000000000..0d53ac92db9
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst
@@ -0,0 +1,18 @@
+===
+I/O
+===
+
+I/O Utility Classes
+===================
+
+.. automodule:: cudf._lib.pylibcudf.io.types
+   :members:
+
+
+I/O Functions
+=============
+
+.. toctree::
+    :maxdepth: 1
+
+    avro
diff --git a/python/cudf/cudf/_lib/avro.pyx b/python/cudf/cudf/_lib/avro.pyx
index ae17a5f1ab6..3c132b22880 100644
--- a/python/cudf/cudf/_lib/avro.pyx
+++ b/python/cudf/cudf/_lib/avro.pyx
@@ -1,20 +1,12 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
-from libcpp.string cimport string
-from libcpp.utility cimport move
-from libcpp.vector cimport vector
+from cudf._lib.utils cimport data_from_pylibcudf_io
 
-from cudf._lib.io.utils cimport make_source_info
-from cudf._lib.pylibcudf.libcudf.io.avro cimport (
-    avro_reader_options,
-    read_avro as libcudf_read_avro,
-)
-from cudf._lib.pylibcudf.libcudf.io.types cimport table_with_metadata
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
-from cudf._lib.utils cimport data_from_unique_ptr
+import cudf._lib.pylibcudf as plc
+from cudf._lib.pylibcudf.io.types import SourceInfo
 
 
-cpdef read_avro(datasource, columns=None, skip_rows=-1, num_rows=-1):
+cpdef read_avro(datasource, columns=None, skip_rows=0, num_rows=-1):
     """
     Cython function to call libcudf read_avro, see `read_avro`.
 
@@ -28,28 +20,14 @@ cpdef read_avro(datasource, columns=None, skip_rows=-1, num_rows=-1):
 
     if not isinstance(num_rows, int) or num_rows < -1:
         raise TypeError("num_rows must be an int >= -1")
-    if not isinstance(skip_rows, int) or skip_rows < -1:
-        raise TypeError("skip_rows must be an int >= -1")
-
-    cdef vector[string] c_columns
-    if columns is not None and len(columns) > 0:
-        c_columns.reserve(len(columns))
-        for col in columns:
-            c_columns.push_back(str(col).encode())
-
-    cdef avro_reader_options options = move(
-        avro_reader_options.builder(make_source_info([datasource]))
-        .columns(c_columns)
-        .skip_rows(<size_type> skip_rows)
-        .num_rows(<size_type> num_rows)
-        .build()
+    if not isinstance(skip_rows, int) or skip_rows < 0:
+        raise TypeError("skip_rows must be an int >= 0")
+
+    return data_from_pylibcudf_io(
+        plc.io.avro.read_avro(
+            SourceInfo([datasource]),
+            columns,
+            skip_rows,
+            num_rows
+        )
     )
-
-    cdef table_with_metadata c_result
-
-    with nogil:
-        c_result = move(libcudf_read_avro(options))
-
-    names = [info.name.decode() for info in c_result.metadata.schema_info]
-
-    return data_from_unique_ptr(move(c_result.tbl), column_names=names)
diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx
index aa771295607..0b0bbdb2589 100644
--- a/python/cudf/cudf/_lib/csv.pyx
+++ b/python/cudf/cudf/_lib/csv.pyx
@@ -151,14 +151,14 @@ cdef csv_reader_options make_csv_reader_options(
         )
 
     if quoting == 1:
-        c_quoting = quote_style.QUOTE_ALL
+        c_quoting = quote_style.ALL
     elif quoting == 2:
-        c_quoting = quote_style.QUOTE_NONNUMERIC
+        c_quoting = quote_style.NONNUMERIC
     elif quoting == 3:
-        c_quoting = quote_style.QUOTE_NONE
+        c_quoting = quote_style.NONE
     else:
         # Default value
-        c_quoting = quote_style.QUOTE_MINIMAL
+        c_quoting = quote_style.MINIMAL
 
     cdef csv_reader_options csv_reader_options_c = move(
         csv_reader_options.builder(c_source_info)
diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
index f0eef9be124..ac592cedaac 100644
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ b/python/cudf/cudf/_lib/parquet.pyx
@@ -491,7 +491,7 @@ def write_parquet(
             "Valid values are '1.0' and '2.0'"
         )
 
-    dict_policy = (
+    cdef cudf_io_types.dictionary_policy dict_policy = (
         cudf_io_types.dictionary_policy.ADAPTIVE
         if use_dictionary
         else cudf_io_types.dictionary_policy.NEVER
diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
index 7d0676f6def..6beb7b0f506 100644
--- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
@@ -50,3 +50,4 @@ link_to_pyarrow_headers(pylibcudf_interop)
 
 add_subdirectory(libcudf)
 add_subdirectory(strings)
+add_subdirectory(io)
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt
new file mode 100644
index 00000000000..2cfec101bab
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt
@@ -0,0 +1,25 @@
+# =============================================================================
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+# =============================================================================
+
+set(cython_sources avro.pyx types.pyx)
+
+set(linked_libraries cudf::cudf)
+rapids_cython_create_modules(
+  CXX
+  SOURCE_FILES "${cython_sources}"
+  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_io_ ASSOCIATED_TARGETS cudf
+)
+
+set(targets_using_arrow_headers pylibcudf_io_avro pylibcudf_io_types)
+link_to_pyarrow_headers("${targets_using_arrow_headers}")
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd
new file mode 100644
index 00000000000..250292746c1
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd
@@ -0,0 +1,4 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from . cimport avro, types
+from .types cimport SourceInfo, TableWithMetadata
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/__init__.py b/python/cudf/cudf/_lib/pylibcudf/io/__init__.py
new file mode 100644
index 00000000000..5242c741911
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/io/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from . import avro, types
+from .types import SourceInfo, TableWithMetadata
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/avro.pxd b/python/cudf/cudf/_lib/pylibcudf/io/avro.pxd
new file mode 100644
index 00000000000..3695f36a6e7
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/io/avro.pxd
@@ -0,0 +1,12 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from cudf._lib.pylibcudf.io.types cimport SourceInfo, TableWithMetadata
+from cudf._lib.pylibcudf.libcudf.io.avro cimport avro_reader_options
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
+
+
+cpdef TableWithMetadata read_avro(
+    SourceInfo source_info,
+    list columns = *,
+    size_type skip_rows = *,
+    size_type num_rows = *
+)
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx b/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx
new file mode 100644
index 00000000000..946e0896fc8
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx
@@ -0,0 +1,58 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.string cimport string
+from libcpp.utility cimport move
+from libcpp.vector cimport vector
+
+from cudf._lib.pylibcudf.io.types cimport SourceInfo, TableWithMetadata
+from cudf._lib.pylibcudf.libcudf.io.avro cimport (
+    avro_reader_options,
+    read_avro as cpp_read_avro,
+)
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
+
+
+cpdef TableWithMetadata read_avro(
+    SourceInfo source_info,
+    list columns = None,
+    size_type skip_rows = 0,
+    size_type num_rows = -1
+):
+    """
+    Reads an Avro dataset into a set of columns.
+
+    Parameters
+    ----------
+    source_info: SourceInfo
+        The SourceInfo object to read the avro dataset from.
+    columns: list, default None
+        Optional columns to read, if not provided, reads all columns in the file.
+    skip_rows: size_type, default 0
+        The number of rows to skip.
+    num_rows: size_type, default -1
+        The number of rows to read, after skipping rows.
+        If -1 is passed, all rows will be read.
+
+    Returns
+    -------
+    TableWithMetadata
+        The Table and its corresponding metadata that was read in.
+    """
+    cdef vector[string] c_columns
+    if columns is not None and len(columns) > 0:
+        c_columns.reserve(len(columns))
+        for col in columns:
+            c_columns.push_back(str(col).encode())
+
+    cdef avro_reader_options avro_opts = move(
+        avro_reader_options.builder(source_info.c_obj)
+        .columns(c_columns)
+        .skip_rows(skip_rows)
+        .num_rows(num_rows)
+        .build()
+    )
+
+    with nogil:
+        c_result = move(cpp_read_avro(avro_opts))
+
+    return TableWithMetadata.from_libcudf(c_result)
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pxd b/python/cudf/cudf/_lib/pylibcudf/io/types.pxd
new file mode 100644
index 00000000000..aa846a47343
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/io/types.pxd
@@ -0,0 +1,29 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from cudf._lib.pylibcudf.libcudf.io.types cimport (
+    column_encoding,
+    column_in_metadata,
+    column_name_info,
+    compression_type,
+    dictionary_policy,
+    io_type,
+    partition_info,
+    quote_style,
+    sink_info,
+    source_info,
+    statistics_freq,
+    table_input_metadata,
+    table_metadata,
+    table_with_metadata,
+)
+from cudf._lib.pylibcudf.table cimport Table
+
+
+cdef class TableWithMetadata:
+    cdef public Table tbl
+    cdef table_metadata metadata
+
+    @staticmethod
+    cdef TableWithMetadata from_libcudf(table_with_metadata& tbl)
+
+cdef class SourceInfo:
+    cdef source_info c_obj
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx
new file mode 100644
index 00000000000..cd777232b33
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx
@@ -0,0 +1,110 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.string cimport string
+from libcpp.utility cimport move
+from libcpp.vector cimport vector
+
+from cudf._lib.pylibcudf.libcudf.io.types cimport (
+    host_buffer,
+    source_info,
+    table_with_metadata,
+)
+
+import errno
+import io
+import os
+
+
+cdef class TableWithMetadata:
+    """A container holding a table and its associated metadata
+    (e.g. column names)
+
+    For details, see :cpp:class:`cudf::io::table_with_metadata`.
+    """
+
+    @property
+    def columns(self):
+        """
+        Return a list containing the columns of the table
+        """
+        return self.tbl.columns()
+
+    @property
+    def column_names(self):
+        """
+        Return a list containing the column names of the table
+        """
+        cdef list names = []
+        for col_info in self.metadata.schema_info:
+            # TODO: Handle nesting (columns with child columns)
+            assert col_info.children.size() == 0, "Child column names are not handled!"
+            names.append(col_info.name.decode())
+        return names
+
+    @staticmethod
+    cdef TableWithMetadata from_libcudf(table_with_metadata& tbl_with_meta):
+        """Create a Python TableWithMetadata from a libcudf table_with_metadata"""
+        cdef TableWithMetadata out = TableWithMetadata.__new__(TableWithMetadata)
+        out.tbl = Table.from_libcudf(move(tbl_with_meta.tbl))
+        out.metadata = tbl_with_meta.metadata
+        return out
+
+cdef class SourceInfo:
+    """A class containing details on a source to read from.
+
+    For details, see :cpp:class:`cudf::io::source_info`.
+
+    Parameters
+    ----------
+    sources : List[Union[str, os.PathLike, bytes, io.BytesIO]]
+        A homogeneous list of sources (this can be a string filename,
+        an os.PathLike, bytes, or an io.BytesIO) to read from.
+
+        Mixing different types of sources will raise a `ValueError`.
+    """
+
+    def __init__(self, list sources):
+        if not sources:
+            raise ValueError("Need to pass at least one source")
+
+        cdef vector[string] c_files
+
+        if isinstance(sources[0], (os.PathLike, str)):
+            c_files.reserve(len(sources))
+
+            for src in sources:
+                if not isinstance(src, (os.PathLike, str)):
+                    raise ValueError("All sources must be of the same type!")
+                if not os.path.isfile(src):
+                    raise FileNotFoundError(errno.ENOENT,
+                                            os.strerror(errno.ENOENT),
+                                            src)
+
+                c_files.push_back(<string> str(src).encode())
+
+            self.c_obj = move(source_info(c_files))
+            return
+
+        # TODO: host_buffer is deprecated API, use host_span instead
+        cdef vector[host_buffer] c_host_buffers
+        cdef const unsigned char[::1] c_buffer
+        cdef bint empty_buffer = False
+        if isinstance(sources[0], bytes):
+            empty_buffer = True
+            for buffer in sources:
+                if not isinstance(buffer, bytes):
+                    raise ValueError("All sources must be of the same type!")
+                if (len(buffer) > 0):
+                    c_buffer = buffer
+                    c_host_buffers.push_back(host_buffer(<char*>&c_buffer[0],
+                                                         c_buffer.shape[0]))
+                    empty_buffer = False
+        elif isinstance(sources[0], io.BytesIO):
+            for bio in sources:
+                if not isinstance(bio, io.BytesIO):
+                    raise ValueError("All sources must be of the same type!")
+                c_buffer = bio.getbuffer()  # check if empty?
+                c_host_buffers.push_back(host_buffer(<char*>&c_buffer[0],
+                                                     c_buffer.shape[0]))
+
+        self.c_obj = source_info(c_host_buffers)
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/orc.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/orc.pxd
index e553515dfdf..25f91849dea 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/orc.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/orc.pxd
@@ -94,7 +94,9 @@ cdef extern from "cudf/io/orc.hpp" \
         orc_writer_options_builder& compression(
             cudf_io_types.compression_type comp
         ) except +
-        orc_writer_options_builder& enable_statistics(bool val) except +
+        orc_writer_options_builder& enable_statistics(
+            cudf_io_types.statistics_freq val
+        ) except +
         orc_writer_options_builder& stripe_size_bytes(size_t val) except +
         orc_writer_options_builder& stripe_size_rows(size_type val) except +
         orc_writer_options_builder& row_index_stride(size_type val) except +
@@ -147,7 +149,7 @@ cdef extern from "cudf/io/orc.hpp" \
             cudf_io_types.compression_type comp
         ) except +
         chunked_orc_writer_options_builder& enable_statistics(
-            bool val
+            cudf_io_types.statistics_freq val
         ) except +
         orc_writer_options_builder& stripe_size_bytes(size_t val) except +
         orc_writer_options_builder& stripe_size_rows(size_type val) except +
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd
index 38fae1df1e5..8d87deb1472 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd
@@ -20,45 +20,45 @@ from cudf._lib.pylibcudf.libcudf.types cimport size_type
 cdef extern from "cudf/io/types.hpp" \
         namespace "cudf::io" nogil:
 
-    ctypedef enum quote_style:
-        QUOTE_MINIMAL "cudf::io::quote_style::MINIMAL"
-        QUOTE_ALL "cudf::io::quote_style::ALL"
-        QUOTE_NONNUMERIC "cudf::io::quote_style::NONNUMERIC"
-        QUOTE_NONE "cudf::io::quote_style::NONE"
-
-    ctypedef enum compression_type:
-        NONE "cudf::io::compression_type::NONE"
-        AUTO "cudf::io::compression_type::AUTO"
-        SNAPPY "cudf::io::compression_type::SNAPPY"
-        GZIP "cudf::io::compression_type::GZIP"
-        BZIP2 "cudf::io::compression_type::BZIP2"
-        BROTLI "cudf::io::compression_type::BROTLI"
-        ZIP "cudf::io::compression_type::ZIP"
-        XZ "cudf::io::compression_type::XZ"
-        ZLIB "cudf::io::compression_type::ZLIB"
-        LZ4 "cudf::io::compression_type::LZ4"
-        LZO "cudf::io::compression_type::LZO"
-        ZSTD "cudf::io::compression_type::ZSTD"
-
-    ctypedef enum io_type:
-        FILEPATH "cudf::io::io_type::FILEPATH"
-        HOST_BUFFER "cudf::io::io_type::HOST_BUFFER"
-        VOID "cudf::io::io_type::VOID"
-        USER_IMPLEMENTED "cudf::io::io_type::USER_IMPLEMENTED"
-
-    ctypedef enum statistics_freq:
+    cpdef enum class quote_style(int32_t):
+        MINIMAL
+        ALL
+        NONNUMERIC
+        NONE
+
+    cpdef enum class compression_type(int32_t):
+        NONE
+        AUTO
+        SNAPPY
+        GZIP
+        BZIP2
+        BROTLI
+        ZIP
+        XZ
+        ZLIB
+        LZ4
+        LZO
+        ZSTD
+
+    cpdef enum class io_type(int32_t):
+        FILEPATH
+        HOST_BUFFER
+        VOID
+        USER_IMPLEMENTED
+
+    cpdef enum class statistics_freq(int32_t):
         STATISTICS_NONE = 0,
         STATISTICS_ROWGROUP = 1,
         STATISTICS_PAGE = 2,
         STATISTICS_COLUMN = 3,
 
-    ctypedef enum dictionary_policy:
+    cpdef enum class dictionary_policy(int32_t):
         NEVER = 0,
         ADAPTIVE = 1,
         ALWAYS = 2,
 
     cdef extern from "cudf/io/types.hpp" namespace "cudf::io" nogil:
-        cpdef enum class column_encoding:
+        cpdef enum class column_encoding(int32_t):
             USE_DEFAULT = -1
             DICTIONARY = 0
             PLAIN = 1
diff --git a/python/cudf/cudf/_lib/utils.pxd b/python/cudf/cudf/_lib/utils.pxd
index c5a1e7552b9..99850d549a1 100644
--- a/python/cudf/cudf/_lib/utils.pxd
+++ b/python/cudf/cudf/_lib/utils.pxd
@@ -11,6 +11,7 @@ from cudf._lib.pylibcudf.libcudf.table.table cimport table, table_view
 cdef data_from_unique_ptr(
     unique_ptr[table] c_tbl, column_names, index_names=*)
 cdef data_from_pylibcudf_table(tbl, column_names, index_names=*)
+cdef data_from_pylibcudf_io(tbl_with_meta)
 cdef data_from_table_view(
     table_view tv, object owner, object column_names, object index_names=*)
 cdef table_view table_view_from_columns(columns) except *
diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx
index 4c4cd48d6ed..de6b9f690b6 100644
--- a/python/cudf/cudf/_lib/utils.pyx
+++ b/python/cudf/cudf/_lib/utils.pyx
@@ -315,6 +315,17 @@ cdef data_from_pylibcudf_table(tbl, column_names, index_names=None):
         index_names
     )
 
+cdef data_from_pylibcudf_io(tbl_with_meta):
+    """
+    Unpacks the TableWithMetadata from libcudf I/O
+    into a dict of columns and an Index (cuDF format)
+    """
+    return _data_from_columns(
+        columns=[Column.from_pylibcudf(plc) for plc in tbl_with_meta.columns],
+        column_names=tbl_with_meta.column_names,
+        index_names=None
+    )
+
 cdef columns_from_table_view(
     table_view tv,
     object owners,
diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py
index e00053529a8..54d38f1a8cf 100644
--- a/python/cudf/cudf/pylibcudf_tests/common/utils.py
+++ b/python/cudf/cudf/pylibcudf_tests/common/utils.py
@@ -63,6 +63,23 @@ def assert_table_eq(pa_table: pa.Table, plc_table: plc.Table) -> None:
         assert_column_eq(pa_col, plc_col)
 
 
+def assert_table_and_meta_eq(
+    plc_table_w_meta: plc.io.types.TableWithMetadata, pa_table: pa.Table
+) -> None:
+    """Verify that the pylibcudf TableWithMetadata and PyArrow table are equal"""
+
+    plc_table = plc_table_w_meta.tbl
+
+    plc_shape = (plc_table.num_rows(), plc_table.num_columns())
+    assert plc_shape == pa_table.shape
+
+    for plc_col, pa_col in zip(plc_table.columns(), pa_table.columns):
+        assert_column_eq(plc_col, pa_col)
+
+    # Check column name equality
+    assert plc_table_w_meta.column_names == pa_table.column_names
+
+
 def cudf_raises(expected_exception: BaseException, *args, **kwargs):
     # A simple wrapper around pytest.raises that defaults to looking for cudf exceptions
     match = kwargs.get("match", None)
diff --git a/python/cudf/cudf/pylibcudf_tests/test_avro.py b/python/cudf/cudf/pylibcudf_tests/test_avro.py
new file mode 100644
index 00000000000..d6cd86768cd
--- /dev/null
+++ b/python/cudf/cudf/pylibcudf_tests/test_avro.py
@@ -0,0 +1,123 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import io
+import itertools
+
+import fastavro
+import pyarrow as pa
+import pytest
+from utils import assert_table_and_meta_eq
+
+import cudf._lib.pylibcudf as plc
+
+avro_dtype_pairs = [
+    ("boolean", pa.bool_()),
+    ("int", pa.int32()),
+    ("long", pa.int64()),
+    ("float", pa.float32()),
+    ("double", pa.float64()),
+    ("bytes", pa.string()),
+    ("string", pa.string()),
+]
+
+
+@pytest.fixture(
+    scope="module", params=itertools.combinations(avro_dtype_pairs, 2)
+)
+def avro_dtypes(request):
+    return request.param
+
+
+@pytest.fixture
+def avro_dtype_data(avro_dtypes):
+    (avro_type1, _), (avro_type2, _) = avro_dtypes
+
+    def _get_data(avro_type):
+        if avro_type == "boolean":
+            return [True, False, True]
+        elif avro_type in {"int", "long"}:
+            return [1, 2, -1]
+        elif avro_type in {"float", "double"}:
+            return [1.0, 3.1415, -3.1415]
+        elif avro_type == "bytes":
+            return [b"a", b"b", b"c"]
+        elif avro_type == "string":
+            return ["Hello", "World!", ""]
+
+    return _get_data(avro_type1), _get_data(avro_type2)
+
+
+@pytest.fixture(
+    params=[
+        (0, 0),
+        (0, -1),
+        (1, -1),
+        (3, -1),
+    ]
+)
+def row_opts(request):
+    """
+    (skip_rows, num_rows) combos for the avro reader
+    """
+    return request.param
+
+
+@pytest.mark.parametrize("columns", [["prop1"], [], ["prop1", "prop2"]])
+@pytest.mark.parametrize("nullable", [True, False])
+def test_read_avro(avro_dtypes, avro_dtype_data, row_opts, columns, nullable):
+    (avro_type1, expected_type1), (avro_type2, expected_type2) = avro_dtypes
+
+    avro_type1 = avro_type1 if not nullable else ["null", avro_type1]
+    avro_type2 = avro_type2 if not nullable else ["null", avro_type2]
+
+    skip_rows, num_rows = row_opts
+
+    schema = fastavro.parse_schema(
+        {
+            "type": "record",
+            "name": "test",
+            "fields": [
+                {"name": "prop1", "type": avro_type1},
+                {"name": "prop2", "type": avro_type2},
+            ],
+        }
+    )
+
+    if nullable:
+        avro_dtype_data = (
+            avro_dtype_data[0] + [None],
+            avro_dtype_data[1] + [None],
+        )
+
+    records = [
+        {"prop1": val1, "prop2": val2} for val1, val2 in zip(*avro_dtype_data)
+    ]
+
+    buffer = io.BytesIO()
+    fastavro.writer(buffer, schema, records)
+    buffer.seek(0)
+
+    res = plc.io.avro.read_avro(
+        plc.io.types.SourceInfo([buffer]),
+        columns=columns,
+        skip_rows=skip_rows,
+        num_rows=num_rows,
+    )
+
+    expected = pa.Table.from_arrays(
+        [
+            pa.array(avro_dtype_data[0], type=expected_type1),
+            pa.array(avro_dtype_data[1], type=expected_type2),
+        ],
+        names=["prop1", "prop2"],
+    )
+
+    # Adjust for skip_rows/num_rows in result
+    length = num_rows if num_rows != -1 else None
+    expected = expected.slice(skip_rows, length=length)
+
+    # adjust for # of columns
+    if columns != []:
+        expected = expected.select(columns)
+
+    assert_table_and_meta_eq(res, expected)
diff --git a/python/cudf/cudf/pylibcudf_tests/test_source_info.py b/python/cudf/cudf/pylibcudf_tests/test_source_info.py
new file mode 100644
index 00000000000..71a3ecbcc30
--- /dev/null
+++ b/python/cudf/cudf/pylibcudf_tests/test_source_info.py
@@ -0,0 +1,69 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import io
+
+import pytest
+
+import cudf._lib.pylibcudf as plc
+
+
+@pytest.mark.parametrize(
+    "source", ["a.txt", b"hello world", io.BytesIO(b"hello world")]
+)
+def test_source_info_ctor(source, tmp_path):
+    if isinstance(source, str):
+        file = tmp_path / source
+        file.write_bytes("hello world".encode("utf-8"))
+        source = str(file)
+
+    plc.io.SourceInfo([source])
+
+    # TODO: test contents of source_info buffer is correct
+    # once buffers are exposed on python side
+
+
+@pytest.mark.parametrize(
+    "sources",
+    [
+        ["a.txt", "a.txt"],
+        [b"hello world", b"hello there"],
+        [io.BytesIO(b"hello world"), io.BytesIO(b"hello there")],
+    ],
+)
+def test_source_info_ctor_multiple(sources, tmp_path):
+    for i in range(len(sources)):
+        source = sources[i]
+        if isinstance(source, str):
+            file = tmp_path / source
+            file.write_bytes("hello world".encode("utf-8"))
+            sources[i] = str(file)
+
+    plc.io.SourceInfo(sources)
+
+    # TODO: test contents of source_info buffer is correct
+    # once buffers are exposed on python side
+
+
+@pytest.mark.parametrize(
+    "sources",
+    [
+        ["awef.txt", b"hello world", io.BytesIO(b"hello world")],
+        [b"hello world", b"hello there", "awef.txt"],
+        [
+            io.BytesIO(b"hello world"),
+            io.BytesIO(b"hello there"),
+            b"hello world",
+        ],
+    ],
+)
+def test_source_info_ctor_mixing_invalid(sources, tmp_path):
+    # Unlike the previous test
+    # don't create files so that they are missing
+    for i in range(len(sources)):
+        source = sources[i]
+        if isinstance(source, str):
+            file = tmp_path / source
+            file.write_bytes("hello world".encode("utf-8"))
+            sources[i] = str(file)
+    with pytest.raises(ValueError):
+        plc.io.SourceInfo(sources)

From d1e511edc88deb7604bed71b2689d72da0aed19a Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Thu, 6 Jun 2024 15:19:06 +0100
Subject: [PATCH 309/842] Introduce `NamedColumn` concept in cudf-polars
 (#15914)

Simplify name tracking in expression evaluation by only requiring names for columns when putting them in to a `DataFrame`. At the same time, this allows us to have one place where we broadcast-expand `Scalar`s to the size of the `DataFrame`, so we can expunge tracking them in the `DataFrame` itself.

Additionally, adapt to minor changes on the polars side in terms of translating the DSL: we no longer need to handle CSE expressions specially, and sorting by multiple keys takes a list of `descending` flags, rather than a single bool as previously.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Thomas Li (https://github.com/lithomas1)
  - Matthew Roeschke (https://github.com/mroeschke)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/15914
---
 .../cudf_polars/containers/__init__.py        |   4 +-
 .../cudf_polars/containers/column.py          |  78 ++++--
 .../cudf_polars/containers/dataframe.py       |  59 ++---
 python/cudf_polars/cudf_polars/dsl/expr.py    | 239 +++++++++++-------
 python/cudf_polars/cudf_polars/dsl/ir.py      | 176 ++++++++-----
 .../cudf_polars/cudf_polars/dsl/translate.py  | 106 +++++---
 .../cudf_polars/testing/asserts.py            |   6 +-
 .../cudf_polars/cudf_polars/utils/dtypes.py   |   3 +-
 .../cudf_polars/cudf_polars/utils/sorting.py  |  12 +-
 python/cudf_polars/docs/overview.md           | 101 +++++++-
 .../cudf_polars/tests/expressions/test_agg.py |   6 +-
 python/cudf_polars/tests/test_select.py       |  21 ++
 python/cudf_polars/tests/test_union.py        |   5 -
 13 files changed, 541 insertions(+), 275 deletions(-)

diff --git a/python/cudf_polars/cudf_polars/containers/__init__.py b/python/cudf_polars/cudf_polars/containers/__init__.py
index ef9d9ca61b6..ee69e748eb5 100644
--- a/python/cudf_polars/cudf_polars/containers/__init__.py
+++ b/python/cudf_polars/cudf_polars/containers/__init__.py
@@ -5,8 +5,8 @@
 
 from __future__ import annotations
 
-__all__: list[str] = ["DataFrame", "Column", "Scalar"]
+__all__: list[str] = ["DataFrame", "Column", "NamedColumn", "Scalar"]
 
-from cudf_polars.containers.column import Column
+from cudf_polars.containers.column import Column, NamedColumn
 from cudf_polars.containers.dataframe import DataFrame
 from cudf_polars.containers.scalar import Scalar
diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py
index 49034b5f5c8..575d15d3ece 100644
--- a/python/cudf_polars/cudf_polars/containers/column.py
+++ b/python/cudf_polars/cudf_polars/containers/column.py
@@ -13,24 +13,29 @@
 if TYPE_CHECKING:
     from typing_extensions import Self
 
-__all__: list[str] = ["Column"]
+__all__: list[str] = ["Column", "NamedColumn"]
 
 
 class Column:
-    """A column, a name, and sortedness."""
+    """A column with sortedness metadata."""
 
     obj: plc.Column
-    name: str
     is_sorted: plc.types.Sorted
     order: plc.types.Order
     null_order: plc.types.NullOrder
 
-    def __init__(self, column: plc.Column, name: str):
+    def __init__(
+        self,
+        column: plc.Column,
+        *,
+        is_sorted: plc.types.Sorted = plc.types.Sorted.NO,
+        order: plc.types.Order = plc.types.Order.ASCENDING,
+        null_order: plc.types.NullOrder = plc.types.NullOrder.BEFORE,
+    ):
         self.obj = column
-        self.name = name
-        self.is_sorted = plc.types.Sorted.NO
-        self.order = plc.types.Order.ASCENDING
-        self.null_order = plc.types.NullOrder.BEFORE
+        self.is_sorted = is_sorted
+        self.order = order
+        self.null_order = null_order
 
     def sorted_like(self, like: Column, /) -> Self:
         """
@@ -81,22 +86,20 @@ def set_sorted(
         self.null_order = null_order
         return self
 
-    def copy(self, *, new_name: str | None = None) -> Self:
+    def copy(self) -> Self:
         """
-        Return a shallow copy of the column.
-
-        Parameters
-        ----------
-        new_name
-            Optional new name for the copied column.
+        A shallow copy of the column.
 
         Returns
         -------
         New column sharing data with self.
         """
         return type(self)(
-            self.obj, self.name if new_name is None else new_name
-        ).sorted_like(self)
+            self.obj,
+            is_sorted=self.is_sorted,
+            order=self.order,
+            null_order=self.null_order,
+        )
 
     def mask_nans(self) -> Self:
         """Return a copy of self with nans masked out."""
@@ -117,3 +120,44 @@ def nan_count(self) -> int:
                 plc.DataType(plc.TypeId.INT32),
             )
         ).as_py()
+
+
+class NamedColumn(Column):
+    """A column with a name."""
+
+    name: str
+
+    def __init__(
+        self,
+        column: plc.Column,
+        name: str,
+        *,
+        is_sorted: plc.types.Sorted = plc.types.Sorted.NO,
+        order: plc.types.Order = plc.types.Order.ASCENDING,
+        null_order: plc.types.NullOrder = plc.types.NullOrder.BEFORE,
+    ) -> None:
+        super().__init__(
+            column, is_sorted=is_sorted, order=order, null_order=null_order
+        )
+        self.name = name
+
+    def copy(self, *, new_name: str | None = None) -> Self:
+        """
+        A shallow copy of the column.
+
+        Parameters
+        ----------
+        new_name
+            Optional new name for the copied column.
+
+        Returns
+        -------
+        New column sharing data with self.
+        """
+        return type(self)(
+            self.obj,
+            self.name if new_name is None else new_name,
+            is_sorted=self.is_sorted,
+            order=self.order,
+            null_order=self.null_order,
+        )
diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py
index de21a280020..eeaf181be0c 100644
--- a/python/cudf_polars/cudf_polars/containers/dataframe.py
+++ b/python/cudf_polars/cudf_polars/containers/dataframe.py
@@ -12,7 +12,7 @@
 
 import cudf._lib.pylibcudf as plc
 
-from cudf_polars.containers.column import Column
+from cudf_polars.containers.column import NamedColumn
 
 if TYPE_CHECKING:
     from collections.abc import Mapping, Sequence, Set
@@ -21,7 +21,7 @@
 
     import cudf
 
-    from cudf_polars.containers.scalar import Scalar
+    from cudf_polars.containers import Column
 
 
 __all__: list[str] = ["DataFrame"]
@@ -30,26 +30,20 @@
 class DataFrame:
     """A representation of a dataframe."""
 
-    columns: list[Column]
-    scalars: list[Scalar]
+    columns: list[NamedColumn]
     table: plc.Table | None
 
-    def __init__(self, columns: Sequence[Column], scalars: Sequence[Scalar]) -> None:
+    def __init__(self, columns: Sequence[NamedColumn]) -> None:
         self.columns = list(columns)
         self._column_map = {c.name: c for c in self.columns}
-        self.scalars = list(scalars)
-        if len(scalars) == 0:
-            self.table = plc.Table([c.obj for c in columns])
-        else:
-            self.table = None
+        self.table = plc.Table([c.obj for c in columns])
 
     def copy(self) -> Self:
         """Return a shallow copy of self."""
-        return type(self)(self.columns, self.scalars)
+        return type(self)(self.columns)
 
     def to_polars(self) -> pl.DataFrame:
         """Convert to a polars DataFrame."""
-        assert len(self.scalars) == 0
         return pl.from_arrow(
             plc.interop.to_arrow(
                 self.table,
@@ -83,8 +77,10 @@ def num_rows(self) -> int:
     def from_cudf(cls, df: cudf.DataFrame) -> Self:
         """Create from a cudf dataframe."""
         return cls(
-            [Column(c.to_pylibcudf(mode="read"), name) for name, c in df._data.items()],
-            [],
+            [
+                NamedColumn(c.to_pylibcudf(mode="read"), name)
+                for name, c in df._data.items()
+            ]
         )
 
     @classmethod
@@ -105,13 +101,16 @@ def from_table(cls, table: plc.Table, names: Sequence[str]) -> Self:
 
         Raises
         ------
-        ValueError if the number of provided names does not match the
-        number of columns in the table.
+        ValueError
+            If the number of provided names does not match the
+            number of columns in the table.
         """
-        # TODO: strict=True when we drop py39
         if table.num_columns() != len(names):
             raise ValueError("Mismatching name and table length.")
-        return cls([Column(c, name) for c, name in zip(table.columns(), names)], [])
+        return cls(
+            # TODO: strict=True when we drop py39
+            [NamedColumn(c, name) for c, name in zip(table.columns(), names)]
+        )
 
     def sorted_like(
         self, like: DataFrame, /, *, subset: Set[str] | None = None
@@ -132,18 +131,20 @@ def sorted_like(
 
         Raises
         ------
-        ValueError if there is a name mismatch between self and like.
+        ValueError
+            If there is a name mismatch between self and like.
         """
         if like.column_names != self.column_names:
             raise ValueError("Can only copy from identically named frame")
         subset = self.column_names_set if subset is None else subset
         self.columns = [
             c.sorted_like(other) if c.name in subset else c
+            # TODO: strict=True when we drop py39
             for c, other in zip(self.columns, like.columns)
         ]
         return self
 
-    def with_columns(self, columns: Sequence[Column]) -> Self:
+    def with_columns(self, columns: Sequence[NamedColumn]) -> Self:
         """
         Return a new dataframe with extra columns.
 
@@ -160,35 +161,31 @@ def with_columns(self, columns: Sequence[Column]) -> Self:
         -----
         If column names overlap, newer names replace older ones.
         """
-        return type(self)([*self.columns, *columns], self.scalars)
+        return type(self)([*self.columns, *columns])
 
     def discard_columns(self, names: Set[str]) -> Self:
         """Drop columns by name."""
-        return type(self)(
-            [c for c in self.columns if c.name not in names], self.scalars
-        )
+        return type(self)([c for c in self.columns if c.name not in names])
 
     def select(self, names: Sequence[str]) -> Self:
         """Select columns by name returning DataFrame."""
         want = set(names)
         if not want.issubset(self.column_names_set):
             raise ValueError("Can't select missing names")
-        return type(self)([self._column_map[name] for name in names], self.scalars)
+        return type(self)([self._column_map[name] for name in names])
 
-    def replace_columns(self, *columns: Column) -> Self:
+    def replace_columns(self, *columns: NamedColumn) -> Self:
         """Return a new dataframe with columns replaced by name."""
         new = {c.name: c for c in columns}
         if not set(new).issubset(self.column_names_set):
             raise ValueError("Cannot replace with non-existing names")
-        return type(self)([new.get(c.name, c) for c in self.columns], self.scalars)
+        return type(self)([new.get(c.name, c) for c in self.columns])
 
     def rename_columns(self, mapping: Mapping[str, str]) -> Self:
         """Rename some columns."""
-        return type(self)(
-            [c.copy(new_name=mapping.get(c.name)) for c in self.columns], self.scalars
-        )
+        return type(self)([c.copy(new_name=mapping.get(c.name)) for c in self.columns])
 
-    def select_columns(self, names: Set[str]) -> list[Column]:
+    def select_columns(self, names: Set[str]) -> list[NamedColumn]:
         """Select columns by name."""
         return [c for c in self.columns if c.name in names]
 
diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py
index 7187a36f21c..c7c11cf6c68 100644
--- a/python/cudf_polars/cudf_polars/dsl/expr.py
+++ b/python/cudf_polars/cudf_polars/dsl/expr.py
@@ -26,11 +26,11 @@
 
 import cudf._lib.pylibcudf as plc
 
-from cudf_polars.containers import Column, Scalar
+from cudf_polars.containers import Column, NamedColumn, Scalar
 from cudf_polars.utils import sorting
 
 if TYPE_CHECKING:
-    from collections.abc import Sequence
+    from collections.abc import Mapping, Sequence
 
     import polars.type_aliases as pl_types
 
@@ -110,7 +110,7 @@ def get_hash(self) -> int:
         """
         return hash((type(self), self._ctor_arguments(self.children)))
 
-    def __hash__(self):
+    def __hash__(self) -> int:
         """Hash of an expression with caching."""
         try:
             return self._hash_value
@@ -139,18 +139,18 @@ def is_equal(self, other: Any) -> bool:
             other.children
         )
 
-    def __eq__(self, other):
+    def __eq__(self, other) -> bool:
         """Equality of expressions."""
         if type(self) != type(other) or hash(self) != hash(other):
             return False
         else:
             return self.is_equal(other)
 
-    def __ne__(self, other):
+    def __ne__(self, other) -> bool:
         """Inequality of expressions."""
         return not self.__eq__(other)
 
-    def __repr__(self):
+    def __repr__(self) -> str:
         """String representation of an expression with caching."""
         try:
             return self._repr_value
@@ -164,7 +164,7 @@ def do_evaluate(
         df: DataFrame,
         *,
         context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: dict[Expr, Column] | None = None,
+        mapping: Mapping[Expr, Column] | None = None,
     ) -> Column:  # TODO: return type is a lie for Literal
         """
         Evaluate this expression given a dataframe for context.
@@ -185,15 +185,6 @@ def do_evaluate(
         Do not call this function directly, but rather
         :meth:`evaluate` which handles the mapping lookups.
 
-        The typed return value of :class:`Column` is not true when
-        evaluating :class:`Literal` nodes (which instead produce
-        :class:`Scalar` objects). However, these duck-type to having a
-        pylibcudf container object inside them, and usually they end
-        up appearing in binary expressions which pylibcudf handles
-        appropriately since there are overloads for (column, scalar)
-        pairs. We don't have to handle (scalar, scalar) in binops
-        since the polars optimizer has a constant-folding pass.
-
         Returns
         -------
         Column representing the evaluation of the expression (or maybe
@@ -201,9 +192,10 @@ def do_evaluate(
 
         Raises
         ------
-        NotImplementedError if we couldn't evaluate the expression.
-        Ideally all these are returned during translation to the IR,
-        but for now we are not perfect.
+        NotImplementedError
+            If we couldn't evaluate the expression. Ideally all these
+            are returned during translation to the IR, but for now we
+            are not perfect.
         """
         raise NotImplementedError(f"Evaluation of {type(self).__name__}")
 
@@ -212,7 +204,7 @@ def evaluate(
         df: DataFrame,
         *,
         context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: dict[Expr, Column] | None = None,
+        mapping: Mapping[Expr, Column] | None = None,
     ) -> Column:  # TODO: return type is a lie for Literal
         """
         Evaluate this expression given a dataframe for context.
@@ -234,16 +226,26 @@ def evaluate(
         this method provides logic to handle lookups in the
         substitution mapping.
 
+        The typed return value of :class:`Column` is not true when
+        evaluating :class:`Literal` nodes (which instead produce
+        :class:`Scalar` objects). However, these duck-type to having a
+        pylibcudf container object inside them, and usually they end
+        up appearing in binary expressions which pylibcudf handles
+        appropriately since there are overloads for (column, scalar)
+        pairs. We don't have to handle (scalar, scalar) in binops
+        since the polars optimizer has a constant-folding pass.
+
         Returns
         -------
         Column representing the evaluation of the expression (or maybe
-        a scalar, annoying!).
+        a scalar).
 
         Raises
         ------
-        NotImplementedError if we couldn't evaluate the expression.
-        Ideally all these are returned during translation to the IR,
-        but for now we are not perfect.
+        NotImplementedError
+            If we couldn't evaluate the expression. Ideally all these
+            are returned during translation to the IR, but for now we
+            are not perfect.
         """
         if mapping is None:
             return self.do_evaluate(df, context=context, mapping=mapping)
@@ -269,41 +271,74 @@ def collect_agg(self, *, depth: int) -> AggInfo:
 
         Raises
         ------
-        NotImplementedError if we can't currently perform the
-        aggregation request (for example nested aggregations like
-        ``a.max().min()``).
+        NotImplementedError
+            If we can't currently perform the aggregation request, for
+            example nested aggregations like ``a.max().min()``.
         """
         raise NotImplementedError(
             f"Collecting aggregation info for {type(self).__name__}"
         )
 
 
-class NamedExpr(Expr):
-    __slots__ = ("name", "children")
-    _non_child = ("dtype", "name")
+class NamedExpr:
+    # NamedExpr does not inherit from Expr since it does not appear
+    # when evaluating expressions themselves, only when constructing
+    # named return values in dataframe (IR) nodes.
+    __slots__ = ("name", "value")
 
-    def __init__(self, dtype: plc.DataType, name: str, value: Expr) -> None:
-        super().__init__(dtype)
+    def __init__(self, name: str, value: Expr) -> None:
         self.name = name
-        self.children = (value,)
+        self.value = value
+
+    def __hash__(self) -> int:
+        """Hash of the expression."""
+        return hash((type(self), self.name, self.value))
+
+    def __repr__(self) -> str:
+        """Repr of the expression."""
+        return f"NamedExpr({self.name}, {self.value}"
+
+    def __eq__(self, other) -> bool:
+        """Equality of two expressions."""
+        return (
+            type(self) is type(other)
+            and self.name == other.name
+            and self.value == other.value
+        )
 
-    def do_evaluate(
+    def __ne__(self, other) -> bool:
+        """Inequality of expressions."""
+        return not self.__eq__(other)
+
+    def evaluate(
         self,
         df: DataFrame,
         *,
         context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: dict[Expr, Column] | None = None,
-    ) -> Column:
+        mapping: Mapping[Expr, Column] | None = None,
+    ) -> NamedColumn:
         """Evaluate this expression given a dataframe for context."""
-        (child,) = self.children
-        return Column(
-            child.evaluate(df, context=context, mapping=mapping).obj, self.name
-        )
+        obj = self.value.evaluate(df, context=context, mapping=mapping)
+        if isinstance(obj, Scalar):
+            return NamedColumn(
+                plc.Column.from_scalar(obj.obj, 1),
+                self.name,
+                is_sorted=plc.types.Sorted.YES,
+                order=plc.types.Order.ASCENDING,
+                null_order=plc.types.NullOrder.BEFORE,
+            )
+        else:
+            return NamedColumn(
+                obj.obj,
+                self.name,
+                is_sorted=obj.is_sorted,
+                order=obj.order,
+                null_order=obj.null_order,
+            )
 
     def collect_agg(self, *, depth: int) -> AggInfo:
         """Collect information about aggregations in groupbys."""
-        (value,) = self.children
-        return value.collect_agg(depth=depth)
+        return self.value.collect_agg(depth=depth)
 
 
 class Literal(Expr):
@@ -311,21 +346,21 @@ class Literal(Expr):
     _non_child = ("dtype", "value")
     value: pa.Scalar
 
-    def __init__(self, dtype: plc.DataType, value: Any) -> None:
+    def __init__(self, dtype: plc.DataType, value: pa.Scalar) -> None:
         super().__init__(dtype)
-        self.value = pa.scalar(value)
+        assert value.type == plc.interop.to_arrow(dtype)
+        self.value = value
 
     def do_evaluate(
         self,
         df: DataFrame,
         *,
         context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: dict[Expr, Column] | None = None,
+        mapping: Mapping[Expr, Column] | None = None,
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
-        # TODO: obey dtype
-        obj = plc.interop.from_arrow(self.value)
-        return Scalar(obj)  # type: ignore
+        # datatype of pyarrow scalar is correct by construction.
+        return Scalar(plc.interop.from_arrow(self.value))  # type: ignore
 
 
 class Col(Expr):
@@ -342,7 +377,7 @@ def do_evaluate(
         df: DataFrame,
         *,
         context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: dict[Expr, Column] | None = None,
+        mapping: Mapping[Expr, Column] | None = None,
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
         return df._column_map[self.name]
@@ -358,7 +393,7 @@ def do_evaluate(
         df: DataFrame,
         *,
         context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: dict[Expr, Column] | None = None,
+        mapping: Mapping[Expr, Column] | None = None,
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
         # TODO: type is wrong, and dtype
@@ -415,8 +450,7 @@ def _distinct(
                 [source_value],
                 indices,
                 plc.Table([plc.Column.from_scalar(target_value, table.num_rows())]),
-            ).columns()[0],
-            column.name,
+            ).columns()[0]
         )
 
     _BETWEEN_OPS: ClassVar[
@@ -448,7 +482,7 @@ def do_evaluate(
         df: DataFrame,
         *,
         context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: dict[Expr, Column] | None = None,
+        mapping: Mapping[Expr, Column] | None = None,
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
         columns = [
@@ -467,18 +501,18 @@ def do_evaluate(
             )
         if self.name == pl_expr.BooleanFunction.IsNull:
             (column,) = columns
-            return Column(plc.unary.is_null(column.obj), column.name)
+            return Column(plc.unary.is_null(column.obj))
         elif self.name == pl_expr.BooleanFunction.IsNotNull:
             (column,) = columns
-            return Column(plc.unary.is_valid(column.obj), column.name)
+            return Column(plc.unary.is_valid(column.obj))
         elif self.name == pl_expr.BooleanFunction.IsNan:
             # TODO: copy over null mask since is_nan(null) => null in polars
             (column,) = columns
-            return Column(plc.unary.is_nan(column.obj), column.name)
+            return Column(plc.unary.is_nan(column.obj))
         elif self.name == pl_expr.BooleanFunction.IsNotNan:
             # TODO: copy over null mask since is_not_nan(null) => null in polars
             (column,) = columns
-            return Column(plc.unary.is_not_nan(column.obj), column.name)
+            return Column(plc.unary.is_not_nan(column.obj))
         elif self.name == pl_expr.BooleanFunction.IsFirstDistinct:
             (column,) = columns
             return self._distinct(
@@ -528,7 +562,6 @@ def do_evaluate(
                 ),
             )
         elif self.name == pl_expr.BooleanFunction.AllHorizontal:
-            name = columns[0].name
             if any(c.obj.null_count() > 0 for c in columns):
                 raise NotImplementedError("Kleene logic for all_horizontal")
             return Column(
@@ -539,11 +572,9 @@ def do_evaluate(
                         output_type=self.dtype,
                     ),
                     (c.obj for c in columns),
-                ),
-                name,
+                )
             )
         elif self.name == pl_expr.BooleanFunction.AnyHorizontal:
-            name = columns[0].name
             if any(c.obj.null_count() > 0 for c in columns):
                 raise NotImplementedError("Kleene logic for any_horizontal")
             return Column(
@@ -554,8 +585,7 @@ def do_evaluate(
                         output_type=self.dtype,
                     ),
                     (c.obj for c in columns),
-                ),
-                name,
+                )
             )
         elif self.name == pl_expr.BooleanFunction.IsBetween:
             column, lo, hi = columns
@@ -571,8 +601,7 @@ def do_evaluate(
                     ),
                     plc.binaryop.BinaryOperator.LOGICAL_AND,
                     self.dtype,
-                ),
-                column.name,
+                )
             )
         else:
             raise NotImplementedError(f"BooleanFunction {self.name}")
@@ -606,7 +635,7 @@ def do_evaluate(
         df: DataFrame,
         *,
         context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: dict[Expr, Column] | None = None,
+        mapping: Mapping[Expr, Column] | None = None,
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
         columns = [
@@ -615,20 +644,16 @@ def do_evaluate(
         ]
         if self.name == pl_expr.StringFunction.Lowercase:
             (column,) = columns
-            return Column(plc.strings.case.to_lower(column.obj), column.name)
+            return Column(plc.strings.case.to_lower(column.obj))
         elif self.name == pl_expr.StringFunction.Uppercase:
             (column,) = columns
-            return Column(plc.strings.case.to_upper(column.obj), column.name)
+            return Column(plc.strings.case.to_upper(column.obj))
         elif self.name == pl_expr.StringFunction.EndsWith:
             column, suffix = columns
-            return Column(
-                plc.strings.find.ends_with(column.obj, suffix.obj), column.name
-            )
+            return Column(plc.strings.find.ends_with(column.obj, suffix.obj))
         elif self.name == pl_expr.StringFunction.StartsWith:
             column, suffix = columns
-            return Column(
-                plc.strings.find.starts_with(column.obj, suffix.obj), column.name
-            )
+            return Column(plc.strings.find.starts_with(column.obj, suffix.obj))
         else:
             raise NotImplementedError(f"StringFunction {self.name}")
 
@@ -649,19 +674,22 @@ def do_evaluate(
         df: DataFrame,
         *,
         context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: dict[Expr, Column] | None = None,
+        mapping: Mapping[Expr, Column] | None = None,
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
         (child,) = self.children
         column = child.evaluate(df, context=context, mapping=mapping)
         (stable, nulls_last, descending) = self.options
         order, null_order = sorting.sort_order(
-            [descending], nulls_last=nulls_last, num_keys=1
+            [descending], nulls_last=[nulls_last], num_keys=1
         )
         do_sort = plc.sorting.stable_sort if stable else plc.sorting.sort
         table = do_sort(plc.Table([column.obj]), order, null_order)
-        return Column(table.columns()[0], column.name).set_sorted(
-            is_sorted=plc.types.Sorted.YES, order=order[0], null_order=null_order[0]
+        return Column(
+            table.columns()[0],
+            is_sorted=plc.types.Sorted.YES,
+            order=order[0],
+            null_order=null_order[0],
         )
 
 
@@ -672,7 +700,7 @@ class SortBy(Expr):
     def __init__(
         self,
         dtype: plc.DataType,
-        options: tuple[bool, bool, tuple[bool]],
+        options: tuple[bool, tuple[bool], tuple[bool]],
         column: Expr,
         *by: Expr,
     ):
@@ -685,7 +713,7 @@ def do_evaluate(
         df: DataFrame,
         *,
         context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: dict[Expr, Column] | None = None,
+        mapping: Mapping[Expr, Column] | None = None,
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
         column, *by = (
@@ -700,7 +728,7 @@ def do_evaluate(
         table = do_sort(
             plc.Table([column.obj]), plc.Table([c.obj for c in by]), order, null_order
         )
-        return Column(table.columns()[0], column.name)
+        return Column(table.columns()[0])
 
 
 class Gather(Expr):
@@ -716,7 +744,7 @@ def do_evaluate(
         df: DataFrame,
         *,
         context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: dict[Expr, Column] | None = None,
+        mapping: Mapping[Expr, Column] | None = None,
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
         values, indices = (
@@ -741,7 +769,7 @@ def do_evaluate(
             bounds_policy = plc.copying.OutOfBoundsPolicy.DONT_CHECK
             obj = indices.obj
         table = plc.copying.gather(plc.Table([values.obj]), obj, bounds_policy)
-        return Column(table.columns()[0], values.name)
+        return Column(table.columns()[0])
 
 
 class Filter(Expr):
@@ -757,7 +785,7 @@ def do_evaluate(
         df: DataFrame,
         *,
         context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: dict[Expr, Column] | None = None,
+        mapping: Mapping[Expr, Column] | None = None,
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
         values, mask = (
@@ -767,7 +795,7 @@ def do_evaluate(
         table = plc.stream_compaction.apply_boolean_mask(
             plc.Table([values.obj]), mask.obj
         )
-        return Column(table.columns()[0], values.name).sorted_like(values)
+        return Column(table.columns()[0]).sorted_like(values)
 
 
 class RollingWindow(Expr):
@@ -803,14 +831,12 @@ def do_evaluate(
         df: DataFrame,
         *,
         context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: dict[Expr, Column] | None = None,
+        mapping: Mapping[Expr, Column] | None = None,
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
         (child,) = self.children
         column = child.evaluate(df, context=context, mapping=mapping)
-        return Column(plc.unary.cast(column.obj, self.dtype), column.name).sorted_like(
-            column
-        )
+        return Column(plc.unary.cast(column.obj, self.dtype)).sorted_like(column)
 
     def collect_agg(self, *, depth: int) -> AggInfo:
         """Collect information about aggregations in groupbys."""
@@ -907,7 +933,9 @@ def _reduce(
                 plc.reduce.reduce(column.obj, request, self.dtype),
                 1,
             ),
-            column.name,
+            is_sorted=plc.types.Sorted.YES,
+            order=plc.types.Order.ASCENDING,
+            null_order=plc.types.NullOrder.BEFORE,
         )
 
     def _count(self, column: Column) -> Column:
@@ -921,7 +949,9 @@ def _count(self, column: Column) -> Column:
                 ),
                 1,
             ),
-            column.name,
+            is_sorted=plc.types.Sorted.YES,
+            order=plc.types.Order.ASCENDING,
+            null_order=plc.types.NullOrder.BEFORE,
         )
 
     def _min(self, column: Column, *, propagate_nans: bool) -> Column:
@@ -933,7 +963,9 @@ def _min(self, column: Column, *, propagate_nans: bool) -> Column:
                     ),
                     1,
                 ),
-                column.name,
+                is_sorted=plc.types.Sorted.YES,
+                order=plc.types.Order.ASCENDING,
+                null_order=plc.types.NullOrder.BEFORE,
             )
         if column.nan_count > 0:
             column = column.mask_nans()
@@ -948,25 +980,37 @@ def _max(self, column: Column, *, propagate_nans: bool) -> Column:
                     ),
                     1,
                 ),
-                column.name,
+                is_sorted=plc.types.Sorted.YES,
+                order=plc.types.Order.ASCENDING,
+                null_order=plc.types.NullOrder.BEFORE,
             )
         if column.nan_count > 0:
             column = column.mask_nans()
         return self._reduce(column, request=plc.aggregation.max())
 
     def _first(self, column: Column) -> Column:
-        return Column(plc.copying.slice(column.obj, [0, 1])[0], column.name)
+        return Column(
+            plc.copying.slice(column.obj, [0, 1])[0],
+            is_sorted=plc.types.Sorted.YES,
+            order=plc.types.Order.ASCENDING,
+            null_order=plc.types.NullOrder.BEFORE,
+        )
 
     def _last(self, column: Column) -> Column:
         n = column.obj.size()
-        return Column(plc.copying.slice(column.obj, [n - 1, n])[0], column.name)
+        return Column(
+            plc.copying.slice(column.obj, [n - 1, n])[0],
+            is_sorted=plc.types.Sorted.YES,
+            order=plc.types.Order.ASCENDING,
+            null_order=plc.types.NullOrder.BEFORE,
+        )
 
     def do_evaluate(
         self,
         df,
         *,
         context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: dict[Expr, Column] | None = None,
+        mapping: Mapping[Expr, Column] | None = None,
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
         if context is not ExecutionContext.FRAME:
@@ -1018,7 +1062,7 @@ def do_evaluate(
         df: DataFrame,
         *,
         context: ExecutionContext = ExecutionContext.FRAME,
-        mapping: dict[Expr, Column] | None = None,
+        mapping: Mapping[Expr, Column] | None = None,
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
         left, right = (
@@ -1027,7 +1071,6 @@ def do_evaluate(
         )
         return Column(
             plc.binaryop.binary_operation(left.obj, right.obj, self.op, self.dtype),
-            "what",
         )
 
     def collect_agg(self, *, depth: int) -> AggInfo:
diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py
index f8441b793b5..0a72cbd9f83 100644
--- a/python/cudf_polars/cudf_polars/dsl/ir.py
+++ b/python/cudf_polars/cudf_polars/dsl/ir.py
@@ -30,7 +30,7 @@
 import cudf._lib.pylibcudf as plc
 
 import cudf_polars.dsl.expr as expr
-from cudf_polars.containers import Column, DataFrame
+from cudf_polars.containers import DataFrame, NamedColumn
 from cudf_polars.utils import sorting
 
 if TYPE_CHECKING:
@@ -59,6 +59,38 @@
 ]
 
 
+def broadcast(
+    *columns: NamedColumn, target_length: int | None = None
+) -> list[NamedColumn]:
+    lengths = {column.obj.size() for column in columns}
+    if len(lengths - {1}) > 1:
+        raise RuntimeError("Mismatching column lengths")
+    if lengths == {1}:
+        if target_length is None:
+            return list(columns)
+        nrows = target_length
+    elif len(lengths) == 1:
+        if target_length is not None:
+            assert target_length in lengths
+        return list(columns)
+    else:
+        (nrows,) = lengths - {1}
+        if target_length is not None:
+            assert target_length == nrows
+    return [
+        column
+        if column.obj.size() != 1
+        else NamedColumn(
+            plc.Column.from_scalar(plc.copying.get_element(column.obj, 0), nrows),
+            column.name,
+            is_sorted=plc.types.Sorted.YES,
+            order=plc.types.Order.ASCENDING,
+            null_order=plc.types.NullOrder.BEFORE,
+        )
+        for column in columns
+    ]
+
+
 @dataclass(slots=True)
 class IR:
     """Abstract plan node, representing an unevaluated dataframe."""
@@ -83,9 +115,10 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
 
         Raises
         ------
-        NotImplementedError if we couldn't evaluate things. Ideally
-        this should not occur, since the translation phase should pick
-        up things that we cannot handle.
+        NotImplementedError
+            If we couldn't evaluate things. Ideally this should not occur,
+            since the translation phase should pick up things that we
+            cannot handle.
         """
         raise NotImplementedError
 
@@ -96,7 +129,7 @@ class PythonScan(IR):
 
     options: Any
     """Arbitrary options."""
-    predicate: expr.Expr | None
+    predicate: expr.NamedExpr | None
     """Filter to apply to the constructed dataframe before returning it."""
 
 
@@ -117,7 +150,7 @@ class Scan(IR):
     - ``row_index: tuple[name, offset] | None``: Add an integer index
         column with given name.
     """
-    predicate: expr.Expr | None
+    predicate: expr.NamedExpr | None
     """Mask to apply to the read dataframe."""
 
     def __post_init__(self):
@@ -153,14 +186,14 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
             init = plc.interop.from_arrow(
                 pa.scalar(offset, type=plc.interop.to_arrow(dtype))
             )
-            index = Column(
-                plc.filling.sequence(df.num_rows, init, step), name
-            ).set_sorted(
+            index = NamedColumn(
+                plc.filling.sequence(df.num_rows, init, step),
+                name,
                 is_sorted=plc.types.Sorted.YES,
                 order=plc.types.Order.ASCENDING,
                 null_order=plc.types.NullOrder.AFTER,
             )
-            df = DataFrame([index, *df.columns], [])
+            df = DataFrame([index, *df.columns])
         # TODO: should be true, but not the case until we get
         # cudf-classic out of the loop for IO since it converts date32
         # to datetime.
@@ -171,7 +204,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
         if self.predicate is None:
             return df
         else:
-            mask = self.predicate.evaluate(df)
+            (mask,) = broadcast(self.predicate.evaluate(df), target_length=df.num_rows)
             return df.filter(mask)
 
 
@@ -208,7 +241,7 @@ class DataFrameScan(IR):
     """Polars LazyFrame object."""
     projection: list[str]
     """List of columns to project out."""
-    predicate: expr.Expr | None
+    predicate: expr.NamedExpr | None
     """Mask to apply."""
 
     def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
@@ -231,7 +264,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
             c.obj.type() == dtype for c, dtype in zip(df.columns, self.schema.values())
         )
         if self.predicate is not None:
-            mask = self.predicate.evaluate(df)
+            (mask,) = broadcast(self.predicate.evaluate(df), target_length=df.num_rows)
             return df.filter(mask)
         else:
             return df
@@ -243,20 +276,15 @@ class Select(IR):
 
     df: IR
     """Input dataframe."""
-    cse: list[expr.Expr]
-    """
-    List of common subexpressions that will appear in the selected expressions.
-
-    These must be evaluated before the returned expressions.
-    """
-    expr: list[expr.Expr]
+    expr: list[expr.NamedExpr]
     """List of expressions to evaluate to form the new dataframe."""
 
     def evaluate(self, *, cache: dict[int, DataFrame]):
         """Evaluate and return a dataframe."""
         df = self.df.evaluate(cache=cache)
-        df = df.with_columns([e.evaluate(df) for e in self.cse])
-        return DataFrame([e.evaluate(df) for e in self.expr], [])
+        # Handle any broadcasting
+        columns = broadcast(*(e.evaluate(df) for e in self.expr))
+        return DataFrame(columns)
 
 
 @dataclass(slots=True)
@@ -269,13 +297,15 @@ class Reduce(IR):
 
     df: IR
     """Input dataframe."""
-    expr: list[expr.Expr]
+    expr: list[expr.NamedExpr]
     """List of expressions to evaluate to form the new dataframe."""
 
     def evaluate(self, *, cache: dict[int, DataFrame]):
         """Evaluate and return a dataframe."""
         df = self.df.evaluate(cache=cache)
-        return DataFrame([e.evaluate(df) for e in self.expr], [])
+        columns = broadcast(*(e.evaluate(df) for e in self.expr))
+        assert all(column.obj.size() == 1 for column in columns)
+        return DataFrame(columns)
 
 
 def placeholder_column(n: int):
@@ -314,9 +344,9 @@ class GroupBy(IR):
 
     df: IR
     """Input dataframe."""
-    agg_requests: list[expr.Expr]
+    agg_requests: list[expr.NamedExpr]
     """List of expressions to evaluate groupwise."""
-    keys: list[expr.Expr]
+    keys: list[expr.NamedExpr]
     """List of expressions forming the keys."""
     maintain_order: bool
     """Should the order of the input dataframe be maintained?"""
@@ -339,9 +369,10 @@ def check_agg(agg: expr.Expr) -> int:
 
         Raises
         ------
-        NotImplementedError for unsupported expression nodes.
+        NotImplementedError
+            For unsupported expression nodes.
         """
-        if isinstance(agg, (expr.NamedExpr, expr.BinOp, expr.Cast)):
+        if isinstance(agg, (expr.BinOp, expr.Cast)):
             return max(GroupBy.check_agg(child) for child in agg.children)
         elif isinstance(agg, expr.Agg):
             if agg.name == "implode":
@@ -358,14 +389,16 @@ def __post_init__(self):
             raise NotImplementedError("Maintaining order in groupby")
         if self.options.rolling:
             raise NotImplementedError("rolling window/groupby")
-        if any(GroupBy.check_agg(a) > 1 for a in self.agg_requests):
+        if any(GroupBy.check_agg(a.value) > 1 for a in self.agg_requests):
             raise NotImplementedError("Nested aggregations in groupby")
         self.agg_infos = [req.collect_agg(depth=0) for req in self.agg_requests]
 
     def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
         df = self.df.evaluate(cache=cache)
-        keys = [k.evaluate(df) for k in self.keys]
+        keys = broadcast(
+            *(k.evaluate(df) for k in self.keys), target_length=df.num_rows
+        )
         # TODO: use sorted information, need to expose column_order
         # and null_precedence in pylibcudf groupby constructor
         # sorted = (
@@ -379,7 +412,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
         )
         # TODO: uniquify
         requests = []
-        replacements = []
+        replacements: list[expr.Expr] = []
         for info in self.agg_infos:
             for pre_eval, req, rep in info.requests:
                 if pre_eval is None:
@@ -389,17 +422,20 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
                 requests.append(plc.groupby.GroupByRequest(col, [req]))
                 replacements.append(rep)
         group_keys, raw_tables = grouper.aggregate(requests)
-        raw_columns = []
+        # TODO: names
+        raw_columns: list[NamedColumn] = []
         for i, table in enumerate(raw_tables):
             (column,) = table.columns()
-            raw_columns.append(Column(column, f"column{i}"))
+            raw_columns.append(NamedColumn(column, f"tmp{i}"))
         mapping = dict(zip(replacements, raw_columns))
-        result_keys = [Column(gk, k.name) for gk, k in zip(group_keys.columns(), keys)]
-        result_subs = DataFrame(raw_columns, [])
+        result_keys = [
+            NamedColumn(gk, k.name) for gk, k in zip(group_keys.columns(), keys)
+        ]
+        result_subs = DataFrame(raw_columns)
         results = [
             req.evaluate(result_subs, mapping=mapping) for req in self.agg_requests
         ]
-        return DataFrame([*result_keys, *results], []).slice(self.options.slice)
+        return DataFrame([*result_keys, *results]).slice(self.options.slice)
 
 
 @dataclass(slots=True)
@@ -410,9 +446,9 @@ class Join(IR):
     """Left frame."""
     right: IR
     """Right frame."""
-    left_on: list[expr.Expr]
+    left_on: list[expr.NamedExpr]
     """List of expressions used as keys in the left frame."""
-    right_on: list[expr.Expr]
+    right_on: list[expr.NamedExpr]
     """List of expressions used as keys in the right frame."""
     options: tuple[
         Literal["inner", "left", "full", "leftsemi", "leftanti"],
@@ -479,8 +515,17 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
         left = self.left.evaluate(cache=cache)
         right = self.right.evaluate(cache=cache)
-        left_on = DataFrame([e.evaluate(left) for e in self.left_on], [])
-        right_on = DataFrame([e.evaluate(right) for e in self.right_on], [])
+        left_on = DataFrame(
+            broadcast(
+                *(e.evaluate(left) for e in self.left_on), target_length=left.num_rows
+            )
+        )
+        right_on = DataFrame(
+            broadcast(
+                *(e.evaluate(right) for e in self.right_on),
+                target_length=right.num_rows,
+            )
+        )
         how, join_nulls, zlice, suffix, coalesce = self.options
         null_equality = (
             plc.types.NullEquality.EQUAL
@@ -510,7 +555,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
             if coalesce and how != "inner":
                 left = left.replace_columns(
                     *(
-                        Column(
+                        NamedColumn(
                             plc.replace.replace_nulls(left_col.obj, right_col.obj),
                             left_col.name,
                         )
@@ -538,20 +583,18 @@ class HStack(IR):
 
     df: IR
     """Input dataframe."""
-    cse: list[expr.Expr]
-    """
-    List of common subexpressions that will appear in the selected expressions.
-
-    These must be evaluated before the returned expressions.
-    """
-    columns: list[expr.Expr]
+    columns: list[expr.NamedExpr]
     """List of expressions to produce new columns."""
 
     def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
         df = self.df.evaluate(cache=cache)
-        ctx = df.copy().with_columns([e.evaluate(df) for e in self.cse])
-        return df.with_columns([c.evaluate(ctx) for c in self.columns])
+        columns = [c.evaluate(df) for c in self.columns]
+        # TODO: a bit of a hack, should inherit the should_broadcast
+        # property of polars' ProjectionOptions on the hstack node.
+        if not any(e.name.startswith("__POLARS_CSER_0x") for e in self.columns):
+            columns = broadcast(*columns, target_length=df.num_rows)
+        return df.with_columns(columns)
 
 
 @dataclass(slots=True)
@@ -614,7 +657,10 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
                 plc.types.NanEquality.ALL_EQUAL,
             )
         result = DataFrame(
-            [Column(c, old.name) for c, old in zip(table.columns(), df.columns)], []
+            [
+                NamedColumn(c, old.name).sorted_like(old)
+                for c, old in zip(table.columns(), df.columns)
+            ]
         )
         if keys_sorted or self.stable:
             result = result.sorted_like(df)
@@ -627,7 +673,7 @@ class Sort(IR):
 
     df: IR
     """Input."""
-    by: list[expr.Expr]
+    by: list[expr.NamedExpr]
     """List of expressions to produce sort keys."""
     do_sort: Callable[..., plc.Table]
     """pylibcudf sorting function."""
@@ -642,7 +688,7 @@ def __init__(
         self,
         schema: dict,
         df: IR,
-        by: list[expr.Expr],
+        by: list[expr.NamedExpr],
         options: Any,
         zlice: tuple[int, int] | None,
     ):
@@ -661,7 +707,9 @@ def __init__(
     def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
         df = self.df.evaluate(cache=cache)
-        sort_keys = [k.evaluate(df) for k in self.by]
+        sort_keys = broadcast(
+            *(k.evaluate(df) for k in self.by), target_length=df.num_rows
+        )
         names = {c.name: i for i, c in enumerate(df.columns)}
         # TODO: More robust identification here.
         keys_in_result = [
@@ -675,7 +723,9 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
             self.order,
             self.null_order,
         )
-        columns = [Column(c, old.name) for c, old in zip(table.columns(), df.columns)]
+        columns = [
+            NamedColumn(c, old.name) for c, old in zip(table.columns(), df.columns)
+        ]
         # If a sort key is in the result table, set the sortedness property
         for k, i in enumerate(keys_in_result):
             columns[i] = columns[i].set_sorted(
@@ -683,7 +733,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
                 order=self.order[k],
                 null_order=self.null_order[k],
             )
-        return DataFrame(columns, []).slice(self.zlice)
+        return DataFrame(columns).slice(self.zlice)
 
 
 @dataclass(slots=True)
@@ -709,13 +759,14 @@ class Filter(IR):
 
     df: IR
     """Input."""
-    mask: expr.Expr
+    mask: expr.NamedExpr
     """Expression evaluating to a mask."""
 
     def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
         df = self.df.evaluate(cache=cache)
-        return df.filter(self.mask.evaluate(df))
+        (mask,) = broadcast(self.mask.evaluate(df), target_length=df.num_rows)
+        return df.filter(mask)
 
 
 @dataclass(slots=True)
@@ -729,7 +780,10 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
         df = self.df.evaluate(cache=cache)
         # This can reorder things.
-        return df.select(list(self.schema.keys()))
+        columns = broadcast(
+            *df.select(list(self.schema.keys())).columns, target_length=df.num_rows
+        )
+        return DataFrame(columns)
 
 
 @dataclass(slots=True)
@@ -856,10 +910,8 @@ class HConcat(IR):
     def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
         dfs = [df.evaluate(cache=cache) for df in self.dfs]
-        columns, scalars = zip(*((df.columns, df.scalars) for df in dfs))
         return DataFrame(
-            list(itertools.chain.from_iterable(columns)),
-            list(itertools.chain.from_iterable(scalars)),
+            list(itertools.chain.from_iterable(df.columns for df in dfs)),
         )
 
 
diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py
index 9a301164beb..641176daff4 100644
--- a/python/cudf_polars/cudf_polars/dsl/translate.py
+++ b/python/cudf_polars/cudf_polars/dsl/translate.py
@@ -18,11 +18,25 @@
 from cudf_polars.dsl import expr, ir
 from cudf_polars.utils import dtypes
 
-__all__ = ["translate_ir", "translate_expr"]
+__all__ = ["translate_ir", "translate_named_expr"]
 
 
 class set_node(AbstractContextManager):
-    """Run a block with current node set in the visitor."""
+    """
+    Run a block with current node set in the visitor.
+
+    Parameters
+    ----------
+    visitor
+        The internal Rust visitor object
+    n
+        The node to set as the current root.
+
+    Notes
+    -----
+    This is useful for translating expressions with a given node
+    active, restoring the node when the block exits.
+    """
 
     __slots__ = ("n", "visitor")
 
@@ -52,7 +66,7 @@ def _(node: pl_ir.PythonScan, visitor: Any, schema: dict[str, plc.DataType]) ->
     return ir.PythonScan(
         schema,
         node.options,
-        translate_expr(visitor, n=node.predicate)
+        translate_named_expr(visitor, n=node.predicate)
         if node.predicate is not None
         else None,
     )
@@ -65,7 +79,7 @@ def _(node: pl_ir.Scan, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR:
         node.scan_type,
         node.paths,
         node.file_options,
-        translate_expr(visitor, n=node.predicate)
+        translate_named_expr(visitor, n=node.predicate)
         if node.predicate is not None
         else None,
     )
@@ -84,7 +98,7 @@ def _(
         schema,
         node.df,
         node.projection,
-        translate_expr(visitor, n=node.selection)
+        translate_named_expr(visitor, n=node.selection)
         if node.selection is not None
         else None,
     )
@@ -94,17 +108,16 @@ def _(
 def _(node: pl_ir.Select, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR:
     with set_node(visitor, node.input):
         inp = translate_ir(visitor, n=None)
-    cse_exprs = [translate_expr(visitor, n=e) for e in node.cse_expr]
-    exprs = [translate_expr(visitor, n=e) for e in node.expr]
-    return ir.Select(schema, inp, cse_exprs, exprs)
+        exprs = [translate_named_expr(visitor, n=e) for e in node.expr]
+    return ir.Select(schema, inp, exprs)
 
 
 @_translate_ir.register
 def _(node: pl_ir.GroupBy, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR:
     with set_node(visitor, node.input):
         inp = translate_ir(visitor, n=None)
-    aggs = [translate_expr(visitor, n=e) for e in node.aggs]
-    keys = [translate_expr(visitor, n=e) for e in node.keys]
+        aggs = [translate_named_expr(visitor, n=e) for e in node.aggs]
+        keys = [translate_named_expr(visitor, n=e) for e in node.keys]
     return ir.GroupBy(
         schema,
         inp,
@@ -122,10 +135,10 @@ def _(node: pl_ir.Join, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR:
     # input active.
     with set_node(visitor, node.input_left):
         inp_left = translate_ir(visitor, n=None)
-        left_on = [translate_expr(visitor, n=e) for e in node.left_on]
+        left_on = [translate_named_expr(visitor, n=e) for e in node.left_on]
     with set_node(visitor, node.input_right):
         inp_right = translate_ir(visitor, n=None)
-        right_on = [translate_expr(visitor, n=e) for e in node.right_on]
+        right_on = [translate_named_expr(visitor, n=e) for e in node.right_on]
     return ir.Join(schema, inp_left, inp_right, left_on, right_on, node.options)
 
 
@@ -133,16 +146,15 @@ def _(node: pl_ir.Join, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR:
 def _(node: pl_ir.HStack, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR:
     with set_node(visitor, node.input):
         inp = translate_ir(visitor, n=None)
-    cse_exprs = [translate_expr(visitor, n=e) for e in node.cse_exprs]
-    exprs = [translate_expr(visitor, n=e) for e in node.exprs]
-    return ir.HStack(schema, inp, cse_exprs, exprs)
+        exprs = [translate_named_expr(visitor, n=e) for e in node.exprs]
+    return ir.HStack(schema, inp, exprs)
 
 
 @_translate_ir.register
 def _(node: pl_ir.Reduce, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR:
     with set_node(visitor, node.input):
         inp = translate_ir(visitor, n=None)
-    exprs = [translate_expr(visitor, n=e) for e in node.expr]
+        exprs = [translate_named_expr(visitor, n=e) for e in node.expr]
     return ir.Reduce(schema, inp, exprs)
 
 
@@ -159,7 +171,7 @@ def _(node: pl_ir.Distinct, visitor: Any, schema: dict[str, plc.DataType]) -> ir
 def _(node: pl_ir.Sort, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR:
     with set_node(visitor, node.input):
         inp = translate_ir(visitor, n=None)
-    by = [translate_expr(visitor, n=e) for e in node.by_column]
+        by = [translate_named_expr(visitor, n=e) for e in node.by_column]
     return ir.Sort(schema, inp, by, node.sort_options, node.slice)
 
 
@@ -172,7 +184,7 @@ def _(node: pl_ir.Slice, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR
 def _(node: pl_ir.Filter, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR:
     with set_node(visitor, node.input):
         inp = translate_ir(visitor, n=None)
-    mask = translate_expr(visitor, n=node.predicate)
+        mask = translate_named_expr(visitor, n=node.predicate)
     return ir.Filter(schema, inp, mask)
 
 
@@ -234,8 +246,8 @@ def translate_ir(visitor: Any, *, n: int | None = None) -> ir.IR:
 
     Raises
     ------
-    NotImplementedError if we can't translate the nodes due to
-    unsupported functionality.
+    NotImplementedError
+        If we can't translate the nodes due to unsupported functionality.
     """
     ctx: AbstractContextManager = (
         set_node(visitor, n) if n is not None else noop_context
@@ -246,17 +258,41 @@ def translate_ir(visitor: Any, *, n: int | None = None) -> ir.IR:
         return _translate_ir(node, visitor, schema)
 
 
+def translate_named_expr(visitor: Any, *, n: pl_expr.PyExprIR) -> expr.NamedExpr:
+    """
+    Translate a polars-internal named expression IR object into our representation.
+
+    Parameters
+    ----------
+    visitor
+        Polars NodeTraverser object
+    n
+        Node to translate, a named expression node.
+
+    Returns
+    -------
+    Translated IR object.
+
+    Notes
+    -----
+    The datatype of the internal expression will be obtained from the
+    visitor by calling ``get_dtype``, for this to work properly, the
+    caller should arrange that the expression is translated with the
+    node that it references "active" for the visitor (see :class:`set_node`).
+
+    Raises
+    ------
+    NotImplementedError
+        If any translation fails due to unsupported functionality.
+    """
+    return expr.NamedExpr(n.output_name, translate_expr(visitor, n=n.node))
+
+
 @singledispatch
 def _translate_expr(node: Any, visitor: Any, dtype: plc.DataType) -> expr.Expr:
     raise NotImplementedError(f"Translation for {type(node).__name__}")
 
 
-@_translate_expr.register
-def _(node: pl_expr.PyExprIR, visitor: Any, dtype: plc.DataType) -> expr.Expr:
-    e = translate_expr(visitor, n=node.node)
-    return expr.NamedExpr(dtype, node.output_name, e)
-
-
 @_translate_expr.register
 def _(node: pl_expr.Function, visitor: Any, dtype: plc.DataType) -> expr.Expr:
     name, *options = node.function_data
@@ -375,7 +411,7 @@ def _(node: pl_expr.Len, visitor: Any, dtype: plc.DataType) -> expr.Expr:
     return expr.Len(dtype)
 
 
-def translate_expr(visitor: Any, *, n: int | pl_expr.PyExprIR) -> expr.Expr:
+def translate_expr(visitor: Any, *, n: int) -> expr.Expr:
     """
     Translate a polars-internal expression IR into our representation.
 
@@ -384,8 +420,7 @@ def translate_expr(visitor: Any, *, n: int | pl_expr.PyExprIR) -> expr.Expr:
     visitor
         Polars NodeTraverser object
     n
-        Node to translate, either an integer referencing a polars
-        internal node, or a named expression node.
+        Node to translate, an integer referencing a polars internal node.
 
     Returns
     -------
@@ -393,14 +428,9 @@ def translate_expr(visitor: Any, *, n: int | pl_expr.PyExprIR) -> expr.Expr:
 
     Raises
     ------
-    NotImplementedError if any translation fails due to unsupported functionality.
+    NotImplementedError
+        If any translation fails due to unsupported functionality.
     """
-    if isinstance(n, pl_expr.PyExprIR):
-        # TODO: type narrowing doesn't rule out int since PyExprIR is Unknown
-        assert not isinstance(n, int)
-        node = n
-        dtype = dtypes.from_polars(visitor.get_dtype(node.node))
-    else:
-        node = visitor.view_expression(n)
-        dtype = dtypes.from_polars(visitor.get_dtype(n))
+    node = visitor.view_expression(n)
+    dtype = dtypes.from_polars(visitor.get_dtype(n))
     return _translate_expr(node, visitor, dtype)
diff --git a/python/cudf_polars/cudf_polars/testing/asserts.py b/python/cudf_polars/cudf_polars/testing/asserts.py
index a6e26a6425c..2fbfa971fef 100644
--- a/python/cudf_polars/cudf_polars/testing/asserts.py
+++ b/python/cudf_polars/cudf_polars/testing/asserts.py
@@ -23,7 +23,7 @@ def assert_gpu_result_equal(
     *,
     check_row_order: bool = True,
     check_column_order: bool = True,
-    check_dtype: bool = True,
+    check_dtypes: bool = True,
     check_exact: bool = True,
     rtol: float = 1e-05,
     atol: float = 1e-08,
@@ -40,7 +40,7 @@ def assert_gpu_result_equal(
         Expect rows to be in same order
     check_column_order
         Expect columns to be in same order
-    check_dtype
+    check_dtypes
         Expect dtypes to match
     check_exact
         Require exact equality for floats, if `False` compare using
@@ -68,7 +68,7 @@ def assert_gpu_result_equal(
         got,
         check_row_order=check_row_order,
         check_column_order=check_column_order,
-        check_dtype=check_dtype,
+        check_dtypes=check_dtypes,
         check_exact=check_exact,
         rtol=rtol,
         atol=atol,
diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py
index bede0de3c9f..7b0049daf11 100644
--- a/python/cudf_polars/cudf_polars/utils/dtypes.py
+++ b/python/cudf_polars/cudf_polars/utils/dtypes.py
@@ -32,7 +32,8 @@ def from_polars(dtype: pl.DataType) -> plc.DataType:
 
     Raises
     ------
-    NotImplementedError for unsupported conversions.
+    NotImplementedError
+        For unsupported conversions.
     """
     if isinstance(dtype, pl.Boolean):
         return plc.DataType(plc.TypeId.BOOL8)
diff --git a/python/cudf_polars/cudf_polars/utils/sorting.py b/python/cudf_polars/cudf_polars/utils/sorting.py
index b3ecfdd3dd4..d35459db20d 100644
--- a/python/cudf_polars/cudf_polars/utils/sorting.py
+++ b/python/cudf_polars/cudf_polars/utils/sorting.py
@@ -14,7 +14,7 @@
 
 
 def sort_order(
-    descending: Sequence[bool], *, nulls_last: bool, num_keys: int
+    descending: Sequence[bool], *, nulls_last: Sequence[bool], num_keys: int
 ) -> tuple[list[plc.types.Order], list[plc.types.NullOrder]]:
     """
     Produce sort order arguments.
@@ -36,14 +36,18 @@ def sort_order(
     # Mimicking polars broadcast handling of descending
     if num_keys > (n := len(descending)) and n == 1:
         descending = [descending[0]] * num_keys
+    if num_keys > (n := len(nulls_last)) and n == 1:
+        nulls_last = [nulls_last[0]] * num_keys
     column_order = [
         plc.types.Order.DESCENDING if d else plc.types.Order.ASCENDING
         for d in descending
     ]
     null_precedence = []
-    for asc in column_order:
-        if (asc == plc.types.Order.ASCENDING) ^ (not nulls_last):
+    # TODO: use strict=True when we drop py39
+    assert len(descending) == len(nulls_last)
+    for asc, null_last in zip(column_order, nulls_last):
+        if (asc == plc.types.Order.ASCENDING) ^ (not null_last):
             null_precedence.append(plc.types.NullOrder.AFTER)
-        elif (asc == plc.types.Order.ASCENDING) ^ nulls_last:
+        elif (asc == plc.types.Order.ASCENDING) ^ null_last:
             null_precedence.append(plc.types.NullOrder.BEFORE)
     return column_order, null_precedence
diff --git a/python/cudf_polars/docs/overview.md b/python/cudf_polars/docs/overview.md
index cbf012f5881..b50d01c26db 100644
--- a/python/cudf_polars/docs/overview.md
+++ b/python/cudf_polars/docs/overview.md
@@ -34,6 +34,8 @@ pip install --upgrade uv
 uv pip install --upgrade -r py-polars/requirements-dev.txt
 ```
 
+> ![NOTE] plain `pip install` works fine, but `uv` is _much_ faster!
+
 Now we have the necessary machinery to build polars
 ```sh
 cd py-polars
@@ -57,7 +59,7 @@ The executor for the polars logical plan lives in the cudf repo, in
 
 ```sh
 cd cudf/python/cudf_polars
-pip install --no-deps -e .
+uv pip install --no-build-isolation --no-deps -e .
 ```
 
 You should now be able to run the tests in the `cudf_polars` package:
@@ -96,6 +98,21 @@ This should either transparently run on the GPU and deliver a polars
 dataframe, or else fail (but be handled) and just run the normal CPU
 execution.
 
+If you want to fail during translation, set the keyword argument
+`raise_on_fail` to `True`:
+
+```python
+from functools import partial
+from cudf_polars.callback import execute_with_cudf
+
+result = q.collect(
+    post_opt_callback=partial(execute_with_cudf, raise_on_fail=True)
+)
+```
+
+This is mostly useful when writing tests, since in that case we want
+any failures to propagate, rather than falling back to the CPU mode.
+
 ## Adding a handler for a new plan node
 
 Plan node definitions live in `cudf_polars/dsl/ir.py`, these are
@@ -153,22 +170,84 @@ the logical plan in any case, so is reasonably natural.
 # Containers
 
 Containers should be constructed as relatively lightweight objects
-around their pylibcudf counterparts. We have three (in
+around their pylibcudf counterparts. We have four (in
 `cudf_polars/containers/`):
 
-1. Scalar (a wrapper around a pylibcudf Scalar)
-2. Column (a wrapper around a pylibcudf Column)
-3. DataFrame (a wrapper around a pylibcudf Table)
+1. `Scalar` (a wrapper around a pylibcudf `Scalar`)
+2. `Column` (a wrapper around a pylibcudf `Column`)
+3. `NamedColumn` a `Column` with an additional name
+4. `DataFrame` (a wrapper around a pylibcudf `Table`)
 
 The interfaces offered by these are somewhat in flux, but broadly
-speaking, a `DataFrame` is just a list of `Column`s which each hold
-data plus a string `name`, along with a collection of `Scalar`s (this
-might go away).
+speaking, a `DataFrame` is just a list of `NamedColumn`s which each
+hold a `Column` plus a string `name`. `NamedColumn`s are only ever
+constructed via `NamedExpr`s, which are the top-level expression node
+that lives inside an `IR` node. This means that the expression
+evaluator never has to concern itself with column names: columns are
+only ever decorated with names when constructing a `DataFrame`.
 
 The columns keep track of metadata (for example, whether or not they
-are sorted).
+are sorted). We could imagine tracking more metadata, like minimum and
+maximum, though perhaps that is better left to libcudf itself.
 
 We offer some utility methods for transferring metadata when
 constructing new dataframes and columns, both `DataFrame` and `Column`
-offer a `with_metadata(*, like: Self)` call which copies metadata from
-the template.
+offer a `sorted_like(like: Self)` call which copies metadata from the
+template.
+
+All methods on containers that modify in place should return `self`,
+to facilitate use in a ["fluent"
+style](https://en.wikipedia.org/wiki/Fluent_interface). It makes it
+much easier to write iteration over objects and collect the results if
+everyone always returns a value.
+
+# Writing tests
+
+We use `pytest`, tests live in the `tests/` subdirectory,
+organisationally the top-level test files each handle one of the `IR`
+nodes. The goal is that they are parametrized over all the options
+each node will handle, to have reasonable coverage. Tests of
+expression functionality should live in `tests/expressions/`.
+
+To write a test an assert correctness, build a lazyframe as a query,
+and then use the utility assertion function from
+`cudf_polars.testing.asserts`. This runs the query using both the cudf
+executor and polars CPU, and checks that they match. So:
+
+```python
+from cudf_polars.testing.asserts import assert_gpu_result_equal
+
+
+def test_whatever():
+    query = pl.LazyFrame(...).(...)
+
+    assert_gpu_result_equal(query)
+```
+
+# Debugging
+
+If the callback execution fails during the polars `collect` call, we
+obtain an error, but are not able to drop into the debugger and
+inspect the stack properly: we can't cross the language barrier.
+
+However, we can drive the translation and execution of the DSL by
+hand. Given some `LazyFrame` representing a query, we can first
+translate it to our intermediate representation (IR), and then execute
+and convert back to polars:
+
+```python
+from cudf_polars.dsl.translate import translate_ir
+
+q = ...
+
+# Convert to our IR
+ir = translate_ir(q._ldf.visit())
+
+# DataFrame living on the device
+result = ir.evaluate(cache={})
+
+# Polars dataframe
+host_result = result.to_polars()
+```
+
+If we get any exceptions, we can then debug as normal in Python.
diff --git a/python/cudf_polars/tests/expressions/test_agg.py b/python/cudf_polars/tests/expressions/test_agg.py
index 645dbd26140..79018c80bf3 100644
--- a/python/cudf_polars/tests/expressions/test_agg.py
+++ b/python/cudf_polars/tests/expressions/test_agg.py
@@ -56,8 +56,8 @@ def test_agg(df, agg):
     q = df.select(expr)
 
     # https://github.com/rapidsai/cudf/issues/15852
-    check_dtype = agg not in {"n_unique", "median"}
-    if not check_dtype and q.schema["a"] != pl.Float64:
+    check_dtypes = agg not in {"n_unique", "median"}
+    if not check_dtypes and q.schema["a"] != pl.Float64:
         with pytest.raises(AssertionError):
             assert_gpu_result_equal(q)
-    assert_gpu_result_equal(q, check_dtype=check_dtype, check_exact=False)
+    assert_gpu_result_equal(q, check_dtypes=check_dtypes, check_exact=False)
diff --git a/python/cudf_polars/tests/test_select.py b/python/cudf_polars/tests/test_select.py
index 503edef152e..037f3ab5428 100644
--- a/python/cudf_polars/tests/test_select.py
+++ b/python/cudf_polars/tests/test_select.py
@@ -36,3 +36,24 @@ def test_select_reduce():
     )
 
     assert_gpu_result_equal(query)
+
+
+def test_select_with_cse_no_agg():
+    df = pl.LazyFrame({"a": [1, 2, 3]})
+    expr = pl.col("a") + pl.col("a")
+
+    query = df.select(expr, (expr * 2).alias("b"), ((expr * 2) + 10).alias("c"))
+
+    assert_gpu_result_equal(query)
+
+
+def test_select_with_cse_with_agg():
+    df = pl.LazyFrame({"a": [1, 2, 3]})
+    expr = pl.col("a") + pl.col("a")
+    asum = pl.col("a").sum() + pl.col("a").sum()
+
+    query = df.select(
+        expr, (expr * 2).alias("b"), asum.alias("c"), (asum + 10).alias("d")
+    )
+
+    assert_gpu_result_equal(query)
diff --git a/python/cudf_polars/tests/test_union.py b/python/cudf_polars/tests/test_union.py
index 2c85bb15a55..18cf4748692 100644
--- a/python/cudf_polars/tests/test_union.py
+++ b/python/cudf_polars/tests/test_union.py
@@ -2,14 +2,11 @@
 # SPDX-License-Identifier: Apache-2.0
 from __future__ import annotations
 
-import pytest
-
 import polars as pl
 
 from cudf_polars.testing.asserts import assert_gpu_result_equal
 
 
-@pytest.mark.xfail(reason="Need handling of null scalars that are cast")
 def test_union():
     ldf = pl.DataFrame(
         {
@@ -19,8 +16,6 @@ def test_union():
     ).lazy()
     ldf2 = ldf.select((pl.col("a") + pl.col("b")).alias("c"), pl.col("a"))
     query = pl.concat([ldf, ldf2], how="diagonal")
-    # Plan for this produces a `None`.astype(Int64) which we don't
-    # handle correctly right now
     assert_gpu_result_equal(query)
 
 
From 66895af970c19978e12c242f92f5b5676d91b9e3 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Thu, 6 Jun 2024 11:12:15 -0500
Subject: [PATCH 310/842] Implement chunked parquet reader in cudf-python
 (#15728)

Partially Addresses: #14966

This PR implements chunked parquet bindings in python.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Thomas Li (https://github.com/lithomas1)

URL: https://github.com/rapidsai/cudf/pull/15728
---
 python/cudf/cudf/_lib/parquet.pyx             | 242 +++++++++++++-----
 .../_lib/pylibcudf/libcudf/io/parquet.pxd     |  12 +
 python/cudf/cudf/tests/test_parquet.py        |  27 ++
 3 files changed, 220 insertions(+), 61 deletions(-)

diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
index ac592cedaac..f6f9cfa9a7c 100644
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ b/python/cudf/cudf/_lib/parquet.pyx
@@ -26,6 +26,7 @@ from libc.stdint cimport uint8_t
 from libcpp cimport bool
 from libcpp.map cimport map
 from libcpp.memory cimport make_unique, unique_ptr
+from libcpp.pair cimport pair
 from libcpp.string cimport string
 from libcpp.unordered_map cimport unordered_map
 from libcpp.utility cimport move
@@ -44,6 +45,7 @@ from cudf._lib.io.utils cimport (
 )
 from cudf._lib.pylibcudf.libcudf.expressions cimport expression
 from cudf._lib.pylibcudf.libcudf.io.parquet cimport (
+    chunked_parquet_reader as cpp_chunked_parquet_reader,
     chunked_parquet_writer_options,
     merge_row_group_metadata as parquet_merge_metadata,
     parquet_chunked_writer as cpp_parquet_chunked_writer,
@@ -60,6 +62,7 @@ from cudf._lib.pylibcudf.libcudf.io.parquet_metadata cimport (
 from cudf._lib.pylibcudf.libcudf.io.types cimport (
     column_in_metadata,
     table_input_metadata,
+    table_metadata,
 )
 from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
 from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type
@@ -126,50 +129,22 @@ def _parse_metadata(meta):
     return file_is_range_index, file_index_cols, file_column_dtype
 
 
-cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
-                   use_pandas_metadata=True,
-                   Expression filters=None):
-    """
-    Cython function to call into libcudf API, see `read_parquet`.
-
-    filters, if not None, should be an Expression that evaluates to a
-    boolean predicate as a function of columns being read.
-
-    See Also
-    --------
-    cudf.io.parquet.read_parquet
-    cudf.io.parquet.to_parquet
-    """
-
-    # Convert NativeFile buffers to NativeFileDatasource,
-    # but save original buffers in case we need to use
-    # pyarrow for metadata processing
-    # (See: https://github.com/rapidsai/cudf/issues/9599)
-    pa_buffers = []
-    for i, datasource in enumerate(filepaths_or_buffers):
-        if isinstance(datasource, NativeFile):
-            pa_buffers.append(datasource)
-            filepaths_or_buffers[i] = NativeFileDatasource(datasource)
+cdef pair[parquet_reader_options, bool] _setup_parquet_reader_options(
+     cudf_io_types.source_info source,
+     vector[vector[size_type]] row_groups,
+     bool use_pandas_metadata,
+     Expression filters,
+     object columns):
 
-    cdef cudf_io_types.source_info source = make_source_info(
-        filepaths_or_buffers)
-
-    cdef bool cpp_use_pandas_metadata = use_pandas_metadata
-
-    cdef vector[vector[size_type]] cpp_row_groups
+    cdef parquet_reader_options args
+    cdef parquet_reader_options_builder builder
     cdef data_type cpp_timestamp_type = cudf_types.data_type(
         cudf_types.type_id.EMPTY
     )
-    if row_groups is not None:
-        cpp_row_groups = row_groups
-
-    # Setup parquet reader arguments
-    cdef parquet_reader_options args
-    cdef parquet_reader_options_builder builder
     builder = (
         parquet_reader_options.builder(source)
-        .row_groups(cpp_row_groups)
-        .use_pandas_metadata(cpp_use_pandas_metadata)
+        .row_groups(row_groups)
+        .use_pandas_metadata(use_pandas_metadata)
         .use_arrow_schema(True)
         .timestamp_type(cpp_timestamp_type)
     )
@@ -185,28 +160,28 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
         for col in columns:
             cpp_columns.push_back(str(col).encode())
         args.set_columns(cpp_columns)
-    # Filters don't handle the range index correctly
     allow_range_index &= filters is None
 
-    # Read Parquet
-    cdef cudf_io_types.table_with_metadata c_result
-
-    with nogil:
-        c_result = move(parquet_reader(args))
-
-    names = [info.name.decode() for info in c_result.metadata.schema_info]
-
-    # Access the Parquet per_file_user_data to find the index
+    return pair[parquet_reader_options, bool](args, allow_range_index)
+
+cdef object _process_metadata(object df,
+                              table_metadata table_meta,
+                              list names,
+                              object row_groups,
+                              object filepaths_or_buffers,
+                              list pa_buffers,
+                              bool allow_range_index,
+                              bool use_pandas_metadata):
+    update_struct_field_names(df, table_meta.schema_info)
     index_col = None
-    cdef vector[unordered_map[string, string]] per_file_user_data = \
-        c_result.metadata.per_file_user_data
-
+    is_range_index = True
     column_index_type = None
     index_col_names = None
-    is_range_index = True
+    meta = None
+    cdef vector[unordered_map[string, string]] per_file_user_data = \
+        table_meta.per_file_user_data
     for single_file in per_file_user_data:
         json_str = single_file[b'pandas'].decode('utf-8')
-        meta = None
         if json_str != "":
             meta = json.loads(json_str)
             file_is_range_index, index_col, column_index_type = _parse_metadata(meta)
@@ -220,13 +195,6 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
                         if c['field_name'] == idx_col:
                             index_col_names[idx_col] = c['name']
 
-    df = cudf.DataFrame._from_data(*data_from_unique_ptr(
-        move(c_result.tbl),
-        column_names=names
-    ))
-
-    update_struct_field_names(df, c_result.metadata.schema_info)
-
     if meta is not None:
         # Book keep each column metadata as the order
         # of `meta["columns"]` and `column_names` are not
@@ -319,9 +287,65 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
             if use_pandas_metadata:
                 df.index.names = index_col
 
-    # Set column dtype for empty types.
     if len(df._data.names) == 0 and column_index_type is not None:
         df._data.label_dtype = cudf.dtype(column_index_type)
+
+    return df
+
+
+cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
+                   use_pandas_metadata=True,
+                   Expression filters=None):
+    """
+    Cython function to call into libcudf API, see `read_parquet`.
+
+    filters, if not None, should be an Expression that evaluates to a
+    boolean predicate as a function of columns being read.
+
+    See Also
+    --------
+    cudf.io.parquet.read_parquet
+    cudf.io.parquet.to_parquet
+    """
+
+    # Convert NativeFile buffers to NativeFileDatasource,
+    # but save original buffers in case we need to use
+    # pyarrow for metadata processing
+    # (See: https://github.com/rapidsai/cudf/issues/9599)
+    pa_buffers = []
+    for i, datasource in enumerate(filepaths_or_buffers):
+        if isinstance(datasource, NativeFile):
+            pa_buffers.append(datasource)
+            filepaths_or_buffers[i] = NativeFileDatasource(datasource)
+
+    cdef cudf_io_types.source_info source = make_source_info(
+        filepaths_or_buffers)
+
+    cdef vector[vector[size_type]] cpp_row_groups
+    if row_groups is not None:
+        cpp_row_groups = row_groups
+
+    # Setup parquet reader arguments
+    cdef parquet_reader_options args
+    cdef pair[parquet_reader_options, bool] c_res = _setup_parquet_reader_options(
+            source, cpp_row_groups, use_pandas_metadata, filters, columns)
+    args, allow_range_index = c_res.first, c_res.second
+
+    # Read Parquet
+    cdef cudf_io_types.table_with_metadata c_result
+
+    with nogil:
+        c_result = move(parquet_reader(args))
+
+    names = [info.name.decode() for info in c_result.metadata.schema_info]
+
+    df = cudf.DataFrame._from_data(*data_from_unique_ptr(
+        move(c_result.tbl),
+        column_names=names
+    ))
+    df = _process_metadata(df, c_result.metadata, names, row_groups,
+                           filepaths_or_buffers, pa_buffers,
+                           allow_range_index, use_pandas_metadata)
     return df
 
 cpdef read_parquet_metadata(filepaths_or_buffers):
@@ -767,6 +791,102 @@ cdef class ParquetWriter:
         self.initialized = True
 
 
+cdef class ParquetReader:
+    cdef bool initialized
+    cdef unique_ptr[cpp_chunked_parquet_reader] reader
+    cdef size_t chunk_read_limit
+    cdef size_t pass_read_limit
+    cdef size_t row_group_size_bytes
+    cdef table_metadata result_meta
+    cdef vector[unordered_map[string, string]] per_file_user_data
+    cdef object pandas_meta
+    cdef list pa_buffers
+    cdef bool allow_range_index
+    cdef object row_groups
+    cdef object filepaths_or_buffers
+    cdef object names
+    cdef object column_index_type
+    cdef object index_col_names
+    cdef bool is_range_index
+    cdef object index_col
+    cdef bool cpp_use_pandas_metadata
+
+    def __cinit__(self, filepaths_or_buffers, columns=None, row_groups=None,
+                  use_pandas_metadata=True,
+                  size_t chunk_read_limit=0,
+                  size_t pass_read_limit=1024000000):
+
+        # Convert NativeFile buffers to NativeFileDatasource,
+        # but save original buffers in case we need to use
+        # pyarrow for metadata processing
+        # (See: https://github.com/rapidsai/cudf/issues/9599)
+
+        pa_buffers = []
+        for i, datasource in enumerate(filepaths_or_buffers):
+            if isinstance(datasource, NativeFile):
+                pa_buffers.append(datasource)
+                filepaths_or_buffers[i] = NativeFileDatasource(datasource)
+        self.pa_buffers = pa_buffers
+        cdef cudf_io_types.source_info source = make_source_info(
+            filepaths_or_buffers)
+
+        self.cpp_use_pandas_metadata = use_pandas_metadata
+
+        cdef vector[vector[size_type]] cpp_row_groups
+        if row_groups is not None:
+            cpp_row_groups = row_groups
+        cdef parquet_reader_options args
+        cdef pair[parquet_reader_options, bool] c_res = _setup_parquet_reader_options(
+            source, cpp_row_groups, use_pandas_metadata, None, columns)
+        args, self.allow_range_index = c_res.first, c_res.second
+
+        with nogil:
+            self.reader.reset(
+                new cpp_chunked_parquet_reader(
+                    chunk_read_limit,
+                    pass_read_limit,
+                    args
+                )
+            )
+        self.initialized = False
+        self.row_groups = row_groups
+        self.filepaths_or_buffers = filepaths_or_buffers
+
+    def _has_next(self):
+        cdef bool res
+        with nogil:
+            res = self.reader.get()[0].has_next()
+        return res
+
+    def _read_chunk(self):
+        # Read Parquet
+        cdef cudf_io_types.table_with_metadata c_result
+
+        with nogil:
+            c_result = move(self.reader.get()[0].read_chunk())
+
+        if not self.initialized:
+            self.names = [info.name.decode() for info in c_result.metadata.schema_info]
+            self.result_meta = c_result.metadata
+
+        df = cudf.DataFrame._from_data(*data_from_unique_ptr(
+            move(c_result.tbl),
+            column_names=self.names,
+        ))
+
+        self.initialized = True
+        return df
+
+    def read(self):
+        dfs = []
+        while self._has_next():
+            dfs.append(self._read_chunk())
+        df = cudf.concat(dfs)
+        df = _process_metadata(df, self.result_meta, self.names, self.row_groups,
+                               self.filepaths_or_buffers, self.pa_buffers,
+                               self.allow_range_index, self.cpp_use_pandas_metadata)
+        return df
+
 cpdef merge_filemetadata(object filemetadata_list):
     """
     Cython function to call into libcudf API, see `merge_row_group_metadata`.
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd
index 33a594b432f..fb98650308a 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd
@@ -283,6 +283,18 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
             vector[string] column_chunks_file_paths,
         ) except +
 
+    cdef cppclass chunked_parquet_reader:
+        chunked_parquet_reader() except +
+        chunked_parquet_reader(
+            size_t chunk_read_limit,
+            const parquet_reader_options& options) except +
+        chunked_parquet_reader(
+            size_t chunk_read_limit,
+            size_t pass_read_limit,
+            const parquet_reader_options& options) except +
+        bool has_next() except +
+        cudf_io_types.table_with_metadata read_chunk() except +
+
     cdef unique_ptr[vector[uint8_t]] merge_row_group_metadata(
         const vector[unique_ptr[vector[uint8_t]]]& metadata_list
     ) except +
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index e32fdacd8d6..2596fe8cd37 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -22,6 +22,7 @@
 from pyarrow import fs as pa_fs, parquet as pq
 
 import cudf
+from cudf._lib.parquet import ParquetReader
 from cudf.io.parquet import (
     ParquetDatasetWriter,
     ParquetWriter,
@@ -3407,3 +3408,29 @@ def test_parquet_reader_roundtrip_structs_with_arrow_schema():
 
     # Check results
     assert_eq(expected, got)
+
+
+@pytest.mark.parametrize("chunk_read_limit", [0, 240, 1024000000])
+@pytest.mark.parametrize("pass_read_limit", [0, 240, 1024000000])
+@pytest.mark.parametrize("use_pandas_metadata", [True, False])
+@pytest.mark.parametrize("row_groups", [[[0]], None, [[0, 1]]])
+def test_parquet_chunked_reader(
+    chunk_read_limit, pass_read_limit, use_pandas_metadata, row_groups
+):
+    df = pd.DataFrame(
+        {"a": [1, 2, 3, 4] * 1000000, "b": ["av", "qw", "hi", "xyz"] * 1000000}
+    )
+    buffer = BytesIO()
+    df.to_parquet(buffer)
+    reader = ParquetReader(
+        [buffer],
+        chunk_read_limit=chunk_read_limit,
+        pass_read_limit=pass_read_limit,
+        use_pandas_metadata=use_pandas_metadata,
+        row_groups=row_groups,
+    )
+    expected = cudf.read_parquet(
+        buffer, use_pandas_metadata=use_pandas_metadata, row_groups=row_groups
+    )
+    actual = reader.read()
+    assert_eq(expected, actual)

From 61da92415f1449f64a4050d2dec47b29344389a9 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Thu, 6 Jun 2024 17:19:28 +0100
Subject: [PATCH 311/842] Document how to use cudf.pandas in tandem with
 multiprocessing (#15940)

We need to arrange that cudf.pandas.install() is run on the workers, this requires that we programmatically install the metapath loader in our script. Unfortunately, passing an initializer function to the pool startup is not sufficient if any part of the script transitively loads pandas at the top level.

- Closes #15246

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15940
---
 docs/cudf/source/cudf_pandas/usage.md | 30 +++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/docs/cudf/source/cudf_pandas/usage.md b/docs/cudf/source/cudf_pandas/usage.md
index b174c606d66..376784439aa 100644
--- a/docs/cudf/source/cudf_pandas/usage.md
+++ b/docs/cudf/source/cudf_pandas/usage.md
@@ -26,6 +26,36 @@ From the command line, run your Python scripts with `-m cudf.pandas`:
 python -m cudf.pandas script.py
 ```
 
+### Usage in tandem with
+[`multiprocessing`](https://docs.python.org/3/library/multiprocessing.html)
+or
+[`concurrent.futures`](https://docs.python.org/3/library/concurrent.futures.html)
+process pools
+
+To use a pool of workers (for example
+[`multiprocessing.Pool`](https://docs.python.org/3/library/multiprocessing.html#multiprocessing.pool.Pool)
+or
+[`concurrent.futures.ProcessPoolExecutor`](https://docs.python.org/3/library/concurrent.futures.html#concurrent.futures.ProcessPoolExecutor))
+in your script with `cudf.pandas`, the `cudf.pandas` module must be
+loaded on the worker processes, as well as by the controlling script.
+The most foolproof way to do this is to programmatically install
+`cudf.pandas` at the top of your script, before anything else.
+For example
+
+```python
+# This is equivalent to python -m cudf.pandas, but will run on the
+# workers too. These two lines must run before pandas is imported,
+# either directly or transitively.
+import cudf.pandas
+cudf.pandas.install()
+
+from multiprocessing import Pool
+
+with Pool(4) as pool:
+    # use pool here
+    ...
+```
+
 ## Understanding performance - the `cudf.pandas` profiler
 
 `cudf.pandas` will attempt to use the GPU whenever possible and fall

From 3468fa1f5b9dfcf83a95bcb09fe5a4d8d3808620 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Thu, 6 Jun 2024 19:30:48 +0100
Subject: [PATCH 312/842] Add more complete type annotations in polars
 interpreter (#15942)

We can check this with:

    pyright --verifytypes cudf_polars --ignoreexternal

Which reports a "type completeness" score of around 94%. This will
improve once pylibcudf gets type stubs.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - James Lamb (https://github.com/jameslamb)
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/15942
---
 .pre-commit-config.yaml                       |   2 +-
 python/cudf_polars/cudf_polars/__init__.py    |   5 +-
 python/cudf_polars/cudf_polars/callback.py    |   3 +-
 .../cudf_polars/containers/dataframe.py       |  13 +-
 python/cudf_polars/cudf_polars/dsl/expr.py    |  55 +++++---
 python/cudf_polars/cudf_polars/dsl/ir.py      | 110 +++++++--------
 .../cudf_polars/cudf_polars/dsl/translate.py  | 127 ++++++++++++------
 python/cudf_polars/cudf_polars/py.typed       |   0
 .../cudf_polars/testing/asserts.py            |   2 +-
 .../cudf_polars/typing/__init__.py            |  91 +++++++++++++
 python/cudf_polars/pyproject.toml             |   2 -
 11 files changed, 287 insertions(+), 123 deletions(-)
 create mode 100644 python/cudf_polars/cudf_polars/py.typed
 create mode 100644 python/cudf_polars/cudf_polars/typing/__init__.py

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 8865fb48e0d..4cdcac88091 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -134,7 +134,7 @@ repos:
       - id: rapids-dependency-file-generator
         args: ["--clean"]
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.4.3
+    rev: v0.4.8
     hooks:
       - id: ruff
         files: python/.*$
diff --git a/python/cudf_polars/cudf_polars/__init__.py b/python/cudf_polars/cudf_polars/__init__.py
index 74547fe2448..b19a282129a 100644
--- a/python/cudf_polars/cudf_polars/__init__.py
+++ b/python/cudf_polars/cudf_polars/__init__.py
@@ -10,4 +10,7 @@
 
 from __future__ import annotations
 
-__all__: list[str] = []
+from cudf_polars.callback import execute_with_cudf
+from cudf_polars.dsl.translate import translate_ir
+
+__all__: list[str] = ["execute_with_cudf", "translate_ir"]
diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py
index aabb8498ce2..979087d5273 100644
--- a/python/cudf_polars/cudf_polars/callback.py
+++ b/python/cudf_polars/cudf_polars/callback.py
@@ -16,6 +16,7 @@
     import polars as pl
 
     from cudf_polars.dsl.ir import IR
+    from cudf_polars.typing import NodeTraverser
 
 __all__: list[str] = ["execute_with_cudf"]
 
@@ -33,7 +34,7 @@ def _callback(
         return ir.evaluate(cache={}).to_polars()
 
 
-def execute_with_cudf(nt, *, raise_on_fail: bool = False) -> None:
+def execute_with_cudf(nt: NodeTraverser, *, raise_on_fail: bool = False) -> None:
     """
     A post optimization callback that attempts to execute the plan with cudf.
 
diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py
index eeaf181be0c..ac7e748095e 100644
--- a/python/cudf_polars/cudf_polars/containers/dataframe.py
+++ b/python/cudf_polars/cudf_polars/containers/dataframe.py
@@ -6,7 +6,7 @@
 from __future__ import annotations
 
 from functools import cached_property
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, cast
 
 import polars as pl
 
@@ -17,6 +17,7 @@
 if TYPE_CHECKING:
     from collections.abc import Mapping, Sequence, Set
 
+    import pyarrow as pa
     from typing_extensions import Self
 
     import cudf
@@ -44,13 +45,13 @@ def copy(self) -> Self:
 
     def to_polars(self) -> pl.DataFrame:
         """Convert to a polars DataFrame."""
-        return pl.from_arrow(
-            plc.interop.to_arrow(
-                self.table,
-                [plc.interop.ColumnMetadata(name=c.name) for c in self.columns],
-            )
+        table: pa.Table = plc.interop.to_arrow(
+            self.table,
+            [plc.interop.ColumnMetadata(name=c.name) for c in self.columns],
         )
 
+        return cast(pl.DataFrame, pl.from_arrow(table))
+
     @cached_property
     def column_names_set(self) -> frozenset[str]:
         """Return the column names as a set."""
diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py
index c7c11cf6c68..6d9435ce373 100644
--- a/python/cudf_polars/cudf_polars/dsl/expr.py
+++ b/python/cudf_polars/cudf_polars/dsl/expr.py
@@ -139,14 +139,14 @@ def is_equal(self, other: Any) -> bool:
             other.children
         )
 
-    def __eq__(self, other) -> bool:
+    def __eq__(self, other: Any) -> bool:
         """Equality of expressions."""
         if type(self) != type(other) or hash(self) != hash(other):
             return False
         else:
             return self.is_equal(other)
 
-    def __ne__(self, other) -> bool:
+    def __ne__(self, other: Any) -> bool:
         """Inequality of expressions."""
         return not self.__eq__(other)
 
@@ -285,6 +285,8 @@ class NamedExpr:
     # when evaluating expressions themselves, only when constructing
     # named return values in dataframe (IR) nodes.
     __slots__ = ("name", "value")
+    value: Expr
+    name: str
 
     def __init__(self, name: str, value: Expr) -> None:
         self.name = name
@@ -298,7 +300,7 @@ def __repr__(self) -> str:
         """Repr of the expression."""
         return f"NamedExpr({self.name}, {self.value}"
 
-    def __eq__(self, other) -> bool:
+    def __eq__(self, other: Any) -> bool:
         """Equality of two expressions."""
         return (
             type(self) is type(other)
@@ -306,7 +308,7 @@ def __eq__(self, other) -> bool:
             and self.value == other.value
         )
 
-    def __ne__(self, other) -> bool:
+    def __ne__(self, other: Any) -> bool:
         """Inequality of expressions."""
         return not self.__eq__(other)
 
@@ -344,9 +346,10 @@ def collect_agg(self, *, depth: int) -> AggInfo:
 class Literal(Expr):
     __slots__ = ("value",)
     _non_child = ("dtype", "value")
-    value: pa.Scalar
+    value: pa.Scalar[Any]
+    children: tuple[()]
 
-    def __init__(self, dtype: plc.DataType, value: pa.Scalar) -> None:
+    def __init__(self, dtype: plc.DataType, value: pa.Scalar[Any]) -> None:
         super().__init__(dtype)
         assert value.type == plc.interop.to_arrow(dtype)
         self.value = value
@@ -367,6 +370,7 @@ class Col(Expr):
     __slots__ = ("name",)
     _non_child = ("dtype", "name")
     name: str
+    children: tuple[()]
 
     def __init__(self, dtype: plc.DataType, name: str) -> None:
         self.dtype = dtype
@@ -388,6 +392,8 @@ def collect_agg(self, *, depth: int) -> AggInfo:
 
 
 class Len(Expr):
+    children: tuple[()]
+
     def do_evaluate(
         self,
         df: DataFrame,
@@ -410,8 +416,15 @@ def collect_agg(self, *, depth: int) -> AggInfo:
 class BooleanFunction(Expr):
     __slots__ = ("name", "options", "children")
     _non_child = ("dtype", "name", "options")
+    children: tuple[Expr, ...]
 
-    def __init__(self, dtype: plc.DataType, name: str, options: tuple, *children: Expr):
+    def __init__(
+        self,
+        dtype: plc.DataType,
+        name: pl_expr.BooleanFunction,
+        options: tuple[Any, ...],
+        *children: Expr,
+    ) -> None:
         super().__init__(dtype)
         self.options = options
         self.name = name
@@ -610,14 +623,15 @@ def do_evaluate(
 class StringFunction(Expr):
     __slots__ = ("name", "options", "children")
     _non_child = ("dtype", "name", "options")
+    children: tuple[Expr, ...]
 
     def __init__(
         self,
         dtype: plc.DataType,
         name: pl_expr.StringFunction,
-        options: tuple,
+        options: tuple[Any, ...],
         *children: Expr,
-    ):
+    ) -> None:
         super().__init__(dtype)
         self.options = options
         self.name = name
@@ -661,10 +675,11 @@ def do_evaluate(
 class Sort(Expr):
     __slots__ = ("options", "children")
     _non_child = ("dtype", "options")
+    children: tuple[Expr]
 
     def __init__(
         self, dtype: plc.DataType, options: tuple[bool, bool, bool], column: Expr
-    ):
+    ) -> None:
         super().__init__(dtype)
         self.options = options
         self.children = (column,)
@@ -696,6 +711,7 @@ def do_evaluate(
 class SortBy(Expr):
     __slots__ = ("options", "children")
     _non_child = ("dtype", "options")
+    children: tuple[Expr, ...]
 
     def __init__(
         self,
@@ -703,7 +719,7 @@ def __init__(
         options: tuple[bool, tuple[bool], tuple[bool]],
         column: Expr,
         *by: Expr,
-    ):
+    ) -> None:
         super().__init__(dtype)
         self.options = options
         self.children = (column, *by)
@@ -734,8 +750,9 @@ def do_evaluate(
 class Gather(Expr):
     __slots__ = ("children",)
     _non_child = ("dtype",)
+    children: tuple[Expr, Expr]
 
-    def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr):
+    def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr) -> None:
         super().__init__(dtype)
         self.children = (values, indices)
 
@@ -775,6 +792,7 @@ def do_evaluate(
 class Filter(Expr):
     __slots__ = ("children",)
     _non_child = ("dtype",)
+    children: tuple[Expr, Expr]
 
     def __init__(self, dtype: plc.DataType, values: Expr, indices: Expr):
         super().__init__(dtype)
@@ -801,8 +819,9 @@ def do_evaluate(
 class RollingWindow(Expr):
     __slots__ = ("options", "children")
     _non_child = ("dtype", "options")
+    children: tuple[Expr]
 
-    def __init__(self, dtype: plc.DataType, options: Any, agg: Expr):
+    def __init__(self, dtype: plc.DataType, options: Any, agg: Expr) -> None:
         super().__init__(dtype)
         self.options = options
         self.children = (agg,)
@@ -811,8 +830,9 @@ def __init__(self, dtype: plc.DataType, options: Any, agg: Expr):
 class GroupedRollingWindow(Expr):
     __slots__ = ("options", "children")
     _non_child = ("dtype", "options")
+    children: tuple[Expr, ...]
 
-    def __init__(self, dtype: plc.DataType, options: Any, agg: Expr, *by: Expr):
+    def __init__(self, dtype: plc.DataType, options: Any, agg: Expr, *by: Expr) -> None:
         super().__init__(dtype)
         self.options = options
         self.children = (agg, *by)
@@ -821,8 +841,9 @@ def __init__(self, dtype: plc.DataType, options: Any, agg: Expr, *by: Expr):
 class Cast(Expr):
     __slots__ = ("children",)
     _non_child = ("dtype",)
+    children: tuple[Expr]
 
-    def __init__(self, dtype: plc.DataType, value: Expr):
+    def __init__(self, dtype: plc.DataType, value: Expr) -> None:
         super().__init__(dtype)
         self.children = (value,)
 
@@ -848,6 +869,7 @@ def collect_agg(self, *, depth: int) -> AggInfo:
 class Agg(Expr):
     __slots__ = ("name", "options", "op", "request", "children")
     _non_child = ("dtype", "name", "options")
+    children: tuple[Expr]
 
     def __init__(
         self, dtype: plc.DataType, name: str, options: Any, value: Expr
@@ -1007,7 +1029,7 @@ def _last(self, column: Column) -> Column:
 
     def do_evaluate(
         self,
-        df,
+        df: DataFrame,
         *,
         context: ExecutionContext = ExecutionContext.FRAME,
         mapping: Mapping[Expr, Column] | None = None,
@@ -1022,6 +1044,7 @@ def do_evaluate(
 class BinOp(Expr):
     __slots__ = ("op", "children")
     _non_child = ("dtype", "op")
+    children: tuple[Expr, Expr]
 
     def __init__(
         self,
diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py
index 0a72cbd9f83..665bbe5be41 100644
--- a/python/cudf_polars/cudf_polars/dsl/ir.py
+++ b/python/cudf_polars/cudf_polars/dsl/ir.py
@@ -1,7 +1,5 @@
 # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
 # SPDX-License-Identifier: Apache-2.0
-# TODO: remove need for this
-# ruff: noqa: D101
 """
 DSL nodes for the LogicalPlan of polars.
 
@@ -15,11 +13,11 @@
 
 from __future__ import annotations
 
+import dataclasses
 import itertools
 import types
-from dataclasses import dataclass
 from functools import cache
-from typing import TYPE_CHECKING, Any, Callable, ClassVar
+from typing import TYPE_CHECKING, Any, Callable, ClassVar, NoReturn
 
 import pyarrow as pa
 from typing_extensions import assert_never
@@ -34,8 +32,11 @@
 from cudf_polars.utils import sorting
 
 if TYPE_CHECKING:
+    from collections.abc import MutableMapping
     from typing import Literal
 
+    from cudf_polars.typing import Schema
+
 
 __all__ = [
     "IR",
@@ -91,14 +92,14 @@ def broadcast(
     ]
 
 
-@dataclass(slots=True)
+@dataclasses.dataclass(slots=True)
 class IR:
     """Abstract plan node, representing an unevaluated dataframe."""
 
-    schema: dict[str, plc.DataType]
+    schema: Schema
     """Mapping from column names to their data types."""
 
-    def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
+    def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         """
         Evaluate the node and return a dataframe.
 
@@ -123,7 +124,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
         raise NotImplementedError
 
 
-@dataclass(slots=True)
+@dataclasses.dataclass(slots=True)
 class PythonScan(IR):
     """Representation of input from a python function."""
 
@@ -133,7 +134,7 @@ class PythonScan(IR):
     """Filter to apply to the constructed dataframe before returning it."""
 
 
-@dataclass(slots=True)
+@dataclasses.dataclass(slots=True)
 class Scan(IR):
     """Input from files."""
 
@@ -153,14 +154,14 @@ class Scan(IR):
     predicate: expr.NamedExpr | None
     """Mask to apply to the read dataframe."""
 
-    def __post_init__(self):
+    def __post_init__(self) -> None:
         """Validate preconditions."""
         if self.file_options.n_rows is not None:
             raise NotImplementedError("row limit in scan")
         if self.typ not in ("csv", "parquet"):
             raise NotImplementedError(f"Unhandled scan type: {self.typ}")
 
-    def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
+    def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
         options = self.file_options
         with_columns = options.with_columns
@@ -172,9 +173,9 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
                 )
             )
         elif self.typ == "parquet":
-            df = DataFrame.from_cudf(
-                cudf.read_parquet(self.paths, columns=with_columns)
-            )
+            cdf = cudf.read_parquet(self.paths, columns=with_columns)
+            assert isinstance(cdf, cudf.DataFrame)
+            df = DataFrame.from_cudf(cdf)
         else:
             assert_never(self.typ)
         if row_index is not None:
@@ -208,7 +209,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
             return df.filter(mask)
 
 
-@dataclass(slots=True)
+@dataclasses.dataclass(slots=True)
 class Cache(IR):
     """
     Return a cached plan node.
@@ -221,7 +222,7 @@ class Cache(IR):
     value: IR
     """The unevaluated node to cache."""
 
-    def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
+    def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
         try:
             return cache[self.key]
@@ -229,7 +230,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
             return cache.setdefault(self.key, self.value.evaluate(cache=cache))
 
 
-@dataclass(slots=True)
+@dataclasses.dataclass(slots=True)
 class DataFrameScan(IR):
     """
     Input from an existing polars DataFrame.
@@ -244,7 +245,7 @@ class DataFrameScan(IR):
     predicate: expr.NamedExpr | None
     """Mask to apply."""
 
-    def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
+    def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
         pdf = pl.DataFrame._from_pydf(self.df)
         if self.projection is not None:
@@ -270,7 +271,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
             return df
 
 
-@dataclass(slots=True)
+@dataclasses.dataclass(slots=True)
 class Select(IR):
     """Produce a new dataframe selecting given expressions from an input."""
 
@@ -279,7 +280,7 @@ class Select(IR):
     expr: list[expr.NamedExpr]
     """List of expressions to evaluate to form the new dataframe."""
 
-    def evaluate(self, *, cache: dict[int, DataFrame]):
+    def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
         df = self.df.evaluate(cache=cache)
         # Handle any broadcasting
@@ -287,7 +288,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]):
         return DataFrame(columns)
 
 
-@dataclass(slots=True)
+@dataclasses.dataclass(slots=True)
 class Reduce(IR):
     """
     Produce a new dataframe selecting given expressions from an input.
@@ -300,7 +301,7 @@ class Reduce(IR):
     expr: list[expr.NamedExpr]
     """List of expressions to evaluate to form the new dataframe."""
 
-    def evaluate(self, *, cache: dict[int, DataFrame]):
+    def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
         df = self.df.evaluate(cache=cache)
         columns = broadcast(*(e.evaluate(df) for e in self.expr))
@@ -308,7 +309,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]):
         return DataFrame(columns)
 
 
-def placeholder_column(n: int):
+def placeholder_column(n: int) -> plc.Column:
     """
     Produce a placeholder pylibcudf column with NO BACKING DATA.
 
@@ -338,7 +339,7 @@ def placeholder_column(n: int):
     )
 
 
-@dataclass(slots=False)
+@dataclasses.dataclass(slots=False)
 class GroupBy(IR):
     """Perform a groupby."""
 
@@ -352,6 +353,7 @@ class GroupBy(IR):
     """Should the order of the input dataframe be maintained?"""
     options: Any
     """Options controlling style of groupby."""
+    agg_infos: list[expr.AggInfo] = dataclasses.field(init=False)
 
     @staticmethod
     def check_agg(agg: expr.Expr) -> int:
@@ -383,7 +385,7 @@ def check_agg(agg: expr.Expr) -> int:
         else:
             raise NotImplementedError(f"No handler for {agg=}")
 
-    def __post_init__(self):
+    def __post_init__(self) -> None:
         """Check whether all the aggregations are implemented."""
         if self.options.rolling is None and self.maintain_order:
             raise NotImplementedError("Maintaining order in groupby")
@@ -393,7 +395,7 @@ def __post_init__(self):
             raise NotImplementedError("Nested aggregations in groupby")
         self.agg_infos = [req.collect_agg(depth=0) for req in self.agg_requests]
 
-    def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
+    def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
         df = self.df.evaluate(cache=cache)
         keys = broadcast(
@@ -438,7 +440,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
         return DataFrame([*result_keys, *results]).slice(self.options.slice)
 
 
-@dataclass(slots=True)
+@dataclasses.dataclass(slots=True)
 class Join(IR):
     """A join of two dataframes."""
 
@@ -466,7 +468,7 @@ class Join(IR):
     - coalesce: should key columns be coalesced (only makes sense for outer joins)
     """
 
-    def __post_init__(self):
+    def __post_init__(self) -> None:
         """Validate preconditions."""
         if self.options[0] == "cross":
             raise NotImplementedError("cross join not implemented")
@@ -511,7 +513,7 @@ def _joiners(
         else:
             assert_never(how)
 
-    def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
+    def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
         left = self.left.evaluate(cache=cache)
         right = self.right.evaluate(cache=cache)
@@ -577,7 +579,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
         return result.slice(zlice)
 
 
-@dataclass(slots=True)
+@dataclasses.dataclass(slots=True)
 class HStack(IR):
     """Add new columns to a dataframe."""
 
@@ -586,7 +588,7 @@ class HStack(IR):
     columns: list[expr.NamedExpr]
     """List of expressions to produce new columns."""
 
-    def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
+    def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
         df = self.df.evaluate(cache=cache)
         columns = [c.evaluate(df) for c in self.columns]
@@ -597,7 +599,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
         return df.with_columns(columns)
 
 
-@dataclass(slots=True)
+@dataclasses.dataclass(slots=True)
 class Distinct(IR):
     """Produce a new dataframe with distinct rows."""
 
@@ -619,7 +621,7 @@ class Distinct(IR):
         "any": plc.stream_compaction.DuplicateKeepOption.KEEP_ANY,
     }
 
-    def __init__(self, schema: dict, df: IR, options: Any):
+    def __init__(self, schema: Schema, df: IR, options: Any) -> None:
         self.schema = schema
         self.df = df
         (keep, subset, maintain_order, zlice) = options
@@ -628,7 +630,7 @@ def __init__(self, schema: dict, df: IR, options: Any):
         self.stable = maintain_order
         self.zlice = zlice
 
-    def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
+    def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
         df = self.df.evaluate(cache=cache)
         if self.subset is None:
@@ -667,7 +669,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
         return result.slice(self.zlice)
 
 
-@dataclass(slots=True)
+@dataclasses.dataclass(slots=True)
 class Sort(IR):
     """Sort a dataframe."""
 
@@ -686,12 +688,12 @@ class Sort(IR):
 
     def __init__(
         self,
-        schema: dict,
+        schema: Schema,
         df: IR,
         by: list[expr.NamedExpr],
         options: Any,
         zlice: tuple[int, int] | None,
-    ):
+    ) -> None:
         self.schema = schema
         self.df = df
         self.by = by
@@ -704,7 +706,7 @@ def __init__(
             plc.sorting.stable_sort_by_key if stable else plc.sorting.sort_by_key
         )
 
-    def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
+    def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
         df = self.df.evaluate(cache=cache)
         sort_keys = broadcast(
@@ -736,7 +738,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
         return DataFrame(columns).slice(self.zlice)
 
 
-@dataclass(slots=True)
+@dataclasses.dataclass(slots=True)
 class Slice(IR):
     """Slice a dataframe."""
 
@@ -747,13 +749,13 @@ class Slice(IR):
     length: int
     """Length of the slice."""
 
-    def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
+    def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
         df = self.df.evaluate(cache=cache)
         return df.slice((self.offset, self.length))
 
 
-@dataclass(slots=True)
+@dataclasses.dataclass(slots=True)
 class Filter(IR):
     """Filter a dataframe with a boolean mask."""
 
@@ -762,21 +764,21 @@ class Filter(IR):
     mask: expr.NamedExpr
     """Expression evaluating to a mask."""
 
-    def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
+    def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
         df = self.df.evaluate(cache=cache)
         (mask,) = broadcast(self.mask.evaluate(df), target_length=df.num_rows)
         return df.filter(mask)
 
 
-@dataclass(slots=True)
+@dataclasses.dataclass(slots=True)
 class Projection(IR):
     """Select a subset of columns from a dataframe."""
 
     df: IR
     """Input."""
 
-    def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
+    def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
         df = self.df.evaluate(cache=cache)
         # This can reorder things.
@@ -786,7 +788,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
         return DataFrame(columns)
 
 
-@dataclass(slots=True)
+@dataclasses.dataclass(slots=True)
 class MapFunction(IR):
     """Apply some function to a dataframe."""
 
@@ -807,7 +809,7 @@ class MapFunction(IR):
         ]
     )
 
-    def __post_init__(self):
+    def __post_init__(self) -> None:
         """Validate preconditions."""
         if self.name not in MapFunction._NAMES:
             raise NotImplementedError(f"Unhandled map function {self.name}")
@@ -824,7 +826,7 @@ def __post_init__(self):
             if key_column not in self.df.dfs[0].schema:
                 raise ValueError(f"Key column {key_column} not found")
 
-    def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
+    def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
         if self.name == "merge_sorted":
             # merge_sorted operates on Union inputs
@@ -876,7 +878,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
             raise AssertionError("Should never be reached")
 
 
-@dataclass(slots=True)
+@dataclasses.dataclass(slots=True)
 class Union(IR):
     """Concatenate dataframes vertically."""
 
@@ -885,13 +887,13 @@ class Union(IR):
     zlice: tuple[int, int] | None
     """Optional slice to apply after concatenation."""
 
-    def __post_init__(self):
+    def __post_init__(self) -> None:
         """Validated preconditions."""
         schema = self.dfs[0].schema
         if not all(s.schema == schema for s in self.dfs[1:]):
             raise ValueError("Schema mismatch")
 
-    def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
+    def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
         # TODO: only evaluate what we need if we have a slice
         dfs = [df.evaluate(cache=cache) for df in self.dfs]
@@ -900,14 +902,14 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
         ).slice(self.zlice)
 
 
-@dataclass(slots=True)
+@dataclasses.dataclass(slots=True)
 class HConcat(IR):
     """Concatenate dataframes horizontally."""
 
     dfs: list[IR]
     """List of inputs."""
 
-    def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
+    def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
         dfs = [df.evaluate(cache=cache) for df in self.dfs]
         return DataFrame(
@@ -915,7 +917,7 @@ def evaluate(self, *, cache: dict[int, DataFrame]) -> DataFrame:
         )
 
 
-@dataclass(slots=True)
+@dataclasses.dataclass(slots=True)
 class ExtContext(IR):
     """
     Concatenate dataframes horizontally.
@@ -928,7 +930,7 @@ class ExtContext(IR):
     extra: list[IR]
     """List of extra inputs."""
 
-    def __post_init__(self):
+    def __post_init__(self) -> NoReturn:
         """Validate preconditions."""
         raise NotImplementedError(
             "ExtContext will be deprecated, use horizontal concat instead."
diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py
index 641176daff4..38107023365 100644
--- a/python/cudf_polars/cudf_polars/dsl/translate.py
+++ b/python/cudf_polars/cudf_polars/dsl/translate.py
@@ -16,12 +16,13 @@
 import cudf._lib.pylibcudf as plc
 
 from cudf_polars.dsl import expr, ir
+from cudf_polars.typing import NodeTraverser
 from cudf_polars.utils import dtypes
 
 __all__ = ["translate_ir", "translate_named_expr"]
 
 
-class set_node(AbstractContextManager):
+class set_node(AbstractContextManager[None]):
     """
     Run a block with current node set in the visitor.
 
@@ -39,30 +40,36 @@ class set_node(AbstractContextManager):
     """
 
     __slots__ = ("n", "visitor")
+    visitor: NodeTraverser
+    n: int
 
-    def __init__(self, visitor, n: int):
+    def __init__(self, visitor: NodeTraverser, n: int) -> None:
         self.visitor = visitor
         self.n = n
 
-    def __enter__(self):
+    def __enter__(self) -> None:
         n = self.visitor.get_node()
         self.visitor.set_node(self.n)
         self.n = n
 
-    def __exit__(self, *args):
+    def __exit__(self, *args: Any) -> None:
         self.visitor.set_node(self.n)
 
 
-noop_context: nullcontext = nullcontext()
+noop_context: nullcontext[None] = nullcontext()
 
 
 @singledispatch
-def _translate_ir(node: Any, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR:
+def _translate_ir(
+    node: Any, visitor: NodeTraverser, schema: dict[str, plc.DataType]
+) -> ir.IR:
     raise NotImplementedError(f"Translation for {type(node).__name__}")
 
 
 @_translate_ir.register
-def _(node: pl_ir.PythonScan, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR:
+def _(
+    node: pl_ir.PythonScan, visitor: NodeTraverser, schema: dict[str, plc.DataType]
+) -> ir.IR:
     return ir.PythonScan(
         schema,
         node.options,
@@ -73,7 +80,9 @@ def _(node: pl_ir.PythonScan, visitor: Any, schema: dict[str, plc.DataType]) ->
 
 
 @_translate_ir.register
-def _(node: pl_ir.Scan, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR:
+def _(
+    node: pl_ir.Scan, visitor: NodeTraverser, schema: dict[str, plc.DataType]
+) -> ir.IR:
     return ir.Scan(
         schema,
         node.scan_type,
@@ -86,13 +95,15 @@ def _(node: pl_ir.Scan, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR:
 
 
 @_translate_ir.register
-def _(node: pl_ir.Cache, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR:
+def _(
+    node: pl_ir.Cache, visitor: NodeTraverser, schema: dict[str, plc.DataType]
+) -> ir.IR:
     return ir.Cache(schema, node.id_, translate_ir(visitor, n=node.input))
 
 
 @_translate_ir.register
 def _(
-    node: pl_ir.DataFrameScan, visitor: Any, schema: dict[str, plc.DataType]
+    node: pl_ir.DataFrameScan, visitor: NodeTraverser, schema: dict[str, plc.DataType]
 ) -> ir.IR:
     return ir.DataFrameScan(
         schema,
@@ -105,7 +116,9 @@ def _(
 
 
 @_translate_ir.register
-def _(node: pl_ir.Select, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR:
+def _(
+    node: pl_ir.Select, visitor: NodeTraverser, schema: dict[str, plc.DataType]
+) -> ir.IR:
     with set_node(visitor, node.input):
         inp = translate_ir(visitor, n=None)
         exprs = [translate_named_expr(visitor, n=e) for e in node.expr]
@@ -113,7 +126,9 @@ def _(node: pl_ir.Select, visitor: Any, schema: dict[str, plc.DataType]) -> ir.I
 
 
 @_translate_ir.register
-def _(node: pl_ir.GroupBy, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR:
+def _(
+    node: pl_ir.GroupBy, visitor: NodeTraverser, schema: dict[str, plc.DataType]
+) -> ir.IR:
     with set_node(visitor, node.input):
         inp = translate_ir(visitor, n=None)
         aggs = [translate_named_expr(visitor, n=e) for e in node.aggs]
@@ -129,7 +144,9 @@ def _(node: pl_ir.GroupBy, visitor: Any, schema: dict[str, plc.DataType]) -> ir.
 
 
 @_translate_ir.register
-def _(node: pl_ir.Join, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR:
+def _(
+    node: pl_ir.Join, visitor: NodeTraverser, schema: dict[str, plc.DataType]
+) -> ir.IR:
     # Join key dtypes are dependent on the schema of the left and
     # right inputs, so these must be translated with the relevant
     # input active.
@@ -143,7 +160,9 @@ def _(node: pl_ir.Join, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR:
 
 
 @_translate_ir.register
-def _(node: pl_ir.HStack, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR:
+def _(
+    node: pl_ir.HStack, visitor: NodeTraverser, schema: dict[str, plc.DataType]
+) -> ir.IR:
     with set_node(visitor, node.input):
         inp = translate_ir(visitor, n=None)
         exprs = [translate_named_expr(visitor, n=e) for e in node.exprs]
@@ -151,7 +170,9 @@ def _(node: pl_ir.HStack, visitor: Any, schema: dict[str, plc.DataType]) -> ir.I
 
 
 @_translate_ir.register
-def _(node: pl_ir.Reduce, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR:
+def _(
+    node: pl_ir.Reduce, visitor: NodeTraverser, schema: dict[str, plc.DataType]
+) -> ir.IR:
     with set_node(visitor, node.input):
         inp = translate_ir(visitor, n=None)
         exprs = [translate_named_expr(visitor, n=e) for e in node.expr]
@@ -159,7 +180,9 @@ def _(node: pl_ir.Reduce, visitor: Any, schema: dict[str, plc.DataType]) -> ir.I
 
 
 @_translate_ir.register
-def _(node: pl_ir.Distinct, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR:
+def _(
+    node: pl_ir.Distinct, visitor: NodeTraverser, schema: dict[str, plc.DataType]
+) -> ir.IR:
     return ir.Distinct(
         schema,
         translate_ir(visitor, n=node.input),
@@ -168,7 +191,9 @@ def _(node: pl_ir.Distinct, visitor: Any, schema: dict[str, plc.DataType]) -> ir
 
 
 @_translate_ir.register
-def _(node: pl_ir.Sort, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR:
+def _(
+    node: pl_ir.Sort, visitor: NodeTraverser, schema: dict[str, plc.DataType]
+) -> ir.IR:
     with set_node(visitor, node.input):
         inp = translate_ir(visitor, n=None)
         by = [translate_named_expr(visitor, n=e) for e in node.by_column]
@@ -176,12 +201,16 @@ def _(node: pl_ir.Sort, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR:
 
 
 @_translate_ir.register
-def _(node: pl_ir.Slice, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR:
+def _(
+    node: pl_ir.Slice, visitor: NodeTraverser, schema: dict[str, plc.DataType]
+) -> ir.IR:
     return ir.Slice(schema, translate_ir(visitor, n=node.input), node.offset, node.len)
 
 
 @_translate_ir.register
-def _(node: pl_ir.Filter, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR:
+def _(
+    node: pl_ir.Filter, visitor: NodeTraverser, schema: dict[str, plc.DataType]
+) -> ir.IR:
     with set_node(visitor, node.input):
         inp = translate_ir(visitor, n=None)
         mask = translate_named_expr(visitor, n=node.predicate)
@@ -190,13 +219,17 @@ def _(node: pl_ir.Filter, visitor: Any, schema: dict[str, plc.DataType]) -> ir.I
 
 @_translate_ir.register
 def _(
-    node: pl_ir.SimpleProjection, visitor: Any, schema: dict[str, plc.DataType]
+    node: pl_ir.SimpleProjection,
+    visitor: NodeTraverser,
+    schema: dict[str, plc.DataType],
 ) -> ir.IR:
     return ir.Projection(schema, translate_ir(visitor, n=node.input))
 
 
 @_translate_ir.register
-def _(node: pl_ir.MapFunction, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR:
+def _(
+    node: pl_ir.MapFunction, visitor: NodeTraverser, schema: dict[str, plc.DataType]
+) -> ir.IR:
     name, *options = node.function
     return ir.MapFunction(
         schema,
@@ -208,19 +241,25 @@ def _(node: pl_ir.MapFunction, visitor: Any, schema: dict[str, plc.DataType]) ->
 
 
 @_translate_ir.register
-def _(node: pl_ir.Union, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR:
+def _(
+    node: pl_ir.Union, visitor: NodeTraverser, schema: dict[str, plc.DataType]
+) -> ir.IR:
     return ir.Union(
         schema, [translate_ir(visitor, n=n) for n in node.inputs], node.options
     )
 
 
 @_translate_ir.register
-def _(node: pl_ir.HConcat, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR:
+def _(
+    node: pl_ir.HConcat, visitor: NodeTraverser, schema: dict[str, plc.DataType]
+) -> ir.IR:
     return ir.HConcat(schema, [translate_ir(visitor, n=n) for n in node.inputs])
 
 
 @_translate_ir.register
-def _(node: pl_ir.ExtContext, visitor: Any, schema: dict[str, plc.DataType]) -> ir.IR:
+def _(
+    node: pl_ir.ExtContext, visitor: NodeTraverser, schema: dict[str, plc.DataType]
+) -> ir.IR:
     return ir.ExtContext(
         schema,
         translate_ir(visitor, n=node.input),
@@ -228,7 +267,7 @@ def _(node: pl_ir.ExtContext, visitor: Any, schema: dict[str, plc.DataType]) ->
     )
 
 
-def translate_ir(visitor: Any, *, n: int | None = None) -> ir.IR:
+def translate_ir(visitor: NodeTraverser, *, n: int | None = None) -> ir.IR:
     """
     Translate a polars-internal IR node to our representation.
 
@@ -249,7 +288,7 @@ def translate_ir(visitor: Any, *, n: int | None = None) -> ir.IR:
     NotImplementedError
         If we can't translate the nodes due to unsupported functionality.
     """
-    ctx: AbstractContextManager = (
+    ctx: AbstractContextManager[None] = (
         set_node(visitor, n) if n is not None else noop_context
     )
     with ctx:
@@ -258,7 +297,9 @@ def translate_ir(visitor: Any, *, n: int | None = None) -> ir.IR:
         return _translate_ir(node, visitor, schema)
 
 
-def translate_named_expr(visitor: Any, *, n: pl_expr.PyExprIR) -> expr.NamedExpr:
+def translate_named_expr(
+    visitor: NodeTraverser, *, n: pl_expr.PyExprIR
+) -> expr.NamedExpr:
     """
     Translate a polars-internal named expression IR object into our representation.
 
@@ -289,12 +330,14 @@ def translate_named_expr(visitor: Any, *, n: pl_expr.PyExprIR) -> expr.NamedExpr
 
 
 @singledispatch
-def _translate_expr(node: Any, visitor: Any, dtype: plc.DataType) -> expr.Expr:
+def _translate_expr(
+    node: Any, visitor: NodeTraverser, dtype: plc.DataType
+) -> expr.Expr:
     raise NotImplementedError(f"Translation for {type(node).__name__}")
 
 
 @_translate_expr.register
-def _(node: pl_expr.Function, visitor: Any, dtype: plc.DataType) -> expr.Expr:
+def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
     name, *options = node.function_data
     options = tuple(options)
     if isinstance(name, pl_expr.StringFunction):
@@ -316,7 +359,7 @@ def _(node: pl_expr.Function, visitor: Any, dtype: plc.DataType) -> expr.Expr:
 
 
 @_translate_expr.register
-def _(node: pl_expr.Window, visitor: Any, dtype: plc.DataType) -> expr.Expr:
+def _(node: pl_expr.Window, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
     # TODO: raise in groupby?
     if node.partition_by is None:
         return expr.RollingWindow(
@@ -332,19 +375,19 @@ def _(node: pl_expr.Window, visitor: Any, dtype: plc.DataType) -> expr.Expr:
 
 
 @_translate_expr.register
-def _(node: pl_expr.Literal, visitor: Any, dtype: plc.DataType) -> expr.Expr:
+def _(node: pl_expr.Literal, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
     value = pa.scalar(node.value, type=plc.interop.to_arrow(dtype))
     return expr.Literal(dtype, value)
 
 
 @_translate_expr.register
-def _(node: pl_expr.Sort, visitor: Any, dtype: plc.DataType) -> expr.Expr:
+def _(node: pl_expr.Sort, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
     # TODO: raise in groupby
     return expr.Sort(dtype, node.options, translate_expr(visitor, n=node.expr))
 
 
 @_translate_expr.register
-def _(node: pl_expr.SortBy, visitor: Any, dtype: plc.DataType) -> expr.Expr:
+def _(node: pl_expr.SortBy, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
     return expr.SortBy(
         dtype,
         node.sort_options,
@@ -354,7 +397,7 @@ def _(node: pl_expr.SortBy, visitor: Any, dtype: plc.DataType) -> expr.Expr:
 
 
 @_translate_expr.register
-def _(node: pl_expr.Gather, visitor: Any, dtype: plc.DataType) -> expr.Expr:
+def _(node: pl_expr.Gather, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
     return expr.Gather(
         dtype,
         translate_expr(visitor, n=node.expr),
@@ -363,7 +406,7 @@ def _(node: pl_expr.Gather, visitor: Any, dtype: plc.DataType) -> expr.Expr:
 
 
 @_translate_expr.register
-def _(node: pl_expr.Filter, visitor: Any, dtype: plc.DataType) -> expr.Expr:
+def _(node: pl_expr.Filter, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
     return expr.Filter(
         dtype,
         translate_expr(visitor, n=node.input),
@@ -372,7 +415,7 @@ def _(node: pl_expr.Filter, visitor: Any, dtype: plc.DataType) -> expr.Expr:
 
 
 @_translate_expr.register
-def _(node: pl_expr.Cast, visitor: Any, dtype: plc.DataType) -> expr.Expr:
+def _(node: pl_expr.Cast, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
     inner = translate_expr(visitor, n=node.expr)
     # Push casts into literals so we can handle Cast(Literal(Null))
     if isinstance(inner, expr.Literal):
@@ -382,12 +425,12 @@ def _(node: pl_expr.Cast, visitor: Any, dtype: plc.DataType) -> expr.Expr:
 
 
 @_translate_expr.register
-def _(node: pl_expr.Column, visitor: Any, dtype: plc.DataType) -> expr.Expr:
+def _(node: pl_expr.Column, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
     return expr.Col(dtype, node.name)
 
 
 @_translate_expr.register
-def _(node: pl_expr.Agg, visitor: Any, dtype: plc.DataType) -> expr.Expr:
+def _(node: pl_expr.Agg, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
     return expr.Agg(
         dtype,
         node.name,
@@ -397,7 +440,9 @@ def _(node: pl_expr.Agg, visitor: Any, dtype: plc.DataType) -> expr.Expr:
 
 
 @_translate_expr.register
-def _(node: pl_expr.BinaryExpr, visitor: Any, dtype: plc.DataType) -> expr.Expr:
+def _(
+    node: pl_expr.BinaryExpr, visitor: NodeTraverser, dtype: plc.DataType
+) -> expr.Expr:
     return expr.BinOp(
         dtype,
         expr.BinOp._MAPPING[node.op],
@@ -407,11 +452,11 @@ def _(node: pl_expr.BinaryExpr, visitor: Any, dtype: plc.DataType) -> expr.Expr:
 
 
 @_translate_expr.register
-def _(node: pl_expr.Len, visitor: Any, dtype: plc.DataType) -> expr.Expr:
+def _(node: pl_expr.Len, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
     return expr.Len(dtype)
 
 
-def translate_expr(visitor: Any, *, n: int) -> expr.Expr:
+def translate_expr(visitor: NodeTraverser, *, n: int) -> expr.Expr:
     """
     Translate a polars-internal expression IR into our representation.
 
diff --git a/python/cudf_polars/cudf_polars/py.typed b/python/cudf_polars/cudf_polars/py.typed
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/python/cudf_polars/cudf_polars/testing/asserts.py b/python/cudf_polars/cudf_polars/testing/asserts.py
index 2fbfa971fef..2f19b41cc3a 100644
--- a/python/cudf_polars/cudf_polars/testing/asserts.py
+++ b/python/cudf_polars/cudf_polars/testing/asserts.py
@@ -28,7 +28,7 @@ def assert_gpu_result_equal(
     rtol: float = 1e-05,
     atol: float = 1e-08,
     categorical_as_str: bool = False,
-):
+) -> None:
     """
     Assert that collection of a lazyframe on GPU produces correct results.
 
diff --git a/python/cudf_polars/cudf_polars/typing/__init__.py b/python/cudf_polars/cudf_polars/typing/__init__.py
new file mode 100644
index 00000000000..287c977f4eb
--- /dev/null
+++ b/python/cudf_polars/cudf_polars/typing/__init__.py
@@ -0,0 +1,91 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Typing utilities for cudf_polars."""
+
+from __future__ import annotations
+
+from collections.abc import Mapping
+from typing import TYPE_CHECKING, Protocol, TypeAlias
+
+from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir
+
+import cudf._lib.pylibcudf as plc
+
+if TYPE_CHECKING:
+    from typing import Callable
+
+    import polars as pl
+
+IR: TypeAlias = (
+    pl_ir.PythonScan
+    | pl_ir.Scan
+    | pl_ir.Cache
+    | pl_ir.DataFrameScan
+    | pl_ir.Select
+    | pl_ir.GroupBy
+    | pl_ir.Join
+    | pl_ir.HStack
+    | pl_ir.Distinct
+    | pl_ir.Sort
+    | pl_ir.Slice
+    | pl_ir.Filter
+    | pl_ir.SimpleProjection
+    | pl_ir.MapFunction
+    | pl_ir.Union
+    | pl_ir.HConcat
+    | pl_ir.ExtContext
+)
+
+Expr: TypeAlias = (
+    pl_expr.Function
+    | pl_expr.Window
+    | pl_expr.Literal
+    | pl_expr.Sort
+    | pl_expr.SortBy
+    | pl_expr.Gather
+    | pl_expr.Filter
+    | pl_expr.Cast
+    | pl_expr.Column
+    | pl_expr.Agg
+    | pl_expr.BinaryExpr
+    | pl_expr.Len
+    | pl_expr.PyExprIR
+)
+
+Schema: TypeAlias = Mapping[str, plc.DataType]
+
+
+class NodeTraverser(Protocol):
+    """Abstract protocol for polars NodeTraverser."""
+
+    def get_node(self) -> int:
+        """Return current plan node id."""
+        ...
+
+    def set_node(self, n: int) -> None:
+        """Set the current plan node to n."""
+        ...
+
+    def view_current_node(self) -> IR:
+        """Convert current plan node to python rep."""
+        ...
+
+    def get_schema(self) -> Mapping[str, pl.DataType]:
+        """Get the schema of the current plan node."""
+        ...
+
+    def get_dtype(self, n: int) -> pl.DataType:
+        """Get the datatype of the given expression id."""
+        ...
+
+    def view_expression(self, n: int) -> Expr:
+        """Convert the given expression to python rep."""
+        ...
+
+    def set_udf(
+        self,
+        callback: Callable[[list[str] | None, str | None, int | None], pl.DataFrame],
+    ) -> None:
+        """Set the callback replacing the current node in the plan."""
+        ...
diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml
index e50ee76a9b9..2faf8c3193f 100644
--- a/python/cudf_polars/pyproject.toml
+++ b/python/cudf_polars/pyproject.toml
@@ -62,8 +62,6 @@ target-version = "py39"
 fix = true
 
 [tool.ruff.lint]
-# __init__.py must re-export everything it imports
-ignore-init-module-imports = false
 select = [
   "E", # pycodestyle
   "W", # pycodestyle

From 5f45803b2a68b49d330d94e2f701791a7590612a Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Thu, 6 Jun 2024 13:00:12 -0700
Subject: [PATCH 313/842] Migrate quantile.pxd to pylibcudf (#15874)

xref #15162

Migrate quantile.pxd to use pylibcudf APIs.

Authors:
  - Thomas Li (https://github.com/lithomas1)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15874
---
 cpp/src/quantiles/quantiles.cu                |   4 +-
 cpp/tests/quantiles/quantiles_test.cpp        |   9 +-
 .../user_guide/api_docs/pylibcudf/index.rst   |   1 +
 .../api_docs/pylibcudf/quantiles.rst          |   6 +
 .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt   |   1 +
 python/cudf/cudf/_lib/pylibcudf/__init__.pxd  |   2 +
 python/cudf/cudf/_lib/pylibcudf/__init__.py   |   2 +
 python/cudf/cudf/_lib/pylibcudf/quantiles.pxd |  25 ++
 python/cudf/cudf/_lib/pylibcudf/quantiles.pyx | 152 ++++++++++++
 python/cudf/cudf/_lib/quantiles.pyx           | 102 ++------
 python/cudf/cudf/pylibcudf_tests/conftest.py  |  29 +++
 .../cudf/pylibcudf_tests/test_quantiles.py    | 234 ++++++++++++++++++
 12 files changed, 486 insertions(+), 81 deletions(-)
 create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/quantiles.rst
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/quantiles.pxd
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/quantiles.pyx
 create mode 100644 python/cudf/cudf/pylibcudf_tests/test_quantiles.py

diff --git a/cpp/src/quantiles/quantiles.cu b/cpp/src/quantiles/quantiles.cu
index c0f536536ce..af3bda2e62e 100644
--- a/cpp/src/quantiles/quantiles.cu
+++ b/cpp/src/quantiles/quantiles.cu
@@ -34,6 +34,7 @@
 #include <thrust/iterator/transform_iterator.h>
 
 #include <memory>
+#include <stdexcept>
 #include <vector>
 
 namespace cudf {
@@ -78,7 +79,8 @@ std::unique_ptr<table> quantiles(table_view const& input,
 
   CUDF_EXPECTS(interp == interpolation::HIGHER || interp == interpolation::LOWER ||
                  interp == interpolation::NEAREST,
-               "multi-column quantiles require a non-arithmetic interpolation strategy.");
+               "multi-column quantiles require a non-arithmetic interpolation strategy.",
+               std::invalid_argument);
 
   CUDF_EXPECTS(input.num_rows() > 0, "multi-column quantiles require at least one input row.");
 
diff --git a/cpp/tests/quantiles/quantiles_test.cpp b/cpp/tests/quantiles/quantiles_test.cpp
index 5b7b6dd2718..b7faa20e8c1 100644
--- a/cpp/tests/quantiles/quantiles_test.cpp
+++ b/cpp/tests/quantiles/quantiles_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -25,6 +25,8 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 
+#include <stdexcept>
+
 template <typename T>
 struct QuantilesTest : public cudf::test::BaseFixture {};
 
@@ -104,9 +106,10 @@ TYPED_TEST(QuantilesTest, TestMultiColumnArithmeticInterpolation)
   cudf::test::fixed_width_column_wrapper<T> input_b({});
   auto input = cudf::table_view({input_a});
 
-  EXPECT_THROW(cudf::quantiles(input, {0.0f}, cudf::interpolation::LINEAR), cudf::logic_error);
+  EXPECT_THROW(cudf::quantiles(input, {0.0f}, cudf::interpolation::LINEAR), std::invalid_argument);
 
-  EXPECT_THROW(cudf::quantiles(input, {0.0f}, cudf::interpolation::MIDPOINT), cudf::logic_error);
+  EXPECT_THROW(cudf::quantiles(input, {0.0f}, cudf::interpolation::MIDPOINT),
+               std::invalid_argument);
 }
 
 TYPED_TEST(QuantilesTest, TestMultiColumnUnsorted)
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
index 870ed8856d1..1e03fa80bb5 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
@@ -21,6 +21,7 @@ This page provides API documentation for pylibcudf.
     join
     lists
     merge
+    quantiles
     reduce
     reshape
     rolling
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/quantiles.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/quantiles.rst
new file mode 100644
index 00000000000..3417c1ff59d
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/quantiles.rst
@@ -0,0 +1,6 @@
+=========
+quantiles
+=========
+
+.. automodule:: cudf._lib.pylibcudf.quantiles
+   :members:
diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
index 6beb7b0f506..ed396208f98 100644
--- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
@@ -26,6 +26,7 @@ set(cython_sources
     join.pyx
     lists.pyx
     merge.pyx
+    quantiles.pyx
     reduce.pyx
     replace.pyx
     reshape.pyx
diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd
index b289d112a90..a628ecdb038 100644
--- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd
@@ -12,6 +12,7 @@ from . cimport (
     join,
     lists,
     merge,
+    quantiles,
     reduce,
     replace,
     reshape,
@@ -48,6 +49,7 @@ __all__ = [
     "join",
     "lists",
     "merge",
+    "quantiles",
     "reduce",
     "replace",
     "rolling",
diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py
index 2565332f3ed..46d0fe13cd1 100644
--- a/python/cudf/cudf/_lib/pylibcudf/__init__.py
+++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py
@@ -12,6 +12,7 @@
     join,
     lists,
     merge,
+    quantiles,
     reduce,
     replace,
     reshape,
@@ -48,6 +49,7 @@
     "join",
     "lists",
     "merge",
+    "quantiles",
     "reduce",
     "replace",
     "rolling",
diff --git a/python/cudf/cudf/_lib/pylibcudf/quantiles.pxd b/python/cudf/cudf/_lib/pylibcudf/quantiles.pxd
new file mode 100644
index 00000000000..70ff135ca77
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/quantiles.pxd
@@ -0,0 +1,25 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from libcpp.vector cimport vector
+
+from cudf._lib.pylibcudf.libcudf.types cimport interpolation, sorted
+
+from .column cimport Column
+from .table cimport Table
+
+
+cpdef Column quantile(
+    Column input,
+    vector[double] q,
+    interpolation interp = *,
+    Column ordered_indices = *,
+    bint exact = *
+)
+
+cpdef Table quantiles(
+    Table input,
+    vector[double] q,
+    interpolation interp = *,
+    sorted is_input_sorted = *,
+    list column_order = *,
+    list null_precedence = *,
+)
diff --git a/python/cudf/cudf/_lib/pylibcudf/quantiles.pyx b/python/cudf/cudf/_lib/pylibcudf/quantiles.pyx
new file mode 100644
index 00000000000..c1f0e30ccd3
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/quantiles.pyx
@@ -0,0 +1,152 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp cimport bool
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from libcpp.vector cimport vector
+
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from cudf._lib.pylibcudf.libcudf.quantiles cimport (
+    quantile as cpp_quantile,
+    quantiles as cpp_quantiles,
+)
+from cudf._lib.pylibcudf.libcudf.table.table cimport table
+from cudf._lib.pylibcudf.libcudf.types cimport null_order, order, sorted
+
+from .column cimport Column
+from .table cimport Table
+from .types cimport interpolation
+
+
+cpdef Column quantile(
+    Column input,
+    vector[double] q,
+    interpolation interp = interpolation.LINEAR,
+    Column ordered_indices = None,
+    bool exact=True
+):
+    """Computes quantiles with interpolation.
+
+    Computes the specified quantiles by interpolating values between which they lie,
+    using the interpolation strategy specified in interp.
+
+    Parameters
+    ----------
+    input: Column
+        The Column to calculate quantiles on.
+    q: array-like that implements buffer-protocol
+        The quantiles to calculate in range [0,1]
+    interp: Interpolation, default Interpolation.LINEAR
+        The strategy used to select between values adjacent to a specified quantile.
+    ordered_indices: Column, default empty column
+        The column containing the sorted order of input.
+
+        If empty, all input values are used in existing order.
+        Indices must be in range [0, input.size()), but are not required to be unique.
+        Values not indexed by this column will be ignored.
+    exact: bool, default True
+        Returns doubles if True. Otherwise, returns same type as input
+
+    For details, see :cpp:func:`quantile`.
+
+    Returns
+    -------
+    Column
+        A Column containing specified quantiles, with nulls for indeterminable values
+    """
+    cdef:
+        unique_ptr[column] c_result
+        column_view ordered_indices_view
+
+    if ordered_indices is None:
+        ordered_indices_view = column_view()
+    else:
+        ordered_indices_view = ordered_indices.view()
+
+    with nogil:
+        c_result = move(
+            cpp_quantile(
+                input.view(),
+                q,
+                interp,
+                ordered_indices_view,
+                exact,
+            )
+        )
+
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Table quantiles(
+    Table input,
+    vector[double] q,
+    interpolation interp = interpolation.NEAREST,
+    sorted is_input_sorted = sorted.NO,
+    list column_order = None,
+    list null_precedence = None,
+):
+    """Computes row quantiles with interpolation.
+
+    Computes the specified quantiles by retrieving the row corresponding to the
+    specified quantiles. In the event a quantile lies in between rows, the specified
+    interpolation strategy is used to pick between the rows.
+
+    Parameters
+    ----------
+    input: Table
+        The Table to calculate row quantiles on.
+    q: array-like
+        The quantiles to calculate in range [0,1]
+    interp: Interpolation, default Interpolation.NEAREST
+        The strategy used to select between values adjacent to a specified quantile.
+
+        Must be a non-arithmetic interpolation strategy
+        (i.e. one of
+        {`Interpolation.HIGHER`, `Interpolation.LOWER`, `Interpolation.NEAREST`})
+    is_input_sorted: Sorted, default Sorted.NO
+        Whether the input table has been pre-sorted or not.
+    column_order: list, default None
+        A list of `Order` enums,
+        indicating the desired sort order for each column.
+        By default, will sort all columns so that they are in ascending order.
+
+        Ignored if `is_input_sorted` is `Sorted.YES`
+    null_precedence: list, default None
+        A list of `NullOrder` enums,
+        indicating how nulls should be sorted.
+        By default, will sort all columns so that nulls appear before
+        all other elements.
+
+        Ignored if `is_input_sorted` is `Sorted.YES`
+
+    For details, see :cpp:func:`quantiles`.
+
+    Returns
+    -------
+    Column
+        A Column containing specified quantiles, with nulls for indeterminable values
+    """
+    cdef:
+        unique_ptr[table] c_result
+        vector[order] column_order_vec
+        vector[null_order] null_precedence_vec
+
+    if column_order is not None:
+        column_order_vec = column_order
+    if null_precedence is not None:
+        null_precedence_vec = null_precedence
+
+    with nogil:
+        c_result = move(
+            cpp_quantiles(
+                input.view(),
+                q,
+                interp,
+                is_input_sorted,
+                column_order_vec,
+                null_precedence_vec,
+            )
+        )
+
+    return Table.from_libcudf(move(c_result))
diff --git a/python/cudf/cudf/_lib/quantiles.pyx b/python/cudf/cudf/_lib/quantiles.pyx
index 3d20454a7ce..7b50c00919a 100644
--- a/python/cudf/cudf/_lib/quantiles.pyx
+++ b/python/cudf/cudf/_lib/quantiles.pyx
@@ -3,76 +3,43 @@
 from cudf.core.buffer import acquire_spill_lock
 
 from libcpp cimport bool
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
 from libcpp.vector cimport vector
 
 from cudf._lib.column cimport Column
 from cudf._lib.types cimport (
     underlying_type_t_interpolation,
-    underlying_type_t_null_order,
-    underlying_type_t_order,
     underlying_type_t_sorted,
 )
 
 from cudf._lib.types import Interpolation
 
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.quantiles cimport (
-    quantile as cpp_quantile,
-    quantiles as cpp_quantile_table,
-)
-from cudf._lib.pylibcudf.libcudf.table.table cimport table
-from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
-from cudf._lib.pylibcudf.libcudf.types cimport (
-    interpolation,
-    null_order,
-    order,
-    sorted,
-)
-from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns
+from cudf._lib.pylibcudf.libcudf.types cimport interpolation, sorted
+from cudf._lib.utils cimport columns_from_pylibcudf_table
+
+import cudf._lib.pylibcudf as plc
 
 
 @acquire_spill_lock()
 def quantile(
     Column input,
-    object q,
+    vector[double] q,
     str interp,
     Column ordered_indices,
     bool exact,
-
 ):
-    cdef column_view c_input = input.view()
-    cdef column_view c_ordered_indices = (
-        column_view() if ordered_indices is None
-        else ordered_indices.view()
-    )
     cdef interpolation c_interp = <interpolation>(
         <underlying_type_t_interpolation> Interpolation[interp.upper()]
     )
-    cdef bool c_exact = exact
-
-    cdef vector[double] c_q
-    c_q.reserve(len(q))
-
-    for value in q:
-        c_q.push_back(value)
 
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(
-            cpp_quantile(
-                c_input,
-                c_q,
-                c_interp,
-                c_ordered_indices,
-                c_exact,
-            )
+    return Column.from_pylibcudf(
+        plc.quantiles.quantile(
+            input.to_pylibcudf(mode="read"),
+            q,
+            c_interp,
+            ordered_indices.to_pylibcudf(mode="read"),
+            <bool>exact
         )
-
-    return Column.from_unique_ptr(move(c_result))
+    )
 
 
 def quantile_table(
@@ -83,42 +50,23 @@ def quantile_table(
     list column_order,
     list null_precedence,
 ):
-    cdef table_view c_input = table_view_from_columns(source_columns)
-    cdef vector[double] c_q = q
+
     cdef interpolation c_interp = <interpolation>(
         <underlying_type_t_interpolation> interp
     )
     cdef sorted c_is_input_sorted = <sorted>(
         <underlying_type_t_sorted> is_input_sorted
     )
-    cdef vector[order] c_column_order
-    cdef vector[null_order] c_null_precedence
-
-    c_column_order.reserve(len(column_order))
-    c_null_precedence.reserve(len(null_precedence))
-
-    for value in column_order:
-        c_column_order.push_back(
-            <order>(<underlying_type_t_order> value)
-        )
 
-    for value in null_precedence:
-        c_null_precedence.push_back(
-            <null_order>(<underlying_type_t_null_order> value)
+    return columns_from_pylibcudf_table(
+        plc.quantiles.quantiles(
+            plc.Table([
+                c.to_pylibcudf(mode="read") for c in source_columns
+            ]),
+            q,
+            c_interp,
+            c_is_input_sorted,
+            column_order,
+            null_precedence
         )
-
-    cdef unique_ptr[table] c_result
-
-    with nogil:
-        c_result = move(
-            cpp_quantile_table(
-                c_input,
-                c_q,
-                c_interp,
-                c_is_input_sorted,
-                c_column_order,
-                c_null_precedence,
-            )
-        )
-
-    return columns_from_unique_ptr(move(c_result))
+    )
diff --git a/python/cudf/cudf/pylibcudf_tests/conftest.py b/python/cudf/cudf/pylibcudf_tests/conftest.py
index 6d8284fb3db..f3c6584ef8c 100644
--- a/python/cudf/cudf/pylibcudf_tests/conftest.py
+++ b/python/cudf/cudf/pylibcudf_tests/conftest.py
@@ -7,6 +7,8 @@
 import pyarrow as pa
 import pytest
 
+import cudf._lib.pylibcudf as plc
+
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), "common"))
 
 from utils import DEFAULT_STRUCT_TESTING_TYPE
@@ -29,3 +31,30 @@
 )
 def pa_type(request):
     return request.param
+
+
+@pytest.fixture(
+    scope="session",
+    params=[
+        pa.int64(),
+        pa.float64(),
+        pa.uint64(),
+    ],
+)
+def numeric_pa_type(request):
+    return request.param
+
+
+@pytest.fixture(
+    scope="session", params=[opt for opt in plc.types.Interpolation]
+)
+def interp_opt(request):
+    return request.param
+
+
+@pytest.fixture(
+    scope="session",
+    params=[opt for opt in plc.types.Sorted],
+)
+def sorted_opt(request):
+    return request.param
diff --git a/python/cudf/cudf/pylibcudf_tests/test_quantiles.py b/python/cudf/cudf/pylibcudf_tests/test_quantiles.py
new file mode 100644
index 00000000000..a5d332a7795
--- /dev/null
+++ b/python/cudf/cudf/pylibcudf_tests/test_quantiles.py
@@ -0,0 +1,234 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import numpy as np
+import pyarrow as pa
+import pyarrow.compute as pc
+import pytest
+from utils import assert_column_eq, assert_table_eq
+
+import cudf._lib.pylibcudf as plc
+
+# Map pylibcudf interpolation options to pyarrow options
+interp_mapping = {
+    plc.types.Interpolation.LINEAR: "linear",
+    plc.types.Interpolation.LOWER: "lower",
+    plc.types.Interpolation.HIGHER: "higher",
+    plc.types.Interpolation.MIDPOINT: "midpoint",
+    plc.types.Interpolation.NEAREST: "nearest",
+}
+
+
+@pytest.fixture(scope="module", params=[[1, 2, 3, 4, 5], [5, 4, 3, 2, 1]])
+def pa_col_data(request, numeric_pa_type):
+    return pa.array(request.param, type=numeric_pa_type)
+
+
+@pytest.fixture(scope="module")
+def plc_col_data(pa_col_data):
+    return plc.interop.from_arrow(pa_col_data)
+
+
+@pytest.fixture(
+    scope="module",
+    params=[
+        {
+            "arrays": [[1, 2, 3, 5, 4], [5.0, 6.0, 8.0, 7.0, 9.0]],
+            "schema": pa.schema(
+                [
+                    ("a", pa.int64()),
+                    ("b", pa.int64()),
+                ]
+            ),
+        },
+        {
+            "arrays": [
+                [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
+                [1, 2.0, 2.2, 2.3, 2.4, None, None, 3.5, 4.5, 5.5],
+            ],
+            "schema": pa.schema(
+                [
+                    ("a", pa.int64()),
+                    ("b", pa.float64()),
+                ]
+            ),
+        },
+    ],
+)
+def plc_tbl_data(request):
+    return plc.interop.from_arrow(pa.Table.from_arrays(**request.param))
+
+
+@pytest.mark.parametrize("q", [[], [0], [0.5], [0.1, 0.5, 0.7, 0.9]])
+@pytest.mark.parametrize("exact", [True, False])
+def test_quantile(pa_col_data, plc_col_data, interp_opt, q, exact):
+    ordered_indices = plc.interop.from_arrow(
+        pc.cast(pc.sort_indices(pa_col_data), pa.int32())
+    )
+    res = plc.quantiles.quantile(
+        plc_col_data, q, interp_opt, ordered_indices, exact
+    )
+
+    pa_interp_opt = interp_mapping[interp_opt]
+
+    if exact:
+        pa_col_data = pc.cast(pa_col_data, pa.float64())
+
+    if len(q) > 0:
+        # pyarrow quantile doesn't support empty q
+        exp = pc.quantile(pa_col_data, q=q, interpolation=pa_interp_opt)
+    else:
+        exp = pa.array([], type=pa.float64())
+
+    if not exact:
+        exp = pc.cast(exp, pa_col_data.type, safe=False)
+
+    assert_column_eq(exp, res)
+
+
+def _pyarrow_quantiles(
+    pa_tbl_data,
+    q,
+    interp_opt=plc.types.Interpolation.NEAREST,
+    sorted_opt=plc.types.Sorted.NO,
+    column_order=None,
+    null_precedence=None,
+):
+    """
+    The pyarrow equivalent of plc.quantiles.quantiles
+
+    Takes the same arguments (except input should be a pyarrow table instead of
+    of a pylibcudf table)
+
+    NOTE: This function doesn't support having different null precedences because of
+    a lack of support in pyarrow.
+    """
+    if len(q) > 0:
+        # pyarrow quantile doesn't support empty q
+        pa_interp_opt = interp_mapping[interp_opt]
+
+        if sorted_opt == plc.types.Sorted.NO:
+            order_mapper = {
+                plc.types.Order.ASCENDING: "ascending",
+                plc.types.Order.DESCENDING: "descending",
+            }
+            if null_precedence is None:
+                null_precedence = [plc.types.NullOrder.BEFORE] * len(
+                    pa_tbl_data.columns
+                )
+            if column_order is None:
+                column_order = [plc.types.Order.ASCENDING] * len(
+                    pa_tbl_data.columns
+                )
+
+            if not all(
+                [
+                    null_prec == null_precedence[0]
+                    for null_prec in null_precedence
+                ]
+            ):
+                raise NotImplementedError(
+                    "Having varying null precendences is not implemented!"
+                )
+
+            pa_tbl_data = pa_tbl_data.sort_by(
+                [
+                    (name, order_mapper[order])
+                    for name, order in zip(
+                        pa_tbl_data.column_names, column_order
+                    )
+                ],
+                null_placement="at_start"
+                if null_precedence[0] == plc.types.NullOrder.BEFORE
+                else "at_end",
+            )
+        row_idxs = pc.quantile(
+            np.arange(0, len(pa_tbl_data)), q=q, interpolation=pa_interp_opt
+        )
+        exp = pa_tbl_data.take(row_idxs)
+    else:
+        exp = pa.Table.from_arrays(
+            [[] for _ in range(len(pa_tbl_data.schema))],
+            schema=pa_tbl_data.schema,
+        )
+    return exp
+
+
+@pytest.mark.parametrize(
+    "q", [[], [0.1], [0.2], [0.3], [0.4], [0.5], [0.1, 0.5, 0.7, 0.9]]
+)
+@pytest.mark.parametrize(
+    "column_order", [[plc.types.Order.ASCENDING, plc.types.Order.ASCENDING]]
+)
+@pytest.mark.parametrize(
+    "null_precedence",
+    [
+        [plc.types.NullOrder.BEFORE, plc.types.NullOrder.BEFORE],
+        [plc.types.NullOrder.AFTER, plc.types.NullOrder.AFTER],
+    ],
+)
+def test_quantiles(
+    plc_tbl_data, interp_opt, q, sorted_opt, column_order, null_precedence
+):
+    if interp_opt in {
+        plc.types.Interpolation.LINEAR,
+        plc.types.Interpolation.MIDPOINT,
+    }:
+        pytest.skip(
+            "interp cannot be an arithmetic interpolation strategy for quantiles"
+        )
+
+    pa_tbl_data = plc.interop.to_arrow(plc_tbl_data, ["a", "b"])
+
+    exp = _pyarrow_quantiles(
+        pa_tbl_data,
+        q=q,
+        interp_opt=interp_opt,
+        sorted_opt=sorted_opt,
+        column_order=column_order,
+        null_precedence=null_precedence,
+    )
+
+    res = plc.quantiles.quantiles(
+        plc_tbl_data, q, interp_opt, sorted_opt, column_order, null_precedence
+    )
+
+    assert_table_eq(exp, res)
+
+
+@pytest.mark.parametrize(
+    "invalid_interp",
+    [plc.types.Interpolation.LINEAR, plc.types.Interpolation.MIDPOINT],
+)
+def test_quantiles_invalid_interp(plc_tbl_data, invalid_interp):
+    with pytest.raises(ValueError):
+        plc.quantiles.quantiles(
+            plc_tbl_data, q=np.array([0.1]), interp=invalid_interp
+        )
+
+
+@pytest.mark.parametrize(
+    "q",
+    [[0.1], (0.1,), np.array([0.1])],
+)
+def test_quantile_q_array_like(pa_col_data, plc_col_data, q):
+    ordered_indices = plc.interop.from_arrow(
+        pc.cast(pc.sort_indices(pa_col_data), pa.int32())
+    )
+    res = plc.quantiles.quantile(
+        plc_col_data,
+        q=q,
+        ordered_indices=ordered_indices,
+    )
+    exp = pc.quantile(pa_col_data, q=q)
+    assert_column_eq(exp, res)
+
+
+@pytest.mark.parametrize(
+    "q",
+    [[0.1], (0.1,), np.array([0.1])],
+)
+def test_quantiles_q_array_like(plc_tbl_data, q):
+    res = plc.quantiles.quantiles(plc_tbl_data, q=q)
+    pa_tbl_data = plc.interop.to_arrow(plc_tbl_data, ["a", "b"])
+    exp = _pyarrow_quantiles(pa_tbl_data, q=q)
+    assert_table_eq(exp, res)

From d4dd474f0db6047b2404c2c98b86cf4446445e1b Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 6 Jun 2024 17:52:50 -0400
Subject: [PATCH 314/842] Use offsetalator in
 cudf::io::json::detail::parse_string (#15900)

Updates the `cudf::io::json::detail::parse_string` function to use the offsetalator for building a strings column instead of `size_type` pointers. The output row sizes are computed in the first pass through the kernels and then converted to offsets. The offsets are wrapped with an offsetalator on the 2nd pass to locate each individual rows' output position in the chars data.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - Karthikeyan (https://github.com/karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/15900
---
 cpp/src/io/utilities/data_casting.cu | 56 ++++++++++++++++------------
 cpp/tests/io/json_test.cpp           |  1 -
 2 files changed, 32 insertions(+), 25 deletions(-)

diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu
index 60cbfbc0dae..288a5690282 100644
--- a/cpp/src/io/utilities/data_casting.cu
+++ b/cpp/src/io/utilities/data_casting.cu
@@ -22,6 +22,7 @@
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/detail/offsets_iterator_factory.cuh>
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/strings/detail/strings_children.cuh>
@@ -417,6 +418,7 @@ struct bitfield_block {
  * @param null_mask Null mask
  * @param null_count_data pointer to store null count
  * @param options Settings for controlling string processing behavior
+ * @param d_sizes Output size of each row
  * @param d_offsets Offsets to identify where to store the results for each string
  * @param d_chars Character array to store the characters of strings
  */
@@ -427,7 +429,8 @@ CUDF_KERNEL void parse_fn_string_parallel(str_tuple_it str_tuples,
                                           bitmask_type* null_mask,
                                           size_type* null_count_data,
                                           cudf::io::parse_options_view const options,
-                                          size_type* d_offsets,
+                                          size_type* d_sizes,
+                                          cudf::detail::input_offsetalator d_offsets,
                                           char* d_chars)
 {
   constexpr auto BLOCK_SIZE =
@@ -455,7 +458,7 @@ CUDF_KERNEL void parse_fn_string_parallel(str_tuple_it str_tuples,
        istring           = get_next_string()) {
     // skip nulls
     if (null_mask != nullptr && not bit_is_set(null_mask, istring)) {
-      if (!d_chars && lane == 0) d_offsets[istring] = 0;
+      if (!d_chars && lane == 0) { d_sizes[istring] = 0; }
       continue;  // gride-stride return;
     }
 
@@ -476,7 +479,7 @@ CUDF_KERNEL void parse_fn_string_parallel(str_tuple_it str_tuples,
         if (lane == 0) {
           clear_bit(null_mask, istring);
           atomicAdd(null_count_data, 1);
-          if (!d_chars) d_offsets[istring] = 0;
+          if (!d_chars) { d_sizes[istring] = 0; }
         }
         continue;  // gride-stride return;
       }
@@ -491,7 +494,7 @@ CUDF_KERNEL void parse_fn_string_parallel(str_tuple_it str_tuples,
     // Copy literal/numeric value
     if (not is_string_value) {
       if (!d_chars) {
-        if (lane == 0) { d_offsets[istring] = in_end - in_begin; }
+        if (lane == 0) { d_sizes[istring] = in_end - in_begin; }
       } else {
         for (thread_index_type char_index = lane; char_index < (in_end - in_begin);
              char_index += BLOCK_SIZE) {
@@ -621,8 +624,8 @@ CUDF_KERNEL void parse_fn_string_parallel(str_tuple_it str_tuples,
             clear_bit(null_mask, istring);
             atomicAdd(null_count_data, 1);
           }
-          last_offset        = 0;
-          d_offsets[istring] = 0;
+          last_offset      = 0;
+          d_sizes[istring] = 0;
         }
         if constexpr (!is_warp) { __syncthreads(); }
         break;  // gride-stride return;
@@ -729,7 +732,7 @@ CUDF_KERNEL void parse_fn_string_parallel(str_tuple_it str_tuples,
         }
       }
     }  // char for-loop
-    if (!d_chars && lane == 0) { d_offsets[istring] = last_offset; }
+    if (!d_chars && lane == 0) { d_sizes[istring] = last_offset; }
   }  // grid-stride for-loop
 }
 
@@ -739,13 +742,14 @@ struct string_parse {
   bitmask_type* null_mask;
   size_type* null_count_data;
   cudf::io::parse_options_view const options;
-  size_type* d_offsets{};
+  size_type* d_sizes{};
+  cudf::detail::input_offsetalator d_offsets;
   char* d_chars{};
 
   __device__ void operator()(size_type idx)
   {
     if (null_mask != nullptr && not bit_is_set(null_mask, idx)) {
-      if (!d_chars) d_offsets[idx] = 0;
+      if (!d_chars) { d_sizes[idx] = 0; }
       return;
     }
     auto const in_begin     = str_tuples[idx].first;
@@ -761,7 +765,7 @@ struct string_parse {
       if (is_null_literal && null_mask != nullptr) {
         clear_bit(null_mask, idx);
         atomicAdd(null_count_data, 1);
-        if (!d_chars) d_offsets[idx] = 0;
+        if (!d_chars) { d_sizes[idx] = 0; }
         return;
       }
     }
@@ -773,9 +777,9 @@ struct string_parse {
         clear_bit(null_mask, idx);
         atomicAdd(null_count_data, 1);
       }
-      if (!d_chars) d_offsets[idx] = 0;
+      if (!d_chars) { d_sizes[idx] = 0; }
     } else {
-      if (!d_chars) d_offsets[idx] = str_process_info.bytes;
+      if (!d_chars) { d_sizes[idx] = str_process_info.bytes; }
     }
   }
 };
@@ -811,13 +815,12 @@ static std::unique_ptr<column> parse_string(string_view_pair_it str_tuples,
     size_type{0},
     thrust::maximum<size_type>{});
 
-  auto offsets = cudf::make_numeric_column(
-    data_type{type_to_id<size_type>()}, col_size + 1, cudf::mask_state::UNALLOCATED, stream, mr);
-  auto d_offsets       = offsets->mutable_view().data<size_type>();
+  auto sizes           = rmm::device_uvector<size_type>(col_size, stream);
+  auto d_sizes         = sizes.data();
   auto null_count_data = d_null_count.data();
 
   auto single_thread_fn = string_parse<decltype(str_tuples)>{
-    str_tuples, static_cast<bitmask_type*>(null_mask.data()), null_count_data, options, d_offsets};
+    str_tuples, static_cast<bitmask_type*>(null_mask.data()), null_count_data, options, d_sizes};
   thrust::for_each_n(rmm::exec_policy(stream),
                      thrust::make_counting_iterator<size_type>(0),
                      col_size,
@@ -838,7 +841,8 @@ static std::unique_ptr<column> parse_string(string_view_pair_it str_tuples,
         static_cast<bitmask_type*>(null_mask.data()),
         null_count_data,
         options,
-        d_offsets,
+        d_sizes,
+        cudf::detail::input_offsetalator{},
         nullptr);
   }
 
@@ -853,20 +857,22 @@ static std::unique_ptr<column> parse_string(string_view_pair_it str_tuples,
         static_cast<bitmask_type*>(null_mask.data()),
         null_count_data,
         options,
-        d_offsets,
+        d_sizes,
+        cudf::detail::input_offsetalator{},
         nullptr);
   }
-  auto const bytes =
-    cudf::detail::sizes_to_offsets(d_offsets, d_offsets + col_size + 1, d_offsets, stream);
-  CUDF_EXPECTS(bytes <= std::numeric_limits<size_type>::max(),
-               "Size of output exceeds the column size limit",
-               std::overflow_error);
+
+  auto [offsets, bytes] =
+    cudf::strings::detail::make_offsets_child_column(sizes.begin(), sizes.end(), stream, mr);
+  auto d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(offsets->view());
 
   // CHARS column
   rmm::device_uvector<char> chars(bytes, stream, mr);
   auto d_chars = chars.data();
 
-  single_thread_fn.d_chars = d_chars;
+  single_thread_fn.d_chars   = d_chars;
+  single_thread_fn.d_offsets = d_offsets;
+
   thrust::for_each_n(rmm::exec_policy(stream),
                      thrust::make_counting_iterator<size_type>(0),
                      col_size,
@@ -882,6 +888,7 @@ static std::unique_ptr<column> parse_string(string_view_pair_it str_tuples,
         static_cast<bitmask_type*>(null_mask.data()),
         null_count_data,
         options,
+        d_sizes,
         d_offsets,
         d_chars);
   }
@@ -897,6 +904,7 @@ static std::unique_ptr<column> parse_string(string_view_pair_it str_tuples,
         static_cast<bitmask_type*>(null_mask.data()),
         null_count_data,
         options,
+        d_sizes,
         d_offsets,
         d_chars);
   }
diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp
index 5d790e73246..57aa2721756 100644
--- a/cpp/tests/io/json_test.cpp
+++ b/cpp/tests/io/json_test.cpp
@@ -2374,7 +2374,6 @@ TEST_F(JsonReaderTest, MapTypes)
       EXPECT_EQ(col.type().id(), types[i]) << "column[" << i << "].type";
       i++;
     }
-    std::cout << "\n";
   };
 
   // json

From 582d237e1b07696de86a3f4df16dca2922dda5eb Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 6 Jun 2024 17:55:06 -0400
Subject: [PATCH 315/842] Fix offsetalator when accessing over 268 million rows
 (#15921)

Fixes an access error when the `offsetalator` wraps an INT64 offsets column with more than 268,435,455 rows.
The row access type is `size_type` and is used to calculate the appropriate position within the offsets buffer.
This fix promotes the multiplication to int64 to properly resolve the correct pointer position.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Yunsong Wang (https://github.com/PointKernel)

URL: https://github.com/rapidsai/cudf/pull/15921
---
 cpp/include/cudf/detail/offsets_iterator.cuh  |  6 +-
 cpp/tests/CMakeLists.txt                      |  1 +
 .../large_strings/large_strings_fixture.cpp   | 11 +++
 .../large_strings/large_strings_fixture.hpp   | 11 +++
 .../large_strings/many_strings_tests.cpp      | 67 +++++++++++++++++++
 5 files changed, 93 insertions(+), 3 deletions(-)
 create mode 100644 cpp/tests/large_strings/many_strings_tests.cpp

diff --git a/cpp/include/cudf/detail/offsets_iterator.cuh b/cpp/include/cudf/detail/offsets_iterator.cuh
index 15b334245ff..1ab1fd46230 100644
--- a/cpp/include/cudf/detail/offsets_iterator.cuh
+++ b/cpp/include/cudf/detail/offsets_iterator.cuh
@@ -53,7 +53,7 @@ struct input_offsetalator : base_normalator<input_offsetalator, int64_t> {
    */
   __device__ inline int64_t operator[](size_type idx) const
   {
-    void const* tp = p_ + (idx * this->width_);
+    void const* tp = p_ + (static_cast<int64_t>(idx) * this->width_);
     return this->width_ == sizeof(int32_t) ? static_cast<int64_t>(*static_cast<int32_t const*>(tp))
                                            : *static_cast<int64_t const*>(tp);
   }
@@ -79,7 +79,7 @@ struct input_offsetalator : base_normalator<input_offsetalator, int64_t> {
     cudf_assert((dtype.id() == type_id::INT32 || dtype.id() == type_id::INT64) &&
                 "Unexpected offsets type");
 #endif
-    p_ += (this->width_ * offset);
+    p_ += (this->width_ * static_cast<int64_t>(offset));
   }
 
  protected:
@@ -121,7 +121,7 @@ struct output_offsetalator : base_normalator<output_offsetalator, int64_t> {
   __device__ inline output_offsetalator const operator[](size_type idx) const
   {
     output_offsetalator tmp{*this};
-    tmp.p_ += (idx * this->width_);
+    tmp.p_ += (static_cast<int64_t>(idx) * this->width_);
     return tmp;
   }
 
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index a0d9083c4a4..826f879ddc0 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -570,6 +570,7 @@ ConfigureTest(
   large_strings/concatenate_tests.cpp
   large_strings/case_tests.cpp
   large_strings/large_strings_fixture.cpp
+  large_strings/many_strings_tests.cpp
   large_strings/merge_tests.cpp
   large_strings/parquet_tests.cpp
   large_strings/reshape_tests.cpp
diff --git a/cpp/tests/large_strings/large_strings_fixture.cpp b/cpp/tests/large_strings/large_strings_fixture.cpp
index 59e0cd43d05..416b106c5a5 100644
--- a/cpp/tests/large_strings/large_strings_fixture.cpp
+++ b/cpp/tests/large_strings/large_strings_fixture.cpp
@@ -95,6 +95,17 @@ cudf::column_view StringsLargeTest::long_column()
   return g_ls_data->get_column(name);
 }
 
+cudf::column_view StringsLargeTest::very_long_column()
+{
+  std::string name("long2");
+  if (!g_ls_data->has_key(name)) {
+    auto itr   = thrust::constant_iterator<std::string_view>("12345");
+    auto input = cudf::test::strings_column_wrapper(itr, itr + 30'000'000);
+    g_ls_data->add_column(name, input.release());
+  }
+  return g_ls_data->get_column(name);
+}
+
 std::unique_ptr<LargeStringsData> StringsLargeTest::get_ls_data()
 {
   CUDF_EXPECTS(g_ls_data == nullptr, "invalid call to get_ls_data");
diff --git a/cpp/tests/large_strings/large_strings_fixture.hpp b/cpp/tests/large_strings/large_strings_fixture.hpp
index 8827b65f1ce..fb7b1cd00b8 100644
--- a/cpp/tests/large_strings/large_strings_fixture.hpp
+++ b/cpp/tests/large_strings/large_strings_fixture.hpp
@@ -33,14 +33,25 @@ class LargeStringsData;
 struct StringsLargeTest : public cudf::test::BaseFixture {
   /**
    * @brief Returns a column of long strings
+   *
+   * This returns 8 rows of 400 bytes
    */
   cudf::column_view wide_column();
 
   /**
    * @brief Returns a long column of strings
+   *
+   * This returns 5 million rows of 50 bytes
    */
   cudf::column_view long_column();
 
+  /**
+   * @brief Returns a very long column of strings
+   *
+   * This returns 30 million rows of 5 bytes
+   */
+  cudf::column_view very_long_column();
+
   large_strings_enabler g_ls_enabler;
   static LargeStringsData* g_ls_data;
 
diff --git a/cpp/tests/large_strings/many_strings_tests.cpp b/cpp/tests/large_strings/many_strings_tests.cpp
new file mode 100644
index 00000000000..73fbb21d014
--- /dev/null
+++ b/cpp/tests/large_strings/many_strings_tests.cpp
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "large_strings_fixture.hpp"
+
+#include <cudf_test/column_utilities.hpp>
+
+#include <cudf/concatenate.hpp>
+#include <cudf/copying.hpp>
+#include <cudf/scalar/scalar.hpp>
+#include <cudf/strings/replace.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf/table/table_view.hpp>
+
+#include <limits>
+#include <vector>
+
+struct StringsManyTest : public cudf::test::StringsLargeTest {};
+
+TEST_F(StringsManyTest, Replace)
+{
+  auto const expected = this->very_long_column();
+  auto const view     = cudf::column_view(expected);
+  // force addressing (rows > max_size_type/sizeof(int64)) in a 64-bit offsets column
+  int constexpr max_size_type = std::numeric_limits<cudf::size_type>::max();
+  // minimum number of duplicates to achieve large strings (64-bit offsets)
+  int const min_size_multiplier =
+    (max_size_type / cudf::strings_column_view(view).chars_size(cudf::get_default_stream())) + 1;
+  // minimum row multiplier to create max_size_type/sizeof(int64) = 268,435,455 rows
+  int const min_row_multiplier = ((max_size_type / sizeof(int64_t)) / view.size()) + 1;
+  int const multiplier         = std::max(min_size_multiplier, min_row_multiplier);
+
+  std::vector<cudf::column_view> input_cols(multiplier, view);
+  std::vector<cudf::size_type> splits;
+  std::generate_n(std::back_inserter(splits), multiplier - 1, [view, n = 1]() mutable {
+    return view.size() * (n++);
+  });
+
+  auto large_input = cudf::concatenate(input_cols);  // 480 million rows
+  auto const sv    = cudf::strings_column_view(large_input->view());
+  EXPECT_EQ(sv.size(), view.size() * multiplier);
+  EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64});
+
+  // Using replace tests reading large strings as well as creating large strings
+  auto const target = cudf::string_scalar("3");  // fake the actual replace;
+  auto const repl   = cudf::string_scalar("3");  // logic still builds the output
+  auto result       = cudf::strings::replace(sv, target, repl);
+
+  // verify results in sections
+  auto sliced = cudf::split(result->view(), splits);
+  for (auto c : sliced) {
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, expected);
+  }
+}

From 451d12a2d8d69f63d2b9491286b8895ace6f87ba Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Thu, 6 Jun 2024 18:57:04 -0500
Subject: [PATCH 316/842] Allow anonymous user in devcontainer name. (#15784)

In https://github.com/rapidsai/cudf/pull/15572, we updated the devcontainer name to include the current user's name. However, in GitHub Codespaces, the username is not defined. As a result, the container name starts with a dash. This is not allowed by GitHub Codespaces, so it fails to launch.

This PR adds a default value of `anon` to the devcontainer username.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - James Lamb (https://github.com/jameslamb)
  - Paul Taylor (https://github.com/trxcllnt)

URL: https://github.com/rapidsai/cudf/pull/15784
---
 .devcontainer/cuda11.8-conda/devcontainer.json | 2 +-
 .devcontainer/cuda11.8-pip/devcontainer.json   | 2 +-
 .devcontainer/cuda12.2-conda/devcontainer.json | 2 +-
 .devcontainer/cuda12.2-pip/devcontainer.json   | 2 +-
 .github/CODEOWNERS                             | 2 +-
 5 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json
index c62e18512a0..8423fe21c29 100644
--- a/.devcontainer/cuda11.8-conda/devcontainer.json
+++ b/.devcontainer/cuda11.8-conda/devcontainer.json
@@ -11,7 +11,7 @@
   "runArgs": [
     "--rm",
     "--name",
-    "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.08-cuda11.8-conda"
+    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda11.8-conda"
   ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json
index 4ab4bd75643..4945d6cf753 100644
--- a/.devcontainer/cuda11.8-pip/devcontainer.json
+++ b/.devcontainer/cuda11.8-pip/devcontainer.json
@@ -11,7 +11,7 @@
   "runArgs": [
     "--rm",
     "--name",
-    "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.08-cuda11.8-pip"
+    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda11.8-pip"
   ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
diff --git a/.devcontainer/cuda12.2-conda/devcontainer.json b/.devcontainer/cuda12.2-conda/devcontainer.json
index 2b50454410f..05bf9173d25 100644
--- a/.devcontainer/cuda12.2-conda/devcontainer.json
+++ b/.devcontainer/cuda12.2-conda/devcontainer.json
@@ -11,7 +11,7 @@
   "runArgs": [
     "--rm",
     "--name",
-    "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.2-conda"
+    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.2-conda"
   ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
diff --git a/.devcontainer/cuda12.2-pip/devcontainer.json b/.devcontainer/cuda12.2-pip/devcontainer.json
index fc5abc56094..74420214726 100644
--- a/.devcontainer/cuda12.2-pip/devcontainer.json
+++ b/.devcontainer/cuda12.2-pip/devcontainer.json
@@ -11,7 +11,7 @@
   "runArgs": [
     "--rm",
     "--name",
-    "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.2-pip"
+    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.2-pip"
   ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index 9efac3f1904..5e2f46714d9 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -22,7 +22,7 @@ java/              @rapidsai/cudf-java-codeowners
 /.pre-commit-config.yaml @rapidsai/ci-codeowners
 
 #packaging code owners
-/.devcontainers/   @rapidsai/packaging-codeowners
+/.devcontainer/    @rapidsai/packaging-codeowners
 /conda/            @rapidsai/packaging-codeowners
 /dependencies.yaml @rapidsai/packaging-codeowners
 /build.sh          @rapidsai/packaging-codeowners

From 9bd16bb719e14ed1e0ee3edbd8c8417c03ac2f25 Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Thu, 6 Jun 2024 18:50:23 -0700
Subject: [PATCH 317/842] Reland "Fix docs for IO readers and strings_convert"
 (#15872)" (#15941)

This reverts commit 2b031e06a7fe18eec462db445eea1c596b93a9f1.

We got the go ahead to remove the text docs from @taureandyernv.

Authors:
  - Thomas Li (https://github.com/lithomas1)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15941
---
 ci/build_docs.sh                                           | 6 ------
 docs/cudf/source/libcudf_docs/api_docs/io_readers.rst      | 2 +-
 docs/cudf/source/libcudf_docs/api_docs/strings_convert.rst | 2 +-
 3 files changed, 2 insertions(+), 8 deletions(-)

diff --git a/ci/build_docs.sh b/ci/build_docs.sh
index db306046667..67a5415f353 100755
--- a/ci/build_docs.sh
+++ b/ci/build_docs.sh
@@ -46,9 +46,6 @@ pushd docs/cudf
 make dirhtml
 mkdir -p "${RAPIDS_DOCS_DIR}/cudf/html"
 mv build/dirhtml/* "${RAPIDS_DOCS_DIR}/cudf/html"
-make text
-mkdir -p "${RAPIDS_DOCS_DIR}/cudf/txt"
-mv build/text/* "${RAPIDS_DOCS_DIR}/cudf/txt"
 popd
 
 rapids-logger "Build dask-cuDF Sphinx docs"
@@ -56,9 +53,6 @@ pushd docs/dask_cudf
 make dirhtml
 mkdir -p "${RAPIDS_DOCS_DIR}/dask-cudf/html"
 mv build/dirhtml/* "${RAPIDS_DOCS_DIR}/dask-cudf/html"
-make text
-mkdir -p "${RAPIDS_DOCS_DIR}/dask-cudf/txt"
-mv build/text/* "${RAPIDS_DOCS_DIR}/dask-cudf/txt"
 popd
 
 rapids-upload-docs
diff --git a/docs/cudf/source/libcudf_docs/api_docs/io_readers.rst b/docs/cudf/source/libcudf_docs/api_docs/io_readers.rst
index a835673dee4..f94a5ddb403 100644
--- a/docs/cudf/source/libcudf_docs/api_docs/io_readers.rst
+++ b/docs/cudf/source/libcudf_docs/api_docs/io_readers.rst
@@ -2,4 +2,4 @@ Io Readers
 ==========
 
 .. doxygengroup:: io_readers
-   :desc-only:
+   :members:
diff --git a/docs/cudf/source/libcudf_docs/api_docs/strings_convert.rst b/docs/cudf/source/libcudf_docs/api_docs/strings_convert.rst
index ae5d78fb1a1..f2f320bd0e4 100644
--- a/docs/cudf/source/libcudf_docs/api_docs/strings_convert.rst
+++ b/docs/cudf/source/libcudf_docs/api_docs/strings_convert.rst
@@ -2,4 +2,4 @@ Strings Convert
 ===============
 
 .. doxygengroup:: strings_convert
-   :desc-only:
+   :members:

From d83d086afda1d25f5711a0aecf4ecfe6c05f7b9d Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 7 Jun 2024 07:30:32 -1000
Subject: [PATCH 318/842] Define Column.nan_as_null to return self (#15923)

While trying to clean all the `fillna` logic, I needed to have a `Column.nan_as_null` defined to make the `fillna` logic more re-useable.

This allows other `nan_as_null` usages in cudf to avoiding checking whether it's defined on the column or not.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15923
---
 python/cudf/cudf/core/_base_index.py          |  7 +----
 python/cudf/cudf/core/column/categorical.py   |  6 ++--
 python/cudf/cudf/core/column/column.py        | 14 +++++----
 python/cudf/cudf/core/column/numerical.py     |  6 ++--
 .../cudf/cudf/core/column/numerical_base.py   |  4 +--
 python/cudf/cudf/core/indexed_frame.py        | 29 ++++++-------------
 python/cudf/cudf/core/reshape.py              |  4 +--
 python/cudf/cudf/tests/test_replace.py        |  8 +++++
 python/cudf/cudf/tests/test_series.py         |  7 +++++
 9 files changed, 42 insertions(+), 43 deletions(-)

diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py
index baca7b19e58..5d0f7c4ede4 100644
--- a/python/cudf/cudf/core/_base_index.py
+++ b/python/cudf/cudf/core/_base_index.py
@@ -2072,12 +2072,7 @@ def dropna(self, how="any"):
             pass
         # This is to be consistent with IndexedFrame.dropna to handle nans
         # as nulls by default
-        data_columns = [
-            col.nans_to_nulls()
-            if isinstance(col, cudf.core.column.NumericalColumn)
-            else col
-            for col in self._columns
-        ]
+        data_columns = [col.nans_to_nulls() for col in self._columns]
 
         return self._from_columns_like_self(
             drop_nulls(
diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index 1828c5ce97b..de20b2ace1d 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -816,10 +816,8 @@ def to_pandas(
             .values_host
         )
 
-        cats = col.categories
-        if cats.dtype.kind in "biuf":
-            cats = cats.nans_to_nulls().dropna()  # type: ignore[attr-defined]
-        elif not isinstance(cats.dtype, IntervalDtype):
+        cats = col.categories.nans_to_nulls()
+        if not isinstance(cats.dtype, IntervalDtype):
             # leaving out dropna because it temporarily changes an interval
             # index into a struct and throws off results.
             # TODO: work on interval index dropna
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 68079371b85..475d52d0fbb 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -281,7 +281,7 @@ def any(self, skipna: bool = True) -> bool:
 
         return libcudf.reduce.reduce("any", self, dtype=np.bool_)
 
-    def dropna(self) -> ColumnBase:
+    def dropna(self) -> Self:
         return drop_nulls([self])[0]._with_type_metadata(self.dtype)
 
     def to_arrow(self) -> pa.Array:
@@ -695,7 +695,9 @@ def fillna(
         Returns a copy with null filled.
         """
         return libcudf.replace.replace_nulls(
-            input_col=self, replacement=fill_value, method=method
+            input_col=self.nans_to_nulls(),
+            replacement=fill_value,
+            method=method,
         )._with_type_metadata(self.dtype)
 
     def isnull(self) -> ColumnBase:
@@ -1240,6 +1242,10 @@ def unary_operator(self, unaryop: str):
             f"Operation {unaryop} not supported for dtype {self.dtype}."
         )
 
+    def nans_to_nulls(self: Self) -> Self:
+        """Convert NaN to NA."""
+        return self
+
     def normalize_binop_value(
         self, other: ScalarLike
     ) -> Union[ColumnBase, ScalarLike]:
@@ -1802,9 +1808,7 @@ def as_column(
 
         data = as_buffer(arbitrary, exposed=cudf.get_option("copy_on_write"))
         col = build_column(data, dtype=arbitrary.dtype, mask=mask)
-        if (
-            nan_as_null or (mask is None and nan_as_null is None)
-        ) and col.dtype.kind == "f":
+        if nan_as_null or (mask is None and nan_as_null is None):
             col = col.nans_to_nulls()
         if dtype is not None:
             col = col.astype(dtype)
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index fb413959eb9..6fb4f17b76d 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -536,7 +536,7 @@ def fillna(
             return col
 
         if method is not None:
-            return super(NumericalColumn, col).fillna(fill_value, method)
+            return super().fillna(fill_value, method)
 
         if fill_value is None:
             raise ValueError("Must specify either 'fill_value' or 'method'")
@@ -545,7 +545,7 @@ def fillna(
             isinstance(fill_value, cudf.Scalar)
             and fill_value.dtype == col.dtype
         ):
-            return super(NumericalColumn, col).fillna(fill_value, method)
+            return super().fillna(fill_value, method)
 
         if np.isscalar(fill_value):
             # cast safely to the same dtype as self
@@ -572,7 +572,7 @@ def fillna(
             else:
                 fill_value = fill_value.astype(col.dtype)
 
-        return super(NumericalColumn, col).fillna(fill_value, method)
+        return super().fillna(fill_value, method)
 
     def can_cast_safely(self, to_dtype: DtypeObj) -> bool:
         """
diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py
index 541c32a2520..d38ec9cf30f 100644
--- a/python/cudf/cudf/core/column/numerical_base.py
+++ b/python/cudf/cudf/core/column/numerical_base.py
@@ -49,7 +49,7 @@ def kurtosis(self, skipna: Optional[bool] = None) -> float:
         if len(self) == 0 or self._can_return_nan(skipna=skipna):
             return cudf.utils.dtypes._get_nan_for_dtype(self.dtype)
 
-        self = self.nans_to_nulls().dropna()  # type: ignore
+        self = self.nans_to_nulls().dropna()
 
         if len(self) < 4:
             return cudf.utils.dtypes._get_nan_for_dtype(self.dtype)
@@ -74,7 +74,7 @@ def skew(self, skipna: Optional[bool] = None) -> ScalarLike:
         if len(self) == 0 or self._can_return_nan(skipna=skipna):
             return cudf.utils.dtypes._get_nan_for_dtype(self.dtype)
 
-        self = self.nans_to_nulls().dropna()  # type: ignore
+        self = self.nans_to_nulls().dropna()
 
         if len(self) < 3:
             return cudf.utils.dtypes._get_nan_for_dtype(self.dtype)
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index ecfcec15337..d898eb4b9c3 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -420,10 +420,7 @@ def _scan(self, op, axis=None, skipna=True):
         results = {}
         for name, col in self._data.items():
             if skipna:
-                try:
-                    result_col = col.nans_to_nulls()
-                except AttributeError:
-                    result_col = col
+                result_col = col.nans_to_nulls()
             else:
                 if col.has_nulls(include_nan=True):
                     first_index = col.isnull().find_first_value(True)
@@ -1915,12 +1912,12 @@ def nans_to_nulls(self):
         1  <NA>  3.14
         2  <NA>  <NA>
         """
-        result = (
-            col.nans_to_nulls()
-            if isinstance(col, cudf.core.column.NumericalColumn)
-            else col.copy()
-            for col in self._data.columns
-        )
+        result = []
+        for col in self._data.columns:
+            converted = col.nans_to_nulls()
+            if converted is col:
+                converted = converted.copy()
+            result.append(converted)
         return self._from_data_like_self(
             self._data._from_columns_like_self(result)
         )
@@ -4228,10 +4225,7 @@ def _drop_na_columns(self, how="any", subset=None, thresh=None):
                 thresh = len(df)
 
         for name, col in df._data.items():
-            try:
-                check_col = col.nans_to_nulls()
-            except AttributeError:
-                check_col = col
+            check_col = col.nans_to_nulls()
             no_threshold_valid_count = (
                 len(col) - check_col.null_count
             ) < thresh
@@ -4261,12 +4255,7 @@ def _drop_na_rows(self, how="any", subset=None, thresh=None):
         if len(subset) == 0:
             return self.copy(deep=True)
 
-        data_columns = [
-            col.nans_to_nulls()
-            if isinstance(col, cudf.core.column.NumericalColumn)
-            else col
-            for col in self._columns
-        ]
+        data_columns = [col.nans_to_nulls() for col in self._columns]
 
         return self._from_columns_like_self(
             libcudf.stream_compaction.drop_nulls(
diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py
index d4772d5b4c2..53239cb7ea0 100644
--- a/python/cudf/cudf/core/reshape.py
+++ b/python/cudf/cudf/core/reshape.py
@@ -1210,9 +1210,7 @@ def _get_unique(column, dummy_na):
     else:
         unique = column.unique().sort_values()
     if not dummy_na:
-        if np.issubdtype(unique.dtype, np.floating):
-            unique = unique.nans_to_nulls()
-        unique = unique.dropna()
+        unique = unique.nans_to_nulls().dropna()
     return unique
 
 
diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py
index d77ec596271..9466398964a 100644
--- a/python/cudf/cudf/tests/test_replace.py
+++ b/python/cudf/cudf/tests/test_replace.py
@@ -6,6 +6,7 @@
 
 import numpy as np
 import pandas as pd
+import pyarrow as pa
 import pytest
 
 import cudf
@@ -1370,3 +1371,10 @@ def test_fillna_columns_multiindex():
     actual = gdf.fillna(10)
 
     assert_eq(expected, actual)
+
+
+def test_fillna_nan_and_null():
+    ser = cudf.Series(pa.array([float("nan"), None, 1.1]), nan_as_null=False)
+    result = ser.fillna(2.2)
+    expected = cudf.Series([2.2, 2.2, 1.1])
+    assert_eq(result, expected)
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index 323716d5fc3..f47c42d9a1d 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -2841,3 +2841,10 @@ def test_series_from_series_index_no_shallow_copy():
     ser1 = cudf.Series(range(3), index=list("abc"))
     ser2 = cudf.Series(ser1)
     assert ser1.index is ser2.index
+
+
+@pytest.mark.parametrize("value", [1, 1.1])
+def test_nans_to_nulls_noop_copies_column(value):
+    ser1 = cudf.Series([value])
+    ser2 = ser1.nans_to_nulls()
+    assert ser1._column is not ser2._column

From 39c5b86645dc61bf0c59d7bf733ca13872b46a44 Mon Sep 17 00:00:00 2001
From: Nghia Truong <7416935+ttnghia@users.noreply.github.com>
Date: Fri, 7 Jun 2024 10:53:53 -0700
Subject: [PATCH 319/842] Handling for `NaN` and `inf` when converting floating
 point to fixed point types (#15885)

This PR adds the ability to check for `NaN` and `inf` values when converting floating point types to fixed point types. For these input values, the corresponding output will be `null`.

Closes https://github.com/rapidsai/cudf/issues/15883.

Authors:
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - Paul Mattione (https://github.com/pmattione-nvidia)
  - Shruti Shivakumar (https://github.com/shrshi)

URL: https://github.com/rapidsai/cudf/pull/15885
---
 cpp/src/unary/cast_ops.cu      | 43 ++++++++++++++++++++++++++++++++--
 cpp/tests/unary/cast_tests.cpp | 21 +++++++++++++++++
 2 files changed, 62 insertions(+), 2 deletions(-)

diff --git a/cpp/src/unary/cast_ops.cu b/cpp/src/unary/cast_ops.cu
index 98c412f805d..64427326d87 100644
--- a/cpp/src/unary/cast_ops.cu
+++ b/cpp/src/unary/cast_ops.cu
@@ -15,11 +15,13 @@
  */
 
 #include <cudf/column/column.hpp>
+#include <cudf/column/column_device_view.cuh>
 #include <cudf/detail/binaryop.hpp>
 #include <cudf/detail/fill.hpp>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/unary.hpp>
+#include <cudf/detail/valid_if.cuh>
 #include <cudf/fixed_point/fixed_point.hpp>
 #include <cudf/null_mask.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
@@ -219,6 +221,28 @@ std::unique_ptr<column> rescale(column_view input,
   }
 };
 
+/**
+ * @brief Check if a floating point value is convertible to fixed point type.
+ *
+ * A floating point value is convertible if it is not null, not `NaN`, and not `inf`.
+ *
+ * Note that convertible input values may be out of the representable range of the target fixed
+ * point type. Values out of the representable range need to be checked separately.
+ */
+template <typename FloatType>
+struct is_convertible_floating_point {
+  column_device_view d_input;
+
+  bool __device__ operator()(size_type idx) const
+  {
+    static_assert(std::is_floating_point_v<FloatType>);
+
+    if (d_input.is_null(idx)) { return false; }
+    auto const value = d_input.element<FloatType>(idx);
+    return std::isfinite(value);
+  }
+};
+
 template <typename _SourceT>
 struct dispatch_unary_cast_to {
   column_view input;
@@ -294,8 +318,8 @@ struct dispatch_unary_cast_to {
       std::make_unique<column>(type,
                                size,
                                rmm::device_buffer{size * cudf::size_of(type), stream, mr},
-                               detail::copy_bitmask(input, stream, mr),
-                               input.null_count());
+                               rmm::device_buffer{},
+                               0);
 
     mutable_column_view output_mutable = *output;
 
@@ -308,6 +332,21 @@ struct dispatch_unary_cast_to {
                       output_mutable.begin<DeviceT>(),
                       fixed_point_unary_cast<SourceT, TargetT>{scale});
 
+    if constexpr (cudf::is_floating_point<SourceT>()) {
+      // For floating-point values, beside input nulls, we also need to set nulls for the output
+      // rows corresponding to NaN and inf in the input.
+      auto const d_input_ptr = column_device_view::create(input, stream);
+      auto [null_mask, null_count] =
+        cudf::detail::valid_if(thrust::make_counting_iterator(0),
+                               thrust::make_counting_iterator(size),
+                               is_convertible_floating_point<SourceT>{*d_input_ptr},
+                               stream,
+                               mr);
+      if (null_count > 0) { output->set_null_mask(std::move(null_mask), null_count); }
+    } else {
+      output->set_null_mask(detail::copy_bitmask(input, stream, mr), input.null_count());
+    }
+
     return output;
   }
 
diff --git a/cpp/tests/unary/cast_tests.cpp b/cpp/tests/unary/cast_tests.cpp
index a82449ffc10..ebeafc82039 100644
--- a/cpp/tests/unary/cast_tests.cpp
+++ b/cpp/tests/unary/cast_tests.cpp
@@ -665,6 +665,27 @@ TYPED_TEST(FixedPointTests, CastFromDouble)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
 }
 
+TYPED_TEST(FixedPointTests, CastFromDoubleWithNaNAndInf)
+{
+  using namespace numeric;
+  using decimalXX  = TypeParam;
+  using RepType    = cudf::device_storage_type_t<decimalXX>;
+  using fp_wrapper = cudf::test::fixed_point_column_wrapper<RepType>;
+  using fw_wrapper = cudf::test::fixed_width_column_wrapper<double>;
+
+  auto const NaN  = std::numeric_limits<double>::quiet_NaN();
+  auto const inf  = std::numeric_limits<double>::infinity();
+  auto const null = 0;
+
+  auto const input    = fw_wrapper{1.729, -inf, NaN, 172.9, -inf, NaN, inf, 1.23, inf};
+  auto const expected = fp_wrapper{{1729, null, null, 172900, null, null, null, 1230, null},
+                                   {true, false, false, true, false, false, false, true, false},
+                                   scale_type{-3}};
+  auto const result   = cudf::cast(input, make_fixed_point_data_type<decimalXX>(-3));
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view());
+}
+
 TYPED_TEST(FixedPointTests, CastFromDoubleLarge)
 {
   using namespace numeric;

From 0067444597127f23a09a349f1c97dc33b9ec3958 Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Fri, 7 Jun 2024 16:10:22 -0400
Subject: [PATCH 320/842] cudf.pandas documentation improvement (#15948)

Added some more about the generality of the fast-slow proxy scheme from a suggestion from @wence-

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/15948
---
 docs/cudf/source/developer_guide/cudf_pandas.md | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/docs/cudf/source/developer_guide/cudf_pandas.md b/docs/cudf/source/developer_guide/cudf_pandas.md
index aeb43f66b2d..827ba18a4a4 100644
--- a/docs/cudf/source/developer_guide/cudf_pandas.md
+++ b/docs/cudf/source/developer_guide/cudf_pandas.md
@@ -3,8 +3,16 @@ The use of the cuDF pandas accelerator mode (`cudf.pandas`) is explained [in the
 The purpose of this document is to explain how the fast-slow proxy mechanism works and document internal environment variables that can be used to debug `cudf.pandas` itself.
 
 ## fast-slow proxy mechanism
-`cudf.pandas` works by wrapping each Pandas type and its corresponding cuDF type in a new proxy type also known as a fast-slow proxy type.
-The purpose of proxy types is to attempt computations on the fast (cuDF) object first, and then fall back to running on the slow (Pandas) object if the fast version fails.
+The core of `cudf.pandas` is implemented through proxy types defined in [`fast_slow_proxy.py`](https://github.com/rapidsai/cudf/blob/5f45803b2a68b49d330d94e2f701791a7590612a/python/cudf/cudf/pandas/fast_slow_proxy.py), which link a pair of "fast" and "slow" libraries.
+`cudf.pandas` works by wrapping each "slow" type and its corresponding "fast" type in a new proxy type, also known as a fast-slow proxy type.
+The purpose of these proxy types is so we can first attempt computations on the fast object, and then fall back to the slow object if the fast version fails.
+While the core wrapping functionality is generic, the current usage mainly involves providing a proxy pair using cuDF and Pandas.
+In the rest of this document, to maintain a concrete pair of libraries in mind, we use cuDF and Pandas interchangeably as names for the "fast" and "slow" libraries, respectively, with the understanding that any pair of API-matching libraries could be used.
+For example, future support could include pairs such as CuPy (as the "fast" library) and NumPy (as the "slow" library).
+
+```{note}
+We currently do not wrap the entire NumPy library because it exposes a C API. But we do wrap NumPy's `numpy.ndarray` and CuPy's `cupy.ndarray` in a proxy type.
+```
 
 ### Types:
 #### Wrapped Types and Proxy Types

From 139ed6c3085feac8116085e35c7897cad141ce69 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 7 Jun 2024 10:49:05 -1000
Subject: [PATCH 321/842] Add __array_interface__ to cudf.pandas numpy.ndarray
 proxy (#15936)

closes #15926

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Matthew Murray (https://github.com/Matt711)

URL: https://github.com/rapidsai/cudf/pull/15936
---
 python/cudf/cudf/pandas/_wrappers/common.py | 5 +++++
 python/cudf/cudf/pandas/_wrappers/numpy.py  | 2 ++
 2 files changed, 7 insertions(+)

diff --git a/python/cudf/cudf/pandas/_wrappers/common.py b/python/cudf/cudf/pandas/_wrappers/common.py
index 468c5687c15..66a51a83896 100644
--- a/python/cudf/cudf/pandas/_wrappers/common.py
+++ b/python/cudf/cudf/pandas/_wrappers/common.py
@@ -46,5 +46,10 @@ def cuda_array_interface(self: _FastSlowProxy):
     return self._fsproxy_fast.__cuda_array_interface__
 
 
+@property  # type: ignore
+def array_interface(self: _FastSlowProxy):
+    return self._fsproxy_slow.__array_interface__
+
+
 def custom_iter(self: _FastSlowProxy):
     return iter(self._fsproxy_slow)
diff --git a/python/cudf/cudf/pandas/_wrappers/numpy.py b/python/cudf/cudf/pandas/_wrappers/numpy.py
index 94298872213..c445be46f58 100644
--- a/python/cudf/cudf/pandas/_wrappers/numpy.py
+++ b/python/cudf/cudf/pandas/_wrappers/numpy.py
@@ -15,6 +15,7 @@
     make_intermediate_proxy_type,
 )
 from .common import (
+    array_interface,
     array_method,
     arrow_array_method,
     cuda_array_interface,
@@ -115,6 +116,7 @@ def wrap_ndarray(cls, arr: cupy.ndarray | numpy.ndarray, constructor):
         # So that pa.array(wrapped-numpy-array) works
         "__arrow_array__": arrow_array_method,
         "__cuda_array_interface__": cuda_array_interface,
+        "__array_interface__": array_interface,
         # ndarrays are unhashable
         "__hash__": None,
         # iter(cupy-array) produces an iterable of zero-dim device

From 8e40fe7e6b01a399c3ea406a59d4cbcbc9bfce5c Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Fri, 7 Jun 2024 16:08:42 -0700
Subject: [PATCH 322/842] Remove unused parsing utilities (#15955)

Some parsing utilities have been unused since legacy JSON removal.
This PR removes these functions.

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Nghia Truong (https://github.com/ttnghia)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15955
---
 cpp/CMakeLists.txt                     |   1 -
 cpp/src/io/utilities/parsing_utils.cu  | 221 -------------------------
 cpp/src/io/utilities/parsing_utils.cuh |  76 ---------
 3 files changed, 298 deletions(-)
 delete mode 100644 cpp/src/io/utilities/parsing_utils.cu

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index f637db66c2c..ca85996b990 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -439,7 +439,6 @@ add_library(
   src/io/utilities/data_sink.cpp
   src/io/utilities/datasource.cpp
   src/io/utilities/file_io_utilities.cpp
-  src/io/utilities/parsing_utils.cu
   src/io/utilities/row_selection.cpp
   src/io/utilities/type_inference.cu
   src/io/utilities/trie.cu
diff --git a/cpp/src/io/utilities/parsing_utils.cu b/cpp/src/io/utilities/parsing_utils.cu
deleted file mode 100644
index cb8be380c5b..00000000000
--- a/cpp/src/io/utilities/parsing_utils.cu
+++ /dev/null
@@ -1,221 +0,0 @@
-/*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cudf/detail/utilities/cuda.cuh>
-#include <cudf/detail/utilities/vector_factories.hpp>
-#include <cudf/io/types.hpp>
-#include <cudf/utilities/error.hpp>
-
-#include <rmm/device_buffer.hpp>
-
-#include <thrust/pair.h>
-
-#include <algorithm>
-
-namespace cudf {
-namespace io {
-namespace {
-// When processing the input in chunks, this is the maximum size of each chunk.
-// Only one chunk is loaded on the GPU at a time, so this value is chosen to
-// be small enough to fit on the GPU in most cases.
-constexpr size_t max_chunk_bytes = 256 * 1024 * 1024;  // 256MB
-
-constexpr int bytes_per_find_thread = 64;
-
-using pos_key_pair = thrust::pair<uint64_t, char>;
-
-template <typename T>
-constexpr T divCeil(T dividend, T divisor) noexcept
-{
-  return (dividend + divisor - 1) / divisor;
-}
-
-/**
- * @brief Sets the specified element of the array to the passed value
- */
-template <class T, class V>
-__device__ __forceinline__ void setElement(T* array, cudf::size_type idx, T const& t, V const&)
-{
-  array[idx] = t;
-}
-
-/**
- * @brief Sets the specified element of the array of pairs using the two passed
- * parameters.
- */
-template <class T, class V>
-__device__ __forceinline__ void setElement(thrust::pair<T, V>* array,
-                                           cudf::size_type idx,
-                                           T const& t,
-                                           V const& v)
-{
-  array[idx] = {t, v};
-}
-
-/**
- * @brief Overloads the setElement() functions for void* arrays.
- * Does not do anything, indexing is not allowed with void* arrays.
- */
-template <class T, class V>
-__device__ __forceinline__ void setElement(void*, cudf::size_type, T const&, V const&)
-{
-}
-
-/**
- * @brief CUDA kernel that finds all occurrences of a character in the given
- * character array. If the 'positions' parameter is not void*,
- * positions of all occurrences are stored in the output array.
- *
- * @param[in] data Pointer to the input character array
- * @param[in] size Number of bytes in the input array
- * @param[in] offset Offset to add to the output positions
- * @param[in] key Character to find in the array
- * @param[in,out] count Pointer to the number of found occurrences
- * @param[out] positions Array containing the output positions
- */
-template <class T>
-CUDF_KERNEL void count_and_set_positions(char const* data,
-                                         uint64_t size,
-                                         uint64_t offset,
-                                         char const key,
-                                         cudf::size_type* count,
-                                         T* positions)
-{
-  // thread IDs range per block, so also need the block id
-  auto const tid = cudf::detail::grid_1d::global_thread_id();
-  auto const did = tid * bytes_per_find_thread;
-
-  char const* raw = (data + did);
-
-  long const byteToProcess =
-    ((did + bytes_per_find_thread) < size) ? bytes_per_find_thread : (size - did);
-
-  // Process the data
-  for (long i = 0; i < byteToProcess; i++) {
-    if (raw[i] == key) {
-      auto const idx = atomicAdd(count, static_cast<cudf::size_type>(1));
-      setElement(positions, idx, did + offset + i, key);
-    }
-  }
-}
-
-}  // namespace
-
-template <class T>
-cudf::size_type find_all_from_set(device_span<char const> data,
-                                  std::vector<char> const& keys,
-                                  uint64_t result_offset,
-                                  T* positions,
-                                  rmm::cuda_stream_view stream)
-{
-  int block_size    = 0;  // suggested thread count to use
-  int min_grid_size = 0;  // minimum block count required
-  CUDF_CUDA_TRY(
-    cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, count_and_set_positions<T>));
-  int const grid_size = divCeil(data.size(), (size_t)block_size);
-
-  auto d_count = cudf::detail::make_zeroed_device_uvector_async<cudf::size_type>(
-    1, stream, rmm::mr::get_current_device_resource());
-  for (char key : keys) {
-    count_and_set_positions<T><<<grid_size, block_size, 0, stream.value()>>>(
-      data.data(), data.size(), result_offset, key, d_count.data(), positions);
-  }
-
-  return cudf::detail::make_std_vector_sync(d_count, stream)[0];
-}
-
-template <class T>
-cudf::size_type find_all_from_set(host_span<char const> data,
-                                  std::vector<char> const& keys,
-                                  uint64_t result_offset,
-                                  T* positions,
-                                  rmm::cuda_stream_view stream)
-{
-  rmm::device_buffer d_chunk(std::min(max_chunk_bytes, data.size()), stream);
-  auto d_count = cudf::detail::make_zeroed_device_uvector_async<cudf::size_type>(
-    1, stream, rmm::mr::get_current_device_resource());
-
-  int block_size    = 0;  // suggested thread count to use
-  int min_grid_size = 0;  // minimum block count required
-  CUDF_CUDA_TRY(
-    cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, count_and_set_positions<T>));
-
-  size_t const chunk_count = divCeil(data.size(), max_chunk_bytes);
-  for (size_t ci = 0; ci < chunk_count; ++ci) {
-    auto const chunk_offset = ci * max_chunk_bytes;
-    auto const h_chunk      = data.data() + chunk_offset;
-    int const chunk_bytes = std::min((size_t)(data.size() - ci * max_chunk_bytes), max_chunk_bytes);
-    auto const chunk_bits = divCeil(chunk_bytes, bytes_per_find_thread);
-    int const grid_size   = divCeil(chunk_bits, block_size);
-
-    // Copy chunk to device
-    CUDF_CUDA_TRY(
-      cudaMemcpyAsync(d_chunk.data(), h_chunk, chunk_bytes, cudaMemcpyDefault, stream.value()));
-
-    for (char key : keys) {
-      count_and_set_positions<T>
-        <<<grid_size, block_size, 0, stream.value()>>>(static_cast<char*>(d_chunk.data()),
-                                                       chunk_bytes,
-                                                       chunk_offset + result_offset,
-                                                       key,
-                                                       d_count.data(),
-                                                       positions);
-    }
-  }
-
-  return cudf::detail::make_std_vector_sync(d_count, stream)[0];
-}
-
-template cudf::size_type find_all_from_set<uint64_t>(device_span<char const> data,
-                                                     std::vector<char> const& keys,
-                                                     uint64_t result_offset,
-                                                     uint64_t* positions,
-                                                     rmm::cuda_stream_view stream);
-
-template cudf::size_type find_all_from_set<pos_key_pair>(device_span<char const> data,
-                                                         std::vector<char> const& keys,
-                                                         uint64_t result_offset,
-                                                         pos_key_pair* positions,
-                                                         rmm::cuda_stream_view stream);
-
-template cudf::size_type find_all_from_set<uint64_t>(host_span<char const> data,
-                                                     std::vector<char> const& keys,
-                                                     uint64_t result_offset,
-                                                     uint64_t* positions,
-                                                     rmm::cuda_stream_view stream);
-
-template cudf::size_type find_all_from_set<pos_key_pair>(host_span<char const> data,
-                                                         std::vector<char> const& keys,
-                                                         uint64_t result_offset,
-                                                         pos_key_pair* positions,
-                                                         rmm::cuda_stream_view stream);
-
-cudf::size_type count_all_from_set(device_span<char const> data,
-                                   std::vector<char> const& keys,
-                                   rmm::cuda_stream_view stream)
-{
-  return find_all_from_set<void>(data, keys, 0, nullptr, stream);
-}
-
-cudf::size_type count_all_from_set(host_span<char const> data,
-                                   std::vector<char> const& keys,
-                                   rmm::cuda_stream_view stream)
-{
-  return find_all_from_set<void>(data, keys, 0, nullptr, stream);
-}
-
-}  // namespace io
-}  // namespace cudf
diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh
index faee05541cc..bc2722441d0 100644
--- a/cpp/src/io/utilities/parsing_utils.cuh
+++ b/cpp/src/io/utilities/parsing_utils.cuh
@@ -414,82 +414,6 @@ __device__ __inline__ cudf::size_type* infer_integral_field_counter(char const*
 
 }  // namespace gpu
 
-/**
- * @brief Searches the input character array for each of characters in a set.
- * Sums up the number of occurrences. If the 'positions' parameter is not void*,
- * positions of all occurrences are stored in the output device array.
- *
- * @param[in] d_data Input character array in device memory
- * @param[in] keys Vector containing the keys to count in the buffer
- * @param[in] result_offset Offset to add to the output positions
- * @param[out] positions Array containing the output positions
- * @param[in] stream CUDA stream used for device memory operations and kernel launches
- *
- * @return cudf::size_type total number of occurrences
- */
-template <class T>
-cudf::size_type find_all_from_set(device_span<char const> data,
-                                  std::vector<char> const& keys,
-                                  uint64_t result_offset,
-                                  T* positions,
-                                  rmm::cuda_stream_view stream);
-
-/**
- * @brief Searches the input character array for each of characters in a set.
- * Sums up the number of occurrences. If the 'positions' parameter is not void*,
- * positions of all occurrences are stored in the output device array.
- *
- * Does not load the entire file into the GPU memory at any time, so it can
- * be used to parse large files. Output array needs to be preallocated.
- *
- * @param[in] h_data Pointer to the input character array
- * @param[in] h_size Number of bytes in the input array
- * @param[in] keys Vector containing the keys to count in the buffer
- * @param[in] result_offset Offset to add to the output positions
- * @param[out] positions Array containing the output positions
- * @param[in] stream CUDA stream used for device memory operations and kernel launches
- *
- * @return cudf::size_type total number of occurrences
- */
-template <class T>
-cudf::size_type find_all_from_set(host_span<char const> data,
-                                  std::vector<char> const& keys,
-                                  uint64_t result_offset,
-                                  T* positions,
-                                  rmm::cuda_stream_view stream);
-
-/**
- * @brief Searches the input character array for each of characters in a set
- * and sums up the number of occurrences.
- *
- * @param d_data Input data buffer in device memory
- * @param keys Vector containing the keys to count in the buffer
- * @param stream CUDA stream used for device memory operations and kernel launches
- *
- * @return cudf::size_type total number of occurrences
- */
-cudf::size_type count_all_from_set(device_span<char const> data,
-                                   std::vector<char> const& keys,
-                                   rmm::cuda_stream_view stream);
-
-/**
- * @brief Searches the input character array for each of characters in a set
- * and sums up the number of occurrences.
- *
- * Does not load the entire buffer into the GPU memory at any time, so it can
- * be used with buffers of any size.
- *
- * @param h_data Pointer to the data in host memory
- * @param h_size Size of the input data, in bytes
- * @param keys Vector containing the keys to count in the buffer
- * @param stream CUDA stream used for device memory operations and kernel launches
- *
- * @return cudf::size_type total number of occurrences
- */
-cudf::size_type count_all_from_set(host_span<char const> data,
-                                   std::vector<char> const& keys,
-                                   rmm::cuda_stream_view stream);
-
 /**
  * @brief Checks whether the given character is a whitespace character.
  *

From bfad68c66fba06cb87327265b8b74ab329c58e4e Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Sun, 9 Jun 2024 09:17:12 -0400
Subject: [PATCH 323/842] Add an Environment Variable for debugging the fast
 path in cudf.pandas (#15837)

Part of #14975 This PR adds a pandas debugging option to `_fast_slow_function_call` that runs the slow path after the fast and returns a warning if the results differ.

Authors:
  - Matthew Murray (https://github.com/Matt711)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15837
---
 python/cudf/cudf/pandas/fast_slow_proxy.py    | 63 ++++++++++++++++--
 .../cudf_pandas_tests/test_cudf_pandas.py     | 64 ++++++++++++++++++-
 2 files changed, 121 insertions(+), 6 deletions(-)

diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py
index 169dd80e132..5f4cf2e6cc6 100644
--- a/python/cudf/cudf/pandas/fast_slow_proxy.py
+++ b/python/cudf/cudf/pandas/fast_slow_proxy.py
@@ -9,6 +9,7 @@
 import operator
 import pickle
 import types
+import warnings
 from collections.abc import Iterator
 from enum import IntEnum
 from typing import (
@@ -23,6 +24,10 @@
     Type,
 )
 
+import numpy as np
+
+from ..options import _env_get_bool
+from ..testing._utils import assert_eq
 from .annotation import nvtx
 
 
@@ -808,7 +813,9 @@ def __get__(self, instance, owner) -> Any:
             else:
                 # for anything else, use a fast-slow attribute:
                 self._attr, _ = _fast_slow_function_call(
-                    getattr, owner, self._name
+                    getattr,
+                    owner,
+                    self._name,
                 )
 
                 if isinstance(
@@ -829,9 +836,11 @@ def __get__(self, instance, owner) -> Any:
                         getattr(instance._fsproxy_slow, self._name),
                         None,  # type: ignore
                     )
-                return _fast_slow_function_call(getattr, instance, self._name)[
-                    0
-                ]
+                return _fast_slow_function_call(
+                    getattr,
+                    instance,
+                    self._name,
+                )[0]
         return self._attr
 
 
@@ -866,7 +875,17 @@ def __name__(self, value):
         setattr(self._fsproxy_slow, "__name__", value)
 
 
-def _fast_slow_function_call(func: Callable, /, *args, **kwargs) -> Any:
+def _assert_fast_slow_eq(left, right):
+    if _is_final_type(type(left)) or type(left) in NUMPY_TYPES:
+        assert_eq(left, right)
+
+
+def _fast_slow_function_call(
+    func: Callable,
+    /,
+    *args,
+    **kwargs,
+) -> Any:
     """
     Call `func` with all `args` and `kwargs` converted to their
     respective fast type. If that fails, call `func` with all
@@ -890,6 +909,37 @@ def _fast_slow_function_call(func: Callable, /, *args, **kwargs) -> Any:
                 # try slow path
                 raise Exception()
             fast = True
+            if _env_get_bool("CUDF_PANDAS_DEBUGGING", False):
+                try:
+                    with nvtx.annotate(
+                        "EXECUTE_SLOW_DEBUG",
+                        color=_CUDF_PANDAS_NVTX_COLORS["EXECUTE_SLOW"],
+                        domain="cudf_pandas",
+                    ):
+                        slow_args, slow_kwargs = (
+                            _slow_arg(args),
+                            _slow_arg(kwargs),
+                        )
+                        with disable_module_accelerator():
+                            slow_result = func(*slow_args, **slow_kwargs)
+                except Exception as e:
+                    warnings.warn(
+                        "The result from pandas could not be computed. "
+                        f"The exception was {e}."
+                    )
+                else:
+                    try:
+                        _assert_fast_slow_eq(result, slow_result)
+                    except AssertionError as e:
+                        warnings.warn(
+                            "The results from cudf and pandas were different. "
+                            f"The exception was {e}."
+                        )
+                    except Exception as e:
+                        warnings.warn(
+                            "Pandas debugging mode failed. "
+                            f"The exception was {e}."
+                        )
     except Exception:
         with nvtx.annotate(
             "EXECUTE_SLOW",
@@ -1135,6 +1185,9 @@ def _replace_closurevars(
     )
 
 
+NUMPY_TYPES: Set[str] = set(np.sctypeDict.values())
+
+
 _SPECIAL_METHODS: Set[str] = {
     "__abs__",
     "__add__",
diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
index fef829b17fc..72e9ad5fca3 100644
--- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
+++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
@@ -41,8 +41,9 @@
     get_calendar,
 )
 
-# Accelerated pandas has the real pandas module as an attribute
+# Accelerated pandas has the real pandas and cudf modules as attributes
 pd = xpd._fsproxy_slow
+cudf = xpd._fsproxy_fast
 
 
 @pytest.fixture
@@ -1424,5 +1425,66 @@ def test_holidays_within_dates(holiday, start, expected):
     ) == [utc.localize(dt) for dt in expected]
 
 
+def test_cudf_pandas_debugging_different_results(monkeypatch):
+    cudf_mean = cudf.Series.mean
+
+    def mock_mean_one(self, *args, **kwargs):
+        return np.float64(1.0)
+
+    with monkeypatch.context() as monkeycontext:
+        monkeypatch.setattr(xpd.Series.mean, "_fsproxy_fast", mock_mean_one)
+        monkeycontext.setenv("CUDF_PANDAS_DEBUGGING", "True")
+        s = xpd.Series([1, 2])
+        with pytest.warns(
+            UserWarning,
+            match="The results from cudf and pandas were different.",
+        ):
+            assert s.mean() == 1.0
+    # Must explicitly undo the patch. Proxy dispatch doesn't work with monkeypatch contexts.
+    monkeypatch.setattr(xpd.Series.mean, "_fsproxy_fast", cudf_mean)
+
+
+def test_cudf_pandas_debugging_pandas_error(monkeypatch):
+    pd_mean = pd.Series.mean
+
+    def mock_mean_exception(self, *args, **kwargs):
+        raise Exception()
+
+    with monkeypatch.context() as monkeycontext:
+        monkeycontext.setattr(
+            xpd.Series.mean, "_fsproxy_slow", mock_mean_exception
+        )
+        monkeycontext.setenv("CUDF_PANDAS_DEBUGGING", "True")
+        s = xpd.Series([1, 2])
+        with pytest.warns(
+            UserWarning,
+            match="The result from pandas could not be computed.",
+        ):
+            s = xpd.Series([1, 2])
+            assert s.mean() == 1.5
+    # Must explicitly undo the patch. Proxy dispatch doesn't work with monkeypatch contexts.
+    monkeypatch.setattr(xpd.Series.mean, "_fsproxy_slow", pd_mean)
+
+
+def test_cudf_pandas_debugging_failed(monkeypatch):
+    pd_mean = pd.Series.mean
+
+    def mock_mean_none(self, *args, **kwargs):
+        return None
+
+    with monkeypatch.context() as monkeycontext:
+        monkeycontext.setattr(xpd.Series.mean, "_fsproxy_slow", mock_mean_none)
+        monkeycontext.setenv("CUDF_PANDAS_DEBUGGING", "True")
+        s = xpd.Series([1, 2])
+        with pytest.warns(
+            UserWarning,
+            match="Pandas debugging mode failed.",
+        ):
+            s = xpd.Series([1, 2])
+            assert s.mean() == 1.5
+    # Must explicitly undo the patch. Proxy dispatch doesn't work with monkeypatch contexts.
+    monkeypatch.setattr(xpd.Series.mean, "_fsproxy_slow", pd_mean)
+
+
 def test_excelwriter_pathlike():
     assert isinstance(pd.ExcelWriter("foo.xlsx"), os.PathLike)

From c02260f2fb1c162eabf0da0604cc6f08f2cc74ff Mon Sep 17 00:00:00 2001
From: Ed Seidl <etseidl@users.noreply.github.com>
Date: Sun, 9 Jun 2024 22:09:44 -0700
Subject: [PATCH 324/842] Refactor Parquet writer options and builders (#15831)

Adding options to the Parquet writer is made somewhat tedious by the duplication of code between the two current sets of options/builder classes; one each for the chunked and non-chunked Parquet writers. This PR pulls common options into a parent options class, and common setters into a parent builder class. The builder parent uses CRTP to allow chaining of options.

Authors:
  - Ed Seidl (https://github.com/etseidl)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Mike Wilson (https://github.com/hyperbolic2346)
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Vukasin Milovanovic (https://github.com/vuule)
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - Mike Wilson (https://github.com/hyperbolic2346)

URL: https://github.com/rapidsai/cudf/pull/15831
---
 cpp/include/cudf/io/parquet.hpp               | 906 ++++--------------
 cpp/src/io/functions.cpp                      | 271 ++++--
 .../_lib/pylibcudf/libcudf/io/parquet.pxd     | 173 ++--
 3 files changed, 410 insertions(+), 940 deletions(-)

diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp
index b2f949cdcee..51eeed5b721 100644
--- a/cpp/include/cudf/io/parquet.hpp
+++ b/cpp/include/cudf/io/parquet.hpp
@@ -29,6 +29,7 @@
 #include <memory>
 #include <optional>
 #include <string>
+#include <utility>
 #include <vector>
 
 namespace cudf::io {
@@ -576,22 +577,16 @@ struct sorting_column {
   bool is_nulls_first{true};  //!< true if nulls come before non-null values
 };
 
-class parquet_writer_options_builder;
-
 /**
- * @brief Settings for `write_parquet()`.
+ * @brief Base settings for `write_parquet()` and `parquet_chunked_writer`.
  */
-class parquet_writer_options {
+class parquet_writer_options_base {
   // Specify the sink to use for writer output
   sink_info _sink;
   // Specify the compression format to use
   compression_type _compression = compression_type::SNAPPY;
   // Specify the level of statistics in the output file
   statistics_freq _stats_level = statistics_freq::STATISTICS_ROWGROUP;
-  // Sets of columns to output
-  table_view _table;
-  // Partitions described as {start_row, num_rows} pairs
-  std::vector<partition_info> _partitions;
   // Optional associated metadata
   std::optional<table_input_metadata> _metadata;
   // Optional footer key_value_metadata
@@ -602,8 +597,6 @@ class parquet_writer_options {
   // Parquet writer can write timestamps as UTC
   // Defaults to true because libcudf timestamps are implicitly UTC
   bool _write_timestamps_as_UTC = true;
-  // Column chunks file paths to be set in the raw output metadata. One per output file
-  std::vector<std::string> _column_chunks_file_paths;
   // Maximum size of each row group (unless smaller than a single page)
   size_t _row_group_size_bytes = default_row_group_size_bytes;
   // Maximum number of rows in row group (unless smaller than a single page)
@@ -627,18 +620,13 @@ class parquet_writer_options {
   // Which columns in _table are used for sorting
   std::optional<std::vector<sorting_column>> _sorting_columns;
 
+ protected:
   /**
-   * @brief Constructor from sink and table.
+   * @brief Constructor from sink.
    *
    * @param sink The sink used for writer output
-   * @param table Table to be written to output
    */
-  explicit parquet_writer_options(sink_info const& sink, table_view const& table)
-    : _sink(sink), _table(table)
-  {
-  }
-
-  friend parquet_writer_options_builder;
+  explicit parquet_writer_options_base(sink_info const& sink) : _sink(sink) {}
 
  public:
   /**
@@ -646,24 +634,7 @@ class parquet_writer_options {
    *
    * This has been added since Cython requires a default constructor to create objects on stack.
    */
-  parquet_writer_options() = default;
-
-  /**
-   * @brief Create builder to create `parquet_writer_options`.
-   *
-   * @param sink The sink used for writer output
-   * @param table Table to be written to output
-   *
-   * @return Builder to build parquet_writer_options
-   */
-  static parquet_writer_options_builder builder(sink_info const& sink, table_view const& table);
-
-  /**
-   * @brief Create builder to create `parquet_writer_options`.
-   *
-   * @return parquet_writer_options_builder
-   */
-  static parquet_writer_options_builder builder();
+  parquet_writer_options_base() = default;
 
   /**
    * @brief Returns sink info.
@@ -686,20 +657,6 @@ class parquet_writer_options {
    */
   [[nodiscard]] statistics_freq get_stats_level() const { return _stats_level; }
 
-  /**
-   * @brief Returns table_view.
-   *
-   * @return Table view
-   */
-  [[nodiscard]] table_view get_table() const { return _table; }
-
-  /**
-   * @brief Returns partitions.
-   *
-   * @return Partitions
-   */
-  [[nodiscard]] std::vector<partition_info> const& get_partitions() const { return _partitions; }
-
   /**
    * @brief Returns associated metadata.
    *
@@ -712,7 +669,8 @@ class parquet_writer_options {
    *
    * @return Key-Value footer metadata information
    */
-  std::vector<std::map<std::string, std::string>> const& get_key_value_metadata() const
+  [[nodiscard]] std::vector<std::map<std::string, std::string>> const& get_key_value_metadata()
+    const
   {
     return _user_data;
   }
@@ -722,7 +680,7 @@ class parquet_writer_options {
    *
    * @return `true` if timestamps will be written as INT96
    */
-  bool is_enabled_int96_timestamps() const { return _write_timestamps_as_int96; }
+  [[nodiscard]] bool is_enabled_int96_timestamps() const { return _write_timestamps_as_int96; }
 
   /**
    * @brief Returns `true` if timestamps will be written as UTC
@@ -731,29 +689,19 @@ class parquet_writer_options {
    */
   [[nodiscard]] auto is_enabled_utc_timestamps() const { return _write_timestamps_as_UTC; }
 
-  /**
-   * @brief Returns Column chunks file paths to be set in the raw output metadata.
-   *
-   * @return Column chunks file paths to be set in the raw output metadata
-   */
-  std::vector<std::string> const& get_column_chunks_file_paths() const
-  {
-    return _column_chunks_file_paths;
-  }
-
   /**
    * @brief Returns maximum row group size, in bytes.
    *
    * @return Maximum row group size, in bytes
    */
-  auto get_row_group_size_bytes() const { return _row_group_size_bytes; }
+  [[nodiscard]] auto get_row_group_size_bytes() const { return _row_group_size_bytes; }
 
   /**
    * @brief Returns maximum row group size, in rows.
    *
    * @return Maximum row group size, in rows
    */
-  auto get_row_group_size_rows() const { return _row_group_size_rows; }
+  [[nodiscard]] auto get_row_group_size_rows() const { return _row_group_size_rows; }
 
   /**
    * @brief Returns the maximum uncompressed page size, in bytes.
@@ -762,7 +710,7 @@ class parquet_writer_options {
    *
    * @return Maximum uncompressed page size, in bytes
    */
-  auto get_max_page_size_bytes() const
+  [[nodiscard]] auto get_max_page_size_bytes() const
   {
     return std::min(_max_page_size_bytes, get_row_group_size_bytes());
   }
@@ -774,7 +722,7 @@ class parquet_writer_options {
    *
    * @return Maximum page size, in rows
    */
-  auto get_max_page_size_rows() const
+  [[nodiscard]] auto get_max_page_size_rows() const
   {
     return std::min(_max_page_size_rows, get_row_group_size_rows());
   }
@@ -784,7 +732,10 @@ class parquet_writer_options {
    *
    * @return length min/max will be truncated to
    */
-  auto get_column_index_truncate_length() const { return _column_index_truncate_length; }
+  [[nodiscard]] auto get_column_index_truncate_length() const
+  {
+    return _column_index_truncate_length;
+  }
 
   /**
    * @brief Returns policy for dictionary use.
@@ -831,20 +782,12 @@ class parquet_writer_options {
    */
   [[nodiscard]] auto const& get_sorting_columns() const { return _sorting_columns; }
 
-  /**
-   * @brief Sets partitions.
-   *
-   * @param partitions Partitions of input table in {start_row, num_rows} pairs. If specified, must
-   * be same size as number of sinks in sink_info
-   */
-  void set_partitions(std::vector<partition_info> partitions);
-
   /**
    * @brief Sets metadata.
    *
    * @param metadata Associated metadata
    */
-  void set_metadata(table_input_metadata metadata) { _metadata = std::move(metadata); }
+  void set_metadata(table_input_metadata metadata);
 
   /**
    * @brief Sets metadata.
@@ -858,14 +801,13 @@ class parquet_writer_options {
    *
    * @param sf Level of statistics requested in the output file
    */
-  void set_stats_level(statistics_freq sf) { _stats_level = sf; }
-
+  void set_stats_level(statistics_freq sf);
   /**
    * @brief Sets compression type.
    *
    * @param compression The compression type to use
    */
-  void set_compression(compression_type compression) { _compression = compression; }
+  void set_compression(compression_type compression);
 
   /**
    * @brief Sets timestamp writing preferences. INT96 timestamps will be written
@@ -873,22 +815,14 @@ class parquet_writer_options {
    *
    * @param req Boolean value to enable/disable writing of INT96 timestamps
    */
-  void enable_int96_timestamps(bool req) { _write_timestamps_as_int96 = req; }
+  void enable_int96_timestamps(bool req);
 
   /**
    * @brief Sets preference for writing timestamps as UTC. Write timestamps as UTC if set to `true`.
    *
    * @param val Boolean value to enable/disable writing of timestamps as UTC.
    */
-  void enable_utc_timestamps(bool val) { _write_timestamps_as_UTC = val; }
-
-  /**
-   * @brief Sets column chunks file path to be set in the raw output metadata.
-   *
-   * @param file_paths Vector of Strings which indicates file path. Must be same size as number of
-   * data sinks in sink info
-   */
-  void set_column_chunks_file_paths(std::vector<std::string> file_paths);
+  void enable_utc_timestamps(bool val);
 
   /**
    * @brief Sets the maximum row group size, in bytes.
@@ -951,116 +885,84 @@ class parquet_writer_options {
    *
    * @param comp_stats Pointer to compression statistics to be updated after writing
    */
-  void set_compression_statistics(std::shared_ptr<writer_compression_statistics> comp_stats)
-  {
-    _compression_stats = std::move(comp_stats);
-  }
+  void set_compression_statistics(std::shared_ptr<writer_compression_statistics> comp_stats);
 
   /**
    * @brief Sets preference for V2 page headers. Write V2 page headers if set to `true`.
    *
    * @param val Boolean value to enable/disable writing of V2 page headers.
    */
-  void enable_write_v2_headers(bool val) { _v2_page_headers = val; }
+  void enable_write_v2_headers(bool val);
 
   /**
    * @brief Sets sorting columns.
    *
    * @param sorting_columns Column sort order metadata
    */
-  void set_sorting_columns(std::vector<sorting_column> sorting_columns)
-  {
-    _sorting_columns = std::move(sorting_columns);
-  }
+  void set_sorting_columns(std::vector<sorting_column> sorting_columns);
 };
 
 /**
- * @brief Class to build `parquet_writer_options`.
+ * @brief Base class for Parquet options builders.
  */
-class parquet_writer_options_builder {
-  parquet_writer_options options;
+template <class BuilderT, class OptionsT>
+class parquet_writer_options_builder_base {
+  OptionsT _options;
 
- public:
+ protected:
   /**
-   * @brief Default constructor.
+   * @brief Return reference to the options object being built
    *
-   * This has been added since Cython requires a default constructor to create objects on stack.
+   * @return the options object
    */
-  explicit parquet_writer_options_builder() = default;
+  inline OptionsT& get_options() { return _options; }
 
   /**
-   * @brief Constructor from sink and table.
+   * @brief Constructor from options.
    *
-   * @param sink The sink used for writer output
-   * @param table Table to be written to output
+   * @param options Options object to build
    */
-  explicit parquet_writer_options_builder(sink_info const& sink, table_view const& table)
-    : options(sink, table)
-  {
-  }
+  explicit parquet_writer_options_builder_base(OptionsT options);
 
+ public:
   /**
-   * @brief Sets partitions in parquet_writer_options.
+   * @brief Default constructor.
    *
-   * @param partitions Partitions of input table in {start_row, num_rows} pairs. If specified, must
-   * be same size as number of sinks in sink_info
-   * @return this for chaining
+   * This has been added since Cython requires a default constructor to create objects on stack.
    */
-  parquet_writer_options_builder& partitions(std::vector<partition_info> partitions);
+  explicit parquet_writer_options_builder_base() = default;
 
   /**
-   * @brief Sets metadata in parquet_writer_options.
+   * @brief Sets metadata.
    *
    * @param metadata Associated metadata
    * @return this for chaining
    */
-  parquet_writer_options_builder& metadata(table_input_metadata metadata)
-  {
-    options._metadata = std::move(metadata);
-    return *this;
-  }
+  BuilderT& metadata(table_input_metadata metadata);
 
   /**
-   * @brief Sets Key-Value footer metadata in parquet_writer_options.
+   * @brief Sets Key-Value footer metadata.
    *
    * @param metadata Key-Value footer metadata
    * @return this for chaining
    */
-  parquet_writer_options_builder& key_value_metadata(
-    std::vector<std::map<std::string, std::string>> metadata);
+  BuilderT& key_value_metadata(std::vector<std::map<std::string, std::string>> metadata);
 
   /**
-   * @brief Sets the level of statistics in parquet_writer_options.
+   * @brief Sets the level of statistics.
    *
    * @param sf Level of statistics requested in the output file
    * @return this for chaining
    */
-  parquet_writer_options_builder& stats_level(statistics_freq sf)
-  {
-    options._stats_level = sf;
-    return *this;
-  }
+  BuilderT& stats_level(statistics_freq sf);
 
   /**
-   * @brief Sets compression type in parquet_writer_options.
+   * @brief Sets compression type.
    *
    * @param compression The compression type to use
    * @return this for chaining
    */
-  parquet_writer_options_builder& compression(compression_type compression)
-  {
-    options._compression = compression;
-    return *this;
-  }
-
-  /**
-   * @brief Sets column chunks file path to be set in the raw output metadata.
-   *
-   * @param file_paths Vector of Strings which indicates file path. Must be same size as number of
-   * data sinks
-   * @return this for chaining
-   */
-  parquet_writer_options_builder& column_chunks_file_paths(std::vector<std::string> file_paths);
+  BuilderT& compression(compression_type compression);
 
   /**
    * @brief Sets the maximum row group size, in bytes.
@@ -1068,11 +970,7 @@ class parquet_writer_options_builder {
    * @param val maximum row group size
    * @return this for chaining
    */
-  parquet_writer_options_builder& row_group_size_bytes(size_t val)
-  {
-    options.set_row_group_size_bytes(val);
-    return *this;
-  }
+  BuilderT& row_group_size_bytes(size_t val);
 
   /**
    * @brief Sets the maximum number of rows in output row groups.
@@ -1080,11 +978,7 @@ class parquet_writer_options_builder {
    * @param val maximum number or rows
    * @return this for chaining
    */
-  parquet_writer_options_builder& row_group_size_rows(size_type val)
-  {
-    options.set_row_group_size_rows(val);
-    return *this;
-  }
+  BuilderT& row_group_size_rows(size_type val);
 
   /**
    * @brief Sets the maximum uncompressed page size, in bytes.
@@ -1096,11 +990,7 @@ class parquet_writer_options_builder {
    * @param val maximum page size
    * @return this for chaining
    */
-  parquet_writer_options_builder& max_page_size_bytes(size_t val)
-  {
-    options.set_max_page_size_bytes(val);
-    return *this;
-  }
+  BuilderT& max_page_size_bytes(size_t val);
 
   /**
    * @brief Sets the maximum page size, in rows. Counts only top-level rows, ignoring any nesting.
@@ -1109,11 +999,7 @@ class parquet_writer_options_builder {
    * @param val maximum rows per page
    * @return this for chaining
    */
-  parquet_writer_options_builder& max_page_size_rows(size_type val)
-  {
-    options.set_max_page_size_rows(val);
-    return *this;
-  }
+  BuilderT& max_page_size_rows(size_type val);
 
   /**
    * @brief Sets the desired maximum size in bytes for min and max values in the column index.
@@ -1128,11 +1014,7 @@ class parquet_writer_options_builder {
    * @param val length min/max will be truncated to, with 0 indicating no truncation
    * @return this for chaining
    */
-  parquet_writer_options_builder& column_index_truncate_length(int32_t val)
-  {
-    options.set_column_index_truncate_length(val);
-    return *this;
-  }
+  BuilderT& column_index_truncate_length(int32_t val);
 
   /**
    * @brief Sets the policy for dictionary use.
@@ -1151,7 +1033,7 @@ class parquet_writer_options_builder {
    * @param val policy for dictionary use
    * @return this for chaining
    */
-  parquet_writer_options_builder& dictionary_policy(enum dictionary_policy val);
+  BuilderT& dictionary_policy(enum dictionary_policy val);
 
   /**
    * @brief Sets the maximum dictionary size, in bytes.
@@ -1164,7 +1046,7 @@ class parquet_writer_options_builder {
    * @param val maximum dictionary size
    * @return this for chaining
    */
-  parquet_writer_options_builder& max_dictionary_size(size_t val);
+  BuilderT& max_dictionary_size(size_t val);
 
   /**
    * @brief Sets the maximum page fragment size, in rows.
@@ -1176,7 +1058,7 @@ class parquet_writer_options_builder {
    * @param val maximum page fragment size
    * @return this for chaining
    */
-  parquet_writer_options_builder& max_page_fragment_size(size_type val);
+  BuilderT& max_page_fragment_size(size_type val);
 
   /**
    * @brief Sets the pointer to the output compression statistics.
@@ -1184,24 +1066,16 @@ class parquet_writer_options_builder {
    * @param comp_stats Pointer to compression statistics to be filled once writer is done
    * @return this for chaining
    */
-  parquet_writer_options_builder& compression_statistics(
-    std::shared_ptr<writer_compression_statistics> const& comp_stats)
-  {
-    options._compression_stats = comp_stats;
-    return *this;
-  }
+  BuilderT& compression_statistics(
+    std::shared_ptr<writer_compression_statistics> const& comp_stats);
 
   /**
-   * @brief Sets whether int96 timestamps are written or not in parquet_writer_options.
+   * @brief Sets whether int96 timestamps are written or not.
    *
    * @param enabled Boolean value to enable/disable int96 timestamps
    * @return this for chaining
    */
-  parquet_writer_options_builder& int96_timestamps(bool enabled)
-  {
-    options._write_timestamps_as_int96 = enabled;
-    return *this;
-  }
+  BuilderT& int96_timestamps(bool enabled);
 
   /**
    * @brief Set to true if timestamps are to be written as UTC.
@@ -1209,126 +1083,60 @@ class parquet_writer_options_builder {
    * @param enabled Boolean value to enable/disable writing of timestamps as UTC.
    * @return this for chaining
    */
-  parquet_writer_options_builder& utc_timestamps(bool enabled)
-  {
-    options._write_timestamps_as_UTC = enabled;
-    return *this;
-  }
-
+  BuilderT& utc_timestamps(bool enabled);
   /**
    * @brief Set to true if V2 page headers are to be written.
    *
    * @param enabled Boolean value to enable/disable writing of V2 page headers.
    * @return this for chaining
    */
-  parquet_writer_options_builder& write_v2_headers(bool enabled);
+  BuilderT& write_v2_headers(bool enabled);
 
   /**
-   * @brief Sets column sorting metadata to chunked_parquet_writer_options.
+   * @brief Sets column sorting metadata.
    *
    * @param sorting_columns Column sort order metadata
    * @return this for chaining
    */
-  parquet_writer_options_builder& sorting_columns(std::vector<sorting_column> sorting_columns);
+  BuilderT& sorting_columns(std::vector<sorting_column> sorting_columns);
 
   /**
-   * @brief move parquet_writer_options member once it's built.
+   * @brief move options member once it's built.
    */
-  operator parquet_writer_options&&() { return std::move(options); }
+  operator OptionsT&&();
 
   /**
-   * @brief move parquet_writer_options member once it's built.
+   * @brief move options member once it's built.
    *
    * This has been added since Cython does not support overloading of conversion operators.
    *
    * @return Built `parquet_writer_options` object's r-value reference
    */
-  parquet_writer_options&& build() { return std::move(options); }
+  OptionsT&& build();
 };
 
-/**
- * @brief Writes a set of columns to parquet format.
- *
- * The following code snippet demonstrates how to write columns to a file:
- * @code
- *  auto destination = cudf::io::sink_info("dataset.parquet");
- *  auto options     = cudf::io::parquet_writer_options::builder(destination, table->view());
- *  cudf::io::write_parquet(options);
- * @endcode
- *
- * @param options Settings for controlling writing behavior
- * @param stream CUDA stream used for device memory operations and kernel launches
- * @return A blob that contains the file metadata (parquet FileMetadata thrift message) if
- *         requested in parquet_writer_options (empty blob otherwise).
- */
-
-std::unique_ptr<std::vector<uint8_t>> write_parquet(
-  parquet_writer_options const& options, rmm::cuda_stream_view stream = cudf::get_default_stream());
+class parquet_writer_options_builder;
 
 /**
- * @brief Merges multiple raw metadata blobs that were previously created by write_parquet
- * into a single metadata blob.
- *
- * @ingroup io_writers
- *
- * @param[in] metadata_list List of input file metadata
- * @return A parquet-compatible blob that contains the data for all row groups in the list
+ * @brief Settings for `write_parquet()`.
  */
-std::unique_ptr<std::vector<uint8_t>> merge_row_group_metadata(
-  std::vector<std::unique_ptr<std::vector<uint8_t>>> const& metadata_list);
-
-class chunked_parquet_writer_options_builder;
+class parquet_writer_options : public parquet_writer_options_base {
+  // Sets of columns to output
+  table_view _table;
+  // Partitions described as {start_row, num_rows} pairs
+  std::vector<partition_info> _partitions;
+  // Column chunks file paths to be set in the raw output metadata. One per output file
+  std::vector<std::string> _column_chunks_file_paths;
 
-/**
- * @brief Settings for `write_parquet_chunked()`.
- */
-class chunked_parquet_writer_options {
-  // Specify the sink to use for writer output
-  sink_info _sink;
-  // Specify the compression format to use
-  compression_type _compression = compression_type::AUTO;
-  // Specify the level of statistics in the output file
-  statistics_freq _stats_level = statistics_freq::STATISTICS_ROWGROUP;
-  // Optional associated metadata.
-  std::optional<table_input_metadata> _metadata;
-  // Optional footer key_value_metadata
-  std::vector<std::map<std::string, std::string>> _user_data;
-  // Parquet writer can write INT96 or TIMESTAMP_MICROS. Defaults to TIMESTAMP_MICROS.
-  // If true then overrides any per-column setting in _metadata.
-  bool _write_timestamps_as_int96 = false;
-  // Parquet writer can write timestamps as UTC. Defaults to true.
-  bool _write_timestamps_as_UTC = true;
-  // Maximum size of each row group (unless smaller than a single page)
-  size_t _row_group_size_bytes = default_row_group_size_bytes;
-  // Maximum number of rows in row group (unless smaller than a single page)
-  size_type _row_group_size_rows = default_row_group_size_rows;
-  // Maximum size of each page (uncompressed)
-  size_t _max_page_size_bytes = default_max_page_size_bytes;
-  // Maximum number of rows in a page
-  size_type _max_page_size_rows = default_max_page_size_rows;
-  // Maximum size of min or max values in column index
-  int32_t _column_index_truncate_length = default_column_index_truncate_length;
-  // When to use dictionary encoding for data
-  dictionary_policy _dictionary_policy = dictionary_policy::ADAPTIVE;
-  // Maximum size of column chunk dictionary (in bytes)
-  size_t _max_dictionary_size = default_max_dictionary_size;
-  // Maximum number of rows in a page fragment
-  std::optional<size_type> _max_page_fragment_size;
-  // Optional compression statistics
-  std::shared_ptr<writer_compression_statistics> _compression_stats;
-  // write V2 page headers?
-  bool _v2_page_headers = false;
-  // Which columns in _table are used for sorting
-  std::optional<std::vector<sorting_column>> _sorting_columns;
+  friend parquet_writer_options_builder;
 
   /**
-   * @brief Constructor from sink.
+   * @brief Constructor from sink and table.
    *
-   * @param sink Sink used for writer output
+   * @param sink The sink used for writer output
+   * @param table Table to be written to output
    */
-  explicit chunked_parquet_writer_options(sink_info const& sink) : _sink(sink) {}
-
-  friend chunked_parquet_writer_options_builder;
+  explicit parquet_writer_options(sink_info const& sink, table_view const& table);
 
  public:
   /**
@@ -1336,277 +1144,160 @@ class chunked_parquet_writer_options {
    *
    * This has been added since Cython requires a default constructor to create objects on stack.
    */
-  chunked_parquet_writer_options() = default;
+  parquet_writer_options() = default;
 
   /**
-   * @brief Returns sink info.
+   * @brief Create builder to create `parquet_writer_options`.
    *
-   * @return Sink info
+   * @param sink The sink used for writer output
+   * @param table Table to be written to output
+   *
+   * @return Builder to build parquet_writer_options
    */
-  [[nodiscard]] sink_info const& get_sink() const { return _sink; }
+  static parquet_writer_options_builder builder(sink_info const& sink, table_view const& table);
 
   /**
-   * @brief Returns compression format used.
+   * @brief Create builder to create `parquet_writer_options`.
    *
-   * @return Compression format
+   * @return parquet_writer_options_builder
    */
-  [[nodiscard]] compression_type get_compression() const { return _compression; }
+  static parquet_writer_options_builder builder();
 
   /**
-   * @brief Returns level of statistics requested in output file.
+   * @brief Returns table_view.
    *
-   * @return Level of statistics requested in output file
+   * @return Table view
    */
-  [[nodiscard]] statistics_freq get_stats_level() const { return _stats_level; }
+  [[nodiscard]] table_view get_table() const { return _table; }
 
   /**
-   * @brief Returns metadata information.
+   * @brief Returns partitions.
    *
-   * @return Metadata information
+   * @return Partitions
    */
-  [[nodiscard]] auto const& get_metadata() const { return _metadata; }
+  [[nodiscard]] std::vector<partition_info> const& get_partitions() const { return _partitions; }
 
   /**
-   * @brief Returns Key-Value footer metadata information.
+   * @brief Returns Column chunks file paths to be set in the raw output metadata.
    *
-   * @return Key-Value footer metadata information
+   * @return Column chunks file paths to be set in the raw output metadata
    */
-  std::vector<std::map<std::string, std::string>> const& get_key_value_metadata() const
+  [[nodiscard]] std::vector<std::string> const& get_column_chunks_file_paths() const
   {
-    return _user_data;
-  }
-
-  /**
-   * @brief Returns `true` if timestamps will be written as INT96
-   *
-   * @return `true` if timestamps will be written as INT96
-   */
-  bool is_enabled_int96_timestamps() const { return _write_timestamps_as_int96; }
-
-  /**
-   * @brief Returns `true` if timestamps will be written as UTC
-   *
-   * @return `true` if timestamps will be written as UTC
-   */
-  [[nodiscard]] auto is_enabled_utc_timestamps() const { return _write_timestamps_as_UTC; }
-
-  /**
-   * @brief Returns maximum row group size, in bytes.
-   *
-   * @return Maximum row group size, in bytes
-   */
-  auto get_row_group_size_bytes() const { return _row_group_size_bytes; }
-
-  /**
-   * @brief Returns maximum row group size, in rows.
-   *
-   * @return Maximum row group size, in rows
-   */
-  auto get_row_group_size_rows() const { return _row_group_size_rows; }
-
-  /**
-   * @brief Returns maximum uncompressed page size, in bytes.
-   *
-   * If set larger than the row group size, then this will return the
-   * row group size.
-   *
-   * @return Maximum uncompressed page size, in bytes
-   */
-  auto get_max_page_size_bytes() const
-  {
-    return std::min(_max_page_size_bytes, get_row_group_size_bytes());
-  }
-
-  /**
-   * @brief Returns maximum page size, in rows.
-   *
-   * If set larger than the row group size, then this will return the row group size.
-   *
-   * @return Maximum page size, in rows
-   */
-  auto get_max_page_size_rows() const
-  {
-    return std::min(_max_page_size_rows, get_row_group_size_rows());
-  }
-
-  /**
-   * @brief Returns maximum length of min or max values in column index, in bytes.
-   *
-   * @return length min/max will be truncated to
-   */
-  auto get_column_index_truncate_length() const { return _column_index_truncate_length; }
-
-  /**
-   * @brief Returns policy for dictionary use.
-   *
-   * @return policy for dictionary use
-   */
-  [[nodiscard]] dictionary_policy get_dictionary_policy() const { return _dictionary_policy; }
-
-  /**
-   * @brief Returns maximum dictionary size, in bytes.
-   *
-   * @return Maximum dictionary size, in bytes.
-   */
-  [[nodiscard]] auto get_max_dictionary_size() const { return _max_dictionary_size; }
-
-  /**
-   * @brief Returns maximum page fragment size, in rows.
-   *
-   * @return Maximum page fragment size, in rows.
-   */
-  [[nodiscard]] auto get_max_page_fragment_size() const { return _max_page_fragment_size; }
-
-  /**
-   * @brief Returns a shared pointer to the user-provided compression statistics.
-   *
-   * @return Compression statistics
-   */
-  [[nodiscard]] std::shared_ptr<writer_compression_statistics> get_compression_statistics() const
-  {
-    return _compression_stats;
+    return _column_chunks_file_paths;
   }
 
   /**
-   * @brief Returns `true` if V2 page headers should be written.
-   *
-   * @return `true` if V2 page headers should be written.
-   */
-  [[nodiscard]] auto is_enabled_write_v2_headers() const { return _v2_page_headers; }
-
-  /**
-   * @brief Returns the sorting_columns.
-   *
-   * @return Column sort order metadata
-   */
-  [[nodiscard]] auto const& get_sorting_columns() const { return _sorting_columns; }
-
-  /**
-   * @brief Sets metadata.
-   *
-   * @param metadata Associated metadata
-   */
-  void set_metadata(table_input_metadata metadata) { _metadata = std::move(metadata); }
-
-  /**
-   * @brief Sets Key-Value footer metadata.
-   *
-   * @param metadata Key-Value footer metadata
-   */
-  void set_key_value_metadata(std::vector<std::map<std::string, std::string>> metadata);
-
-  /**
-   * @brief Sets the level of statistics in parquet_writer_options.
-   *
-   * @param sf Level of statistics requested in the output file
-   */
-  void set_stats_level(statistics_freq sf) { _stats_level = sf; }
-
-  /**
-   * @brief Sets compression type.
-   *
-   * @param compression The compression type to use
-   */
-  void set_compression(compression_type compression) { _compression = compression; }
-
-  /**
-   * @brief Sets timestamp writing preferences.
-   *
-   * INT96 timestamps will be written if `true` and TIMESTAMP_MICROS will be written if `false`.
+   * @brief Sets partitions.
    *
-   * @param req Boolean value to enable/disable writing of INT96 timestamps
+   * @param partitions Partitions of input table in {start_row, num_rows} pairs. If specified, must
+   * be same size as number of sinks in sink_info
    */
-  void enable_int96_timestamps(bool req) { _write_timestamps_as_int96 = req; }
+  void set_partitions(std::vector<partition_info> partitions);
 
   /**
-   * @brief Sets preference for writing timestamps as UTC. Write timestamps as UTC if set to `true`.
+   * @brief Sets column chunks file path to be set in the raw output metadata.
    *
-   * @param val Boolean value to enable/disable writing of timestamps as UTC.
+   * @param file_paths Vector of Strings which indicates file path. Must be same size as number of
+   * data sinks in sink info
    */
-  void enable_utc_timestamps(bool val) { _write_timestamps_as_UTC = val; }
+  void set_column_chunks_file_paths(std::vector<std::string> file_paths);
+};
 
+/**
+ * @brief Class to build `parquet_writer_options`.
+ */
+class parquet_writer_options_builder
+  : public parquet_writer_options_builder_base<parquet_writer_options_builder,
+                                               parquet_writer_options> {
+ public:
   /**
-   * @brief Sets the maximum row group size, in bytes.
+   * @brief Default constructor.
    *
-   * @param size_bytes Maximum row group size, in bytes to set
+   * This has been added since Cython requires a default constructor to create objects on stack.
    */
-  void set_row_group_size_bytes(size_t size_bytes);
+  explicit parquet_writer_options_builder() = default;
 
   /**
-   * @brief Sets the maximum row group size, in rows.
+   * @brief Constructor from sink and table.
    *
-   * @param size_rows The maximum row group size, in rows to set
+   * @param sink The sink used for writer output
+   * @param table Table to be written to output
    */
-  void set_row_group_size_rows(size_type size_rows);
+  explicit parquet_writer_options_builder(sink_info const& sink, table_view const& table);
 
   /**
-   * @brief Sets the maximum uncompressed page size, in bytes.
+   * @brief Sets partitions in parquet_writer_options.
    *
-   * @param size_bytes Maximum uncompressed page size, in bytes to set
+   * @param partitions Partitions of input table in {start_row, num_rows} pairs. If specified, must
+   * be same size as number of sinks in sink_info
+   * @return this for chaining
    */
-  void set_max_page_size_bytes(size_t size_bytes);
+  parquet_writer_options_builder& partitions(std::vector<partition_info> partitions);
 
   /**
-   * @brief Sets the maximum page size, in rows.
+   * @brief Sets column chunks file path to be set in the raw output metadata.
    *
-   * @param size_rows The maximum page size, in rows to set
+   * @param file_paths Vector of Strings which indicates file path. Must be same size as number of
+   * data sinks
+   * @return this for chaining
    */
-  void set_max_page_size_rows(size_type size_rows);
+  parquet_writer_options_builder& column_chunks_file_paths(std::vector<std::string> file_paths);
+};
 
-  /**
-   * @brief Sets the maximum length of min or max values in column index, in bytes.
-   *
-   * @param size_bytes length min/max will be truncated to
-   */
-  void set_column_index_truncate_length(int32_t size_bytes);
+/**
+ * @brief Writes a set of columns to parquet format.
+ *
+ * The following code snippet demonstrates how to write columns to a file:
+ * @code
+ *  auto destination = cudf::io::sink_info("dataset.parquet");
+ *  auto options     = cudf::io::parquet_writer_options::builder(destination, table->view());
+ *  cudf::io::write_parquet(options);
+ * @endcode
+ *
+ * @param options Settings for controlling writing behavior
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @return A blob that contains the file metadata (parquet FileMetadata thrift message) if
+ *         requested in parquet_writer_options (empty blob otherwise).
+ */
 
-  /**
-   * @brief Sets the policy for dictionary use.
-   *
-   * @param policy Policy for dictionary use
-   */
-  void set_dictionary_policy(dictionary_policy policy);
+std::unique_ptr<std::vector<uint8_t>> write_parquet(
+  parquet_writer_options const& options, rmm::cuda_stream_view stream = cudf::get_default_stream());
 
-  /**
-   * @brief Sets the maximum dictionary size, in bytes.
-   *
-   * @param size_bytes Maximum dictionary size, in bytes
-   */
-  void set_max_dictionary_size(size_t size_bytes);
+/**
+ * @brief Merges multiple raw metadata blobs that were previously created by write_parquet
+ * into a single metadata blob.
+ *
+ * @ingroup io_writers
+ *
+ * @param[in] metadata_list List of input file metadata
+ * @return A parquet-compatible blob that contains the data for all row groups in the list
+ */
+std::unique_ptr<std::vector<uint8_t>> merge_row_group_metadata(
+  std::vector<std::unique_ptr<std::vector<uint8_t>>> const& metadata_list);
 
-  /**
-   * @brief Sets the maximum page fragment size, in rows.
-   *
-   * @param size_rows Maximum page fragment size, in rows.
-   */
-  void set_max_page_fragment_size(size_type size_rows);
+class chunked_parquet_writer_options_builder;
 
+/**
+ * @brief Settings for `parquet_chunked_writer`.
+ */
+class chunked_parquet_writer_options : public parquet_writer_options_base {
   /**
-   * @brief Sets the pointer to the output compression statistics.
+   * @brief Constructor from sink.
    *
-   * @param comp_stats Pointer to compression statistics to be updated after writing
+   * @param sink Sink used for writer output
    */
-  void set_compression_statistics(std::shared_ptr<writer_compression_statistics> comp_stats)
-  {
-    _compression_stats = std::move(comp_stats);
-  }
+  explicit chunked_parquet_writer_options(sink_info const& sink);
 
-  /**
-   * @brief Sets preference for V2 page headers. Write V2 page headers if set to `true`.
-   *
-   * @param val Boolean value to enable/disable writing of V2 page headers.
-   */
-  void enable_write_v2_headers(bool val) { _v2_page_headers = val; }
+  friend chunked_parquet_writer_options_builder;
 
+ public:
   /**
-   * @brief Sets sorting columns.
+   * @brief Default constructor.
    *
-   * @param sorting_columns Column sort order metadata
+   * This has been added since Cython requires a default constructor to create objects on stack.
    */
-  void set_sorting_columns(std::vector<sorting_column> sorting_columns)
-  {
-    _sorting_columns = std::move(sorting_columns);
-  }
+  chunked_parquet_writer_options() = default;
 
   /**
    * @brief creates builder to build chunked_parquet_writer_options.
@@ -1619,11 +1310,11 @@ class chunked_parquet_writer_options {
 };
 
 /**
- * @brief Builds options for chunked_parquet_writer_options.
+ * @brief Class to build `chunked_parquet_writer_options`.
  */
-class chunked_parquet_writer_options_builder {
-  chunked_parquet_writer_options options;
-
+class chunked_parquet_writer_options_builder
+  : public parquet_writer_options_builder_base<chunked_parquet_writer_options_builder,
+                                               chunked_parquet_writer_options> {
  public:
   /**
    * @brief Default constructor.
@@ -1637,238 +1328,7 @@ class chunked_parquet_writer_options_builder {
    *
    * @param sink The sink used for writer output
    */
-  chunked_parquet_writer_options_builder(sink_info const& sink) : options(sink){};
-
-  /**
-   * @brief Sets metadata to chunked_parquet_writer_options.
-   *
-   * @param metadata Associated metadata
-   * @return this for chaining
-   */
-  chunked_parquet_writer_options_builder& metadata(table_input_metadata metadata)
-  {
-    options._metadata = std::move(metadata);
-    return *this;
-  }
-
-  /**
-   * @brief Sets Key-Value footer metadata in parquet_writer_options.
-   *
-   * @param metadata Key-Value footer metadata
-   * @return this for chaining
-   */
-  chunked_parquet_writer_options_builder& key_value_metadata(
-    std::vector<std::map<std::string, std::string>> metadata);
-
-  /**
-   * @brief Sets the level of statistics in chunked_parquet_writer_options.
-   *
-   * @param sf Level of statistics requested in the output file
-   * @return this for chaining
-   */
-  chunked_parquet_writer_options_builder& stats_level(statistics_freq sf)
-  {
-    options._stats_level = sf;
-    return *this;
-  }
-
-  /**
-   * @brief Sets compression type to chunked_parquet_writer_options.
-   *
-   * @param compression The compression type to use
-   * @return this for chaining
-   */
-  chunked_parquet_writer_options_builder& compression(compression_type compression)
-  {
-    options._compression = compression;
-    return *this;
-  }
-
-  /**
-   * @brief Set to true if timestamps should be written as
-   * int96 types instead of int64 types. Even though int96 is deprecated and is
-   * not an internal type for cudf, it needs to be written for backwards
-   * compatibility reasons.
-   *
-   * @param enabled Boolean value to enable/disable int96 timestamps
-   * @return this for chaining
-   */
-  chunked_parquet_writer_options_builder& int96_timestamps(bool enabled)
-  {
-    options._write_timestamps_as_int96 = enabled;
-    return *this;
-  }
-
-  /**
-   * @brief Set to true if timestamps are to be written as UTC.
-   *
-   * @param enabled Boolean value to enable/disable writing of timestamps as UTC.
-   * @return this for chaining
-   */
-  chunked_parquet_writer_options_builder& utc_timestamps(bool enabled)
-  {
-    options._write_timestamps_as_UTC = enabled;
-    return *this;
-  }
-
-  /**
-   * @brief Set to true if V2 page headers are to be written.
-   *
-   * @param enabled Boolean value to enable/disable writing of V2 page headers.
-   * @return this for chaining
-   */
-  chunked_parquet_writer_options_builder& write_v2_headers(bool enabled);
-
-  /**
-   * @brief Sets the maximum row group size, in bytes.
-   *
-   * @param val maximum row group size
-   * @return this for chaining
-   */
-  chunked_parquet_writer_options_builder& row_group_size_bytes(size_t val)
-  {
-    options.set_row_group_size_bytes(val);
-    return *this;
-  }
-
-  /**
-   * @brief Sets the maximum number of rows in output row groups.
-   *
-   * @param val maximum number or rows
-   * @return this for chaining
-   */
-  chunked_parquet_writer_options_builder& row_group_size_rows(size_type val)
-  {
-    options.set_row_group_size_rows(val);
-    return *this;
-  }
-
-  /**
-   * @brief Sets the maximum uncompressed page size, in bytes.
-   *
-   * Serves as a hint to the writer, and can be exceeded under certain circumstances. Cannot be
-   * larger than the row group size in bytes, and will be adjusted to match if it is.
-   *
-   * @param val maximum page size
-   * @return this for chaining
-   */
-  chunked_parquet_writer_options_builder& max_page_size_bytes(size_t val)
-  {
-    options.set_max_page_size_bytes(val);
-    return *this;
-  }
-
-  /**
-   * @brief Sets the maximum page size, in rows. Counts only top-level rows, ignoring any nesting.
-   * Cannot be larger than the row group size in rows, and will be adjusted to match if it is.
-   *
-   * @param val maximum rows per page
-   * @return this for chaining
-   */
-  chunked_parquet_writer_options_builder& max_page_size_rows(size_type val)
-  {
-    options.set_max_page_size_rows(val);
-    return *this;
-  }
-
-  /**
-   * @brief Sets the desired maximum size in bytes for min and max values in the column index.
-   *
-   * Values exceeding this limit will be truncated, but modified such that they will still
-   * be valid lower and upper bounds. This only applies to variable length types, such as string.
-   * Maximum values will not be truncated if there is no suitable truncation that results in
-   * a valid upper bound.
-   *
-   * Default value is 64.
-   *
-   * @param val length min/max will be truncated to, with 0 indicating no truncation
-   * @return this for chaining
-   */
-  chunked_parquet_writer_options_builder& column_index_truncate_length(int32_t val)
-  {
-    options.set_column_index_truncate_length(val);
-    return *this;
-  }
-
-  /**
-   * @brief Sets the policy for dictionary use.
-   *
-   * Certain compression algorithms (e.g Zstandard) have limits on how large of a buffer can
-   * be compressed. In some circumstances, the dictionary can grow beyond this limit, which
-   * will prevent the column from being compressed. This setting controls how the writer
-   * should act in these circumstances. A setting of dictionary_policy::ADAPTIVE will disable
-   * dictionary encoding for columns where the dictionary exceeds the limit. A setting of
-   * dictionary_policy::NEVER will disable the use of dictionary encoding globally. A setting of
-   * dictionary_policy::ALWAYS will allow the use of dictionary encoding even if it will result in
-   * the disabling of compression for columns that would otherwise be compressed.
-   *
-   * The default value is dictionary_policy::ADAPTIVE.
-   *
-   * @param val policy for dictionary use
-   * @return this for chaining
-   */
-  chunked_parquet_writer_options_builder& dictionary_policy(enum dictionary_policy val);
-
-  /**
-   * @brief Sets the maximum dictionary size, in bytes.
-   *
-   * Disables dictionary encoding for any column chunk where the dictionary will
-   * exceed this limit.  Only used when the dictionary_policy is set to 'ADAPTIVE'.
-   *
-   * Default value is 1048576 (1MiB).
-   *
-   * @param val maximum dictionary size
-   * @return this for chaining
-   */
-  chunked_parquet_writer_options_builder& max_dictionary_size(size_t val);
-
-  /**
-   * @brief Sets the maximum page fragment size, in rows.
-   *
-   * Files with nested schemas or very long strings may need a page fragment size
-   * smaller than the default value of 5000 to ensure a single fragment will not
-   * exceed the desired maximum page size in bytes.
-   *
-   * @param val maximum page fragment size
-   * @return this for chaining
-   */
-  chunked_parquet_writer_options_builder& max_page_fragment_size(size_type val);
-
-  /**
-   * @brief Sets the pointer to the output compression statistics.
-   *
-   * @param comp_stats Pointer to compression statistics to be filled once writer is done
-   * @return this for chaining
-   */
-  chunked_parquet_writer_options_builder& compression_statistics(
-    std::shared_ptr<writer_compression_statistics> const& comp_stats)
-  {
-    options._compression_stats = comp_stats;
-    return *this;
-  }
-
-  /**
-   * @brief Sets column sorting metadata to chunked_parquet_writer_options.
-   *
-   * @param sorting_columns Column sort order metadata
-   * @return this for chaining
-   */
-  chunked_parquet_writer_options_builder& sorting_columns(
-    std::vector<sorting_column> sorting_columns);
-
-  /**
-   * @brief move chunked_parquet_writer_options member once it's built.
-   */
-  operator chunked_parquet_writer_options&&() { return std::move(options); }
-
-  /**
-   * @brief move chunked_parquet_writer_options member once it's is built.
-   *
-   * This has been added since Cython does not support overloading of conversion operators.
-   *
-   * @return Built `chunked_parquet_writer_options` object's r-value reference
-   */
-  chunked_parquet_writer_options&& build() { return std::move(options); }
+  chunked_parquet_writer_options_builder(sink_info const& sink);
 };
 
 /**
diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp
index 3ba2facf276..1ed8ee5ce06 100644
--- a/cpp/src/io/functions.cpp
+++ b/cpp/src/io/functions.cpp
@@ -115,7 +115,7 @@ parquet_writer_options_builder parquet_writer_options::builder()
 chunked_parquet_writer_options_builder chunked_parquet_writer_options::builder(
   sink_info const& sink)
 {
-  return chunked_parquet_writer_options_builder(sink);
+  return chunked_parquet_writer_options_builder{sink};
 }
 
 namespace {
@@ -740,29 +740,37 @@ void parquet_reader_options::set_num_rows(size_type val)
   _num_rows = val;
 }
 
-void parquet_writer_options::set_partitions(std::vector<partition_info> partitions)
+void parquet_writer_options_base::set_metadata(table_input_metadata metadata)
 {
-  CUDF_EXPECTS(partitions.size() == _sink.num_sinks(),
-               "Mismatch between number of sinks and number of partitions");
-  _partitions = std::move(partitions);
+  _metadata = std::move(metadata);
 }
 
-void parquet_writer_options::set_key_value_metadata(
+void parquet_writer_options_base::set_key_value_metadata(
   std::vector<std::map<std::string, std::string>> metadata)
 {
-  CUDF_EXPECTS(metadata.size() == _sink.num_sinks(),
+  CUDF_EXPECTS(metadata.size() == get_sink().num_sinks(),
                "Mismatch between number of sinks and number of metadata maps");
   _user_data = std::move(metadata);
 }
 
-void parquet_writer_options::set_column_chunks_file_paths(std::vector<std::string> file_paths)
+void parquet_writer_options_base::set_stats_level(statistics_freq sf) { _stats_level = sf; }
+
+void parquet_writer_options_base::set_compression(compression_type compression)
 {
-  CUDF_EXPECTS(file_paths.size() == _sink.num_sinks(),
-               "Mismatch between number of sinks and number of chunk paths to set");
-  _column_chunks_file_paths = std::move(file_paths);
+  _compression = compression;
+}
+
+void parquet_writer_options_base::enable_int96_timestamps(bool req)
+{
+  _write_timestamps_as_int96 = req;
+}
+
+void parquet_writer_options_base::enable_utc_timestamps(bool val)
+{
+  _write_timestamps_as_UTC = val;
 }
 
-void parquet_writer_options::set_row_group_size_bytes(size_t size_bytes)
+void parquet_writer_options_base::set_row_group_size_bytes(size_t size_bytes)
 {
   CUDF_EXPECTS(
     size_bytes >= 1024,
@@ -770,13 +778,13 @@ void parquet_writer_options::set_row_group_size_bytes(size_t size_bytes)
   _row_group_size_bytes = size_bytes;
 }
 
-void parquet_writer_options::set_row_group_size_rows(size_type size_rows)
+void parquet_writer_options_base::set_row_group_size_rows(size_type size_rows)
 {
   CUDF_EXPECTS(size_rows > 0, "The maximum row group row count must be a positive integer.");
   _row_group_size_rows = size_rows;
 }
 
-void parquet_writer_options::set_max_page_size_bytes(size_t size_bytes)
+void parquet_writer_options_base::set_max_page_size_bytes(size_t size_bytes)
 {
   CUDF_EXPECTS(size_bytes >= 1024, "The maximum page size cannot be smaller than 1KB.");
   CUDF_EXPECTS(size_bytes <= static_cast<size_t>(std::numeric_limits<int32_t>::max()),
@@ -784,190 +792,249 @@ void parquet_writer_options::set_max_page_size_bytes(size_t size_bytes)
   _max_page_size_bytes = size_bytes;
 }
 
-void parquet_writer_options::set_max_page_size_rows(size_type size_rows)
+void parquet_writer_options_base::set_max_page_size_rows(size_type size_rows)
 {
   CUDF_EXPECTS(size_rows > 0, "The maximum page row count must be a positive integer.");
   _max_page_size_rows = size_rows;
 }
 
-void parquet_writer_options::set_column_index_truncate_length(int32_t size_bytes)
+void parquet_writer_options_base::set_column_index_truncate_length(int32_t size_bytes)
 {
   CUDF_EXPECTS(size_bytes >= 0, "Column index truncate length cannot be negative.");
   _column_index_truncate_length = size_bytes;
 }
 
-void parquet_writer_options::set_dictionary_policy(dictionary_policy policy)
+void parquet_writer_options_base::set_dictionary_policy(dictionary_policy policy)
 {
   _dictionary_policy = policy;
 }
 
-void parquet_writer_options::set_max_dictionary_size(size_t size_bytes)
+void parquet_writer_options_base::set_max_dictionary_size(size_t size_bytes)
 {
   CUDF_EXPECTS(size_bytes <= static_cast<size_t>(std::numeric_limits<int32_t>::max()),
                "The maximum dictionary size cannot exceed 2GB.");
   _max_dictionary_size = size_bytes;
 }
 
-void parquet_writer_options::set_max_page_fragment_size(size_type size_rows)
+void parquet_writer_options_base::set_max_page_fragment_size(size_type size_rows)
 {
   CUDF_EXPECTS(size_rows > 0, "Page fragment size must be a positive integer.");
   _max_page_fragment_size = size_rows;
 }
 
-parquet_writer_options_builder& parquet_writer_options_builder::partitions(
-  std::vector<partition_info> partitions)
+void parquet_writer_options_base::set_compression_statistics(
+  std::shared_ptr<writer_compression_statistics> comp_stats)
 {
-  options.set_partitions(std::move(partitions));
-  return *this;
+  _compression_stats = std::move(comp_stats);
+}
+
+void parquet_writer_options_base::enable_write_v2_headers(bool val) { _v2_page_headers = val; }
+
+void parquet_writer_options_base::set_sorting_columns(std::vector<sorting_column> sorting_columns)
+{
+  _sorting_columns = std::move(sorting_columns);
+}
+
+parquet_writer_options::parquet_writer_options(sink_info const& sink, table_view const& table)
+  : parquet_writer_options_base(sink), _table(table)
+{
+}
+
+void parquet_writer_options::set_partitions(std::vector<partition_info> partitions)
+{
+  CUDF_EXPECTS(partitions.size() == get_sink().num_sinks(),
+               "Mismatch between number of sinks and number of partitions");
+  _partitions = std::move(partitions);
+}
+
+void parquet_writer_options::set_column_chunks_file_paths(std::vector<std::string> file_paths)
+{
+  CUDF_EXPECTS(file_paths.size() == get_sink().num_sinks(),
+               "Mismatch between number of sinks and number of chunk paths to set");
+  _column_chunks_file_paths = std::move(file_paths);
+}
+
+template <class BuilderT, class OptionsT>
+parquet_writer_options_builder_base<BuilderT, OptionsT>::parquet_writer_options_builder_base(
+  OptionsT options)
+  : _options(std::move(options))
+{
+}
+
+template <class BuilderT, class OptionsT>
+BuilderT& parquet_writer_options_builder_base<BuilderT, OptionsT>::metadata(
+  table_input_metadata metadata)
+{
+  _options.set_metadata(std::move(metadata));
+  return static_cast<BuilderT&>(*this);
 }
 
-parquet_writer_options_builder& parquet_writer_options_builder::key_value_metadata(
+template <class BuilderT, class OptionsT>
+BuilderT& parquet_writer_options_builder_base<BuilderT, OptionsT>::key_value_metadata(
   std::vector<std::map<std::string, std::string>> metadata)
 {
-  options.set_key_value_metadata(std::move(metadata));
-  return *this;
+  _options.set_key_value_metadata(std::move(metadata));
+  return static_cast<BuilderT&>(*this);
 }
 
-parquet_writer_options_builder& parquet_writer_options_builder::column_chunks_file_paths(
-  std::vector<std::string> file_paths)
+template <class BuilderT, class OptionsT>
+BuilderT& parquet_writer_options_builder_base<BuilderT, OptionsT>::stats_level(statistics_freq sf)
 {
-  options.set_column_chunks_file_paths(std::move(file_paths));
-  return *this;
+  _options.set_stats_level(sf);
+  return static_cast<BuilderT&>(*this);
 }
 
-parquet_writer_options_builder& parquet_writer_options_builder::dictionary_policy(
-  enum dictionary_policy val)
+template <class BuilderT, class OptionsT>
+BuilderT& parquet_writer_options_builder_base<BuilderT, OptionsT>::compression(
+  compression_type compression)
 {
-  options.set_dictionary_policy(val);
-  return *this;
+  _options.set_compression(compression);
+  return static_cast<BuilderT&>(*this);
 }
 
-parquet_writer_options_builder& parquet_writer_options_builder::max_dictionary_size(size_t val)
+template <class BuilderT, class OptionsT>
+BuilderT& parquet_writer_options_builder_base<BuilderT, OptionsT>::row_group_size_bytes(size_t val)
 {
-  options.set_max_dictionary_size(val);
-  return *this;
+  _options.set_row_group_size_bytes(val);
+  return static_cast<BuilderT&>(*this);
 }
 
-parquet_writer_options_builder& parquet_writer_options_builder::max_page_fragment_size(
+template <class BuilderT, class OptionsT>
+BuilderT& parquet_writer_options_builder_base<BuilderT, OptionsT>::row_group_size_rows(
   size_type val)
 {
-  options.set_max_page_fragment_size(val);
-  return *this;
+  _options.set_row_group_size_rows(val);
+  return static_cast<BuilderT&>(*this);
 }
 
-parquet_writer_options_builder& parquet_writer_options_builder::write_v2_headers(bool enabled)
+template <class BuilderT, class OptionsT>
+BuilderT& parquet_writer_options_builder_base<BuilderT, OptionsT>::max_page_size_bytes(size_t val)
 {
-  options.enable_write_v2_headers(enabled);
-  return *this;
+  _options.set_max_page_size_bytes(val);
+  return static_cast<BuilderT&>(*this);
 }
 
-parquet_writer_options_builder& parquet_writer_options_builder::sorting_columns(
-  std::vector<sorting_column> sorting_columns)
+template <class BuilderT, class OptionsT>
+BuilderT& parquet_writer_options_builder_base<BuilderT, OptionsT>::max_page_size_rows(size_type val)
 {
-  options._sorting_columns = std::move(sorting_columns);
-  return *this;
+  _options.set_max_page_size_rows(val);
+  return static_cast<BuilderT&>(*this);
 }
 
-void chunked_parquet_writer_options::set_key_value_metadata(
-  std::vector<std::map<std::string, std::string>> metadata)
+template <class BuilderT, class OptionsT>
+BuilderT& parquet_writer_options_builder_base<BuilderT, OptionsT>::column_index_truncate_length(
+  int32_t val)
 {
-  CUDF_EXPECTS(metadata.size() == _sink.num_sinks(),
-               "Mismatch between number of sinks and number of metadata maps");
-  _user_data = std::move(metadata);
+  _options.set_column_index_truncate_length(val);
+  return static_cast<BuilderT&>(*this);
 }
 
-void chunked_parquet_writer_options::set_row_group_size_bytes(size_t size_bytes)
+template <class BuilderT, class OptionsT>
+BuilderT& parquet_writer_options_builder_base<BuilderT, OptionsT>::dictionary_policy(
+  enum dictionary_policy val)
 {
-  CUDF_EXPECTS(
-    size_bytes >= 1024,
-    "The maximum row group size cannot be smaller than the minimum page size, which is 1KB.");
-  _row_group_size_bytes = size_bytes;
+  _options.set_dictionary_policy(val);
+  return static_cast<BuilderT&>(*this);
 }
 
-void chunked_parquet_writer_options::set_row_group_size_rows(size_type size_rows)
+template <class BuilderT, class OptionsT>
+BuilderT& parquet_writer_options_builder_base<BuilderT, OptionsT>::max_dictionary_size(size_t val)
 {
-  CUDF_EXPECTS(size_rows > 0, "The maximum row group row count must be a positive integer.");
-  _row_group_size_rows = size_rows;
+  _options.set_max_dictionary_size(val);
+  return static_cast<BuilderT&>(*this);
 }
 
-void chunked_parquet_writer_options::set_max_page_size_bytes(size_t size_bytes)
+template <class BuilderT, class OptionsT>
+BuilderT& parquet_writer_options_builder_base<BuilderT, OptionsT>::max_page_fragment_size(
+  size_type val)
 {
-  CUDF_EXPECTS(size_bytes >= 1024, "The maximum page size cannot be smaller than 1KB.");
-  CUDF_EXPECTS(size_bytes <= static_cast<size_t>(std::numeric_limits<int32_t>::max()),
-               "The maximum page size cannot exceed 2GB.");
-  _max_page_size_bytes = size_bytes;
+  _options.set_max_page_fragment_size(val);
+  return static_cast<BuilderT&>(*this);
 }
 
-void chunked_parquet_writer_options::set_max_page_size_rows(size_type size_rows)
+template <class BuilderT, class OptionsT>
+BuilderT& parquet_writer_options_builder_base<BuilderT, OptionsT>::compression_statistics(
+  std::shared_ptr<writer_compression_statistics> const& comp_stats)
 {
-  CUDF_EXPECTS(size_rows > 0, "The maximum page row count must be a positive integer.");
-  _max_page_size_rows = size_rows;
+  _options.set_compression_statistics(comp_stats);
+  return static_cast<BuilderT&>(*this);
 }
 
-void chunked_parquet_writer_options::set_column_index_truncate_length(int32_t size_bytes)
+template <class BuilderT, class OptionsT>
+BuilderT& parquet_writer_options_builder_base<BuilderT, OptionsT>::int96_timestamps(bool enabled)
 {
-  CUDF_EXPECTS(size_bytes >= 0, "Column index truncate length cannot be negative.");
-  _column_index_truncate_length = size_bytes;
+  _options.enable_int96_timestamps(enabled);
+  return static_cast<BuilderT&>(*this);
 }
 
-void chunked_parquet_writer_options::set_dictionary_policy(dictionary_policy policy)
+template <class BuilderT, class OptionsT>
+BuilderT& parquet_writer_options_builder_base<BuilderT, OptionsT>::utc_timestamps(bool enabled)
 {
-  _dictionary_policy = policy;
+  _options.enable_utc_timestamps(enabled);
+  return static_cast<BuilderT&>(*this);
 }
 
-void chunked_parquet_writer_options::set_max_dictionary_size(size_t size_bytes)
+template <class BuilderT, class OptionsT>
+BuilderT& parquet_writer_options_builder_base<BuilderT, OptionsT>::write_v2_headers(bool enabled)
 {
-  CUDF_EXPECTS(size_bytes <= static_cast<size_t>(std::numeric_limits<int32_t>::max()),
-               "The maximum dictionary size cannot exceed 2GB.");
-  _max_dictionary_size = size_bytes;
+  _options.enable_write_v2_headers(enabled);
+  return static_cast<BuilderT&>(*this);
 }
 
-void chunked_parquet_writer_options::set_max_page_fragment_size(size_type size_rows)
+template <class BuilderT, class OptionsT>
+BuilderT& parquet_writer_options_builder_base<BuilderT, OptionsT>::sorting_columns(
+  std::vector<sorting_column> sorting_columns)
 {
-  CUDF_EXPECTS(size_rows > 0, "Page fragment size must be a positive integer.");
-  _max_page_fragment_size = size_rows;
+  _options.set_sorting_columns(std::move(sorting_columns));
+  return static_cast<BuilderT&>(*this);
 }
 
-chunked_parquet_writer_options_builder& chunked_parquet_writer_options_builder::key_value_metadata(
-  std::vector<std::map<std::string, std::string>> metadata)
+template <class BuilderT, class OptionsT>
+parquet_writer_options_builder_base<BuilderT, OptionsT>::operator OptionsT&&()
 {
-  options.set_key_value_metadata(std::move(metadata));
-  return *this;
+  return std::move(_options);
 }
 
-chunked_parquet_writer_options_builder& chunked_parquet_writer_options_builder::dictionary_policy(
-  enum dictionary_policy val)
+template <class BuilderT, class OptionsT>
+OptionsT&& parquet_writer_options_builder_base<BuilderT, OptionsT>::build()
 {
-  options.set_dictionary_policy(val);
-  return *this;
+  return std::move(_options);
 }
 
-chunked_parquet_writer_options_builder& chunked_parquet_writer_options_builder::max_dictionary_size(
-  size_t val)
+template class parquet_writer_options_builder_base<parquet_writer_options_builder,
+                                                   parquet_writer_options>;
+template class parquet_writer_options_builder_base<chunked_parquet_writer_options_builder,
+                                                   chunked_parquet_writer_options>;
+
+parquet_writer_options_builder::parquet_writer_options_builder(sink_info const& sink,
+                                                               table_view const& table)
+  : parquet_writer_options_builder_base(parquet_writer_options{sink, table})
 {
-  options.set_max_dictionary_size(val);
-  return *this;
 }
 
-chunked_parquet_writer_options_builder& chunked_parquet_writer_options_builder::write_v2_headers(
-  bool enabled)
+parquet_writer_options_builder& parquet_writer_options_builder::partitions(
+  std::vector<partition_info> partitions)
 {
-  options.enable_write_v2_headers(enabled);
+  get_options().set_partitions(std::move(partitions));
   return *this;
 }
 
-chunked_parquet_writer_options_builder& chunked_parquet_writer_options_builder::sorting_columns(
-  std::vector<sorting_column> sorting_columns)
+parquet_writer_options_builder& parquet_writer_options_builder::column_chunks_file_paths(
+  std::vector<std::string> file_paths)
 {
-  options._sorting_columns = std::move(sorting_columns);
+  get_options().set_column_chunks_file_paths(std::move(file_paths));
   return *this;
 }
 
-chunked_parquet_writer_options_builder&
-chunked_parquet_writer_options_builder::max_page_fragment_size(size_type val)
+chunked_parquet_writer_options::chunked_parquet_writer_options(sink_info const& sink)
+  : parquet_writer_options_base(sink)
+{
+}
+
+chunked_parquet_writer_options_builder::chunked_parquet_writer_options_builder(
+  sink_info const& sink)
+  : parquet_writer_options_builder_base(chunked_parquet_writer_options{sink})
 {
-  options.set_max_page_fragment_size(val);
-  return *this;
 }
 
 }  // namespace cudf::io
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd
index fb98650308a..36654457995 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd
@@ -66,24 +66,19 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
     cdef cudf_io_types.table_with_metadata read_parquet(
         parquet_reader_options args) except +
 
-    cdef cppclass parquet_writer_options:
-        parquet_writer_options() except +
+    cdef cppclass parquet_writer_options_base:
+        parquet_writer_options_base() except +
         cudf_io_types.sink_info get_sink_info() except +
         cudf_io_types.compression_type get_compression() except +
         cudf_io_types.statistics_freq get_stats_level() except +
-        cudf_table_view.table_view get_table() except +
         const optional[cudf_io_types.table_input_metadata]& get_metadata(
         ) except +
-        string get_column_chunks_file_paths() except +
         size_t get_row_group_size_bytes() except +
         size_type get_row_group_size_rows() except +
         size_t get_max_page_size_bytes() except +
         size_type get_max_page_size_rows() except +
         size_t get_max_dictionary_size() except +
 
-        void set_partitions(
-            vector[cudf_io_types.partition_info] partitions
-        ) except +
         void set_metadata(
             cudf_io_types.table_input_metadata m
         ) except +
@@ -96,9 +91,6 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
         void set_compression(
             cudf_io_types.compression_type compression
         ) except +
-        void set_column_chunks_file_paths(
-            vector[string] column_chunks_file_paths
-        ) except +
         void set_int96_timestamps(
             bool enabled
         ) except +
@@ -113,161 +105,112 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
         void enable_write_v2_headers(bool val) except +
         void set_dictionary_policy(cudf_io_types.dictionary_policy policy) except +
 
+    cdef cppclass parquet_writer_options(parquet_writer_options_base):
+        parquet_writer_options() except +
+        cudf_table_view.table_view get_table() except +
+        string get_column_chunks_file_paths() except +
+        void set_partitions(
+            vector[cudf_io_types.partition_info] partitions
+        ) except +
+        void set_column_chunks_file_paths(
+            vector[string] column_chunks_file_paths
+        ) except +
+
         @staticmethod
         parquet_writer_options_builder builder(
             cudf_io_types.sink_info sink_,
             cudf_table_view.table_view table_
         ) except +
 
-    cdef cppclass parquet_writer_options_builder:
-
+    cdef cppclass parquet_writer_options_builder_base[BuilderT, OptionsT]:
         parquet_writer_options_builder() except +
-        parquet_writer_options_builder(
-            cudf_io_types.sink_info sink_,
-            cudf_table_view.table_view table_
-        ) except +
-        parquet_writer_options_builder& partitions(
-            vector[cudf_io_types.partition_info] partitions
-        ) except +
-        parquet_writer_options_builder& metadata(
+
+        BuilderT& metadata(
             cudf_io_types.table_input_metadata m
         ) except +
-        parquet_writer_options_builder& key_value_metadata(
+        BuilderT& key_value_metadata(
             vector[map[string, string]] kvm
         ) except +
-        parquet_writer_options_builder& stats_level(
+        BuilderT& stats_level(
             cudf_io_types.statistics_freq sf
         ) except +
-        parquet_writer_options_builder& compression(
+        BuilderT& compression(
             cudf_io_types.compression_type compression
         ) except +
-        parquet_writer_options_builder& column_chunks_file_paths(
-            vector[string] column_chunks_file_paths
-        ) except +
-        parquet_writer_options_builder& int96_timestamps(
+        BuilderT& int96_timestamps(
             bool enabled
         ) except +
-        parquet_writer_options_builder& utc_timestamps(
+        BuilderT& utc_timestamps(
             bool enabled
         ) except +
-        parquet_writer_options_builder& row_group_size_bytes(
+        BuilderT& row_group_size_bytes(
             size_t val
         ) except +
-        parquet_writer_options_builder& row_group_size_rows(
+        BuilderT& row_group_size_rows(
             size_type val
         ) except +
-        parquet_writer_options_builder& max_page_size_bytes(
+        BuilderT& max_page_size_bytes(
             size_t val
         ) except +
-        parquet_writer_options_builder& max_page_size_rows(
+        BuilderT& max_page_size_rows(
             size_type val
         ) except +
-        parquet_writer_options_builder& max_dictionary_size(
+        BuilderT& max_dictionary_size(
             size_t val
         ) except +
-        parquet_writer_options_builder& write_v2_headers(
+        BuilderT& write_v2_headers(
             bool val
         ) except +
-        parquet_writer_options_builder& dictionary_policy(
+        BuilderT& dictionary_policy(
             cudf_io_types.dictionary_policy val
         ) except +
+        # FIXME: the following two functions actually belong in
+        # parquet_writer_options_builder, but placing them there yields a
+        # "'parquet_writer_options_builder' is not a type identifier" error.
+        # This is probably a bug in cython since a simpler CRTP example that
+        # has methods returning references to a child class seem to work.
+        # Calling these from the chunked options builder will fail at compile
+        # time, so this should be safe.
+        # NOTE: these two are never actually called from libcudf. Instead these
+        # properties are set in the options after calling build(), so perhaps
+        # they can be removed.
+        BuilderT& partitions(
+            vector[cudf_io_types.partition_info] partitions
+        ) except +
+        BuilderT& column_chunks_file_paths(
+            vector[string] column_chunks_file_paths
+        ) except +
+        OptionsT build() except +
 
-        parquet_writer_options build() except +
+    cdef cppclass parquet_writer_options_builder(
+            parquet_writer_options_builder_base[parquet_writer_options_builder,
+                                                parquet_writer_options]):
+        parquet_writer_options_builder() except +
+        parquet_writer_options_builder(
+            cudf_io_types.sink_info sink_,
+            cudf_table_view.table_view table_
+        ) except +
 
     cdef unique_ptr[vector[uint8_t]] write_parquet(
         parquet_writer_options args
     ) except +
 
-    cdef cppclass chunked_parquet_writer_options:
+    cdef cppclass chunked_parquet_writer_options(parquet_writer_options_base):
         chunked_parquet_writer_options() except +
-        cudf_io_types.sink_info get_sink() except +
-        cudf_io_types.compression_type get_compression() except +
-        cudf_io_types.statistics_freq get_stats_level() except +
-        const optional[cudf_io_types.table_input_metadata]& get_metadata(
-        ) except +
-        size_t get_row_group_size_bytes() except +
-        size_type get_row_group_size_rows() except +
-        size_t get_max_page_size_bytes() except +
-        size_type get_max_page_size_rows() except +
-        size_t get_max_dictionary_size() except +
-
-        void set_metadata(
-            cudf_io_types.table_input_metadata m
-        ) except +
-        void set_key_value_metadata(
-            vector[map[string, string]] kvm
-        ) except +
-        void set_stats_level(
-            cudf_io_types.statistics_freq sf
-        ) except +
-        void set_compression(
-            cudf_io_types.compression_type compression
-        ) except +
-        void set_int96_timestamps(
-            bool enabled
-        ) except +
-        void set_utc_timestamps(
-            bool enabled
-        ) except +
-        void set_row_group_size_bytes(size_t val) except +
-        void set_row_group_size_rows(size_type val) except +
-        void set_max_page_size_bytes(size_t val) except +
-        void set_max_page_size_rows(size_type val) except +
-        void set_max_dictionary_size(size_t val) except +
-        void enable_write_v2_headers(bool val) except +
-        void set_dictionary_policy(cudf_io_types.dictionary_policy policy) except +
 
         @staticmethod
         chunked_parquet_writer_options_builder builder(
             cudf_io_types.sink_info sink_,
         ) except +
 
-    cdef cppclass chunked_parquet_writer_options_builder:
+    cdef cppclass chunked_parquet_writer_options_builder(
+            parquet_writer_options_builder_base[chunked_parquet_writer_options_builder,
+                                                chunked_parquet_writer_options]
+            ):
         chunked_parquet_writer_options_builder() except +
         chunked_parquet_writer_options_builder(
             cudf_io_types.sink_info sink_,
         ) except +
-        chunked_parquet_writer_options_builder& metadata(
-            cudf_io_types.table_input_metadata m
-        ) except +
-        chunked_parquet_writer_options_builder& key_value_metadata(
-            vector[map[string, string]] kvm
-        ) except +
-        chunked_parquet_writer_options_builder& stats_level(
-            cudf_io_types.statistics_freq sf
-        ) except +
-        chunked_parquet_writer_options_builder& compression(
-            cudf_io_types.compression_type compression
-        ) except +
-        chunked_parquet_writer_options_builder& int96_timestamps(
-            bool enabled
-        ) except +
-        chunked_parquet_writer_options_builder& utc_timestamps(
-            bool enabled
-        ) except +
-        chunked_parquet_writer_options_builder& row_group_size_bytes(
-            size_t val
-        ) except +
-        chunked_parquet_writer_options_builder& row_group_size_rows(
-            size_type val
-        ) except +
-        chunked_parquet_writer_options_builder& max_page_size_bytes(
-            size_t val
-        ) except +
-        chunked_parquet_writer_options_builder& max_page_size_rows(
-            size_type val
-        ) except +
-        chunked_parquet_writer_options_builder& max_dictionary_size(
-            size_t val
-        ) except +
-        parquet_writer_options_builder& write_v2_headers(
-            bool val
-        ) except +
-        parquet_writer_options_builder& dictionary_policy(
-            cudf_io_types.dictionary_policy val
-        ) except +
-
-        chunked_parquet_writer_options build() except +
 
     cdef cppclass parquet_chunked_writer:
         parquet_chunked_writer() except +

From ae12634c834a82d3d8884110c9de07d91877c828 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Mon, 10 Jun 2024 09:51:28 -0400
Subject: [PATCH 325/842] Fix large strings handling in
 nvtext::character_tokenize (#15829)

Fix logic for `nvtext::character_tokenize` to handle large strings input. The output for > 2GB input strings column will turn characters into rows and so will likely overflow the `size_type` rows as expected. The `thrust::count_if` is replaced with a raw kernel to produce the appropriate count that can be checked against max row size.
Also changed the API to not accept null rows since the code does not check for them and can return invalid results for inputs with unsanitized-null rows.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Yunsong Wang (https://github.com/PointKernel)
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/15829
---
 cpp/benchmarks/text/tokenize.cpp              |  6 +-
 cpp/include/nvtext/tokenize.hpp               |  3 +-
 cpp/src/text/tokenize.cu                      | 66 ++++++++++++++-----
 cpp/tests/text/tokenize_tests.cpp             | 10 +--
 python/cudf/cudf/core/column/string.py        | 13 ++--
 .../cudf/cudf/tests/text/test_text_methods.py |  2 -
 6 files changed, 66 insertions(+), 34 deletions(-)

diff --git a/cpp/benchmarks/text/tokenize.cpp b/cpp/benchmarks/text/tokenize.cpp
index 2151b28d637..e83310e0343 100644
--- a/cpp/benchmarks/text/tokenize.cpp
+++ b/cpp/benchmarks/text/tokenize.cpp
@@ -39,8 +39,10 @@ static void bench_tokenize(nvbench::state& state)
     state.skip("Skip benchmarks greater than size_type limit");
   }
 
-  data_profile const profile = data_profile_builder().distribution(
-    cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
+  data_profile const profile =
+    data_profile_builder()
+      .distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width)
+      .no_validity();
   auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile);
   cudf::strings_column_view input(column->view());
 
diff --git a/cpp/include/nvtext/tokenize.hpp b/cpp/include/nvtext/tokenize.hpp
index ea1b9c716f0..29fed0759c7 100644
--- a/cpp/include/nvtext/tokenize.hpp
+++ b/cpp/include/nvtext/tokenize.hpp
@@ -176,7 +176,8 @@ std::unique_ptr<cudf::column> count_tokens(
  * t is now ["h","e","l","l","o"," ","w","o","r","l","d","g","o","o","d","b","y","e"]
  * @endcode
  *
- * All null row entries are ignored and the output contains all valid rows.
+ * @throw std::invalid_argument if `input` contains nulls
+ * @throw std::overflow_error if the output would produce more than max size_type rows
  *
  * @param input Strings column to tokenize
  * @param stream CUDA stream used for device memory operations and kernel launches
diff --git a/cpp/src/text/tokenize.cu b/cpp/src/text/tokenize.cu
index 0b16305a81a..25406bce759 100644
--- a/cpp/src/text/tokenize.cu
+++ b/cpp/src/text/tokenize.cu
@@ -21,6 +21,7 @@
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/utilities/algorithm.cuh>
+#include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/strings/detail/strings_column_factories.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
@@ -35,6 +36,7 @@
 #include <rmm/exec_policy.hpp>
 #include <rmm/resource_ref.hpp>
 
+#include <cuda/atomic>
 #include <thrust/copy.h>
 #include <thrust/count.h>
 #include <thrust/for_each.h>
@@ -99,6 +101,31 @@ std::unique_ptr<cudf::column> tokenize_fn(cudf::size_type strings_count,
   return cudf::strings::detail::make_strings_column(tokens.begin(), tokens.end(), stream, mr);
 }
 
+constexpr int64_t block_size       = 512;  // number of threads per block
+constexpr int64_t bytes_per_thread = 4;    // bytes processed per thread
+
+CUDF_KERNEL void count_characters(uint8_t const* d_chars, int64_t chars_bytes, int64_t* d_output)
+{
+  auto const idx      = cudf::detail::grid_1d::global_thread_id();
+  auto const byte_idx = static_cast<int64_t>(idx) * bytes_per_thread;
+  auto const lane_idx = static_cast<cudf::size_type>(threadIdx.x);
+
+  using block_reduce = cub::BlockReduce<int64_t, block_size>;
+  __shared__ typename block_reduce::TempStorage temp_storage;
+
+  int64_t count = 0;
+  // each thread processes multiple bytes
+  for (auto i = byte_idx; (i < (byte_idx + bytes_per_thread)) && (i < chars_bytes); ++i) {
+    count += cudf::strings::detail::is_begin_utf8_char(d_chars[i]);
+  }
+  auto const total = block_reduce(temp_storage).Reduce(count, cub::Sum());
+
+  if ((lane_idx == 0) && (total > 0)) {
+    cuda::atomic_ref<int64_t, cuda::thread_scope_block> ref{*d_output};
+    ref.fetch_add(total, cuda::std::memory_order_relaxed);
+  }
+}
+
 }  // namespace
 
 // detail APIs
@@ -176,11 +203,17 @@ std::unique_ptr<cudf::column> character_tokenize(cudf::strings_column_view const
     return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});
   }
 
-  auto offsets = strings_column.offsets();
-  auto offset  = cudf::strings::detail::get_offset_value(offsets, strings_column.offset(), stream);
-  auto chars_bytes = cudf::strings::detail::get_offset_value(
-                       offsets, strings_column.offset() + strings_count, stream) -
-                     offset;
+  CUDF_EXPECTS(
+    strings_column.null_count() == 0, "input must not contain nulls", std::invalid_argument);
+
+  auto const offsets = strings_column.offsets();
+  auto const offset =
+    cudf::strings::detail::get_offset_value(offsets, strings_column.offset(), stream);
+  auto const chars_bytes = cudf::strings::detail::get_offset_value(
+                             offsets, strings_column.offset() + strings_count, stream) -
+                           offset;
+  // no bytes -- this could happen in an all-empty column
+  if (chars_bytes == 0) { return cudf::make_empty_column(cudf::type_id::STRING); }
   auto d_chars =
     strings_column.parent().data<uint8_t>();  // unsigned is necessary for checking bits
   d_chars += offset;
@@ -188,23 +221,26 @@ std::unique_ptr<cudf::column> character_tokenize(cudf::strings_column_view const
   // To minimize memory, count the number of characters so we can
   // build the output offsets without an intermediate buffer.
   // In the worst case each byte is a character so the output is 4x the input.
-  cudf::size_type num_characters = thrust::count_if(
-    rmm::exec_policy(stream), d_chars, d_chars + chars_bytes, [] __device__(uint8_t byte) {
-      return cudf::strings::detail::is_begin_utf8_char(byte);
-    });
+  rmm::device_scalar<int64_t> d_count(0, stream);
+  auto const num_blocks = cudf::util::div_rounding_up_safe(
+    cudf::util::div_rounding_up_safe(chars_bytes, static_cast<int64_t>(bytes_per_thread)),
+    block_size);
+  count_characters<<<num_blocks, block_size, 0, stream.value()>>>(
+    d_chars, chars_bytes, d_count.data());
+  auto const num_characters = d_count.value(stream);
 
-  // no characters check -- this could happen in all-empty or all-null strings column
-  if (num_characters == 0) {
-    return cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});
-  }
+  // number of characters becomes the number of rows so need to check the row limit
+  CUDF_EXPECTS(
+    num_characters + 1 < static_cast<int64_t>(std::numeric_limits<cudf::size_type>::max()),
+    "output exceeds the column size limit",
+    std::overflow_error);
 
   // create output offsets column
-  // -- conditionally copy a counting iterator where
-  //    the first byte of each character is located
   auto offsets_column = cudf::make_numeric_column(
     offsets.type(), num_characters + 1, cudf::mask_state::UNALLOCATED, stream, mr);
   auto d_new_offsets =
     cudf::detail::offsetalator_factory::make_output_iterator(offsets_column->mutable_view());
+  // offsets are at the beginning byte of each character
   cudf::detail::copy_if_safe(
     thrust::counting_iterator<int64_t>(0),
     thrust::counting_iterator<int64_t>(chars_bytes + 1),
diff --git a/cpp/tests/text/tokenize_tests.cpp b/cpp/tests/text/tokenize_tests.cpp
index 6a6bcda87cc..a59a54169d7 100644
--- a/cpp/tests/text/tokenize_tests.cpp
+++ b/cpp/tests/text/tokenize_tests.cpp
@@ -111,17 +111,13 @@ TEST_F(TextTokenizeTest, TokenizeErrorTest)
 
 TEST_F(TextTokenizeTest, CharacterTokenize)
 {
-  std::vector<char const*> h_strings{"the mousé ate the cheese", nullptr, ""};
-  cudf::test::strings_column_wrapper strings(
-    h_strings.begin(),
-    h_strings.end(),
-    thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
+  cudf::test::strings_column_wrapper input({"the mousé ate the cheese", ""});
 
   cudf::test::strings_column_wrapper expected{"t", "h", "e", " ", "m", "o", "u", "s",
                                               "é", " ", "a", "t", "e", " ", "t", "h",
                                               "e", " ", "c", "h", "e", "e", "s", "e"};
 
-  auto results = nvtext::character_tokenize(cudf::strings_column_view(strings));
+  auto results = nvtext::character_tokenize(cudf::strings_column_view(input));
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
 }
 
@@ -151,8 +147,6 @@ TEST_F(TextTokenizeTest, TokenizeEmptyTest)
   EXPECT_EQ(results->size(), 0);
   results = nvtext::character_tokenize(all_empty);
   EXPECT_EQ(results->size(), 0);
-  results = nvtext::character_tokenize(all_null);
-  EXPECT_EQ(results->size(), 0);
   auto const delimiter = cudf::string_scalar{""};
   results              = nvtext::tokenize_with_vocabulary(view, all_empty, delimiter);
   EXPECT_EQ(results->size(), 0);
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index d12aa80e9a3..ad7dbe5e52e 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -552,16 +552,17 @@ def join(
         return self._return_or_inplace(data)
 
     def _split_by_character(self):
-        result_col = libstrings.character_tokenize(self._column)
+        col = self._column.fillna("")  # sanitize nulls
+        result_col = libstrings.character_tokenize(col)
 
-        offset_col = self._column.children[0]
+        offset_col = col.children[0]
 
         return cudf.core.column.ListColumn(
-            size=len(self._column),
-            dtype=cudf.ListDtype(self._column.dtype),
-            mask=self._column.mask,
+            size=len(col),
+            dtype=cudf.ListDtype(col.dtype),
+            mask=col.mask,
             offset=0,
-            null_count=self._column.null_count,
+            null_count=0,
             children=(offset_col, result_col),
         )
 
diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py
index 6bd3b99bae1..36f7f3de828 100644
--- a/python/cudf/cudf/tests/text/test_text_methods.py
+++ b/python/cudf/cudf/tests/text/test_text_methods.py
@@ -426,7 +426,6 @@ def test_character_tokenize_series():
         [
             "hello world",
             "sdf",
-            None,
             (
                 "goodbye, one-two:three~four+five_six@sev"
                 "en#eight^nine heŒŽ‘•™œ$µ¾ŤƠé Ǆ"
@@ -543,7 +542,6 @@ def test_character_tokenize_index():
         [
             "hello world",
             "sdf",
-            None,
             (
                 "goodbye, one-two:three~four+five_six@sev"
                 "en#eight^nine heŒŽ‘•™œ$µ¾ŤƠé Ǆ"

From 9b2c35f346b91b598238cbf54e40a463820708c0 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Mon, 10 Jun 2024 11:40:08 -0500
Subject: [PATCH 326/842] Support arbitrary CUDA versions in UDF code (#15950)

This PR eliminates the manual mapping from PTX versions to CUDA versions, to help support CUDA 12.5 and newer without requiring a manual update to `_numba.py` for every CUDA release. This also updates the minimum compute capability PTX file from arch 60 to arch 70, since that is now the minimum required by RAPIDS.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Graham Markall (https://github.com/gmarkall)
  - https://github.com/brandon-b-miller

URL: https://github.com/rapidsai/cudf/pull/15950
---
 .../_lib/pylibcudf/libcudf/strings_udf.pxd    |  1 +
 python/cudf/cudf/_lib/strings_udf.pyx         |  5 ++
 python/cudf/cudf/utils/_numba.py              | 84 +++----------------
 python/cudf/udf_cpp/CMakeLists.txt            |  2 +-
 .../include/cudf/strings/udf/udf_apis.hpp     |  9 +-
 .../strings/src/strings/udf/udf_apis.cu       |  2 +
 6 files changed, 30 insertions(+), 73 deletions(-)

diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings_udf.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings_udf.pxd
index b895d5e6925..804ad30dfb1 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings_udf.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings_udf.pxd
@@ -18,6 +18,7 @@ cdef extern from "cudf/strings/udf/udf_string.hpp" namespace \
 
 cdef extern from "cudf/strings/udf/udf_apis.hpp"  namespace \
         "cudf::strings::udf" nogil:
+    cdef int get_cuda_build_version() except +
     cdef unique_ptr[device_buffer] to_string_view_array(column_view) except +
     cdef unique_ptr[column] column_from_udf_string_array(
         udf_string* strings, size_type size,
diff --git a/python/cudf/cudf/_lib/strings_udf.pyx b/python/cudf/cudf/_lib/strings_udf.pyx
index e952492c45d..7610cad0b40 100644
--- a/python/cudf/cudf/_lib/strings_udf.pyx
+++ b/python/cudf/cudf/_lib/strings_udf.pyx
@@ -22,11 +22,16 @@ from cudf._lib.pylibcudf.libcudf.column.column cimport column, column_view
 from cudf._lib.pylibcudf.libcudf.strings_udf cimport (
     column_from_udf_string_array as cpp_column_from_udf_string_array,
     free_udf_string_array as cpp_free_udf_string_array,
+    get_cuda_build_version as cpp_get_cuda_build_version,
     to_string_view_array as cpp_to_string_view_array,
     udf_string,
 )
 
 
+def get_cuda_build_version():
+    return cpp_get_cuda_build_version()
+
+
 def column_to_string_view_array(Column strings_col):
     cdef unique_ptr[device_buffer] c_buffer
     cdef column_view input_view = strings_col.view()
diff --git a/python/cudf/cudf/utils/_numba.py b/python/cudf/cudf/utils/_numba.py
index 494b48b3cfd..d9dde58d998 100644
--- a/python/cudf/cudf/utils/_numba.py
+++ b/python/cudf/cudf/utils/_numba.py
@@ -12,16 +12,14 @@
 # strings_udf. This is the easiest way to break an otherwise circular import
 # loop of _lib.*->cudautils->_numba->_lib.strings_udf
 @lru_cache
-def _get_cc_60_ptx_file():
+def _get_cuda_build_version():
     from cudf._lib import strings_udf
 
-    return os.path.join(
-        os.path.dirname(strings_udf.__file__),
-        "..",
-        "core",
-        "udf",
-        "shim_60.ptx",
-    )
+    # The version is an integer, parsed as 1000 * major + 10 * minor
+    cuda_build_version = strings_udf.get_cuda_build_version()
+    cuda_major_version = cuda_build_version // 1000
+    cuda_minor_version = (cuda_build_version % 1000) // 10
+    return (cuda_major_version, cuda_minor_version)
 
 
 def _get_best_ptx_file(archs, max_compute_capability):
@@ -38,8 +36,8 @@ def _get_best_ptx_file(archs, max_compute_capability):
 
 def _get_ptx_file(path, prefix):
     if "RAPIDS_NO_INITIALIZE" in os.environ:
-        # cc=60 ptx is always built
-        cc = int(os.environ.get("STRINGS_UDF_CC", "60"))
+        # cc=70 ptx is always built
+        cc = int(os.environ.get("STRINGS_UDF_CC", "70"))
     else:
         from numba import cuda
 
@@ -120,15 +118,13 @@ def _setup_numba():
     versions = safe_get_versions()
     if versions != NO_DRIVER:
         driver_version, runtime_version = versions
-        ptx_toolkit_version = _get_cuda_version_from_ptx_file(
-            _get_cc_60_ptx_file()
-        )
+        shim_ptx_cuda_version = _get_cuda_build_version()
 
         # MVC is required whenever any PTX is newer than the driver
-        # This could be the shipped PTX file or the PTX emitted by
-        # the version of NVVM on the user system, the latter aligning
-        # with the runtime version
-        if (driver_version < ptx_toolkit_version) or (
+        # This could be the shipped shim PTX file (determined by the CUDA
+        # version used at build time) or the PTX emitted by the version of NVVM
+        # on the user system (determined by the user's CUDA runtime version)
+        if (driver_version < shim_ptx_cuda_version) or (
             driver_version < runtime_version
         ):
             if driver_version < (12, 0):
@@ -139,60 +135,6 @@ def _setup_numba():
                 patch_numba_linker()
 
 
-def _get_cuda_version_from_ptx_file(path):
-    """
-    https://docs.nvidia.com/cuda/parallel-thread-execution/
-    Each PTX module must begin with a .version
-    directive specifying the PTX language version
-
-    example header:
-    //
-    // Generated by NVIDIA NVVM Compiler
-    //
-    // Compiler Build ID: CL-31057947
-    // Cuda compilation tools, release 11.6, V11.6.124
-    // Based on NVVM 7.0.1
-    //
-
-    .version 7.6
-    .target sm_52
-    .address_size 64
-
-    """
-    with open(path) as ptx_file:
-        for line in ptx_file:
-            if line.startswith(".version"):
-                ver_line = line
-                break
-        else:
-            raise ValueError("Could not read CUDA version from ptx file.")
-    version = ver_line.strip("\n").split(" ")[1]
-    # This dictionary maps from supported versions of NVVM to the
-    # PTX version it produces. The lowest value should be the minimum
-    # CUDA version required to compile the library. Currently CUDA 11.5
-    # or higher is required to build cudf. New CUDA versions should
-    # be added to this dictionary when officially supported.
-    ver_map = {
-        "7.5": (11, 5),
-        "7.6": (11, 6),
-        "7.7": (11, 7),
-        "7.8": (11, 8),
-        "8.0": (12, 0),
-        "8.1": (12, 1),
-        "8.2": (12, 2),
-        "8.3": (12, 3),
-        "8.4": (12, 4),
-    }
-
-    cuda_ver = ver_map.get(version)
-    if cuda_ver is None:
-        raise ValueError(
-            f"Could not map PTX version {version} to a CUDA version"
-        )
-
-    return cuda_ver
-
-
 class _CUDFNumbaConfig:
     def __enter__(self):
         self.CUDA_LOW_OCCUPANCY_WARNINGS = (
diff --git a/python/cudf/udf_cpp/CMakeLists.txt b/python/cudf/udf_cpp/CMakeLists.txt
index fe7f9d0b00d..fa7855cfc65 100644
--- a/python/cudf/udf_cpp/CMakeLists.txt
+++ b/python/cudf/udf_cpp/CMakeLists.txt
@@ -60,7 +60,7 @@ set(SHIM_CUDA_FLAGS --expt-relaxed-constexpr -rdc=true)
 
 # always build a default PTX file in case RAPIDS_NO_INITIALIZE is set and the device cc can't be
 # safely queried through a context
-list(INSERT CMAKE_CUDA_ARCHITECTURES 0 "60")
+list(INSERT CMAKE_CUDA_ARCHITECTURES 0 "70")
 
 list(TRANSFORM CMAKE_CUDA_ARCHITECTURES REPLACE "-real" "")
 list(TRANSFORM CMAKE_CUDA_ARCHITECTURES REPLACE "-virtual" "")
diff --git a/python/cudf/udf_cpp/strings/include/cudf/strings/udf/udf_apis.hpp b/python/cudf/udf_cpp/strings/include/cudf/strings/udf/udf_apis.hpp
index 219dbe27682..8635b1280de 100644
--- a/python/cudf/udf_cpp/strings/include/cudf/strings/udf/udf_apis.hpp
+++ b/python/cudf/udf_cpp/strings/include/cudf/strings/udf/udf_apis.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -27,6 +27,13 @@ namespace cudf {
 namespace strings {
 namespace udf {
 
+/**
+ * @brief Get the CUDA version used at build time.
+ *
+ * @return The CUDA version as an integer, parsed as major * 1000 + minor * 10.
+ */
+int get_cuda_build_version();
+
 class udf_string;
 
 /**
diff --git a/python/cudf/udf_cpp/strings/src/strings/udf/udf_apis.cu b/python/cudf/udf_cpp/strings/src/strings/udf/udf_apis.cu
index 9cf86b5ea48..941e61e6787 100644
--- a/python/cudf/udf_cpp/strings/src/strings/udf/udf_apis.cu
+++ b/python/cudf/udf_cpp/strings/src/strings/udf/udf_apis.cu
@@ -101,6 +101,8 @@ void free_udf_string_array(cudf::strings::udf::udf_string* d_strings,
 
 // external APIs
 
+int get_cuda_build_version() { return CUDA_VERSION; }
+
 std::unique_ptr<rmm::device_buffer> to_string_view_array(cudf::column_view const input)
 {
   return detail::to_string_view_array(input, cudf::get_default_stream());

From e3ba131baf340dfcf575abc99a872cdb36671307 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 10 Jun 2024 06:48:41 -1000
Subject: [PATCH 327/842] Support timezone aware pandas inputs in cudf (#15935)

closes #13611

(This technically does not support pandas objects have interval types that are timezone aware)

@rjzamora let me know if the test I adapted from your PR in https://github.com/rapidsai/cudf/pull/15929 is adequate

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/15935
---
 python/cudf/cudf/core/column/column.py        | 27 +++++--------------
 python/cudf/cudf/core/index.py                | 11 +++-----
 .../cudf/tests/series/test_datetimelike.py    | 13 +++++++++
 python/cudf/cudf/tests/test_datetime.py       | 26 +++---------------
 .../dask_cudf/io/tests/test_parquet.py        | 20 ++++++++++++++
 5 files changed, 48 insertions(+), 49 deletions(-)

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 475d52d0fbb..f87797a1fa3 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -332,10 +332,6 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase:
                 "yet supported in pyarrow, see: "
                 "https://github.com/apache/arrow/issues/20213"
             )
-        elif pa.types.is_timestamp(array.type) and array.type.tz is not None:
-            raise NotImplementedError(
-                "cuDF does not yet support timezone-aware datetimes"
-            )
         elif isinstance(array.type, ArrowIntervalType):
             return cudf.core.column.IntervalColumn.from_arrow(array)
         elif pa.types.is_large_string(array.type):
@@ -992,9 +988,9 @@ def astype(self, dtype: Dtype, copy: bool = False) -> ColumnBase:
             return col
         elif isinstance(dtype, cudf.core.dtypes.DecimalDtype):
             return col.as_decimal_column(dtype)
-        elif np.issubdtype(cast(Any, dtype), np.datetime64):
+        elif dtype.kind == "M":
             return col.as_datetime_column(dtype)
-        elif np.issubdtype(cast(Any, dtype), np.timedelta64):
+        elif dtype.kind == "m":
             return col.as_timedelta_column(dtype)
         elif dtype.kind == "O":
             if cudf.get_option("mode.pandas_compatible") and was_object:
@@ -1846,21 +1842,11 @@ def as_column(
             and arbitrary.freq is not None
         ):
             raise NotImplementedError("freq is not implemented yet")
-        elif (
-            isinstance(arbitrary.dtype, pd.DatetimeTZDtype)
-            or (
-                isinstance(arbitrary.dtype, pd.IntervalDtype)
-                and isinstance(arbitrary.dtype.subtype, pd.DatetimeTZDtype)
-            )
-            or (
-                isinstance(arbitrary.dtype, pd.CategoricalDtype)
-                and isinstance(
-                    arbitrary.dtype.categories.dtype, pd.DatetimeTZDtype
-                )
-            )
+        elif isinstance(arbitrary.dtype, pd.IntervalDtype) and isinstance(
+            arbitrary.dtype.subtype, pd.DatetimeTZDtype
         ):
             raise NotImplementedError(
-                "cuDF does not yet support timezone-aware datetimes"
+                "cuDF does not yet support Intervals with timezone-aware datetimes"
             )
         elif _is_pandas_nullable_extension_dtype(arbitrary.dtype):
             if cudf.get_option("mode.pandas_compatible"):
@@ -1876,7 +1862,8 @@ def as_column(
                 length=length,
             )
         elif isinstance(
-            arbitrary.dtype, (pd.CategoricalDtype, pd.IntervalDtype)
+            arbitrary.dtype,
+            (pd.CategoricalDtype, pd.IntervalDtype, pd.DatetimeTZDtype),
         ):
             return as_column(
                 pa.array(arbitrary, from_pandas=True),
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 7297ac4e929..732e5cdb01a 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -1757,13 +1757,10 @@ def __init__(
         name = _getdefault_name(data, name=name)
         data = column.as_column(data)
 
-        # TODO: Remove this if statement and fix tests now that
-        # there's timezone support
-        if isinstance(data.dtype, pd.DatetimeTZDtype):
-            raise NotImplementedError(
-                "cuDF does not yet support timezone-aware datetimes"
-            )
-        data = data.astype(dtype)
+        # TODO: if data.dtype.kind == "M" (i.e. data is already datetime type)
+        # We probably shouldn't always astype to datetime64[ns]
+        if not isinstance(data.dtype, pd.DatetimeTZDtype):
+            data = data.astype(dtype)
 
         if copy:
             data = data.copy()
diff --git a/python/cudf/cudf/tests/series/test_datetimelike.py b/python/cudf/cudf/tests/series/test_datetimelike.py
index 7ef55761b2b..58ffc610c3c 100644
--- a/python/cudf/cudf/tests/series/test_datetimelike.py
+++ b/python/cudf/cudf/tests/series/test_datetimelike.py
@@ -223,3 +223,16 @@ def test_contains_tz_aware(item, expected):
 def test_tz_convert_naive_typeerror():
     with pytest.raises(TypeError):
         cudf.date_range("2020", periods=2, freq="D").tz_convert(None)
+
+
+@pytest.mark.parametrize(
+    "klass", ["Series", "DatetimeIndex", "Index", "CategoricalIndex"]
+)
+def test_from_pandas_obj_tz_aware(klass):
+    tz_aware_data = [
+        pd.Timestamp("2020-01-01", tz="UTC").tz_convert("US/Pacific")
+    ]
+    pandas_obj = getattr(pd, klass)(tz_aware_data)
+    result = cudf.from_pandas(pandas_obj)
+    expected = getattr(cudf, klass)(tz_aware_data)
+    assert_eq(result, expected)
diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py
index 4186fff038a..e3ecaafae5b 100644
--- a/python/cudf/cudf/tests/test_datetime.py
+++ b/python/cudf/cudf/tests/test_datetime.py
@@ -2088,25 +2088,6 @@ def test_datetime_constructor(data, dtype):
     assert_eq(expected, actual)
 
 
-@pytest.mark.parametrize(
-    "data",
-    [
-        [pd.Timestamp("2001-01-01", tz="America/New_York")],
-        pd.Series(["2001-01-01"], dtype="datetime64[ns, America/New_York]"),
-        pd.Index(["2001-01-01"], dtype="datetime64[ns, America/New_York]"),
-    ],
-)
-def test_construction_from_tz_timestamps(data):
-    with pytest.raises(NotImplementedError):
-        _ = cudf.Series(data)
-    with pytest.raises(NotImplementedError):
-        _ = cudf.Index(data)
-    with pytest.raises(NotImplementedError):
-        _ = cudf.DatetimeIndex(data)
-    with pytest.raises(NotImplementedError):
-        cudf.CategoricalIndex(data)
-
-
 @pytest.mark.parametrize("op", _cmpops)
 def test_datetime_binop_tz_timestamp(op):
     s = cudf.Series([1, 2, 3], dtype="datetime64[ns]")
@@ -2391,13 +2372,14 @@ def test_datetime_raise_warning(freqstr):
         t.dt.ceil(freqstr)
 
 
-def test_timezone_array_notimplemented():
+def test_timezone_pyarrow_array():
     pa_array = pa.array(
         [datetime.datetime(2020, 1, 1, tzinfo=datetime.timezone.utc)],
         type=pa.timestamp("ns", "UTC"),
     )
-    with pytest.raises(NotImplementedError):
-        cudf.Series(pa_array)
+    result = cudf.Series(pa_array)
+    expected = pa_array.to_pandas()
+    assert_eq(result, expected)
 
 
 def test_to_datetime_errors_ignore_deprecated():
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
index 39800145585..f3e3911e6c7 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
@@ -596,3 +596,23 @@ def test_parquet_read_filter_and_project(tmpdir):
     # Check result
     expected = df[(df.a == 5) & (df.c > 20)][columns].reset_index(drop=True)
     dd.assert_eq(got, expected)
+
+
+def test_timezone_column(tmpdir):
+    path = str(tmpdir.join("test.parquet"))
+    pdf = pd.DataFrame(
+        {
+            "time": pd.to_datetime(
+                ["1996-01-02", "1996-12-01"],
+                utc=True,
+            ),
+            "x": [1, 2],
+        }
+    )
+    pdf.to_parquet(path)
+    got = dask_cudf.read_parquet(path)
+    # cudf.read_parquet does not support reading timezone aware types yet
+    assert got["time"].dtype == pd.DatetimeTZDtype("ns", "UTC")
+    got["time"] = got["time"].astype("datetime64[ns]")
+    expected = cudf.read_parquet(path)
+    dd.assert_eq(got, expected)

From f9b0fc3d1986d5ac8994c09229d62063854c0856 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 10 Jun 2024 08:34:15 -1000
Subject: [PATCH 328/842] Preserve column type and class information in more
 DataFrame operations (#15949)

Narrowing down to a pattern of using `ColumnAccessor._from_columns_like_self` to preserve the column information and then calling `Frame._from_data_like_self` to preserve the `.index`/`.name` information.

This is specifically for operations that operates column wise and the result should be the same shape as the input.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/15949
---
 python/cudf/cudf/core/dataframe.py       |   3 +-
 python/cudf/cudf/core/indexed_frame.py   | 131 +++++++++++------------
 python/cudf/cudf/core/window/rolling.py  |  41 ++-----
 python/cudf/cudf/tests/test_dataframe.py |  12 ++-
 4 files changed, 83 insertions(+), 104 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 9307267b227..e1b6cc45dd3 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -2688,6 +2688,7 @@ def _set_columns_like(self, other: ColumnAccessor) -> None:
         self._data = ColumnAccessor(
             data=dict(zip(other.names, self._data.columns)),
             multiindex=other.multiindex,
+            rangeindex=other.rangeindex,
             level_names=other.level_names,
             label_dtype=other.label_dtype,
             verify=False,
@@ -7534,7 +7535,7 @@ def _sample_axis_1(
     def _from_columns_like_self(
         self,
         columns: List[ColumnBase],
-        column_names: abc.Iterable[str],
+        column_names: Optional[abc.Iterable[str]] = None,
         index_names: Optional[List[str]] = None,
         *,
         override_dtypes: Optional[abc.Iterable[Optional[Dtype]]] = None,
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index d898eb4b9c3..fdc78005996 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -40,8 +40,6 @@
 from cudf.api.extensions import no_default
 from cudf.api.types import (
     _is_non_decimal_numeric_dtype,
-    is_bool_dtype,
-    is_decimal_dtype,
     is_dict_like,
     is_list_like,
     is_scalar,
@@ -372,7 +370,6 @@ def _mimic_inplace(
             self._index = result.index
         return super()._mimic_inplace(result, inplace)
 
-    # Scans
     @_cudf_nvtx_annotate
     def _scan(self, op, axis=None, skipna=True):
         """
@@ -417,8 +414,8 @@ def _scan(self, op, axis=None, skipna=True):
         cast_to_int = op in ("cumsum", "cumprod")
         skipna = True if skipna is None else skipna
 
-        results = {}
-        for name, col in self._data.items():
+        results = []
+        for col in self._columns:
             if skipna:
                 result_col = col.nans_to_nulls()
             else:
@@ -429,19 +426,14 @@ def _scan(self, op, axis=None, skipna=True):
                 else:
                     result_col = col
 
-            if (
-                cast_to_int
-                and not is_decimal_dtype(result_col.dtype)
-                and (
-                    np.issubdtype(result_col.dtype, np.integer)
-                    or np.issubdtype(result_col.dtype, np.bool_)
-                )
-            ):
+            if cast_to_int and result_col.dtype.kind in "uib":
                 # For reductions that accumulate a value (e.g. sum, not max)
                 # pandas returns an int64 dtype for all int or bool dtypes.
                 result_col = result_col.astype(np.int64)
-            results[name] = getattr(result_col, op)()
-        return self._from_data(results, self.index)
+            results.append(getattr(result_col, op)())
+        return self._from_data_like_self(
+            self._data._from_columns_like_self(results)
+        )
 
     def _check_data_index_length_match(self) -> None:
         # Validate that the number of rows in the data matches the index if the
@@ -880,7 +872,6 @@ def replace(
                 FutureWarning,
             )
         if not (to_replace is None and value is no_default):
-            copy_data = {}
             (
                 all_na_per_column,
                 to_replace_per_column,
@@ -890,10 +881,10 @@ def replace(
                 value=value,
                 columns_dtype_map=dict(self._dtypes),
             )
-
+            copy_data = []
             for name, col in self._data.items():
                 try:
-                    copy_data[name] = col.find_and_replace(
+                    replaced = col.find_and_replace(
                         to_replace_per_column[name],
                         replacements_per_column[name],
                         all_na_per_column[name],
@@ -906,11 +897,13 @@ def replace(
                     #    that exists in `copy_data`.
                     # ii. There is an OverflowError while trying to cast
                     #     `to_replace_per_column` to `replacements_per_column`.
-                    copy_data[name] = col.copy(deep=True)
+                    replaced = col.copy(deep=True)
+                copy_data.append(replaced)
+            result = self._from_data_like_self(
+                self._data._from_columns_like_self(copy_data)
+            )
         else:
-            copy_data = self._data.copy(deep=True)
-
-        result = self._from_data(copy_data, self.index)
+            result = self.copy()
 
         return self._mimic_inplace(result, inplace=inplace)
 
@@ -1031,12 +1024,13 @@ def clip(self, lower=None, upper=None, inplace=False, axis=1):
             ):
                 lower[0], upper[0] = upper[0], lower[0]
 
-        data = {
-            name: col.clip(lower[i], upper[i])
-            for i, (name, col) in enumerate(self._data.items())
-        }
-        output = self._from_data(data, self.index)
-        output._copy_type_metadata(self, include_index=False)
+        data = (
+            col.clip(low, high)
+            for col, low, high in zip(self._columns, lower, upper)
+        )
+        output = self._from_data_like_self(
+            self._data._from_columns_like_self(data)
+        )
         return self._mimic_inplace(output, inplace=inplace)
 
     @_cudf_nvtx_annotate
@@ -1913,7 +1907,7 @@ def nans_to_nulls(self):
         2  <NA>  <NA>
         """
         result = []
-        for col in self._data.columns:
+        for col in self._columns:
             converted = col.nans_to_nulls()
             if converted is col:
                 converted = converted.copy()
@@ -2028,8 +2022,8 @@ def interpolate(
             )
 
         interpolator = cudf.core.algorithms.get_column_interpolator(method)
-        columns = {}
-        for colname, col in data._data.items():
+        columns = []
+        for col in data._columns:
             if isinstance(col, cudf.core.column.StringColumn):
                 warnings.warn(
                     f"{type(self).__name__}.interpolate with object dtype is "
@@ -2040,9 +2034,12 @@ def interpolate(
                 col = col.astype("float64").fillna(np.nan)
 
             # Interpolation methods may or may not need the index
-            columns[colname] = interpolator(col, index=data.index)
+            columns.append(interpolator(col, index=data.index))
 
-        result = self._from_data(columns, index=data.index)
+        result = self._from_data_like_self(
+            self._data._from_columns_like_self(columns)
+        )
+        result.index = data.index
 
         return (
             result
@@ -2069,8 +2066,8 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None):
         data_columns = (
             col.shift(periods, fill_value) for col in self._columns
         )
-        return self.__class__._from_data(
-            zip(self._column_names, data_columns), self.index
+        return self._from_data_like_self(
+            self._data._from_columns_like_self(data_columns)
         )
 
     @_cudf_nvtx_annotate
@@ -3011,8 +3008,6 @@ def _slice(self, arg: slice, keep_index: bool = True) -> Self:
             self._column_names,
             None if has_range_index or not keep_index else self.index.names,
         )
-        result._data.label_dtype = self._data.label_dtype
-        result._data.rangeindex = self._data.rangeindex
 
         if keep_index and has_range_index:
             result.index = self.index[start:stop]
@@ -3561,11 +3556,6 @@ def sort_values(
             ),
             keep_index=not ignore_index,
         )
-        if (
-            isinstance(self, cudf.core.dataframe.DataFrame)
-            and self._data.multiindex
-        ):
-            out.columns = self._data.to_pandas_index()
         return out
 
     def _n_largest_or_smallest(
@@ -3659,14 +3649,12 @@ def _align_to_index(
             result = result.sort_values(sort_col_id)
             del result[sort_col_id]
 
-        result = self.__class__._from_data(
-            data=result._data, index=result.index
+        out = self._from_data(
+            self._data._from_columns_like_self(result._columns)
         )
-        result._data.multiindex = self._data.multiindex
-        result._data._level_names = self._data._level_names
-        result.index.names = self.index.names
-
-        return result
+        out.index = result.index
+        out.index.names = self.index.names
+        return out
 
     @_cudf_nvtx_annotate
     def _reindex(
@@ -3898,24 +3886,14 @@ def round(self, decimals=0, how="half_even"):
                 "decimals must be an integer, a dict-like or a Series"
             )
 
-        cols = {
-            name: col.round(decimals[name], how=how)
-            if (
-                name in decimals
-                and _is_non_decimal_numeric_dtype(col.dtype)
-                and not is_bool_dtype(col.dtype)
-            )
+        cols = (
+            col.round(decimals[name], how=how)
+            if name in decimals and col.dtype.kind in "fiu"
             else col.copy(deep=True)
             for name, col in self._data.items()
-        }
-
-        return self.__class__._from_data(
-            data=cudf.core.column_accessor.ColumnAccessor(
-                cols,
-                multiindex=self._data.multiindex,
-                level_names=self._data.level_names,
-            ),
-            index=self.index,
+        )
+        return self._from_data_like_self(
+            self._data._from_columns_like_self(cols)
         )
 
     def resample(
@@ -6238,6 +6216,8 @@ def rank(
                 f"axis={axis} is not yet supported in rank"
             )
 
+        num_cols = self._num_columns
+        dropped_cols = False
         source = self
         if numeric_only:
             if isinstance(
@@ -6255,15 +6235,28 @@ def rank(
             source = self._get_columns_by_label(numeric_cols)
             if source.empty:
                 return source.astype("float64")
+            elif source._num_columns != num_cols:
+                dropped_cols = True
 
         result_columns = libcudf.sort.rank_columns(
             [*source._columns], method_enum, na_option, ascending, pct
         )
 
-        return self.__class__._from_data(
-            dict(zip(source._column_names, result_columns)),
-            index=source.index,
-        ).astype(np.float64)
+        if dropped_cols:
+            result = type(source)._from_data(
+                ColumnAccessor(
+                    dict(zip(source._column_names, result_columns)),
+                    multiindex=self._data.multiindex,
+                    level_names=self._data.level_names,
+                    label_dtype=self._data.label_dtype,
+                ),
+            )
+        else:
+            result = source._from_data_like_self(
+                self._data._from_columns_like_self(result_columns)
+            )
+        result.index = source.index
+        return result.astype(np.float64)
 
     def convert_dtypes(
         self,
diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py
index 2037b1682db..7d140a1ffa5 100644
--- a/python/cudf/cudf/core/window/rolling.py
+++ b/python/cudf/cudf/core/window/rolling.py
@@ -1,7 +1,5 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION
 
-import itertools
-
 import numba
 import pandas as pd
 from pandas.api.indexers import BaseIndexer
@@ -251,27 +249,13 @@ def _apply_agg_column(self, source_column, agg_name):
             agg_params=self.agg_params,
         )
 
-    def _apply_agg_dataframe(self, df, agg_name):
-        return cudf.DataFrame._from_data(
-            {
-                col_name: self._apply_agg_column(col, agg_name)
-                for col_name, col in df._data.items()
-            },
-            index=df.index,
-        )
-
     def _apply_agg(self, agg_name):
-        if isinstance(self.obj, cudf.Series):
-            return cudf.Series._from_data(
-                {
-                    self.obj.name: self._apply_agg_column(
-                        self.obj._column, agg_name
-                    )
-                },
-                index=self.obj.index,
-            )
-        else:
-            return self._apply_agg_dataframe(self.obj, agg_name)
+        applied = (
+            self._apply_agg_column(col, agg_name) for col in self.obj._columns
+        )
+        return self.obj._from_data_like_self(
+            self.obj._data._from_columns_like_self(applied)
+        )
 
     def _reduce(
         self,
@@ -533,18 +517,9 @@ def _window_to_window_sizes(self, window):
             )
 
     def _apply_agg(self, agg_name):
-        index = cudf.MultiIndex.from_frame(
-            cudf.DataFrame(
-                {
-                    key: value
-                    for key, value in itertools.chain(
-                        self._group_keys._data.items(),
-                        self.obj.index._data.items(),
-                    )
-                }
-            )
+        index = cudf.MultiIndex._from_data(
+            {**self._group_keys._data, **self.obj.index._data}
         )
-
         result = super()._apply_agg(agg_name)
         result.index = index
         return result
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index d76d5eb8065..98e9f9881c7 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -10980,7 +10980,7 @@ def test_squeeze(axis, data):
     assert_eq(result, expected)
 
 
-@pytest.mark.parametrize("column", [range(1), np.array([1], dtype=np.int8)])
+@pytest.mark.parametrize("column", [range(1, 2), np.array([1], dtype=np.int8)])
 @pytest.mark.parametrize(
     "operation",
     [
@@ -10991,6 +10991,16 @@ def test_squeeze(axis, data):
         lambda df: abs(df),
         lambda df: -df,
         lambda df: ~df,
+        lambda df: df.cumsum(),
+        lambda df: df.replace(1, 2),
+        lambda df: df.replace(10, 20),
+        lambda df: df.clip(0, 10),
+        lambda df: df.rolling(1).mean(),
+        lambda df: df.interpolate(),
+        lambda df: df.shift(),
+        lambda df: df.sort_values(1),
+        lambda df: df.round(),
+        lambda df: df.rank(),
     ],
 )
 def test_op_preserves_column_metadata(column, operation):

From 58a15a84078c42b331ced4fd4384724d42328258 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Mon, 10 Jun 2024 11:42:11 -0700
Subject: [PATCH 329/842] Explicitly build for all GPU architectures (#15959)

The libcudf conda package is not specifying to build for all supported architectures and is instead letting build.sh fall back to NATIVE. However, because the default behavior of rapids-cmake is to build SASS for all supported architectures if NATIVE is specified but no local architecture is detected, we're still ending up with all of the RAPIDS architectures having SASS built for them. The problem is that we are failing to build PTX for the latest version, which would be produced if we used RAPIDS instead of NATIVE. This PR should resolve that issue.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Robert Maynard (https://github.com/robertmaynard)
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/cudf/pull/15959
---
 conda/recipes/libcudf/build.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/conda/recipes/libcudf/build.sh b/conda/recipes/libcudf/build.sh
index fef3dabd733..a3a0415575b 100644
--- a/conda/recipes/libcudf/build.sh
+++ b/conda/recipes/libcudf/build.sh
@@ -5,5 +5,5 @@ export cudf_ROOT="$(realpath ./cpp/build)"
 
 ./build.sh -n -v \
     libcudf libcudf_kafka benchmarks tests \
-    --build_metrics --incl_cache_stats \
+    --build_metrics --incl_cache_stats --allgpuarch \
     --cmake-args=\"-DCMAKE_INSTALL_LIBDIR=lib -DCUDF_ENABLE_ARROW_S3=ON\"

From 719a8a6934ae5eaeb22764d1bfdeb75893750bae Mon Sep 17 00:00:00 2001
From: Ray Bell <rayjohnbell0@gmail.com>
Date: Mon, 10 Jun 2024 15:57:17 -0400
Subject: [PATCH 330/842] Update PandasCompat.py to resolve references (#15704)

This PR allows the PandasCompat sphinx ext to contain resolved references. For example, you can now add intersphinx mapping to the content of the admonition.

### Motivation

I enjoy connecting the PyData communities and this PR allows for more opportunities to use intersphinx mapping to link back to the pandas docs.

### History

I first tried this in a previous PR (https://github.com/rapidsai/cudf/pull/15383#discussion_r1537888240) and commented here (https://github.com/rapidsai/cudf/pull/15383#issuecomment-2028451487) that I may get around to investigating this further. I finally had to time to work on this and made a bit of progress.

### Testing

I created a separate repo for this at https://github.com/raybellwaves/compatsphinxext which deploys straight to https://raybellwaves.github.io/compatsphinxext you can see it's working as expected here: https://raybellwaves.github.io/compatsphinxext/compat.html. You should be able to fork that and tinker pretty quickly.

### Further work

This could be cleaned up (for example I couldn't get the [source] to display in the admonition as I worked from the latest sphinx todo extension (https://github.com/sphinx-doc/sphinx/blob/master/sphinx/ext/todo.py)). The existing pandas-compat Admonition's could be switched to this if agreed. In addition, the documentation around how to write pandas-compat entries going forward (https://github.com/rapidsai/cudf/blob/branch-24.06/docs/cudf/source/developer_guide/documentation.md#comparing-to-pandas) will also have to be updated.

Longer term the extension could be published and used across RAPIDS libraries where there are differences in compatibility with PyData libraries e.g. pandas, network, scikit-learn to simplify linking to those dos. I'm not sure if I'll have time to work on this though.

Authors:
  - Ray Bell (https://github.com/raybellwaves)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/15704
---
 docs/cudf/source/_ext/PandasCompat.py | 143 +++++++++++++++++---------
 docs/cudf/source/conf.py              |   2 +
 2 files changed, 94 insertions(+), 51 deletions(-)

diff --git a/docs/cudf/source/_ext/PandasCompat.py b/docs/cudf/source/_ext/PandasCompat.py
index af2b16035c3..331495c981e 100644
--- a/docs/cudf/source/_ext/PandasCompat.py
+++ b/docs/cudf/source/_ext/PandasCompat.py
@@ -1,14 +1,20 @@
-# Copyright (c) 2021-2022, NVIDIA CORPORATION
+# Copyright (c) 2021-2024, NVIDIA CORPORATION
 
 # This file is adapted from official sphinx tutorial for `todo` extension:
 # https://www.sphinx-doc.org/en/master/development/tutorials/todo.html
+from __future__ import annotations
+
+from typing import cast
 
 from docutils import nodes
+from docutils.nodes import Element
 from docutils.parsers.rst import Directive
-from sphinx.locale import get_translation
-from sphinx.util.docutils import SphinxDirective
-
-translator = get_translation("sphinx")
+from docutils.parsers.rst.directives.admonitions import BaseAdmonition
+from sphinx import addnodes
+from sphinx.domains import Domain
+from sphinx.errors import NoUri
+from sphinx.locale import _ as get_translation_sphinx
+from sphinx.util.docutils import SphinxDirective, new_document
 
 
 class PandasCompat(nodes.Admonition, nodes.Element):
@@ -32,7 +38,7 @@ def run(self):
         return [PandasCompatList("")]
 
 
-class PandasCompatDirective(SphinxDirective):
+class PandasCompatDirective(BaseAdmonition, SphinxDirective):
 
     # this enables content in the directive
     has_content = True
@@ -43,9 +49,11 @@ def run(self):
 
         PandasCompat_node = PandasCompat("\n".join(self.content))
         PandasCompat_node += nodes.title(
-            translator("Pandas Compatibility Note"),
-            translator("Pandas Compatibility Note"),
+            get_translation_sphinx("Pandas Compatibility Note"),
+            get_translation_sphinx("Pandas Compatibility Note"),
         )
+        PandasCompat_node["docname"] = self.env.docname
+        PandasCompat_node["target"] = targetnode
         self.state.nested_parse(
             self.content, self.content_offset, PandasCompat_node
         )
@@ -84,71 +92,104 @@ def merge_PandasCompats(app, env, docnames, other):
         )
 
 
-def process_PandasCompat_nodes(app, doctree, fromdocname):
-    if not app.config.include_pandas_compat:
-        for node in doctree.traverse(PandasCompat):
-            node.parent.remove(node)
+class PandasCompatDomain(Domain):
+    name = "pandascompat"
+    label = "pandascompat"
 
-    # Replace all PandasCompatList nodes with a list of the collected
-    # PandasCompats. Augment each PandasCompat with a backlink to the
-    # original location.
-    env = app.builder.env
+    @property
+    def pandascompats(self):
+        return self.data.setdefault("pandascompats", {})
 
-    if not hasattr(env, "PandasCompat_all_pandas_compat"):
-        env.PandasCompat_all_pandas_compat = []
+    def clear_doc(self, docname):
+        self.pandascompats.pop(docname, None)
+
+    def merge_domaindata(self, docnames, otherdata):
+        for docname in docnames:
+            self.pandascompats[docname] = otherdata["pandascompats"][docname]
+
+    def process_doc(self, env, docname, document):
+        pandascompats = self.pandascompats.setdefault(docname, [])
+        for pandascompat in document.findall(PandasCompat):
+            env.app.emit("pandascompat-defined", pandascompat)
+            pandascompats.append(pandascompat)
 
-    for node in doctree.traverse(PandasCompatList):
-        if not app.config.include_pandas_compat:
-            node.replace_self([])
-            continue
 
-        content = []
+class PandasCompatListProcessor:
+    def __init__(self, app, doctree, docname):
+        self.builder = app.builder
+        self.config = app.config
+        self.env = app.env
+        self.domain = cast(PandasCompatDomain, app.env.get_domain("pandascompat"))
+        self.document = new_document("")
+        self.process(doctree, docname)
 
-        for PandasCompat_info in env.PandasCompat_all_pandas_compat:
-            para = nodes.paragraph()
+    def process(self, doctree: nodes.document, docname: str) -> None:
+        pandascompats = [v for vals in self.domain.pandascompats.values() for v in vals]
+        for node in doctree.findall(PandasCompatList):
+            if not self.config.include_pandas_compat:
+                node.parent.remove(node)
+                continue
 
-            # Create a reference back to the original docstring
-            newnode = nodes.reference("", "")
-            innernode = nodes.emphasis(
-                translator("[source]"), translator("[source]")
-            )
-            newnode["refdocname"] = PandasCompat_info["docname"]
-            newnode["refuri"] = app.builder.get_relative_uri(
-                fromdocname, PandasCompat_info["docname"]
-            )
-            newnode["refuri"] += "#" + PandasCompat_info["target"]["refid"]
-            newnode.append(innernode)
-            para += newnode
+            content: list[Element | None] = [nodes.target()] if node.get("ids") else []
 
-            # Insert the reference node into PandasCompat node
-            # Note that this node is a deepcopy from the original copy
-            # in the docstring, so changing this does not affect that in the
-            # doc.
-            PandasCompat_info["PandasCompat"].append(para)
+            for pandascompat in pandascompats:
+                # Create a copy of the pandascompat node
+                new_pandascompat = pandascompat.deepcopy()
+                new_pandascompat["ids"].clear()
 
-            # Insert the PandasCompand node into the PandasCompatList Node
-            content.append(PandasCompat_info["PandasCompat"])
+                self.resolve_reference(new_pandascompat, docname)
+                content.append(new_pandascompat)
 
-        node.replace_self(content)
+                ref = self.create_reference(pandascompat, docname)
+                content.append(ref)
+
+            node.replace_self(content)
+
+    def create_reference(self, pandascompat, docname):
+        para = nodes.paragraph()
+        newnode = nodes.reference("", "")
+        innernode = nodes.emphasis(
+            get_translation_sphinx("[source]"), get_translation_sphinx("[source]")
+        )
+        newnode["refdocname"] = pandascompat["docname"]
+        try:
+            newnode["refuri"] = self.builder.get_relative_uri(
+                docname, pandascompat["docname"]
+            ) + "#" + pandascompat["target"]["refid"]
+        except NoUri:
+            # ignore if no URI can be determined, e.g. for LaTeX output
+            pass
+        newnode.append(innernode)
+        para += newnode
+        return para
+
+    def resolve_reference(self, todo, docname: str) -> None:
+        """Resolve references in the todo content."""
+        for node in todo.findall(addnodes.pending_xref):
+            if "refdoc" in node:
+                node["refdoc"] = docname
+
+        # Note: To resolve references, it is needed to wrap it with document node
+        self.document += todo
+        self.env.resolve_references(self.document, docname, self.builder)
+        self.document.remove(todo)
 
 
 def setup(app):
     app.add_config_value("include_pandas_compat", False, "html")
-
     app.add_node(PandasCompatList)
     app.add_node(
         PandasCompat,
         html=(visit_PandasCompat_node, depart_PandasCompat_node),
         latex=(visit_PandasCompat_node, depart_PandasCompat_node),
         text=(visit_PandasCompat_node, depart_PandasCompat_node),
+        man=(visit_PandasCompat_node, depart_PandasCompat_node),
+        texinfo=(visit_PandasCompat_node, depart_PandasCompat_node),
     )
-
-    # Sphinx directives are lower-cased
     app.add_directive("pandas-compat", PandasCompatDirective)
     app.add_directive("pandas-compat-list", PandasCompatListDirective)
-    app.connect("doctree-resolved", process_PandasCompat_nodes)
-    app.connect("env-purge-doc", purge_PandasCompats)
-    app.connect("env-merge-info", merge_PandasCompats)
+    app.add_domain(PandasCompatDomain)
+    app.connect("doctree-resolved", PandasCompatListProcessor)
 
     return {
         "version": "0.1",
diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py
index 73d8b4445d3..e9c760e288e 100644
--- a/docs/cudf/source/conf.py
+++ b/docs/cudf/source/conf.py
@@ -617,6 +617,8 @@ def linkcode_resolve(domain, info) -> str | None:
         f"branch-{version}/python/cudf/cudf/{fn}{linespec}"
     )
 
+# Needed for avoid build warning for PandasCompat extension
+suppress_warnings = ["myst.domains"]
 
 def setup(app):
     app.add_css_file("https://docs.rapids.ai/assets/css/custom.css")

From 570df6c5fbb0a2120b539aba0a65702c2190527f Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 10 Jun 2024 15:24:40 -1000
Subject: [PATCH 331/842] Add typing to single_column_frame (#15965)

Also removes an extra copy from `.flatten()` when calling `.values` or `.values_host`

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Thomas Li (https://github.com/lithomas1)

URL: https://github.com/rapidsai/cudf/pull/15965
---
 python/cudf/cudf/api/types.py                |  7 ++-
 python/cudf/cudf/core/column/column.py       |  4 +-
 python/cudf/cudf/core/single_column_frame.py | 58 ++++++++------------
 3 files changed, 29 insertions(+), 40 deletions(-)

diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py
index 417d8b0922a..42b1524bd76 100644
--- a/python/cudf/cudf/api/types.py
+++ b/python/cudf/cudf/api/types.py
@@ -8,7 +8,7 @@
 from collections import abc
 from functools import wraps
 from inspect import isclass
-from typing import List, Union
+from typing import List, Union, cast
 
 import cupy as cp
 import numpy as np
@@ -238,7 +238,10 @@ def _union_categoricals(
         raise TypeError("ignore_order is not yet implemented")
 
     result_col = cudf.core.column.CategoricalColumn._concat(
-        [obj._column for obj in to_union]
+        [
+            cast(cudf.core.column.CategoricalColumn, obj._column)
+            for obj in to_union
+        ]
     )
     if sort_categories:
         sorted_categories = result_col.categories.sort_values(ascending=True)
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index f87797a1fa3..7abdbc85720 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -212,7 +212,7 @@ def to_pandas(
             return pd.Index(pa_array.to_pandas())
 
     @property
-    def values_host(self) -> "np.ndarray":
+    def values_host(self) -> np.ndarray:
         """
         Return a numpy representation of the Column.
         """
@@ -226,7 +226,7 @@ def values_host(self) -> "np.ndarray":
             return self.data_array_view(mode="read").copy_to_host()
 
     @property
-    def values(self) -> "cupy.ndarray":
+    def values(self) -> cupy.ndarray:
         """
         Return a CuPy representation of the Column.
         """
diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py
index d864b563208..acc74129a29 100644
--- a/python/cudf/cudf/core/single_column_frame.py
+++ b/python/cudf/cudf/core/single_column_frame.py
@@ -7,9 +7,11 @@
 
 import cupy
 import numpy
+import pyarrow as pa
+from typing_extensions import Self
 
 import cudf
-from cudf._typing import Dtype, NotImplementedType, ScalarLike
+from cudf._typing import NotImplementedType, ScalarLike
 from cudf.api.extensions import no_default
 from cudf.api.types import (
     _is_scalar_or_zero_d_array,
@@ -27,8 +29,8 @@
 class SingleColumnFrame(Frame, NotIterable):
     """A one-dimensional frame.
 
-    Frames with only a single column share certain logic that is encoded in
-    this class.
+    Frames with only a single column (Index or Series)
+    share certain logic that is encoded in this class.
     """
 
     _SUPPORT_AXIS_LOOKUP = {
@@ -47,7 +49,7 @@ def _reduce(
         if axis not in (None, 0, no_default):
             raise NotImplementedError("axis parameter is not implemented yet")
 
-        if numeric_only and not is_numeric_dtype(self._column):
+        if numeric_only and not is_numeric_dtype(self.dtype):
             raise TypeError(
                 f"Series.{op} does not allow numeric_only={numeric_only} "
                 "with non-numeric dtypes."
@@ -68,7 +70,7 @@ def _scan(self, op, axis=None, *args, **kwargs):
     @_cudf_nvtx_annotate
     def name(self):
         """Get the name of this object."""
-        return next(iter(self._data.names))
+        return next(iter(self._column_names))
 
     @name.setter  # type: ignore
     @_cudf_nvtx_annotate
@@ -83,7 +85,7 @@ def ndim(self) -> int:  # noqa: D401
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
-    def shape(self):
+    def shape(self) -> tuple[int]:
         """Get a tuple representing the dimensionality of the Index."""
         return (len(self),)
 
@@ -95,45 +97,27 @@ def __bool__(self):
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
-    def _num_columns(self):
+    def _num_columns(self) -> int:
         return 1
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
-    def _column(self):
-        return self._data[self.name]
+    def _column(self) -> ColumnBase:
+        return next(iter(self._columns))
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
-    def values(self):  # noqa: D102
+    def values(self) -> cupy.ndarray:  # noqa: D102
         return self._column.values
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
-    def values_host(self):  # noqa: D102
+    def values_host(self) -> numpy.ndarray:  # noqa: D102
         return self._column.values_host
 
-    @_cudf_nvtx_annotate
-    def to_cupy(
-        self,
-        dtype: Union[Dtype, None] = None,
-        copy: bool = True,
-        na_value=None,
-    ) -> cupy.ndarray:  # noqa: D102
-        return super().to_cupy(dtype, copy, na_value).flatten()
-
-    @_cudf_nvtx_annotate
-    def to_numpy(
-        self,
-        dtype: Union[Dtype, None] = None,
-        copy: bool = True,
-        na_value=None,
-    ) -> numpy.ndarray:  # noqa: D102
-        return super().to_numpy(dtype, copy, na_value).flatten()
-
     @classmethod
     @_cudf_nvtx_annotate
-    def from_arrow(cls, array):
+    def from_arrow(cls, array) -> Self:
         """Create from PyArrow Array/ChunkedArray.
 
         Parameters
@@ -164,7 +148,7 @@ def from_arrow(cls, array):
         return cls(ColumnBase.from_arrow(array))
 
     @_cudf_nvtx_annotate
-    def to_arrow(self):
+    def to_arrow(self) -> pa.Array:
         """
         Convert to a PyArrow Array.
 
@@ -196,7 +180,7 @@ def to_arrow(self):
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
-    def is_unique(self):
+    def is_unique(self) -> bool:
         """Return boolean if values in the object are unique.
 
         Returns
@@ -207,7 +191,7 @@ def is_unique(self):
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
-    def is_monotonic_increasing(self):
+    def is_monotonic_increasing(self) -> bool:
         """Return boolean if values in the object are monotonically increasing.
 
         Returns
@@ -218,7 +202,7 @@ def is_monotonic_increasing(self):
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
-    def is_monotonic_decreasing(self):
+    def is_monotonic_decreasing(self) -> bool:
         """Return boolean if values in the object are monotonically decreasing.
 
         Returns
@@ -243,7 +227,9 @@ def __cuda_array_interface__(self):
             )
 
     @_cudf_nvtx_annotate
-    def factorize(self, sort=False, use_na_sentinel=True):
+    def factorize(
+        self, sort: bool = False, use_na_sentinel: bool = True
+    ) -> tuple[cupy.ndarray, cudf.Index]:
         """Encode the input values as integer labels.
 
         Parameters
@@ -335,7 +321,7 @@ def _make_operands_for_binop(
         return {result_name: (self._column, other, reflect, fill_value)}
 
     @_cudf_nvtx_annotate
-    def nunique(self, dropna: bool = True):
+    def nunique(self, dropna: bool = True) -> int:
         """
         Return count of unique values for the column.
 

From 1bd210d76ab05c669aea230b9287b76a03328efa Mon Sep 17 00:00:00 2001
From: Ben Jarmak <104460670+jarmak-nv@users.noreply.github.com>
Date: Mon, 10 Jun 2024 21:35:46 -0400
Subject: [PATCH 332/842] Add external issue label and project automation
 (#15945)

This PR creates two new GitHub Actions around issue and PR tracking

### `external_issue_labeler.yml`
This action automatically adds a label, currently `External`, to any issue or PR that is opened by someone that is not either an owner, member, or collaborator to the cuDF repo

### `pr_issue_status_automation.yml`
This action uses the [shared workflows](https://github.com/rapidsai/shared-workflows/tree/branch-24.08/.github/workflows) in rapdsai/shared-workflows to, on open/edit/synchronize of an open PR, to:
1. Set the PR to `in progress`
2. Set all linked issues `in progress`
3. Set the PR's sprint to the current iteration
4. Set all linked issues to the current iteration

Edit triggers on edit of the PR description, (so new linked issues will get synchronized to `in progress`). Synchronize triggers on push and rebase events - this really is to cover the "what are we working on right now" because anything we touch goes into the current sprint in the project.

Authors:
  - Ben Jarmak (https://github.com/jarmak-nv)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15945
---
 .github/workflows/external_issue_labeler.yml  | 55 ++++++++++++++++
 .../workflows/pr_issue_status_automation.yml  | 64 +++++++++++++++++++
 2 files changed, 119 insertions(+)
 create mode 100644 .github/workflows/external_issue_labeler.yml
 create mode 100644 .github/workflows/pr_issue_status_automation.yml

diff --git a/.github/workflows/external_issue_labeler.yml b/.github/workflows/external_issue_labeler.yml
new file mode 100644
index 00000000000..e6d987e9f34
--- /dev/null
+++ b/.github/workflows/external_issue_labeler.yml
@@ -0,0 +1,55 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: Label external issues and PRs
+
+on:
+  issues:
+    types:
+      - opened
+
+  pull_request:
+    types:
+      - opened
+
+env:
+  GITHUB_TOKEN: ${{ github.token }}
+
+permissions:
+  issues: write
+  pull-requests: write
+
+jobs:
+  Label-Issue:
+    runs-on: ubuntu-latest
+    # Only run if the issue author is not part of RAPIDS
+    if: ${{ ! contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.issue.author_association)}}
+    steps:
+      - name: add-external-labels
+        run: |
+          issue_url=${{ github.event.issue.html_url }}
+          gh issue edit ${issue_url} --add-label "External"
+        continue-on-error: true
+
+  Label-PR:
+    runs-on: ubuntu-latest
+    # Only run if the issue author is not part of RAPIDS
+    if: ${{ ! contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.pull_request.author_association)}}
+    steps:
+      - name: add-external-labels
+        run: |
+            pr_url=${{ github.event.pull_request.html_url }}
+            gh issue edit ${pr_url} --add-label "External"
+    continue-on-error: true
diff --git a/.github/workflows/pr_issue_status_automation.yml b/.github/workflows/pr_issue_status_automation.yml
new file mode 100644
index 00000000000..aaece1bfa3e
--- /dev/null
+++ b/.github/workflows/pr_issue_status_automation.yml
@@ -0,0 +1,64 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+name: Set PR and Issue Project Fields
+
+on:
+  pull_request_target:
+    # This job runs when a PR is first opened, or it is updated
+    # Only runs if the PR is open (we don't want to update the status of a closed PR)
+    types: [opened, edited, synchronize]
+
+jobs:
+    get-project-id:
+      uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@branch-24.08
+      if: github.event.pull_request.state == 'open'
+      secrets: inherit
+      permissions:
+        contents: read
+      with:
+        PROJECT_ID: "PVT_kwDOAp2shc4AiNzl"
+        ITEM_NODE_ID: "${{ github.event.pull_request.node_id }}"
+
+    update-status:
+      # This job sets the PR and its linked issues to "In Progress" status
+      uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-24.08
+      if: github.event.pull_request.state == 'open'
+      needs: get-project-id
+      with:
+        PROJECT_ID: "PVT_kwDOAp2shc4AiNzl"
+        SINGLE_SELECT_FIELD_ID: "PVTSSF_lADOAp2shc4AiNzlzgaxNac"
+        SINGLE_SELECT_FIELD_NAME: "Status"
+        SINGLE_SELECT_OPTION_VALUE: "In Progress"
+        ITEM_PROJECT_ID: "${{ needs.get-project-id.outputs.ITEM_PROJECT_ID }}"
+        ITEM_NODE_ID: "${{ github.event.pull_request.node_id }}"
+        UPDATE_ITEM: true
+        UPDATE_LINKED_ISSUES: true
+      secrets: inherit
+
+    update-sprint:
+      # This job sets the PR and its linked issues to the current "Weekly Sprint"
+      uses: jarmak-nv/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@branch-24.08
+      if: github.event.pull_request.state == 'open'
+      needs: get-project-id
+      with:
+        PROJECT_ID: "PVT_kwDOAp2shc4AiNzl"
+        ITERATION_FIELD_ID: "PVTIF_lADOAp2shc4AiNzlzgbU_po"
+        ITERATION_FIELD_NAME: "Weekly Sprint"
+        ITEM_PROJECT_ID: "${{ needs.get-project-id.outputs.ITEM_PROJECT_ID }}"
+        ITEM_NODE_ID: "${{ github.event.pull_request.node_id }}"
+        UPDATE_ITEM: true
+        UPDATE_LINKED_ISSUES: true
+      secrets: inherit

From ff1e4bb82ce4ab8ac54bc8715bf761a3700024bc Mon Sep 17 00:00:00 2001
From: Srinivas Yadav <43375352+srinivasyadav18@users.noreply.github.com>
Date: Mon, 10 Jun 2024 19:34:00 -0700
Subject: [PATCH 333/842] Migrate left join and conditional join benchmarks to
 use nvbench (#15931)

The current [left join](https://github.com/rapidsai/cudf/blob/580ee40bf5fe1a66eaba914cdddb718a09193bab/cpp/benchmarks/join/left_join.cu) and [conditional join](https://github.com/rapidsai/cudf/blob/580ee40bf5fe1a66eaba914cdddb718a09193bab/cpp/benchmarks/join/conditional_join.cu) benchmarks are still using gbench.
This PR migrates the **left join** and **conditional join** benchmarks to use **nvbench**.

Closes #15699.

- [x] Migrate from gbench to nvbench
- [x] Similar to #15644, use `JOIN_KEY_TYPE_RANGE`, `JOIN_NULLABLE_RANGE` and `JOIN_SIZE_RANGE` to reduce the number of test cases and simplify the implementation
- [x] Get rid of the dispatching between gbench and nvbench in [join_common.hpp](https://github.com/rapidsai/cudf/blob/580ee40bf5fe1a66eaba914cdddb718a09193bab/cpp/benchmarks/join/join_common.hpp)

Authors:
  - Srinivas Yadav (https://github.com/srinivasyadav18)
  - Yunsong Wang (https://github.com/PointKernel)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - Shruti Shivakumar (https://github.com/shrshi)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15931
---
 cpp/benchmarks/CMakeLists.txt           |   6 +-
 cpp/benchmarks/join/conditional_join.cu | 288 ++++--------------------
 cpp/benchmarks/join/join_common.hpp     |  99 +++-----
 cpp/benchmarks/join/left_join.cu        | 152 ++++---------
 4 files changed, 116 insertions(+), 429 deletions(-)

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 10f645dfec0..49504e53424 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -163,8 +163,10 @@ ConfigureNVBench(
 
 # ##################################################################################################
 # * join benchmark --------------------------------------------------------------------------------
-ConfigureBench(JOIN_BENCH join/left_join.cu join/conditional_join.cu)
-ConfigureNVBench(JOIN_NVBENCH join/join.cu join/mixed_join.cu join/distinct_join.cu)
+ConfigureNVBench(
+  JOIN_NVBENCH join/left_join.cu join/conditional_join.cu join/join.cu join/mixed_join.cu
+  join/distinct_join.cu
+)
 
 # ##################################################################################################
 # * iterator benchmark ----------------------------------------------------------------------------
diff --git a/cpp/benchmarks/join/conditional_join.cu b/cpp/benchmarks/join/conditional_join.cu
index d95fc0a5b59..e332d09d31b 100644
--- a/cpp/benchmarks/join/conditional_join.cu
+++ b/cpp/benchmarks/join/conditional_join.cu
@@ -14,250 +14,44 @@
  * limitations under the License.
  */
 
-#include <benchmarks/join/join_common.hpp>
-
-template <typename Key>
-class ConditionalJoin : public cudf::benchmark {};
-
-// For compatibility with the shared logic for equality (hash) joins, all of
-// the join lambdas defined by these macros accept a null_equality parameter
-// but ignore it (don't forward it to the underlying join implementation)
-// because conditional joins do not use this parameter.
-#define CONDITIONAL_INNER_JOIN_BENCHMARK_DEFINE(name, Key, Nullable) \
-  BENCHMARK_TEMPLATE_DEFINE_F(ConditionalJoin, name, Key)            \
-  (::benchmark::State & st)                                          \
-  {                                                                  \
-    auto join = [](cudf::table_view const& left,                     \
-                   cudf::table_view const& right,                    \
-                   cudf::ast::operation binary_pred,                 \
-                   cudf::null_equality compare_nulls) {              \
-      return cudf::conditional_inner_join(left, right, binary_pred); \
-    };                                                               \
-    BM_join<Key, Nullable, join_t::CONDITIONAL>(st, join);           \
-  }
-
-CONDITIONAL_INNER_JOIN_BENCHMARK_DEFINE(conditional_inner_join_32bit, int32_t, false);
-CONDITIONAL_INNER_JOIN_BENCHMARK_DEFINE(conditional_inner_join_64bit, int64_t, false);
-CONDITIONAL_INNER_JOIN_BENCHMARK_DEFINE(conditional_inner_join_32bit_nulls, int32_t, true);
-CONDITIONAL_INNER_JOIN_BENCHMARK_DEFINE(conditional_inner_join_64bit_nulls, int64_t, true);
-
-#define CONDITIONAL_LEFT_JOIN_BENCHMARK_DEFINE(name, Key, Nullable) \
-  BENCHMARK_TEMPLATE_DEFINE_F(ConditionalJoin, name, Key)           \
-  (::benchmark::State & st)                                         \
-  {                                                                 \
-    auto join = [](cudf::table_view const& left,                    \
-                   cudf::table_view const& right,                   \
-                   cudf::ast::operation binary_pred,                \
-                   cudf::null_equality compare_nulls) {             \
-      return cudf::conditional_left_join(left, right, binary_pred); \
-    };                                                              \
-    BM_join<Key, Nullable, join_t::CONDITIONAL>(st, join);          \
-  }
-
-CONDITIONAL_LEFT_JOIN_BENCHMARK_DEFINE(conditional_left_join_32bit, int32_t, false);
-CONDITIONAL_LEFT_JOIN_BENCHMARK_DEFINE(conditional_left_join_64bit, int64_t, false);
-CONDITIONAL_LEFT_JOIN_BENCHMARK_DEFINE(conditional_left_join_32bit_nulls, int32_t, true);
-CONDITIONAL_LEFT_JOIN_BENCHMARK_DEFINE(conditional_left_join_64bit_nulls, int64_t, true);
-
-#define CONDITIONAL_FULL_JOIN_BENCHMARK_DEFINE(name, Key, Nullable) \
-  BENCHMARK_TEMPLATE_DEFINE_F(ConditionalJoin, name, Key)           \
-  (::benchmark::State & st)                                         \
-  {                                                                 \
-    auto join = [](cudf::table_view const& left,                    \
-                   cudf::table_view const& right,                   \
-                   cudf::ast::operation binary_pred,                \
-                   cudf::null_equality compare_nulls) {             \
-      return cudf::conditional_full_join(left, right, binary_pred); \
-    };                                                              \
-    BM_join<Key, Nullable, join_t::CONDITIONAL>(st, join);          \
-  }
-
-CONDITIONAL_FULL_JOIN_BENCHMARK_DEFINE(conditional_full_join_32bit, int32_t, false);
-CONDITIONAL_FULL_JOIN_BENCHMARK_DEFINE(conditional_full_join_64bit, int64_t, false);
-CONDITIONAL_FULL_JOIN_BENCHMARK_DEFINE(conditional_full_join_32bit_nulls, int32_t, true);
-CONDITIONAL_FULL_JOIN_BENCHMARK_DEFINE(conditional_full_join_64bit_nulls, int64_t, true);
-
-#define CONDITIONAL_LEFT_ANTI_JOIN_BENCHMARK_DEFINE(name, Key, Nullable) \
-  BENCHMARK_TEMPLATE_DEFINE_F(ConditionalJoin, name, Key)                \
-  (::benchmark::State & st)                                              \
-  {                                                                      \
-    auto join = [](cudf::table_view const& left,                         \
-                   cudf::table_view const& right,                        \
-                   cudf::ast::operation binary_pred,                     \
-                   cudf::null_equality compare_nulls) {                  \
-      return cudf::conditional_left_anti_join(left, right, binary_pred); \
-    };                                                                   \
-    BM_join<Key, Nullable, join_t::CONDITIONAL>(st, join);               \
-  }
-
-CONDITIONAL_LEFT_ANTI_JOIN_BENCHMARK_DEFINE(conditional_left_anti_join_32bit, int32_t, false);
-CONDITIONAL_LEFT_ANTI_JOIN_BENCHMARK_DEFINE(conditional_left_anti_join_64bit, int64_t, false);
-CONDITIONAL_LEFT_ANTI_JOIN_BENCHMARK_DEFINE(conditional_left_anti_join_32bit_nulls, int32_t, true);
-CONDITIONAL_LEFT_ANTI_JOIN_BENCHMARK_DEFINE(conditional_left_anti_join_64bit_nulls, int64_t, true);
-
-#define CONDITIONAL_LEFT_SEMI_JOIN_BENCHMARK_DEFINE(name, Key, Nullable) \
-  BENCHMARK_TEMPLATE_DEFINE_F(ConditionalJoin, name, Key)                \
-  (::benchmark::State & st)                                              \
-  {                                                                      \
-    auto join = [](cudf::table_view const& left,                         \
-                   cudf::table_view const& right,                        \
-                   cudf::ast::operation binary_pred,                     \
-                   cudf::null_equality compare_nulls) {                  \
-      return cudf::conditional_left_semi_join(left, right, binary_pred); \
-    };                                                                   \
-    BM_join<Key, Nullable, join_t::CONDITIONAL>(st, join);               \
-  }
-
-CONDITIONAL_LEFT_SEMI_JOIN_BENCHMARK_DEFINE(conditional_left_semi_join_32bit, int32_t, false);
-CONDITIONAL_LEFT_SEMI_JOIN_BENCHMARK_DEFINE(conditional_left_semi_join_64bit, int64_t, false);
-CONDITIONAL_LEFT_SEMI_JOIN_BENCHMARK_DEFINE(conditional_left_semi_join_32bit_nulls, int32_t, true);
-CONDITIONAL_LEFT_SEMI_JOIN_BENCHMARK_DEFINE(conditional_left_semi_join_64bit_nulls, int64_t, true);
-
-// inner join -----------------------------------------------------------------------
-BENCHMARK_REGISTER_F(ConditionalJoin, conditional_inner_join_32bit)
-  ->Unit(benchmark::kMillisecond)
-  ->Args({100'000, 100'000})
-  ->Args({100'000, 400'000})
-  ->Args({400'000, 100'000})
-  ->Args({100'000, 1'000'000})
-  ->UseManualTime();
-
-BENCHMARK_REGISTER_F(ConditionalJoin, conditional_inner_join_64bit)
-  ->Unit(benchmark::kMillisecond)
-  ->Args({100'000, 100'000})
-  ->Args({100'000, 400'000})
-  ->Args({400'000, 100'000})
-  ->Args({100'000, 1'000'000})
-  ->UseManualTime();
-
-BENCHMARK_REGISTER_F(ConditionalJoin, conditional_inner_join_32bit_nulls)
-  ->Unit(benchmark::kMillisecond)
-  ->Args({100'000, 100'000})
-  ->Args({100'000, 400'000})
-  ->Args({400'000, 100'000})
-  ->Args({100'000, 1'000'000})
-  ->UseManualTime();
-
-BENCHMARK_REGISTER_F(ConditionalJoin, conditional_inner_join_64bit_nulls)
-  ->Unit(benchmark::kMillisecond)
-  ->Args({100'000, 100'000})
-  ->Args({100'000, 400'000})
-  ->Args({400'000, 100'000})
-  ->Args({100'000, 1'000'000})
-  ->UseManualTime();
-
-// left join -----------------------------------------------------------------------
-BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_join_32bit)
-  ->Unit(benchmark::kMillisecond)
-  ->Args({100'000, 100'000})
-  ->Args({100'000, 400'000})
-  ->Args({100'000, 1'000'000})
-  ->UseManualTime();
-
-BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_join_64bit)
-  ->Unit(benchmark::kMillisecond)
-  ->Args({100'000, 100'000})
-  ->Args({100'000, 400'000})
-  ->Args({100'000, 1'000'000})
-  ->UseManualTime();
-
-BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_join_32bit_nulls)
-  ->Unit(benchmark::kMillisecond)
-  ->Args({100'000, 100'000})
-  ->Args({100'000, 400'000})
-  ->Args({100'000, 1'000'000})
-  ->UseManualTime();
-
-BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_join_64bit_nulls)
-  ->Unit(benchmark::kMillisecond)
-  ->Args({100'000, 100'000})
-  ->Args({100'000, 400'000})
-  ->Args({100'000, 1'000'000})
-  ->UseManualTime();
-
-// full join -----------------------------------------------------------------------
-BENCHMARK_REGISTER_F(ConditionalJoin, conditional_full_join_32bit)
-  ->Unit(benchmark::kMillisecond)
-  ->Args({100'000, 100'000})
-  ->Args({100'000, 400'000})
-  ->Args({100'000, 1'000'000})
-  ->UseManualTime();
-
-BENCHMARK_REGISTER_F(ConditionalJoin, conditional_full_join_64bit)
-  ->Unit(benchmark::kMillisecond)
-  ->Args({100'000, 100'000})
-  ->Args({100'000, 400'000})
-  ->Args({100'000, 1'000'000})
-  ->UseManualTime();
-
-BENCHMARK_REGISTER_F(ConditionalJoin, conditional_full_join_32bit_nulls)
-  ->Unit(benchmark::kMillisecond)
-  ->Args({100'000, 100'000})
-  ->Args({100'000, 400'000})
-  ->Args({100'000, 1'000'000})
-  ->UseManualTime();
-
-BENCHMARK_REGISTER_F(ConditionalJoin, conditional_full_join_64bit_nulls)
-  ->Unit(benchmark::kMillisecond)
-  ->Args({100'000, 100'000})
-  ->Args({100'000, 400'000})
-  ->Args({100'000, 1'000'000})
-  ->UseManualTime();
-
-// left anti-join -------------------------------------------------------------
-BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_anti_join_32bit)
-  ->Unit(benchmark::kMillisecond)
-  ->Args({100'000, 100'000})
-  ->Args({100'000, 400'000})
-  ->Args({100'000, 1'000'000})
-  ->UseManualTime();
-
-BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_anti_join_64bit)
-  ->Unit(benchmark::kMillisecond)
-  ->Args({100'000, 100'000})
-  ->Args({100'000, 400'000})
-  ->Args({100'000, 1'000'000})
-  ->UseManualTime();
-
-BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_anti_join_32bit_nulls)
-  ->Unit(benchmark::kMillisecond)
-  ->Args({100'000, 100'000})
-  ->Args({100'000, 400'000})
-  ->Args({100'000, 1'000'000})
-  ->UseManualTime();
-
-BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_anti_join_64bit_nulls)
-  ->Unit(benchmark::kMillisecond)
-  ->Args({100'000, 100'000})
-  ->Args({100'000, 400'000})
-  ->Args({100'000, 1'000'000})
-  ->UseManualTime();
-
-// left semi-join -------------------------------------------------------------
-BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_semi_join_32bit)
-  ->Unit(benchmark::kMillisecond)
-  ->Args({100'000, 100'000})
-  ->Args({100'000, 400'000})
-  ->Args({100'000, 1'000'000})
-  ->UseManualTime();
-
-BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_semi_join_64bit)
-  ->Unit(benchmark::kMillisecond)
-  ->Args({100'000, 100'000})
-  ->Args({100'000, 400'000})
-  ->Args({100'000, 1'000'000})
-  ->UseManualTime();
-
-BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_semi_join_32bit_nulls)
-  ->Unit(benchmark::kMillisecond)
-  ->Args({100'000, 100'000})
-  ->Args({100'000, 400'000})
-  ->Args({100'000, 1'000'000})
-  ->UseManualTime();
-
-BENCHMARK_REGISTER_F(ConditionalJoin, conditional_left_semi_join_64bit_nulls)
-  ->Unit(benchmark::kMillisecond)
-  ->Args({100'000, 100'000})
-  ->Args({100'000, 400'000})
-  ->Args({100'000, 1'000'000})
-  ->UseManualTime();
+#include "join_common.hpp"
+
+template <typename Key, bool Nullable>
+void nvbench_conditional_inner_join(nvbench::state& state,
+                                    nvbench::type_list<Key, nvbench::enum_type<Nullable>>)
+{
+  auto join = [](cudf::table_view const& left,
+                 cudf::table_view const& right,
+                 cudf::ast::operation binary_pred,
+                 cudf::null_equality compare_nulls) {
+    return cudf::conditional_inner_join(left, right, binary_pred);
+  };
+  BM_join<Key, Nullable, join_t::CONDITIONAL>(state, join);
+}
+
+template <typename Key, bool Nullable>
+void nvbench_conditional_left_join(nvbench::state& state,
+                                   nvbench::type_list<Key, nvbench::enum_type<Nullable>>)
+{
+  auto join = [](cudf::table_view const& left,
+                 cudf::table_view const& right,
+                 cudf::ast::operation binary_pred,
+                 cudf::null_equality compare_nulls) {
+    return cudf::conditional_left_join(left, right, binary_pred);
+  };
+  BM_join<Key, Nullable, join_t::CONDITIONAL>(state, join);
+}
+
+NVBENCH_BENCH_TYPES(nvbench_conditional_inner_join,
+                    NVBENCH_TYPE_AXES(JOIN_KEY_TYPE_RANGE, JOIN_NULLABLE_RANGE))
+  .set_name("conditional_inner_join")
+  .set_type_axes_names({"Key", "Nullable"})
+  .add_int64_axis("left_size", JOIN_SIZE_RANGE)
+  .add_int64_axis("right_size", JOIN_SIZE_RANGE);
+
+NVBENCH_BENCH_TYPES(nvbench_conditional_left_join,
+                    NVBENCH_TYPE_AXES(JOIN_KEY_TYPE_RANGE, JOIN_NULLABLE_RANGE))
+  .set_name("conditional_left_join")
+  .set_type_axes_names({"Key", "Nullable"})
+  .add_int64_axis("left_size", JOIN_SIZE_RANGE)
+  .add_int64_axis("right_size", JOIN_SIZE_RANGE);
diff --git a/cpp/benchmarks/join/join_common.hpp b/cpp/benchmarks/join/join_common.hpp
index e6792b9dbfb..3d9d9c57548 100644
--- a/cpp/benchmarks/join/join_common.hpp
+++ b/cpp/benchmarks/join/join_common.hpp
@@ -19,7 +19,6 @@
 #include "generate_input_tables.cuh"
 
 #include <benchmarks/fixture/benchmark_fixture.hpp>
-#include <benchmarks/synchronization/synchronization.hpp>
 
 #include <cudf/ast/expressions.hpp>
 #include <cudf/column/column_factories.hpp>
@@ -67,28 +66,12 @@ template <typename Key,
           typename Join>
 void BM_join(state_type& state, Join JoinFunc)
 {
-  auto const right_size = [&]() {
-    if constexpr (std::is_same_v<state_type, benchmark::State>) {
-      return static_cast<cudf::size_type>(state.range(0));
-    }
-    if constexpr (std::is_same_v<state_type, nvbench::state>) {
-      return static_cast<cudf::size_type>(state.get_int64("right_size"));
-    }
-  }();
-  auto const left_size = [&]() {
-    if constexpr (std::is_same_v<state_type, benchmark::State>) {
-      return static_cast<cudf::size_type>(state.range(1));
-    }
-    if constexpr (std::is_same_v<state_type, nvbench::state>) {
-      return static_cast<cudf::size_type>(state.get_int64("left_size"));
-    }
-  }();
+  auto const right_size = static_cast<cudf::size_type>(state.get_int64("right_size"));
+  auto const left_size  = static_cast<cudf::size_type>(state.get_int64("left_size"));
 
-  if constexpr (std::is_same_v<state_type, nvbench::state>) {
-    if (right_size > left_size) {
-      state.skip("Skip large right table");
-      return;
-    }
+  if (right_size > left_size) {
+    state.skip("Skip large right table");
+    return;
   }
 
   double const selectivity = 0.3;
@@ -165,57 +148,37 @@ void BM_join(state_type& state, Join JoinFunc)
 
   // Setup join parameters and result table
   [[maybe_unused]] std::vector<cudf::size_type> columns_to_join = {0};
-
-  // Benchmark the inner join operation
-  if constexpr (std::is_same_v<state_type, benchmark::State> and
-                (join_type != join_t::CONDITIONAL)) {
-    for (auto _ : state) {
-      cuda_event_timer raii(state, true, cudf::get_default_stream());
-
-      auto result = JoinFunc(left_table.select(columns_to_join),
-                             right_table.select(columns_to_join),
-                             cudf::null_equality::UNEQUAL);
-    }
-  }
-  if constexpr (std::is_same_v<state_type, nvbench::state> and (join_type != join_t::CONDITIONAL)) {
-    state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
-    if constexpr (join_type == join_t::MIXED) {
-      auto const col_ref_left_0 = cudf::ast::column_reference(0);
-      auto const col_ref_right_0 =
-        cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT);
-      auto left_zero_eq_right_zero =
-        cudf::ast::operation(cudf::ast::ast_operator::EQUAL, col_ref_left_0, col_ref_right_0);
-      state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
-        auto result = JoinFunc(left_table.select(columns_to_join),
-                               right_table.select(columns_to_join),
-                               left_table.select({1}),
-                               right_table.select({1}),
-                               left_zero_eq_right_zero,
-                               cudf::null_equality::UNEQUAL);
-      });
-    }
-    if constexpr (join_type == join_t::HASH) {
-      state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
-        auto result = JoinFunc(left_table.select(columns_to_join),
-                               right_table.select(columns_to_join),
-                               cudf::null_equality::UNEQUAL);
-      });
-    }
-  }
-
-  // Benchmark conditional join
-  if constexpr (std::is_same_v<state_type, benchmark::State> and join_type == join_t::CONDITIONAL) {
-    // Common column references.
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
+  if constexpr (join_type == join_t::CONDITIONAL) {
     auto const col_ref_left_0  = cudf::ast::column_reference(0);
     auto const col_ref_right_0 = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT);
     auto left_zero_eq_right_zero =
       cudf::ast::operation(cudf::ast::ast_operator::EQUAL, col_ref_left_0, col_ref_right_0);
-
-    for (auto _ : state) {
-      cuda_event_timer raii(state, true, cudf::get_default_stream());
-
+    state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
       auto result =
         JoinFunc(left_table, right_table, left_zero_eq_right_zero, cudf::null_equality::UNEQUAL);
-    }
+      ;
+    });
+  }
+  if constexpr (join_type == join_t::MIXED) {
+    auto const col_ref_left_0  = cudf::ast::column_reference(0);
+    auto const col_ref_right_0 = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT);
+    auto left_zero_eq_right_zero =
+      cudf::ast::operation(cudf::ast::ast_operator::EQUAL, col_ref_left_0, col_ref_right_0);
+    state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+      auto result = JoinFunc(left_table.select(columns_to_join),
+                             right_table.select(columns_to_join),
+                             left_table.select({1}),
+                             right_table.select({1}),
+                             left_zero_eq_right_zero,
+                             cudf::null_equality::UNEQUAL);
+    });
+  }
+  if constexpr (join_type == join_t::HASH) {
+    state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+      auto result = JoinFunc(left_table.select(columns_to_join),
+                             right_table.select(columns_to_join),
+                             cudf::null_equality::UNEQUAL);
+    });
   }
 }
diff --git a/cpp/benchmarks/join/left_join.cu b/cpp/benchmarks/join/left_join.cu
index 3e398e721fa..92123ce1621 100644
--- a/cpp/benchmarks/join/left_join.cu
+++ b/cpp/benchmarks/join/left_join.cu
@@ -14,115 +14,43 @@
  * limitations under the License.
  */
 
-#include <benchmarks/join/join_common.hpp>
-
-template <typename Key>
-class Join : public cudf::benchmark {};
-
-#define LEFT_ANTI_JOIN_BENCHMARK_DEFINE(name, Key, Nullable)   \
-  BENCHMARK_TEMPLATE_DEFINE_F(Join, name, Key)                 \
-  (::benchmark::State & st)                                    \
-  {                                                            \
-    auto join = [](cudf::table_view const& left,               \
-                   cudf::table_view const& right,              \
-                   cudf::null_equality compare_nulls) {        \
-      return cudf::left_anti_join(left, right, compare_nulls); \
-    };                                                         \
-    BM_join<Key, Nullable>(st, join);                          \
-  }
-
-LEFT_ANTI_JOIN_BENCHMARK_DEFINE(left_anti_join_32bit, int32_t, false);
-LEFT_ANTI_JOIN_BENCHMARK_DEFINE(left_anti_join_64bit, int64_t, false);
-LEFT_ANTI_JOIN_BENCHMARK_DEFINE(left_anti_join_32bit_nulls, int32_t, true);
-LEFT_ANTI_JOIN_BENCHMARK_DEFINE(left_anti_join_64bit_nulls, int64_t, true);
-
-#define LEFT_SEMI_JOIN_BENCHMARK_DEFINE(name, Key, Nullable)   \
-  BENCHMARK_TEMPLATE_DEFINE_F(Join, name, Key)                 \
-  (::benchmark::State & st)                                    \
-  {                                                            \
-    auto join = [](cudf::table_view const& left,               \
-                   cudf::table_view const& right,              \
-                   cudf::null_equality compare_nulls) {        \
-      return cudf::left_semi_join(left, right, compare_nulls); \
-    };                                                         \
-    BM_join<Key, Nullable>(st, join);                          \
-  }
-
-LEFT_SEMI_JOIN_BENCHMARK_DEFINE(left_semi_join_32bit, int32_t, false);
-LEFT_SEMI_JOIN_BENCHMARK_DEFINE(left_semi_join_64bit, int64_t, false);
-LEFT_SEMI_JOIN_BENCHMARK_DEFINE(left_semi_join_32bit_nulls, int32_t, true);
-LEFT_SEMI_JOIN_BENCHMARK_DEFINE(left_semi_join_64bit_nulls, int64_t, true);
-
-// left anti-join -------------------------------------------------------------
-BENCHMARK_REGISTER_F(Join, left_anti_join_32bit)
-  ->Unit(benchmark::kMillisecond)
-  ->Args({100'000, 100'000})
-  ->Args({100'000, 400'000})
-  ->Args({100'000, 1'000'000})
-  ->Args({10'000'000, 10'000'000})
-  ->Args({10'000'000, 40'000'000})
-  ->Args({10'000'000, 100'000'000})
-  ->Args({100'000'000, 100'000'000})
-  ->Args({80'000'000, 240'000'000})
-  ->UseManualTime();
-
-BENCHMARK_REGISTER_F(Join, left_anti_join_64bit)
-  ->Unit(benchmark::kMillisecond)
-  ->Args({50'000'000, 50'000'000})
-  ->Args({40'000'000, 120'000'000})
-  ->UseManualTime();
-
-BENCHMARK_REGISTER_F(Join, left_anti_join_32bit_nulls)
-  ->Unit(benchmark::kMillisecond)
-  ->Args({100'000, 100'000})
-  ->Args({100'000, 400'000})
-  ->Args({100'000, 1'000'000})
-  ->Args({10'000'000, 10'000'000})
-  ->Args({10'000'000, 40'000'000})
-  ->Args({10'000'000, 100'000'000})
-  ->Args({100'000'000, 100'000'000})
-  ->Args({80'000'000, 240'000'000})
-  ->UseManualTime();
-
-BENCHMARK_REGISTER_F(Join, left_anti_join_64bit_nulls)
-  ->Unit(benchmark::kMillisecond)
-  ->Args({50'000'000, 50'000'000})
-  ->Args({40'000'000, 120'000'000})
-  ->UseManualTime();
-
-// left semi-join -------------------------------------------------------------
-BENCHMARK_REGISTER_F(Join, left_semi_join_32bit)
-  ->Unit(benchmark::kMillisecond)
-  ->Args({100'000, 100'000})
-  ->Args({100'000, 400'000})
-  ->Args({100'000, 1'000'000})
-  ->Args({10'000'000, 10'000'000})
-  ->Args({10'000'000, 40'000'000})
-  ->Args({10'000'000, 100'000'000})
-  ->Args({100'000'000, 100'000'000})
-  ->Args({80'000'000, 240'000'000})
-  ->UseManualTime();
-
-BENCHMARK_REGISTER_F(Join, left_semi_join_64bit)
-  ->Unit(benchmark::kMillisecond)
-  ->Args({50'000'000, 50'000'000})
-  ->Args({40'000'000, 120'000'000})
-  ->UseManualTime();
-
-BENCHMARK_REGISTER_F(Join, left_semi_join_32bit_nulls)
-  ->Unit(benchmark::kMillisecond)
-  ->Args({100'000, 100'000})
-  ->Args({100'000, 400'000})
-  ->Args({100'000, 1'000'000})
-  ->Args({10'000'000, 10'000'000})
-  ->Args({10'000'000, 40'000'000})
-  ->Args({10'000'000, 100'000'000})
-  ->Args({100'000'000, 100'000'000})
-  ->Args({80'000'000, 240'000'000})
-  ->UseManualTime();
-
-BENCHMARK_REGISTER_F(Join, left_semi_join_64bit_nulls)
-  ->Unit(benchmark::kMillisecond)
-  ->Args({50'000'000, 50'000'000})
-  ->Args({40'000'000, 120'000'000})
-  ->UseManualTime();
+#include "join_common.hpp"
+
+template <typename Key, bool Nullable>
+void nvbench_left_anti_join(nvbench::state& state,
+                            nvbench::type_list<Key, nvbench::enum_type<Nullable>>)
+{
+  auto join = [](cudf::table_view const& left,
+                 cudf::table_view const& right,
+                 cudf::null_equality compare_nulls) {
+    return cudf::left_anti_join(left, right, compare_nulls);
+  };
+
+  BM_join<Key, Nullable>(state, join);
+}
+
+template <typename Key, bool Nullable>
+void nvbench_left_semi_join(nvbench::state& state,
+                            nvbench::type_list<Key, nvbench::enum_type<Nullable>>)
+{
+  auto join = [](cudf::table_view const& left,
+                 cudf::table_view const& right,
+                 cudf::null_equality compare_nulls) {
+    return cudf::left_semi_join(left, right, compare_nulls);
+  };
+  BM_join<Key, Nullable>(state, join);
+}
+
+NVBENCH_BENCH_TYPES(nvbench_left_anti_join,
+                    NVBENCH_TYPE_AXES(JOIN_KEY_TYPE_RANGE, JOIN_NULLABLE_RANGE))
+  .set_name("left_anti_join")
+  .set_type_axes_names({"Key", "Nullable"})
+  .add_int64_axis("left_size", JOIN_SIZE_RANGE)
+  .add_int64_axis("right_size", JOIN_SIZE_RANGE);
+
+NVBENCH_BENCH_TYPES(nvbench_left_semi_join,
+                    NVBENCH_TYPE_AXES(JOIN_KEY_TYPE_RANGE, JOIN_NULLABLE_RANGE))
+  .set_name("left_semi_join")
+  .set_type_axes_names({"Key", "Nullable"})
+  .add_int64_axis("left_size", JOIN_SIZE_RANGE)
+  .add_int64_axis("right_size", JOIN_SIZE_RANGE);

From 66c2f4fded3aa5d83745fada3e4c4d5eee7895b2 Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Tue, 11 Jun 2024 07:24:19 -0700
Subject: [PATCH 334/842] Condense pylibcudf data fixtures (#15958)

Condense all pa_foo/plc_foo data fixtures into just foo, as recommended by
https://github.com/rapidsai/cudf/pull/15839#discussion_r1626769872.

Authors:
  - Thomas Li (https://github.com/lithomas1)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15958
---
 .../cudf/cudf/pylibcudf_tests/test_copying.py | 499 ++++++++++--------
 .../cudf/pylibcudf_tests/test_quantiles.py    |  16 +-
 .../cudf/cudf/pylibcudf_tests/test_reshape.py |  20 +-
 .../pylibcudf_tests/test_string_capitalize.py |  54 +-
 .../pylibcudf_tests/test_string_contains.py   |  15 +-
 .../cudf/pylibcudf_tests/test_string_find.py  |  78 ++-
 6 files changed, 358 insertions(+), 324 deletions(-)

diff --git a/python/cudf/cudf/pylibcudf_tests/test_copying.py b/python/cudf/cudf/pylibcudf_tests/test_copying.py
index cd70ce4abf5..da3ca3a6d1e 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_copying.py
+++ b/python/cudf/cudf/pylibcudf_tests/test_copying.py
@@ -20,121 +20,104 @@
 
 # TODO: Test nullable data
 @pytest.fixture(scope="module")
-def pa_input_column(pa_type):
+def input_column(pa_type):
     if pa.types.is_integer(pa_type) or pa.types.is_floating(pa_type):
-        return pa.array([1, 2, 3], type=pa_type)
+        pa_array = pa.array([1, 2, 3], type=pa_type)
     elif pa.types.is_string(pa_type):
-        return pa.array(["a", "b", "c"], type=pa_type)
+        pa_array = pa.array(["a", "b", "c"], type=pa_type)
     elif pa.types.is_boolean(pa_type):
-        return pa.array([True, True, False], type=pa_type)
+        pa_array = pa.array([True, True, False], type=pa_type)
     elif pa.types.is_list(pa_type):
         # TODO: Add heterogenous sizes
-        return pa.array([[1], [2], [3]], type=pa_type)
+        pa_array = pa.array([[1], [2], [3]], type=pa_type)
     elif pa.types.is_struct(pa_type):
-        return pa.array([{"v": 1}, {"v": 2}, {"v": 3}], type=pa_type)
-    raise ValueError("Unsupported type")
-
-
-@pytest.fixture(scope="module")
-def input_column(pa_input_column):
-    return plc.interop.from_arrow(pa_input_column)
+        pa_array = pa.array([{"v": 1}, {"v": 2}, {"v": 3}], type=pa_type)
+    else:
+        raise ValueError("Unsupported type")
+    return pa_array, plc.interop.from_arrow(pa_array)
 
 
 @pytest.fixture(scope="module")
-def pa_index_column():
+def index_column():
     # Index column for testing gather/scatter, always integral.
-    return pa.array([1, 2, 3])
-
-
-@pytest.fixture(scope="module")
-def index_column(pa_index_column):
-    return plc.interop.from_arrow(pa_index_column)
+    pa_array = pa.array([1, 2, 3])
+    return pa_array, plc.interop.from_arrow(pa_array)
 
 
 @pytest.fixture(scope="module")
-def pa_target_column(pa_type):
+def target_column(pa_type):
     if pa.types.is_integer(pa_type) or pa.types.is_floating(pa_type):
-        return pa.array([4, 5, 6, 7, 8, 9], type=pa_type)
+        pa_array = pa.array([4, 5, 6, 7, 8, 9], type=pa_type)
     elif pa.types.is_string(pa_type):
-        return pa.array(["d", "e", "f", "g", "h", "i"], type=pa_type)
+        pa_array = pa.array(["d", "e", "f", "g", "h", "i"], type=pa_type)
     elif pa.types.is_boolean(pa_type):
-        return pa.array([False, True, True, False, True, False], type=pa_type)
+        pa_array = pa.array(
+            [False, True, True, False, True, False], type=pa_type
+        )
     elif pa.types.is_list(pa_type):
         # TODO: Add heterogenous sizes
-        return pa.array([[4], [5], [6], [7], [8], [9]], type=pa_type)
+        pa_array = pa.array([[4], [5], [6], [7], [8], [9]], type=pa_type)
     elif pa.types.is_struct(pa_type):
-        return pa.array(
+        pa_array = pa.array(
             [{"v": 4}, {"v": 5}, {"v": 6}, {"v": 7}, {"v": 8}, {"v": 9}],
             type=pa_type,
         )
-    raise ValueError("Unsupported type")
-
-
-@pytest.fixture(scope="module")
-def target_column(pa_target_column):
-    return plc.interop.from_arrow(pa_target_column)
+    else:
+        raise ValueError("Unsupported type")
+    return pa_array, plc.interop.from_arrow(pa_array)
 
 
 @pytest.fixture
 def mutable_target_column(target_column):
-    return target_column.copy()
+    _, plc_target_column = target_column
+    return plc_target_column.copy()
 
 
 @pytest.fixture(scope="module")
-def pa_source_table(pa_input_column):
-    return pa.table([pa_input_column] * 3, [""] * 3)
+def source_table(input_column):
+    pa_input_column, _ = input_column
+    pa_table = pa.table([pa_input_column] * 3, [""] * 3)
+    return pa_table, plc.interop.from_arrow(pa_table)
 
 
 @pytest.fixture(scope="module")
-def source_table(pa_source_table):
-    return plc.interop.from_arrow(pa_source_table)
+def target_table(target_column):
+    pa_target_column, _ = target_column
+    pa_table = pa.table([pa_target_column] * 3, [""] * 3)
+    return pa_table, plc.interop.from_arrow(pa_table)
 
 
 @pytest.fixture(scope="module")
-def pa_target_table(pa_target_column):
-    return pa.table([pa_target_column] * 3, [""] * 3)
-
-
-@pytest.fixture(scope="module")
-def target_table(pa_target_table):
-    return plc.interop.from_arrow(pa_target_table)
-
-
-@pytest.fixture(scope="module")
-def pa_source_scalar(pa_type):
+def source_scalar(pa_type):
     if pa.types.is_integer(pa_type) or pa.types.is_floating(pa_type):
-        return pa.scalar(1, type=pa_type)
+        pa_scalar = pa.scalar(1, type=pa_type)
     elif pa.types.is_string(pa_type):
-        return pa.scalar("a", type=pa_type)
+        pa_scalar = pa.scalar("a", type=pa_type)
     elif pa.types.is_boolean(pa_type):
-        return pa.scalar(False, type=pa_type)
+        pa_scalar = pa.scalar(False, type=pa_type)
     elif pa.types.is_list(pa_type):
         # TODO: Longer list?
-        return pa.scalar([1], type=pa_type)
+        pa_scalar = pa.scalar([1], type=pa_type)
     elif pa.types.is_struct(pa_type):
-        return pa.scalar({"v": 1}, type=pa_type)
-    raise ValueError("Unsupported type")
-
-
-@pytest.fixture(scope="module")
-def source_scalar(pa_source_scalar):
-    return plc.interop.from_arrow(pa_source_scalar)
-
-
-@pytest.fixture(scope="module")
-def pa_mask(pa_target_column):
-    return pa.array([True, False] * (len(pa_target_column) // 2))
+        pa_scalar = pa.scalar({"v": 1}, type=pa_type)
+    else:
+        raise ValueError("Unsupported type")
+    return pa_scalar, plc.interop.from_arrow(pa_scalar)
 
 
 @pytest.fixture(scope="module")
-def mask(pa_mask):
-    return plc.interop.from_arrow(pa_mask)
+def mask(target_column):
+    pa_target_column, _ = target_column
+    pa_mask = pa.array([True, False] * (len(pa_target_column) // 2))
+    return pa_mask, plc.interop.from_arrow(pa_mask)
 
 
-def test_gather(target_table, pa_target_table, index_column, pa_index_column):
+def test_gather(target_table, index_column):
+    pa_target_table, plc_target_table = target_table
+    pa_index_column, plc_index_column = index_column
     result = plc.copying.gather(
-        target_table,
-        index_column,
+        plc_target_table,
+        plc_index_column,
         plc.copying.OutOfBoundsPolicy.DONT_CHECK,
     )
     expected = pa_target_table.take(pa_index_column)
@@ -142,10 +125,11 @@ def test_gather(target_table, pa_target_table, index_column, pa_index_column):
 
 
 def test_gather_map_has_nulls(target_table):
+    _, plc_target_table = target_table
     gather_map = plc.interop.from_arrow(pa.array([0, 1, None]))
     with cudf_raises(ValueError):
         plc.copying.gather(
-            target_table,
+            plc_target_table,
             gather_map,
             plc.copying.OutOfBoundsPolicy.DONT_CHECK,
         )
@@ -185,16 +169,16 @@ def _pyarrow_boolean_mask_scatter_table(source, mask, target_table):
 
 def test_scatter_table(
     source_table,
-    pa_source_table,
     index_column,
-    pa_index_column,
     target_table,
-    pa_target_table,
 ):
+    pa_source_table, plc_source_table = source_table
+    pa_index_column, plc_index_column = index_column
+    pa_target_table, plc_target_table = target_table
     result = plc.copying.scatter(
-        source_table,
-        index_column,
-        target_table,
+        plc_source_table,
+        plc_index_column,
+        plc_target_table,
     )
 
     if pa.types.is_list(
@@ -247,68 +231,80 @@ def test_scatter_table_num_col_mismatch(
     source_table, index_column, target_table
 ):
     # Number of columns in source and target must match.
+    _, plc_source_table = source_table
+    _, plc_index_column = index_column
+    _, plc_target_table = target_table
     with cudf_raises(ValueError):
         plc.copying.scatter(
-            plc.Table(source_table.columns()[:2]),
-            index_column,
-            target_table,
+            plc.Table(plc_source_table.columns()[:2]),
+            plc_index_column,
+            plc_target_table,
         )
 
 
 def test_scatter_table_num_row_mismatch(source_table, target_table):
     # Number of rows in source and scatter map must match.
+    _, plc_source_table = source_table
+    _, plc_target_table = target_table
     with cudf_raises(ValueError):
         plc.copying.scatter(
-            source_table,
+            plc_source_table,
             plc.interop.from_arrow(
-                pa.array(range(source_table.num_rows() * 2))
+                pa.array(range(plc_source_table.num_rows() * 2))
             ),
-            target_table,
+            plc_target_table,
         )
 
 
 def test_scatter_table_map_has_nulls(source_table, target_table):
+    _, plc_source_table = source_table
+    _, plc_target_table = target_table
     with cudf_raises(ValueError):
         plc.copying.scatter(
-            source_table,
-            plc.interop.from_arrow(pa.array([None] * source_table.num_rows())),
-            target_table,
+            plc_source_table,
+            plc.interop.from_arrow(
+                pa.array([None] * plc_source_table.num_rows())
+            ),
+            plc_target_table,
         )
 
 
 def test_scatter_table_type_mismatch(source_table, index_column, target_table):
+    _, plc_source_table = source_table
+    _, plc_index_column = index_column
+    _, plc_target_table = target_table
     with cudf_raises(TypeError):
         if is_integer(
-            dtype := target_table.columns()[0].type()
+            dtype := plc_target_table.columns()[0].type()
         ) or is_floating(dtype):
-            pa_array = pa.array([True] * source_table.num_rows())
+            pa_array = pa.array([True] * plc_source_table.num_rows())
         else:
-            pa_array = pa.array([1] * source_table.num_rows())
-        ncol = source_table.num_columns()
+            pa_array = pa.array([1] * plc_source_table.num_rows())
+        ncol = plc_source_table.num_columns()
         pa_table = pa.table([pa_array] * ncol, [""] * ncol)
         plc.copying.scatter(
             plc.interop.from_arrow(pa_table),
-            index_column,
-            target_table,
+            plc_index_column,
+            plc_target_table,
         )
 
 
 def test_scatter_scalars(
     source_scalar,
-    pa_source_scalar,
     index_column,
-    pa_index_column,
     target_table,
-    pa_target_table,
 ):
+    pa_source_scalar, plc_source_scalar = source_scalar
+    pa_index_column, plc_index_column = index_column
+    pa_target_table, plc_target_table = target_table
     result = plc.copying.scatter(
-        [source_scalar] * target_table.num_columns(),
-        index_column,
-        target_table,
+        [plc_source_scalar] * plc_target_table.num_columns(),
+        plc_index_column,
+        plc_target_table,
     )
 
     expected = _pyarrow_boolean_mask_scatter_table(
-        [pa_source_scalar] * target_table.num_columns(),
+        [pa_source_scalar] * plc_target_table.num_columns(),
         pc.invert(
             _pyarrow_index_to_mask(pa_index_column, pa_target_table.num_rows)
         ),
@@ -321,85 +317,103 @@ def test_scatter_scalars(
 def test_scatter_scalars_num_scalars_mismatch(
     source_scalar, index_column, target_table
 ):
+    _, plc_source_scalar = source_scalar
+    _, plc_index_column = index_column
+    _, plc_target_table = target_table
     with cudf_raises(ValueError):
         plc.copying.scatter(
-            [source_scalar] * (target_table.num_columns() - 1),
-            index_column,
-            target_table,
+            [plc_source_scalar] * (plc_target_table.num_columns() - 1),
+            plc_index_column,
+            plc_target_table,
         )
 
 
 def test_scatter_scalars_map_has_nulls(source_scalar, target_table):
+    _, plc_source_scalar = source_scalar
+    _, plc_target_table = target_table
     with cudf_raises(ValueError):
         plc.copying.scatter(
-            [source_scalar] * target_table.num_columns(),
+            [plc_source_scalar] * plc_target_table.num_columns(),
             plc.interop.from_arrow(pa.array([None, None])),
-            target_table,
+            plc_target_table,
         )
 
 
 def test_scatter_scalars_type_mismatch(index_column, target_table):
+    _, plc_index_column = index_column
+    _, plc_target_table = target_table
     with cudf_raises(TypeError):
         if is_integer(
-            dtype := target_table.columns()[0].type()
+            dtype := plc_target_table.columns()[0].type()
         ) or is_floating(dtype):
-            source_scalar = [plc.interop.from_arrow(pa.scalar(True))]
+            plc_source_scalar = [plc.interop.from_arrow(pa.scalar(True))]
         else:
-            source_scalar = [plc.interop.from_arrow(pa.scalar(1))]
+            plc_source_scalar = [plc.interop.from_arrow(pa.scalar(1))]
         plc.copying.scatter(
-            source_scalar * target_table.num_columns(),
-            index_column,
-            target_table,
+            plc_source_scalar * plc_target_table.num_columns(),
+            plc_index_column,
+            plc_target_table,
         )
 
 
 def test_empty_like_column(input_column):
-    result = plc.copying.empty_like(input_column)
-    assert result.type() == input_column.type()
+    _, plc_input_column = input_column
+    result = plc.copying.empty_like(plc_input_column)
+    assert result.type() == plc_input_column.type()
 
 
 def test_empty_like_table(source_table):
-    result = plc.copying.empty_like(source_table)
-    assert result.num_columns() == source_table.num_columns()
-    for icol, rcol in zip(source_table.columns(), result.columns()):
+    _, plc_source_table = source_table
+    result = plc.copying.empty_like(plc_source_table)
+    assert result.num_columns() == plc_source_table.num_columns()
+    for icol, rcol in zip(plc_source_table.columns(), result.columns()):
         assert rcol.type() == icol.type()
 
 
 @pytest.mark.parametrize("size", [None, 10])
 def test_allocate_like(input_column, size):
-    if is_fixed_width(input_column.type()):
+    _, plc_input_column = input_column
+    if is_fixed_width(plc_input_column.type()):
         result = plc.copying.allocate_like(
-            input_column, plc.copying.MaskAllocationPolicy.RETAIN, size=size
+            plc_input_column,
+            plc.copying.MaskAllocationPolicy.RETAIN,
+            size=size,
+        )
+        assert result.type() == plc_input_column.type()
+        assert result.size() == (
+            plc_input_column.size() if size is None else size
         )
-        assert result.type() == input_column.type()
-        assert result.size() == (input_column.size() if size is None else size)
     else:
         with pytest.raises(TypeError):
             plc.copying.allocate_like(
-                input_column,
+                plc_input_column,
                 plc.copying.MaskAllocationPolicy.RETAIN,
                 size=size,
             )
 
 
 def test_copy_range_in_place(
-    input_column, pa_input_column, mutable_target_column, pa_target_column
+    input_column, mutable_target_column, target_column
 ):
+    pa_input_column, plc_input_column = input_column
+
+    pa_target_column, _ = target_column
+
     if not is_fixed_width(mutable_target_column.type()):
         with pytest.raises(TypeError):
             plc.copying.copy_range_in_place(
-                input_column,
+                plc_input_column,
                 mutable_target_column,
                 0,
-                input_column.size(),
+                plc_input_column.size(),
                 0,
             )
     else:
         plc.copying.copy_range_in_place(
-            input_column,
+            plc_input_column,
             mutable_target_column,
             0,
-            input_column.size(),
+            plc_input_column.size(),
             0,
         )
         expected = _pyarrow_boolean_mask_scatter_column(
@@ -415,36 +429,40 @@ def test_copy_range_in_place(
 def test_copy_range_in_place_out_of_bounds(
     input_column, mutable_target_column
 ):
+    _, plc_input_column = input_column
+
     if is_fixed_width(mutable_target_column.type()):
         with cudf_raises(IndexError):
             plc.copying.copy_range_in_place(
-                input_column,
+                plc_input_column,
                 mutable_target_column,
                 5,
-                5 + input_column.size(),
+                5 + plc_input_column.size(),
                 0,
             )
 
 
 def test_copy_range_in_place_different_types(mutable_target_column):
     if is_integer(dtype := mutable_target_column.type()) or is_floating(dtype):
-        input_column = plc.interop.from_arrow(pa.array(["a", "b", "c"]))
+        plc_input_column = plc.interop.from_arrow(pa.array(["a", "b", "c"]))
     else:
-        input_column = plc.interop.from_arrow(pa.array([1, 2, 3]))
+        plc_input_column = plc.interop.from_arrow(pa.array([1, 2, 3]))
 
     with cudf_raises(TypeError):
         plc.copying.copy_range_in_place(
-            input_column,
+            plc_input_column,
             mutable_target_column,
             0,
-            input_column.size(),
+            plc_input_column.size(),
             0,
         )
 
 
 def test_copy_range_in_place_null_mismatch(
-    pa_input_column, mutable_target_column
+    input_column, mutable_target_column
 ):
+    pa_input_column, _ = input_column
+
     if is_fixed_width(mutable_target_column.type()):
         pa_input_column = pc.if_else(
             _pyarrow_index_to_mask([0], len(pa_input_column)),
@@ -462,15 +480,15 @@ def test_copy_range_in_place_null_mismatch(
             )
 
 
-def test_copy_range(
-    input_column, pa_input_column, target_column, pa_target_column
-):
-    if is_fixed_width(dtype := target_column.type()) or is_string(dtype):
+def test_copy_range(input_column, target_column):
+    pa_input_column, plc_input_column = input_column
+    pa_target_column, plc_target_column = target_column
+    if is_fixed_width(dtype := plc_target_column.type()) or is_string(dtype):
         result = plc.copying.copy_range(
-            input_column,
-            target_column,
+            plc_input_column,
+            plc_target_column,
             0,
-            input_column.size(),
+            plc_input_column.size(),
             0,
         )
         expected = _pyarrow_boolean_mask_scatter_column(
@@ -484,137 +502,152 @@ def test_copy_range(
     else:
         with pytest.raises(TypeError):
             plc.copying.copy_range(
-                input_column,
-                target_column,
+                plc_input_column,
+                plc_target_column,
                 0,
-                input_column.size(),
+                plc_input_column.size(),
                 0,
             )
 
 
 def test_copy_range_out_of_bounds(input_column, target_column):
+    _, plc_input_column = input_column
+    _, plc_target_column = target_column
     with cudf_raises(IndexError):
         plc.copying.copy_range(
-            input_column,
-            target_column,
+            plc_input_column,
+            plc_target_column,
             5,
-            5 + input_column.size(),
+            5 + plc_input_column.size(),
             0,
         )
 
 
 def test_copy_range_different_types(target_column):
-    if is_integer(dtype := target_column.type()) or is_floating(dtype):
-        input_column = plc.interop.from_arrow(pa.array(["a", "b", "c"]))
+    _, plc_target_column = target_column
+    if is_integer(dtype := plc_target_column.type()) or is_floating(dtype):
+        plc_input_column = plc.interop.from_arrow(pa.array(["a", "b", "c"]))
     else:
-        input_column = plc.interop.from_arrow(pa.array([1, 2, 3]))
+        plc_input_column = plc.interop.from_arrow(pa.array([1, 2, 3]))
 
     with cudf_raises(TypeError):
         plc.copying.copy_range(
-            input_column,
-            target_column,
+            plc_input_column,
+            plc_target_column,
             0,
-            input_column.size(),
+            plc_input_column.size(),
             0,
         )
 
 
-def test_shift(
-    target_column, pa_target_column, source_scalar, pa_source_scalar
-):
+def test_shift(target_column, source_scalar):
+    pa_source_scalar, plc_source_scalar = source_scalar
+    pa_target_column, plc_target_column = target_column
     shift = 2
-    if is_fixed_width(dtype := target_column.type()) or is_string(dtype):
-        result = plc.copying.shift(target_column, shift, source_scalar)
+    if is_fixed_width(dtype := plc_target_column.type()) or is_string(dtype):
+        result = plc.copying.shift(plc_target_column, shift, plc_source_scalar)
         expected = pa.concat_arrays(
             [pa.array([pa_source_scalar] * shift), pa_target_column[:-shift]]
         )
         assert_column_eq(expected, result)
     else:
         with pytest.raises(TypeError):
-            plc.copying.shift(target_column, shift, source_scalar)
+            plc.copying.shift(plc_target_column, shift, source_scalar)
 
 
 def test_shift_type_mismatch(target_column):
-    if is_integer(dtype := target_column.type()) or is_floating(dtype):
+    _, plc_target_column = target_column
+    if is_integer(dtype := plc_target_column.type()) or is_floating(dtype):
         fill_value = plc.interop.from_arrow(pa.scalar("a"))
     else:
         fill_value = plc.interop.from_arrow(pa.scalar(1))
 
     with cudf_raises(TypeError):
-        plc.copying.shift(target_column, 2, fill_value)
+        plc.copying.shift(plc_target_column, 2, fill_value)
 
 
-def test_slice_column(target_column, pa_target_column):
+def test_slice_column(target_column):
+    pa_target_column, plc_target_column = target_column
     bounds = list(range(6))
     upper_bounds = bounds[1::2]
     lower_bounds = bounds[::2]
-    result = plc.copying.slice(target_column, bounds)
+    result = plc.copying.slice(plc_target_column, bounds)
     for lb, ub, slice_ in zip(lower_bounds, upper_bounds, result):
         assert_column_eq(pa_target_column[lb:ub], slice_)
 
 
 def test_slice_column_wrong_length(target_column):
+    _, plc_target_column = target_column
     with cudf_raises(ValueError):
-        plc.copying.slice(target_column, list(range(5)))
+        plc.copying.slice(plc_target_column, list(range(5)))
 
 
 def test_slice_column_decreasing(target_column):
+    _, plc_target_column = target_column
     with cudf_raises(ValueError):
-        plc.copying.slice(target_column, list(range(5, -1, -1)))
+        plc.copying.slice(plc_target_column, list(range(5, -1, -1)))
 
 
 def test_slice_column_out_of_bounds(target_column):
+    _, plc_target_column = target_column
     with cudf_raises(IndexError):
-        plc.copying.slice(target_column, list(range(2, 8)))
+        plc.copying.slice(plc_target_column, list(range(2, 8)))
 
 
-def test_slice_table(target_table, pa_target_table):
+def test_slice_table(target_table):
+    pa_target_table, plc_target_table = target_table
     bounds = list(range(6))
     upper_bounds = bounds[1::2]
     lower_bounds = bounds[::2]
-    result = plc.copying.slice(target_table, bounds)
+    result = plc.copying.slice(plc_target_table, bounds)
     for lb, ub, slice_ in zip(lower_bounds, upper_bounds, result):
         assert_table_eq(pa_target_table[lb:ub], slice_)
 
 
-def test_split_column(target_column, pa_target_column):
+def test_split_column(target_column):
     upper_bounds = [1, 3, 5]
     lower_bounds = [0] + upper_bounds[:-1]
-    result = plc.copying.split(target_column, upper_bounds)
+    pa_target_column, plc_target_column = target_column
+    result = plc.copying.split(plc_target_column, upper_bounds)
     for lb, ub, split in zip(lower_bounds, upper_bounds, result):
         assert_column_eq(pa_target_column[lb:ub], split)
 
 
 def test_split_column_decreasing(target_column):
+    _, plc_target_column = target_column
     with cudf_raises(ValueError):
-        plc.copying.split(target_column, list(range(5, -1, -1)))
+        plc.copying.split(plc_target_column, list(range(5, -1, -1)))
 
 
 def test_split_column_out_of_bounds(target_column):
+    _, plc_target_column = target_column
     with cudf_raises(IndexError):
-        plc.copying.split(target_column, list(range(5, 8)))
+        plc.copying.split(plc_target_column, list(range(5, 8)))
 
 
-def test_split_table(target_table, pa_target_table):
+def test_split_table(target_table):
+    pa_target_table, plc_target_table = target_table
     upper_bounds = [1, 3, 5]
     lower_bounds = [0] + upper_bounds[:-1]
-    result = plc.copying.split(target_table, upper_bounds)
+    result = plc.copying.split(plc_target_table, upper_bounds)
     for lb, ub, split in zip(lower_bounds, upper_bounds, result):
         assert_table_eq(pa_target_table[lb:ub], split)
 
 
-def test_copy_if_else_column_column(
-    target_column, pa_target_column, pa_source_scalar, mask, pa_mask
-):
+def test_copy_if_else_column_column(target_column, mask, source_scalar):
+    pa_target_column, plc_target_column = target_column
+    pa_source_scalar, _ = source_scalar
+    pa_mask, plc_mask = mask
+
     pa_other_column = pa.concat_arrays(
         [pa.array([pa_source_scalar] * 2), pa_target_column[:-2]]
     )
-    other_column = plc.interop.from_arrow(pa_other_column)
+    plc_other_column = plc.interop.from_arrow(pa_other_column)
 
     result = plc.copying.copy_if_else(
-        target_column,
-        other_column,
-        mask,
+        plc_target_column,
+        plc_other_column,
+        plc_mask,
     )
 
     expected = pc.if_else(
@@ -626,46 +659,51 @@ def test_copy_if_else_column_column(
 
 
 def test_copy_if_else_wrong_type(target_column, mask):
-    if is_integer(dtype := target_column.type()) or is_floating(dtype):
-        input_column = plc.interop.from_arrow(
-            pa.array(["a"] * target_column.size())
+    _, plc_target_column = target_column
+    _, plc_mask = mask
+    if is_integer(dtype := plc_target_column.type()) or is_floating(dtype):
+        plc_input_column = plc.interop.from_arrow(
+            pa.array(["a"] * plc_target_column.size())
         )
     else:
-        input_column = plc.interop.from_arrow(
-            pa.array([1] * target_column.size())
+        plc_input_column = plc.interop.from_arrow(
+            pa.array([1] * plc_target_column.size())
         )
 
     with cudf_raises(TypeError):
-        plc.copying.copy_if_else(input_column, target_column, mask)
+        plc.copying.copy_if_else(plc_input_column, plc_target_column, plc_mask)
 
 
 def test_copy_if_else_wrong_type_mask(target_column):
+    _, plc_target_column = target_column
     with cudf_raises(TypeError):
         plc.copying.copy_if_else(
-            target_column,
-            target_column,
+            plc_target_column,
+            plc_target_column,
             plc.interop.from_arrow(
-                pa.array([1.0, 2.0] * (target_column.size() // 2))
+                pa.array([1.0, 2.0] * (plc_target_column.size() // 2))
             ),
         )
 
 
 def test_copy_if_else_wrong_size(target_column):
+    _, plc_target_column = target_column
     with cudf_raises(ValueError):
         plc.copying.copy_if_else(
             plc.interop.from_arrow(pa.array([1])),
-            target_column,
+            plc_target_column,
             plc.interop.from_arrow(
-                pa.array([True, False] * (target_column.size() // 2))
+                pa.array([True, False] * (plc_target_column.size() // 2))
             ),
         )
 
 
 def test_copy_if_else_wrong_size_mask(target_column):
+    _, plc_target_column = target_column
     with cudf_raises(ValueError):
         plc.copying.copy_if_else(
-            target_column,
-            target_column,
+            plc_target_column,
+            plc_target_column,
             plc.interop.from_arrow(pa.array([True])),
         )
 
@@ -673,21 +711,21 @@ def test_copy_if_else_wrong_size_mask(target_column):
 @pytest.mark.parametrize("array_left", [True, False])
 def test_copy_if_else_column_scalar(
     target_column,
-    pa_target_column,
     source_scalar,
-    pa_source_scalar,
     array_left,
     mask,
-    pa_mask,
 ):
+    pa_target_column, plc_target_column = target_column
+    pa_source_scalar, plc_source_scalar = source_scalar
+    pa_mask, plc_mask = mask
     args = (
-        (target_column, source_scalar)
+        (plc_target_column, plc_source_scalar)
         if array_left
-        else (source_scalar, target_column)
+        else (plc_source_scalar, plc_target_column)
     )
     result = plc.copying.copy_if_else(
         *args,
-        mask,
+        plc_mask,
     )
 
     pa_args = (
@@ -704,16 +742,17 @@ def test_copy_if_else_column_scalar(
 
 def test_boolean_mask_scatter_from_table(
     source_table,
-    pa_source_table,
     target_table,
-    pa_target_table,
     mask,
-    pa_mask,
 ):
+    pa_source_table, plc_source_table = source_table
+    pa_target_table, plc_target_table = target_table
+    pa_mask, plc_mask = mask
+
     result = plc.copying.boolean_mask_scatter(
-        source_table,
-        target_table,
-        mask,
+        plc_source_table,
+        plc_target_table,
+        plc_mask,
     )
 
     if pa.types.is_list(
@@ -757,28 +796,34 @@ def test_boolean_mask_scatter_from_table(
 
 
 def test_boolean_mask_scatter_from_wrong_num_cols(source_table, target_table):
+    _, plc_source_table = source_table
+    _, plc_target_table = target_table
     with cudf_raises(ValueError):
         plc.copying.boolean_mask_scatter(
-            plc.Table(source_table.columns()[:2]),
-            target_table,
+            plc.Table(plc_source_table.columns()[:2]),
+            plc_target_table,
             plc.interop.from_arrow(pa.array([True, False] * 3)),
         )
 
 
 def test_boolean_mask_scatter_from_wrong_mask_size(source_table, target_table):
+    _, plc_source_table = source_table
+    _, plc_target_table = target_table
     with cudf_raises(ValueError):
         plc.copying.boolean_mask_scatter(
-            source_table,
-            target_table,
+            plc_source_table,
+            plc_target_table,
             plc.interop.from_arrow(pa.array([True, False] * 2)),
         )
 
 
 def test_boolean_mask_scatter_from_wrong_num_true(source_table, target_table):
+    _, plc_source_table = source_table
+    _, plc_target_table = target_table
     with cudf_raises(ValueError):
         plc.copying.boolean_mask_scatter(
-            plc.Table(source_table.columns()[:2]),
-            target_table,
+            plc.Table(plc_source_table.columns()[:2]),
+            plc_target_table,
             plc.interop.from_arrow(
                 pa.array([True, False] * 2 + [False, False])
             ),
@@ -786,44 +831,48 @@ def test_boolean_mask_scatter_from_wrong_num_true(source_table, target_table):
 
 
 def test_boolean_mask_scatter_from_wrong_col_type(target_table, mask):
-    if is_integer(dtype := target_table.columns()[0].type()) or is_floating(
-        dtype
-    ):
+    _, plc_target_table = target_table
+    _, plc_mask = mask
+    if is_integer(
+        dtype := plc_target_table.columns()[0].type()
+    ) or is_floating(dtype):
         input_column = plc.interop.from_arrow(pa.array(["a", "b", "c"]))
     else:
         input_column = plc.interop.from_arrow(pa.array([1, 2, 3]))
 
     with cudf_raises(TypeError):
         plc.copying.boolean_mask_scatter(
-            plc.Table([input_column] * 3), target_table, mask
+            plc.Table([input_column] * 3), plc_target_table, plc_mask
         )
 
 
 def test_boolean_mask_scatter_from_wrong_mask_type(source_table, target_table):
+    _, plc_source_table = source_table
+    _, plc_target_table = target_table
     with cudf_raises(TypeError):
         plc.copying.boolean_mask_scatter(
-            source_table,
-            target_table,
+            plc_source_table,
+            plc_target_table,
             plc.interop.from_arrow(pa.array([1.0, 2.0] * 3)),
         )
 
 
 def test_boolean_mask_scatter_from_scalars(
     source_scalar,
-    pa_source_scalar,
     target_table,
-    pa_target_table,
     mask,
-    pa_mask,
 ):
+    pa_source_scalar, plc_source_scalar = source_scalar
+    pa_target_table, plc_target_table = target_table
+    pa_mask, plc_mask = mask
     result = plc.copying.boolean_mask_scatter(
-        [source_scalar] * 3,
-        target_table,
-        mask,
+        [plc_source_scalar] * 3,
+        plc_target_table,
+        plc_mask,
     )
 
     expected = _pyarrow_boolean_mask_scatter_table(
-        [pa_source_scalar] * target_table.num_columns(),
+        [pa_source_scalar] * plc_target_table.num_columns(),
         pc.invert(pa_mask),
         pa_target_table,
     )
@@ -831,9 +880,10 @@ def test_boolean_mask_scatter_from_scalars(
     assert_table_eq(expected, result)
 
 
-def test_get_element(input_column, pa_input_column):
+def test_get_element(input_column):
     index = 1
-    result = plc.copying.get_element(input_column, index)
+    pa_input_column, plc_input_column = input_column
+    result = plc.copying.get_element(plc_input_column, index)
 
     assert (
         plc.interop.to_arrow(
@@ -844,5 +894,6 @@ def test_get_element(input_column, pa_input_column):
 
 
 def test_get_element_out_of_bounds(input_column):
+    _, plc_input_column = input_column
     with cudf_raises(IndexError):
-        plc.copying.get_element(input_column, 100)
+        plc.copying.get_element(plc_input_column, 100)
diff --git a/python/cudf/cudf/pylibcudf_tests/test_quantiles.py b/python/cudf/cudf/pylibcudf_tests/test_quantiles.py
index a5d332a7795..13f3b037606 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_quantiles.py
+++ b/python/cudf/cudf/pylibcudf_tests/test_quantiles.py
@@ -19,13 +19,9 @@
 
 
 @pytest.fixture(scope="module", params=[[1, 2, 3, 4, 5], [5, 4, 3, 2, 1]])
-def pa_col_data(request, numeric_pa_type):
-    return pa.array(request.param, type=numeric_pa_type)
-
-
-@pytest.fixture(scope="module")
-def plc_col_data(pa_col_data):
-    return plc.interop.from_arrow(pa_col_data)
+def col_data(request, numeric_pa_type):
+    pa_array = pa.array(request.param, type=numeric_pa_type)
+    return pa_array, plc.interop.from_arrow(pa_array)
 
 
 @pytest.fixture(
@@ -60,7 +56,8 @@ def plc_tbl_data(request):
 
 @pytest.mark.parametrize("q", [[], [0], [0.5], [0.1, 0.5, 0.7, 0.9]])
 @pytest.mark.parametrize("exact", [True, False])
-def test_quantile(pa_col_data, plc_col_data, interp_opt, q, exact):
+def test_quantile(col_data, interp_opt, q, exact):
+    pa_col_data, plc_col_data = col_data
     ordered_indices = plc.interop.from_arrow(
         pc.cast(pc.sort_indices(pa_col_data), pa.int32())
     )
@@ -210,7 +207,8 @@ def test_quantiles_invalid_interp(plc_tbl_data, invalid_interp):
     "q",
     [[0.1], (0.1,), np.array([0.1])],
 )
-def test_quantile_q_array_like(pa_col_data, plc_col_data, q):
+def test_quantile_q_array_like(col_data, q):
+    pa_col_data, plc_col_data = col_data
     ordered_indices = plc.interop.from_arrow(
         pc.cast(pc.sort_indices(pa_col_data), pa.int32())
     )
diff --git a/python/cudf/cudf/pylibcudf_tests/test_reshape.py b/python/cudf/cudf/pylibcudf_tests/test_reshape.py
index 32d79257f4f..da1157e5832 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_reshape.py
+++ b/python/cudf/cudf/pylibcudf_tests/test_reshape.py
@@ -10,20 +10,15 @@
 @pytest.fixture(scope="module")
 def reshape_data():
     data = [[1, 2, 3], [4, 5, 6]]
-    return data
+    arrow_tbl = pa.Table.from_arrays(data, names=["a", "b"])
+    return data, plc.interop.from_arrow(arrow_tbl)
 
 
-@pytest.fixture(scope="module")
-def reshape_plc_tbl(reshape_data):
-    arrow_tbl = pa.Table.from_arrays(reshape_data, names=["a", "b"])
-    plc_tbl = plc.interop.from_arrow(arrow_tbl)
-    return plc_tbl
-
-
-def test_interleave_columns(reshape_data, reshape_plc_tbl):
+def test_interleave_columns(reshape_data):
+    raw_data, reshape_plc_tbl = reshape_data
     res = plc.reshape.interleave_columns(reshape_plc_tbl)
 
-    interleaved_data = [pa.array(pair) for pair in zip(*reshape_data)]
+    interleaved_data = [pa.array(pair) for pair in zip(*raw_data)]
 
     expect = pa.concat_arrays(interleaved_data)
 
@@ -31,10 +26,11 @@ def test_interleave_columns(reshape_data, reshape_plc_tbl):
 
 
 @pytest.mark.parametrize("cnt", [0, 1, 3])
-def test_tile(reshape_data, reshape_plc_tbl, cnt):
+def test_tile(reshape_data, cnt):
+    raw_data, reshape_plc_tbl = reshape_data
     res = plc.reshape.tile(reshape_plc_tbl, cnt)
 
-    tiled_data = [pa.array(col * cnt) for col in reshape_data]
+    tiled_data = [pa.array(col * cnt) for col in raw_data]
 
     expect = pa.Table.from_arrays(
         tiled_data, schema=plc.interop.to_arrow(reshape_plc_tbl).schema
diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py b/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py
index 818d6e6e72a..c4e437fe5d9 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py
+++ b/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py
@@ -8,39 +8,38 @@
 
 
 @pytest.fixture(scope="module")
-def pa_data():
-    data = [
-        "leopard",
-        "Golden Eagle",
-        "SNAKE",
-        "",
-        "!A",
-        "hello World",
-        "A B C",
-        "#",
-        "AƻB",
-        "Ⓑⓖ",
-        "Art of War",
-        "The quick bRoWn fox juMps over the laze DOG",
-        '123nr98nv9rev!$#INF4390v03n1243<>?}{:-"',
-        "accénted",
-        None,
-    ]
-    return pa.array(data)
-
-
-@pytest.fixture(scope="module")
-def plc_data(pa_data):
-    return plc.interop.from_arrow(pa_data)
+def str_data():
+    pa_data = pa.array(
+        [
+            "leopard",
+            "Golden Eagle",
+            "SNAKE",
+            "",
+            "!A",
+            "hello World",
+            "A B C",
+            "#",
+            "AƻB",
+            "Ⓑⓖ",
+            "Art of War",
+            "The quick bRoWn fox juMps over the laze DOG",
+            '123nr98nv9rev!$#INF4390v03n1243<>?}{:-"',
+            "accénted",
+            None,
+        ]
+    )
+    return pa_data, plc.interop.from_arrow(pa_data)
 
 
-def test_capitalize(plc_data, pa_data):
+def test_capitalize(str_data):
+    pa_data, plc_data = str_data
     got = plc.strings.capitalize.capitalize(plc_data)
     expected = pa.compute.utf8_capitalize(pa_data)
     assert_column_eq(expected, got)
 
 
-def test_title(plc_data, pa_data):
+def test_title(str_data):
+    pa_data, plc_data = str_data
     got = plc.strings.capitalize.title(
         plc_data, plc.strings.char_types.StringCharacterTypes.CASE_TYPES
     )
@@ -48,7 +47,8 @@ def test_title(plc_data, pa_data):
     assert_column_eq(expected, got)
 
 
-def test_is_title(plc_data, pa_data):
+def test_is_title(str_data):
+    pa_data, plc_data = str_data
     got = plc.strings.capitalize.is_title(plc_data)
     expected = pa.compute.utf8_is_title(pa_data)
     assert_column_eq(expected, got)
diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_contains.py b/python/cudf/cudf/pylibcudf_tests/test_string_contains.py
index 8cdb6f7c521..fc8c6656b5d 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_string_contains.py
+++ b/python/cudf/cudf/pylibcudf_tests/test_string_contains.py
@@ -8,15 +8,11 @@
 
 
 @pytest.fixture(scope="module")
-def pa_target_col():
-    return pa.array(
+def target_col():
+    pa_array = pa.array(
         ["AbC", "de", "FGHI", "j", "kLm", "nOPq", None, "RsT", None, "uVw"]
     )
-
-
-@pytest.fixture(scope="module")
-def plc_target_col(pa_target_col):
-    return plc.interop.from_arrow(pa_target_col)
+    return pa_array, plc.interop.from_arrow(pa_array)
 
 
 @pytest.fixture(
@@ -45,9 +41,8 @@ def plc_target_pat(pa_target_scalar):
     return prog
 
 
-def test_contains_re(
-    pa_target_col, plc_target_col, pa_target_scalar, plc_target_pat
-):
+def test_contains_re(target_col, pa_target_scalar, plc_target_pat):
+    pa_target_col, plc_target_col = target_col
     got = plc.strings.contains.contains_re(plc_target_col, plc_target_pat)
     expected = pa.compute.match_substring_regex(
         pa_target_col, pa_target_scalar.as_py()
diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_find.py b/python/cudf/cudf/pylibcudf_tests/test_string_find.py
index 44900044184..95a1a3cf731 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_string_find.py
+++ b/python/cudf/cudf/pylibcudf_tests/test_string_find.py
@@ -8,8 +8,8 @@
 
 
 @pytest.fixture(scope="module")
-def pa_data_col():
-    return pa.array(
+def data_col():
+    pa_array = pa.array(
         [
             "abc123",
             "ABC123",
@@ -53,16 +53,12 @@ def pa_data_col():
             None,
         ]
     )
+    return pa_array, plc.interop.from_arrow(pa_array)
 
 
 @pytest.fixture(scope="module")
-def plc_data_col(pa_data_col):
-    return plc.interop.from_arrow(pa_data_col)
-
-
-@pytest.fixture(scope="module")
-def pa_target_col():
-    return pa.array(
+def target_col():
+    pa_array = pa.array(
         [
             "a",
             "B",
@@ -106,24 +102,18 @@ def pa_target_col():
             None,  # ends_with
         ]
     )
-
-
-@pytest.fixture(scope="module")
-def plc_target_col(pa_target_col):
-    return plc.interop.from_arrow(pa_target_col)
+    return pa_array, plc.interop.from_arrow(pa_array)
 
 
 @pytest.fixture(params=["a", " ", "A", "Ab", "23"], scope="module")
-def pa_target_scalar(request):
-    return pa.scalar(request.param, type=pa.string())
-
-
-@pytest.fixture(scope="module")
-def plc_target_scalar(pa_target_scalar):
-    return plc.interop.from_arrow(pa_target_scalar)
+def target_scalar(request):
+    pa_scalar = pa.scalar(request.param, type=pa.string())
+    return pa_scalar, plc.interop.from_arrow(pa_scalar)
 
 
-def test_find(pa_data_col, plc_data_col, pa_target_scalar, plc_target_scalar):
+def test_find(data_col, target_scalar):
+    pa_data_col, plc_data_col = data_col
+    pa_target_scalar, plc_target_scalar = target_scalar
     got = plc.strings.find.find(plc_data_col, plc_target_scalar, 0, -1)
 
     expected = pa.array(
@@ -161,7 +151,9 @@ def handle_none(st, target):
     return expected
 
 
-def test_find_column(pa_data_col, pa_target_col, plc_data_col, plc_target_col):
+def test_find_column(data_col, target_col):
+    pa_data_col, plc_data_col = data_col
+    pa_target_col, plc_target_col = target_col
     expected = pa.array(
         [
             elem.find(target) if not (elem is None or target is None) else None
@@ -177,7 +169,9 @@ def test_find_column(pa_data_col, pa_target_col, plc_data_col, plc_target_col):
     assert_column_eq(expected, got)
 
 
-def test_rfind(pa_data_col, plc_data_col, pa_target_scalar, plc_target_scalar):
+def test_rfind(data_col, target_scalar):
+    pa_data_col, plc_data_col = data_col
+    pa_target_scalar, plc_target_scalar = target_scalar
     py_target = pa_target_scalar.as_py()
 
     got = plc.strings.find.rfind(plc_data_col, plc_target_scalar, 0, -1)
@@ -195,9 +189,9 @@ def test_rfind(pa_data_col, plc_data_col, pa_target_scalar, plc_target_scalar):
     assert_column_eq(expected, got)
 
 
-def test_contains(
-    pa_data_col, plc_data_col, pa_target_scalar, plc_target_scalar
-):
+def test_contains(data_col, target_scalar):
+    pa_data_col, plc_data_col = data_col
+    pa_target_scalar, plc_target_scalar = target_scalar
     py_target = pa_target_scalar.as_py()
 
     got = plc.strings.find.contains(plc_data_col, plc_target_scalar)
@@ -214,9 +208,9 @@ def test_contains(
     assert_column_eq(expected, got)
 
 
-def test_contains_column(
-    pa_data_col, pa_target_col, plc_data_col, plc_target_col
-):
+def test_contains_column(data_col, target_col):
+    pa_data_col, plc_data_col = data_col
+    pa_target_col, plc_target_col = target_col
     expected = colwise_apply(
         pa_data_col, pa_target_col, lambda st, target: target in st
     )
@@ -224,18 +218,18 @@ def test_contains_column(
     assert_column_eq(expected, got)
 
 
-def test_starts_with(
-    pa_data_col, plc_data_col, pa_target_scalar, plc_target_scalar
-):
+def test_starts_with(data_col, target_scalar):
+    pa_data_col, plc_data_col = data_col
+    pa_target_scalar, plc_target_scalar = target_scalar
     py_target = pa_target_scalar.as_py()
     got = plc.strings.find.starts_with(plc_data_col, plc_target_scalar)
     expected = pa.compute.starts_with(pa_data_col, py_target)
     assert_column_eq(expected, got)
 
 
-def test_starts_with_column(
-    pa_data_col, pa_target_col, plc_data_col, plc_target_col
-):
+def test_starts_with_column(data_col, target_col):
+    pa_data_col, plc_data_col = data_col
+    pa_target_col, plc_target_col = target_col
     expected = colwise_apply(
         pa_data_col, pa_target_col, lambda st, target: st.startswith(target)
     )
@@ -243,18 +237,18 @@ def test_starts_with_column(
     assert_column_eq(expected, got)
 
 
-def test_ends_with(
-    pa_data_col, plc_data_col, pa_target_scalar, plc_target_scalar
-):
+def test_ends_with(data_col, target_scalar):
+    pa_data_col, plc_data_col = data_col
+    pa_target_scalar, plc_target_scalar = target_scalar
     py_target = pa_target_scalar.as_py()
     got = plc.strings.find.ends_with(plc_data_col, plc_target_scalar)
     expected = pa.compute.ends_with(pa_data_col, py_target)
     assert_column_eq(expected, got)
 
 
-def test_ends_with_column(
-    pa_data_col, pa_target_col, plc_data_col, plc_target_col
-):
+def test_ends_with_column(data_col, target_col):
+    pa_data_col, plc_data_col = data_col
+    pa_target_col, plc_target_col = target_col
     expected = colwise_apply(
         pa_data_col, pa_target_col, lambda st, target: st.endswith(target)
     )

From 22ac996dea6f297736c9fd8cda735c0e7a5dbe43 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Tue, 11 Jun 2024 16:30:09 +0100
Subject: [PATCH 335/842] Remove `Scalar` container type from polars
 interpreter (#15953)

Now we always return columns and, where usage of a scalar might be
correct (for example broadcasting in binops), we check if the column
is "actually" a scalar and extract it.

This is slightly annoying because we have to introspect things in
various places. But without changing libcudf to treat length-1 columns
as always broadcastable like scalars this is, I think, the best we can
do.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - https://github.com/brandon-b-miller
  - James Lamb (https://github.com/jameslamb)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15953
---
 python/cudf_polars/cudf_polars/__init__.py    |   8 +-
 .../cudf_polars/containers/__init__.py        |   3 +-
 .../cudf_polars/containers/column.py          |  28 ++++-
 .../cudf_polars/containers/dataframe.py       |   6 +-
 .../cudf_polars/containers/scalar.py          |  23 ----
 python/cudf_polars/cudf_polars/dsl/expr.py    | 114 +++++++++++-------
 python/cudf_polars/cudf_polars/dsl/ir.py      |  75 +++++++++---
 .../cudf_polars/cudf_polars/dsl/translate.py  |   4 +-
 .../cudf_polars/cudf_polars/utils/sorting.py  |   2 +-
 python/cudf_polars/pyproject.toml             |   3 -
 python/cudf_polars/tests/utils/__init__.py    |   6 +
 .../cudf_polars/tests/utils/test_broadcast.py |  74 ++++++++++++
 12 files changed, 249 insertions(+), 97 deletions(-)
 delete mode 100644 python/cudf_polars/cudf_polars/containers/scalar.py
 create mode 100644 python/cudf_polars/tests/utils/__init__.py
 create mode 100644 python/cudf_polars/tests/utils/test_broadcast.py

diff --git a/python/cudf_polars/cudf_polars/__init__.py b/python/cudf_polars/cudf_polars/__init__.py
index b19a282129a..41d06f8631b 100644
--- a/python/cudf_polars/cudf_polars/__init__.py
+++ b/python/cudf_polars/cudf_polars/__init__.py
@@ -10,7 +10,13 @@
 
 from __future__ import annotations
 
+from cudf_polars._version import __git_commit__, __version__
 from cudf_polars.callback import execute_with_cudf
 from cudf_polars.dsl.translate import translate_ir
 
-__all__: list[str] = ["execute_with_cudf", "translate_ir"]
+__all__: list[str] = [
+    "execute_with_cudf",
+    "translate_ir",
+    "__git_commit__",
+    "__version__",
+]
diff --git a/python/cudf_polars/cudf_polars/containers/__init__.py b/python/cudf_polars/cudf_polars/containers/__init__.py
index ee69e748eb5..06bb08953f1 100644
--- a/python/cudf_polars/cudf_polars/containers/__init__.py
+++ b/python/cudf_polars/cudf_polars/containers/__init__.py
@@ -5,8 +5,7 @@
 
 from __future__ import annotations
 
-__all__: list[str] = ["DataFrame", "Column", "NamedColumn", "Scalar"]
+__all__: list[str] = ["DataFrame", "Column", "NamedColumn"]
 
 from cudf_polars.containers.column import Column, NamedColumn
 from cudf_polars.containers.dataframe import DataFrame
-from cudf_polars.containers.scalar import Scalar
diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py
index 575d15d3ece..156dd395d64 100644
--- a/python/cudf_polars/cudf_polars/containers/column.py
+++ b/python/cudf_polars/cudf_polars/containers/column.py
@@ -17,12 +17,13 @@
 
 
 class Column:
-    """A column with sortedness metadata."""
+    """An immutable column with sortedness metadata."""
 
     obj: plc.Column
     is_sorted: plc.types.Sorted
     order: plc.types.Order
     null_order: plc.types.NullOrder
+    is_scalar: bool
 
     def __init__(
         self,
@@ -33,10 +34,33 @@ def __init__(
         null_order: plc.types.NullOrder = plc.types.NullOrder.BEFORE,
     ):
         self.obj = column
+        self.is_scalar = self.obj.size() == 1
+        if self.obj.size() <= 1:
+            is_sorted = plc.types.Sorted.YES
         self.is_sorted = is_sorted
         self.order = order
         self.null_order = null_order
 
+    @functools.cached_property
+    def obj_scalar(self) -> plc.Scalar:
+        """
+        A copy of the column object as a pylibcudf Scalar.
+
+        Returns
+        -------
+        pylibcudf Scalar object.
+
+        Raises
+        ------
+        ValueError
+            If the column is not length-1.
+        """
+        if not self.is_scalar:
+            raise ValueError(
+                f"Cannot convert a column of length {self.obj.size()} to scalar"
+            )
+        return plc.copying.get_element(self.obj, 0)
+
     def sorted_like(self, like: Column, /) -> Self:
         """
         Copy sortedness properties from a column onto self.
@@ -81,6 +105,8 @@ def set_sorted(
         -------
         Self with metadata set.
         """
+        if self.obj.size() <= 1:
+            is_sorted = plc.types.Sorted.YES
         self.is_sorted = is_sorted
         self.order = order
         self.null_order = null_order
diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py
index ac7e748095e..7039fcaf077 100644
--- a/python/cudf_polars/cudf_polars/containers/dataframe.py
+++ b/python/cudf_polars/cudf_polars/containers/dataframe.py
@@ -32,7 +32,7 @@ class DataFrame:
     """A representation of a dataframe."""
 
     columns: list[NamedColumn]
-    table: plc.Table | None
+    table: plc.Table
 
     def __init__(self, columns: Sequence[NamedColumn]) -> None:
         self.columns = list(columns)
@@ -41,7 +41,7 @@ def __init__(self, columns: Sequence[NamedColumn]) -> None:
 
     def copy(self) -> Self:
         """Return a shallow copy of self."""
-        return type(self)(self.columns)
+        return type(self)([c.copy() for c in self.columns])
 
     def to_polars(self) -> pl.DataFrame:
         """Convert to a polars DataFrame."""
@@ -70,8 +70,6 @@ def num_columns(self) -> int:
     @cached_property
     def num_rows(self) -> int:
         """Number of rows."""
-        if self.table is None:
-            raise ValueError("Number of rows of frame with scalars makes no sense")
         return self.table.num_rows()
 
     @classmethod
diff --git a/python/cudf_polars/cudf_polars/containers/scalar.py b/python/cudf_polars/cudf_polars/containers/scalar.py
deleted file mode 100644
index fc97d0fd9c2..00000000000
--- a/python/cudf_polars/cudf_polars/containers/scalar.py
+++ /dev/null
@@ -1,23 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
-# SPDX-License-Identifier: Apache-2.0
-
-"""A scalar, with some properties."""
-
-from __future__ import annotations
-
-from typing import TYPE_CHECKING
-
-if TYPE_CHECKING:
-    import cudf._lib.pylibcudf as plc
-
-__all__: list[str] = ["Scalar"]
-
-
-class Scalar:
-    """A scalar, and a name."""
-
-    __slots__ = ("obj", "name")
-    obj: plc.Scalar
-
-    def __init__(self, scalar: plc.Scalar):
-        self.obj = scalar
diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py
index 6d9435ce373..a81cdcbf0c3 100644
--- a/python/cudf_polars/cudf_polars/dsl/expr.py
+++ b/python/cudf_polars/cudf_polars/dsl/expr.py
@@ -5,7 +5,7 @@
 """
 DSL nodes for the polars expression language.
 
-An expression node is a function, `DataFrame -> Column` or `DataFrame -> Scalar`.
+An expression node is a function, `DataFrame -> Column`.
 
 The evaluation context is provided by a LogicalPlan node, and can
 affect the evaluation rule as well as providing the dataframe input.
@@ -26,7 +26,7 @@
 
 import cudf._lib.pylibcudf as plc
 
-from cudf_polars.containers import Column, NamedColumn, Scalar
+from cudf_polars.containers import Column, NamedColumn
 from cudf_polars.utils import sorting
 
 if TYPE_CHECKING:
@@ -165,7 +165,7 @@ def do_evaluate(
         *,
         context: ExecutionContext = ExecutionContext.FRAME,
         mapping: Mapping[Expr, Column] | None = None,
-    ) -> Column:  # TODO: return type is a lie for Literal
+    ) -> Column:
         """
         Evaluate this expression given a dataframe for context.
 
@@ -187,8 +187,7 @@ def do_evaluate(
 
         Returns
         -------
-        Column representing the evaluation of the expression (or maybe
-        a scalar).
+        Column representing the evaluation of the expression.
 
         Raises
         ------
@@ -205,7 +204,7 @@ def evaluate(
         *,
         context: ExecutionContext = ExecutionContext.FRAME,
         mapping: Mapping[Expr, Column] | None = None,
-    ) -> Column:  # TODO: return type is a lie for Literal
+    ) -> Column:
         """
         Evaluate this expression given a dataframe for context.
 
@@ -222,23 +221,13 @@ def evaluate(
 
         Notes
         -----
-        Individual subclasses should implement :meth:`do_allocate`,
+        Individual subclasses should implement :meth:`do_evaluate`,
         this method provides logic to handle lookups in the
         substitution mapping.
 
-        The typed return value of :class:`Column` is not true when
-        evaluating :class:`Literal` nodes (which instead produce
-        :class:`Scalar` objects). However, these duck-type to having a
-        pylibcudf container object inside them, and usually they end
-        up appearing in binary expressions which pylibcudf handles
-        appropriately since there are overloads for (column, scalar)
-        pairs. We don't have to handle (scalar, scalar) in binops
-        since the polars optimizer has a constant-folding pass.
-
         Returns
         -------
-        Column representing the evaluation of the expression (or maybe
-        a scalar).
+        Column representing the evaluation of the expression.
 
         Raises
         ------
@@ -319,24 +308,35 @@ def evaluate(
         context: ExecutionContext = ExecutionContext.FRAME,
         mapping: Mapping[Expr, Column] | None = None,
     ) -> NamedColumn:
-        """Evaluate this expression given a dataframe for context."""
+        """
+        Evaluate this expression given a dataframe for context.
+
+        Parameters
+        ----------
+        df
+            DataFrame providing context
+        context
+            Execution context
+        mapping
+            Substitution mapping
+
+        Returns
+        -------
+        NamedColumn attaching a name to an evaluated Column
+
+        See Also
+        --------
+        :meth:`Expr.evaluate` for details, this function just adds the
+        name to a column produced from an expression.
+        """
         obj = self.value.evaluate(df, context=context, mapping=mapping)
-        if isinstance(obj, Scalar):
-            return NamedColumn(
-                plc.Column.from_scalar(obj.obj, 1),
-                self.name,
-                is_sorted=plc.types.Sorted.YES,
-                order=plc.types.Order.ASCENDING,
-                null_order=plc.types.NullOrder.BEFORE,
-            )
-        else:
-            return NamedColumn(
-                obj.obj,
-                self.name,
-                is_sorted=obj.is_sorted,
-                order=obj.order,
-                null_order=obj.null_order,
-            )
+        return NamedColumn(
+            obj.obj,
+            self.name,
+            is_sorted=obj.is_sorted,
+            order=obj.order,
+            null_order=obj.null_order,
+        )
 
     def collect_agg(self, *, depth: int) -> AggInfo:
         """Collect information about aggregations in groupbys."""
@@ -363,7 +363,7 @@ def do_evaluate(
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
         # datatype of pyarrow scalar is correct by construction.
-        return Scalar(plc.interop.from_arrow(self.value))  # type: ignore
+        return Column(plc.Column.from_scalar(plc.interop.from_arrow(self.value), 1))
 
 
 class Col(Expr):
@@ -402,8 +402,14 @@ def do_evaluate(
         mapping: Mapping[Expr, Column] | None = None,
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
-        # TODO: type is wrong, and dtype
-        return df.num_rows  # type: ignore
+        return Column(
+            plc.Column.from_scalar(
+                plc.interop.from_arrow(
+                    pa.scalar(df.num_rows, type=plc.interop.to_arrow(self.dtype))
+                ),
+                1,
+            )
+        )
 
     def collect_agg(self, *, depth: int) -> AggInfo:
         """Collect information about aggregations in groupbys."""
@@ -664,10 +670,24 @@ def do_evaluate(
             return Column(plc.strings.case.to_upper(column.obj))
         elif self.name == pl_expr.StringFunction.EndsWith:
             column, suffix = columns
-            return Column(plc.strings.find.ends_with(column.obj, suffix.obj))
+            return Column(
+                plc.strings.find.ends_with(
+                    column.obj,
+                    suffix.obj_scalar
+                    if column.obj.size() != suffix.obj.size() and suffix.is_scalar
+                    else suffix.obj,
+                )
+            )
         elif self.name == pl_expr.StringFunction.StartsWith:
-            column, suffix = columns
-            return Column(plc.strings.find.starts_with(column.obj, suffix.obj))
+            column, prefix = columns
+            return Column(
+                plc.strings.find.starts_with(
+                    column.obj,
+                    prefix.obj_scalar
+                    if column.obj.size() != prefix.obj.size() and prefix.is_scalar
+                    else prefix.obj,
+                )
+            )
         else:
             raise NotImplementedError(f"StringFunction {self.name}")
 
@@ -875,9 +895,6 @@ def __init__(
         self, dtype: plc.DataType, name: str, options: Any, value: Expr
     ) -> None:
         super().__init__(dtype)
-        # TODO: fix polars name
-        if name == "nunique":
-            name = "n_unique"
         self.name = name
         self.options = options
         self.children = (value,)
@@ -1092,8 +1109,15 @@ def do_evaluate(
             child.evaluate(df, context=context, mapping=mapping)
             for child in self.children
         )
+        lop = left.obj
+        rop = right.obj
+        if left.obj.size() != right.obj.size():
+            if left.is_scalar:
+                lop = left.obj_scalar
+            elif right.is_scalar:
+                rop = right.obj_scalar
         return Column(
-            plc.binaryop.binary_operation(left.obj, right.obj, self.op, self.dtype),
+            plc.binaryop.binary_operation(lop, rop, self.op, self.dtype),
         )
 
     def collect_agg(self, *, depth: int) -> AggInfo:
diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py
index 665bbe5be41..0a6deb5698c 100644
--- a/python/cudf_polars/cudf_polars/dsl/ir.py
+++ b/python/cudf_polars/cudf_polars/dsl/ir.py
@@ -63,26 +63,58 @@
 def broadcast(
     *columns: NamedColumn, target_length: int | None = None
 ) -> list[NamedColumn]:
-    lengths = {column.obj.size() for column in columns}
-    if len(lengths - {1}) > 1:
-        raise RuntimeError("Mismatching column lengths")
+    """
+    Broadcast a sequence of columns to a common length.
+
+    Parameters
+    ----------
+    columns
+        Columns to broadcast.
+    target_length
+        Optional length to broadcast to. If not provided, uses the
+        non-unit length of existing columns.
+
+    Returns
+    -------
+    List of broadcasted columns all of the same length.
+
+    Raises
+    ------
+    RuntimeError
+        If broadcasting is not possible.
+
+    Notes
+    -----
+    In evaluation of a set of expressions, polars type-puns length-1
+    columns with scalars. When we insert these into a DataFrame
+    object, we need to ensure they are of equal length. This function
+    takes some columns, some of which may be length-1 and ensures that
+    all length-1 columns are broadcast to the length of the others.
+
+    Broadcasting is only possible if the set of lengths of the input
+    columns is a subset of ``{1, n}`` for some (fixed) ``n``. If
+    ``target_length`` is provided and not all columns are length-1
+    (i.e. ``n != 1``), then ``target_length`` must be equal to ``n``.
+    """
+    lengths: set[int] = {column.obj.size() for column in columns}
     if lengths == {1}:
         if target_length is None:
             return list(columns)
         nrows = target_length
-    elif len(lengths) == 1:
-        if target_length is not None:
-            assert target_length in lengths
-        return list(columns)
     else:
-        (nrows,) = lengths - {1}
-        if target_length is not None:
-            assert target_length == nrows
+        try:
+            (nrows,) = lengths.difference([1])
+        except ValueError as e:
+            raise RuntimeError("Mismatching column lengths") from e
+        if target_length is not None and nrows != target_length:
+            raise RuntimeError(
+                f"Cannot broadcast columns of length {nrows=} to {target_length=}"
+            )
     return [
         column
         if column.obj.size() != 1
         else NamedColumn(
-            plc.Column.from_scalar(plc.copying.get_element(column.obj, 0), nrows),
+            plc.Column.from_scalar(column.obj_scalar, nrows),
             column.name,
             is_sorted=plc.types.Sorted.YES,
             order=plc.types.Order.ASCENDING,
@@ -279,12 +311,16 @@ class Select(IR):
     """Input dataframe."""
     expr: list[expr.NamedExpr]
     """List of expressions to evaluate to form the new dataframe."""
+    should_broadcast: bool
+    """Should columns be broadcast?"""
 
     def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
         df = self.df.evaluate(cache=cache)
         # Handle any broadcasting
-        columns = broadcast(*(e.evaluate(df) for e in self.expr))
+        columns = [e.evaluate(df) for e in self.expr]
+        if self.should_broadcast:
+            columns = broadcast(*columns)
         return DataFrame(columns)
 
 
@@ -587,15 +623,24 @@ class HStack(IR):
     """Input dataframe."""
     columns: list[expr.NamedExpr]
     """List of expressions to produce new columns."""
+    should_broadcast: bool
+    """Should columns be broadcast?"""
 
     def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
         df = self.df.evaluate(cache=cache)
         columns = [c.evaluate(df) for c in self.columns]
-        # TODO: a bit of a hack, should inherit the should_broadcast
-        # property of polars' ProjectionOptions on the hstack node.
-        if not any(e.name.startswith("__POLARS_CSER_0x") for e in self.columns):
+        if self.should_broadcast:
             columns = broadcast(*columns, target_length=df.num_rows)
+        else:
+            # Polars ensures this is true, but let's make sure nothing
+            # went wrong. In this case, the parent node is a
+            # guaranteed to be a Select which will take care of making
+            # sure that everything is the same length. The result
+            # table that might have mismatching column lengths will
+            # never be turned into a pylibcudf Table with all columns
+            # by the Select, which is why this is safe.
+            assert all(e.name.startswith("__POLARS_CSER_0x") for e in self.columns)
         return df.with_columns(columns)
 
 
diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py
index 38107023365..adde3b1a9dc 100644
--- a/python/cudf_polars/cudf_polars/dsl/translate.py
+++ b/python/cudf_polars/cudf_polars/dsl/translate.py
@@ -122,7 +122,7 @@ def _(
     with set_node(visitor, node.input):
         inp = translate_ir(visitor, n=None)
         exprs = [translate_named_expr(visitor, n=e) for e in node.expr]
-    return ir.Select(schema, inp, exprs)
+    return ir.Select(schema, inp, exprs, node.should_broadcast)
 
 
 @_translate_ir.register
@@ -166,7 +166,7 @@ def _(
     with set_node(visitor, node.input):
         inp = translate_ir(visitor, n=None)
         exprs = [translate_named_expr(visitor, n=e) for e in node.exprs]
-    return ir.HStack(schema, inp, exprs)
+    return ir.HStack(schema, inp, exprs, node.should_broadcast)
 
 
 @_translate_ir.register
diff --git a/python/cudf_polars/cudf_polars/utils/sorting.py b/python/cudf_polars/cudf_polars/utils/sorting.py
index d35459db20d..24fd449dd88 100644
--- a/python/cudf_polars/cudf_polars/utils/sorting.py
+++ b/python/cudf_polars/cudf_polars/utils/sorting.py
@@ -30,7 +30,7 @@ def sort_order(
 
     Returns
     -------
-    tuple of column_order and null_precendence
+    tuple of column_order and null_precedence
     suitable for passing to sort routines
     """
     # Mimicking polars broadcast handling of descending
diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml
index 2faf8c3193f..11178a3be74 100644
--- a/python/cudf_polars/pyproject.toml
+++ b/python/cudf_polars/pyproject.toml
@@ -49,9 +49,6 @@ license-files = ["LICENSE"]
 [tool.setuptools.dynamic]
 version = {file = "cudf_polars/VERSION"}
 
-[tool.setuptools.packages.find]
-exclude = ["*tests*"]
-
 [tool.pytest.ini_options]
 xfail_strict = true
 
diff --git a/python/cudf_polars/tests/utils/__init__.py b/python/cudf_polars/tests/utils/__init__.py
new file mode 100644
index 00000000000..4611d642f14
--- /dev/null
+++ b/python/cudf_polars/tests/utils/__init__.py
@@ -0,0 +1,6 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+__all__: list[str] = []
diff --git a/python/cudf_polars/tests/utils/test_broadcast.py b/python/cudf_polars/tests/utils/test_broadcast.py
new file mode 100644
index 00000000000..69ad1e519e2
--- /dev/null
+++ b/python/cudf_polars/tests/utils/test_broadcast.py
@@ -0,0 +1,74 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+import pytest
+
+import cudf._lib.pylibcudf as plc
+
+from cudf_polars.containers import NamedColumn
+from cudf_polars.dsl.ir import broadcast
+
+
+@pytest.mark.parametrize("target", [4, None])
+def test_broadcast_all_scalar(target):
+    columns = [
+        NamedColumn(
+            plc.column_factories.make_numeric_column(
+                plc.DataType(plc.TypeId.INT8), 1, plc.MaskState.ALL_VALID
+            ),
+            f"col{i}",
+        )
+        for i in range(3)
+    ]
+    result = broadcast(*columns, target_length=target)
+    expected = 1 if target is None else target
+
+    assert all(column.obj.size() == expected for column in result)
+
+
+def test_invalid_target_length():
+    columns = [
+        NamedColumn(
+            plc.column_factories.make_numeric_column(
+                plc.DataType(plc.TypeId.INT8), 4, plc.MaskState.ALL_VALID
+            ),
+            f"col{i}",
+        )
+        for i in range(3)
+    ]
+    with pytest.raises(RuntimeError):
+        _ = broadcast(*columns, target_length=8)
+
+
+def test_broadcast_mismatching_column_lengths():
+    columns = [
+        NamedColumn(
+            plc.column_factories.make_numeric_column(
+                plc.DataType(plc.TypeId.INT8), i + 1, plc.MaskState.ALL_VALID
+            ),
+            f"col{i}",
+        )
+        for i in range(3)
+    ]
+    with pytest.raises(RuntimeError):
+        _ = broadcast(*columns)
+
+
+@pytest.mark.parametrize("nrows", [0, 5])
+def test_broadcast_with_scalars(nrows):
+    columns = [
+        NamedColumn(
+            plc.column_factories.make_numeric_column(
+                plc.DataType(plc.TypeId.INT8),
+                nrows if i == 0 else 1,
+                plc.MaskState.ALL_VALID,
+            ),
+            f"col{i}",
+        )
+        for i in range(3)
+    ]
+
+    result = broadcast(*columns)
+    assert all(column.obj.size() == nrows for column in result)

From 8efa64ea61905969423bbfcc11353817c7cc1bca Mon Sep 17 00:00:00 2001
From: "Richard (Rick) Zamora" <rzamora217@gmail.com>
Date: Tue, 11 Jun 2024 11:31:20 -0500
Subject: [PATCH 336/842] Fix `dask_cudf.read_parquet` regression for legacy
 timestamp data (#15929)

cudf does not currently support timezone-aware datetime columns. For example:

```python
    pdf = pd.DataFrame(
        {
            "time": pd.to_datetime(
                ["1996-01-02", "1996-12-01"],
                utc=True,
            ),
            "x": [1, 2],
        }
    )
    cudf.DataFrame.from_pandas(pdf)
```
```
NotImplementedError: cuDF does not yet support timezone-aware datetimes
```

However, `cudf.read_parquet` **does** allow you to read this same data from a Parquet file. This PR adds a simple fix to allow the same data to be read with `dask_cudf`. The dask_cudf version was previously "broken" because it relies on upstream pyarrow logic to construct `meta` as a pandas DataFrame (and then we just convert `meta` from pandas to cudf). As illustrated in the example above, this direct conversion is not allowed when one or more columns contain timezone information.

**Important Context**
The actual motivation for this PR is to fix a **regression** in 24.06+ for older parquet files containing "legacy" timestamp types (e.g. `TIMESTAMP_MILLIS` and `TIMESTAMP_MICROS`).  In `pyarrow 14.0.2` (used by cudf-24.04), these legacy types were not automatically translated to timezone-aware dtypes by pyarrow. In  `pyarrow 16.1.0` (used by cudf-24.06+),  the legacy types **ARE** automatically translated. Therefore, in moving from cudf-24.04 to cudf-24.06+, some `dask_cudf` users will find that they can no longer read the same parquet file containing legacy timestamp data.

I'm not entirely sure if cudf should always allow users to read Parquet data with timezone-aware dtypes (e.g. if the timezone is **not** utc), but it definitely makes sense for cudf to ignore automatic/unnecessary timezone translations.

Authors:
  - Richard (Rick) Zamora (https://github.com/rjzamora)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/15929
---
 python/dask_cudf/dask_cudf/io/parquet.py            | 5 +++++
 python/dask_cudf/dask_cudf/io/tests/test_parquet.py | 9 ++++-----
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py
index fc962670c47..ba8b1e89721 100644
--- a/python/dask_cudf/dask_cudf/io/parquet.py
+++ b/python/dask_cudf/dask_cudf/io/parquet.py
@@ -6,6 +6,7 @@
 from io import BufferedWriter, BytesIO, IOBase
 
 import numpy as np
+import pandas as pd
 from pyarrow import dataset as pa_ds, parquet as pq
 
 from dask import dataframe as dd
@@ -41,6 +42,10 @@ def _create_dd_meta(cls, dataset_info, **kwargs):
         meta_pd = super()._create_dd_meta(dataset_info, **kwargs)
 
         # Convert to cudf
+        # (drop unsupported timezone information)
+        for k, v in meta_pd.dtypes.items():
+            if isinstance(v, pd.DatetimeTZDtype) and v.tz is not None:
+                meta_pd[k] = meta_pd[k].dt.tz_localize(None)
         meta_cudf = cudf.from_pandas(meta_pd)
 
         # Re-set "object" dtypes to align with pa schema
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
index f3e3911e6c7..620a917109e 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py
@@ -610,9 +610,8 @@ def test_timezone_column(tmpdir):
         }
     )
     pdf.to_parquet(path)
+
+    # Check that `cudf` and `dask_cudf` results match
     got = dask_cudf.read_parquet(path)
-    # cudf.read_parquet does not support reading timezone aware types yet
-    assert got["time"].dtype == pd.DatetimeTZDtype("ns", "UTC")
-    got["time"] = got["time"].astype("datetime64[ns]")
-    expected = cudf.read_parquet(path)
-    dd.assert_eq(got, expected)
+    expect = cudf.read_parquet(path)
+    dd.assert_eq(got, expect)

From d844d670dfbfcbaeb673253f762bed7fbebf6c86 Mon Sep 17 00:00:00 2001
From: Ben Jarmak <104460670+jarmak-nv@users.noreply.github.com>
Date: Tue, 11 Jun 2024 13:05:01 -0400
Subject: [PATCH 337/842] Project automation bug fixes (#15971)

## Description
This PR resolves two bugs in the recent pr #15945

## external issue labeling
Recent runs show that it is labeling [issues
created](https://github.com/rapidsai/cudf/issues/15967) by team members
as `External`

Using graphQL to explore the authorAssociation shows
`"authorAssociation": "MEMBER"` - I've updated the permissions to be
specific to the job in an attempt to ensure that we have the permissions
we need. Testing this action in personal repos shows it works as
expected so not 100% on what's going on.

A PR was also unable to run due to the token only having read
permissions, so hopefully this is a two birds one stone fix.

It may be beneficial to re-run
https://github.com/rapidsai/cudf/actions/runs/9462546964/job/26065765728
with debug mode on to see if `author_association` is different to the
action (which would be concerning)

*edit test*

## project automation
This fixes the workflow incorrectly calling my personal workflows for
testing.


## Checklist
- [x] I am familiar with the [Contributing
Guidelines](https://github.com/rapidsai/cudf/blob/HEAD/CONTRIBUTING.md).
- [ ] ~New or existing tests cover these changes.~
- [ ] ~The documentation is up to date with these changes.~
---
 .github/workflows/external_issue_labeler.yml  | 25 +++++++++++--------
 .../workflows/pr_issue_status_automation.yml  |  2 +-
 2 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/external_issue_labeler.yml b/.github/workflows/external_issue_labeler.yml
index e6d987e9f34..81bc9b18296 100644
--- a/.github/workflows/external_issue_labeler.yml
+++ b/.github/workflows/external_issue_labeler.yml
@@ -20,36 +20,41 @@ on:
     types:
       - opened
 
-  pull_request:
+  pull_request_target:
     types:
       - opened
 
 env:
   GITHUB_TOKEN: ${{ github.token }}
 
-permissions:
-  issues: write
-  pull-requests: write
-
 jobs:
   Label-Issue:
     runs-on: ubuntu-latest
-    # Only run if the issue author is not part of RAPIDS
-    if: ${{ ! contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.issue.author_association)}}
+    permissions:
+      issues: write
+    if: github.event_name == 'issues'
     steps:
       - name: add-external-labels
+        # Only run if the issue author is not part of RAPIDS
+        if: ${{ ! contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.issue.author_association)}}
         run: |
+          echo ${{ github.event.issue.author_association }}
           issue_url=${{ github.event.issue.html_url }}
           gh issue edit ${issue_url} --add-label "External"
         continue-on-error: true
 
   Label-PR:
     runs-on: ubuntu-latest
-    # Only run if the issue author is not part of RAPIDS
-    if: ${{ ! contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.pull_request.author_association)}}
+    permissions:
+      pull-requests: write
+      issues: write
+    if: github.event_name == 'pull_request_target'
     steps:
       - name: add-external-labels
+        # Only run if the issue author is not part of RAPIDS
+        if: ${{ ! contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.pull_request.author_association)}}
         run: |
+            echo ${{ github.event.pull_request.author_association }}
             pr_url=${{ github.event.pull_request.html_url }}
             gh issue edit ${pr_url} --add-label "External"
-    continue-on-error: true
+        continue-on-error: true
diff --git a/.github/workflows/pr_issue_status_automation.yml b/.github/workflows/pr_issue_status_automation.yml
index aaece1bfa3e..837963c3286 100644
--- a/.github/workflows/pr_issue_status_automation.yml
+++ b/.github/workflows/pr_issue_status_automation.yml
@@ -50,7 +50,7 @@ jobs:
 
     update-sprint:
       # This job sets the PR and its linked issues to the current "Weekly Sprint"
-      uses: jarmak-nv/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@branch-24.08
+      uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@branch-24.08
       if: github.event.pull_request.state == 'open'
       needs: get-project-id
       with:

From dfa79d457138dcb9a70410e06c77c45a63ae0b25 Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Tue, 11 Jun 2024 14:58:06 -0400
Subject: [PATCH 338/842] Add a developer check for proxy objects (#15956)

Closes #15864

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15956
---
 docs/cudf/source/developer_guide/cudf_pandas.md  |  9 +++++++++
 python/cudf/cudf/pandas/__init__.py              |  5 +++--
 python/cudf/cudf/pandas/fast_slow_proxy.py       | 14 ++++++++++++++
 .../cudf/cudf_pandas_tests/test_cudf_pandas.py   | 16 +++++++++++++++-
 4 files changed, 41 insertions(+), 3 deletions(-)

diff --git a/docs/cudf/source/developer_guide/cudf_pandas.md b/docs/cudf/source/developer_guide/cudf_pandas.md
index 827ba18a4a4..a8a6d81d6fb 100644
--- a/docs/cudf/source/developer_guide/cudf_pandas.md
+++ b/docs/cudf/source/developer_guide/cudf_pandas.md
@@ -20,6 +20,7 @@ The "wrapped" types/classes are the Pandas and cuDF specific types that have bee
 Wrapped objects and proxy objects are instances of wrapped types and proxy types, respectively.
 In the snippet below `s1` and `s2` are wrapped objects and `s3` is a fast-slow proxy object.
 Also note that the module `xpd` is a wrapped module and contains cuDF and Pandas modules as attributes.
+To check if an object is a proxy type, we can use `cudf.pandas.is_proxy_object`.
   ```python
   import cudf.pandas
   cudf.pandas.install()
@@ -31,6 +32,14 @@ Also note that the module `xpd` is a wrapped module and contains cuDF and Pandas
   s1 = cudf.Series([1,2])
   s2 = pd.Series([1,2])
   s3 = xpd.Series([1,2])
+
+  from cudf.pandas import is_proxy_object
+
+  is_proxy_object(s1) # returns False
+
+  is_proxy_object(s2) # returns False
+
+  is_proxy_object(s3) # returns True
   ```
 
 ```{note}
diff --git a/python/cudf/cudf/pandas/__init__.py b/python/cudf/cudf/pandas/__init__.py
index f2e855ae55c..5b3785531d3 100644
--- a/python/cudf/cudf/pandas/__init__.py
+++ b/python/cudf/cudf/pandas/__init__.py
@@ -1,11 +1,12 @@
-# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES.
 # All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
+from .fast_slow_proxy import is_proxy_object
 from .magics import load_ipython_extension
 from .profiler import Profiler
 
-__all__ = ["Profiler", "load_ipython_extension", "install"]
+__all__ = ["Profiler", "load_ipython_extension", "install", "is_proxy_object"]
 
 
 LOADED = False
diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py
index 5f4cf2e6cc6..128913e5746 100644
--- a/python/cudf/cudf/pandas/fast_slow_proxy.py
+++ b/python/cudf/cudf/pandas/fast_slow_proxy.py
@@ -1185,6 +1185,20 @@ def _replace_closurevars(
     )
 
 
+def is_proxy_object(obj: Any) -> bool:
+    """Determine if an object is proxy object
+
+    Parameters
+    ----------
+    obj : object
+        Any python object.
+
+    """
+    if _FastSlowProxyMeta in type(type(obj)).__mro__:
+        return True
+    return False
+
+
 NUMPY_TYPES: Set[str] = set(np.sctypeDict.values())
 
 
diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
index 72e9ad5fca3..515a4714a5a 100644
--- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
+++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
@@ -20,7 +20,7 @@
 from pytz import utc
 
 from cudf.pandas import LOADED, Profiler
-from cudf.pandas.fast_slow_proxy import _Unusable
+from cudf.pandas.fast_slow_proxy import _Unusable, is_proxy_object
 
 if not LOADED:
     raise ImportError("These tests must be run with cudf.pandas loaded")
@@ -1488,3 +1488,17 @@ def mock_mean_none(self, *args, **kwargs):
 
 def test_excelwriter_pathlike():
     assert isinstance(pd.ExcelWriter("foo.xlsx"), os.PathLike)
+
+
+def test_is_proxy_object():
+    np_arr = np.array([1])
+
+    s1 = xpd.Series([1])
+    s2 = pd.Series([1])
+
+    np_arr_proxy = s1.to_numpy()
+
+    assert not is_proxy_object(np_arr)
+    assert is_proxy_object(np_arr_proxy)
+    assert is_proxy_object(s1)
+    assert not is_proxy_object(s2)

From f655602ecd8f254dfcee5eb0c790bd3336e83d7c Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Tue, 11 Jun 2024 15:59:20 -0700
Subject: [PATCH 339/842] Fix Cython typo preventing proper inheritance
 (#15978)

#15831 added new inheritance patterns to the Parquet options classes, but mirroring them perfectly in Cython proved problematic due to what appeared to be issues with Cython parsing of CRTP and inheritance. A deeper investigation revealed that the underlying issue was https://github.com/cython/cython/issues/6238. This PR applies the appropriate fix.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Thomas Li (https://github.com/lithomas1)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/15978
---
 .../_lib/pylibcudf/libcudf/io/parquet.pxd     | 24 ++++++-------------
 1 file changed, 7 insertions(+), 17 deletions(-)

diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd
index 36654457995..0ef6553db56 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd
@@ -123,7 +123,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
         ) except +
 
     cdef cppclass parquet_writer_options_builder_base[BuilderT, OptionsT]:
-        parquet_writer_options_builder() except +
+        parquet_writer_options_builder_base() except +
 
         BuilderT& metadata(
             cudf_io_types.table_input_metadata m
@@ -164,22 +164,6 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
         BuilderT& dictionary_policy(
             cudf_io_types.dictionary_policy val
         ) except +
-        # FIXME: the following two functions actually belong in
-        # parquet_writer_options_builder, but placing them there yields a
-        # "'parquet_writer_options_builder' is not a type identifier" error.
-        # This is probably a bug in cython since a simpler CRTP example that
-        # has methods returning references to a child class seem to work.
-        # Calling these from the chunked options builder will fail at compile
-        # time, so this should be safe.
-        # NOTE: these two are never actually called from libcudf. Instead these
-        # properties are set in the options after calling build(), so perhaps
-        # they can be removed.
-        BuilderT& partitions(
-            vector[cudf_io_types.partition_info] partitions
-        ) except +
-        BuilderT& column_chunks_file_paths(
-            vector[string] column_chunks_file_paths
-        ) except +
         OptionsT build() except +
 
     cdef cppclass parquet_writer_options_builder(
@@ -190,6 +174,12 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
             cudf_io_types.sink_info sink_,
             cudf_table_view.table_view table_
         ) except +
+        parquet_writer_options_builder& partitions(
+            vector[cudf_io_types.partition_info] partitions
+        ) except +
+        parquet_writer_options_builder& column_chunks_file_paths(
+            vector[string] column_chunks_file_paths
+        ) except +
 
     cdef unique_ptr[vector[uint8_t]] write_parquet(
         parquet_writer_options args

From 49e2a565ffb85479589406f622c74116d7f891c7 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 11 Jun 2024 20:27:54 -0400
Subject: [PATCH 340/842] Support large strings in
 cudf::io::text::multibyte_split (#15947)

Replaces int32 type used for building offsets in `cudf::io::text::multibyte_split()` to use the offsetalator instead.
This allows creating large strings columns from input text files.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Mark Harris (https://github.com/harrism)
  - Bradley Dice (https://github.com/bdice)
  - Karthikeyan (https://github.com/karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/15947
---
 cpp/src/io/text/multibyte_split.cu | 38 ++++++++++++++++--------------
 1 file changed, 20 insertions(+), 18 deletions(-)

diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu
index 976d735e010..9c406369068 100644
--- a/cpp/src/io/text/multibyte_split.cu
+++ b/cpp/src/io/text/multibyte_split.cu
@@ -20,6 +20,7 @@
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/detail/offsets_iterator_factory.cuh>
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/detail/utilities/stream_pool.hpp>
@@ -30,6 +31,7 @@
 #include <cudf/io/text/multibyte_split.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/detail/strings_column_factories.cuh>
+#include <cudf/strings/detail/utilities.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/span.hpp>
 
@@ -518,32 +520,37 @@ std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source
   bool const insert_end =
     not(last_row_offset.has_value() or
         (global_offsets.size() > 0 and global_offsets.back_element(stream) == chunk_offset));
-  rmm::device_uvector<int32_t> offsets{
-    global_offsets.size() + insert_begin + insert_end, stream, mr};
-  if (insert_begin) { offsets.set_element_to_zero_async(0, stream); }
-  if (insert_end) {
-    offsets.set_element(offsets.size() - 1, chunk_offset - *first_row_offset, stream);
-  }
+  auto const chars_bytes = chunk_offset - *first_row_offset;
+  auto offsets           = cudf::strings::detail::create_offsets_child_column(
+    chars_bytes, global_offsets.size() + insert_begin + insert_end, stream, mr);
+  auto offsets_itr =
+    cudf::detail::offsetalator_factory::make_output_iterator(offsets->mutable_view());
+  auto set_offset_value = [offsets_itr, stream](size_type index, int64_t value) {
+    cudf::detail::device_single_thread(
+      [offsets_itr, index, value] __device__() mutable { offsets_itr[index] = value; }, stream);
+  };
+  if (insert_begin) { set_offset_value(0, 0); }
+  if (insert_end) { set_offset_value(offsets->size() - 1, chars_bytes); }
   thrust::transform(rmm::exec_policy(stream),
                     global_offsets.begin(),
                     global_offsets.end(),
-                    offsets.begin() + insert_begin,
-                    cuda::proclaim_return_type<int32_t>(
+                    offsets_itr + insert_begin,
+                    cuda::proclaim_return_type<int64_t>(
                       [baseline = *first_row_offset] __device__(byte_offset global_offset) {
-                        return static_cast<int32_t>(global_offset - baseline);
+                        return (global_offset - baseline);
                       }));
-  auto string_count = offsets.size() - 1;
+  auto string_count = offsets->size() - 1;
   if (strip_delimiters) {
     auto it = cudf::detail::make_counting_transform_iterator(
       0,
       cuda::proclaim_return_type<thrust::pair<char*, int32_t>>(
-        [ofs        = offsets.data(),
+        [ofs        = cudf::detail::offsetalator_factory::make_input_iterator(offsets->view()),
          chars      = chars.data(),
          delim_size = static_cast<size_type>(delimiter.size()),
          last_row   = static_cast<size_type>(string_count) - 1,
          insert_end] __device__(size_type row) {
           auto const begin = ofs[row];
-          auto const len   = ofs[row + 1] - begin;
+          auto const len   = static_cast<size_type>(ofs[row + 1] - begin);
           if (row == last_row && insert_end) {
             return thrust::make_pair(chars + begin, len);
           } else {
@@ -552,12 +559,7 @@ std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source
         }));
     return cudf::strings::detail::make_strings_column(it, it + string_count, stream, mr);
   } else {
-    return cudf::make_strings_column(
-      string_count,
-      std::make_unique<cudf::column>(std::move(offsets), rmm::device_buffer{}, 0),
-      chars.release(),
-      0,
-      {});
+    return cudf::make_strings_column(string_count, std::move(offsets), chars.release(), 0, {});
   }
 }
 

From d2cd1d4411e1a16f5c989efff07643ca3411f8ab Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Tue, 11 Jun 2024 20:28:40 -0400
Subject: [PATCH 341/842] Migrate lists/combine to pylibcudf (#15928)

Part of #15162. concatenate_rows, concatenate_list_elements

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Thomas Li (https://github.com/lithomas1)

URL: https://github.com/rapidsai/cudf/pull/15928
---
 python/cudf/cudf/_lib/lists.pyx               | 46 ++++----------
 python/cudf/cudf/_lib/pylibcudf/lists.pxd     |  7 +++
 python/cudf/cudf/_lib/pylibcudf/lists.pyx     | 61 +++++++++++++++++++
 .../cudf/cudf/pylibcudf_tests/test_lists.py   | 46 ++++++++++++++
 4 files changed, 127 insertions(+), 33 deletions(-)
 create mode 100644 python/cudf/cudf/pylibcudf_tests/test_lists.py

diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx
index 656d92c1a4b..5d406f5c85f 100644
--- a/python/cudf/cudf/_lib/lists.pyx
+++ b/python/cudf/cudf/_lib/lists.pyx
@@ -9,11 +9,6 @@ from libcpp.utility cimport move
 from cudf._lib.column cimport Column
 from cudf._lib.pylibcudf.libcudf.column.column cimport column
 from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.lists.combine cimport (
-    concatenate_list_elements as cpp_concatenate_list_elements,
-    concatenate_null_policy,
-    concatenate_rows as cpp_concatenate_rows,
-)
 from cudf._lib.pylibcudf.libcudf.lists.contains cimport (
     contains,
     index_of as cpp_index_of,
@@ -32,7 +27,6 @@ from cudf._lib.pylibcudf.libcudf.lists.stream_compaction cimport (
     distinct as cpp_distinct,
 )
 from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
-from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
 from cudf._lib.pylibcudf.libcudf.types cimport (
     nan_equality,
     null_equality,
@@ -41,10 +35,7 @@ from cudf._lib.pylibcudf.libcudf.types cimport (
     size_type,
 )
 from cudf._lib.scalar cimport DeviceScalar
-from cudf._lib.utils cimport (
-    columns_from_pylibcudf_table,
-    table_view_from_columns,
-)
+from cudf._lib.utils cimport columns_from_pylibcudf_table
 
 from cudf._lib import pylibcudf
 
@@ -223,31 +214,20 @@ def index_of_column(Column col, Column search_keys):
 
 @acquire_spill_lock()
 def concatenate_rows(list source_columns):
-    cdef unique_ptr[column] c_result
-
-    cdef table_view c_table_view = table_view_from_columns(source_columns)
-
-    with nogil:
-        c_result = move(cpp_concatenate_rows(
-            c_table_view,
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    return Column.from_pylibcudf(
+        pylibcudf.lists.concatenate_rows(
+            pylibcudf.Table([
+                c.to_pylibcudf(mode="read") for c in source_columns
+            ])
+        )
+    )
 
 
 @acquire_spill_lock()
 def concatenate_list_elements(Column input_column, dropna=False):
-    cdef concatenate_null_policy policy = (
-        concatenate_null_policy.IGNORE if dropna
-        else concatenate_null_policy.NULLIFY_OUTPUT_ROW
+    return Column.from_pylibcudf(
+        pylibcudf.lists.concatenate_list_elements(
+            input_column.to_pylibcudf(mode="read"),
+            dropna,
+        )
     )
-    cdef column_view c_input = input_column.view()
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(cpp_concatenate_list_elements(
-            c_input,
-            policy
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pxd b/python/cudf/cudf/_lib/pylibcudf/lists.pxd
index b780d299977..2d2a5b2a9ea 100644
--- a/python/cudf/cudf/_lib/pylibcudf/lists.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/lists.pxd
@@ -1,8 +1,15 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
+from libcpp cimport bool
+
 from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
+from .column cimport Column
 from .table cimport Table
 
 
 cpdef Table explode_outer(Table, size_type explode_column_idx)
+
+cpdef Column concatenate_rows(Table)
+
+cpdef Column concatenate_list_elements(Column, bool dropna)
diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pyx b/python/cudf/cudf/_lib/pylibcudf/lists.pyx
index 654f39742b6..069c9da31c2 100644
--- a/python/cudf/cudf/_lib/pylibcudf/lists.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/lists.pyx
@@ -1,12 +1,20 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
+from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
 from cudf._lib.pylibcudf.libcudf.lists cimport explode as cpp_explode
+from cudf._lib.pylibcudf.libcudf.lists.combine cimport (
+    concatenate_list_elements as cpp_concatenate_list_elements,
+    concatenate_null_policy,
+    concatenate_rows as cpp_concatenate_rows,
+)
 from cudf._lib.pylibcudf.libcudf.table.table cimport table
 from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
+from .column cimport Column
 from .table cimport Table
 
 
@@ -33,3 +41,56 @@ cpdef Table explode_outer(Table input, size_type explode_column_idx):
         c_result = move(cpp_explode.explode_outer(input.view(), explode_column_idx))
 
     return Table.from_libcudf(move(c_result))
+
+
+cpdef Column concatenate_rows(Table input):
+    """Concatenate multiple lists columns into a single lists column row-wise.
+
+    Parameters
+    ----------
+    input : Table
+        The input table
+
+    Returns
+    -------
+    Table
+        A new Column of concatenated rows
+    """
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(cpp_concatenate_rows(input.view()))
+
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column concatenate_list_elements(Column input, bool dropna):
+    """Concatenate multiple lists on the same row into a single list.
+
+    Parameters
+    ----------
+    input : Column
+        The input column
+
+    Returns
+    -------
+    Column
+        A new Column of concatenated list elements
+    dropna : bool
+        If true, null list elements will be ignored
+        from concatenation. Otherwise any input null values will result in
+        the corresponding output row being set to null.
+    """
+    cdef concatenate_null_policy null_policy = (
+        concatenate_null_policy.IGNORE if dropna
+        else concatenate_null_policy.NULLIFY_OUTPUT_ROW
+    )
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(cpp_concatenate_list_elements(
+            input.view(),
+            null_policy,
+        ))
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/cudf/cudf/pylibcudf_tests/test_lists.py b/python/cudf/cudf/pylibcudf_tests/test_lists.py
new file mode 100644
index 00000000000..b21af8ea11c
--- /dev/null
+++ b/python/cudf/cudf/pylibcudf_tests/test_lists.py
@@ -0,0 +1,46 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pytest
+from utils import assert_column_eq
+
+from cudf._lib import pylibcudf as plc
+
+
+def test_concatenate_rows():
+    test_data = [[[0, 1], [2], [5], [6, 7]], [[8], [9], [], [13, 14, 15]]]
+
+    arrow_tbl = pa.Table.from_arrays(test_data, names=["a", "b"])
+    plc_tbl = plc.interop.from_arrow(arrow_tbl)
+
+    res = plc.lists.concatenate_rows(plc_tbl)
+
+    expect = pa.array([pair[0] + pair[1] for pair in zip(*test_data)])
+
+    assert_column_eq(expect, res)
+
+
+@pytest.mark.parametrize(
+    "test_data, dropna, expected",
+    [
+        (
+            [[[1, 2], [3, 4], [5]], [[6], None, [7, 8, 9]]],
+            False,
+            [[1, 2, 3, 4, 5], None],
+        ),
+        (
+            [[[1, 2], [3, 4], [5, None]], [[6], [None], [7, 8, 9]]],
+            True,
+            [[1, 2, 3, 4, 5, None], [6, None, 7, 8, 9]],
+        ),
+    ],
+)
+def test_concatenate_list_elements(test_data, dropna, expected):
+    arr = pa.array(test_data)
+    plc_column = plc.interop.from_arrow(arr)
+
+    res = plc.lists.concatenate_list_elements(plc_column, dropna)
+
+    expect = pa.array(expected)
+
+    assert_column_eq(expect, res)

From f7ba6ab47ac994e6a1363119c01eee5dd6304181 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Tue, 11 Jun 2024 17:47:19 -0700
Subject: [PATCH 342/842] Pinned vector factory that uses the global pool
 (#15895)

closes https://github.com/rapidsai/cudf/issues/15612
Expanded the set of vector factories to cover pinned vectors. The functions return `cudf::detail::host_vector`, which use a type-erased allocator, allowing us to utilize the runtime configurable global pinned (previously host) resource.
The `pinned_host_vector` type has been removed as it can only support the non-pooled pinned allocations. Its use is not replaced with `cudf::detail::host_vector`.
Moved the global host (now pinned) resource out of cuIO and changed the type to host_device. User-specified resources are now required to allocate device-accessible memory. The name has been changed to pinned to reflect the new requirement.

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Alessandro Bellina (https://github.com/abellina)
  - Yunsong Wang (https://github.com/PointKernel)
  - Mark Harris (https://github.com/harrism)
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/15895
---
 cpp/CMakeLists.txt                            |   1 +
 cpp/benchmarks/fixture/nvbench_fixture.hpp    |  13 +-
 cpp/benchmarks/io/cuio_common.cpp             |  12 +
 cpp/benchmarks/io/cuio_common.hpp             |   4 +-
 .../io/parquet/parquet_reader_multithread.cpp |   2 +-
 cpp/benchmarks/io/text/multibyte_split.cpp    |  10 +-
 .../{rmm_host_vector.hpp => host_vector.hpp}  |  18 +-
 .../detail/utilities/pinned_host_vector.hpp   | 216 ------------------
 .../detail/utilities/vector_factories.hpp     |  38 ++-
 cpp/include/cudf/io/memory_resource.hpp       |  65 ------
 cpp/include/cudf/utilities/pinned_memory.hpp  |  58 +++++
 cpp/src/io/csv/reader_impl.cu                 |   1 +
 cpp/src/io/orc/reader_impl_chunking.cu        |   1 +
 cpp/src/io/orc/writer_impl.cu                 |   5 +-
 cpp/src/io/parquet/reader_impl_helpers.cpp    |   2 +
 cpp/src/io/parquet/writer_impl.cu             |   3 +-
 cpp/src/io/text/bgzip_data_chunk_source.cu    |  16 +-
 .../io/text/data_chunk_source_factories.cpp   |  51 ++---
 cpp/src/io/utilities/config_utils.cpp         | 214 +----------------
 cpp/src/io/utilities/hostdevice_vector.hpp    |   9 +-
 cpp/src/utilities/pinned_memory.cpp           | 216 ++++++++++++++++++
 cpp/tests/CMakeLists.txt                      |   5 +-
 cpp/tests/io/json_test.cpp                    |   6 +-
 .../utilities_tests/io_utilities_tests.cpp    |  45 ----
 .../utilities_tests/pinned_memory_tests.cpp   |  65 ++++++
 .../java/ai/rapids/cudf/PinnedMemoryPool.java |  12 +-
 java/src/main/java/ai/rapids/cudf/Rmm.java    |   2 +-
 java/src/main/native/src/RmmJni.cpp           |  34 +--
 28 files changed, 487 insertions(+), 637 deletions(-)
 rename cpp/include/cudf/detail/utilities/{rmm_host_vector.hpp => host_vector.hpp} (93%)
 delete mode 100644 cpp/include/cudf/detail/utilities/pinned_host_vector.hpp
 delete mode 100644 cpp/include/cudf/io/memory_resource.hpp
 create mode 100644 cpp/include/cudf/utilities/pinned_memory.hpp
 create mode 100644 cpp/src/utilities/pinned_memory.cpp
 create mode 100644 cpp/tests/utilities_tests/pinned_memory_tests.cpp

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index ca85996b990..aab0a9b2d49 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -664,6 +664,7 @@ add_library(
   src/utilities/default_stream.cpp
   src/utilities/linked_column.cpp
   src/utilities/logger.cpp
+  src/utilities/pinned_memory.cpp
   src/utilities/stacktrace.cpp
   src/utilities/stream_pool.cpp
   src/utilities/traits.cpp
diff --git a/cpp/benchmarks/fixture/nvbench_fixture.hpp b/cpp/benchmarks/fixture/nvbench_fixture.hpp
index ebcbcb17e98..df1492690bb 100644
--- a/cpp/benchmarks/fixture/nvbench_fixture.hpp
+++ b/cpp/benchmarks/fixture/nvbench_fixture.hpp
@@ -15,8 +15,8 @@
  */
 #pragma once
 
-#include <cudf/io/memory_resource.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/pinned_memory.hpp>
 
 #include <rmm/cuda_device.hpp>
 #include <rmm/mr/device/arena_memory_resource.hpp>
@@ -81,17 +81,18 @@ struct nvbench_base_fixture {
               "\nExpecting: cuda, pool, async, arena, managed, or managed_pool");
   }
 
-  inline rmm::host_async_resource_ref make_cuio_host_pinned()
+  inline rmm::host_device_async_resource_ref make_cuio_host_pinned()
   {
     static std::shared_ptr<rmm::mr::pinned_host_memory_resource> mr =
       std::make_shared<rmm::mr::pinned_host_memory_resource>();
     return *mr;
   }
 
-  inline rmm::host_async_resource_ref create_cuio_host_memory_resource(std::string const& mode)
+  inline rmm::host_device_async_resource_ref create_cuio_host_memory_resource(
+    std::string const& mode)
   {
     if (mode == "pinned") return make_cuio_host_pinned();
-    if (mode == "pinned_pool") return cudf::io::get_host_memory_resource();
+    if (mode == "pinned_pool") return cudf::get_pinned_memory_resource();
     CUDF_FAIL("Unknown cuio_host_mem parameter: " + mode + "\nExpecting: pinned or pinned_pool");
   }
 
@@ -112,14 +113,14 @@ struct nvbench_base_fixture {
     rmm::mr::set_current_device_resource(mr.get());
     std::cout << "RMM memory resource = " << rmm_mode << "\n";
 
-    cudf::io::set_host_memory_resource(create_cuio_host_memory_resource(cuio_host_mode));
+    cudf::set_pinned_memory_resource(create_cuio_host_memory_resource(cuio_host_mode));
     std::cout << "CUIO host memory resource = " << cuio_host_mode << "\n";
   }
 
   ~nvbench_base_fixture()
   {
     // Ensure the the pool is freed before the CUDA context is destroyed:
-    cudf::io::set_host_memory_resource(this->make_cuio_host_pinned());
+    cudf::set_pinned_memory_resource(this->make_cuio_host_pinned());
   }
 
   std::shared_ptr<rmm::mr::device_memory_resource> mr;
diff --git a/cpp/benchmarks/io/cuio_common.cpp b/cpp/benchmarks/io/cuio_common.cpp
index 37ced8ea703..645994f3f0d 100644
--- a/cpp/benchmarks/io/cuio_common.cpp
+++ b/cpp/benchmarks/io/cuio_common.cpp
@@ -19,6 +19,9 @@
 #include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/detail/utilities/logger.hpp>
 
+#include <rmm/mr/pinned_host_memory_resource.hpp>
+#include <rmm/resource_ref.hpp>
+
 #include <unistd.h>
 
 #include <cstdio>
@@ -28,6 +31,14 @@
 
 temp_directory const cuio_source_sink_pair::tmpdir{"cudf_gbench"};
 
+// Don't use cudf's pinned pool for the source data
+rmm::host_async_resource_ref pinned_memory_resource()
+{
+  static rmm::mr::pinned_host_memory_resource mr = rmm::mr::pinned_host_memory_resource{};
+
+  return mr;
+}
+
 std::string random_file_in_dir(std::string const& dir_path)
 {
   // `mkstemp` modifies the template in place
@@ -41,6 +52,7 @@ std::string random_file_in_dir(std::string const& dir_path)
 
 cuio_source_sink_pair::cuio_source_sink_pair(io_type type)
   : type{type},
+    pinned_buffer({pinned_memory_resource(), cudf::get_default_stream()}),
     d_buffer{0, cudf::get_default_stream()},
     file_name{random_file_in_dir(tmpdir.path())},
     void_sink{cudf::io::data_sink::create()}
diff --git a/cpp/benchmarks/io/cuio_common.hpp b/cpp/benchmarks/io/cuio_common.hpp
index d4f39a5f243..64d6021cf50 100644
--- a/cpp/benchmarks/io/cuio_common.hpp
+++ b/cpp/benchmarks/io/cuio_common.hpp
@@ -18,7 +18,7 @@
 
 #include <cudf_test/file_utilities.hpp>
 
-#include <cudf/detail/utilities/pinned_host_vector.hpp>
+#include <cudf/detail/utilities/host_vector.hpp>
 #include <cudf/io/data_sink.hpp>
 #include <cudf/io/datasource.hpp>
 
@@ -79,7 +79,7 @@ class cuio_source_sink_pair {
 
   io_type const type;
   std::vector<char> h_buffer;
-  cudf::detail::pinned_host_vector<char> pinned_buffer;
+  cudf::detail::host_vector<char> pinned_buffer;
   rmm::device_uvector<std::byte> d_buffer;
   std::string const file_name;
   std::unique_ptr<cudf::io::data_sink> void_sink;
diff --git a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp
index a67d1932951..b4c8ed78ed8 100644
--- a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp
+++ b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp
@@ -20,9 +20,9 @@
 #include <benchmarks/io/nvbench_helpers.hpp>
 
 #include <cudf/detail/utilities/stream_pool.hpp>
-#include <cudf/io/memory_resource.hpp>
 #include <cudf/io/parquet.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/pinned_memory.hpp>
 #include <cudf/utilities/thread_pool.hpp>
 
 #include <nvtx3/nvtx3.hpp>
diff --git a/cpp/benchmarks/io/text/multibyte_split.cpp b/cpp/benchmarks/io/text/multibyte_split.cpp
index b5d855d8881..67705863d41 100644
--- a/cpp/benchmarks/io/text/multibyte_split.cpp
+++ b/cpp/benchmarks/io/text/multibyte_split.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,7 +22,6 @@
 #include <cudf_test/file_utilities.hpp>
 
 #include <cudf/column/column_factories.hpp>
-#include <cudf/detail/utilities/pinned_host_vector.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/io/text/data_chunk_source_factories.hpp>
 #include <cudf/io/text/detail/bgzip_utils.hpp>
@@ -132,9 +131,10 @@ static void bench_multibyte_split(nvbench::state& state,
 
   auto const delim_factor = static_cast<double>(delim_percent) / 100;
   std::unique_ptr<cudf::io::datasource> datasource;
-  auto device_input      = create_random_input(file_size_approx, delim_factor, 0.05, delim);
-  auto host_input        = std::vector<char>{};
-  auto host_pinned_input = cudf::detail::pinned_host_vector<char>{};
+  auto device_input = create_random_input(file_size_approx, delim_factor, 0.05, delim);
+  auto host_input   = std::vector<char>{};
+  auto host_pinned_input =
+    cudf::detail::make_pinned_vector_async<char>(0, cudf::get_default_stream());
 
   if (source_type != data_chunk_source_type::device &&
       source_type != data_chunk_source_type::host_pinned) {
diff --git a/cpp/include/cudf/detail/utilities/rmm_host_vector.hpp b/cpp/include/cudf/detail/utilities/host_vector.hpp
similarity index 93%
rename from cpp/include/cudf/detail/utilities/rmm_host_vector.hpp
rename to cpp/include/cudf/detail/utilities/host_vector.hpp
index 6901a19473e..6a115177ab5 100644
--- a/cpp/include/cudf/detail/utilities/rmm_host_vector.hpp
+++ b/cpp/include/cudf/detail/utilities/host_vector.hpp
@@ -19,6 +19,7 @@
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
 
+#include <rmm/aligned.hpp>
 #include <rmm/resource_ref.hpp>
 
 #include <thrust/host_vector.h>
@@ -32,8 +33,6 @@ namespace cudf::detail {
 /*! \p rmm_host_allocator is a CUDA-specific host memory allocator
  *  that employs \c a `rmm::host_async_resource_ref` for allocation.
  *
- * This implementation is ported from pinned_host_vector in cudf.
- *
  *  \see https://en.cppreference.com/w/cpp/memory/allocator
  */
 template <typename T>
@@ -42,8 +41,6 @@ class rmm_host_allocator;
 /*! \p rmm_host_allocator is a CUDA-specific host memory allocator
  *  that employs \c an `cudf::host_async_resource_ref` for allocation.
  *
- * This implementation is ported from pinned_host_vector in cudf.
- *
  *  \see https://en.cppreference.com/w/cpp/memory/allocator
  */
 template <>
@@ -70,8 +67,7 @@ class rmm_host_allocator<void> {
  * The \p rmm_host_allocator provides an interface for host memory allocation through the user
  * provided \c `rmm::host_async_resource_ref`. The \p rmm_host_allocator does not take ownership of
  * this reference and therefore it is the user's responsibility to ensure its lifetime for the
- * duration of the lifetime of the \p rmm_host_allocator. This implementation is ported from
- * pinned_host_vector in cudf.
+ * duration of the lifetime of the \p rmm_host_allocator.
  *
  *  \see https://en.cppreference.com/w/cpp/memory/allocator
  */
@@ -121,8 +117,12 @@ class rmm_host_allocator {
   inline pointer allocate(size_type cnt)
   {
     if (cnt > this->max_size()) { throw std::bad_alloc(); }  // end if
-    return static_cast<pointer>(
-      mr.allocate_async(cnt * sizeof(value_type), rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream));
+    auto const result =
+      mr.allocate_async(cnt * sizeof(value_type), rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream);
+    // Synchronize to ensure the memory is allocated before thrust::host_vector initialization
+    // TODO: replace thrust::host_vector with a type that does not require synchronization
+    stream.synchronize();
+    return static_cast<pointer>(result);
   }
 
   /**
@@ -182,6 +182,6 @@ class rmm_host_allocator {
  * @brief A vector class with rmm host memory allocator
  */
 template <typename T>
-using rmm_host_vector = thrust::host_vector<T, rmm_host_allocator<T>>;
+using host_vector = thrust::host_vector<T, rmm_host_allocator<T>>;
 
 }  // namespace cudf::detail
diff --git a/cpp/include/cudf/detail/utilities/pinned_host_vector.hpp b/cpp/include/cudf/detail/utilities/pinned_host_vector.hpp
deleted file mode 100644
index c22b6a6ba15..00000000000
--- a/cpp/include/cudf/detail/utilities/pinned_host_vector.hpp
+++ /dev/null
@@ -1,216 +0,0 @@
-/*
- *  Copyright (c) 2008-2024, NVIDIA CORPORATION
- *
- *  Licensed under the Apache License, Version 2.0 (the "License");
- *  you may not use this file except in compliance with the License.
- *  You may obtain a copy of the License at
- *
- *      http://www.apache.org/licenses/LICENSE-2.0
- *
- *  Unless required by applicable law or agreed to in writing, software
- *  distributed under the License is distributed on an "AS IS" BASIS,
- *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- *  See the License for the specific language governing permissions and
- *  limitations under the License.
- */
-
-#pragma once
-
-#include <cudf/utilities/error.hpp>
-
-#include <thrust/host_vector.h>
-
-#include <cstddef>
-#include <limits>
-#include <new>  // for bad_alloc
-
-namespace cudf::detail {
-
-/*! \p pinned_allocator is a CUDA-specific host memory allocator
- *  that employs \c cudaMallocHost for allocation.
- *
- * This implementation is ported from the experimental/pinned_allocator
- * that Thrust used to provide.
- *
- *  \see https://en.cppreference.com/w/cpp/memory/allocator
- */
-template <typename T>
-class pinned_allocator;
-
-/*! \p pinned_allocator is a CUDA-specific host memory allocator
- *  that employs \c cudaMallocHost for allocation.
- *
- * This implementation is ported from the experimental/pinned_allocator
- * that Thrust used to provide.
- *
- *  \see https://en.cppreference.com/w/cpp/memory/allocator
- */
-template <>
-class pinned_allocator<void> {
- public:
-  using value_type      = void;            ///< The type of the elements in the allocator
-  using pointer         = void*;           ///< The type returned by address() / allocate()
-  using const_pointer   = void const*;     ///< The type returned by address()
-  using size_type       = std::size_t;     ///< The type used for the size of the allocation
-  using difference_type = std::ptrdiff_t;  ///< The type of the distance between two pointers
-
-  /**
-   * @brief converts a `pinned_allocator<void>` to `pinned_allocator<U>`
-   */
-  template <typename U>
-  struct rebind {
-    using other = pinned_allocator<U>;  ///< The rebound type
-  };
-};
-
-/*! \p pinned_allocator is a CUDA-specific host memory allocator
- *  that employs \c cudaMallocHost for allocation.
- *
- * This implementation is ported from the experimental/pinned_allocator
- * that Thrust used to provide.
- *
- *  \see https://en.cppreference.com/w/cpp/memory/allocator
- */
-template <typename T>
-class pinned_allocator {
- public:
-  using value_type      = T;               ///< The type of the elements in the allocator
-  using pointer         = T*;              ///< The type returned by address() / allocate()
-  using const_pointer   = T const*;        ///< The type returned by address()
-  using reference       = T&;              ///< The parameter type for address()
-  using const_reference = T const&;        ///< The parameter type for address()
-  using size_type       = std::size_t;     ///< The type used for the size of the allocation
-  using difference_type = std::ptrdiff_t;  ///< The type of the distance between two pointers
-
-  /**
-   * @brief converts a `pinned_allocator<T>` to `pinned_allocator<U>`
-   */
-  template <typename U>
-  struct rebind {
-    using other = pinned_allocator<U>;  ///< The rebound type
-  };
-
-  /**
-   * @brief pinned_allocator's null constructor does nothing.
-   */
-  __host__ __device__ inline pinned_allocator() {}
-
-  /**
-   * @brief pinned_allocator's null destructor does nothing.
-   */
-  __host__ __device__ inline ~pinned_allocator() {}
-
-  /**
-   * @brief pinned_allocator's copy constructor does nothing.
-   */
-  __host__ __device__ inline pinned_allocator(pinned_allocator const&) {}
-
-  /**
-   * @brief  pinned_allocator's copy constructor does nothing.
-   *
-   *  This version of pinned_allocator's copy constructor
-   *  is templated on the \c value_type of the pinned_allocator
-   *  to copy from.  It is provided merely for convenience; it
-   *  does nothing.
-   */
-  template <typename U>
-  __host__ __device__ inline pinned_allocator(pinned_allocator<U> const&)
-  {
-  }
-
-  /**
-   * @brief This method returns the address of a \c reference of
-   *  interest.
-   *
-   *  @param r The \c reference of interest.
-   *  @return \c r's address.
-   */
-  __host__ __device__ inline pointer address(reference r) { return &r; }
-
-  /**
-   * @brief This method returns the address of a \c const_reference
-   *  of interest.
-   *
-   *  @param r The \c const_reference of interest.
-   *  @return \c r's address.
-   */
-  __host__ __device__ inline const_pointer address(const_reference r) { return &r; }
-
-  /**
-   * @brief This method allocates storage for objects in pinned host
-   *  memory.
-   *
-   *  @param cnt The number of objects to allocate.
-   *  @return a \c pointer to the newly allocated objects.
-   *  @note The second parameter to this function is meant as a
-   *        hint pointer to a nearby memory location, but is
-   *        not used by this allocator.
-   *  @note This method does not invoke \p value_type's constructor.
-   *        It is the responsibility of the caller to initialize the
-   *        objects at the returned \c pointer.
-   */
-  __host__ inline pointer allocate(size_type cnt, const_pointer /*hint*/ = 0)
-  {
-    if (cnt > this->max_size()) { throw std::bad_alloc(); }  // end if
-
-    pointer result(0);
-    CUDF_CUDA_TRY(cudaMallocHost(reinterpret_cast<void**>(&result), cnt * sizeof(value_type)));
-    return result;
-  }
-
-  /**
-   * @brief This method deallocates pinned host memory previously allocated
-   *  with this \c pinned_allocator.
-   *
-   *  @param p A \c pointer to the previously allocated memory.
-   *  @note The second parameter is the number of objects previously allocated
-   *        but is ignored by this allocator.
-   *  @note This method does not invoke \p value_type's destructor.
-   *        It is the responsibility of the caller to destroy
-   *        the objects stored at \p p.
-   */
-  __host__ inline void deallocate(pointer p, size_type /*cnt*/)
-  {
-    auto dealloc_worked = cudaFreeHost(p);
-    (void)dealloc_worked;
-    assert(dealloc_worked == cudaSuccess);
-  }
-
-  /**
-   * @brief This method returns the maximum size of the \c cnt parameter
-   *  accepted by the \p allocate() method.
-   *
-   *  @return The maximum number of objects that may be allocated
-   *          by a single call to \p allocate().
-   */
-  inline size_type max_size() const { return (std::numeric_limits<size_type>::max)() / sizeof(T); }
-
-  /**
-   * @brief This method tests this \p pinned_allocator for equality to
-   *  another.
-   *
-   *  @param x The other \p pinned_allocator of interest.
-   *  @return This method always returns \c true.
-   */
-  __host__ __device__ inline bool operator==(pinned_allocator const& x) const { return true; }
-
-  /**
-   * @brief This method tests this \p pinned_allocator for inequality
-   *  to another.
-   *
-   *  @param x The other \p pinned_allocator of interest.
-   *  @return This method always returns \c false.
-   */
-  __host__ __device__ inline bool operator!=(pinned_allocator const& x) const
-  {
-    return !operator==(x);
-  }
-};
-
-/**
- * @brief A vector class with pinned host memory allocator
- */
-template <typename T>
-using pinned_host_vector = thrust::host_vector<T, pinned_allocator<T>>;
-
-}  // namespace cudf::detail
diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp
index 293a4096c57..20cb55bb1c7 100644
--- a/cpp/include/cudf/detail/utilities/vector_factories.hpp
+++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp
@@ -21,8 +21,10 @@
  * @file vector_factories.hpp
  */
 
+#include <cudf/detail/utilities/host_vector.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/pinned_memory.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -380,7 +382,7 @@ thrust::host_vector<T> make_host_vector_async(device_span<T const> v, rmm::cuda_
  * @brief Asynchronously construct a `std::vector` containing a copy of data from a device
  * container
  *
- * @note This function synchronizes `stream`.
+ * @note This function does not synchronize `stream`.
  *
  * @tparam Container The type of the container to copy from
  * @tparam T The type of the data to copy
@@ -439,6 +441,40 @@ thrust::host_vector<typename Container::value_type> make_host_vector_sync(
   return make_host_vector_sync(device_span<typename Container::value_type const>{c}, stream);
 }
 
+/**
+ * @brief Asynchronously construct a pinned `cudf::detail::host_vector` of the given size
+ *
+ * @note This function may not synchronize `stream`.
+ *
+ * @tparam T The type of the vector data
+ * @param size The number of elements in the created vector
+ * @param stream The stream on which to allocate memory
+ * @return A host_vector of the given size
+ */
+template <typename T>
+host_vector<T> make_pinned_vector_async(size_t size, rmm::cuda_stream_view stream)
+{
+  return host_vector<T>(size, {cudf::get_pinned_memory_resource(), stream});
+}
+
+/**
+ * @brief Synchronously construct a pinned `cudf::detail::host_vector` of the given size
+ *
+ * @note This function synchronizes `stream`.
+ *
+ * @tparam T The type of the vector data
+ * @param size The number of elements in the created vector
+ * @param stream The stream on which to allocate memory
+ * @return A host_vector of the given size
+ */
+template <typename T>
+host_vector<T> make_pinned_vector_sync(size_t size, rmm::cuda_stream_view stream)
+{
+  auto result = make_pinned_vector_async<T>(size, stream);
+  stream.synchronize();
+  return result;
+}
+
 }  // namespace detail
 
 }  // namespace cudf
diff --git a/cpp/include/cudf/io/memory_resource.hpp b/cpp/include/cudf/io/memory_resource.hpp
deleted file mode 100644
index a36e220ae7b..00000000000
--- a/cpp/include/cudf/io/memory_resource.hpp
+++ /dev/null
@@ -1,65 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <rmm/resource_ref.hpp>
-
-#include <optional>
-
-namespace cudf::io {
-
-/**
- * @brief Set the rmm resource to be used for host memory allocations by
- * cudf::detail::hostdevice_vector
- *
- * hostdevice_vector is a utility class that uses a pair of host and device-side buffers for
- * bouncing state between the cpu and the gpu. The resource set with this function (typically a
- * pinned memory allocator) is what it uses to allocate space for it's host-side buffer.
- *
- * @param mr The rmm resource to be used for host-side allocations
- * @return The previous resource that was in use
- */
-rmm::host_async_resource_ref set_host_memory_resource(rmm::host_async_resource_ref mr);
-
-/**
- * @brief Get the rmm resource being used for host memory allocations by
- * cudf::detail::hostdevice_vector
- *
- * @return The rmm resource used for host-side allocations
- */
-rmm::host_async_resource_ref get_host_memory_resource();
-
-/**
- * @brief Options to configure the default host memory resource
- */
-struct host_mr_options {
-  std::optional<size_t> pool_size;  ///< The size of the pool to use for the default host memory
-                                    ///< resource. If not set, the default pool size is used.
-};
-
-/**
- * @brief Configure the size of the default host memory resource.
- *
- * @throws cudf::logic_error if called after the default host memory resource has been created
- *
- * @param opts Options to configure the default host memory resource
- * @return True if this call successfully configured the host memory resource, false if a
- * a resource was already configured.
- */
-bool config_default_host_memory_resource(host_mr_options const& opts);
-
-}  // namespace cudf::io
diff --git a/cpp/include/cudf/utilities/pinned_memory.hpp b/cpp/include/cudf/utilities/pinned_memory.hpp
new file mode 100644
index 00000000000..b423eab6d38
--- /dev/null
+++ b/cpp/include/cudf/utilities/pinned_memory.hpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <rmm/resource_ref.hpp>
+
+#include <optional>
+
+namespace cudf {
+
+/**
+ * @brief Set the rmm resource to be used for pinned memory allocations.
+ *
+ * @param mr The rmm resource to be used for pinned allocations
+ * @return The previous resource that was in use
+ */
+rmm::host_device_async_resource_ref set_pinned_memory_resource(
+  rmm::host_device_async_resource_ref mr);
+
+/**
+ * @brief Get the rmm resource being used for pinned memory allocations.
+ *
+ * @return The rmm resource used for pinned allocations
+ */
+rmm::host_device_async_resource_ref get_pinned_memory_resource();
+
+/**
+ * @brief Options to configure the default pinned memory resource
+ */
+struct pinned_mr_options {
+  std::optional<size_t> pool_size;  ///< The size of the pool to use for the default pinned memory
+                                    ///< resource. If not set, the default pool size is used.
+};
+
+/**
+ * @brief Configure the size of the default pinned memory resource.
+ *
+ * @param opts Options to configure the default pinned memory resource
+ * @return True if this call successfully configured the pinned memory resource, false if a
+ * a resource was already configured.
+ */
+bool config_default_pinned_memory_resource(pinned_mr_options const& opts);
+
+}  // namespace cudf
diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu
index 5dee0c17a33..05faded651d 100644
--- a/cpp/src/io/csv/reader_impl.cu
+++ b/cpp/src/io/csv/reader_impl.cu
@@ -27,6 +27,7 @@
 #include "io/utilities/parsing_utils.cuh"
 
 #include <cudf/detail/utilities/cuda.cuh>
+#include <cudf/detail/utilities/logger.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/detail/utilities/visitor_overload.hpp>
 #include <cudf/io/csv.hpp>
diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu
index 5034aa14a95..43301826003 100644
--- a/cpp/src/io/orc/reader_impl_chunking.cu
+++ b/cpp/src/io/orc/reader_impl_chunking.cu
@@ -22,6 +22,7 @@
 
 #include <cudf/detail/timezone.hpp>
 #include <cudf/detail/utilities/integer_utils.hpp>
+#include <cudf/detail/utilities/logger.hpp>
 #include <cudf/utilities/error.hpp>
 
 #include <rmm/device_buffer.hpp>
diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu
index 344e216cdc8..e9e031a407a 100644
--- a/cpp/src/io/orc/writer_impl.cu
+++ b/cpp/src/io/orc/writer_impl.cu
@@ -27,7 +27,6 @@
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/utilities/cuda.cuh>
-#include <cudf/detail/utilities/pinned_host_vector.hpp>
 #include <cudf/detail/utilities/stream_pool.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/strings/strings_column_view.hpp>
@@ -2339,7 +2338,7 @@ auto convert_table_to_orc_data(table_view const& input,
                       std::move(streams),
                       std::move(stripes),
                       std::move(stripe_dicts.views),
-                      cudf::detail::pinned_host_vector<uint8_t>()};
+                      cudf::detail::make_pinned_vector_async<uint8_t>(0, stream)};
   }
 
   // Allocate intermediate output stream buffer
@@ -2407,7 +2406,7 @@ auto convert_table_to_orc_data(table_view const& input,
     return max_stream_size;
   }();
 
-  cudf::detail::pinned_host_vector<uint8_t> bounce_buffer(max_out_stream_size);
+  auto bounce_buffer = cudf::detail::make_pinned_vector_async<uint8_t>(max_out_stream_size, stream);
 
   auto intermediate_stats = gather_statistic_blobs(stats_freq, orc_table, segmentation, stream);
 
diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp
index eb653c6b9ac..9de8a9e2719 100644
--- a/cpp/src/io/parquet/reader_impl_helpers.cpp
+++ b/cpp/src/io/parquet/reader_impl_helpers.cpp
@@ -23,6 +23,8 @@
 #include "ipc/Message_generated.h"
 #include "ipc/Schema_generated.h"
 
+#include <cudf/detail/utilities/logger.hpp>
+
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/zip_iterator.h>
 
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index 1dfced94f5b..6d466748c17 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -36,7 +36,6 @@
 #include <cudf/detail/get_value.cuh>
 #include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/detail/utilities/linked_column.hpp>
-#include <cudf/detail/utilities/pinned_host_vector.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/lists/detail/dremel.hpp>
 #include <cudf/lists/lists_column_view.hpp>
@@ -2278,7 +2277,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
   }
 
   auto bounce_buffer =
-    cudf::detail::pinned_host_vector<uint8_t>(all_device_write ? 0 : max_write_size);
+    cudf::detail::make_pinned_vector_async<uint8_t>(all_device_write ? 0 : max_write_size, stream);
 
   return std::tuple{std::move(agg_meta),
                     std::move(pages),
diff --git a/cpp/src/io/text/bgzip_data_chunk_source.cu b/cpp/src/io/text/bgzip_data_chunk_source.cu
index faa09e586ab..0e3ce779089 100644
--- a/cpp/src/io/text/bgzip_data_chunk_source.cu
+++ b/cpp/src/io/text/bgzip_data_chunk_source.cu
@@ -19,8 +19,9 @@
 #include "io/utilities/config_utils.hpp"
 
 #include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/detail/utilities/host_vector.hpp>
 #include <cudf/detail/utilities/integer_utils.hpp>
-#include <cudf/detail/utilities/pinned_host_vector.hpp>
+#include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/io/text/data_chunk_source_factories.hpp>
 #include <cudf/io/text/detail/bgzip_utils.hpp>
 #include <cudf/utilities/default_stream.hpp>
@@ -66,7 +67,7 @@ struct bgzip_nvcomp_transform_functor {
 class bgzip_data_chunk_reader : public data_chunk_reader {
  private:
   template <typename T>
-  static void copy_to_device(cudf::detail::pinned_host_vector<T> const& host,
+  static void copy_to_device(cudf::detail::host_vector<T> const& host,
                              rmm::device_uvector<T>& device,
                              rmm::cuda_stream_view stream)
   {
@@ -84,9 +85,9 @@ class bgzip_data_chunk_reader : public data_chunk_reader {
       1 << 16;  // 64k offset allocation, resized on demand
 
     cudaEvent_t event;
-    cudf::detail::pinned_host_vector<char> h_compressed_blocks;
-    cudf::detail::pinned_host_vector<std::size_t> h_compressed_offsets;
-    cudf::detail::pinned_host_vector<std::size_t> h_decompressed_offsets;
+    cudf::detail::host_vector<char> h_compressed_blocks;
+    cudf::detail::host_vector<std::size_t> h_compressed_offsets;
+    cudf::detail::host_vector<std::size_t> h_decompressed_offsets;
     rmm::device_uvector<char> d_compressed_blocks;
     rmm::device_uvector<char> d_decompressed_blocks;
     rmm::device_uvector<std::size_t> d_compressed_offsets;
@@ -103,7 +104,10 @@ class bgzip_data_chunk_reader : public data_chunk_reader {
     bool is_decompressed{};
 
     decompression_blocks(rmm::cuda_stream_view init_stream)
-      : d_compressed_blocks(0, init_stream),
+      : h_compressed_blocks{cudf::detail::make_pinned_vector_async<char>(0, init_stream)},
+        h_compressed_offsets{cudf::detail::make_pinned_vector_async<std::size_t>(0, init_stream)},
+        h_decompressed_offsets{cudf::detail::make_pinned_vector_async<std::size_t>(0, init_stream)},
+        d_compressed_blocks(0, init_stream),
         d_decompressed_blocks(0, init_stream),
         d_compressed_offsets(0, init_stream),
         d_decompressed_offsets(0, init_stream),
diff --git a/cpp/src/io/text/data_chunk_source_factories.cpp b/cpp/src/io/text/data_chunk_source_factories.cpp
index 9d1d0498ace..596ca3458c8 100644
--- a/cpp/src/io/text/data_chunk_source_factories.cpp
+++ b/cpp/src/io/text/data_chunk_source_factories.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,10 +14,12 @@
  * limitations under the License.
  */
 
+#include "cudf/utilities/default_stream.hpp"
 #include "io/text/device_data_chunks.hpp"
 
 #include <cudf/detail/nvtx/ranges.hpp>
-#include <cudf/detail/utilities/pinned_host_vector.hpp>
+#include <cudf/detail/utilities/host_vector.hpp>
+#include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/io/text/data_chunk_source_factories.hpp>
 
 #include <rmm/device_buffer.hpp>
@@ -31,8 +33,15 @@ namespace cudf::io::text {
 namespace {
 
 struct host_ticket {
-  cudaEvent_t event;
-  cudf::detail::pinned_host_vector<char> buffer;
+  cudaEvent_t event{};  // tracks the completion of the last device-to-host copy.
+  cudf::detail::host_vector<char> buffer;
+
+  host_ticket() : buffer{cudf::detail::make_pinned_vector_sync<char>(0, cudf::get_default_stream())}
+  {
+    cudaEventCreate(&event);
+  }
+
+  ~host_ticket() { cudaEventDestroy(event); }
 };
 
 /**
@@ -43,20 +52,7 @@ class datasource_chunk_reader : public data_chunk_reader {
   constexpr static int num_tickets = 2;
 
  public:
-  datasource_chunk_reader(datasource* source) : _source(source)
-  {
-    // create an event to track the completion of the last device-to-host copy.
-    for (auto& ticket : _tickets) {
-      CUDF_CUDA_TRY(cudaEventCreate(&(ticket.event)));
-    }
-  }
-
-  ~datasource_chunk_reader() override
-  {
-    for (auto& ticket : _tickets) {
-      CUDF_CUDA_TRY(cudaEventDestroy(ticket.event));
-    }
-  }
+  datasource_chunk_reader(datasource* source) : _source(source) {}
 
   void skip_bytes(std::size_t size) override
   {
@@ -84,7 +80,9 @@ class datasource_chunk_reader : public data_chunk_reader {
       CUDF_CUDA_TRY(cudaEventSynchronize(h_ticket.event));
 
       // resize the host buffer as necessary to contain the requested number of bytes
-      if (h_ticket.buffer.size() < read_size) { h_ticket.buffer.resize(read_size); }
+      if (h_ticket.buffer.size() < read_size) {
+        h_ticket.buffer = cudf::detail::make_pinned_vector_sync<char>(read_size, stream);
+      }
 
       _source->host_read(_offset, read_size, reinterpret_cast<uint8_t*>(h_ticket.buffer.data()));
 
@@ -120,17 +118,6 @@ class istream_data_chunk_reader : public data_chunk_reader {
   istream_data_chunk_reader(std::unique_ptr<std::istream> datastream)
     : _datastream(std::move(datastream))
   {
-    // create an event to track the completion of the last device-to-host copy.
-    for (auto& ticket : _tickets) {
-      CUDF_CUDA_TRY(cudaEventCreate(&(ticket.event)));
-    }
-  }
-
-  ~istream_data_chunk_reader() override
-  {
-    for (auto& ticket : _tickets) {
-      CUDF_CUDA_TRY(cudaEventDestroy(ticket.event));
-    }
   }
 
   void skip_bytes(std::size_t size) override { _datastream->ignore(size); };
@@ -148,7 +135,9 @@ class istream_data_chunk_reader : public data_chunk_reader {
     CUDF_CUDA_TRY(cudaEventSynchronize(h_ticket.event));
 
     // resize the host buffer as necessary to contain the requested number of bytes
-    if (h_ticket.buffer.size() < read_size) { h_ticket.buffer.resize(read_size); }
+    if (h_ticket.buffer.size() < read_size) {
+      h_ticket.buffer = cudf::detail::make_pinned_vector_sync<char>(read_size, stream);
+    }
 
     // read data from the host istream in to the pinned host memory buffer
     _datastream->read(h_ticket.buffer.data(), read_size);
diff --git a/cpp/src/io/utilities/config_utils.cpp b/cpp/src/io/utilities/config_utils.cpp
index dad1135e766..20ac89b4d53 100644
--- a/cpp/src/io/utilities/config_utils.cpp
+++ b/cpp/src/io/utilities/config_utils.cpp
@@ -16,22 +16,12 @@
 
 #include "config_utils.hpp"
 
-#include <cudf/detail/utilities/stream_pool.hpp>
-#include <cudf/io/memory_resource.hpp>
 #include <cudf/utilities/error.hpp>
-#include <cudf/utilities/export.hpp>
-
-#include <rmm/cuda_device.hpp>
-#include <rmm/mr/device/pool_memory_resource.hpp>
-#include <rmm/mr/pinned_host_memory_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cstdlib>
 #include <string>
 
-namespace cudf::io {
-
-namespace detail {
+namespace cudf::io::detail {
 
 namespace cufile_integration {
 
@@ -90,204 +80,4 @@ bool is_stable_enabled() { return is_all_enabled() or get_env_policy() == usage_
 
 }  // namespace nvcomp_integration
 
-}  // namespace detail
-
-namespace {
-class fixed_pinned_pool_memory_resource {
-  using upstream_mr    = rmm::mr::pinned_host_memory_resource;
-  using host_pooled_mr = rmm::mr::pool_memory_resource<upstream_mr>;
-
- private:
-  upstream_mr upstream_mr_{};
-  size_t pool_size_{0};
-  // Raw pointer to avoid a segfault when the pool is destroyed on exit
-  host_pooled_mr* pool_{nullptr};
-  void* pool_begin_{nullptr};
-  void* pool_end_{nullptr};
-  cuda::stream_ref stream_{cudf::detail::global_cuda_stream_pool().get_stream().value()};
-
- public:
-  fixed_pinned_pool_memory_resource(size_t size)
-    : pool_size_{size}, pool_{new host_pooled_mr(upstream_mr_, size, size)}
-  {
-    if (pool_size_ == 0) { return; }
-
-    // Allocate full size from the pinned pool to figure out the beginning and end address
-    pool_begin_ = pool_->allocate_async(pool_size_, stream_);
-    pool_end_   = static_cast<void*>(static_cast<uint8_t*>(pool_begin_) + pool_size_);
-    pool_->deallocate_async(pool_begin_, pool_size_, stream_);
-  }
-
-  void* do_allocate_async(std::size_t bytes, std::size_t alignment, cuda::stream_ref stream)
-  {
-    if (bytes <= pool_size_) {
-      try {
-        return pool_->allocate_async(bytes, alignment, stream);
-      } catch (...) {
-        // If the pool is exhausted, fall back to the upstream memory resource
-      }
-    }
-
-    return upstream_mr_.allocate_async(bytes, alignment, stream);
-  }
-
-  void do_deallocate_async(void* ptr,
-                           std::size_t bytes,
-                           std::size_t alignment,
-                           cuda::stream_ref stream) noexcept
-  {
-    if (bytes <= pool_size_ && ptr >= pool_begin_ && ptr <= pool_end_) {
-      pool_->deallocate_async(ptr, bytes, alignment, stream);
-    } else {
-      upstream_mr_.deallocate_async(ptr, bytes, alignment, stream);
-    }
-  }
-
-  void* allocate_async(std::size_t bytes, cuda::stream_ref stream)
-  {
-    return do_allocate_async(bytes, rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream);
-  }
-
-  void* allocate_async(std::size_t bytes, std::size_t alignment, cuda::stream_ref stream)
-  {
-    return do_allocate_async(bytes, alignment, stream);
-  }
-
-  void* allocate(std::size_t bytes, std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT)
-  {
-    auto const result = do_allocate_async(bytes, alignment, stream_);
-    stream_.wait();
-    return result;
-  }
-
-  void deallocate_async(void* ptr, std::size_t bytes, cuda::stream_ref stream) noexcept
-  {
-    return do_deallocate_async(ptr, bytes, rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream);
-  }
-
-  void deallocate_async(void* ptr,
-                        std::size_t bytes,
-                        std::size_t alignment,
-                        cuda::stream_ref stream) noexcept
-  {
-    return do_deallocate_async(ptr, bytes, alignment, stream);
-  }
-
-  void deallocate(void* ptr,
-                  std::size_t bytes,
-                  std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) noexcept
-  {
-    deallocate_async(ptr, bytes, alignment, stream_);
-    stream_.wait();
-  }
-
-  bool operator==(fixed_pinned_pool_memory_resource const& other) const
-  {
-    return pool_ == other.pool_ and stream_ == other.stream_;
-  }
-
-  bool operator!=(fixed_pinned_pool_memory_resource const& other) const
-  {
-    return !operator==(other);
-  }
-
-  [[maybe_unused]] friend void get_property(fixed_pinned_pool_memory_resource const&,
-                                            cuda::mr::device_accessible) noexcept
-  {
-  }
-
-  [[maybe_unused]] friend void get_property(fixed_pinned_pool_memory_resource const&,
-                                            cuda::mr::host_accessible) noexcept
-  {
-  }
-};
-
-static_assert(cuda::mr::resource_with<fixed_pinned_pool_memory_resource,
-                                      cuda::mr::device_accessible,
-                                      cuda::mr::host_accessible>,
-              "");
-
-}  // namespace
-
-CUDF_EXPORT rmm::host_async_resource_ref& make_default_pinned_mr(std::optional<size_t> config_size)
-{
-  static fixed_pinned_pool_memory_resource mr = [config_size]() {
-    auto const size = [&config_size]() -> size_t {
-      if (auto const env_val = getenv("LIBCUDF_PINNED_POOL_SIZE"); env_val != nullptr) {
-        return std::atol(env_val);
-      }
-
-      if (config_size.has_value()) { return *config_size; }
-
-      size_t free{}, total{};
-      CUDF_CUDA_TRY(cudaMemGetInfo(&free, &total));
-      // 0.5% of the total device memory, capped at 100MB
-      return std::min(total / 200, size_t{100} * 1024 * 1024);
-    }();
-
-    // rmm requires the pool size to be a multiple of 256 bytes
-    auto const aligned_size = (size + 255) & ~255;
-    CUDF_LOG_INFO("Pinned pool size = {}", aligned_size);
-
-    // make the pool with max size equal to the initial size
-    return fixed_pinned_pool_memory_resource{aligned_size};
-  }();
-
-  static rmm::host_async_resource_ref mr_ref{mr};
-  return mr_ref;
-}
-
-CUDF_EXPORT std::mutex& host_mr_mutex()
-{
-  static std::mutex map_lock;
-  return map_lock;
-}
-
-// Must be called with the host_mr_mutex mutex held
-CUDF_EXPORT rmm::host_async_resource_ref& make_host_mr(std::optional<host_mr_options> const& opts,
-                                                       bool* did_configure = nullptr)
-{
-  static rmm::host_async_resource_ref* mr_ref = nullptr;
-  bool configured                             = false;
-  if (mr_ref == nullptr) {
-    configured = true;
-    mr_ref     = &make_default_pinned_mr(opts ? opts->pool_size : std::nullopt);
-  }
-
-  // If the user passed an out param to detect whether this call configured a resource
-  // set the result
-  if (did_configure != nullptr) { *did_configure = configured; }
-
-  return *mr_ref;
-}
-
-// Must be called with the host_mr_mutex mutex held
-CUDF_EXPORT rmm::host_async_resource_ref& host_mr()
-{
-  static rmm::host_async_resource_ref mr_ref = make_host_mr(std::nullopt);
-  return mr_ref;
-}
-
-rmm::host_async_resource_ref set_host_memory_resource(rmm::host_async_resource_ref mr)
-{
-  std::scoped_lock lock{host_mr_mutex()};
-  auto last_mr = host_mr();
-  host_mr()    = mr;
-  return last_mr;
-}
-
-rmm::host_async_resource_ref get_host_memory_resource()
-{
-  std::scoped_lock lock{host_mr_mutex()};
-  return host_mr();
-}
-
-bool config_default_host_memory_resource(host_mr_options const& opts)
-{
-  std::scoped_lock lock{host_mr_mutex()};
-  auto did_configure = false;
-  make_host_mr(opts, &did_configure);
-  return did_configure;
-}
-
-}  // namespace cudf::io
+}  // namespace cudf::io::detail
diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp
index 0883ac3609f..1ae27a2f4ae 100644
--- a/cpp/src/io/utilities/hostdevice_vector.hpp
+++ b/cpp/src/io/utilities/hostdevice_vector.hpp
@@ -16,11 +16,10 @@
 
 #pragma once
 
-#include "config_utils.hpp"
 #include "hostdevice_span.hpp"
 
-#include <cudf/detail/utilities/rmm_host_vector.hpp>
-#include <cudf/io/memory_resource.hpp>
+#include <cudf/detail/utilities/host_vector.hpp>
+#include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/span.hpp>
@@ -53,7 +52,7 @@ class hostdevice_vector {
   }
 
   explicit hostdevice_vector(size_t initial_size, size_t max_size, rmm::cuda_stream_view stream)
-    : h_data({cudf::io::get_host_memory_resource(), stream}), d_data(max_size, stream)
+    : h_data{make_pinned_vector_async<T>(0, stream)}, d_data(max_size, stream)
   {
     CUDF_EXPECTS(initial_size <= max_size, "initial_size cannot be larger than max_size");
 
@@ -173,7 +172,7 @@ class hostdevice_vector {
   }
 
  private:
-  cudf::detail::rmm_host_vector<T> h_data;
+  cudf::detail::host_vector<T> h_data;
   rmm::device_uvector<T> d_data;
 };
 
diff --git a/cpp/src/utilities/pinned_memory.cpp b/cpp/src/utilities/pinned_memory.cpp
new file mode 100644
index 00000000000..5d2e3ac332a
--- /dev/null
+++ b/cpp/src/utilities/pinned_memory.cpp
@@ -0,0 +1,216 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/detail/utilities/logger.hpp>
+#include <cudf/detail/utilities/stream_pool.hpp>
+#include <cudf/utilities/error.hpp>
+#include <cudf/utilities/export.hpp>
+#include <cudf/utilities/pinned_memory.hpp>
+
+#include <rmm/cuda_device.hpp>
+#include <rmm/mr/device/pool_memory_resource.hpp>
+#include <rmm/mr/pinned_host_memory_resource.hpp>
+#include <rmm/resource_ref.hpp>
+
+namespace cudf {
+
+namespace {
+class fixed_pinned_pool_memory_resource {
+  using upstream_mr    = rmm::mr::pinned_host_memory_resource;
+  using host_pooled_mr = rmm::mr::pool_memory_resource<upstream_mr>;
+
+ private:
+  upstream_mr upstream_mr_{};
+  size_t pool_size_{0};
+  // Raw pointer to avoid a segfault when the pool is destroyed on exit
+  host_pooled_mr* pool_{nullptr};
+  void* pool_begin_{nullptr};
+  void* pool_end_{nullptr};
+  cuda::stream_ref stream_{cudf::detail::global_cuda_stream_pool().get_stream().value()};
+
+ public:
+  fixed_pinned_pool_memory_resource(size_t size)
+    : pool_size_{size}, pool_{new host_pooled_mr(upstream_mr_, size, size)}
+  {
+    if (pool_size_ == 0) { return; }
+
+    // Allocate full size from the pinned pool to figure out the beginning and end address
+    pool_begin_ = pool_->allocate_async(pool_size_, stream_);
+    pool_end_   = static_cast<void*>(static_cast<uint8_t*>(pool_begin_) + pool_size_);
+    pool_->deallocate_async(pool_begin_, pool_size_, stream_);
+  }
+
+  void* allocate_async(std::size_t bytes, std::size_t alignment, cuda::stream_ref stream)
+  {
+    if (bytes <= pool_size_) {
+      try {
+        return pool_->allocate_async(bytes, alignment, stream);
+      } catch (...) {
+        // If the pool is exhausted, fall back to the upstream memory resource
+      }
+    }
+
+    return upstream_mr_.allocate_async(bytes, alignment, stream);
+  }
+
+  void* allocate_async(std::size_t bytes, cuda::stream_ref stream)
+  {
+    return allocate_async(bytes, rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream);
+  }
+
+  void* allocate(std::size_t bytes, std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT)
+  {
+    auto const result = allocate_async(bytes, alignment, stream_);
+    stream_.wait();
+    return result;
+  }
+
+  void deallocate_async(void* ptr,
+                        std::size_t bytes,
+                        std::size_t alignment,
+                        cuda::stream_ref stream) noexcept
+  {
+    if (bytes <= pool_size_ && ptr >= pool_begin_ && ptr < pool_end_) {
+      pool_->deallocate_async(ptr, bytes, alignment, stream);
+    } else {
+      upstream_mr_.deallocate_async(ptr, bytes, alignment, stream);
+    }
+  }
+
+  void deallocate_async(void* ptr, std::size_t bytes, cuda::stream_ref stream) noexcept
+  {
+    return deallocate_async(ptr, bytes, rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream);
+  }
+
+  void deallocate(void* ptr,
+                  std::size_t bytes,
+                  std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) noexcept
+  {
+    deallocate_async(ptr, bytes, alignment, stream_);
+    stream_.wait();
+  }
+
+  bool operator==(fixed_pinned_pool_memory_resource const& other) const
+  {
+    return pool_ == other.pool_ and stream_ == other.stream_;
+  }
+
+  bool operator!=(fixed_pinned_pool_memory_resource const& other) const
+  {
+    return !operator==(other);
+  }
+
+  friend void get_property(fixed_pinned_pool_memory_resource const&,
+                           cuda::mr::device_accessible) noexcept
+  {
+  }
+
+  friend void get_property(fixed_pinned_pool_memory_resource const&,
+                           cuda::mr::host_accessible) noexcept
+  {
+  }
+};
+
+static_assert(cuda::mr::resource_with<fixed_pinned_pool_memory_resource,
+                                      cuda::mr::device_accessible,
+                                      cuda::mr::host_accessible>,
+              "Pinned pool mr must be accessible from both host and device");
+
+CUDF_EXPORT rmm::host_device_async_resource_ref& make_default_pinned_mr(
+  std::optional<size_t> config_size)
+{
+  static fixed_pinned_pool_memory_resource mr = [config_size]() {
+    auto const size = [&config_size]() -> size_t {
+      if (auto const env_val = getenv("LIBCUDF_PINNED_POOL_SIZE"); env_val != nullptr) {
+        return std::atol(env_val);
+      }
+
+      if (config_size.has_value()) { return *config_size; }
+
+      auto const total = rmm::available_device_memory().second;
+      // 0.5% of the total device memory, capped at 100MB
+      return std::min(total / 200, size_t{100} * 1024 * 1024);
+    }();
+
+    // rmm requires the pool size to be a multiple of 256 bytes
+    auto const aligned_size = rmm::align_up(size, rmm::RMM_DEFAULT_HOST_ALIGNMENT);
+    CUDF_LOG_INFO("Pinned pool size = {}", aligned_size);
+
+    // make the pool with max size equal to the initial size
+    return fixed_pinned_pool_memory_resource{aligned_size};
+  }();
+
+  static rmm::host_device_async_resource_ref mr_ref{mr};
+  return mr_ref;
+}
+
+CUDF_EXPORT std::mutex& host_mr_mutex()
+{
+  static std::mutex map_lock;
+  return map_lock;
+}
+
+// Must be called with the host_mr_mutex mutex held
+CUDF_EXPORT rmm::host_device_async_resource_ref& make_host_mr(
+  std::optional<pinned_mr_options> const& opts, bool* did_configure = nullptr)
+{
+  static rmm::host_device_async_resource_ref* mr_ref = nullptr;
+  bool configured                                    = false;
+  if (mr_ref == nullptr) {
+    configured = true;
+    mr_ref     = &make_default_pinned_mr(opts ? opts->pool_size : std::nullopt);
+  }
+
+  // If the user passed an out param to detect whether this call configured a resource
+  // set the result
+  if (did_configure != nullptr) { *did_configure = configured; }
+
+  return *mr_ref;
+}
+
+// Must be called with the host_mr_mutex mutex held
+CUDF_EXPORT rmm::host_device_async_resource_ref& host_mr()
+{
+  static rmm::host_device_async_resource_ref mr_ref = make_host_mr(std::nullopt);
+  return mr_ref;
+}
+
+}  // namespace
+
+rmm::host_device_async_resource_ref set_pinned_memory_resource(
+  rmm::host_device_async_resource_ref mr)
+{
+  std::scoped_lock lock{host_mr_mutex()};
+  auto last_mr = host_mr();
+  host_mr()    = mr;
+  return last_mr;
+}
+
+rmm::host_device_async_resource_ref get_pinned_memory_resource()
+{
+  std::scoped_lock lock{host_mr_mutex()};
+  return host_mr();
+}
+
+bool config_default_pinned_memory_resource(pinned_mr_options const& opts)
+{
+  std::scoped_lock lock{host_mr_mutex()};
+  auto did_configure = false;
+  make_host_mr(opts, &did_configure);
+  return did_configure;
+}
+
+}  // namespace cudf
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 826f879ddc0..f6d762cc2ec 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -380,15 +380,16 @@ ConfigureTest(
 # * utilities tests -------------------------------------------------------------------------------
 ConfigureTest(
   UTILITIES_TEST
-  utilities_tests/type_list_tests.cpp
   utilities_tests/column_debug_tests.cpp
   utilities_tests/column_utilities_tests.cpp
   utilities_tests/column_wrapper_tests.cpp
+  utilities_tests/default_stream_tests.cpp
   utilities_tests/io_utilities_tests.cpp
   utilities_tests/lists_column_wrapper_tests.cpp
   utilities_tests/logger_tests.cpp
-  utilities_tests/default_stream_tests.cpp
+  utilities_tests/pinned_memory_tests.cpp
   utilities_tests/type_check_tests.cpp
+  utilities_tests/type_list_tests.cpp
 )
 
 # ##################################################################################################
diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp
index 57aa2721756..4c01a1fb87b 100644
--- a/cpp/tests/io/json_test.cpp
+++ b/cpp/tests/io/json_test.cpp
@@ -28,13 +28,13 @@
 #include <cudf/detail/iterator.cuh>
 #include <cudf/io/arrow_io_source.hpp>
 #include <cudf/io/json.hpp>
-#include <cudf/io/memory_resource.hpp>
 #include <cudf/strings/convert/convert_fixed_point.hpp>
 #include <cudf/strings/repeat_strings.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/pinned_memory.hpp>
 
 #include <rmm/mr/pinned_host_memory_resource.hpp>
 
@@ -2068,7 +2068,7 @@ TEST_F(JsonReaderTest, JSONLinesRecoveringSync)
                     size_t{128} * 1024 * 1024};
 
   // Set new resource
-  auto last_mr = cudf::io::set_host_memory_resource(mr);
+  auto last_mr = cudf::set_pinned_memory_resource(mr);
 
   /**
    * @brief Spark has the specific need to ignore extra characters that come after the first record
@@ -2158,7 +2158,7 @@ TEST_F(JsonReaderTest, JSONLinesRecoveringSync)
     float64_wrapper{c_data.cbegin(), c_data.cend(), c_validity.cbegin()});
 
   // Restore original memory source
-  cudf::io::set_host_memory_resource(last_mr);
+  cudf::set_pinned_memory_resource(last_mr);
 }
 
 TEST_F(JsonReaderTest, MixedTypes)
diff --git a/cpp/tests/utilities_tests/io_utilities_tests.cpp b/cpp/tests/utilities_tests/io_utilities_tests.cpp
index e5a153bf781..9ed8f18f5cc 100644
--- a/cpp/tests/utilities_tests/io_utilities_tests.cpp
+++ b/cpp/tests/utilities_tests/io_utilities_tests.cpp
@@ -16,14 +16,6 @@
 
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/table_utilities.hpp>
-
-#include <cudf/io/memory_resource.hpp>
-#include <cudf/io/parquet.hpp>
-
-#include <rmm/mr/device/pool_memory_resource.hpp>
-#include <rmm/mr/pinned_host_memory_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <src/io/utilities/base64_utilities.hpp>
 
@@ -32,43 +24,6 @@ using cudf::io::detail::base64_encode;
 
 class IoUtilitiesTest : public cudf::test::BaseFixture {};
 
-TEST(IoUtilitiesTest, HostMemoryGetAndSet)
-{
-  // Global environment for temporary files
-  auto const temp_env = static_cast<cudf::test::TempDirTestEnvironment*>(
-    ::testing::AddGlobalTestEnvironment(new cudf::test::TempDirTestEnvironment));
-
-  // pinned/pooled host memory resource
-  using host_pooled_mr = rmm::mr::pool_memory_resource<rmm::mr::pinned_host_memory_resource>;
-  host_pooled_mr mr(std::make_shared<rmm::mr::pinned_host_memory_resource>().get(),
-                    size_t{128} * 1024 * 1024);
-
-  // set new resource
-  auto last_mr = cudf::io::get_host_memory_resource();
-  cudf::io::set_host_memory_resource(mr);
-
-  constexpr int num_rows = 32 * 1024;
-  auto valids =
-    cudf::detail::make_counting_transform_iterator(0, [&](int index) { return index % 2; });
-  auto values = thrust::make_counting_iterator(0);
-
-  cudf::test::fixed_width_column_wrapper<int> col(values, values + num_rows, valids);
-
-  cudf::table_view expected({col});
-  auto filepath = temp_env->get_temp_filepath("IoUtilsMemTest.parquet");
-  cudf::io::parquet_writer_options out_args =
-    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected);
-  cudf::io::write_parquet(out_args);
-
-  cudf::io::parquet_reader_options const read_opts =
-    cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath});
-  auto const result = cudf::io::read_parquet(read_opts);
-  CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, expected);
-
-  // reset memory resource back
-  cudf::io::set_host_memory_resource(last_mr);
-}
-
 TEST(IoUtilitiesTest, Base64EncodeAndDecode)
 {
   // a vector of lorem ipsum strings
diff --git a/cpp/tests/utilities_tests/pinned_memory_tests.cpp b/cpp/tests/utilities_tests/pinned_memory_tests.cpp
new file mode 100644
index 00000000000..df9103640f4
--- /dev/null
+++ b/cpp/tests/utilities_tests/pinned_memory_tests.cpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/table_utilities.hpp>
+
+#include <cudf/io/parquet.hpp>
+#include <cudf/utilities/pinned_memory.hpp>
+
+#include <rmm/mr/device/pool_memory_resource.hpp>
+#include <rmm/mr/pinned_host_memory_resource.hpp>
+#include <rmm/resource_ref.hpp>
+
+class PinnedMemoryTest : public cudf::test::BaseFixture {};
+
+TEST(PinnedMemoryTest, MemoryResourceGetAndSet)
+{
+  // Global environment for temporary files
+  auto const temp_env = static_cast<cudf::test::TempDirTestEnvironment*>(
+    ::testing::AddGlobalTestEnvironment(new cudf::test::TempDirTestEnvironment));
+
+  // pinned/pooled host memory resource
+  using host_pooled_mr = rmm::mr::pool_memory_resource<rmm::mr::pinned_host_memory_resource>;
+  host_pooled_mr mr(std::make_shared<rmm::mr::pinned_host_memory_resource>().get(),
+                    4 * 1024 * 1024);
+
+  // set new resource
+  auto last_mr = cudf::get_pinned_memory_resource();
+  cudf::set_pinned_memory_resource(mr);
+
+  constexpr int num_rows = 32 * 1024;
+  auto valids =
+    cudf::detail::make_counting_transform_iterator(0, [&](int index) { return index % 2; });
+  auto values = thrust::make_counting_iterator(0);
+
+  cudf::test::fixed_width_column_wrapper<int> col(values, values + num_rows, valids);
+
+  cudf::table_view expected({col});
+  auto filepath = temp_env->get_temp_filepath("MemoryResourceGetAndSetTest.parquet");
+  cudf::io::parquet_writer_options out_args =
+    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected);
+  cudf::io::write_parquet(out_args);
+
+  cudf::io::parquet_reader_options const read_opts =
+    cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath});
+  auto const result = cudf::io::read_parquet(read_opts);
+  CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, expected);
+
+  // reset memory resource back
+  cudf::set_pinned_memory_resource(last_mr);
+}
diff --git a/java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java b/java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java
index 83b801db7fb..df0d9dc7c3e 100644
--- a/java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java
+++ b/java/src/main/java/ai/rapids/cudf/PinnedMemoryPool.java
@@ -128,9 +128,9 @@ public static synchronized void initialize(long poolSize, int gpuId) {
    *
    * @param poolSize size of the pool to initialize.
    * @param gpuId    gpu id to set to get memory pool from, -1 means to use default
-   * @param setCuioHostMemoryResource true if this pinned pool should be used by cuIO for host memory
+   * @param setCudfPinnedPoolMemoryResource true if this pinned pool should be used by cuDF for pinned memory
    */
-  public static synchronized void initialize(long poolSize, int gpuId, boolean setCuioHostMemoryResource) {
+  public static synchronized void initialize(long poolSize, int gpuId, boolean setCudfPinnedPoolMemoryResource) {
     if (isInitialized()) {
       throw new IllegalStateException("Can only initialize the pool once.");
     }
@@ -139,7 +139,7 @@ public static synchronized void initialize(long poolSize, int gpuId, boolean set
       t.setDaemon(true);
       return t;
     });
-    initFuture = initService.submit(() -> new PinnedMemoryPool(poolSize, gpuId, setCuioHostMemoryResource));
+    initFuture = initService.submit(() -> new PinnedMemoryPool(poolSize, gpuId, setCudfPinnedPoolMemoryResource));
     initService.shutdown();
   }
 
@@ -216,15 +216,15 @@ public static long getTotalPoolSizeBytes() {
     return 0;
   }
 
-  private PinnedMemoryPool(long poolSize, int gpuId, boolean setCuioHostMemoryResource) {
+  private PinnedMemoryPool(long poolSize, int gpuId, boolean setCudfPinnedPoolMemoryResource) {
     if (gpuId > -1) {
       // set the gpu device to use
       Cuda.setDevice(gpuId);
       Cuda.freeZero();
     }
     this.poolHandle = Rmm.newPinnedPoolMemoryResource(poolSize, poolSize);
-    if (setCuioHostMemoryResource) {
-      Rmm.setCuioPinnedPoolMemoryResource(this.poolHandle);
+    if (setCudfPinnedPoolMemoryResource) {
+      Rmm.setCudfPinnedPoolMemoryResource(this.poolHandle);
     }
     this.poolSize = poolSize;
   }
diff --git a/java/src/main/java/ai/rapids/cudf/Rmm.java b/java/src/main/java/ai/rapids/cudf/Rmm.java
index 4dee1b7aa24..ed029c918e4 100755
--- a/java/src/main/java/ai/rapids/cudf/Rmm.java
+++ b/java/src/main/java/ai/rapids/cudf/Rmm.java
@@ -597,7 +597,7 @@ static native long newEventHandlerResourceAdaptor(long handle, long trackerHandl
 
   public static native long newPinnedPoolMemoryResource(long initSize, long maxSize);
 
-  public static native long setCuioPinnedPoolMemoryResource(long poolPtr);
+  public static native long setCudfPinnedPoolMemoryResource(long poolPtr);
 
   public static native void releasePinnedPoolMemoryResource(long poolPtr);
 
diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp
index fa78f6ca4e2..8bd0f7793b4 100644
--- a/java/src/main/native/src/RmmJni.cpp
+++ b/java/src/main/native/src/RmmJni.cpp
@@ -16,7 +16,7 @@
 
 #include "cudf_jni_apis.hpp"
 
-#include <cudf/io/memory_resource.hpp>
+#include <cudf/utilities/pinned_memory.hpp>
 
 #include <rmm/aligned.hpp>
 #include <rmm/mr/device/aligned_resource_adaptor.hpp>
@@ -395,15 +395,17 @@ class java_debug_event_handler_memory_resource final : public java_event_handler
   }
 };
 
-inline auto& prior_cuio_host_mr()
+inline auto& prior_cudf_pinned_mr()
 {
-  static rmm::host_async_resource_ref _prior_cuio_host_mr = cudf::io::get_host_memory_resource();
-  return _prior_cuio_host_mr;
+  static rmm::host_device_async_resource_ref _prior_cudf_pinned_mr =
+    cudf::get_pinned_memory_resource();
+  return _prior_cudf_pinned_mr;
 }
 
 /**
  * This is a pinned fallback memory resource that will try to allocate `pool`
- * and if that fails, attempt to allocate from the prior resource used by cuIO `prior_cuio_host_mr`.
+ * and if that fails, attempt to allocate from the prior resource used by cuDF
+ * `prior_cudf_pinned_mr`.
  *
  * We detect whether a pointer to free is inside of the pool by checking its address (see
  * constructor)
@@ -433,7 +435,7 @@ class pinned_fallback_host_memory_resource {
 
   /**
    * @brief Allocates pinned host memory of size at least \p bytes bytes from either the
-   *        _pool argument provided, or prior_cuio_host_mr.
+   *        _pool argument provided, or prior_cudf_pinned_mr.
    *
    * @throws rmm::bad_alloc if the requested allocation could not be fulfilled due to any other
    * reason.
@@ -450,7 +452,7 @@ class pinned_fallback_host_memory_resource {
       return _pool->allocate(bytes, alignment);
     } catch (const std::exception& unused) {
       // try to allocate using the underlying pinned resource
-      return prior_cuio_host_mr().allocate(bytes, alignment);
+      return prior_cudf_pinned_mr().allocate(bytes, alignment);
     }
     // we should not reached here
     return nullptr;
@@ -459,7 +461,7 @@ class pinned_fallback_host_memory_resource {
   /**
    * @brief Deallocate memory pointed to by \p ptr of size \p bytes bytes. We attempt
    *        to deallocate from _pool, if ptr is detected to be in the pool address range,
-   *        otherwise we deallocate from `prior_cuio_host_mr`.
+   *        otherwise we deallocate from `prior_cudf_pinned_mr`.
    *
    * @param ptr Pointer to be deallocated.
    * @param bytes Size of the allocation.
@@ -472,7 +474,7 @@ class pinned_fallback_host_memory_resource {
     if (ptr >= pool_begin_ && ptr <= pool_end_) {
       _pool->deallocate(ptr, bytes, alignment);
     } else {
-      prior_cuio_host_mr().deallocate(ptr, bytes, alignment);
+      prior_cudf_pinned_mr().deallocate(ptr, bytes, alignment);
     }
   }
 
@@ -1025,7 +1027,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_newPinnedPoolMemoryResource(JNIE
   CATCH_STD(env, 0)
 }
 
-JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_setCuioPinnedPoolMemoryResource(JNIEnv* env,
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_setCudfPinnedPoolMemoryResource(JNIEnv* env,
                                                                                jclass clazz,
                                                                                jlong pool_ptr)
 {
@@ -1035,7 +1037,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_setCuioPinnedPoolMemoryResource(J
     // create a pinned fallback pool that will allocate pinned memory
     // if the regular pinned pool is exhausted
     pinned_fallback_mr.reset(new pinned_fallback_host_memory_resource(pool));
-    prior_cuio_host_mr() = cudf::io::set_host_memory_resource(*pinned_fallback_mr);
+    prior_cudf_pinned_mr() = cudf::set_pinned_memory_resource(*pinned_fallback_mr);
   }
   CATCH_STD(env, )
 }
@@ -1047,8 +1049,8 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_releasePinnedPoolMemoryResource(J
   try {
     cudf::jni::auto_set_device(env);
     // set the cuio host memory resource to what it was before, or the same
-    // if we didn't overwrite it with setCuioPinnedPoolMemoryResource
-    cudf::io::set_host_memory_resource(prior_cuio_host_mr());
+    // if we didn't overwrite it with setCudfPinnedPoolMemoryResource
+    cudf::set_pinned_memory_resource(prior_cudf_pinned_mr());
     pinned_fallback_mr.reset();
     delete reinterpret_cast<rmm_pinned_pool_t*>(pool_ptr);
   }
@@ -1088,7 +1090,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_allocFromFallbackPinnedPool(JNIE
                                                                             jlong size)
 {
   cudf::jni::auto_set_device(env);
-  void* ret = cudf::io::get_host_memory_resource().allocate(size);
+  void* ret = cudf::get_pinned_memory_resource().allocate(size);
   return reinterpret_cast<jlong>(ret);
 }
 
@@ -1101,7 +1103,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_freeFromFallbackPinnedPool(JNIEnv
   try {
     cudf::jni::auto_set_device(env);
     void* cptr = reinterpret_cast<void*>(ptr);
-    cudf::io::get_host_memory_resource().deallocate(cptr, size);
+    cudf::get_pinned_memory_resource().deallocate(cptr, size);
   }
   CATCH_STD(env, )
 }
@@ -1112,7 +1114,7 @@ JNIEXPORT jboolean JNICALL Java_ai_rapids_cudf_Rmm_configureDefaultCudfPinnedPoo
 {
   try {
     cudf::jni::auto_set_device(env);
-    return cudf::io::config_default_host_memory_resource(cudf::io::host_mr_options{size});
+    return cudf::config_default_pinned_memory_resource(cudf::pinned_mr_options{size});
   }
   CATCH_STD(env, false)
 }

From 2b1029908af97b74304169631189dd57f382f072 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Wed, 12 Jun 2024 01:14:31 -0700
Subject: [PATCH 343/842] Apply clang-tidy autofixes (#15894)

This changeset is large, but it's not very substantial. It's all the automated fixes produced by clang-tidy using our script. The bulk of the changes are either adding `[[nodiscard]]` to many functions or changing const ref args to pass by value and then move in cases where the parameter is only used to set a value. There are also some places where clang-tidy preferred either more or less namespacing of objects depending on the current namespace. The goal is to enable clang-tidy in CI, which we made progress towards in #9860 but stalled in #10064. This PR contains the first set of changes that will required for such a check to pass.

I've marked this PR as breaking because some of the functions now marked as `[[nodiscard]]` are public APIs, so if consumers were ignoring the return values they will now see warnings, and if they are compiling with warnings as errors then the builds will break.

Contributes to #584

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/15894
---
 .pre-commit-config.yaml                       |   8 +
 cpp/include/cudf/ast/expressions.hpp          |   7 +-
 .../cudf/column/column_device_view.cuh        |  10 +-
 .../cudf/detail/aggregation/aggregation.hpp   |  27 +-
 cpp/include/cudf/detail/contiguous_split.hpp  |   2 +-
 .../cudf/detail/normalizing_iterator.cuh      |   8 +-
 cpp/include/cudf/detail/structs/utilities.hpp |  24 +-
 .../cudf/detail/utilities/host_vector.hpp     |   4 +-
 .../cudf/detail/utilities/stream_pool.hpp     |   2 +-
 cpp/include/cudf/fixed_point/fixed_point.hpp  |   6 +-
 cpp/include/cudf/interop.hpp                  |   4 +-
 cpp/include/cudf/interop/detail/arrow.hpp     |   7 +-
 cpp/include/cudf/io/arrow_io_source.hpp       |   8 +-
 cpp/include/cudf/io/csv.hpp                   |  22 +-
 cpp/include/cudf/io/detail/parquet.hpp        |   2 +-
 cpp/include/cudf/io/json.hpp                  |  42 +-
 cpp/include/cudf/io/orc.hpp                   |  26 +-
 cpp/include/cudf/io/parquet.hpp               |   6 +-
 cpp/include/cudf/io/types.hpp                 |   5 +-
 cpp/include/cudf/join.hpp                     |  33 +-
 cpp/include/cudf/scalar/scalar.hpp            |  19 +-
 .../cudf/strings/regex/regex_program.hpp      |  14 +-
 cpp/include/cudf/strings/string_view.cuh      |   8 +-
 cpp/include/cudf/table/table.hpp              |   2 +-
 cpp/include/cudf/table/table_view.hpp         |   4 +-
 cpp/include/cudf/utilities/error.hpp          |   8 +-
 cpp/include/cudf/utilities/span.hpp           |  24 +-
 cpp/include/cudf/utilities/thread_pool.hpp    |   6 +-
 cpp/include/cudf/wrappers/dictionary.hpp      |   2 +-
 cpp/include/cudf/wrappers/durations.hpp       |  16 +-
 cpp/include/cudf/wrappers/timestamps.hpp      |  16 +-
 cpp/include/cudf_test/base_fixture.hpp        |   2 +-
 cpp/include/cudf_test/column_wrapper.hpp      |  15 +-
 .../stream_checking_resource_adaptor.hpp      |   2 +-
 cpp/src/binaryop/binaryop.cpp                 |   2 +-
 cpp/src/binaryop/compiled/operation.cuh       |   8 +-
 cpp/src/binaryop/compiled/util.cpp            |   4 +-
 cpp/src/copying/pack.cpp                      |   2 +-
 cpp/src/datetime/timezone.cpp                 |   2 +-
 cpp/src/interop/arrow_utilities.cpp           |   2 +-
 cpp/src/interop/arrow_utilities.hpp           |   2 +-
 cpp/src/interop/detail/arrow_allocator.cpp    |   2 +-
 cpp/src/interop/from_arrow_host.cu            |   4 +-
 cpp/src/io/avro/avro.cpp                      |   6 +-
 cpp/src/io/comp/uncomp.cpp                    |   8 +-
 cpp/src/io/functions.cpp                      |   8 +-
 cpp/src/io/json/nested_json_gpu.cu            |   8 +-
 cpp/src/io/json/read_json.cu                  |   2 +-
 cpp/src/io/orc/orc.hpp                        |   2 +-
 cpp/src/io/orc/orc_field_writer.hpp           |   6 +-
 cpp/src/io/orc/reader_impl_chunking.cu        |   2 +-
 cpp/src/io/orc/reader_impl_decode.cu          |   2 +-
 .../io/parquet/compact_protocol_reader.cpp    |   2 +-
 .../io/parquet/compact_protocol_writer.hpp    |   4 +-
 cpp/src/io/parquet/ipc/Schema_generated.h     | 416 +++++++++---------
 cpp/src/io/parquet/page_string_decode.cu      |  10 +-
 cpp/src/io/parquet/page_string_utils.cuh      |   4 +-
 cpp/src/io/parquet/parquet.hpp                |  30 +-
 cpp/src/io/parquet/parquet_gpu.hpp            |  33 +-
 cpp/src/io/parquet/predicate_pushdown.cpp     |   4 +-
 cpp/src/io/parquet/reader_impl_chunking.cu    |   2 +-
 cpp/src/io/parquet/reader_impl_helpers.cpp    |  26 +-
 cpp/src/io/parquet/reader_impl_preprocess.cu  |   8 +-
 cpp/src/io/statistics/byte_array_view.cuh     |   6 +-
 cpp/src/io/utilities/arrow_io_source.cpp      |   6 +-
 cpp/src/io/utilities/column_buffer.cpp        |  20 +-
 cpp/src/io/utilities/column_buffer.hpp        |  21 +-
 cpp/src/io/utilities/data_casting.cu          |   4 +-
 cpp/src/io/utilities/data_sink.cpp            |   8 +-
 cpp/src/io/utilities/datasource.cpp           |   2 +-
 cpp/src/io/utilities/file_io_utilities.cpp    |   8 +-
 cpp/src/io/utilities/hostdevice_span.hpp      |   2 +-
 cpp/src/io/utilities/hostdevice_vector.hpp    |   2 +-
 cpp/src/io/utilities/output_builder.cuh       |   4 +-
 cpp/src/io/utilities/string_parsing.hpp       |   6 +-
 cpp/src/io/utilities/type_inference.cu        |   2 +-
 cpp/src/jit/cache.cpp                         |   4 +-
 cpp/src/jit/parser.cpp                        |  17 +-
 cpp/src/jit/parser.hpp                        |   8 +-
 cpp/src/reductions/reductions.cpp             |   6 +-
 .../detail/optimized_unbounded_window.cpp     |   2 +-
 cpp/src/strings/regex/regcomp.cpp             |  26 +-
 cpp/src/strings/regex/regex.cuh               |  45 +-
 cpp/src/strings/regex/regex.inl               |   7 +-
 cpp/src/strings/regex/regexec.cpp             |  14 +-
 cpp/src/transform/transform.cpp               |   2 +-
 cpp/src/utilities/stream_pool.cpp             |   4 +-
 .../binop-compiled-fixed_point-test.cpp       |   8 +-
 cpp/tests/bitmask/is_element_valid_tests.cpp  |   8 +-
 cpp/tests/column/column_view_shallow_test.cpp |   3 +-
 cpp/tests/copying/concatenate_tests.cpp       |  79 ++--
 cpp/tests/copying/copy_tests.cpp              |   5 +-
 cpp/tests/copying/gather_str_tests.cpp        |  27 +-
 cpp/tests/copying/gather_struct_tests.cpp     |   4 +-
 cpp/tests/copying/get_value_tests.cpp         |  12 +-
 cpp/tests/copying/pack_tests.cpp              |  86 ++--
 cpp/tests/copying/scatter_list_tests.cpp      |  11 +-
 cpp/tests/copying/scatter_struct_tests.cpp    |   9 +-
 cpp/tests/copying/scatter_tests.cpp           |  47 +-
 cpp/tests/copying/shift_tests.cpp             |  57 +--
 cpp/tests/copying/slice_tests.cpp             |  69 ++-
 cpp/tests/copying/split_tests.cpp             | 123 ++++--
 cpp/tests/dictionary/decode_test.cpp          |   5 +-
 cpp/tests/dictionary/encode_test.cpp          |   5 +-
 cpp/tests/dictionary/factories_test.cpp       |   6 +-
 cpp/tests/dictionary/fill_test.cpp            |  10 +-
 cpp/tests/dictionary/gather_test.cpp          |   5 +-
 cpp/tests/dictionary/remove_keys_test.cpp     |  14 +-
 cpp/tests/dictionary/scatter_test.cpp         |  19 +-
 cpp/tests/dictionary/search_test.cpp          |   6 +-
 cpp/tests/dictionary/set_keys_test.cpp        |  12 +-
 cpp/tests/dictionary/slice_test.cpp           |  15 +-
 cpp/tests/groupby/argmax_tests.cpp            |   5 +-
 cpp/tests/groupby/argmin_tests.cpp            |   7 +-
 cpp/tests/groupby/collect_set_tests.cpp       |   4 +-
 cpp/tests/groupby/correlation_tests.cpp       |   8 +-
 cpp/tests/groupby/count_scan_tests.cpp        |   4 +-
 cpp/tests/groupby/count_tests.cpp             |   7 +-
 cpp/tests/groupby/covariance_tests.cpp        |   8 +-
 cpp/tests/groupby/groupby_test_util.cpp       |   4 +-
 cpp/tests/groupby/groups_tests.cpp            |   5 +-
 cpp/tests/groupby/keys_tests.cpp              |   8 +-
 cpp/tests/groupby/m2_tests.cpp                |   4 +-
 cpp/tests/groupby/max_scan_tests.cpp          |   4 +-
 cpp/tests/groupby/max_tests.cpp               |  25 +-
 cpp/tests/groupby/mean_tests.cpp              |   7 +-
 cpp/tests/groupby/median_tests.cpp            |   7 +-
 cpp/tests/groupby/merge_lists_tests.cpp       |   4 +-
 cpp/tests/groupby/merge_m2_tests.cpp          |   6 +-
 cpp/tests/groupby/merge_sets_tests.cpp        |   4 +-
 cpp/tests/groupby/min_scan_tests.cpp          |   4 +-
 cpp/tests/groupby/min_tests.cpp               |  25 +-
 cpp/tests/groupby/nth_element_tests.cpp       |  40 +-
 cpp/tests/groupby/nunique_tests.cpp           |  19 +-
 cpp/tests/groupby/product_scan_tests.cpp      |   2 +-
 cpp/tests/groupby/product_tests.cpp           |   4 +-
 cpp/tests/groupby/quantile_tests.cpp          |   7 +-
 cpp/tests/groupby/rank_scan_tests.cpp         |  12 +-
 cpp/tests/groupby/replace_nulls_tests.cpp     |  10 +-
 cpp/tests/groupby/shift_tests.cpp             |  23 +-
 cpp/tests/groupby/std_tests.cpp               |  12 +-
 cpp/tests/groupby/sum_of_squares_tests.cpp    |   7 +-
 cpp/tests/groupby/sum_scan_tests.cpp          |   4 +-
 cpp/tests/groupby/sum_tests.cpp               |   5 +-
 cpp/tests/groupby/var_tests.cpp               |  12 +-
 cpp/tests/hashing/md5_test.cpp                |  32 +-
 cpp/tests/hashing/murmurhash3_x86_32_test.cpp | 106 ++++-
 cpp/tests/hashing/sha1_test.cpp               |   8 +-
 cpp/tests/hashing/sha224_test.cpp             |   8 +-
 cpp/tests/hashing/sha256_test.cpp             |   8 +-
 cpp/tests/hashing/sha384_test.cpp             |   8 +-
 cpp/tests/hashing/sha512_test.cpp             |   8 +-
 cpp/tests/interop/dlpack_test.cpp             |   2 +-
 cpp/tests/interop/from_arrow_device_test.cpp  |  14 +-
 cpp/tests/interop/from_arrow_host_test.cpp    |   6 +-
 cpp/tests/interop/from_arrow_test.cpp         |  43 +-
 cpp/tests/interop/nanoarrow_utils.hpp         |  14 +-
 cpp/tests/interop/to_arrow_device_test.cpp    |  26 +-
 cpp/tests/io/csv_test.cpp                     |   4 +-
 cpp/tests/io/json_chunked_reader.cpp          |   4 +-
 .../io/json_quote_normalization_test.cpp      |   2 +-
 cpp/tests/io/json_test.cpp                    |   4 +-
 cpp/tests/io/json_tree.cpp                    |   8 +-
 cpp/tests/io/orc_chunked_reader_test.cu       |   4 +-
 cpp/tests/io/orc_test.cpp                     |   8 +-
 cpp/tests/io/parquet_chunked_writer_test.cpp  |  36 +-
 cpp/tests/io/parquet_reader_test.cpp          |  54 ++-
 cpp/tests/io/parquet_v2_test.cpp              |  79 ++--
 cpp/tests/io/parquet_writer_test.cpp          |  20 +-
 cpp/tests/join/distinct_join_tests.cpp        |  76 ++--
 cpp/tests/join/join_tests.cpp                 | 342 +++++++-------
 cpp/tests/join/semi_anti_join_tests.cpp       |  43 +-
 cpp/tests/json/json_tests.cpp                 |   6 +-
 .../large_strings/large_strings_fixture.cpp   |   9 +-
 cpp/tests/lists/contains_tests.cpp            |   2 +-
 cpp/tests/lists/count_elements_tests.cpp      |  10 +-
 cpp/tests/lists/explode_tests.cpp             |  68 +--
 cpp/tests/lists/sort_lists_tests.cpp          |   8 +-
 cpp/tests/merge/merge_dictionary_test.cpp     |  18 +-
 cpp/tests/merge/merge_string_test.cpp         |  63 ++-
 .../partitioning/hash_partition_test.cpp      |   2 +-
 cpp/tests/partitioning/round_robin_test.cpp   |  73 +--
 .../quantiles/percentile_approx_test.cpp      |  11 +-
 cpp/tests/quantiles/quantile_test.cpp         |   2 +-
 cpp/tests/quantiles/quantiles_test.cpp        |  12 +-
 cpp/tests/reductions/collect_ops_tests.cpp    |  47 +-
 cpp/tests/reductions/list_rank_test.cpp       |  85 +++-
 cpp/tests/reductions/reduction_tests.cpp      | 131 +++---
 cpp/tests/reductions/scan_tests.cpp           |  15 +-
 .../reductions/segmented_reduction_tests.cpp  |  69 +--
 cpp/tests/reshape/byte_cast_tests.cpp         |  16 +-
 cpp/tests/rolling/collect_ops_test.cpp        |  30 +-
 cpp/tests/rolling/grouped_rolling_test.cpp    | 110 +++--
 .../rolling/range_rolling_window_test.cpp     |  24 +-
 cpp/tests/round/round_tests.cpp               |   5 +-
 cpp/tests/scalar/scalar_test.cpp              |   4 +-
 cpp/tests/search/search_dictionary_test.cpp   |  30 +-
 cpp/tests/sort/is_sorted_tests.cpp            |   8 +-
 cpp/tests/sort/rank_test.cpp                  |  91 ++--
 cpp/tests/sort/stable_sort_tests.cpp          |   8 +-
 .../distinct_count_tests.cpp                  |  37 +-
 .../stream_compaction/distinct_tests.cpp      |   4 +-
 .../stream_compaction/drop_nans_tests.cpp     |  38 +-
 .../stream_compaction/drop_nulls_tests.cpp    |  67 +--
 .../stable_distinct_tests.cpp                 |   4 +-
 cpp/tests/stream_compaction/unique_tests.cpp  |  72 +--
 cpp/tests/streams/interop_test.cpp            |   1 +
 cpp/tests/streams/io/orc_test.cpp             |   4 +-
 cpp/tests/streams/io/parquet_test.cpp         |   4 +-
 cpp/tests/streams/lists_test.cpp              |   5 +-
 cpp/tests/streams/reduction_test.cpp          |  16 +-
 cpp/tests/streams/replace_test.cpp            |   9 +-
 cpp/tests/streams/strings/filter_test.cpp     |   4 +-
 cpp/tests/strings/case_tests.cpp              |  50 ++-
 cpp/tests/strings/chars_types_tests.cpp       |  51 ++-
 .../strings/combine/concatenate_tests.cpp     |  11 +-
 .../strings/combine/join_strings_tests.cpp    |   6 +-
 cpp/tests/strings/contains_tests.cpp          |  16 +-
 cpp/tests/strings/datetime_tests.cpp          |   6 +-
 cpp/tests/strings/extract_tests.cpp           |  23 +-
 cpp/tests/strings/fill_tests.cpp              |   6 +-
 cpp/tests/strings/find_multiple_tests.cpp     |   2 +-
 cpp/tests/strings/find_tests.cpp              | 102 +++--
 cpp/tests/strings/findall_tests.cpp           |   6 +-
 cpp/tests/strings/fixed_point_tests.cpp       |   6 +-
 cpp/tests/strings/integers_tests.cpp          |  24 +-
 cpp/tests/strings/ipv4_tests.cpp              |   7 +-
 cpp/tests/strings/like_tests.cpp              |   7 +-
 cpp/tests/strings/pad_tests.cpp               |   5 +-
 cpp/tests/strings/replace_regex_tests.cpp     |   6 +-
 cpp/tests/strings/replace_tests.cpp           |  12 +-
 cpp/tests/strings/reverse_tests.cpp           |  18 +-
 cpp/tests/strings/slice_tests.cpp             |   8 +-
 cpp/tests/strings/split_tests.cpp             |  42 +-
 cpp/tests/strings/strip_tests.cpp             |   5 +-
 cpp/tests/strings/translate_tests.cpp         |   4 +-
 cpp/tests/structs/structs_column_tests.cpp    |   2 +-
 cpp/tests/structs/utilities_tests.cpp         |   4 +-
 cpp/tests/table/row_operators_tests.cpp       |   8 +-
 cpp/tests/text/bpe_tests.cpp                  |   2 +-
 cpp/tests/text/jaccard_tests.cpp              |  15 +-
 cpp/tests/text/normalize_tests.cpp            |   6 +-
 cpp/tests/text/replace_tests.cpp              |   2 +-
 cpp/tests/text/stemmer_tests.cpp              |   2 +-
 cpp/tests/text/subword_tests.cpp              |   2 +-
 cpp/tests/text/tokenize_tests.cpp             |   6 +-
 cpp/tests/transform/nans_to_null_test.cpp     |   4 +-
 cpp/tests/transform/one_hot_encode_tests.cpp  |   9 +-
 cpp/tests/unary/cast_tests.cpp                |  15 +-
 cpp/tests/unary/math_ops_test.cpp             |   3 +-
 cpp/tests/utilities/column_utilities.cu       |   2 +-
 cpp/tests/utilities/identify_stream_usage.cpp |   2 +-
 cpp/tests/utilities_tests/logger_tests.cpp    |   4 +-
 cpp/tests/utilities_tests/type_list_tests.cpp |  54 +--
 java/src/main/native/include/jni_utils.hpp    |  26 +-
 java/src/main/native/src/ColumnVectorJni.cpp  |  14 +-
 java/src/main/native/src/ColumnViewJni.cpp    |  44 +-
 java/src/main/native/src/RmmJni.cpp           |   8 +-
 java/src/main/native/src/ScalarJni.cpp        |   4 +-
 java/src/main/native/src/TableJni.cpp         |  28 +-
 .../main/native/src/jni_writer_data_sink.hpp  |   4 +-
 261 files changed, 2911 insertions(+), 2151 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 4cdcac88091..cc08b832e69 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -56,12 +56,20 @@ repos:
       - id: clang-format
         types_or: [c, c++, cuda]
         args: ["-fallback-style=none", "-style=file", "-i"]
+        exclude: |
+          (?x)^(
+            ^cpp/src/io/parquet/ipc/Schema_generated.h|
+            ^cpp/src/io/parquet/ipc/Message_generated.h|
+            ^cpp/include/cudf_test/cxxopts.hpp|
+          )
   - repo: https://github.com/sirosen/texthooks
     rev: 0.6.6
     hooks:
       - id: fix-smartquotes
         exclude: |
           (?x)^(
+            ^cpp/src/io/parquet/ipc/Schema_generated.h|
+            ^cpp/src/io/parquet/ipc/Message_generated.h|
             ^cpp/include/cudf_test/cxxopts.hpp|
             ^python/cudf/cudf/tests/data/subword_tokenizer_data/.*|
             ^python/cudf/cudf/tests/text/test_text_methods.py
diff --git a/cpp/include/cudf/ast/expressions.hpp b/cpp/include/cudf/ast/expressions.hpp
index 26916e49012..918271e3e4f 100644
--- a/cpp/include/cudf/ast/expressions.hpp
+++ b/cpp/include/cudf/ast/expressions.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -478,7 +478,10 @@ class operation : public expression {
    *
    * @return Vector of operands
    */
-  std::vector<std::reference_wrapper<expression const>> get_operands() const { return operands; }
+  [[nodiscard]] std::vector<std::reference_wrapper<expression const>> get_operands() const
+  {
+    return operands;
+  }
 
   /**
    * @copydoc expression::accept
diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh
index 19722d127cb..787e9c2c479 100644
--- a/cpp/include/cudf/column/column_device_view.cuh
+++ b/cpp/include/cudf/column/column_device_view.cuh
@@ -442,7 +442,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base {
    * @return string_view instance representing this element at this index
    */
   template <typename T, CUDF_ENABLE_IF(std::is_same_v<T, string_view>)>
-  __device__ T element(size_type element_index) const noexcept
+  __device__ [[nodiscard]] T element(size_type element_index) const noexcept
   {
     size_type index       = element_index + offset();  // account for this view's _offset
     char const* d_strings = static_cast<char const*>(_data);
@@ -501,7 +501,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base {
    * @return dictionary32 instance representing this element at this index
    */
   template <typename T, CUDF_ENABLE_IF(std::is_same_v<T, dictionary32>)>
-  __device__ T element(size_type element_index) const noexcept
+  __device__ [[nodiscard]] T element(size_type element_index) const noexcept
   {
     size_type index    = element_index + offset();  // account for this view's _offset
     auto const indices = d_children[0];
@@ -519,7 +519,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base {
    * @return numeric::fixed_point representing the element at this index
    */
   template <typename T, CUDF_ENABLE_IF(cudf::is_fixed_point<T>())>
-  __device__ T element(size_type element_index) const noexcept
+  __device__ [[nodiscard]] T element(size_type element_index) const noexcept
   {
     using namespace numeric;
     using rep        = typename T::rep;
@@ -858,7 +858,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base {
    */
   [[nodiscard]] __device__ device_span<column_device_view const> children() const noexcept
   {
-    return device_span<column_device_view const>(d_children, _num_children);
+    return {d_children, static_cast<std::size_t>(_num_children)};
   }
 
   /**
@@ -1032,7 +1032,7 @@ class alignas(16) mutable_column_device_view : public detail::column_device_view
    * @return Reference to the element at the specified index
    */
   template <typename T, CUDF_ENABLE_IF(is_rep_layout_compatible<T>())>
-  __device__ T& element(size_type element_index) const noexcept
+  __device__ [[nodiscard]] T& element(size_type element_index) const noexcept
   {
     return data<T>()[element_index];
   }
diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp
index 87c0f8ec7f1..edee83783b8 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.hpp
+++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp
@@ -24,6 +24,7 @@
 
 #include <functional>
 #include <numeric>
+#include <utility>
 
 namespace cudf {
 namespace detail {
@@ -510,7 +511,7 @@ class quantile_aggregation final : public groupby_aggregation, public reduce_agg
   void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); }
 
  private:
-  size_t hash_impl() const
+  [[nodiscard]] size_t hash_impl() const
   {
     return std::hash<int>{}(static_cast<int>(_interpolation)) ^
            std::accumulate(
@@ -596,7 +597,10 @@ class nunique_aggregation final : public groupby_aggregation,
   void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); }
 
  private:
-  size_t hash_impl() const { return std::hash<int>{}(static_cast<int>(_null_handling)); }
+  [[nodiscard]] size_t hash_impl() const
+  {
+    return std::hash<int>{}(static_cast<int>(_null_handling));
+  }
 };
 
 /**
@@ -638,7 +642,7 @@ class nth_element_aggregation final : public groupby_aggregation,
   void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); }
 
  private:
-  size_t hash_impl() const
+  [[nodiscard]] size_t hash_impl() const
   {
     return std::hash<size_type>{}(_n) ^ std::hash<int>{}(static_cast<int>(_null_handling));
   }
@@ -763,7 +767,10 @@ class collect_list_aggregation final : public rolling_aggregation,
   void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); }
 
  private:
-  size_t hash_impl() const { return std::hash<int>{}(static_cast<int>(_null_handling)); }
+  [[nodiscard]] size_t hash_impl() const
+  {
+    return std::hash<int>{}(static_cast<int>(_null_handling));
+  }
 };
 
 /**
@@ -813,7 +820,7 @@ class collect_set_aggregation final : public rolling_aggregation,
   void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); }
 
  protected:
-  size_t hash_impl() const
+  [[nodiscard]] size_t hash_impl() const
   {
     return std::hash<int>{}(static_cast<int>(_null_handling) ^ static_cast<int>(_nulls_equal) ^
                             static_cast<int>(_nans_equal));
@@ -866,10 +873,10 @@ class lead_lag_aggregation final : public rolling_aggregation {
 class udf_aggregation final : public rolling_aggregation {
  public:
   udf_aggregation(aggregation::Kind type,
-                  std::string const& user_defined_aggregator,
+                  std::string user_defined_aggregator,
                   data_type output_type)
     : aggregation{type},
-      _source{user_defined_aggregator},
+      _source{std::move(user_defined_aggregator)},
       _operator_name{(type == aggregation::PTX) ? "rolling_udf_ptx" : "rolling_udf_cuda"},
       _function_name{"rolling_udf"},
       _output_type{output_type}
@@ -973,7 +980,7 @@ class merge_sets_aggregation final : public groupby_aggregation, public reduce_a
   void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); }
 
  protected:
-  size_t hash_impl() const
+  [[nodiscard]] size_t hash_impl() const
   {
     return std::hash<int>{}(static_cast<int>(_nulls_equal) ^ static_cast<int>(_nans_equal));
   }
@@ -1046,7 +1053,7 @@ class covariance_aggregation final : public groupby_aggregation {
   void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); }
 
  protected:
-  size_t hash_impl() const
+  [[nodiscard]] size_t hash_impl() const
   {
     return std::hash<size_type>{}(_min_periods) ^ std::hash<size_type>{}(_ddof);
   }
@@ -1088,7 +1095,7 @@ class correlation_aggregation final : public groupby_aggregation {
   void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); }
 
  protected:
-  size_t hash_impl() const
+  [[nodiscard]] size_t hash_impl() const
   {
     return std::hash<int>{}(static_cast<int>(_type)) ^ std::hash<size_type>{}(_min_periods);
   }
diff --git a/cpp/include/cudf/detail/contiguous_split.hpp b/cpp/include/cudf/detail/contiguous_split.hpp
index de00b61cdca..1467ed1aa67 100644
--- a/cpp/include/cudf/detail/contiguous_split.hpp
+++ b/cpp/include/cudf/detail/contiguous_split.hpp
@@ -104,7 +104,7 @@ class metadata_builder {
    *
    * @returns A vector containing the serialized column metadata
    */
-  std::vector<uint8_t> build() const;
+  [[nodiscard]] std::vector<uint8_t> build() const;
 
   /**
    * @brief Clear the internal buffer containing all added metadata.
diff --git a/cpp/include/cudf/detail/normalizing_iterator.cuh b/cpp/include/cudf/detail/normalizing_iterator.cuh
index 32df13104e0..308fd188b09 100644
--- a/cpp/include/cudf/detail/normalizing_iterator.cuh
+++ b/cpp/include/cudf/detail/normalizing_iterator.cuh
@@ -51,7 +51,7 @@ struct alignas(16) base_normalator {
    */
   CUDF_HOST_DEVICE inline Derived& operator++()
   {
-    Derived& derived = static_cast<Derived&>(*this);
+    auto& derived = static_cast<Derived&>(*this);
     derived.p_ += width_;
     return derived;
   }
@@ -71,7 +71,7 @@ struct alignas(16) base_normalator {
    */
   CUDF_HOST_DEVICE inline Derived& operator--()
   {
-    Derived& derived = static_cast<Derived&>(*this);
+    auto& derived = static_cast<Derived&>(*this);
     derived.p_ -= width_;
     return derived;
   }
@@ -91,7 +91,7 @@ struct alignas(16) base_normalator {
    */
   CUDF_HOST_DEVICE inline Derived& operator+=(difference_type offset)
   {
-    Derived& derived = static_cast<Derived&>(*this);
+    auto& derived = static_cast<Derived&>(*this);
     derived.p_ += offset * width_;
     return derived;
   }
@@ -121,7 +121,7 @@ struct alignas(16) base_normalator {
    */
   CUDF_HOST_DEVICE inline Derived& operator-=(difference_type offset)
   {
-    Derived& derived = static_cast<Derived&>(*this);
+    auto& derived = static_cast<Derived&>(*this);
     derived.p_ -= offset * width_;
     return derived;
   }
diff --git a/cpp/include/cudf/detail/structs/utilities.hpp b/cpp/include/cudf/detail/structs/utilities.hpp
index e736514ac29..beedc009c84 100644
--- a/cpp/include/cudf/detail/structs/utilities.hpp
+++ b/cpp/include/cudf/detail/structs/utilities.hpp
@@ -25,6 +25,8 @@
 #include <rmm/device_buffer.hpp>
 #include <rmm/resource_ref.hpp>
 
+#include <utility>
+
 namespace cudf::structs::detail {
 
 enum class column_nullability {
@@ -112,12 +114,12 @@ class flattened_table {
    * @param columns_ Newly allocated columns to back the table_view
    * @param nullable_data_ Newly generated temporary data that needs to be kept alive
    */
-  flattened_table(table_view const& flattened_columns_,
+  flattened_table(table_view flattened_columns_,
                   std::vector<order> const& orders_,
                   std::vector<null_order> const& null_orders_,
                   std::vector<std::unique_ptr<column>>&& columns_,
                   temporary_nullable_data&& nullable_data_)
-    : _flattened_columns{flattened_columns_},
+    : _flattened_columns{std::move(flattened_columns_)},
       _orders{orders_},
       _null_orders{null_orders_},
       _columns{std::move(columns_)},
@@ -170,11 +172,11 @@ class flattened_table {
  *         orders, flattened null precedence, alongside the supporting columns and device_buffers
  *         for the flattened table.
  */
-[[nodiscard]] std::unique_ptr<flattened_table> flatten_nested_columns(
+[[nodiscard]] std::unique_ptr<cudf::structs::detail::flattened_table> flatten_nested_columns(
   table_view const& input,
-  std::vector<order> const& column_order,
-  std::vector<null_order> const& null_precedence,
-  column_nullability nullability,
+  std::vector<cudf::order> const& column_order,
+  std::vector<cudf::null_order> const& null_precedence,
+  cudf::structs::detail::column_nullability nullability,
   rmm::cuda_stream_view stream,
   rmm::device_async_resource_ref mr);
 
@@ -194,11 +196,11 @@ class flattened_table {
  * @param mr Device memory resource used to allocate new device memory
  * @return A new column with potentially new null mask
  */
-[[nodiscard]] std::unique_ptr<column> superimpose_nulls(bitmask_type const* null_mask,
-                                                        size_type null_count,
-                                                        std::unique_ptr<column>&& input,
-                                                        rmm::cuda_stream_view stream,
-                                                        rmm::device_async_resource_ref mr);
+[[nodiscard]] std::unique_ptr<cudf::column> superimpose_nulls(bitmask_type const* null_mask,
+                                                              cudf::size_type null_count,
+                                                              std::unique_ptr<cudf::column>&& input,
+                                                              rmm::cuda_stream_view stream,
+                                                              rmm::device_async_resource_ref mr);
 
 /**
  * @brief Push down nulls from the given input column into its children columns, using bitwise AND.
diff --git a/cpp/include/cudf/detail/utilities/host_vector.hpp b/cpp/include/cudf/detail/utilities/host_vector.hpp
index 6a115177ab5..2d14d0306cd 100644
--- a/cpp/include/cudf/detail/utilities/host_vector.hpp
+++ b/cpp/include/cudf/detail/utilities/host_vector.hpp
@@ -82,7 +82,7 @@ class rmm_host_allocator {
   using size_type       = std::size_t;     ///< The type used for the size of the allocation
   using difference_type = std::ptrdiff_t;  ///< The type of the distance between two pointers
 
-  typedef cuda::std::true_type propagate_on_container_move_assignment;
+  using propagate_on_container_move_assignment = cuda::std::true_type;
 
   /**
    * @brief converts a `rmm_host_allocator<T>` to `rmm_host_allocator<U>`
@@ -147,7 +147,7 @@ class rmm_host_allocator {
    *  @return The maximum number of objects that may be allocated
    *          by a single call to \p allocate().
    */
-  constexpr inline size_type max_size() const
+  [[nodiscard]] constexpr inline size_type max_size() const
   {
     return (std::numeric_limits<size_type>::max)() / sizeof(T);
   }
diff --git a/cpp/include/cudf/detail/utilities/stream_pool.hpp b/cpp/include/cudf/detail/utilities/stream_pool.hpp
index e19cc3ec2f7..64c1d4ae514 100644
--- a/cpp/include/cudf/detail/utilities/stream_pool.hpp
+++ b/cpp/include/cudf/detail/utilities/stream_pool.hpp
@@ -73,7 +73,7 @@ class cuda_stream_pool {
    *
    * @return the number of stream objects in the pool
    */
-  virtual std::size_t get_stream_pool_size() const = 0;
+  [[nodiscard]] virtual std::size_t get_stream_pool_size() const = 0;
 };
 
 /**
diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp
index e39d75757e8..6c3c3b4da07 100644
--- a/cpp/include/cudf/fixed_point/fixed_point.hpp
+++ b/cpp/include/cudf/fixed_point/fixed_point.hpp
@@ -291,14 +291,14 @@ class fixed_point {
    *
    * @return The underlying value of the `fixed_point` number
    */
-  CUDF_HOST_DEVICE inline rep value() const { return _value; }
+  CUDF_HOST_DEVICE [[nodiscard]] inline rep value() const { return _value; }
 
   /**
    * @brief Method that returns the scale of the `fixed_point` number
    *
    * @return The scale of the `fixed_point` number
    */
-  CUDF_HOST_DEVICE inline scale_type scale() const { return _scale; }
+  CUDF_HOST_DEVICE [[nodiscard]] inline scale_type scale() const { return _scale; }
 
   /**
    * @brief Explicit conversion operator to `bool`
@@ -573,7 +573,7 @@ class fixed_point {
    * @param scale The `scale` of the returned `fixed_point` number
    * @return `fixed_point` number with a new `scale`
    */
-  CUDF_HOST_DEVICE inline fixed_point<Rep, Rad> rescaled(scale_type scale) const
+  CUDF_HOST_DEVICE [[nodiscard]] inline fixed_point<Rep, Rad> rescaled(scale_type scale) const
   {
     if (scale == _scale) { return *this; }
     Rep const value = detail::shift<Rep, Rad>(_value, scale_type{scale - _scale});
diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp
index f3ff0009d5c..56ec62fa6e1 100644
--- a/cpp/include/cudf/interop.hpp
+++ b/cpp/include/cudf/interop.hpp
@@ -40,6 +40,8 @@
 
 #include <rmm/mr/device/per_device_resource.hpp>
 
+#include <utility>
+
 struct DLManagedTensor;
 
 struct ArrowDeviceArray;
@@ -121,7 +123,7 @@ struct column_metadata {
    *
    * @param _name Name of the column
    */
-  column_metadata(std::string const& _name) : name(_name) {}
+  column_metadata(std::string _name) : name(std::move(_name)) {}
   column_metadata() = default;
 };
 
diff --git a/cpp/include/cudf/interop/detail/arrow.hpp b/cpp/include/cudf/interop/detail/arrow.hpp
index 8043ecf5422..906d48f636b 100644
--- a/cpp/include/cudf/interop/detail/arrow.hpp
+++ b/cpp/include/cudf/interop/detail/arrow.hpp
@@ -24,8 +24,12 @@
 #define ARROW_C_DEVICE_DATA_INTERFACE
 
 // Device type for the allocated memory
-typedef int32_t ArrowDeviceType;
+using ArrowDeviceType = int32_t;
 
+// The Arrow spec specifies using macros rather than enums here to avoid being
+// susceptible to changes in the underlying type chosen by the compiler, but
+// clang-tidy doesn't like this.
+// NOLINTBEGIN
 // CPU device, same as using ArrowArray directly
 #define ARROW_DEVICE_CPU 1
 // CUDA GPU Device
@@ -34,6 +38,7 @@ typedef int32_t ArrowDeviceType;
 #define ARROW_DEVICE_CUDA_HOST 3
 // CUDA managed/unified memory allocated by cudaMallocManaged
 #define ARROW_DEVICE_CUDA_MANAGED 13
+// NOLINTEND
 
 struct ArrowDeviceArray {
   struct ArrowArray array;
diff --git a/cpp/include/cudf/io/arrow_io_source.hpp b/cpp/include/cudf/io/arrow_io_source.hpp
index 5f79f05c5a1..d7a48c34e12 100644
--- a/cpp/include/cudf/io/arrow_io_source.hpp
+++ b/cpp/include/cudf/io/arrow_io_source.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,6 +23,7 @@
 
 #include <memory>
 #include <string>
+#include <utility>
 
 namespace cudf::io {
 /**
@@ -49,7 +50,10 @@ class arrow_io_source : public datasource {
    *
    * @param file The `arrow` object from which the data is read
    */
-  explicit arrow_io_source(std::shared_ptr<arrow::io::RandomAccessFile> file) : arrow_file(file) {}
+  explicit arrow_io_source(std::shared_ptr<arrow::io::RandomAccessFile> file)
+    : arrow_file(std::move(file))
+  {
+  }
 
   /**
    * @brief Returns a buffer with a subset of data from the `arrow` source.
diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp
index a20f75cecd7..68bb7fba00e 100644
--- a/cpp/include/cudf/io/csv.hpp
+++ b/cpp/include/cudf/io/csv.hpp
@@ -27,6 +27,7 @@
 #include <memory>
 #include <string>
 #include <unordered_map>
+#include <utility>
 #include <variant>
 #include <vector>
 
@@ -431,7 +432,8 @@ class csv_reader_options {
    *
    * @return Per-column types
    */
-  std::variant<std::vector<data_type>, std::map<std::string, data_type>> const& get_dtypes() const
+  [[nodiscard]] std::variant<std::vector<data_type>, std::map<std::string, data_type>> const&
+  get_dtypes() const
   {
     return _dtypes;
   }
@@ -441,49 +443,49 @@ class csv_reader_options {
    *
    * @return Additional values to recognize as boolean true values
    */
-  std::vector<std::string> const& get_true_values() const { return _true_values; }
+  [[nodiscard]] std::vector<std::string> const& get_true_values() const { return _true_values; }
 
   /**
    * @brief Returns additional values to recognize as boolean false values.
    *
    * @return Additional values to recognize as boolean false values
    */
-  std::vector<std::string> const& get_false_values() const { return _false_values; }
+  [[nodiscard]] std::vector<std::string> const& get_false_values() const { return _false_values; }
 
   /**
    * @brief Returns additional values to recognize as null values.
    *
    * @return Additional values to recognize as null values
    */
-  std::vector<std::string> const& get_na_values() const { return _na_values; }
+  [[nodiscard]] std::vector<std::string> const& get_na_values() const { return _na_values; }
 
   /**
    * @brief Whether to keep the built-in default NA values.
    *
    * @return `true` if the built-in default NA values are kept
    */
-  bool is_enabled_keep_default_na() const { return _keep_default_na; }
+  [[nodiscard]] bool is_enabled_keep_default_na() const { return _keep_default_na; }
 
   /**
    * @brief Whether to disable null filter.
    *
    * @return `true` if null filter is enabled
    */
-  bool is_enabled_na_filter() const { return _na_filter; }
+  [[nodiscard]] bool is_enabled_na_filter() const { return _na_filter; }
 
   /**
    * @brief Whether to parse dates as DD/MM versus MM/DD.
    *
    * @return True if dates are parsed as DD/MM, false if MM/DD
    */
-  bool is_enabled_dayfirst() const { return _dayfirst; }
+  [[nodiscard]] bool is_enabled_dayfirst() const { return _dayfirst; }
 
   /**
    * @brief Returns timestamp_type to which all timestamp columns will be cast.
    *
    * @return timestamp_type to which all timestamp columns will be cast
    */
-  data_type get_timestamp_type() const { return _timestamp_type; }
+  [[nodiscard]] data_type get_timestamp_type() const { return _timestamp_type; }
 
   /**
    * @brief Sets compression format of the source.
@@ -1399,8 +1401,8 @@ class csv_writer_options {
    * @param sink The sink used for writer output
    * @param table Table to be written to output
    */
-  explicit csv_writer_options(sink_info const& sink, table_view const& table)
-    : _sink(sink), _table(table), _rows_per_chunk(table.num_rows())
+  explicit csv_writer_options(sink_info sink, table_view const& table)
+    : _sink(std::move(sink)), _table(table), _rows_per_chunk(table.num_rows())
   {
   }
 
diff --git a/cpp/include/cudf/io/detail/parquet.hpp b/cpp/include/cudf/io/detail/parquet.hpp
index 978216d971e..21c870cb75e 100644
--- a/cpp/include/cudf/io/detail/parquet.hpp
+++ b/cpp/include/cudf/io/detail/parquet.hpp
@@ -160,7 +160,7 @@ class chunked_reader : private reader {
    * destructor needs to be defined in a separate source file which can access to that object's
    * declaration.
    */
-  ~chunked_reader();
+  ~chunked_reader() override;
 
   /**
    * @copydoc cudf::io::chunked_parquet_reader::has_next
diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
index 65ba8f25577..8de690482f9 100644
--- a/cpp/include/cudf/io/json.hpp
+++ b/cpp/include/cudf/io/json.hpp
@@ -26,6 +26,7 @@
 
 #include <map>
 #include <string>
+#include <utility>
 #include <variant>
 #include <vector>
 
@@ -166,9 +167,9 @@ class json_reader_options {
    *
    * @returns Data types of the columns
    */
-  std::variant<std::vector<data_type>,
-               std::map<std::string, data_type>,
-               std::map<std::string, schema_element>> const&
+  [[nodiscard]] std::variant<std::vector<data_type>,
+                             std::map<std::string, data_type>,
+                             std::map<std::string, schema_element>> const&
   get_dtypes() const
   {
     return _dtypes;
@@ -179,28 +180,28 @@ class json_reader_options {
    *
    * @return Compression format of the source
    */
-  compression_type get_compression() const { return _compression; }
+  [[nodiscard]] compression_type get_compression() const { return _compression; }
 
   /**
    * @brief Returns number of bytes to skip from source start.
    *
    * @return Number of bytes to skip from source start
    */
-  size_t get_byte_range_offset() const { return _byte_range_offset; }
+  [[nodiscard]] size_t get_byte_range_offset() const { return _byte_range_offset; }
 
   /**
    * @brief Returns number of bytes to read.
    *
    * @return Number of bytes to read
    */
-  size_t get_byte_range_size() const { return _byte_range_size; }
+  [[nodiscard]] size_t get_byte_range_size() const { return _byte_range_size; }
 
   /**
    * @brief Returns number of bytes to read with padding.
    *
    * @return Number of bytes to read with padding
    */
-  size_t get_byte_range_size_with_padding() const
+  [[nodiscard]] size_t get_byte_range_size_with_padding() const
   {
     if (_byte_range_size == 0) {
       return 0;
@@ -214,7 +215,7 @@ class json_reader_options {
    *
    * @return Number of bytes to pad
    */
-  size_t get_byte_range_padding() const
+  [[nodiscard]] size_t get_byte_range_padding() const
   {
     auto const num_columns = std::visit([](auto const& dtypes) { return dtypes.size(); }, _dtypes);
 
@@ -236,67 +237,68 @@ class json_reader_options {
    *
    * @return Delimiter separating records in JSON lines
    */
-  char get_delimiter() const { return _delimiter; }
+  [[nodiscard]] char get_delimiter() const { return _delimiter; }
 
   /**
    * @brief Whether to read the file as a json object per line.
    *
    * @return `true` if reading the file as a json object per line
    */
-  bool is_enabled_lines() const { return _lines; }
+  [[nodiscard]] bool is_enabled_lines() const { return _lines; }
 
   /**
    * @brief Whether to parse mixed types as a string column.
    *
    * @return `true` if mixed types are parsed as a string column
    */
-  bool is_enabled_mixed_types_as_string() const { return _mixed_types_as_string; }
+  [[nodiscard]] bool is_enabled_mixed_types_as_string() const { return _mixed_types_as_string; }
 
   /**
    * @brief Whether to prune columns on read, selected based on the @ref set_dtypes option.
    *
    * When set as true, if the reader options include @ref set_dtypes, then
    * the reader will only return those columns which are mentioned in @ref set_dtypes.
-   * If false, then all columns are returned, independent of the @ref set_dtypes setting.
+   * If false, then all columns are returned, independent of the @ref set_dtypes
+   * setting.
    *
    * @return True if column pruning is enabled
    */
-  bool is_enabled_prune_columns() const { return _prune_columns; }
+  [[nodiscard]] bool is_enabled_prune_columns() const { return _prune_columns; }
 
   /**
    * @brief Whether to parse dates as DD/MM versus MM/DD.
    *
    * @returns true if dates are parsed as DD/MM, false if MM/DD
    */
-  bool is_enabled_dayfirst() const { return _dayfirst; }
+  [[nodiscard]] bool is_enabled_dayfirst() const { return _dayfirst; }
 
   /**
    * @brief Whether the reader should keep quotes of string values.
    *
    * @returns true if the reader should keep quotes, false otherwise
    */
-  bool is_enabled_keep_quotes() const { return _keep_quotes; }
+  [[nodiscard]] bool is_enabled_keep_quotes() const { return _keep_quotes; }
 
   /**
    * @brief Whether the reader should normalize single quotes around strings
    *
    * @returns true if the reader should normalize single quotes, false otherwise
    */
-  bool is_enabled_normalize_single_quotes() const { return _normalize_single_quotes; }
+  [[nodiscard]] bool is_enabled_normalize_single_quotes() const { return _normalize_single_quotes; }
 
   /**
    * @brief Whether the reader should normalize unquoted whitespace characters
    *
    * @returns true if the reader should normalize whitespace, false otherwise
    */
-  bool is_enabled_normalize_whitespace() const { return _normalize_whitespace; }
+  [[nodiscard]] bool is_enabled_normalize_whitespace() const { return _normalize_whitespace; }
 
   /**
    * @brief Queries the JSON reader's behavior on invalid JSON lines.
    *
    * @returns An enum that specifies the JSON reader's behavior on invalid JSON lines.
    */
-  json_recovery_mode_t recovery_mode() const { return _recovery_mode; }
+  [[nodiscard]] json_recovery_mode_t recovery_mode() const { return _recovery_mode; }
 
   /**
    * @brief Set data types for columns to be read.
@@ -717,8 +719,8 @@ class json_writer_options {
    * @param sink The sink used for writer output
    * @param table Table to be written to output
    */
-  explicit json_writer_options(sink_info const& sink, table_view const& table)
-    : _sink(sink), _table(table), _rows_per_chunk(table.num_rows())
+  explicit json_writer_options(sink_info sink, table_view table)
+    : _sink(std::move(sink)), _table(std::move(table)), _rows_per_chunk(table.num_rows())
   {
   }
 
diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp
index 8140f8897b7..623c1d9fc72 100644
--- a/cpp/include/cudf/io/orc.hpp
+++ b/cpp/include/cudf/io/orc.hpp
@@ -28,6 +28,7 @@
 #include <optional>
 #include <string>
 #include <unordered_map>
+#include <utility>
 #include <vector>
 
 namespace cudf {
@@ -125,7 +126,7 @@ class orc_reader_options {
    *
    * @return Number of rows to skip from the start
    */
-  int64_t get_skip_rows() const { return _skip_rows; }
+  [[nodiscard]] int64_t get_skip_rows() const { return _skip_rows; }
 
   /**
    * @brief Returns number of row to read.
@@ -133,35 +134,38 @@ class orc_reader_options {
    * @return Number of rows to read; `nullopt` if the option hasn't been set (in which case the file
    * is read until the end)
    */
-  std::optional<int64_t> const& get_num_rows() const { return _num_rows; }
+  [[nodiscard]] std::optional<int64_t> const& get_num_rows() const { return _num_rows; }
 
   /**
    * @brief Whether to use row index to speed-up reading.
    *
    * @return `true` if row index is used to speed-up reading
    */
-  bool is_enabled_use_index() const { return _use_index; }
+  [[nodiscard]] bool is_enabled_use_index() const { return _use_index; }
 
   /**
    * @brief Whether to use numpy-compatible dtypes.
    *
    * @return `true` if numpy-compatible dtypes are used
    */
-  bool is_enabled_use_np_dtypes() const { return _use_np_dtypes; }
+  [[nodiscard]] bool is_enabled_use_np_dtypes() const { return _use_np_dtypes; }
 
   /**
    * @brief Returns timestamp type to which timestamp column will be cast.
    *
    * @return Timestamp type to which timestamp column will be cast
    */
-  data_type get_timestamp_type() const { return _timestamp_type; }
+  [[nodiscard]] data_type get_timestamp_type() const { return _timestamp_type; }
 
   /**
    * @brief Returns fully qualified names of columns that should be read as 128-bit Decimal.
    *
    * @return Fully qualified names of columns that should be read as 128-bit Decimal
    */
-  std::vector<std::string> const& get_decimal128_columns() const { return _decimal128_columns; }
+  [[nodiscard]] std::vector<std::string> const& get_decimal128_columns() const
+  {
+    return _decimal128_columns;
+  }
 
   // Setters
 
@@ -603,8 +607,8 @@ class orc_writer_options {
    * @param sink The sink used for writer output
    * @param table Table to be written to output
    */
-  explicit orc_writer_options(sink_info const& sink, table_view const& table)
-    : _sink(sink), _table(table)
+  explicit orc_writer_options(sink_info sink, table_view table)
+    : _sink(std::move(sink)), _table(std::move(table))
   {
   }
 
@@ -676,7 +680,7 @@ class orc_writer_options {
    *
    * @return Row index stride
    */
-  auto get_row_index_stride() const
+  [[nodiscard]] auto get_row_index_stride() const
   {
     auto const unaligned_stride = std::min(_row_index_stride, get_stripe_size_rows());
     return unaligned_stride - unaligned_stride % 8;
@@ -1048,7 +1052,7 @@ class chunked_orc_writer_options {
    *
    * @param sink The sink used for writer output
    */
-  chunked_orc_writer_options(sink_info const& sink) : _sink(sink) {}
+  chunked_orc_writer_options(sink_info sink) : _sink(std::move(sink)) {}
 
  public:
   /**
@@ -1107,7 +1111,7 @@ class chunked_orc_writer_options {
    *
    * @return Row index stride
    */
-  auto get_row_index_stride() const
+  [[nodiscard]] auto get_row_index_stride() const
   {
     auto const unaligned_stride = std::min(_row_index_stride, get_stripe_size_rows());
     return unaligned_stride - unaligned_stride % 8;
diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp
index 51eeed5b721..431f14af522 100644
--- a/cpp/include/cudf/io/parquet.hpp
+++ b/cpp/include/cudf/io/parquet.hpp
@@ -187,7 +187,7 @@ class parquet_reader_options {
    *
    * @return Timestamp type used to cast timestamp columns
    */
-  data_type get_timestamp_type() const { return _timestamp_type; }
+  [[nodiscard]] data_type get_timestamp_type() const { return _timestamp_type; }
 
   /**
    * @brief Sets names of the columns to be read.
@@ -626,7 +626,7 @@ class parquet_writer_options_base {
    *
    * @param sink The sink used for writer output
    */
-  explicit parquet_writer_options_base(sink_info const& sink) : _sink(sink) {}
+  explicit parquet_writer_options_base(sink_info sink) : _sink(std::move(sink)) {}
 
  public:
   /**
@@ -1287,7 +1287,7 @@ class chunked_parquet_writer_options : public parquet_writer_options_base {
    *
    * @param sink Sink used for writer output
    */
-  explicit chunked_parquet_writer_options(sink_info const& sink);
+  explicit chunked_parquet_writer_options(sink_info sink);
 
   friend chunked_parquet_writer_options_builder;
 
diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp
index 150e997f533..0dab1c606de 100644
--- a/cpp/include/cudf/io/types.hpp
+++ b/cpp/include/cudf/io/types.hpp
@@ -30,6 +30,7 @@
 #include <optional>
 #include <string>
 #include <unordered_map>
+#include <utility>
 #include <vector>
 
 namespace cudf {
@@ -247,10 +248,10 @@ struct column_name_info {
    * @param _is_nullable True if column is nullable
    * @param _is_binary True if column is binary data
    */
-  column_name_info(std::string const& _name,
+  column_name_info(std::string _name,
                    std::optional<bool> _is_nullable = std::nullopt,
                    std::optional<bool> _is_binary   = std::nullopt)
-    : name(_name), is_nullable(_is_nullable), is_binary(_is_binary)
+    : name(std::move(_name)), is_nullable(_is_nullable), is_binary(_is_binary)
   {
   }
 
diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp
index 825f758adbd..ba485bd6372 100644
--- a/cpp/include/cudf/join.hpp
+++ b/cpp/include/cudf/join.hpp
@@ -336,8 +336,8 @@ class hash_join {
    * the result of performing an inner join between two tables with `build` and `probe`
    * as the join keys .
    */
-  std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
-            std::unique_ptr<rmm::device_uvector<size_type>>>
+  [[nodiscard]] std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+                          std::unique_ptr<rmm::device_uvector<size_type>>>
   inner_join(cudf::table_view const& probe,
              std::optional<std::size_t> output_size = {},
              rmm::cuda_stream_view stream           = cudf::get_default_stream(),
@@ -359,10 +359,10 @@ class hash_join {
    *
    * @return A pair of columns [`left_indices`, `right_indices`] that can be used to construct
    * the result of performing a left join between two tables with `build` and `probe`
-   * as the join keys .
+   * as the join keys.
    */
-  std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
-            std::unique_ptr<rmm::device_uvector<size_type>>>
+  [[nodiscard]] std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+                          std::unique_ptr<rmm::device_uvector<size_type>>>
   left_join(cudf::table_view const& probe,
             std::optional<std::size_t> output_size = {},
             rmm::cuda_stream_view stream           = cudf::get_default_stream(),
@@ -386,8 +386,8 @@ class hash_join {
    * the result of performing a full join between two tables with `build` and `probe`
    * as the join keys .
    */
-  std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
-            std::unique_ptr<rmm::device_uvector<size_type>>>
+  [[nodiscard]] std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+                          std::unique_ptr<rmm::device_uvector<size_type>>>
   full_join(cudf::table_view const& probe,
             std::optional<std::size_t> output_size = {},
             rmm::cuda_stream_view stream           = cudf::get_default_stream(),
@@ -440,7 +440,7 @@ class hash_join {
    * @return The exact number of output when performing a full join between two tables with `build`
    * and `probe` as the join keys .
    */
-  std::size_t full_join_size(
+  [[nodiscard]] std::size_t full_join_size(
     cudf::table_view const& probe,
     rmm::cuda_stream_view stream      = cudf::get_default_stream(),
     rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) const;
@@ -492,12 +492,12 @@ class distinct_hash_join {
    * @param stream CUDA stream used for device memory operations and kernel launches
    * @param mr Device memory resource used to allocate the returned indices' device memory.
    *
-   * @return A pair of columns [`build_indices`, `probe_indices`] that can be used to construct
-   * the result of performing an inner join between two tables with `build` and `probe`
-   * as the join keys.
+   * @return A pair of columns [`build_indices`, `probe_indices`] that can be used to
+   * construct the result of performing an inner join between two tables
+   * with `build` and `probe` as the join keys.
    */
-  std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
-            std::unique_ptr<rmm::device_uvector<size_type>>>
+  [[nodiscard]] std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
+                          std::unique_ptr<rmm::device_uvector<size_type>>>
   inner_join(rmm::cuda_stream_view stream      = cudf::get_default_stream(),
              rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) const;
 
@@ -512,10 +512,11 @@ class distinct_hash_join {
    * @param stream CUDA stream used for device memory operations and kernel launches
    * @param mr Device memory resource used to allocate the returned table and columns' device
    * memory.
-   * @return A `build_indices` column that can be used to construct the result of performing a left
-   * join between two tables with `build` and `probe` as the join keys.
+   * @return A `build_indices` column that can be used to construct the result of
+   * performing a left join between two tables with `build` and `probe` as the join
+   * keys.
    */
-  std::unique_ptr<rmm::device_uvector<size_type>> left_join(
+  [[nodiscard]] std::unique_ptr<rmm::device_uvector<size_type>> left_join(
     rmm::cuda_stream_view stream      = cudf::get_default_stream(),
     rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) const;
 
diff --git a/cpp/include/cudf/scalar/scalar.hpp b/cpp/include/cudf/scalar/scalar.hpp
index da1d0d743a7..d78907b473a 100644
--- a/cpp/include/cudf/scalar/scalar.hpp
+++ b/cpp/include/cudf/scalar/scalar.hpp
@@ -187,7 +187,7 @@ class fixed_width_scalar : public scalar {
    * @param stream CUDA stream used for device memory operations.
    * @return Value of the scalar
    */
-  T value(rmm::cuda_stream_view stream = cudf::get_default_stream()) const;
+  [[nodiscard]] T value(rmm::cuda_stream_view stream = cudf::get_default_stream()) const;
 
   /**
    * @brief Returns a raw pointer to the value in device memory.
@@ -199,7 +199,7 @@ class fixed_width_scalar : public scalar {
    * @brief Returns a const raw pointer to the value in device memory.
    * @return A const raw pointer to the value in device memory
    */
-  T const* data() const;
+  [[nodiscard]] T const* data() const;
 
  protected:
   rmm::device_scalar<T> _data;  ///< device memory containing the value
@@ -245,8 +245,8 @@ class numeric_scalar : public detail::fixed_width_scalar<T> {
   static_assert(is_numeric<T>(), "Unexpected non-numeric type.");
 
  public:
-  numeric_scalar()  = delete;
-  ~numeric_scalar() = default;
+  numeric_scalar()           = delete;
+  ~numeric_scalar() override = default;
 
   /**
    * @brief Move constructor for numeric_scalar.
@@ -393,7 +393,7 @@ class fixed_point_scalar : public scalar {
    * @param stream CUDA stream used for device memory operations.
    * @return The value of the scalar
    */
-  rep_type value(rmm::cuda_stream_view stream = cudf::get_default_stream()) const;
+  [[nodiscard]] rep_type value(rmm::cuda_stream_view stream = cudf::get_default_stream()) const;
 
   /**
    * @brief Get the decimal32, decimal64 or decimal128.
@@ -401,7 +401,8 @@ class fixed_point_scalar : public scalar {
    * @param stream CUDA stream used for device memory operations.
    * @return The decimal32, decimal64 or decimal128 value
    */
-  T fixed_point_value(rmm::cuda_stream_view stream = cudf::get_default_stream()) const;
+  [[nodiscard]] T fixed_point_value(
+    rmm::cuda_stream_view stream = cudf::get_default_stream()) const;
 
   /**
    * @brief Explicit conversion operator to get the value of the scalar on the host.
@@ -418,7 +419,7 @@ class fixed_point_scalar : public scalar {
    * @brief Returns a const raw pointer to the value in device memory.
    * @return a const raw pointer to the value in device memory
    */
-  rep_type const* data() const;
+  [[nodiscard]] rep_type const* data() const;
 
  protected:
   rmm::device_scalar<rep_type> _data;  ///< device memory containing the value
@@ -565,8 +566,8 @@ class chrono_scalar : public detail::fixed_width_scalar<T> {
   static_assert(is_chrono<T>(), "Unexpected non-chrono type");
 
  public:
-  chrono_scalar()  = delete;
-  ~chrono_scalar() = default;
+  chrono_scalar()           = delete;
+  ~chrono_scalar() override = default;
 
   /**
    * @brief Move constructor for chrono_scalar.
diff --git a/cpp/include/cudf/strings/regex/regex_program.hpp b/cpp/include/cudf/strings/regex/regex_program.hpp
index bdf541f455f..95c86ae0f8a 100644
--- a/cpp/include/cudf/strings/regex/regex_program.hpp
+++ b/cpp/include/cudf/strings/regex/regex_program.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -74,35 +74,35 @@ struct regex_program {
    *
    * @return regex pattern as a string
    */
-  std::string pattern() const;
+  [[nodiscard]] std::string pattern() const;
 
   /**
    * @brief Return the regex_flags used to create this instance
    *
    * @return regex flags setting
    */
-  regex_flags flags() const;
+  [[nodiscard]] regex_flags flags() const;
 
   /**
    * @brief Return the capture_groups used to create this instance
    *
    * @return capture groups setting
    */
-  capture_groups capture() const;
+  [[nodiscard]] capture_groups capture() const;
 
   /**
    * @brief Return the number of instructions in this instance
    *
    * @return Number of instructions
    */
-  int32_t instructions_count() const;
+  [[nodiscard]] int32_t instructions_count() const;
 
   /**
    * @brief Return the number of capture groups in this instance
    *
    * @return Number of groups
    */
-  int32_t groups_count() const;
+  [[nodiscard]] int32_t groups_count() const;
 
   /**
    * @brief Return the size of the working memory for the regex execution
@@ -110,7 +110,7 @@ struct regex_program {
    * @param num_strings Number of strings for computation
    * @return Size of the working memory in bytes
    */
-  std::size_t compute_working_memory_size(int32_t num_strings) const;
+  [[nodiscard]] std::size_t compute_working_memory_size(int32_t num_strings) const;
 
   ~regex_program();
 
diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh
index 74df1ea1887..93cc787683b 100644
--- a/cpp/include/cudf/strings/string_view.cuh
+++ b/cpp/include/cudf/strings/string_view.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -110,7 +110,7 @@ static __constant__ char max_string_sentinel[5]{"\xF7\xBF\xBF\xBF"};
  *
  * @return An empty string
  */
-CUDF_HOST_DEVICE inline string_view string_view::min() { return string_view(); }
+CUDF_HOST_DEVICE inline string_view string_view::min() { return {}; }
 
 /**
  * @brief Return maximum value associated with the string type
@@ -130,7 +130,7 @@ CUDF_HOST_DEVICE inline string_view string_view::max()
   CUDF_CUDA_TRY(
     cudaGetSymbolAddress((void**)&psentinel, cudf::strings::detail::max_string_sentinel));
 #endif
-  return string_view(psentinel, 4);
+  return {psentinel, 4};
 }
 
 __device__ inline size_type string_view::length() const
@@ -439,7 +439,7 @@ __device__ inline string_view string_view::substr(size_type pos, size_type count
   auto const itr  = begin() + pos;
   auto const spos = itr.byte_offset();
   auto const epos = count >= 0 ? (itr + count).byte_offset() : size_bytes();
-  return string_view(data() + spos, epos - spos);
+  return {data() + spos, epos - spos};
 }
 
 __device__ inline size_type string_view::character_offset(size_type bytepos) const
diff --git a/cpp/include/cudf/table/table.hpp b/cpp/include/cudf/table/table.hpp
index 8efe6eb8c72..c4f14af53fb 100644
--- a/cpp/include/cudf/table/table.hpp
+++ b/cpp/include/cudf/table/table.hpp
@@ -144,7 +144,7 @@ class table {
    */
 
   template <typename InputIterator>
-  table_view select(InputIterator begin, InputIterator end) const
+  [[nodiscard]] table_view select(InputIterator begin, InputIterator end) const
   {
     std::vector<column_view> columns(std::distance(begin, end));
     std::transform(
diff --git a/cpp/include/cudf/table/table_view.hpp b/cpp/include/cudf/table/table_view.hpp
index ad12b1eef4e..a71e0558dec 100644
--- a/cpp/include/cudf/table/table_view.hpp
+++ b/cpp/include/cudf/table/table_view.hpp
@@ -123,7 +123,7 @@ class table_view_base {
    * @param column_index The index of the desired column
    * @return A reference to the desired column
    */
-  ColumnView const& column(size_type column_index) const;
+  [[nodiscard]] ColumnView const& column(size_type column_index) const;
 
   /**
    * @brief Returns the number of columns
@@ -224,7 +224,7 @@ class table_view : public detail::table_view_base<column_view> {
    * specified by the elements of `column_indices`
    */
   template <typename InputIterator>
-  table_view select(InputIterator begin, InputIterator end) const
+  [[nodiscard]] table_view select(InputIterator begin, InputIterator end) const
   {
     std::vector<column_view> columns(std::distance(begin, end));
     std::transform(begin, end, columns.begin(), [this](auto index) { return this->column(index); });
diff --git a/cpp/include/cudf/utilities/error.hpp b/cpp/include/cudf/utilities/error.hpp
index 719d44a9ab3..f019f516b84 100644
--- a/cpp/include/cudf/utilities/error.hpp
+++ b/cpp/include/cudf/utilities/error.hpp
@@ -48,7 +48,7 @@ struct stacktrace_recorder {
    *
    * @return The pointer to a null-terminated string storing the output stacktrace
    */
-  char const* stacktrace() const { return _stacktrace.c_str(); }
+  [[nodiscard]] char const* stacktrace() const { return _stacktrace.c_str(); }
 
  protected:
   std::string const _stacktrace;  //!< The whole stacktrace stored as one string.
@@ -78,7 +78,7 @@ struct logic_error : public std::logic_error, public stacktrace_recorder {
   // TODO Add an error code member? This would be useful for translating an
   // exception to an error code in a pure-C API
 
-  ~logic_error()
+  ~logic_error() override
   {
     // Needed so that the first instance of the implicit destructor for any TU isn't 'constructed'
     // from a host+device function marking the implicit version also as host+device
@@ -106,7 +106,7 @@ struct cuda_error : public std::runtime_error, public stacktrace_recorder {
    *
    * @return CUDA error code
    */
-  cudaError_t error_code() const { return _cudaError; }
+  [[nodiscard]] cudaError_t error_code() const { return _cudaError; }
 
  protected:
   cudaError_t _cudaError;  //!< CUDA error code
@@ -237,7 +237,7 @@ inline void throw_cuda_error(cudaError_t error, char const* file, unsigned int l
   // Calls cudaGetLastError to clear the error status. It is nearly certain that a fatal error
   // occurred if it still returns the same error after a cleanup.
   cudaGetLastError();
-  auto const last = cudaFree(0);
+  auto const last = cudaFree(nullptr);
   auto const msg  = std::string{"CUDA error encountered at: " + std::string{file} + ":" +
                                std::to_string(line) + ": " + std::to_string(error) + " " +
                                cudaGetErrorName(error) + " " + cudaGetErrorString(error)};
diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp
index 47e92d61a9f..3b35e60e034 100644
--- a/cpp/include/cudf/utilities/span.hpp
+++ b/cpp/include/cudf/utilities/span.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -28,6 +28,7 @@
 #include <cstddef>
 #include <limits>
 #include <type_traits>
+#include <utility>
 
 namespace cudf {
 /**
@@ -90,7 +91,7 @@ class span_base {
    *
    * @return Reference to the first element in the span
    */
-  constexpr reference front() const { return _data[0]; }
+  [[nodiscard]] constexpr reference front() const { return _data[0]; }
   // not noexcept due to undefined behavior when size = 0
   /**
    * @brief Returns a reference to the last element in the span.
@@ -99,7 +100,7 @@ class span_base {
    *
    * @return Reference to the last element in the span
    */
-  constexpr reference back() const { return _data[_size - 1]; }
+  [[nodiscard]] constexpr reference back() const { return _data[_size - 1]; }
   // not noexcept due to undefined behavior when idx < 0 || idx >= size
   /**
    * @brief Returns a reference to the idx-th element of the sequence.
@@ -119,7 +120,7 @@ class span_base {
    *
    * @return An iterator to the first element of the span
    */
-  constexpr iterator begin() const noexcept { return _data; }
+  [[nodiscard]] constexpr iterator begin() const noexcept { return _data; }
   /**
    * @brief Returns an iterator to the element following the last element of the span.
    *
@@ -127,13 +128,13 @@ class span_base {
    *
    * @return An iterator to the element following the last element of the span
    */
-  constexpr iterator end() const noexcept { return _data + _size; }
+  [[nodiscard]] constexpr iterator end() const noexcept { return _data + _size; }
   /**
    * @brief Returns a pointer to the beginning of the sequence.
    *
    * @return A pointer to the first element of the span
    */
-  constexpr pointer data() const noexcept { return _data; }
+  [[nodiscard]] constexpr pointer data() const noexcept { return _data; }
 
   /**
    * @brief Returns the number of elements in the span.
@@ -160,7 +161,10 @@ class span_base {
    * @param count Number of elements from the beginning of this span to put in the subspan.
    * @return A subspan of the first N elements of the sequence
    */
-  constexpr Derived first(size_type count) const noexcept { return Derived(_data, count); }
+  [[nodiscard]] constexpr Derived first(size_type count) const noexcept
+  {
+    return Derived(_data, count);
+  }
 
   /**
    * @brief Obtains a subspan consisting of the last N elements of the sequence
@@ -168,7 +172,7 @@ class span_base {
    * @param count Number of elements from the end of this span to put in the subspan
    * @return A subspan of the last N elements of the sequence
    */
-  constexpr Derived last(size_type count) const noexcept
+  [[nodiscard]] constexpr Derived last(size_type count) const noexcept
   {
     return Derived(_data + _size - count, count);
   }
@@ -180,7 +184,7 @@ class span_base {
    * @param count The number of elements in the subspan
    * @return A subspan of the sequence, of requested count and offset
    */
-  constexpr Derived subspan(size_type offset, size_type count) const noexcept
+  [[nodiscard]] constexpr Derived subspan(size_type offset, size_type count) const noexcept
   {
     return Derived(_data + offset, count);
   }
@@ -365,7 +369,7 @@ class base_2dspan {
    * @param data Pointer to the data
    * @param size Size of the 2D span as pair
    */
-  base_2dspan(T* data, size_type size) noexcept : _data{data}, _size{size} {}
+  base_2dspan(T* data, size_type size) noexcept : _data{data}, _size{std::move(size)} {}
 
   /**
    * @brief Returns a pointer to the beginning of the sequence.
diff --git a/cpp/include/cudf/utilities/thread_pool.hpp b/cpp/include/cudf/utilities/thread_pool.hpp
index 74a2531710b..c8c3eb097c4 100644
--- a/cpp/include/cudf/utilities/thread_pool.hpp
+++ b/cpp/include/cudf/utilities/thread_pool.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -201,8 +201,8 @@ class thread_pool {
     running = false;
     destroy_threads();
     thread_count = _thread_count ? _thread_count : std::thread::hardware_concurrency();
-    threads.reset(new std::thread[thread_count]);
-    paused = was_paused;
+    threads      = std::make_unique<std::thread[]>(thread_count);
+    paused       = was_paused;
     create_threads();
     running = true;
   }
diff --git a/cpp/include/cudf/wrappers/dictionary.hpp b/cpp/include/cudf/wrappers/dictionary.hpp
index 37264c5a33c..95f4ac00a53 100644
--- a/cpp/include/cudf/wrappers/dictionary.hpp
+++ b/cpp/include/cudf/wrappers/dictionary.hpp
@@ -87,7 +87,7 @@ struct dictionary_wrapper {
    *
    * @return The value of this dictionary wrapper
    */
-  CUDF_HOST_DEVICE inline value_type value() const { return _value; }
+  CUDF_HOST_DEVICE [[nodiscard]] inline value_type value() const { return _value; }
 
   /**
    * @brief Returns the maximum value of the value type.
diff --git a/cpp/include/cudf/wrappers/durations.hpp b/cpp/include/cudf/wrappers/durations.hpp
index 62aa22c2788..840dba4f4ba 100644
--- a/cpp/include/cudf/wrappers/durations.hpp
+++ b/cpp/include/cudf/wrappers/durations.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -56,13 +56,13 @@ using duration_us = cuda::std::chrono::duration<int64_t, cuda::std::chrono::micr
  */
 using duration_ns = cuda::std::chrono::duration<int64_t, cuda::std::chrono::nanoseconds::period>;
 
-static_assert(sizeof(duration_D) == sizeof(typename duration_D::rep), "");
-static_assert(sizeof(duration_h) == sizeof(typename duration_h::rep), "");
-static_assert(sizeof(duration_m) == sizeof(typename duration_m::rep), "");
-static_assert(sizeof(duration_s) == sizeof(typename duration_s::rep), "");
-static_assert(sizeof(duration_ms) == sizeof(typename duration_ms::rep), "");
-static_assert(sizeof(duration_us) == sizeof(typename duration_us::rep), "");
-static_assert(sizeof(duration_ns) == sizeof(typename duration_ns::rep), "");
+static_assert(sizeof(duration_D) == sizeof(typename duration_D::rep));
+static_assert(sizeof(duration_h) == sizeof(typename duration_h::rep));
+static_assert(sizeof(duration_m) == sizeof(typename duration_m::rep));
+static_assert(sizeof(duration_s) == sizeof(typename duration_s::rep));
+static_assert(sizeof(duration_ms) == sizeof(typename duration_ms::rep));
+static_assert(sizeof(duration_us) == sizeof(typename duration_us::rep));
+static_assert(sizeof(duration_ns) == sizeof(typename duration_ns::rep));
 
 /** @} */  // end of group
 }  // namespace cudf
diff --git a/cpp/include/cudf/wrappers/timestamps.hpp b/cpp/include/cudf/wrappers/timestamps.hpp
index 0341ac6ede4..5194a3e8f96 100644
--- a/cpp/include/cudf/wrappers/timestamps.hpp
+++ b/cpp/include/cudf/wrappers/timestamps.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -73,13 +73,13 @@ using timestamp_us = detail::timestamp<cudf::duration_us>;
  */
 using timestamp_ns = detail::timestamp<cudf::duration_ns>;
 
-static_assert(sizeof(timestamp_D) == sizeof(typename timestamp_D::rep), "");
-static_assert(sizeof(timestamp_h) == sizeof(typename timestamp_h::rep), "");
-static_assert(sizeof(timestamp_m) == sizeof(typename timestamp_m::rep), "");
-static_assert(sizeof(timestamp_s) == sizeof(typename timestamp_s::rep), "");
-static_assert(sizeof(timestamp_ms) == sizeof(typename timestamp_ms::rep), "");
-static_assert(sizeof(timestamp_us) == sizeof(typename timestamp_us::rep), "");
-static_assert(sizeof(timestamp_ns) == sizeof(typename timestamp_ns::rep), "");
+static_assert(sizeof(timestamp_D) == sizeof(typename timestamp_D::rep));
+static_assert(sizeof(timestamp_h) == sizeof(typename timestamp_h::rep));
+static_assert(sizeof(timestamp_m) == sizeof(typename timestamp_m::rep));
+static_assert(sizeof(timestamp_s) == sizeof(typename timestamp_s::rep));
+static_assert(sizeof(timestamp_ms) == sizeof(typename timestamp_ms::rep));
+static_assert(sizeof(timestamp_us) == sizeof(typename timestamp_us::rep));
+static_assert(sizeof(timestamp_ns) == sizeof(typename timestamp_ns::rep));
 
 /** @} */  // end of group
 }  // namespace cudf
diff --git a/cpp/include/cudf_test/base_fixture.hpp b/cpp/include/cudf_test/base_fixture.hpp
index 18f75bbc842..0e35ff64af4 100644
--- a/cpp/include/cudf_test/base_fixture.hpp
+++ b/cpp/include/cudf_test/base_fixture.hpp
@@ -66,7 +66,7 @@ class BaseFixtureWithParam : public ::testing::TestWithParam<T> {
    * all tests inheriting from this fixture
    * @return pointer to memory resource
    */
-  rmm::device_async_resource_ref mr() const { return _mr; }
+  [[nodiscard]] rmm::device_async_resource_ref mr() const { return _mr; }
 };
 
 /**
diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp
index dc873658abf..47d17988775 100644
--- a/cpp/include/cudf_test/column_wrapper.hpp
+++ b/cpp/include/cudf_test/column_wrapper.hpp
@@ -1121,14 +1121,20 @@ class dictionary_column_wrapper<std::string> : public detail::column_wrapper {
    *
    * @return column_view to keys column
    */
-  column_view keys() const { return cudf::dictionary_column_view{wrapped->view()}.keys(); }
+  [[nodiscard]] column_view keys() const
+  {
+    return cudf::dictionary_column_view{wrapped->view()}.keys();
+  }
 
   /**
    * @brief Access indices column view
    *
    * @return column_view to indices column
    */
-  column_view indices() const { return cudf::dictionary_column_view{wrapped->view()}.indices(); }
+  [[nodiscard]] column_view indices() const
+  {
+    return cudf::dictionary_column_view{wrapped->view()}.indices();
+  }
 
   /**
    * @brief Default constructor initializes an empty dictionary column of strings
@@ -1792,7 +1798,10 @@ class lists_column_wrapper : public detail::column_wrapper {
     return {std::move(cols), std::move(stubs)};
   }
 
-  column_view get_view() const { return root ? lists_column_view(*wrapped).child() : *wrapped; }
+  [[nodiscard]] column_view get_view() const
+  {
+    return root ? lists_column_view(*wrapped).child() : *wrapped;
+  }
 
   int depth = 0;
   bool root = false;
diff --git a/cpp/include/cudf_test/stream_checking_resource_adaptor.hpp b/cpp/include/cudf_test/stream_checking_resource_adaptor.hpp
index cafde6ca7d5..5a077e86a0f 100644
--- a/cpp/include/cudf_test/stream_checking_resource_adaptor.hpp
+++ b/cpp/include/cudf_test/stream_checking_resource_adaptor.hpp
@@ -110,7 +110,7 @@ class stream_checking_resource_adaptor final : public rmm::mr::device_memory_res
    * @param other The other resource to compare to
    * @return Whether or not the two resources are equivalent
    */
-  bool do_is_equal(device_memory_resource const& other) const noexcept override
+  [[nodiscard]] bool do_is_equal(device_memory_resource const& other) const noexcept override
   {
     if (this == &other) { return true; }
     auto cast = dynamic_cast<stream_checking_resource_adaptor<Upstream> const*>(&other);
diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp
index ac31f9045fe..8ac1491547d 100644
--- a/cpp/src/binaryop/binaryop.cpp
+++ b/cpp/src/binaryop/binaryop.cpp
@@ -153,7 +153,7 @@ void binary_operation(mutable_column_view& out,
 
   cudf::jit::get_program_cache(*binaryop_jit_kernel_cu_jit)
     .get_kernel(kernel_name, {}, {{"binaryop/jit/operation-udf.hpp", cuda_source}}, {"-arch=sm_."})
-    ->configure_1d_max_occupancy(0, 0, 0, stream.value())
+    ->configure_1d_max_occupancy(0, 0, nullptr, stream.value())
     ->launch(out.size(),
              cudf::jit::get_data_ptr(out),
              cudf::jit::get_data_ptr(lhs),
diff --git a/cpp/src/binaryop/compiled/operation.cuh b/cpp/src/binaryop/compiled/operation.cuh
index 43b4bd232c4..57113785a29 100644
--- a/cpp/src/binaryop/compiled/operation.cuh
+++ b/cpp/src/binaryop/compiled/operation.cuh
@@ -173,8 +173,8 @@ struct PMod {
   __device__ inline auto operator()(TypeLhs x, TypeRhs y)
   {
     using common_t = std::common_type_t<TypeLhs, TypeRhs>;
-    common_t xconv = static_cast<common_t>(x);
-    common_t yconv = static_cast<common_t>(y);
+    auto xconv     = static_cast<common_t>(x);
+    auto yconv     = static_cast<common_t>(y);
     auto rem       = xconv % yconv;
     if constexpr (std::is_signed_v<decltype(rem)>)
       if (rem < 0) rem = (rem + yconv) % yconv;
@@ -188,8 +188,8 @@ struct PMod {
   __device__ inline auto operator()(TypeLhs x, TypeRhs y)
   {
     using common_t = std::common_type_t<TypeLhs, TypeRhs>;
-    common_t xconv = static_cast<common_t>(x);
-    common_t yconv = static_cast<common_t>(y);
+    auto xconv     = static_cast<common_t>(x);
+    auto yconv     = static_cast<common_t>(y);
     auto rem       = std::fmod(xconv, yconv);
     if (rem < 0) rem = std::fmod(rem + yconv, yconv);
     return rem;
diff --git a/cpp/src/binaryop/compiled/util.cpp b/cpp/src/binaryop/compiled/util.cpp
index 02f4e480ecb..2b6a4f58895 100644
--- a/cpp/src/binaryop/compiled/util.cpp
+++ b/cpp/src/binaryop/compiled/util.cpp
@@ -123,7 +123,7 @@ struct is_supported_operation_functor {
   template <typename TypeLhs, typename TypeRhs>
   struct nested_support_functor {
     template <typename BinaryOperator>
-    inline constexpr bool call(data_type out_type) const
+    [[nodiscard]] inline constexpr bool call(data_type out_type) const
     {
       return is_binary_operation_supported<BinaryOperator>{}.template operator()<TypeLhs, TypeRhs>(
         out_type);
@@ -163,7 +163,7 @@ struct is_supported_operation_functor {
   };
 
   template <typename BinaryOperator, typename TypeLhs, typename TypeRhs>
-  inline constexpr bool bool_op(data_type out) const
+  [[nodiscard]] inline constexpr bool bool_op(data_type out) const
   {
     return out.id() == type_id::BOOL8 and
            is_binary_operation_supported<BinaryOperator>{}.template operator()<TypeLhs, TypeRhs>();
diff --git a/cpp/src/copying/pack.cpp b/cpp/src/copying/pack.cpp
index b0208a58896..819ad593c0a 100644
--- a/cpp/src/copying/pack.cpp
+++ b/cpp/src/copying/pack.cpp
@@ -181,7 +181,7 @@ class metadata_builder_impl {
       col_type, col_size, col_null_count, data_offset, null_mask_offset, num_children);
   }
 
-  std::vector<uint8_t> build() const
+  [[nodiscard]] std::vector<uint8_t> build() const
   {
     auto output = std::vector<uint8_t>(metadata.size() * sizeof(detail::serialized_column));
     std::memcpy(output.data(), metadata.data(), output.size());
diff --git a/cpp/src/datetime/timezone.cpp b/cpp/src/datetime/timezone.cpp
index a3471485293..1b0d201501b 100644
--- a/cpp/src/datetime/timezone.cpp
+++ b/cpp/src/datetime/timezone.cpp
@@ -221,7 +221,7 @@ class posix_parser {
   /**
    * @brief Returns the remaining number of characters in the input.
    */
-  auto remaining_char_cnt() const { return end - cur; }
+  [[nodiscard]] auto remaining_char_cnt() const { return end - cur; }
 
   /**
    * @brief Returns the next character in the input.
diff --git a/cpp/src/interop/arrow_utilities.cpp b/cpp/src/interop/arrow_utilities.cpp
index 05beecfbf9b..dd9e9600a87 100644
--- a/cpp/src/interop/arrow_utilities.cpp
+++ b/cpp/src/interop/arrow_utilities.cpp
@@ -23,7 +23,7 @@
 
 namespace cudf {
 namespace detail {
-data_type arrow_to_cudf_type(const ArrowSchemaView* arrow_view)
+data_type arrow_to_cudf_type(ArrowSchemaView const* arrow_view)
 {
   switch (arrow_view->type) {
     case NANOARROW_TYPE_NA: return data_type(type_id::EMPTY);
diff --git a/cpp/src/interop/arrow_utilities.hpp b/cpp/src/interop/arrow_utilities.hpp
index defddb4dc42..4e2628ab689 100644
--- a/cpp/src/interop/arrow_utilities.hpp
+++ b/cpp/src/interop/arrow_utilities.hpp
@@ -37,7 +37,7 @@ static constexpr int fixed_width_data_buffer_idx = 1;
  * @param arrow_view SchemaView to pull the logical and storage types from
  * @return Column type id
  */
-data_type arrow_to_cudf_type(const ArrowSchemaView* arrow_view);
+data_type arrow_to_cudf_type(ArrowSchemaView const* arrow_view);
 
 /**
  * @brief Map cudf column type id to ArrowType id
diff --git a/cpp/src/interop/detail/arrow_allocator.cpp b/cpp/src/interop/detail/arrow_allocator.cpp
index 3e6a337457a..2a19a5360fe 100644
--- a/cpp/src/interop/detail/arrow_allocator.cpp
+++ b/cpp/src/interop/detail/arrow_allocator.cpp
@@ -38,7 +38,7 @@ T enable_hugepage(T&& buf)
   }
 
 #ifdef MADV_HUGEPAGE
-  const auto pagesize = sysconf(_SC_PAGESIZE);
+  auto const pagesize = sysconf(_SC_PAGESIZE);
   void* addr          = const_cast<uint8_t*>(buf->data());
   if (addr == nullptr) { return std::move(buf); }
   auto length{static_cast<std::size_t>(buf->size())};
diff --git a/cpp/src/interop/from_arrow_host.cu b/cpp/src/interop/from_arrow_host.cu
index 36bb35d9419..854a1d68fdc 100644
--- a/cpp/src/interop/from_arrow_host.cu
+++ b/cpp/src/interop/from_arrow_host.cu
@@ -140,7 +140,7 @@ std::unique_ptr<column> dispatch_copy_from_arrow_host::operator()<bool>(ArrowSch
                                                                         bool skip_mask)
 {
   auto data_buffer         = input->buffers[fixed_width_data_buffer_idx];
-  const auto buffer_length = bitmask_allocation_size_bytes(input->length + input->offset);
+  auto const buffer_length = bitmask_allocation_size_bytes(input->length + input->offset);
 
   auto data = rmm::device_buffer(buffer_length, stream, mr);
   CUDF_CUDA_TRY(cudaMemcpyAsync(data.data(),
@@ -322,7 +322,7 @@ template <>
 std::unique_ptr<column> dispatch_copy_from_arrow_host::operator()<cudf::list_view>(
   ArrowSchemaView* schema, ArrowArray const* input, data_type type, bool skip_mask)
 {
-  const void* offset_buffers[2] = {nullptr, input->buffers[fixed_width_data_buffer_idx]};
+  void const* offset_buffers[2] = {nullptr, input->buffers[fixed_width_data_buffer_idx]};
   ArrowArray offsets_array      = {
          .length     = input->offset + input->length + 1,
          .null_count = 0,
diff --git a/cpp/src/io/avro/avro.cpp b/cpp/src/io/avro/avro.cpp
index 221cdf93042..2041f03cd81 100644
--- a/cpp/src/io/avro/avro.cpp
+++ b/cpp/src/io/avro/avro.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -485,8 +485,8 @@ std::string schema_parser::get_str()
   char const* cur   = start;
   while (cur < m_end && *cur++ != '"')
     ;
-  int32_t len = static_cast<int32_t>(cur - start - 1);
-  m_cur       = cur;
+  auto len = static_cast<int32_t>(cur - start - 1);
+  m_cur    = cur;
   return s.assign(start, std::max(len, 0));
 }
 
diff --git a/cpp/src/io/comp/uncomp.cpp b/cpp/src/io/comp/uncomp.cpp
index 3e5d966282d..ab516dd585d 100644
--- a/cpp/src/io/comp/uncomp.cpp
+++ b/cpp/src/io/comp/uncomp.cpp
@@ -305,7 +305,7 @@ std::vector<uint8_t> decompress(compression_type compression, host_span<uint8_t
       if (OpenZipArchive(&za, raw, src.size())) {
         size_t cdfh_ofs = 0;
         for (int i = 0; i < za.eocd->num_entries; i++) {
-          zip_cdfh_s const* cdfh = reinterpret_cast<zip_cdfh_s const*>(
+          auto const* cdfh = reinterpret_cast<zip_cdfh_s const*>(
             reinterpret_cast<uint8_t const*>(za.cdfh) + cdfh_ofs);
           int cdfh_len = sizeof(zip_cdfh_s) + cdfh->fname_len + cdfh->extra_len + cdfh->comment_len;
           if (cdfh_ofs + cdfh_len > za.eocd->cdir_size || cdfh->sig != 0x0201'4b50) {
@@ -314,8 +314,8 @@ std::vector<uint8_t> decompress(compression_type compression, host_span<uint8_t
           }
           // For now, only accept with non-zero file sizes and DEFLATE
           if (cdfh->comp_method == 8 && cdfh->comp_size > 0 && cdfh->uncomp_size > 0) {
-            size_t lfh_ofs       = cdfh->hdr_ofs;
-            zip_lfh_s const* lfh = reinterpret_cast<zip_lfh_s const*>(raw + lfh_ofs);
+            size_t lfh_ofs  = cdfh->hdr_ofs;
+            auto const* lfh = reinterpret_cast<zip_lfh_s const*>(raw + lfh_ofs);
             if (lfh_ofs + sizeof(zip_lfh_s) <= src.size() && lfh->sig == 0x0403'4b50 &&
                 lfh_ofs + sizeof(zip_lfh_s) + lfh->fname_len + lfh->extra_len <= src.size()) {
               if (lfh->comp_method == 8 && lfh->comp_size > 0 && lfh->uncomp_size > 0) {
@@ -340,7 +340,7 @@ std::vector<uint8_t> decompress(compression_type compression, host_span<uint8_t
       [[fallthrough]];
     case compression_type::BZIP2:
       if (src.size() > 4) {
-        bz2_file_header_s const* fhdr = reinterpret_cast<bz2_file_header_s const*>(raw);
+        auto const* fhdr = reinterpret_cast<bz2_file_header_s const*>(raw);
         // Check for BZIP2 file signature "BZh1" to "BZh9"
         if (fhdr->sig[0] == 'B' && fhdr->sig[1] == 'Z' && fhdr->sig[2] == 'h' &&
             fhdr->blksz >= '1' && fhdr->blksz <= '9') {
diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp
index 1ed8ee5ce06..5daa55d4552 100644
--- a/cpp/src/io/functions.cpp
+++ b/cpp/src/io/functions.cpp
@@ -306,14 +306,14 @@ raw_orc_statistics read_raw_orc_statistics(source_info const& src_info,
 
   // Get file-level statistics, statistics of each column of file
   for (auto const& stats : metadata.ff.statistics) {
-    result.file_stats.push_back(std::string(stats.cbegin(), stats.cend()));
+    result.file_stats.emplace_back(stats.cbegin(), stats.cend());
   }
 
   // Get stripe-level statistics
   for (auto const& stripes_stats : metadata.md.stripeStats) {
     result.stripes_stats.emplace_back();
     for (auto const& stats : stripes_stats.colStats) {
-      result.stripes_stats.back().push_back(std::string(stats.cbegin(), stats.cend()));
+      result.stripes_stats.back().emplace_back(stats.cbegin(), stats.cend());
     }
   }
 
@@ -1026,8 +1026,8 @@ parquet_writer_options_builder& parquet_writer_options_builder::column_chunks_fi
   return *this;
 }
 
-chunked_parquet_writer_options::chunked_parquet_writer_options(sink_info const& sink)
-  : parquet_writer_options_base(sink)
+chunked_parquet_writer_options::chunked_parquet_writer_options(sink_info sink)
+  : parquet_writer_options_base(std::move(sink))
 {
 }
 
diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu
index b243e4ba006..031edfde4f6 100644
--- a/cpp/src/io/json/nested_json_gpu.cu
+++ b/cpp/src/io/json/nested_json_gpu.cu
@@ -245,7 +245,7 @@ struct TransduceToken {
                                                 RelativeOffsetT const relative_offset,
                                                 SymbolT const read_symbol) const
   {
-    const bool is_end_of_invalid_line =
+    bool const is_end_of_invalid_line =
       (state_id == static_cast<StateT>(TT_INV) &&
        match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::DELIMITER));
 
@@ -265,15 +265,15 @@ struct TransduceToken {
     // Number of tokens emitted on invalid lines
     constexpr int32_t num_inv_tokens = 2;
 
-    const bool is_delimiter = match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::DELIMITER);
+    bool const is_delimiter = match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::DELIMITER);
 
     // If state is either invalid or we're entering an invalid state, we discard tokens
-    const bool is_part_of_invalid_line =
+    bool const is_part_of_invalid_line =
       (match_id != static_cast<SymbolGroupT>(dfa_symbol_group_id::ERROR) &&
        state_id == static_cast<StateT>(TT_VLD));
 
     // Indicates whether we transition from an invalid line to a potentially valid line
-    const bool is_end_of_invalid_line = (state_id == static_cast<StateT>(TT_INV) && is_delimiter);
+    bool const is_end_of_invalid_line = (state_id == static_cast<StateT>(TT_INV) && is_delimiter);
 
     int32_t const emit_count =
       is_end_of_invalid_line ? num_inv_tokens : (is_part_of_invalid_line && !is_delimiter ? 1 : 0);
diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu
index df5c7bc21e1..e999be8f83a 100644
--- a/cpp/src/io/json/read_json.cu
+++ b/cpp/src/io/json/read_json.cu
@@ -85,7 +85,7 @@ device_span<char> ingest_raw_input(device_span<char> buffer,
                                   sources.end(),
                                   prefsum_source_sizes.begin(),
                                   std::plus<int>{},
-                                  [](const std::unique_ptr<datasource>& s) { return s->size(); });
+                                  [](std::unique_ptr<datasource> const& s) { return s->size(); });
     auto upper =
       std::upper_bound(prefsum_source_sizes.begin(), prefsum_source_sizes.end(), range_offset);
     size_t start_source = std::distance(prefsum_source_sizes.begin(), upper);
diff --git a/cpp/src/io/orc/orc.hpp b/cpp/src/io/orc/orc.hpp
index fd55cbb6846..e1403acd455 100644
--- a/cpp/src/io/orc/orc.hpp
+++ b/cpp/src/io/orc/orc.hpp
@@ -511,7 +511,7 @@ class ProtobufWriter {
                            TypeKind kind,
                            ColStatsBlob const* stats);
 
-  std::size_t size() const { return m_buff.size(); }
+  [[nodiscard]] std::size_t size() const { return m_buff.size(); }
   uint8_t const* data() { return m_buff.data(); }
 
   std::vector<uint8_t>& buffer() { return m_buff; }
diff --git a/cpp/src/io/orc/orc_field_writer.hpp b/cpp/src/io/orc/orc_field_writer.hpp
index 4862562d526..731e9d7687e 100644
--- a/cpp/src/io/orc/orc_field_writer.hpp
+++ b/cpp/src/io/orc/orc_field_writer.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -31,10 +31,10 @@ namespace io {
 namespace orc {
 
 struct ProtobufWriter::ProtobufFieldWriter {
-  int struct_size;
+  int struct_size{0};
   ProtobufWriter* p;
 
-  ProtobufFieldWriter(ProtobufWriter* pbw) : struct_size(0), p(pbw) {}
+  ProtobufFieldWriter(ProtobufWriter* pbw) : p(pbw) {}
 
   /**
    * @brief Function to write a unsigned integer to the internal buffer
diff --git a/cpp/src/io/orc/reader_impl_chunking.cu b/cpp/src/io/orc/reader_impl_chunking.cu
index 43301826003..01ee5ad177d 100644
--- a/cpp/src/io/orc/reader_impl_chunking.cu
+++ b/cpp/src/io/orc/reader_impl_chunking.cu
@@ -537,7 +537,7 @@ void reader_impl::load_next_stripe_data(read_mode mode)
     _file_itm_data.selected_stripes.begin() + stripe_start,
     _file_itm_data.selected_stripes.begin() + stripe_start + stripe_count,
     std::size_t{0},
-    [](std::size_t count, const auto& stripe) { return count + stripe.stripe_info->numberOfRows; });
+    [](std::size_t count, auto const& stripe) { return count + stripe.stripe_info->numberOfRows; });
 
   // Decoding range needs to be reset to start from the first position in `decode_stripe_ranges`.
   _chunk_read_data.curr_decode_stripe_range = 0;
diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu
index da9fb802a0a..72eb41b1360 100644
--- a/cpp/src/io/orc/reader_impl_decode.cu
+++ b/cpp/src/io/orc/reader_impl_decode.cu
@@ -810,7 +810,7 @@ void reader_impl::decompress_and_decode_stripes(read_mode mode)
       cudf::detail::hostdevice_2dvector<gpu::ColumnDesc>(stripe_count, num_lvl_columns, _stream);
     memset(chunks.base_host_ptr(), 0, chunks.size_bytes());
 
-    const bool use_index =
+    bool const use_index =
       _options.use_index &&
       // Do stripes have row group index
       _metadata.is_row_grp_idx_present() &&
diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp
index c9212334a96..192833507b0 100644
--- a/cpp/src/io/parquet/compact_protocol_reader.cpp
+++ b/cpp/src/io/parquet/compact_protocol_reader.cpp
@@ -42,7 +42,7 @@ class parquet_field {
 
  public:
   virtual ~parquet_field() = default;
-  int field() const { return _field_val; }
+  [[nodiscard]] int field() const { return _field_val; }
 };
 
 std::string field_type_string(FieldType type)
diff --git a/cpp/src/io/parquet/compact_protocol_writer.hpp b/cpp/src/io/parquet/compact_protocol_writer.hpp
index c2e6178acbf..d4778b1ea15 100644
--- a/cpp/src/io/parquet/compact_protocol_writer.hpp
+++ b/cpp/src/io/parquet/compact_protocol_writer.hpp
@@ -64,11 +64,11 @@ class CompactProtocolWriter {
 class CompactProtocolFieldWriter {
   CompactProtocolWriter& writer;
   size_t struct_start_pos;
-  int current_field_value;
+  int current_field_value{0};
 
  public:
   CompactProtocolFieldWriter(CompactProtocolWriter& caller)
-    : writer(caller), struct_start_pos(writer.m_buf.size()), current_field_value(0)
+    : writer(caller), struct_start_pos(writer.m_buf.size())
   {
   }
 
diff --git a/cpp/src/io/parquet/ipc/Schema_generated.h b/cpp/src/io/parquet/ipc/Schema_generated.h
index 27141b4af31..c091204417a 100644
--- a/cpp/src/io/parquet/ipc/Schema_generated.h
+++ b/cpp/src/io/parquet/ipc/Schema_generated.h
@@ -139,13 +139,13 @@ inline const MetadataVersion (&EnumValuesMetadataVersion())[5]
   return values;
 }
 
-inline const char* const* EnumNamesMetadataVersion()
+inline char const* const* EnumNamesMetadataVersion()
 {
-  static const char* const names[6] = {"V1", "V2", "V3", "V4", "V5", nullptr};
+  static char const* const names[6] = {"V1", "V2", "V3", "V4", "V5", nullptr};
   return names;
 }
 
-inline const char* EnumNameMetadataVersion(MetadataVersion e)
+inline char const* EnumNameMetadataVersion(MetadataVersion e)
 {
   if (::flatbuffers::IsOutRange(e, MetadataVersion_V1, MetadataVersion_V5)) return "";
   const size_t index = static_cast<size_t>(e);
@@ -190,14 +190,14 @@ inline const Feature (&EnumValuesFeature())[3]
   return values;
 }
 
-inline const char* const* EnumNamesFeature()
+inline char const* const* EnumNamesFeature()
 {
-  static const char* const names[4] = {
+  static char const* const names[4] = {
     "UNUSED", "DICTIONARY_REPLACEMENT", "COMPRESSED_BODY", nullptr};
   return names;
 }
 
-inline const char* EnumNameFeature(Feature e)
+inline char const* EnumNameFeature(Feature e)
 {
   if (::flatbuffers::IsOutRange(e, Feature_UNUSED, Feature_COMPRESSED_BODY)) return "";
   const size_t index = static_cast<size_t>(e);
@@ -217,13 +217,13 @@ inline const UnionMode (&EnumValuesUnionMode())[2]
   return values;
 }
 
-inline const char* const* EnumNamesUnionMode()
+inline char const* const* EnumNamesUnionMode()
 {
-  static const char* const names[3] = {"Sparse", "Dense", nullptr};
+  static char const* const names[3] = {"Sparse", "Dense", nullptr};
   return names;
 }
 
-inline const char* EnumNameUnionMode(UnionMode e)
+inline char const* EnumNameUnionMode(UnionMode e)
 {
   if (::flatbuffers::IsOutRange(e, UnionMode_Sparse, UnionMode_Dense)) return "";
   const size_t index = static_cast<size_t>(e);
@@ -244,13 +244,13 @@ inline const Precision (&EnumValuesPrecision())[3]
   return values;
 }
 
-inline const char* const* EnumNamesPrecision()
+inline char const* const* EnumNamesPrecision()
 {
-  static const char* const names[4] = {"HALF", "SINGLE", "DOUBLE", nullptr};
+  static char const* const names[4] = {"HALF", "SINGLE", "DOUBLE", nullptr};
   return names;
 }
 
-inline const char* EnumNamePrecision(Precision e)
+inline char const* EnumNamePrecision(Precision e)
 {
   if (::flatbuffers::IsOutRange(e, Precision_HALF, Precision_DOUBLE)) return "";
   const size_t index = static_cast<size_t>(e);
@@ -270,13 +270,13 @@ inline const DateUnit (&EnumValuesDateUnit())[2]
   return values;
 }
 
-inline const char* const* EnumNamesDateUnit()
+inline char const* const* EnumNamesDateUnit()
 {
-  static const char* const names[3] = {"DAY", "MILLISECOND", nullptr};
+  static char const* const names[3] = {"DAY", "MILLISECOND", nullptr};
   return names;
 }
 
-inline const char* EnumNameDateUnit(DateUnit e)
+inline char const* EnumNameDateUnit(DateUnit e)
 {
   if (::flatbuffers::IsOutRange(e, DateUnit_DAY, DateUnit_MILLISECOND)) return "";
   const size_t index = static_cast<size_t>(e);
@@ -299,14 +299,14 @@ inline const TimeUnit (&EnumValuesTimeUnit())[4]
   return values;
 }
 
-inline const char* const* EnumNamesTimeUnit()
+inline char const* const* EnumNamesTimeUnit()
 {
-  static const char* const names[5] = {
+  static char const* const names[5] = {
     "SECOND", "MILLISECOND", "MICROSECOND", "NANOSECOND", nullptr};
   return names;
 }
 
-inline const char* EnumNameTimeUnit(TimeUnit e)
+inline char const* EnumNameTimeUnit(TimeUnit e)
 {
   if (::flatbuffers::IsOutRange(e, TimeUnit_SECOND, TimeUnit_NANOSECOND)) return "";
   const size_t index = static_cast<size_t>(e);
@@ -328,13 +328,13 @@ inline const IntervalUnit (&EnumValuesIntervalUnit())[3]
   return values;
 }
 
-inline const char* const* EnumNamesIntervalUnit()
+inline char const* const* EnumNamesIntervalUnit()
 {
-  static const char* const names[4] = {"YEAR_MONTH", "DAY_TIME", "MONTH_DAY_NANO", nullptr};
+  static char const* const names[4] = {"YEAR_MONTH", "DAY_TIME", "MONTH_DAY_NANO", nullptr};
   return names;
 }
 
-inline const char* EnumNameIntervalUnit(IntervalUnit e)
+inline char const* EnumNameIntervalUnit(IntervalUnit e)
 {
   if (::flatbuffers::IsOutRange(e, IntervalUnit_YEAR_MONTH, IntervalUnit_MONTH_DAY_NANO)) return "";
   const size_t index = static_cast<size_t>(e);
@@ -389,9 +389,9 @@ inline const Type (&EnumValuesType())[27]
   return values;
 }
 
-inline const char* const* EnumNamesType()
+inline char const* const* EnumNamesType()
 {
-  static const char* const names[28] = {
+  static char const* const names[28] = {
     "NONE",          "Null",      "Int",           "FloatingPoint",
     "Binary",        "Utf8",      "Bool",          "Decimal",
     "Date",          "Time",      "Timestamp",     "Interval",
@@ -402,7 +402,7 @@ inline const char* const* EnumNamesType()
   return names;
 }
 
-inline const char* EnumNameType(Type e)
+inline char const* EnumNameType(Type e)
 {
   if (::flatbuffers::IsOutRange(e, Type_NONE, Type_LargeListView)) return "";
   const size_t index = static_cast<size_t>(e);
@@ -544,10 +544,10 @@ struct TypeTraits<cudf::io::parquet::flatbuf::LargeListView> {
   static const Type enum_value = Type_LargeListView;
 };
 
-bool VerifyType(::flatbuffers::Verifier& verifier, const void* obj, Type type);
+bool VerifyType(::flatbuffers::Verifier& verifier, void const* obj, Type type);
 bool VerifyTypeVector(::flatbuffers::Verifier& verifier,
-                      const ::flatbuffers::Vector<::flatbuffers::Offset<void>>* values,
-                      const ::flatbuffers::Vector<uint8_t>* types);
+                      ::flatbuffers::Vector<::flatbuffers::Offset<void>> const* values,
+                      ::flatbuffers::Vector<uint8_t> const* types);
 
 /// ----------------------------------------------------------------------
 /// Dictionary encoding metadata
@@ -566,13 +566,13 @@ inline const DictionaryKind (&EnumValuesDictionaryKind())[1]
   return values;
 }
 
-inline const char* const* EnumNamesDictionaryKind()
+inline char const* const* EnumNamesDictionaryKind()
 {
-  static const char* const names[2] = {"DenseArray", nullptr};
+  static char const* const names[2] = {"DenseArray", nullptr};
   return names;
 }
 
-inline const char* EnumNameDictionaryKind(DictionaryKind e)
+inline char const* EnumNameDictionaryKind(DictionaryKind e)
 {
   if (::flatbuffers::IsOutRange(e, DictionaryKind_DenseArray, DictionaryKind_DenseArray)) return "";
   const size_t index = static_cast<size_t>(e);
@@ -594,13 +594,13 @@ inline const Endianness (&EnumValuesEndianness())[2]
   return values;
 }
 
-inline const char* const* EnumNamesEndianness()
+inline char const* const* EnumNamesEndianness()
 {
-  static const char* const names[3] = {"Little", "Big", nullptr};
+  static char const* const names[3] = {"Little", "Big", nullptr};
   return names;
 }
 
-inline const char* EnumNameEndianness(Endianness e)
+inline char const* EnumNameEndianness(Endianness e)
 {
   if (::flatbuffers::IsOutRange(e, Endianness_Little, Endianness_Big)) return "";
   const size_t index = static_cast<size_t>(e);
@@ -652,7 +652,7 @@ struct NullBuilder {
   }
   ::flatbuffers::Offset<Null> Finish()
   {
-    const auto end = fbb_.EndTable(start_);
+    auto const end = fbb_.EndTable(start_);
     auto o         = ::flatbuffers::Offset<Null>(end);
     return o;
   }
@@ -685,7 +685,7 @@ struct Struct_Builder {
   }
   ::flatbuffers::Offset<Struct_> Finish()
   {
-    const auto end = fbb_.EndTable(start_);
+    auto const end = fbb_.EndTable(start_);
     auto o         = ::flatbuffers::Offset<Struct_>(end);
     return o;
   }
@@ -715,7 +715,7 @@ struct ListBuilder {
   }
   ::flatbuffers::Offset<List> Finish()
   {
-    const auto end = fbb_.EndTable(start_);
+    auto const end = fbb_.EndTable(start_);
     auto o         = ::flatbuffers::Offset<List>(end);
     return o;
   }
@@ -747,7 +747,7 @@ struct LargeListBuilder {
   }
   ::flatbuffers::Offset<LargeList> Finish()
   {
-    const auto end = fbb_.EndTable(start_);
+    auto const end = fbb_.EndTable(start_);
     auto o         = ::flatbuffers::Offset<LargeList>(end);
     return o;
   }
@@ -780,7 +780,7 @@ struct ListViewBuilder {
   }
   ::flatbuffers::Offset<ListView> Finish()
   {
-    const auto end = fbb_.EndTable(start_);
+    auto const end = fbb_.EndTable(start_);
     auto o         = ::flatbuffers::Offset<ListView>(end);
     return o;
   }
@@ -812,7 +812,7 @@ struct LargeListViewBuilder {
   }
   ::flatbuffers::Offset<LargeListView> Finish()
   {
-    const auto end = fbb_.EndTable(start_);
+    auto const end = fbb_.EndTable(start_);
     auto o         = ::flatbuffers::Offset<LargeListView>(end);
     return o;
   }
@@ -851,7 +851,7 @@ struct FixedSizeListBuilder {
   }
   ::flatbuffers::Offset<FixedSizeList> Finish()
   {
-    const auto end = fbb_.EndTable(start_);
+    auto const end = fbb_.EndTable(start_);
     auto o         = ::flatbuffers::Offset<FixedSizeList>(end);
     return o;
   }
@@ -916,7 +916,7 @@ struct MapBuilder {
   }
   ::flatbuffers::Offset<Map> Finish()
   {
-    const auto end = fbb_.EndTable(start_);
+    auto const end = fbb_.EndTable(start_);
     auto o         = ::flatbuffers::Offset<Map>(end);
     return o;
   }
@@ -941,9 +941,9 @@ struct Union FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table {
   {
     return static_cast<cudf::io::parquet::flatbuf::UnionMode>(GetField<int16_t>(VT_MODE, 0));
   }
-  const ::flatbuffers::Vector<int32_t>* typeIds() const
+  ::flatbuffers::Vector<int32_t> const* typeIds() const
   {
-    return GetPointer<const ::flatbuffers::Vector<int32_t>*>(VT_TYPEIDS);
+    return GetPointer<::flatbuffers::Vector<int32_t> const*>(VT_TYPEIDS);
   }
   bool Verify(::flatbuffers::Verifier& verifier) const
   {
@@ -971,7 +971,7 @@ struct UnionBuilder {
   }
   ::flatbuffers::Offset<Union> Finish()
   {
-    const auto end = fbb_.EndTable(start_);
+    auto const end = fbb_.EndTable(start_);
     auto o         = ::flatbuffers::Offset<Union>(end);
     return o;
   }
@@ -991,7 +991,7 @@ inline ::flatbuffers::Offset<Union> CreateUnion(
 inline ::flatbuffers::Offset<Union> CreateUnionDirect(
   ::flatbuffers::FlatBufferBuilder& _fbb,
   cudf::io::parquet::flatbuf::UnionMode mode = cudf::io::parquet::flatbuf::UnionMode_Sparse,
-  const std::vector<int32_t>* typeIds        = nullptr)
+  std::vector<int32_t> const* typeIds        = nullptr)
 {
   auto typeIds__ = typeIds ? _fbb.CreateVector<int32_t>(*typeIds) : 0;
   return cudf::io::parquet::flatbuf::CreateUnion(_fbb, mode, typeIds__);
@@ -1027,7 +1027,7 @@ struct IntBuilder {
   }
   ::flatbuffers::Offset<Int> Finish()
   {
-    const auto end = fbb_.EndTable(start_);
+    auto const end = fbb_.EndTable(start_);
     auto o         = ::flatbuffers::Offset<Int>(end);
     return o;
   }
@@ -1071,7 +1071,7 @@ struct FloatingPointBuilder {
   }
   ::flatbuffers::Offset<FloatingPoint> Finish()
   {
-    const auto end = fbb_.EndTable(start_);
+    auto const end = fbb_.EndTable(start_);
     auto o         = ::flatbuffers::Offset<FloatingPoint>(end);
     return o;
   }
@@ -1105,7 +1105,7 @@ struct Utf8Builder {
   }
   ::flatbuffers::Offset<Utf8> Finish()
   {
-    const auto end = fbb_.EndTable(start_);
+    auto const end = fbb_.EndTable(start_);
     auto o         = ::flatbuffers::Offset<Utf8>(end);
     return o;
   }
@@ -1136,7 +1136,7 @@ struct BinaryBuilder {
   }
   ::flatbuffers::Offset<Binary> Finish()
   {
-    const auto end = fbb_.EndTable(start_);
+    auto const end = fbb_.EndTable(start_);
     auto o         = ::flatbuffers::Offset<Binary>(end);
     return o;
   }
@@ -1168,7 +1168,7 @@ struct LargeUtf8Builder {
   }
   ::flatbuffers::Offset<LargeUtf8> Finish()
   {
-    const auto end = fbb_.EndTable(start_);
+    auto const end = fbb_.EndTable(start_);
     auto o         = ::flatbuffers::Offset<LargeUtf8>(end);
     return o;
   }
@@ -1200,7 +1200,7 @@ struct LargeBinaryBuilder {
   }
   ::flatbuffers::Offset<LargeBinary> Finish()
   {
-    const auto end = fbb_.EndTable(start_);
+    auto const end = fbb_.EndTable(start_);
     auto o         = ::flatbuffers::Offset<LargeBinary>(end);
     return o;
   }
@@ -1237,7 +1237,7 @@ struct Utf8ViewBuilder {
   }
   ::flatbuffers::Offset<Utf8View> Finish()
   {
-    const auto end = fbb_.EndTable(start_);
+    auto const end = fbb_.EndTable(start_);
     auto o         = ::flatbuffers::Offset<Utf8View>(end);
     return o;
   }
@@ -1274,7 +1274,7 @@ struct BinaryViewBuilder {
   }
   ::flatbuffers::Offset<BinaryView> Finish()
   {
-    const auto end = fbb_.EndTable(start_);
+    auto const end = fbb_.EndTable(start_);
     auto o         = ::flatbuffers::Offset<BinaryView>(end);
     return o;
   }
@@ -1312,7 +1312,7 @@ struct FixedSizeBinaryBuilder {
   }
   ::flatbuffers::Offset<FixedSizeBinary> Finish()
   {
-    const auto end = fbb_.EndTable(start_);
+    auto const end = fbb_.EndTable(start_);
     auto o         = ::flatbuffers::Offset<FixedSizeBinary>(end);
     return o;
   }
@@ -1344,7 +1344,7 @@ struct BoolBuilder {
   }
   ::flatbuffers::Offset<Bool> Finish()
   {
-    const auto end = fbb_.EndTable(start_);
+    auto const end = fbb_.EndTable(start_);
     auto o         = ::flatbuffers::Offset<Bool>(end);
     return o;
   }
@@ -1379,7 +1379,7 @@ struct RunEndEncodedBuilder {
   }
   ::flatbuffers::Offset<RunEndEncoded> Finish()
   {
-    const auto end = fbb_.EndTable(start_);
+    auto const end = fbb_.EndTable(start_);
     auto o         = ::flatbuffers::Offset<RunEndEncoded>(end);
     return o;
   }
@@ -1437,7 +1437,7 @@ struct DecimalBuilder {
   }
   ::flatbuffers::Offset<Decimal> Finish()
   {
-    const auto end = fbb_.EndTable(start_);
+    auto const end = fbb_.EndTable(start_);
     auto o         = ::flatbuffers::Offset<Decimal>(end);
     return o;
   }
@@ -1489,7 +1489,7 @@ struct DateBuilder {
   }
   ::flatbuffers::Offset<Date> Finish()
   {
-    const auto end = fbb_.EndTable(start_);
+    auto const end = fbb_.EndTable(start_);
     auto o         = ::flatbuffers::Offset<Date>(end);
     return o;
   }
@@ -1548,7 +1548,7 @@ struct TimeBuilder {
   }
   ::flatbuffers::Offset<Time> Finish()
   {
-    const auto end = fbb_.EndTable(start_);
+    auto const end = fbb_.EndTable(start_);
     auto o         = ::flatbuffers::Offset<Time>(end);
     return o;
   }
@@ -1687,9 +1687,9 @@ struct Timestamp FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table {
   ///
   /// Whether a timezone string is present indicates different semantics about
   /// the data (see above).
-  const ::flatbuffers::String* timezone() const
+  ::flatbuffers::String const* timezone() const
   {
-    return GetPointer<const ::flatbuffers::String*>(VT_TIMEZONE);
+    return GetPointer<::flatbuffers::String const*>(VT_TIMEZONE);
   }
   bool Verify(::flatbuffers::Verifier& verifier) const
   {
@@ -1717,7 +1717,7 @@ struct TimestampBuilder {
   }
   ::flatbuffers::Offset<Timestamp> Finish()
   {
-    const auto end = fbb_.EndTable(start_);
+    auto const end = fbb_.EndTable(start_);
     auto o         = ::flatbuffers::Offset<Timestamp>(end);
     return o;
   }
@@ -1737,7 +1737,7 @@ inline ::flatbuffers::Offset<Timestamp> CreateTimestamp(
 inline ::flatbuffers::Offset<Timestamp> CreateTimestampDirect(
   ::flatbuffers::FlatBufferBuilder& _fbb,
   cudf::io::parquet::flatbuf::TimeUnit unit = cudf::io::parquet::flatbuf::TimeUnit_SECOND,
-  const char* timezone                      = nullptr)
+  char const* timezone                      = nullptr)
 {
   auto timezone__ = timezone ? _fbb.CreateString(timezone) : 0;
   return cudf::io::parquet::flatbuf::CreateTimestamp(_fbb, unit, timezone__);
@@ -1771,7 +1771,7 @@ struct IntervalBuilder {
   }
   ::flatbuffers::Offset<Interval> Finish()
   {
-    const auto end = fbb_.EndTable(start_);
+    auto const end = fbb_.EndTable(start_);
     auto o         = ::flatbuffers::Offset<Interval>(end);
     return o;
   }
@@ -1815,7 +1815,7 @@ struct DurationBuilder {
   }
   ::flatbuffers::Offset<Duration> Finish()
   {
-    const auto end = fbb_.EndTable(start_);
+    auto const end = fbb_.EndTable(start_);
     auto o         = ::flatbuffers::Offset<Duration>(end);
     return o;
   }
@@ -1836,13 +1836,13 @@ inline ::flatbuffers::Offset<Duration> CreateDuration(
 struct KeyValue FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table {
   typedef KeyValueBuilder Builder;
   enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { VT_KEY = 4, VT_VALUE = 6 };
-  const ::flatbuffers::String* key() const
+  ::flatbuffers::String const* key() const
   {
-    return GetPointer<const ::flatbuffers::String*>(VT_KEY);
+    return GetPointer<::flatbuffers::String const*>(VT_KEY);
   }
-  const ::flatbuffers::String* value() const
+  ::flatbuffers::String const* value() const
   {
-    return GetPointer<const ::flatbuffers::String*>(VT_VALUE);
+    return GetPointer<::flatbuffers::String const*>(VT_VALUE);
   }
   bool Verify(::flatbuffers::Verifier& verifier) const
   {
@@ -1870,7 +1870,7 @@ struct KeyValueBuilder {
   }
   ::flatbuffers::Offset<KeyValue> Finish()
   {
-    const auto end = fbb_.EndTable(start_);
+    auto const end = fbb_.EndTable(start_);
     auto o         = ::flatbuffers::Offset<KeyValue>(end);
     return o;
   }
@@ -1888,8 +1888,8 @@ inline ::flatbuffers::Offset<KeyValue> CreateKeyValue(
 }
 
 inline ::flatbuffers::Offset<KeyValue> CreateKeyValueDirect(::flatbuffers::FlatBufferBuilder& _fbb,
-                                                            const char* key   = nullptr,
-                                                            const char* value = nullptr)
+                                                            char const* key   = nullptr,
+                                                            char const* value = nullptr)
 {
   auto key__   = key ? _fbb.CreateString(key) : 0;
   auto value__ = value ? _fbb.CreateString(value) : 0;
@@ -1913,9 +1913,9 @@ struct DictionaryEncoding FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table
   /// cross-language compatibility and performance, implementations are
   /// recommended to prefer signed integer types over unsigned integer types
   /// and to avoid uint64 indices unless they are required by an application.
-  const cudf::io::parquet::flatbuf::Int* indexType() const
+  cudf::io::parquet::flatbuf::Int const* indexType() const
   {
-    return GetPointer<const cudf::io::parquet::flatbuf::Int*>(VT_INDEXTYPE);
+    return GetPointer<cudf::io::parquet::flatbuf::Int const*>(VT_INDEXTYPE);
   }
   /// By default, dictionaries are not ordered, or the order does not have
   /// semantic meaning. In some statistical, applications, dictionary-encoding
@@ -1960,7 +1960,7 @@ struct DictionaryEncodingBuilder {
   }
   ::flatbuffers::Offset<DictionaryEncoding> Finish()
   {
-    const auto end = fbb_.EndTable(start_);
+    auto const end = fbb_.EndTable(start_);
     auto o         = ::flatbuffers::Offset<DictionaryEncoding>(end);
     return o;
   }
@@ -1997,9 +1997,9 @@ struct Field FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table {
     VT_CUSTOM_METADATA = 16
   };
   /// Name is not required, in i.e. a List
-  const ::flatbuffers::String* name() const
+  ::flatbuffers::String const* name() const
   {
-    return GetPointer<const ::flatbuffers::String*>(VT_NAME);
+    return GetPointer<::flatbuffers::String const*>(VT_NAME);
   }
   /// Whether or not this field can contain nulls. Should be true in general.
   bool nullable() const { return GetField<uint8_t>(VT_NULLABLE, 0) != 0; }
@@ -2008,185 +2008,185 @@ struct Field FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table {
     return static_cast<cudf::io::parquet::flatbuf::Type>(GetField<uint8_t>(VT_TYPE_TYPE, 0));
   }
   /// This is the type of the decoded value if the field is dictionary encoded.
-  const void* type() const { return GetPointer<const void*>(VT_TYPE); }
+  void const* type() const { return GetPointer<void const*>(VT_TYPE); }
   template <typename T>
-  const T* type_as() const;
-  const cudf::io::parquet::flatbuf::Null* type_as_Null() const
+  T const* type_as() const;
+  cudf::io::parquet::flatbuf::Null const* type_as_Null() const
   {
     return type_type() == cudf::io::parquet::flatbuf::Type_Null
-             ? static_cast<const cudf::io::parquet::flatbuf::Null*>(type())
+             ? static_cast<cudf::io::parquet::flatbuf::Null const*>(type())
              : nullptr;
   }
-  const cudf::io::parquet::flatbuf::Int* type_as_Int() const
+  cudf::io::parquet::flatbuf::Int const* type_as_Int() const
   {
     return type_type() == cudf::io::parquet::flatbuf::Type_Int
-             ? static_cast<const cudf::io::parquet::flatbuf::Int*>(type())
+             ? static_cast<cudf::io::parquet::flatbuf::Int const*>(type())
              : nullptr;
   }
-  const cudf::io::parquet::flatbuf::FloatingPoint* type_as_FloatingPoint() const
+  cudf::io::parquet::flatbuf::FloatingPoint const* type_as_FloatingPoint() const
   {
     return type_type() == cudf::io::parquet::flatbuf::Type_FloatingPoint
-             ? static_cast<const cudf::io::parquet::flatbuf::FloatingPoint*>(type())
+             ? static_cast<cudf::io::parquet::flatbuf::FloatingPoint const*>(type())
              : nullptr;
   }
-  const cudf::io::parquet::flatbuf::Binary* type_as_Binary() const
+  cudf::io::parquet::flatbuf::Binary const* type_as_Binary() const
   {
     return type_type() == cudf::io::parquet::flatbuf::Type_Binary
-             ? static_cast<const cudf::io::parquet::flatbuf::Binary*>(type())
+             ? static_cast<cudf::io::parquet::flatbuf::Binary const*>(type())
              : nullptr;
   }
-  const cudf::io::parquet::flatbuf::Utf8* type_as_Utf8() const
+  cudf::io::parquet::flatbuf::Utf8 const* type_as_Utf8() const
   {
     return type_type() == cudf::io::parquet::flatbuf::Type_Utf8
-             ? static_cast<const cudf::io::parquet::flatbuf::Utf8*>(type())
+             ? static_cast<cudf::io::parquet::flatbuf::Utf8 const*>(type())
              : nullptr;
   }
-  const cudf::io::parquet::flatbuf::Bool* type_as_Bool() const
+  cudf::io::parquet::flatbuf::Bool const* type_as_Bool() const
   {
     return type_type() == cudf::io::parquet::flatbuf::Type_Bool
-             ? static_cast<const cudf::io::parquet::flatbuf::Bool*>(type())
+             ? static_cast<cudf::io::parquet::flatbuf::Bool const*>(type())
              : nullptr;
   }
-  const cudf::io::parquet::flatbuf::Decimal* type_as_Decimal() const
+  cudf::io::parquet::flatbuf::Decimal const* type_as_Decimal() const
   {
     return type_type() == cudf::io::parquet::flatbuf::Type_Decimal
-             ? static_cast<const cudf::io::parquet::flatbuf::Decimal*>(type())
+             ? static_cast<cudf::io::parquet::flatbuf::Decimal const*>(type())
              : nullptr;
   }
-  const cudf::io::parquet::flatbuf::Date* type_as_Date() const
+  cudf::io::parquet::flatbuf::Date const* type_as_Date() const
   {
     return type_type() == cudf::io::parquet::flatbuf::Type_Date
-             ? static_cast<const cudf::io::parquet::flatbuf::Date*>(type())
+             ? static_cast<cudf::io::parquet::flatbuf::Date const*>(type())
              : nullptr;
   }
-  const cudf::io::parquet::flatbuf::Time* type_as_Time() const
+  cudf::io::parquet::flatbuf::Time const* type_as_Time() const
   {
     return type_type() == cudf::io::parquet::flatbuf::Type_Time
-             ? static_cast<const cudf::io::parquet::flatbuf::Time*>(type())
+             ? static_cast<cudf::io::parquet::flatbuf::Time const*>(type())
              : nullptr;
   }
-  const cudf::io::parquet::flatbuf::Timestamp* type_as_Timestamp() const
+  cudf::io::parquet::flatbuf::Timestamp const* type_as_Timestamp() const
   {
     return type_type() == cudf::io::parquet::flatbuf::Type_Timestamp
-             ? static_cast<const cudf::io::parquet::flatbuf::Timestamp*>(type())
+             ? static_cast<cudf::io::parquet::flatbuf::Timestamp const*>(type())
              : nullptr;
   }
-  const cudf::io::parquet::flatbuf::Interval* type_as_Interval() const
+  cudf::io::parquet::flatbuf::Interval const* type_as_Interval() const
   {
     return type_type() == cudf::io::parquet::flatbuf::Type_Interval
-             ? static_cast<const cudf::io::parquet::flatbuf::Interval*>(type())
+             ? static_cast<cudf::io::parquet::flatbuf::Interval const*>(type())
              : nullptr;
   }
-  const cudf::io::parquet::flatbuf::List* type_as_List() const
+  cudf::io::parquet::flatbuf::List const* type_as_List() const
   {
     return type_type() == cudf::io::parquet::flatbuf::Type_List
-             ? static_cast<const cudf::io::parquet::flatbuf::List*>(type())
+             ? static_cast<cudf::io::parquet::flatbuf::List const*>(type())
              : nullptr;
   }
-  const cudf::io::parquet::flatbuf::Struct_* type_as_Struct_() const
+  cudf::io::parquet::flatbuf::Struct_ const* type_as_Struct_() const
   {
     return type_type() == cudf::io::parquet::flatbuf::Type_Struct_
-             ? static_cast<const cudf::io::parquet::flatbuf::Struct_*>(type())
+             ? static_cast<cudf::io::parquet::flatbuf::Struct_ const*>(type())
              : nullptr;
   }
-  const cudf::io::parquet::flatbuf::Union* type_as_Union() const
+  cudf::io::parquet::flatbuf::Union const* type_as_Union() const
   {
     return type_type() == cudf::io::parquet::flatbuf::Type_Union
-             ? static_cast<const cudf::io::parquet::flatbuf::Union*>(type())
+             ? static_cast<cudf::io::parquet::flatbuf::Union const*>(type())
              : nullptr;
   }
-  const cudf::io::parquet::flatbuf::FixedSizeBinary* type_as_FixedSizeBinary() const
+  cudf::io::parquet::flatbuf::FixedSizeBinary const* type_as_FixedSizeBinary() const
   {
     return type_type() == cudf::io::parquet::flatbuf::Type_FixedSizeBinary
-             ? static_cast<const cudf::io::parquet::flatbuf::FixedSizeBinary*>(type())
+             ? static_cast<cudf::io::parquet::flatbuf::FixedSizeBinary const*>(type())
              : nullptr;
   }
-  const cudf::io::parquet::flatbuf::FixedSizeList* type_as_FixedSizeList() const
+  cudf::io::parquet::flatbuf::FixedSizeList const* type_as_FixedSizeList() const
   {
     return type_type() == cudf::io::parquet::flatbuf::Type_FixedSizeList
-             ? static_cast<const cudf::io::parquet::flatbuf::FixedSizeList*>(type())
+             ? static_cast<cudf::io::parquet::flatbuf::FixedSizeList const*>(type())
              : nullptr;
   }
-  const cudf::io::parquet::flatbuf::Map* type_as_Map() const
+  cudf::io::parquet::flatbuf::Map const* type_as_Map() const
   {
     return type_type() == cudf::io::parquet::flatbuf::Type_Map
-             ? static_cast<const cudf::io::parquet::flatbuf::Map*>(type())
+             ? static_cast<cudf::io::parquet::flatbuf::Map const*>(type())
              : nullptr;
   }
-  const cudf::io::parquet::flatbuf::Duration* type_as_Duration() const
+  cudf::io::parquet::flatbuf::Duration const* type_as_Duration() const
   {
     return type_type() == cudf::io::parquet::flatbuf::Type_Duration
-             ? static_cast<const cudf::io::parquet::flatbuf::Duration*>(type())
+             ? static_cast<cudf::io::parquet::flatbuf::Duration const*>(type())
              : nullptr;
   }
-  const cudf::io::parquet::flatbuf::LargeBinary* type_as_LargeBinary() const
+  cudf::io::parquet::flatbuf::LargeBinary const* type_as_LargeBinary() const
   {
     return type_type() == cudf::io::parquet::flatbuf::Type_LargeBinary
-             ? static_cast<const cudf::io::parquet::flatbuf::LargeBinary*>(type())
+             ? static_cast<cudf::io::parquet::flatbuf::LargeBinary const*>(type())
              : nullptr;
   }
-  const cudf::io::parquet::flatbuf::LargeUtf8* type_as_LargeUtf8() const
+  cudf::io::parquet::flatbuf::LargeUtf8 const* type_as_LargeUtf8() const
   {
     return type_type() == cudf::io::parquet::flatbuf::Type_LargeUtf8
-             ? static_cast<const cudf::io::parquet::flatbuf::LargeUtf8*>(type())
+             ? static_cast<cudf::io::parquet::flatbuf::LargeUtf8 const*>(type())
              : nullptr;
   }
-  const cudf::io::parquet::flatbuf::LargeList* type_as_LargeList() const
+  cudf::io::parquet::flatbuf::LargeList const* type_as_LargeList() const
   {
     return type_type() == cudf::io::parquet::flatbuf::Type_LargeList
-             ? static_cast<const cudf::io::parquet::flatbuf::LargeList*>(type())
+             ? static_cast<cudf::io::parquet::flatbuf::LargeList const*>(type())
              : nullptr;
   }
-  const cudf::io::parquet::flatbuf::RunEndEncoded* type_as_RunEndEncoded() const
+  cudf::io::parquet::flatbuf::RunEndEncoded const* type_as_RunEndEncoded() const
   {
     return type_type() == cudf::io::parquet::flatbuf::Type_RunEndEncoded
-             ? static_cast<const cudf::io::parquet::flatbuf::RunEndEncoded*>(type())
+             ? static_cast<cudf::io::parquet::flatbuf::RunEndEncoded const*>(type())
              : nullptr;
   }
-  const cudf::io::parquet::flatbuf::BinaryView* type_as_BinaryView() const
+  cudf::io::parquet::flatbuf::BinaryView const* type_as_BinaryView() const
   {
     return type_type() == cudf::io::parquet::flatbuf::Type_BinaryView
-             ? static_cast<const cudf::io::parquet::flatbuf::BinaryView*>(type())
+             ? static_cast<cudf::io::parquet::flatbuf::BinaryView const*>(type())
              : nullptr;
   }
-  const cudf::io::parquet::flatbuf::Utf8View* type_as_Utf8View() const
+  cudf::io::parquet::flatbuf::Utf8View const* type_as_Utf8View() const
   {
     return type_type() == cudf::io::parquet::flatbuf::Type_Utf8View
-             ? static_cast<const cudf::io::parquet::flatbuf::Utf8View*>(type())
+             ? static_cast<cudf::io::parquet::flatbuf::Utf8View const*>(type())
              : nullptr;
   }
-  const cudf::io::parquet::flatbuf::ListView* type_as_ListView() const
+  cudf::io::parquet::flatbuf::ListView const* type_as_ListView() const
   {
     return type_type() == cudf::io::parquet::flatbuf::Type_ListView
-             ? static_cast<const cudf::io::parquet::flatbuf::ListView*>(type())
+             ? static_cast<cudf::io::parquet::flatbuf::ListView const*>(type())
              : nullptr;
   }
-  const cudf::io::parquet::flatbuf::LargeListView* type_as_LargeListView() const
+  cudf::io::parquet::flatbuf::LargeListView const* type_as_LargeListView() const
   {
     return type_type() == cudf::io::parquet::flatbuf::Type_LargeListView
-             ? static_cast<const cudf::io::parquet::flatbuf::LargeListView*>(type())
+             ? static_cast<cudf::io::parquet::flatbuf::LargeListView const*>(type())
              : nullptr;
   }
   /// Present only if the field is dictionary encoded.
-  const cudf::io::parquet::flatbuf::DictionaryEncoding* dictionary() const
+  cudf::io::parquet::flatbuf::DictionaryEncoding const* dictionary() const
   {
-    return GetPointer<const cudf::io::parquet::flatbuf::DictionaryEncoding*>(VT_DICTIONARY);
+    return GetPointer<cudf::io::parquet::flatbuf::DictionaryEncoding const*>(VT_DICTIONARY);
   }
   /// children apply only to nested data types like Struct, List and Union. For
   /// primitive types children will have length 0.
-  const ::flatbuffers::Vector<::flatbuffers::Offset<cudf::io::parquet::flatbuf::Field>>* children()
+  ::flatbuffers::Vector<::flatbuffers::Offset<cudf::io::parquet::flatbuf::Field>> const* children()
     const
   {
     return GetPointer<
-      const ::flatbuffers::Vector<::flatbuffers::Offset<cudf::io::parquet::flatbuf::Field>>*>(
+      ::flatbuffers::Vector<::flatbuffers::Offset<cudf::io::parquet::flatbuf::Field>> const*>(
       VT_CHILDREN);
   }
   /// User-defined metadata
-  const ::flatbuffers::Vector<::flatbuffers::Offset<cudf::io::parquet::flatbuf::KeyValue>>*
+  ::flatbuffers::Vector<::flatbuffers::Offset<cudf::io::parquet::flatbuf::KeyValue>> const*
   custom_metadata() const
   {
     return GetPointer<
-      const ::flatbuffers::Vector<::flatbuffers::Offset<cudf::io::parquet::flatbuf::KeyValue>>*>(
+      ::flatbuffers::Vector<::flatbuffers::Offset<cudf::io::parquet::flatbuf::KeyValue>> const*>(
       VT_CUSTOM_METADATA);
   }
   bool Verify(::flatbuffers::Verifier& verifier) const
@@ -2203,182 +2203,182 @@ struct Field FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table {
 };
 
 template <>
-inline const cudf::io::parquet::flatbuf::Null* Field::type_as<cudf::io::parquet::flatbuf::Null>()
+inline cudf::io::parquet::flatbuf::Null const* Field::type_as<cudf::io::parquet::flatbuf::Null>()
   const
 {
   return type_as_Null();
 }
 
 template <>
-inline const cudf::io::parquet::flatbuf::Int* Field::type_as<cudf::io::parquet::flatbuf::Int>()
+inline cudf::io::parquet::flatbuf::Int const* Field::type_as<cudf::io::parquet::flatbuf::Int>()
   const
 {
   return type_as_Int();
 }
 
 template <>
-inline const cudf::io::parquet::flatbuf::FloatingPoint*
+inline cudf::io::parquet::flatbuf::FloatingPoint const*
 Field::type_as<cudf::io::parquet::flatbuf::FloatingPoint>() const
 {
   return type_as_FloatingPoint();
 }
 
 template <>
-inline const cudf::io::parquet::flatbuf::Binary*
+inline cudf::io::parquet::flatbuf::Binary const*
 Field::type_as<cudf::io::parquet::flatbuf::Binary>() const
 {
   return type_as_Binary();
 }
 
 template <>
-inline const cudf::io::parquet::flatbuf::Utf8* Field::type_as<cudf::io::parquet::flatbuf::Utf8>()
+inline cudf::io::parquet::flatbuf::Utf8 const* Field::type_as<cudf::io::parquet::flatbuf::Utf8>()
   const
 {
   return type_as_Utf8();
 }
 
 template <>
-inline const cudf::io::parquet::flatbuf::Bool* Field::type_as<cudf::io::parquet::flatbuf::Bool>()
+inline cudf::io::parquet::flatbuf::Bool const* Field::type_as<cudf::io::parquet::flatbuf::Bool>()
   const
 {
   return type_as_Bool();
 }
 
 template <>
-inline const cudf::io::parquet::flatbuf::Decimal*
+inline cudf::io::parquet::flatbuf::Decimal const*
 Field::type_as<cudf::io::parquet::flatbuf::Decimal>() const
 {
   return type_as_Decimal();
 }
 
 template <>
-inline const cudf::io::parquet::flatbuf::Date* Field::type_as<cudf::io::parquet::flatbuf::Date>()
+inline cudf::io::parquet::flatbuf::Date const* Field::type_as<cudf::io::parquet::flatbuf::Date>()
   const
 {
   return type_as_Date();
 }
 
 template <>
-inline const cudf::io::parquet::flatbuf::Time* Field::type_as<cudf::io::parquet::flatbuf::Time>()
+inline cudf::io::parquet::flatbuf::Time const* Field::type_as<cudf::io::parquet::flatbuf::Time>()
   const
 {
   return type_as_Time();
 }
 
 template <>
-inline const cudf::io::parquet::flatbuf::Timestamp*
+inline cudf::io::parquet::flatbuf::Timestamp const*
 Field::type_as<cudf::io::parquet::flatbuf::Timestamp>() const
 {
   return type_as_Timestamp();
 }
 
 template <>
-inline const cudf::io::parquet::flatbuf::Interval*
+inline cudf::io::parquet::flatbuf::Interval const*
 Field::type_as<cudf::io::parquet::flatbuf::Interval>() const
 {
   return type_as_Interval();
 }
 
 template <>
-inline const cudf::io::parquet::flatbuf::List* Field::type_as<cudf::io::parquet::flatbuf::List>()
+inline cudf::io::parquet::flatbuf::List const* Field::type_as<cudf::io::parquet::flatbuf::List>()
   const
 {
   return type_as_List();
 }
 
 template <>
-inline const cudf::io::parquet::flatbuf::Struct_*
+inline cudf::io::parquet::flatbuf::Struct_ const*
 Field::type_as<cudf::io::parquet::flatbuf::Struct_>() const
 {
   return type_as_Struct_();
 }
 
 template <>
-inline const cudf::io::parquet::flatbuf::Union* Field::type_as<cudf::io::parquet::flatbuf::Union>()
+inline cudf::io::parquet::flatbuf::Union const* Field::type_as<cudf::io::parquet::flatbuf::Union>()
   const
 {
   return type_as_Union();
 }
 
 template <>
-inline const cudf::io::parquet::flatbuf::FixedSizeBinary*
+inline cudf::io::parquet::flatbuf::FixedSizeBinary const*
 Field::type_as<cudf::io::parquet::flatbuf::FixedSizeBinary>() const
 {
   return type_as_FixedSizeBinary();
 }
 
 template <>
-inline const cudf::io::parquet::flatbuf::FixedSizeList*
+inline cudf::io::parquet::flatbuf::FixedSizeList const*
 Field::type_as<cudf::io::parquet::flatbuf::FixedSizeList>() const
 {
   return type_as_FixedSizeList();
 }
 
 template <>
-inline const cudf::io::parquet::flatbuf::Map* Field::type_as<cudf::io::parquet::flatbuf::Map>()
+inline cudf::io::parquet::flatbuf::Map const* Field::type_as<cudf::io::parquet::flatbuf::Map>()
   const
 {
   return type_as_Map();
 }
 
 template <>
-inline const cudf::io::parquet::flatbuf::Duration*
+inline cudf::io::parquet::flatbuf::Duration const*
 Field::type_as<cudf::io::parquet::flatbuf::Duration>() const
 {
   return type_as_Duration();
 }
 
 template <>
-inline const cudf::io::parquet::flatbuf::LargeBinary*
+inline cudf::io::parquet::flatbuf::LargeBinary const*
 Field::type_as<cudf::io::parquet::flatbuf::LargeBinary>() const
 {
   return type_as_LargeBinary();
 }
 
 template <>
-inline const cudf::io::parquet::flatbuf::LargeUtf8*
+inline cudf::io::parquet::flatbuf::LargeUtf8 const*
 Field::type_as<cudf::io::parquet::flatbuf::LargeUtf8>() const
 {
   return type_as_LargeUtf8();
 }
 
 template <>
-inline const cudf::io::parquet::flatbuf::LargeList*
+inline cudf::io::parquet::flatbuf::LargeList const*
 Field::type_as<cudf::io::parquet::flatbuf::LargeList>() const
 {
   return type_as_LargeList();
 }
 
 template <>
-inline const cudf::io::parquet::flatbuf::RunEndEncoded*
+inline cudf::io::parquet::flatbuf::RunEndEncoded const*
 Field::type_as<cudf::io::parquet::flatbuf::RunEndEncoded>() const
 {
   return type_as_RunEndEncoded();
 }
 
 template <>
-inline const cudf::io::parquet::flatbuf::BinaryView*
+inline cudf::io::parquet::flatbuf::BinaryView const*
 Field::type_as<cudf::io::parquet::flatbuf::BinaryView>() const
 {
   return type_as_BinaryView();
 }
 
 template <>
-inline const cudf::io::parquet::flatbuf::Utf8View*
+inline cudf::io::parquet::flatbuf::Utf8View const*
 Field::type_as<cudf::io::parquet::flatbuf::Utf8View>() const
 {
   return type_as_Utf8View();
 }
 
 template <>
-inline const cudf::io::parquet::flatbuf::ListView*
+inline cudf::io::parquet::flatbuf::ListView const*
 Field::type_as<cudf::io::parquet::flatbuf::ListView>() const
 {
   return type_as_ListView();
 }
 
 template <>
-inline const cudf::io::parquet::flatbuf::LargeListView*
+inline cudf::io::parquet::flatbuf::LargeListView const*
 Field::type_as<cudf::io::parquet::flatbuf::LargeListView>() const
 {
   return type_as_LargeListView();
@@ -2425,7 +2425,7 @@ struct FieldBuilder {
   }
   ::flatbuffers::Offset<Field> Finish()
   {
-    const auto end = fbb_.EndTable(start_);
+    auto const end = fbb_.EndTable(start_);
     auto o         = ::flatbuffers::Offset<Field>(end);
     return o;
   }
@@ -2456,13 +2456,13 @@ inline ::flatbuffers::Offset<Field> CreateField(
 
 inline ::flatbuffers::Offset<Field> CreateFieldDirect(
   ::flatbuffers::FlatBufferBuilder& _fbb,
-  const char* name                           = nullptr,
+  char const* name                           = nullptr,
   bool nullable                              = false,
   cudf::io::parquet::flatbuf::Type type_type = cudf::io::parquet::flatbuf::Type_NONE,
   ::flatbuffers::Offset<void> type           = 0,
   ::flatbuffers::Offset<cudf::io::parquet::flatbuf::DictionaryEncoding> dictionary      = 0,
-  const std::vector<::flatbuffers::Offset<cudf::io::parquet::flatbuf::Field>>* children = nullptr,
-  const std::vector<::flatbuffers::Offset<cudf::io::parquet::flatbuf::KeyValue>>* custom_metadata =
+  std::vector<::flatbuffers::Offset<cudf::io::parquet::flatbuf::Field>> const* children = nullptr,
+  std::vector<::flatbuffers::Offset<cudf::io::parquet::flatbuf::KeyValue>> const* custom_metadata =
     nullptr)
 {
   auto name__ = name ? _fbb.CreateString(name) : 0;
@@ -2496,24 +2496,24 @@ struct Schema FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table {
   {
     return static_cast<cudf::io::parquet::flatbuf::Endianness>(GetField<int16_t>(VT_ENDIANNESS, 0));
   }
-  const ::flatbuffers::Vector<::flatbuffers::Offset<cudf::io::parquet::flatbuf::Field>>* fields()
+  ::flatbuffers::Vector<::flatbuffers::Offset<cudf::io::parquet::flatbuf::Field>> const* fields()
     const
   {
     return GetPointer<
-      const ::flatbuffers::Vector<::flatbuffers::Offset<cudf::io::parquet::flatbuf::Field>>*>(
+      ::flatbuffers::Vector<::flatbuffers::Offset<cudf::io::parquet::flatbuf::Field>> const*>(
       VT_FIELDS);
   }
-  const ::flatbuffers::Vector<::flatbuffers::Offset<cudf::io::parquet::flatbuf::KeyValue>>*
+  ::flatbuffers::Vector<::flatbuffers::Offset<cudf::io::parquet::flatbuf::KeyValue>> const*
   custom_metadata() const
   {
     return GetPointer<
-      const ::flatbuffers::Vector<::flatbuffers::Offset<cudf::io::parquet::flatbuf::KeyValue>>*>(
+      ::flatbuffers::Vector<::flatbuffers::Offset<cudf::io::parquet::flatbuf::KeyValue>> const*>(
       VT_CUSTOM_METADATA);
   }
   /// Features used in the stream/file.
-  const ::flatbuffers::Vector<int64_t>* features() const
+  ::flatbuffers::Vector<int64_t> const* features() const
   {
-    return GetPointer<const ::flatbuffers::Vector<int64_t>*>(VT_FEATURES);
+    return GetPointer<::flatbuffers::Vector<int64_t> const*>(VT_FEATURES);
   }
   bool Verify(::flatbuffers::Verifier& verifier) const
   {
@@ -2558,7 +2558,7 @@ struct SchemaBuilder {
   }
   ::flatbuffers::Offset<Schema> Finish()
   {
-    const auto end = fbb_.EndTable(start_);
+    auto const end = fbb_.EndTable(start_);
     auto o         = ::flatbuffers::Offset<Schema>(end);
     return o;
   }
@@ -2584,10 +2584,10 @@ inline ::flatbuffers::Offset<Schema> CreateSchema(
 inline ::flatbuffers::Offset<Schema> CreateSchemaDirect(
   ::flatbuffers::FlatBufferBuilder& _fbb,
   cudf::io::parquet::flatbuf::Endianness endianness = cudf::io::parquet::flatbuf::Endianness_Little,
-  const std::vector<::flatbuffers::Offset<cudf::io::parquet::flatbuf::Field>>* fields = nullptr,
-  const std::vector<::flatbuffers::Offset<cudf::io::parquet::flatbuf::KeyValue>>* custom_metadata =
+  std::vector<::flatbuffers::Offset<cudf::io::parquet::flatbuf::Field>> const* fields = nullptr,
+  std::vector<::flatbuffers::Offset<cudf::io::parquet::flatbuf::KeyValue>> const* custom_metadata =
     nullptr,
-  const std::vector<int64_t>* features = nullptr)
+  std::vector<int64_t> const* features = nullptr)
 {
   auto fields__ =
     fields ? _fbb.CreateVector<::flatbuffers::Offset<cudf::io::parquet::flatbuf::Field>>(*fields)
@@ -2602,114 +2602,114 @@ inline ::flatbuffers::Offset<Schema> CreateSchemaDirect(
     _fbb, endianness, fields__, custom_metadata__, features__);
 }
 
-inline bool VerifyType(::flatbuffers::Verifier& verifier, const void* obj, Type type)
+inline bool VerifyType(::flatbuffers::Verifier& verifier, void const* obj, Type type)
 {
   switch (type) {
     case Type_NONE: {
       return true;
     }
     case Type_Null: {
-      auto ptr = reinterpret_cast<const cudf::io::parquet::flatbuf::Null*>(obj);
+      auto ptr = reinterpret_cast<cudf::io::parquet::flatbuf::Null const*>(obj);
       return verifier.VerifyTable(ptr);
     }
     case Type_Int: {
-      auto ptr = reinterpret_cast<const cudf::io::parquet::flatbuf::Int*>(obj);
+      auto ptr = reinterpret_cast<cudf::io::parquet::flatbuf::Int const*>(obj);
       return verifier.VerifyTable(ptr);
     }
     case Type_FloatingPoint: {
-      auto ptr = reinterpret_cast<const cudf::io::parquet::flatbuf::FloatingPoint*>(obj);
+      auto ptr = reinterpret_cast<cudf::io::parquet::flatbuf::FloatingPoint const*>(obj);
       return verifier.VerifyTable(ptr);
     }
     case Type_Binary: {
-      auto ptr = reinterpret_cast<const cudf::io::parquet::flatbuf::Binary*>(obj);
+      auto ptr = reinterpret_cast<cudf::io::parquet::flatbuf::Binary const*>(obj);
       return verifier.VerifyTable(ptr);
     }
     case Type_Utf8: {
-      auto ptr = reinterpret_cast<const cudf::io::parquet::flatbuf::Utf8*>(obj);
+      auto ptr = reinterpret_cast<cudf::io::parquet::flatbuf::Utf8 const*>(obj);
       return verifier.VerifyTable(ptr);
     }
     case Type_Bool: {
-      auto ptr = reinterpret_cast<const cudf::io::parquet::flatbuf::Bool*>(obj);
+      auto ptr = reinterpret_cast<cudf::io::parquet::flatbuf::Bool const*>(obj);
       return verifier.VerifyTable(ptr);
     }
     case Type_Decimal: {
-      auto ptr = reinterpret_cast<const cudf::io::parquet::flatbuf::Decimal*>(obj);
+      auto ptr = reinterpret_cast<cudf::io::parquet::flatbuf::Decimal const*>(obj);
       return verifier.VerifyTable(ptr);
     }
     case Type_Date: {
-      auto ptr = reinterpret_cast<const cudf::io::parquet::flatbuf::Date*>(obj);
+      auto ptr = reinterpret_cast<cudf::io::parquet::flatbuf::Date const*>(obj);
       return verifier.VerifyTable(ptr);
     }
     case Type_Time: {
-      auto ptr = reinterpret_cast<const cudf::io::parquet::flatbuf::Time*>(obj);
+      auto ptr = reinterpret_cast<cudf::io::parquet::flatbuf::Time const*>(obj);
       return verifier.VerifyTable(ptr);
     }
     case Type_Timestamp: {
-      auto ptr = reinterpret_cast<const cudf::io::parquet::flatbuf::Timestamp*>(obj);
+      auto ptr = reinterpret_cast<cudf::io::parquet::flatbuf::Timestamp const*>(obj);
       return verifier.VerifyTable(ptr);
     }
     case Type_Interval: {
-      auto ptr = reinterpret_cast<const cudf::io::parquet::flatbuf::Interval*>(obj);
+      auto ptr = reinterpret_cast<cudf::io::parquet::flatbuf::Interval const*>(obj);
       return verifier.VerifyTable(ptr);
     }
     case Type_List: {
-      auto ptr = reinterpret_cast<const cudf::io::parquet::flatbuf::List*>(obj);
+      auto ptr = reinterpret_cast<cudf::io::parquet::flatbuf::List const*>(obj);
       return verifier.VerifyTable(ptr);
     }
     case Type_Struct_: {
-      auto ptr = reinterpret_cast<const cudf::io::parquet::flatbuf::Struct_*>(obj);
+      auto ptr = reinterpret_cast<cudf::io::parquet::flatbuf::Struct_ const*>(obj);
       return verifier.VerifyTable(ptr);
     }
     case Type_Union: {
-      auto ptr = reinterpret_cast<const cudf::io::parquet::flatbuf::Union*>(obj);
+      auto ptr = reinterpret_cast<cudf::io::parquet::flatbuf::Union const*>(obj);
       return verifier.VerifyTable(ptr);
     }
     case Type_FixedSizeBinary: {
-      auto ptr = reinterpret_cast<const cudf::io::parquet::flatbuf::FixedSizeBinary*>(obj);
+      auto ptr = reinterpret_cast<cudf::io::parquet::flatbuf::FixedSizeBinary const*>(obj);
       return verifier.VerifyTable(ptr);
     }
     case Type_FixedSizeList: {
-      auto ptr = reinterpret_cast<const cudf::io::parquet::flatbuf::FixedSizeList*>(obj);
+      auto ptr = reinterpret_cast<cudf::io::parquet::flatbuf::FixedSizeList const*>(obj);
       return verifier.VerifyTable(ptr);
     }
     case Type_Map: {
-      auto ptr = reinterpret_cast<const cudf::io::parquet::flatbuf::Map*>(obj);
+      auto ptr = reinterpret_cast<cudf::io::parquet::flatbuf::Map const*>(obj);
       return verifier.VerifyTable(ptr);
     }
     case Type_Duration: {
-      auto ptr = reinterpret_cast<const cudf::io::parquet::flatbuf::Duration*>(obj);
+      auto ptr = reinterpret_cast<cudf::io::parquet::flatbuf::Duration const*>(obj);
       return verifier.VerifyTable(ptr);
     }
     case Type_LargeBinary: {
-      auto ptr = reinterpret_cast<const cudf::io::parquet::flatbuf::LargeBinary*>(obj);
+      auto ptr = reinterpret_cast<cudf::io::parquet::flatbuf::LargeBinary const*>(obj);
       return verifier.VerifyTable(ptr);
     }
     case Type_LargeUtf8: {
-      auto ptr = reinterpret_cast<const cudf::io::parquet::flatbuf::LargeUtf8*>(obj);
+      auto ptr = reinterpret_cast<cudf::io::parquet::flatbuf::LargeUtf8 const*>(obj);
       return verifier.VerifyTable(ptr);
     }
     case Type_LargeList: {
-      auto ptr = reinterpret_cast<const cudf::io::parquet::flatbuf::LargeList*>(obj);
+      auto ptr = reinterpret_cast<cudf::io::parquet::flatbuf::LargeList const*>(obj);
       return verifier.VerifyTable(ptr);
     }
     case Type_RunEndEncoded: {
-      auto ptr = reinterpret_cast<const cudf::io::parquet::flatbuf::RunEndEncoded*>(obj);
+      auto ptr = reinterpret_cast<cudf::io::parquet::flatbuf::RunEndEncoded const*>(obj);
       return verifier.VerifyTable(ptr);
     }
     case Type_BinaryView: {
-      auto ptr = reinterpret_cast<const cudf::io::parquet::flatbuf::BinaryView*>(obj);
+      auto ptr = reinterpret_cast<cudf::io::parquet::flatbuf::BinaryView const*>(obj);
       return verifier.VerifyTable(ptr);
     }
     case Type_Utf8View: {
-      auto ptr = reinterpret_cast<const cudf::io::parquet::flatbuf::Utf8View*>(obj);
+      auto ptr = reinterpret_cast<cudf::io::parquet::flatbuf::Utf8View const*>(obj);
       return verifier.VerifyTable(ptr);
     }
     case Type_ListView: {
-      auto ptr = reinterpret_cast<const cudf::io::parquet::flatbuf::ListView*>(obj);
+      auto ptr = reinterpret_cast<cudf::io::parquet::flatbuf::ListView const*>(obj);
       return verifier.VerifyTable(ptr);
     }
     case Type_LargeListView: {
-      auto ptr = reinterpret_cast<const cudf::io::parquet::flatbuf::LargeListView*>(obj);
+      auto ptr = reinterpret_cast<cudf::io::parquet::flatbuf::LargeListView const*>(obj);
       return verifier.VerifyTable(ptr);
     }
     default: return true;
@@ -2717,8 +2717,8 @@ inline bool VerifyType(::flatbuffers::Verifier& verifier, const void* obj, Type
 }
 
 inline bool VerifyTypeVector(::flatbuffers::Verifier& verifier,
-                             const ::flatbuffers::Vector<::flatbuffers::Offset<void>>* values,
-                             const ::flatbuffers::Vector<uint8_t>* types)
+                             ::flatbuffers::Vector<::flatbuffers::Offset<void>> const* values,
+                             ::flatbuffers::Vector<uint8_t> const* types)
 {
   if (!values || !types) return !values && !types;
   if (values->size() != types->size()) return false;
@@ -2728,12 +2728,12 @@ inline bool VerifyTypeVector(::flatbuffers::Verifier& verifier,
   return true;
 }
 
-inline const cudf::io::parquet::flatbuf::Schema* GetSchema(const void* buf)
+inline cudf::io::parquet::flatbuf::Schema const* GetSchema(void const* buf)
 {
   return ::flatbuffers::GetRoot<cudf::io::parquet::flatbuf::Schema>(buf);
 }
 
-inline const cudf::io::parquet::flatbuf::Schema* GetSizePrefixedSchema(const void* buf)
+inline cudf::io::parquet::flatbuf::Schema const* GetSizePrefixedSchema(void const* buf)
 {
   return ::flatbuffers::GetSizePrefixedRoot<cudf::io::parquet::flatbuf::Schema>(buf);
 }
diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu
index ba3d35b9586..58e8a09d5b6 100644
--- a/cpp/src/io/parquet/page_string_decode.cu
+++ b/cpp/src/io/parquet/page_string_decode.cu
@@ -379,7 +379,7 @@ __device__ size_t totalDictEntriesSize(uint8_t const* data,
       if (mytid < batch_len) {
         dict_idx         = dict_val;
         int32_t ofs      = (mytid - ((batch_len + 7) & ~7)) * dict_bits;
-        const uint8_t* p = ptr + (ofs >> 3);
+        uint8_t const* p = ptr + (ofs >> 3);
         ofs &= 7;
         if (p < end) {
           uint32_t c = 8 - ofs;
@@ -399,7 +399,7 @@ __device__ size_t totalDictEntriesSize(uint8_t const* data,
         if (pos + mytid < end_value) {
           uint32_t const dict_pos = (dict_bits > 0) ? dict_idx * sizeof(string_index_pair) : 0;
           if (pos + mytid >= start_value && dict_pos < (uint32_t)dict_size) {
-            const auto* src = reinterpret_cast<const string_index_pair*>(dict_base + dict_pos);
+            auto const* src = reinterpret_cast<string_index_pair const*>(dict_base + dict_pos);
             l_str_len += src->second;
           }
         }
@@ -413,7 +413,7 @@ __device__ size_t totalDictEntriesSize(uint8_t const* data,
       if (mytid == 0) {
         uint32_t const dict_pos = (dict_bits > 0) ? dict_val * sizeof(string_index_pair) : 0;
         if (pos + batch_len > start_value && dict_pos < (uint32_t)dict_size) {
-          const auto* src = reinterpret_cast<const string_index_pair*>(dict_base + dict_pos);
+          auto const* src = reinterpret_cast<string_index_pair const*>(dict_base + dict_pos);
           l_str_len += (batch_len - start_off) * src->second;
         }
       }
@@ -452,7 +452,7 @@ __device__ size_t totalPlainEntriesSize(uint8_t const* data,
 
   // This step is purely serial
   if (!t) {
-    const uint8_t* cur = data;
+    uint8_t const* cur = data;
     int k              = 0;
 
     while (pos < end_value && k < data_size) {
@@ -899,7 +899,7 @@ CUDF_KERNEL void __launch_bounds__(preprocess_block_size) gpuComputePageStringSi
         // RLE-packed dictionary indices, first byte indicates index length in bits
         if (col.str_dict_index) {
           // String dictionary: use index
-          dict_base = reinterpret_cast<const uint8_t*>(col.str_dict_index);
+          dict_base = reinterpret_cast<uint8_t const*>(col.str_dict_index);
           dict_size = col.dict_page->num_input_values * sizeof(string_index_pair);
         } else {
           dict_base = col.dict_page->page_data;
diff --git a/cpp/src/io/parquet/page_string_utils.cuh b/cpp/src/io/parquet/page_string_utils.cuh
index a81d0a64466..66073097579 100644
--- a/cpp/src/io/parquet/page_string_utils.cuh
+++ b/cpp/src/io/parquet/page_string_utils.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -51,7 +51,7 @@ inline __device__ void wideStrcpy(uint8_t* dst, uint8_t const* src, size_t len,
   for (int64_t ichar = out_start_aligned + lane_id * out_datatype_size; ichar < out_end_aligned;
        ichar += warp_size * out_datatype_size) {
     *(out_chars_aligned + (ichar + alignment_offset) / out_datatype_size) =
-      load_uint4((const char*)in_start + ichar);
+      load_uint4((char const*)in_start + ichar);
   }
 
   // Tail logic: copy characters of the current string outside
diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp
index e35742c2527..8ee4c175e09 100644
--- a/cpp/src/io/parquet/parquet.hpp
+++ b/cpp/src/io/parquet/parquet.hpp
@@ -105,43 +105,51 @@ struct LogicalType {
   LogicalType(TimestampType&& tst) : type(TIMESTAMP), timestamp_type(tst) {}
   LogicalType(IntType&& it) : type(INTEGER), int_type(it) {}
 
-  constexpr bool is_time_millis() const
+  [[nodiscard]] constexpr bool is_time_millis() const
   {
     return type == TIME and time_type->unit.type == TimeUnit::MILLIS;
   }
 
-  constexpr bool is_time_micros() const
+  [[nodiscard]] constexpr bool is_time_micros() const
   {
     return type == TIME and time_type->unit.type == TimeUnit::MICROS;
   }
 
-  constexpr bool is_time_nanos() const
+  [[nodiscard]] constexpr bool is_time_nanos() const
   {
     return type == TIME and time_type->unit.type == TimeUnit::NANOS;
   }
 
-  constexpr bool is_timestamp_millis() const
+  [[nodiscard]] constexpr bool is_timestamp_millis() const
   {
     return type == TIMESTAMP and timestamp_type->unit.type == TimeUnit::MILLIS;
   }
 
-  constexpr bool is_timestamp_micros() const
+  [[nodiscard]] constexpr bool is_timestamp_micros() const
   {
     return type == TIMESTAMP and timestamp_type->unit.type == TimeUnit::MICROS;
   }
 
-  constexpr bool is_timestamp_nanos() const
+  [[nodiscard]] constexpr bool is_timestamp_nanos() const
   {
     return type == TIMESTAMP and timestamp_type->unit.type == TimeUnit::NANOS;
   }
+  [[nodiscard]] constexpr int8_t bit_width() const
+  {
+    return type == INTEGER ? int_type->bitWidth : -1;
+  }
 
-  constexpr int8_t bit_width() const { return type == INTEGER ? int_type->bitWidth : -1; }
-
-  constexpr bool is_signed() const { return type == INTEGER and int_type->isSigned; }
+  [[nodiscard]] constexpr bool is_signed() const { return type == INTEGER and int_type->isSigned; }
 
-  constexpr int32_t scale() const { return type == DECIMAL ? decimal_type->scale : -1; }
+  [[nodiscard]] constexpr int32_t scale() const
+  {
+    return type == DECIMAL ? decimal_type->scale : -1;
+  }
 
-  constexpr int32_t precision() const { return type == DECIMAL ? decimal_type->precision : -1; }
+  [[nodiscard]] constexpr int32_t precision() const
+  {
+    return type == DECIMAL ? decimal_type->precision : -1;
+  }
 };
 
 /**
diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp
index e3e4d8736c7..d82c6f0de59 100644
--- a/cpp/src/io/parquet/parquet_gpu.hpp
+++ b/cpp/src/io/parquet/parquet_gpu.hpp
@@ -36,6 +36,7 @@
 #include <cuda_runtime.h>
 
 #include <type_traits>
+#include <utility>
 #include <vector>
 
 namespace cudf::io::parquet::detail {
@@ -133,11 +134,11 @@ struct input_column_info {
   std::vector<int> nesting;
 
   input_column_info(int _schema_idx, std::string _name, bool _has_repetition)
-    : schema_idx(_schema_idx), name(_name), has_repetition(_has_repetition)
+    : schema_idx(_schema_idx), name(std::move(_name)), has_repetition(_has_repetition)
   {
   }
 
-  auto nesting_depth() const { return nesting.size(); }
+  [[nodiscard]] auto nesting_depth() const { return nesting.size(); }
 };
 
 // The delta encodings use ULEB128 integers, but parquet only uses max 64 bits.
@@ -148,12 +149,12 @@ using zigzag128_t = int64_t;
 #if !defined(__cpp_lib_is_scoped_enum)
 template <typename Enum, bool = std::is_enum_v<Enum>>
 struct is_scoped_enum {
-  static const bool value = not std::is_convertible_v<Enum, std::underlying_type_t<Enum>>;
+  static bool const value = not std::is_convertible_v<Enum, std::underlying_type_t<Enum>>;
 };
 
 template <typename Enum>
 struct is_scoped_enum<Enum, false> {
-  static const bool value = false;
+  static bool const value = false;
 };
 #else
 using std::is_scoped_enum;
@@ -406,13 +407,7 @@ struct ColumnChunkDesc {
       type_length(datatype_length_),
       physical_type(datatype_),
       level_bits{def_level_bits_, rep_level_bits_},
-      num_data_pages(0),
-      num_dict_pages(0),
-      dict_page(nullptr),
-      str_dict_index(nullptr),
-      valid_map_base{nullptr},
-      column_data_base{nullptr},
-      column_string_base{nullptr},
+
       codec(codec_),
       logical_type(logical_type_),
       ts_clock_rate(ts_clock_rate_),
@@ -420,8 +415,8 @@ struct ColumnChunkDesc {
       src_col_schema(src_col_schema_),
       h_chunk_info(chunk_info_),
       list_bytes_per_row_est(list_bytes_per_row_est_),
-      is_strings_to_cat(strings_to_categorical_),
-      is_large_string_col(false)
+      is_strings_to_cat(strings_to_categorical_)
+
   {
   }
 
@@ -475,8 +470,8 @@ struct parquet_column_device_view : stats_column_desc {
   int32_t type_length;           //!< length of fixed_length_byte_array data
   uint8_t level_bits;  //!< bits to encode max definition (lower nibble) & repetition (upper nibble)
                        //!< levels
-  constexpr uint8_t num_def_level_bits() const { return level_bits & 0xf; }
-  constexpr uint8_t num_rep_level_bits() const { return level_bits >> 4; }
+  [[nodiscard]] constexpr uint8_t num_def_level_bits() const { return level_bits & 0xf; }
+  [[nodiscard]] constexpr uint8_t num_rep_level_bits() const { return level_bits >> 4; }
   uint8_t max_def_level;  //!< needed for SizeStatistics calculation
   uint8_t max_rep_level;
 
@@ -578,9 +573,9 @@ struct EncColumnChunk {
   uint32_t* rep_histogram_data;  //!< Size is (max(level) + 1) * (num_data_pages + 1).
   size_t var_bytes_size;         //!< Sum of var_bytes_size from the pages (byte arrays only)
 
-  constexpr uint32_t num_dict_pages() const { return use_dictionary ? 1 : 0; }
+  [[nodiscard]] constexpr uint32_t num_dict_pages() const { return use_dictionary ? 1 : 0; }
 
-  constexpr uint32_t num_data_pages() const { return num_pages - num_dict_pages(); }
+  [[nodiscard]] constexpr uint32_t num_data_pages() const { return num_pages - num_dict_pages(); }
 };
 
 /**
@@ -619,9 +614,9 @@ struct EncPage {
   Encoding encoding;       //!< Encoding used for page data
   uint16_t num_fragments;  //!< Number of fragments in page
 
-  constexpr bool is_v2() const { return page_type == PageType::DATA_PAGE_V2; }
+  [[nodiscard]] constexpr bool is_v2() const { return page_type == PageType::DATA_PAGE_V2; }
 
-  constexpr auto level_bytes() const { return def_lvl_bytes + rep_lvl_bytes; }
+  [[nodiscard]] constexpr auto level_bytes() const { return def_lvl_bytes + rep_lvl_bytes; }
 };
 
 /**
diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp
index 0109be661a7..11f4a00ee8b 100644
--- a/cpp/src/io/parquet/predicate_pushdown.cpp
+++ b/cpp/src/io/parquet/predicate_pushdown.cpp
@@ -474,9 +474,9 @@ std::optional<std::vector<std::vector<size_type>>> aggregate_reader_metadata::fi
     return std::nullopt;
   }
   size_type is_required_idx = 0;
-  for (size_t src_idx = 0; src_idx < input_row_group_indices.size(); ++src_idx) {
+  for (auto const& input_row_group_index : input_row_group_indices) {
     std::vector<size_type> filtered_row_groups;
-    for (auto const rg_idx : input_row_group_indices[src_idx]) {
+    for (auto const rg_idx : input_row_group_index) {
       if ((!validity_it[is_required_idx]) || is_row_group_required[is_required_idx]) {
         filtered_row_groups.push_back(rg_idx);
       }
diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu
index d3f321af0bd..9ad5a2d6e8d 100644
--- a/cpp/src/io/parquet/reader_impl_chunking.cu
+++ b/cpp/src/io/parquet/reader_impl_chunking.cu
@@ -1027,7 +1027,7 @@ struct decompression_info {
  *
  */
 struct get_decomp_info {
-  device_span<const ColumnChunkDesc> chunks;
+  device_span<ColumnChunkDesc const> chunks;
 
   __device__ decompression_info operator()(PageInfo const& p) const
   {
diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp
index 9de8a9e2719..ebd4affd099 100644
--- a/cpp/src/io/parquet/reader_impl_helpers.cpp
+++ b/cpp/src/io/parquet/reader_impl_helpers.cpp
@@ -48,10 +48,10 @@ thrust::optional<LogicalType> converted_to_logical_type(SchemaElement const& sch
       case LIST: return LogicalType{LogicalType::LIST};
       case DECIMAL: return LogicalType{DecimalType{schema.decimal_scale, schema.decimal_precision}};
       case DATE: return LogicalType{LogicalType::DATE};
-      case TIME_MILLIS: return LogicalType{TimeType{true, TimeUnit::MILLIS}};
-      case TIME_MICROS: return LogicalType{TimeType{true, TimeUnit::MICROS}};
-      case TIMESTAMP_MILLIS: return LogicalType{TimestampType{true, TimeUnit::MILLIS}};
-      case TIMESTAMP_MICROS: return LogicalType{TimestampType{true, TimeUnit::MICROS}};
+      case TIME_MILLIS: return LogicalType{TimeType{true, {TimeUnit::MILLIS}}};
+      case TIME_MICROS: return LogicalType{TimeType{true, {TimeUnit::MICROS}}};
+      case TIMESTAMP_MILLIS: return LogicalType{TimestampType{true, {TimeUnit::MILLIS}}};
+      case TIMESTAMP_MICROS: return LogicalType{TimestampType{true, {TimeUnit::MICROS}}};
       case UINT_8: return LogicalType{IntType{8, false}};
       case UINT_16: return LogicalType{IntType{16, false}};
       case UINT_32: return LogicalType{IntType{32, false}};
@@ -1093,12 +1093,11 @@ aggregate_reader_metadata::select_columns(
                                         has_list_parent || col_type == type_id::LIST);
         }
       } else {
-        for (size_t idx = 0; idx < col_name_info->children.size(); idx++) {
-          path_is_valid |=
-            build_column(&col_name_info->children[idx],
-                         find_schema_child(schema_elem, col_name_info->children[idx].name),
-                         output_col.children,
-                         has_list_parent || col_type == type_id::LIST);
+        for (const auto& idx : col_name_info->children) {
+          path_is_valid |= build_column(&idx,
+                                        find_schema_child(schema_elem, idx.name),
+                                        output_col.children,
+                                        has_list_parent || col_type == type_id::LIST);
         }
       }
 
@@ -1106,7 +1105,7 @@ aggregate_reader_metadata::select_columns(
       // data stored) so add me to the list.
       if (schema_elem.num_children == 0) {
         input_column_info& input_col = input_columns.emplace_back(
-          input_column_info{schema_idx, schema_elem.name, schema_elem.max_repetition_level > 0});
+          schema_idx, schema_elem.name, schema_elem.max_repetition_level > 0);
 
         // set up child output column for one-level encoding list
         if (one_level_list) {
@@ -1257,10 +1256,9 @@ aggregate_reader_metadata::select_columns(
      */
     for (auto const& path : use_names3) {
       auto array_to_find_in = &selected_columns;
-      for (size_t depth = 0; depth < path.size(); ++depth) {
+      for (auto const& name_to_find : path) {
         // Check if the path exists in our selected_columns and if not, add it.
-        auto const& name_to_find = path[depth];
-        auto found_col           = std::find_if(
+        auto found_col = std::find_if(
           array_to_find_in->begin(),
           array_to_find_in->end(),
           [&name_to_find](column_name_info const& col) { return col.name == name_to_find; });
diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu
index 7cb982f103d..9df5c362cdd 100644
--- a/cpp/src/io/parquet/reader_impl_preprocess.cu
+++ b/cpp/src/io/parquet/reader_impl_preprocess.cu
@@ -647,7 +647,7 @@ constexpr bool is_string_chunk(ColumnChunkDesc const& chunk)
 
 struct set_str_dict_index_count {
   device_span<size_t> str_dict_index_count;
-  device_span<const ColumnChunkDesc> chunks;
+  device_span<ColumnChunkDesc const> chunks;
 
   __device__ void operator()(PageInfo const& page)
   {
@@ -662,7 +662,7 @@ struct set_str_dict_index_count {
 
 struct set_str_dict_index_ptr {
   string_index_pair* const base;
-  device_span<const size_t> str_dict_index_offsets;
+  device_span<size_t const> str_dict_index_offsets;
   device_span<ColumnChunkDesc> chunks;
 
   __device__ void operator()(size_t i)
@@ -679,7 +679,7 @@ struct set_str_dict_index_ptr {
  *
  */
 struct set_list_row_count_estimate {
-  device_span<const ColumnChunkDesc> chunks;
+  device_span<ColumnChunkDesc const> chunks;
 
   __device__ void operator()(PageInfo& page)
   {
@@ -708,7 +708,7 @@ struct set_list_row_count_estimate {
  */
 struct set_final_row_count {
   device_span<PageInfo> pages;
-  device_span<const ColumnChunkDesc> chunks;
+  device_span<ColumnChunkDesc const> chunks;
 
   __device__ void operator()(size_t i)
   {
diff --git a/cpp/src/io/statistics/byte_array_view.cuh b/cpp/src/io/statistics/byte_array_view.cuh
index 0fe6c17db89..58698c6a19d 100644
--- a/cpp/src/io/statistics/byte_array_view.cuh
+++ b/cpp/src/io/statistics/byte_array_view.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -161,7 +161,7 @@ class byte_array_view {
    *
    * @return An empty byte_array_view
    */
-  [[nodiscard]] __device__ inline static byte_array_view min() { return byte_array_view(); }
+  [[nodiscard]] __device__ inline static byte_array_view min() { return {}; }
 
   /**
    * @brief Return a byte_array_view to interpret as maximum value
@@ -170,7 +170,7 @@ class byte_array_view {
    */
   [[nodiscard]] __device__ inline static byte_array_view max()
   {
-    return byte_array_view(nullptr, std::numeric_limits<std::size_t>::max());
+    return {nullptr, std::numeric_limits<std::size_t>::max()};
   }
 
  private:
diff --git a/cpp/src/io/utilities/arrow_io_source.cpp b/cpp/src/io/utilities/arrow_io_source.cpp
index d647f3c0a4b..157240b8b08 100644
--- a/cpp/src/io/utilities/arrow_io_source.cpp
+++ b/cpp/src/io/utilities/arrow_io_source.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@
 
 #include <memory>
 #include <string>
+#include <utility>
 
 namespace cudf::io {
 
@@ -32,7 +33,8 @@ class arrow_io_buffer : public datasource::buffer {
   std::shared_ptr<arrow::Buffer> arrow_buffer;
 
  public:
-  explicit arrow_io_buffer(std::shared_ptr<arrow::Buffer> arrow_buffer) : arrow_buffer(arrow_buffer)
+  explicit arrow_io_buffer(std::shared_ptr<arrow::Buffer> arrow_buffer)
+    : arrow_buffer(std::move(arrow_buffer))
   {
   }
   [[nodiscard]] size_t size() const override { return arrow_buffer->size(); }
diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp
index 27fc53fbc9e..2f4272b0367 100644
--- a/cpp/src/io/utilities/column_buffer.cpp
+++ b/cpp/src/io/utilities/column_buffer.cpp
@@ -171,9 +171,7 @@ std::unique_ptr<column> make_column(column_buffer_base<string_policy>& buffer,
   switch (buffer.type.id()) {
     case type_id::STRING:
       if (schema.value_or(reader_column_schema{}).is_enabled_convert_binary_to_strings()) {
-        if (schema_info != nullptr) {
-          schema_info->children.push_back(column_name_info{"offsets"});
-        }
+        if (schema_info != nullptr) { schema_info->children.emplace_back("offsets"); }
 
         // make_strings_column allocates new memory, it does not simply move
         // from the inputs, so we need to pass it the memory resource given to
@@ -199,8 +197,8 @@ std::unique_ptr<column> make_column(column_buffer_base<string_policy>& buffer,
           data_type{type_id::UINT8}, char_size, std::move(*data), rmm::device_buffer{}, 0);
 
         if (schema_info != nullptr) {
-          schema_info->children.push_back(column_name_info{"offsets"});
-          schema_info->children.push_back(column_name_info{"binary"});
+          schema_info->children.emplace_back("offsets");
+          schema_info->children.emplace_back("binary");
           // cuDF type will be list<UINT8>, but remember it was originally binary data
           schema_info->is_binary = true;
           if (schema.has_value() and schema->get_type_length() > 0) {
@@ -224,8 +222,8 @@ std::unique_ptr<column> make_column(column_buffer_base<string_policy>& buffer,
 
       column_name_info* child_info = nullptr;
       if (schema_info != nullptr) {
-        schema_info->children.push_back(column_name_info{"offsets"});
-        schema_info->children.push_back(column_name_info{""});
+        schema_info->children.emplace_back("offsets");
+        schema_info->children.emplace_back("");
         child_info = &schema_info->children.back();
       }
 
@@ -256,7 +254,7 @@ std::unique_ptr<column> make_column(column_buffer_base<string_policy>& buffer,
       for (size_t i = 0; i < buffer.children.size(); ++i) {
         column_name_info* child_info = nullptr;
         if (schema_info != nullptr) {
-          schema_info->children.push_back(column_name_info{""});
+          schema_info->children.emplace_back("");
           child_info = &schema_info->children.back();
         }
 
@@ -306,8 +304,8 @@ std::unique_ptr<column> empty_like(column_buffer_base<string_policy>& buffer,
 
       column_name_info* child_info = nullptr;
       if (schema_info != nullptr) {
-        schema_info->children.push_back(column_name_info{"offsets"});
-        schema_info->children.push_back(column_name_info{""});
+        schema_info->children.emplace_back("offsets");
+        schema_info->children.emplace_back("");
         child_info = &schema_info->children.back();
       }
 
@@ -330,7 +328,7 @@ std::unique_ptr<column> empty_like(column_buffer_base<string_policy>& buffer,
                      [&](auto& col) {
                        column_name_info* child_info = nullptr;
                        if (schema_info != nullptr) {
-                         schema_info->children.push_back(column_name_info{""});
+                         schema_info->children.emplace_back("");
                          child_info = &schema_info->children.back();
                        }
                        return cudf::io::detail::empty_like<string_policy>(
diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp
index e6bfae0681a..ed6bb8bbdca 100644
--- a/cpp/src/io/utilities/column_buffer.hpp
+++ b/cpp/src/io/utilities/column_buffer.hpp
@@ -137,8 +137,11 @@ class column_buffer_base {
   auto& null_count() { return _null_count; }
 
   auto data() { return static_cast<string_policy*>(this)->data_impl(); }
-  auto data() const { return static_cast<string_policy const*>(this)->data_impl(); }
-  auto data_size() const { return static_cast<string_policy const*>(this)->data_size_impl(); }
+  [[nodiscard]] auto data() const { return static_cast<string_policy const*>(this)->data_impl(); }
+  [[nodiscard]] auto data_size() const
+  {
+    return static_cast<string_policy const*>(this)->data_size_impl();
+  }
 
   std::unique_ptr<column> make_string_column(rmm::cuda_stream_view stream)
   {
@@ -191,9 +194,9 @@ class gather_column_buffer : public column_buffer_base<gather_column_buffer> {
 
   void allocate_strings_data(rmm::cuda_stream_view stream);
 
-  void* data_impl() { return _strings ? _strings->data() : _data.data(); }
-  void const* data_impl() const { return _strings ? _strings->data() : _data.data(); }
-  size_t data_size_impl() const { return _strings ? _strings->size() : _data.size(); }
+  [[nodiscard]] void* data_impl() { return _strings ? _strings->data() : _data.data(); }
+  [[nodiscard]] void const* data_impl() const { return _strings ? _strings->data() : _data.data(); }
+  [[nodiscard]] size_t data_size_impl() const { return _strings ? _strings->size() : _data.size(); }
 
   std::unique_ptr<column> make_string_column_impl(rmm::cuda_stream_view stream);
 
@@ -226,14 +229,14 @@ class inline_column_buffer : public column_buffer_base<inline_column_buffer> {
   void allocate_strings_data(rmm::cuda_stream_view stream);
 
   void* data_impl() { return _data.data(); }
-  void const* data_impl() const { return _data.data(); }
-  size_t data_size_impl() const { return _data.size(); }
+  [[nodiscard]] void const* data_impl() const { return _data.data(); }
+  [[nodiscard]] size_t data_size_impl() const { return _data.size(); }
   std::unique_ptr<column> make_string_column_impl(rmm::cuda_stream_view stream);
 
   void create_string_data(size_t num_bytes, rmm::cuda_stream_view stream);
   void* string_data() { return _string_data.data(); }
-  void const* string_data() const { return _string_data.data(); }
-  size_t string_size() const { return _string_data.size(); }
+  [[nodiscard]] void const* string_data() const { return _string_data.data(); }
+  [[nodiscard]] size_t string_size() const { return _string_data.size(); }
 
  private:
   rmm::device_buffer _string_data{};
diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu
index 288a5690282..aa1b29a101f 100644
--- a/cpp/src/io/utilities/data_casting.cu
+++ b/cpp/src/io/utilities/data_casting.cu
@@ -917,8 +917,8 @@ static std::unique_ptr<column> parse_string(string_view_pair_it str_tuples,
 }
 
 std::unique_ptr<column> parse_data(
-  const char* data,
-  thrust::zip_iterator<thrust::tuple<const size_type*, const size_type*>> offset_length_begin,
+  char const* data,
+  thrust::zip_iterator<thrust::tuple<size_type const*, size_type const*>> offset_length_begin,
   size_type col_size,
   data_type col_type,
   rmm::device_buffer&& null_mask,
diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp
index 66905c5256f..a6cbbcd84a6 100644
--- a/cpp/src/io/utilities/data_sink.cpp
+++ b/cpp/src/io/utilities/data_sink.cpp
@@ -49,7 +49,7 @@ class file_sink : public data_sink {
     }
   }
 
-  virtual ~file_sink() { flush(); }
+  ~file_sink() override { flush(); }
 
   void host_write(void const* data, size_t size) override
   {
@@ -113,7 +113,7 @@ class host_buffer_sink : public data_sink {
  public:
   explicit host_buffer_sink(std::vector<char>* buffer) : buffer_(buffer) {}
 
-  virtual ~host_buffer_sink() { flush(); }
+  ~host_buffer_sink() override { flush(); }
 
   void host_write(void const* data, size_t size) override
   {
@@ -136,7 +136,7 @@ class void_sink : public data_sink {
  public:
   explicit void_sink() {}
 
-  virtual ~void_sink() {}
+  ~void_sink() override {}
 
   void host_write(void const* data, size_t size) override { _bytes_written += size; }
 
@@ -169,7 +169,7 @@ class user_sink_wrapper : public data_sink {
  public:
   explicit user_sink_wrapper(cudf::io::data_sink* const user_sink_) : user_sink(user_sink_) {}
 
-  virtual ~user_sink_wrapper() {}
+  ~user_sink_wrapper() override {}
 
   void host_write(void const* data, size_t size) override { user_sink->host_write(data, size); }
 
diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp
index d8dbd3614c8..ca8932322bf 100644
--- a/cpp/src/io/utilities/datasource.cpp
+++ b/cpp/src/io/utilities/datasource.cpp
@@ -53,7 +53,7 @@ class file_source : public datasource {
     }
   }
 
-  virtual ~file_source() = default;
+  ~file_source() override = default;
 
   [[nodiscard]] bool supports_device_read() const override
   {
diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp
index 39031526fc8..a9d4f19c848 100644
--- a/cpp/src/io/utilities/file_io_utilities.cpp
+++ b/cpp/src/io/utilities/file_io_utilities.cpp
@@ -23,9 +23,9 @@
 #include <rmm/device_buffer.hpp>
 
 #include <dlfcn.h>
-#include <errno.h>
-#include <string.h>
 
+#include <cerrno>
+#include <cstring>
 #include <filesystem>
 #include <fstream>
 #include <numeric>
@@ -39,7 +39,7 @@ void force_init_cuda_context()
   // Workaround for https://github.com/rapidsai/cudf/issues/14140, where cuFileDriverOpen errors
   // out if no CUDA calls have been made before it. This is a no-op if the CUDA context is already
   // initialized.
-  cudaFree(0);
+  cudaFree(nullptr);
 }
 
 [[noreturn]] void throw_on_file_open_failure(std::string const& filepath, bool is_create)
@@ -98,7 +98,7 @@ class cufile_shim {
   decltype(cuFileDriverClose)* driver_close = nullptr;
 
   std::unique_ptr<cudf::logic_error> init_error;
-  auto is_valid() const noexcept { return init_error == nullptr; }
+  [[nodiscard]] auto is_valid() const noexcept { return init_error == nullptr; }
 
  public:
   cufile_shim(cufile_shim const&)            = delete;
diff --git a/cpp/src/io/utilities/hostdevice_span.hpp b/cpp/src/io/utilities/hostdevice_span.hpp
index c9a58ab31cf..d9eac423901 100644
--- a/cpp/src/io/utilities/hostdevice_span.hpp
+++ b/cpp/src/io/utilities/hostdevice_span.hpp
@@ -170,7 +170,7 @@ class hostdevice_span {
    * @param count The number of elements in the subspan
    * @return A subspan of the sequence, of requested count and offset
    */
-  constexpr hostdevice_span<T> subspan(size_t offset, size_t count) const noexcept
+  [[nodiscard]] constexpr hostdevice_span<T> subspan(size_t offset, size_t count) const noexcept
   {
     return hostdevice_span<T>(_host_data + offset, _device_data + offset, count);
   }
diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp
index 1ae27a2f4ae..9acd6a1e3a9 100644
--- a/cpp/src/io/utilities/hostdevice_vector.hpp
+++ b/cpp/src/io/utilities/hostdevice_vector.hpp
@@ -224,7 +224,7 @@ class hostdevice_2dvector {
 
   T const* base_device_ptr(size_t offset = 0) const { return _data.device_ptr(offset); }
 
-  size_t size_bytes() const noexcept { return _data.size_bytes(); }
+  [[nodiscard]] size_t size_bytes() const noexcept { return _data.size_bytes(); }
 
   void host_to_device_async(rmm::cuda_stream_view stream) { _data.host_to_device_async(stream); }
   void host_to_device_sync(rmm::cuda_stream_view stream) { _data.host_to_device_sync(stream); }
diff --git a/cpp/src/io/utilities/output_builder.cuh b/cpp/src/io/utilities/output_builder.cuh
index a7517983cd3..3bc5ccf41ef 100644
--- a/cpp/src/io/utilities/output_builder.cuh
+++ b/cpp/src/io/utilities/output_builder.cuh
@@ -208,7 +208,7 @@ class output_builder {
                  size_type max_growth,
                  rmm::cuda_stream_view stream,
                  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
-    : _size{0}, _max_write_size{max_write_size}, _max_growth{max_growth}
+    : _max_write_size{max_write_size}, _max_growth{max_growth}
   {
     CUDF_EXPECTS(max_write_size > 0, "Internal error");
     _chunks.emplace_back(0, stream, mr);
@@ -349,7 +349,7 @@ class output_builder {
     return device_span<T>{vector.data() + vector.size(), vector.capacity() - vector.size()};
   }
 
-  size_type _size;
+  size_type _size{0};
   size_type _max_write_size;
   size_type _max_growth;
   std::vector<rmm::device_uvector<T>> _chunks;
diff --git a/cpp/src/io/utilities/string_parsing.hpp b/cpp/src/io/utilities/string_parsing.hpp
index 612889af74b..3e6f57f2896 100644
--- a/cpp/src/io/utilities/string_parsing.hpp
+++ b/cpp/src/io/utilities/string_parsing.hpp
@@ -46,7 +46,7 @@ namespace detail {
 cudf::data_type infer_data_type(
   cudf::io::json_inference_options_view const& options,
   device_span<char const> data,
-  thrust::zip_iterator<thrust::tuple<const size_type*, const size_type*>> offset_length_begin,
+  thrust::zip_iterator<thrust::tuple<size_type const*, size_type const*>> offset_length_begin,
   std::size_t const size,
   rmm::cuda_stream_view stream);
 }  // namespace detail
@@ -67,8 +67,8 @@ namespace json::detail {
  * @return The column that contains the parsed data
  */
 std::unique_ptr<column> parse_data(
-  const char* data,
-  thrust::zip_iterator<thrust::tuple<const size_type*, const size_type*>> offset_length_begin,
+  char const* data,
+  thrust::zip_iterator<thrust::tuple<size_type const*, size_type const*>> offset_length_begin,
   size_type col_size,
   data_type col_type,
   rmm::device_buffer&& null_mask,
diff --git a/cpp/src/io/utilities/type_inference.cu b/cpp/src/io/utilities/type_inference.cu
index dff40cc09ed..43dc38c4ac6 100644
--- a/cpp/src/io/utilities/type_inference.cu
+++ b/cpp/src/io/utilities/type_inference.cu
@@ -255,7 +255,7 @@ cudf::io::column_type_histogram infer_column_type(OptionsView const& options,
 cudf::data_type infer_data_type(
   cudf::io::json_inference_options_view const& options,
   device_span<char const> data,
-  thrust::zip_iterator<thrust::tuple<const size_type*, const size_type*>> offset_length_begin,
+  thrust::zip_iterator<thrust::tuple<size_type const*, size_type const*>> offset_length_begin,
   std::size_t const size,
   rmm::cuda_stream_view stream)
 {
diff --git a/cpp/src/jit/cache.cpp b/cpp/src/jit/cache.cpp
index bc8e3e8e392..89c47d246d0 100644
--- a/cpp/src/jit/cache.cpp
+++ b/cpp/src/jit/cache.cpp
@@ -33,7 +33,7 @@ std::filesystem::path get_user_home_cache_dir()
   if (home_dir != nullptr) {
     return std::filesystem::path(home_dir) / ".cudf";
   } else {
-    return std::filesystem::path();
+    return {};
   }
 }
 
@@ -90,7 +90,7 @@ std::filesystem::path get_cache_dir()
       std::filesystem::create_directories(kernel_cache_path);
     } catch (std::exception const& e) {
       // if directory creation fails for any reason, return empty path
-      return std::filesystem::path();
+      return {};
     }
   }
   return kernel_cache_path;
diff --git a/cpp/src/jit/parser.cpp b/cpp/src/jit/parser.cpp
index e59c1089318..398c36821cc 100644
--- a/cpp/src/jit/parser.cpp
+++ b/cpp/src/jit/parser.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,6 +23,7 @@
 #include <map>
 #include <set>
 #include <string>
+#include <utility>
 #include <vector>
 
 namespace cudf {
@@ -207,7 +208,7 @@ std::string ptx_parser::parse_instruction(std::string const& src)
       } else if (is_pragma_instruction) {
         // quote any string
         std::string transformed_piece;
-        for (const auto& c : piece) {
+        for (auto const& c : piece) {
           if (c == '"') {
             transformed_piece += "\\\"";
           } else {
@@ -378,13 +379,13 @@ std::string ptx_parser::parse()
   return final_output + " asm volatile (\"RETTGT:}\");}";
 }
 
-ptx_parser::ptx_parser(std::string const& ptx_,
-                       std::string const& function_name_,
-                       std::string const& output_arg_type_,
+ptx_parser::ptx_parser(std::string ptx_,
+                       std::string function_name_,
+                       std::string output_arg_type_,
                        std::set<int> const& pointer_arg_list_)
-  : ptx(ptx_),
-    function_name(function_name_),
-    output_arg_type(output_arg_type_),
+  : ptx(std::move(ptx_)),
+    function_name(std::move(function_name_)),
+    output_arg_type(std::move(output_arg_type_)),
     pointer_arg_list(pointer_arg_list_)
 {
 }
diff --git a/cpp/src/jit/parser.hpp b/cpp/src/jit/parser.hpp
index 86f869c5e97..55528bed6cf 100644
--- a/cpp/src/jit/parser.hpp
+++ b/cpp/src/jit/parser.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -195,9 +195,9 @@ class ptx_parser {
    * function.
    * @param pointer_arg_list_ A list of the parameters that are pointers.
    */
-  ptx_parser(std::string const& ptx_,
-             std::string const& function_name_,
-             std::string const& output_arg_type_,
+  ptx_parser(std::string ptx_,
+             std::string function_name_,
+             std::string output_arg_type_,
              std::set<int> const& pointer_arg_list_);
 
   // parse the source!!!
diff --git a/cpp/src/reductions/reductions.cpp b/cpp/src/reductions/reductions.cpp
index 8fa036a0949..d4ea84742c7 100644
--- a/cpp/src/reductions/reductions.cpp
+++ b/cpp/src/reductions/reductions.cpp
@@ -34,6 +34,8 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
 
+#include <utility>
+
 namespace cudf {
 namespace reduction {
 namespace detail {
@@ -44,12 +46,12 @@ struct reduce_dispatch_functor {
   rmm::device_async_resource_ref mr;
   rmm::cuda_stream_view stream;
 
-  reduce_dispatch_functor(column_view const& col,
+  reduce_dispatch_functor(column_view col,
                           data_type output_dtype,
                           std::optional<std::reference_wrapper<scalar const>> init,
                           rmm::cuda_stream_view stream,
                           rmm::device_async_resource_ref mr)
-    : col(col), output_dtype(output_dtype), init(init), mr(mr), stream(stream)
+    : col(std::move(col)), output_dtype(output_dtype), init(init), mr(mr), stream(stream)
   {
   }
 
diff --git a/cpp/src/rolling/detail/optimized_unbounded_window.cpp b/cpp/src/rolling/detail/optimized_unbounded_window.cpp
index 3e085fa963c..4175c6e34c1 100644
--- a/cpp/src/rolling/detail/optimized_unbounded_window.cpp
+++ b/cpp/src/rolling/detail/optimized_unbounded_window.cpp
@@ -102,7 +102,7 @@ std::unique_ptr<column> aggregation_based_rolling_window(table_view const& group
                "Ungrouped rolling window not supported in aggregation path.");
 
   auto agg_requests = std::vector<cudf::groupby::aggregation_request>{};
-  agg_requests.push_back(cudf::groupby::aggregation_request());
+  agg_requests.emplace_back();
   agg_requests.front().values = input;
   agg_requests.front().aggregations.push_back(convert_to<cudf::groupby_aggregation>(aggr));
 
diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp
index 170ed59d2fe..adf650a4f27 100644
--- a/cpp/src/strings/regex/regcomp.cpp
+++ b/cpp/src/strings/regex/regcomp.cpp
@@ -80,7 +80,7 @@ std::array<char, 33> const escapable_chars{
  */
 std::vector<char32_t> string_to_char32_vector(std::string_view pattern)
 {
-  size_type size  = static_cast<size_type>(pattern.size());
+  auto size       = static_cast<size_type>(pattern.size());
   size_type count = std::count_if(pattern.cbegin(), pattern.cend(), [](char ch) {
     return is_begin_utf8_char(static_cast<uint8_t>(ch));
   });
@@ -165,8 +165,8 @@ class regex_parser {
         int16_t m;
       } count;
     } d;
-    Item(int32_t type, char32_t chr) : type{type}, d{chr} {}
-    Item(int32_t type, int32_t id) : type{type}, d{.cclass_id{id}} {}
+    Item(int32_t type, char32_t chr) : type{type}, d{.chr = chr} {}
+    Item(int32_t type, int32_t id) : type{type}, d{.cclass_id = id} {}
     Item(int32_t type, int16_t n, int16_t m) : type{type}, d{.count{n, m}} {}
   };
 
@@ -692,7 +692,7 @@ class regex_parser {
     return CHAR;
   }
 
-  std::vector<regex_parser::Item> expand_counted_items() const
+  [[nodiscard]] std::vector<regex_parser::Item> expand_counted_items() const
   {
     std::vector<regex_parser::Item> const& in = _items;
     std::vector<regex_parser::Item> out;
@@ -738,20 +738,20 @@ class regex_parser {
         // optional maximum repeats (m)
         if (m >= 0) {
           for (int j = n; j < m; j++) {
-            out.push_back(regex_parser::Item{LBRA_NC, 0});
+            out.emplace_back(LBRA_NC, 0);
             out.insert(out.end(), begin, end);
           }
           for (int j = n; j < m; j++) {
-            out.push_back(regex_parser::Item{RBRA, 0});
-            out.push_back(regex_parser::Item{item.type == COUNTED ? QUEST : QUEST_LAZY, 0});
+            out.emplace_back(RBRA, 0);
+            out.emplace_back(item.type == COUNTED ? QUEST : QUEST_LAZY, 0);
           }
         } else {
           // infinite repeats
           if (n > 0) {  // append '+' after last repetition
-            out.push_back(regex_parser::Item{item.type == COUNTED ? PLUS : PLUS_LAZY, 0});
+            out.emplace_back(item.type == COUNTED ? PLUS : PLUS_LAZY, 0);
           } else {  // copy it once then append '*'
             out.insert(out.end(), begin, end);
-            out.push_back(regex_parser::Item{item.type == COUNTED ? STAR : STAR_LAZY, 0});
+            out.emplace_back(item.type == COUNTED ? STAR : STAR_LAZY, 0);
           }
         }
       }
@@ -780,7 +780,7 @@ class regex_parser {
     }
   }
 
-  std::vector<regex_parser::Item> get_items() const
+  [[nodiscard]] std::vector<regex_parser::Item> get_items() const
   {
     return _has_counted ? expand_counted_items() : _items;
   }
@@ -803,8 +803,8 @@ class regex_compiler {
   reprog& _prog;
   std::stack<and_node> _and_stack;
   std::stack<re_operator> _operator_stack;
-  bool _last_was_and;
-  int _bracket_count;
+  bool _last_was_and{false};
+  int _bracket_count{0};
   regex_flags _flags;
 
   inline void push_and(int first, int last) { _and_stack.push({first, last}); }
@@ -971,7 +971,7 @@ class regex_compiler {
                  regex_flags const flags,
                  capture_groups const capture,
                  reprog& prog)
-    : _prog(prog), _last_was_and(false), _bracket_count(0), _flags(flags)
+    : _prog(prog), _flags(flags)
   {
     // Parse pattern into items
     auto const items = regex_parser(pattern, _flags, capture, _prog).get_items();
diff --git a/cpp/src/strings/regex/regex.cuh b/cpp/src/strings/regex/regex.cuh
index c8d846624f8..e6134296e45 100644
--- a/cpp/src/strings/regex/regex.cuh
+++ b/cpp/src/strings/regex/regex.cuh
@@ -186,10 +186,10 @@ class reprog_device {
    *            Specify -1 to match any virtual positions past the end of the string.
    * @return If match found, returns character positions of the matches.
    */
-  __device__ inline match_result find(int32_t const thread_idx,
-                                      string_view const d_str,
-                                      string_view::const_iterator begin,
-                                      cudf::size_type end = -1) const;
+  __device__ [[nodiscard]] inline match_result find(int32_t const thread_idx,
+                                                    string_view const d_str,
+                                                    string_view::const_iterator begin,
+                                                    cudf::size_type end = -1) const;
 
   /**
    * @brief Does an extract evaluation using the compiled expression on the given string.
@@ -205,11 +205,11 @@ class reprog_device {
    * @param group_id The specific group to return its matching position values.
    * @return If valid, returns the character position of the matched group in the given string,
    */
-  __device__ inline match_result extract(int32_t const thread_idx,
-                                         string_view const d_str,
-                                         string_view::const_iterator begin,
-                                         cudf::size_type end,
-                                         cudf::size_type const group_id) const;
+  __device__ [[nodiscard]] inline match_result extract(int32_t const thread_idx,
+                                                       string_view const d_str,
+                                                       string_view::const_iterator begin,
+                                                       cudf::size_type end,
+                                                       cudf::size_type const group_id) const;
 
  private:
   struct reljunk {
@@ -225,30 +225,31 @@ class reprog_device {
   /**
    * @brief Returns the regex instruction object for a given id.
    */
-  __device__ inline reinst get_inst(int32_t id) const;
+  __device__ [[nodiscard]] inline reinst get_inst(int32_t id) const;
 
   /**
    * @brief Returns the regex class object for a given id.
    */
-  __device__ inline reclass_device get_class(int32_t id) const;
+  __device__ [[nodiscard]] inline reclass_device get_class(int32_t id) const;
 
   /**
    * @brief Executes the regex pattern on the given string.
    */
-  __device__ inline match_result regexec(string_view const d_str,
-                                         reljunk jnk,
-                                         string_view::const_iterator begin,
-                                         cudf::size_type end,
-                                         cudf::size_type const group_id = 0) const;
+  __device__ [[nodiscard]] inline match_result regexec(string_view const d_str,
+                                                       reljunk jnk,
+                                                       string_view::const_iterator begin,
+                                                       cudf::size_type end,
+                                                       cudf::size_type const group_id = 0) const;
 
   /**
    * @brief Utility wrapper to setup state memory structures for calling regexec
    */
-  __device__ inline match_result call_regexec(int32_t const thread_idx,
-                                              string_view const d_str,
-                                              string_view::const_iterator begin,
-                                              cudf::size_type end,
-                                              cudf::size_type const group_id = 0) const;
+  __device__ [[nodiscard]] inline match_result call_regexec(
+    int32_t const thread_idx,
+    string_view const d_str,
+    string_view::const_iterator begin,
+    cudf::size_type end,
+    cudf::size_type const group_id = 0) const;
 
   reprog_device(reprog const&);
 
@@ -300,7 +301,7 @@ __device__ __forceinline__ string_view string_from_match(match_pair const result
                                                          string_view::const_iterator last)
 {
   auto const [begin, end] = match_positions_to_bytes(result, d_str, last);
-  return string_view(d_str.data() + begin, end - begin);
+  return {d_str.data() + begin, end - begin};
 }
 
 }  // namespace detail
diff --git a/cpp/src/strings/regex/regex.inl b/cpp/src/strings/regex/regex.inl
index 10e06505094..23e1944cda4 100644
--- a/cpp/src/strings/regex/regex.inl
+++ b/cpp/src/strings/regex/regex.inl
@@ -81,12 +81,11 @@ struct alignas(8) relist {
     return true;
   }
 
-  __device__ __forceinline__ restate get_state(int16_t idx) const
+  __device__ [[nodiscard]] __forceinline__ restate get_state(int16_t idx) const
   {
     return restate{ranges[idx * stride], inst_ids[idx * stride]};
   }
-
-  __device__ __forceinline__ int16_t get_size() const { return size; }
+  __device__ [[nodiscard]] __forceinline__ int16_t get_size() const { return size; }
 
  private:
   int16_t size{};
@@ -102,7 +101,7 @@ struct alignas(8) relist {
     mask[pos >> 3] |= uc;
   }
 
-  __device__ __forceinline__ bool readMask(int32_t pos) const
+  __device__ [[nodiscard]] __forceinline__ bool readMask(int32_t pos) const
   {
     u_char const uc = mask[pos >> 3];
     return static_cast<bool>((uc >> (pos & 7)) & 1);
diff --git a/cpp/src/strings/regex/regexec.cpp b/cpp/src/strings/regex/regexec.cpp
index b5e7e7e8922..d1990733e81 100644
--- a/cpp/src/strings/regex/regexec.cpp
+++ b/cpp/src/strings/regex/regexec.cpp
@@ -55,12 +55,12 @@ std::unique_ptr<reprog_device, std::function<void(reprog_device*)>> reprog_devic
   // compute size of each section
   auto insts_size    = insts_count * sizeof(_insts[0]);
   auto startids_size = starts_count * sizeof(_startinst_ids[0]);
-  auto classes_size  = std::transform_reduce(
-    h_prog.classes_data(),
-    h_prog.classes_data() + h_prog.classes_count(),
-    classes_count * sizeof(_classes[0]),
-    std::plus<std::size_t>{},
-    [&h_prog](auto& cls) { return cls.literals.size() * sizeof(reclass_range); });
+  auto classes_size =
+    std::transform_reduce(h_prog.classes_data(),
+                          h_prog.classes_data() + h_prog.classes_count(),
+                          classes_count * sizeof(_classes[0]),
+                          std::plus<std::size_t>{},
+                          [](auto& cls) { return cls.literals.size() * sizeof(reclass_range); });
   // make sure each section is aligned for the subsequent section's data type
   auto const memsize = cudf::util::round_up_safe(insts_size, sizeof(_startinst_ids[0])) +
                        cudf::util::round_up_safe(startids_size, sizeof(_classes[0])) +
@@ -73,7 +73,7 @@ std::unique_ptr<reprog_device, std::function<void(reprog_device*)>> reprog_devic
   auto d_ptr    = reinterpret_cast<u_char*>(d_buffer->data());  // running device pointer
 
   // create our device object; this is managed separately and returned to the caller
-  reprog_device* d_prog = new reprog_device(h_prog);
+  auto* d_prog = new reprog_device(h_prog);
 
   // copy the instructions array first (fixed-sized structs)
   memcpy(h_ptr, h_prog.insts_data(), insts_size);
diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp
index 072eb73453b..98ec44758b9 100644
--- a/cpp/src/transform/transform.cpp
+++ b/cpp/src/transform/transform.cpp
@@ -59,7 +59,7 @@ void unary_operation(mutable_column_view output,
   cudf::jit::get_program_cache(*transform_jit_kernel_cu_jit)
     .get_kernel(
       kernel_name, {}, {{"transform/jit/operation-udf.hpp", cuda_source}}, {"-arch=sm_."})  //
-    ->configure_1d_max_occupancy(0, 0, 0, stream.value())                                   //
+    ->configure_1d_max_occupancy(0, 0, nullptr, stream.value())                             //
     ->launch(output.size(),                                                                 //
              cudf::jit::get_data_ptr(output),
              cudf::jit::get_data_ptr(input));
diff --git a/cpp/src/utilities/stream_pool.cpp b/cpp/src/utilities/stream_pool.cpp
index 121873ad44b..9d3a7ce5a4e 100644
--- a/cpp/src/utilities/stream_pool.cpp
+++ b/cpp/src/utilities/stream_pool.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -101,7 +101,7 @@ class debug_cuda_stream_pool : public cuda_stream_pool {
     return std::vector<rmm::cuda_stream_view>(count, cudf::get_default_stream());
   }
 
-  std::size_t get_stream_pool_size() const override { return 1UL; }
+  [[nodiscard]] std::size_t get_stream_pool_size() const override { return 1UL; }
 };
 
 cuda_stream_pool* create_global_cuda_stream_pool()
diff --git a/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp b/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp
index 8b1e987c8bf..6d097b2ff12 100644
--- a/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp
+++ b/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -380,7 +380,7 @@ TYPED_TEST(FixedPointCompiledTest, FixedPointBinaryOpEqualSimpleScale0Null)
 
   auto const col1     = fp_wrapper<RepType>{{1, 2, 3, 4}, {1, 1, 1, 1}, scale_type{0}};
   auto const col2     = fp_wrapper<RepType>{{1, 2, 3, 4}, {0, 0, 0, 0}, scale_type{0}};
-  auto const expected = wrapper<bool>{{0, 1, 0, 1}, {0, 0, 0, 0}};
+  auto const expected = wrapper<bool>{{0, 1, 0, 1}, {false, false, false, false}};
 
   auto const result = cudf::binary_operation(
     col1, col2, cudf::binary_operator::EQUAL, cudf::data_type{cudf::type_id::BOOL8});
@@ -396,7 +396,7 @@ TYPED_TEST(FixedPointCompiledTest, FixedPointBinaryOpEqualSimpleScale2Null)
 
   auto const col1     = fp_wrapper<RepType>{{1, 2, 3, 4}, {1, 1, 1, 1}, scale_type{-2}};
   auto const col2     = fp_wrapper<RepType>{{1, 2, 3, 4}, {0, 0, 0, 0}, scale_type{0}};
-  auto const expected = wrapper<bool>{{0, 1, 0, 1}, {0, 0, 0, 0}};
+  auto const expected = wrapper<bool>{{0, 1, 0, 1}, {false, false, false, false}};
 
   auto const result = cudf::binary_operation(
     col1, col2, cudf::binary_operator::EQUAL, cudf::data_type{cudf::type_id::BOOL8});
@@ -495,7 +495,7 @@ TYPED_TEST(FixedPointCompiledTest, FixedPointBinaryOpNullEqualsSimple)
 
   auto const col1     = fp_wrapper<RepType>{{400, 300, 300, 100}, {1, 1, 1, 0}, scale_type{-2}};
   auto const col2     = fp_wrapper<RepType>{{40, 200, 20, 400}, {1, 0, 1, 0}, scale_type{-1}};
-  auto const expected = wrapper<bool>{{1, 0, 0, 1}, {1, 1, 1, 1}};
+  auto const expected = wrapper<bool>{{1, 0, 0, 1}, {true, true, true, true}};
 
   auto const result = cudf::binary_operation(
     col1, col2, cudf::binary_operator::NULL_EQUALS, cudf::data_type{cudf::type_id::BOOL8});
diff --git a/cpp/tests/bitmask/is_element_valid_tests.cpp b/cpp/tests/bitmask/is_element_valid_tests.cpp
index 224b9893c4a..077d761cc1d 100644
--- a/cpp/tests/bitmask/is_element_valid_tests.cpp
+++ b/cpp/tests/bitmask/is_element_valid_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -27,7 +27,8 @@ struct IsElementValidTest : public cudf::test::BaseFixture {};
 
 TEST_F(IsElementValidTest, IsElementValidBasic)
 {
-  cudf::test::fixed_width_column_wrapper<int32_t> col({1, 1, 1, 1, 1}, {1, 0, 0, 0, 1});
+  cudf::test::fixed_width_column_wrapper<int32_t> col({1, 1, 1, 1, 1},
+                                                      {true, false, false, false, true});
   EXPECT_TRUE(cudf::detail::is_element_valid_sync(col, 0, cudf::get_default_stream()));
   EXPECT_FALSE(cudf::detail::is_element_valid_sync(col, 1, cudf::get_default_stream()));
   EXPECT_FALSE(cudf::detail::is_element_valid_sync(col, 2, cudf::get_default_stream()));
@@ -51,7 +52,8 @@ TEST_F(IsElementValidTest, IsElementValidLarge)
 
 TEST_F(IsElementValidTest, IsElementValidOffset)
 {
-  cudf::test::fixed_width_column_wrapper<int32_t> col({1, 1, 1, 1, 1}, {1, 0, 0, 0, 1});
+  cudf::test::fixed_width_column_wrapper<int32_t> col({1, 1, 1, 1, 1},
+                                                      {true, false, false, false, true});
   {
     auto offset_col = cudf::slice(col, {1, 5}).front();
     EXPECT_FALSE(cudf::detail::is_element_valid_sync(offset_col, 0, cudf::get_default_stream()));
diff --git a/cpp/tests/column/column_view_shallow_test.cpp b/cpp/tests/column/column_view_shallow_test.cpp
index 87187dfe57b..37ab4b8f387 100644
--- a/cpp/tests/column/column_view_shallow_test.cpp
+++ b/cpp/tests/column/column_view_shallow_test.cpp
@@ -43,7 +43,8 @@ template <typename T, std::enable_if_t<cudf::is_dictionary<T>()>* = nullptr>
 std::unique_ptr<cudf::column> example_column()
 {
   return cudf::test::dictionary_column_wrapper<std::string>(
-           {"fff", "aaa", "ddd", "bbb", "ccc", "ccc", "ccc", "", ""}, {1, 1, 1, 1, 1, 1, 1, 1, 0})
+           {"fff", "aaa", "ddd", "bbb", "ccc", "ccc", "ccc", "", ""},
+           {true, true, true, true, true, true, true, true, false})
     .release();
 }
 
diff --git a/cpp/tests/copying/concatenate_tests.cpp b/cpp/tests/copying/concatenate_tests.cpp
index 3b7bff69938..078e0ef9bae 100644
--- a/cpp/tests/copying/concatenate_tests.cpp
+++ b/cpp/tests/copying/concatenate_tests.cpp
@@ -74,8 +74,8 @@ struct TypedColumnTest : public cudf::test::BaseFixture {
     stream.synchronize();
   }
 
-  cudf::size_type num_elements() const { return _num_elements; }
-  cudf::size_type null_count() const { return _null_count; }
+  [[nodiscard]] cudf::size_type num_elements() const { return _num_elements; }
+  [[nodiscard]] cudf::size_type null_count() const { return _null_count; }
 
   std::random_device r;
   std::default_random_engine generator{r()};
@@ -297,16 +297,17 @@ TEST_F(TableTest, ConcatenateTablesWithOffsets)
 
 TEST_F(TableTest, ConcatenateTablesWithOffsetsAndNulls)
 {
-  cudf::test::fixed_width_column_wrapper<int32_t> col1_1{{5, 4, 3, 5, 8, 5, 6},
-                                                         {0, 1, 1, 1, 1, 1, 1}};
+  cudf::test::fixed_width_column_wrapper<int32_t> col1_1{
+    {5, 4, 3, 5, 8, 5, 6}, {false, true, true, true, true, true, true}};
   cudf::test::strings_column_wrapper col2_1({"dada", "egg", "avocado", "dada", "kite", "dog", "ln"},
-                                            {1, 1, 1, 0, 1, 1, 1});
+                                            {true, true, true, false, true, true, true});
   cudf::table_view table_view_in1{{col1_1, col2_1}};
 
-  cudf::test::fixed_width_column_wrapper<int32_t> col1_2{{5, 8, 5, 6, 15, 14, 13},
-                                                         {1, 1, 1, 1, 1, 1, 0}};
+  cudf::test::fixed_width_column_wrapper<int32_t> col1_2{
+    {5, 8, 5, 6, 15, 14, 13}, {true, true, true, true, true, true, false}};
   cudf::test::strings_column_wrapper col2_2(
-    {"dada", "kite", "dog", "ln", "dado", "greg", "spinach"}, {1, 0, 1, 1, 1, 1, 1});
+    {"dada", "kite", "dog", "ln", "dado", "greg", "spinach"},
+    {true, false, true, true, true, true, true});
   cudf::table_view table_view_in2{{col1_2, col2_2}};
 
   std::vector<cudf::size_type> split_indexes1{3};
@@ -321,10 +322,11 @@ TEST_F(TableTest, ConcatenateTablesWithOffsetsAndNulls)
     table_views_to_concat.push_back(partitioned2[1]);
     std::unique_ptr<cudf::table> concatenated_tables = cudf::concatenate(table_views_to_concat);
 
-    cudf::test::fixed_width_column_wrapper<int32_t> exp1_1{{5, 8, 5, 6, 6, 15, 14, 13},
-                                                           {1, 1, 1, 1, 1, 1, 1, 0}};
+    cudf::test::fixed_width_column_wrapper<int32_t> exp1_1{
+      {5, 8, 5, 6, 6, 15, 14, 13}, {true, true, true, true, true, true, true, false}};
     cudf::test::strings_column_wrapper exp2_1(
-      {"dada", "kite", "dog", "ln", "ln", "dado", "greg", "spinach"}, {0, 1, 1, 1, 1, 1, 1, 1});
+      {"dada", "kite", "dog", "ln", "ln", "dado", "greg", "spinach"},
+      {false, true, true, true, true, true, true, true});
     cudf::table_view table_view_exp1{{exp1_1, exp2_1}};
     CUDF_TEST_EXPECT_TABLES_EQUAL(concatenated_tables->view(), table_view_exp1);
   }
@@ -336,7 +338,7 @@ TEST_F(TableTest, ConcatenateTablesWithOffsetsAndNulls)
 
     cudf::test::fixed_width_column_wrapper<int32_t> exp1_1{5, 8, 5, 6, 5, 8, 5};
     cudf::test::strings_column_wrapper exp2_1({"dada", "kite", "dog", "ln", "dada", "kite", "dog"},
-                                              {0, 1, 1, 1, 1, 0, 1});
+                                              {false, true, true, true, true, false, true});
     cudf::table_view table_view_exp1{{exp1_1, exp2_1}};
     CUDF_TEST_EXPECT_TABLES_EQUAL(concatenated_tables->view(), table_view_exp1);
   }
@@ -507,7 +509,7 @@ TEST_F(OverflowTest, Presliced)
 
     // try and concatenate 4 string columns of with ~1/2 billion chars in each
     auto offset_gen = cudf::detail::make_counting_transform_iterator(
-      0, [string_size](cudf::size_type index) { return index * string_size; });
+      0, [](cudf::size_type index) { return index * string_size; });
     cudf::test::fixed_width_column_wrapper<int> offsets(offset_gen, offset_gen + num_rows + 1);
     auto many_chars = rmm::device_uvector<char>(total_chars_size, cudf::get_default_stream());
     auto col        = cudf::make_strings_column(
@@ -775,7 +777,7 @@ TEST_F(StructsColumnTest, ConcatenateStructs)
   // 1. String "names" column.
   std::vector<std::vector<std::string>> names(
     {{"Vimes", "Carrot"}, {"Angua", "Cheery"}, {}, {"Detritus", "Slant"}});
-  std::vector<std::vector<bool>> names_validity({{1, 1}, {1, 1}, {}, {1, 1}});
+  std::vector<std::vector<bool>> names_validity({{true, true}, {true, true}, {}, {true, true}});
   std::vector<cudf::test::strings_column_wrapper> name_cols;
   std::transform(count_iter, count_iter + names.size(), std::back_inserter(name_cols), [&](int i) {
     return cudf::test::strings_column_wrapper(
@@ -784,7 +786,7 @@ TEST_F(StructsColumnTest, ConcatenateStructs)
 
   // 2. Numeric "ages" column.
   std::vector<std::vector<int>> ages({{5, 10}, {15, 20}, {}, {25, 30}});
-  std::vector<std::vector<bool>> ages_validity({{1, 1}, {1, 1}, {}, {0, 1}});
+  std::vector<std::vector<bool>> ages_validity({{true, true}, {true, true}, {}, {false, true}});
   std::vector<cudf::test::fixed_width_column_wrapper<int>> age_cols;
   std::transform(count_iter, count_iter + ages.size(), std::back_inserter(age_cols), [&](int i) {
     return cudf::test::fixed_width_column_wrapper<int>(
@@ -793,7 +795,7 @@ TEST_F(StructsColumnTest, ConcatenateStructs)
 
   // 3. Boolean "is_human" column.
   std::vector<std::vector<bool>> is_human({{true, true}, {false, false}, {}, {false, false}});
-  std::vector<std::vector<bool>> is_human_validity({{1, 1}, {1, 0}, {}, {1, 1}});
+  std::vector<std::vector<bool>> is_human_validity({{true, true}, {true, false}, {}, {true, true}});
   std::vector<cudf::test::fixed_width_column_wrapper<bool>> is_human_cols;
   std::transform(
     count_iter, count_iter + is_human.size(), std::back_inserter(is_human_cols), [&](int i) {
@@ -811,7 +813,7 @@ TEST_F(StructsColumnTest, ConcatenateStructs)
   expected_children.push_back(cudf::concatenate(name_col_vec));
   expected_children.push_back(cudf::concatenate(age_col_vec));
   expected_children.push_back(cudf::concatenate(is_human_col_vec));
-  std::vector<bool> struct_validity({1, 0, 1, 1, 1, 0});
+  std::vector<bool> struct_validity({true, false, true, true, true, false});
   auto [null_mask, null_count] =
     cudf::test::detail::make_null_mask(struct_validity.begin(), struct_validity.end());
   auto expected =
@@ -819,14 +821,14 @@ TEST_F(StructsColumnTest, ConcatenateStructs)
 
   // concatenate as structs
   std::vector<cudf::test::structs_column_wrapper> src;
-  src.push_back(
-    cudf::test::structs_column_wrapper({name_cols[0], age_cols[0], is_human_cols[0]}, {1, 0}));
-  src.push_back(
-    cudf::test::structs_column_wrapper({name_cols[1], age_cols[1], is_human_cols[1]}, {1, 1}));
+  src.push_back(cudf::test::structs_column_wrapper({name_cols[0], age_cols[0], is_human_cols[0]},
+                                                   {true, false}));
+  src.push_back(cudf::test::structs_column_wrapper({name_cols[1], age_cols[1], is_human_cols[1]},
+                                                   {true, true}));
   src.push_back(
     cudf::test::structs_column_wrapper({name_cols[2], age_cols[2], is_human_cols[2]}, {}));
-  src.push_back(
-    cudf::test::structs_column_wrapper({name_cols[3], age_cols[3], is_human_cols[3]}, {1, 0}));
+  src.push_back(cudf::test::structs_column_wrapper({name_cols[3], age_cols[3], is_human_cols[3]},
+                                                   {true, false}));
 
   // concatenate
   auto result = cudf::concatenate(std::vector<column_view>({src[0], src[1], src[2], src[3]}));
@@ -857,7 +859,8 @@ TEST_F(StructsColumnTest, ConcatenateSplitStructs)
   std::vector<std::vector<std::string>> names(
     {{"Vimes", "Carrot", "Angua", "Cheery", "Detritus", "Slant"},
      {"Bill", "Bob", "Sam", "Fred", "Tom"}});
-  std::vector<std::vector<bool>> names_validity({{1, 1, 1, 1, 1, 1}, {0, 1, 0, 1, 0}});
+  std::vector<std::vector<bool>> names_validity(
+    {{true, true, true, true, true, true}, {false, true, false, true, false}});
   std::vector<cudf::test::strings_column_wrapper> name_cols;
   std::transform(count_iter, count_iter + names.size(), std::back_inserter(name_cols), [&](int i) {
     return cudf::test::strings_column_wrapper(
@@ -866,7 +869,8 @@ TEST_F(StructsColumnTest, ConcatenateSplitStructs)
 
   // 2. Numeric "ages" column.
   std::vector<std::vector<int>> ages({{5, 10, 15, 20, 25, 30}, {11, 16, 17, 41, 42}});
-  std::vector<std::vector<bool>> ages_validity({{1, 1, 1, 1, 0, 1}, {1, 1, 1, 0, 0}});
+  std::vector<std::vector<bool>> ages_validity(
+    {{true, true, true, true, false, true}, {true, true, true, false, false}});
   std::vector<cudf::test::fixed_width_column_wrapper<int>> age_cols;
   std::transform(count_iter, count_iter + ages.size(), std::back_inserter(age_cols), [&](int i) {
     return cudf::test::fixed_width_column_wrapper<int>(
@@ -876,7 +880,8 @@ TEST_F(StructsColumnTest, ConcatenateSplitStructs)
   // 3. Boolean "is_human" column.
   std::vector<std::vector<bool>> is_human(
     {{true, true, false, false, false, false}, {true, true, true, false, true}});
-  std::vector<std::vector<bool>> is_human_validity({{1, 1, 1, 0, 1, 1}, {0, 0, 0, 1, 1}});
+  std::vector<std::vector<bool>> is_human_validity(
+    {{true, true, true, false, true, true}, {false, false, false, true, true}});
   std::vector<cudf::test::fixed_width_column_wrapper<bool>> is_human_cols;
   std::transform(
     count_iter, count_iter + is_human.size(), std::back_inserter(is_human_cols), [&](int i) {
@@ -910,7 +915,7 @@ TEST_F(StructsColumnTest, ConcatenateSplitStructs)
     inputs.push_back(std::make_unique<column>(split_names_cols[idx]));
     inputs.push_back(std::make_unique<column>(split_ages_cols[idx]));
     inputs.push_back(std::make_unique<column>(split_is_human_cols[idx]));
-    src.push_back(cudf::test::structs_column_wrapper(std::move(inputs)));
+    src.emplace_back(std::move(inputs));
   }
 
   // concatenate
@@ -932,7 +937,8 @@ TEST_F(StructsColumnTest, ConcatenateStructsNested)
     std::vector<std::vector<std::string>> names(
       {{"Vimes", "Carrot", "Angua", "Cheery", "Detritus", "Slant"},
        {"Bill", "Bob", "Sam", "Fred", "Tom"}});
-    std::vector<std::vector<bool>> names_validity({{1, 1, 1, 1, 1, 1}, {0, 1, 0, 1, 0}});
+    std::vector<std::vector<bool>> names_validity(
+      {{true, true, true, true, true, true}, {false, true, false, true, false}});
     std::vector<cudf::test::strings_column_wrapper> name_cols;
     std::transform(
       count_iter, count_iter + names.size(), std::back_inserter(name_cols), [&](int i) {
@@ -942,7 +948,8 @@ TEST_F(StructsColumnTest, ConcatenateStructsNested)
 
     // 2. Numeric "ages" column.
     std::vector<std::vector<int>> ages({{5, 10, 15, 20, 25, 30}, {11, 16, 17, 41, 42}});
-    std::vector<std::vector<bool>> ages_validity({{1, 1, 1, 1, 0, 1}, {1, 1, 1, 0, 0}});
+    std::vector<std::vector<bool>> ages_validity(
+      {{true, true, true, true, false, true}, {true, true, true, false, false}});
     std::vector<cudf::test::fixed_width_column_wrapper<int>> age_cols;
     std::transform(count_iter, count_iter + ages.size(), std::back_inserter(age_cols), [&](int i) {
       return cudf::test::fixed_width_column_wrapper<int>(
@@ -953,7 +960,7 @@ TEST_F(StructsColumnTest, ConcatenateStructsNested)
       std::vector<std::unique_ptr<column>> children;
       children.push_back(name_cols[idx].release());
       children.push_back(age_cols[idx].release());
-      inner_structs.push_back(cudf::test::structs_column_wrapper(std::move(children)));
+      inner_structs.emplace_back(std::move(children));
     }
   }
 
@@ -983,7 +990,7 @@ TEST_F(StructsColumnTest, ConcatenateStructsNested)
     std::vector<std::unique_ptr<column>> inputs;
     inputs.push_back(std::make_unique<column>(inner_structs[idx]));
     inputs.push_back(std::make_unique<column>(inner_lists[idx]));
-    src.push_back(cudf::test::structs_column_wrapper(std::move(inputs)));
+    src.emplace_back(std::move(inputs));
   }
 
   // concatenate
@@ -1499,7 +1506,8 @@ TEST_F(ListsColumnTest, ListOfStructs)
        {},
        {},
        {"Bill", "Bob", "Sam", "Fred", "Tom"}});
-    std::vector<std::vector<bool>> names_validity({{1, 1, 1, 1, 1, 1}, {}, {}, {0, 1, 0, 1, 0}});
+    std::vector<std::vector<bool>> names_validity(
+      {{true, true, true, true, true, true}, {}, {}, {false, true, false, true, false}});
     std::vector<cudf::test::strings_column_wrapper> name_cols;
     std::transform(
       count_iter, count_iter + names.size(), std::back_inserter(name_cols), [&](int i) {
@@ -1509,7 +1517,8 @@ TEST_F(ListsColumnTest, ListOfStructs)
 
     // 2. Numeric "ages" column.
     std::vector<std::vector<int>> ages({{5, 10, 15, 20, 25, 30}, {}, {}, {11, 16, 17, 41, 42}});
-    std::vector<std::vector<bool>> ages_validity({{1, 1, 1, 1, 0, 1}, {}, {}, {1, 1, 1, 0, 0}});
+    std::vector<std::vector<bool>> ages_validity(
+      {{true, true, true, true, false, true}, {}, {}, {true, true, true, false, false}});
     std::vector<cudf::test::fixed_width_column_wrapper<int>> age_cols;
     std::transform(count_iter, count_iter + ages.size(), std::back_inserter(age_cols), [&](int i) {
       return cudf::test::fixed_width_column_wrapper<int>(
@@ -1520,7 +1529,7 @@ TEST_F(ListsColumnTest, ListOfStructs)
       std::vector<std::unique_ptr<column>> children;
       children.push_back(name_cols[idx].release());
       children.push_back(age_cols[idx].release());
-      inner_structs.push_back(cudf::test::structs_column_wrapper(std::move(children)));
+      inner_structs.emplace_back(std::move(children));
     }
   }
 
@@ -1618,7 +1627,7 @@ TEST_F(DictionaryConcatTest, StringsKeys)
 {
   cudf::test::strings_column_wrapper strings(
     {"eee", "aaa", "ddd", "bbb", "", "", "ccc", "ccc", "ccc", "eee", "aaa"},
-    {1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1});
+    {true, true, true, true, false, true, true, true, true, true, true});
   auto dictionary = cudf::dictionary::encode(strings);
 
   std::vector<cudf::size_type> splits{0, 2, 2, 5, 5, 7, 7, 7, 7, 11};
diff --git a/cpp/tests/copying/copy_tests.cpp b/cpp/tests/copying/copy_tests.cpp
index f31d8d6f79a..7c8729b6a77 100644
--- a/cpp/tests/copying/copy_tests.cpp
+++ b/cpp/tests/copying/copy_tests.cpp
@@ -57,7 +57,8 @@ TYPED_TEST(CopyTest, CopyIfElseTestManyNulls)
 {
   using T = TypeParam;
 
-  cudf::test::fixed_width_column_wrapper<bool> mask_w{{1, 0, 0, 0, 0, 0, 1}, {1, 1, 1, 1, 1, 1, 0}};
+  cudf::test::fixed_width_column_wrapper<bool> mask_w{{1, 0, 0, 0, 0, 0, 1},
+                                                      {true, true, true, true, true, true, false}};
 
   wrapper<T, int32_t> lhs_w({5, 5, 5, 5, 5, 5, 5}, {1, 1, 1, 1, 1, 1, 1});
   wrapper<T, int32_t> rhs_w({6, 6, 6, 6, 6, 6, 6}, {1, 0, 0, 0, 0, 0, 1});
@@ -124,7 +125,7 @@ TYPED_TEST(CopyTest, CopyIfElseTestMultipleBlocks)
   std::vector<int32_t> h_rhs(num, 6);
   std::vector<bool> h_mask(num, false);
   std::vector<bool> h_validity(num, true);
-  h_validity[0] = 0;
+  h_validity[0] = false;
 
   cudf::test::fixed_width_column_wrapper<T, int32_t> lhs_w(
     h_lhs.begin(), h_lhs.end(), h_validity.begin());
diff --git a/cpp/tests/copying/gather_str_tests.cpp b/cpp/tests/copying/gather_str_tests.cpp
index 22af600ab96..b31f34504e7 100644
--- a/cpp/tests/copying/gather_str_tests.cpp
+++ b/cpp/tests/copying/gather_str_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -31,15 +31,17 @@ class GatherTestStr : public cudf::test::BaseFixture {};
 
 TEST_F(GatherTestStr, StringColumn)
 {
-  cudf::test::fixed_width_column_wrapper<int16_t> col1{{1, 2, 3, 4, 5, 6}, {1, 1, 0, 1, 0, 1}};
+  cudf::test::fixed_width_column_wrapper<int16_t> col1{{1, 2, 3, 4, 5, 6},
+                                                       {true, true, false, true, false, true}};
   cudf::test::strings_column_wrapper col2{{"This", "is", "not", "a", "string", "type"},
-                                          {1, 1, 1, 1, 1, 0}};
+                                          {true, true, true, true, true, false}};
   cudf::table_view source_table{{col1, col2}};
 
   cudf::test::fixed_width_column_wrapper<int16_t> gather_map{{0, 1, 3, 4}};
 
-  cudf::test::fixed_width_column_wrapper<int16_t> exp_col1{{1, 2, 4, 5}, {1, 1, 1, 0}};
-  cudf::test::strings_column_wrapper exp_col2{{"This", "is", "a", "string"}, {1, 1, 1, 1}};
+  cudf::test::fixed_width_column_wrapper<int16_t> exp_col1{{1, 2, 4, 5}, {true, true, true, false}};
+  cudf::test::strings_column_wrapper exp_col2{{"This", "is", "a", "string"},
+                                              {true, true, true, true}};
   cudf::table_view expected{{exp_col1, exp_col2}};
 
   auto got = cudf::gather(source_table, gather_map);
@@ -50,26 +52,26 @@ TEST_F(GatherTestStr, StringColumn)
 TEST_F(GatherTestStr, GatherSlicedStringsColumn)
 {
   cudf::test::strings_column_wrapper strings{{"This", "is", "not", "a", "string", "type"},
-                                             {1, 1, 1, 1, 1, 0}};
+                                             {true, true, true, true, true, false}};
   std::vector<cudf::size_type> slice_indices{0, 2, 2, 3, 3, 6};
   auto sliced_strings = cudf::slice(strings, slice_indices);
   {
     cudf::test::fixed_width_column_wrapper<int16_t> gather_map{{1, 0, 1}};
-    cudf::test::strings_column_wrapper expected_strings{{"is", "This", "is"}, {1, 1, 1}};
+    cudf::test::strings_column_wrapper expected_strings{{"is", "This", "is"}, {true, true, true}};
     cudf::table_view expected{{expected_strings}};
     auto result = cudf::gather(cudf::table_view{{sliced_strings[0]}}, gather_map);
     CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result->view());
   }
   {
     cudf::test::fixed_width_column_wrapper<int16_t> gather_map{{0, 0, 0}};
-    cudf::test::strings_column_wrapper expected_strings{{"not", "not", "not"}, {1, 1, 1}};
+    cudf::test::strings_column_wrapper expected_strings{{"not", "not", "not"}, {true, true, true}};
     cudf::table_view expected{{expected_strings}};
     auto result = cudf::gather(cudf::table_view{{sliced_strings[1]}}, gather_map);
     CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result->view());
   }
   {
     cudf::test::fixed_width_column_wrapper<int16_t> gather_map{{2, 1, 0}};
-    cudf::test::strings_column_wrapper expected_strings{{"", "string", "a"}, {0, 1, 1}};
+    cudf::test::strings_column_wrapper expected_strings{{"", "string", "a"}, {false, true, true}};
     cudf::table_view expected{{expected_strings}};
     auto result = cudf::gather(cudf::table_view{{sliced_strings[2]}}, gather_map);
     CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result->view());
@@ -93,8 +95,7 @@ TEST_F(GatherTestStr, Gather)
 
   std::vector<char const*> h_expected;
   std::vector<int32_t> expected_validity;
-  for (auto itr = h_map.begin(); itr != h_map.end(); ++itr) {
-    auto index = *itr;
+  for (int index : h_map) {
     if ((0 <= index) && (index < static_cast<decltype(index)>(h_strings.size()))) {
       h_expected.push_back(h_strings[index]);
       expected_validity.push_back(1);
@@ -124,8 +125,8 @@ TEST_F(GatherTestStr, GatherDontCheckOutOfBounds)
                                       rmm::mr::get_current_device_resource());
 
   std::vector<char const*> h_expected;
-  for (auto itr = h_map.begin(); itr != h_map.end(); ++itr) {
-    h_expected.push_back(h_strings[*itr]);
+  for (int itr : h_map) {
+    h_expected.push_back(h_strings[itr]);
   }
   cudf::test::strings_column_wrapper expected(h_expected.begin(), h_expected.end());
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected);
diff --git a/cpp/tests/copying/gather_struct_tests.cpp b/cpp/tests/copying/gather_struct_tests.cpp
index 2bc18c706db..1598ab2646a 100644
--- a/cpp/tests/copying/gather_struct_tests.cpp
+++ b/cpp/tests/copying/gather_struct_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -408,7 +408,7 @@ TYPED_TEST(TypedStructGatherTest, TestGatherStructOfListOfStructs)
       5, offsets{0, 2, 4, 6, 8, 10}.release(), std::move(expected_struct_col), 0, {});
     std::vector<std::unique_ptr<cudf::column>> expected_vector_of_columns;
     expected_vector_of_columns.push_back(std::move(expected_list_of_structs_column));
-    return structs{std::move(expected_vector_of_columns), {0, 1, 1, 1, 1}};
+    return structs{std::move(expected_vector_of_columns), {false, true, true, true, true}};
   }();
 
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_gather_result, gathered_structs->view());
diff --git a/cpp/tests/copying/get_value_tests.cpp b/cpp/tests/copying/get_value_tests.cpp
index 99b86c86997..90ff97e7355 100644
--- a/cpp/tests/copying/get_value_tests.cpp
+++ b/cpp/tests/copying/get_value_tests.cpp
@@ -109,7 +109,7 @@ TEST_F(StringGetValueTest, GetEmpty)
 
 TEST_F(StringGetValueTest, GetFromNullable)
 {
-  cudf::test::strings_column_wrapper col({"this", "is", "a", "test"}, {0, 1, 0, 1});
+  cudf::test::strings_column_wrapper col({"this", "is", "a", "test"}, {false, true, false, true});
   auto s = cudf::get_element(col, 1);
 
   auto typed_s = static_cast<cudf::string_scalar const*>(s.get());
@@ -120,7 +120,7 @@ TEST_F(StringGetValueTest, GetFromNullable)
 
 TEST_F(StringGetValueTest, GetNull)
 {
-  cudf::test::strings_column_wrapper col({"this", "is", "a", "test"}, {0, 1, 0, 1});
+  cudf::test::strings_column_wrapper col({"this", "is", "a", "test"}, {false, true, false, true});
   auto s = cudf::get_element(col, 2);
 
   EXPECT_FALSE(s->is_valid());
@@ -149,8 +149,8 @@ TYPED_TEST(DictionaryGetValueTest, BasicGet)
 TYPED_TEST(DictionaryGetValueTest, GetFromNullable)
 {
   cudf::test::fixed_width_column_wrapper<TypeParam, int32_t> keys({6, 7, 8, 9});
-  cudf::test::fixed_width_column_wrapper<uint32_t> indices({0, 0, 1, 2, 1, 3, 3, 2},
-                                                           {0, 1, 0, 1, 1, 1, 0, 0});
+  cudf::test::fixed_width_column_wrapper<uint32_t> indices(
+    {0, 0, 1, 2, 1, 3, 3, 2}, {false, true, false, true, true, true, false, false});
   auto col = cudf::make_dictionary_column(keys, indices);
 
   auto s = cudf::get_element(*col, 3);
@@ -165,8 +165,8 @@ TYPED_TEST(DictionaryGetValueTest, GetFromNullable)
 TYPED_TEST(DictionaryGetValueTest, GetNull)
 {
   cudf::test::fixed_width_column_wrapper<TypeParam, int32_t> keys({6, 7, 8, 9});
-  cudf::test::fixed_width_column_wrapper<uint32_t> indices({0, 0, 1, 2, 1, 3, 3, 2},
-                                                           {0, 1, 0, 1, 1, 1, 0, 0});
+  cudf::test::fixed_width_column_wrapper<uint32_t> indices(
+    {0, 0, 1, 2, 1, 3, 3, 2}, {false, true, false, true, true, true, false, false});
   auto col = cudf::make_dictionary_column(keys, indices);
 
   auto s = cudf::get_element(*col, 2);
diff --git a/cpp/tests/copying/pack_tests.cpp b/cpp/tests/copying/pack_tests.cpp
index 8a33e017935..ea4408efa6a 100644
--- a/cpp/tests/copying/pack_tests.cpp
+++ b/cpp/tests/copying/pack_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -41,8 +41,8 @@ struct PackUnpackTest : public cudf::test::BaseFixture {
 
 TEST_F(PackUnpackTest, SingleColumnFixedWidth)
 {
-  cudf::test::fixed_width_column_wrapper<int64_t> col1({1, 2, 3, 4, 5, 6, 7},
-                                                       {1, 1, 1, 0, 1, 0, 1});
+  cudf::test::fixed_width_column_wrapper<int64_t> col1(
+    {1, 2, 3, 4, 5, 6, 7}, {true, true, true, false, true, false, true});
 
   this->run_test({col1});
 }
@@ -56,20 +56,22 @@ TEST_F(PackUnpackTest, SingleColumnFixedWidthNonNullable)
 
 TEST_F(PackUnpackTest, MultiColumnFixedWidth)
 {
-  cudf::test::fixed_width_column_wrapper<int16_t> col1({1, 2, 3, 4, 5, 6, 7},
-                                                       {1, 1, 1, 0, 1, 0, 1});
-  cudf::test::fixed_width_column_wrapper<float> col2({7, 8, 6, 5, 4, 3, 2}, {1, 0, 1, 1, 1, 1, 1});
-  cudf::test::fixed_width_column_wrapper<double> col3({8, 4, 2, 0, 7, 1, 3}, {0, 1, 1, 1, 1, 1, 1});
+  cudf::test::fixed_width_column_wrapper<int16_t> col1(
+    {1, 2, 3, 4, 5, 6, 7}, {true, true, true, false, true, false, true});
+  cudf::test::fixed_width_column_wrapper<float> col2({7, 8, 6, 5, 4, 3, 2},
+                                                     {true, false, true, true, true, true, true});
+  cudf::test::fixed_width_column_wrapper<double> col3({8, 4, 2, 0, 7, 1, 3},
+                                                      {false, true, true, true, true, true, true});
 
   this->run_test({col1, col2, col3});
 }
 
 TEST_F(PackUnpackTest, MultiColumnWithStrings)
 {
-  cudf::test::fixed_width_column_wrapper<int16_t> col1({1, 2, 3, 4, 5, 6, 7},
-                                                       {1, 1, 1, 0, 1, 0, 1});
+  cudf::test::fixed_width_column_wrapper<int16_t> col1(
+    {1, 2, 3, 4, 5, 6, 7}, {true, true, true, false, true, false, true});
   cudf::test::strings_column_wrapper col2({"Lorem", "ipsum", "dolor", "sit", "amet", "ort", "ral"},
-                                          {1, 0, 1, 1, 1, 0, 1});
+                                          {true, false, true, true, true, false, true});
   cudf::test::strings_column_wrapper col3({"", "this", "is", "a", "column", "of", "strings"});
 
   this->run_test({col1, col2, col3});
@@ -164,7 +166,7 @@ std::vector<std::unique_ptr<cudf::column>> generate_structs(bool include_validit
 
   // 2. Numeric "ages" column.
   std::vector<int> ages{5, 10, 15, 20, 25, 30, 100, 101, 102};
-  std::vector<bool> ages_validity = {1, 1, 1, 1, 0, 1, 0, 0, 1};
+  std::vector<bool> ages_validity = {true, true, true, true, false, true, false, false, true};
   auto ages_column =
     include_validity
       ? cudf::test::fixed_width_column_wrapper<int>(ages.begin(), ages.end(), ages_validity.begin())
@@ -172,7 +174,7 @@ std::vector<std::unique_ptr<cudf::column>> generate_structs(bool include_validit
 
   // 3. Boolean "is_human" column.
   std::vector<bool> is_human{true, true, false, false, false, false, true, true, true};
-  std::vector<bool> is_human_validity{1, 1, 1, 0, 1, 1, 1, 1, 0};
+  std::vector<bool> is_human_validity{true, true, true, false, true, true, true, true, false};
   auto is_human_col =
     include_validity
       ? cudf::test::fixed_width_column_wrapper<bool>(
@@ -180,7 +182,8 @@ std::vector<std::unique_ptr<cudf::column>> generate_structs(bool include_validit
       : cudf::test::fixed_width_column_wrapper<bool>(is_human.begin(), is_human.end());
 
   // Assemble struct column.
-  auto const struct_validity = std::vector<bool>{1, 1, 1, 1, 1, 0, 0, 1, 0};
+  auto const struct_validity =
+    std::vector<bool>{true, true, true, true, true, false, false, true, false};
   auto struct_column =
     include_validity
       ? cudf::test::structs_column_wrapper({names_column, ages_column, is_human_col},
@@ -201,13 +204,13 @@ std::vector<std::unique_ptr<cudf::column>> generate_struct_of_list()
 
   // 2. Numeric "ages" column.
   std::vector<int> ages{5, 10, 15, 20, 25, 30, 100, 101, 102};
-  std::vector<bool> ages_validity = {1, 1, 1, 1, 0, 1, 0, 0, 1};
+  std::vector<bool> ages_validity = {true, true, true, true, false, true, false, false, true};
   auto ages_column =
     cudf::test::fixed_width_column_wrapper<int>(ages.begin(), ages.end(), ages_validity.begin());
 
   // 3. List column
   using LCW = cudf::test::lists_column_wrapper<cudf::string_view>;
-  std::vector<bool> list_validity{1, 1, 1, 1, 1, 0, 1, 0, 1};
+  std::vector<bool> list_validity{true, true, true, true, true, false, true, false, true};
   cudf::test::lists_column_wrapper<cudf::string_view> list(
     {{{"abc", "d", "edf"}, {"jjj"}},
      {{"dgaer", "-7"}, LCW{}},
@@ -221,7 +224,8 @@ std::vector<std::unique_ptr<cudf::column>> generate_struct_of_list()
     list_validity.begin());
 
   // Assemble struct column.
-  auto const struct_validity = std::vector<bool>{1, 1, 1, 1, 1, 0, 0, 1, 0};
+  auto const struct_validity =
+    std::vector<bool>{true, true, true, true, true, false, false, true, false};
   auto struct_column =
     cudf::test::structs_column_wrapper({names_column, ages_column, list}, struct_validity.begin());
 
@@ -253,17 +257,47 @@ std::vector<std::unique_ptr<cudf::column>> generate_list_of_struct()
 
   // 2. Numeric "ages" column.
   std::vector<int> ages{5, 10, 15, 20, 25, 30, 100, 101, 102, -1, -2, -3, -4, -5, -6, -7};
-  std::vector<bool> ages_validity = {1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1};
+  std::vector<bool> ages_validity = {true,
+                                     true,
+                                     true,
+                                     true,
+                                     false,
+                                     true,
+                                     false,
+                                     false,
+                                     true,
+                                     false,
+                                     false,
+                                     false,
+                                     false,
+                                     true,
+                                     true,
+                                     true};
   auto ages_column =
     cudf::test::fixed_width_column_wrapper<int>(ages.begin(), ages.end(), ages_validity.begin());
 
   // Assemble struct column.
-  auto const struct_validity = std::vector<bool>{1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1};
+  auto const struct_validity = std::vector<bool>{true,
+                                                 true,
+                                                 true,
+                                                 true,
+                                                 true,
+                                                 false,
+                                                 false,
+                                                 true,
+                                                 false,
+                                                 true,
+                                                 true,
+                                                 true,
+                                                 true,
+                                                 true,
+                                                 true,
+                                                 true};
   auto struct_column =
     cudf::test::structs_column_wrapper({names_column, ages_column}, struct_validity.begin());
 
   // 3. List column
-  std::vector<bool> list_validity{1, 1, 1, 1, 1, 0, 1, 0, 1};
+  std::vector<bool> list_validity{true, true, true, true, true, false, true, false, true};
 
   cudf::test::fixed_width_column_wrapper<int> offsets{0, 1, 4, 5, 7, 7, 10, 13, 14, 16};
   auto [null_mask, null_count] =
@@ -479,19 +513,21 @@ TEST_F(PackUnpackTest, NestedSliced)
   // struct
   {
     cudf::test::fixed_width_column_wrapper<int> a{0, 1, 2, 3, 4, 5, 6, 7};
-    cudf::test::fixed_width_column_wrapper<float> b{{0, -1, -2, -3, -4, -5, -6, -7},
-                                                    {1, 1, 1, 0, 0, 0, 0, 1}};
+    cudf::test::fixed_width_column_wrapper<float> b{
+      {0, -1, -2, -3, -4, -5, -6, -7}, {true, true, true, false, false, false, false, true}};
     cudf::test::strings_column_wrapper c{{"abc", "def", "ghi", "jkl", "mno", "", "st", "uvwx"},
-                                         {0, 0, 1, 1, 1, 1, 1, 1}};
-    std::vector<bool> list_validity{1, 0, 1, 0, 1, 0, 1, 1};
+                                         {false, false, true, true, true, true, true, true}};
+    std::vector<bool> list_validity{true, false, true, false, true, false, true, true};
     cudf::test::lists_column_wrapper<int16_t> d{
       {{0, 1}, {2, 3, 4}, {5, 6}, {7}, {8, 9, 10}, {11, 12}, {}, {15, 16, 17}},
       list_validity.begin()};
     cudf::test::fixed_width_column_wrapper<int> _a{10, 20, 30, 40, 50, 60, 70, 80};
     cudf::test::fixed_width_column_wrapper<float> _b{-10, -20, -30, -40, -50, -60, -70, -80};
     cudf::test::strings_column_wrapper _c{"aa", "", "ccc", "dddd", "eeeee", "f", "gg", "hhh"};
-    cudf::test::structs_column_wrapper e({_a, _b, _c}, {1, 1, 1, 0, 1, 1, 1, 0});
-    cudf::test::structs_column_wrapper s({a, b, c, d, e}, {1, 1, 0, 1, 1, 1, 1, 1});
+    cudf::test::structs_column_wrapper e({_a, _b, _c},
+                                         {true, true, true, false, true, true, true, false});
+    cudf::test::structs_column_wrapper s({a, b, c, d, e},
+                                         {true, true, false, true, true, true, true, true});
 
     auto split = cudf::split(s, {2, 5});
 
diff --git a/cpp/tests/copying/scatter_list_tests.cpp b/cpp/tests/copying/scatter_list_tests.cpp
index 9501bb29245..a82860a3eec 100644
--- a/cpp/tests/copying/scatter_list_tests.cpp
+++ b/cpp/tests/copying/scatter_list_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -235,7 +235,8 @@ TEST_F(ScatterListsTest, ListsOfStrings)
 TEST_F(ScatterListsTest, ListsOfNullableStrings)
 {
   auto src_strings_column = cudf::test::strings_column_wrapper{
-    {"all", "the", "leaves", "are", "brown", "california", "dreaming"}, {1, 1, 1, 0, 1, 0, 1}};
+    {"all", "the", "leaves", "are", "brown", "california", "dreaming"},
+    {true, true, true, false, true, false, true}};
 
   auto src_list_column = cudf::make_lists_column(
     2,
@@ -288,7 +289,8 @@ TEST_F(ScatterListsTest, ListsOfNullableStrings)
 TEST_F(ScatterListsTest, EmptyListsOfNullableStrings)
 {
   auto src_strings_column = cudf::test::strings_column_wrapper{
-    {"all", "the", "leaves", "are", "brown", "california", "dreaming"}, {1, 1, 1, 0, 1, 0, 1}};
+    {"all", "the", "leaves", "are", "brown", "california", "dreaming"},
+    {true, true, true, false, true, false, true}};
 
   auto src_list_column = cudf::make_lists_column(
     3,
@@ -339,7 +341,8 @@ TEST_F(ScatterListsTest, EmptyListsOfNullableStrings)
 TEST_F(ScatterListsTest, NullableListsOfNullableStrings)
 {
   auto src_strings_column = cudf::test::strings_column_wrapper{
-    {"all", "the", "leaves", "are", "brown", "california", "dreaming"}, {1, 1, 1, 0, 1, 0, 1}};
+    {"all", "the", "leaves", "are", "brown", "california", "dreaming"},
+    {true, true, true, false, true, false, true}};
 
   auto src_validity =
     cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 1; });
diff --git a/cpp/tests/copying/scatter_struct_tests.cpp b/cpp/tests/copying/scatter_struct_tests.cpp
index f678880617c..c92244d047b 100644
--- a/cpp/tests/copying/scatter_struct_tests.cpp
+++ b/cpp/tests/copying/scatter_struct_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -209,9 +209,10 @@ TYPED_TEST(TypedStructScatterTest, ScatterStructOfListsTest)
   auto const structs_tgt = structs_col{{lists_col_tgt}}.release();
 
   // Expected data
-  auto const validity_expected = std::vector<bool>{0, 1, 1, 0, 0, 1, 1, 0, 0};
-  auto lists_col_expected      = lists_col{
-         {{1}, {2, 3}, {80}, {70, 75}, {55, 60, 65}, {35, 40, 45, 50}, {5}, {10, 15}, {20, 25, 30}},
+  auto const validity_expected =
+    std::vector<bool>{false, true, true, false, false, true, true, false, false};
+  auto lists_col_expected = lists_col{
+    {{1}, {2, 3}, {80}, {70, 75}, {55, 60, 65}, {35, 40, 45, 50}, {5}, {10, 15}, {20, 25, 30}},
     validity_expected.begin()};
   auto const structs_expected = structs_col{{lists_col_expected}}.release();
 
diff --git a/cpp/tests/copying/scatter_tests.cpp b/cpp/tests/copying/scatter_tests.cpp
index 16cbeb7e657..41a753cd0ac 100644
--- a/cpp/tests/copying/scatter_tests.cpp
+++ b/cpp/tests/copying/scatter_tests.cpp
@@ -47,7 +47,8 @@ TEST_F(ScatterUntypedTests, ScatterMapNulls)
 {
   cudf::test::fixed_width_column_wrapper<int32_t> source({1, 2, 3, 4, 5, 6});
   cudf::test::fixed_width_column_wrapper<int32_t> target({10, 20, 30, 40, 50, 60, 70, 80});
-  cudf::test::fixed_width_column_wrapper<int32_t> scatter_map({-3, 3, 1, -1}, {0, 1, 1, 1});
+  cudf::test::fixed_width_column_wrapper<int32_t> scatter_map({-3, 3, 1, -1},
+                                                              {false, true, true, true});
 
   auto const source_table = cudf::table_view({source, source});
   auto const target_table = cudf::table_view({target, target});
@@ -63,7 +64,8 @@ TEST_F(ScatterUntypedTests, ScatterScalarMapNulls)
   std::vector<std::reference_wrapper<const cudf::scalar>> source_vector{slr_ref};
 
   cudf::test::fixed_width_column_wrapper<int32_t> target({10, 20, 30, 40, 50, 60, 70, 80});
-  cudf::test::fixed_width_column_wrapper<int32_t> scatter_map({-3, 3, 1, -1}, {0, 1, 1, 1});
+  cudf::test::fixed_width_column_wrapper<int32_t> scatter_map({-3, 3, 1, -1},
+                                                              {false, true, true, true});
 
   auto const target_table = cudf::table_view({target});
 
@@ -524,11 +526,12 @@ TYPED_TEST(BooleanMaskScatter, WithNull)
   using T = TypeParam;
   cudf::test::fixed_width_column_wrapper<T, int32_t> source_col1({1, 5, 6, 8, 9}, {1, 0, 1, 0, 1});
   cudf::test::strings_column_wrapper source_col2({"This", "is", "cudf", "test", "column"},
-                                                 {1, 0, 0, 1, 0});
+                                                 {true, false, false, true, false});
   cudf::test::fixed_width_column_wrapper<T, int32_t> target_col1({2, 2, 3, 4, 11, 12, 7, 7, 10, 10},
                                                                  {1, 1, 0, 1, 1, 1, 1, 1, 1, 0});
   cudf::test::strings_column_wrapper target_col2(
-    {"a", "bc", "cd", "ef", "gh", "ij", "jk", "lm", "no", "pq"}, {1, 1, 0, 1, 1, 1, 1, 1, 1, 0});
+    {"a", "bc", "cd", "ef", "gh", "ij", "jk", "lm", "no", "pq"},
+    {true, true, false, true, true, true, true, true, true, false});
   cudf::test::fixed_width_column_wrapper<bool> mask(
     {true, false, false, false, true, true, false, true, true, false});
 
@@ -536,7 +539,7 @@ TYPED_TEST(BooleanMaskScatter, WithNull)
                                                                    {1, 1, 0, 1, 0, 1, 1, 0, 1, 0});
   cudf::test::strings_column_wrapper expected_col2(
     {"This", "bc", "cd", "ef", "is", "cudf", "jk", "test", "column", "pq"},
-    {1, 1, 0, 1, 0, 0, 1, 1, 0, 0});
+    {true, true, false, true, false, false, true, true, false, false});
   auto source_table   = cudf::table_view({source_col1, source_col2});
   auto target_table   = cudf::table_view({target_col1, target_col2});
   auto expected_table = cudf::table_view({expected_col1, expected_col2});
@@ -566,11 +569,13 @@ TEST_F(BooleanMaskScatterString, NoNUll)
 
 TEST_F(BooleanMaskScatterString, WithNUll)
 {
-  cudf::test::strings_column_wrapper source({"This", "cudf"}, {0, 1});
-  cudf::test::strings_column_wrapper target({"is", "is", "a", "udf", "api"}, {1, 0, 0, 1, 1});
+  cudf::test::strings_column_wrapper source({"This", "cudf"}, {false, true});
+  cudf::test::strings_column_wrapper target({"is", "is", "a", "udf", "api"},
+                                            {true, false, false, true, true});
   cudf::test::fixed_width_column_wrapper<bool> mask({true, false, false, true, false});
 
-  cudf::test::strings_column_wrapper expected({"This", "is", "a", "cudf", "api"}, {0, 0, 0, 1, 1});
+  cudf::test::strings_column_wrapper expected({"This", "is", "a", "cudf", "api"},
+                                              {false, false, false, true, true});
   auto source_table   = cudf::table_view({source});
   auto target_table   = cudf::table_view({target});
   auto expected_table = cudf::table_view({expected});
@@ -697,11 +702,12 @@ TYPED_TEST(BooleanMaskScalarScatter, WithNull)
   scalar_2->set_valid_async(true);
   std::vector<std::reference_wrapper<const cudf::scalar>> scalar_vect;
   scalar_vect.push_back(*scalar_1);
-  scalar_vect.push_back(*scalar_2);
+  scalar_vect.emplace_back(*scalar_2);
   cudf::test::fixed_width_column_wrapper<T, int32_t> target_col1({2, 2, 3, 4, 11, 12, 7, 7, 10, 10},
                                                                  {1, 1, 0, 1, 1, 1, 1, 1, 1, 0});
   cudf::test::strings_column_wrapper target_col2(
-    {"a", "bc", "cd", "ef", "gh", "ij", "jk", "lm", "no", "pq"}, {1, 1, 0, 1, 1, 1, 1, 1, 1, 0});
+    {"a", "bc", "cd", "ef", "gh", "ij", "jk", "lm", "no", "pq"},
+    {true, true, false, true, true, true, true, true, true, false});
   cudf::test::fixed_width_column_wrapper<bool> mask(
     {true, false, false, false, true, true, false, true, true, false});
 
@@ -709,7 +715,7 @@ TYPED_TEST(BooleanMaskScalarScatter, WithNull)
     {11, 2, 3, 4, 11, 11, 7, 11, 11, 10}, {0, 1, 0, 1, 0, 0, 1, 0, 0, 0});
   cudf::test::strings_column_wrapper expected_col2(
     {"cudf", "bc", "cd", "ef", "cudf", "cudf", "jk", "cudf", "cudf", "pq"},
-    {1, 1, 0, 1, 1, 1, 1, 1, 1, 0});
+    {true, true, false, true, true, true, true, true, true, false});
   auto target_table   = cudf::table_view({target_col1, target_col2});
   auto expected_table = cudf::table_view({expected_col1, expected_col2});
 
@@ -725,7 +731,7 @@ TEST_F(BooleanMaskScatterScalarString, NoNUll)
   auto scalar = cudf::make_string_scalar("cudf");
   scalar->set_valid_async(true);
   std::vector<std::reference_wrapper<const cudf::scalar>> scalar_vect;
-  scalar_vect.push_back(*scalar);
+  scalar_vect.emplace_back(*scalar);
 
   cudf::test::strings_column_wrapper target({"is", "is", "a", "udf", "api"});
   cudf::test::fixed_width_column_wrapper<bool> mask({true, false, false, true, false});
@@ -744,12 +750,13 @@ TEST_F(BooleanMaskScatterScalarString, WithNUll)
   auto scalar = cudf::make_string_scalar("cudf");
   scalar->set_valid_async(true);
   std::vector<std::reference_wrapper<const cudf::scalar>> scalar_vect;
-  scalar_vect.push_back(*scalar);
-  cudf::test::strings_column_wrapper target({"is", "is", "a", "udf", "api"}, {1, 0, 0, 1, 1});
+  scalar_vect.emplace_back(*scalar);
+  cudf::test::strings_column_wrapper target({"is", "is", "a", "udf", "api"},
+                                            {true, false, false, true, true});
   cudf::test::fixed_width_column_wrapper<bool> mask({true, false, true, true, false});
 
   cudf::test::strings_column_wrapper expected({"cudf", "is", "cudf", "cudf", "api"},
-                                              {1, 0, 1, 1, 1});
+                                              {true, false, true, true, true});
   auto target_table   = cudf::table_view({target});
   auto expected_table = cudf::table_view({expected});
   auto got            = cudf::boolean_mask_scatter(scalar_vect, target_table, mask);
@@ -764,7 +771,7 @@ TEST_F(BooleanMaskScatterScalarFails, SourceAndTargetTypeMismatch)
   auto scalar =
     cudf::make_numeric_scalar(cudf::data_type(cudf::data_type{cudf::type_to_id<int32_t>()}));
   std::vector<std::reference_wrapper<const cudf::scalar>> scalar_vect;
-  scalar_vect.push_back(*scalar);
+  scalar_vect.emplace_back(*scalar);
   cudf::test::fixed_width_column_wrapper<int64_t> target({2, 2, 3, 4, 11, 12, 7, 7, 10, 10});
   cudf::test::fixed_width_column_wrapper<bool> mask(
     {true, false, false, false, true, true, false, true, true, false});
@@ -778,7 +785,7 @@ TEST_F(BooleanMaskScatterScalarFails, BooleanMaskTypeMismatch)
   auto scalar =
     cudf::make_numeric_scalar(cudf::data_type(cudf::data_type{cudf::type_to_id<int32_t>()}));
   std::vector<std::reference_wrapper<const cudf::scalar>> scalar_vect;
-  scalar_vect.push_back(*scalar);
+  scalar_vect.emplace_back(*scalar);
   cudf::test::fixed_width_column_wrapper<int32_t> target({2, 2, 3, 4, 11, 12, 7, 7, 10, 10});
   cudf::test::fixed_width_column_wrapper<int8_t> mask(
     {true, false, false, false, true, true, false, true, true, false});
@@ -792,7 +799,7 @@ TEST_F(BooleanMaskScatterScalarFails, BooleanMaskTargetSizeMismatch)
   auto scalar =
     cudf::make_numeric_scalar(cudf::data_type(cudf::data_type{cudf::type_to_id<int32_t>()}));
   std::vector<std::reference_wrapper<const cudf::scalar>> scalar_vect;
-  scalar_vect.push_back(*scalar);
+  scalar_vect.emplace_back(*scalar);
   cudf::test::fixed_width_column_wrapper<int32_t> target({2, 2, 3, 4, 11, 12, 7, 7, 10, 10});
   cudf::test::fixed_width_column_wrapper<bool> mask(
     {true, false, false, false, true, true, false, true, true});
@@ -806,8 +813,8 @@ TEST_F(BooleanMaskScatterScalarFails, NumberOfColumnAndScalarMismatch)
   auto scalar =
     cudf::make_numeric_scalar(cudf::data_type(cudf::data_type{cudf::type_to_id<int32_t>()}));
   std::vector<std::reference_wrapper<const cudf::scalar>> scalar_vect;
-  scalar_vect.push_back(*scalar);
-  scalar_vect.push_back(*scalar);
+  scalar_vect.emplace_back(*scalar);
+  scalar_vect.emplace_back(*scalar);
   cudf::test::fixed_width_column_wrapper<int32_t> target({2, 2, 3, 4, 11, 12, 7, 7, 10, 10});
   cudf::test::fixed_width_column_wrapper<bool> mask(
     {true, false, false, false, true, true, false, true, true});
diff --git a/cpp/tests/copying/shift_tests.cpp b/cpp/tests/copying/shift_tests.cpp
index 9c2b16df1e1..01ad4f2247c 100644
--- a/cpp/tests/copying/shift_tests.cpp
+++ b/cpp/tests/copying/shift_tests.cpp
@@ -201,28 +201,29 @@ struct ShiftTests : public cudf::test::BaseFixture {};
 
 TEST_F(ShiftTests, StringsShiftTest)
 {
-  auto input =
-    cudf::test::strings_column_wrapper({"", "bb", "ccc", "ddddddé", ""}, {0, 1, 1, 1, 0});
+  auto input = cudf::test::strings_column_wrapper({"", "bb", "ccc", "ddddddé", ""},
+                                                  {false, true, true, true, false});
 
-  auto fill    = cudf::string_scalar("xx");
-  auto results = cudf::shift(input, 2, fill);
-  auto expected_right =
-    cudf::test::strings_column_wrapper({"xx", "xx", "", "bb", "ccc"}, {1, 1, 0, 1, 1});
+  auto fill           = cudf::string_scalar("xx");
+  auto results        = cudf::shift(input, 2, fill);
+  auto expected_right = cudf::test::strings_column_wrapper({"xx", "xx", "", "bb", "ccc"},
+                                                           {true, true, false, true, true});
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_right, *results);
 
-  results = cudf::shift(input, -2, fill);
-  auto expected_left =
-    cudf::test::strings_column_wrapper({"ccc", "ddddddé", "", "xx", "xx"}, {1, 1, 0, 1, 1});
+  results            = cudf::shift(input, -2, fill);
+  auto expected_left = cudf::test::strings_column_wrapper({"ccc", "ddddddé", "", "xx", "xx"},
+                                                          {true, true, false, true, true});
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_left, *results);
 
   auto sliced = cudf::slice(input, {1, 4}).front();
 
   results           = cudf::shift(sliced, 1, fill);
-  auto sliced_right = cudf::test::strings_column_wrapper({"xx", "bb", "ccc"}, {1, 1, 1});
+  auto sliced_right = cudf::test::strings_column_wrapper({"xx", "bb", "ccc"}, {true, true, true});
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(sliced_right, *results);
 
-  results          = cudf::shift(sliced, -1, fill);
-  auto sliced_left = cudf::test::strings_column_wrapper({"ccc", "ddddddé", "xx"}, {1, 1, 1});
+  results = cudf::shift(sliced, -1, fill);
+  auto sliced_left =
+    cudf::test::strings_column_wrapper({"ccc", "ddddddé", "xx"}, {true, true, true});
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(sliced_left, *results);
 }
 
@@ -234,42 +235,47 @@ TEST_F(ShiftTests, StringsShiftNullFillTest)
 
   auto results  = cudf::shift(input, -1, phil);
   auto expected = cudf::test::strings_column_wrapper(
-    {"b", "c", "d", "e", "ff", "ggg", "hhhh", "iii", "jjjjj", ""}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 0});
+    {"b", "c", "d", "e", "ff", "ggg", "hhhh", "iii", "jjjjj", ""},
+    {true, true, true, true, true, true, true, true, true, false});
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
 
   results  = cudf::shift(input, 1, phil);
   expected = cudf::test::strings_column_wrapper(
-    {"", "a", "b", "c", "d", "e", "ff", "ggg", "hhhh", "iii"}, {0, 1, 1, 1, 1, 1, 1, 1, 1, 1});
+    {"", "a", "b", "c", "d", "e", "ff", "ggg", "hhhh", "iii"},
+    {false, true, true, true, true, true, true, true, true, true});
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
 
   auto sliced = cudf::slice(input, {5, 10}).front();
   results     = cudf::shift(sliced, -2, phil);
-  expected = cudf::test::strings_column_wrapper({"hhhh", "iii", "jjjjj", "", ""}, {1, 1, 1, 0, 0});
+  expected    = cudf::test::strings_column_wrapper({"hhhh", "iii", "jjjjj", "", ""},
+                                                   {true, true, true, false, false});
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
 
   results  = cudf::shift(sliced, 2, phil);
-  expected = cudf::test::strings_column_wrapper({"", "", "ff", "ggg", "hhhh"}, {0, 0, 1, 1, 1});
+  expected = cudf::test::strings_column_wrapper({"", "", "ff", "ggg", "hhhh"},
+                                                {false, false, true, true, true});
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
 }
 
 TEST_F(ShiftTests, OffsetGreaterThanSize)
 {
-  auto const input_str =
-    cudf::test::strings_column_wrapper({"", "bb", "ccc", "ddé", ""}, {0, 1, 1, 1, 0});
-  auto results      = cudf::shift(input_str, 6, cudf::string_scalar("xx"));
-  auto expected_str = cudf::test::strings_column_wrapper({"xx", "xx", "xx", "xx", "xx"});
+  auto const input_str = cudf::test::strings_column_wrapper({"", "bb", "ccc", "ddé", ""},
+                                                            {false, true, true, true, false});
+  auto results         = cudf::shift(input_str, 6, cudf::string_scalar("xx"));
+  auto expected_str    = cudf::test::strings_column_wrapper({"xx", "xx", "xx", "xx", "xx"});
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_str, *results);
   results = cudf::shift(input_str, -6, cudf::string_scalar("xx"));
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_str, *results);
 
-  results      = cudf::shift(input_str, 6, cudf::string_scalar("", false));
-  expected_str = cudf::test::strings_column_wrapper({"", "", "", "", ""}, {0, 0, 0, 0, 0});
+  results = cudf::shift(input_str, 6, cudf::string_scalar("", false));
+  expected_str =
+    cudf::test::strings_column_wrapper({"", "", "", "", ""}, {false, false, false, false, false});
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_str, *results);
   results = cudf::shift(input_str, -6, cudf::string_scalar("", false));
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected_str, *results);
 
-  auto const input =
-    cudf::test::fixed_width_column_wrapper<int32_t>({0, 2, 3, 4, 0}, {0, 1, 1, 1, 0});
+  auto const input = cudf::test::fixed_width_column_wrapper<int32_t>(
+    {0, 2, 3, 4, 0}, {false, true, true, true, false});
   results       = cudf::shift(input, 6, cudf::numeric_scalar<int32_t>(9));
   auto expected = cudf::test::fixed_width_column_wrapper<int32_t>({9, 9, 9, 9, 9});
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *results);
@@ -277,7 +283,8 @@ TEST_F(ShiftTests, OffsetGreaterThanSize)
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *results);
 
   results  = cudf::shift(input, 6, cudf::numeric_scalar<int32_t>(0, false));
-  expected = cudf::test::fixed_width_column_wrapper<int32_t>({0, 0, 0, 0, 0}, {0, 0, 0, 0, 0});
+  expected = cudf::test::fixed_width_column_wrapper<int32_t>({0, 0, 0, 0, 0},
+                                                             {false, false, false, false, false});
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *results);
   results = cudf::shift(input, -6, cudf::numeric_scalar<int32_t>(0, false));
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *results);
diff --git a/cpp/tests/copying/slice_tests.cpp b/cpp/tests/copying/slice_tests.cpp
index fffc51eef2c..bebd3d25610 100644
--- a/cpp/tests/copying/slice_tests.cpp
+++ b/cpp/tests/copying/slice_tests.cpp
@@ -168,7 +168,7 @@ TEST_F(SliceListTest, Lists)
   {
     cudf::test::lists_column_wrapper<int> list{{{1, 2, 3}, {4, 5}},
                                                {LCW{}, LCW{}, {7, 8}, LCW{}},
-                                               {{{6}}},
+                                               {{{6}}},  // NOLINT
                                                {{7, 8}, {9, 10, 11}, LCW{}},
                                                {LCW{}, {-1, -2, -3, -4, -5}},
                                                {LCW{}},
@@ -177,7 +177,7 @@ TEST_F(SliceListTest, Lists)
     std::vector<cudf::size_type> indices{1, 3, 3, 6};
 
     std::vector<cudf::test::lists_column_wrapper<int>> expected;
-    expected.push_back(LCW{{LCW{}, LCW{}, {7, 8}, LCW{}}, {{{6}}}});
+    expected.push_back(LCW{{LCW{}, LCW{}, {7, 8}, LCW{}}, {{{6}}}});  // NOLINT
     expected.push_back(LCW{{{7, 8}, {9, 10, 11}, LCW{}}, {LCW{}, {-1, -2, -3, -4, -5}}, {LCW{}}});
 
     std::vector<cudf::column_view> result = cudf::slice(list, indices);
@@ -233,7 +233,7 @@ TEST_F(SliceListTest, ListsWithNulls)
   {
     cudf::test::lists_column_wrapper<int> list{{{{1, 2, 3}, valids}, {4, 5}},
                                                {{LCW{}, LCW{}, {7, 8}, LCW{}}, valids},
-                                               {{{6}}},
+                                               {{{6}}},  // NOLINT
                                                {{{7, 8}, {{9, 10, 11}, valids}, LCW{}}, valids},
                                                {{LCW{}, {-1, -2, -3, -4, -5}}, valids},
                                                {LCW{}},
@@ -242,7 +242,7 @@ TEST_F(SliceListTest, ListsWithNulls)
     std::vector<cudf::size_type> indices{1, 3, 3, 6};
 
     std::vector<cudf::test::lists_column_wrapper<int>> expected;
-    expected.push_back(LCW{{{LCW{}, LCW{}, {7, 8}, LCW{}}, valids}, {{{6}}}});
+    expected.push_back(LCW{{{LCW{}, LCW{}, {7, 8}, LCW{}}, valids}, {{{6}}}});  // NOLINT
     expected.push_back(LCW{{{{7, 8}, {{9, 10, 11}, valids}, LCW{}}, valids},
                            {{LCW{}, {-1, -2, -3, -4, -5}}, valids},
                            {LCW{}}});
@@ -476,11 +476,30 @@ TEST_F(SliceTableCornerCases, MiscOffset)
   cudf::test::fixed_width_column_wrapper<int32_t> col2{
     {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
      3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3},
-    {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}};
+    {true, true, true, true, true, true, true, true, true, true,  true,  true, true,
+     true, true, true, true, true, true, true, true, true, true,  true,  true, true,
+     true, true, true, true, true, true, true, true, true, false, false, false}};
   cudf::test::fixed_width_column_wrapper<int32_t> col3{
     {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3},
-    {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0}};
+    {true,
+     true,
+     true,
+     true,
+     true,
+     true,
+     true,
+     true,
+     true,
+     true,
+     true,
+     true,
+     true,
+     true,
+     true,
+     true,
+     false,
+     false,
+     false}};
   std::vector<cudf::size_type> indices{19, 38};
   std::vector<cudf::column_view> result = cudf::slice(col2, indices);
   cudf::column result_column(result[0]);
@@ -493,16 +512,19 @@ TEST_F(SliceTableCornerCases, PreSlicedInputs)
   {
     using LCW = cudf::test::lists_column_wrapper<float>;
 
-    cudf::test::fixed_width_column_wrapper<int> a{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9},
-                                                  {1, 1, 0, 1, 1, 1, 0, 0, 1, 0}};
+    cudf::test::fixed_width_column_wrapper<int> a{
+      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9},
+      {true, true, false, true, true, true, false, false, true, false}};
 
-    cudf::test::fixed_width_column_wrapper<int> b{{0, -1, -2, -3, -4, -5, -6, -7, -8, -9},
-                                                  {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}};
+    cudf::test::fixed_width_column_wrapper<int> b{
+      {0, -1, -2, -3, -4, -5, -6, -7, -8, -9},
+      {false, false, false, false, false, false, false, false, false, false}};
 
-    cudf::test::strings_column_wrapper c{{"aa", "b", "", "ccc", "ddd", "e", "ff", "", "", "gggg"},
-                                         {0, 0, 1, 1, 0, 0, 1, 1, 1, 0}};
+    cudf::test::strings_column_wrapper c{
+      {"aa", "b", "", "ccc", "ddd", "e", "ff", "", "", "gggg"},
+      {false, false, true, true, false, false, true, true, true, false}};
 
-    std::vector<bool> list_validity{1, 0, 1, 0, 1, 1, 0, 0, 1, 1};
+    std::vector<bool> list_validity{true, false, true, false, true, true, false, false, true, true};
     cudf::test::lists_column_wrapper<float> d{
       {{0, 1}, {2}, {3, 4, 5}, {6}, {7, 7}, {8, 9}, {10, 11}, {12, 13}, {}, {14, 15, 16}},
       list_validity.begin()};
@@ -513,18 +535,21 @@ TEST_F(SliceTableCornerCases, PreSlicedInputs)
 
     auto result = cudf::slice(pre_sliced[1], {0, 1, 1, 6});
 
-    cudf::test::fixed_width_column_wrapper<int> e0_a({4}, {1});
-    cudf::test::fixed_width_column_wrapper<int> e0_b({-4}, {0});
-    cudf::test::strings_column_wrapper e0_c({""}, {0});
-    std::vector<bool> e0_list_validity{1};
+    cudf::test::fixed_width_column_wrapper<int> e0_a({4}, {true});
+    cudf::test::fixed_width_column_wrapper<int> e0_b({-4}, {false});
+    cudf::test::strings_column_wrapper e0_c({""}, {false});
+    std::vector<bool> e0_list_validity{true};
     cudf::test::lists_column_wrapper<float> e0_d({LCW{7, 7}}, e0_list_validity.begin());
     cudf::table_view expected0({e0_a, e0_b, e0_c, e0_d});
     CUDF_TEST_EXPECT_TABLES_EQUAL(result[0], expected0);
 
-    cudf::test::fixed_width_column_wrapper<int> e1_a{{5, 6, 7, 8, 9}, {1, 0, 0, 1, 0}};
-    cudf::test::fixed_width_column_wrapper<int> e1_b{{-5, -6, -7, -8, -9}, {0, 0, 0, 0, 0}};
-    cudf::test::strings_column_wrapper e1_c{{"e", "ff", "", "", "gggg"}, {0, 1, 1, 1, 0}};
-    std::vector<bool> e1_list_validity{1, 0, 0, 1, 1};
+    cudf::test::fixed_width_column_wrapper<int> e1_a{{5, 6, 7, 8, 9},
+                                                     {true, false, false, true, false}};
+    cudf::test::fixed_width_column_wrapper<int> e1_b{{-5, -6, -7, -8, -9},
+                                                     {false, false, false, false, false}};
+    cudf::test::strings_column_wrapper e1_c{{"e", "ff", "", "", "gggg"},
+                                            {false, true, true, true, false}};
+    std::vector<bool> e1_list_validity{true, false, false, true, true};
     cudf::test::lists_column_wrapper<float> e1_d{{{8, 9}, {10, 11}, {12, 13}, {}, {14, 15, 16}},
                                                  e1_list_validity.begin()};
     cudf::table_view expected1({e1_a, e1_b, e1_c, e1_d});
diff --git a/cpp/tests/copying/split_tests.cpp b/cpp/tests/copying/split_tests.cpp
index 077092ca036..7ff159cf896 100644
--- a/cpp/tests/copying/split_tests.cpp
+++ b/cpp/tests/copying/split_tests.cpp
@@ -116,8 +116,7 @@ std::vector<std::vector<bool>> create_expected_validity(std::vector<cudf::size_t
   std::vector<cudf::size_type> indices  = splits_to_indices(splits, validity.size());
 
   for (unsigned long index = 0; index < indices.size(); index += 2) {
-    result.push_back(
-      std::vector<bool>(validity.begin() + indices[index], validity.begin() + indices[index + 1]));
+    result.emplace_back(validity.begin() + indices[index], validity.begin() + indices[index + 1]);
   }
 
   return result;
@@ -914,12 +913,12 @@ void split_structs(bool include_validity, SplitFunc Split, CompareFunc Compare,
   // 1. String "names" column.
   std::vector<std::string> names{
     "Vimes", "Carrot", "Angua", "Cheery", "Detritus", "Slant", "Fred", "Todd", "Kevin"};
-  std::vector<bool> names_validity{1, 1, 1, 1, 1, 1, 1, 1, 1};
+  std::vector<bool> names_validity{true, true, true, true, true, true, true, true, true};
   cudf::test::strings_column_wrapper names_column(names.begin(), names.end());
 
   // 2. Numeric "ages" column.
   std::vector<int> ages{5, 10, 15, 20, 25, 30, 100, 101, 102};
-  std::vector<bool> ages_validity = {1, 1, 1, 1, 0, 1, 0, 0, 1};
+  std::vector<bool> ages_validity = {true, true, true, true, false, true, false, false, true};
   auto ages_column =
     include_validity
       ? cudf::test::fixed_width_column_wrapper<int>(ages.begin(), ages.end(), ages_validity.begin())
@@ -927,7 +926,7 @@ void split_structs(bool include_validity, SplitFunc Split, CompareFunc Compare,
 
   // 3. Boolean "is_human" column.
   std::vector<bool> is_human{true, true, false, false, false, false, true, true, true};
-  std::vector<bool> is_human_validity{1, 1, 1, 0, 1, 1, 1, 1, 0};
+  std::vector<bool> is_human_validity{true, true, true, false, true, true, true, true, false};
   auto is_human_col =
     include_validity
       ? cudf::test::fixed_width_column_wrapper<bool>(
@@ -935,7 +934,8 @@ void split_structs(bool include_validity, SplitFunc Split, CompareFunc Compare,
       : cudf::test::fixed_width_column_wrapper<bool>(is_human.begin(), is_human.end());
 
   // Assemble struct column.
-  auto const struct_validity = std::vector<bool>{1, 1, 1, 1, 1, 0, 0, 1, 0};
+  auto const struct_validity =
+    std::vector<bool>{true, true, true, true, true, false, false, true, false};
   auto struct_column =
     include_validity
       ? cudf::test::structs_column_wrapper({names_column, ages_column, is_human_col},
@@ -1084,17 +1084,17 @@ void split_nested_struct_of_list(SplitFunc Split, CompareFunc Compare, bool spli
   // 1. String "names" column.
   std::vector<std::string> names{
     "Vimes", "Carrot", "Angua", "Cheery", "Detritus", "Slant", "Fred", "Todd", "Kevin"};
-  std::vector<bool> names_validity{1, 1, 1, 1, 1, 1, 1, 1, 1};
+  std::vector<bool> names_validity{true, true, true, true, true, true, true, true, true};
   cudf::test::strings_column_wrapper names_column(names.begin(), names.end());
 
   // 2. Numeric "ages" column.
   std::vector<int> ages{5, 10, 15, 20, 25, 30, 100, 101, 102};
-  std::vector<bool> ages_validity = {1, 1, 1, 1, 0, 1, 0, 0, 1};
+  std::vector<bool> ages_validity = {true, true, true, true, false, true, false, false, true};
   auto ages_column =
     cudf::test::fixed_width_column_wrapper<int>(ages.begin(), ages.end(), ages_validity.begin());
 
   // 3. List column
-  std::vector<bool> list_validity{1, 1, 1, 1, 1, 0, 1, 0, 1};
+  std::vector<bool> list_validity{true, true, true, true, true, false, true, false, true};
   cudf::test::lists_column_wrapper<float> list({{{1, 2, 3}, {4}},
                                                 {{-1, -2}, LCW{}},
                                                 LCW{},
@@ -1107,7 +1107,8 @@ void split_nested_struct_of_list(SplitFunc Split, CompareFunc Compare, bool spli
                                                list_validity.begin());
 
   // Assemble struct column.
-  auto const struct_validity = std::vector<bool>{1, 1, 1, 1, 1, 0, 0, 1, 0};
+  auto const struct_validity =
+    std::vector<bool>{true, true, true, true, true, false, false, true, false};
   auto struct_column =
     cudf::test::structs_column_wrapper({names_column, ages_column, list}, struct_validity.begin());
 
@@ -1120,7 +1121,7 @@ void split_nested_struct_of_list(SplitFunc Split, CompareFunc Compare, bool spli
     std::vector<cudf::test::lists_column_wrapper<float>> expected_lists;
     expected_lists.push_back(LCW({{{1, 2, 3}, {4}}}));
     expected_lists.push_back(LCW({{{-1, -2}, LCW{}}, LCW{}}));
-    std::vector<bool> ex_v{1, 1, 0, 1, 0};
+    std::vector<bool> ex_v{true, true, false, true, false};
     expected_lists.push_back(LCW({{{10}, {20, 30, 40}, {100, -100}},
                                   {LCW{}, LCW{}, {8, 9}},
                                   LCW{},
@@ -1169,17 +1170,68 @@ void split_nested_list_of_structs(SplitFunc Split, CompareFunc Compare, bool spl
                                  "Mark",
                                  "Herman",
                                  "Will"};
-  std::vector<bool> names_validity{1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+  std::vector<bool> names_validity{true,
+                                   true,
+                                   true,
+                                   true,
+                                   true,
+                                   true,
+                                   true,
+                                   true,
+                                   true,
+                                   true,
+                                   true,
+                                   true,
+                                   true,
+                                   true,
+                                   true,
+                                   true,
+                                   true,
+                                   true};
   cudf::test::strings_column_wrapper names_column(names.begin(), names.end());
 
   // 2. Numeric "ages" column.
   std::vector<int> ages{5, 10, 15, 20, 25, 30, 100, 101, 102, 26, 64, 12, 17, 16, 120, 44, 23, 50};
-  std::vector<bool> ages_validity = {1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0};
+  std::vector<bool> ages_validity = {true,
+                                     true,
+                                     true,
+                                     true,
+                                     false,
+                                     true,
+                                     false,
+                                     false,
+                                     true,
+                                     true,
+                                     true,
+                                     false,
+                                     false,
+                                     false,
+                                     true,
+                                     true,
+                                     true,
+                                     false};
   auto ages_column =
     cudf::test::fixed_width_column_wrapper<int>(ages.begin(), ages.end(), ages_validity.begin());
 
   // 3. List column
-  std::vector<bool> list_validity{1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1};
+  std::vector<bool> list_validity{true,
+                                  true,
+                                  true,
+                                  true,
+                                  true,
+                                  false,
+                                  true,
+                                  false,
+                                  true,
+                                  true,
+                                  true,
+                                  true,
+                                  true,
+                                  true,
+                                  true,
+                                  false,
+                                  true,
+                                  true};
   cudf::test::lists_column_wrapper<cudf::string_view> list(
     {{"ab", "cd", "ef"},
      LCW{"gh"},
@@ -1202,8 +1254,24 @@ void split_nested_list_of_structs(SplitFunc Split, CompareFunc Compare, bool spl
     list_validity.begin());
 
   // Assembly struct column
-  auto const struct_validity =
-    std::vector<bool>{1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1};
+  auto const struct_validity = std::vector<bool>{true,
+                                                 true,
+                                                 true,
+                                                 true,
+                                                 true,
+                                                 false,
+                                                 false,
+                                                 true,
+                                                 false,
+                                                 false,
+                                                 false,
+                                                 false,
+                                                 true,
+                                                 true,
+                                                 true,
+                                                 true,
+                                                 false,
+                                                 true};
   auto struct_column =
     cudf::test::structs_column_wrapper({names_column, ages_column, list}, struct_validity.begin());
 
@@ -1211,7 +1279,7 @@ void split_nested_list_of_structs(SplitFunc Split, CompareFunc Compare, bool spl
   std::vector<int> outer_offsets{0, 3, 4, 8, 13, 16, 17, 18};
   cudf::test::fixed_width_column_wrapper<int> outer_offsets_col(outer_offsets.begin(),
                                                                 outer_offsets.end());
-  std::vector<bool> outer_validity{1, 1, 1, 0, 1, 1, 0};
+  std::vector<bool> outer_validity{true, true, true, false, true, true, false};
   auto [outer_null_mask, outer_null_count] =
     cudf::test::detail::make_null_mask(outer_validity.begin(), outer_validity.end());
   auto outer_list = make_lists_column(static_cast<cudf::size_type>(outer_validity.size()),
@@ -1713,8 +1781,8 @@ TEST_F(ContiguousSplitStringTableTest, EmptyInputColumn)
     auto result = cudf::contiguous_split(src_table, splits);
     ASSERT_EQ(result.size(), 5);
 
-    for (size_t idx = 0; idx < result.size(); idx++) {
-      CUDF_TEST_EXPECT_TABLES_EQUIVALENT(src_table, result[idx].table);
+    for (auto& idx : result) {
+      CUDF_TEST_EXPECT_TABLES_EQUIVALENT(src_table, idx.table);
     }
   }
 }
@@ -1953,7 +2021,7 @@ TEST_F(ContiguousSplitTableCornerCases, PreSplitTable)
 
   cudf::test::lists_column_wrapper<int> col0{{{1, 2, 3}, {4, 5}},
                                              {{LCW{}, LCW{}, {7, 8}, LCW{}}, valids},
-                                             {{{6}}},
+                                             {{{6}}},  // NOLINT
                                              {{{7, 8}, LCW{}, {{9, 10, 11}, valids}}, valids},
                                              {{{-1, -2, -3, -4, -5}, LCW{}}, valids},
                                              {LCW{}},
@@ -2098,12 +2166,13 @@ TEST_F(ContiguousSplitTableCornerCases, PreSplitStructs)
   // includes struct<list>
   {
     cudf::test::fixed_width_column_wrapper<int> a{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
-    cudf::test::fixed_width_column_wrapper<float> b{{0, -1, -2, -3, -4, -5, -6, -7, -8, -9},
-                                                    {1, 1, 1, 0, 0, 0, 0, 1, 1, 1}};
+    cudf::test::fixed_width_column_wrapper<float> b{
+      {0, -1, -2, -3, -4, -5, -6, -7, -8, -9},
+      {true, true, true, false, false, false, false, true, true, true}};
     cudf::test::strings_column_wrapper c{
       {"abc", "def", "ghi", "jkl", "mno", "", "st", "uvwx", "yy", "zzzz"},
-      {0, 0, 1, 1, 1, 1, 1, 1, 1, 1}};
-    std::vector<bool> list_validity{1, 0, 1, 0, 1, 0, 1, 1, 1, 1};
+      {false, false, true, true, true, true, true, true, true, true}};
+    std::vector<bool> list_validity{true, false, true, false, true, false, true, true, true, true};
     cudf::test::lists_column_wrapper<int16_t> d{
       {{0, 1}, {2, 3, 4}, {5, 6}, {7}, {8, 9, 10}, {11, 12}, {}, {15, 16, 17}, {18, 19}, {20}},
       list_validity.begin()};
@@ -2112,8 +2181,10 @@ TEST_F(ContiguousSplitTableCornerCases, PreSplitStructs)
       -10, -20, -30, -40, -50, -60, -70, -80, -90, -100};
     cudf::test::strings_column_wrapper _c{
       "aa", "", "ccc", "dddd", "eeeee", "f", "gg", "hhh", "i", "jjj"};
-    cudf::test::structs_column_wrapper e({_a, _b, _c}, {1, 1, 1, 0, 1, 1, 1, 0, 1, 1});
-    cudf::test::structs_column_wrapper s({a, b, c, d, e}, {1, 1, 0, 1, 1, 1, 1, 1, 1, 1});
+    cudf::test::structs_column_wrapper e(
+      {_a, _b, _c}, {true, true, true, false, true, true, true, false, true, true});
+    cudf::test::structs_column_wrapper s(
+      {a, b, c, d, e}, {true, true, false, true, true, true, true, true, true, true});
 
     auto pre_split = cudf::split(s, {4});
 
diff --git a/cpp/tests/dictionary/decode_test.cpp b/cpp/tests/dictionary/decode_test.cpp
index 33c8cb23110..84b6c728e4b 100644
--- a/cpp/tests/dictionary/decode_test.cpp
+++ b/cpp/tests/dictionary/decode_test.cpp
@@ -48,8 +48,9 @@ TEST_F(DictionaryDecodeTest, FloatColumn)
 
 TEST_F(DictionaryDecodeTest, ColumnWithNull)
 {
-  cudf::test::fixed_width_column_wrapper<int64_t> input{{444, 0, 333, 111, 222, 222, 222, 444, 000},
-                                                        {1, 1, 1, 1, 1, 0, 1, 1, 1}};
+  cudf::test::fixed_width_column_wrapper<int64_t> input{
+    {444, 0, 333, 111, 222, 222, 222, 444, 000},
+    {true, true, true, true, true, false, true, true, true}};
 
   auto dictionary = cudf::dictionary::encode(input);
   auto output     = cudf::dictionary::decode(cudf::dictionary_column_view(dictionary->view()));
diff --git a/cpp/tests/dictionary/encode_test.cpp b/cpp/tests/dictionary/encode_test.cpp
index 93c2ab4c0ef..5db0e9fa1e4 100644
--- a/cpp/tests/dictionary/encode_test.cpp
+++ b/cpp/tests/dictionary/encode_test.cpp
@@ -56,8 +56,9 @@ TEST_F(DictionaryEncodeTest, EncodeFloat)
 
 TEST_F(DictionaryEncodeTest, EncodeWithNull)
 {
-  cudf::test::fixed_width_column_wrapper<int64_t> input{{444, 0, 333, 111, 222, 222, 222, 444, 000},
-                                                        {1, 1, 1, 1, 1, 0, 1, 1, 1}};
+  cudf::test::fixed_width_column_wrapper<int64_t> input{
+    {444, 0, 333, 111, 222, 222, 222, 444, 000},
+    {true, true, true, true, true, false, true, true, true}};
 
   auto dictionary = cudf::dictionary::encode(input);
   cudf::dictionary_column_view view(dictionary->view());
diff --git a/cpp/tests/dictionary/factories_test.cpp b/cpp/tests/dictionary/factories_test.cpp
index 35aa19c5558..051ea45aed6 100644
--- a/cpp/tests/dictionary/factories_test.cpp
+++ b/cpp/tests/dictionary/factories_test.cpp
@@ -96,7 +96,8 @@ TEST_F(DictionaryFactoriesTest, ColumnsWithNulls)
 
 TEST_F(DictionaryFactoriesTest, KeysWithNulls)
 {
-  cudf::test::fixed_width_column_wrapper<int32_t> keys{{0, 1, 2, 3, 4}, {1, 1, 1, 0, 1}};
+  cudf::test::fixed_width_column_wrapper<int32_t> keys{{0, 1, 2, 3, 4},
+                                                       {true, true, true, false, true}};
   cudf::test::fixed_width_column_wrapper<uint32_t> indices{5, 4, 3, 2, 1, 0};
   EXPECT_THROW(cudf::make_dictionary_column(keys, indices), cudf::logic_error);
 }
@@ -104,7 +105,8 @@ TEST_F(DictionaryFactoriesTest, KeysWithNulls)
 TEST_F(DictionaryFactoriesTest, IndicesWithNulls)
 {
   cudf::test::fixed_width_column_wrapper<int32_t> keys{0, 1, 2, 3, 4};
-  cudf::test::fixed_width_column_wrapper<uint32_t> indices{{5, 4, 3, 2, 1, 0}, {1, 1, 1, 0, 1, 0}};
+  cudf::test::fixed_width_column_wrapper<uint32_t> indices{{5, 4, 3, 2, 1, 0},
+                                                           {true, true, true, false, true, false}};
   EXPECT_THROW(
     cudf::make_dictionary_column(keys.release(), indices.release(), rmm::device_buffer{}, 0),
     cudf::logic_error);
diff --git a/cpp/tests/dictionary/fill_test.cpp b/cpp/tests/dictionary/fill_test.cpp
index 7f2bb5496f3..18696b66e48 100644
--- a/cpp/tests/dictionary/fill_test.cpp
+++ b/cpp/tests/dictionary/fill_test.cpp
@@ -42,25 +42,27 @@ TEST_F(DictionaryFillTest, StringsColumn)
 
 TEST_F(DictionaryFillTest, WithNulls)
 {
-  cudf::test::fixed_width_column_wrapper<int64_t> input({9, 8, 7, 6, 4}, {0, 1, 1, 0, 1});
+  cudf::test::fixed_width_column_wrapper<int64_t> input({9, 8, 7, 6, 4},
+                                                        {false, true, true, false, true});
   auto dictionary = cudf::dictionary::encode(input);
   cudf::numeric_scalar<int64_t> fv(-10);
   auto results = cudf::fill(dictionary->view(), 0, 2, fv);
   auto decoded = cudf::dictionary::decode(results->view());
-  cudf::test::fixed_width_column_wrapper<int64_t> expected({-10, -10, 7, 6, 4}, {1, 1, 1, 0, 1});
+  cudf::test::fixed_width_column_wrapper<int64_t> expected({-10, -10, 7, 6, 4},
+                                                           {true, true, true, false, true});
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(decoded->view(), expected);
 }
 
 TEST_F(DictionaryFillTest, FillWithNull)
 {
   cudf::test::fixed_width_column_wrapper<double> input({1.2, 8.5, 7.75, 6.25, 4.125},
-                                                       {1, 1, 1, 0, 1});
+                                                       {true, true, true, false, true});
   auto dictionary = cudf::dictionary::encode(input);
   cudf::numeric_scalar<double> fv(0, false);
   auto results = cudf::fill(dictionary->view(), 1, 3, fv);
   auto decoded = cudf::dictionary::decode(results->view());
   cudf::test::fixed_width_column_wrapper<double> expected({1.2, 0.0, 0.0, 0.0, 4.125},
-                                                          {1, 0, 0, 0, 1});
+                                                          {true, false, false, false, true});
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(decoded->view(), expected);
 }
 
diff --git a/cpp/tests/dictionary/gather_test.cpp b/cpp/tests/dictionary/gather_test.cpp
index 8fd8751bc76..71e3a2adaa1 100644
--- a/cpp/tests/dictionary/gather_test.cpp
+++ b/cpp/tests/dictionary/gather_test.cpp
@@ -45,7 +45,8 @@ TEST_F(DictionaryGatherTest, Gather)
 
 TEST_F(DictionaryGatherTest, GatherWithNulls)
 {
-  cudf::test::fixed_width_column_wrapper<int64_t> data{{1, 5, 5, 3, 7, 1}, {0, 1, 0, 1, 1, 1}};
+  cudf::test::fixed_width_column_wrapper<int64_t> data{{1, 5, 5, 3, 7, 1},
+                                                       {false, true, false, true, true, true}};
 
   auto dictionary = cudf::dictionary::encode(data);
   cudf::dictionary_column_view view(dictionary->view());
@@ -54,7 +55,7 @@ TEST_F(DictionaryGatherTest, GatherWithNulls)
   auto table_result = cudf::gather(cudf::table_view{{dictionary->view()}}, gather_map);
   auto result       = cudf::dictionary_column_view(table_result->view().column(0));
 
-  cudf::test::fixed_width_column_wrapper<int64_t> expected{{7, 5, 5, 7}, {1, 1, 0, 1}};
+  cudf::test::fixed_width_column_wrapper<int64_t> expected{{7, 5, 5, 7}, {true, true, false, true}};
   auto result_decoded = cudf::dictionary::decode(result);
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result_decoded->view());
 }
diff --git a/cpp/tests/dictionary/remove_keys_test.cpp b/cpp/tests/dictionary/remove_keys_test.cpp
index 9950a39d630..7067201ba5e 100644
--- a/cpp/tests/dictionary/remove_keys_test.cpp
+++ b/cpp/tests/dictionary/remove_keys_test.cpp
@@ -74,7 +74,7 @@ TEST_F(DictionaryRemoveKeysTest, FloatColumn)
       cudf::dictionary::remove_keys(cudf::dictionary_column_view(dictionary->view()), del_keys);
     auto const decoded = cudf::dictionary::decode(result->view());
     cudf::test::fixed_width_column_wrapper<float> expected{{0., 7.125, 0.5, 0., 7.125, 0.5},
-                                                           {0, 1, 1, 0, 1, 1}};
+                                                           {false, true, true, false, true, true}};
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(decoded->view(), expected);
   }
   {
@@ -90,8 +90,9 @@ TEST_F(DictionaryRemoveKeysTest, FloatColumn)
 
 TEST_F(DictionaryRemoveKeysTest, WithNull)
 {
-  cudf::test::fixed_width_column_wrapper<int64_t> input{{444, 0, 333, 111, 222, 222, 222, 444, 0},
-                                                        {1, 1, 1, 1, 1, 0, 1, 1, 1}};
+  cudf::test::fixed_width_column_wrapper<int64_t> input{
+    {444, 0, 333, 111, 222, 222, 222, 444, 0},
+    {true, true, true, true, true, false, true, true, true}};
   cudf::test::fixed_width_column_wrapper<int64_t> del_keys{0, 111, 777};
 
   auto const dictionary = cudf::dictionary::encode(input);
@@ -99,8 +100,9 @@ TEST_F(DictionaryRemoveKeysTest, WithNull)
     auto const result =
       cudf::dictionary::remove_keys(cudf::dictionary_column_view(dictionary->view()), del_keys);
     auto const decoded = cudf::dictionary::decode(result->view());
-    cudf::test::fixed_width_column_wrapper<int64_t> expected{{444, 0, 333, 0, 222, 0, 222, 444, 0},
-                                                             {1, 0, 1, 0, 1, 0, 1, 1, 0}};
+    cudf::test::fixed_width_column_wrapper<int64_t> expected{
+      {444, 0, 333, 0, 222, 0, 222, 444, 0},
+      {true, false, true, false, true, false, true, true, false}};
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(decoded->view(), expected);
   }
   {
@@ -121,6 +123,6 @@ TEST_F(DictionaryRemoveKeysTest, Errors)
 
   cudf::test::fixed_width_column_wrapper<float> del_keys{1.0, 2.0, 3.0};
   EXPECT_THROW(cudf::dictionary::remove_keys(dictionary->view(), del_keys), cudf::data_type_error);
-  cudf::test::fixed_width_column_wrapper<int64_t> null_keys{{1, 2, 3}, {1, 0, 1}};
+  cudf::test::fixed_width_column_wrapper<int64_t> null_keys{{1, 2, 3}, {true, false, true}};
   EXPECT_THROW(cudf::dictionary::remove_keys(dictionary->view(), null_keys), cudf::logic_error);
 }
diff --git a/cpp/tests/dictionary/scatter_test.cpp b/cpp/tests/dictionary/scatter_test.cpp
index 2f77f4ee621..59279e0f0cd 100644
--- a/cpp/tests/dictionary/scatter_test.cpp
+++ b/cpp/tests/dictionary/scatter_test.cpp
@@ -90,10 +90,11 @@ TEST_F(DictionaryScatterTest, ScatterScalar)
 
 TEST_F(DictionaryScatterTest, WithNulls)
 {
-  cudf::test::fixed_width_column_wrapper<int64_t> data_source{{1, 5, 7, 9}, {0, 1, 1, 1}};
+  cudf::test::fixed_width_column_wrapper<int64_t> data_source{{1, 5, 7, 9},
+                                                              {false, true, true, true}};
   auto source = cudf::dictionary::encode(data_source);
-  cudf::test::fixed_width_column_wrapper<int64_t> data_target{{1, 5, 5, 3, 7, 1, 4, 2},
-                                                              {0, 1, 0, 1, 1, 1, 1, 1}};
+  cudf::test::fixed_width_column_wrapper<int64_t> data_target{
+    {1, 5, 5, 3, 7, 1, 4, 2}, {false, true, false, true, true, true, true, true}};
   auto target = cudf::dictionary::encode(data_target);
 
   cudf::test::fixed_width_column_wrapper<int32_t> scatter_map{7, 2, 3, 1};
@@ -104,15 +105,15 @@ TEST_F(DictionaryScatterTest, WithNulls)
   auto decoded =
     cudf::dictionary::decode(cudf::dictionary_column_view(table_result.front()->view()));
 
-  cudf::test::fixed_width_column_wrapper<int64_t> expected{{1, 9, 5, 7, 7, 1, 4, 1},
-                                                           {0, 1, 1, 1, 1, 1, 1, 0}};
+  cudf::test::fixed_width_column_wrapper<int64_t> expected{
+    {1, 9, 5, 7, 7, 1, 4, 1}, {false, true, true, true, true, true, true, false}};
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, decoded->view());
 }
 
 TEST_F(DictionaryScatterTest, ScalarWithNulls)
 {
-  cudf::test::fixed_width_column_wrapper<int64_t> data_target{{1, 5, 5, 3, 7, 1, 4, 2},
-                                                              {0, 1, 0, 1, 1, 1, 1, 1}};
+  cudf::test::fixed_width_column_wrapper<int64_t> data_target{
+    {1, 5, 5, 3, 7, 1, 4, 2}, {false, true, false, true, true, true, true, true}};
   auto target = cudf::dictionary::encode(data_target);
   std::vector<std::reference_wrapper<const cudf::scalar>> source;
   const cudf::numeric_scalar<int64_t> source_slr = cudf::test::make_type_param_scalar<int64_t>(100);
@@ -126,8 +127,8 @@ TEST_F(DictionaryScatterTest, ScalarWithNulls)
   auto decoded =
     cudf::dictionary::decode(cudf::dictionary_column_view(table_result.front()->view()));
 
-  cudf::test::fixed_width_column_wrapper<int64_t> expected{{1, 100, 100, 100, 7, 100, 4, 100},
-                                                           {0, 1, 1, 1, 1, 1, 1, 1}};
+  cudf::test::fixed_width_column_wrapper<int64_t> expected{
+    {1, 100, 100, 100, 7, 100, 4, 100}, {false, true, true, true, true, true, true, true}};
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, decoded->view());
 }
 
diff --git a/cpp/tests/dictionary/search_test.cpp b/cpp/tests/dictionary/search_test.cpp
index b49b4ce5aa0..1b73576e083 100644
--- a/cpp/tests/dictionary/search_test.cpp
+++ b/cpp/tests/dictionary/search_test.cpp
@@ -26,7 +26,8 @@ struct DictionarySearchTest : public cudf::test::BaseFixture {};
 TEST_F(DictionarySearchTest, StringsColumn)
 {
   cudf::test::dictionary_column_wrapper<std::string> dictionary(
-    {"fff", "aaa", "ddd", "bbb", "ccc", "ccc", "ccc", "", ""}, {1, 1, 1, 1, 1, 1, 1, 1, 0});
+    {"fff", "aaa", "ddd", "bbb", "ccc", "ccc", "ccc", "", ""},
+    {true, true, true, true, true, true, true, true, false});
 
   auto result = cudf::dictionary::get_index(dictionary, cudf::string_scalar("ccc"));
   EXPECT_TRUE(result->is_valid());
@@ -45,7 +46,8 @@ TEST_F(DictionarySearchTest, StringsColumn)
 
 TEST_F(DictionarySearchTest, WithNulls)
 {
-  cudf::test::dictionary_column_wrapper<int64_t> dictionary({9, 8, 7, 6, 4}, {0, 1, 1, 0, 1});
+  cudf::test::dictionary_column_wrapper<int64_t> dictionary({9, 8, 7, 6, 4},
+                                                            {false, true, true, false, true});
 
   auto result = cudf::dictionary::get_index(dictionary, cudf::numeric_scalar<int64_t>(4));
   EXPECT_TRUE(result->is_valid());
diff --git a/cpp/tests/dictionary/set_keys_test.cpp b/cpp/tests/dictionary/set_keys_test.cpp
index 5c9ec3567fe..62bd9e00584 100644
--- a/cpp/tests/dictionary/set_keys_test.cpp
+++ b/cpp/tests/dictionary/set_keys_test.cpp
@@ -57,22 +57,24 @@ TEST_F(DictionarySetKeysTest, FloatKeys)
   auto result = cudf::dictionary::set_keys(dictionary->view(), new_keys);
 
   cudf::test::fixed_width_column_wrapper<float> expected{{4.25, 7.125, 0.5, 0., 7.125, 0.5},
-                                                         {1, 1, 1, 0, 1, 1}};
+                                                         {true, true, true, false, true, true}};
   auto decoded = cudf::dictionary::decode(result->view());
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*decoded, expected);
 }
 
 TEST_F(DictionarySetKeysTest, WithNulls)
 {
-  cudf::test::fixed_width_column_wrapper<int64_t> input{{444, 0, 333, 111, 222, 222, 222, 444, 0},
-                                                        {1, 1, 1, 1, 1, 0, 1, 1, 1}};
+  cudf::test::fixed_width_column_wrapper<int64_t> input{
+    {444, 0, 333, 111, 222, 222, 222, 444, 0},
+    {true, true, true, true, true, false, true, true, true}};
   auto dictionary = cudf::dictionary::encode(input);
 
   cudf::test::fixed_width_column_wrapper<int64_t> new_keys{0, 222, 333, 444};
   auto result = cudf::dictionary::set_keys(dictionary->view(), new_keys);
 
   cudf::test::fixed_width_column_wrapper<int64_t> expected{
-    {444, 0, 333, 111, 222, 222, 222, 444, 0}, {1, 1, 1, 0, 1, 0, 1, 1, 1}};
+    {444, 0, 333, 111, 222, 222, 222, 444, 0},
+    {true, true, true, false, true, false, true, true, true}};
   auto decoded = cudf::dictionary::decode(result->view());
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*decoded, expected);
 }
@@ -84,7 +86,7 @@ TEST_F(DictionarySetKeysTest, Errors)
 
   cudf::test::fixed_width_column_wrapper<float> new_keys{1.0, 2.0, 3.0};
   EXPECT_THROW(cudf::dictionary::set_keys(dictionary->view(), new_keys), cudf::data_type_error);
-  cudf::test::fixed_width_column_wrapper<int64_t> null_keys{{1, 2, 3}, {1, 0, 1}};
+  cudf::test::fixed_width_column_wrapper<int64_t> null_keys{{1, 2, 3}, {true, false, true}};
   EXPECT_THROW(cudf::dictionary::set_keys(dictionary->view(), null_keys), cudf::logic_error);
 }
 
diff --git a/cpp/tests/dictionary/slice_test.cpp b/cpp/tests/dictionary/slice_test.cpp
index 42bf7d488d2..d80f8dee079 100644
--- a/cpp/tests/dictionary/slice_test.cpp
+++ b/cpp/tests/dictionary/slice_test.cpp
@@ -31,14 +31,16 @@ struct DictionarySliceTest : public cudf::test::BaseFixture {};
 TEST_F(DictionarySliceTest, SliceColumn)
 {
   cudf::test::strings_column_wrapper strings{
-    {"eee", "aaa", "ddd", "bbb", "ccc", "", "ccc", "eee", "aaa"}, {1, 1, 1, 1, 1, 0, 1, 1, 1}};
+    {"eee", "aaa", "ddd", "bbb", "ccc", "", "ccc", "eee", "aaa"},
+    {true, true, true, true, true, false, true, true, true}};
   auto dictionary = cudf::dictionary::encode(strings);
 
   std::vector<cudf::size_type> splits{1, 6};
   auto result = cudf::slice(dictionary->view(), splits);
 
   auto output = cudf::dictionary::decode(cudf::dictionary_column_view(result.front()));
-  cudf::test::strings_column_wrapper expected{{"aaa", "ddd", "bbb", "ccc", ""}, {1, 1, 1, 1, 0}};
+  cudf::test::strings_column_wrapper expected{{"aaa", "ddd", "bbb", "ccc", ""},
+                                              {true, true, true, true, false}};
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *output);
 
   {
@@ -69,21 +71,22 @@ TEST_F(DictionarySliceTest, SliceColumn)
 TEST_F(DictionarySliceTest, SplitColumn)
 {
   cudf::test::fixed_width_column_wrapper<float> input{{4.25, 7.125, 0.5, 0., -11.75, 7.125, 0.5},
-                                                      {1, 1, 1, 0, 1, 1, 1}};
+                                                      {true, true, true, false, true, true, true}};
   auto dictionary = cudf::dictionary::encode(input);
 
   std::vector<cudf::size_type> splits{2, 6};
   auto results = cudf::split(dictionary->view(), splits);
 
-  cudf::test::fixed_width_column_wrapper<float> expected1{{4.25, 7.125}, {1, 1}};
+  cudf::test::fixed_width_column_wrapper<float> expected1{{4.25, 7.125}, {true, true}};
   auto output1 = cudf::dictionary::decode(cudf::dictionary_column_view(results[0]));
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected1, output1->view());
 
-  cudf::test::fixed_width_column_wrapper<float> expected2{{0.5, 0., -11.75, 7.125}, {1, 0, 1, 1}};
+  cudf::test::fixed_width_column_wrapper<float> expected2{{0.5, 0., -11.75, 7.125},
+                                                          {true, false, true, true}};
   auto output2 = cudf::dictionary::decode(cudf::dictionary_column_view(results[1]));
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, output2->view());
 
-  cudf::test::fixed_width_column_wrapper<float> expected3({0.5}, {1});
+  cudf::test::fixed_width_column_wrapper<float> expected3({0.5}, {true});
   auto output3 = cudf::dictionary::decode(cudf::dictionary_column_view(results[2]));
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected3, output3->view());
 }
diff --git a/cpp/tests/groupby/argmax_tests.cpp b/cpp/tests/groupby/argmax_tests.cpp
index f9d034ad0c7..6b45d460cd6 100644
--- a/cpp/tests/groupby/argmax_tests.cpp
+++ b/cpp/tests/groupby/argmax_tests.cpp
@@ -97,8 +97,9 @@ TYPED_TEST(groupby_argmax_test, null_keys_and_values)
 
   if (std::is_same_v<V, bool>) return;
 
-  cudf::test::fixed_width_column_wrapper<K> keys({1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4},
-                                                 {1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1});
+  cudf::test::fixed_width_column_wrapper<K> keys(
+    {1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4},
+    {true, true, false, true, true, true, true, true, true, true, true});
   cudf::test::fixed_width_column_wrapper<V> vals({9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 4},
                                                  {0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0});
 
diff --git a/cpp/tests/groupby/argmin_tests.cpp b/cpp/tests/groupby/argmin_tests.cpp
index 0211bc5fb43..c3d7360a072 100644
--- a/cpp/tests/groupby/argmin_tests.cpp
+++ b/cpp/tests/groupby/argmin_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -98,8 +98,9 @@ TYPED_TEST(groupby_argmin_test, null_keys_and_values)
 
   if (std::is_same_v<V, bool>) return;
 
-  cudf::test::fixed_width_column_wrapper<K> keys({1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4},
-                                                 {1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1});
+  cudf::test::fixed_width_column_wrapper<K> keys(
+    {1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4},
+    {true, true, true, true, true, true, true, false, true, true, true});
   cudf::test::fixed_width_column_wrapper<V> vals({9, 8, 7, 6, 5, 4, 3, 2, 1, 0, 4},
                                                  {1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0});
 
diff --git a/cpp/tests/groupby/collect_set_tests.cpp b/cpp/tests/groupby/collect_set_tests.cpp
index 518fec65f61..61d2838590b 100644
--- a/cpp/tests/groupby/collect_set_tests.cpp
+++ b/cpp/tests/groupby/collect_set_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -39,7 +39,7 @@ auto groupby_collect_set(cudf::column_view const& keys,
                          std::unique_ptr<cudf::groupby_aggregation>&& agg)
 {
   std::vector<cudf::groupby::aggregation_request> requests;
-  requests.emplace_back(cudf::groupby::aggregation_request());
+  requests.emplace_back();
   requests[0].values = values;
   requests[0].aggregations.emplace_back(std::move(agg));
 
diff --git a/cpp/tests/groupby/correlation_tests.cpp b/cpp/tests/groupby/correlation_tests.cpp
index 399ff9f51a6..26f714632dd 100644
--- a/cpp/tests/groupby/correlation_tests.cpp
+++ b/cpp/tests/groupby/correlation_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -119,7 +119,7 @@ TYPED_TEST(groupby_correlation_test, null_keys_and_values)
 
   // clang-format off
   cudf::test::fixed_width_column_wrapper<K> keys({1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4},
-                                     {1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1});
+                                     {true, true, true, true, true, true, true, false, true, true, true});
   cudf::test::fixed_width_column_wrapper<V> val0({9, 1, 1, 2, 2, 3, 3,-1, 1, 4, 4},
                                      {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
   cudf::test::fixed_width_column_wrapper<V> val1({1, 1, 1, 2, 0, 3, 3,-1, 0, 2, 2});
@@ -143,7 +143,7 @@ TYPED_TEST(groupby_correlation_test, null_values_same)
 
   // clang-format off
   cudf::test::fixed_width_column_wrapper<K> keys({1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4},
-                                     {1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1});
+                                     {true, true, true, true, true, true, true, false, true, true, true});
   cudf::test::fixed_width_column_wrapper<V> val0({9, 1, 1, 2, 2, 3, 3,-1, 1, 4, 4},
                                      {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0});
   cudf::test::fixed_width_column_wrapper<V> val1({1, 1, 1, 2, 0, 3, 3,-1, 0, 2, 2},
@@ -172,7 +172,7 @@ TYPED_TEST(groupby_correlation_test, null_values_different)
 
   // clang-format off
   cudf::test::fixed_width_column_wrapper<K> keys({1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4},
-                                     {1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1});
+                                     {true, true, true, true, true, true, true, false, true, true, true});
   cudf::test::fixed_width_column_wrapper<V> val0({9, 1, 1, 2, 2, 3, 3,-1, 1, 4, 4},
                                      {0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1});
   cudf::test::fixed_width_column_wrapper<V> val1({1, 2, 1, 2,-1, 6, 3,-1, 0, 1, 2},
diff --git a/cpp/tests/groupby/count_scan_tests.cpp b/cpp/tests/groupby/count_scan_tests.cpp
index fb80989e8ed..b694d3514b6 100644
--- a/cpp/tests/groupby/count_scan_tests.cpp
+++ b/cpp/tests/groupby/count_scan_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -112,7 +112,7 @@ TYPED_TEST(groupby_count_scan_test, null_keys_and_values)
   using result_wrapper = typename TestFixture::result_wrapper;
 
   // clang-format off
-  key_wrapper keys(  {1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4}, {1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1});
+  key_wrapper keys(  {1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4}, {true, true, true, true, true, true, true, false, true, true, true});
   value_wrapper vals({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4}, {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0});
 
   //                        {1, 1, 1, 2, 2, 2, 2, 3, _, 3, 4}
diff --git a/cpp/tests/groupby/count_tests.cpp b/cpp/tests/groupby/count_tests.cpp
index 4f0fdd53a4d..9ed6c11f266 100644
--- a/cpp/tests/groupby/count_tests.cpp
+++ b/cpp/tests/groupby/count_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -116,8 +116,9 @@ TYPED_TEST(groupby_count_test, null_keys_and_values)
   using V = TypeParam;
   using R = cudf::detail::target_type_t<V, cudf::aggregation::COUNT_VALID>;
 
-  cudf::test::fixed_width_column_wrapper<K> keys({1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4},
-                                                 {1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1});
+  cudf::test::fixed_width_column_wrapper<K> keys(
+    {1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4},
+    {true, true, true, true, true, true, true, false, true, true, true});
   cudf::test::fixed_width_column_wrapper<V> vals({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4},
                                                  {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0});
 
diff --git a/cpp/tests/groupby/covariance_tests.cpp b/cpp/tests/groupby/covariance_tests.cpp
index 96001172e00..e3eb2da201f 100644
--- a/cpp/tests/groupby/covariance_tests.cpp
+++ b/cpp/tests/groupby/covariance_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -129,7 +129,7 @@ TYPED_TEST(groupby_covariance_test, null_keys_and_values)
 
   // clang-format off
   cudf::test::fixed_width_column_wrapper<K> keys({1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4},
-                                     {1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1});
+                                     {true, true, true, true, true, true, true, false, true, true, true});
   cudf::test::fixed_width_column_wrapper<V> val0({9, 1, 1, 2, 2, 3, 3,-1, 1, 4, 4},
                                      {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
   cudf::test::fixed_width_column_wrapper<V> val1({1, 1, 1, 2, 0, 3, 3,-1, 0, 2, 2});
@@ -150,7 +150,7 @@ TYPED_TEST(groupby_covariance_test, null_values_same)
 
   // clang-format off
   cudf::test::fixed_width_column_wrapper<K> keys({1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4},
-                                     {1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1});
+                                     {true, true, true, true, true, true, true, false, true, true, true});
   cudf::test::fixed_width_column_wrapper<V> val0({9, 1, 1, 2, 2, 3, 3,-1, 1, 4, 4},
                                      {0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0});
   cudf::test::fixed_width_column_wrapper<V> val1({1, 1, 1, 2, 0, 3, 3,-1, 0, 2, 2},
@@ -172,7 +172,7 @@ TYPED_TEST(groupby_covariance_test, null_values_different)
 
   // clang-format off
   cudf::test::fixed_width_column_wrapper<K> keys({1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4},
-                                     {1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1});
+                                     {true, true, true, true, true, true, true, false, true, true, true});
   cudf::test::fixed_width_column_wrapper<V> val0({9, 1, 1, 2, 2, 3, 3,-1, 1, 4, 4},
                                      {0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1});
   cudf::test::fixed_width_column_wrapper<V> val1({1, 2, 1, 2,-1, 3, 3,-1, 0, 4, 2},
diff --git a/cpp/tests/groupby/groupby_test_util.cpp b/cpp/tests/groupby/groupby_test_util.cpp
index 8bd109fca53..5d99d15ae77 100644
--- a/cpp/tests/groupby/groupby_test_util.cpp
+++ b/cpp/tests/groupby/groupby_test_util.cpp
@@ -58,7 +58,7 @@ void test_single_agg(cudf::column_view const& keys,
   }();
 
   std::vector<cudf::groupby::aggregation_request> requests;
-  requests.emplace_back(cudf::groupby::aggregation_request());
+  requests.emplace_back();
   requests[0].values = values;
 
   requests[0].aggregations.push_back(std::move(agg));
@@ -126,7 +126,7 @@ void test_single_scan(cudf::column_view const& keys,
                       std::vector<cudf::null_order> const& null_precedence)
 {
   std::vector<cudf::groupby::scan_request> requests;
-  requests.emplace_back(cudf::groupby::scan_request());
+  requests.emplace_back();
   requests[0].values = values;
 
   requests[0].aggregations.push_back(std::move(agg));
diff --git a/cpp/tests/groupby/groups_tests.cpp b/cpp/tests/groupby/groups_tests.cpp
index fb471e3a03e..f3d303c0fb8 100644
--- a/cpp/tests/groupby/groups_tests.cpp
+++ b/cpp/tests/groupby/groups_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -108,7 +108,8 @@ TYPED_TEST(groupby_group_keys_and_values_test, some_nulls)
   using K = int32_t;
   using V = TypeParam;
 
-  cudf::test::fixed_width_column_wrapper<K> keys({1, 1, 3, 2, 1, 2}, {1, 0, 1, 0, 0, 1});
+  cudf::test::fixed_width_column_wrapper<K> keys({1, 1, 3, 2, 1, 2},
+                                                 {true, false, true, false, false, true});
   cudf::test::fixed_width_column_wrapper<K> expect_grouped_keys({1, 2, 3},
                                                                 cudf::test::iterators::no_nulls());
   cudf::test::fixed_width_column_wrapper<V> values({1, 2, 3, 4, 5, 6});
diff --git a/cpp/tests/groupby/keys_tests.cpp b/cpp/tests/groupby/keys_tests.cpp
index 7064abc459d..7a8e64dc61d 100644
--- a/cpp/tests/groupby/keys_tests.cpp
+++ b/cpp/tests/groupby/keys_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -374,7 +374,7 @@ TEST_F(groupby_cache_test, duplicate_agggregations)
   cudf::groupby::groupby gb_obj(cudf::table_view({keys}));
 
   std::vector<cudf::groupby::aggregation_request> requests;
-  requests.emplace_back(cudf::groupby::aggregation_request());
+  requests.emplace_back();
   requests[0].values = vals;
   requests[0].aggregations.push_back(cudf::make_sum_aggregation<cudf::groupby_aggregation>());
   requests[0].aggregations.push_back(cudf::make_sum_aggregation<cudf::groupby_aggregation>());
@@ -403,10 +403,10 @@ TEST_F(groupby_cache_test, duplicate_columns)
   cudf::groupby::groupby gb_obj(cudf::table_view({keys}));
 
   std::vector<cudf::groupby::aggregation_request> requests;
-  requests.emplace_back(cudf::groupby::aggregation_request());
+  requests.emplace_back();
   requests[0].values = vals;
   requests[0].aggregations.push_back(cudf::make_sum_aggregation<cudf::groupby_aggregation>());
-  requests.emplace_back(cudf::groupby::aggregation_request());
+  requests.emplace_back();
   requests[1].values = vals;
   requests[1].aggregations.push_back(cudf::make_sum_aggregation<cudf::groupby_aggregation>());
 
diff --git a/cpp/tests/groupby/m2_tests.cpp b/cpp/tests/groupby/m2_tests.cpp
index 294791397b0..4359c154cf6 100644
--- a/cpp/tests/groupby/m2_tests.cpp
+++ b/cpp/tests/groupby/m2_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -42,7 +42,7 @@ using M2s_col = cudf::test::fixed_width_column_wrapper<T>;
 auto compute_M2(cudf::column_view const& keys, cudf::column_view const& values)
 {
   std::vector<cudf::groupby::aggregation_request> requests;
-  requests.emplace_back(cudf::groupby::aggregation_request());
+  requests.emplace_back();
   requests[0].values = values;
   requests[0].aggregations.emplace_back(cudf::make_m2_aggregation<cudf::groupby_aggregation>());
 
diff --git a/cpp/tests/groupby/max_scan_tests.cpp b/cpp/tests/groupby/max_scan_tests.cpp
index 2d8400e02b7..d86de798844 100644
--- a/cpp/tests/groupby/max_scan_tests.cpp
+++ b/cpp/tests/groupby/max_scan_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -127,7 +127,7 @@ TYPED_TEST(groupby_max_scan_test, null_keys_and_values)
   using result_wrapper = typename TestFixture::result_wrapper;
 
   // clang-format off
-  key_wrapper keys(  {1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4}, {1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1});
+  key_wrapper keys(  {1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4}, {true, true, true, true, true, true, true, false, true, true, true});
   value_wrapper vals({5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 4}, {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0});
 
                          //  {1, 1, 1, 2, 2, 2, 2, 3,   _, 3, 4}
diff --git a/cpp/tests/groupby/max_tests.cpp b/cpp/tests/groupby/max_tests.cpp
index 9481770dc58..6feeb4ee618 100644
--- a/cpp/tests/groupby/max_tests.cpp
+++ b/cpp/tests/groupby/max_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -111,8 +111,9 @@ TYPED_TEST(groupby_max_test, null_keys_and_values)
   using V = TypeParam;
   using R = cudf::detail::target_type_t<V, cudf::aggregation::MAX>;
 
-  cudf::test::fixed_width_column_wrapper<K> keys({1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4},
-                                                 {1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1});
+  cudf::test::fixed_width_column_wrapper<K> keys(
+    {1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4},
+    {true, true, true, true, true, true, true, false, true, true, true});
   cudf::test::fixed_width_column_wrapper<V> vals({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4},
                                                  {1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0});
 
@@ -168,20 +169,24 @@ TEST_F(groupby_max_string_test, max_sorted_strings)
     {"",   "",   "",   "",   "",   "",   "06", "06", "06", "06", "10", "10", "10", "10", "14", "14",
      "14", "14", "18", "18", "18", "18", "22", "22", "22", "22", "26", "26", "26", "26", "30", "30",
      "30", "30", "34", "34", "34", "34", "38", "38", "38", "38", "42", "42", "42", "42"},
-    {0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
+    {false, false, false, false, false, false, true, true, true, true, true, true,
+     true,  true,  true,  true,  true,  true,  true, true, true, true, true, true,
+     true,  true,  true,  true,  true,  true,  true, true, true, true, true, true,
+     true,  true,  true,  true,  true,  true,  true, true, true, true});
   cudf::test::strings_column_wrapper vals(
     {"", "", "",   "", "", "", "06", "", "", "", "10", "", "", "", "14", "",
      "", "", "18", "", "", "", "22", "", "", "", "26", "", "", "", "30", "",
      "", "", "34", "", "", "", "38", "", "", "", "42", "", "", ""},
-    {0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
-     0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0});
+    {false, false, false, false, false, false, true, false, false, false, true, false,
+     false, false, true,  false, false, false, true, false, false, false, true, false,
+     false, false, true,  false, false, false, true, false, false, false, true, false,
+     false, false, true,  false, false, false, true, false, false, false});
   cudf::test::strings_column_wrapper expect_keys(
     {"06", "10", "14", "18", "22", "26", "30", "34", "38", "42", ""},
-    {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0});
+    {true, true, true, true, true, true, true, true, true, true, false});
   cudf::test::strings_column_wrapper expect_vals(
     {"06", "10", "14", "18", "22", "26", "30", "34", "38", "42", ""},
-    {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0});
+    {true, true, true, true, true, true, true, true, true, true, false});
 
   // cudf::test::fixed_width_column_wrapper<size_type> expect_argmax(
   // {6, 10, 14, 18, 22, 26, 30, 34, 38, 42, -1},
@@ -537,7 +542,7 @@ TYPED_TEST(groupby_max_floating_point_test, values_with_nan)
   auto const vals = floats_col{nan, nan};
 
   std::vector<cudf::groupby::aggregation_request> requests;
-  requests.emplace_back(cudf::groupby::aggregation_request());
+  requests.emplace_back();
   requests[0].values = vals;
   requests[0].aggregations.emplace_back(cudf::make_max_aggregation<cudf::groupby_aggregation>());
 
diff --git a/cpp/tests/groupby/mean_tests.cpp b/cpp/tests/groupby/mean_tests.cpp
index 2d6ad24a096..0cb5ee30a8b 100644
--- a/cpp/tests/groupby/mean_tests.cpp
+++ b/cpp/tests/groupby/mean_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -116,8 +116,9 @@ TYPED_TEST(groupby_mean_test, null_keys_and_values)
   using R  = cudf::detail::target_type_t<V, cudf::aggregation::MEAN>;
   using RT = typename std::conditional<cudf::is_duration<R>(), int, double>::type;
 
-  cudf::test::fixed_width_column_wrapper<K> keys({1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4},
-                                                 {1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1});
+  cudf::test::fixed_width_column_wrapper<K> keys(
+    {1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4},
+    {true, true, true, true, true, true, true, false, true, true, true});
   cudf::test::fixed_width_column_wrapper<V> vals({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4},
                                                  {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0});
 
diff --git a/cpp/tests/groupby/median_tests.cpp b/cpp/tests/groupby/median_tests.cpp
index 4b037c08ac3..49b4480831a 100644
--- a/cpp/tests/groupby/median_tests.cpp
+++ b/cpp/tests/groupby/median_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -106,8 +106,9 @@ TYPED_TEST(groupby_median_test, null_keys_and_values)
   using V = TypeParam;
   using R = cudf::detail::target_type_t<V, cudf::aggregation::MEDIAN>;
 
-  cudf::test::fixed_width_column_wrapper<K> keys({1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4},
-                                                 {1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1});
+  cudf::test::fixed_width_column_wrapper<K> keys(
+    {1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4},
+    {true, true, true, true, true, true, true, false, true, true, true});
   cudf::test::fixed_width_column_wrapper<V> vals({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4},
                                                  {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0});
 
diff --git a/cpp/tests/groupby/merge_lists_tests.cpp b/cpp/tests/groupby/merge_lists_tests.cpp
index f2909f870aa..279d71560b4 100644
--- a/cpp/tests/groupby/merge_lists_tests.cpp
+++ b/cpp/tests/groupby/merge_lists_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -40,7 +40,7 @@ auto merge_lists(vcol_views const& keys_cols, vcol_views const& values_cols)
   auto const values = cudf::concatenate(values_cols);
 
   std::vector<cudf::groupby::aggregation_request> requests;
-  requests.emplace_back(cudf::groupby::aggregation_request());
+  requests.emplace_back();
   requests[0].values = *values;
   requests[0].aggregations.emplace_back(
     cudf::make_merge_lists_aggregation<cudf::groupby_aggregation>());
diff --git a/cpp/tests/groupby/merge_m2_tests.cpp b/cpp/tests/groupby/merge_m2_tests.cpp
index 1087410dfff..67f231e5206 100644
--- a/cpp/tests/groupby/merge_m2_tests.cpp
+++ b/cpp/tests/groupby/merge_m2_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -57,7 +57,7 @@ using vcol_views  = std::vector<cudf::column_view>;
 auto compute_partial_results(cudf::column_view const& keys, cudf::column_view const& values)
 {
   std::vector<cudf::groupby::aggregation_request> requests;
-  requests.emplace_back(cudf::groupby::aggregation_request());
+  requests.emplace_back();
   requests[0].values = values;
   requests[0].aggregations.emplace_back(cudf::make_count_aggregation<cudf::groupby_aggregation>());
   requests[0].aggregations.emplace_back(cudf::make_mean_aggregation<cudf::groupby_aggregation>());
@@ -85,7 +85,7 @@ auto merge_M2(vcol_views const& keys_cols, vcol_views const& values_cols)
   auto const values = cudf::concatenate(values_cols);
 
   std::vector<cudf::groupby::aggregation_request> requests;
-  requests.emplace_back(cudf::groupby::aggregation_request());
+  requests.emplace_back();
   requests[0].values = *values;
   requests[0].aggregations.emplace_back(
     cudf::make_merge_m2_aggregation<cudf::groupby_aggregation>());
diff --git a/cpp/tests/groupby/merge_sets_tests.cpp b/cpp/tests/groupby/merge_sets_tests.cpp
index 5fc7e68b524..9736bb84dd6 100644
--- a/cpp/tests/groupby/merge_sets_tests.cpp
+++ b/cpp/tests/groupby/merge_sets_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -42,7 +42,7 @@ auto merge_sets(vcol_views const& keys_cols, vcol_views const& values_cols)
   auto const values = cudf::concatenate(values_cols);
 
   std::vector<cudf::groupby::aggregation_request> requests;
-  requests.emplace_back(cudf::groupby::aggregation_request());
+  requests.emplace_back();
   requests[0].values = *values;
   requests[0].aggregations.emplace_back(
     cudf::make_merge_sets_aggregation<cudf::groupby_aggregation>());
diff --git a/cpp/tests/groupby/min_scan_tests.cpp b/cpp/tests/groupby/min_scan_tests.cpp
index 035f8e3926b..877eb7a1c53 100644
--- a/cpp/tests/groupby/min_scan_tests.cpp
+++ b/cpp/tests/groupby/min_scan_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -123,7 +123,7 @@ TYPED_TEST(groupby_min_scan_test, null_keys_and_values)
   using result_wrapper = typename TestFixture::result_wrapper;
 
   // clang-format off
-  key_wrapper keys(  {1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4}, {1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1});
+  key_wrapper keys(  {1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4}, {true, true, true, true, true, true, true, false, true, true, true});
   value_wrapper vals({5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 4}, {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0});
 
                          //  { 1, 1, 1, 2, 2,  2, 2, 3, _, 3, 4}
diff --git a/cpp/tests/groupby/min_tests.cpp b/cpp/tests/groupby/min_tests.cpp
index 44f9b7040c6..38007a81f68 100644
--- a/cpp/tests/groupby/min_tests.cpp
+++ b/cpp/tests/groupby/min_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -111,8 +111,9 @@ TYPED_TEST(groupby_min_test, null_keys_and_values)
   using V = TypeParam;
   using R = cudf::detail::target_type_t<V, cudf::aggregation::MIN>;
 
-  cudf::test::fixed_width_column_wrapper<K> keys({1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4},
-                                                 {1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1});
+  cudf::test::fixed_width_column_wrapper<K> keys(
+    {1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4},
+    {true, true, true, true, true, true, true, false, true, true, true});
   cudf::test::fixed_width_column_wrapper<V> vals({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4},
                                                  {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0});
 
@@ -168,20 +169,24 @@ TEST_F(groupby_min_string_test, min_sorted_strings)
     {"",   "",   "",   "",   "",   "",   "06", "06", "06", "06", "10", "10", "10", "10", "14", "14",
      "14", "14", "18", "18", "18", "18", "22", "22", "22", "22", "26", "26", "26", "26", "30", "30",
      "30", "30", "34", "34", "34", "34", "38", "38", "38", "38", "42", "42", "42", "42"},
-    {0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
-     1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
+    {false, false, false, false, false, false, true, true, true, true, true, true,
+     true,  true,  true,  true,  true,  true,  true, true, true, true, true, true,
+     true,  true,  true,  true,  true,  true,  true, true, true, true, true, true,
+     true,  true,  true,  true,  true,  true,  true, true, true, true});
   cudf::test::strings_column_wrapper vals(
     {"", "", "",   "", "", "", "06", "", "", "", "10", "", "", "", "14", "",
      "", "", "18", "", "", "", "22", "", "", "", "26", "", "", "", "30", "",
      "", "", "34", "", "", "", "38", "", "", "", "42", "", "", ""},
-    {0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
-     0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0});
+    {false, false, false, false, false, false, true, false, false, false, true, false,
+     false, false, true,  false, false, false, true, false, false, false, true, false,
+     false, false, true,  false, false, false, true, false, false, false, true, false,
+     false, false, true,  false, false, false, true, false, false, false});
   cudf::test::strings_column_wrapper expect_keys(
     {"06", "10", "14", "18", "22", "26", "30", "34", "38", "42", ""},
-    {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0});
+    {true, true, true, true, true, true, true, true, true, true, false});
   cudf::test::strings_column_wrapper expect_vals(
     {"06", "10", "14", "18", "22", "26", "30", "34", "38", "42", ""},
-    {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0});
+    {true, true, true, true, true, true, true, true, true, true, false});
 
   auto agg = cudf::make_min_aggregation<cudf::groupby_aggregation>();
   test_single_agg(keys,
@@ -533,7 +538,7 @@ TYPED_TEST(groupby_min_floating_point_test, values_with_nan)
   auto const vals = floats_col{nan, nan};
 
   std::vector<cudf::groupby::aggregation_request> requests;
-  requests.emplace_back(cudf::groupby::aggregation_request());
+  requests.emplace_back();
   requests[0].values = vals;
   requests[0].aggregations.emplace_back(cudf::make_min_aggregation<cudf::groupby_aggregation>());
 
diff --git a/cpp/tests/groupby/nth_element_tests.cpp b/cpp/tests/groupby/nth_element_tests.cpp
index 5fb911ea0f1..e274bfa8c65 100644
--- a/cpp/tests/groupby/nth_element_tests.cpp
+++ b/cpp/tests/groupby/nth_element_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -173,7 +173,7 @@ TYPED_TEST(groupby_nth_element_test, null_keys_and_values)
   using R = cudf::detail::target_type_t<V, cudf::aggregation::NTH_ELEMENT>;
 
   cudf::test::fixed_width_column_wrapper<K> keys({1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4},
-                                     {1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1});
+                                     {true, true, true, true, true, true, true, false, true, true, true});
   cudf::test::fixed_width_column_wrapper<V, int32_t> vals({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4},
                                               {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0});
 
@@ -193,7 +193,7 @@ TYPED_TEST(groupby_nth_element_test, null_keys_and_values_out_of_bounds)
   using R = cudf::detail::target_type_t<V, cudf::aggregation::NTH_ELEMENT>;
 
   cudf::test::fixed_width_column_wrapper<K> keys({1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4},
-                                     {1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1});
+                                     {true, true, true, true, true, true, true, false, true, true, true});
   cudf::test::fixed_width_column_wrapper<V, int32_t> vals({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4},
                                               {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0});
   //                                        {1, 1, 1    2, 2, 2,    3, 3,   4}
@@ -213,7 +213,7 @@ TYPED_TEST(groupby_nth_element_test, exclude_nulls)
   using R = cudf::detail::target_type_t<V, cudf::aggregation::NTH_ELEMENT>;
 
   cudf::test::fixed_width_column_wrapper<K> keys({1, 2, 3, 3, 1, 2, 2, 1, 3, 3, 2, 4, 4, 2},
-                                     {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1});
+                                     {true, true, true, true, true, true, true, true, false, true, true, true, true, true});
   cudf::test::fixed_width_column_wrapper<V, int32_t> vals({0, 1, 2, 2, 3, 4, 5, 6, 7, 8, 9, 4, 4, 2},
                                               {0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0});
 
@@ -255,7 +255,7 @@ TYPED_TEST(groupby_nth_element_test, exclude_nulls_negative_index)
   using R = cudf::detail::target_type_t<V, cudf::aggregation::NTH_ELEMENT>;
 
   cudf::test::fixed_width_column_wrapper<K> keys({1, 2, 3, 3, 1, 2, 2, 1, 3, 3, 2, 4, 4, 2},
-                                     {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1});
+                                     {true, true, true, true, true, true, true, true, false, true, true, true, true, true});
   cudf::test::fixed_width_column_wrapper<V, int32_t> vals({0, 1, 2, 2, 3, 4, 5, 6, 7, 8, 9, 4, 4, 2},
                                               {0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0});
 
@@ -320,7 +320,7 @@ TEST_F(groupby_nth_element_string_test, basic_string)
 
   //+ve out of bounds
   agg = cudf::make_nth_element_aggregation<cudf::groupby_aggregation>(3);
-  cudf::test::strings_column_wrapper expect_vals3{{"", "9", ""}, {0, 1, 0}};
+  cudf::test::strings_column_wrapper expect_vals3{{"", "9", ""}, {false, true, false}};
   test_single_agg(keys, vals, expect_keys, expect_vals3, std::move(agg));
 
   //groupby.last()
@@ -338,7 +338,7 @@ TEST_F(groupby_nth_element_string_test, basic_string)
 
   //-ve out of bounds
   agg = cudf::make_nth_element_aggregation<cudf::groupby_aggregation>(-4);
-  cudf::test::strings_column_wrapper expect_vals7{{"", "1", ""}, {0, 1, 0}};
+  cudf::test::strings_column_wrapper expect_vals7{{"", "1", ""}, {false, true, false}};
   test_single_agg(keys, vals, expect_keys, expect_vals7, std::move(agg));
 }
 // clang-format on
@@ -420,13 +420,15 @@ TEST_F(groupby_nth_element_structs_test, Basics)
   auto child0 = ints{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
   auto child1 = doubles{0.1, 1.2, 2.3, 3.4, 4.51, 5.3e4, 6.3231, -0.07, 832.1, 9.999};
   auto child2 = strings{"", "a", "b", "c", "d", "e", "f", "g", "HH", "JJJ"};
-  auto values = structs{{child0, child1, child2}, {1, 0, 1, 0, 1, 1, 1, 1, 0, 1}};
-
-  auto expected_keys   = ints{0, 1, 2, 3};
-  auto expected_ch0    = ints{1, 4, 7, 0};
-  auto expected_ch1    = doubles{1.2, 4.51, -0.07, 0.0};
-  auto expected_ch2    = strings{"a", "d", "g", ""};
-  auto expected_values = structs{{expected_ch0, expected_ch1, expected_ch2}, {0, 1, 1, 0}};
+  auto values = structs{{child0, child1, child2},
+                        {true, false, true, false, true, true, true, true, false, true}};
+
+  auto expected_keys = ints{0, 1, 2, 3};
+  auto expected_ch0  = ints{1, 4, 7, 0};
+  auto expected_ch1  = doubles{1.2, 4.51, -0.07, 0.0};
+  auto expected_ch2  = strings{"a", "d", "g", ""};
+  auto expected_values =
+    structs{{expected_ch0, expected_ch1, expected_ch2}, {false, true, true, false}};
   test_single_agg(keys,
                   values,
                   expected_keys,
@@ -437,7 +439,7 @@ TEST_F(groupby_nth_element_structs_test, Basics)
   expected_ch0    = ints{0, 4, 6, 9};
   expected_ch1    = doubles{0.1, 4.51, 6.3231, 9.999};
   expected_ch2    = strings{"", "d", "f", "JJJ"};
-  expected_values = structs{{expected_ch0, expected_ch1, expected_ch2}, {1, 1, 1, 1}};
+  expected_values = structs{{expected_ch0, expected_ch1, expected_ch2}, {true, true, true, true}};
   test_single_agg(
     keys,
     values,
@@ -459,7 +461,8 @@ TEST_F(groupby_nth_element_structs_test, NestedStructs)
   auto child1_of_child1 = doubles{0.1, 1.2, 2.3, 3.4, 4.51, 5.3e4, 6.3231, -0.07, 832.1, 9.999};
   auto child1           = structs{child0_of_child1, child1_of_child1};
   auto child2           = lists{{0}, {1, 2, 3}, {}, {4}, {5, 6}, {}, {}, {7}, {8, 9}, {}};
-  auto values           = structs{{child0, child1, child2}, {1, 0, 1, 0, 1, 1, 1, 1, 0, 1}};
+  auto values           = structs{{child0, child1, child2},
+                                  {true, false, true, false, true, true, true, true, false, true}};
 
   auto expected_keys       = ints{0, 1, 2, 3};
   auto expected_ch0        = ints{1, 4, 7, 0};
@@ -467,7 +470,8 @@ TEST_F(groupby_nth_element_structs_test, NestedStructs)
   auto expected_ch1_of_ch1 = doubles{1.2, 4.51, -0.07, 0.0};
   auto expected_ch1        = structs{expected_ch0_of_ch1, expected_ch1_of_ch1};
   auto expected_ch2        = lists{{1, 2, 3}, {5, 6}, {7}, {}};
-  auto expected_values     = structs{{expected_ch0, expected_ch1, expected_ch2}, {0, 1, 1, 0}};
+  auto expected_values =
+    structs{{expected_ch0, expected_ch1, expected_ch2}, {false, true, true, false}};
   test_single_agg(keys,
                   values,
                   expected_keys,
@@ -480,7 +484,7 @@ TEST_F(groupby_nth_element_structs_test, NestedStructs)
   expected_ch1_of_ch1 = doubles{0.1, 4.51, 6.3231, 9.999};
   expected_ch1        = structs{expected_ch0_of_ch1, expected_ch1_of_ch1};
   expected_ch2        = lists{{0}, {5, 6}, {}, {}};
-  expected_values     = structs{{expected_ch0, expected_ch1, expected_ch2}, {1, 1, 1, 1}};
+  expected_values = structs{{expected_ch0, expected_ch1, expected_ch2}, {true, true, true, true}};
   test_single_agg(
     keys,
     values,
diff --git a/cpp/tests/groupby/nunique_tests.cpp b/cpp/tests/groupby/nunique_tests.cpp
index c9156d837f7..8c2f9299c05 100644
--- a/cpp/tests/groupby/nunique_tests.cpp
+++ b/cpp/tests/groupby/nunique_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -125,8 +125,9 @@ TYPED_TEST(groupby_nunique_test, null_keys_and_values)
   using V = TypeParam;
   using R = cudf::detail::target_type_t<V, cudf::aggregation::NUNIQUE>;
 
-  cudf::test::fixed_width_column_wrapper<K> keys({1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4},
-                                                 {1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1});
+  cudf::test::fixed_width_column_wrapper<K> keys(
+    {1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4},
+    {true, true, true, true, true, true, true, false, true, true, true});
   cudf::test::fixed_width_column_wrapper<V> vals({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4},
                                                  {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0});
 
@@ -150,8 +151,9 @@ TYPED_TEST(groupby_nunique_test, null_keys_and_values_with_duplicates)
   using V = TypeParam;
   using R = cudf::detail::target_type_t<V, cudf::aggregation::NUNIQUE>;
 
-  cudf::test::fixed_width_column_wrapper<K> keys({1, 2, 3, 3, 1, 2, 2, 1, 3, 3, 2, 4, 4, 2},
-                                                 {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1});
+  cudf::test::fixed_width_column_wrapper<K> keys(
+    {1, 2, 3, 3, 1, 2, 2, 1, 3, 3, 2, 4, 4, 2},
+    {true, true, true, true, true, true, true, true, false, true, true, true, true, true});
   cudf::test::fixed_width_column_wrapper<V> vals({0, 1, 2, 2, 3, 4, 5, 6, 7, 8, 9, 4, 4, 2},
                                                  {0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0});
 
@@ -176,8 +178,9 @@ TYPED_TEST(groupby_nunique_test, include_nulls)
   using V = TypeParam;
   using R = cudf::detail::target_type_t<V, cudf::aggregation::NUNIQUE>;
 
-  cudf::test::fixed_width_column_wrapper<K> keys({1, 2, 3, 3, 1, 2, 2, 1, 3, 3, 2, 4, 4, 2},
-                                                 {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1});
+  cudf::test::fixed_width_column_wrapper<K> keys(
+    {1, 2, 3, 3, 1, 2, 2, 1, 3, 3, 2, 4, 4, 2},
+    {true, true, true, true, true, true, true, true, false, true, true, true, true, true});
   cudf::test::fixed_width_column_wrapper<V> vals({0, 1, 2, 2, 3, 4, 5, 6, 7, 8, 9, 4, 4, 2},
                                                  {0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0});
 
@@ -204,7 +207,7 @@ TYPED_TEST(groupby_nunique_test, dictionary)
 
   // clang-format off
   cudf::test::fixed_width_column_wrapper<K> keys({1, 2, 3, 3, 1, 2, 2, 1, 0, 3, 2, 4, 4, 2},
-                                     {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1});
+                                     {true, true, true, true, true, true, true, true, false, true, true, true, true, true});
   cudf::test::dictionary_column_wrapper<V>  vals({0, 1, 2, 2, 3, 4, 0, 6, 7, 8, 9, 0, 0, 0},
                                      {0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0});
 
diff --git a/cpp/tests/groupby/product_scan_tests.cpp b/cpp/tests/groupby/product_scan_tests.cpp
index 6010abd8a20..fd1512541a0 100644
--- a/cpp/tests/groupby/product_scan_tests.cpp
+++ b/cpp/tests/groupby/product_scan_tests.cpp
@@ -127,7 +127,7 @@ TYPED_TEST(groupby_product_scan_test, null_keys_and_values)
   using result_wrapper = typename TestFixture::result_wrapper;
 
   // clang-format off
-  key_wrapper keys(  {1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4}, {1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1});
+  key_wrapper keys(  {1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4}, {true, true, true, true, true, true, true, false, true, true, true});
   value_wrapper vals({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4}, {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0});
 
   //                         { 1, 1, 1, 2, 2,  2,  2, 3, *, 3, 4};
diff --git a/cpp/tests/groupby/product_tests.cpp b/cpp/tests/groupby/product_tests.cpp
index 0145293682b..8be4040db1c 100644
--- a/cpp/tests/groupby/product_tests.cpp
+++ b/cpp/tests/groupby/product_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -120,7 +120,7 @@ TYPED_TEST(groupby_product_test, null_keys_and_values)
 
   // clang-format off
   cudf::test::fixed_width_column_wrapper<K> keys(       { 1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4},
-                                            { 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1});
+                                            { true, true, true, true, true, true, true, false, true, true, true});
   cudf::test::fixed_width_column_wrapper<V> vals(       { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 3},
                                             { 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0});
 
diff --git a/cpp/tests/groupby/quantile_tests.cpp b/cpp/tests/groupby/quantile_tests.cpp
index 8d134f340a0..7a5fee08238 100644
--- a/cpp/tests/groupby/quantile_tests.cpp
+++ b/cpp/tests/groupby/quantile_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -106,8 +106,9 @@ TYPED_TEST(groupby_quantile_test, null_keys_and_values)
   using V = TypeParam;
   using R = cudf::detail::target_type_t<V, cudf::aggregation::QUANTILE>;
 
-  cudf::test::fixed_width_column_wrapper<K> keys({1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4},
-                                                 {1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1});
+  cudf::test::fixed_width_column_wrapper<K> keys(
+    {1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4},
+    {true, true, true, true, true, true, true, false, true, true, true});
   cudf::test::fixed_width_column_wrapper<V> vals({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4},
                                                  {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0});
 
diff --git a/cpp/tests/groupby/rank_scan_tests.cpp b/cpp/tests/groupby/rank_scan_tests.cpp
index 76b05566e4d..7f31bc9089f 100644
--- a/cpp/tests/groupby/rank_scan_tests.cpp
+++ b/cpp/tests/groupby/rank_scan_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -244,7 +244,7 @@ TYPED_TEST(typed_groupby_rank_scan_test, mixedStructs)
     0.0, 0.0, 2.0 / 5, 2.0 / 5, 4.0 / 5, 5.0 / 5, 0.0, 0.0, 2.0 / 2, 0.0, 0.0, 2.0 / 2};
 
   std::vector<cudf::groupby::scan_request> requests;
-  requests.emplace_back(cudf::groupby::scan_request());
+  requests.emplace_back();
   requests[0].values = *struct_col;
   requests[0].aggregations.push_back(cudf::make_rank_aggregation<cudf::groupby_scan_aggregation>(
     cudf::rank_method::DENSE, {}, cudf::null_policy::INCLUDE));
@@ -295,8 +295,8 @@ TYPED_TEST(typed_groupby_rank_scan_test, nestedStructs)
     {"0", "0", "0", "0", "0", "0", "1", "1", "1", "1", "0", "1"}, nulls_at({9, 10, 11})};
 
   std::vector<cudf::groupby::scan_request> requests;
-  requests.emplace_back(cudf::groupby::scan_request());
-  requests.emplace_back(cudf::groupby::scan_request());
+  requests.emplace_back();
+  requests.emplace_back();
   requests[0].values = *nested_structs;
   requests[0].aggregations.push_back(
     cudf::make_rank_aggregation<cudf::groupby_scan_aggregation>(cudf::rank_method::DENSE));
@@ -363,8 +363,8 @@ TYPED_TEST(typed_groupby_rank_scan_test, structsWithNullPushdown)
     {"0", "0", "0", "0", "0", "0", "1", "1", "1", "X", "X", "X"}, nulls_at({9, 10, 11})};
 
   std::vector<cudf::groupby::scan_request> requests;
-  requests.emplace_back(cudf::groupby::scan_request());
-  requests.emplace_back(cudf::groupby::scan_request());
+  requests.emplace_back();
+  requests.emplace_back();
   requests[0].values = *possibly_null_structs;
   requests[0].aggregations.push_back(cudf::make_rank_aggregation<cudf::groupby_scan_aggregation>(
     cudf::rank_method::DENSE, {}, cudf::null_policy::INCLUDE));
diff --git a/cpp/tests/groupby/replace_nulls_tests.cpp b/cpp/tests/groupby/replace_nulls_tests.cpp
index d4bd278aaad..748a5bdd638 100644
--- a/cpp/tests/groupby/replace_nulls_tests.cpp
+++ b/cpp/tests/groupby/replace_nulls_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -327,7 +327,7 @@ TEST_F(GroupbyReplaceNullsStructsTest, PrecedingFill)
   // Only null rows are replaced.
 
   SCW val =
-    this->data({{1, -1, 3, -1, -1, -1, 7}, {1, 0, 1, 0, 0, 0, 1}},
+    this->data({{1, -1, 3, -1, -1, -1, 7}, {true, false, true, false, false, false, true}},
                {{"x", "yy", "", "", "", "zz", ""}, {true, true, false, false, false, true, false}},
                LCW({{1, 2, 3}, {-1}, {}, {}, {42}, {}, {}}, Mask_t{1, 1, 0, 0, 1, 0, 0}.begin()),
                {1, 1, 0, 0, 1, 1, 0});
@@ -335,7 +335,7 @@ TEST_F(GroupbyReplaceNullsStructsTest, PrecedingFill)
   cudf::test::fixed_width_column_wrapper<K> expect_key{0, 0, 0, 1, 1, 1, 1};
 
   SCW expect_val = this->data(
-    {{-1, -1, -1, 1, 1, -1, -1}, {0, 0, 0, 1, 1, 0, 0}},
+    {{-1, -1, -1, 1, 1, -1, -1}, {false, false, false, true, true, false, false}},
     {{"yy", "yy", "", "x", "x", "zz", "zz"}, {true, true, false, true, true, true, true}},
     LCW({LCW{-1}, {-1}, {42}, {1, 2, 3}, {1, 2, 3}, {}, {}}, Mask_t{1, 1, 1, 1, 1, 0, 0}.begin()),
     {1, 1, 1, 1, 1, 1, 1});
@@ -352,7 +352,7 @@ TEST_F(GroupbyReplaceNullsStructsTest, FollowingFill)
   // Only null rows are replaced.
 
   SCW val =
-    this->data({{1, -1, 3, -1, -1, -1, 7}, {1, 0, 1, 0, 0, 0, 1}},
+    this->data({{1, -1, 3, -1, -1, -1, 7}, {true, false, true, false, false, false, true}},
                {{"x", "yy", "", "", "", "zz", ""}, {true, true, false, false, false, true, false}},
                LCW({{1, 2, 3}, {-1}, {}, {}, {42}, {}, {}}, Mask_t{1, 1, 0, 0, 1, 0, 0}.begin()),
                {1, 1, 0, 0, 1, 1, 0});
@@ -360,7 +360,7 @@ TEST_F(GroupbyReplaceNullsStructsTest, FollowingFill)
   cudf::test::fixed_width_column_wrapper<K> expect_key{0, 0, 0, 1, 1, 1, 1};
 
   SCW expect_val = this->data(
-    {{-1, -1, -1, 1, -1, -1, -1}, {0, 0, 0, 1, 0, 0, 0}},
+    {{-1, -1, -1, 1, -1, -1, -1}, {false, false, false, true, false, false, false}},
     {{"yy", "", "", "x", "zz", "zz", ""}, {true, false, false, true, true, true, false}},
     LCW({LCW{-1}, {42}, {42}, {1, 2, 3}, {}, {}, {}}, Mask_t{1, 1, 1, 1, 0, 0, 0}.begin()),
     {1, 1, 1, 1, 1, 1, 0});
diff --git a/cpp/tests/groupby/shift_tests.cpp b/cpp/tests/groupby/shift_tests.cpp
index 1a6abf2e734..14c9ceb4508 100644
--- a/cpp/tests/groupby/shift_tests.cpp
+++ b/cpp/tests/groupby/shift_tests.cpp
@@ -248,7 +248,7 @@ TEST_F(groupby_shift_string_test, ForwardShiftWithoutNull_NullScalar)
   cudf::test::fixed_width_column_wrapper<K> key{1, 2, 1, 2, 2, 1, 1};
   cudf::test::strings_column_wrapper val{"a", "bb", "cc", "d", "eee", "f", "gg"};
   cudf::test::strings_column_wrapper expected({"", "a", "cc", "f", "", "bb", "d"},
-                                              {0, 1, 1, 1, 0, 1, 1});
+                                              {false, true, true, true, false, true, true});
   cudf::size_type offset = 1;
   auto slr               = cudf::make_default_constructed_scalar(cudf::column_view(val).type());
 
@@ -260,9 +260,9 @@ TEST_F(groupby_shift_string_test, ForwardShiftWithNull_NullScalar)
   using K = int32_t;
   cudf::test::fixed_width_column_wrapper<K> key{1, 2, 1, 2, 2, 1, 1};
   cudf::test::strings_column_wrapper val({"a", "bb", "cc", "d", "eee", "f", "gg"},
-                                         {1, 0, 1, 1, 0, 0, 0});
+                                         {true, false, true, true, false, false, false});
   cudf::test::strings_column_wrapper expected({"", "", "a", "cc", "", "", ""},
-                                              {0, 0, 1, 1, 0, 0, 0});
+                                              {false, false, true, true, false, false, false});
   cudf::size_type offset = 2;
   auto slr               = cudf::make_default_constructed_scalar(cudf::column_view(val).type());
 
@@ -287,9 +287,9 @@ TEST_F(groupby_shift_string_test, ForwardShiftWithNull_ValidScalar)
   using K = int32_t;
   cudf::test::fixed_width_column_wrapper<K> key{1, 2, 1, 2, 2, 1, 1};
   cudf::test::strings_column_wrapper val({"a", "bb", "cc", "d", "eee", "f", "gg"},
-                                         {1, 1, 0, 0, 1, 0, 1});
+                                         {true, true, false, false, true, false, true});
   cudf::test::strings_column_wrapper expected({"42", "a", "", "", "42", "bb", ""},
-                                              {1, 1, 0, 0, 1, 1, 0});
+                                              {true, true, false, false, true, true, false});
 
   cudf::size_type offset = 1;
   auto slr               = cudf::make_string_scalar("42");
@@ -303,7 +303,7 @@ TEST_F(groupby_shift_string_test, BackwardShiftWithoutNull_NullScalar)
   cudf::test::fixed_width_column_wrapper<K> key{1, 2, 1, 2, 2, 1, 1};
   cudf::test::strings_column_wrapper val{"a", "bb", "cc", "d", "eee", "f", "gg"};
   cudf::test::strings_column_wrapper expected({"gg", "", "", "", "", "", ""},
-                                              {1, 0, 0, 0, 0, 0, 0});
+                                              {true, false, false, false, false, false, false});
 
   cudf::size_type offset = -3;
   auto slr               = cudf::make_default_constructed_scalar(cudf::column_view(val).type());
@@ -316,9 +316,9 @@ TEST_F(groupby_shift_string_test, BackwardShiftWithNull_NullScalar)
   using K = int32_t;
   cudf::test::fixed_width_column_wrapper<K> key{1, 2, 1, 2, 2, 1, 1};
   cudf::test::strings_column_wrapper val({"a", "bb", "cc", "d", "eee", "f", "gg"},
-                                         {1, 0, 1, 1, 0, 0, 0});
+                                         {true, false, true, true, false, false, false});
   cudf::test::strings_column_wrapper expected({"cc", "", "", "", "d", "", ""},
-                                              {1, 0, 0, 0, 1, 0, 0});
+                                              {true, false, false, false, true, false, false});
 
   cudf::size_type offset = -1;
   auto slr               = cudf::make_default_constructed_scalar(cudf::column_view(val).type());
@@ -344,9 +344,9 @@ TEST_F(groupby_shift_string_test, BackwardShiftWithNull_ValidScalar)
   using K = int32_t;
   cudf::test::fixed_width_column_wrapper<K> key{1, 2, 1, 2, 2, 1, 1};
   cudf::test::strings_column_wrapper val({"a", "bb", "cc", "d", "eee", "f", "gg"},
-                                         {1, 1, 0, 0, 1, 0, 1});
+                                         {true, true, false, false, true, false, true});
   cudf::test::strings_column_wrapper expected({"", "gg", "42", "42", "eee", "42", "42"},
-                                              {0, 1, 1, 1, 1, 1, 1});
+                                              {false, true, true, true, true, true, true});
 
   cudf::size_type offset = -2;
   auto slr               = cudf::make_string_scalar("42");
@@ -431,7 +431,8 @@ TYPED_TEST(groupby_shift_mixed_test, NoFill)
   cudf::test::fixed_width_column_wrapper<TypeParam> v2{1, 2, 3, 4, 5, 6, 7};
   cudf::table_view value{{v1, v2}};
 
-  cudf::test::strings_column_wrapper e1({"", "", "a", "cc", "", "", "bb"}, {0, 0, 1, 1, 0, 0, 1});
+  cudf::test::strings_column_wrapper e1({"", "", "a", "cc", "", "", "bb"},
+                                        {false, false, true, true, false, false, true});
   cudf::test::fixed_width_column_wrapper<TypeParam> e2({-1, 1, 3, 6, -1, 2, 4},
                                                        {0, 1, 1, 1, 0, 1, 1});
   cudf::table_view expected{{e1, e2}};
diff --git a/cpp/tests/groupby/std_tests.cpp b/cpp/tests/groupby/std_tests.cpp
index 4c07e2e8ddc..732a0ba9561 100644
--- a/cpp/tests/groupby/std_tests.cpp
+++ b/cpp/tests/groupby/std_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -106,8 +106,9 @@ TYPED_TEST(groupby_std_test, null_keys_and_values)
   using V = TypeParam;
   using R = cudf::detail::target_type_t<V, cudf::aggregation::STD>;
 
-  cudf::test::fixed_width_column_wrapper<K> keys({1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4},
-                                                 {1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1});
+  cudf::test::fixed_width_column_wrapper<K> keys(
+    {1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4},
+    {true, true, true, true, true, true, true, false, true, true, true});
   cudf::test::fixed_width_column_wrapper<V> vals({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 3},
                                                  {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1});
 
@@ -127,8 +128,9 @@ TYPED_TEST(groupby_std_test, ddof_non_default)
   using V = TypeParam;
   using R = cudf::detail::target_type_t<V, cudf::aggregation::STD>;
 
-  cudf::test::fixed_width_column_wrapper<K> keys({1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4},
-                                                 {1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1});
+  cudf::test::fixed_width_column_wrapper<K> keys(
+    {1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4},
+    {true, true, true, true, true, true, true, false, true, true, true});
   cudf::test::fixed_width_column_wrapper<V> vals({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 3},
                                                  {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1});
 
diff --git a/cpp/tests/groupby/sum_of_squares_tests.cpp b/cpp/tests/groupby/sum_of_squares_tests.cpp
index 23724113d50..a5b3ac97773 100644
--- a/cpp/tests/groupby/sum_of_squares_tests.cpp
+++ b/cpp/tests/groupby/sum_of_squares_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -104,8 +104,9 @@ TYPED_TEST(groupby_sum_of_squares_test, null_keys_and_values)
   using V = TypeParam;
   using R = cudf::detail::target_type_t<V, cudf::aggregation::SUM_OF_SQUARES>;
 
-  cudf::test::fixed_width_column_wrapper<K> keys({1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4},
-                                                 {1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1});
+  cudf::test::fixed_width_column_wrapper<K> keys(
+    {1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4},
+    {true, true, true, true, true, true, true, false, true, true, true});
   cudf::test::fixed_width_column_wrapper<V> vals({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 3},
                                                  {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0});
 
diff --git a/cpp/tests/groupby/sum_scan_tests.cpp b/cpp/tests/groupby/sum_scan_tests.cpp
index b03212f3197..13cb1b8dbca 100644
--- a/cpp/tests/groupby/sum_scan_tests.cpp
+++ b/cpp/tests/groupby/sum_scan_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -130,7 +130,7 @@ TYPED_TEST(groupby_sum_scan_test, null_keys_and_values)
   using result_wrapper = typename TestFixture::result_wrapper;
 
   // clang-format off
-  key_wrapper keys(  {1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4}, {1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1});
+  key_wrapper keys(  {1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4}, {true, true, true, true, true, true, true, false, true, true, true});
   value_wrapper vals({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4}, {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0});
 
   //                         { 1, 1, 1, 2, 2,  2,  2, 3, *, 3, 4};
diff --git a/cpp/tests/groupby/sum_tests.cpp b/cpp/tests/groupby/sum_tests.cpp
index 03cc3fab568..5f5329e5d7a 100644
--- a/cpp/tests/groupby/sum_tests.cpp
+++ b/cpp/tests/groupby/sum_tests.cpp
@@ -112,8 +112,9 @@ TYPED_TEST(groupby_sum_test, null_keys_and_values)
   using V = TypeParam;
   using R = cudf::detail::target_type_t<V, cudf::aggregation::SUM>;
 
-  cudf::test::fixed_width_column_wrapper<K> keys({1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4},
-                                                 {1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1});
+  cudf::test::fixed_width_column_wrapper<K> keys(
+    {1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4},
+    {true, true, true, true, true, true, true, false, true, true, true});
   cudf::test::fixed_width_column_wrapper<V> vals({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 4},
                                                  {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0});
 
diff --git a/cpp/tests/groupby/var_tests.cpp b/cpp/tests/groupby/var_tests.cpp
index baebc45b975..da03169c93f 100644
--- a/cpp/tests/groupby/var_tests.cpp
+++ b/cpp/tests/groupby/var_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -106,8 +106,9 @@ TYPED_TEST(groupby_var_test, null_keys_and_values)
   using V = TypeParam;
   using R = cudf::detail::target_type_t<V, cudf::aggregation::VARIANCE>;
 
-  cudf::test::fixed_width_column_wrapper<K> keys({1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4},
-                                                 {1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1});
+  cudf::test::fixed_width_column_wrapper<K> keys(
+    {1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4},
+    {true, true, true, true, true, true, true, false, true, true, true});
   cudf::test::fixed_width_column_wrapper<V> vals({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 3},
                                                  {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1});
 
@@ -128,8 +129,9 @@ TYPED_TEST(groupby_var_test, ddof_non_default)
   using V = TypeParam;
   using R = cudf::detail::target_type_t<V, cudf::aggregation::VARIANCE>;
 
-  cudf::test::fixed_width_column_wrapper<K> keys({1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4},
-                                                 {1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1});
+  cudf::test::fixed_width_column_wrapper<K> keys(
+    {1, 2, 3, 1, 2, 2, 1, 3, 3, 2, 4},
+    {true, true, true, true, true, true, true, false, true, true, true});
   cudf::test::fixed_width_column_wrapper<V> vals({0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 3},
                                                  {0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1});
 
diff --git a/cpp/tests/hashing/md5_test.cpp b/cpp/tests/hashing/md5_test.cpp
index 081ab7978cd..69e518cbf8d 100644
--- a/cpp/tests/hashing/md5_test.cpp
+++ b/cpp/tests/hashing/md5_test.cpp
@@ -34,7 +34,7 @@ TEST_F(MD5HashTest, MultiValue)
      "A very long (greater than 128 bytes/char string) to test a multi hash-step data point in the "
      "MD5 hash function. This string needed to be longer.",
      "All work and no play makes Jack a dull boy",
-     "!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`{|}~",
+     R"(!"#$%&'()*+,-./0123456789:;<=>?@[\]^_`{|}~)",
      "Multi-byte characters: é¼³⅝"});
 
   /*
@@ -92,8 +92,8 @@ TEST_F(MD5HashTest, MultiValue)
 TEST_F(MD5HashTest, EmptyNullEquivalence)
 {
   // Test that empty strings hash the same as nulls
-  cudf::test::strings_column_wrapper const strings_col1({"", ""}, {1, 0});
-  cudf::test::strings_column_wrapper const strings_col2({"", ""}, {0, 1});
+  cudf::test::strings_column_wrapper const strings_col1({"", ""}, {true, false});
+  cudf::test::strings_column_wrapper const strings_col2({"", ""}, {false, true});
 
   auto const input1 = cudf::table_view({strings_col1});
   auto const input2 = cudf::table_view({strings_col2});
@@ -127,7 +127,7 @@ TEST_F(MD5HashTest, StringLists)
       "MD5 hash function. This string needed to be longer.",
       " It needed to be even longer."},
      {"All ", "work ", "and", " no", " play ", "makes Jack", " a dull boy"},
-     {"!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`", "{|}~"}});
+     {R"(!"#$%&'()*+,-./0123456789:;<=>?@[\]^_`)", "{|}~"}});
 
   auto const input1 = cudf::table_view({strings_col});
   auto const input2 = cudf::table_view({strings_list_col});
@@ -171,16 +171,24 @@ TYPED_TEST(MD5HashTestTyped, WithNulls)
 
 TEST_F(MD5HashTest, TestBoolListsWithNulls)
 {
-  cudf::test::fixed_width_column_wrapper<bool> const col1({0, 0, 0, 0, 1, 1, 1, 0, 0},
-                                                          {1, 0, 0, 0, 1, 1, 1, 0, 0});
-  cudf::test::fixed_width_column_wrapper<bool> const col2({0, 0, 0, 1, 0, 1, 0, 1, 0},
-                                                          {1, 0, 0, 1, 0, 1, 0, 1, 0});
-  cudf::test::fixed_width_column_wrapper<bool> const col3({0, 0, 0, 1, 1, 0, 0, 0, 1},
-                                                          {1, 0, 0, 1, 1, 0, 0, 0, 1});
+  cudf::test::fixed_width_column_wrapper<bool> const col1(
+    {0, 0, 0, 0, 1, 1, 1, 0, 0}, {true, false, false, false, true, true, true, false, false});
+  cudf::test::fixed_width_column_wrapper<bool> const col2(
+    {0, 0, 0, 1, 0, 1, 0, 1, 0}, {true, false, false, true, false, true, false, true, false});
+  cudf::test::fixed_width_column_wrapper<bool> const col3(
+    {0, 0, 0, 1, 1, 0, 0, 0, 1}, {true, false, false, true, true, false, false, false, true});
 
   auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 1; });
-  cudf::test::lists_column_wrapper<bool> const list_col(
-    {{0, 0, 0}, {1}, {}, {{1, 1, 1}, validity}, {1, 1}, {1, 1}, {1}, {1}, {1}}, validity);
+  cudf::test::lists_column_wrapper<bool> const list_col({{false, false, false},
+                                                         {true},
+                                                         {},
+                                                         {{true, true, true}, validity},
+                                                         {true, true},
+                                                         {true, true},
+                                                         {true},
+                                                         {true},
+                                                         {true}},
+                                                        validity);
 
   auto const input1 = cudf::table_view({col1, col2, col3});
   auto const input2 = cudf::table_view({list_col});
diff --git a/cpp/tests/hashing/murmurhash3_x86_32_test.cpp b/cpp/tests/hashing/murmurhash3_x86_32_test.cpp
index 24524140e74..c1a6e6ff6e1 100644
--- a/cpp/tests/hashing/murmurhash3_x86_32_test.cpp
+++ b/cpp/tests/hashing/murmurhash3_x86_32_test.cpp
@@ -72,26 +72,28 @@ TEST_F(MurmurHashTest, MultiValueNulls)
      "jumps over the lazy dog.",
      "All work and no play makes Jack a dull boy",
      R"(!"#$%&'()*+,-./0123456789:;<=>?@[\]^_`{|}~)"},
-    {0, 1, 1, 0, 1});
+    {false, true, true, false, true});
   cudf::test::strings_column_wrapper const strings_col2(
     {"different but null",
      "The quick brown fox",
      "jumps over the lazy dog.",
      "I am Jack's complete lack of null value",
      R"(!"#$%&'()*+,-./0123456789:;<=>?@[\]^_`{|}~)"},
-    {0, 1, 1, 0, 1});
+    {false, true, true, false, true});
 
   // Nulls with different values should be equal
   using limits = std::numeric_limits<int32_t>;
   cudf::test::fixed_width_column_wrapper<int32_t> const ints_col1(
-    {0, 100, -100, limits::min(), limits::max()}, {1, 0, 0, 1, 1});
+    {0, 100, -100, limits::min(), limits::max()}, {true, false, false, true, true});
   cudf::test::fixed_width_column_wrapper<int32_t> const ints_col2(
-    {0, -200, 200, limits::min(), limits::max()}, {1, 0, 0, 1, 1});
+    {0, -200, 200, limits::min(), limits::max()}, {true, false, false, true, true});
 
   // Nulls with different values should be equal
   // Different truth values should be equal
-  cudf::test::fixed_width_column_wrapper<bool> const bools_col1({0, 1, 0, 1, 1}, {1, 1, 0, 0, 1});
-  cudf::test::fixed_width_column_wrapper<bool> const bools_col2({0, 2, 1, 0, 255}, {1, 1, 0, 0, 1});
+  cudf::test::fixed_width_column_wrapper<bool> const bools_col1({0, 1, 0, 1, 1},
+                                                                {true, true, false, false, true});
+  cudf::test::fixed_width_column_wrapper<bool> const bools_col2({0, 2, 1, 0, 255},
+                                                                {true, true, false, false, true});
 
   // Nulls with different values should be equal
   using ts = cudf::timestamp_s;
@@ -101,14 +103,14 @@ TEST_F(MurmurHashTest, MultiValueNulls)
      static_cast<ts::duration>(-100),
      ts::duration::min(),
      ts::duration::max()},
-    {1, 0, 0, 1, 1});
+    {true, false, false, true, true});
   cudf::test::fixed_width_column_wrapper<ts, ts::duration> const secs_col2(
     {ts::duration::zero(),
      static_cast<ts::duration>(-200),
      static_cast<ts::duration>(200),
      ts::duration::min(),
      ts::duration::max()},
-    {1, 0, 0, 1, 1});
+    {true, false, false, true, true});
 
   auto const input1 = cudf::table_view({strings_col1, ints_col1, bools_col1, secs_col1});
   auto const input2 = cudf::table_view({strings_col2, ints_col2, bools_col2, secs_col2});
@@ -165,7 +167,8 @@ TEST_F(MurmurHashTest, NullableList)
   using LCW = cudf::test::lists_column_wrapper<uint64_t>;
   using ICW = cudf::test::fixed_width_column_wrapper<uint32_t>;
 
-  auto const valids = std::vector<bool>{1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0};
+  auto const valids =
+    std::vector<bool>{true, true, true, true, true, true, true, false, true, true, false};
   auto const col =
     LCW{{{}, {}, {1}, {1}, {2, 2}, {2}, {2}, {}, {2, 2}, {2, 2}, {}}, valids.begin()};
   auto expect = ICW{-2023148619,
@@ -203,17 +206,84 @@ TEST_F(MurmurHashTest, ListOfStruct)
 {
   auto col1 = cudf::test::fixed_width_column_wrapper<int32_t>{
     {-1, -1, 0, 2, 2, 2, 1, 2, 0, 2, 0, 2, 0, 2, 0, 0, 1, 2},
-    {1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0}};
+    {true,
+     true,
+     true,
+     true,
+     true,
+     false,
+     true,
+     true,
+     true,
+     true,
+     true,
+     true,
+     true,
+     true,
+     true,
+     true,
+     false,
+     false}};
   auto col2 = cudf::test::strings_column_wrapper{
     {"x", "x", "a", "a", "b", "b", "a", "b", "a", "b", "a", "c", "a", "c", "a", "c", "b", "b"},
-    {1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1}};
-  auto struct_col = cudf::test::structs_column_wrapper{
-    {col1, col2}, {0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}};
+    {true,
+     true,
+     true,
+     true,
+     true,
+     false,
+     true,
+     true,
+     true,
+     true,
+     true,
+     true,
+     true,
+     true,
+     false,
+     false,
+     true,
+     true}};
+  auto struct_col = cudf::test::structs_column_wrapper{{col1, col2},
+                                                       {false,
+                                                        false,
+                                                        false,
+                                                        false,
+                                                        false,
+                                                        true,
+                                                        true,
+                                                        true,
+                                                        true,
+                                                        true,
+                                                        true,
+                                                        true,
+                                                        true,
+                                                        true,
+                                                        true,
+                                                        true,
+                                                        true,
+                                                        true}};
 
   auto offsets = cudf::test::fixed_width_column_wrapper<cudf::size_type>{
     0, 0, 0, 0, 0, 2, 3, 4, 5, 6, 8, 10, 12, 14, 15, 16, 17, 18};
 
-  auto list_nullmask = std::vector<bool>{1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+  auto list_nullmask = std::vector<bool>{true,
+                                         true,
+                                         false,
+                                         false,
+                                         true,
+                                         true,
+                                         true,
+                                         true,
+                                         true,
+                                         true,
+                                         true,
+                                         true,
+                                         true,
+                                         true,
+                                         true,
+                                         true,
+                                         true};
   auto [null_mask, null_count] =
     cudf::test::detail::make_null_mask(list_nullmask.begin(), list_nullmask.end());
   auto list_column = cudf::make_lists_column(
@@ -279,14 +349,16 @@ TEST_F(MurmurHashTest, ListOfEmptyStruct)
   // [{}, {}]
   // [{}, {}]
 
-  auto struct_validity = std::vector<bool>{0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1};
+  auto struct_validity = std::vector<bool>{
+    false, false, false, false, false, false, false, false, true, true, true, true, true, true};
   auto [null_mask, null_count] =
     cudf::test::detail::make_null_mask(struct_validity.begin(), struct_validity.end());
   auto struct_col = cudf::make_structs_column(14, {}, null_count, std::move(null_mask));
 
   auto offsets = cudf::test::fixed_width_column_wrapper<cudf::size_type>{
     0, 0, 0, 0, 0, 2, 4, 6, 7, 8, 9, 10, 12, 14};
-  auto list_nullmask = std::vector<bool>{1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+  auto list_nullmask = std::vector<bool>{
+    true, true, false, false, true, true, true, true, true, true, true, true, true};
   std::tie(null_mask, null_count) =
     cudf::test::detail::make_null_mask(list_nullmask.begin(), list_nullmask.end());
   auto list_column = cudf::make_lists_column(
@@ -322,7 +394,7 @@ TEST_F(MurmurHashTest, EmptyDeepList)
   auto list1 = cudf::test::lists_column_wrapper<int>{};
 
   auto offsets       = cudf::test::fixed_width_column_wrapper<cudf::size_type>{0, 0, 0, 0, 0};
-  auto list_nullmask = std::vector<bool>{1, 1, 0, 0};
+  auto list_nullmask = std::vector<bool>{true, true, false, false};
   auto [null_mask, null_count] =
     cudf::test::detail::make_null_mask(list_nullmask.begin(), list_nullmask.end());
   auto list_column = cudf::make_lists_column(
diff --git a/cpp/tests/hashing/sha1_test.cpp b/cpp/tests/hashing/sha1_test.cpp
index c3d0fe7450a..e28e71442a6 100644
--- a/cpp/tests/hashing/sha1_test.cpp
+++ b/cpp/tests/hashing/sha1_test.cpp
@@ -50,7 +50,7 @@ TEST_F(SHA1HashTest, MultiValue)
      "A very long (greater than 128 bytes/char string) to execute a multi hash-step data point in "
      "the hash function being tested. This string needed to be longer.",
      "All work and no play makes Jack a dull boy",
-     "!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`{|}~",
+     R"(!"#$%&'()*+,-./0123456789:;<=>?@[\]^_`{|}~)",
      "Multi-byte characters: é¼³⅝"});
 
   /*
@@ -114,8 +114,8 @@ TEST_F(SHA1HashTest, MultiValue)
 TEST_F(SHA1HashTest, EmptyNullEquivalence)
 {
   // Test that empty strings hash the same as nulls
-  cudf::test::strings_column_wrapper const strings_col1({"", ""}, {1, 0});
-  cudf::test::strings_column_wrapper const strings_col2({"", ""}, {0, 1});
+  cudf::test::strings_column_wrapper const strings_col1({"", ""}, {true, false});
+  cudf::test::strings_column_wrapper const strings_col2({"", ""}, {false, true});
 
   auto const input1 = cudf::table_view({strings_col1});
   auto const input2 = cudf::table_view({strings_col2});
@@ -133,7 +133,7 @@ TEST_F(SHA1HashTest, ListsUnsupported)
     {{""},
      {"", "Some inputs"},
      {"All ", "work ", "and", " no", " play ", "makes Jack", " a dull boy"},
-     {"!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`", "{|}~"}});
+     {R"(!"#$%&'()*+,-./0123456789:;<=>?@[\]^_`)", "{|}~"}});
 
   auto const input = cudf::table_view({strings_list_col});
 
diff --git a/cpp/tests/hashing/sha224_test.cpp b/cpp/tests/hashing/sha224_test.cpp
index def5e934177..61b584f94df 100644
--- a/cpp/tests/hashing/sha224_test.cpp
+++ b/cpp/tests/hashing/sha224_test.cpp
@@ -50,7 +50,7 @@ TEST_F(SHA224HashTest, MultiValue)
      "A very long (greater than 128 bytes/char string) to execute a multi hash-step data point in "
      "the hash function being tested. This string needed to be longer.",
      "All work and no play makes Jack a dull boy",
-     "!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`{|}~",
+     R"(!"#$%&'()*+,-./0123456789:;<=>?@[\]^_`{|}~)",
      "Multi-byte characters: é¼³⅝"});
 
   /*
@@ -114,8 +114,8 @@ TEST_F(SHA224HashTest, MultiValue)
 TEST_F(SHA224HashTest, EmptyNullEquivalence)
 {
   // Test that empty strings hash the same as nulls
-  cudf::test::strings_column_wrapper const strings_col1({"", ""}, {1, 0});
-  cudf::test::strings_column_wrapper const strings_col2({"", ""}, {0, 1});
+  cudf::test::strings_column_wrapper const strings_col1({"", ""}, {true, false});
+  cudf::test::strings_column_wrapper const strings_col2({"", ""}, {false, true});
 
   auto const input1 = cudf::table_view({strings_col1});
   auto const input2 = cudf::table_view({strings_col2});
@@ -133,7 +133,7 @@ TEST_F(SHA224HashTest, ListsUnsupported)
     {{""},
      {"", "Some inputs"},
      {"All ", "work ", "and", " no", " play ", "makes Jack", " a dull boy"},
-     {"!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`", "{|}~"}});
+     {R"(!"#$%&'()*+,-./0123456789:;<=>?@[\]^_`)", "{|}~"}});
 
   auto const input = cudf::table_view({strings_list_col});
 
diff --git a/cpp/tests/hashing/sha256_test.cpp b/cpp/tests/hashing/sha256_test.cpp
index 410a99edd77..cc95c7a2f0f 100644
--- a/cpp/tests/hashing/sha256_test.cpp
+++ b/cpp/tests/hashing/sha256_test.cpp
@@ -52,7 +52,7 @@ TEST_F(SHA256HashTest, MultiValue)
      "A very long (greater than 128 bytes/char string) to execute a multi hash-step data point in "
      "the hash function being tested. This string needed to be longer.",
      "All work and no play makes Jack a dull boy",
-     "!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`{|}~",
+     R"(!"#$%&'()*+,-./0123456789:;<=>?@[\]^_`{|}~)",
      "Multi-byte characters: é¼³⅝"});
 
   /*
@@ -115,8 +115,8 @@ TEST_F(SHA256HashTest, MultiValue)
 TEST_F(SHA256HashTest, EmptyNullEquivalence)
 {
   // Test that empty strings hash the same as nulls
-  cudf::test::strings_column_wrapper const strings_col1({"", ""}, {1, 0});
-  cudf::test::strings_column_wrapper const strings_col2({"", ""}, {0, 1});
+  cudf::test::strings_column_wrapper const strings_col1({"", ""}, {true, false});
+  cudf::test::strings_column_wrapper const strings_col2({"", ""}, {false, true});
 
   auto const input1 = cudf::table_view({strings_col1});
   auto const input2 = cudf::table_view({strings_col2});
@@ -134,7 +134,7 @@ TEST_F(SHA256HashTest, ListsUnsupported)
     {{""},
      {"", "Some inputs"},
      {"All ", "work ", "and", " no", " play ", "makes Jack", " a dull boy"},
-     {"!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`", "{|}~"}});
+     {R"(!"#$%&'()*+,-./0123456789:;<=>?@[\]^_`)", "{|}~"}});
 
   auto const input = cudf::table_view({strings_list_col});
 
diff --git a/cpp/tests/hashing/sha384_test.cpp b/cpp/tests/hashing/sha384_test.cpp
index 810fbc82d8e..4c79934f98d 100644
--- a/cpp/tests/hashing/sha384_test.cpp
+++ b/cpp/tests/hashing/sha384_test.cpp
@@ -50,7 +50,7 @@ TEST_F(SHA384HashTest, MultiValue)
      "A very long (greater than 128 bytes/char string) to execute a multi hash-step data point in "
      "the hash function being tested. This string needed to be longer.",
      "All work and no play makes Jack a dull boy",
-     "!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`{|}~",
+     R"(!"#$%&'()*+,-./0123456789:;<=>?@[\]^_`{|}~)",
      "Multi-byte characters: é¼³⅝"});
 
   /*
@@ -132,8 +132,8 @@ TEST_F(SHA384HashTest, MultiValue)
 TEST_F(SHA384HashTest, EmptyNullEquivalence)
 {
   // Test that empty strings hash the same as nulls
-  cudf::test::strings_column_wrapper const strings_col1({"", ""}, {1, 0});
-  cudf::test::strings_column_wrapper const strings_col2({"", ""}, {0, 1});
+  cudf::test::strings_column_wrapper const strings_col1({"", ""}, {true, false});
+  cudf::test::strings_column_wrapper const strings_col2({"", ""}, {false, true});
 
   auto const input1 = cudf::table_view({strings_col1});
   auto const input2 = cudf::table_view({strings_col2});
@@ -151,7 +151,7 @@ TEST_F(SHA384HashTest, ListsUnsupported)
     {{""},
      {"", "Some inputs"},
      {"All ", "work ", "and", " no", " play ", "makes Jack", " a dull boy"},
-     {"!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`", "{|}~"}});
+     {R"(!"#$%&'()*+,-./0123456789:;<=>?@[\]^_`)", "{|}~"}});
 
   auto const input = cudf::table_view({strings_list_col});
 
diff --git a/cpp/tests/hashing/sha512_test.cpp b/cpp/tests/hashing/sha512_test.cpp
index 93caa16c1c4..0eb1c60b8fc 100644
--- a/cpp/tests/hashing/sha512_test.cpp
+++ b/cpp/tests/hashing/sha512_test.cpp
@@ -50,7 +50,7 @@ TEST_F(SHA512HashTest, MultiValue)
      "A very long (greater than 128 bytes/char string) to execute a multi hash-step data point in "
      "the hash function being tested. This string needed to be longer.",
      "All work and no play makes Jack a dull boy",
-     "!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`{|}~",
+     R"(!"#$%&'()*+,-./0123456789:;<=>?@[\]^_`{|}~)",
      "Multi-byte characters: é¼³⅝"});
 
   /*
@@ -132,8 +132,8 @@ TEST_F(SHA512HashTest, MultiValue)
 TEST_F(SHA512HashTest, EmptyNullEquivalence)
 {
   // Test that empty strings hash the same as nulls
-  cudf::test::strings_column_wrapper const strings_col1({"", ""}, {1, 0});
-  cudf::test::strings_column_wrapper const strings_col2({"", ""}, {0, 1});
+  cudf::test::strings_column_wrapper const strings_col1({"", ""}, {true, false});
+  cudf::test::strings_column_wrapper const strings_col2({"", ""}, {false, true});
 
   auto const input1 = cudf::table_view({strings_col1});
   auto const input2 = cudf::table_view({strings_col2});
@@ -151,7 +151,7 @@ TEST_F(SHA512HashTest, ListsUnsupported)
     {{""},
      {"", "Some inputs"},
      {"All ", "work ", "and", " no", " play ", "makes Jack", " a dull boy"},
-     {"!\"#$%&\'()*+,-./0123456789:;<=>?@[\\]^_`", "{|}~"}});
+     {R"(!"#$%&'()*+,-./0123456789:;<=>?@[\]^_`)", "{|}~"}});
 
   auto const input = cudf::table_view({strings_list_col});
 
diff --git a/cpp/tests/interop/dlpack_test.cpp b/cpp/tests/interop/dlpack_test.cpp
index ecc8558243d..330f07ac8e2 100644
--- a/cpp/tests/interop/dlpack_test.cpp
+++ b/cpp/tests/interop/dlpack_test.cpp
@@ -105,7 +105,7 @@ TEST_F(DLPackUntypedTests, MultipleTypesToDlpack)
 TEST_F(DLPackUntypedTests, InvalidNullsToDlpack)
 {
   cudf::test::fixed_width_column_wrapper<int32_t> col1({1, 2, 3, 4});
-  cudf::test::fixed_width_column_wrapper<int32_t> col2({1, 2, 3, 4}, {1, 0, 1, 1});
+  cudf::test::fixed_width_column_wrapper<int32_t> col2({1, 2, 3, 4}, {true, false, true, true});
   cudf::table_view input({col1, col2});
   EXPECT_THROW(cudf::to_dlpack(input), cudf::logic_error);
 }
diff --git a/cpp/tests/interop/from_arrow_device_test.cpp b/cpp/tests/interop/from_arrow_device_test.cpp
index d776ca57ef6..a4dc7531765 100644
--- a/cpp/tests/interop/from_arrow_device_test.cpp
+++ b/cpp/tests/interop/from_arrow_device_test.cpp
@@ -70,7 +70,7 @@ TEST_F(FromArrowDeviceTest, FailConditions)
 
 TEST_F(FromArrowDeviceTest, EmptyTable)
 {
-  const auto [table, schema, arr] = get_nanoarrow_tables(0);
+  auto const [table, schema, arr] = get_nanoarrow_tables(0);
 
   auto expected_cudf_table = table->view();
 
@@ -354,7 +354,7 @@ TEST_F(FromArrowDeviceTest, StructColumn)
 
   NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator(ArrowArrayBuffer(array_a, 0), noop_alloc));
   ArrowArrayValidityBitmap(array_a)->buffer.data =
-    const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(view_a.null_mask()));
+    const_cast<uint8_t*>(reinterpret_cast<uint8_t const*>(view_a.null_mask()));
 
   populate_from_col<cudf::string_view>(array_a->children[0], view_a.child(0));
   populate_from_col<int32_t>(array_a->children[1], view_a.child(1));
@@ -372,7 +372,7 @@ TEST_F(FromArrowDeviceTest, StructColumn)
 
   NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator(ArrowArrayBuffer(array_struct, 0), noop_alloc));
   ArrowArrayValidityBitmap(array_struct)->buffer.data =
-    const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(view_struct.null_mask()));
+    const_cast<uint8_t*>(reinterpret_cast<uint8_t const*>(view_struct.null_mask()));
 
   populate_from_col<cudf::string_view>(array_struct->children[0], view_struct.child(0));
   populate_from_col<int32_t>(array_struct->children[1], view_struct.child(1));
@@ -392,7 +392,7 @@ TEST_F(FromArrowDeviceTest, StructColumn)
   {
     // there's one boolean column so we should have one "owned_mem" column in the
     // returned unique_ptr's custom deleter
-    const cudf::custom_view_deleter<cudf::table_view>& deleter = got_cudf_table_view.get_deleter();
+    cudf::custom_view_deleter<cudf::table_view> const& deleter = got_cudf_table_view.get_deleter();
     EXPECT_EQ(deleter.owned_mem_.size(), 1);
   }
 
@@ -405,7 +405,7 @@ TEST_F(FromArrowDeviceTest, StructColumn)
   {
     // there's one boolean column so we should have one "owned_mem" column in the
     // returned unique_ptr's custom deleter
-    const cudf::custom_view_deleter<cudf::column_view>& deleter = got_cudf_col.get_deleter();
+    cudf::custom_view_deleter<cudf::column_view> const& deleter = got_cudf_col.get_deleter();
     EXPECT_EQ(deleter.owned_mem_.size(), 1);
   }
 }
@@ -479,7 +479,7 @@ TEST_F(FromArrowDeviceTest, DictionaryIndicesType)
   CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table_view, *got_cudf_table_view);
 
   {
-    const cudf::custom_view_deleter<cudf::table_view>& deleter = got_cudf_table_view.get_deleter();
+    cudf::custom_view_deleter<cudf::table_view> const& deleter = got_cudf_table_view.get_deleter();
     EXPECT_EQ(deleter.owned_mem_.size(), 0);
   }
 
@@ -490,7 +490,7 @@ TEST_F(FromArrowDeviceTest, DictionaryIndicesType)
   CUDF_TEST_EXPECT_TABLES_EQUAL(*got_cudf_table_view, from_struct);
 
   {
-    const cudf::custom_view_deleter<cudf::column_view>& deleter = got_cudf_col.get_deleter();
+    cudf::custom_view_deleter<cudf::column_view> const& deleter = got_cudf_col.get_deleter();
     EXPECT_EQ(deleter.owned_mem_.size(), 0);
   }
 }
diff --git a/cpp/tests/interop/from_arrow_host_test.cpp b/cpp/tests/interop/from_arrow_host_test.cpp
index e6e52099a0c..cbfa4911c3c 100644
--- a/cpp/tests/interop/from_arrow_host_test.cpp
+++ b/cpp/tests/interop/from_arrow_host_test.cpp
@@ -76,7 +76,7 @@ get_nanoarrow_host_tables(cudf::size_type length)
   ArrowBitmapInit(&struct_validity);
   NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(&struct_validity, length));
   ArrowBitmapAppendInt8Unsafe(
-    &struct_validity, reinterpret_cast<const int8_t*>(test_data.bool_data_validity.data()), length);
+    &struct_validity, reinterpret_cast<int8_t const*>(test_data.bool_data_validity.data()), length);
   arrow->children[5]->length = length;
   ArrowArraySetValidityBitmap(arrow->children[5], &struct_validity);
   arrow->children[5]->null_count =
@@ -267,7 +267,7 @@ TEST_F(FromArrowHostDeviceTest, NestedList)
   EXPECT_EQ(
     NANOARROW_OK,
     ArrowBufferAppend(
-      offset_buf, reinterpret_cast<const void*>(offset.data()), offset.size() * sizeof(int32_t)));
+      offset_buf, reinterpret_cast<void const*>(offset.data()), offset.size() * sizeof(int32_t)));
 
   // move our base list to be the child of the one we just created
   // so that we now have an equivalent value to what we created for cudf
@@ -416,7 +416,7 @@ TEST_F(FromArrowHostDeviceTest, StructColumn)
   EXPECT_EQ(
     NANOARROW_OK,
     ArrowBufferAppend(
-      offset_buf, reinterpret_cast<const void*>(offset.data()), offset.size() * sizeof(int32_t)));
+      offset_buf, reinterpret_cast<void const*>(offset.data()), offset.size() * sizeof(int32_t)));
 
   list_arr.move(array_a->children[3]->children[0]);
 
diff --git a/cpp/tests/interop/from_arrow_test.cpp b/cpp/tests/interop/from_arrow_test.cpp
index 94b0c75f184..aec2bab7196 100644
--- a/cpp/tests/interop/from_arrow_test.cpp
+++ b/cpp/tests/interop/from_arrow_test.cpp
@@ -39,17 +39,19 @@
 std::unique_ptr<cudf::table> get_cudf_table()
 {
   std::vector<std::unique_ptr<cudf::column>> columns;
-  columns.emplace_back(
-    cudf::test::fixed_width_column_wrapper<int32_t>({1, 2, 5, 2, 7}, {1, 0, 1, 1, 1}).release());
+  columns.emplace_back(cudf::test::fixed_width_column_wrapper<int32_t>(
+                         {1, 2, 5, 2, 7}, {true, false, true, true, true})
+                         .release());
   columns.emplace_back(cudf::test::fixed_width_column_wrapper<int64_t>({1, 2, 3, 4, 5}).release());
-  columns.emplace_back(
-    cudf::test::strings_column_wrapper({"fff", "aaa", "", "fff", "ccc"}, {1, 1, 1, 0, 1})
-      .release());
-  auto col4 = cudf::test::fixed_width_column_wrapper<int32_t>({1, 2, 5, 2, 7}, {1, 0, 1, 1, 1});
+  columns.emplace_back(cudf::test::strings_column_wrapper({"fff", "aaa", "", "fff", "ccc"},
+                                                          {true, true, true, false, true})
+                         .release());
+  auto col4 = cudf::test::fixed_width_column_wrapper<int32_t>({1, 2, 5, 2, 7},
+                                                              {true, false, true, true, true});
   columns.emplace_back(std::move(cudf::dictionary::encode(col4)));
-  columns.emplace_back(
-    cudf::test::fixed_width_column_wrapper<bool>({true, false, true, false, true}, {1, 0, 1, 1, 0})
-      .release());
+  columns.emplace_back(cudf::test::fixed_width_column_wrapper<bool>(
+                         {true, false, true, false, true}, {true, false, true, true, false})
+                         .release());
   // columns.emplace_back(cudf::test::lists_column_wrapper<int>({{1, 2}, {3, 4}, {}, {6}, {7, 8,
   // 9}}).release());
   return std::make_unique<cudf::table>(std::move(columns));
@@ -171,15 +173,17 @@ TEST_F(FromArrowTest, StructColumn)
       "Samuel Vimes", "Carrot Ironfoundersson", "Angua von Überwald"}
       .release();
   auto str_col2 =
-    cudf::test::strings_column_wrapper{{"CUDF", "ROCKS", "EVERYWHERE"}, {0, 1, 0}}.release();
+    cudf::test::strings_column_wrapper{{"CUDF", "ROCKS", "EVERYWHERE"}, {false, true, false}}
+      .release();
   int num_rows{str_col->size()};
   auto int_col = cudf::test::fixed_width_column_wrapper<int32_t, int32_t>{{48, 27, 25}}.release();
   auto int_col2 =
-    cudf::test::fixed_width_column_wrapper<int32_t, int32_t>{{12, 24, 47}, {1, 0, 1}}.release();
-  auto bool_col = cudf::test::fixed_width_column_wrapper<bool>{{true, true, false}}.release();
-  auto list_col =
-    cudf::test::lists_column_wrapper<int64_t>({{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}})
+    cudf::test::fixed_width_column_wrapper<int32_t, int32_t>{{12, 24, 47}, {true, false, true}}
       .release();
+  auto bool_col = cudf::test::fixed_width_column_wrapper<bool>{{true, true, false}}.release();
+  auto list_col = cudf::test::lists_column_wrapper<int64_t>(
+                    {{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}})  // NOLINT
+                    .release();
   vector_of_columns cols2;
   cols2.push_back(std::move(str_col2));
   cols2.push_back(std::move(int_col2));
@@ -261,7 +265,8 @@ TEST_F(FromArrowTest, DictionaryIndicesType)
   auto arrow_table = arrow::Table::Make(schema, {array1, array2, array3});
 
   std::vector<std::unique_ptr<cudf::column>> columns;
-  auto col = cudf::test::fixed_width_column_wrapper<int64_t>({1, 2, 5, 2, 7}, {1, 0, 1, 1, 1});
+  auto col = cudf::test::fixed_width_column_wrapper<int64_t>({1, 2, 5, 2, 7},
+                                                             {true, false, true, true, true});
   columns.emplace_back(std::move(cudf::dictionary::encode(col)));
   columns.emplace_back(std::move(cudf::dictionary::encode(col)));
   columns.emplace_back(std::move(cudf::dictionary::encode(col)));
@@ -299,7 +304,8 @@ TEST_F(FromArrowTest, ChunkedArray)
     std::vector<std::shared_ptr<arrow::Array>>{string_array_1, string_array_2});
   auto dict_chunked_array = std::make_shared<arrow::ChunkedArray>(
     std::vector<std::shared_ptr<arrow::Array>>{dict_array1, dict_array2});
-  auto boolean_array = get_arrow_array<bool>({true, false, true, false, true}, {1, 0, 1, 1, 0});
+  auto boolean_array =
+    get_arrow_array<bool>({true, false, true, false, true}, {true, false, true, true, false});
   auto boolean_chunked_array = std::make_shared<arrow::ChunkedArray>(boolean_array);
 
   std::vector<std::shared_ptr<arrow::Field>> schema_vector(
@@ -404,8 +410,9 @@ TEST_F(FromArrowTest, FixedPoint128TableNulls)
   for (auto const scale : {3, 2, 1, 0, -1, -2, -3}) {
     auto const data     = std::vector<__int128_t>{1, 2, 3, 4, 5, 6, 0, 0};
     auto const validity = std::vector<int32_t>{1, 1, 1, 1, 1, 1, 0, 0};
-    auto const col =
-      fp_wrapper<__int128_t>({1, 2, 3, 4, 5, 6, 0, 0}, {1, 1, 1, 1, 1, 1, 0, 0}, scale_type{scale});
+    auto const col      = fp_wrapper<__int128_t>({1, 2, 3, 4, 5, 6, 0, 0},
+                                            {true, true, true, true, true, true, false, false},
+                                            scale_type{scale});
     auto const expected = cudf::table_view({col});
 
     auto const arr = make_decimal128_arrow_array(data, validity, scale);
diff --git a/cpp/tests/interop/nanoarrow_utils.hpp b/cpp/tests/interop/nanoarrow_utils.hpp
index a79e6fdc49c..94c4372e74a 100644
--- a/cpp/tests/interop/nanoarrow_utils.hpp
+++ b/cpp/tests/interop/nanoarrow_utils.hpp
@@ -97,7 +97,7 @@ std::enable_if_t<cudf::is_fixed_width<T>() and !std::is_same_v<T, bool>, void> p
   ArrowArrayValidityBitmap(arr)->buffer.size_bytes =
     cudf::bitmask_allocation_size_bytes(view.size());
   ArrowArrayValidityBitmap(arr)->buffer.data =
-    const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(view.null_mask()));
+    const_cast<uint8_t*>(reinterpret_cast<uint8_t const*>(view.null_mask()));
   NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 1), noop_alloc));
   ArrowArrayBuffer(arr, 1)->size_bytes = sizeof(T) * view.size();
   ArrowArrayBuffer(arr, 1)->data       = const_cast<uint8_t*>(view.data<uint8_t>());
@@ -117,7 +117,7 @@ std::enable_if_t<std::is_same_v<T, bool>, void> populate_from_col(ArrowArray* ar
   ArrowArrayValidityBitmap(arr)->buffer.size_bytes =
     cudf::bitmask_allocation_size_bytes(view.size());
   ArrowArrayValidityBitmap(arr)->buffer.data =
-    const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(view.null_mask()));
+    const_cast<uint8_t*>(reinterpret_cast<uint8_t const*>(view.null_mask()));
 
   auto bitmask = cudf::bools_to_mask(view);
   auto ptr     = reinterpret_cast<uint8_t*>(bitmask.first->data());
@@ -147,7 +147,7 @@ std::enable_if_t<std::is_same_v<T, cudf::string_view>, void> populate_from_col(
   ArrowArrayValidityBitmap(arr)->buffer.size_bytes =
     cudf::bitmask_allocation_size_bytes(view.size());
   ArrowArrayValidityBitmap(arr)->buffer.data =
-    const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(view.null_mask()));
+    const_cast<uint8_t*>(reinterpret_cast<uint8_t const*>(view.null_mask()));
 
   cudf::strings_column_view sview{view};
   if (view.size() > 0) {
@@ -159,7 +159,7 @@ std::enable_if_t<std::is_same_v<T, cudf::string_view>, void> populate_from_col(
     ArrowArrayBuffer(arr, 2)->data       = const_cast<uint8_t*>(view.data<uint8_t>());
   } else {
     auto zero          = rmm::device_scalar<int32_t>(0, cudf::get_default_stream());
-    const uint8_t* ptr = reinterpret_cast<uint8_t*>(zero.data());
+    uint8_t const* ptr = reinterpret_cast<uint8_t*>(zero.data());
     nanoarrow::BufferInitWrapped(ArrowArrayBuffer(arr, 1), std::move(zero), ptr, 4);
   }
 }
@@ -173,7 +173,7 @@ void populate_dict_from_col(ArrowArray* arr, cudf::dictionary_column_view dview)
   ArrowArrayValidityBitmap(arr)->buffer.size_bytes =
     cudf::bitmask_allocation_size_bytes(dview.size());
   ArrowArrayValidityBitmap(arr)->buffer.data =
-    const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(dview.null_mask()));
+    const_cast<uint8_t*>(reinterpret_cast<uint8_t const*>(dview.null_mask()));
 
   NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 1), noop_alloc));
   ArrowArrayBuffer(arr, 1)->size_bytes = sizeof(IND_TYPE) * dview.indices().size();
@@ -225,7 +225,7 @@ get_nanoarrow_array(std::vector<T> const& data, std::vector<uint8_t> const& mask
     ArrowBitmap bitmap;
     ArrowBitmapInit(&bitmap);
     NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(&bitmap, mask.size()));
-    ArrowBitmapAppendInt8Unsafe(&bitmap, reinterpret_cast<const int8_t*>(mask.data()), mask.size());
+    ArrowBitmapAppendInt8Unsafe(&bitmap, reinterpret_cast<int8_t const*>(mask.data()), mask.size());
 
     ArrowArraySetValidityBitmap(tmp.get(), &bitmap);
     tmp->null_count =
@@ -343,7 +343,7 @@ nanoarrow::UniqueArray get_nanoarrow_list_array(std::vector<T> const& data,
     ArrowBitmapInit(&bitmap);
     NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(&bitmap, list_validity.size()));
     ArrowBitmapAppendInt8Unsafe(
-      &bitmap, reinterpret_cast<const int8_t*>(list_validity.data()), list_validity.size());
+      &bitmap, reinterpret_cast<int8_t const*>(list_validity.data()), list_validity.size());
 
     ArrowArraySetValidityBitmap(tmp.get(), &bitmap);
     tmp->null_count =
diff --git a/cpp/tests/interop/to_arrow_device_test.cpp b/cpp/tests/interop/to_arrow_device_test.cpp
index 4c73cd637a4..860544b8606 100644
--- a/cpp/tests/interop/to_arrow_device_test.cpp
+++ b/cpp/tests/interop/to_arrow_device_test.cpp
@@ -206,7 +206,7 @@ get_nanoarrow_tables(cudf::size_type length)
   ArrowArrayValidityBitmap(arrow->children[5])->buffer.size_bytes =
     cudf::bitmask_allocation_size_bytes(struct_view.size());
   ArrowArrayValidityBitmap(arrow->children[5])->buffer.data =
-    const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(struct_view.null_mask()));
+    const_cast<uint8_t*>(reinterpret_cast<uint8_t const*>(struct_view.null_mask()));
 
   ArrowError error;
   if (ArrowArrayFinishBuilding(arrow.get(), NANOARROW_VALIDATION_LEVEL_MINIMAL, &error) !=
@@ -229,7 +229,7 @@ void populate_list_from_col(ArrowArray* arr, cudf::lists_column_view view)
   ArrowArrayValidityBitmap(arr)->buffer.size_bytes =
     cudf::bitmask_allocation_size_bytes(view.size());
   ArrowArrayValidityBitmap(arr)->buffer.data =
-    const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(view.null_mask()));
+    const_cast<uint8_t*>(reinterpret_cast<uint8_t const*>(view.null_mask()));
 
   NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator(ArrowArrayBuffer(arr, 1), noop_alloc));
   ArrowArrayBuffer(arr, 1)->size_bytes = sizeof(int32_t) * view.offsets().size();
@@ -237,7 +237,7 @@ void populate_list_from_col(ArrowArray* arr, cudf::lists_column_view view)
 }
 
 struct BaseArrowFixture : public cudf::test::BaseFixture {
-  void compare_schemas(const ArrowSchema* expected, const ArrowSchema* actual)
+  void compare_schemas(ArrowSchema const* expected, ArrowSchema const* actual)
   {
     EXPECT_STREQ(expected->format, actual->format);
     EXPECT_STREQ(expected->name, actual->name);
@@ -264,9 +264,9 @@ struct BaseArrowFixture : public cudf::test::BaseFixture {
   }
 
   void compare_device_buffers(const size_t nbytes,
-                              const int buffer_idx,
-                              const ArrowArray* expected,
-                              const ArrowArray* actual)
+                              int const buffer_idx,
+                              ArrowArray const* expected,
+                              ArrowArray const* actual)
   {
     std::vector<uint8_t> actual_bytes;
     std::vector<uint8_t> expected_bytes;
@@ -281,9 +281,9 @@ struct BaseArrowFixture : public cudf::test::BaseFixture {
     ASSERT_EQ(expected_bytes, actual_bytes);
   }
 
-  void compare_arrays(const ArrowSchema* schema,
-                      const ArrowArray* expected,
-                      const ArrowArray* actual)
+  void compare_arrays(ArrowSchema const* schema,
+                      ArrowArray const* expected,
+                      ArrowArray const* actual)
   {
     ArrowSchemaView schema_view;
     NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&schema_view, schema, nullptr));
@@ -337,7 +337,7 @@ TYPED_TEST_SUITE(ToArrowDeviceTestDurationsTest, cudf::test::DurationTypes);
 
 TEST_F(ToArrowDeviceTest, EmptyTable)
 {
-  const auto [table, schema, arr] = get_nanoarrow_tables(0);
+  auto const [table, schema, arr] = get_nanoarrow_tables(0);
 
   auto struct_meta          = cudf::column_metadata{"f"};
   struct_meta.children_meta = {{"integral"}, {"string"}};
@@ -653,7 +653,7 @@ TEST_F(ToArrowDeviceTest, StructColumn)
 
   NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator(ArrowArrayBuffer(array_a, 0), noop_alloc));
   ArrowArrayValidityBitmap(array_a)->buffer.data =
-    const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(view_a.null_mask()));
+    const_cast<uint8_t*>(reinterpret_cast<uint8_t const*>(view_a.null_mask()));
 
   populate_from_col<cudf::string_view>(array_a->children[0], view_a.child(0));
   populate_from_col<int32_t>(array_a->children[1], view_a.child(1));
@@ -671,7 +671,7 @@ TEST_F(ToArrowDeviceTest, StructColumn)
 
   NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator(ArrowArrayBuffer(array_struct, 0), noop_alloc));
   ArrowArrayValidityBitmap(array_struct)->buffer.data =
-    const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(view_struct.null_mask()));
+    const_cast<uint8_t*>(reinterpret_cast<uint8_t const*>(view_struct.null_mask()));
 
   populate_from_col<cudf::string_view>(array_struct->children[0], view_struct.child(0));
   populate_from_col<int32_t>(array_struct->children[1], view_struct.child(1));
@@ -736,7 +736,7 @@ TEST_F(ToArrowDeviceTest, FixedPoint64Table)
     NANOARROW_THROW_NOT_OK(
       ArrowBufferSetAllocator(ArrowArrayBuffer(expected_array->children[0], 0), noop_alloc));
     ArrowArrayValidityBitmap(expected_array->children[0])->buffer.data =
-      const_cast<uint8_t*>(reinterpret_cast<const uint8_t*>(input.view().column(0).null_mask()));
+      const_cast<uint8_t*>(reinterpret_cast<uint8_t const*>(input.view().column(0).null_mask()));
 
     auto data_ptr = reinterpret_cast<uint8_t*>(result_dev_data->data());
     NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator(
diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp
index 880dc911954..ff433264446 100644
--- a/cpp/tests/io/csv_test.cpp
+++ b/cpp/tests/io/csv_test.cpp
@@ -938,7 +938,7 @@ TEST_F(CsvReaderTest, Strings)
     outfile << names[0] << ',' << names[1] << '\n';
     outfile << "10,abc def ghi" << '\n';
     outfile << "20,\"jkl mno pqr\"" << '\n';
-    outfile << "30,stu \"\"vwx\"\" yz" << '\n';
+    outfile << R"(30,stu ""vwx"" yz)" << '\n';
   }
 
   cudf::io::csv_reader_options in_opts =
@@ -996,7 +996,7 @@ TEST_F(CsvReaderTest, StringsQuotesIgnored)
     std::ofstream outfile(filepath, std::ofstream::out);
     outfile << names[0] << ',' << names[1] << '\n';
     outfile << "10,\"abcdef ghi\"" << '\n';
-    outfile << "20,\"jkl \"\"mno\"\" pqr\"" << '\n';
+    outfile << R"(20,"jkl ""mno"" pqr")" << '\n';
     outfile << "30,stu \"vwx\" yz" << '\n';
   }
 
diff --git a/cpp/tests/io/json_chunked_reader.cpp b/cpp/tests/io/json_chunked_reader.cpp
index 7482cb1b70d..23d54f7263c 100644
--- a/cpp/tests/io/json_chunked_reader.cpp
+++ b/cpp/tests/io/json_chunked_reader.cpp
@@ -76,10 +76,10 @@ std::vector<cudf::io::table_with_metadata> skeleton_for_parellel_chunk_reader(
   auto prev                = first_delimiter_index[0];
   for (size_t i = 1; i < num_chunks; i++) {
     if (first_delimiter_index[i] == no_min_value) continue;
-    record_ranges.push_back({prev, first_delimiter_index[i]});
+    record_ranges.emplace_back(prev, first_delimiter_index[i]);
     prev = first_delimiter_index[i];
   }
-  record_ranges.push_back({prev, total_source_size});
+  record_ranges.emplace_back(prev, total_source_size);
 
   std::vector<cudf::io::table_with_metadata> tables;
   // Process each chunk in parallel.
diff --git a/cpp/tests/io/json_quote_normalization_test.cpp b/cpp/tests/io/json_quote_normalization_test.cpp
index 5260b435482..55ad0afe499 100644
--- a/cpp/tests/io/json_quote_normalization_test.cpp
+++ b/cpp/tests/io/json_quote_normalization_test.cpp
@@ -35,7 +35,7 @@
 // Base test fixture for tests
 struct JsonNormalizationTest : public cudf::test::BaseFixture {};
 
-void run_test(const std::string& host_input, const std::string& expected_host_output)
+void run_test(std::string const& host_input, std::string const& expected_host_output)
 {
   // RMM memory resource
   std::shared_ptr<rmm::mr::device_memory_resource> rsc =
diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp
index 4c01a1fb87b..9c76c344157 100644
--- a/cpp/tests/io/json_test.cpp
+++ b/cpp/tests/io/json_test.cpp
@@ -1829,7 +1829,7 @@ TYPED_TEST(JsonValidFixedPointReaderTest, SingleColumnPositiveScale)
 
 TYPED_TEST(JsonFixedPointReaderTest, EmptyValues)
 {
-  auto const buffer = std::string{"{\"col0\":\"\"}"};
+  auto const buffer = std::string{R"({"col0":""})"};
 
   cudf::io::json_reader_options const in_opts =
     cudf::io::json_reader_options::builder(cudf::io::source_info{buffer.c_str(), buffer.size()})
@@ -2424,7 +2424,7 @@ TEST_P(JsonDelimiterParamTest, JsonLinesDelimiter)
    * linearly in O(n), we can do it in O(log n) by doubling the input in each iteration. The total
    * number of such iterations is log_repetitions.
    */
-  std::size_t const log_repetitions =
+  auto const log_repetitions =
     static_cast<std::size_t>(std::ceil(std::log2(string_size / input.size())));
   std::size_t const repetitions = 1UL << log_repetitions;
   for (std::size_t i = 0; i < log_repetitions; i++) {
diff --git a/cpp/tests/io/json_tree.cpp b/cpp/tests/io/json_tree.cpp
index 3577b47a7e2..7a72b77e1fb 100644
--- a/cpp/tests/io/json_tree.cpp
+++ b/cpp/tests/io/json_tree.cpp
@@ -297,8 +297,8 @@ tree_meta_t2 get_tree_representation_cpu(
   };
 
   // Includes quote char for end-of-string token or Skips the quote char for beginning-of-field-name
-  auto get_token_index = [include_quote_char](cuio_json::PdaTokenT const token,
-                                              cuio_json::SymbolOffsetT const token_index) {
+  auto get_token_index = [](cuio_json::PdaTokenT const token,
+                            cuio_json::SymbolOffsetT const token_index) {
     constexpr cuio_json::SymbolOffsetT quote_char_size = 1;
     switch (token) {
       // Strip off or include quote char for StringBegin
@@ -398,10 +398,10 @@ tree_meta_t2 get_tree_representation_cpu(
 
     // Modify the stack if needed
     if (token == cuio_json::token_t::FieldNameBegin) {
-      parent_stack.push({node_id, field_name_node});
+      parent_stack.emplace(node_id, field_name_node);
     } else {
       if (does_push(token)) {
-        parent_stack.push({node_id, no_field_name_node});
+        parent_stack.emplace(node_id, no_field_name_node);
       } else if (does_pop(token)) {
         CUDF_EXPECTS(parent_stack.size() >= 1, "Invalid JSON input.");
         parent_stack.pop();
diff --git a/cpp/tests/io/orc_chunked_reader_test.cu b/cpp/tests/io/orc_chunked_reader_test.cu
index 1c1b53ea17f..2b78a5e7251 100644
--- a/cpp/tests/io/orc_chunked_reader_test.cu
+++ b/cpp/tests/io/orc_chunked_reader_test.cu
@@ -1308,9 +1308,9 @@ TEST_F(OrcChunkedReaderInputLimitTest, ReadWithRowSelection)
   int constexpr num_rows_to_read = rows_per_stripe * 5 + random_val;
 
   // Just shift the read data region back by a random offset.
-  const auto num_rows_to_skip = num_rows - num_rows_to_read - random_val;
+  auto const num_rows_to_skip = num_rows - num_rows_to_read - random_val;
 
-  const auto sequence_start = num_rows_to_skip % num_rows;
+  auto const sequence_start = num_rows_to_skip % num_rows;
   auto const skipped_col = int32s_col(it + sequence_start, it + sequence_start + num_rows_to_read);
   auto const expected    = cudf::table_view{{skipped_col}};
 
diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp
index a544a812efb..b5e080f3cc5 100644
--- a/cpp/tests/io/orc_test.cpp
+++ b/cpp/tests/io/orc_test.cpp
@@ -2140,7 +2140,7 @@ TEST_F(OrcReaderTest, SizeTypeRowsOverflow)
   EXPECT_EQ(metadata.num_stripes(), total_rows / 1'000'000);
 
   constexpr auto num_rows_to_read = 1'000'000;
-  const auto num_rows_to_skip     = metadata.num_rows() - num_rows_to_read;
+  auto const num_rows_to_skip     = metadata.num_rows() - num_rows_to_read;
 
   // Read the last million rows
   cudf::io::orc_reader_options skip_opts =
@@ -2148,9 +2148,9 @@ TEST_F(OrcReaderTest, SizeTypeRowsOverflow)
       cudf::io::source_info{out_buffer.data(), out_buffer.size()})
       .use_index(false)
       .skip_rows(num_rows_to_skip);
-  const auto got_with_skip = cudf::io::read_orc(skip_opts).tbl;
+  auto const got_with_skip = cudf::io::read_orc(skip_opts).tbl;
 
-  const auto sequence_start = num_rows_to_skip % num_rows;
+  auto const sequence_start = num_rows_to_skip % num_rows;
   column_wrapper<int8_t, typename decltype(sequence)::value_type> skipped_col(
     sequence + sequence_start, sequence + sequence_start + num_rows_to_read, no_nulls());
   table_view expected({skipped_col});
@@ -2163,7 +2163,7 @@ TEST_F(OrcReaderTest, SizeTypeRowsOverflow)
       cudf::io::source_info{out_buffer.data(), out_buffer.size()})
       .use_index(false)
       .stripes({{metadata.num_stripes() - 1}});
-  const auto got_with_stripe_selection = cudf::io::read_orc(stripe_opts).tbl;
+  auto const got_with_stripe_selection = cudf::io::read_orc(stripe_opts).tbl;
 
   CUDF_TEST_EXPECT_TABLES_EQUAL(expected, got_with_stripe_selection->view());
 }
diff --git a/cpp/tests/io/parquet_chunked_writer_test.cpp b/cpp/tests/io/parquet_chunked_writer_test.cpp
index a0c9641097b..282c6f3adad 100644
--- a/cpp/tests/io/parquet_chunked_writer_test.cpp
+++ b/cpp/tests/io/parquet_chunked_writer_test.cpp
@@ -228,10 +228,12 @@ TEST_F(ParquetChunkedWriterTest, ListOfStruct)
   auto table_1 = table_view({*list_col_1});
 
   // Table 2
-  auto weight_2   = cudf::test::fixed_width_column_wrapper<float>{{1.1, -1.0, -1.0}};
-  auto ages_2     = cudf::test::fixed_width_column_wrapper<int32_t>{{31, 351, 351}, {1, 1, 0}};
-  auto struct_1_2 = cudf::test::structs_column_wrapper{{weight_2, ages_2}, {1, 0, 1}};
-  auto is_human_2 = cudf::test::fixed_width_column_wrapper<bool>{{false, false, false}, {1, 1, 0}};
+  auto weight_2 = cudf::test::fixed_width_column_wrapper<float>{{1.1, -1.0, -1.0}};
+  auto ages_2 =
+    cudf::test::fixed_width_column_wrapper<int32_t>{{31, 351, 351}, {true, true, false}};
+  auto struct_1_2 = cudf::test::structs_column_wrapper{{weight_2, ages_2}, {true, false, true}};
+  auto is_human_2 =
+    cudf::test::fixed_width_column_wrapper<bool>{{false, false, false}, {true, true, false}};
   auto struct_2_2 = cudf::test::structs_column_wrapper{{is_human_2, struct_1_2}};
 
   auto list_offsets_column_2 =
@@ -313,10 +315,11 @@ TEST_F(ParquetChunkedWriterTest, ListOfStructOfStructOfListOfList)
   // [[], [], []]
   lcw flats_2{lcw{lcw{}}, lcw{lcw{}, lcw{}, lcw{}}};
 
-  auto weight_2   = cudf::test::fixed_width_column_wrapper<float>{{-1.0, -1.0}};
-  auto ages_2     = cudf::test::fixed_width_column_wrapper<int32_t>{{351, 351}, {1, 0}};
-  auto struct_1_2 = cudf::test::structs_column_wrapper{{weight_2, ages_2, land_2, flats_2}, {0, 1}};
-  auto is_human_2 = cudf::test::fixed_width_column_wrapper<bool>{{false, false}, {1, 0}};
+  auto weight_2 = cudf::test::fixed_width_column_wrapper<float>{{-1.0, -1.0}};
+  auto ages_2   = cudf::test::fixed_width_column_wrapper<int32_t>{{351, 351}, {true, false}};
+  auto struct_1_2 =
+    cudf::test::structs_column_wrapper{{weight_2, ages_2, land_2, flats_2}, {false, true}};
+  auto is_human_2 = cudf::test::fixed_width_column_wrapper<bool>{{false, false}, {true, false}};
   auto struct_2_2 = cudf::test::structs_column_wrapper{{is_human_2, struct_1_2}};
 
   auto list_offsets_column_2 =
@@ -495,10 +498,12 @@ TEST_F(ParquetChunkedWriterTest, DifferentNullabilityStruct)
 
   // Table 2: struct_1 and is_human are nullable now so if we hadn't assumed worst case (nullable)
   // when writing table_1, we would have wrong pages for it.
-  auto weight_2   = cudf::test::fixed_width_column_wrapper<float>{{1.1, -1.0, -1.0}};
-  auto ages_2     = cudf::test::fixed_width_column_wrapper<int32_t>{{31, 351, 351}, {1, 1, 0}};
-  auto struct_1_2 = cudf::test::structs_column_wrapper{{weight_2, ages_2}, {1, 0, 1}};
-  auto is_human_2 = cudf::test::fixed_width_column_wrapper<bool>{{false, false, false}, {1, 1, 0}};
+  auto weight_2 = cudf::test::fixed_width_column_wrapper<float>{{1.1, -1.0, -1.0}};
+  auto ages_2 =
+    cudf::test::fixed_width_column_wrapper<int32_t>{{31, 351, 351}, {true, true, false}};
+  auto struct_1_2 = cudf::test::structs_column_wrapper{{weight_2, ages_2}, {true, false, true}};
+  auto is_human_2 =
+    cudf::test::fixed_width_column_wrapper<bool>{{false, false, false}, {true, true, false}};
   auto struct_2_2 = cudf::test::structs_column_wrapper{{is_human_2, struct_1_2}};
   auto table_2    = cudf::table_view({struct_2_2});
 
@@ -628,9 +633,10 @@ TEST_F(ParquetChunkedWriterTest, ForcedNullabilityStruct)
   auto struct_2_1 = cudf::test::structs_column_wrapper{{is_human_1, struct_1_1}};
   auto table_1    = cudf::table_view({struct_2_1});
 
-  auto weight_2   = cudf::test::fixed_width_column_wrapper<float>{{1.1, -1.0, -1.0}};
-  auto ages_2     = cudf::test::fixed_width_column_wrapper<int32_t>{{31, 351, 351}, {1, 1, 0}};
-  auto struct_1_2 = cudf::test::structs_column_wrapper{{weight_2, ages_2}, {1, 0, 1}};
+  auto weight_2 = cudf::test::fixed_width_column_wrapper<float>{{1.1, -1.0, -1.0}};
+  auto ages_2 =
+    cudf::test::fixed_width_column_wrapper<int32_t>{{31, 351, 351}, {true, true, false}};
+  auto struct_1_2 = cudf::test::structs_column_wrapper{{weight_2, ages_2}, {true, false, true}};
   auto is_human_2 = cudf::test::fixed_width_column_wrapper<bool>{{false, false, false}};
   auto struct_2_2 = cudf::test::structs_column_wrapper{{is_human_2, struct_1_2}};
   auto table_2    = cudf::table_view({struct_2_2});
diff --git a/cpp/tests/io/parquet_reader_test.cpp b/cpp/tests/io/parquet_reader_test.cpp
index aa9172b0608..2edf9e0aee6 100644
--- a/cpp/tests/io/parquet_reader_test.cpp
+++ b/cpp/tests/io/parquet_reader_test.cpp
@@ -119,7 +119,7 @@ TEST_F(ParquetReaderTest, UserBoundsWithNulls)
 {
   // clang-format off
   cudf::test::fixed_width_column_wrapper<float> col{{1,1,1,1,1,1,1,1, 2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4,  5,5,5,5,5,5,5,5, 6,6,6,6,6,6,6,6, 7,7,7,7,7,7,7,7, 8,8,8,8,8,8,8,8}
-                                                   ,{1,1,1,0,0,0,1,1, 1,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,0, 1,1,1,1,1,1,0,0,  1,0,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,0}};
+                                                   ,{true,true,true,false,false,false,true,true, true,true,true,true,true,true,true,true, false,false,false,false,false,false,false,false, true,true,true,true,true,true,false,false,  true,false,true,true,true,true,true,true, true,true,true,true,true,true,true,true, true,true,true,true,true,true,true,true, true,true,true,true,true,true,true,false}};
   // clang-format on
   cudf::table_view tbl({col});
   auto filepath = temp_env->get_temp_filepath("UserBoundsWithNulls.parquet");
@@ -168,7 +168,7 @@ TEST_F(ParquetReaderTest, UserBoundsWithNullsMixedTypes)
   // list<float>
   constexpr int floats_per_row = 4;
   auto c1_offset_iter          = cudf::detail::make_counting_transform_iterator(
-    0, [floats_per_row](cudf::size_type idx) { return idx * floats_per_row; });
+    0, [](cudf::size_type idx) { return idx * floats_per_row; });
   cudf::test::fixed_width_column_wrapper<cudf::size_type> c1_offsets(c1_offset_iter,
                                                                      c1_offset_iter + num_rows + 1);
   cudf::test::fixed_width_column_wrapper<float> c1_floats(
@@ -192,7 +192,7 @@ TEST_F(ParquetReaderTest, UserBoundsWithNullsMixedTypes)
   constexpr int num_string_rows = num_rows * string_per_row;
   cudf::test::strings_column_wrapper string_col{string_iter, string_iter + num_string_rows};
   auto offset_iter = cudf::detail::make_counting_transform_iterator(
-    0, [string_per_row](cudf::size_type idx) { return idx * string_per_row; });
+    0, [](cudf::size_type idx) { return idx * string_per_row; });
   cudf::test::fixed_width_column_wrapper<cudf::size_type> offsets(offset_iter,
                                                                   offset_iter + num_rows + 1);
 
@@ -444,16 +444,18 @@ TEST_F(ParquetReaderTest, SelectNestedColumn)
 
   auto weights_col = cudf::test::fixed_width_column_wrapper<float>{1.1, 2.4, 5.3, 8.0, 9.6, 6.9};
 
-  auto ages_col =
-    cudf::test::fixed_width_column_wrapper<int32_t>{{48, 27, 25, 31, 351, 351}, {1, 1, 1, 1, 1, 0}};
+  auto ages_col = cudf::test::fixed_width_column_wrapper<int32_t>{
+    {48, 27, 25, 31, 351, 351}, {true, true, true, true, true, false}};
 
-  auto struct_1 = cudf::test::structs_column_wrapper{{weights_col, ages_col}, {1, 1, 1, 1, 0, 1}};
+  auto struct_1 = cudf::test::structs_column_wrapper{{weights_col, ages_col},
+                                                     {true, true, true, true, false, true}};
 
   auto is_human_col = cudf::test::fixed_width_column_wrapper<bool>{
-    {true, true, false, false, false, false}, {1, 1, 0, 1, 1, 0}};
+    {true, true, false, false, false, false}, {true, true, false, true, true, false}};
 
-  auto struct_2 =
-    cudf::test::structs_column_wrapper{{is_human_col, struct_1}, {0, 1, 1, 1, 1, 1}}.release();
+  auto struct_2 = cudf::test::structs_column_wrapper{{is_human_col, struct_1},
+                                                     {false, true, true, true, true, true}}
+                    .release();
 
   auto input = table_view({*struct_2});
 
@@ -477,10 +479,12 @@ TEST_F(ParquetReaderTest, SelectNestedColumn)
     auto const result = cudf::io::read_parquet(read_args);
 
     auto expect_ages_col = cudf::test::fixed_width_column_wrapper<int32_t>{
-      {48, 27, 25, 31, 351, 351}, {1, 1, 1, 1, 1, 0}};
-    auto expect_s_1 = cudf::test::structs_column_wrapper{{expect_ages_col}, {1, 1, 1, 1, 0, 1}};
+      {48, 27, 25, 31, 351, 351}, {true, true, true, true, true, false}};
+    auto expect_s_1 =
+      cudf::test::structs_column_wrapper{{expect_ages_col}, {true, true, true, true, false, true}};
     auto expect_s_2 =
-      cudf::test::structs_column_wrapper{{expect_s_1}, {0, 1, 1, 1, 1, 1}}.release();
+      cudf::test::structs_column_wrapper{{expect_s_1}, {false, true, true, true, true, true}}
+        .release();
     auto expected = table_view({*expect_s_2});
 
     cudf::io::table_input_metadata expected_metadata(expected);
@@ -502,13 +506,14 @@ TEST_F(ParquetReaderTest, SelectNestedColumn)
       cudf::test::fixed_width_column_wrapper<float>{1.1, 2.4, 5.3, 8.0, 9.6, 6.9};
 
     auto expected_ages_col = cudf::test::fixed_width_column_wrapper<int32_t>{
-      {48, 27, 25, 31, 351, 351}, {1, 1, 1, 1, 1, 0}};
+      {48, 27, 25, 31, 351, 351}, {true, true, true, true, true, false}};
 
     auto expected_s_1 = cudf::test::structs_column_wrapper{
-      {expected_weights_col, expected_ages_col}, {1, 1, 1, 1, 0, 1}};
+      {expected_weights_col, expected_ages_col}, {true, true, true, true, false, true}};
 
     auto expect_s_2 =
-      cudf::test::structs_column_wrapper{{expected_s_1}, {0, 1, 1, 1, 1, 1}}.release();
+      cudf::test::structs_column_wrapper{{expected_s_1}, {false, true, true, true, true, true}}
+        .release();
     auto expected = table_view({*expect_s_2});
 
     cudf::io::table_input_metadata expected_metadata(expected);
@@ -531,17 +536,17 @@ TEST_F(ParquetReaderTest, SelectNestedColumn)
       cudf::test::fixed_width_column_wrapper<float>{1.1, 2.4, 5.3, 8.0, 9.6, 6.9};
 
     auto expected_ages_col = cudf::test::fixed_width_column_wrapper<int32_t>{
-      {48, 27, 25, 31, 351, 351}, {1, 1, 1, 1, 1, 0}};
+      {48, 27, 25, 31, 351, 351}, {true, true, true, true, true, false}};
 
     auto expected_is_human_col = cudf::test::fixed_width_column_wrapper<bool>{
-      {true, true, false, false, false, false}, {1, 1, 0, 1, 1, 0}};
+      {true, true, false, false, false, false}, {true, true, false, true, true, false}};
 
     auto expect_s_1 = cudf::test::structs_column_wrapper{{expected_ages_col, expected_weights_col},
-                                                         {1, 1, 1, 1, 0, 1}};
+                                                         {true, true, true, true, false, true}};
 
-    auto expect_s_2 =
-      cudf::test::structs_column_wrapper{{expect_s_1, expected_is_human_col}, {0, 1, 1, 1, 1, 1}}
-        .release();
+    auto expect_s_2 = cudf::test::structs_column_wrapper{{expect_s_1, expected_is_human_col},
+                                                         {false, true, true, true, true, true}}
+                        .release();
 
     auto expected = table_view({*expect_s_2});
 
@@ -1980,7 +1985,8 @@ TEST_F(ParquetReaderTest, RepeatedNoAnnotations)
 
   column_wrapper<int32_t> col0{1, 2, 3, 4, 5, 6};
   column_wrapper<int64_t> child0{{5555555555l, 1111111111l, 1111111111l, 2222222222l, 3333333333l}};
-  cudf::test::strings_column_wrapper child1{{"-", "home", "home", "-", "mobile"}, {0, 1, 1, 0, 1}};
+  cudf::test::strings_column_wrapper child1{{"-", "home", "home", "-", "mobile"},
+                                            {false, true, true, false, true}};
   auto struct_col = cudf::test::structs_column_wrapper{{child0, child1}};
 
   auto list_offsets_column =
@@ -1996,8 +2002,8 @@ TEST_F(ParquetReaderTest, RepeatedNoAnnotations)
   std::vector<std::unique_ptr<cudf::column>> struct_children;
   struct_children.push_back(std::move(list_col));
 
-  auto outer_struct =
-    cudf::test::structs_column_wrapper{{std::move(struct_children)}, {0, 0, 1, 1, 1, 1}};
+  auto outer_struct = cudf::test::structs_column_wrapper{{std::move(struct_children)},
+                                                         {false, false, true, true, true, true}};
   table_view expected{{col0, outer_struct}};
 
   CUDF_TEST_EXPECT_TABLES_EQUAL(result.tbl->view(), expected);
diff --git a/cpp/tests/io/parquet_v2_test.cpp b/cpp/tests/io/parquet_v2_test.cpp
index 25d58a96512..f106fd5a487 100644
--- a/cpp/tests/io/parquet_v2_test.cpp
+++ b/cpp/tests/io/parquet_v2_test.cpp
@@ -317,9 +317,10 @@ TEST_P(ParquetV2Test, SlicedTable)
 
   // Struct column
   auto ages_col = cudf::test::fixed_width_column_wrapper<int32_t>{
-    {48, 27, 25, 31, 351, 351, 29, 15}, {1, 1, 1, 1, 1, 0, 1, 1}};
+    {48, 27, 25, 31, 351, 351, 29, 15}, {true, true, true, true, true, false, true, true}};
 
-  auto col5 = cudf::test::structs_column_wrapper{{ages_col}, {1, 1, 1, 1, 0, 1, 1, 1}};
+  auto col5 = cudf::test::structs_column_wrapper{{ages_col},
+                                                 {true, true, true, true, false, true, true, true}};
 
   // Struct/List mixed column
 
@@ -503,8 +504,8 @@ TEST_P(ParquetV2Test, StructOfList)
 
   auto weights_col = cudf::test::fixed_width_column_wrapper<float>{1.1, 2.4, 5.3, 8.0, 9.6, 6.9};
 
-  auto ages_col =
-    cudf::test::fixed_width_column_wrapper<int32_t>{{48, 27, 25, 31, 351, 351}, {1, 1, 1, 1, 1, 0}};
+  auto ages_col = cudf::test::fixed_width_column_wrapper<int32_t>{
+    {48, 27, 25, 31, 351, 351}, {true, true, true, true, true, false}};
 
   auto valids  = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; });
   auto valids2 = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 3; });
@@ -533,13 +534,14 @@ TEST_P(ParquetV2Test, StructOfList)
             lcw{lcw{}, lcw{}, lcw{}}};
 
   auto struct_1 = cudf::test::structs_column_wrapper{{weights_col, ages_col, land_unit, flats},
-                                                     {1, 1, 1, 1, 0, 1}};
+                                                     {true, true, true, true, false, true}};
 
   auto is_human_col = cudf::test::fixed_width_column_wrapper<bool>{
-    {true, true, false, false, false, false}, {1, 1, 0, 1, 1, 0}};
+    {true, true, false, false, false, false}, {true, true, false, true, true, false}};
 
-  auto struct_2 =
-    cudf::test::structs_column_wrapper{{is_human_col, struct_1}, {0, 1, 1, 1, 1, 1}}.release();
+  auto struct_2 = cudf::test::structs_column_wrapper{{is_human_col, struct_1},
+                                                     {false, true, true, true, true, true}}
+                    .release();
 
   auto expected = table_view({*struct_2});
 
@@ -580,16 +582,18 @@ TEST_P(ParquetV2Test, ListOfStruct)
 
   auto weight_col = cudf::test::fixed_width_column_wrapper<float>{1.1, 2.4, 5.3, 8.0, 9.6, 6.9};
 
-  auto ages_col =
-    cudf::test::fixed_width_column_wrapper<int32_t>{{48, 27, 25, 31, 351, 351}, {1, 1, 1, 1, 1, 0}};
+  auto ages_col = cudf::test::fixed_width_column_wrapper<int32_t>{
+    {48, 27, 25, 31, 351, 351}, {true, true, true, true, true, false}};
 
-  auto struct_1 = cudf::test::structs_column_wrapper{{weight_col, ages_col}, {1, 1, 1, 1, 0, 1}};
+  auto struct_1 = cudf::test::structs_column_wrapper{{weight_col, ages_col},
+                                                     {true, true, true, true, false, true}};
 
   auto is_human_col = cudf::test::fixed_width_column_wrapper<bool>{
-    {true, true, false, false, false, false}, {1, 1, 0, 1, 1, 0}};
+    {true, true, false, false, false, false}, {true, true, false, true, true, false}};
 
-  auto struct_2 =
-    cudf::test::structs_column_wrapper{{is_human_col, struct_1}, {0, 1, 1, 1, 1, 1}}.release();
+  auto struct_2 = cudf::test::structs_column_wrapper{{is_human_col, struct_1},
+                                                     {false, true, true, true, true, true}}
+                    .release();
 
   auto list_offsets_column =
     cudf::test::fixed_width_column_wrapper<cudf::size_type>{0, 2, 5, 5, 6}.release();
@@ -752,9 +756,8 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndex)
       auto const oi = read_offset_index(source, chunk);
 
       int64_t num_vals = 0;
-      for (size_t o = 0; o < oi.page_locations.size(); o++) {
-        auto const& page_loc = oi.page_locations[o];
-        auto const ph        = read_page_header(source, page_loc);
+      for (auto const& page_loc : oi.page_locations) {
+        auto const ph = read_page_header(source, page_loc);
         EXPECT_EQ(ph.type, expected_hdr_type);
         EXPECT_EQ(page_loc.first_row_index, num_vals);
         num_vals += is_v2 ? ph.data_page_header_v2.num_rows : ph.data_page_header.num_values;
@@ -779,8 +782,8 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndex)
         EXPECT_EQ(ci.null_counts.value()[p], 0);
         EXPECT_TRUE(compare_binary(stats.min_value.value(), ci.min_values[p], ptype, ctype) <= 0);
       }
-      for (size_t p = 0; p < ci.max_values.size(); p++)
-        EXPECT_TRUE(compare_binary(stats.max_value.value(), ci.max_values[p], ptype, ctype) >= 0);
+      for (auto const& max_value : ci.max_values)
+        EXPECT_TRUE(compare_binary(stats.max_value.value(), max_value, ptype, ctype) >= 0);
     }
   }
 }
@@ -857,9 +860,8 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls)
       auto const oi = read_offset_index(source, chunk);
 
       int64_t num_vals = 0;
-      for (size_t o = 0; o < oi.page_locations.size(); o++) {
-        auto const& page_loc = oi.page_locations[o];
-        auto const ph        = read_page_header(source, page_loc);
+      for (auto const& page_loc : oi.page_locations) {
+        auto const ph = read_page_header(source, page_loc);
         EXPECT_EQ(ph.type, expected_hdr_type);
         EXPECT_EQ(page_loc.first_row_index, num_vals);
         num_vals += is_v2 ? ph.data_page_header_v2.num_rows : ph.data_page_header.num_values;
@@ -889,8 +891,8 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNulls)
         }
         EXPECT_TRUE(compare_binary(stats.min_value.value(), ci.min_values[p], ptype, ctype) <= 0);
       }
-      for (size_t p = 0; p < ci.max_values.size(); p++) {
-        EXPECT_TRUE(compare_binary(stats.max_value.value(), ci.max_values[p], ptype, ctype) >= 0);
+      for (auto const& max_value : ci.max_values) {
+        EXPECT_TRUE(compare_binary(stats.max_value.value(), max_value, ptype, ctype) >= 0);
       }
     }
   }
@@ -953,9 +955,8 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexNullColumn)
       auto const oi = read_offset_index(source, chunk);
 
       int64_t num_vals = 0;
-      for (size_t o = 0; o < oi.page_locations.size(); o++) {
-        auto const& page_loc = oi.page_locations[o];
-        auto const ph        = read_page_header(source, page_loc);
+      for (auto const& page_loc : oi.page_locations) {
+        auto const ph = read_page_header(source, page_loc);
         EXPECT_EQ(ph.type, expected_hdr_type);
         EXPECT_EQ(page_loc.first_row_index, num_vals);
         num_vals += is_v2 ? ph.data_page_header_v2.num_rows : ph.data_page_header.num_values;
@@ -1055,9 +1056,8 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStruct)
       auto const oi = read_offset_index(source, chunk);
 
       int64_t num_vals = 0;
-      for (size_t o = 0; o < oi.page_locations.size(); o++) {
-        auto const& page_loc = oi.page_locations[o];
-        auto const ph        = read_page_header(source, page_loc);
+      for (auto const& page_loc : oi.page_locations) {
+        auto const ph = read_page_header(source, page_loc);
         EXPECT_EQ(ph.type, expected_hdr_type);
         EXPECT_EQ(page_loc.first_row_index, num_vals);
         // last column has 2 values per row
@@ -1075,11 +1075,11 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStruct)
 
       auto const ptype = fmd.schema[colidx].type;
       auto const ctype = fmd.schema[colidx].converted_type;
-      for (size_t p = 0; p < ci.min_values.size(); p++) {
-        EXPECT_TRUE(compare_binary(stats.min_value.value(), ci.min_values[p], ptype, ctype) <= 0);
+      for (auto const& min_value : ci.min_values) {
+        EXPECT_TRUE(compare_binary(stats.min_value.value(), min_value, ptype, ctype) <= 0);
       }
-      for (size_t p = 0; p < ci.max_values.size(); p++) {
-        EXPECT_TRUE(compare_binary(stats.max_value.value(), ci.max_values[p], ptype, ctype) >= 0);
+      for (auto const& max_value : ci.max_values) {
+        EXPECT_TRUE(compare_binary(stats.max_value.value(), max_value, ptype, ctype) >= 0);
       }
     }
   }
@@ -1141,8 +1141,7 @@ TEST_P(ParquetV2Test, CheckColumnOffsetIndexStructNulls)
   // col3 will have num_ordered_rows / 4 nulls total
   int const null_mods[] = {0, 2, 3, 4};
 
-  for (size_t r = 0; r < fmd.row_groups.size(); r++) {
-    auto const& rg = fmd.row_groups[r];
+  for (auto const& rg : fmd.row_groups) {
     for (size_t c = 0; c < rg.columns.size(); c++) {
       auto const& chunk = rg.columns[c];
 
@@ -1343,8 +1342,7 @@ TEST_P(ParquetV2Test, CheckColumnIndexListWithNulls)
 
   read_footer(source, &fmd);
 
-  for (size_t r = 0; r < fmd.row_groups.size(); r++) {
-    auto const& rg = fmd.row_groups[r];
+  for (auto const& rg : fmd.row_groups) {
     for (size_t c = 0; c < rg.columns.size(); c++) {
       auto const& chunk = rg.columns[c];
 
@@ -1371,9 +1369,8 @@ TEST_P(ParquetV2Test, CheckColumnIndexListWithNulls)
       // the first row index is correct
       auto const oi = read_offset_index(source, chunk);
 
-      for (size_t o = 0; o < oi.page_locations.size(); o++) {
-        auto const& page_loc = oi.page_locations[o];
-        auto const ph        = read_page_header(source, page_loc);
+      for (auto const& page_loc : oi.page_locations) {
+        auto const ph = read_page_header(source, page_loc);
         EXPECT_EQ(ph.type, expected_hdr_type);
         // check null counts in V2 header
         if (is_v2) { EXPECT_EQ(ph.data_page_header_v2.num_nulls, expected_null_counts[c]); }
diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp
index ad0860e265e..84ab83e33d0 100644
--- a/cpp/tests/io/parquet_writer_test.cpp
+++ b/cpp/tests/io/parquet_writer_test.cpp
@@ -134,7 +134,7 @@ TEST_F(ParquetWriterTest, MultiIndex)
     cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected)
       .metadata(expected_metadata)
       .key_value_metadata(
-        {{{"pandas", "\"index_columns\": [\"int8s\", \"int16s\"], \"column1\": [\"int32s\"]"}}});
+        {{{"pandas", R"("index_columns": ["int8s", "int16s"], "column1": ["int32s"])"}}});
   cudf::io::write_parquet(out_opts);
 
   cudf::io::parquet_reader_options in_opts =
@@ -242,16 +242,18 @@ TEST_F(ParquetWriterTest, Struct)
   // `Name` column has all valid values.
   auto names_col = cudf::test::strings_column_wrapper{names.begin(), names.end()};
 
-  auto ages_col =
-    cudf::test::fixed_width_column_wrapper<int32_t>{{48, 27, 25, 31, 351, 351}, {1, 1, 1, 1, 1, 0}};
+  auto ages_col = cudf::test::fixed_width_column_wrapper<int32_t>{
+    {48, 27, 25, 31, 351, 351}, {true, true, true, true, true, false}};
 
-  auto struct_1 = cudf::test::structs_column_wrapper{{names_col, ages_col}, {1, 1, 1, 1, 0, 1}};
+  auto struct_1 = cudf::test::structs_column_wrapper{{names_col, ages_col},
+                                                     {true, true, true, true, false, true}};
 
   auto is_human_col = cudf::test::fixed_width_column_wrapper<bool>{
-    {true, true, false, false, false, false}, {1, 1, 0, 1, 1, 0}};
+    {true, true, false, false, false, false}, {true, true, false, true, true, false}};
 
-  auto struct_2 =
-    cudf::test::structs_column_wrapper{{is_human_col, struct_1}, {0, 1, 1, 1, 1, 1}}.release();
+  auto struct_2 = cudf::test::structs_column_wrapper{{is_human_col, struct_1},
+                                                     {false, true, true, true, true, true}}
+                    .release();
 
   auto expected = table_view({*struct_2});
 
@@ -274,7 +276,7 @@ class custom_test_data_sink : public cudf::io::data_sink {
     CUDF_EXPECTS(outfile_.is_open(), "Cannot open output file");
   }
 
-  virtual ~custom_test_data_sink() { flush(); }
+  ~custom_test_data_sink() override { flush(); }
 
   void host_write(void const* data, size_t size) override
   {
@@ -1968,7 +1970,7 @@ class custom_test_memmap_sink : public cudf::io::data_sink {
     mm_writer = cudf::io::data_sink::create(mm_writer_buf);
   }
 
-  virtual ~custom_test_memmap_sink() { mm_writer->flush(); }
+  ~custom_test_memmap_sink() override { mm_writer->flush(); }
 
   void host_write(void const* data, size_t size) override { mm_writer->host_write(data, size); }
 
diff --git a/cpp/tests/join/distinct_join_tests.cpp b/cpp/tests/join/distinct_join_tests.cpp
index 698256251ef..05ae4ea1d04 100644
--- a/cpp/tests/join/distinct_join_tests.cpp
+++ b/cpp/tests/join/distinct_join_tests.cpp
@@ -148,12 +148,12 @@ TEST_F(DistinctJoinTest, InnerJoinNoNulls)
 TEST_F(DistinctJoinTest, InnerJoinWithNulls)
 {
   column_wrapper<int32_t> col0_0{{3, 1, 2, 0, 2}};
-  strcol_wrapper col0_1({"s1", "s1", "s0", "s4", "s0"}, {1, 1, 0, 1, 1});
+  strcol_wrapper col0_1({"s1", "s1", "s0", "s4", "s0"}, {true, true, false, true, true});
   column_wrapper<int32_t> col0_2{{1, 1, 2, 4, 1}};
 
   column_wrapper<int32_t> col1_0{{1, 2, 0, 2, 3}};
   strcol_wrapper col1_1({"s1", "s0", "s1", "s0", "s1"});
-  column_wrapper<int32_t> col1_2{{1, 1, 1, 1, 1}, {0, 1, 1, 0, 1}};
+  column_wrapper<int32_t> col1_2{{1, 1, 1, 1, 1}, {false, true, true, false, true}};
 
   CVector cols0, cols1;
   cols0.push_back(col0_0.release());
@@ -170,10 +170,10 @@ TEST_F(DistinctJoinTest, InnerJoinWithNulls)
   auto result        = distinct_join.inner_join();
 
   column_wrapper<int32_t> col_gold_0{{3, 2}};
-  strcol_wrapper col_gold_1({"s1", "s0"}, {1, 1});
+  strcol_wrapper col_gold_1({"s1", "s0"}, {true, true});
   column_wrapper<int32_t> col_gold_2{{1, 1}};
   column_wrapper<int32_t> col_gold_3{{3, 2}};
-  strcol_wrapper col_gold_4({"s1", "s0"}, {1, 1});
+  strcol_wrapper col_gold_4({"s1", "s0"}, {true, true});
   column_wrapper<int32_t> col_gold_5{{1, 1}};
   CVector cols_gold;
   cols_gold.push_back(col_gold_0.release());
@@ -190,21 +190,22 @@ TEST_F(DistinctJoinTest, InnerJoinWithNulls)
 TEST_F(DistinctJoinTest, InnerJoinWithStructsAndNulls)
 {
   column_wrapper<int32_t> col0_0{{3, 1, 2, 0, 2}};
-  strcol_wrapper col0_1({"s1", "s1", "s0", "s4", "s0"}, {1, 1, 0, 1, 1});
-  column_wrapper<int32_t> col0_2{{0, 1, 2, 4, 4}, {1, 1, 1, 1, 0}};
+  strcol_wrapper col0_1({"s1", "s1", "s0", "s4", "s0"}, {true, true, false, true, true});
+  column_wrapper<int32_t> col0_2{{0, 1, 2, 4, 4}, {true, true, true, true, false}};
   std::initializer_list<std::string> col0_names = {
     "Samuel Vimes", "Carrot Ironfoundersson", "Detritus", "Samuel Vimes", "Angua von Überwald"};
   auto col0_names_col = strcol_wrapper{col0_names.begin(), col0_names.end()};
   auto col0_ages_col  = column_wrapper<int32_t>{{48, 27, 351, 31, 25}};
 
-  auto col0_is_human_col = column_wrapper<bool>{{true, true, false, false, false}, {1, 1, 0, 1, 0}};
+  auto col0_is_human_col =
+    column_wrapper<bool>{{true, true, false, false, false}, {true, true, false, true, false}};
 
   auto col0_3 =
     cudf::test::structs_column_wrapper{{col0_names_col, col0_ages_col, col0_is_human_col}};
 
   column_wrapper<int32_t> col1_0{{2, 2, 0, 4, 3}};
   strcol_wrapper col1_1({"s1", "s0", "s1", "s2", "s1"});
-  column_wrapper<int32_t> col1_2{{1, 1, 1, 2, 0}, {1, 0, 1, 1, 1}};
+  column_wrapper<int32_t> col1_2{{1, 1, 1, 2, 0}, {true, false, true, true, true}};
   std::initializer_list<std::string> col1_names = {"Carrot Ironfoundersson",
                                                    "Angua von Überwald",
                                                    "Detritus",
@@ -213,7 +214,8 @@ TEST_F(DistinctJoinTest, InnerJoinWithStructsAndNulls)
   auto col1_names_col = strcol_wrapper{col1_names.begin(), col1_names.end()};
   auto col1_ages_col  = column_wrapper<int32_t>{{31, 25, 351, 27, 48}};
 
-  auto col1_is_human_col = column_wrapper<bool>{{true, false, false, false, true}, {1, 0, 0, 1, 1}};
+  auto col1_is_human_col =
+    column_wrapper<bool>{{true, false, false, false, true}, {true, false, false, true, true}};
 
   auto col1_3 =
     cudf::test::structs_column_wrapper{{col1_names_col, col1_ages_col, col1_is_human_col}};
@@ -235,23 +237,23 @@ TEST_F(DistinctJoinTest, InnerJoinWithStructsAndNulls)
   auto result        = distinct_join.inner_join();
 
   column_wrapper<int32_t> col_gold_0{{3, 2}};
-  strcol_wrapper col_gold_1({"s1", "s0"}, {1, 1});
-  column_wrapper<int32_t> col_gold_2{{0, 4}, {1, 0}};
+  strcol_wrapper col_gold_1({"s1", "s0"}, {true, true});
+  column_wrapper<int32_t> col_gold_2{{0, 4}, {true, false}};
   auto col_gold_3_names_col = strcol_wrapper{"Samuel Vimes", "Angua von Überwald"};
   auto col_gold_3_ages_col  = column_wrapper<int32_t>{{48, 25}};
 
-  auto col_gold_3_is_human_col = column_wrapper<bool>{{true, false}, {1, 0}};
+  auto col_gold_3_is_human_col = column_wrapper<bool>{{true, false}, {true, false}};
 
   auto col_gold_3 = cudf::test::structs_column_wrapper{
     {col_gold_3_names_col, col_gold_3_ages_col, col_gold_3_is_human_col}};
 
   column_wrapper<int32_t> col_gold_4{{3, 2}};
-  strcol_wrapper col_gold_5({"s1", "s0"}, {1, 1});
-  column_wrapper<int32_t> col_gold_6{{0, -1}, {1, 0}};
+  strcol_wrapper col_gold_5({"s1", "s0"}, {true, true});
+  column_wrapper<int32_t> col_gold_6{{0, -1}, {true, false}};
   auto col_gold_7_names_col = strcol_wrapper{"Samuel Vimes", "Angua von Überwald"};
   auto col_gold_7_ages_col  = column_wrapper<int32_t>{{48, 25}};
 
-  auto col_gold_7_is_human_col = column_wrapper<bool>{{true, false}, {1, 0}};
+  auto col_gold_7_is_human_col = column_wrapper<bool>{{true, false}, {true, false}};
 
   auto col_gold_7 = cudf::test::structs_column_wrapper{
     {col_gold_7_names_col, col_gold_7_ages_col, col_gold_7_is_human_col}};
@@ -275,7 +277,7 @@ TEST_F(DistinctJoinTest, EmptyBuildTableInnerJoin)
   column_wrapper<int32_t> col0_1;
 
   column_wrapper<int32_t> col1_0{{2, 2, 0, 4, 3}};
-  column_wrapper<int32_t> col1_1{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}};
+  column_wrapper<int32_t> col1_1{{1, 0, 1, 2, 1}, {true, false, true, true, true}};
 
   CVector cols0, cols1;
   cols0.push_back(col0_0.release());
@@ -298,7 +300,7 @@ TEST_F(DistinctJoinTest, EmptyBuildTableLeftJoin)
   column_wrapper<int32_t> col0_1;
 
   column_wrapper<int32_t> col1_0{{2, 2, 0, 4, 3}};
-  column_wrapper<int32_t> col1_1{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}};
+  column_wrapper<int32_t> col1_1{{1, 0, 1, 2, 1}, {true, false, true, true, true}};
 
   CVector cols0, cols1;
   cols0.push_back(col0_0.release());
@@ -320,7 +322,7 @@ TEST_F(DistinctJoinTest, EmptyBuildTableLeftJoin)
 TEST_F(DistinctJoinTest, EmptyProbeTableInnerJoin)
 {
   column_wrapper<int32_t> col0_0{{2, 2, 0, 4, 3}};
-  column_wrapper<int32_t> col0_1{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}};
+  column_wrapper<int32_t> col0_1{{1, 0, 1, 2, 1}, {true, false, true, true, true}};
 
   column_wrapper<int32_t> col1_0;
   column_wrapper<int32_t> col1_1;
@@ -343,7 +345,7 @@ TEST_F(DistinctJoinTest, EmptyProbeTableInnerJoin)
 TEST_F(DistinctJoinTest, EmptyProbeTableLeftJoin)
 {
   column_wrapper<int32_t> col0_0{{2, 2, 0, 4, 3}};
-  column_wrapper<int32_t> col0_1{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}};
+  column_wrapper<int32_t> col0_1{{1, 0, 1, 2, 1}, {true, false, true, true, true}};
 
   column_wrapper<int32_t> col1_0;
   column_wrapper<int32_t> col1_1;
@@ -384,8 +386,8 @@ TEST_F(DistinctJoinTest, LeftJoinNoNulls)
 
   column_wrapper<int32_t> col_gold_0({3, 1, 2, 0, 3});
   strcol_wrapper col_gold_1({"s0", "s1", "s2", "s4", "s1"});
-  column_wrapper<int32_t> col_gold_2{{-1, -1, -1, -1, 3}, {0, 0, 0, 0, 1}};
-  strcol_wrapper col_gold_3{{"", "", "", "", "s1"}, {0, 0, 0, 0, 1}};
+  column_wrapper<int32_t> col_gold_2{{-1, -1, -1, -1, 3}, {false, false, false, false, true}};
+  strcol_wrapper col_gold_3{{"", "", "", "", "s1"}, {false, false, false, false, true}};
   CVector cols_gold;
   cols_gold.push_back(col_gold_0.release());
   cols_gold.push_back(col_gold_1.release());
@@ -404,7 +406,7 @@ TEST_F(DistinctJoinTest, LeftJoinNoNulls)
 TEST_F(DistinctJoinTest, LeftJoinWithNulls)
 {
   column_wrapper<int32_t> col0_0{{3, 1, 2, 0, 2}};
-  strcol_wrapper col0_1({"s1", "s1", "", "s4", "s0"}, {1, 1, 0, 1, 1});
+  strcol_wrapper col0_1({"s1", "s1", "", "s4", "s0"}, {true, true, false, true, true});
 
   column_wrapper<int32_t> col1_0{{2, 2, 0, 4, 3}};
   strcol_wrapper col1_1({"s1", "s0", "s1", "s2", "s1"});
@@ -422,10 +424,10 @@ TEST_F(DistinctJoinTest, LeftJoinWithNulls)
   auto result        = distinct_join.left_join();
   auto gather_map    = std::pair{std::move(result), std::move(get_left_indices(result->size()))};
 
-  column_wrapper<int32_t> col_gold_0{{3, 1, 2, 0, 2}, {1, 1, 1, 1, 1}};
-  strcol_wrapper col_gold_1({"s1", "s1", "", "s4", "s0"}, {1, 1, 0, 1, 1});
-  column_wrapper<int32_t> col_gold_2{{3, -1, -1, -1, 2}, {1, 0, 0, 0, 1}};
-  strcol_wrapper col_gold_3{{"s1", "", "", "", "s0"}, {1, 0, 0, 0, 1}};
+  column_wrapper<int32_t> col_gold_0{{3, 1, 2, 0, 2}, {true, true, true, true, true}};
+  strcol_wrapper col_gold_1({"s1", "s1", "", "s4", "s0"}, {true, true, false, true, true});
+  column_wrapper<int32_t> col_gold_2{{3, -1, -1, -1, 2}, {true, false, false, false, true}};
+  strcol_wrapper col_gold_3{{"s1", "", "", "", "s0"}, {true, false, false, false, true}};
 
   CVector cols_gold;
   cols_gold.push_back(col_gold_0.release());
@@ -442,15 +444,17 @@ TEST_F(DistinctJoinTest, LeftJoinWithStructsAndNulls)
 {
   auto col0_names_col = strcol_wrapper{
     "Samuel Vimes", "Carrot Ironfoundersson", "Detritus", "Samuel Vimes", "Angua von Überwald"};
-  auto col0_ages_col     = column_wrapper<int32_t>{{48, 27, 351, 31, 25}};
-  auto col0_is_human_col = column_wrapper<bool>{{true, true, false, false, false}, {1, 1, 0, 1, 0}};
+  auto col0_ages_col = column_wrapper<int32_t>{{48, 27, 351, 31, 25}};
+  auto col0_is_human_col =
+    column_wrapper<bool>{{true, true, false, false, false}, {true, true, false, true, false}};
   auto col0 =
     cudf::test::structs_column_wrapper{{col0_names_col, col0_ages_col, col0_is_human_col}};
 
   auto col1_names_col = strcol_wrapper{
     "Samuel Vimes", "Detritus", "Detritus", "Carrot Ironfoundersson", "Angua von Überwald"};
-  auto col1_ages_col     = column_wrapper<int32_t>{{48, 35, 351, 22, 25}};
-  auto col1_is_human_col = column_wrapper<bool>{{true, true, false, false, true}, {1, 1, 0, 1, 1}};
+  auto col1_ages_col = column_wrapper<int32_t>{{48, 35, 351, 22, 25}};
+  auto col1_is_human_col =
+    column_wrapper<bool>{{true, true, false, false, true}, {true, true, false, true, true}};
   auto col1 =
     cudf::test::structs_column_wrapper{{col1_names_col, col1_ages_col, col1_is_human_col}};
 
@@ -469,7 +473,7 @@ TEST_F(DistinctJoinTest, LeftJoinWithStructsAndNulls)
     "Samuel Vimes", "Detritus", "Carrot Ironfoundersson", "Samuel Vimes", "Angua von Überwald"};
   auto col0_gold_ages_col = column_wrapper<int32_t>{{48, 351, 27, 31, 25}};
   auto col0_gold_is_human_col =
-    column_wrapper<bool>{{true, false, true, false, false}, {1, 0, 1, 1, 0}};
+    column_wrapper<bool>{{true, false, true, false, false}, {true, false, true, true, false}};
   auto col0_gold = cudf::test::structs_column_wrapper{
     {col0_gold_names_col, col0_gold_ages_col, col0_gold_is_human_col}};
 
@@ -480,12 +484,14 @@ TEST_F(DistinctJoinTest, LeftJoinWithStructsAndNulls)
                                               "",
                                               "",
                                             },
-                                            {1, 1, 0, 0, 0}};
-  auto col1_gold_ages_col  = column_wrapper<int32_t>{{48, 351, -1, -1, -1}, {1, 1, 0, 0, 0}};
+                                            {true, true, false, false, false}};
+  auto col1_gold_ages_col =
+    column_wrapper<int32_t>{{48, 351, -1, -1, -1}, {true, true, false, false, false}};
   auto col1_gold_is_human_col =
-    column_wrapper<bool>{{true, false, false, false, false}, {1, 0, 0, 0, 0}};
+    column_wrapper<bool>{{true, false, false, false, false}, {true, false, false, false, false}};
   auto col1_gold = cudf::test::structs_column_wrapper{
-    {col1_gold_names_col, col1_gold_ages_col, col1_gold_is_human_col}, {1, 1, 0, 0, 0}};
+    {col1_gold_names_col, col1_gold_ages_col, col1_gold_is_human_col},
+    {true, true, false, false, false}};
 
   CVector cols_gold;
   cols_gold.push_back(col0_gold.release());
diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp
index c35ad5319e4..4e88414d553 100644
--- a/cpp/tests/join/join_tests.cpp
+++ b/cpp/tests/join/join_tests.cpp
@@ -201,12 +201,14 @@ TEST_F(JoinTest, LeftJoinNoNullsWithNoCommon)
   auto result_sort_order = cudf::sorted_order(result->view());
   auto sorted_result     = cudf::gather(result->view(), *result_sort_order);
 
-  column_wrapper<int32_t> col_gold_0{{3, 1, 2, 2, 0, 3}, {1, 1, 1, 1, 1, 1}};
-  strcol_wrapper col_gold_1({"s0", "s1", "s2", "s2", "s4", "s1"}, {1, 1, 1, 1, 1, 1});
-  column_wrapper<int32_t> col_gold_2{{0, 1, 2, 2, 4, 1}, {1, 1, 1, 1, 1, 1}};
-  column_wrapper<int32_t> col_gold_3{{3, -1, 2, 2, 0, 3}, {1, 0, 1, 1, 1, 1}};
-  strcol_wrapper col_gold_4({"s1", "", "s1", "s0", "s1", "s1"}, {1, 0, 1, 1, 1, 1});
-  column_wrapper<int32_t> col_gold_5{{1, -1, 1, 0, 1, 1}, {1, 0, 1, 1, 1, 1}};
+  column_wrapper<int32_t> col_gold_0{{3, 1, 2, 2, 0, 3}, {true, true, true, true, true, true}};
+  strcol_wrapper col_gold_1({"s0", "s1", "s2", "s2", "s4", "s1"},
+                            {true, true, true, true, true, true});
+  column_wrapper<int32_t> col_gold_2{{0, 1, 2, 2, 4, 1}, {true, true, true, true, true, true}};
+  column_wrapper<int32_t> col_gold_3{{3, -1, 2, 2, 0, 3}, {true, false, true, true, true, true}};
+  strcol_wrapper col_gold_4({"s1", "", "s1", "s0", "s1", "s1"},
+                            {true, false, true, true, true, true});
+  column_wrapper<int32_t> col_gold_5{{1, -1, 1, 0, 1, 1}, {true, false, true, true, true, true}};
   CVector cols_gold;
   cols_gold.push_back(col_gold_0.release());
   cols_gold.push_back(col_gold_1.release());
@@ -247,14 +249,18 @@ TEST_F(JoinTest, FullJoinNoNulls)
   auto result_sort_order = cudf::sorted_order(result->view());
   auto sorted_result     = cudf::gather(result->view(), *result_sort_order);
 
-  column_wrapper<int32_t> col_gold_0{{3, 1, 2, 0, 3, -1, -1, -1, -1}, {1, 1, 1, 1, 1, 0, 0, 0, 0}};
+  column_wrapper<int32_t> col_gold_0{{3, 1, 2, 0, 3, -1, -1, -1, -1},
+                                     {true, true, true, true, true, false, false, false, false}};
   strcol_wrapper col_gold_1({"s0", "s1", "s2", "s4", "s1", "", "", "", ""},
-                            {1, 1, 1, 1, 1, 0, 0, 0, 0});
-  column_wrapper<int32_t> col_gold_2{{0, 1, 2, 4, 1, -1, -1, -1, -1}, {1, 1, 1, 1, 1, 0, 0, 0, 0}};
-  column_wrapper<int32_t> col_gold_3{{-1, -1, -1, -1, 3, 2, 2, 0, 4}, {0, 0, 0, 0, 1, 1, 1, 1, 1}};
+                            {true, true, true, true, true, false, false, false, false});
+  column_wrapper<int32_t> col_gold_2{{0, 1, 2, 4, 1, -1, -1, -1, -1},
+                                     {true, true, true, true, true, false, false, false, false}};
+  column_wrapper<int32_t> col_gold_3{{-1, -1, -1, -1, 3, 2, 2, 0, 4},
+                                     {false, false, false, false, true, true, true, true, true}};
   strcol_wrapper col_gold_4({"", "", "", "", "s1", "s1", "s0", "s1", "s2"},
-                            {0, 0, 0, 0, 1, 1, 1, 1, 1});
-  column_wrapper<int32_t> col_gold_5{{-1, -1, -1, -1, 1, 1, 0, 1, 2}, {0, 0, 0, 0, 1, 1, 1, 1, 1}};
+                            {false, false, false, false, true, true, true, true, true});
+  column_wrapper<int32_t> col_gold_5{{-1, -1, -1, -1, 1, 1, 0, 1, 2},
+                                     {false, false, false, false, true, true, true, true, true}};
 
   CVector cols_gold;
   cols_gold.push_back(col_gold_0.release());
@@ -277,7 +283,7 @@ TEST_F(JoinTest, FullJoinWithNulls)
   strcol_wrapper col0_1({"s0", "s1", "s2", "s4", "s1"});
   column_wrapper<int32_t> col0_2{{0, 1, 2, 4, 1}};
 
-  column_wrapper<int32_t> col1_0{{2, 2, 0, 4, 3}, {1, 1, 1, 0, 1}};
+  column_wrapper<int32_t> col1_0{{2, 2, 0, 4, 3}, {true, true, true, false, true}};
   strcol_wrapper col1_1{{"s1", "s0", "s1", "s2", "s1"}};
   column_wrapper<int32_t> col1_2{{1, 0, 1, 2, 1}};
 
@@ -296,14 +302,18 @@ TEST_F(JoinTest, FullJoinWithNulls)
   auto result_sort_order = cudf::sorted_order(result->view());
   auto sorted_result     = cudf::gather(result->view(), *result_sort_order);
 
-  column_wrapper<int32_t> col_gold_0{{3, 1, 2, 0, 3, -1, -1, -1, -1}, {1, 1, 1, 1, 1, 0, 0, 0, 0}};
+  column_wrapper<int32_t> col_gold_0{{3, 1, 2, 0, 3, -1, -1, -1, -1},
+                                     {true, true, true, true, true, false, false, false, false}};
   strcol_wrapper col_gold_1({"s0", "s1", "s2", "s4", "s1", "", "", "", ""},
-                            {1, 1, 1, 1, 1, 0, 0, 0, 0});
-  column_wrapper<int32_t> col_gold_2{{0, 1, 2, 4, 1, -1, -1, -1, -1}, {1, 1, 1, 1, 1, 0, 0, 0, 0}};
-  column_wrapper<int32_t> col_gold_3{{-1, -1, -1, -1, 3, 2, 2, 0, 4}, {0, 0, 0, 0, 1, 1, 1, 1, 0}};
+                            {true, true, true, true, true, false, false, false, false});
+  column_wrapper<int32_t> col_gold_2{{0, 1, 2, 4, 1, -1, -1, -1, -1},
+                                     {true, true, true, true, true, false, false, false, false}};
+  column_wrapper<int32_t> col_gold_3{{-1, -1, -1, -1, 3, 2, 2, 0, 4},
+                                     {false, false, false, false, true, true, true, true, false}};
   strcol_wrapper col_gold_4({"", "", "", "", "s1", "s1", "s0", "s1", "s2"},
-                            {0, 0, 0, 0, 1, 1, 1, 1, 1});
-  column_wrapper<int32_t> col_gold_5{{-1, -1, -1, -1, 1, 1, 0, 1, 2}, {0, 0, 0, 0, 1, 1, 1, 1, 1}};
+                            {false, false, false, false, true, true, true, true, true});
+  column_wrapper<int32_t> col_gold_5{{-1, -1, -1, -1, 1, 1, 0, 1, 2},
+                                     {false, false, false, false, true, true, true, true, true}};
 
   CVector cols_gold;
   cols_gold.push_back(col_gold_0.release());
@@ -324,12 +334,12 @@ TEST_F(JoinTest, FullJoinOnNulls)
 {
   // clang-format off
   column_wrapper<int32_t> col0_0{{  3,    1 },
-                                 {  1,    0  }};
+                                 {  true,    false  }};
   strcol_wrapper          col0_1({"s0", "s1" });
   column_wrapper<int32_t> col0_2{{  0,    1 }};
 
   column_wrapper<int32_t> col1_0{{  2,    5,    3,    7 },
-                                 {  1,    1,    1,    0 }};
+                                 {  true,    true,    true,    false }};
   strcol_wrapper          col1_1({"s1", "s0", "s0", "s1" });
   column_wrapper<int32_t> col1_2{{  1,    4,    2,    8 }};
 
@@ -349,13 +359,13 @@ TEST_F(JoinTest, FullJoinOnNulls)
   auto sorted_result     = cudf::gather(result->view(), *result_sort_order);
 
   column_wrapper<int32_t> col_gold_0{{   3,   -1,   -1,    -1},
-                                     {   1,    0,    0,     0}};
+                                     {   true,    false,    false,     false}};
   strcol_wrapper          col_gold_1{{ "s0", "s1",  "",    ""},
-                                     {   1,    1,    0,     0}};
+                                     {   true,    true,    false,     false}};
   column_wrapper<int32_t> col_gold_2{{   0,    1,   -1,    -1},
-                                     {   1,    1,    0,     0}};
+                                     {   true,    true,    false,     false}};
   column_wrapper<int32_t> col_gold_3{{   3,   -1,    2,     5},
-                                     {   1,    0,    1,     1}};
+                                     {   true,    false,    true,     true}};
   strcol_wrapper          col_gold_4{{ "s0", "s1", "s1",  "s0"}};
   column_wrapper<int32_t> col_gold_5{{   2,    8,    1,     4}};
 
@@ -382,17 +392,17 @@ TEST_F(JoinTest, FullJoinOnNulls)
   sorted_result     = cudf::gather(result->view(), *result_sort_order);
 
   col_gold_0 =               {{   3,   -1,   -1,    -1,   -1},
-                              {   1,    0,    0,     0,    0}};
+                              {   true,    false,    false,     false,    false}};
   col_gold_1 = strcol_wrapper{{ "s0", "s1",   "",    "",   ""},
-                              {   1,    1,    0,     0,    0}};
+                              {   true,    true,    false,     false,    false}};
   col_gold_2 =               {{   0,    1,   -1,    -1,   -1},
-                              {   1,    1,    0,     0,    0}};
+                              {   true,    true,    false,     false,    false}};
   col_gold_3 =               {{   3,   -1,    2,     5,   -1},
-                              {   1,    0,    1,     1,    0}};
+                              {   true,    false,    true,     true,    false}};
   col_gold_4 = strcol_wrapper{{ "s0",  "",  "s1",  "s0",  "s1"},
-                              {   1,    0,    1,     1,    1}};
+                              {   true,    false,    true,     true,    true}};
   col_gold_5 =               {{   2,   -1,    1,     4,    8},
-                              {   1,    0,    1,     1,    1}};
+                              {   true,    false,    true,     true,    true}};
 
   // clang-format on
 
@@ -440,9 +450,9 @@ TEST_F(JoinTest, LeftJoinNoNulls)
   column_wrapper<int32_t> col_gold_0({3, 1, 2, 0, 3});
   strcol_wrapper col_gold_1({"s0", "s1", "s2", "s4", "s1"});
   column_wrapper<int32_t> col_gold_2({0, 1, 2, 4, 1});
-  column_wrapper<int32_t> col_gold_3{{-1, -1, -1, -1, 3}, {0, 0, 0, 0, 1}};
-  strcol_wrapper col_gold_4{{"", "", "", "", "s1"}, {0, 0, 0, 0, 1}};
-  column_wrapper<int32_t> col_gold_5{{-1, -1, -1, -1, 1}, {0, 0, 0, 0, 1}};
+  column_wrapper<int32_t> col_gold_3{{-1, -1, -1, -1, 3}, {false, false, false, false, true}};
+  strcol_wrapper col_gold_4{{"", "", "", "", "s1"}, {false, false, false, false, true}};
+  column_wrapper<int32_t> col_gold_5{{-1, -1, -1, -1, 1}, {false, false, false, false, true}};
   CVector cols_gold;
   cols_gold.push_back(col_gold_0.release());
   cols_gold.push_back(col_gold_1.release());
@@ -460,12 +470,12 @@ TEST_F(JoinTest, LeftJoinNoNulls)
 TEST_F(JoinTest, LeftJoinWithNulls)
 {
   column_wrapper<int32_t> col0_0{{3, 1, 2, 0, 2}};
-  strcol_wrapper col0_1({"s1", "s1", "", "s4", "s0"}, {1, 1, 0, 1, 1});
+  strcol_wrapper col0_1({"s1", "s1", "", "s4", "s0"}, {true, true, false, true, true});
   column_wrapper<int32_t> col0_2{{0, 1, 2, 4, 1}};
 
   column_wrapper<int32_t> col1_0{{2, 2, 0, 4, 3}};
   strcol_wrapper col1_1({"s1", "s0", "s1", "s2", "s1"});
-  column_wrapper<int32_t> col1_2{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}};
+  column_wrapper<int32_t> col1_2{{1, 0, 1, 2, 1}, {true, false, true, true, true}};
 
   CVector cols0, cols1;
   cols0.push_back(col0_0.release());
@@ -482,12 +492,12 @@ TEST_F(JoinTest, LeftJoinWithNulls)
   auto result_sort_order = cudf::sorted_order(result->view());
   auto sorted_result     = cudf::gather(result->view(), *result_sort_order);
 
-  column_wrapper<int32_t> col_gold_0{{3, 1, 2, 0, 2}, {1, 1, 1, 1, 1}};
-  strcol_wrapper col_gold_1({"s1", "s1", "", "s4", "s0"}, {1, 1, 0, 1, 1});
-  column_wrapper<int32_t> col_gold_2{{0, 1, 2, 4, 1}, {1, 1, 1, 1, 1}};
-  column_wrapper<int32_t> col_gold_3{{3, -1, -1, -1, 2}, {1, 0, 0, 0, 1}};
-  strcol_wrapper col_gold_4{{"s1", "", "", "", "s0"}, {1, 0, 0, 0, 1}};
-  column_wrapper<int32_t> col_gold_5{{1, -1, -1, -1, -1}, {1, 0, 0, 0, 0}};
+  column_wrapper<int32_t> col_gold_0{{3, 1, 2, 0, 2}, {true, true, true, true, true}};
+  strcol_wrapper col_gold_1({"s1", "s1", "", "s4", "s0"}, {true, true, false, true, true});
+  column_wrapper<int32_t> col_gold_2{{0, 1, 2, 4, 1}, {true, true, true, true, true}};
+  column_wrapper<int32_t> col_gold_3{{3, -1, -1, -1, 2}, {true, false, false, false, true}};
+  strcol_wrapper col_gold_4{{"s1", "", "", "", "s0"}, {true, false, false, false, true}};
+  column_wrapper<int32_t> col_gold_5{{1, -1, -1, -1, -1}, {true, false, false, false, false}};
 
   CVector cols_gold;
   cols_gold.push_back(col_gold_0.release());
@@ -506,25 +516,27 @@ TEST_F(JoinTest, LeftJoinWithNulls)
 TEST_F(JoinTest, LeftJoinWithStructsAndNulls)
 {
   column_wrapper<int32_t> col0_0{{3, 1, 2, 0, 2}};
-  strcol_wrapper col0_1({"s1", "s1", "", "s4", "s0"}, {1, 1, 0, 1, 1});
+  strcol_wrapper col0_1({"s1", "s1", "", "s4", "s0"}, {true, true, false, true, true});
   column_wrapper<int32_t> col0_2{{0, 1, 2, 4, 1}};
   auto col0_names_col = strcol_wrapper{
     "Samuel Vimes", "Carrot Ironfoundersson", "Detritus", "Samuel Vimes", "Angua von Überwald"};
   auto col0_ages_col = column_wrapper<int32_t>{{48, 27, 351, 31, 25}};
 
-  auto col0_is_human_col = column_wrapper<bool>{{true, true, false, false, false}, {1, 1, 0, 1, 0}};
+  auto col0_is_human_col =
+    column_wrapper<bool>{{true, true, false, false, false}, {true, true, false, true, false}};
 
   auto col0_3 =
     cudf::test::structs_column_wrapper{{col0_names_col, col0_ages_col, col0_is_human_col}};
 
   column_wrapper<int32_t> col1_0{{2, 2, 0, 4, 3}};
   strcol_wrapper col1_1({"s1", "s0", "s1", "s2", "s1"});
-  column_wrapper<int32_t> col1_2{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}};
+  column_wrapper<int32_t> col1_2{{1, 0, 1, 2, 1}, {true, false, true, true, true}};
   auto col1_names_col = strcol_wrapper{
     "Samuel Vimes", "Detritus", "Detritus", "Carrot Ironfoundersson", "Angua von Überwald"};
   auto col1_ages_col = column_wrapper<int32_t>{{48, 35, 351, 22, 25}};
 
-  auto col1_is_human_col = column_wrapper<bool>{{true, true, false, false, true}, {1, 1, 0, 1, 1}};
+  auto col1_is_human_col =
+    column_wrapper<bool>{{true, true, false, false, true}, {true, true, false, true, true}};
 
   auto col1_3 =
     cudf::test::structs_column_wrapper{{col1_names_col, col1_ages_col, col1_is_human_col}};
@@ -546,22 +558,22 @@ TEST_F(JoinTest, LeftJoinWithStructsAndNulls)
   auto result_sort_order = cudf::sorted_order(result->view());
   auto sorted_result     = cudf::gather(result->view(), *result_sort_order);
 
-  column_wrapper<int32_t> col_gold_0{{3, 2, 1, 0, 2}, {1, 1, 1, 1, 1}};
-  strcol_wrapper col_gold_1({"s1", "", "s1", "s4", "s0"}, {1, 0, 1, 1, 1});
-  column_wrapper<int32_t> col_gold_2{{0, 2, 1, 4, 1}, {1, 1, 1, 1, 1}};
+  column_wrapper<int32_t> col_gold_0{{3, 2, 1, 0, 2}, {true, true, true, true, true}};
+  strcol_wrapper col_gold_1({"s1", "", "s1", "s4", "s0"}, {true, false, true, true, true});
+  column_wrapper<int32_t> col_gold_2{{0, 2, 1, 4, 1}, {true, true, true, true, true}};
   auto col0_gold_names_col = strcol_wrapper{
     "Samuel Vimes", "Detritus", "Carrot Ironfoundersson", "Samuel Vimes", "Angua von Überwald"};
   auto col0_gold_ages_col = column_wrapper<int32_t>{{48, 351, 27, 31, 25}};
 
   auto col0_gold_is_human_col =
-    column_wrapper<bool>{{true, false, true, false, false}, {1, 0, 1, 1, 0}};
+    column_wrapper<bool>{{true, false, true, false, false}, {true, false, true, true, false}};
 
   auto col_gold_3 = cudf::test::structs_column_wrapper{
     {col0_gold_names_col, col0_gold_ages_col, col0_gold_is_human_col}};
 
-  column_wrapper<int32_t> col_gold_4{{2, 0, -1, -1, -1}, {1, 1, 0, 0, 0}};
-  strcol_wrapper col_gold_5{{"s1", "s1", "", "", ""}, {1, 1, 0, 0, 0}};
-  column_wrapper<int32_t> col_gold_6{{1, 1, -1, -1, -1}, {1, 1, 0, 0, 0}};
+  column_wrapper<int32_t> col_gold_4{{2, 0, -1, -1, -1}, {true, true, false, false, false}};
+  strcol_wrapper col_gold_5{{"s1", "s1", "", "", ""}, {true, true, false, false, false}};
+  column_wrapper<int32_t> col_gold_6{{1, 1, -1, -1, -1}, {true, true, false, false, false}};
   auto col1_gold_names_col = strcol_wrapper{{
                                               "Samuel Vimes",
                                               "Detritus",
@@ -569,14 +581,16 @@ TEST_F(JoinTest, LeftJoinWithStructsAndNulls)
                                               "",
                                               "",
                                             },
-                                            {1, 1, 0, 0, 0}};
-  auto col1_gold_ages_col  = column_wrapper<int32_t>{{48, 351, -1, -1, -1}, {1, 1, 0, 0, 0}};
+                                            {true, true, false, false, false}};
+  auto col1_gold_ages_col =
+    column_wrapper<int32_t>{{48, 351, -1, -1, -1}, {true, true, false, false, false}};
 
   auto col1_gold_is_human_col =
-    column_wrapper<bool>{{true, false, false, false, false}, {1, 0, 0, 0, 0}};
+    column_wrapper<bool>{{true, false, false, false, false}, {true, false, false, false, false}};
 
   auto col_gold_7 = cudf::test::structs_column_wrapper{
-    {col1_gold_names_col, col1_gold_ages_col, col1_gold_is_human_col}, {1, 1, 0, 0, 0}};
+    {col1_gold_names_col, col1_gold_ages_col, col1_gold_is_human_col},
+    {true, true, false, false, false}};
 
   CVector cols_gold;
   cols_gold.push_back(col_gold_0.release());
@@ -598,12 +612,12 @@ TEST_F(JoinTest, LeftJoinOnNulls)
 {
   // clang-format off
   column_wrapper<int32_t> col0_0{{  3,    1,    2},
-                                 {  1,    0,    1}};
+                                 {  true,    false,    true}};
   strcol_wrapper          col0_1({"s0", "s1", "s2" });
   column_wrapper<int32_t> col0_2{{  0,    1,    2 }};
 
   column_wrapper<int32_t> col1_0{{  2,    5,    3,    7 },
-                                 {  1,    1,    1,    0 }};
+                                 {  true,    true,    true,    false }};
   strcol_wrapper          col1_1({"s1", "s0", "s0", "s1" });
   column_wrapper<int32_t> col1_2{{  1,    4,    2,    8 }};
 
@@ -623,17 +637,17 @@ TEST_F(JoinTest, LeftJoinOnNulls)
   auto sorted_result     = cudf::gather(result->view(), *result_sort_order);
 
   column_wrapper<int32_t> col_gold_0{{   3,    -1,    2},
-                                     {   1,     0,    1}};
+                                     {   true,     false,    true}};
   strcol_wrapper          col_gold_1({ "s0",  "s1", "s2"},
-                                     {   1,     1,    1});
+                                     {   true,     true,    true});
   column_wrapper<int32_t> col_gold_2{{   0,     1,    2},
-                                     {   1,     1,    1}};
+                                     {   true,     true,    true}};
   column_wrapper<int32_t> col_gold_3{{   3,    -1,   -1},
-                                     {   1,     0,    0}};
+                                     {   true,     false,    false}};
   strcol_wrapper          col_gold_4({ "s0",  "s1",  ""},
-                                     {   1,     1,    0});
+                                     {   true,     true,    false});
   column_wrapper<int32_t> col_gold_5{{   2,     8,   -1},
-                                     {   1,     1,    0}};
+                                     {   true,     true,    false}};
 
   CVector cols_gold;
   cols_gold.push_back(col_gold_0.release());
@@ -658,17 +672,17 @@ TEST_F(JoinTest, LeftJoinOnNulls)
 
 
   col_gold_0 = {{   3,    -1,    2},
-                {   1,     0,    1}};
+                {   true,     false,    true}};
   col_gold_1 = {{ "s0",  "s1", "s2"},
-                {   1,     1,    1}};
+                {   true,     true,    true}};
   col_gold_2 = {{   0,     1,    2},
-                {   1,     1,    1}};
+                {   true,     true,    true}};
   col_gold_3 = {{   3,    -1,   -1},
-                {   1,     0,    0}};
+                {   true,     false,    false}};
   col_gold_4 = {{ "s0",   "",   ""},
-                {   1,     0,    0}};
+                {   true,     false,    false}};
   col_gold_5 = {{   2,    -1,   -1},
-                {   1,     0,    0}};
+                {   true,     false,    false}};
 
   // clang-format on
   CVector cols_gold_nulls_unequal;
@@ -732,12 +746,12 @@ TEST_F(JoinTest, InnerJoinNoNulls)
 TEST_F(JoinTest, InnerJoinWithNulls)
 {
   column_wrapper<int32_t> col0_0{{3, 1, 2, 0, 2}};
-  strcol_wrapper col0_1({"s1", "s1", "s0", "s4", "s0"}, {1, 1, 0, 1, 1});
+  strcol_wrapper col0_1({"s1", "s1", "s0", "s4", "s0"}, {true, true, false, true, true});
   column_wrapper<int32_t> col0_2{{0, 1, 2, 4, 1}};
 
   column_wrapper<int32_t> col1_0{{2, 2, 0, 4, 3}};
   strcol_wrapper col1_1({"s1", "s0", "s1", "s2", "s1"});
-  column_wrapper<int32_t> col1_2{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}};
+  column_wrapper<int32_t> col1_2{{1, 0, 1, 2, 1}, {true, false, true, true, true}};
 
   CVector cols0, cols1;
   cols0.push_back(col0_0.release());
@@ -755,11 +769,11 @@ TEST_F(JoinTest, InnerJoinWithNulls)
   auto sorted_result     = cudf::gather(result->view(), *result_sort_order);
 
   column_wrapper<int32_t> col_gold_0{{3, 2}};
-  strcol_wrapper col_gold_1({"s1", "s0"}, {1, 1});
+  strcol_wrapper col_gold_1({"s1", "s0"}, {true, true});
   column_wrapper<int32_t> col_gold_2{{0, 1}};
   column_wrapper<int32_t> col_gold_3{{3, 2}};
-  strcol_wrapper col_gold_4({"s1", "s0"}, {1, 1});
-  column_wrapper<int32_t> col_gold_5{{1, -1}, {1, 0}};
+  strcol_wrapper col_gold_4({"s1", "s0"}, {true, true});
+  column_wrapper<int32_t> col_gold_5{{1, -1}, {true, false}};
   CVector cols_gold;
   cols_gold.push_back(col_gold_0.release());
   cols_gold.push_back(col_gold_1.release());
@@ -777,21 +791,22 @@ TEST_F(JoinTest, InnerJoinWithNulls)
 TEST_F(JoinTest, InnerJoinWithStructsAndNulls)
 {
   column_wrapper<int32_t> col0_0{{3, 1, 2, 0, 2}};
-  strcol_wrapper col0_1({"s1", "s1", "s0", "s4", "s0"}, {1, 1, 0, 1, 1});
+  strcol_wrapper col0_1({"s1", "s1", "s0", "s4", "s0"}, {true, true, false, true, true});
   column_wrapper<int32_t> col0_2{{0, 1, 2, 4, 1}};
   std::initializer_list<std::string> col0_names = {
     "Samuel Vimes", "Carrot Ironfoundersson", "Detritus", "Samuel Vimes", "Angua von Überwald"};
   auto col0_names_col = strcol_wrapper{col0_names.begin(), col0_names.end()};
   auto col0_ages_col  = column_wrapper<int32_t>{{48, 27, 351, 31, 25}};
 
-  auto col0_is_human_col = column_wrapper<bool>{{true, true, false, false, false}, {1, 1, 0, 1, 0}};
+  auto col0_is_human_col =
+    column_wrapper<bool>{{true, true, false, false, false}, {true, true, false, true, false}};
 
   auto col0_3 =
     cudf::test::structs_column_wrapper{{col0_names_col, col0_ages_col, col0_is_human_col}};
 
   column_wrapper<int32_t> col1_0{{2, 2, 0, 4, 3}};
   strcol_wrapper col1_1({"s1", "s0", "s1", "s2", "s1"});
-  column_wrapper<int32_t> col1_2{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}};
+  column_wrapper<int32_t> col1_2{{1, 0, 1, 2, 1}, {true, false, true, true, true}};
   std::initializer_list<std::string> col1_names = {"Carrot Ironfoundersson",
                                                    "Angua von Überwald",
                                                    "Detritus",
@@ -800,7 +815,8 @@ TEST_F(JoinTest, InnerJoinWithStructsAndNulls)
   auto col1_names_col = strcol_wrapper{col1_names.begin(), col1_names.end()};
   auto col1_ages_col  = column_wrapper<int32_t>{{351, 25, 27, 31, 48}};
 
-  auto col1_is_human_col = column_wrapper<bool>{{true, false, false, false, true}, {1, 0, 0, 1, 1}};
+  auto col1_is_human_col =
+    column_wrapper<bool>{{true, false, false, false, true}, {true, false, false, true, true}};
 
   auto col1_3 =
     cudf::test::structs_column_wrapper{{col1_names_col, col1_ages_col, col1_is_human_col}};
@@ -823,23 +839,23 @@ TEST_F(JoinTest, InnerJoinWithStructsAndNulls)
   auto sorted_result     = cudf::gather(result->view(), *result_sort_order);
 
   column_wrapper<int32_t> col_gold_0{{3, 2}};
-  strcol_wrapper col_gold_1({"s1", "s0"}, {1, 1});
+  strcol_wrapper col_gold_1({"s1", "s0"}, {true, true});
   column_wrapper<int32_t> col_gold_2{{0, 1}};
   auto col_gold_3_names_col = strcol_wrapper{"Samuel Vimes", "Angua von Überwald"};
   auto col_gold_3_ages_col  = column_wrapper<int32_t>{{48, 25}};
 
-  auto col_gold_3_is_human_col = column_wrapper<bool>{{true, false}, {1, 0}};
+  auto col_gold_3_is_human_col = column_wrapper<bool>{{true, false}, {true, false}};
 
   auto col_gold_3 = cudf::test::structs_column_wrapper{
     {col_gold_3_names_col, col_gold_3_ages_col, col_gold_3_is_human_col}};
 
   column_wrapper<int32_t> col_gold_4{{3, 2}};
-  strcol_wrapper col_gold_5({"s1", "s0"}, {1, 1});
-  column_wrapper<int32_t> col_gold_6{{1, -1}, {1, 0}};
+  strcol_wrapper col_gold_5({"s1", "s0"}, {true, true});
+  column_wrapper<int32_t> col_gold_6{{1, -1}, {true, false}};
   auto col_gold_7_names_col = strcol_wrapper{"Samuel Vimes", "Angua von Überwald"};
   auto col_gold_7_ages_col  = column_wrapper<int32_t>{{48, 25}};
 
-  auto col_gold_7_is_human_col = column_wrapper<bool>{{true, false}, {1, 0}};
+  auto col_gold_7_is_human_col = column_wrapper<bool>{{true, false}, {true, false}};
 
   auto col_gold_7 = cudf::test::structs_column_wrapper{
     {col_gold_7_names_col, col_gold_7_ages_col, col_gold_7_is_human_col}};
@@ -865,12 +881,12 @@ TEST_F(JoinTest, InnerJoinOnNulls)
   // clang-format off
   column_wrapper<int32_t> col0_0{{  3,    1,    2,    0,    2}};
   strcol_wrapper          col0_1({"s1", "s1", "s8", "s4", "s0"},
-                                 {  1,    1,    0,    1,    1});
+                                 {  true,    true,    false,    true,    true});
   column_wrapper<int32_t> col0_2{{  0,    1,    2,    4,    1}};
 
   column_wrapper<int32_t> col1_0{{  2,    2,    0,    4,    3}};
   strcol_wrapper          col1_1({"s1", "s0", "s1", "s2", "s1"},
-                                 {  1,    0,    1,    1,    1});
+                                 {  true,    false,    true,    true,    true});
   column_wrapper<int32_t> col1_2{{  1,    0,    1,    2,    1}};
 
   CVector cols0, cols1;
@@ -890,11 +906,11 @@ TEST_F(JoinTest, InnerJoinOnNulls)
 
   column_wrapper<int32_t> col_gold_0 {{  3,    2}};
   strcol_wrapper          col_gold_1 ({"s1", "s0"},
-                                      {  1,    0});
+                                      {  true,    false});
   column_wrapper<int32_t> col_gold_2{{   0,    2}};
   column_wrapper<int32_t> col_gold_3 {{  3,    2}};
   strcol_wrapper          col_gold_4 ({"s1", "s0"},
-                                      {  1,    0});
+                                      {  true,    false});
   column_wrapper<int32_t> col_gold_5{{   1,    0}};
   CVector cols_gold;
   cols_gold.push_back(col_gold_0.release());
@@ -919,11 +935,11 @@ TEST_F(JoinTest, InnerJoinOnNulls)
 
   col_gold_0 =               {{  3}};
   col_gold_1 = strcol_wrapper({"s1"},
-                              {  1});
+                              {  true});
   col_gold_2 =               {{  0}};
   col_gold_3 =               {{  3}};
   col_gold_4 = strcol_wrapper({"s1"},
-                              {  1});
+                              {  true});
   col_gold_5 =               {{  1}};
 
   // clang-format on
@@ -949,7 +965,7 @@ TEST_F(JoinTest, EmptyLeftTableInnerJoin)
   column_wrapper<int32_t> col0_1;
 
   column_wrapper<int32_t> col1_0{{2, 2, 0, 4, 3}};
-  column_wrapper<int32_t> col1_1{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}};
+  column_wrapper<int32_t> col1_1{{1, 0, 1, 2, 1}, {true, false, true, true, true}};
 
   CVector cols0, cols1;
   cols0.push_back(col0_0.release());
@@ -970,7 +986,7 @@ TEST_F(JoinTest, EmptyLeftTableLeftJoin)
   column_wrapper<int32_t> col0_1;
 
   column_wrapper<int32_t> col1_0{{2, 2, 0, 4, 3}};
-  column_wrapper<int32_t> col1_1{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}};
+  column_wrapper<int32_t> col1_1{{1, 0, 1, 2, 1}, {true, false, true, true, true}};
 
   CVector cols0, cols1;
   cols0.push_back(col0_0.release());
@@ -991,7 +1007,7 @@ TEST_F(JoinTest, EmptyLeftTableFullJoin)
   column_wrapper<int32_t> col0_1;
 
   column_wrapper<int32_t> col1_0{{2, 2, 0, 4, 3}};
-  column_wrapper<int32_t> col1_1{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}};
+  column_wrapper<int32_t> col1_1{{1, 0, 1, 2, 1}, {true, false, true, true, true}};
 
   CVector cols0, cols1;
   cols0.push_back(col0_0.release());
@@ -1006,10 +1022,10 @@ TEST_F(JoinTest, EmptyLeftTableFullJoin)
   auto result_sort_order = cudf::sorted_order(result->view());
   auto sorted_result     = cudf::gather(result->view(), *result_sort_order);
 
-  column_wrapper<int32_t> col_gold_0{{-1, -1, -1, -1, -1}, {0, 0, 0, 0, 0}};
-  column_wrapper<int32_t> col_gold_1{{-1, -1, -1, -1, -1}, {0, 0, 0, 0, 0}};
+  column_wrapper<int32_t> col_gold_0{{-1, -1, -1, -1, -1}, {false, false, false, false, false}};
+  column_wrapper<int32_t> col_gold_1{{-1, -1, -1, -1, -1}, {false, false, false, false, false}};
   column_wrapper<int32_t> col_gold_2{{2, 2, 0, 4, 3}};
-  column_wrapper<int32_t> col_gold_3{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}};
+  column_wrapper<int32_t> col_gold_3{{1, 0, 1, 2, 1}, {true, false, true, true, true}};
 
   CVector cols_gold;
   cols_gold.push_back(col_gold_0.release());
@@ -1028,7 +1044,7 @@ TEST_F(JoinTest, EmptyLeftTableFullJoin)
 TEST_F(JoinTest, EmptyRightTableInnerJoin)
 {
   column_wrapper<int32_t> col0_0{{2, 2, 0, 4, 3}};
-  column_wrapper<int32_t> col0_1{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}};
+  column_wrapper<int32_t> col0_1{{1, 0, 1, 2, 1}, {true, false, true, true, true}};
 
   column_wrapper<int32_t> col1_0;
   column_wrapper<int32_t> col1_1;
@@ -1066,8 +1082,8 @@ TEST_F(JoinTest, EmptyRightTableInnerJoin)
 
 TEST_F(JoinTest, EmptyRightTableLeftJoin)
 {
-  column_wrapper<int32_t> col0_0{{2, 2, 0, 4, 3}, {1, 1, 1, 1, 1}};
-  column_wrapper<int32_t> col0_1{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}};
+  column_wrapper<int32_t> col0_0{{2, 2, 0, 4, 3}, {true, true, true, true, true}};
+  column_wrapper<int32_t> col0_1{{1, 0, 1, 2, 1}, {true, false, true, true, true}};
 
   column_wrapper<int32_t> col1_0;
   column_wrapper<int32_t> col1_1;
@@ -1106,7 +1122,7 @@ TEST_F(JoinTest, EmptyRightTableLeftJoin)
 TEST_F(JoinTest, EmptyRightTableFullJoin)
 {
   column_wrapper<int32_t> col0_0{{2, 2, 0, 4, 3}};
-  column_wrapper<int32_t> col0_1{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}};
+  column_wrapper<int32_t> col0_1{{1, 0, 1, 2, 1}, {true, false, true, true, true}};
 
   column_wrapper<int32_t> col1_0;
   column_wrapper<int32_t> col1_1;
@@ -1262,10 +1278,10 @@ TEST_F(JoinTest, EqualValuesLeftJoin)
 
   auto result = left_join(t0, t1, {0, 1}, {0, 1});
 
-  column_wrapper<int32_t> col_gold_0{{0, 0, 0, 0}, {1, 1, 1, 1}};
-  strcol_wrapper col_gold_1({"s0", "s0", "s0", "s0"}, {1, 1, 1, 1});
-  column_wrapper<int32_t> col_gold_2{{0, 0, 0, 0}, {1, 1, 1, 1}};
-  strcol_wrapper col_gold_3({"s0", "s0", "s0", "s0"}, {1, 1, 1, 1});
+  column_wrapper<int32_t> col_gold_0{{0, 0, 0, 0}, {true, true, true, true}};
+  strcol_wrapper col_gold_1({"s0", "s0", "s0", "s0"}, {true, true, true, true});
+  column_wrapper<int32_t> col_gold_2{{0, 0, 0, 0}, {true, true, true, true}};
+  strcol_wrapper col_gold_3({"s0", "s0", "s0", "s0"}, {true, true, true, true});
 
   CVector cols_gold;
   cols_gold.push_back(col_gold_0.release());
@@ -1416,7 +1432,8 @@ TEST_F(JoinTest, HashJoinWithStructsAndNulls)
     "Samuel Vimes", "Carrot Ironfoundersson", "Detritus", "Samuel Vimes", "Angua von Überwald"};
   auto col0_ages_col = column_wrapper<int32_t>{{48, 27, 351, 31, 25}};
 
-  auto col0_is_human_col = column_wrapper<bool>{{true, true, false, false, false}, {1, 1, 0, 1, 0}};
+  auto col0_is_human_col =
+    column_wrapper<bool>{{true, true, false, false, false}, {true, true, false, true, false}};
 
   auto col0 =
     cudf::test::structs_column_wrapper{{col0_names_col, col0_ages_col, col0_is_human_col}};
@@ -1425,7 +1442,8 @@ TEST_F(JoinTest, HashJoinWithStructsAndNulls)
     "Samuel Vimes", "Detritus", "Detritus", "Carrot Ironfoundersson", "Angua von Überwald"};
   auto col1_ages_col = column_wrapper<int32_t>{{48, 35, 351, 22, 25}};
 
-  auto col1_is_human_col = column_wrapper<bool>{{true, true, false, false, true}, {1, 1, 0, 1, 1}};
+  auto col1_is_human_col =
+    column_wrapper<bool>{{true, true, false, false, true}, {true, true, false, true, true}};
 
   auto col1 =
     cudf::test::structs_column_wrapper{{col1_names_col, col1_ages_col, col1_is_human_col}};
@@ -1638,13 +1656,13 @@ TEST_F(JoinDictionaryTest, LeftJoinNoNulls)
 TEST_F(JoinDictionaryTest, LeftJoinWithNulls)
 {
   column_wrapper<int32_t> col0_0{{3, 1, 2, 0, 2}};
-  strcol_wrapper col0_1({"s1", "s1", "s0", "s4", "s0"}, {1, 1, 0, 1, 1});
+  strcol_wrapper col0_1({"s1", "s1", "s0", "s4", "s0"}, {true, true, false, true, true});
   column_wrapper<int32_t> col0_2_w{{0, 1, 2, 4, 1}};
   auto col0_2 = cudf::dictionary::encode(col0_2_w);
 
   column_wrapper<int32_t> col1_0{{2, 2, 0, 4, 3}};
   strcol_wrapper col1_1({"s1", "s0", "s1", "s2", "s1"});
-  column_wrapper<int32_t> col1_2_w{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}};
+  column_wrapper<int32_t> col1_2_w{{1, 0, 1, 2, 1}, {true, false, true, true, true}};
   auto col1_2 = cudf::dictionary::encode(col1_2_w);
 
   auto t0 = cudf::table_view({col0_0, col0_1, col0_2->view()});
@@ -1712,13 +1730,13 @@ TEST_F(JoinDictionaryTest, InnerJoinNoNulls)
 TEST_F(JoinDictionaryTest, InnerJoinWithNulls)
 {
   column_wrapper<int32_t> col0_0{{3, 1, 2, 0, 2}};
-  strcol_wrapper col0_1({"s1", "s1", "s0", "s4", "s0"}, {1, 1, 0, 1, 1});
+  strcol_wrapper col0_1({"s1", "s1", "s0", "s4", "s0"}, {true, true, false, true, true});
   column_wrapper<int32_t> col0_2_w{{0, 1, 2, 4, 1}};
   auto col0_2 = cudf::dictionary::encode(col0_2_w);
 
   column_wrapper<int32_t> col1_0{{2, 2, 0, 4, 3}};
   strcol_wrapper col1_1({"s1", "s0", "s1", "s2", "s1"});
-  column_wrapper<int32_t> col1_2_w{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}};
+  column_wrapper<int32_t> col1_2_w{{1, 0, 1, 2, 1}, {true, false, true, true, true}};
   auto col1_2 = cudf::dictionary::encode(col1_2_w);
 
   auto t0 = cudf::table_view({col0_0, col0_1, col0_2->view()});
@@ -1790,7 +1808,7 @@ TEST_F(JoinDictionaryTest, FullJoinWithNulls)
   strcol_wrapper col0_1({"s0", "s1", "s2", "s4", "s1"});
   column_wrapper<int32_t> col0_2{{0, 1, 2, 4, 1}};
 
-  column_wrapper<int32_t> col1_0_w{{2, 2, 0, 4, 3}, {1, 1, 1, 0, 1}};
+  column_wrapper<int32_t> col1_0_w{{2, 2, 0, 4, 3}, {true, true, true, false, true}};
   auto col1_0 = cudf::dictionary::encode(col1_0_w);
   strcol_wrapper col1_1{{"s1", "s0", "s1", "s2", "s1"}};
   column_wrapper<int32_t> col1_2{{1, 0, 1, 2, 1}};
@@ -1834,12 +1852,13 @@ TEST_F(JoinTest, FullJoinWithStructsAndNulls)
   auto col0_names_col = strcol_wrapper{col0_names.begin(), col0_names.end()};
   auto col0_ages_col  = column_wrapper<int32_t>{{48, 27, 25, 31, 351}};
 
-  auto col0_is_human_col = column_wrapper<bool>{{true, true, false, false, false}, {1, 1, 0, 1, 1}};
+  auto col0_is_human_col =
+    column_wrapper<bool>{{true, true, false, false, false}, {true, true, false, true, true}};
 
   auto col0_3 = cudf::test::structs_column_wrapper{
-    {col0_names_col, col0_ages_col, col0_is_human_col}, {1, 1, 1, 1, 1}};
+    {col0_names_col, col0_ages_col, col0_is_human_col}, {true, true, true, true, true}};
 
-  column_wrapper<int32_t> col1_0{{2, 2, 0, 4, 3}, {1, 1, 1, 0, 1}};
+  column_wrapper<int32_t> col1_0{{2, 2, 0, 4, 3}, {true, true, true, false, true}};
   strcol_wrapper col1_1{{"s1", "s0", "s1", "s2", "s1"}};
   column_wrapper<int32_t> col1_2{{1, 0, 1, 2, 1}};
 
@@ -1851,7 +1870,8 @@ TEST_F(JoinTest, FullJoinWithStructsAndNulls)
   auto col1_names_col = strcol_wrapper{col1_names.begin(), col1_names.end()};
   auto col1_ages_col  = column_wrapper<int32_t>{{27, 48, 27, 25, 27}};
 
-  auto col1_is_human_col = column_wrapper<bool>{{true, true, true, false, true}, {1, 1, 1, 0, 1}};
+  auto col1_is_human_col =
+    column_wrapper<bool>{{true, true, true, false, true}, {true, true, true, false, true}};
 
   auto col1_3 =
     cudf::test::structs_column_wrapper{{col1_names_col, col1_ages_col, col1_is_human_col}};
@@ -1873,59 +1893,69 @@ TEST_F(JoinTest, FullJoinWithStructsAndNulls)
   auto result_sort_order = cudf::sorted_order(result->view());
   auto sorted_result     = cudf::gather(result->view(), *result_sort_order);
 
-  column_wrapper<int32_t> col_gold_0{{3, 1, 2, 0, 3, -1, -1, -1, -1, -1},
-                                     {1, 1, 1, 1, 1, 0, 0, 0, 0, 0}};
+  column_wrapper<int32_t> col_gold_0{
+    {3, 1, 2, 0, 3, -1, -1, -1, -1, -1},
+    {true, true, true, true, true, false, false, false, false, false}};
   strcol_wrapper col_gold_1({"s0", "s1", "s2", "s4", "s1", "", "", "", "", ""},
-                            {1, 1, 1, 1, 1, 0, 0, 0, 0, 0});
-  column_wrapper<int32_t> col_gold_2{{0, 1, 2, 4, 1, -1, -1, -1, -1, -1},
-                                     {1, 1, 1, 1, 1, 0, 0, 0, 0, 0}};
-  auto gold_names0_col = strcol_wrapper{{"Samuel Vimes",
-                                         "Carrot Ironfoundersson",
-                                         "Angua von Überwald",
-                                         "Detritus",
-                                         "Carrot Ironfoundersson",
-                                         "",
-                                         "",
-                                         "",
-                                         "",
-                                         ""},
-                                        {1, 1, 1, 1, 1, 0, 0, 0, 0, 0}};
-  auto gold_ages0_col  = column_wrapper<int32_t>{{48, 27, 25, 31, 351, -1, -1, -1, -1, -1},
-                                                 {1, 1, 1, 1, 1, 0, 0, 0, 0, 0}};
+                            {true, true, true, true, true, false, false, false, false, false});
+  column_wrapper<int32_t> col_gold_2{
+    {0, 1, 2, 4, 1, -1, -1, -1, -1, -1},
+    {true, true, true, true, true, false, false, false, false, false}};
+  auto gold_names0_col =
+    strcol_wrapper{{"Samuel Vimes",
+                    "Carrot Ironfoundersson",
+                    "Angua von Überwald",
+                    "Detritus",
+                    "Carrot Ironfoundersson",
+                    "",
+                    "",
+                    "",
+                    "",
+                    ""},
+                   {true, true, true, true, true, false, false, false, false, false}};
+  auto gold_ages0_col =
+    column_wrapper<int32_t>{{48, 27, 25, 31, 351, -1, -1, -1, -1, -1},
+                            {true, true, true, true, true, false, false, false, false, false}};
 
   auto gold_is_human0_col =
     column_wrapper<bool>{{true, true, false, false, false, false, false, false, false, false},
-                         {1, 1, 0, 1, 1, 0, 0, 0, 0, 0}};
+                         {true, true, false, true, true, false, false, false, false, false}};
 
   auto col_gold_3 = cudf::test::structs_column_wrapper{
-    {gold_names0_col, gold_ages0_col, gold_is_human0_col}, {1, 1, 1, 1, 1, 0, 0, 0, 0, 0}};
+    {gold_names0_col, gold_ages0_col, gold_is_human0_col},
+    {true, true, true, true, true, false, false, false, false, false}};
 
-  column_wrapper<int32_t> col_gold_4{{-1, -1, -1, -1, -1, 3, 2, 2, 0, 4},
-                                     {0, 0, 0, 0, 0, 1, 1, 1, 1, 0}};
+  column_wrapper<int32_t> col_gold_4{
+    {-1, -1, -1, -1, -1, 3, 2, 2, 0, 4},
+    {false, false, false, false, false, true, true, true, true, false}};
   strcol_wrapper col_gold_5({"", "", "", "", "", "s1", "s1", "s0", "s1", "s2"},
-                            {0, 0, 0, 0, 0, 1, 1, 1, 1, 1});
-  column_wrapper<int32_t> col_gold_6{{-1, -1, -1, -1, -1, 1, 1, 0, 1, 2},
-                                     {0, 0, 0, 0, 0, 1, 1, 1, 1, 1}};
-  auto gold_names1_col = strcol_wrapper{{"",
-                                         "",
-                                         "",
-                                         "",
-                                         "",
-                                         "Carrot Ironfoundersson",
-                                         "Carrot Ironfoundersson",
-                                         "Samuel Vimes",
-                                         "Carrot Ironfoundersson",
-                                         "Angua von Überwald"},
-                                        {0, 0, 0, 0, 0, 1, 1, 1, 1, 1}};
-  auto gold_ages1_col  = column_wrapper<int32_t>{{-1, -1, -1, -1, -1, 27, 27, 48, 27, 25},
-                                                 {0, 0, 0, 0, 0, 1, 1, 1, 1, 1}};
+                            {false, false, false, false, false, true, true, true, true, true});
+  column_wrapper<int32_t> col_gold_6{
+    {-1, -1, -1, -1, -1, 1, 1, 0, 1, 2},
+    {false, false, false, false, false, true, true, true, true, true}};
+  auto gold_names1_col =
+    strcol_wrapper{{"",
+                    "",
+                    "",
+                    "",
+                    "",
+                    "Carrot Ironfoundersson",
+                    "Carrot Ironfoundersson",
+                    "Samuel Vimes",
+                    "Carrot Ironfoundersson",
+                    "Angua von Überwald"},
+                   {false, false, false, false, false, true, true, true, true, true}};
+  auto gold_ages1_col =
+    column_wrapper<int32_t>{{-1, -1, -1, -1, -1, 27, 27, 48, 27, 25},
+                            {false, false, false, false, false, true, true, true, true, true}};
 
   auto gold_is_human1_col =
     column_wrapper<bool>{{false, false, false, false, false, true, true, true, true, false},
-                         {0, 0, 0, 0, 0, 1, 1, 1, 1, 0}};
+                         {false, false, false, false, false, true, true, true, true, false}};
 
   auto col_gold_7 = cudf::test::structs_column_wrapper{
-    {gold_names1_col, gold_ages1_col, gold_is_human1_col}, {0, 0, 0, 0, 0, 1, 1, 1, 1, 1}};
+    {gold_names1_col, gold_ages1_col, gold_is_human1_col},
+    {false, false, false, false, false, true, true, true, true, true}};
 
   CVector cols_gold;
   cols_gold.push_back(col_gold_0.release());
diff --git a/cpp/tests/join/semi_anti_join_tests.cpp b/cpp/tests/join/semi_anti_join_tests.cpp
index 61bb3069308..de3d8bdaa23 100644
--- a/cpp/tests/join/semi_anti_join_tests.cpp
+++ b/cpp/tests/join/semi_anti_join_tests.cpp
@@ -114,8 +114,8 @@ TEST_F(JoinTest, TestSimple)
 std::pair<std::unique_ptr<cudf::table>, std::unique_ptr<cudf::table>> get_saj_tables(
   std::vector<bool> const& left_is_human_nulls, std::vector<bool> const& right_is_human_nulls)
 {
-  column_wrapper<int32_t> col0_0{{99, 1, 2, 0, 2}, {0, 1, 1, 1, 1}};
-  strcol_wrapper col0_1({"s1", "s1", "s0", "s4", "s0"}, {1, 1, 0, 1, 1});
+  column_wrapper<int32_t> col0_0{{99, 1, 2, 0, 2}, {false, true, true, true, true}};
+  strcol_wrapper col0_1({"s1", "s1", "s0", "s4", "s0"}, {true, true, false, true, true});
   column_wrapper<int32_t> col0_2{{0, 1, 2, 4, 1}};
   auto col0_names_col = strcol_wrapper{
     "Samuel Vimes", "Carrot Ironfoundersson", "Detritus", "Samuel Vimes", "Angua von Überwald"};
@@ -125,11 +125,11 @@ std::pair<std::unique_ptr<cudf::table>, std::unique_ptr<cudf::table>> get_saj_ta
     column_wrapper<bool>{{true, true, false, false, false}, left_is_human_nulls.begin()};
 
   auto col0_3 = cudf::test::structs_column_wrapper{
-    {col0_names_col, col0_ages_col, col0_is_human_col}, {1, 1, 1, 1, 1}};
+    {col0_names_col, col0_ages_col, col0_is_human_col}, {true, true, true, true, true}};
 
-  column_wrapper<int32_t> col1_0{{2, 2, 0, 4, -99}, {1, 1, 1, 1, 0}};
+  column_wrapper<int32_t> col1_0{{2, 2, 0, 4, -99}, {true, true, true, true, false}};
   strcol_wrapper col1_1({"s1", "s0", "s1", "s2", "s1"});
-  column_wrapper<int32_t> col1_2{{1, 0, 1, 2, 1}, {1, 0, 1, 1, 1}};
+  column_wrapper<int32_t> col1_2{{1, 0, 1, 2, 1}, {true, false, true, true, true}};
   auto col1_names_col = strcol_wrapper{"Carrot Ironfoundersson",
                                        "Angua von Überwald",
                                        "Detritus",
@@ -158,20 +158,20 @@ std::pair<std::unique_ptr<cudf::table>, std::unique_ptr<cudf::table>> get_saj_ta
 
 TEST_F(JoinTest, SemiJoinWithStructsAndNulls)
 {
-  auto tables = get_saj_tables({1, 1, 0, 1, 0}, {1, 0, 0, 1, 1});
+  auto tables = get_saj_tables({true, true, false, true, false}, {true, false, false, true, true});
 
   auto result =
     left_semi_join(*tables.first, *tables.second, {0, 1, 3}, {0, 1, 3}, cudf::null_equality::EQUAL);
   auto result_sort_order = cudf::sorted_order(result->view());
   auto sorted_result     = cudf::gather(result->view(), *result_sort_order);
 
-  column_wrapper<int32_t> col_gold_0{{99, 2}, {0, 1}};
-  strcol_wrapper col_gold_1({"s1", "s0"}, {1, 1});
+  column_wrapper<int32_t> col_gold_0{{99, 2}, {false, true}};
+  strcol_wrapper col_gold_1({"s1", "s0"}, {true, true});
   column_wrapper<int32_t> col_gold_2{{0, 1}};
   auto col_gold_3_names_col = strcol_wrapper{"Samuel Vimes", "Angua von Überwald"};
   auto col_gold_3_ages_col  = column_wrapper<int32_t>{{48, 25}};
 
-  auto col_gold_3_is_human_col = column_wrapper<bool>{{true, false}, {1, 0}};
+  auto col_gold_3_is_human_col = column_wrapper<bool>{{true, false}, {true, false}};
 
   auto col_gold_3 = cudf::test::structs_column_wrapper{
     {col_gold_3_names_col, col_gold_3_ages_col, col_gold_3_is_human_col}};
@@ -190,20 +190,20 @@ TEST_F(JoinTest, SemiJoinWithStructsAndNulls)
 
 TEST_F(JoinTest, SemiJoinWithStructsAndNullsNotEqual)
 {
-  auto tables = get_saj_tables({1, 1, 0, 1, 1}, {1, 1, 0, 1, 1});
+  auto tables = get_saj_tables({true, true, false, true, true}, {true, true, false, true, true});
 
   auto result = left_semi_join(
     *tables.first, *tables.second, {0, 1, 3}, {0, 1, 3}, cudf::null_equality::UNEQUAL);
   auto result_sort_order = cudf::sorted_order(result->view());
   auto sorted_result     = cudf::gather(result->view(), *result_sort_order);
 
-  column_wrapper<int32_t> col_gold_0{{2}, {1}};
-  strcol_wrapper col_gold_1({"s0"}, {1});
+  column_wrapper<int32_t> col_gold_0{{2}, {true}};
+  strcol_wrapper col_gold_1({"s0"}, {true});
   column_wrapper<int32_t> col_gold_2{{1}};
   auto col_gold_3_names_col = strcol_wrapper{"Angua von Überwald"};
   auto col_gold_3_ages_col  = column_wrapper<int32_t>{{25}};
 
-  auto col_gold_3_is_human_col = column_wrapper<bool>{{false}, {1}};
+  auto col_gold_3_is_human_col = column_wrapper<bool>{{false}, {true}};
 
   auto col_gold_3 = cudf::test::structs_column_wrapper{
     {col_gold_3_names_col, col_gold_3_ages_col, col_gold_3_is_human_col}};
@@ -223,20 +223,20 @@ TEST_F(JoinTest, SemiJoinWithStructsAndNullsNotEqual)
 
 TEST_F(JoinTest, AntiJoinWithStructsAndNulls)
 {
-  auto tables = get_saj_tables({1, 1, 0, 1, 0}, {1, 0, 0, 1, 1});
+  auto tables = get_saj_tables({true, true, false, true, false}, {true, false, false, true, true});
 
   auto result =
     left_anti_join(*tables.first, *tables.second, {0, 1, 3}, {0, 1, 3}, cudf::null_equality::EQUAL);
   auto result_sort_order = cudf::sorted_order(result->view());
   auto sorted_result     = cudf::gather(result->view(), *result_sort_order);
 
-  column_wrapper<int32_t> col_gold_0{{1, 2, 0}, {1, 1, 1}};
-  strcol_wrapper col_gold_1({"s1", "s0", "s4"}, {1, 0, 1});
+  column_wrapper<int32_t> col_gold_0{{1, 2, 0}, {true, true, true}};
+  strcol_wrapper col_gold_1({"s1", "s0", "s4"}, {true, false, true});
   column_wrapper<int32_t> col_gold_2{{1, 2, 4}};
   auto col_gold_3_names_col = strcol_wrapper{"Carrot Ironfoundersson", "Detritus", "Samuel Vimes"};
   auto col_gold_3_ages_col  = column_wrapper<int32_t>{{27, 351, 31}};
 
-  auto col_gold_3_is_human_col = column_wrapper<bool>{{true, false, false}, {1, 0, 1}};
+  auto col_gold_3_is_human_col = column_wrapper<bool>{{true, false, false}, {true, false, true}};
 
   auto col_gold_3 = cudf::test::structs_column_wrapper{
     {col_gold_3_names_col, col_gold_3_ages_col, col_gold_3_is_human_col}};
@@ -256,21 +256,22 @@ TEST_F(JoinTest, AntiJoinWithStructsAndNulls)
 
 TEST_F(JoinTest, AntiJoinWithStructsAndNullsNotEqual)
 {
-  auto tables = get_saj_tables({1, 1, 0, 1, 1}, {1, 1, 0, 1, 1});
+  auto tables = get_saj_tables({true, true, false, true, true}, {true, true, false, true, true});
 
   auto result = left_anti_join(
     *tables.first, *tables.second, {0, 1, 3}, {0, 1, 3}, cudf::null_equality::UNEQUAL);
   auto result_sort_order = cudf::sorted_order(result->view());
   auto sorted_result     = cudf::gather(result->view(), *result_sort_order);
 
-  column_wrapper<int32_t> col_gold_0{{99, 1, 2, 0}, {0, 1, 1, 1}};
-  strcol_wrapper col_gold_1({"s1", "s1", "s0", "s4"}, {1, 1, 0, 1});
+  column_wrapper<int32_t> col_gold_0{{99, 1, 2, 0}, {false, true, true, true}};
+  strcol_wrapper col_gold_1({"s1", "s1", "s0", "s4"}, {true, true, false, true});
   column_wrapper<int32_t> col_gold_2{{0, 1, 2, 4}};
   auto col_gold_3_names_col =
     strcol_wrapper{"Samuel Vimes", "Carrot Ironfoundersson", "Detritus", "Samuel Vimes"};
   auto col_gold_3_ages_col = column_wrapper<int32_t>{{48, 27, 351, 31}};
 
-  auto col_gold_3_is_human_col = column_wrapper<bool>{{true, true, false, false}, {1, 1, 0, 1}};
+  auto col_gold_3_is_human_col =
+    column_wrapper<bool>{{true, true, false, false}, {true, true, false, true}};
 
   auto col_gold_3 = cudf::test::structs_column_wrapper{
     {col_gold_3_names_col, col_gold_3_ages_col, col_gold_3_is_human_col}};
diff --git a/cpp/tests/json/json_tests.cpp b/cpp/tests/json/json_tests.cpp
index e38ca6628f3..a9186874e83 100644
--- a/cpp/tests/json/json_tests.cpp
+++ b/cpp/tests/json/json_tests.cpp
@@ -454,8 +454,8 @@ TEST_F(JsonPathTests, GetJsonObjectFilter)
 TEST_F(JsonPathTests, GetJsonObjectNullInputs)
 {
   {
-    std::string str("{\"a\" : \"b\"}");
-    cudf::test::strings_column_wrapper input({str, str, str, str}, {1, 0, 1, 0});
+    std::string str(R"({"a" : "b"})");
+    cudf::test::strings_column_wrapper input({str, str, str, str}, {true, false, true, false});
 
     std::string json_path("$.a");
     auto result_raw = cudf::get_json_object(cudf::strings_column_view(input), json_path);
@@ -786,7 +786,7 @@ TEST_F(JsonPathTests, StripQuotes)
   // but with string_quotes_from_single_strings false, we expect
   // "b"   (with quotes)
   {
-    std::string str("{\"a\" : \"b\"}");
+    std::string str(R"({"a" : "b"})");
     cudf::test::strings_column_wrapper input({str, str});
 
     cudf::get_json_object_options options;
diff --git a/cpp/tests/large_strings/large_strings_fixture.cpp b/cpp/tests/large_strings/large_strings_fixture.cpp
index 416b106c5a5..ac8159369a1 100644
--- a/cpp/tests/large_strings/large_strings_fixture.cpp
+++ b/cpp/tests/large_strings/large_strings_fixture.cpp
@@ -41,7 +41,7 @@ class LargeStringsData {
     _data[std::string(name)] = std::move(data);
   }
 
-  cudf::table_view get_table(std::string_view name) const
+  [[nodiscard]] cudf::table_view get_table(std::string_view name) const
   {
     std::string key{name};
     return _data.find(key) != _data.end() ? _data.at(key)->view() : cudf::table_view{};
@@ -54,13 +54,16 @@ class LargeStringsData {
     _data[std::string(name)] = std::make_unique<cudf::table>(std::move(cols));
   }
 
-  cudf::column_view get_column(std::string_view name) const
+  [[nodiscard]] cudf::column_view get_column(std::string_view name) const
   {
     std::string key{name};
     return _data.find(key) != _data.end() ? _data.at(key)->view().column(0) : cudf::column_view{};
   }
 
-  bool has_key(std::string_view name) const { return _data.find(std::string(name)) != _data.end(); }
+  [[nodiscard]] bool has_key(std::string_view name) const
+  {
+    return _data.find(std::string(name)) != _data.end();
+  }
 
  protected:
   std::map<std::string, DataPointer> _data;
diff --git a/cpp/tests/lists/contains_tests.cpp b/cpp/tests/lists/contains_tests.cpp
index 961437ba81e..718ee83cf09 100644
--- a/cpp/tests/lists/contains_tests.cpp
+++ b/cpp/tests/lists/contains_tests.cpp
@@ -385,7 +385,7 @@ TEST_F(ContainsTest, BoolScalarWithNullsInLists)
     std::move(null_mask));
 
   // Search space: [ [x], [1,1], [x,1,1,x], [], x, [1,1,x], [x], [1,1,x,1] ]
-  auto search_key_one = create_scalar_search_key<T>(1);
+  auto search_key_one = create_scalar_search_key<T>(true);
   {
     // CONTAINS
     auto result   = cudf::lists::contains(search_space->view(), *search_key_one);
diff --git a/cpp/tests/lists/count_elements_tests.cpp b/cpp/tests/lists/count_elements_tests.cpp
index 0933740b850..032bd0fa6ae 100644
--- a/cpp/tests/lists/count_elements_tests.cpp
+++ b/cpp/tests/lists/count_elements_tests.cpp
@@ -43,7 +43,8 @@ TYPED_TEST(ListsElementsNumericsTest, CountElements)
   LCW input({LCW{3, 2, 1}, LCW{}, LCW{30, 20, 10, 50}, LCW{100, 120}, LCW{0}}, validity);
 
   auto result = cudf::lists::count_elements(cudf::lists_column_view(input));
-  cudf::test::fixed_width_column_wrapper<int32_t> expected({3, 0, 4, 2, 1}, {1, 0, 1, 1, 1});
+  cudf::test::fixed_width_column_wrapper<int32_t> expected({3, 0, 4, 2, 1},
+                                                           {true, false, true, true, true});
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result);
 }
 
@@ -57,7 +58,8 @@ TEST_F(ListsElementsTest, CountElementsStrings)
     validity);
 
   auto result = cudf::lists::count_elements(cudf::lists_column_view(input));
-  cudf::test::fixed_width_column_wrapper<int32_t> expected({3, 0, 4, 2, 1}, {1, 0, 1, 1, 1});
+  cudf::test::fixed_width_column_wrapper<int32_t> expected({3, 0, 4, 2, 1},
+                                                           {true, false, true, true, true});
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result);
 }
 
@@ -72,7 +74,7 @@ TEST_F(ListsElementsTest, CountElementsSliced)
 
   auto sliced = cudf::slice(input, {1, 4}).front();
   auto result = cudf::lists::count_elements(cudf::lists_column_view(sliced));
-  cudf::test::fixed_width_column_wrapper<int32_t> expected({0, 4, 2}, {0, 1, 1});
+  cudf::test::fixed_width_column_wrapper<int32_t> expected({0, 4, 2}, {false, true, true});
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result);
 }
 
@@ -87,7 +89,7 @@ TYPED_TEST(ListsElementsNumericsTest, CountElementsNestedLists)
            validity.begin());
 
   auto result = cudf::lists::count_elements(cudf::lists_column_view(list));
-  cudf::test::fixed_width_column_wrapper<int32_t> expected({2, 1, 3, 5}, {1, 0, 1, 1});
+  cudf::test::fixed_width_column_wrapper<int32_t> expected({2, 1, 3, 5}, {true, false, true, true});
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result);
 }
 
diff --git a/cpp/tests/lists/explode_tests.cpp b/cpp/tests/lists/explode_tests.cpp
index 4ba7a773cb7..be332de0ba0 100644
--- a/cpp/tests/lists/explode_tests.cpp
+++ b/cpp/tests/lists/explode_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -178,7 +178,8 @@ TEST_F(ExplodeTest, NullsInList)
     LCW({1, null, 7}, valids), LCW({5, null, 0, null}, valids), LCW{}, LCW({0, null, 8}, valids)};
   FCW b{100, 200, 300, 400};
 
-  FCW expected_a({1, null, 7, 5, null, 0, null, 0, null, 8}, {1, 0, 1, 1, 0, 1, 0, 1, 0, 1});
+  FCW expected_a({1, null, 7, 5, null, 0, null, 0, null, 8},
+                 {true, false, true, true, false, true, false, true, false, true});
   FCW expected_b{100, 100, 100, 200, 200, 200, 200, 400, 400, 400};
 
   cudf::table_view t({a, b});
@@ -308,7 +309,8 @@ TEST_F(ExplodeTest, NullsInNestedDoubleExplode)
         LCW{LCW{0, 3}, LCW{5}, LCW({2, null}, valids)}};
   FCW b{100, 200, 300};
 
-  FCW expected_a({1, null, 7, 6, 5, 5, 6, 0, 3, 5, 2, null}, {1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0});
+  FCW expected_a({1, null, 7, 6, 5, 5, 6, 0, 3, 5, 2, null},
+                 {true, false, true, true, true, true, true, true, true, true, true, false});
   FCW expected_b{100, 100, 100, 100, 100, 200, 200, 300, 300, 300, 300, 300};
 
   cudf::table_view t({a, b});
@@ -432,9 +434,10 @@ TEST_F(ExplodeTest, ListOfStructsWithEmpties)
 
   auto ret = cudf::explode(t, 0);
   auto expected_numeric_col =
-    cudf::test::fixed_width_column_wrapper<int32_t>{{1, null, null}, {1, 0, 0}};
+    cudf::test::fixed_width_column_wrapper<int32_t>{{1, null, null}, {true, false, false}};
 
-  auto expected_a = cudf::test::structs_column_wrapper{{expected_numeric_col}, {1, 1, 0}}.release();
+  auto expected_a =
+    cudf::test::structs_column_wrapper{{expected_numeric_col}, {true, true, false}}.release();
   auto expected_b = cudf::test::strings_column_wrapper({"a", "b", "c"}).release();
 
   cudf::table_view expected({expected_a->view(), expected_b->view()});
@@ -600,7 +603,7 @@ TEST_F(ExplodeOuterTest, SingleNull)
   LCW a({LCW{null}, LCW{5, 6}, LCW{}, LCW{0, 3}}, first_invalid);
   FCW b({100, 200, 300, 400});
 
-  FCW expected_a{{null, 5, 6, 0, 0, 3}, {0, 1, 1, 0, 1, 1}};
+  FCW expected_a{{null, 5, 6, 0, 0, 3}, {false, true, true, false, true, true}};
   FCW expected_b{100, 200, 200, 300, 400, 400};
 
   cudf::table_view t({a, b});
@@ -609,7 +612,7 @@ TEST_F(ExplodeOuterTest, SingleNull)
   auto ret = cudf::explode_outer(t, 0);
   CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected);
 
-  FCW expected_pos_col{{0, 0, 1, 0, 0, 1}, {0, 1, 1, 0, 1, 1}};
+  FCW expected_pos_col{{0, 0, 1, 0, 0, 1}, {false, true, true, false, true, true}};
   cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b});
   auto pos_ret = cudf::explode_outer_position(t, 0);
   CUDF_TEST_EXPECT_TABLES_EQUAL(pos_ret->view(), pos_expected);
@@ -630,8 +633,8 @@ TEST_F(ExplodeOuterTest, Nulls)
   LCW a({LCW{1, 2, 7}, LCW{null}, LCW{0, 3}}, valids);
   FCW b({100, null, 300}, valids);
 
-  FCW expected_a({1, 2, 7, null, 0, 3}, {1, 1, 1, 0, 1, 1});
-  FCW expected_b({100, 100, 100, null, 300, 300}, {1, 1, 1, 0, 1, 1});
+  FCW expected_a({1, 2, 7, null, 0, 3}, {true, true, true, false, true, true});
+  FCW expected_b({100, 100, 100, null, 300, 300}, {true, true, true, false, true, true});
 
   cudf::table_view t({a, b});
   cudf::table_view expected({expected_a, expected_b});
@@ -639,7 +642,7 @@ TEST_F(ExplodeOuterTest, Nulls)
   auto ret = cudf::explode_outer(t, 0);
   CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected);
 
-  FCW expected_pos_col{{0, 1, 2, 0, 0, 1}, {1, 1, 1, 0, 1, 1}};
+  FCW expected_pos_col{{0, 1, 2, 0, 0, 1}, {true, true, true, false, true, true}};
   cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b});
 
   auto pos_ret = cudf::explode_outer_position(t, 0);
@@ -660,7 +663,7 @@ TEST_F(ExplodeOuterTest, AllNulls)
   LCW a({LCW{null}, LCW{null}, LCW{null}}, non_valid);
   FCW b({100, 200, 300});
 
-  FCW expected_a({null, null, null}, {0, 0, 0});
+  FCW expected_a({null, null, null}, {false, false, false});
   FCW expected_b({100, 200, 300});
 
   cudf::table_view t({a, b});
@@ -669,7 +672,7 @@ TEST_F(ExplodeOuterTest, AllNulls)
   auto ret = cudf::explode_outer(t, 0);
   CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected);
 
-  FCW expected_pos_col{{0, 0, 0}, {0, 0, 0}};
+  FCW expected_pos_col{{0, 0, 0}, {false, false, false}};
   cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b});
 
   auto pos_ret = cudf::explode_outer_position(t, 0);
@@ -693,7 +696,8 @@ TEST_F(ExplodeOuterTest, SequentialNulls)
   LCW a{LCW({1, 2, null}, third_invalid), LCW{3, 4}, LCW{}, LCW{}, LCW{5, 6, 7}};
   FCW b{100, 200, 300, 400, 500};
 
-  FCW expected_a({1, 2, null, 3, 4, null, null, 5, 6, 7}, {1, 1, 0, 1, 1, 0, 0, 1, 1, 1});
+  FCW expected_a({1, 2, null, 3, 4, null, null, 5, 6, 7},
+                 {true, true, false, true, true, false, false, true, true, true});
   FCW expected_b({100, 100, 100, 200, 200, 300, 400, 500, 500, 500});
 
   cudf::table_view t({a, b});
@@ -702,7 +706,8 @@ TEST_F(ExplodeOuterTest, SequentialNulls)
   auto ret = cudf::explode_outer(t, 0);
   CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected);
 
-  FCW expected_pos_col{{0, 1, 2, 0, 1, 0, 0, 0, 1, 2}, {1, 1, 1, 1, 1, 0, 0, 1, 1, 1}};
+  FCW expected_pos_col{{0, 1, 2, 0, 1, 0, 0, 0, 1, 2},
+                       {true, true, true, true, true, false, false, true, true, true}};
   cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b});
 
   auto pos_ret = cudf::explode_outer_position(t, 0);
@@ -724,7 +729,7 @@ TEST_F(ExplodeOuterTest, MoreEmptyThanData)
   LCW a{LCW{1, 2}, LCW{}, LCW{}, LCW{}, LCW{}, LCW{3}};
   FCW b{100, 200, 300, 400, 500, 600};
 
-  FCW expected_a({1, 2, null, null, null, null, 3}, {1, 1, 0, 0, 0, 0, 1});
+  FCW expected_a({1, 2, null, null, null, null, 3}, {true, true, false, false, false, false, true});
   FCW expected_b({100, 100, 200, 300, 400, 500, 600});
 
   cudf::table_view t({a, b});
@@ -733,7 +738,7 @@ TEST_F(ExplodeOuterTest, MoreEmptyThanData)
   auto ret = cudf::explode_outer(t, 0);
   CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected);
 
-  FCW expected_pos_col{{0, 1, 0, 0, 0, 0, 0}, {1, 1, 0, 0, 0, 0, 1}};
+  FCW expected_pos_col{{0, 1, 0, 0, 0, 0, 0}, {true, true, false, false, false, false, true}};
   cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b});
 
   auto pos_ret = cudf::explode_outer_position(t, 0);
@@ -754,7 +759,7 @@ TEST_F(ExplodeOuterTest, TrailingEmptys)
   LCW a{LCW{1, 2}, LCW{}, LCW{}, LCW{}, LCW{}};
   FCW b{100, 200, 300, 400, 500};
 
-  FCW expected_a({1, 2, null, null, null, null}, {1, 1, 0, 0, 0, 0});
+  FCW expected_a({1, 2, null, null, null, null}, {true, true, false, false, false, false});
   FCW expected_b({100, 100, 200, 300, 400, 500});
 
   cudf::table_view t({a, b});
@@ -763,7 +768,7 @@ TEST_F(ExplodeOuterTest, TrailingEmptys)
   auto ret = cudf::explode_outer(t, 0);
   CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected);
 
-  FCW expected_pos_col{{0, 1, 0, 0, 0, 0}, {1, 1, 0, 0, 0, 0}};
+  FCW expected_pos_col{{0, 1, 0, 0, 0, 0}, {true, true, false, false, false, false}};
   cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b});
 
   auto pos_ret = cudf::explode_outer_position(t, 0);
@@ -786,7 +791,7 @@ TEST_F(ExplodeOuterTest, LeadingNulls)
   LCW a({LCW{null}, LCW{null}, LCW{null}, LCW{null}, LCW{1, 2}}, valids);
   FCW b{100, 200, 300, 400, 500};
 
-  FCW expected_a({null, null, null, null, 1, 2}, {0, 0, 0, 0, 1, 1});
+  FCW expected_a({null, null, null, null, 1, 2}, {false, false, false, false, true, true});
   FCW expected_b({100, 200, 300, 400, 500, 500});
 
   cudf::table_view t({a, b});
@@ -795,7 +800,7 @@ TEST_F(ExplodeOuterTest, LeadingNulls)
   auto ret = cudf::explode_outer(t, 0);
   CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected);
 
-  FCW expected_pos_col{{0, 0, 0, 0, 0, 1}, {0, 0, 0, 0, 1, 1}};
+  FCW expected_pos_col{{0, 0, 0, 0, 0, 1}, {false, false, false, false, true, true}};
   cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b});
 
   auto pos_ret = cudf::explode_outer_position(t, 0);
@@ -820,7 +825,7 @@ TEST_F(ExplodeOuterTest, NullsInList)
   FCW b{100, 200, 300, 400};
 
   FCW expected_a({1, null, 7, 5, null, 0, null, null, 0, null, 8},
-                 {1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1});
+                 {true, false, true, true, false, true, false, false, true, false, true});
   FCW expected_b{100, 100, 100, 200, 200, 200, 200, 300, 400, 400, 400};
 
   cudf::table_view t({a, b});
@@ -830,7 +835,8 @@ TEST_F(ExplodeOuterTest, NullsInList)
 
   CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected);
 
-  FCW expected_pos_col{{0, 1, 2, 0, 1, 2, 3, 0, 0, 1, 2}, {1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1}};
+  FCW expected_pos_col{{0, 1, 2, 0, 1, 2, 3, 0, 0, 1, 2},
+                       {true, true, true, true, true, true, true, false, true, true, true}};
   cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b});
 
   auto pos_ret = cudf::explode_outer_position(t, 0);
@@ -890,7 +896,7 @@ TEST_F(ExplodeOuterTest, NestedNulls)
   auto ret = cudf::explode_outer(t, 0);
   CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected);
 
-  FCW expected_pos_col{{0, 1, 0, 0, 1, 2}, {1, 1, 0, 1, 1, 1}};
+  FCW expected_pos_col{{0, 1, 0, 0, 1, 2}, {true, true, false, true, true, true}};
   cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b});
 
   auto pos_ret = cudf::explode_outer_position(t, 0);
@@ -950,7 +956,7 @@ TEST_F(ExplodeOuterTest, NullsInNestedDoubleExplode)
   FCW b{100, 200, 300};
 
   FCW expected_a({1, null, null, 7, 6, 5, 5, 6, 0, 3, 5, 2, null},
-                 {1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0});
+                 {true, false, false, true, true, true, true, true, true, true, true, true, false});
   FCW expected_b{100, 100, 100, 100, 100, 100, 200, 200, 300, 300, 300, 300, 300};
 
   cudf::table_view t({a, b});
@@ -961,8 +967,9 @@ TEST_F(ExplodeOuterTest, NullsInNestedDoubleExplode)
 
   CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected);
 
-  FCW expected_pos_col{{0, 1, 0, 0, 1, 2, 0, 1, 0, 1, 0, 0, 1},
-                       {1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}};
+  FCW expected_pos_col{
+    {0, 1, 0, 0, 1, 2, 0, 1, 0, 1, 0, 0, 1},
+    {true, true, false, true, true, true, true, true, true, true, true, true, true}};
   cudf::table_view pos_expected({expected_pos_col, expected_a, expected_b});
 
   auto pos_ret = cudf::explode_outer_position(first_explode_ret->view(), 0);
@@ -1075,17 +1082,18 @@ TEST_F(ExplodeOuterTest, ListOfStructsWithEmpties)
 
   auto ret = cudf::explode_outer(t, 0);
 
-  auto expected_numeric_col =
-    cudf::test::fixed_width_column_wrapper<int32_t>{{1, null, null, null, null}, {1, 0, 0, 0, 0}};
+  auto expected_numeric_col = cudf::test::fixed_width_column_wrapper<int32_t>{
+    {1, null, null, null, null}, {true, false, false, false, false}};
 
   auto expected_a =
-    cudf::test::structs_column_wrapper{{expected_numeric_col}, {1, 1, 0, 0, 0}}.release();
+    cudf::test::structs_column_wrapper{{expected_numeric_col}, {true, true, false, false, false}}
+      .release();
   auto expected_b = cudf::test::strings_column_wrapper({"a", "b", "c", "d", "e"}).release();
 
   cudf::table_view expected({expected_a->view(), expected_b->view()});
 
   CUDF_TEST_EXPECT_TABLES_EQUAL(ret->view(), expected);
-  FCW expected_pos_col{{0, 0, 0, null, null}, {1, 1, 1, 0, 0}};
+  FCW expected_pos_col{{0, 0, 0, null, null}, {true, true, true, false, false}};
   cudf::table_view pos_expected({expected_pos_col, expected_a->view(), expected_b->view()});
 
   auto pos_ret = cudf::explode_outer_position(t, 0);
diff --git a/cpp/tests/lists/sort_lists_tests.cpp b/cpp/tests/lists/sort_lists_tests.cpp
index 7d925da85a7..a3280f901c5 100644
--- a/cpp/tests/lists/sort_lists_tests.cpp
+++ b/cpp/tests/lists/sort_lists_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -82,9 +82,9 @@ TYPED_TEST(SortLists, Null)
 {
   using T = TypeParam;
   if (std::is_same_v<T, bool>) return;
-  std::vector<bool> valids_o{1, 1, 0, 1};
-  std::vector<bool> valids_a{1, 1, 1, 0};
-  std::vector<bool> valids_b{0, 1, 1, 1};
+  std::vector<bool> valids_o{true, true, false, true};
+  std::vector<bool> valids_a{true, true, true, false};
+  std::vector<bool> valids_b{false, true, true, true};
 
   // List<T>
   LCW<T> list{{{3, 2, 4, 1}, valids_o.begin()}, {5}, {10, 8, 9}, {6, 7}};
diff --git a/cpp/tests/merge/merge_dictionary_test.cpp b/cpp/tests/merge/merge_dictionary_test.cpp
index 55365cb972a..dd528c19e4e 100644
--- a/cpp/tests/merge/merge_dictionary_test.cpp
+++ b/cpp/tests/merge/merge_dictionary_test.cpp
@@ -101,18 +101,19 @@ TEST_F(MergeDictionaryTest, Merge2Columns)
 
 TEST_F(MergeDictionaryTest, WithNulls)
 {
-  cudf::test::fixed_width_column_wrapper<int8_t> left_w1({1, 2, 2, 4, 4, 5, 0},
-                                                         {1, 1, 1, 1, 1, 1, 0});
+  cudf::test::fixed_width_column_wrapper<int8_t> left_w1(
+    {1, 2, 2, 4, 4, 5, 0}, {true, true, true, true, true, true, false});
   auto left1 = cudf::dictionary::encode(left_w1);
-  cudf::test::fixed_width_column_wrapper<int64_t> left_w2({1000, 1000, 800, 500, 500, 100, 0},
-                                                          {1, 1, 1, 1, 1, 1, 0});
+  cudf::test::fixed_width_column_wrapper<int64_t> left_w2(
+    {1000, 1000, 800, 500, 500, 100, 0}, {true, true, true, true, true, true, false});
   auto left2 = cudf::dictionary::encode(left_w2);
   cudf::table_view left_view{{left1->view(), left2->view()}};
 
-  cudf::test::fixed_width_column_wrapper<int8_t> right_w1({1, 1, 2, 4, 5, 0}, {1, 1, 1, 1, 1, 0});
+  cudf::test::fixed_width_column_wrapper<int8_t> right_w1({1, 1, 2, 4, 5, 0},
+                                                          {true, true, true, true, true, false});
   auto right1 = cudf::dictionary::encode(right_w1);
   cudf::test::fixed_width_column_wrapper<int64_t> right_w2({1000, 800, 800, 400, 100, 0},
-                                                           {1, 1, 1, 1, 1, 0});
+                                                           {true, true, true, true, true, false});
   auto right2 = cudf::dictionary::encode(right_w2);
   cudf::table_view right_view{{right1->view(), right2->view()}};
 
@@ -125,10 +126,11 @@ TEST_F(MergeDictionaryTest, WithNulls)
   auto decoded2 = cudf::dictionary::decode(result->get_column(1).view());
 
   cudf::test::fixed_width_column_wrapper<int8_t> expected_1(
-    {1, 1, 1, 2, 2, 2, 4, 4, 4, 5, 5, 0, 0}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0});
+    {1, 1, 1, 2, 2, 2, 4, 4, 4, 5, 5, 0, 0},
+    {true, true, true, true, true, true, true, true, true, true, true, false, false});
   cudf::test::fixed_width_column_wrapper<int64_t> expected_2(
     {1000, 1000, 800, 1000, 800, 800, 500, 500, 400, 100, 100, 0, 0},
-    {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0});
+    {true, true, true, true, true, true, true, true, true, true, true, false, false});
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_1, decoded1->view());
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_2, decoded2->view());
 
diff --git a/cpp/tests/merge/merge_string_test.cpp b/cpp/tests/merge/merge_string_test.cpp
index 28179a7341c..97979e79010 100644
--- a/cpp/tests/merge/merge_string_test.cpp
+++ b/cpp/tests/merge/merge_string_test.cpp
@@ -228,7 +228,7 @@ TYPED_TEST(MergeStringTest, Merge1StringKeyNullColumns)
 {
   // data: "ab", "bc", "cd", "de" | valid: 1 1 1 0
   strings_column_wrapper leftColWrap1({"ab", "bc", "cd", "de", "ef", "fg", "gh", "hi"},
-                                      {1, 1, 1, 1, 1, 1, 1, 0});
+                                      {true, true, true, true, true, true, true, false});
 
   cudf::size_type inputRows = static_cast<cudf::column_view const&>(leftColWrap1).size();
 
@@ -245,7 +245,7 @@ TYPED_TEST(MergeStringTest, Merge1StringKeyNullColumns)
 
   // data: "ac", "bd", "ce", "df" | valid: 1 1 1 0
   strings_column_wrapper rightColWrap1({"ac", "bd", "ce", "df", "eg", "fh", "gi", "hj"},
-                                       {1, 1, 1, 1, 1, 1, 1, 0});
+                                       {true, true, true, true, true, true, true, false});
   fixed_width_column_wrapper<TypeParam, typename decltype(sequence0)::value_type> rightColWrap2(
     sequence0, sequence0 + inputRows);
 
@@ -280,7 +280,22 @@ TYPED_TEST(MergeStringTest, Merge1StringKeyNullColumns)
                                             "gi",
                                             "hi",
                                             "hj"},
-                                           {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0});
+                                           {true,
+                                            true,
+                                            true,
+                                            true,
+                                            true,
+                                            true,
+                                            true,
+                                            true,
+                                            true,
+                                            true,
+                                            true,
+                                            true,
+                                            true,
+                                            true,
+                                            false,
+                                            false});
   auto seq_out2 = cudf::detail::make_counting_transform_iterator(0, [outputRows](auto row) {
     if (cudf::type_to_id<TypeParam>() == cudf::type_id::BOOL8)
       return 0;
@@ -303,9 +318,9 @@ TYPED_TEST(MergeStringTest, Merge1StringKeyNullColumns)
 TYPED_TEST(MergeStringTest, Merge2StringKeyNullColumns)
 {
   strings_column_wrapper leftColWrap1({"ab", "bc", "cd", "de", "ef", "fg", "gh", "hi"},
-                                      {1, 1, 1, 1, 1, 1, 1, 0});
+                                      {true, true, true, true, true, true, true, false});
   strings_column_wrapper leftColWrap3({"zy", "yx", "xw", "wv", "vu", "ut", "ts", "sr"},
-                                      {1, 1, 1, 1, 1, 1, 1, 0});
+                                      {true, true, true, true, true, true, true, false});
 
   cudf::size_type inputRows = static_cast<cudf::column_view const&>(leftColWrap1).size();
 
@@ -324,7 +339,7 @@ TYPED_TEST(MergeStringTest, Merge2StringKeyNullColumns)
   cudf::table_view left_view{{leftColWrap1, leftColWrap2, leftColWrap3}};
 
   strings_column_wrapper rightColWrap1({"ac", "bd", "ce", "df", "eg", "fh", "gi", "hj"},
-                                       {1, 1, 1, 1, 1, 1, 1, 0});
+                                       {true, true, true, true, true, true, true, false});
 
   EXPECT_EQ(inputRows, static_cast<cudf::column_view const&>(rightColWrap1).size());
 
@@ -338,7 +353,7 @@ TYPED_TEST(MergeStringTest, Merge2StringKeyNullColumns)
     sequence_r, sequence_r + inputRows);
 
   strings_column_wrapper rightColWrap3({"zx", "yw", "xv", "wu", "vt", "us", "tr", "sp"},
-                                       {1, 1, 1, 1, 1, 1, 1, 0});
+                                       {true, true, true, true, true, true, true, false});
 
   EXPECT_EQ(inputRows, static_cast<cudf::column_view const&>(rightColWrap3).size());
 
@@ -371,7 +386,22 @@ TYPED_TEST(MergeStringTest, Merge2StringKeyNullColumns)
                                             "gi",
                                             "hi",
                                             "hj"},
-                                           {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0});
+                                           {true,
+                                            true,
+                                            true,
+                                            true,
+                                            true,
+                                            true,
+                                            true,
+                                            true,
+                                            true,
+                                            true,
+                                            true,
+                                            true,
+                                            true,
+                                            true,
+                                            false,
+                                            false});
 
   auto seq_out2 = cudf::detail::make_counting_transform_iterator(
     0, [bool8 = (cudf::type_to_id<TypeParam>() == cudf::type_id::BOOL8)](auto row) {
@@ -397,7 +427,22 @@ TYPED_TEST(MergeStringTest, Merge2StringKeyNullColumns)
                                             "tr",
                                             "sr",
                                             "sp"},
-                                           {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0});
+                                           {true,
+                                            true,
+                                            true,
+                                            true,
+                                            true,
+                                            true,
+                                            true,
+                                            true,
+                                            true,
+                                            true,
+                                            true,
+                                            true,
+                                            true,
+                                            true,
+                                            false,
+                                            false});
 
   auto expected_column_view1{static_cast<cudf::column_view const&>(expectedDataWrap1)};
   auto expected_column_view2{static_cast<cudf::column_view const&>(expectedDataWrap2)};
diff --git a/cpp/tests/partitioning/hash_partition_test.cpp b/cpp/tests/partitioning/hash_partition_test.cpp
index 521e1193036..24dadf9b520 100644
--- a/cpp/tests/partitioning/hash_partition_test.cpp
+++ b/cpp/tests/partitioning/hash_partition_test.cpp
@@ -140,7 +140,7 @@ TEST_F(HashPartition, MixedColumnTypes)
 
 TEST_F(HashPartition, NullableStrings)
 {
-  strings_column_wrapper strings({"a", "bb", "ccc", "d"}, {1, 1, 1, 1});
+  strings_column_wrapper strings({"a", "bb", "ccc", "d"}, {true, true, true, true});
   cudf::table_view input({strings});
 
   std::vector<cudf::size_type> const columns_to_hash({0});
diff --git a/cpp/tests/partitioning/round_robin_test.cpp b/cpp/tests/partitioning/round_robin_test.cpp
index 8049c7c3a7a..89d23c39dca 100644
--- a/cpp/tests/partitioning/round_robin_test.cpp
+++ b/cpp/tests/partitioning/round_robin_test.cpp
@@ -62,7 +62,7 @@ TYPED_TEST(RoundRobinTest, RoundRobinPartitions13_3)
 {
   strings_column_wrapper rrColWrap1(
     {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m"},
-    {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0});
+    {true, true, true, true, true, true, true, true, true, true, true, true, false});
 
   cudf::size_type inputRows = static_cast<cudf::column_view const&>(rrColWrap1).size();
 
@@ -90,7 +90,7 @@ TYPED_TEST(RoundRobinTest, RoundRobinPartitions13_3)
 
     strings_column_wrapper expectedDataWrap1(
       {"a", "d", "g", "j", "m", "b", "e", "h", "k", "c", "f", "i", "l"},
-      {1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1});
+      {true, true, true, true, false, true, true, true, true, true, true, true, true});
 
     auto expected_column_view1{static_cast<cudf::column_view const&>(expectedDataWrap1)};
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_column_view1, output_column_view1);
@@ -128,7 +128,7 @@ TYPED_TEST(RoundRobinTest, RoundRobinPartitions13_3)
 
     strings_column_wrapper expectedDataWrap1(
       {"c", "f", "i", "l", "a", "d", "g", "j", "m", "b", "e", "h", "k"},
-      {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1});
+      {true, true, true, true, true, true, true, true, false, true, true, true, true});
 
     auto expected_column_view1{static_cast<cudf::column_view const&>(expectedDataWrap1)};
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_column_view1, output_column_view1);
@@ -166,7 +166,7 @@ TYPED_TEST(RoundRobinTest, RoundRobinPartitions13_3)
 
     strings_column_wrapper expectedDataWrap1(
       {"b", "e", "h", "k", "c", "f", "i", "l", "a", "d", "g", "j", "m"},
-      {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0});
+      {true, true, true, true, true, true, true, true, true, true, true, true, false});
 
     auto expected_column_view1{static_cast<cudf::column_view const&>(expectedDataWrap1)};
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_column_view1, output_column_view1);
@@ -195,8 +195,9 @@ TYPED_TEST(RoundRobinTest, RoundRobinPartitions13_3)
 
 TYPED_TEST(RoundRobinTest, RoundRobinPartitions11_3)
 {
-  strings_column_wrapper rrColWrap1({"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"},
-                                    {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0});
+  strings_column_wrapper rrColWrap1(
+    {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"},
+    {true, true, true, true, true, true, true, true, true, true, false});
 
   cudf::size_type inputRows = static_cast<cudf::column_view const&>(rrColWrap1).size();
 
@@ -223,7 +224,8 @@ TYPED_TEST(RoundRobinTest, RoundRobinPartitions11_3)
     auto output_column_view2{p_outputTable->view().column(1)};
 
     strings_column_wrapper expectedDataWrap1(
-      {"a", "d", "g", "j", "b", "e", "h", "k", "c", "f", "i"}, {1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1});
+      {"a", "d", "g", "j", "b", "e", "h", "k", "c", "f", "i"},
+      {true, true, true, true, true, true, true, false, true, true, true});
 
     auto expected_column_view1{static_cast<cudf::column_view const&>(expectedDataWrap1)};
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_column_view1, output_column_view1);
@@ -260,7 +262,8 @@ TYPED_TEST(RoundRobinTest, RoundRobinPartitions11_3)
     auto output_column_view2{p_outputTable->view().column(1)};
 
     strings_column_wrapper expectedDataWrap1(
-      {"c", "f", "i", "a", "d", "g", "j", "b", "e", "h", "k"}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0});
+      {"c", "f", "i", "a", "d", "g", "j", "b", "e", "h", "k"},
+      {true, true, true, true, true, true, true, true, true, true, false});
 
     auto expected_column_view1{static_cast<cudf::column_view const&>(expectedDataWrap1)};
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_column_view1, output_column_view1);
@@ -297,7 +300,8 @@ TYPED_TEST(RoundRobinTest, RoundRobinPartitions11_3)
     auto output_column_view2{p_outputTable->view().column(1)};
 
     strings_column_wrapper expectedDataWrap1(
-      {"b", "e", "h", "k", "c", "f", "i", "a", "d", "g", "j"}, {1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1});
+      {"b", "e", "h", "k", "c", "f", "i", "a", "d", "g", "j"},
+      {true, true, true, false, true, true, true, true, true, true, true});
 
     auto expected_column_view1{static_cast<cudf::column_view const&>(expectedDataWrap1)};
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_column_view1, output_column_view1);
@@ -326,8 +330,9 @@ TYPED_TEST(RoundRobinTest, RoundRobinPartitions11_3)
 
 TYPED_TEST(RoundRobinTest, RoundRobinDegeneratePartitions11_15)
 {
-  strings_column_wrapper rrColWrap1({"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"},
-                                    {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0});
+  strings_column_wrapper rrColWrap1(
+    {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"},
+    {true, true, true, true, true, true, true, true, true, true, false});
 
   cudf::size_type inputRows = static_cast<cudf::column_view const&>(rrColWrap1).size();
 
@@ -354,7 +359,8 @@ TYPED_TEST(RoundRobinTest, RoundRobinDegeneratePartitions11_15)
     auto output_column_view2{p_outputTable->view().column(1)};
 
     strings_column_wrapper expectedDataWrap1(
-      {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0});
+      {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"},
+      {true, true, true, true, true, true, true, true, true, true, false});
 
     auto expected_column_view1{static_cast<cudf::column_view const&>(expectedDataWrap1)};
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_column_view1, output_column_view1);
@@ -392,7 +398,8 @@ TYPED_TEST(RoundRobinTest, RoundRobinDegeneratePartitions11_15)
     auto output_column_view2{p_outputTable->view().column(1)};
 
     strings_column_wrapper expectedDataWrap1(
-      {"f", "g", "h", "i", "j", "k", "a", "b", "c", "d", "e"}, {1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1});
+      {"f", "g", "h", "i", "j", "k", "a", "b", "c", "d", "e"},
+      {true, true, true, true, true, false, true, true, true, true, true});
 
     auto expected_column_view1{static_cast<cudf::column_view const&>(expectedDataWrap1)};
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_column_view1, output_column_view1);
@@ -430,7 +437,8 @@ TYPED_TEST(RoundRobinTest, RoundRobinDegeneratePartitions11_15)
     auto output_column_view2{p_outputTable->view().column(1)};
 
     strings_column_wrapper expectedDataWrap1(
-      {"b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "a"}, {1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1});
+      {"b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "a"},
+      {true, true, true, true, true, true, true, true, true, false, true});
 
     auto expected_column_view1{static_cast<cudf::column_view const&>(expectedDataWrap1)};
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_column_view1, output_column_view1);
@@ -460,8 +468,9 @@ TYPED_TEST(RoundRobinTest, RoundRobinDegeneratePartitions11_15)
 
 TYPED_TEST(RoundRobinTest, RoundRobinDegeneratePartitions11_11)
 {
-  strings_column_wrapper rrColWrap1({"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"},
-                                    {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0});
+  strings_column_wrapper rrColWrap1(
+    {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"},
+    {true, true, true, true, true, true, true, true, true, true, false});
 
   cudf::size_type inputRows = static_cast<cudf::column_view const&>(rrColWrap1).size();
 
@@ -488,7 +497,8 @@ TYPED_TEST(RoundRobinTest, RoundRobinDegeneratePartitions11_11)
     auto output_column_view2{p_outputTable->view().column(1)};
 
     strings_column_wrapper expectedDataWrap1(
-      {"j", "k", "a", "b", "c", "d", "e", "f", "g", "h", "i"}, {1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1});
+      {"j", "k", "a", "b", "c", "d", "e", "f", "g", "h", "i"},
+      {true, false, true, true, true, true, true, true, true, true, true});
 
     auto expected_column_view1{static_cast<cudf::column_view const&>(expectedDataWrap1)};
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_column_view1, output_column_view1);
@@ -527,7 +537,8 @@ TYPED_TEST(RoundRobinTest, RoundRobinNPartitionsDivideNRows)
   strings_column_wrapper rrColWrap1(
     {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k",
      "l", "m", "n", "o", "p", "q", "r", "s", "t", "u"},
-    {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0});
+    {true, true, true, true, true, true, true, true, true, true, true,
+     true, true, true, true, true, true, true, true, true, false});
 
   cudf::size_type inputRows = static_cast<cudf::column_view const&>(rrColWrap1).size();
 
@@ -563,7 +574,8 @@ TYPED_TEST(RoundRobinTest, RoundRobinNPartitionsDivideNRows)
     strings_column_wrapper expectedDataWrap1(
       {"a", "d", "g", "j", "m", "p", "s", "b", "e", "h", "k",
        "n", "q", "t", "c", "f", "i", "l", "o", "r", "u"},
-      {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0});
+      {true, true, true, true, true, true, true, true, true, true, true,
+       true, true, true, true, true, true, true, true, true, false});
 
     auto expected_column_view1{static_cast<cudf::column_view const&>(expectedDataWrap1)};
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_column_view1, output_column_view1);
@@ -610,7 +622,8 @@ TYPED_TEST(RoundRobinTest, RoundRobinNPartitionsDivideNRows)
     strings_column_wrapper expectedDataWrap1(
       {"c", "f", "i", "l", "o", "r", "u", "a", "d", "g", "j",
        "m", "p", "s", "b", "e", "h", "k", "n", "q", "t"},
-      {1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
+      {true, true, true, true, true, true, false, true, true, true, true,
+       true, true, true, true, true, true, true,  true, true, true});
 
     auto expected_column_view1{static_cast<cudf::column_view const&>(expectedDataWrap1)};
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_column_view1, output_column_view1);
@@ -640,8 +653,9 @@ TYPED_TEST(RoundRobinTest, RoundRobinNPartitionsDivideNRows)
 
 TYPED_TEST(RoundRobinTest, RoundRobinSinglePartition)
 {
-  strings_column_wrapper rrColWrap1({"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"},
-                                    {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0});
+  strings_column_wrapper rrColWrap1(
+    {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"},
+    {true, true, true, true, true, true, true, true, true, true, false});
 
   cudf::size_type inputRows = static_cast<cudf::column_view const&>(rrColWrap1).size();
 
@@ -665,8 +679,9 @@ TYPED_TEST(RoundRobinTest, RoundRobinSinglePartition)
   auto output_column_view1{p_outputTable->view().column(0)};
   auto output_column_view2{p_outputTable->view().column(1)};
 
-  strings_column_wrapper expectedDataWrap1({"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"},
-                                           {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0});
+  strings_column_wrapper expectedDataWrap1(
+    {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"},
+    {true, true, true, true, true, true, true, true, true, true, false});
 
   auto expected_column_view1{static_cast<cudf::column_view const&>(expectedDataWrap1)};
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_column_view1, output_column_view1);
@@ -693,8 +708,9 @@ TYPED_TEST(RoundRobinTest, RoundRobinSinglePartition)
 
 TYPED_TEST(RoundRobinTest, RoundRobinIncorrectNumPartitions)
 {
-  strings_column_wrapper rrColWrap1({"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"},
-                                    {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0});
+  strings_column_wrapper rrColWrap1(
+    {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"},
+    {true, true, true, true, true, true, true, true, true, true, false});
 
   cudf::size_type inputRows = static_cast<cudf::column_view const&>(rrColWrap1).size();
 
@@ -719,8 +735,9 @@ TYPED_TEST(RoundRobinTest, RoundRobinIncorrectNumPartitions)
 
 TYPED_TEST(RoundRobinTest, RoundRobinIncorrectStartPartition)
 {
-  strings_column_wrapper rrColWrap1({"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"},
-                                    {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0});
+  strings_column_wrapper rrColWrap1(
+    {"a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k"},
+    {true, true, true, true, true, true, true, true, true, true, false});
 
   cudf::size_type inputRows = static_cast<cudf::column_view const&>(rrColWrap1).size();
 
diff --git a/cpp/tests/quantiles/percentile_approx_test.cpp b/cpp/tests/quantiles/percentile_approx_test.cpp
index 46d4066ddff..06c6b9dfbe4 100644
--- a/cpp/tests/quantiles/percentile_approx_test.cpp
+++ b/cpp/tests/quantiles/percentile_approx_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -384,7 +384,7 @@ TEST_F(PercentileApproxTest, EmptyInput)
   auto result = cudf::percentile_approx(tdv, percentiles);
 
   cudf::test::fixed_width_column_wrapper<cudf::size_type> offsets{0, 0, 0, 0};
-  std::vector<bool> nulls{0, 0, 0};
+  std::vector<bool> nulls{false, false, false};
   auto [null_mask, null_count] = cudf::test::detail::make_null_mask(nulls.begin(), nulls.end());
 
   auto expected = cudf::make_lists_column(3,
@@ -416,7 +416,7 @@ TEST_F(PercentileApproxTest, EmptyPercentiles)
   auto result = cudf::percentile_approx(tdv, percentiles);
 
   cudf::test::fixed_width_column_wrapper<cudf::size_type> offsets{0, 0, 0};
-  std::vector<bool> nulls{0, 0};
+  std::vector<bool> nulls{false, false};
   auto [null_mask, null_count] = cudf::test::detail::make_null_mask(nulls.begin(), nulls.end());
 
   auto expected = cudf::make_lists_column(2,
@@ -444,10 +444,11 @@ TEST_F(PercentileApproxTest, NullPercentiles)
 
   cudf::tdigest::tdigest_column_view tdv(*tdigest_column.second[0].results[0]);
 
-  cudf::test::fixed_width_column_wrapper<double> npercentiles{{0.5, 0.5, 1.0, 1.0}, {0, 0, 1, 1}};
+  cudf::test::fixed_width_column_wrapper<double> npercentiles{{0.5, 0.5, 1.0, 1.0},
+                                                              {false, false, true, true}};
   auto result = cudf::percentile_approx(tdv, npercentiles);
 
-  std::vector<bool> valids{0, 0, 1, 1};
+  std::vector<bool> valids{false, false, true, true};
   cudf::test::lists_column_wrapper<double> expected{{{99, 99, 4, 4}, valids.begin()},
                                                     {{99, 99, 8, 8}, valids.begin()}};
 
diff --git a/cpp/tests/quantiles/quantile_test.cpp b/cpp/tests/quantiles/quantile_test.cpp
index b25a4d6c666..6e88365b6e8 100644
--- a/cpp/tests/quantiles/quantile_test.cpp
+++ b/cpp/tests/quantiles/quantile_test.cpp
@@ -413,7 +413,7 @@ TYPED_TEST(QuantileTest, TestInterpolateExtremaLow)
 TYPED_TEST(QuantileTest, TestEmpty)
 {
   auto input    = cudf::test::fixed_width_column_wrapper<TypeParam>({});
-  auto expected = cudf::test::fixed_width_column_wrapper<double>({0, 0}, {0, 0});
+  auto expected = cudf::test::fixed_width_column_wrapper<double>({0, 0}, {false, false});
   auto actual   = cudf::quantile(input, {0.5, 0.25});
 }
 
diff --git a/cpp/tests/quantiles/quantiles_test.cpp b/cpp/tests/quantiles/quantiles_test.cpp
index b7faa20e8c1..44d4ec61852 100644
--- a/cpp/tests/quantiles/quantiles_test.cpp
+++ b/cpp/tests/quantiles/quantiles_test.cpp
@@ -119,7 +119,8 @@ TYPED_TEST(QuantilesTest, TestMultiColumnUnsorted)
   auto input_a = cudf::test::strings_column_wrapper(
     {"C", "B", "A", "A", "D", "B", "D", "B", "D", "C", "C", "C",
      "D", "B", "D", "B", "C", "C", "A", "D", "B", "A", "A", "A"},
-    {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
+    {true, true, true, true, true, true, true, true, true, true, true, true,
+     true, true, true, true, true, true, true, true, true, true, true, true});
 
   cudf::test::fixed_width_column_wrapper<T, int32_t> input_b(
     {4, 3, 5, 0, 1, 0, 4, 1, 5, 3, 0, 5, 2, 4, 3, 2, 1, 2, 3, 0, 5, 1, 4, 2},
@@ -133,7 +134,8 @@ TYPED_TEST(QuantilesTest, TestMultiColumnUnsorted)
                                 cudf::sorted::NO,
                                 {cudf::order::ASCENDING, cudf::order::DESCENDING});
 
-  auto expected_a = cudf::test::strings_column_wrapper({"A", "C", "C", "B", "D"}, {1, 1, 1, 1, 1});
+  auto expected_a =
+    cudf::test::strings_column_wrapper({"A", "C", "C", "B", "D"}, {true, true, true, true, true});
 
   cudf::test::fixed_width_column_wrapper<T, int32_t> expected_b({5, 5, 1, 5, 0}, {1, 1, 1, 1, 1});
 
@@ -149,7 +151,8 @@ TYPED_TEST(QuantilesTest, TestMultiColumnAssumedSorted)
   auto input_a = cudf::test::strings_column_wrapper(
     {"C", "B", "A", "A", "D", "B", "D", "B", "D", "C", "C", "C",
      "D", "B", "D", "B", "C", "C", "A", "D", "B", "A", "A", "A"},
-    {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
+    {true, true, true, true, true, true, true, true, true, true, true, true,
+     true, true, true, true, true, true, true, true, true, true, true, true});
 
   cudf::test::fixed_width_column_wrapper<T, int32_t> input_b(
     {4, 3, 5, 0, 1, 0, 4, 1, 5, 3, 0, 5, 2, 4, 3, 2, 1, 2, 3, 0, 5, 1, 4, 2},
@@ -160,7 +163,8 @@ TYPED_TEST(QuantilesTest, TestMultiColumnAssumedSorted)
   auto actual = cudf::quantiles(
     input, {0.0f, 0.5f, 0.7f, 0.25f, 1.0f}, cudf::interpolation::NEAREST, cudf::sorted::YES);
 
-  auto expected_a = cudf::test::strings_column_wrapper({"C", "D", "C", "D", "A"}, {1, 1, 1, 1, 1});
+  auto expected_a =
+    cudf::test::strings_column_wrapper({"C", "D", "C", "D", "A"}, {true, true, true, true, true});
 
   cudf::test::fixed_width_column_wrapper<T, int32_t> expected_b({4, 2, 1, 4, 2}, {1, 1, 1, 1, 1});
 
diff --git a/cpp/tests/reductions/collect_ops_tests.cpp b/cpp/tests/reductions/collect_ops_tests.cpp
index 65d0b3a54ad..a41682bc632 100644
--- a/cpp/tests/reductions/collect_ops_tests.cpp
+++ b/cpp/tests/reductions/collect_ops_tests.cpp
@@ -56,7 +56,7 @@ TYPED_TEST(CollectTestFixedWidth, CollectList)
   using fw_wrapper = cudf::test::fixed_width_column_wrapper<TypeParam, int32_t>;
 
   std::vector<int> values({5, 0, -120, -111, 0, 64, 63, 99, 123, -16});
-  std::vector<bool> null_mask({1, 1, 0, 1, 1, 1, 0, 1, 0, 1});
+  std::vector<bool> null_mask({true, true, false, true, true, true, false, true, false, true});
 
   // null_include without nulls
   fw_wrapper col(values.begin(), values.end());
@@ -88,7 +88,7 @@ TYPED_TEST(CollectTestFixedWidth, CollectSet)
   using fw_wrapper = cudf::test::fixed_width_column_wrapper<TypeParam, int32_t>;
 
   std::vector<int> values({5, 0, 120, 0, 0, 64, 64, 99, 120, 99});
-  std::vector<bool> null_mask({1, 1, 0, 1, 1, 1, 0, 1, 0, 1});
+  std::vector<bool> null_mask({true, true, false, true, true, true, false, true, false, true});
 
   fw_wrapper col(values.begin(), values.end());
   fw_wrapper col_with_null(values.begin(), values.end(), null_mask.begin());
@@ -197,11 +197,11 @@ TEST_F(CollectTest, CollectSetWithNaN)
   using fp_wrapper = cudf::test::fixed_width_column_wrapper<float>;
 
   fp_wrapper col{{1.0f, 1.0f, -2.3e-5f, -2.3e-5f, 2.3e5f, 2.3e5f, -NAN, -NAN, NAN, NAN, 0.0f, 0.0f},
-                 {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0}};
+                 {true, true, true, true, true, true, true, true, true, true, false, false}};
 
   // nan unequal with null equal
   fp_wrapper expected1{{-2.3e-5f, 1.0f, 2.3e5f, -NAN, -NAN, NAN, NAN, 0.0f},
-                       {1, 1, 1, 1, 1, 1, 1, 0}};
+                       {true, true, true, true, true, true, true, false}};
   auto const ret1 = collect_set(
     col,
     cudf::make_collect_set_aggregation<cudf::reduce_aggregation>(
@@ -210,7 +210,7 @@ TEST_F(CollectTest, CollectSetWithNaN)
 
   // nan unequal with null unequal
   fp_wrapper expected2{{-2.3e-5f, 1.0f, 2.3e5f, -NAN, -NAN, NAN, NAN, 0.0f, 0.0f},
-                       {1, 1, 1, 1, 1, 1, 1, 0, 0}};
+                       {true, true, true, true, true, true, true, false, false}};
   auto const ret2 = collect_set(
     col,
     cudf::make_collect_set_aggregation<cudf::reduce_aggregation>(
@@ -218,7 +218,7 @@ TEST_F(CollectTest, CollectSetWithNaN)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, dynamic_cast<cudf::list_scalar*>(ret2.get())->view());
 
   // nan equal with null equal
-  fp_wrapper expected3{{-2.3e-5f, 1.0f, 2.3e5f, NAN, 0.0f}, {1, 1, 1, 1, 0}};
+  fp_wrapper expected3{{-2.3e-5f, 1.0f, 2.3e5f, NAN, 0.0f}, {true, true, true, true, false}};
   auto const ret3 = collect_set(
     col,
     cudf::make_collect_set_aggregation<cudf::reduce_aggregation>(
@@ -226,7 +226,8 @@ TEST_F(CollectTest, CollectSetWithNaN)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected3, dynamic_cast<cudf::list_scalar*>(ret3.get())->view());
 
   // nan equal with null unequal
-  fp_wrapper expected4{{-2.3e-5f, 1.0f, 2.3e5f, -NAN, 0.0f, 0.0f}, {1, 1, 1, 1, 0, 0}};
+  fp_wrapper expected4{{-2.3e-5f, 1.0f, 2.3e5f, -NAN, 0.0f, 0.0f},
+                       {true, true, true, true, false, false}};
   auto const ret4 = collect_set(
     col,
     cudf::make_collect_set_aggregation<cudf::reduce_aggregation>(
@@ -248,7 +249,8 @@ TEST_F(CollectTest, MergeSetsWithNaN)
   };
 
   // nan unequal with null equal
-  fp_wrapper expected1{{-2.3e-5f, 1.0f, 2.3e5f, -NAN, NAN, NAN, 0.0f}, {1, 1, 1, 1, 1, 1, 0}};
+  fp_wrapper expected1{{-2.3e-5f, 1.0f, 2.3e5f, -NAN, NAN, NAN, 0.0f},
+                       {true, true, true, true, true, true, false}};
   auto const ret1 = collect_set(col,
                                 cudf::make_merge_sets_aggregation<cudf::reduce_aggregation>(
                                   cudf::null_equality::EQUAL, cudf::nan_equality::UNEQUAL));
@@ -256,21 +258,22 @@ TEST_F(CollectTest, MergeSetsWithNaN)
 
   // nan unequal with null unequal
   fp_wrapper expected2{{-2.3e-5f, 1.0f, 2.3e5f, -NAN, NAN, NAN, 0.0f, 0.0f, 0.0f},
-                       {1, 1, 1, 1, 1, 1, 0, 0, 0}};
+                       {true, true, true, true, true, true, false, false, false}};
   auto const ret2 = collect_set(col,
                                 cudf::make_merge_sets_aggregation<cudf::reduce_aggregation>(
                                   cudf::null_equality::UNEQUAL, cudf::nan_equality::UNEQUAL));
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected2, dynamic_cast<cudf::list_scalar*>(ret2.get())->view());
 
   // nan equal with null equal
-  fp_wrapper expected3{{-2.3e-5f, 1.0f, 2.3e5f, -NAN, 0.0f}, {1, 1, 1, 1, 0}};
+  fp_wrapper expected3{{-2.3e-5f, 1.0f, 2.3e5f, -NAN, 0.0f}, {true, true, true, true, false}};
   auto const ret3 = collect_set(col,
                                 cudf::make_merge_sets_aggregation<cudf::reduce_aggregation>(
                                   cudf::null_equality::EQUAL, cudf::nan_equality::ALL_EQUAL));
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected3, dynamic_cast<cudf::list_scalar*>(ret3.get())->view());
 
   // nan equal with null unequal
-  fp_wrapper expected4{{-2.3e-5f, 1.0f, 2.3e5f, -NAN, 0.0f, 0.0f, 0.0f}, {1, 1, 1, 1, 0, 0, 0}};
+  fp_wrapper expected4{{-2.3e-5f, 1.0f, 2.3e5f, -NAN, 0.0f, 0.0f, 0.0f},
+                       {true, true, true, true, false, false, false}};
   auto const ret4 = collect_set(col,
                                 cudf::make_merge_sets_aggregation<cudf::reduce_aggregation>(
                                   cudf::null_equality::UNEQUAL, cudf::nan_equality::ALL_EQUAL));
@@ -282,8 +285,8 @@ TEST_F(CollectTest, CollectStrings)
   using str_col   = cudf::test::strings_column_wrapper;
   using lists_col = cudf::test::lists_column_wrapper<cudf::string_view>;
 
-  auto const s_col =
-    str_col{{"a", "a", "b", "b", "b", "c", "c", "d", "e", "e"}, {1, 1, 1, 0, 1, 1, 0, 1, 1, 1}};
+  auto const s_col = str_col{{"a", "a", "b", "b", "b", "c", "c", "d", "e", "e"},
+                             {true, true, true, false, true, true, false, true, true, true}};
 
   // collect_list including nulls
   auto const ret1 = cudf::reduce(s_col,
@@ -306,8 +309,9 @@ TEST_F(CollectTest, CollectStrings)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected3, dynamic_cast<cudf::list_scalar*>(ret3.get())->view());
 
   // collect_set with null_unequal
-  auto const expected4 = str_col{{"a", "b", "c", "d", "e", "", ""}, {1, 1, 1, 1, 1, 0, 0}};
-  auto const ret4      = collect_set(s_col,
+  auto const expected4 =
+    str_col{{"a", "b", "c", "d", "e", "", ""}, {true, true, true, true, true, false, false}};
+  auto const ret4 = collect_set(s_col,
                                 cudf::make_collect_set_aggregation<cudf::reduce_aggregation>(
                                   cudf::null_policy::INCLUDE, cudf::null_equality::UNEQUAL));
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected4, dynamic_cast<cudf::list_scalar*>(ret4.get())->view());
@@ -322,22 +326,23 @@ TEST_F(CollectTest, CollectStrings)
 
   // merge_lists
   auto const expected5 = str_col{{"a", "a", "b", "b", "null", "c", "null", "d", "null", "e"},
-                                 {1, 1, 1, 1, 0, 1, 0, 1, 0, 1}};
+                                 {true, true, true, true, false, true, false, true, false, true}};
   auto const ret5      = cudf::reduce(strings,
                                  *cudf::make_merge_lists_aggregation<cudf::reduce_aggregation>(),
                                  cudf::data_type{cudf::type_id::LIST});
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected5, dynamic_cast<cudf::list_scalar*>(ret5.get())->view());
 
   // merge_sets with null_equal
-  auto const expected6 = str_col{{"a", "b", "c", "d", "e", "null"}, {1, 1, 1, 1, 1, 0}};
+  auto const expected6 =
+    str_col{{"a", "b", "c", "d", "e", "null"}, {true, true, true, true, true, false}};
   auto const ret6 =
     collect_set(strings, cudf::make_merge_sets_aggregation<cudf::reduce_aggregation>());
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected6, dynamic_cast<cudf::list_scalar*>(ret6.get())->view());
 
   // merge_sets with null_unequal
-  auto const expected7 =
-    str_col{{"a", "b", "c", "d", "e", "null", "null", "null"}, {1, 1, 1, 1, 1, 0, 0, 0}};
-  auto const ret7 = collect_set(
+  auto const expected7 = str_col{{"a", "b", "c", "d", "e", "null", "null", "null"},
+                                 {true, true, true, true, true, false, false, false}};
+  auto const ret7      = collect_set(
     strings,
     cudf::make_merge_sets_aggregation<cudf::reduce_aggregation>(cudf::null_equality::UNEQUAL));
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected7, dynamic_cast<cudf::list_scalar*>(ret7.get())->view());
@@ -358,7 +363,7 @@ TEST_F(CollectTest, CollectEmptys)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(int_col{}, dynamic_cast<cudf::list_scalar*>(ret.get())->view());
 
   // test collect all null columns
-  auto all_nulls = int_col{{1, 2, 3, 4, 5}, {0, 0, 0, 0, 0}};
+  auto all_nulls = int_col{{1, 2, 3, 4, 5}, {false, false, false, false, false}};
   ret            = cudf::reduce(all_nulls,
                      *cudf::make_collect_list_aggregation<cudf::reduce_aggregation>(),
                      cudf::data_type{cudf::type_id::LIST});
diff --git a/cpp/tests/reductions/list_rank_test.cpp b/cpp/tests/reductions/list_rank_test.cpp
index f5aeb87a3c0..f5470f7d881 100644
--- a/cpp/tests/reductions/list_rank_test.cpp
+++ b/cpp/tests/reductions/list_rank_test.cpp
@@ -117,17 +117,84 @@ TEST_F(ListRankScanTest, ListOfStruct)
 
   auto col1 = cudf::test::fixed_width_column_wrapper<int32_t>{
     {-1, -1, 0, 2, 2, 2, 1, 2, 0, 2, 0, 2, 0, 2, 0, 0, 1, 2},
-    {1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0}};
+    {true,
+     true,
+     true,
+     true,
+     true,
+     false,
+     true,
+     true,
+     true,
+     true,
+     true,
+     true,
+     true,
+     true,
+     true,
+     true,
+     false,
+     false}};
   auto col2 = cudf::test::strings_column_wrapper{
     {"x", "x", "a", "a", "b", "b", "a", "b", "a", "b", "a", "c", "a", "c", "a", "c", "b", "b"},
-    {1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1}};
-  auto struct_col = cudf::test::structs_column_wrapper{
-    {col1, col2}, {0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}};
+    {true,
+     true,
+     true,
+     true,
+     true,
+     false,
+     true,
+     true,
+     true,
+     true,
+     true,
+     true,
+     true,
+     true,
+     false,
+     false,
+     true,
+     true}};
+  auto struct_col = cudf::test::structs_column_wrapper{{col1, col2},
+                                                       {false,
+                                                        false,
+                                                        false,
+                                                        false,
+                                                        false,
+                                                        true,
+                                                        true,
+                                                        true,
+                                                        true,
+                                                        true,
+                                                        true,
+                                                        true,
+                                                        true,
+                                                        true,
+                                                        true,
+                                                        true,
+                                                        true,
+                                                        true}};
 
   auto offsets = cudf::test::fixed_width_column_wrapper<cudf::size_type>{
     0, 0, 0, 0, 0, 2, 3, 4, 5, 6, 8, 10, 12, 14, 15, 16, 17, 18};
 
-  auto list_nullmask = std::vector<bool>{1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+  auto list_nullmask = std::vector<bool>{true,
+                                         true,
+                                         false,
+                                         false,
+                                         true,
+                                         true,
+                                         true,
+                                         true,
+                                         true,
+                                         true,
+                                         true,
+                                         true,
+                                         true,
+                                         true,
+                                         true,
+                                         true,
+                                         true};
   auto [null_mask, null_count] =
     cudf::test::detail::make_null_mask(list_nullmask.begin(), list_nullmask.end());
   auto list_column = cudf::column_view(cudf::data_type(cudf::type_id::LIST),
@@ -178,14 +245,16 @@ TEST_F(ListRankScanTest, ListOfEmptyStruct)
   // [{}, {}]
   // [{}, {}]
 
-  auto struct_validity = std::vector<bool>{0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1};
+  auto struct_validity = std::vector<bool>{
+    false, false, false, false, false, false, false, false, true, true, true, true, true, true};
   auto [null_mask, null_count] =
     cudf::test::detail::make_null_mask(struct_validity.begin(), struct_validity.end());
   auto struct_col = cudf::make_structs_column(14, {}, null_count, std::move(null_mask));
 
   auto offsets = cudf::test::fixed_width_column_wrapper<cudf::size_type>{
     0, 0, 0, 0, 0, 2, 4, 6, 7, 8, 9, 10, 12, 14};
-  auto list_nullmask = std::vector<bool>{1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1};
+  auto list_nullmask = std::vector<bool>{
+    true, true, false, false, true, true, true, true, true, true, true, true, true};
   std::tie(null_mask, null_count) =
     cudf::test::detail::make_null_mask(list_nullmask.begin(), list_nullmask.end());
   auto list_column = cudf::make_lists_column(
@@ -213,7 +282,7 @@ TEST_F(ListRankScanTest, EmptyDeepList)
   auto list1 = cudf::test::lists_column_wrapper<int>{};
 
   auto offsets       = cudf::test::fixed_width_column_wrapper<cudf::size_type>{0, 0, 0, 0, 0};
-  auto list_nullmask = std::vector<bool>{1, 1, 0, 0};
+  auto list_nullmask = std::vector<bool>{true, true, false, false};
   auto [null_mask, null_count] =
     cudf::test::detail::make_null_mask(list_nullmask.begin(), list_nullmask.end());
   auto list_column = cudf::make_lists_column(
diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp
index c41594e6933..0ec4cfa34c4 100644
--- a/cpp/tests/reductions/reduction_tests.cpp
+++ b/cpp/tests/reductions/reduction_tests.cpp
@@ -91,7 +91,7 @@ struct ReductionTest : public cudf::test::BaseFixture {
 
   ReductionTest() {}
 
-  ~ReductionTest() {}
+  ~ReductionTest() override {}
 
   template <typename T_out>
   std::pair<T_out, bool> reduction_test(cudf::column_view const& underlying_column,
@@ -132,8 +132,9 @@ TYPED_TEST(MinMaxReductionTest, MinMaxTypes)
 {
   using T = TypeParam;
   std::vector<int> int_values({5, 0, -120, -111, 0, 64, 63, 99, 123, -16});
-  std::vector<bool> host_bools({1, 1, 0, 1, 1, 1, 0, 1, 0, 1});
-  std::vector<bool> all_null({0, 0, 0, 0, 0, 0, 0, 0, 0, 0});
+  std::vector<bool> host_bools({true, true, false, true, true, true, false, true, false, true});
+  std::vector<bool> all_null(
+    {false, false, false, false, false, false, false, false, false, false});
   std::vector<T> v       = convert_values<T>(int_values);
   T init_value           = convert_int<T>(100);
   auto const init_scalar = cudf::make_fixed_width_scalar<T>(init_value);
@@ -259,7 +260,7 @@ TYPED_TEST(SumReductionTest, Sum)
 {
   using T = TypeParam;
   std::vector<int> int_values({6, -14, 13, 64, 0, -13, -20, 45});
-  std::vector<bool> host_bools({1, 1, 0, 0, 1, 1, 1, 1});
+  std::vector<bool> host_bools({true, true, false, false, true, true, true, true});
   std::vector<T> v       = convert_values<T>(int_values);
   T init_value           = convert_int<T>(100);
   auto const init_scalar = cudf::make_fixed_width_scalar<T>(init_value);
@@ -304,7 +305,7 @@ TYPED_TEST(ReductionTest, Product)
   }
 
   std::vector<int> int_values({5, -1, 1, 0, 3, 2, 4});
-  std::vector<bool> host_bools({1, 1, 0, 0, 1, 1, 1});
+  std::vector<bool> host_bools({true, true, false, false, true, true, true});
   std::vector<TypeParam> v = convert_values<TypeParam>(int_values);
   T init_value             = convert_int<T>(4);
   auto const init_scalar   = cudf::make_fixed_width_scalar<T>(init_value);
@@ -355,7 +356,7 @@ TYPED_TEST(ReductionTest, SumOfSquare)
 {
   using T = TypeParam;
   std::vector<int> int_values({-3, 2, 1, 0, 5, -3, -2});
-  std::vector<bool> host_bools({1, 1, 0, 0, 1, 1, 1, 1});
+  std::vector<bool> host_bools({true, true, false, false, true, true, true, true});
   std::vector<T> v = convert_values<T>(int_values);
 
   auto calc_reduction = [](std::vector<T>& v) {
@@ -600,7 +601,7 @@ TYPED_TEST(ReductionAnyAllTest, AnyAllTrueTrue)
 {
   using T = TypeParam;
   std::vector<int> int_values({true, true, true, true});
-  std::vector<bool> host_bools({1, 1, 0, 1});
+  std::vector<bool> host_bools({true, true, false, true});
   std::vector<T> v       = convert_values<T>(int_values);
   auto const init_scalar = cudf::make_fixed_width_scalar<T>(convert_int<T>(true));
 
@@ -663,7 +664,7 @@ TYPED_TEST(ReductionAnyAllTest, AnyAllFalseFalse)
 {
   using T = TypeParam;
   std::vector<int> int_values({false, false, false, false});
-  std::vector<bool> host_bools({1, 1, 0, 1});
+  std::vector<bool> host_bools({true, true, false, true});
   std::vector<T> v       = convert_values<T>(int_values);
   auto const init_scalar = cudf::make_fixed_width_scalar<T>(convert_int<T>(false));
 
@@ -733,7 +734,7 @@ TYPED_TEST(MultiStepReductionTest, Mean)
 {
   using T = TypeParam;
   std::vector<int> int_values({-3, 2, 1, 0, 5, -3, -2, 28});
-  std::vector<bool> host_bools({1, 1, 0, 1, 1, 1, 0, 1});
+  std::vector<bool> host_bools({true, true, false, true, true, true, false, true});
 
   auto calc_mean = [](std::vector<T>& v, cudf::size_type valid_count) {
     double sum = std::accumulate(v.begin(), v.end(), double{0});
@@ -778,7 +779,7 @@ TYPED_TEST(MultiStepReductionTest, DISABLED_var_std)
 {
   using T = TypeParam;
   std::vector<int> int_values({-3, 2, 1, 0, 5, -3, -2, 28});
-  std::vector<bool> host_bools({1, 1, 0, 1, 1, 1, 0, 1});
+  std::vector<bool> host_bools({true, true, false, true, true, true, false, true});
 
   auto calc_var = [](std::vector<T>& v, cudf::size_type valid_count, int ddof) {
     double mean = std::accumulate(v.begin(), v.end(), double{0});
@@ -865,7 +866,7 @@ TYPED_TEST(ReductionMultiStepErrorCheck, DISABLED_ErrorHandling)
 {
   using T = TypeParam;
   std::vector<int> int_values({-3, 2});
-  std::vector<bool> host_bools({1, 0});
+  std::vector<bool> host_bools({true, false});
 
   std::vector<T> v = convert_values<T>(int_values);
   cudf::test::fixed_width_column_wrapper<T> col(v.begin(), v.end());
@@ -941,9 +942,11 @@ TEST_F(ReductionDtypeTest, all_null_output)
 {
   auto sum_agg = cudf::make_sum_aggregation<reduce_aggregation>();
 
-  auto const col =
-    cudf::test::fixed_point_column_wrapper<int32_t>{{0, 0, 0}, {0, 0, 0}, numeric::scale_type{-2}}
-      .release();
+  auto const col = cudf::test::fixed_point_column_wrapper<int32_t>{
+    {0, 0, 0},
+    {false, false, false},
+    numeric::scale_type{
+      -2}}.release();
 
   std::unique_ptr<cudf::scalar> result = cudf::reduce(*col, *sum_agg, col->type());
   EXPECT_EQ(result->is_valid(), false);
@@ -1097,7 +1100,7 @@ TEST_F(ReductionEmptyTest, empty_column)
   // expect result.is_valid() is false
   int col_size = 5;
   std::vector<T> col_data(col_size);
-  std::vector<bool> valids(col_size, 0);
+  std::vector<bool> valids(col_size, false);
 
   cudf::test::fixed_width_column_wrapper<T> col_nulls = construct_null_column(col_data, valids);
   CUDF_EXPECT_NO_THROW(statement(col_nulls));
@@ -1138,7 +1141,7 @@ TEST_P(ReductionParamTest, DISABLED_std_var)
 {
   int ddof = GetParam();
   std::vector<double> int_values({-3, 2, 1, 0, 5, -3, -2, 28});
-  std::vector<bool> host_bools({1, 1, 0, 1, 1, 1, 0, 1});
+  std::vector<bool> host_bools({true, true, false, true, true, true, false, true});
 
   auto calc_var = [ddof](std::vector<double>& v, cudf::size_type valid_count) {
     double mean = std::accumulate(v.begin(), v.end(), double{0});
@@ -1270,7 +1273,7 @@ TEST_P(StringReductionTest, MinMax)
 {
   // data and valid arrays
   std::vector<std::string> host_strings(GetParam());
-  std::vector<bool> host_bools({1, 0, 1, 1, 1, 1, 0, 0, 1});
+  std::vector<bool> host_bools({true, false, true, true, true, true, false, false, true});
   bool succeed(true);
   std::string initial_value = "init";
 
@@ -1361,7 +1364,7 @@ TEST_P(StringReductionTest, DictionaryMinMax)
             expected_max_result);
 
   // column with nulls
-  std::vector<bool> validity({1, 0, 1, 1, 1, 1, 0, 0, 1});
+  std::vector<bool> validity({true, false, true, true, true, true, false, false, true});
   cudf::test::dictionary_column_wrapper<std::string> col_nulls(
     host_strings.begin(), host_strings.end(), validity.begin());
 
@@ -1429,7 +1432,7 @@ TYPED_TEST(ReductionTest, Median)
   using T = TypeParam;
   //{-20, -14, -13,  0, 6, 13, 45, 64/None} =  3.0, 0.0
   std::vector<int> int_values({6, -14, 13, 64, 0, -13, -20, 45});
-  std::vector<bool> host_bools({1, 1, 1, 0, 1, 1, 1, 1});
+  std::vector<bool> host_bools({true, true, true, false, true, true, true, true});
   std::vector<T> v = convert_values<T>(int_values);
 
   // test without nulls
@@ -1488,7 +1491,7 @@ TYPED_TEST(ReductionTest, Quantile)
   using T = TypeParam;
   //{-20, -14, -13,  0, 6, 13, 45, 64/None}
   std::vector<int> int_values({6, -14, 13, 64, 0, -13, -20, 45});
-  std::vector<bool> host_bools({1, 1, 1, 0, 1, 1, 1, 1});
+  std::vector<bool> host_bools({true, true, true, false, true, true, true, true});
   std::vector<T> v = convert_values<T>(int_values);
   cudf::interpolation interp{cudf::interpolation::LINEAR};
 
@@ -1528,7 +1531,7 @@ TYPED_TEST(ReductionTest, UniqueCount)
 {
   using T = TypeParam;
   std::vector<int> int_values({1, -3, 1, 2, 0, 2, -4, 45});  // 6 unique values
-  std::vector<bool> host_bools({1, 1, 1, 0, 1, 1, 1, 1});
+  std::vector<bool> host_bools({true, true, true, false, true, true, true, true});
   std::vector<T> v = convert_values<T>(int_values);
 
   // test without nulls
@@ -2311,7 +2314,7 @@ TYPED_TEST(DictionaryAnyAllTest, AnyAll)
   }
   // with nulls
   {
-    std::vector<bool> valid({1, 1, 0, 1});
+    std::vector<bool> valid({true, true, false, true});
     cudf::test::dictionary_column_wrapper<T> all_col(v_all.begin(), v_all.end(), valid.begin());
     EXPECT_TRUE(this->template reduction_test<bool>(all_col, *any_agg, output_dtype).first);
     EXPECT_TRUE(this->template reduction_test<bool>(all_col, *all_agg, output_dtype).first);
@@ -2351,7 +2354,7 @@ TYPED_TEST(DictionaryReductionTest, Sum)
             expected_value);
 
   // test with nulls
-  std::vector<bool> validity({1, 1, 0, 0, 1, 1, 1, 1});
+  std::vector<bool> validity({true, true, false, false, true, true, true, true});
   cudf::test::dictionary_column_wrapper<T> col_nulls(v.begin(), v.end(), validity.begin());
   expected_value = [v, validity] {
     auto const r = replace_nulls(v, validity, T{0});
@@ -2385,7 +2388,7 @@ TYPED_TEST(DictionaryReductionTest, Product)
             calc_prod(v));
 
   // test with nulls
-  std::vector<bool> validity({1, 1, 0, 0, 1, 1, 1});
+  std::vector<bool> validity({true, true, false, false, true, true, true});
   cudf::test::dictionary_column_wrapper<T> col_nulls(v.begin(), v.end(), validity.begin());
 
   EXPECT_EQ(this
@@ -2416,7 +2419,7 @@ TYPED_TEST(DictionaryReductionTest, SumOfSquare)
             calc_reduction(v));
 
   // test with nulls
-  std::vector<bool> validity({1, 1, 0, 0, 1, 1, 1, 1});
+  std::vector<bool> validity({true, true, false, false, true, true, true, true});
   cudf::test::dictionary_column_wrapper<T> col_nulls(v.begin(), v.end(), validity.begin());
 
   EXPECT_EQ(
@@ -2449,7 +2452,7 @@ TYPED_TEST(DictionaryReductionTest, Mean)
             calc_mean(v, v.size()));
 
   // test with nulls
-  std::vector<bool> validity({1, 1, 0, 1, 1, 1, 0, 1});
+  std::vector<bool> validity({true, true, false, true, true, true, false, true});
   cudf::test::dictionary_column_wrapper<T> col_nulls(v.begin(), v.end(), validity.begin());
 
   cudf::size_type valid_count = std::count(validity.begin(), validity.end(), true);
@@ -2495,7 +2498,7 @@ TYPED_TEST(DictionaryReductionTest, DISABLED_VarStd)
   EXPECT_EQ(this->template reduction_test<double>(col, *std_agg, output_type).first, std);
 
   // test with nulls
-  std::vector<bool> validity({1, 1, 0, 1, 1, 1, 0, 1});
+  std::vector<bool> validity({true, true, false, true, true, true, false, true});
   cudf::test::dictionary_column_wrapper<T> col_nulls(v.begin(), v.end(), validity.begin());
 
   cudf::size_type const valid_count = std::count(validity.begin(), validity.end(), true);
@@ -2528,7 +2531,7 @@ TYPED_TEST(DictionaryReductionTest, NthElement)
             v[n]);
 
   // test with nulls
-  std::vector<bool> validity({1, 1, 0, 1, 1, 1, 0, 1});
+  std::vector<bool> validity({true, true, false, true, true, true, false, true});
   cudf::test::dictionary_column_wrapper<T> col_nulls(v.begin(), v.end(), validity.begin());
 
   EXPECT_EQ(this
@@ -2565,7 +2568,7 @@ TYPED_TEST(DictionaryReductionTest, UniqueCount)
             6);
 
   // test with nulls
-  std::vector<bool> validity({1, 1, 1, 0, 1, 1, 1, 1});
+  std::vector<bool> validity({true, true, true, false, true, true, true, true});
   cudf::test::dictionary_column_wrapper<T> col_nulls(v.begin(), v.end(), validity.begin());
 
   EXPECT_EQ(this
@@ -2598,7 +2601,7 @@ TYPED_TEST(DictionaryReductionTest, Median)
     (std::is_signed_v<T>) ? 3.0 : 13.5);
 
   // test with nulls
-  std::vector<bool> validity({1, 1, 1, 0, 1, 1, 1, 1});
+  std::vector<bool> validity({true, true, true, false, true, true, true, true});
   cudf::test::dictionary_column_wrapper<T> col_nulls(v.begin(), v.end(), validity.begin());
   EXPECT_EQ(this
               ->template reduction_test<double>(
@@ -2629,7 +2632,7 @@ TYPED_TEST(DictionaryReductionTest, Quantile)
             64.0);
 
   // test with nulls
-  std::vector<bool> validity({1, 1, 1, 0, 1, 1, 1, 1});
+  std::vector<bool> validity({true, true, true, false, true, true, true, true});
   cudf::test::dictionary_column_wrapper<T> col_nulls(v.begin(), v.end(), validity.begin());
 
   EXPECT_EQ(this
@@ -2686,7 +2689,7 @@ TEST_F(ListReductionTest, ListReductionNthElement)
     *cudf::make_nth_element_aggregation<reduce_aggregation>(2, cudf::null_policy::INCLUDE));
 
   // test with null-exclude
-  std::vector<bool> validity{1, 0, 0, 1, 1, 0};
+  std::vector<bool> validity{true, false, false, true, true, false};
   LCW col_nulls({{-3}, {2, 1}, {0, 5, -3}, {-2}, {}, {28}}, validity.begin());
   this->reduction_test(
     col_nulls,
@@ -2709,7 +2712,7 @@ TEST_F(ListReductionTest, NestedListReductionNthElement)
   using LCW = cudf::test::lists_column_wrapper<int>;
 
   // test without nulls
-  auto validity    = std::vector<bool>{1, 0, 0, 1, 1};
+  auto validity    = std::vector<bool>{true, false, false, true, true};
   auto nested_list = LCW(
     {{LCW{}, LCW{2, 3, 4}}, {}, {LCW{5}, LCW{6}, LCW{7, 8}}, {LCW{9, 10}}, {LCW{11}, LCW{12, 13}}},
     validity.begin());
@@ -2743,7 +2746,7 @@ TEST_F(ListReductionTest, NonValidListReductionNthElement)
   using ElementCol = cudf::test::fixed_width_column_wrapper<int>;
 
   // test against col.size() <= col.null_count()
-  std::vector<bool> validity{0};
+  std::vector<bool> validity{false};
   this->reduction_test(
     LCW{{{1, 2}}, validity.begin()},
     ElementCol{},  // expected_value,
@@ -2891,8 +2894,9 @@ TEST_F(StructReductionTest, StructReductionNthElement)
   // test without nulls
   auto child0 = *ICW{-3, 2, 1, 0, 5, -3, -2, 28}.release();
   auto child1 = *ICW{0, 1, 2, 3, 4, 5, 6, 7}.release();
-  auto child2 =
-    *ICW{{-10, 10, -100, 100, -1000, 1000, -10000, 10000}, {1, 0, 0, 1, 1, 1, 0, 1}}.release();
+  auto child2 = *ICW{{-10, 10, -100, 100, -1000, 1000, -10000, 10000},
+                     {true, false, false, true, true, true, false, true}}
+                   .release();
   std::vector<std::unique_ptr<cudf::column>> input_vector;
   input_vector.push_back(std::make_unique<cudf::column>(child0));
   input_vector.push_back(std::make_unique<cudf::column>(child1));
@@ -2900,7 +2904,7 @@ TEST_F(StructReductionTest, StructReductionNthElement)
   auto struct_col  = SCW(std::move(input_vector));
   auto result_col0 = ICW{1};
   auto result_col1 = ICW{2};
-  auto result_col2 = ICW{{0}, {0}};
+  auto result_col2 = ICW{{0}, {false}};
   this->reduction_test(
     struct_col,
     cudf::table_view{{result_col0, result_col1, result_col2}},  // expected_value,
@@ -2909,15 +2913,15 @@ TEST_F(StructReductionTest, StructReductionNthElement)
     *cudf::make_nth_element_aggregation<reduce_aggregation>(2, cudf::null_policy::INCLUDE));
 
   // test with null-include
-  std::vector<bool> validity{1, 1, 1, 0, 1, 0, 0, 1};
+  std::vector<bool> validity{true, true, true, false, true, false, false, true};
   input_vector.clear();
   input_vector.push_back(std::make_unique<cudf::column>(child0));
   input_vector.push_back(std::make_unique<cudf::column>(child1));
   input_vector.push_back(std::make_unique<cudf::column>(child2));
   struct_col  = SCW(std::move(input_vector), validity);
-  result_col0 = ICW{{0}, {0}};
-  result_col1 = ICW{{0}, {0}};
-  result_col2 = ICW{{0}, {0}};
+  result_col0 = ICW{{0}, {false}};
+  result_col1 = ICW{{0}, {false}};
+  result_col2 = ICW{{0}, {false}};
   this->reduction_test(
     struct_col,
     cudf::table_view{{result_col0, result_col1, result_col2}},  // expected_value,
@@ -2926,9 +2930,9 @@ TEST_F(StructReductionTest, StructReductionNthElement)
     *cudf::make_nth_element_aggregation<reduce_aggregation>(6, cudf::null_policy::INCLUDE));
 
   // test with null-exclude
-  result_col0 = ICW{{28}, {1}};
-  result_col1 = ICW{{7}, {1}};
-  result_col2 = ICW{{10000}, {1}};
+  result_col0 = ICW{{28}, {true}};
+  result_col1 = ICW{{7}, {true}};
+  result_col2 = ICW{{10000}, {true}};
   this->reduction_test(
     struct_col,
     cudf::table_view{{result_col0, result_col1, result_col2}},  // expected_value,
@@ -2942,15 +2946,16 @@ TEST_F(StructReductionTest, NestedStructReductionNthElement)
   using ICW = cudf::test::fixed_width_column_wrapper<int>;
   using LCW = cudf::test::lists_column_wrapper<int>;
 
-  auto int_col0      = ICW{-4, -3, -2, -1, 0};
-  auto struct_col0   = SCW({int_col0}, std::vector<bool>{1, 0, 0, 1, 1});
-  auto int_col1      = ICW{0, 1, 2, 3, 4};
-  auto list_col      = LCW{{0}, {}, {1, 2}, {3}, {4}};
-  auto struct_col1   = SCW({struct_col0, int_col1, list_col}, std::vector<bool>{1, 1, 1, 0, 1});
+  auto int_col0    = ICW{-4, -3, -2, -1, 0};
+  auto struct_col0 = SCW({int_col0}, std::vector<bool>{true, false, false, true, true});
+  auto int_col1    = ICW{0, 1, 2, 3, 4};
+  auto list_col    = LCW{{0}, {}, {1, 2}, {3}, {4}};
+  auto struct_col1 =
+    SCW({struct_col0, int_col1, list_col}, std::vector<bool>{true, true, true, false, true});
   auto result_child0 = ICW{0};
-  auto result_col0   = SCW({result_child0}, std::vector<bool>{0});
-  auto result_col1   = ICW{{1}, {1}};
-  auto result_col2   = LCW({LCW{}}, std::vector<bool>{1}.begin());
+  auto result_col0   = SCW({result_child0}, std::vector<bool>{false});
+  auto result_col1   = ICW{{1}, {true}};
+  auto result_col2   = LCW({LCW{}}, std::vector<bool>{true}.begin());
   // test without nulls
   this->reduction_test(
     struct_col1,
@@ -2961,9 +2966,9 @@ TEST_F(StructReductionTest, NestedStructReductionNthElement)
 
   // test with null-include
   result_child0 = ICW{0};
-  result_col0   = SCW({result_child0}, std::vector<bool>{0});
-  result_col1   = ICW{{0}, {0}};
-  result_col2   = LCW({LCW{3}}, std::vector<bool>{0}.begin());
+  result_col0   = SCW({result_child0}, std::vector<bool>{false});
+  result_col1   = ICW{{0}, {false}};
+  result_col2   = LCW({LCW{3}}, std::vector<bool>{false}.begin());
   this->reduction_test(
     struct_col1,
     cudf::table_view{{result_col0, result_col1, result_col2}},  // expected_value,
@@ -2973,9 +2978,9 @@ TEST_F(StructReductionTest, NestedStructReductionNthElement)
 
   // test with null-exclude
   result_child0 = ICW{0};
-  result_col0   = SCW({result_child0}, std::vector<bool>{1});
-  result_col1   = ICW{{4}, {1}};
-  result_col2   = LCW({LCW{4}}, std::vector<bool>{1}.begin());
+  result_col0   = SCW({result_child0}, std::vector<bool>{true});
+  result_col1   = ICW{{4}, {true}};
+  result_col2   = LCW({LCW{4}}, std::vector<bool>{true}.begin());
   this->reduction_test(
     struct_col1,
     cudf::table_view{{result_col0, result_col1, result_col2}},  // expected_value,
@@ -2991,11 +2996,11 @@ TEST_F(StructReductionTest, NonValidStructReductionNthElement)
   // test against col.size() <= col.null_count()
   auto child0     = ICW{-3, 3};
   auto child1     = ICW{0, 0};
-  auto child2     = ICW{{-10, 10}, {0, 1}};
-  auto struct_col = SCW{{child0, child1, child2}, {0, 0}};
-  auto ret_col0   = ICW{{0}, {0}};
-  auto ret_col1   = ICW{{0}, {0}};
-  auto ret_col2   = ICW{{0}, {0}};
+  auto child2     = ICW{{-10, 10}, {false, true}};
+  auto struct_col = SCW{{child0, child1, child2}, {false, false}};
+  auto ret_col0   = ICW{{0}, {false}};
+  auto ret_col1   = ICW{{0}, {false}};
+  auto ret_col2   = ICW{{0}, {false}};
   this->reduction_test(
     struct_col,
     cudf::table_view{{ret_col0, ret_col1, ret_col2}},  // expected_value,
diff --git a/cpp/tests/reductions/scan_tests.cpp b/cpp/tests/reductions/scan_tests.cpp
index 161b1ee61ac..76dbbaef491 100644
--- a/cpp/tests/reductions/scan_tests.cpp
+++ b/cpp/tests/reductions/scan_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -217,7 +217,8 @@ TYPED_TEST_SUITE(ScanTest, TestTypes);
 TYPED_TEST(ScanTest, Min)
 {
   auto const v = make_vector<TypeParam>({123, 64, 63, 99, -5, 123, -16, -120, -111});
-  auto const b = thrust::host_vector<bool>(std::vector<bool>{1, 0, 1, 1, 1, 1, 0, 0, 1});
+  auto const b = thrust::host_vector<bool>(
+    std::vector<bool>{true, false, true, true, true, true, false, false, true});
 
   // no nulls
   this->scan_test(v, {}, *cudf::make_min_aggregation<scan_aggregation>(), scan_type::INCLUSIVE);
@@ -249,7 +250,8 @@ TYPED_TEST(ScanTest, Min)
 TYPED_TEST(ScanTest, Max)
 {
   auto const v = make_vector<TypeParam>({-120, 5, 0, -120, -111, 64, 63, 99, 123, -16});
-  auto const b = thrust::host_vector<bool>(std::vector<bool>{1, 0, 1, 1, 1, 1, 0, 1, 0, 1});
+  auto const b = thrust::host_vector<bool>(
+    std::vector<bool>{true, false, true, true, true, true, false, true, false, true});
 
   // inclusive
   // no nulls
@@ -282,7 +284,7 @@ TYPED_TEST(ScanTest, Max)
 TYPED_TEST(ScanTest, Product)
 {
   auto const v = make_vector<TypeParam>({5, -1, 1, 3, -2, 4});
-  auto const b = thrust::host_vector<bool>(std::vector<bool>{1, 1, 1, 0, 1, 1});
+  auto const b = thrust::host_vector<bool>(std::vector<bool>{true, true, true, false, true, true});
 
   // no nulls
   this->scan_test(v, {}, *cudf::make_product_aggregation<scan_aggregation>(), scan_type::INCLUSIVE);
@@ -318,7 +320,8 @@ TYPED_TEST(ScanTest, Sum)
       return make_vector<TypeParam>({-120, 5, 6, 113, -111, 64, -63, 9, 34, -16});
     return make_vector<TypeParam>({12, 5, 6, 13, 11, 14, 3, 9, 34, 16});
   }();
-  auto const b = thrust::host_vector<bool>(std::vector<bool>{1, 0, 1, 1, 0, 0, 1, 1, 1, 1});
+  auto const b = thrust::host_vector<bool>(
+    std::vector<bool>{true, false, true, true, false, false, true, true, true, true});
 
   // no nulls
   this->scan_test(v, {}, *cudf::make_sum_aggregation<scan_aggregation>(), scan_type::INCLUSIVE);
@@ -379,7 +382,7 @@ TYPED_TEST(ScanTest, EmptyColumn)
 TYPED_TEST(ScanTest, LeadingNulls)
 {
   auto const v = make_vector<TypeParam>({100, 200, 300});
-  auto const b = thrust::host_vector<bool>(std::vector<bool>{0, 1, 1});
+  auto const b = thrust::host_vector<bool>(std::vector<bool>{false, true, true});
 
   // skipna = true (default)
   this->scan_test(v,
diff --git a/cpp/tests/reductions/segmented_reduction_tests.cpp b/cpp/tests/reductions/segmented_reduction_tests.cpp
index 21a5c0c176c..37efc116d2a 100644
--- a/cpp/tests/reductions/segmented_reduction_tests.cpp
+++ b/cpp/tests/reductions/segmented_reduction_tests.cpp
@@ -796,7 +796,7 @@ TEST_F(SegmentedReductionTestUntyped, Mean)
   auto const output_type = cudf::data_type{cudf::type_id::FLOAT32};
 
   auto const expected =
-    cudf::test::fixed_width_column_wrapper<float>{{10, 0, 30, 70}, {1, 0, 1, 1}};
+    cudf::test::fixed_width_column_wrapper<float>{{10, 0, 30, 70}, {true, false, true, true}};
   auto result =
     cudf::segmented_reduce(input, d_offsets, *agg, output_type, cudf::null_policy::INCLUDE);
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
@@ -807,19 +807,21 @@ TEST_F(SegmentedReductionTestUntyped, Mean)
 TEST_F(SegmentedReductionTestUntyped, MeanNulls)
 {
   auto const input = cudf::test::fixed_width_column_wrapper<int32_t>(
-    {10, 20, 30, 40, 50, 60, 0, 80, 90}, {1, 1, 1, 1, 1, 1, 0, 1, 1});
+    {10, 20, 30, 40, 50, 60, 0, 80, 90}, {true, true, true, true, true, true, false, true, true});
   auto const offsets   = std::vector<cudf::size_type>{0, 1, 1, 4, 9};
   auto const d_offsets = cudf::detail::make_device_uvector_async(
     offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
   auto const agg         = cudf::make_mean_aggregation<cudf::segmented_reduce_aggregation>();
   auto const output_type = cudf::data_type{cudf::type_id::FLOAT64};
 
-  auto expected = cudf::test::fixed_width_column_wrapper<double>{{10, 0, 30, 70}, {1, 0, 1, 1}};
+  auto expected =
+    cudf::test::fixed_width_column_wrapper<double>{{10, 0, 30, 70}, {true, false, true, true}};
   auto result =
     cudf::segmented_reduce(input, d_offsets, *agg, output_type, cudf::null_policy::EXCLUDE);
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
 
-  expected = cudf::test::fixed_width_column_wrapper<double>{{10, 0, 30, 0}, {1, 0, 1, 0}};
+  expected =
+    cudf::test::fixed_width_column_wrapper<double>{{10, 0, 30, 0}, {true, false, true, false}};
   result = cudf::segmented_reduce(input, d_offsets, *agg, output_type, cudf::null_policy::INCLUDE);
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
 }
@@ -834,8 +836,8 @@ TEST_F(SegmentedReductionTestUntyped, SumOfSquares)
   auto const agg = cudf::make_sum_of_squares_aggregation<cudf::segmented_reduce_aggregation>();
   auto const output_type = cudf::data_type{cudf::type_id::INT32};
 
-  auto const expected =
-    cudf::test::fixed_width_column_wrapper<int32_t>{{100, 0, 2900, 25500}, {1, 0, 1, 1}};
+  auto const expected = cudf::test::fixed_width_column_wrapper<int32_t>{{100, 0, 2900, 25500},
+                                                                        {true, false, true, true}};
 
   auto result =
     cudf::segmented_reduce(input, d_offsets, *agg, output_type, cudf::null_policy::INCLUDE);
@@ -847,20 +849,21 @@ TEST_F(SegmentedReductionTestUntyped, SumOfSquares)
 TEST_F(SegmentedReductionTestUntyped, SumOfSquaresNulls)
 {
   auto const input = cudf::test::fixed_width_column_wrapper<int32_t>(
-    {10, 20, 30, 40, 50, 60, 0, 80, 90}, {1, 1, 1, 1, 1, 1, 0, 1, 1});
+    {10, 20, 30, 40, 50, 60, 0, 80, 90}, {true, true, true, true, true, true, false, true, true});
   auto const offsets   = std::vector<cudf::size_type>{0, 1, 1, 4, 9};
   auto const d_offsets = cudf::detail::make_device_uvector_async(
     offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
   auto const agg = cudf::make_sum_of_squares_aggregation<cudf::segmented_reduce_aggregation>();
   auto const output_type = cudf::data_type{cudf::type_id::INT64};
 
-  auto expected =
-    cudf::test::fixed_width_column_wrapper<int64_t>{{100, 0, 2900, 20600}, {1, 0, 1, 1}};
+  auto expected = cudf::test::fixed_width_column_wrapper<int64_t>{{100, 0, 2900, 20600},
+                                                                  {true, false, true, true}};
   auto result =
     cudf::segmented_reduce(input, d_offsets, *agg, output_type, cudf::null_policy::EXCLUDE);
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
 
-  expected = cudf::test::fixed_width_column_wrapper<int64_t>{{100, 0, 2900, 0}, {1, 0, 1, 0}};
+  expected =
+    cudf::test::fixed_width_column_wrapper<int64_t>{{100, 0, 2900, 0}, {true, false, true, false}};
   result = cudf::segmented_reduce(input, d_offsets, *agg, output_type, cudf::null_policy::INCLUDE);
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
 }
@@ -877,7 +880,7 @@ TEST_F(SegmentedReductionTestUntyped, StandardDeviation)
   auto const output_type = cudf::data_type{cudf::type_id::FLOAT32};
 
   auto expected = cudf::test::fixed_width_column_wrapper<float>{
-    {NaN, 0.f, 10.f, static_cast<float>(std::sqrt(250.))}, {1, 0, 1, 1}};
+    {NaN, 0.f, 10.f, static_cast<float>(std::sqrt(250.))}, {true, false, true, true}};
   auto result =
     cudf::segmented_reduce(input, d_offsets, *agg, output_type, cudf::null_policy::INCLUDE);
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
@@ -889,7 +892,7 @@ TEST_F(SegmentedReductionTestUntyped, StandardDeviationNulls)
 {
   constexpr double NaN{std::numeric_limits<double>::quiet_NaN()};
   auto const input = cudf::test::fixed_width_column_wrapper<int32_t>(
-    {10, 0, 20, 30, 54, 63, 0, 72, 81}, {1, 0, 1, 1, 1, 1, 0, 1, 1});
+    {10, 0, 20, 30, 54, 63, 0, 72, 81}, {true, false, true, true, true, true, false, true, true});
   auto const offsets   = std::vector<cudf::size_type>{0, 1, 1, 4, 9};
   auto const d_offsets = cudf::detail::make_device_uvector_async(
     offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
@@ -897,12 +900,13 @@ TEST_F(SegmentedReductionTestUntyped, StandardDeviationNulls)
   auto const output_type = cudf::data_type{cudf::type_id::FLOAT64};
 
   auto expected = cudf::test::fixed_width_column_wrapper<double>{
-    {NaN, 0., std::sqrt(50.), std::sqrt(135.)}, {1, 0, 1, 1}};
+    {NaN, 0., std::sqrt(50.), std::sqrt(135.)}, {true, false, true, true}};
   auto result =
     cudf::segmented_reduce(input, d_offsets, *agg, output_type, cudf::null_policy::EXCLUDE);
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
 
-  expected = cudf::test::fixed_width_column_wrapper<double>{{NaN, 0., 0., 0.}, {1, 0, 0, 0}};
+  expected =
+    cudf::test::fixed_width_column_wrapper<double>{{NaN, 0., 0., 0.}, {true, false, false, false}};
   result = cudf::segmented_reduce(input, d_offsets, *agg, output_type, cudf::null_policy::INCLUDE);
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
 }
@@ -918,8 +922,8 @@ TEST_F(SegmentedReductionTestUntyped, Variance)
   auto const agg         = cudf::make_variance_aggregation<cudf::segmented_reduce_aggregation>();
   auto const output_type = cudf::data_type{cudf::type_id::FLOAT32};
 
-  auto expected =
-    cudf::test::fixed_width_column_wrapper<float>{{NaN, 0.f, 100.f, 250.f}, {1, 0, 1, 1}};
+  auto expected = cudf::test::fixed_width_column_wrapper<float>{{NaN, 0.f, 100.f, 250.f},
+                                                                {true, false, true, true}};
   auto result =
     cudf::segmented_reduce(input, d_offsets, *agg, output_type, cudf::null_policy::INCLUDE);
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
@@ -931,7 +935,7 @@ TEST_F(SegmentedReductionTestUntyped, VarianceNulls)
 {
   constexpr double NaN{std::numeric_limits<double>::quiet_NaN()};
   auto const input = cudf::test::fixed_width_column_wrapper<int32_t>(
-    {10, 0, 20, 30, 54, 63, 0, 72, 81}, {1, 0, 1, 1, 1, 1, 0, 1, 1});
+    {10, 0, 20, 30, 54, 63, 0, 72, 81}, {true, false, true, true, true, true, false, true, true});
   auto const offsets   = std::vector<cudf::size_type>{0, 1, 1, 4, 9};
   auto const d_offsets = cudf::detail::make_device_uvector_async(
     offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
@@ -939,12 +943,13 @@ TEST_F(SegmentedReductionTestUntyped, VarianceNulls)
   auto const output_type = cudf::data_type{cudf::type_id::FLOAT64};
 
   auto expected =
-    cudf::test::fixed_width_column_wrapper<double>{{NaN, 0., 50., 135.}, {1, 0, 1, 1}};
+    cudf::test::fixed_width_column_wrapper<double>{{NaN, 0., 50., 135.}, {true, false, true, true}};
   auto result =
     cudf::segmented_reduce(input, d_offsets, *agg, output_type, cudf::null_policy::EXCLUDE);
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
 
-  expected = cudf::test::fixed_width_column_wrapper<double>{{NaN, 0., 0., 0.}, {1, 0, 0, 0}};
+  expected =
+    cudf::test::fixed_width_column_wrapper<double>{{NaN, 0., 0., 0.}, {true, false, false, false}};
   result = cudf::segmented_reduce(input, d_offsets, *agg, output_type, cudf::null_policy::INCLUDE);
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
 }
@@ -959,8 +964,8 @@ TEST_F(SegmentedReductionTestUntyped, NUnique)
   auto const agg         = cudf::make_nunique_aggregation<cudf::segmented_reduce_aggregation>();
   auto const output_type = cudf::data_type{cudf::type_id::INT32};
 
-  auto expected =
-    cudf::test::fixed_width_column_wrapper<cudf::size_type>{{1, 0, 1, 2, 3}, {1, 0, 1, 1, 1}};
+  auto expected = cudf::test::fixed_width_column_wrapper<cudf::size_type>{
+    {1, 0, 1, 2, 3}, {true, false, true, true, true}};
   auto result =
     cudf::segmented_reduce(input, d_offsets, *agg, output_type, cudf::null_policy::EXCLUDE);
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
@@ -972,21 +977,21 @@ TEST_F(SegmentedReductionTestUntyped, NUnique)
 TEST_F(SegmentedReductionTestUntyped, NUniqueNulls)
 {
   auto const input = cudf::test::fixed_width_column_wrapper<int32_t>(
-    {10, 0, 20, 30, 60, 60, 70, 70, 0}, {1, 0, 1, 1, 1, 1, 1, 1, 0});
+    {10, 0, 20, 30, 60, 60, 70, 70, 0}, {true, false, true, true, true, true, true, true, false});
   auto const offsets   = std::vector<cudf::size_type>{0, 1, 1, 2, 4, 9};
   auto const d_offsets = cudf::detail::make_device_uvector_async(
     offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
   auto const agg         = cudf::make_nunique_aggregation<cudf::segmented_reduce_aggregation>();
   auto const output_type = cudf::data_type{cudf::type_id::INT32};
 
-  auto expected =
-    cudf::test::fixed_width_column_wrapper<cudf::size_type>{{1, 0, 0, 2, 2}, {1, 0, 0, 1, 1}};
+  auto expected = cudf::test::fixed_width_column_wrapper<cudf::size_type>{
+    {1, 0, 0, 2, 2}, {true, false, false, true, true}};
   auto result =
     cudf::segmented_reduce(input, d_offsets, *agg, output_type, cudf::null_policy::EXCLUDE);
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
 
-  expected =
-    cudf::test::fixed_width_column_wrapper<cudf::size_type>{{1, 0, 1, 2, 3}, {1, 0, 1, 1, 1}};
+  expected = cudf::test::fixed_width_column_wrapper<cudf::size_type>{
+    {1, 0, 1, 2, 3}, {true, false, true, true, true}};
   result = cudf::segmented_reduce(input, d_offsets, *agg, output_type, cudf::null_policy::INCLUDE);
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
 }
@@ -994,7 +999,7 @@ TEST_F(SegmentedReductionTestUntyped, NUniqueNulls)
 TEST_F(SegmentedReductionTestUntyped, Errors)
 {
   auto const input = cudf::test::fixed_width_column_wrapper<int32_t>(
-    {10, 0, 20, 30, 54, 63, 0, 72, 81}, {1, 0, 1, 1, 1, 1, 0, 1, 1});
+    {10, 0, 20, 30, 54, 63, 0, 72, 81}, {true, false, true, true, true, true, false, true, true});
   auto const offsets   = std::vector<cudf::size_type>{0, 1, 1, 4, 9};
   auto const d_offsets = cudf::detail::make_device_uvector_async(
     offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
@@ -1104,8 +1109,8 @@ TEST_F(SegmentedReductionTestUntyped, EmptyInputWithOffsets)
   auto const offsets   = std::vector<cudf::size_type>{0, 0, 0, 0, 0, 0};
   auto const d_offsets = cudf::detail::make_device_uvector_async(
     offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
-  auto const expect =
-    cudf::test::fixed_width_column_wrapper<int32_t>{{XXX, XXX, XXX, XXX, XXX}, {0, 0, 0, 0, 0}};
+  auto const expect = cudf::test::fixed_width_column_wrapper<int32_t>{
+    {XXX, XXX, XXX, XXX, XXX}, {false, false, false, false, false}};
 
   auto aggregates =
     std::vector<std::unique_ptr<cudf::segmented_reduce_aggregation,
@@ -1123,8 +1128,8 @@ TEST_F(SegmentedReductionTestUntyped, EmptyInputWithOffsets)
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, expect);
   }
 
-  auto const expect_bool =
-    cudf::test::fixed_width_column_wrapper<bool>{{XXX, XXX, XXX, XXX, XXX}, {0, 0, 0, 0, 0}};
+  auto const expect_bool = cudf::test::fixed_width_column_wrapper<bool>{
+    {XXX, XXX, XXX, XXX, XXX}, {false, false, false, false, false}};
 
   auto result =
     cudf::segmented_reduce(input,
@@ -1498,7 +1503,7 @@ TEST_F(SegmentedReductionStringTest, EmptyInputWithOffsets)
   auto const offsets   = std::vector<cudf::size_type>{0, 0, 0, 0};
   auto const d_offsets = cudf::detail::make_device_uvector_async(
     offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
-  auto const expect = cudf::test::strings_column_wrapper({XXX, XXX, XXX}, {0, 0, 0});
+  auto const expect = cudf::test::strings_column_wrapper({XXX, XXX, XXX}, {false, false, false});
 
   auto result =
     cudf::segmented_reduce(input,
diff --git a/cpp/tests/reshape/byte_cast_tests.cpp b/cpp/tests/reshape/byte_cast_tests.cpp
index 0b4cd0c9b40..cd280302677 100644
--- a/cpp/tests/reshape/byte_cast_tests.cpp
+++ b/cpp/tests/reshape/byte_cast_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -54,7 +54,8 @@ TEST_F(ByteCastTest, int16ValuesWithNulls)
     cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; });
 
   cudf::test::fixed_width_column_wrapper<int16_t> const int16_col(
-    {short(0), short(100), short(-100), limits::min(), limits::max()}, {0, 1, 0, 1, 0});
+    {short(0), short(100), short(-100), limits::min(), limits::max()},
+    {false, true, false, true, false});
 
   auto int16_data = cudf::test::fixed_width_column_wrapper<uint8_t>{0x00, 0x64, 0x80, 0x00};
   auto [null_mask, null_count] = cudf::test::detail::make_null_mask(odd_validity, odd_validity + 5);
@@ -99,7 +100,7 @@ TEST_F(ByteCastTest, int32ValuesWithNulls)
     cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i + 1) % 2; });
 
   cudf::test::fixed_width_column_wrapper<int32_t> const int32_col(
-    {0, 100, -100, limits::min(), limits::max()}, {1, 0, 1, 0, 1});
+    {0, 100, -100, limits::min(), limits::max()}, {true, false, true, false, true});
 
   auto int32_data = cudf::test::fixed_width_column_wrapper<uint8_t>{
     0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0x9c, 0x7f, 0xff, 0xff, 0xff};
@@ -154,7 +155,8 @@ TEST_F(ByteCastTest, int64ValuesWithNulls)
     cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; });
 
   cudf::test::fixed_width_column_wrapper<int64_t> const int64_col(
-    {long(0), long(100), long(-100), limits::min(), limits::max()}, {0, 1, 0, 1, 0});
+    {long(0), long(100), long(-100), limits::min(), limits::max()},
+    {false, true, false, true, false});
 
   auto int64_data = cudf::test::fixed_width_column_wrapper<uint8_t>{
     0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x64, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
@@ -215,7 +217,8 @@ TEST_F(ByteCastTest, fp32ValuesWithNulls)
     cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i + 1) % 2; });
 
   cudf::test::fixed_width_column_wrapper<float> const fp32_col(
-    {float(0.0), float(100.0), float(-100.0), limits::min(), limits::max()}, {1, 0, 1, 0, 1});
+    {float(0.0), float(100.0), float(-100.0), limits::min(), limits::max()},
+    {true, false, true, false, true});
 
   auto fp32_data = cudf::test::fixed_width_column_wrapper<uint8_t>{
     0x00, 0x00, 0x00, 0x00, 0xc2, 0xc8, 0x00, 0x00, 0x7f, 0x7f, 0xff, 0xff};
@@ -286,7 +289,8 @@ TEST_F(ByteCastTest, fp64ValuesWithNulls)
     cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2; });
 
   cudf::test::fixed_width_column_wrapper<double> const fp64_col(
-    {double(0.0), double(100.0), double(-100.0), limits::min(), limits::max()}, {0, 1, 0, 1, 0});
+    {double(0.0), double(100.0), double(-100.0), limits::min(), limits::max()},
+    {false, true, false, true, false});
 
   auto fp64_data = cudf::test::fixed_width_column_wrapper<uint8_t>{
     0x40, 0x59, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
diff --git a/cpp/tests/rolling/collect_ops_test.cpp b/cpp/tests/rolling/collect_ops_test.cpp
index a6fa5b33c5b..f702dc78371 100644
--- a/cpp/tests/rolling/collect_ops_test.cpp
+++ b/cpp/tests/rolling/collect_ops_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -253,8 +253,9 @@ TYPED_TEST(TypedCollectListTest, RollingWindowWithNullInputsHonoursMinPeriods)
                            min_periods,
                            *cudf::make_collect_list_aggregation<cudf::rolling_aggregation>());
 
-    auto expected_result_child_values   = std::vector<int32_t>{0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4, 5};
-    auto expected_result_child_validity = std::vector<bool>{1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1};
+    auto expected_result_child_values = std::vector<int32_t>{0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4, 5};
+    auto expected_result_child_validity =
+      std::vector<bool>{true, false, true, false, true, true, true, true, false, true, false, true};
     auto expected_result_child =
       cudf::test::fixed_width_column_wrapper<T, int32_t>(expected_result_child_values.begin(),
                                                          expected_result_child_values.end(),
@@ -325,8 +326,9 @@ TYPED_TEST(TypedCollectListTest, RollingWindowWithNullInputsHonoursMinPeriods)
                            min_periods,
                            *cudf::make_collect_list_aggregation<cudf::rolling_aggregation>());
 
-    auto expected_result_child_values   = std::vector<int32_t>{0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5};
-    auto expected_result_child_validity = std::vector<bool>{1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1};
+    auto expected_result_child_values = std::vector<int32_t>{0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5};
+    auto expected_result_child_validity =
+      std::vector<bool>{true, false, true, true, false, true, true, false, true, true, false, true};
     auto expected_result_child =
       cudf::test::fixed_width_column_wrapper<T, int32_t>(expected_result_child_values.begin(),
                                                          expected_result_child_values.end(),
@@ -432,7 +434,7 @@ TEST_F(CollectListTest, RollingWindowHonoursMinPeriodsOnStrings)
                          *cudf::make_collect_list_aggregation<cudf::rolling_aggregation>());
   auto expected_result_2 = cudf::test::lists_column_wrapper<cudf::string_view>{
     {{}, {"0", "1", "2", "3"}, {"1", "2", "3", "4"}, {"2", "3", "4", "5"}, {}, {}},
-    cudf::detail::make_counting_transform_iterator(0, [num_elements](auto i) {
+    cudf::detail::make_counting_transform_iterator(0, [](auto i) {
       return i != 0 && i < 4;
     })}.release();
 
@@ -525,7 +527,7 @@ TEST_F(CollectListTest, RollingWindowHonoursMinPeriodsWithDecimal)
       cudf::test::fixed_width_column_wrapper<cudf::size_type>{0, 0, 4, 8, 12, 12, 12}.release();
     auto expected_num_rows = expected_offsets->size() - 1;
     auto null_mask_iter    = cudf::detail::make_counting_transform_iterator(
-      cudf::size_type{0}, [expected_num_rows](auto i) { return i > 0 && i < 4; });
+      cudf::size_type{0}, [](auto i) { return i > 0 && i < 4; });
 
     auto [null_mask, null_count] =
       cudf::test::detail::make_null_mask(null_mask_iter, null_mask_iter + expected_num_rows);
@@ -833,8 +835,9 @@ TEST_F(CollectListTest, GroupedTimeRangeRollingWindowOnStringsWithNulls)
       1, 1, 2, 2, 3, 1, 4, 5, 6};
   auto const group_column =
     cudf::test::fixed_width_column_wrapper<int32_t>{1, 1, 1, 1, 1, 2, 2, 2, 2};
-  auto const input_column = cudf::test::strings_column_wrapper{
-    {"10", "11", "12", "13", "14", "20", "21", "22", "23"}, {1, 0, 1, 1, 1, 1, 0, 1, 1}};
+  auto const input_column =
+    cudf::test::strings_column_wrapper{{"10", "11", "12", "13", "14", "20", "21", "22", "23"},
+                                       {true, false, true, true, true, true, false, true, true}};
   auto const preceding   = 2;
   auto const following   = 1;
   auto const min_periods = 1;
@@ -1148,8 +1151,9 @@ TEST_F(CollectListTest, GroupedTimeRangeRollingWindowOnStringsWithNullsAndMinPer
       1, 1, 2, 2, 3, 1, 4, 5, 6};
   auto const group_column =
     cudf::test::fixed_width_column_wrapper<int32_t>{1, 1, 1, 1, 1, 2, 2, 2, 2};
-  auto const input_column = cudf::test::strings_column_wrapper{
-    {"10", "11", "12", "13", "14", "20", "21", "22", "23"}, {1, 0, 1, 1, 1, 1, 0, 1, 1}};
+  auto const input_column =
+    cudf::test::strings_column_wrapper{{"10", "11", "12", "13", "14", "20", "21", "22", "23"},
+                                       {true, false, true, true, true, true, false, true, true}};
   auto const preceding   = 2;
   auto const following   = 1;
   auto const min_periods = 4;
@@ -1558,7 +1562,7 @@ TEST_F(CollectSetTest, RollingWindowHonoursMinPeriodsOnStrings)
                         *cudf::make_collect_set_aggregation<cudf::rolling_aggregation>());
   auto expected_result_2 = cudf::test::lists_column_wrapper<cudf::string_view>{
     {{}, {"0", "1", "2"}, {"1", "2", "4"}, {"2", "4"}, {}, {}},
-    cudf::detail::make_counting_transform_iterator(0, [num_elements](auto i) {
+    cudf::detail::make_counting_transform_iterator(0, [](auto i) {
       return i != 0 && i < 4;
     })}.release();
 
@@ -1650,7 +1654,7 @@ TEST_F(CollectSetTest, RollingWindowHonoursMinPeriodsWithDecimal)
       cudf::test::fixed_width_column_wrapper<cudf::size_type>{0, 0, 3, 7, 10, 10, 10}.release();
     auto expected_num_rows = expected_offsets->size() - 1;
     auto null_mask_iter    = cudf::detail::make_counting_transform_iterator(
-      cudf::size_type{0}, [expected_num_rows](auto i) { return i > 0 && i < 4; });
+      cudf::size_type{0}, [](auto i) { return i > 0 && i < 4; });
 
     auto [null_mask, null_count] =
       cudf::test::detail::make_null_mask(null_mask_iter, null_mask_iter + expected_num_rows);
diff --git a/cpp/tests/rolling/grouped_rolling_test.cpp b/cpp/tests/rolling/grouped_rolling_test.cpp
index a4abe5ee608..78d5daf7e83 100644
--- a/cpp/tests/rolling/grouped_rolling_test.cpp
+++ b/cpp/tests/rolling/grouped_rolling_test.cpp
@@ -338,7 +338,7 @@ class GroupedRollingTest : public cudf::test::BaseFixture {
 
     agg_op op;
     for (cudf::size_type i = 0; i < num_rows; i++) {
-      OutputType val = agg_op::template identity<OutputType>();
+      auto val = agg_op::template identity<OutputType>();
 
       // load sizes
       min_periods = std::max(min_periods, 1);  // at least one observation is required
@@ -458,7 +458,7 @@ TEST_F(GroupedRollingErrorTest, NegativeMinPeriods)
 {
   // Construct agg column.
   const std::vector<cudf::size_type> col_data{0, 1, 2, 0, 4};
-  const std::vector<bool> col_valid{1, 1, 1, 0, 1};
+  const std::vector<bool> col_valid{true, true, true, false, true};
   cudf::test::fixed_width_column_wrapper<cudf::size_type> input{
     col_data.begin(), col_data.end(), col_valid.begin()};
 
@@ -674,7 +674,7 @@ using GroupedRollingTestStrings = GroupedRollingTest<cudf::string_view>;
 TEST_F(GroupedRollingTestStrings, StringsUnsupportedOperators)
 {
   cudf::test::strings_column_wrapper input{{"This", "is", "not", "a", "string", "type"},
-                                           {1, 1, 1, 0, 1, 0}};
+                                           {true, true, true, false, true, false}};
 
   const cudf::size_type DATA_SIZE{static_cast<cudf::column_view>(input).size()};
   const std::vector<cudf::size_type> key_col_vec(DATA_SIZE, 0);
@@ -984,7 +984,7 @@ class GroupedTimeRangeRollingTest : public cudf::test::BaseFixture {
 
     agg_op op;
     for (cudf::size_type i = 0; i < num_rows; i++) {
-      OutputType val = agg_op::template identity<OutputType>();
+      auto val = agg_op::template identity<OutputType>();
 
       // load sizes
       min_periods = std::max(min_periods, 1);  // at least one observation is required
@@ -1272,7 +1272,8 @@ TYPED_TEST(TypedNullTimestampTestForRangeQueries, CountSingleGroupTimestampASCNu
                                                                  {1, 1, 1, 1, 1, 0, 1, 1, 1, 1}};
   auto const time_col =
     cudf::test::fixed_width_column_wrapper<cudf::timestamp_D, cudf::timestamp_D::rep>{
-      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, {0, 0, 0, 0, 1, 1, 1, 1, 1, 1}};
+      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9},
+      {false, false, false, false, true, true, true, true, true, true}};
 
   auto const grouping_keys = cudf::table_view{std::vector<cudf::column_view>{grp_col}};
   auto const preceding     = 1L;
@@ -1303,7 +1304,8 @@ TYPED_TEST(TypedNullTimestampTestForRangeQueries, CountSingleGroupTimestampASCNu
 
   auto const time_col =
     cudf::test::fixed_width_column_wrapper<cudf::timestamp_D, cudf::timestamp_D::rep>{
-      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, {1, 1, 1, 1, 1, 1, 0, 0, 0, 0}};
+      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9},
+      {true, true, true, true, true, true, false, false, false, false}};
 
   auto const grouping_keys = cudf::table_view{std::vector<cudf::column_view>{grp_col}};
   auto const preceding     = 1L;
@@ -1332,7 +1334,8 @@ TYPED_TEST(TypedNullTimestampTestForRangeQueries, CountMultiGroupTimestampASCNul
   auto const agg_col = cudf::test::fixed_width_column_wrapper<T>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
   auto const time_col =
     cudf::test::fixed_width_column_wrapper<cudf::timestamp_D, cudf::timestamp_D::rep>{
-      {1, 2, 2, 1, 2, 1, 2, 3, 4, 5}, {0, 0, 0, 1, 1, 0, 0, 1, 1, 1}};
+      {1, 2, 2, 1, 2, 1, 2, 3, 4, 5},
+      {false, false, false, true, true, false, false, true, true, true}};
 
   auto const grouping_keys = cudf::table_view{std::vector<cudf::column_view>{grp_col}};
   auto const preceding     = 1L;
@@ -1361,7 +1364,8 @@ TYPED_TEST(TypedNullTimestampTestForRangeQueries, CountMultiGroupTimestampASCNul
   auto const agg_col = cudf::test::fixed_width_column_wrapper<T>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
   auto const time_col =
     cudf::test::fixed_width_column_wrapper<cudf::timestamp_D, cudf::timestamp_D::rep>{
-      {1, 2, 2, 1, 3, 1, 2, 3, 4, 5}, {1, 1, 1, 0, 0, 1, 1, 1, 0, 0}};
+      {1, 2, 2, 1, 3, 1, 2, 3, 4, 5},
+      {true, true, true, false, false, true, true, true, false, false}};
 
   auto const grouping_keys = cudf::table_view{std::vector<cudf::column_view>{grp_col}};
   auto const preceding     = 1L;
@@ -1391,7 +1395,8 @@ TYPED_TEST(TypedNullTimestampTestForRangeQueries, CountSingleGroupTimestampDESCN
                                                                  {1, 1, 1, 1, 1, 0, 1, 1, 1, 1}};
   auto const time_col =
     cudf::test::fixed_width_column_wrapper<cudf::timestamp_D, cudf::timestamp_D::rep>{
-      {9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, {0, 0, 0, 0, 1, 1, 1, 1, 1, 1}};
+      {9, 8, 7, 6, 5, 4, 3, 2, 1, 0},
+      {false, false, false, false, true, true, true, true, true, true}};
 
   auto const grouping_keys = cudf::table_view{std::vector<cudf::column_view>{grp_col}};
   auto const preceding     = 1L;
@@ -1422,7 +1427,8 @@ TYPED_TEST(TypedNullTimestampTestForRangeQueries, CountSingleGroupTimestampDESCN
 
   auto const time_col =
     cudf::test::fixed_width_column_wrapper<cudf::timestamp_D, cudf::timestamp_D::rep>{
-      {9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, {1, 1, 1, 1, 1, 1, 0, 0, 0, 0}};
+      {9, 8, 7, 6, 5, 4, 3, 2, 1, 0},
+      {true, true, true, true, true, true, false, false, false, false}};
 
   auto const grouping_keys = cudf::table_view{std::vector<cudf::column_view>{grp_col}};
   auto const preceding     = 1L;
@@ -1451,7 +1457,8 @@ TYPED_TEST(TypedNullTimestampTestForRangeQueries, CountMultiGroupTimestampDESCNu
   auto const agg_col = cudf::test::fixed_width_column_wrapper<T>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
   auto const time_col =
     cudf::test::fixed_width_column_wrapper<cudf::timestamp_D, cudf::timestamp_D::rep>{
-      {4, 3, 2, 1, 0, 9, 8, 7, 6, 5}, {0, 0, 0, 1, 1, 0, 0, 1, 1, 1}};
+      {4, 3, 2, 1, 0, 9, 8, 7, 6, 5},
+      {false, false, false, true, true, false, false, true, true, true}};
 
   auto const grouping_keys = cudf::table_view{std::vector<cudf::column_view>{grp_col}};
   auto const preceding     = 1L;
@@ -1480,7 +1487,8 @@ TYPED_TEST(TypedNullTimestampTestForRangeQueries, CountMultiGroupTimestampDESCNu
   auto const agg_col = cudf::test::fixed_width_column_wrapper<T>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
   auto const time_col =
     cudf::test::fixed_width_column_wrapper<cudf::timestamp_D, cudf::timestamp_D::rep>{
-      {4, 3, 2, 1, 0, 9, 8, 7, 6, 5}, {1, 1, 1, 0, 0, 1, 1, 1, 0, 0}};
+      {4, 3, 2, 1, 0, 9, 8, 7, 6, 5},
+      {true, true, true, false, false, true, true, true, false, false}};
 
   auto const grouping_keys = cudf::table_view{std::vector<cudf::column_view>{grp_col}};
   auto const preceding     = 1L;
@@ -1511,7 +1519,8 @@ TYPED_TEST(TypedNullTimestampTestForRangeQueries, CountSingleGroupAllNullTimesta
 
   auto const time_col =
     cudf::test::fixed_width_column_wrapper<cudf::timestamp_D, cudf::timestamp_D::rep>{
-      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}};
+      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9},
+      {false, false, false, false, false, false, false, false, false, false}};
 
   auto const grouping_keys = cudf::table_view{std::vector<cudf::column_view>{grp_col}};
   auto const preceding     = 1L;
@@ -1542,7 +1551,8 @@ TYPED_TEST(TypedNullTimestampTestForRangeQueries, CountMultiGroupAllNullTimestam
 
   auto const time_col =
     cudf::test::fixed_width_column_wrapper<cudf::timestamp_D, cudf::timestamp_D::rep>{
-      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, {1, 1, 1, 1, 1, 0, 0, 0, 0, 0}};
+      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9},
+      {true, true, true, true, true, false, false, false, false, false}};
 
   auto const grouping_keys = cudf::table_view{std::vector<cudf::column_view>{grp_col}};
   auto const preceding     = 1L;
@@ -1584,7 +1594,8 @@ TYPED_TEST(TypedUnboundedWindowTest, UnboundedPrecedingWindowSingleGroupTimestam
                                                                  {1, 1, 1, 1, 1, 0, 1, 1, 1, 1}};
   auto const time_col =
     cudf::test::fixed_width_column_wrapper<cudf::timestamp_D, cudf::timestamp_D::rep>{
-      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, {0, 0, 0, 0, 1, 1, 1, 1, 1, 1}};
+      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9},
+      {false, false, false, false, true, true, true, true, true, true}};
 
   auto const grouping_keys       = cudf::table_view{std::vector<cudf::column_view>{grp_col}};
   auto const unbounded_preceding = cudf::window_bounds::unbounded();
@@ -1614,7 +1625,8 @@ TYPED_TEST(TypedUnboundedWindowTest, UnboundedFollowingWindowSingleGroupTimestam
                                                                  {1, 1, 1, 1, 1, 0, 1, 1, 1, 1}};
   auto const time_col =
     cudf::test::fixed_width_column_wrapper<cudf::timestamp_D, cudf::timestamp_D::rep>{
-      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, {0, 0, 0, 0, 1, 1, 1, 1, 1, 1}};
+      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9},
+      {false, false, false, false, true, true, true, true, true, true}};
 
   auto const grouping_keys       = cudf::table_view{std::vector<cudf::column_view>{grp_col}};
   auto const one_day_preceding   = cudf::window_bounds::get(1L);
@@ -1645,7 +1657,8 @@ TYPED_TEST(TypedUnboundedWindowTest,
                                                                  {1, 1, 1, 1, 1, 0, 1, 1, 1, 1}};
   auto const time_col =
     cudf::test::fixed_width_column_wrapper<cudf::timestamp_D, cudf::timestamp_D::rep>{
-      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, {0, 0, 0, 0, 1, 1, 1, 1, 1, 1}};
+      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9},
+      {false, false, false, false, true, true, true, true, true, true}};
 
   auto const grouping_keys       = cudf::table_view{std::vector<cudf::column_view>{grp_col}};
   auto const unbounded_preceding = cudf::window_bounds::unbounded();
@@ -1675,7 +1688,8 @@ TYPED_TEST(TypedUnboundedWindowTest, UnboundedPrecedingWindowSingleGroupTimestam
                                                                  {1, 1, 1, 1, 1, 0, 1, 1, 1, 1}};
   auto const time_col =
     cudf::test::fixed_width_column_wrapper<cudf::timestamp_D, cudf::timestamp_D::rep>{
-      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, {1, 1, 1, 1, 1, 1, 0, 0, 0, 0}};
+      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9},
+      {true, true, true, true, true, true, false, false, false, false}};
 
   auto const grouping_keys       = cudf::table_view{std::vector<cudf::column_view>{grp_col}};
   auto const unbounded_preceding = cudf::window_bounds::unbounded();
@@ -1705,7 +1719,8 @@ TYPED_TEST(TypedUnboundedWindowTest, UnboundedFollowingWindowSingleGroupTimestam
                                                                  {1, 1, 1, 1, 1, 0, 1, 1, 1, 1}};
   auto const time_col =
     cudf::test::fixed_width_column_wrapper<cudf::timestamp_D, cudf::timestamp_D::rep>{
-      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, {1, 1, 1, 1, 1, 1, 0, 0, 0, 0}};
+      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9},
+      {true, true, true, true, true, true, false, false, false, false}};
 
   auto const grouping_keys       = cudf::table_view{std::vector<cudf::column_view>{grp_col}};
   auto const one_day_preceding   = cudf::window_bounds::get(1L);
@@ -1736,7 +1751,8 @@ TYPED_TEST(TypedUnboundedWindowTest,
                                                                  {1, 1, 1, 1, 1, 0, 1, 1, 1, 1}};
   auto const time_col =
     cudf::test::fixed_width_column_wrapper<cudf::timestamp_D, cudf::timestamp_D::rep>{
-      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, {1, 1, 1, 1, 1, 1, 0, 0, 0, 0}};
+      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9},
+      {true, true, true, true, true, true, false, false, false, false}};
 
   auto const grouping_keys       = cudf::table_view{std::vector<cudf::column_view>{grp_col}};
   auto const unbounded_preceding = cudf::window_bounds::unbounded();
@@ -1766,7 +1782,8 @@ TYPED_TEST(TypedUnboundedWindowTest, UnboundedPrecedingWindowSingleGroupTimestam
                                                                  {1, 1, 1, 1, 1, 0, 1, 1, 1, 1}};
   auto const time_col =
     cudf::test::fixed_width_column_wrapper<cudf::timestamp_D, cudf::timestamp_D::rep>{
-      {9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, {0, 0, 0, 0, 1, 1, 1, 1, 1, 1}};
+      {9, 8, 7, 6, 5, 4, 3, 2, 1, 0},
+      {false, false, false, false, true, true, true, true, true, true}};
 
   auto const grouping_keys       = cudf::table_view{std::vector<cudf::column_view>{grp_col}};
   auto const unbounded_preceding = cudf::window_bounds::unbounded();
@@ -1796,7 +1813,8 @@ TYPED_TEST(TypedUnboundedWindowTest, UnboundedFollowingWindowSingleGroupTimestam
                                                                  {1, 1, 1, 1, 1, 0, 1, 1, 1, 1}};
   auto const time_col =
     cudf::test::fixed_width_column_wrapper<cudf::timestamp_D, cudf::timestamp_D::rep>{
-      {9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, {0, 0, 0, 0, 1, 1, 1, 1, 1, 1}};
+      {9, 8, 7, 6, 5, 4, 3, 2, 1, 0},
+      {false, false, false, false, true, true, true, true, true, true}};
 
   auto const grouping_keys       = cudf::table_view{std::vector<cudf::column_view>{grp_col}};
   auto const one_day_preceding   = cudf::window_bounds::get(1L);
@@ -1827,7 +1845,8 @@ TYPED_TEST(TypedUnboundedWindowTest,
                                                                  {1, 1, 1, 1, 1, 0, 1, 1, 1, 1}};
   auto const time_col =
     cudf::test::fixed_width_column_wrapper<cudf::timestamp_D, cudf::timestamp_D::rep>{
-      {9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, {0, 0, 0, 0, 1, 1, 1, 1, 1, 1}};
+      {9, 8, 7, 6, 5, 4, 3, 2, 1, 0},
+      {false, false, false, false, true, true, true, true, true, true}};
 
   auto const grouping_keys       = cudf::table_view{std::vector<cudf::column_view>{grp_col}};
   auto const unbounded_preceding = cudf::window_bounds::unbounded();
@@ -1857,7 +1876,8 @@ TYPED_TEST(TypedUnboundedWindowTest, UnboundedPrecedingWindowSingleGroupTimestam
                                                                  {1, 1, 1, 1, 1, 0, 1, 1, 1, 1}};
   auto const time_col =
     cudf::test::fixed_width_column_wrapper<cudf::timestamp_D, cudf::timestamp_D::rep>{
-      {9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, {1, 1, 1, 1, 1, 1, 0, 0, 0, 0}};
+      {9, 8, 7, 6, 5, 4, 3, 2, 1, 0},
+      {true, true, true, true, true, true, false, false, false, false}};
 
   auto const grouping_keys       = cudf::table_view{std::vector<cudf::column_view>{grp_col}};
   auto const unbounded_preceding = cudf::window_bounds::unbounded();
@@ -1887,7 +1907,8 @@ TYPED_TEST(TypedUnboundedWindowTest, UnboundedFollowingWindowSingleGroupTimestam
                                                                  {1, 1, 1, 1, 1, 0, 1, 1, 1, 1}};
   auto const time_col =
     cudf::test::fixed_width_column_wrapper<cudf::timestamp_D, cudf::timestamp_D::rep>{
-      {9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, {1, 1, 1, 1, 1, 1, 0, 0, 0, 0}};
+      {9, 8, 7, 6, 5, 4, 3, 2, 1, 0},
+      {true, true, true, true, true, true, false, false, false, false}};
 
   auto const grouping_keys       = cudf::table_view{std::vector<cudf::column_view>{grp_col}};
   auto const one_day_preceding   = cudf::window_bounds::get(1L);
@@ -1918,7 +1939,8 @@ TYPED_TEST(TypedUnboundedWindowTest,
                                                                  {1, 1, 1, 1, 1, 0, 1, 1, 1, 1}};
   auto const time_col =
     cudf::test::fixed_width_column_wrapper<cudf::timestamp_D, cudf::timestamp_D::rep>{
-      {9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, {1, 1, 1, 1, 1, 1, 0, 0, 0, 0}};
+      {9, 8, 7, 6, 5, 4, 3, 2, 1, 0},
+      {true, true, true, true, true, true, false, false, false, false}};
 
   auto const grouping_keys       = cudf::table_view{std::vector<cudf::column_view>{grp_col}};
   auto const unbounded_preceding = cudf::window_bounds::unbounded();
@@ -1947,7 +1969,8 @@ TYPED_TEST(TypedUnboundedWindowTest, UnboundedPrecedingCountMultiGroupTimestampA
   auto const agg_col = cudf::test::fixed_width_column_wrapper<T>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
   auto const time_col =
     cudf::test::fixed_width_column_wrapper<cudf::timestamp_D, cudf::timestamp_D::rep>{
-      {1, 2, 2, 1, 2, 1, 2, 3, 4, 5}, {0, 0, 0, 1, 1, 0, 0, 1, 1, 1}};
+      {1, 2, 2, 1, 2, 1, 2, 3, 4, 5},
+      {false, false, false, true, true, false, false, true, true, true}};
 
   auto const grouping_keys       = cudf::table_view{std::vector<cudf::column_view>{grp_col}};
   auto const unbounded_preceding = cudf::window_bounds::unbounded();
@@ -1976,7 +1999,8 @@ TYPED_TEST(TypedUnboundedWindowTest, UnboundedFollowingCountMultiGroupTimestampA
   auto const agg_col = cudf::test::fixed_width_column_wrapper<T>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
   auto const time_col =
     cudf::test::fixed_width_column_wrapper<cudf::timestamp_D, cudf::timestamp_D::rep>{
-      {1, 2, 2, 1, 2, 1, 2, 3, 4, 5}, {0, 0, 0, 1, 1, 0, 0, 1, 1, 1}};
+      {1, 2, 2, 1, 2, 1, 2, 3, 4, 5},
+      {false, false, false, true, true, false, false, true, true, true}};
 
   auto const grouping_keys       = cudf::table_view{std::vector<cudf::column_view>{grp_col}};
   auto const one_day_preceding   = cudf::window_bounds::get(1L);
@@ -2006,7 +2030,8 @@ TYPED_TEST(TypedUnboundedWindowTest,
   auto const agg_col = cudf::test::fixed_width_column_wrapper<T>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
   auto const time_col =
     cudf::test::fixed_width_column_wrapper<cudf::timestamp_D, cudf::timestamp_D::rep>{
-      {1, 2, 2, 1, 2, 1, 2, 3, 4, 5}, {0, 0, 0, 1, 1, 0, 0, 1, 1, 1}};
+      {1, 2, 2, 1, 2, 1, 2, 3, 4, 5},
+      {false, false, false, true, true, false, false, true, true, true}};
 
   auto const grouping_keys       = cudf::table_view{std::vector<cudf::column_view>{grp_col}};
   auto const unbounded_preceding = cudf::window_bounds::unbounded();
@@ -2035,7 +2060,8 @@ TYPED_TEST(TypedUnboundedWindowTest, UnboundedPrecedingCountMultiGroupTimestampA
   auto const agg_col = cudf::test::fixed_width_column_wrapper<T>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
   auto const time_col =
     cudf::test::fixed_width_column_wrapper<cudf::timestamp_D, cudf::timestamp_D::rep>{
-      {1, 2, 2, 1, 3, 1, 2, 3, 4, 5}, {1, 1, 1, 0, 0, 1, 1, 1, 0, 0}};
+      {1, 2, 2, 1, 3, 1, 2, 3, 4, 5},
+      {true, true, true, false, false, true, true, true, false, false}};
 
   auto const grouping_keys       = cudf::table_view{std::vector<cudf::column_view>{grp_col}};
   auto const unbounded_preceding = cudf::window_bounds::unbounded();
@@ -2064,7 +2090,8 @@ TYPED_TEST(TypedUnboundedWindowTest, UnboundedFollowingCountMultiGroupTimestampA
   auto const agg_col = cudf::test::fixed_width_column_wrapper<T>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
   auto const time_col =
     cudf::test::fixed_width_column_wrapper<cudf::timestamp_D, cudf::timestamp_D::rep>{
-      {1, 2, 2, 1, 3, 1, 2, 3, 4, 5}, {1, 1, 1, 0, 0, 1, 1, 1, 0, 0}};
+      {1, 2, 2, 1, 3, 1, 2, 3, 4, 5},
+      {true, true, true, false, false, true, true, true, false, false}};
 
   auto const grouping_keys       = cudf::table_view{std::vector<cudf::column_view>{grp_col}};
   auto const one_day_preceding   = cudf::window_bounds::get(1L);
@@ -2094,7 +2121,8 @@ TYPED_TEST(TypedUnboundedWindowTest,
   auto const agg_col = cudf::test::fixed_width_column_wrapper<T>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
   auto const time_col =
     cudf::test::fixed_width_column_wrapper<cudf::timestamp_D, cudf::timestamp_D::rep>{
-      {1, 2, 2, 1, 3, 1, 2, 3, 4, 5}, {1, 1, 1, 0, 0, 1, 1, 1, 0, 0}};
+      {1, 2, 2, 1, 3, 1, 2, 3, 4, 5},
+      {true, true, true, false, false, true, true, true, false, false}};
 
   auto const grouping_keys       = cudf::table_view{std::vector<cudf::column_view>{grp_col}};
   auto const unbounded_preceding = cudf::window_bounds::unbounded();
@@ -2123,7 +2151,8 @@ TYPED_TEST(TypedUnboundedWindowTest, UnboundedPrecedingCountMultiGroupTimestampD
   auto const agg_col = cudf::test::fixed_width_column_wrapper<T>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
   auto const time_col =
     cudf::test::fixed_width_column_wrapper<cudf::timestamp_D, cudf::timestamp_D::rep>{
-      {4, 3, 2, 1, 0, 9, 8, 7, 6, 5}, {0, 0, 0, 1, 1, 0, 0, 1, 1, 1}};
+      {4, 3, 2, 1, 0, 9, 8, 7, 6, 5},
+      {false, false, false, true, true, false, false, true, true, true}};
 
   auto const grouping_keys       = cudf::table_view{std::vector<cudf::column_view>{grp_col}};
   auto const unbounded_preceding = cudf::window_bounds::unbounded();
@@ -2152,7 +2181,8 @@ TYPED_TEST(TypedUnboundedWindowTest, UnboundedFollowingCountMultiGroupTimestampD
   auto const agg_col = cudf::test::fixed_width_column_wrapper<T>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
   auto const time_col =
     cudf::test::fixed_width_column_wrapper<cudf::timestamp_D, cudf::timestamp_D::rep>{
-      {4, 3, 2, 1, 0, 9, 8, 7, 6, 5}, {0, 0, 0, 1, 1, 0, 0, 1, 1, 1}};
+      {4, 3, 2, 1, 0, 9, 8, 7, 6, 5},
+      {false, false, false, true, true, false, false, true, true, true}};
 
   auto const grouping_keys       = cudf::table_view{std::vector<cudf::column_view>{grp_col}};
   auto const one_day_preceding   = cudf::window_bounds::get(1L);
@@ -2182,7 +2212,8 @@ TYPED_TEST(TypedUnboundedWindowTest,
   auto const agg_col = cudf::test::fixed_width_column_wrapper<T>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
   auto const time_col =
     cudf::test::fixed_width_column_wrapper<cudf::timestamp_D, cudf::timestamp_D::rep>{
-      {4, 3, 2, 1, 0, 9, 8, 7, 6, 5}, {0, 0, 0, 1, 1, 0, 0, 1, 1, 1}};
+      {4, 3, 2, 1, 0, 9, 8, 7, 6, 5},
+      {false, false, false, true, true, false, false, true, true, true}};
 
   auto const grouping_keys       = cudf::table_view{std::vector<cudf::column_view>{grp_col}};
   auto const unbounded_preceding = cudf::window_bounds::unbounded();
@@ -2211,7 +2242,8 @@ TYPED_TEST(TypedUnboundedWindowTest, UnboundedPrecedingCountMultiGroupTimestampD
   auto const agg_col = cudf::test::fixed_width_column_wrapper<T>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
   auto const time_col =
     cudf::test::fixed_width_column_wrapper<cudf::timestamp_D, cudf::timestamp_D::rep>{
-      {4, 3, 2, 1, 0, 9, 8, 7, 6, 5}, {1, 1, 1, 0, 0, 1, 1, 1, 0, 0}};
+      {4, 3, 2, 1, 0, 9, 8, 7, 6, 5},
+      {true, true, true, false, false, true, true, true, false, false}};
 
   auto const grouping_keys       = cudf::table_view{std::vector<cudf::column_view>{grp_col}};
   auto const unbounded_preceding = cudf::window_bounds::unbounded();
@@ -2240,7 +2272,8 @@ TYPED_TEST(TypedUnboundedWindowTest, UnboundedFollowingCountMultiGroupTimestampD
   auto const agg_col = cudf::test::fixed_width_column_wrapper<T>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
   auto const time_col =
     cudf::test::fixed_width_column_wrapper<cudf::timestamp_D, cudf::timestamp_D::rep>{
-      {4, 3, 2, 1, 0, 9, 8, 7, 6, 5}, {1, 1, 1, 0, 0, 1, 1, 1, 0, 0}};
+      {4, 3, 2, 1, 0, 9, 8, 7, 6, 5},
+      {true, true, true, false, false, true, true, true, false, false}};
 
   auto const grouping_keys       = cudf::table_view{std::vector<cudf::column_view>{grp_col}};
   auto const one_day_preceding   = cudf::window_bounds::get(1L);
@@ -2270,7 +2303,8 @@ TYPED_TEST(TypedUnboundedWindowTest,
   auto const agg_col = cudf::test::fixed_width_column_wrapper<T>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
   auto const time_col =
     cudf::test::fixed_width_column_wrapper<cudf::timestamp_D, cudf::timestamp_D::rep>{
-      {4, 3, 2, 1, 0, 9, 8, 7, 6, 5}, {1, 1, 1, 0, 0, 1, 1, 1, 0, 0}};
+      {4, 3, 2, 1, 0, 9, 8, 7, 6, 5},
+      {true, true, true, false, false, true, true, true, false, false}};
 
   auto const grouping_keys       = cudf::table_view{std::vector<cudf::column_view>{grp_col}};
   auto const unbounded_preceding = cudf::window_bounds::unbounded();
diff --git a/cpp/tests/rolling/range_rolling_window_test.cpp b/cpp/tests/rolling/range_rolling_window_test.cpp
index fcd0cc18019..461c41025e9 100644
--- a/cpp/tests/rolling/range_rolling_window_test.cpp
+++ b/cpp/tests/rolling/range_rolling_window_test.cpp
@@ -34,6 +34,7 @@
 #include <src/rolling/detail/range_window_bounds.hpp>
 #include <src/rolling/detail/rolling.hpp>
 
+#include <utility>
 #include <vector>
 
 template <typename T, typename R = int32_t>
@@ -57,12 +58,12 @@ struct window_exec {
               ScalarT preceding_scalar,
               ScalarT following_scalar,
               cudf::size_type min_periods = 1)
-    : gby_column(gby),
-      oby_column(oby),
+    : gby_column(std::move(gby)),
+      oby_column(std::move(oby)),
       order(ordering),
-      agg_column(agg),
-      preceding(preceding_scalar),
-      following(following_scalar),
+      agg_column(std::move(agg)),
+      preceding(std::move(preceding_scalar)),
+      following(std::move(following_scalar)),
       min_periods(min_periods)
   {
   }
@@ -170,7 +171,7 @@ TYPED_TEST(TypedTimeRangeRollingTest, TimestampASC)
   // clang-format off
   auto gby_column  = int_col { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1};
   auto agg_column  = int_col {{0, 8, 4, 6, 2, 9, 3, 5, 1, 7},
-                              {1, 1, 1, 1, 1, 1, 1, 1, 1, 0}};
+                              {true, true, true, true, true, true, true, true, true, false}};
   auto time_column = time_col{ 1, 5, 6, 8, 9, 2, 2, 3, 4, 9};
   // clang-format on
 
@@ -252,7 +253,7 @@ TYPED_TEST(TypedTimeRangeRollingTest, TimestampDESC)
   // clang-format off
   auto gby_column  = int_col { 5, 5, 5, 5, 5, 1, 1, 1, 1, 1};
   auto agg_column  = int_col {{7, 1, 5, 3, 9, 2, 6, 4, 8, 0},
-                              {0, 1, 1, 1, 1, 1, 1, 1, 1, 1}};
+                              {false, true, true, true, true, true, true, true, true, true}};
   auto time_column = time_col{ 9, 4, 3, 2, 2, 9, 8, 6, 5, 1};
   // clang-format on
 
@@ -281,7 +282,7 @@ TYPED_TEST(TypedIntegralRangeRollingTest, OrderByASC)
   // clang-format off
   auto gby_column = int_col { 0, 0, 0, 0, 0, 1, 1, 1, 1, 1};
   auto agg_column = int_col {{0, 8, 4, 6, 2, 9, 3, 5, 1, 7},
-                             {1, 1, 1, 1, 1, 1, 1, 1, 1, 0}};
+                             {true, true, true, true, true, true, true, true, true, false}};
   auto oby_column = fwcw<T>{  1, 5, 6, 8, 9, 2, 2, 3, 4, 9};
   // clang-format on
 
@@ -304,7 +305,7 @@ TYPED_TEST(TypedIntegralRangeRollingTest, OrderByDesc)
   // clang-format off
   auto gby_column  = int_col { 5, 5, 5, 5, 5, 1, 1, 1, 1, 1};
   auto agg_column  = int_col {{7, 1, 5, 3, 9, 2, 6, 4, 8, 0},
-                              {0, 1, 1, 1, 1, 1, 1, 1, 1, 1}};
+                              {false, true, true, true, true, true, true, true, true, true}};
   auto oby_column  = fwcw<T>{  9, 4, 3, 2, 2, 9, 8, 6, 5, 1};
   // clang-format on
 
@@ -418,8 +419,9 @@ TYPED_TEST(TypedRangeRollingNullsTest, CountMultiGroupOrderByASCNullsLast)
   // Aggregation column.
   auto const agg_col = cudf::test::fixed_width_column_wrapper<T>{0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
   // OrderBy column.
-  auto const oby_col = cudf::test::fixed_width_column_wrapper<T>{{1, 2, 2, 1, 3, 1, 2, 3, 4, 5},
-                                                                 {1, 1, 1, 0, 0, 1, 1, 1, 0, 0}};
+  auto const oby_col = cudf::test::fixed_width_column_wrapper<T>{
+    {1, 2, 2, 1, 3, 1, 2, 3, 4, 5},
+    {true, true, true, false, false, true, true, true, false, false}};
 
   auto const output = do_count_over_window<T>(grp_col, oby_col, cudf::order::ASCENDING, agg_col);
 
diff --git a/cpp/tests/round/round_tests.cpp b/cpp/tests/round/round_tests.cpp
index 64d08fa7338..a1cdcc5b793 100644
--- a/cpp/tests/round/round_tests.cpp
+++ b/cpp/tests/round/round_tests.cpp
@@ -336,7 +336,8 @@ TYPED_TEST(RoundTestsFixedPointTypes, TestScaleMovementExceedingMaxPrecision)
   auto const result_even = cudf::round(input, -target_scale, cudf::rounding_method::HALF_EVEN);
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_even, result_even->view());
 
-  const std::initializer_list<bool> validity = {1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0};
+  const std::initializer_list<bool> validity = {
+    true, false, true, true, true, false, false, true, true, true, true, false};
   auto const input_null =
     fp_wrapper{{14, 15, 16, 24, 25, 26, -14, -15, -16, -24, -25, -26}, validity, scale_type{1}};
   auto const expected_null =
@@ -705,7 +706,7 @@ TEST_F(RoundTests, BoolTestHalfUp)
 }
 
 // Use __uint128_t for demonstration.
-constexpr __uint128_t operator""_uint128_t(const char* s)
+constexpr __uint128_t operator""_uint128_t(char const* s)
 {
   __uint128_t ret = 0;
   for (int i = 0; s[i] != '\0'; ++i) {
diff --git a/cpp/tests/scalar/scalar_test.cpp b/cpp/tests/scalar/scalar_test.cpp
index cb689abb8d8..2d37de920d5 100644
--- a/cpp/tests/scalar/scalar_test.cpp
+++ b/cpp/tests/scalar/scalar_test.cpp
@@ -248,7 +248,7 @@ TEST_F(StructScalarTest, BasicNulls)
   src_columns.push_back(std::make_unique<cudf::column>(src_children[0]));
   src_columns.push_back(std::make_unique<cudf::column>(src_children[1]));
   src_columns.push_back(std::make_unique<cudf::column>(src_children[2]));
-  cudf::test::structs_column_wrapper valid_struct_col(std::move(src_columns), {1});
+  cudf::test::structs_column_wrapper valid_struct_col(std::move(src_columns), {true});
   cudf::column_view vcv = static_cast<cudf::column_view>(valid_struct_col);
   std::vector<cudf::column_view> valid_children(vcv.child_begin(), vcv.child_end());
 
@@ -256,7 +256,7 @@ TEST_F(StructScalarTest, BasicNulls)
   src_columns.push_back(std::make_unique<cudf::column>(src_children[0]));
   src_columns.push_back(std::make_unique<cudf::column>(src_children[1]));
   src_columns.push_back(std::make_unique<cudf::column>(src_children[2]));
-  cudf::test::structs_column_wrapper invalid_struct_col(std::move(src_columns), {0});
+  cudf::test::structs_column_wrapper invalid_struct_col(std::move(src_columns), {false});
   cudf::column_view icv = static_cast<cudf::column_view>(invalid_struct_col);
   std::vector<cudf::column_view> invalid_children(icv.child_begin(), icv.child_end());
 
diff --git a/cpp/tests/search/search_dictionary_test.cpp b/cpp/tests/search/search_dictionary_test.cpp
index 79a3d3b3b47..78f79ccc648 100644
--- a/cpp/tests/search/search_dictionary_test.cpp
+++ b/cpp/tests/search/search_dictionary_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -31,9 +31,10 @@ using cudf::test::fixed_width_column_wrapper;
 TEST_F(DictionarySearchTest, search_dictionary)
 {
   cudf::test::dictionary_column_wrapper<std::string> input(
-    {"", "", "10", "10", "20", "20", "30", "40"}, {0, 0, 1, 1, 1, 1, 1, 1});
+    {"", "", "10", "10", "20", "20", "30", "40"},
+    {false, false, true, true, true, true, true, true});
   cudf::test::dictionary_column_wrapper<std::string> values(
-    {"", "08", "10", "11", "30", "32", "90"}, {0, 1, 1, 1, 1, 1, 1});
+    {"", "08", "10", "11", "30", "32", "90"}, {false, true, true, true, true, true, true});
 
   auto result = cudf::upper_bound({cudf::table_view{{input}}},
                                   {cudf::table_view{{values}}},
@@ -52,17 +53,20 @@ TEST_F(DictionarySearchTest, search_dictionary)
 
 TEST_F(DictionarySearchTest, search_table_dictionary)
 {
-  fixed_width_column_wrapper<int32_t> column_0{{10, 10, 20, 20, 20, 20, 20, 20, 20, 50, 30},
-                                               {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0}};
-  fixed_width_column_wrapper<float> column_1{{5.0, 6.0, .5, .5, .5, .5, .7, .7, .7, .7, .5},
-                                             {1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1}};
+  fixed_width_column_wrapper<int32_t> column_0{
+    {10, 10, 20, 20, 20, 20, 20, 20, 20, 50, 30},
+    {true, true, true, true, true, true, true, true, true, true, false}};
+  fixed_width_column_wrapper<float> column_1{
+    {5.0, 6.0, .5, .5, .5, .5, .7, .7, .7, .7, .5},
+    {true, false, true, true, true, true, true, true, true, true, true}};
   cudf::test::dictionary_column_wrapper<int16_t> column_2{
-    {90, 95, 77, 78, 79, 76, 61, 62, 63, 41, 50}, {1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1}};
+    {90, 95, 77, 78, 79, 76, 61, 62, 63, 41, 50},
+    {true, true, true, true, false, false, true, true, true, true, true}};
   cudf::table_view input({column_0, column_1, column_2});
 
-  fixed_width_column_wrapper<int32_t> values_0{{10, 40, 20}, {1, 0, 1}};
-  fixed_width_column_wrapper<float> values_1{{6., .5, .5}, {0, 1, 1}};
-  cudf::test::dictionary_column_wrapper<int16_t> values_2{{95, 50, 77}, {1, 1, 0}};
+  fixed_width_column_wrapper<int32_t> values_0{{10, 40, 20}, {true, false, true}};
+  fixed_width_column_wrapper<float> values_1{{6., .5, .5}, {false, true, true}};
+  cudf::test::dictionary_column_wrapper<int16_t> values_2{{95, 50, 77}, {true, true, false}};
   cudf::table_view values({values_0, values_1, values_2});
 
   std::vector<cudf::order> order_flags{
@@ -94,8 +98,8 @@ TEST_F(DictionarySearchTest, contains_dictionary)
 
 TEST_F(DictionarySearchTest, contains_nullable_dictionary)
 {
-  cudf::test::dictionary_column_wrapper<int64_t> column({0, 0, 17, 17, 23, 23, 29},
-                                                        {1, 0, 1, 1, 1, 1, 1});
+  cudf::test::dictionary_column_wrapper<int64_t> column(
+    {0, 0, 17, 17, 23, 23, 29}, {true, false, true, true, true, true, true});
   EXPECT_TRUE(cudf::contains(column, numeric_scalar<int64_t>{23}));
   EXPECT_FALSE(cudf::contains(column, numeric_scalar<int64_t>{28}));
 
diff --git a/cpp/tests/sort/is_sorted_tests.cpp b/cpp/tests/sort/is_sorted_tests.cpp
index 271b119ff80..109095192f9 100644
--- a/cpp/tests/sort/is_sorted_tests.cpp
+++ b/cpp/tests/sort/is_sorted_tests.cpp
@@ -148,13 +148,13 @@ auto empty<cudf::string_view>()
 template <>
 auto nulls_after<cudf::string_view>()
 {
-  return cudf::test::strings_column_wrapper({"identical", "identical"}, {1, 0});
+  return cudf::test::strings_column_wrapper({"identical", "identical"}, {true, false});
 }
 
 template <>
 auto nulls_before<cudf::string_view>()
 {
-  return cudf::test::strings_column_wrapper({"identical", "identical"}, {0, 1});
+  return cudf::test::strings_column_wrapper({"identical", "identical"}, {false, true});
 }
 
 // ----- struct_view {"nestedInt" : {"Int" : 0 }, "float" : 1}
@@ -213,7 +213,7 @@ auto nulls_after<cudf::struct_view>()
   auto int_col = cudf::test::fixed_width_column_wrapper<int32_t>({1, 1});
   auto col1    = cudf::test::structs_column_wrapper{{int_col}};
   auto col2    = cudf::test::fixed_width_column_wrapper<float>({1, 1});
-  return cudf::test::structs_column_wrapper{{col1, col2}, {1, 0}};
+  return cudf::test::structs_column_wrapper{{col1, col2}, {true, false}};
 }
 
 template <>
@@ -222,7 +222,7 @@ auto nulls_before<cudf::struct_view>()
   auto int_col = cudf::test::fixed_width_column_wrapper<int32_t>({1, 1});
   auto col1    = cudf::test::structs_column_wrapper{{int_col}};
   auto col2    = cudf::test::fixed_width_column_wrapper<float>({1, 1});
-  return cudf::test::structs_column_wrapper{{col1, col2}, {0, 1}};
+  return cudf::test::structs_column_wrapper{{col1, col2}, {false, true}};
 }
 
 using lcw = cudf::test::lists_column_wrapper<int32_t>;
diff --git a/cpp/tests/sort/rank_test.cpp b/cpp/tests/sort/rank_test.cpp
index 47a1ba3b294..e08a2105aea 100644
--- a/cpp/tests/sort/rank_test.cpp
+++ b/cpp/tests/sort/rank_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -74,7 +74,8 @@ template <typename T>
 struct Rank : public cudf::test::BaseFixture {
   cudf::test::fixed_width_column_wrapper<T> col1{{5, 4, 3, 5, 8, 5}};
   cudf::test::fixed_width_column_wrapper<T> col2{{5, 4, 3, 5, 8, 5}, {1, 1, 0, 1, 1, 1}};
-  cudf::test::strings_column_wrapper col3{{"d", "e", "a", "d", "k", "d"}, {1, 1, 1, 1, 1, 1}};
+  cudf::test::strings_column_wrapper col3{{"d", "e", "a", "d", "k", "d"},
+                                          {true, true, true, true, true, true}};
 
   void run_all_tests(cudf::rank_method method,
                      input_arg_t input_arg,
@@ -116,10 +117,10 @@ TYPED_TEST(Rank, first_asc_keep)
 {
   // ASCENDING
   cudf::test::fixed_width_column_wrapper<cudf::size_type> col1_rank{{3, 2, 1, 4, 6, 5}};
-  cudf::test::fixed_width_column_wrapper<cudf::size_type> col2_rank{{2, 1, -1, 3, 5, 4},
-                                                                    {1, 1, 0, 1, 1, 1}};  // KEEP
-  cudf::test::fixed_width_column_wrapper<cudf::size_type> col3_rank{{2, 5, 1, 3, 6, 4},
-                                                                    {1, 1, 1, 1, 1, 1}};
+  cudf::test::fixed_width_column_wrapper<cudf::size_type> col2_rank{
+    {2, 1, -1, 3, 5, 4}, {true, true, false, true, true, true}};  // KEEP
+  cudf::test::fixed_width_column_wrapper<cudf::size_type> col3_rank{
+    {2, 5, 1, 3, 6, 4}, {true, true, true, true, true, true}};
   this->run_all_tests(cudf::rank_method::FIRST, asc_keep, col1_rank, col2_rank, col3_rank);
 }
 
@@ -145,10 +146,10 @@ TYPED_TEST(Rank, first_desc_keep)
 {
   // DESCENDING
   cudf::test::fixed_width_column_wrapper<cudf::size_type> col1_rank{{2, 5, 6, 3, 1, 4}};
-  cudf::test::fixed_width_column_wrapper<cudf::size_type> col2_rank{{2, 5, -1, 3, 1, 4},
-                                                                    {1, 1, 0, 1, 1, 1}};  // KEEP
-  cudf::test::fixed_width_column_wrapper<cudf::size_type> col3_rank{{3, 2, 6, 4, 1, 5},
-                                                                    {1, 1, 1, 1, 1, 1}};
+  cudf::test::fixed_width_column_wrapper<cudf::size_type> col2_rank{
+    {2, 5, -1, 3, 1, 4}, {true, true, false, true, true, true}};  // KEEP
+  cudf::test::fixed_width_column_wrapper<cudf::size_type> col3_rank{
+    {3, 2, 6, 4, 1, 5}, {true, true, true, true, true, true}};
   this->run_all_tests(cudf::rank_method::FIRST, desc_keep, col1_rank, col2_rank, col3_rank);
 }
 
@@ -173,10 +174,10 @@ TYPED_TEST(Rank, first_desc_bottom)
 TYPED_TEST(Rank, dense_asc_keep)
 {
   cudf::test::fixed_width_column_wrapper<cudf::size_type> col1_rank{{3, 2, 1, 3, 4, 3}};
-  cudf::test::fixed_width_column_wrapper<cudf::size_type> col2_rank{{2, 1, -1, 2, 3, 2},
-                                                                    {1, 1, 0, 1, 1, 1}};
-  cudf::test::fixed_width_column_wrapper<cudf::size_type> col3_rank{{2, 3, 1, 2, 4, 2},
-                                                                    {1, 1, 1, 1, 1, 1}};
+  cudf::test::fixed_width_column_wrapper<cudf::size_type> col2_rank{
+    {2, 1, -1, 2, 3, 2}, {true, true, false, true, true, true}};
+  cudf::test::fixed_width_column_wrapper<cudf::size_type> col3_rank{
+    {2, 3, 1, 2, 4, 2}, {true, true, true, true, true, true}};
   this->run_all_tests(cudf::rank_method::DENSE, asc_keep, col1_rank, col2_rank, col3_rank);
 }
 
@@ -199,10 +200,10 @@ TYPED_TEST(Rank, dense_asc_bottom)
 TYPED_TEST(Rank, dense_desc_keep)
 {
   cudf::test::fixed_width_column_wrapper<cudf::size_type> col1_rank{{2, 3, 4, 2, 1, 2}};
-  cudf::test::fixed_width_column_wrapper<cudf::size_type> col2_rank{{2, 3, -1, 2, 1, 2},
-                                                                    {1, 1, 0, 1, 1, 1}};
-  cudf::test::fixed_width_column_wrapper<cudf::size_type> col3_rank{{3, 2, 4, 3, 1, 3},
-                                                                    {1, 1, 1, 1, 1, 1}};
+  cudf::test::fixed_width_column_wrapper<cudf::size_type> col2_rank{
+    {2, 3, -1, 2, 1, 2}, {true, true, false, true, true, true}};
+  cudf::test::fixed_width_column_wrapper<cudf::size_type> col3_rank{
+    {3, 2, 4, 3, 1, 3}, {true, true, true, true, true, true}};
   this->run_all_tests(cudf::rank_method::DENSE, desc_keep, col1_rank, col2_rank, col3_rank);
 }
 
@@ -225,10 +226,10 @@ TYPED_TEST(Rank, dense_desc_bottom)
 TYPED_TEST(Rank, min_asc_keep)
 {
   cudf::test::fixed_width_column_wrapper<cudf::size_type> col1_rank{{3, 2, 1, 3, 6, 3}};
-  cudf::test::fixed_width_column_wrapper<cudf::size_type> col2_rank{{2, 1, -1, 2, 5, 2},
-                                                                    {1, 1, 0, 1, 1, 1}};
-  cudf::test::fixed_width_column_wrapper<cudf::size_type> col3_rank{{2, 5, 1, 2, 6, 2},
-                                                                    {1, 1, 1, 1, 1, 1}};
+  cudf::test::fixed_width_column_wrapper<cudf::size_type> col2_rank{
+    {2, 1, -1, 2, 5, 2}, {true, true, false, true, true, true}};
+  cudf::test::fixed_width_column_wrapper<cudf::size_type> col3_rank{
+    {2, 5, 1, 2, 6, 2}, {true, true, true, true, true, true}};
   this->run_all_tests(cudf::rank_method::MIN, asc_keep, col1_rank, col2_rank, col3_rank);
 }
 
@@ -251,10 +252,10 @@ TYPED_TEST(Rank, min_asc_bottom)
 TYPED_TEST(Rank, min_desc_keep)
 {
   cudf::test::fixed_width_column_wrapper<cudf::size_type> col1_rank{{2, 5, 6, 2, 1, 2}};
-  cudf::test::fixed_width_column_wrapper<cudf::size_type> col2_rank{{2, 5, -1, 2, 1, 2},
-                                                                    {1, 1, 0, 1, 1, 1}};
-  cudf::test::fixed_width_column_wrapper<cudf::size_type> col3_rank{{3, 2, 6, 3, 1, 3},
-                                                                    {1, 1, 1, 1, 1, 1}};
+  cudf::test::fixed_width_column_wrapper<cudf::size_type> col2_rank{
+    {2, 5, -1, 2, 1, 2}, {true, true, false, true, true, true}};
+  cudf::test::fixed_width_column_wrapper<cudf::size_type> col3_rank{
+    {3, 2, 6, 3, 1, 3}, {true, true, true, true, true, true}};
   this->run_all_tests(cudf::rank_method::MIN, desc_keep, col1_rank, col2_rank, col3_rank);
 }
 
@@ -277,10 +278,10 @@ TYPED_TEST(Rank, min_desc_bottom)
 TYPED_TEST(Rank, max_asc_keep)
 {
   cudf::test::fixed_width_column_wrapper<cudf::size_type> col1_rank{{5, 2, 1, 5, 6, 5}};
-  cudf::test::fixed_width_column_wrapper<cudf::size_type> col2_rank{{4, 1, -1, 4, 5, 4},
-                                                                    {1, 1, 0, 1, 1, 1}};
-  cudf::test::fixed_width_column_wrapper<cudf::size_type> col3_rank{{4, 5, 1, 4, 6, 4},
-                                                                    {1, 1, 1, 1, 1, 1}};
+  cudf::test::fixed_width_column_wrapper<cudf::size_type> col2_rank{
+    {4, 1, -1, 4, 5, 4}, {true, true, false, true, true, true}};
+  cudf::test::fixed_width_column_wrapper<cudf::size_type> col3_rank{
+    {4, 5, 1, 4, 6, 4}, {true, true, true, true, true, true}};
   this->run_all_tests(cudf::rank_method::MAX, asc_keep, col1_rank, col2_rank, col3_rank);
 }
 
@@ -303,10 +304,10 @@ TYPED_TEST(Rank, max_asc_bottom)
 TYPED_TEST(Rank, max_desc_keep)
 {
   cudf::test::fixed_width_column_wrapper<cudf::size_type> col1_rank{{4, 5, 6, 4, 1, 4}};
-  cudf::test::fixed_width_column_wrapper<cudf::size_type> col2_rank{{4, 5, -1, 4, 1, 4},
-                                                                    {1, 1, 0, 1, 1, 1}};
-  cudf::test::fixed_width_column_wrapper<cudf::size_type> col3_rank{{5, 2, 6, 5, 1, 5},
-                                                                    {1, 1, 1, 1, 1, 1}};
+  cudf::test::fixed_width_column_wrapper<cudf::size_type> col2_rank{
+    {4, 5, -1, 4, 1, 4}, {true, true, false, true, true, true}};
+  cudf::test::fixed_width_column_wrapper<cudf::size_type> col3_rank{
+    {5, 2, 6, 5, 1, 5}, {true, true, true, true, true, true}};
   this->run_all_tests(cudf::rank_method::MAX, desc_keep, col1_rank, col2_rank, col3_rank);
 }
 
@@ -329,8 +330,10 @@ TYPED_TEST(Rank, max_desc_bottom)
 TYPED_TEST(Rank, average_asc_keep)
 {
   cudf::test::fixed_width_column_wrapper<double> col1_rank{{4, 2, 1, 4, 6, 4}};
-  cudf::test::fixed_width_column_wrapper<double> col2_rank{{3, 1, -1, 3, 5, 3}, {1, 1, 0, 1, 1, 1}};
-  cudf::test::fixed_width_column_wrapper<double> col3_rank{{3, 5, 1, 3, 6, 3}, {1, 1, 1, 1, 1, 1}};
+  cudf::test::fixed_width_column_wrapper<double> col2_rank{{3, 1, -1, 3, 5, 3},
+                                                           {true, true, false, true, true, true}};
+  cudf::test::fixed_width_column_wrapper<double> col3_rank{{3, 5, 1, 3, 6, 3},
+                                                           {true, true, true, true, true, true}};
   this->run_all_tests(cudf::rank_method::AVERAGE, asc_keep, col1_rank, col2_rank, col3_rank);
 }
 
@@ -353,8 +356,10 @@ TYPED_TEST(Rank, average_asc_bottom)
 TYPED_TEST(Rank, average_desc_keep)
 {
   cudf::test::fixed_width_column_wrapper<double> col1_rank{{3, 5, 6, 3, 1, 3}};
-  cudf::test::fixed_width_column_wrapper<double> col2_rank{{3, 5, -1, 3, 1, 3}, {1, 1, 0, 1, 1, 1}};
-  cudf::test::fixed_width_column_wrapper<double> col3_rank{{4, 2, 6, 4, 1, 4}, {1, 1, 1, 1, 1, 1}};
+  cudf::test::fixed_width_column_wrapper<double> col2_rank{{3, 5, -1, 3, 1, 3},
+                                                           {true, true, false, true, true, true}};
+  cudf::test::fixed_width_column_wrapper<double> col3_rank{{4, 2, 6, 4, 1, 4},
+                                                           {true, true, true, true, true, true}};
   this->run_all_tests(cudf::rank_method::AVERAGE, desc_keep, col1_rank, col2_rank, col3_rank);
 }
 
@@ -379,9 +384,9 @@ TYPED_TEST(Rank, dense_asc_keep_pct)
 {
   cudf::test::fixed_width_column_wrapper<double> col1_rank{{0.75, 0.5, 0.25, 0.75, 1., 0.75}};
   cudf::test::fixed_width_column_wrapper<double> col2_rank{
-    {2.0 / 3.0, 1.0 / 3.0, -1., 2.0 / 3.0, 1., 2.0 / 3.0}, {1, 1, 0, 1, 1, 1}};
+    {2.0 / 3.0, 1.0 / 3.0, -1., 2.0 / 3.0, 1., 2.0 / 3.0}, {true, true, false, true, true, true}};
   cudf::test::fixed_width_column_wrapper<double> col3_rank{{0.5, 0.75, 0.25, 0.5, 1., 0.5},
-                                                           {1, 1, 1, 1, 1, 1}};
+                                                           {true, true, true, true, true, true}};
   this->run_all_tests(cudf::rank_method::DENSE, asc_keep, col1_rank, col2_rank, col3_rank, true);
 }
 
@@ -406,9 +411,9 @@ TYPED_TEST(Rank, min_desc_keep_pct)
   cudf::test::fixed_width_column_wrapper<double> col1_rank{
     {1.0 / 3.0, 5.0 / 6.0, 1., 1.0 / 3.0, 1.0 / 6.0, 1.0 / 3.0}};
   cudf::test::fixed_width_column_wrapper<double> col2_rank{{0.4, 1., -1., 0.4, 0.2, 0.4},
-                                                           {1, 1, 0, 1, 1, 1}};
+                                                           {true, true, false, true, true, true}};
   cudf::test::fixed_width_column_wrapper<double> col3_rank{
-    {0.5, 1.0 / 3.0, 1., 0.5, 1.0 / 6.0, 0.5}, {1, 1, 1, 1, 1, 1}};
+    {0.5, 1.0 / 3.0, 1., 0.5, 1.0 / 6.0, 0.5}, {true, true, true, true, true, true}};
   this->run_all_tests(cudf::rank_method::MIN, desc_keep, col1_rank, col2_rank, col3_rank, true);
 }
 
@@ -494,7 +499,7 @@ struct RankListAndStruct : public cudf::test::BaseFixture {
     7 |   {null, 0}|
       +------------+
     */
-    std::vector<bool>                           struct_valids{1, 1, 0, 1, 0, 1, 1, 1};
+    std::vector<bool>                           struct_valids{true, true, false, true, false, true, true, true};
     auto col1       = cudf::test::fixed_width_column_wrapper<T>{{ 0,  1,  9, -1,  9, -1, -1, -1}, {1, 1, 1, 0, 1, 0, 0, 0}};
     auto col2       = cudf::test::fixed_width_column_wrapper<T>{{-1, -1,  9, -1,  9, -1,  1,  0}, {0, 0, 1, 0, 1, 0, 1, 1}};
     auto struct_col = cudf::test::structs_column_wrapper{{col1, col2}, struct_valids}.release();
diff --git a/cpp/tests/sort/stable_sort_tests.cpp b/cpp/tests/sort/stable_sort_tests.cpp
index 341f8317004..655166e0d62 100644
--- a/cpp/tests/sort/stable_sort_tests.cpp
+++ b/cpp/tests/sort/stable_sort_tests.cpp
@@ -60,7 +60,7 @@ TYPED_TEST(StableSort, MixedNullOrder)
   cudf::test::fixed_width_column_wrapper<T> col1({0, 1, 1, 0, 0, 1, 0, 1},
                                                  {0, 1, 1, 1, 1, 1, 1, 1});
   cudf::test::strings_column_wrapper col2({"2", "a", "b", "x", "k", "a", "x", "a"},
-                                          {1, 1, 1, 1, 0, 1, 1, 1});
+                                          {true, true, true, true, false, true, true, true});
 
   cudf::test::fixed_width_column_wrapper<R> expected{{4, 3, 6, 1, 5, 7, 2, 0}};
 
@@ -76,7 +76,8 @@ TYPED_TEST(StableSort, WithNullMax)
   using T = TypeParam;
 
   cudf::test::fixed_width_column_wrapper<T> col1{{5, 4, 3, 5, 8, 5}, {1, 1, 0, 1, 1, 1}};
-  cudf::test::strings_column_wrapper col2({"d", "e", "a", "d", "k", "d"}, {1, 1, 0, 1, 1, 1});
+  cudf::test::strings_column_wrapper col2({"d", "e", "a", "d", "k", "d"},
+                                          {true, true, false, true, true, true});
   cudf::test::fixed_width_column_wrapper<T> col3{{10, 40, 70, 10, 2, 10}, {1, 1, 0, 1, 1, 1}};
   cudf::table_view input{{col1, col2, col3}};
 
@@ -140,7 +141,8 @@ TYPED_TEST(StableSort, WithNullMin)
   using T = TypeParam;
 
   cudf::test::fixed_width_column_wrapper<T> col1{{5, 4, 3, 5, 8}, {1, 1, 0, 1, 1}};
-  cudf::test::strings_column_wrapper col2({"d", "e", "a", "d", "k"}, {1, 1, 0, 1, 1});
+  cudf::test::strings_column_wrapper col2({"d", "e", "a", "d", "k"},
+                                          {true, true, false, true, true});
   cudf::test::fixed_width_column_wrapper<T> col3{{10, 40, 70, 10, 2}, {1, 1, 0, 1, 1}};
   cudf::table_view input{{col1, col2, col3}};
 
diff --git a/cpp/tests/stream_compaction/distinct_count_tests.cpp b/cpp/tests/stream_compaction/distinct_count_tests.cpp
index ee0ca3f86c1..a2dab649961 100644
--- a/cpp/tests/stream_compaction/distinct_count_tests.cpp
+++ b/cpp/tests/stream_compaction/distinct_count_tests.cpp
@@ -254,7 +254,7 @@ TEST_F(DistinctCount, StringColumnWithNull)
 {
   cudf::test::strings_column_wrapper input_col{
     {"", "this", "is", "this", "This", "a", "column", "of", "the", "strings"},
-    {1, 1, 1, 1, 1, 1, 1, 1, 0, 1}};
+    {true, true, true, true, true, true, true, true, false, true}};
 
   cudf::size_type const expected =
     (std::vector<std::string>{"", "this", "is", "This", "a", "column", "of", "strings"}).size();
@@ -264,10 +264,12 @@ TEST_F(DistinctCount, StringColumnWithNull)
 
 TEST_F(DistinctCount, TableWithNull)
 {
-  cudf::test::fixed_width_column_wrapper<int32_t> col1{{5, 4, 3, 5, 8, 1, 4, 5, 0, 9, -1},
-                                                       {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0}};
-  cudf::test::fixed_width_column_wrapper<int32_t> col2{{2, 2, 2, -1, 2, 1, 2, 0, 0, 9, -1},
-                                                       {1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0}};
+  cudf::test::fixed_width_column_wrapper<int32_t> col1{
+    {5, 4, 3, 5, 8, 1, 4, 5, 0, 9, -1},
+    {true, true, true, true, true, true, true, true, false, true, false}};
+  cudf::test::fixed_width_column_wrapper<int32_t> col2{
+    {2, 2, 2, -1, 2, 1, 2, 0, 0, 9, -1},
+    {true, true, true, false, true, true, true, false, false, true, false}};
   cudf::table_view input{{col1, col2}};
 
   EXPECT_EQ(8, cudf::distinct_count(input, null_equality::EQUAL));
@@ -276,7 +278,8 @@ TEST_F(DistinctCount, TableWithNull)
 
 TEST_F(DistinctCount, TableWithSomeNull)
 {
-  cudf::test::fixed_width_column_wrapper<int32_t> col1{{1, 2, 3, 4, 5, 6}, {1, 0, 1, 0, 1, 0}};
+  cudf::test::fixed_width_column_wrapper<int32_t> col1{{1, 2, 3, 4, 5, 6},
+                                                       {true, false, true, false, true, false}};
   cudf::test::fixed_width_column_wrapper<int32_t> col2{{1, 1, 1, 1, 1, 1}};
   cudf::table_view input{{col1, col2}};
 
@@ -296,12 +299,15 @@ TEST_F(DistinctCount, EmptyColumnedTable)
 
 TEST_F(DistinctCount, TableMixedTypes)
 {
-  cudf::test::fixed_width_column_wrapper<int32_t> col1{{5, 4, 3, 5, 8, 1, 4, 5, 0, 9, -1},
-                                                       {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0}};
-  cudf::test::fixed_width_column_wrapper<double> col2{{2, 2, 2, -1, 2, 1, 2, 0, 0, 9, -1},
-                                                      {1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0}};
-  cudf::test::fixed_width_column_wrapper<uint32_t> col3{{2, 2, 2, -1, 2, 1, 2, 0, 0, 9, -1},
-                                                        {1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0}};
+  cudf::test::fixed_width_column_wrapper<int32_t> col1{
+    {5, 4, 3, 5, 8, 1, 4, 5, 0, 9, -1},
+    {true, true, true, true, true, true, true, true, false, true, false}};
+  cudf::test::fixed_width_column_wrapper<double> col2{
+    {2, 2, 2, -1, 2, 1, 2, 0, 0, 9, -1},
+    {true, true, true, false, true, true, true, false, false, true, false}};
+  cudf::test::fixed_width_column_wrapper<uint32_t> col3{
+    {2, 2, 2, -1, 2, 1, 2, 0, 0, 9, -1},
+    {true, true, true, false, true, true, true, true, false, true, false}};
   cudf::table_view input{{col1, col2, col3}};
 
   EXPECT_EQ(9, cudf::distinct_count(input, null_equality::EQUAL));
@@ -310,11 +316,12 @@ TEST_F(DistinctCount, TableMixedTypes)
 
 TEST_F(DistinctCount, TableWithStringColumnWithNull)
 {
-  cudf::test::fixed_width_column_wrapper<int32_t> col1{{0, 9, 8, 9, 6, 5, 4, 3, 2, 1, 0},
-                                                       {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0}};
+  cudf::test::fixed_width_column_wrapper<int32_t> col1{
+    {0, 9, 8, 9, 6, 5, 4, 3, 2, 1, 0},
+    {true, true, true, true, true, true, true, true, false, true, false}};
   cudf::test::strings_column_wrapper col2{
     {"", "this", "is", "this", "this", "a", "column", "of", "the", "strings", ""},
-    {1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0}};
+    {true, true, true, true, true, true, true, true, false, true, false}};
 
   cudf::table_view input{{col1, col2}};
   EXPECT_EQ(9, cudf::distinct_count(input, null_equality::EQUAL));
diff --git a/cpp/tests/stream_compaction/distinct_tests.cpp b/cpp/tests/stream_compaction/distinct_tests.cpp
index 586792b4b30..14d7d8789ac 100644
--- a/cpp/tests/stream_compaction/distinct_tests.cpp
+++ b/cpp/tests/stream_compaction/distinct_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -143,7 +143,7 @@ TEST_F(DistinctKeepAny, NoColumnInputTable)
 
 TEST_F(DistinctKeepAny, EmptyKeys)
 {
-  int32s_col col{{5, 4, 3, 5, 8, 1}, {1, 0, 1, 1, 1, 1}};
+  int32s_col col{{5, 4, 3, 5, 8, 1}, {true, false, true, true, true, true}};
   int32s_col empty_col{};
   cudf::table_view input{{col}};
   std::vector<cudf::size_type> key_idx{};
diff --git a/cpp/tests/stream_compaction/drop_nans_tests.cpp b/cpp/tests/stream_compaction/drop_nans_tests.cpp
index 425d9a47ecc..bf72da5c840 100644
--- a/cpp/tests/stream_compaction/drop_nans_tests.cpp
+++ b/cpp/tests/stream_compaction/drop_nans_tests.cpp
@@ -35,16 +35,19 @@ TEST_F(DropNANsTest, MixedNANsAndNull)
   using F = float;
   using D = double;
   cudf::test::fixed_width_column_wrapper<float> col1{
-    {F(1.0), F(2.0), F(NAN), F(NAN), F(5.0), F(6.0)}, {1, 1, 0, 1, 1, 0}};
-  cudf::test::fixed_width_column_wrapper<int32_t> col2{{10, 40, 70, 5, 2, 10}, {1, 1, 0, 1, 1, 0}};
+    {F(1.0), F(2.0), F(NAN), F(NAN), F(5.0), F(6.0)}, {true, true, false, true, true, false}};
+  cudf::test::fixed_width_column_wrapper<int32_t> col2{{10, 40, 70, 5, 2, 10},
+                                                       {true, true, false, true, true, false}};
   cudf::test::fixed_width_column_wrapper<double> col3{{D(NAN), 40.0, 70.0, 5.0, 2.0, 10.0},
-                                                      {1, 1, 0, 1, 1, 0}};
+                                                      {true, true, false, true, true, false}};
   cudf::table_view input{{col1, col2, col3}};
   std::vector<cudf::size_type> keys{0, 2};
-  cudf::test::fixed_width_column_wrapper<float> col1_expected{{2.0, 3.0, 5.0, 6.0}, {1, 0, 1, 0}};
-  cudf::test::fixed_width_column_wrapper<int32_t> col2_expected{{40, 70, 2, 10}, {1, 0, 1, 0}};
+  cudf::test::fixed_width_column_wrapper<float> col1_expected{{2.0, 3.0, 5.0, 6.0},
+                                                              {true, false, true, false}};
+  cudf::test::fixed_width_column_wrapper<int32_t> col2_expected{{40, 70, 2, 10},
+                                                                {true, false, true, false}};
   cudf::test::fixed_width_column_wrapper<double> col3_expected{{40.0, 70.0, 2.0, 10.0},
-                                                               {1, 0, 1, 0}};
+                                                               {true, false, true, false}};
   cudf::table_view expected{{col1_expected, col2_expected, col3_expected}};
 
   auto got = cudf::drop_nans(input, keys);
@@ -55,9 +58,11 @@ TEST_F(DropNANsTest, MixedNANsAndNull)
 TEST_F(DropNANsTest, NoNANs)
 {
   cudf::test::fixed_width_column_wrapper<float> col1{{1.0, 2.0, 3.0, 4.0, 5.0, 6.0},
-                                                     {1, 1, 0, 1, 1, 1}};
-  cudf::test::fixed_width_column_wrapper<int32_t> col2{{10, 40, 70, 5, 2, 10}, {1, 1, 1, 1, 0, 1}};
-  cudf::test::fixed_width_column_wrapper<double> col3{{10, 40, 70, 5, 2, 10}, {1, 1, 0, 1, 1, 1}};
+                                                     {true, true, false, true, true, true}};
+  cudf::test::fixed_width_column_wrapper<int32_t> col2{{10, 40, 70, 5, 2, 10},
+                                                       {true, true, true, true, false, true}};
+  cudf::test::fixed_width_column_wrapper<double> col3{{10, 40, 70, 5, 2, 10},
+                                                      {true, true, false, true, true, true}};
   cudf::table_view input{{col1, col2, col3}};
   std::vector<cudf::size_type> keys{0, 2};
 
@@ -71,18 +76,19 @@ TEST_F(DropNANsTest, MixedWithThreshold)
   using F = float;
   using D = double;
   cudf::test::fixed_width_column_wrapper<float> col1{
-    {F(1.0), F(2.0), F(NAN), F(NAN), F(5.0), F(6.0)}, {1, 1, 0, 1, 1, 0}};
-  cudf::test::fixed_width_column_wrapper<int32_t> col2{{10, 40, 70, 5, 2, 10}, {1, 1, 0, 1, 1, 0}};
+    {F(1.0), F(2.0), F(NAN), F(NAN), F(5.0), F(6.0)}, {true, true, false, true, true, false}};
+  cudf::test::fixed_width_column_wrapper<int32_t> col2{{10, 40, 70, 5, 2, 10},
+                                                       {true, true, false, true, true, false}};
   cudf::test::fixed_width_column_wrapper<double> col3{{D(NAN), 40.0, 70.0, D(NAN), 2.0, 10.0},
-                                                      {1, 1, 0, 1, 1, 0}};
+                                                      {true, true, false, true, true, false}};
   cudf::table_view input{{col1, col2, col3}};
   std::vector<cudf::size_type> keys{0, 2};
   cudf::test::fixed_width_column_wrapper<float> col1_expected{{1.0, 2.0, 3.0, 5.0, 6.0},
-                                                              {1, 1, 0, 1, 0}};
+                                                              {true, true, false, true, false}};
   cudf::test::fixed_width_column_wrapper<int32_t> col2_expected{{10, 40, 70, 2, 10},
-                                                                {1, 1, 0, 1, 0}};
+                                                                {true, true, false, true, false}};
   cudf::test::fixed_width_column_wrapper<double> col3_expected{{D(NAN), 40.0, 70.0, 2.0, 10.0},
-                                                               {1, 1, 0, 1, 0}};
+                                                               {true, true, false, true, false}};
   cudf::table_view expected{{col1_expected, col2_expected, col3_expected}};
 
   auto got = cudf::drop_nans(input, keys, 1);
@@ -122,7 +128,7 @@ TEST_F(DropNANsTest, EmptyKeys)
 {
   using F = float;
   cudf::test::fixed_width_column_wrapper<float> col1{
-    {F(1.0), F(2.0), F(NAN), F(NAN), F(5.0), F(6.0)}, {1, 1, 0, 1, 1, 0}};
+    {F(1.0), F(2.0), F(NAN), F(NAN), F(5.0), F(6.0)}, {true, true, false, true, true, false}};
   cudf::table_view input{{col1}};
   std::vector<cudf::size_type> keys{};
 
diff --git a/cpp/tests/stream_compaction/drop_nulls_tests.cpp b/cpp/tests/stream_compaction/drop_nulls_tests.cpp
index 47aa2d8ee3e..dbac1d58195 100644
--- a/cpp/tests/stream_compaction/drop_nulls_tests.cpp
+++ b/cpp/tests/stream_compaction/drop_nulls_tests.cpp
@@ -34,15 +34,19 @@ struct DropNullsTest : public cudf::test::BaseFixture {};
 TEST_F(DropNullsTest, WholeRowIsNull)
 {
   cudf::test::fixed_width_column_wrapper<int16_t> col1{{true, false, true, false, true, false},
-                                                       {1, 1, 0, 1, 1, 0}};
-  cudf::test::fixed_width_column_wrapper<int32_t> col2{{10, 40, 70, 5, 2, 10}, {1, 1, 0, 1, 1, 0}};
-  cudf::test::fixed_width_column_wrapper<double> col3{{10, 40, 70, 5, 2, 10}, {1, 1, 0, 1, 1, 0}};
+                                                       {true, true, false, true, true, false}};
+  cudf::test::fixed_width_column_wrapper<int32_t> col2{{10, 40, 70, 5, 2, 10},
+                                                       {true, true, false, true, true, false}};
+  cudf::test::fixed_width_column_wrapper<double> col3{{10, 40, 70, 5, 2, 10},
+                                                      {true, true, false, true, true, false}};
   cudf::table_view input{{col1, col2, col3}};
   std::vector<cudf::size_type> keys{0, 1, 2};
   cudf::test::fixed_width_column_wrapper<int16_t> col1_expected{{true, false, false, true},
-                                                                {1, 1, 1, 1}};
-  cudf::test::fixed_width_column_wrapper<int32_t> col2_expected{{10, 40, 5, 2}, {1, 1, 1, 1}};
-  cudf::test::fixed_width_column_wrapper<double> col3_expected{{10, 40, 5, 2}, {1, 1, 1, 1}};
+                                                                {true, true, true, true}};
+  cudf::test::fixed_width_column_wrapper<int32_t> col2_expected{{10, 40, 5, 2},
+                                                                {true, true, true, true}};
+  cudf::test::fixed_width_column_wrapper<double> col3_expected{{10, 40, 5, 2},
+                                                               {true, true, true, true}};
   cudf::table_view expected{{col1_expected, col2_expected, col3_expected}};
 
   auto got = cudf::drop_nulls(input, keys);
@@ -53,9 +57,11 @@ TEST_F(DropNullsTest, WholeRowIsNull)
 TEST_F(DropNullsTest, NoNull)
 {
   cudf::test::fixed_width_column_wrapper<int16_t> col1{{true, false, true, false, true, false},
-                                                       {1, 1, 1, 1, 1, 1}};
-  cudf::test::fixed_width_column_wrapper<int32_t> col2{{10, 40, 70, 5, 2, 10}, {1, 1, 1, 1, 1, 1}};
-  cudf::test::fixed_width_column_wrapper<double> col3{{10, 40, 70, 5, 2, 10}, {1, 1, 1, 1, 1, 1}};
+                                                       {true, true, true, true, true, true}};
+  cudf::test::fixed_width_column_wrapper<int32_t> col2{{10, 40, 70, 5, 2, 10},
+                                                       {true, true, true, true, true, true}};
+  cudf::test::fixed_width_column_wrapper<double> col3{{10, 40, 70, 5, 2, 10},
+                                                      {true, true, true, true, true, true}};
   cudf::table_view input{{col1, col2, col3}};
   std::vector<cudf::size_type> keys{0, 1, 2};
 
@@ -67,15 +73,19 @@ TEST_F(DropNullsTest, NoNull)
 TEST_F(DropNullsTest, MixedSetOfRows)
 {
   cudf::test::fixed_width_column_wrapper<int16_t> col1{{true, false, true, false, true, false},
-                                                       {1, 1, 0, 1, 1, 0}};
-  cudf::test::fixed_width_column_wrapper<int32_t> col2{{10, 40, 70, 5, 2, 10}, {1, 1, 0, 1, 1, 0}};
-  cudf::test::fixed_width_column_wrapper<double> col3{{10, 40, 70, 5, 2, 10}, {1, 1, 0, 1, 1, 1}};
+                                                       {true, true, false, true, true, false}};
+  cudf::test::fixed_width_column_wrapper<int32_t> col2{{10, 40, 70, 5, 2, 10},
+                                                       {true, true, false, true, true, false}};
+  cudf::test::fixed_width_column_wrapper<double> col3{{10, 40, 70, 5, 2, 10},
+                                                      {true, true, false, true, true, true}};
   cudf::table_view input{{col1, col2, col3}};
   std::vector<cudf::size_type> keys{0, 1, 2};
   cudf::test::fixed_width_column_wrapper<int16_t> col1_expected{{true, false, false, true},
-                                                                {1, 1, 1, 1}};
-  cudf::test::fixed_width_column_wrapper<int32_t> col2_expected{{10, 40, 5, 2}, {1, 1, 1, 1}};
-  cudf::test::fixed_width_column_wrapper<double> col3_expected{{10, 40, 5, 2}, {1, 1, 1, 1}};
+                                                                {true, true, true, true}};
+  cudf::test::fixed_width_column_wrapper<int32_t> col2_expected{{10, 40, 5, 2},
+                                                                {true, true, true, true}};
+  cudf::test::fixed_width_column_wrapper<double> col3_expected{{10, 40, 5, 2},
+                                                               {true, true, true, true}};
   cudf::table_view expected{{col1_expected, col2_expected, col3_expected}};
 
   auto got = cudf::drop_nulls(input, keys);
@@ -132,16 +142,19 @@ TEST_F(DropNullsTest, LargeColumn)
 TEST_F(DropNullsTest, MixedSetOfRowsWithThreshold)
 {
   cudf::test::fixed_width_column_wrapper<int16_t> col1{{true, false, true, false, true, false},
-                                                       {1, 1, 0, 1, 1, 0}};
-  cudf::test::fixed_width_column_wrapper<int32_t> col2{{10, 40, 70, 5, 2, 10}, {1, 1, 0, 1, 1, 1}};
-  cudf::test::fixed_width_column_wrapper<double> col3{{10, 40, 70, 5, 2, 10}, {1, 1, 1, 1, 1, 1}};
+                                                       {true, true, false, true, true, false}};
+  cudf::test::fixed_width_column_wrapper<int32_t> col2{{10, 40, 70, 5, 2, 10},
+                                                       {true, true, false, true, true, true}};
+  cudf::test::fixed_width_column_wrapper<double> col3{{10, 40, 70, 5, 2, 10},
+                                                      {true, true, true, true, true, true}};
   cudf::table_view input{{col1, col2, col3}};
   std::vector<cudf::size_type> keys{0, 1, 2};
   cudf::test::fixed_width_column_wrapper<int16_t> col1_expected{{true, false, false, true, false},
-                                                                {1, 1, 1, 1, 0}};
+                                                                {true, true, true, true, false}};
   cudf::test::fixed_width_column_wrapper<int32_t> col2_expected{{10, 40, 5, 2, 10},
-                                                                {1, 1, 1, 1, 1}};
-  cudf::test::fixed_width_column_wrapper<double> col3_expected{{10, 40, 5, 2, 10}, {1, 1, 1, 1, 1}};
+                                                                {true, true, true, true, true}};
+  cudf::test::fixed_width_column_wrapper<double> col3_expected{{10, 40, 5, 2, 10},
+                                                               {true, true, true, true, true}};
   cudf::table_view expected{{col1_expected, col2_expected, col3_expected}};
 
   auto got = cudf::drop_nulls(input, keys, keys.size() - 1);
@@ -180,7 +193,7 @@ TEST_F(DropNullsTest, EmptyColumns)
 TEST_F(DropNullsTest, EmptyKeys)
 {
   cudf::test::fixed_width_column_wrapper<int16_t> col1{{true, false, true, false, true, false},
-                                                       {1, 1, 0, 1, 1, 0}};
+                                                       {true, true, false, true, true, false}};
   cudf::table_view input{{col1}};
   std::vector<cudf::size_type> keys{};
 
@@ -191,13 +204,15 @@ TEST_F(DropNullsTest, EmptyKeys)
 TEST_F(DropNullsTest, StringColWithNull)
 {
   cudf::test::fixed_width_column_wrapper<int16_t> col1{{11, 12, 11, 13, 12, 15},
-                                                       {1, 1, 0, 1, 0, 1}};
+                                                       {true, true, false, true, false, true}};
   cudf::test::strings_column_wrapper col2{{"Hi", "Hello", "Hi", "No", "Hello", "Naive"},
-                                          {1, 1, 0, 1, 0, 1}};
+                                          {true, true, false, true, false, true}};
   cudf::table_view input{{col1, col2}};
   std::vector<cudf::size_type> keys{0, 1};
-  cudf::test::fixed_width_column_wrapper<int16_t> col1_expected{{11, 12, 13, 15}, {1, 1, 1, 1}};
-  cudf::test::strings_column_wrapper col2_expected{{"Hi", "Hello", "No", "Naive"}, {1, 1, 1, 1}};
+  cudf::test::fixed_width_column_wrapper<int16_t> col1_expected{{11, 12, 13, 15},
+                                                                {true, true, true, true}};
+  cudf::test::strings_column_wrapper col2_expected{{"Hi", "Hello", "No", "Naive"},
+                                                   {true, true, true, true}};
   cudf::table_view expected{{col1_expected, col2_expected}};
 
   auto got = cudf::drop_nulls(input, keys);
diff --git a/cpp/tests/stream_compaction/stable_distinct_tests.cpp b/cpp/tests/stream_compaction/stable_distinct_tests.cpp
index e28b96fc8be..6c6c53331d4 100644
--- a/cpp/tests/stream_compaction/stable_distinct_tests.cpp
+++ b/cpp/tests/stream_compaction/stable_distinct_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -137,7 +137,7 @@ TEST_F(StableDistinctKeepAny, NoColumnInputTable)
 
 TEST_F(StableDistinctKeepAny, EmptyKeys)
 {
-  int32s_col col{{5, 4, 3, 5, 8, 1}, {1, 0, 1, 1, 1, 1}};
+  int32s_col col{{5, 4, 3, 5, 8, 1}, {true, false, true, true, true, true}};
   int32s_col empty_col{};
   cudf::table_view input{{col}};
   std::vector<cudf::size_type> key_idx{};
diff --git a/cpp/tests/stream_compaction/unique_tests.cpp b/cpp/tests/stream_compaction/unique_tests.cpp
index 01f5f4d39db..4d7d23dc881 100644
--- a/cpp/tests/stream_compaction/unique_tests.cpp
+++ b/cpp/tests/stream_compaction/unique_tests.cpp
@@ -56,15 +56,17 @@ struct Unique : public cudf::test::BaseFixture {};
 
 TEST_F(Unique, StringKeyColumn)
 {
-  cudf::test::fixed_width_column_wrapper<int32_t> col{{5, 4, 4, 5, 5, 8, 1}, {1, 0, 0, 1, 1, 1, 1}};
+  cudf::test::fixed_width_column_wrapper<int32_t> col{{5, 4, 4, 5, 5, 8, 1},
+                                                      {true, false, false, true, true, true, true}};
   cudf::test::strings_column_wrapper key_col{{"all", "new", "new", "all", "new", "the", "strings"},
-                                             {1, 1, 1, 1, 0, 1, 1}};
+                                             {true, true, true, true, false, true, true}};
   cudf::table_view input{{col, key_col}};
   std::vector<cudf::size_type> keys{1};
 
-  cudf::test::fixed_width_column_wrapper<int32_t> exp_col{{5, 4, 5, 5, 8, 1}, {1, 0, 1, 1, 1, 1}};
+  cudf::test::fixed_width_column_wrapper<int32_t> exp_col{{5, 4, 5, 5, 8, 1},
+                                                          {true, false, true, true, true, true}};
   cudf::test::strings_column_wrapper exp_key_col{{"all", "new", "all", "new", "the", "strings"},
-                                                 {1, 1, 1, 0, 1, 1}};
+                                                 {true, true, true, false, true, true}};
   cudf::table_view expected{{exp_col, exp_key_col}};
 
   auto got = unique(input, keys, cudf::duplicate_keep_option::KEEP_LAST);
@@ -92,7 +94,8 @@ TEST_F(Unique, NoColumnInputTable)
 
 TEST_F(Unique, EmptyKeys)
 {
-  cudf::test::fixed_width_column_wrapper<int32_t> col{{5, 4, 3, 5, 8, 1}, {1, 0, 1, 1, 1, 1}};
+  cudf::test::fixed_width_column_wrapper<int32_t> col{{5, 4, 3, 5, 8, 1},
+                                                      {true, false, true, true, true, true}};
   cudf::test::fixed_width_column_wrapper<int32_t> empty_col{};
   cudf::table_view input{{col}};
   std::vector<cudf::size_type> keys{};
@@ -151,17 +154,18 @@ TEST_F(Unique, NonNullTable)
 
 TEST_F(Unique, KeepFirstWithNull)
 {
-  cudf::test::fixed_width_column_wrapper<int32_t> col{{5, 4, 3, 2, 5, 8, 1}, {1, 0, 1, 1, 1, 1, 1}};
+  cudf::test::fixed_width_column_wrapper<int32_t> col{{5, 4, 3, 2, 5, 8, 1},
+                                                      {true, false, true, true, true, true, true}};
   cudf::test::fixed_width_column_wrapper<int32_t> key{{20, 20, 20, 20, 19, 21, 19},
-                                                      {1, 1, 0, 0, 1, 1, 1}};
+                                                      {true, true, false, false, true, true, true}};
   cudf::table_view input{{col, key}};
   std::vector<cudf::size_type> keys{1};
 
   // nulls are equal
-  cudf::test::fixed_width_column_wrapper<int32_t> exp_col_first_equal{{5, 3, 5, 8, 1},
-                                                                      {1, 1, 1, 1, 1}};
-  cudf::test::fixed_width_column_wrapper<int32_t> exp_key_col_first_equal{{20, 20, 19, 21, 19},
-                                                                          {1, 0, 1, 1, 1}};
+  cudf::test::fixed_width_column_wrapper<int32_t> exp_col_first_equal{
+    {5, 3, 5, 8, 1}, {true, true, true, true, true}};
+  cudf::test::fixed_width_column_wrapper<int32_t> exp_key_col_first_equal{
+    {20, 20, 19, 21, 19}, {true, false, true, true, true}};
   cudf::table_view expected_first_equal{{exp_col_first_equal, exp_key_col_first_equal}};
   auto got_first_equal =
     unique(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::EQUAL);
@@ -169,10 +173,10 @@ TEST_F(Unique, KeepFirstWithNull)
   CUDF_TEST_EXPECT_TABLES_EQUAL(expected_first_equal, got_first_equal->view());
 
   // nulls are unequal
-  cudf::test::fixed_width_column_wrapper<int32_t> exp_col_first_unequal{{5, 3, 2, 5, 8, 1},
-                                                                        {1, 1, 1, 1, 1, 1}};
+  cudf::test::fixed_width_column_wrapper<int32_t> exp_col_first_unequal{
+    {5, 3, 2, 5, 8, 1}, {true, true, true, true, true, true}};
   cudf::test::fixed_width_column_wrapper<int32_t> exp_key_col_first_unequal{
-    {20, 20, 20, 19, 21, 19}, {1, 0, 0, 1, 1, 1}};
+    {20, 20, 20, 19, 21, 19}, {true, false, false, true, true, true}};
   cudf::table_view expected_first_unequal{{exp_col_first_unequal, exp_key_col_first_unequal}};
   auto got_first_unequal =
     unique(input, keys, cudf::duplicate_keep_option::KEEP_FIRST, null_equality::UNEQUAL);
@@ -182,17 +186,18 @@ TEST_F(Unique, KeepFirstWithNull)
 
 TEST_F(Unique, KeepLastWithNull)
 {
-  cudf::test::fixed_width_column_wrapper<int32_t> col{{5, 4, 3, 2, 5, 8, 1}, {1, 0, 1, 1, 1, 1, 1}};
+  cudf::test::fixed_width_column_wrapper<int32_t> col{{5, 4, 3, 2, 5, 8, 1},
+                                                      {true, false, true, true, true, true, true}};
   cudf::test::fixed_width_column_wrapper<int32_t> key{{20, 20, 20, 20, 19, 21, 19},
-                                                      {1, 1, 0, 0, 1, 1, 1}};
+                                                      {true, true, false, false, true, true, true}};
   cudf::table_view input{{col, key}};
   std::vector<cudf::size_type> keys{1};
 
   // nulls are equal
-  cudf::test::fixed_width_column_wrapper<int32_t> exp_col_last_equal{{4, 2, 5, 8, 1},
-                                                                     {0, 1, 1, 1, 1}};
-  cudf::test::fixed_width_column_wrapper<int32_t> exp_key_col_last_equal{{20, 20, 19, 21, 19},
-                                                                         {1, 0, 1, 1, 1}};
+  cudf::test::fixed_width_column_wrapper<int32_t> exp_col_last_equal{
+    {4, 2, 5, 8, 1}, {false, true, true, true, true}};
+  cudf::test::fixed_width_column_wrapper<int32_t> exp_key_col_last_equal{
+    {20, 20, 19, 21, 19}, {true, false, true, true, true}};
   cudf::table_view expected_last_equal{{exp_col_last_equal, exp_key_col_last_equal}};
   auto got_last_equal =
     unique(input, keys, cudf::duplicate_keep_option::KEEP_LAST, null_equality::EQUAL);
@@ -200,10 +205,10 @@ TEST_F(Unique, KeepLastWithNull)
   CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last_equal, got_last_equal->view());
 
   // nulls are unequal
-  cudf::test::fixed_width_column_wrapper<int32_t> exp_col_last_unequal{{4, 3, 2, 5, 8, 1},
-                                                                       {0, 1, 1, 1, 1, 1}};
-  cudf::test::fixed_width_column_wrapper<int32_t> exp_key_col_last_unequal{{20, 20, 20, 19, 21, 19},
-                                                                           {1, 0, 0, 1, 1, 1}};
+  cudf::test::fixed_width_column_wrapper<int32_t> exp_col_last_unequal{
+    {4, 3, 2, 5, 8, 1}, {false, true, true, true, true, true}};
+  cudf::test::fixed_width_column_wrapper<int32_t> exp_key_col_last_unequal{
+    {20, 20, 20, 19, 21, 19}, {true, false, false, true, true, true}};
   cudf::table_view expected_last_unequal{{exp_col_last_unequal, exp_key_col_last_unequal}};
   auto got_last_unequal =
     unique(input, keys, cudf::duplicate_keep_option::KEEP_LAST, null_equality::UNEQUAL);
@@ -213,15 +218,18 @@ TEST_F(Unique, KeepLastWithNull)
 
 TEST_F(Unique, KeepNoneWithNull)
 {
-  cudf::test::fixed_width_column_wrapper<int32_t> col{{5, 4, 3, 2, 5, 8, 1}, {1, 0, 1, 1, 1, 1, 1}};
+  cudf::test::fixed_width_column_wrapper<int32_t> col{{5, 4, 3, 2, 5, 8, 1},
+                                                      {true, false, true, true, true, true, true}};
   cudf::test::fixed_width_column_wrapper<int32_t> key{{20, 20, 20, 20, 19, 21, 19},
-                                                      {1, 1, 0, 0, 1, 1, 1}};
+                                                      {true, true, false, false, true, true, true}};
   cudf::table_view input{{col, key}};
   std::vector<cudf::size_type> keys{1};
 
   // nulls are equal
-  cudf::test::fixed_width_column_wrapper<int32_t> exp_col_unique_equal{{5, 8, 1}, {1, 1, 1}};
-  cudf::test::fixed_width_column_wrapper<int32_t> exp_key_col_unique_equal{{19, 21, 19}, {1, 1, 1}};
+  cudf::test::fixed_width_column_wrapper<int32_t> exp_col_unique_equal{{5, 8, 1},
+                                                                       {true, true, true}};
+  cudf::test::fixed_width_column_wrapper<int32_t> exp_key_col_unique_equal{{19, 21, 19},
+                                                                           {true, true, true}};
   cudf::table_view expected_unique_equal{{exp_col_unique_equal, exp_key_col_unique_equal}};
   auto got_unique_equal =
     unique(input, keys, cudf::duplicate_keep_option::KEEP_NONE, null_equality::EQUAL);
@@ -229,10 +237,10 @@ TEST_F(Unique, KeepNoneWithNull)
   CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unique_equal, got_unique_equal->view());
 
   // nulls are unequal
-  cudf::test::fixed_width_column_wrapper<int32_t> exp_col_unique_unequal{{3, 2, 5, 8, 1},
-                                                                         {1, 1, 1, 1, 1}};
-  cudf::test::fixed_width_column_wrapper<int32_t> exp_key_col_unique_unequal{{20, 20, 19, 21, 19},
-                                                                             {0, 0, 1, 1, 1}};
+  cudf::test::fixed_width_column_wrapper<int32_t> exp_col_unique_unequal{
+    {3, 2, 5, 8, 1}, {true, true, true, true, true}};
+  cudf::test::fixed_width_column_wrapper<int32_t> exp_key_col_unique_unequal{
+    {20, 20, 19, 21, 19}, {false, false, true, true, true}};
   cudf::table_view expected_unique_unequal{{exp_col_unique_unequal, exp_key_col_unique_unequal}};
   auto got_unique_unequal =
     unique(input, keys, cudf::duplicate_keep_option::KEEP_NONE, null_equality::UNEQUAL);
diff --git a/cpp/tests/streams/interop_test.cpp b/cpp/tests/streams/interop_test.cpp
index cf620749d8f..9e4ee5a4a93 100644
--- a/cpp/tests/streams/interop_test.cpp
+++ b/cpp/tests/streams/interop_test.cpp
@@ -21,6 +21,7 @@
 #include <cudf/interop.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
+#include <cudf/table/table_view.hpp>
 
 struct ArrowTest : public cudf::test::BaseFixture {};
 
diff --git a/cpp/tests/streams/io/orc_test.cpp b/cpp/tests/streams/io/orc_test.cpp
index 57e36d13224..401c7049381 100644
--- a/cpp/tests/streams/io/orc_test.cpp
+++ b/cpp/tests/streams/io/orc_test.cpp
@@ -60,7 +60,7 @@ cudf::table construct_table()
   cudf::test::fixed_width_column_wrapper<float> col4(zeros_iterator, zeros_iterator + num_rows);
   cudf::test::fixed_width_column_wrapper<double> col5(zeros_iterator, zeros_iterator + num_rows);
 
-  cudf::test::fixed_width_column_wrapper<numeric::decimal128> col6 = [&ones_iterator, num_rows] {
+  cudf::test::fixed_width_column_wrapper<numeric::decimal128> col6 = [&ones_iterator] {
     auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
       return numeric::decimal128{ones_iterator[i], numeric::scale_type{12}};
     });
@@ -68,7 +68,7 @@ cudf::table construct_table()
                                                                        col6_data + num_rows);
   }();
 
-  cudf::test::fixed_width_column_wrapper<numeric::decimal128> col7 = [&ones_iterator, num_rows] {
+  cudf::test::fixed_width_column_wrapper<numeric::decimal128> col7 = [&ones_iterator] {
     auto col7_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
       return numeric::decimal128{ones_iterator[i], numeric::scale_type{-12}};
     });
diff --git a/cpp/tests/streams/io/parquet_test.cpp b/cpp/tests/streams/io/parquet_test.cpp
index f6bb2cf4336..b277d184e3a 100644
--- a/cpp/tests/streams/io/parquet_test.cpp
+++ b/cpp/tests/streams/io/parquet_test.cpp
@@ -55,14 +55,14 @@ cudf::table construct_table()
   cudf::test::fixed_width_column_wrapper<int32_t> col3(zeros.begin(), zeros.end());
   cudf::test::fixed_width_column_wrapper<float> col4(zeros.begin(), zeros.end());
   cudf::test::fixed_width_column_wrapper<double> col5(zeros.begin(), zeros.end());
-  cudf::test::fixed_width_column_wrapper<numeric::decimal128> col6 = [&ones, num_rows] {
+  cudf::test::fixed_width_column_wrapper<numeric::decimal128> col6 = [&ones] {
     auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
       return numeric::decimal128{ones[i], numeric::scale_type{12}};
     });
     return cudf::test::fixed_width_column_wrapper<numeric::decimal128>(col6_data,
                                                                        col6_data + num_rows);
   }();
-  cudf::test::fixed_width_column_wrapper<numeric::decimal128> col7 = [&ones, num_rows] {
+  cudf::test::fixed_width_column_wrapper<numeric::decimal128> col7 = [&ones] {
     auto col7_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
       return numeric::decimal128{ones[i], numeric::scale_type{-12}};
     });
diff --git a/cpp/tests/streams/lists_test.cpp b/cpp/tests/streams/lists_test.cpp
index 74e0e8837f7..711e20e4b17 100644
--- a/cpp/tests/streams/lists_test.cpp
+++ b/cpp/tests/streams/lists_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -154,7 +154,8 @@ TEST_F(ListTest, StableSortLists)
 TEST_F(ListTest, ApplyBooleanMask)
 {
   cudf::test::lists_column_wrapper<int> list_col{{0, 1}, {2, 3, 7, 8}, {4, 5}};
-  cudf::test::lists_column_wrapper<bool> boolean_mask{{0, 1}, {1, 1, 1, 0}, {0, 1}};
+  cudf::test::lists_column_wrapper<bool> boolean_mask{
+    {false, true}, {true, true, true, false}, {false, true}};
   cudf::lists::apply_boolean_mask(list_col, boolean_mask, cudf::test::get_default_stream());
 }
 
diff --git a/cpp/tests/streams/reduction_test.cpp b/cpp/tests/streams/reduction_test.cpp
index 53dd1eed459..e6438ac2834 100644
--- a/cpp/tests/streams/reduction_test.cpp
+++ b/cpp/tests/streams/reduction_test.cpp
@@ -48,8 +48,9 @@ TEST_F(ReductionTest, ReductionSumScalarInit)
 
 TEST_F(ReductionTest, SegmentedReductionSum)
 {
-  auto const input     = cudf::test::fixed_width_column_wrapper<int>{{1, 2, 3, 1, 0, 3, 1, 0, 0, 0},
-                                                                     {1, 1, 1, 1, 0, 1, 1, 0, 0, 0}};
+  auto const input = cudf::test::fixed_width_column_wrapper<int>{
+    {1, 2, 3, 1, 0, 3, 1, 0, 0, 0},
+    {true, true, true, true, false, true, true, false, false, false}};
   auto const offsets   = std::vector<cudf::size_type>{0, 3, 6, 7, 8, 10, 10};
   auto const d_offsets = cudf::detail::make_device_uvector_async(
     offsets, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource());
@@ -65,8 +66,9 @@ TEST_F(ReductionTest, SegmentedReductionSum)
 
 TEST_F(ReductionTest, SegmentedReductionSumScalarInit)
 {
-  auto const input     = cudf::test::fixed_width_column_wrapper<int>{{1, 2, 3, 1, 0, 3, 1, 0, 0, 0},
-                                                                     {1, 1, 1, 1, 0, 1, 1, 0, 0, 0}};
+  auto const input = cudf::test::fixed_width_column_wrapper<int>{
+    {1, 2, 3, 1, 0, 3, 1, 0, 0, 0},
+    {true, true, true, true, false, true, true, false, false, false}};
   auto const offsets   = std::vector<cudf::size_type>{0, 3, 6, 7, 8, 10, 10};
   auto const d_offsets = cudf::detail::make_device_uvector_async(
     offsets, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource());
@@ -84,7 +86,8 @@ TEST_F(ReductionTest, SegmentedReductionSumScalarInit)
 TEST_F(ReductionTest, ScanMin)
 {
   auto const input = cudf::test::fixed_width_column_wrapper<int>{
-    {123, 64, 63, 99, -5, 123, -16, -120, -111}, {1, 0, 1, 1, 1, 1, 0, 0, 1}};
+    {123, 64, 63, 99, -5, 123, -16, -120, -111},
+    {true, false, true, true, true, true, false, false, true}};
 
   cudf::scan(input,
              *cudf::make_min_aggregation<cudf::scan_aggregation>(),
@@ -96,7 +99,8 @@ TEST_F(ReductionTest, ScanMin)
 TEST_F(ReductionTest, MinMax)
 {
   auto const input = cudf::test::fixed_width_column_wrapper<int>{
-    {123, 64, 63, 99, -5, 123, -16, -120, -111}, {1, 0, 1, 1, 1, 1, 0, 0, 1}};
+    {123, 64, 63, 99, -5, 123, -16, -120, -111},
+    {true, false, true, true, true, true, false, false, true}};
 
   cudf::minmax(input, cudf::test::get_default_stream());
 }
diff --git a/cpp/tests/streams/replace_test.cpp b/cpp/tests/streams/replace_test.cpp
index 25293db4347..752ddc87dfc 100644
--- a/cpp/tests/streams/replace_test.cpp
+++ b/cpp/tests/streams/replace_test.cpp
@@ -27,21 +27,24 @@ class ReplaceTest : public cudf::test::BaseFixture {};
 
 TEST_F(ReplaceTest, ReplaceNullsColumn)
 {
-  cudf::test::fixed_width_column_wrapper<int> input({{0, 0, 0, 0, 0}, {0, 0, 1, 1, 1}});
+  cudf::test::fixed_width_column_wrapper<int> input(
+    {{0, 0, 0, 0, 0}, {false, false, true, true, true}});
   cudf::test::fixed_width_column_wrapper<int> replacement({1, 1, 1, 1, 1});
   cudf::replace_nulls(input, replacement, cudf::test::get_default_stream());
 }
 
 TEST_F(ReplaceTest, ReplaceNullsScalar)
 {
-  cudf::test::fixed_width_column_wrapper<int> input({{0, 0, 0, 0, 0}, {0, 0, 1, 1, 1}});
+  cudf::test::fixed_width_column_wrapper<int> input(
+    {{0, 0, 0, 0, 0}, {false, false, true, true, true}});
   auto replacement = cudf::numeric_scalar<int>(1, true, cudf::test::get_default_stream());
   cudf::replace_nulls(input, replacement, cudf::test::get_default_stream());
 }
 
 TEST_F(ReplaceTest, ReplaceNullsPolicy)
 {
-  cudf::test::fixed_width_column_wrapper<int> input({{0, 0, 0, 0, 0}, {0, 0, 1, 1, 1}});
+  cudf::test::fixed_width_column_wrapper<int> input(
+    {{0, 0, 0, 0, 0}, {false, false, true, true, true}});
   cudf::replace_nulls(input, cudf::replace_policy::FOLLOWING, cudf::test::get_default_stream());
 }
 
diff --git a/cpp/tests/streams/strings/filter_test.cpp b/cpp/tests/streams/strings/filter_test.cpp
index 53ebe4e0b0d..b4e361201fd 100644
--- a/cpp/tests/streams/strings/filter_test.cpp
+++ b/cpp/tests/streams/strings/filter_test.cpp
@@ -41,7 +41,7 @@ TEST_F(StringsFilterTest, Translate)
   auto view  = cudf::strings_column_view(input);
 
   std::vector<std::pair<cudf::char_utf8, cudf::char_utf8>> translate_table{
-    make_entry("b", 0), make_entry("a", "A"), make_entry(" ", "_")};
+    make_entry("b", nullptr), make_entry("a", "A"), make_entry(" ", "_")};
   cudf::strings::translate(view, translate_table, cudf::test::get_default_stream());
 }
 
@@ -51,7 +51,7 @@ TEST_F(StringsFilterTest, Filter)
   auto view  = cudf::strings_column_view(input);
 
   std::vector<std::pair<cudf::char_utf8, cudf::char_utf8>> filter_table{
-    make_entry("b", 0), make_entry("a", "A"), make_entry(" ", "_")};
+    make_entry("b", nullptr), make_entry("a", "A"), make_entry(" ", "_")};
 
   auto const repl = cudf::string_scalar("X", true, cudf::test::get_default_stream());
   auto const keep = cudf::strings::filter_type::KEEP;
diff --git a/cpp/tests/strings/case_tests.cpp b/cpp/tests/strings/case_tests.cpp
index bb0e77a29d0..ce61a1bda8c 100644
--- a/cpp/tests/strings/case_tests.cpp
+++ b/cpp/tests/strings/case_tests.cpp
@@ -99,28 +99,28 @@ TEST_F(StringsCaseTest, Capitalize)
 {
   cudf::test::strings_column_wrapper strings(
     {"SȺȺnich xyZ", "Examples aBc", "thesé", "", "ARE\tTHE", "tést\tstrings", ""},
-    {1, 1, 1, 0, 1, 1, 1});
+    {true, true, true, false, true, true, true});
   auto strings_view = cudf::strings_column_view(strings);
 
   {
     auto results = cudf::strings::capitalize(strings_view);
     cudf::test::strings_column_wrapper expected(
       {"Sⱥⱥnich xyz", "Examples abc", "Thesé", "", "Are\tthe", "Tést\tstrings", ""},
-      {1, 1, 1, 0, 1, 1, 1});
+      {true, true, true, false, true, true, true});
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
   }
   {
     auto results = cudf::strings::capitalize(strings_view, std::string(" "));
     cudf::test::strings_column_wrapper expected(
       {"Sⱥⱥnich Xyz", "Examples Abc", "Thesé", "", "Are\tthe", "Tést\tstrings", ""},
-      {1, 1, 1, 0, 1, 1, 1});
+      {true, true, true, false, true, true, true});
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
   }
   {
     auto results = cudf::strings::capitalize(strings_view, std::string(" \t"));
     cudf::test::strings_column_wrapper expected(
       {"Sⱥⱥnich Xyz", "Examples Abc", "Thesé", "", "Are\tThe", "Tést\tStrings", ""},
-      {1, 1, 1, 0, 1, 1, 1});
+      {true, true, true, false, true, true, true});
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
   }
 }
@@ -129,47 +129,49 @@ TEST_F(StringsCaseTest, Title)
 {
   cudf::test::strings_column_wrapper input(
     {"SȺȺnich", "Examples aBc", "thesé", "", "ARE THE", "tést strings", "", "n2viDIA corp"},
-    {1, 1, 1, 0, 1, 1, 1, 1});
+    {true, true, true, false, true, true, true, true});
   auto strings_view = cudf::strings_column_view(input);
 
   auto results = cudf::strings::title(strings_view);
 
   cudf::test::strings_column_wrapper expected(
     {"Sⱥⱥnich", "Examples Abc", "Thesé", "", "Are The", "Tést Strings", "", "N2Vidia Corp"},
-    {1, 1, 1, 0, 1, 1, 1, 1});
+    {true, true, true, false, true, true, true, true});
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
 
   results = cudf::strings::title(strings_view, cudf::strings::string_character_types::ALPHANUM);
 
   cudf::test::strings_column_wrapper expected2(
     {"Sⱥⱥnich", "Examples Abc", "Thesé", "", "Are The", "Tést Strings", "", "N2vidia Corp"},
-    {1, 1, 1, 0, 1, 1, 1, 1});
+    {true, true, true, false, true, true, true, true});
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected2);
 }
 
 TEST_F(StringsCaseTest, IsTitle)
 {
-  cudf::test::strings_column_wrapper input({"Sⱥⱥnich",
-                                            "Examples Abc",
-                                            "Thesé Strings",
-                                            "",
-                                            "Are The",
-                                            "Tést strings",
-                                            "",
-                                            "N2Vidia Corp",
-                                            "SNAKE",
-                                            "!Abc",
-                                            " Eagle",
-                                            "A Test",
-                                            "12345",
-                                            "Alpha Not Upper Or Lower: ƻC",
-                                            "one More"},
-                                           {1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
+  cudf::test::strings_column_wrapper input(
+    {"Sⱥⱥnich",
+     "Examples Abc",
+     "Thesé Strings",
+     "",
+     "Are The",
+     "Tést strings",
+     "",
+     "N2Vidia Corp",
+     "SNAKE",
+     "!Abc",
+     " Eagle",
+     "A Test",
+     "12345",
+     "Alpha Not Upper Or Lower: ƻC",
+     "one More"},
+    {true, true, true, false, true, true, true, true, true, true, true, true, true, true, true});
 
   auto results = cudf::strings::is_title(cudf::strings_column_view(input));
 
   cudf::test::fixed_width_column_wrapper<bool> expected(
-    {1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0}, {1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1});
+    {1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0},
+    {true, true, true, false, true, true, true, true, true, true, true, true, true, true, true});
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
 }
 
diff --git a/cpp/tests/strings/chars_types_tests.cpp b/cpp/tests/strings/chars_types_tests.cpp
index fbc059186a8..7e530b2a34d 100644
--- a/cpp/tests/strings/chars_types_tests.cpp
+++ b/cpp/tests/strings/chars_types_tests.cpp
@@ -148,7 +148,22 @@ TEST_F(StringsCharsTest, Alphanumeric)
   auto results = cudf::strings::all_characters_of_type(
     strings_view, cudf::strings::string_character_types::ALPHANUM);
 
-  std::vector<bool> h_expected{1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0};
+  std::vector<bool> h_expected{true,
+                               true,
+                               false,
+                               true,
+                               false,
+                               false,
+                               false,
+                               false,
+                               false,
+                               true,
+                               true,
+                               true,
+                               false,
+                               true,
+                               true,
+                               false};
   cudf::test::fixed_width_column_wrapper<bool> expected(
     h_expected.begin(),
     h_expected.end(),
@@ -186,7 +201,22 @@ TEST_F(StringsCharsTest, AlphaNumericSpace)
   auto results = cudf::strings::all_characters_of_type(
     strings_view, (cudf::strings::string_character_types)types);
 
-  std::vector<bool> h_expected{1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1};
+  std::vector<bool> h_expected{true,
+                               true,
+                               false,
+                               true,
+                               true,
+                               false,
+                               false,
+                               false,
+                               false,
+                               true,
+                               true,
+                               true,
+                               true,
+                               true,
+                               true,
+                               true};
   cudf::test::fixed_width_column_wrapper<bool> expected(
     h_expected.begin(),
     h_expected.end(),
@@ -225,7 +255,22 @@ TEST_F(StringsCharsTest, Numerics)
   auto results = cudf::strings::all_characters_of_type(
     strings_view, (cudf::strings::string_character_types)types);
 
-  std::vector<bool> h_expected{0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0};
+  std::vector<bool> h_expected{false,
+                               false,
+                               false,
+                               false,
+                               false,
+                               false,
+                               false,
+                               false,
+                               false,
+                               true,
+                               false,
+                               true,
+                               false,
+                               true,
+                               false,
+                               false};
   cudf::test::fixed_width_column_wrapper<bool> expected(
     h_expected.begin(),
     h_expected.end(),
diff --git a/cpp/tests/strings/combine/concatenate_tests.cpp b/cpp/tests/strings/combine/concatenate_tests.cpp
index 95993e6ecbc..bb57d6f5e8a 100644
--- a/cpp/tests/strings/combine/concatenate_tests.cpp
+++ b/cpp/tests/strings/combine/concatenate_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -98,11 +98,11 @@ TEST_F(StringsCombineTest, Concatenate)
 TEST_F(StringsCombineTest, ConcatenateSkipNulls)
 {
   cudf::test::strings_column_wrapper strings1({"eee", "", "", "", "aa", "bbb", "ééé"},
-                                              {1, 0, 0, 1, 1, 1, 1});
+                                              {true, false, false, true, true, true, true});
   cudf::test::strings_column_wrapper strings2({"xyz", "", "d", "éa", "", "", "f"},
-                                              {1, 0, 1, 1, 1, 0, 1});
+                                              {true, false, true, true, true, false, true});
   cudf::test::strings_column_wrapper strings3({"q", "", "s", "t", "u", "", "w"},
-                                              {1, 1, 1, 1, 1, 0, 1});
+                                              {true, true, true, true, true, false, true});
 
   cudf::table_view table({strings1, strings2, strings3});
 
@@ -126,7 +126,8 @@ TEST_F(StringsCombineTest, ConcatenateSkipNulls)
   }
   {
     cudf::test::strings_column_wrapper expected(
-      {"eee+xyz+q", "", "", "+éa+t", "aa++u", "", "ééé+f+w"}, {1, 0, 0, 1, 1, 0, 1});
+      {"eee+xyz+q", "", "", "+éa+t", "aa++u", "", "ééé+f+w"},
+      {true, false, false, true, true, false, true});
     auto results = cudf::strings::concatenate(table,
                                               cudf::string_scalar("+"),
                                               cudf::string_scalar("", false),
diff --git a/cpp/tests/strings/combine/join_strings_tests.cpp b/cpp/tests/strings/combine/join_strings_tests.cpp
index ecc7432201f..817cbab461b 100644
--- a/cpp/tests/strings/combine/join_strings_tests.cpp
+++ b/cpp/tests/strings/combine/join_strings_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -82,10 +82,10 @@ TEST_F(JoinStringsTest, JoinZeroSizeStringsColumn)
 
 TEST_F(JoinStringsTest, JoinAllNullStringsColumn)
 {
-  cudf::test::strings_column_wrapper strings({"", "", ""}, {0, 0, 0});
+  cudf::test::strings_column_wrapper strings({"", "", ""}, {false, false, false});
 
   auto results = cudf::strings::join_strings(cudf::strings_column_view(strings));
-  cudf::test::strings_column_wrapper expected1({""}, {0});
+  cudf::test::strings_column_wrapper expected1({""}, {false});
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected1);
 
   results = cudf::strings::join_strings(
diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp
index 2d9e2035e5e..59423d5b927 100644
--- a/cpp/tests/strings/contains_tests.cpp
+++ b/cpp/tests/strings/contains_tests.cpp
@@ -86,8 +86,8 @@ TEST_F(StringsContainsTests, ContainsTest)
                                     "\n",
                                     "b.\\s*\n",
                                     ".*c",
-                                    "\\d\\d:\\d\\d:\\d\\d",
-                                    "\\d\\d?:\\d\\d?:\\d\\d?",
+                                    R"(\d\d:\d\d:\d\d)",
+                                    R"(\d\d?:\d\d?:\d\d?)",
                                     "[Hh]ello [Ww]orld",
                                     "\\bworld\\b",
                                     ".*"};
@@ -282,7 +282,7 @@ TEST_F(StringsContainsTests, OctalTest)
   results  = cudf::strings::contains_re(strings_view, *prog);
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
 
-  pattern  = std::string("[\\7][\\11][\\15]");
+  pattern  = std::string(R"([\7][\11][\15])");
   expected = cudf::test::fixed_width_column_wrapper<bool>({0, 0, 0, 0, 0, 1});
   prog     = cudf::strings::regex_program::create(pattern);
   results  = cudf::strings::contains_re(strings_view, *prog);
@@ -689,11 +689,11 @@ TEST_F(StringsContainsTests, ASCII)
   auto input = cudf::test::strings_column_wrapper({"abc \t\f\r 12", "áé 　❽❽", "aZ ❽4", "XYZ　8"});
   auto view = cudf::strings_column_view(input);
 
-  std::string patterns[] = {"\\w+[\\s]+\\d+",
-                            "[^\\W]+\\s+[^\\D]+",
-                            "[\\w]+[^\\S]+[\\d]+",
-                            "[\\w]+\\s+[\\d]+",
-                            "\\w+\\s+\\d+"};
+  std::string patterns[] = {R"(\w+[\s]+\d+)",
+                            R"([^\W]+\s+[^\D]+)",
+                            R"([\w]+[^\S]+[\d]+)",
+                            R"([\w]+\s+[\d]+)",
+                            R"(\w+\s+\d+)"};
 
   for (auto ptn : patterns) {
     auto expected_contains = cudf::test::fixed_width_column_wrapper<bool>({1, 0, 0, 0});
diff --git a/cpp/tests/strings/datetime_tests.cpp b/cpp/tests/strings/datetime_tests.cpp
index bb5c96a09bf..b3dc3010c67 100644
--- a/cpp/tests/strings/datetime_tests.cpp
+++ b/cpp/tests/strings/datetime_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -59,8 +59,8 @@ TEST_F(StringsDatetimeTest, ToTimestamp)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
 
   results = cudf::strings::is_timestamp(strings_view, "%Y-%m-%dT%H:%M:%SZ");
-  cudf::test::fixed_width_column_wrapper<bool> is_expected({1, 1, 0, 0, 1, 1, 1, 1},
-                                                           {1, 1, 0, 1, 1, 1, 1, 1});
+  cudf::test::fixed_width_column_wrapper<bool> is_expected(
+    {1, 1, 0, 0, 1, 1, 1, 1}, {true, true, false, true, true, true, true, true});
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, is_expected);
 }
 
diff --git a/cpp/tests/strings/extract_tests.cpp b/cpp/tests/strings/extract_tests.cpp
index 70112f7ca75..b26cbd5a549 100644
--- a/cpp/tests/strings/extract_tests.cpp
+++ b/cpp/tests/strings/extract_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -98,7 +98,7 @@ TEST_F(StringsExtractTests, ExtractDomainTest)
                                               "a23-44-13-2.deploy.static.akamaitechnologies.com"});
   auto strings_view = cudf::strings_column_view(strings);
 
-  std::string pattern = "([\\w]+[\\.].*[^/]|[\\-\\w]+[\\.].*[^/])";
+  std::string pattern = R"(([\w]+[\.].*[^/]|[\-\w]+[\.].*[^/]))";
 
   cudf::test::strings_column_wrapper expected1({
     "www.google.com",
@@ -126,11 +126,11 @@ TEST_F(StringsExtractTests, ExtractDomainTest)
 TEST_F(StringsExtractTests, ExtractEventTest)
 {
   std::vector<std::string> patterns({"(^[0-9]+\\.?[0-9]*),",
-                                     "search_name=\"([0-9A-Za-z\\s\\-\\(\\)]+)",
-                                     "message.ip=\"([\\w\\.]+)",
-                                     "message.hostname=\"([\\w\\.]+)",
-                                     "message.user_name=\"([\\w\\.\\@]+)",
-                                     "message\\.description=\"([\\w\\.\\s]+)"});
+                                     R"(search_name="([0-9A-Za-z\s\-\(\)]+))",
+                                     R"(message.ip="([\w\.]+))",
+                                     R"(message.hostname="([\w\.]+))",
+                                     R"(message.user_name="([\w\.\@]+))",
+                                     R"(message\.description="([\w\.\s]+))"});
 
   cudf::test::strings_column_wrapper strings(
     {"15162388.26, search_name=\"Test Search Name\", orig_time=\"1516238826\", "
@@ -164,7 +164,7 @@ TEST_F(StringsExtractTests, MultiLine)
 
   auto pattern = std::string("(^[a-c]+$)");
   cudf::test::strings_column_wrapper expected_multiline({"abc", "abc", "abc", "", "abc", "abc"},
-                                                        {1, 1, 1, 0, 1, 1});
+                                                        {true, true, true, false, true, true});
   auto expected = cudf::table_view{{expected_multiline}};
   auto prog = cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::MULTILINE);
   auto results = cudf::strings::extract(view, *prog);
@@ -172,7 +172,7 @@ TEST_F(StringsExtractTests, MultiLine)
 
   pattern = std::string("^([a-c]+)$");
   cudf::test::strings_column_wrapper expected_default({"", "", "abc", "", "abc", ""},
-                                                      {0, 0, 1, 0, 1, 0});
+                                                      {false, false, true, false, true, false});
   expected = cudf::table_view{{expected_default}};
   prog     = cudf::strings::regex_program::create(pattern);
   results  = cudf::strings::extract(view, *prog);
@@ -186,13 +186,14 @@ TEST_F(StringsExtractTests, DotAll)
 
   auto pattern = std::string("(a.*f)");
   cudf::test::strings_column_wrapper expected_dotall({"abc\nfa\nef", "abbc\nfff", "abcdef", ""},
-                                                     {1, 1, 1, 0});
+                                                     {true, true, true, false});
   auto expected = cudf::table_view{{expected_dotall}};
   auto prog     = cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::DOTALL);
   auto results  = cudf::strings::extract(view, *prog);
   CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected);
 
-  cudf::test::strings_column_wrapper expected_default({"", "", "abcdef", ""}, {0, 0, 1, 0});
+  cudf::test::strings_column_wrapper expected_default({"", "", "abcdef", ""},
+                                                      {false, false, true, false});
   expected = cudf::table_view{{expected_default}};
   prog     = cudf::strings::regex_program::create(pattern);
   results  = cudf::strings::extract(view, *prog);
diff --git a/cpp/tests/strings/fill_tests.cpp b/cpp/tests/strings/fill_tests.cpp
index aadd68402c8..ef54b00d08b 100644
--- a/cpp/tests/strings/fill_tests.cpp
+++ b/cpp/tests/strings/fill_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -56,13 +56,13 @@ TEST_F(StringsFillTest, Fill)
   {
     auto results = cudf::fill(input, 0, 7, cudf::string_scalar(""));
     cudf::test::strings_column_wrapper expected({"", "", "", "", "", "", ""},
-                                                {1, 1, 1, 1, 1, 1, 1});
+                                                {true, true, true, true, true, true, true});
     CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
   }
   {
     auto results = cudf::fill(input, 0, 7, cudf::string_scalar("", false));
     cudf::test::strings_column_wrapper expected({"", "", "", "", "", "", ""},
-                                                {0, 0, 0, 0, 0, 0, 0});
+                                                {false, false, false, false, false, false, false});
     CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
   }
 }
diff --git a/cpp/tests/strings/find_multiple_tests.cpp b/cpp/tests/strings/find_multiple_tests.cpp
index 57cba495ba0..41a5940c880 100644
--- a/cpp/tests/strings/find_multiple_tests.cpp
+++ b/cpp/tests/strings/find_multiple_tests.cpp
@@ -69,7 +69,7 @@ TEST_F(StringsFindMultipleTest, ZeroSizeStringsColumn)
 
 TEST_F(StringsFindMultipleTest, ErrorTest)
 {
-  cudf::test::strings_column_wrapper strings({"this string intentionally left blank"}, {0});
+  cudf::test::strings_column_wrapper strings({"this string intentionally left blank"}, {false});
   auto strings_view = cudf::strings_column_view(strings);
 
   auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view();
diff --git a/cpp/tests/strings/find_tests.cpp b/cpp/tests/strings/find_tests.cpp
index 7f89cc9fb53..2da95ba5c27 100644
--- a/cpp/tests/strings/find_tests.cpp
+++ b/cpp/tests/strings/find_tests.cpp
@@ -34,55 +34,55 @@ struct StringsFindTest : public cudf::test::BaseFixture {};
 TEST_F(StringsFindTest, Find)
 {
   cudf::test::strings_column_wrapper strings({"Héllo", "thesé", "", "lest", "tést strings", ""},
-                                             {1, 1, 0, 1, 1, 1});
+                                             {true, true, false, true, true, true});
   auto strings_view = cudf::strings_column_view(strings);
 
   {
     auto const target = cudf::string_scalar("é");
-    cudf::test::fixed_width_column_wrapper<cudf::size_type> expected({1, 4, -1, -1, 1, -1},
-                                                                     {1, 1, 0, 1, 1, 1});
+    cudf::test::fixed_width_column_wrapper<cudf::size_type> expected(
+      {1, 4, -1, -1, 1, -1}, {true, true, false, true, true, true});
     auto results = cudf::strings::find(strings_view, target);
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
     results = cudf::strings::rfind(strings_view, target);
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
   }
   {
-    cudf::test::fixed_width_column_wrapper<cudf::size_type> expected({3, -1, -1, 0, -1, -1},
-                                                                     {1, 1, 0, 1, 1, 1});
+    cudf::test::fixed_width_column_wrapper<cudf::size_type> expected(
+      {3, -1, -1, 0, -1, -1}, {true, true, false, true, true, true});
     auto results = cudf::strings::rfind(strings_view, cudf::string_scalar("l"));
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
   }
   {
     auto const target = cudf::string_scalar("es");
-    cudf::test::fixed_width_column_wrapper<cudf::size_type> expected({-1, 2, -1, 1, -1, -1},
-                                                                     {1, 1, 0, 1, 1, 1});
+    cudf::test::fixed_width_column_wrapper<cudf::size_type> expected(
+      {-1, 2, -1, 1, -1, -1}, {true, true, false, true, true, true});
     auto results = cudf::strings::find(strings_view, target);
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
     results = cudf::strings::rfind(strings_view, target);
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
   }
   {
-    cudf::test::fixed_width_column_wrapper<cudf::size_type> expected({0, 0, 0, 0, 0, 0},
-                                                                     {1, 1, 0, 1, 1, 1});
+    cudf::test::fixed_width_column_wrapper<cudf::size_type> expected(
+      {0, 0, 0, 0, 0, 0}, {true, true, false, true, true, true});
     auto results = cudf::strings::find(strings_view, cudf::string_scalar(""));
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
   }
   {
-    cudf::test::fixed_width_column_wrapper<cudf::size_type> expected({5, 5, 0, 4, 12, 0},
-                                                                     {1, 1, 0, 1, 1, 1});
+    cudf::test::fixed_width_column_wrapper<cudf::size_type> expected(
+      {5, 5, 0, 4, 12, 0}, {true, true, false, true, true, true});
     auto results = cudf::strings::rfind(strings_view, cudf::string_scalar(""));
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
   }
   {
     auto const targets = cudf::test::strings_column_wrapper({"l", "t", "", "x", "é", "o"});
-    cudf::test::fixed_width_column_wrapper<cudf::size_type> expected({2, 0, 0, -1, 1, -1},
-                                                                     {1, 1, 0, 1, 1, 1});
+    cudf::test::fixed_width_column_wrapper<cudf::size_type> expected(
+      {2, 0, 0, -1, 1, -1}, {true, true, false, true, true, true});
     auto results = cudf::strings::find(strings_view, cudf::strings_column_view(targets));
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
   }
   {
-    cudf::test::fixed_width_column_wrapper<cudf::size_type> expected({0, 0, 0, 0, 0, 0},
-                                                                     {1, 1, 0, 1, 1, 1});
+    cudf::test::fixed_width_column_wrapper<cudf::size_type> expected(
+      {0, 0, 0, 0, 0, 0}, {true, true, false, true, true, true});
     auto results = cudf::strings::find(strings_view, strings_view);
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
   }
@@ -91,13 +91,13 @@ TEST_F(StringsFindTest, Find)
 TEST_F(StringsFindTest, FindWithNullTargets)
 {
   cudf::test::strings_column_wrapper input({"hello hello", "thesé help", "", "helicopter", "", "x"},
-                                           {1, 1, 0, 1, 1, 1});
+                                           {true, true, false, true, true, true});
   auto strings_view = cudf::strings_column_view(input);
 
   auto const targets = cudf::test::strings_column_wrapper(
-    {"lo he", "", "hhh", "cop", "help", "xyz"}, {1, 0, 1, 1, 1, 1});
-  cudf::test::fixed_width_column_wrapper<cudf::size_type> expected({3, -1, -1, 4, -1, -1},
-                                                                   {1, 0, 0, 1, 1, 1});
+    {"lo he", "", "hhh", "cop", "help", "xyz"}, {true, false, true, true, true, true});
+  cudf::test::fixed_width_column_wrapper<cudf::size_type> expected(
+    {3, -1, -1, 4, -1, -1}, {true, false, false, true, true, true});
   auto results = cudf::strings::find(strings_view, cudf::strings_column_view(targets));
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
 }
@@ -144,25 +144,26 @@ TEST_F(StringsFindTest, FindLongStrings)
 TEST_F(StringsFindTest, Contains)
 {
   cudf::test::strings_column_wrapper strings(
-    {"Héllo", "thesé", "", "lease", "tést strings", "", "eé", "éte"}, {1, 1, 0, 1, 1, 1, 1, 1});
+    {"Héllo", "thesé", "", "lease", "tést strings", "", "eé", "éte"},
+    {true, true, false, true, true, true, true, true});
   auto strings_view = cudf::strings_column_view(strings);
   {
-    cudf::test::fixed_width_column_wrapper<bool> expected({0, 1, 0, 1, 0, 0, 1, 1},
-                                                          {1, 1, 0, 1, 1, 1, 1, 1});
+    cudf::test::fixed_width_column_wrapper<bool> expected(
+      {0, 1, 0, 1, 0, 0, 1, 1}, {true, true, false, true, true, true, true, true});
     auto results = cudf::strings::contains(strings_view, cudf::string_scalar("e"));
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
   }
   {
-    cudf::test::fixed_width_column_wrapper<bool> expected({1, 1, 0, 0, 1, 0, 1, 1},
-                                                          {1, 1, 0, 1, 1, 1, 1, 1});
+    cudf::test::fixed_width_column_wrapper<bool> expected(
+      {1, 1, 0, 0, 1, 0, 1, 1}, {true, true, false, true, true, true, true, true});
     auto results = cudf::strings::contains(strings_view, cudf::string_scalar("é"));
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
   }
   {
     cudf::test::strings_column_wrapper targets({"Hello", "é", "e", "x", "", "", "n", "t"},
-                                               {1, 1, 1, 1, 1, 0, 1, 1});
-    cudf::test::fixed_width_column_wrapper<bool> expected({0, 1, 0, 0, 1, 0, 0, 1},
-                                                          {1, 1, 0, 1, 1, 1, 1, 1});
+                                               {true, true, true, true, true, false, true, true});
+    cudf::test::fixed_width_column_wrapper<bool> expected(
+      {0, 1, 0, 0, 1, 0, 0, 1}, {true, true, false, true, true, true, true, true});
     auto results = cudf::strings::contains(strings_view, cudf::strings_column_view(targets));
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
   }
@@ -200,10 +201,11 @@ TEST_F(StringsFindTest, ContainsLongStrings)
 TEST_F(StringsFindTest, StartsWith)
 {
   cudf::test::strings_column_wrapper strings({"Héllo", "thesé", "", "lease", "tést strings", ""},
-                                             {1, 1, 0, 1, 1, 1});
+                                             {true, true, false, true, true, true});
   auto strings_view = cudf::strings_column_view(strings);
   {
-    cudf::test::fixed_width_column_wrapper<bool> expected({0, 1, 0, 0, 1, 0}, {1, 1, 0, 1, 1, 1});
+    cudf::test::fixed_width_column_wrapper<bool> expected({0, 1, 0, 0, 1, 0},
+                                                          {true, true, false, true, true, true});
     auto results = cudf::strings::starts_with(strings_view, cudf::string_scalar("t"));
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
   }
@@ -212,12 +214,14 @@ TEST_F(StringsFindTest, StartsWith)
     cudf::test::strings_column_wrapper targets(h_targets.begin(), h_targets.end());
 
     auto targets_view = cudf::strings_column_view(targets);
-    cudf::test::fixed_width_column_wrapper<bool> expected({0, 1, 0, 0, 1, 1}, {1, 1, 0, 1, 1, 1});
+    cudf::test::fixed_width_column_wrapper<bool> expected({0, 1, 0, 0, 1, 1},
+                                                          {true, true, false, true, true, true});
     auto results = cudf::strings::starts_with(strings_view, targets_view);
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
   }
   {
-    cudf::test::fixed_width_column_wrapper<bool> expected({0, 1, 0, 0, 0, 0}, {1, 1, 0, 1, 1, 1});
+    cudf::test::fixed_width_column_wrapper<bool> expected({0, 1, 0, 0, 0, 0},
+                                                          {true, true, false, true, true, true});
     auto results = cudf::strings::starts_with(strings_view, cudf::string_scalar("thesé"));
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
   }
@@ -229,7 +233,8 @@ TEST_F(StringsFindTest, StartsWith)
       thrust::make_transform_iterator(h_targets.begin(), [](auto str) { return str != nullptr; }));
 
     auto targets_view = cudf::strings_column_view(targets);
-    cudf::test::fixed_width_column_wrapper<bool> expected({0, 1, 0, 0, 0, 1}, {1, 1, 0, 1, 1, 1});
+    cudf::test::fixed_width_column_wrapper<bool> expected({0, 1, 0, 0, 0, 1},
+                                                          {true, true, false, true, true, true});
     auto results = cudf::strings::starts_with(strings_view, targets_view);
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
   }
@@ -238,10 +243,11 @@ TEST_F(StringsFindTest, StartsWith)
 TEST_F(StringsFindTest, EndsWith)
 {
   cudf::test::strings_column_wrapper strings({"Héllo", "thesé", "", "lease", "tést strings", ""},
-                                             {1, 1, 0, 1, 1, 1});
+                                             {true, true, false, true, true, true});
   auto strings_view = cudf::strings_column_view(strings);
   {
-    cudf::test::fixed_width_column_wrapper<bool> expected({0, 0, 0, 1, 0, 0}, {1, 1, 0, 1, 1, 1});
+    cudf::test::fixed_width_column_wrapper<bool> expected({0, 0, 0, 1, 0, 0},
+                                                          {true, true, false, true, true, true});
     auto results = cudf::strings::ends_with(strings_view, cudf::string_scalar("se"));
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
   }
@@ -250,12 +256,14 @@ TEST_F(StringsFindTest, EndsWith)
     cudf::test::strings_column_wrapper targets(h_targets.begin(), h_targets.end());
 
     auto targets_view = cudf::strings_column_view(targets);
-    cudf::test::fixed_width_column_wrapper<bool> expected({0, 1, 0, 0, 1, 1}, {1, 1, 0, 1, 1, 1});
+    cudf::test::fixed_width_column_wrapper<bool> expected({0, 1, 0, 0, 1, 1},
+                                                          {true, true, false, true, true, true});
     auto results = cudf::strings::ends_with(strings_view, targets_view);
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
   }
   {
-    cudf::test::fixed_width_column_wrapper<bool> expected({0, 1, 0, 0, 0, 0}, {1, 1, 0, 1, 1, 1});
+    cudf::test::fixed_width_column_wrapper<bool> expected({0, 1, 0, 0, 0, 0},
+                                                          {true, true, false, true, true, true});
     auto results = cudf::strings::ends_with(strings_view, cudf::string_scalar("thesé"));
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
   }
@@ -267,7 +275,8 @@ TEST_F(StringsFindTest, EndsWith)
       thrust::make_transform_iterator(h_targets.begin(), [](auto str) { return str != nullptr; }));
 
     auto targets_view = cudf::strings_column_view(targets);
-    cudf::test::fixed_width_column_wrapper<bool> expected({0, 1, 0, 0, 1, 1}, {1, 1, 0, 1, 1, 1});
+    cudf::test::fixed_width_column_wrapper<bool> expected({0, 1, 0, 0, 1, 1},
+                                                          {true, true, false, true, true, true});
     auto results = cudf::strings::ends_with(strings_view, targets_view);
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
   }
@@ -296,10 +305,11 @@ TEST_F(StringsFindTest, ZeroSizeStringsColumn)
 TEST_F(StringsFindTest, EmptyTarget)
 {
   cudf::test::strings_column_wrapper strings({"Héllo", "thesé", "", "lease", "tést strings", ""},
-                                             {1, 1, 0, 1, 1, 1});
+                                             {true, true, false, true, true, true});
   auto strings_view = cudf::strings_column_view(strings);
 
-  cudf::test::fixed_width_column_wrapper<bool> expected({1, 1, 1, 1, 1, 1}, {1, 1, 0, 1, 1, 1});
+  cudf::test::fixed_width_column_wrapper<bool> expected({1, 1, 1, 1, 1, 1},
+                                                        {true, true, false, true, true, true});
   auto results = cudf::strings::contains(strings_view, cudf::string_scalar(""));
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
   results = cudf::strings::starts_with(strings_view, cudf::string_scalar(""));
@@ -307,8 +317,8 @@ TEST_F(StringsFindTest, EmptyTarget)
   results = cudf::strings::ends_with(strings_view, cudf::string_scalar(""));
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
 
-  cudf::test::fixed_width_column_wrapper<cudf::size_type> expected_find({0, 0, 0, 0, 0, 0},
-                                                                        {1, 1, 0, 1, 1, 1});
+  cudf::test::fixed_width_column_wrapper<cudf::size_type> expected_find(
+    {0, 0, 0, 0, 0, 0}, {true, true, false, true, true, true});
   results = cudf::strings::find(strings_view, cudf::string_scalar(""));
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected_find);
   auto expected_rfind = cudf::strings::count_characters(strings_view);
@@ -325,7 +335,7 @@ TEST_F(StringsFindTest, AllEmpty)
   cudf::test::fixed_width_column_wrapper<cudf::size_type> expected32(h_expected32.begin(),
                                                                      h_expected32.end());
 
-  std::vector<bool> h_expected8(h_strings.size(), 0);
+  std::vector<bool> h_expected8(h_strings.size(), false);
   cudf::test::fixed_width_column_wrapper<bool> expected8(h_expected8.begin(), h_expected8.end());
 
   auto strings_view = cudf::strings_column_view(strings);
@@ -419,8 +429,8 @@ TEST_P(FindParmsTest, Find)
   {
     auto results = cudf::strings::find(strings_view, cudf::string_scalar("e"), position);
     std::vector<cudf::size_type> h_expected;
-    for (auto itr = h_strings.begin(); itr != h_strings.end(); ++itr)
-      h_expected.push_back(static_cast<cudf::size_type>((*itr).find("e", position)));
+    for (auto& h_string : h_strings)
+      h_expected.push_back(static_cast<cudf::size_type>(h_string.find("e", position)));
     cudf::test::fixed_width_column_wrapper<cudf::size_type> expected(h_expected.begin(),
                                                                      h_expected.end());
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
@@ -428,8 +438,8 @@ TEST_P(FindParmsTest, Find)
   {
     auto results = cudf::strings::rfind(strings_view, cudf::string_scalar("e"), 0, position + 1);
     std::vector<cudf::size_type> h_expected;
-    for (auto itr = h_strings.begin(); itr != h_strings.end(); ++itr)
-      h_expected.push_back(static_cast<cudf::size_type>((*itr).rfind("e", position)));
+    for (auto& h_string : h_strings)
+      h_expected.push_back(static_cast<cudf::size_type>(h_string.rfind("e", position)));
     cudf::test::fixed_width_column_wrapper<cudf::size_type> expected(h_expected.begin(),
                                                                      h_expected.end());
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
diff --git a/cpp/tests/strings/findall_tests.cpp b/cpp/tests/strings/findall_tests.cpp
index fe27beed197..4582dcb1e38 100644
--- a/cpp/tests/strings/findall_tests.cpp
+++ b/cpp/tests/strings/findall_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -31,7 +31,7 @@ struct StringsFindallTests : public cudf::test::BaseFixture {};
 
 TEST_F(StringsFindallTests, FindallTest)
 {
-  bool valids[] = {1, 1, 1, 1, 1, 0, 1, 1};
+  bool valids[] = {true, true, true, true, true, false, true, true};
   cudf::test::strings_column_wrapper input(
     {"3-A", "4-May 5-Day 6-Hay", "12-Dec-2021-Jan", "Feb-March", "4 ABC", "", "", "25-9000-Hal"},
     valids);
@@ -83,7 +83,7 @@ TEST_F(StringsFindallTests, DotAll)
 TEST_F(StringsFindallTests, MediumRegex)
 {
   // This results in 15 regex instructions and falls in the 'medium' range.
-  std::string medium_regex = "(\\w+) (\\w+) (\\d+)";
+  std::string medium_regex = R"((\w+) (\w+) (\d+))";
   auto prog                = cudf::strings::regex_program::create(medium_regex);
 
   cudf::test::strings_column_wrapper input({"first words 1234 and just numbers 9876", "neither"});
diff --git a/cpp/tests/strings/fixed_point_tests.cpp b/cpp/tests/strings/fixed_point_tests.cpp
index 9205207cc53..79054551498 100644
--- a/cpp/tests/strings/fixed_point_tests.cpp
+++ b/cpp/tests/strings/fixed_point_tests.cpp
@@ -54,7 +54,7 @@ TYPED_TEST(StringsFixedPointConvertTest, ToFixedPoint)
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_scaled);
 
   cudf::test::strings_column_wrapper strings_nulls(
-    {"1234", "-876", "543", "900000", "25E5", "", ""}, {1, 1, 1, 1, 1, 1, 0});
+    {"1234", "-876", "543", "900000", "25E5", "", ""}, {true, true, true, true, true, true, false});
   results = cudf::strings::to_fixed_point(cudf::strings_column_view(strings_nulls),
                                           cudf::data_type{cudf::type_to_id<DecimalType>()});
   auto const expected_nulls = fp_wrapper{
@@ -205,14 +205,14 @@ TYPED_TEST(StringsFixedPointConvertTest, FromFixedPoint)
     fp_wrapper({110, -222, 3330, 4, -550, 0}, {1, 1, 1, 1, 1, 0}, numeric::scale_type{2});
   results = cudf::strings::from_fixed_point(positive_scale);
   cudf::test::strings_column_wrapper positive_expected(
-    {"11000", "-22200", "333000", "400", "-55000", ""}, {1, 1, 1, 1, 1, 0});
+    {"11000", "-22200", "333000", "400", "-55000", ""}, {true, true, true, true, true, false});
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, positive_expected);
 
   auto const zero_scale =
     fp_wrapper({0, -222, 3330, 4, -550, 0}, {0, 1, 1, 1, 1, 1}, numeric::scale_type{0});
   results = cudf::strings::from_fixed_point(zero_scale);
   cudf::test::strings_column_wrapper zero_expected({"", "-222", "3330", "4", "-550", "0"},
-                                                   {0, 1, 1, 1, 1, 1});
+                                                   {false, true, true, true, true, true});
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, zero_expected);
 }
 
diff --git a/cpp/tests/strings/integers_tests.cpp b/cpp/tests/strings/integers_tests.cpp
index c8f292f55b2..51e9b3bd0a0 100644
--- a/cpp/tests/strings/integers_tests.cpp
+++ b/cpp/tests/strings/integers_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -352,11 +352,11 @@ TEST_F(StringsConvertTest, HexToInteger)
 
   {
     std::vector<int32_t> h_expected;
-    for (auto itr = h_strings.begin(); itr != h_strings.end(); ++itr) {
-      if (*itr == nullptr)
+    for (auto& h_string : h_strings) {
+      if (h_string == nullptr)
         h_expected.push_back(0);
       else
-        h_expected.push_back(static_cast<int>(std::stol(std::string(*itr), 0, 16)));
+        h_expected.push_back(static_cast<int>(std::stol(std::string(h_string), nullptr, 16)));
     }
 
     auto results = cudf::strings::hex_to_integers(cudf::strings_column_view(strings),
@@ -369,11 +369,11 @@ TEST_F(StringsConvertTest, HexToInteger)
   }
   {
     std::vector<int64_t> h_expected;
-    for (auto itr = h_strings.begin(); itr != h_strings.end(); ++itr) {
-      if (*itr == nullptr)
+    for (auto& h_string : h_strings) {
+      if (h_string == nullptr)
         h_expected.push_back(0);
       else
-        h_expected.push_back(std::stol(std::string(*itr), 0, 16));
+        h_expected.push_back(std::stol(std::string(h_string), nullptr, 16));
     }
 
     auto results = cudf::strings::hex_to_integers(cudf::strings_column_view(strings),
@@ -404,8 +404,9 @@ TEST_F(StringsConvertTest, IsHex)
     h_strings.begin(),
     h_strings.end(),
     thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
-  cudf::test::fixed_width_column_wrapper<bool> expected({0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0},
-                                                        {1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1});
+  cudf::test::fixed_width_column_wrapper<bool> expected(
+    {0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0},
+    {true, true, false, true, true, true, true, true, true, true, true, true});
   auto results = cudf::strings::is_hex(cudf::strings_column_view(strings));
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
 }
@@ -447,11 +448,12 @@ TYPED_TEST(StringsIntegerConvertTest, IntegerToHex)
 TEST_F(StringsConvertTest, IntegerToHexWithNull)
 {
   cudf::test::fixed_width_column_wrapper<int32_t> integers(
-    {123456, -1, 0, 0, 12, 12345, 123456789, -123456789}, {1, 1, 1, 0, 1, 1, 1, 1});
+    {123456, -1, 0, 0, 12, 12345, 123456789, -123456789},
+    {true, true, true, false, true, true, true, true});
 
   cudf::test::strings_column_wrapper expected(
     {"01E240", "FFFFFFFF", "00", "", "0C", "3039", "075BCD15", "F8A432EB"},
-    {1, 1, 1, 0, 1, 1, 1, 1});
+    {true, true, true, false, true, true, true, true});
 
   auto results = cudf::strings::integers_to_hex(integers);
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
diff --git a/cpp/tests/strings/ipv4_tests.cpp b/cpp/tests/strings/ipv4_tests.cpp
index 2b2d5730ca7..3bfe0f9727e 100644
--- a/cpp/tests/strings/ipv4_tests.cpp
+++ b/cpp/tests/strings/ipv4_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -104,8 +104,9 @@ TEST_F(StringsConvertTest, IsIPv4)
     h_strings.begin(),
     h_strings.end(),
     thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
-  cudf::test::fixed_width_column_wrapper<bool> expected({0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0},
-                                                        {1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1});
+  cudf::test::fixed_width_column_wrapper<bool> expected(
+    {0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0},
+    {true, true, false, true, true, true, true, true, true, true, true, true});
   auto results = cudf::strings::is_ipv4(cudf::strings_column_view(strings));
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
 }
diff --git a/cpp/tests/strings/like_tests.cpp b/cpp/tests/strings/like_tests.cpp
index 4352a1ed584..6aedbdeb537 100644
--- a/cpp/tests/strings/like_tests.cpp
+++ b/cpp/tests/strings/like_tests.cpp
@@ -26,12 +26,13 @@ struct StringsLikeTests : public cudf::test::BaseFixture {};
 TEST_F(StringsLikeTests, Basic)
 {
   cudf::test::strings_column_wrapper input({"abc", "a bc", "ABC", "abcd", " abc", "", "", "áéêú"},
-                                           {1, 1, 1, 1, 1, 1, 0, 1});
+                                           {true, true, true, true, true, true, false, true});
   auto const sv      = cudf::strings_column_view(input);
   auto const pattern = std::string("abc");
   auto const results = cudf::strings::like(sv, pattern);
   cudf::test::fixed_width_column_wrapper<bool> expected(
-    {true, false, false, false, false, false, false, false}, {1, 1, 1, 1, 1, 1, 0, 1});
+    {true, false, false, false, false, false, false, false},
+    {true, true, true, true, true, true, false, true});
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected);
 }
 
@@ -201,7 +202,7 @@ TEST_F(StringsLikeTests, Errors)
   EXPECT_THROW(cudf::strings::like(sv, invalid_str), cudf::logic_error);
   EXPECT_THROW(cudf::strings::like(sv, std::string("3"), invalid_str), cudf::logic_error);
 
-  auto patterns          = cudf::test::strings_column_wrapper({"3", ""}, {1, 0});
+  auto patterns          = cudf::test::strings_column_wrapper({"3", ""}, {true, false});
   auto const sv_patterns = cudf::strings_column_view(patterns);
   EXPECT_THROW(cudf::strings::like(sv, sv_patterns), cudf::logic_error);
   EXPECT_THROW(cudf::strings::like(sv, sv, invalid_str), cudf::logic_error);
diff --git a/cpp/tests/strings/pad_tests.cpp b/cpp/tests/strings/pad_tests.cpp
index 81ec87a12a8..693c043ac3f 100644
--- a/cpp/tests/strings/pad_tests.cpp
+++ b/cpp/tests/strings/pad_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -115,8 +115,7 @@ TEST_P(PadParameters, Padding)
   auto results          = cudf::strings::pad(strings_view, width, cudf::strings::side_type::RIGHT);
 
   std::vector<std::string> h_expected;
-  for (auto itr = h_strings.begin(); itr != h_strings.end(); ++itr) {
-    std::string str      = *itr;
+  for (auto str : h_strings) {
     cudf::size_type size = str.size();
     if (size < width) str.insert(size, width - size, ' ');
     h_expected.push_back(str);
diff --git a/cpp/tests/strings/replace_regex_tests.cpp b/cpp/tests/strings/replace_regex_tests.cpp
index d1c545b0e2f..8c0482653fb 100644
--- a/cpp/tests/strings/replace_regex_tests.cpp
+++ b/cpp/tests/strings/replace_regex_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -165,7 +165,7 @@ TEST_F(StringsReplaceRegexTest, Alternation)
     {"16  6  brr  232323  1  hello  90", "123 ABC 00 2022", "abé123  4567  89xyz"});
   auto sv = cudf::strings_column_view(input);
 
-  auto pattern = std::string("(^|\\s)\\d+(\\s|$)");
+  auto pattern = std::string(R"((^|\s)\d+(\s|$))");
   auto repl    = cudf::string_scalar("_");
   auto expected =
     cudf::test::strings_column_wrapper({"__ brr __ hello _", "_ABC_2022", "abé123 _ 89xyz"});
@@ -173,7 +173,7 @@ TEST_F(StringsReplaceRegexTest, Alternation)
   auto results = cudf::strings::replace_re(sv, *prog, repl);
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
 
-  pattern = std::string("(\\s|^)\\d+($|\\s)");
+  pattern = std::string(R"((\s|^)\d+($|\s))");
   prog    = cudf::strings::regex_program::create(pattern);
   results = cudf::strings::replace_re(sv, *prog, repl);
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
diff --git a/cpp/tests/strings/replace_tests.cpp b/cpp/tests/strings/replace_tests.cpp
index ef4f3bc2b2a..3aa7467d156 100644
--- a/cpp/tests/strings/replace_tests.cpp
+++ b/cpp/tests/strings/replace_tests.cpp
@@ -37,10 +37,10 @@ struct StringsReplaceTest : public cudf::test::BaseFixture {
                                        "",
                                        nullptr};
 
-    return cudf::test::strings_column_wrapper(
+    return {
       h_strings.begin(),
       h_strings.end(),
-      thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
+      thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; })};
   }
 
   std::unique_ptr<cudf::column> build_large(cudf::column_view const& first,
@@ -429,7 +429,7 @@ TEST_F(StringsReplaceTest, ReplaceMultiLong)
      "Test string for overlap check: bananaápple bananá ápplebananá banápple ápple bananá",
      "",
      ""},
-    {1, 1, 1, 1, 0, 1});
+    {true, true, true, true, false, true});
   auto strings_view = cudf::strings_column_view(input);
 
   auto targets      = cudf::test::strings_column_wrapper({"78901", "bananá", "ápple", "78"});
@@ -463,7 +463,7 @@ TEST_F(StringsReplaceTest, ReplaceMultiLong)
        "Test string for overlap check: bananaavocado PEAR avocadoPEAR banavocado avocado PEAR",
        "",
        ""},
-      {1, 1, 1, 1, 0, 1});
+      {true, true, true, true, false, true});
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
   }
 
@@ -491,7 +491,7 @@ TEST_F(StringsReplaceTest, ReplaceMultiLong)
        "*",
        "",
        ""},
-      {1, 1, 1, 1, 0, 1});
+      {true, true, true, true, false, true});
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
   }
 
@@ -527,7 +527,7 @@ TEST_F(StringsReplaceTest, ReplaceMultiLong)
        "Test string for overlap check: bananaápple bananá ápplebananá banápple ápple bananá",
        "",
        ""},
-      {1, 1, 1, 1, 0, 1});
+      {true, true, true, true, false, true});
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
   }
 }
diff --git a/cpp/tests/strings/reverse_tests.cpp b/cpp/tests/strings/reverse_tests.cpp
index 3df42b61ebf..40858f1ad23 100644
--- a/cpp/tests/strings/reverse_tests.cpp
+++ b/cpp/tests/strings/reverse_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -29,17 +29,19 @@ struct StringsReverseTest : public cudf::test::BaseFixture {};
 
 TEST_F(StringsReverseTest, Reverse)
 {
-  auto input = cudf::test::strings_column_wrapper(
-    {"abcdef", "12345", "", "", "aébé", "A é Z", "X", "é"}, {1, 1, 1, 0, 1, 1, 1, 1});
-  auto results  = cudf::strings::reverse(cudf::strings_column_view(input));
-  auto expected = cudf::test::strings_column_wrapper(
-    {"fedcba", "54321", "", "", "ébéa", "Z é A", "X", "é"}, {1, 1, 1, 0, 1, 1, 1, 1});
+  auto input =
+    cudf::test::strings_column_wrapper({"abcdef", "12345", "", "", "aébé", "A é Z", "X", "é"},
+                                       {true, true, true, false, true, true, true, true});
+  auto results = cudf::strings::reverse(cudf::strings_column_view(input));
+  auto expected =
+    cudf::test::strings_column_wrapper({"fedcba", "54321", "", "", "ébéa", "Z é A", "X", "é"},
+                                       {true, true, true, false, true, true, true, true});
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
 
   auto sliced = cudf::slice(input, {1, 7}).front();
   results     = cudf::strings::reverse(cudf::strings_column_view(sliced));
-  expected =
-    cudf::test::strings_column_wrapper({"54321", "", "", "ébéa", "Z é A", "X"}, {1, 1, 0, 1, 1, 1});
+  expected    = cudf::test::strings_column_wrapper({"54321", "", "", "ébéa", "Z é A", "X"},
+                                                   {true, true, false, true, true, true});
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
 }
 
diff --git a/cpp/tests/strings/slice_tests.cpp b/cpp/tests/strings/slice_tests.cpp
index 92230d06672..52e439bd93f 100644
--- a/cpp/tests/strings/slice_tests.cpp
+++ b/cpp/tests/strings/slice_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -62,8 +62,8 @@ TEST_P(Parameters, Substring)
   auto results        = cudf::strings::slice_strings(strings_column, start);
 
   std::vector<std::string> h_expected;
-  for (auto itr = h_strings.begin(); itr != h_strings.end(); ++itr)
-    h_expected.push_back((*itr).substr(start));
+  for (auto& h_string : h_strings)
+    h_expected.push_back(h_string.substr(start));
 
   cudf::test::strings_column_wrapper expected(h_expected.begin(), h_expected.end());
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
@@ -277,7 +277,7 @@ TEST_F(StringsSliceTest, Error)
   auto indexes = cudf::test::fixed_width_column_wrapper<int32_t>({1, 2});
   EXPECT_THROW(cudf::strings::slice_strings(strings_view, indexes, indexes), cudf::logic_error);
 
-  auto indexes_null = cudf::test::fixed_width_column_wrapper<int32_t>({1}, {0});
+  auto indexes_null = cudf::test::fixed_width_column_wrapper<int32_t>({1}, {false});
   EXPECT_THROW(cudf::strings::slice_strings(strings_view, indexes_null, indexes_null),
                cudf::logic_error);
 
diff --git a/cpp/tests/strings/split_tests.cpp b/cpp/tests/strings/split_tests.cpp
index 445e283ef45..d53c64ed539 100644
--- a/cpp/tests/strings/split_tests.cpp
+++ b/cpp/tests/strings/split_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -337,7 +337,8 @@ TEST_F(StringsSplitTest, MultiByteDelimiters)
 
     auto c0 = cudf::test::strings_column_wrapper({"u", "w", "y", "", "", ""});
     auto c1 = cudf::test::strings_column_wrapper({"", ":x", "", "a", ":b", ":c"});
-    auto c2 = cudf::test::strings_column_wrapper({"", "", "z", "", "", ":"}, {0, 0, 1, 0, 0, 1});
+    auto c2 = cudf::test::strings_column_wrapper({"", "", "z", "", "", ":"},
+                                                 {false, false, true, false, false, true});
     std::vector<std::unique_ptr<cudf::column>> expected_columns;
     expected_columns.push_back(c0.release());
     expected_columns.push_back(c1.release());
@@ -349,7 +350,8 @@ TEST_F(StringsSplitTest, MultiByteDelimiters)
 
     c0 = cudf::test::strings_column_wrapper({"u", "w:", "y", "", ":", ":"});
     c1 = cudf::test::strings_column_wrapper({"", "x", "", "a", "b", "c:"});
-    c2 = cudf::test::strings_column_wrapper({"", "", "z", "", "", ""}, {0, 0, 1, 0, 0, 1});
+    c2 = cudf::test::strings_column_wrapper({"", "", "z", "", "", ""},
+                                            {false, false, true, false, false, true});
     expected_columns.push_back(c0.release());
     expected_columns.push_back(c1.release());
     expected_columns.push_back(c2.release());
@@ -371,7 +373,7 @@ TEST_F(StringsSplitTest, MultiByteDelimiters)
     auto result = cudf::strings::split(view, cudf::string_scalar("}:{"));
 
     auto c0 = cudf::test::strings_column_wrapper({"{a=1", "{c=3}", ":{"});
-    auto c1 = cudf::test::strings_column_wrapper({"b=2}:", "", "}"}, {1, 0, 1});
+    auto c1 = cudf::test::strings_column_wrapper({"b=2}:", "", "}"}, {true, false, true});
     std::vector<std::unique_ptr<cudf::column>> expected_columns;
     expected_columns.push_back(c0.release());
     expected_columns.push_back(c1.release());
@@ -395,8 +397,10 @@ TEST_F(StringsSplitTest, SplitRegex)
     auto pattern = std::string("\\s+");
 
     cudf::test::strings_column_wrapper col0({"", "", "are", "tést", ""}, validity);
-    cudf::test::strings_column_wrapper col1({"Héllo", "", "some", "String", ""}, {1, 0, 1, 1, 0});
-    cudf::test::strings_column_wrapper col2({"thesé", "", "", "", ""}, {1, 0, 1, 0, 0});
+    cudf::test::strings_column_wrapper col1({"Héllo", "", "some", "String", ""},
+                                            {true, false, true, true, false});
+    cudf::test::strings_column_wrapper col2({"thesé", "", "", "", ""},
+                                            {true, false, true, false, false});
     auto expected = cudf::table_view({col0, col1, col2});
     auto prog     = cudf::strings::regex_program::create(pattern);
     auto result   = cudf::strings::split_re(sv, *prog);
@@ -412,9 +416,11 @@ TEST_F(StringsSplitTest, SplitRegex)
 
     cudf::test::strings_column_wrapper col0({" H", "", "ar", "t", ""}, validity);
     cudf::test::strings_column_wrapper col1({"llo th", "", " som", "st String", ""},
-                                            {1, 0, 1, 1, 0});
-    cudf::test::strings_column_wrapper col2({"s", "", "  ", "", ""}, {1, 0, 1, 0, 0});
-    cudf::test::strings_column_wrapper col3({"", "", "", "", ""}, {1, 0, 0, 0, 0});
+                                            {true, false, true, true, false});
+    cudf::test::strings_column_wrapper col2({"s", "", "  ", "", ""},
+                                            {true, false, true, false, false});
+    cudf::test::strings_column_wrapper col3({"", "", "", "", ""},
+                                            {true, false, false, false, false});
     auto expected = cudf::table_view({col0, col1, col2, col3});
     auto prog     = cudf::strings::regex_program::create(pattern);
     auto result   = cudf::strings::split_re(sv, *prog);
@@ -479,9 +485,10 @@ TEST_F(StringsSplitTest, SplitRegexWithMaxSplit)
   {
     auto pattern = std::string("\\s+");
 
-    cudf::test::strings_column_wrapper col0({"", "", "are", "tést", ""}, {1, 0, 1, 1, 1});
+    cudf::test::strings_column_wrapper col0({"", "", "are", "tést", ""},
+                                            {true, false, true, true, true});
     cudf::test::strings_column_wrapper col1({"Héllo\tthesé", "", "some  ", "String", ""},
-                                            {1, 0, 1, 1, 0});
+                                            {true, false, true, true, false});
     auto expected = cudf::table_view({col0, col1});
     auto prog     = cudf::strings::regex_program::create(pattern);
     auto result   = cudf::strings::split_re(sv, *prog, 1);
@@ -528,10 +535,10 @@ TEST_F(StringsSplitTest, SplitRegexWordBoundary)
     auto pattern = std::string("\\b");
 
     cudf::test::strings_column_wrapper col0({"", "", "-+", ""});
-    cudf::test::strings_column_wrapper col1({"a", "ab", "", "e"}, {1, 1, 0, 1});
-    cudf::test::strings_column_wrapper col2({"", "", "", "\n"}, {1, 1, 0, 1});
-    cudf::test::strings_column_wrapper col3({"", "", "", "é"}, {0, 0, 0, 1});
-    cudf::test::strings_column_wrapper col4({"", "", "", ""}, {0, 0, 0, 1});
+    cudf::test::strings_column_wrapper col1({"a", "ab", "", "e"}, {true, true, false, true});
+    cudf::test::strings_column_wrapper col2({"", "", "", "\n"}, {true, true, false, true});
+    cudf::test::strings_column_wrapper col3({"", "", "", "é"}, {false, false, false, true});
+    cudf::test::strings_column_wrapper col4({"", "", "", ""}, {false, false, false, true});
     auto expected = cudf::table_view({col0, col1, col2, col3, col4});
     auto prog     = cudf::strings::regex_program::create(pattern);
     auto result   = cudf::strings::split_re(sv, *prog);
@@ -652,7 +659,8 @@ TEST_F(StringsSplitTest, RSplitRegexWithMaxSplit)
 
   {
     cudf::test::strings_column_wrapper col0({" Héllo", "", "are some", "tést", ""}, validity);
-    cudf::test::strings_column_wrapper col1({"thesé", "", "", "String", ""}, {1, 0, 1, 1, 0});
+    cudf::test::strings_column_wrapper col1({"thesé", "", "", "String", ""},
+                                            {true, false, true, true, false});
     auto expected = cudf::table_view({col0, col1});
     auto result   = cudf::strings::rsplit_re(sv, *prog, 1);
     CUDF_TEST_EXPECT_TABLES_EQUIVALENT(result->view(), expected);
@@ -710,7 +718,7 @@ TEST_F(StringsSplitTest, SplitZeroSizeStringsColumns)
 // This test specifically for https://github.com/rapidsai/custrings/issues/119
 TEST_F(StringsSplitTest, AllNullsCase)
 {
-  cudf::test::strings_column_wrapper input({"", "", ""}, {0, 0, 0});
+  cudf::test::strings_column_wrapper input({"", "", ""}, {false, false, false});
   auto sv   = cudf::strings_column_view(input);
   auto prog = cudf::strings::regex_program::create("-");
 
diff --git a/cpp/tests/strings/strip_tests.cpp b/cpp/tests/strings/strip_tests.cpp
index 63179474944..b72186f5141 100644
--- a/cpp/tests/strings/strip_tests.cpp
+++ b/cpp/tests/strings/strip_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -102,7 +102,8 @@ TEST_F(StringsStripTest, EmptyStringsColumn)
 
 TEST_F(StringsStripTest, AllEmptyStrings)
 {
-  auto input = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, {1, 1, 0, 1, 1});
+  auto input =
+    cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, {true, true, false, true, true});
   auto results =
     cudf::strings::strip(cudf::strings_column_view(input), cudf::strings::side_type::BOTH);
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, input);
diff --git a/cpp/tests/strings/translate_tests.cpp b/cpp/tests/strings/translate_tests.cpp
index ab3973242c6..3672d65406d 100644
--- a/cpp/tests/strings/translate_tests.cpp
+++ b/cpp/tests/strings/translate_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -49,7 +49,7 @@ TEST_F(StringsTranslateTest, Translate)
   auto strings_view = cudf::strings_column_view(strings);
 
   std::vector<std::pair<cudf::char_utf8, cudf::char_utf8>> translate_table{
-    make_entry("b", 0), make_entry("a", "A"), make_entry("é", "E"), make_entry("e", "_")};
+    make_entry("b", nullptr), make_entry("a", "A"), make_entry("é", "E"), make_entry("e", "_")};
   auto results = cudf::strings::translate(strings_view, translate_table);
 
   std::vector<char const*> h_expected{"___ ddd", " cc", nullptr, "", "AA", "dEd"};
diff --git a/cpp/tests/structs/structs_column_tests.cpp b/cpp/tests/structs/structs_column_tests.cpp
index 8f492a930a8..df005dfa1dc 100644
--- a/cpp/tests/structs/structs_column_tests.cpp
+++ b/cpp/tests/structs/structs_column_tests.cpp
@@ -144,7 +144,7 @@ TYPED_TEST(TypedStructColumnWrapperTest, TestColumnWrapperConstruction)
   // Check child columns for exactly correct values.
   vector_of_columns expected_children;
   expected_children.emplace_back(
-    cudf::test::strings_column_wrapper{names, {1, 1, 1, 0, 1, 1}}.release());
+    cudf::test::strings_column_wrapper{names, {true, true, true, false, true, true}}.release());
   expected_children.emplace_back(cudf::test::fixed_width_column_wrapper<TypeParam, int32_t>{
     {48, 27, 25, 31, 351, 351},
     {1, 1, 1, 0, 1, 0}}.release());
diff --git a/cpp/tests/structs/utilities_tests.cpp b/cpp/tests/structs/utilities_tests.cpp
index 00f7d636530..e5ff700a242 100644
--- a/cpp/tests/structs/utilities_tests.cpp
+++ b/cpp/tests/structs/utilities_tests.cpp
@@ -325,8 +325,8 @@ TYPED_TEST(TypedStructUtilitiesTest, StructOfStructWithNullsAtAllLevels)
   auto expected_nums_col_2 =
     cudf::column(static_cast<cudf::structs_column_view>(struct_of_structs_col)
                    .get_sliced_child(0, cudf::get_default_stream()));
-  auto expected_structs_col_2 =
-    cudf::test::fixed_width_column_wrapper<bool>{{1, 1, 0, 1, 0, 1, 1}, {1, 1, 0, 1, 0, 1, 1}};
+  auto expected_structs_col_2 = cudf::test::fixed_width_column_wrapper<bool>{
+    {1, 1, 0, 1, 0, 1, 1}, {true, true, false, true, false, true, true}};
   auto expected_nums_col_3 =
     cudf::column(static_cast<cudf::structs_column_view>(struct_of_structs_col)
                    .get_sliced_child(1, cudf::get_default_stream())
diff --git a/cpp/tests/table/row_operators_tests.cpp b/cpp/tests/table/row_operators_tests.cpp
index 974e7d67658..5fa63c47cf0 100644
--- a/cpp/tests/table/row_operators_tests.cpp
+++ b/cpp/tests/table/row_operators_tests.cpp
@@ -30,8 +30,10 @@ struct RowOperatorTestForNAN : public cudf::test::BaseFixture {};
 
 TEST_F(RowOperatorTestForNAN, NANEquality)
 {
-  cudf::test::fixed_width_column_wrapper<double> col1{{1., double(NAN), 3., 4.}, {1, 1, 0, 1}};
-  cudf::test::fixed_width_column_wrapper<double> col2{{1., double(NAN), 3., 4.}, {1, 1, 0, 1}};
+  cudf::test::fixed_width_column_wrapper<double> col1{{1., double(NAN), 3., 4.},
+                                                      {true, true, false, true}};
+  cudf::test::fixed_width_column_wrapper<double> col2{{1., double(NAN), 3., 4.},
+                                                      {true, true, false, true}};
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(col1, col2);
 }
@@ -47,7 +49,7 @@ TEST_F(RowOperatorTestForNAN, NANSorting)
      std::numeric_limits<double>::infinity(),
      1.,
      -1 * std::numeric_limits<double>::infinity()},
-    {1, 1, 1, 0, 1, 1, 1, 1}};
+    {true, true, true, false, true, true, true, true}};
   cudf::test::fixed_width_column_wrapper<int32_t> expected1{{3, 6, 2, 0, 5, 4, 1}};
   std::vector<cudf::order> column_order{cudf::order::ASCENDING};
   std::vector<cudf::null_order> null_precedence_1{cudf::null_order::BEFORE};
diff --git a/cpp/tests/text/bpe_tests.cpp b/cpp/tests/text/bpe_tests.cpp
index b03df12c5ed..3b08439612b 100644
--- a/cpp/tests/text/bpe_tests.cpp
+++ b/cpp/tests/text/bpe_tests.cpp
@@ -129,6 +129,6 @@ TEST_F(TextBytePairEncoding, BPE_Error)
 {
   auto empty = cudf::make_empty_column(cudf::type_id::STRING);
   EXPECT_THROW(nvtext::load_merge_pairs(cudf::strings_column_view(*empty)), cudf::logic_error);
-  auto null_pairs = cudf::test::strings_column_wrapper({"", ""}, {1, 0});
+  auto null_pairs = cudf::test::strings_column_wrapper({"", ""}, {true, false});
   EXPECT_THROW(nvtext::load_merge_pairs(cudf::strings_column_view(null_pairs)), cudf::logic_error);
 }
diff --git a/cpp/tests/text/jaccard_tests.cpp b/cpp/tests/text/jaccard_tests.cpp
index a0aee594609..91ebb644f83 100644
--- a/cpp/tests/text/jaccard_tests.cpp
+++ b/cpp/tests/text/jaccard_tests.cpp
@@ -48,21 +48,22 @@ TEST_F(JaccardTest, Basic)
 
 TEST_F(JaccardTest, WithNulls)
 {
-  auto input1 =
-    cudf::test::strings_column_wrapper({"brown fox", "jumps over dog", "", ""}, {1, 1, 0, 1});
-  auto input2 =
-    cudf::test::strings_column_wrapper({"brown cat", "jumps on fox", "", ""}, {1, 1, 1, 0});
+  auto input1 = cudf::test::strings_column_wrapper({"brown fox", "jumps over dog", "", ""},
+                                                   {true, true, false, true});
+  auto input2 = cudf::test::strings_column_wrapper({"brown cat", "jumps on fox", "", ""},
+                                                   {true, true, true, false});
 
   auto view1 = cudf::strings_column_view(input1);
   auto view2 = cudf::strings_column_view(input2);
 
   auto results = nvtext::jaccard_index(view1, view2, 5);
 
-  auto expected =
-    cudf::test::fixed_width_column_wrapper<float>({0.25f, 0.200000003f, 0.f, 0.f}, {1, 1, 0, 0});
+  auto expected = cudf::test::fixed_width_column_wrapper<float>({0.25f, 0.200000003f, 0.f, 0.f},
+                                                                {true, true, false, false});
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
 
-  expected = cudf::test::fixed_width_column_wrapper<float>({1.0f, 1.0f, 0.f, 0.f}, {1, 1, 0, 1});
+  expected = cudf::test::fixed_width_column_wrapper<float>({1.0f, 1.0f, 0.f, 0.f},
+                                                           {true, true, false, true});
   results  = nvtext::jaccard_index(view1, view1, 7);
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
 }
diff --git a/cpp/tests/text/normalize_tests.cpp b/cpp/tests/text/normalize_tests.cpp
index bf619bf49bc..b0d41004e7e 100644
--- a/cpp/tests/text/normalize_tests.cpp
+++ b/cpp/tests/text/normalize_tests.cpp
@@ -79,7 +79,7 @@ TEST_F(TextNormalizeTest, NormalizeEmptyTest)
 
 TEST_F(TextNormalizeTest, AllNullStrings)
 {
-  cudf::test::strings_column_wrapper strings({"", "", ""}, {0, 0, 0});
+  cudf::test::strings_column_wrapper strings({"", "", ""}, {false, false, false});
   cudf::strings_column_view strings_view(strings);
   auto results = nvtext::normalize_spaces(strings_view);
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, strings);
@@ -89,10 +89,10 @@ TEST_F(TextNormalizeTest, AllNullStrings)
 
 TEST_F(TextNormalizeTest, SomeNullStrings)
 {
-  cudf::test::strings_column_wrapper strings({"", ".", "a"}, {0, 1, 1});
+  cudf::test::strings_column_wrapper strings({"", ".", "a"}, {false, true, true});
   cudf::strings_column_view strings_view(strings);
   auto results = nvtext::normalize_characters(strings_view, false);
-  cudf::test::strings_column_wrapper expected({"", " . ", "a"}, {0, 1, 1});
+  cudf::test::strings_column_wrapper expected({"", " . ", "a"}, {false, true, true});
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
 }
 
diff --git a/cpp/tests/text/replace_tests.cpp b/cpp/tests/text/replace_tests.cpp
index faced4a14d3..fadeb690df7 100644
--- a/cpp/tests/text/replace_tests.cpp
+++ b/cpp/tests/text/replace_tests.cpp
@@ -116,7 +116,7 @@ TEST_F(TextReplaceTest, ReplaceTokensErrorTest)
   cudf::strings_column_view strings_view(strings->view());
   cudf::test::strings_column_wrapper notnulls({"", "", ""});
   cudf::strings_column_view notnulls_view(notnulls);
-  cudf::test::strings_column_wrapper nulls({"", ""}, {0, 0});
+  cudf::test::strings_column_wrapper nulls({"", ""}, {false, false});
   cudf::strings_column_view nulls_view(nulls);
 
   EXPECT_THROW(nvtext::replace_tokens(strings_view, nulls_view, notnulls_view), cudf::logic_error);
diff --git a/cpp/tests/text/stemmer_tests.cpp b/cpp/tests/text/stemmer_tests.cpp
index bbc145e0fe7..a343913411c 100644
--- a/cpp/tests/text/stemmer_tests.cpp
+++ b/cpp/tests/text/stemmer_tests.cpp
@@ -168,7 +168,7 @@ TEST_F(TextStemmerTest, EmptyTest)
 TEST_F(TextStemmerTest, ErrorTest)
 {
   auto empty = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});
-  cudf::test::fixed_width_column_wrapper<int32_t> indices({0}, {0});
+  cudf::test::fixed_width_column_wrapper<int32_t> indices({0}, {false});
   EXPECT_THROW(nvtext::is_letter(
                  cudf::strings_column_view(empty->view()), nvtext::letter_type::VOWEL, indices),
                cudf::logic_error);
diff --git a/cpp/tests/text/subword_tests.cpp b/cpp/tests/text/subword_tests.cpp
index 5a347e5fe68..a615780c02a 100644
--- a/cpp/tests/text/subword_tests.cpp
+++ b/cpp/tests/text/subword_tests.cpp
@@ -253,7 +253,7 @@ TEST(TextSubwordTest, EmptyStrings)
 
 TEST(TextSubwordTest, AllNullStrings)
 {
-  cudf::test::strings_column_wrapper strings({"", "", ""}, {0, 0, 0});
+  cudf::test::strings_column_wrapper strings({"", "", ""}, {false, false, false});
   std::string hash_file = temp_env->get_temp_filepath("hashed_vocab.txt");
   create_hashed_vocab(hash_file);
   auto vocab  = nvtext::load_vocabulary_file(hash_file);
diff --git a/cpp/tests/text/tokenize_tests.cpp b/cpp/tests/text/tokenize_tests.cpp
index a59a54169d7..f9ca343eaac 100644
--- a/cpp/tests/text/tokenize_tests.cpp
+++ b/cpp/tests/text/tokenize_tests.cpp
@@ -102,7 +102,7 @@ TEST_F(TextTokenizeTest, TokenizeErrorTest)
     EXPECT_THROW(nvtext::count_tokens(strings_view, delimiters_view), cudf::logic_error);
   }
   {
-    cudf::test::strings_column_wrapper delimiters({"", ""}, {0, 0});  // null delimiters
+    cudf::test::strings_column_wrapper delimiters({"", ""}, {false, false});  // null delimiters
     cudf::strings_column_view delimiters_view(delimiters);
     EXPECT_THROW(nvtext::tokenize(strings_view, delimiters_view), cudf::logic_error);
     EXPECT_THROW(nvtext::count_tokens(strings_view, delimiters_view), cudf::logic_error);
@@ -127,7 +127,7 @@ TEST_F(TextTokenizeTest, TokenizeEmptyTest)
   auto view  = cudf::strings_column_view(input->view());
   cudf::test::strings_column_wrapper all_empty_wrapper({"", "", ""});
   auto all_empty = cudf::strings_column_view(all_empty_wrapper);
-  cudf::test::strings_column_wrapper all_null_wrapper({"", "", ""}, {0, 0, 0});
+  cudf::test::strings_column_wrapper all_null_wrapper({"", "", ""}, {false, false, false});
   auto all_null = cudf::strings_column_view(all_null_wrapper);
   cudf::test::fixed_width_column_wrapper<cudf::size_type> expected({0, 0, 0});
 
@@ -278,7 +278,7 @@ TEST_F(TextTokenizeTest, TokenizeErrors)
   cudf::strings_column_view view(empty);
   EXPECT_THROW(nvtext::load_vocabulary(view), cudf::logic_error);
 
-  cudf::test::strings_column_wrapper vocab_nulls({""}, {0});
+  cudf::test::strings_column_wrapper vocab_nulls({""}, {false});
   cudf::strings_column_view nulls(vocab_nulls);
   EXPECT_THROW(nvtext::load_vocabulary(nulls), cudf::logic_error);
 
diff --git a/cpp/tests/transform/nans_to_null_test.cpp b/cpp/tests/transform/nans_to_null_test.cpp
index 5dcfe18b7a0..ba16c100e7a 100644
--- a/cpp/tests/transform/nans_to_null_test.cpp
+++ b/cpp/tests/transform/nans_to_null_test.cpp
@@ -70,7 +70,7 @@ TYPED_TEST(NaNsToNullTest, WithMask)
   using T = TypeParam;
 
   std::vector<T> input   = {1, NAN, 3, NAN, 5, NAN};
-  std::vector<bool> mask = {1, 1, 1, 1, 0, 0};
+  std::vector<bool> mask = {true, true, true, true, false, false};
   auto input_column =
     cudf::test::fixed_width_column_wrapper<T>(input.begin(), input.end(), mask.begin());
   auto expected_column = this->create_expected(input, mask);
@@ -92,7 +92,7 @@ TYPED_TEST(NaNsToNullTest, NoNANWithMask)
   using T = TypeParam;
 
   std::vector<T> input   = {1, 2, 3, 4, 5, 6};
-  std::vector<bool> mask = {1, 1, 1, 1, 0, 0};
+  std::vector<bool> mask = {true, true, true, true, false, false};
   auto input_column =
     cudf::test::fixed_width_column_wrapper<T>(input.begin(), input.end(), mask.begin());
   auto expected_column = this->create_expected(input, mask);
diff --git a/cpp/tests/transform/one_hot_encode_tests.cpp b/cpp/tests/transform/one_hot_encode_tests.cpp
index 8384cb3480b..ae2c3bc0c0c 100644
--- a/cpp/tests/transform/one_hot_encode_tests.cpp
+++ b/cpp/tests/transform/one_hot_encode_tests.cpp
@@ -57,8 +57,9 @@ TYPED_TEST(OneHotEncodingTestTyped, Basic)
 
 TYPED_TEST(OneHotEncodingTestTyped, Nulls)
 {
-  auto input    = cudf::test::fixed_width_column_wrapper<int32_t>{{8, 8, 8, 9, 9}, {1, 1, 0, 1, 1}};
-  auto category = cudf::test::fixed_width_column_wrapper<int32_t>({8, 9, -1}, {1, 1, 0});
+  auto input    = cudf::test::fixed_width_column_wrapper<int32_t>{{8, 8, 8, 9, 9},
+                                                                  {true, true, false, true, true}};
+  auto category = cudf::test::fixed_width_column_wrapper<int32_t>({8, 9, -1}, {true, true, false});
 
   auto col0 = cudf::test::fixed_width_column_wrapper<bool>{1, 1, 0, 0, 0};
   auto col1 = cudf::test::fixed_width_column_wrapper<bool>{0, 0, 0, 1, 1};
@@ -164,8 +165,8 @@ TEST_F(OneHotEncodingTest, Strings)
 {
   auto input = cudf::test::strings_column_wrapper{
     {"hello", "rapidsai", "cudf", "hello", "cuspatial", "hello", "world", "!"},
-    {1, 1, 1, 1, 0, 1, 1, 0}};
-  auto category = cudf::test::strings_column_wrapper{{"hello", "world", ""}, {1, 1, 0}};
+    {true, true, true, true, false, true, true, false}};
+  auto category = cudf::test::strings_column_wrapper{{"hello", "world", ""}, {true, true, false}};
 
   auto col0 = cudf::test::fixed_width_column_wrapper<bool>{1, 0, 0, 1, 0, 1, 0, 0};
   auto col1 = cudf::test::fixed_width_column_wrapper<bool>{0, 0, 0, 0, 0, 0, 1, 0};
diff --git a/cpp/tests/unary/cast_tests.cpp b/cpp/tests/unary/cast_tests.cpp
index ebeafc82039..45b89b76070 100644
--- a/cpp/tests/unary/cast_tests.cpp
+++ b/cpp/tests/unary/cast_tests.cpp
@@ -1036,8 +1036,9 @@ TYPED_TEST(FixedPointTests, Decimal32ToDecimalXXWithLargerScaleAndNullMask)
   using fp_wrapperFrom = cudf::test::fixed_point_column_wrapper<RepTypeFrom>;
   using fp_wrapperTo   = cudf::test::fixed_point_column_wrapper<RepTypeTo>;
 
-  auto const vec      = std::vector{1729, 17290, 172900, 1729000};
-  auto const input    = fp_wrapperFrom{vec.cbegin(), vec.cend(), {1, 1, 1, 0}, scale_type{-3}};
+  auto const vec = std::vector{1729, 17290, 172900, 1729000};
+  auto const input =
+    fp_wrapperFrom{vec.cbegin(), vec.cend(), {true, true, true, false}, scale_type{-3}};
   auto const expected = fp_wrapperTo{{1, 17, 172, 1729000}, {1, 1, 1, 0}, scale_type{0}};
   auto const result   = cudf::cast(input, make_fixed_point_data_type<decimalXX>(0));
 
@@ -1053,8 +1054,9 @@ TYPED_TEST(FixedPointTests, Decimal64ToDecimalXXWithLargerScaleAndNullMask)
   using fp_wrapperFrom = cudf::test::fixed_point_column_wrapper<RepTypeFrom>;
   using fp_wrapperTo   = cudf::test::fixed_point_column_wrapper<RepTypeTo>;
 
-  auto const vec      = std::vector{1729, 17290, 172900, 1729000};
-  auto const input    = fp_wrapperFrom{vec.cbegin(), vec.cend(), {1, 1, 1, 0}, scale_type{-3}};
+  auto const vec = std::vector{1729, 17290, 172900, 1729000};
+  auto const input =
+    fp_wrapperFrom{vec.cbegin(), vec.cend(), {true, true, true, false}, scale_type{-3}};
   auto const expected = fp_wrapperTo{{1, 17, 172, 1729000}, {1, 1, 1, 0}, scale_type{0}};
   auto const result   = cudf::cast(input, make_fixed_point_data_type<decimalXX>(0));
 
@@ -1070,8 +1072,9 @@ TYPED_TEST(FixedPointTests, Decimal128ToDecimalXXWithLargerScaleAndNullMask)
   using fp_wrapperFrom = cudf::test::fixed_point_column_wrapper<RepTypeFrom>;
   using fp_wrapperTo   = cudf::test::fixed_point_column_wrapper<RepTypeTo>;
 
-  auto const vec      = std::vector{1729, 17290, 172900, 1729000};
-  auto const input    = fp_wrapperFrom{vec.cbegin(), vec.cend(), {1, 1, 1, 0}, scale_type{-3}};
+  auto const vec = std::vector{1729, 17290, 172900, 1729000};
+  auto const input =
+    fp_wrapperFrom{vec.cbegin(), vec.cend(), {true, true, true, false}, scale_type{-3}};
   auto const expected = fp_wrapperTo{{1, 17, 172, 1729000}, {1, 1, 1, 0}, scale_type{0}};
   auto const result   = cudf::cast(input, make_fixed_point_data_type<decimalXX>(0));
 
diff --git a/cpp/tests/unary/math_ops_test.cpp b/cpp/tests/unary/math_ops_test.cpp
index acbf0732522..5bfbf70d5f9 100644
--- a/cpp/tests/unary/math_ops_test.cpp
+++ b/cpp/tests/unary/math_ops_test.cpp
@@ -69,7 +69,8 @@ TYPED_TEST(UnaryLogicalOpsTest, SimpleLogicalNot)
 TYPED_TEST(UnaryLogicalOpsTest, SimpleLogicalNotWithNullMask)
 {
   cudf::test::fixed_width_column_wrapper<TypeParam> input{{true, true, true, true}, {1, 0, 1, 1}};
-  cudf::test::fixed_width_column_wrapper<bool> expected{{false, true, false, false}, {1, 0, 1, 1}};
+  cudf::test::fixed_width_column_wrapper<bool> expected{{false, true, false, false},
+                                                        {true, false, true, true}};
   auto output = cudf::unary_operation(input, cudf::unary_operator::NOT);
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, output->view());
   auto encoded = cudf::dictionary::encode(input);
diff --git a/cpp/tests/utilities/column_utilities.cu b/cpp/tests/utilities/column_utilities.cu
index 7cc2777972e..fb9bdeb0b22 100644
--- a/cpp/tests/utilities/column_utilities.cu
+++ b/cpp/tests/utilities/column_utilities.cu
@@ -777,7 +777,7 @@ struct column_comparator {
 
 void check_non_empty_nulls(column_view const& lhs, column_view const& rhs)
 {
-  auto check_column_nulls = [](column_view const& col, const char* col_name) {
+  auto check_column_nulls = [](column_view const& col, char const* col_name) {
     if (cudf::detail::has_nonempty_nulls(col, cudf::get_default_stream())) {
       throw std::invalid_argument(col_name + std::string(" column has non-empty nulls"));
     }
diff --git a/cpp/tests/utilities/identify_stream_usage.cpp b/cpp/tests/utilities/identify_stream_usage.cpp
index 5628f7966c3..5100c066883 100644
--- a/cpp/tests/utilities/identify_stream_usage.cpp
+++ b/cpp/tests/utilities/identify_stream_usage.cpp
@@ -92,7 +92,7 @@ class test_cuda_stream_pool : public cuda_stream_pool {
     return std::vector<rmm::cuda_stream_view>(count, cudf::test::get_default_stream());
   }
 
-  std::size_t get_stream_pool_size() const override { return 1UL; }
+  [[nodiscard]] std::size_t get_stream_pool_size() const override { return 1UL; }
 };
 
 cuda_stream_pool* create_global_cuda_stream_pool() { return new test_cuda_stream_pool(); }
diff --git a/cpp/tests/utilities_tests/logger_tests.cpp b/cpp/tests/utilities_tests/logger_tests.cpp
index 9d44e9d8247..d052e20eedb 100644
--- a/cpp/tests/utilities_tests/logger_tests.cpp
+++ b/cpp/tests/utilities_tests/logger_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -34,7 +34,7 @@ class LoggerTest : public cudf::test::BaseFixture {
     cudf::logger().set_formatter(
       std::unique_ptr<spdlog::formatter>(new spdlog::pattern_formatter("%v")));
   }
-  ~LoggerTest()
+  ~LoggerTest() override
   {
     cudf::logger().set_level(prev_level);
     cudf::logger().sinks() = prev_sinks;
diff --git a/cpp/tests/utilities_tests/type_list_tests.cpp b/cpp/tests/utilities_tests/type_list_tests.cpp
index d0b10b774eb..849457056e4 100644
--- a/cpp/tests/utilities_tests/type_list_tests.cpp
+++ b/cpp/tests/utilities_tests/type_list_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -61,7 +61,7 @@ std::string type_name()
 {
   int status;
   char* realname;
-  realname = abi::__cxa_demangle(typeid(T).name(), 0, 0, &status);
+  realname = abi::__cxa_demangle(typeid(T).name(), nullptr, nullptr, &status);
   std::string name{realname};
   free(realname);
   return name;
@@ -69,10 +69,10 @@ std::string type_name()
 
 TEST(TypeList, GetSize)
 {
-  static_assert(GetSize<Types<>> == 0, "");
-  static_assert(GetSize<Types<int>> == 1, "");
-  static_assert(GetSize<Types<int, int>> == 2, "");
-  static_assert(GetSize<Types<int, void>> == 2, "");
+  static_assert(GetSize<Types<>> == 0);
+  static_assert(GetSize<Types<int>> == 1);
+  static_assert(GetSize<Types<int, int>> == 2);
+  static_assert(GetSize<Types<int, void>> == 2);
 }
 
 TEST(TypeList, GetType)
@@ -149,39 +149,39 @@ TEST(TypeList, CrossProduct)
 
 TEST(TypeList, AllSame)
 {
-  static_assert(AllSame::Call<Types<int, int>>::value, "");
-  static_assert(AllSame::Call<Types<int, int>>::value, "");
-  static_assert(!AllSame::Call<Types<bool, int>>::value, "");
+  static_assert(AllSame::Call<Types<int, int>>::value);
+  static_assert(AllSame::Call<Types<int, int>>::value);
+  static_assert(!AllSame::Call<Types<bool, int>>::value);
 
-  static_assert(AllSame::Call<int, int>::value, "");
-  static_assert(!AllSame::Call<int, bool>::value, "");
+  static_assert(AllSame::Call<int, int>::value);
+  static_assert(!AllSame::Call<int, bool>::value);
 
-  static_assert(AllSame::Call<int, int, int>::value, "");
-  static_assert(!AllSame::Call<int, float, int>::value, "");
-  static_assert(!AllSame::Call<int, int, float>::value, "");
+  static_assert(AllSame::Call<int, int, int>::value);
+  static_assert(!AllSame::Call<int, float, int>::value);
+  static_assert(!AllSame::Call<int, int, float>::value);
 }
 
 TEST(TypeList, Exists)
 {
-  static_assert(Exists<int, Types<int, char, float>>, "");
-  static_assert(!Exists<int, Types<double, char, float>>, "");
-  static_assert(!Exists<int, Types<>>, "");
-  static_assert(Exists<int, Types<double, char, float, int>>, "");
-  static_assert(!Exists<int, Types<double>>, "");
-  static_assert(Exists<int, Types<int>>, "");
+  static_assert(Exists<int, Types<int, char, float>>);
+  static_assert(!Exists<int, Types<double, char, float>>);
+  static_assert(!Exists<int, Types<>>);
+  static_assert(Exists<int, Types<double, char, float, int>>);
+  static_assert(!Exists<int, Types<double>>);
+  static_assert(Exists<int, Types<int>>);
 }
 
 TEST(TypeList, ContainedIn)
 {
-  static_assert(ContainedIn<Types<Types<int, char>>>::Call<Types<int, char>>::value, "");
-  static_assert(!ContainedIn<Types<Types<int, char>>>::Call<Types<int, float>>::value, "");
-  static_assert(!ContainedIn<Types<>>::Call<Types<int, float>>::value, "");
+  static_assert(ContainedIn<Types<Types<int, char>>>::Call<Types<int, char>>::value);
+  static_assert(!ContainedIn<Types<Types<int, char>>>::Call<Types<int, float>>::value);
+  static_assert(!ContainedIn<Types<>>::Call<Types<int, float>>::value);
   static_assert(
-    ContainedIn<Types<Types<int, float>, Types<char, char>>>::Call<Types<int, float>>::value, "");
+    ContainedIn<Types<Types<int, float>, Types<char, char>>>::Call<Types<int, float>>::value);
   static_assert(
-    !ContainedIn<Types<Types<int, float>, Types<char, char>>>::Call<Types<int, double>>::value, "");
-  static_assert(ContainedIn<Types<Types<int, float>, Types<>>>::Call<Types<>>::value, "");
-  static_assert(!ContainedIn<Types<Types<int, float>, Types<int>>>::Call<Types<>>::value, "");
+    !ContainedIn<Types<Types<int, float>, Types<char, char>>>::Call<Types<int, double>>::value);
+  static_assert(ContainedIn<Types<Types<int, float>, Types<>>>::Call<Types<>>::value);
+  static_assert(!ContainedIn<Types<Types<int, float>, Types<int>>>::Call<Types<>>::value);
 }
 
 TEST(TypeList, RemoveIf)
diff --git a/java/src/main/native/include/jni_utils.hpp b/java/src/main/native/include/jni_utils.hpp
index 96ad1f23b8c..ea04c1cda83 100644
--- a/java/src/main/native/include/jni_utils.hpp
+++ b/java/src/main/native/include/jni_utils.hpp
@@ -53,7 +53,7 @@ class jni_exception : public std::runtime_error {
 /**
  * @brief throw a java exception and a C++ one for flow control.
  */
-inline void throw_java_exception(JNIEnv* const env, const char* class_name, const char* message)
+inline void throw_java_exception(JNIEnv* const env, char const* class_name, char const* message)
 {
   jclass ex_class = env->FindClass(class_name);
   if (ex_class != NULL) { env->ThrowNew(ex_class, message); }
@@ -258,7 +258,7 @@ class native_jArray {
     check_java_exception(env);
   }
 
-  native_jArray(JNIEnv* const env, const std::vector<N_TYPE>& arr)
+  native_jArray(JNIEnv* const env, std::vector<N_TYPE> const& arr)
     : env(env), orig(access.newArray(env, arr.size())), len(arr.size()), data_ptr(NULL)
   {
     check_java_exception(env);
@@ -485,7 +485,7 @@ class unique_jpointerArray {
   {
   }
 
-  unique_jpointerArray(JNIEnv* const env, jlongArray orig, const D& del)
+  unique_jpointerArray(JNIEnv* const env, jlongArray orig, D const& del)
     : wrapped(new native_jpointerArray<T>(env, orig)), del(del)
   {
   }
@@ -494,7 +494,7 @@ class unique_jpointerArray {
   {
   }
 
-  unique_jpointerArray(JNIEnv* const env, int len, const D& del)
+  unique_jpointerArray(JNIEnv* const env, int len, D const& del)
     : wrapped(new native_jpointerArray<T>(env, len)), del(del)
   {
   }
@@ -504,7 +504,7 @@ class unique_jpointerArray {
   {
   }
 
-  unique_jpointerArray(JNIEnv* const env, T* arr, int len, const D& del)
+  unique_jpointerArray(JNIEnv* const env, T* arr, int len, D const& del)
     : wrapped(new native_jpointerArray<T>(env, arr, len)), del(del)
   {
   }
@@ -561,7 +561,7 @@ class native_jstring {
  private:
   JNIEnv* env;
   jstring orig;
-  mutable const char* cstr;
+  mutable char const* cstr;
   mutable size_t cstr_length;
 
   void init_cstr() const
@@ -600,7 +600,7 @@ class native_jstring {
 
   bool is_null() const noexcept { return orig == NULL; }
 
-  const char* get() const
+  char const* get() const
   {
     init_cstr();
     return cstr;
@@ -665,7 +665,7 @@ class native_jobjectArray {
     return ret;
   }
 
-  void set(int index, const T& val)
+  void set(int index, T const& val)
   {
     if (orig == NULL) { throw_java_exception(env, NPE_CLASS, "jobjectArray pointer is NULL"); }
     env->SetObjectArrayElement(orig, index, val);
@@ -685,7 +685,7 @@ class native_jstringArray {
   native_jobjectArray<jstring> arr;
   mutable std::vector<native_jstring> cache;
   mutable std::vector<std::string> cpp_cache;
-  mutable std::vector<const char*> c_cache;
+  mutable std::vector<char const*> c_cache;
 
   void init_cache() const
   {
@@ -753,7 +753,7 @@ class native_jstringArray {
     return cache[index];
   }
 
-  const char** const as_c_array() const
+  char const** const as_c_array() const
   {
     init_c_cache();
     return c_cache.data();
@@ -771,13 +771,13 @@ class native_jstringArray {
     update_caches(index, val);
   }
 
-  void set(int index, const native_jstring& val)
+  void set(int index, native_jstring const& val)
   {
     arr.set(index, val.get_jstring());
     update_caches(index, val.get_jstring());
   }
 
-  void set(int index, const char* val)
+  void set(int index, char const* val)
   {
     jstring str = env->NewStringUTF(val);
     check_java_exception(env);
@@ -791,7 +791,7 @@ class native_jstringArray {
  */
 inline jthrowable cuda_exception(JNIEnv* const env, cudaError_t status, jthrowable cause = NULL)
 {
-  const char* ex_class_name;
+  char const* ex_class_name;
 
   // Calls cudaGetLastError twice. It is nearly certain that a fatal error occurred if the second
   // call doesn't return with cudaSuccess.
diff --git a/java/src/main/native/src/ColumnVectorJni.cpp b/java/src/main/native/src/ColumnVectorJni.cpp
index 30a04e37d2c..cdc5aa41abe 100644
--- a/java/src/main/native/src/ColumnVectorJni.cpp
+++ b/java/src/main/native/src/ColumnVectorJni.cpp
@@ -109,10 +109,10 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_fromArrow(JNIEnv* env,
       offsets_length  = env->GetDirectBufferCapacity(j_offsets_obj);
     }
     auto data_buffer =
-      arrow::Buffer::Wrap(static_cast<const char*>(data_address), static_cast<int>(data_length));
-    auto null_buffer    = arrow::Buffer::Wrap(static_cast<const char*>(validity_address),
+      arrow::Buffer::Wrap(static_cast<char const*>(data_address), static_cast<int>(data_length));
+    auto null_buffer    = arrow::Buffer::Wrap(static_cast<char const*>(validity_address),
                                            static_cast<int>(validity_length));
-    auto offsets_buffer = arrow::Buffer::Wrap(static_cast<const char*>(offsets_address),
+    auto offsets_buffer = arrow::Buffer::Wrap(static_cast<char const*>(offsets_address),
                                               static_cast<int>(offsets_length));
 
     std::shared_ptr<arrow::Array> arrow_array;
@@ -171,8 +171,8 @@ Java_ai_rapids_cudf_ColumnVector_stringConcatenation(JNIEnv* env,
   JNI_NULL_CHECK(env, narep, "narep string scalar object is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    const auto& separator_scalar = *reinterpret_cast<cudf::string_scalar*>(separator);
-    const auto& narep_scalar     = *reinterpret_cast<cudf::string_scalar*>(narep);
+    auto const& separator_scalar = *reinterpret_cast<cudf::string_scalar*>(separator);
+    auto const& narep_scalar     = *reinterpret_cast<cudf::string_scalar*>(narep);
     auto null_policy             = separate_nulls ? cudf::strings::separator_on_nulls::YES
                                                   : cudf::strings::separator_on_nulls::NO;
 
@@ -199,8 +199,8 @@ Java_ai_rapids_cudf_ColumnVector_stringConcatenationSepCol(JNIEnv* env,
   JNI_NULL_CHECK(env, col_narep, "column narep string scalar object is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    const auto& separator_narep_scalar = *reinterpret_cast<cudf::string_scalar*>(separator_narep);
-    const auto& col_narep_scalar       = *reinterpret_cast<cudf::string_scalar*>(col_narep);
+    auto const& separator_narep_scalar = *reinterpret_cast<cudf::string_scalar*>(separator_narep);
+    auto const& col_narep_scalar       = *reinterpret_cast<cudf::string_scalar*>(col_narep);
     auto null_policy                   = separate_nulls ? cudf::strings::separator_on_nulls::YES
                                                         : cudf::strings::separator_on_nulls::NO;
 
diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp
index 8487fb6dc91..4551325ebb1 100644
--- a/java/src/main/native/src/ColumnViewJni.cpp
+++ b/java/src/main/native/src/ColumnViewJni.cpp
@@ -1026,7 +1026,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_isNullNative(JNIEnv* env,
   JNI_NULL_CHECK(env, handle, "input column is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    const cudf::column_view* input = reinterpret_cast<cudf::column_view*>(handle);
+    cudf::column_view const* input = reinterpret_cast<cudf::column_view*>(handle);
     return release_as_jlong(cudf::is_null(*input));
   }
   CATCH_STD(env, 0);
@@ -1039,7 +1039,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_isNotNullNative(JNIEnv* e
   JNI_NULL_CHECK(env, handle, "input column is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    const cudf::column_view* input = reinterpret_cast<cudf::column_view*>(handle);
+    cudf::column_view const* input = reinterpret_cast<cudf::column_view*>(handle);
     return release_as_jlong(cudf::is_valid(*input));
   }
   CATCH_STD(env, 0);
@@ -1052,7 +1052,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_isNanNative(JNIEnv* env,
   JNI_NULL_CHECK(env, handle, "input column is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    const cudf::column_view* input = reinterpret_cast<cudf::column_view*>(handle);
+    cudf::column_view const* input = reinterpret_cast<cudf::column_view*>(handle);
     return release_as_jlong(cudf::is_nan(*input));
   }
   CATCH_STD(env, 0);
@@ -1065,7 +1065,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_isNotNanNative(JNIEnv* en
   JNI_NULL_CHECK(env, handle, "input column is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    const cudf::column_view* input = reinterpret_cast<cudf::column_view*>(handle);
+    cudf::column_view const* input = reinterpret_cast<cudf::column_view*>(handle);
     return release_as_jlong(cudf::is_not_nan(*input));
   }
   CATCH_STD(env, 0);
@@ -1104,7 +1104,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_year(JNIEnv* env, jclass,
   JNI_NULL_CHECK(env, input_ptr, "input is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    const cudf::column_view* input = reinterpret_cast<cudf::column_view*>(input_ptr);
+    cudf::column_view const* input = reinterpret_cast<cudf::column_view*>(input_ptr);
     return release_as_jlong(cudf::datetime::extract_year(*input));
   }
   CATCH_STD(env, 0);
@@ -1115,7 +1115,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_month(JNIEnv* env, jclass
   JNI_NULL_CHECK(env, input_ptr, "input is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    const cudf::column_view* input = reinterpret_cast<cudf::column_view*>(input_ptr);
+    cudf::column_view const* input = reinterpret_cast<cudf::column_view*>(input_ptr);
     return release_as_jlong(cudf::datetime::extract_month(*input));
   }
   CATCH_STD(env, 0);
@@ -1126,7 +1126,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_day(JNIEnv* env, jclass,
   JNI_NULL_CHECK(env, input_ptr, "input is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    const cudf::column_view* input = reinterpret_cast<cudf::column_view*>(input_ptr);
+    cudf::column_view const* input = reinterpret_cast<cudf::column_view*>(input_ptr);
     return release_as_jlong(cudf::datetime::extract_day(*input));
   }
   CATCH_STD(env, 0);
@@ -1137,7 +1137,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_hour(JNIEnv* env, jclass,
   JNI_NULL_CHECK(env, input_ptr, "input is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    const cudf::column_view* input = reinterpret_cast<cudf::column_view*>(input_ptr);
+    cudf::column_view const* input = reinterpret_cast<cudf::column_view*>(input_ptr);
     return release_as_jlong(cudf::datetime::extract_hour(*input));
   }
   CATCH_STD(env, 0);
@@ -1148,7 +1148,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_minute(JNIEnv* env, jclas
   JNI_NULL_CHECK(env, input_ptr, "input is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    const cudf::column_view* input = reinterpret_cast<cudf::column_view*>(input_ptr);
+    cudf::column_view const* input = reinterpret_cast<cudf::column_view*>(input_ptr);
     return release_as_jlong(cudf::datetime::extract_minute(*input));
   }
   CATCH_STD(env, 0);
@@ -1159,7 +1159,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_second(JNIEnv* env, jclas
   JNI_NULL_CHECK(env, input_ptr, "input is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    const cudf::column_view* input = reinterpret_cast<cudf::column_view*>(input_ptr);
+    cudf::column_view const* input = reinterpret_cast<cudf::column_view*>(input_ptr);
     return release_as_jlong(cudf::datetime::extract_second(*input));
   }
   CATCH_STD(env, 0);
@@ -1170,7 +1170,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_weekDay(JNIEnv* env, jcla
   JNI_NULL_CHECK(env, input_ptr, "input is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    const cudf::column_view* input = reinterpret_cast<cudf::column_view*>(input_ptr);
+    cudf::column_view const* input = reinterpret_cast<cudf::column_view*>(input_ptr);
     return release_as_jlong(cudf::datetime::extract_weekday(*input));
   }
   CATCH_STD(env, 0);
@@ -1183,7 +1183,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_lastDayOfMonth(JNIEnv* en
   JNI_NULL_CHECK(env, input_ptr, "input is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    const cudf::column_view* input = reinterpret_cast<cudf::column_view*>(input_ptr);
+    cudf::column_view const* input = reinterpret_cast<cudf::column_view*>(input_ptr);
     return release_as_jlong(cudf::datetime::last_day_of_month(*input));
   }
   CATCH_STD(env, 0);
@@ -1196,7 +1196,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_dayOfYear(JNIEnv* env,
   JNI_NULL_CHECK(env, input_ptr, "input is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    const cudf::column_view* input = reinterpret_cast<cudf::column_view*>(input_ptr);
+    cudf::column_view const* input = reinterpret_cast<cudf::column_view*>(input_ptr);
     return release_as_jlong(cudf::datetime::day_of_year(*input));
   }
   CATCH_STD(env, 0);
@@ -1209,7 +1209,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_quarterOfYear(JNIEnv* env
   JNI_NULL_CHECK(env, input_ptr, "input is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    const cudf::column_view* input = reinterpret_cast<cudf::column_view*>(input_ptr);
+    cudf::column_view const* input = reinterpret_cast<cudf::column_view*>(input_ptr);
     return release_as_jlong(cudf::datetime::extract_quarter(*input));
   }
   CATCH_STD(env, 0);
@@ -1224,8 +1224,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_addCalendricalMonths(JNIE
   JNI_NULL_CHECK(env, months_ptr, "months is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    const cudf::column_view* ts     = reinterpret_cast<cudf::column_view*>(ts_ptr);
-    const cudf::column_view* months = reinterpret_cast<cudf::column_view*>(months_ptr);
+    cudf::column_view const* ts     = reinterpret_cast<cudf::column_view*>(ts_ptr);
+    cudf::column_view const* months = reinterpret_cast<cudf::column_view*>(months_ptr);
     return release_as_jlong(cudf::datetime::add_calendrical_months(*ts, *months));
   }
   CATCH_STD(env, 0);
@@ -1238,7 +1238,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_isLeapYear(JNIEnv* env,
   JNI_NULL_CHECK(env, input_ptr, "input is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    const cudf::column_view* input = reinterpret_cast<cudf::column_view*>(input_ptr);
+    cudf::column_view const* input = reinterpret_cast<cudf::column_view*>(input_ptr);
     return release_as_jlong(cudf::datetime::is_leap_year(*input));
   }
   CATCH_STD(env, 0);
@@ -2702,8 +2702,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringConcatenationListEl
   JNI_NULL_CHECK(env, col_narep, "column narep string scalar object is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    const auto& separator_narep_scalar = *reinterpret_cast<cudf::string_scalar*>(separator_narep);
-    const auto& col_narep_scalar       = *reinterpret_cast<cudf::string_scalar*>(col_narep);
+    auto const& separator_narep_scalar = *reinterpret_cast<cudf::string_scalar*>(separator_narep);
+    auto const& col_narep_scalar       = *reinterpret_cast<cudf::string_scalar*>(col_narep);
     auto null_policy                   = separate_nulls ? cudf::strings::separator_on_nulls::YES
                                                         : cudf::strings::separator_on_nulls::NO;
     auto empty_list_output             = empty_string_output_if_empty_list
@@ -2738,8 +2738,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_stringConcatenationListEl
   JNI_NULL_CHECK(env, narep, "separator narep string scalar object is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    const auto& separator_scalar = *reinterpret_cast<cudf::string_scalar*>(separator);
-    const auto& narep_scalar     = *reinterpret_cast<cudf::string_scalar*>(narep);
+    auto const& separator_scalar = *reinterpret_cast<cudf::string_scalar*>(separator);
+    auto const& narep_scalar     = *reinterpret_cast<cudf::string_scalar*>(narep);
     auto null_policy             = separate_nulls ? cudf::strings::separator_on_nulls::YES
                                                   : cudf::strings::separator_on_nulls::NO;
     auto empty_list_output       = empty_string_output_if_empty_list
@@ -2837,7 +2837,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_toHex(JNIEnv* env, jclass
   JNI_NULL_CHECK(env, input_ptr, "input is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    const cudf::column_view* input = reinterpret_cast<cudf::column_view*>(input_ptr);
+    cudf::column_view const* input = reinterpret_cast<cudf::column_view*>(input_ptr);
     return release_as_jlong(cudf::strings::integers_to_hex(*input));
   }
   CATCH_STD(env, 0);
diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp
index 8bd0f7793b4..5842a980fc4 100644
--- a/java/src/main/native/src/RmmJni.cpp
+++ b/java/src/main/native/src/RmmJni.cpp
@@ -450,7 +450,7 @@ class pinned_fallback_host_memory_resource {
   {
     try {
       return _pool->allocate(bytes, alignment);
-    } catch (const std::exception& unused) {
+    } catch (std::exception const& unused) {
       // try to allocate using the underlying pinned resource
       return prior_cudf_pinned_mr().allocate(bytes, alignment);
     }
@@ -558,13 +558,13 @@ class pinned_fallback_host_memory_resource {
   /**
    * @briefreturn{true if the specified resource is the same type as this resource.}
    */
-  bool operator==(const pinned_fallback_host_memory_resource&) const { return true; }
+  bool operator==(pinned_fallback_host_memory_resource const&) const { return true; }
 
   /**
    * @briefreturn{true if the specified resource is not the same type as this resource, otherwise
    * false.}
    */
-  bool operator!=(const pinned_fallback_host_memory_resource&) const { return false; }
+  bool operator!=(pinned_fallback_host_memory_resource const&) const { return false; }
 
   /**
    * @brief Enables the `cuda::mr::device_accessible` property
@@ -1067,7 +1067,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_allocFromPinnedPool(JNIEnv* env,
     auto pool = reinterpret_cast<rmm_pinned_pool_t*>(pool_ptr);
     void* ret = pool->allocate(size);
     return reinterpret_cast<jlong>(ret);
-  } catch (const std::exception& unused) {
+  } catch (std::exception const& unused) {
     return -1;
   }
 }
diff --git a/java/src/main/native/src/ScalarJni.cpp b/java/src/main/native/src/ScalarJni.cpp
index 6a1ad1a9f32..55037910abe 100644
--- a/java/src/main/native/src/ScalarJni.cpp
+++ b/java/src/main/native/src/ScalarJni.cpp
@@ -180,8 +180,8 @@ Java_ai_rapids_cudf_Scalar_getChildrenFromStructScalar(JNIEnv* env, jclass, jlon
   JNI_NULL_CHECK(env, scalar_handle, "scalar handle is null", 0);
   try {
     cudf::jni::auto_set_device(env);
-    const auto s                  = reinterpret_cast<cudf::struct_scalar*>(scalar_handle);
-    const cudf::table_view& table = s->view();
+    auto const s                  = reinterpret_cast<cudf::struct_scalar*>(scalar_handle);
+    cudf::table_view const& table = s->view();
     cudf::jni::native_jpointerArray<cudf::column_view> column_handles(env, table.num_columns());
     for (int i = 0; i < table.num_columns(); i++) {
       column_handles[i] = new cudf::column_view(table.column(i));
diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
index e411b1d5362..c58cd732b39 100644
--- a/java/src/main/native/src/TableJni.cpp
+++ b/java/src/main/native/src/TableJni.cpp
@@ -102,14 +102,14 @@ typedef jni_table_writer_handle<cudf::io::orc_chunked_writer> native_orc_writer_
 
 class native_arrow_ipc_writer_handle final {
  public:
-  explicit native_arrow_ipc_writer_handle(const std::vector<std::string>& col_names,
-                                          const std::string& file_name)
+  explicit native_arrow_ipc_writer_handle(std::vector<std::string> const& col_names,
+                                          std::string const& file_name)
     : initialized(false), column_names(col_names), file_name(file_name)
   {
   }
 
-  explicit native_arrow_ipc_writer_handle(const std::vector<std::string>& col_names,
-                                          const std::shared_ptr<arrow::io::OutputStream>& sink)
+  explicit native_arrow_ipc_writer_handle(std::vector<std::string> const& col_names,
+                                          std::shared_ptr<arrow::io::OutputStream> const& sink)
     : initialized(false), column_names(col_names), file_name(""), sink(sink)
   {
   }
@@ -178,7 +178,7 @@ class native_arrow_ipc_writer_handle final {
     initialized = false;
   }
 
-  std::vector<cudf::column_metadata> get_column_metadata(const cudf::table_view& tview)
+  std::vector<cudf::column_metadata> get_column_metadata(cudf::table_view const& tview)
   {
     if (!column_names.empty() && columns_meta.empty()) {
       // Rebuild the structure of column meta according to table schema.
@@ -200,9 +200,9 @@ class native_arrow_ipc_writer_handle final {
   }
 
  private:
-  cudf::column_metadata build_one_column_meta(const cudf::column_view& cview,
+  cudf::column_metadata build_one_column_meta(cudf::column_view const& cview,
                                               size_t& idx,
-                                              const bool consume_name = true)
+                                              bool const consume_name = true)
   {
     auto col_meta = cudf::column_metadata{};
     if (consume_name) { col_meta.name = get_column_name(idx++); }
@@ -266,16 +266,16 @@ class jni_arrow_output_stream final : public arrow::io::OutputStream {
     host_memory_allocator = nullptr;
   }
 
-  arrow::Status Write(const std::shared_ptr<arrow::Buffer>& data) override
+  arrow::Status Write(std::shared_ptr<arrow::Buffer> const& data) override
   {
     return Write(data->data(), data->size());
   }
 
-  arrow::Status Write(const void* data, int64_t nbytes) override
+  arrow::Status Write(void const* data, int64_t nbytes) override
   {
     JNIEnv* env           = cudf::jni::get_jni_env(jvm);
     int64_t left_to_copy  = nbytes;
-    const char* copy_from = static_cast<const char*>(data);
+    char const* copy_from = static_cast<char const*>(data);
     while (left_to_copy > 0) {
       long buffer_amount_available = current_buffer_len - current_buffer_written;
       if (buffer_amount_available <= 0) {
@@ -440,7 +440,7 @@ class jni_arrow_input_stream final : public arrow::io::InputStream {
 
 class native_arrow_ipc_reader_handle final {
  public:
-  explicit native_arrow_ipc_reader_handle(const std::string& file_name)
+  explicit native_arrow_ipc_reader_handle(std::string const& file_name)
   {
     auto tmp_source = arrow::io::ReadableFile::Open(file_name);
     if (!tmp_source.ok()) { throw std::runtime_error(tmp_source.status().message()); }
@@ -2056,7 +2056,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readAvro(JNIEnv* env,
                                                                 jlong buffer,
                                                                 jlong buffer_length)
 {
-  const bool read_buffer = (buffer != 0);
+  bool const read_buffer = (buffer != 0);
   if (!read_buffer) {
     JNI_NULL_CHECK(env, inputfilepath, "input file or buffer must be supplied", NULL);
   } else if (inputfilepath != NULL) {
@@ -2421,7 +2421,7 @@ Java_ai_rapids_cudf_Table_writeORCBufferBegin(JNIEnv* env,
                    meta_keys.end(),
                    meta_values.begin(),
                    std::inserter(kv_metadata, kv_metadata.end()),
-                   [](const std::string& k, const std::string& v) { return std::make_pair(k, v); });
+                   [](std::string const& k, std::string const& v) { return std::make_pair(k, v); });
 
     std::unique_ptr<cudf::jni::jni_writer_data_sink> data_sink(
       new cudf::jni::jni_writer_data_sink(env, consumer, host_memory_allocator));
@@ -2495,7 +2495,7 @@ JNIEXPORT long JNICALL Java_ai_rapids_cudf_Table_writeORCFileBegin(JNIEnv* env,
                    meta_keys.end(),
                    meta_values.begin(),
                    std::inserter(kv_metadata, kv_metadata.end()),
-                   [](const std::string& k, const std::string& v) { return std::make_pair(k, v); });
+                   [](std::string const& k, std::string const& v) { return std::make_pair(k, v); });
 
     sink_info sink{output_path.get()};
     auto stats                      = std::make_shared<cudf::io::writer_compression_statistics>();
diff --git a/java/src/main/native/src/jni_writer_data_sink.hpp b/java/src/main/native/src/jni_writer_data_sink.hpp
index 52756266beb..c918e87ba89 100644
--- a/java/src/main/native/src/jni_writer_data_sink.hpp
+++ b/java/src/main/native/src/jni_writer_data_sink.hpp
@@ -61,7 +61,7 @@ class jni_writer_data_sink final : public cudf::io::data_sink {
   {
     JNIEnv* env           = cudf::jni::get_jni_env(jvm);
     long left_to_copy     = static_cast<long>(size);
-    const char* copy_from = static_cast<const char*>(data);
+    char const* copy_from = static_cast<char const*>(data);
     while (left_to_copy > 0) {
       long buffer_amount_available = current_buffer_len - current_buffer_written;
       if (buffer_amount_available <= 0) {
@@ -87,7 +87,7 @@ class jni_writer_data_sink final : public cudf::io::data_sink {
   {
     JNIEnv* env           = cudf::jni::get_jni_env(jvm);
     long left_to_copy     = static_cast<long>(size);
-    const char* copy_from = static_cast<const char*>(gpu_data);
+    char const* copy_from = static_cast<char const*>(gpu_data);
     while (left_to_copy > 0) {
       long buffer_amount_available = current_buffer_len - current_buffer_written;
       if (buffer_amount_available <= 0) {

From 9fae8ab6133614dd155c8ca445d59eb1ce36b4bd Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Wed, 12 Jun 2024 14:16:31 +0100
Subject: [PATCH 344/842] Add test coverage for slicing with "out of bounds"
 negative indices (#15990)

Polars wraps negative starts and then clamps both the resulting start
and length to [0, num_rows), so we should do that.

Add tests of this behaviour as well.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/15990
---
 .../cudf_polars/containers/dataframe.py           | 13 ++++++++-----
 python/cudf_polars/cudf_polars/testing/asserts.py | 14 ++++++++++++--
 python/cudf_polars/cudf_polars/typing/__init__.py | 15 ++++++++++++++-
 python/cudf_polars/tests/test_slice.py            | 13 +++++++------
 4 files changed, 41 insertions(+), 14 deletions(-)

diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py
index 7039fcaf077..d1f7a9ed2cf 100644
--- a/python/cudf_polars/cudf_polars/containers/dataframe.py
+++ b/python/cudf_polars/cudf_polars/containers/dataframe.py
@@ -96,7 +96,7 @@ def from_table(cls, table: plc.Table, names: Sequence[str]) -> Self:
 
         Returns
         -------
-        New dataframe sharing  data with the input table.
+        New dataframe sharing data with the input table.
 
         Raises
         ------
@@ -205,15 +205,18 @@ def slice(self, zlice: tuple[int, int] | None) -> Self:
 
         Returns
         -------
-        New dataframe (if zlice is not None) other self (if it is)
+        New dataframe (if zlice is not None) otherwise self (if it is)
         """
         if zlice is None:
             return self
         start, length = zlice
         if start < 0:
             start += self.num_rows
-        # Polars slice takes an arbitrary positive integer and slice
-        # to the end of the frame if it is larger.
-        end = min(start + length, self.num_rows)
+        # Polars implementation wraps negative start by num_rows, then
+        # adds length to start to get the end, then clamps both to
+        # [0, num_rows)
+        end = start + length
+        start = max(min(start, self.num_rows), 0)
+        end = max(min(end, self.num_rows), 0)
         (table,) = plc.copying.slice(self.table, [start, end])
         return type(self).from_table(table, self.column_names).sorted_like(self)
diff --git a/python/cudf_polars/cudf_polars/testing/asserts.py b/python/cudf_polars/cudf_polars/testing/asserts.py
index 2f19b41cc3a..3edaa427432 100644
--- a/python/cudf_polars/cudf_polars/testing/asserts.py
+++ b/python/cudf_polars/cudf_polars/testing/asserts.py
@@ -13,14 +13,19 @@
 from cudf_polars.callback import execute_with_cudf
 
 if TYPE_CHECKING:
+    from collections.abc import Mapping
+
     import polars as pl
 
+    from cudf_polars.typing import OptimizationArgs
+
 __all__: list[str] = ["assert_gpu_result_equal"]
 
 
 def assert_gpu_result_equal(
     lazydf: pl.LazyFrame,
     *,
+    collect_kwargs: Mapping[OptimizationArgs, bool] | None = None,
     check_row_order: bool = True,
     check_column_order: bool = True,
     check_dtypes: bool = True,
@@ -36,6 +41,9 @@ def assert_gpu_result_equal(
     ----------
     lazydf
         frame to collect.
+    collect_kwargs
+        Keyword arguments to pass to collect. Useful for controlling
+        optimization settings.
     check_row_order
         Expect rows to be in same order
     check_column_order
@@ -59,9 +67,11 @@ def assert_gpu_result_equal(
     NotImplementedError
         If GPU collection failed in some way.
     """
-    expect = lazydf.collect()
+    collect_kwargs = {} if collect_kwargs is None else collect_kwargs
+    expect = lazydf.collect(**collect_kwargs)
     got = lazydf.collect(
-        post_opt_callback=partial(execute_with_cudf, raise_on_fail=True)
+        **collect_kwargs,
+        post_opt_callback=partial(execute_with_cudf, raise_on_fail=True),
     )
     assert_frame_equal(
         expect,
diff --git a/python/cudf_polars/cudf_polars/typing/__init__.py b/python/cudf_polars/cudf_polars/typing/__init__.py
index 287c977f4eb..6d597a91724 100644
--- a/python/cudf_polars/cudf_polars/typing/__init__.py
+++ b/python/cudf_polars/cudf_polars/typing/__init__.py
@@ -6,7 +6,7 @@
 from __future__ import annotations
 
 from collections.abc import Mapping
-from typing import TYPE_CHECKING, Protocol, TypeAlias
+from typing import TYPE_CHECKING, Literal, Protocol, TypeAlias
 
 from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir
 
@@ -89,3 +89,16 @@ def set_udf(
     ) -> None:
         """Set the callback replacing the current node in the plan."""
         ...
+
+
+OptimizationArgs: TypeAlias = Literal[
+    "type_coercion",
+    "predicate_pushdown",
+    "projection_pushdown",
+    "simplify_expression",
+    "slice_pushdown",
+    "comm_subplan_elim",
+    "comm_subexpr_elim",
+    "cluster_with_columns",
+    "no_optimization",
+]
diff --git a/python/cudf_polars/tests/test_slice.py b/python/cudf_polars/tests/test_slice.py
index d27e91302ba..8ea5c623ae7 100644
--- a/python/cudf_polars/tests/test_slice.py
+++ b/python/cudf_polars/tests/test_slice.py
@@ -11,13 +11,14 @@
 
 @pytest.mark.parametrize(
     "offset",
-    [0, 1, 2],
+    [0, 1, 2, -10, -20, -1, -2, 20],
 )
 @pytest.mark.parametrize(
-    "len",
-    [0, 2, 12],
+    "length",
+    [0, 2, 12, 11],
 )
-def test_slice(offset, len):
+@pytest.mark.parametrize("slice_pushdown", [False, True])
+def test_slice(offset, length, slice_pushdown):
     ldf = pl.DataFrame(
         {
             "a": [1, 2, 3, 4, 5, 6, 7],
@@ -29,6 +30,6 @@ def test_slice(offset, len):
         ldf.group_by(pl.col("a"))
         .agg(pl.col("b").sum())
         .sort(by=pl.col("a"))
-        .slice(offset, len)
+        .slice(offset, length)
     )
-    assert_gpu_result_equal(query)
+    assert_gpu_result_equal(query, collect_kwargs={"slice_pushdown": slice_pushdown})

From e57f0fe4edafb689ff468ae6336d47b3aea4772d Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Wed, 12 Jun 2024 09:19:48 -0500
Subject: [PATCH 345/842] Enable round-tripping of large strings in `cudf`
 (#15944)

Fixes: #15922

This PR adds support for round-tripping `LargeStringArray` in `cudf` using 64 bit offsets.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)
  - David Wendt (https://github.com/davidwendt)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/15944
---
 cpp/src/interop/from_arrow.cu          | 42 ++++++++++++++++++++------
 cpp/src/interop/to_arrow.cu            | 18 ++++++++---
 cpp/tests/interop/from_arrow_test.cpp  | 42 ++++++++++++++++++++++++--
 python/cudf/cudf/core/column/column.py |  6 ----
 python/cudf/cudf/tests/test_series.py  | 11 ++++---
 5 files changed, 91 insertions(+), 28 deletions(-)

diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu
index f100ca0cc2b..579820cbae3 100644
--- a/cpp/src/interop/from_arrow.cu
+++ b/cpp/src/interop/from_arrow.cu
@@ -78,6 +78,7 @@ data_type arrow_to_cudf_type(arrow::DataType const& arrow_type)
       }
     }
     case arrow::Type::STRING: return data_type(type_id::STRING);
+    case arrow::Type::LARGE_STRING: return data_type(type_id::STRING);
     case arrow::Type::DICTIONARY: return data_type(type_id::DICTIONARY32);
     case arrow::Type::LIST: return data_type(type_id::LIST);
     case arrow::Type::DECIMAL: {
@@ -276,21 +277,42 @@ std::unique_ptr<column> dispatch_to_cudf_column::operator()<cudf::string_view>(
   rmm::device_async_resource_ref mr)
 {
   if (array.length() == 0) { return make_empty_column(type_id::STRING); }
-  auto str_array    = static_cast<arrow::StringArray const*>(&array);
-  auto offset_array = std::make_unique<arrow::Int32Array>(
-    str_array->value_offsets()->size() / sizeof(int32_t), str_array->value_offsets(), nullptr);
-  auto char_array = std::make_unique<arrow::Int8Array>(
-    str_array->value_data()->size(), str_array->value_data(), nullptr);
 
-  auto offsets_column = dispatch_to_cudf_column{}.operator()<int32_t>(
-    *offset_array, data_type(type_id::INT32), true, stream, mr);
-  auto chars_column = dispatch_to_cudf_column{}.operator()<int8_t>(
-    *char_array, data_type(type_id::INT8), true, stream, mr);
+  std::unique_ptr<column> offsets_column;
+  std::unique_ptr<arrow::Array> char_array;
+
+  if (array.type_id() == arrow::Type::LARGE_STRING) {
+    auto str_array    = static_cast<arrow::LargeStringArray const*>(&array);
+    auto offset_array = std::make_unique<arrow::Int64Array>(
+      str_array->value_offsets()->size() / sizeof(int64_t), str_array->value_offsets(), nullptr);
+    offsets_column = dispatch_to_cudf_column{}.operator()<int64_t>(
+      *offset_array, data_type(type_id::INT64), true, stream, mr);
+    char_array = std::make_unique<arrow::Int8Array>(
+      str_array->value_data()->size(), str_array->value_data(), nullptr);
+  } else if (array.type_id() == arrow::Type::STRING) {
+    auto str_array    = static_cast<arrow::StringArray const*>(&array);
+    auto offset_array = std::make_unique<arrow::Int32Array>(
+      str_array->value_offsets()->size() / sizeof(int32_t), str_array->value_offsets(), nullptr);
+    offsets_column = dispatch_to_cudf_column{}.operator()<int32_t>(
+      *offset_array, data_type(type_id::INT32), true, stream, mr);
+    char_array = std::make_unique<arrow::Int8Array>(
+      str_array->value_data()->size(), str_array->value_data(), nullptr);
+  } else {
+    throw std::runtime_error("Unsupported array type");
+  }
+
+  rmm::device_buffer chars(char_array->length(), stream, mr);
+  auto data_buffer = char_array->data()->buffers[1];
+  CUDF_CUDA_TRY(cudaMemcpyAsync(chars.data(),
+                                reinterpret_cast<uint8_t const*>(data_buffer->address()),
+                                chars.size(),
+                                cudaMemcpyDefault,
+                                stream.value()));
 
   auto const num_rows = offsets_column->size() - 1;
   auto out_col        = make_strings_column(num_rows,
                                      std::move(offsets_column),
-                                     std::move(chars_column->release().data.release()[0]),
+                                     std::move(chars),
                                      array.null_count(),
                                      std::move(*get_mask_buffer(array, stream, mr)));
 
diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu
index e871e656c48..47aee982c32 100644
--- a/cpp/src/interop/to_arrow.cu
+++ b/cpp/src/interop/to_arrow.cu
@@ -306,11 +306,19 @@ std::shared_ptr<arrow::Array> dispatch_to_arrow::operator()<cudf::string_view>(
                               static_cast<std::size_t>(sview.chars_size(stream))},
     ar_mr,
     stream);
-  return std::make_shared<arrow::StringArray>(static_cast<int64_t>(input_view.size()),
-                                              offset_buffer,
-                                              data_buffer,
-                                              fetch_mask_buffer(input_view, ar_mr, stream),
-                                              static_cast<int64_t>(input_view.null_count()));
+  if (sview.offsets().type().id() == cudf::type_id::INT64) {
+    return std::make_shared<arrow::LargeStringArray>(static_cast<int64_t>(input_view.size()),
+                                                     offset_buffer,
+                                                     data_buffer,
+                                                     fetch_mask_buffer(input_view, ar_mr, stream),
+                                                     static_cast<int64_t>(input_view.null_count()));
+  } else {
+    return std::make_shared<arrow::StringArray>(static_cast<int64_t>(input_view.size()),
+                                                offset_buffer,
+                                                data_buffer,
+                                                fetch_mask_buffer(input_view, ar_mr, stream),
+                                                static_cast<int64_t>(input_view.null_count()));
+  }
 }
 
 template <>
diff --git a/cpp/tests/interop/from_arrow_test.cpp b/cpp/tests/interop/from_arrow_test.cpp
index aec2bab7196..af20a5c772f 100644
--- a/cpp/tests/interop/from_arrow_test.cpp
+++ b/cpp/tests/interop/from_arrow_test.cpp
@@ -50,13 +50,36 @@ std::unique_ptr<cudf::table> get_cudf_table()
                                                               {true, false, true, true, true});
   columns.emplace_back(std::move(cudf::dictionary::encode(col4)));
   columns.emplace_back(cudf::test::fixed_width_column_wrapper<bool>(
-                         {true, false, true, false, true}, {true, false, true, true, false})
+                         {true, false, true, false, true}, {true, false, true, true, false}).release());
+  columns.emplace_back(cudf::test::strings_column_wrapper(
+                         {
+                           "",
+                           "abc",
+                           "def",
+                           "1",
+                           "2",
+                         },
+                         {0, 1, 1, 1, 1})
                          .release());
   // columns.emplace_back(cudf::test::lists_column_wrapper<int>({{1, 2}, {3, 4}, {}, {6}, {7, 8,
   // 9}}).release());
   return std::make_unique<cudf::table>(std::move(columns));
 }
 
+std::shared_ptr<arrow::LargeStringArray> get_arrow_large_string_array(
+  std::vector<std::string> const& data, std::vector<uint8_t> const& mask = {})
+{
+  std::shared_ptr<arrow::LargeStringArray> large_string_array;
+  arrow::LargeStringBuilder large_string_builder;
+
+  CUDF_EXPECTS(large_string_builder.AppendValues(data, mask.data()).ok(),
+               "Failed to append values to string builder");
+  CUDF_EXPECTS(large_string_builder.Finish(&large_string_array).ok(),
+               "Failed to create arrow string array");
+
+  return large_string_array;
+}
+
 struct FromArrowTest : public cudf::test::BaseFixture {};
 
 template <typename T>
@@ -294,6 +317,15 @@ TEST_F(FromArrowTest, ChunkedArray)
       "ccc",
     },
     {0, 1});
+  auto large_string_array_1 = get_arrow_large_string_array(
+    {
+      "",
+      "abc",
+      "def",
+      "1",
+      "2",
+    },
+    {0, 1, 1, 1, 1});
   auto dict_array1 = get_arrow_dict_array({1, 2, 5, 7}, {0, 1, 2}, {1, 0, 1});
   auto dict_array2 = get_arrow_dict_array({1, 2, 5, 7}, {1, 3});
 
@@ -307,13 +339,16 @@ TEST_F(FromArrowTest, ChunkedArray)
   auto boolean_array =
     get_arrow_array<bool>({true, false, true, false, true}, {true, false, true, true, false});
   auto boolean_chunked_array = std::make_shared<arrow::ChunkedArray>(boolean_array);
+  auto large_string_chunked_array = std::make_shared<arrow::ChunkedArray>(
+    std::vector<std::shared_ptr<arrow::Array>>{large_string_array_1});
 
   std::vector<std::shared_ptr<arrow::Field>> schema_vector(
     {arrow::field("a", int32_chunked_array->type()),
      arrow::field("b", int64array->type()),
      arrow::field("c", string_array_1->type()),
      arrow::field("d", dict_chunked_array->type()),
-     arrow::field("e", boolean_chunked_array->type())});
+     arrow::field("e", boolean_chunked_array->type()),
+     arrow::field("f", large_string_array_1->type())});
   auto schema = std::make_shared<arrow::Schema>(schema_vector);
 
   auto arrow_table = arrow::Table::Make(schema,
@@ -321,7 +356,8 @@ TEST_F(FromArrowTest, ChunkedArray)
                                          int64_chunked_array,
                                          string_chunked_array,
                                          dict_chunked_array,
-                                         boolean_chunked_array});
+                                         boolean_chunked_array,
+                                         large_string_chunked_array});
 
   auto expected_cudf_table = get_cudf_table();
 
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 7abdbc85720..001e8996c19 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -334,12 +334,6 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase:
             )
         elif isinstance(array.type, ArrowIntervalType):
             return cudf.core.column.IntervalColumn.from_arrow(array)
-        elif pa.types.is_large_string(array.type):
-            # Pandas-2.2+: Pandas defaults to `large_string` type
-            # instead of `string` without data-introspection.
-            # Temporary workaround until cudf has native
-            # support for `LARGE_STRING` i.e., 64 bit offsets
-            array = array.cast(pa.string())
 
         data = pa.table([array], [None])
 
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index f47c42d9a1d..30189e1ac8a 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -2737,13 +2737,16 @@ def test_series_dtype_astypes(data):
     assert_eq(result, expected)
 
 
-def test_series_from_large_string():
-    pa_large_string_array = pa.array(["a", "b", "c"]).cast(pa.large_string())
-    got = cudf.Series(pa_large_string_array)
-    expected = pd.Series(pa_large_string_array)
+@pytest.mark.parametrize("pa_type", [pa.string, pa.large_string])
+def test_series_from_large_string(pa_type):
+    pa_string_array = pa.array(["a", "b", "c"]).cast(pa_type())
+    got = cudf.Series(pa_string_array)
+    expected = pd.Series(pa_string_array)
 
     assert_eq(expected, got)
 
+    assert pa_string_array.equals(got.to_arrow())
+
 
 @pytest.mark.parametrize(
     "scalar",

From c0c2ad355c720d4d168b48aea3d5564efcd890a7 Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Wed, 12 Jun 2024 09:54:34 -0500
Subject: [PATCH 346/842] resolve dependency-file-generator warning, remove
 unnecessary rapids-build-backend configuration (#15980)

Contributes to https://github.com/rapidsai/build-planning/issues/31
Contributes to https://github.com/rapidsai/dependency-file-generator/issues/89

#15245 was one of the first `rapids-build-backend` PRs merged across RAPIDS. Since it was merged, we've made some small adjustments to the approach for `rapids-build-backend`. This catches `cudf` up with those changes:

* consolidates version-handling in `ci/build_cpp.sh`
* removes `commit-file` configuration in `pyproject.toml`
  - *as of https://github.com/rapidsai/rapids-build-backend/pull/30, this is no longer necessary if the project's top-level directory is `{project_name}.replace("-", "_")*
  - *and anyway, it was changed from `commit-file` to `commit-files` in that PR, so `commit-file` was being silently ignored here*
* uses `--file-key` instead of `--file_key` in `rapids-dependency-file-generator` calls

Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/15980
---
 ci/build_cpp.sh                   | 4 +---
 ci/build_docs.sh                  | 2 +-
 ci/check_style.sh                 | 2 +-
 ci/configure_cpp_static.sh        | 2 +-
 ci/test_cpp_common.sh             | 2 +-
 ci/test_java.sh                   | 2 +-
 ci/test_notebooks.sh              | 2 +-
 ci/test_python_common.sh          | 2 +-
 python/cudf/pyproject.toml        | 1 -
 python/cudf_kafka/pyproject.toml  | 1 -
 python/cudf_polars/pyproject.toml | 1 -
 python/custreamz/pyproject.toml   | 1 -
 python/dask_cudf/pyproject.toml   | 1 -
 13 files changed, 8 insertions(+), 15 deletions(-)

diff --git a/ci/build_cpp.sh b/ci/build_cpp.sh
index 740a6409ccd..e5fcef17a83 100755
--- a/ci/build_cpp.sh
+++ b/ci/build_cpp.sh
@@ -13,12 +13,10 @@ export CMAKE_GENERATOR=Ninja
 
 rapids-print-env
 
-version=$(rapids-generate-version)
-
 rapids-logger "Begin cpp build"
 
 # With boa installed conda build forward to boa
-RAPIDS_PACKAGE_VERSION=${version} rapids-conda-retry mambabuild \
+RAPIDS_PACKAGE_VERSION=$(rapids-generate-version) rapids-conda-retry mambabuild \
     conda/recipes/libcudf
 
 rapids-upload-conda-to-s3 cpp
diff --git a/ci/build_docs.sh b/ci/build_docs.sh
index 67a5415f353..14dc7a59048 100755
--- a/ci/build_docs.sh
+++ b/ci/build_docs.sh
@@ -14,7 +14,7 @@ ENV_YAML_DIR="$(mktemp -d)"
 
 rapids-dependency-file-generator \
   --output conda \
-  --file_key docs \
+  --file-key docs \
   --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee "${ENV_YAML_DIR}/env.yaml"
 
 rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n docs
diff --git a/ci/check_style.sh b/ci/check_style.sh
index 029cd305f1d..634d8b0d702 100755
--- a/ci/check_style.sh
+++ b/ci/check_style.sh
@@ -10,7 +10,7 @@ ENV_YAML_DIR="$(mktemp -d)"
 
 rapids-dependency-file-generator \
   --output conda \
-  --file_key checks \
+  --file-key checks \
   --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee "${ENV_YAML_DIR}/env.yaml"
 
 rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n checks
diff --git a/ci/configure_cpp_static.sh b/ci/configure_cpp_static.sh
index 11d5585d98f..51e41b065fb 100755
--- a/ci/configure_cpp_static.sh
+++ b/ci/configure_cpp_static.sh
@@ -12,7 +12,7 @@ REQUIREMENTS_FILE="${ENV_YAML_DIR}/requirements.txt"
 
 rapids-dependency-file-generator \
   --output requirements \
-  --file_key test_static_build \
+  --file-key test_static_build \
   --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch)" | tee "${REQUIREMENTS_FILE}"
 
 python -m pip install -r "${REQUIREMENTS_FILE}"
diff --git a/ci/test_cpp_common.sh b/ci/test_cpp_common.sh
index da847137a2b..f5a8de543f6 100755
--- a/ci/test_cpp_common.sh
+++ b/ci/test_cpp_common.sh
@@ -11,7 +11,7 @@ ENV_YAML_DIR="$(mktemp -d)"
 
 rapids-dependency-file-generator \
   --output conda \
-  --file_key test_cpp \
+  --file-key test_cpp \
   --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch)" | tee "${ENV_YAML_DIR}/env.yaml"
 
 rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n test
diff --git a/ci/test_java.sh b/ci/test_java.sh
index c93079742f0..9713eb192d2 100755
--- a/ci/test_java.sh
+++ b/ci/test_java.sh
@@ -11,7 +11,7 @@ ENV_YAML_DIR="$(mktemp -d)"
 
 rapids-dependency-file-generator \
   --output conda \
-  --file_key test_java \
+  --file-key test_java \
   --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch)" | tee "${ENV_YAML_DIR}/env.yaml"
 
 rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n test
diff --git a/ci/test_notebooks.sh b/ci/test_notebooks.sh
index 8be2d374bed..da9478ce25d 100755
--- a/ci/test_notebooks.sh
+++ b/ci/test_notebooks.sh
@@ -11,7 +11,7 @@ ENV_YAML_DIR="$(mktemp -d)"
 
 rapids-dependency-file-generator \
   --output conda \
-  --file_key test_notebooks \
+  --file-key test_notebooks \
   --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee "${ENV_YAML_DIR}/env.yaml"
 
 rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n test
diff --git a/ci/test_python_common.sh b/ci/test_python_common.sh
index 7559d970f6d..e8849588aa5 100755
--- a/ci/test_python_common.sh
+++ b/ci/test_python_common.sh
@@ -13,7 +13,7 @@ ENV_YAML_DIR="$(mktemp -d)"
 
 rapids-dependency-file-generator \
   --output conda \
-  --file_key test_python \
+  --file-key test_python \
   --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee "${ENV_YAML_DIR}/env.yaml"
 
 rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n test
diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml
index 9ad02fed044..20b731624df 100644
--- a/python/cudf/pyproject.toml
+++ b/python/cudf/pyproject.toml
@@ -119,7 +119,6 @@ skip = [
 
 [tool.rapids-build-backend]
 build-backend = "scikit_build_core.build"
-commit-file = "cudf/GIT_COMMIT"
 dependencies-file = "../../dependencies.yaml"
 requires = [
     "cmake>=3.26.4",
diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml
index 1bc04742a73..11e18cd4f32 100644
--- a/python/cudf_kafka/pyproject.toml
+++ b/python/cudf_kafka/pyproject.toml
@@ -99,7 +99,6 @@ regex = "(?P<value>.*)"
 
 [tool.rapids-build-backend]
 build-backend = "scikit_build_core.build"
-commit-file = "cudf_kafka/GIT_COMMIT"
 dependencies-file = "../../dependencies.yaml"
 requires = [
     "cmake>=3.26.4",
diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml
index 11178a3be74..face04b9bd8 100644
--- a/python/cudf_polars/pyproject.toml
+++ b/python/cudf_polars/pyproject.toml
@@ -174,7 +174,6 @@ docstring-code-format = true
 
 [tool.rapids-build-backend]
 build-backend = "setuptools.build_meta"
-commit-file = "cudf_polars/GIT_COMMIT"
 dependencies-file = "../../dependencies.yaml"
 # Pure python
 disable-cuda = true
diff --git a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml
index e004a8f5219..7b99e041b54 100644
--- a/python/custreamz/pyproject.toml
+++ b/python/custreamz/pyproject.toml
@@ -48,7 +48,6 @@ Homepage = "https://github.com/rapidsai/cudf"
 
 [tool.rapids-build-backend]
 build-backend = "setuptools.build_meta"
-commit-file = "custreamz/COMMIT_FILE"
 dependencies-file = "../../dependencies.yaml"
 
 [tool.setuptools]
diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml
index 6b5d5ccc412..9b2e3a5a7b1 100644
--- a/python/dask_cudf/pyproject.toml
+++ b/python/dask_cudf/pyproject.toml
@@ -57,7 +57,6 @@ Homepage = "https://github.com/rapidsai/cudf"
 
 [tool.rapids-build-backend]
 build-backend = "setuptools.build_meta"
-commit-file = "dask_cudf/GIT_COMMIT"
 dependencies-file = "../../dependencies.yaml"
 
 [tool.setuptools]

From 0891c5dec7fd8ce0f2e0233fe1c637e49a53f86e Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Wed, 12 Jun 2024 17:50:52 +0100
Subject: [PATCH 347/842] Add tests covering magic methods of Expr objects
 (#15996)

repr is not stable for now because the pylibcudf datatype repr is not stable (it includes the address).

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/15996
---
 python/cudf_polars/cudf_polars/dsl/expr.py | 12 ++--
 python/cudf_polars/tests/dsl/__init__.py   |  6 ++
 python/cudf_polars/tests/dsl/test_expr.py  | 76 ++++++++++++++++++++++
 3 files changed, 89 insertions(+), 5 deletions(-)
 create mode 100644 python/cudf_polars/tests/dsl/__init__.py
 create mode 100644 python/cudf_polars/tests/dsl/test_expr.py

diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py
index a81cdcbf0c3..13e496136b5 100644
--- a/python/cudf_polars/cudf_polars/dsl/expr.py
+++ b/python/cudf_polars/cudf_polars/dsl/expr.py
@@ -134,14 +134,14 @@ def is_equal(self, other: Any) -> bool:
         True if the two expressions are equal, false otherwise.
         """
         if type(self) is not type(other):
-            return False
+            return False  # pragma: no cover; __eq__ trips first
         return self._ctor_arguments(self.children) == other._ctor_arguments(
             other.children
         )
 
     def __eq__(self, other: Any) -> bool:
         """Equality of expressions."""
-        if type(self) != type(other) or hash(self) != hash(other):
+        if type(self) is not type(other) or hash(self) != hash(other):
             return False
         else:
             return self.is_equal(other)
@@ -196,7 +196,9 @@ def do_evaluate(
             are returned during translation to the IR, but for now we
             are not perfect.
         """
-        raise NotImplementedError(f"Evaluation of {type(self).__name__}")
+        raise NotImplementedError(
+            f"Evaluation of expression {type(self).__name__}"
+        )  # pragma: no cover; translation of unimplemented nodes trips first
 
     def evaluate(
         self,
@@ -266,7 +268,7 @@ def collect_agg(self, *, depth: int) -> AggInfo:
         """
         raise NotImplementedError(
             f"Collecting aggregation info for {type(self).__name__}"
-        )
+        )  # pragma: no cover; check_agg trips first
 
 
 class NamedExpr:
@@ -287,7 +289,7 @@ def __hash__(self) -> int:
 
     def __repr__(self) -> str:
         """Repr of the expression."""
-        return f"NamedExpr({self.name}, {self.value}"
+        return f"NamedExpr({self.name}, {self.value})"
 
     def __eq__(self, other: Any) -> bool:
         """Equality of two expressions."""
diff --git a/python/cudf_polars/tests/dsl/__init__.py b/python/cudf_polars/tests/dsl/__init__.py
new file mode 100644
index 00000000000..4611d642f14
--- /dev/null
+++ b/python/cudf_polars/tests/dsl/__init__.py
@@ -0,0 +1,6 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+__all__: list[str] = []
diff --git a/python/cudf_polars/tests/dsl/test_expr.py b/python/cudf_polars/tests/dsl/test_expr.py
new file mode 100644
index 00000000000..ddc3ca66d86
--- /dev/null
+++ b/python/cudf_polars/tests/dsl/test_expr.py
@@ -0,0 +1,76 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+import pytest
+
+import cudf._lib.pylibcudf as plc
+
+from cudf_polars.dsl import expr
+
+
+def test_expression_equality_not_expression():
+    col = expr.Col(plc.DataType(plc.TypeId.INT8), "a")
+    assert not (col == "a")  # noqa: SIM201
+    assert col != "a"
+
+
+@pytest.mark.parametrize("dtype", [plc.TypeId.INT8, plc.TypeId.INT16])
+def test_column_ne_dtypes_differ(dtype):
+    a = expr.Col(plc.DataType(dtype), "a")
+    b = expr.Col(plc.DataType(plc.TypeId.FLOAT32), "a")
+    assert a != b
+
+
+@pytest.mark.parametrize("dtype", [plc.TypeId.INT8, plc.TypeId.INT16])
+def test_column_ne_names_differ(dtype):
+    a = expr.Col(plc.DataType(dtype), "a")
+    b = expr.Col(plc.DataType(dtype), "b")
+    assert a != b
+
+
+@pytest.mark.parametrize("dtype", [plc.TypeId.INT8, plc.TypeId.INT16])
+def test_column_eq_names_eq(dtype):
+    a = expr.Col(plc.DataType(dtype), "a")
+    b = expr.Col(plc.DataType(dtype), "a")
+    assert a == b
+
+
+def test_expr_hashable():
+    a = expr.Col(plc.DataType(plc.TypeId.INT8), "a")
+    b = expr.Col(plc.DataType(plc.TypeId.INT8), "b")
+    c = expr.Col(plc.DataType(plc.TypeId.FLOAT32), "c")
+
+    collection = {a, b, c}
+    assert len(collection) == 3
+    assert a in collection
+    assert b in collection
+    assert c in collection
+
+
+def test_namedexpr_hashable():
+    b = expr.NamedExpr("b", expr.Col(plc.DataType(plc.TypeId.INT8), "a"))
+    c = expr.NamedExpr("c", expr.Col(plc.DataType(plc.TypeId.INT8), "a"))
+
+    collection = {b, c}
+
+    assert len(collection) == 2
+
+    assert b in collection
+    assert c in collection
+
+
+def test_namedexpr_ne_values():
+    b1 = expr.NamedExpr("b1", expr.Col(plc.DataType(plc.TypeId.INT8), "a"))
+    b2 = expr.NamedExpr("b2", expr.Col(plc.DataType(plc.TypeId.INT16), "a"))
+
+    assert b1 != b2
+
+
+@pytest.mark.xfail(reason="pylibcudf datatype repr not stable")
+def test_namedexpr_repr_stable():
+    b1 = expr.NamedExpr("b1", expr.Col(plc.DataType(plc.TypeId.INT8), "a"))
+    b2 = expr.NamedExpr("b1", expr.Col(plc.DataType(plc.TypeId.INT8), "a"))
+
+    assert repr(b1) == repr(b2)

From 97518ac124c2e5992f0bd75f71ccacf06cd866a8 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Wed, 12 Jun 2024 19:04:03 +0100
Subject: [PATCH 348/842] Fix typo bug in gather implementation (#16000)

Pylibcudf calls the datatype accessor type(). Add tests to cover this case, and raising on out of bounds accesses.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Thomas Li (https://github.com/lithomas1)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16000
---
 python/cudf_polars/cudf_polars/dsl/expr.py    |  2 +-
 .../tests/expressions/test_gather.py          | 31 +++++++++++++++++++
 2 files changed, 32 insertions(+), 1 deletion(-)

diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py
index 13e496136b5..377a905aed6 100644
--- a/python/cudf_polars/cudf_polars/dsl/expr.py
+++ b/python/cudf_polars/cudf_polars/dsl/expr.py
@@ -801,7 +801,7 @@ def do_evaluate(
             obj = plc.replace.replace_nulls(
                 indices.obj,
                 plc.interop.from_arrow(
-                    pa.scalar(n, type=plc.interop.to_arrow(indices.obj.data_type()))
+                    pa.scalar(n, type=plc.interop.to_arrow(indices.obj.type()))
                 ),
             )
         else:
diff --git a/python/cudf_polars/tests/expressions/test_gather.py b/python/cudf_polars/tests/expressions/test_gather.py
index df33e19a0b6..6bffa3e252c 100644
--- a/python/cudf_polars/tests/expressions/test_gather.py
+++ b/python/cudf_polars/tests/expressions/test_gather.py
@@ -2,8 +2,11 @@
 # SPDX-License-Identifier: Apache-2.0
 from __future__ import annotations
 
+import pytest
+
 import polars as pl
 
+from cudf_polars import execute_with_cudf
 from cudf_polars.testing.asserts import assert_gpu_result_equal
 
 
@@ -17,3 +20,31 @@ def test_gather():
 
     query = ldf.select(pl.col("a").gather(pl.col("b")))
     assert_gpu_result_equal(query)
+
+
+def test_gather_with_nulls():
+    ldf = pl.LazyFrame(
+        {
+            "a": [1, 2, 3, 4, 5, 6, 7],
+            "b": [0, None, 1, None, 6, 1, 0],
+        }
+    )
+
+    query = ldf.select(pl.col("a").gather(pl.col("b")))
+
+    assert_gpu_result_equal(query)
+
+
+@pytest.mark.parametrize("negative", [False, True])
+def test_gather_out_of_bounds(negative):
+    ldf = pl.LazyFrame(
+        {
+            "a": [1, 2, 3, 4, 5, 6, 7],
+            "b": [0, -10 if negative else 10, 1, 2, 6, 1, 0],
+        }
+    )
+
+    query = ldf.select(pl.col("a").gather(pl.col("b")))
+
+    with pytest.raises(pl.exceptions.ComputeError):
+        query.collect(post_opt_callback=execute_with_cudf)

From b35991c366cf81b650fb79fc27604fd79468f132 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Wed, 12 Jun 2024 22:50:52 +0100
Subject: [PATCH 349/842] Add test that diagonal concat with mismatching
 schemas raises (#16006)

Arguably this should be determined during query optimization by polars, but for now it is raised late during compute, so we must validate on our side.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Thomas Li (https://github.com/lithomas1)

URL: https://github.com/rapidsai/cudf/pull/16006
---
 python/cudf_polars/cudf_polars/dsl/ir.py |  4 ++--
 python/cudf_polars/tests/test_union.py   | 16 ++++++++++++++++
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py
index 0a6deb5698c..46241ab8e71 100644
--- a/python/cudf_polars/cudf_polars/dsl/ir.py
+++ b/python/cudf_polars/cudf_polars/dsl/ir.py
@@ -933,10 +933,10 @@ class Union(IR):
     """Optional slice to apply after concatenation."""
 
     def __post_init__(self) -> None:
-        """Validated preconditions."""
+        """Validate preconditions."""
         schema = self.dfs[0].schema
         if not all(s.schema == schema for s in self.dfs[1:]):
-            raise ValueError("Schema mismatch")
+            raise NotImplementedError("Schema mismatch")
 
     def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
diff --git a/python/cudf_polars/tests/test_union.py b/python/cudf_polars/tests/test_union.py
index 18cf4748692..6c9122bc260 100644
--- a/python/cudf_polars/tests/test_union.py
+++ b/python/cudf_polars/tests/test_union.py
@@ -2,8 +2,11 @@
 # SPDX-License-Identifier: Apache-2.0
 from __future__ import annotations
 
+import pytest
+
 import polars as pl
 
+from cudf_polars import translate_ir
 from cudf_polars.testing.asserts import assert_gpu_result_equal
 
 
@@ -19,6 +22,19 @@ def test_union():
     assert_gpu_result_equal(query)
 
 
+def test_union_schema_mismatch_raises():
+    ldf = pl.DataFrame(
+        {
+            "a": [1, 2, 3, 4, 5, 6, 7],
+            "b": [1, 1, 1, 1, 1, 1, 1],
+        }
+    ).lazy()
+    ldf2 = ldf.select(pl.col("a").cast(pl.Float32))
+    query = pl.concat([ldf, ldf2], how="diagonal")
+    with pytest.raises(NotImplementedError):
+        _ = translate_ir(query._ldf.visit())
+
+
 def test_concat_vertical():
     ldf = pl.LazyFrame(
         {

From 31b33b90430a4f2496fcf1a42778bcd8e070c87c Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Thu, 13 Jun 2024 08:58:02 +0100
Subject: [PATCH 350/842] Add tests of implemented StringFunctions (#16007)

Additionally, assert that we raise during translation for an unhandled function.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - https://github.com/brandon-b-miller

URL: https://github.com/rapidsai/cudf/pull/16007
---
 python/cudf_polars/cudf_polars/dsl/expr.py    |  4 +-
 .../tests/expressions/test_stringfunction.py  | 41 +++++++++++++++++++
 2 files changed, 44 insertions(+), 1 deletion(-)
 create mode 100644 python/cudf_polars/tests/expressions/test_stringfunction.py

diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py
index 377a905aed6..298ef5ab070 100644
--- a/python/cudf_polars/cudf_polars/dsl/expr.py
+++ b/python/cudf_polars/cudf_polars/dsl/expr.py
@@ -691,7 +691,9 @@ def do_evaluate(
                 )
             )
         else:
-            raise NotImplementedError(f"StringFunction {self.name}")
+            raise NotImplementedError(
+                f"StringFunction {self.name}"
+            )  # pragma: no cover; handled by init raising
 
 
 class Sort(Expr):
diff --git a/python/cudf_polars/tests/expressions/test_stringfunction.py b/python/cudf_polars/tests/expressions/test_stringfunction.py
new file mode 100644
index 00000000000..198f35d376b
--- /dev/null
+++ b/python/cudf_polars/tests/expressions/test_stringfunction.py
@@ -0,0 +1,41 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import pytest
+
+import polars as pl
+
+from cudf_polars import translate_ir
+from cudf_polars.testing.asserts import assert_gpu_result_equal
+
+
+def test_supported_stringfunction_expression():
+    ldf = pl.LazyFrame(
+        {
+            "a": ["a", "b", "cdefg", "h", "Wıth ünιcοde"],  # noqa: RUF001
+            "b": [0, 3, 1, -1, None],
+        }
+    )
+
+    query = ldf.select(
+        pl.col("a").str.starts_with("Z"),
+        pl.col("a").str.ends_with("h").alias("endswith_h"),
+        pl.col("a").str.to_lowercase().alias("lower"),
+        pl.col("a").str.to_uppercase().alias("upper"),
+    )
+    assert_gpu_result_equal(query)
+
+
+def test_unsupported_stringfunction():
+    ldf = pl.LazyFrame(
+        {
+            "a": ["a", "b", "cdefg", "h", "Wıth ünιcοde"],  # noqa: RUF001
+            "b": [0, 3, 1, -1, None],
+        }
+    )
+
+    q = ldf.select(pl.col("a").str.count_matches("e", literal=True))
+
+    with pytest.raises(NotImplementedError):
+        _ = translate_ir(q._ldf.visit())

From 8bbc5121b2dec93d24337d399ff6616bbb971a06 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Thu, 13 Jun 2024 08:58:27 +0100
Subject: [PATCH 351/842] Add coverage selecting len from a dataframe (number
 of rows) (#16005)

Fix bug (and report a polars issue) for the case that the dataframe is empty, and therefore we cannot ask a column for its length.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16005
---
 .../cudf_polars/containers/dataframe.py       |  2 +-
 .../cudf_polars/tests/expressions/test_len.py | 26 +++++++++++++++++++
 2 files changed, 27 insertions(+), 1 deletion(-)
 create mode 100644 python/cudf_polars/tests/expressions/test_len.py

diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py
index d1f7a9ed2cf..ec8d00c3123 100644
--- a/python/cudf_polars/cudf_polars/containers/dataframe.py
+++ b/python/cudf_polars/cudf_polars/containers/dataframe.py
@@ -70,7 +70,7 @@ def num_columns(self) -> int:
     @cached_property
     def num_rows(self) -> int:
         """Number of rows."""
-        return self.table.num_rows()
+        return 0 if len(self.columns) == 0 else self.table.num_rows()
 
     @classmethod
     def from_cudf(cls, df: cudf.DataFrame) -> Self:
diff --git a/python/cudf_polars/tests/expressions/test_len.py b/python/cudf_polars/tests/expressions/test_len.py
new file mode 100644
index 00000000000..03b30928184
--- /dev/null
+++ b/python/cudf_polars/tests/expressions/test_len.py
@@ -0,0 +1,26 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import pytest
+
+import polars as pl
+
+from cudf_polars.testing.asserts import assert_gpu_result_equal
+
+
+@pytest.mark.parametrize("dtype", [pl.UInt32, pl.Int32, None])
+@pytest.mark.parametrize("empty", [False, True])
+def test_len(dtype, empty):
+    if empty:
+        df = pl.LazyFrame({})
+    else:
+        df = pl.LazyFrame({"a": [1, 2, 3]})
+
+    if dtype is None:
+        q = df.select(pl.len())
+    else:
+        q = df.select(pl.len().cast(dtype))
+
+    # Workaround for https://github.com/pola-rs/polars/issues/16904
+    assert_gpu_result_equal(q, collect_kwargs={"projection_pushdown": False})

From af09d3e60e4ac4c86602e4e47e58cdb47a02b22c Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Thu, 13 Jun 2024 08:58:46 +0100
Subject: [PATCH 352/842] Raise early on unhandled PythonScan node (#15992)

Add test of the behaviour.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/15992
---
 python/cudf_polars/cudf_polars/dsl/ir.py     |  4 ++++
 python/cudf_polars/tests/test_python_scan.py | 20 ++++++++++++++++++++
 2 files changed, 24 insertions(+)
 create mode 100644 python/cudf_polars/tests/test_python_scan.py

diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py
index 46241ab8e71..9fb2468e4e9 100644
--- a/python/cudf_polars/cudf_polars/dsl/ir.py
+++ b/python/cudf_polars/cudf_polars/dsl/ir.py
@@ -165,6 +165,10 @@ class PythonScan(IR):
     predicate: expr.NamedExpr | None
     """Filter to apply to the constructed dataframe before returning it."""
 
+    def __post_init__(self):
+        """Validate preconditions."""
+        raise NotImplementedError("PythonScan not implemented")
+
 
 @dataclasses.dataclass(slots=True)
 class Scan(IR):
diff --git a/python/cudf_polars/tests/test_python_scan.py b/python/cudf_polars/tests/test_python_scan.py
new file mode 100644
index 00000000000..c03474e3dc8
--- /dev/null
+++ b/python/cudf_polars/tests/test_python_scan.py
@@ -0,0 +1,20 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import pytest
+
+import polars as pl
+
+from cudf_polars import translate_ir
+
+
+def test_python_scan():
+    def source(with_columns, predicate, nrows):
+        return pl.DataFrame({"a": pl.Series([1, 2, 3], dtype=pl.Int8())})
+
+    q = pl.LazyFrame._scan_python_function({"a": pl.Int8}, source, pyarrow=False)
+    with pytest.raises(NotImplementedError):
+        _ = translate_ir(q._ldf.visit())
+
+    assert q.collect().equals(source(None, None, None))

From 246d017669cbeca3570106b4bb52a92f931ea2c1 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com>
Date: Thu, 13 Jun 2024 09:33:43 -0500
Subject: [PATCH 353/842] Plumb pylibcudf strings `contains_re` through
 cudf_polars (#15918)

This PR adds cudf-polars code for evaluating the `StringFunction.Contains` expression node.

Depends on https://github.com/rapidsai/cudf/pull/15880/

Authors:
  - https://github.com/brandon-b-miller
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/15918
---
 python/cudf_polars/cudf_polars/dsl/expr.py | 51 ++++++++++++++++++
 python/cudf_polars/tests/test_string.py    | 61 ++++++++++++++++++++++
 2 files changed, 112 insertions(+)
 create mode 100644 python/cudf_polars/tests/test_string.py

diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py
index 298ef5ab070..03c1db68dbd 100644
--- a/python/cudf_polars/cudf_polars/dsl/expr.py
+++ b/python/cudf_polars/cudf_polars/dsl/expr.py
@@ -644,13 +644,28 @@ def __init__(
         self.options = options
         self.name = name
         self.children = children
+        self._validate_input()
+
+    def _validate_input(self):
         if self.name not in (
             pl_expr.StringFunction.Lowercase,
             pl_expr.StringFunction.Uppercase,
             pl_expr.StringFunction.EndsWith,
             pl_expr.StringFunction.StartsWith,
+            pl_expr.StringFunction.Contains,
         ):
             raise NotImplementedError(f"String function {self.name}")
+        if self.name == pl_expr.StringFunction.Contains:
+            literal, strict = self.options
+            if not literal:
+                if not strict:
+                    raise NotImplementedError(
+                        "f{strict=} is not supported for regex contains"
+                    )
+                if not isinstance(self.children[1], Literal):
+                    raise NotImplementedError(
+                        "Regex contains only supports a scalar pattern"
+                    )
 
     def do_evaluate(
         self,
@@ -660,6 +675,26 @@ def do_evaluate(
         mapping: Mapping[Expr, Column] | None = None,
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
+        if self.name == pl_expr.StringFunction.Contains:
+            child, arg = self.children
+            column = child.evaluate(df, context=context, mapping=mapping)
+
+            literal, _ = self.options
+            if literal:
+                pat = arg.evaluate(df, context=context, mapping=mapping)
+                pattern = (
+                    pat.obj_scalar
+                    if pat.is_scalar and pat.obj.size() != column.obj.size()
+                    else pat.obj
+                )
+                return Column(plc.strings.find.contains(column.obj, pattern))
+            else:
+                assert isinstance(arg, Literal)
+                prog = plc.strings.regex_program.RegexProgram.create(
+                    arg.value.as_py(),
+                    flags=plc.strings.regex_flags.RegexFlags.DEFAULT,
+                )
+                return Column(plc.strings.contains.contains_re(column.obj, prog))
         columns = [
             child.evaluate(df, context=context, mapping=mapping)
             for child in self.children
@@ -691,6 +726,22 @@ def do_evaluate(
                 )
             )
         else:
+            columns = [
+                child.evaluate(df, context=context, mapping=mapping)
+                for child in self.children
+            ]
+            if self.name == pl_expr.StringFunction.Lowercase:
+                (column,) = columns
+                return Column(plc.strings.case.to_lower(column.obj))
+            elif self.name == pl_expr.StringFunction.Uppercase:
+                (column,) = columns
+                return Column(plc.strings.case.to_upper(column.obj))
+            elif self.name == pl_expr.StringFunction.EndsWith:
+                column, suffix = columns
+                return Column(plc.strings.find.ends_with(column.obj, suffix.obj))
+            elif self.name == pl_expr.StringFunction.StartsWith:
+                column, suffix = columns
+                return Column(plc.strings.find.starts_with(column.obj, suffix.obj))
             raise NotImplementedError(
                 f"StringFunction {self.name}"
             )  # pragma: no cover; handled by init raising
diff --git a/python/cudf_polars/tests/test_string.py b/python/cudf_polars/tests/test_string.py
new file mode 100644
index 00000000000..f1a080d040f
--- /dev/null
+++ b/python/cudf_polars/tests/test_string.py
@@ -0,0 +1,61 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+from functools import partial
+
+import pytest
+
+import polars as pl
+
+from cudf_polars.callback import execute_with_cudf
+from cudf_polars.testing.asserts import assert_gpu_result_equal
+
+
+@pytest.fixture
+def ldf():
+    return pl.DataFrame(
+        {"a": ["AbC", "de", "FGHI", "j", "kLm", "nOPq", None, "RsT", None, "uVw"]}
+    ).lazy()
+
+
+@pytest.mark.parametrize(
+    "substr",
+    [
+        "A",
+        "de",
+        ".*",
+        "^a",
+        "^A",
+        "[^a-z]",
+        "[a-z]{3,}",
+        "^[A-Z]{2,}",
+        "j|u",
+    ],
+)
+def test_contains_regex(ldf, substr):
+    query = ldf.select(pl.col("a").str.contains(substr))
+    assert_gpu_result_equal(query)
+
+
+@pytest.mark.parametrize(
+    "literal", ["A", "de", "FGHI", "j", "kLm", "nOPq", "RsT", "uVw"]
+)
+def test_contains_literal(ldf, literal):
+    query = ldf.select(pl.col("a").str.contains(pl.lit(literal), literal=True))
+    assert_gpu_result_equal(query)
+
+
+def test_contains_column(ldf):
+    query = ldf.select(pl.col("a").str.contains(pl.col("a"), literal=True))
+    assert_gpu_result_equal(query)
+
+
+@pytest.mark.parametrize("pat", ["["])
+def test_contains_invalid(ldf, pat):
+    query = ldf.select(pl.col("a").str.contains(pat))
+
+    with pytest.raises(pl.exceptions.ComputeError):
+        query.collect()
+    with pytest.raises(pl.exceptions.ComputeError):
+        query.collect(post_opt_callback=partial(execute_with_cudf, raise_on_fail=True))

From f651f12471edda51bf4c4071d74ff6720bd037fc Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Thu, 13 Jun 2024 16:05:44 +0100
Subject: [PATCH 354/842] Port start of datetime.hpp to pylibcudf (#15916)

Start exposing datetime extraction functions.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15916
---
 .../api_docs/pylibcudf/datetime.rst           |  6 ++++
 .../user_guide/api_docs/pylibcudf/index.rst   |  1 +
 .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt   |  1 +
 python/cudf/cudf/_lib/pylibcudf/__init__.pxd  |  4 ++-
 python/cudf/cudf/_lib/pylibcudf/__init__.py   |  4 ++-
 python/cudf/cudf/_lib/pylibcudf/datetime.pxd  |  8 +++++
 python/cudf/cudf/_lib/pylibcudf/datetime.pyx  | 33 +++++++++++++++++++
 .../_lib/pylibcudf/libcudf/CMakeLists.txt     |  2 +-
 python/cudf/cudf/pylibcudf_tests/conftest.py  |  5 +++
 .../cudf/pylibcudf_tests/test_datetime.py     | 30 +++++++++++++++++
 .../cudf/cudf/pylibcudf_tests/test_round.py   |  9 ++---
 11 files changed, 93 insertions(+), 10 deletions(-)
 create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/datetime.rst
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/datetime.pxd
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/datetime.pyx
 create mode 100644 python/cudf/cudf/pylibcudf_tests/test_datetime.py

diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/datetime.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/datetime.rst
new file mode 100644
index 00000000000..ebf5fab3052
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/datetime.rst
@@ -0,0 +1,6 @@
+=======
+copying
+=======
+
+.. automodule:: cudf._lib.pylibcudf.datetime
+   :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
index 1e03fa80bb5..f98298ff052 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
@@ -14,6 +14,7 @@ This page provides API documentation for pylibcudf.
     column_factories
     concatenate
     copying
+    datetime
     filling
     gpumemoryview
     groupby
diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
index ed396208f98..0a198f431a7 100644
--- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
@@ -19,6 +19,7 @@ set(cython_sources
     column_factories.pyx
     concatenate.pyx
     copying.pyx
+    datetime.pyx
     filling.pyx
     gpumemoryview.pyx
     groupby.pyx
diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd
index a628ecdb038..5131df9a5cd 100644
--- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd
@@ -7,6 +7,7 @@ from . cimport (
     column_factories,
     concatenate,
     copying,
+    datetime,
     filling,
     groupby,
     join,
@@ -40,9 +41,10 @@ __all__ = [
     "Table",
     "aggregation",
     "binaryop",
+    "column_factories",
     "concatenate",
     "copying",
-    "column_factories",
+    "datetime",
     "filling",
     "gpumemoryview",
     "groupby",
diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py
index 46d0fe13cd1..43a9e2aca31 100644
--- a/python/cudf/cudf/_lib/pylibcudf/__init__.py
+++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py
@@ -6,6 +6,7 @@
     column_factories,
     concatenate,
     copying,
+    datetime,
     filling,
     groupby,
     interop,
@@ -39,9 +40,10 @@
     "TypeId",
     "aggregation",
     "binaryop",
+    "column_factories",
     "concatenate",
     "copying",
-    "column_factories",
+    "datetime",
     "filling",
     "gpumemoryview",
     "groupby",
diff --git a/python/cudf/cudf/_lib/pylibcudf/datetime.pxd b/python/cudf/cudf/_lib/pylibcudf/datetime.pxd
new file mode 100644
index 00000000000..2fce48cf1b4
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/datetime.pxd
@@ -0,0 +1,8 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from .column cimport Column
+
+
+cpdef Column extract_year(
+    Column col
+)
diff --git a/python/cudf/cudf/_lib/pylibcudf/datetime.pyx b/python/cudf/cudf/_lib/pylibcudf/datetime.pyx
new file mode 100644
index 00000000000..82351327de6
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/datetime.pyx
@@ -0,0 +1,33 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.datetime cimport (
+    extract_year as cpp_extract_year,
+)
+
+from .column cimport Column
+
+
+cpdef Column extract_year(
+    Column values
+):
+    """
+    Extract the year from a datetime column.
+
+    Parameters
+    ----------
+    values : Column
+        The column to extract the year from.
+
+    Returns
+    -------
+    Column
+        Column with the extracted years.
+    """
+    cdef unique_ptr[column] result
+
+    with nogil:
+        result = move(cpp_extract_year(values.view()))
+    return Column.from_libcudf(move(result))
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt
index ac56d42dda8..6c66d01ca57 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt
@@ -12,7 +12,7 @@
 # the License.
 # =============================================================================
 
-set(cython_sources aggregation.pyx binaryop.pyx copying.pyx replace.pyx reduce.pxd round.pyx
+set(cython_sources aggregation.pyx binaryop.pyx copying.pyx reduce.pyx replace.pyx round.pyx
                    stream_compaction.pyx types.pyx unary.pyx
 )
 
diff --git a/python/cudf/cudf/pylibcudf_tests/conftest.py b/python/cudf/cudf/pylibcudf_tests/conftest.py
index f3c6584ef8c..b169bbdee5b 100644
--- a/python/cudf/cudf/pylibcudf_tests/conftest.py
+++ b/python/cudf/cudf/pylibcudf_tests/conftest.py
@@ -58,3 +58,8 @@ def interp_opt(request):
 )
 def sorted_opt(request):
     return request.param
+
+
+@pytest.fixture(scope="session", params=[False, True])
+def has_nulls(request):
+    return request.param
diff --git a/python/cudf/cudf/pylibcudf_tests/test_datetime.py b/python/cudf/cudf/pylibcudf_tests/test_datetime.py
new file mode 100644
index 00000000000..75af0fa6ca1
--- /dev/null
+++ b/python/cudf/cudf/pylibcudf_tests/test_datetime.py
@@ -0,0 +1,30 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import datetime
+
+import pyarrow as pa
+import pytest
+from utils import assert_column_eq
+
+import cudf._lib.pylibcudf as plc
+
+
+@pytest.fixture
+def column(has_nulls):
+    values = [
+        datetime.date(1999, 1, 1),
+        datetime.date(2024, 10, 12),
+        datetime.date(1, 1, 1),
+        datetime.date(9999, 1, 1),
+    ]
+    if has_nulls:
+        values[2] = None
+    return plc.interop.from_arrow(pa.array(values, type=pa.date32()))
+
+
+def test_extract_year(column):
+    got = plc.datetime.extract_year(column)
+    # libcudf produces an int16, arrow produces an int64
+    expect = pa.compute.year(plc.interop.to_arrow(column)).cast(pa.int16())
+
+    assert_column_eq(expect, got)
diff --git a/python/cudf/cudf/pylibcudf_tests/test_round.py b/python/cudf/cudf/pylibcudf_tests/test_round.py
index a234860477f..991e6ed310d 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_round.py
+++ b/python/cudf/cudf/pylibcudf_tests/test_round.py
@@ -7,16 +7,11 @@
 import cudf._lib.pylibcudf as plc
 
 
-@pytest.fixture(params=[False, True])
-def nullable(request):
-    return request.param
-
-
 @pytest.fixture(params=["float32", "float64"])
-def column(request, nullable):
+def column(request, has_nulls):
     values = [2.5, 2.49, 1.6, 8, -1.5, -1.7, -0.5, 0.5]
     typ = {"float32": pa.float32(), "float64": pa.float64()}[request.param]
-    if nullable:
+    if has_nulls:
         values[2] = None
     return plc.interop.from_arrow(pa.array(values, type=typ))
 

From cb564da1204f0da7eaeb8a0e636a0f23c97c314f Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 13 Jun 2024 05:11:37 -1000
Subject: [PATCH 355/842] Move some misc Frame methods to appropriate locations
 (#15963)

* Move `Frame._is_sorted` to `MultiIndex._is_sorted` (the only class that uses this method)
* Move `_apply_inverse_column` helper function to define `Column.__invert__`

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/15963
---
 python/cudf/cudf/core/column/column.py    |  5 ++
 python/cudf/cudf/core/column/numerical.py |  8 +++
 python/cudf/cudf/core/frame.py            | 61 +----------------------
 python/cudf/cudf/core/multiindex.py       | 49 +++++++++++++++++-
 4 files changed, 62 insertions(+), 61 deletions(-)

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 001e8996c19..75fc31ddbce 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -1118,6 +1118,11 @@ def __cuda_array_interface__(self) -> abc.Mapping[str, Any]:
     def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
         return _array_ufunc(self, ufunc, method, inputs, kwargs)
 
+    def __invert__(self):
+        raise TypeError(
+            f"Operation `~` not supported on {self.dtype.type.__name__}"
+        )
+
     def searchsorted(
         self,
         value,
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index 6fb4f17b76d..1952d7eeb71 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -194,6 +194,14 @@ def unary_operator(self, unaryop: Union[str, Callable]) -> ColumnBase:
         unaryop = pylibcudf.unary.UnaryOperator[unaryop]
         return libcudf.unary.unary_operation(self, unaryop)
 
+    def __invert__(self):
+        if self.dtype.kind in "ui":
+            return self.unary_operator("invert")
+        elif self.dtype.kind == "b":
+            return self.unary_operator("not")
+        else:
+            return super().__invert__()
+
     def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
         int_float_dtype_mapping = {
             np.int8: np.float32,
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index af8886a44a6..01b56f1edc4 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -32,7 +32,7 @@
 import cudf
 from cudf import _lib as libcudf
 from cudf._typing import Dtype
-from cudf.api.types import is_bool_dtype, is_dtype_equal, is_scalar
+from cudf.api.types import is_dtype_equal, is_scalar
 from cudf.core.buffer import acquire_spill_lock
 from cudf.core.column import (
     ColumnBase,
@@ -1455,51 +1455,6 @@ def _get_sorted_inds(
             stable=True,
         )
 
-    @_cudf_nvtx_annotate
-    def _is_sorted(self, ascending=None, null_position=None):
-        """
-        Returns a boolean indicating whether the data of the Frame are sorted
-        based on the parameters given. Does not account for the index.
-
-        Parameters
-        ----------
-        self : Frame
-            Frame whose columns are to be checked for sort order
-        ascending : None or list-like of booleans
-            None or list-like of boolean values indicating expected sort order
-            of each column. If list-like, size of list-like must be
-            len(columns). If None, all columns expected sort order is set to
-            ascending. False (0) - ascending, True (1) - descending.
-        null_position : None or list-like of booleans
-            None or list-like of boolean values indicating desired order of
-            nulls compared to other elements. If list-like, size of list-like
-            must be len(columns). If None, null order is set to before. False
-            (0) - before, True (1) - after.
-
-        Returns
-        -------
-        returns : boolean
-            Returns True, if sorted as expected by ``ascending`` and
-            ``null_position``, False otherwise.
-        """
-        if ascending is not None and not cudf.api.types.is_list_like(
-            ascending
-        ):
-            raise TypeError(
-                f"Expected a list-like or None for `ascending`, got "
-                f"{type(ascending)}"
-            )
-        if null_position is not None and not cudf.api.types.is_list_like(
-            null_position
-        ):
-            raise TypeError(
-                f"Expected a list-like or None for `null_position`, got "
-                f"{type(null_position)}"
-            )
-        return libcudf.sort.is_sorted(
-            [*self._columns], ascending=ascending, null_position=null_position
-        )
-
     @_cudf_nvtx_annotate
     def _split(self, splits):
         """Split a frame with split points in ``splits``. Returns a list of
@@ -1920,7 +1875,7 @@ def __invert__(self):
         """Bitwise invert (~) for integral dtypes, logical NOT for bools."""
         return self._from_data_like_self(
             self._data._from_columns_like_self(
-                (_apply_inverse_column(col) for col in self._data.columns)
+                (~col for col in self._data.columns)
             )
         )
 
@@ -1970,15 +1925,3 @@ def __dask_tokenize__(self):
             str(dict(self._dtypes)),
             normalize_token(self.to_pandas()),
         ]
-
-
-def _apply_inverse_column(col: ColumnBase) -> ColumnBase:
-    """Bitwise invert (~) for integral dtypes, logical NOT for bools."""
-    if np.issubdtype(col.dtype, np.integer):
-        return col.unary_operator("invert")
-    elif is_bool_dtype(col.dtype):
-        return col.unary_operator("not")
-    else:
-        raise TypeError(
-            f"Operation `~` not supported on {col.dtype.type.__name__}"
-        )
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index 11b4b9154a2..6d3520e33cf 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -1636,9 +1636,54 @@ def is_unique(self):
     def dtype(self):
         return np.dtype("O")
 
+    @_cudf_nvtx_annotate
+    def _is_sorted(self, ascending=None, null_position=None) -> bool:
+        """
+        Returns a boolean indicating whether the data of the MultiIndex are sorted
+        based on the parameters given. Does not account for the index.
+
+        Parameters
+        ----------
+        self : MultiIndex
+            MultiIndex whose columns are to be checked for sort order
+        ascending : None or list-like of booleans
+            None or list-like of boolean values indicating expected sort order
+            of each column. If list-like, size of list-like must be
+            len(columns). If None, all columns expected sort order is set to
+            ascending. False (0) - ascending, True (1) - descending.
+        null_position : None or list-like of booleans
+            None or list-like of boolean values indicating desired order of
+            nulls compared to other elements. If list-like, size of list-like
+            must be len(columns). If None, null order is set to before. False
+            (0) - before, True (1) - after.
+
+        Returns
+        -------
+        returns : boolean
+            Returns True, if sorted as expected by ``ascending`` and
+            ``null_position``, False otherwise.
+        """
+        if ascending is not None and not cudf.api.types.is_list_like(
+            ascending
+        ):
+            raise TypeError(
+                f"Expected a list-like or None for `ascending`, got "
+                f"{type(ascending)}"
+            )
+        if null_position is not None and not cudf.api.types.is_list_like(
+            null_position
+        ):
+            raise TypeError(
+                f"Expected a list-like or None for `null_position`, got "
+                f"{type(null_position)}"
+            )
+        return libcudf.sort.is_sorted(
+            [*self._columns], ascending=ascending, null_position=null_position
+        )
+
     @cached_property  # type: ignore
     @_cudf_nvtx_annotate
-    def is_monotonic_increasing(self):
+    def is_monotonic_increasing(self) -> bool:
         """
         Return if the index is monotonic increasing
         (only equal or increasing) values.
@@ -1647,7 +1692,7 @@ def is_monotonic_increasing(self):
 
     @cached_property  # type: ignore
     @_cudf_nvtx_annotate
-    def is_monotonic_decreasing(self):
+    def is_monotonic_decreasing(self) -> bool:
         """
         Return if the index is monotonic decreasing
         (only equal or decreasing) values.

From 3cb3df3255efaec4a5ebb6cb7606067f753e3554 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Thu, 13 Jun 2024 11:54:55 -0500
Subject: [PATCH 356/842] Add ability to enable rmm pool on `cudf.pandas`
 import (#15628)

This PR enables allocating of rmm memory pool on `cudf.pandas` import using the following environment variables:

```
export CUDF_PANDAS_RMM_MODE="pool"
```

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Mark Harris (https://github.com/harrism)
  - Mads R. B. Kristensen (https://github.com/madsbk)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/15628
---
 python/cudf/cudf/pandas/__init__.py           | 43 +++++++++++++++++++
 .../cudf_pandas_tests/test_cudf_pandas.py     | 28 ++++++++++++
 2 files changed, 71 insertions(+)

diff --git a/python/cudf/cudf/pandas/__init__.py b/python/cudf/cudf/pandas/__init__.py
index 5b3785531d3..59a88f85dda 100644
--- a/python/cudf/cudf/pandas/__init__.py
+++ b/python/cudf/cudf/pandas/__init__.py
@@ -2,6 +2,9 @@
 # All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
+
+import warnings
+
 from .fast_slow_proxy import is_proxy_object
 from .magics import load_ipython_extension
 from .profiler import Profiler
@@ -19,6 +22,46 @@ def install():
     loader = ModuleAccelerator.install("pandas", "cudf", "pandas")
     global LOADED
     LOADED = loader is not None
+    import os
+
+    if (rmm_mode := os.getenv("CUDF_PANDAS_RMM_MODE", None)) is not None:
+        import rmm.mr
+        from rmm.mr import available_device_memory
+
+        # Check if a non-default memory resource is set
+        current_mr = rmm.mr.get_current_device_resource()
+        if not isinstance(current_mr, rmm.mr.CudaMemoryResource):
+            warnings.warn(
+                f"cudf.pandas detected an already configured memory resource, ignoring 'CUDF_PANDAS_RMM_MODE'={str(rmm_mode)}",
+                UserWarning,
+            )
+        free_memory, _ = available_device_memory()
+        free_memory = int(round(float(free_memory) * 0.80 / 256) * 256)
+
+        if rmm_mode == "cuda":
+            mr = rmm.mr.CudaMemoryResource()
+            rmm.mr.set_current_device_resource(mr)
+        elif rmm_mode == "pool":
+            rmm.mr.set_current_device_resource(
+                rmm.mr.PoolMemoryResource(
+                    rmm.mr.get_current_device_resource(),
+                    initial_pool_size=free_memory,
+                )
+            )
+        elif rmm_mode == "async":
+            mr = rmm.mr.CudaAsyncMemoryResource(initial_pool_size=free_memory)
+            rmm.mr.set_current_device_resource(mr)
+        elif rmm_mode == "managed":
+            mr = rmm.mr.ManagedMemoryResource()
+            rmm.mr.set_current_device_resource(mr)
+        elif rmm_mode == "managed_pool":
+            rmm.reinitialize(
+                managed_memory=True,
+                pool_allocator=True,
+                initial_pool_size=free_memory,
+            )
+        else:
+            raise TypeError(f"Unsupported rmm mode: {rmm_mode}")
 
 
 def pytest_load_initial_conftests(early_config, parser, args):
diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
index 515a4714a5a..c251e4a197e 100644
--- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
+++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
@@ -9,6 +9,7 @@
 import os
 import pathlib
 import pickle
+import subprocess
 import tempfile
 import types
 from io import BytesIO, StringIO
@@ -1425,6 +1426,33 @@ def test_holidays_within_dates(holiday, start, expected):
     ) == [utc.localize(dt) for dt in expected]
 
 
+@pytest.mark.parametrize(
+    "env_value",
+    ["", "cuda", "pool", "async", "managed", "managed_pool", "abc"],
+)
+def test_rmm_option_on_import(env_value):
+    data_directory = os.path.dirname(os.path.abspath(__file__))
+    # Create a copy of the current environment variables
+    env = os.environ.copy()
+    env["CUDF_PANDAS_RMM_MODE"] = env_value
+
+    sp_completed = subprocess.run(
+        [
+            "python",
+            "-m",
+            "cudf.pandas",
+            data_directory + "/data/profile_basic.py",
+        ],
+        capture_output=True,
+        text=True,
+        env=env,
+    )
+    if env_value in {"cuda", "pool", "async", "managed", "managed_pool"}:
+        assert sp_completed.returncode == 0
+    else:
+        assert sp_completed.returncode == 1
+
+
 def test_cudf_pandas_debugging_different_results(monkeypatch):
     cudf_mean = cudf.Series.mean
 

From 3f8f2149129f97947223611e2709d235e889389b Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Thu, 13 Jun 2024 17:04:45 -0500
Subject: [PATCH 357/842] Refactor rmm usage in `cudf.pandas` (#16021)

This PR addresses review comments made by @bdice here: https://github.com/rapidsai/cudf/pull/15628#pullrequestreview-2116067037

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16021
---
 python/cudf/cudf/pandas/__init__.py | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/python/cudf/cudf/pandas/__init__.py b/python/cudf/cudf/pandas/__init__.py
index 59a88f85dda..ff445a63f74 100644
--- a/python/cudf/cudf/pandas/__init__.py
+++ b/python/cudf/cudf/pandas/__init__.py
@@ -2,9 +2,11 @@
 # All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
-
+import os
 import warnings
 
+import rmm.mr
+
 from .fast_slow_proxy import is_proxy_object
 from .magics import load_ipython_extension
 from .profiler import Profiler
@@ -22,12 +24,8 @@ def install():
     loader = ModuleAccelerator.install("pandas", "cudf", "pandas")
     global LOADED
     LOADED = loader is not None
-    import os
 
     if (rmm_mode := os.getenv("CUDF_PANDAS_RMM_MODE", None)) is not None:
-        import rmm.mr
-        from rmm.mr import available_device_memory
-
         # Check if a non-default memory resource is set
         current_mr = rmm.mr.get_current_device_resource()
         if not isinstance(current_mr, rmm.mr.CudaMemoryResource):
@@ -35,7 +33,7 @@ def install():
                 f"cudf.pandas detected an already configured memory resource, ignoring 'CUDF_PANDAS_RMM_MODE'={str(rmm_mode)}",
                 UserWarning,
             )
-        free_memory, _ = available_device_memory()
+        free_memory, _ = rmm.mr.available_device_memory()
         free_memory = int(round(float(free_memory) * 0.80 / 256) * 256)
 
         if rmm_mode == "cuda":
@@ -55,13 +53,13 @@ def install():
             mr = rmm.mr.ManagedMemoryResource()
             rmm.mr.set_current_device_resource(mr)
         elif rmm_mode == "managed_pool":
-            rmm.reinitialize(
-                managed_memory=True,
-                pool_allocator=True,
+            mr = rmm.mr.PoolMemoryResource(
+                rmm.mr.ManagedMemoryResource(),
                 initial_pool_size=free_memory,
             )
+            rmm.mr.set_current_device_resource(mr)
         else:
-            raise TypeError(f"Unsupported rmm mode: {rmm_mode}")
+            raise ValueError(f"Unsupported rmm mode: {rmm_mode}")
 
 
 def pytest_load_initial_conftests(early_config, parser, args):

From 31d909b0af9bcf9cf804ca1c3893ea71fbd5d765 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 13 Jun 2024 13:27:05 -1000
Subject: [PATCH 358/842] Support IntervalDtype in cudf.from_pandas (#16014)

Noticed while running the pandas test suite against `cudf.pandas`

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Lawrence Mitchell (https://github.com/wence-)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16014
---
 python/cudf/cudf/core/dataframe.py      | 6 +++---
 python/cudf/cudf/tests/test_interval.py | 7 +++++++
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index e1b6cc45dd3..7438b0237d5 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -8072,11 +8072,11 @@ def from_pandas(obj, nan_as_null=no_default):
         return cudf.Index.from_pandas(obj, nan_as_null=nan_as_null)
     elif isinstance(obj, pd.CategoricalDtype):
         return cudf.CategoricalDtype.from_pandas(obj)
+    elif isinstance(obj, pd.IntervalDtype):
+        return cudf.IntervalDtype.from_pandas(obj)
     else:
         raise TypeError(
-            "from_pandas only accepts Pandas Dataframes, Series, "
-            "Index, RangeIndex and MultiIndex objects. "
-            "Got %s" % type(obj)
+            f"from_pandas unsupported for object of type {type(obj).__name__}"
         )
 
 
diff --git a/python/cudf/cudf/tests/test_interval.py b/python/cudf/cudf/tests/test_interval.py
index 7b923af1f75..013f4439ad5 100644
--- a/python/cudf/cudf/tests/test_interval.py
+++ b/python/cudf/cudf/tests/test_interval.py
@@ -181,3 +181,10 @@ def test_interval_with_datetime(tz, box):
     else:
         with pytest.raises(NotImplementedError):
             cudf.from_pandas(pobj)
+
+
+def test_from_pandas_intervaldtype():
+    dtype = pd.IntervalDtype("int64", closed="left")
+    result = cudf.from_pandas(dtype)
+    expected = cudf.IntervalDtype("int64", closed="left")
+    assert_eq(result, expected)

From 987879ca4bdcae0d959266fd39196123007fa45e Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Thu, 13 Jun 2024 19:27:11 -0700
Subject: [PATCH 359/842] Fix the pool size alignment issue (#16024)

This PR fixes a pool size alignment bug.

Authors:
  - Yunsong Wang (https://github.com/PointKernel)

Approvers:
  - Mark Harris (https://github.com/harrism)
  - Vukasin Milovanovic (https://github.com/vuule)
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/16024
---
 cpp/src/utilities/pinned_memory.cpp | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/cpp/src/utilities/pinned_memory.cpp b/cpp/src/utilities/pinned_memory.cpp
index 5d2e3ac332a..e90b7969b4d 100644
--- a/cpp/src/utilities/pinned_memory.cpp
+++ b/cpp/src/utilities/pinned_memory.cpp
@@ -43,9 +43,11 @@ class fixed_pinned_pool_memory_resource {
 
  public:
   fixed_pinned_pool_memory_resource(size_t size)
-    : pool_size_{size}, pool_{new host_pooled_mr(upstream_mr_, size, size)}
+    :  // rmm requires the pool size to be a multiple of 256 bytes
+      pool_size_{rmm::align_up(size, rmm::CUDA_ALLOCATION_ALIGNMENT)},
+      pool_{new host_pooled_mr(upstream_mr_, pool_size_, pool_size_)}
   {
-    if (pool_size_ == 0) { return; }
+    CUDF_LOG_INFO("Pinned pool size = {}", pool_size_);
 
     // Allocate full size from the pinned pool to figure out the beginning and end address
     pool_begin_ = pool_->allocate_async(pool_size_, stream_);
@@ -145,12 +147,8 @@ CUDF_EXPORT rmm::host_device_async_resource_ref& make_default_pinned_mr(
       return std::min(total / 200, size_t{100} * 1024 * 1024);
     }();
 
-    // rmm requires the pool size to be a multiple of 256 bytes
-    auto const aligned_size = rmm::align_up(size, rmm::RMM_DEFAULT_HOST_ALIGNMENT);
-    CUDF_LOG_INFO("Pinned pool size = {}", aligned_size);
-
     // make the pool with max size equal to the initial size
-    return fixed_pinned_pool_memory_resource{aligned_size};
+    return fixed_pinned_pool_memory_resource{size};
   }();
 
   static rmm::host_device_async_resource_ref mr_ref{mr};

From 829b3a959cc5f0d41fe51dca9a4335dba0da69a5 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Thu, 13 Jun 2024 20:40:56 -0700
Subject: [PATCH 360/842] Fix the int32 overflow when computing page fragment
 sizes for large string columns (#16028)

This PR fixes the possible `int32` overflow when computing page fragment sizes for large (2B+ char) string columns.

Authors:
  - Muhammad Haseeb (https://github.com/mhaseeb123)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/16028
---
 cpp/src/io/parquet/writer_impl.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index 6d466748c17..ca15b532d07 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -1763,10 +1763,10 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
     // for multiple fragments per page to smooth things out. using 2 was too
     // unbalanced in final page sizes, so using 4 which seems to be a good
     // compromise at smoothing things out without getting fragment sizes too small.
-    auto frag_size_fn = [&](auto const& col, size_type col_size) {
+    auto frag_size_fn = [&](auto const& col, size_t col_size) {
       int const target_frags_per_page = is_col_fixed_width(col) ? 1 : 4;
       auto const avg_len =
-        target_frags_per_page * util::div_rounding_up_safe<size_type>(col_size, input.num_rows());
+        target_frags_per_page * util::div_rounding_up_safe<size_t>(col_size, input.num_rows());
       if (avg_len > 0) {
         auto const frag_size = util::div_rounding_up_safe<size_type>(max_page_size_bytes, avg_len);
         return std::min<size_type>(max_page_fragment_size, frag_size);

From 34227d3cb687d465f1d4a5f12cbb37a47b97866e Mon Sep 17 00:00:00 2001
From: Zach Puller <zach.puller@gmail.com>
Date: Thu, 13 Jun 2024 23:45:35 -0700
Subject: [PATCH 361/842] orc multithreaded benchmark (#16009)

Addresses: https://github.com/rapidsai/cudf/issues/15973

Adds multithreaded benchmarks for the ORC reader. Based off of the parquet equivalent in https://github.com/rapidsai/cudf/pull/15585

```
# Benchmark Results

## orc_multithreaded_read_decode_mixed

### [0] NVIDIA RTX 5880 Ada Generation

| cardinality | total_data_size | num_threads | num_cols | run_length | Samples | CPU Time  | Noise | GPU Time  | Noise | bytes_per_second | peak_memory_usage | encoded_file_size |
|-------------|-----------------|-------------|----------|------------|---------|-----------|-------|-----------|-------|------------------|-------------------|-------------------|
|        1000 |       536870912 |           1 |        4 |          8 |    338x | 44.348 ms | 1.18% | 44.343 ms | 1.18% |      12107185968 |       939.341 MiB |        39.557 MiB |
|        1000 |      1073741824 |           1 |        4 |          8 |     80x | 77.634 ms | 0.65% | 77.629 ms | 0.65% |      13831742649 |         1.834 GiB |        79.072 MiB |
|        1000 |       536870912 |           2 |        4 |          8 |    341x | 43.921 ms | 1.20% | 43.916 ms | 1.20% |      12224889363 |       825.333 MiB |        39.568 MiB |
|        1000 |      1073741824 |           2 |        4 |          8 |     80x | 75.418 ms | 0.70% | 75.414 ms | 0.70% |      14237999015 |         1.611 GiB |        79.113 MiB |
|        1000 |       536870912 |           4 |        4 |          8 |     80x | 42.682 ms | 1.18% | 42.678 ms | 1.18% |      12579566132 |       883.436 MiB |        39.587 MiB |
|        1000 |      1073741824 |           4 |        4 |          8 |      9x | 74.056 ms | 0.48% | 74.052 ms | 0.48% |      14499873867 |         1.724 GiB |        79.136 MiB |
|        1000 |       536870912 |           8 |        4 |          8 |     25x | 42.198 ms | 0.50% | 42.194 ms | 0.49% |      12723960975 |       940.562 MiB |        39.600 MiB |
|        1000 |      1073741824 |           8 |        4 |          8 |      8x | 73.933 ms | 0.49% | 73.929 ms | 0.49% |      14524042443 |         1.781 GiB |        79.175 MiB |

## orc_multithreaded_read_decode_fixed_width

### [0] NVIDIA RTX 5880 Ada Generation

| cardinality | total_data_size | num_threads | num_cols | run_length | Samples | CPU Time  | Noise | GPU Time  | Noise | bytes_per_second | peak_memory_usage | encoded_file_size |
|-------------|-----------------|-------------|----------|------------|---------|-----------|-------|-----------|-------|------------------|-------------------|-------------------|
|        1000 |       536870912 |           1 |        4 |          8 |     13x | 40.149 ms | 0.04% | 40.144 ms | 0.04% |      13373482726 |       643.390 MiB |        59.821 MiB |
|        1000 |      1073741824 |           1 |        4 |          8 |    211x | 71.216 ms | 0.67% | 71.211 ms | 0.67% |      15078297784 |         1.257 GiB |       119.650 MiB |
|        1000 |       536870912 |           2 |        4 |          8 |    378x | 39.662 ms | 1.31% | 39.658 ms | 1.31% |      13537590893 |       643.392 MiB |        59.833 MiB |
|        1000 |      1073741824 |           2 |        4 |          8 |    209x | 71.693 ms | 0.71% | 71.688 ms | 0.71% |      14978085376 |         1.257 GiB |       119.642 MiB |
|        1000 |       536870912 |           4 |        4 |          8 |    377x | 39.731 ms | 1.30% | 39.726 ms | 1.30% |      13514305239 |       643.394 MiB |        59.856 MiB |
|        1000 |      1073741824 |           4 |        4 |          8 |      8x | 70.766 ms | 0.08% | 70.761 ms | 0.08% |      15174115364 |         1.030 GiB |       119.665 MiB |
|        1000 |       536870912 |           8 |        4 |          8 |    379x | 39.486 ms | 1.27% | 39.482 ms | 1.27% |      13597888468 |       647.399 MiB |        59.928 MiB |
|        1000 |      1073741824 |           8 |        4 |          8 |    207x | 72.686 ms | 2.04% | 72.681 ms | 2.04% |      14773317833 |         1.143 GiB |       119.711 MiB |

## orc_multithreaded_read_decode_string

### [0] NVIDIA RTX 5880 Ada Generation

| cardinality | total_data_size | num_threads | num_cols | run_length | Samples | CPU Time  | Noise | GPU Time  | Noise | bytes_per_second | peak_memory_usage | encoded_file_size |
|-------------|-----------------|-------------|----------|------------|---------|-----------|-------|-----------|-------|------------------|-------------------|-------------------|
|        1000 |       536870912 |           1 |        4 |          8 |     80x | 22.933 ms | 2.13% | 22.928 ms | 2.13% |      23415352877 |       661.948 MiB |        10.879 MiB |
|        1000 |      1073741824 |           1 |        4 |          8 |    160x | 34.167 ms | 1.41% | 34.162 ms | 1.41% |      31430436877 |         1.293 GiB |        21.757 MiB |
|        1000 |       536870912 |           2 |        4 |          8 |    560x | 22.533 ms | 2.18% | 22.528 ms | 2.18% |      23830839172 |       609.407 MiB |        10.941 MiB |
|        1000 |      1073741824 |           2 |        4 |          8 |     80x | 34.311 ms | 1.54% | 34.307 ms | 1.54% |      31298288990 |         1.188 GiB |        21.758 MiB |
|        1000 |       536870912 |           4 |        4 |          8 |     23x | 22.179 ms | 0.11% | 22.175 ms | 0.11% |      24211151047 |       624.177 MiB |        10.947 MiB |
|        1000 |      1073741824 |           4 |        4 |          8 |     15x | 33.793 ms | 0.08% | 33.789 ms | 0.08% |      31777989791 |         1.190 GiB |        21.881 MiB |
|        1000 |       536870912 |           8 |        4 |          8 |    679x | 22.006 ms | 1.74% | 22.002 ms | 1.74% |      24401381631 |       624.524 MiB |        10.951 MiB |
|        1000 |      1073741824 |           8 |        4 |          8 |    160x | 33.320 ms | 1.57% | 33.316 ms | 1.57% |      32229227026 |         1.207 GiB |        21.894 MiB |

## orc_multithreaded_read_decode_list

### [0] NVIDIA RTX 5880 Ada Generation

| cardinality | total_data_size | num_threads | num_cols | run_length | Samples |  CPU Time  | Noise  |  GPU Time  | Noise  | bytes_per_second | peak_memory_usage | encoded_file_size |
|-------------|-----------------|-------------|----------|------------|---------|------------|--------|------------|--------|------------------|-------------------|-------------------|
|        1000 |       536870912 |           1 |        4 |          8 |     96x |  74.437 ms |  0.68% |  74.433 ms |  0.68% |       7212831148 |       600.751 MiB |        60.245 MiB |
|        1000 |      1073741824 |           1 |        4 |          8 |      7x |  80.994 ms |  0.49% |  80.990 ms |  0.49% |      13257745936 |         1.173 GiB |       120.549 MiB |
|        1000 |       536870912 |           2 |        4 |          8 |     80x |  79.234 ms |  4.57% |  79.229 ms |  4.57% |       6776190522 |       600.950 MiB |        60.250 MiB |
|        1000 |      1073741824 |           2 |        4 |          8 |    166x |  90.437 ms | 17.19% |  90.432 ms | 17.19% |      11873413959 |         1.173 GiB |       120.489 MiB |
|        1000 |       536870912 |           4 |        4 |          8 |     80x |  78.613 ms |  2.98% |  78.608 ms |  2.98% |       6829702014 |       602.764 MiB |        60.323 MiB |
|        1000 |      1073741824 |           4 |        4 |          8 |    127x | 118.629 ms | 22.67% | 118.624 ms | 22.67% |       9051644873 |         1.174 GiB |       120.499 MiB |
|        1000 |       536870912 |           8 |        4 |          8 |    112x | 133.950 ms |  4.45% | 133.945 ms |  4.45% |       4008135293 |       603.471 MiB |        60.353 MiB |
|        1000 |      1073741824 |           8 |        4 |          8 |     90x | 167.850 ms | 15.93% | 167.844 ms | 15.93% |       6397248426 |         1.177 GiB |       120.646 MiB |

## orc_multithreaded_read_decode_chunked_mixed

### [0] NVIDIA RTX 5880 Ada Generation

| cardinality | total_data_size | num_threads | num_cols | run_length | input_limit | output_limit | Samples | CPU Time  | Noise | GPU Time  | Noise | bytes_per_second | peak_memory_usage | encoded_file_size |
|-------------|-----------------|-------------|----------|------------|-------------|--------------|---------|-----------|-------|-----------|-------|------------------|-------------------|-------------------|
|        1000 |       536870912 |           1 |        4 |          8 |   671088640 |    671088640 |    333x | 45.009 ms | 1.10% | 45.005 ms | 1.10% |      11929261073 |       939.341 MiB |        39.557 MiB |
|        1000 |      1073741824 |           1 |        4 |          8 |   671088640 |    671088640 |     96x | 81.524 ms | 0.61% | 81.519 ms | 0.61% |      13171640865 |         1.834 GiB |        79.072 MiB |
|        1000 |       536870912 |           2 |        4 |          8 |   671088640 |    671088640 |    339x | 44.183 ms | 0.96% | 44.179 ms | 0.96% |      12152252271 |       825.333 MiB |        39.568 MiB |
|        1000 |      1073741824 |           2 |        4 |          8 |   671088640 |    671088640 |      7x | 79.051 ms | 0.02% | 79.046 ms | 0.02% |      13583676002 |         1.611 GiB |        79.113 MiB |
|        1000 |       536870912 |           4 |        4 |          8 |   671088640 |    671088640 |     12x | 43.276 ms | 0.09% | 43.272 ms | 0.09% |      12407024794 |       883.436 MiB |        39.587 MiB |
|        1000 |      1073741824 |           4 |        4 |          8 |   671088640 |    671088640 |     19x | 78.019 ms | 0.49% | 78.014 ms | 0.49% |      13763433041 |         1.724 GiB |        79.136 MiB |
|        1000 |       536870912 |           8 |        4 |          8 |   671088640 |    671088640 |     80x | 42.803 ms | 1.22% | 42.799 ms | 1.22% |      12543864010 |       911.993 MiB |        39.600 MiB |
|        1000 |      1073741824 |           8 |        4 |          8 |   671088640 |    671088640 |    193x | 77.856 ms | 0.59% | 77.852 ms | 0.59% |      13792063986 |         1.837 GiB |        79.175 MiB |

## orc_multithreaded_read_decode_chunked_fixed_width

### [0] NVIDIA RTX 5880 Ada Generation

| cardinality | total_data_size | num_threads | num_cols | run_length | input_limit | output_limit | Samples | CPU Time  | Noise | GPU Time  | Noise | bytes_per_second | peak_memory_usage | encoded_file_size |
|-------------|-----------------|-------------|----------|------------|-------------|--------------|---------|-----------|-------|-----------|-------|------------------|-------------------|-------------------|
|        1000 |       536870912 |           1 |        4 |          8 |   671088640 |    671088640 |    112x | 40.497 ms | 1.23% | 40.493 ms | 1.23% |      13258480947 |       643.390 MiB |        59.821 MiB |
|        1000 |      1073741824 |           1 |        4 |          8 |   671088640 |    671088640 |      7x | 75.440 ms | 0.09% | 75.435 ms | 0.09% |      14234033611 |         1.648 GiB |       119.651 MiB |
|        1000 |       536870912 |           2 |        4 |          8 |   671088640 |    671088640 |     80x | 39.793 ms | 1.36% | 39.789 ms | 1.36% |      13493067216 |       643.392 MiB |        59.833 MiB |
|        1000 |      1073741824 |           2 |        4 |          8 |   671088640 |    671088640 |     69x | 74.499 ms | 0.50% | 74.494 ms | 0.50% |      14413864845 |         1.336 GiB |       119.642 MiB |
|        1000 |       536870912 |           4 |        4 |          8 |   671088640 |    671088640 |    381x | 39.273 ms | 1.11% | 39.269 ms | 1.11% |      13671742653 |       643.394 MiB |        59.856 MiB |
|        1000 |      1073741824 |           4 |        4 |          8 |   671088640 |    671088640 |    204x | 73.755 ms | 0.60% | 73.751 ms | 0.60% |      14559012350 |         1.648 GiB |       119.665 MiB |
|        1000 |       536870912 |           8 |        4 |          8 |   671088640 |    671088640 |     80x | 39.490 ms | 1.31% | 39.486 ms | 1.31% |      13596333864 |       631.980 MiB |        59.928 MiB |
|        1000 |      1073741824 |           8 |        4 |          8 |   671088640 |    671088640 |    203x | 73.907 ms | 1.34% | 73.903 ms | 1.34% |      14529071322 |         1.454 GiB |       119.711 MiB |

## orc_multithreaded_read_decode_chunked_string

### [0] NVIDIA RTX 5880 Ada Generation

| cardinality | total_data_size | num_threads | num_cols | run_length | input_limit | output_limit | Samples | CPU Time  | Noise | GPU Time  | Noise | bytes_per_second | peak_memory_usage | encoded_file_size |
|-------------|-----------------|-------------|----------|------------|-------------|--------------|---------|-----------|-------|-----------|-------|------------------|-------------------|-------------------|
|        1000 |       536870912 |           1 |        4 |          8 |   671088640 |    671088640 |     80x | 23.022 ms | 1.96% | 23.017 ms | 1.96% |      23324556592 |       661.948 MiB |        10.879 MiB |
|        1000 |      1073741824 |           1 |        4 |          8 |   671088640 |    671088640 |     80x | 37.687 ms | 1.37% | 37.682 ms | 1.37% |      28494755419 |         1.659 GiB |        21.757 MiB |
|        1000 |       536870912 |           2 |        4 |          8 |   671088640 |    671088640 |     80x | 22.703 ms | 2.30% | 22.699 ms | 2.30% |      23652118769 |       609.407 MiB |        10.941 MiB |
|        1000 |      1073741824 |           2 |        4 |          8 |   671088640 |    671088640 |     80x | 37.581 ms | 1.42% | 37.577 ms | 1.42% |      28574723179 |         1.658 GiB |        21.758 MiB |
|        1000 |       536870912 |           4 |        4 |          8 |   671088640 |    671088640 |    544x | 22.296 ms | 1.56% | 22.293 ms | 1.56% |      24082840350 |       631.319 MiB |        10.947 MiB |
|        1000 |      1073741824 |           4 |        4 |          8 |   671088640 |    671088640 |     14x | 36.990 ms | 0.14% | 36.985 ms | 0.14% |      29031484389 |         1.554 GiB |        21.881 MiB |
|        1000 |       536870912 |           8 |        4 |          8 |   671088640 |    671088640 |    676x | 22.114 ms | 1.22% | 22.110 ms | 1.22% |      24281965280 |       627.616 MiB |        10.951 MiB |
|        1000 |      1073741824 |           8 |        4 |          8 |   671088640 |    671088640 |     80x | 37.409 ms | 1.40% | 37.405 ms | 1.40% |      28706077426 |         1.562 GiB |        21.894 MiB |

## orc_multithreaded_read_decode_chunked_list

### [0] NVIDIA RTX 5880 Ada Generation

| cardinality | total_data_size | num_threads | num_cols | run_length | input_limit | output_limit | Samples |  CPU Time  | Noise  |  GPU Time  | Noise  | bytes_per_second | peak_memory_usage | encoded_file_size |
|-------------|-----------------|-------------|----------|------------|-------------|--------------|---------|------------|--------|------------|--------|------------------|-------------------|-------------------|
|        1000 |       536870912 |           1 |        4 |          8 |   671088640 |    671088640 |     80x |  74.780 ms |  0.67% |  74.776 ms |  0.67% |       7179747067 |       600.751 MiB |        60.245 MiB |
|        1000 |      1073741824 |           1 |        4 |          8 |   671088640 |    671088640 |    175x |  86.040 ms |  0.56% |  86.035 ms |  0.56% |      12480222210 |         1.576 GiB |       120.549 MiB |
|        1000 |       536870912 |           2 |        4 |          8 |   671088640 |    671088640 |    186x |  80.668 ms |  4.14% |  80.664 ms |  4.14% |       6655685080 |       600.951 MiB |        60.250 MiB |
|        1000 |      1073741824 |           2 |        4 |          8 |   671088640 |    671088640 |    143x | 105.217 ms | 21.56% | 105.212 ms | 21.56% |      10205531345 |         1.576 GiB |       120.489 MiB |
|        1000 |       536870912 |           4 |        4 |          8 |   671088640 |    671088640 |    128x |  80.087 ms |  3.05% |  80.082 ms |  3.05% |       6704042147 |       602.764 MiB |        60.323 MiB |
|        1000 |      1073741824 |           4 |        4 |          8 |   671088640 |    671088640 |    135x | 111.556 ms | 21.88% | 111.551 ms | 21.88% |       9625546746 |         1.489 GiB |       120.499 MiB |
|        1000 |       536870912 |           8 |        4 |          8 |   671088640 |    671088640 |    112x | 134.677 ms |  4.14% | 134.672 ms |  4.14% |       3986513604 |       603.471 MiB |        60.353 MiB |
|        1000 |      1073741824 |           8 |        4 |          8 |   671088640 |    671088640 |     80x | 178.735 ms | 14.17% | 178.730 ms | 14.17% |       6007630497 |         1.520 GiB |       120.646 MiB |

```

Authors:
  - Zach Puller (https://github.com/zpuller)
  - Vukasin Milovanovic (https://github.com/vuule)
  - MithunR (https://github.com/mythrocks)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - MithunR (https://github.com/mythrocks)

URL: https://github.com/rapidsai/cudf/pull/16009
---
 cpp/benchmarks/CMakeLists.txt                 |   5 +
 .../io/orc/orc_reader_multithreaded.cpp       | 335 ++++++++++++++++++
 2 files changed, 340 insertions(+)
 create mode 100644 cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 49504e53424..8a48126e195 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -267,6 +267,11 @@ ConfigureNVBench(PARQUET_MULTITHREAD_READER_NVBENCH io/parquet/parquet_reader_mu
 # * orc reader benchmark --------------------------------------------------------------------------
 ConfigureNVBench(ORC_READER_NVBENCH io/orc/orc_reader_input.cpp io/orc/orc_reader_options.cpp)
 
+# ##################################################################################################
+# * orc multithreaded benchmark
+# --------------------------------------------------------------------------
+ConfigureNVBench(ORC_MULTITHREADED_NVBENCH io/orc/orc_reader_multithreaded.cpp)
+
 # ##################################################################################################
 # * csv reader benchmark --------------------------------------------------------------------------
 ConfigureNVBench(CSV_READER_NVBENCH io/csv/csv_reader_input.cpp io/csv/csv_reader_options.cpp)
diff --git a/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp b/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp
new file mode 100644
index 00000000000..ffbbc6f8464
--- /dev/null
+++ b/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp
@@ -0,0 +1,335 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+#include <benchmarks/io/cuio_common.hpp>
+#include <benchmarks/io/nvbench_helpers.hpp>
+
+#include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/detail/utilities/stream_pool.hpp>
+#include <cudf/io/orc.hpp>
+#include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/pinned_memory.hpp>
+#include <cudf/utilities/thread_pool.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+#include <vector>
+
+size_t get_num_read_threads(nvbench::state const& state) { return state.get_int64("num_threads"); }
+
+size_t get_read_size(nvbench::state const& state)
+{
+  auto const num_reads = get_num_read_threads(state);
+  return state.get_int64("total_data_size") / num_reads;
+}
+
+std::string get_label(std::string const& test_name, nvbench::state const& state)
+{
+  auto const num_cols       = state.get_int64("num_cols");
+  size_t const read_size_mb = get_read_size(state) / (1024 * 1024);
+  return {test_name + ", " + std::to_string(num_cols) + " columns, " +
+          std::to_string(get_num_read_threads(state)) + " threads " + " (" +
+          std::to_string(read_size_mb) + " MB each)"};
+}
+
+std::tuple<std::vector<cuio_source_sink_pair>, size_t, size_t> write_file_data(
+  nvbench::state& state, std::vector<cudf::type_id> const& d_types)
+{
+  auto const cardinality = state.get_int64("cardinality");
+  auto const run_length  = state.get_int64("run_length");
+  auto const num_cols    = state.get_int64("num_cols");
+  size_t const num_files            = get_num_read_threads(state);
+  size_t const per_file_data_size   = get_read_size(state);
+
+  std::vector<cuio_source_sink_pair> source_sink_vector;
+
+  size_t total_file_size = 0;
+
+  for (size_t i = 0; i < num_files; ++i) {
+    cuio_source_sink_pair source_sink{io_type::HOST_BUFFER};
+
+    auto const tbl = create_random_table(
+      cycle_dtypes(d_types, num_cols),
+      table_size_bytes{per_file_data_size},
+      data_profile_builder().cardinality(cardinality).avg_run_length(run_length));
+    auto const view = tbl->view();
+
+    cudf::io::orc_writer_options const write_opts =
+      cudf::io::orc_writer_options::builder(source_sink.make_sink_info(), view)
+        .compression(cudf::io::compression_type::SNAPPY);
+
+    cudf::io::write_orc(write_opts);
+    total_file_size += source_sink.size();
+
+    source_sink_vector.push_back(std::move(source_sink));
+  }
+
+  return {std::move(source_sink_vector), total_file_size, num_files};
+}
+
+void BM_orc_multithreaded_read_common(nvbench::state& state,
+                                      std::vector<cudf::type_id> const& d_types,
+                                      std::string const& label)
+{
+  auto const data_size = state.get_int64("total_data_size");
+  auto const num_threads = state.get_int64("num_threads");
+
+  auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads);
+  cudf::detail::thread_pool threads(num_threads);
+
+  auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types);
+  std::vector<cudf::io::source_info> source_info_vector;
+  std::transform(source_sink_vector.begin(),
+                 source_sink_vector.end(),
+                 std::back_inserter(source_info_vector),
+                 [](auto& source_sink) { return source_sink.make_source_info(); });
+
+  auto mem_stats_logger = cudf::memory_stats_logger();
+
+  {
+    cudf::scoped_range range{("(read) " + label).c_str()};
+    state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer,
+              [&](nvbench::launch& launch, auto& timer) {
+                auto read_func = [&](int index) {
+                  auto const stream = streams[index % num_threads];
+                  cudf::io::orc_reader_options read_opts =
+                    cudf::io::orc_reader_options::builder(source_info_vector[index]);
+                  cudf::io::read_orc(read_opts, stream, rmm::mr::get_current_device_resource());
+                };
+
+                threads.paused = true;
+                for (size_t i = 0; i < num_files; ++i) {
+                  threads.submit(read_func, i);
+                }
+                timer.start();
+                threads.paused = false;
+                threads.wait_for_tasks();
+                cudf::detail::join_streams(streams, cudf::get_default_stream());
+                timer.stop();
+              });
+  }
+
+  auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
+  state.add_element_count(static_cast<double>(data_size) / time, "bytes_per_second");
+  state.add_buffer_size(
+    mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
+  state.add_buffer_size(total_file_size, "encoded_file_size", "encoded_file_size");
+}
+
+void BM_orc_multithreaded_read_mixed(nvbench::state& state)
+{
+  auto label = get_label("mixed", state);
+  cudf::scoped_range range{label.c_str()};
+  BM_orc_multithreaded_read_common(
+    state, {cudf::type_id::INT32, cudf::type_id::DECIMAL64, cudf::type_id::STRING}, label);
+}
+
+void BM_orc_multithreaded_read_fixed_width(nvbench::state& state)
+{
+  auto label = get_label("fixed width", state);
+  cudf::scoped_range range{label.c_str()};
+  BM_orc_multithreaded_read_common(state, {cudf::type_id::INT32}, label);
+}
+
+void BM_orc_multithreaded_read_string(nvbench::state& state)
+{
+  auto label = get_label("string", state);
+  cudf::scoped_range range{label.c_str()};
+  BM_orc_multithreaded_read_common(state, {cudf::type_id::STRING}, label);
+}
+
+void BM_orc_multithreaded_read_list(nvbench::state& state)
+{
+  auto label = get_label("list", state);
+  cudf::scoped_range range{label.c_str()};
+  BM_orc_multithreaded_read_common(state, {cudf::type_id::LIST}, label);
+}
+
+void BM_orc_multithreaded_read_chunked_common(nvbench::state& state,
+                                              std::vector<cudf::type_id> const& d_types,
+                                              std::string const& label)
+{
+  size_t const data_size    = state.get_int64("total_data_size");
+  auto const num_threads    = state.get_int64("num_threads");
+  size_t const input_limit  = state.get_int64("input_limit");
+  size_t const output_limit = state.get_int64("output_limit");
+
+  auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads);
+  cudf::detail::thread_pool threads(num_threads);
+  auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types);
+  std::vector<cudf::io::source_info> source_info_vector;
+  std::transform(source_sink_vector.begin(),
+                 source_sink_vector.end(),
+                 std::back_inserter(source_info_vector),
+                 [](auto& source_sink) { return source_sink.make_source_info(); });
+
+  auto mem_stats_logger = cudf::memory_stats_logger();
+
+  {
+    cudf::scoped_range range{("(read) " + label).c_str()};
+    std::vector<cudf::io::table_with_metadata> chunks;
+    state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer,
+              [&](nvbench::launch& launch, auto& timer) {
+                auto read_func = [&](int index) {
+                  auto const stream = streams[index % num_threads];
+                  cudf::io::orc_reader_options read_opts =
+                    cudf::io::orc_reader_options::builder(source_info_vector[index]);
+                  // divide chunk limits by number of threads so the number of chunks produced is the
+                  // same for all cases. this seems better than the alternative, which is to keep the
+                  // limits the same. if we do that, as the number of threads goes up, the number of
+                  // chunks goes down - so are actually benchmarking the same thing in that case?
+                  auto reader = cudf::io::chunked_orc_reader(
+                    output_limit / num_threads, input_limit / num_threads, read_opts, stream);
+
+                  // read all the chunks
+                  do {
+                    auto table = reader.read_chunk();
+                  } while (reader.has_next());
+                };
+
+                threads.paused = true;
+                for (size_t i = 0; i < num_files; ++i) {
+                  threads.submit(read_func, i);
+                }
+                timer.start();
+                threads.paused = false;
+                threads.wait_for_tasks();
+                cudf::detail::join_streams(streams, cudf::get_default_stream());
+                timer.stop();
+              });
+  }
+
+  auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
+  state.add_element_count(static_cast<double>(data_size) / time, "bytes_per_second");
+  state.add_buffer_size(
+    mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
+  state.add_buffer_size(total_file_size, "encoded_file_size", "encoded_file_size");
+}
+
+void BM_orc_multithreaded_read_chunked_mixed(nvbench::state& state)
+{
+  auto label = get_label("mixed", state);
+  cudf::scoped_range range{label.c_str()};
+  BM_orc_multithreaded_read_chunked_common(
+    state, {cudf::type_id::INT32, cudf::type_id::DECIMAL64, cudf::type_id::STRING}, label);
+}
+
+void BM_orc_multithreaded_read_chunked_fixed_width(nvbench::state& state)
+{
+  auto label = get_label("fixed width", state);
+  cudf::scoped_range range{label.c_str()};
+  BM_orc_multithreaded_read_chunked_common(state, {cudf::type_id::INT32}, label);
+}
+
+void BM_orc_multithreaded_read_chunked_string(nvbench::state& state)
+{
+  auto label = get_label("string", state);
+  cudf::scoped_range range{label.c_str()};
+  BM_orc_multithreaded_read_chunked_common(state, {cudf::type_id::STRING}, label);
+}
+
+void BM_orc_multithreaded_read_chunked_list(nvbench::state& state)
+{
+  auto label = get_label("list", state);
+  cudf::scoped_range range{label.c_str()};
+  BM_orc_multithreaded_read_chunked_common(state, {cudf::type_id::LIST}, label);
+}
+auto const thread_range  = std::vector<nvbench::int64_t>{1, 2, 4, 8};
+auto const total_data_size = std::vector<nvbench::int64_t>{512 * 1024 * 1024, 1024 * 1024 * 1024};
+
+// mixed data types: fixed width and strings
+NVBENCH_BENCH(BM_orc_multithreaded_read_mixed)
+  .set_name("orc_multithreaded_read_decode_mixed")
+  .set_min_samples(4)
+  .add_int64_axis("cardinality", {1000})
+  .add_int64_axis("total_data_size", total_data_size)
+  .add_int64_axis("num_threads", thread_range)
+  .add_int64_axis("num_cols", {4})
+  .add_int64_axis("run_length", {8});
+
+NVBENCH_BENCH(BM_orc_multithreaded_read_fixed_width)
+  .set_name("orc_multithreaded_read_decode_fixed_width")
+  .set_min_samples(4)
+  .add_int64_axis("cardinality", {1000})
+  .add_int64_axis("total_data_size", total_data_size)
+  .add_int64_axis("num_threads", thread_range)
+  .add_int64_axis("num_cols", {4})
+  .add_int64_axis("run_length", {8});
+
+NVBENCH_BENCH(BM_orc_multithreaded_read_string)
+  .set_name("orc_multithreaded_read_decode_string")
+  .set_min_samples(4)
+  .add_int64_axis("cardinality", {1000})
+  .add_int64_axis("total_data_size", total_data_size)
+  .add_int64_axis("num_threads", thread_range)
+  .add_int64_axis("num_cols", {4})
+  .add_int64_axis("run_length", {8});
+
+NVBENCH_BENCH(BM_orc_multithreaded_read_list)
+  .set_name("orc_multithreaded_read_decode_list")
+  .set_min_samples(4)
+  .add_int64_axis("cardinality", {1000})
+  .add_int64_axis("total_data_size", total_data_size)
+  .add_int64_axis("num_threads", thread_range)
+  .add_int64_axis("num_cols", {4})
+  .add_int64_axis("run_length", {8});
+
+// mixed data types: fixed width, strings
+NVBENCH_BENCH(BM_orc_multithreaded_read_chunked_mixed)
+  .set_name("orc_multithreaded_read_decode_chunked_mixed")
+  .set_min_samples(4)
+  .add_int64_axis("cardinality", {1000})
+  .add_int64_axis("total_data_size", total_data_size)
+  .add_int64_axis("num_threads", thread_range)
+  .add_int64_axis("num_cols", {4})
+  .add_int64_axis("run_length", {8})
+  .add_int64_axis("input_limit", {640 * 1024 * 1024})
+  .add_int64_axis("output_limit", {640 * 1024 * 1024});
+
+NVBENCH_BENCH(BM_orc_multithreaded_read_chunked_fixed_width)
+  .set_name("orc_multithreaded_read_decode_chunked_fixed_width")
+  .set_min_samples(4)
+  .add_int64_axis("cardinality", {1000})
+  .add_int64_axis("total_data_size", total_data_size)
+  .add_int64_axis("num_threads", thread_range)
+  .add_int64_axis("num_cols", {4})
+  .add_int64_axis("run_length", {8})
+  .add_int64_axis("input_limit", {640 * 1024 * 1024})
+  .add_int64_axis("output_limit", {640 * 1024 * 1024});
+
+NVBENCH_BENCH(BM_orc_multithreaded_read_chunked_string)
+  .set_name("orc_multithreaded_read_decode_chunked_string")
+  .set_min_samples(4)
+  .add_int64_axis("cardinality", {1000})
+  .add_int64_axis("total_data_size", total_data_size)
+  .add_int64_axis("num_threads", thread_range)
+  .add_int64_axis("num_cols", {4})
+  .add_int64_axis("run_length", {8})
+  .add_int64_axis("input_limit", {640 * 1024 * 1024})
+  .add_int64_axis("output_limit", {640 * 1024 * 1024});
+
+NVBENCH_BENCH(BM_orc_multithreaded_read_chunked_list)
+  .set_name("orc_multithreaded_read_decode_chunked_list")
+  .set_min_samples(4)
+  .add_int64_axis("cardinality", {1000})
+  .add_int64_axis("total_data_size", total_data_size)
+  .add_int64_axis("num_threads", thread_range)
+  .add_int64_axis("num_cols", {4})
+  .add_int64_axis("run_length", {8})
+  .add_int64_axis("input_limit", {640 * 1024 * 1024})
+  .add_int64_axis("output_limit", {640 * 1024 * 1024});

From 24fe359425b080594b05bab040699a1468483474 Mon Sep 17 00:00:00 2001
From: Robert Maynard <rmaynard@nvidia.com>
Date: Fri, 14 Jun 2024 09:35:13 -0400
Subject: [PATCH 362/842] Remove CCCL 2.2 patches as we now always use 2.5+
 (#15969)

Now that https://github.com/rapidsai/rapids-cmake/pull/607 has been merged we can drop support for patching CCCL 2.2

Authors:
  - Robert Maynard (https://github.com/robertmaynard)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Paul Taylor (https://github.com/trxcllnt)

URL: https://github.com/rapidsai/cudf/pull/15969
---
 .../thirdparty/patches/cccl_override.json     | 35 --------------
 .../patches/revert_pr_211_cccl_2.5.0.diff     | 47 -------------------
 .../thrust_disable_64bit_dispatching.diff     | 38 +++++++--------
 ..._disable_64bit_dispatching_cccl_2.5.0.diff | 25 ----------
 .../thrust_faster_scan_compile_times.diff     | 30 ++++++------
 ..._faster_scan_compile_times_cccl_2.5.0.diff | 39 ---------------
 .../thrust_faster_sort_compile_times.diff     | 32 ++++++-------
 ..._faster_sort_compile_times_cccl_2.5.0.diff | 39 ---------------
 8 files changed, 50 insertions(+), 235 deletions(-)
 delete mode 100644 cpp/cmake/thirdparty/patches/revert_pr_211_cccl_2.5.0.diff
 delete mode 100644 cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching_cccl_2.5.0.diff
 delete mode 100644 cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times_cccl_2.5.0.diff
 delete mode 100644 cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times_cccl_2.5.0.diff

diff --git a/cpp/cmake/thirdparty/patches/cccl_override.json b/cpp/cmake/thirdparty/patches/cccl_override.json
index 059f713e7a5..e61102dffac 100644
--- a/cpp/cmake/thirdparty/patches/cccl_override.json
+++ b/cpp/cmake/thirdparty/patches/cccl_override.json
@@ -3,60 +3,25 @@
   "packages" : {
     "CCCL" : {
       "patches" : [
-        {
-          "file" : "cccl/bug_fixes.diff",
-          "issue" : "CCCL installs header-search.cmake files in nondeterministic order and has a typo in checking target creation that leads to duplicates",
-          "fixed_in" : "2.3"
-        },
-        {
-          "file" : "cccl/hide_kernels.diff",
-          "issue" : "Mark all cub and thrust kernels with hidden visibility [https://github.com/nvidia/cccl/pulls/443]",
-          "fixed_in" : "2.3"
-        },
         {
           "file" : "cccl/revert_pr_211.diff",
           "issue" : "thrust::copy introduced a change in behavior that causes failures with cudaErrorInvalidValue.",
           "fixed_in" : ""
         },
-        {
-          "file" : "${current_json_dir}/revert_pr_211_cccl_2.5.0.diff",
-          "issue" : "thrust::copy introduced a change in behavior that causes failures with cudaErrorInvalidValue.",
-          "fixed_in" : ""
-        },
-        {
-          "file": "cccl/kernel_pointer_hiding.diff",
-          "issue": "Hide APIs that accept kernel pointers [https://github.com/NVIDIA/cccl/pull/1395]",
-          "fixed_in": "2.4"
-        },
         {
           "file" : "${current_json_dir}/thrust_disable_64bit_dispatching.diff",
           "issue" : "Remove 64bit dispatching as not needed by libcudf and results in compiling twice as many kernels [https://github.com/rapidsai/cudf/pull/11437]",
           "fixed_in" : ""
         },
-        {
-          "file" : "${current_json_dir}/thrust_disable_64bit_dispatching_cccl_2.5.0.diff",
-          "issue" : "Remove 64bit dispatching as not needed by libcudf and results in compiling twice as many kernels [https://github.com/rapidsai/cudf/pull/11437]",
-          "fixed_in" : ""
-        },
         {
           "file" : "${current_json_dir}/thrust_faster_sort_compile_times.diff",
           "issue" : "Improve Thrust sort compile times by not unrolling loops for inlined comparators [https://github.com/rapidsai/cudf/pull/10577]",
           "fixed_in" : ""
         },
-        {
-          "file" : "${current_json_dir}/thrust_faster_sort_compile_times_cccl_2.5.0.diff",
-          "issue" : "Improve Thrust sort compile times by not unrolling loops for inlined comparators [https://github.com/rapidsai/cudf/pull/10577]",
-          "fixed_in" : ""
-        },
         {
           "file" : "${current_json_dir}/thrust_faster_scan_compile_times.diff",
           "issue" : "Improve Thrust scan compile times by reducing the number of kernels generated [https://github.com/rapidsai/cudf/pull/8183]",
           "fixed_in" : ""
-        },
-        {
-          "file" : "${current_json_dir}/thrust_faster_scan_compile_times_cccl_2.5.0.diff",
-          "issue" : "Improve Thrust scan compile times by reducing the number of kernels generated [https://github.com/rapidsai/cudf/pull/8183]",
-          "fixed_in" : ""
         }
       ]
     }
diff --git a/cpp/cmake/thirdparty/patches/revert_pr_211_cccl_2.5.0.diff b/cpp/cmake/thirdparty/patches/revert_pr_211_cccl_2.5.0.diff
deleted file mode 100644
index 27ff16744f5..00000000000
--- a/cpp/cmake/thirdparty/patches/revert_pr_211_cccl_2.5.0.diff
+++ /dev/null
@@ -1,47 +0,0 @@
-diff --git a/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h b/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h
-index 046eb83c0..8047c9701 100644
---- a/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h
-+++ b/thrust/thrust/system/cuda/detail/internal/copy_device_to_device.h
-@@ -53,41 +53,15 @@ namespace cuda_cub
- 
- namespace __copy
- {
--template <class Derived, class InputIt, class OutputIt>
--OutputIt THRUST_RUNTIME_FUNCTION device_to_device(
--  execution_policy<Derived>& policy, InputIt first, InputIt last, OutputIt result, thrust::detail::true_type)
--{
--  typedef typename thrust::iterator_traits<InputIt>::value_type InputTy;
--  const auto n = thrust::distance(first, last);
--  if (n > 0)
--  {
--    cudaError status;
--    status = trivial_copy_device_to_device(
--      policy,
--      reinterpret_cast<InputTy*>(thrust::raw_pointer_cast(&*result)),
--      reinterpret_cast<InputTy const*>(thrust::raw_pointer_cast(&*first)),
--      n);
--    cuda_cub::throw_on_error(status, "__copy:: D->D: failed");
--  }
--
--  return result + n;
--}
- 
- template <class Derived, class InputIt, class OutputIt>
- OutputIt THRUST_RUNTIME_FUNCTION device_to_device(
--  execution_policy<Derived>& policy, InputIt first, InputIt last, OutputIt result, thrust::detail::false_type)
-+  execution_policy<Derived>& policy, InputIt first, InputIt last, OutputIt result)
- {
-   typedef typename thrust::iterator_traits<InputIt>::value_type InputTy;
-   return cuda_cub::transform(policy, first, last, result, thrust::identity<InputTy>());
- }
- 
--template <class Derived, class InputIt, class OutputIt>
--OutputIt THRUST_RUNTIME_FUNCTION
--device_to_device(execution_policy<Derived>& policy, InputIt first, InputIt last, OutputIt result)
--{
--  return device_to_device(
--    policy, first, last, result, typename is_indirectly_trivially_relocatable_to<InputIt, OutputIt>::type());
--}
- } // namespace __copy
- 
- } // namespace cuda_cub
diff --git a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff b/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff
index d3f1a26781f..6ae1e1c917b 100644
--- a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff
+++ b/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff
@@ -1,25 +1,25 @@
 diff --git a/thrust/thrust/system/cuda/detail/dispatch.h b/thrust/thrust/system/cuda/detail/dispatch.h
-index d0e3f94ec..5c32a9c60 100644
+index 2a3cc4e33..8fb337b26 100644
 --- a/thrust/thrust/system/cuda/detail/dispatch.h
 +++ b/thrust/thrust/system/cuda/detail/dispatch.h
-@@ -32,8 +32,7 @@
-         status = call arguments; \
-     } \
-     else { \
--        auto THRUST_PP_CAT2(count, _fixed) = static_cast<thrust::detail::int64_t>(count); \
--        status = call arguments; \
-+        throw std::runtime_error("THRUST_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \
-     }
-
+@@ -44,8 +44,7 @@
+   }                                                                                   \
+   else                                                                                \
+   {                                                                                   \
+-    auto THRUST_PP_CAT2(count, _fixed) = static_cast<thrust::detail::int64_t>(count); \
+-    status                             = call arguments;                              \
++    throw std::runtime_error("THRUST_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \
+   }
+ 
  /**
-@@ -52,9 +51,7 @@
-         status = call arguments; \
-     } \
-     else { \
--        auto THRUST_PP_CAT2(count1, _fixed) = static_cast<thrust::detail::int64_t>(count1); \
--        auto THRUST_PP_CAT2(count2, _fixed) = static_cast<thrust::detail::int64_t>(count2); \
--        status = call arguments; \
-+        throw std::runtime_error("THRUST_DOUBLE_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \
-     }
+@@ -66,9 +65,7 @@
+   }                                                                                          \
+   else                                                                                       \
+   {                                                                                          \
+-    auto THRUST_PP_CAT2(count1, _fixed) = static_cast<thrust::detail::int64_t>(count1);      \
+-    auto THRUST_PP_CAT2(count2, _fixed) = static_cast<thrust::detail::int64_t>(count2);      \
+-    status                              = call arguments;                                    \
++    throw std::runtime_error("THRUST_DOUBLE_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \
+   }
  /**
   * Dispatch between 32-bit and 64-bit index based versions of the same algorithm
diff --git a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching_cccl_2.5.0.diff b/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching_cccl_2.5.0.diff
deleted file mode 100644
index 6ae1e1c917b..00000000000
--- a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching_cccl_2.5.0.diff
+++ /dev/null
@@ -1,25 +0,0 @@
-diff --git a/thrust/thrust/system/cuda/detail/dispatch.h b/thrust/thrust/system/cuda/detail/dispatch.h
-index 2a3cc4e33..8fb337b26 100644
---- a/thrust/thrust/system/cuda/detail/dispatch.h
-+++ b/thrust/thrust/system/cuda/detail/dispatch.h
-@@ -44,8 +44,7 @@
-   }                                                                                   \
-   else                                                                                \
-   {                                                                                   \
--    auto THRUST_PP_CAT2(count, _fixed) = static_cast<thrust::detail::int64_t>(count); \
--    status                             = call arguments;                              \
-+    throw std::runtime_error("THRUST_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \
-   }
- 
- /**
-@@ -66,9 +65,7 @@
-   }                                                                                          \
-   else                                                                                       \
-   {                                                                                          \
--    auto THRUST_PP_CAT2(count1, _fixed) = static_cast<thrust::detail::int64_t>(count1);      \
--    auto THRUST_PP_CAT2(count2, _fixed) = static_cast<thrust::detail::int64_t>(count2);      \
--    status                              = call arguments;                                    \
-+    throw std::runtime_error("THRUST_DOUBLE_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \
-   }
- /**
-  * Dispatch between 32-bit and 64-bit index based versions of the same algorithm
diff --git a/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times.diff b/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times.diff
index a606e21b92d..fee46046194 100644
--- a/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times.diff
+++ b/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times.diff
@@ -1,23 +1,23 @@
 diff --git a/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_radix_sort.cuh
-index 84b6ccffd..25a237f93 100644
+index 0606485bb..dbb99ff13 100644
 --- a/cub/cub/device/dispatch/dispatch_radix_sort.cuh
 +++ b/cub/cub/device/dispatch/dispatch_radix_sort.cuh
-@@ -808,7 +808,7 @@ struct DeviceRadixSortPolicy
-
-
-     /// SM60 (GP100)
--    struct Policy600 : ChainedPolicy<600, Policy600, Policy500>
-+    struct Policy600 : ChainedPolicy<600, Policy600, Policy600>
+@@ -1085,7 +1085,7 @@ struct DeviceRadixSortPolicy
+   };
+ 
+   /// SM60 (GP100)
+-  struct Policy600 : ChainedPolicy<600, Policy600, Policy500>
++  struct Policy600 : ChainedPolicy<600, Policy600, Policy600>
+   {
+     enum
      {
-         enum {
-             PRIMARY_RADIX_BITS      = (sizeof(KeyT) > 1) ? 7 : 5,    // 6.9B 32b keys/s (Quadro P100)
 diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh
-index 994adc095..d3e6719a7 100644
+index f39613adb..75bd16ff9 100644
 --- a/cub/cub/device/dispatch/dispatch_reduce.cuh
 +++ b/cub/cub/device/dispatch/dispatch_reduce.cuh
-@@ -479,7 +479,7 @@ struct DeviceReducePolicy
+@@ -488,7 +488,7 @@ struct DeviceReducePolicy
    };
-
+ 
    /// SM60
 -  struct Policy600 : ChainedPolicy<600, Policy600, Policy350>
 +  struct Policy600 : ChainedPolicy<600, Policy600, Policy600>
@@ -25,15 +25,15 @@ index 994adc095..d3e6719a7 100644
      static constexpr int threads_per_block  = 256;
      static constexpr int items_per_thread   = 16;
 diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh
-index 0ea5c41ad..1bcd8a111 100644
+index 419908c4e..6ab0840e1 100644
 --- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh
 +++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh
-@@ -303,7 +303,7 @@ struct DeviceScanPolicy
+@@ -339,7 +339,7 @@ struct DeviceScanPolicy
    /// SM600
    struct Policy600
        : DefaultTuning
 -      , ChainedPolicy<600, Policy600, Policy520>
 +      , ChainedPolicy<600, Policy600, Policy600>
    {};
-
+ 
    /// SM800
diff --git a/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times_cccl_2.5.0.diff b/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times_cccl_2.5.0.diff
deleted file mode 100644
index fee46046194..00000000000
--- a/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times_cccl_2.5.0.diff
+++ /dev/null
@@ -1,39 +0,0 @@
-diff --git a/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_radix_sort.cuh
-index 0606485bb..dbb99ff13 100644
---- a/cub/cub/device/dispatch/dispatch_radix_sort.cuh
-+++ b/cub/cub/device/dispatch/dispatch_radix_sort.cuh
-@@ -1085,7 +1085,7 @@ struct DeviceRadixSortPolicy
-   };
- 
-   /// SM60 (GP100)
--  struct Policy600 : ChainedPolicy<600, Policy600, Policy500>
-+  struct Policy600 : ChainedPolicy<600, Policy600, Policy600>
-   {
-     enum
-     {
-diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh
-index f39613adb..75bd16ff9 100644
---- a/cub/cub/device/dispatch/dispatch_reduce.cuh
-+++ b/cub/cub/device/dispatch/dispatch_reduce.cuh
-@@ -488,7 +488,7 @@ struct DeviceReducePolicy
-   };
- 
-   /// SM60
--  struct Policy600 : ChainedPolicy<600, Policy600, Policy350>
-+  struct Policy600 : ChainedPolicy<600, Policy600, Policy600>
-   {
-     static constexpr int threads_per_block  = 256;
-     static constexpr int items_per_thread   = 16;
-diff --git a/cub/cub/device/dispatch/tuning/tuning_scan.cuh b/cub/cub/device/dispatch/tuning/tuning_scan.cuh
-index 419908c4e..6ab0840e1 100644
---- a/cub/cub/device/dispatch/tuning/tuning_scan.cuh
-+++ b/cub/cub/device/dispatch/tuning/tuning_scan.cuh
-@@ -339,7 +339,7 @@ struct DeviceScanPolicy
-   /// SM600
-   struct Policy600
-       : DefaultTuning
--      , ChainedPolicy<600, Policy600, Policy520>
-+      , ChainedPolicy<600, Policy600, Policy600>
-   {};
- 
-   /// SM800
diff --git a/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times.diff b/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times.diff
index c34b6433d10..cb0cc55f4d2 100644
--- a/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times.diff
+++ b/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times.diff
@@ -1,39 +1,39 @@
 diff --git a/cub/cub/block/block_merge_sort.cuh b/cub/cub/block/block_merge_sort.cuh
-index dc07ef6c2..a066c14da 100644
+index eb76ebb0b..c6c529a50 100644
 --- a/cub/cub/block/block_merge_sort.cuh
 +++ b/cub/cub/block/block_merge_sort.cuh
-@@ -91,7 +91,7 @@ __device__ __forceinline__ void SerialMerge(KeyT *keys_shared,
+@@ -95,7 +95,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void SerialMerge(
    KeyT key1 = keys_shared[keys1_beg];
    KeyT key2 = keys_shared[keys2_beg];
-
+ 
 -#pragma unroll
 +#pragma unroll 1
    for (int item = 0; item < ITEMS_PER_THREAD; ++item)
    {
-     bool p = (keys2_beg < keys2_end) &&
-@@ -383,7 +383,7 @@ public:
+     bool p = (keys2_beg < keys2_end) && ((keys1_beg >= keys1_end) || compare_op(key2, key1));
+@@ -376,7 +376,7 @@ public:
        //
        KeyT max_key = oob_default;
-
--      #pragma unroll
-+      #pragma unroll 1
+ 
+-#pragma unroll
++#pragma unroll 1
        for (int item = 1; item < ITEMS_PER_THREAD; ++item)
        {
          if (ITEMS_PER_THREAD * linear_tid + item < valid_items)
 diff --git a/cub/cub/thread/thread_sort.cuh b/cub/cub/thread/thread_sort.cuh
-index 5d4867896..b42fb5f00 100644
+index 7d9e8622f..da5627306 100644
 --- a/cub/cub/thread/thread_sort.cuh
 +++ b/cub/cub/thread/thread_sort.cuh
-@@ -83,10 +83,10 @@ StableOddEvenSort(KeyT (&keys)[ITEMS_PER_THREAD],
+@@ -87,10 +87,10 @@ StableOddEvenSort(KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&items)[ITEMS_PER_THRE
  {
-   constexpr bool KEYS_ONLY = std::is_same<ValueT, NullType>::value;
-
--  #pragma unroll
-+  #pragma unroll 1
+   constexpr bool KEYS_ONLY = ::cuda::std::is_same<ValueT, NullType>::value;
+ 
+-#pragma unroll
++#pragma unroll 1
    for (int i = 0; i < ITEMS_PER_THREAD; ++i)
    {
--  #pragma unroll
-+  #pragma unroll 1
+-#pragma unroll
++#pragma unroll 1
      for (int j = 1 & i; j < ITEMS_PER_THREAD - 1; j += 2)
      {
        if (compare_op(keys[j + 1], keys[j]))
diff --git a/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times_cccl_2.5.0.diff b/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times_cccl_2.5.0.diff
deleted file mode 100644
index cb0cc55f4d2..00000000000
--- a/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times_cccl_2.5.0.diff
+++ /dev/null
@@ -1,39 +0,0 @@
-diff --git a/cub/cub/block/block_merge_sort.cuh b/cub/cub/block/block_merge_sort.cuh
-index eb76ebb0b..c6c529a50 100644
---- a/cub/cub/block/block_merge_sort.cuh
-+++ b/cub/cub/block/block_merge_sort.cuh
-@@ -95,7 +95,7 @@ _CCCL_DEVICE _CCCL_FORCEINLINE void SerialMerge(
-   KeyT key1 = keys_shared[keys1_beg];
-   KeyT key2 = keys_shared[keys2_beg];
- 
--#pragma unroll
-+#pragma unroll 1
-   for (int item = 0; item < ITEMS_PER_THREAD; ++item)
-   {
-     bool p = (keys2_beg < keys2_end) && ((keys1_beg >= keys1_end) || compare_op(key2, key1));
-@@ -376,7 +376,7 @@ public:
-       //
-       KeyT max_key = oob_default;
- 
--#pragma unroll
-+#pragma unroll 1
-       for (int item = 1; item < ITEMS_PER_THREAD; ++item)
-       {
-         if (ITEMS_PER_THREAD * linear_tid + item < valid_items)
-diff --git a/cub/cub/thread/thread_sort.cuh b/cub/cub/thread/thread_sort.cuh
-index 7d9e8622f..da5627306 100644
---- a/cub/cub/thread/thread_sort.cuh
-+++ b/cub/cub/thread/thread_sort.cuh
-@@ -87,10 +87,10 @@ StableOddEvenSort(KeyT (&keys)[ITEMS_PER_THREAD], ValueT (&items)[ITEMS_PER_THRE
- {
-   constexpr bool KEYS_ONLY = ::cuda::std::is_same<ValueT, NullType>::value;
- 
--#pragma unroll
-+#pragma unroll 1
-   for (int i = 0; i < ITEMS_PER_THREAD; ++i)
-   {
--#pragma unroll
-+#pragma unroll 1
-     for (int j = 1 & i; j < ITEMS_PER_THREAD - 1; j += 2)
-     {
-       if (compare_op(keys[j + 1], keys[j]))

From 374ee13adaf18503ee671b652f76a3ccb9dc118b Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Fri, 14 Jun 2024 15:28:53 +0100
Subject: [PATCH 363/842] Fix exclude regex in pre-commit clang-format hook
 (#16030)

The clang-tidy changes in #15894 introduce a new exclude regex list to the pre-commit clang-format hook. However, it was a single character too long, ending with a |. Consequently, the exclude regex matched the empty string, and hence excluded every C++ file.

Fix this, and apply formatting changes to the files that were modified in the interim and were not clang-format compatible.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16030
---
 .pre-commit-config.yaml                       |   2 +-
 .../io/orc/orc_reader_multithreaded.cpp       | 107 +++++++++---------
 cpp/tests/interop/from_arrow_test.cpp         |   5 +-
 3 files changed, 58 insertions(+), 56 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index cc08b832e69..f8c4f4b9143 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -60,7 +60,7 @@ repos:
           (?x)^(
             ^cpp/src/io/parquet/ipc/Schema_generated.h|
             ^cpp/src/io/parquet/ipc/Message_generated.h|
-            ^cpp/include/cudf_test/cxxopts.hpp|
+            ^cpp/include/cudf_test/cxxopts.hpp
           )
   - repo: https://github.com/sirosen/texthooks
     rev: 0.6.6
diff --git a/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp b/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp
index ffbbc6f8464..aa0ee39a179 100644
--- a/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp
+++ b/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp
@@ -50,11 +50,11 @@ std::string get_label(std::string const& test_name, nvbench::state const& state)
 std::tuple<std::vector<cuio_source_sink_pair>, size_t, size_t> write_file_data(
   nvbench::state& state, std::vector<cudf::type_id> const& d_types)
 {
-  auto const cardinality = state.get_int64("cardinality");
-  auto const run_length  = state.get_int64("run_length");
-  auto const num_cols    = state.get_int64("num_cols");
-  size_t const num_files            = get_num_read_threads(state);
-  size_t const per_file_data_size   = get_read_size(state);
+  auto const cardinality          = state.get_int64("cardinality");
+  auto const run_length           = state.get_int64("run_length");
+  auto const num_cols             = state.get_int64("num_cols");
+  size_t const num_files          = get_num_read_threads(state);
+  size_t const per_file_data_size = get_read_size(state);
 
   std::vector<cuio_source_sink_pair> source_sink_vector;
 
@@ -86,7 +86,7 @@ void BM_orc_multithreaded_read_common(nvbench::state& state,
                                       std::vector<cudf::type_id> const& d_types,
                                       std::string const& label)
 {
-  auto const data_size = state.get_int64("total_data_size");
+  auto const data_size   = state.get_int64("total_data_size");
   auto const num_threads = state.get_int64("num_threads");
 
   auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads);
@@ -104,24 +104,24 @@ void BM_orc_multithreaded_read_common(nvbench::state& state,
   {
     cudf::scoped_range range{("(read) " + label).c_str()};
     state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer,
-              [&](nvbench::launch& launch, auto& timer) {
-                auto read_func = [&](int index) {
-                  auto const stream = streams[index % num_threads];
-                  cudf::io::orc_reader_options read_opts =
-                    cudf::io::orc_reader_options::builder(source_info_vector[index]);
-                  cudf::io::read_orc(read_opts, stream, rmm::mr::get_current_device_resource());
-                };
-
-                threads.paused = true;
-                for (size_t i = 0; i < num_files; ++i) {
-                  threads.submit(read_func, i);
-                }
-                timer.start();
-                threads.paused = false;
-                threads.wait_for_tasks();
-                cudf::detail::join_streams(streams, cudf::get_default_stream());
-                timer.stop();
-              });
+               [&](nvbench::launch& launch, auto& timer) {
+                 auto read_func = [&](int index) {
+                   auto const stream = streams[index % num_threads];
+                   cudf::io::orc_reader_options read_opts =
+                     cudf::io::orc_reader_options::builder(source_info_vector[index]);
+                   cudf::io::read_orc(read_opts, stream, rmm::mr::get_current_device_resource());
+                 };
+
+                 threads.paused = true;
+                 for (size_t i = 0; i < num_files; ++i) {
+                   threads.submit(read_func, i);
+                 }
+                 timer.start();
+                 threads.paused = false;
+                 threads.wait_for_tasks();
+                 cudf::detail::join_streams(streams, cudf::get_default_stream());
+                 timer.stop();
+               });
   }
 
   auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
@@ -184,34 +184,35 @@ void BM_orc_multithreaded_read_chunked_common(nvbench::state& state,
     cudf::scoped_range range{("(read) " + label).c_str()};
     std::vector<cudf::io::table_with_metadata> chunks;
     state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer,
-              [&](nvbench::launch& launch, auto& timer) {
-                auto read_func = [&](int index) {
-                  auto const stream = streams[index % num_threads];
-                  cudf::io::orc_reader_options read_opts =
-                    cudf::io::orc_reader_options::builder(source_info_vector[index]);
-                  // divide chunk limits by number of threads so the number of chunks produced is the
-                  // same for all cases. this seems better than the alternative, which is to keep the
-                  // limits the same. if we do that, as the number of threads goes up, the number of
-                  // chunks goes down - so are actually benchmarking the same thing in that case?
-                  auto reader = cudf::io::chunked_orc_reader(
-                    output_limit / num_threads, input_limit / num_threads, read_opts, stream);
-
-                  // read all the chunks
-                  do {
-                    auto table = reader.read_chunk();
-                  } while (reader.has_next());
-                };
-
-                threads.paused = true;
-                for (size_t i = 0; i < num_files; ++i) {
-                  threads.submit(read_func, i);
-                }
-                timer.start();
-                threads.paused = false;
-                threads.wait_for_tasks();
-                cudf::detail::join_streams(streams, cudf::get_default_stream());
-                timer.stop();
-              });
+               [&](nvbench::launch& launch, auto& timer) {
+                 auto read_func = [&](int index) {
+                   auto const stream = streams[index % num_threads];
+                   cudf::io::orc_reader_options read_opts =
+                     cudf::io::orc_reader_options::builder(source_info_vector[index]);
+                   // divide chunk limits by number of threads so the number of chunks produced is
+                   // the same for all cases. this seems better than the alternative, which is to
+                   // keep the limits the same. if we do that, as the number of threads goes up, the
+                   // number of chunks goes down - so are actually benchmarking the same thing in
+                   // that case?
+                   auto reader = cudf::io::chunked_orc_reader(
+                     output_limit / num_threads, input_limit / num_threads, read_opts, stream);
+
+                   // read all the chunks
+                   do {
+                     auto table = reader.read_chunk();
+                   } while (reader.has_next());
+                 };
+
+                 threads.paused = true;
+                 for (size_t i = 0; i < num_files; ++i) {
+                   threads.submit(read_func, i);
+                 }
+                 timer.start();
+                 threads.paused = false;
+                 threads.wait_for_tasks();
+                 cudf::detail::join_streams(streams, cudf::get_default_stream());
+                 timer.stop();
+               });
   }
 
   auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
@@ -249,7 +250,7 @@ void BM_orc_multithreaded_read_chunked_list(nvbench::state& state)
   cudf::scoped_range range{label.c_str()};
   BM_orc_multithreaded_read_chunked_common(state, {cudf::type_id::LIST}, label);
 }
-auto const thread_range  = std::vector<nvbench::int64_t>{1, 2, 4, 8};
+auto const thread_range    = std::vector<nvbench::int64_t>{1, 2, 4, 8};
 auto const total_data_size = std::vector<nvbench::int64_t>{512 * 1024 * 1024, 1024 * 1024 * 1024};
 
 // mixed data types: fixed width and strings
diff --git a/cpp/tests/interop/from_arrow_test.cpp b/cpp/tests/interop/from_arrow_test.cpp
index af20a5c772f..6eaa1a07e08 100644
--- a/cpp/tests/interop/from_arrow_test.cpp
+++ b/cpp/tests/interop/from_arrow_test.cpp
@@ -50,7 +50,8 @@ std::unique_ptr<cudf::table> get_cudf_table()
                                                               {true, false, true, true, true});
   columns.emplace_back(std::move(cudf::dictionary::encode(col4)));
   columns.emplace_back(cudf::test::fixed_width_column_wrapper<bool>(
-                         {true, false, true, false, true}, {true, false, true, true, false}).release());
+                         {true, false, true, false, true}, {true, false, true, true, false})
+                         .release());
   columns.emplace_back(cudf::test::strings_column_wrapper(
                          {
                            "",
@@ -338,7 +339,7 @@ TEST_F(FromArrowTest, ChunkedArray)
     std::vector<std::shared_ptr<arrow::Array>>{dict_array1, dict_array2});
   auto boolean_array =
     get_arrow_array<bool>({true, false, true, false, true}, {true, false, true, true, false});
-  auto boolean_chunked_array = std::make_shared<arrow::ChunkedArray>(boolean_array);
+  auto boolean_chunked_array      = std::make_shared<arrow::ChunkedArray>(boolean_array);
   auto large_string_chunked_array = std::make_shared<arrow::ChunkedArray>(
     std::vector<std::shared_ptr<arrow::Array>>{large_string_array_1});
 

From 2297f9a61e2f4153ab2e8a0631f7cfe7971ead14 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Fri, 14 Jun 2024 17:43:17 +0100
Subject: [PATCH 364/842] Fix initialization error in to_arrow for empty string
 views (#16033)

When converting an empty string view to arrow, we don't bother with copies from device, but rather create the arrow arrays directly. The offset buffer is therefore a singleton int32 array with zero in it.

Previously, the initialization of this array was incorrect, since mutable_data() returns a uint8_t pointer, and so setting the single element could leave 24 of the 32 bits uninitialized.

Fix this by using memset instead to zero out the full buffer.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16033
---
 cpp/src/interop/to_arrow.cu | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu
index 47aee982c32..2b3aa2f08f1 100644
--- a/cpp/src/interop/to_arrow.cu
+++ b/cpp/src/interop/to_arrow.cu
@@ -292,9 +292,9 @@ std::shared_ptr<arrow::Array> dispatch_to_arrow::operator()<cudf::string_view>(
   auto child_arrays      = fetch_child_array(input_view, {{}, {}}, ar_mr, stream);
   if (child_arrays.empty()) {
     // Empty string will have only one value in offset of 4 bytes
-    auto tmp_offset_buffer               = allocate_arrow_buffer(4, ar_mr);
-    auto tmp_data_buffer                 = allocate_arrow_buffer(0, ar_mr);
-    tmp_offset_buffer->mutable_data()[0] = 0;
+    auto tmp_offset_buffer = allocate_arrow_buffer(sizeof(int32_t), ar_mr);
+    auto tmp_data_buffer   = allocate_arrow_buffer(0, ar_mr);
+    memset(tmp_offset_buffer->mutable_data(), 0, sizeof(int32_t));
 
     return std::make_shared<arrow::StringArray>(
       0, std::move(tmp_offset_buffer), std::move(tmp_data_buffer));

From 5facc8cde15cc8301adb0c06fc682f558828fbc8 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 14 Jun 2024 07:12:09 -1000
Subject: [PATCH 365/842] Enable ruff TCH: typing imports under if
 TYPE_CHECKING (#16015)

Reduces some unnecessary imports for running cudf and nicely delineates which imports are meant for typing purposes

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16015
---
 docs/cudf/source/conf.py                      |  6 ++++
 pyproject.toml                                |  2 +-
 python/cudf/cudf/_typing.py                   |  3 +-
 python/cudf/cudf/core/_base_index.py          |  9 ++++--
 python/cudf/cudf/core/buffer/spill_manager.py |  6 ++--
 python/cudf/cudf/core/column/categorical.py   | 20 +++++++++----
 python/cudf/cudf/core/column/column.py        |  8 +++--
 python/cudf/cudf/core/column/datetime.py      | 16 +++++-----
 python/cudf/cudf/core/column/decimal.py       |  6 ++--
 python/cudf/cudf/core/column/lists.py         |  6 ++--
 python/cudf/cudf/core/column/numerical.py     | 29 +++++++++++++------
 .../cudf/cudf/core/column/numerical_base.py   |  6 ++--
 python/cudf/cudf/core/column/string.py        |  9 +++---
 python/cudf/cudf/core/column/struct.py        |  5 +++-
 python/cudf/cudf/core/column/timedelta.py     |  6 ++--
 python/cudf/cudf/core/dataframe.py            |  5 +++-
 python/cudf/cudf/core/dtypes.py               |  6 ++--
 python/cudf/cudf/core/frame.py                | 10 +++++--
 python/cudf/cudf/core/index.py                |  5 +++-
 python/cudf/cudf/core/indexed_frame.py        | 15 ++++++----
 python/cudf/cudf/core/multiindex.py           |  9 ++++--
 python/cudf/cudf/core/series.py               | 15 ++++++----
 python/cudf/cudf/core/single_column_frame.py  | 13 +++++----
 23 files changed, 143 insertions(+), 72 deletions(-)

diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py
index e9c760e288e..108f12bc099 100644
--- a/docs/cudf/source/conf.py
+++ b/docs/cudf/source/conf.py
@@ -554,6 +554,12 @@ def on_missing_reference(app, env, node, contnode):
 nitpick_ignore = [
     ("py:class", "SeriesOrIndex"),
     ("py:class", "Dtype"),
+    # The following are erroneously warned due to
+    # https://github.com/sphinx-doc/sphinx/issues/11225
+    ("py:class", "pa.Array"),
+    ("py:class", "ScalarLike"),
+    ("py:class", "ParentType"),
+    ("py:class", "ColumnLike"),
     # TODO: Remove this when we figure out why typing_extensions doesn't seem
     # to map types correctly for intersphinx
     ("py:class", "typing_extensions.Self"),
diff --git a/pyproject.toml b/pyproject.toml
index d343b237ee7..c602240a0b7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,7 +26,7 @@ quiet-level = 3
 line-length = 79
 
 [tool.ruff.lint]
-select = ["E", "F", "W", "D201", "D204", "D206", "D207", "D208", "D209", "D210", "D211", "D214", "D215", "D300", "D301", "D403", "D405", "D406", "D407", "D408", "D409", "D410", "D411", "D412", "D414", "D418"]
+select = ["E", "F", "W", "D201", "D204", "D206", "D207", "D208", "D209", "D210", "D211", "D214", "D215", "D300", "D301", "D403", "D405", "D406", "D407", "D408", "D409", "D410", "D411", "D412", "D414", "D418", "TCH"]
 ignore = [
     # whitespace before :
     "E203",
diff --git a/python/cudf/cudf/_typing.py b/python/cudf/cudf/_typing.py
index 206173919e1..34c96cc8cb3 100644
--- a/python/cudf/cudf/_typing.py
+++ b/python/cudf/cudf/_typing.py
@@ -5,9 +5,10 @@
 
 import numpy as np
 from pandas import Period, Timedelta, Timestamp
-from pandas.api.extensions import ExtensionDtype
 
 if TYPE_CHECKING:
+    from pandas.api.extensions import ExtensionDtype
+
     import cudf
 
 # Backwards compat: mypy >= 0.790 rejects Type[NotImplemented], but
diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py
index 5d0f7c4ede4..b29fc475b29 100644
--- a/python/cudf/cudf/core/_base_index.py
+++ b/python/cudf/cudf/core/_base_index.py
@@ -4,9 +4,8 @@
 
 import pickle
 import warnings
-from collections.abc import Generator
 from functools import cached_property
-from typing import Any, Literal, Set, Tuple
+from typing import TYPE_CHECKING, Any, Literal, Set, Tuple
 
 import pandas as pd
 from typing_extensions import Self
@@ -31,12 +30,16 @@
 )
 from cudf.core.abc import Serializable
 from cudf.core.column import ColumnBase, column
-from cudf.core.column_accessor import ColumnAccessor
 from cudf.errors import MixedTypeError
 from cudf.utils import ioutils
 from cudf.utils.dtypes import can_convert_to_column, is_mixed_with_object_dtype
 from cudf.utils.utils import _is_same_name
 
+if TYPE_CHECKING:
+    from collections.abc import Generator
+
+    from cudf.core.column_accessor import ColumnAccessor
+
 
 class BaseIndex(Serializable):
     """Base class for all cudf Index types."""
diff --git a/python/cudf/cudf/core/buffer/spill_manager.py b/python/cudf/cudf/core/buffer/spill_manager.py
index cd81149bdb8..7bcf97302aa 100644
--- a/python/cudf/cudf/core/buffer/spill_manager.py
+++ b/python/cudf/cudf/core/buffer/spill_manager.py
@@ -13,15 +13,17 @@
 from contextlib import contextmanager
 from dataclasses import dataclass
 from functools import partial
-from typing import Dict, List, Optional, Tuple
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple
 
 import rmm.mr
 
-from cudf.core.buffer.spillable_buffer import SpillableBufferOwner
 from cudf.options import get_option
 from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate
 from cudf.utils.string import format_bytes
 
+if TYPE_CHECKING:
+    from cudf.core.buffer.spillable_buffer import SpillableBufferOwner
+
 _spill_cudf_nvtx_annotate = partial(
     _cudf_nvtx_annotate, domain="cudf_python-spill"
 )
diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index de20b2ace1d..97c2ce5cf1f 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -3,21 +3,17 @@
 from __future__ import annotations
 
 import warnings
-from collections import abc
 from functools import cached_property
 from typing import TYPE_CHECKING, Any, Mapping, Optional, Sequence, Tuple, cast
 
 import numpy as np
 import pandas as pd
 import pyarrow as pa
-from numba import cuda
 from typing_extensions import Self
 
 import cudf
 from cudf import _lib as libcudf
 from cudf._lib.transform import bools_to_mask
-from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike
-from cudf.core.buffer import Buffer
 from cudf.core.column import column
 from cudf.core.column.methods import ColumnMethods
 from cudf.core.dtypes import CategoricalDtype, IntervalDtype
@@ -29,7 +25,19 @@
 )
 
 if TYPE_CHECKING:
-    from cudf._typing import SeriesOrIndex, SeriesOrSingleColumnIndex
+    from collections import abc
+
+    import numba.cuda
+
+    from cudf._typing import (
+        ColumnBinaryOperand,
+        ColumnLike,
+        Dtype,
+        ScalarLike,
+        SeriesOrIndex,
+        SeriesOrSingleColumnIndex,
+    )
+    from cudf.core.buffer import Buffer
     from cudf.core.column import (
         ColumnBase,
         DatetimeColumn,
@@ -868,7 +876,7 @@ def clip(self, lo: ScalarLike, hi: ScalarLike) -> "column.ColumnBase":
 
     def data_array_view(
         self, *, mode="write"
-    ) -> cuda.devicearray.DeviceNDArray:
+    ) -> numba.cuda.devicearray.DeviceNDArray:
         return self.codes.data_array_view(mode=mode)
 
     def unique(self) -> CategoricalColumn:
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 75fc31ddbce..dc937dc0469 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -2,13 +2,13 @@
 
 from __future__ import annotations
 
-import builtins
 import pickle
 from collections import abc
 from functools import cached_property
 from itertools import chain
 from types import SimpleNamespace
 from typing import (
+    TYPE_CHECKING,
     Any,
     Dict,
     List,
@@ -49,7 +49,6 @@
 )
 from cudf._lib.transform import bools_to_mask
 from cudf._lib.types import size_type_dtype
-from cudf._typing import ColumnLike, Dtype, ScalarLike
 from cudf.api.types import (
     _is_non_decimal_numeric_dtype,
     _is_pandas_nullable_extension_dtype,
@@ -89,6 +88,11 @@
 )
 from cudf.utils.utils import _array_ufunc, mask_dtype
 
+if TYPE_CHECKING:
+    import builtins
+
+    from cudf._typing import ColumnLike, Dtype, ScalarLike
+
 if PANDAS_GE_210:
     NumpyExtensionArray = pd.arrays.NumpyExtensionArray
 else:
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index 057169aa7e1..e24d85bfedf 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -19,22 +19,22 @@
 from cudf import _lib as libcudf
 from cudf._lib.labeling import label_bins
 from cudf._lib.search import search_sorted
-from cudf._typing import (
-    ColumnBinaryOperand,
-    DatetimeLikeScalar,
-    Dtype,
-    DtypeObj,
-    ScalarLike,
-)
 from cudf.api.types import is_datetime64_dtype, is_scalar, is_timedelta64_dtype
 from cudf.core._compat import PANDAS_GE_220
-from cudf.core.buffer import Buffer
 from cudf.core.column import ColumnBase, as_column, column, string
 from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion
 from cudf.utils.dtypes import _get_base_dtype
 from cudf.utils.utils import _all_bools_with_nulls
 
 if TYPE_CHECKING:
+    from cudf._typing import (
+        ColumnBinaryOperand,
+        DatetimeLikeScalar,
+        Dtype,
+        DtypeObj,
+        ScalarLike,
+    )
+    from cudf.core.buffer import Buffer
     from cudf.core.column.numerical import NumericalColumn
 
 if PANDAS_GE_220:
diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py
index 3a0f6649e21..9c1bedc9926 100644
--- a/python/cudf/cudf/core/column/decimal.py
+++ b/python/cudf/cudf/core/column/decimal.py
@@ -4,7 +4,7 @@
 
 import warnings
 from decimal import Decimal
-from typing import Any, Optional, Sequence, Union, cast
+from typing import TYPE_CHECKING, Any, Optional, Sequence, Union, cast
 
 import cupy as cp
 import numpy as np
@@ -16,7 +16,6 @@
 from cudf._lib.strings.convert.convert_fixed_point import (
     from_decimal as cpp_from_decimal,
 )
-from cudf._typing import ColumnBinaryOperand, Dtype
 from cudf.api.types import is_integer_dtype, is_scalar
 from cudf.core.buffer import as_buffer
 from cudf.core.column import ColumnBase
@@ -31,6 +30,9 @@
 
 from .numerical_base import NumericalBaseColumn
 
+if TYPE_CHECKING:
+    from cudf._typing import ColumnBinaryOperand, Dtype
+
 
 class DecimalBaseColumn(NumericalBaseColumn):
     """Base column for decimal32, decimal64 or decimal128 columns"""
diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py
index 8f8ee46c796..080ba949d62 100644
--- a/python/cudf/cudf/core/column/lists.py
+++ b/python/cudf/cudf/core/column/lists.py
@@ -3,7 +3,7 @@
 from __future__ import annotations
 
 from functools import cached_property
-from typing import List, Optional, Sequence, Tuple, Union
+from typing import TYPE_CHECKING, List, Optional, Sequence, Tuple, Union
 
 import numpy as np
 import pandas as pd
@@ -26,13 +26,15 @@
 )
 from cudf._lib.strings.convert.convert_lists import format_list_column
 from cudf._lib.types import size_type_dtype
-from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike
 from cudf.api.types import _is_non_decimal_numeric_dtype, is_scalar
 from cudf.core.column import ColumnBase, as_column, column
 from cudf.core.column.methods import ColumnMethods, ParentType
 from cudf.core.dtypes import ListDtype
 from cudf.core.missing import NA
 
+if TYPE_CHECKING:
+    from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike
+
 
 class ListColumn(ColumnBase):
     dtype: ListDtype
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index 1952d7eeb71..6af67e02bb4 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -3,7 +3,16 @@
 from __future__ import annotations
 
 import functools
-from typing import Any, Callable, Optional, Sequence, Tuple, Union, cast
+from typing import (
+    TYPE_CHECKING,
+    Any,
+    Callable,
+    Optional,
+    Sequence,
+    Tuple,
+    Union,
+    cast,
+)
 
 import cupy as cp
 import numpy as np
@@ -14,13 +23,6 @@
 from cudf import _lib as libcudf
 from cudf._lib import pylibcudf
 from cudf._lib.types import size_type_dtype
-from cudf._typing import (
-    ColumnBinaryOperand,
-    ColumnLike,
-    Dtype,
-    DtypeObj,
-    ScalarLike,
-)
 from cudf.api.types import (
     is_bool_dtype,
     is_float_dtype,
@@ -28,7 +30,6 @@
     is_integer_dtype,
     is_scalar,
 )
-from cudf.core.buffer import Buffer
 from cudf.core.column import (
     ColumnBase,
     as_column,
@@ -48,6 +49,16 @@
 
 from .numerical_base import NumericalBaseColumn
 
+if TYPE_CHECKING:
+    from cudf._typing import (
+        ColumnBinaryOperand,
+        ColumnLike,
+        Dtype,
+        DtypeObj,
+        ScalarLike,
+    )
+    from cudf.core.buffer import Buffer
+
 _unaryop_map = {
     "ASIN": "ARCSIN",
     "ACOS": "ARCCOS",
diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py
index d38ec9cf30f..bd48054a951 100644
--- a/python/cudf/cudf/core/column/numerical_base.py
+++ b/python/cudf/cudf/core/column/numerical_base.py
@@ -3,17 +3,19 @@
 
 from __future__ import annotations
 
-from typing import Optional, cast
+from typing import TYPE_CHECKING, Optional, cast
 
 import numpy as np
 
 import cudf
 from cudf import _lib as libcudf
-from cudf._typing import ScalarLike
 from cudf.core.column import ColumnBase
 from cudf.core.missing import NA
 from cudf.core.mixins import Scannable
 
+if TYPE_CHECKING:
+    from cudf._typing import ScalarLike
+
 
 class NumericalBaseColumn(ColumnBase, Scannable):
     """A column composed of numerical data.
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index ad7dbe5e52e..87df2d2f1f1 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -16,11 +16,9 @@
     overload,
 )
 
-import cupy
 import numpy as np
 import pandas as pd
 import pyarrow as pa
-from numba import cuda
 from typing_extensions import Self
 
 import cudf
@@ -30,7 +28,6 @@
 from cudf._lib.column import Column
 from cudf._lib.types import size_type_dtype
 from cudf.api.types import is_integer, is_scalar, is_string_dtype
-from cudf.core.buffer import Buffer
 from cudf.core.column import column, datetime
 from cudf.core.column.column import ColumnBase
 from cudf.core.column.methods import ColumnMethods
@@ -46,6 +43,9 @@ def str_to_boolean(column: StringColumn):
 
 
 if TYPE_CHECKING:
+    import cupy
+    import numba.cuda
+
     from cudf._typing import (
         ColumnBinaryOperand,
         ColumnLike,
@@ -53,6 +53,7 @@ def str_to_boolean(column: StringColumn):
         ScalarLike,
         SeriesOrIndex,
     )
+    from cudf.core.buffer import Buffer
 
 
 _str_to_numeric_typecast_functions = {
@@ -5598,7 +5599,7 @@ def any(self, skipna: bool = True) -> bool:
 
     def data_array_view(
         self, *, mode="write"
-    ) -> cuda.devicearray.DeviceNDArray:
+    ) -> numba.cuda.devicearray.DeviceNDArray:
         raise ValueError("Cannot get an array view of a StringColumn")
 
     @property
diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py
index 6dd35570b95..c2ce787eeae 100644
--- a/python/cudf/cudf/core/column/struct.py
+++ b/python/cudf/cudf/core/column/struct.py
@@ -2,17 +2,20 @@
 from __future__ import annotations
 
 from functools import cached_property
+from typing import TYPE_CHECKING
 
 import pandas as pd
 import pyarrow as pa
 
 import cudf
-from cudf._typing import Dtype
 from cudf.core.column import ColumnBase
 from cudf.core.column.methods import ColumnMethods
 from cudf.core.dtypes import StructDtype
 from cudf.core.missing import NA
 
+if TYPE_CHECKING:
+    from cudf._typing import Dtype
+
 
 class StructColumn(ColumnBase):
     """
diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py
index c6af052b56f..0af847f38af 100644
--- a/python/cudf/cudf/core/column/timedelta.py
+++ b/python/cudf/cudf/core/column/timedelta.py
@@ -4,7 +4,7 @@
 
 import datetime
 import functools
-from typing import Any, Optional, Sequence, cast
+from typing import TYPE_CHECKING, Any, Optional, Sequence, cast
 
 import numpy as np
 import pandas as pd
@@ -13,13 +13,15 @@
 
 import cudf
 from cudf import _lib as libcudf
-from cudf._typing import ColumnBinaryOperand, DatetimeLikeScalar, Dtype
 from cudf.api.types import is_scalar, is_timedelta64_dtype
 from cudf.core.buffer import Buffer, acquire_spill_lock
 from cudf.core.column import ColumnBase, column, string
 from cudf.utils.dtypes import np_to_pa_dtype
 from cudf.utils.utils import _all_bools_with_nulls
 
+if TYPE_CHECKING:
+    from cudf._typing import ColumnBinaryOperand, DatetimeLikeScalar, Dtype
+
 _unit_to_nanoseconds_conversion = {
     "ns": 1,
     "us": 1_000,
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 7438b0237d5..70820fa8e00 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -15,6 +15,7 @@
 from collections import abc, defaultdict
 from collections.abc import Iterator
 from typing import (
+    TYPE_CHECKING,
     Any,
     Callable,
     Dict,
@@ -41,7 +42,6 @@
 import cudf
 import cudf.core.common
 from cudf import _lib as libcudf
-from cudf._typing import ColumnLike, Dtype, NotImplementedType
 from cudf.api.extensions import no_default
 from cudf.api.types import (
     _is_scalar_or_zero_d_array,
@@ -99,6 +99,9 @@
 from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate
 from cudf.utils.utils import GetAttrGetItemMixin, _external_only_api
 
+if TYPE_CHECKING:
+    from cudf._typing import ColumnLike, Dtype, NotImplementedType
+
 _cupy_nan_methods_map = {
     "min": "nanmin",
     "max": "nanmax",
diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index 4729233ee6e..b1282040e60 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -6,7 +6,7 @@
 import textwrap
 import warnings
 from functools import cached_property
-from typing import Any, Callable, Dict, List, Tuple, Type, Union
+from typing import TYPE_CHECKING, Any, Callable, Dict, List, Tuple, Type, Union
 
 import numpy as np
 import pandas as pd
@@ -19,9 +19,11 @@
 from cudf._typing import Dtype
 from cudf.core._compat import PANDAS_LT_300
 from cudf.core.abc import Serializable
-from cudf.core.buffer import Buffer
 from cudf.utils.docutils import doc_apply
 
+if TYPE_CHECKING:
+    from cudf.core.buffer import Buffer
+
 
 def dtype(arbitrary):
     """
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 01b56f1edc4..ffaa90ef915 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -6,10 +6,10 @@
 import itertools
 import operator
 import pickle
-import types
 import warnings
 from collections import abc
 from typing import (
+    TYPE_CHECKING,
     Any,
     Callable,
     Dict,
@@ -31,7 +31,6 @@
 
 import cudf
 from cudf import _lib as libcudf
-from cudf._typing import Dtype
 from cudf.api.types import is_dtype_equal, is_scalar
 from cudf.core.buffer import acquire_spill_lock
 from cudf.core.column import (
@@ -48,6 +47,11 @@
 from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate
 from cudf.utils.utils import _array_ufunc, _warn_no_dask_cudf
 
+if TYPE_CHECKING:
+    from types import ModuleType
+
+    from cudf._typing import Dtype
+
 
 # TODO: It looks like Frame is missing a declaration of `copy`, need to add
 class Frame(BinaryOperand, Scannable):
@@ -410,7 +414,7 @@ def __arrow_array__(self, type=None):
     def _to_array(
         self,
         get_array: Callable,
-        module: types.ModuleType,
+        module: ModuleType,
         copy: bool,
         dtype: Union[Dtype, None] = None,
         na_value=None,
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 732e5cdb01a..655f7607b37 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -5,10 +5,10 @@
 import operator
 import pickle
 import warnings
-from collections.abc import Generator
 from functools import cache, cached_property
 from numbers import Number
 from typing import (
+    TYPE_CHECKING,
     Any,
     List,
     Literal,
@@ -71,6 +71,9 @@
 from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate
 from cudf.utils.utils import _warn_no_dask_cudf, search_range
 
+if TYPE_CHECKING:
+    from collections.abc import Generator
+
 
 class IndexMeta(type):
     """Custom metaclass for Index that overrides instance/subclass tests."""
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index fdc78005996..75614fa46c7 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -9,6 +9,7 @@
 import warnings
 from collections import Counter, abc
 from typing import (
+    TYPE_CHECKING,
     Any,
     Callable,
     Dict,
@@ -31,12 +32,6 @@
 
 import cudf
 import cudf._lib as libcudf
-from cudf._typing import (
-    ColumnLike,
-    DataFrameOrSeries,
-    Dtype,
-    NotImplementedType,
-)
 from cudf.api.extensions import no_default
 from cudf.api.types import (
     _is_non_decimal_numeric_dtype,
@@ -70,6 +65,14 @@
 from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate
 from cudf.utils.utils import _warn_no_dask_cudf
 
+if TYPE_CHECKING:
+    from cudf._typing import (
+        ColumnLike,
+        DataFrameOrSeries,
+        Dtype,
+        NotImplementedType,
+    )
+
 doc_reset_index_template = """
         Reset the index of the {klass}, or a level of it.
 
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index 6d3520e33cf..865d9660b1d 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -8,10 +8,9 @@
 import pickle
 import warnings
 from collections import abc
-from collections.abc import Generator
 from functools import cached_property
 from numbers import Integral
-from typing import Any, List, MutableMapping, Tuple, Union
+from typing import TYPE_CHECKING, Any, List, MutableMapping, Tuple, Union
 
 import cupy as cp
 import numpy as np
@@ -20,7 +19,6 @@
 import cudf
 import cudf._lib as libcudf
 from cudf._lib.types import size_type_dtype
-from cudf._typing import DataFrameOrSeries
 from cudf.api.extensions import no_default
 from cudf.api.types import is_integer, is_list_like, is_object_dtype
 from cudf.core import column
@@ -36,6 +34,11 @@
 from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate
 from cudf.utils.utils import NotIterable, _external_only_api, _is_same_name
 
+if TYPE_CHECKING:
+    from collections.abc import Generator
+
+    from cudf._typing import DataFrameOrSeries
+
 
 def _maybe_indices_to_slice(indices: cp.ndarray) -> Union[slice, cp.ndarray]:
     """Makes best effort to convert an array of indices into a python slice.
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index a52b583d3b4..1b1e82333cf 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -10,6 +10,7 @@
 from collections import abc
 from shutil import get_terminal_size
 from typing import (
+    TYPE_CHECKING,
     Any,
     Dict,
     Literal,
@@ -27,12 +28,6 @@
 
 import cudf
 from cudf import _lib as libcudf
-from cudf._typing import (
-    ColumnLike,
-    DataFrameOrSeries,
-    NotImplementedType,
-    ScalarLike,
-)
 from cudf.api.extensions import no_default
 from cudf.api.types import (
     _is_non_decimal_numeric_dtype,
@@ -85,6 +80,14 @@
 )
 from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate
 
+if TYPE_CHECKING:
+    from cudf._typing import (
+        ColumnLike,
+        DataFrameOrSeries,
+        NotImplementedType,
+        ScalarLike,
+    )
+
 
 def _format_percentile_names(percentiles):
     return [f"{int(x * 100)}%" for x in percentiles]
diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py
index acc74129a29..6fd4e857e02 100644
--- a/python/cudf/cudf/core/single_column_frame.py
+++ b/python/cudf/cudf/core/single_column_frame.py
@@ -3,15 +3,11 @@
 
 from __future__ import annotations
 
-from typing import Any, Dict, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union
 
-import cupy
-import numpy
-import pyarrow as pa
 from typing_extensions import Self
 
 import cudf
-from cudf._typing import NotImplementedType, ScalarLike
 from cudf.api.extensions import no_default
 from cudf.api.types import (
     _is_scalar_or_zero_d_array,
@@ -25,6 +21,13 @@
 from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate
 from cudf.utils.utils import NotIterable
 
+if TYPE_CHECKING:
+    import cupy
+    import numpy
+    import pyarrow as pa
+
+    from cudf._typing import NotImplementedType, ScalarLike
+
 
 class SingleColumnFrame(Frame, NotIterable):
     """A one-dimensional frame.

From 9225633e83ca09592c5a144c523f46e95c6e9d75 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 14 Jun 2024 07:13:00 -1000
Subject: [PATCH 366/842] Avoid redefining Frame._get_columns_by_label in
 subclasses (#15912)

`Frame._get_columns_by_label` was redefined in `Series` and `DataFrame` to handle some special edge cases in `DataFrame.__getitem__` and empty `Series`

By making `_from_data_like_self` more consistent in preserving external properties and moving special casing, we can only define `Frame._get_columns_by_label` once

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Charles Blackmon-Luca (https://github.com/charlesbluca)

URL: https://github.com/rapidsai/cudf/pull/15912
---
 python/cudf/cudf/core/dataframe.py     | 36 +++++++-------------------
 python/cudf/cudf/core/frame.py         | 28 +++++++++++---------
 python/cudf/cudf/core/indexed_frame.py |  4 +--
 python/cudf/cudf/core/series.py        | 20 +++++---------
 4 files changed, 34 insertions(+), 54 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 70820fa8e00..80260c7699b 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -1348,7 +1348,16 @@ def __getitem__(self, arg):
         8  8  8  8
         """
         if _is_scalar_or_zero_d_array(arg) or isinstance(arg, tuple):
-            return self._get_columns_by_label(arg, downcast=True)
+            out = self._get_columns_by_label(arg)
+            if is_scalar(arg):
+                nlevels = 1
+            elif isinstance(arg, tuple):
+                nlevels = len(arg)
+            if self._data.multiindex is False or nlevels == self._data.nlevels:
+                out = self._constructor_sliced._from_data(out._data)
+                out.index = self.index
+                out.name = arg
+            return out
 
         elif isinstance(arg, slice):
             return self._slice(arg)
@@ -1993,31 +2002,6 @@ def _repr_html_(self):
     def _repr_latex_(self):
         return self._get_renderable_dataframe().to_pandas()._repr_latex_()
 
-    @_cudf_nvtx_annotate
-    def _get_columns_by_label(
-        self, labels, *, downcast=False
-    ) -> Self | Series:
-        """
-        Return columns of dataframe by `labels`
-
-        If downcast is True, try and downcast from a DataFrame to a Series
-        """
-        ca = self._data.select_by_label(labels)
-        if downcast:
-            if is_scalar(labels):
-                nlevels = 1
-            elif isinstance(labels, tuple):
-                nlevels = len(labels)
-            if self._data.multiindex is False or nlevels == self._data.nlevels:
-                out = self._constructor_sliced._from_data(
-                    ca, index=self.index, name=labels
-                )
-                return out
-        out = self.__class__._from_data(
-            ca, index=self.index, columns=ca.to_pandas_index()
-        )
-        return out
-
     def _make_operands_and_index_for_binop(
         self,
         other: Any,
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index ffaa90ef915..ee310cfcb58 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -136,12 +136,19 @@ def deserialize(cls, header, frames):
     @classmethod
     @_cudf_nvtx_annotate
     def _from_data(cls, data: MutableMapping) -> Self:
+        """
+        Construct cls from a ColumnAccessor-like mapping.
+        """
         obj = cls.__new__(cls)
         Frame.__init__(obj, data)
         return obj
 
     @_cudf_nvtx_annotate
     def _from_data_like_self(self, data: MutableMapping) -> Self:
+        """
+        Return type(self) from a ColumnAccessor-like mapping but
+        with the external properties, e.g. .index, .name, of self.
+        """
         return self._from_data(data)
 
     @_cudf_nvtx_annotate
@@ -355,12 +362,13 @@ def equals(self, other) -> bool:
         )
 
     @_cudf_nvtx_annotate
-    def _get_columns_by_label(self, labels, *, downcast=False) -> Self:
+    def _get_columns_by_label(self, labels) -> Self:
         """
-        Returns columns of the Frame specified by `labels`
+        Returns columns of the Frame specified by `labels`.
 
+        Akin to cudf.DataFrame(...).loc[:, labels]
         """
-        return self.__class__._from_data(self._data.select_by_label(labels))
+        return self._from_data_like_self(self._data.select_by_label(labels))
 
     @property
     @_cudf_nvtx_annotate
@@ -1438,14 +1446,10 @@ def _get_sorted_inds(
         Get the indices required to sort self according to the columns
         specified in by.
         """
-
-        to_sort = [
-            *(
-                self
-                if by is None
-                else self._get_columns_by_label(list(by), downcast=False)
-            )._columns
-        ]
+        if by is None:
+            to_sort = self._columns
+        else:
+            to_sort = self._get_columns_by_label(list(by))._columns
 
         if is_scalar(ascending):
             ascending_lst = [ascending] * len(to_sort)
@@ -1453,7 +1457,7 @@ def _get_sorted_inds(
             ascending_lst = list(ascending)
 
         return libcudf.sort.order_by(
-            to_sort,
+            list(to_sort),
             ascending_lst,
             na_position,
             stable=True,
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 75614fa46c7..3a4f4874e35 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -309,8 +309,8 @@ def _from_data(
 
     @_cudf_nvtx_annotate
     def _from_data_like_self(self, data: MutableMapping):
-        out = self._from_data(data, self.index)
-        out._data._level_names = self._data._level_names
+        out = super()._from_data_like_self(data)
+        out.index = self.index
         return out
 
     @_cudf_nvtx_annotate
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 1b1e82333cf..ebf6910ca5f 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -685,6 +685,12 @@ def _from_data(
             out.name = name
         return out
 
+    @_cudf_nvtx_annotate
+    def _from_data_like_self(self, data: MutableMapping):
+        out = super()._from_data_like_self(data)
+        out.name = self.name
+        return out
+
     @_cudf_nvtx_annotate
     def __contains__(self, item):
         return item in self.index
@@ -859,20 +865,6 @@ def deserialize(cls, header, frames):
 
         return obj
 
-    def _get_columns_by_label(self, labels, *, downcast=False) -> Self:
-        """Return the column specified by `labels`
-
-        For cudf.Series, either the column, or an empty series is returned.
-        Parameter `downcast` does not have effects.
-        """
-        ca = self._data.select_by_label(labels)
-
-        return (
-            self.__class__._from_data(data=ca, index=self.index)
-            if len(ca) > 0
-            else self.__class__(dtype=self.dtype, name=self.name)
-        )
-
     @_cudf_nvtx_annotate
     def drop(
         self,

From 9dc5e8c2836fa2e54831d25b7f051e031bf553b9 Mon Sep 17 00:00:00 2001
From: Ben Jarmak <104460670+jarmak-nv@users.noreply.github.com>
Date: Fri, 14 Jun 2024 13:31:29 -0400
Subject: [PATCH 367/842] Project automation update: skip if not in project
 (#16035)

This PR adds another condition to when we should run the automation work. PRs aren't always in the cuDF Python project so when this is the case we should skip the job rather than attempting to run it and have it throw an error.

Authors:
  - Ben Jarmak (https://github.com/jarmak-nv)

Approvers:
  - James Lamb (https://github.com/jameslamb)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16035
---
 .github/workflows/pr_issue_status_automation.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/pr_issue_status_automation.yml b/.github/workflows/pr_issue_status_automation.yml
index 837963c3286..8ca971dc28d 100644
--- a/.github/workflows/pr_issue_status_automation.yml
+++ b/.github/workflows/pr_issue_status_automation.yml
@@ -35,7 +35,7 @@ jobs:
     update-status:
       # This job sets the PR and its linked issues to "In Progress" status
       uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-24.08
-      if: github.event.pull_request.state == 'open'
+      if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }}
       needs: get-project-id
       with:
         PROJECT_ID: "PVT_kwDOAp2shc4AiNzl"
@@ -51,7 +51,7 @@ jobs:
     update-sprint:
       # This job sets the PR and its linked issues to the current "Weekly Sprint"
       uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@branch-24.08
-      if: github.event.pull_request.state == 'open'
+      if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }}
       needs: get-project-id
       with:
         PROJECT_ID: "PVT_kwDOAp2shc4AiNzl"

From f89cc07b50d3f89e7da8f98afb5fe8f9d9cf33c6 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Fri, 14 Jun 2024 13:22:49 -0500
Subject: [PATCH 368/842] Add `codecov` coverage for `pandas_tests` (#14513)

Fixes: #14496

This PR enables code-coverage for `pandas` tests that are run in cudf CI in pandas accelerator mode.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Lawrence Mitchell (https://github.com/wence-)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/14513
---
 ci/cudf_pandas_scripts/run_tests.sh               | 11 ++++++++++-
 python/cudf/cudf_pandas_tests/test_cudf_pandas.py |  3 +++
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/ci/cudf_pandas_scripts/run_tests.sh b/ci/cudf_pandas_scripts/run_tests.sh
index 78945d37f22..1c3b99953fb 100755
--- a/ci/cudf_pandas_scripts/run_tests.sh
+++ b/ci/cudf_pandas_scripts/run_tests.sh
@@ -5,6 +5,10 @@
 
 set -eoxu pipefail
 
+RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"}
+RAPIDS_COVERAGE_DIR=${RAPIDS_COVERAGE_DIR:-"${PWD}/coverage-results"}
+mkdir -p "${RAPIDS_TESTS_DIR}" "${RAPIDS_COVERAGE_DIR}"
+
 # Function to display script usage
 function display_usage {
     echo "Usage: $0 [--no-cudf]"
@@ -36,4 +40,9 @@ else
     python -m pip install $(ls ./local-cudf-dep/cudf*.whl)[test,cudf-pandas-tests]
 fi
 
-python -m pytest -p cudf.pandas ./python/cudf/cudf_pandas_tests/
+python -m pytest -p cudf.pandas \
+    --cov-config=./python/cudf/.coveragerc \
+    --cov=cudf \
+    --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cudf-pandas-coverage.xml" \
+    --cov-report=term \
+    ./python/cudf/cudf_pandas_tests/
diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
index c251e4a197e..5be4d350c0b 100644
--- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
+++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
@@ -464,6 +464,9 @@ def test_options_mode():
     assert xpd.options.mode.copy_on_write == pd.options.mode.copy_on_write
 
 
+# Codecov and Profiler interfere with each-other,
+# hence we don't want to run code-cov on this test.
+@pytest.mark.no_cover
 def test_profiler():
     pytest.importorskip("cudf")
 

From 2ad502efe5f9c927b5bc0e5a80820b99f6630e1b Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 14 Jun 2024 10:50:41 -1000
Subject: [PATCH 369/842] Fix nunique for `MultiIndex`, `DataFrame`, and all NA
 case with `dropna=False` (#15962)

Fixes 3 bugs with `nunique`

* `MultiIndex.nunique` returning a `dict` instead of an `int`
* `.nunique(dropna=False)` with all `NA`s returning 0 instead of 1
* `DataFrame.nunique` preserving column class and type in the resulting `Series.index`

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - David Wendt (https://github.com/davidwendt)
  - Bradley Dice (https://github.com/bdice)
  - Shruti Shivakumar (https://github.com/shrshi)

URL: https://github.com/rapidsai/cudf/pull/15962
---
 cpp/src/stream_compaction/distinct_count.cu  |  6 +++++-
 python/cudf/cudf/core/dataframe.py           |  8 +++++---
 python/cudf/cudf/core/frame.py               |  7 +++----
 python/cudf/cudf/core/index.py               |  2 +-
 python/cudf/cudf/core/multiindex.py          |  5 +++++
 python/cudf/cudf/core/single_column_frame.py |  2 --
 python/cudf/cudf/tests/test_dataframe.py     | 14 ++++++++++++++
 python/cudf/cudf/tests/test_multiindex.py    | 11 +++++++++++
 python/cudf/cudf/tests/test_series.py        | 10 ++++++++++
 9 files changed, 54 insertions(+), 11 deletions(-)

diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu
index b7aadbe14fa..99ca89cc021 100644
--- a/cpp/src/stream_compaction/distinct_count.cu
+++ b/cpp/src/stream_compaction/distinct_count.cu
@@ -187,7 +187,11 @@ cudf::size_type distinct_count(column_view const& input,
                                nan_policy nan_handling,
                                rmm::cuda_stream_view stream)
 {
-  if (0 == input.size() or input.null_count() == input.size()) { return 0; }
+  if (0 == input.size()) { return 0; }
+
+  if (input.null_count() == input.size()) {
+    return static_cast<size_type>(null_handling == null_policy::INCLUDE);
+  }
 
   auto count = detail::distinct_count(table_view{{input}}, null_equality::EQUAL, stream);
 
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 80260c7699b..d8d46a6df73 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -7462,7 +7462,7 @@ def __dataframe__(
             self, nan_as_null=nan_as_null, allow_copy=allow_copy
         )
 
-    def nunique(self, axis=0, dropna=True):
+    def nunique(self, axis=0, dropna: bool = True) -> Series:
         """
         Count number of distinct elements in specified axis.
         Return Series with number of distinct elements. Can ignore NaN values.
@@ -7490,8 +7490,10 @@ def nunique(self, axis=0, dropna=True):
         """
         if axis != 0:
             raise NotImplementedError("axis parameter is not supported yet.")
-
-        return cudf.Series(super().nunique(dropna=dropna))
+        counts = [col.distinct_count(dropna=dropna) for col in self._columns]
+        return self._constructor_sliced(
+            counts, index=self._data.to_pandas_index()
+        )
 
     def _sample_axis_1(
         self,
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index ee310cfcb58..6a1ef05b1f9 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -1903,10 +1903,9 @@ def nunique(self, dropna: bool = True):
         dict
             Name and unique value counts of each column in frame.
         """
-        return {
-            name: col.distinct_count(dropna=dropna)
-            for name, col in self._data.items()
-        }
+        raise NotImplementedError(
+            f"{type(self).__name__} does not implement nunique"
+        )
 
     @staticmethod
     @_cudf_nvtx_annotate
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 655f7607b37..11d09e470ff 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -898,7 +898,7 @@ def __array__(self, dtype=None):
         )
 
     @_cudf_nvtx_annotate
-    def nunique(self) -> int:
+    def nunique(self, dropna: bool = True) -> int:
         return len(self)
 
     @_cudf_nvtx_annotate
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index 865d9660b1d..91488e06f4e 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -1749,6 +1749,11 @@ def fillna(self, value):
     def unique(self):
         return self.drop_duplicates(keep="first")
 
+    @_cudf_nvtx_annotate
+    def nunique(self, dropna: bool = True) -> int:
+        mi = self.dropna(how="all") if dropna else self
+        return len(mi.unique())
+
     def _clean_nulls_from_index(self):
         """
         Convert all na values(if any) in MultiIndex object
diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py
index 6fd4e857e02..43b5dc76f13 100644
--- a/python/cudf/cudf/core/single_column_frame.py
+++ b/python/cudf/cudf/core/single_column_frame.py
@@ -338,8 +338,6 @@ def nunique(self, dropna: bool = True) -> int:
         int
             Number of unique values in the column.
         """
-        if self._column.null_count == len(self):
-            return 0
         return self._column.distinct_count(dropna=dropna)
 
     def _get_elements_from_column(self, arg) -> Union[ScalarLike, ColumnBase]:
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 98e9f9881c7..649821b9b7c 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -9966,6 +9966,20 @@ def test_dataframe_nunique(data):
     assert_eq(expected, actual)
 
 
+@pytest.mark.parametrize(
+    "columns",
+    [
+        pd.RangeIndex(2, name="foo"),
+        pd.MultiIndex.from_arrays([[1, 2], [2, 3]], names=["foo", 1]),
+        pd.Index([3, 5], dtype=np.int8, name="foo"),
+    ],
+)
+def test_nunique_preserve_column_in_index(columns):
+    df = cudf.DataFrame([[1, 2]], columns=columns)
+    result = df.nunique().index.to_pandas()
+    assert_eq(result, columns, exact=True)
+
+
 @pytest.mark.parametrize(
     "data",
     [{"key": [0, 1, 1, 0, 0, 1], "val": [1, 8, 3, 9, -3, 8]}],
diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py
index f143112a45f..7b95e4f9a44 100644
--- a/python/cudf/cudf/tests/test_multiindex.py
+++ b/python/cudf/cudf/tests/test_multiindex.py
@@ -2162,3 +2162,14 @@ def test_multi_index_contains_hashable():
         lfunc_args_and_kwargs=((),),
         rfunc_args_and_kwargs=((),),
     )
+
+
+@pytest.mark.parametrize("array", [[1, 2], [1, None], [None, None]])
+@pytest.mark.parametrize("dropna", [True, False])
+def test_nunique(array, dropna):
+    arrays = [array, [3, 4]]
+    gidx = cudf.MultiIndex.from_arrays(arrays)
+    pidx = pd.MultiIndex.from_arrays(arrays)
+    result = gidx.nunique(dropna=dropna)
+    expected = pidx.nunique(dropna=dropna)
+    assert result == expected
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index 30189e1ac8a..52956c230ba 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -2851,3 +2851,13 @@ def test_nans_to_nulls_noop_copies_column(value):
     ser1 = cudf.Series([value])
     ser2 = ser1.nans_to_nulls()
     assert ser1._column is not ser2._column
+
+
+@pytest.mark.parametrize("dropna", [False, True])
+def test_nunique_all_null(dropna):
+    data = [None, None]
+    pd_ser = pd.Series(data)
+    cudf_ser = cudf.Series(data)
+    result = pd_ser.nunique(dropna=dropna)
+    expected = cudf_ser.nunique(dropna=dropna)
+    assert result == expected

From 74b382637e69d39df292c59938b5911d9ca3bdf9 Mon Sep 17 00:00:00 2001
From: Paul Mattione <156858817+pmattione-nvidia@users.noreply.github.com>
Date: Fri, 14 Jun 2024 17:01:35 -0500
Subject: [PATCH 370/842] Fix decimal -> float cast in ast code (#16038)

Fix decimal -> float cast in ast code that was missed during the earlier code refactoring for making the cast explicit.

This closes [issue 16023](https://github.com/rapidsai/cudf/issues/16023)

Authors:
  - Paul Mattione (https://github.com/pmattione-nvidia)

Approvers:
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16038
---
 cpp/include/cudf/ast/detail/operators.hpp | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/cpp/include/cudf/ast/detail/operators.hpp b/cpp/include/cudf/ast/detail/operators.hpp
index b618f33a6e5..c483d459833 100644
--- a/cpp/include/cudf/ast/detail/operators.hpp
+++ b/cpp/include/cudf/ast/detail/operators.hpp
@@ -17,6 +17,7 @@
 
 #include <cudf/ast/expressions.hpp>
 #include <cudf/types.hpp>
+#include <cudf/unary.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
@@ -819,7 +820,17 @@ struct operator_functor<ast_operator::NOT, false> {
 template <typename To>
 struct cast {
   static constexpr auto arity{1};
-  template <typename From>
+  template <typename From, typename std::enable_if_t<is_fixed_point<From>()>* = nullptr>
+  __device__ inline auto operator()(From f) -> To
+  {
+    if constexpr (cuda::std::is_floating_point_v<To>) {
+      return convert_fixed_to_floating<To>(f);
+    } else {
+      return static_cast<To>(f);
+    }
+  }
+
+  template <typename From, typename cuda::std::enable_if_t<!is_fixed_point<From>()>* = nullptr>
   __device__ inline auto operator()(From f) -> decltype(static_cast<To>(f))
   {
     return static_cast<To>(f);

From e9ebdea49d24f645a6ca5ff6d79e0525a114f5fc Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Mon, 17 Jun 2024 12:29:54 +0100
Subject: [PATCH 371/842] Delete unused code from stringfunction evaluator
 (#16032)

When introducing the handling of regex contains, we replicated the handlers for some other supported string functions. This means we can delete some code.

Additionally, migrate the contains tests to live with the other string function tests, and add coverage of exceptional cases.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - https://github.com/brandon-b-miller

URL: https://github.com/rapidsai/cudf/pull/16032
---
 python/cudf_polars/cudf_polars/dsl/expr.py    | 36 ++-----
 python/cudf_polars/tests/conftest.py          | 10 ++
 .../cudf_polars/tests/expressions/test_agg.py |  5 -
 .../tests/expressions/test_distinct.py        |  9 +-
 .../tests/expressions/test_numeric_binops.py  |  5 -
 .../tests/expressions/test_stringfunction.py  | 97 ++++++++++++++++---
 python/cudf_polars/tests/test_string.py       | 61 ------------
 7 files changed, 102 insertions(+), 121 deletions(-)
 create mode 100644 python/cudf_polars/tests/conftest.py
 delete mode 100644 python/cudf_polars/tests/test_string.py

diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py
index 03c1db68dbd..0605bba6642 100644
--- a/python/cudf_polars/cudf_polars/dsl/expr.py
+++ b/python/cudf_polars/cudf_polars/dsl/expr.py
@@ -688,13 +688,12 @@ def do_evaluate(
                     else pat.obj
                 )
                 return Column(plc.strings.find.contains(column.obj, pattern))
-            else:
-                assert isinstance(arg, Literal)
-                prog = plc.strings.regex_program.RegexProgram.create(
-                    arg.value.as_py(),
-                    flags=plc.strings.regex_flags.RegexFlags.DEFAULT,
-                )
-                return Column(plc.strings.contains.contains_re(column.obj, prog))
+            assert isinstance(arg, Literal)
+            prog = plc.strings.regex_program.RegexProgram.create(
+                arg.value.as_py(),
+                flags=plc.strings.regex_flags.RegexFlags.DEFAULT,
+            )
+            return Column(plc.strings.contains.contains_re(column.obj, prog))
         columns = [
             child.evaluate(df, context=context, mapping=mapping)
             for child in self.children
@@ -725,26 +724,9 @@ def do_evaluate(
                     else prefix.obj,
                 )
             )
-        else:
-            columns = [
-                child.evaluate(df, context=context, mapping=mapping)
-                for child in self.children
-            ]
-            if self.name == pl_expr.StringFunction.Lowercase:
-                (column,) = columns
-                return Column(plc.strings.case.to_lower(column.obj))
-            elif self.name == pl_expr.StringFunction.Uppercase:
-                (column,) = columns
-                return Column(plc.strings.case.to_upper(column.obj))
-            elif self.name == pl_expr.StringFunction.EndsWith:
-                column, suffix = columns
-                return Column(plc.strings.find.ends_with(column.obj, suffix.obj))
-            elif self.name == pl_expr.StringFunction.StartsWith:
-                column, suffix = columns
-                return Column(plc.strings.find.starts_with(column.obj, suffix.obj))
-            raise NotImplementedError(
-                f"StringFunction {self.name}"
-            )  # pragma: no cover; handled by init raising
+        raise NotImplementedError(
+            f"StringFunction {self.name}"
+        )  # pragma: no cover; handled by init raising
 
 
 class Sort(Expr):
diff --git a/python/cudf_polars/tests/conftest.py b/python/cudf_polars/tests/conftest.py
new file mode 100644
index 00000000000..9bbce6bc080
--- /dev/null
+++ b/python/cudf_polars/tests/conftest.py
@@ -0,0 +1,10 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import pytest
+
+
+@pytest.fixture(params=[False, True], ids=["no_nulls", "nulls"], scope="session")
+def with_nulls(request):
+    return request.param
diff --git a/python/cudf_polars/tests/expressions/test_agg.py b/python/cudf_polars/tests/expressions/test_agg.py
index 79018c80bf3..b044bbb2885 100644
--- a/python/cudf_polars/tests/expressions/test_agg.py
+++ b/python/cudf_polars/tests/expressions/test_agg.py
@@ -20,11 +20,6 @@ def dtype(request):
     return request.param
 
 
-@pytest.fixture(params=[False, True], ids=["no-nulls", "with-nulls"])
-def with_nulls(request):
-    return request.param
-
-
 @pytest.fixture(
     params=[
         False,
diff --git a/python/cudf_polars/tests/expressions/test_distinct.py b/python/cudf_polars/tests/expressions/test_distinct.py
index 22865a7ce22..143dd7e9f0f 100644
--- a/python/cudf_polars/tests/expressions/test_distinct.py
+++ b/python/cudf_polars/tests/expressions/test_distinct.py
@@ -9,11 +9,6 @@
 from cudf_polars.testing.asserts import assert_gpu_result_equal
 
 
-@pytest.fixture(params=[False, True], ids=["no-nulls", "nulls"])
-def nullable(request):
-    return request.param
-
-
 @pytest.fixture(
     params=["is_first_distinct", "is_last_distinct", "is_unique", "is_duplicated"]
 )
@@ -22,9 +17,9 @@ def op(request):
 
 
 @pytest.fixture
-def df(nullable):
+def df(with_nulls):
     values: list[int | None] = [1, 2, 3, 1, 1, 7, 3, 2, 7, 8, 1]
-    if nullable:
+    if with_nulls:
         values[1] = None
         values[4] = None
     return pl.LazyFrame({"a": values})
diff --git a/python/cudf_polars/tests/expressions/test_numeric_binops.py b/python/cudf_polars/tests/expressions/test_numeric_binops.py
index 548aebf0875..7eefc59d927 100644
--- a/python/cudf_polars/tests/expressions/test_numeric_binops.py
+++ b/python/cudf_polars/tests/expressions/test_numeric_binops.py
@@ -29,11 +29,6 @@ def rtype(request):
     return request.param
 
 
-@pytest.fixture(params=[False, True], ids=["no_nulls", "nulls"])
-def with_nulls(request):
-    return request.param
-
-
 @pytest.fixture(
     params=[
         pl.Expr.eq,
diff --git a/python/cudf_polars/tests/expressions/test_stringfunction.py b/python/cudf_polars/tests/expressions/test_stringfunction.py
index 198f35d376b..3c498fe7286 100644
--- a/python/cudf_polars/tests/expressions/test_stringfunction.py
+++ b/python/cudf_polars/tests/expressions/test_stringfunction.py
@@ -2,22 +2,39 @@
 # SPDX-License-Identifier: Apache-2.0
 from __future__ import annotations
 
+from functools import partial
+
 import pytest
 
 import polars as pl
 
-from cudf_polars import translate_ir
+from cudf_polars import execute_with_cudf, translate_ir
 from cudf_polars.testing.asserts import assert_gpu_result_equal
 
 
-def test_supported_stringfunction_expression():
-    ldf = pl.LazyFrame(
-        {
-            "a": ["a", "b", "cdefg", "h", "Wıth ünιcοde"],  # noqa: RUF001
-            "b": [0, 3, 1, -1, None],
-        }
-    )
+@pytest.fixture
+def ldf(with_nulls):
+    a = [
+        "AbC",
+        "de",
+        "FGHI",
+        "j",
+        "kLm",
+        "nOPq",
+        "",
+        "RsT",
+        "sada",
+        "uVw",
+        "h",
+        "Wıth ünιcοde",  # noqa: RUF001
+    ]
+    if with_nulls:
+        a[4] = None
+        a[-3] = None
+    return pl.LazyFrame({"a": a, "b": range(len(a))})
 
+
+def test_supported_stringfunction_expression(ldf):
     query = ldf.select(
         pl.col("a").str.starts_with("Z"),
         pl.col("a").str.ends_with("h").alias("endswith_h"),
@@ -27,15 +44,63 @@ def test_supported_stringfunction_expression():
     assert_gpu_result_equal(query)
 
 
-def test_unsupported_stringfunction():
-    ldf = pl.LazyFrame(
-        {
-            "a": ["a", "b", "cdefg", "h", "Wıth ünιcοde"],  # noqa: RUF001
-            "b": [0, 3, 1, -1, None],
-        }
-    )
-
+def test_unsupported_stringfunction(ldf):
     q = ldf.select(pl.col("a").str.count_matches("e", literal=True))
 
     with pytest.raises(NotImplementedError):
         _ = translate_ir(q._ldf.visit())
+
+
+def test_contains_re_non_strict_raises(ldf):
+    q = ldf.select(pl.col("a").str.contains(".", strict=False))
+
+    with pytest.raises(NotImplementedError):
+        _ = translate_ir(q._ldf.visit())
+
+
+def test_contains_re_non_literal_raises(ldf):
+    q = ldf.select(pl.col("a").str.contains(pl.col("b"), literal=False))
+
+    with pytest.raises(NotImplementedError):
+        _ = translate_ir(q._ldf.visit())
+
+
+@pytest.mark.parametrize(
+    "substr",
+    [
+        "A",
+        "de",
+        ".*",
+        "^a",
+        "^A",
+        "[^a-z]",
+        "[a-z]{3,}",
+        "^[A-Z]{2,}",
+        "j|u",
+    ],
+)
+def test_contains_regex(ldf, substr):
+    query = ldf.select(pl.col("a").str.contains(substr))
+    assert_gpu_result_equal(query)
+
+
+@pytest.mark.parametrize(
+    "literal", ["A", "de", "FGHI", "j", "kLm", "nOPq", "RsT", "uVw"]
+)
+def test_contains_literal(ldf, literal):
+    query = ldf.select(pl.col("a").str.contains(pl.lit(literal), literal=True))
+    assert_gpu_result_equal(query)
+
+
+def test_contains_column(ldf):
+    query = ldf.select(pl.col("a").str.contains(pl.col("a"), literal=True))
+    assert_gpu_result_equal(query)
+
+
+def test_contains_invalid(ldf):
+    query = ldf.select(pl.col("a").str.contains("["))
+
+    with pytest.raises(pl.exceptions.ComputeError):
+        query.collect()
+    with pytest.raises(pl.exceptions.ComputeError):
+        query.collect(post_opt_callback=partial(execute_with_cudf, raise_on_fail=True))
diff --git a/python/cudf_polars/tests/test_string.py b/python/cudf_polars/tests/test_string.py
deleted file mode 100644
index f1a080d040f..00000000000
--- a/python/cudf_polars/tests/test_string.py
+++ /dev/null
@@ -1,61 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
-# SPDX-License-Identifier: Apache-2.0
-from __future__ import annotations
-
-from functools import partial
-
-import pytest
-
-import polars as pl
-
-from cudf_polars.callback import execute_with_cudf
-from cudf_polars.testing.asserts import assert_gpu_result_equal
-
-
-@pytest.fixture
-def ldf():
-    return pl.DataFrame(
-        {"a": ["AbC", "de", "FGHI", "j", "kLm", "nOPq", None, "RsT", None, "uVw"]}
-    ).lazy()
-
-
-@pytest.mark.parametrize(
-    "substr",
-    [
-        "A",
-        "de",
-        ".*",
-        "^a",
-        "^A",
-        "[^a-z]",
-        "[a-z]{3,}",
-        "^[A-Z]{2,}",
-        "j|u",
-    ],
-)
-def test_contains_regex(ldf, substr):
-    query = ldf.select(pl.col("a").str.contains(substr))
-    assert_gpu_result_equal(query)
-
-
-@pytest.mark.parametrize(
-    "literal", ["A", "de", "FGHI", "j", "kLm", "nOPq", "RsT", "uVw"]
-)
-def test_contains_literal(ldf, literal):
-    query = ldf.select(pl.col("a").str.contains(pl.lit(literal), literal=True))
-    assert_gpu_result_equal(query)
-
-
-def test_contains_column(ldf):
-    query = ldf.select(pl.col("a").str.contains(pl.col("a"), literal=True))
-    assert_gpu_result_equal(query)
-
-
-@pytest.mark.parametrize("pat", ["["])
-def test_contains_invalid(ldf, pat):
-    query = ldf.select(pl.col("a").str.contains(pat))
-
-    with pytest.raises(pl.exceptions.ComputeError):
-        query.collect()
-    with pytest.raises(pl.exceptions.ComputeError):
-        query.collect(post_opt_callback=partial(execute_with_cudf, raise_on_fail=True))

From a023d5fd189b52996c00a4b3132171bb3f41a02d Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Mon, 17 Jun 2024 09:31:01 -0500
Subject: [PATCH 372/842] Return `FrozenList` for `Index.names` (#16047)

Fixes: #16046

This PR returns `FrozenList` for `Index.names` instead of `tuple`.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/16047
---
 python/cudf/cudf/core/_base_index.py     | 4 ++--
 python/dask_cudf/dask_cudf/io/parquet.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py
index b29fc475b29..e5945f8860e 100644
--- a/python/cudf/cudf/core/_base_index.py
+++ b/python/cudf/cudf/core/_base_index.py
@@ -342,9 +342,9 @@ def deserialize(cls, header, frames):
     @property
     def names(self):
         """
-        Returns a tuple containing the name of the Index.
+        Returns a FrozenList containing the name of the Index.
         """
-        return (self.name,)
+        return pd.core.indexes.frozen.FrozenList([self.name])
 
     @names.setter
     def names(self, values):
diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py
index ba8b1e89721..810a804e428 100644
--- a/python/dask_cudf/dask_cudf/io/parquet.py
+++ b/python/dask_cudf/dask_cudf/io/parquet.py
@@ -316,7 +316,7 @@ def read_partition(
 
             if index and (index[0] in df.columns):
                 df = df.set_index(index[0])
-            elif index is False and df.index.names != (None,):
+            elif index is False and df.index.names != [None]:
                 # If index=False, we shouldn't have a named index
                 df.reset_index(inplace=True)
 

From 107753ccaacdb62287c4dd4351e5caf3bf8bc62a Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Mon, 17 Jun 2024 15:43:13 +0100
Subject: [PATCH 373/842] Remove mapfunction nodes that don't exist/aren't
 supported (#15991)

We can't correctly implemented merge_sorted to match polars because libcudf's implementation is not stable wrt input order. drop_nulls is no longer implemented as a MapFunction, but instead a boolean filter.

Finally, add coverage of the mapfunctions we do handle.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Thomas Li (https://github.com/lithomas1)

URL: https://github.com/rapidsai/cudf/pull/15991
---
 python/cudf_polars/cudf_polars/dsl/ir.py     | 56 ++++++--------------
 python/cudf_polars/tests/test_mapfunction.py | 43 +++++++++++++++
 2 files changed, 58 insertions(+), 41 deletions(-)
 create mode 100644 python/cudf_polars/tests/test_mapfunction.py

diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py
index 9fb2468e4e9..7f0920e1b57 100644
--- a/python/cudf_polars/cudf_polars/dsl/ir.py
+++ b/python/cudf_polars/cudf_polars/dsl/ir.py
@@ -286,13 +286,18 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         pdf = pl.DataFrame._from_pydf(self.df)
         if self.projection is not None:
             pdf = pdf.select(self.projection)
-        # TODO: goes away when libcudf supports large strings
         table = pdf.to_arrow()
         schema = table.schema
         for i, field in enumerate(schema):
+            # TODO: Nested types
             if field.type == pa.large_string():
-                # TODO: Nested types
+                # TODO: goes away when libcudf supports large strings
                 schema = schema.set(i, pa.field(field.name, pa.string()))
+            elif isinstance(field.type, pa.LargeListType):
+                # TODO: goes away when libcudf supports large lists
+                schema = schema.set(
+                    i, pa.field(field.name, pa.list_(field.type.field(0)))
+                )
         table = table.cast(schema)
         df = DataFrame.from_table(
             plc.interop.from_arrow(table), list(self.schema.keys())
@@ -850,9 +855,11 @@ class MapFunction(IR):
 
     _NAMES: ClassVar[frozenset[str]] = frozenset(
         [
-            "drop_nulls",
             "rechunk",
-            "merge_sorted",
+            # libcudf merge is not stable wrt order of inputs, since
+            # it uses a priority queue to manage the tables it produces.
+            # See: https://github.com/rapidsai/cudf/issues/16010
+            # "merge_sorted",
             "rename",
             "explode",
         ]
@@ -869,46 +876,13 @@ def __post_init__(self) -> None:
                 # polars requires that all to-explode columns have the
                 # same sub-shapes
                 raise NotImplementedError("Explode with more than one column")
-        elif self.name == "merge_sorted":
-            assert isinstance(self.df, Union)
-            (key_column,) = self.options
-            if key_column not in self.df.dfs[0].schema:
-                raise ValueError(f"Key column {key_column} not found")
 
     def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
-        if self.name == "merge_sorted":
-            # merge_sorted operates on Union inputs
-            # but if we evaluate the Union then we can't unpick the
-            # pieces, so we dive inside and evaluate the pieces by hand
-            assert isinstance(self.df, Union)
-            first, *rest = (c.evaluate(cache=cache) for c in self.df.dfs)
-            (key_column,) = self.options
-            if not all(first.column_names == r.column_names for r in rest):
-                raise ValueError("DataFrame shapes/column names don't match")
-            # Already validated that key_column is in column names
-            index = first.column_names.index(key_column)
-            return DataFrame.from_table(
-                plc.merge.merge_sorted(
-                    [first.table, *(df.table for df in rest)],
-                    [index],
-                    [plc.types.Order.ASCENDING],
-                    [plc.types.NullOrder.BEFORE],
-                ),
-                first.column_names,
-            ).sorted_like(first, subset={key_column})
-        elif self.name == "rechunk":
+        if self.name == "rechunk":
             # No-op in our data model
-            return self.df.evaluate(cache=cache)
-        elif self.name == "drop_nulls":
-            df = self.df.evaluate(cache=cache)
-            (subset,) = self.options
-            subset = set(subset)
-            indices = [i for i, name in enumerate(df.column_names) if name in subset]
-            return DataFrame.from_table(
-                plc.stream_compaction.drop_nulls(df.table, indices, len(indices)),
-                df.column_names,
-            ).sorted_like(df)
+            # Don't think this appears in a plan tree from python
+            return self.df.evaluate(cache=cache)  # pragma: no cover
         elif self.name == "rename":
             df = self.df.evaluate(cache=cache)
             # final tag is "swapping" which is useful for the
@@ -924,7 +898,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
                 plc.lists.explode_outer(df.table, index), df.column_names
             ).sorted_like(df, subset=subset)
         else:
-            raise AssertionError("Should never be reached")
+            raise AssertionError("Should never be reached")  # pragma: no cover
 
 
 @dataclasses.dataclass(slots=True)
diff --git a/python/cudf_polars/tests/test_mapfunction.py b/python/cudf_polars/tests/test_mapfunction.py
new file mode 100644
index 00000000000..ec6b3f3fc0a
--- /dev/null
+++ b/python/cudf_polars/tests/test_mapfunction.py
@@ -0,0 +1,43 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import pytest
+
+import polars as pl
+
+from cudf_polars import translate_ir
+from cudf_polars.testing.asserts import assert_gpu_result_equal
+
+
+def test_merge_sorted_raises():
+    df1 = pl.LazyFrame({"a": [1, 6, 9], "b": [1, -10, 4]})
+    df2 = pl.LazyFrame({"a": [-1, 5, 11, 20], "b": [2, 7, -4, None]})
+    df3 = pl.LazyFrame({"a": [-10, 20, 21], "b": [1, 2, 3]})
+
+    q = df1.merge_sorted(df2, key="a").merge_sorted(df3, key="a")
+
+    with pytest.raises(NotImplementedError):
+        _ = translate_ir(q._ldf.visit())
+
+
+def test_explode_multiple_raises():
+    df = pl.LazyFrame({"a": [[1, 2], [3, 4]], "b": [[5, 6], [7, 8]]})
+    q = df.explode("a", "b")
+
+    with pytest.raises(NotImplementedError):
+        _ = translate_ir(q._ldf.visit())
+
+
+@pytest.mark.parametrize("column", ["a", "b"])
+def test_explode_single(column):
+    df = pl.LazyFrame(
+        {
+            "a": [[1, 2], [3, 4], None],
+            "b": [[5, 6], [7, 8], [9, 10]],
+            "c": [None, 11, 12],
+        }
+    )
+    q = df.explode(column)
+
+    assert_gpu_result_equal(q)

From 87f6a7e15bb7d8dc0d8733392567fb647074b2fd Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 17 Jun 2024 06:21:10 -1000
Subject: [PATCH 374/842] Add ruff rules to avoid importing from typing
 (#16040)

Enabled the following ruff rules to update typing annotations according to PEP585 and PEP604

https://docs.astral.sh/ruff/rules/future-rewritable-type-annotation/
https://docs.astral.sh/ruff/rules/non-pep604-annotation/
https://docs.astral.sh/ruff/rules/non-pep585-annotation/

The changes were made by running `pre-commit run ruff --all-files` with `fix = True` and `unsafe-fixes = True` locally

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)
  - Mike Sarahan (https://github.com/msarahan)

URL: https://github.com/rapidsai/cudf/pull/16040
---
 pyproject.toml                                |   2 +-
 python/cudf/cudf/_lib/column.pyi              |  46 ++++----
 python/cudf/cudf/api/types.py                 |   4 +-
 python/cudf/cudf/core/_base_index.py          |   6 +-
 .../cudf/cudf/core/_internals/expressions.py  |  12 +-
 python/cudf/cudf/core/_internals/timezones.py |  19 +--
 python/cudf/cudf/core/_internals/where.py     |  15 ++-
 python/cudf/cudf/core/buffer/buffer.py        |  14 +--
 .../core/buffer/exposure_tracked_buffer.py    |   4 +-
 python/cudf/cudf/core/buffer/spill_manager.py |  20 ++--
 .../cudf/cudf/core/buffer/spillable_buffer.py |  18 +--
 python/cudf/cudf/core/buffer/utils.py         |  20 ++--
 python/cudf/cudf/core/column/categorical.py   |  46 ++++----
 python/cudf/cudf/core/column/column.py        |  94 +++++++--------
 python/cudf/cudf/core/column/datetime.py      |  22 ++--
 python/cudf/cudf/core/column/decimal.py       |   8 +-
 python/cudf/cudf/core/column/lists.py         |  10 +-
 python/cudf/cudf/core/column/methods.py       |   4 +-
 python/cudf/cudf/core/column/numerical.py     |  35 +++---
 .../cudf/cudf/core/column/numerical_base.py   |  16 +--
 python/cudf/cudf/core/column/string.py        | 109 ++++++++----------
 python/cudf/cudf/core/column/timedelta.py     |  18 +--
 python/cudf/cudf/core/column_accessor.py      |  25 ++--
 python/cudf/cudf/core/dataframe.py            |  51 +++-----
 python/cudf/cudf/core/df_protocol.py          |  44 +++----
 python/cudf/cudf/core/dtypes.py               |  27 ++---
 python/cudf/cudf/core/frame.py                |  49 +++-----
 python/cudf/cudf/core/groupby/groupby.py      |  19 +--
 python/cudf/cudf/core/index.py                |  28 ++---
 python/cudf/cudf/core/indexed_frame.py        |  58 ++++------
 python/cudf/cudf/core/indexing_utils.py       |   8 +-
 python/cudf/cudf/core/join/_join_helpers.py   |   6 +-
 python/cudf/cudf/core/join/join.py            |   6 +-
 python/cudf/cudf/core/mixins/binops.pyi       |   6 +-
 python/cudf/cudf/core/mixins/reductions.pyi   |   4 +-
 python/cudf/cudf/core/mixins/scans.pyi        |   4 +-
 python/cudf/cudf/core/multiindex.py           |  18 +--
 python/cudf/cudf/core/reshape.py              |  15 ++-
 python/cudf/cudf/core/series.py               |  30 ++---
 python/cudf/cudf/core/single_column_frame.py  |  12 +-
 python/cudf/cudf/core/subword_tokenizer.py    |   3 +-
 python/cudf/cudf/core/tools/datetimes.py      |  13 ++-
 python/cudf/cudf/core/udf/groupby_typing.py   |   8 +-
 python/cudf/cudf/core/udf/utils.py            |   5 +-
 python/cudf/cudf/io/parquet.py                |  18 +--
 python/cudf/cudf/options.py                   |  11 +-
 python/cudf/cudf/pandas/fast_slow_proxy.py    |  40 +++----
 python/cudf/cudf/pandas/module_accelerator.py |   6 +-
 python/cudf/cudf/pandas/profiler.py           |  12 +-
 .../cudf/cudf/pylibcudf_tests/common/utils.py |   7 +-
 .../test_avro_reader_fastavro_integration.py  |   5 +-
 python/cudf/cudf/tests/test_df_protocol.py    |   5 +-
 python/cudf/cudf/tests/test_spilling.py       |   8 +-
 python/cudf/cudf/utils/applyutils.py          |   5 +-
 python/cudf/cudf/utils/queryutils.py          |   7 +-
 python/cudf/cudf/utils/utils.py               |   4 +-
 .../cudf_pandas_tests/test_fast_slow_proxy.py |   1 +
 python/dask_cudf/dask_cudf/groupby.py         |   4 +-
 58 files changed, 504 insertions(+), 610 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index c602240a0b7..2f59864894b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,7 +26,7 @@ quiet-level = 3
 line-length = 79
 
 [tool.ruff.lint]
-select = ["E", "F", "W", "D201", "D204", "D206", "D207", "D208", "D209", "D210", "D211", "D214", "D215", "D300", "D301", "D403", "D405", "D406", "D407", "D408", "D409", "D410", "D411", "D412", "D414", "D418", "TCH"]
+select = ["E", "F", "W", "D201", "D204", "D206", "D207", "D208", "D209", "D210", "D211", "D214", "D215", "D300", "D301", "D403", "D405", "D406", "D407", "D408", "D409", "D410", "D411", "D412", "D414", "D418", "TCH", "FA", "UP006", "UP007"]
 ignore = [
     # whitespace before :
     "E203",
diff --git a/python/cudf/cudf/_lib/column.pyi b/python/cudf/cudf/_lib/column.pyi
index c667286fc16..bcab009c102 100644
--- a/python/cudf/cudf/_lib/column.pyi
+++ b/python/cudf/cudf/_lib/column.pyi
@@ -2,8 +2,6 @@
 
 from __future__ import annotations
 
-from typing import Dict, Optional, Tuple
-
 from typing_extensions import Self
 
 from cudf._typing import Dtype, DtypeObj, ScalarLike
@@ -11,27 +9,27 @@ from cudf.core.buffer import Buffer
 from cudf.core.column import ColumnBase
 
 class Column:
-    _data: Optional[Buffer]
-    _mask: Optional[Buffer]
-    _base_data: Optional[Buffer]
-    _base_mask: Optional[Buffer]
+    _data: Buffer | None
+    _mask: Buffer | None
+    _base_data: Buffer | None
+    _base_mask: Buffer | None
     _dtype: DtypeObj
     _size: int
     _offset: int
     _null_count: int
-    _children: Tuple[ColumnBase, ...]
-    _base_children: Tuple[ColumnBase, ...]
-    _distinct_count: Dict[bool, int]
+    _children: tuple[ColumnBase, ...]
+    _base_children: tuple[ColumnBase, ...]
+    _distinct_count: dict[bool, int]
 
     def __init__(
         self,
-        data: Optional[Buffer],
+        data: Buffer | None,
         size: int,
         dtype: Dtype,
-        mask: Optional[Buffer] = None,
-        offset: Optional[int] = None,
-        null_count: Optional[int] = None,
-        children: Tuple[ColumnBase, ...] = (),
+        mask: Buffer | None = None,
+        offset: int | None = None,
+        null_count: int | None = None,
+        children: tuple[ColumnBase, ...] = (),
     ) -> None: ...
     @property
     def base_size(self) -> int: ...
@@ -40,9 +38,9 @@ class Column:
     @property
     def size(self) -> int: ...
     @property
-    def base_data(self) -> Optional[Buffer]: ...
+    def base_data(self) -> Buffer | None: ...
     @property
-    def data(self) -> Optional[Buffer]: ...
+    def data(self) -> Buffer | None: ...
     @property
     def data_ptr(self) -> int: ...
     def set_base_data(self, value: Buffer) -> None: ...
@@ -50,25 +48,25 @@ class Column:
     def nullable(self) -> bool: ...
     def has_nulls(self, include_nan: bool = False) -> bool: ...
     @property
-    def base_mask(self) -> Optional[Buffer]: ...
+    def base_mask(self) -> Buffer | None: ...
     @property
-    def mask(self) -> Optional[Buffer]: ...
+    def mask(self) -> Buffer | None: ...
     @property
     def mask_ptr(self) -> int: ...
-    def set_base_mask(self, value: Optional[Buffer]) -> None: ...
-    def set_mask(self, value: Optional[Buffer]) -> Self: ...
+    def set_base_mask(self, value: Buffer | None) -> None: ...
+    def set_mask(self, value: Buffer | None) -> Self: ...
     @property
     def null_count(self) -> int: ...
     @property
     def offset(self) -> int: ...
     @property
-    def base_children(self) -> Tuple[ColumnBase, ...]: ...
+    def base_children(self) -> tuple[ColumnBase, ...]: ...
     @property
-    def children(self) -> Tuple[ColumnBase, ...]: ...
-    def set_base_children(self, value: Tuple[ColumnBase, ...]) -> None: ...
+    def children(self) -> tuple[ColumnBase, ...]: ...
+    def set_base_children(self, value: tuple[ColumnBase, ...]) -> None: ...
     def _mimic_inplace(
         self, other_col: ColumnBase, inplace=False
-    ) -> Optional[Self]: ...
+    ) -> Self | None: ...
 
     # TODO: The val parameter should be Scalar, not ScalarLike
     @staticmethod
diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py
index 42b1524bd76..d97e9c815b6 100644
--- a/python/cudf/cudf/api/types.py
+++ b/python/cudf/cudf/api/types.py
@@ -8,7 +8,7 @@
 from collections import abc
 from functools import wraps
 from inspect import isclass
-from typing import List, Union, cast
+from typing import cast
 
 import cupy as cp
 import numpy as np
@@ -219,7 +219,7 @@ def wrapped_func(obj):
 
 
 def _union_categoricals(
-    to_union: List[Union[cudf.Series, cudf.CategoricalIndex]],
+    to_union: list[cudf.Series | cudf.CategoricalIndex],
     sort_categories: bool = False,
     ignore_order: bool = False,
 ):
diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py
index e5945f8860e..e71e45e410e 100644
--- a/python/cudf/cudf/core/_base_index.py
+++ b/python/cudf/cudf/core/_base_index.py
@@ -5,7 +5,7 @@
 import pickle
 import warnings
 from functools import cached_property
-from typing import TYPE_CHECKING, Any, Literal, Set, Tuple
+from typing import TYPE_CHECKING, Any, Literal
 
 import pandas as pd
 from typing_extensions import Self
@@ -44,11 +44,11 @@
 class BaseIndex(Serializable):
     """Base class for all cudf Index types."""
 
-    _accessors: Set[Any] = set()
+    _accessors: set[Any] = set()
     _data: ColumnAccessor
 
     @property
-    def _columns(self) -> Tuple[Any, ...]:
+    def _columns(self) -> tuple[Any, ...]:
         raise NotImplementedError
 
     @cached_property
diff --git a/python/cudf/cudf/core/_internals/expressions.py b/python/cudf/cudf/core/_internals/expressions.py
index 5cb9f0363e0..393a68dd844 100644
--- a/python/cudf/cudf/core/_internals/expressions.py
+++ b/python/cudf/cudf/core/_internals/expressions.py
@@ -1,8 +1,8 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
+from __future__ import annotations
 
 import ast
 import functools
-from typing import List, Tuple
 
 from cudf._lib.expressions import (
     ASTOperator,
@@ -98,9 +98,9 @@ class libcudfASTVisitor(ast.NodeVisitor):
         The column names used to map the names in an expression.
     """
 
-    def __init__(self, col_names: Tuple[str]):
-        self.stack: List[Expression] = []
-        self.nodes: List[Expression] = []
+    def __init__(self, col_names: tuple[str]):
+        self.stack: list[Expression] = []
+        self.nodes: list[Expression] = []
         self.col_names = col_names
 
     @property
@@ -218,7 +218,7 @@ def visit_Call(self, node):
 
 
 @functools.lru_cache(256)
-def parse_expression(expr: str, col_names: Tuple[str]):
+def parse_expression(expr: str, col_names: tuple[str]):
     visitor = libcudfASTVisitor(col_names)
     visitor.visit(ast.parse(expr))
     return visitor
diff --git a/python/cudf/cudf/core/_internals/timezones.py b/python/cudf/cudf/core/_internals/timezones.py
index f04cae719c2..269fcf3e37f 100644
--- a/python/cudf/cudf/core/_internals/timezones.py
+++ b/python/cudf/cudf/core/_internals/timezones.py
@@ -1,20 +1,23 @@
 # Copyright (c) 2023-2024, NVIDIA CORPORATION.
+from __future__ import annotations
 
 import os
 import zoneinfo
 from functools import lru_cache
-from typing import Literal, Tuple
+from typing import TYPE_CHECKING, Literal
 
 import numpy as np
 
 from cudf._lib.timezone import make_timezone_transition_table
 from cudf.core.column.column import as_column
-from cudf.core.column.datetime import DatetimeColumn
-from cudf.core.column.timedelta import TimeDeltaColumn
+
+if TYPE_CHECKING:
+    from cudf.core.column.datetime import DatetimeColumn
+    from cudf.core.column.timedelta import TimeDeltaColumn
 
 
 @lru_cache(maxsize=20)
-def get_tz_data(zone_name: str) -> Tuple[DatetimeColumn, TimeDeltaColumn]:
+def get_tz_data(zone_name: str) -> tuple[DatetimeColumn, TimeDeltaColumn]:
     """
     Return timezone data (transition times and UTC offsets) for the
     given IANA time zone.
@@ -40,7 +43,7 @@ def get_tz_data(zone_name: str) -> Tuple[DatetimeColumn, TimeDeltaColumn]:
 
 def _find_and_read_tzfile_tzpath(
     zone_name: str,
-) -> Tuple[DatetimeColumn, TimeDeltaColumn]:
+) -> tuple[DatetimeColumn, TimeDeltaColumn]:
     for search_path in zoneinfo.TZPATH:
         if os.path.isfile(os.path.join(search_path, zone_name)):
             return _read_tzfile_as_columns(search_path, zone_name)
@@ -49,7 +52,7 @@ def _find_and_read_tzfile_tzpath(
 
 def _find_and_read_tzfile_tzdata(
     zone_name: str,
-) -> Tuple[DatetimeColumn, TimeDeltaColumn]:
+) -> tuple[DatetimeColumn, TimeDeltaColumn]:
     import importlib.resources
 
     package_base = "tzdata.zoneinfo"
@@ -78,7 +81,7 @@ def _find_and_read_tzfile_tzdata(
 
 def _read_tzfile_as_columns(
     tzdir, zone_name: str
-) -> Tuple[DatetimeColumn, TimeDeltaColumn]:
+) -> tuple[DatetimeColumn, TimeDeltaColumn]:
     transition_times_and_offsets = make_timezone_transition_table(
         tzdir, zone_name
     )
@@ -92,7 +95,7 @@ def _read_tzfile_as_columns(
 
 def check_ambiguous_and_nonexistent(
     ambiguous: Literal["NaT"], nonexistent: Literal["NaT"]
-) -> Tuple[Literal["NaT"], Literal["NaT"]]:
+) -> tuple[Literal["NaT"], Literal["NaT"]]:
     if ambiguous != "NaT":
         raise NotImplementedError(
             "Only ambiguous='NaT' is currently supported"
diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py
index ef6b10f66c1..44ce0ddef25 100644
--- a/python/cudf/cudf/core/_internals/where.py
+++ b/python/cudf/cudf/core/_internals/where.py
@@ -1,18 +1,17 @@
-# Copyright (c) 2021-2023, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
+from __future__ import annotations
 
 import warnings
-from typing import Tuple, Union
+from typing import TYPE_CHECKING
 
 import numpy as np
 
 import cudf
-from cudf._typing import ScalarLike
 from cudf.api.types import (
     _is_non_decimal_numeric_dtype,
     is_bool_dtype,
     is_scalar,
 )
-from cudf.core.column import ColumnBase
 from cudf.core.dtypes import CategoricalDtype
 from cudf.utils.dtypes import (
     _can_cast,
@@ -21,6 +20,10 @@
     is_mixed_with_object_dtype,
 )
 
+if TYPE_CHECKING:
+    from cudf._typing import ScalarLike
+    from cudf.core.column import ColumnBase
+
 
 def _normalize_categorical(input_col, other):
     if isinstance(input_col, cudf.core.column.CategoricalColumn):
@@ -41,9 +44,9 @@ def _normalize_categorical(input_col, other):
 
 def _check_and_cast_columns_with_other(
     source_col: ColumnBase,
-    other: Union[ScalarLike, ColumnBase],
+    other: ScalarLike | ColumnBase,
     inplace: bool,
-) -> Tuple[ColumnBase, Union[ScalarLike, ColumnBase]]:
+) -> tuple[ColumnBase, ScalarLike | ColumnBase]:
     # Returns type-casted `source_col` & `other` based on `inplace`.
     source_dtype = source_col.dtype
     if isinstance(source_dtype, CategoricalDtype):
diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py
index bf6f9f1a3c1..80dbbe4c048 100644
--- a/python/cudf/cudf/core/buffer/buffer.py
+++ b/python/cudf/cudf/core/buffer/buffer.py
@@ -6,7 +6,7 @@
 import pickle
 import weakref
 from types import SimpleNamespace
-from typing import Any, Dict, Literal, Mapping, Optional, Tuple
+from typing import Any, Literal, Mapping
 
 import numpy
 from typing_extensions import Self
@@ -42,7 +42,7 @@ def host_memory_allocation(nbytes: int) -> memoryview:
 def cuda_array_interface_wrapper(
     ptr: int,
     size: int,
-    owner: Optional[object] = None,
+    owner: object | None = None,
     readonly=False,
     typestr="|u1",
     version=0,
@@ -278,7 +278,7 @@ def get_ptr(self, *, mode: Literal["read", "write"]) -> int:
         return self._ptr
 
     def memoryview(
-        self, *, offset: int = 0, size: Optional[int] = None
+        self, *, offset: int = 0, size: int | None = None
     ) -> memoryview:
         """Read-only access to the buffer through host memory."""
         size = self._size if size is None else size
@@ -319,7 +319,7 @@ def __init__(
         *,
         owner: BufferOwner,
         offset: int = 0,
-        size: Optional[int] = None,
+        size: int | None = None,
     ) -> None:
         size = owner.size if size is None else size
         if size < 0:
@@ -414,7 +414,7 @@ def __cuda_array_interface__(self) -> Mapping:
             "version": 0,
         }
 
-    def serialize(self) -> Tuple[dict, list]:
+    def serialize(self) -> tuple[dict, list]:
         """Serialize the buffer into header and frames.
 
         The frames can be a mixture of memoryview, Buffer, and BufferOwner
@@ -427,7 +427,7 @@ def serialize(self) -> Tuple[dict, list]:
             serializable metadata required to reconstruct the object. The
             second element is a list containing single frame.
         """
-        header: Dict[str, Any] = {}
+        header: dict[str, Any] = {}
         header["type-serialized"] = pickle.dumps(type(self))
         header["owner-type-serialized"] = pickle.dumps(type(self._owner))
         header["frame_count"] = 1
@@ -480,7 +480,7 @@ def __str__(self) -> str:
         )
 
 
-def get_ptr_and_size(array_interface: Mapping) -> Tuple[int, int]:
+def get_ptr_and_size(array_interface: Mapping) -> tuple[int, int]:
     """Retrieve the pointer and size from an array interface.
 
     Raises ValueError if array isn't C-contiguous.
diff --git a/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py b/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py
index 15f00fc670d..0bd8d6054b3 100644
--- a/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py
+++ b/python/cudf/cudf/core/buffer/exposure_tracked_buffer.py
@@ -2,7 +2,7 @@
 
 from __future__ import annotations
 
-from typing import Literal, Mapping, Optional
+from typing import Literal, Mapping
 
 from typing_extensions import Self
 
@@ -27,7 +27,7 @@ def __init__(
         self,
         owner: BufferOwner,
         offset: int = 0,
-        size: Optional[int] = None,
+        size: int | None = None,
     ) -> None:
         super().__init__(owner=owner, offset=offset, size=size)
         self.owner._slices.add(self)
diff --git a/python/cudf/cudf/core/buffer/spill_manager.py b/python/cudf/cudf/core/buffer/spill_manager.py
index 7bcf97302aa..762cd7f9e86 100644
--- a/python/cudf/cudf/core/buffer/spill_manager.py
+++ b/python/cudf/cudf/core/buffer/spill_manager.py
@@ -13,7 +13,7 @@
 from contextlib import contextmanager
 from dataclasses import dataclass
 from functools import partial
-from typing import TYPE_CHECKING, Dict, List, Optional, Tuple
+from typing import TYPE_CHECKING
 
 import rmm.mr
 
@@ -39,7 +39,7 @@ def get_traceback() -> str:
 
 def get_rmm_memory_resource_stack(
     mr: rmm.mr.DeviceMemoryResource,
-) -> List[rmm.mr.DeviceMemoryResource]:
+) -> list[rmm.mr.DeviceMemoryResource]:
     """Get the RMM resource stack
 
     Parameters
@@ -99,14 +99,14 @@ class Expose:
         total_nbytes: int = 0
         spilled_nbytes: int = 0
 
-    spill_totals: Dict[Tuple[str, str], Tuple[int, float]]
+    spill_totals: dict[tuple[str, str], tuple[int, float]]
 
     def __init__(self, level) -> None:
         self.lock = threading.Lock()
         self.level = level
         self.spill_totals = defaultdict(lambda: (0, 0))
         # Maps each traceback to a Expose
-        self.exposes: Dict[str, SpillStatistics.Expose] = {}
+        self.exposes: dict[str, SpillStatistics.Expose] = {}
 
     def log_spill(self, src: str, dst: str, nbytes: int, time: float) -> None:
         """Log a (un-)spilling event
@@ -227,7 +227,7 @@ class SpillManager:
     def __init__(
         self,
         *,
-        device_memory_limit: Optional[int] = None,
+        device_memory_limit: int | None = None,
         statistic_level: int = 0,
     ) -> None:
         self._lock = threading.Lock()
@@ -298,7 +298,7 @@ def add(self, buffer: SpillableBufferOwner) -> None:
 
     def buffers(
         self, order_by_access_time: bool = False
-    ) -> Tuple[SpillableBufferOwner, ...]:
+    ) -> tuple[SpillableBufferOwner, ...]:
         """Get all managed buffers
 
         Parameters
@@ -347,7 +347,7 @@ def spill_device_memory(self, nbytes: int) -> int:
                     buf.lock.release()
         return spilled
 
-    def spill_to_device_limit(self, device_limit: Optional[int] = None) -> int:
+    def spill_to_device_limit(self, device_limit: int | None = None) -> int:
         """Try to spill device memory until device limit
 
         Notice, by default this is a no-op.
@@ -402,10 +402,10 @@ def __repr__(self) -> str:
 #   - Initialized to None (spilling disabled)
 #   - Initialized to a SpillManager instance (spilling enabled)
 _global_manager_uninitialized: bool = True
-_global_manager: Optional[SpillManager] = None
+_global_manager: SpillManager | None = None
 
 
-def set_global_manager(manager: Optional[SpillManager]) -> None:
+def set_global_manager(manager: SpillManager | None) -> None:
     """Set the global manager, which if None disables spilling"""
 
     global _global_manager, _global_manager_uninitialized
@@ -419,7 +419,7 @@ def set_global_manager(manager: Optional[SpillManager]) -> None:
     _global_manager_uninitialized = False
 
 
-def get_global_manager() -> Optional[SpillManager]:
+def get_global_manager() -> SpillManager | None:
     """Get the global manager or None if spilling is disabled"""
     global _global_manager_uninitialized
     if _global_manager_uninitialized:
diff --git a/python/cudf/cudf/core/buffer/spillable_buffer.py b/python/cudf/cudf/core/buffer/spillable_buffer.py
index 49258fea9ab..eb57a371965 100644
--- a/python/cudf/cudf/core/buffer/spillable_buffer.py
+++ b/python/cudf/cudf/core/buffer/spillable_buffer.py
@@ -7,7 +7,7 @@
 import time
 import weakref
 from threading import RLock
-from typing import TYPE_CHECKING, Any, Dict, List, Literal, Optional, Tuple
+from typing import TYPE_CHECKING, Any, Literal
 
 import numpy
 from typing_extensions import Self
@@ -88,10 +88,10 @@ class SpillableBufferOwner(BufferOwner):
     lock: RLock
     _spill_locks: weakref.WeakSet
     _last_accessed: float
-    _ptr_desc: Dict[str, Any]
+    _ptr_desc: dict[str, Any]
     _manager: SpillManager
 
-    def _finalize_init(self, ptr_desc: Dict[str, Any]) -> None:
+    def _finalize_init(self, ptr_desc: dict[str, Any]) -> None:
         """Finish initialization of the spillable buffer
 
         This implements the common initialization that `from_device_memory`
@@ -297,7 +297,7 @@ def get_ptr(self, *, mode: Literal["read", "write"]) -> int:
             self._last_accessed = time.monotonic()
         return self._ptr
 
-    def memory_info(self) -> Tuple[int, int, str]:
+    def memory_info(self) -> tuple[int, int, str]:
         """Get pointer, size, and device type of this buffer.
 
         Warning, it is not safe to access the pointer value without
@@ -341,7 +341,7 @@ def __cuda_array_interface__(self) -> dict:
         }
 
     def memoryview(
-        self, *, offset: int = 0, size: Optional[int] = None
+        self, *, offset: int = 0, size: int | None = None
     ) -> memoryview:
         size = self._size if size is None else size
         with self.lock:
@@ -388,11 +388,11 @@ def spillable(self) -> bool:
     def spill_lock(self, spill_lock: SpillLock) -> None:
         self._owner.spill_lock(spill_lock=spill_lock)
 
-    def memory_info(self) -> Tuple[int, int, str]:
+    def memory_info(self) -> tuple[int, int, str]:
         (ptr, _, device_type) = self._owner.memory_info()
         return (ptr + self._offset, self.nbytes, device_type)
 
-    def serialize(self) -> Tuple[dict, list]:
+    def serialize(self) -> tuple[dict, list]:
         """Serialize the Buffer
 
         Normally, we would use `[self]` as the frames. This would work but
@@ -411,8 +411,8 @@ def serialize(self) -> Tuple[dict, list]:
         given to `.deserialize()`, otherwise we would have a `Buffer` pointing
         to memory already owned by an existing `SpillableBufferOwner`.
         """
-        header: Dict[str, Any] = {}
-        frames: List[Buffer | memoryview]
+        header: dict[str, Any] = {}
+        frames: list[Buffer | memoryview]
         with self._owner.lock:
             header["type-serialized"] = pickle.dumps(self.__class__)
             header["owner-type-serialized"] = pickle.dumps(type(self._owner))
diff --git a/python/cudf/cudf/core/buffer/utils.py b/python/cudf/cudf/core/buffer/utils.py
index 3346d05ed4a..42a1501c914 100644
--- a/python/cudf/cudf/core/buffer/utils.py
+++ b/python/cudf/cudf/core/buffer/utils.py
@@ -4,7 +4,7 @@
 
 import threading
 from contextlib import ContextDecorator
-from typing import Any, Dict, Optional, Tuple, Type, Union
+from typing import Any
 
 from cudf.core.buffer.buffer import (
     Buffer,
@@ -22,7 +22,7 @@
 from cudf.options import get_option
 
 
-def get_buffer_owner(data: Any) -> Optional[BufferOwner]:
+def get_buffer_owner(data: Any) -> BufferOwner | None:
     """Get the owner of `data`, if one exists
 
     Search through the stack of data owners in order to find an
@@ -47,10 +47,10 @@ def get_buffer_owner(data: Any) -> Optional[BufferOwner]:
 
 
 def as_buffer(
-    data: Union[int, Any],
+    data: int | Any,
     *,
-    size: Optional[int] = None,
-    owner: Optional[object] = None,
+    size: int | None = None,
+    owner: object | None = None,
     exposed: bool = False,
 ) -> Buffer:
     """Factory function to wrap `data` in a Buffer object.
@@ -117,8 +117,8 @@ def as_buffer(
         )
 
     # Find the buffer types to return based on the current config
-    owner_class: Type[BufferOwner]
-    buffer_class: Type[Buffer]
+    owner_class: type[BufferOwner]
+    buffer_class: type[Buffer]
     if get_global_manager() is not None:
         owner_class = SpillableBufferOwner
         buffer_class = SpillableBuffer
@@ -161,7 +161,7 @@ def as_buffer(
     return buffer_class(owner=owner, offset=ptr - base_ptr, size=size)
 
 
-_thread_spill_locks: Dict[int, Tuple[Optional[SpillLock], int]] = {}
+_thread_spill_locks: dict[int, tuple[SpillLock | None, int]] = {}
 
 
 def _push_thread_spill_lock() -> None:
@@ -193,7 +193,7 @@ class acquire_spill_lock(ContextDecorator):
     pushing and popping from `_thread_spill_locks` using its thread ID.
     """
 
-    def __enter__(self) -> Optional[SpillLock]:
+    def __enter__(self) -> SpillLock | None:
         _push_thread_spill_lock()
         return get_spill_lock()
 
@@ -201,7 +201,7 @@ def __exit__(self, *exc):
         _pop_thread_spill_lock()
 
 
-def get_spill_lock() -> Union[SpillLock, None]:
+def get_spill_lock() -> SpillLock | None:
     """Return a spill lock within the context of `acquire_spill_lock` or None
 
     Returns None, if spilling is disabled.
diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index 97c2ce5cf1f..f538180805b 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -4,7 +4,7 @@
 
 import warnings
 from functools import cached_property
-from typing import TYPE_CHECKING, Any, Mapping, Optional, Sequence, Tuple, cast
+from typing import TYPE_CHECKING, Any, Mapping, Sequence, cast
 
 import numpy as np
 import pandas as pd
@@ -139,7 +139,7 @@ def ordered(self) -> bool:
         """
         return self._column.ordered
 
-    def as_ordered(self) -> Optional[SeriesOrIndex]:
+    def as_ordered(self) -> SeriesOrIndex | None:
         """
         Set the Categorical to be ordered.
 
@@ -175,7 +175,7 @@ def as_ordered(self) -> Optional[SeriesOrIndex]:
         """
         return self._return_or_inplace(self._column.as_ordered(ordered=True))
 
-    def as_unordered(self) -> Optional[SeriesOrIndex]:
+    def as_unordered(self) -> SeriesOrIndex | None:
         """
         Set the Categorical to be unordered.
 
@@ -222,7 +222,7 @@ def as_unordered(self) -> Optional[SeriesOrIndex]:
         """
         return self._return_or_inplace(self._column.as_ordered(ordered=False))
 
-    def add_categories(self, new_categories: Any) -> Optional[SeriesOrIndex]:
+    def add_categories(self, new_categories: Any) -> SeriesOrIndex | None:
         """
         Add new categories.
 
@@ -294,7 +294,7 @@ def add_categories(self, new_categories: Any) -> Optional[SeriesOrIndex]:
     def remove_categories(
         self,
         removals: Any,
-    ) -> Optional[SeriesOrIndex]:
+    ) -> SeriesOrIndex | None:
         """
         Remove the specified categories.
 
@@ -370,7 +370,7 @@ def set_categories(
         new_categories: Any,
         ordered: bool = False,
         rename: bool = False,
-    ) -> Optional[SeriesOrIndex]:
+    ) -> SeriesOrIndex | None:
         """
         Set the categories to the specified new_categories.
 
@@ -443,7 +443,7 @@ def reorder_categories(
         self,
         new_categories: Any,
         ordered: bool = False,
-    ) -> Optional[SeriesOrIndex]:
+    ) -> SeriesOrIndex | None:
         """
         Reorder categories as specified in new_categories.
 
@@ -521,8 +521,8 @@ class CategoricalColumn(column.ColumnBase):
     """
 
     dtype: cudf.core.dtypes.CategoricalDtype
-    _codes: Optional[NumericalColumn]
-    _children: Tuple[NumericalColumn]
+    _codes: NumericalColumn | None
+    _children: tuple[NumericalColumn]
     _VALID_REDUCTIONS = {
         "max",
         "min",
@@ -539,11 +539,11 @@ class CategoricalColumn(column.ColumnBase):
     def __init__(
         self,
         dtype: CategoricalDtype,
-        mask: Optional[Buffer] = None,
-        size: Optional[int] = None,
+        mask: Buffer | None = None,
+        size: int | None = None,
         offset: int = 0,
-        null_count: Optional[int] = None,
-        children: Tuple["column.ColumnBase", ...] = (),
+        null_count: int | None = None,
+        children: tuple["column.ColumnBase", ...] = (),
     ):
         if size is None:
             for child in children:
@@ -590,23 +590,23 @@ def set_base_data(self, value):
 
     def _process_values_for_isin(
         self, values: Sequence
-    ) -> Tuple[ColumnBase, ColumnBase]:
+    ) -> tuple[ColumnBase, ColumnBase]:
         lhs = self
         # We need to convert values to same type as self,
         # hence passing dtype=self.dtype
         rhs = cudf.core.column.as_column(values, dtype=self.dtype)
         return lhs, rhs
 
-    def set_base_mask(self, value: Optional[Buffer]):
+    def set_base_mask(self, value: Buffer | None):
         super().set_base_mask(value)
         self._codes = None
 
-    def set_base_children(self, value: Tuple[ColumnBase, ...]):
+    def set_base_children(self, value: tuple[ColumnBase, ...]):
         super().set_base_children(value)
         self._codes = None
 
     @property
-    def children(self) -> Tuple[NumericalColumn]:
+    def children(self) -> tuple[NumericalColumn]:
         if self._children is None:
             codes_column = self.base_children[0]
             start = self.offset * codes_column.dtype.itemsize
@@ -693,9 +693,7 @@ def _fill(
         libcudf.filling.fill_in_place(result.codes, begin, end, fill_scalar)
         return result
 
-    def slice(
-        self, start: int, stop: int, stride: Optional[int] = None
-    ) -> Self:
+    def slice(self, start: int, stop: int, stride: int | None = None) -> Self:
         codes = self.codes.slice(start, stop, stride)
         return cast(
             Self,
@@ -714,7 +712,7 @@ def slice(
     def _reduce(
         self,
         op: str,
-        skipna: Optional[bool] = None,
+        skipna: bool | None = None,
         min_count: int = 0,
         *args,
         **kwargs,
@@ -1073,7 +1071,7 @@ def notnull(self) -> ColumnBase:
     def fillna(
         self,
         fill_value: Any = None,
-        method: Optional[str] = None,
+        method: str | None = None,
     ) -> Self:
         """
         Fill null values with *fill_value*
@@ -1207,7 +1205,7 @@ def memory_usage(self) -> int:
 
     def _mimic_inplace(
         self, other_col: ColumnBase, inplace: bool = False
-    ) -> Optional[Self]:
+    ) -> Self | None:
         out = super()._mimic_inplace(other_col, inplace=inplace)
         if inplace and isinstance(other_col, CategoricalColumn):
             self._codes = other_col._codes
@@ -1468,7 +1466,7 @@ def _create_empty_categorical_column(
 
 
 def pandas_categorical_as_column(
-    categorical: ColumnLike, codes: Optional[ColumnLike] = None
+    categorical: ColumnLike, codes: ColumnLike | None = None
 ) -> CategoricalColumn:
     """Creates a CategoricalColumn from a pandas.Categorical
 
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index dc937dc0469..c4e715aeb45 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -7,19 +7,7 @@
 from functools import cached_property
 from itertools import chain
 from types import SimpleNamespace
-from typing import (
-    TYPE_CHECKING,
-    Any,
-    Dict,
-    List,
-    Literal,
-    MutableSequence,
-    Optional,
-    Sequence,
-    Tuple,
-    Union,
-    cast,
-)
+from typing import TYPE_CHECKING, Any, Literal, MutableSequence, Sequence, cast
 
 import cupy
 import numpy as np
@@ -394,7 +382,7 @@ def _fill(
         begin: int,
         end: int,
         inplace: bool = False,
-    ) -> Optional[Self]:
+    ) -> Self | None:
         if end <= begin or begin >= self.size:
             return self if inplace else self.copy()
 
@@ -532,9 +520,7 @@ def element_indexing(self, index: int):
             raise IndexError("single positional indexer is out-of-bounds")
         return libcudf.copying.get_element(self, idx).value
 
-    def slice(
-        self, start: int, stop: int, stride: Optional[int] = None
-    ) -> Self:
+    def slice(self, start: int, stop: int, stride: int | None = None) -> Self:
         stride = 1 if stride is None else stride
         if start < 0:
             start = start + len(self)
@@ -570,7 +556,7 @@ def __setitem__(self, key: Any, value: Any):
             else as_column(value, dtype=self.dtype)
         )
 
-        out: Optional[ColumnBase]  # If None, no need to perform mimic inplace.
+        out: ColumnBase | None  # If None, no need to perform mimic inplace.
         if isinstance(key, slice):
             out = self._scatter_by_slice(key, value_normalized)
         else:
@@ -593,8 +579,8 @@ def _wrap_binop_normalization(self, other):
     def _scatter_by_slice(
         self,
         key: builtins.slice,
-        value: Union[cudf.core.scalar.Scalar, ColumnBase],
-    ) -> Optional[Self]:
+        value: cudf.core.scalar.Scalar | ColumnBase,
+    ) -> Self | None:
         """If this function returns None, it's either a no-op (slice is empty),
         or the inplace replacement is already performed (fill-in-place).
         """
@@ -630,7 +616,7 @@ def _scatter_by_slice(
     def _scatter_by_column(
         self,
         key: cudf.core.column.NumericalColumn,
-        value: Union[cudf.core.scalar.Scalar, ColumnBase],
+        value: cudf.core.scalar.Scalar | ColumnBase,
     ) -> Self:
         if is_bool_dtype(key.dtype):
             # `key` is boolean mask
@@ -667,7 +653,7 @@ def _scatter_by_column(
             ]._with_type_metadata(self.dtype)
 
     def _check_scatter_key_length(
-        self, num_keys: int, value: Union[cudf.core.scalar.Scalar, ColumnBase]
+        self, num_keys: int, value: cudf.core.scalar.Scalar | ColumnBase
     ) -> None:
         """`num_keys` is the number of keys to scatter. Should equal to the
         number of rows in ``value`` if ``value`` is a column.
@@ -682,7 +668,7 @@ def _check_scatter_key_length(
     def fillna(
         self,
         fill_value: Any = None,
-        method: Optional[str] = None,
+        method: str | None = None,
     ) -> Self:
         """Fill null values with ``value``.
 
@@ -740,7 +726,7 @@ def indices_of(
             [as_column(range(0, len(self)), dtype=size_type_dtype)], mask
         )[0]
 
-    def _find_first_and_last(self, value: ScalarLike) -> Tuple[int, int]:
+    def _find_first_and_last(self, value: ScalarLike) -> tuple[int, int]:
         indices = self.indices_of(value)
         if n := len(indices):
             return (
@@ -856,7 +842,7 @@ def isin(self, values: Sequence) -> ColumnBase:
 
     def _process_values_for_isin(
         self, values: Sequence
-    ) -> Tuple[ColumnBase, ColumnBase]:
+    ) -> tuple[ColumnBase, ColumnBase]:
         """
         Helper function for `isin` which pre-process `values` based on `self`.
         """
@@ -868,7 +854,7 @@ def _process_values_for_isin(
             rhs = rhs.astype(lhs.dtype)
         return lhs, rhs
 
-    def _isin_earlystop(self, rhs: ColumnBase) -> Union[ColumnBase, None]:
+    def _isin_earlystop(self, rhs: ColumnBase) -> ColumnBase | None:
         """
         Helper function for `isin` which determines possibility of
         early-stopping or not.
@@ -1070,7 +1056,7 @@ def as_string_column(
 
     def as_decimal_column(
         self, dtype: Dtype
-    ) -> Union["cudf.core.column.decimal.DecimalBaseColumn"]:
+    ) -> "cudf.core.column.decimal.DecimalBaseColumn":
         raise NotImplementedError
 
     def apply_boolean_mask(self, mask) -> ColumnBase:
@@ -1154,7 +1140,7 @@ def unique(self) -> ColumnBase:
             self.dtype
         )
 
-    def serialize(self) -> Tuple[dict, list]:
+    def serialize(self) -> tuple[dict, list]:
         # data model:
 
         # Serialization produces a nested metadata "header" and a flattened
@@ -1167,7 +1153,7 @@ def serialize(self) -> Tuple[dict, list]:
         # cudf native or foreign some special-casing is required here for
         # serialization.
 
-        header: Dict[Any, Any] = {}
+        header: dict[Any, Any] = {}
         frames = []
         header["type-serialized"] = pickle.dumps(type(self))
         try:
@@ -1200,7 +1186,7 @@ def serialize(self) -> Tuple[dict, list]:
 
     @classmethod
     def deserialize(cls, header: dict, frames: list) -> ColumnBase:
-        def unpack(header, frames) -> Tuple[Any, list]:
+        def unpack(header, frames) -> tuple[Any, list]:
             count = header["frame_count"]
             klass = pickle.loads(header["type-serialized"])
             obj = klass.deserialize(header, frames[:count])
@@ -1247,13 +1233,13 @@ def nans_to_nulls(self: Self) -> Self:
 
     def normalize_binop_value(
         self, other: ScalarLike
-    ) -> Union[ColumnBase, ScalarLike]:
+    ) -> ColumnBase | ScalarLike:
         raise NotImplementedError
 
     def _reduce(
         self,
         op: str,
-        skipna: Optional[bool] = None,
+        skipna: bool | None = None,
         min_count: int = 0,
         *args,
         **kwargs,
@@ -1274,8 +1260,8 @@ def _reduce(
         return preprocessed
 
     def _process_for_reduction(
-        self, skipna: Optional[bool] = None, min_count: int = 0
-    ) -> Union[ColumnBase, ScalarLike]:
+        self, skipna: bool | None = None, min_count: int = 0
+    ) -> ColumnBase | ScalarLike:
         if skipna is None:
             skipna = True
 
@@ -1315,8 +1301,8 @@ def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase:
     def _label_encoding(
         self,
         cats: ColumnBase,
-        dtype: Optional[Dtype] = None,
-        na_sentinel: Optional[ScalarLike] = None,
+        dtype: Dtype | None = None,
+        na_sentinel: ScalarLike | None = None,
     ):
         """
         Convert each value in `self` into an integer code, with `cats`
@@ -1389,9 +1375,9 @@ def _return_sentinel_column():
 
 def column_empty_like(
     column: ColumnBase,
-    dtype: Optional[Dtype] = None,
+    dtype: Dtype | None = None,
     masked: bool = False,
-    newsize: Optional[int] = None,
+    newsize: int | None = None,
 ) -> ColumnBase:
     """Allocate a new column like the given *column*"""
     if dtype is None:
@@ -1446,7 +1432,7 @@ def column_empty(
 ) -> ColumnBase:
     """Allocate a new column like the given row_count and dtype."""
     dtype = cudf.dtype(dtype)
-    children = ()  # type: Tuple[ColumnBase, ...]
+    children: tuple[ColumnBase, ...] = ()
 
     if isinstance(dtype, StructDtype):
         data = None
@@ -1496,14 +1482,14 @@ def column_empty(
 
 
 def build_column(
-    data: Union[Buffer, None],
+    data: Buffer | None,
     dtype: Dtype,
     *,
-    size: Optional[int] = None,
-    mask: Optional[Buffer] = None,
+    size: int | None = None,
+    mask: Buffer | None = None,
     offset: int = 0,
-    null_count: Optional[int] = None,
-    children: Tuple[ColumnBase, ...] = (),
+    null_count: int | None = None,
+    children: tuple[ColumnBase, ...] = (),
 ) -> ColumnBase:
     """
     Build a Column of the appropriate type from the given parameters
@@ -1665,10 +1651,10 @@ def build_column(
 def build_categorical_column(
     categories: ColumnBase,
     codes: ColumnBase,
-    mask: Optional[Buffer] = None,
-    size: Optional[int] = None,
+    mask: Buffer | None = None,
+    size: int | None = None,
     offset: int = 0,
-    null_count: Optional[int] = None,
+    null_count: int | None = None,
     ordered: bool = False,
 ) -> "cudf.core.column.CategoricalColumn":
     """
@@ -1715,7 +1701,7 @@ def check_invalid_array(shape: tuple, dtype):
         raise TypeError("Unsupported type float16")
 
 
-def as_memoryview(arbitrary: Any) -> Optional[memoryview]:
+def as_memoryview(arbitrary: Any) -> memoryview | None:
     try:
         return memoryview(arbitrary)
     except TypeError:
@@ -1724,9 +1710,9 @@ def as_memoryview(arbitrary: Any) -> Optional[memoryview]:
 
 def as_column(
     arbitrary: Any,
-    nan_as_null: Optional[bool] = None,
-    dtype: Optional[Dtype] = None,
-    length: Optional[int] = None,
+    nan_as_null: bool | None = None,
+    dtype: Dtype | None = None,
+    length: int | None = None,
 ):
     """Create a Column from an arbitrary object
 
@@ -2199,7 +2185,7 @@ def _mask_from_cuda_array_interface_desc(obj, cai_mask) -> Buffer:
         raise NotImplementedError(f"Cannot infer mask from typestr {typestr}")
 
 
-def serialize_columns(columns: list[ColumnBase]) -> Tuple[List[dict], List]:
+def serialize_columns(columns: list[ColumnBase]) -> tuple[list[dict], list]:
     """
     Return the headers and frames resulting
     from serializing a list of Column
@@ -2216,7 +2202,7 @@ def serialize_columns(columns: list[ColumnBase]) -> Tuple[List[dict], List]:
     frames : list
         list of frames
     """
-    headers: List[Dict[Any, Any]] = []
+    headers: list[dict[Any, Any]] = []
     frames = []
 
     if len(columns) > 0:
@@ -2228,7 +2214,7 @@ def serialize_columns(columns: list[ColumnBase]) -> Tuple[List[dict], List]:
     return headers, frames
 
 
-def deserialize_columns(headers: List[dict], frames: List) -> List[ColumnBase]:
+def deserialize_columns(headers: list[dict], frames: list) -> list[ColumnBase]:
     """
     Construct a list of Columns from a list of headers
     and frames.
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index e24d85bfedf..7fdebda7d76 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -8,7 +8,7 @@
 import locale
 import re
 from locale import nl_langinfo
-from typing import TYPE_CHECKING, Any, Literal, Optional, Sequence, Tuple, cast
+from typing import TYPE_CHECKING, Any, Literal, Sequence, cast
 
 import numpy as np
 import pandas as pd
@@ -242,10 +242,10 @@ def __init__(
         self,
         data: Buffer,
         dtype: DtypeObj,
-        mask: Optional[Buffer] = None,
-        size: Optional[int] = None,  # TODO: make non-optional
+        mask: Buffer | None = None,
+        size: int | None = None,  # TODO: make non-optional
         offset: int = 0,
-        null_count: Optional[int] = None,
+        null_count: int | None = None,
     ):
         dtype = cudf.dtype(dtype)
         if dtype.kind != "M":
@@ -499,7 +499,7 @@ def mean(
 
     def std(
         self,
-        skipna: Optional[bool] = None,
+        skipna: bool | None = None,
         min_count: int = 0,
         dtype: Dtype = np.float64,
         ddof: int = 1,
@@ -511,7 +511,7 @@ def std(
             * _unit_to_nanoseconds_conversion[self.time_unit],
         ).as_unit(self.time_unit)
 
-    def median(self, skipna: Optional[bool] = None) -> pd.Timestamp:
+    def median(self, skipna: bool | None = None) -> pd.Timestamp:
         return pd.Timestamp(
             self.as_numerical_column("int64").median(skipna=skipna),
             unit=self.time_unit,
@@ -631,7 +631,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
     def fillna(
         self,
         fill_value: Any = None,
-        method: Optional[str] = None,
+        method: str | None = None,
     ) -> Self:
         if fill_value is not None:
             if cudf.utils.utils._isnat(fill_value):
@@ -703,7 +703,7 @@ def _with_type_metadata(self, dtype):
 
     def _find_ambiguous_and_nonexistent(
         self, zone_name: str
-    ) -> Tuple[NumericalColumn, NumericalColumn] | Tuple[bool, bool]:
+    ) -> tuple[NumericalColumn, NumericalColumn] | tuple[bool, bool]:
         """
         Recognize ambiguous and nonexistent timestamps for the given timezone.
 
@@ -822,10 +822,10 @@ def __init__(
         self,
         data: Buffer,
         dtype: pd.DatetimeTZDtype,
-        mask: Optional[Buffer] = None,
-        size: Optional[int] = None,
+        mask: Buffer | None = None,
+        size: int | None = None,
         offset: int = 0,
-        null_count: Optional[int] = None,
+        null_count: int | None = None,
     ):
         super().__init__(
             data=data,
diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py
index 9c1bedc9926..e9d9b4933e5 100644
--- a/python/cudf/cudf/core/column/decimal.py
+++ b/python/cudf/cudf/core/column/decimal.py
@@ -4,7 +4,7 @@
 
 import warnings
 from decimal import Decimal
-from typing import TYPE_CHECKING, Any, Optional, Sequence, Union, cast
+from typing import TYPE_CHECKING, Any, Sequence, cast
 
 import cupy as cp
 import numpy as np
@@ -49,7 +49,7 @@ def __cuda_array_interface__(self):
     def as_decimal_column(
         self,
         dtype: Dtype,
-    ) -> Union["DecimalBaseColumn"]:
+    ) -> "DecimalBaseColumn":
         if (
             isinstance(dtype, cudf.core.dtypes.DecimalDtype)
             and dtype.scale < self.dtype.scale
@@ -138,7 +138,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str):
     def fillna(
         self,
         fill_value: Any = None,
-        method: Optional[str] = None,
+        method: str | None = None,
     ) -> Self:
         """Fill null values with ``value``.
 
@@ -199,7 +199,7 @@ def normalize_binop_value(self, other):
         return NotImplemented
 
     def _decimal_quantile(
-        self, q: Union[float, Sequence[float]], interpolation: str, exact: bool
+        self, q: float | Sequence[float], interpolation: str, exact: bool
     ) -> ColumnBase:
         quant = [float(q)] if not isinstance(q, (Sequence, np.ndarray)) else q
         # get sorted indices and exclude nulls
diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py
index 080ba949d62..c548db67344 100644
--- a/python/cudf/cudf/core/column/lists.py
+++ b/python/cudf/cudf/core/column/lists.py
@@ -3,7 +3,7 @@
 from __future__ import annotations
 
 from functools import cached_property
-from typing import TYPE_CHECKING, List, Optional, Sequence, Tuple, Union
+from typing import TYPE_CHECKING, Sequence
 
 import numpy as np
 import pandas as pd
@@ -167,7 +167,7 @@ def set_base_data(self, value):
         else:
             super().set_base_data(value)
 
-    def set_base_children(self, value: Tuple[ColumnBase, ...]):
+    def set_base_children(self, value: tuple[ColumnBase, ...]):
         super().set_base_children(value)
         _, values = value
         self._dtype = cudf.ListDtype(element_type=values.dtype)
@@ -269,7 +269,7 @@ def _transform_leaves(self, func, *args, **kwargs) -> Self:
         # as ``self``, but with the leaf column transformed
         # by applying ``func`` to it
 
-        cc: List[ListColumn] = []
+        cc: list[ListColumn] = []
         c: ColumnBase = self
 
         while isinstance(c, ListColumn):
@@ -320,7 +320,7 @@ def __init__(self, parent: ParentType):
     def get(
         self,
         index: int,
-        default: Optional[Union[ScalarLike, ColumnLike]] = None,
+        default: ScalarLike | ColumnLike | None = None,
     ) -> ParentType:
         """
         Extract element at the given index from each list in a Series of lists.
@@ -424,7 +424,7 @@ def contains(self, search_key: ScalarLike) -> ParentType:
             contains_scalar(self._column, cudf.Scalar(search_key))
         )
 
-    def index(self, search_key: Union[ScalarLike, ColumnLike]) -> ParentType:
+    def index(self, search_key: ScalarLike | ColumnLike) -> ParentType:
         """
         Returns integers representing the index of the search key for each row.
 
diff --git a/python/cudf/cudf/core/column/methods.py b/python/cudf/cudf/core/column/methods.py
index 7f7355c571a..7c6f4e05577 100644
--- a/python/cudf/cudf/core/column/methods.py
+++ b/python/cudf/cudf/core/column/methods.py
@@ -2,7 +2,7 @@
 
 from __future__ import annotations
 
-from typing import Optional, Union, overload
+from typing import Union, overload
 
 from typing_extensions import Literal
 
@@ -52,7 +52,7 @@ def _return_or_inplace(
         inplace: bool = False,
         expand: bool = False,
         retain_index: bool = True,
-    ) -> Optional[ParentType]: ...
+    ) -> ParentType | None: ...
 
     def _return_or_inplace(
         self, new_col, inplace=False, expand=False, retain_index=True
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index 6af67e02bb4..098cf43421b 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -3,16 +3,7 @@
 from __future__ import annotations
 
 import functools
-from typing import (
-    TYPE_CHECKING,
-    Any,
-    Callable,
-    Optional,
-    Sequence,
-    Tuple,
-    Union,
-    cast,
-)
+from typing import TYPE_CHECKING, Any, Callable, Sequence, cast
 
 import cupy as cp
 import numpy as np
@@ -85,10 +76,10 @@ def __init__(
         self,
         data: Buffer,
         dtype: DtypeObj,
-        mask: Optional[Buffer] = None,
-        size: Optional[int] = None,  # TODO: make this non-optional
+        mask: Buffer | None = None,
+        size: int | None = None,  # TODO: make this non-optional
         offset: int = 0,
-        null_count: Optional[int] = None,
+        null_count: int | None = None,
     ):
         dtype = cudf.dtype(dtype)
 
@@ -179,7 +170,7 @@ def __setitem__(self, key: Any, value: Any):
         else:
             device_value = device_value.astype(self.dtype)
 
-        out: Optional[ColumnBase]  # If None, no need to perform mimic inplace.
+        out: ColumnBase | None  # If None, no need to perform mimic inplace.
         if isinstance(key, slice):
             out = self._scatter_by_slice(key, device_value)
         else:
@@ -196,7 +187,7 @@ def __setitem__(self, key: Any, value: Any):
         if out:
             self._mimic_inplace(out, inplace=True)
 
-    def unary_operator(self, unaryop: Union[str, Callable]) -> ColumnBase:
+    def unary_operator(self, unaryop: str | Callable) -> ColumnBase:
         if callable(unaryop):
             return libcudf.transform.transform(self, unaryop)
 
@@ -302,7 +293,7 @@ def nans_to_nulls(self: Self) -> Self:
 
     def normalize_binop_value(
         self, other: ScalarLike
-    ) -> Union[ColumnBase, cudf.Scalar]:
+    ) -> ColumnBase | cudf.Scalar:
         if isinstance(other, ColumnBase):
             if not isinstance(other, NumericalColumn):
                 return NotImplemented
@@ -422,7 +413,7 @@ def nan_count(self) -> int:
 
     def _process_values_for_isin(
         self, values: Sequence
-    ) -> Tuple[ColumnBase, ColumnBase]:
+    ) -> tuple[ColumnBase, ColumnBase]:
         lhs = cast("cudf.core.column.ColumnBase", self)
         try:
             rhs = as_column(values, nan_as_null=False)
@@ -456,12 +447,12 @@ def _process_values_for_isin(
 
         return lhs, rhs
 
-    def _can_return_nan(self, skipna: Optional[bool] = None) -> bool:
+    def _can_return_nan(self, skipna: bool | None = None) -> bool:
         return not skipna and self.has_nulls(include_nan=True)
 
     def _process_for_reduction(
-        self, skipna: Optional[bool] = None, min_count: int = 0
-    ) -> Union[NumericalColumn, ScalarLike]:
+        self, skipna: bool | None = None, min_count: int = 0
+    ) -> NumericalColumn | ScalarLike:
         skipna = True if skipna is None else skipna
 
         if self._can_return_nan(skipna=skipna):
@@ -544,7 +535,7 @@ def find_and_replace(
     def fillna(
         self,
         fill_value: Any = None,
-        method: Optional[str] = None,
+        method: str | None = None,
     ) -> Self:
         """
         Fill null values with *fill_value*
@@ -730,7 +721,7 @@ def _reduction_result_dtype(self, reduction_op: str) -> Dtype:
 
 
 def _normalize_find_and_replace_input(
-    input_column_dtype: DtypeObj, col_to_normalize: Union[ColumnBase, list]
+    input_column_dtype: DtypeObj, col_to_normalize: ColumnBase | list
 ) -> ColumnBase:
     normalized_column = column.as_column(
         col_to_normalize,
diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py
index bd48054a951..95c78c5efcb 100644
--- a/python/cudf/cudf/core/column/numerical_base.py
+++ b/python/cudf/cudf/core/column/numerical_base.py
@@ -3,7 +3,7 @@
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Optional, cast
+from typing import TYPE_CHECKING, cast
 
 import numpy as np
 
@@ -42,10 +42,10 @@ class NumericalBaseColumn(ColumnBase, Scannable):
         "cummax",
     }
 
-    def _can_return_nan(self, skipna: Optional[bool] = None) -> bool:
+    def _can_return_nan(self, skipna: bool | None = None) -> bool:
         return not skipna and self.has_nulls()
 
-    def kurtosis(self, skipna: Optional[bool] = None) -> float:
+    def kurtosis(self, skipna: bool | None = None) -> float:
         skipna = True if skipna is None else skipna
 
         if len(self) == 0 or self._can_return_nan(skipna=skipna):
@@ -70,7 +70,7 @@ def kurtosis(self, skipna: Optional[bool] = None) -> float:
         kurt = term_one_section_one * term_one_section_two - 3 * term_two
         return kurt
 
-    def skew(self, skipna: Optional[bool] = None) -> ScalarLike:
+    def skew(self, skipna: bool | None = None) -> ScalarLike:
         skipna = True if skipna is None else skipna
 
         if len(self) == 0 or self._can_return_nan(skipna=skipna):
@@ -142,7 +142,7 @@ def quantile(
 
     def mean(
         self,
-        skipna: Optional[bool] = None,
+        skipna: bool | None = None,
         min_count: int = 0,
         dtype=np.float64,
     ):
@@ -152,7 +152,7 @@ def mean(
 
     def var(
         self,
-        skipna: Optional[bool] = None,
+        skipna: bool | None = None,
         min_count: int = 0,
         dtype=np.float64,
         ddof=1,
@@ -163,7 +163,7 @@ def var(
 
     def std(
         self,
-        skipna: Optional[bool] = None,
+        skipna: bool | None = None,
         min_count: int = 0,
         dtype=np.float64,
         ddof=1,
@@ -172,7 +172,7 @@ def std(
             "std", skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof
         )
 
-    def median(self, skipna: Optional[bool] = None) -> NumericalBaseColumn:
+    def median(self, skipna: bool | None = None) -> NumericalBaseColumn:
         skipna = True if skipna is None else skipna
 
         if self._can_return_nan(skipna=skipna):
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 87df2d2f1f1..2451a9cc0af 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -5,16 +5,7 @@
 import re
 import warnings
 from functools import cached_property
-from typing import (
-    TYPE_CHECKING,
-    Any,
-    Optional,
-    Sequence,
-    Tuple,
-    Union,
-    cast,
-    overload,
-)
+from typing import TYPE_CHECKING, Any, Sequence, cast, overload
 
 import numpy as np
 import pandas as pd
@@ -257,13 +248,13 @@ def byte_count(self) -> SeriesOrIndex:
 
     @overload
     def cat(
-        self, sep: Optional[str] = None, na_rep: Optional[str] = None
+        self, sep: str | None = None, na_rep: str | None = None
     ) -> str: ...
 
     @overload
     def cat(
-        self, others, sep: Optional[str] = None, na_rep: Optional[str] = None
-    ) -> Union[SeriesOrIndex, "cudf.core.column.string.StringColumn"]: ...
+        self, others, sep: str | None = None, na_rep: str | None = None
+    ) -> SeriesOrIndex | "cudf.core.column.string.StringColumn": ...
 
     def cat(self, others=None, sep=None, na_rep=None):
         """
@@ -641,7 +632,7 @@ def extract(
 
     def contains(
         self,
-        pat: Union[str, Sequence],
+        pat: str | Sequence,
         case: bool = True,
         flags: int = 0,
         na=np.nan,
@@ -792,7 +783,7 @@ def contains(
             result_col = libstrings.contains_multiple(input_column, pat)
         return self._return_or_inplace(result_col)
 
-    def like(self, pat: str, esc: Optional[str] = None) -> SeriesOrIndex:
+    def like(self, pat: str, esc: str | None = None) -> SeriesOrIndex:
         """
         Test if a like pattern matches a string of a Series or Index.
 
@@ -863,7 +854,7 @@ def like(self, pat: str, esc: Optional[str] = None) -> SeriesOrIndex:
 
     def repeat(
         self,
-        repeats: Union[int, Sequence],
+        repeats: int | Sequence,
     ) -> SeriesOrIndex:
         """
         Duplicate each string in the Series or Index.
@@ -920,8 +911,8 @@ def repeat(
 
     def replace(
         self,
-        pat: Union[str, Sequence],
-        repl: Union[str, Sequence],
+        pat: str | Sequence,
+        repl: str | Sequence,
         n: int = -1,
         case=None,
         flags: int = 0,
@@ -1074,9 +1065,9 @@ def replace_with_backrefs(self, pat: str, repl: str) -> SeriesOrIndex:
 
     def slice(
         self,
-        start: Optional[int] = None,
-        stop: Optional[int] = None,
-        step: Optional[int] = None,
+        start: int | None = None,
+        stop: int | None = None,
+        step: int | None = None,
     ) -> SeriesOrIndex:
         """
         Slice substrings from each element in the Series or Index.
@@ -2051,7 +2042,7 @@ def istitle(self) -> SeriesOrIndex:
         return self._return_or_inplace(libstrings.is_title(self._column))
 
     def filter_alphanum(
-        self, repl: Optional[str] = None, keep: bool = True
+        self, repl: str | None = None, keep: bool = True
     ) -> SeriesOrIndex:
         """
         Remove non-alphanumeric characters from strings in this column.
@@ -2138,9 +2129,9 @@ def slice_from(
 
     def slice_replace(
         self,
-        start: Optional[int] = None,
-        stop: Optional[int] = None,
-        repl: Optional[str] = None,
+        start: int | None = None,
+        stop: int | None = None,
+        repl: str | None = None,
     ) -> SeriesOrIndex:
         """
         Replace the specified section of each string with a new string.
@@ -2228,9 +2219,7 @@ def slice_replace(
             ),
         )
 
-    def insert(
-        self, start: int = 0, repl: Optional[str] = None
-    ) -> SeriesOrIndex:
+    def insert(self, start: int = 0, repl: str | None = None) -> SeriesOrIndex:
         """
         Insert the specified string into each string in the specified
         position.
@@ -2410,10 +2399,10 @@ def get_json_object(
 
     def split(
         self,
-        pat: Optional[str] = None,
+        pat: str | None = None,
         n: int = -1,
         expand: bool = False,
-        regex: Optional[bool] = None,
+        regex: bool | None = None,
     ) -> SeriesOrIndex:
         """
         Split strings around given separator/delimiter.
@@ -2578,10 +2567,10 @@ def split(
 
     def rsplit(
         self,
-        pat: Optional[str] = None,
+        pat: str | None = None,
         n: int = -1,
         expand: bool = False,
-        regex: Optional[bool] = None,
+        regex: bool | None = None,
     ) -> SeriesOrIndex:
         """
         Split strings around given separator/delimiter.
@@ -3233,7 +3222,7 @@ def rjust(self, width: int, fillchar: str = " ") -> SeriesOrIndex:
             libstrings.rjust(self._column, width, fillchar)
         )
 
-    def strip(self, to_strip: Optional[str] = None) -> SeriesOrIndex:
+    def strip(self, to_strip: str | None = None) -> SeriesOrIndex:
         r"""
         Remove leading and trailing characters.
 
@@ -3292,7 +3281,7 @@ def strip(self, to_strip: Optional[str] = None) -> SeriesOrIndex:
             libstrings.strip(self._column, cudf.Scalar(to_strip, "str"))
         )
 
-    def lstrip(self, to_strip: Optional[str] = None) -> SeriesOrIndex:
+    def lstrip(self, to_strip: str | None = None) -> SeriesOrIndex:
         r"""
         Remove leading and trailing characters.
 
@@ -3339,7 +3328,7 @@ def lstrip(self, to_strip: Optional[str] = None) -> SeriesOrIndex:
             libstrings.lstrip(self._column, cudf.Scalar(to_strip, "str"))
         )
 
-    def rstrip(self, to_strip: Optional[str] = None) -> SeriesOrIndex:
+    def rstrip(self, to_strip: str | None = None) -> SeriesOrIndex:
         r"""
         Remove leading and trailing characters.
 
@@ -3844,7 +3833,7 @@ def endswith(self, pat: str) -> SeriesOrIndex:
 
         return self._return_or_inplace(result_col)
 
-    def startswith(self, pat: Union[str, Sequence]) -> SeriesOrIndex:
+    def startswith(self, pat: str | Sequence) -> SeriesOrIndex:
         """
         Test if the start of each string element matches a pattern.
 
@@ -3996,7 +3985,7 @@ def removeprefix(self, prefix: str) -> SeriesOrIndex:
         return self._return_or_inplace(result)
 
     def find(
-        self, sub: str, start: int = 0, end: Optional[int] = None
+        self, sub: str, start: int = 0, end: int | None = None
     ) -> SeriesOrIndex:
         """
         Return lowest indexes in each strings in the Series/Index
@@ -4053,7 +4042,7 @@ def find(
         return self._return_or_inplace(result_col)
 
     def rfind(
-        self, sub: str, start: int = 0, end: Optional[int] = None
+        self, sub: str, start: int = 0, end: int | None = None
     ) -> SeriesOrIndex:
         """
         Return highest indexes in each strings in the Series/Index
@@ -4114,7 +4103,7 @@ def rfind(
         return self._return_or_inplace(result_col)
 
     def index(
-        self, sub: str, start: int = 0, end: Optional[int] = None
+        self, sub: str, start: int = 0, end: int | None = None
     ) -> SeriesOrIndex:
         """
         Return lowest indexes in each strings where the substring
@@ -4176,7 +4165,7 @@ def index(
             return result
 
     def rindex(
-        self, sub: str, start: int = 0, end: Optional[int] = None
+        self, sub: str, start: int = 0, end: int | None = None
     ) -> SeriesOrIndex:
         """
         Return highest indexes in each strings where the substring
@@ -4443,7 +4432,7 @@ def translate(self, table: dict) -> SeriesOrIndex:
         )
 
     def filter_characters(
-        self, table: dict, keep: bool = True, repl: Optional[str] = None
+        self, table: dict, keep: bool = True, repl: str | None = None
     ) -> SeriesOrIndex:
         """
         Remove characters from each string using the character ranges
@@ -4924,7 +4913,7 @@ def ngrams_tokenize(
         )
 
     def replace_tokens(
-        self, targets, replacements, delimiter: Optional[str] = None
+        self, targets, replacements, delimiter: str | None = None
     ) -> SeriesOrIndex:
         """
         The targets tokens are searched for within each string in the series
@@ -5009,8 +4998,8 @@ def replace_tokens(
     def filter_tokens(
         self,
         min_token_length: int,
-        replacement: Optional[str] = None,
-        delimiter: Optional[str] = None,
+        replacement: str | None = None,
+        delimiter: str | None = None,
     ) -> SeriesOrIndex:
         """
         Remove tokens from within each string in the series that are
@@ -5279,7 +5268,7 @@ def edit_distance_matrix(self) -> SeriesOrIndex:
         )
 
     def minhash(
-        self, seeds: Optional[ColumnLike] = None, width: int = 4
+        self, seeds: ColumnLike | None = None, width: int = 4
     ) -> SeriesOrIndex:
         """
         Compute the minhash of a strings column.
@@ -5322,7 +5311,7 @@ def minhash(
         )
 
     def minhash64(
-        self, seeds: Optional[ColumnLike] = None, width: int = 4
+        self, seeds: ColumnLike | None = None, width: int = 4
     ) -> SeriesOrIndex:
         """
         Compute the minhash of a strings column.
@@ -5436,8 +5425,8 @@ class StringColumn(column.ColumnBase):
         respectively
     """
 
-    _start_offset: Optional[int]
-    _end_offset: Optional[int]
+    _start_offset: int | None
+    _end_offset: int | None
 
     _VALID_BINARY_OPERATIONS = {
         "__eq__",
@@ -5461,12 +5450,12 @@ class StringColumn(column.ColumnBase):
 
     def __init__(
         self,
-        data: Optional[Buffer] = None,
-        mask: Optional[Buffer] = None,
-        size: Optional[int] = None,  # TODO: make non-optional
+        data: Buffer | None = None,
+        mask: Buffer | None = None,
+        size: int | None = None,  # TODO: make non-optional
         offset: int = 0,
-        null_count: Optional[int] = None,
-        children: Tuple["column.ColumnBase", ...] = (),
+        null_count: int | None = None,
+        children: tuple["column.ColumnBase", ...] = (),
     ):
         dtype = cudf.api.types.dtype("object")
 
@@ -5634,8 +5623,8 @@ def to_arrow(self) -> pa.Array:
 
     def sum(
         self,
-        skipna: Optional[bool] = None,
-        dtype: Optional[Dtype] = None,
+        skipna: bool | None = None,
+        dtype: Dtype | None = None,
         min_count: int = 0,
     ):
         result_col = self._process_for_reduction(
@@ -5852,7 +5841,7 @@ def find_and_replace(
     def fillna(
         self,
         fill_value: Any = None,
-        method: Optional[str] = None,
+        method: str | None = None,
     ) -> Self:
         if fill_value is not None:
             if not is_scalar(fill_value):
@@ -5864,9 +5853,7 @@ def fillna(
                 fill_value = cudf.Scalar(fill_value, dtype=self.dtype)
         return super().fillna(fill_value, method=method)
 
-    def normalize_binop_value(
-        self, other
-    ) -> Union[column.ColumnBase, cudf.Scalar]:
+    def normalize_binop_value(self, other) -> column.ColumnBase | cudf.Scalar:
         if (
             isinstance(other, (column.ColumnBase, cudf.Scalar))
             and other.dtype == "object"
@@ -5930,8 +5917,8 @@ def _binaryop(
 
                 # Explicit types are necessary because mypy infers ColumnBase
                 # rather than StringColumn and sometimes forgets Scalar.
-                lhs: Union[cudf.Scalar, StringColumn]
-                rhs: Union[cudf.Scalar, StringColumn]
+                lhs: cudf.Scalar | StringColumn
+                rhs: cudf.Scalar | StringColumn
                 lhs, rhs = (other, self) if reflect else (self, other)
 
                 return cast(
diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py
index 0af847f38af..8eec84b64f7 100644
--- a/python/cudf/cudf/core/column/timedelta.py
+++ b/python/cudf/cudf/core/column/timedelta.py
@@ -4,7 +4,7 @@
 
 import datetime
 import functools
-from typing import TYPE_CHECKING, Any, Optional, Sequence, cast
+from typing import TYPE_CHECKING, Any, Sequence, cast
 
 import numpy as np
 import pandas as pd
@@ -77,10 +77,10 @@ def __init__(
         self,
         data: Buffer,
         dtype: Dtype,
-        size: Optional[int] = None,  # TODO: make non-optional
-        mask: Optional[Buffer] = None,
+        size: int | None = None,  # TODO: make non-optional
+        mask: Buffer | None = None,
         offset: int = 0,
-        null_count: Optional[int] = None,
+        null_count: int | None = None,
     ):
         dtype = cudf.dtype(dtype)
         if dtype.kind != "m":
@@ -255,7 +255,7 @@ def time_unit(self) -> str:
     def fillna(
         self,
         fill_value: Any = None,
-        method: Optional[str] = None,
+        method: str | None = None,
     ) -> Self:
         if fill_value is not None:
             if cudf.utils.utils._isnat(fill_value):
@@ -316,7 +316,7 @@ def mean(self, skipna=None, dtype: Dtype = np.float64) -> pd.Timedelta:
             unit=self.time_unit,
         ).as_unit(self.time_unit)
 
-    def median(self, skipna: Optional[bool] = None) -> pd.Timedelta:
+    def median(self, skipna: bool | None = None) -> pd.Timedelta:
         return pd.Timedelta(
             self.as_numerical_column("int64").median(skipna=skipna),
             unit=self.time_unit,
@@ -346,9 +346,9 @@ def quantile(
 
     def sum(
         self,
-        skipna: Optional[bool] = None,
+        skipna: bool | None = None,
         min_count: int = 0,
-        dtype: Optional[Dtype] = None,
+        dtype: Dtype | None = None,
     ) -> pd.Timedelta:
         return pd.Timedelta(
             # Since sum isn't overridden in Numerical[Base]Column, mypy only
@@ -362,7 +362,7 @@ def sum(
 
     def std(
         self,
-        skipna: Optional[bool] = None,
+        skipna: bool | None = None,
         min_count: int = 0,
         dtype: Dtype = np.float64,
         ddof: int = 1,
diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py
index 9f3de061ee8..1bf9a393566 100644
--- a/python/cudf/cudf/core/column_accessor.py
+++ b/python/cudf/cudf/core/column_accessor.py
@@ -6,16 +6,7 @@
 import sys
 from collections import abc
 from functools import cached_property, reduce
-from typing import (
-    TYPE_CHECKING,
-    Any,
-    Callable,
-    Dict,
-    Mapping,
-    Optional,
-    Tuple,
-    Union,
-)
+from typing import TYPE_CHECKING, Any, Callable, Mapping
 
 import numpy as np
 import pandas as pd
@@ -98,13 +89,13 @@ class ColumnAccessor(abc.MutableMapping):
         column length and type
     """
 
-    _data: "Dict[Any, ColumnBase]"
+    _data: "dict[Any, ColumnBase]"
     multiindex: bool
-    _level_names: Tuple[Any, ...]
+    _level_names: tuple[Any, ...]
 
     def __init__(
         self,
-        data: Union[abc.MutableMapping, ColumnAccessor, None] = None,
+        data: abc.MutableMapping | ColumnAccessor | None = None,
         multiindex: bool = False,
         level_names=None,
         rangeindex: bool = False,
@@ -210,7 +201,7 @@ def _from_columns_like_self(
         )
 
     @property
-    def level_names(self) -> Tuple[Any, ...]:
+    def level_names(self) -> tuple[Any, ...]:
         if self._level_names is None or len(self._level_names) == 0:
             return tuple((None,) * max(1, self.nlevels))
         else:
@@ -237,11 +228,11 @@ def nrows(self) -> int:
             return len(next(iter(self.values())))
 
     @cached_property
-    def names(self) -> Tuple[Any, ...]:
+    def names(self) -> tuple[Any, ...]:
         return tuple(self.keys())
 
     @cached_property
-    def columns(self) -> Tuple[ColumnBase, ...]:
+    def columns(self) -> tuple[ColumnBase, ...]:
         return tuple(self.values())
 
     @cached_property
@@ -610,7 +601,7 @@ def _pad_key(self, key: Any, pad_value="") -> Any:
         return key + (pad_value,) * (self.nlevels - len(key))
 
     def rename_levels(
-        self, mapper: Union[Mapping[Any, Any], Callable], level: Optional[int]
+        self, mapper: Mapping[Any, Any] | Callable, level: int | None
     ) -> ColumnAccessor:
         """
         Rename the specified levels of the given ColumnAccessor
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index d8d46a6df73..065b13561ab 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -14,20 +14,7 @@
 import warnings
 from collections import abc, defaultdict
 from collections.abc import Iterator
-from typing import (
-    TYPE_CHECKING,
-    Any,
-    Callable,
-    Dict,
-    List,
-    Literal,
-    MutableMapping,
-    Optional,
-    Set,
-    Tuple,
-    Union,
-    cast,
-)
+from typing import TYPE_CHECKING, Any, Callable, Literal, MutableMapping, cast
 
 import cupy
 import numba
@@ -684,7 +671,7 @@ class DataFrame(IndexedFrame, Serializable, GetAttrGetItemMixin):
     """
 
     _PROTECTED_KEYS = frozenset(("_data", "_index"))
-    _accessors: Set[Any] = set()
+    _accessors: set[Any] = set()
     _loc_indexer_type = _DataFrameLocIndexer
     _iloc_indexer_type = _DataFrameIlocIndexer
     _groupby = DataFrameGroupBy
@@ -1123,7 +1110,7 @@ def _init_from_dict_like(
     def _from_data(
         cls,
         data: MutableMapping,
-        index: Optional[BaseIndex] = None,
+        index: BaseIndex | None = None,
         columns: Any = None,
     ) -> DataFrame:
         out = super()._from_data(data=data, index=index)
@@ -1553,7 +1540,7 @@ def _get_numeric_data(self):
         return self[columns]
 
     @_cudf_nvtx_annotate
-    def assign(self, **kwargs: Union[Callable[[Self], Any], Any]):
+    def assign(self, **kwargs: Callable[[Self], Any] | Any):
         """
         Assign columns to DataFrame from keyword arguments.
 
@@ -2009,12 +1996,10 @@ def _make_operands_and_index_for_binop(
         fill_value: Any = None,
         reflect: bool = False,
         can_reindex: bool = False,
-    ) -> Tuple[
-        Union[
-            Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]],
-            NotImplementedType,
-        ],
-        Optional[BaseIndex],
+    ) -> tuple[
+        dict[str | None, tuple[ColumnBase, Any, bool, Any]]
+        | NotImplementedType,
+        BaseIndex | None,
         bool,
     ]:
         lhs, rhs = self._data, other
@@ -2119,8 +2104,8 @@ def from_dict(
         cls,
         data: dict,
         orient: str = "columns",
-        dtype: Optional[Dtype] = None,
-        columns: Optional[list] = None,
+        dtype: Dtype | None = None,
+        columns: list | None = None,
     ) -> DataFrame:
         """
         Construct DataFrame from dict of array-like or dicts.
@@ -4584,7 +4569,7 @@ def apply(
     def applymap(
         self,
         func: Callable[[Any], Any],
-        na_action: Union[str, None] = None,
+        na_action: str | None = None,
         **kwargs,
     ) -> DataFrame:
         """
@@ -4617,7 +4602,7 @@ def applymap(
     def map(
         self,
         func: Callable[[Any], Any],
-        na_action: Union[str, None] = None,
+        na_action: str | None = None,
         **kwargs,
     ) -> DataFrame:
         """
@@ -7498,7 +7483,7 @@ def nunique(self, axis=0, dropna: bool = True) -> Series:
     def _sample_axis_1(
         self,
         n: int,
-        weights: Optional[ColumnLike],
+        weights: ColumnLike | None,
         replace: bool,
         random_state: np.random.RandomState,
         ignore_index: bool,
@@ -7523,11 +7508,11 @@ def _sample_axis_1(
 
     def _from_columns_like_self(
         self,
-        columns: List[ColumnBase],
-        column_names: Optional[abc.Iterable[str]] = None,
-        index_names: Optional[List[str]] = None,
+        columns: list[ColumnBase],
+        column_names: abc.Iterable[str] | None = None,
+        index_names: list[str] | None = None,
         *,
-        override_dtypes: Optional[abc.Iterable[Optional[Dtype]]] = None,
+        override_dtypes: abc.Iterable[Dtype | None] | None = None,
     ) -> DataFrame:
         result = super()._from_columns_like_self(
             columns,
@@ -8128,7 +8113,7 @@ def _setitem_with_dataframe(
     input_df: DataFrame,
     replace_df: DataFrame,
     input_cols: Any = None,
-    mask: Optional[ColumnBase] = None,
+    mask: ColumnBase | None = None,
     ignore_index: bool = False,
 ):
     """
diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py
index 62ded8ac6f1..9cd573aceb9 100644
--- a/python/cudf/cudf/core/df_protocol.py
+++ b/python/cudf/cudf/core/df_protocol.py
@@ -1,17 +1,9 @@
 # Copyright (c) 2021-2024, NVIDIA CORPORATION.
+from __future__ import annotations
 
 import enum
 from collections import abc
-from typing import (
-    Any,
-    Dict,
-    Iterable,
-    Mapping,
-    Optional,
-    Sequence,
-    Tuple,
-    cast,
-)
+from typing import Any, Iterable, Mapping, Sequence, Tuple, cast
 
 import cupy as cp
 import numpy as np
@@ -109,7 +101,7 @@ def __dlpack__(self):
         except ValueError:
             raise TypeError(f"dtype {self._dtype} unsupported by `dlpack`")
 
-    def __dlpack_device__(self) -> Tuple[_Device, int]:
+    def __dlpack_device__(self) -> tuple[_Device, int]:
         """
         _Device type and _Device ID for where the data in the buffer resides.
         """
@@ -265,7 +257,7 @@ def _dtype_from_cudfdtype(self, dtype) -> ProtoDtype:
         return (kind, bitwidth, format_str, endianness)
 
     @property
-    def describe_categorical(self) -> Tuple[bool, bool, Dict[int, Any]]:
+    def describe_categorical(self) -> tuple[bool, bool, dict[int, Any]]:
         """
         If the dtype is categorical, there are two options:
 
@@ -298,7 +290,7 @@ def describe_categorical(self) -> Tuple[bool, bool, Dict[int, Any]]:
         return ordered, is_dictionary, mapping
 
     @property
-    def describe_null(self) -> Tuple[int, Any]:
+    def describe_null(self) -> tuple[int, Any]:
         """
         Return the missing value (or "null") representation the column dtype
         uses, as a tuple ``(kind, value)``.
@@ -338,7 +330,7 @@ def null_count(self) -> int:
         return self._col.null_count
 
     @property
-    def metadata(self) -> Dict[str, Any]:
+    def metadata(self) -> dict[str, Any]:
         """
         Store specific metadata of the column.
         """
@@ -351,7 +343,7 @@ def num_chunks(self) -> int:
         return 1
 
     def get_chunks(
-        self, n_chunks: Optional[int] = None
+        self, n_chunks: int | None = None
     ) -> Iterable["_CuDFColumn"]:
         """
         Return an iterable yielding the chunks.
@@ -362,7 +354,7 @@ def get_chunks(
 
     def get_buffers(
         self,
-    ) -> Mapping[str, Optional[Tuple[_CuDFBuffer, ProtoDtype]]]:
+    ) -> Mapping[str, tuple[_CuDFBuffer, ProtoDtype] | None]:
         """
         Return a dictionary containing the underlying buffers.
 
@@ -400,7 +392,7 @@ def get_buffers(
 
     def _get_validity_buffer(
         self,
-    ) -> Optional[Tuple[_CuDFBuffer, ProtoDtype]]:
+    ) -> tuple[_CuDFBuffer, ProtoDtype] | None:
         """
         Return the buffer containing the mask values
         indicating missing data and the buffer's associated dtype.
@@ -433,7 +425,7 @@ def _get_validity_buffer(
 
     def _get_offsets_buffer(
         self,
-    ) -> Optional[Tuple[_CuDFBuffer, ProtoDtype]]:
+    ) -> tuple[_CuDFBuffer, ProtoDtype] | None:
         """
         Return the buffer containing the offset values for
         variable-size binary data (e.g., variable-length strings)
@@ -461,7 +453,7 @@ def _get_offsets_buffer(
 
     def _get_data_buffer(
         self,
-    ) -> Tuple[_CuDFBuffer, ProtoDtype]:
+    ) -> tuple[_CuDFBuffer, ProtoDtype]:
         """
         Return the buffer containing the data and
                the buffer's associated dtype.
@@ -588,7 +580,7 @@ def select_columns_by_name(self, names: Sequence[str]) -> "_CuDFDataFrame":
         )
 
     def get_chunks(
-        self, n_chunks: Optional[int] = None
+        self, n_chunks: int | None = None
     ) -> Iterable["_CuDFDataFrame"]:
         """
         Return an iterator yielding the chunks.
@@ -745,9 +737,9 @@ def from_dataframe(
 
 def _protocol_to_cudf_column_numeric(
     col, allow_copy: bool
-) -> Tuple[
+) -> tuple[
     cudf.core.column.ColumnBase,
-    Mapping[str, Optional[Tuple[_CuDFBuffer, ProtoDtype]]],
+    Mapping[str, tuple[_CuDFBuffer, ProtoDtype] | None],
 ]:
     """
     Convert an int, uint, float or bool protocol column
@@ -822,9 +814,9 @@ def protocol_dtype_to_cupy_dtype(_dtype: ProtoDtype) -> cp.dtype:
 
 def _protocol_to_cudf_column_categorical(
     col, allow_copy: bool
-) -> Tuple[
+) -> tuple[
     cudf.core.column.ColumnBase,
-    Mapping[str, Optional[Tuple[_CuDFBuffer, ProtoDtype]]],
+    Mapping[str, tuple[_CuDFBuffer, ProtoDtype] | None],
 ]:
     """
     Convert a categorical column to a Series instance
@@ -857,9 +849,9 @@ def _protocol_to_cudf_column_categorical(
 
 def _protocol_to_cudf_column_string(
     col, allow_copy: bool
-) -> Tuple[
+) -> tuple[
     cudf.core.column.ColumnBase,
-    Mapping[str, Optional[Tuple[_CuDFBuffer, ProtoDtype]]],
+    Mapping[str, tuple[_CuDFBuffer, ProtoDtype] | None],
 ]:
     """
     Convert a string ColumnObject to cudf Column object.
diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index b1282040e60..034849d0e71 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -1,4 +1,5 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
+from __future__ import annotations
 
 import decimal
 import operator
@@ -6,7 +7,7 @@
 import textwrap
 import warnings
 from functools import cached_property
-from typing import TYPE_CHECKING, Any, Callable, Dict, List, Tuple, Type, Union
+from typing import TYPE_CHECKING, Any, Callable
 
 import numpy as np
 import pandas as pd
@@ -16,12 +17,12 @@
 from pandas.core.arrays.arrow.extension_types import ArrowIntervalType
 
 import cudf
-from cudf._typing import Dtype
 from cudf.core._compat import PANDAS_LT_300
 from cudf.core.abc import Serializable
 from cudf.utils.docutils import doc_apply
 
 if TYPE_CHECKING:
+    from cudf._typing import Dtype
     from cudf.core.buffer import Buffer
 
 
@@ -84,11 +85,11 @@ def dtype(arbitrary):
 
 
 def _decode_type(
-    cls: Type,
+    cls: type,
     header: dict,
     frames: list,
-    is_valid_class: Callable[[Type, Type], bool] = operator.is_,
-) -> Tuple[dict, list, Type]:
+    is_valid_class: Callable[[type, type], bool] = operator.is_,
+) -> tuple[dict, list, type]:
     """Decode metadata-encoded type and check validity
 
     Parameters
@@ -481,8 +482,8 @@ def __repr__(self):
     def __hash__(self):
         return hash(self._typ)
 
-    def serialize(self) -> Tuple[dict, list]:
-        header: Dict[str, Dtype] = {}
+    def serialize(self) -> tuple[dict, list]:
+        header: dict[str, Dtype] = {}
         header["type-serialized"] = pickle.dumps(type(self))
 
         frames = []
@@ -627,13 +628,13 @@ def __repr__(self):
     def __hash__(self):
         return hash(self._typ)
 
-    def serialize(self) -> Tuple[dict, list]:
-        header: Dict[str, Any] = {}
+    def serialize(self) -> tuple[dict, list]:
+        header: dict[str, Any] = {}
         header["type-serialized"] = pickle.dumps(type(self))
 
-        frames: List[Buffer] = []
+        frames: list[Buffer] = []
 
-        fields: Dict[str, Union[bytes, Tuple[Any, Tuple[int, int]]]] = {}
+        fields: dict[str, bytes | tuple[Any, tuple[int, int]]] = {}
 
         for k, dtype in self.fields.items():
             if isinstance(dtype, _BaseDtype):
@@ -823,7 +824,7 @@ def _from_decimal(cls, decimal):
         precision = max(len(metadata.digits), -metadata.exponent)
         return cls(precision, -metadata.exponent)
 
-    def serialize(self) -> Tuple[dict, list]:
+    def serialize(self) -> tuple[dict, list]:
         return (
             {
                 "type-serialized": pickle.dumps(type(self)),
@@ -946,7 +947,7 @@ def __eq__(self, other):
     def __hash__(self):
         return hash((self.subtype, self.closed))
 
-    def serialize(self) -> Tuple[dict, list]:
+    def serialize(self) -> tuple[dict, list]:
         header = {
             "type-serialized": pickle.dumps(type(self)),
             "fields": pickle.dumps((self.subtype, self.closed)),
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 6a1ef05b1f9..c58a0161ee0 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -8,18 +8,7 @@
 import pickle
 import warnings
 from collections import abc
-from typing import (
-    TYPE_CHECKING,
-    Any,
-    Callable,
-    Dict,
-    List,
-    Literal,
-    MutableMapping,
-    Optional,
-    Tuple,
-    Union,
-)
+from typing import TYPE_CHECKING, Any, Callable, Literal, MutableMapping
 
 # TODO: The `numpy` import is needed for typing purposes during doc builds
 # only, need to figure out why the `np` alias is insufficient then remove.
@@ -83,11 +72,11 @@ def _num_rows(self) -> int:
         return self._data.nrows
 
     @property
-    def _column_names(self) -> Tuple[Any, ...]:
+    def _column_names(self) -> tuple[Any, ...]:
         return self._data.names
 
     @property
-    def _columns(self) -> Tuple[ColumnBase, ...]:
+    def _columns(self) -> tuple[ColumnBase, ...]:
         return self._data.columns
 
     @property
@@ -154,10 +143,10 @@ def _from_data_like_self(self, data: MutableMapping) -> Self:
     @_cudf_nvtx_annotate
     def _from_columns_like_self(
         self,
-        columns: List[ColumnBase],
-        column_names: Optional[abc.Iterable[str]] = None,
+        columns: list[ColumnBase],
+        column_names: abc.Iterable[str] | None = None,
         *,
-        override_dtypes: Optional[abc.Iterable[Optional[Dtype]]] = None,
+        override_dtypes: abc.Iterable[Dtype | None] | None = None,
     ):
         """Construct a Frame from a list of columns with metadata from self.
 
@@ -172,7 +161,7 @@ def _from_columns_like_self(
     @_cudf_nvtx_annotate
     def _mimic_inplace(
         self, result: Self, inplace: bool = False
-    ) -> Optional[Self]:
+    ) -> Self | None:
         if inplace:
             for col in self._data:
                 if col in result._data:
@@ -424,15 +413,15 @@ def _to_array(
         get_array: Callable,
         module: ModuleType,
         copy: bool,
-        dtype: Union[Dtype, None] = None,
+        dtype: Dtype | None = None,
         na_value=None,
-    ) -> Union[cupy.ndarray, numpy.ndarray]:
+    ) -> cupy.ndarray | numpy.ndarray:
         # Internal function to implement to_cupy and to_numpy, which are nearly
         # identical except for the attribute they access to generate values.
 
         def to_array(
             col: ColumnBase, dtype: np.dtype
-        ) -> Union[cupy.ndarray, numpy.ndarray]:
+        ) -> cupy.ndarray | numpy.ndarray:
             if na_value is not None:
                 col = col.fillna(na_value)
             array = get_array(col)
@@ -485,7 +474,7 @@ def to_array(
     @_cudf_nvtx_annotate
     def to_cupy(
         self,
-        dtype: Union[Dtype, None] = None,
+        dtype: Dtype | None = None,
         copy: bool = False,
         na_value=None,
     ) -> cupy.ndarray:
@@ -519,7 +508,7 @@ def to_cupy(
     @_cudf_nvtx_annotate
     def to_numpy(
         self,
-        dtype: Union[Dtype, None] = None,
+        dtype: Dtype | None = None,
         copy: bool = True,
         na_value=None,
     ) -> numpy.ndarray:
@@ -552,7 +541,7 @@ def to_numpy(
         )
 
     @_cudf_nvtx_annotate
-    def where(self, cond, other=None, inplace: bool = False) -> Optional[Self]:
+    def where(self, cond, other=None, inplace: bool = False) -> Self | None:
         """
         Replace values where the condition is False.
 
@@ -628,11 +617,11 @@ def where(self, cond, other=None, inplace: bool = False) -> Optional[Self]:
     def fillna(
         self,
         value=None,
-        method: Optional[Literal["ffill", "bfill", "pad", "backfill"]] = None,
+        method: Literal["ffill", "bfill", "pad", "backfill"] | None = None,
         axis=None,
         inplace: bool = False,
         limit=None,
-    ) -> Optional[Self]:
+    ) -> Self | None:
         """Fill null values with ``value`` or specified ``method``.
 
         Parameters
@@ -1047,7 +1036,7 @@ def _copy_type_metadata(
         self,
         other: Self,
         *,
-        override_dtypes: Optional[abc.Iterable[Optional[Dtype]]] = None,
+        override_dtypes: abc.Iterable[Dtype | None] | None = None,
     ) -> Self:
         """
         Copy type metadata from each column of `other` to the corresponding
@@ -1495,7 +1484,7 @@ def _unaryop(self, op):
     @_cudf_nvtx_annotate
     def _colwise_binop(
         cls,
-        operands: Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]],
+        operands: dict[str | None, tuple[ColumnBase, Any, bool, Any]],
         fn: str,
     ):
         """Implement binary ops between two frame-like objects.
@@ -1910,8 +1899,8 @@ def nunique(self, dropna: bool = True):
     @staticmethod
     @_cudf_nvtx_annotate
     def _repeat(
-        columns: List[ColumnBase], repeats, axis=None
-    ) -> List[ColumnBase]:
+        columns: list[ColumnBase], repeats, axis=None
+    ) -> list[ColumnBase]:
         if axis is not None:
             raise NotImplementedError(
                 "Only axis=`None` supported at this time."
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index aa96051ea51..d08268eea3a 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -1,4 +1,5 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
+from __future__ import annotations
 
 import copy
 import itertools
@@ -7,7 +8,7 @@
 import warnings
 from collections import abc
 from functools import cached_property
-from typing import Any, Iterable, List, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Any, Iterable
 
 import cupy as cp
 import numpy as np
@@ -20,7 +21,6 @@
 from cudf._lib.reshape import interleave_columns
 from cudf._lib.sort import segmented_sort_by_key
 from cudf._lib.types import size_type_dtype
-from cudf._typing import AggType, DataFrameOrSeries, MultiColumnAggType
 from cudf.api.extensions import no_default
 from cudf.api.types import is_bool_dtype, is_list_like, is_numeric_dtype
 from cudf.core._compat import PANDAS_LT_300
@@ -34,6 +34,9 @@
 from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate
 from cudf.utils.utils import GetAttrGetItemMixin
 
+if TYPE_CHECKING:
+    from cudf._typing import AggType, DataFrameOrSeries, MultiColumnAggType
+
 
 def _deprecate_collect():
     warnings.warn(
@@ -1033,11 +1036,11 @@ def ngroup(self, ascending=True):
 
     def sample(
         self,
-        n: Optional[int] = None,
-        frac: Optional[float] = None,
+        n: int | None = None,
+        frac: float | None = None,
         replace: bool = False,
-        weights: Union[abc.Sequence, "cudf.Series", None] = None,
-        random_state: Union[np.random.RandomState, int, None] = None,
+        weights: abc.Sequence | "cudf.Series" | None = None,
+        random_state: np.random.RandomState | int | None = None,
     ):
         """Return a random sample of items in each group.
 
@@ -1222,7 +1225,7 @@ def _grouped(self, *, include_groups: bool = True):
 
     def _normalize_aggs(
         self, aggs: MultiColumnAggType
-    ) -> Tuple[Iterable[Any], Tuple[ColumnBase, ...], List[List[AggType]]]:
+    ) -> tuple[Iterable[Any], tuple[ColumnBase, ...], list[list[AggType]]]:
         """
         Normalize aggs to a list of list of aggregations, where `out[i]`
         is a list of aggregations for column `self.obj[i]`. We support three
@@ -1237,7 +1240,7 @@ def _normalize_aggs(
         Each agg can be string or lambda functions.
         """
 
-        aggs_per_column: Iterable[Union[AggType, Iterable[AggType]]]
+        aggs_per_column: Iterable[AggType | Iterable[AggType]]
         if isinstance(aggs, dict):
             column_names, aggs_per_column = aggs.keys(), aggs.values()
             columns = tuple(self.obj._data[col] for col in column_names)
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 11d09e470ff..13fa187842d 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -7,17 +7,7 @@
 import warnings
 from functools import cache, cached_property
 from numbers import Number
-from typing import (
-    TYPE_CHECKING,
-    Any,
-    List,
-    Literal,
-    MutableMapping,
-    Optional,
-    Tuple,
-    Union,
-    cast,
-)
+from typing import TYPE_CHECKING, Any, Literal, MutableMapping, cast
 
 import cupy
 import numpy as np
@@ -101,10 +91,10 @@ def __subclasscheck__(self, subclass):
 
 
 def _lexsorted_equal_range(
-    idx: Union[Index, cudf.MultiIndex],
+    idx: Index | cudf.MultiIndex,
     key_as_table: Frame,
     is_sorted: bool,
-) -> Tuple[int, int, Optional[ColumnBase]]:
+) -> tuple[int, int, ColumnBase | None]:
     """Get equal range for key in lexicographically sorted index. If index
     is not sorted when called, a sort will take place and `sort_inds` is
     returned. Otherwise `None` is returned in that position.
@@ -2858,7 +2848,7 @@ class IntervalIndex(Index):
     def __init__(
         self,
         data,
-        closed: Optional[Literal["left", "right", "neither", "both"]] = None,
+        closed: Literal["left", "right", "neither", "both"] | None = None,
         dtype=None,
         copy: bool = False,
         name=None,
@@ -2917,9 +2907,7 @@ def closed(self):
     def from_breaks(
         cls,
         breaks,
-        closed: Optional[
-            Literal["left", "right", "neither", "both"]
-        ] = "right",
+        closed: Literal["left", "right", "neither", "both"] | None = "right",
         name=None,
         copy: bool = False,
         dtype=None,
@@ -3106,7 +3094,7 @@ def _getdefault_name(values, name):
 
 
 @_cudf_nvtx_annotate
-def _concat_range_index(indexes: List[RangeIndex]) -> BaseIndex:
+def _concat_range_index(indexes: list[RangeIndex]) -> BaseIndex:
     """
     An internal Utility function to concat RangeIndex objects.
     """
@@ -3147,7 +3135,7 @@ def _concat_range_index(indexes: List[RangeIndex]) -> BaseIndex:
 
 
 @_cudf_nvtx_annotate
-def _extended_gcd(a: int, b: int) -> Tuple[int, int, int]:
+def _extended_gcd(a: int, b: int) -> tuple[int, int, int]:
     """
     Extended Euclidean algorithms to solve Bezout's identity:
        a*x + b*y = gcd(x, y)
@@ -3197,7 +3185,7 @@ def _get_nearest_indexer(
     index: Index,
     positions: cudf.Series,
     target_col: cudf.core.column.ColumnBase,
-    tolerance: Union[int, float],
+    tolerance: int | float,
 ):
     """
     Get the indexer for the nearest index labels; requires an index with
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 3a4f4874e35..06da62306e8 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -12,15 +12,9 @@
     TYPE_CHECKING,
     Any,
     Callable,
-    Dict,
-    List,
     Literal,
     MutableMapping,
-    Optional,
-    Tuple,
-    Type,
     TypeVar,
-    Union,
     cast,
 )
 from uuid import uuid4
@@ -258,8 +252,8 @@ class IndexedFrame(Frame):
     """
 
     # mypy can't handle bound type variables as class members
-    _loc_indexer_type: Type[_LocIndexerClass]  # type: ignore
-    _iloc_indexer_type: Type[_IlocIndexerClass]  # type: ignore
+    _loc_indexer_type: type[_LocIndexerClass]  # type: ignore
+    _iloc_indexer_type: type[_IlocIndexerClass]  # type: ignore
     _index: cudf.core.index.BaseIndex
     _groupby = GroupBy
     _resampler = _Resampler
@@ -294,14 +288,14 @@ def _num_rows(self) -> int:
         return len(self.index)
 
     @property
-    def _index_names(self) -> Tuple[Any, ...]:  # TODO: Tuple[str]?
+    def _index_names(self) -> tuple[Any, ...]:  # TODO: Tuple[str]?
         return self.index._data.names
 
     @classmethod
     def _from_data(
         cls,
         data: MutableMapping,
-        index: Optional[BaseIndex] = None,
+        index: BaseIndex | None = None,
     ):
         out = super()._from_data(data)
         out._index = RangeIndex(out._data.nrows) if index is None else index
@@ -316,11 +310,11 @@ def _from_data_like_self(self, data: MutableMapping):
     @_cudf_nvtx_annotate
     def _from_columns_like_self(
         self,
-        columns: List[ColumnBase],
-        column_names: Optional[abc.Iterable[str]] = None,
-        index_names: Optional[List[str]] = None,
+        columns: list[ColumnBase],
+        column_names: abc.Iterable[str] | None = None,
+        index_names: list[str] | None = None,
         *,
-        override_dtypes: Optional[abc.Iterable[Optional[Dtype]]] = None,
+        override_dtypes: abc.Iterable[Dtype | None] | None = None,
     ) -> Self:
         """Construct a `Frame` from a list of columns with metadata from self.
 
@@ -368,7 +362,7 @@ def __round__(self, digits=0):
 
     def _mimic_inplace(
         self, result: Self, inplace: bool = False
-    ) -> Optional[Self]:
+    ) -> Self | None:
         if inplace:
             self._index = result.index
         return super()._mimic_inplace(result, inplace)
@@ -1788,7 +1782,7 @@ def skew(self, axis=0, skipna=True, numeric_only=False, **kwargs):
         )
 
     @_cudf_nvtx_annotate
-    def mask(self, cond, other=None, inplace: bool = False) -> Optional[Self]:
+    def mask(self, cond, other=None, inplace: bool = False) -> Self | None:
         """
         Replace values where the condition is True.
 
@@ -1924,7 +1918,7 @@ def _copy_type_metadata(
         other: Self,
         include_index: bool = True,
         *,
-        override_dtypes: Optional[abc.Iterable[Optional[Dtype]]] = None,
+        override_dtypes: abc.Iterable[Dtype | None] | None = None,
     ) -> Self:
         """
         Copy type metadata from each column of `other` to the corresponding
@@ -4670,9 +4664,9 @@ def sample(
     def _sample_axis_0(
         self,
         n: int,
-        weights: Optional[ColumnLike],
+        weights: ColumnLike | None,
         replace: bool,
-        random_state: Union[np.random.RandomState, cp.random.RandomState],
+        random_state: np.random.RandomState | cp.random.RandomState,
         ignore_index: bool,
     ):
         try:
@@ -4695,7 +4689,7 @@ def _sample_axis_0(
     def _sample_axis_1(
         self,
         n: int,
-        weights: Optional[ColumnLike],
+        weights: ColumnLike | None,
         replace: bool,
         random_state: np.random.RandomState,
         ignore_index: bool,
@@ -4742,12 +4736,10 @@ def _make_operands_and_index_for_binop(
         fill_value: Any = None,
         reflect: bool = False,
         can_reindex: bool = False,
-    ) -> Tuple[
-        Union[
-            Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]],
-            NotImplementedType,
-        ],
-        Optional[cudf.BaseIndex],
+    ) -> tuple[
+        dict[str | None, tuple[ColumnBase, Any, bool, Any]]
+        | NotImplementedType,
+        cudf.BaseIndex | None,
         bool,
     ]:
         raise NotImplementedError(
@@ -6328,8 +6320,8 @@ def _check_duplicate_level_names(specified, level_names):
 
 @_cudf_nvtx_annotate
 def _get_replacement_values_for_columns(
-    to_replace: Any, value: Any, columns_dtype_map: Dict[Any, Any]
-) -> Tuple[Dict[Any, bool], Dict[Any, Any], Dict[Any, Any]]:
+    to_replace: Any, value: Any, columns_dtype_map: dict[Any, Any]
+) -> tuple[dict[Any, bool], dict[Any, Any], dict[Any, Any]]:
     """
     Returns a per column mapping for the values to be replaced, new
     values to be replaced with and if all the values are empty.
@@ -6354,9 +6346,9 @@ def _get_replacement_values_for_columns(
         A dict mapping of all columns and the corresponding values
         to be replaced with.
     """
-    to_replace_columns: Dict[Any, Any] = {}
-    values_columns: Dict[Any, Any] = {}
-    all_na_columns: Dict[Any, Any] = {}
+    to_replace_columns: dict[Any, Any] = {}
+    values_columns: dict[Any, Any] = {}
+    all_na_columns: dict[Any, Any] = {}
 
     if is_scalar(to_replace) and is_scalar(value):
         to_replace_columns = {col: [to_replace] for col in columns_dtype_map}
@@ -6496,8 +6488,8 @@ def _is_series(obj):
 @_cudf_nvtx_annotate
 def _drop_rows_by_labels(
     obj: DataFrameOrSeries,
-    labels: Union[ColumnLike, abc.Iterable, str],
-    level: Union[int, str],
+    labels: ColumnLike | abc.Iterable | str,
+    level: int | str,
     errors: str,
 ) -> DataFrameOrSeries:
     """Remove rows specified by `labels`.
diff --git a/python/cudf/cudf/core/indexing_utils.py b/python/cudf/cudf/core/indexing_utils.py
index 7242de9964f..73a1cd26367 100644
--- a/python/cudf/cudf/core/indexing_utils.py
+++ b/python/cudf/cudf/core/indexing_utils.py
@@ -1,9 +1,9 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 
 from __future__ import annotations
 
 from dataclasses import dataclass
-from typing import Any, List, Tuple, Union
+from typing import Any, List, Union
 
 from typing_extensions import TypeAlias
 
@@ -59,7 +59,7 @@ class ScalarIndexer:
 
 
 def destructure_iloc_key(
-    key: Any, frame: Union[cudf.Series, cudf.DataFrame]
+    key: Any, frame: cudf.Series | cudf.DataFrame
 ) -> tuple[Any, ...]:
     """
     Destructure a potentially tuple-typed key into row and column indexers.
@@ -124,7 +124,7 @@ def destructure_iloc_key(
 
 def destructure_dataframe_iloc_indexer(
     key: Any, frame: cudf.DataFrame
-) -> Tuple[Any, Tuple[bool, ColumnLabels]]:
+) -> tuple[Any, tuple[bool, ColumnLabels]]:
     """Destructure an index key for DataFrame iloc getitem.
 
     Parameters
diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py
index 05cbb4429b9..dd0a4f666a1 100644
--- a/python/cudf/cudf/core/join/_join_helpers.py
+++ b/python/cudf/cudf/core/join/_join_helpers.py
@@ -4,7 +4,7 @@
 
 import warnings
 from collections import abc
-from typing import TYPE_CHECKING, Any, Tuple, cast
+from typing import TYPE_CHECKING, Any, cast
 
 import numpy as np
 
@@ -51,7 +51,7 @@ def set(self, obj: cudf.DataFrame, value: ColumnBase, validate=False):
 
 def _match_join_keys(
     lcol: ColumnBase, rcol: ColumnBase, how: str
-) -> Tuple[ColumnBase, ColumnBase]:
+) -> tuple[ColumnBase, ColumnBase]:
     # Casts lcol and rcol to a common dtype for use as join keys. If no casting
     # is necessary, they are returned as is.
 
@@ -133,7 +133,7 @@ def _match_join_keys(
 
 def _match_categorical_dtypes_both(
     lcol: CategoricalColumn, rcol: CategoricalColumn, how: str
-) -> Tuple[ColumnBase, ColumnBase]:
+) -> tuple[ColumnBase, ColumnBase]:
     ltype, rtype = lcol.dtype, rcol.dtype
 
     # when both are ordered and both have the same categories,
diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py
index da999441ca3..ce81c1fc5b1 100644
--- a/python/cudf/cudf/core/join/join.py
+++ b/python/cudf/cudf/core/join/join.py
@@ -2,7 +2,7 @@
 from __future__ import annotations
 
 import itertools
-from typing import Any, ClassVar, List, Optional
+from typing import Any, ClassVar
 
 import cudf
 from cudf import _lib as libcudf
@@ -370,7 +370,7 @@ def _merge_results(
         else:
             multiindex_columns = False
 
-        index: Optional[cudf.BaseIndex]
+        index: cudf.BaseIndex | None
         if self._using_right_index:
             # right_index and left_on
             index = left_result.index
@@ -398,7 +398,7 @@ def _sort_result(self, result: cudf.DataFrame) -> cudf.DataFrame:
         # This is taken care of by using a stable sort here, and (in
         # pandas-compat mode) reordering the gather maps before
         # producing the input result.
-        by: List[Any] = []
+        by: list[Any] = []
         if self._using_left_index and self._using_right_index:
             by.extend(result.index._data.columns)
         if not self._using_left_index:
diff --git a/python/cudf/cudf/core/mixins/binops.pyi b/python/cudf/cudf/core/mixins/binops.pyi
index 8587b2dea48..6be73e25332 100644
--- a/python/cudf/cudf/core/mixins/binops.pyi
+++ b/python/cudf/cudf/core/mixins/binops.pyi
@@ -1,12 +1,12 @@
 # Copyright (c) 2022, NVIDIA CORPORATION.
 
-from typing import Any, Set, Tuple, TypeVar
+from typing import Any, TypeVar
 
 # Note: It may be possible to define a narrower bound here eventually.
 BinaryOperandType = TypeVar("BinaryOperandType", bound="Any")
 
 class BinaryOperand:
-    _SUPPORTED_BINARY_OPERATIONS: Set
+    _SUPPORTED_BINARY_OPERATIONS: set
 
     def _binaryop(self, other: BinaryOperandType, op: str): ...
     def __add__(self, other): ...
@@ -36,4 +36,4 @@ class BinaryOperand:
     def __gt__(self, other): ...
     def __ge__(self, other): ...
     @staticmethod
-    def _check_reflected_op(op) -> Tuple[bool, str]: ...
+    def _check_reflected_op(op) -> tuple[bool, str]: ...
diff --git a/python/cudf/cudf/core/mixins/reductions.pyi b/python/cudf/cudf/core/mixins/reductions.pyi
index dbaafdb5cd2..1c2126002ad 100644
--- a/python/cudf/cudf/core/mixins/reductions.pyi
+++ b/python/cudf/cudf/core/mixins/reductions.pyi
@@ -1,9 +1,7 @@
 # Copyright (c) 2022, NVIDIA CORPORATION.
 
-from typing import Set
-
 class Reducible:
-    _SUPPORTED_REDUCTIONS: Set
+    _SUPPORTED_REDUCTIONS: set
 
     def sum(self): ...
     def product(self): ...
diff --git a/python/cudf/cudf/core/mixins/scans.pyi b/python/cudf/cudf/core/mixins/scans.pyi
index 37995241b1f..5190750c698 100644
--- a/python/cudf/cudf/core/mixins/scans.pyi
+++ b/python/cudf/cudf/core/mixins/scans.pyi
@@ -1,9 +1,7 @@
 # Copyright (c) 2022, NVIDIA CORPORATION.
 
-from typing import Set
-
 class Scannable:
-    _SUPPORTED_SCANS: Set
+    _SUPPORTED_SCANS: set
 
     def cumsum(self): ...
     def cumprod(self): ...
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index 91488e06f4e..832cc003d2e 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -10,7 +10,7 @@
 from collections import abc
 from functools import cached_property
 from numbers import Integral
-from typing import TYPE_CHECKING, Any, List, MutableMapping, Tuple, Union
+from typing import TYPE_CHECKING, Any, MutableMapping
 
 import cupy as cp
 import numpy as np
@@ -40,7 +40,7 @@
     from cudf._typing import DataFrameOrSeries
 
 
-def _maybe_indices_to_slice(indices: cp.ndarray) -> Union[slice, cp.ndarray]:
+def _maybe_indices_to_slice(indices: cp.ndarray) -> slice | cp.ndarray:
     """Makes best effort to convert an array of indices into a python slice.
     If the conversion is not possible, return input. `indices` are expected
     to be valid.
@@ -849,9 +849,10 @@ def _index_and_downcast(self, result, index, index_key):
     def _get_row_major(
         self,
         df: DataFrameOrSeries,
-        row_tuple: Union[
-            numbers.Number, slice, Tuple[Any, ...], List[Tuple[Any, ...]]
-        ],
+        row_tuple: numbers.Number
+        | slice
+        | tuple[Any, ...]
+        | list[tuple[Any, ...]],
     ) -> DataFrameOrSeries:
         if pd.api.types.is_bool_dtype(
             list(row_tuple) if isinstance(row_tuple, tuple) else row_tuple
@@ -874,9 +875,10 @@ def _get_row_major(
     @_cudf_nvtx_annotate
     def _validate_indexer(
         self,
-        indexer: Union[
-            numbers.Number, slice, Tuple[Any, ...], List[Tuple[Any, ...]]
-        ],
+        indexer: numbers.Number
+        | slice
+        | tuple[Any, ...]
+        | list[tuple[Any, ...]],
     ):
         if isinstance(indexer, numbers.Number):
             return
diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py
index 53239cb7ea0..903c4fe7df5 100644
--- a/python/cudf/cudf/core/reshape.py
+++ b/python/cudf/cudf/core/reshape.py
@@ -1,8 +1,9 @@
 # Copyright (c) 2018-2024, NVIDIA CORPORATION.
+from __future__ import annotations
 
 import itertools
 import warnings
-from typing import Dict, Optional
+from typing import TYPE_CHECKING
 
 import numpy as np
 import pandas as pd
@@ -10,13 +11,15 @@
 import cudf
 from cudf._lib.transform import one_hot_encode
 from cudf._lib.types import size_type_dtype
-from cudf._typing import Dtype
 from cudf.api.extensions import no_default
 from cudf.core._compat import PANDAS_LT_300
 from cudf.core.column import ColumnBase, as_column, column_empty_like
 from cudf.core.column.categorical import CategoricalColumn
 from cudf.utils.dtypes import min_unsigned_type
 
+if TYPE_CHECKING:
+    from cudf._typing import Dtype
+
 _AXIS_MAP = {0: 0, 1: 1, "index": 0, "columns": 1}
 
 
@@ -1217,10 +1220,10 @@ def _get_unique(column, dummy_na):
 def _one_hot_encode_column(
     column: ColumnBase,
     categories: ColumnBase,
-    prefix: Optional[str],
-    prefix_sep: Optional[str],
-    dtype: Optional[Dtype],
-) -> Dict[str, ColumnBase]:
+    prefix: str | None,
+    prefix_sep: str | None,
+    dtype: Dtype | None,
+) -> dict[str, ColumnBase]:
     """Encode a single column with one hot encoding. The return dictionary
     contains pairs of (category, encodings). The keys may be prefixed with
     `prefix`, separated with category name with `prefix_sep`. The encoding
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index ebf6910ca5f..e532948fd11 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -9,17 +9,7 @@
 import warnings
 from collections import abc
 from shutil import get_terminal_size
-from typing import (
-    TYPE_CHECKING,
-    Any,
-    Dict,
-    Literal,
-    MutableMapping,
-    Optional,
-    Set,
-    Tuple,
-    Union,
-)
+from typing import TYPE_CHECKING, Any, Literal, MutableMapping
 
 import cupy
 import numpy as np
@@ -285,7 +275,7 @@ class _SeriesLocIndexer(_FrameIndexer):
     """
 
     @_cudf_nvtx_annotate
-    def __getitem__(self, arg: Any) -> Union[ScalarLike, DataFrameOrSeries]:
+    def __getitem__(self, arg: Any) -> ScalarLike | DataFrameOrSeries:
         if isinstance(arg, pd.MultiIndex):
             arg = cudf.from_pandas(arg)
 
@@ -464,7 +454,7 @@ class Series(SingleColumnFrame, IndexedFrame, Serializable):
         If ``False``, leaves ``np.nan`` values as is.
     """
 
-    _accessors: Set[Any] = set()
+    _accessors: set[Any] = set()
     _loc_indexer_type = _SeriesLocIndexer
     _iloc_indexer_type = _SeriesIlocIndexer
     _groupby = SeriesGroupBy
@@ -677,7 +667,7 @@ def __init__(
     def _from_data(
         cls,
         data: MutableMapping,
-        index: Optional[BaseIndex] = None,
+        index: BaseIndex | None = None,
         name: Any = no_default,
     ) -> Series:
         out = super()._from_data(data=data, index=index)
@@ -1311,7 +1301,7 @@ def map(self, arg, na_action=None) -> "Series":
     def _getitem_preprocessed(
         self,
         spec: indexing_utils.IndexingSpec,
-    ) -> Union[Self, ScalarLike]:
+    ) -> Self | ScalarLike:
         """Get subset of entries given structured data
 
         Parameters
@@ -1473,12 +1463,10 @@ def _make_operands_and_index_for_binop(
         fill_value: Any = None,
         reflect: bool = False,
         can_reindex: bool = False,
-    ) -> Tuple[
-        Union[
-            Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]],
-            NotImplementedType,
-        ],
-        Optional[BaseIndex],
+    ) -> tuple[
+        dict[str | None, tuple[ColumnBase, Any, bool, Any]]
+        | NotImplementedType,
+        BaseIndex | None,
         bool,
     ]:
         # Specialize binops to align indices.
diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py
index 43b5dc76f13..23a2c828a04 100644
--- a/python/cudf/cudf/core/single_column_frame.py
+++ b/python/cudf/cudf/core/single_column_frame.py
@@ -3,7 +3,7 @@
 
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Any
 
 from typing_extensions import Self
 
@@ -274,10 +274,10 @@ def _make_operands_for_binop(
         other: Any,
         fill_value: Any = None,
         reflect: bool = False,
-    ) -> Union[
-        Dict[Optional[str], Tuple[ColumnBase, Any, bool, Any]],
-        NotImplementedType,
-    ]:
+    ) -> (
+        dict[str | None, tuple[ColumnBase, Any, bool, Any]]
+        | NotImplementedType
+    ):
         """Generate the dictionary of operands used for a binary operation.
 
         Parameters
@@ -340,7 +340,7 @@ def nunique(self, dropna: bool = True) -> int:
         """
         return self._column.distinct_count(dropna=dropna)
 
-    def _get_elements_from_column(self, arg) -> Union[ScalarLike, ColumnBase]:
+    def _get_elements_from_column(self, arg) -> ScalarLike | ColumnBase:
         # A generic method for getting elements from a column that supports a
         # wide range of different inputs. This method should only used where
         # _absolutely_ necessary, since in almost all cases a more specific
diff --git a/python/cudf/cudf/core/subword_tokenizer.py b/python/cudf/cudf/core/subword_tokenizer.py
index 24c49e3662a..9e59b134b73 100644
--- a/python/cudf/cudf/core/subword_tokenizer.py
+++ b/python/cudf/cudf/core/subword_tokenizer.py
@@ -3,7 +3,6 @@
 from __future__ import annotations
 
 import warnings
-from typing import Union
 
 import cupy as cp
 
@@ -60,7 +59,7 @@ def __call__(
         max_num_rows: int,
         add_special_tokens: bool = True,
         padding: str = "max_length",
-        truncation: Union[bool, str] = False,
+        truncation: bool | str = False,
         stride: int = 0,
         return_tensors: str = "cp",
         return_token_type_ids: bool = False,
diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py
index f002a838fa9..29130130732 100644
--- a/python/cudf/cudf/core/tools/datetimes.py
+++ b/python/cudf/cudf/core/tools/datetimes.py
@@ -1,9 +1,10 @@
 # Copyright (c) 2019-2024, NVIDIA CORPORATION.
+from __future__ import annotations
 
 import math
 import re
 import warnings
-from typing import Literal, Optional, Sequence, Union
+from typing import Literal, Sequence
 
 import cupy as cp
 import numpy as np
@@ -61,7 +62,7 @@ def to_datetime(
     dayfirst: bool = False,
     yearfirst: bool = False,
     utc: bool = False,
-    format: Optional[str] = None,
+    format: str | None = None,
     exact: bool = True,
     unit: str = "ns",
     infer_datetime_format: bool = True,
@@ -313,7 +314,7 @@ def _process_col(
     unit: str,
     dayfirst: bool,
     infer_datetime_format: bool,
-    format: Optional[str],
+    format: str | None,
     utc: bool,
 ):
     if col.dtype.kind == "f":
@@ -707,7 +708,7 @@ def _from_freqstr(cls, freqstr: str) -> Self:
     @classmethod
     def _from_pandas_ticks_or_weeks(
         cls,
-        tick: Union[pd.tseries.offsets.Tick, pd.tseries.offsets.Week],
+        tick: pd.tseries.offsets.Tick | pd.tseries.offsets.Week,
     ) -> Self:
         return cls(**{cls._TICK_OR_WEEK_TO_UNITS[type(tick)]: tick.n})
 
@@ -725,7 +726,7 @@ def _maybe_as_fast_pandas_offset(self):
 
 
 def _isin_datetimelike(
-    lhs: Union[column.TimeDeltaColumn, column.DatetimeColumn], values: Sequence
+    lhs: column.TimeDeltaColumn | column.DatetimeColumn, values: Sequence
 ) -> column.ColumnBase:
     """
     Check whether values are contained in the
@@ -784,7 +785,7 @@ def date_range(
     name=None,
     closed: Literal["left", "right", "both", "neither"] = "both",
     *,
-    unit: Optional[str] = None,
+    unit: str | None = None,
 ):
     """Return a fixed frequency DatetimeIndex.
 
diff --git a/python/cudf/cudf/core/udf/groupby_typing.py b/python/cudf/cudf/core/udf/groupby_typing.py
index 72088493074..dffd7db2f71 100644
--- a/python/cudf/cudf/core/udf/groupby_typing.py
+++ b/python/cudf/cudf/core/udf/groupby_typing.py
@@ -1,5 +1,7 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
-from typing import Any, Dict
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+from __future__ import annotations
+
+from typing import Any
 
 import numba
 from numba import cuda, types
@@ -124,7 +126,7 @@ def __init__(self, dmm, fe_type):
         super().__init__(dmm, fe_type, members)
 
 
-call_cuda_functions: Dict[Any, Any] = {}
+call_cuda_functions: dict[Any, Any] = {}
 
 
 def _register_cuda_binary_reduction_caller(funcname, lty, rty, retty):
diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py
index bc1f4f2557e..f1704e4ea78 100644
--- a/python/cudf/cudf/core/udf/utils.py
+++ b/python/cudf/cudf/core/udf/utils.py
@@ -1,8 +1,9 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
+from __future__ import annotations
 
 import functools
 import os
-from typing import Any, Callable, Dict
+from typing import Any, Callable
 
 import cachetools
 import cupy as cp
@@ -57,7 +58,7 @@
 MASK_BITSIZE = np.dtype("int32").itemsize * 8
 
 precompiled: cachetools.LRUCache = cachetools.LRUCache(maxsize=32)
-launch_arg_getters: Dict[Any, Any] = {}
+launch_arg_getters: dict[Any, Any] = {}
 
 
 @functools.cache
diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
index dbdb2093b72..58b104b84e9 100644
--- a/python/cudf/cudf/io/parquet.py
+++ b/python/cudf/cudf/io/parquet.py
@@ -10,7 +10,7 @@
 from collections import defaultdict
 from contextlib import ExitStack
 from functools import partial, reduce
-from typing import Callable, Dict, List, Optional, Tuple
+from typing import Callable
 from uuid import uuid4
 
 import numpy as np
@@ -679,7 +679,7 @@ def read_parquet(
     return df
 
 
-def _normalize_filters(filters: list | None) -> List[List[tuple]] | None:
+def _normalize_filters(filters: list | None) -> list[list[tuple]] | None:
     # Utility to normalize and validate the `filters`
     # argument to `read_parquet`
     if not filters:
@@ -709,7 +709,7 @@ def _validate_predicate(item):
 
 
 def _apply_post_filters(
-    df: cudf.DataFrame, filters: List[List[tuple]] | None
+    df: cudf.DataFrame, filters: list[list[tuple]] | None
 ) -> cudf.DataFrame:
     """Apply DNF filters to an in-memory DataFrame
 
@@ -738,7 +738,7 @@ def _handle_is(column: cudf.Series, value, *, negate) -> cudf.Series:
             )
         return ~column.isna() if negate else column.isna()
 
-    handlers: Dict[str, Callable] = {
+    handlers: dict[str, Callable] = {
         "==": operator.eq,
         "!=": operator.ne,
         "<": operator.lt,
@@ -1311,7 +1311,7 @@ def __init__(
     ) -> None:
         if isinstance(path, str) and path.startswith("s3://"):
             self.fs_meta = {"is_s3": True, "actual_path": path}
-            self.dir_: Optional[tempfile.TemporaryDirectory] = (
+            self.dir_: tempfile.TemporaryDirectory | None = (
                 tempfile.TemporaryDirectory()
             )
             self.path = self.dir_.name
@@ -1328,12 +1328,12 @@ def __init__(
         self.partition_cols = partition_cols
         # Collection of `ParquetWriter`s, and the corresponding
         # partition_col values they're responsible for
-        self._chunked_writers: List[
-            Tuple[libparquet.ParquetWriter, List[str], str]
+        self._chunked_writers: list[
+            tuple[libparquet.ParquetWriter, list[str], str]
         ] = []
         # Map of partition_col values to their ParquetWriter's index
         # in self._chunked_writers for reverse lookup
-        self.path_cw_map: Dict[str, int] = {}
+        self.path_cw_map: dict[str, int] = {}
         self.storage_options = storage_options
         self.filename = file_name_prefix
         self.max_file_size = max_file_size
@@ -1345,7 +1345,7 @@ def __init__(
                 )
             self.max_file_size = _parse_bytes(max_file_size)
 
-        self._file_sizes: Dict[str, int] = {}
+        self._file_sizes: dict[str, int] = {}
 
     @_cudf_nvtx_annotate
     def write_table(self, df):
diff --git a/python/cudf/cudf/options.py b/python/cudf/cudf/options.py
index efa8eabd8b8..fb5a963f008 100644
--- a/python/cudf/cudf/options.py
+++ b/python/cudf/cudf/options.py
@@ -1,11 +1,14 @@
 # Copyright (c) 2022-2024, NVIDIA CORPORATION.
+from __future__ import annotations
 
 import os
 import textwrap
-from collections.abc import Container
 from contextlib import ContextDecorator
 from dataclasses import dataclass
-from typing import Any, Callable, Dict, Optional
+from typing import TYPE_CHECKING, Any, Callable
+
+if TYPE_CHECKING:
+    from collections.abc import Container
 
 
 @dataclass
@@ -16,7 +19,7 @@ class Option:
     validator: Callable
 
 
-_OPTIONS: Dict[str, Option] = {}
+_OPTIONS: dict[str, Option] = {}
 
 
 def _env_get_int(name, default):
@@ -123,7 +126,7 @@ def _build_option_description(name, opt):
     )
 
 
-def describe_option(name: Optional[str] = None):
+def describe_option(name: str | None = None):
     """Prints the description of an option.
 
     If `name` is unspecified, prints the description of all available options.
diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py
index 128913e5746..1540c6850e7 100644
--- a/python/cudf/cudf/pandas/fast_slow_proxy.py
+++ b/python/cudf/cudf/pandas/fast_slow_proxy.py
@@ -12,17 +12,7 @@
 import warnings
 from collections.abc import Iterator
 from enum import IntEnum
-from typing import (
-    Any,
-    Callable,
-    Dict,
-    Literal,
-    Mapping,
-    Optional,
-    Set,
-    Tuple,
-    Type,
-)
+from typing import Any, Callable, Literal, Mapping
 
 import numpy as np
 
@@ -118,12 +108,12 @@ def make_final_proxy_type(
     *,
     fast_to_slow: Callable,
     slow_to_fast: Callable,
-    module: Optional[str] = None,
+    module: str | None = None,
     additional_attributes: Mapping[str, Any] | None = None,
     postprocess: Callable[[_FinalProxy, Any, Any], Any] | None = None,
-    bases: Tuple = (),
-    metaclasses: Tuple = (),
-) -> Type[_FinalProxy]:
+    bases: tuple = (),
+    metaclasses: tuple = (),
+) -> type[_FinalProxy]:
     """
     Defines a fast-slow proxy type for a pair of "final" fast and slow
     types. Final types are types for which known operations exist for
@@ -270,8 +260,8 @@ def make_intermediate_proxy_type(
     fast_type: type,
     slow_type: type,
     *,
-    module: Optional[str] = None,
-) -> Type[_IntermediateProxy]:
+    module: str | None = None,
+) -> type[_IntermediateProxy]:
     """
     Defines a proxy type for a pair of "intermediate" fast and slow
     types. Intermediate types are the types of the results of
@@ -613,13 +603,13 @@ class _IntermediateProxy(_FastSlowProxy):
     `make_intermediate_proxy_type` to create subtypes.
     """
 
-    _method_chain: Tuple[Callable, Tuple, Dict]
+    _method_chain: tuple[Callable, tuple, dict]
 
     @classmethod
     def _fsproxy_wrap(
         cls,
         obj: Any,
-        method_chain: Tuple[Callable, Tuple, Dict],
+        method_chain: tuple[Callable, tuple, dict],
     ):
         """
         Parameters
@@ -955,7 +945,7 @@ def _fast_slow_function_call(
 def _transform_arg(
     arg: Any,
     attribute_name: Literal["_fsproxy_slow", "_fsproxy_fast"],
-    seen: Set[int],
+    seen: set[int],
 ) -> Any:
     """
     Transform "arg" into its corresponding slow (or fast) type.
@@ -1052,7 +1042,7 @@ def _fast_arg(arg: Any) -> Any:
     """
     Transform "arg" into its corresponding fast type.
     """
-    seen: Set[int] = set()
+    seen: set[int] = set()
     return _transform_arg(arg, "_fsproxy_fast", seen)
 
 
@@ -1060,7 +1050,7 @@ def _slow_arg(arg: Any) -> Any:
     """
     Transform "arg" into its corresponding slow type.
     """
-    seen: Set[int] = set()
+    seen: set[int] = set()
     return _transform_arg(arg, "_fsproxy_slow", seen)
 
 
@@ -1137,7 +1127,7 @@ def _is_function_or_method(obj: Any) -> bool:
 def _replace_closurevars(
     f: types.FunctionType,
     attribute_name: Literal["_fsproxy_slow", "_fsproxy_fast"],
-    seen: Set[int],
+    seen: set[int],
 ) -> Callable[..., Any]:
     """
     Return a copy of `f` with its closure variables replaced with
@@ -1199,10 +1189,10 @@ def is_proxy_object(obj: Any) -> bool:
     return False
 
 
-NUMPY_TYPES: Set[str] = set(np.sctypeDict.values())
+NUMPY_TYPES: set[str] = set(np.sctypeDict.values())
 
 
-_SPECIAL_METHODS: Set[str] = {
+_SPECIAL_METHODS: set[str] = {
     "__abs__",
     "__add__",
     "__and__",
diff --git a/python/cudf/cudf/pandas/module_accelerator.py b/python/cudf/cudf/pandas/module_accelerator.py
index 1d431c6d882..f82e300e83d 100644
--- a/python/cudf/cudf/pandas/module_accelerator.py
+++ b/python/cudf/cudf/pandas/module_accelerator.py
@@ -17,7 +17,7 @@
 from abc import abstractmethod
 from importlib._bootstrap import _ImportLockContext as ImportLock
 from types import ModuleType
-from typing import Any, ContextManager, Dict, NamedTuple, Tuple
+from typing import Any, ContextManager, NamedTuple
 
 from typing_extensions import Self
 
@@ -377,7 +377,7 @@ class ModuleAccelerator(ModuleAcceleratorBase):
     attempts to call the fast version first).
     """
 
-    _denylist: Tuple[str]
+    _denylist: tuple[str]
     _use_fast_lib: bool
     _use_fast_lib_lock: threading.RLock
     _module_cache_prefix: str = "_slow_lib_"
@@ -519,7 +519,7 @@ def disabled(self):
     def getattr_real_or_wrapped(
         name: str,
         *,
-        real: Dict[str, Any],
+        real: dict[str, Any],
         wrapped_objs,
         loader: ModuleAccelerator,
     ) -> Any:
diff --git a/python/cudf/cudf/pandas/profiler.py b/python/cudf/cudf/pandas/profiler.py
index 0dbd333ce4f..0fb41fc0b26 100644
--- a/python/cudf/cudf/pandas/profiler.py
+++ b/python/cudf/cudf/pandas/profiler.py
@@ -1,6 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES.
 # All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
 
 import inspect
 import operator
@@ -8,7 +9,6 @@
 import sys
 import time
 from collections import defaultdict
-from typing import Union
 
 from rich.console import Console
 from rich.syntax import Syntax
@@ -119,12 +119,10 @@ def __exit__(self, *args, **kwargs):
 
     @staticmethod
     def get_namespaced_function_name(
-        func_obj: Union[
-            _FunctionProxy,
-            _MethodProxy,
-            type[_FinalProxy],
-            type[_IntermediateProxy],
-        ],
+        func_obj: _FunctionProxy
+        | _MethodProxy
+        | type[_FinalProxy]
+        | type[_IntermediateProxy],
     ):
         if isinstance(func_obj, _MethodProxy):
             return func_obj._fsproxy_slow.__qualname__
diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py
index 54d38f1a8cf..bf927e661fe 100644
--- a/python/cudf/cudf/pylibcudf_tests/common/utils.py
+++ b/python/cudf/cudf/pylibcudf_tests/common/utils.py
@@ -1,6 +1,5 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
-
-from typing import Optional, Union
+from __future__ import annotations
 
 import pyarrow as pa
 import pytest
@@ -10,7 +9,7 @@
 
 def metadata_from_arrow_array(
     pa_array: pa.Array,
-) -> Optional[plc.interop.ColumnMetadata]:
+) -> plc.interop.ColumnMetadata | None:
     metadata = None
     if pa.types.is_list(dtype := pa_array.type) or pa.types.is_struct(dtype):
         metadata = plc.interop.ColumnMetadata(
@@ -25,7 +24,7 @@ def metadata_from_arrow_array(
 
 
 def assert_column_eq(
-    lhs: Union[pa.Array, plc.Column], rhs: Union[pa.Array, plc.Column]
+    lhs: pa.Array | plc.Column, rhs: pa.Array | plc.Column
 ) -> None:
     """Verify that a pylibcudf array and PyArrow array are equal."""
     # Nested types require children metadata to be passed to the conversion function.
diff --git a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py
index 0e38b10ed52..238e8d990cc 100644
--- a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py
+++ b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py
@@ -11,10 +11,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import annotations
+
 import datetime
 import io
 import pathlib
-from typing import Optional
 
 import fastavro
 import numpy as np
@@ -292,7 +293,7 @@ def test_can_detect_dtypes_from_avro_logical_type(
     assert_eq(expected, actual)
 
 
-def get_days_from_epoch(date: Optional[datetime.date]) -> Optional[int]:
+def get_days_from_epoch(date: datetime.date | None) -> int | None:
     if date is None:
         return None
     return (date - datetime.date(1970, 1, 1)).days
diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py
index a22b678ebe6..8ce4da792a4 100644
--- a/python/cudf/cudf/tests/test_df_protocol.py
+++ b/python/cudf/cudf/tests/test_df_protocol.py
@@ -1,6 +1,7 @@
 # Copyright (c) 2021-2024, NVIDIA CORPORATION.
+from __future__ import annotations
 
-from typing import Any, Tuple
+from typing import Any
 
 import cupy as cp
 import pandas as pd
@@ -64,7 +65,7 @@ def assert_validity_equal(protocol_buffer, cudf_buffer, size, null, valid):
         raise NotImplementedError()
 
 
-def assert_buffer_equal(buffer_and_dtype: Tuple[_CuDFBuffer, Any], cudfcol):
+def assert_buffer_equal(buffer_and_dtype: tuple[_CuDFBuffer, Any], cudfcol):
     buf, dtype = buffer_and_dtype
     device_id = cp.asarray(cudfcol.data).device.id
     assert buf.__dlpack_device__() == (2, device_id)
diff --git a/python/cudf/cudf/tests/test_spilling.py b/python/cudf/cudf/tests/test_spilling.py
index 913a958b4c2..59b8e6d2e70 100644
--- a/python/cudf/cudf/tests/test_spilling.py
+++ b/python/cudf/cudf/tests/test_spilling.py
@@ -1,4 +1,5 @@
 # Copyright (c) 2022-2024, NVIDIA CORPORATION.
+from __future__ import annotations
 
 import contextlib
 import importlib
@@ -7,7 +8,6 @@
 import warnings
 import weakref
 from concurrent.futures import ThreadPoolExecutor
-from typing import List, Tuple
 
 import cupy
 import numpy as np
@@ -107,7 +107,7 @@ def single_column_df_base_data(df: cudf.DataFrame) -> SpillableBuffer:
 gen_df_data_nbytes = single_column_df()._data._data["a"].data.nbytes
 
 
-def spilled_and_unspilled(manager: SpillManager) -> Tuple[int, int]:
+def spilled_and_unspilled(manager: SpillManager) -> tuple[int, int]:
     """Get bytes spilled and unspilled known by the manager"""
     spilled = sum(buf.size for buf in manager.buffers() if buf.is_spilled)
     unspilled = sum(
@@ -661,7 +661,7 @@ def test_statistics(manager: SpillManager):
 def test_statistics_expose(manager: SpillManager):
     assert len(manager.statistics.spill_totals) == 0
 
-    buffers: List[SpillableBuffer] = [
+    buffers: list[SpillableBuffer] = [
         as_buffer(data=rmm.DeviceBuffer(size=10), exposed=False)
         for _ in range(10)
     ]
@@ -687,7 +687,7 @@ def test_statistics_expose(manager: SpillManager):
     assert stat.spilled_nbytes == 0
 
     # Create and spill 10 new buffers
-    buffers: List[SpillableBuffer] = [
+    buffers: list[SpillableBuffer] = [
         as_buffer(data=rmm.DeviceBuffer(size=10), exposed=False)
         for _ in range(10)
     ]
diff --git a/python/cudf/cudf/utils/applyutils.py b/python/cudf/cudf/utils/applyutils.py
index d57303ca122..cd7fe5ee023 100644
--- a/python/cudf/cudf/utils/applyutils.py
+++ b/python/cudf/cudf/utils/applyutils.py
@@ -1,7 +1,8 @@
 # Copyright (c) 2018-2024, NVIDIA CORPORATION.
+from __future__ import annotations
 
 import functools
-from typing import Any, Dict
+from typing import Any
 
 import cupy as cp
 from numba import cuda
@@ -339,7 +340,7 @@ def chunk_wise_kernel(nrows, chunks, {args}):
     return kernel
 
 
-_cache: Dict[Any, Any] = dict()
+_cache: dict[Any, Any] = dict()
 
 
 @functools.wraps(_make_row_wise_kernel)
diff --git a/python/cudf/cudf/utils/queryutils.py b/python/cudf/cudf/utils/queryutils.py
index 239438afd24..78aeac425f7 100644
--- a/python/cudf/cudf/utils/queryutils.py
+++ b/python/cudf/cudf/utils/queryutils.py
@@ -1,8 +1,9 @@
-# Copyright (c) 2018-2023, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
+from __future__ import annotations
 
 import ast
 import datetime
-from typing import Any, Dict
+from typing import Any
 
 import numpy as np
 from numba import cuda
@@ -114,7 +115,7 @@ def _check_error(tree):
         raise QuerySyntaxError("too many expressions")
 
 
-_cache: Dict[Any, Any] = {}
+_cache: dict[Any, Any] = {}
 
 
 def query_compile(expr):
diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py
index 95621cf9519..2e4dfc4bb14 100644
--- a/python/cudf/cudf/utils/utils.py
+++ b/python/cudf/cudf/utils/utils.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
+from __future__ import annotations
 
 import decimal
 import functools
 import os
 import traceback
 import warnings
-from typing import FrozenSet, Set, Union
 
 import numpy as np
 import pandas as pd
@@ -218,7 +218,7 @@ class GetAttrGetItemMixin:
     # `__setstate__`, but this class may be used in complex multiple
     # inheritance hierarchies that might also override serialization.  The
     # solution here is a minimally invasive change that avoids such conflicts.
-    _PROTECTED_KEYS: Union[FrozenSet[str], Set[str]] = frozenset()
+    _PROTECTED_KEYS: frozenset[str] | set[str] = frozenset()
 
     def __getattr__(self, key):
         if key in self._PROTECTED_KEYS:
diff --git a/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py b/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py
index 39bf07c49de..a75a20a4681 100644
--- a/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py
+++ b/python/cudf/cudf_pandas_tests/test_fast_slow_proxy.py
@@ -1,6 +1,7 @@
 # SPDX-FileCopyrightText: Copyright (c) 2023-2024 NVIDIA CORPORATION & AFFILIATES.
 # All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
 
 import inspect
 from functools import partial
diff --git a/python/dask_cudf/dask_cudf/groupby.py b/python/dask_cudf/dask_cudf/groupby.py
index ef47ea436c7..2e72461b43d 100644
--- a/python/dask_cudf/dask_cudf/groupby.py
+++ b/python/dask_cudf/dask_cudf/groupby.py
@@ -1,7 +1,7 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
+from __future__ import annotations
 
 from functools import wraps
-from typing import Set
 
 import numpy as np
 import pandas as pd
@@ -695,7 +695,7 @@ def _aggs_optimized(arg, supported: set):
     """Check that aggregations in `arg` are a subset of `supported`"""
     if isinstance(arg, (list, dict)):
         if isinstance(arg, dict):
-            _global_set: Set[str] = set()
+            _global_set: set[str] = set()
             for col in arg:
                 if isinstance(arg[col], list):
                     _global_set = _global_set.union(set(arg[col]))

From 6ff4b4b27e7cc9b750146626531dca0a2a5307c4 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 17 Jun 2024 09:00:13 -1000
Subject: [PATCH 375/842] Standardize and type `Series.dt` methods (#15987)

Most of these operate on the underlying column and return `Series` with the same `index` and `name` as the input, so standardized how these methods construct the result from the output column

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15987
---
 python/cudf/cudf/core/column/datetime.py  |  10 +
 python/cudf/cudf/core/column/timedelta.py |   8 +-
 python/cudf/cudf/core/index.py            |  12 +-
 python/cudf/cudf/core/series.py           | 246 +++++++++-------------
 python/cudf/cudf/core/tools/datetimes.py  |  19 --
 5 files changed, 120 insertions(+), 175 deletions(-)

diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index 7fdebda7d76..9ac761b6be1 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -377,6 +377,16 @@ def floor(self, freq: str) -> ColumnBase:
     def round(self, freq: str) -> ColumnBase:
         return libcudf.datetime.round_datetime(self, freq)
 
+    def isocalendar(self) -> dict[str, ColumnBase]:
+        return {
+            field: self.as_string_column("str", format=directive).astype(
+                "uint32"
+            )
+            for field, directive in zip(
+                ["year", "week", "day"], ["%G", "%V", "%u"]
+            )
+        }
+
     def normalize_binop_value(self, other: DatetimeLikeScalar) -> ScalarLike:
         if isinstance(other, (cudf.Scalar, ColumnBase, cudf.DateOffset)):
             return other
diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py
index 8eec84b64f7..26b449f1863 100644
--- a/python/cudf/cudf/core/column/timedelta.py
+++ b/python/cudf/cudf/core/column/timedelta.py
@@ -392,7 +392,7 @@ def corr(self, other: TimeDeltaColumn) -> float:
             other.as_numerical_column("int64")
         )
 
-    def components(self, index=None) -> "cudf.DataFrame":
+    def components(self) -> dict[str, ColumnBase]:
         """
         Return a Dataframe of the components of the Timedeltas.
 
@@ -484,11 +484,7 @@ def components(self, index=None) -> "cudf.DataFrame":
             if self.nullable:
                 res_col = res_col.set_mask(self.mask)
             data[name] = res_col
-
-        return cudf.DataFrame(
-            data=data,
-            index=index,
-        )
+        return data
 
     @property
     def days(self) -> "cudf.core.column.NumericalColumn":
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 13fa187842d..df21d392311 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -2157,7 +2157,7 @@ def month_name(self, locale: str | None = None) -> Index:
         return Index._from_data({self.name: month_names})
 
     @_cudf_nvtx_annotate
-    def isocalendar(self):
+    def isocalendar(self) -> cudf.DataFrame:
         """
         Returns a DataFrame with the year, week, and day
         calculated according to the ISO 8601 standard.
@@ -2176,7 +2176,10 @@ def isocalendar(self):
         2020-05-31 08:00:00  2020    22    7
         1999-12-31 18:40:00  1999    52    5
         """
-        return cudf.core.tools.datetimes._to_iso_calendar(self)
+        ca = cudf.core.column_accessor.ColumnAccessor(
+            self._column.isocalendar(), verify=False
+        )
+        return cudf.DataFrame._from_data(ca, index=self)
 
     @_cudf_nvtx_annotate
     def to_pandas(
@@ -2546,7 +2549,10 @@ def components(self):
         Return a dataframe of the components (days, hours, minutes,
         seconds, milliseconds, microseconds, nanoseconds) of the Timedeltas.
         """
-        return self._values.components()
+        ca = cudf.core.column_accessor.ColumnAccessor(
+            self._column.components(), verify=False
+        )
+        return cudf.DataFrame._from_data(ca)
 
     @property
     def inferred_freq(self):
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index e532948fd11..c0716d7709a 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -778,9 +778,9 @@ def dt(self):
         ------
             TypeError if the Series does not contain datetimelike values.
         """
-        if isinstance(self._column, DatetimeColumn):
+        if self.dtype.kind == "M":
             return DatetimeProperties(self)
-        elif isinstance(self._column, TimeDeltaColumn):
+        elif self.dtype.kind == "m":
             return TimedeltaProperties(self)
         else:
             raise AttributeError(
@@ -3677,7 +3677,21 @@ def wrapper(self, other, level=None, fill_value=None, axis=0):
     setattr(Series, binop, make_binop_func(binop))
 
 
-class DatetimeProperties:
+class BaseDatelikeProperties:
+    """
+    Base accessor class for Series values.
+    """
+
+    def __init__(self, series: Series):
+        self.series = series
+
+    def _return_result_like_self(self, column: ColumnBase) -> Series:
+        """Return the method result like self.series"""
+        data = ColumnAccessor({self.series.name: column}, verify=False)
+        return self.series._from_data_like_self(data)
+
+
+class DatetimeProperties(BaseDatelikeProperties):
     """
     Accessor object for datetimelike properties of the Series values.
 
@@ -3727,12 +3741,9 @@ class DatetimeProperties:
     dtype: int16
     """
 
-    def __init__(self, series):
-        self.series = series
-
     @property  # type: ignore
     @_cudf_nvtx_annotate
-    def year(self):
+    def year(self) -> Series:
         """
         The year of the datetime.
 
@@ -3757,7 +3768,7 @@ def year(self):
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
-    def month(self):
+    def month(self) -> Series:
         """
         The month as January=1, December=12.
 
@@ -3782,7 +3793,7 @@ def month(self):
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
-    def day(self):
+    def day(self) -> Series:
         """
         The day of the datetime.
 
@@ -3807,7 +3818,7 @@ def day(self):
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
-    def hour(self):
+    def hour(self) -> Series:
         """
         The hours of the datetime.
 
@@ -3832,7 +3843,7 @@ def hour(self):
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
-    def minute(self):
+    def minute(self) -> Series:
         """
         The minutes of the datetime.
 
@@ -3857,7 +3868,7 @@ def minute(self):
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
-    def second(self):
+    def second(self) -> Series:
         """
         The seconds of the datetime.
 
@@ -3882,7 +3893,7 @@ def second(self):
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
-    def microsecond(self):
+    def microsecond(self) -> Series:
         """
         The microseconds of the datetime.
 
@@ -3903,22 +3914,18 @@ def microsecond(self):
         2    2
         dtype: int32
         """
-        return Series(
-            data=(
-                # Need to manually promote column to int32 because
-                # pandas-matching binop behaviour requires that this
-                # __mul__ returns an int16 column.
-                self.series._column.get_dt_field("millisecond").astype("int32")
-                * cudf.Scalar(1000, dtype="int32")
-            )
-            + self.series._column.get_dt_field("microsecond"),
-            index=self.series.index,
-            name=self.series.name,
-        )
+        micro = self.series._column.get_dt_field("microsecond")
+        # Need to manually promote column to int32 because
+        # pandas-matching binop behaviour requires that this
+        # __mul__ returns an int16 column.
+        extra = self.series._column.get_dt_field("millisecond").astype(
+            "int32"
+        ) * cudf.Scalar(1000, dtype="int32")
+        return self._return_result_like_self(micro + extra)
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
-    def nanosecond(self):
+    def nanosecond(self) -> Series:
         """
         The nanoseconds of the datetime.
 
@@ -3943,7 +3950,7 @@ def nanosecond(self):
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
-    def weekday(self):
+    def weekday(self) -> Series:
         """
         The day of the week with Monday=0, Sunday=6.
 
@@ -3980,7 +3987,7 @@ def weekday(self):
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
-    def dayofweek(self):
+    def dayofweek(self) -> Series:
         """
         The day of the week with Monday=0, Sunday=6.
 
@@ -4017,7 +4024,7 @@ def dayofweek(self):
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
-    def dayofyear(self):
+    def dayofyear(self) -> Series:
         """
         The day of the year, from 1-365 in non-leap years and
         from 1-366 in leap years.
@@ -4055,7 +4062,7 @@ def dayofyear(self):
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
-    def day_of_year(self):
+    def day_of_year(self) -> Series:
         """
         The day of the year, from 1-365 in non-leap years and
         from 1-366 in leap years.
@@ -4093,7 +4100,7 @@ def day_of_year(self):
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
-    def is_leap_year(self):
+    def is_leap_year(self) -> Series:
         """
         Boolean indicator if the date belongs to a leap year.
 
@@ -4144,15 +4151,11 @@ def is_leap_year(self):
         dtype: bool
         """
         res = libcudf.datetime.is_leap_year(self.series._column).fillna(False)
-        return Series._from_data(
-            ColumnAccessor({None: res}),
-            index=self.series.index,
-            name=self.series.name,
-        )
+        return self._return_result_like_self(res)
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
-    def quarter(self):
+    def quarter(self) -> Series:
         """
         Integer indicator for which quarter of the year the date belongs in.
 
@@ -4178,14 +4181,10 @@ def quarter(self):
         res = libcudf.datetime.extract_quarter(self.series._column).astype(
             np.int8
         )
-        return Series._from_data(
-            {None: res},
-            index=self.series.index,
-            name=self.series.name,
-        )
+        return self._return_result_like_self(res)
 
     @_cudf_nvtx_annotate
-    def day_name(self, locale=None):
+    def day_name(self, locale: str | None = None) -> Series:
         """
         Return the day names. Currently supports English locale only.
 
@@ -4216,11 +4215,8 @@ def day_name(self, locale=None):
         7     Saturday
         dtype: object
         """
-        day_names = self.series._column.get_day_names(locale)
-        return Series._from_data(
-            ColumnAccessor({None: day_names}),
-            index=self.series.index,
-            name=self.series.name,
+        return self._return_result_like_self(
+            self.series._column.get_day_names(locale)
         )
 
     @_cudf_nvtx_annotate
@@ -4249,15 +4245,12 @@ def month_name(self, locale: str | None = None) -> Series:
         5    February
         dtype: object
         """
-        month_names = self.series._column.get_month_names(locale)
-        return Series._from_data(
-            ColumnAccessor({None: month_names}),
-            index=self.series.index,
-            name=self.series.name,
+        return self._return_result_like_self(
+            self.series._column.get_month_names(locale)
         )
 
     @_cudf_nvtx_annotate
-    def isocalendar(self):
+    def isocalendar(self) -> cudf.DataFrame:
         """
         Returns a DataFrame with the year, week, and day
         calculated according to the ISO 8601 standard.
@@ -4298,11 +4291,14 @@ def isocalendar(self):
         1    <NA>
         Name: year, dtype: object
         """
-        return cudf.core.tools.datetimes._to_iso_calendar(self)
+        ca = ColumnAccessor(self.series._column.isocalendar(), verify=False)
+        return self.series._constructor_expanddim._from_data(
+            ca, index=self.series.index
+        )
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
-    def is_month_start(self):
+    def is_month_start(self) -> Series:
         """
         Booleans indicating if dates are the first day of the month.
         """
@@ -4310,7 +4306,7 @@ def is_month_start(self):
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
-    def days_in_month(self):
+    def days_in_month(self) -> Series:
         """
         Get the total number of days in the month that the date falls on.
 
@@ -4353,16 +4349,13 @@ def days_in_month(self):
         11    31
         dtype: int16
         """
-        res = libcudf.datetime.days_in_month(self.series._column)
-        return Series._from_data(
-            ColumnAccessor({None: res}),
-            index=self.series.index,
-            name=self.series.name,
+        return self._return_result_like_self(
+            libcudf.datetime.days_in_month(self.series._column)
         )
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
-    def is_month_end(self):
+    def is_month_end(self) -> Series:
         """
         Boolean indicator if the date is the last day of the month.
 
@@ -4399,17 +4392,13 @@ def is_month_end(self):
         8    False
         dtype: bool
         """  # noqa: E501
-        last_day = libcudf.datetime.last_day_of_month(self.series._column)
-        last_day = Series._from_data(
-            ColumnAccessor({None: last_day}),
-            index=self.series.index,
-            name=self.series.name,
-        )
+        last_day_col = libcudf.datetime.last_day_of_month(self.series._column)
+        last_day = self._return_result_like_self(last_day_col)
         return (self.day == last_day.dt.day).fillna(False)
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
-    def is_quarter_start(self):
+    def is_quarter_start(self) -> Series:
         """
         Boolean indicator if the date is the first day of a quarter.
 
@@ -4450,15 +4439,11 @@ def is_quarter_start(self):
         )
 
         result = ((day == cudf.Scalar(1)) & first_month).fillna(False)
-        return Series._from_data(
-            {None: result},
-            index=self.series.index,
-            name=self.series.name,
-        )
+        return self._return_result_like_self(result)
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
-    def is_quarter_end(self):
+    def is_quarter_end(self) -> Series:
         """
         Boolean indicator if the date is the last day of a quarter.
 
@@ -4501,15 +4486,11 @@ def is_quarter_end(self):
         )
 
         result = ((day == last_day) & last_month).fillna(False)
-        return Series._from_data(
-            {None: result},
-            index=self.series.index,
-            name=self.series.name,
-        )
+        return self._return_result_like_self(result)
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
-    def is_year_start(self):
+    def is_year_start(self) -> Series:
         """
         Boolean indicator if the date is the first day of the year.
 
@@ -4536,15 +4517,11 @@ def is_year_start(self):
         outcol = self.series._column.get_dt_field(
             "day_of_year"
         ) == cudf.Scalar(1)
-        return Series._from_data(
-            {None: outcol.fillna(False)},
-            index=self.series.index,
-            name=self.series.name,
-        )
+        return self._return_result_like_self(outcol.fillna(False))
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
-    def is_year_end(self):
+    def is_year_end(self) -> Series:
         """
         Boolean indicator if the date is the last day of the year.
 
@@ -4574,22 +4551,16 @@ def is_year_end(self):
         leap = day_of_year == cudf.Scalar(366)
         non_leap = day_of_year == cudf.Scalar(365)
         result = cudf._lib.copying.copy_if_else(leap, non_leap, leap_dates)
-        result = result.fillna(False)
-        return Series._from_data(
-            {None: result},
-            index=self.series.index,
-            name=self.series.name,
-        )
+        return self._return_result_like_self(result.fillna(False))
 
     @_cudf_nvtx_annotate
-    def _get_dt_field(self, field):
-        out_column = self.series._column.get_dt_field(field)
-        return Series(
-            data=out_column, index=self.series.index, name=self.series.name
+    def _get_dt_field(self, field: str) -> Series:
+        return self._return_result_like_self(
+            self.series._column.get_dt_field(field)
         )
 
     @_cudf_nvtx_annotate
-    def ceil(self, freq):
+    def ceil(self, freq: str) -> Series:
         """
         Perform ceil operation on the data to the specified freq.
 
@@ -4619,14 +4590,10 @@ def ceil(self, freq):
         2   2001-01-01 00:06:00
         dtype: datetime64[ns]
         """
-        out_column = self.series._column.ceil(freq)
-
-        return Series._from_data(
-            data={self.series.name: out_column}, index=self.series.index
-        )
+        return self._return_result_like_self(self.series._column.ceil(freq))
 
     @_cudf_nvtx_annotate
-    def floor(self, freq):
+    def floor(self, freq: str) -> Series:
         """
         Perform floor operation on the data to the specified freq.
 
@@ -4656,14 +4623,10 @@ def floor(self, freq):
         2   2001-01-01 00:05:00
         dtype: datetime64[ns]
         """
-        out_column = self.series._column.floor(freq)
-
-        return Series._from_data(
-            data={self.series.name: out_column}, index=self.series.index
-        )
+        return self._return_result_like_self(self.series._column.floor(freq))
 
     @_cudf_nvtx_annotate
-    def round(self, freq):
+    def round(self, freq: str) -> Series:
         """
         Perform round operation on the data to the specified freq.
 
@@ -4696,14 +4659,10 @@ def round(self, freq):
         2   2001-01-01 00:05:00
         dtype: datetime64[ns]
         """
-        out_column = self.series._column.round(freq)
-
-        return Series._from_data(
-            data={self.series.name: out_column}, index=self.series.index
-        )
+        return self._return_result_like_self(self.series._column.round(freq))
 
     @_cudf_nvtx_annotate
-    def strftime(self, date_format, *args, **kwargs):
+    def strftime(self, date_format: str, *args, **kwargs) -> Series:
         """
         Convert to Series using specified ``date_format``.
 
@@ -4777,11 +4736,10 @@ def strftime(self, date_format, *args, **kwargs):
                     f"https://github.com/rapidsai/cudf/issues/5991 "
                     f"for tracking purposes."
                 )
-        str_col = self.series._column.as_string_column(
-            dtype="str", format=date_format
-        )
-        return Series(
-            data=str_col, index=self.series.index, name=self.series.name
+        return self._return_result_like_self(
+            self.series._column.as_string_column(
+                dtype="str", format=date_format
+            )
         )
 
     @copy_docstring(DatetimeIndex.tz_localize)
@@ -4790,17 +4748,13 @@ def tz_localize(
         tz: str | None,
         ambiguous: Literal["NaT"] = "NaT",
         nonexistent: Literal["NaT"] = "NaT",
-    ):
-        result_col = self.series._column.tz_localize(
-            tz, ambiguous, nonexistent
-        )
-        return Series._from_data(
-            data={self.series.name: result_col},
-            index=self.series.index,
+    ) -> Series:
+        return self._return_result_like_self(
+            self.series._column.tz_localize(tz, ambiguous, nonexistent)
         )
 
     @copy_docstring(DatetimeIndex.tz_convert)
-    def tz_convert(self, tz: str | None):
+    def tz_convert(self, tz: str | None) -> Series:
         """
         Parameters
         ----------
@@ -4810,13 +4764,12 @@ def tz_convert(self, tz: str | None):
             A `tz` of None will convert to UTC and remove the
             timezone information.
         """
-        result_col = self.series._column.tz_convert(tz)
-        return Series._from_data(
-            {self.series.name: result_col}, index=self.series.index
+        return self._return_result_like_self(
+            self.series._column.tz_convert(tz)
         )
 
 
-class TimedeltaProperties:
+class TimedeltaProperties(BaseDatelikeProperties):
     """
     Accessor object for timedelta-like properties of the Series values.
 
@@ -4884,12 +4837,9 @@ class TimedeltaProperties:
     dtype: int64
     """
 
-    def __init__(self, series):
-        self.series = series
-
     @property  # type: ignore
     @_cudf_nvtx_annotate
-    def days(self):
+    def days(self) -> Series:
         """
         Number of days.
 
@@ -4921,7 +4871,7 @@ def days(self):
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
-    def seconds(self):
+    def seconds(self) -> Series:
         """
         Number of seconds (>= 0 and less than 1 day).
 
@@ -4960,7 +4910,7 @@ def seconds(self):
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
-    def microseconds(self):
+    def microseconds(self) -> Series:
         """
         Number of microseconds (>= 0 and less than 1 second).
 
@@ -4992,7 +4942,7 @@ def microseconds(self):
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
-    def nanoseconds(self):
+    def nanoseconds(self) -> Series:
         """
         Return the number of nanoseconds (n), where 0 <= n < 1 microsecond.
 
@@ -5024,7 +4974,7 @@ def nanoseconds(self):
 
     @property  # type: ignore
     @_cudf_nvtx_annotate
-    def components(self):
+    def components(self) -> cudf.DataFrame:
         """
         Return a Dataframe of the components of the Timedeltas.
 
@@ -5050,13 +5000,15 @@ def components(self):
         3      0      0       35       35           656             0            0
         4     37     13       12       14           234             0            0
         """  # noqa: E501
-        return self.series._column.components(index=self.series.index)
+        ca = ColumnAccessor(self.series._column.components(), verify=False)
+        return self.series._constructor_expanddim._from_data(
+            ca, index=self.series.index
+        )
 
     @_cudf_nvtx_annotate
-    def _get_td_field(self, field):
-        out_column = getattr(self.series._column, field)
-        return Series(
-            data=out_column, index=self.series.index, name=self.series.name
+    def _get_td_field(self, field: str) -> Series:
+        return self._return_result_like_self(
+            getattr(self.series._column, field)
         )
 
 
diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py
index 29130130732..397bfe1d472 100644
--- a/python/cudf/cudf/core/tools/datetimes.py
+++ b/python/cudf/cudf/core/tools/datetimes.py
@@ -1048,22 +1048,3 @@ def _offset_to_nanoseconds_lower_bound(offset: DateOffset) -> int:
         + kwds.get("microseconds", 0) * 10**3
         + kwds.get("nanoseconds", 0)
     )
-
-
-def _to_iso_calendar(arg):
-    formats = ["%G", "%V", "%u"]
-    if not isinstance(arg, (cudf.Index, cudf.core.series.DatetimeProperties)):
-        raise AttributeError(
-            "Can only use .isocalendar accessor with series or index"
-        )
-    if isinstance(arg, cudf.Index):
-        iso_params = [
-            arg._column.as_string_column(arg.dtype, fmt) for fmt in formats
-        ]
-        index = arg._column
-    elif isinstance(arg.series, cudf.Series):
-        iso_params = [arg.strftime(fmt) for fmt in formats]
-        index = arg.series.index
-
-    data = dict(zip(["year", "week", "day"], iso_params))
-    return cudf.DataFrame(data, index=index, dtype=np.int32)

From 282a5d93da6632a726a5d8c809373b6d612e72bc Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Mon, 17 Jun 2024 20:34:40 +0100
Subject: [PATCH 376/842] Fix implemention of any, all, and isbetween (#15993)

Add tests covering BooleanFunction implementations.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - https://github.com/brandon-b-miller

URL: https://github.com/rapidsai/cudf/pull/15993
---
 python/cudf_polars/cudf_polars/dsl/expr.py    |  30 +++-
 .../tests/expressions/test_booleanfunction.py | 129 ++++++++++++++++++
 2 files changed, 152 insertions(+), 7 deletions(-)
 create mode 100644 python/cudf_polars/tests/expressions/test_booleanfunction.py

diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py
index 0605bba6642..c92e0714d54 100644
--- a/python/cudf_polars/cudf_polars/dsl/expr.py
+++ b/python/cudf_polars/cudf_polars/dsl/expr.py
@@ -512,13 +512,17 @@ def do_evaluate(
         ]
         if self.name == pl_expr.BooleanFunction.Any:
             (column,) = columns
-            return plc.Column.from_scalar(
-                plc.reduce.reduce(column.obj, plc.aggregation.any(), self.dtype), 1
+            return Column(
+                plc.Column.from_scalar(
+                    plc.reduce.reduce(column.obj, plc.aggregation.any(), self.dtype), 1
+                )
             )
         elif self.name == pl_expr.BooleanFunction.All:
             (column,) = columns
-            return plc.Column.from_scalar(
-                plc.reduce.reduce(column.obj, plc.aggregation.all(), self.dtype), 1
+            return Column(
+                plc.Column.from_scalar(
+                    plc.reduce.reduce(column.obj, plc.aggregation.all(), self.dtype), 1
+                )
             )
         if self.name == pl_expr.BooleanFunction.IsNull:
             (column,) = columns
@@ -612,20 +616,32 @@ def do_evaluate(
             column, lo, hi = columns
             (closed,) = self.options
             lop, rop = self._BETWEEN_OPS[closed]
+            lo_obj = (
+                lo.obj_scalar
+                if lo.is_scalar and lo.obj.size() != column.obj.size()
+                else lo.obj
+            )
+            hi_obj = (
+                hi.obj_scalar
+                if hi.is_scalar and hi.obj.size() != column.obj.size()
+                else hi.obj
+            )
             return Column(
                 plc.binaryop.binary_operation(
                     plc.binaryop.binary_operation(
-                        column.obj, lo.obj, lop, output_type=self.dtype
+                        column.obj, lo_obj, lop, output_type=self.dtype
                     ),
                     plc.binaryop.binary_operation(
-                        column.obj, hi.obj, rop, output_type=self.dtype
+                        column.obj, hi_obj, rop, output_type=self.dtype
                     ),
                     plc.binaryop.BinaryOperator.LOGICAL_AND,
                     self.dtype,
                 )
             )
         else:
-            raise NotImplementedError(f"BooleanFunction {self.name}")
+            raise NotImplementedError(
+                f"BooleanFunction {self.name}"
+            )  # pragma: no cover; handled by init raising
 
 
 class StringFunction(Expr):
diff --git a/python/cudf_polars/tests/expressions/test_booleanfunction.py b/python/cudf_polars/tests/expressions/test_booleanfunction.py
new file mode 100644
index 00000000000..951b749e670
--- /dev/null
+++ b/python/cudf_polars/tests/expressions/test_booleanfunction.py
@@ -0,0 +1,129 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import pytest
+
+import polars as pl
+
+from cudf_polars.testing.asserts import assert_gpu_result_equal
+
+
+@pytest.fixture(params=[False, True], ids=["no_nulls", "nulls"])
+def has_nulls(request):
+    return request.param
+
+
+@pytest.mark.parametrize(
+    "ignore_nulls",
+    [
+        pytest.param(
+            False, marks=pytest.mark.xfail(reason="No support for Kleene logic")
+        ),
+        True,
+    ],
+)
+def test_booleanfunction_reduction(ignore_nulls):
+    ldf = pl.LazyFrame(
+        {
+            "a": [1, 2, 3.0, 2, 5],
+            "b": [0, 3, 1, -1, None],
+            "c": [1, 6, 5, 3, 2],
+        }
+    )
+
+    query = ldf.select(
+        (pl.col("a") > 3).any(ignore_nulls=ignore_nulls),
+        (pl.col("b") > 2).all(ignore_nulls=ignore_nulls),
+    )
+
+    assert_gpu_result_equal(query)
+
+
+@pytest.mark.parametrize(
+    "expr",
+    [
+        pl.Expr.is_null,
+        pl.Expr.is_not_null,
+        pl.Expr.is_nan,
+        pl.Expr.is_not_nan,
+    ],
+    ids=lambda f: f"{f.__name__}()",
+)
+@pytest.mark.parametrize("has_nans", [False, True], ids=["no_nans", "nans"])
+def test_boolean_function_unary(request, expr, has_nans, has_nulls):
+    if has_nulls and expr in (pl.Expr.is_nan, pl.Expr.is_not_nan):
+        request.applymarker(
+            pytest.mark.xfail(
+                reason="Need to copy null mask since is_{not_}nan(null) => null"
+            )
+        )
+
+    values: list[float | None] = [1, 2, 3, 4, 5]
+    if has_nans:
+        values[3] = float("nan")
+    if has_nulls:
+        values[0] = None
+
+    df = pl.LazyFrame({"a": pl.Series(values, dtype=pl.Float32())})
+
+    q = df.select(expr(pl.col("a")))
+
+    assert_gpu_result_equal(q)
+
+
+@pytest.mark.xfail(reason="Evaluation handlers not yet implemented")
+@pytest.mark.parametrize(
+    "expr",
+    [
+        pl.col("a").is_finite(),
+        pl.col("a").is_infinite(),
+        pl.col("a").is_in(pl.col("b")),
+    ],
+)
+def test_unsupported_boolean_function(expr):
+    df = pl.LazyFrame({"a": [1, float("nan"), 2, 4], "b": [1, 2, 3, 4]})
+
+    q = df.select(expr)
+
+    assert_gpu_result_equal(q)
+
+
+@pytest.mark.parametrize("closed", ["both", "left", "right", "none"])
+@pytest.mark.parametrize(
+    "bounds", [(1, 2), (-1, 10), (11, 10), (pl.col("lo"), pl.col("hi"))]
+)
+def test_boolean_isbetween(closed, bounds):
+    df = pl.LazyFrame(
+        {"a": [1, float("nan"), 2, 4], "lo": [1, 2, 2, 3], "hi": [10, 4, 2, 4]}
+    )
+
+    q = df.select(pl.col("a").is_between(*bounds, closed=closed))
+
+    assert_gpu_result_equal(q)
+
+
+@pytest.mark.parametrize(
+    "expr", [pl.any_horizontal("*"), pl.all_horizontal("*")], ids=["any", "all"]
+)
+@pytest.mark.parametrize("wide", [False, True], ids=["narrow", "wide"])
+def test_boolean_horizontal(request, expr, has_nulls, wide):
+    if has_nulls:
+        request.applymarker(pytest.mark.xfail(reason="No support for Kleene logic"))
+    ldf = pl.LazyFrame(
+        {
+            "a": [False, False, False, False, False, True],
+            "b": [False, False, True, None, None, True],
+            "c": [False, True, True, False, True, True],
+        }
+    )
+    if not has_nulls:
+        ldf = ldf.select(pl.col("a"), pl.col("c"))
+
+    # To see the All/Any Horizontal nodes, we need a dataframe with
+    # more than 128 columns
+    if wide:
+        ldf = ldf.with_columns(pl.col("c").alias(f"col{i}") for i in range(128))
+    q = ldf.select(expr)
+
+    assert_gpu_result_equal(q)

From bcdfe914ebff93144bd890e4814688e298d1813f Mon Sep 17 00:00:00 2001
From: Srinivas Yadav <43375352+srinivasyadav18@users.noreply.github.com>
Date: Mon, 17 Jun 2024 14:25:52 -0700
Subject: [PATCH 377/842] Expose stream parameter to public rolling APIs
 (#15865)

Add stream parameter to public rolling APIs.

- `rolling()`

- `grouped_rolling_window()`
- `grouped_time_range_rolling_window()`
- `grouped_range_rolling_window()`

Authors:
  - Srinivas Yadav (https://github.com/srinivasyadav18)

Approvers:
  - MithunR (https://github.com/mythrocks)
  - Muhammad Haseeb (https://github.com/mhaseeb123)

URL: https://github.com/rapidsai/cudf/pull/15865
---
 cpp/include/cudf/rolling.hpp                  |  20 ++
 .../cudf/rolling/range_window_bounds.hpp      |  16 +-
 cpp/src/rolling/grouped_rolling.cu            | 158 ++++++-----
 cpp/src/rolling/range_window_bounds.cpp       |  39 +--
 cpp/src/rolling/rolling.cu                    |  17 +-
 cpp/tests/CMakeLists.txt                      |   3 +-
 cpp/tests/streams/rolling_test.cpp            | 246 ++++++++++++++++++
 7 files changed, 397 insertions(+), 102 deletions(-)
 create mode 100644 cpp/tests/streams/rolling_test.cpp

diff --git a/cpp/include/cudf/rolling.hpp b/cpp/include/cudf/rolling.hpp
index 2cd34f48265..d55322dd3e8 100644
--- a/cpp/include/cudf/rolling.hpp
+++ b/cpp/include/cudf/rolling.hpp
@@ -57,6 +57,7 @@ namespace cudf {
  * @param[in] min_periods Minimum number of observations in window required to have a value,
  *                        otherwise element `i` is null.
  * @param[in] agg The rolling window aggregation type (SUM, MAX, MIN, etc.)
+ * @param[in] stream CUDA stream used for device memory operations and kernel launches
  * @param[in] mr Device memory resource used to allocate the returned column's device memory
  *
  * @returns   A nullable output column containing the rolling window results
@@ -67,6 +68,7 @@ std::unique_ptr<column> rolling_window(
   size_type following_window,
   size_type min_periods,
   rolling_aggregation const& agg,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -77,6 +79,7 @@ std::unique_ptr<column> rolling_window(
  *            size_type following_window,
  *            size_type min_periods,
  *            rolling_aggregation const& agg,
+ *            rmm::cuda_stream_view stream,
  *            rmm::device_async_resource_ref mr)
  *
  * @param default_outputs A column of per-row default values to be returned instead
@@ -90,6 +93,7 @@ std::unique_ptr<column> rolling_window(
   size_type following_window,
   size_type min_periods,
   rolling_aggregation const& agg,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -227,6 +231,7 @@ struct window_bounds {
  * @param[in] min_periods Minimum number of observations in window required to have a value,
  *                        otherwise element `i` is null.
  * @param[in] aggr The rolling window aggregation type (SUM, MAX, MIN, etc.)
+ * @param[in] stream CUDA stream used for device memory operations and kernel launches
  * @param[in] mr Device memory resource used to allocate the returned column's device memory
  *
  * @returns   A nullable output column containing the rolling window results
@@ -238,6 +243,7 @@ std::unique_ptr<column> grouped_rolling_window(
   size_type following_window,
   size_type min_periods,
   rolling_aggregation const& aggr,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -249,6 +255,7 @@ std::unique_ptr<column> grouped_rolling_window(
  *            size_type following_window,
  *            size_type min_periods,
  *            rolling_aggregation const& aggr,
+ *            rmm::cuda_stream_view stream,
  *            rmm::device_async_resource_ref mr)
  */
 std::unique_ptr<column> grouped_rolling_window(
@@ -258,6 +265,7 @@ std::unique_ptr<column> grouped_rolling_window(
   window_bounds following_window,
   size_type min_periods,
   rolling_aggregation const& aggr,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -269,6 +277,7 @@ std::unique_ptr<column> grouped_rolling_window(
  *            size_type following_window,
  *            size_type min_periods,
  *            rolling_aggregation const& aggr,
+ *            rmm::cuda_stream_view stream,,
  *            rmm::device_async_resource_ref mr)
  *
  * @param default_outputs A column of per-row default values to be returned instead
@@ -283,6 +292,7 @@ std::unique_ptr<column> grouped_rolling_window(
   size_type following_window,
   size_type min_periods,
   rolling_aggregation const& aggr,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -295,6 +305,7 @@ std::unique_ptr<column> grouped_rolling_window(
  *            size_type following_window,
  *            size_type min_periods,
  *            rolling_aggregation const& aggr,
+ *            rmm::cuda_stream_view stream,
  *            rmm::device_async_resource_ref mr)
  */
 std::unique_ptr<column> grouped_rolling_window(
@@ -305,6 +316,7 @@ std::unique_ptr<column> grouped_rolling_window(
   window_bounds following_window,
   size_type min_periods,
   rolling_aggregation const& aggr,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -387,6 +399,7 @@ std::unique_ptr<column> grouped_rolling_window(
  * @param[in] min_periods Minimum number of observations in window required to have a value,
  *                        otherwise element `i` is null.
  * @param[in] aggr The rolling window aggregation type (SUM, MAX, MIN, etc.)
+ * @param[in] stream CUDA stream used for device memory operations and kernel launches
  * @param[in] mr Device memory resource used to allocate the returned column's device memory
  *
  * @returns   A nullable output column containing the rolling window results
@@ -400,6 +413,7 @@ std::unique_ptr<column> grouped_time_range_rolling_window(
   size_type following_window_in_days,
   size_type min_periods,
   rolling_aggregation const& aggr,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -415,6 +429,7 @@ std::unique_ptr<column> grouped_time_range_rolling_window(
  *                size_type following_window_in_days,
  *                size_type min_periods,
  *                rolling_aggregation const& aggr,
+ *                rmm::cuda_stream_view stream,
  *                rmm::device_async_resource_ref mr)
  *
  * The `preceding_window_in_days` and `following_window_in_days` are specified as a `window_bounds`
@@ -429,6 +444,7 @@ std::unique_ptr<column> grouped_time_range_rolling_window(
   window_bounds following_window_in_days,
   size_type min_periods,
   rolling_aggregation const& aggr,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -536,6 +552,7 @@ std::unique_ptr<column> grouped_time_range_rolling_window(
  * @param[in] min_periods Minimum number of observations in window required to have a value,
  *                        otherwise element `i` is null.
  * @param[in] aggr The rolling window aggregation type (SUM, MAX, MIN, etc.)
+ * @param[in] stream CUDA stream used for device memory operations and kernel launches
  * @param[in] mr Device memory resource used to allocate the returned column's device memory
  *
  * @returns   A nullable output column containing the rolling window results
@@ -549,6 +566,7 @@ std::unique_ptr<column> grouped_range_rolling_window(
   range_window_bounds const& following,
   size_type min_periods,
   rolling_aggregation const& aggr,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -582,6 +600,7 @@ std::unique_ptr<column> grouped_range_rolling_window(
  * @param[in] min_periods Minimum number of observations in window required to have a value,
  *                        otherwise element `i` is null.
  * @param[in] agg The rolling window aggregation type (sum, max, min, etc.)
+ * @param[in] stream CUDA stream used for device memory operations and kernel launches
  * @param[in] mr Device memory resource used to allocate the returned column's device memory
  *
  * @returns   A nullable output column containing the rolling window results
@@ -592,6 +611,7 @@ std::unique_ptr<column> rolling_window(
   column_view const& following_window,
   size_type min_periods,
   rolling_aggregation const& agg,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
diff --git a/cpp/include/cudf/rolling/range_window_bounds.hpp b/cpp/include/cudf/rolling/range_window_bounds.hpp
index 81885ade2f0..a9ee12cea27 100644
--- a/cpp/include/cudf/rolling/range_window_bounds.hpp
+++ b/cpp/include/cudf/rolling/range_window_bounds.hpp
@@ -56,18 +56,22 @@ struct range_window_bounds {
    * @brief Factory method to construct a bounded window boundary.
    *
    * @param boundary Finite window boundary
+   * @param stream CUDA stream used for device memory operations and kernel launches
    * @return A bounded window boundary object
    */
-  static range_window_bounds get(scalar const& boundary);
+  static range_window_bounds get(scalar const& boundary,
+                                 rmm::cuda_stream_view stream = cudf::get_default_stream());
 
   /**
    * @brief Factory method to construct a window boundary
    *  limited to the value of the current row
    *
    * @param type The datatype of the window boundary
+   * @param stream CUDA stream used for device memory operations and kernel launches
    * @return  A "current row" window boundary object
    */
-  static range_window_bounds current_row(data_type type);
+  static range_window_bounds current_row(data_type type,
+                                         rmm::cuda_stream_view stream = cudf::get_default_stream());
 
   /**
    * @brief Whether or not the window is bounded to the current row
@@ -81,9 +85,11 @@ struct range_window_bounds {
    * @brief Factory method to construct an unbounded window boundary.
    *
    * @param type The datatype of the window boundary
+   * @param stream CUDA stream used for device memory operations and kernel launches
    * @return  An unbounded window boundary object
    */
-  static range_window_bounds unbounded(data_type type);
+  static range_window_bounds unbounded(data_type type,
+                                       rmm::cuda_stream_view stream = cudf::get_default_stream());
 
   /**
    * @brief Whether or not the window is unbounded
@@ -107,7 +113,9 @@ struct range_window_bounds {
   extent_type _extent{extent_type::UNBOUNDED};
   std::shared_ptr<scalar> _range_scalar{nullptr};  // To enable copy construction/assignment.
 
-  range_window_bounds(extent_type extent_, std::unique_ptr<scalar> range_scalar_);
+  range_window_bounds(extent_type extent_,
+                      std::unique_ptr<scalar> range_scalar_,
+                      rmm::cuda_stream_view = cudf::get_default_stream());
 };
 
 /** @} */  // end of group
diff --git a/cpp/src/rolling/grouped_rolling.cu b/cpp/src/rolling/grouped_rolling.cu
index d461ed7a109..1158bf22494 100644
--- a/cpp/src/rolling/grouped_rolling.cu
+++ b/cpp/src/rolling/grouped_rolling.cu
@@ -40,59 +40,6 @@
 #include <thrust/partition.h>
 
 namespace cudf {
-std::unique_ptr<column> grouped_rolling_window(table_view const& group_keys,
-                                               column_view const& input,
-                                               size_type preceding_window,
-                                               size_type following_window,
-                                               size_type min_periods,
-                                               rolling_aggregation const& aggr,
-                                               rmm::device_async_resource_ref mr)
-{
-  return grouped_rolling_window(group_keys,
-                                input,
-                                window_bounds::get(preceding_window),
-                                window_bounds::get(following_window),
-                                min_periods,
-                                aggr,
-                                mr);
-}
-
-std::unique_ptr<column> grouped_rolling_window(table_view const& group_keys,
-                                               column_view const& input,
-                                               window_bounds preceding_window,
-                                               window_bounds following_window,
-                                               size_type min_periods,
-                                               rolling_aggregation const& aggr,
-                                               rmm::device_async_resource_ref mr)
-{
-  return grouped_rolling_window(group_keys,
-                                input,
-                                empty_like(input)->view(),
-                                preceding_window,
-                                following_window,
-                                min_periods,
-                                aggr,
-                                mr);
-}
-
-std::unique_ptr<column> grouped_rolling_window(table_view const& group_keys,
-                                               column_view const& input,
-                                               column_view const& default_outputs,
-                                               size_type preceding_window,
-                                               size_type following_window,
-                                               size_type min_periods,
-                                               rolling_aggregation const& aggr,
-                                               rmm::device_async_resource_ref mr)
-{
-  return grouped_rolling_window(group_keys,
-                                input,
-                                default_outputs,
-                                window_bounds::get(preceding_window),
-                                window_bounds::get(following_window),
-                                min_periods,
-                                aggr,
-                                mr);
-}
 
 namespace detail {
 
@@ -237,8 +184,8 @@ std::unique_ptr<column> grouped_rolling_window(table_view const& group_keys,
 
   if (group_keys.num_columns() == 0) {
     // No Groupby columns specified. Treat as one big group.
-    return rolling_window(
-      input, default_outputs, preceding_window, following_window, min_periods, aggr, mr);
+    return detail::rolling_window(
+      input, default_outputs, preceding_window, following_window, min_periods, aggr, stream, mr);
   }
 
   using sort_groupby_helper = cudf::groupby::detail::sort::sort_groupby_helper;
@@ -306,6 +253,7 @@ std::unique_ptr<column> grouped_rolling_window(table_view const& group_keys,
                                                window_bounds following_window_bounds,
                                                size_type min_periods,
                                                rolling_aggregation const& aggr,
+                                               rmm::cuda_stream_view stream,
                                                rmm::device_async_resource_ref mr)
 {
   return detail::grouped_rolling_window(group_keys,
@@ -315,7 +263,67 @@ std::unique_ptr<column> grouped_rolling_window(table_view const& group_keys,
                                         following_window_bounds,
                                         min_periods,
                                         aggr,
-                                        cudf::get_default_stream(),
+                                        stream,
+                                        mr);
+}
+
+std::unique_ptr<column> grouped_rolling_window(table_view const& group_keys,
+                                               column_view const& input,
+                                               size_type preceding_window,
+                                               size_type following_window,
+                                               size_type min_periods,
+                                               rolling_aggregation const& aggr,
+                                               rmm::cuda_stream_view stream,
+                                               rmm::device_async_resource_ref mr)
+{
+  return grouped_rolling_window(group_keys,
+                                input,
+                                window_bounds::get(preceding_window),
+                                window_bounds::get(following_window),
+                                min_periods,
+                                aggr,
+                                stream,
+                                mr);
+}
+
+std::unique_ptr<column> grouped_rolling_window(table_view const& group_keys,
+                                               column_view const& input,
+                                               window_bounds preceding_window,
+                                               window_bounds following_window,
+                                               size_type min_periods,
+                                               rolling_aggregation const& aggr,
+                                               rmm::cuda_stream_view stream,
+                                               rmm::device_async_resource_ref mr)
+{
+  return detail::grouped_rolling_window(group_keys,
+                                        input,
+                                        empty_like(input)->view(),
+                                        preceding_window,
+                                        following_window,
+                                        min_periods,
+                                        aggr,
+                                        stream,
+                                        mr);
+}
+
+std::unique_ptr<column> grouped_rolling_window(table_view const& group_keys,
+                                               column_view const& input,
+                                               column_view const& default_outputs,
+                                               size_type preceding_window,
+                                               size_type following_window,
+                                               size_type min_periods,
+                                               rolling_aggregation const& aggr,
+                                               rmm::cuda_stream_view stream,
+                                               rmm::device_async_resource_ref mr)
+{
+  return detail::grouped_rolling_window(group_keys,
+                                        input,
+                                        default_outputs,
+                                        window_bounds::get(preceding_window),
+                                        window_bounds::get(following_window),
+                                        min_periods,
+                                        aggr,
+                                        stream,
                                         mr);
 }
 
@@ -1047,14 +1055,15 @@ struct dispatch_grouped_range_rolling_window {
  */
 struct to_duration_bounds {
   template <typename OrderBy, std::enable_if_t<cudf::is_timestamp<OrderBy>(), void>* = nullptr>
-  range_window_bounds operator()(size_type num_days) const
+  range_window_bounds operator()(size_type num_days, rmm::cuda_stream_view stream) const
   {
     using DurationT = typename OrderBy::duration;
-    return range_window_bounds::get(duration_scalar<DurationT>{duration_D{num_days}, true});
+    return range_window_bounds::get(duration_scalar<DurationT>{duration_D{num_days}, true, stream},
+                                    stream);
   }
 
   template <typename OrderBy, std::enable_if_t<!cudf::is_timestamp<OrderBy>(), void>* = nullptr>
-  range_window_bounds operator()(size_type) const
+  range_window_bounds operator()(size_type, rmm::cuda_stream_view) const
   {
     CUDF_FAIL("Expected timestamp orderby column.");
   }
@@ -1085,9 +1094,11 @@ data_type get_duration_type_for(cudf::data_type timestamp_type)
  * @param timestamp_type Data-type of the orderby column to which the `num_days` is to be adapted.
  * @return range_window_bounds A `range_window_bounds` to be used with the new API.
  */
-range_window_bounds to_range_bounds(cudf::size_type num_days, cudf::data_type timestamp_type)
+range_window_bounds to_range_bounds(cudf::size_type num_days,
+                                    cudf::data_type timestamp_type,
+                                    rmm::cuda_stream_view stream)
 {
-  return cudf::type_dispatcher(timestamp_type, to_duration_bounds{}, num_days);
+  return cudf::type_dispatcher(timestamp_type, to_duration_bounds{}, num_days, stream);
 }
 
 /**
@@ -1101,11 +1112,13 @@ range_window_bounds to_range_bounds(cudf::size_type num_days, cudf::data_type ti
  * @return range_window_bounds A `range_window_bounds` to be used with the new API.
  */
 range_window_bounds to_range_bounds(cudf::window_bounds const& days_bounds,
-                                    cudf::data_type timestamp_type)
+                                    cudf::data_type timestamp_type,
+                                    rmm::cuda_stream_view stream)
 {
   return days_bounds.is_unbounded()
-           ? range_window_bounds::unbounded(get_duration_type_for(timestamp_type))
-           : cudf::type_dispatcher(timestamp_type, to_duration_bounds{}, days_bounds.value());
+           ? range_window_bounds::unbounded(get_duration_type_for(timestamp_type), stream)
+           : cudf::type_dispatcher(
+               timestamp_type, to_duration_bounds{}, days_bounds.value(), stream);
 }
 
 }  // namespace
@@ -1199,11 +1212,12 @@ std::unique_ptr<column> grouped_time_range_rolling_window(table_view const& grou
                                                           size_type following_window_in_days,
                                                           size_type min_periods,
                                                           rolling_aggregation const& aggr,
+                                                          rmm::cuda_stream_view stream,
                                                           rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  auto preceding = to_range_bounds(preceding_window_in_days, timestamp_column.type());
-  auto following = to_range_bounds(following_window_in_days, timestamp_column.type());
+  auto preceding = to_range_bounds(preceding_window_in_days, timestamp_column.type(), stream);
+  auto following = to_range_bounds(following_window_in_days, timestamp_column.type(), stream);
 
   return detail::grouped_range_rolling_window(group_keys,
                                               timestamp_column,
@@ -1213,7 +1227,7 @@ std::unique_ptr<column> grouped_time_range_rolling_window(table_view const& grou
                                               following,
                                               min_periods,
                                               aggr,
-                                              cudf::get_default_stream(),
+                                              stream,
                                               mr);
 }
 
@@ -1237,13 +1251,14 @@ std::unique_ptr<column> grouped_time_range_rolling_window(table_view const& grou
                                                           window_bounds following_window_in_days,
                                                           size_type min_periods,
                                                           rolling_aggregation const& aggr,
+                                                          rmm::cuda_stream_view stream,
                                                           rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   range_window_bounds preceding =
-    to_range_bounds(preceding_window_in_days, timestamp_column.type());
+    to_range_bounds(preceding_window_in_days, timestamp_column.type(), stream);
   range_window_bounds following =
-    to_range_bounds(following_window_in_days, timestamp_column.type());
+    to_range_bounds(following_window_in_days, timestamp_column.type(), stream);
 
   return detail::grouped_range_rolling_window(group_keys,
                                               timestamp_column,
@@ -1253,7 +1268,7 @@ std::unique_ptr<column> grouped_time_range_rolling_window(table_view const& grou
                                               following,
                                               min_periods,
                                               aggr,
-                                              cudf::get_default_stream(),
+                                              stream,
                                               mr);
 }
 
@@ -1277,6 +1292,7 @@ std::unique_ptr<column> grouped_range_rolling_window(table_view const& group_key
                                                      range_window_bounds const& following,
                                                      size_type min_periods,
                                                      rolling_aggregation const& aggr,
+                                                     rmm::cuda_stream_view stream,
                                                      rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
@@ -1288,7 +1304,7 @@ std::unique_ptr<column> grouped_range_rolling_window(table_view const& group_key
                                               following,
                                               min_periods,
                                               aggr,
-                                              cudf::get_default_stream(),
+                                              stream,
                                               mr);
 }
 
diff --git a/cpp/src/rolling/range_window_bounds.cpp b/cpp/src/rolling/range_window_bounds.cpp
index 68e80c6e84e..69792136c64 100644
--- a/cpp/src/rolling/range_window_bounds.cpp
+++ b/cpp/src/rolling/range_window_bounds.cpp
@@ -32,7 +32,8 @@ namespace {
  */
 struct range_scalar_constructor {
   template <typename T, CUDF_ENABLE_IF(not detail::is_supported_range_type<T>())>
-  std::unique_ptr<scalar> operator()(scalar const& range_scalar_) const
+  std::unique_ptr<scalar> operator()(scalar const& range_scalar_,
+                                     rmm::cuda_stream_view stream) const
   {
     CUDF_FAIL(
       "Unsupported range type. "
@@ -40,51 +41,57 @@ struct range_scalar_constructor {
   }
 
   template <typename T, CUDF_ENABLE_IF(cudf::is_duration<T>())>
-  std::unique_ptr<scalar> operator()(scalar const& range_scalar_) const
+  std::unique_ptr<scalar> operator()(scalar const& range_scalar_,
+                                     rmm::cuda_stream_view stream) const
   {
     return std::make_unique<duration_scalar<T>>(
-      static_cast<duration_scalar<T> const&>(range_scalar_));
+      static_cast<duration_scalar<T> const&>(range_scalar_), stream);
   }
 
   template <typename T, CUDF_ENABLE_IF(cudf::is_numeric<T>() && not cudf::is_boolean<T>())>
-  std::unique_ptr<scalar> operator()(scalar const& range_scalar_) const
+  std::unique_ptr<scalar> operator()(scalar const& range_scalar_,
+                                     rmm::cuda_stream_view stream) const
   {
-    return std::make_unique<numeric_scalar<T>>(
-      static_cast<numeric_scalar<T> const&>(range_scalar_));
+    return std::make_unique<numeric_scalar<T>>(static_cast<numeric_scalar<T> const&>(range_scalar_),
+                                               stream);
   }
 
   template <typename T, CUDF_ENABLE_IF(cudf::is_fixed_point<T>())>
-  std::unique_ptr<scalar> operator()(scalar const& range_scalar_) const
+  std::unique_ptr<scalar> operator()(scalar const& range_scalar_,
+                                     rmm::cuda_stream_view stream) const
   {
     return std::make_unique<fixed_point_scalar<T>>(
-      static_cast<fixed_point_scalar<T> const&>(range_scalar_));
+      static_cast<fixed_point_scalar<T> const&>(range_scalar_), stream);
   }
 };
 }  // namespace
 
-range_window_bounds::range_window_bounds(extent_type extent_, std::unique_ptr<scalar> range_scalar_)
+range_window_bounds::range_window_bounds(extent_type extent_,
+                                         std::unique_ptr<scalar> range_scalar_,
+                                         rmm::cuda_stream_view stream)
   : _extent{extent_}, _range_scalar{std::move(range_scalar_)}
 {
   CUDF_EXPECTS(_range_scalar.get(), "Range window scalar cannot be null.");
   CUDF_EXPECTS(_extent == extent_type::UNBOUNDED || _extent == extent_type::CURRENT_ROW ||
-                 _range_scalar->is_valid(),
+                 _range_scalar->is_valid(stream),
                "Bounded Range window scalar must be valid.");
 }
 
-range_window_bounds range_window_bounds::unbounded(data_type type)
+range_window_bounds range_window_bounds::unbounded(data_type type, rmm::cuda_stream_view stream)
 {
-  return {extent_type::UNBOUNDED, make_default_constructed_scalar(type)};
+  return {extent_type::UNBOUNDED, make_default_constructed_scalar(type, stream), stream};
 }
 
-range_window_bounds range_window_bounds::current_row(data_type type)
+range_window_bounds range_window_bounds::current_row(data_type type, rmm::cuda_stream_view stream)
 {
-  return {extent_type::CURRENT_ROW, make_default_constructed_scalar(type)};
+  return {extent_type::CURRENT_ROW, make_default_constructed_scalar(type, stream), stream};
 }
 
-range_window_bounds range_window_bounds::get(scalar const& boundary)
+range_window_bounds range_window_bounds::get(scalar const& boundary, rmm::cuda_stream_view stream)
 {
   return {extent_type::BOUNDED,
-          cudf::type_dispatcher(boundary.type(), range_scalar_constructor{}, boundary)};
+          cudf::type_dispatcher(boundary.type(), range_scalar_constructor{}, boundary, stream),
+          stream};
 }
 
 }  // namespace cudf
diff --git a/cpp/src/rolling/rolling.cu b/cpp/src/rolling/rolling.cu
index a308ed8a7a6..e612bd01118 100644
--- a/cpp/src/rolling/rolling.cu
+++ b/cpp/src/rolling/rolling.cu
@@ -32,17 +32,12 @@ std::unique_ptr<column> rolling_window(column_view const& input,
                                        size_type following_window,
                                        size_type min_periods,
                                        rolling_aggregation const& agg,
+                                       rmm::cuda_stream_view stream,
                                        rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::rolling_window(input,
-                                default_outputs,
-                                preceding_window,
-                                following_window,
-                                min_periods,
-                                agg,
-                                cudf::get_default_stream(),
-                                mr);
+  return detail::rolling_window(
+    input, default_outputs, preceding_window, following_window, min_periods, agg, stream, mr);
 }
 
 // Applies a fixed-size rolling window function to the values in a column, without default specified
@@ -51,6 +46,7 @@ std::unique_ptr<column> rolling_window(column_view const& input,
                                        size_type following_window,
                                        size_type min_periods,
                                        rolling_aggregation const& agg,
+                                       rmm::cuda_stream_view stream,
                                        rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
@@ -62,7 +58,7 @@ std::unique_ptr<column> rolling_window(column_view const& input,
                                 following_window,
                                 min_periods,
                                 agg,
-                                cudf::get_default_stream(),
+                                stream,
                                 mr);
 }
 
@@ -72,11 +68,12 @@ std::unique_ptr<column> rolling_window(column_view const& input,
                                        column_view const& following_window,
                                        size_type min_periods,
                                        rolling_aggregation const& agg,
+                                       rmm::cuda_stream_view stream,
                                        rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::rolling_window(
-    input, preceding_window, following_window, min_periods, agg, cudf::get_default_stream(), mr);
+    input, preceding_window, following_window, min_periods, agg, stream, mr);
 }
 
 }  // namespace cudf
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index f6d762cc2ec..b153c4984c5 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -693,10 +693,11 @@ ConfigureTest(STREAM_NULL_MASK_TEST streams/null_mask_test.cpp STREAM_MODE testi
 ConfigureTest(STREAM_ORCIO_TEST streams/io/orc_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_PARQUETIO_TEST streams/io/parquet_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_POOL_TEST streams/pool_test.cu STREAM_MODE testing)
+ConfigureTest(STREAM_REDUCTION_TEST streams/reduction_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing)
+ConfigureTest(STREAM_ROLLING_TEST streams/rolling_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_SORTING_TEST streams/sorting_test.cpp STREAM_MODE testing)
-ConfigureTest(STREAM_REDUCTION_TEST streams/reduction_test.cpp STREAM_MODE testing)
 ConfigureTest(
   STREAM_STRINGS_TEST
   streams/strings/case_test.cpp
diff --git a/cpp/tests/streams/rolling_test.cpp b/cpp/tests/streams/rolling_test.cpp
new file mode 100644
index 00000000000..b352ad2c0d2
--- /dev/null
+++ b/cpp/tests/streams/rolling_test.cpp
@@ -0,0 +1,246 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/default_stream.hpp>
+#include <cudf_test/type_lists.hpp>
+
+#include <cudf/detail/aggregation/aggregation.hpp>
+#include <cudf/rolling.hpp>
+#include <cudf/scalar/scalar.hpp>
+#include <cudf/scalar/scalar_factories.hpp>
+
+class RollingTest : public cudf::test::BaseFixture {};
+
+TEST_F(RollingTest, FixedSize)
+{
+  cudf::test::fixed_width_column_wrapper<cudf::size_type> input({1, 2, 3, 4, 5, 6, 7, 8, 9});
+
+  cudf::rolling_window(input,
+                       2,
+                       3,
+                       1,
+                       *cudf::make_min_aggregation<cudf::rolling_aggregation>(),
+                       cudf::test::get_default_stream());
+}
+
+TEST_F(RollingTest, FixedSizeDefault)
+{
+  cudf::test::fixed_width_column_wrapper<cudf::size_type> input({1, 2, 3, 4, 5, 6, 7, 8, 9});
+  cudf::test::fixed_width_column_wrapper<cudf::size_type> defaults({42, 42, 42, 42, 9, 9, 7, 1, 1});
+
+  cudf::rolling_window(input,
+                       defaults,
+                       2,
+                       3,
+                       1,
+                       *cudf::make_lead_aggregation<cudf::rolling_aggregation>(1),
+                       cudf::test::get_default_stream());
+}
+
+TEST_F(RollingTest, VariableSize)
+{
+  cudf::test::fixed_width_column_wrapper<cudf::size_type> input({1, 2, 3, 4, 5, 6, 7, 8, 9});
+  cudf::test::fixed_width_column_wrapper<cudf::size_type> preceding({2, 2, 2, 2, 3, 3, 3, 3, 3});
+  cudf::test::fixed_width_column_wrapper<cudf::size_type> following({3, 3, 3, 3, 3, 2, 2, 2, 2});
+
+  cudf::rolling_window(input,
+                       preceding,
+                       following,
+                       1,
+                       *cudf::make_min_aggregation<cudf::rolling_aggregation>(),
+                       cudf::test::get_default_stream());
+}
+
+class GroupedRollingTest : public cudf::test::BaseFixture {};
+
+TEST_F(GroupedRollingTest, FixedSize)
+{
+  cudf::test::fixed_width_column_wrapper<cudf::size_type> input({1, 2, 3, 4, 5, 6, 7, 8, 9});
+
+  cudf::test::fixed_width_column_wrapper<cudf::size_type> key_0({1, 1, 1, 2, 2, 2, 3, 3, 3});
+
+  cudf::test::fixed_width_column_wrapper<cudf::size_type> key_1({4, 4, 4, 5, 5, 5, 6, 6, 6});
+
+  cudf::table_view grouping_keys{std::vector<cudf::column_view>{key_0, key_1}};
+
+  cudf::grouped_rolling_window(grouping_keys,
+                               input,
+                               2,
+                               3,
+                               1,
+                               *cudf::make_min_aggregation<cudf::rolling_aggregation>(),
+                               cudf::test::get_default_stream());
+}
+
+TEST_F(GroupedRollingTest, FixedSizeDefault)
+{
+  cudf::test::fixed_width_column_wrapper<cudf::size_type> input({1, 2, 3, 4, 5, 6, 7, 8, 9});
+
+  cudf::test::fixed_width_column_wrapper<cudf::size_type> key_0({1, 1, 1, 2, 2, 2, 3, 3, 3});
+
+  cudf::test::fixed_width_column_wrapper<cudf::size_type> key_1({4, 4, 4, 5, 5, 5, 6, 6, 6});
+
+  cudf::test::fixed_width_column_wrapper<cudf::size_type> defaults({42, 42, 42, 42, 9, 9, 7, 1, 1});
+
+  cudf::table_view grouping_keys{std::vector<cudf::column_view>{key_0, key_1}};
+
+  cudf::grouped_rolling_window(grouping_keys,
+                               input,
+                               defaults,
+                               2,
+                               3,
+                               1,
+                               *cudf::make_lead_aggregation<cudf::rolling_aggregation>(1),
+                               cudf::test::get_default_stream());
+}
+
+TEST_F(GroupedRollingTest, WindowBounds)
+{
+  cudf::test::fixed_width_column_wrapper<cudf::size_type> input({1, 2, 3, 4, 5, 6, 7, 8, 9});
+
+  cudf::test::fixed_width_column_wrapper<cudf::size_type> key_0({1, 1, 1, 2, 2, 2, 3, 3, 3});
+
+  cudf::test::fixed_width_column_wrapper<cudf::size_type> key_1({4, 4, 4, 5, 5, 5, 6, 6, 6});
+
+  auto const unbounded_preceding = cudf::window_bounds::unbounded();
+  auto const following           = cudf::window_bounds::get(1L);
+
+  cudf::table_view grouping_keys{std::vector<cudf::column_view>{key_0, key_1}};
+
+  cudf::grouped_rolling_window(grouping_keys,
+                               input,
+                               unbounded_preceding,
+                               following,
+                               1,
+                               *cudf::make_min_aggregation<cudf::rolling_aggregation>(),
+                               cudf::test::get_default_stream());
+}
+
+TEST_F(GroupedRollingTest, WindowBoundsDefault)
+{
+  cudf::test::fixed_width_column_wrapper<cudf::size_type> input({1, 2, 3, 4, 5, 6, 7, 8, 9});
+
+  cudf::test::fixed_width_column_wrapper<cudf::size_type> key_0({1, 1, 1, 2, 2, 2, 3, 3, 3});
+
+  cudf::test::fixed_width_column_wrapper<cudf::size_type> key_1({4, 4, 4, 5, 5, 5, 6, 6, 6});
+
+  cudf::test::fixed_width_column_wrapper<cudf::size_type> defaults({42, 42, 42, 42, 9, 9, 7, 1, 1});
+
+  auto const unbounded_preceding = cudf::window_bounds::unbounded();
+  auto const following           = cudf::window_bounds::get(1L);
+
+  cudf::table_view grouping_keys{std::vector<cudf::column_view>{key_0, key_1}};
+
+  cudf::grouped_rolling_window(grouping_keys,
+                               input,
+                               defaults,
+                               unbounded_preceding,
+                               following,
+                               1,
+                               *cudf::make_lead_aggregation<cudf::rolling_aggregation>(1),
+                               cudf::test::get_default_stream());
+}
+
+class GroupedTimeRollingTest : public cudf::test::BaseFixture {};
+
+TEST_F(GroupedTimeRollingTest, FixedSize)
+{
+  auto const grp_col =
+    cudf::test::fixed_width_column_wrapper<cudf::size_type>{0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+  auto const agg_col = cudf::test::fixed_width_column_wrapper<cudf::size_type>{
+    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, {1, 1, 1, 1, 1, 0, 1, 1, 1, 1}};
+  auto const time_col =
+    cudf::test::fixed_width_column_wrapper<cudf::timestamp_D, cudf::timestamp_D::rep>{
+      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, {0, 0, 0, 0, 1, 1, 1, 1, 1, 1}};
+
+  auto const grouping_keys = cudf::table_view{std::vector<cudf::column_view>{grp_col}};
+  auto const preceding     = 1L;
+  auto const following     = 1L;
+  auto const min_periods   = 1L;
+  cudf::grouped_time_range_rolling_window(
+    grouping_keys,
+    time_col,
+    cudf::order::ASCENDING,
+    agg_col,
+    preceding,
+    following,
+    min_periods,
+    *cudf::make_count_aggregation<cudf::rolling_aggregation>(),
+    cudf::test::get_default_stream());
+}
+
+TEST_F(GroupedTimeRollingTest, WindowBounds)
+{
+  auto const grp_col =
+    cudf::test::fixed_width_column_wrapper<cudf::size_type>{0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+  auto const agg_col = cudf::test::fixed_width_column_wrapper<cudf::size_type>{
+    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, {1, 1, 1, 1, 1, 0, 1, 1, 1, 1}};
+  auto const time_col =
+    cudf::test::fixed_width_column_wrapper<cudf::timestamp_D, cudf::timestamp_D::rep>{
+      {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, {0, 0, 0, 0, 1, 1, 1, 1, 1, 1}};
+
+  auto const grouping_keys       = cudf::table_view{std::vector<cudf::column_view>{grp_col}};
+  auto const unbounded_preceding = cudf::window_bounds::unbounded();
+  auto const following           = cudf::window_bounds::get(1L);
+
+  auto const min_periods = 1L;
+  cudf::grouped_time_range_rolling_window(
+    grouping_keys,
+    time_col,
+    cudf::order::ASCENDING,
+    agg_col,
+    unbounded_preceding,
+    following,
+    min_periods,
+    *cudf::make_count_aggregation<cudf::rolling_aggregation>(),
+    cudf::test::get_default_stream());
+}
+
+class GroupedRangeRollingTest : public cudf::test::BaseFixture {};
+
+TEST_F(GroupedRangeRollingTest, RangeWindowBounds)
+{
+  auto const grp_col = cudf::test::fixed_width_column_wrapper<int>{0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+  auto const agg_col = cudf::test::fixed_width_column_wrapper<int>{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9},
+                                                                   {1, 1, 1, 1, 1, 0, 1, 1, 1, 1}};
+
+  auto const order_by = cudf::test::fixed_width_column_wrapper<int>{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9},
+                                                                    {0, 0, 0, 0, 1, 1, 1, 1, 1, 1}};
+
+  cudf::range_window_bounds preceding = cudf::range_window_bounds::get(
+    cudf::numeric_scalar<int>{int{1}, true, cudf::test::get_default_stream()},
+    cudf::test::get_default_stream());
+
+  cudf::range_window_bounds following = cudf::range_window_bounds::get(
+    cudf::numeric_scalar<int>{int{1}, true, cudf::test::get_default_stream()},
+    cudf::test::get_default_stream());
+
+  auto const min_periods = cudf::size_type{1};
+
+  auto const grouping_keys = cudf::table_view{std::vector<cudf::column_view>{grp_col}};
+
+  cudf::grouped_range_rolling_window(grouping_keys,
+                                     order_by,
+                                     cudf::order::ASCENDING,
+                                     agg_col,
+                                     preceding,
+                                     following,
+                                     min_periods,
+                                     *cudf::make_count_aggregation<cudf::rolling_aggregation>(),
+                                     cudf::test::get_default_stream());
+}

From 56e84425e84029b9b7c2ba07f0b8bbfd94846a40 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Mon, 17 Jun 2024 17:55:55 -0400
Subject: [PATCH 378/842] Fix target counting in strings char-parallel replace
 (#16017)

Replace `thrust::count_if` call across int64 characters to use a custom kernel instead.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Srinivas Yadav (https://github.com/srinivasyadav18)

URL: https://github.com/rapidsai/cudf/pull/16017
---
 cpp/src/strings/replace/replace.cu            | 37 +++++++++++++++++--
 cpp/tests/CMakeLists.txt                      |  2 +-
 ...ny_strings_tests.cpp => replace_tests.cpp} | 24 +++++++++++-
 3 files changed, 56 insertions(+), 7 deletions(-)
 rename cpp/tests/large_strings/{many_strings_tests.cpp => replace_tests.cpp} (72%)

diff --git a/cpp/src/strings/replace/replace.cu b/cpp/src/strings/replace/replace.cu
index 501e6d547e6..f7a3a3aea5c 100644
--- a/cpp/src/strings/replace/replace.cu
+++ b/cpp/src/strings/replace/replace.cu
@@ -238,6 +238,31 @@ struct replace_parallel_chars_fn {
   cudf::size_type maxrepl;
 };
 
+template <int64_t block_size, size_type bytes_per_thread>
+CUDF_KERNEL void count_targets_kernel(replace_parallel_chars_fn fn,
+                                      int64_t chars_bytes,
+                                      int64_t* d_output)
+{
+  auto const idx      = cudf::detail::grid_1d::global_thread_id();
+  auto const byte_idx = static_cast<int64_t>(idx) * bytes_per_thread;
+  auto const lane_idx = static_cast<cudf::size_type>(threadIdx.x);
+
+  using block_reduce = cub::BlockReduce<int64_t, block_size>;
+  __shared__ typename block_reduce::TempStorage temp_storage;
+
+  int64_t count = 0;
+  // each thread processes multiple bytes
+  for (auto i = byte_idx; (i < (byte_idx + bytes_per_thread)) && (i < chars_bytes); ++i) {
+    count += fn.has_target(i);
+  }
+  auto const total = block_reduce(temp_storage).Reduce(count, cub::Sum());
+
+  if ((lane_idx == 0) && (total > 0)) {
+    cuda::atomic_ref<int64_t, cuda::thread_scope_device> ref{*d_output};
+    ref.fetch_add(total, cuda::std::memory_order_relaxed);
+  }
+}
+
 std::unique_ptr<column> replace_character_parallel(strings_column_view const& input,
                                                    string_view const& d_target,
                                                    string_view const& d_replacement,
@@ -260,10 +285,14 @@ std::unique_ptr<column> replace_character_parallel(strings_column_view const& in
 
   // Count the number of targets in the entire column.
   // Note this may over-count in the case where a target spans adjacent strings.
-  auto target_count = thrust::count_if(rmm::exec_policy_nosync(stream),
-                                       thrust::make_counting_iterator<int64_t>(0),
-                                       thrust::make_counting_iterator<int64_t>(chars_bytes),
-                                       [fn] __device__(int64_t idx) { return fn.has_target(idx); });
+  rmm::device_scalar<int64_t> d_target_count(0, stream);
+  constexpr int64_t block_size         = 512;
+  constexpr size_type bytes_per_thread = 4;
+  auto const num_blocks                = util::div_rounding_up_safe(
+    util::div_rounding_up_safe(chars_bytes, static_cast<int64_t>(bytes_per_thread)), block_size);
+  count_targets_kernel<block_size, bytes_per_thread>
+    <<<num_blocks, block_size, 0, stream.value()>>>(fn, chars_bytes, d_target_count.data());
+  auto target_count = d_target_count.value(stream);
 
   // Create a vector of every target position in the chars column.
   // These may also include overlapping targets which will be resolved later.
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index b153c4984c5..329edbe4d36 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -571,9 +571,9 @@ ConfigureTest(
   large_strings/concatenate_tests.cpp
   large_strings/case_tests.cpp
   large_strings/large_strings_fixture.cpp
-  large_strings/many_strings_tests.cpp
   large_strings/merge_tests.cpp
   large_strings/parquet_tests.cpp
+  large_strings/replace_tests.cpp
   large_strings/reshape_tests.cpp
   large_strings/split_strings_tests.cpp
   GPUS 1
diff --git a/cpp/tests/large_strings/many_strings_tests.cpp b/cpp/tests/large_strings/replace_tests.cpp
similarity index 72%
rename from cpp/tests/large_strings/many_strings_tests.cpp
rename to cpp/tests/large_strings/replace_tests.cpp
index 73fbb21d014..aa65ec0c010 100644
--- a/cpp/tests/large_strings/many_strings_tests.cpp
+++ b/cpp/tests/large_strings/replace_tests.cpp
@@ -21,6 +21,7 @@
 #include <cudf/concatenate.hpp>
 #include <cudf/copying.hpp>
 #include <cudf/scalar/scalar.hpp>
+#include <cudf/strings/combine.hpp>
 #include <cudf/strings/replace.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/table/table_view.hpp>
@@ -28,9 +29,9 @@
 #include <limits>
 #include <vector>
 
-struct StringsManyTest : public cudf::test::StringsLargeTest {};
+struct ReplaceTest : public cudf::test::StringsLargeTest {};
 
-TEST_F(StringsManyTest, Replace)
+TEST_F(ReplaceTest, ReplaceLong)
 {
   auto const expected = this->very_long_column();
   auto const view     = cudf::column_view(expected);
@@ -65,3 +66,22 @@ TEST_F(StringsManyTest, Replace)
     CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(c, expected);
   }
 }
+
+TEST_F(ReplaceTest, ReplaceWide)
+{
+  auto const expected   = this->long_column();
+  auto const view       = cudf::column_view(expected);
+  auto const multiplier = 10;
+  auto const separator  = cudf::string_scalar("|");
+  auto const input      = cudf::strings::concatenate(
+    cudf::table_view(std::vector<cudf::column_view>(multiplier, view)), separator);
+
+  auto const input_view = cudf::strings_column_view(input->view());
+  auto const target     = cudf::string_scalar("3");  // fake the actual replace;
+  auto const repl       = cudf::string_scalar("3");  // logic still builds the output
+  auto result           = cudf::strings::replace(input_view, target, repl);
+
+  auto sv = cudf::strings_column_view(result->view());
+  EXPECT_EQ(sv.offsets().type(), cudf::data_type{cudf::type_id::INT64});
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(input->view(), result->view());
+}

From 7ff2764d7538c954694f77f1006b52c7cdfe9533 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Mon, 17 Jun 2024 18:48:32 -0700
Subject: [PATCH 379/842] Fix `atomic_ref` scope when multiple blocks are
 updating the same output (#16051)

in a few places, `thread_scope_block` is used even where there threads from multiple blocks update the same location. This PR changes these to `thread_scope_device` to avoid UB with sufficiently large inputs.

Have not ran benchmarks to evaluate the impact.

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - Mark Harris (https://github.com/harrism)

URL: https://github.com/rapidsai/cudf/pull/16051
---
 cpp/src/strings/case.cu          | 2 +-
 cpp/src/strings/replace/multi.cu | 2 +-
 cpp/src/strings/split/split.cuh  | 2 +-
 cpp/src/text/tokenize.cu         | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/cpp/src/strings/case.cu b/cpp/src/strings/case.cu
index c1688d20791..27befdea209 100644
--- a/cpp/src/strings/case.cu
+++ b/cpp/src/strings/case.cu
@@ -294,7 +294,7 @@ CUDF_KERNEL void has_multibytes_kernel(char const* d_input_chars,
   auto const mb_total = block_reduce(temp_storage).Reduce(mb_count, cub::Sum());
 
   if ((lane_idx == 0) && (mb_total > 0)) {
-    cuda::atomic_ref<int64_t, cuda::thread_scope_block> ref{*d_output};
+    cuda::atomic_ref<int64_t, cuda::thread_scope_device> ref{*d_output};
     ref.fetch_add(mb_total, cuda::std::memory_order_relaxed);
   }
 }
diff --git a/cpp/src/strings/replace/multi.cu b/cpp/src/strings/replace/multi.cu
index 8e5c5cf60b8..43a3d69091a 100644
--- a/cpp/src/strings/replace/multi.cu
+++ b/cpp/src/strings/replace/multi.cu
@@ -283,7 +283,7 @@ CUDF_KERNEL void count_targets(replace_multi_parallel_fn fn, int64_t chars_bytes
   auto const total = block_reduce(temp_storage).Reduce(count, cub::Sum());
 
   if ((lane_idx == 0) && (total > 0)) {
-    cuda::atomic_ref<int64_t, cuda::thread_scope_block> ref{*d_output};
+    cuda::atomic_ref<int64_t, cuda::thread_scope_device> ref{*d_output};
     ref.fetch_add(total, cuda::std::memory_order_relaxed);
   }
 }
diff --git a/cpp/src/strings/split/split.cuh b/cpp/src/strings/split/split.cuh
index ae3c0b3aa12..23614ac0733 100644
--- a/cpp/src/strings/split/split.cuh
+++ b/cpp/src/strings/split/split.cuh
@@ -327,7 +327,7 @@ CUDF_KERNEL void count_delimiters_kernel(Tokenizer tokenizer,
   auto const total = block_reduce(temp_storage).Reduce(count, cub::Sum());
 
   if ((lane_idx == 0) && (total > 0)) {
-    cuda::atomic_ref<int64_t, cuda::thread_scope_block> ref{*d_output};
+    cuda::atomic_ref<int64_t, cuda::thread_scope_device> ref{*d_output};
     ref.fetch_add(total, cuda::std::memory_order_relaxed);
   }
 }
diff --git a/cpp/src/text/tokenize.cu b/cpp/src/text/tokenize.cu
index 25406bce759..3ce6064d9c2 100644
--- a/cpp/src/text/tokenize.cu
+++ b/cpp/src/text/tokenize.cu
@@ -121,7 +121,7 @@ CUDF_KERNEL void count_characters(uint8_t const* d_chars, int64_t chars_bytes, i
   auto const total = block_reduce(temp_storage).Reduce(count, cub::Sum());
 
   if ((lane_idx == 0) && (total > 0)) {
-    cuda::atomic_ref<int64_t, cuda::thread_scope_block> ref{*d_output};
+    cuda::atomic_ref<int64_t, cuda::thread_scope_device> ref{*d_output};
     ref.fetch_add(total, cuda::std::memory_order_relaxed);
   }
 }

From 0bdf934f6017402b88c9c0fe798013203af8c39f Mon Sep 17 00:00:00 2001
From: Srinivas Yadav <43375352+srinivasyadav18@users.noreply.github.com>
Date: Mon, 17 Jun 2024 19:46:16 -0700
Subject: [PATCH 380/842] Reduce conditional_join nvbench configurations
 (#16036)

The current **JOIN_NVBENCH** uses three table sizes `1000, 100'000, 10'000'000` for all the **join** benchmarks.

But, **coditional** **joins** perform **X * Y** operations, and the large table size explodes the bench runtime. Hence we need to benchmark on smaller tables only.

This PR reduces **nvbench configurations** from **36** to **16** by using only two smaller table sizes `1000, 100'000` which lowers the overall benchmark runtime significantly.

Authors:
  - Srinivas Yadav (https://github.com/srinivasyadav18)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - Karthikeyan (https://github.com/karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/16036
---
 cpp/benchmarks/join/conditional_join.cu | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/cpp/benchmarks/join/conditional_join.cu b/cpp/benchmarks/join/conditional_join.cu
index e332d09d31b..2deb888cc5c 100644
--- a/cpp/benchmarks/join/conditional_join.cu
+++ b/cpp/benchmarks/join/conditional_join.cu
@@ -16,6 +16,8 @@
 
 #include "join_common.hpp"
 
+auto const CONDITIONAL_JOIN_SIZE_RANGE = std::vector<nvbench::int64_t>{1000, 100'000};
+
 template <typename Key, bool Nullable>
 void nvbench_conditional_inner_join(nvbench::state& state,
                                     nvbench::type_list<Key, nvbench::enum_type<Nullable>>)
@@ -46,12 +48,12 @@ NVBENCH_BENCH_TYPES(nvbench_conditional_inner_join,
                     NVBENCH_TYPE_AXES(JOIN_KEY_TYPE_RANGE, JOIN_NULLABLE_RANGE))
   .set_name("conditional_inner_join")
   .set_type_axes_names({"Key", "Nullable"})
-  .add_int64_axis("left_size", JOIN_SIZE_RANGE)
-  .add_int64_axis("right_size", JOIN_SIZE_RANGE);
+  .add_int64_axis("left_size", CONDITIONAL_JOIN_SIZE_RANGE)
+  .add_int64_axis("right_size", CONDITIONAL_JOIN_SIZE_RANGE);
 
 NVBENCH_BENCH_TYPES(nvbench_conditional_left_join,
                     NVBENCH_TYPE_AXES(JOIN_KEY_TYPE_RANGE, JOIN_NULLABLE_RANGE))
   .set_name("conditional_left_join")
   .set_type_axes_names({"Key", "Nullable"})
-  .add_int64_axis("left_size", JOIN_SIZE_RANGE)
-  .add_int64_axis("right_size", JOIN_SIZE_RANGE);
+  .add_int64_axis("left_size", CONDITIONAL_JOIN_SIZE_RANGE)
+  .add_int64_axis("right_size", CONDITIONAL_JOIN_SIZE_RANGE);

From dcc153b67c48909a7bd5fcecfd4ccc91844e55ec Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Tue, 18 Jun 2024 09:07:37 +0100
Subject: [PATCH 381/842] Remove deprecated ExtContext node (#16001)

Polars deprecated with_context in the alpha for version 1, and will remove it for version 1. So let's not bother implementing it. Also add some no-cover pragmas to unreachable code in the translation DSL layer.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Thomas Li (https://github.com/lithomas1)

URL: https://github.com/rapidsai/cudf/pull/16001
---
 python/cudf_polars/cudf_polars/dsl/ir.py      | 31 +++++--------------
 .../cudf_polars/cudf_polars/dsl/translate.py  | 21 +++++--------
 python/cudf_polars/tests/test_extcontext.py   | 23 --------------
 3 files changed, 14 insertions(+), 61 deletions(-)
 delete mode 100644 python/cudf_polars/tests/test_extcontext.py

diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py
index 7f0920e1b57..83957e4286d 100644
--- a/python/cudf_polars/cudf_polars/dsl/ir.py
+++ b/python/cudf_polars/cudf_polars/dsl/ir.py
@@ -17,7 +17,7 @@
 import itertools
 import types
 from functools import cache
-from typing import TYPE_CHECKING, Any, Callable, ClassVar, NoReturn
+from typing import TYPE_CHECKING, Any, Callable, ClassVar
 
 import pyarrow as pa
 from typing_extensions import assert_never
@@ -56,7 +56,6 @@
     "MapFunction",
     "Union",
     "HConcat",
-    "ExtContext",
 ]
 
 
@@ -153,7 +152,9 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
             since the translation phase should pick up things that we
             cannot handle.
         """
-        raise NotImplementedError
+        raise NotImplementedError(
+            f"Evaluation of plan {type(self).__name__}"
+        )  # pragma: no cover
 
 
 @dataclasses.dataclass(slots=True)
@@ -346,7 +347,9 @@ class Reduce(IR):
     expr: list[expr.NamedExpr]
     """List of expressions to evaluate to form the new dataframe."""
 
-    def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
+    def evaluate(
+        self, *, cache: MutableMapping[int, DataFrame]
+    ) -> DataFrame:  # pragma: no cover; polars doesn't emit this node yet
         """Evaluate and return a dataframe."""
         df = self.df.evaluate(cache=cache)
         columns = broadcast(*(e.evaluate(df) for e in self.expr))
@@ -938,23 +941,3 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         return DataFrame(
             list(itertools.chain.from_iterable(df.columns for df in dfs)),
         )
-
-
-@dataclasses.dataclass(slots=True)
-class ExtContext(IR):
-    """
-    Concatenate dataframes horizontally.
-
-    Prefer HConcat, since this is going to be deprecated on the polars side.
-    """
-
-    df: IR
-    """Input."""
-    extra: list[IR]
-    """List of extra inputs."""
-
-    def __post_init__(self) -> NoReturn:
-        """Validate preconditions."""
-        raise NotImplementedError(
-            "ExtContext will be deprecated, use horizontal concat instead."
-        )
diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py
index adde3b1a9dc..41bc3032bc5 100644
--- a/python/cudf_polars/cudf_polars/dsl/translate.py
+++ b/python/cudf_polars/cudf_polars/dsl/translate.py
@@ -63,7 +63,9 @@ def __exit__(self, *args: Any) -> None:
 def _translate_ir(
     node: Any, visitor: NodeTraverser, schema: dict[str, plc.DataType]
 ) -> ir.IR:
-    raise NotImplementedError(f"Translation for {type(node).__name__}")
+    raise NotImplementedError(
+        f"Translation for {type(node).__name__}"
+    )  # pragma: no cover
 
 
 @_translate_ir.register
@@ -172,7 +174,7 @@ def _(
 @_translate_ir.register
 def _(
     node: pl_ir.Reduce, visitor: NodeTraverser, schema: dict[str, plc.DataType]
-) -> ir.IR:
+) -> ir.IR:  # pragma: no cover; polars doesn't emit this node yet
     with set_node(visitor, node.input):
         inp = translate_ir(visitor, n=None)
         exprs = [translate_named_expr(visitor, n=e) for e in node.expr]
@@ -256,17 +258,6 @@ def _(
     return ir.HConcat(schema, [translate_ir(visitor, n=n) for n in node.inputs])
 
 
-@_translate_ir.register
-def _(
-    node: pl_ir.ExtContext, visitor: NodeTraverser, schema: dict[str, plc.DataType]
-) -> ir.IR:
-    return ir.ExtContext(
-        schema,
-        translate_ir(visitor, n=node.input),
-        [translate_ir(visitor, n=n) for n in node.contexts],
-    )
-
-
 def translate_ir(visitor: NodeTraverser, *, n: int | None = None) -> ir.IR:
     """
     Translate a polars-internal IR node to our representation.
@@ -333,7 +324,9 @@ def translate_named_expr(
 def _translate_expr(
     node: Any, visitor: NodeTraverser, dtype: plc.DataType
 ) -> expr.Expr:
-    raise NotImplementedError(f"Translation for {type(node).__name__}")
+    raise NotImplementedError(
+        f"Translation for {type(node).__name__}"
+    )  # pragma: no cover
 
 
 @_translate_expr.register
diff --git a/python/cudf_polars/tests/test_extcontext.py b/python/cudf_polars/tests/test_extcontext.py
deleted file mode 100644
index 9daf88b4338..00000000000
--- a/python/cudf_polars/tests/test_extcontext.py
+++ /dev/null
@@ -1,23 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
-# SPDX-License-Identifier: Apache-2.0
-from __future__ import annotations
-
-import pytest
-
-import polars as pl
-
-from cudf_polars.testing.asserts import assert_gpu_result_equal
-
-
-def test_extcontext():
-    ldf = pl.DataFrame(
-        {
-            "a": [1, 2, 3, 4, 5, 6, 7],
-            "b": [1, 1, 1, 1, 1, 1, 1],
-        }
-    ).lazy()
-    ldf2 = ldf.select((pl.col("b") + pl.col("a")).alias("c"))
-    query = ldf.with_context(ldf2).select(pl.col("b"), pl.col("c"))
-    with pytest.raises(pl.exceptions.ComputeError):
-        # ExtContext to be deprecated so we're not implementing it.
-        assert_gpu_result_equal(query)

From 102d30add77e9a618d38e8eba6fa1f8472e7c10c Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 18 Jun 2024 07:41:49 -1000
Subject: [PATCH 382/842] Remove `override_dtypes` and `include_index` from
 `Frame._copy_type_metadata` (#16043)

* `override_dtypes` logic was only needed for `.explode`. I think it's appropriate to make it a postprocessing step in that function
* `include_index` logic was able to be transferred more simply to `IndexedFrame._from_columns_like_self`

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/16043
---
 python/cudf/cudf/core/_base_index.py     |   4 +-
 python/cudf/cudf/core/dataframe.py       |   6 --
 python/cudf/cudf/core/frame.py           |  26 +-----
 python/cudf/cudf/core/index.py           |  25 ++----
 python/cudf/cudf/core/indexed_frame.py   | 101 +++++++----------------
 python/cudf/cudf/core/multiindex.py      |   6 +-
 python/cudf/cudf/tests/test_dataframe.py |  18 ++++
 7 files changed, 63 insertions(+), 123 deletions(-)

diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py
index e71e45e410e..ad73cd57f7d 100644
--- a/python/cudf/cudf/core/_base_index.py
+++ b/python/cudf/cudf/core/_base_index.py
@@ -282,9 +282,7 @@ def __contains__(self, item):
         hash(item)
         return item in self._values
 
-    def _copy_type_metadata(
-        self, other: Self, *, override_dtypes=None
-    ) -> Self:
+    def _copy_type_metadata(self: Self, other: Self) -> Self:
         raise NotImplementedError
 
     def get_level_values(self, level):
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 065b13561ab..76bb9d2a8ed 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -7361,9 +7361,6 @@ def explode(self, column, ignore_index=False):
         3     4  44
         3     5  44
         """
-        if column not in self._column_names:
-            raise KeyError(column)
-
         return super()._explode(column, ignore_index)
 
     def pct_change(
@@ -7511,14 +7508,11 @@ def _from_columns_like_self(
         columns: list[ColumnBase],
         column_names: abc.Iterable[str] | None = None,
         index_names: list[str] | None = None,
-        *,
-        override_dtypes: abc.Iterable[Dtype | None] | None = None,
     ) -> DataFrame:
         result = super()._from_columns_like_self(
             columns,
             column_names,
             index_names,
-            override_dtypes=override_dtypes,
         )
         result._set_columns_like(self._data)
         return result
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index c58a0161ee0..38bff3946d6 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -3,7 +3,6 @@
 from __future__ import annotations
 
 import copy
-import itertools
 import operator
 import pickle
 import warnings
@@ -80,7 +79,7 @@ def _columns(self) -> tuple[ColumnBase, ...]:
         return self._data.columns
 
     @property
-    def _dtypes(self) -> abc.Iterator:
+    def _dtypes(self) -> abc.Iterable:
         return zip(self._data.names, (col.dtype for col in self._data.columns))
 
     @property
@@ -145,8 +144,6 @@ def _from_columns_like_self(
         self,
         columns: list[ColumnBase],
         column_names: abc.Iterable[str] | None = None,
-        *,
-        override_dtypes: abc.Iterable[Dtype | None] | None = None,
     ):
         """Construct a Frame from a list of columns with metadata from self.
 
@@ -156,7 +153,7 @@ def _from_columns_like_self(
             column_names = self._column_names
         data = dict(zip(column_names, columns))
         frame = self.__class__._from_data(data)
-        return frame._copy_type_metadata(self, override_dtypes=override_dtypes)
+        return frame._copy_type_metadata(self)
 
     @_cudf_nvtx_annotate
     def _mimic_inplace(
@@ -1032,29 +1029,14 @@ def _positions_from_column_names(self, column_names) -> list[int]:
         ]
 
     @_cudf_nvtx_annotate
-    def _copy_type_metadata(
-        self,
-        other: Self,
-        *,
-        override_dtypes: abc.Iterable[Dtype | None] | None = None,
-    ) -> Self:
+    def _copy_type_metadata(self: Self, other: Self) -> Self:
         """
         Copy type metadata from each column of `other` to the corresponding
         column of `self`.
 
-        If override_dtypes is provided, any non-None entry
-        will be used in preference to the relevant column of other to
-        provide the new dtype.
-
         See `ColumnBase._with_type_metadata` for more information.
         """
-        if override_dtypes is None:
-            override_dtypes = itertools.repeat(None)
-        dtypes = (
-            dtype if dtype is not None else col.dtype
-            for (dtype, col) in zip(override_dtypes, other._data.values())
-        )
-        for (name, col), dtype in zip(self._data.items(), dtypes):
+        for (name, col), (_, dtype) in zip(self._data.items(), other._dtypes):
             self._data.set_by_label(
                 name, col._with_type_metadata(dtype), validate=False
             )
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index df21d392311..1c5d05d2d87 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -62,7 +62,7 @@
 from cudf.utils.utils import _warn_no_dask_cudf, search_range
 
 if TYPE_CHECKING:
-    from collections.abc import Generator
+    from collections.abc import Generator, Iterable
 
 
 class IndexMeta(type):
@@ -232,9 +232,7 @@ def __init__(
                     raise ValueError("Step must not be zero.") from err
                 raise
 
-    def _copy_type_metadata(
-        self, other: RangeIndex, *, override_dtypes=None
-    ) -> Self:
+    def _copy_type_metadata(self: Self, other: Self) -> Self:
         # There is no metadata to be copied for RangeIndex since it does not
         # have an underlying column.
         return self
@@ -485,6 +483,10 @@ def dtype(self):
         dtype = np.dtype(np.int64)
         return _maybe_convert_to_default_type(dtype)
 
+    @property
+    def _dtypes(self) -> Iterable:
+        return [(self.name, self.dtype)]
+
     @_cudf_nvtx_annotate
     def to_pandas(
         self, *, nullable: bool = False, arrow_type: bool = False
@@ -1115,15 +1117,6 @@ def _binaryop(
             return ret.values
         return ret
 
-    # Override just to make mypy happy.
-    @_cudf_nvtx_annotate
-    def _copy_type_metadata(
-        self, other: Self, *, override_dtypes=None
-    ) -> Self:
-        return super()._copy_type_metadata(
-            other, override_dtypes=override_dtypes
-        )
-
     @property  # type: ignore
     @_cudf_nvtx_annotate
     def _values(self):
@@ -1769,10 +1762,8 @@ def __init__(
                 raise ValueError("No unique frequency found")
 
     @_cudf_nvtx_annotate
-    def _copy_type_metadata(
-        self: DatetimeIndex, other: DatetimeIndex, *, override_dtypes=None
-    ) -> Index:
-        super()._copy_type_metadata(other, override_dtypes=override_dtypes)
+    def _copy_type_metadata(self: Self, other: Self) -> Self:
+        super()._copy_type_metadata(other)
         self._freq = _validate_freq(other._freq)
         return self
 
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 06da62306e8..f1b74adefed 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -313,17 +313,11 @@ def _from_columns_like_self(
         columns: list[ColumnBase],
         column_names: abc.Iterable[str] | None = None,
         index_names: list[str] | None = None,
-        *,
-        override_dtypes: abc.Iterable[Dtype | None] | None = None,
     ) -> Self:
         """Construct a `Frame` from a list of columns with metadata from self.
 
         If `index_names` is set, the first `len(index_names)` columns are
         used to construct the index of the frame.
-
-        If override_dtypes is provided then any non-None entry will be
-        used for the dtype of the matching column in preference to the
-        dtype of the column in self.
         """
         if column_names is None:
             column_names = self._column_names
@@ -337,22 +331,24 @@ def _from_columns_like_self(
             index = _index_from_data(
                 dict(enumerate(columns[:n_index_columns]))
             )
+            index = index._copy_type_metadata(self.index)
+            # TODO: Should this if statement be handled in Index._copy_type_metadata?
+            if (
+                isinstance(self.index, cudf.CategoricalIndex)
+                and not isinstance(index, cudf.CategoricalIndex)
+            ) or (
+                isinstance(self.index, cudf.MultiIndex)
+                and not isinstance(index, cudf.MultiIndex)
+            ):
+                index = type(self.index)._from_data(index._data)
             if isinstance(index, cudf.MultiIndex):
                 index.names = index_names
             else:
                 index.name = index_names[0]
 
         data = dict(zip(column_names, data_columns))
-        frame = self.__class__._from_data(data)
-
-        if index is not None:
-            # TODO: triage why using the setter here breaks dask_cuda.ProxifyHostFile
-            frame._index = index
-        return frame._copy_type_metadata(
-            self,
-            include_index=bool(index_names),
-            override_dtypes=override_dtypes,
-        )
+        frame = type(self)._from_data(data, index)
+        return frame._copy_type_metadata(self)
 
     def __round__(self, digits=0):
         # Shouldn't be added to BinaryOperand
@@ -1913,45 +1909,6 @@ def nans_to_nulls(self):
             self._data._from_columns_like_self(result)
         )
 
-    def _copy_type_metadata(
-        self,
-        other: Self,
-        include_index: bool = True,
-        *,
-        override_dtypes: abc.Iterable[Dtype | None] | None = None,
-    ) -> Self:
-        """
-        Copy type metadata from each column of `other` to the corresponding
-        column of `self`.
-        See `ColumnBase._with_type_metadata` for more information.
-        """
-        super()._copy_type_metadata(other, override_dtypes=override_dtypes)
-        if (
-            include_index
-            and self.index is not None
-            and other.index is not None
-        ):
-            self.index._copy_type_metadata(other.index)
-            # When other.index is a CategoricalIndex, the current index
-            # will be a NumericalIndex with an underlying CategoricalColumn
-            # (the above _copy_type_metadata call will have converted the
-            # column). Calling cudf.Index on that column generates the
-            # appropriate index.
-            if isinstance(
-                other.index, cudf.core.index.CategoricalIndex
-            ) and not isinstance(self.index, cudf.core.index.CategoricalIndex):
-                self.index = cudf.Index(
-                    cast("cudf.Index", self.index)._column,
-                    name=self.index.name,
-                )
-            elif isinstance(other.index, cudf.MultiIndex) and not isinstance(
-                self.index, cudf.MultiIndex
-            ):
-                self.index = cudf.MultiIndex._from_data(
-                    self.index._data, name=self.index.name
-                )
-        return self
-
     @_cudf_nvtx_annotate
     def interpolate(
         self,
@@ -5195,36 +5152,36 @@ def _explode(self, explode_column: Any, ignore_index: bool):
         # duplicated. If ignore_index is set, the original index is not
         # exploded and will be replaced with a `RangeIndex`.
         if not isinstance(self._data[explode_column].dtype, ListDtype):
-            data = self._data.copy(deep=True)
-            idx = None if ignore_index else self.index.copy(deep=True)
-            return self.__class__._from_data(data, index=idx)
+            result = self.copy()
+            if ignore_index:
+                result.index = RangeIndex(len(result))
+            return result
 
         column_index = self._column_names.index(explode_column)
-        if not ignore_index and self.index is not None:
-            index_offset = self.index.nlevels
+        if not ignore_index:
+            idx_cols = self.index._columns
         else:
-            index_offset = 0
+            idx_cols = ()
 
         exploded = libcudf.lists.explode_outer(
-            [
-                *(self.index._data.columns if not ignore_index else ()),
-                *self._columns,
-            ],
-            column_index + index_offset,
+            [*idx_cols, *self._columns],
+            column_index + len(idx_cols),
         )
         # We must copy inner datatype of the exploded list column to
         # maintain struct dtype key names
-        exploded_dtype = cast(
+        element_type = cast(
             ListDtype, self._columns[column_index].dtype
         ).element_type
+        exploded = [
+            column._with_type_metadata(element_type)
+            if i == column_index
+            else column
+            for i, column in enumerate(exploded, start=-len(idx_cols))
+        ]
         return self._from_columns_like_self(
             exploded,
             self._column_names,
-            self._index_names if not ignore_index else None,
-            override_dtypes=(
-                exploded_dtype if i == column_index else None
-                for i in range(len(self._columns))
-            ),
+            self.index.names if not ignore_index else None,
         )
 
     @_cudf_nvtx_annotate
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index 832cc003d2e..a01242d957d 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -37,6 +37,8 @@
 if TYPE_CHECKING:
     from collections.abc import Generator
 
+    from typing_extensions import Self
+
     from cudf._typing import DataFrameOrSeries
 
 
@@ -2100,9 +2102,7 @@ def _intersection(self, other, sort=None):
         return midx
 
     @_cudf_nvtx_annotate
-    def _copy_type_metadata(
-        self: MultiIndex, other: MultiIndex, *, override_dtypes=None
-    ) -> MultiIndex:
+    def _copy_type_metadata(self: Self, other: Self) -> Self:
         res = super()._copy_type_metadata(other)
         if isinstance(other, MultiIndex):
             res._names = other._names
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 649821b9b7c..3661e13bd39 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -9466,6 +9466,24 @@ def test_explode(data, labels, ignore_index, p_index, label_to_explode):
     assert_eq(expect, got, check_dtype=False)
 
 
+def test_explode_preserve_categorical():
+    gdf = cudf.DataFrame(
+        {
+            "A": [[1, 2], None, [2, 3]],
+            "B": cudf.Series([0, 1, 2], dtype="category"),
+        }
+    )
+    result = gdf.explode("A")
+    expected = cudf.DataFrame(
+        {
+            "A": [1, 2, None, 2, 3],
+            "B": cudf.Series([0, 0, 1, 2, 2], dtype="category"),
+        }
+    )
+    expected.index = cudf.Index([0, 0, 1, 2, 2])
+    assert_eq(result, expected)
+
+
 @pytest.mark.parametrize(
     "df,ascending,expected",
     [

From 231cb716baf44b64e0284e23ae9666500de7d593 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Tue, 18 Jun 2024 11:50:46 -0700
Subject: [PATCH 383/842] Fix a size overflow bug in hash groupby (#16053)

This PR fixes a size overflow bug discovered by @matal-nvidia. It converts the groupby problem size to `int64_t` so it won't overflow if larger than `INT_MAX / 2` with 50% hash table occupancy.

Unit tests for this scenario will saturate device memory and take longer than necessary, making them likely not worth adding.

Authors:
  - Yunsong Wang (https://github.com/PointKernel)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Matthew Roeschke (https://github.com/mroeschke)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/16053
---
 cpp/src/groupby/hash/groupby.cu                  | 3 ++-
 java/src/test/java/ai/rapids/cudf/TableTest.java | 3 ++-
 python/cudf/cudf/core/groupby/groupby.py         | 2 +-
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu
index 0ec293ae3f0..5fe4a5eb30f 100644
--- a/cpp/src/groupby/hash/groupby.cu
+++ b/cpp/src/groupby/hash/groupby.cu
@@ -553,7 +553,8 @@ std::unique_ptr<table> groupby(table_view const& keys,
                                rmm::cuda_stream_view stream,
                                rmm::device_async_resource_ref mr)
 {
-  auto const num_keys            = keys.num_rows();
+  // convert to int64_t to avoid potential overflow with large `keys`
+  auto const num_keys            = static_cast<int64_t>(keys.num_rows());
   auto const null_keys_are_equal = null_equality::EQUAL;
   auto const has_null            = nullate::DYNAMIC{cudf::has_nested_nulls(keys)};
 
diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java
index dc6eb55fc6a..050bcbb268f 100644
--- a/java/src/test/java/ai/rapids/cudf/TableTest.java
+++ b/java/src/test/java/ai/rapids/cudf/TableTest.java
@@ -7838,11 +7838,12 @@ void testSumWithStrings() {
         .build();
          Table result = t.groupBy(0).aggregate(
              GroupByAggregation.sum().onColumn(1));
+         Table sorted = result.orderBy(OrderByArg.asc(0));
          Table expected = new Table.TestBuilder()
              .column("1-URGENT", "3-MEDIUM")
              .column(5289L + 5303L, 5203L + 5206L)
              .build()) {
-      assertTablesAreEqual(expected, result);
+      assertTablesAreEqual(expected, sorted);
     }
   }
 
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index d08268eea3a..77b54a583d3 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -1308,7 +1308,7 @@ def pipe(self, func, *args, **kwargs):
         To get the difference between each groups maximum and minimum value
         in one pass, you can do
 
-        >>> df.groupby('A').pipe(lambda x: x.max() - x.min())
+        >>> df.groupby('A', sort=True).pipe(lambda x: x.max() - x.min())
            B
         A
         a  2

From fc4b3d3ecbf95ee9afdcd509554bbeb5367a3059 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 18 Jun 2024 09:02:05 -1000
Subject: [PATCH 384/842] Reduce deep copies in Index ops (#16054)

1. Changed `Index.rename(inplace=False)` to shallow copy which matches pandas behavior. Let me know if there's a reason why we should deep copy here.
2. Made `RangeIndex.unique` return a shallow copy like pandas.
3. Made `Index.dropna` with no NA's shallow copy like pandas.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16054
---
 python/cudf/cudf/core/_base_index.py |  6 +++---
 python/cudf/cudf/core/index.py       |  5 +++--
 python/cudf/cudf/tests/test_index.py | 25 +++++++++++++++++++++++--
 3 files changed, 29 insertions(+), 7 deletions(-)

diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py
index ad73cd57f7d..caf07b286cd 100644
--- a/python/cudf/cudf/core/_base_index.py
+++ b/python/cudf/cudf/core/_base_index.py
@@ -1120,7 +1120,7 @@ def difference(self, other, sort=None):
         res_name = _get_result_name(self.name, other.name)
 
         if is_mixed_with_object_dtype(self, other) or len(other) == 0:
-            difference = self.copy().unique()
+            difference = self.unique()
             difference.name = res_name
             if sort is True:
                 return difference.sort_values()
@@ -1744,7 +1744,7 @@ def rename(self, name, inplace=False):
             self.name = name
             return None
         else:
-            out = self.copy(deep=True)
+            out = self.copy(deep=False)
             out.name = name
             return out
 
@@ -2068,7 +2068,7 @@ def dropna(self, how="any"):
             raise ValueError(f"{how=} must be 'any' or 'all'")
         try:
             if not self.hasnans:
-                return self.copy()
+                return self.copy(deep=False)
         except NotImplementedError:
             pass
         # This is to be consistent with IndexedFrame.dropna to handle nans
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 1c5d05d2d87..71658695b80 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -528,7 +528,7 @@ def memory_usage(self, deep: bool = False) -> int:
 
     def unique(self) -> Self:
         # RangeIndex always has unique values
-        return self
+        return self.copy()
 
     @_cudf_nvtx_annotate
     def __mul__(self, other):
@@ -3197,7 +3197,8 @@ def _get_nearest_indexer(
     )
     right_indexer = _get_indexer_basic(
         index=index,
-        positions=positions.copy(deep=True),
+        # positions no longer used so don't copy
+        positions=positions,
         method="backfill",
         target_col=target_col,
         tolerance=tolerance,
diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py
index 3d6c71ebc1b..a59836df5ba 100644
--- a/python/cudf/cudf/tests/test_index.py
+++ b/python/cudf/cudf/tests/test_index.py
@@ -252,10 +252,10 @@ def test_index_rename_inplace():
     pds = pd.Index([1, 2, 3], name="asdf")
     gds = Index(pds)
 
-    # inplace=False should yield a deep copy
+    # inplace=False should yield a shallow copy
     gds_renamed_deep = gds.rename("new_name", inplace=False)
 
-    assert gds_renamed_deep._values.data_ptr != gds._values.data_ptr
+    assert gds_renamed_deep._values.data_ptr == gds._values.data_ptr
 
     # inplace=True returns none
     expected_ptr = gds._values.data_ptr
@@ -3214,6 +3214,27 @@ def test_rangeindex_dropna():
     assert_eq(result, expected)
 
 
+def test_rangeindex_unique_shallow_copy():
+    ri_pandas = pd.RangeIndex(1)
+    result = ri_pandas.unique()
+    assert result is not ri_pandas
+
+    ri_cudf = cudf.RangeIndex(1)
+    result = ri_cudf.unique()
+    assert result is not ri_cudf
+    assert_eq(result, ri_cudf)
+
+
+def test_rename_shallow_copy():
+    idx = pd.Index([1])
+    result = idx.rename("a")
+    assert idx.to_numpy(copy=False) is result.to_numpy(copy=False)
+
+    idx = cudf.Index([1])
+    result = idx.rename("a")
+    assert idx._column is result._column
+
+
 @pytest.mark.parametrize("data", [range(2), [10, 11, 12]])
 def test_index_contains_hashable(data):
     gidx = cudf.Index(data)

From 2ddbe2a0665066fe8a5021b23c9268ce91ce67a2 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Tue, 18 Jun 2024 20:06:04 +0100
Subject: [PATCH 385/842] Test behaviour of containers (#15994)

This ensures we cover all implementation.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15994
---
 .../cudf_polars/containers/column.py          |  2 +-
 .../cudf_polars/tests/containers/__init__.py  |  6 ++
 .../tests/containers/test_column.py           | 70 ++++++++++++++
 .../tests/containers/test_dataframe.py        | 92 +++++++++++++++++++
 4 files changed, 169 insertions(+), 1 deletion(-)
 create mode 100644 python/cudf_polars/tests/containers/__init__.py
 create mode 100644 python/cudf_polars/tests/containers/test_column.py
 create mode 100644 python/cudf_polars/tests/containers/test_dataframe.py

diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py
index 156dd395d64..28685f0c4ed 100644
--- a/python/cudf_polars/cudf_polars/containers/column.py
+++ b/python/cudf_polars/cudf_polars/containers/column.py
@@ -130,7 +130,7 @@ def copy(self) -> Self:
     def mask_nans(self) -> Self:
         """Return a copy of self with nans masked out."""
         if self.nan_count > 0:
-            raise NotImplementedError
+            raise NotImplementedError("Need to port transform.hpp to pylibcudf")
         return self.copy()
 
     @functools.cached_property
diff --git a/python/cudf_polars/tests/containers/__init__.py b/python/cudf_polars/tests/containers/__init__.py
new file mode 100644
index 00000000000..4611d642f14
--- /dev/null
+++ b/python/cudf_polars/tests/containers/__init__.py
@@ -0,0 +1,6 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+__all__: list[str] = []
diff --git a/python/cudf_polars/tests/containers/test_column.py b/python/cudf_polars/tests/containers/test_column.py
new file mode 100644
index 00000000000..3291d8db161
--- /dev/null
+++ b/python/cudf_polars/tests/containers/test_column.py
@@ -0,0 +1,70 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+import pyarrow
+import pytest
+
+import cudf._lib.pylibcudf as plc
+
+from cudf_polars.containers import Column
+
+
+def test_non_scalar_access_raises():
+    column = Column(
+        plc.column_factories.make_numeric_column(
+            plc.DataType(plc.TypeId.INT8), 2, plc.MaskState.ALL_VALID
+        )
+    )
+    with pytest.raises(ValueError):
+        _ = column.obj_scalar
+
+
+@pytest.mark.parametrize("length", [0, 1])
+def test_length_leq_one_always_sorted(length):
+    column = Column(
+        plc.column_factories.make_numeric_column(
+            plc.DataType(plc.TypeId.INT8), length, plc.MaskState.ALL_VALID
+        )
+    )
+    assert column.is_sorted == plc.types.Sorted.YES
+    column.set_sorted(
+        is_sorted=plc.types.Sorted.NO,
+        order=plc.types.Order.ASCENDING,
+        null_order=plc.types.NullOrder.AFTER,
+    )
+    assert column.is_sorted == plc.types.Sorted.YES
+
+
+def test_shallow_copy():
+    column = Column(
+        plc.column_factories.make_numeric_column(
+            plc.DataType(plc.TypeId.INT8), 2, plc.MaskState.ALL_VALID
+        )
+    )
+    copy = column.copy()
+    copy = copy.set_sorted(
+        is_sorted=plc.types.Sorted.YES,
+        order=plc.types.Order.ASCENDING,
+        null_order=plc.types.NullOrder.AFTER,
+    )
+    assert column.is_sorted == plc.types.Sorted.NO
+    assert copy.is_sorted == plc.types.Sorted.YES
+
+
+@pytest.mark.parametrize("typeid", [plc.TypeId.INT8, plc.TypeId.FLOAT32])
+def test_mask_nans(typeid):
+    dtype = plc.DataType(typeid)
+    values = pyarrow.array([0, 0, 0], type=plc.interop.to_arrow(dtype))
+    column = Column(plc.interop.from_arrow(values))
+    masked = column.mask_nans()
+    assert column.obj is masked.obj
+
+
+def test_mask_nans_float_with_nan_notimplemented():
+    dtype = plc.DataType(plc.TypeId.FLOAT32)
+    values = pyarrow.array([0, 0, float("nan")], type=plc.interop.to_arrow(dtype))
+    column = Column(plc.interop.from_arrow(values))
+    with pytest.raises(NotImplementedError):
+        _ = column.mask_nans()
diff --git a/python/cudf_polars/tests/containers/test_dataframe.py b/python/cudf_polars/tests/containers/test_dataframe.py
new file mode 100644
index 00000000000..2e385e39eef
--- /dev/null
+++ b/python/cudf_polars/tests/containers/test_dataframe.py
@@ -0,0 +1,92 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+import pytest
+
+import cudf._lib.pylibcudf as plc
+
+from cudf_polars.containers import DataFrame, NamedColumn
+
+
+def test_select_missing_raises():
+    df = DataFrame(
+        [
+            NamedColumn(
+                plc.column_factories.make_numeric_column(
+                    plc.DataType(plc.TypeId.INT8), 2, plc.MaskState.ALL_VALID
+                ),
+                "a",
+            )
+        ]
+    )
+    with pytest.raises(ValueError):
+        df.select(["b", "a"])
+
+
+def test_replace_missing_raises():
+    df = DataFrame(
+        [
+            NamedColumn(
+                plc.column_factories.make_numeric_column(
+                    plc.DataType(plc.TypeId.INT8), 2, plc.MaskState.ALL_VALID
+                ),
+                "a",
+            )
+        ]
+    )
+    replacement = df.columns[0].copy(new_name="b")
+    with pytest.raises(ValueError):
+        df.replace_columns(replacement)
+
+
+def test_from_table_wrong_names():
+    table = plc.Table(
+        [
+            plc.column_factories.make_numeric_column(
+                plc.DataType(plc.TypeId.INT8), 1, plc.MaskState.ALL_VALID
+            )
+        ]
+    )
+    with pytest.raises(ValueError):
+        DataFrame.from_table(table, ["a", "b"])
+
+
+def test_sorted_like_raises_mismatching_names():
+    df = DataFrame(
+        [
+            NamedColumn(
+                plc.column_factories.make_numeric_column(
+                    plc.DataType(plc.TypeId.INT8), 2, plc.MaskState.ALL_VALID
+                ),
+                "a",
+            )
+        ]
+    )
+    like = df.copy().rename_columns({"a": "b"})
+    with pytest.raises(ValueError):
+        df.sorted_like(like)
+
+
+def test_shallow_copy():
+    column = NamedColumn(
+        plc.column_factories.make_numeric_column(
+            plc.DataType(plc.TypeId.INT8), 2, plc.MaskState.ALL_VALID
+        ),
+        "a",
+    )
+    column.set_sorted(
+        is_sorted=plc.types.Sorted.YES,
+        order=plc.types.Order.ASCENDING,
+        null_order=plc.types.NullOrder.AFTER,
+    )
+    df = DataFrame([column])
+    copy = df.copy()
+    copy.columns[0].set_sorted(
+        is_sorted=plc.types.Sorted.NO,
+        order=plc.types.Order.ASCENDING,
+        null_order=plc.types.NullOrder.AFTER,
+    )
+    assert df.columns[0].is_sorted == plc.types.Sorted.YES
+    assert copy.columns[0].is_sorted == plc.types.Sorted.NO

From 9bc794aa355c8e4c42fbc611fe9d496c20a4db90 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Tue, 18 Jun 2024 20:06:45 +0100
Subject: [PATCH 386/842] Coverage of binops where one or both operands are a
 scalar (#15998)

Just needed the tests here.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15998
---
 .../tests/expressions/test_numeric_binops.py         | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/python/cudf_polars/tests/expressions/test_numeric_binops.py b/python/cudf_polars/tests/expressions/test_numeric_binops.py
index 7eefc59d927..b6bcd0026fa 100644
--- a/python/cudf_polars/tests/expressions/test_numeric_binops.py
+++ b/python/cudf_polars/tests/expressions/test_numeric_binops.py
@@ -99,3 +99,15 @@ def test_numeric_binop(df, binop):
     q = df.select(binop(left, right))
 
     assert_gpu_result_equal(q)
+
+
+@pytest.mark.parametrize("left_scalar", [False, True])
+@pytest.mark.parametrize("right_scalar", [False, True])
+def test_binop_with_scalar(left_scalar, right_scalar):
+    df = pl.LazyFrame({"a": [1, 2, 3], "b": [5, 6, 7]})
+
+    lop = pl.lit(2) if left_scalar else pl.col("a")
+    rop = pl.lit(6) if right_scalar else pl.col("b")
+    q = df.select(lop / rop)
+
+    assert_gpu_result_equal(q)

From c83e5b3fdd7f9fe8a08c4f6874fbf847bba70c53 Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar <shruti.shivakumar@gmail.com>
Date: Tue, 18 Jun 2024 16:22:44 -0400
Subject: [PATCH 387/842] Fix JSON multi-source reading when total source size
 exceeds `INT_MAX` bytes (#15930)

Fixes #15917.

- [X] Batched read and parse operations
- [x] Fail when any single source file exceeds `INT_MAX` bytes. This case will be handled with a chunked reader later.

Authors:
  - Shruti Shivakumar (https://github.com/shrshi)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Karthikeyan (https://github.com/karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/15930
---
 cpp/include/cudf/io/types.hpp          |  13 +++
 cpp/src/io/json/read_json.cu           | 121 +++++++++++++++++++++----
 cpp/tests/CMakeLists.txt               |   1 +
 cpp/tests/large_strings/json_tests.cpp |  58 ++++++++++++
 4 files changed, 177 insertions(+), 16 deletions(-)
 create mode 100644 cpp/tests/large_strings/json_tests.cpp

diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp
index 0dab1c606de..0c96268f6c7 100644
--- a/cpp/include/cudf/io/types.hpp
+++ b/cpp/include/cudf/io/types.hpp
@@ -256,6 +256,19 @@ struct column_name_info {
   }
 
   column_name_info() = default;
+
+  /**
+   * @brief Compares two column name info structs for equality
+   *
+   * @param rhs column name info struct to compare against
+   * @return boolean indicating if this and rhs are equal
+   */
+  bool operator==(column_name_info const& rhs) const
+  {
+    return ((name == rhs.name) && (is_nullable == rhs.is_nullable) &&
+            (is_binary == rhs.is_binary) && (type_length == rhs.type_length) &&
+            (children == rhs.children));
+  };
 };
 
 /**
diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu
index e999be8f83a..74001e5e01a 100644
--- a/cpp/src/io/json/read_json.cu
+++ b/cpp/src/io/json/read_json.cu
@@ -18,7 +18,9 @@
 #include "io/json/nested_json.hpp"
 #include "read_json.hpp"
 
+#include <cudf/concatenate.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/detail/utilities/stream_pool.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/io/detail/json.hpp>
@@ -76,7 +78,7 @@ device_span<char> ingest_raw_input(device_span<char> buffer,
   auto constexpr num_delimiter_chars = 1;
 
   if (compression == compression_type::NONE) {
-    std::vector<size_type> delimiter_map{};
+    std::vector<size_t> delimiter_map{};
     std::vector<size_t> prefsum_source_sizes(sources.size());
     std::vector<std::unique_ptr<datasource::buffer>> h_buffers;
     delimiter_map.reserve(sources.size());
@@ -84,7 +86,7 @@ device_span<char> ingest_raw_input(device_span<char> buffer,
     std::transform_inclusive_scan(sources.begin(),
                                   sources.end(),
                                   prefsum_source_sizes.begin(),
-                                  std::plus<int>{},
+                                  std::plus<size_t>{},
                                   [](std::unique_ptr<datasource> const& s) { return s->size(); });
     auto upper =
       std::upper_bound(prefsum_source_sizes.begin(), prefsum_source_sizes.end(), range_offset);
@@ -259,6 +261,33 @@ datasource::owning_buffer<rmm::device_uvector<char>> get_record_range_raw_input(
     readbufspan.size() - first_delim_pos - shift_for_nonzero_offset);
 }
 
+table_with_metadata read_batch(host_span<std::unique_ptr<datasource>> sources,
+                               json_reader_options const& reader_opts,
+                               rmm::cuda_stream_view stream,
+                               rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  datasource::owning_buffer<rmm::device_uvector<char>> bufview =
+    get_record_range_raw_input(sources, reader_opts, stream);
+
+  // If input JSON buffer has single quotes and option to normalize single quotes is enabled,
+  // invoke pre-processing FST
+  if (reader_opts.is_enabled_normalize_single_quotes()) {
+    normalize_single_quotes(bufview, stream, rmm::mr::get_current_device_resource());
+  }
+
+  // If input JSON buffer has unquoted spaces and tabs and option to normalize whitespaces is
+  // enabled, invoke pre-processing FST
+  if (reader_opts.is_enabled_normalize_whitespace()) {
+    normalize_whitespace(bufview, stream, rmm::mr::get_current_device_resource());
+  }
+
+  auto buffer =
+    cudf::device_span<char const>(reinterpret_cast<char const*>(bufview.data()), bufview.size());
+  stream.synchronize();
+  return device_parse_nested_json(buffer, reader_opts, stream, mr);
+}
+
 table_with_metadata read_json(host_span<std::unique_ptr<datasource>> sources,
                               json_reader_options const& reader_opts,
                               rmm::cuda_stream_view stream,
@@ -278,25 +307,85 @@ table_with_metadata read_json(host_span<std::unique_ptr<datasource>> sources,
                  "Multiple inputs are supported only for JSON Lines format");
   }
 
-  datasource::owning_buffer<rmm::device_uvector<char>> bufview =
-    get_record_range_raw_input(sources, reader_opts, stream);
+  std::for_each(sources.begin(), sources.end(), [](auto const& source) {
+    CUDF_EXPECTS(source->size() < std::numeric_limits<int>::max(),
+                 "The size of each source file must be less than INT_MAX bytes");
+  });
 
-  // If input JSON buffer has single quotes and option to normalize single quotes is enabled,
-  // invoke pre-processing FST
-  if (reader_opts.is_enabled_normalize_single_quotes()) {
-    normalize_single_quotes(bufview, stream, rmm::mr::get_current_device_resource());
+  constexpr size_t batch_size_ub = std::numeric_limits<int>::max();
+  size_t const chunk_offset      = reader_opts.get_byte_range_offset();
+  size_t chunk_size              = reader_opts.get_byte_range_size();
+  chunk_size                     = !chunk_size ? sources_size(sources, 0, 0) : chunk_size;
+
+  // Identify the position of starting source file from which to begin batching based on
+  // byte range offset. If the offset is larger than the sum of all source
+  // sizes, then start_source is total number of source files i.e. no file is read
+  size_t const start_source = [&]() {
+    size_t sum = 0;
+    for (size_t src_idx = 0; src_idx < sources.size(); ++src_idx) {
+      if (sum + sources[src_idx]->size() > chunk_offset) return src_idx;
+      sum += sources[src_idx]->size();
+    }
+    return sources.size();
+  }();
+
+  // Construct batches of source files, with starting position of batches indicated by
+  // batch_positions. The size of each batch i.e. the sum of sizes of the source files in the batch
+  // is capped at INT_MAX bytes.
+  size_t cur_size = 0;
+  std::vector<size_t> batch_positions;
+  std::vector<size_t> batch_sizes;
+  batch_positions.push_back(0);
+  for (size_t i = start_source; i < sources.size(); i++) {
+    cur_size += sources[i]->size();
+    if (cur_size >= batch_size_ub) {
+      batch_positions.push_back(i);
+      batch_sizes.push_back(cur_size - sources[i]->size());
+      cur_size = sources[i]->size();
+    }
   }
+  batch_positions.push_back(sources.size());
+  batch_sizes.push_back(cur_size);
 
-  // If input JSON buffer has unquoted spaces and tabs and option to normalize whitespaces is
-  // enabled, invoke pre-processing FST
-  if (reader_opts.is_enabled_normalize_whitespace()) {
-    normalize_whitespace(bufview, stream, rmm::mr::get_current_device_resource());
+  // If there is a single batch, then we can directly return the table without the
+  // unnecessary concatenate
+  if (batch_sizes.size() == 1) return read_batch(sources, reader_opts, stream, mr);
+
+  std::vector<cudf::io::table_with_metadata> partial_tables;
+  json_reader_options batched_reader_opts{reader_opts};
+
+  // Dispatch individual batches to read_batch and push the resulting table into
+  // partial_tables array. Note that the reader options need to be updated for each
+  // batch to adjust byte range offset and byte range size.
+  for (size_t i = 0; i < batch_sizes.size(); i++) {
+    batched_reader_opts.set_byte_range_size(std::min(batch_sizes[i], chunk_size));
+    partial_tables.emplace_back(read_batch(
+      host_span<std::unique_ptr<datasource>>(sources.begin() + batch_positions[i],
+                                             batch_positions[i + 1] - batch_positions[i]),
+      batched_reader_opts,
+      stream,
+      rmm::mr::get_current_device_resource()));
+    if (chunk_size <= batch_sizes[i]) break;
+    chunk_size -= batch_sizes[i];
+    batched_reader_opts.set_byte_range_offset(0);
   }
 
-  auto buffer =
-    cudf::device_span<char const>(reinterpret_cast<char const*>(bufview.data()), bufview.size());
-  stream.synchronize();
-  return device_parse_nested_json(buffer, reader_opts, stream, mr);
+  auto expects_schema_equality =
+    std::all_of(partial_tables.begin() + 1,
+                partial_tables.end(),
+                [&gt = partial_tables[0].metadata.schema_info](auto& ptbl) {
+                  return ptbl.metadata.schema_info == gt;
+                });
+  CUDF_EXPECTS(expects_schema_equality,
+               "Mismatch in JSON schema across batches in multi-source multi-batch reading");
+
+  auto partial_table_views = std::vector<cudf::table_view>(partial_tables.size());
+  std::transform(partial_tables.begin(),
+                 partial_tables.end(),
+                 partial_table_views.begin(),
+                 [](auto const& table) { return table.tbl->view(); });
+  return table_with_metadata{cudf::concatenate(partial_table_views, stream, mr),
+                             {partial_tables[0].metadata.schema_info}};
 }
 
 }  // namespace cudf::io::json::detail
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 329edbe4d36..eda470d2309 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -570,6 +570,7 @@ ConfigureTest(
   LARGE_STRINGS_TEST
   large_strings/concatenate_tests.cpp
   large_strings/case_tests.cpp
+  large_strings/json_tests.cpp
   large_strings/large_strings_fixture.cpp
   large_strings/merge_tests.cpp
   large_strings/parquet_tests.cpp
diff --git a/cpp/tests/large_strings/json_tests.cpp b/cpp/tests/large_strings/json_tests.cpp
new file mode 100644
index 00000000000..bf16d131ba7
--- /dev/null
+++ b/cpp/tests/large_strings/json_tests.cpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "large_strings_fixture.hpp"
+
+#include <cudf/io/json.hpp>
+#include <cudf/utilities/span.hpp>
+
+struct JsonLargeReaderTest : public cudf::test::StringsLargeTest {};
+
+TEST_F(JsonLargeReaderTest, MultiBatch)
+{
+  std::string json_string             = R"(
+    { "a": { "y" : 6}, "b" : [1, 2, 3], "c": 11 }
+    { "a": { "y" : 6}, "b" : [4, 5   ], "c": 12 }
+    { "a": { "y" : 6}, "b" : [6      ], "c": 13 }
+    { "a": { "y" : 6}, "b" : [7      ], "c": 14 })";
+  constexpr size_t expected_file_size = std::numeric_limits<int>::max() / 2;
+  std::size_t const log_repetitions =
+    static_cast<std::size_t>(std::ceil(std::log2(expected_file_size / json_string.size())));
+
+  json_string.reserve(json_string.size() * (1UL << log_repetitions));
+  std::size_t numrows = 4;
+  for (std::size_t i = 0; i < log_repetitions; i++) {
+    json_string += json_string;
+    numrows <<= 1;
+  }
+
+  constexpr int num_sources = 2;
+  std::vector<cudf::host_span<char>> hostbufs(
+    num_sources, cudf::host_span<char>(json_string.data(), json_string.size()));
+
+  // Initialize parsing options (reading json lines)
+  cudf::io::json_reader_options json_lines_options =
+    cudf::io::json_reader_options::builder(
+      cudf::io::source_info{
+        cudf::host_span<cudf::host_span<char>>(hostbufs.data(), hostbufs.size())})
+      .lines(true)
+      .compression(cudf::io::compression_type::NONE)
+      .recovery_mode(cudf::io::json_recovery_mode_t::FAIL);
+
+  // Read full test data via existing, nested JSON lines reader
+  cudf::io::table_with_metadata current_reader_table = cudf::io::read_json(json_lines_options);
+  ASSERT_EQ(current_reader_table.tbl->num_rows(), numrows * num_sources);
+}

From f536e3017205be8b09f3dc2cfd448dc9c5a94d5d Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Wed, 19 Jun 2024 16:50:48 +0100
Subject: [PATCH 388/842] Add basic tests of dataframe scan (#16003)

Also assert that unsupported file scan operations raise.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - https://github.com/brandon-b-miller

URL: https://github.com/rapidsai/cudf/pull/16003
---
 python/cudf_polars/cudf_polars/dsl/ir.py      |  4 +-
 .../cudf_polars/testing/asserts.py            | 34 ++++++++++++++-
 python/cudf_polars/docs/overview.md           | 18 ++++++++
 .../cudf_polars/tests/test_dataframescan.py   | 43 +++++++++++++++++++
 python/cudf_polars/tests/test_scan.py         | 13 +++++-
 python/cudf_polars/tests/testing/__init__.py  |  6 +++
 .../cudf_polars/tests/testing/test_asserts.py | 35 +++++++++++++++
 7 files changed, 150 insertions(+), 3 deletions(-)
 create mode 100644 python/cudf_polars/tests/test_dataframescan.py
 create mode 100644 python/cudf_polars/tests/testing/__init__.py
 create mode 100644 python/cudf_polars/tests/testing/test_asserts.py

diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py
index 83957e4286d..3ccefac6b0a 100644
--- a/python/cudf_polars/cudf_polars/dsl/ir.py
+++ b/python/cudf_polars/cudf_polars/dsl/ir.py
@@ -196,7 +196,9 @@ def __post_init__(self) -> None:
         if self.file_options.n_rows is not None:
             raise NotImplementedError("row limit in scan")
         if self.typ not in ("csv", "parquet"):
-            raise NotImplementedError(f"Unhandled scan type: {self.typ}")
+            raise NotImplementedError(
+                f"Unhandled scan type: {self.typ}"
+            )  # pragma: no cover; polars raises on the rust side for now
 
     def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
diff --git a/python/cudf_polars/cudf_polars/testing/asserts.py b/python/cudf_polars/cudf_polars/testing/asserts.py
index 3edaa427432..a9a4ae5f0a6 100644
--- a/python/cudf_polars/cudf_polars/testing/asserts.py
+++ b/python/cudf_polars/cudf_polars/testing/asserts.py
@@ -11,6 +11,7 @@
 from polars.testing.asserts import assert_frame_equal
 
 from cudf_polars.callback import execute_with_cudf
+from cudf_polars.dsl.translate import translate_ir
 
 if TYPE_CHECKING:
     from collections.abc import Mapping
@@ -19,7 +20,7 @@
 
     from cudf_polars.typing import OptimizationArgs
 
-__all__: list[str] = ["assert_gpu_result_equal"]
+__all__: list[str] = ["assert_gpu_result_equal", "assert_ir_translation_raises"]
 
 
 def assert_gpu_result_equal(
@@ -84,3 +85,34 @@ def assert_gpu_result_equal(
         atol=atol,
         categorical_as_str=categorical_as_str,
     )
+
+
+def assert_ir_translation_raises(q: pl.LazyFrame, *exceptions: type[Exception]) -> None:
+    """
+    Assert that translation of a query raises an exception.
+
+    Parameters
+    ----------
+    q
+        Query to translate.
+    exceptions
+        Exceptions that one expects might be raised.
+
+    Returns
+    -------
+    None
+        If translation successfully raised the specified exceptions.
+
+    Raises
+    ------
+    AssertionError
+       If the specified exceptions were not raised.
+    """
+    try:
+        _ = translate_ir(q._ldf.visit())
+    except exceptions:
+        return
+    except Exception as e:
+        raise AssertionError(f"Translation DID NOT RAISE {exceptions}") from e
+    else:
+        raise AssertionError(f"Translation DID NOT RAISE {exceptions}")
diff --git a/python/cudf_polars/docs/overview.md b/python/cudf_polars/docs/overview.md
index b50d01c26db..874bb849747 100644
--- a/python/cudf_polars/docs/overview.md
+++ b/python/cudf_polars/docs/overview.md
@@ -224,6 +224,24 @@ def test_whatever():
     assert_gpu_result_equal(query)
 ```
 
+## Test coverage and asserting failure modes
+
+Where translation of a query should fail due to the feature being
+unsupported we should test this. To assert that _translation_ raises
+an exception (usually `NotImplementedError`), use the utility function
+`assert_ir_translation_raises`:
+
+```python
+from cudf_polars.testing.asserts import assert_ir_translation_raises
+
+
+def test_whatever():
+    unsupported_query = ...
+    assert_ir_translation_raises(unsupported_query, NotImplementedError)
+```
+
+This test will fail if translation does not raise.
+
 # Debugging
 
 If the callback execution fails during the polars `collect` call, we
diff --git a/python/cudf_polars/tests/test_dataframescan.py b/python/cudf_polars/tests/test_dataframescan.py
new file mode 100644
index 00000000000..1ffe06ac562
--- /dev/null
+++ b/python/cudf_polars/tests/test_dataframescan.py
@@ -0,0 +1,43 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+import pytest
+
+import polars as pl
+
+from cudf_polars.testing.asserts import assert_gpu_result_equal
+
+
+@pytest.mark.parametrize(
+    "subset",
+    [
+        None,
+        ["a", "c"],
+        ["b", "c", "d"],
+        ["b", "d"],
+        ["b", "c"],
+        ["c", "e"],
+        ["d", "e"],
+        pl.selectors.string(),
+        pl.selectors.integer(),
+    ],
+)
+@pytest.mark.parametrize("predicate_pushdown", [False, True])
+def test_scan_drop_nulls(subset, predicate_pushdown):
+    df = pl.LazyFrame(
+        {
+            "a": [1, 2, 3, 4],
+            "b": [None, 4, 5, None],
+            "c": [6, 7, None, None],
+            "d": [8, None, 9, 10],
+            "e": [None, None, "A", None],
+        }
+    )
+    # Drop nulls are pushed into filters
+    q = df.drop_nulls(subset)
+
+    assert_gpu_result_equal(
+        q, collect_kwargs={"predicate_pushdown": predicate_pushdown}
+    )
diff --git a/python/cudf_polars/tests/test_scan.py b/python/cudf_polars/tests/test_scan.py
index b2443e357e2..f129cc7ca32 100644
--- a/python/cudf_polars/tests/test_scan.py
+++ b/python/cudf_polars/tests/test_scan.py
@@ -6,7 +6,10 @@
 
 import polars as pl
 
-from cudf_polars.testing.asserts import assert_gpu_result_equal
+from cudf_polars.testing.asserts import (
+    assert_gpu_result_equal,
+    assert_ir_translation_raises,
+)
 
 
 @pytest.fixture(
@@ -86,3 +89,11 @@ def test_scan(df, columns, mask):
     if columns is not None:
         q = df.select(*columns)
     assert_gpu_result_equal(q)
+
+
+def test_scan_unsupported_raises(tmp_path):
+    df = pl.DataFrame({"a": [1, 2, 3]})
+
+    df.write_ndjson(tmp_path / "df.json")
+    q = pl.scan_ndjson(tmp_path / "df.json")
+    assert_ir_translation_raises(q, NotImplementedError)
diff --git a/python/cudf_polars/tests/testing/__init__.py b/python/cudf_polars/tests/testing/__init__.py
new file mode 100644
index 00000000000..4611d642f14
--- /dev/null
+++ b/python/cudf_polars/tests/testing/__init__.py
@@ -0,0 +1,6 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+__all__: list[str] = []
diff --git a/python/cudf_polars/tests/testing/test_asserts.py b/python/cudf_polars/tests/testing/test_asserts.py
new file mode 100644
index 00000000000..5bc2fe1efb7
--- /dev/null
+++ b/python/cudf_polars/tests/testing/test_asserts.py
@@ -0,0 +1,35 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+import pytest
+
+import polars as pl
+
+from cudf_polars.testing.asserts import (
+    assert_gpu_result_equal,
+    assert_ir_translation_raises,
+)
+
+
+def test_translation_assert_raises():
+    df = pl.LazyFrame({"a": [1, 2, 3]})
+
+    # This should succeed
+    assert_gpu_result_equal(df)
+
+    with pytest.raises(AssertionError):
+        # This should fail, because we can translate this query.
+        assert_ir_translation_raises(df, NotImplementedError)
+
+    class E(Exception):
+        pass
+
+    unsupported = df.group_by("a").agg(pl.col("a").cum_max().alias("b"))
+    # Unsupported query should raise NotImplementedError
+    assert_ir_translation_raises(unsupported, NotImplementedError)
+
+    with pytest.raises(AssertionError):
+        # This should fail, because we can't translate this query, but it doesn't raise E.
+        assert_ir_translation_raises(unsupported, E)

From ac3c8dddda2fac2cb02c8a8ee58d827c00ddf867 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Mon, 24 Jun 2024 08:09:36 -0400
Subject: [PATCH 389/842] Fix memory size in
 create_byte_range_infos_consecutive (#16012)

Fixes over allocated memory for range vector in `cudf::io::text::create_byte_range_infos_consecutive`

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - Karthikeyan (https://github.com/karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/16012
---
 cpp/src/io/text/byte_range_info.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/io/text/byte_range_info.cpp b/cpp/src/io/text/byte_range_info.cpp
index 290e0451839..6a7836ed4e1 100644
--- a/cpp/src/io/text/byte_range_info.cpp
+++ b/cpp/src/io/text/byte_range_info.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -31,7 +31,7 @@ std::vector<byte_range_info> create_byte_range_infos_consecutive(int64_t total_b
   auto range_size = util::div_rounding_up_safe(total_bytes, range_count);
   auto ranges     = std::vector<byte_range_info>();
 
-  ranges.reserve(range_size);
+  ranges.reserve(range_count);
 
   for (int64_t i = 0; i < range_count; i++) {
     auto offset = i * range_size;

From ed41668eee28350183ceda29daf56c3ac7fa78ed Mon Sep 17 00:00:00 2001
From: Ed Seidl <etseidl@users.noreply.github.com>
Date: Mon, 24 Jun 2024 07:57:22 -0700
Subject: [PATCH 390/842] Add test of interoperability of cuDF and arrow
 BYTE_STREAM_SPLIT encoders (#15832)

BYTE_STREAM_SPLIT encoding was recently added to cuDF (#15311). The Parquet specification was recently changed (https://github.com/apache/parquet-format/pull/229) to extend the datatypes that can be encoded as BYTE_STREAM_SPLIT, and this was only recently implemented in arrow (https://github.com/apache/arrow/pull/40094). This PR adds a check that cuDF and arrow can produce compatible files using BYTE_STREAM_SPLIT encoding.

Authors:
  - Ed Seidl (https://github.com/etseidl)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/15832
---
 python/cudf/cudf/tests/test_parquet.py | 55 ++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)

diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index 2596fe8cd37..af79f361b43 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -2947,6 +2947,61 @@ def test_per_column_options_string_col(tmpdir, encoding):
     assert encoding in fmd.row_group(0).column(0).encodings
 
 
+@pytest.mark.parametrize(
+    "num_rows",
+    [200, 10000],
+)
+def test_parquet_bss_round_trip(tmpdir, num_rows):
+    def flba(i):
+        hasher = hashlib.sha256()
+        hasher.update(i.to_bytes(4, "little"))
+        return hasher.digest()
+
+    # use pyarrow to write table of types that support BYTE_STREAM_SPLIT encoding
+    rows_per_rowgroup = 5000
+    fixed_data = pa.array(
+        [flba(i) for i in range(num_rows)], type=pa.binary(32)
+    )
+    i32_data = pa.array(list(range(num_rows)), type=pa.int32())
+    i64_data = pa.array(list(range(num_rows)), type=pa.int64())
+    f32_data = pa.array([float(i) for i in range(num_rows)], type=pa.float32())
+    f64_data = pa.array([float(i) for i in range(num_rows)], type=pa.float64())
+    padf = pa.Table.from_arrays(
+        [fixed_data, i32_data, i64_data, f32_data, f64_data],
+        names=["flba", "i32", "i64", "f32", "f64"],
+    )
+    padf_fname = tmpdir.join("padf.parquet")
+    pq.write_table(
+        padf,
+        padf_fname,
+        column_encoding="BYTE_STREAM_SPLIT",
+        use_dictionary=False,
+        row_group_size=rows_per_rowgroup,
+    )
+
+    # round trip data with cudf
+    cdf = cudf.read_parquet(padf_fname)
+    cdf_fname = tmpdir.join("cdf.parquet")
+    cdf.to_parquet(
+        cdf_fname,
+        column_type_length={"flba": 32},
+        column_encoding={
+            "flba": "BYTE_STREAM_SPLIT",
+            "i32": "BYTE_STREAM_SPLIT",
+            "i64": "BYTE_STREAM_SPLIT",
+            "f32": "BYTE_STREAM_SPLIT",
+            "f64": "BYTE_STREAM_SPLIT",
+        },
+        row_group_size_rows=rows_per_rowgroup,
+    )
+
+    # now read back in with pyarrow to test it was written properly by cudf
+    padf2 = pq.read_table(padf_fname)
+    padf3 = pq.read_table(cdf_fname)
+    assert_eq(padf2, padf3)
+    assert_eq(padf2.schema[0].type, padf3.schema[0].type)
+
+
 def test_parquet_reader_rle_boolean(datadir):
     fname = datadir / "rle_boolean_encoding.parquet"
 

From c33e0a349b2d0c2a626364845e616cfd3d04afc6 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Mon, 24 Jun 2024 17:18:19 +0100
Subject: [PATCH 391/842] Add coverage for both expression and dataframe filter
 (#16002)

Note that expression filter with literals does not work because broadcasting is not implemented. It is also the case that the result could be computed without broadcasting in the case of scalars with some data introspection, but we do not do that here.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Thomas Li (https://github.com/lithomas1)

URL: https://github.com/rapidsai/cudf/pull/16002
---
 .../tests/expressions/test_filter.py          | 30 ++++++++++++++-----
 python/cudf_polars/tests/test_filter.py       | 26 ++++++++++++++++
 2 files changed, 49 insertions(+), 7 deletions(-)
 create mode 100644 python/cudf_polars/tests/test_filter.py

diff --git a/python/cudf_polars/tests/expressions/test_filter.py b/python/cudf_polars/tests/expressions/test_filter.py
index 783403d764c..1a8e994e3aa 100644
--- a/python/cudf_polars/tests/expressions/test_filter.py
+++ b/python/cudf_polars/tests/expressions/test_filter.py
@@ -2,19 +2,35 @@
 # SPDX-License-Identifier: Apache-2.0
 from __future__ import annotations
 
+import pytest
+
 import polars as pl
 
 from cudf_polars.testing.asserts import assert_gpu_result_equal
 
 
-def test_filter():
-    ldf = pl.DataFrame(
+@pytest.mark.parametrize(
+    "expr",
+    [
+        pytest.param(
+            pl.lit(value=False),
+            marks=pytest.mark.xfail(reason="Expression filter does not handle scalars"),
+        ),
+        pl.col("c"),
+        pl.col("b") > 2,
+    ],
+)
+@pytest.mark.parametrize("predicate_pushdown", [False, True])
+def test_filter_expression(expr, predicate_pushdown):
+    ldf = pl.LazyFrame(
         {
             "a": [1, 2, 3, 4, 5, 6, 7],
-            "b": [1, 1, 1, 1, 1, 1, 1],
+            "b": [0, 3, 1, 5, 6, 1, 0],
+            "c": [None, True, False, False, True, True, False],
         }
-    ).lazy()
+    )
 
-    # group-by is just to avoid the filter being pushed into the scan.
-    query = ldf.group_by(pl.col("a")).agg(pl.col("b").sum()).filter(pl.col("b") < 1)
-    assert_gpu_result_equal(query)
+    query = ldf.select(pl.col("a").filter(expr))
+    assert_gpu_result_equal(
+        query, collect_kwargs={"predicate_pushdown": predicate_pushdown}
+    )
diff --git a/python/cudf_polars/tests/test_filter.py b/python/cudf_polars/tests/test_filter.py
new file mode 100644
index 00000000000..f39b348144b
--- /dev/null
+++ b/python/cudf_polars/tests/test_filter.py
@@ -0,0 +1,26 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import pytest
+
+import polars as pl
+
+from cudf_polars.testing.asserts import assert_gpu_result_equal
+
+
+@pytest.mark.parametrize("expr", [pl.col("c"), pl.col("b") < 1, pl.lit(value=True)])
+@pytest.mark.parametrize("predicate_pushdown", [False, True])
+def test_filter(expr, predicate_pushdown):
+    ldf = pl.DataFrame(
+        {
+            "a": [1, 2, 3, 4, 5, 6, 7],
+            "b": [1, 1, 1, 1, 1, 1, 1],
+            "c": [True, False, False, True, True, True, None],
+        }
+    ).lazy()
+
+    query = ldf.filter(expr)
+    assert_gpu_result_equal(
+        query, collect_kwargs={"predicate_pushdown": predicate_pushdown}
+    )

From f3183c11a71f90cd1096d95f6ded5ecf38b49a55 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Mon, 24 Jun 2024 17:24:24 +0100
Subject: [PATCH 392/842] Add full coverage for whole-frame Agg expressions
 (#15997)

Also add more expansive comments on the unreachable paths.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Thomas Li (https://github.com/lithomas1)

URL: https://github.com/rapidsai/cudf/pull/15997
---
 python/cudf_polars/cudf_polars/dsl/expr.py    | 58 ++++++++-----------
 .../cudf_polars/tests/expressions/test_agg.py | 14 +++++
 2 files changed, 38 insertions(+), 34 deletions(-)

diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py
index c92e0714d54..73f3c1ce289 100644
--- a/python/cudf_polars/cudf_polars/dsl/expr.py
+++ b/python/cudf_polars/cudf_polars/dsl/expr.py
@@ -952,7 +952,9 @@ def __init__(
         self.options = options
         self.children = (value,)
         if name not in Agg._SUPPORTED:
-            raise NotImplementedError(f"Unsupported aggregation {name=}")
+            raise NotImplementedError(
+                f"Unsupported aggregation {name=}"
+            )  # pragma: no cover; all valid aggs are supported
         # TODO: nan handling in groupby case
         if name == "min":
             req = plc.aggregation.min()
@@ -978,7 +980,9 @@ def __init__(
         elif name == "count":
             req = plc.aggregation.count(null_handling=plc.types.NullPolicy.EXCLUDE)
         else:
-            raise NotImplementedError
+            raise NotImplementedError(
+                f"Unreachable, {name=} is incorrectly listed in _SUPPORTED"
+            )  # pragma: no cover
         self.request = req
         op = getattr(self, f"_{name}", None)
         if op is None:
@@ -988,7 +992,9 @@ def __init__(
         elif name in {"count", "first", "last"}:
             pass
         else:
-            raise AssertionError
+            raise NotImplementedError(
+                f"Unreachable, supported agg {name=} has no implementation"
+            )  # pragma: no cover
         self.op = op
 
     _SUPPORTED: ClassVar[frozenset[str]] = frozenset(
@@ -1010,11 +1016,15 @@ def __init__(
     def collect_agg(self, *, depth: int) -> AggInfo:
         """Collect information about aggregations in groupbys."""
         if depth >= 1:
-            raise NotImplementedError("Nested aggregations in groupby")
+            raise NotImplementedError(
+                "Nested aggregations in groupby"
+            )  # pragma: no cover; check_agg trips first
         (child,) = self.children
         ((expr, _, _),) = child.collect_agg(depth=depth + 1).requests
         if self.request is None:
-            raise NotImplementedError(f"Aggregation {self.name} in groupby")
+            raise NotImplementedError(
+                f"Aggregation {self.name} in groupby"
+            )  # pragma: no cover; __init__ trips first
         return AggInfo([(expr, self.request, self)])
 
     def _reduce(
@@ -1024,10 +1034,7 @@ def _reduce(
             plc.Column.from_scalar(
                 plc.reduce.reduce(column.obj, request, self.dtype),
                 1,
-            ),
-            is_sorted=plc.types.Sorted.YES,
-            order=plc.types.Order.ASCENDING,
-            null_order=plc.types.NullOrder.BEFORE,
+            )
         )
 
     def _count(self, column: Column) -> Column:
@@ -1040,10 +1047,7 @@ def _count(self, column: Column) -> Column:
                     ),
                 ),
                 1,
-            ),
-            is_sorted=plc.types.Sorted.YES,
-            order=plc.types.Order.ASCENDING,
-            null_order=plc.types.NullOrder.BEFORE,
+            )
         )
 
     def _min(self, column: Column, *, propagate_nans: bool) -> Column:
@@ -1054,10 +1058,7 @@ def _min(self, column: Column, *, propagate_nans: bool) -> Column:
                         pa.scalar(float("nan"), type=plc.interop.to_arrow(self.dtype))
                     ),
                     1,
-                ),
-                is_sorted=plc.types.Sorted.YES,
-                order=plc.types.Order.ASCENDING,
-                null_order=plc.types.NullOrder.BEFORE,
+                )
             )
         if column.nan_count > 0:
             column = column.mask_nans()
@@ -1071,31 +1072,18 @@ def _max(self, column: Column, *, propagate_nans: bool) -> Column:
                         pa.scalar(float("nan"), type=plc.interop.to_arrow(self.dtype))
                     ),
                     1,
-                ),
-                is_sorted=plc.types.Sorted.YES,
-                order=plc.types.Order.ASCENDING,
-                null_order=plc.types.NullOrder.BEFORE,
+                )
             )
         if column.nan_count > 0:
             column = column.mask_nans()
         return self._reduce(column, request=plc.aggregation.max())
 
     def _first(self, column: Column) -> Column:
-        return Column(
-            plc.copying.slice(column.obj, [0, 1])[0],
-            is_sorted=plc.types.Sorted.YES,
-            order=plc.types.Order.ASCENDING,
-            null_order=plc.types.NullOrder.BEFORE,
-        )
+        return Column(plc.copying.slice(column.obj, [0, 1])[0])
 
     def _last(self, column: Column) -> Column:
         n = column.obj.size()
-        return Column(
-            plc.copying.slice(column.obj, [n - 1, n])[0],
-            is_sorted=plc.types.Sorted.YES,
-            order=plc.types.Order.ASCENDING,
-            null_order=plc.types.NullOrder.BEFORE,
-        )
+        return Column(plc.copying.slice(column.obj, [n - 1, n])[0])
 
     def do_evaluate(
         self,
@@ -1106,7 +1094,9 @@ def do_evaluate(
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
         if context is not ExecutionContext.FRAME:
-            raise NotImplementedError(f"Agg in context {context}")
+            raise NotImplementedError(
+                f"Agg in context {context}"
+            )  # pragma: no cover; unreachable
         (child,) = self.children
         return self.op(child.evaluate(df, context=context, mapping=mapping))
 
diff --git a/python/cudf_polars/tests/expressions/test_agg.py b/python/cudf_polars/tests/expressions/test_agg.py
index b044bbb2885..2ffa1c4af6d 100644
--- a/python/cudf_polars/tests/expressions/test_agg.py
+++ b/python/cudf_polars/tests/expressions/test_agg.py
@@ -56,3 +56,17 @@ def test_agg(df, agg):
         with pytest.raises(AssertionError):
             assert_gpu_result_equal(q)
     assert_gpu_result_equal(q, check_dtypes=check_dtypes, check_exact=False)
+
+
+@pytest.mark.parametrize(
+    "propagate_nans",
+    [pytest.param(False, marks=pytest.mark.xfail(reason="Need to mask nans")), True],
+    ids=["mask_nans", "propagate_nans"],
+)
+@pytest.mark.parametrize("op", ["min", "max"])
+def test_agg_float_with_nans(propagate_nans, op):
+    df = pl.LazyFrame({"a": [1, 2, float("nan")]})
+    op = getattr(pl.Expr, f"nan_{op}" if propagate_nans else op)
+    q = df.select(op(pl.col("a")))
+
+    assert_gpu_result_equal(q)

From 0c6b828118fa371e3fd333718bc872085373a076 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 24 Jun 2024 07:05:37 -1000
Subject: [PATCH 393/842] Restrict the allowed pandas timezone objects in cudf
 (#16013)

Since cudf's timezone support is based on the OS's tz data and hence `zoneinfo`, cudf cannot naturally support the variety of timezone objects supported by pandas (`pytz`, `dateutil`, etc). Therefore:

* In pandas compatible mode, only accept pandas objects with zoneinfo timezones.
* Otherwise, try to convert the pandas timezone to an equivalent zoneinfo object e.g. `pytz.timezone("US/Pacific")`-> `zoneinfo.ZoneInfo("US/Pacific")`

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/16013
---
 python/cudf/cudf/core/_internals/timezones.py | 33 ++++++++++++++-
 python/cudf/cudf/core/column/column.py        | 16 ++++++++
 python/cudf/cudf/core/column/datetime.py      | 33 +++++++--------
 .../tests/indexes/datetime/test_indexing.py   | 12 +++---
 .../indexes/datetime/test_time_specific.py    | 13 +++---
 .../cudf/tests/series/test_datetimelike.py    | 40 ++++++++++++++++---
 6 files changed, 108 insertions(+), 39 deletions(-)

diff --git a/python/cudf/cudf/core/_internals/timezones.py b/python/cudf/cudf/core/_internals/timezones.py
index 269fcf3e37f..29cb9d7bd12 100644
--- a/python/cudf/cudf/core/_internals/timezones.py
+++ b/python/cudf/cudf/core/_internals/timezones.py
@@ -1,21 +1,50 @@
 # Copyright (c) 2023-2024, NVIDIA CORPORATION.
 from __future__ import annotations
 
+import datetime
 import os
 import zoneinfo
 from functools import lru_cache
 from typing import TYPE_CHECKING, Literal
 
 import numpy as np
+import pandas as pd
 
+import cudf
 from cudf._lib.timezone import make_timezone_transition_table
-from cudf.core.column.column import as_column
 
 if TYPE_CHECKING:
     from cudf.core.column.datetime import DatetimeColumn
     from cudf.core.column.timedelta import TimeDeltaColumn
 
 
+def get_compatible_timezone(dtype: pd.DatetimeTZDtype) -> pd.DatetimeTZDtype:
+    """Convert dtype.tz object to zoneinfo object if possible."""
+    tz = dtype.tz
+    if isinstance(tz, zoneinfo.ZoneInfo):
+        return dtype
+    if cudf.get_option("mode.pandas_compatible"):
+        raise NotImplementedError(
+            f"{tz} must be a zoneinfo.ZoneInfo object in pandas_compatible mode."
+        )
+    elif (tzname := getattr(tz, "zone", None)) is not None:
+        # pytz-like
+        key = tzname
+    elif (tz_file := getattr(tz, "_filename", None)) is not None:
+        # dateutil-like
+        key = tz_file.split("zoneinfo/")[-1]
+    elif isinstance(tz, datetime.tzinfo):
+        # Try to get UTC-like tzinfos
+        reference = datetime.datetime.now()
+        key = tz.tzname(reference)
+        if not (isinstance(key, str) and key.lower() == "utc"):
+            raise NotImplementedError(f"cudf does not support {tz}")
+    else:
+        raise NotImplementedError(f"cudf does not support {tz}")
+    new_tz = zoneinfo.ZoneInfo(key)
+    return pd.DatetimeTZDtype(dtype.unit, new_tz)
+
+
 @lru_cache(maxsize=20)
 def get_tz_data(zone_name: str) -> tuple[DatetimeColumn, TimeDeltaColumn]:
     """
@@ -87,6 +116,8 @@ def _read_tzfile_as_columns(
     )
 
     if not transition_times_and_offsets:
+        from cudf.core.column.column import as_column
+
         # this happens for UTC-like zones
         min_date = np.int64(np.iinfo("int64").min + 1).astype("M8[s]")
         return (as_column([min_date]), as_column([np.timedelta64(0, "s")]))
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index c4e715aeb45..586689e2ee3 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -47,6 +47,7 @@
     is_string_dtype,
 )
 from cudf.core._compat import PANDAS_GE_210
+from cudf.core._internals.timezones import get_compatible_timezone
 from cudf.core.abc import Serializable
 from cudf.core.buffer import (
     Buffer,
@@ -1854,6 +1855,21 @@ def as_column(
             arbitrary.dtype,
             (pd.CategoricalDtype, pd.IntervalDtype, pd.DatetimeTZDtype),
         ):
+            if isinstance(arbitrary.dtype, pd.DatetimeTZDtype):
+                new_tz = get_compatible_timezone(arbitrary.dtype)
+                arbitrary = arbitrary.astype(new_tz)
+            if isinstance(arbitrary.dtype, pd.CategoricalDtype) and isinstance(
+                arbitrary.dtype.categories.dtype, pd.DatetimeTZDtype
+            ):
+                new_tz = get_compatible_timezone(
+                    arbitrary.dtype.categories.dtype
+                )
+                new_cats = arbitrary.dtype.categories.astype(new_tz)
+                new_dtype = pd.CategoricalDtype(
+                    categories=new_cats, ordered=arbitrary.dtype.ordered
+                )
+                arbitrary = arbitrary.astype(new_dtype)
+
             return as_column(
                 pa.array(arbitrary, from_pandas=True),
                 nan_as_null=nan_as_null,
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index 9ac761b6be1..d88553361dd 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -21,6 +21,11 @@
 from cudf._lib.search import search_sorted
 from cudf.api.types import is_datetime64_dtype, is_scalar, is_timedelta64_dtype
 from cudf.core._compat import PANDAS_GE_220
+from cudf.core._internals.timezones import (
+    check_ambiguous_and_nonexistent,
+    get_compatible_timezone,
+    get_tz_data,
+)
 from cudf.core.column import ColumnBase, as_column, column, string
 from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion
 from cudf.utils.dtypes import _get_base_dtype
@@ -282,8 +287,6 @@ def __contains__(self, item: ScalarLike) -> bool:
 
     @functools.cached_property
     def time_unit(self) -> str:
-        if isinstance(self.dtype, pd.DatetimeTZDtype):
-            return self.dtype.unit
         return np.datetime_data(self.dtype)[0]
 
     @property
@@ -725,8 +728,6 @@ def _find_ambiguous_and_nonexistent(
         transitions occur in the time zone database for the given timezone.
         If no transitions occur, the tuple `(False, False)` is returned.
         """
-        from cudf.core._internals.timezones import get_tz_data
-
         transition_times, offsets = get_tz_data(zone_name)
         offsets = offsets.astype(f"timedelta64[{self.time_unit}]")  # type: ignore[assignment]
 
@@ -785,26 +786,22 @@ def tz_localize(
         ambiguous: Literal["NaT"] = "NaT",
         nonexistent: Literal["NaT"] = "NaT",
     ):
-        from cudf.core._internals.timezones import (
-            check_ambiguous_and_nonexistent,
-            get_tz_data,
-        )
-
         if tz is None:
             return self.copy()
         ambiguous, nonexistent = check_ambiguous_and_nonexistent(
             ambiguous, nonexistent
         )
-        dtype = pd.DatetimeTZDtype(self.time_unit, tz)
+        dtype = get_compatible_timezone(pd.DatetimeTZDtype(self.time_unit, tz))
+        tzname = dtype.tz.key
         ambiguous_col, nonexistent_col = self._find_ambiguous_and_nonexistent(
-            tz
+            tzname
         )
         localized = self._scatter_by_column(
             self.isnull() | (ambiguous_col | nonexistent_col),
             cudf.Scalar(cudf.NaT, dtype=self.dtype),
         )
 
-        transition_times, offsets = get_tz_data(tz)
+        transition_times, offsets = get_tz_data(tzname)
         transition_times_local = (transition_times + offsets).astype(
             localized.dtype
         )
@@ -845,7 +842,7 @@ def __init__(
             offset=offset,
             null_count=null_count,
         )
-        self._dtype = dtype
+        self._dtype = get_compatible_timezone(dtype)
 
     def to_pandas(
         self,
@@ -865,6 +862,10 @@ def to_arrow(self):
             self._local_time.to_arrow(), str(self.dtype.tz)
         )
 
+    @functools.cached_property
+    def time_unit(self) -> str:
+        return self.dtype.unit
+
     @property
     def _utc_time(self):
         """Return UTC time as naive timestamps."""
@@ -880,8 +881,6 @@ def _utc_time(self):
     @property
     def _local_time(self):
         """Return the local time as naive timestamps."""
-        from cudf.core._internals.timezones import get_tz_data
-
         transition_times, offsets = get_tz_data(str(self.dtype.tz))
         transition_times = transition_times.astype(_get_base_dtype(self.dtype))
         indices = search_sorted([transition_times], [self], "right") - 1
@@ -911,10 +910,6 @@ def __repr__(self):
         )
 
     def tz_localize(self, tz: str | None, ambiguous="NaT", nonexistent="NaT"):
-        from cudf.core._internals.timezones import (
-            check_ambiguous_and_nonexistent,
-        )
-
         if tz is None:
             return self._local_time
         ambiguous, nonexistent = check_ambiguous_and_nonexistent(
diff --git a/python/cudf/cudf/tests/indexes/datetime/test_indexing.py b/python/cudf/cudf/tests/indexes/datetime/test_indexing.py
index f2c2d9a263b..ee4d0f7e816 100644
--- a/python/cudf/cudf/tests/indexes/datetime/test_indexing.py
+++ b/python/cudf/cudf/tests/indexes/datetime/test_indexing.py
@@ -1,4 +1,5 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+import zoneinfo
 
 import pandas as pd
 
@@ -7,13 +8,10 @@
 
 
 def test_slice_datetimetz_index():
+    tz = zoneinfo.ZoneInfo("US/Eastern")
     data = ["2001-01-01", "2001-01-02", None, None, "2001-01-03"]
-    pidx = pd.DatetimeIndex(data, dtype="datetime64[ns]").tz_localize(
-        "US/Eastern"
-    )
-    idx = cudf.DatetimeIndex(data, dtype="datetime64[ns]").tz_localize(
-        "US/Eastern"
-    )
+    pidx = pd.DatetimeIndex(data, dtype="datetime64[ns]").tz_localize(tz)
+    idx = cudf.DatetimeIndex(data, dtype="datetime64[ns]").tz_localize(tz)
     expected = pidx[1:4]
     got = idx[1:4]
     assert_eq(expected, got)
diff --git a/python/cudf/cudf/tests/indexes/datetime/test_time_specific.py b/python/cudf/cudf/tests/indexes/datetime/test_time_specific.py
index b28ef131025..77b32b8ce89 100644
--- a/python/cudf/cudf/tests/indexes/datetime/test_time_specific.py
+++ b/python/cudf/cudf/tests/indexes/datetime/test_time_specific.py
@@ -1,4 +1,6 @@
 # Copyright (c) 2022-2024, NVIDIA CORPORATION.
+import zoneinfo
+
 import pandas as pd
 
 import cudf
@@ -6,24 +8,21 @@
 
 
 def test_tz_localize():
+    tz = zoneinfo.ZoneInfo("America/New_York")
     pidx = pd.date_range("2001-01-01", "2001-01-02", freq="1s")
     pidx = pidx.astype("<M8[ns]")
     idx = cudf.from_pandas(pidx)
     assert pidx.dtype == idx.dtype
-    assert_eq(
-        pidx.tz_localize("America/New_York"),
-        idx.tz_localize("America/New_York"),
-    )
+    assert_eq(pidx.tz_localize(tz), idx.tz_localize(tz))
 
 
 def test_tz_convert():
+    tz = zoneinfo.ZoneInfo("America/New_York")
     pidx = pd.date_range("2023-01-01", periods=3, freq="h")
     idx = cudf.from_pandas(pidx)
     pidx = pidx.tz_localize("UTC")
     idx = idx.tz_localize("UTC")
-    assert_eq(
-        pidx.tz_convert("America/New_York"), idx.tz_convert("America/New_York")
-    )
+    assert_eq(pidx.tz_convert(tz), idx.tz_convert(tz))
 
 
 def test_delocalize_naive():
diff --git a/python/cudf/cudf/tests/series/test_datetimelike.py b/python/cudf/cudf/tests/series/test_datetimelike.py
index 58ffc610c3c..302ef19852d 100644
--- a/python/cudf/cudf/tests/series/test_datetimelike.py
+++ b/python/cudf/cudf/tests/series/test_datetimelike.py
@@ -1,6 +1,8 @@
 # Copyright (c) 2023-2024, NVIDIA CORPORATION.
 
+import datetime
 import os
+import zoneinfo
 
 import pandas as pd
 import pytest
@@ -70,7 +72,7 @@ def test_localize_ambiguous(request, unit, zone_name):
         dtype=f"datetime64[{unit}]",
     )
     expect = s.to_pandas().dt.tz_localize(
-        zone_name, ambiguous="NaT", nonexistent="NaT"
+        zoneinfo.ZoneInfo(zone_name), ambiguous="NaT", nonexistent="NaT"
     )
     got = s.dt.tz_localize(zone_name)
     assert_eq(expect, got)
@@ -96,7 +98,7 @@ def test_localize_nonexistent(request, unit, zone_name):
         dtype=f"datetime64[{unit}]",
     )
     expect = s.to_pandas().dt.tz_localize(
-        zone_name, ambiguous="NaT", nonexistent="NaT"
+        zoneinfo.ZoneInfo(zone_name), ambiguous="NaT", nonexistent="NaT"
     )
     got = s.dt.tz_localize(zone_name)
     assert_eq(expect, got)
@@ -130,6 +132,9 @@ def test_delocalize_naive():
     "to_tz", ["Europe/London", "America/Chicago", "UTC", None]
 )
 def test_convert(from_tz, to_tz):
+    from_tz = zoneinfo.ZoneInfo(from_tz)
+    if to_tz is not None:
+        to_tz = zoneinfo.ZoneInfo(to_tz)
     ps = pd.Series(pd.date_range("2023-01-01", periods=3, freq="h"))
     gs = cudf.from_pandas(ps)
     ps = ps.dt.tz_localize(from_tz)
@@ -169,6 +174,8 @@ def test_convert_from_naive():
     ],
 )
 def test_convert_edge_cases(data, original_timezone, target_timezone):
+    original_timezone = zoneinfo.ZoneInfo(original_timezone)
+    target_timezone = zoneinfo.ZoneInfo(target_timezone)
     ps = pd.Series(data, dtype="datetime64[s]").dt.tz_localize(
         original_timezone
     )
@@ -229,10 +236,33 @@ def test_tz_convert_naive_typeerror():
     "klass", ["Series", "DatetimeIndex", "Index", "CategoricalIndex"]
 )
 def test_from_pandas_obj_tz_aware(klass):
-    tz_aware_data = [
-        pd.Timestamp("2020-01-01", tz="UTC").tz_convert("US/Pacific")
-    ]
+    tz = zoneinfo.ZoneInfo("US/Pacific")
+    tz_aware_data = [pd.Timestamp("2020-01-01", tz="UTC").tz_convert(tz)]
     pandas_obj = getattr(pd, klass)(tz_aware_data)
     result = cudf.from_pandas(pandas_obj)
     expected = getattr(cudf, klass)(tz_aware_data)
     assert_eq(result, expected)
+
+
+@pytest.mark.parametrize(
+    "klass", ["Series", "DatetimeIndex", "Index", "CategoricalIndex"]
+)
+def test_from_pandas_obj_tz_aware_unsupported(klass):
+    tz = datetime.timezone(datetime.timedelta(hours=1))
+    tz_aware_data = [pd.Timestamp("2020-01-01", tz="UTC").tz_convert(tz)]
+    pandas_obj = getattr(pd, klass)(tz_aware_data)
+    with pytest.raises(NotImplementedError):
+        cudf.from_pandas(pandas_obj)
+
+
+@pytest.mark.parametrize(
+    "klass", ["Series", "DatetimeIndex", "Index", "CategoricalIndex"]
+)
+def test_pandas_compatible_non_zoneinfo_raises(klass):
+    pytz = pytest.importorskip("pytz")
+    tz = pytz.timezone("US/Pacific")
+    tz_aware_data = [pd.Timestamp("2020-01-01", tz="UTC").tz_convert(tz)]
+    pandas_obj = getattr(pd, klass)(tz_aware_data)
+    with cudf.option_context("mode.pandas_compatible", True):
+        with pytest.raises(NotImplementedError):
+            cudf.from_pandas(pandas_obj)

From 525ca7e02c2ea57f70faa8414d05ef6398559308 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Mon, 24 Jun 2024 18:25:10 +0100
Subject: [PATCH 394/842] Add tests of expression-based sort and sort-by
 (#16008)

We only need stable vs unstable variants for the sort-by case, since when sorting a single column by itself there is no distinction between stable and unstable.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16008
---
 .../tests/expressions/test_sort.py            | 53 +++++++++++++++++++
 1 file changed, 53 insertions(+)
 create mode 100644 python/cudf_polars/tests/expressions/test_sort.py

diff --git a/python/cudf_polars/tests/expressions/test_sort.py b/python/cudf_polars/tests/expressions/test_sort.py
new file mode 100644
index 00000000000..0195266f5c6
--- /dev/null
+++ b/python/cudf_polars/tests/expressions/test_sort.py
@@ -0,0 +1,53 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import itertools
+
+import pytest
+
+import polars as pl
+
+from cudf_polars.testing.asserts import assert_gpu_result_equal
+
+
+@pytest.mark.parametrize("descending", [False, True])
+@pytest.mark.parametrize("nulls_last", [False, True])
+def test_sort_expression(descending, nulls_last):
+    ldf = pl.LazyFrame(
+        {
+            "a": [5, -1, 3, 4, None, 8, 6, 7, None],
+        }
+    )
+
+    query = ldf.select(pl.col("a").sort(descending=descending, nulls_last=nulls_last))
+    assert_gpu_result_equal(query)
+
+
+@pytest.mark.parametrize(
+    "descending", itertools.combinations_with_replacement([False, True], 3)
+)
+@pytest.mark.parametrize(
+    "nulls_last", itertools.combinations_with_replacement([False, True], 3)
+)
+@pytest.mark.parametrize("maintain_order", [False, True], ids=["unstable", "stable"])
+def test_sort_by_expression(descending, nulls_last, maintain_order):
+    ldf = pl.LazyFrame(
+        {
+            "a": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
+            "b": [1, 2, 2, 3, 9, 5, -1, 2, -2, 16],
+            "c": ["a", "A", "b", "b", "c", "d", "A", "Z", "ä", "̈Ä"],
+        }
+    )
+
+    query = ldf.select(
+        pl.col("a").sort_by(
+            pl.col("b"),
+            pl.col("c"),
+            pl.col("b") + pl.col("a"),
+            descending=descending,
+            nulls_last=nulls_last,
+            maintain_order=maintain_order,
+        )
+    )
+    assert_gpu_result_equal(query, check_row_order=maintain_order)

From 4d4cdce2128398444a15f705d05ca062a6f0300f Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Mon, 24 Jun 2024 18:51:51 +0100
Subject: [PATCH 395/842] Add full coverage of utility functions (#15995)

The datetime conversion tests just test that we can round-trip correctly for now.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Thomas Li (https://github.com/lithomas1)
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)

URL: https://github.com/rapidsai/cudf/pull/15995
---
 .../cudf_polars/cudf_polars/utils/dtypes.py   |  4 +--
 .../cudf_polars/cudf_polars/utils/sorting.py  |  4 +--
 python/cudf_polars/pyproject.toml             |  7 ++++
 .../tests/expressions/test_datetime_basic.py  | 34 +++++++++++++++++++
 python/cudf_polars/tests/utils/test_dtypes.py | 31 +++++++++++++++++
 .../cudf_polars/tests/utils/test_sorting.py   | 21 ++++++++++++
 6 files changed, 97 insertions(+), 4 deletions(-)
 create mode 100644 python/cudf_polars/tests/expressions/test_datetime_basic.py
 create mode 100644 python/cudf_polars/tests/utils/test_dtypes.py
 create mode 100644 python/cudf_polars/tests/utils/test_sorting.py

diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py
index 7b0049daf11..3d4a643e1fc 100644
--- a/python/cudf_polars/cudf_polars/utils/dtypes.py
+++ b/python/cudf_polars/cudf_polars/utils/dtypes.py
@@ -70,7 +70,7 @@ def from_polars(dtype: pl.DataType) -> plc.DataType:
             return plc.DataType(plc.TypeId.TIMESTAMP_MICROSECONDS)
         elif dtype.time_unit == "ns":
             return plc.DataType(plc.TypeId.TIMESTAMP_NANOSECONDS)
-        assert dtype.time_unit is not None
+        assert dtype.time_unit is not None  # pragma: no cover
         assert_never(dtype.time_unit)
     elif isinstance(dtype, pl.Duration):
         if dtype.time_unit == "ms":
@@ -79,7 +79,7 @@ def from_polars(dtype: pl.DataType) -> plc.DataType:
             return plc.DataType(plc.TypeId.DURATION_MICROSECONDS)
         elif dtype.time_unit == "ns":
             return plc.DataType(plc.TypeId.DURATION_NANOSECONDS)
-        assert dtype.time_unit is not None
+        assert dtype.time_unit is not None  # pragma: no cover
         assert_never(dtype.time_unit)
     elif isinstance(dtype, pl.String):
         return plc.DataType(plc.TypeId.STRING)
diff --git a/python/cudf_polars/cudf_polars/utils/sorting.py b/python/cudf_polars/cudf_polars/utils/sorting.py
index 24fd449dd88..57f94c4ec4c 100644
--- a/python/cudf_polars/cudf_polars/utils/sorting.py
+++ b/python/cudf_polars/cudf_polars/utils/sorting.py
@@ -43,8 +43,8 @@ def sort_order(
         for d in descending
     ]
     null_precedence = []
-    # TODO: use strict=True when we drop py39
-    assert len(descending) == len(nulls_last)
+    if len(descending) != len(nulls_last) or len(descending) != num_keys:
+        raise ValueError("Mismatching length of arguments in sort_order")
     for asc, null_last in zip(column_order, nulls_last):
         if (asc == plc.types.Order.ASCENDING) ^ (not null_last):
             null_precedence.append(plc.types.NullOrder.AFTER)
diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml
index face04b9bd8..effa4861e0c 100644
--- a/python/cudf_polars/pyproject.toml
+++ b/python/cudf_polars/pyproject.toml
@@ -52,6 +52,13 @@ version = {file = "cudf_polars/VERSION"}
 [tool.pytest.ini_options]
 xfail_strict = true
 
+[tool.coverage.report]
+exclude_also = [
+  "if TYPE_CHECKING:",
+  "class .*\\bProtocol\\):",
+  "assert_never\\("
+]
+
 [tool.ruff]
 line-length = 88
 indent-width = 4
diff --git a/python/cudf_polars/tests/expressions/test_datetime_basic.py b/python/cudf_polars/tests/expressions/test_datetime_basic.py
new file mode 100644
index 00000000000..6ba2a1dce1e
--- /dev/null
+++ b/python/cudf_polars/tests/expressions/test_datetime_basic.py
@@ -0,0 +1,34 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import pytest
+
+import polars as pl
+
+from cudf_polars.testing.asserts import assert_gpu_result_equal
+
+
+@pytest.mark.parametrize(
+    "dtype",
+    [
+        pl.Date(),
+        pl.Datetime("ms"),
+        pl.Datetime("us"),
+        pl.Datetime("ns"),
+        pl.Duration("ms"),
+        pl.Duration("us"),
+        pl.Duration("ns"),
+    ],
+    ids=repr,
+)
+def test_datetime_dataframe_scan(dtype):
+    ldf = pl.DataFrame(
+        {
+            "a": pl.Series([1, 2, 3, 4, 5, 6, 7], dtype=dtype),
+            "b": pl.Series([3, 4, 5, 6, 7, 8, 9], dtype=pl.UInt16),
+        }
+    ).lazy()
+
+    query = ldf.select(pl.col("b"), pl.col("a"))
+    assert_gpu_result_equal(query)
diff --git a/python/cudf_polars/tests/utils/test_dtypes.py b/python/cudf_polars/tests/utils/test_dtypes.py
new file mode 100644
index 00000000000..535fdd846a0
--- /dev/null
+++ b/python/cudf_polars/tests/utils/test_dtypes.py
@@ -0,0 +1,31 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+import pytest
+
+import polars as pl
+
+from cudf_polars.utils.dtypes import from_polars
+
+
+@pytest.mark.parametrize(
+    "pltype",
+    [
+        pl.Time(),
+        pl.Struct({"a": pl.Int8, "b": pl.Float32}),
+        pl.Datetime("ms", time_zone="US/Pacific"),
+        pl.Array(pl.Int8, 2),
+        pl.Binary(),
+        pl.Categorical(),
+        pl.Enum(["a", "b"]),
+        pl.Field("a", pl.Int8),
+        pl.Object(),
+        pl.Unknown(),
+    ],
+    ids=repr,
+)
+def test_unhandled_dtype_conversion_raises(pltype):
+    with pytest.raises(NotImplementedError):
+        _ = from_polars(pltype)
diff --git a/python/cudf_polars/tests/utils/test_sorting.py b/python/cudf_polars/tests/utils/test_sorting.py
new file mode 100644
index 00000000000..4e98a3a7ce7
--- /dev/null
+++ b/python/cudf_polars/tests/utils/test_sorting.py
@@ -0,0 +1,21 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+import pytest
+
+from cudf_polars.utils.sorting import sort_order
+
+
+@pytest.mark.parametrize(
+    "descending,nulls_last,num_keys",
+    [
+        ([True], [False, True], 3),
+        ([True, True], [False, True, False], 3),
+        ([False, True], [True], 3),
+    ],
+)
+def test_sort_order_raises_mismatch(descending, nulls_last, num_keys):
+    with pytest.raises(ValueError):
+        _ = sort_order(descending, nulls_last=nulls_last, num_keys=num_keys)

From 9987410c4baa275c9ae46801112bc4b6d8d6b057 Mon Sep 17 00:00:00 2001
From: Ed Seidl <etseidl@users.noreply.github.com>
Date: Mon, 24 Jun 2024 11:16:56 -0700
Subject: [PATCH 396/842] Account for FIXED_LEN_BYTE_ARRAY when calculating
 fragment sizes in Parquet writer (#16064)

The number of rows per fragment will be off by a factor of 4 for FIXED_LEN_BYTE_ARRAY columns. This results in many more fragments than are necessary to achieve user requested page size limits. This PR shifts where the determination of whether a column has fixed-width data to a location where knowledge of the schema can be used.

Authors:
  - Ed Seidl (https://github.com/etseidl)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Muhammad Haseeb (https://github.com/mhaseeb123)

URL: https://github.com/rapidsai/cudf/pull/16064
---
 cpp/src/io/parquet/writer_impl.cu | 28 ++++++++++++----------------
 1 file changed, 12 insertions(+), 16 deletions(-)

diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index ca15b532d07..bed4dbc5a66 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -296,19 +296,6 @@ size_t column_size(column_view const& column, rmm::cuda_stream_view stream)
   CUDF_FAIL("Unexpected compound type");
 }
 
-// checks to see if the given column has a fixed size.  This doesn't
-// check every row, so assumes string and list columns are not fixed, even
-// if each row is the same width.
-// TODO: update this if FIXED_LEN_BYTE_ARRAY is ever supported for writes.
-bool is_col_fixed_width(column_view const& column)
-{
-  if (column.type().id() == type_id::STRUCT) {
-    return std::all_of(column.child_begin(), column.child_end(), is_col_fixed_width);
-  }
-
-  return is_fixed_width(column.type());
-}
-
 /**
  * @brief Extends SchemaElement to add members required in constructing parquet_column_view
  *
@@ -946,6 +933,15 @@ struct parquet_column_view {
     return schema_node.converted_type.value_or(UNKNOWN);
   }
 
+  // Checks to see if the given column has a fixed-width data type. This doesn't
+  // check every value, so it assumes string and list columns are not fixed-width, even
+  // if each value has the same size.
+  [[nodiscard]] bool is_fixed_width() const
+  {
+    // lists and strings are not fixed width
+    return max_rep_level() == 0 and physical_type() != Type::BYTE_ARRAY;
+  }
+
   std::vector<std::string> const& get_path_in_schema() { return path_in_schema; }
 
   // LIST related member functions
@@ -1764,7 +1760,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
     // unbalanced in final page sizes, so using 4 which seems to be a good
     // compromise at smoothing things out without getting fragment sizes too small.
     auto frag_size_fn = [&](auto const& col, size_t col_size) {
-      int const target_frags_per_page = is_col_fixed_width(col) ? 1 : 4;
+      int const target_frags_per_page = col.is_fixed_width() ? 1 : 4;
       auto const avg_len =
         target_frags_per_page * util::div_rounding_up_safe<size_t>(col_size, input.num_rows());
       if (avg_len > 0) {
@@ -1775,8 +1771,8 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
       }
     };
 
-    std::transform(single_streams_table.begin(),
-                   single_streams_table.end(),
+    std::transform(parquet_columns.begin(),
+                   parquet_columns.end(),
                    column_sizes.begin(),
                    column_frag_size.begin(),
                    frag_size_fn);

From f583879e2fb90c104dee259b676e836ed6e60ca0 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com>
Date: Mon, 24 Jun 2024 13:40:08 -0500
Subject: [PATCH 397/842] More safely parse CUDA versions when subprocess
 output is contaminated (#16067)

In some user environments, calling a subprocess may produce output that confuses the version parsing machinery inside `_ptxcompiler`. Since the affected functions are vendored from the real `ptxcompiler` package for the purposes of using them with CUDA 12, this fix will only these situations for CUDA 12+.

Closes https://github.com/rapidsai/cudf/issues/16016.

Authors:
  - https://github.com/brandon-b-miller

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16067
---
 python/cudf/cudf/utils/_ptxcompiler.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/python/cudf/cudf/utils/_ptxcompiler.py b/python/cudf/cudf/utils/_ptxcompiler.py
index 54f5ea08ee1..9d7071d55a5 100644
--- a/python/cudf/cudf/utils/_ptxcompiler.py
+++ b/python/cudf/cudf/utils/_ptxcompiler.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,11 +14,14 @@
 
 import math
 import os
+import re
 import subprocess
 import sys
 import warnings
 
 NO_DRIVER = (math.inf, math.inf)
+START_TAG = "_VER_START"
+END_TAG = "_VER_END"
 
 NUMBA_CHECK_VERSION_CMD = """\
 from ctypes import c_int, byref
@@ -28,7 +31,7 @@
 drv_major = dv.value // 1000
 drv_minor = (dv.value - (drv_major * 1000)) // 10
 run_major, run_minor = cuda.runtime.get_version()
-print(f'{drv_major} {drv_minor} {run_major} {run_minor}')
+print(f'_VER_START{drv_major} {drv_minor} {run_major} {run_minor}_VER_END')
 """
 
 
@@ -61,7 +64,11 @@ def get_versions():
         warnings.warn(msg, UserWarning)
         return NO_DRIVER
 
-    versions = [int(s) for s in cp.stdout.strip().split()]
+    pattern = r"_VER_START(.*?)_VER_END"
+
+    ver_str = re.search(pattern, cp.stdout.decode()).group(1)
+
+    versions = [int(s) for s in ver_str.strip().split()]
     driver_version = tuple(versions[:2])
     runtime_version = tuple(versions[2:])
 

From bd76bf6b293b7f17a846df8392c18d92ced2b40f Mon Sep 17 00:00:00 2001
From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com>
Date: Mon, 24 Jun 2024 13:43:33 -0500
Subject: [PATCH 398/842] cuDF/libcudf exponentially weighted moving averages
 (#9027)

Adds an exponentially weighted moving average aggregation to `cudf::scan` and plumbs it up through `cudf.Series.ewm`, similar to `pandas.Series.ewm`.

partially resolves https://github.com/rapidsai/cudf/issues/1263

Authors:
  - https://github.com/brandon-b-miller
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/9027
---
 cpp/CMakeLists.txt                            |   1 +
 cpp/include/cudf/aggregation.hpp              |  41 ++-
 .../cudf/detail/aggregation/aggregation.hpp   |  44 +++
 cpp/src/aggregation/aggregation.cpp           |  22 ++
 cpp/src/reductions/scan/ewm.cu                | 330 ++++++++++++++++++
 cpp/src/reductions/scan/scan.cuh              |   7 +
 cpp/src/reductions/scan/scan_inclusive.cu     |   3 +-
 cpp/tests/CMakeLists.txt                      |   1 +
 cpp/tests/reductions/ewm_tests.cpp            | 101 ++++++
 .../source/user_guide/api_docs/dataframe.rst  |   1 +
 .../source/user_guide/api_docs/series.rst     |   1 +
 python/cudf/cudf/_lib/aggregation.pyx         |   8 +
 .../cudf/cudf/_lib/pylibcudf/aggregation.pxd  |   3 +
 .../cudf/cudf/_lib/pylibcudf/aggregation.pyx  |  26 ++
 .../_lib/pylibcudf/libcudf/aggregation.pxd    |   8 +
 python/cudf/cudf/core/indexed_frame.py        |  28 +-
 python/cudf/cudf/core/window/__init__.py      |   4 +-
 python/cudf/cudf/core/window/ewm.py           | 200 +++++++++++
 python/cudf/cudf/core/window/rolling.py       |  22 +-
 python/cudf/cudf/pandas/_wrappers/pandas.py   |   2 +-
 python/cudf/cudf/tests/test_ewm.py            |  46 +++
 21 files changed, 892 insertions(+), 7 deletions(-)
 create mode 100644 cpp/src/reductions/scan/ewm.cu
 create mode 100644 cpp/tests/reductions/ewm_tests.cpp
 create mode 100644 python/cudf/cudf/core/window/ewm.py
 create mode 100644 python/cudf/cudf/tests/test_ewm.py

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index aab0a9b2d49..5fd68bfb26c 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -502,6 +502,7 @@ add_library(
   src/reductions/product.cu
   src/reductions/reductions.cpp
   src/reductions/scan/rank_scan.cu
+  src/reductions/scan/ewm.cu
   src/reductions/scan/scan.cpp
   src/reductions/scan/scan_exclusive.cu
   src/reductions/scan/scan_inclusive.cu
diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp
index d458c831f19..3c1023017be 100644
--- a/cpp/include/cudf/aggregation.hpp
+++ b/cpp/include/cudf/aggregation.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -103,6 +103,7 @@ class aggregation {
     NUNIQUE,         ///< count number of unique elements
     NTH_ELEMENT,     ///< get the nth element
     ROW_NUMBER,      ///< get row-number of current index (relative to rolling window)
+    EWMA,            ///< get exponential weighted moving average at current index
     RANK,            ///< get rank of current index
     COLLECT_LIST,    ///< collect values into a list
     COLLECT_SET,     ///< collect values into a list without duplicate entries
@@ -250,6 +251,8 @@ class segmented_reduce_aggregation : public virtual aggregation {
 enum class udf_type : bool { CUDA, PTX };
 /// Type of correlation method.
 enum class correlation_type : int32_t { PEARSON, KENDALL, SPEARMAN };
+/// Type of treatment of EWM input values' first value
+enum class ewm_history : int32_t { INFINITE, FINITE };
 
 /// Factory to create a SUM aggregation
 /// @return A SUM aggregation object
@@ -411,6 +414,42 @@ std::unique_ptr<Base> make_nth_element_aggregation(
 template <typename Base = aggregation>
 std::unique_ptr<Base> make_row_number_aggregation();
 
+/**
+ * @brief Factory to create an EWMA aggregation
+ *
+ * `EWMA` returns a non-nullable column with the same type as the input,
+ * whose values are the exponentially weighted moving average of the input
+ * sequence. Let these values be known as the y_i.
+ *
+ * EWMA aggregations are parameterized by a center of mass (`com`) which
+ * affects the contribution of the previous values (y_0 ... y_{i-1}) in
+ * computing the y_i.
+ *
+ * EWMA aggregations are also parameterized by a history `cudf::ewm_history`.
+ * Special considerations have to be given to the mathematical treatment of
+ * the first value of the input sequence. There are two approaches to this,
+ * one which considers the first value of the sequence to be the exponential
+ * weighted moving average of some infinite history of data, and one which
+ * takes the first value to be the only datapoint known. These assumptions
+ * lead to two different formulas for the y_i. `ewm_history` selects which.
+ *
+ * EWMA aggregations have special null handling. Nulls have two effects. The
+ * first is to propagate forward the last valid value as far as it has been
+ * computed. This could be thought of as the nulls not affecting the average
+ * in any way. The second effect changes the way the y_i are computed. Since
+ * a moving average is conceptually designed to weight contributing values by
+ * their recency, nulls ought to count as valid periods even though they do
+ * not change the average. For example, if the input sequence is {1, NULL, 3}
+ * then when computing y_2 one should weigh y_0 as if it occurs two periods
+ * before y_2 rather than just one.
+ *
+ * @param center_of_mass the center of mass.
+ * @param history which assumption to make about the first value
+ * @return A EWM aggregation object
+ */
+template <typename Base = aggregation>
+std::unique_ptr<Base> make_ewma_aggregation(double const center_of_mass, ewm_history history);
+
 /**
  * @brief Factory to create a RANK aggregation
  *
diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp
index edee83783b8..843414817e3 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.hpp
+++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp
@@ -76,6 +76,8 @@ class simple_aggregations_collector {  // Declares the interface for the simple
                                                           class nth_element_aggregation const& agg);
   virtual std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
                                                           class row_number_aggregation const& agg);
+  virtual std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
+                                                          class ewma_aggregation const& agg);
   virtual std::vector<std::unique_ptr<aggregation>> visit(data_type col_type,
                                                           class rank_aggregation const& agg);
   virtual std::vector<std::unique_ptr<aggregation>> visit(
@@ -141,6 +143,7 @@ class aggregation_finalizer {  // Declares the interface for the finalizer
   virtual void visit(class correlation_aggregation const& agg);
   virtual void visit(class tdigest_aggregation const& agg);
   virtual void visit(class merge_tdigest_aggregation const& agg);
+  virtual void visit(class ewma_aggregation const& agg);
 };
 
 /**
@@ -667,6 +670,40 @@ class row_number_aggregation final : public rolling_aggregation {
   void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); }
 };
 
+/**
+ * @brief Derived class for specifying an ewma aggregation
+ */
+class ewma_aggregation final : public scan_aggregation {
+ public:
+  double const center_of_mass;
+  cudf::ewm_history history;
+
+  ewma_aggregation(double const center_of_mass, cudf::ewm_history history)
+    : aggregation{EWMA}, center_of_mass{center_of_mass}, history{history}
+  {
+  }
+
+  std::unique_ptr<aggregation> clone() const override
+  {
+    return std::make_unique<ewma_aggregation>(*this);
+  }
+
+  std::vector<std::unique_ptr<aggregation>> get_simple_aggregations(
+    data_type col_type, simple_aggregations_collector& collector) const override
+  {
+    return collector.visit(col_type, *this);
+  }
+
+  bool is_equal(aggregation const& _other) const override
+  {
+    if (!this->aggregation::is_equal(_other)) { return false; }
+    auto const& other = dynamic_cast<ewma_aggregation const&>(_other);
+    return this->center_of_mass == other.center_of_mass and this->history == other.history;
+  }
+
+  void finalize(aggregation_finalizer& finalizer) const override { finalizer.visit(*this); }
+};
+
 /**
  * @brief Derived class for specifying a rank aggregation
  */
@@ -1336,6 +1373,11 @@ struct target_type_impl<Source, aggregation::ROW_NUMBER> {
   using type = size_type;
 };
 
+template <typename Source>
+struct target_type_impl<Source, aggregation::EWMA> {
+  using type = double;
+};
+
 // Always use size_type accumulator for RANK
 template <typename Source>
 struct target_type_impl<Source, aggregation::RANK> {
@@ -1536,6 +1578,8 @@ CUDF_HOST_DEVICE inline decltype(auto) aggregation_dispatcher(aggregation::Kind
       return f.template operator()<aggregation::TDIGEST>(std::forward<Ts>(args)...);
     case aggregation::MERGE_TDIGEST:
       return f.template operator()<aggregation::MERGE_TDIGEST>(std::forward<Ts>(args)...);
+    case aggregation::EWMA:
+      return f.template operator()<aggregation::EWMA>(std::forward<Ts>(args)...);
     default: {
 #ifndef __CUDA_ARCH__
       CUDF_FAIL("Unsupported aggregation.");
diff --git a/cpp/src/aggregation/aggregation.cpp b/cpp/src/aggregation/aggregation.cpp
index adee9147740..5422304c5cb 100644
--- a/cpp/src/aggregation/aggregation.cpp
+++ b/cpp/src/aggregation/aggregation.cpp
@@ -154,6 +154,12 @@ std::vector<std::unique_ptr<aggregation>> simple_aggregations_collector::visit(
   return visit(col_type, static_cast<aggregation const&>(agg));
 }
 
+std::vector<std::unique_ptr<aggregation>> simple_aggregations_collector::visit(
+  data_type col_type, ewma_aggregation const& agg)
+{
+  return visit(col_type, static_cast<aggregation const&>(agg));
+}
+
 std::vector<std::unique_ptr<aggregation>> simple_aggregations_collector::visit(
   data_type col_type, rank_aggregation const& agg)
 {
@@ -333,6 +339,11 @@ void aggregation_finalizer::visit(row_number_aggregation const& agg)
   visit(static_cast<aggregation const&>(agg));
 }
 
+void aggregation_finalizer::visit(ewma_aggregation const& agg)
+{
+  visit(static_cast<aggregation const&>(agg));
+}
+
 void aggregation_finalizer::visit(rank_aggregation const& agg)
 {
   visit(static_cast<aggregation const&>(agg));
@@ -665,6 +676,17 @@ std::unique_ptr<Base> make_row_number_aggregation()
 template std::unique_ptr<aggregation> make_row_number_aggregation<aggregation>();
 template std::unique_ptr<rolling_aggregation> make_row_number_aggregation<rolling_aggregation>();
 
+/// Factory to create an EWMA aggregation
+template <typename Base>
+std::unique_ptr<Base> make_ewma_aggregation(double const com, cudf::ewm_history history)
+{
+  return std::make_unique<detail::ewma_aggregation>(com, history);
+}
+template std::unique_ptr<aggregation> make_ewma_aggregation<aggregation>(double const com,
+                                                                         cudf::ewm_history history);
+template std::unique_ptr<scan_aggregation> make_ewma_aggregation<scan_aggregation>(
+  double const com, cudf::ewm_history history);
+
 /// Factory to create a RANK aggregation
 template <typename Base>
 std::unique_ptr<Base> make_rank_aggregation(rank_method method,
diff --git a/cpp/src/reductions/scan/ewm.cu b/cpp/src/reductions/scan/ewm.cu
new file mode 100644
index 00000000000..3fa2de450ad
--- /dev/null
+++ b/cpp/src/reductions/scan/ewm.cu
@@ -0,0 +1,330 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "scan.cuh"
+
+#include <cudf/column/column_device_view.cuh>
+#include <cudf/column/column_factories.hpp>
+#include <cudf/detail/iterator.cuh>
+#include <cudf/detail/null_mask.hpp>
+#include <cudf/utilities/type_dispatcher.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <cuda/functional>
+#include <thrust/scan.h>
+#include <thrust/transform_scan.h>
+
+namespace cudf {
+namespace detail {
+
+template <typename T>
+using pair_type = thrust::pair<T, T>;
+
+/**
+ * @brief functor to be summed over in a prefix sum such that
+ * the recurrence in question is solved. See
+ * G. E. Blelloch. Prefix sums and their applications. Technical Report
+ * CMU-CS-90-190, Nov. 1990. S. 1.4
+ * for details
+ */
+template <typename T>
+class recurrence_functor {
+ public:
+  __device__ pair_type<T> operator()(pair_type<T> ci, pair_type<T> cj)
+  {
+    return {ci.first * cj.first, ci.second * cj.first + cj.second};
+  }
+};
+
+template <typename T>
+struct ewma_functor_base {
+  T beta;
+  const pair_type<T> IDENTITY{1.0, 0.0};
+};
+
+template <typename T, bool is_numerator>
+struct ewma_adjust_nulls_functor : public ewma_functor_base<T> {
+  __device__ pair_type<T> operator()(thrust::tuple<bool, int, T> const data)
+  {
+    // Not const to allow for updating the input value
+    auto [valid, exp, input] = data;
+    if (!valid) { return this->IDENTITY; }
+    if constexpr (not is_numerator) { input = 1; }
+
+    // The value is non-null, but nulls preceding it
+    // must adjust the second element of the pair
+    T const beta = this->beta;
+    return {beta * ((exp != 0) ? pow(beta, exp) : 1), input};
+  }
+};
+
+template <typename T, bool is_numerator>
+struct ewma_adjust_no_nulls_functor : public ewma_functor_base<T> {
+  __device__ pair_type<T> operator()(T const data)
+  {
+    T const beta = this->beta;
+    if constexpr (is_numerator) {
+      return {beta, data};
+    } else {
+      return {beta, 1.0};
+    }
+  }
+};
+
+template <typename T>
+struct ewma_noadjust_nulls_functor : public ewma_functor_base<T> {
+  /*
+    In the null case, a denominator actually has to be computed. The formula is
+    y_{i+1} = (1 - alpha)x_{i-1} + alpha x_i, but really there is a "denominator"
+    which is the sum of the weights: alpha + (1 - alpha) == 1. If a null is
+    encountered, that means that the "previous" value is downweighted by a
+    factor (for each missing value). For example with a single null:
+    data = {x_0, NULL, x_1},
+    y_2 = (1 - alpha)**2 x_0 + alpha * x_2 / (alpha + (1-alpha)**2)
+
+    As such, the pairs must be updated before summing like the adjusted case to
+    properly downweight the previous values. But now but we also need to compute
+    the normalization factors and divide the results into them at the end.
+  */
+  __device__ pair_type<T> operator()(thrust::tuple<T, size_type, bool, size_type> const data)
+  {
+    T const beta                              = this->beta;
+    auto const [input, index, valid, nullcnt] = data;
+    if (index == 0) {
+      return {beta, input};
+    } else {
+      if (!valid) { return this->IDENTITY; }
+      // preceding value is valid, return normal pair
+      if (nullcnt == 0) { return {beta, (1.0 - beta) * input}; }
+      // one or more preceding values is null, adjust by how many
+      T const factor = (1.0 - beta) + pow(beta, nullcnt + 1);
+      return {(beta * (pow(beta, nullcnt)) / factor), ((1.0 - beta) * input) / factor};
+    }
+  }
+};
+
+template <typename T>
+struct ewma_noadjust_no_nulls_functor : public ewma_functor_base<T> {
+  __device__ pair_type<T> operator()(thrust::tuple<T, size_type> const data)
+  {
+    T const beta              = this->beta;
+    auto const [input, index] = data;
+    if (index == 0) {
+      return {beta, input};
+    } else {
+      return {beta, (1.0 - beta) * input};
+    }
+  }
+};
+
+/**
+* @brief Return an array whose values y_i are the number of null entries
+* in between the last valid entry of the input and the current index.
+* Example: {1, NULL, 3, 4, NULL, NULL, 7}
+        -> {0, 0     1, 0, 0,    1,    2}
+*/
+rmm::device_uvector<cudf::size_type> null_roll_up(column_view const& input,
+                                                  rmm::cuda_stream_view stream)
+{
+  rmm::device_uvector<cudf::size_type> output(input.size(), stream);
+
+  auto device_view = column_device_view::create(input);
+  auto invalid_it  = thrust::make_transform_iterator(
+    cudf::detail::make_validity_iterator(*device_view),
+    cuda::proclaim_return_type<int>([] __device__(int valid) -> int { return 1 - valid; }));
+
+  // valid mask {1, 0, 1, 0, 0, 1} leads to output array {0, 0, 1, 0, 1, 2}
+  thrust::inclusive_scan_by_key(rmm::exec_policy(stream),
+                                invalid_it,
+                                invalid_it + input.size() - 1,
+                                invalid_it,
+                                std::next(output.begin()));
+  return output;
+}
+
+template <typename T>
+rmm::device_uvector<T> compute_ewma_adjust(column_view const& input,
+                                           T const beta,
+                                           rmm::cuda_stream_view stream,
+                                           rmm::device_async_resource_ref mr)
+{
+  rmm::device_uvector<T> output(input.size(), stream);
+  rmm::device_uvector<pair_type<T>> pairs(input.size(), stream);
+
+  if (input.has_nulls()) {
+    rmm::device_uvector<cudf::size_type> nullcnt = null_roll_up(input, stream);
+    auto device_view                             = column_device_view::create(input);
+    auto valid_it = cudf::detail::make_validity_iterator(*device_view);
+    auto data =
+      thrust::make_zip_iterator(thrust::make_tuple(valid_it, nullcnt.begin(), input.begin<T>()));
+
+    thrust::transform_inclusive_scan(rmm::exec_policy(stream),
+                                     data,
+                                     data + input.size(),
+                                     pairs.begin(),
+                                     ewma_adjust_nulls_functor<T, true>{beta},
+                                     recurrence_functor<T>{});
+    thrust::transform(rmm::exec_policy(stream),
+                      pairs.begin(),
+                      pairs.end(),
+                      output.begin(),
+                      [] __device__(pair_type<T> pair) -> T { return pair.second; });
+
+    thrust::transform_inclusive_scan(rmm::exec_policy(stream),
+                                     data,
+                                     data + input.size(),
+                                     pairs.begin(),
+                                     ewma_adjust_nulls_functor<T, false>{beta},
+                                     recurrence_functor<T>{});
+
+  } else {
+    thrust::transform_inclusive_scan(rmm::exec_policy(stream),
+                                     input.begin<T>(),
+                                     input.end<T>(),
+                                     pairs.begin(),
+                                     ewma_adjust_no_nulls_functor<T, true>{beta},
+                                     recurrence_functor<T>{});
+    thrust::transform(rmm::exec_policy(stream),
+                      pairs.begin(),
+                      pairs.end(),
+                      output.begin(),
+                      [] __device__(pair_type<T> pair) -> T { return pair.second; });
+    auto itr = thrust::make_counting_iterator<size_type>(0);
+
+    thrust::transform_inclusive_scan(rmm::exec_policy(stream),
+                                     itr,
+                                     itr + input.size(),
+                                     pairs.begin(),
+                                     ewma_adjust_no_nulls_functor<T, false>{beta},
+                                     recurrence_functor<T>{});
+  }
+
+  thrust::transform(
+    rmm::exec_policy(stream),
+    pairs.begin(),
+    pairs.end(),
+    output.begin(),
+    output.begin(),
+    [] __device__(pair_type<T> pair, T numerator) -> T { return numerator / pair.second; });
+
+  return output;
+}
+
+template <typename T>
+rmm::device_uvector<T> compute_ewma_noadjust(column_view const& input,
+                                             T const beta,
+                                             rmm::cuda_stream_view stream,
+                                             rmm::device_async_resource_ref mr)
+{
+  rmm::device_uvector<T> output(input.size(), stream);
+  rmm::device_uvector<pair_type<T>> pairs(input.size(), stream);
+  rmm::device_uvector<cudf::size_type> nullcnt =
+    [&input, stream]() -> rmm::device_uvector<cudf::size_type> {
+    if (input.has_nulls()) {
+      return null_roll_up(input, stream);
+    } else {
+      return rmm::device_uvector<cudf::size_type>(input.size(), stream);
+    }
+  }();
+  // denominators are all 1 and do not need to be computed
+  // pairs are all (beta, 1-beta x_i) except for the first one
+
+  if (!input.has_nulls()) {
+    auto data = thrust::make_zip_iterator(
+      thrust::make_tuple(input.begin<T>(), thrust::make_counting_iterator<size_type>(0)));
+    thrust::transform_inclusive_scan(rmm::exec_policy(stream),
+                                     data,
+                                     data + input.size(),
+                                     pairs.begin(),
+                                     ewma_noadjust_no_nulls_functor<T>{beta},
+                                     recurrence_functor<T>{});
+
+  } else {
+    auto device_view = column_device_view::create(input);
+    auto valid_it    = detail::make_validity_iterator(*device_view);
+
+    auto data = thrust::make_zip_iterator(thrust::make_tuple(
+      input.begin<T>(), thrust::make_counting_iterator<size_type>(0), valid_it, nullcnt.begin()));
+
+    thrust::transform_inclusive_scan(rmm::exec_policy(stream),
+                                     data,
+                                     data + input.size(),
+                                     pairs.begin(),
+                                     ewma_noadjust_nulls_functor<T>{beta},
+                                     recurrence_functor<T>());
+  }
+
+  // copy the second elements to the output for now
+  thrust::transform(rmm::exec_policy(stream),
+                    pairs.begin(),
+                    pairs.end(),
+                    output.begin(),
+                    [] __device__(pair_type<T> pair) -> T { return pair.second; });
+  return output;
+}
+
+struct ewma_functor {
+  template <typename T, CUDF_ENABLE_IF(!std::is_floating_point<T>::value)>
+  std::unique_ptr<column> operator()(scan_aggregation const& agg,
+                                     column_view const& input,
+                                     rmm::cuda_stream_view stream,
+                                     rmm::device_async_resource_ref mr)
+  {
+    CUDF_FAIL("Unsupported type for EWMA.");
+  }
+
+  template <typename T, CUDF_ENABLE_IF(std::is_floating_point<T>::value)>
+  std::unique_ptr<column> operator()(scan_aggregation const& agg,
+                                     column_view const& input,
+                                     rmm::cuda_stream_view stream,
+                                     rmm::device_async_resource_ref mr)
+  {
+    auto const ewma_agg       = dynamic_cast<ewma_aggregation const*>(&agg);
+    auto const history        = ewma_agg->history;
+    auto const center_of_mass = ewma_agg->center_of_mass;
+
+    // center of mass is easier for the user, but the recurrences are
+    // better expressed in terms of the derived parameter `beta`
+    T const beta = center_of_mass / (center_of_mass + 1.0);
+
+    auto result = [&]() {
+      if (history == cudf::ewm_history::INFINITE) {
+        return compute_ewma_adjust(input, beta, stream, mr);
+      } else {
+        return compute_ewma_noadjust(input, beta, stream, mr);
+      }
+    }();
+    return std::make_unique<column>(cudf::data_type(cudf::type_to_id<T>()),
+                                    input.size(),
+                                    result.release(),
+                                    rmm::device_buffer{},
+                                    0);
+  }
+};
+
+std::unique_ptr<column> exponentially_weighted_moving_average(column_view const& input,
+                                                              scan_aggregation const& agg,
+                                                              rmm::cuda_stream_view stream,
+                                                              rmm::device_async_resource_ref mr)
+{
+  return type_dispatcher(input.type(), ewma_functor{}, agg, input, stream, mr);
+}
+
+}  // namespace detail
+}  // namespace cudf
diff --git a/cpp/src/reductions/scan/scan.cuh b/cpp/src/reductions/scan/scan.cuh
index aeb9e516cd4..6c237741ac3 100644
--- a/cpp/src/reductions/scan/scan.cuh
+++ b/cpp/src/reductions/scan/scan.cuh
@@ -36,6 +36,12 @@ std::pair<rmm::device_buffer, size_type> mask_scan(column_view const& input_view
                                                    rmm::cuda_stream_view stream,
                                                    rmm::device_async_resource_ref mr);
 
+// exponentially weighted moving average of the input
+std::unique_ptr<column> exponentially_weighted_moving_average(column_view const& input,
+                                                              scan_aggregation const& agg,
+                                                              rmm::cuda_stream_view stream,
+                                                              rmm::device_async_resource_ref mr);
+
 template <template <typename> typename DispatchFn>
 std::unique_ptr<column> scan_agg_dispatch(column_view const& input,
                                           scan_aggregation const& agg,
@@ -59,6 +65,7 @@ std::unique_ptr<column> scan_agg_dispatch(column_view const& input,
       if (is_fixed_point(input.type())) CUDF_FAIL("decimal32/64/128 cannot support product scan");
       return type_dispatcher<dispatch_storage_type>(
         input.type(), DispatchFn<DeviceProduct>(), input, output_mask, stream, mr);
+    case aggregation::EWMA: return exponentially_weighted_moving_average(input, agg, stream, mr);
     default: CUDF_FAIL("Unsupported aggregation operator for scan");
   }
 }
diff --git a/cpp/src/reductions/scan/scan_inclusive.cu b/cpp/src/reductions/scan/scan_inclusive.cu
index ad2eaa6a471..7c02a8d1b99 100644
--- a/cpp/src/reductions/scan/scan_inclusive.cu
+++ b/cpp/src/reductions/scan/scan_inclusive.cu
@@ -182,7 +182,8 @@ std::unique_ptr<column> scan_inclusive(column_view const& input,
 
   auto output = scan_agg_dispatch<scan_dispatcher>(
     input, agg, static_cast<bitmask_type*>(mask.data()), stream, mr);
-  output->set_null_mask(std::move(mask), null_count);
+  // Use the null mask produced by the op for EWM
+  if (agg.kind != aggregation::EWMA) { output->set_null_mask(std::move(mask), null_count); }
 
   // If the input is a structs column, we also need to push down nulls from the parent output column
   // into the children columns.
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index eda470d2309..9f14455f42d 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -205,6 +205,7 @@ ConfigureTest(
 ConfigureTest(
   REDUCTIONS_TEST
   reductions/collect_ops_tests.cpp
+  reductions/ewm_tests.cpp
   reductions/rank_tests.cpp
   reductions/reduction_tests.cpp
   reductions/scan_tests.cpp
diff --git a/cpp/tests/reductions/ewm_tests.cpp b/cpp/tests/reductions/ewm_tests.cpp
new file mode 100644
index 00000000000..09cec688509
--- /dev/null
+++ b/cpp/tests/reductions/ewm_tests.cpp
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "scan_tests.hpp"
+
+#include <cudf_test/column_utilities.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/iterator_utilities.hpp>
+
+#include <cudf/detail/aggregation/aggregation.hpp>
+#include <cudf/reduction.hpp>
+
+template <typename T>
+struct TypedEwmScanTest : BaseScanTest<T> {
+  inline void test_ungrouped_ewma_scan(cudf::column_view const& input,
+                                       cudf::column_view const& expect_vals,
+                                       cudf::scan_aggregation const& agg,
+                                       cudf::null_policy null_handling)
+  {
+    auto col_out = cudf::scan(input, agg, cudf::scan_type::INCLUSIVE, null_handling);
+    CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expect_vals, col_out->view());
+  }
+};
+
+TYPED_TEST_SUITE(TypedEwmScanTest, cudf::test::FloatingPointTypes);
+
+TYPED_TEST(TypedEwmScanTest, Ewm)
+{
+  auto const v = make_vector<TypeParam>({1.0, 2.0, 3.0, 4.0, 5.0});
+  auto col     = this->make_column(v);
+
+  auto const expected_ewma_vals_adjust = cudf::test::fixed_width_column_wrapper<TypeParam>{
+    {1.0, 1.75, 2.61538461538461497469, 3.54999999999999982236, 4.52066115702479365268}};
+
+  auto const expected_ewma_vals_noadjust =
+    cudf::test::fixed_width_column_wrapper<TypeParam>{{1.0,
+                                                       1.66666666666666651864,
+                                                       2.55555555555555535818,
+                                                       3.51851851851851815667,
+                                                       4.50617283950617242283}};
+
+  this->test_ungrouped_ewma_scan(
+    *col,
+    expected_ewma_vals_adjust,
+    *cudf::make_ewma_aggregation<cudf::scan_aggregation>(0.5, cudf::ewm_history::INFINITE),
+    cudf::null_policy::INCLUDE);
+  this->test_ungrouped_ewma_scan(
+    *col,
+    expected_ewma_vals_noadjust,
+    *cudf::make_ewma_aggregation<cudf::scan_aggregation>(0.5, cudf::ewm_history::FINITE),
+    cudf::null_policy::INCLUDE);
+}
+
+TYPED_TEST(TypedEwmScanTest, EwmWithNulls)
+{
+  auto const v = make_vector<TypeParam>({1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0});
+  auto const b = thrust::host_vector<bool>(std::vector<bool>{1, 0, 1, 0, 0, 1, 1});
+  auto col     = this->make_column(v, b);
+
+  auto const expected_ewma_vals_adjust =
+    cudf::test::fixed_width_column_wrapper<TypeParam>{{1.0,
+                                                       1.0,
+                                                       2.79999999999999982236,
+                                                       2.79999999999999982236,
+                                                       2.79999999999999982236,
+                                                       5.87351778656126466416,
+                                                       6.70977596741344139986}};
+
+  auto const expected_ewma_vals_noadjust =
+    cudf::test::fixed_width_column_wrapper<TypeParam>{{1.0,
+                                                       1.0,
+                                                       2.71428571428571441260,
+                                                       2.71428571428571441260,
+                                                       2.71428571428571441260,
+                                                       5.82706766917293172980,
+                                                       6.60902255639097724327}};
+
+  this->test_ungrouped_ewma_scan(
+    *col,
+    expected_ewma_vals_adjust,
+    *cudf::make_ewma_aggregation<cudf::scan_aggregation>(0.5, cudf::ewm_history::INFINITE),
+    cudf::null_policy::INCLUDE);
+  this->test_ungrouped_ewma_scan(
+    *col,
+    expected_ewma_vals_noadjust,
+    *cudf::make_ewma_aggregation<cudf::scan_aggregation>(0.5, cudf::ewm_history::FINITE),
+    cudf::null_policy::INCLUDE);
+}
diff --git a/docs/cudf/source/user_guide/api_docs/dataframe.rst b/docs/cudf/source/user_guide/api_docs/dataframe.rst
index 70e4bd060ca..02fd9f7b396 100644
--- a/docs/cudf/source/user_guide/api_docs/dataframe.rst
+++ b/docs/cudf/source/user_guide/api_docs/dataframe.rst
@@ -137,6 +137,7 @@ Computations / descriptive stats
    DataFrame.describe
    DataFrame.diff
    DataFrame.eval
+   DataFrame.ewm
    DataFrame.kurt
    DataFrame.kurtosis
    DataFrame.max
diff --git a/docs/cudf/source/user_guide/api_docs/series.rst b/docs/cudf/source/user_guide/api_docs/series.rst
index 5dc87a97337..48a7dc8ff87 100644
--- a/docs/cudf/source/user_guide/api_docs/series.rst
+++ b/docs/cudf/source/user_guide/api_docs/series.rst
@@ -138,6 +138,7 @@ Computations / descriptive stats
    Series.describe
    Series.diff
    Series.digitize
+   Series.ewm
    Series.factorize
    Series.kurt
    Series.max
diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx
index 11f801ba772..1616c24eec2 100644
--- a/python/cudf/cudf/_lib/aggregation.pyx
+++ b/python/cudf/cudf/_lib/aggregation.pyx
@@ -58,6 +58,14 @@ class Aggregation:
             if dropna else pylibcudf.types.NullPolicy.INCLUDE
         ))
 
+    @classmethod
+    def ewma(cls, com=1.0, adjust=True):
+        return cls(pylibcudf.aggregation.ewma(
+            com,
+            pylibcudf.aggregation.EWMHistory.INFINITE
+            if adjust else pylibcudf.aggregation.EWMHistory.FINITE
+        ))
+
     @classmethod
     def size(cls):
         return cls(pylibcudf.aggregation.count(pylibcudf.types.NullPolicy.INCLUDE))
diff --git a/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd b/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd
index 8526728656b..0981d0e855a 100644
--- a/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd
@@ -6,6 +6,7 @@ from cudf._lib.pylibcudf.libcudf.aggregation cimport (
     Kind as kind_t,
     aggregation,
     correlation_type,
+    ewm_history,
     groupby_aggregation,
     groupby_scan_aggregation,
     rank_method,
@@ -80,6 +81,8 @@ cpdef Aggregation argmax()
 
 cpdef Aggregation argmin()
 
+cpdef Aggregation ewma(float center_of_mass, ewm_history history)
+
 cpdef Aggregation nunique(null_policy null_handling = *)
 
 cpdef Aggregation nth_element(size_type n, null_policy null_handling = *)
diff --git a/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx b/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx
index 7bb64e32a1b..eed2f6de585 100644
--- a/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx
@@ -8,6 +8,7 @@ from libcpp.utility cimport move
 from cudf._lib.pylibcudf.libcudf.aggregation cimport (
     aggregation,
     correlation_type,
+    ewm_history,
     groupby_aggregation,
     groupby_scan_aggregation,
     make_all_aggregation,
@@ -19,6 +20,7 @@ from cudf._lib.pylibcudf.libcudf.aggregation cimport (
     make_correlation_aggregation,
     make_count_aggregation,
     make_covariance_aggregation,
+    make_ewma_aggregation,
     make_max_aggregation,
     make_mean_aggregation,
     make_median_aggregation,
@@ -52,6 +54,8 @@ from cudf._lib.pylibcudf.libcudf.types cimport (
 from cudf._lib.pylibcudf.libcudf.aggregation import Kind  # no-cython-lint
 from cudf._lib.pylibcudf.libcudf.aggregation import \
     correlation_type as CorrelationType  # no-cython-lint
+from cudf._lib.pylibcudf.libcudf.aggregation import \
+    ewm_history as EWMHistory  # no-cython-lint
 from cudf._lib.pylibcudf.libcudf.aggregation import \
     rank_method as RankMethod  # no-cython-lint
 from cudf._lib.pylibcudf.libcudf.aggregation import \
@@ -202,6 +206,28 @@ cpdef Aggregation max():
     return Aggregation.from_libcudf(move(make_max_aggregation[aggregation]()))
 
 
+cpdef Aggregation ewma(float center_of_mass, ewm_history history):
+    """Create a EWMA aggregation.
+
+    For details, see :cpp:func:`make_ewma_aggregation`.
+
+    Parameters
+    ----------
+    center_of_mass : float
+        The decay in terms of the center of mass
+    history : ewm_history
+        Whether or not to treat the history as infinite.
+
+    Returns
+    -------
+    Aggregation
+        The EWMA aggregation.
+    """
+    return Aggregation.from_libcudf(
+        move(make_ewma_aggregation[aggregation](center_of_mass, history))
+    )
+
+
 cpdef Aggregation count(null_policy null_handling = null_policy.EXCLUDE):
     """Create a count aggregation.
 
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/aggregation.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/aggregation.pxd
index 8c14bc45723..fe04db52094 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/aggregation.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/aggregation.pxd
@@ -79,6 +79,10 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil:
         KENDALL
         SPEARMAN
 
+    cpdef enum class ewm_history(int32_t):
+        INFINITE
+        FINITE
+
     cpdef enum class rank_method(int32_t):
         FIRST
         AVERAGE
@@ -143,6 +147,10 @@ cdef extern from "cudf/aggregation.hpp" namespace "cudf" nogil:
         string user_defined_aggregator,
         data_type output_type) except +
 
+    cdef unique_ptr[T] make_ewma_aggregation[T](
+        double com, ewm_history adjust
+    ) except +
+
     cdef unique_ptr[T] make_correlation_aggregation[T](
         correlation_type type, size_type min_periods) except +
 
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index f1b74adefed..7515cb2c177 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -52,7 +52,7 @@
     _post_process_output_col,
     _return_arr_from_dtype,
 )
-from cudf.core.window import Rolling
+from cudf.core.window import ExponentialMovingWindow, Rolling
 from cudf.utils import docutils, ioutils
 from cudf.utils._numba import _CUDFNumbaConfig
 from cudf.utils.docutils import copy_docstring
@@ -1853,6 +1853,32 @@ def rolling(
             win_type=win_type,
         )
 
+    @copy_docstring(ExponentialMovingWindow)
+    def ewm(
+        self,
+        com: float | None = None,
+        span: float | None = None,
+        halflife: float | None = None,
+        alpha: float | None = None,
+        min_periods: int | None = 0,
+        adjust: bool = True,
+        ignore_na: bool = False,
+        axis: int = 0,
+        times: str | np.ndarray | None = None,
+    ):
+        return ExponentialMovingWindow(
+            self,
+            com=com,
+            span=span,
+            halflife=halflife,
+            alpha=alpha,
+            min_periods=min_periods,
+            adjust=adjust,
+            ignore_na=ignore_na,
+            axis=axis,
+            times=times,
+        )
+
     @_cudf_nvtx_annotate
     def nans_to_nulls(self):
         """
diff --git a/python/cudf/cudf/core/window/__init__.py b/python/cudf/cudf/core/window/__init__.py
index 8ea3eb0179b..23522588d33 100644
--- a/python/cudf/cudf/core/window/__init__.py
+++ b/python/cudf/cudf/core/window/__init__.py
@@ -1,3 +1,3 @@
-# Copyright (c) 2019-2022, NVIDIA CORPORATION
-
+# Copyright (c) 2019-2024, NVIDIA CORPORATION
+from cudf.core.window.ewm import ExponentialMovingWindow
 from cudf.core.window.rolling import Rolling
diff --git a/python/cudf/cudf/core/window/ewm.py b/python/cudf/cudf/core/window/ewm.py
new file mode 100644
index 00000000000..21693e106bd
--- /dev/null
+++ b/python/cudf/cudf/core/window/ewm.py
@@ -0,0 +1,200 @@
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
+
+from __future__ import annotations
+
+import numpy as np
+
+from cudf._lib.reduce import scan
+from cudf.api.types import is_numeric_dtype
+from cudf.core.window.rolling import _RollingBase
+
+
+class ExponentialMovingWindow(_RollingBase):
+    r"""
+    Provide exponential weighted (EW) functions.
+    Available EW functions: ``mean()``
+    Exactly one parameter: ``com``, ``span``, ``halflife``, or ``alpha``
+    must be provided.
+
+    Parameters
+    ----------
+    com : float, optional
+        Specify decay in terms of center of mass,
+        :math:`\alpha = 1 / (1 + com)`, for :math:`com \geq 0`.
+    span : float, optional
+        Specify decay in terms of span,
+        :math:`\alpha = 2 / (span + 1)`, for :math:`span \geq 1`.
+    halflife : float, str, timedelta, optional
+        Specify decay in terms of half-life,
+        :math:`\alpha = 1 - \exp\left(-\ln(2) / halflife\right)`, for
+        :math:`halflife > 0`.
+    alpha : float, optional
+        Specify smoothing factor :math:`\alpha` directly,
+        :math:`0 < \alpha \leq 1`.
+    min_periods : int, default 0
+        Not Supported
+    adjust : bool, default True
+        Controls assumptions about the first value in the sequence.
+        https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.ewm.html
+        for details.
+    ignore_na : bool, default False
+        Not Supported
+    axis : {0, 1}, default 0
+        Not Supported
+    times : str, np.ndarray, Series, default None
+        Not Supported
+
+    Returns
+    -------
+    ``ExponentialMovingWindow`` object
+
+    Notes
+    -----
+    cuDF input data may contain both nulls and nan values. For the purposes
+    of this method, they are taken to have the same meaning, meaning nulls
+    in cuDF will affect the result the same way that nan values would using
+    the equivalent pandas method.
+
+    .. pandas-compat::
+        **cudf.core.window.ExponentialMovingWindow**
+
+        The parameters ``min_periods``, ``ignore_na``, ``axis``, and ``times``
+        are not yet supported. Behavior is defined only for data that begins
+        with a valid (non-null) element.
+
+        Currently, only ``mean`` is a supported method.
+
+    Examples
+    --------
+    >>> df = cudf.DataFrame({'B': [0, 1, 2, cudf.NA, 4]})
+    >>> df
+          B
+    0     0
+    1     1
+    2     2
+    3  <NA>
+    4     4
+    >>> df.ewm(com=0.5).mean()
+              B
+    0  0.000000
+    1  0.750000
+    2  1.615385
+    3  1.615385
+    4  3.670213
+
+    >>> df.ewm(com=0.5, adjust=False).mean()
+              B
+    0  0.000000
+    1  0.666667
+    2  1.555556
+    3  1.555556
+    4  3.650794
+    """
+
+    def __init__(
+        self,
+        obj,
+        com: float | None = None,
+        span: float | None = None,
+        halflife: float | None = None,
+        alpha: float | None = None,
+        min_periods: int | None = 0,
+        adjust: bool = True,
+        ignore_na: bool = False,
+        axis: int = 0,
+        times: str | np.ndarray | None = None,
+    ):
+        if (min_periods, ignore_na, axis, times) != (0, False, 0, None):
+            raise NotImplementedError(
+                "The parameters `min_periods`, `ignore_na`, "
+                "`axis`, and `times` are not yet supported."
+            )
+
+        self.obj = obj
+        self.adjust = adjust
+        self.com = get_center_of_mass(com, span, halflife, alpha)
+
+    def mean(self):
+        """
+        Calculate the ewm (exponential weighted moment) mean.
+        """
+        return self._apply_agg("ewma")
+
+    def var(self, bias):
+        raise NotImplementedError("ewmvar not yet supported.")
+
+    def std(self, bias):
+        raise NotImplementedError("ewmstd not yet supported.")
+
+    def corr(self, other):
+        raise NotImplementedError("ewmcorr not yet supported.")
+
+    def cov(self, other):
+        raise NotImplementedError("ewmcov not yet supported.")
+
+    def _apply_agg_series(self, sr, agg_name):
+        if not is_numeric_dtype(sr.dtype):
+            raise TypeError("No numeric types to aggregate")
+
+        # libcudf ewm has special casing for nulls only
+        # and come what may with nans. It treats those nulls like
+        # pandas does nans in the same positions mathematically.
+        # as such we need to convert the nans to nulls before
+        # passing them in.
+        to_libcudf_column = sr._column.astype("float64").nans_to_nulls()
+
+        return self.obj._from_data_like_self(
+            self.obj._data._from_columns_like_self(
+                [
+                    scan(
+                        agg_name,
+                        to_libcudf_column,
+                        True,
+                        com=self.com,
+                        adjust=self.adjust,
+                    )
+                ]
+            )
+        )
+
+
+def get_center_of_mass(
+    comass: float | None,
+    span: float | None,
+    halflife: float | None,
+    alpha: float | None,
+) -> float:
+    valid_count = count_not_none(comass, span, halflife, alpha)
+    if valid_count > 1:
+        raise ValueError(
+            "comass, span, halflife, and alpha are mutually exclusive"
+        )
+
+    # Convert to center of mass; domain checks ensure 0 < alpha <= 1
+    if comass is not None:
+        if comass < 0:
+            raise ValueError("comass must satisfy: comass >= 0")
+    elif span is not None:
+        if span < 1:
+            raise ValueError("span must satisfy: span >= 1")
+        comass = (span - 1) / 2
+    elif halflife is not None:
+        if halflife <= 0:
+            raise ValueError("halflife must satisfy: halflife > 0")
+        decay = 1 - np.exp(np.log(0.5) / halflife)
+        comass = 1 / decay - 1
+    elif alpha is not None:
+        if alpha <= 0 or alpha > 1:
+            raise ValueError("alpha must satisfy: 0 < alpha <= 1")
+        comass = (1 - alpha) / alpha
+    else:
+        raise ValueError("Must pass one of comass, span, halflife, or alpha")
+
+    return float(comass)
+
+
+def count_not_none(*args) -> int:
+    """
+    Returns the count of arguments that are not None.
+    """
+    return sum(x is not None for x in args)
diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py
index 7d140a1ffa5..29391c68471 100644
--- a/python/cudf/cudf/core/window/rolling.py
+++ b/python/cudf/cudf/core/window/rolling.py
@@ -14,7 +14,27 @@
 from cudf.utils.utils import GetAttrGetItemMixin
 
 
-class Rolling(GetAttrGetItemMixin, Reducible):
+class _RollingBase:
+    """
+    Contains methods common to all kinds of rolling
+    """
+
+    def _apply_agg_dataframe(self, df, agg_name):
+        result_df = cudf.DataFrame({})
+        for i, col_name in enumerate(df.columns):
+            result_col = self._apply_agg_series(df[col_name], agg_name)
+            result_df.insert(i, col_name, result_col)
+        result_df.index = df.index
+        return result_df
+
+    def _apply_agg(self, agg_name):
+        if isinstance(self.obj, cudf.Series):
+            return self._apply_agg_series(self.obj, agg_name)
+        else:
+            return self._apply_agg_dataframe(self.obj, agg_name)
+
+
+class Rolling(GetAttrGetItemMixin, _RollingBase, Reducible):
     """
     Rolling window calculations.
 
diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py
index 698dd946022..0ba432d6d0e 100644
--- a/python/cudf/cudf/pandas/_wrappers/pandas.py
+++ b/python/cudf/cudf/pandas/_wrappers/pandas.py
@@ -789,7 +789,7 @@ def Index__new__(cls, *args, **kwargs):
 
 ExponentialMovingWindow = make_intermediate_proxy_type(
     "ExponentialMovingWindow",
-    _Unusable,
+    cudf.core.window.ewm.ExponentialMovingWindow,
     pd.core.window.ewm.ExponentialMovingWindow,
 )
 
diff --git a/python/cudf/cudf/tests/test_ewm.py b/python/cudf/cudf/tests/test_ewm.py
new file mode 100644
index 00000000000..0861d2363ce
--- /dev/null
+++ b/python/cudf/cudf/tests/test_ewm.py
@@ -0,0 +1,46 @@
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
+import pytest
+
+import cudf
+from cudf.testing._utils import assert_eq
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        [1.0, 2.0, 3.0, 4.0, 5.0],
+        [5.0, cudf.NA, 3.0, cudf.NA, 8.5],
+        [5.0, cudf.NA, 3.0, cudf.NA, cudf.NA, 4.5],
+        [5.0, cudf.NA, 3.0, 4.0, cudf.NA, 5.0],
+    ],
+)
+@pytest.mark.parametrize(
+    "params",
+    [
+        {"com": 0.1},
+        {"com": 0.5},
+        {"span": 1.5},
+        {"span": 2.5},
+        {"halflife": 0.5},
+        {"halflife": 1.5},
+        {"alpha": 0.1},
+        {"alpha": 0.5},
+    ],
+)
+@pytest.mark.parametrize("adjust", [True, False])
+def test_ewma(data, params, adjust):
+    """
+    The most basic test asserts that we obtain
+    the same numerical values as pandas for various
+    sets of keyword arguemnts that effect the raw
+    coefficients of the formula
+    """
+    params["adjust"] = adjust
+
+    gsr = cudf.Series(data, dtype="float64")
+    psr = gsr.to_pandas()
+
+    expect = psr.ewm(**params).mean()
+    got = gsr.ewm(**params).mean()
+
+    assert_eq(expect, got)

From 114ee8d8a21893542d4c350434ed5211b207cbe9 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Tue, 25 Jun 2024 00:24:02 +0100
Subject: [PATCH 399/842] Extend coverage of groupby and rolling window nodes
 (#15999)

Just raise for the rolling expressions for now since we have yet to implement them.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15999
---
 python/cudf_polars/cudf_polars/dsl/expr.py    |  2 +
 python/cudf_polars/cudf_polars/dsl/ir.py      |  6 +--
 .../cudf_polars/cudf_polars/dsl/translate.py  |  8 +++-
 .../tests/expressions/test_rolling.py         | 41 +++++++++++++++++++
 python/cudf_polars/tests/test_groupby.py      | 29 +++++++++++--
 5 files changed, 78 insertions(+), 8 deletions(-)
 create mode 100644 python/cudf_polars/tests/expressions/test_rolling.py

diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py
index 73f3c1ce289..871134665af 100644
--- a/python/cudf_polars/cudf_polars/dsl/expr.py
+++ b/python/cudf_polars/cudf_polars/dsl/expr.py
@@ -898,6 +898,7 @@ def __init__(self, dtype: plc.DataType, options: Any, agg: Expr) -> None:
         super().__init__(dtype)
         self.options = options
         self.children = (agg,)
+        raise NotImplementedError("Rolling window not implemented")
 
 
 class GroupedRollingWindow(Expr):
@@ -909,6 +910,7 @@ def __init__(self, dtype: plc.DataType, options: Any, agg: Expr, *by: Expr) -> N
         super().__init__(dtype)
         self.options = options
         self.children = (agg, *by)
+        raise NotImplementedError("Grouped rolling window not implemented")
 
 
 class Cast(Expr):
diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py
index 3ccefac6b0a..b3dd6ae7cc3 100644
--- a/python/cudf_polars/cudf_polars/dsl/ir.py
+++ b/python/cudf_polars/cudf_polars/dsl/ir.py
@@ -427,8 +427,6 @@ def check_agg(agg: expr.Expr) -> int:
         if isinstance(agg, (expr.BinOp, expr.Cast)):
             return max(GroupBy.check_agg(child) for child in agg.children)
         elif isinstance(agg, expr.Agg):
-            if agg.name == "implode":
-                raise NotImplementedError("implode in groupby")
             return 1 + max(GroupBy.check_agg(child) for child in agg.children)
         elif isinstance(agg, (expr.Len, expr.Col, expr.Literal)):
             return 0
@@ -440,7 +438,9 @@ def __post_init__(self) -> None:
         if self.options.rolling is None and self.maintain_order:
             raise NotImplementedError("Maintaining order in groupby")
         if self.options.rolling:
-            raise NotImplementedError("rolling window/groupby")
+            raise NotImplementedError(
+                "rolling window/groupby"
+            )  # pragma: no cover; rollingwindow constructor has already raised
         if any(GroupBy.check_agg(a.value) > 1 for a in self.agg_requests):
             raise NotImplementedError("Nested aggregations in groupby")
         self.agg_infos = [req.collect_agg(depth=0) for req in self.agg_requests]
diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py
index 41bc3032bc5..5d289885f47 100644
--- a/python/cudf_polars/cudf_polars/dsl/translate.py
+++ b/python/cudf_polars/cudf_polars/dsl/translate.py
@@ -10,6 +10,7 @@
 from typing import Any
 
 import pyarrow as pa
+from typing_extensions import assert_never
 
 from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir
 
@@ -354,17 +355,20 @@ def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> ex
 @_translate_expr.register
 def _(node: pl_expr.Window, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
     # TODO: raise in groupby?
-    if node.partition_by is None:
+    if isinstance(node.options, pl_expr.RollingGroupOptions):
+        # pl.col("a").rolling(...)
         return expr.RollingWindow(
             dtype, node.options, translate_expr(visitor, n=node.function)
         )
-    else:
+    elif isinstance(node.options, pl_expr.WindowMapping):
+        # pl.col("a").over(...)
         return expr.GroupedRollingWindow(
             dtype,
             node.options,
             translate_expr(visitor, n=node.function),
             *(translate_expr(visitor, n=n) for n in node.partition_by),
         )
+    assert_never(node.options)
 
 
 @_translate_expr.register
diff --git a/python/cudf_polars/tests/expressions/test_rolling.py b/python/cudf_polars/tests/expressions/test_rolling.py
new file mode 100644
index 00000000000..d4920d35f14
--- /dev/null
+++ b/python/cudf_polars/tests/expressions/test_rolling.py
@@ -0,0 +1,41 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+import pytest
+
+import polars as pl
+
+from cudf_polars import translate_ir
+
+
+def test_rolling():
+    dates = [
+        "2020-01-01 13:45:48",
+        "2020-01-01 16:42:13",
+        "2020-01-01 16:45:09",
+        "2020-01-02 18:12:48",
+        "2020-01-03 19:45:32",
+        "2020-01-08 23:16:43",
+    ]
+    df = (
+        pl.DataFrame({"dt": dates, "a": [3, 7, 5, 9, 2, 1]})
+        .with_columns(pl.col("dt").str.strptime(pl.Datetime))
+        .lazy()
+    )
+    q = df.with_columns(
+        sum_a=pl.sum("a").rolling(index_column="dt", period="2d"),
+        min_a=pl.min("a").rolling(index_column="dt", period="2d"),
+        max_a=pl.max("a").rolling(index_column="dt", period="2d"),
+    )
+    with pytest.raises(NotImplementedError):
+        _ = translate_ir(q._ldf.visit())
+
+
+def test_grouped_rolling():
+    df = pl.LazyFrame({"a": [1, 2, 3, 4, 5, 6], "b": [1, 2, 1, 3, 1, 2]})
+
+    q = df.select(pl.col("a").min().over("b"))
+    with pytest.raises(NotImplementedError):
+        _ = translate_ir(q._ldf.visit())
diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py
index d06a7ecf105..e70f923b097 100644
--- a/python/cudf_polars/tests/test_groupby.py
+++ b/python/cudf_polars/tests/test_groupby.py
@@ -6,6 +6,7 @@
 
 import polars as pl
 
+from cudf_polars import translate_ir
 from cudf_polars.testing.asserts import assert_gpu_result_equal
 
 
@@ -43,6 +44,7 @@ def keys(request):
         [pl.col("float") + pl.col("int")],
         [pl.col("float").max() - pl.col("int").min()],
         [pl.col("float").mean(), pl.col("int").std()],
+        [(pl.col("float") - pl.lit(2)).max()],
     ],
     ids=lambda aggs: "-".join(map(str, aggs)),
 )
@@ -72,7 +74,28 @@ def test_groupby(df: pl.LazyFrame, maintain_order, keys, exprs):
     if not maintain_order:
         sort_keys = list(q.schema.keys())[: len(keys)]
         q = q.sort(*sort_keys)
-    # from cudf_polars.dsl.translate import translate_ir
-    # ir = translate_ir(q._ldf.visit())
-    # from IPython import embed; embed()
+
     assert_gpu_result_equal(q, check_exact=False)
+
+
+def test_groupby_len(df, keys):
+    q = df.group_by(*keys).agg(pl.len())
+
+    # TODO: polars returns UInt32, libcudf returns Int32
+    with pytest.raises(AssertionError):
+        assert_gpu_result_equal(q, check_row_order=False)
+    assert_gpu_result_equal(q, check_dtypes=False, check_row_order=False)
+
+
+@pytest.mark.parametrize(
+    "expr",
+    [
+        pl.col("float").is_not_null(),
+        (pl.col("int").max() + pl.col("float").min()).max(),
+    ],
+)
+def test_groupby_unsupported(df, expr):
+    q = df.group_by("key1").agg(expr)
+
+    with pytest.raises(NotImplementedError):
+        _ = translate_ir(q._ldf.visit())

From b9a0b72773a3adf4ba9ae267911d8970f0db53b0 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 24 Jun 2024 14:10:52 -1000
Subject: [PATCH 400/842] Prevent bad ColumnAccessor state after
 .sort_index(axis=1, ignore_index=True) (#16061)

Before `ColumnAccessor.names` was modified with new column labels without the `ColumnAccessor._data` dict keys being modified. This could hide a subtle bug if one later uses `ColumnAccessor._data.keys()` to access column labels instead of `ColumnAccessor.names`

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/16061
---
 python/cudf/cudf/core/indexed_frame.py   | 21 +++++++++++++++++----
 python/cudf/cudf/tests/test_dataframe.py |  6 ++++++
 2 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 7515cb2c177..5cae4a857ee 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -2727,11 +2727,24 @@ def sort_index(
             if ignore_index:
                 out = out.reset_index(drop=True)
         else:
-            labels = sorted(self._data.names, reverse=not ascending)
-            out = self[labels]
+            labels = sorted(self._column_names, reverse=not ascending)
+            result_columns = (self._data[label] for label in labels)
             if ignore_index:
-                out._data.rangeindex = True
-                out._data.names = list(range(self._num_columns))
+                ca = ColumnAccessor(
+                    dict(enumerate(result_columns)),
+                    rangeindex=True,
+                    verify=False,
+                )
+            else:
+                ca = ColumnAccessor(
+                    dict(zip(labels, result_columns)),
+                    rangeindex=self._data.rangeindex,
+                    multiindex=self._data.multiindex,
+                    level_names=self._data.level_names,
+                    label_dtype=self._data.label_dtype,
+                    verify=False,
+                )
+            out = self._from_data_like_self(ca)
 
         return self._mimic_inplace(out, inplace=inplace)
 
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 3661e13bd39..cfa2a4aa8fd 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -3660,6 +3660,12 @@ def test_dataframe_mulitindex_sort_index(
         assert_eq(expected, got)
 
 
+def test_sort_index_axis_1_ignore_index_true_columnaccessor_state_names():
+    gdf = cudf.DataFrame([[1, 2, 3]], columns=["b", "a", "c"])
+    result = gdf.sort_index(axis=1, ignore_index=True)
+    assert result._data.names == tuple(result._data.keys())
+
+
 @pytest.mark.parametrize("dtype", dtypes + ["category"])
 def test_dataframe_0_row_dtype(dtype):
     if dtype == "category":

From ac0f79a92a77ed15d03124f2a37fb5d4364e45db Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 25 Jun 2024 11:00:11 -0400
Subject: [PATCH 401/842] Improve multibyte-split byte-range performance
 (#16019)

Changes the `cudf::io::text::multibyte_split()` function to use `std::ifstream::seekg()` to skip bytes instead of `std::ifstream::ignore()` for a file input source.
The `seekg()` function is significantly faster for large files.

Also fixed the multibyte-split benchmark to correctly access the chars buffer after generating an input strings column.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Mark Harris (https://github.com/harrism)

URL: https://github.com/rapidsai/cudf/pull/16019
---
 cpp/benchmarks/io/text/multibyte_split.cpp      | 5 ++---
 cpp/src/io/text/data_chunk_source_factories.cpp | 8 ++++++--
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/cpp/benchmarks/io/text/multibyte_split.cpp b/cpp/benchmarks/io/text/multibyte_split.cpp
index 67705863d41..4bfef9767ca 100644
--- a/cpp/benchmarks/io/text/multibyte_split.cpp
+++ b/cpp/benchmarks/io/text/multibyte_split.cpp
@@ -85,8 +85,7 @@ static cudf::string_scalar create_random_input(int32_t num_chars,
 
   // extract the chars from the returned strings column.
   auto input_column_contents = input_column->release();
-  auto chars_column_contents = input_column_contents.children[1]->release();
-  auto chars_buffer          = chars_column_contents.data.release();
+  auto chars_buffer          = input_column_contents.data.release();
 
   // turn the chars in to a string scalar.
   return cudf::string_scalar(std::move(*chars_buffer));
@@ -218,7 +217,7 @@ NVBENCH_BENCH_TYPES(bench_multibyte_split,
 NVBENCH_BENCH_TYPES(bench_multibyte_split, NVBENCH_TYPE_AXES(source_type_list))
   .set_name("multibyte_split_source")
   .set_min_samples(4)
-  .add_int64_axis("strip_delimiters", {1})
+  .add_int64_axis("strip_delimiters", {0, 1})
   .add_int64_axis("delim_size", {1})
   .add_int64_axis("delim_percent", {1})
   .add_int64_power_of_two_axis("size_approx", {15, 30})
diff --git a/cpp/src/io/text/data_chunk_source_factories.cpp b/cpp/src/io/text/data_chunk_source_factories.cpp
index 596ca3458c8..58faa0ebfe4 100644
--- a/cpp/src/io/text/data_chunk_source_factories.cpp
+++ b/cpp/src/io/text/data_chunk_source_factories.cpp
@@ -120,7 +120,11 @@ class istream_data_chunk_reader : public data_chunk_reader {
   {
   }
 
-  void skip_bytes(std::size_t size) override { _datastream->ignore(size); };
+  void skip_bytes(std::size_t size) override
+  {
+    // 20% faster than _datastream->ignore(size) for large files
+    _datastream->seekg(_datastream->tellg() + static_cast<std::ifstream::pos_type>(size));
+  };
 
   std::unique_ptr<device_data_chunk> get_next_chunk(std::size_t read_size,
                                                     rmm::cuda_stream_view stream) override
@@ -265,7 +269,7 @@ class file_data_chunk_source : public data_chunk_source {
   [[nodiscard]] std::unique_ptr<data_chunk_reader> create_reader() const override
   {
     return std::make_unique<istream_data_chunk_reader>(
-      std::make_unique<std::ifstream>(_filename, std::ifstream::in));
+      std::make_unique<std::ifstream>(_filename, std::ifstream::in | std::ifstream::binary));
   }
 
  private:

From 1bc1f45e345387110796fae5553bb447223ac8d7 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Tue, 25 Jun 2024 16:14:54 +0100
Subject: [PATCH 402/842] fast_slow_proxy: Don't import assert_eq at top-level
 (#16063)

The testing._utils module imports pytest, which is not advertised as a default run dependency of cudf, so we must avoid importing it in the proxy wrappers at top-level.

Since what we need in the proxy wrappers for pandas debugging is the `assert_eq` function (which does not need pytest), move it to `testing.testing` (where it more naturally fits with the other assertion functions anyway). This removes the need for pytest when running the fast-slow-proxy wrappers.

- Closes #16062.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16063
---
 .../user_guide/api_docs/general_utilities.rst |   2 +
 .../cudf/_fuzz_testing/tests/fuzz_test_csv.py |   4 +-
 .../_fuzz_testing/tests/fuzz_test_json.py     |   4 +-
 python/cudf/cudf/_fuzz_testing/utils.py       |   2 +-
 python/cudf/cudf/pandas/fast_slow_proxy.py    |   2 +-
 python/cudf/cudf/testing/__init__.py          |   4 +-
 python/cudf/cudf/testing/_utils.py            |  78 -----------
 python/cudf/cudf/testing/testing.py           | 100 ++++++++++++++
 python/cudf/cudf/tests/conftest.py            |   4 +-
 .../cudf/tests/dataframe/test_conversion.py   |   4 +-
 .../tests/dataframe/test_io_serialization.py  |   2 +-
 .../cudf/tests/groupby/test_computation.py    |   4 +-
 .../cudf/tests/groupby/test_groupby_obj.py    |   2 +-
 .../cudf/cudf/tests/groupby/test_indexing.py  |   2 +-
 .../cudf/cudf/tests/groupby/test_transform.py |   2 +-
 .../tests/indexes/datetime/test_indexing.py   |   2 +-
 .../indexes/datetime/test_time_specific.py    |   2 +-
 .../cudf/cudf/tests/indexes/test_interval.py  |   2 +-
 .../cudf/cudf/tests/input_output/test_text.py |   4 +-
 .../cudf/cudf/tests/series/test_conversion.py |   4 +-
 .../cudf/tests/series/test_datetimelike.py    |   2 +-
 python/cudf/cudf/tests/test_apply_rows.py     |   5 +-
 python/cudf/cudf/tests/test_applymap.py       |   4 +-
 python/cudf/cudf/tests/test_array_function.py |   2 +-
 python/cudf/cudf/tests/test_array_ufunc.py    |   7 +-
 .../test_avro_reader_fastavro_integration.py  |   2 +-
 python/cudf/cudf/tests/test_binops.py         | 130 +++++++++---------
 python/cudf/cudf/tests/test_categorical.py    |   7 +-
 python/cudf/cudf/tests/test_column.py         |   3 +-
 .../cudf/cudf/tests/test_column_accessor.py   |   2 +-
 python/cudf/cudf/tests/test_concat.py         |   7 +-
 python/cudf/cudf/tests/test_contains.py       |   8 +-
 python/cudf/cudf/tests/test_copying.py        |   3 +-
 python/cudf/cudf/tests/test_csv.py            |   3 +-
 python/cudf/cudf/tests/test_cuda_apply.py     |   4 +-
 .../cudf/tests/test_cuda_array_interface.py   |   8 +-
 .../cudf/cudf/tests/test_custom_accessor.py   |   2 +-
 python/cudf/cudf/tests/test_cut.py            |   2 +-
 python/cudf/cudf/tests/test_dataframe.py      |   4 +-
 python/cudf/cudf/tests/test_dataframe_copy.py |   5 +-
 python/cudf/cudf/tests/test_datasets.py       |   2 +-
 python/cudf/cudf/tests/test_datetime.py       |   2 +-
 python/cudf/cudf/tests/test_decimal.py        |   4 +-
 python/cudf/cudf/tests/test_df_protocol.py    |   2 +-
 python/cudf/cudf/tests/test_dlpack.py         |   2 +-
 python/cudf/cudf/tests/test_dropna.py         |   2 +-
 python/cudf/cudf/tests/test_dtypes.py         |   2 +-
 python/cudf/cudf/tests/test_duplicates.py     |   3 +-
 python/cudf/cudf/tests/test_ewm.py            |   2 +-
 python/cudf/cudf/tests/test_factorize.py      |   2 +-
 python/cudf/cudf/tests/test_feather.py        |   5 +-
 python/cudf/cudf/tests/test_gcs.py            |   4 +-
 python/cudf/cudf/tests/test_groupby.py        |   2 +-
 python/cudf/cudf/tests/test_hdf.py            |   3 +-
 python/cudf/cudf/tests/test_hdfs.py           |   4 +-
 python/cudf/cudf/tests/test_index.py          |   2 +-
 python/cudf/cudf/tests/test_indexing.py       |   3 +-
 python/cudf/cudf/tests/test_interpolate.py    |   7 +-
 python/cudf/cudf/tests/test_interval.py       |   2 +-
 python/cudf/cudf/tests/test_join_order.py     |   2 +-
 python/cudf/cudf/tests/test_joining.py        |   2 +-
 python/cudf/cudf/tests/test_json.py           |   2 +-
 python/cudf/cudf/tests/test_list.py           |   8 +-
 python/cudf/cudf/tests/test_monotonic.py      |   2 +-
 python/cudf/cudf/tests/test_multiindex.py     |   8 +-
 python/cudf/cudf/tests/test_numerical.py      |   3 +-
 python/cudf/cudf/tests/test_numpy_interop.py  |   4 +-
 python/cudf/cudf/tests/test_onehot.py         |   2 +-
 python/cudf/cudf/tests/test_orc.py            |   3 +-
 python/cudf/cudf/tests/test_pack.py           |   2 +-
 python/cudf/cudf/tests/test_pandas_interop.py |   4 +-
 python/cudf/cudf/tests/test_parquet.py        |   8 +-
 python/cudf/cudf/tests/test_pickling.py       |   2 +-
 python/cudf/cudf/tests/test_quantiles.py      |   5 +-
 python/cudf/cudf/tests/test_query.py          |   4 +-
 python/cudf/cudf/tests/test_query_mask.py     |   4 +-
 python/cudf/cudf/tests/test_rank.py           |   3 +-
 python/cudf/cudf/tests/test_reductions.py     |   9 +-
 python/cudf/cudf/tests/test_replace.py        |   2 +-
 python/cudf/cudf/tests/test_resampling.py     |   2 +-
 python/cudf/cudf/tests/test_reshape.py        |   8 +-
 python/cudf/cudf/tests/test_rolling.py        |   2 +-
 python/cudf/cudf/tests/test_s3.py             |   2 +-
 python/cudf/cudf/tests/test_scan.py           |  10 +-
 python/cudf/cudf/tests/test_search.py         |   3 +-
 python/cudf/cudf/tests/test_serialize.py      |   3 +-
 python/cudf/cudf/tests/test_series.py         |   2 +-
 python/cudf/cudf/tests/test_seriesmap.py      |   5 +-
 python/cudf/cudf/tests/test_setitem.py        |   7 +-
 python/cudf/cudf/tests/test_sorting.py        |   2 +-
 python/cudf/cudf/tests/test_spilling.py       |   2 +-
 python/cudf/cudf/tests/test_stats.py          |   7 +-
 python/cudf/cudf/tests/test_string.py         |   2 +-
 python/cudf/cudf/tests/test_string_udfs.py    |   3 +-
 python/cudf/cudf/tests/test_struct.py         |   5 +-
 python/cudf/cudf/tests/test_testing.py        |   3 +-
 python/cudf/cudf/tests/test_timedelta.py      |   4 +-
 python/cudf/cudf/tests/test_udf_masked_ops.py |   2 +-
 python/cudf/cudf/tests/test_unaops.py         |   6 +-
 .../cudf/tests/text/test_subword_tokenizer.py |   2 +-
 .../cudf/cudf/tests/text/test_text_methods.py |   2 +-
 .../cudf_pandas_tests/test_cudf_pandas.py     |   2 +-
 .../custreamz/custreamz/tests/test_kafka.py   |   4 +-
 .../dask_cudf/tests/test_accessor.py          |   3 +-
 python/dask_cudf/dask_cudf/tests/test_core.py |   2 +-
 .../dask_cudf/tests/test_distributed.py       |   2 +-
 106 files changed, 328 insertions(+), 343 deletions(-)

diff --git a/docs/cudf/source/user_guide/api_docs/general_utilities.rst b/docs/cudf/source/user_guide/api_docs/general_utilities.rst
index d9c53c3fbbd..8d0edc0b100 100644
--- a/docs/cudf/source/user_guide/api_docs/general_utilities.rst
+++ b/docs/cudf/source/user_guide/api_docs/general_utilities.rst
@@ -8,6 +8,8 @@ Testing functions
    :toctree: api/
 
    cudf.testing.testing.assert_column_equal
+   cudf.testing.testing.assert_eq
    cudf.testing.testing.assert_frame_equal
    cudf.testing.testing.assert_index_equal
+   cudf.testing.testing.assert_neq
    cudf.testing.testing.assert_series_equal
diff --git a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_csv.py b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_csv.py
index f8f674fecec..d90f3ea1aca 100644
--- a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_csv.py
+++ b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_csv.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 import sys
 from io import StringIO
@@ -13,7 +13,7 @@
     compare_content,
     run_test,
 )
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
 
 
 @pythonfuzz(data_handle=CSVReader)
diff --git a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_json.py b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_json.py
index 2f5e6204f7c..69e9437be93 100644
--- a/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_json.py
+++ b/python/cudf/cudf/_fuzz_testing/tests/fuzz_test_json.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 import io
 import sys
@@ -9,7 +9,7 @@
 from cudf._fuzz_testing.json import JSONReader, JSONWriter
 from cudf._fuzz_testing.main import pythonfuzz
 from cudf._fuzz_testing.utils import ALL_POSSIBLE_VALUES, run_test
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
 
 
 @pythonfuzz(data_handle=JSONReader)
diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py
index d685174f3c2..e6dfe2eae62 100644
--- a/python/cudf/cudf/_fuzz_testing/utils.py
+++ b/python/cudf/cudf/_fuzz_testing/utils.py
@@ -8,7 +8,7 @@
 import pyarrow as pa
 
 import cudf
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
 from cudf.utils.dtypes import (
     pandas_dtypes_to_np_dtypes,
     pyarrow_dtypes_to_pandas_dtypes,
diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py
index 1540c6850e7..dfb729cae6b 100644
--- a/python/cudf/cudf/pandas/fast_slow_proxy.py
+++ b/python/cudf/cudf/pandas/fast_slow_proxy.py
@@ -17,7 +17,7 @@
 import numpy as np
 
 from ..options import _env_get_bool
-from ..testing._utils import assert_eq
+from ..testing import assert_eq
 from .annotation import nvtx
 
 
diff --git a/python/cudf/cudf/testing/__init__.py b/python/cudf/cudf/testing/__init__.py
index 1843344bc81..4e92b43b9f9 100644
--- a/python/cudf/cudf/testing/__init__.py
+++ b/python/cudf/cudf/testing/__init__.py
@@ -1,7 +1,9 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from cudf.testing.testing import (
+    assert_eq,
     assert_frame_equal,
     assert_index_equal,
+    assert_neq,
     assert_series_equal,
 )
diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py
index e067d15af4c..a6a2d4eea00 100644
--- a/python/cudf/cudf/testing/_utils.py
+++ b/python/cudf/cudf/testing/_utils.py
@@ -2,12 +2,10 @@
 
 import itertools
 import string
-import warnings
 from collections import abc
 from contextlib import contextmanager
 from decimal import Decimal
 
-import cupy
 import numpy as np
 import pandas as pd
 import pytest
@@ -15,7 +13,6 @@
 from numba.core.typing.templates import AbstractTemplate
 from numba.cuda.cudadecl import registry as cuda_decl_registry
 from numba.cuda.cudaimpl import lower as cuda_lower
-from pandas import testing as tm
 
 import cudf
 from cudf._lib.null_mask import bitmask_allocation_size_bytes
@@ -113,81 +110,6 @@ def count_zero(arr):
     return np.count_nonzero(arr == 0)
 
 
-def assert_eq(left, right, **kwargs):
-    """Assert that two cudf-like things are equivalent
-
-    This equality test works for pandas/cudf dataframes/series/indexes/scalars
-    in the same way, and so makes it easier to perform parametrized testing
-    without switching between assert_frame_equal/assert_series_equal/...
-    functions.
-    """
-    # dtypes that we support but Pandas doesn't will convert to
-    # `object`. Check equality before that happens:
-    if kwargs.get("check_dtype", True):
-        if hasattr(left, "dtype") and hasattr(right, "dtype"):
-            if isinstance(
-                left.dtype, cudf.core.dtypes._BaseDtype
-            ) and not isinstance(
-                left.dtype, cudf.CategoricalDtype
-            ):  # leave categorical comparison to Pandas
-                assert_eq(left.dtype, right.dtype)
-
-    if hasattr(left, "to_pandas"):
-        left = left.to_pandas()
-    if hasattr(right, "to_pandas"):
-        right = right.to_pandas()
-    if isinstance(left, cupy.ndarray):
-        left = cupy.asnumpy(left)
-    if isinstance(right, cupy.ndarray):
-        right = cupy.asnumpy(right)
-
-    if isinstance(left, (pd.DataFrame, pd.Series, pd.Index)):
-        # TODO: A warning is emitted from the function
-        # pandas.testing.assert_[series, frame, index]_equal for some inputs:
-        # "DeprecationWarning: elementwise comparison failed; this will raise
-        # an error in the future."
-        # or "FutureWarning: elementwise ..."
-        # This warning comes from a call from pandas to numpy. It is ignored
-        # here because it cannot be fixed within cudf.
-        with warnings.catch_warnings():
-            warnings.simplefilter(
-                "ignore", (DeprecationWarning, FutureWarning)
-            )
-            if isinstance(left, pd.DataFrame):
-                tm.assert_frame_equal(left, right, **kwargs)
-            elif isinstance(left, pd.Series):
-                tm.assert_series_equal(left, right, **kwargs)
-            else:
-                tm.assert_index_equal(left, right, **kwargs)
-
-    elif isinstance(left, np.ndarray) and isinstance(right, np.ndarray):
-        if np.issubdtype(left.dtype, np.floating) and np.issubdtype(
-            right.dtype, np.floating
-        ):
-            assert np.allclose(left, right, equal_nan=True)
-        else:
-            assert np.array_equal(left, right)
-    else:
-        # Use the overloaded __eq__ of the operands
-        if left == right:
-            return True
-        elif any(np.issubdtype(type(x), np.floating) for x in (left, right)):
-            np.testing.assert_almost_equal(left, right)
-        else:
-            np.testing.assert_equal(left, right)
-    return True
-
-
-def assert_neq(left, right, **kwargs):
-    __tracebackhide__ = True
-    try:
-        assert_eq(left, right, **kwargs)
-    except AssertionError:
-        pass
-    else:
-        raise AssertionError
-
-
 def assert_exceptions_equal(
     lfunc,
     rfunc,
diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py
index dffbbe92fc1..e56c8d867cb 100644
--- a/python/cudf/cudf/testing/testing.py
+++ b/python/cudf/cudf/testing/testing.py
@@ -2,9 +2,12 @@
 
 from __future__ import annotations
 
+import warnings
+
 import cupy as cp
 import numpy as np
 import pandas as pd
+from pandas import testing as tm
 
 import cudf
 from cudf._lib.unary import is_nan
@@ -708,3 +711,100 @@ def assert_frame_equal(
             atol=atol,
             obj=f'Column name="{col}"',
         )
+
+
+def assert_eq(left, right, **kwargs):
+    """Assert that two cudf-like things are equivalent
+
+    Parameters
+    ----------
+    left
+        Object to compare
+    right
+        Object to compare
+    kwargs
+        Keyword arguments to control behaviour of comparisons. See
+        :func:`assert_frame_equal`, :func:`assert_series_equal`, and
+        :func:`assert_index_equal`.
+
+    Notes
+    -----
+    This equality test works for pandas/cudf dataframes/series/indexes/scalars
+    in the same way, and so makes it easier to perform parametrized testing
+    without switching between assert_frame_equal/assert_series_equal/...
+    functions.
+
+    Raises
+    ------
+    AssertionError
+        If the two objects do not compare equal.
+    """
+    # dtypes that we support but Pandas doesn't will convert to
+    # `object`. Check equality before that happens:
+    if kwargs.get("check_dtype", True):
+        if hasattr(left, "dtype") and hasattr(right, "dtype"):
+            if isinstance(
+                left.dtype, cudf.core.dtypes._BaseDtype
+            ) and not isinstance(
+                left.dtype, cudf.CategoricalDtype
+            ):  # leave categorical comparison to Pandas
+                assert_eq(left.dtype, right.dtype)
+
+    if hasattr(left, "to_pandas"):
+        left = left.to_pandas()
+    if hasattr(right, "to_pandas"):
+        right = right.to_pandas()
+    if isinstance(left, cp.ndarray):
+        left = cp.asnumpy(left)
+    if isinstance(right, cp.ndarray):
+        right = cp.asnumpy(right)
+
+    if isinstance(left, (pd.DataFrame, pd.Series, pd.Index)):
+        # TODO: A warning is emitted from the function
+        # pandas.testing.assert_[series, frame, index]_equal for some inputs:
+        # "DeprecationWarning: elementwise comparison failed; this will raise
+        # an error in the future."
+        # or "FutureWarning: elementwise ..."
+        # This warning comes from a call from pandas to numpy. It is ignored
+        # here because it cannot be fixed within cudf.
+        with warnings.catch_warnings():
+            warnings.simplefilter(
+                "ignore", (DeprecationWarning, FutureWarning)
+            )
+            if isinstance(left, pd.DataFrame):
+                tm.assert_frame_equal(left, right, **kwargs)
+            elif isinstance(left, pd.Series):
+                tm.assert_series_equal(left, right, **kwargs)
+            else:
+                tm.assert_index_equal(left, right, **kwargs)
+
+    elif isinstance(left, np.ndarray) and isinstance(right, np.ndarray):
+        if np.issubdtype(left.dtype, np.floating) and np.issubdtype(
+            right.dtype, np.floating
+        ):
+            assert np.allclose(left, right, equal_nan=True)
+        else:
+            assert np.array_equal(left, right)
+    else:
+        # Use the overloaded __eq__ of the operands
+        if left == right:
+            return True
+        elif any(np.issubdtype(type(x), np.floating) for x in (left, right)):
+            np.testing.assert_almost_equal(left, right)
+        else:
+            np.testing.assert_equal(left, right)
+    return True
+
+
+def assert_neq(left, right, **kwargs):
+    """Assert that two cudf-like things are not equal.
+
+    Provides the negation of the meaning of :func:`assert_eq`.
+    """
+    __tracebackhide__ = True
+    try:
+        assert_eq(left, right, **kwargs)
+    except AssertionError:
+        pass
+    else:
+        raise AssertionError
diff --git a/python/cudf/cudf/tests/conftest.py b/python/cudf/cudf/tests/conftest.py
index 30d8f1c8422..437bc4cba67 100644
--- a/python/cudf/cudf/tests/conftest.py
+++ b/python/cudf/cudf/tests/conftest.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2022, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 
 import itertools
 import os
@@ -11,7 +11,7 @@
 import rmm  # noqa: F401
 
 import cudf
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
 
 _CURRENT_DIRECTORY = str(pathlib.Path(__file__).resolve().parent)
 
diff --git a/python/cudf/cudf/tests/dataframe/test_conversion.py b/python/cudf/cudf/tests/dataframe/test_conversion.py
index fa7e5ec1d4c..d1de7245634 100644
--- a/python/cudf/cudf/tests/dataframe/test_conversion.py
+++ b/python/cudf/cudf/tests/dataframe/test_conversion.py
@@ -1,9 +1,9 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 import pandas as pd
 import pytest
 
 import cudf
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
 
 
 def test_convert_dtypes():
diff --git a/python/cudf/cudf/tests/dataframe/test_io_serialization.py b/python/cudf/cudf/tests/dataframe/test_io_serialization.py
index ad81609470c..57948afe1d8 100644
--- a/python/cudf/cudf/tests/dataframe/test_io_serialization.py
+++ b/python/cudf/cudf/tests/dataframe/test_io_serialization.py
@@ -8,7 +8,7 @@
 import pytest
 
 import cudf
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
 
 
 @pytest.mark.parametrize(
diff --git a/python/cudf/cudf/tests/groupby/test_computation.py b/python/cudf/cudf/tests/groupby/test_computation.py
index 04c56ef7462..630fcdc4dce 100644
--- a/python/cudf/cudf/tests/groupby/test_computation.py
+++ b/python/cudf/cudf/tests/groupby/test_computation.py
@@ -1,9 +1,9 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 import pandas as pd
 import pytest
 
 import cudf
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
 
 
 @pytest.mark.parametrize("method", ["average", "min", "max", "first", "dense"])
diff --git a/python/cudf/cudf/tests/groupby/test_groupby_obj.py b/python/cudf/cudf/tests/groupby/test_groupby_obj.py
index 04b483e08dc..ab2b16d263c 100644
--- a/python/cudf/cudf/tests/groupby/test_groupby_obj.py
+++ b/python/cudf/cudf/tests/groupby/test_groupby_obj.py
@@ -2,7 +2,7 @@
 from numpy.testing import assert_array_equal
 
 import cudf
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
 
 
 def test_groupby_14955():
diff --git a/python/cudf/cudf/tests/groupby/test_indexing.py b/python/cudf/cudf/tests/groupby/test_indexing.py
index 57e8bc1c2d8..43b6183fca5 100644
--- a/python/cudf/cudf/tests/groupby/test_indexing.py
+++ b/python/cudf/cudf/tests/groupby/test_indexing.py
@@ -1,6 +1,6 @@
 # Copyright (c) 2023-2024, NVIDIA CORPORATION.
 import cudf
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
 
 
 def test_rank_return_type_compatible_mode():
diff --git a/python/cudf/cudf/tests/groupby/test_transform.py b/python/cudf/cudf/tests/groupby/test_transform.py
index 78d7fbfd879..f7138036ddf 100644
--- a/python/cudf/cudf/tests/groupby/test_transform.py
+++ b/python/cudf/cudf/tests/groupby/test_transform.py
@@ -4,7 +4,7 @@
 import pytest
 
 import cudf
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
 
 
 @pytest.fixture(params=[False, True], ids=["no-null-keys", "null-keys"])
diff --git a/python/cudf/cudf/tests/indexes/datetime/test_indexing.py b/python/cudf/cudf/tests/indexes/datetime/test_indexing.py
index ee4d0f7e816..4c0ce2ed191 100644
--- a/python/cudf/cudf/tests/indexes/datetime/test_indexing.py
+++ b/python/cudf/cudf/tests/indexes/datetime/test_indexing.py
@@ -4,7 +4,7 @@
 import pandas as pd
 
 import cudf
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
 
 
 def test_slice_datetimetz_index():
diff --git a/python/cudf/cudf/tests/indexes/datetime/test_time_specific.py b/python/cudf/cudf/tests/indexes/datetime/test_time_specific.py
index 77b32b8ce89..7cc629270b1 100644
--- a/python/cudf/cudf/tests/indexes/datetime/test_time_specific.py
+++ b/python/cudf/cudf/tests/indexes/datetime/test_time_specific.py
@@ -4,7 +4,7 @@
 import pandas as pd
 
 import cudf
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
 
 
 def test_tz_localize():
diff --git a/python/cudf/cudf/tests/indexes/test_interval.py b/python/cudf/cudf/tests/indexes/test_interval.py
index d59041e32d5..87b76ab7609 100644
--- a/python/cudf/cudf/tests/indexes/test_interval.py
+++ b/python/cudf/cudf/tests/indexes/test_interval.py
@@ -7,7 +7,7 @@
 import cudf
 from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
 from cudf.core.index import IntervalIndex, interval_range
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
 
 
 def test_interval_constructor_default_closed():
diff --git a/python/cudf/cudf/tests/input_output/test_text.py b/python/cudf/cudf/tests/input_output/test_text.py
index acba13bb5b0..e9406d080d4 100644
--- a/python/cudf/cudf/tests/input_output/test_text.py
+++ b/python/cudf/cudf/tests/input_output/test_text.py
@@ -1,11 +1,11 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 
 from io import StringIO
 
 import pytest
 
 import cudf
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
 
 
 @pytest.fixture(scope="module")
diff --git a/python/cudf/cudf/tests/series/test_conversion.py b/python/cudf/cudf/tests/series/test_conversion.py
index 43ac35e41a6..e1dd359e1ba 100644
--- a/python/cudf/cudf/tests/series/test_conversion.py
+++ b/python/cudf/cudf/tests/series/test_conversion.py
@@ -1,9 +1,9 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 import pandas as pd
 import pytest
 
 import cudf
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
 
 
 @pytest.mark.parametrize(
diff --git a/python/cudf/cudf/tests/series/test_datetimelike.py b/python/cudf/cudf/tests/series/test_datetimelike.py
index 302ef19852d..cea86a5499e 100644
--- a/python/cudf/cudf/tests/series/test_datetimelike.py
+++ b/python/cudf/cudf/tests/series/test_datetimelike.py
@@ -9,7 +9,7 @@
 
 import cudf
 from cudf import date_range
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
 
 
 def _get_all_zones():
diff --git a/python/cudf/cudf/tests/test_apply_rows.py b/python/cudf/cudf/tests/test_apply_rows.py
index 8870eb421c7..a11022c1a17 100644
--- a/python/cudf/cudf/tests/test_apply_rows.py
+++ b/python/cudf/cudf/tests/test_apply_rows.py
@@ -1,10 +1,11 @@
-# Copyright (c) 2019-2022, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 
 import pytest
 
 import cudf
 from cudf.core.column import column
-from cudf.testing._utils import assert_eq, gen_rand_series
+from cudf.testing import assert_eq
+from cudf.testing._utils import gen_rand_series
 
 
 def _kernel_multiply(a, b, out):
diff --git a/python/cudf/cudf/tests/test_applymap.py b/python/cudf/cudf/tests/test_applymap.py
index d720e6ce2ce..ce1dcce5887 100644
--- a/python/cudf/cudf/tests/test_applymap.py
+++ b/python/cudf/cudf/tests/test_applymap.py
@@ -4,7 +4,7 @@
 
 from cudf import NA, DataFrame
 from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
-from cudf.testing import _utils as utils
+from cudf.testing import assert_eq
 
 
 @pytest.mark.skipif(
@@ -46,7 +46,7 @@ def test_applymap_dataframe(data, func, na_action, request):
     with pytest.warns(FutureWarning):
         got = gdf.applymap(func, na_action=na_action)
 
-    utils.assert_eq(expect, got, check_dtype=False)
+    assert_eq(expect, got, check_dtype=False)
 
 
 def test_applymap_raise_cases():
diff --git a/python/cudf/cudf/tests/test_array_function.py b/python/cudf/cudf/tests/test_array_function.py
index e6b89e2c5fa..773141ee71a 100644
--- a/python/cudf/cudf/tests/test_array_function.py
+++ b/python/cudf/cudf/tests/test_array_function.py
@@ -5,7 +5,7 @@
 import pytest
 
 import cudf
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
 
 
 # To determine if NEP18 is available in the current version of NumPy we simply
diff --git a/python/cudf/cudf/tests/test_array_ufunc.py b/python/cudf/cudf/tests/test_array_ufunc.py
index b036c1f13f3..41b9188f036 100644
--- a/python/cudf/cudf/tests/test_array_ufunc.py
+++ b/python/cudf/cudf/tests/test_array_ufunc.py
@@ -15,11 +15,8 @@
     PANDAS_LT_300,
     PANDAS_VERSION,
 )
-from cudf.testing._utils import (
-    assert_eq,
-    expect_warning_if,
-    set_random_null_mask_inplace,
-)
+from cudf.testing import assert_eq
+from cudf.testing._utils import expect_warning_if, set_random_null_mask_inplace
 
 _UFUNCS = [
     obj
diff --git a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py
index 238e8d990cc..2ec1d1d2f28 100644
--- a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py
+++ b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py
@@ -23,7 +23,7 @@
 import pytest
 
 import cudf
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
 from cudf.testing.dataset_generator import rand_dataframe
 
 
diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py
index fa371914c3e..7d8c3b53115 100644
--- a/python/cudf/cudf/tests/test_binops.py
+++ b/python/cudf/cudf/tests/test_binops.py
@@ -15,7 +15,7 @@
 from cudf import Index, Series
 from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
 from cudf.core.buffer.spill_manager import get_global_manager
-from cudf.testing import _utils as utils
+from cudf.testing import _utils as utils, assert_eq
 from cudf.utils.dtypes import (
     BOOL_TYPES,
     DATETIME_TYPES,
@@ -194,7 +194,7 @@ def test_series_binop(binop, obj_class):
     if obj_class == "Index":
         result = Series(result)
 
-    utils.assert_eq(result, expect)
+    assert_eq(result, expect)
 
 
 @pytest.mark.parametrize("binop", _binops)
@@ -318,7 +318,7 @@ def test_series_compare_nulls(cmpop, dtypes):
     expect[expect_mask] = cmpop(lser[expect_mask], rser[expect_mask])
 
     got = cmpop(lser, rser)
-    utils.assert_eq(expect, got)
+    assert_eq(expect, got)
 
 
 @pytest.fixture
@@ -349,7 +349,7 @@ def test_str_series_compare_str(
         Series.from_pandas(str_series_cmp_data), "a"
     )
 
-    utils.assert_eq(expect, got.to_pandas(nullable=True))
+    assert_eq(expect, got.to_pandas(nullable=True))
 
 
 def test_str_series_compare_str_reflected(
@@ -360,7 +360,7 @@ def test_str_series_compare_str_reflected(
         "a", Series.from_pandas(str_series_cmp_data)
     )
 
-    utils.assert_eq(expect, got.to_pandas(nullable=True))
+    assert_eq(expect, got.to_pandas(nullable=True))
 
 
 def test_str_series_compare_num(
@@ -371,7 +371,7 @@ def test_str_series_compare_num(
         Series.from_pandas(str_series_cmp_data), cmp_scalar
     )
 
-    utils.assert_eq(expect, got.to_pandas(nullable=True))
+    assert_eq(expect, got.to_pandas(nullable=True))
 
 
 def test_str_series_compare_num_reflected(
@@ -382,7 +382,7 @@ def test_str_series_compare_num_reflected(
         cmp_scalar, Series.from_pandas(str_series_cmp_data)
     )
 
-    utils.assert_eq(expect, got.to_pandas(nullable=True))
+    assert_eq(expect, got.to_pandas(nullable=True))
 
 
 @pytest.mark.parametrize("obj_class", ["Series", "Index"])
@@ -612,12 +612,12 @@ def test_different_shapes_and_columns(binop):
     # Empty frame on the right side
     pd_frame = binop(pd.DataFrame({"x": [1, 2]}), pd.DataFrame({}))
     cd_frame = binop(cudf.DataFrame({"x": [1, 2]}), cudf.DataFrame({}))
-    utils.assert_eq(cd_frame, pd_frame)
+    assert_eq(cd_frame, pd_frame)
 
     # Empty frame on the left side
     pd_frame = pd.DataFrame({}) + pd.DataFrame({"x": [1, 2]})
     cd_frame = cudf.DataFrame({}) + cudf.DataFrame({"x": [1, 2]})
-    utils.assert_eq(cd_frame, pd_frame)
+    assert_eq(cd_frame, pd_frame)
 
     # Note: the below rely on a discrepancy between cudf and pandas
     # While pandas inserts columns in alphabetical order, cudf inserts in the
@@ -627,12 +627,12 @@ def test_different_shapes_and_columns(binop):
     # More rows on the left side
     pd_frame = pd.DataFrame({"x": [1, 2, 3]}) + pd.DataFrame({"y": [1, 2]})
     cd_frame = cudf.DataFrame({"x": [1, 2, 3]}) + cudf.DataFrame({"y": [1, 2]})
-    utils.assert_eq(cd_frame, pd_frame)
+    assert_eq(cd_frame, pd_frame)
 
     # More rows on the right side
     pd_frame = pd.DataFrame({"x": [1, 2]}) + pd.DataFrame({"y": [1, 2, 3]})
     cd_frame = cudf.DataFrame({"x": [1, 2]}) + cudf.DataFrame({"y": [1, 2, 3]})
-    utils.assert_eq(cd_frame, pd_frame)
+    assert_eq(cd_frame, pd_frame)
 
 
 @pytest.mark.parametrize("binop", _binops)
@@ -650,7 +650,7 @@ def test_different_shapes_and_same_columns(binop):
     )
     # cast x as float64 so it matches pandas dtype
     cd_frame["x"] = cd_frame["x"].astype(np.float64)
-    utils.assert_eq(cd_frame, pd_frame)
+    assert_eq(cd_frame, pd_frame)
 
 
 @pytest.mark.parametrize("binop", _binops)
@@ -680,7 +680,7 @@ def test_different_shapes_and_columns_with_unaligned_indices(binop):
     # cast x and y as float64 so it matches pandas dtype
     cd_frame["x"] = cd_frame["x"].astype(np.float64)
     cd_frame["y"] = cd_frame["y"].astype(np.float64)
-    utils.assert_eq(cd_frame, pd_frame)
+    assert_eq(cd_frame, pd_frame)
 
     pdf1 = pd.DataFrame({"x": [1, 1]}, index=["a", "a"])
     pdf2 = pd.DataFrame({"x": [2]}, index=["a"])
@@ -688,7 +688,7 @@ def test_different_shapes_and_columns_with_unaligned_indices(binop):
     gdf2 = cudf.DataFrame.from_pandas(pdf2)
     pd_frame = binop(pdf1, pdf2)
     cd_frame = binop(gdf1, gdf2)
-    utils.assert_eq(pd_frame, cd_frame)
+    assert_eq(pd_frame, cd_frame)
 
 
 @pytest.mark.parametrize(
@@ -717,12 +717,12 @@ def test_df_different_index_shape(df2, binop):
 def test_boolean_scalar_binop(op):
     psr = pd.Series(np.random.choice([True, False], 10))
     gsr = cudf.from_pandas(psr)
-    utils.assert_eq(op(psr, True), op(gsr, True))
-    utils.assert_eq(op(psr, False), op(gsr, False))
+    assert_eq(op(psr, True), op(gsr, True))
+    assert_eq(op(psr, False), op(gsr, False))
 
     # cuDF scalar
-    utils.assert_eq(op(psr, True), op(gsr, cudf.Scalar(True)))
-    utils.assert_eq(op(psr, False), op(gsr, cudf.Scalar(False)))
+    assert_eq(op(psr, True), op(gsr, cudf.Scalar(True)))
+    assert_eq(op(psr, False), op(gsr, cudf.Scalar(False)))
 
 
 @pytest.mark.parametrize("func", _operators_arithmetic)
@@ -747,7 +747,7 @@ def test_operator_func_between_series(dtype, func, has_nulls, fill_value):
         pdf_series_b, fill_value=fill_value
     )
 
-    utils.assert_eq(pdf_result, gdf_result)
+    assert_eq(pdf_result, gdf_result)
 
 
 @pytest.mark.parametrize("func", _operators_arithmetic)
@@ -773,7 +773,7 @@ def test_operator_func_series_and_scalar(
         scalar, fill_value=fill_value
     )
 
-    utils.assert_eq(pdf_series_result, gdf_series_result)
+    assert_eq(pdf_series_result, gdf_series_result)
 
 
 _permu_values = [0, 1, None, np.nan]
@@ -812,9 +812,9 @@ def test_operator_func_between_series_logical(
         and np.isnan(fill_value)
     ):
         with pytest.raises(AssertionError):
-            utils.assert_eq(expect, got)
+            assert_eq(expect, got)
         return
-    utils.assert_eq(expect, got)
+    assert_eq(expect, got)
 
 
 @pytest.mark.parametrize("dtype", ["float32", "float64"])
@@ -851,7 +851,7 @@ def test_operator_func_series_and_scalar_logical(
     expect = pdf_series_result
     got = gdf_series_result.to_pandas(nullable=True)
 
-    utils.assert_eq(expect, got)
+    assert_eq(expect, got)
 
 
 @pytest.mark.parametrize("func", _operators_arithmetic)
@@ -887,7 +887,7 @@ def gen_df():
     got = getattr(gdf1, func)(gdf2, fill_value=fill_value)
     expect = getattr(pdf1, func)(pdf2, fill_value=fill_value)[list(got._data)]
 
-    utils.assert_eq(expect, got)
+    assert_eq(expect, got)
 
 
 @pytest.mark.parametrize("func", _operators_comparison)
@@ -923,7 +923,7 @@ def gen_df():
     got = getattr(gdf1, func)(gdf2)
     expect = getattr(pdf1, func)(pdf2)[list(got._data)]
 
-    utils.assert_eq(expect, got)
+    assert_eq(expect, got)
 
 
 @pytest.mark.parametrize(
@@ -949,7 +949,7 @@ def gen_df():
 def test_binop_bool_uint(func, rhs):
     psr = pd.Series([True, False, False])
     gsr = cudf.from_pandas(psr)
-    utils.assert_eq(
+    assert_eq(
         getattr(psr, func)(rhs), getattr(gsr, func)(rhs), check_dtype=False
     )
 
@@ -977,7 +977,7 @@ def test_floordiv_zero_float64(series_dtype, divisor_dtype, scalar_divisor):
     else:
         pd_div = pd.Series([0], dtype=divisor_dtype)
         cudf_div = cudf.from_pandas(pd_div)
-    utils.assert_eq(sr // pd_div, cr // cudf_div)
+    assert_eq(sr // pd_div, cr // cudf_div)
 
 
 @pytest.mark.parametrize("scalar_divisor", [False, True])
@@ -1023,27 +1023,27 @@ def test_floordiv_zero_bool(scalar_divisor):
 def test_rmod_zero_nan(dtype):
     sr = pd.Series([1, 1, 0], dtype=dtype)
     cr = cudf.from_pandas(sr)
-    utils.assert_eq(1 % sr, 1 % cr)
+    assert_eq(1 % sr, 1 % cr)
     expected_dtype = np.float64 if cr.dtype.kind != "f" else dtype
-    utils.assert_eq(1 % cr, cudf.Series([0, 0, None], dtype=expected_dtype))
+    assert_eq(1 % cr, cudf.Series([0, 0, None], dtype=expected_dtype))
 
 
 def test_series_misc_binop():
     pds = pd.Series([1, 2, 4], name="abc xyz")
     gds = cudf.Series([1, 2, 4], name="abc xyz")
 
-    utils.assert_eq(pds + 1, gds + 1)
-    utils.assert_eq(1 + pds, 1 + gds)
+    assert_eq(pds + 1, gds + 1)
+    assert_eq(1 + pds, 1 + gds)
 
-    utils.assert_eq(pds + pds, gds + gds)
+    assert_eq(pds + pds, gds + gds)
 
     pds1 = pd.Series([1, 2, 4], name="hello world")
     gds1 = cudf.Series([1, 2, 4], name="hello world")
 
-    utils.assert_eq(pds + pds1, gds + gds1)
-    utils.assert_eq(pds1 + pds, gds1 + gds)
+    assert_eq(pds + pds1, gds + gds1)
+    assert_eq(pds1 + pds, gds1 + gds)
 
-    utils.assert_eq(pds1 + pds + 5, gds1 + gds + 5)
+    assert_eq(pds1 + pds + 5, gds1 + gds + 5)
 
 
 def test_int8_float16_binop():
@@ -1051,7 +1051,7 @@ def test_int8_float16_binop():
     b = np.float16(2)
     expect = cudf.Series([0.5])
     got = a / b
-    utils.assert_eq(expect, got, check_dtype=False)
+    assert_eq(expect, got, check_dtype=False)
 
 
 @pytest.mark.parametrize("dtype", ["int64", "float64", "str"])
@@ -1061,7 +1061,7 @@ def test_vector_to_none_binops(dtype):
     expect = Series([None] * 4).astype(dtype)
     got = data + None
 
-    utils.assert_eq(expect, got)
+    assert_eq(expect, got)
 
 
 def dtype_scalar(val, dtype):
@@ -1747,12 +1747,12 @@ def test_datetime_dateoffset_binaryop(
     expect = op(psr, poffset)
     got = op(gsr, goffset)
 
-    utils.assert_eq(expect, got)
+    assert_eq(expect, got)
 
     expect = op(psr, -poffset)
     got = op(gsr, -goffset)
 
-    utils.assert_eq(expect, got)
+    assert_eq(expect, got)
 
 
 @pytest.mark.parametrize(
@@ -1793,7 +1793,7 @@ def test_datetime_dateoffset_binaryop_multiple(date_col, kwargs, op):
     expect = op(psr, poffset)
     got = op(gsr, goffset)
 
-    utils.assert_eq(expect, got)
+    assert_eq(expect, got)
 
 
 @pytest.mark.parametrize("n_periods", [0, 1, -1, 12, -12])
@@ -1840,7 +1840,7 @@ def test_datetime_dateoffset_binaryop_reflected(
 
     # TODO: Remove check_dtype once we get some clarity on:
     # https://github.com/pandas-dev/pandas/issues/57448
-    utils.assert_eq(expect, got, check_dtype=False)
+    assert_eq(expect, got, check_dtype=False)
 
     with pytest.raises(TypeError):
         poffset - psr
@@ -1878,7 +1878,7 @@ def test_binops_with_lhs_numpy_scalar(frame, dtype):
     expected = data.to_pandas() == val
     got = data == val
 
-    utils.assert_eq(expected, got)
+    assert_eq(expected, got)
 
 
 @pytest.mark.parametrize(
@@ -2302,7 +2302,7 @@ def test_binops_decimal(op, lhs, l_dtype, rhs, r_dtype, expect, expect_dtype):
 
     got = op(a, b)
     assert expect.dtype == got.dtype
-    utils.assert_eq(expect, got)
+    assert_eq(expect, got)
 
 
 @pytest.mark.parametrize(
@@ -2355,7 +2355,7 @@ def test_binops_reflect_decimal(
 
     got = getattr(a, op)(b)
     assert expect.dtype == got.dtype
-    utils.assert_eq(expect, got)
+    assert_eq(expect, got)
 
 
 @pytest.mark.parametrize("powers", [0, 1, 2, 3])
@@ -2371,7 +2371,7 @@ def test_binops_decimal_pow(powers):
     )
     ps = s.to_pandas()
 
-    utils.assert_eq(s**powers, ps**powers, check_dtype=False)
+    assert_eq(s**powers, ps**powers, check_dtype=False)
 
 
 def test_binops_raise_error():
@@ -2554,7 +2554,7 @@ def test_binops_decimal_comp_mixed_integer(args, integer_dtype, reflected):
 
     actual = op(lhs, rhs)
 
-    utils.assert_eq(expected, actual)
+    assert_eq(expected, actual)
 
 
 @pytest.mark.parametrize(
@@ -2804,7 +2804,7 @@ def decimal_series(input, dtype):
 
     got = op(lhs, rhs)
     assert expect.dtype == got.dtype
-    utils.assert_eq(expect, got)
+    assert_eq(expect, got)
 
 
 @pytest.mark.parametrize(
@@ -2979,7 +2979,7 @@ def test_binops_decimal_scalar_compare(args, reflected):
 
     actual = op(lhs, rhs)
 
-    utils.assert_eq(expected, actual)
+    assert_eq(expected, actual)
 
 
 @pytest.mark.parametrize(
@@ -3042,7 +3042,7 @@ def test_equality_ops_index_mismatch(fn):
     expected = getattr(pa, fn)(pb)
     actual = getattr(a, fn)(b).to_pandas(nullable=True)
 
-    utils.assert_eq(expected, actual)
+    assert_eq(expected, actual)
 
 
 def generate_test_null_equals_columnops_data():
@@ -3132,7 +3132,7 @@ def test_empty_column(binop, data, scalar):
     got = binop(gdf, scalar)
     expected = binop(pdf, scalar)
 
-    utils.assert_eq(expected, got)
+    assert_eq(expected, got)
 
 
 @pytest.mark.parametrize(
@@ -3179,7 +3179,7 @@ def test_binops_dot(df, other):
     expected = pdf @ host_other
     got = df @ other
 
-    utils.assert_eq(expected, got)
+    assert_eq(expected, got)
 
 
 def test_binop_dot_preserve_index():
@@ -3187,7 +3187,7 @@ def test_binop_dot_preserve_index():
     df = cudf.DataFrame(np.eye(2), columns=["A", "B"], index=["A", "B"])
     result = ser @ df
     expected = ser.to_pandas() @ df.to_pandas()
-    utils.assert_eq(result, expected)
+    assert_eq(result, expected)
 
 
 def test_binop_series_with_repeated_index():
@@ -3198,7 +3198,7 @@ def test_binop_series_with_repeated_index():
     gsr2 = cudf.from_pandas(psr2)
     expected = psr1 - psr2
     got = gsr1 - gsr2
-    utils.assert_eq(expected, got)
+    assert_eq(expected, got)
 
 
 def test_binop_integer_power_series_series():
@@ -3209,7 +3209,7 @@ def test_binop_integer_power_series_series():
     ps_exponent = gs_exponent.to_pandas()
     expected = ps_base**ps_exponent
     got = gs_base**gs_exponent
-    utils.assert_eq(expected, got)
+    assert_eq(expected, got)
 
 
 def test_binop_integer_power_series_scalar():
@@ -3219,7 +3219,7 @@ def test_binop_integer_power_series_scalar():
     ps_base = gs_base.to_pandas()
     expected = ps_base**exponent.value
     got = gs_base**exponent
-    utils.assert_eq(expected, got)
+    assert_eq(expected, got)
 
 
 def test_binop_integer_power_series_int():
@@ -3229,7 +3229,7 @@ def test_binop_integer_power_series_int():
     ps_base = gs_base.to_pandas()
     expected = ps_base**exponent
     got = gs_base**exponent
-    utils.assert_eq(expected, got)
+    assert_eq(expected, got)
 
 
 def test_binop_integer_power_scalar_series():
@@ -3239,7 +3239,7 @@ def test_binop_integer_power_scalar_series():
     ps_exponent = gs_exponent.to_pandas()
     expected = base.value**ps_exponent
     got = base**gs_exponent
-    utils.assert_eq(expected, got)
+    assert_eq(expected, got)
 
 
 def test_binop_integer_power_scalar_scalar():
@@ -3248,7 +3248,7 @@ def test_binop_integer_power_scalar_scalar():
     exponent = cudf.Scalar(1)
     expected = base.value**exponent.value
     got = base**exponent
-    utils.assert_eq(expected, got)
+    assert_eq(expected, got)
 
 
 def test_binop_integer_power_scalar_int():
@@ -3257,7 +3257,7 @@ def test_binop_integer_power_scalar_int():
     exponent = 1
     expected = base.value**exponent
     got = base**exponent
-    utils.assert_eq(expected, got)
+    assert_eq(expected, got)
 
 
 def test_binop_integer_power_int_series():
@@ -3267,7 +3267,7 @@ def test_binop_integer_power_int_series():
     ps_exponent = gs_exponent.to_pandas()
     expected = base**ps_exponent
     got = base**gs_exponent
-    utils.assert_eq(expected, got)
+    assert_eq(expected, got)
 
 
 def test_binop_integer_power_int_scalar():
@@ -3276,7 +3276,7 @@ def test_binop_integer_power_int_scalar():
     exponent = cudf.Scalar(1)
     expected = base**exponent.value
     got = base**exponent
-    utils.assert_eq(expected, got)
+    assert_eq(expected, got)
 
 
 def test_numpy_int_scalar_binop():
@@ -3291,7 +3291,7 @@ def test_binop_index_series(op):
     actual = op(gi, gs)
     expected = op(gi.to_pandas(), gs.to_pandas())
 
-    utils.assert_eq(expected, actual)
+    assert_eq(expected, actual)
 
 
 @pytest.mark.parametrize("name1", utils.SERIES_OR_INDEX_NAMES)
@@ -3307,7 +3307,7 @@ def test_binop_index_dt_td_series_with_names(name1, name2):
         expected = gi.to_pandas() + gs.to_pandas()
     actual = gi + gs
 
-    utils.assert_eq(expected, actual)
+    assert_eq(expected, actual)
 
 
 @pytest.mark.parametrize("data1", [[1, 2, 3], [10, 11, None]])
@@ -3319,9 +3319,9 @@ def test_binop_eq_ne_index_series(data1, data2):
     actual = gi == gs
     expected = gi.to_pandas() == gs.to_pandas()
 
-    utils.assert_eq(expected, actual)
+    assert_eq(expected, actual)
 
     actual = gi != gs
     expected = gi.to_pandas() != gs.to_pandas()
 
-    utils.assert_eq(expected, actual)
+    assert_eq(expected, actual)
diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py
index c36595192e4..9b6029582ce 100644
--- a/python/cudf/cudf/tests/test_categorical.py
+++ b/python/cudf/cudf/tests/test_categorical.py
@@ -11,11 +11,8 @@
 import pytest
 
 import cudf
-from cudf.testing._utils import (
-    NUMERIC_TYPES,
-    assert_eq,
-    assert_exceptions_equal,
-)
+from cudf.testing import assert_eq
+from cudf.testing._utils import NUMERIC_TYPES, assert_exceptions_equal
 
 
 @contextmanager
diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py
index a8a297c155f..ea919c786b9 100644
--- a/python/cudf/cudf/tests/test_column.py
+++ b/python/cudf/cudf/tests/test_column.py
@@ -9,7 +9,8 @@
 import cudf
 from cudf._lib.transform import mask_to_bools
 from cudf.core.column.column import as_column
-from cudf.testing._utils import assert_eq, assert_exceptions_equal
+from cudf.testing import assert_eq
+from cudf.testing._utils import assert_exceptions_equal
 from cudf.utils import dtypes as dtypeutils
 
 dtypes = sorted(
diff --git a/python/cudf/cudf/tests/test_column_accessor.py b/python/cudf/cudf/tests/test_column_accessor.py
index f1f6097d6a9..f3343c37d1d 100644
--- a/python/cudf/cudf/tests/test_column_accessor.py
+++ b/python/cudf/cudf/tests/test_column_accessor.py
@@ -6,7 +6,7 @@
 
 import cudf
 from cudf.core.column_accessor import ColumnAccessor
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
 
 simple_test_data = [
     {},
diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py
index 4b43a33c8c8..c1c03de48d4 100644
--- a/python/cudf/cudf/tests/test_concat.py
+++ b/python/cudf/cudf/tests/test_concat.py
@@ -10,11 +10,8 @@
 
 import cudf
 from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype
-from cudf.testing._utils import (
-    assert_eq,
-    assert_exceptions_equal,
-    expect_warning_if,
-)
+from cudf.testing import assert_eq
+from cudf.testing._utils import assert_exceptions_equal, expect_warning_if
 
 
 @contextmanager
diff --git a/python/cudf/cudf/tests/test_contains.py b/python/cudf/cudf/tests/test_contains.py
index a65ab1780b6..fe86df99d35 100644
--- a/python/cudf/cudf/tests/test_contains.py
+++ b/python/cudf/cudf/tests/test_contains.py
@@ -9,12 +9,8 @@
 import cudf
 from cudf import Series
 from cudf.core.index import Index, RangeIndex
-from cudf.testing._utils import (
-    DATETIME_TYPES,
-    NUMERIC_TYPES,
-    TIMEDELTA_TYPES,
-    assert_eq,
-)
+from cudf.testing import assert_eq
+from cudf.testing._utils import DATETIME_TYPES, NUMERIC_TYPES, TIMEDELTA_TYPES
 
 
 def cudf_date_series(start, stop, freq):
diff --git a/python/cudf/cudf/tests/test_copying.py b/python/cudf/cudf/tests/test_copying.py
index 0bc9ffa8004..9b6f82ec705 100644
--- a/python/cudf/cudf/tests/test_copying.py
+++ b/python/cudf/cudf/tests/test_copying.py
@@ -8,7 +8,8 @@
 import cudf
 from cudf import Series
 from cudf.core.buffer.spill_manager import get_global_manager
-from cudf.testing._utils import NUMERIC_TYPES, OTHER_TYPES, assert_eq
+from cudf.testing import assert_eq
+from cudf.testing._utils import NUMERIC_TYPES, OTHER_TYPES
 
 pytestmark = pytest.mark.spilling
 
diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
index 5009a7f2628..09617306606 100644
--- a/python/cudf/cudf/tests/test_csv.py
+++ b/python/cudf/cudf/tests/test_csv.py
@@ -18,7 +18,8 @@
 import cudf
 from cudf import read_csv
 from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
-from cudf.testing._utils import assert_eq, assert_exceptions_equal
+from cudf.testing import assert_eq
+from cudf.testing._utils import assert_exceptions_equal
 
 
 def make_numeric_dataframe(nrows, dtype):
diff --git a/python/cudf/cudf/tests/test_cuda_apply.py b/python/cudf/cudf/tests/test_cuda_apply.py
index 7fdf9754534..dc892caba3b 100644
--- a/python/cudf/cudf/tests/test_cuda_apply.py
+++ b/python/cudf/cudf/tests/test_cuda_apply.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2022, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
 
 """
 Test method that apply GPU kernel to a frame.
@@ -9,7 +9,7 @@
 from numba import cuda
 
 from cudf import DataFrame
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
 
 
 @pytest.mark.parametrize("nelem", [1, 2, 64, 128, 129])
diff --git a/python/cudf/cudf/tests/test_cuda_array_interface.py b/python/cudf/cudf/tests/test_cuda_array_interface.py
index 06d63561fc1..29f2f46e3c7 100644
--- a/python/cudf/cudf/tests/test_cuda_array_interface.py
+++ b/python/cudf/cudf/tests/test_cuda_array_interface.py
@@ -11,12 +11,8 @@
 
 import cudf
 from cudf.core.buffer.spill_manager import get_global_manager
-from cudf.testing._utils import (
-    DATETIME_TYPES,
-    NUMERIC_TYPES,
-    TIMEDELTA_TYPES,
-    assert_eq,
-)
+from cudf.testing import assert_eq
+from cudf.testing._utils import DATETIME_TYPES, NUMERIC_TYPES, TIMEDELTA_TYPES
 
 
 @pytest.mark.parametrize("dtype", NUMERIC_TYPES + DATETIME_TYPES)
diff --git a/python/cudf/cudf/tests/test_custom_accessor.py b/python/cudf/cudf/tests/test_custom_accessor.py
index 5ffe255d0f8..278e63f3e8b 100644
--- a/python/cudf/cudf/tests/test_custom_accessor.py
+++ b/python/cudf/cudf/tests/test_custom_accessor.py
@@ -4,7 +4,7 @@
 import pytest
 
 import cudf
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
 
 
 @cudf.api.extensions.register_dataframe_accessor("point")
diff --git a/python/cudf/cudf/tests/test_cut.py b/python/cudf/cudf/tests/test_cut.py
index 24c1eaa8f02..3f31da035aa 100644
--- a/python/cudf/cudf/tests/test_cut.py
+++ b/python/cudf/cudf/tests/test_cut.py
@@ -9,7 +9,7 @@
 import pytest
 
 from cudf.core.cut import cut
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
 
 
 @pytest.mark.parametrize(
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index cfa2a4aa8fd..05ee8346afa 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -30,14 +30,12 @@
 from cudf.core.buffer.spill_manager import get_global_manager
 from cudf.core.column import column
 from cudf.errors import MixedTypeError
-from cudf.testing import _utils as utils
+from cudf.testing import _utils as utils, assert_eq, assert_neq
 from cudf.testing._utils import (
     ALL_TYPES,
     DATETIME_TYPES,
     NUMERIC_TYPES,
-    assert_eq,
     assert_exceptions_equal,
-    assert_neq,
     does_not_raise,
     expect_warning_if,
     gen_rand,
diff --git a/python/cudf/cudf/tests/test_dataframe_copy.py b/python/cudf/cudf/tests/test_dataframe_copy.py
index fec52d82ab1..45bd31ef58e 100644
--- a/python/cudf/cudf/tests/test_dataframe_copy.py
+++ b/python/cudf/cudf/tests/test_dataframe_copy.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2023, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
 from copy import copy, deepcopy
 
 import cupy as cp
@@ -7,7 +7,8 @@
 import pytest
 
 from cudf.core.dataframe import DataFrame
-from cudf.testing._utils import ALL_TYPES, assert_eq, assert_neq
+from cudf.testing import assert_eq, assert_neq
+from cudf.testing._utils import ALL_TYPES
 
 """
 DataFrame copy expectations
diff --git a/python/cudf/cudf/tests/test_datasets.py b/python/cudf/cudf/tests/test_datasets.py
index 8e5e5ab66c4..7f4e249a6d7 100644
--- a/python/cudf/cudf/tests/test_datasets.py
+++ b/python/cudf/cudf/tests/test_datasets.py
@@ -3,7 +3,7 @@
 import numpy as np
 
 import cudf
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
 
 
 def test_dataset_timeseries():
diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py
index e3ecaafae5b..092e9790c63 100644
--- a/python/cudf/cudf/tests/test_datetime.py
+++ b/python/cudf/cudf/tests/test_datetime.py
@@ -15,10 +15,10 @@
 from cudf import DataFrame, Series
 from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
 from cudf.core.index import DatetimeIndex
+from cudf.testing import assert_eq
 from cudf.testing._utils import (
     DATETIME_TYPES,
     NUMERIC_TYPES,
-    assert_eq,
     assert_exceptions_equal,
     expect_warning_if,
 )
diff --git a/python/cudf/cudf/tests/test_decimal.py b/python/cudf/cudf/tests/test_decimal.py
index 0745e5aba48..c41a938f6ea 100644
--- a/python/cudf/cudf/tests/test_decimal.py
+++ b/python/cudf/cudf/tests/test_decimal.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2023, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
 import decimal
 from decimal import Decimal
@@ -11,12 +11,12 @@
 import cudf
 from cudf.core.column import Decimal32Column, Decimal64Column, NumericalColumn
 from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype
+from cudf.testing import assert_eq
 from cudf.testing._utils import (
     FLOAT_TYPES,
     INTEGER_TYPES,
     SIGNED_TYPES,
     _decimal_series,
-    assert_eq,
     expect_warning_if,
 )
 
diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py
index 8ce4da792a4..7f48e414180 100644
--- a/python/cudf/cudf/tests/test_df_protocol.py
+++ b/python/cudf/cudf/tests/test_df_protocol.py
@@ -20,7 +20,7 @@
     from_dataframe,
     protocol_dtype_to_cupy_dtype,
 )
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
 
 
 @pytest.fixture(
diff --git a/python/cudf/cudf/tests/test_dlpack.py b/python/cudf/cudf/tests/test_dlpack.py
index 7ea3979b0f1..ebcc35784ee 100644
--- a/python/cudf/cudf/tests/test_dlpack.py
+++ b/python/cudf/cudf/tests/test_dlpack.py
@@ -9,7 +9,7 @@
 from packaging import version
 
 import cudf
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
 
 nelems = [0, 3, 10]
 dtype = [np.uint16, np.int32, np.float64]
diff --git a/python/cudf/cudf/tests/test_dropna.py b/python/cudf/cudf/tests/test_dropna.py
index c3c8ed922f0..ed0cf0053ea 100644
--- a/python/cudf/cudf/tests/test_dropna.py
+++ b/python/cudf/cudf/tests/test_dropna.py
@@ -5,7 +5,7 @@
 import pytest
 
 import cudf
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
 
 
 @pytest.mark.parametrize(
diff --git a/python/cudf/cudf/tests/test_dtypes.py b/python/cudf/cudf/tests/test_dtypes.py
index 0efd8d9781c..edb534a3618 100644
--- a/python/cudf/cudf/tests/test_dtypes.py
+++ b/python/cudf/cudf/tests/test_dtypes.py
@@ -17,7 +17,7 @@
     ListDtype,
     StructDtype,
 )
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
 from cudf.utils.dtypes import np_to_pa_dtype
 
 
diff --git a/python/cudf/cudf/tests/test_duplicates.py b/python/cudf/cudf/tests/test_duplicates.py
index 161b245953b..0b4ed52ba96 100644
--- a/python/cudf/cudf/tests/test_duplicates.py
+++ b/python/cudf/cudf/tests/test_duplicates.py
@@ -9,7 +9,8 @@
 
 import cudf
 from cudf import concat
-from cudf.testing._utils import assert_eq, assert_exceptions_equal
+from cudf.testing import assert_eq
+from cudf.testing._utils import assert_exceptions_equal
 
 # most tests are similar to pandas drop_duplicates
 
diff --git a/python/cudf/cudf/tests/test_ewm.py b/python/cudf/cudf/tests/test_ewm.py
index 0861d2363ce..6cb3c19d5a8 100644
--- a/python/cudf/cudf/tests/test_ewm.py
+++ b/python/cudf/cudf/tests/test_ewm.py
@@ -2,7 +2,7 @@
 import pytest
 
 import cudf
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
 
 
 @pytest.mark.parametrize(
diff --git a/python/cudf/cudf/tests/test_factorize.py b/python/cudf/cudf/tests/test_factorize.py
index f8782681f62..47f9180dcb1 100644
--- a/python/cudf/cudf/tests/test_factorize.py
+++ b/python/cudf/cudf/tests/test_factorize.py
@@ -7,7 +7,7 @@
 
 import cudf
 from cudf import DataFrame, Index
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
 
 
 @pytest.mark.parametrize("ncats,nelem", [(2, 2), (2, 10), (10, 100)])
diff --git a/python/cudf/cudf/tests/test_feather.py b/python/cudf/cudf/tests/test_feather.py
index 12a325fa4e8..7e5523bb8c7 100644
--- a/python/cudf/cudf/tests/test_feather.py
+++ b/python/cudf/cudf/tests/test_feather.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2023, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
 
 import os
 from string import ascii_letters
@@ -9,7 +9,8 @@
 import pytest
 
 import cudf
-from cudf.testing._utils import NUMERIC_TYPES, assert_eq
+from cudf.testing import assert_eq
+from cudf.testing._utils import NUMERIC_TYPES
 
 
 @pytest.fixture(params=[0, 1, 10, 100])
diff --git a/python/cudf/cudf/tests/test_gcs.py b/python/cudf/cudf/tests/test_gcs.py
index a677ace18ec..fc22d8bc0ea 100644
--- a/python/cudf/cudf/tests/test_gcs.py
+++ b/python/cudf/cudf/tests/test_gcs.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 import io
 import os
@@ -8,7 +8,7 @@
 import pytest
 
 import cudf
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
 
 gcsfs = pytest.importorskip("gcsfs")
 
diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py
index 674f694a224..826a0e52f57 100644
--- a/python/cudf/cudf/tests/test_groupby.py
+++ b/python/cudf/cudf/tests/test_groupby.py
@@ -28,11 +28,11 @@
 from cudf.core.udf._ops import arith_ops, comparison_ops, unary_ops
 from cudf.core.udf.groupby_typing import SUPPORTED_GROUPBY_NUMPY_TYPES
 from cudf.core.udf.utils import UDFError, precompiled
+from cudf.testing import assert_eq
 from cudf.testing._utils import (
     DATETIME_TYPES,
     SIGNED_TYPES,
     TIMEDELTA_TYPES,
-    assert_eq,
     assert_exceptions_equal,
     expect_warning_if,
 )
diff --git a/python/cudf/cudf/tests/test_hdf.py b/python/cudf/cudf/tests/test_hdf.py
index d420c95cfb4..430ed973f19 100644
--- a/python/cudf/cudf/tests/test_hdf.py
+++ b/python/cudf/cudf/tests/test_hdf.py
@@ -8,7 +8,8 @@
 import pytest
 
 import cudf
-from cudf.testing._utils import NUMERIC_TYPES, UNSIGNED_TYPES, assert_eq
+from cudf.testing import assert_eq
+from cudf.testing._utils import NUMERIC_TYPES, UNSIGNED_TYPES
 
 pytest.importorskip("tables")
 
diff --git a/python/cudf/cudf/tests/test_hdfs.py b/python/cudf/cudf/tests/test_hdfs.py
index f8de16f8609..098b5192d4a 100644
--- a/python/cudf/cudf/tests/test_hdfs.py
+++ b/python/cudf/cudf/tests/test_hdfs.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 import os
 from io import BytesIO
@@ -10,7 +10,7 @@
 import pytest
 
 import cudf
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
 
 if not os.environ.get("RUN_HDFS_TESTS"):
     pytestmark = pytest.mark.skip("Env not configured to run HDFS tests")
diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py
index a59836df5ba..05dcd85df6a 100644
--- a/python/cudf/cudf/tests/test_index.py
+++ b/python/cudf/cudf/tests/test_index.py
@@ -18,6 +18,7 @@
 from cudf.api.extensions import no_default
 from cudf.api.types import is_bool_dtype
 from cudf.core.index import CategoricalIndex, DatetimeIndex, Index, RangeIndex
+from cudf.testing import assert_eq
 from cudf.testing._utils import (
     ALL_TYPES,
     FLOAT_TYPES,
@@ -28,7 +29,6 @@
     UNSIGNED_TYPES,
     assert_column_memory_eq,
     assert_column_memory_ne,
-    assert_eq,
     assert_exceptions_equal,
     expect_warning_if,
 )
diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py
index 009e48a8669..7005cbc6834 100644
--- a/python/cudf/cudf/tests/test_indexing.py
+++ b/python/cudf/cudf/tests/test_indexing.py
@@ -11,10 +11,9 @@
 
 import cudf
 from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
-from cudf.testing import _utils as utils
+from cudf.testing import _utils as utils, assert_eq
 from cudf.testing._utils import (
     INTEGER_TYPES,
-    assert_eq,
     assert_exceptions_equal,
     expect_warning_if,
 )
diff --git a/python/cudf/cudf/tests/test_interpolate.py b/python/cudf/cudf/tests/test_interpolate.py
index a0e90cc89a2..4a0dc331e1a 100644
--- a/python/cudf/cudf/tests/test_interpolate.py
+++ b/python/cudf/cudf/tests/test_interpolate.py
@@ -4,11 +4,8 @@
 
 import cudf
 from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
-from cudf.testing._utils import (
-    assert_eq,
-    assert_exceptions_equal,
-    expect_warning_if,
-)
+from cudf.testing import assert_eq
+from cudf.testing._utils import assert_exceptions_equal, expect_warning_if
 
 
 @pytest.mark.parametrize(
diff --git a/python/cudf/cudf/tests/test_interval.py b/python/cudf/cudf/tests/test_interval.py
index 013f4439ad5..1b395c09ba8 100644
--- a/python/cudf/cudf/tests/test_interval.py
+++ b/python/cudf/cudf/tests/test_interval.py
@@ -6,7 +6,7 @@
 import pytest
 
 import cudf
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
 
 
 @pytest.mark.parametrize(
diff --git a/python/cudf/cudf/tests/test_join_order.py b/python/cudf/cudf/tests/test_join_order.py
index 8d71a6c05b8..9ea4ba007d2 100644
--- a/python/cudf/cudf/tests/test_join_order.py
+++ b/python/cudf/cudf/tests/test_join_order.py
@@ -8,7 +8,7 @@
 
 import cudf
 from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
 
 
 @pytest.fixture(params=[False, True], ids=["unsorted", "sorted"])
diff --git a/python/cudf/cudf/tests/test_joining.py b/python/cudf/cudf/tests/test_joining.py
index f36774daab2..b1ce69e58ef 100644
--- a/python/cudf/cudf/tests/test_joining.py
+++ b/python/cudf/cudf/tests/test_joining.py
@@ -9,11 +9,11 @@
 import cudf
 from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
 from cudf.core.dtypes import CategoricalDtype, Decimal64Dtype, Decimal128Dtype
+from cudf.testing import assert_eq
 from cudf.testing._utils import (
     INTEGER_TYPES,
     NUMERIC_TYPES,
     TIMEDELTA_TYPES,
-    assert_eq,
     assert_exceptions_equal,
     expect_warning_if,
 )
diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py
index ba6a8f94719..297040b6d95 100644
--- a/python/cudf/cudf/tests/test_json.py
+++ b/python/cudf/cudf/tests/test_json.py
@@ -14,11 +14,11 @@
 
 import cudf
 from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
+from cudf.testing import assert_eq
 from cudf.testing._utils import (
     DATETIME_TYPES,
     NUMERIC_TYPES,
     TIMEDELTA_TYPES,
-    assert_eq,
     expect_warning_if,
 )
 
diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py
index f04cb8a91a4..f76143cb381 100644
--- a/python/cudf/cudf/tests/test_list.py
+++ b/python/cudf/cudf/tests/test_list.py
@@ -12,12 +12,8 @@
 from cudf import NA
 from cudf._lib.copying import get_element
 from cudf.api.types import is_scalar
-from cudf.testing._utils import (
-    DATETIME_TYPES,
-    NUMERIC_TYPES,
-    TIMEDELTA_TYPES,
-    assert_eq,
-)
+from cudf.testing import assert_eq
+from cudf.testing._utils import DATETIME_TYPES, NUMERIC_TYPES, TIMEDELTA_TYPES
 
 
 @pytest.mark.parametrize(
diff --git a/python/cudf/cudf/tests/test_monotonic.py b/python/cudf/cudf/tests/test_monotonic.py
index 3c627a5fe89..0896d91570e 100644
--- a/python/cudf/cudf/tests/test_monotonic.py
+++ b/python/cudf/cudf/tests/test_monotonic.py
@@ -12,7 +12,7 @@
 import cudf
 from cudf import Index, MultiIndex, Series
 from cudf.core.index import CategoricalIndex, DatetimeIndex, RangeIndex
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
 
 
 @pytest.mark.parametrize("testrange", [(10, 20, 1), (0, -10, -1), (5, 5, 1)])
diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py
index 7b95e4f9a44..07c2e9c3fcf 100644
--- a/python/cudf/cudf/tests/test_multiindex.py
+++ b/python/cudf/cudf/tests/test_multiindex.py
@@ -21,12 +21,8 @@
 import cudf
 from cudf.api.extensions import no_default
 from cudf.core.column import as_column
-from cudf.testing._utils import (
-    assert_eq,
-    assert_exceptions_equal,
-    assert_neq,
-    expect_warning_if,
-)
+from cudf.testing import assert_eq, assert_neq
+from cudf.testing._utils import assert_exceptions_equal, expect_warning_if
 
 
 @contextmanager
diff --git a/python/cudf/cudf/tests/test_numerical.py b/python/cudf/cudf/tests/test_numerical.py
index 03081208739..1b0589254f5 100644
--- a/python/cudf/cudf/tests/test_numerical.py
+++ b/python/cudf/cudf/tests/test_numerical.py
@@ -5,7 +5,8 @@
 import pytest
 
 import cudf
-from cudf.testing._utils import NUMERIC_TYPES, assert_eq, expect_warning_if
+from cudf.testing import assert_eq
+from cudf.testing._utils import NUMERIC_TYPES, expect_warning_if
 from cudf.utils.dtypes import np_dtypes_to_pandas_dtypes
 
 
diff --git a/python/cudf/cudf/tests/test_numpy_interop.py b/python/cudf/cudf/tests/test_numpy_interop.py
index 46324a85bb4..fa664d52ecf 100644
--- a/python/cudf/cudf/tests/test_numpy_interop.py
+++ b/python/cudf/cudf/tests/test_numpy_interop.py
@@ -1,10 +1,10 @@
-# Copyright (c) 2019-2022, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 
 import numpy as np
 import pytest
 
 from cudf import DataFrame, Series
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
 
 
 def test_to_records_noindex():
diff --git a/python/cudf/cudf/tests/test_onehot.py b/python/cudf/cudf/tests/test_onehot.py
index cd0055ad78b..154e1e19072 100644
--- a/python/cudf/cudf/tests/test_onehot.py
+++ b/python/cudf/cudf/tests/test_onehot.py
@@ -7,7 +7,7 @@
 import pytest
 
 import cudf
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
 
 pytestmark = pytest.mark.spilling
 
diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py
index b83b8f08a8b..e0884a5819a 100644
--- a/python/cudf/cudf/tests/test_orc.py
+++ b/python/cudf/cudf/tests/test_orc.py
@@ -15,9 +15,8 @@
 import cudf
 from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
 from cudf.io.orc import ORCWriter
-from cudf.testing import assert_frame_equal
+from cudf.testing import assert_eq, assert_frame_equal
 from cudf.testing._utils import (
-    assert_eq,
     expect_warning_if,
     gen_rand_series,
     supported_numpy_dtypes,
diff --git a/python/cudf/cudf/tests/test_pack.py b/python/cudf/cudf/tests/test_pack.py
index da506a8d5b2..ad78621c5fa 100644
--- a/python/cudf/cudf/tests/test_pack.py
+++ b/python/cudf/cudf/tests/test_pack.py
@@ -20,7 +20,7 @@
 
 from cudf import DataFrame, Index, Series
 from cudf._lib.copying import pack, unpack
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
 
 
 def test_sizeof_packed_dataframe():
diff --git a/python/cudf/cudf/tests/test_pandas_interop.py b/python/cudf/cudf/tests/test_pandas_interop.py
index 78cf5b998e8..5782437e394 100644
--- a/python/cudf/cudf/tests/test_pandas_interop.py
+++ b/python/cudf/cudf/tests/test_pandas_interop.py
@@ -1,11 +1,11 @@
-# Copyright (c) 2018-2021, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
 
 import numpy as np
 import pandas as pd
 
 import cudf
 from cudf import DataFrame
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
 
 
 def test_to_pandas():
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index af79f361b43..e1e7952605b 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -28,12 +28,8 @@
     ParquetWriter,
     merge_parquet_filemetadata,
 )
-from cudf.testing import dataset_generator as dg
-from cudf.testing._utils import (
-    TIMEDELTA_TYPES,
-    assert_eq,
-    set_random_null_mask_inplace,
-)
+from cudf.testing import assert_eq, dataset_generator as dg
+from cudf.testing._utils import TIMEDELTA_TYPES, set_random_null_mask_inplace
 
 
 @contextmanager
diff --git a/python/cudf/cudf/tests/test_pickling.py b/python/cudf/cudf/tests/test_pickling.py
index 13a07ef8adc..719e8a33285 100644
--- a/python/cudf/cudf/tests/test_pickling.py
+++ b/python/cudf/cudf/tests/test_pickling.py
@@ -8,7 +8,7 @@
 
 from cudf import DataFrame, Index, RangeIndex, Series
 from cudf.core.buffer import as_buffer
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
 
 pytestmark = pytest.mark.spilling
 
diff --git a/python/cudf/cudf/tests/test_quantiles.py b/python/cudf/cudf/tests/test_quantiles.py
index 8b126073a0f..7d8303df0c3 100644
--- a/python/cudf/cudf/tests/test_quantiles.py
+++ b/python/cudf/cudf/tests/test_quantiles.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 import re
 
@@ -6,7 +6,8 @@
 import pytest
 
 import cudf
-from cudf.testing._utils import assert_eq, assert_exceptions_equal
+from cudf.testing import assert_eq
+from cudf.testing._utils import assert_exceptions_equal
 
 
 def test_single_q():
diff --git a/python/cudf/cudf/tests/test_query.py b/python/cudf/cudf/tests/test_query.py
index cf9e70d85c7..b12209fd3b9 100644
--- a/python/cudf/cudf/tests/test_query.py
+++ b/python/cudf/cudf/tests/test_query.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2022, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
 
 
 import datetime
@@ -11,7 +11,7 @@
 
 import cudf
 from cudf import DataFrame
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
 from cudf.utils import queryutils
 
 _params_query_parser = []
diff --git a/python/cudf/cudf/tests/test_query_mask.py b/python/cudf/cudf/tests/test_query_mask.py
index ae5171f28d4..9372681187d 100644
--- a/python/cudf/cudf/tests/test_query_mask.py
+++ b/python/cudf/cudf/tests/test_query_mask.py
@@ -1,11 +1,11 @@
-# Copyright (c) 2019-2022, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 
 import numpy as np
 import pandas as pd
 import pytest
 
 import cudf
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
 
 _data = [
     {"a": [0, 1.0, 2.0, None, np.nan, None, 3, 5]},
diff --git a/python/cudf/cudf/tests/test_rank.py b/python/cudf/cudf/tests/test_rank.py
index 1a5f25e320f..4c1d8ce92ae 100644
--- a/python/cudf/cudf/tests/test_rank.py
+++ b/python/cudf/cudf/tests/test_rank.py
@@ -7,7 +7,8 @@
 import pytest
 
 from cudf import DataFrame
-from cudf.testing._utils import assert_eq, assert_exceptions_equal
+from cudf.testing import assert_eq
+from cudf.testing._utils import assert_exceptions_equal
 
 
 @pytest.fixture
diff --git a/python/cudf/cudf/tests/test_reductions.py b/python/cudf/cudf/tests/test_reductions.py
index c6ffa1d2bc7..1247fa362ce 100644
--- a/python/cudf/cudf/tests/test_reductions.py
+++ b/python/cudf/cudf/tests/test_reductions.py
@@ -11,13 +11,8 @@
 import cudf
 from cudf import Series
 from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype
-from cudf.testing import _utils as utils
-from cudf.testing._utils import (
-    NUMERIC_TYPES,
-    assert_eq,
-    expect_warning_if,
-    gen_rand,
-)
+from cudf.testing import _utils as utils, assert_eq
+from cudf.testing._utils import NUMERIC_TYPES, expect_warning_if, gen_rand
 
 params_dtype = NUMERIC_TYPES
 
diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py
index 9466398964a..d4fe5ff3bb5 100644
--- a/python/cudf/cudf/tests/test_replace.py
+++ b/python/cudf/cudf/tests/test_replace.py
@@ -12,10 +12,10 @@
 import cudf
 from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
 from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype
+from cudf.testing import assert_eq
 from cudf.testing._utils import (
     INTEGER_TYPES,
     NUMERIC_TYPES,
-    assert_eq,
     assert_exceptions_equal,
     expect_warning_if,
 )
diff --git a/python/cudf/cudf/tests/test_resampling.py b/python/cudf/cudf/tests/test_resampling.py
index d7a3fea1273..95fa8e9a50a 100644
--- a/python/cudf/cudf/tests/test_resampling.py
+++ b/python/cudf/cudf/tests/test_resampling.py
@@ -5,7 +5,7 @@
 import pytest
 
 import cudf
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
 
 
 def assert_resample_results_equal(lhs, rhs, **kwargs):
diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py
index daa1e70808f..50db4302b75 100644
--- a/python/cudf/cudf/tests/test_reshape.py
+++ b/python/cudf/cudf/tests/test_reshape.py
@@ -10,12 +10,8 @@
 import cudf
 from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
 from cudf.core.buffer.spill_manager import get_global_manager
-from cudf.testing._utils import (
-    ALL_TYPES,
-    DATETIME_TYPES,
-    NUMERIC_TYPES,
-    assert_eq,
-)
+from cudf.testing import assert_eq
+from cudf.testing._utils import ALL_TYPES, DATETIME_TYPES, NUMERIC_TYPES
 
 pytest_xfail = pytest.mark.xfail
 pytestmark = pytest.mark.spilling
diff --git a/python/cudf/cudf/tests/test_rolling.py b/python/cudf/cudf/tests/test_rolling.py
index 1d1d7ae8d29..135870f7359 100644
--- a/python/cudf/cudf/tests/test_rolling.py
+++ b/python/cudf/cudf/tests/test_rolling.py
@@ -7,7 +7,7 @@
 import pytest
 
 import cudf
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
 from cudf.testing.dataset_generator import rand_dataframe
 
 
diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py
index cdce17eeb76..a44bf791767 100644
--- a/python/cudf/cudf/tests/test_s3.py
+++ b/python/cudf/cudf/tests/test_s3.py
@@ -12,7 +12,7 @@
 from fsspec.core import get_fs_token_paths
 
 import cudf
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
 
 moto = pytest.importorskip("moto", minversion="3.1.6")
 boto3 = pytest.importorskip("boto3")
diff --git a/python/cudf/cudf/tests/test_scan.py b/python/cudf/cudf/tests/test_scan.py
index 4cbc2197cfd..b76566b00e2 100644
--- a/python/cudf/cudf/tests/test_scan.py
+++ b/python/cudf/cudf/tests/test_scan.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2022, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
 from itertools import product
 
@@ -8,12 +8,8 @@
 
 import cudf
 from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype
-from cudf.testing._utils import (
-    INTEGER_TYPES,
-    NUMERIC_TYPES,
-    assert_eq,
-    gen_rand,
-)
+from cudf.testing import assert_eq
+from cudf.testing._utils import INTEGER_TYPES, NUMERIC_TYPES, gen_rand
 
 params_sizes = [0, 1, 2, 5]
 
diff --git a/python/cudf/cudf/tests/test_search.py b/python/cudf/cudf/tests/test_search.py
index 3ba652ff6c0..65943518113 100644
--- a/python/cudf/cudf/tests/test_search.py
+++ b/python/cudf/cudf/tests/test_search.py
@@ -5,7 +5,8 @@
 import pytest
 
 import cudf
-from cudf.testing._utils import assert_eq, gen_rand, random_bitmask
+from cudf.testing import assert_eq
+from cudf.testing._utils import gen_rand, random_bitmask
 
 
 @pytest.mark.parametrize("side", ["left", "right"])
diff --git a/python/cudf/cudf/tests/test_serialize.py b/python/cudf/cudf/tests/test_serialize.py
index f26d78e7783..0b892a51895 100644
--- a/python/cudf/cudf/tests/test_serialize.py
+++ b/python/cudf/cudf/tests/test_serialize.py
@@ -9,8 +9,7 @@
 import pytest
 
 import cudf
-from cudf.testing import _utils as utils
-from cudf.testing._utils import assert_eq
+from cudf.testing import _utils as utils, assert_eq
 
 
 @pytest.mark.parametrize(
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index 52956c230ba..87ec365868b 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -17,11 +17,11 @@
 from cudf.api.extensions import no_default
 from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
 from cudf.errors import MixedTypeError
+from cudf.testing import assert_eq
 from cudf.testing._utils import (
     NUMERIC_TYPES,
     SERIES_OR_INDEX_NAMES,
     TIMEDELTA_TYPES,
-    assert_eq,
     assert_exceptions_equal,
     expect_warning_if,
     gen_rand,
diff --git a/python/cudf/cudf/tests/test_seriesmap.py b/python/cudf/cudf/tests/test_seriesmap.py
index 9da08e483c9..3d8b6a79d2a 100644
--- a/python/cudf/cudf/tests/test_seriesmap.py
+++ b/python/cudf/cudf/tests/test_seriesmap.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from itertools import product
 from math import floor
@@ -9,7 +9,8 @@
 
 import cudf
 from cudf import Series
-from cudf.testing._utils import assert_eq, assert_exceptions_equal
+from cudf.testing import assert_eq
+from cudf.testing._utils import assert_exceptions_equal
 
 
 def test_series_map_basic():
diff --git a/python/cudf/cudf/tests/test_setitem.py b/python/cudf/cudf/tests/test_setitem.py
index ff2f7bd41f2..69122cdbafa 100644
--- a/python/cudf/cudf/tests/test_setitem.py
+++ b/python/cudf/cudf/tests/test_setitem.py
@@ -6,11 +6,8 @@
 
 import cudf
 from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
-from cudf.testing._utils import (
-    assert_eq,
-    assert_exceptions_equal,
-    expect_warning_if,
-)
+from cudf.testing import assert_eq
+from cudf.testing._utils import assert_exceptions_equal, expect_warning_if
 
 
 @pytest.mark.parametrize("df", [pd.DataFrame({"a": [1, 2, 3]})])
diff --git a/python/cudf/cudf/tests/test_sorting.py b/python/cudf/cudf/tests/test_sorting.py
index 449f21721f4..a8ffce6e88b 100644
--- a/python/cudf/cudf/tests/test_sorting.py
+++ b/python/cudf/cudf/tests/test_sorting.py
@@ -10,10 +10,10 @@
 from cudf import DataFrame, Series
 from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
 from cudf.core.column import NumericalColumn
+from cudf.testing import assert_eq
 from cudf.testing._utils import (
     DATETIME_TYPES,
     NUMERIC_TYPES,
-    assert_eq,
     assert_exceptions_equal,
     expect_warning_if,
 )
diff --git a/python/cudf/cudf/tests/test_spilling.py b/python/cudf/cudf/tests/test_spilling.py
index 59b8e6d2e70..7af83a99d60 100644
--- a/python/cudf/cudf/tests/test_spilling.py
+++ b/python/cudf/cudf/tests/test_spilling.py
@@ -39,7 +39,7 @@
     SpillableBufferOwner,
     SpillLock,
 )
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
 
 if get_global_manager() is not None:
     pytest.skip(
diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py
index 27811d0fcde..d5f63fdab77 100644
--- a/python/cudf/cudf/tests/test_stats.py
+++ b/python/cudf/cudf/tests/test_stats.py
@@ -11,11 +11,8 @@
 from cudf.api.extensions import no_default
 from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
 from cudf.datasets import randomdata
-from cudf.testing._utils import (
-    assert_eq,
-    assert_exceptions_equal,
-    expect_warning_if,
-)
+from cudf.testing import assert_eq
+from cudf.testing._utils import assert_exceptions_equal, expect_warning_if
 
 params_dtypes = [np.int32, np.uint32, np.float32, np.float64]
 methods = ["min", "max", "sum", "mean", "var", "std"]
diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py
index 801c530da43..f447759d010 100644
--- a/python/cudf/cudf/tests/test_string.py
+++ b/python/cudf/cudf/tests/test_string.py
@@ -17,10 +17,10 @@
 from cudf import concat
 from cudf.core.column.string import StringColumn
 from cudf.core.index import Index
+from cudf.testing import assert_eq
 from cudf.testing._utils import (
     DATETIME_TYPES,
     NUMERIC_TYPES,
-    assert_eq,
     assert_exceptions_equal,
 )
 from cudf.utils import dtypes as dtypeutils
diff --git a/python/cudf/cudf/tests/test_string_udfs.py b/python/cudf/cudf/tests/test_string_udfs.py
index 5dbb86fe27d..4432d2afc8e 100644
--- a/python/cudf/cudf/tests/test_string_udfs.py
+++ b/python/cudf/cudf/tests/test_string_udfs.py
@@ -21,7 +21,8 @@
     udf_string,
 )
 from cudf.core.udf.utils import _get_extensionty_size, _ptx_file
-from cudf.testing._utils import assert_eq, sv_to_udf_str
+from cudf.testing import assert_eq
+from cudf.testing._utils import sv_to_udf_str
 from cudf.utils._numba import _CUDFNumbaConfig
 
 _PTX_FILE = _ptx_file()
diff --git a/python/cudf/cudf/tests/test_struct.py b/python/cudf/cudf/tests/test_struct.py
index 60d9516f385..e91edc9eec6 100644
--- a/python/cudf/cudf/tests/test_struct.py
+++ b/python/cudf/cudf/tests/test_struct.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 import numpy as np
 import pandas as pd
@@ -7,7 +7,8 @@
 
 import cudf
 from cudf.core.dtypes import StructDtype
-from cudf.testing._utils import DATETIME_TYPES, TIMEDELTA_TYPES, assert_eq
+from cudf.testing import assert_eq
+from cudf.testing._utils import DATETIME_TYPES, TIMEDELTA_TYPES
 
 
 @pytest.mark.parametrize(
diff --git a/python/cudf/cudf/tests/test_testing.py b/python/cudf/cudf/tests/test_testing.py
index 1994536f395..c3620db3880 100644
--- a/python/cudf/cudf/tests/test_testing.py
+++ b/python/cudf/cudf/tests/test_testing.py
@@ -17,9 +17,8 @@
     OTHER_TYPES,
     assert_column_memory_eq,
     assert_column_memory_ne,
-    assert_eq,
 )
-from cudf.testing.testing import assert_column_equal
+from cudf.testing.testing import assert_column_equal, assert_eq
 
 
 @pytest.fixture(
diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py
index 0c591965361..c4a2349f535 100644
--- a/python/cudf/cudf/tests/test_timedelta.py
+++ b/python/cudf/cudf/tests/test_timedelta.py
@@ -9,8 +9,8 @@
 import pytest
 
 import cudf
-from cudf.testing import _utils as utils
-from cudf.testing._utils import assert_eq, assert_exceptions_equal
+from cudf.testing import _utils as utils, assert_eq
+from cudf.testing._utils import assert_exceptions_equal
 
 _TIMEDELTA_DATA = [
     [1000000, 200000, 3000000],
diff --git a/python/cudf/cudf/tests/test_udf_masked_ops.py b/python/cudf/cudf/tests/test_udf_masked_ops.py
index 4843decedba..087d10b8295 100644
--- a/python/cudf/cudf/tests/test_udf_masked_ops.py
+++ b/python/cudf/cudf/tests/test_udf_masked_ops.py
@@ -17,9 +17,9 @@
 )
 from cudf.core.udf.api import Masked
 from cudf.core.udf.utils import precompiled
+from cudf.testing import assert_eq
 from cudf.testing._utils import (
     _decimal_series,
-    assert_eq,
     parametrize_numeric_dtypes_pairwise,
     sv_to_udf_str,
 )
diff --git a/python/cudf/cudf/tests/test_unaops.py b/python/cudf/cudf/tests/test_unaops.py
index 15d9d03d4a7..dbbf4fba3a6 100644
--- a/python/cudf/cudf/tests/test_unaops.py
+++ b/python/cudf/cudf/tests/test_unaops.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 
 import itertools
 import operator
@@ -10,7 +10,7 @@
 
 import cudf
 from cudf import Series
-from cudf.testing import _utils as utils
+from cudf.testing import _utils as utils, assert_eq
 
 _unaops = [operator.abs, operator.invert, operator.neg, np.ceil, np.floor]
 
@@ -128,4 +128,4 @@ def test_scalar_no_negative_bools():
 def test_series_bool_neg():
     sr = Series([True, False, True, None, False, None, True, True])
     psr = sr.to_pandas(nullable=True)
-    utils.assert_eq((-sr).to_pandas(nullable=True), -psr, check_dtype=True)
+    assert_eq((-sr).to_pandas(nullable=True), -psr, check_dtype=True)
diff --git a/python/cudf/cudf/tests/text/test_subword_tokenizer.py b/python/cudf/cudf/tests/text/test_subword_tokenizer.py
index b21edc0477f..78b58344374 100644
--- a/python/cudf/cudf/tests/text/test_subword_tokenizer.py
+++ b/python/cudf/cudf/tests/text/test_subword_tokenizer.py
@@ -7,7 +7,7 @@
 
 import cudf
 from cudf.core.subword_tokenizer import SubwordTokenizer
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
 
 
 @pytest.fixture(scope="module")
diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py
index 36f7f3de828..52179f55da3 100644
--- a/python/cudf/cudf/tests/text/test_text_methods.py
+++ b/python/cudf/cudf/tests/text/test_text_methods.py
@@ -9,7 +9,7 @@
 import cudf
 from cudf.core.byte_pair_encoding import BytePairEncoder
 from cudf.core.tokenize_vocabulary import TokenizeVocabulary
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
 
 
 def test_tokenize():
diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
index 5be4d350c0b..eed5037cbea 100644
--- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
+++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
@@ -1175,7 +1175,7 @@ def test_intermediates_are_proxied():
 
 def test_from_dataframe():
     cudf = pytest.importorskip("cudf")
-    from cudf.testing._utils import assert_eq
+    from cudf.testing import assert_eq
 
     data = {"foo": [1, 2, 3], "bar": [4, 5, 6]}
 
diff --git a/python/custreamz/custreamz/tests/test_kafka.py b/python/custreamz/custreamz/tests/test_kafka.py
index ad3b829544b..3a3c4e994d0 100644
--- a/python/custreamz/custreamz/tests/test_kafka.py
+++ b/python/custreamz/custreamz/tests/test_kafka.py
@@ -1,8 +1,8 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 import confluent_kafka as ck
 import pytest
 
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
 
 
 @pytest.mark.parametrize("commit_offset", [1, 45, 100, 22, 1000, 10])
diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py
index 58d28f0597e..6f04b5737da 100644
--- a/python/dask_cudf/dask_cudf/tests/test_accessor.py
+++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py
@@ -9,7 +9,8 @@
 from dask import dataframe as dd
 
 from cudf import DataFrame, Series, date_range
-from cudf.testing._utils import assert_eq, does_not_raise
+from cudf.testing import assert_eq
+from cudf.testing._utils import does_not_raise
 
 import dask_cudf
 from dask_cudf.tests.utils import xfail_dask_expr
diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py
index 7f8a619ae22..174923c2c7e 100644
--- a/python/dask_cudf/dask_cudf/tests/test_core.py
+++ b/python/dask_cudf/dask_cudf/tests/test_core.py
@@ -795,7 +795,7 @@ def test_dataframe_set_index():
         pddf = dd.from_pandas(pdf, npartitions=4)
         pddf = pddf.set_index("str")
 
-        from cudf.testing._utils import assert_eq
+        from cudf.testing import assert_eq
 
         assert_eq(ddf.compute(), pddf.compute())
 
diff --git a/python/dask_cudf/dask_cudf/tests/test_distributed.py b/python/dask_cudf/dask_cudf/tests/test_distributed.py
index 07fdb25dff9..be10b0d4843 100644
--- a/python/dask_cudf/dask_cudf/tests/test_distributed.py
+++ b/python/dask_cudf/dask_cudf/tests/test_distributed.py
@@ -9,7 +9,7 @@
 from distributed.utils_test import cleanup, loop, loop_in_thread  # noqa: F401
 
 import cudf
-from cudf.testing._utils import assert_eq
+from cudf.testing import assert_eq
 
 import dask_cudf
 

From bc08662fd6c08635af78faaf4bc8a909f85a3f8a Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 25 Jun 2024 07:37:19 -1000
Subject: [PATCH 403/842] Refactor fillna logic to push specifics toward Frame
 subclasses and Column subclasses (#15957)

Essentially 2 reorganizations

1. `Frame.fillna` input argument logic was pushed toward its subclasses `Series`/`DataFrame`/`IndexedFrame` where appripriate
2. `Column.fillna` was made generic. Column subclasses now implement `_validate_fillna_value` used by `Column.fillna` to validate the fill value

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/15957
---
 python/cudf/cudf/core/column/categorical.py | 79 ++++++++------------
 python/cudf/cudf/core/column/column.py      | 21 +++++-
 python/cudf/cudf/core/column/datetime.py    | 21 +-----
 python/cudf/cudf/core/column/decimal.py     | 39 ++++------
 python/cudf/cudf/core/column/numerical.py   | 63 ++++------------
 python/cudf/cudf/core/column/string.py      | 18 +----
 python/cudf/cudf/core/column/timedelta.py   | 19 +----
 python/cudf/cudf/core/dataframe.py          | 26 +++++++
 python/cudf/cudf/core/frame.py              | 81 ++++++++-------------
 python/cudf/cudf/core/indexed_frame.py      | 23 ------
 python/cudf/cudf/core/series.py             | 14 +---
 python/cudf/cudf/tests/test_series.py       | 12 +++
 12 files changed, 155 insertions(+), 261 deletions(-)

diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index f538180805b..231af30c06d 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -1068,51 +1068,34 @@ def notnull(self) -> ColumnBase:
 
         return result
 
-    def fillna(
-        self,
-        fill_value: Any = None,
-        method: str | None = None,
-    ) -> Self:
-        """
-        Fill null values with *fill_value*
-        """
-        if fill_value is not None:
-            fill_is_scalar = np.isscalar(fill_value)
-
-            if fill_is_scalar:
-                if fill_value == _DEFAULT_CATEGORICAL_VALUE:
-                    fill_value = self.codes.dtype.type(fill_value)
-                else:
-                    try:
-                        fill_value = self._encode(fill_value)
-                        fill_value = self.codes.dtype.type(fill_value)
-                    except ValueError as err:
-                        err_msg = "fill value must be in categories"
-                        raise ValueError(err_msg) from err
+    def _validate_fillna_value(
+        self, fill_value: ScalarLike | ColumnLike
+    ) -> cudf.Scalar | ColumnBase:
+        """Align fill_value for .fillna based on column type."""
+        if cudf.api.types.is_scalar(fill_value):
+            if fill_value != _DEFAULT_CATEGORICAL_VALUE:
+                try:
+                    fill_value = self._encode(fill_value)
+                except ValueError as err:
+                    raise ValueError(
+                        f"{fill_value=} must be in categories"
+                    ) from err
+            return cudf.Scalar(fill_value, dtype=self.codes.dtype)
+        else:
+            fill_value = column.as_column(fill_value, nan_as_null=False)
+            if isinstance(fill_value.dtype, CategoricalDtype):
+                if self.dtype != fill_value.dtype:
+                    raise TypeError(
+                        "Cannot set a categorical with another without identical categories"
+                    )
             else:
-                fill_value = column.as_column(fill_value, nan_as_null=False)
-                if isinstance(fill_value, CategoricalColumn):
-                    if self.dtype != fill_value.dtype:
-                        raise TypeError(
-                            "Cannot set a Categorical with another, "
-                            "without identical categories"
-                        )
-                # TODO: only required if fill_value has a subset of the
-                # categories:
-                fill_value = fill_value._set_categories(
-                    self.categories,
-                    is_unique=True,
-                )
-                fill_value = column.as_column(fill_value.codes).astype(
-                    self.codes.dtype
+                raise TypeError(
+                    "Cannot set a categorical with non-categorical data"
                 )
-
-        # Validation of `fill_value` will have to be performed
-        # before returning self.
-        if not self.nullable:
-            return self
-
-        return super().fillna(fill_value, method=method)
+            fill_value = fill_value._set_categories(
+                self.categories,
+            )
+            return fill_value.codes.astype(self.codes.dtype)
 
     def indices_of(
         self, value: ScalarLike
@@ -1372,11 +1355,13 @@ def _set_categories(
         if not (is_unique or new_cats.is_unique):
             new_cats = cudf.Series(new_cats)._column.unique()
 
+        if cur_cats.equals(new_cats, check_dtypes=True):
+            # TODO: Internal usages don't always need a copy; add a copy keyword
+            # as_ordered shallow copies
+            return self.copy().as_ordered(ordered=ordered)
+
         cur_codes = self.codes
-        max_cat_size = (
-            len(cur_cats) if len(cur_cats) > len(new_cats) else len(new_cats)
-        )
-        out_code_dtype = min_unsigned_type(max_cat_size)
+        out_code_dtype = min_unsigned_type(max(len(cur_cats), len(new_cats)))
 
         cur_order = column.as_column(range(len(cur_codes)))
         old_codes = column.as_column(
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 586689e2ee3..dfcdfbb9d91 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -666,15 +666,32 @@ def _check_scatter_key_length(
                 f"{num_keys}"
             )
 
+    def _validate_fillna_value(
+        self, fill_value: ScalarLike | ColumnLike
+    ) -> cudf.Scalar | ColumnBase:
+        """Align fill_value for .fillna based on column type."""
+        if is_scalar(fill_value):
+            return cudf.Scalar(fill_value, dtype=self.dtype)
+        return as_column(fill_value)
+
     def fillna(
         self,
-        fill_value: Any = None,
-        method: str | None = None,
+        fill_value: ScalarLike | ColumnLike,
+        method: Literal["ffill", "bfill", None] = None,
     ) -> Self:
         """Fill null values with ``value``.
 
         Returns a copy with null filled.
         """
+        if not self.has_nulls(include_nan=True):
+            return self.copy()
+        elif method is None:
+            if is_scalar(fill_value) and libcudf.scalar._is_null_host_scalar(
+                fill_value
+            ):
+                return self.copy()
+            else:
+                fill_value = self._validate_fillna_value(fill_value)
         return libcudf.replace.replace_nulls(
             input_col=self.nans_to_nulls(),
             replacement=fill_value,
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index d88553361dd..121076b69ce 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -8,18 +8,17 @@
 import locale
 import re
 from locale import nl_langinfo
-from typing import TYPE_CHECKING, Any, Literal, Sequence, cast
+from typing import TYPE_CHECKING, Literal, Sequence, cast
 
 import numpy as np
 import pandas as pd
 import pyarrow as pa
-from typing_extensions import Self
 
 import cudf
 from cudf import _lib as libcudf
 from cudf._lib.labeling import label_bins
 from cudf._lib.search import search_sorted
-from cudf.api.types import is_datetime64_dtype, is_scalar, is_timedelta64_dtype
+from cudf.api.types import is_datetime64_dtype, is_timedelta64_dtype
 from cudf.core._compat import PANDAS_GE_220
 from cudf.core._internals.timezones import (
     check_ambiguous_and_nonexistent,
@@ -641,22 +640,6 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
         else:
             return result_col
 
-    def fillna(
-        self,
-        fill_value: Any = None,
-        method: str | None = None,
-    ) -> Self:
-        if fill_value is not None:
-            if cudf.utils.utils._isnat(fill_value):
-                return self.copy(deep=True)
-            if is_scalar(fill_value):
-                if not isinstance(fill_value, cudf.Scalar):
-                    fill_value = cudf.Scalar(fill_value, dtype=self.dtype)
-            else:
-                fill_value = column.as_column(fill_value, nan_as_null=False)
-
-        return super().fillna(fill_value, method)
-
     def indices_of(
         self, value: ScalarLike
     ) -> cudf.core.column.NumericalColumn:
diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py
index e9d9b4933e5..d66908b5f94 100644
--- a/python/cudf/cudf/core/column/decimal.py
+++ b/python/cudf/cudf/core/column/decimal.py
@@ -4,12 +4,11 @@
 
 import warnings
 from decimal import Decimal
-from typing import TYPE_CHECKING, Any, Sequence, cast
+from typing import TYPE_CHECKING, Sequence, cast
 
 import cupy as cp
 import numpy as np
 import pyarrow as pa
-from typing_extensions import Self
 
 import cudf
 from cudf import _lib as libcudf
@@ -31,7 +30,7 @@
 from .numerical_base import NumericalBaseColumn
 
 if TYPE_CHECKING:
-    from cudf._typing import ColumnBinaryOperand, Dtype
+    from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike
 
 
 class DecimalBaseColumn(NumericalBaseColumn):
@@ -135,30 +134,20 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str):
 
         return result
 
-    def fillna(
-        self,
-        fill_value: Any = None,
-        method: str | None = None,
-    ) -> Self:
-        """Fill null values with ``value``.
-
-        Returns a copy with null filled.
-        """
+    def _validate_fillna_value(
+        self, fill_value: ScalarLike | ColumnLike
+    ) -> cudf.Scalar | ColumnBase:
+        """Align fill_value for .fillna based on column type."""
         if isinstance(fill_value, (int, Decimal)):
-            fill_value = cudf.Scalar(fill_value, dtype=self.dtype)
-        elif (
-            isinstance(fill_value, DecimalBaseColumn)
-            or isinstance(fill_value, cudf.core.column.NumericalColumn)
-            and is_integer_dtype(fill_value.dtype)
+            return cudf.Scalar(fill_value, dtype=self.dtype)
+        elif isinstance(fill_value, ColumnBase) and (
+            isinstance(self.dtype, DecimalDtype) or self.dtype.kind in "iu"
         ):
-            fill_value = fill_value.astype(self.dtype)
-        else:
-            raise TypeError(
-                "Decimal columns only support using fillna with decimal and "
-                "integer values"
-            )
-
-        return super().fillna(fill_value, method=method)
+            return fill_value.astype(self.dtype)
+        raise TypeError(
+            "Decimal columns only support using fillna with decimal and "
+            "integer values"
+        )
 
     def normalize_binop_value(self, other):
         if isinstance(other, ColumnBase):
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index 098cf43421b..76c64e1aea0 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -532,57 +532,26 @@ def find_and_replace(
             replaced, df._data["old"], df._data["new"]
         )
 
-    def fillna(
-        self,
-        fill_value: Any = None,
-        method: str | None = None,
-    ) -> Self:
-        """
-        Fill null values with *fill_value*
-        """
-        col = self.nans_to_nulls()
-
-        if col.null_count == 0:
-            return col
-
-        if method is not None:
-            return super().fillna(fill_value, method)
-
-        if fill_value is None:
-            raise ValueError("Must specify either 'fill_value' or 'method'")
-
-        if (
-            isinstance(fill_value, cudf.Scalar)
-            and fill_value.dtype == col.dtype
-        ):
-            return super().fillna(fill_value, method)
-
-        if np.isscalar(fill_value):
-            # cast safely to the same dtype as self
-            fill_value_casted = col.dtype.type(fill_value)
-            if not np.isnan(fill_value) and (fill_value_casted != fill_value):
+    def _validate_fillna_value(
+        self, fill_value: ScalarLike | ColumnLike
+    ) -> cudf.Scalar | ColumnBase:
+        """Align fill_value for .fillna based on column type."""
+        if is_scalar(fill_value):
+            cudf_obj = cudf.Scalar(fill_value)
+            if not as_column(cudf_obj).can_cast_safely(self.dtype):
                 raise TypeError(
                     f"Cannot safely cast non-equivalent "
-                    f"{type(fill_value).__name__} to {col.dtype.name}"
+                    f"{type(fill_value).__name__} to {self.dtype.name}"
                 )
-            fill_value = cudf.Scalar(fill_value_casted)
         else:
-            fill_value = column.as_column(fill_value, nan_as_null=False)
-            if is_integer_dtype(col.dtype):
-                # cast safely to the same dtype as self
-                if fill_value.dtype != col.dtype:
-                    new_fill_value = fill_value.astype(col.dtype)
-                    if not (new_fill_value == fill_value).all():
-                        raise TypeError(
-                            f"Cannot safely cast non-equivalent "
-                            f"{fill_value.dtype.type.__name__} to "
-                            f"{col.dtype.type.__name__}"
-                        )
-                    fill_value = new_fill_value
-            else:
-                fill_value = fill_value.astype(col.dtype)
-
-        return super().fillna(fill_value, method)
+            cudf_obj = as_column(fill_value, nan_as_null=False)
+            if not cudf_obj.can_cast_safely(self.dtype):  # type: ignore[attr-defined]
+                raise TypeError(
+                    f"Cannot safely cast non-equivalent "
+                    f"{cudf_obj.dtype.type.__name__} to "
+                    f"{self.dtype.type.__name__}"
+                )
+        return cudf_obj.astype(self.dtype)
 
     def can_cast_safely(self, to_dtype: DtypeObj) -> bool:
         """
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 2451a9cc0af..936cd1eccb0 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -5,12 +5,11 @@
 import re
 import warnings
 from functools import cached_property
-from typing import TYPE_CHECKING, Any, Sequence, cast, overload
+from typing import TYPE_CHECKING, Sequence, cast, overload
 
 import numpy as np
 import pandas as pd
 import pyarrow as pa
-from typing_extensions import Self
 
 import cudf
 import cudf.api.types
@@ -5838,21 +5837,6 @@ def find_and_replace(
             res = self
         return libcudf.replace.replace(res, df._data["old"], df._data["new"])
 
-    def fillna(
-        self,
-        fill_value: Any = None,
-        method: str | None = None,
-    ) -> Self:
-        if fill_value is not None:
-            if not is_scalar(fill_value):
-                fill_value = column.as_column(fill_value, dtype=self.dtype)
-            elif cudf._lib.scalar._is_null_host_scalar(fill_value):
-                # Trying to fill <NA> with <NA> value? Return copy.
-                return self.copy(deep=True)
-            else:
-                fill_value = cudf.Scalar(fill_value, dtype=self.dtype)
-        return super().fillna(fill_value, method=method)
-
     def normalize_binop_value(self, other) -> column.ColumnBase | cudf.Scalar:
         if (
             isinstance(other, (column.ColumnBase, cudf.Scalar))
diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py
index 26b449f1863..8f41bcb6422 100644
--- a/python/cudf/cudf/core/column/timedelta.py
+++ b/python/cudf/cudf/core/column/timedelta.py
@@ -4,12 +4,11 @@
 
 import datetime
 import functools
-from typing import TYPE_CHECKING, Any, Sequence, cast
+from typing import TYPE_CHECKING, Sequence, cast
 
 import numpy as np
 import pandas as pd
 import pyarrow as pa
-from typing_extensions import Self
 
 import cudf
 from cudf import _lib as libcudf
@@ -252,22 +251,6 @@ def normalize_binop_value(self, other) -> ColumnBinaryOperand:
     def time_unit(self) -> str:
         return np.datetime_data(self.dtype)[0]
 
-    def fillna(
-        self,
-        fill_value: Any = None,
-        method: str | None = None,
-    ) -> Self:
-        if fill_value is not None:
-            if cudf.utils.utils._isnat(fill_value):
-                return self.copy(deep=True)
-            if is_scalar(fill_value):
-                fill_value = cudf.Scalar(fill_value)
-                dtype = self.dtype
-                fill_value = fill_value.astype(dtype)
-            else:
-                fill_value = column.as_column(fill_value, nan_as_null=False)
-        return super().fillna(fill_value, method)
-
     def as_numerical_column(
         self, dtype: Dtype
     ) -> "cudf.core.column.NumericalColumn":
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 76bb9d2a8ed..f0d8157011d 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -2980,6 +2980,32 @@ def set_index(
         df.index = idx
         return df if not inplace else None
 
+    @_cudf_nvtx_annotate
+    def fillna(
+        self, value=None, method=None, axis=None, inplace=False, limit=None
+    ):  # noqa: D102
+        if isinstance(value, (pd.Series, pd.DataFrame)):
+            value = cudf.from_pandas(value)
+        if isinstance(value, cudf.Series):
+            # Align value.index to self.columns
+            value = value.reindex(self._column_names)
+        elif isinstance(value, cudf.DataFrame):
+            if not self.index.equals(value.index):
+                # Align value.index to self.index
+                value = value.reindex(self.index)
+            value = dict(value.items())
+        elif isinstance(value, abc.Mapping):
+            # Align value.indexes to self.index
+            value = {
+                key: value.reindex(self.index)
+                if isinstance(value, cudf.Series)
+                else value
+                for key, value in value.items()
+            }
+        return super().fillna(
+            value=value, method=method, axis=axis, inplace=inplace, limit=limit
+        )
+
     @_cudf_nvtx_annotate
     def where(self, cond, other=None, inplace=False):
         from cudf.core._internals.where import (
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 38bff3946d6..8ca71180c00 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -2,7 +2,6 @@
 
 from __future__ import annotations
 
-import copy
 import operator
 import pickle
 import warnings
@@ -20,6 +19,7 @@
 import cudf
 from cudf import _lib as libcudf
 from cudf.api.types import is_dtype_equal, is_scalar
+from cudf.core._compat import PANDAS_LT_300
 from cudf.core.buffer import acquire_spill_lock
 from cudf.core.column import (
     ColumnBase,
@@ -38,7 +38,7 @@
 if TYPE_CHECKING:
     from types import ModuleType
 
-    from cudf._typing import Dtype
+    from cudf._typing import Dtype, ScalarLike
 
 
 # TODO: It looks like Frame is missing a declaration of `copy`, need to add
@@ -613,8 +613,8 @@ def where(self, cond, other=None, inplace: bool = False) -> Self | None:
     @_cudf_nvtx_annotate
     def fillna(
         self,
-        value=None,
-        method: Literal["ffill", "bfill", "pad", "backfill"] | None = None,
+        value: None | ScalarLike | cudf.Series = None,
+        method: Literal["ffill", "bfill", "pad", "backfill", None] = None,
         axis=None,
         inplace: bool = False,
         limit=None,
@@ -725,6 +725,16 @@ def fillna(
             raise ValueError("Cannot specify both 'value' and 'method'.")
 
         if method:
+            # Do not remove until pandas 3.0 support is added.
+            assert (
+                PANDAS_LT_300
+            ), "Need to drop after pandas-3.0 support is added."
+            warnings.warn(
+                f"{type(self).__name__}.fillna with 'method' is "
+                "deprecated and will raise in a future version. "
+                "Use obj.ffill() or obj.bfill() instead.",
+                FutureWarning,
+            )
             if method not in {"ffill", "bfill", "pad", "backfill"}:
                 raise NotImplementedError(
                     f"Fill method {method} is not supported"
@@ -734,57 +744,24 @@ def fillna(
             elif method == "backfill":
                 method = "bfill"
 
-        # TODO: This logic should be handled in different subclasses since
-        # different Frames support different types of values.
-        if isinstance(value, cudf.Series):
-            value = value.reindex(self._data.names)
-        elif isinstance(value, cudf.DataFrame):
-            if not self.index.equals(value.index):  # type: ignore[attr-defined]
-                value = value.reindex(self.index)  # type: ignore[attr-defined]
-            else:
-                value = value
-        elif not isinstance(value, abc.Mapping):
-            value = {name: copy.deepcopy(value) for name in self._data.names}
-        else:
-            value = {
-                key: value.reindex(self.index)  # type: ignore[attr-defined]
-                if isinstance(value, cudf.Series)
-                else value
-                for key, value in value.items()
-            }
-
-        filled_data = {}
-        for col_name, col in self._data.items():
-            if col_name in value and method is None:
-                replace_val = value[col_name]
-            else:
-                replace_val = None
-            should_fill = (
-                (
-                    col_name in value
-                    and col.has_nulls(include_nan=True)
-                    and not libcudf.scalar._is_null_host_scalar(replace_val)
-                )
-                or method is not None
-                or (
-                    isinstance(col, cudf.core.column.CategoricalColumn)
-                    and not libcudf.scalar._is_null_host_scalar(replace_val)
-                )
+        if is_scalar(value):
+            value = {name: value for name in self._column_names}
+        elif not isinstance(value, (abc.Mapping, cudf.Series)):
+            raise TypeError(
+                f'"value" parameter must be a scalar, dict '
+                f"or Series, but you passed a "
+                f'"{type(value).__name__}"'
             )
-            if should_fill:
-                filled_data[col_name] = col.fillna(replace_val, method)
-            else:
-                filled_data[col_name] = col.copy(deep=True)
+
+        filled_columns = [
+            col.fillna(value[name], method) if name in value else col.copy()
+            for name, col in self._data.items()
+        ]
 
         return self._mimic_inplace(
-            self._from_data(
-                data=ColumnAccessor(
-                    data=filled_data,
-                    multiindex=self._data.multiindex,
-                    level_names=self._data.level_names,
-                    rangeindex=self._data.rangeindex,
-                    label_dtype=self._data.label_dtype,
-                    verify=False,
+            self._from_data_like_self(
+                self._data._from_columns_like_self(
+                    filled_columns, verify=False
                 )
             ),
             inplace=inplace,
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 5cae4a857ee..280a6e92eab 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -3217,29 +3217,6 @@ def _split(self, splits, keep_index=True):
             for i in range(len(splits) + 1)
         ]
 
-    @_cudf_nvtx_annotate
-    def fillna(
-        self, value=None, method=None, axis=None, inplace=False, limit=None
-    ):  # noqa: D102
-        if method is not None:
-            # Do not remove until pandas 3.0 support is added.
-            assert (
-                PANDAS_LT_300
-            ), "Need to drop after pandas-3.0 support is added."
-            warnings.warn(
-                f"{type(self).__name__}.fillna with 'method' is "
-                "deprecated and will raise in a future version. "
-                "Use obj.ffill() or obj.bfill() instead.",
-                FutureWarning,
-            )
-        old_index = self.index
-        ret = super().fillna(value, method, axis, inplace, limit)
-        if inplace:
-            self.index = old_index
-        else:
-            ret.index = old_index
-        return ret
-
     @_cudf_nvtx_annotate
     def bfill(self, value=None, axis=None, inplace=None, limit=None):
         """
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index c0716d7709a..15ad0813601 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -1797,20 +1797,12 @@ def fillna(
     ):
         if isinstance(value, pd.Series):
             value = Series.from_pandas(value)
-
-        if not (is_scalar(value) or isinstance(value, (abc.Mapping, Series))):
-            raise TypeError(
-                f'"value" parameter must be a scalar, dict '
-                f"or Series, but you passed a "
-                f'"{type(value).__name__}"'
-            )
-
-        if isinstance(value, (abc.Mapping, Series)):
+        elif isinstance(value, abc.Mapping):
             value = Series(value)
+        if isinstance(value, cudf.Series):
             if not self.index.equals(value.index):
                 value = value.reindex(self.index)
-            value = value._column
-
+            value = {self.name: value._column}
         return super().fillna(
             value=value, method=method, axis=axis, inplace=inplace, limit=limit
         )
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index 87ec365868b..467d0c46ae7 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -1054,6 +1054,18 @@ def test_fillna_with_nan(data, nan_as_null, fill_value):
     assert_eq(expected, actual)
 
 
+def test_fillna_categorical_with_non_categorical_raises():
+    ser = cudf.Series([1, None], dtype="category")
+    with pytest.raises(TypeError):
+        ser.fillna(cudf.Series([1, 2]))
+
+
+def test_fillna_categorical_with_different_categories_raises():
+    ser = cudf.Series([1, None], dtype="category")
+    with pytest.raises(TypeError):
+        ser.fillna(cudf.Series([1, 2]), dtype="category")
+
+
 def test_series_mask_mixed_dtypes_error():
     s = cudf.Series(["a", "b", "c"])
     with pytest.raises(

From e4bd9e85f6aeab1b1debd03672fdd34ecd43fedf Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Tue, 25 Jun 2024 11:32:39 -0700
Subject: [PATCH 404/842] Add support to ArrowDataSource in SourceInfo (#16050)

ArrowDataSources weren't previously supported in SourceInfo.
(since we didn't need it for Avro).

Adding it now so we can pass tests for orc reader and co.
(even though ArrowDataSource may potentially be removed in the future)

Authors:
  - Thomas Li (https://github.com/lithomas1)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16050
---
 python/cudf/cudf/_lib/csv.pyx                 |  2 +-
 python/cudf/cudf/_lib/io/CMakeLists.txt       |  4 ++--
 python/cudf/cudf/_lib/io/utils.pyx            |  2 +-
 python/cudf/cudf/_lib/orc.pyx                 |  2 +-
 python/cudf/cudf/_lib/parquet.pyx             |  2 +-
 .../cudf/_lib/pylibcudf/io/CMakeLists.txt     |  4 ++--
 .../cudf/cudf/_lib/pylibcudf/io/__init__.pxd  |  2 +-
 .../cudf/cudf/_lib/pylibcudf/io/__init__.py   |  2 +-
 .../_lib/{ => pylibcudf}/io/datasource.pxd    |  0
 .../_lib/{ => pylibcudf}/io/datasource.pyx    |  0
 python/cudf/cudf/_lib/pylibcudf/io/types.pyx  | 23 ++++++++++++++----
 .../cudf/pylibcudf_tests/test_source_info.py  | 24 ++++++++++++++++++-
 python/cudf_kafka/cudf_kafka/_lib/kafka.pxd   |  2 +-
 13 files changed, 53 insertions(+), 16 deletions(-)
 rename python/cudf/cudf/_lib/{ => pylibcudf}/io/datasource.pxd (100%)
 rename python/cudf/cudf/_lib/{ => pylibcudf}/io/datasource.pyx (100%)

diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx
index 0b0bbdb2589..c706351a683 100644
--- a/python/cudf/cudf/_lib/csv.pyx
+++ b/python/cudf/cudf/_lib/csv.pyx
@@ -8,7 +8,7 @@ from libcpp.utility cimport move
 from libcpp.vector cimport vector
 
 cimport cudf._lib.pylibcudf.libcudf.types as libcudf_types
-from cudf._lib.io.datasource cimport Datasource, NativeFileDatasource
+from cudf._lib.pylibcudf.io.datasource cimport Datasource, NativeFileDatasource
 from cudf._lib.pylibcudf.libcudf.types cimport data_type
 from cudf._lib.types cimport dtype_to_data_type
 
diff --git a/python/cudf/cudf/_lib/io/CMakeLists.txt b/python/cudf/cudf/_lib/io/CMakeLists.txt
index 2408fa1c12f..620229a1275 100644
--- a/python/cudf/cudf/_lib/io/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/io/CMakeLists.txt
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -12,7 +12,7 @@
 # the License.
 # =============================================================================
 
-set(cython_sources datasource.pyx utils.pyx)
+set(cython_sources utils.pyx)
 set(linked_libraries cudf::cudf)
 rapids_cython_create_modules(
   CXX
diff --git a/python/cudf/cudf/_lib/io/utils.pyx b/python/cudf/cudf/_lib/io/utils.pyx
index 3c14ec46122..1d7c56888d9 100644
--- a/python/cudf/cudf/_lib/io/utils.pyx
+++ b/python/cudf/cudf/_lib/io/utils.pyx
@@ -8,7 +8,7 @@ from libcpp.utility cimport move
 from libcpp.vector cimport vector
 
 from cudf._lib.column cimport Column
-from cudf._lib.io.datasource cimport Datasource
+from cudf._lib.pylibcudf.io.datasource cimport Datasource
 from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink
 from cudf._lib.pylibcudf.libcudf.io.datasource cimport datasource
 from cudf._lib.pylibcudf.libcudf.io.types cimport (
diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx
index d3e6053ef4b..9609e3131b4 100644
--- a/python/cudf/cudf/_lib/orc.pyx
+++ b/python/cudf/cudf/_lib/orc.pyx
@@ -23,12 +23,12 @@ except ImportError:
 
 cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types
 from cudf._lib.column cimport Column
-from cudf._lib.io.datasource cimport NativeFileDatasource
 from cudf._lib.io.utils cimport (
     make_sink_info,
     make_source_info,
     update_column_struct_field_names,
 )
+from cudf._lib.pylibcudf.io.datasource cimport NativeFileDatasource
 from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink
 from cudf._lib.pylibcudf.libcudf.io.orc cimport (
     chunked_orc_writer_options,
diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
index f6f9cfa9a7c..7914ed7e9d9 100644
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ b/python/cudf/cudf/_lib/parquet.pyx
@@ -37,12 +37,12 @@ cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types
 cimport cudf._lib.pylibcudf.libcudf.types as cudf_types
 from cudf._lib.column cimport Column
 from cudf._lib.expressions cimport Expression
-from cudf._lib.io.datasource cimport NativeFileDatasource
 from cudf._lib.io.utils cimport (
     make_sinks_info,
     make_source_info,
     update_struct_field_names,
 )
+from cudf._lib.pylibcudf.io.datasource cimport NativeFileDatasource
 from cudf._lib.pylibcudf.libcudf.expressions cimport expression
 from cudf._lib.pylibcudf.libcudf.io.parquet cimport (
     chunked_parquet_reader as cpp_chunked_parquet_reader,
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt
index 2cfec101bab..32f0f5543e4 100644
--- a/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt
@@ -12,7 +12,7 @@
 # the License.
 # =============================================================================
 
-set(cython_sources avro.pyx types.pyx)
+set(cython_sources avro.pyx datasource.pyx types.pyx)
 
 set(linked_libraries cudf::cudf)
 rapids_cython_create_modules(
@@ -21,5 +21,5 @@ rapids_cython_create_modules(
   LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_io_ ASSOCIATED_TARGETS cudf
 )
 
-set(targets_using_arrow_headers pylibcudf_io_avro pylibcudf_io_types)
+set(targets_using_arrow_headers pylibcudf_io_avro pylibcudf_io_datasource pylibcudf_io_types)
 link_to_pyarrow_headers("${targets_using_arrow_headers}")
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd
index 250292746c1..cfd6d2cd281 100644
--- a/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd
@@ -1,4 +1,4 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from . cimport avro, types
+from . cimport avro, datasource, types
 from .types cimport SourceInfo, TableWithMetadata
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/__init__.py b/python/cudf/cudf/_lib/pylibcudf/io/__init__.py
index 5242c741911..a54ba1834dc 100644
--- a/python/cudf/cudf/_lib/pylibcudf/io/__init__.py
+++ b/python/cudf/cudf/_lib/pylibcudf/io/__init__.py
@@ -1,4 +1,4 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from . import avro, types
+from . import avro, datasource, types
 from .types import SourceInfo, TableWithMetadata
diff --git a/python/cudf/cudf/_lib/io/datasource.pxd b/python/cudf/cudf/_lib/pylibcudf/io/datasource.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/io/datasource.pxd
rename to python/cudf/cudf/_lib/pylibcudf/io/datasource.pxd
diff --git a/python/cudf/cudf/_lib/io/datasource.pyx b/python/cudf/cudf/_lib/pylibcudf/io/datasource.pyx
similarity index 100%
rename from python/cudf/cudf/_lib/io/datasource.pyx
rename to python/cudf/cudf/_lib/pylibcudf/io/datasource.pyx
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx
index cd777232b33..ab3375da662 100644
--- a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx
@@ -4,6 +4,8 @@ from libcpp.string cimport string
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
 
+from cudf._lib.pylibcudf.io.datasource cimport Datasource
+from cudf._lib.pylibcudf.libcudf.io.datasource cimport datasource
 from cudf._lib.pylibcudf.libcudf.io.types cimport (
     host_buffer,
     source_info,
@@ -56,9 +58,8 @@ cdef class SourceInfo:
 
     Parameters
     ----------
-    sources : List[Union[str, os.PathLike, bytes, io.BytesIO]]
-        A homogeneous list of sources (this can be a string filename,
-        an os.PathLike, bytes, or an io.BytesIO) to read from.
+    sources : List[Union[str, os.PathLike, bytes, io.BytesIO, DataSource]]
+        A homogeneous list of sources to read from.
 
         Mixing different types of sources will raise a `ValueError`.
     """
@@ -68,6 +69,7 @@ cdef class SourceInfo:
             raise ValueError("Need to pass at least one source")
 
         cdef vector[string] c_files
+        cdef vector[datasource*] c_datasources
 
         if isinstance(sources[0], (os.PathLike, str)):
             c_files.reserve(len(sources))
@@ -84,6 +86,13 @@ cdef class SourceInfo:
 
             self.c_obj = move(source_info(c_files))
             return
+        elif isinstance(sources[0], Datasource):
+            for csrc in sources:
+                if not isinstance(csrc, Datasource):
+                    raise ValueError("All sources must be of the same type!")
+                c_datasources.push_back((<Datasource>csrc).get_datasource())
+            self.c_obj = move(source_info(c_datasources))
+            return
 
         # TODO: host_buffer is deprecated API, use host_span instead
         cdef vector[host_buffer] c_host_buffers
@@ -106,5 +115,11 @@ cdef class SourceInfo:
                 c_buffer = bio.getbuffer()  # check if empty?
                 c_host_buffers.push_back(host_buffer(<char*>&c_buffer[0],
                                                      c_buffer.shape[0]))
+        else:
+            raise ValueError("Sources must be a list of str/paths, "
+                             "bytes, io.BytesIO, or a Datasource")
+
+        if empty_buffer is True:
+            c_host_buffers.push_back(host_buffer(<char*>NULL, 0))
 
-        self.c_obj = source_info(c_host_buffers)
+        self.c_obj = move(source_info(c_host_buffers))
diff --git a/python/cudf/cudf/pylibcudf_tests/test_source_info.py b/python/cudf/cudf/pylibcudf_tests/test_source_info.py
index 71a3ecbcc30..019321b7259 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_source_info.py
+++ b/python/cudf/cudf/pylibcudf_tests/test_source_info.py
@@ -2,13 +2,21 @@
 
 import io
 
+import pyarrow as pa
 import pytest
 
 import cudf._lib.pylibcudf as plc
+from cudf._lib.pylibcudf.io.datasource import NativeFileDatasource
 
 
 @pytest.mark.parametrize(
-    "source", ["a.txt", b"hello world", io.BytesIO(b"hello world")]
+    "source",
+    [
+        "a.txt",
+        b"hello world",
+        io.BytesIO(b"hello world"),
+        NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r")),
+    ],
 )
 def test_source_info_ctor(source, tmp_path):
     if isinstance(source, str):
@@ -28,6 +36,10 @@ def test_source_info_ctor(source, tmp_path):
         ["a.txt", "a.txt"],
         [b"hello world", b"hello there"],
         [io.BytesIO(b"hello world"), io.BytesIO(b"hello there")],
+        [
+            NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r")),
+            NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r")),
+        ],
     ],
 )
 def test_source_info_ctor_multiple(sources, tmp_path):
@@ -54,6 +66,11 @@ def test_source_info_ctor_multiple(sources, tmp_path):
             io.BytesIO(b"hello there"),
             b"hello world",
         ],
+        [
+            NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r")),
+            "awef.txt",
+            b"hello world",
+        ],
     ],
 )
 def test_source_info_ctor_mixing_invalid(sources, tmp_path):
@@ -67,3 +84,8 @@ def test_source_info_ctor_mixing_invalid(sources, tmp_path):
             sources[i] = str(file)
     with pytest.raises(ValueError):
         plc.io.SourceInfo(sources)
+
+
+def test_source_info_invalid():
+    with pytest.raises(ValueError):
+        plc.io.SourceInfo([123])
diff --git a/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd b/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd
index 84a3a32646d..2de0bf39785 100644
--- a/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd
+++ b/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd
@@ -7,7 +7,7 @@ from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 from libcpp.vector cimport vector
 
-from cudf._lib.io.datasource cimport Datasource
+from cudf._lib.pylibcudf.io.datasource cimport Datasource
 from cudf._lib.pylibcudf.libcudf.io.datasource cimport datasource
 
 
From cdfb550f442e846623c721082128a095f02efff9 Mon Sep 17 00:00:00 2001
From: Paul Mattione <156858817+pmattione-nvidia@users.noreply.github.com>
Date: Tue, 25 Jun 2024 14:39:17 -0400
Subject: [PATCH 405/842] Add ast cast test (#16045)

Add test for AST cast-to-float64

Resolves #16023

Authors:
  - Paul Mattione (https://github.com/pmattione-nvidia)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16045
---
 cpp/tests/ast/transform_tests.cpp | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/cpp/tests/ast/transform_tests.cpp b/cpp/tests/ast/transform_tests.cpp
index ef1d09e5652..6b350c137d0 100644
--- a/cpp/tests/ast/transform_tests.cpp
+++ b/cpp/tests/ast/transform_tests.cpp
@@ -65,6 +65,22 @@ TEST_F(TransformTest, ColumnReference)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view(), verbosity);
 }
 
+TEST_F(TransformTest, BasicAdditionDoubleCast)
+{
+  auto c_0 = column_wrapper<double>{3, 20, 1, 50};
+  std::vector<__int128_t> data1{10, 7, 20, 0};
+  auto c_1 = cudf::test::fixed_point_column_wrapper<__int128_t>(
+    data1.begin(), data1.end(), numeric::scale_type{0});
+  auto table      = cudf::table_view{{c_0, c_1}};
+  auto col_ref_0  = cudf::ast::column_reference(0);
+  auto col_ref_1  = cudf::ast::column_reference(1);
+  auto cast       = cudf::ast::operation(cudf::ast::ast_operator::CAST_TO_FLOAT64, col_ref_1);
+  auto expression = cudf::ast::operation(cudf::ast::ast_operator::ADD, col_ref_0, cast);
+  auto expected   = column_wrapper<double>{13, 27, 21, 50};
+  auto result     = cudf::compute_column(table, expression);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view(), verbosity);
+}
+
 TEST_F(TransformTest, Literal)
 {
   auto c_0   = column_wrapper<int32_t>{3, 20, 1, 50};

From 892e7d850b060adbf198d4a5f6bfefa4941f77fb Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Wed, 26 Jun 2024 16:12:00 +0100
Subject: [PATCH 406/842] Expose and then implement support for cross joins in
 cudf-polars (#16097)

libcudf supports cross joins, but until now this wasn't exposed to python. Do that in pylibcudf and implement the evaluation rule in cudf-polars.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - https://github.com/brandon-b-miller

URL: https://github.com/rapidsai/cudf/pull/16097
---
 python/cudf/cudf/_lib/pylibcudf/join.pxd      |  2 ++
 python/cudf/cudf/_lib/pylibcudf/join.pyx      | 30 +++++++++++++++----
 .../cudf/cudf/_lib/pylibcudf/libcudf/join.pxd |  5 ++++
 python/cudf/cudf/pylibcudf_tests/test_join.py | 29 ++++++++++++++++++
 python/cudf_polars/cudf_polars/dsl/ir.py      | 29 +++++++++++++-----
 python/cudf_polars/tests/test_join.py         | 24 ++++++++++++---
 6 files changed, 102 insertions(+), 17 deletions(-)
 create mode 100644 python/cudf/cudf/pylibcudf_tests/test_join.py

diff --git a/python/cudf/cudf/_lib/pylibcudf/join.pxd b/python/cudf/cudf/_lib/pylibcudf/join.pxd
index f560eeef06d..83b4776c16e 100644
--- a/python/cudf/cudf/_lib/pylibcudf/join.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/join.pxd
@@ -35,3 +35,5 @@ cpdef Column left_anti_join(
     Table right_keys,
     null_equality nulls_equal
 )
+
+cpdef Table cross_join(Table left, Table right)
diff --git a/python/cudf/cudf/_lib/pylibcudf/join.pyx b/python/cudf/cudf/_lib/pylibcudf/join.pyx
index cf2a6a8187f..308b1b39291 100644
--- a/python/cudf/cudf/_lib/pylibcudf/join.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/join.pyx
@@ -2,13 +2,14 @@
 
 from cython.operator import dereference
 
-from libcpp.memory cimport make_unique
+from libcpp.memory cimport make_unique, unique_ptr
 from libcpp.utility cimport move
 
 from rmm._lib.device_buffer cimport device_buffer
 
 from cudf._lib.pylibcudf.libcudf cimport join as cpp_join
 from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.table.table cimport table
 from cudf._lib.pylibcudf.libcudf.types cimport (
     data_type,
     null_equality,
@@ -88,7 +89,6 @@ cpdef tuple left_join(
     nulls_equal : NullEquality
         Should nulls compare equal?
 
-
     Returns
     -------
     Tuple[Column, Column]
@@ -122,7 +122,6 @@ cpdef tuple full_join(
     nulls_equal : NullEquality
         Should nulls compare equal?
 
-
     Returns
     -------
     Tuple[Column, Column]
@@ -156,7 +155,6 @@ cpdef Column left_semi_join(
     nulls_equal : NullEquality
         Should nulls compare equal?
 
-
     Returns
     -------
     Column
@@ -190,7 +188,6 @@ cpdef Column left_anti_join(
     nulls_equal : NullEquality
         Should nulls compare equal?
 
-
     Returns
     -------
     Column
@@ -204,3 +201,26 @@ cpdef Column left_anti_join(
             nulls_equal
         )
     return _column_from_gather_map(move(c_result))
+
+
+cpdef Table cross_join(Table left, Table right):
+    """Perform a cross join on two tables.
+
+    For details see :cpp:func:`cross_join`.
+
+    Parameters
+    ----------
+    left : Table
+        The left table to join.
+    right: Table
+        The right table to join.
+
+    Returns
+    -------
+    Table
+        The result of cross joining the two inputs.
+    """
+    cdef unique_ptr[table] result
+    with nogil:
+        result = move(cpp_join.cross_join(left.view(), right.view()))
+    return Table.from_libcudf(move(result))
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/join.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/join.pxd
index 89a30f0f255..32cd17f7c11 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/join.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/join.pxd
@@ -70,3 +70,8 @@ cdef extern from "cudf/join.hpp" namespace "cudf" nogil:
         const table_view right_keys,
         null_equality nulls_equal,
     ) except +
+
+    cdef unique_ptr[table] cross_join(
+        const table_view left,
+        const table_view right,
+    ) except +
diff --git a/python/cudf/cudf/pylibcudf_tests/test_join.py b/python/cudf/cudf/pylibcudf_tests/test_join.py
new file mode 100644
index 00000000000..eb25ed915b1
--- /dev/null
+++ b/python/cudf/cudf/pylibcudf_tests/test_join.py
@@ -0,0 +1,29 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import numpy as np
+import pyarrow as pa
+from utils import assert_table_eq
+
+from cudf._lib import pylibcudf as plc
+
+
+def test_cross_join():
+    left = pa.Table.from_arrays([[0, 1, 2], [3, 4, 5]], names=["a", "b"])
+    right = pa.Table.from_arrays(
+        [[6, 7, 8, 9], [10, 11, 12, 13]], names=["c", "d"]
+    )
+
+    pleft = plc.interop.from_arrow(left)
+    pright = plc.interop.from_arrow(right)
+
+    expect = pa.Table.from_arrays(
+        [
+            *(np.repeat(c.to_numpy(), len(right)) for c in left.columns),
+            *(np.tile(c.to_numpy(), len(left)) for c in right.columns),
+        ],
+        names=["a", "b", "c", "d"],
+    )
+
+    got = plc.join.cross_join(pleft, pright)
+
+    assert_table_eq(expect, got)
diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py
index b3dd6ae7cc3..4ad6e75fb2e 100644
--- a/python/cudf_polars/cudf_polars/dsl/ir.py
+++ b/python/cudf_polars/cudf_polars/dsl/ir.py
@@ -503,7 +503,7 @@ class Join(IR):
     right_on: list[expr.NamedExpr]
     """List of expressions used as keys in the right frame."""
     options: tuple[
-        Literal["inner", "left", "full", "leftsemi", "leftanti"],
+        Literal["inner", "left", "full", "leftsemi", "leftanti", "cross"],
         bool,
         tuple[int, int] | None,
         str | None,
@@ -518,11 +518,6 @@ class Join(IR):
     - coalesce: should key columns be coalesced (only makes sense for outer joins)
     """
 
-    def __post_init__(self) -> None:
-        """Validate preconditions."""
-        if self.options[0] == "cross":
-            raise NotImplementedError("cross join not implemented")
-
     @cache
     @staticmethod
     def _joiners(
@@ -567,6 +562,26 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
         left = self.left.evaluate(cache=cache)
         right = self.right.evaluate(cache=cache)
+        how, join_nulls, zlice, suffix, coalesce = self.options
+        suffix = "_right" if suffix is None else suffix
+        if how == "cross":
+            # Separate implementation, since cross_join returns the
+            # result, not the gather maps
+            columns = plc.join.cross_join(left.table, right.table).columns()
+            left_cols = [
+                NamedColumn(new, old.name).sorted_like(old)
+                for new, old in zip(columns[: left.num_columns], left.columns)
+            ]
+            right_cols = [
+                NamedColumn(
+                    new,
+                    old.name
+                    if old.name not in left.column_names_set
+                    else f"{old.name}{suffix}",
+                )
+                for new, old in zip(columns[left.num_columns :], right.columns)
+            ]
+            return DataFrame([*left_cols, *right_cols])
         left_on = DataFrame(
             broadcast(
                 *(e.evaluate(left) for e in self.left_on), target_length=left.num_rows
@@ -578,13 +593,11 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
                 target_length=right.num_rows,
             )
         )
-        how, join_nulls, zlice, suffix, coalesce = self.options
         null_equality = (
             plc.types.NullEquality.EQUAL
             if join_nulls
             else plc.types.NullEquality.UNEQUAL
         )
-        suffix = "_right" if suffix is None else suffix
         join_fn, left_policy, right_policy = Join._joiners(how)
         if right_policy is None:
             # Semi join
diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py
index f4a4704f3cc..81166b0b2f6 100644
--- a/python/cudf_polars/tests/test_join.py
+++ b/python/cudf_polars/tests/test_join.py
@@ -16,10 +16,6 @@
         "left",
         "semi",
         "anti",
-        pytest.param(
-            "cross",
-            marks=pytest.mark.xfail(reason="cross join not implemented"),
-        ),
         "full",
     ],
 )
@@ -55,3 +51,23 @@ def test_join(how, coalesce, join_nulls, join_expr):
         right, on=join_expr, how=how, join_nulls=join_nulls, coalesce=coalesce
     )
     assert_gpu_result_equal(query, check_row_order=False)
+
+
+def test_cross_join():
+    left = pl.DataFrame(
+        {
+            "a": [1, 2, 3, 1, None],
+            "b": [1, 2, 3, 4, 5],
+            "c": [2, 3, 4, 5, 6],
+        }
+    ).lazy()
+    right = pl.DataFrame(
+        {
+            "a": [1, 4, 3, 7, None, None],
+            "c": [2, 3, 4, 5, 6, 7],
+        }
+    ).lazy()
+
+    q = left.join(right, how="cross")
+
+    assert_gpu_result_equal(q)

From d53e409f06058d59671e0377b40a71a08789fccf Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Wed, 26 Jun 2024 10:22:44 -0500
Subject: [PATCH 407/842] Fix `is_monotonic_*` APIs to include `nan's` (#16085)

Fixes: #15776

This PR changes `is_monotonic_*` API's to factor in `np.nan` while performing the operations.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/16085
---
 python/cudf/cudf/core/column/column.py   |  4 ++--
 python/cudf/cudf/tests/test_monotonic.py | 19 +++++++++++++++++++
 2 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index dfcdfbb9d91..5db6fd904a9 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -927,13 +927,13 @@ def is_unique(self) -> bool:
 
     @property
     def is_monotonic_increasing(self) -> bool:
-        return not self.has_nulls() and libcudf.sort.is_sorted(
+        return not self.has_nulls(include_nan=True) and libcudf.sort.is_sorted(
             [self], [True], None
         )
 
     @property
     def is_monotonic_decreasing(self) -> bool:
-        return not self.has_nulls() and libcudf.sort.is_sorted(
+        return not self.has_nulls(include_nan=True) and libcudf.sort.is_sorted(
             [self], [False], None
         )
 
diff --git a/python/cudf/cudf/tests/test_monotonic.py b/python/cudf/cudf/tests/test_monotonic.py
index 0896d91570e..790e84559a9 100644
--- a/python/cudf/cudf/tests/test_monotonic.py
+++ b/python/cudf/cudf/tests/test_monotonic.py
@@ -33,11 +33,13 @@ def test_range_index(testrange):
     "testlist",
     [
         [1, 2, 3, 4],
+        [1, 2, 3, 4, None],
         [1, 2, 3, 3, 4],
         [10, 9, 8, 7],
         [10, 9, 8, 8, 7],
         ["c", "d", "e", "f"],
         ["c", "d", "e", "e", "f"],
+        ["c", "d", "e", "f", None],
         ["z", "y", "x", "r"],
         ["z", "y", "x", "x", "r"],
     ],
@@ -51,6 +53,23 @@ def test_generic_index(testlist):
     assert index.is_monotonic_decreasing == index_pd.is_monotonic_decreasing
 
 
+@pytest.mark.parametrize(
+    "testlist",
+    [
+        [1, 2, 3, 4, np.nan],
+        [10, 9, 8, np.nan, 7],
+        [10, 9, 8, 8, 7, np.nan],
+    ],
+)
+def test_float_index(testlist):
+    index_pd = pd.Index(testlist)
+    index = cudf.from_pandas(index_pd, nan_as_null=False)
+
+    assert index.is_unique == index_pd.is_unique
+    assert index.is_monotonic_increasing == index_pd.is_monotonic_increasing
+    assert index.is_monotonic_decreasing == index_pd.is_monotonic_decreasing
+
+
 @pytest.mark.parametrize(
     "testlist",
     [

From bfaddd3bcccac3ef38bf7c5d0e6fd55267b2f3ab Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Wed, 26 Jun 2024 11:49:21 -0400
Subject: [PATCH 408/842] Add exception when trying to create large strings
 with cudf::test::strings_column_wrapper (#16049)

Throws an exception in the `cudf::test::strings_column_wrapper` if the column size (accumulated offset values) would exceed max size_type.
Large strings created by the wrapper are not supported and discouraged due to the size and time impact on testing and CI.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - Muhammad Haseeb (https://github.com/mhaseeb123)

URL: https://github.com/rapidsai/cudf/pull/16049
---
 cpp/include/cudf_test/column_wrapper.hpp | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp
index 47d17988775..7363f965af8 100644
--- a/cpp/include/cudf_test/column_wrapper.hpp
+++ b/cpp/include/cudf_test/column_wrapper.hpp
@@ -314,7 +314,12 @@ auto make_chars_and_offsets(StringsIterator begin, StringsIterator end, Validity
   for (auto str = begin; str < end; ++str) {
     std::string tmp = (*v++) ? std::string(*str) : std::string{};
     chars.insert(chars.end(), std::cbegin(tmp), std::cend(tmp));
-    offsets.push_back(offsets.back() + tmp.length());
+    auto const last_offset = static_cast<std::size_t>(offsets.back());
+    auto const next_offset = last_offset + tmp.length();
+    CUDF_EXPECTS(
+      next_offset < static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max()),
+      "Cannot use strings_column_wrapper to build a large strings column");
+    offsets.push_back(static_cast<cudf::size_type>(next_offset));
   }
   return std::pair(std::move(chars), std::move(offsets));
 };

From e0b8ab01deb66ea0726373ecb9cc77cbbf0666cd Mon Sep 17 00:00:00 2001
From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com>
Date: Wed, 26 Jun 2024 11:00:25 -0500
Subject: [PATCH 409/842] Migrate string `slice` APIs to `pylibcudf` (#15988)

This PR introduces pylibcudf string `slice` APIs and migrates the cuDF cython to use them. Part of https://github.com/rapidsai/cudf/issues/15162

Authors:
  - https://github.com/brandon-b-miller
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/15988
---
 .../api_docs/pylibcudf/strings/index.rst      |   1 +
 .../api_docs/pylibcudf/strings/slice.rst      |   6 +
 .../libcudf/scalar/scalar_factories.pxd       |   1 +
 .../_lib/pylibcudf/strings/CMakeLists.txt     |   2 +-
 .../cudf/_lib/pylibcudf/strings/__init__.pxd  |   1 +
 .../cudf/_lib/pylibcudf/strings/__init__.py   |   1 +
 .../cudf/_lib/pylibcudf/strings/slice.pxd     |  15 +++
 .../cudf/_lib/pylibcudf/strings/slice.pyx     | 102 +++++++++++++++
 python/cudf/cudf/_lib/strings/substring.pyx   |  88 ++++---------
 .../cudf/pylibcudf_tests/test_string_slice.py | 116 ++++++++++++++++++
 10 files changed, 270 insertions(+), 63 deletions(-)
 create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/slice.rst
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/slice.pxd
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/slice.pyx
 create mode 100644 python/cudf/cudf/pylibcudf_tests/test_string_slice.py

diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst
index bfaef732555..cecf1ccc9bb 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst
@@ -6,3 +6,4 @@ strings
 
     contains
     replace
+    slice
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/slice.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/slice.rst
new file mode 100644
index 00000000000..0ee5af71c03
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/slice.rst
@@ -0,0 +1,6 @@
+=====
+slice
+=====
+
+.. automodule:: cudf._lib.pylibcudf.strings.slice
+   :members:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/scalar_factories.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/scalar_factories.pxd
index 5c4e5bf346f..c8220df8938 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/scalar_factories.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/scalar_factories.pxd
@@ -8,3 +8,4 @@ from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
 
 cdef extern from "cudf/scalar/scalar_factories.hpp" namespace "cudf" nogil:
     cdef unique_ptr[scalar] make_string_scalar(const string & _string) except +
+    cdef unique_ptr[scalar] make_fixed_width_scalar[T](T value) except +
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt
index cb7f71b1912..b499a127541 100644
--- a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt
@@ -13,7 +13,7 @@
 # =============================================================================
 
 set(cython_sources capitalize.pyx case.pyx char_types.pyx contains.pyx find.pyx regex_flags.pyx
-                   regex_program.pyx replace.pyx
+                   regex_program.pyx replace.pyx slice.pyx
 )
 
 set(linked_libraries cudf::cudf)
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd
index 959aa94737d..d1f632d6d8e 100644
--- a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd
@@ -9,4 +9,5 @@ from . cimport (
     regex_flags,
     regex_program,
     replace,
+    slice,
 )
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py
index b7384913286..ef102aff2af 100644
--- a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py
+++ b/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py
@@ -9,4 +9,5 @@
     regex_flags,
     regex_program,
     replace,
+    slice,
 )
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/slice.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/slice.pxd
new file mode 100644
index 00000000000..7d8d0006ef4
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/strings/slice.pxd
@@ -0,0 +1,15 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from cudf._lib.pylibcudf.column cimport Column
+from cudf._lib.pylibcudf.scalar cimport Scalar
+
+ctypedef fused ColumnOrScalar:
+    Column
+    Scalar
+
+cpdef Column slice_strings(
+    Column input,
+    ColumnOrScalar start=*,
+    ColumnOrScalar stop=*,
+    Scalar step=*
+)
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/slice.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/slice.pyx
new file mode 100644
index 00000000000..df75134fb71
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/strings/slice.pyx
@@ -0,0 +1,102 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+
+from cudf._lib.pylibcudf.column cimport Column
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport numeric_scalar
+from cudf._lib.pylibcudf.libcudf.scalar.scalar_factories cimport (
+    make_fixed_width_scalar as cpp_make_fixed_width_scalar,
+)
+from cudf._lib.pylibcudf.libcudf.strings cimport substring as cpp_slice
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from cudf._lib.pylibcudf.scalar cimport Scalar
+
+from cython.operator import dereference
+
+
+cpdef Column slice_strings(
+    Column input,
+    ColumnOrScalar start=None,
+    ColumnOrScalar stop=None,
+    Scalar step=None
+):
+    """Perform a slice operation on a strings column.
+
+    ``start`` and ``stop`` may be a
+    :py:class:`~cudf._lib.pylibcudf.column.Column` or a
+    :py:class:`~cudf._lib.pylibcudf.scalar.Scalar`. But ``step`` must be a
+    :py:class:`~cudf._lib.pylibcudf.scalar.Scalar`.
+
+    For details, see :cpp:func:`cudf::strings::slice_strings`.
+
+    Parameters
+    ----------
+    input : Column
+        Strings column for this operation
+    start : Union[Column, Scalar]
+        The start character position or positions.
+    stop : Union[Column, Scalar]
+        The end character position or positions
+    step : Scalar
+        Distance between input characters retrieved
+
+    Returns
+    -------
+    pylibcudf.Column
+        The result of the slice operation
+    """
+    cdef unique_ptr[column] c_result
+    cdef numeric_scalar[size_type]* cpp_start
+    cdef numeric_scalar[size_type]* cpp_stop
+    cdef numeric_scalar[size_type]* cpp_step
+
+    if input is None:
+        raise ValueError("input cannot be None")
+
+    if ColumnOrScalar is Column:
+        if step is not None:
+            raise ValueError("Column-wise slice does not support step")
+
+        if start is None or stop is None:
+            raise ValueError(
+                "start and stop must be provided for Column-wise slice"
+            )
+
+        with nogil:
+            c_result = cpp_slice.slice_strings(
+                input.view(),
+                start.view(),
+                stop.view()
+            )
+
+    elif ColumnOrScalar is Scalar:
+        if start is None:
+            start = Scalar.from_libcudf(
+                cpp_make_fixed_width_scalar(0)
+            )
+        if stop is None:
+            stop = Scalar.from_libcudf(
+                cpp_make_fixed_width_scalar(0)
+            )
+        if step is None:
+            step = Scalar.from_libcudf(
+                cpp_make_fixed_width_scalar(1)
+            )
+
+        cpp_start = <numeric_scalar[size_type]*>start.c_obj.get()
+        cpp_stop = <numeric_scalar[size_type]*>stop.c_obj.get()
+        cpp_step = <numeric_scalar[size_type]*>step.c_obj.get()
+
+        with nogil:
+            c_result = cpp_slice.slice_strings(
+                input.view(),
+                dereference(cpp_start),
+                dereference(cpp_stop),
+                dereference(cpp_step)
+            )
+    else:
+        raise ValueError("start, stop, and step must be either Column or Scalar")
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/cudf/cudf/_lib/strings/substring.pyx b/python/cudf/cudf/_lib/strings/substring.pyx
index 170c1016b89..706c21c0634 100644
--- a/python/cudf/cudf/_lib/strings/substring.pyx
+++ b/python/cudf/cudf/_lib/strings/substring.pyx
@@ -2,24 +2,16 @@
 
 import numpy as np
 
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
-
 from cudf.core.buffer import acquire_spill_lock
 
 from cudf._lib.column cimport Column
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.strings.substring cimport (
-    slice_strings as cpp_slice_strings,
-)
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 from cudf._lib.scalar import as_device_scalar
 
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport numeric_scalar
 from cudf._lib.scalar cimport DeviceScalar
 
+import cudf._lib.pylibcudf as plc
+
 
 @acquire_spill_lock()
 def slice_strings(Column source_strings,
@@ -32,30 +24,18 @@ def slice_strings(Column source_strings,
     performed in steps by skipping `step` number of
     characters in a string.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-
     cdef DeviceScalar start_scalar = as_device_scalar(start, np.int32)
     cdef DeviceScalar end_scalar = as_device_scalar(end, np.int32)
     cdef DeviceScalar step_scalar = as_device_scalar(step, np.int32)
 
-    cdef numeric_scalar[size_type]* start_numeric_scalar = \
-        <numeric_scalar[size_type]*>(
-            start_scalar.get_raw_ptr())
-    cdef numeric_scalar[size_type]* end_numeric_scalar = \
-        <numeric_scalar[size_type]*>(end_scalar.get_raw_ptr())
-    cdef numeric_scalar[size_type]* step_numeric_scalar = \
-        <numeric_scalar[size_type]*>(step_scalar.get_raw_ptr())
-
-    with nogil:
-        c_result = move(cpp_slice_strings(
-            source_view,
-            start_numeric_scalar[0],
-            end_numeric_scalar[0],
-            step_numeric_scalar[0]
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    return Column.from_pylibcudf(
+        plc.strings.slice.slice_strings(
+            source_strings.to_pylibcudf(mode="read"),
+            start_scalar.c_value,
+            end_scalar.c_value,
+            step_scalar.c_value
+        )
+    )
 
 
 @acquire_spill_lock()
@@ -67,19 +47,13 @@ def slice_from(Column source_strings,
     at given starts and stops positions. `starts` and `stops`
     here are positions per element in the string-column.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
-    cdef column_view starts_view = starts.view()
-    cdef column_view stops_view = stops.view()
-
-    with nogil:
-        c_result = move(cpp_slice_strings(
-            source_view,
-            starts_view,
-            stops_view
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    return Column.from_pylibcudf(
+        plc.strings.slice.slice_strings(
+            source_strings.to_pylibcudf(mode="read"),
+            starts.to_pylibcudf(mode="read"),
+            stops.to_pylibcudf(mode="read")
+        )
+    )
 
 
 @acquire_spill_lock()
@@ -90,8 +64,7 @@ def get(Column source_strings,
     character from each input string. The index of
     characters required can be controlled by passing `index`.
     """
-    cdef unique_ptr[column] c_result
-    cdef column_view source_view = source_strings.view()
+
     if index < 0:
         next_index = index - 1
         step = -1
@@ -102,20 +75,11 @@ def get(Column source_strings,
     cdef DeviceScalar end_scalar = as_device_scalar(next_index, np.int32)
     cdef DeviceScalar step_scalar = as_device_scalar(step, np.int32)
 
-    cdef numeric_scalar[size_type]* start_numeric_scalar = \
-        <numeric_scalar[size_type]*>(
-            start_scalar.get_raw_ptr())
-    cdef numeric_scalar[size_type]* end_numeric_scalar = \
-        <numeric_scalar[size_type]*>(end_scalar.get_raw_ptr())
-    cdef numeric_scalar[size_type]* step_numeric_scalar = \
-        <numeric_scalar[size_type]*>(step_scalar.get_raw_ptr())
-
-    with nogil:
-        c_result = move(cpp_slice_strings(
-            source_view,
-            start_numeric_scalar[0],
-            end_numeric_scalar[0],
-            step_numeric_scalar[0]
-        ))
-
-    return Column.from_unique_ptr(move(c_result))
+    return Column.from_pylibcudf(
+        plc.strings.slice.slice_strings(
+            source_strings.to_pylibcudf(mode="read"),
+            start_scalar.c_value,
+            end_scalar.c_value,
+            step_scalar.c_value
+        )
+    )
diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_slice.py b/python/cudf/cudf/pylibcudf_tests/test_string_slice.py
new file mode 100644
index 00000000000..bd63987b30f
--- /dev/null
+++ b/python/cudf/cudf/pylibcudf_tests/test_string_slice.py
@@ -0,0 +1,116 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pytest
+from utils import assert_column_eq
+
+import cudf._lib.pylibcudf as plc
+
+
+@pytest.fixture(scope="module")
+def pa_col():
+    return pa.array(["AbC", "123abc", "", " ", None])
+
+
+@pytest.fixture(scope="module")
+def plc_col(pa_col):
+    return plc.interop.from_arrow(pa_col)
+
+
+@pytest.fixture(
+    scope="module",
+    params=[(1, 3, 1), (0, 3, -1), (3, 2, 1), (1, 5, 5), (1, 100, 2)],
+)
+def pa_start_stop_step(request):
+    return tuple(pa.scalar(x, type=pa.int32()) for x in request.param)
+
+
+@pytest.fixture(scope="module")
+def plc_start_stop_step(pa_start_stop_step):
+    return tuple(plc.interop.from_arrow(x) for x in pa_start_stop_step)
+
+
+@pytest.fixture(scope="module")
+def pa_starts_col():
+    return pa.array([0, 1, 3, -1, 100])
+
+
+@pytest.fixture(scope="module")
+def plc_starts_col(pa_starts_col):
+    return plc.interop.from_arrow(pa_starts_col)
+
+
+@pytest.fixture(scope="module")
+def pa_stops_col():
+    return pa.array([1, 3, 4, -1, 100])
+
+
+@pytest.fixture(scope="module")
+def plc_stops_col(pa_stops_col):
+    return plc.interop.from_arrow(pa_stops_col)
+
+
+def test_slice(pa_col, plc_col, pa_start_stop_step, plc_start_stop_step):
+    pa_start, pa_stop, pa_step = pa_start_stop_step
+    plc_start, plc_stop, plc_step = plc_start_stop_step
+
+    def slice_string(st, start, stop, step):
+        return st[start:stop:step] if st is not None else None
+
+    expected = pa.array(
+        [
+            slice_string(x, pa_start.as_py(), pa_stop.as_py(), pa_step.as_py())
+            for x in pa_col.to_pylist()
+        ],
+        type=pa.string(),
+    )
+
+    got = plc.strings.slice.slice_strings(
+        plc_col, start=plc_start, stop=plc_stop, step=plc_step
+    )
+
+    assert_column_eq(expected, got)
+
+
+def test_slice_column(
+    pa_col, plc_col, pa_starts_col, plc_starts_col, pa_stops_col, plc_stops_col
+):
+    def slice_string(st, start, stop):
+        if stop < 0:
+            stop = len(st)
+        return st[start:stop] if st is not None else None
+
+    expected = pa.array(
+        [
+            slice_string(x, start, stop)
+            for x, start, stop in zip(
+                pa_col.to_pylist(),
+                pa_starts_col.to_pylist(),
+                pa_stops_col.to_pylist(),
+            )
+        ],
+        type=pa.string(),
+    )
+
+    got = plc.strings.slice.slice_strings(
+        plc_col, plc_starts_col, plc_stops_col
+    )
+
+    assert_column_eq(expected, got)
+
+
+def test_slice_invalid(plc_col, plc_starts_col, plc_stops_col):
+    with pytest.raises(TypeError):
+        # no maching signature
+        plc.strings.slice.slice_strings(None, pa_starts_col, pa_stops_col)
+    with pytest.raises(ValueError):
+        # signature found but wrong value passed
+        plc.strings.slice.slice_strings(plc_col, plc_starts_col, None)
+    with pytest.raises(TypeError):
+        # no matching signature (2nd arg)
+        plc.strings.slice.slice_strings(plc_col, None, plc_stops_col)
+    with pytest.raises(TypeError):
+        # can't provide step for columnwise api
+        plc.strings.slice.slice_strings(
+            plc_col, plc_starts_col, plc_stops_col, plc_starts_col
+        )

From 65b64f675d5e87e45b7350782eab88293b633a49 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Wed, 26 Jun 2024 11:55:16 -0500
Subject: [PATCH 410/842] Fix segfault in conditional join (#16094)

Closes #16066.

I found a bug that would cause the reported segfault and have fixed it in this PR. When the right table has zero rows, conditional left anti-joins were returning a vector of indices containing garbage data.

Along the way, I refactored several parts of the conditional join tests and added coverage for more cases.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Yunsong Wang (https://github.com/PointKernel)

URL: https://github.com/rapidsai/cudf/pull/16094
---
 cpp/src/join/conditional_join.cu         | 13 +---
 cpp/tests/join/conditional_join_tests.cu | 92 +++++++++++++++++-------
 2 files changed, 70 insertions(+), 35 deletions(-)

diff --git a/cpp/src/join/conditional_join.cu b/cpp/src/join/conditional_join.cu
index f02dee5f7f5..97a06d5a923 100644
--- a/cpp/src/join/conditional_join.cu
+++ b/cpp/src/join/conditional_join.cu
@@ -48,8 +48,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> conditional_join_anti_semi(
 {
   if (right.num_rows() == 0) {
     switch (join_type) {
-      case join_kind::LEFT_ANTI_JOIN:
-        return std::make_unique<rmm::device_uvector<size_type>>(left.num_rows(), stream, mr);
+      case join_kind::LEFT_ANTI_JOIN: return get_trivial_left_join_indices(left, stream, mr).first;
       case join_kind::LEFT_SEMI_JOIN:
         return std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr);
       default: CUDF_FAIL("Invalid join kind."); break;
@@ -96,10 +95,6 @@ std::unique_ptr<rmm::device_uvector<size_type>> conditional_join_anti_semi(
     join_size = size.value(stream);
   }
 
-  if (left.num_rows() == 0) {
-    return std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr);
-  }
-
   rmm::device_scalar<size_type> write_index(0, stream);
 
   auto left_indices = std::make_unique<rmm::device_uvector<size_type>>(join_size, stream, mr);
@@ -149,8 +144,7 @@ conditional_join(table_view const& left,
       // with a corresponding NULL from the right.
       case join_kind::LEFT_JOIN:
       case join_kind::LEFT_ANTI_JOIN:
-      case join_kind::FULL_JOIN:
-        return get_trivial_left_join_indices(left, stream, rmm::mr::get_current_device_resource());
+      case join_kind::FULL_JOIN: return get_trivial_left_join_indices(left, stream, mr);
       // Inner and left semi joins return empty output because no matches can exist.
       case join_kind::INNER_JOIN:
       case join_kind::LEFT_SEMI_JOIN:
@@ -169,8 +163,7 @@ conditional_join(table_view const& left,
                          std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr));
       // Full joins need to return the trivial complement.
       case join_kind::FULL_JOIN: {
-        auto ret_flipped =
-          get_trivial_left_join_indices(right, stream, rmm::mr::get_current_device_resource());
+        auto ret_flipped = get_trivial_left_join_indices(right, stream, mr);
         return std::pair(std::move(ret_flipped.second), std::move(ret_flipped.first));
       }
       default: CUDF_FAIL("Invalid join kind."); break;
diff --git a/cpp/tests/join/conditional_join_tests.cu b/cpp/tests/join/conditional_join_tests.cu
index 79968bcd7f4..7ab4a2ea465 100644
--- a/cpp/tests/join/conditional_join_tests.cu
+++ b/cpp/tests/join/conditional_join_tests.cu
@@ -20,6 +20,7 @@
 
 #include <cudf/ast/expressions.hpp>
 #include <cudf/column/column_view.hpp>
+#include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/join.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
@@ -222,21 +223,25 @@ struct ConditionalJoinPairReturnTest : public ConditionalJoinTest<T> {
              std::vector<std::pair<cudf::size_type, cudf::size_type>> expected_outputs)
   {
     auto result_size = this->join_size(left, right, predicate);
-    EXPECT_TRUE(result_size == expected_outputs.size());
-
-    auto result = this->join(left, right, predicate);
-    std::vector<std::pair<cudf::size_type, cudf::size_type>> result_pairs;
-    for (size_t i = 0; i < result.first->size(); ++i) {
-      // Note: Not trying to be terribly efficient here since these tests are
-      // small, otherwise a batch copy to host before constructing the tuples
-      // would be important.
-      result_pairs.push_back({result.first->element(i, cudf::get_default_stream()),
-                              result.second->element(i, cudf::get_default_stream())});
-    }
+    EXPECT_EQ(result_size, expected_outputs.size());
+
+    auto result     = this->join(left, right, predicate);
+    auto lhs_result = cudf::detail::make_std_vector_sync(*result.first, cudf::get_default_stream());
+    auto rhs_result =
+      cudf::detail::make_std_vector_sync(*result.second, cudf::get_default_stream());
+    std::vector<std::pair<cudf::size_type, cudf::size_type>> result_pairs(lhs_result.size());
+    std::transform(lhs_result.begin(),
+                   lhs_result.end(),
+                   rhs_result.begin(),
+                   result_pairs.begin(),
+                   [](cudf::size_type lhs, cudf::size_type rhs) {
+                     return std::pair{lhs, rhs};
+                   });
     std::sort(result_pairs.begin(), result_pairs.end());
     std::sort(expected_outputs.begin(), expected_outputs.end());
 
-    EXPECT_TRUE(std::equal(expected_outputs.begin(), expected_outputs.end(), result_pairs.begin()));
+    EXPECT_TRUE(std::equal(
+      expected_outputs.begin(), expected_outputs.end(), result_pairs.begin(), result_pairs.end()));
   }
 
   /*
@@ -411,6 +416,11 @@ TYPED_TEST(ConditionalInnerJoinTest, TestOneColumnLeftEmpty)
   this->test({{}}, {{3, 4, 5}}, left_zero_eq_right_zero, {});
 };
 
+TYPED_TEST(ConditionalInnerJoinTest, TestOneColumnRightEmpty)
+{
+  this->test({{3, 4, 5}}, {{}}, left_zero_eq_right_zero, {});
+};
+
 TYPED_TEST(ConditionalInnerJoinTest, TestOneColumnTwoRowAllEqual)
 {
   this->test({{0, 1}}, {{0, 0}}, left_zero_eq_right_zero, {{0, 0}, {0, 1}});
@@ -600,6 +610,14 @@ TYPED_TEST(ConditionalLeftJoinTest, TestOneColumnLeftEmpty)
   this->test({{}}, {{3, 4, 5}}, left_zero_eq_right_zero, {});
 };
 
+TYPED_TEST(ConditionalLeftJoinTest, TestOneColumnRightEmpty)
+{
+  this->test({{3, 4, 5}},
+             {{}},
+             left_zero_eq_right_zero,
+             {{0, JoinNoneValue}, {1, JoinNoneValue}, {2, JoinNoneValue}});
+};
+
 TYPED_TEST(ConditionalLeftJoinTest, TestCompareRandomToHash)
 {
   auto [left, right] = gen_random_repeated_columns<TypeParam>();
@@ -666,6 +684,14 @@ TYPED_TEST(ConditionalFullJoinTest, TestOneColumnLeftEmpty)
              {{JoinNoneValue, 0}, {JoinNoneValue, 1}, {JoinNoneValue, 2}});
 };
 
+TYPED_TEST(ConditionalFullJoinTest, TestOneColumnRightEmpty)
+{
+  this->test({{3, 4, 5}},
+             {{}},
+             left_zero_eq_right_zero,
+             {{0, JoinNoneValue}, {1, JoinNoneValue}, {2, JoinNoneValue}});
+};
+
 TYPED_TEST(ConditionalFullJoinTest, TestTwoColumnThreeRowSomeEqual)
 {
   this->test({{0, 1, 2}, {10, 20, 30}},
@@ -705,20 +731,16 @@ struct ConditionalJoinSingleReturnTest : public ConditionalJoinTest<T> {
     auto [left_wrappers, right_wrappers, left_columns, right_columns, left, right] =
       this->parse_input(left_data, right_data);
     auto result_size = this->join_size(left, right, predicate);
-    EXPECT_TRUE(result_size == expected_outputs.size());
-
-    auto result = this->join(left, right, predicate);
-    std::vector<cudf::size_type> resulting_indices;
-    for (size_t i = 0; i < result->size(); ++i) {
-      // Note: Not trying to be terribly efficient here since these tests are
-      // small, otherwise a batch copy to host before constructing the tuples
-      // would be important.
-      resulting_indices.push_back(result->element(i, cudf::get_default_stream()));
-    }
-    std::sort(resulting_indices.begin(), resulting_indices.end());
+    EXPECT_EQ(result_size, expected_outputs.size());
+
+    auto result         = this->join(left, right, predicate);
+    auto result_indices = cudf::detail::make_std_vector_sync(*result, cudf::get_default_stream());
+    std::sort(result_indices.begin(), result_indices.end());
     std::sort(expected_outputs.begin(), expected_outputs.end());
-    EXPECT_TRUE(
-      std::equal(resulting_indices.begin(), resulting_indices.end(), expected_outputs.begin()));
+    EXPECT_TRUE(std::equal(result_indices.begin(),
+                           result_indices.end(),
+                           expected_outputs.begin(),
+                           expected_outputs.end()));
   }
 
   void _compare_to_hash_join(std::unique_ptr<rmm::device_uvector<cudf::size_type>> const& result,
@@ -826,6 +848,16 @@ struct ConditionalLeftSemiJoinTest : public ConditionalJoinSingleReturnTest<T> {
 
 TYPED_TEST_SUITE(ConditionalLeftSemiJoinTest, cudf::test::IntegralTypesNotBool);
 
+TYPED_TEST(ConditionalLeftSemiJoinTest, TestOneColumnLeftEmpty)
+{
+  this->test({{}}, {{3, 4, 5}}, left_zero_eq_right_zero, {});
+};
+
+TYPED_TEST(ConditionalLeftSemiJoinTest, TestOneColumnRightEmpty)
+{
+  this->test({{3, 4, 5}}, {{}}, left_zero_eq_right_zero, {});
+};
+
 TYPED_TEST(ConditionalLeftSemiJoinTest, TestTwoColumnThreeRowSomeEqual)
 {
   this->test({{0, 1, 2}, {10, 20, 30}}, {{0, 1, 3}, {30, 40, 50}}, left_zero_eq_right_zero, {0, 1});
@@ -873,6 +905,16 @@ struct ConditionalLeftAntiJoinTest : public ConditionalJoinSingleReturnTest<T> {
 
 TYPED_TEST_SUITE(ConditionalLeftAntiJoinTest, cudf::test::IntegralTypesNotBool);
 
+TYPED_TEST(ConditionalLeftAntiJoinTest, TestOneColumnLeftEmpty)
+{
+  this->test({{}}, {{3, 4, 5}}, left_zero_eq_right_zero, {});
+};
+
+TYPED_TEST(ConditionalLeftAntiJoinTest, TestOneColumnRightEmpty)
+{
+  this->test({{3, 4, 5}}, {{}}, left_zero_eq_right_zero, {0, 1, 2});
+};
+
 TYPED_TEST(ConditionalLeftAntiJoinTest, TestTwoColumnThreeRowSomeEqual)
 {
   this->test({{0, 1, 2}, {10, 20, 30}}, {{0, 1, 3}, {30, 40, 50}}, left_zero_eq_right_zero, {2});

From f1efa40fb73fcd3f50813f5426064e5f2f1d48cc Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 26 Jun 2024 07:49:20 -1000
Subject: [PATCH 411/842] Reduce/clean copy usage in Series, reshaping (#16080)

* Clean up copy usages in `concat`
* Avoid always shallow copying in `unstack`
* Don't extra copy pandas objects in the Series constructor

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16080
---
 python/cudf/cudf/core/reshape.py | 62 +++++++++++---------------------
 python/cudf/cudf/core/series.py  |  4 ++-
 2 files changed, 24 insertions(+), 42 deletions(-)

diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py
index 903c4fe7df5..1120642947b 100644
--- a/python/cudf/cudf/core/reshape.py
+++ b/python/cudf/cudf/core/reshape.py
@@ -300,51 +300,31 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
         obj = objs[0]
         if ignore_index:
             if axis == 1:
-                result = cudf.DataFrame._from_data(
-                    data=obj._data.copy(deep=True),
-                    index=obj.index.copy(deep=True),
-                )
-                # The DataFrame constructor for dict-like data (such as the
-                # ColumnAccessor given by obj._data here) will drop any columns
-                # in the data that are not in `columns`, so we have to rename
-                # after construction.
-                result.columns = pd.RangeIndex(len(obj._data.names))
-            else:
                 if isinstance(obj, cudf.Series):
-                    result = cudf.Series._from_data(
-                        data=obj._data.copy(deep=True),
-                        index=cudf.RangeIndex(len(obj)),
-                    )
-                elif isinstance(obj, pd.Series):
-                    result = cudf.Series(
-                        data=obj,
-                        index=cudf.RangeIndex(len(obj)),
-                    )
+                    result = obj.to_frame()
                 else:
-                    result = cudf.DataFrame._from_data(
-                        data=obj._data.copy(deep=True),
-                        index=cudf.RangeIndex(len(obj)),
-                    )
+                    result = obj.copy(deep=True)
+                result.columns = pd.RangeIndex(len(result._data))
+            else:
+                result = type(obj)._from_data(
+                    data=obj._data.copy(deep=True),
+                    index=cudf.RangeIndex(len(obj)),
+                )
+        elif axis == 0:
+            result = obj.copy(deep=True)
         else:
-            if axis == 0:
-                result = obj.copy()
+            if isinstance(obj, cudf.Series):
+                result = obj.to_frame()
             else:
-                data = obj._data.copy(deep=True)
-                if isinstance(obj, cudf.Series) and obj.name is None:
-                    # If the Series has no name, pandas renames it to 0.
-                    data[0] = data.pop(None)
-                result = cudf.DataFrame._from_data(
-                    data, index=obj.index.copy(deep=True)
+                result = obj.copy(deep=True)
+            if keys is not None and isinstance(result, cudf.DataFrame):
+                k = keys[0]
+                result.columns = cudf.MultiIndex.from_tuples(
+                    [
+                        (k, *c) if isinstance(c, tuple) else (k, c)
+                        for c in result._column_names
+                    ]
                 )
-                if keys is not None:
-                    if isinstance(result, cudf.DataFrame):
-                        k = keys[0]
-                        result.columns = cudf.MultiIndex.from_tuples(
-                            [
-                                (k, *c) if isinstance(c, tuple) else (k, c)
-                                for c in result._column_names
-                            ]
-                        )
 
         if isinstance(result, cudf.Series) and axis == 0:
             # sort has no effect for series concatted along axis 0
@@ -1179,7 +1159,6 @@ def unstack(df, level, fill_value=None):
     if pd.api.types.is_list_like(level):
         if not level:
             return df
-    df = df.copy(deep=False)
     if not isinstance(df.index, cudf.MultiIndex):
         dtype = df._columns[0].dtype
         for col in df._columns:
@@ -1195,6 +1174,7 @@ def unstack(df, level, fill_value=None):
         )
         return res
     else:
+        df = df.copy(deep=False)
         columns = df.index._poplevels(level)
         index = df.index
     result = _pivot(df, index, columns)
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 15ad0813601..ea25d482578 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -584,7 +584,7 @@ def __init__(
             data = {}
 
         if isinstance(data, (pd.Series, pd.Index, BaseIndex, Series)):
-            if copy:
+            if copy and not isinstance(data, (pd.Series, pd.Index)):
                 data = data.copy(deep=True)
             name_from_data = data.name
             column = as_column(data, nan_as_null=nan_as_null, dtype=dtype)
@@ -3434,6 +3434,7 @@ def rename(self, index=None, copy=True):
     @_cudf_nvtx_annotate
     def add_prefix(self, prefix):
         return Series._from_data(
+            # TODO: Change to deep=False when copy-on-write is default
             data=self._data.copy(deep=True),
             index=prefix + self.index.astype(str),
         )
@@ -3441,6 +3442,7 @@ def add_prefix(self, prefix):
     @_cudf_nvtx_annotate
     def add_suffix(self, suffix):
         return Series._from_data(
+            # TODO: Change to deep=False when copy-on-write is default
             data=self._data.copy(deep=True),
             index=self.index.astype(str) + suffix,
         )

From e7cf69dc932636e9dfd32ea1bebefddc3f31d2f2 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Wed, 26 Jun 2024 13:28:15 -0500
Subject: [PATCH 412/842] Implement chunked column wise concat in chunked
 parquet reader (#16052)

This PR implements column wise concat in chunked parquet reader which prevents over-utilizing memory for the regular concat.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/16052
---
 python/cudf/cudf/_lib/parquet.pyx      | 36 +++++++++++++++++++++-----
 python/cudf/cudf/io/parquet.py         | 20 +++++++++-----
 python/cudf/cudf/tests/test_parquet.py | 11 ++++++++
 3 files changed, 54 insertions(+), 13 deletions(-)

diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
index 7914ed7e9d9..d1ec5be9e62 100644
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ b/python/cudf/cudf/_lib/parquet.pyx
@@ -20,6 +20,7 @@ from cudf.api.types import is_list_like
 
 from cudf._lib.utils cimport data_from_unique_ptr
 
+from cudf._lib import pylibcudf
 from cudf._lib.utils import _index_level_name, generate_pandas_metadata
 
 from libc.stdint cimport uint8_t
@@ -70,8 +71,11 @@ from cudf._lib.utils cimport table_view_from_table
 
 from pyarrow.lib import NativeFile
 
+from cudf._lib.concat import concat_columns
 from cudf.utils.ioutils import _ROW_GROUP_SIZE_BYTES_DEFAULT
 
+from cudf._lib.utils cimport data_from_pylibcudf_table
+
 
 cdef class BufferArrayFromVector:
     cdef Py_ssize_t length
@@ -878,14 +882,32 @@ cdef class ParquetReader:
         return df
 
     def read(self):
-        dfs = []
+        dfs = self._read_chunk()
+        column_names = dfs._column_names
+        concatenated_columns = list(dfs._columns)
+        del dfs
         while self._has_next():
-            dfs.append(self._read_chunk())
-        df = cudf.concat(dfs)
-        df = _process_metadata(df, self.result_meta, self.names, self.row_groups,
-                               self.filepaths_or_buffers, self.pa_buffers,
-                               self.allow_range_index, self.cpp_use_pandas_metadata)
-        return df
+            new_chunk = list(self._read_chunk()._columns)
+            for i in range(len(column_names)):
+                concatenated_columns[i] = concat_columns(
+                    [concatenated_columns[i], new_chunk[i]]
+                )
+                # Must drop any residual GPU columns to save memory
+                new_chunk[i] = None
+
+        dfs = cudf.DataFrame._from_data(
+            *data_from_pylibcudf_table(
+                pylibcudf.Table(
+                    [col.to_pylibcudf(mode="read") for col in concatenated_columns]
+                ),
+                column_names=column_names,
+                index_names=None
+                )
+            )
+
+        return _process_metadata(dfs, self.result_meta, self.names, self.row_groups,
+                                 self.filepaths_or_buffers, self.pa_buffers,
+                                 self.allow_range_index, self.cpp_use_pandas_metadata)
 
 cpdef merge_filemetadata(object filemetadata_list):
     """
diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
index 58b104b84e9..2a838ca7417 100644
--- a/python/cudf/cudf/io/parquet.py
+++ b/python/cudf/cudf/io/parquet.py
@@ -908,12 +908,20 @@ def _read_parquet(
                 "cudf engine doesn't support the "
                 f"following positional arguments: {list(args)}"
             )
-        return libparquet.read_parquet(
-            filepaths_or_buffers,
-            columns=columns,
-            row_groups=row_groups,
-            use_pandas_metadata=use_pandas_metadata,
-        )
+        if cudf.get_option("mode.pandas_compatible"):
+            return libparquet.ParquetReader(
+                filepaths_or_buffers,
+                columns=columns,
+                row_groups=row_groups,
+                use_pandas_metadata=use_pandas_metadata,
+            ).read()
+        else:
+            return libparquet.read_parquet(
+                filepaths_or_buffers,
+                columns=columns,
+                row_groups=row_groups,
+                use_pandas_metadata=use_pandas_metadata,
+            )
     else:
         if (
             isinstance(filepaths_or_buffers, list)
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index e1e7952605b..588bc87d268 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -3485,3 +3485,14 @@ def test_parquet_chunked_reader(
     )
     actual = reader.read()
     assert_eq(expected, actual)
+
+
+def test_parquet_reader_pandas_compatibility():
+    df = pd.DataFrame(
+        {"a": [1, 2, 3, 4] * 10000, "b": ["av", "qw", "hi", "xyz"] * 10000}
+    )
+    buffer = BytesIO()
+    df.to_parquet(buffer)
+    with cudf.option_context("mode.pandas_compatible", True):
+        expected = cudf.read_parquet(buffer)
+    assert_eq(expected, df)

From 7ca4f480cc93d333398d2c5dd32802a60ebb5366 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Wed, 26 Jun 2024 15:54:15 -0500
Subject: [PATCH 413/842] Explain line profiler and how to know which functions
 are GPU-accelerated. (#16079)

The `cudf.pandas` docs could be more explicit about which functions are accelerated. The answer is "most of the cudf API is accelerated, and you can check the results with the profiler." I've answered this question a few times so I wanted to document it.

This closes #16074.

Authors:
  - Bradley Dice (https://github.com/bdice)
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16079
---
 .../_static/cudf-pandas-line-profile.png      | Bin 0 -> 15125 bytes
 docs/cudf/source/cudf_pandas/faq.md           |  16 +++++++++
 docs/cudf/source/cudf_pandas/usage.md         |  34 ++++++++++++++++--
 .../cudf/source/user_guide/api_docs/index.rst |   2 ++
 4 files changed, 49 insertions(+), 3 deletions(-)
 create mode 100644 docs/cudf/source/_static/cudf-pandas-line-profile.png

diff --git a/docs/cudf/source/_static/cudf-pandas-line-profile.png b/docs/cudf/source/_static/cudf-pandas-line-profile.png
new file mode 100644
index 0000000000000000000000000000000000000000..1d5a07c72eb06ab9270ab13e06aa31d55d4981ca
GIT binary patch
literal 15125
zcmeIZcT|&Y_vecR5l~TSBJCka7m?mU)KH`e2uN2zy7XREdXN&3-b9)ZYC>-jX`xFe
zRDsX~NC`+nnFOBaecy3@bIw|4%{lALTJs0FNyyE8wY~RefA@8TKUY(@PIi}!h=}OA
zlHyZMBBD!G_}|f2FXCVO-xRmtFBe=i6&@3n^f9dB-(0qoQ<WnkDvu^VdQFUfPwJ?s
z=R!nu+m-NhVa9^llZZ&7Q0b}M3s2*X^waKp9e&GOCSKo5ZZ~+c4C=qMLf*L*#9Hd0
zS{68Wf3A)54rBGjtV%DH+nN1o->#V5dj0(w?cS3wj@Z}1U!)T_?*J}pB;4dnyBA0)
zseF;1-Y(eX@~!R@=oqFzU0qjuEU_ecHN!i@tI?CS*Hu?rSG!*jlak?|ajbrTZyXWP
zw+p~4gzK+I;0uJS)N^yfg@}TRj&S*Q>5o5u0r_I)<-T27-jn+{l#h}7IP0y#+=>{L
zI(R1&<|Kd0ypToCJf^_!EKJAVBJ@xt#QIgzHCW(lcl@tNr=n1=?ZUfd<RJ9-hje-G
zUkj*QYB5h@Bz!@jC(2A(g-Y*dTk}K7JO93S=-a~s&m;_pq4)xEk~9&~IIpa4pZ>Sn
zW&f`Rua-ZPY=EUXICn;0?-LQds{V|){(-521k`Z9hY3GPG*q>6KLYn3ffIv~mx$g?
ze*lPH_5Q)cxdVOZLqt?&@EmnTj$+~x(3r-8f{4hS95nUUik#19ME7^}1)^{Ea$x$)
z(pD5yUV7&b=5!9kKTo3+5sIZv>}OcJ^7Y->zjfaDuA~>PW?Yl^KWmXfBVgC)8~gcF
zYX-g$TJ;q^@Bvms`afx?`ut|UvEL4*dJ`5_k8NlTmHoYB7A<`cNP)3tTWO}U=~sH|
zm1#kB7&_~RX63lCi5m~_SxBr~d@0b6-snW(+fVZd%<=1FBqG0FTw)th&7n3Zd$R2?
zN!H&8i;BrYyeaJO3~0cNgsQm;tC=)lo9qi;-vW+iwxcOnL20Jn(OC#0qKEOq)Q6k>
zCu@g%CrhOs-MwViSq^HoTiFx=*koOQbH2Q_FUuuf?poX9$Ijvaca=7gi7njl!HKp$
zcCrPxSj!|Z>NuWSza@2`DBAOEdPIMn6nWSC)$d;lrBb`Ap|akY!;M>Jnf^;70Rqx}
z>6FvUQF@~BQH>|!tB{PkJwNN!M7{u>jCGc5?ZBBL549`B#tU)E`)T(J5Iz@SBt%5@
zxwNq<Zi~n8ntNjlzJoTd^Cy-~vs~d1yf;%UMX95w`4U+NJBH<b*kqix3&CFkj`mr1
zDrC)z=01y_9yO{LvzT2m*!{W3Cit6s^hcoDX+}eUt7gvoDe>@^!S>vYKF2OlO;hPL
zdUYH42EG-?GerSvoBpS<iaYsD2mN(#xREZ<FaQbVg0OA0+jcy2<IbWwVtEuNvoAU8
zV$tEBQ<<5uiCb~-uRRTW@%cy5<XU*NukUYEC2^IMPi-epUxx4gQ4M-E<WREzcqB!1
z9dC6SUrZ$CA<iF7j2k_>hQ+pj#Fb7RRliaiUP6<(9?Uv4dU1|vwmo+naM9(cx5kz?
zE<G`g?5xttS3@c%$oHv9{eETHOA)Y}KC(Hry>-&2Wl=`Kbn>=_^NROPBhk~717@Lm
zX1}Jv4%a;Hy`PBzSTDi;bP=W|ul3pbsHEC}6Zp}AXq{iJ%NN`f9iq=EXw3yc-6LK;
zw%?ndM8YgYiTMq&xDwGZi%wfnN%vmkldp%G_bai2eCLB2u6i>a9mA`AFl)8l74$El
zz4H1-TNnL4@%9_r@9eao;S|o|$M-~Sh=>?`u(_<+;WDS#M)l+!4jG&?`pU((_AlSN
zwy@5gboiflZ>PMYx4!jm+*?`-N%DHQU?GH7J<WUV&>=;{jxu*4_3pq$xbH$InCYoE
zroUg-W`D2RO9g1j&EIzw85b+bf&Ibw$~}tpMJ8o|<L0klFcOkXzr(~OWOl!-Ll@FC
zcfrA1ZMY0KDU}y1%@HwUbu0Dr$;)YloK3UlX%Cd}OtqP8LAuX=Zva2DmF!WXXr{1F
zzRIIKz4Z1bWYfqAE+sPAdjCsl$rKtH{UTomr?54lp0RZ@VA)$^8<2B>=qDmeS^l-Q
z`;W!e`gu$K?x^aEZtI=OnY6ax#h12qkhp1~!|`q65{|*7W3|)yku4L65nh`B=hYY0
z@8GO&$Jw!!18r}oh9w`m7x(L~^j^rsnVeeW&9wy7A7Zyo_jP;OCW#th*;;bb55t=b
zww>gEkZrf1`ocF|ICv#HZrSvQHy$odxy)`=oDAp6uAWS8Bsf<kRV~knlx>R-d;rrp
zpU>Uewl0P>R&uMHR2UTvFi)w>mzyQI-1t6kAExjj*@92R3yECE-O?Q3BO(%u-CJJ{
z!VG><4l!bB7#XtzJ?Oyl*cz)w)YGW4)P2J|y$f69GmhtprjR`<11tKpjXc;W$Mjnp
z()uY}^4^1PG~>-+KBE5^Ig4n#u3@9Vxi<}p7W|+Sme@~l1N(gEO`jfpD?r^V&R@d)
z{mqttxZ9ZZEAr(W_n^o7X+mERPoxjLW}{<{G=fxMb0s&VpHDaBt@t!9A|o<zJr667
zk~TixCL)^1HMs~~2PvB~x+X9N0Kq16UaMPsAwLv*9p7@_GWzKah*a`2GcHu$!F>G$
zYa{FEXV~2AFOfD2sO)f_Kbdt8XtoS~Yl^)}oL<l>#%dKDChC4RzPLm>*H3nb<CnWd
zhO~WH25VDz-%>?oBBe2DR;BFG&k-4MdDO_0E*nhmScnR2U$eo-AlTotpg|(jG!Gfk
z<SWoJET?@cfe*b5Uap;pQl1C;%|=e?Z#e2Ma&7tUzJ~R0i#1+<jrH@ZT~Z)73T4o&
zQdIT%i_0<sSXi@yO=?n|--OR%Q)PWNBP=&&^&%?GQ~_Q=VpRsz-k$?%7KYm(;EW~q
zjZH{&svmYT;={)FUTL>&f4FoG*MqVBjrVq}gYs34#c7m&$lR!#96o9(I_8aDlSkgg
zQ|bnCd3w?mO9xLj8WE-ACNq-9JqG$U3&#+XlX1W4d}v3L(Xq0xV6XAsNPMbvu_IPi
z*3x0|3eh`1Io)?hb@N&95AKfG-Tu?a`*%G8ev!;KSs=qp9z8pid=u@v8*;SN-G9;_
z5HI_?4BLhiIPahxStvVwF_nCwXU>+-@ft^_SHbCn(-wM@I8w1~)JmT&qs!5b{q4!(
zyV5`4=`1|8dtJ5EqLS9$%B!mprQ@@U*a45e2z9>`_52xWBOf)p!_PI-oI{`9mYOZN
zeBrHOCTEY4<}lDf5luew&Fp(C)_=0g?C-NDN|)!xGiHb8lfE2q#dl@L&!nkjG7mb9
z>{c`O-#3L5uYDpWdbjWl?*5`-dDd@vGI~PURmOsMaA|&B-?h8|TP^&hw<A%OE~dzB
z?g#oJ=i9yYvIql1zHgCiGCKS_BrzDxNp!#ffaQMGd$f4?n$}3>W}!5PyY3JA`T->K
zPpa9yv^`O#7biM;_?2CPn9eyKg0#T7&5{R;AH$eWR^s^%PW^xQF(3Ex$uOUQ8`BMe
zH)5_p<8;#+4^AbpTlerGzdX-Ga&2aJgZc9YZ6qc0AtqGzxJOxb<%Mpla<T2qGe)Bq
zyH8W2`2s8+?MogzT2(bv43xG!Fy21l3knu>p1~Vq<M65NMT?_t@HqJ<wXS)*@ZRSZ
zNv=K=O<y<%k#oNDv@DT^m)f3xYMH2jIg+_}jRqeC_5!rMu)4GfXmMl$poQ#=rj+^a
zWhs34Lg~7hIx^Ng6M%67mv75DX<U>>z)r*m;9jY}Av5C7QE&c1H#V%nox;tlbT%WW
zyF)8-U`(IV=~i>`cE5TyLb>H{BIEc08Oc7~Jv|Av@7f(YJti(FTaxXFVER-b!*RNO
zaB9X_yjh@&Y7wkzPku5@M1782B}QUgJOiENW3J>+YjEm(0dth<Y95e-Q|zyDVSfcB
zIf}gJK591IIP-o|P=nhaxaUh-_BW>4S6JQL`z*cYQ1SD={W*RR1MLYP{YO9k7a(X+
zDdG~vcxI(oelI*IcD|Zvh=d<1+%Nr^!gPfh8nN>uIC?$%PHoDyxYu|8W@Y3#HlxKx
zTb_c6jTz3E`V6nHgmgZ<e$6F1G#o(QAa=)s1drV3pAqg}Vxn-V0~jbRStPri&qPu^
zxR$b(ACIVrG)gM3EB`qf?*V+|XVmroO1Fo3cIJ`;0|i5|m$}c|Vp#9E^Ai<pi@laY
zByAPlBQ6KF{A{uBj~Vk1zWMlS@*N{75CQrPEB{99;U42(GYVH(g3P}}DBL2<7pm}|
z`2wk2WiiiKnb)|CpELZ&_&E~?fqV}!lO8fPUTjnTi%TRPn|rl~kIOtPYn>PR(hbJ=
zqO0OedxocLZrgn_Z1PuD+p{udKJD?5_217Im2v5u7UPO;7|}l3UYNaJ?GQ_AUt%0n
zu8ZDQ40v#_+5|4eUFJV;Ih0m#U|ctg6u4pO(reylu@l<IzCBckZy-KFiGdIjl*JlO
z=x?Vu_PS4iRJ!q!&u9_n1P7A<iK!Vj!Q2flXWy$G{F4F1Rr!cC)mw`z>a#;*4hn_;
z?n{D}xmKxJzX?&tQ}E(NbxGZ*X%%hrt-B+?Z>#aIlY)FJ##d@~?|D2{&5T>tpIV7j
zke`gGJsqK6uXSS=FTkQ`{ZA%DH?1?g`N!*(n)ZgPCZ%vo-H;Ohg=G^-N$$UpO@NO6
zB`Totj-3pyvb2c310}f}cri6Epr8SlJ5Yjc_|2FPu}xPMqzI2#0W%xmXQn70kv9q|
ztTY@8V3D$~)2`MD)g6G0+**$^V0RiGzxO+oQ8=x@dH}n6k<@07D1pUA4d20?54E1!
zr}^{;aVbcb+28KTK>$(j_xbYRbQT0#a!PmqI*&7@8X8#gqRpZFN3!@l=swNlWH|MY
z;yW9M9>9U8fmOMPJfA>CFX;nVDG1%Rh8E%V`t1v)2JS;gkv)BZUiG^(X>Qw`rb4Bf
zbLJPe)?ccoRV5t0X|%A>zs~Lbl-mNNyz0_#RC&6qywc=v_QSEnBAQ`5hC8NpI87wC
zj4Ig$$_5lE<uWif7WX;1_%*q4q<-ds=}h8uM7rb)b5=)-?K^c;O#1iyM@A~C^r~wh
zpY$qp-j*=%OX)Csc*VIvqVqGvi&qzC#7jzxw=E*SyI(4=><Oz#k|f=~^sd*_dB;+)
z6w&E>5NR1;)8v~tEK!k;S_@>q;ykJtZK)yuHg0S;uU*KN*9HP&cq(zaLz{8En(x3#
znfh>0SslYsu@eH?$mo4&v6H<EGyd4-zu3!1w>#4WP%sAh4O+T<kIvW9pp6Dt7?o&>
zfuA^R2t*)1vCA)}<rV5P(z;(=dy}&2j+7oWe$F|WWv2?HM=;1n7JRzPj1K7$M`9Zh
zc}q=jA{&c-FJ?m#XwG$quALWDNMT-4YFd*q#G0u7i=jYr4siZD7t~|Y<iOJk?~geW
zD@uMnAM6`{RVQAoM^~nEKnpjAEp2a1R@apyEW-_|d)+-;m$nOcH}41s)5Vz`n(^fg
z6;wCP$X0sPjJ|XZXg^lJy4d>W@dKf{y6o9uhkk<TI7p3{+$(8%)i4bc@6w^JJw*$7
z_u6qei&PgP=F}@5FabYyj&M#XGQMe{x-Mj!q5JIVHxZq&wXYQs9!Ykc)SBF4!&U|r
z1us=!YkLc=aSg9rxc_7+?K48~xu6e3v^kivNsj{5(A7H9-FtYzbUfPV)R)d2f6XlM
zW&Wj-ipb0MTEH9fh5Rn_rQ2G>ZKA&odAU~Im9TL)BeIZkiRmY)14wH+;oE&Lt^Zo*
zbiP^p#n5Gye?Qs$@jRchkI}C<?C{-@i1yfO_DUCS=u9{Ngv>E@jWueMd@LM(@>@ON
z2>t8U*QDZ%-oKpJzgHQ$mu+}a>B*#C=L7Ps!mRy?Ty}rLW!L95T5Aj=zbD7rayn#o
zZd}P-?9o;68?DwEve8t&o>!>VX<cCH90Aqtp6pmC3~xeM7)&rDpQ_|4%DxUY?;{0C
zT@r?gHjq9@8R)8D1RBelX)z?kIn9bZM?JEfMqy*3XP;<w8*0!C?6LUmDTY{(1jnmQ
zl&21Kwuk-1O7qb<qR_euP1&=XnUZ1bCr}0|uNwGc^Tn&2(26w2%7W5ws)BX2dQCK(
z)9DRf&JF^m?IQHE4~IHN=8L(ZI9zb2h@hp`jL>c7c-4mR!M5zM^4hz>f;@@5vQo|;
zAND@y1jJD4$LDvgyv$25<uRz}E9<ns3hCCVo&n8$9+(bKX}NRM5yu}dZx)muD~{KM
zt_}BO?6TLG%kvD)a6%cZMpC%SZtHcv04A{Yb{B>w>jQymLaDl_ZO(g7m<K`kT~g6+
zS)5k~x}&I(@cKPPPAFy}=i-vF@??)9nAKoO@yISj+}7S`{5win5(2@vOk1(ck`}p4
zems8tF#;;8QRhj;AFG?~Xt79E`U}LFkRZrwzJAcLo;gx!^DJM2w-Rf!)5>?pR#>yB
zrOI5i`xa|s_{hiTGBPbL3FmRTWS0+s{ztuNafc7e78L?-6~Lkm1E!=v8j?y90iS7V
zo-q>H!LXqV@X*@ehY-Z71*8;!OJ3l1{ylPJUkZrWylmn~VN&40>cR;%_Wn3y`Z~^s
z_pRVI1VJsTUzA+nATap@`HOYP5X3OEFL=*IM44kEl@sr#?{q>m#CW3osbMMclQC|y
zCi^-3u&k<)sV41g#s%li873r+5J#h1HSz(CZHPh-Mku_1ih<N!<sU2QOZ^#3ny!Sp
zACpr68PgFM)H+y^Q_tDW0?6(qM&Dr!*t+1cV0Cr=gTyazaTK*z1JE=D^U*~Tlx8hb
zig*L&rsjOM&zbF`4WUL#(^Au=Yo)@nXThK&AUUo}sW(8z8d!BHyg_Y;3+I$L91fE$
zo5XEjqt@$N(khi{;(p%c-oM1jpI$7Lx<dx0XKDBH;H+9g?rD}Ha=q@%kg2%hUH!e%
zXLIdQ)--}}Qb^n!nJD+NkZ`Vc=#*M@fpzb*g7K^J!Yzf_DVm&Wh45P{JTv95m;ci$
z=sPM<<%g`K&8C8Gn2Bc>>8LM-$;Q9QTQ4p}<Y)c{qCl0hL9e!VY2DTt<_0<CB~vcf
ziyW}!eyo3yPy4+AV41>5B~plZBgbD%%*;J?Z9_fsl;b`Yexj3w5QrHFN{sW>>nesc
z0@H<MgzAFs3y!kkoB-eoz=37qQs|plW)G!?gy9?OmVDGE;B|+mGg#!_ZMfU(IGFWq
zIH;iyxn~Oh5tf16lYH`;DJ|2R@_b%nL<6heDeP$;J2!OU-dv*Ktg8DilKE05mo7gq
z;M32fXWrLZ+JDuuO{zCD@EiOUQt;^xGdc|K;u8Sf5SP}Brkgev4jA?tUvI7od-IuW
zk79&hKwS`dXBTOXa{JXwbIQg*rU{06YdC}ZYD&CJkZr?Qd6uqP_q6=!-Tpez6x)cM
z5;(9fHU*|viZDVwhN|XkNj8D*JNY;B_(ue1^bRA#TW5F}+G+ptbJ95~3)_vnUCSzd
zs6UvvSw)3Br5v-JGA&<((W&?xxrbq4t87l$z+<G;x=E(mk$A*|b^P*QD8Q2xtJetG
zGVc<2VwA&-CvCGl2p3oEUHP5$iYdR0RUz`T_8ZPyo6PzTMqD3~L~kNqbE}SAdU+Pi
zw9UV!65@9TM`6BsgwvFFS|yd2XlM6RF547CmeG;HOy5y;<HFIZ7K`u5iq5{kLow7b
zY7Osj$Biq$H#hvHyFtKWkRdab*9j`8>;6oK33!9}`&)R=(k<cCPcM=;Iw_<SIMY;9
zd!!K2#k0)+f>wf3&FSp-9K{xEd;%LwO6^L~e8YF$9tX8OPTYwPR~)HwC?B`pKe2l@
zDC;+vt3my*Tk*WhoFOhI=JHkjbL<%??Td5nJm&vEooP7|H44;wP>4xhnEXXEm`UE5
z04ys&<B*0MplInc@C4h^sxhTCII3pKQR3SVKJ%BK5kWT}FNNbHCP5f4naDN<A3k_R
zSu^nA=3|7^KwI`zQGKDS&Ld11QGG`m(F;KKpqm2fc;h1I>iCcGy8560f}pKaCI53f
zUSCIO6ZG{D07^W(bKo5&L1VAAA!zK+%^zQ7*?o$K61>j-&zSy?#4ZOWWeMsbEbnvf
zKN~^;mY^6f0`UKfr-}c!Y8(dS>$&vvXHS;Q`)4*5e4bmZ_?ZyEk2s=|oP&Z;S?K>v
zicb{I6S6-?3SW^Rq^MI?a)0MHLG-+L@uM1zrRNokNPYKT40hBPfQkR{emaG{F=@3)
z?O}7eq@R7$O|%hTRqy0j=8Y=#K5|hrBO^)rgZu|ejs_`8j}c7r|2a3^P8#M;9ry&v
zhhDVb;u&@iGs!$vdyHLl#9{Sbf;V39d5AqmY=i!_n#?W$MPwr$_#EFBV+Ywi7_D+|
z)N<W#k002-vYVT~*|;C+Q!0%yEt^JlMXGgGd$Oe6gvt_zz#D|-UDD>M)jRb5zF=X1
zIc_ev3j!*@MWM=fFfV<m-4Pq;uR8Yk7`@Sm3;|(pqV5=R4wxaZsx<T1kNPYjtcKmA
zsx6L453iR?=|1$36hG-Zq=m<6qF5KPK{0uRl>@u2Ym-)!<&ZTrDgZAUQ_0ds(n(>u
z)QnWJgLB0LC2wgukN-SC1kilUDfJ$4$htW6bNE)XfO%i1il}}P*0#|%idYy%+i<{3
zU#o^T*8+;rDmWP40#l#Hr2^_Yj&rSil%Jqq!krJXytRI&D}%p(<ar;_M4UW&4V+Bz
zDGjS;8)Bxf{x)17hR?fZJr9(ZoYcUz(;sd^jS2C~wZw*z=5%+oq(J%bYueC@rV+zl
z#Sx*rtg@m}iYV~L^D5bCwR&|C@3uVCiA*uTOS~69PnJI%G%eodoN#<UI;6@$Y5z<F
z<h#DVp%w)!b(mel=X~^r4<e1OC7AYU0IC7^;*Yyj^Bpw#?<8)mWWR644gtw1qz|G5
zxb%P-b9pw85i$f5ZC|h%odZjBg>cy(TD6JN55FWYex~G6JvsW-{x%n#tE;x;%FNdy
zm!v602Gh{g5Nn6Quvh#7%jPt|gt1{J_|>yjZg78&8X3mU9xs$Er@m-fT*0@w-u*@&
zoF18-9TA`&GYs}Fv6K8V9a{rXmfSWd?}s0(RBuMQY&MwOFGaW`rflV|7DNyTzhlHI
zO|Ld&XFqvL<GFqxLmNW|rkL75I28-n@#N%H6~E?lF(+ddAAJ02B`S=Nw#>Fh`Gddg
zUKkzR^NDK_TJoTBS6xc8(*moqy(v!YE6=jn$B$dC!0jK!T+qsw8s^*GcY*GH{8Cug
ztZN%4r6kH<y6AOLO*&Jt`8l5vak-f{ABW)6@m5d6UVS^A2L}G0C`q#qSut7-4rZDV
zi$pe;`v#nvk5XW~rFU*D$r`i7U$)eBzjvdYRjoxRfsPgZu17q1S)Lp>F*%ti*NZ0J
z@>5>)<bn!)y=_N1^X;$WP90_*XIk;EReGY-?3@BD<~Y1@gz1IZ3iTC!4wxS2Zxhl-
zjBZ7b$?{a$kgW(pZmg;)4`?_53~+%R8E)eX<Es0=^kpS)lW@`6;b+87=;An6vWvh_
zBjQB;Ta$*MPqWXgrno%F?s&X*RI(0a=g?T=B<Qx~pby5a93%Y#s$8;(@fwOAD*<%7
z9V6TIoc173_a1)WsvK15?z*s3Jke+mwB~VA<YGie*pW*zN-;AwuLhx&Ken=rf~}J-
zU5;#y>O2#b&gY+dF0yJr@55c7{|X;7zwz&xOMUI))Z%QP7WOXznaGl8Oj)^koD+MB
zI}057{?+9Wh>(4QeQH~e!{e@xIyD;N`2}{6rPnzYzQplH&0#qg;1zZtGj4?&Kli?`
zroNdvj(Vw{_j)4w#tnD*rL>-GMBn;b=7QwLpN$UXH5S%PvrWQ-twn%L|9RR4<$M^o
zbD?a;@3qAlIj|pZg8D^&%eq$T!{F3SHcPI!v`Eh#kX`B9juIQn$BGh!kzmrTz0o}}
zU24ovQQ^NzrFjH37Ch8>WAakO&L~E@e3Pof%0?m>eJAAl$u3&aJC9K`#QLF##5f~f
zMc~Do!@xIx_{H<_5WucmtwH4<@lbiCLl98@k2c$vflRY<S>U>fSE4~#NuC=uNUmf4
zfwdIylYD@MRF_8|u5I2}?J7&~`QmGAuZ{4^sls%QljZx|pF1&xyq6KHKbBsYL7eQ|
zRON?%8XLO-ukQp0joe?nA(qsna*jdC>+%ted4xqT6WcA3=uEHr?0}Tz9syPEWI+Z+
zlbFv1GrUVtK2i&3rL8><R7M2F2>%gvGx+!I%?@V5vlGl*dGQ|H_uqK%KL$BU(@hqc
zy2Mw#!N2O@AMw%a3I5yoYI<j29Bs*}pw&hEGX<6}12ZoC$q>Ul0<*++7e)06A|Qby
z=Hw&JkmW?4yl<ZU|6s`fDn`OHAVog|Lz?(7$8)7KuY@NPMRxp%*SgQ4n(>nQ9-Yty
zJp7$yuS9=JtZUTCTNg#%)AQmL*)wQ!%mxzRP9q?j*wTsePm(#~Xep19sT!B%QA~I9
zetzlf$<IE<-i>v2oKN(llBuJgeH#Q(R=2&U#q)QfvG-a&UW#o@+@z>V-2H7igWHD5
zHf|>?H}7|s&PJsLrzGvxNW2j(xmLgTVx|E)0pM3acRRfPvSucM-j9o6{O@?P8;`IF
z=JwN1sgTb`21XN$dkU@ZiBL@HqV~$!fk$H7O6*w(KTS%vLu@gf$tA#w!ge;TxCMcX
z&(ZdNAz~?iu&jgcB9+(XN9x0uE8R;l?x>_1C=Syh3G`oI&)!&++R@2JI7kUh%m5AD
zi)9?XjMYC!(b~@({_N|em7gTlBX|`(buf<2@T-1DZ`T+lljy8XL!NGj97MrAT7F#w
z%{(%_fZi1?U7?Rcm!tqDAHRuL?Nh4Cle}xWM&|NkE-}nQv0JP%FYRG<PnqD#z}JLo
zxVL+*_m|NsYOg!9`XkXKXRN{)Ur~T4h=@{Pqwg;qs6j0U=}~zdQ*fkSbN2#f9AJf5
zgwKLZl!6FY!7#hjEOFP`HrZs<Ht&Z`mBqVH8xh{>?Li$*F-9$N0&J2h(K>G$Ic#6+
zD|$%CyYE9$Hdyt>ShK0u>ZwxJKYj{v0|kxvT)%MKx5(fK5x!MQoXN`rE%4Z^g#?F-
zZJv{w9qM~hyKvu+iYu~?4p-b@?;eDP&O`y|kvl0yMqFqPxlNL??TG{ILYJ2JKH5xB
zx230^ZR98|0mu66@KtQCnSJ8uNDa=Uaof(aU&MAQoW<p(-&D}j8ZS+3>*6oP>DMyH
zo{h1G14iYPC<Y1iOHVaY<m)rS%Ccj)VHR4h>|Gm9h$+61+7y;NbQzD<L%8593p1A_
z`WbB+$eLwj^^R~lrgM}*I5zpA!wq(f0y(jdK<G-6L=o3U&Vf9?_HE&+Ld5aS-AiI8
z=e;sKl9Vf!u6djgE0r2F+mrB!Bou;(o8qMI-#8+5(jUDjK+660#1-kTBa@gaZQ%5E
zJzCiUGrH#SQ(nQ4@e3dNy4JPQ6Sa3+P20ne=mYh|XD*d&B$gbhBiAmI61$4Efpd)g
zH(!u!=PAnsUFP%8r*K}IPVR9Ji3!d=gIo2u8{Jlm3D>n+b4_r%N!M+-4|QCGFXd!t
zxeu4MeAP=V%TiGuq_<SBnpR4pxt$)xzm{T`G#GBzJ0NEMc{}6K<*#BCheIWsSXT||
zk-dpxQ_(E`o7~}d4PKyLi`ZQ8VK|Sd1s~nI8Vg*Rfc`%R8x`QfeLfT)pM}t2+>7q7
z<13aSt9|b_G3{aufHY>4LT8xe+n&{YQK11b4?{Ir(+Ch-)_3hAtL`T=cyeN|=i@G}
zhM+14BD=FQ<5StOoAE|a4-GyfCB={Le!kdpS4JvzFTD(oT8A}N+Ch*hG8XI4E1o=)
z6w^)%@ou<MSy*AgTz|bZ#>g004SZ6ePc!rw$Q$qg4YQsdTDDm`T9L7=YG@`~5HF*&
zfJ}B6o1|3*(p1iqRcq%5NE<^gE!^;;a#!FdQD=B-3qq?z9C-fC4Ou}=Z{`)BsO{L%
z#D|n0y#<0>l&(IV8W{R(X=ScFjkL4Z*q>biwx(b2@N0n%A+flJm-2<F#FOfj2t6u9
zfH%}|P*Fj*^^Z|8hv)gn7~=aYp_XhV$l-GY*Gx6|t+2rm&^?siNF}%Fg5RL*ddziy
zH){<=VV?L95&3Z%=XB7kyS){3rl3jIYtd;1T`>^xik}t2R4xzcSv#OlM1zE<biVNJ
zD|Y)0P0TzWw|BfD#lSzB$z;Io)pI$ICtRdD2QgTxO?-+xIj6tJ;iS2&WVkGZI~Bw!
zS!9}K2gGnUerTA68aX@9FlI-ArCA@z8HcYfEsb+Qxn|RQA5Jvwc12$SvcClqstIf*
z<;1o^C&FDykXjlj2DgCehZbAdN+hY~+Kasbv^B~vmSJCX$=^*BkQ05J_lfjjD9^Rl
zHg}&$m)PQh(}ShfOcrHVU#LcPio_<0+Q1duRb_*E<Y&^M*r-@vWwtG$rxF;hZ=iv4
z<Xp|#ICo;apK<_+Z2fsZMF)!fvfB(n_tzH0;!GNi!ETSMz9QbI73+&od~YtvM~p_N
z!*$B}biwHeW2$6^WCAi3wbDJI%L2`5^F#D%uY+j<kOtO|2+0*SG<Tdlf4GL!<1c5G
zYVpZ7r^8oTLDsp3Dv=<vD%+Ev=~?E8(NF$3Mx`FTaXn;0QbU>9+%8c;0j>kcdaQTL
z_N7FEq~TpHVW*t8oY0JL&NzB;wR3myq8-vy&lV$vT%?zu5wA+O3b3I%+@pj)l<;4T
zo{l8W7a~y&)A*73O4)v2es_ubMz8vnMJB|HEwZGHX!d|~3ex>qN+*9K2UQ@Gde}XQ
z4Zo%5DDz@FTQ=YWxA>ShXEraS%hfXeto^~4b-%LjkzL}diw3<TFI2$4-kIghe9~Eb
zjk;>@Ra!1I<h}H|aCkUOJ#<!B@=F=gu2+L~VQ@gaEc9+A_aaIzt&A>k%)bi8Dvv{F
zX|*XQO*Er0cH3i^EA4^PeCOc$K>qHP>>XaYBDM5yca6Uys=Ql4OW{tRnl8=rv}F4I
zFel!IP#hbhKyJ-kk^8wNj8`-l-T7pF-UD#dm#AGPS6ThkdKx~JHjb|zlSZ7B!biBF
zr~3MAG8=wJ%~7%O6Xr&evS6h$4$~wXQ6R9-<6n+VS3mHpJlxysMm9p^Nm(nQh)0HB
za_+6({PJsT29Z)OWTE|LjbX1K@y~*j^48bPE&E?d-d<HM$vYJi_}R)Z9=OQv@J;B}
zY<6$G<naPqlzo|k>%I~a^`8CJ-7scyU`AjHl6(e76Pcu%`Hku6O}6%SmComUPw!*b
z=*Ez0Iwg1@XMd4}DCka}xoH>j9z|8F?EbqGNgjRY#a^GHJZ8R*J+g!MAtjDNYq9}@
zaVr6E1iM8-5XY88e17GK!@B|citxhTTcJiRd(U8Am(&c|Im~I#0&n<wTQgZGo8~0h
zGrQm+=>*4ssu4rgf;)`CljcQ=CE5-qq1BU{A3r`UqT$i}B2S@H<fjG|?xv*LvU*6q
zI+qkENy3|6W^NgH2`k((!zt`y7nRB>8v)poJd6L}HdDWD*cwV0QT#?a!tHi_g;dUG
z4D8<N9ID3rgFGelIUz8jSO9=B<?}i<F;IpKYzul{+*gv6%I(~#z&a!Au+!zA1}Kee
zyu)QJjc1qt(DYE*G3BZgeDju=?p`Xd*ye6dnz|V}T#}Qo;A!HuAeH}4Z2SW>%{yNz
zy9?BB6c->MlCB(w$+gh>)<{^Mu8;ec96HbUF-;Uc^~Z1R@N(SwF6?(j^&Nf}z*$M+
zPsJ<R8@N>ePW7ie`9Bill+^%1iX5J`YSx>&{AYIYe-z69r}F84?%+7%foOb!)=7Zu
zu?J~VVqSA8GJ))mN+wuc)Sjd09{r)^{=J98F`M`wJ@@~11BV=#@-R~NGysmHNa;V8
ztMTZSI+>sxGJtS5(0XC98x)&U<+X0d&Sf4v{Gl~|$EyST?t}gmq;!Aet$mjD86bcI
zo34FN4no)7eaCO1%gV=qzvcAjQv87X;|YSybS{gZ3z-*ixa@Giav(r7$&2c62V(#;
zN}F+Z9~iNO@5lR;`Yp_Kn|QicM8ue%XTS)35SnxpYeO?C<hM%K_}p1?+HWZJx-h@U
z&0>1y?8#qWpH^8gX$fmMKtK&@vr(*P2D(9breFjB?zr1`tM0wrrudn}>chS~>$7J7
z_gu*u*mssW49<k_M`i|W9HsJQBXR`JZ7@f_qzH#u9<W(zz2Yo<oNwHDq=ro7PH`Az
z`#Q!F%x64uy4s%M<P>VxOPT}<_dctb;&;zf(<*rt+HE=LoOI?hZB?H(PvQsH;WT8^
z4=3usF_=G-AP{g83eh*RZHer2fV;Qr3?I!cqxWStv!*ArE1Sc?rAx=-lccQ`_D#Pa
zi6-#rtsUWkFf)@jn5K*R2Z+_+2Y;LcQ|fo;*$?*sE`!eLD2x&}bkg9!pA^=(o@(gJ
zxFrlW3OBeP3mY@2)cOWLWfAuAYir8!*i(Y#Hx|}S&*Ov144y9Cwx$kW`LN(O<ZbXc
z_(OdhX@Kk_XO1&r!T+LUwoXaA{TuxxXMLEkLS#*@i{$QcrwIMec;Q7mewcU>*gE&Q
z%k4bVFHC3EZ@hTB0h|XNg=V@8>-1Dsn`a|rz2@;sX(U=^dtW`nb9*qEHULm(XRy9s
za#=UXldaylsH6es>b6n>&Z39f7P9#~U>g+S99yIP>HXe?yn7+OQx9$^uBQafMXELR
z8jF9G+053b6Q0&onbrV`uM}3F7_~JP+Ay~#zQRS5wIs;YBG*$lC{WX<F{#p8{1Ak_
zgBdJwGNboW;zu^womSnz4K3$?cT(Eq{j?UBE^B4%_4lFjoufjHzfvC3OXT>v=U2>c
zx<$to(+P0ffw}#IpLWJ5_F5CB*=0IH>a|!$;#pAJ2RH%u7Yd6t%yBRGbXD}J^ccKz
zs#+dECN_bK=&ZJGV7)S_;pmW<HY)y^yB>0K{H`?YJ`YrDOzTnPos8vzI7%&4nQTT#
zNCz2i!F@*EeP8~`or%u=j7ibBFaWJZzqt^~TKu?`PeR#$Z;+W}GC~fl<VT1q35$)!
zAcGvu5|!;ah6zhuMlRJj$g0i8GiMwbGX0o__cilDu(GgG&r()Vi)ZN2bM>5Nl{hw)
zwx^A6xuDUEU01F%ygj3j$p{q<bDw6<=jM=x7V7y&;KtFX1#cpT!c_Nvw8XDImMYzt
z$LZ8~goUL*oQ$UGfWZBag5J{}8&CDgbB7X#i(=iC2r%*mkwJ36)nlv&&hIf61A&ro
zv0Ktup8zk3@r>&rl{ErYWz%RRzMIzI(#@-p4#n?Mf2xJDxxp*;7Sjfmq(-ue(dQ2f
z^;1>n1%4{irY;t=i@UzCwva-}Pfdwg(m-9udNbfR3*m)FzSv(c_+Kpd{j8++8rOVb
zZvCS>2Z8{+<XI#uD*24a_*xKG0IAGHI6tjjZtl+mrDZmL0OVVi-U601@EzU<sT2}g
zv%p14@!p8g8c)yZ@$LR=wJF#DgVsvj-TU>hC;RJ^FfYeOO;w7?9?v_2In`bbY%^9l
zQ$mC>tfkVEXHHG<!|$fU3*C*1D&S&flAXF{ZB&-8J~&r+S>$7<F1T1RvUtGaI#7qh
z_JkXnilG*l^s(iHO0TF<C)4vHKAhQF$blQReZsR^C}1^He(338gN1Dh38TG2PJ<oi
zmMeW_M|m*b{#reh=x0V1#kH)2i2oiD3r%x6%|iwzt-q)7DV5%dV(>vC=cumva)PP#
zXss&p<d}?CYAQM_xd`Fqp~3_8$bM|DjAozC0d#DpP{Thm*L?~Ti1Jy_%R%sG$D=IO
zLvG4i`R{+Qc%!QiKC>Nic!@$nB;I}ZNHkP;J3@xLM#d>r$xp1~1+p#99fP3cMwWcB
znvKZ4CiRrvYBf7yO_#fPzc6zjbe|D(rNZPK+iAm%fcqyw!Bi_pZj8YFHV;k<X83Vj
zR+;oOz8|9*JexqZx@@$nbproD^T<;k5`MQ#eOWbb=xkIyzuQJt%!;<Q^}{R}8&IuJ
z(|=z8KdI6l6N}}^pdf&PW&b^&y_cg$*$$A;h#VQb3+D%vcaS@!4tK|2)iwd59*Pw}
zdv8{yLRMul&n!G90*N=S^}~9-5lva?)a4TXkQ3PZsR1a##3wb)S0_P=%Ln0SMjqw%
z)Ul=LQ^l!mN1fA-wt1;3E%m&%_dzAKD*1c$N!N}PUuwO0<$HGVUm<N@0JfC=`3@zf
z6GnN&W_C8tbGNdY?uY2Q-4YVG2@gg&MSvKPU-0>@lBNrc!;LM^EH5V?q1Lv7Mz&Md
z&+8pz-VnF#`Ca)?_H+WNq4ZduutYS%rA<CgX6u(+D|rR%`#Lgg1*=Wb@4c<lhu(8{
zyR022?5BFIkXSN4OQlXWH|32AdH4?APc@94jk`kBqap{dyOw+g?XHdn;K23=zEx>S
zTUC-igOMqC5gL7uFj_kA0isv_sX}*1mY1$BuJM5w+GifjyVZ{%cO54G%9;N6hDrS9
zPdRY3NBfxEM)fJo_?=V=XbQMDrqM5A$;ZAKR6+Nz3iBV8c6=&;7zo2ialj@=z}n_M
zmjVwY1A2RBzJ>+~G2n0d&n>I?8thpWj_A*7GzQe;aUu0vB+Wk+cfz4CJYoNRC=6fT
z`QN*s{r{V-0AC4v!AVFgZ(vAYXc9_X%q&4SV+m=;f0a0V`;T*fe<uyE&JGq5WNX6N
zzjN98|GOR0f1E{(#!w*_-@9P{YF{0~zc@(eQJ9+BX81f}ZWYTujo$vh+85s5Qlxcu
rw)34I@gIje|LZfKjx`=zr<cmDQy1+-t2+s&L6v}NPfH%Z`taWXH<h@6

literal 0
HcmV?d00001

diff --git a/docs/cudf/source/cudf_pandas/faq.md b/docs/cudf/source/cudf_pandas/faq.md
index 55976740105..cdf32216619 100644
--- a/docs/cudf/source/cudf_pandas/faq.md
+++ b/docs/cudf/source/cudf_pandas/faq.md
@@ -53,6 +53,22 @@ print(pd)
 <module 'pandas' (ModuleAccelerator(fast=cudf, slow=pandas))>
 ```
 
+## Which functions will run on the GPU?
+
+Generally, `cudf.pandas` will accelerate all the features in the
+{ref}`cuDF API <cudf-api>` on the GPU. There are some exceptions. For
+example, some functions are GPU-accelerated by cuDF but do not support
+every combination of keyword arguments. In cases like unsupported
+keyword arguments, cuDF is not able to provide GPU acceleration and
+`cudf.pandas` will fall back to the CPU.
+
+The most accurate way to assess which functions run on the GPU is to try
+running the code while using the `cudf.pandas` profiling features. The
+profiler will indicate which functions ran on GPU / CPU. To improve
+performance, try to use only functionality that can run entirely on GPU.
+This helps reduce the number of memory transfers needed to fallback to
+CPU.
+
 ## Does it work with third-party libraries?
 
 `cudf.pandas` is tested with numerous popular third-party libraries.
diff --git a/docs/cudf/source/cudf_pandas/usage.md b/docs/cudf/source/cudf_pandas/usage.md
index 376784439aa..0398a8d7086 100644
--- a/docs/cudf/source/cudf_pandas/usage.md
+++ b/docs/cudf/source/cudf_pandas/usage.md
@@ -63,16 +63,22 @@ back to CPU for certain operations. Running your code with the
 `cudf.pandas.profile` magic generates a report showing which
 operations used the GPU and which used the CPU. This can help you
 identify parts of your code that could be rewritten to be more
-GPU-friendly:
+GPU-friendly.
+
+### Using the Function Profiler
+
+First, enable `cudf.pandas`:
 
 ```python
 %load_ext cudf.pandas
 import pandas as pd
 ```
 
+Next, use the IPython/Jupyter magic `cudf.pandas.profile`:
+
 ```python
 %%cudf.pandas.profile
-df = pd.DataFrame({'a': [0, 1, 2], 'b': [3,4,3]})
+df = pd.DataFrame({'a': [0, 1, 2], 'b': [3, 4, 3]})
 
 df.min(axis=1)
 out = df.groupby('a').filter(
@@ -80,13 +86,35 @@ out = df.groupby('a').filter(
 )
 ```
 
+This gives a profiler output after the cell runs, shown below.
+
 ![cudf-pandas-profile](../_static/cudf-pandas-profile.png)
 
 When an operation falls back to using the CPU, it's typically because
 that operation isn't implemented by cuDF. The profiler generates a
 handy link to report the missing functionality to the cuDF team.
 
-To profile a script being run from the command-line, pass the
+### Using the Line Profiler
+
+There is a line profiler activated by the IPython/Jupyter magic `cudf.pandas.line_profile`:
+
+```python
+%%cudf.pandas.line_profile
+df = pd.DataFrame({'a': [0, 1, 2], 'b': [3, 4, 3]})
+
+df.min(axis=1)
+out = df.groupby('a').filter(
+    lambda group: len(group) > 1
+)
+```
+
+The output of the line profiler shows the source code and how much time each line spent executing on the GPU and CPU.
+
+![cudf-pandas-line-profile](../_static/cudf-pandas-line-profile.png)
+
+### Profiling from the command line
+
+To profile a script being run from the command line, pass the
 `--profile` argument:
 
 ```bash
diff --git a/docs/cudf/source/user_guide/api_docs/index.rst b/docs/cudf/source/user_guide/api_docs/index.rst
index b3442908531..5f26a921012 100644
--- a/docs/cudf/source/user_guide/api_docs/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/index.rst
@@ -1,3 +1,5 @@
+.. _cudf-api:
+
 =============
 API reference
 =============

From 563556e13d081a6ed07fd5b3577f64e95a1717d0 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 26 Jun 2024 13:25:04 -1000
Subject: [PATCH 414/842] Reduce (shallow) copies in DataFrame ops (#16060)

In particular for ops which only modify the axes

* Reduce multiple shallow copies in `DataFrame.rename`
* Avoid a shallow copy in `DataFrame.to_arrow` until necessary

Also fixes a bug in `DataFrame.rename` to maintain the original `dtype` of the `columns` after renaming

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16060
---
 python/cudf/cudf/core/column_accessor.py |  4 +-
 python/cudf/cudf/core/dataframe.py       | 48 +++++++++++++-----------
 python/cudf/cudf/tests/test_dataframe.py |  8 ++++
 3 files changed, 38 insertions(+), 22 deletions(-)

diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py
index 1bf9a393566..f30a557efb0 100644
--- a/python/cudf/cudf/core/column_accessor.py
+++ b/python/cudf/cudf/core/column_accessor.py
@@ -472,6 +472,7 @@ def swaplevel(self, i=-2, j=-1):
             new_keys[n][i], new_keys[n][j] = row[j], row[i]
             new_dict.update({row: tuple(new_keys[n])})
 
+        # TODO: Change to deep=False when copy-on-write is default
         new_data = {new_dict[k]: v.copy(deep=True) for k, v in self.items()}
 
         # swap level_names for i and j
@@ -669,10 +670,11 @@ def rename_column(x):
                 raise ValueError("Duplicate column names are not allowed")
 
         data = dict(zip(new_col_names, self.values()))
-        return self.__class__(
+        return type(self)(
             data=data,
             level_names=self.level_names,
             multiindex=self.multiindex,
+            label_dtype=self.label_dtype,
             verify=False,
         )
 
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index f0d8157011d..f7f5ef792d6 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -1121,8 +1121,6 @@ def _from_data(
     @staticmethod
     @_cudf_nvtx_annotate
     def _align_input_series_indices(data, index):
-        data = data.copy()
-
         input_series = [
             Series(val)
             for val in data.values()
@@ -1142,6 +1140,7 @@ def _align_input_series_indices(data, index):
                 )
                 index = aligned_input_series[0].index
 
+            data = data.copy()
             for name, val in data.items():
                 if isinstance(val, (pd.Series, Series, dict)):
                     data[name] = aligned_input_series.pop(0)
@@ -2969,6 +2968,7 @@ def set_index(
             idx = MultiIndex._from_data(dict(enumerate(data_to_add)))
             idx.names = names
 
+        # TODO: Change to deep=False when copy-on-write is default
         df = self if inplace else self.copy(deep=True)
 
         if verify_integrity and not idx.is_unique:
@@ -3565,6 +3565,9 @@ def rename(
             mapper if columns is None and axis in (1, "columns") else columns
         )
 
+        result = self if inplace else self.copy(deep=copy)
+
+        out_index = None
         if index:
             if (
                 any(isinstance(item, str) for item in index.values())
@@ -3586,36 +3589,36 @@ def rename(
                 )
                 out_index._data[level] = column.as_column(level_values)
                 out_index._compute_levels_and_codes()
-                out = DataFrame(index=out_index)
             else:
                 to_replace = list(index.keys())
                 vals = list(index.values())
                 is_all_na = vals.count(None) == len(vals)
 
                 try:
-                    index_data = {
-                        name: col.find_and_replace(to_replace, vals, is_all_na)
-                        for name, col in self.index._data.items()
-                    }
+                    out_index = _index_from_data(
+                        {
+                            name: col.find_and_replace(
+                                to_replace, vals, is_all_na
+                            )
+                            for name, col in self.index._data.items()
+                        }
+                    )
                 except OverflowError:
-                    index_data = self.index._data.copy(deep=True)
+                    pass
 
-                out = DataFrame(index=_index_from_data(index_data))
-        else:
-            out = DataFrame(index=self.index)
+        if out_index is not None:
+            result.index = out_index
 
         if columns:
-            out._data = self._data.rename_levels(mapper=columns, level=level)
-        else:
-            out._data = self._data.copy(deep=copy)
+            result._data = result._data.rename_levels(
+                mapper=columns, level=level
+            )
 
-        if inplace:
-            self._data = out._data
-        else:
-            return out.copy(deep=copy)
+        return result
 
     @_cudf_nvtx_annotate
     def add_prefix(self, prefix):
+        # TODO: Change to deep=False when copy-on-write is default
         out = self.copy(deep=True)
         out.columns = [
             prefix + col_name for col_name in list(self._data.keys())
@@ -3624,6 +3627,7 @@ def add_prefix(self, prefix):
 
     @_cudf_nvtx_annotate
     def add_suffix(self, suffix):
+        # TODO: Change to deep=False when copy-on-write is default
         out = self.copy(deep=True)
         out.columns = [
             col_name + suffix for col_name in list(self._data.keys())
@@ -3956,7 +3960,8 @@ def swaplevel(self, i=-2, j=-1, axis=0):
                            weight    1.0    0.8
                            length    0.3    0.2
         """
-        result = self.copy()
+        # TODO: Change to deep=False when copy-on-write is default
+        result = self.copy(deep=True)
 
         # To get axis number
         axis = self._get_axis_from_axis_arg(axis)
@@ -4027,7 +4032,7 @@ def transpose(self):
 
         # Set the old column names as the new index
         result = self.__class__._from_data(
-            {i: col for i, col in enumerate(result_columns)},
+            ColumnAccessor(dict(enumerate(result_columns)), verify=False),
             index=as_index(index),
         )
         # Set the old index as the new column names
@@ -5528,7 +5533,7 @@ def to_arrow(self, preserve_index=None):
         b: [[4,5,6]]
         """
 
-        data = self.copy(deep=False)
+        data = self
         index_descr = []
         write_index = preserve_index is not False
         keep_range_index = write_index and preserve_index is None
@@ -5556,6 +5561,7 @@ def to_arrow(self, preserve_index=None):
                     index_descr = (
                         index.names if index.name is not None else ("index",)
                     )
+                data = data.copy(deep=False)
                 for gen_name, col_name in zip(index_descr, index._data.names):
                     data._insert(
                         data.shape[1],
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 05ee8346afa..fc7fd87d4c5 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -10024,6 +10024,14 @@ def test_dataframe_rename_duplicate_column():
         gdf.rename(columns={"a": "b"}, inplace=True)
 
 
+def test_dataframe_rename_columns_keep_type():
+    gdf = cudf.DataFrame([[1, 2, 3]])
+    gdf.columns = cudf.Index([4, 5, 6], dtype=np.int8)
+    result = gdf.rename({4: 50}, axis="columns").columns
+    expected = pd.Index([50, 5, 6], dtype=np.int8)
+    assert_eq(result, expected)
+
+
 @pytest_unmark_spilling
 @pytest.mark.skipif(
     PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,

From 6eac9207ca0804aeca64c83c533e16ad5963b0ba Mon Sep 17 00:00:00 2001
From: Srinivas Yadav <43375352+srinivasyadav18@users.noreply.github.com>
Date: Wed, 26 Jun 2024 17:16:57 -0700
Subject: [PATCH 415/842] Refactor distinct with hashset-based algorithms
 (#15984)

Refactor **distinct** algorithm to use `cuco::static_set`.

Authors:
  - Srinivas Yadav (https://github.com/srinivasyadav18)
  - Yunsong Wang (https://github.com/PointKernel)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/15984
---
 cpp/src/stream_compaction/distinct.cu         | 146 ++++++--------
 cpp/src/stream_compaction/distinct_count.cu   |   3 +-
 cpp/src/stream_compaction/distinct_helpers.cu | 189 ++++++++++--------
 .../stream_compaction/distinct_helpers.hpp    |  58 +++---
 .../stream_compaction_common.cuh              |   5 +-
 .../stream_compaction_common.hpp              |  35 ----
 cpp/src/stream_compaction/unique.cu           |   1 -
 7 files changed, 208 insertions(+), 229 deletions(-)
 delete mode 100644 cpp/src/stream_compaction/stream_compaction_common.hpp

diff --git a/cpp/src/stream_compaction/distinct.cu b/cpp/src/stream_compaction/distinct.cu
index a6f15cc49ec..e5cf29f3ebf 100644
--- a/cpp/src/stream_compaction/distinct.cu
+++ b/cpp/src/stream_compaction/distinct.cu
@@ -17,28 +17,62 @@
 #include "distinct_helpers.hpp"
 
 #include <cudf/column/column_view.hpp>
+#include <cudf/detail/cuco_helpers.hpp>
 #include <cudf/detail/gather.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/stream_compaction.hpp>
+#include <cudf/table/experimental/row_operators.cuh>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-#include <cuda/functional>
-#include <thrust/copy.h>
-#include <thrust/distance.h>
-#include <thrust/iterator/counting_iterator.h>
-#include <thrust/iterator/discard_iterator.h>
-
 #include <utility>
 #include <vector>
 
 namespace cudf {
 namespace detail {
+namespace {
+/**
+ * @brief Invokes the given `func` with desired the row equality
+ *
+ * @tparam HasNested Flag indicating whether there are nested columns in the input
+ * @tparam Func Type of the helper function doing `distinct` check
+ *
+ * @param compare_nulls Control whether nulls should be compared as equal or not
+ * @param compare_nans Control whether floating-point NaNs values should be compared as equal or not
+ * @param has_nulls Flag indicating whether the input has nulls or not
+ * @param row_equal Self table comparator
+ * @param func The input functor to invoke
+ */
+template <bool HasNested, typename Func>
+rmm::device_uvector<cudf::size_type> dipatch_row_equal(
+  null_equality compare_nulls,
+  nan_equality compare_nans,
+  bool has_nulls,
+  cudf::experimental::row::equality::self_comparator row_equal,
+  Func&& func)
+{
+  if (compare_nans == nan_equality::ALL_EQUAL) {
+    auto const d_equal = row_equal.equal_to<HasNested>(
+      nullate::DYNAMIC{has_nulls},
+      compare_nulls,
+      cudf::experimental::row::equality::nan_equal_physical_equality_comparator{});
+    return func(d_equal);
+  } else {
+    auto const d_equal = row_equal.equal_to<HasNested>(
+      nullate::DYNAMIC{has_nulls},
+      compare_nulls,
+      cudf::experimental::row::equality::physical_equality_comparator{});
+    return func(d_equal);
+  }
+}
+}  // namespace
 
 rmm::device_uvector<size_type> distinct_indices(table_view const& input,
                                                 duplicate_keep_option keep,
@@ -47,97 +81,39 @@ rmm::device_uvector<size_type> distinct_indices(table_view const& input,
                                                 rmm::cuda_stream_view stream,
                                                 rmm::device_async_resource_ref mr)
 {
-  if (input.num_rows() == 0 or input.num_columns() == 0) {
+  auto const num_rows = input.num_rows();
+
+  if (num_rows == 0 or input.num_columns() == 0) {
     return rmm::device_uvector<size_type>(0, stream, mr);
   }
 
-  auto map = hash_map_type{compute_hash_table_size(input.num_rows()),
-                           cuco::empty_key{-1},
-                           cuco::empty_value{std::numeric_limits<size_type>::min()},
-                           cudf::detail::cuco_allocator{stream},
-                           stream.value()};
-
   auto const preprocessed_input =
     cudf::experimental::row::hash::preprocessed_table::create(input, stream);
   auto const has_nulls          = nullate::DYNAMIC{cudf::has_nested_nulls(input)};
   auto const has_nested_columns = cudf::detail::has_nested_columns(input);
 
-  auto const row_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_input);
-  auto const key_hasher = row_hasher.device_hasher(has_nulls);
-
-  auto const row_comp = cudf::experimental::row::equality::self_comparator(preprocessed_input);
-
-  auto const pair_iter = cudf::detail::make_counting_transform_iterator(
-    size_type{0},
-    cuda::proclaim_return_type<cuco::pair<size_type, size_type>>(
-      [] __device__(size_type const i) { return cuco::make_pair(i, i); }));
-
-  auto const insert_keys = [&](auto const value_comp) {
-    if (has_nested_columns) {
-      auto const key_equal = row_comp.equal_to<true>(has_nulls, nulls_equal, value_comp);
-      map.insert(pair_iter, pair_iter + input.num_rows(), key_hasher, key_equal, stream.value());
-    } else {
-      auto const key_equal = row_comp.equal_to<false>(has_nulls, nulls_equal, value_comp);
-      map.insert(pair_iter, pair_iter + input.num_rows(), key_hasher, key_equal, stream.value());
-    }
+  auto const row_hash  = cudf::experimental::row::hash::row_hasher(preprocessed_input);
+  auto const row_equal = cudf::experimental::row::equality::self_comparator(preprocessed_input);
+
+  auto const helper_func = [&](auto const& d_equal) {
+    using RowHasher = std::decay_t<decltype(d_equal)>;
+    auto set        = hash_set_type<RowHasher>{num_rows,
+                                               0.5,  // desired load factor
+                                               cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL},
+                                               d_equal,
+                                               {row_hash.device_hasher(has_nulls)},
+                                               {},
+                                               {},
+                                               cudf::detail::cuco_allocator{stream},
+                                               stream.value()};
+    return detail::reduce_by_row(set, num_rows, keep, stream, mr);
   };
 
-  if (nans_equal == nan_equality::ALL_EQUAL) {
-    using nan_equal_comparator =
-      cudf::experimental::row::equality::nan_equal_physical_equality_comparator;
-    insert_keys(nan_equal_comparator{});
+  if (cudf::detail::has_nested_columns(input)) {
+    return dipatch_row_equal<true>(nulls_equal, nans_equal, has_nulls, row_equal, helper_func);
   } else {
-    using nan_unequal_comparator = cudf::experimental::row::equality::physical_equality_comparator;
-    insert_keys(nan_unequal_comparator{});
+    return dipatch_row_equal<false>(nulls_equal, nans_equal, has_nulls, row_equal, helper_func);
   }
-
-  auto output_indices = rmm::device_uvector<size_type>(map.get_size(), stream, mr);
-
-  // If we don't care about order, just gather indices of distinct keys taken from map.
-  if (keep == duplicate_keep_option::KEEP_ANY) {
-    map.retrieve_all(output_indices.begin(), thrust::make_discard_iterator(), stream.value());
-    return output_indices;
-  }
-
-  // For other keep options, reduce by row on rows that compare equal.
-  auto const reduction_results = reduce_by_row(map,
-                                               std::move(preprocessed_input),
-                                               input.num_rows(),
-                                               has_nulls,
-                                               has_nested_columns,
-                                               keep,
-                                               nulls_equal,
-                                               nans_equal,
-                                               stream,
-                                               rmm::mr::get_current_device_resource());
-
-  // Extract the desired output indices from reduction results.
-  auto const map_end = [&] {
-    if (keep == duplicate_keep_option::KEEP_NONE) {
-      // Reduction results with `KEEP_NONE` are either group sizes of equal rows, or `0`.
-      // Thus, we only output index of the rows in the groups having group size of `1`.
-      return thrust::copy_if(rmm::exec_policy(stream),
-                             thrust::make_counting_iterator(0),
-                             thrust::make_counting_iterator(input.num_rows()),
-                             output_indices.begin(),
-                             [reduction_results = reduction_results.begin()] __device__(
-                               auto const idx) { return reduction_results[idx] == size_type{1}; });
-    }
-
-    // Reduction results with `KEEP_FIRST` and `KEEP_LAST` are row indices of the first/last row in
-    // each group of equal rows (which are the desired output indices), or the value given by
-    // `reduction_init_value()`.
-    return thrust::copy_if(rmm::exec_policy(stream),
-                           reduction_results.begin(),
-                           reduction_results.end(),
-                           output_indices.begin(),
-                           [init_value = reduction_init_value(keep)] __device__(auto const idx) {
-                             return idx != init_value;
-                           });
-  }();
-
-  output_indices.resize(thrust::distance(output_indices.begin(), map_end), stream);
-  return output_indices;
 }
 
 std::unique_ptr<table> distinct(table_view const& input,
diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu
index 99ca89cc021..9843bb889f4 100644
--- a/cpp/src/stream_compaction/distinct_count.cu
+++ b/cpp/src/stream_compaction/distinct_count.cu
@@ -15,16 +15,17 @@
  */
 
 #include "stream_compaction_common.cuh"
-#include "stream_compaction_common.hpp"
 
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
 #include <cudf/column/column_view.hpp>
+#include <cudf/detail/cuco_helpers.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/sorting.hpp>
 #include <cudf/detail/stream_compaction.hpp>
+#include <cudf/hashing/detail/helper_functions.cuh>
 #include <cudf/stream_compaction.hpp>
 #include <cudf/table/experimental/row_operators.cuh>
 #include <cudf/table/table_view.hpp>
diff --git a/cpp/src/stream_compaction/distinct_helpers.cu b/cpp/src/stream_compaction/distinct_helpers.cu
index 13e89b15bb7..c3a004b7f28 100644
--- a/cpp/src/stream_compaction/distinct_helpers.cu
+++ b/cpp/src/stream_compaction/distinct_helpers.cu
@@ -16,96 +16,127 @@
 
 #include "distinct_helpers.hpp"
 
-#include <cudf/detail/hash_reduce_by_row.cuh>
-
-#include <rmm/resource_ref.hpp>
+#include <cuda/functional>
+#include <cuda/std/atomic>
 
 namespace cudf::detail {
 
-namespace {
-/**
- * @brief The functor to find the first/last/all duplicate row for rows that compared equal.
- */
-template <typename MapView, typename KeyHasher, typename KeyEqual>
-struct reduce_fn : reduce_by_row_fn_base<MapView, KeyHasher, KeyEqual, size_type> {
-  duplicate_keep_option const keep;
-
-  reduce_fn(MapView const& d_map,
-            KeyHasher const& d_hasher,
-            KeyEqual const& d_equal,
-            duplicate_keep_option const keep,
-            size_type* const d_output)
-    : reduce_by_row_fn_base<MapView, KeyHasher, KeyEqual, size_type>{d_map,
-                                                                     d_hasher,
-                                                                     d_equal,
-                                                                     d_output},
-      keep{keep}
-  {
+template <typename RowHasher>
+rmm::device_uvector<size_type> reduce_by_row(hash_set_type<RowHasher>& set,
+                                             size_type num_rows,
+                                             duplicate_keep_option keep,
+                                             rmm::cuda_stream_view stream,
+                                             rmm::device_async_resource_ref mr)
+{
+  auto output_indices = rmm::device_uvector<size_type>(num_rows, stream, mr);
+
+  // If we don't care about order, just gather indices of distinct keys taken from set.
+  if (keep == duplicate_keep_option::KEEP_ANY) {
+    auto const iter = thrust::counting_iterator<cudf::size_type>{0};
+    set.insert_async(iter, iter + num_rows, stream.value());
+    auto const output_end = set.retrieve_all(output_indices.begin(), stream.value());
+    output_indices.resize(thrust::distance(output_indices.begin(), output_end), stream);
+    return output_indices;
   }
 
-  __device__ void operator()(size_type const idx) const
-  {
-    auto const out_ptr = this->get_output_ptr(idx);
-
-    if (keep == duplicate_keep_option::KEEP_FIRST) {
-      // Store the smallest index of all rows that are equal.
-      atomicMin(out_ptr, idx);
-    } else if (keep == duplicate_keep_option::KEEP_LAST) {
-      // Store the greatest index of all rows that are equal.
-      atomicMax(out_ptr, idx);
-    } else {
-      // Count the number of rows in each group of rows that are compared equal.
-      atomicAdd(out_ptr, size_type{1});
+  auto reduction_results = rmm::device_uvector<size_type>(num_rows, stream, mr);
+  thrust::uninitialized_fill(rmm::exec_policy_nosync(stream),
+                             reduction_results.begin(),
+                             reduction_results.end(),
+                             reduction_init_value(keep));
+
+  auto set_ref = set.ref(cuco::op::insert_and_find);
+
+  thrust::for_each(rmm::exec_policy_nosync(stream),
+                   thrust::make_counting_iterator(0),
+                   thrust::make_counting_iterator(num_rows),
+                   [set_ref, keep, reduction_results = reduction_results.begin()] __device__(
+                     size_type const idx) mutable {
+                     auto const [inserted_idx_ptr, _] = set_ref.insert_and_find(idx);
+
+                     auto ref = cuda::atomic_ref<size_type, cuda::thread_scope_device>{
+                       reduction_results[*inserted_idx_ptr]};
+                     if (keep == duplicate_keep_option::KEEP_FIRST) {
+                       // Store the smallest index of all rows that are equal.
+                       ref.fetch_min(idx, cuda::memory_order_relaxed);
+                     } else if (keep == duplicate_keep_option::KEEP_LAST) {
+                       // Store the greatest index of all rows that are equal.
+                       ref.fetch_max(idx, cuda::memory_order_relaxed);
+                     } else {
+                       // Count the number of rows in each group of rows that are compared equal.
+                       ref.fetch_add(size_type{1}, cuda::memory_order_relaxed);
+                     }
+                   });
+
+  auto const map_end = [&] {
+    if (keep == duplicate_keep_option::KEEP_NONE) {
+      // Reduction results with `KEEP_NONE` are either group sizes of equal rows, or `0`.
+      // Thus, we only output index of the rows in the groups having group size of `1`.
+      return thrust::copy_if(
+        rmm::exec_policy(stream),
+        thrust::make_counting_iterator(0),
+        thrust::make_counting_iterator(num_rows),
+        output_indices.begin(),
+        cuda::proclaim_return_type<bool>(
+          [reduction_results = reduction_results.begin()] __device__(auto const idx) {
+            return reduction_results[idx] == size_type{1};
+          }));
     }
-  }
-};
 
-/**
- * @brief The builder to construct an instance of `reduce_fn` functor base on the given
- * value of the `duplicate_keep_option` member variable.
- */
-struct reduce_func_builder {
-  duplicate_keep_option const keep;
-
-  template <typename MapView, typename KeyHasher, typename KeyEqual>
-  auto build(MapView const& d_map,
-             KeyHasher const& d_hasher,
-             KeyEqual const& d_equal,
-             size_type* const d_output)
-  {
-    return reduce_fn<MapView, KeyHasher, KeyEqual>{d_map, d_hasher, d_equal, keep, d_output};
-  }
-};
+    // Reduction results with `KEEP_FIRST` and `KEEP_LAST` are row indices of the first/last row in
+    // each group of equal rows (which are the desired output indices), or the value given by
+    // `reduction_init_value()`.
+    return thrust::copy_if(
+      rmm::exec_policy(stream),
+      reduction_results.begin(),
+      reduction_results.end(),
+      output_indices.begin(),
+      cuda::proclaim_return_type<bool>([init_value = reduction_init_value(keep)] __device__(
+                                         auto const idx) { return idx != init_value; }));
+  }();
 
-}  // namespace
+  output_indices.resize(thrust::distance(output_indices.begin(), map_end), stream);
+  return output_indices;
+}
 
-// This function is split from `distinct.cu` to improve compile time.
-rmm::device_uvector<size_type> reduce_by_row(
-  hash_map_type const& map,
-  std::shared_ptr<cudf::experimental::row::equality::preprocessed_table> const preprocessed_input,
+template rmm::device_uvector<size_type> reduce_by_row(
+  hash_set_type<cudf::experimental::row::equality::device_row_comparator<
+    false,
+    cudf::nullate::DYNAMIC,
+    cudf::experimental::row::equality::nan_equal_physical_equality_comparator>>& set,
   size_type num_rows,
-  cudf::nullate::DYNAMIC has_nulls,
-  bool has_nested_columns,
   duplicate_keep_option keep,
-  null_equality nulls_equal,
-  nan_equality nans_equal,
   rmm::cuda_stream_view stream,
-  rmm::device_async_resource_ref mr)
-{
-  CUDF_EXPECTS(keep != duplicate_keep_option::KEEP_ANY,
-               "This function should not be called with KEEP_ANY");
-
-  return hash_reduce_by_row(map,
-                            preprocessed_input,
-                            num_rows,
-                            has_nulls,
-                            has_nested_columns,
-                            nulls_equal,
-                            nans_equal,
-                            reduce_func_builder{keep},
-                            reduction_init_value(keep),
-                            stream,
-                            mr);
-}
+  rmm::device_async_resource_ref mr);
+
+template rmm::device_uvector<size_type> reduce_by_row(
+  hash_set_type<cudf::experimental::row::equality::device_row_comparator<
+    true,
+    cudf::nullate::DYNAMIC,
+    cudf::experimental::row::equality::nan_equal_physical_equality_comparator>>& set,
+  size_type num_rows,
+  duplicate_keep_option keep,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr);
+
+template rmm::device_uvector<size_type> reduce_by_row(
+  hash_set_type<cudf::experimental::row::equality::device_row_comparator<
+    false,
+    cudf::nullate::DYNAMIC,
+    cudf::experimental::row::equality::physical_equality_comparator>>& set,
+  size_type num_rows,
+  duplicate_keep_option keep,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr);
+
+template rmm::device_uvector<size_type> reduce_by_row(
+  hash_set_type<cudf::experimental::row::equality::device_row_comparator<
+    true,
+    cudf::nullate::DYNAMIC,
+    cudf::experimental::row::equality::physical_equality_comparator>>& set,
+  size_type num_rows,
+  duplicate_keep_option keep,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr);
 
 }  // namespace cudf::detail
diff --git a/cpp/src/stream_compaction/distinct_helpers.hpp b/cpp/src/stream_compaction/distinct_helpers.hpp
index 40f97e00ce5..fca67c98873 100644
--- a/cpp/src/stream_compaction/distinct_helpers.hpp
+++ b/cpp/src/stream_compaction/distinct_helpers.hpp
@@ -14,8 +14,7 @@
  * limitations under the License.
  */
 
-#include "stream_compaction_common.hpp"
-
+#include <cudf/detail/cuco_helpers.hpp>
 #include <cudf/stream_compaction.hpp>
 #include <cudf/table/experimental/row_operators.cuh>
 #include <cudf/types.hpp>
@@ -24,6 +23,12 @@
 #include <rmm/device_uvector.hpp>
 #include <rmm/resource_ref.hpp>
 
+#include <cuco/static_set.cuh>
+#include <cuda/functional>
+#include <thrust/copy.h>
+#include <thrust/distance.h>
+#include <thrust/iterator/counting_iterator.h>
+
 namespace cudf::detail {
 
 /**
@@ -42,13 +47,28 @@ auto constexpr reduction_init_value(duplicate_keep_option keep)
   }
 }
 
+template <typename RowHasher>
+using hash_set_type =
+  cuco::static_set<size_type,
+                   cuco::extent<int64_t>,
+                   cuda::thread_scope_device,
+                   RowHasher,
+                   cuco::linear_probing<1,
+                                        cudf::experimental::row::hash::device_row_hasher<
+                                          cudf::hashing::detail::default_hash,
+                                          cudf::nullate::DYNAMIC>>,
+                   cudf::detail::cuco_allocator,
+                   cuco::storage<1>>;
+
 /**
- * @brief Perform a reduction on groups of rows that are compared equal.
+ * @brief Perform a reduction on groups of rows that are compared equal and returns output indices
+ * of the occurrences of the distinct elements based on `keep` parameter.
  *
  * This is essentially a reduce-by-key operation with keys are non-contiguous rows and are compared
- * equal. A hash table is used to find groups of equal rows.
+ * equal. A hash set is used to find groups of equal rows.
  *
  * Depending on the `keep` parameter, the reduction operation for each row group is:
+ * - If `keep == KEEP_ANY` : order does not matter.
  * - If `keep == KEEP_FIRST`: min of row indices in the group.
  * - If `keep == KEEP_LAST`: max of row indices in the group.
  * - If `keep == KEEP_NONE`: count of equivalent rows (group size).
@@ -59,30 +79,18 @@ auto constexpr reduction_init_value(duplicate_keep_option keep)
  * the `reduction_init_value()` function. Then, the reduction result for each row group is written
  * into the output array at the index of an unspecified row in the group.
  *
- * @param map The auxiliary map to perform reduction
- * @param preprocessed_input The preprocessed of the input rows for computing row hashing and row
- *        comparisons
+ * @param set The auxiliary set to perform reduction
+ * @param set_size The number of elements in set
  * @param num_rows The number of all input rows
- * @param has_nulls Indicate whether the input rows has any nulls at any nested levels
- * @param has_nested_columns Indicates whether the input table has any nested columns
  * @param keep The parameter to determine what type of reduction to perform
- * @param nulls_equal Flag to specify whether null elements should be considered as equal
- * @param nans_equal Flag to specify whether NaN values in floating point column should be
- *        considered equal.
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned vector
- * @return A device_uvector containing the reduction results
+ * @return A device_uvector containing the output indices
  */
-rmm::device_uvector<size_type> reduce_by_row(
-  hash_map_type const& map,
-  std::shared_ptr<cudf::experimental::row::equality::preprocessed_table> const preprocessed_input,
-  size_type num_rows,
-  cudf::nullate::DYNAMIC has_nulls,
-  bool has_nested_columns,
-  duplicate_keep_option keep,
-  null_equality nulls_equal,
-  nan_equality nans_equal,
-  rmm::cuda_stream_view stream,
-  rmm::device_async_resource_ref mr);
-
+template <typename RowHasher>
+rmm::device_uvector<size_type> reduce_by_row(hash_set_type<RowHasher>& set,
+                                             size_type num_rows,
+                                             duplicate_keep_option keep,
+                                             rmm::cuda_stream_view stream,
+                                             rmm::device_async_resource_ref mr);
 }  // namespace cudf::detail
diff --git a/cpp/src/stream_compaction/stream_compaction_common.cuh b/cpp/src/stream_compaction/stream_compaction_common.cuh
index 839672d6a56..0f9bc18e258 100644
--- a/cpp/src/stream_compaction/stream_compaction_common.cuh
+++ b/cpp/src/stream_compaction/stream_compaction_common.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,9 +15,8 @@
  */
 #pragma once
 
-#include "stream_compaction_common.hpp"
-
 #include <cudf/stream_compaction.hpp>
+#include <cudf/utilities/bit.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
diff --git a/cpp/src/stream_compaction/stream_compaction_common.hpp b/cpp/src/stream_compaction/stream_compaction_common.hpp
deleted file mode 100644
index 13795f49781..00000000000
--- a/cpp/src/stream_compaction/stream_compaction_common.hpp
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-#include <cudf/detail/cuco_helpers.hpp>
-#include <cudf/hashing/detail/helper_functions.cuh>
-#include <cudf/table/row_operators.cuh>
-#include <cudf/table/table_device_view.cuh>
-
-#include <cuco/static_map.cuh>
-#include <cuda/std/atomic>
-
-#include <limits>
-
-namespace cudf {
-namespace detail {
-
-using hash_map_type = cuco::legacy::
-  static_map<size_type, size_type, cuda::thread_scope_device, cudf::detail::cuco_allocator>;
-
-}  // namespace detail
-}  // namespace cudf
diff --git a/cpp/src/stream_compaction/unique.cu b/cpp/src/stream_compaction/unique.cu
index c1f8b17938c..edb47984d13 100644
--- a/cpp/src/stream_compaction/unique.cu
+++ b/cpp/src/stream_compaction/unique.cu
@@ -15,7 +15,6 @@
  */
 
 #include "stream_compaction_common.cuh"
-#include "stream_compaction_common.hpp"
 
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>

From f267b1f068ec3e8fd49599fc28afa2fc0464118b Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Wed, 26 Jun 2024 19:44:54 -0700
Subject: [PATCH 416/842] Kernel copy for pinned memory (#15934)

Issue https://github.com/rapidsai/cudf/issues/15620

Added an API that enables users to set the threshold under which we perform pinned memory copies using a kernel. The default threshold is zero, so there's no change in default behavior.
The API currently only impacts `hostdevice_vector` H<->D synchronization.

The PR adds wrappers for `cudaMemcpyAsync` so we can implement configurable behavior for pageable copies as well (e.g. copy to pinned + kernel copy).

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Mark Harris (https://github.com/harrism)

URL: https://github.com/rapidsai/cudf/pull/15934
---
 cpp/CMakeLists.txt                            |  1 +
 .../cudf/detail/utilities/cuda_memcpy.hpp     | 53 ++++++++++++++
 cpp/include/cudf/utilities/pinned_memory.hpp  | 16 +++++
 cpp/src/io/utilities/hostdevice_vector.hpp    | 13 ++--
 cpp/src/utilities/cuda_memcpy.cu              | 71 +++++++++++++++++++
 cpp/src/utilities/pinned_memory.cpp           | 14 ++++
 6 files changed, 160 insertions(+), 8 deletions(-)
 create mode 100644 cpp/include/cudf/detail/utilities/cuda_memcpy.hpp
 create mode 100644 cpp/src/utilities/cuda_memcpy.cu

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 5fd68bfb26c..35cf90411f2 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -662,6 +662,7 @@ add_library(
   src/unary/math_ops.cu
   src/unary/nan_ops.cu
   src/unary/null_ops.cu
+  src/utilities/cuda_memcpy.cu
   src/utilities/default_stream.cpp
   src/utilities/linked_column.cpp
   src/utilities/logger.cpp
diff --git a/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp
new file mode 100644
index 00000000000..b66c461ab12
--- /dev/null
+++ b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <rmm/cuda_stream_view.hpp>
+
+namespace cudf::detail {
+
+enum class host_memory_kind : uint8_t { PINNED, PAGEABLE };
+
+/**
+ * @brief Asynchronously copies data between the host and device.
+ *
+ * Implementation may use different strategies depending on the size and type of host data.
+ *
+ * @param dst Destination memory address
+ * @param src Source memory address
+ * @param size Number of bytes to copy
+ * @param kind Type of host memory
+ * @param stream CUDA stream used for the copy
+ */
+void cuda_memcpy_async(
+  void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream);
+
+/**
+ * @brief Synchronously copies data between the host and device.
+ *
+ * Implementation may use different strategies depending on the size and type of host data.
+ *
+ * @param dst Destination memory address
+ * @param src Source memory address
+ * @param size Number of bytes to copy
+ * @param kind Type of host memory
+ * @param stream CUDA stream used for the copy
+ */
+void cuda_memcpy(
+  void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream);
+
+}  // namespace cudf::detail
diff --git a/cpp/include/cudf/utilities/pinned_memory.hpp b/cpp/include/cudf/utilities/pinned_memory.hpp
index b423eab6d38..3e2fa43cb50 100644
--- a/cpp/include/cudf/utilities/pinned_memory.hpp
+++ b/cpp/include/cudf/utilities/pinned_memory.hpp
@@ -55,4 +55,20 @@ struct pinned_mr_options {
  */
 bool config_default_pinned_memory_resource(pinned_mr_options const& opts);
 
+/**
+ * @brief Set the threshold size for using kernels for pinned memory copies.
+ *
+ * @param threshold The threshold size in bytes. If the size of the copy is less than this
+ * threshold, the copy will be done using kernels. If the size is greater than or equal to this
+ * threshold, the copy will be done using cudaMemcpyAsync.
+ */
+void set_kernel_pinned_copy_threshold(size_t threshold);
+
+/**
+ * @brief Get the threshold size for using kernels for pinned memory copies.
+ *
+ * @return The threshold size in bytes.
+ */
+size_t get_kernel_pinned_copy_threshold();
+
 }  // namespace cudf
diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp
index 9acd6a1e3a9..aed745c42dd 100644
--- a/cpp/src/io/utilities/hostdevice_vector.hpp
+++ b/cpp/src/io/utilities/hostdevice_vector.hpp
@@ -18,6 +18,7 @@
 
 #include "hostdevice_span.hpp"
 
+#include <cudf/detail/utilities/cuda_memcpy.hpp>
 #include <cudf/detail/utilities/host_vector.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/utilities/default_stream.hpp>
@@ -124,26 +125,22 @@ class hostdevice_vector {
 
   void host_to_device_async(rmm::cuda_stream_view stream)
   {
-    CUDF_CUDA_TRY(
-      cudaMemcpyAsync(device_ptr(), host_ptr(), size_bytes(), cudaMemcpyDefault, stream.value()));
+    cuda_memcpy_async(device_ptr(), host_ptr(), size_bytes(), host_memory_kind::PINNED, stream);
   }
 
   void host_to_device_sync(rmm::cuda_stream_view stream)
   {
-    host_to_device_async(stream);
-    stream.synchronize();
+    cuda_memcpy(device_ptr(), host_ptr(), size_bytes(), host_memory_kind::PINNED, stream);
   }
 
   void device_to_host_async(rmm::cuda_stream_view stream)
   {
-    CUDF_CUDA_TRY(
-      cudaMemcpyAsync(host_ptr(), device_ptr(), size_bytes(), cudaMemcpyDefault, stream.value()));
+    cuda_memcpy_async(host_ptr(), device_ptr(), size_bytes(), host_memory_kind::PINNED, stream);
   }
 
   void device_to_host_sync(rmm::cuda_stream_view stream)
   {
-    device_to_host_async(stream);
-    stream.synchronize();
+    cuda_memcpy(host_ptr(), device_ptr(), size_bytes(), host_memory_kind::PINNED, stream);
   }
 
   /**
diff --git a/cpp/src/utilities/cuda_memcpy.cu b/cpp/src/utilities/cuda_memcpy.cu
new file mode 100644
index 00000000000..3d0822d8545
--- /dev/null
+++ b/cpp/src/utilities/cuda_memcpy.cu
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/detail/utilities/cuda_memcpy.hpp>
+#include <cudf/utilities/error.hpp>
+#include <cudf/utilities/pinned_memory.hpp>
+
+#include <rmm/exec_policy.hpp>
+
+#include <thrust/copy.h>
+
+namespace cudf::detail {
+
+namespace {
+
+void copy_pinned(void* dst, void const* src, std::size_t size, rmm::cuda_stream_view stream)
+{
+  if (size == 0) return;
+
+  if (size < get_kernel_pinned_copy_threshold()) {
+    thrust::copy_n(rmm::exec_policy_nosync(stream),
+                   static_cast<const char*>(src),
+                   size,
+                   static_cast<char*>(dst));
+  } else {
+    CUDF_CUDA_TRY(cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, stream));
+  }
+}
+
+void copy_pageable(void* dst, void const* src, std::size_t size, rmm::cuda_stream_view stream)
+{
+  if (size == 0) return;
+
+  CUDF_CUDA_TRY(cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, stream));
+}
+
+};  // namespace
+
+void cuda_memcpy_async(
+  void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream)
+{
+  if (kind == host_memory_kind::PINNED) {
+    copy_pinned(dst, src, size, stream);
+  } else if (kind == host_memory_kind::PAGEABLE) {
+    copy_pageable(dst, src, size, stream);
+  } else {
+    CUDF_FAIL("Unsupported host memory kind");
+  }
+}
+
+void cuda_memcpy(
+  void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream)
+{
+  cuda_memcpy_async(dst, src, size, kind, stream);
+  stream.synchronize();
+}
+
+}  // namespace cudf::detail
diff --git a/cpp/src/utilities/pinned_memory.cpp b/cpp/src/utilities/pinned_memory.cpp
index e90b7969b4d..3ea4293fc60 100644
--- a/cpp/src/utilities/pinned_memory.cpp
+++ b/cpp/src/utilities/pinned_memory.cpp
@@ -211,4 +211,18 @@ bool config_default_pinned_memory_resource(pinned_mr_options const& opts)
   return did_configure;
 }
 
+CUDF_EXPORT auto& kernel_pinned_copy_threshold()
+{
+  // use cudaMemcpyAsync for all pinned copies
+  static std::atomic<size_t> threshold = 0;
+  return threshold;
+}
+
+void set_kernel_pinned_copy_threshold(size_t threshold)
+{
+  kernel_pinned_copy_threshold() = threshold;
+}
+
+size_t get_kernel_pinned_copy_threshold() { return kernel_pinned_copy_threshold(); }
+
 }  // namespace cudf

From e98d456a77621591aa6f9a3d63c191a29cf1689b Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Thu, 27 Jun 2024 02:14:08 -0700
Subject: [PATCH 417/842] Fix pylibcudf Table.num_rows for 0 columns case and
 add interop to docs (#16108)

There was a bug where Table.num_rows raised when we had 0 columns instead of returning 0.
I also added interop to the docs since that was missing.

Authors:
  - Thomas Li (https://github.com/lithomas1)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/16108
---
 .../user_guide/api_docs/pylibcudf/index.rst   |  1 +
 .../user_guide/api_docs/pylibcudf/interop.rst |  6 +++++
 python/cudf/cudf/_lib/pylibcudf/table.pyx     |  2 ++
 .../cudf/cudf/pylibcudf_tests/test_table.py   | 22 +++++++++++++++++++
 4 files changed, 31 insertions(+)
 create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/interop.rst
 create mode 100644 python/cudf/cudf/pylibcudf_tests/test_table.py

diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
index f98298ff052..e9dad705cbf 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
@@ -19,6 +19,7 @@ This page provides API documentation for pylibcudf.
     gpumemoryview
     groupby
     io/index.rst
+    interop
     join
     lists
     merge
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/interop.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/interop.rst
new file mode 100644
index 00000000000..881ab8d7be4
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/interop.rst
@@ -0,0 +1,6 @@
+=======
+interop
+=======
+
+.. automodule:: cudf._lib.pylibcudf.interop
+   :members:
diff --git a/python/cudf/cudf/_lib/pylibcudf/table.pyx b/python/cudf/cudf/_lib/pylibcudf/table.pyx
index d93ac78721b..d91fa0474b0 100644
--- a/python/cudf/cudf/_lib/pylibcudf/table.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/table.pyx
@@ -83,6 +83,8 @@ cdef class Table:
 
     cpdef int num_rows(self):
         """The number of rows in this table."""
+        if self.num_columns() == 0:
+            return 0
         return self._columns[0].size()
 
     cpdef list columns(self):
diff --git a/python/cudf/cudf/pylibcudf_tests/test_table.py b/python/cudf/cudf/pylibcudf_tests/test_table.py
new file mode 100644
index 00000000000..cf1d51f6491
--- /dev/null
+++ b/python/cudf/cudf/pylibcudf_tests/test_table.py
@@ -0,0 +1,22 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pytest
+
+import cudf._lib.pylibcudf as plc
+
+
+@pytest.mark.parametrize(
+    "arrow_tbl",
+    [
+        pa.table([]),
+        pa.table({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]}),
+        pa.table({"a": [1, 2, 3]}),
+        pa.table({"a": [1], "b": [2], "c": [3]}),
+    ],
+)
+def test_table_shape(arrow_tbl):
+    plc_tbl = plc.interop.from_arrow(arrow_tbl)
+
+    plc_tbl_shape = (plc_tbl.num_rows(), plc_tbl.num_columns())
+    assert plc_tbl_shape == arrow_tbl.shape

From fa8284ddb2de808573d5b21cc9e650578ddf6acc Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Thu, 27 Jun 2024 11:51:41 +0100
Subject: [PATCH 418/842] Adapt to polars upstream changes and turn on CI
 testing (#16081)

They changed the semantics of join keys when those keys are expressions to more closely match SQL.

Dtype inference is also tighter, so update tests to adapt to those changes, and some other small deprecation warnings.

Finish the final missing coverage piece and turn on testing in CI (failing if we don't hit 100% coverage as well).

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Thomas Li (https://github.com/lithomas1)
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/cudf/pull/16081
---
 .github/workflows/pr.yaml                     | 12 +++
 ci/test_cudf_polars.sh                        | 68 +++++++++++++++++
 python/cudf_polars/cudf_polars/dsl/ir.py      | 70 +++++++++---------
 .../cudf_polars/typing/__init__.py            | 74 ++++++++++---------
 .../cudf_polars/tests/expressions/test_agg.py |  4 +-
 .../tests/expressions/test_booleanfunction.py | 12 ++-
 .../tests/expressions/test_rolling.py         | 12 ++-
 .../tests/expressions/test_stringfunction.py  | 16 ++--
 python/cudf_polars/tests/test_groupby.py      | 11 +--
 python/cudf_polars/tests/test_join.py         | 16 +++-
 python/cudf_polars/tests/test_mapfunction.py  | 32 ++++++--
 python/cudf_polars/tests/test_python_scan.py  |  7 +-
 python/cudf_polars/tests/test_union.py        | 12 +--
 13 files changed, 234 insertions(+), 112 deletions(-)
 create mode 100755 ci/test_cudf_polars.sh

diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index cb582df21e0..a35802f2ab0 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -25,6 +25,7 @@ jobs:
       - docs-build
       - wheel-build-cudf
       - wheel-tests-cudf
+      - test-cudf-polars
       - wheel-build-dask-cudf
       - wheel-tests-dask-cudf
       - devcontainer
@@ -132,6 +133,17 @@ jobs:
     with:
       build_type: pull-request
       script: ci/test_wheel_cudf.sh
+  test-cudf-polars:
+    needs: wheel-build-cudf
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08
+    with:
+      # This selects "ARCH=amd64 + the latest supported Python + CUDA".
+      matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
+      build_type: pull-request
+      # This always runs, but only fails if this PR touches code in
+      # pylibcudf or cudf_polars
+      script: "ci/test_cudf_polars.sh"
   wheel-build-dask-cudf:
     needs: wheel-build-cudf
     secrets: inherit
diff --git a/ci/test_cudf_polars.sh b/ci/test_cudf_polars.sh
new file mode 100755
index 00000000000..669e049ab26
--- /dev/null
+++ b/ci/test_cudf_polars.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+set -eou pipefail
+
+# We will only fail these tests if the PR touches code in pylibcudf
+# or cudf_polars itself.
+# Note, the three dots mean we are doing diff between the merge-base
+# of upstream and HEAD. So this is asking, "does _this branch_ touch
+# files in cudf_polars/pylibcudf", rather than "are there changes
+# between upstream and this branch which touch cudf_polars/pylibcudf"
+# TODO: is the target branch exposed anywhere in an environment variable?
+if [ -n "$(git diff --name-only origin/branch-24.08...HEAD -- python/cudf_polars/ python/cudf/cudf/_lib/pylibcudf/)" ];
+then
+    HAS_CHANGES=1
+else
+    HAS_CHANGES=0
+fi
+
+RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
+RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist
+
+RESULTS_DIR=${RAPIDS_TESTS_DIR:-"$(mktemp -d)"}
+RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${RESULTS_DIR}/test-results"}/
+mkdir -p "${RAPIDS_TESTS_DIR}"
+
+rapids-logger "Install cudf wheel"
+# echo to expand wildcard before adding `[extra]` requires for pip
+python -m pip install $(echo ./dist/cudf*.whl)[test]
+
+rapids-logger "Install polars (allow pre-release versions)"
+python -m pip install 'polars>=1.0.0a0'
+
+rapids-logger "Install cudf_polars"
+python -m pip install --no-deps python/cudf_polars
+
+rapids-logger "Run cudf_polars tests"
+
+function set_exitcode()
+{
+    EXITCODE=$?
+}
+EXITCODE=0
+trap set_exitcode ERR
+set +e
+
+python -m pytest \
+       --cache-clear \
+       --cov cudf_polars \
+       --cov-fail-under=100 \
+       --cov-config=python/cudf_polars/pyproject.toml \
+       --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf_polars.xml" \
+       python/cudf_polars/tests
+
+trap ERR
+set -e
+
+if [ ${EXITCODE} != 0 ]; then
+    rapids-logger "Testing FAILED: exitcode ${EXITCODE}"
+else
+    rapids-logger "Testing PASSED"
+fi
+
+if [ ${HAS_CHANGES} == 1 ]; then
+    exit ${EXITCODE}
+else
+    exit 0
+fi
diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py
index 4ad6e75fb2e..3f5f3c74050 100644
--- a/python/cudf_polars/cudf_polars/dsl/ir.py
+++ b/python/cudf_polars/cudf_polars/dsl/ir.py
@@ -123,7 +123,7 @@ def broadcast(
     ]
 
 
-@dataclasses.dataclass(slots=True)
+@dataclasses.dataclass
 class IR:
     """Abstract plan node, representing an unevaluated dataframe."""
 
@@ -157,7 +157,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         )  # pragma: no cover
 
 
-@dataclasses.dataclass(slots=True)
+@dataclasses.dataclass
 class PythonScan(IR):
     """Representation of input from a python function."""
 
@@ -171,7 +171,7 @@ def __post_init__(self):
         raise NotImplementedError("PythonScan not implemented")
 
 
-@dataclasses.dataclass(slots=True)
+@dataclasses.dataclass
 class Scan(IR):
     """Input from files."""
 
@@ -248,7 +248,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
             return df.filter(mask)
 
 
-@dataclasses.dataclass(slots=True)
+@dataclasses.dataclass
 class Cache(IR):
     """
     Return a cached plan node.
@@ -269,7 +269,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
             return cache.setdefault(self.key, self.value.evaluate(cache=cache))
 
 
-@dataclasses.dataclass(slots=True)
+@dataclasses.dataclass
 class DataFrameScan(IR):
     """
     Input from an existing polars DataFrame.
@@ -315,7 +315,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
             return df
 
 
-@dataclasses.dataclass(slots=True)
+@dataclasses.dataclass
 class Select(IR):
     """Produce a new dataframe selecting given expressions from an input."""
 
@@ -336,7 +336,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         return DataFrame(columns)
 
 
-@dataclasses.dataclass(slots=True)
+@dataclasses.dataclass
 class Reduce(IR):
     """
     Produce a new dataframe selecting given expressions from an input.
@@ -389,7 +389,7 @@ def placeholder_column(n: int) -> plc.Column:
     )
 
 
-@dataclasses.dataclass(slots=False)
+@dataclasses.dataclass
 class GroupBy(IR):
     """Perform a groupby."""
 
@@ -490,7 +490,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         return DataFrame([*result_keys, *results]).slice(self.options.slice)
 
 
-@dataclasses.dataclass(slots=True)
+@dataclasses.dataclass
 class Join(IR):
     """A join of two dataframes."""
 
@@ -518,8 +518,16 @@ class Join(IR):
     - coalesce: should key columns be coalesced (only makes sense for outer joins)
     """
 
-    @cache
+    def __post_init__(self) -> None:
+        """Validate preconditions."""
+        if any(
+            isinstance(e.value, expr.Literal)
+            for e in itertools.chain(self.left_on, self.right_on)
+        ):
+            raise NotImplementedError("Join with literal as join key.")
+
     @staticmethod
+    @cache
     def _joiners(
         how: Literal["inner", "left", "full", "leftsemi", "leftanti"],
     ) -> tuple[
@@ -582,17 +590,9 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
                 for new, old in zip(columns[left.num_columns :], right.columns)
             ]
             return DataFrame([*left_cols, *right_cols])
-        left_on = DataFrame(
-            broadcast(
-                *(e.evaluate(left) for e in self.left_on), target_length=left.num_rows
-            )
-        )
-        right_on = DataFrame(
-            broadcast(
-                *(e.evaluate(right) for e in self.right_on),
-                target_length=right.num_rows,
-            )
-        )
+        # TODO: Waiting on clarity based on https://github.com/pola-rs/polars/issues/17184
+        left_on = DataFrame(broadcast(*(e.evaluate(left) for e in self.left_on)))
+        right_on = DataFrame(broadcast(*(e.evaluate(right) for e in self.right_on)))
         null_equality = (
             plc.types.NullEquality.EQUAL
             if join_nulls
@@ -602,13 +602,10 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         if right_policy is None:
             # Semi join
             lg = join_fn(left_on.table, right_on.table, null_equality)
-            left = left.replace_columns(*left_on.columns)
             table = plc.copying.gather(left.table, lg, left_policy)
             result = DataFrame.from_table(table, left.column_names)
         else:
             lg, rg = join_fn(left_on.table, right_on.table, null_equality)
-            left = left.replace_columns(*left_on.columns)
-            right = right.replace_columns(*right_on.columns)
             if coalesce and how == "inner":
                 right = right.discard_columns(right_on.column_names_set)
             left = DataFrame.from_table(
@@ -642,7 +639,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         return result.slice(zlice)
 
 
-@dataclasses.dataclass(slots=True)
+@dataclasses.dataclass
 class HStack(IR):
     """Add new columns to a dataframe."""
 
@@ -671,7 +668,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         return df.with_columns(columns)
 
 
-@dataclasses.dataclass(slots=True)
+@dataclasses.dataclass
 class Distinct(IR):
     """Produce a new dataframe with distinct rows."""
 
@@ -741,7 +738,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         return result.slice(self.zlice)
 
 
-@dataclasses.dataclass(slots=True)
+@dataclasses.dataclass
 class Sort(IR):
     """Sort a dataframe."""
 
@@ -810,7 +807,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         return DataFrame(columns).slice(self.zlice)
 
 
-@dataclasses.dataclass(slots=True)
+@dataclasses.dataclass
 class Slice(IR):
     """Slice a dataframe."""
 
@@ -827,7 +824,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         return df.slice((self.offset, self.length))
 
 
-@dataclasses.dataclass(slots=True)
+@dataclasses.dataclass
 class Filter(IR):
     """Filter a dataframe with a boolean mask."""
 
@@ -843,7 +840,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         return df.filter(mask)
 
 
-@dataclasses.dataclass(slots=True)
+@dataclasses.dataclass
 class Projection(IR):
     """Select a subset of columns from a dataframe."""
 
@@ -860,7 +857,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         return DataFrame(columns)
 
 
-@dataclasses.dataclass(slots=True)
+@dataclasses.dataclass
 class MapFunction(IR):
     """Apply some function to a dataframe."""
 
@@ -894,6 +891,13 @@ def __post_init__(self) -> None:
                 # polars requires that all to-explode columns have the
                 # same sub-shapes
                 raise NotImplementedError("Explode with more than one column")
+        elif self.name == "rename":
+            old, new, _ = self.options
+            # TODO: perhaps polars should validate renaming in the IR?
+            if len(new) != len(set(new)) or (
+                set(new) & (set(self.df.schema.keys() - set(old)))
+            ):
+                raise NotImplementedError("Duplicate new names in rename.")
 
     def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
@@ -919,7 +923,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
             raise AssertionError("Should never be reached")  # pragma: no cover
 
 
-@dataclasses.dataclass(slots=True)
+@dataclasses.dataclass
 class Union(IR):
     """Concatenate dataframes vertically."""
 
@@ -943,7 +947,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         ).slice(self.zlice)
 
 
-@dataclasses.dataclass(slots=True)
+@dataclasses.dataclass
 class HConcat(IR):
     """Concatenate dataframes horizontally."""
 
diff --git a/python/cudf_polars/cudf_polars/typing/__init__.py b/python/cudf_polars/cudf_polars/typing/__init__.py
index 6d597a91724..c04eac41bb7 100644
--- a/python/cudf_polars/cudf_polars/typing/__init__.py
+++ b/python/cudf_polars/cudf_polars/typing/__init__.py
@@ -6,7 +6,7 @@
 from __future__ import annotations
 
 from collections.abc import Mapping
-from typing import TYPE_CHECKING, Literal, Protocol, TypeAlias
+from typing import TYPE_CHECKING, Literal, Protocol, Union
 
 from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir
 
@@ -15,43 +15,45 @@
 if TYPE_CHECKING:
     from typing import Callable
 
+    from typing_extensions import TypeAlias
+
     import polars as pl
 
-IR: TypeAlias = (
-    pl_ir.PythonScan
-    | pl_ir.Scan
-    | pl_ir.Cache
-    | pl_ir.DataFrameScan
-    | pl_ir.Select
-    | pl_ir.GroupBy
-    | pl_ir.Join
-    | pl_ir.HStack
-    | pl_ir.Distinct
-    | pl_ir.Sort
-    | pl_ir.Slice
-    | pl_ir.Filter
-    | pl_ir.SimpleProjection
-    | pl_ir.MapFunction
-    | pl_ir.Union
-    | pl_ir.HConcat
-    | pl_ir.ExtContext
-)
-
-Expr: TypeAlias = (
-    pl_expr.Function
-    | pl_expr.Window
-    | pl_expr.Literal
-    | pl_expr.Sort
-    | pl_expr.SortBy
-    | pl_expr.Gather
-    | pl_expr.Filter
-    | pl_expr.Cast
-    | pl_expr.Column
-    | pl_expr.Agg
-    | pl_expr.BinaryExpr
-    | pl_expr.Len
-    | pl_expr.PyExprIR
-)
+IR: TypeAlias = Union[
+    pl_ir.PythonScan,
+    pl_ir.Scan,
+    pl_ir.Cache,
+    pl_ir.DataFrameScan,
+    pl_ir.Select,
+    pl_ir.GroupBy,
+    pl_ir.Join,
+    pl_ir.HStack,
+    pl_ir.Distinct,
+    pl_ir.Sort,
+    pl_ir.Slice,
+    pl_ir.Filter,
+    pl_ir.SimpleProjection,
+    pl_ir.MapFunction,
+    pl_ir.Union,
+    pl_ir.HConcat,
+    pl_ir.ExtContext,
+]
+
+Expr: TypeAlias = Union[
+    pl_expr.Function,
+    pl_expr.Window,
+    pl_expr.Literal,
+    pl_expr.Sort,
+    pl_expr.SortBy,
+    pl_expr.Gather,
+    pl_expr.Filter,
+    pl_expr.Cast,
+    pl_expr.Column,
+    pl_expr.Agg,
+    pl_expr.BinaryExpr,
+    pl_expr.Len,
+    pl_expr.PyExprIR,
+]
 
 Schema: TypeAlias = Mapping[str, plc.DataType]
 
diff --git a/python/cudf_polars/tests/expressions/test_agg.py b/python/cudf_polars/tests/expressions/test_agg.py
index 2ffa1c4af6d..267d0a99692 100644
--- a/python/cudf_polars/tests/expressions/test_agg.py
+++ b/python/cudf_polars/tests/expressions/test_agg.py
@@ -52,7 +52,7 @@ def test_agg(df, agg):
 
     # https://github.com/rapidsai/cudf/issues/15852
     check_dtypes = agg not in {"n_unique", "median"}
-    if not check_dtypes and q.schema["a"] != pl.Float64:
+    if not check_dtypes and q.collect_schema()["a"] != pl.Float64:
         with pytest.raises(AssertionError):
             assert_gpu_result_equal(q)
     assert_gpu_result_equal(q, check_dtypes=check_dtypes, check_exact=False)
@@ -65,7 +65,7 @@ def test_agg(df, agg):
 )
 @pytest.mark.parametrize("op", ["min", "max"])
 def test_agg_float_with_nans(propagate_nans, op):
-    df = pl.LazyFrame({"a": [1, 2, float("nan")]})
+    df = pl.LazyFrame({"a": pl.Series([1, 2, float("nan")], dtype=pl.Float64())})
     op = getattr(pl.Expr, f"nan_{op}" if propagate_nans else op)
     q = df.select(op(pl.col("a")))
 
diff --git a/python/cudf_polars/tests/expressions/test_booleanfunction.py b/python/cudf_polars/tests/expressions/test_booleanfunction.py
index 951b749e670..a52fba26528 100644
--- a/python/cudf_polars/tests/expressions/test_booleanfunction.py
+++ b/python/cudf_polars/tests/expressions/test_booleanfunction.py
@@ -26,7 +26,7 @@ def has_nulls(request):
 def test_booleanfunction_reduction(ignore_nulls):
     ldf = pl.LazyFrame(
         {
-            "a": [1, 2, 3.0, 2, 5],
+            "a": pl.Series([1, 2, 3.0, 2, 5], dtype=pl.Float64()),
             "b": [0, 3, 1, -1, None],
             "c": [1, 6, 5, 3, 2],
         }
@@ -82,7 +82,9 @@ def test_boolean_function_unary(request, expr, has_nans, has_nulls):
     ],
 )
 def test_unsupported_boolean_function(expr):
-    df = pl.LazyFrame({"a": [1, float("nan"), 2, 4], "b": [1, 2, 3, 4]})
+    df = pl.LazyFrame(
+        {"a": pl.Series([1, float("nan"), 2, 4], dtype=pl.Float64()), "b": [1, 2, 3, 4]}
+    )
 
     q = df.select(expr)
 
@@ -95,7 +97,11 @@ def test_unsupported_boolean_function(expr):
 )
 def test_boolean_isbetween(closed, bounds):
     df = pl.LazyFrame(
-        {"a": [1, float("nan"), 2, 4], "lo": [1, 2, 2, 3], "hi": [10, 4, 2, 4]}
+        {
+            "a": pl.Series([1, float("nan"), 2, 4], dtype=pl.Float32()),
+            "lo": [1, 2, 2, 3],
+            "hi": [10, 4, 2, 4],
+        }
     )
 
     q = df.select(pl.col("a").is_between(*bounds, closed=closed))
diff --git a/python/cudf_polars/tests/expressions/test_rolling.py b/python/cudf_polars/tests/expressions/test_rolling.py
index d4920d35f14..992efe0ba79 100644
--- a/python/cudf_polars/tests/expressions/test_rolling.py
+++ b/python/cudf_polars/tests/expressions/test_rolling.py
@@ -3,11 +3,9 @@
 
 from __future__ import annotations
 
-import pytest
-
 import polars as pl
 
-from cudf_polars import translate_ir
+from cudf_polars.testing.asserts import assert_ir_translation_raises
 
 
 def test_rolling():
@@ -29,13 +27,13 @@ def test_rolling():
         min_a=pl.min("a").rolling(index_column="dt", period="2d"),
         max_a=pl.max("a").rolling(index_column="dt", period="2d"),
     )
-    with pytest.raises(NotImplementedError):
-        _ = translate_ir(q._ldf.visit())
+
+    assert_ir_translation_raises(q, NotImplementedError)
 
 
 def test_grouped_rolling():
     df = pl.LazyFrame({"a": [1, 2, 3, 4, 5, 6], "b": [1, 2, 1, 3, 1, 2]})
 
     q = df.select(pl.col("a").min().over("b"))
-    with pytest.raises(NotImplementedError):
-        _ = translate_ir(q._ldf.visit())
+
+    assert_ir_translation_raises(q, NotImplementedError)
diff --git a/python/cudf_polars/tests/expressions/test_stringfunction.py b/python/cudf_polars/tests/expressions/test_stringfunction.py
index 3c498fe7286..9729e765948 100644
--- a/python/cudf_polars/tests/expressions/test_stringfunction.py
+++ b/python/cudf_polars/tests/expressions/test_stringfunction.py
@@ -8,8 +8,11 @@
 
 import polars as pl
 
-from cudf_polars import execute_with_cudf, translate_ir
-from cudf_polars.testing.asserts import assert_gpu_result_equal
+from cudf_polars import execute_with_cudf
+from cudf_polars.testing.asserts import (
+    assert_gpu_result_equal,
+    assert_ir_translation_raises,
+)
 
 
 @pytest.fixture
@@ -47,22 +50,19 @@ def test_supported_stringfunction_expression(ldf):
 def test_unsupported_stringfunction(ldf):
     q = ldf.select(pl.col("a").str.count_matches("e", literal=True))
 
-    with pytest.raises(NotImplementedError):
-        _ = translate_ir(q._ldf.visit())
+    assert_ir_translation_raises(q, NotImplementedError)
 
 
 def test_contains_re_non_strict_raises(ldf):
     q = ldf.select(pl.col("a").str.contains(".", strict=False))
 
-    with pytest.raises(NotImplementedError):
-        _ = translate_ir(q._ldf.visit())
+    assert_ir_translation_raises(q, NotImplementedError)
 
 
 def test_contains_re_non_literal_raises(ldf):
     q = ldf.select(pl.col("a").str.contains(pl.col("b"), literal=False))
 
-    with pytest.raises(NotImplementedError):
-        _ = translate_ir(q._ldf.visit())
+    assert_ir_translation_raises(q, NotImplementedError)
 
 
 @pytest.mark.parametrize(
diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py
index e70f923b097..aefad59eb91 100644
--- a/python/cudf_polars/tests/test_groupby.py
+++ b/python/cudf_polars/tests/test_groupby.py
@@ -6,8 +6,10 @@
 
 import polars as pl
 
-from cudf_polars import translate_ir
-from cudf_polars.testing.asserts import assert_gpu_result_equal
+from cudf_polars.testing.asserts import (
+    assert_gpu_result_equal,
+    assert_ir_translation_raises,
+)
 
 
 @pytest.fixture
@@ -72,7 +74,7 @@ def test_groupby(df: pl.LazyFrame, maintain_order, keys, exprs):
     q = df.group_by(*keys, maintain_order=maintain_order).agg(*exprs)
 
     if not maintain_order:
-        sort_keys = list(q.schema.keys())[: len(keys)]
+        sort_keys = list(q.collect_schema().keys())[: len(keys)]
         q = q.sort(*sort_keys)
 
     assert_gpu_result_equal(q, check_exact=False)
@@ -97,5 +99,4 @@ def test_groupby_len(df, keys):
 def test_groupby_unsupported(df, expr):
     q = df.group_by("key1").agg(expr)
 
-    with pytest.raises(NotImplementedError):
-        _ = translate_ir(q._ldf.visit())
+    assert_ir_translation_raises(q, NotImplementedError)
diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py
index 81166b0b2f6..89f6fd3455b 100644
--- a/python/cudf_polars/tests/test_join.py
+++ b/python/cudf_polars/tests/test_join.py
@@ -6,7 +6,10 @@
 
 import polars as pl
 
-from cudf_polars.testing.asserts import assert_gpu_result_equal
+from cudf_polars.testing.asserts import (
+    assert_gpu_result_equal,
+    assert_ir_translation_raises,
+)
 
 
 @pytest.mark.parametrize(
@@ -71,3 +74,14 @@ def test_cross_join():
     q = left.join(right, how="cross")
 
     assert_gpu_result_equal(q)
+
+
+@pytest.mark.parametrize(
+    "left_on,right_on", [(pl.col("a"), pl.lit(2)), (pl.lit(2), pl.col("a"))]
+)
+def test_join_literal_key_unsupported(left_on, right_on):
+    left = pl.LazyFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
+    right = pl.LazyFrame({"a": [1, 2, 3], "b": [5, 6, 7]})
+    q = left.join(right, left_on=left_on, right_on=right_on, how="inner")
+
+    assert_ir_translation_raises(q, NotImplementedError)
diff --git a/python/cudf_polars/tests/test_mapfunction.py b/python/cudf_polars/tests/test_mapfunction.py
index ec6b3f3fc0a..77032108e6f 100644
--- a/python/cudf_polars/tests/test_mapfunction.py
+++ b/python/cudf_polars/tests/test_mapfunction.py
@@ -6,8 +6,10 @@
 
 import polars as pl
 
-from cudf_polars import translate_ir
-from cudf_polars.testing.asserts import assert_gpu_result_equal
+from cudf_polars.testing.asserts import (
+    assert_gpu_result_equal,
+    assert_ir_translation_raises,
+)
 
 
 def test_merge_sorted_raises():
@@ -17,16 +19,14 @@ def test_merge_sorted_raises():
 
     q = df1.merge_sorted(df2, key="a").merge_sorted(df3, key="a")
 
-    with pytest.raises(NotImplementedError):
-        _ = translate_ir(q._ldf.visit())
+    assert_ir_translation_raises(q, NotImplementedError)
 
 
 def test_explode_multiple_raises():
     df = pl.LazyFrame({"a": [[1, 2], [3, 4]], "b": [[5, 6], [7, 8]]})
     q = df.explode("a", "b")
 
-    with pytest.raises(NotImplementedError):
-        _ = translate_ir(q._ldf.visit())
+    assert_ir_translation_raises(q, NotImplementedError)
 
 
 @pytest.mark.parametrize("column", ["a", "b"])
@@ -41,3 +41,23 @@ def test_explode_single(column):
     q = df.explode(column)
 
     assert_gpu_result_equal(q)
+
+
+@pytest.mark.parametrize("mapping", [{"b": "a"}, {"a": "c", "b": "c"}])
+def test_rename_duplicate_raises(mapping):
+    df = pl.LazyFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
+
+    q = df.rename(mapping)
+
+    assert_ir_translation_raises(q, NotImplementedError)
+
+
+@pytest.mark.parametrize(
+    "mapping", [{}, {"b": "c"}, {"b": "a", "a": "b"}, {"a": "c", "b": "d"}]
+)
+def test_rename_columns(mapping):
+    df = pl.LazyFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
+
+    q = df.rename(mapping)
+
+    assert_gpu_result_equal(q)
diff --git a/python/cudf_polars/tests/test_python_scan.py b/python/cudf_polars/tests/test_python_scan.py
index c03474e3dc8..fd8453b77c4 100644
--- a/python/cudf_polars/tests/test_python_scan.py
+++ b/python/cudf_polars/tests/test_python_scan.py
@@ -2,11 +2,9 @@
 # SPDX-License-Identifier: Apache-2.0
 from __future__ import annotations
 
-import pytest
-
 import polars as pl
 
-from cudf_polars import translate_ir
+from cudf_polars.testing.asserts import assert_ir_translation_raises
 
 
 def test_python_scan():
@@ -14,7 +12,6 @@ def source(with_columns, predicate, nrows):
         return pl.DataFrame({"a": pl.Series([1, 2, 3], dtype=pl.Int8())})
 
     q = pl.LazyFrame._scan_python_function({"a": pl.Int8}, source, pyarrow=False)
-    with pytest.raises(NotImplementedError):
-        _ = translate_ir(q._ldf.visit())
+    assert_ir_translation_raises(q, NotImplementedError)
 
     assert q.collect().equals(source(None, None, None))
diff --git a/python/cudf_polars/tests/test_union.py b/python/cudf_polars/tests/test_union.py
index 6c9122bc260..b021d832910 100644
--- a/python/cudf_polars/tests/test_union.py
+++ b/python/cudf_polars/tests/test_union.py
@@ -2,12 +2,12 @@
 # SPDX-License-Identifier: Apache-2.0
 from __future__ import annotations
 
-import pytest
-
 import polars as pl
 
-from cudf_polars import translate_ir
-from cudf_polars.testing.asserts import assert_gpu_result_equal
+from cudf_polars.testing.asserts import (
+    assert_gpu_result_equal,
+    assert_ir_translation_raises,
+)
 
 
 def test_union():
@@ -31,8 +31,8 @@ def test_union_schema_mismatch_raises():
     ).lazy()
     ldf2 = ldf.select(pl.col("a").cast(pl.Float32))
     query = pl.concat([ldf, ldf2], how="diagonal")
-    with pytest.raises(NotImplementedError):
-        _ = translate_ir(query._ldf.visit())
+
+    assert_ir_translation_raises(query, NotImplementedError)
 
 
 def test_concat_vertical():

From 5d49fe6a7fae839b2be16ae8cd6899d287855359 Mon Sep 17 00:00:00 2001
From: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com>
Date: Thu, 27 Jun 2024 10:44:21 -0500
Subject: [PATCH 419/842] Fix unnecessarily strict check in parquet chunked
 reader for choosing split locations. (#16099)

This is a fix that somehow didn't make it into the initial wave of bug fixes for the parquet chunked reader earlier this year.

The code that determines where to do splits needs to be sure it always chooses a location such that the pages that are selected always enclose at least one full row for a list column.  This means that you need to see at least 1 full row (2 row boundaries) in the group of pages.  The weaklogic was only checking if you had 1 full row within the very last page in the selection, which is unnecessarily strict.  We actually ran into some data out in the wild where this was hit.

This PR changes the logic to include all pages within the chunk when doing the check instead of just the last one.

Authors:
  - https://github.com/nvdbaranec
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: https://github.com/rapidsai/cudf/pull/16099
---
 cpp/src/io/parquet/reader_impl_chunking.cu   | 13 ++++++++-----
 cpp/src/io/parquet/reader_impl_preprocess.cu |  3 ++-
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu
index 9ad5a2d6e8d..d371ef5de93 100644
--- a/cpp/src/io/parquet/reader_impl_chunking.cu
+++ b/cpp/src/io/parquet/reader_impl_chunking.cu
@@ -337,7 +337,8 @@ int64_t find_next_split(int64_t cur_pos,
                         size_t cur_row_index,
                         size_t cur_cumulative_size,
                         cudf::host_span<cumulative_page_info const> sizes,
-                        size_t size_limit)
+                        size_t size_limit,
+                        size_t min_row_count)
 {
   auto const start = thrust::make_transform_iterator(
     sizes.begin(),
@@ -357,7 +358,7 @@ int64_t find_next_split(int64_t cur_pos,
   // this guarantees that even if we cannot fit the set of rows represented by our where our cur_pos
   // is, we will still move forward instead of failing.
   while (split_pos < (static_cast<int64_t>(sizes.size()) - 1) &&
-         (sizes[split_pos].end_row_index == cur_row_index)) {
+         (sizes[split_pos].end_row_index - cur_row_index < min_row_count)) {
     split_pos++;
   }
 
@@ -657,8 +658,10 @@ std::tuple<rmm::device_uvector<page_span>, size_t, size_t> compute_next_subpass(
   auto const start_index = find_start_index(h_aggregated_info, start_row);
   auto const cumulative_size =
     start_row == 0 || start_index == 0 ? 0 : h_aggregated_info[start_index - 1].size_bytes;
+  // when choosing subpasses, we need to guarantee at least 2 rows in the included pages so that all
+  // list columns have a clear start and end.
   auto const end_index =
-    find_next_split(start_index, start_row, cumulative_size, h_aggregated_info, size_limit);
+    find_next_split(start_index, start_row, cumulative_size, h_aggregated_info, size_limit, 2);
   auto const end_row = h_aggregated_info[end_index].end_row_index;
 
   // for each column, collect the set of pages that spans start_row / end_row
@@ -703,8 +706,8 @@ std::vector<row_range> compute_page_splits_by_row(device_span<cumulative_page_in
   size_t cur_cumulative_size = 0;
   auto const max_row         = min(skip_rows + num_rows, h_aggregated_info.back().end_row_index);
   while (cur_row_index < max_row) {
-    auto const split_pos =
-      find_next_split(cur_pos, cur_row_index, cur_cumulative_size, h_aggregated_info, size_limit);
+    auto const split_pos = find_next_split(
+      cur_pos, cur_row_index, cur_cumulative_size, h_aggregated_info, size_limit, 1);
 
     auto const start_row = cur_row_index;
     cur_row_index        = min(max_row, h_aggregated_info[split_pos].end_row_index);
diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu
index 9df5c362cdd..f28a7311ccb 100644
--- a/cpp/src/io/parquet/reader_impl_preprocess.cu
+++ b/cpp/src/io/parquet/reader_impl_preprocess.cu
@@ -1436,7 +1436,8 @@ void reader::impl::preprocess_subpass_pages(read_mode mode, size_t chunk_read_li
     // subpass since we know that will safely completed.
     bool const is_list = chunk.max_level[level_type::REPETITION] > 0;
     if (is_list && max_col_row < last_pass_row) {
-      size_t const min_col_row = static_cast<size_t>(chunk.start_row + last_page.chunk_row);
+      auto const& first_page   = subpass.pages[page_index];
+      size_t const min_col_row = static_cast<size_t>(chunk.start_row + first_page.chunk_row);
       CUDF_EXPECTS((max_col_row - min_col_row) > 1, "Unexpected short subpass");
       max_col_row--;
     }

From a71c249f9f320ecb61aa8135bbda300122e43491 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Thu, 27 Jun 2024 14:29:31 -0500
Subject: [PATCH 420/842] Fix dtype errors in `StringArrays` (#16111)

This PR adds proxy classes for `ArrowStringArray` and `ArrowStringArrayNumpySemantics` that will increase the pandas test pass rate by 1%.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/16111
---
 python/cudf/cudf/pandas/_wrappers/pandas.py   | 16 +++++++++++++
 .../cudf/pandas/scripts/run-pandas-tests.sh   |  3 ++-
 .../cudf_pandas_tests/test_cudf_pandas.py     | 23 +++++++++++++++++++
 3 files changed, 41 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py
index 0ba432d6d0e..a64bf7772fe 100644
--- a/python/cudf/cudf/pandas/_wrappers/pandas.py
+++ b/python/cudf/cudf/pandas/_wrappers/pandas.py
@@ -522,6 +522,22 @@ def Index__new__(cls, *args, **kwargs):
     },
 )
 
+ArrowStringArrayNumpySemantics = make_final_proxy_type(
+    "ArrowStringArrayNumpySemantics",
+    _Unusable,
+    pd.core.arrays.string_arrow.ArrowStringArrayNumpySemantics,
+    fast_to_slow=_Unusable(),
+    slow_to_fast=_Unusable(),
+)
+
+ArrowStringArray = make_final_proxy_type(
+    "ArrowStringArray",
+    _Unusable,
+    pd.core.arrays.string_arrow.ArrowStringArray,
+    fast_to_slow=_Unusable(),
+    slow_to_fast=_Unusable(),
+)
+
 StringDtype = make_final_proxy_type(
     "StringDtype",
     _Unusable,
diff --git a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
index cd9f90d50fe..a66f63c09b3 100755
--- a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
+++ b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
@@ -133,7 +133,8 @@ and not test_s3_roundtrip"
 TEST_THAT_CRASH_PYTEST_WORKERS="not test_bitmasks_pyarrow \
 and not test_large_string_pyarrow \
 and not test_interchange_from_corrected_buffer_dtypes \
-and not test_eof_states"
+and not test_eof_states \
+and not test_array_tz"
 
 # TODO: Remove "not db" once a postgres & mysql container is set up on the CI
 PANDAS_CI="1" timeout 30m python -m pytest -p cudf.pandas \
diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
index eed5037cbea..0d46e2e9311 100644
--- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
+++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
@@ -1533,3 +1533,26 @@ def test_is_proxy_object():
     assert is_proxy_object(np_arr_proxy)
     assert is_proxy_object(s1)
     assert not is_proxy_object(s2)
+
+
+def test_arrow_string_arrays():
+    cu_s = xpd.Series(["a", "b", "c"])
+    pd_s = pd.Series(["a", "b", "c"])
+
+    cu_arr = xpd.arrays.ArrowStringArray._from_sequence(
+        cu_s, dtype=xpd.StringDtype("pyarrow")
+    )
+    pd_arr = pd.arrays.ArrowStringArray._from_sequence(
+        pd_s, dtype=pd.StringDtype("pyarrow")
+    )
+
+    tm.assert_equal(cu_arr, pd_arr)
+
+    cu_arr = xpd.core.arrays.string_arrow.ArrowStringArray._from_sequence(
+        cu_s, dtype=xpd.StringDtype("pyarrow_numpy")
+    )
+    pd_arr = pd.core.arrays.string_arrow.ArrowStringArray._from_sequence(
+        pd_s, dtype=pd.StringDtype("pyarrow_numpy")
+    )
+
+    tm.assert_equal(cu_arr, pd_arr)

From 2ed69c9e830d90a8e565ea23ba1813e594a9f4d9 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 27 Jun 2024 10:11:09 -1000
Subject: [PATCH 421/842] Ensure MultiIndex.to_frame deep copies columns
 (#16110)

Additionally, this allows simplification in `MultiIndex.__repr__` which avoids a shallow copy and also caught a bug where `NaT` was not supposed to be quoted

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16110
---
 python/cudf/cudf/core/multiindex.py | 88 ++++++++++-------------------
 python/cudf/cudf/tests/test_repr.py | 10 ++--
 2 files changed, 35 insertions(+), 63 deletions(-)

diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index a01242d957d..547c14cdc99 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -23,6 +23,7 @@
 from cudf.api.types import is_integer, is_list_like, is_object_dtype
 from cudf.core import column
 from cudf.core._base_index import _return_get_indexer_result
+from cudf.core.column_accessor import ColumnAccessor
 from cudf.core.frame import Frame
 from cudf.core.index import (
     BaseIndex,
@@ -446,45 +447,26 @@ def __repr__(self):
             )
             preprocess = self.take(indices)
         else:
-            preprocess = self.copy(deep=False)
-
-        if any(col.has_nulls() for col in preprocess._data.columns):
-            preprocess_df = preprocess.to_frame(index=False)
-            for name, col in preprocess._data.items():
-                if isinstance(
-                    col,
-                    (
-                        column.datetime.DatetimeColumn,
-                        column.timedelta.TimeDeltaColumn,
-                    ),
-                ):
-                    preprocess_df[name] = col.astype("str").fillna(
-                        str(cudf.NaT)
-                    )
+            preprocess = self
 
-            tuples_list = list(
-                zip(
-                    *list(
-                        map(lambda val: pd.NA if val is None else val, col)
-                        for col in preprocess_df.to_arrow()
-                        .to_pydict()
-                        .values()
-                    )
-                )
-            )
+        arrays = []
+        for name, col in zip(self.names, preprocess._columns):
+            try:
+                pd_idx = col.to_pandas(nullable=True)
+            except NotImplementedError:
+                pd_idx = col.to_pandas(nullable=False)
+            pd_idx.name = name
+            arrays.append(pd_idx)
 
-            preprocess = preprocess.to_pandas(nullable=True)
-            preprocess.values[:] = tuples_list
-        else:
-            preprocess = preprocess.to_pandas(nullable=True)
+        preprocess_pd = pd.MultiIndex.from_arrays(arrays)
 
-        output = repr(preprocess)
+        output = repr(preprocess_pd)
         output_prefix = self.__class__.__name__ + "("
         output = output.lstrip(output_prefix)
         lines = output.split("\n")
 
         if len(lines) > 1:
-            if "length=" in lines[-1] and len(self) != len(preprocess):
+            if "length=" in lines[-1] and len(self) != len(preprocess_pd):
                 last_line = lines[-1]
                 length_index = last_line.index("length=")
                 last_line = last_line[:length_index] + f"length={len(self)})"
@@ -1022,42 +1004,32 @@ def to_frame(self, index=True, name=no_default, allow_duplicates=False):
         a c  a  c
         b d  b  d
         """
-        # TODO: Currently this function makes a shallow copy, which is
-        # incorrect. We want to make a deep copy, otherwise further
-        # modifications of the resulting DataFrame will affect the MultiIndex.
         if name is no_default:
             column_names = [
                 level if name is None else name
                 for level, name in enumerate(self.names)
             ]
+        elif not is_list_like(name):
+            raise TypeError(
+                "'name' must be a list / sequence of column names."
+            )
+        elif len(name) != len(self.levels):
+            raise ValueError(
+                "'name' should have the same length as "
+                "number of levels on index."
+            )
         else:
-            if not is_list_like(name):
-                raise TypeError(
-                    "'name' must be a list / sequence of column names."
-                )
-            if len(name) != len(self.levels):
-                raise ValueError(
-                    "'name' should have the same length as "
-                    "number of levels on index."
-                )
             column_names = name
 
-        all_none_names = None
-        if not (
-            all_none_names := all(x is None for x in column_names)
-        ) and len(column_names) != len(set(column_names)):
+        if len(column_names) != len(set(column_names)):
             raise ValueError("Duplicate column names are not allowed")
-        df = cudf.DataFrame._from_data(
-            data=self._data,
-            columns=column_names
-            if name is not no_default and not all_none_names
-            else None,
+        ca = ColumnAccessor(
+            dict(zip(column_names, (col.copy() for col in self._columns))),
+            verify=False,
+        )
+        return cudf.DataFrame._from_data(
+            data=ca, index=self if index else None
         )
-
-        if index:
-            df = df.set_index(self)
-
-        return df
 
     @_cudf_nvtx_annotate
     def get_level_values(self, level):
@@ -1243,7 +1215,7 @@ def values(self):
 
     @classmethod
     @_cudf_nvtx_annotate
-    def from_frame(cls, df, names=None):
+    def from_frame(cls, df: pd.DataFrame | cudf.DataFrame, names=None):
         """
         Make a MultiIndex from a DataFrame.
 
diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py
index 8f65bd26bd1..193d64a9e7f 100644
--- a/python/cudf/cudf/tests/test_repr.py
+++ b/python/cudf/cudf/tests/test_repr.py
@@ -1210,7 +1210,7 @@ def test_multiindex_repr(pmi, max_seq_items):
             .index,
             textwrap.dedent(
                 """
-                MultiIndex([('abc',                       'NaT', 0.345),
+                MultiIndex([('abc',                         NaT, 0.345),
                             ( <NA>, '0 days 00:00:00.000000001',  <NA>),
                             ('xyz', '0 days 00:00:00.000000002', 100.0),
                             ( <NA>, '0 days 00:00:00.000000003',  10.0)],
@@ -1252,10 +1252,10 @@ def test_multiindex_repr(pmi, max_seq_items):
             .index,
             textwrap.dedent(
                 """
-            MultiIndex([('NaT', <NA>),
-                        ('NaT', <NA>),
-                        ('NaT', <NA>),
-                        ('NaT', <NA>)],
+            MultiIndex([(NaT, <NA>),
+                        (NaT, <NA>),
+                        (NaT, <NA>),
+                        (NaT, <NA>)],
                     names=['b', 'a'])
             """
             ),

From c847b98291bd41f98ac417becf0c53293a392ce3 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Thu, 27 Jun 2024 21:33:29 +0100
Subject: [PATCH 422/842] Finish implementation of cudf-polars boolean function
 handlers (#16098)

The missing nodes were `is_in`, `not` (both easy), `is_finite` and `is_infinite` (obtained by translating to `contains` calls).

While here, remove the implementation of `IsBetween` and just translate to an expression with binary operations. This removes the need for special-casing scalar arguments to `IsBetween` and reproducing the code for binop evaluation.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16098
---
 python/cudf_polars/cudf_polars/dsl/expr.py    | 67 +++++++++++--------
 .../cudf_polars/cudf_polars/dsl/translate.py  | 10 +++
 .../tests/expressions/test_booleanfunction.py | 48 +++++++++++--
 3 files changed, 90 insertions(+), 35 deletions(-)

diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py
index 871134665af..97325161650 100644
--- a/python/cudf_polars/cudf_polars/dsl/expr.py
+++ b/python/cudf_polars/cudf_polars/dsl/expr.py
@@ -443,12 +443,12 @@ def __init__(
         ):
             # With ignore_nulls == False, polars uses Kleene logic
             raise NotImplementedError(f"Kleene logic for {self.name}")
-        if self.name in (
-            pl_expr.BooleanFunction.IsFinite,
-            pl_expr.BooleanFunction.IsInfinite,
-            pl_expr.BooleanFunction.IsIn,
+        if self.name == pl_expr.BooleanFunction.IsIn and not all(
+            c.dtype == self.children[0].dtype for c in self.children
         ):
-            raise NotImplementedError(f"{self.name}")
+            # TODO: If polars IR doesn't put the casts in, we need to
+            # mimic the supertype promotion rules.
+            raise NotImplementedError("IsIn doesn't support supertype casting")
 
     @staticmethod
     def _distinct(
@@ -506,6 +506,33 @@ def do_evaluate(
         mapping: Mapping[Expr, Column] | None = None,
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
+        if self.name in (
+            pl_expr.BooleanFunction.IsFinite,
+            pl_expr.BooleanFunction.IsInfinite,
+        ):
+            # Avoid evaluating the child if the dtype tells us it's unnecessary.
+            (child,) = self.children
+            is_finite = self.name == pl_expr.BooleanFunction.IsFinite
+            if child.dtype.id() not in (plc.TypeId.FLOAT32, plc.TypeId.FLOAT64):
+                value = plc.interop.from_arrow(
+                    pa.scalar(value=is_finite, type=plc.interop.to_arrow(self.dtype))
+                )
+                return Column(plc.Column.from_scalar(value, df.num_rows))
+            needles = child.evaluate(df, context=context, mapping=mapping)
+            to_search = [-float("inf"), float("inf")]
+            if is_finite:
+                # NaN is neither finite not infinite
+                to_search.append(float("nan"))
+            haystack = plc.interop.from_arrow(
+                pa.array(
+                    to_search,
+                    type=plc.interop.to_arrow(needles.obj.type()),
+                )
+            )
+            result = plc.search.contains(haystack, needles.obj)
+            if is_finite:
+                result = plc.unary.unary_operation(result, plc.unary.UnaryOperator.NOT)
+            return Column(result)
         columns = [
             child.evaluate(df, context=context, mapping=mapping)
             for child in self.children
@@ -612,31 +639,13 @@ def do_evaluate(
                     (c.obj for c in columns),
                 )
             )
-        elif self.name == pl_expr.BooleanFunction.IsBetween:
-            column, lo, hi = columns
-            (closed,) = self.options
-            lop, rop = self._BETWEEN_OPS[closed]
-            lo_obj = (
-                lo.obj_scalar
-                if lo.is_scalar and lo.obj.size() != column.obj.size()
-                else lo.obj
-            )
-            hi_obj = (
-                hi.obj_scalar
-                if hi.is_scalar and hi.obj.size() != column.obj.size()
-                else hi.obj
-            )
+        elif self.name == pl_expr.BooleanFunction.IsIn:
+            needles, haystack = columns
+            return Column(plc.search.contains(haystack.obj, needles.obj))
+        elif self.name == pl_expr.BooleanFunction.Not:
+            (column,) = columns
             return Column(
-                plc.binaryop.binary_operation(
-                    plc.binaryop.binary_operation(
-                        column.obj, lo_obj, lop, output_type=self.dtype
-                    ),
-                    plc.binaryop.binary_operation(
-                        column.obj, hi_obj, rop, output_type=self.dtype
-                    ),
-                    plc.binaryop.BinaryOperator.LOGICAL_AND,
-                    self.dtype,
-                )
+                plc.unary.unary_operation(column.obj, plc.unary.UnaryOperator.NOT)
             )
         else:
             raise NotImplementedError(
diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py
index 5d289885f47..742e5a591ee 100644
--- a/python/cudf_polars/cudf_polars/dsl/translate.py
+++ b/python/cudf_polars/cudf_polars/dsl/translate.py
@@ -342,6 +342,16 @@ def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> ex
             *(translate_expr(visitor, n=n) for n in node.input),
         )
     elif isinstance(name, pl_expr.BooleanFunction):
+        if name == pl_expr.BooleanFunction.IsBetween:
+            column, lo, hi = (translate_expr(visitor, n=n) for n in node.input)
+            (closed,) = options
+            lop, rop = expr.BooleanFunction._BETWEEN_OPS[closed]
+            return expr.BinOp(
+                dtype,
+                plc.binaryop.BinaryOperator.LOGICAL_AND,
+                expr.BinOp(dtype, lop, column, lo),
+                expr.BinOp(dtype, rop, column, hi),
+            )
         return expr.BooleanFunction(
             dtype,
             name,
diff --git a/python/cudf_polars/tests/expressions/test_booleanfunction.py b/python/cudf_polars/tests/expressions/test_booleanfunction.py
index a52fba26528..97421008669 100644
--- a/python/cudf_polars/tests/expressions/test_booleanfunction.py
+++ b/python/cudf_polars/tests/expressions/test_booleanfunction.py
@@ -6,7 +6,10 @@
 
 import polars as pl
 
-from cudf_polars.testing.asserts import assert_gpu_result_equal
+from cudf_polars.testing.asserts import (
+    assert_gpu_result_equal,
+    assert_ir_translation_raises,
+)
 
 
 @pytest.fixture(params=[False, True], ids=["no_nulls", "nulls"])
@@ -67,23 +70,26 @@ def test_boolean_function_unary(request, expr, has_nans, has_nulls):
 
     df = pl.LazyFrame({"a": pl.Series(values, dtype=pl.Float32())})
 
-    q = df.select(expr(pl.col("a")))
+    q = df.select(expr(pl.col("a")), expr(pl.col("a")).not_().alias("b"))
 
     assert_gpu_result_equal(q)
 
 
-@pytest.mark.xfail(reason="Evaluation handlers not yet implemented")
 @pytest.mark.parametrize(
     "expr",
     [
         pl.col("a").is_finite(),
         pl.col("a").is_infinite(),
-        pl.col("a").is_in(pl.col("b")),
+        [pl.col("a").is_infinite(), pl.col("b").is_finite()],
     ],
 )
-def test_unsupported_boolean_function(expr):
+def test_boolean_finite(expr):
     df = pl.LazyFrame(
-        {"a": pl.Series([1, float("nan"), 2, 4], dtype=pl.Float64()), "b": [1, 2, 3, 4]}
+        {
+            "a": pl.Series([1, float("nan"), 2, float("inf")], dtype=pl.Float64()),
+            "b": [1, 2, 3, 4],
+            "c": pl.Series([1, 2, 3, 4], dtype=pl.Float64()),
+        }
     )
 
     q = df.select(expr)
@@ -133,3 +139,33 @@ def test_boolean_horizontal(request, expr, has_nulls, wide):
     q = ldf.select(expr)
 
     assert_gpu_result_equal(q)
+
+
+@pytest.mark.parametrize(
+    "expr",
+    [
+        pl.col("a").is_in(pl.col("b")),
+        pl.col("a").is_in(pl.col("c")),
+        pl.col("c").is_in(pl.col("d")),
+    ],
+)
+def test_boolean_is_in(expr):
+    ldf = pl.LazyFrame(
+        {
+            "a": pl.Series([1, 2, 3], dtype=pl.Int64()),
+            "b": pl.Series([3, 4, 2], dtype=pl.Int64()),
+            "c": pl.Series([1, None, 3], dtype=pl.Int64()),
+            "d": pl.Series([10, None, 11], dtype=pl.Int64()),
+        }
+    )
+
+    q = ldf.select(expr)
+
+    assert_gpu_result_equal(q)
+
+
+def test_boolean_is_in_raises_unsupported():
+    ldf = pl.LazyFrame({"a": pl.Series([1, 2, 3], dtype=pl.Int64)})
+    q = ldf.select(pl.col("a").is_in(pl.lit(1, dtype=pl.Int32())))
+
+    assert_ir_translation_raises(q, NotImplementedError)

From e35da6b3df55bfa7b8d5df12c35039740566cb21 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Fri, 28 Jun 2024 09:54:03 +0100
Subject: [PATCH 423/842] Implement Ternary copy_if_else (#16114)

A straightforward evaluation using `copy_if_else`.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - https://github.com/brandon-b-miller

URL: https://github.com/rapidsai/cudf/pull/16114
---
 python/cudf_polars/cudf_polars/dsl/expr.py    | 29 +++++++++++++++++++
 .../cudf_polars/cudf_polars/dsl/translate.py  | 10 +++++++
 .../tests/expressions/test_when_then.py       | 27 +++++++++++++++++
 3 files changed, 66 insertions(+)
 create mode 100644 python/cudf_polars/tests/expressions/test_when_then.py

diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py
index 97325161650..17d7d15e4e5 100644
--- a/python/cudf_polars/cudf_polars/dsl/expr.py
+++ b/python/cudf_polars/cudf_polars/dsl/expr.py
@@ -51,6 +51,7 @@
     "GroupedRollingWindow",
     "Cast",
     "Agg",
+    "Ternary",
     "BinOp",
 ]
 
@@ -1112,6 +1113,34 @@ def do_evaluate(
         return self.op(child.evaluate(df, context=context, mapping=mapping))
 
 
+class Ternary(Expr):
+    __slots__ = ("children",)
+    _non_child = ("dtype",)
+    children: tuple[Expr, Expr, Expr]
+
+    def __init__(
+        self, dtype: plc.DataType, when: Expr, then: Expr, otherwise: Expr
+    ) -> None:
+        super().__init__(dtype)
+        self.children = (when, then, otherwise)
+
+    def do_evaluate(
+        self,
+        df: DataFrame,
+        *,
+        context: ExecutionContext = ExecutionContext.FRAME,
+        mapping: Mapping[Expr, Column] | None = None,
+    ) -> Column:
+        """Evaluate this expression given a dataframe for context."""
+        when, then, otherwise = (
+            child.evaluate(df, context=context, mapping=mapping)
+            for child in self.children
+        )
+        then_obj = then.obj_scalar if then.is_scalar else then.obj
+        otherwise_obj = otherwise.obj_scalar if otherwise.is_scalar else otherwise.obj
+        return Column(plc.copying.copy_if_else(then_obj, otherwise_obj, when.obj))
+
+
 class BinOp(Expr):
     __slots__ = ("op", "children")
     _non_child = ("dtype", "op")
diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py
index 742e5a591ee..953ff636cce 100644
--- a/python/cudf_polars/cudf_polars/dsl/translate.py
+++ b/python/cudf_polars/cudf_polars/dsl/translate.py
@@ -446,6 +446,16 @@ def _(node: pl_expr.Agg, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Ex
     )
 
 
+@_translate_expr.register
+def _(node: pl_expr.Ternary, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
+    return expr.Ternary(
+        dtype,
+        translate_expr(visitor, n=node.predicate),
+        translate_expr(visitor, n=node.truthy),
+        translate_expr(visitor, n=node.falsy),
+    )
+
+
 @_translate_expr.register
 def _(
     node: pl_expr.BinaryExpr, visitor: NodeTraverser, dtype: plc.DataType
diff --git a/python/cudf_polars/tests/expressions/test_when_then.py b/python/cudf_polars/tests/expressions/test_when_then.py
new file mode 100644
index 00000000000..cf1c0fe7fce
--- /dev/null
+++ b/python/cudf_polars/tests/expressions/test_when_then.py
@@ -0,0 +1,27 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import pytest
+
+import polars as pl
+
+from cudf_polars.testing.asserts import assert_gpu_result_equal
+
+
+@pytest.mark.parametrize("then_scalar", [False, True])
+@pytest.mark.parametrize("otherwise_scalar", [False, True])
+@pytest.mark.parametrize("expr", [pl.col("c"), pl.col("c").is_not_null()])
+def test_when_then(then_scalar, otherwise_scalar, expr):
+    ldf = pl.LazyFrame(
+        {
+            "a": [1, 2, 3, 4, 5, 6, 7],
+            "b": [10, 13, 11, 15, 16, 11, 10],
+            "c": [None, True, False, False, True, True, False],
+        }
+    )
+
+    then = pl.lit(10) if then_scalar else pl.col("a")
+    otherwise = pl.lit(-2) if otherwise_scalar else pl.col("b")
+    q = ldf.select(pl.when(expr).then(then).otherwise(otherwise))
+    assert_gpu_result_equal(q)

From 6b04fd3b704efdae7d39d09beba026fcbca5f996 Mon Sep 17 00:00:00 2001
From: "Mads R. B. Kristensen" <madsbk@gmail.com>
Date: Fri, 28 Jun 2024 12:31:18 +0200
Subject: [PATCH 424/842] Memory Profiling (#15866)

Use [RMM's new memory profiler](https://github.com/rapidsai/rmm/pull/1563) to profile all functions already decorated with `_cudf_nvtx_annotate`.

Example
```python
import cudf
from cudf.utils.performance_tracking import print_memory_report

cudf.set_option("memory_profiling", True)

df1 = cudf.DataFrame({"a": [1, 2, 3]})
df2 = cudf.DataFrame({"a": [2, 2, 3]})
df3 = df1.merge(df2)

print_memory_report()
```

Output:
```
Memory Profiling
================

Ordered by: memory_peak

ncalls     memory_peak    memory_total  filename:lineno(function)
     1             272             688  /home/mkristensen/apps/miniforge3/envs/rmm-cudf-0527/lib/python3.11/site-packages/cudf/core/dataframe.py:4072(DataFrame.merge)
     2              32              64  /home/mkristensen/apps/miniforge3/envs/rmm-cudf-0527/lib/python3.11/site-packages/cudf/core/dataframe.py:1043(DataFrame._init_from_dict_like)
     2              32              64  /home/mkristensen/apps/miniforge3/envs/rmm-cudf-0527/lib/python3.11/site-packages/cudf/core/dataframe.py:690(DataFrame.__init__)
     2               0               0  /home/mkristensen/apps/miniforge3/envs/rmm-cudf-0527/lib/python3.11/site-packages/cudf/core/dataframe.py:1131(DataFrame._align_input_series_indices)
     7               0               0  /home/mkristensen/apps/miniforge3/envs/rmm-cudf-0527/lib/python3.11/site-packages/cudf/core/index.py:214(RangeIndex.__init__)
     6               0               0  /home/mkristensen/apps/miniforge3/envs/rmm-cudf-0527/lib/python3.11/site-packages/cudf/core/index.py:424(RangeIndex.__len__)
     4               0               0  /home/mkristensen/apps/miniforge3/envs/rmm-cudf-0527/lib/python3.11/site-packages/cudf/core/frame.py:271(Frame.__len__)
     2               0               0  /home/mkristensen/apps/miniforge3/envs/rmm-cudf-0527/lib/python3.11/site-packages/cudf/core/dataframe.py:3195(DataFrame._insert)
     2               0               0  /home/mkristensen/apps/miniforge3/envs/rmm-cudf-0527/lib/python3.11/site-packages/cudf/core/index.py:270(RangeIndex.name)
     2               0               0  /home/mkristensen/apps/miniforge3/envs/rmm-cudf-0527/lib/python3.11/site-packages/cudf/core/index.py:369(RangeIndex.copy)
     5               0               0  /home/mkristensen/apps/miniforge3/envs/rmm-cudf-0527/lib/python3.11/site-packages/cudf/core/frame.py:134(Frame._from_data)
     2               0               0  /home/mkristensen/apps/miniforge3/envs/rmm-cudf-0527/lib/python3.11/site-packages/cudf/core/frame.py:1039(Frame._copy_type_metadata)
     2               0               0  /home/mkristensen/apps/miniforge3/envs/rmm-cudf-0527/lib/python3.11/site-packages/cudf/core/indexed_frame.py:315(IndexedFrame._from_columns_like_self)
```

Authors:
  - Mads R. B. Kristensen (https://github.com/madsbk)

Approvers:
  - Mark Harris (https://github.com/harrism)
  - Lawrence Mitchell (https://github.com/wence-)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15866
---
 .../cudf/source/user_guide/api_docs/index.rst |   1 +
 .../api_docs/performance_tracking.rst         |  12 +
 docs/cudf/source/user_guide/index.md          |   1 +
 .../source/user_guide/memory-profiling.md     |  44 ++++
 python/cudf/cudf/core/buffer/spill_manager.py |   4 +-
 .../cudf/cudf/core/buffer/spillable_buffer.py |   7 +-
 python/cudf/cudf/core/dataframe.py            | 180 +++++++-------
 python/cudf/cudf/core/frame.py                | 110 ++++-----
 python/cudf/cudf/core/groupby/groupby.py      |  60 ++---
 python/cudf/cudf/core/index.py                | 228 +++++++++---------
 python/cudf/cudf/core/indexed_frame.py        | 144 +++++------
 python/cudf/cudf/core/multiindex.py           | 130 +++++-----
 python/cudf/cudf/core/series.py               | 228 +++++++++---------
 python/cudf/cudf/core/single_column_frame.py  |  42 ++--
 python/cudf/cudf/core/udf/groupby_utils.py    |   4 +-
 python/cudf/cudf/core/udf/utils.py            |   6 +-
 python/cudf/cudf/io/csv.py                    |   6 +-
 python/cudf/cudf/io/parquet.py                |  28 +--
 python/cudf/cudf/io/text.py                   |   6 +-
 python/cudf/cudf/options.py                   |  14 ++
 .../cudf/tests/test_performance_tracking.py   |  41 ++++
 python/cudf/cudf/utils/nvtx_annotation.py     |  30 ---
 .../cudf/cudf/utils/performance_tracking.py   |  82 +++++++
 python/cudf/cudf/utils/utils.py               |   5 +-
 python/dask_cudf/dask_cudf/backends.py        |  40 +--
 python/dask_cudf/dask_cudf/core.py            |  62 ++---
 python/dask_cudf/dask_cudf/groupby.py         |  72 +++---
 python/dask_cudf/dask_cudf/sorting.py         |  16 +-
 28 files changed, 885 insertions(+), 718 deletions(-)
 create mode 100644 docs/cudf/source/user_guide/api_docs/performance_tracking.rst
 create mode 100644 docs/cudf/source/user_guide/memory-profiling.md
 create mode 100644 python/cudf/cudf/tests/test_performance_tracking.py
 delete mode 100644 python/cudf/cudf/utils/nvtx_annotation.py
 create mode 100644 python/cudf/cudf/utils/performance_tracking.py

diff --git a/docs/cudf/source/user_guide/api_docs/index.rst b/docs/cudf/source/user_guide/api_docs/index.rst
index 5f26a921012..d05501f4a4a 100644
--- a/docs/cudf/source/user_guide/api_docs/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/index.rst
@@ -26,3 +26,4 @@ This page provides a list of all publicly accessible modules, methods and classe
     options
     extension_dtypes
     pylibcudf/index.rst
+    performance_tracking
diff --git a/docs/cudf/source/user_guide/api_docs/performance_tracking.rst b/docs/cudf/source/user_guide/api_docs/performance_tracking.rst
new file mode 100644
index 00000000000..9da79e69fb2
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/performance_tracking.rst
@@ -0,0 +1,12 @@
+.. _api.performance_tracking:
+
+====================
+Performance Tracking
+====================
+
+.. currentmodule:: cudf.utils.performance_tracking
+.. autosummary::
+   :toctree: api/
+
+   get_memory_records
+   print_memory_report
diff --git a/docs/cudf/source/user_guide/index.md b/docs/cudf/source/user_guide/index.md
index 486368c3b8b..df4e4795a08 100644
--- a/docs/cudf/source/user_guide/index.md
+++ b/docs/cudf/source/user_guide/index.md
@@ -16,5 +16,6 @@ options
 performance-comparisons/index
 PandasCompat
 copy-on-write
+memory-profiling
 pandas-2.0-breaking-changes
 ```
diff --git a/docs/cudf/source/user_guide/memory-profiling.md b/docs/cudf/source/user_guide/memory-profiling.md
new file mode 100644
index 00000000000..ab5433685e6
--- /dev/null
+++ b/docs/cudf/source/user_guide/memory-profiling.md
@@ -0,0 +1,44 @@
+(memory-profiling-user-doc)=
+
+# Memory Profiling
+
+Peak memory usage is a common concern in GPU programming because GPU memory is typically smaller than available CPU memory. To easily identify memory hotspots, cuDF provides a memory profiler. It comes with an overhead so avoid using it in performance-sensitive code.
+
+## Enabling Memory Profiling
+
+First, enable memory profiling in RMM by calling {py:func}`rmm.statistics.enable_statistics()`. This adds a statistics resource adaptor to the current RMM memory resource, which enables cuDF to access memory profiling information. See the [RMM documentation](https://docs.rapids.ai/api/rmm/stable/guide/#memory-statistics-and-profiling) for more details.
+
+Second, enable memory profiling in cuDF by setting the `memory_profiling` option to `True`. Use {py:func}`cudf.set_option` or set the environment variable ``CUDF_MEMORY_PROFILING=1`` prior to the launch of the Python interpreter.
+
+To get the result of the profiling, use {py:func}`cudf.utils.performance_tracking.print_memory_report` or access the raw profiling data by using: {py:func}`cudf.utils.performance_tracking.get_memory_records`.
+
+### Example
+In the following, we enable profiling, do some work, and then print the profiling results:
+
+```python
+>>> import cudf
+>>> from cudf.utils.performance_tracking import print_memory_report
+>>> from rmm.statistics import enable_statistics
+>>> enable_statistics()
+>>> cudf.set_option("memory_profiling", True)
+>>> cudf.DataFrame({"a": [1, 2, 3]})  # Some work
+   a
+0  1
+1  2
+2  3
+>>> print_memory_report()  # Pretty print the result of the profiling
+Memory Profiling
+================
+
+Legends:
+ncalls       - number of times the function or code block was called
+memory_peak  - peak memory allocated in function or code block (in bytes)
+memory_total - total memory allocated in function or code block (in bytes)
+
+Ordered by: memory_peak
+
+ncalls memory_peak memory_total filename:lineno(function)
+     1          32           32 cudf/core/dataframe.py:690(DataFrame.__init__)
+     2           0            0 cudf/core/index.py:214(RangeIndex.__init__)
+     6           0            0 cudf/core/index.py:424(RangeIndex.__len__)
+```
diff --git a/python/cudf/cudf/core/buffer/spill_manager.py b/python/cudf/cudf/core/buffer/spill_manager.py
index 762cd7f9e86..ed351a6b107 100644
--- a/python/cudf/cudf/core/buffer/spill_manager.py
+++ b/python/cudf/cudf/core/buffer/spill_manager.py
@@ -18,14 +18,14 @@
 import rmm.mr
 
 from cudf.options import get_option
-from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate
+from cudf.utils.performance_tracking import _performance_tracking
 from cudf.utils.string import format_bytes
 
 if TYPE_CHECKING:
     from cudf.core.buffer.spillable_buffer import SpillableBufferOwner
 
 _spill_cudf_nvtx_annotate = partial(
-    _cudf_nvtx_annotate, domain="cudf_python-spill"
+    _performance_tracking, domain="cudf_python-spill"
 )
 
 
diff --git a/python/cudf/cudf/core/buffer/spillable_buffer.py b/python/cudf/cudf/core/buffer/spillable_buffer.py
index eb57a371965..4c9e524ee05 100644
--- a/python/cudf/cudf/core/buffer/spillable_buffer.py
+++ b/python/cudf/cudf/core/buffer/spillable_buffer.py
@@ -10,6 +10,7 @@
 from typing import TYPE_CHECKING, Any, Literal
 
 import numpy
+import nvtx
 from typing_extensions import Self
 
 import rmm
@@ -21,7 +22,7 @@
     host_memory_allocation,
 )
 from cudf.core.buffer.exposure_tracked_buffer import ExposureTrackedBuffer
-from cudf.utils.nvtx_annotation import _get_color_for_nvtx, annotate
+from cudf.utils.performance_tracking import _get_color_for_nvtx
 from cudf.utils.string import format_bytes
 
 if TYPE_CHECKING:
@@ -200,7 +201,7 @@ def spill(self, target: str = "cpu") -> None:
                 )
 
             if (ptr_type, target) == ("gpu", "cpu"):
-                with annotate(
+                with nvtx.annotate(
                     message="SpillDtoH",
                     color=_get_color_for_nvtx("SpillDtoH"),
                     domain="cudf_python-spill",
@@ -218,7 +219,7 @@ def spill(self, target: str = "cpu") -> None:
                 # trigger a new call to this buffer's `spill()`.
                 # Therefore, it is important that spilling-on-demand doesn't
                 # try to unspill an already locked buffer!
-                with annotate(
+                with nvtx.annotate(
                     message="SpillHtoD",
                     color=_get_color_for_nvtx("SpillHtoD"),
                     domain="cudf_python-spill",
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index f7f5ef792d6..3fc29582c4c 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -83,7 +83,7 @@
     min_scalar_type,
     numeric_normalize_types,
 )
-from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate
+from cudf.utils.performance_tracking import _performance_tracking
 from cudf.utils.utils import GetAttrGetItemMixin, _external_only_api
 
 if TYPE_CHECKING:
@@ -145,7 +145,7 @@ def __setitem__(self, key, value):
             key = (key, slice(None))
         return self._setitem_tuple_arg(key, value)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _can_downcast_to_series(self, df, arg):
         """
         This method encapsulates the logic used
@@ -188,7 +188,7 @@ def _can_downcast_to_series(self, df, arg):
                 return True
         return False
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _downcast_to_series(self, df, arg):
         """
         "Downcast" from a DataFrame to a Series
@@ -233,11 +233,11 @@ class _DataFrameLocIndexer(_DataFrameIndexer):
     For selection by label.
     """
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _getitem_scalar(self, arg):
         return self._frame[arg[1]].loc[arg[0]]
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _getitem_tuple_arg(self, arg):
         from uuid import uuid4
 
@@ -363,7 +363,7 @@ def _getitem_tuple_arg(self, arg):
             return self._downcast_to_series(df, arg)
         return df
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _setitem_tuple_arg(self, key, value):
         if (
             isinstance(self._frame.index, MultiIndex)
@@ -532,7 +532,7 @@ def __getitem__(self, arg):
             return frame._empty_like(keep_index=True)
         assert_never(row_spec)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _setitem_tuple_arg(self, key, value):
         columns_df = self._frame._from_data(
             self._frame._data.select_by_index(key[1]), self._frame.index
@@ -677,7 +677,7 @@ class DataFrame(IndexedFrame, Serializable, GetAttrGetItemMixin):
     _groupby = DataFrameGroupBy
     _resampler = DataFrameResampler
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __init__(
         self,
         data=None,
@@ -859,7 +859,7 @@ def __init__(
             columns, pd.MultiIndex
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _init_from_series_list(self, data, columns, index):
         if index is None:
             # When `index` is `None`, the final index of
@@ -972,7 +972,7 @@ def _init_from_series_list(self, data, columns, index):
         else:
             self._data.rangeindex = True
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _init_from_list_like(self, data, index=None, columns=None):
         if index is None:
             index = RangeIndex(start=0, stop=len(data))
@@ -1030,7 +1030,7 @@ def _init_from_list_like(self, data, index=None, columns=None):
             )
             self._data.label_dtype = getattr(columns, "dtype", None)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _init_from_dict_like(
         self, data, index=None, columns=None, nan_as_null=None
     ):
@@ -1119,7 +1119,7 @@ def _from_data(
         return out
 
     @staticmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _align_input_series_indices(data, index):
         input_series = [
             Series(val)
@@ -1187,7 +1187,7 @@ def deserialize(cls, header, frames):
         return obj
 
     @property
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def shape(self):
         """Returns a tuple representing the dimensionality of the DataFrame."""
         return self._num_rows, self._num_columns
@@ -1270,7 +1270,7 @@ def __setattr__(self, key, col):
         else:
             super().__setattr__(key, col)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __getitem__(self, arg):
         """
         If *arg* is a ``str`` or ``int`` type, return the column Series.
@@ -1364,7 +1364,7 @@ def __getitem__(self, arg):
                 f"__getitem__ on type {type(arg)} is not supported"
             )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __setitem__(self, arg, value):
         """Add/set column by *arg or DataFrame*"""
         if isinstance(arg, DataFrame):
@@ -1482,7 +1482,7 @@ def __setitem__(self, arg, value):
     def __delitem__(self, name):
         self._drop_column(name)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def memory_usage(self, index=True, deep=False):
         mem_usage = [col.memory_usage for col in self._data.columns]
         names = [str(name) for name in self._data.names]
@@ -1494,7 +1494,7 @@ def memory_usage(self, index=True, deep=False):
             index=as_index(names),
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __array_function__(self, func, types, args, kwargs):
         if "out" in kwargs or not all(
             issubclass(t, (Series, DataFrame)) for t in types
@@ -1528,7 +1528,7 @@ def __array_function__(self, func, types, args, kwargs):
         return NotImplemented
 
     # The _get_numeric_data method is necessary for dask compatibility.
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _get_numeric_data(self):
         """Return a dataframe with only numeric data types"""
         columns = [
@@ -1538,7 +1538,7 @@ def _get_numeric_data(self):
         ]
         return self[columns]
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def assign(self, **kwargs: Callable[[Self], Any] | Any):
         """
         Assign columns to DataFrame from keyword arguments.
@@ -1571,7 +1571,7 @@ def assign(self, **kwargs: Callable[[Self], Any] | Any):
         return new_df
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _concat(
         cls, objs, axis=0, join="outer", ignore_index=False, sort=False
     ):
@@ -1963,12 +1963,12 @@ def _get_renderable_dataframe(self):
 
         return output
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __repr__(self):
         output = self._get_renderable_dataframe()
         return self._clean_renderable_dataframe(output)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _repr_html_(self):
         lines = (
             self._get_renderable_dataframe()
@@ -1984,7 +1984,7 @@ def _repr_html_(self):
             lines.append("</div>")
         return "\n".join(lines)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _repr_latex_(self):
         return self._get_renderable_dataframe().to_pandas()._repr_latex_()
 
@@ -2098,7 +2098,7 @@ def _make_operands_and_index_for_binop(
         return operands, index, can_use_self_column_name
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def from_dict(
         cls,
         data: dict,
@@ -2233,7 +2233,7 @@ def from_dict(
                 f"parameter. Got '{orient}' instead"
             )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def to_dict(
         self,
         orient: str = "dict",
@@ -2354,7 +2354,7 @@ def to_dict(
 
         return self.to_pandas().to_dict(orient=orient, into=into)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def scatter_by_map(
         self, map_index, map_size=None, keep_index=True, debug: bool = False
     ):
@@ -2447,7 +2447,7 @@ def scatter_by_map(
 
         return result
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def update(
         self,
         other,
@@ -2542,23 +2542,23 @@ def update(
 
         self._mimic_inplace(source_df, inplace=True)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __iter__(self):
         return iter(self._column_names)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __contains__(self, item):
         # This must check against containment in the pandas Index and not
         # self._column_names to handle NA, None, nan, etc. correctly.
         return item in self._data.to_pandas_index()
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def items(self):
         """Iterate over column names and series pairs"""
         for k in self:
             yield (k, self[k])
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def equals(self, other) -> bool:
         ret = super().equals(other)
         # If all other checks matched, validate names.
@@ -2591,13 +2591,13 @@ def at(self):
         "index is absolutely necessary. For checking if the columns are a "
         "MultiIndex, use _data.multiindex."
     )
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def columns(self):
         """Returns a tuple of columns"""
         return self._data.to_pandas_index()
 
     @columns.setter  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def columns(self, columns):
         multiindex = False
         rangeindex = False
@@ -2665,7 +2665,7 @@ def _set_columns_like(self, other: ColumnAccessor) -> None:
             verify=False,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def reindex(
         self,
         labels=None,
@@ -2813,7 +2813,7 @@ def reindex(
             fill_value=fill_value,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def set_index(
         self,
         keys,
@@ -2980,7 +2980,7 @@ def set_index(
         df.index = idx
         return df if not inplace else None
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def fillna(
         self, value=None, method=None, axis=None, inplace=False, limit=None
     ):  # noqa: D102
@@ -3006,7 +3006,7 @@ def fillna(
             value=value, method=method, axis=axis, inplace=inplace, limit=limit
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def where(self, cond, other=None, inplace=False):
         from cudf.core._internals.where import (
             _check_and_cast_columns_with_other,
@@ -3163,7 +3163,7 @@ def reset_index(
             inplace=inplace,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def insert(self, loc, name, value, nan_as_null=no_default):
         """Add a column to DataFrame at the index specified by loc.
 
@@ -3189,7 +3189,7 @@ def insert(self, loc, name, value, nan_as_null=no_default):
             ignore_index=False,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True):
         """
         Same as `insert`, with additional `ignore_index` param.
@@ -3271,7 +3271,7 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True):
         self._data.insert(name, value, loc=loc)
 
     @property  # type:ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def axes(self):
         """
         Return a list representing the axes of the DataFrame.
@@ -3363,7 +3363,7 @@ def diff(self, periods=1, axis=0):
 
         return self - self.shift(periods=periods)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def drop_duplicates(
         self,
         subset=None,
@@ -3451,14 +3451,14 @@ def drop_duplicates(
 
         return self._mimic_inplace(outdf, inplace=inplace)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def pop(self, item):
         """Return a column and drop it from the DataFrame."""
         popped = self[item]
         del self[item]
         return popped
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def rename(
         self,
         mapper=None,
@@ -3616,7 +3616,7 @@ def rename(
 
         return result
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def add_prefix(self, prefix):
         # TODO: Change to deep=False when copy-on-write is default
         out = self.copy(deep=True)
@@ -3625,7 +3625,7 @@ def add_prefix(self, prefix):
         ]
         return out
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def add_suffix(self, suffix):
         # TODO: Change to deep=False when copy-on-write is default
         out = self.copy(deep=True)
@@ -3634,7 +3634,7 @@ def add_suffix(self, suffix):
         ]
         return out
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def agg(self, aggs, axis=None):
         """
         Aggregate using one or more operations over the specified axis.
@@ -3770,7 +3770,7 @@ def agg(self, aggs, axis=None):
         else:
             raise ValueError("argument must be a string, list or dict")
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def nlargest(self, n, columns, keep="first"):
         """Return the first *n* rows ordered by *columns* in descending order.
 
@@ -3910,7 +3910,7 @@ def nsmallest(self, n, columns, keep="first"):
         """
         return self._n_largest_or_smallest(False, n, columns, keep)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def swaplevel(self, i=-2, j=-1, axis=0):
         """
         Swap level i with level j.
@@ -3977,7 +3977,7 @@ def swaplevel(self, i=-2, j=-1, axis=0):
 
         return result
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def transpose(self):
         """Transpose index and columns.
 
@@ -4041,7 +4041,7 @@ def transpose(self):
 
     T = property(transpose, doc=transpose.__doc__)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def melt(self, **kwargs):
         """Unpivots a DataFrame from wide format to long format,
         optionally leaving identifier variables set.
@@ -4071,7 +4071,7 @@ def melt(self, **kwargs):
 
         return melt(self, **kwargs)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def merge(
         self,
         right,
@@ -4224,7 +4224,7 @@ def merge(
             suffixes=suffixes,
         ).perform_merge()
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def join(
         self,
         other,
@@ -4273,7 +4273,7 @@ def join(
         )
         return df
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @docutils.doc_apply(
         groupby_doc_template.format(
             ret=textwrap.dedent(
@@ -4407,7 +4407,7 @@ def query(self, expr, local_dict=None):
                 BooleanMask.from_column_unchecked(boolmask)
             )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def apply(
         self, func, axis=1, raw=False, result_type=None, args=(), **kwargs
     ):
@@ -4691,7 +4691,7 @@ def _func(x):  # pragma: no cover
 
         return DataFrame._from_data(result, index=self.index)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @applyutils.doc_apply()
     def apply_rows(
         self,
@@ -4770,7 +4770,7 @@ def apply_rows(
             cache_key=cache_key,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @applyutils.doc_applychunks()
     def apply_chunks(
         self,
@@ -4837,7 +4837,7 @@ def apply_chunks(
             tpb=tpb,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def partition_by_hash(self, columns, nparts, keep_index=True):
         """Partition the dataframe by the hashed value of data in *columns*.
 
@@ -5181,7 +5181,7 @@ def _sizeof_fmt(num, size_qualifier):
 
         cudf.utils.ioutils.buffer_write_lines(buf, lines)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @docutils.doc_describe()
     def describe(
         self,
@@ -5243,7 +5243,7 @@ def describe(
                 )
             return res
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def to_pandas(
         self, *, nullable: bool = False, arrow_type: bool = False
     ) -> pd.DataFrame:
@@ -5333,7 +5333,7 @@ def to_pandas(
         return out_df
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def from_pandas(cls, dataframe, nan_as_null=no_default):
         """
         Convert from a Pandas DataFrame.
@@ -5406,7 +5406,7 @@ def from_pandas(cls, dataframe, nan_as_null=no_default):
             )
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def from_arrow(cls, table):
         """
         Convert from PyArrow Table to DataFrame.
@@ -5492,7 +5492,7 @@ def from_arrow(cls, table):
 
         return out
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def to_arrow(self, preserve_index=None):
         """
         Convert to a PyArrow Table.
@@ -5582,7 +5582,7 @@ def to_arrow(self, preserve_index=None):
 
         return out.replace_schema_metadata(metadata)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def to_records(self, index=True):
         """Convert to a numpy recarray
 
@@ -5606,7 +5606,7 @@ def to_records(self, index=True):
         return ret
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def from_records(cls, data, index=None, columns=None, nan_as_null=False):
         """
         Convert structured or record ndarray to DataFrame.
@@ -5685,7 +5685,7 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False):
         return df
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False):
         """Convert a numpy/cupy array to DataFrame.
 
@@ -5763,7 +5763,7 @@ def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False):
             index=index,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def interpolate(
         self,
         method="linear",
@@ -5793,7 +5793,7 @@ def interpolate(
             **kwargs,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def quantile(
         self,
         q=0.5,
@@ -5936,7 +5936,7 @@ def quantile(
         result.index = cudf.Index(list(map(float, qs)), dtype="float64")
         return result
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def isin(self, values):
         """
         Whether each element in the DataFrame is contained in values.
@@ -6080,7 +6080,7 @@ def make_false_column_like_self():
     #
     # Stats
     #
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _prepare_for_rowwise_op(self, method, skipna, numeric_only):
         """Prepare a DataFrame for CuPy-based row-wise operations."""
 
@@ -6132,7 +6132,7 @@ def _prepare_for_rowwise_op(self, method, skipna, numeric_only):
             coerced = coerced.astype("int64", copy=False)
         return coerced, mask, common_dtype
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def count(self, axis=0, numeric_only=False):
         """
         Count ``non-NA`` cells for each column or row.
@@ -6184,7 +6184,7 @@ def count(self, axis=0, numeric_only=False):
         "columns": 1,
     }
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _reduce(
         self,
         op,
@@ -6308,7 +6308,7 @@ def _reduce(
         else:
             raise ValueError(f"Invalid value of {axis=} received for {op}")
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _scan(
         self,
         op,
@@ -6325,7 +6325,7 @@ def _scan(
         elif axis == 1:
             return self._apply_cupy_method_axis_1(op, **kwargs)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def mode(self, axis=0, numeric_only=False, dropna=True):
         """
         Get the mode(s) of each element along the selected axis.
@@ -6432,17 +6432,17 @@ def mode(self, axis=0, numeric_only=False, dropna=True):
 
         return df
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def all(self, axis=0, bool_only=None, skipna=True, **kwargs):
         obj = self.select_dtypes(include="bool") if bool_only else self
         return super(DataFrame, obj).all(axis, skipna, **kwargs)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def any(self, axis=0, bool_only=None, skipna=True, **kwargs):
         obj = self.select_dtypes(include="bool") if bool_only else self
         return super(DataFrame, obj).any(axis, skipna, **kwargs)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _apply_cupy_method_axis_1(self, method, *args, **kwargs):
         # This method uses cupy to perform scans and reductions along rows of a
         # DataFrame. Since cuDF is designed around columnar storage and
@@ -6542,7 +6542,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs):
             result_df._set_columns_like(prepared._data)
             return result_df
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _columns_view(self, columns):
         """
         Return a subset of the DataFrame's columns as a view.
@@ -6551,7 +6551,7 @@ def _columns_view(self, columns):
             {col: self._data[col] for col in columns}, index=self.index
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def select_dtypes(self, include=None, exclude=None):
         """Return a subset of the DataFrame's columns based on the column dtypes.
 
@@ -6816,7 +6816,7 @@ def to_orc(
             index=index,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def stack(self, level=-1, dropna=no_default, future_stack=False):
         """Stack the prescribed level(s) from columns to index
 
@@ -7161,7 +7161,7 @@ def unnamed_group_generator():
         else:
             return result
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def cov(self, **kwargs):
         """Compute the covariance matrix of a DataFrame.
 
@@ -7216,7 +7216,7 @@ def corr(self, method="pearson", min_periods=None):
         df._set_columns_like(self._data)
         return df
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def to_struct(self, name=None):
         """
         Return a struct Series composed of the columns of the DataFrame.
@@ -7250,7 +7250,7 @@ def to_struct(self, name=None):
             name=name,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def keys(self):
         """
         Get the columns.
@@ -7310,14 +7310,14 @@ def iterrows(self):
             "if you wish to iterate over each row."
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @copy_docstring(reshape.pivot)
     def pivot(self, *, columns, index=no_default, values=no_default):
         return cudf.core.reshape.pivot(
             self, index=index, columns=columns, values=values
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @copy_docstring(reshape.pivot_table)
     def pivot_table(
         self,
@@ -7346,14 +7346,14 @@ def pivot_table(
             sort=sort,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @copy_docstring(reshape.unstack)
     def unstack(self, level=-1, fill_value=None):
         return cudf.core.reshape.unstack(
             self, level=level, fill_value=fill_value
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def explode(self, column, ignore_index=False):
         """
         Transform each element of a list-like to a row, replicating index
@@ -7549,7 +7549,7 @@ def _from_columns_like_self(
         result._set_columns_like(self._data)
         return result
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def interleave_columns(self):
         """
         Interleave Series columns of a table into a single column.
@@ -7597,7 +7597,7 @@ def interleave_columns(self):
             {None: libcudf.reshape.interleave_columns([*self._columns])}
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def eval(self, expr: str, inplace: bool = False, **kwargs):
         """Evaluate a string describing operations on DataFrame columns.
 
@@ -7953,7 +7953,7 @@ def func(left, right, output):
     )
 
 
-@_cudf_nvtx_annotate
+@_performance_tracking
 def from_pandas(obj, nan_as_null=no_default):
     """
     Convert certain Pandas objects into the cudf equivalent.
@@ -8080,7 +8080,7 @@ def from_pandas(obj, nan_as_null=no_default):
         )
 
 
-@_cudf_nvtx_annotate
+@_performance_tracking
 def merge(left, right, *args, **kwargs):
     if isinstance(left, Series):
         left = left.to_frame()
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 8ca71180c00..9bac75dc6ac 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -32,7 +32,7 @@
 from cudf.core.mixins import BinaryOperand, Scannable
 from cudf.utils import ioutils
 from cudf.utils.dtypes import find_common_type
-from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate
+from cudf.utils.performance_tracking import _performance_tracking
 from cudf.utils.utils import _array_ufunc, _warn_no_dask_cudf
 
 if TYPE_CHECKING:
@@ -86,7 +86,7 @@ def _dtypes(self) -> abc.Iterable:
     def ndim(self) -> int:
         raise NotImplementedError()
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def serialize(self):
         # TODO: See if self._data can be serialized outright
         header = {
@@ -101,7 +101,7 @@ def serialize(self):
         return header, frames
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def deserialize(cls, header, frames):
         cls_deserialize = pickle.loads(header["type-serialized"])
         column_names = pickle.loads(header["column_names"])
@@ -122,7 +122,7 @@ def deserialize(cls, header, frames):
         return cls_deserialize._from_data(col_accessor)
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _from_data(cls, data: MutableMapping) -> Self:
         """
         Construct cls from a ColumnAccessor-like mapping.
@@ -131,7 +131,7 @@ def _from_data(cls, data: MutableMapping) -> Self:
         Frame.__init__(obj, data)
         return obj
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _from_data_like_self(self, data: MutableMapping) -> Self:
         """
         Return type(self) from a ColumnAccessor-like mapping but
@@ -139,7 +139,7 @@ def _from_data_like_self(self, data: MutableMapping) -> Self:
         """
         return self._from_data(data)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _from_columns_like_self(
         self,
         columns: list[ColumnBase],
@@ -155,7 +155,7 @@ def _from_columns_like_self(
         frame = self.__class__._from_data(data)
         return frame._copy_type_metadata(self)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _mimic_inplace(
         self, result: Self, inplace: bool = False
     ) -> Self | None:
@@ -171,7 +171,7 @@ def _mimic_inplace(
             return result
 
     @property
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def size(self) -> int:
         """
         Return the number of elements in the underlying data.
@@ -263,11 +263,11 @@ def memory_usage(self, deep=False):
         """
         raise NotImplementedError
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __len__(self) -> int:
         return self._num_rows
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def astype(self, dtype: dict[Any, Dtype], copy: bool = False) -> Self:
         casted = (
             col.astype(dtype.get(col_name, col.dtype), copy=copy)
@@ -276,7 +276,7 @@ def astype(self, dtype: dict[Any, Dtype], copy: bool = False) -> Self:
         ca = self._data._from_columns_like_self(casted, verify=False)
         return self._from_data_like_self(ca)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def equals(self, other) -> bool:
         """
         Test whether two objects contain the same elements.
@@ -347,7 +347,7 @@ def equals(self, other) -> bool:
             )
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _get_columns_by_label(self, labels) -> Self:
         """
         Returns columns of the Frame specified by `labels`.
@@ -357,7 +357,7 @@ def _get_columns_by_label(self, labels) -> Self:
         return self._from_data_like_self(self._data.select_by_label(labels))
 
     @property
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def values(self) -> cupy.ndarray:
         """
         Return a CuPy representation of the DataFrame.
@@ -373,7 +373,7 @@ def values(self) -> cupy.ndarray:
         return self.to_cupy()
 
     @property
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def values_host(self) -> np.ndarray:
         """
         Return a NumPy representation of the data.
@@ -388,7 +388,7 @@ def values_host(self) -> np.ndarray:
         """
         return self.to_numpy()
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __array__(self, dtype=None):
         raise TypeError(
             "Implicit conversion to a host NumPy array via __array__ is not "
@@ -397,14 +397,14 @@ def __array__(self, dtype=None):
             "using .to_numpy()."
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __arrow_array__(self, type=None):
         raise TypeError(
             "Implicit conversion to a host PyArrow object via __arrow_array__ "
             "is not allowed. Consider using .to_arrow()"
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _to_array(
         self,
         get_array: Callable,
@@ -468,7 +468,7 @@ def to_array(
     # particular, we need to benchmark how much of the overhead is coming from
     # (potentially unavoidable) local copies in to_cupy and how much comes from
     # inefficiencies in the implementation.
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def to_cupy(
         self,
         dtype: Dtype | None = None,
@@ -502,7 +502,7 @@ def to_cupy(
             na_value,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def to_numpy(
         self,
         dtype: Dtype | None = None,
@@ -537,7 +537,7 @@ def to_numpy(
             lambda col: col.values_host, numpy, copy, dtype, na_value
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def where(self, cond, other=None, inplace: bool = False) -> Self | None:
         """
         Replace values where the condition is False.
@@ -610,7 +610,7 @@ def where(self, cond, other=None, inplace: bool = False) -> Self | None:
         """
         raise NotImplementedError
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def fillna(
         self,
         value: None | ScalarLike | cudf.Series = None,
@@ -767,14 +767,14 @@ def fillna(
             inplace=inplace,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _drop_column(self, name):
         """Drop a column by *name*"""
         if name not in self._data:
             raise KeyError(f"column '{name}' does not exist")
         del self._data[name]
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _quantile_table(
         self,
         q: float,
@@ -808,7 +808,7 @@ def _quantile_table(
         )
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def from_arrow(cls, data: pa.Table) -> Self:
         """Convert from PyArrow Table to Frame
 
@@ -968,7 +968,7 @@ def from_arrow(cls, data: pa.Table) -> Self:
 
         return cls._from_data({name: result[name] for name in column_names})
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def to_arrow(self):
         """
         Convert to arrow Table
@@ -992,7 +992,7 @@ def to_arrow(self):
             {str(name): col.to_arrow() for name, col in self._data.items()}
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _positions_from_column_names(self, column_names) -> list[int]:
         """Map each column name into their positions in the frame.
 
@@ -1005,7 +1005,7 @@ def _positions_from_column_names(self, column_names) -> list[int]:
             if name in set(column_names)
         ]
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _copy_type_metadata(self: Self, other: Self) -> Self:
         """
         Copy type metadata from each column of `other` to the corresponding
@@ -1020,7 +1020,7 @@ def _copy_type_metadata(self: Self, other: Self) -> Self:
 
         return self
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def isna(self):
         """
         Identify missing values.
@@ -1101,7 +1101,7 @@ def isna(self):
     # Alias for isna
     isnull = isna
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def notna(self):
         """
         Identify non-missing values.
@@ -1182,7 +1182,7 @@ def notna(self):
     # Alias for notna
     notnull = notna
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def searchsorted(
         self,
         values,
@@ -1296,7 +1296,7 @@ def searchsorted(
         else:
             return result
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def argsort(
         self,
         by=None,
@@ -1383,7 +1383,7 @@ def argsort(
             by=by, ascending=ascending, na_position=na_position
         ).values
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _get_sorted_inds(
         self,
         by=None,
@@ -1411,7 +1411,7 @@ def _get_sorted_inds(
             stable=True,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _split(self, splits):
         """Split a frame with split points in ``splits``. Returns a list of
         Frames of length `len(splits) + 1`.
@@ -1426,13 +1426,13 @@ def _split(self, splits):
             for split_idx in range(len(splits) + 1)
         ]
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _encode(self):
         columns, indices = libcudf.transform.table_encode([*self._columns])
         keys = self._from_columns_like_self(columns)
         return keys, indices
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _unaryop(self, op):
         data_columns = (col.unary_operator(op) for col in self._columns)
         return self._from_data_like_self(
@@ -1440,7 +1440,7 @@ def _unaryop(self, op):
         )
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _colwise_binop(
         cls,
         operands: dict[str | None, tuple[ColumnBase, Any, bool, Any]],
@@ -1519,11 +1519,11 @@ def _colwise_binop(
 
         return output
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
         return _array_ufunc(self, ufunc, method, inputs, kwargs)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @acquire_spill_lock()
     def _apply_cupy_ufunc_to_operands(
         self, ufunc, cupy_func, operands, **kwargs
@@ -1565,7 +1565,7 @@ def _apply_cupy_ufunc_to_operands(
         return data
 
     # Unary logical operators
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __neg__(self):
         """Negate for integral dtypes, logical NOT for bools."""
         return self._from_data_like_self(
@@ -1579,30 +1579,30 @@ def __neg__(self):
             )
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __pos__(self):
         return self.copy(deep=True)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __abs__(self):
         return self._unaryop("abs")
 
     # Reductions
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _get_axis_from_axis_arg(cls, axis):
         try:
             return cls._SUPPORT_AXIS_LOOKUP[axis]
         except KeyError:
             raise ValueError(f"No axis named {axis} for object type {cls}")
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _reduce(self, *args, **kwargs):
         raise NotImplementedError(
             f"Reductions are not supported for objects of type {type(self)}."
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def min(
         self,
         axis=0,
@@ -1653,7 +1653,7 @@ def min(
             **kwargs,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def max(
         self,
         axis=0,
@@ -1701,7 +1701,7 @@ def max(
             **kwargs,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def all(self, axis=0, skipna=True, **kwargs):
         """
         Return whether all elements are True in DataFrame.
@@ -1754,7 +1754,7 @@ def all(self, axis=0, skipna=True, **kwargs):
             **kwargs,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def any(self, axis=0, skipna=True, **kwargs):
         """
         Return whether any elements is True in DataFrame.
@@ -1807,26 +1807,26 @@ def any(self, axis=0, skipna=True, **kwargs):
             **kwargs,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @ioutils.doc_to_dlpack()
     def to_dlpack(self):
         """{docstring}"""
 
         return cudf.io.dlpack.to_dlpack(self)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __str__(self):
         return repr(self)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __deepcopy__(self, memo):
         return self.copy(deep=True)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __copy__(self):
         return self.copy(deep=False)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __invert__(self):
         """Bitwise invert (~) for integral dtypes, logical NOT for bools."""
         return self._from_data_like_self(
@@ -1835,7 +1835,7 @@ def __invert__(self):
             )
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def nunique(self, dropna: bool = True):
         """
         Returns a per column mapping with counts of unique values for
@@ -1856,7 +1856,7 @@ def nunique(self, dropna: bool = True):
         )
 
     @staticmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _repeat(
         columns: list[ColumnBase], repeats, axis=None
     ) -> list[ColumnBase]:
@@ -1870,7 +1870,7 @@ def _repeat(
 
         return libcudf.filling.repeat(columns, repeats)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @_warn_no_dask_cudf
     def __dask_tokenize__(self):
         from dask.base import normalize_token
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index 77b54a583d3..eccb3acabf6 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -31,7 +31,7 @@
 from cudf.core.mixins import Reducible, Scannable
 from cudf.core.multiindex import MultiIndex
 from cudf.core.udf.groupby_utils import _can_be_jitted, jit_groupby_apply
-from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate
+from cudf.utils.performance_tracking import _performance_tracking
 from cudf.utils.utils import GetAttrGetItemMixin
 
 if TYPE_CHECKING:
@@ -392,7 +392,7 @@ def indices(self):
             zip(index.to_pandas(), cp.split(indices.values, offsets[1:-1]))
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def get_group(self, name, obj=None):
         """
         Construct DataFrame from group with provided name.
@@ -436,7 +436,7 @@ def get_group(self, name, obj=None):
             )
         return obj.iloc[self.indices[name]]
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def size(self):
         """
         Return the size of each group.
@@ -451,7 +451,7 @@ def size(self):
             .agg("size")
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def cumcount(self):
         """
         Return the cumulative count of keys in each group.
@@ -467,7 +467,7 @@ def cumcount(self):
             .agg("cumcount")
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def rank(
         self,
         method="average",
@@ -521,7 +521,7 @@ def _groupby(self):
             [*self.grouping.keys._columns], dropna=self._dropna
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def agg(self, func):
         """
         Apply aggregation(s) to the groups.
@@ -821,7 +821,7 @@ def _head_tail(self, n, *, take_head: bool, preserve_order: bool):
         else:
             return result
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def head(self, n: int = 5, *, preserve_order: bool = True):
         """Return first n rows of each group
 
@@ -874,7 +874,7 @@ def head(self, n: int = 5, *, preserve_order: bool = True):
             n, take_head=True, preserve_order=preserve_order
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def tail(self, n: int = 5, *, preserve_order: bool = True):
         """Return last n rows of each group
 
@@ -928,7 +928,7 @@ def tail(self, n: int = 5, *, preserve_order: bool = True):
             n, take_head=False, preserve_order=preserve_order
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def nth(self, n):
         """
         Return the nth row from each group.
@@ -949,7 +949,7 @@ def nth(self, n):
         del self.obj._data["__groupbynth_order__"]
         return result
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def ngroup(self, ascending=True):
         """
         Number each group from 0 to the number of groups - 1.
@@ -1261,7 +1261,7 @@ def _normalize_aggs(
         ]
         return column_names, columns, normalized_aggs
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def pipe(self, func, *args, **kwargs):
         """
         Apply a function `func` with arguments to this GroupBy
@@ -1316,7 +1316,7 @@ def pipe(self, func, *args, **kwargs):
         """
         return cudf.core.common.pipe(self, func, *args, **kwargs)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _jit_groupby_apply(
         self, function, group_names, offsets, group_keys, grouped_values, *args
     ):
@@ -1327,7 +1327,7 @@ def _jit_groupby_apply(
             chunk_results, group_names, group_keys, grouped_values
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _iterative_groupby_apply(
         self, function, group_names, offsets, group_keys, grouped_values, *args
     ):
@@ -1415,7 +1415,7 @@ def _post_process_chunk_results(
                 result.index = cudf.MultiIndex._from_data(index_data)
         return result
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def apply(
         self, function, *args, engine="auto", include_groups: bool = True
     ):
@@ -1573,7 +1573,7 @@ def mult(df):
             result = result.reset_index()
         return result
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def apply_grouped(self, function, **kwargs):
         """Apply a transformation function over the grouped chunk.
 
@@ -1712,7 +1712,7 @@ def rolling_avg(val, avg):
         kwargs.update({"chunks": offsets})
         return grouped_values.apply_chunks(function, **kwargs)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _broadcast(self, values):
         """
         Broadcast the results of an aggregation to the group
@@ -1736,7 +1736,7 @@ def _broadcast(self, values):
             values.index = self.obj.index
         return values
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def transform(self, function):
         """Apply an aggregation, then broadcast the result to the group size.
 
@@ -1801,7 +1801,7 @@ def rolling(self, *args, **kwargs):
         """
         return cudf.core.window.rolling.RollingGroupby(self, *args, **kwargs)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def count(self, dropna=True):
         """Compute the number of values in each column.
 
@@ -1816,7 +1816,7 @@ def func(x):
 
         return self.agg(func)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def describe(self, include=None, exclude=None):
         """
         Generate descriptive statistics that summarizes the central tendency,
@@ -1888,7 +1888,7 @@ def describe(self, include=None, exclude=None):
         )
         return res
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def corr(self, method="pearson", min_periods=1):
         """
         Compute pairwise correlation of columns, excluding NA/null values.
@@ -1950,7 +1950,7 @@ def corr(self, method="pearson", min_periods=1):
             lambda x: x.corr(method, min_periods), "Correlation"
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def cov(self, min_periods=0, ddof=1):
         """
         Compute the pairwise covariance among the columns of a DataFrame,
@@ -2129,7 +2129,7 @@ def _cov_or_corr(self, func, method_name):
 
         return res
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def var(self, ddof=1):
         """Compute the column-wise variance of the values in each group.
 
@@ -2145,7 +2145,7 @@ def func(x):
 
         return self.agg(func)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def std(self, ddof=1):
         """Compute the column-wise std of the values in each group.
 
@@ -2161,7 +2161,7 @@ def func(x):
 
         return self.agg(func)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def quantile(self, q=0.5, interpolation="linear"):
         """Compute the column-wise quantiles of the values in each group.
 
@@ -2179,18 +2179,18 @@ def func(x):
 
         return self.agg(func)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def collect(self):
         """Get a list of all the values for each column in each group."""
         _deprecate_collect()
         return self.agg(list)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def unique(self):
         """Get a list of the unique values for each column in each group."""
         return self.agg("unique")
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def diff(self, periods=1, axis=0):
         """Get the difference between the values in each group.
 
@@ -2258,7 +2258,7 @@ def bfill(self, limit=None):
 
         return self._scan_fill("bfill", limit)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def fillna(
         self,
         value=None,
@@ -2325,7 +2325,7 @@ def fillna(
             value=value, inplace=inplace, axis=axis, limit=limit
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def shift(self, periods=1, freq=None, axis=0, fill_value=None):
         """
         Shift each group by ``periods`` positions.
@@ -2388,7 +2388,7 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None):
         result = self._mimic_pandas_order(result)
         return result._copy_type_metadata(values)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def pct_change(
         self,
         periods=1,
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 71658695b80..e069f8d0ea6 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -58,7 +58,7 @@
     is_mixed_with_object_dtype,
     numeric_normalize_types,
 )
-from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate
+from cudf.utils.performance_tracking import _performance_tracking
 from cudf.utils.utils import _warn_no_dask_cudf, search_range
 
 if TYPE_CHECKING:
@@ -204,7 +204,7 @@ class RangeIndex(BaseIndex, BinaryOperand):
 
     _range: range
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __init__(
         self, start, stop=None, step=1, dtype=None, copy=False, name=None
     ):
@@ -259,17 +259,17 @@ def factorize(self, sort: bool = False, use_na_sentinel: bool = True):
         return codes, uniques
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def name(self):
         return self._name
 
     @name.setter  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def name(self, value):
         self._name = value
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def start(self) -> int:
         """
         The value of the `start` parameter (0 if this was not supplied).
@@ -277,7 +277,7 @@ def start(self) -> int:
         return self._range.start
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def stop(self) -> int:
         """
         The value of the stop parameter.
@@ -285,7 +285,7 @@ def stop(self) -> int:
         return self._range.stop
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def step(self) -> int:
         """
         The value of the step parameter.
@@ -293,12 +293,12 @@ def step(self) -> int:
         return self._range.step
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _num_rows(self) -> int:
         return len(self)
 
     @cached_property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _values(self):
         if len(self) > 0:
             return column.as_column(self._range, dtype=self.dtype)
@@ -330,18 +330,18 @@ def _is_interval(self) -> bool:
         return False
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def hasnans(self) -> bool:
         return False
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _data(self):
         return cudf.core.column_accessor.ColumnAccessor(
             {self.name: self._values}
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __contains__(self, item):
         hash(item)
         if isinstance(item, bool) or not isinstance(
@@ -357,7 +357,7 @@ def __contains__(self, item):
         except (ValueError, OverflowError):
             return False
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def copy(self, name=None, deep=False):
         """
         Make a copy of this object.
@@ -377,7 +377,7 @@ def copy(self, name=None, deep=False):
 
         return RangeIndex(self._range, name=name)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def astype(self, dtype, copy: bool = True):
         if is_dtype_equal(dtype, self.dtype):
             return self
@@ -386,15 +386,15 @@ def astype(self, dtype, copy: bool = True):
     def fillna(self, value, downcast=None):
         return self.copy()
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def drop_duplicates(self, keep="first"):
         return self
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def duplicated(self, keep="first") -> cupy.ndarray:
         return cupy.zeros(len(self), dtype=bool)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __repr__(self):
         return (
             f"{self.__class__.__name__}(start={self.start}, stop={self.stop}"
@@ -408,15 +408,15 @@ def __repr__(self):
         )
 
     @property
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def size(self) -> int:
         return len(self)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __len__(self):
         return len(self._range)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __getitem__(self, index):
         if isinstance(index, slice):
             sl_start, sl_stop, sl_step = index.indices(len(self))
@@ -435,13 +435,13 @@ def __getitem__(self, index):
             return self.start + index * self.step
         return self._as_int_index()[index]
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def equals(self, other) -> bool:
         if isinstance(other, RangeIndex):
             return self._range == other._range
         return self._as_int_index().equals(other)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def serialize(self):
         header = {}
         header["index_column"] = {}
@@ -462,7 +462,7 @@ def serialize(self):
         return header, frames
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def deserialize(cls, header, frames):
         h = header["index_column"]
         name = pickle.loads(header["name"])
@@ -472,7 +472,7 @@ def deserialize(cls, header, frames):
         return RangeIndex(start=start, stop=stop, step=step, name=name)
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def dtype(self):
         """
         `dtype` of the range of values in RangeIndex.
@@ -487,7 +487,7 @@ def dtype(self):
     def _dtypes(self) -> Iterable:
         return [(self.name, self.dtype)]
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def to_pandas(
         self, *, nullable: bool = False, arrow_type: bool = False
     ) -> pd.RangeIndex:
@@ -508,16 +508,16 @@ def is_unique(self) -> bool:
         return True
 
     @cached_property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def is_monotonic_increasing(self) -> bool:
         return self.step > 0 or len(self) <= 1
 
     @cached_property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def is_monotonic_decreasing(self):
         return self.step < 0 or len(self) <= 1
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def memory_usage(self, deep: bool = False) -> int:
         if deep:
             warnings.warn(
@@ -530,7 +530,7 @@ def unique(self) -> Self:
         # RangeIndex always has unique values
         return self.copy()
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __mul__(self, other):
         # Multiplication by raw ints must return a RangeIndex to match pandas.
         if isinstance(other, cudf.Scalar) and other.dtype.kind in "iu":
@@ -547,24 +547,24 @@ def __mul__(self, other):
             )
         return self._as_int_index().__mul__(other)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __rmul__(self, other):
         # Multiplication is commutative.
         return self.__mul__(other)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _as_int_index(self):
         # Convert self to an integer index. This method is used to perform ops
         # that are not defined directly on RangeIndex.
         return cudf.Index._from_data(self._data)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
         return self._as_int_index().__array_ufunc__(
             ufunc, method, *inputs, **kwargs
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def get_indexer(self, target, limit=None, method=None, tolerance=None):
         target_col = cudf.core.column.as_column(target)
         if method is not None or not isinstance(
@@ -594,7 +594,7 @@ def get_indexer(self, target, limit=None, method=None, tolerance=None):
             locs[valid] = len(self) - 1 - locs[valid]
         return locs
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def get_loc(self, key):
         if not is_scalar(key):
             raise TypeError("Should be a scalar-like")
@@ -608,7 +608,7 @@ def get_loc(self, key):
             raise KeyError(key)
         return idx_int
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _union(self, other, sort=None):
         if isinstance(other, RangeIndex):
             # Variable suffixes are of the
@@ -685,7 +685,7 @@ def _union(self, other, sort=None):
             self._as_int_index()._union(other, sort=sort)
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _intersection(self, other, sort=None):
         if not isinstance(other, RangeIndex):
             return self._try_reconstruct_range_index(
@@ -733,7 +733,7 @@ def _intersection(self, other, sort=None):
 
         return self._try_reconstruct_range_index(new_index)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def difference(self, other, sort=None):
         if isinstance(other, RangeIndex) and self.equals(other):
             return self[:0]._get_reconciled_name_object(other)
@@ -785,14 +785,14 @@ def sort_values(
         else:
             return sorted_index
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _gather(self, gather_map, nullify=False, check_bounds=True):
         gather_map = cudf.core.column.as_column(gather_map)
         return cudf.Index._from_data(
             {self.name: self._values.take(gather_map, nullify, check_bounds)}
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _apply_boolean_mask(self, boolean_mask):
         return cudf.Index._from_data(
             {self.name: self._values.apply_boolean_mask(boolean_mask)}
@@ -838,21 +838,21 @@ def join(
         )
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _column(self):
         return self._as_int_index()._column
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _columns(self):
         return self._as_int_index()._columns
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def values_host(self) -> np.ndarray:
         return np.arange(start=self.start, stop=self.stop, step=self.step)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def argsort(
         self,
         ascending=True,
@@ -865,19 +865,19 @@ def argsort(
         else:
             return cupy.arange(len(self))
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def where(self, cond, other=None, inplace=False):
         return self._as_int_index().where(cond, other, inplace)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def to_numpy(self) -> np.ndarray:
         return self.values_host
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def to_cupy(self) -> cupy.ndarray:
         return self.values
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def to_arrow(self) -> pa.Array:
         return pa.array(self._range, type=pa.from_numpy_dtype(self.dtype))
 
@@ -889,23 +889,23 @@ def __array__(self, dtype=None):
             "using .to_numpy()."
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def nunique(self, dropna: bool = True) -> int:
         return len(self)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def isna(self) -> cupy.ndarray:
         return cupy.zeros(len(self), dtype=bool)
 
     isnull = isna
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def notna(self) -> cupy.ndarray:
         return cupy.ones(len(self), dtype=bool)
 
     notnull = isna
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _minmax(self, meth: str):
         no_steps = len(self) - 1
         if no_steps == -1:
@@ -1004,12 +1004,12 @@ class Index(SingleColumnFrame, BaseIndex, metaclass=IndexMeta):
         Column's, the data Column will be cloned to adopt this name.
     """
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __init__(self, data, **kwargs):
         name = _getdefault_name(data, name=kwargs.get("name"))
         super().__init__({name: data})
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
         ret = super().__array_ufunc__(ufunc, method, *inputs, **kwargs)
 
@@ -1046,7 +1046,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
         return NotImplemented
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _from_data(cls, data: MutableMapping, name: Any = no_default) -> Self:
         out = super()._from_data(data=data)
         if name is not no_default:
@@ -1054,7 +1054,7 @@ def _from_data(cls, data: MutableMapping, name: Any = no_default) -> Self:
         return out
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _from_data_like_self(
         cls, data: MutableMapping, name: Any = no_default
     ) -> Self:
@@ -1064,7 +1064,7 @@ def _from_data_like_self(
         return out
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def from_arrow(cls, obj):
         try:
             return cls(ColumnBase.from_arrow(obj))
@@ -1118,12 +1118,12 @@ def _binaryop(
         return ret
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _values(self):
         return self._column
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _concat(cls, objs):
         non_empties = [index for index in objs if len(index)]
         if len(objs) != len(non_empties):
@@ -1166,16 +1166,16 @@ def _concat(cls, objs):
         result.name = name
         return result
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def memory_usage(self, deep=False):
         return self._column.memory_usage
 
     @cached_property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def is_unique(self):
         return self._column.is_unique
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def equals(self, other) -> bool:
         if not isinstance(other, BaseIndex) or len(self) != len(other):
             return False
@@ -1198,7 +1198,7 @@ def equals(self, other) -> bool:
         except TypeError:
             return False
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def copy(self, name=None, deep=False):
         """
         Make a copy of this object.
@@ -1221,11 +1221,11 @@ def copy(self, name=None, deep=False):
             {name: self._values.copy(True) if deep else self._values}
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def astype(self, dtype, copy: bool = True):
         return super().astype({self.name: dtype}, copy)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def get_indexer(self, target, method=None, limit=None, tolerance=None):
         if is_scalar(target):
             raise TypeError("Should be a sequence")
@@ -1297,7 +1297,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
 
         return _return_get_indexer_result(result_series.to_cupy())
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def get_loc(self, key):
         if not is_scalar(key):
             raise TypeError("Should be a scalar-like")
@@ -1333,7 +1333,7 @@ def get_loc(self, key):
         mask[true_inds] = True
         return mask
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __repr__(self):
         max_seq_items = pd.get_option("max_seq_items") or len(self)
         mr = 0
@@ -1419,7 +1419,7 @@ def __repr__(self):
         lines.append(f"{prior_to_dtype} {keywords})")
         return "\n".join(lines)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __getitem__(self, index):
         res = self._get_elements_from_column(index)
         if isinstance(res, ColumnBase):
@@ -1427,20 +1427,20 @@ def __getitem__(self, index):
         return res
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def dtype(self):
         """
         `dtype` of the underlying values in Index.
         """
         return self._values.dtype
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def isna(self):
         return self._column.isnull().values
 
     isnull = isna
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def notna(self):
         return self._column.notnull().values
 
@@ -1470,11 +1470,11 @@ def _is_interval(self):
         return False
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def hasnans(self):
         return self._column.has_nulls(include_nan=True)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def argsort(
         self,
         axis=0,
@@ -1518,7 +1518,7 @@ def repeat(self, repeats, axis=None):
             Frame._repeat([*self._columns], repeats, axis), self._column_names
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def where(self, cond, other=None, inplace=False):
         result_col = super().where(cond, other, inplace)
         return self._mimic_inplace(
@@ -1615,7 +1615,7 @@ def _indices_of(self, value):
 
     @copy_docstring(StringMethods)  # type: ignore
     @property
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def str(self):
         if is_string_dtype(self.dtype):
             return StringMethods(parent=self)
@@ -1698,7 +1698,7 @@ class DatetimeIndex(Index):
                   dtype='datetime64[ns]', name='a')
     """
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __init__(
         self,
         data=None,
@@ -1761,7 +1761,7 @@ def __init__(
             ):
                 raise ValueError("No unique frequency found")
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _copy_type_metadata(self: Self, other: Self) -> Self:
         super()._copy_type_metadata(other)
         self._freq = _validate_freq(other._freq)
@@ -1783,7 +1783,7 @@ def __getitem__(self, index):
             return pd.Timestamp(value)
         return value
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def copy(self, name=None, deep=False):
         idx_copy = super().copy(name=name, deep=deep)
         return idx_copy._copy_type_metadata(self)
@@ -1801,7 +1801,7 @@ def searchsorted(
         )
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def year(self):
         """
         The year of the datetime.
@@ -1820,7 +1820,7 @@ def year(self):
         return self._get_dt_field("year")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def month(self):
         """
         The month as January=1, December=12.
@@ -1839,7 +1839,7 @@ def month(self):
         return self._get_dt_field("month")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def day(self):
         """
         The day of the datetime.
@@ -1858,7 +1858,7 @@ def day(self):
         return self._get_dt_field("day")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def hour(self):
         """
         The hours of the datetime.
@@ -1879,7 +1879,7 @@ def hour(self):
         return self._get_dt_field("hour")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def minute(self):
         """
         The minutes of the datetime.
@@ -1900,7 +1900,7 @@ def minute(self):
         return self._get_dt_field("minute")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def second(self):
         """
         The seconds of the datetime.
@@ -1921,7 +1921,7 @@ def second(self):
         return self._get_dt_field("second")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def microsecond(self):
         """
         The microseconds of the datetime.
@@ -1952,7 +1952,7 @@ def microsecond(self):
         )
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def nanosecond(self):
         """
         The nanoseconds of the datetime.
@@ -1974,7 +1974,7 @@ def nanosecond(self):
         return self._get_dt_field("nanosecond")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def weekday(self):
         """
         The day of the week with Monday=0, Sunday=6.
@@ -1996,7 +1996,7 @@ def weekday(self):
         return self._get_dt_field("weekday")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def dayofweek(self):
         """
         The day of the week with Monday=0, Sunday=6.
@@ -2018,7 +2018,7 @@ def dayofweek(self):
         return self._get_dt_field("weekday")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def dayofyear(self):
         """
         The day of the year, from 1-365 in non-leap years and
@@ -2041,7 +2041,7 @@ def dayofyear(self):
         return self._get_dt_field("day_of_year")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def day_of_year(self):
         """
         The day of the year, from 1-365 in non-leap years and
@@ -2064,7 +2064,7 @@ def day_of_year(self):
         return self._get_dt_field("day_of_year")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def is_leap_year(self):
         """
         Boolean indicator if the date belongs to a leap year.
@@ -2083,7 +2083,7 @@ def is_leap_year(self):
         return cupy.asarray(res)
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def quarter(self):
         """
         Integer indicator for which quarter of the year the date belongs in.
@@ -2108,7 +2108,7 @@ def quarter(self):
         res = extract_quarter(self._values)
         return Index(res, dtype="int8")
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def day_name(self, locale: str | None = None) -> Index:
         """
         Return the day names. Currently supports English locale only.
@@ -2128,7 +2128,7 @@ def day_name(self, locale: str | None = None) -> Index:
         day_names = self._column.get_day_names(locale)
         return Index._from_data({self.name: day_names})
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def month_name(self, locale: str | None = None) -> Index:
         """
         Return the month names. Currently supports English locale only.
@@ -2147,7 +2147,7 @@ def month_name(self, locale: str | None = None) -> Index:
         month_names = self._column.get_month_names(locale)
         return Index._from_data({self.name: month_names})
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def isocalendar(self) -> cudf.DataFrame:
         """
         Returns a DataFrame with the year, week, and day
@@ -2172,7 +2172,7 @@ def isocalendar(self) -> cudf.DataFrame:
         )
         return cudf.DataFrame._from_data(ca, index=self)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def to_pandas(
         self, *, nullable: bool = False, arrow_type: bool = False
     ) -> pd.DatetimeIndex:
@@ -2181,7 +2181,7 @@ def to_pandas(
             result.freq = self._freq._maybe_as_fast_pandas_offset()
         return result
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _get_dt_field(self, field):
         out_column = self._values.get_dt_field(field)
         # column.column_empty_like always returns a Column object
@@ -2198,7 +2198,7 @@ def _get_dt_field(self, field):
     def _is_boolean(self):
         return False
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def ceil(self, freq):
         """
         Perform ceil operation on the data to the specified freq.
@@ -2231,7 +2231,7 @@ def ceil(self, freq):
 
         return self.__class__._from_data({self.name: out_column})
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def floor(self, freq):
         """
         Perform floor operation on the data to the specified freq.
@@ -2264,7 +2264,7 @@ def floor(self, freq):
 
         return self.__class__._from_data({self.name: out_column})
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def round(self, freq):
         """
         Perform round operation on the data to the specified freq.
@@ -2452,7 +2452,7 @@ class TimedeltaIndex(Index):
                   dtype='timedelta64[s]', name='delta-index')
     """
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __init__(
         self,
         data=None,
@@ -2500,7 +2500,7 @@ def __getitem__(self, index):
         return value
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def days(self):
         """
         Number of days for each element.
@@ -2509,7 +2509,7 @@ def days(self):
         return Index(self._values.days, name=self.name, dtype="int64")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def seconds(self):
         """
         Number of seconds (>= 0 and less than 1 day) for each element.
@@ -2517,7 +2517,7 @@ def seconds(self):
         return Index(self._values.seconds, name=self.name, dtype="int32")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def microseconds(self):
         """
         Number of microseconds (>= 0 and less than 1 second) for each element.
@@ -2525,7 +2525,7 @@ def microseconds(self):
         return Index(self._values.microseconds, name=self.name, dtype="int32")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def nanoseconds(self):
         """
         Number of nanoseconds (>= 0 and less than 1 microsecond) for each
@@ -2534,7 +2534,7 @@ def nanoseconds(self):
         return Index(self._values.nanoseconds, name=self.name, dtype="int32")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def components(self):
         """
         Return a dataframe of the components (days, hours, minutes,
@@ -2612,7 +2612,7 @@ class CategoricalIndex(Index):
     CategoricalIndex([1, 2, 3, <NA>], categories=[1, 2, 3], ordered=False, dtype='category', name='a')
     """  # noqa: E501
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __init__(
         self,
         data=None,
@@ -2667,7 +2667,7 @@ def __init__(
         super().__init__(data, name=name)
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def codes(self):
         """
         The category codes of this categorical.
@@ -2675,7 +2675,7 @@ def codes(self):
         return Index(self._values.codes)
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def categories(self):
         """
         The categories of this categorical.
@@ -2689,7 +2689,7 @@ def _is_categorical(self):
         return True
 
 
-@_cudf_nvtx_annotate
+@_performance_tracking
 def interval_range(
     start=None,
     end=None,
@@ -2841,7 +2841,7 @@ class IntervalIndex(Index):
     IntervalIndex
     """
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __init__(
         self,
         data,
@@ -2900,7 +2900,7 @@ def closed(self):
         return self.dtype.closed
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def from_breaks(
         cls,
         breaks,
@@ -2975,7 +2975,7 @@ def _clean_nulls_from_index(self):
         return self
 
 
-@_cudf_nvtx_annotate
+@_performance_tracking
 def as_index(
     arbitrary, nan_as_null=no_default, copy=False, name=no_default, dtype=None
 ) -> BaseIndex:
@@ -3090,7 +3090,7 @@ def _getdefault_name(values, name):
     return name
 
 
-@_cudf_nvtx_annotate
+@_performance_tracking
 def _concat_range_index(indexes: list[RangeIndex]) -> BaseIndex:
     """
     An internal Utility function to concat RangeIndex objects.
@@ -3131,7 +3131,7 @@ def _concat_range_index(indexes: list[RangeIndex]) -> BaseIndex:
     return RangeIndex(start, stop, step)
 
 
-@_cudf_nvtx_annotate
+@_performance_tracking
 def _extended_gcd(a: int, b: int) -> tuple[int, int, int]:
     """
     Extended Euclidean algorithms to solve Bezout's identity:
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 280a6e92eab..72bd3c45fa6 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -56,7 +56,7 @@
 from cudf.utils import docutils, ioutils
 from cudf.utils._numba import _CUDFNumbaConfig
 from cudf.utils.docutils import copy_docstring
-from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate
+from cudf.utils.performance_tracking import _performance_tracking
 from cudf.utils.utils import _warn_no_dask_cudf
 
 if TYPE_CHECKING:
@@ -301,13 +301,13 @@ def _from_data(
         out._index = RangeIndex(out._data.nrows) if index is None else index
         return out
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _from_data_like_self(self, data: MutableMapping):
         out = super()._from_data_like_self(data)
         out.index = self.index
         return out
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _from_columns_like_self(
         self,
         columns: list[ColumnBase],
@@ -363,7 +363,7 @@ def _mimic_inplace(
             self._index = result.index
         return super()._mimic_inplace(result, inplace)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _scan(self, op, axis=None, skipna=True):
         """
         Return {op_name} of the {cls}.
@@ -439,7 +439,7 @@ def _check_data_index_length_match(self) -> None:
             )
 
     @property
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def empty(self):
         """
         Indicator whether DataFrame or Series is empty.
@@ -501,7 +501,7 @@ def empty(self):
         """
         return self.size == 0
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @ioutils.doc_to_json()
     def to_json(self, path_or_buf=None, *args, **kwargs):
         """{docstring}"""
@@ -510,14 +510,14 @@ def to_json(self, path_or_buf=None, *args, **kwargs):
             self, path_or_buf=path_or_buf, *args, **kwargs
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @ioutils.doc_to_hdf()
     def to_hdf(self, path_or_buf, key, *args, **kwargs):
         """{docstring}"""
 
         cudf.io.hdf.to_hdf(path_or_buf, key, self, *args, **kwargs)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def to_string(self):
         r"""
         Convert to string
@@ -606,7 +606,7 @@ def copy(self, deep: bool = True) -> Self:
             self.index.copy(deep=False),
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def equals(self, other) -> bool:  # noqa: D102
         return super().equals(other) and self.index.equals(other.index)
 
@@ -632,7 +632,7 @@ def index(self, value):
 
         self._index = value
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def replace(
         self,
         to_replace=None,
@@ -900,7 +900,7 @@ def replace(
 
         return self._mimic_inplace(result, inplace=inplace)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def clip(self, lower=None, upper=None, inplace=False, axis=1):
         """
         Trim values at input threshold(s).
@@ -1026,7 +1026,7 @@ def clip(self, lower=None, upper=None, inplace=False, axis=1):
         )
         return self._mimic_inplace(output, inplace=inplace)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def abs(self):
         """
         Return a Series/DataFrame with absolute numeric value of each element.
@@ -1052,7 +1052,7 @@ def abs(self):
         """
         return self._unaryop("abs")
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def dot(self, other, reflect=False):
         """
         Get dot product of frame and other, (binary operator `dot`).
@@ -1159,15 +1159,15 @@ def dot(self, other, reflect=False):
             )
         return result.item()
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __matmul__(self, other):
         return self.dot(other)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __rmatmul__(self, other):
         return self.dot(other, reflect=True)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def head(self, n=5):
         """
         Return the first `n` rows.
@@ -1246,7 +1246,7 @@ def head(self, n=5):
         """
         return self.iloc[:n]
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def tail(self, n=5):
         """
         Returns the last n rows as a new DataFrame or Series
@@ -1277,7 +1277,7 @@ def tail(self, n=5):
 
         return self.iloc[-n:]
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def pipe(self, func, *args, **kwargs):
         """
         Apply ``func(self, *args, **kwargs)``.
@@ -1324,7 +1324,7 @@ def pipe(self, func, *args, **kwargs):
         """
         return cudf.core.common.pipe(self, func, *args, **kwargs)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def sum(
         self,
         axis=no_default,
@@ -1385,7 +1385,7 @@ def sum(
             **kwargs,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def product(
         self,
         axis=no_default,
@@ -1452,7 +1452,7 @@ def product(
     # Alias for pandas compatibility.
     prod = product
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def mean(self, axis=0, skipna=True, numeric_only=False, **kwargs):
         """
         Return the mean of the values for the requested axis.
@@ -1541,7 +1541,7 @@ def median(
             **kwargs,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def std(
         self,
         axis=no_default,
@@ -1600,7 +1600,7 @@ def std(
             **kwargs,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def var(
         self,
         axis=no_default,
@@ -1658,7 +1658,7 @@ def var(
             **kwargs,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def kurtosis(self, axis=0, skipna=True, numeric_only=False, **kwargs):
         """
         Return Fisher's unbiased kurtosis of a sample.
@@ -1718,7 +1718,7 @@ def kurtosis(self, axis=0, skipna=True, numeric_only=False, **kwargs):
     # Alias for kurtosis.
     kurt = kurtosis
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def skew(self, axis=0, skipna=True, numeric_only=False, **kwargs):
         """
         Return unbiased Fisher-Pearson skew of a sample.
@@ -1777,7 +1777,7 @@ def skew(self, axis=0, skipna=True, numeric_only=False, **kwargs):
             **kwargs,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def mask(self, cond, other=None, inplace: bool = False) -> Self | None:
         """
         Replace values where the condition is True.
@@ -1839,7 +1839,7 @@ def mask(self, cond, other=None, inplace: bool = False) -> Self | None:
 
         return self.where(cond=~cond, other=other, inplace=inplace)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @copy_docstring(Rolling)
     def rolling(
         self, window, min_periods=None, center=False, axis=0, win_type=None
@@ -1879,7 +1879,7 @@ def ewm(
             times=times,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def nans_to_nulls(self):
         """
         Convert nans (if any) to nulls
@@ -1935,7 +1935,7 @@ def nans_to_nulls(self):
             self._data._from_columns_like_self(result)
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def interpolate(
         self,
         method="linear",
@@ -2034,7 +2034,7 @@ def interpolate(
             )
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def shift(self, periods=1, freq=None, axis=0, fill_value=None):
         """Shift values by `periods` positions."""
         axis = self._get_axis_from_axis_arg(axis)
@@ -2050,7 +2050,7 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None):
             self._data._from_columns_like_self(data_columns)
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def truncate(self, before=None, after=None, axis=0, copy=True):
         """
         Truncate a Series or DataFrame before and after some index value.
@@ -2398,7 +2398,7 @@ def iloc(self):
         return self._iloc_indexer_type(self)
 
     @property  # type:ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def axes(self):
         """
         Return a list representing the axes of the Series.
@@ -2530,7 +2530,7 @@ def squeeze(self, axis: Literal["index", "columns", 0, 1, None] = None):
         )
         return self.iloc[indexer]
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def scale(self):
         """
         Scale values to [0, 1] in float64
@@ -2565,7 +2565,7 @@ def scale(self):
         scaled.index = self.index.copy(deep=False)
         return scaled
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def sort_index(
         self,
         axis=0,
@@ -3070,7 +3070,7 @@ def drop_duplicates(
             self.index.names if not ignore_index else None,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def duplicated(self, subset=None, keep="first"):
         """
         Return boolean Series denoting duplicate rows.
@@ -3180,7 +3180,7 @@ def duplicated(self, subset=None, keep="first"):
         )
         return cudf.Series(result, index=self.index)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _empty_like(self, keep_index=True) -> Self:
         result = self._from_columns_like_self(
             libcudf.copying.columns_empty_like(
@@ -3217,7 +3217,7 @@ def _split(self, splits, keep_index=True):
             for i in range(len(splits) + 1)
         ]
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def bfill(self, value=None, axis=None, inplace=None, limit=None):
         """
         Synonym for :meth:`Series.fillna` with ``method='bfill'``.
@@ -3236,7 +3236,7 @@ def bfill(self, value=None, axis=None, inplace=None, limit=None):
                 limit=limit,
             )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def backfill(self, value=None, axis=None, inplace=None, limit=None):
         """
         Synonym for :meth:`Series.fillna` with ``method='bfill'``.
@@ -3256,7 +3256,7 @@ def backfill(self, value=None, axis=None, inplace=None, limit=None):
         )
         return self.bfill(value=value, axis=axis, inplace=inplace, limit=limit)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def ffill(self, value=None, axis=None, inplace=None, limit=None):
         """
         Synonym for :meth:`Series.fillna` with ``method='ffill'``.
@@ -3275,7 +3275,7 @@ def ffill(self, value=None, axis=None, inplace=None, limit=None):
                 limit=limit,
             )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def pad(self, value=None, axis=None, inplace=None, limit=None):
         """
         Synonym for :meth:`Series.fillna` with ``method='ffill'``.
@@ -3415,7 +3415,7 @@ def add_suffix(self, suffix):
         raise NotImplementedError
 
     @acquire_spill_lock()
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _apply(self, func, kernel_getter, *args, **kwargs):
         """Apply `func` across the rows of the frame."""
         if kwargs:
@@ -3626,7 +3626,7 @@ def _align_to_index(
         out.index.names = self.index.names
         return out
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _reindex(
         self,
         column_names,
@@ -4154,7 +4154,7 @@ def dropna(
 
         return self._mimic_inplace(result, inplace=inplace)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _drop_na_columns(self, how="any", subset=None, thresh=None):
         """
         Drop columns containing nulls
@@ -4471,7 +4471,7 @@ def last(self, offset):
             slice_func=lambda i: self.iloc[i:],
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def sample(
         self,
         n=None,
@@ -4751,7 +4751,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
 
         return NotImplemented
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def repeat(self, repeats, axis=None):
         """Repeats elements consecutively.
 
@@ -4949,7 +4949,7 @@ def astype(
                 raise e
             return self
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def drop(
         self,
         labels=None,
@@ -5161,7 +5161,7 @@ def drop(
         if not inplace:
             return out
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _explode(self, explode_column: Any, ignore_index: bool):
         # Helper function for `explode` in `Series` and `Dataframe`, explodes a
         # specified nested column. Other columns' corresponding rows are
@@ -5200,7 +5200,7 @@ def _explode(self, explode_column: Any, ignore_index: bool):
             self.index.names if not ignore_index else None,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def tile(self, count):
         """Repeats the rows `count` times to form a new Frame.
 
@@ -5233,7 +5233,7 @@ def tile(self, count):
             index_names=self._index_names,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def groupby(
         self,
         by=None,
@@ -5283,7 +5283,7 @@ def groupby(
             )
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @docutils.doc_apply(
         doc_binop_template.format(
             operation="Addition",
@@ -5324,7 +5324,7 @@ def add(self, other, axis, level=None, fill_value=None):  # noqa: D102
 
         return self._binaryop(other, "__add__", fill_value)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @docutils.doc_apply(
         doc_binop_template.format(
             operation="Addition",
@@ -5365,7 +5365,7 @@ def radd(self, other, axis, level=None, fill_value=None):  # noqa: D102
 
         return self._binaryop(other, "__radd__", fill_value)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @docutils.doc_apply(
         doc_binop_template.format(
             operation="Subtraction",
@@ -5408,7 +5408,7 @@ def subtract(self, other, axis, level=None, fill_value=None):  # noqa: D102
 
     sub = subtract
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @docutils.doc_apply(
         doc_binop_template.format(
             operation="Subtraction",
@@ -5449,7 +5449,7 @@ def rsub(self, other, axis, level=None, fill_value=None):  # noqa: D102
 
         return self._binaryop(other, "__rsub__", fill_value)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @docutils.doc_apply(
         doc_binop_template.format(
             operation="Multiplication",
@@ -5492,7 +5492,7 @@ def multiply(self, other, axis, level=None, fill_value=None):  # noqa: D102
 
     mul = multiply
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @docutils.doc_apply(
         doc_binop_template.format(
             operation="Multiplication",
@@ -5533,7 +5533,7 @@ def rmul(self, other, axis, level=None, fill_value=None):  # noqa: D102
 
         return self._binaryop(other, "__rmul__", fill_value)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @docutils.doc_apply(
         doc_binop_template.format(
             operation="Modulo",
@@ -5574,7 +5574,7 @@ def mod(self, other, axis, level=None, fill_value=None):  # noqa: D102
 
         return self._binaryop(other, "__mod__", fill_value)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @docutils.doc_apply(
         doc_binop_template.format(
             operation="Modulo",
@@ -5615,7 +5615,7 @@ def rmod(self, other, axis, level=None, fill_value=None):  # noqa: D102
 
         return self._binaryop(other, "__rmod__", fill_value)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @docutils.doc_apply(
         doc_binop_template.format(
             operation="Exponential",
@@ -5656,7 +5656,7 @@ def pow(self, other, axis, level=None, fill_value=None):  # noqa: D102
 
         return self._binaryop(other, "__pow__", fill_value)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @docutils.doc_apply(
         doc_binop_template.format(
             operation="Exponential",
@@ -5697,7 +5697,7 @@ def rpow(self, other, axis, level=None, fill_value=None):  # noqa: D102
 
         return self._binaryop(other, "__rpow__", fill_value)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @docutils.doc_apply(
         doc_binop_template.format(
             operation="Integer division",
@@ -5738,7 +5738,7 @@ def floordiv(self, other, axis, level=None, fill_value=None):  # noqa: D102
 
         return self._binaryop(other, "__floordiv__", fill_value)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @docutils.doc_apply(
         doc_binop_template.format(
             operation="Integer division",
@@ -5779,7 +5779,7 @@ def rfloordiv(self, other, axis, level=None, fill_value=None):  # noqa: D102
 
         return self._binaryop(other, "__rfloordiv__", fill_value)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @docutils.doc_apply(
         doc_binop_template.format(
             operation="Floating division",
@@ -5824,7 +5824,7 @@ def truediv(self, other, axis, level=None, fill_value=None):  # noqa: D102
     div = truediv
     divide = truediv
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @docutils.doc_apply(
         doc_binop_template.format(
             operation="Floating division",
@@ -5868,7 +5868,7 @@ def rtruediv(self, other, axis, level=None, fill_value=None):  # noqa: D102
     # Alias for rtruediv
     rdiv = rtruediv
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @docutils.doc_apply(
         doc_binop_template.format(
             operation="Equal to",
@@ -5908,7 +5908,7 @@ def eq(self, other, axis="columns", level=None, fill_value=None):  # noqa: D102
             other=other, op="__eq__", fill_value=fill_value, can_reindex=True
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @docutils.doc_apply(
         doc_binop_template.format(
             operation="Not equal to",
@@ -5948,7 +5948,7 @@ def ne(self, other, axis="columns", level=None, fill_value=None):  # noqa: D102
             other=other, op="__ne__", fill_value=fill_value, can_reindex=True
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @docutils.doc_apply(
         doc_binop_template.format(
             operation="Less than",
@@ -5988,7 +5988,7 @@ def lt(self, other, axis="columns", level=None, fill_value=None):  # noqa: D102
             other=other, op="__lt__", fill_value=fill_value, can_reindex=True
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @docutils.doc_apply(
         doc_binop_template.format(
             operation="Less than or equal to",
@@ -6028,7 +6028,7 @@ def le(self, other, axis="columns", level=None, fill_value=None):  # noqa: D102
             other=other, op="__le__", fill_value=fill_value, can_reindex=True
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @docutils.doc_apply(
         doc_binop_template.format(
             operation="Greater than",
@@ -6068,7 +6068,7 @@ def gt(self, other, axis="columns", level=None, fill_value=None):  # noqa: D102
             other=other, op="__gt__", fill_value=fill_value, can_reindex=True
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @docutils.doc_apply(
         doc_binop_template.format(
             operation="Greater than or equal to",
@@ -6123,7 +6123,7 @@ def _preprocess_subset(self, subset):
             raise KeyError(f"columns {diff} do not exist")
         return subset
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def rank(
         self,
         axis=0,
@@ -6291,7 +6291,7 @@ def _check_duplicate_level_names(specified, level_names):
         )
 
 
-@_cudf_nvtx_annotate
+@_performance_tracking
 def _get_replacement_values_for_columns(
     to_replace: Any, value: Any, columns_dtype_map: dict[Any, Any]
 ) -> tuple[dict[Any, bool], dict[Any, Any], dict[Any, Any]]:
@@ -6458,7 +6458,7 @@ def _is_series(obj):
     return isinstance(obj, Frame) and obj.ndim == 1 and obj.index is not None
 
 
-@_cudf_nvtx_annotate
+@_performance_tracking
 def _drop_rows_by_labels(
     obj: DataFrameOrSeries,
     labels: ColumnLike | abc.Iterable | str,
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index 547c14cdc99..7657fa9e234 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -32,7 +32,7 @@
 )
 from cudf.core.join._join_helpers import _match_join_keys
 from cudf.utils.dtypes import is_column_like
-from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate
+from cudf.utils.performance_tracking import _performance_tracking
 from cudf.utils.utils import NotIterable, _external_only_api, _is_same_name
 
 if TYPE_CHECKING:
@@ -126,7 +126,7 @@ class MultiIndex(Frame, BaseIndex, NotIterable):
                )
     """
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __init__(
         self,
         levels=None,
@@ -211,12 +211,12 @@ def __init__(
         self.names = names
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def names(self):
         return self._names
 
     @names.setter  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def names(self, value):
         if value is None:
             value = [None] * self.nlevels
@@ -242,13 +242,13 @@ def names(self, value):
             )
         self._names = pd.core.indexes.frozen.FrozenList(value)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def to_series(self, index=None, name=None):
         raise NotImplementedError(
             "MultiIndex.to_series isn't implemented yet."
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def astype(self, dtype, copy: bool = True):
         if not is_object_dtype(dtype):
             raise TypeError(
@@ -257,7 +257,7 @@ def astype(self, dtype, copy: bool = True):
             )
         return self
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def rename(self, names, inplace=False):
         """
         Alter MultiIndex level names
@@ -304,7 +304,7 @@ def rename(self, names, inplace=False):
         """
         return self.set_names(names, level=None, inplace=inplace)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def set_names(self, names, level=None, inplace=False):
         names_is_list_like = is_list_like(names)
         level_is_list_like = is_list_like(level)
@@ -342,7 +342,7 @@ def set_names(self, names, level=None, inplace=False):
         return self._set_names(names=names, inplace=inplace)
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _from_data(
         cls,
         data: MutableMapping,
@@ -354,16 +354,16 @@ def _from_data(
         return obj
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def name(self):
         return self._name
 
     @name.setter  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def name(self, value):
         self._name = value
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def copy(
         self,
         names=None,
@@ -432,7 +432,7 @@ def copy(
 
         return mi
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __repr__(self):
         max_seq_items = pd.get_option("display.max_seq_items") or len(self)
 
@@ -484,7 +484,7 @@ def _codes_frame(self):
 
     @property  # type: ignore
     @_external_only_api("Use ._codes_frame instead")
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def codes(self):
         """
         Returns the codes of the underlying MultiIndex.
@@ -510,13 +510,13 @@ def get_slice_bound(self, label, side, kind=None):
         raise NotImplementedError()
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def nlevels(self):
         """Integer number of levels in this MultiIndex."""
         return self._num_columns
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def levels(self):
         """
         Returns list of levels in the MultiIndex
@@ -548,12 +548,12 @@ def levels(self):
         return self._levels
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def ndim(self) -> int:
         """Dimension of the data. For MultiIndex ndim is always 2."""
         return 2
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _get_level_label(self, level):
         """Get name of the level.
 
@@ -570,7 +570,7 @@ def _get_level_label(self, level):
         else:
             return self._data.names[level]
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def isin(self, values, level=None):
         """Return a boolean array where the index values are in values.
 
@@ -669,7 +669,7 @@ def where(self, cond, other=None, inplace=False):
             ".where is not supported for MultiIndex operations"
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _compute_levels_and_codes(self):
         levels = []
 
@@ -683,7 +683,7 @@ def _compute_levels_and_codes(self):
         self._levels = levels
         self._codes = cudf.DataFrame._from_data(codes)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _compute_validity_mask(self, index, row_tuple, max_length):
         """Computes the valid set of indices of values in the lookup"""
         lookup = cudf.DataFrame()
@@ -731,7 +731,7 @@ def _compute_validity_mask(self, index, row_tuple, max_length):
                     raise KeyError(row)
         return result
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _get_valid_indices_by_tuple(self, index, row_tuple, max_length):
         # Instructions for Slicing
         # if tuple, get first and last elements of tuple
@@ -761,7 +761,7 @@ def _get_valid_indices_by_tuple(self, index, row_tuple, max_length):
             return row_tuple
         return self._compute_validity_mask(index, row_tuple, max_length)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _index_and_downcast(self, result, index, index_key):
         if isinstance(index_key, (numbers.Number, slice)):
             index_key = [index_key]
@@ -829,7 +829,7 @@ def _index_and_downcast(self, result, index, index_key):
             result.index = index
         return result
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _get_row_major(
         self,
         df: DataFrameOrSeries,
@@ -856,7 +856,7 @@ def _get_row_major(
         final = self._index_and_downcast(result, result.index, row_tuple)
         return final
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _validate_indexer(
         self,
         indexer: numbers.Number
@@ -884,7 +884,7 @@ def _validate_indexer(
             for i in indexer:
                 self._validate_indexer(i)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __eq__(self, other):
         if isinstance(other, MultiIndex):
             return np.array(
@@ -898,12 +898,12 @@ def __eq__(self, other):
         return NotImplemented
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def size(self):
         # The size of a MultiIndex is only dependent on the number of rows.
         return self._num_rows
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def take(self, indices):
         if isinstance(indices, cudf.Series) and indices.has_nulls:
             raise ValueError("Column must have no nulls.")
@@ -911,7 +911,7 @@ def take(self, indices):
         obj.names = self.names
         return obj
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def serialize(self):
         header, frames = super().serialize()
         # Overwrite the names in _data with the true names.
@@ -919,7 +919,7 @@ def serialize(self):
         return header, frames
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def deserialize(cls, header, frames):
         # Spoof the column names to construct the frame, then set manually.
         column_names = pickle.loads(header["column_names"])
@@ -927,7 +927,7 @@ def deserialize(cls, header, frames):
         obj = super().deserialize(header, frames)
         return obj._set_names(column_names)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __getitem__(self, index):
         flatten = isinstance(index, int)
 
@@ -954,7 +954,7 @@ def __getitem__(self, index):
             result._levels = self._levels
         return result
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def to_frame(self, index=True, name=no_default, allow_duplicates=False):
         """
         Create a DataFrame with the levels of the MultiIndex as columns.
@@ -1031,7 +1031,7 @@ def to_frame(self, index=True, name=no_default, allow_duplicates=False):
             data=ca, index=self if index else None
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def get_level_values(self, level):
         """
         Return the values at the requested level
@@ -1087,7 +1087,7 @@ def _is_interval(self):
         return False
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _concat(cls, objs):
         source_data = [o.to_frame(index=False) for o in objs]
 
@@ -1107,7 +1107,7 @@ def _concat(cls, objs):
         return cudf.MultiIndex.from_frame(source_data, names=names)
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def from_tuples(cls, tuples, names=None):
         """
         Convert list of tuples to MultiIndex.
@@ -1145,12 +1145,12 @@ def from_tuples(cls, tuples, names=None):
         pdi = pd.MultiIndex.from_tuples(tuples, names=names)
         return cls.from_pandas(pdi)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def to_numpy(self):
         return self.values_host
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def values_host(self):
         """
         Return a numpy representation of the MultiIndex.
@@ -1178,7 +1178,7 @@ def values_host(self):
         return self.to_pandas().values
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def values(self):
         """
         Return a CuPy representation of the MultiIndex.
@@ -1214,7 +1214,7 @@ def values(self):
         return self.to_frame(index=False).values
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def from_frame(cls, df: pd.DataFrame | cudf.DataFrame, names=None):
         """
         Make a MultiIndex from a DataFrame.
@@ -1289,7 +1289,7 @@ def from_frame(cls, df: pd.DataFrame | cudf.DataFrame, names=None):
         return obj
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def from_product(cls, arrays, names=None):
         """
         Make a MultiIndex from the cartesian product of multiple iterables.
@@ -1331,7 +1331,7 @@ def from_product(cls, arrays, names=None):
         return cls.from_pandas(pdi)
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def from_arrays(
         cls,
         arrays,
@@ -1390,7 +1390,7 @@ def from_arrays(
             codes=codes, levels=levels, sortorder=sortorder, names=names
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _poplevels(self, level):
         """
         Remove and return the specified levels from self.
@@ -1441,7 +1441,7 @@ def _poplevels(self, level):
 
         return popped
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def swaplevel(self, i=-2, j=-1):
         """
         Swap level i with level j.
@@ -1492,7 +1492,7 @@ def swaplevel(self, i=-2, j=-1):
             midx = midx.set_names(self.names)
         return midx
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def droplevel(self, level=-1):
         """
         Removes the specified levels from the MultiIndex.
@@ -1555,7 +1555,7 @@ def droplevel(self, level=-1):
         else:
             return mi
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def to_pandas(
         self, *, nullable: bool = False, arrow_type: bool = False
     ) -> pd.MultiIndex:
@@ -1572,7 +1572,7 @@ def to_pandas(
         )
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def from_pandas(cls, multiindex: pd.MultiIndex, nan_as_null=no_default):
         """
         Convert from a Pandas MultiIndex
@@ -1607,7 +1607,7 @@ def from_pandas(cls, multiindex: pd.MultiIndex, nan_as_null=no_default):
         )
 
     @cached_property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def is_unique(self):
         return len(self) == len(self.unique())
 
@@ -1615,7 +1615,7 @@ def is_unique(self):
     def dtype(self):
         return np.dtype("O")
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _is_sorted(self, ascending=None, null_position=None) -> bool:
         """
         Returns a boolean indicating whether the data of the MultiIndex are sorted
@@ -1661,7 +1661,7 @@ def _is_sorted(self, ascending=None, null_position=None) -> bool:
         )
 
     @cached_property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def is_monotonic_increasing(self) -> bool:
         """
         Return if the index is monotonic increasing
@@ -1670,7 +1670,7 @@ def is_monotonic_increasing(self) -> bool:
         return self._is_sorted(ascending=None, null_position=None)
 
     @cached_property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def is_monotonic_decreasing(self) -> bool:
         """
         Return if the index is monotonic decreasing
@@ -1680,7 +1680,7 @@ def is_monotonic_decreasing(self) -> bool:
             ascending=[False] * len(self.levels), null_position=None
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def fillna(self, value):
         """
         Fill null values with the specified value.
@@ -1721,11 +1721,11 @@ def fillna(self, value):
 
         return super().fillna(value=value)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def unique(self):
         return self.drop_duplicates(keep="first")
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def nunique(self, dropna: bool = True) -> int:
         mi = self.dropna(how="all") if dropna else self
         return len(mi.unique())
@@ -1740,7 +1740,7 @@ def _clean_nulls_from_index(self):
             index_df._clean_nulls_from_dataframe(index_df), names=self.names
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def memory_usage(self, deep=False):
         usage = sum(col.memory_usage for col in self._data.columns)
         if self.levels:
@@ -1751,13 +1751,13 @@ def memory_usage(self, deep=False):
                 usage += col.memory_usage
         return usage
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def difference(self, other, sort=None):
         if hasattr(other, "to_pandas"):
             other = other.to_pandas()
         return cudf.from_pandas(self.to_pandas().difference(other, sort))
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def append(self, other):
         """
         Append a collection of MultiIndex objects together
@@ -1820,7 +1820,7 @@ def append(self, other):
 
         return MultiIndex._concat(to_concat)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __array_function__(self, func, types, args, kwargs):
         cudf_df_module = MultiIndex
 
@@ -1867,7 +1867,7 @@ def _level_index_from_level(self, level):
                 ) from None
             return level
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def get_indexer(self, target, method=None, limit=None, tolerance=None):
         if tolerance is not None:
             raise NotImplementedError(
@@ -1926,7 +1926,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
 
         return _return_get_indexer_result(result_series.to_cupy())
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def get_loc(self, key):
         is_sorted = (
             self.is_monotonic_increasing or self.is_monotonic_decreasing
@@ -2000,7 +2000,7 @@ def _maybe_match_names(self, other):
             for self_name, other_name in zip(self.names, other.names)
         ]
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def union(self, other, sort=None):
         if not isinstance(other, MultiIndex):
             msg = "other must be a MultiIndex or a list of tuples"
@@ -2024,7 +2024,7 @@ def union(self, other, sort=None):
 
         return self._union(other, sort=sort)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _union(self, other, sort=None):
         # TODO: When to_frame is refactored to return a
         # deep copy in future, we should push most of the common
@@ -2050,7 +2050,7 @@ def _union(self, other, sort=None):
             return midx.sort_values()
         return midx
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _intersection(self, other, sort=None):
         if self.names != other.names:
             deep = True
@@ -2073,14 +2073,14 @@ def _intersection(self, other, sort=None):
             return midx.sort_values()
         return midx
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _copy_type_metadata(self: Self, other: Self) -> Self:
         res = super()._copy_type_metadata(other)
         if isinstance(other, MultiIndex):
             res._names = other._names
         return res
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _split_columns_by_levels(
         self, levels: tuple, *, in_levels: bool
     ) -> Generator[tuple[Any, column.ColumnBase], None, None]:
@@ -2099,7 +2099,7 @@ def _split_columns_by_levels(
             elif not in_levels and i not in level_indices:
                 yield name, col
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _new_index_for_reset_index(
         self, levels: tuple | None, name
     ) -> None | BaseIndex:
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index ea25d482578..9acf5294b72 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -68,7 +68,7 @@
     is_mixed_with_object_dtype,
     to_cudf_compatible_scalar,
 )
-from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate
+from cudf.utils.performance_tracking import _performance_tracking
 
 if TYPE_CHECKING:
     from cudf._typing import (
@@ -179,7 +179,7 @@ class _SeriesIlocIndexer(_FrameIndexer):
 
     _frame: cudf.Series
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __getitem__(self, arg):
         indexing_spec = indexing_utils.parse_row_iloc_indexer(
             indexing_utils.destructure_series_iloc_indexer(arg, self._frame),
@@ -187,7 +187,7 @@ def __getitem__(self, arg):
         )
         return self._frame._getitem_preprocessed(indexing_spec)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __setitem__(self, key, value):
         if isinstance(key, tuple):
             key = list(key)
@@ -274,7 +274,7 @@ class _SeriesLocIndexer(_FrameIndexer):
     Label-based selection
     """
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __getitem__(self, arg: Any) -> ScalarLike | DataFrameOrSeries:
         if isinstance(arg, pd.MultiIndex):
             arg = cudf.from_pandas(arg)
@@ -301,7 +301,7 @@ def __getitem__(self, arg: Any) -> ScalarLike | DataFrameOrSeries:
 
         return self._frame.iloc[arg]
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __setitem__(self, key, value):
         try:
             key = self._loc_to_iloc(key)
@@ -476,7 +476,7 @@ def _constructor_expanddim(self):
         return cudf.DataFrame
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def from_categorical(cls, categorical, codes=None):
         """Creates from a pandas.Categorical
 
@@ -517,7 +517,7 @@ def from_categorical(cls, categorical, codes=None):
         return Series(data=col)
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def from_masked_array(cls, data, mask, null_count=None):
         """Create a Series with null-mask.
         This is equivalent to:
@@ -566,7 +566,7 @@ def from_masked_array(cls, data, mask, null_count=None):
         col = as_column(data).set_mask(mask)
         return cls(data=col)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __init__(
         self,
         data=None,
@@ -663,7 +663,7 @@ def __init__(
         self._check_data_index_length_match()
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _from_data(
         cls,
         data: MutableMapping,
@@ -675,18 +675,18 @@ def _from_data(
             out.name = name
         return out
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _from_data_like_self(self, data: MutableMapping):
         out = super()._from_data_like_self(data)
         out.name = self.name
         return out
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __contains__(self, item):
         return item in self.index
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def from_pandas(cls, s: pd.Series, nan_as_null=no_default):
         """
         Convert from a Pandas Series.
@@ -735,7 +735,7 @@ def from_pandas(cls, s: pd.Series, nan_as_null=no_default):
         return result
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def is_unique(self):
         """Return boolean if values in the object are unique.
 
@@ -746,7 +746,7 @@ def is_unique(self):
         return self._column.is_unique
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def dt(self):
         """
         Accessor object for datetime-like properties of the Series values.
@@ -788,7 +788,7 @@ def dt(self):
             )
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def hasnans(self):
         """
         Return True if there are any NaNs or nulls.
@@ -829,7 +829,7 @@ def hasnans(self):
         """
         return self._column.has_nulls(include_nan=True)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def serialize(self):
         header, frames = super().serialize()
 
@@ -842,7 +842,7 @@ def serialize(self):
         return header, frames
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def deserialize(cls, header, frames):
         index_nframes = header["index_frame_count"]
         obj = super().deserialize(
@@ -855,7 +855,7 @@ def deserialize(cls, header, frames):
 
         return obj
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def drop(
         self,
         labels=None,
@@ -884,7 +884,7 @@ def tolist(self):  # noqa: D102
 
     to_list = tolist
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def to_dict(self, into: type[dict] = dict) -> dict:
         """
         Convert Series to {label -> value} dict or dict-like object.
@@ -923,7 +923,7 @@ def to_dict(self, into: type[dict] = dict) -> dict:
         """
         return self.to_pandas().to_dict(into=into)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def reindex(self, *args, **kwargs):
         """
         Conform Series to new index.
@@ -996,7 +996,7 @@ def reindex(self, *args, **kwargs):
         series.name = self.name
         return series
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @docutils.doc_apply(
         doc_reset_index_template.format(
             klass="Series",
@@ -1081,7 +1081,7 @@ def reset_index(
             inplace=inplace,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def to_frame(self, name=None):
         """Convert Series into a DataFrame
 
@@ -1124,13 +1124,13 @@ def to_frame(self, name=None):
 
         return cudf.DataFrame({col: self._column}, index=self.index)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def memory_usage(self, index=True, deep=False):
         return self._column.memory_usage + (
             self.index.memory_usage() if index else 0
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __array_function__(self, func, types, args, kwargs):
         if "out" in kwargs or not all(issubclass(t, Series) for t in types):
             return NotImplemented
@@ -1191,7 +1191,7 @@ def __array_function__(self, func, types, args, kwargs):
 
         return NotImplemented
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def map(self, arg, na_action=None) -> "Series":
         """
         Map values of Series according to input correspondence.
@@ -1333,7 +1333,7 @@ def _getitem_preprocessed(
             return self._empty_like(keep_index=True)
         assert_never(spec)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __getitem__(self, arg):
         if isinstance(arg, slice):
             return self.iloc[arg]
@@ -1344,7 +1344,7 @@ def __getitem__(self, arg):
 
     items = SingleColumnFrame.__iter__
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __setitem__(self, key, value):
         if isinstance(key, slice):
             self.iloc[key] = value
@@ -1495,36 +1495,36 @@ def _make_operands_and_index_for_binop(
 
     @copy_docstring(CategoricalAccessor)  # type: ignore
     @property
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def cat(self):
         return CategoricalAccessor(parent=self)
 
     @copy_docstring(StringMethods)  # type: ignore
     @property
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def str(self):
         return StringMethods(parent=self)
 
     @copy_docstring(ListMethods)  # type: ignore
     @property
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def list(self):
         return ListMethods(parent=self)
 
     @copy_docstring(StructMethods)  # type: ignore
     @property
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def struct(self):
         return StructMethods(parent=self)
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def dtype(self):
         """The dtype of the Series."""
         return self._column.dtype
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _concat(cls, objs, axis=0, index=True):
         # Concatenate index if not provided
         if index is True:
@@ -1590,25 +1590,25 @@ def _concat(cls, objs, axis=0, index=True):
         return cls(data=col, index=index, name=name)
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def valid_count(self):
         """Number of non-null values"""
         return len(self) - self._column.null_count
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def null_count(self):
         """Number of null values"""
         return self._column.null_count
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def nullable(self):
         """A boolean indicating whether a null-mask is needed"""
         return self._column.nullable
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def has_nulls(self):
         """
         Indicator whether Series contains null values.
@@ -1637,7 +1637,7 @@ def has_nulls(self):
         """
         return self._column.has_nulls()
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def dropna(self, axis=0, inplace=False, how=None):
         """
         Return a Series with null values removed.
@@ -1717,7 +1717,7 @@ def dropna(self, axis=0, inplace=False, how=None):
 
         return self._mimic_inplace(result, inplace=inplace)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def drop_duplicates(self, keep="first", inplace=False, ignore_index=False):
         """
         Return Series with duplicate values removed.
@@ -1791,7 +1791,7 @@ def drop_duplicates(self, keep="first", inplace=False, ignore_index=False):
 
         return self._mimic_inplace(result, inplace=inplace)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def fillna(
         self, value=None, method=None, axis=None, inplace=False, limit=None
     ):
@@ -1896,7 +1896,7 @@ def between(self, left, right, inclusive="both") -> Series:
             )
         return self._from_data({self.name: lmask & rmask}, self.index)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def all(self, axis=0, bool_only=None, skipna=True, **kwargs):
         if bool_only not in (None, True):
             raise NotImplementedError(
@@ -1904,7 +1904,7 @@ def all(self, axis=0, bool_only=None, skipna=True, **kwargs):
             )
         return super().all(axis, skipna, **kwargs)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def any(self, axis=0, bool_only=None, skipna=True, **kwargs):
         if bool_only not in (None, True):
             raise NotImplementedError(
@@ -1912,7 +1912,7 @@ def any(self, axis=0, bool_only=None, skipna=True, **kwargs):
             )
         return super().any(axis, skipna, **kwargs)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def to_pandas(
         self,
         *,
@@ -2004,7 +2004,7 @@ def to_pandas(
         )
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def data(self):
         """The gpu buffer for the data
 
@@ -2029,12 +2029,12 @@ def data(self):
         return self._column.data
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def nullmask(self):
         """The gpu buffer for the null-mask"""
         return cudf.Series(self._column.nullmask)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def astype(
         self,
         dtype,
@@ -2051,13 +2051,13 @@ def astype(
             dtype = {self.name: dtype}
         return super().astype(dtype, copy, errors)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def sort_index(self, axis=0, *args, **kwargs):
         if axis not in (0, "index"):
             raise ValueError("Only axis=0 is valid for Series.")
         return super().sort_index(axis=axis, *args, **kwargs)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def sort_values(
         self,
         axis=0,
@@ -2112,7 +2112,7 @@ def sort_values(
             ignore_index=ignore_index,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def nlargest(self, n=5, keep="first"):
         """Returns a new Series of the *n* largest element.
 
@@ -2175,7 +2175,7 @@ def nlargest(self, n=5, keep="first"):
         """
         return self._n_largest_or_smallest(True, n, [self.name], keep)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def nsmallest(self, n=5, keep="first"):
         """
         Returns a new Series of the *n* smallest element.
@@ -2251,7 +2251,7 @@ def nsmallest(self, n=5, keep="first"):
         """
         return self._n_largest_or_smallest(False, n, [self.name], keep)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def argsort(
         self,
         axis=0,
@@ -2274,7 +2274,7 @@ def argsort(
         obj.name = self.name
         return obj
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def replace(self, to_replace=None, value=no_default, *args, **kwargs):
         if is_dict_like(to_replace) and value not in {None, no_default}:
             raise ValueError(
@@ -2284,7 +2284,7 @@ def replace(self, to_replace=None, value=no_default, *args, **kwargs):
 
         return super().replace(to_replace, value, *args, **kwargs)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def update(self, other):
         """
         Modify Series in place using values from passed Series.
@@ -2390,7 +2390,7 @@ def update(self, other):
         self.mask(mask, other, inplace=True)
 
     # UDF related
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def apply(self, func, convert_dtype=True, args=(), **kwargs):
         """
         Apply a scalar function to the values of a Series.
@@ -2535,7 +2535,7 @@ def apply(self, func, convert_dtype=True, args=(), **kwargs):
     #
     # Stats
     #
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def count(self):
         """
         Return number of non-NA/null observations in the Series
@@ -2559,7 +2559,7 @@ def count(self):
         """
         return self.valid_count
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def mode(self, dropna=True):
         """
         Return the mode(s) of the dataset.
@@ -2630,7 +2630,7 @@ def mode(self, dropna=True):
             {self.name: val_counts.index.sort_values()}, name=self.name
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def round(self, decimals=0, how="half_even"):
         if not is_integer(decimals):
             raise ValueError(
@@ -2639,7 +2639,7 @@ def round(self, decimals=0, how="half_even"):
         decimals = int(decimals)
         return super().round(decimals, how)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def cov(self, other, min_periods=None):
         """
         Compute covariance with Series, excluding missing values.
@@ -2690,7 +2690,7 @@ def cov(self, other, min_periods=None):
                 f"{other.dtype}"
             )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def transpose(self):
         """Return the transpose, which is by definition self."""
 
@@ -2698,7 +2698,7 @@ def transpose(self):
 
     T = property(transpose, doc=transpose.__doc__)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def duplicated(self, keep="first"):
         """
         Indicate duplicate Series values.
@@ -2778,7 +2778,7 @@ def duplicated(self, keep="first"):
         """
         return super().duplicated(keep=keep)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def corr(self, other, method="pearson", min_periods=None):
         """Calculates the sample correlation between two Series,
         excluding missing values.
@@ -2830,7 +2830,7 @@ def corr(self, other, method="pearson", min_periods=None):
                 f"cannot perform corr with types {self.dtype}, {other.dtype}"
             )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def autocorr(self, lag=1):
         """Compute the lag-N autocorrelation. This method computes the Pearson
         correlation between the Series and its shifted self.
@@ -2856,7 +2856,7 @@ def autocorr(self, lag=1):
         """
         return self.corr(self.shift(lag))
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def isin(self, values):
         """Check whether values are contained in Series.
 
@@ -2926,7 +2926,7 @@ def isin(self, values):
             {self.name: self._column.isin(values)}, index=self.index
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def unique(self):
         """
         Returns unique values of this Series.
@@ -2961,7 +2961,7 @@ def unique(self):
             return res.values
         return Series(res, name=self.name)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def value_counts(
         self,
         normalize=False,
@@ -3116,7 +3116,7 @@ def value_counts(
         res.name = result_name
         return res
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def quantile(
         self, q=0.5, interpolation="linear", exact=True, quant_index=True
     ):
@@ -3195,7 +3195,7 @@ def quantile(
         )
 
     @docutils.doc_describe()
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def describe(
         self,
         percentiles=None,
@@ -3240,7 +3240,7 @@ def describe(
             name=self.name,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def digitize(self, bins, right=False):
         """Return the indices of the bins to which each value belongs.
 
@@ -3276,7 +3276,7 @@ def digitize(self, bins, right=False):
             cudf.core.column.numerical.digitize(self._column, bins, right)
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def diff(self, periods=1):
         """First discrete difference of element.
 
@@ -3347,7 +3347,7 @@ def diff(self, periods=1):
 
         return self - self.shift(periods=periods)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     @docutils.doc_apply(
         groupby_doc_template.format(
             ret=textwrap.dedent(
@@ -3385,7 +3385,7 @@ def groupby(
             dropna,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def rename(self, index=None, copy=True):
         """
         Alter Series name
@@ -3431,7 +3431,7 @@ def rename(self, index=None, copy=True):
         out_data = self._data.copy(deep=copy)
         return Series._from_data(out_data, self.index, name=index)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def add_prefix(self, prefix):
         return Series._from_data(
             # TODO: Change to deep=False when copy-on-write is default
@@ -3439,7 +3439,7 @@ def add_prefix(self, prefix):
             index=prefix + self.index.astype(str),
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def add_suffix(self, suffix):
         return Series._from_data(
             # TODO: Change to deep=False when copy-on-write is default
@@ -3447,7 +3447,7 @@ def add_suffix(self, suffix):
             index=self.index.astype(str) + suffix,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def keys(self):
         """
         Return alias for index.
@@ -3491,7 +3491,7 @@ def keys(self):
         """
         return self.index
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def explode(self, ignore_index=False):
         """
         Transform each element of a list-like to a row, replicating index
@@ -3528,7 +3528,7 @@ def explode(self, ignore_index=False):
         """
         return super()._explode(self.name, ignore_index)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def pct_change(
         self, periods=1, fill_method=no_default, limit=no_default, freq=None
     ):
@@ -3602,7 +3602,7 @@ def pct_change(
         change = diff / data.shift(periods=periods, freq=freq)
         return change
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def where(self, cond, other=None, inplace=False):
         result_col = super().where(cond, other, inplace)
         return self._mimic_inplace(
@@ -3736,7 +3736,7 @@ class DatetimeProperties(BaseDatelikeProperties):
     """
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def year(self) -> Series:
         """
         The year of the datetime.
@@ -3761,7 +3761,7 @@ def year(self) -> Series:
         return self._get_dt_field("year")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def month(self) -> Series:
         """
         The month as January=1, December=12.
@@ -3786,7 +3786,7 @@ def month(self) -> Series:
         return self._get_dt_field("month")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def day(self) -> Series:
         """
         The day of the datetime.
@@ -3811,7 +3811,7 @@ def day(self) -> Series:
         return self._get_dt_field("day")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def hour(self) -> Series:
         """
         The hours of the datetime.
@@ -3836,7 +3836,7 @@ def hour(self) -> Series:
         return self._get_dt_field("hour")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def minute(self) -> Series:
         """
         The minutes of the datetime.
@@ -3861,7 +3861,7 @@ def minute(self) -> Series:
         return self._get_dt_field("minute")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def second(self) -> Series:
         """
         The seconds of the datetime.
@@ -3886,7 +3886,7 @@ def second(self) -> Series:
         return self._get_dt_field("second")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def microsecond(self) -> Series:
         """
         The microseconds of the datetime.
@@ -3918,7 +3918,7 @@ def microsecond(self) -> Series:
         return self._return_result_like_self(micro + extra)
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def nanosecond(self) -> Series:
         """
         The nanoseconds of the datetime.
@@ -3943,7 +3943,7 @@ def nanosecond(self) -> Series:
         return self._get_dt_field("nanosecond")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def weekday(self) -> Series:
         """
         The day of the week with Monday=0, Sunday=6.
@@ -3980,7 +3980,7 @@ def weekday(self) -> Series:
         return self._get_dt_field("weekday")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def dayofweek(self) -> Series:
         """
         The day of the week with Monday=0, Sunday=6.
@@ -4017,7 +4017,7 @@ def dayofweek(self) -> Series:
         return self._get_dt_field("weekday")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def dayofyear(self) -> Series:
         """
         The day of the year, from 1-365 in non-leap years and
@@ -4055,7 +4055,7 @@ def dayofyear(self) -> Series:
         return self._get_dt_field("day_of_year")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def day_of_year(self) -> Series:
         """
         The day of the year, from 1-365 in non-leap years and
@@ -4093,7 +4093,7 @@ def day_of_year(self) -> Series:
         return self._get_dt_field("day_of_year")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def is_leap_year(self) -> Series:
         """
         Boolean indicator if the date belongs to a leap year.
@@ -4148,7 +4148,7 @@ def is_leap_year(self) -> Series:
         return self._return_result_like_self(res)
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def quarter(self) -> Series:
         """
         Integer indicator for which quarter of the year the date belongs in.
@@ -4177,7 +4177,7 @@ def quarter(self) -> Series:
         )
         return self._return_result_like_self(res)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def day_name(self, locale: str | None = None) -> Series:
         """
         Return the day names. Currently supports English locale only.
@@ -4213,7 +4213,7 @@ def day_name(self, locale: str | None = None) -> Series:
             self.series._column.get_day_names(locale)
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def month_name(self, locale: str | None = None) -> Series:
         """
         Return the month names. Currently supports English locale only.
@@ -4243,7 +4243,7 @@ def month_name(self, locale: str | None = None) -> Series:
             self.series._column.get_month_names(locale)
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def isocalendar(self) -> cudf.DataFrame:
         """
         Returns a DataFrame with the year, week, and day
@@ -4291,7 +4291,7 @@ def isocalendar(self) -> cudf.DataFrame:
         )
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def is_month_start(self) -> Series:
         """
         Booleans indicating if dates are the first day of the month.
@@ -4299,7 +4299,7 @@ def is_month_start(self) -> Series:
         return (self.day == 1).fillna(False)
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def days_in_month(self) -> Series:
         """
         Get the total number of days in the month that the date falls on.
@@ -4348,7 +4348,7 @@ def days_in_month(self) -> Series:
         )
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def is_month_end(self) -> Series:
         """
         Boolean indicator if the date is the last day of the month.
@@ -4391,7 +4391,7 @@ def is_month_end(self) -> Series:
         return (self.day == last_day.dt.day).fillna(False)
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def is_quarter_start(self) -> Series:
         """
         Boolean indicator if the date is the first day of a quarter.
@@ -4436,7 +4436,7 @@ def is_quarter_start(self) -> Series:
         return self._return_result_like_self(result)
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def is_quarter_end(self) -> Series:
         """
         Boolean indicator if the date is the last day of a quarter.
@@ -4483,7 +4483,7 @@ def is_quarter_end(self) -> Series:
         return self._return_result_like_self(result)
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def is_year_start(self) -> Series:
         """
         Boolean indicator if the date is the first day of the year.
@@ -4514,7 +4514,7 @@ def is_year_start(self) -> Series:
         return self._return_result_like_self(outcol.fillna(False))
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def is_year_end(self) -> Series:
         """
         Boolean indicator if the date is the last day of the year.
@@ -4547,13 +4547,13 @@ def is_year_end(self) -> Series:
         result = cudf._lib.copying.copy_if_else(leap, non_leap, leap_dates)
         return self._return_result_like_self(result.fillna(False))
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _get_dt_field(self, field: str) -> Series:
         return self._return_result_like_self(
             self.series._column.get_dt_field(field)
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def ceil(self, freq: str) -> Series:
         """
         Perform ceil operation on the data to the specified freq.
@@ -4586,7 +4586,7 @@ def ceil(self, freq: str) -> Series:
         """
         return self._return_result_like_self(self.series._column.ceil(freq))
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def floor(self, freq: str) -> Series:
         """
         Perform floor operation on the data to the specified freq.
@@ -4619,7 +4619,7 @@ def floor(self, freq: str) -> Series:
         """
         return self._return_result_like_self(self.series._column.floor(freq))
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def round(self, freq: str) -> Series:
         """
         Perform round operation on the data to the specified freq.
@@ -4655,7 +4655,7 @@ def round(self, freq: str) -> Series:
         """
         return self._return_result_like_self(self.series._column.round(freq))
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def strftime(self, date_format: str, *args, **kwargs) -> Series:
         """
         Convert to Series using specified ``date_format``.
@@ -4832,7 +4832,7 @@ class TimedeltaProperties(BaseDatelikeProperties):
     """
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def days(self) -> Series:
         """
         Number of days.
@@ -4864,7 +4864,7 @@ def days(self) -> Series:
         return self._get_td_field("days")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def seconds(self) -> Series:
         """
         Number of seconds (>= 0 and less than 1 day).
@@ -4903,7 +4903,7 @@ def seconds(self) -> Series:
         return self._get_td_field("seconds")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def microseconds(self) -> Series:
         """
         Number of microseconds (>= 0 and less than 1 second).
@@ -4935,7 +4935,7 @@ def microseconds(self) -> Series:
         return self._get_td_field("microseconds")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def nanoseconds(self) -> Series:
         """
         Return the number of nanoseconds (n), where 0 <= n < 1 microsecond.
@@ -4967,7 +4967,7 @@ def nanoseconds(self) -> Series:
         return self._get_td_field("nanoseconds")
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def components(self) -> cudf.DataFrame:
         """
         Return a Dataframe of the components of the Timedeltas.
@@ -4999,14 +4999,14 @@ def components(self) -> cudf.DataFrame:
             ca, index=self.series.index
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _get_td_field(self, field: str) -> Series:
         return self._return_result_like_self(
             getattr(self.series._column, field)
         )
 
 
-@_cudf_nvtx_annotate
+@_performance_tracking
 def _align_indices(series_list, how="outer", allow_non_unique=False):
     """
     Internal util to align the indices of a list of Series objects
@@ -5069,7 +5069,7 @@ def _align_indices(series_list, how="outer", allow_non_unique=False):
 
 
 @acquire_spill_lock()
-@_cudf_nvtx_annotate
+@_performance_tracking
 def isclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False):
     r"""Returns a boolean array where two arrays are equal within a tolerance.
 
diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py
index 23a2c828a04..f9555aee6a2 100644
--- a/python/cudf/cudf/core/single_column_frame.py
+++ b/python/cudf/cudf/core/single_column_frame.py
@@ -18,7 +18,7 @@
 )
 from cudf.core.column import ColumnBase, as_column
 from cudf.core.frame import Frame
-from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate
+from cudf.utils.performance_tracking import _performance_tracking
 from cudf.utils.utils import NotIterable
 
 if TYPE_CHECKING:
@@ -41,7 +41,7 @@ class SingleColumnFrame(Frame, NotIterable):
         "index": 0,
     }
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _reduce(
         self,
         op,
@@ -62,7 +62,7 @@ def _reduce(
         except AttributeError:
             raise TypeError(f"cannot perform {op} with type {self.dtype}")
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _scan(self, op, axis=None, *args, **kwargs):
         if axis not in (None, 0):
             raise NotImplementedError("axis parameter is not implemented yet")
@@ -70,24 +70,24 @@ def _scan(self, op, axis=None, *args, **kwargs):
         return super()._scan(op, axis=axis, *args, **kwargs)
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def name(self):
         """Get the name of this object."""
         return next(iter(self._column_names))
 
     @name.setter  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def name(self, value):
         self._data[value] = self._data.pop(self.name)
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def ndim(self) -> int:  # noqa: D401
         """Number of dimensions of the underlying data, by definition 1."""
         return 1
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def shape(self) -> tuple[int]:
         """Get a tuple representing the dimensionality of the Index."""
         return (len(self),)
@@ -99,27 +99,27 @@ def __bool__(self):
         )
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _num_columns(self) -> int:
         return 1
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _column(self) -> ColumnBase:
         return next(iter(self._columns))
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def values(self) -> cupy.ndarray:  # noqa: D102
         return self._column.values
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def values_host(self) -> numpy.ndarray:  # noqa: D102
         return self._column.values_host
 
     @classmethod
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def from_arrow(cls, array) -> Self:
         """Create from PyArrow Array/ChunkedArray.
 
@@ -150,7 +150,7 @@ def from_arrow(cls, array) -> Self:
         """
         return cls(ColumnBase.from_arrow(array))
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def to_arrow(self) -> pa.Array:
         """
         Convert to a PyArrow Array.
@@ -182,7 +182,7 @@ def to_arrow(self) -> pa.Array:
         return self._column.to_arrow()
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def is_unique(self) -> bool:
         """Return boolean if values in the object are unique.
 
@@ -193,7 +193,7 @@ def is_unique(self) -> bool:
         return self._column.is_unique
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def is_monotonic_increasing(self) -> bool:
         """Return boolean if values in the object are monotonically increasing.
 
@@ -204,7 +204,7 @@ def is_monotonic_increasing(self) -> bool:
         return self._column.is_monotonic_increasing
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def is_monotonic_decreasing(self) -> bool:
         """Return boolean if values in the object are monotonically decreasing.
 
@@ -215,7 +215,7 @@ def is_monotonic_decreasing(self) -> bool:
         return self._column.is_monotonic_decreasing
 
     @property  # type: ignore
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __cuda_array_interface__(self):
         # While the parent column class has a `__cuda_array_interface__` method
         # defined, it is not implemented for all column types. When it is not
@@ -229,7 +229,7 @@ def __cuda_array_interface__(self):
                 "'__cuda_array_interface__'"
             )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def factorize(
         self, sort: bool = False, use_na_sentinel: bool = True
     ) -> tuple[cupy.ndarray, cudf.Index]:
@@ -268,7 +268,7 @@ def factorize(
             use_na_sentinel=use_na_sentinel,
         )
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def _make_operands_for_binop(
         self,
         other: Any,
@@ -323,7 +323,7 @@ def _make_operands_for_binop(
 
         return {result_name: (self._column, other, reflect, fill_value)}
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def nunique(self, dropna: bool = True) -> int:
         """
         Return count of unique values for the column.
@@ -369,7 +369,7 @@ def _get_elements_from_column(self, arg) -> ScalarLike | ColumnBase:
                 return self._column.apply_boolean_mask(arg)
             raise NotImplementedError(f"Unknown indexer {type(arg)}")
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def where(self, cond, other=None, inplace=False):
         from cudf.core._internals.where import (
             _check_and_cast_columns_with_other,
diff --git a/python/cudf/cudf/core/udf/groupby_utils.py b/python/cudf/cudf/core/udf/groupby_utils.py
index 06d9296ca0f..265b87350ae 100644
--- a/python/cudf/cudf/core/udf/groupby_utils.py
+++ b/python/cudf/cudf/core/udf/groupby_utils.py
@@ -30,7 +30,7 @@
     _supported_dtypes_from_frame,
 )
 from cudf.utils._numba import _CUDFNumbaConfig
-from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate
+from cudf.utils.performance_tracking import _performance_tracking
 
 
 def _get_frame_groupby_type(dtype, index_dtype):
@@ -126,7 +126,7 @@ def _get_groupby_apply_kernel(frame, func, args):
     return kernel, return_type
 
 
-@_cudf_nvtx_annotate
+@_performance_tracking
 def jit_groupby_apply(offsets, grouped_values, function, *args):
     """
     Main entrypoint for JIT Groupby.apply via Numba.
diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py
index f1704e4ea78..d616761cb3b 100644
--- a/python/cudf/cudf/core/udf/utils.py
+++ b/python/cudf/cudf/core/udf/utils.py
@@ -38,7 +38,7 @@
     STRING_TYPES,
     TIMEDELTA_TYPES,
 )
-from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate
+from cudf.utils.performance_tracking import _performance_tracking
 from cudf.utils.utils import initfunc
 
 # Maximum size of a string column is 2 GiB
@@ -71,7 +71,7 @@ def _ptx_file():
     )
 
 
-@_cudf_nvtx_annotate
+@_performance_tracking
 def _get_udf_return_type(argty, func: Callable, args=()):
     """
     Get the return type of a masked UDF for a given set of argument dtypes. It
@@ -236,7 +236,7 @@ def _generate_cache_key(frame, func: Callable, args, suffix="__APPLY_UDF"):
     )
 
 
-@_cudf_nvtx_annotate
+@_performance_tracking
 def _compile_or_get(
     frame, func, args, kernel_getter=None, suffix="__APPLY_UDF"
 ):
diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py
index f07764e2ce4..e909d96309e 100644
--- a/python/cudf/cudf/io/csv.py
+++ b/python/cudf/cudf/io/csv.py
@@ -12,10 +12,10 @@
 from cudf.api.types import is_scalar
 from cudf.utils import ioutils
 from cudf.utils.dtypes import _maybe_convert_to_default_type
-from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate
+from cudf.utils.performance_tracking import _performance_tracking
 
 
-@_cudf_nvtx_annotate
+@_performance_tracking
 @ioutils.doc_read_csv()
 def read_csv(
     filepath_or_buffer,
@@ -151,7 +151,7 @@ def read_csv(
     return df
 
 
-@_cudf_nvtx_annotate
+@_performance_tracking
 @ioutils.doc_to_csv()
 def to_csv(
     df,
diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
index 2a838ca7417..7733e770d99 100644
--- a/python/cudf/cudf/io/parquet.py
+++ b/python/cudf/cudf/io/parquet.py
@@ -22,7 +22,7 @@
 from cudf.api.types import is_list_like
 from cudf.core.column import as_column, build_categorical_column, column_empty
 from cudf.utils import ioutils
-from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate
+from cudf.utils.performance_tracking import _performance_tracking
 
 BYTE_SIZES = {
     "kb": 1000,
@@ -50,7 +50,7 @@
 }
 
 
-@_cudf_nvtx_annotate
+@_performance_tracking
 def _write_parquet(
     df,
     paths,
@@ -130,7 +130,7 @@ def _write_parquet(
 
 # Logic chosen to match: https://arrow.apache.org/
 # docs/_modules/pyarrow/parquet.html#write_to_dataset
-@_cudf_nvtx_annotate
+@_performance_tracking
 def write_to_dataset(
     df,
     root_path,
@@ -318,7 +318,7 @@ def write_to_dataset(
 
 
 @ioutils.doc_read_parquet_metadata()
-@_cudf_nvtx_annotate
+@_performance_tracking
 def read_parquet_metadata(filepath_or_buffer):
     """{docstring}"""
     # Multiple sources are passed as a list. If a single source is passed,
@@ -360,7 +360,7 @@ def read_parquet_metadata(filepath_or_buffer):
     return libparquet.read_parquet_metadata(filepaths_or_buffers)
 
 
-@_cudf_nvtx_annotate
+@_performance_tracking
 def _process_dataset(
     paths,
     fs,
@@ -515,7 +515,7 @@ def _process_dataset(
 
 
 @ioutils.doc_read_parquet()
-@_cudf_nvtx_annotate
+@_performance_tracking
 def read_parquet(
     filepath_or_buffer,
     engine="cudf",
@@ -785,7 +785,7 @@ def _handle_is(column: cudf.Series, value, *, negate) -> cudf.Series:
         return df
 
 
-@_cudf_nvtx_annotate
+@_performance_tracking
 def _parquet_to_frame(
     paths_or_buffers,
     *args,
@@ -885,7 +885,7 @@ def _parquet_to_frame(
         return dfs[0]
 
 
-@_cudf_nvtx_annotate
+@_performance_tracking
 def _read_parquet(
     filepaths_or_buffers,
     engine,
@@ -941,7 +941,7 @@ def _read_parquet(
 
 
 @ioutils.doc_to_parquet()
-@_cudf_nvtx_annotate
+@_performance_tracking
 def to_parquet(
     df,
     path,
@@ -1107,7 +1107,7 @@ def _get_estimated_file_size(df):
     return file_size
 
 
-@_cudf_nvtx_annotate
+@_performance_tracking
 def _get_partitioned(
     df,
     root_path,
@@ -1145,7 +1145,7 @@ def _get_partitioned(
     return full_paths, metadata_file_paths, grouped_df, part_offsets, filename
 
 
-@_cudf_nvtx_annotate
+@_performance_tracking
 def _get_groups_and_offsets(
     df,
     partition_cols,
@@ -1305,7 +1305,7 @@ class ParquetDatasetWriter:
 
     """
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def __init__(
         self,
         path,
@@ -1355,7 +1355,7 @@ def __init__(
 
         self._file_sizes: dict[str, int] = {}
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def write_table(self, df):
         """
         Write a dataframe to the file/dataset
@@ -1486,7 +1486,7 @@ def write_table(self, df):
             self.path_cw_map.update({k: new_cw_idx for k in new_paths})
             self._chunked_writers[-1][0].write_table(grouped_df, part_info)
 
-    @_cudf_nvtx_annotate
+    @_performance_tracking
     def close(self, return_metadata=False):
         """
         Close all open files and optionally return footer metadata as a binary
diff --git a/python/cudf/cudf/io/text.py b/python/cudf/cudf/io/text.py
index 0e19972f6e0..4329480bb2c 100644
--- a/python/cudf/cudf/io/text.py
+++ b/python/cudf/cudf/io/text.py
@@ -1,14 +1,14 @@
-# Copyright (c) 2018-2023, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
 
 from io import BytesIO, StringIO
 
 import cudf
 from cudf._lib import text as libtext
 from cudf.utils import ioutils
-from cudf.utils.nvtx_annotation import _cudf_nvtx_annotate
+from cudf.utils.performance_tracking import _performance_tracking
 
 
-@_cudf_nvtx_annotate
+@_performance_tracking
 @ioutils.doc_read_text()
 def read_text(
     filepath_or_buffer,
diff --git a/python/cudf/cudf/options.py b/python/cudf/cudf/options.py
index fb5a963f008..1f539e7f266 100644
--- a/python/cudf/cudf/options.py
+++ b/python/cudf/cudf/options.py
@@ -311,6 +311,20 @@ def _integer_and_none_validator(val):
     _make_contains_validator([False, True]),
 )
 
+_register_option(
+    "memory_profiling",
+    _env_get_bool("CUDF_MEMORY_PROFILING", False),
+    textwrap.dedent(
+        """
+        If set to `False`, disables memory profiling.
+        If set to `True`, enables memory profiling.
+        Read more at: :ref:`memory-profiling-user-doc`
+        \tValid values are True or False. Default is False.
+    """
+    ),
+    _make_contains_validator([False, True]),
+)
+
 
 class option_context(ContextDecorator):
     """
diff --git a/python/cudf/cudf/tests/test_performance_tracking.py b/python/cudf/cudf/tests/test_performance_tracking.py
new file mode 100644
index 00000000000..e886b77af3f
--- /dev/null
+++ b/python/cudf/cudf/tests/test_performance_tracking.py
@@ -0,0 +1,41 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from io import StringIO
+
+import pytest
+
+import rmm.mr
+import rmm.statistics
+
+import cudf
+from cudf.utils.performance_tracking import (
+    get_memory_records,
+    print_memory_report,
+)
+
+
+@pytest.fixture
+def rmm_reset():
+    """Fixture to reset the RMM resource before and after the test"""
+    mr = rmm.mr.get_current_device_resource()
+    try:
+        rmm.mr.set_current_device_resource(rmm.mr.CudaMemoryResource())
+        yield
+    finally:
+        rmm.mr.set_current_device_resource(mr)
+
+
+def test_memory_profiling(rmm_reset):
+    df1 = cudf.DataFrame({"a": [1, 2, 3]})
+    assert len(get_memory_records()) == 0
+
+    rmm.statistics.enable_statistics()
+    cudf.set_option("memory_profiling", True)
+
+    df1.merge(df1)
+
+    assert len(get_memory_records()) > 0
+
+    out = StringIO()
+    print_memory_report(file=out)
+    assert "DataFrame.merge" in out.getvalue()
diff --git a/python/cudf/cudf/utils/nvtx_annotation.py b/python/cudf/cudf/utils/nvtx_annotation.py
deleted file mode 100644
index a4404e51232..00000000000
--- a/python/cudf/cudf/utils/nvtx_annotation.py
+++ /dev/null
@@ -1,30 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
-
-import hashlib
-from functools import partial
-
-from nvtx import annotate
-
-_NVTX_COLORS = ["green", "blue", "purple", "rapids"]
-
-
-def _get_color_for_nvtx(name):
-    m = hashlib.sha256()
-    m.update(name.encode())
-    hash_value = int(m.hexdigest(), 16)
-    idx = hash_value % len(_NVTX_COLORS)
-    return _NVTX_COLORS[idx]
-
-
-def _cudf_nvtx_annotate(func, domain="cudf_python"):
-    """Decorator for applying nvtx annotations to methods in cudf."""
-    return annotate(
-        message=func.__qualname__,
-        color=_get_color_for_nvtx(func.__qualname__),
-        domain=domain,
-    )(func)
-
-
-_dask_cudf_nvtx_annotate = partial(
-    _cudf_nvtx_annotate, domain="dask_cudf_python"
-)
diff --git a/python/cudf/cudf/utils/performance_tracking.py b/python/cudf/cudf/utils/performance_tracking.py
new file mode 100644
index 00000000000..30c891d0d5a
--- /dev/null
+++ b/python/cudf/cudf/utils/performance_tracking.py
@@ -0,0 +1,82 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from __future__ import annotations
+
+import contextlib
+import functools
+import hashlib
+import sys
+
+import nvtx
+
+import rmm.statistics
+
+from cudf.options import get_option
+
+_NVTX_COLORS = ["green", "blue", "purple", "rapids"]
+
+
+def _get_color_for_nvtx(name):
+    m = hashlib.sha256()
+    m.update(name.encode())
+    hash_value = int(m.hexdigest(), 16)
+    idx = hash_value % len(_NVTX_COLORS)
+    return _NVTX_COLORS[idx]
+
+
+def _performance_tracking(func, domain="cudf_python"):
+    """Decorator for applying performance tracking (if enabled)."""
+
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        with contextlib.ExitStack() as stack:
+            if get_option("memory_profiling"):
+                # NB: the user still needs to call `rmm.statistics.enable_statistics()`
+                #     to enable memory profiling.
+                stack.enter_context(
+                    rmm.statistics.profiler(
+                        name=rmm.statistics._get_descriptive_name_of_object(
+                            func
+                        )
+                    )
+                )
+            if nvtx.enabled():
+                stack.enter_context(
+                    nvtx.annotate(
+                        message=func.__qualname__,
+                        color=_get_color_for_nvtx(func.__qualname__),
+                        domain=domain,
+                    )
+                )
+            return func(*args, **kwargs)
+
+    return wrapper
+
+
+_dask_cudf_performance_tracking = functools.partial(
+    _performance_tracking, domain="dask_cudf_python"
+)
+
+
+def get_memory_records() -> (
+    dict[str, rmm.statistics.ProfilerRecords.MemoryRecord]
+):
+    """Get the memory records from the memory profiling
+
+    Returns
+    -------
+    Dict that maps function names to memory records. Empty if
+    memory profiling is disabled
+    """
+    return rmm.statistics.default_profiler_records.records
+
+
+def print_memory_report(file=sys.stdout) -> None:
+    """Pretty print the result of the memory profiling
+
+    Parameters
+    ----------
+    file
+        The output stream
+    """
+    print(rmm.statistics.default_profiler_records.report(), file=file)
diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py
index 2e4dfc4bb14..7347ec7866a 100644
--- a/python/cudf/cudf/utils/utils.py
+++ b/python/cudf/cudf/utils/utils.py
@@ -159,8 +159,9 @@ def _external_only_api(func, alternative=""):
     @functools.wraps(func)
     def wrapper(*args, **kwargs):
         # Check the immediately preceding frame to see if it's in cudf.
-        frame, lineno = next(traceback.walk_stack(None))
-        fn = frame.f_code.co_filename
+        pre_frame = traceback.extract_stack(limit=2)[0]
+        fn = pre_frame.filename
+        lineno = pre_frame.lineno
         if _cudf_root in fn and _tests_root not in fn:
             raise RuntimeError(
                 f"External-only API called in {fn} at line {lineno}. "
diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py
index d250589e389..1f55a59ea55 100644
--- a/python/dask_cudf/dask_cudf/backends.py
+++ b/python/dask_cudf/dask_cudf/backends.py
@@ -43,7 +43,7 @@
 
 import cudf
 from cudf.api.types import is_string_dtype
-from cudf.utils.nvtx_annotation import _dask_cudf_nvtx_annotate
+from cudf.utils.performance_tracking import _dask_cudf_performance_tracking
 
 from .core import DataFrame, Index, Series
 
@@ -53,7 +53,7 @@
 
 
 @meta_nonempty.register(cudf.BaseIndex)
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def _nonempty_index(idx):
     if isinstance(idx, cudf.core.index.RangeIndex):
         return cudf.core.index.RangeIndex(2, name=idx.name)
@@ -100,7 +100,7 @@ def _nest_list_data(data, leaf_type):
     return data
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def _get_non_empty_data(s):
     if isinstance(s, cudf.core.column.CategoricalColumn):
         categories = (
@@ -147,7 +147,7 @@ def _get_non_empty_data(s):
 
 
 @meta_nonempty.register(cudf.Series)
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def _nonempty_series(s, idx=None):
     if idx is None:
         idx = _nonempty_index(s.index)
@@ -157,7 +157,7 @@ def _nonempty_series(s, idx=None):
 
 
 @meta_nonempty.register(cudf.DataFrame)
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def meta_nonempty_cudf(x):
     idx = meta_nonempty(x.index)
     columns_with_dtype = dict()
@@ -182,18 +182,18 @@ def meta_nonempty_cudf(x):
 
 
 @make_meta_dispatch.register((cudf.Series, cudf.DataFrame))
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def make_meta_cudf(x, index=None):
     return x.head(0)
 
 
 @make_meta_dispatch.register(cudf.BaseIndex)
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def make_meta_cudf_index(x, index=None):
     return x[:0]
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def _empty_series(name, dtype, index=None):
     if isinstance(dtype, str) and dtype == "category":
         return cudf.Series(
@@ -203,7 +203,7 @@ def _empty_series(name, dtype, index=None):
 
 
 @make_meta_obj.register(object)
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def make_meta_object_cudf(x, index=None):
     """Create an empty cudf object containing the desired metadata.
 
@@ -274,7 +274,7 @@ def make_meta_object_cudf(x, index=None):
 
 
 @concat_dispatch.register((cudf.DataFrame, cudf.Series, cudf.BaseIndex))
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def concat_cudf(
     dfs,
     axis=0,
@@ -299,13 +299,13 @@ def concat_cudf(
 @categorical_dtype_dispatch.register(
     (cudf.DataFrame, cudf.Series, cudf.BaseIndex)
 )
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def categorical_dtype_cudf(categories=None, ordered=False):
     return cudf.CategoricalDtype(categories=categories, ordered=ordered)
 
 
 @tolist_dispatch.register((cudf.Series, cudf.BaseIndex))
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def tolist_cudf(obj):
     return obj.to_pandas().tolist()
 
@@ -313,7 +313,7 @@ def tolist_cudf(obj):
 @is_categorical_dtype_dispatch.register(
     (cudf.Series, cudf.BaseIndex, cudf.CategoricalDtype, Series)
 )
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def is_categorical_dtype_cudf(obj):
     return cudf.api.types._is_categorical_dtype(obj)
 
@@ -324,7 +324,7 @@ def get_grouper_cudf(obj):
 
 
 @percentile_lookup.register((cudf.Series, cp.ndarray, cudf.BaseIndex))
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def percentile_cudf(a, q, interpolation="linear"):
     # Cudf dispatch to the equivalent of `np.percentile`:
     # https://numpy.org/doc/stable/reference/generated/numpy.percentile.html
@@ -400,7 +400,7 @@ def _table_to_cudf(obj, table, self_destruct=None, **kwargs):
 
 
 @union_categoricals_dispatch.register((cudf.Series, cudf.BaseIndex))
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def union_categoricals_cudf(
     to_union, sort_categories=False, ignore_order=False
 ):
@@ -410,7 +410,7 @@ def union_categoricals_cudf(
 
 
 @hash_object_dispatch.register((cudf.DataFrame, cudf.Series))
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def hash_object_cudf(frame, index=True):
     if index:
         frame = frame.reset_index()
@@ -418,7 +418,7 @@ def hash_object_cudf(frame, index=True):
 
 
 @hash_object_dispatch.register(cudf.BaseIndex)
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def hash_object_cudf_index(ind, index=None):
     if isinstance(ind, cudf.MultiIndex):
         return ind.to_frame(index=False).hash_values()
@@ -428,7 +428,7 @@ def hash_object_cudf_index(ind, index=None):
 
 
 @group_split_dispatch.register((cudf.Series, cudf.DataFrame))
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def group_split_cudf(df, c, k, ignore_index=False):
     return dict(
         zip(
@@ -443,7 +443,7 @@ def group_split_cudf(df, c, k, ignore_index=False):
 
 
 @sizeof_dispatch.register(cudf.DataFrame)
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def sizeof_cudf_dataframe(df):
     return int(
         sum(col.memory_usage for col in df._data.columns)
@@ -452,7 +452,7 @@ def sizeof_cudf_dataframe(df):
 
 
 @sizeof_dispatch.register((cudf.Series, cudf.BaseIndex))
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def sizeof_cudf_series_index(obj):
     return obj.memory_usage()
 
diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py
index 3bd455a3a57..aab56e3a1b0 100644
--- a/python/dask_cudf/dask_cudf/core.py
+++ b/python/dask_cudf/dask_cudf/core.py
@@ -22,7 +22,7 @@
 
 import cudf
 from cudf import _lib as libcudf
-from cudf.utils.nvtx_annotation import _dask_cudf_nvtx_annotate
+from cudf.utils.performance_tracking import _dask_cudf_performance_tracking
 
 from dask_cudf import sorting
 from dask_cudf.accessors import ListMethods, StructMethods
@@ -53,7 +53,7 @@ def __repr__(self):
         s = "<dask_cudf.%s | %d tasks | %d npartitions>"
         return s % (type(self).__name__, len(self.dask), self.npartitions)
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     def to_dask_dataframe(self, **kwargs):
         """Create a dask.dataframe object from a dask_cudf object
 
@@ -92,7 +92,7 @@ class DataFrame(_Frame, dd.core.DataFrame):
 
     _partition_type = cudf.DataFrame
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     def _assign_column(self, k, v):
         def assigner(df, k, v):
             out = df.copy()
@@ -102,7 +102,7 @@ def assigner(df, k, v):
         meta = assigner(self._meta, k, dask_make_meta(v))
         return self.map_partitions(assigner, k, v, meta=meta)
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     def apply_rows(self, func, incols, outcols, kwargs=None, cache_key=None):
         import uuid
 
@@ -123,7 +123,7 @@ def do_apply_rows(df, func, incols, outcols, kwargs):
         )
 
     @_deprecate_shuffle_kwarg
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     def merge(self, other, shuffle_method=None, **kwargs):
         on = kwargs.pop("on", None)
         if isinstance(on, tuple):
@@ -136,7 +136,7 @@ def merge(self, other, shuffle_method=None, **kwargs):
         )
 
     @_deprecate_shuffle_kwarg
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     def join(self, other, shuffle_method=None, **kwargs):
         # CuDF doesn't support "right" join yet
         how = kwargs.pop("how", "left")
@@ -155,7 +155,7 @@ def join(self, other, shuffle_method=None, **kwargs):
         )
 
     @_deprecate_shuffle_kwarg
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     def set_index(
         self,
         other,
@@ -237,7 +237,7 @@ def set_index(
         )
 
     @_deprecate_shuffle_kwarg
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     def sort_values(
         self,
         by,
@@ -275,14 +275,14 @@ def sort_values(
             return df.reset_index(drop=True)
         return df
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     def to_parquet(self, path, *args, **kwargs):
         """Calls dask.dataframe.io.to_parquet with CudfEngine backend"""
         from dask_cudf.io import to_parquet
 
         return to_parquet(self, path, *args, **kwargs)
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     def to_orc(self, path, **kwargs):
         """Calls dask_cudf.io.to_orc"""
         from dask_cudf.io import to_orc
@@ -290,7 +290,7 @@ def to_orc(self, path, **kwargs):
         return to_orc(self, path, **kwargs)
 
     @derived_from(pd.DataFrame)
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     def var(
         self,
         axis=None,
@@ -324,28 +324,28 @@ def var(
             return _parallel_var(self, meta, skipna, split_every, out)
 
     @_deprecate_shuffle_kwarg
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     def shuffle(self, *args, shuffle_method=None, **kwargs):
         """Wraps dask.dataframe DataFrame.shuffle method"""
         return super().shuffle(
             *args, shuffle_method=_get_shuffle_method(shuffle_method), **kwargs
         )
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     def groupby(self, by=None, **kwargs):
         from .groupby import CudfDataFrameGroupBy
 
         return CudfDataFrameGroupBy(self, by=by, **kwargs)
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def sum_of_squares(x):
     x = x.astype("f8")._column
     outcol = libcudf.reduce.reduce("sum_of_squares", x)
     return cudf.Series(outcol)
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def var_aggregate(x2, x, n, ddof):
     try:
         with warnings.catch_warnings(record=True):
@@ -358,12 +358,12 @@ def var_aggregate(x2, x, n, ddof):
         return np.float64(np.nan)
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def nlargest_agg(x, **kwargs):
     return cudf.concat(x).nlargest(**kwargs)
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def nsmallest_agg(x, **kwargs):
     return cudf.concat(x).nsmallest(**kwargs)
 
@@ -371,7 +371,7 @@ def nsmallest_agg(x, **kwargs):
 class Series(_Frame, dd.core.Series):
     _partition_type = cudf.Series
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     def count(self, split_every=False):
         return reduction(
             [self],
@@ -381,14 +381,14 @@ def count(self, split_every=False):
             meta="i8",
         )
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     def mean(self, split_every=False):
         sum = self.sum(split_every=split_every)
         n = self.count(split_every=split_every)
         return sum / n
 
     @derived_from(pd.DataFrame)
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     def var(
         self,
         axis=None,
@@ -417,19 +417,19 @@ def var(
         else:
             return _parallel_var(self, meta, skipna, split_every, out)
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     def groupby(self, *args, **kwargs):
         from .groupby import CudfSeriesGroupBy
 
         return CudfSeriesGroupBy(self, *args, **kwargs)
 
     @property  # type: ignore
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     def list(self):
         return ListMethods(self)
 
     @property  # type: ignore
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     def struct(self):
         return StructMethods(self)
 
@@ -438,7 +438,7 @@ class Index(Series, dd.core.Index):
     _partition_type = cudf.Index  # type: ignore
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def _naive_var(ddf, meta, skipna, ddof, split_every, out):
     num = ddf._get_numeric_data()
     x = 1.0 * num.sum(skipna=skipna, split_every=split_every)
@@ -453,7 +453,7 @@ def _naive_var(ddf, meta, skipna, ddof, split_every, out):
     return handle_out(out, result)
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def _parallel_var(ddf, meta, skipna, split_every, out):
     def _local_var(x, skipna):
         if skipna:
@@ -520,7 +520,7 @@ def _finalize_var(vals):
     return handle_out(out, result)
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def _extract_meta(x):
     """
     Extract internal cache data (``_meta``) from dask_cudf objects
@@ -536,7 +536,7 @@ def _extract_meta(x):
     return x
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def _emulate(func, *args, **kwargs):
     """
     Apply a function using args / kwargs. If arguments contain dd.DataFrame /
@@ -546,7 +546,7 @@ def _emulate(func, *args, **kwargs):
         return func(*_extract_meta(args), **_extract_meta(kwargs))
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def align_partitions(args):
     """Align partitions between dask_cudf objects.
 
@@ -563,7 +563,7 @@ def align_partitions(args):
     return args
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def reduction(
     args,
     chunk=None,
@@ -702,7 +702,7 @@ def reduction(
     return dd.core.new_dd_object(graph, b, meta, (None, None))
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def from_cudf(data, npartitions=None, chunksize=None, sort=True, name=None):
     from dask_cudf import QUERY_PLANNING_ON
 
@@ -746,7 +746,7 @@ def from_cudf(data, npartitions=None, chunksize=None, sort=True, name=None):
 )
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def from_dask_dataframe(df):
     """
     Convert a Dask :class:`dask.dataframe.DataFrame` to a Dask-cuDF
diff --git a/python/dask_cudf/dask_cudf/groupby.py b/python/dask_cudf/dask_cudf/groupby.py
index 2e72461b43d..bbbcde17b51 100644
--- a/python/dask_cudf/dask_cudf/groupby.py
+++ b/python/dask_cudf/dask_cudf/groupby.py
@@ -16,7 +16,7 @@
 
 import cudf
 from cudf.core.groupby.groupby import _deprecate_collect
-from cudf.utils.nvtx_annotation import _dask_cudf_nvtx_annotate
+from cudf.utils.performance_tracking import _dask_cudf_performance_tracking
 
 from dask_cudf.sorting import _deprecate_shuffle_kwarg
 
@@ -56,13 +56,13 @@ def wrapper(*args, **kwargs):
 
 
 class CudfDataFrameGroupBy(DataFrameGroupBy):
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     def __init__(self, *args, sort=None, **kwargs):
         self.sep = kwargs.pop("sep", "___")
         self.as_index = kwargs.pop("as_index", True)
         super().__init__(*args, sort=sort, **kwargs)
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     def __getitem__(self, key):
         if isinstance(key, list):
             g = CudfDataFrameGroupBy(
@@ -84,7 +84,7 @@ def __getitem__(self, key):
         g._meta = g._meta[key]
         return g
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     def _make_groupby_method_aggs(self, agg_name):
         """Create aggs dictionary for aggregation methods"""
 
@@ -92,7 +92,7 @@ def _make_groupby_method_aggs(self, agg_name):
             return {c: agg_name for c in self.obj.columns if c not in self.by}
         return {c: agg_name for c in self.obj.columns if c != self.by}
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     @_check_groupby_optimized
     def count(self, split_every=None, split_out=1):
         return _make_groupby_agg_call(
@@ -102,7 +102,7 @@ def count(self, split_every=None, split_out=1):
             split_out,
         )
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     @_check_groupby_optimized
     def mean(self, split_every=None, split_out=1):
         return _make_groupby_agg_call(
@@ -112,7 +112,7 @@ def mean(self, split_every=None, split_out=1):
             split_out,
         )
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     @_check_groupby_optimized
     def std(self, split_every=None, split_out=1):
         return _make_groupby_agg_call(
@@ -122,7 +122,7 @@ def std(self, split_every=None, split_out=1):
             split_out,
         )
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     @_check_groupby_optimized
     def var(self, split_every=None, split_out=1):
         return _make_groupby_agg_call(
@@ -132,7 +132,7 @@ def var(self, split_every=None, split_out=1):
             split_out,
         )
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     @_check_groupby_optimized
     def sum(self, split_every=None, split_out=1):
         return _make_groupby_agg_call(
@@ -142,7 +142,7 @@ def sum(self, split_every=None, split_out=1):
             split_out,
         )
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     @_check_groupby_optimized
     def min(self, split_every=None, split_out=1):
         return _make_groupby_agg_call(
@@ -152,7 +152,7 @@ def min(self, split_every=None, split_out=1):
             split_out,
         )
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     @_check_groupby_optimized
     def max(self, split_every=None, split_out=1):
         return _make_groupby_agg_call(
@@ -162,7 +162,7 @@ def max(self, split_every=None, split_out=1):
             split_out,
         )
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     @_check_groupby_optimized
     def collect(self, split_every=None, split_out=1):
         _deprecate_collect()
@@ -173,7 +173,7 @@ def collect(self, split_every=None, split_out=1):
             split_out,
         )
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     @_check_groupby_optimized
     def first(self, split_every=None, split_out=1):
         return _make_groupby_agg_call(
@@ -183,7 +183,7 @@ def first(self, split_every=None, split_out=1):
             split_out,
         )
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     @_check_groupby_optimized
     def last(self, split_every=None, split_out=1):
         return _make_groupby_agg_call(
@@ -194,7 +194,7 @@ def last(self, split_every=None, split_out=1):
         )
 
     @_deprecate_shuffle_kwarg
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     def aggregate(
         self, arg, split_every=None, split_out=1, shuffle_method=None
     ):
@@ -231,13 +231,13 @@ def aggregate(
 
 
 class CudfSeriesGroupBy(SeriesGroupBy):
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     def __init__(self, *args, sort=None, **kwargs):
         self.sep = kwargs.pop("sep", "___")
         self.as_index = kwargs.pop("as_index", True)
         super().__init__(*args, sort=sort, **kwargs)
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     @_check_groupby_optimized
     def count(self, split_every=None, split_out=1):
         return _make_groupby_agg_call(
@@ -247,7 +247,7 @@ def count(self, split_every=None, split_out=1):
             split_out,
         )[self._slice]
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     @_check_groupby_optimized
     def mean(self, split_every=None, split_out=1):
         return _make_groupby_agg_call(
@@ -257,7 +257,7 @@ def mean(self, split_every=None, split_out=1):
             split_out,
         )[self._slice]
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     @_check_groupby_optimized
     def std(self, split_every=None, split_out=1):
         return _make_groupby_agg_call(
@@ -267,7 +267,7 @@ def std(self, split_every=None, split_out=1):
             split_out,
         )[self._slice]
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     @_check_groupby_optimized
     def var(self, split_every=None, split_out=1):
         return _make_groupby_agg_call(
@@ -277,7 +277,7 @@ def var(self, split_every=None, split_out=1):
             split_out,
         )[self._slice]
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     @_check_groupby_optimized
     def sum(self, split_every=None, split_out=1):
         return _make_groupby_agg_call(
@@ -287,7 +287,7 @@ def sum(self, split_every=None, split_out=1):
             split_out,
         )[self._slice]
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     @_check_groupby_optimized
     def min(self, split_every=None, split_out=1):
         return _make_groupby_agg_call(
@@ -297,7 +297,7 @@ def min(self, split_every=None, split_out=1):
             split_out,
         )[self._slice]
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     @_check_groupby_optimized
     def max(self, split_every=None, split_out=1):
         return _make_groupby_agg_call(
@@ -307,7 +307,7 @@ def max(self, split_every=None, split_out=1):
             split_out,
         )[self._slice]
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     @_check_groupby_optimized
     def collect(self, split_every=None, split_out=1):
         _deprecate_collect()
@@ -318,7 +318,7 @@ def collect(self, split_every=None, split_out=1):
             split_out,
         )[self._slice]
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     @_check_groupby_optimized
     def first(self, split_every=None, split_out=1):
         return _make_groupby_agg_call(
@@ -328,7 +328,7 @@ def first(self, split_every=None, split_out=1):
             split_out,
         )[self._slice]
 
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     @_check_groupby_optimized
     def last(self, split_every=None, split_out=1):
         return _make_groupby_agg_call(
@@ -339,7 +339,7 @@ def last(self, split_every=None, split_out=1):
         )[self._slice]
 
     @_deprecate_shuffle_kwarg
-    @_dask_cudf_nvtx_annotate
+    @_dask_cudf_performance_tracking
     def aggregate(
         self, arg, split_every=None, split_out=1, shuffle_method=None
     ):
@@ -429,7 +429,7 @@ def _shuffle_aggregate(
     return result
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def groupby_agg(
     ddf,
     gb_cols,
@@ -641,7 +641,7 @@ def groupby_agg(
     )
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def _make_groupby_agg_call(
     gb, aggs, split_every, split_out, shuffle_method=None
 ):
@@ -663,7 +663,7 @@ def _make_groupby_agg_call(
     )
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def _redirect_aggs(arg):
     """Redirect aggregations to their corresponding name in cuDF"""
     redirects = {
@@ -690,7 +690,7 @@ def _redirect_aggs(arg):
     return redirects.get(arg, arg)
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def _aggs_optimized(arg, supported: set):
     """Check that aggregations in `arg` are a subset of `supported`"""
     if isinstance(arg, (list, dict)):
@@ -712,7 +712,7 @@ def _aggs_optimized(arg, supported: set):
     return False
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def _groupby_optimized(gb):
     """Check that groupby input can use dask-cudf optimized codepath"""
     return isinstance(gb.obj, DaskDataFrame) and (
@@ -730,7 +730,7 @@ def _make_name(col_name, sep="_"):
     return sep.join(name for name in col_name if name != "")
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def _groupby_partition_agg(df, gb_cols, aggs, columns, dropna, sort, sep):
     """Initial partition-level aggregation task.
 
@@ -768,7 +768,7 @@ def _groupby_partition_agg(df, gb_cols, aggs, columns, dropna, sort, sep):
     return gb[sorted(output_columns)]
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def _tree_node_agg(df, gb_cols, dropna, sort, sep):
     """Node in groupby-aggregation reduction tree.
 
@@ -807,7 +807,7 @@ def _tree_node_agg(df, gb_cols, dropna, sort, sep):
     return gb[sorted(output_columns)]
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def _var_agg(df, col, count_name, sum_name, pow2_sum_name, ddof=1):
     """Calculate variance (given count, sum, and sum-squared columns)."""
 
@@ -829,7 +829,7 @@ def _var_agg(df, col, count_name, sum_name, pow2_sum_name, ddof=1):
     return var
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def _finalize_gb_agg(
     gb_in,
     gb_cols,
diff --git a/python/dask_cudf/dask_cudf/sorting.py b/python/dask_cudf/dask_cudf/sorting.py
index f3774e20d32..a2ba4d1878e 100644
--- a/python/dask_cudf/dask_cudf/sorting.py
+++ b/python/dask_cudf/dask_cudf/sorting.py
@@ -18,7 +18,7 @@
 
 import cudf
 from cudf.api.types import _is_categorical_dtype
-from cudf.utils.nvtx_annotation import _dask_cudf_nvtx_annotate
+from cudf.utils.performance_tracking import _dask_cudf_performance_tracking
 
 _SHUFFLE_SUPPORT = ("tasks", "p2p")  # "disk" not supported
 
@@ -48,14 +48,14 @@ def wrapper(*args, **kwargs):
     return wrapper
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def set_index_post(df, index_name, drop, column_dtype):
     df2 = df.set_index(index_name, drop=drop)
     df2.columns = df2.columns.astype(column_dtype)
     return df2
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def _set_partitions_pre(s, divisions, ascending=True, na_position="last"):
     if ascending:
         partitions = divisions.searchsorted(s, side="right") - 1
@@ -72,7 +72,7 @@ def _set_partitions_pre(s, divisions, ascending=True, na_position="last"):
     return partitions
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def _quantile(a, q):
     n = len(a)
     if not len(a):
@@ -83,7 +83,7 @@ def _quantile(a, q):
     )
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def merge_quantiles(finalq, qs, vals):
     """Combine several quantile calculations of different data.
     [NOTE: Same logic as dask.array merge_percentiles]
@@ -146,7 +146,7 @@ def _append_counts(val, count):
     return rv.reset_index(drop=True)
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def _approximate_quantile(df, q):
     """Approximate quantiles of DataFrame or Series.
     [NOTE: Same logic as dask.dataframe Series quantile]
@@ -220,7 +220,7 @@ def set_quantile_index(df):
     return df
 
 
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def quantile_divisions(df, by, npartitions):
     qn = np.linspace(0.0, 1.0, npartitions + 1).tolist()
     divisions = _approximate_quantile(df[by], qn).compute()
@@ -257,7 +257,7 @@ def quantile_divisions(df, by, npartitions):
 
 
 @_deprecate_shuffle_kwarg
-@_dask_cudf_nvtx_annotate
+@_dask_cudf_performance_tracking
 def sort_values(
     df,
     by,

From 57862a3ab1324bc8dbea4133485bb99044bc2742 Mon Sep 17 00:00:00 2001
From: Robert Maynard <rmaynard@nvidia.com>
Date: Fri, 28 Jun 2024 08:43:12 -0400
Subject: [PATCH 425/842] stable_distinct public api now has a stream parameter
 (#16068)

As part of https://github.com/rapidsai/cudf/pull/15982 we determined that the cudf  `stable_distinct` public API needs to be updated so that a user provided stream can be provided.

Authors:
  - Robert Maynard (https://github.com/robertmaynard)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Srinivas Yadav (https://github.com/srinivasyadav18)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16068
---
 cpp/include/cudf/detail/stream_compaction.hpp |   2 -
 cpp/include/cudf/stream_compaction.hpp        |   2 +
 cpp/src/stream_compaction/stable_distinct.cu  |   4 +-
 cpp/tests/CMakeLists.txt                      |   1 +
 cpp/tests/streams/stream_compaction_test.cpp  | 235 ++++++++++++++++++
 5 files changed, 240 insertions(+), 4 deletions(-)
 create mode 100644 cpp/tests/streams/stream_compaction_test.cpp

diff --git a/cpp/include/cudf/detail/stream_compaction.hpp b/cpp/include/cudf/detail/stream_compaction.hpp
index e2974789ea1..e3ef4190fd2 100644
--- a/cpp/include/cudf/detail/stream_compaction.hpp
+++ b/cpp/include/cudf/detail/stream_compaction.hpp
@@ -88,8 +88,6 @@ std::unique_ptr<table> distinct(table_view const& input,
 
 /**
  * @copydoc cudf::stable_distinct
- *
- * @param stream CUDA stream used for device memory operations and kernel launches.
  */
 std::unique_ptr<table> stable_distinct(table_view const& input,
                                        std::vector<size_type> const& keys,
diff --git a/cpp/include/cudf/stream_compaction.hpp b/cpp/include/cudf/stream_compaction.hpp
index c386b3a22b4..181af11adb8 100644
--- a/cpp/include/cudf/stream_compaction.hpp
+++ b/cpp/include/cudf/stream_compaction.hpp
@@ -320,6 +320,7 @@ std::unique_ptr<column> distinct_indices(
  * @param keep Copy any, first, last, or none of the found duplicates
  * @param nulls_equal Flag to specify whether null elements should be considered as equal
  * @param nans_equal Flag to specify whether NaN elements should be considered as equal
+ * @param stream CUDA stream used for device memory operations and kernel launches.
  * @param mr Device memory resource used to allocate the returned table
  * @return Table with distinct rows, preserving input order
  */
@@ -329,6 +330,7 @@ std::unique_ptr<table> stable_distinct(
   duplicate_keep_option keep        = duplicate_keep_option::KEEP_ANY,
   null_equality nulls_equal         = null_equality::EQUAL,
   nan_equality nans_equal           = nan_equality::ALL_EQUAL,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
diff --git a/cpp/src/stream_compaction/stable_distinct.cu b/cpp/src/stream_compaction/stable_distinct.cu
index 27b5a92ab69..074d4fd7d1a 100644
--- a/cpp/src/stream_compaction/stable_distinct.cu
+++ b/cpp/src/stream_compaction/stable_distinct.cu
@@ -79,11 +79,11 @@ std::unique_ptr<table> stable_distinct(table_view const& input,
                                        duplicate_keep_option keep,
                                        null_equality nulls_equal,
                                        nan_equality nans_equal,
+                                       rmm::cuda_stream_view stream,
                                        rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::stable_distinct(
-    input, keys, keep, nulls_equal, nans_equal, cudf::get_default_stream(), mr);
+  return detail::stable_distinct(input, keys, keep, nulls_equal, nans_equal, stream, mr);
 }
 
 }  // namespace cudf
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 9f14455f42d..eef09954647 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -700,6 +700,7 @@ ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_ROLLING_TEST streams/rolling_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_SORTING_TEST streams/sorting_test.cpp STREAM_MODE testing)
+ConfigureTest(STREAM_STREAM_COMPACTION_TEST streams/stream_compaction_test.cpp STREAM_MODE testing)
 ConfigureTest(
   STREAM_STRINGS_TEST
   streams/strings/case_test.cpp
diff --git a/cpp/tests/streams/stream_compaction_test.cpp b/cpp/tests/streams/stream_compaction_test.cpp
new file mode 100644
index 00000000000..56443870602
--- /dev/null
+++ b/cpp/tests/streams/stream_compaction_test.cpp
@@ -0,0 +1,235 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_utilities.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/iterator_utilities.hpp>
+#include <cudf_test/table_utilities.hpp>
+
+#include <cudf/copying.hpp>
+#include <cudf/sorting.hpp>
+#include <cudf/stream_compaction.hpp>
+#include <cudf/table/table.hpp>
+#include <cudf/table/table_view.hpp>
+#include <cudf/types.hpp>
+
+#include <cmath>
+
+auto constexpr null{0};  // null at current level
+auto constexpr XXX{0};   // null pushed down from parent level
+auto constexpr NaN          = std::numeric_limits<double>::quiet_NaN();
+auto constexpr KEEP_ANY     = cudf::duplicate_keep_option::KEEP_ANY;
+auto constexpr KEEP_FIRST   = cudf::duplicate_keep_option::KEEP_FIRST;
+auto constexpr KEEP_LAST    = cudf::duplicate_keep_option::KEEP_LAST;
+auto constexpr KEEP_NONE    = cudf::duplicate_keep_option::KEEP_NONE;
+auto constexpr NULL_EQUAL   = cudf::null_equality::EQUAL;
+auto constexpr NULL_UNEQUAL = cudf::null_equality::UNEQUAL;
+auto constexpr NAN_EQUAL    = cudf::nan_equality::ALL_EQUAL;
+auto constexpr NAN_UNEQUAL  = cudf::nan_equality::UNEQUAL;
+
+using int32s_col = cudf::test::fixed_width_column_wrapper<int32_t>;
+using floats_col = cudf::test::fixed_width_column_wrapper<float>;
+
+using cudf::nan_policy;
+using cudf::null_equality;
+using cudf::null_policy;
+using cudf::test::iterators::no_nulls;
+using cudf::test::iterators::null_at;
+using cudf::test::iterators::nulls_at;
+
+struct StableDistinctKeepAny : public cudf::test::BaseFixture {};
+
+struct StableDistinctKeepFirstLastNone : public cudf::test::BaseFixture {};
+
+TEST_F(StableDistinctKeepAny, NoNullsTableWithNaNs)
+{
+  // Column(s) used to test KEEP_ANY needs to have same rows in contiguous
+  // groups for equivalent keys because KEEP_ANY is nondeterministic.
+  auto const col1  = int32s_col{6, 6, 6, 1, 1, 1, 3, 5, 8, 5};
+  auto const col2  = floats_col{6, 6, 6, 1, 1, 1, 3, 4, 9, 4};
+  auto const keys1 = int32s_col{20, 20, 20, 15, 15, 15, 20, 19, 21, 9};
+  auto const keys2 = floats_col{19., 19., 19., NaN, NaN, NaN, 20., 20., 9., 21.};
+
+  auto const input   = cudf::table_view{{col1, col2, keys1, keys2}};
+  auto const key_idx = std::vector<cudf::size_type>{2, 3};
+
+  // NaNs are unequal.
+  {
+    auto const exp_col1  = int32s_col{6, 1, 1, 1, 3, 5, 8, 5};
+    auto const exp_col2  = floats_col{6, 1, 1, 1, 3, 4, 9, 4};
+    auto const exp_keys1 = int32s_col{20, 15, 15, 15, 20, 19, 21, 9};
+    auto const exp_keys2 = floats_col{19., NaN, NaN, NaN, 20., 20., 9., 21.};
+    auto const expected  = cudf::table_view{{exp_col1, exp_col2, exp_keys1, exp_keys2}};
+
+    auto const result = cudf::stable_distinct(
+      input, key_idx, KEEP_ANY, NULL_EQUAL, NAN_UNEQUAL, cudf::test::get_default_stream());
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result);
+  }
+
+  // NaNs are equal.
+  {
+    auto const exp_col1  = int32s_col{6, 1, 3, 5, 8, 5};
+    auto const exp_col2  = floats_col{6, 1, 3, 4, 9, 4};
+    auto const exp_keys1 = int32s_col{20, 15, 20, 19, 21, 9};
+    auto const exp_keys2 = floats_col{19., NaN, 20., 20., 9., 21.};
+    auto const expected  = cudf::table_view{{exp_col1, exp_col2, exp_keys1, exp_keys2}};
+
+    auto const result = cudf::stable_distinct(
+      input, key_idx, KEEP_ANY, NULL_EQUAL, NAN_EQUAL, cudf::test::get_default_stream());
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result);
+  }
+}
+
+TEST_F(StableDistinctKeepAny, InputWithNullsAndNaNs)
+{
+  auto constexpr null{0.0};  // shadow the global `null` variable of type int
+
+  // Column(s) used to test KEEP_ANY needs to have same rows in contiguous
+  // groups for equivalent keys because KEEP_ANY is nondeterministic.
+  auto const col   = int32s_col{5, 4, 4, 1, 1, 1, 8, 8, 1};
+  auto const keys  = floats_col{{20., null, null, NaN, NaN, NaN, 19., 19., 21.}, nulls_at({1, 2})};
+  auto const input = cudf::table_view{{col, keys}};
+  auto const key_idx = std::vector<cudf::size_type>{1};
+
+  // Nulls are equal, NaNs are unequal.
+  {
+    auto const exp_col  = int32s_col{5, 4, 1, 1, 1, 8, 1};
+    auto const exp_keys = floats_col{{20., null, NaN, NaN, NaN, 19., 21.}, null_at(1)};
+    auto const expected = cudf::table_view{{exp_col, exp_keys}};
+
+    auto const result = cudf::stable_distinct(
+      input, key_idx, KEEP_ANY, NULL_EQUAL, NAN_UNEQUAL, cudf::test::get_default_stream());
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result);
+  }
+
+  // Nulls are equal, NaNs are equal.
+  {
+    auto const exp_col  = int32s_col{5, 4, 1, 8, 1};
+    auto const exp_keys = floats_col{{20., null, NaN, 19., 21.}, null_at(1)};
+    auto const expected = cudf::table_view{{exp_col, exp_keys}};
+
+    auto const result = cudf::stable_distinct(
+      input, key_idx, KEEP_ANY, NULL_EQUAL, NAN_EQUAL, cudf::test::get_default_stream());
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result);
+  }
+
+  // Nulls are unequal, NaNs are unequal.
+  {
+    auto const exp_col  = int32s_col{5, 4, 4, 1, 1, 1, 8, 1};
+    auto const exp_keys = floats_col{{20., null, null, NaN, NaN, NaN, 19., 21.}, nulls_at({1, 2})};
+    auto const expected = cudf::table_view{{exp_col, exp_keys}};
+
+    auto const result = cudf::stable_distinct(
+      input, key_idx, KEEP_ANY, NULL_UNEQUAL, NAN_UNEQUAL, cudf::test::get_default_stream());
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result);
+  }
+
+  // Nulls are unequal, NaNs are equal.
+  {
+    auto const exp_col  = int32s_col{5, 4, 4, 1, 8, 1};
+    auto const exp_keys = floats_col{{20., null, null, NaN, 19., 21.}, nulls_at({1, 2})};
+    auto const expected = cudf::table_view{{exp_col, exp_keys}};
+
+    auto const result = cudf::stable_distinct(
+      input, key_idx, KEEP_ANY, NULL_UNEQUAL, NAN_EQUAL, cudf::test::get_default_stream());
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result);
+  }
+}
+
+TEST_F(StableDistinctKeepFirstLastNone, InputWithNaNsEqual)
+{
+  // Column(s) used to test needs to have different rows for the same keys.
+  auto const col     = int32s_col{0, 1, 2, 3, 4, 5, 6};
+  auto const keys    = floats_col{20., NaN, NaN, 19., 21., 19., 22.};
+  auto const input   = cudf::table_view{{col, keys}};
+  auto const key_idx = std::vector<cudf::size_type>{1};
+
+  // KEEP_FIRST
+  {
+    auto const exp_col  = int32s_col{0, 1, 3, 4, 6};
+    auto const exp_keys = floats_col{20., NaN, 19., 21., 22.};
+    auto const expected = cudf::table_view{{exp_col, exp_keys}};
+
+    auto const result = cudf::stable_distinct(
+      input, key_idx, KEEP_FIRST, NULL_EQUAL, NAN_EQUAL, cudf::test::get_default_stream());
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result);
+  }
+
+  // KEEP_LAST
+  {
+    auto const exp_col  = int32s_col{0, 2, 4, 5, 6};
+    auto const exp_keys = floats_col{20., NaN, 21., 19., 22.};
+    auto const expected = cudf::table_view{{exp_col, exp_keys}};
+
+    auto const result = cudf::stable_distinct(
+      input, key_idx, KEEP_LAST, NULL_EQUAL, NAN_EQUAL, cudf::test::get_default_stream());
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result);
+  }
+
+  // KEEP_NONE
+  {
+    auto const exp_col  = int32s_col{0, 4, 6};
+    auto const exp_keys = floats_col{20., 21., 22.};
+    auto const expected = cudf::table_view{{exp_col, exp_keys}};
+
+    auto const result = cudf::stable_distinct(
+      input, key_idx, KEEP_NONE, NULL_EQUAL, NAN_EQUAL, cudf::test::get_default_stream());
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result);
+  }
+}
+
+TEST_F(StableDistinctKeepFirstLastNone, InputWithNaNsUnequal)
+{
+  // Column(s) used to test needs to have different rows for the same keys.
+  auto const col     = int32s_col{0, 1, 2, 3, 4, 5, 6, 7};
+  auto const keys    = floats_col{20., NaN, NaN, 19., 21., 19., 22., 20.};
+  auto const input   = cudf::table_view{{col, keys}};
+  auto const key_idx = std::vector<cudf::size_type>{1};
+
+  // KEEP_FIRST
+  {
+    auto const exp_col  = int32s_col{0, 1, 2, 3, 4, 6};
+    auto const exp_keys = floats_col{20., NaN, NaN, 19., 21., 22.};
+    auto const expected = cudf::table_view{{exp_col, exp_keys}};
+
+    auto const result = cudf::stable_distinct(
+      input, key_idx, KEEP_FIRST, NULL_UNEQUAL, NAN_UNEQUAL, cudf::test::get_default_stream());
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result);
+  }
+
+  // KEEP_LAST
+  {
+    auto const exp_col  = int32s_col{1, 2, 4, 5, 6, 7};
+    auto const exp_keys = floats_col{NaN, NaN, 21., 19., 22., 20.};
+    auto const expected = cudf::table_view{{exp_col, exp_keys}};
+
+    auto const result = cudf::stable_distinct(
+      input, key_idx, KEEP_LAST, NULL_UNEQUAL, NAN_UNEQUAL, cudf::test::get_default_stream());
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result);
+  }
+
+  // KEEP_NONE
+  {
+    auto const exp_col  = int32s_col{1, 2, 4, 6};
+    auto const exp_keys = floats_col{NaN, NaN, 21., 22.};
+    auto const expected = cudf::table_view{{exp_col, exp_keys}};
+
+    auto const result = cudf::stable_distinct(
+      input, key_idx, KEEP_NONE, NULL_UNEQUAL, NAN_UNEQUAL, cudf::test::get_default_stream());
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result);
+  }
+}

From 2b547dc70c7f42b671cdc3e75946b123301779f0 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 28 Jun 2024 03:11:01 -1000
Subject: [PATCH 426/842] Add ensure_index to not unnecessarily shallow copy
 cudf.Index (#16117)

The `cudf.Index` constructor will shallow copy a `cudf.Index` input. Sometimes, we just need to make sure an input is a `cudf.Index`, so created `ensure_index` (pandas has something similar) so we don't shallow copy these inputs unnecessarily

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16117
---
 python/cudf/cudf/core/_base_index.py     |  6 ++++-
 python/cudf/cudf/core/algorithms.py      |  4 ++--
 python/cudf/cudf/core/cut.py             |  2 +-
 python/cudf/cudf/core/dataframe.py       | 29 ++++++++++++++----------
 python/cudf/cudf/core/index.py           | 13 ++++++++++-
 python/cudf/cudf/core/indexed_frame.py   | 11 ++++-----
 python/cudf/cudf/core/multiindex.py      |  3 ++-
 python/cudf/cudf/core/series.py          | 12 ++++------
 python/cudf/cudf/tests/test_dataframe.py | 24 ++++++++++++++++++++
 9 files changed, 73 insertions(+), 31 deletions(-)

diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py
index caf07b286cd..e160fa697ee 100644
--- a/python/cudf/cudf/core/_base_index.py
+++ b/python/cudf/cudf/core/_base_index.py
@@ -1104,7 +1104,11 @@ def difference(self, other, sort=None):
                 f"of [None, False, True]; {sort} was passed."
             )
 
-        other = cudf.Index(other, name=getattr(other, "name", self.name))
+        if not isinstance(other, BaseIndex):
+            other = cudf.Index(
+                other,
+                name=getattr(other, "name", self.name),
+            )
 
         if not len(other):
             res = self._get_reconciled_name_object(other).unique()
diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py
index 51a32e29886..e8b82ff60c2 100644
--- a/python/cudf/cudf/core/algorithms.py
+++ b/python/cudf/cudf/core/algorithms.py
@@ -6,7 +6,7 @@
 
 from cudf.core.column import as_column
 from cudf.core.copy_types import BooleanMask
-from cudf.core.index import Index, RangeIndex
+from cudf.core.index import RangeIndex, ensure_index
 from cudf.core.indexed_frame import IndexedFrame
 from cudf.core.scalar import Scalar
 from cudf.options import get_option
@@ -107,7 +107,7 @@ def factorize(values, sort=False, use_na_sentinel=True, size_hint=None):
         dtype="int64" if get_option("mode.pandas_compatible") else None,
     ).values
 
-    return labels, cats.values if return_cupy_array else Index(cats)
+    return labels, cats.values if return_cupy_array else ensure_index(cats)
 
 
 def _linear_interpolation(column, index=None):
diff --git a/python/cudf/cudf/core/cut.py b/python/cudf/cudf/core/cut.py
index 54c5e829e8a..d9f62f51f92 100644
--- a/python/cudf/cudf/core/cut.py
+++ b/python/cudf/cudf/core/cut.py
@@ -292,7 +292,7 @@ def cut(
     )
 
     # we return a categorical index, as we don't have a Categorical method
-    categorical_index = cudf.Index(col)
+    categorical_index = cudf.CategoricalIndex._from_data({None: col})
 
     if isinstance(orig_x, (pd.Series, cudf.Series)):
         # if we have a series input we return a series output
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 3fc29582c4c..4dfeb68b7ba 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -58,7 +58,12 @@
 from cudf.core.column_accessor import ColumnAccessor
 from cudf.core.copy_types import BooleanMask
 from cudf.core.groupby.groupby import DataFrameGroupBy, groupby_doc_template
-from cudf.core.index import BaseIndex, RangeIndex, _index_from_data, as_index
+from cudf.core.index import (
+    BaseIndex,
+    RangeIndex,
+    _index_from_data,
+    ensure_index,
+)
 from cudf.core.indexed_frame import (
     IndexedFrame,
     _FrameIndexer,
@@ -338,7 +343,7 @@ def _getitem_tuple_arg(self, arg):
                                 range(len(tmp_arg[0]))
                             )
                         },
-                        index=as_index(tmp_arg[0]),
+                        index=cudf.Index(tmp_arg[0]),
                     )
                     columns_df[cantor_name] = column.as_column(
                         range(len(columns_df))
@@ -702,7 +707,7 @@ def __init__(
                     data = data.reindex(index)
                     index = data.index
                 else:
-                    index = cudf.Index(index)
+                    index = ensure_index(index)
             else:
                 index = data.index
 
@@ -751,7 +756,7 @@ def __init__(
             if index is None:
                 self._index = RangeIndex(0)
             else:
-                self._index = cudf.Index(index)
+                self._index = ensure_index(index)
             if columns is not None:
                 rangeindex = isinstance(
                     columns, (range, pd.RangeIndex, cudf.RangeIndex)
@@ -909,7 +914,7 @@ def _init_from_series_list(self, data, columns, index):
                         f"not match length of index ({index_length})"
                     )
 
-            final_index = cudf.Index(index)
+            final_index = ensure_index(index)
 
         series_lengths = list(map(len, data))
         data = numeric_normalize_types(*data)
@@ -977,9 +982,9 @@ def _init_from_list_like(self, data, index=None, columns=None):
         if index is None:
             index = RangeIndex(start=0, stop=len(data))
         else:
-            index = cudf.Index(index)
+            index = ensure_index(index)
 
-        self._index = cudf.Index(index)
+        self._index = index
         # list-of-dicts case
         if len(data) > 0 and isinstance(data[0], dict):
             data = DataFrame.from_pandas(pd.DataFrame(data))
@@ -1085,7 +1090,7 @@ def _init_from_dict_like(
 
             self._index = RangeIndex(0, num_rows)
         else:
-            self._index = cudf.Index(index)
+            self._index = ensure_index(index)
 
         if len(data):
             self._data.multiindex = True
@@ -1491,7 +1496,7 @@ def memory_usage(self, index=True, deep=False):
             names.append("Index")
         return Series._from_data(
             data={None: as_column(mem_usage)},
-            index=as_index(names),
+            index=cudf.Index(names),
         )
 
     @_performance_tracking
@@ -4033,7 +4038,7 @@ def transpose(self):
         # Set the old column names as the new index
         result = self.__class__._from_data(
             ColumnAccessor(dict(enumerate(result_columns)), verify=False),
-            index=as_index(index),
+            index=cudf.Index(index),
         )
         # Set the old index as the new column names
         result.columns = columns
@@ -5657,7 +5662,7 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False):
             }
 
         if not is_scalar(index):
-            new_index = cudf.Index(index)
+            new_index = ensure_index(index)
         else:
             new_index = None
 
@@ -5741,7 +5746,7 @@ def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False):
             }
 
         if index is not None:
-            index = cudf.Index(index)
+            index = ensure_index(index)
 
         if isinstance(columns, (pd.Index, cudf.Index)):
             level_names = tuple(columns.names)
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index e069f8d0ea6..b398ee2343e 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -65,6 +65,17 @@
     from collections.abc import Generator, Iterable
 
 
+def ensure_index(index_like: Any) -> BaseIndex:
+    """
+    Ensure an Index is returned.
+
+    Avoids a shallow copy compared to calling cudf.Index(...)
+    """
+    if not isinstance(index_like, BaseIndex):
+        return cudf.Index(index_like)
+    return index_like
+
+
 class IndexMeta(type):
     """Custom metaclass for Index that overrides instance/subclass tests."""
 
@@ -1569,7 +1580,7 @@ def append(self, other):
                 to_concat.append(obj)
         else:
             this = self
-            other = cudf.Index(other)
+            other = ensure_index(other)
 
             if len(this) == 0 or len(other) == 0:
                 # we'll filter out empties later in ._concat
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 72bd3c45fa6..ff10051c52d 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -33,7 +33,6 @@
     is_list_like,
     is_scalar,
 )
-from cudf.core._base_index import BaseIndex
 from cudf.core._compat import PANDAS_LT_300
 from cudf.core.buffer import acquire_spill_lock
 from cudf.core.column import ColumnBase, as_column
@@ -42,7 +41,7 @@
 from cudf.core.dtypes import ListDtype
 from cudf.core.frame import Frame
 from cudf.core.groupby.groupby import GroupBy
-from cudf.core.index import Index, RangeIndex, _index_from_data
+from cudf.core.index import RangeIndex, _index_from_data, ensure_index
 from cudf.core.missing import NA
 from cudf.core.multiindex import MultiIndex
 from cudf.core.resample import _Resampler
@@ -66,6 +65,8 @@
         Dtype,
         NotImplementedType,
     )
+    from cudf.core._base_index import BaseIndex
+
 
 doc_reset_index_template = """
         Reset the index of the {klass}, or a level of it.
@@ -627,9 +628,7 @@ def index(self, value):
                 f"new values have {len(value)} elements"
             )
         # avoid unnecessary cast to Index
-        if not isinstance(value, BaseIndex):
-            value = Index(value)
-
+        value = ensure_index(value)
         self._index = value
 
     @_performance_tracking
@@ -3595,7 +3594,7 @@ def _align_to_index(
         sort: bool = True,
         allow_non_unique: bool = False,
     ) -> Self:
-        index = cudf.Index(index)
+        index = ensure_index(index)
 
         if self.index.equals(index):
             return self
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index 7657fa9e234..9cbe863142b 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -29,6 +29,7 @@
     BaseIndex,
     _get_indexer_basic,
     _lexsorted_equal_range,
+    ensure_index,
 )
 from cudf.core.join._join_helpers import _match_join_keys
 from cudf.utils.dtypes import is_column_like
@@ -173,7 +174,7 @@ def __init__(
                     "codes and is inconsistent!"
                 )
 
-        levels = [cudf.Index(level) for level in levels]
+        levels = [ensure_index(level) for level in levels]
 
         if len(levels) != len(codes._data):
             raise ValueError(
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 9acf5294b72..97b6bbec2d4 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -48,7 +48,7 @@
 from cudf.core.column.struct import StructMethods
 from cudf.core.column_accessor import ColumnAccessor
 from cudf.core.groupby.groupby import SeriesGroupBy, groupby_doc_template
-from cudf.core.index import BaseIndex, DatetimeIndex, RangeIndex, as_index
+from cudf.core.index import BaseIndex, DatetimeIndex, RangeIndex, ensure_index
 from cudf.core.indexed_frame import (
     IndexedFrame,
     _FrameIndexer,
@@ -588,10 +588,8 @@ def __init__(
                 data = data.copy(deep=True)
             name_from_data = data.name
             column = as_column(data, nan_as_null=nan_as_null, dtype=dtype)
-            if isinstance(data, pd.Series):
-                index_from_data = cudf.Index(data.index)
-            elif isinstance(data, Series):
-                index_from_data = data.index
+            if isinstance(data, (pd.Series, Series)):
+                index_from_data = ensure_index(data.index)
         elif isinstance(data, ColumnAccessor):
             raise TypeError(
                 "Use cudf.Series._from_data for constructing a Series from "
@@ -642,7 +640,7 @@ def __init__(
             name = name_from_data
 
         if index is not None:
-            index = cudf.Index(index)
+            index = ensure_index(index)
 
         if index_from_data is not None:
             first_index = index_from_data
@@ -3191,7 +3189,7 @@ def quantile(
 
         return Series._from_data(
             data={self.name: result},
-            index=as_index(np_array_q) if quant_index else None,
+            index=cudf.Index(np_array_q) if quant_index else None,
         )
 
     @docutils.doc_describe()
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index fc7fd87d4c5..f40106a30f4 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -11078,3 +11078,27 @@ def test_dataframe_loc_int_float(dtype1, dtype2):
     expected = pdf.loc[pidx]
 
     assert_eq(actual, expected, check_index_type=True, check_dtype=True)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        cudf.DataFrame(range(2)),
+        None,
+        [cudf.Series(range(2))],
+        [[0], [1]],
+        {1: range(2)},
+        cupy.arange(2),
+    ],
+)
+def test_init_with_index_no_shallow_copy(data):
+    idx = cudf.RangeIndex(2)
+    df = cudf.DataFrame(data, index=idx)
+    assert df.index is idx
+
+
+def test_from_records_with_index_no_shallow_copy():
+    idx = cudf.RangeIndex(2)
+    data = np.array([(1.0, 2), (3.0, 4)], dtype=[("x", "<f8"), ("y", "<i8")])
+    df = cudf.DataFrame(data.view(np.recarray), index=idx)
+    assert df.index is idx

From 224ac5bad11465d0486af80e7935eac482269805 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Fri, 28 Jun 2024 09:26:37 -0400
Subject: [PATCH 427/842] Add libcudf public/detail API pattern to developer
 guide (#16086)

Adds specific description for the public API to detail API function pattern to the libcudf developer guide.
Also fixes some formatting issues and broken link.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Shruti Shivakumar (https://github.com/shrshi)
  - Karthikeyan (https://github.com/karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/16086
---
 .../developer_guide/DEVELOPER_GUIDE.md        | 60 +++++++++++--------
 1 file changed, 34 insertions(+), 26 deletions(-)

diff --git a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md
index ff80c2daab8..0d097541692 100644
--- a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md
+++ b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md
@@ -1,4 +1,4 @@
-# libcudf C++ Developer Guide
+# libcudf C++ Developer Guide {#DEVELOPER_GUIDE}
 
 This document serves as a guide for contributors to libcudf C++ code. Developers should also refer
 to these additional files for further documentation of libcudf best practices.
@@ -469,7 +469,7 @@ libcudf throws under different circumstances, see the [section on error handling
 
 # libcudf API and Implementation
 
-## Streams
+## Streams {#streams}
 
 libcudf is in the process of adding support for asynchronous execution using
 CUDA streams. In order to facilitate the usage of streams, all new libcudf APIs
@@ -486,33 +486,37 @@ use only asynchronous versions of CUDA APIs with the stream parameter.
 
 In order to make the `detail` API callable from other libcudf functions, it should be exposed in a
 header placed in the `cudf/cpp/include/detail/` directory.
+The declaration is not necessary if no other libcudf functions call the `detail` function.
 
 For example:
 
 ```c++
 // cpp/include/cudf/header.hpp
-void external_function(...);
+void external_function(...,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 // cpp/include/cudf/detail/header.hpp
 namespace detail{
-void external_function(..., rmm::cuda_stream_view stream)
+void external_function(..., rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)
 } // namespace detail
 
 // cudf/src/implementation.cpp
 namespace detail{
-    // Use the stream parameter in the detail implementation.
-    void external_function(..., rmm::cuda_stream_view stream){
-        // Implementation uses the stream with async APIs.
-        rmm::device_buffer buff(...,stream);
-        CUDF_CUDA_TRY(cudaMemcpyAsync(...,stream.value()));
-        kernel<<<..., stream>>>(...);
-        thrust::algorithm(rmm::exec_policy(stream), ...);
-    }
+// Use the stream parameter in the detail implementation.
+void external_function(..., rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr){
+  // Implementation uses the stream with async APIs.
+  rmm::device_buffer buff(..., stream, mr);
+  CUDF_CUDA_TRY(cudaMemcpyAsync(...,stream.value()));
+  kernel<<<..., stream>>>(...);
+  thrust::algorithm(rmm::exec_policy(stream), ...);
+}
 } // namespace detail
 
-void external_function(...){
-    CUDF_FUNC_RANGE(); // Generates an NVTX range for the lifetime of this function.
-    detail::external_function(..., cudf::get_default_stream());
+void external_function(..., rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE(); // Generates an NVTX range for the lifetime of this function.
+  detail::external_function(..., stream, mr);
 }
 ```
 
@@ -703,28 +707,28 @@ The preferred style for how inputs are passed in and outputs are returned is the
     - `column_view const&`
   - Tables:
     - `table_view const&`
-    - Scalar:
-        - `scalar const&`
-    - Everything else:
-       - Trivial or inexpensively copied types
-          - Pass by value
-       - Non-trivial or expensive to copy types
-          - Pass by `const&`
+  - Scalar:
+    - `scalar const&`
+  - Everything else:
+    - Trivial or inexpensively copied types
+      - Pass by value
+    - Non-trivial or expensive to copy types
+      - Pass by `const&`
 - In/Outs
   - Columns:
     - `mutable_column_view&`
   - Tables:
     - `mutable_table_view&`
-    - Everything else:
-        - Pass by via raw pointer
+  - Everything else:
+    - Pass by via raw pointer
 - Outputs
   - Outputs should be *returned*, i.e., no output parameters
   - Columns:
     - `std::unique_ptr<column>`
   - Tables:
     - `std::unique_ptr<table>`
-    - Scalars:
-        - `std::unique_ptr<scalar>`
+  - Scalars:
+    - `std::unique_ptr<scalar>`
 
 
 ### Multiple Return Values
@@ -908,6 +912,10 @@ functions that are specific to columns of Strings. These functions reside in the
 namespace. Similarly, functionality used exclusively for unit testing is in the `cudf::test::`
 namespace.
 
+The public function is expected to contain a call to `CUDF_FUNC_RANGE()` followed by a call to
+a `detail` function with same name and parameters as the public function.
+See the [Streams](#streams) section for an example of this pattern.
+
 ### Internal
 
 Many functions are not meant for public use, so place them in either the `detail` or an *anonymous*

From 673d766836b7e6e8c80afe32cd9a4b4da2cecf58 Mon Sep 17 00:00:00 2001
From: Paul Mattione <156858817+pmattione-nvidia@users.noreply.github.com>
Date: Fri, 28 Jun 2024 09:38:57 -0400
Subject: [PATCH 428/842] Make binary operators work between fixed-point and
 floating args (#16116)

Some of the binary operators in cuDF don't work between fixed_point and floating-point numbers after [this earlier PR](https://github.com/rapidsai/cudf/pull/15438) removed the ability to construct and implicitly cast fixed_point numbers from floating point numbers. This PR restores that functionality by detecting and performing the necessary explicit casts, and adds tests for the supported operators.

Note that the `binary_op_has_common_type` code is modeled after `has_common_type` found in traits.hpp.

This closes [issue 16090](https://github.com/rapidsai/cudf/issues/16090)

Authors:
  - Paul Mattione (https://github.com/pmattione-nvidia)

Approvers:
  - Jayjeet Chakraborty (https://github.com/JayjeetAtGithub)
  - Karthikeyan (https://github.com/karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/16116
---
 cpp/include/cudf/binaryop.hpp                 | 50 ++++++++++++++++
 cpp/src/binaryop/compiled/binary_ops.cuh      | 14 ++++-
 cpp/src/binaryop/compiled/util.cpp            | 12 ++--
 .../binop-compiled-fixed_point-test.cpp       | 58 +++++++++++++++++++
 4 files changed, 125 insertions(+), 9 deletions(-)

diff --git a/cpp/include/cudf/binaryop.hpp b/cpp/include/cudf/binaryop.hpp
index 5e41a871f32..22dad11e109 100644
--- a/cpp/include/cudf/binaryop.hpp
+++ b/cpp/include/cudf/binaryop.hpp
@@ -91,6 +91,56 @@ enum class binary_operator : int32_t {
                      ///< (null, false) is null, and (valid, valid) == LOGICAL_OR(valid, valid)
   INVALID_BINARY     ///< invalid operation
 };
+
+/// Binary operation common type default
+template <typename L, typename R, typename = void>
+struct binary_op_common_type {};
+
+/// Binary operation common type specialization
+template <typename L, typename R>
+struct binary_op_common_type<L, R, std::enable_if_t<has_common_type_v<L, R>>> {
+  /// The common type of the template parameters
+  using type = std::common_type_t<L, R>;
+};
+
+/// Binary operation common type specialization
+template <typename L, typename R>
+struct binary_op_common_type<
+  L,
+  R,
+  std::enable_if_t<is_fixed_point<L>() && cuda::std::is_floating_point_v<R>>> {
+  /// The common type of the template parameters
+  using type = L;
+};
+
+/// Binary operation common type specialization
+template <typename L, typename R>
+struct binary_op_common_type<
+  L,
+  R,
+  std::enable_if_t<is_fixed_point<R>() && cuda::std::is_floating_point_v<L>>> {
+  /// The common type of the template parameters
+  using type = R;
+};
+
+/// Binary operation common type helper
+template <typename L, typename R>
+using binary_op_common_type_t = typename binary_op_common_type<L, R>::type;
+
+namespace detail {
+template <typename AlwaysVoid, typename L, typename R>
+struct binary_op_has_common_type_impl : std::false_type {};
+
+template <typename L, typename R>
+struct binary_op_has_common_type_impl<std::void_t<binary_op_common_type_t<L, R>>, L, R>
+  : std::true_type {};
+}  // namespace detail
+
+/// Checks if binary operation types have a common type
+template <typename L, typename R>
+constexpr inline bool binary_op_has_common_type_v =
+  detail::binary_op_has_common_type_impl<void, L, R>::value;
+
 /**
  * @brief Performs a binary operation between a scalar and a column.
  *
diff --git a/cpp/src/binaryop/compiled/binary_ops.cuh b/cpp/src/binaryop/compiled/binary_ops.cuh
index 5177e7d4bda..c6af0c3c58a 100644
--- a/cpp/src/binaryop/compiled/binary_ops.cuh
+++ b/cpp/src/binaryop/compiled/binary_ops.cuh
@@ -49,9 +49,16 @@ struct type_casted_accessor {
                                         column_device_view const& col,
                                         bool is_scalar) const
   {
-    if constexpr (column_device_view::has_element_accessor<Element>() and
-                  std::is_convertible_v<Element, CastType>)
-      return static_cast<CastType>(col.element<Element>(is_scalar ? 0 : i));
+    if constexpr (column_device_view::has_element_accessor<Element>()) {
+      auto const element = col.element<Element>(is_scalar ? 0 : i);
+      if constexpr (std::is_convertible_v<Element, CastType>) {
+        return static_cast<CastType>(element);
+      } else if constexpr (is_fixed_point<Element>() && cuda::std::is_floating_point_v<CastType>) {
+        return convert_fixed_to_floating<CastType>(element);
+      } else if constexpr (is_fixed_point<CastType>() && cuda::std::is_floating_point_v<Element>) {
+        return convert_floating_to_fixed<CastType>(element, numeric::scale_type{0});
+      }
+    }
     return {};
   }
 };
@@ -159,6 +166,7 @@ struct ops2_wrapper {
       TypeRhs y   = rhs.element<TypeRhs>(is_rhs_scalar ? 0 : i);
       auto result = [&]() {
         if constexpr (std::is_same_v<BinaryOperator, ops::NullEquals> or
+                      std::is_same_v<BinaryOperator, ops::NullNotEquals> or
                       std::is_same_v<BinaryOperator, ops::NullLogicalAnd> or
                       std::is_same_v<BinaryOperator, ops::NullLogicalOr> or
                       std::is_same_v<BinaryOperator, ops::NullMax> or
diff --git a/cpp/src/binaryop/compiled/util.cpp b/cpp/src/binaryop/compiled/util.cpp
index 2b6a4f58895..b62c5f1f4e1 100644
--- a/cpp/src/binaryop/compiled/util.cpp
+++ b/cpp/src/binaryop/compiled/util.cpp
@@ -31,8 +31,8 @@ struct common_type_functor {
   template <typename TypeLhs, typename TypeRhs>
   std::optional<data_type> operator()() const
   {
-    if constexpr (cudf::has_common_type_v<TypeLhs, TypeRhs>) {
-      using TypeCommon = std::common_type_t<TypeLhs, TypeRhs>;
+    if constexpr (binary_op_has_common_type_v<TypeLhs, TypeRhs>) {
+      using TypeCommon = binary_op_common_type_t<TypeLhs, TypeRhs>;
       return data_type{type_to_id<TypeCommon>()};
     }
 
@@ -85,8 +85,8 @@ struct is_binary_operation_supported {
   {
     if constexpr (column_device_view::has_element_accessor<TypeLhs>() and
                   column_device_view::has_element_accessor<TypeRhs>()) {
-      if constexpr (has_common_type_v<TypeLhs, TypeRhs>) {
-        using common_t = std::common_type_t<TypeLhs, TypeRhs>;
+      if constexpr (binary_op_has_common_type_v<TypeLhs, TypeRhs>) {
+        using common_t = binary_op_common_type_t<TypeLhs, TypeRhs>;
         return std::is_invocable_v<BinaryOperator, common_t, common_t>;
       } else {
         return std::is_invocable_v<BinaryOperator, TypeLhs, TypeRhs>;
@@ -102,8 +102,8 @@ struct is_binary_operation_supported {
     if constexpr (column_device_view::has_element_accessor<TypeLhs>() and
                   column_device_view::has_element_accessor<TypeRhs>()) {
       if (has_mutable_element_accessor(out_type) or is_fixed_point(out_type)) {
-        if constexpr (has_common_type_v<TypeLhs, TypeRhs>) {
-          using common_t = std::common_type_t<TypeLhs, TypeRhs>;
+        if constexpr (binary_op_has_common_type_v<TypeLhs, TypeRhs>) {
+          using common_t = binary_op_common_type_t<TypeLhs, TypeRhs>;
           if constexpr (std::is_invocable_v<BinaryOperator, common_t, common_t>) {
             using ReturnType = std::invoke_result_t<BinaryOperator, common_t, common_t>;
             return is_constructible<ReturnType>(out_type) or
diff --git a/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp b/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp
index 6d097b2ff12..89824eb6511 100644
--- a/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp
+++ b/cpp/tests/binaryop/binop-compiled-fixed_point-test.cpp
@@ -843,3 +843,61 @@ TYPED_TEST(FixedPointTest_64_128_Reps, FixedPoint_64_128_ComparisonTests)
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(h->view(), falses);
   }
 }
+
+template <typename ResultType>
+void test_fixed_floating(cudf::binary_operator op,
+                         double floating_value,
+                         int decimal_value,
+                         int decimal_scale,
+                         ResultType expected)
+{
+  auto const scale       = numeric::scale_type{decimal_scale};
+  auto const result_type = cudf::data_type(cudf::type_to_id<ResultType>());
+  auto const nullable =
+    (op == cudf::binary_operator::NULL_EQUALS || op == cudf::binary_operator::NULL_NOT_EQUALS ||
+     op == cudf::binary_operator::NULL_MIN || op == cudf::binary_operator::NULL_MAX);
+
+  cudf::test::fixed_width_column_wrapper<double> floating_col({floating_value});
+  cudf::test::fixed_point_column_wrapper<int> decimal_col({decimal_value}, scale);
+
+  auto result = binary_operation(floating_col, decimal_col, op, result_type);
+
+  if constexpr (cudf::is_fixed_point<ResultType>()) {
+    using wrapper_type      = cudf::test::fixed_point_column_wrapper<typename ResultType::rep>;
+    auto const expected_col = nullable ? wrapper_type({expected.value()}, {true}, expected.scale())
+                                       : wrapper_type({expected.value()}, expected.scale());
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col, *result.get());
+  } else {
+    using wrapper_type = cudf::test::fixed_width_column_wrapper<ResultType>;
+    auto const expected_col =
+      nullable ? wrapper_type({expected}, {true}) : wrapper_type({expected});
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col, *result.get());
+  }
+}
+
+TYPED_TEST(FixedPointCompiledTest, FixedPointWithFloating)
+{
+  using namespace numeric;
+
+  // BOOLEAN
+  test_fixed_floating(cudf::binary_operator::EQUAL, 1.0, 10, -1, true);
+  test_fixed_floating(cudf::binary_operator::NOT_EQUAL, 1.0, 10, -1, false);
+  test_fixed_floating(cudf::binary_operator::LESS, 2.0, 10, -1, false);
+  test_fixed_floating(cudf::binary_operator::GREATER, 2.0, 10, -1, true);
+  test_fixed_floating(cudf::binary_operator::LESS_EQUAL, 2.0, 20, -1, true);
+  test_fixed_floating(cudf::binary_operator::GREATER_EQUAL, 2.0, 30, -1, false);
+  test_fixed_floating(cudf::binary_operator::NULL_EQUALS, 1.0, 10, -1, true);
+  test_fixed_floating(cudf::binary_operator::NULL_NOT_EQUALS, 1.0, 10, -1, false);
+
+  // PRIMARY ARITHMETIC
+  auto const decimal_result = numeric::decimal32(4, numeric::scale_type{0});
+  test_fixed_floating(cudf::binary_operator::ADD, 1.0, 30, -1, decimal_result);
+  test_fixed_floating(cudf::binary_operator::SUB, 6.0, 20, -1, decimal_result);
+  test_fixed_floating(cudf::binary_operator::MUL, 2.0, 20, -1, decimal_result);
+  test_fixed_floating(cudf::binary_operator::DIV, 8.0, 2, 0, decimal_result);
+  test_fixed_floating(cudf::binary_operator::MOD, 9.0, 50, -1, decimal_result);
+
+  // OTHER ARITHMETIC
+  test_fixed_floating(cudf::binary_operator::NULL_MAX, 4.0, 20, -1, decimal_result);
+  test_fixed_floating(cudf::binary_operator::NULL_MIN, 4.0, 200, -1, decimal_result);
+}

From c40e0cc8dae8922c2633f5359609a1d063ae7f26 Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Fri, 28 Jun 2024 10:10:31 -0400
Subject: [PATCH 429/842] Add support for proxy `np.flatiter` objects (#16107)

Closes #15388

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/16107
---
 python/cudf/cudf/pandas/_wrappers/numpy.py        | 13 +++++++++++++
 python/cudf/cudf_pandas_tests/test_cudf_pandas.py | 10 ++++++++++
 2 files changed, 23 insertions(+)

diff --git a/python/cudf/cudf/pandas/_wrappers/numpy.py b/python/cudf/cudf/pandas/_wrappers/numpy.py
index c445be46f58..3b012169676 100644
--- a/python/cudf/cudf/pandas/_wrappers/numpy.py
+++ b/python/cudf/cudf/pandas/_wrappers/numpy.py
@@ -129,6 +129,19 @@ def wrap_ndarray(cls, arr: cupy.ndarray | numpy.ndarray, constructor):
     },
 )
 
+
+flatiter = make_final_proxy_type(
+    "flatiter",
+    cupy.flatiter,
+    numpy.flatiter,
+    fast_to_slow=lambda fast: cupy.asnumpy(fast.base).flat,
+    slow_to_fast=lambda slow: cupy.asarray(slow).flat,
+    additional_attributes={
+        "__array__": array_method,
+    },
+)
+
+
 # Mapping flags between slow and fast types
 _ndarray_flags = make_intermediate_proxy_type(
     "_ndarray_flags",
diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
index 0d46e2e9311..f51ce103677 100644
--- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
+++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
@@ -1535,6 +1535,16 @@ def test_is_proxy_object():
     assert not is_proxy_object(s2)
 
 
+def test_numpy_cupy_flatiter(series):
+    cp = pytest.importorskip("cupy")
+
+    _, s = series
+    arr = s.values
+
+    assert type(arr.flat._fsproxy_fast) == cp.flatiter
+    assert type(arr.flat._fsproxy_slow) == np.flatiter
+
+
 def test_arrow_string_arrays():
     cu_s = xpd.Series(["a", "b", "c"])
     pd_s = pd.Series(["a", "b", "c"])

From 565c0d1c3a08c9bd7eafa70278a8744097f8ef04 Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Fri, 28 Jun 2024 10:16:55 -0400
Subject: [PATCH 430/842] Migrate lists/contains to pylibcudf (#15981)

Part of #15162.

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15981
---
 cpp/include/cudf/lists/lists_column_view.hpp  |   3 +-
 python/cudf/cudf/_lib/lists.pyx               |  72 +++-------
 python/cudf/cudf/_lib/pylibcudf/column.pxd    |   4 +
 python/cudf/cudf/_lib/pylibcudf/column.pyx    |   9 ++
 .../_lib/pylibcudf/libcudf/lists/contains.pxd |  29 +++-
 .../libcudf/lists/lists_column_view.pxd       |   1 +
 python/cudf/cudf/_lib/pylibcudf/lists.pxd     |  10 ++
 python/cudf/cudf/_lib/pylibcudf/lists.pyx     | 124 +++++++++++++++++-
 .../cudf/cudf/pylibcudf_tests/test_lists.py   |  98 +++++++++++++-
 9 files changed, 281 insertions(+), 69 deletions(-)

diff --git a/cpp/include/cudf/lists/lists_column_view.hpp b/cpp/include/cudf/lists/lists_column_view.hpp
index 57a4f724c2d..3397cb0ca1d 100644
--- a/cpp/include/cudf/lists/lists_column_view.hpp
+++ b/cpp/include/cudf/lists/lists_column_view.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -38,6 +38,7 @@ namespace cudf {
  */
 class lists_column_view : private column_view {
  public:
+  lists_column_view() = default;
   /**
    * @brief Construct a new lists column view object from a column view.
    *
diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx
index 5d406f5c85f..0ad09dba717 100644
--- a/python/cudf/cudf/_lib/lists.pyx
+++ b/python/cudf/cudf/_lib/lists.pyx
@@ -9,10 +9,6 @@ from libcpp.utility cimport move
 from cudf._lib.column cimport Column
 from cudf._lib.pylibcudf.libcudf.column.column cimport column
 from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.lists.contains cimport (
-    contains,
-    index_of as cpp_index_of,
-)
 from cudf._lib.pylibcudf.libcudf.lists.count_elements cimport (
     count_elements as cpp_count_elements,
 )
@@ -26,7 +22,6 @@ from cudf._lib.pylibcudf.libcudf.lists.sorting cimport (
 from cudf._lib.pylibcudf.libcudf.lists.stream_compaction cimport (
     distinct as cpp_distinct,
 )
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
 from cudf._lib.pylibcudf.libcudf.types cimport (
     nan_equality,
     null_equality,
@@ -34,11 +29,12 @@ from cudf._lib.pylibcudf.libcudf.types cimport (
     order,
     size_type,
 )
-from cudf._lib.scalar cimport DeviceScalar
 from cudf._lib.utils cimport columns_from_pylibcudf_table
 
 from cudf._lib import pylibcudf
 
+from cudf._lib.pylibcudf cimport Scalar
+
 
 @acquire_spill_lock()
 def count_elements(Column col):
@@ -153,64 +149,36 @@ def extract_element_column(Column col, Column index):
 
 
 @acquire_spill_lock()
-def contains_scalar(Column col, object py_search_key):
-
-    cdef DeviceScalar search_key = py_search_key.device_value
-
-    cdef shared_ptr[lists_column_view] list_view = (
-        make_shared[lists_column_view](col.view())
+def contains_scalar(Column col, py_search_key):
+    return Column.from_pylibcudf(
+        pylibcudf.lists.contains(
+            col.to_pylibcudf(mode="read"),
+            <Scalar> py_search_key.device_value.c_value,
+        )
     )
-    cdef const scalar* search_key_value = search_key.get_raw_ptr()
-
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(contains(
-            list_view.get()[0],
-            search_key_value[0],
-        ))
-    result = Column.from_unique_ptr(move(c_result))
-    return result
 
 
 @acquire_spill_lock()
 def index_of_scalar(Column col, object py_search_key):
-
-    cdef DeviceScalar search_key = py_search_key.device_value
-
-    cdef shared_ptr[lists_column_view] list_view = (
-        make_shared[lists_column_view](col.view())
+    return Column.from_pylibcudf(
+        pylibcudf.lists.index_of(
+            col.to_pylibcudf(mode="read"),
+            <Scalar> py_search_key.device_value.c_value,
+            True,
+        )
     )
-    cdef const scalar* search_key_value = search_key.get_raw_ptr()
-
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(cpp_index_of(
-            list_view.get()[0],
-            search_key_value[0],
-        ))
-    return Column.from_unique_ptr(move(c_result))
 
 
 @acquire_spill_lock()
 def index_of_column(Column col, Column search_keys):
-
-    cdef column_view keys_view = search_keys.view()
-
-    cdef shared_ptr[lists_column_view] list_view = (
-        make_shared[lists_column_view](col.view())
+    return Column.from_pylibcudf(
+        pylibcudf.lists.index_of(
+            col.to_pylibcudf(mode="read"),
+            search_keys.to_pylibcudf(mode="read"),
+            True,
+        )
     )
 
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(cpp_index_of(
-            list_view.get()[0],
-            keys_view,
-        ))
-    return Column.from_unique_ptr(move(c_result))
-
 
 @acquire_spill_lock()
 def concatenate_rows(list source_columns):
diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pxd b/python/cudf/cudf/_lib/pylibcudf/column.pxd
index e121e856865..d13791d95cf 100644
--- a/python/cudf/cudf/_lib/pylibcudf/column.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/column.pxd
@@ -8,6 +8,9 @@ from cudf._lib.pylibcudf.libcudf.column.column_view cimport (
     column_view,
     mutable_column_view,
 )
+from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport (
+    lists_column_view,
+)
 from cudf._lib.pylibcudf.libcudf.types cimport bitmask_type, size_type
 
 from .gpumemoryview cimport gpumemoryview
@@ -56,3 +59,4 @@ cdef class ListColumnView:
     cdef Column _column
     cpdef child(self)
     cpdef offsets(self)
+    cdef lists_column_view view(self) nogil
diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pyx b/python/cudf/cudf/_lib/pylibcudf/column.pyx
index e726eca154f..e0cf8b7ee32 100644
--- a/python/cudf/cudf/_lib/pylibcudf/column.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/column.pyx
@@ -348,6 +348,15 @@ cdef class ListColumnView:
         """The offsets column of the underlying list column."""
         return self._column.child(1)
 
+    cdef lists_column_view view(self) nogil:
+        """Generate a libcudf lists_column_view to pass to libcudf algorithms.
+
+        This method is for pylibcudf's functions to use to generate inputs when
+        calling libcudf algorithms, and should generally not be needed by users
+        (even direct pylibcudf Cython users).
+        """
+        return lists_column_view(self._column.view())
+
 
 @functools.cache
 def _datatype_from_dtype_desc(desc):
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/contains.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/contains.pxd
index 721679f35c7..82aed7d70a0 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/contains.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/contains.pxd
@@ -1,5 +1,6 @@
 # Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
+from libc.stdint cimport int32_t
 from libcpp.memory cimport unique_ptr
 
 from cudf._lib.exception_handler cimport cudf_exception_handler
@@ -12,17 +13,33 @@ from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
 
 
 cdef extern from "cudf/lists/contains.hpp" namespace "cudf::lists" nogil:
+
+    cpdef enum class duplicate_find_option(int32_t):
+        FIND_FIRST
+        FIND_LAST
+
     cdef unique_ptr[column] contains(
-        lists_column_view lists,
-        scalar search_key,
+        const lists_column_view& lists,
+        const scalar& search_key,
+    ) except +cudf_exception_handler
+
+    cdef unique_ptr[column] contains(
+        const lists_column_view& lists,
+        const column_view& search_keys,
+    ) except +cudf_exception_handler
+
+    cdef unique_ptr[column] contains_nulls(
+        const lists_column_view& lists,
     ) except +cudf_exception_handler
 
     cdef unique_ptr[column] index_of(
-        lists_column_view lists,
-        scalar search_key,
+        const lists_column_view& lists,
+        const scalar& search_key,
+        duplicate_find_option find_option,
     ) except +cudf_exception_handler
 
     cdef unique_ptr[column] index_of(
-        lists_column_view lists,
-        column_view search_keys,
+        const lists_column_view& lists,
+        const column_view& search_keys,
+        duplicate_find_option find_option,
     ) except +cudf_exception_handler
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/lists_column_view.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/lists_column_view.pxd
index dbafc415e45..fd21e7b334b 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/lists_column_view.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/lists_column_view.pxd
@@ -9,6 +9,7 @@ from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 cdef extern from "cudf/lists/lists_column_view.hpp" namespace "cudf" nogil:
     cdef cppclass lists_column_view(column_view):
+        lists_column_view() except +
         lists_column_view(const column_view& lists_column) except +
         column_view parent() except +
         column_view offsets() except +
diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pxd b/python/cudf/cudf/_lib/pylibcudf/lists.pxd
index 2d2a5b2a9ea..2ccf0139e90 100644
--- a/python/cudf/cudf/_lib/pylibcudf/lists.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/lists.pxd
@@ -5,11 +5,21 @@ from libcpp cimport bool
 from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 from .column cimport Column
+from .scalar cimport Scalar
 from .table cimport Table
 
+ctypedef fused ColumnOrScalar:
+    Column
+    Scalar
 
 cpdef Table explode_outer(Table, size_type explode_column_idx)
 
 cpdef Column concatenate_rows(Table)
 
 cpdef Column concatenate_list_elements(Column, bool dropna)
+
+cpdef Column contains(Column, ColumnOrScalar)
+
+cpdef Column contains_nulls(Column)
+
+cpdef Column index_of(Column, ColumnOrScalar, bool)
diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pyx b/python/cudf/cudf/_lib/pylibcudf/lists.pyx
index 069c9da31c2..a94d940accd 100644
--- a/python/cudf/cudf/_lib/pylibcudf/lists.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/lists.pyx
@@ -1,11 +1,15 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
+from cython.operator cimport dereference
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 
 from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.lists cimport explode as cpp_explode
+from cudf._lib.pylibcudf.libcudf.lists cimport (
+    contains as cpp_contains,
+    explode as cpp_explode,
+)
 from cudf._lib.pylibcudf.libcudf.lists.combine cimport (
     concatenate_list_elements as cpp_concatenate_list_elements,
     concatenate_null_policy,
@@ -13,8 +17,10 @@ from cudf._lib.pylibcudf.libcudf.lists.combine cimport (
 )
 from cudf._lib.pylibcudf.libcudf.table.table cimport table
 from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from cudf._lib.pylibcudf.lists cimport ColumnOrScalar
 
-from .column cimport Column
+from .column cimport Column, ListColumnView
+from .scalar cimport Scalar
 from .table cimport Table
 
 
@@ -71,15 +77,15 @@ cpdef Column concatenate_list_elements(Column input, bool dropna):
     ----------
     input : Column
         The input column
+    dropna : bool
+        If true, null list elements will be ignored
+        from concatenation. Otherwise any input null values will result in
+        the corresponding output row being set to null.
 
     Returns
     -------
     Column
         A new Column of concatenated list elements
-    dropna : bool
-        If true, null list elements will be ignored
-        from concatenation. Otherwise any input null values will result in
-        the corresponding output row being set to null.
     """
     cdef concatenate_null_policy null_policy = (
         concatenate_null_policy.IGNORE if dropna
@@ -94,3 +100,109 @@ cpdef Column concatenate_list_elements(Column input, bool dropna):
         ))
 
     return Column.from_libcudf(move(c_result))
+
+
+cpdef Column contains(Column input, ColumnOrScalar search_key):
+    """Create a column of bool values indicating whether
+    the search_key is contained in the input.
+
+    ``search_key`` may be a
+    :py:class:`~cudf._lib.pylibcudf.column.Column` or a
+    :py:class:`~cudf._lib.pylibcudf.scalar.Scalar`.
+
+    For details, see :cpp:func:`contains`.
+
+    Parameters
+    ----------
+    input : Column
+        The input column.
+    search_key : Union[Column, Scalar]
+        The search key.
+
+    Returns
+    -------
+    Column
+        A new Column of bools indicating if the search_key was
+        found in the list column.
+    """
+    cdef unique_ptr[column] c_result
+    cdef ListColumnView list_view = input.list_view()
+
+    if not isinstance(search_key, (Column, Scalar)):
+        raise TypeError("Must pass a Column or Scalar")
+
+    with nogil:
+        c_result = move(cpp_contains.contains(
+            list_view.view(),
+            search_key.view() if ColumnOrScalar is Column else dereference(
+                search_key.get()
+            ),
+        ))
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column contains_nulls(Column input):
+    """Create a column of bool values indicating whether
+    each row in the lists column contains a null value.
+
+    Parameters
+    ----------
+    input : Column
+        The input column.
+
+    Returns
+    -------
+    Column
+        A new Column of bools indicating if the list column
+        contains a null value.
+    """
+    cdef unique_ptr[column] c_result
+    cdef ListColumnView list_view = input.list_view()
+    with nogil:
+        c_result = move(cpp_contains.contains_nulls(list_view.view()))
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column index_of(Column input, ColumnOrScalar search_key, bool find_first_option):
+    """Create a column of index values indicating the position of a search
+    key row within the corresponding list row in the lists column.
+
+    ``search_key`` may be a
+    :py:class:`~cudf._lib.pylibcudf.column.Column` or a
+    :py:class:`~cudf._lib.pylibcudf.scalar.Scalar`.
+
+    For details, see :cpp:func:`index_of`.
+
+    Parameters
+    ----------
+    input : Column
+        The input column.
+    search_key : Union[Column, Scalar]
+        The search key.
+    find_first_option : bool
+        If true, index_of returns the first match.
+        Otherwise the last match is returned.
+
+    Returns
+    -------
+    Column
+        A new Column of index values that indicate where in the
+        list column tthe search_key was found. An index value
+        of -1 indicates that the search_key was not found.
+    """
+    cdef unique_ptr[column] c_result
+    cdef ListColumnView list_view = input.list_view()
+    cdef cpp_contains.duplicate_find_option find_option = (
+        cpp_contains.duplicate_find_option.FIND_FIRST if find_first_option
+        else cpp_contains.duplicate_find_option.FIND_LAST
+    )
+
+    with nogil:
+        c_result = move(cpp_contains.index_of(
+            list_view.view(),
+            search_key.view() if ColumnOrScalar is Column else dereference(
+                search_key.get()
+            ),
+            find_option,
+        ))
+    return Column.from_libcudf(move(c_result))
diff --git a/python/cudf/cudf/pylibcudf_tests/test_lists.py b/python/cudf/cudf/pylibcudf_tests/test_lists.py
index b21af8ea11c..c781126e388 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_lists.py
+++ b/python/cudf/cudf/pylibcudf_tests/test_lists.py
@@ -7,15 +7,28 @@
 from cudf._lib import pylibcudf as plc
 
 
-def test_concatenate_rows():
-    test_data = [[[0, 1], [2], [5], [6, 7]], [[8], [9], [], [13, 14, 15]]]
+@pytest.fixture
+def test_data():
+    return [[[[0, 1], [2], [5], [6, 7]], [[8], [9], [], [13, 14, 15]]]]
 
-    arrow_tbl = pa.Table.from_arrays(test_data, names=["a", "b"])
+
+@pytest.fixture
+def scalar():
+    return pa.scalar(1)
+
+
+@pytest.fixture
+def column():
+    return pa.array([3, 2, 5, 6]), pa.array([-1, 0, 0, 0], type=pa.int32())
+
+
+def test_concatenate_rows(test_data):
+    arrow_tbl = pa.Table.from_arrays(test_data[0], names=["a", "b"])
     plc_tbl = plc.interop.from_arrow(arrow_tbl)
 
     res = plc.lists.concatenate_rows(plc_tbl)
 
-    expect = pa.array([pair[0] + pair[1] for pair in zip(*test_data)])
+    expect = pa.array([pair[0] + pair[1] for pair in zip(*test_data[0])])
 
     assert_column_eq(expect, res)
 
@@ -44,3 +57,80 @@ def test_concatenate_list_elements(test_data, dropna, expected):
     expect = pa.array(expected)
 
     assert_column_eq(expect, res)
+
+
+def test_contains_scalar(test_data, scalar):
+    list_column = test_data[0][0]
+    arr = pa.array(list_column)
+
+    plc_column = plc.interop.from_arrow(arr)
+    plc_scalar = plc.interop.from_arrow(scalar)
+    res = plc.lists.contains(plc_column, plc_scalar)
+
+    expect = pa.array([True, False, False, False])
+
+    assert_column_eq(expect, res)
+
+
+def test_contains_list_column(test_data):
+    list_column1 = test_data[0][0]
+    list_column2 = [1, 3, 5, 1]
+    arr1 = pa.array(list_column1)
+    arr2 = pa.array(list_column2)
+
+    plc_column1 = plc.interop.from_arrow(arr1)
+    plc_column2 = plc.interop.from_arrow(arr2)
+    res = plc.lists.contains(plc_column1, plc_column2)
+
+    expect = pa.array([True, False, True, False])
+
+    assert_column_eq(expect, res)
+
+
+@pytest.mark.parametrize(
+    "list_column, expected",
+    [
+        (
+            [[1, None], [1, 3, 4], [5, None]],
+            [True, False, True],
+        ),
+        (
+            [[1, None], None, [5]],
+            [True, None, False],
+        ),
+    ],
+)
+def test_contains_nulls(list_column, expected):
+    arr = pa.array(list_column)
+    plc_column = plc.interop.from_arrow(arr)
+    res = plc.lists.contains_nulls(plc_column)
+
+    expect = pa.array(expected)
+
+    assert_column_eq(expect, res)
+
+
+def test_index_of_scalar(test_data, scalar):
+    list_column = test_data[0][0]
+    arr = pa.array(list_column)
+
+    plc_column = plc.interop.from_arrow(arr)
+    plc_scalar = plc.interop.from_arrow(scalar)
+    res = plc.lists.index_of(plc_column, plc_scalar, True)
+
+    expect = pa.array([1, -1, -1, -1], type=pa.int32())
+
+    assert_column_eq(expect, res)
+
+
+def test_index_of_list_column(test_data, column):
+    list_column = test_data[0][0]
+    arr1 = pa.array(list_column)
+    arr2, expect = column
+    plc_column1 = plc.interop.from_arrow(arr1)
+    plc_column2 = plc.interop.from_arrow(arr2)
+    res = plc.lists.index_of(plc_column1, plc_column2, True)
+
+    expect = pa.array(column[1], type=pa.int32())
+
+    assert_column_eq(expect, res)

From e434fdbc546dd1810c750abdd086f07b694782b2 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Fri, 28 Jun 2024 10:57:01 -0400
Subject: [PATCH 431/842] Update libcudf compiler requirements in contributing
 doc (#16103)

Updates the compiler requirements in the contributing document.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Karthikeyan (https://github.com/karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/16103
---
 CONTRIBUTING.md | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 98c2ec0a22e..4fbc28fa6e1 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -71,15 +71,14 @@ for a minimal build of libcudf without using conda are also listed below.
 
 Compilers:
 
-* `gcc` version 9.3+
-* `nvcc` version 11.5+
-* `cmake` version 3.26.4+
+* `gcc` version 11.4+
+* `nvcc` version 11.8+
+* `cmake` version 3.29.6+
 
-CUDA/GPU:
+CUDA/GPU Runtime:
 
-* CUDA 11.5+
-* NVIDIA driver 450.80.02+
-* Volta architecture or better (Compute Capability >=7.0)
+* CUDA 11.4+
+* Volta architecture or better ([Compute Capability](https://docs.nvidia.com/deploy/cuda-compatibility/) >=7.0)
 
 You can obtain CUDA from
 [https://developer.nvidia.com/cuda-downloads](https://developer.nvidia.com/cuda-downloads).

From a4b951a6c140c05178edb61d8e28f51a4b430e15 Mon Sep 17 00:00:00 2001
From: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com>
Date: Fri, 28 Jun 2024 10:20:42 -0500
Subject: [PATCH 432/842] Templatization of fixed-width parquet decoding
 kernels. (#15911)

This PR merges all of the fixed-width parquet decoding kernels into a single templatized kernel that can be selectively instantiated with desired features (dictionary/no-dictionary, nested/non-nested, etc).  It also adds support for (non-list) nested columns in this path. So structs do not have to use the much slower general decode kernel any more.

A new benchmark was added specific to structs containing only fixed width columns.  I added this because the performance improvement is fairly high (+20%) but we don't see it in the normal struct benchmarks because they include (and are dominated by) string decode times.  The new benchmark shows:

Before this PR:
```
| data_type |    io_type    | cardinality | run_length | bytes_per_second | peak_memory_usage | encoded_file_size |
|-----------|---------------|-------------|------------|------------------|-------------------|-------------------|
|    STRUCT | DEVICE_BUFFER |           0 |          1 |      21071216823 |         1.047 GiB |       511.675 MiB |
|    STRUCT | DEVICE_BUFFER |        1000 |          1 |      18974392387 |       821.312 MiB |       128.884 MiB |
|    STRUCT | DEVICE_BUFFER |           0 |         32 |      20429356824 |      621.787 MiB  |        28.141 MiB |
|    STRUCT | DEVICE_BUFFER |        1000 |         32 |      20572327813 |       598.421 MiB |        16.475 MiB |
```

After this PR:

```
| data_type |    io_type    | cardinality | run_length | bytes_per_second | peak_memory_usage | encoded_file_size |
|-----------|---------------|-------------|------------|------------------|-------------------|-------------------|
|    STRUCT | DEVICE_BUFFER |           0 |          1 |      25805996399 |         1.047 GiB |       511.675 MiB |
|    STRUCT | DEVICE_BUFFER |        1000 |          1 |      22422306660 |       821.312 MiB |       128.884 MiB |
|    STRUCT | DEVICE_BUFFER |           0 |         32 |      24460694014 |       621.787 MiB |        28.141 MiB |
|    STRUCT | DEVICE_BUFFER |        1000 |         32 |      24674861214 |       598.421 MiB |        16.475 MiB |
```

Split-page decoding for fixed-width types + structs are also going through this new path. New test added.

This brings us closer to eliminating the "general" kernel.  The only things left that run through it are lists and booleans.

This is PR 1 of 2, with the followup moving a lot of code around.  At this point, I think it makes sense to start consolidating our files a bit.

I also left some breadcrumbs (a few small commented out code blocks) in the core kernel `gpuDecodePageDataGeneric` for the next step of adding list support. They can be removed if people don't like them.

Authors:
  - https://github.com/nvdbaranec

Approvers:
  - Mike Wilson (https://github.com/hyperbolic2346)
  - Vukasin Milovanovic (https://github.com/vuule)
  - Muhammad Haseeb (https://github.com/mhaseeb123)

URL: https://github.com/rapidsai/cudf/pull/15911
---
 .../io/parquet/parquet_reader_input.cpp       |  50 +-
 cpp/src/io/parquet/decode_fixed.cu            | 896 ++++++++++--------
 cpp/src/io/parquet/page_hdr.cu                |  16 +-
 cpp/src/io/parquet/parquet_gpu.hpp            |  46 +-
 cpp/src/io/parquet/reader_impl.cpp            |  57 +-
 cpp/tests/io/parquet_writer_test.cpp          |  97 +-
 6 files changed, 703 insertions(+), 459 deletions(-)

diff --git a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp
index 019e0f30fe9..7563c823454 100644
--- a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp
+++ b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -59,20 +59,18 @@ void parquet_read_common(cudf::size_type num_rows_to_read,
 }
 
 template <data_type DataType>
-void BM_parquet_read_data(nvbench::state& state, nvbench::type_list<nvbench::enum_type<DataType>>)
+void BM_parquet_read_data_common(nvbench::state& state,
+                                 data_profile const& profile,
+                                 nvbench::type_list<nvbench::enum_type<DataType>>)
 {
   auto const d_type      = get_type_or_group(static_cast<int32_t>(DataType));
-  auto const cardinality = static_cast<cudf::size_type>(state.get_int64("cardinality"));
-  auto const run_length  = static_cast<cudf::size_type>(state.get_int64("run_length"));
   auto const source_type = retrieve_io_type_enum(state.get_string("io_type"));
   auto const compression = cudf::io::compression_type::SNAPPY;
   cuio_source_sink_pair source_sink(source_type);
 
   auto const num_rows_written = [&]() {
-    auto const tbl = create_random_table(
-      cycle_dtypes(d_type, num_cols),
-      table_size_bytes{data_size},
-      data_profile_builder().cardinality(cardinality).avg_run_length(run_length));
+    auto const tbl =
+      create_random_table(cycle_dtypes(d_type, num_cols), table_size_bytes{data_size}, profile);
     auto const view = tbl->view();
 
     cudf::io::parquet_writer_options write_opts =
@@ -85,6 +83,32 @@ void BM_parquet_read_data(nvbench::state& state, nvbench::type_list<nvbench::enu
   parquet_read_common(num_rows_written, num_cols, source_sink, state);
 }
 
+template <data_type DataType>
+void BM_parquet_read_data(nvbench::state& state,
+                          nvbench::type_list<nvbench::enum_type<DataType>> type_list)
+{
+  auto const cardinality = static_cast<cudf::size_type>(state.get_int64("cardinality"));
+  auto const run_length  = static_cast<cudf::size_type>(state.get_int64("run_length"));
+  BM_parquet_read_data_common<DataType>(
+    state, data_profile_builder().cardinality(cardinality).avg_run_length(run_length), type_list);
+}
+
+template <data_type DataType>
+void BM_parquet_read_fixed_width_struct(nvbench::state& state,
+                                        nvbench::type_list<nvbench::enum_type<DataType>> type_list)
+{
+  auto const cardinality = static_cast<cudf::size_type>(state.get_int64("cardinality"));
+  auto const run_length  = static_cast<cudf::size_type>(state.get_int64("run_length"));
+  std::vector<cudf::type_id> s_types{
+    cudf::type_id::INT32, cudf::type_id::FLOAT32, cudf::type_id::INT64};
+  BM_parquet_read_data_common<DataType>(state,
+                                        data_profile_builder()
+                                          .cardinality(cardinality)
+                                          .avg_run_length(run_length)
+                                          .struct_types(s_types),
+                                        type_list);
+}
+
 void BM_parquet_read_io_compression(nvbench::state& state)
 {
   auto const d_type = get_type_or_group({static_cast<int32_t>(data_type::INTEGRAL),
@@ -247,3 +271,13 @@ NVBENCH_BENCH(BM_parquet_read_io_small_mixed)
   .add_int64_axis("cardinality", {0, 1000})
   .add_int64_axis("run_length", {1, 32})
   .add_int64_axis("num_string_cols", {1, 2, 3});
+
+// a benchmark for structs that only contain fixed-width types
+using d_type_list_struct_only = nvbench::enum_type_list<data_type::STRUCT>;
+NVBENCH_BENCH_TYPES(BM_parquet_read_fixed_width_struct, NVBENCH_TYPE_AXES(d_type_list_struct_only))
+  .set_name("parquet_read_fixed_width_struct")
+  .set_type_axes_names({"data_type"})
+  .add_string_axis("io_type", {"DEVICE_BUFFER"})
+  .set_min_samples(4)
+  .add_int64_axis("cardinality", {0, 1000})
+  .add_int64_axis("run_length", {1, 32});
diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu
index bfd89200786..ea80ae73c2f 100644
--- a/cpp/src/io/parquet/decode_fixed.cu
+++ b/cpp/src/io/parquet/decode_fixed.cu
@@ -24,136 +24,11 @@ namespace cudf::io::parquet::detail {
 
 namespace {
 
-constexpr int decode_block_size = 128;
-constexpr int rolling_buf_size  = decode_block_size * 2;
-// the required number of runs in shared memory we will need to provide the
-// rle_stream object
-constexpr int rle_run_buffer_size = rle_stream_required_run_buffer_size<decode_block_size>();
-
-template <bool nullable, typename level_t, typename state_buf>
-static __device__ int gpuUpdateValidityOffsetsAndRowIndicesFlat(
-  int32_t target_value_count, page_state_s* s, state_buf* sb, level_t const* const def, int t)
-{
-  constexpr int num_warps      = decode_block_size / cudf::detail::warp_size;
-  constexpr int max_batch_size = num_warps * cudf::detail::warp_size;
-
-  auto& ni = s->nesting_info[0];
-
-  // how many (input) values we've processed in the page so far
-  int value_count = s->input_value_count;
-  int valid_count = ni.valid_count;
-
-  // cap by last row so that we don't process any rows past what we want to output.
-  int const first_row                 = s->first_row;
-  int const last_row                  = first_row + s->num_rows;
-  int const capped_target_value_count = min(target_value_count, last_row);
-
-  int const valid_map_offset      = ni.valid_map_offset;
-  int const row_index_lower_bound = s->row_index_lower_bound;
-
-  __syncthreads();
-
-  while (value_count < capped_target_value_count) {
-    int const batch_size = min(max_batch_size, capped_target_value_count - value_count);
-
-    // definition level. only need to process for nullable columns
-    int d = 0;
-    if constexpr (nullable) {
-      d = t < batch_size
-            ? static_cast<int>(def[rolling_index<state_buf::nz_buf_size>(value_count + t)])
-            : -1;
-    }
-
-    int const thread_value_count = t + 1;
-    int const block_value_count  = batch_size;
-
-    // compute our row index, whether we're in row bounds, and validity
-    int const row_index     = (thread_value_count + value_count) - 1;
-    int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row);
-    int is_valid;
-    if constexpr (nullable) {
-      is_valid = ((d > 0) && in_row_bounds) ? 1 : 0;
-    } else {
-      is_valid = in_row_bounds;
-    }
-
-    // thread and block validity count
-    int thread_valid_count, block_valid_count;
-    if constexpr (nullable) {
-      using block_scan = cub::BlockScan<int, decode_block_size>;
-      __shared__ typename block_scan::TempStorage scan_storage;
-      block_scan(scan_storage).InclusiveSum(is_valid, thread_valid_count, block_valid_count);
-      __syncthreads();
-
-      // validity is processed per-warp
-      //
-      // nested schemas always read and write to the same bounds (that is, read and write
-      // positions are already pre-bounded by first_row/num_rows). flat schemas will start reading
-      // at the first value, even if that is before first_row, because we cannot trivially jump to
-      // the correct position to start reading. since we are about to write the validity vector
-      // here we need to adjust our computed mask to take into account the write row bounds.
-      int const in_write_row_bounds = ballot(row_index >= first_row && row_index < last_row);
-      int const write_start = __ffs(in_write_row_bounds) - 1;  // first bit in the warp to store
-      int warp_null_count   = 0;
-      if (write_start >= 0) {
-        uint32_t const warp_validity_mask = ballot(is_valid);
-        // lane 0 from each warp writes out validity
-        if ((t % cudf::detail::warp_size) == 0) {
-          int const vindex = (value_count + thread_value_count) - 1;  // absolute input value index
-          int const bit_offset = (valid_map_offset + vindex + write_start) -
-                                 first_row;  // absolute bit offset into the output validity map
-          int const write_end =
-            cudf::detail::warp_size - __clz(in_write_row_bounds);  // last bit in the warp to store
-          int const bit_count = write_end - write_start;
-          warp_null_count     = bit_count - __popc(warp_validity_mask >> write_start);
-
-          store_validity(bit_offset, ni.valid_map, warp_validity_mask >> write_start, bit_count);
-        }
-      }
-
-      // sum null counts. we have to do it this way instead of just incrementing by (value_count -
-      // valid_count) because valid_count also includes rows that potentially start before our row
-      // bounds. if we could come up with a way to clean that up, we could remove this and just
-      // compute it directly at the end of the kernel.
-      size_type const block_null_count =
-        cudf::detail::single_lane_block_sum_reduce<decode_block_size, 0>(warp_null_count);
-      if (t == 0) { ni.null_count += block_null_count; }
-    }
-    // trivial for non-nullable columns
-    else {
-      thread_valid_count = thread_value_count;
-      block_valid_count  = block_value_count;
-    }
-
-    // output offset
-    if (is_valid) {
-      int const dst_pos = (value_count + thread_value_count) - 1;
-      int const src_pos = (valid_count + thread_valid_count) - 1;
-      sb->nz_idx[rolling_index<state_buf::nz_buf_size>(src_pos)] = dst_pos;
-    }
-
-    // update stuff
-    value_count += block_value_count;
-    valid_count += block_valid_count;
-  }
-
-  if (t == 0) {
-    // update valid value count for decoding and total # of values we've processed
-    ni.valid_count       = valid_count;
-    ni.value_count       = value_count;
-    s->nz_count          = valid_count;
-    s->input_value_count = value_count;
-    s->input_row_count   = value_count;
-  }
-
-  return valid_count;
-}
-
-template <typename state_buf>
-__device__ inline void gpuDecodeValues(
+template <int block_size, typename state_buf>
+__device__ inline void gpuDecodeFixedWidthValues(
   page_state_s* s, state_buf* const sb, int start, int end, int t)
 {
-  constexpr int num_warps      = decode_block_size / cudf::detail::warp_size;
+  constexpr int num_warps      = block_size / cudf::detail::warp_size;
   constexpr int max_batch_size = num_warps * cudf::detail::warp_size;
 
   PageNestingDecodeInfo* nesting_info_base = s->nesting_info;
@@ -217,18 +92,22 @@ __device__ inline void gpuDecodeValues(
   }
 }
 
-template <typename state_buf>
-__device__ inline void gpuDecodeSplitValues(page_state_s* s,
-                                            state_buf* const sb,
-                                            int start,
-                                            int end)
+template <int block_size, typename state_buf>
+struct decode_fixed_width_values_func {
+  __device__ inline void operator()(page_state_s* s, state_buf* const sb, int start, int end, int t)
+  {
+    gpuDecodeFixedWidthValues<block_size, state_buf>(s, sb, start, end, t);
+  }
+};
+
+template <int block_size, typename state_buf>
+__device__ inline void gpuDecodeFixedWidthSplitValues(
+  page_state_s* s, state_buf* const sb, int start, int end, int t)
 {
   using cudf::detail::warp_size;
-  constexpr int num_warps      = decode_block_size / warp_size;
+  constexpr int num_warps      = block_size / warp_size;
   constexpr int max_batch_size = num_warps * warp_size;
 
-  auto const t = threadIdx.x;
-
   PageNestingDecodeInfo* nesting_info_base = s->nesting_info;
   int const dtype                          = s->col.physical_type;
   auto const data_len                      = thrust::distance(s->data_start, s->data_end);
@@ -307,266 +186,293 @@ __device__ inline void gpuDecodeSplitValues(page_state_s* s,
   }
 }
 
-// is the page marked nullable or not
-__device__ inline bool is_nullable(page_state_s* s)
-{
-  auto const lvl           = level_type::DEFINITION;
-  auto const max_def_level = s->col.max_level[lvl];
-  return max_def_level > 0;
-}
+template <int block_size, typename state_buf>
+struct decode_fixed_width_split_values_func {
+  __device__ inline void operator()(page_state_s* s, state_buf* const sb, int start, int end, int t)
+  {
+    gpuDecodeFixedWidthSplitValues<block_size, state_buf>(s, sb, start, end, t);
+  }
+};
 
-// for a nullable page, check to see if it could have nulls
-__device__ inline bool has_nulls(page_state_s* s)
+template <int decode_block_size, bool nullable, typename level_t, typename state_buf>
+static __device__ int gpuUpdateValidityAndRowIndicesNested(
+  int32_t target_value_count, page_state_s* s, state_buf* sb, level_t const* const def, int t)
 {
-  auto const lvl      = level_type::DEFINITION;
-  auto const init_run = s->initial_rle_run[lvl];
-  // literal runs, lets assume they could hold nulls
-  if (is_literal_run(init_run)) { return true; }
-
-  // repeated run with number of items in the run not equal
-  // to the rows in the page, assume that means we could have nulls
-  if (s->page.num_input_values != (init_run >> 1)) { return true; }
-
-  auto const lvl_bits = s->col.level_bits[lvl];
-  auto const run_val  = lvl_bits == 0 ? 0 : s->initial_rle_value[lvl];
-
-  // the encoded repeated value isn't valid, we have (all) nulls
-  return run_val != s->col.max_level[lvl];
-}
+  constexpr int num_warps      = decode_block_size / cudf::detail::warp_size;
+  constexpr int max_batch_size = num_warps * cudf::detail::warp_size;
 
-/**
- * @brief Kernel for computing fixed width non dictionary column data stored in the pages
- *
- * This function will write the page data and the page data's validity to the
- * output specified in the page's column chunk. If necessary, additional
- * conversion will be performed to translate from the Parquet datatype to
- * desired output datatype.
- *
- * @param pages List of pages
- * @param chunks List of column chunks
- * @param min_row Row index to start reading at
- * @param num_rows Maximum number of rows to read
- * @param error_code Error code to set if an error is encountered
- */
-template <typename level_t>
-CUDF_KERNEL void __launch_bounds__(decode_block_size)
-  gpuDecodePageDataFixed(PageInfo* pages,
-                         device_span<ColumnChunkDesc const> chunks,
-                         size_t min_row,
-                         size_t num_rows,
-                         kernel_error::pointer error_code)
-{
-  __shared__ __align__(16) page_state_s state_g;
-  __shared__ __align__(16) page_state_buffers_s<rolling_buf_size,  // size of nz_idx buffer
-                                                1,                 // unused in this kernel
-                                                1>                 // unused in this kernel
-    state_buffers;
+  // how many (input) values we've processed in the page so far
+  int value_count = s->input_value_count;
 
-  page_state_s* const s = &state_g;
-  auto* const sb        = &state_buffers;
-  int const page_idx    = blockIdx.x;
-  int const t           = threadIdx.x;
-  PageInfo* pp          = &pages[page_idx];
+  // cap by last row so that we don't process any rows past what we want to output.
+  int const first_row                 = s->first_row;
+  int const last_row                  = first_row + s->num_rows;
+  int const capped_target_value_count = min(target_value_count, last_row);
 
-  if (!(BitAnd(pages[page_idx].kernel_mask, decode_kernel_mask::FIXED_WIDTH_NO_DICT))) { return; }
+  int const row_index_lower_bound = s->row_index_lower_bound;
 
-  // must come after the kernel mask check
-  [[maybe_unused]] null_count_back_copier _{s, t};
+  int const max_depth = s->col.max_nesting_depth - 1;
+  __syncthreads();
 
-  if (!setupLocalPageInfo(s,
-                          pp,
-                          chunks,
-                          min_row,
-                          num_rows,
-                          mask_filter{decode_kernel_mask::FIXED_WIDTH_NO_DICT},
-                          page_processing_stage::DECODE)) {
-    return;
-  }
+  while (value_count < capped_target_value_count) {
+    int const batch_size = min(max_batch_size, capped_target_value_count - value_count);
 
-  // the level stream decoders
-  __shared__ rle_run<level_t> def_runs[rle_run_buffer_size];
-  rle_stream<level_t, decode_block_size, rolling_buf_size> def_decoder{def_runs};
+    // definition level. only need to process for nullable columns
+    int d = 0;
+    if constexpr (nullable) {
+      if (def) {
+        d = t < batch_size
+              ? static_cast<int>(def[rolling_index<state_buf::nz_buf_size>(value_count + t)])
+              : -1;
+      } else {
+        d = t < batch_size ? 1 : -1;
+      }
+    }
 
-  // if we have no work to do (eg, in a skip_rows/num_rows case) in this page.
-  if (s->num_rows == 0) { return; }
+    int const thread_value_count = t + 1;
+    int const block_value_count  = batch_size;
 
-  bool const nullable            = is_nullable(s);
-  bool const nullable_with_nulls = nullable && has_nulls(s);
+    // compute our row index, whether we're in row bounds, and validity
+    int const row_index           = (thread_value_count + value_count) - 1;
+    int const in_row_bounds       = (row_index >= row_index_lower_bound) && (row_index < last_row);
+    int const in_write_row_bounds = ballot(row_index >= first_row && row_index < last_row);
+    int const write_start = __ffs(in_write_row_bounds) - 1;  // first bit in the warp to store
+
+    // iterate by depth
+    for (int d_idx = 0; d_idx <= max_depth; d_idx++) {
+      auto& ni = s->nesting_info[d_idx];
+
+      int is_valid;
+      if constexpr (nullable) {
+        is_valid = ((d >= ni.max_def_level) && in_row_bounds) ? 1 : 0;
+      } else {
+        is_valid = in_row_bounds;
+      }
 
-  // initialize the stream decoders (requires values computed in setupLocalPageInfo)
-  level_t* const def = reinterpret_cast<level_t*>(pp->lvl_decode_buf[level_type::DEFINITION]);
-  if (nullable_with_nulls) {
-    def_decoder.init(s->col.level_bits[level_type::DEFINITION],
-                     s->abs_lvl_start[level_type::DEFINITION],
-                     s->abs_lvl_end[level_type::DEFINITION],
-                     def,
-                     s->page.num_input_values);
-  }
-  __syncthreads();
+      // thread and block validity count
+      int thread_valid_count, block_valid_count;
+      if constexpr (nullable) {
+        using block_scan = cub::BlockScan<int, decode_block_size>;
+        __shared__ typename block_scan::TempStorage scan_storage;
+        block_scan(scan_storage).InclusiveSum(is_valid, thread_valid_count, block_valid_count);
+        __syncthreads();
+
+        // validity is processed per-warp
+        //
+        // nested schemas always read and write to the same bounds (that is, read and write
+        // positions are already pre-bounded by first_row/num_rows). flat schemas will start reading
+        // at the first value, even if that is before first_row, because we cannot trivially jump to
+        // the correct position to start reading. since we are about to write the validity vector
+        // here we need to adjust our computed mask to take into account the write row bounds.
+        int warp_null_count = 0;
+        if (write_start >= 0 && ni.valid_map != nullptr) {
+          int const valid_map_offset        = ni.valid_map_offset;
+          uint32_t const warp_validity_mask = ballot(is_valid);
+          // lane 0 from each warp writes out validity
+          if ((t % cudf::detail::warp_size) == 0) {
+            int const vindex =
+              (value_count + thread_value_count) - 1;  // absolute input value index
+            int const bit_offset = (valid_map_offset + vindex + write_start) -
+                                   first_row;  // absolute bit offset into the output validity map
+            int const write_end = cudf::detail::warp_size -
+                                  __clz(in_write_row_bounds);  // last bit in the warp to store
+            int const bit_count = write_end - write_start;
+            warp_null_count     = bit_count - __popc(warp_validity_mask >> write_start);
+
+            store_validity(bit_offset, ni.valid_map, warp_validity_mask >> write_start, bit_count);
+          }
+        }
 
-  // We use two counters in the loop below: processed_count and valid_count.
-  // - processed_count: number of rows out of num_input_values that we have decoded so far.
-  //   the definition stream returns the number of total rows it has processed in each call
-  //   to decode_next and we accumulate in process_count.
-  // - valid_count: number of non-null rows we have decoded so far. In each iteration of the
-  //   loop below, we look at the number of valid items (which could be all for non-nullable),
-  //   and valid_count is that running count.
-  int processed_count = 0;
-  int valid_count     = 0;
-  // the core loop. decode batches of level stream data using rle_stream objects
-  // and pass the results to gpuDecodeValues
-  while (s->error == 0 && processed_count < s->page.num_input_values) {
-    int next_valid_count;
+        // sum null counts. we have to do it this way instead of just incrementing by (value_count -
+        // valid_count) because valid_count also includes rows that potentially start before our row
+        // bounds. if we could come up with a way to clean that up, we could remove this and just
+        // compute it directly at the end of the kernel.
+        size_type const block_null_count =
+          cudf::detail::single_lane_block_sum_reduce<decode_block_size, 0>(warp_null_count);
+        if (t == 0) { ni.null_count += block_null_count; }
+      }
+      // trivial for non-nullable columns
+      else {
+        thread_valid_count = thread_value_count;
+        block_valid_count  = block_value_count;
+      }
 
-    // only need to process definition levels if the column has nulls
-    if (nullable_with_nulls) {
-      processed_count += def_decoder.decode_next(t);
-      __syncthreads();
+      // if this is valid and we're at the leaf, output dst_pos
+      __syncthreads();  // handle modification of ni.value_count from below
+      if (is_valid && d_idx == max_depth) {
+        // for non-list types, the value count is always the same across
+        int const dst_pos = (value_count + thread_value_count) - 1;
+        int const src_pos = (ni.valid_count + thread_valid_count) - 1;
+        sb->nz_idx[rolling_index<state_buf::nz_buf_size>(src_pos)] = dst_pos;
+      }
+      __syncthreads();  // handle modification of ni.value_count from below
 
-      next_valid_count =
-        gpuUpdateValidityOffsetsAndRowIndicesFlat<true, level_t>(processed_count, s, sb, def, t);
+      // update stuff
+      if (t == 0) { ni.valid_count += block_valid_count; }
     }
-    // if we wanted to split off the skip_rows/num_rows case into a separate kernel, we could skip
-    // this function call entirely since all it will ever generate is a mapping of (i -> i) for
-    // nz_idx.  gpuDecodeValues would be the only work that happens.
-    else {
-      processed_count += min(rolling_buf_size, s->page.num_input_values - processed_count);
-      next_valid_count = gpuUpdateValidityOffsetsAndRowIndicesFlat<false, level_t>(
-        processed_count, s, sb, nullptr, t);
-    }
-    __syncthreads();
 
-    // decode the values themselves
-    gpuDecodeValues(s, sb, valid_count, next_valid_count, t);
-    __syncthreads();
+    value_count += block_value_count;
+  }
 
-    valid_count = next_valid_count;
+  if (t == 0) {
+    // update valid value count for decoding and total # of values we've processed
+    s->nz_count          = s->nesting_info[max_depth].valid_count;
+    s->input_value_count = value_count;
+    s->input_row_count   = value_count;
   }
-  if (t == 0 and s->error != 0) { set_error(s->error, error_code); }
+
+  __syncthreads();
+  return s->nesting_info[max_depth].valid_count;
 }
 
-/**
- * @brief Kernel for computing fixed width dictionary column data stored in the pages
- *
- * This function will write the page data and the page data's validity to the
- * output specified in the page's column chunk. If necessary, additional
- * conversion will be performed to translate from the Parquet datatype to
- * desired output datatype.
- *
- * @param pages List of pages
- * @param chunks List of column chunks
- * @param min_row Row index to start reading at
- * @param num_rows Maximum number of rows to read
- * @param error_code Error code to set if an error is encountered
- */
-template <typename level_t>
-CUDF_KERNEL void __launch_bounds__(decode_block_size)
-  gpuDecodePageDataFixedDict(PageInfo* pages,
-                             device_span<ColumnChunkDesc const> chunks,
-                             size_t min_row,
-                             size_t num_rows,
-                             kernel_error::pointer error_code)
+template <int decode_block_size, bool nullable, typename level_t, typename state_buf>
+static __device__ int gpuUpdateValidityAndRowIndicesFlat(
+  int32_t target_value_count, page_state_s* s, state_buf* sb, level_t const* const def, int t)
 {
-  __shared__ __align__(16) page_state_s state_g;
-  __shared__ __align__(16) page_state_buffers_s<rolling_buf_size,  // size of nz_idx buffer
-                                                rolling_buf_size,  // dictionary
-                                                1>                 // unused in this kernel
-    state_buffers;
-
-  page_state_s* const s = &state_g;
-  auto* const sb        = &state_buffers;
-  int const page_idx    = blockIdx.x;
-  int const t           = threadIdx.x;
-  PageInfo* pp          = &pages[page_idx];
+  constexpr int num_warps      = decode_block_size / cudf::detail::warp_size;
+  constexpr int max_batch_size = num_warps * cudf::detail::warp_size;
 
-  if (!(BitAnd(pages[page_idx].kernel_mask, decode_kernel_mask::FIXED_WIDTH_DICT))) { return; }
+  auto& ni = s->nesting_info[0];
 
-  // must come after the kernel mask check
-  [[maybe_unused]] null_count_back_copier _{s, t};
+  // how many (input) values we've processed in the page so far
+  int value_count = s->input_value_count;
+  int valid_count = ni.valid_count;
 
-  if (!setupLocalPageInfo(s,
-                          pp,
-                          chunks,
-                          min_row,
-                          num_rows,
-                          mask_filter{decode_kernel_mask::FIXED_WIDTH_DICT},
-                          page_processing_stage::DECODE)) {
-    return;
-  }
+  // cap by last row so that we don't process any rows past what we want to output.
+  int const first_row                 = s->first_row;
+  int const last_row                  = first_row + s->num_rows;
+  int const capped_target_value_count = min(target_value_count, last_row);
 
-  __shared__ rle_run<level_t> def_runs[rle_run_buffer_size];
-  rle_stream<level_t, decode_block_size, rolling_buf_size> def_decoder{def_runs};
+  int const valid_map_offset      = ni.valid_map_offset;
+  int const row_index_lower_bound = s->row_index_lower_bound;
 
-  __shared__ rle_run<uint32_t> dict_runs[rle_run_buffer_size];
-  rle_stream<uint32_t, decode_block_size, rolling_buf_size> dict_stream{dict_runs};
+  __syncthreads();
 
-  // if we have no work to do (eg, in a skip_rows/num_rows case) in this page.
-  if (s->num_rows == 0) { return; }
+  while (value_count < capped_target_value_count) {
+    int const batch_size = min(max_batch_size, capped_target_value_count - value_count);
 
-  bool const nullable            = is_nullable(s);
-  bool const nullable_with_nulls = nullable && has_nulls(s);
+    // definition level. only need to process for nullable columns
+    int d = 0;
+    if constexpr (nullable) {
+      if (def) {
+        d = t < batch_size
+              ? static_cast<int>(def[rolling_index<state_buf::nz_buf_size>(value_count + t)])
+              : -1;
+      } else {
+        d = t < batch_size ? 1 : -1;
+      }
+    }
 
-  // initialize the stream decoders (requires values computed in setupLocalPageInfo)
-  level_t* const def = reinterpret_cast<level_t*>(pp->lvl_decode_buf[level_type::DEFINITION]);
-  if (nullable_with_nulls) {
-    def_decoder.init(s->col.level_bits[level_type::DEFINITION],
-                     s->abs_lvl_start[level_type::DEFINITION],
-                     s->abs_lvl_end[level_type::DEFINITION],
-                     def,
-                     s->page.num_input_values);
-  }
+    int const thread_value_count = t + 1;
+    int const block_value_count  = batch_size;
 
-  dict_stream.init(
-    s->dict_bits, s->data_start, s->data_end, sb->dict_idx, s->page.num_input_values);
-  __syncthreads();
+    // compute our row index, whether we're in row bounds, and validity
+    int const row_index     = (thread_value_count + value_count) - 1;
+    int const in_row_bounds = (row_index >= row_index_lower_bound) && (row_index < last_row);
+    int is_valid;
+    if constexpr (nullable) {
+      is_valid = ((d > 0) && in_row_bounds) ? 1 : 0;
+    } else {
+      is_valid = in_row_bounds;
+    }
 
-  // We use two counters in the loop below: processed_count and valid_count.
-  // - processed_count: number of rows out of num_input_values that we have decoded so far.
-  //   the definition stream returns the number of total rows it has processed in each call
-  //   to decode_next and we accumulate in process_count.
-  // - valid_count: number of non-null rows we have decoded so far. In each iteration of the
-  //   loop below, we look at the number of valid items (which could be all for non-nullable),
-  //   and valid_count is that running count.
-  int processed_count = 0;
-  int valid_count     = 0;
+    // thread and block validity count
+    int thread_valid_count, block_valid_count;
+    if constexpr (nullable) {
+      using block_scan = cub::BlockScan<int, decode_block_size>;
+      __shared__ typename block_scan::TempStorage scan_storage;
+      block_scan(scan_storage).InclusiveSum(is_valid, thread_valid_count, block_valid_count);
+      __syncthreads();
 
-  // the core loop. decode batches of level stream data using rle_stream objects
-  // and pass the results to gpuDecodeValues
-  while (s->error == 0 && processed_count < s->page.num_input_values) {
-    int next_valid_count;
+      // validity is processed per-warp
+      //
+      // nested schemas always read and write to the same bounds (that is, read and write
+      // positions are already pre-bounded by first_row/num_rows). flat schemas will start reading
+      // at the first value, even if that is before first_row, because we cannot trivially jump to
+      // the correct position to start reading. since we are about to write the validity vector
+      // here we need to adjust our computed mask to take into account the write row bounds.
+      int const in_write_row_bounds = ballot(row_index >= first_row && row_index < last_row);
+      int const write_start = __ffs(in_write_row_bounds) - 1;  // first bit in the warp to store
+      int warp_null_count   = 0;
+      if (write_start >= 0) {
+        uint32_t const warp_validity_mask = ballot(is_valid);
+        // lane 0 from each warp writes out validity
+        if ((t % cudf::detail::warp_size) == 0) {
+          int const vindex = (value_count + thread_value_count) - 1;  // absolute input value index
+          int const bit_offset = (valid_map_offset + vindex + write_start) -
+                                 first_row;  // absolute bit offset into the output validity map
+          int const write_end =
+            cudf::detail::warp_size - __clz(in_write_row_bounds);  // last bit in the warp to store
+          int const bit_count = write_end - write_start;
+          warp_null_count     = bit_count - __popc(warp_validity_mask >> write_start);
 
-    // only need to process definition levels if the column has nulls
-    if (nullable_with_nulls) {
-      processed_count += def_decoder.decode_next(t);
-      __syncthreads();
+          store_validity(bit_offset, ni.valid_map, warp_validity_mask >> write_start, bit_count);
+        }
+      }
 
-      // count of valid items in this batch
-      next_valid_count =
-        gpuUpdateValidityOffsetsAndRowIndicesFlat<true, level_t>(processed_count, s, sb, def, t);
+      // sum null counts. we have to do it this way instead of just incrementing by (value_count -
+      // valid_count) because valid_count also includes rows that potentially start before our row
+      // bounds. if we could come up with a way to clean that up, we could remove this and just
+      // compute it directly at the end of the kernel.
+      size_type const block_null_count =
+        cudf::detail::single_lane_block_sum_reduce<decode_block_size, 0>(warp_null_count);
+      if (t == 0) { ni.null_count += block_null_count; }
     }
-    // if we wanted to split off the skip_rows/num_rows case into a separate kernel, we could skip
-    // this function call entirely since all it will ever generate is a mapping of (i -> i) for
-    // nz_idx.  gpuDecodeValues would be the only work that happens.
+    // trivial for non-nullable columns
     else {
-      processed_count += min(rolling_buf_size, s->page.num_input_values - processed_count);
-      next_valid_count = gpuUpdateValidityOffsetsAndRowIndicesFlat<false, level_t>(
-        processed_count, s, sb, nullptr, t);
+      thread_valid_count = thread_value_count;
+      block_valid_count  = block_value_count;
     }
-    __syncthreads();
 
-    // We want to limit the number of dictionary items we decode, that correspond to
-    // the rows we have processed in this iteration that are valid.
-    // We know the number of valid rows to process with: next_valid_count - valid_count.
-    dict_stream.decode_next(t, next_valid_count - valid_count);
-    __syncthreads();
+    // output offset
+    if (is_valid) {
+      int const dst_pos = (value_count + thread_value_count) - 1;
+      int const src_pos = (valid_count + thread_valid_count) - 1;
+      sb->nz_idx[rolling_index<state_buf::nz_buf_size>(src_pos)] = dst_pos;
+    }
 
-    // decode the values themselves
-    gpuDecodeValues(s, sb, valid_count, next_valid_count, t);
-    __syncthreads();
+    // update stuff
+    value_count += block_value_count;
+    valid_count += block_valid_count;
+  }
 
-    valid_count = next_valid_count;
+  if (t == 0) {
+    // update valid value count for decoding and total # of values we've processed
+    ni.valid_count       = valid_count;
+    ni.value_count       = value_count;  // TODO: remove? this is unused in the non-list path
+    s->nz_count          = valid_count;
+    s->input_value_count = value_count;
+    s->input_row_count   = value_count;
   }
-  if (t == 0 and s->error != 0) { set_error(s->error, error_code); }
+
+  return valid_count;
+}
+
+// is the page marked nullable or not
+__device__ inline bool is_nullable(page_state_s* s)
+{
+  auto const lvl           = level_type::DEFINITION;
+  auto const max_def_level = s->col.max_level[lvl];
+  return max_def_level > 0;
+}
+
+// for a nullable page, check to see if it could have nulls
+__device__ inline bool maybe_has_nulls(page_state_s* s)
+{
+  auto const lvl      = level_type::DEFINITION;
+  auto const init_run = s->initial_rle_run[lvl];
+  // literal runs, lets assume they could hold nulls
+  if (is_literal_run(init_run)) { return true; }
+
+  // repeated run with number of items in the run not equal
+  // to the rows in the page, assume that means we could have nulls
+  if (s->page.num_input_values != (init_run >> 1)) { return true; }
+
+  auto const lvl_bits = s->col.level_bits[lvl];
+  auto const run_val  = lvl_bits == 0 ? 0 : s->initial_rle_value[lvl];
+
+  // the encoded repeated value isn't valid, we have (all) nulls
+  return run_val != s->col.max_level[lvl];
 }
 
 /**
@@ -583,19 +489,28 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size)
  * @param num_rows Maximum number of rows to read
  * @param error_code Error code to set if an error is encountered
  */
-template <typename level_t>
-CUDF_KERNEL void __launch_bounds__(decode_block_size)
-  gpuDecodeSplitPageDataFlat(PageInfo* pages,
-                             device_span<ColumnChunkDesc const> chunks,
-                             size_t min_row,
-                             size_t num_rows,
-                             kernel_error::pointer error_code)
+template <typename level_t,
+          int decode_block_size_t,
+          decode_kernel_mask kernel_mask_t,
+          bool has_dict_t,
+          bool has_nesting_t,
+          template <int block_size, typename state_buf>
+          typename DecodeValuesFunc>
+CUDF_KERNEL void __launch_bounds__(decode_block_size_t)
+  gpuDecodePageDataGeneric(PageInfo* pages,
+                           device_span<ColumnChunkDesc const> chunks,
+                           size_t min_row,
+                           size_t num_rows,
+                           kernel_error::pointer error_code)
 {
+  constexpr int rolling_buf_size    = decode_block_size_t * 2;
+  constexpr int rle_run_buffer_size = rle_stream_required_run_buffer_size<decode_block_size_t>();
+
   __shared__ __align__(16) page_state_s state_g;
-  __shared__ __align__(16) page_state_buffers_s<rolling_buf_size,  // size of nz_idx buffer
-                                                1,                 // unused in this kernel
-                                                1>                 // unused in this kernel
-    state_buffers;
+  using state_buf_t = page_state_buffers_s<rolling_buf_size,  // size of nz_idx buffer
+                                           has_dict_t ? rolling_buf_size : 1,
+                                           1>;
+  __shared__ __align__(16) state_buf_t state_buffers;
 
   page_state_s* const s = &state_g;
   auto* const sb        = &state_buffers;
@@ -603,9 +518,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size)
   int const t           = threadIdx.x;
   PageInfo* pp          = &pages[page_idx];
 
-  if (!(BitAnd(pages[page_idx].kernel_mask, decode_kernel_mask::BYTE_STREAM_SPLIT_FLAT))) {
-    return;
-  }
+  if (!(BitAnd(pages[page_idx].kernel_mask, kernel_mask_t))) { return; }
 
   // must come after the kernel mask check
   [[maybe_unused]] null_count_back_copier _{s, t};
@@ -615,30 +528,70 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size)
                           chunks,
                           min_row,
                           num_rows,
-                          mask_filter{decode_kernel_mask::BYTE_STREAM_SPLIT_FLAT},
+                          mask_filter{kernel_mask_t},
                           page_processing_stage::DECODE)) {
     return;
   }
 
-  // the level stream decoders
-  __shared__ rle_run<level_t> def_runs[rle_run_buffer_size];
-  rle_stream<level_t, decode_block_size, rolling_buf_size> def_decoder{def_runs};
-
   // if we have no work to do (eg, in a skip_rows/num_rows case) in this page.
   if (s->num_rows == 0) { return; }
 
-  bool const nullable            = is_nullable(s);
-  bool const nullable_with_nulls = nullable && has_nulls(s);
+  DecodeValuesFunc<decode_block_size_t, state_buf_t> decode_values;
+
+  bool const nullable             = is_nullable(s);
+  bool const should_process_nulls = nullable && maybe_has_nulls(s);
+
+  // shared buffer. all shared memory is suballocated out of here
+  // constexpr int shared_rep_size = has_lists_t ? cudf::util::round_up_unsafe(rle_run_buffer_size *
+  // sizeof(rle_run<level_t>), size_t{16}) : 0;
+  constexpr int shared_dict_size =
+    has_dict_t
+      ? cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run<uint32_t>), size_t{16})
+      : 0;
+  constexpr int shared_def_size =
+    cudf::util::round_up_unsafe(rle_run_buffer_size * sizeof(rle_run<level_t>), size_t{16});
+  constexpr int shared_buf_size = /*shared_rep_size +*/ shared_dict_size + shared_def_size;
+  __shared__ __align__(16) uint8_t shared_buf[shared_buf_size];
+
+  // setup all shared memory buffers
+  int shared_offset = 0;
+  /*
+  rle_run<level_t> *rep_runs = reinterpret_cast<rle_run<level_t>*>(shared_buf + shared_offset);
+  if constexpr (has_lists_t){
+    shared_offset += shared_rep_size;
+  }
+  */
+  rle_run<uint32_t>* dict_runs = reinterpret_cast<rle_run<uint32_t>*>(shared_buf + shared_offset);
+  if constexpr (has_dict_t) { shared_offset += shared_dict_size; }
+  rle_run<level_t>* def_runs = reinterpret_cast<rle_run<level_t>*>(shared_buf + shared_offset);
 
   // initialize the stream decoders (requires values computed in setupLocalPageInfo)
+  rle_stream<level_t, decode_block_size_t, rolling_buf_size> def_decoder{def_runs};
   level_t* const def = reinterpret_cast<level_t*>(pp->lvl_decode_buf[level_type::DEFINITION]);
-  if (nullable_with_nulls) {
+  if (should_process_nulls) {
     def_decoder.init(s->col.level_bits[level_type::DEFINITION],
                      s->abs_lvl_start[level_type::DEFINITION],
                      s->abs_lvl_end[level_type::DEFINITION],
                      def,
                      s->page.num_input_values);
   }
+  /*
+  rle_stream<level_t, decode_block_size_t, rolling_buf_size> rep_decoder{rep_runs};
+  level_t* const rep = reinterpret_cast<level_t*>(pp->lvl_decode_buf[level_type::REPETITION]);
+  if constexpr(has_lists_t){
+    rep_decoder.init(s->col.level_bits[level_type::REPETITION],
+                     s->abs_lvl_start[level_type::REPETITION],
+                     s->abs_lvl_end[level_type::REPETITION],
+                     rep,
+                     s->page.num_input_values);
+  }
+  */
+
+  rle_stream<uint32_t, decode_block_size_t, rolling_buf_size> dict_stream{dict_runs};
+  if constexpr (has_dict_t) {
+    dict_stream.init(
+      s->dict_bits, s->data_start, s->data_end, sb->dict_idx, s->page.num_input_values);
+  }
   __syncthreads();
 
   // We use two counters in the loop below: processed_count and valid_count.
@@ -655,26 +608,47 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size)
   while (s->error == 0 && processed_count < s->page.num_input_values) {
     int next_valid_count;
 
-    // only need to process definition levels if the column has nulls
-    if (nullable_with_nulls) {
+    // only need to process definition levels if this is a nullable column
+    if (should_process_nulls) {
       processed_count += def_decoder.decode_next(t);
       __syncthreads();
 
-      next_valid_count =
-        gpuUpdateValidityOffsetsAndRowIndicesFlat<true, level_t>(processed_count, s, sb, def, t);
+      if constexpr (has_nesting_t) {
+        next_valid_count = gpuUpdateValidityAndRowIndicesNested<decode_block_size_t, true, level_t>(
+          processed_count, s, sb, def, t);
+      } else {
+        next_valid_count = gpuUpdateValidityAndRowIndicesFlat<decode_block_size_t, true, level_t>(
+          processed_count, s, sb, def, t);
+      }
     }
     // if we wanted to split off the skip_rows/num_rows case into a separate kernel, we could skip
     // this function call entirely since all it will ever generate is a mapping of (i -> i) for
-    // nz_idx.  gpuDecodeValues would be the only work that happens.
+    // nz_idx.  gpuDecodeFixedWidthValues would be the only work that happens.
     else {
       processed_count += min(rolling_buf_size, s->page.num_input_values - processed_count);
-      next_valid_count = gpuUpdateValidityOffsetsAndRowIndicesFlat<false, level_t>(
-        processed_count, s, sb, nullptr, t);
+
+      if constexpr (has_nesting_t) {
+        next_valid_count =
+          gpuUpdateValidityAndRowIndicesNested<decode_block_size_t, false, level_t>(
+            processed_count, s, sb, nullptr, t);
+      } else {
+        next_valid_count = gpuUpdateValidityAndRowIndicesFlat<decode_block_size_t, false, level_t>(
+          processed_count, s, sb, nullptr, t);
+      }
     }
     __syncthreads();
 
+    // if we have dictionary data
+    if constexpr (has_dict_t) {
+      // We want to limit the number of dictionary items we decode, that correspond to
+      // the rows we have processed in this iteration that are valid.
+      // We know the number of valid rows to process with: next_valid_count - valid_count.
+      dict_stream.decode_next(t, next_valid_count - valid_count);
+      __syncthreads();
+    }
+
     // decode the values themselves
-    gpuDecodeSplitValues(s, sb, valid_count, next_valid_count);
+    decode_values(s, sb, valid_count, next_valid_count, t);
     __syncthreads();
 
     valid_count = next_valid_count;
@@ -689,18 +663,55 @@ void __host__ DecodePageDataFixed(cudf::detail::hostdevice_span<PageInfo> pages,
                                   size_t num_rows,
                                   size_t min_row,
                                   int level_type_size,
+                                  bool has_nesting,
                                   kernel_error::pointer error_code,
                                   rmm::cuda_stream_view stream)
 {
+  constexpr int decode_block_size = 128;
+
   dim3 dim_block(decode_block_size, 1);
   dim3 dim_grid(pages.size(), 1);  // 1 threadblock per page
 
   if (level_type_size == 1) {
-    gpuDecodePageDataFixed<uint8_t><<<dim_grid, dim_block, 0, stream.value()>>>(
-      pages.device_ptr(), chunks, min_row, num_rows, error_code);
+    if (has_nesting) {
+      gpuDecodePageDataGeneric<uint8_t,
+                               decode_block_size,
+                               decode_kernel_mask::FIXED_WIDTH_NO_DICT_NESTED,
+                               false,
+                               true,
+                               decode_fixed_width_values_func>
+        <<<dim_grid, dim_block, 0, stream.value()>>>(
+          pages.device_ptr(), chunks, min_row, num_rows, error_code);
+    } else {
+      gpuDecodePageDataGeneric<uint8_t,
+                               decode_block_size,
+                               decode_kernel_mask::FIXED_WIDTH_NO_DICT,
+                               false,
+                               false,
+                               decode_fixed_width_values_func>
+        <<<dim_grid, dim_block, 0, stream.value()>>>(
+          pages.device_ptr(), chunks, min_row, num_rows, error_code);
+    }
   } else {
-    gpuDecodePageDataFixed<uint16_t><<<dim_grid, dim_block, 0, stream.value()>>>(
-      pages.device_ptr(), chunks, min_row, num_rows, error_code);
+    if (has_nesting) {
+      gpuDecodePageDataGeneric<uint16_t,
+                               decode_block_size,
+                               decode_kernel_mask::FIXED_WIDTH_NO_DICT_NESTED,
+                               false,
+                               true,
+                               decode_fixed_width_values_func>
+        <<<dim_grid, dim_block, 0, stream.value()>>>(
+          pages.device_ptr(), chunks, min_row, num_rows, error_code);
+    } else {
+      gpuDecodePageDataGeneric<uint16_t,
+                               decode_block_size,
+                               decode_kernel_mask::FIXED_WIDTH_NO_DICT,
+                               false,
+                               false,
+                               decode_fixed_width_values_func>
+        <<<dim_grid, dim_block, 0, stream.value()>>>(
+          pages.device_ptr(), chunks, min_row, num_rows, error_code);
+    }
   }
 }
 
@@ -709,40 +720,113 @@ void __host__ DecodePageDataFixedDict(cudf::detail::hostdevice_span<PageInfo> pa
                                       size_t num_rows,
                                       size_t min_row,
                                       int level_type_size,
+                                      bool has_nesting,
                                       kernel_error::pointer error_code,
                                       rmm::cuda_stream_view stream)
 {
-  //  dim3 dim_block(decode_block_size, 1); // decode_block_size = 128 threads per block
-  // 1 full warp, and 1 warp of 1 thread
+  constexpr int decode_block_size = 128;
+
   dim3 dim_block(decode_block_size, 1);  // decode_block_size = 128 threads per block
   dim3 dim_grid(pages.size(), 1);        // 1 thread block per page => # blocks
 
   if (level_type_size == 1) {
-    gpuDecodePageDataFixedDict<uint8_t><<<dim_grid, dim_block, 0, stream.value()>>>(
-      pages.device_ptr(), chunks, min_row, num_rows, error_code);
+    if (has_nesting) {
+      gpuDecodePageDataGeneric<uint8_t,
+                               decode_block_size,
+                               decode_kernel_mask::FIXED_WIDTH_DICT_NESTED,
+                               true,
+                               true,
+                               decode_fixed_width_values_func>
+        <<<dim_grid, dim_block, 0, stream.value()>>>(
+          pages.device_ptr(), chunks, min_row, num_rows, error_code);
+    } else {
+      gpuDecodePageDataGeneric<uint8_t,
+                               decode_block_size,
+                               decode_kernel_mask::FIXED_WIDTH_DICT,
+                               true,
+                               false,
+                               decode_fixed_width_values_func>
+        <<<dim_grid, dim_block, 0, stream.value()>>>(
+          pages.device_ptr(), chunks, min_row, num_rows, error_code);
+    }
   } else {
-    gpuDecodePageDataFixedDict<uint16_t><<<dim_grid, dim_block, 0, stream.value()>>>(
-      pages.device_ptr(), chunks, min_row, num_rows, error_code);
+    if (has_nesting) {
+      gpuDecodePageDataGeneric<uint16_t,
+                               decode_block_size,
+                               decode_kernel_mask::FIXED_WIDTH_DICT_NESTED,
+                               true,
+                               true,
+                               decode_fixed_width_values_func>
+        <<<dim_grid, dim_block, 0, stream.value()>>>(
+          pages.device_ptr(), chunks, min_row, num_rows, error_code);
+    } else {
+      gpuDecodePageDataGeneric<uint16_t,
+                               decode_block_size,
+                               decode_kernel_mask::FIXED_WIDTH_DICT,
+                               true,
+                               false,
+                               decode_fixed_width_values_func>
+        <<<dim_grid, dim_block, 0, stream.value()>>>(
+          pages.device_ptr(), chunks, min_row, num_rows, error_code);
+    }
   }
 }
 
-void __host__ DecodeSplitPageDataFlat(cudf::detail::hostdevice_span<PageInfo> pages,
-                                      cudf::detail::hostdevice_span<ColumnChunkDesc const> chunks,
-                                      size_t num_rows,
-                                      size_t min_row,
-                                      int level_type_size,
-                                      kernel_error::pointer error_code,
-                                      rmm::cuda_stream_view stream)
+void __host__
+DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span<PageInfo> pages,
+                              cudf::detail::hostdevice_span<ColumnChunkDesc const> chunks,
+                              size_t num_rows,
+                              size_t min_row,
+                              int level_type_size,
+                              bool has_nesting,
+                              kernel_error::pointer error_code,
+                              rmm::cuda_stream_view stream)
 {
+  constexpr int decode_block_size = 128;
+
   dim3 dim_block(decode_block_size, 1);  // decode_block_size = 128 threads per block
   dim3 dim_grid(pages.size(), 1);        // 1 thread block per page => # blocks
 
   if (level_type_size == 1) {
-    gpuDecodeSplitPageDataFlat<uint8_t><<<dim_grid, dim_block, 0, stream.value()>>>(
-      pages.device_ptr(), chunks, min_row, num_rows, error_code);
+    if (has_nesting) {
+      gpuDecodePageDataGeneric<uint8_t,
+                               decode_block_size,
+                               decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED,
+                               true,
+                               true,
+                               decode_fixed_width_split_values_func>
+        <<<dim_grid, dim_block, 0, stream.value()>>>(
+          pages.device_ptr(), chunks, min_row, num_rows, error_code);
+    } else {
+      gpuDecodePageDataGeneric<uint8_t,
+                               decode_block_size,
+                               decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT,
+                               true,
+                               false,
+                               decode_fixed_width_split_values_func>
+        <<<dim_grid, dim_block, 0, stream.value()>>>(
+          pages.device_ptr(), chunks, min_row, num_rows, error_code);
+    }
   } else {
-    gpuDecodeSplitPageDataFlat<uint16_t><<<dim_grid, dim_block, 0, stream.value()>>>(
-      pages.device_ptr(), chunks, min_row, num_rows, error_code);
+    if (has_nesting) {
+      gpuDecodePageDataGeneric<uint16_t,
+                               decode_block_size,
+                               decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED,
+                               true,
+                               true,
+                               decode_fixed_width_split_values_func>
+        <<<dim_grid, dim_block, 0, stream.value()>>>(
+          pages.device_ptr(), chunks, min_row, num_rows, error_code);
+    } else {
+      gpuDecodePageDataGeneric<uint16_t,
+                               decode_block_size,
+                               decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT,
+                               true,
+                               false,
+                               decode_fixed_width_split_values_func>
+        <<<dim_grid, dim_block, 0, stream.value()>>>(
+          pages.device_ptr(), chunks, min_row, num_rows, error_code);
+    }
   }
 }
 
diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu
index cf0dd85e490..d604642be54 100644
--- a/cpp/src/io/parquet/page_hdr.cu
+++ b/cpp/src/io/parquet/page_hdr.cu
@@ -145,6 +145,11 @@ __device__ inline bool is_nested(ColumnChunkDesc const& chunk)
   return chunk.max_nesting_depth > 1;
 }
 
+__device__ inline bool is_list(ColumnChunkDesc const& chunk)
+{
+  return chunk.max_level[level_type::REPETITION] > 0;
+}
+
 __device__ inline bool is_byte_array(ColumnChunkDesc const& chunk)
 {
   return chunk.physical_type == BYTE_ARRAY;
@@ -178,14 +183,17 @@ __device__ decode_kernel_mask kernel_mask_for_page(PageInfo const& page,
     return decode_kernel_mask::STRING;
   }
 
-  if (!is_nested(chunk) && !is_byte_array(chunk) && !is_boolean(chunk)) {
+  if (!is_list(chunk) && !is_byte_array(chunk) && !is_boolean(chunk)) {
     if (page.encoding == Encoding::PLAIN) {
-      return decode_kernel_mask::FIXED_WIDTH_NO_DICT;
+      return is_nested(chunk) ? decode_kernel_mask::FIXED_WIDTH_NO_DICT_NESTED
+                              : decode_kernel_mask::FIXED_WIDTH_NO_DICT;
     } else if (page.encoding == Encoding::PLAIN_DICTIONARY ||
                page.encoding == Encoding::RLE_DICTIONARY) {
-      return decode_kernel_mask::FIXED_WIDTH_DICT;
+      return is_nested(chunk) ? decode_kernel_mask::FIXED_WIDTH_DICT_NESTED
+                              : decode_kernel_mask::FIXED_WIDTH_DICT;
     } else if (page.encoding == Encoding::BYTE_STREAM_SPLIT) {
-      return decode_kernel_mask::BYTE_STREAM_SPLIT_FLAT;
+      return is_nested(chunk) ? decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED
+                              : decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT;
     }
   }
 
diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp
index d82c6f0de59..efc1f5ebab1 100644
--- a/cpp/src/io/parquet/parquet_gpu.hpp
+++ b/cpp/src/io/parquet/parquet_gpu.hpp
@@ -207,16 +207,20 @@ enum level_type {
  * Used to control which decode kernels to run.
  */
 enum class decode_kernel_mask {
-  NONE                   = 0,
-  GENERAL                = (1 << 0),  // Run catch-all decode kernel
-  STRING                 = (1 << 1),  // Run decode kernel for string data
-  DELTA_BINARY           = (1 << 2),  // Run decode kernel for DELTA_BINARY_PACKED data
-  DELTA_BYTE_ARRAY       = (1 << 3),  // Run decode kernel for DELTA_BYTE_ARRAY encoded data
-  DELTA_LENGTH_BA        = (1 << 4),  // Run decode kernel for DELTA_LENGTH_BYTE_ARRAY encoded data
-  FIXED_WIDTH_NO_DICT    = (1 << 5),  // Run decode kernel for fixed width non-dictionary pages
-  FIXED_WIDTH_DICT       = (1 << 6),  // Run decode kernel for fixed width dictionary pages
-  BYTE_STREAM_SPLIT      = (1 << 7),  // Run decode kernel for BYTE_STREAM_SPLIT encoded data
-  BYTE_STREAM_SPLIT_FLAT = (1 << 8),  // Same as above but with a flat schema
+  NONE                = 0,
+  GENERAL             = (1 << 0),  // Run catch-all decode kernel
+  STRING              = (1 << 1),  // Run decode kernel for string data
+  DELTA_BINARY        = (1 << 2),  // Run decode kernel for DELTA_BINARY_PACKED data
+  DELTA_BYTE_ARRAY    = (1 << 3),  // Run decode kernel for DELTA_BYTE_ARRAY encoded data
+  DELTA_LENGTH_BA     = (1 << 4),  // Run decode kernel for DELTA_LENGTH_BYTE_ARRAY encoded data
+  FIXED_WIDTH_NO_DICT = (1 << 5),  // Run decode kernel for fixed width non-dictionary pages
+  FIXED_WIDTH_DICT    = (1 << 6),  // Run decode kernel for fixed width dictionary pages
+  BYTE_STREAM_SPLIT   = (1 << 7),  // Run decode kernel for BYTE_STREAM_SPLIT encoded data
+  BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT = (1 << 8),  // Same as above but for flat, fixed-width data
+  BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED =
+    (1 << 9),                              // Same as above but for nested, fixed-width data
+  FIXED_WIDTH_NO_DICT_NESTED = (1 << 10),  // Run decode kernel for fixed width non-dictionary pages
+  FIXED_WIDTH_DICT_NESTED    = (1 << 11),  // Run decode kernel for fixed width dictionary pages
 };
 
 // mask representing all the ways in which a string can be encoded
@@ -888,6 +892,7 @@ void DecodeDeltaLengthByteArray(cudf::detail::hostdevice_span<PageInfo> pages,
  * @param[in] num_rows Total number of rows to read
  * @param[in] min_row Minimum number of rows to read
  * @param[in] level_type_size Size in bytes of the type for level decoding
+ * @param[in] has_nesting Whether or not the data contains nested (but not list) data.
  * @param[out] error_code Error code for kernel failures
  * @param[in] stream CUDA stream to use
  */
@@ -896,6 +901,7 @@ void DecodePageDataFixed(cudf::detail::hostdevice_span<PageInfo> pages,
                          std::size_t num_rows,
                          size_t min_row,
                          int level_type_size,
+                         bool has_nesting,
                          kernel_error::pointer error_code,
                          rmm::cuda_stream_view stream);
 
@@ -910,6 +916,7 @@ void DecodePageDataFixed(cudf::detail::hostdevice_span<PageInfo> pages,
  * @param[in] num_rows Total number of rows to read
  * @param[in] min_row Minimum number of rows to read
  * @param[in] level_type_size Size in bytes of the type for level decoding
+ * @param[in] has_nesting Whether or not the data contains nested (but not list) data.
  * @param[out] error_code Error code for kernel failures
  * @param[in] stream CUDA stream to use
  */
@@ -918,11 +925,12 @@ void DecodePageDataFixedDict(cudf::detail::hostdevice_span<PageInfo> pages,
                              std::size_t num_rows,
                              size_t min_row,
                              int level_type_size,
+                             bool has_nesting,
                              kernel_error::pointer error_code,
                              rmm::cuda_stream_view stream);
 
 /**
- * @brief Launches kernel for reading dictionary fixed width column data stored in the pages
+ * @brief Launches kernel for reading fixed width column data stored in the pages
  *
  * The page data will be written to the output pointed to in the page's
  * associated column chunk.
@@ -932,16 +940,18 @@ void DecodePageDataFixedDict(cudf::detail::hostdevice_span<PageInfo> pages,
  * @param[in] num_rows Total number of rows to read
  * @param[in] min_row Minimum number of rows to read
  * @param[in] level_type_size Size in bytes of the type for level decoding
+ * @param[in] has_nesting Whether or not the data contains nested (but not list) data.
  * @param[out] error_code Error code for kernel failures
  * @param[in] stream CUDA stream to use
  */
-void DecodeSplitPageDataFlat(cudf::detail::hostdevice_span<PageInfo> pages,
-                             cudf::detail::hostdevice_span<ColumnChunkDesc const> chunks,
-                             std::size_t num_rows,
-                             size_t min_row,
-                             int level_type_size,
-                             kernel_error::pointer error_code,
-                             rmm::cuda_stream_view stream);
+void DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span<PageInfo> pages,
+                                   cudf::detail::hostdevice_span<ColumnChunkDesc const> chunks,
+                                   std::size_t num_rows,
+                                   size_t min_row,
+                                   int level_type_size,
+                                   bool has_nesting,
+                                   kernel_error::pointer error_code,
+                                   rmm::cuda_stream_view stream);
 
 /**
  * @brief Launches kernel for initializing encoder row group fragments
diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp
index 1bd2fae281c..f705f6626e7 100644
--- a/cpp/src/io/parquet/reader_impl.cpp
+++ b/cpp/src/io/parquet/reader_impl.cpp
@@ -267,14 +267,27 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num
   }
 
   // launch byte stream split decoder
-  if (BitAnd(kernel_mask, decode_kernel_mask::BYTE_STREAM_SPLIT_FLAT) != 0) {
-    DecodeSplitPageDataFlat(subpass.pages,
-                            pass.chunks,
-                            num_rows,
-                            skip_rows,
-                            level_type_size,
-                            error_code.data(),
-                            streams[s_idx++]);
+  if (BitAnd(kernel_mask, decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT) != 0) {
+    DecodeSplitPageFixedWidthData(subpass.pages,
+                                  pass.chunks,
+                                  num_rows,
+                                  skip_rows,
+                                  level_type_size,
+                                  false,
+                                  error_code.data(),
+                                  streams[s_idx++]);
+  }
+
+  // launch byte stream split decoder, for nested columns
+  if (BitAnd(kernel_mask, decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED) != 0) {
+    DecodeSplitPageFixedWidthData(subpass.pages,
+                                  pass.chunks,
+                                  num_rows,
+                                  skip_rows,
+                                  level_type_size,
+                                  true,
+                                  error_code.data(),
+                                  streams[s_idx++]);
   }
 
   // launch byte stream split decoder
@@ -288,22 +301,50 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num
                         streams[s_idx++]);
   }
 
+  // launch fixed width type decoder
   if (BitAnd(kernel_mask, decode_kernel_mask::FIXED_WIDTH_NO_DICT) != 0) {
     DecodePageDataFixed(subpass.pages,
                         pass.chunks,
                         num_rows,
                         skip_rows,
                         level_type_size,
+                        false,
+                        error_code.data(),
+                        streams[s_idx++]);
+  }
+
+  // launch fixed width type decoder, for nested columns
+  if (BitAnd(kernel_mask, decode_kernel_mask::FIXED_WIDTH_NO_DICT_NESTED) != 0) {
+    DecodePageDataFixed(subpass.pages,
+                        pass.chunks,
+                        num_rows,
+                        skip_rows,
+                        level_type_size,
+                        true,
                         error_code.data(),
                         streams[s_idx++]);
   }
 
+  // launch fixed width type decoder with dictionaries
   if (BitAnd(kernel_mask, decode_kernel_mask::FIXED_WIDTH_DICT) != 0) {
     DecodePageDataFixedDict(subpass.pages,
                             pass.chunks,
                             num_rows,
                             skip_rows,
                             level_type_size,
+                            false,
+                            error_code.data(),
+                            streams[s_idx++]);
+  }
+
+  // launch fixed width type decoder with dictionaries, for nested columns
+  if (BitAnd(kernel_mask, decode_kernel_mask::FIXED_WIDTH_DICT_NESTED) != 0) {
+    DecodePageDataFixedDict(subpass.pages,
+                            pass.chunks,
+                            num_rows,
+                            skip_rows,
+                            level_type_size,
+                            true,
                             error_code.data(),
                             streams[s_idx++]);
   }
diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp
index 84ab83e33d0..a1f4c7b81d8 100644
--- a/cpp/tests/io/parquet_writer_test.cpp
+++ b/cpp/tests/io/parquet_writer_test.cpp
@@ -1785,7 +1785,8 @@ TEST_F(ParquetWriterTest, DeltaBinaryStartsWithNulls)
   CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view());
 }
 
-TEST_F(ParquetWriterTest, ByteStreamSplit)
+std::pair<std::unique_ptr<cudf::table>, cudf::io::table_input_metadata>
+make_byte_stream_split_table(bool as_struct)
 {
   constexpr auto num_rows = 100;
   std::mt19937 engine{31337};
@@ -1802,24 +1803,73 @@ TEST_F(ParquetWriterTest, ByteStreamSplit)
   // throw in a list to make sure both decoders are working
   auto col4 = make_parquet_list_col<int32_t>(engine, num_rows, 5, true);
 
-  auto expected = table_view{{col0, col1, col2, col3, *col4}};
+  std::vector<std::unique_ptr<cudf::column>> columns;
+  columns.reserve(5);
+  columns.push_back(col0.release());
+  columns.push_back(col1.release());
+  columns.push_back(col2.release());
+  columns.push_back(col3.release());
+  columns.push_back(std::move(col4));
+
+  return [&]() -> std::pair<std::unique_ptr<cudf::table>, cudf::io::table_input_metadata> {
+    auto const encoding = cudf::io::column_encoding::BYTE_STREAM_SPLIT;
+
+    // make as a nested struct
+    if (as_struct) {
+      auto valids =
+        cudf::detail::make_counting_transform_iterator(0, [](int i) { return i % 2 == 0; });
+      auto [null_mask, null_count] = cudf::test::detail::make_null_mask(valids, valids + num_rows);
+
+      std::vector<std::unique_ptr<cudf::column>> table_cols;
+      table_cols.push_back(
+        cudf::make_structs_column(num_rows, std::move(columns), null_count, std::move(null_mask)));
+
+      auto tbl      = std::make_unique<cudf::table>(std::move(table_cols));
+      auto expected = table_view{*tbl};
+
+      cudf::io::table_input_metadata expected_metadata(expected);
+      expected_metadata.column_metadata[0].set_name("struct");
+      expected_metadata.column_metadata[0].set_encoding(encoding);
+
+      expected_metadata.column_metadata[0].child(0).set_name("int32s");
+      expected_metadata.column_metadata[0].child(1).set_name("int64s");
+      expected_metadata.column_metadata[0].child(2).set_name("floats");
+      expected_metadata.column_metadata[0].child(3).set_name("doubles");
+      expected_metadata.column_metadata[0].child(4).set_name("int32list");
+      for (int idx = 0; idx <= 3; idx++) {
+        expected_metadata.column_metadata[0].child(idx).set_encoding(encoding);
+      }
+      expected_metadata.column_metadata[0].child(4).child(1).set_encoding(encoding);
 
-  cudf::io::table_input_metadata expected_metadata(expected);
-  expected_metadata.column_metadata[0].set_name("int32s");
-  expected_metadata.column_metadata[1].set_name("int64s");
-  expected_metadata.column_metadata[2].set_name("floats");
-  expected_metadata.column_metadata[3].set_name("doubles");
-  expected_metadata.column_metadata[4].set_name("int32list");
-  auto const encoding = cudf::io::column_encoding::BYTE_STREAM_SPLIT;
-  for (int i = 0; i <= 3; i++) {
-    expected_metadata.column_metadata[i].set_encoding(encoding);
-  }
+      return {std::move(tbl), expected_metadata};
+    }
+
+    // make flat
+    auto tbl      = std::make_unique<cudf::table>(std::move(columns));
+    auto expected = table_view{*tbl};
 
-  expected_metadata.column_metadata[4].child(1).set_encoding(encoding);
+    cudf::io::table_input_metadata expected_metadata(expected);
+    expected_metadata.column_metadata[0].set_name("int32s");
+    expected_metadata.column_metadata[1].set_name("int64s");
+    expected_metadata.column_metadata[2].set_name("floats");
+    expected_metadata.column_metadata[3].set_name("doubles");
+    expected_metadata.column_metadata[4].set_name("int32list");
+    for (int idx = 0; idx <= 3; idx++) {
+      expected_metadata.column_metadata[idx].set_encoding(encoding);
+    }
+
+    expected_metadata.column_metadata[4].child(1).set_encoding(encoding);
+    return {std::move(tbl), expected_metadata};
+  }();
+}
+
+TEST_F(ParquetWriterTest, ByteStreamSplit)
+{
+  auto [expected, expected_metadata] = make_byte_stream_split_table(false);
 
   auto const filepath = temp_env->get_temp_filepath("ByteStreamSplit.parquet");
   cudf::io::parquet_writer_options out_opts =
-    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected)
+    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, *expected)
       .metadata(expected_metadata);
   cudf::io::write_parquet(out_opts);
 
@@ -1827,7 +1877,24 @@ TEST_F(ParquetWriterTest, ByteStreamSplit)
     cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath});
   auto result = cudf::io::read_parquet(in_opts);
 
-  CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view());
+  CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, result.tbl->view());
+}
+
+TEST_F(ParquetWriterTest, ByteStreamSplitStruct)
+{
+  auto [expected, expected_metadata] = make_byte_stream_split_table(true);
+
+  auto const filepath = temp_env->get_temp_filepath("ByteStreamSplitStruct.parquet");
+  cudf::io::parquet_writer_options out_opts =
+    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, *expected)
+      .metadata(expected_metadata);
+  cudf::io::write_parquet(out_opts);
+
+  cudf::io::parquet_reader_options in_opts =
+    cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath});
+  auto result = cudf::io::read_parquet(in_opts);
+
+  CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, result.tbl->view());
 }
 
 TEST_F(ParquetWriterTest, DecimalByteStreamSplit)

From 78f4a8a3f639677358bce83a699f92c90476ae75 Mon Sep 17 00:00:00 2001
From: Robert Maynard <rmaynard@nvidia.com>
Date: Fri, 28 Jun 2024 11:26:27 -0400
Subject: [PATCH 433/842] Move common string utilities to public api (#16070)

As part of https://github.com/rapidsai/cudf/pull/15982 a subset of the strings utility functions have been identified as being worth expsosing as part of the cudf public API.

The `create_string_vector_from_column`, `get_offset64_threshold`, and `is_large_strings_enabled` are now made part of the public `cudf::strings` api.

Authors:
  - Robert Maynard (https://github.com/robertmaynard)

Approvers:
  - MithunR (https://github.com/mythrocks)
  - David Wendt (https://github.com/davidwendt)
  - Jayjeet Chakraborty (https://github.com/JayjeetAtGithub)
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/16070
---
 .../cudf/strings/detail/strings_children.cuh  |  7 ++-
 cpp/include/cudf/strings/utilities.hpp        | 62 +++++++++++++++++++
 cpp/src/strings/utilities.cu                  | 22 +++++--
 cpp/tests/column/factories_test.cpp           |  4 +-
 cpp/tests/copying/concatenate_tests.cpp       |  8 +--
 cpp/tests/strings/array_tests.cpp             |  4 +-
 cpp/tests/strings/repeat_strings_tests.cpp    |  4 +-
 .../strings/src/strings/udf/udf_apis.cu       |  4 +-
 8 files changed, 95 insertions(+), 20 deletions(-)
 create mode 100644 cpp/include/cudf/strings/utilities.hpp

diff --git a/cpp/include/cudf/strings/detail/strings_children.cuh b/cpp/include/cudf/strings/detail/strings_children.cuh
index f105a6dc546..f5f3982a5d6 100644
--- a/cpp/include/cudf/strings/detail/strings_children.cuh
+++ b/cpp/include/cudf/strings/detail/strings_children.cuh
@@ -21,6 +21,7 @@
 #include <cudf/detail/sizes_to_offsets_iterator.cuh>
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/strings/detail/utilities.hpp>
+#include <cudf/strings/utilities.hpp>
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -81,11 +82,11 @@ std::pair<std::unique_ptr<column>, int64_t> make_offsets_child_column(
   auto const total_bytes =
     cudf::detail::sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets, stream);
 
-  auto const threshold = get_offset64_threshold();
-  CUDF_EXPECTS(is_large_strings_enabled() || (total_bytes < threshold),
+  auto const threshold = cudf::strings::get_offset64_threshold();
+  CUDF_EXPECTS(cudf::strings::is_large_strings_enabled() || (total_bytes < threshold),
                "Size of output exceeds the column size limit",
                std::overflow_error);
-  if (total_bytes >= get_offset64_threshold()) {
+  if (total_bytes >= cudf::strings::get_offset64_threshold()) {
     // recompute as int64 offsets when above the threshold
     offsets_column = make_numeric_column(
       data_type{type_id::INT64}, strings_count + 1, mask_state::UNALLOCATED, stream, mr);
diff --git a/cpp/include/cudf/strings/utilities.hpp b/cpp/include/cudf/strings/utilities.hpp
new file mode 100644
index 00000000000..ae445282382
--- /dev/null
+++ b/cpp/include/cudf/strings/utilities.hpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf/utilities/export.hpp>
+
+#include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
+
+namespace CUDF_EXPORT cudf {
+namespace strings {
+
+/**
+ * @brief Creates a string_view vector from a strings column.
+ *
+ * @param strings Strings column instance.
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ * @param mr Device memory resource used to allocate the returned vector's device memory.
+ * @return Device vector of string_views
+ */
+rmm::device_uvector<string_view> create_string_vector_from_column(
+  cudf::strings_column_view const strings,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Return the threshold size for a strings column to use int64 offsets
+ *
+ * A computed size above this threshold should using int64 offsets, otherwise
+ * int32 offsets. By default this function will return std::numeric_limits<int32_t>::max().
+ * This value can be overridden at runtime using the environment variable
+ * LIBCUDF_LARGE_STRINGS_THRESHOLD.
+ *
+ * @return size in bytes
+ */
+int64_t get_offset64_threshold();
+
+/**
+ * @brief Checks if large strings is enabled
+ *
+ * This checks the setting in the environment variable LIBCUDF_LARGE_STRINGS_ENABLED.
+ *
+ * @return true if large strings are supported
+ */
+bool is_large_strings_enabled();
+
+}  // namespace strings
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu
index 18e726a6d7d..101004a5d06 100644
--- a/cpp/src/strings/utilities.cu
+++ b/cpp/src/strings/utilities.cu
@@ -13,16 +13,17 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 #include "strings/char_types/char_cases.h"
 #include "strings/char_types/char_flags.h"
 
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/get_value.cuh>
+#include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/strings/detail/char_tables.hpp>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/detail/utilities.hpp>
+#include <cudf/strings/utilities.hpp>
 #include <cudf/utilities/error.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -36,8 +37,7 @@
 #include <cstdlib>
 #include <string>
 
-namespace cudf {
-namespace strings {
+namespace cudf::strings {
 namespace detail {
 
 /**
@@ -175,5 +175,17 @@ int64_t get_offset_value(cudf::column_view const& offsets,
 }
 
 }  // namespace detail
-}  // namespace strings
-}  // namespace cudf
+
+rmm::device_uvector<string_view> create_string_vector_from_column(
+  cudf::strings_column_view const strings,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::create_string_vector_from_column(strings, stream, mr);
+}
+
+int64_t get_offset64_threshold() { return detail::get_offset64_threshold(); }
+bool is_large_strings_enabled() { return detail::is_large_strings_enabled(); }
+
+}  // namespace cudf::strings
diff --git a/cpp/tests/column/factories_test.cpp b/cpp/tests/column/factories_test.cpp
index dca36eaa4e7..603187f0330 100644
--- a/cpp/tests/column/factories_test.cpp
+++ b/cpp/tests/column/factories_test.cpp
@@ -24,7 +24,7 @@
 #include <cudf/null_mask.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
-#include <cudf/strings/detail/utilities.hpp>
+#include <cudf/strings/utilities.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
@@ -762,7 +762,7 @@ TEST_F(ColumnFactoryTest, FromStructScalarNull) { struct_from_scalar(false); }
 
 TEST_F(ColumnFactoryTest, FromScalarErrors)
 {
-  if (cudf::strings::detail::is_large_strings_enabled()) { return; }
+  if (cudf::strings::is_large_strings_enabled()) { return; }
   cudf::string_scalar ss("hello world");
   EXPECT_THROW(cudf::make_column_from_scalar(ss, 214748365), std::overflow_error);
 
diff --git a/cpp/tests/copying/concatenate_tests.cpp b/cpp/tests/copying/concatenate_tests.cpp
index 078e0ef9bae..054441788d0 100644
--- a/cpp/tests/copying/concatenate_tests.cpp
+++ b/cpp/tests/copying/concatenate_tests.cpp
@@ -29,7 +29,7 @@
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/dictionary/encode.hpp>
 #include <cudf/filling.hpp>
-#include <cudf/strings/detail/utilities.hpp>
+#include <cudf/strings/utilities.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
@@ -189,7 +189,7 @@ TEST_F(StringColumnTest, ConcatenateManyColumns)
 
 TEST_F(StringColumnTest, ConcatenateTooLarge)
 {
-  if (cudf::strings::detail::is_large_strings_enabled()) { return; }
+  if (cudf::strings::is_large_strings_enabled()) { return; }
 
   std::string big_str(1000000, 'a');  // 1 million bytes x 5 = 5 million bytes
   cudf::test::strings_column_wrapper input{big_str, big_str, big_str, big_str, big_str};
@@ -379,7 +379,7 @@ TEST_F(OverflowTest, OverflowTest)
   }
 
   // string column, overflow on chars
-  if (!cudf::strings::detail::is_large_strings_enabled()) {
+  if (!cudf::strings::is_large_strings_enabled()) {
     constexpr auto size = static_cast<cudf::size_type>(static_cast<uint32_t>(1024) * 1024 * 1024);
 
     // try and concatenate 6 string columns of with 1 billion chars in each
@@ -502,7 +502,7 @@ TEST_F(OverflowTest, Presliced)
   }
 
   // strings, overflow on chars
-  if (!cudf::strings::detail::is_large_strings_enabled()) {
+  if (!cudf::strings::is_large_strings_enabled()) {
     constexpr cudf::size_type total_chars_size = 1024 * 1024 * 1024;
     constexpr cudf::size_type string_size      = 64;
     constexpr cudf::size_type num_rows         = total_chars_size / string_size;
diff --git a/cpp/tests/strings/array_tests.cpp b/cpp/tests/strings/array_tests.cpp
index a1bb87a43fb..9c0ecaa52c0 100644
--- a/cpp/tests/strings/array_tests.cpp
+++ b/cpp/tests/strings/array_tests.cpp
@@ -23,8 +23,8 @@
 #include <cudf/copying.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/sorting.hpp>
-#include <cudf/strings/detail/utilities.hpp>
 #include <cudf/strings/strings_column_view.hpp>
+#include <cudf/strings/utilities.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 
@@ -153,7 +153,7 @@ TEST_F(StringsColumnTest, GatherZeroSizeStringsColumn)
 
 TEST_F(StringsColumnTest, GatherTooBig)
 {
-  if (cudf::strings::detail::is_large_strings_enabled()) { return; }
+  if (cudf::strings::is_large_strings_enabled()) { return; }
 
   std::vector<int8_t> h_chars(3000000);
   cudf::test::fixed_width_column_wrapper<int8_t> chars(h_chars.begin(), h_chars.end());
diff --git a/cpp/tests/strings/repeat_strings_tests.cpp b/cpp/tests/strings/repeat_strings_tests.cpp
index 0539895c5f4..aa4d9320d7c 100644
--- a/cpp/tests/strings/repeat_strings_tests.cpp
+++ b/cpp/tests/strings/repeat_strings_tests.cpp
@@ -20,9 +20,9 @@
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/scalar/scalar.hpp>
-#include <cudf/strings/detail/utilities.hpp>
 #include <cudf/strings/repeat_strings.hpp>
 #include <cudf/strings/strings_column_view.hpp>
+#include <cudf/strings/utilities.hpp>
 
 using namespace cudf::test::iterators;
 
@@ -221,7 +221,7 @@ TEST_F(RepeatStringsTest, StringsColumnWithColumnRepeatTimesInvalidInput)
 
 TEST_F(RepeatStringsTest, StringsColumnWithColumnRepeatTimesOverflowOutput)
 {
-  if (cudf::strings::detail::is_large_strings_enabled()) { return; }
+  if (cudf::strings::is_large_strings_enabled()) { return; }
 
   auto const strs    = strs_col{"1", "12", "123", "1234", "12345", "123456", "1234567"};
   auto const strs_cv = cudf::strings_column_view(strs);
diff --git a/python/cudf/udf_cpp/strings/src/strings/udf/udf_apis.cu b/python/cudf/udf_cpp/strings/src/strings/udf/udf_apis.cu
index 941e61e6787..b924995cf4b 100644
--- a/python/cudf/udf_cpp/strings/src/strings/udf/udf_apis.cu
+++ b/python/cudf/udf_cpp/strings/src/strings/udf/udf_apis.cu
@@ -15,10 +15,10 @@
  */
 
 #include <cudf/column/column_factories.hpp>
-#include <cudf/strings/detail/utilities.hpp>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/udf/udf_apis.hpp>
 #include <cudf/strings/udf/udf_string.cuh>
+#include <cudf/strings/utilities.hpp>
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/device_uvector.hpp>
@@ -57,7 +57,7 @@ std::unique_ptr<rmm::device_buffer> to_string_view_array(cudf::column_view const
                                                          rmm::cuda_stream_view stream)
 {
   return std::make_unique<rmm::device_buffer>(
-    std::move(cudf::strings::detail::create_string_vector_from_column(
+    std::move(cudf::strings::create_string_vector_from_column(
                 cudf::strings_column_view(input), stream, rmm::mr::get_current_device_resource())
                 .release()));
 }

From fb12d980342833a9d7092a19717eedad22328e6a Mon Sep 17 00:00:00 2001
From: Robert Maynard <rmaynard@nvidia.com>
Date: Fri, 28 Jun 2024 12:14:58 -0400
Subject: [PATCH 434/842] Installed cudf header use cudf::allocate_like
 (#16087)

Remove usage of non public cudf::allocate_like from implementations in headers we install

Authors:
  - Robert Maynard (https://github.com/robertmaynard)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/16087
---
 cpp/include/cudf/detail/copy_if.cuh       |  6 +++---
 cpp/include/cudf/detail/gather.cuh        | 13 ++++++-------
 cpp/src/copying/sample.cu                 |  1 +
 cpp/src/lists/copying/segmented_gather.cu |  1 +
 4 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/cpp/include/cudf/detail/copy_if.cuh b/cpp/include/cudf/detail/copy_if.cuh
index c98057d077a..b6310e6cd2f 100644
--- a/cpp/include/cudf/detail/copy_if.cuh
+++ b/cpp/include/cudf/detail/copy_if.cuh
@@ -18,7 +18,7 @@
 
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
-#include <cudf/detail/copy.hpp>
+#include <cudf/copying.hpp>
 #include <cudf/detail/gather.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/utilities/cuda.cuh>
@@ -242,8 +242,8 @@ struct scatter_gather_functor {
                                            rmm::cuda_stream_view stream,
                                            rmm::device_async_resource_ref mr)
   {
-    auto output_column = cudf::detail::allocate_like(
-      input, output_size, cudf::mask_allocation_policy::RETAIN, stream, mr);
+    auto output_column =
+      cudf::allocate_like(input, output_size, cudf::mask_allocation_policy::RETAIN, stream, mr);
     auto output = output_column->mutable_view();
 
     bool has_valid = input.nullable();
diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh
index c9d350ce983..5977c7341c1 100644
--- a/cpp/include/cudf/detail/gather.cuh
+++ b/cpp/include/cudf/detail/gather.cuh
@@ -15,7 +15,7 @@
  */
 #pragma once
 
-#include <cudf/detail/copy.hpp>
+#include <cudf/copying.hpp>
 #include <cudf/detail/indexalator.cuh>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/utilities/assert.cuh>
@@ -217,10 +217,9 @@ struct column_gatherer_impl<Element, std::enable_if_t<is_rep_layout_compatible<E
                                      rmm::cuda_stream_view stream,
                                      rmm::device_async_resource_ref mr)
   {
-    auto const num_rows = cudf::distance(gather_map_begin, gather_map_end);
-    auto const policy   = cudf::mask_allocation_policy::NEVER;
-    auto destination_column =
-      cudf::detail::allocate_like(source_column, num_rows, policy, stream, mr);
+    auto const num_rows     = cudf::distance(gather_map_begin, gather_map_end);
+    auto const policy       = cudf::mask_allocation_policy::NEVER;
+    auto destination_column = cudf::allocate_like(source_column, num_rows, policy, stream, mr);
 
     gather_helper(source_column.data<Element>(),
                   source_column.size(),
@@ -413,8 +412,8 @@ struct column_gatherer_impl<dictionary32> {
     auto keys_copy = std::make_unique<column>(dictionary.keys(), stream, mr);
     // Perform gather on just the indices
     column_view indices = dictionary.get_indices_annotated();
-    auto new_indices    = cudf::detail::allocate_like(
-      indices, output_count, cudf::mask_allocation_policy::NEVER, stream, mr);
+    auto new_indices =
+      cudf::allocate_like(indices, output_count, cudf::mask_allocation_policy::NEVER, stream, mr);
     gather_helper(
       cudf::detail::indexalator_factory::make_input_iterator(indices),
       indices.size(),
diff --git a/cpp/src/copying/sample.cu b/cpp/src/copying/sample.cu
index f8e3a9a83e3..ba00527f6b6 100644
--- a/cpp/src/copying/sample.cu
+++ b/cpp/src/copying/sample.cu
@@ -16,6 +16,7 @@
 
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_factories.hpp>
+#include <cudf/detail/copy.hpp>
 #include <cudf/detail/gather.cuh>
 #include <cudf/detail/gather.hpp>
 #include <cudf/detail/iterator.cuh>
diff --git a/cpp/src/lists/copying/segmented_gather.cu b/cpp/src/lists/copying/segmented_gather.cu
index 89b1a126fc5..779eca438db 100644
--- a/cpp/src/lists/copying/segmented_gather.cu
+++ b/cpp/src/lists/copying/segmented_gather.cu
@@ -13,6 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+#include <cudf/detail/copy.hpp>
 #include <cudf/detail/copy_range.cuh>
 #include <cudf/detail/gather.cuh>
 #include <cudf/detail/indexalator.cuh>

From df88cf5ffccd8a454f17ba686dcb5ec0d7a045b3 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Fri, 28 Jun 2024 15:40:52 -0500
Subject: [PATCH 435/842] Use size_t to allow large conditional joins (#16127)

The conditional join kernels were using `cudf::size_type` where `std::size_t` was needed. This PR fixes that bug, which caused `cudaErrorIllegalAddress` as shown in #16115. This closes #16115.

I did not add tests because we typically do not test very large workloads. However, I committed the test and reverted it in this PR, so there is a record of my validation code.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - https://github.com/nvdbaranec
  - Yunsong Wang (https://github.com/PointKernel)

URL: https://github.com/rapidsai/cudf/pull/16127
---
 cpp/src/join/conditional_join.cu          |   5 +-
 cpp/src/join/conditional_join_kernels.cuh | 124 ++++++++++++++++++++--
 cpp/src/join/join_common_utils.cuh        |  95 -----------------
 3 files changed, 117 insertions(+), 107 deletions(-)

diff --git a/cpp/src/join/conditional_join.cu b/cpp/src/join/conditional_join.cu
index 97a06d5a923..d4ef2747c9d 100644
--- a/cpp/src/join/conditional_join.cu
+++ b/cpp/src/join/conditional_join.cu
@@ -95,7 +95,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> conditional_join_anti_semi(
     join_size = size.value(stream);
   }
 
-  rmm::device_scalar<size_type> write_index(0, stream);
+  rmm::device_scalar<std::size_t> write_index(0, stream);
 
   auto left_indices = std::make_unique<rmm::device_uvector<size_type>>(join_size, stream, mr);
 
@@ -232,13 +232,14 @@ conditional_join(table_view const& left,
                      std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr));
   }
 
-  rmm::device_scalar<size_type> write_index(0, stream);
+  rmm::device_scalar<std::size_t> write_index(0, stream);
 
   auto left_indices  = std::make_unique<rmm::device_uvector<size_type>>(join_size, stream, mr);
   auto right_indices = std::make_unique<rmm::device_uvector<size_type>>(join_size, stream, mr);
 
   auto const& join_output_l = left_indices->data();
   auto const& join_output_r = right_indices->data();
+
   if (has_nulls) {
     conditional_join<DEFAULT_JOIN_BLOCK_SIZE, DEFAULT_JOIN_CACHE_SIZE, true>
       <<<config.num_blocks, config.num_threads_per_block, shmem_size_per_block, stream.value()>>>(
diff --git a/cpp/src/join/conditional_join_kernels.cuh b/cpp/src/join/conditional_join_kernels.cuh
index 1e16c451f5a..62769862f54 100644
--- a/cpp/src/join/conditional_join_kernels.cuh
+++ b/cpp/src/join/conditional_join_kernels.cuh
@@ -29,6 +29,110 @@
 namespace cudf {
 namespace detail {
 
+/**
+ * @brief Adds a pair of indices to the shared memory cache
+ *
+ * @param[in] first The first index in the pair
+ * @param[in] second The second index in the pair
+ * @param[in,out] current_idx_shared Pointer to shared index that determines
+ * where in the shared memory cache the pair will be written
+ * @param[in] warp_id The ID of the warp of the calling the thread
+ * @param[out] joined_shared_l Pointer to the shared memory cache for left indices
+ * @param[out] joined_shared_r Pointer to the shared memory cache for right indices
+ */
+__inline__ __device__ void add_pair_to_cache(size_type const first,
+                                             size_type const second,
+                                             std::size_t* current_idx_shared,
+                                             int const warp_id,
+                                             size_type* joined_shared_l,
+                                             size_type* joined_shared_r)
+{
+  cuda::atomic_ref<std::size_t, cuda::thread_scope_block> ref{*(current_idx_shared + warp_id)};
+  std::size_t my_current_idx = ref.fetch_add(1, cuda::memory_order_relaxed);
+  // It's guaranteed to fit into the shared cache
+  joined_shared_l[my_current_idx] = first;
+  joined_shared_r[my_current_idx] = second;
+}
+
+__inline__ __device__ void add_left_to_cache(size_type const first,
+                                             std::size_t* current_idx_shared,
+                                             int const warp_id,
+                                             size_type* joined_shared_l)
+{
+  cuda::atomic_ref<std::size_t, cuda::thread_scope_block> ref{*(current_idx_shared + warp_id)};
+  std::size_t my_current_idx      = ref.fetch_add(1, cuda::memory_order_relaxed);
+  joined_shared_l[my_current_idx] = first;
+}
+
+template <int num_warps, cudf::size_type output_cache_size>
+__device__ void flush_output_cache(unsigned int const activemask,
+                                   std::size_t const max_size,
+                                   int const warp_id,
+                                   int const lane_id,
+                                   std::size_t* current_idx,
+                                   std::size_t current_idx_shared[num_warps],
+                                   size_type join_shared_l[num_warps][output_cache_size],
+                                   size_type join_shared_r[num_warps][output_cache_size],
+                                   size_type* join_output_l,
+                                   size_type* join_output_r)
+{
+  // count how many active threads participating here which could be less than warp_size
+  int const num_threads     = __popc(activemask);
+  std::size_t output_offset = 0;
+
+  if (0 == lane_id) {
+    cuda::atomic_ref<std::size_t, cuda::thread_scope_device> ref{*current_idx};
+    output_offset = ref.fetch_add(current_idx_shared[warp_id], cuda::memory_order_relaxed);
+  }
+
+  // No warp sync is necessary here because we are assuming that ShuffleIndex
+  // is internally using post-CUDA 9.0 synchronization-safe primitives
+  // (__shfl_sync instead of __shfl). __shfl is technically not guaranteed to
+  // be safe by the compiler because it is not required by the standard to
+  // converge divergent branches before executing.
+  output_offset = cub::ShuffleIndex<detail::warp_size>(output_offset, 0, activemask);
+
+  for (std::size_t shared_out_idx = static_cast<std::size_t>(lane_id);
+       shared_out_idx < current_idx_shared[warp_id];
+       shared_out_idx += num_threads) {
+    std::size_t thread_offset = output_offset + shared_out_idx;
+    if (thread_offset < max_size) {
+      join_output_l[thread_offset] = join_shared_l[warp_id][shared_out_idx];
+      join_output_r[thread_offset] = join_shared_r[warp_id][shared_out_idx];
+    }
+  }
+}
+
+template <int num_warps, cudf::size_type output_cache_size>
+__device__ void flush_output_cache(unsigned int const activemask,
+                                   std::size_t const max_size,
+                                   int const warp_id,
+                                   int const lane_id,
+                                   std::size_t* current_idx,
+                                   std::size_t current_idx_shared[num_warps],
+                                   size_type join_shared_l[num_warps][output_cache_size],
+                                   size_type* join_output_l)
+{
+  int const num_threads     = __popc(activemask);
+  std::size_t output_offset = 0;
+
+  if (0 == lane_id) {
+    cuda::atomic_ref<std::size_t, cuda::thread_scope_device> ref{*current_idx};
+    output_offset = ref.fetch_add(current_idx_shared[warp_id], cuda::memory_order_relaxed);
+  }
+
+  output_offset = cub::ShuffleIndex<detail::warp_size>(output_offset, 0, activemask);
+
+  for (std::size_t shared_out_idx = static_cast<std::size_t>(lane_id);
+       shared_out_idx < current_idx_shared[warp_id];
+       shared_out_idx += num_threads) {
+    std::size_t thread_offset = output_offset + shared_out_idx;
+    if (thread_offset < max_size) {
+      join_output_l[thread_offset] = join_shared_l[warp_id][shared_out_idx];
+    }
+  }
+}
+
 /**
  * @brief Computes the output size of joining the left table to the right table.
  *
@@ -103,14 +207,14 @@ CUDF_KERNEL void compute_conditional_join_output_size(
     }
   }
 
-  using BlockReduce = cub::BlockReduce<cudf::size_type, block_size>;
+  using BlockReduce = cub::BlockReduce<std::size_t, block_size>;
   __shared__ typename BlockReduce::TempStorage temp_storage;
   std::size_t block_counter = BlockReduce(temp_storage).Sum(thread_counter);
 
   // Add block counter to global counter
   if (threadIdx.x == 0) {
     cuda::atomic_ref<std::size_t, cuda::thread_scope_device> ref{*output_size};
-    ref.fetch_add(block_counter, cuda::std::memory_order_relaxed);
+    ref.fetch_add(block_counter, cuda::memory_order_relaxed);
   }
 }
 
@@ -143,13 +247,13 @@ CUDF_KERNEL void conditional_join(table_device_view left_table,
                                   join_kind join_type,
                                   cudf::size_type* join_output_l,
                                   cudf::size_type* join_output_r,
-                                  cudf::size_type* current_idx,
+                                  std::size_t* current_idx,
                                   cudf::ast::detail::expression_device_view device_expression_data,
-                                  cudf::size_type const max_size,
+                                  std::size_t const max_size,
                                   bool const swap_tables)
 {
   constexpr int num_warps = block_size / detail::warp_size;
-  __shared__ cudf::size_type current_idx_shared[num_warps];
+  __shared__ std::size_t current_idx_shared[num_warps];
   __shared__ cudf::size_type join_shared_l[num_warps][output_cache_size];
   __shared__ cudf::size_type join_shared_r[num_warps][output_cache_size];
 
@@ -183,7 +287,7 @@ CUDF_KERNEL void conditional_join(table_device_view left_table,
 
   if (outer_row_index < outer_num_rows) {
     bool found_match = false;
-    for (thread_index_type inner_row_index(0); inner_row_index < inner_num_rows;
+    for (cudf::thread_index_type inner_row_index(0); inner_row_index < inner_num_rows;
          ++inner_row_index) {
       auto output_dest           = cudf::ast::detail::value_expression_result<bool, has_nulls>();
       auto const left_row_index  = swap_tables ? inner_row_index : outer_row_index;
@@ -277,12 +381,12 @@ CUDF_KERNEL void conditional_join_anti_semi(
   table_device_view right_table,
   join_kind join_type,
   cudf::size_type* join_output_l,
-  cudf::size_type* current_idx,
+  std::size_t* current_idx,
   cudf::ast::detail::expression_device_view device_expression_data,
-  cudf::size_type const max_size)
+  std::size_t const max_size)
 {
   constexpr int num_warps = block_size / detail::warp_size;
-  __shared__ cudf::size_type current_idx_shared[num_warps];
+  __shared__ std::size_t current_idx_shared[num_warps];
   __shared__ cudf::size_type join_shared_l[num_warps][output_cache_size];
 
   extern __shared__ char raw_intermediate_storage[];
@@ -310,7 +414,7 @@ CUDF_KERNEL void conditional_join_anti_semi(
   for (cudf::thread_index_type outer_row_index = start_idx; outer_row_index < outer_num_rows;
        outer_row_index += stride) {
     bool found_match = false;
-    for (thread_index_type inner_row_index(0); inner_row_index < inner_num_rows;
+    for (cudf::thread_index_type inner_row_index(0); inner_row_index < inner_num_rows;
          ++inner_row_index) {
       auto output_dest = cudf::ast::detail::value_expression_result<bool, has_nulls>();
 
diff --git a/cpp/src/join/join_common_utils.cuh b/cpp/src/join/join_common_utils.cuh
index 31f267d5cfb..3d0f3e4340d 100644
--- a/cpp/src/join/join_common_utils.cuh
+++ b/cpp/src/join/join_common_utils.cuh
@@ -262,101 +262,6 @@ struct valid_range {
   }
 };
 
-/**
- * @brief Adds a pair of indices to the shared memory cache
- *
- * @param[in] first The first index in the pair
- * @param[in] second The second index in the pair
- * @param[in,out] current_idx_shared Pointer to shared index that determines
- * where in the shared memory cache the pair will be written
- * @param[in] warp_id The ID of the warp of the calling the thread
- * @param[out] joined_shared_l Pointer to the shared memory cache for left indices
- * @param[out] joined_shared_r Pointer to the shared memory cache for right indices
- */
-__inline__ __device__ void add_pair_to_cache(size_type const first,
-                                             size_type const second,
-                                             size_type* current_idx_shared,
-                                             int const warp_id,
-                                             size_type* joined_shared_l,
-                                             size_type* joined_shared_r)
-{
-  size_type my_current_idx{atomicAdd(current_idx_shared + warp_id, size_type(1))};
-  // its guaranteed to fit into the shared cache
-  joined_shared_l[my_current_idx] = first;
-  joined_shared_r[my_current_idx] = second;
-}
-
-__inline__ __device__ void add_left_to_cache(size_type const first,
-                                             size_type* current_idx_shared,
-                                             int const warp_id,
-                                             size_type* joined_shared_l)
-{
-  size_type my_current_idx{atomicAdd(current_idx_shared + warp_id, size_type(1))};
-
-  joined_shared_l[my_current_idx] = first;
-}
-
-template <int num_warps, cudf::size_type output_cache_size>
-__device__ void flush_output_cache(unsigned int const activemask,
-                                   cudf::size_type const max_size,
-                                   int const warp_id,
-                                   int const lane_id,
-                                   cudf::size_type* current_idx,
-                                   cudf::size_type current_idx_shared[num_warps],
-                                   size_type join_shared_l[num_warps][output_cache_size],
-                                   size_type join_shared_r[num_warps][output_cache_size],
-                                   size_type* join_output_l,
-                                   size_type* join_output_r)
-{
-  // count how many active threads participating here which could be less than warp_size
-  int const num_threads         = __popc(activemask);
-  cudf::size_type output_offset = 0;
-
-  if (0 == lane_id) { output_offset = atomicAdd(current_idx, current_idx_shared[warp_id]); }
-
-  // No warp sync is necessary here because we are assuming that ShuffleIndex
-  // is internally using post-CUDA 9.0 synchronization-safe primitives
-  // (__shfl_sync instead of __shfl). __shfl is technically not guaranteed to
-  // be safe by the compiler because it is not required by the standard to
-  // converge divergent branches before executing.
-  output_offset = cub::ShuffleIndex<detail::warp_size>(output_offset, 0, activemask);
-
-  for (int shared_out_idx = lane_id; shared_out_idx < current_idx_shared[warp_id];
-       shared_out_idx += num_threads) {
-    cudf::size_type thread_offset = output_offset + shared_out_idx;
-    if (thread_offset < max_size) {
-      join_output_l[thread_offset] = join_shared_l[warp_id][shared_out_idx];
-      join_output_r[thread_offset] = join_shared_r[warp_id][shared_out_idx];
-    }
-  }
-}
-
-template <int num_warps, cudf::size_type output_cache_size>
-__device__ void flush_output_cache(unsigned int const activemask,
-                                   cudf::size_type const max_size,
-                                   int const warp_id,
-                                   int const lane_id,
-                                   cudf::size_type* current_idx,
-                                   cudf::size_type current_idx_shared[num_warps],
-                                   size_type join_shared_l[num_warps][output_cache_size],
-                                   size_type* join_output_l)
-{
-  int const num_threads         = __popc(activemask);
-  cudf::size_type output_offset = 0;
-
-  if (0 == lane_id) { output_offset = atomicAdd(current_idx, current_idx_shared[warp_id]); }
-
-  output_offset = cub::ShuffleIndex<detail::warp_size>(output_offset, 0, activemask);
-
-  for (int shared_out_idx = lane_id; shared_out_idx < current_idx_shared[warp_id];
-       shared_out_idx += num_threads) {
-    cudf::size_type thread_offset = output_offset + shared_out_idx;
-    if (thread_offset < max_size) {
-      join_output_l[thread_offset] = join_shared_l[warp_id][shared_out_idx];
-    }
-  }
-}
-
 }  // namespace detail
 
 }  // namespace cudf

From 3c3edfef406288e164cc80ab82f9c64c0b88d0bd Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Fri, 28 Jun 2024 13:58:22 -0700
Subject: [PATCH 436/842] Update implementations to build with the latest cuco
 (#15938)

This PR updates existing libcudf to accommodate a cuco breaking change introduced in https://github.com/NVIDIA/cuCollections/pull/479. It helps avoid breaking cudf when bumping the cuco version in `rapids-cmake`.

Redundant equal/hash overloads will be removed once the version bump is done on the `rapids-cmake` end.

Authors:
  - Yunsong Wang (https://github.com/PointKernel)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/15938
---
 .../cudf/detail/distinct_hash_join.cuh        | 22 +++++++++++-
 cpp/src/join/distinct_hash_join.cu            | 10 +++---
 cpp/src/search/contains_table.cu              | 35 ++++++++++++++-----
 cpp/src/text/bpe/byte_pair_encoding.cuh       | 13 +++++++
 cpp/src/text/vocabulary_tokenize.cu           |  8 +++++
 5 files changed, 74 insertions(+), 14 deletions(-)

diff --git a/cpp/include/cudf/detail/distinct_hash_join.cuh b/cpp/include/cudf/detail/distinct_hash_join.cuh
index de3d23e9470..1ef8b3b120a 100644
--- a/cpp/include/cudf/detail/distinct_hash_join.cuh
+++ b/cpp/include/cudf/detail/distinct_hash_join.cuh
@@ -42,6 +42,9 @@ template <typename Equal>
 struct comparator_adapter {
   comparator_adapter(Equal const& d_equal) : _d_equal{d_equal} {}
 
+  // suppress "function was declared but never referenced warning"
+#pragma nv_diagnostic push
+#pragma nv_diag_suppress 177
   __device__ constexpr auto operator()(
     cuco::pair<hash_value_type, lhs_index_type> const&,
     cuco::pair<hash_value_type, lhs_index_type> const&) const noexcept
@@ -50,6 +53,14 @@ struct comparator_adapter {
     return false;
   }
 
+  __device__ constexpr auto operator()(
+    cuco::pair<hash_value_type, rhs_index_type> const&,
+    cuco::pair<hash_value_type, rhs_index_type> const&) const noexcept
+  {
+    // All build table keys are distinct thus `false` no matter what
+    return false;
+  }
+
   __device__ constexpr auto operator()(
     cuco::pair<hash_value_type, lhs_index_type> const& lhs,
     cuco::pair<hash_value_type, rhs_index_type> const& rhs) const noexcept
@@ -58,6 +69,15 @@ struct comparator_adapter {
     return _d_equal(lhs.second, rhs.second);
   }
 
+  __device__ constexpr auto operator()(
+    cuco::pair<hash_value_type, rhs_index_type> const& lhs,
+    cuco::pair<hash_value_type, lhs_index_type> const& rhs) const noexcept
+  {
+    if (lhs.first != rhs.first) { return false; }
+    return _d_equal(lhs.second, rhs.second);
+  }
+#pragma nv_diagnostic pop
+
  private:
   Equal _d_equal;
 };
@@ -94,7 +114,7 @@ struct distinct_hash_join {
   using cuco_storage_type   = cuco::storage<1>;
 
   /// Hash table type
-  using hash_table_type = cuco::static_set<cuco::pair<hash_value_type, lhs_index_type>,
+  using hash_table_type = cuco::static_set<cuco::pair<hash_value_type, rhs_index_type>,
                                            cuco::extent<size_type>,
                                            cuda::thread_scope_device,
                                            comparator_adapter<d_equal_type>,
diff --git a/cpp/src/join/distinct_hash_join.cu b/cpp/src/join/distinct_hash_join.cu
index 5048da25e86..daa1bf17c0d 100644
--- a/cpp/src/join/distinct_hash_join.cu
+++ b/cpp/src/join/distinct_hash_join.cu
@@ -54,7 +54,7 @@ auto prepare_device_equal(
   cudf::null_equality compare_nulls)
 {
   auto const two_table_equal =
-    cudf::experimental::row::equality::two_table_comparator(build, probe);
+    cudf::experimental::row::equality::two_table_comparator(probe, build);
   return comparator_adapter{two_table_equal.equal_to<HasNested == cudf::has_nested::YES>(
     nullate::DYNAMIC{has_nulls}, compare_nulls)};
 }
@@ -113,7 +113,7 @@ distinct_hash_join<HasNested>::distinct_hash_join(cudf::table_view const& build,
     _hash_table{build.num_rows(),
                 CUCO_DESIRED_LOAD_FACTOR,
                 cuco::empty_key{cuco::pair{std::numeric_limits<hash_value_type>::max(),
-                                           lhs_index_type{JoinNoneValue}}},
+                                           rhs_index_type{JoinNoneValue}}},
                 prepare_device_equal<HasNested>(
                   _preprocessed_build, _preprocessed_probe, has_nulls, compare_nulls),
                 {},
@@ -131,7 +131,7 @@ distinct_hash_join<HasNested>::distinct_hash_join(cudf::table_view const& build,
   auto const d_hasher   = row_hasher.device_hasher(nullate::DYNAMIC{this->_has_nulls});
 
   auto const iter = cudf::detail::make_counting_transform_iterator(
-    0, build_keys_fn<decltype(d_hasher), lhs_index_type>{d_hasher});
+    0, build_keys_fn<decltype(d_hasher), rhs_index_type>{d_hasher});
 
   size_type const build_table_num_rows{build.num_rows()};
   if (this->_nulls_equal == cudf::null_equality::EQUAL or (not cudf::nullable(this->_build))) {
@@ -174,7 +174,7 @@ distinct_hash_join<HasNested>::inner_join(rmm::cuda_stream_view stream,
     cudf::experimental::row::hash::row_hasher{this->_preprocessed_probe};
   auto const d_probe_hasher = probe_row_hasher.device_hasher(nullate::DYNAMIC{this->_has_nulls});
   auto const iter           = cudf::detail::make_counting_transform_iterator(
-    0, build_keys_fn<decltype(d_probe_hasher), rhs_index_type>{d_probe_hasher});
+    0, build_keys_fn<decltype(d_probe_hasher), lhs_index_type>{d_probe_hasher});
 
   auto const build_indices_begin =
     thrust::make_transform_output_iterator(build_indices->begin(), output_fn{});
@@ -216,7 +216,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> distinct_hash_join<HasNested>::l
       cudf::experimental::row::hash::row_hasher{this->_preprocessed_probe};
     auto const d_probe_hasher = probe_row_hasher.device_hasher(nullate::DYNAMIC{this->_has_nulls});
     auto const iter           = cudf::detail::make_counting_transform_iterator(
-      0, build_keys_fn<decltype(d_probe_hasher), rhs_index_type>{d_probe_hasher});
+      0, build_keys_fn<decltype(d_probe_hasher), lhs_index_type>{d_probe_hasher});
 
     auto const output_begin =
       thrust::make_transform_output_iterator(build_indices->begin(), output_fn{});
diff --git a/cpp/src/search/contains_table.cu b/cpp/src/search/contains_table.cu
index 466f9093194..fbb0f6cb0f5 100644
--- a/cpp/src/search/contains_table.cu
+++ b/cpp/src/search/contains_table.cu
@@ -53,12 +53,12 @@ struct hasher_adapter {
 
   __device__ constexpr auto operator()(lhs_index_type idx) const noexcept
   {
-    return _haystack_hasher(static_cast<size_type>(idx));
+    return _needle_hasher(static_cast<size_type>(idx));
   }
 
   __device__ constexpr auto operator()(rhs_index_type idx) const noexcept
   {
-    return _needle_hasher(static_cast<size_type>(idx));
+    return _haystack_hasher(static_cast<size_type>(idx));
   }
 
  private:
@@ -76,6 +76,9 @@ struct comparator_adapter {
   {
   }
 
+  // suppress "function was declared but never referenced warning"
+#pragma nv_diagnostic push
+#pragma nv_diag_suppress 177
   __device__ constexpr auto operator()(lhs_index_type lhs_index,
                                        lhs_index_type rhs_index) const noexcept
   {
@@ -85,12 +88,28 @@ struct comparator_adapter {
     return _self_equal(lhs, rhs);
   }
 
+  __device__ constexpr auto operator()(rhs_index_type lhs_index,
+                                       rhs_index_type rhs_index) const noexcept
+  {
+    auto const lhs = static_cast<size_type>(lhs_index);
+    auto const rhs = static_cast<size_type>(rhs_index);
+
+    return _self_equal(lhs, rhs);
+  }
+
   __device__ constexpr auto operator()(lhs_index_type lhs_index,
                                        rhs_index_type rhs_index) const noexcept
   {
     return _two_table_equal(lhs_index, rhs_index);
   }
 
+  __device__ constexpr auto operator()(rhs_index_type lhs_index,
+                                       lhs_index_type rhs_index) const noexcept
+  {
+    return _two_table_equal(lhs_index, rhs_index);
+  }
+#pragma nv_diagnostic pop
+
  private:
   SelfEqual const _self_equal;
   TwoTableEqual const _two_table_equal;
@@ -210,26 +229,26 @@ rmm::device_uvector<bool> contains(table_view const& haystack,
 
   auto const self_equal = cudf::experimental::row::equality::self_comparator(preprocessed_haystack);
   auto const two_table_equal = cudf::experimental::row::equality::two_table_comparator(
-    preprocessed_haystack, preprocessed_needles);
+    preprocessed_needles, preprocessed_haystack);
 
   // The output vector.
   auto contained = rmm::device_uvector<bool>(needles.num_rows(), stream, mr);
 
   auto const haystack_iter = cudf::detail::make_counting_transform_iterator(
-    size_type{0}, cuda::proclaim_return_type<lhs_index_type>([] __device__(auto idx) {
-      return lhs_index_type{idx};
-    }));
-  auto const needles_iter = cudf::detail::make_counting_transform_iterator(
     size_type{0}, cuda::proclaim_return_type<rhs_index_type>([] __device__(auto idx) {
       return rhs_index_type{idx};
     }));
+  auto const needles_iter = cudf::detail::make_counting_transform_iterator(
+    size_type{0}, cuda::proclaim_return_type<lhs_index_type>([] __device__(auto idx) {
+      return lhs_index_type{idx};
+    }));
 
   auto const helper_func =
     [&](auto const& d_self_equal, auto const& d_two_table_equal, auto const& probing_scheme) {
       auto const d_equal = comparator_adapter{d_self_equal, d_two_table_equal};
 
       auto set = cuco::static_set{cuco::extent{compute_hash_table_size(haystack.num_rows())},
-                                  cuco::empty_key{lhs_index_type{-1}},
+                                  cuco::empty_key{rhs_index_type{-1}},
                                   d_equal,
                                   probing_scheme,
                                   {},
diff --git a/cpp/src/text/bpe/byte_pair_encoding.cuh b/cpp/src/text/bpe/byte_pair_encoding.cuh
index 2ad22fd4e46..3bb574748b6 100644
--- a/cpp/src/text/bpe/byte_pair_encoding.cuh
+++ b/cpp/src/text/bpe/byte_pair_encoding.cuh
@@ -96,6 +96,14 @@ struct bpe_equal {
     auto const right = d_strings.element<cudf::string_view>(lhs + 1);
     return (left == rhs.first) && (right == rhs.second);
   }
+  // used by find
+  __device__ bool operator()(merge_pair_type const& lhs, cudf::size_type rhs) const noexcept
+  {
+    rhs *= 2;
+    auto const left  = d_strings.element<cudf::string_view>(rhs);
+    auto const right = d_strings.element<cudf::string_view>(rhs + 1);
+    return (left == lhs.first) && (right == lhs.second);
+  }
 };
 
 using bpe_probe_scheme = cuco::linear_probing<1, bpe_hasher>;
@@ -154,6 +162,11 @@ struct mp_equal {
     auto const left = d_strings.element<cudf::string_view>(lhs);
     return left == rhs;
   }
+  __device__ bool operator()(cudf::string_view const& lhs, cudf::size_type rhs) const noexcept
+  {
+    auto const right = d_strings.element<cudf::string_view>(rhs);
+    return lhs == right;
+  }
 };
 
 using mp_probe_scheme = cuco::linear_probing<1, mp_hasher>;
diff --git a/cpp/src/text/vocabulary_tokenize.cu b/cpp/src/text/vocabulary_tokenize.cu
index f012f7ce09a..ea09f5d17af 100644
--- a/cpp/src/text/vocabulary_tokenize.cu
+++ b/cpp/src/text/vocabulary_tokenize.cu
@@ -86,10 +86,18 @@ struct vocab_equal {
     return lhs == rhs;  // all rows are expected to be unique
   }
   // used by find
+  // suppress "function was declared but never referenced warning"
+#pragma nv_diagnostic push
+#pragma nv_diag_suppress 177
   __device__ bool operator()(cudf::size_type lhs, cudf::string_view const& rhs) const noexcept
   {
     return d_strings.element<cudf::string_view>(lhs) == rhs;
   }
+  __device__ bool operator()(cudf::string_view const& lhs, cudf::size_type rhs) const noexcept
+  {
+    return d_strings.element<cudf::string_view>(rhs) == lhs;
+  }
+#pragma nv_diagnostic pop
 };
 
 using probe_scheme        = cuco::linear_probing<1, vocab_hasher>;

From 599ce95aa6c49ae1560b9617e18ed328f9f6a508 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Mon, 1 Jul 2024 09:35:35 +0100
Subject: [PATCH 437/842] Implement handlers for series literal in cudf-polars
 (#16113)

A query plan can contain a "literal" polars Series. Often, for example, when calling a contains-like function. To translate these, introduce a new `LiteralColumn` node to capture the concept and add an evaluation rule (converting from arrow).

Since list-dtype Series need the same casting treatment as in dataframe scan case, factor the casting out into a utility, and take the opportunity to handled casting of nested lists correctly.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Thomas Li (https://github.com/lithomas1)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16113
---
 python/cudf_polars/cudf_polars/dsl/expr.py    | 32 ++++++-
 python/cudf_polars/cudf_polars/dsl/ir.py      | 20 ++--
 .../cudf_polars/cudf_polars/dsl/translate.py  |  3 +
 .../cudf_polars/cudf_polars/utils/dtypes.py   | 81 +++++++++++++++-
 .../tests/expressions/test_literal.py         | 96 +++++++++++++++++++
 .../cudf_polars/tests/test_dataframescan.py   | 19 ++++
 6 files changed, 239 insertions(+), 12 deletions(-)
 create mode 100644 python/cudf_polars/tests/expressions/test_literal.py

diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py
index 17d7d15e4e5..16cfd9b9749 100644
--- a/python/cudf_polars/cudf_polars/dsl/expr.py
+++ b/python/cudf_polars/cudf_polars/dsl/expr.py
@@ -27,11 +27,12 @@
 import cudf._lib.pylibcudf as plc
 
 from cudf_polars.containers import Column, NamedColumn
-from cudf_polars.utils import sorting
+from cudf_polars.utils import dtypes, sorting
 
 if TYPE_CHECKING:
     from collections.abc import Mapping, Sequence
 
+    import polars.polars as plrs
     import polars.type_aliases as pl_types
 
     from cudf_polars.containers import DataFrame
@@ -369,6 +370,29 @@ def do_evaluate(
         return Column(plc.Column.from_scalar(plc.interop.from_arrow(self.value), 1))
 
 
+class LiteralColumn(Expr):
+    __slots__ = ("value",)
+    _non_child = ("dtype", "value")
+    value: pa.Array[Any, Any]
+    children: tuple[()]
+
+    def __init__(self, dtype: plc.DataType, value: plrs.PySeries) -> None:
+        super().__init__(dtype)
+        data = value.to_arrow()
+        self.value = data.cast(dtypes.downcast_arrow_lists(data.type))
+
+    def do_evaluate(
+        self,
+        df: DataFrame,
+        *,
+        context: ExecutionContext = ExecutionContext.FRAME,
+        mapping: Mapping[Expr, Column] | None = None,
+    ) -> Column:
+        """Evaluate this expression given a dataframe for context."""
+        # datatype of pyarrow array is correct by construction.
+        return Column(plc.interop.from_arrow(self.value))
+
+
 class Col(Expr):
     __slots__ = ("name",)
     _non_child = ("dtype", "name")
@@ -1156,6 +1180,12 @@ def __init__(
         super().__init__(dtype)
         self.op = op
         self.children = (left, right)
+        if (
+            op in (plc.binaryop.BinaryOperator.ADD, plc.binaryop.BinaryOperator.SUB)
+            and ({left.dtype.id(), right.dtype.id()}.issubset(dtypes.TIMELIKE_TYPES))
+            and not dtypes.have_compatible_resolution(left.dtype.id(), right.dtype.id())
+        ):
+            raise NotImplementedError("Casting rules for timelike types")
 
     _MAPPING: ClassVar[dict[pl_expr.Operator, plc.binaryop.BinaryOperator]] = {
         pl_expr.Operator.Eq: plc.binaryop.BinaryOperator.EQUAL,
diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py
index 3f5f3c74050..abe26b14a90 100644
--- a/python/cudf_polars/cudf_polars/dsl/ir.py
+++ b/python/cudf_polars/cudf_polars/dsl/ir.py
@@ -29,7 +29,7 @@
 
 import cudf_polars.dsl.expr as expr
 from cudf_polars.containers import DataFrame, NamedColumn
-from cudf_polars.utils import sorting
+from cudf_polars.utils import dtypes, sorting
 
 if TYPE_CHECKING:
     from collections.abc import MutableMapping
@@ -130,6 +130,11 @@ class IR:
     schema: Schema
     """Mapping from column names to their data types."""
 
+    def __post_init__(self):
+        """Validate preconditions."""
+        if any(dtype.id() == plc.TypeId.EMPTY for dtype in self.schema.values()):
+            raise NotImplementedError("Cannot make empty columns.")
+
     def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         """
         Evaluate the node and return a dataframe.
@@ -292,15 +297,10 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         table = pdf.to_arrow()
         schema = table.schema
         for i, field in enumerate(schema):
-            # TODO: Nested types
-            if field.type == pa.large_string():
-                # TODO: goes away when libcudf supports large strings
-                schema = schema.set(i, pa.field(field.name, pa.string()))
-            elif isinstance(field.type, pa.LargeListType):
-                # TODO: goes away when libcudf supports large lists
-                schema = schema.set(
-                    i, pa.field(field.name, pa.list_(field.type.field(0)))
-                )
+            schema = schema.set(
+                i, pa.field(field.name, dtypes.downcast_arrow_lists(field.type))
+            )
+        # No-op if the schema is unchanged.
         table = table.cast(schema)
         df = DataFrame.from_table(
             plc.interop.from_arrow(table), list(self.schema.keys())
diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py
index 953ff636cce..f4bf07ae1e0 100644
--- a/python/cudf_polars/cudf_polars/dsl/translate.py
+++ b/python/cudf_polars/cudf_polars/dsl/translate.py
@@ -12,6 +12,7 @@
 import pyarrow as pa
 from typing_extensions import assert_never
 
+import polars.polars as plrs
 from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir
 
 import cudf._lib.pylibcudf as plc
@@ -383,6 +384,8 @@ def _(node: pl_expr.Window, visitor: NodeTraverser, dtype: plc.DataType) -> expr
 
 @_translate_expr.register
 def _(node: pl_expr.Literal, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
+    if isinstance(node.value, plrs.PySeries):
+        return expr.LiteralColumn(dtype, node.value)
     value = pa.scalar(node.value, type=plc.interop.to_arrow(dtype))
     return expr.Literal(dtype, value)
 
diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py
index 3d4a643e1fc..507acb5d33a 100644
--- a/python/cudf_polars/cudf_polars/utils/dtypes.py
+++ b/python/cudf_polars/cudf_polars/utils/dtypes.py
@@ -7,13 +7,92 @@
 
 from functools import cache
 
+import pyarrow as pa
 from typing_extensions import assert_never
 
 import polars as pl
 
 import cudf._lib.pylibcudf as plc
 
-__all__ = ["from_polars"]
+__all__ = ["from_polars", "downcast_arrow_lists", "have_compatible_resolution"]
+
+
+TIMELIKE_TYPES: frozenset[plc.TypeId] = frozenset(
+    [
+        plc.TypeId.TIMESTAMP_MILLISECONDS,
+        plc.TypeId.TIMESTAMP_MICROSECONDS,
+        plc.TypeId.TIMESTAMP_NANOSECONDS,
+        plc.TypeId.TIMESTAMP_DAYS,
+        plc.TypeId.DURATION_MILLISECONDS,
+        plc.TypeId.DURATION_MICROSECONDS,
+        plc.TypeId.DURATION_NANOSECONDS,
+    ]
+)
+
+
+def have_compatible_resolution(lid: plc.TypeId, rid: plc.TypeId):
+    """
+    Do two datetime typeids have matching resolution for a binop.
+
+    Parameters
+    ----------
+    lid
+       Left type id
+    rid
+       Right type id
+
+    Returns
+    -------
+    True if resolutions are compatible, False otherwise.
+
+    Notes
+    -----
+    Polars has different casting rules for combining
+    datetimes/durations than libcudf, and while we don't encode the
+    casting rules fully, just reject things we can't handle.
+
+    Precondition for correctness: both lid and rid are timelike.
+    """
+    if lid == rid:
+        return True
+    # Timestamps are smaller than durations in the libcudf enum.
+    lid, rid = sorted([lid, rid])
+    if lid == plc.TypeId.TIMESTAMP_MILLISECONDS:
+        return rid == plc.TypeId.DURATION_MILLISECONDS
+    elif lid == plc.TypeId.TIMESTAMP_MICROSECONDS:
+        return rid == plc.TypeId.DURATION_MICROSECONDS
+    elif lid == plc.TypeId.TIMESTAMP_NANOSECONDS:
+        return rid == plc.TypeId.DURATION_NANOSECONDS
+    return False
+
+
+def downcast_arrow_lists(typ: pa.DataType) -> pa.DataType:
+    """
+    Sanitize an arrow datatype from polars.
+
+    Parameters
+    ----------
+    typ
+        Arrow type to sanitize
+
+    Returns
+    -------
+    Sanitized arrow type
+
+    Notes
+    -----
+    As well as arrow ``ListType``s, polars can produce
+    ``LargeListType``s and ``FixedSizeListType``s, these are not
+    currently handled by libcudf, so we attempt to cast them all into
+    normal ``ListType``s on the arrow side before consuming the arrow
+    data.
+    """
+    if isinstance(typ, pa.LargeListType):
+        return pa.list_(downcast_arrow_lists(typ.value_type))
+    # We don't have to worry about diving into struct types for now
+    # since those are always NotImplemented before we get here.
+    assert not isinstance(typ, pa.StructType)
+    return typ
 
 
 @cache
diff --git a/python/cudf_polars/tests/expressions/test_literal.py b/python/cudf_polars/tests/expressions/test_literal.py
new file mode 100644
index 00000000000..55e688428bd
--- /dev/null
+++ b/python/cudf_polars/tests/expressions/test_literal.py
@@ -0,0 +1,96 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import pytest
+
+import polars as pl
+
+from cudf_polars.testing.asserts import (
+    assert_gpu_result_equal,
+    assert_ir_translation_raises,
+)
+from cudf_polars.utils import dtypes
+
+
+@pytest.fixture(
+    params=[
+        None,
+        pl.Int8(),
+        pl.Int16(),
+        pl.Int32(),
+        pl.Int64(),
+        pl.UInt8(),
+        pl.UInt16(),
+        pl.UInt32(),
+        pl.UInt64(),
+    ]
+)
+def integer(request):
+    return pl.lit(10, dtype=request.param)
+
+
+@pytest.fixture(params=[None, pl.Float32(), pl.Float64()])
+def float(request):
+    return pl.lit(1.0, dtype=request.param)
+
+
+def test_numeric_literal(integer, float):
+    df = pl.LazyFrame({})
+
+    q = df.select(integer=integer, float_=float, sum_=integer + float)
+
+    assert_gpu_result_equal(q)
+
+
+@pytest.fixture(
+    params=[pl.Date(), pl.Datetime("ms"), pl.Datetime("us"), pl.Datetime("ns")]
+)
+def timestamp(request):
+    return pl.lit(10_000, dtype=request.param)
+
+
+@pytest.fixture(params=[pl.Duration("ms"), pl.Duration("us"), pl.Duration("ns")])
+def timedelta(request):
+    return pl.lit(9_000, dtype=request.param)
+
+
+def test_timelike_literal(timestamp, timedelta):
+    df = pl.LazyFrame({})
+
+    q = df.select(
+        time=timestamp,
+        delta=timedelta,
+        adjusted=timestamp + timedelta,
+        two_delta=timedelta + timedelta,
+    )
+    schema = q.collect_schema()
+    time_type = schema["time"]
+    delta_type = schema["delta"]
+    if dtypes.have_compatible_resolution(
+        dtypes.from_polars(time_type).id(), dtypes.from_polars(delta_type).id()
+    ):
+        assert_gpu_result_equal(q)
+    else:
+        assert_ir_translation_raises(q, NotImplementedError)
+
+
+def test_select_literal_series():
+    df = pl.LazyFrame({})
+
+    q = df.select(
+        a=pl.Series(["a", "b", "c"], dtype=pl.String()),
+        b=pl.Series([[1, 2], [3], None], dtype=pl.List(pl.UInt16())),
+        c=pl.Series([[[1]], [], [[1, 2, 3, 4]]], dtype=pl.List(pl.List(pl.Float32()))),
+    )
+
+    assert_gpu_result_equal(q)
+
+
+@pytest.mark.parametrize("expr", [pl.lit(None), pl.lit(10, dtype=pl.Decimal())])
+def test_unsupported_literal_raises(expr):
+    df = pl.LazyFrame({})
+
+    q = df.select(expr)
+
+    assert_ir_translation_raises(q, NotImplementedError)
diff --git a/python/cudf_polars/tests/test_dataframescan.py b/python/cudf_polars/tests/test_dataframescan.py
index 1ffe06ac562..b5c0fb7be9f 100644
--- a/python/cudf_polars/tests/test_dataframescan.py
+++ b/python/cudf_polars/tests/test_dataframescan.py
@@ -41,3 +41,22 @@ def test_scan_drop_nulls(subset, predicate_pushdown):
     assert_gpu_result_equal(
         q, collect_kwargs={"predicate_pushdown": predicate_pushdown}
     )
+
+
+def test_can_convert_lists():
+    df = pl.LazyFrame(
+        {
+            "a": pl.Series([[1, 2], [3]], dtype=pl.List(pl.Int8())),
+            "b": pl.Series([[1], [2]], dtype=pl.List(pl.UInt16())),
+            "c": pl.Series(
+                [
+                    [["1", "2", "3"], ["4", "567"]],
+                    [["8", "9"], []],
+                ],
+                dtype=pl.List(pl.List(pl.String())),
+            ),
+            "d": pl.Series([[[1, 2]], []], dtype=pl.List(pl.List(pl.UInt16()))),
+        }
+    )
+
+    assert_gpu_result_equal(df)

From e932fbd9dd59aafd17b41b80a8b94424e8d367a2 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Mon, 1 Jul 2024 09:17:32 -0700
Subject: [PATCH 438/842] Add patch for incorrect cuco noexcept clauses
 (#16077)

[cuco previously marked a number of methods as noexcept that can in fact
throw exceptions](https://github.com/nvidia/cuCollections/issues/510).
This causes problems for cudf functions that call these methods. The
issue [was fixed in cuco
upstream](https://github.com/NVIDIA/cuCollections/pull/511), but we
cannot easily update to the latest commit of cuco, especially in a patch
fix for 24.06. This PR instead adds a rapids-cmake patch for the cuco
clone to address this issue. The patch may be removed once we update to
a commit of cuco that contains the necessary fix.

Resolves #16059
---
 cpp/cmake/thirdparty/get_cucollections.cmake  |   7 +-
 .../thirdparty/patches/cuco_noexcept.diff     | 227 ++++++++++++++++++
 .../thirdparty/patches/cuco_override.json     |  14 ++
 3 files changed, 247 insertions(+), 1 deletion(-)
 create mode 100644 cpp/cmake/thirdparty/patches/cuco_noexcept.diff
 create mode 100644 cpp/cmake/thirdparty/patches/cuco_override.json

diff --git a/cpp/cmake/thirdparty/get_cucollections.cmake b/cpp/cmake/thirdparty/get_cucollections.cmake
index 9758958b44f..6ec35ddcaf1 100644
--- a/cpp/cmake/thirdparty/get_cucollections.cmake
+++ b/cpp/cmake/thirdparty/get_cucollections.cmake
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2021-2022, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -15,6 +15,11 @@
 # This function finds cuCollections and performs any additional configuration.
 function(find_and_configure_cucollections)
   include(${rapids-cmake-dir}/cpm/cuco.cmake)
+  include(${rapids-cmake-dir}/cpm/package_override.cmake)
+
+  set(cudf_patch_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/patches")
+  rapids_cpm_package_override("${cudf_patch_dir}/cuco_override.json")
+
   if(BUILD_SHARED_LIBS)
     rapids_cpm_cuco(BUILD_EXPORT_SET cudf-exports)
   else()
diff --git a/cpp/cmake/thirdparty/patches/cuco_noexcept.diff b/cpp/cmake/thirdparty/patches/cuco_noexcept.diff
new file mode 100644
index 00000000000..0f334c0e81f
--- /dev/null
+++ b/cpp/cmake/thirdparty/patches/cuco_noexcept.diff
@@ -0,0 +1,227 @@
+diff --git a/include/cuco/aow_storage.cuh b/include/cuco/aow_storage.cuh
+index 7f9de01..5228193 100644
+--- a/include/cuco/aow_storage.cuh
++++ b/include/cuco/aow_storage.cuh
+@@ -81,7 +81,7 @@ class aow_storage : public detail::aow_storage_base<T, WindowSize, Extent> {
+    * @param size Number of windows to (de)allocate
+    * @param allocator Allocator used for (de)allocating device storage
+    */
+-  explicit constexpr aow_storage(Extent size, Allocator const& allocator = {}) noexcept;
++  explicit constexpr aow_storage(Extent size, Allocator const& allocator = {});
+ 
+   aow_storage(aow_storage&&) = default;  ///< Move constructor
+   /**
+@@ -122,7 +122,7 @@ class aow_storage : public detail::aow_storage_base<T, WindowSize, Extent> {
+    * @param key Key to which all keys in `slots` are initialized
+    * @param stream Stream used for executing the kernel
+    */
+-  void initialize(value_type key, cuda_stream_ref stream = {}) noexcept;
++  void initialize(value_type key, cuda_stream_ref stream = {});
+ 
+   /**
+    * @brief Asynchronously initializes each slot in the AoW storage to contain `key`.
+diff --git a/include/cuco/detail/open_addressing/open_addressing_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_impl.cuh
+index c2c9c14..8ac4236 100644
+--- a/include/cuco/detail/open_addressing/open_addressing_impl.cuh
++++ b/include/cuco/detail/open_addressing/open_addressing_impl.cuh
+@@ -125,7 +125,7 @@ class open_addressing_impl {
+                                  KeyEqual const& pred,
+                                  ProbingScheme const& probing_scheme,
+                                  Allocator const& alloc,
+-                                 cuda_stream_ref stream) noexcept
++                                 cuda_stream_ref stream)
+     : empty_slot_sentinel_{empty_slot_sentinel},
+       erased_key_sentinel_{this->extract_key(empty_slot_sentinel)},
+       predicate_{pred},
+@@ -233,7 +233,7 @@ class open_addressing_impl {
+    *
+    * @param stream CUDA stream this operation is executed in
+    */
+-  void clear(cuda_stream_ref stream) noexcept { storage_.initialize(empty_slot_sentinel_, stream); }
++  void clear(cuda_stream_ref stream) { storage_.initialize(empty_slot_sentinel_, stream); }
+ 
+   /**
+    * @brief Asynchronously erases all elements from the container. After this call, `size()` returns
+@@ -599,7 +599,7 @@ class open_addressing_impl {
+    *
+    * @return The number of elements in the container
+    */
+-  [[nodiscard]] size_type size(cuda_stream_ref stream) const noexcept
++  [[nodiscard]] size_type size(cuda_stream_ref stream) const
+   {
+     auto counter =
+       detail::counter_storage<size_type, thread_scope, allocator_type>{this->allocator()};
+diff --git a/include/cuco/detail/static_map/static_map.inl b/include/cuco/detail/static_map/static_map.inl
+index e17a145..3fa1d02 100644
+--- a/include/cuco/detail/static_map/static_map.inl
++++ b/include/cuco/detail/static_map/static_map.inl
+@@ -123,7 +123,7 @@ template <class Key,
+           class Allocator,
+           class Storage>
+ void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::clear(
+-  cuda_stream_ref stream) noexcept
++  cuda_stream_ref stream)
+ {
+   impl_->clear(stream);
+ }
+@@ -215,7 +215,7 @@ template <class Key,
+           class Storage>
+ template <typename InputIt>
+ void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::
+-  insert_or_assign(InputIt first, InputIt last, cuda_stream_ref stream) noexcept
++  insert_or_assign(InputIt first, InputIt last, cuda_stream_ref stream)
+ {
+   return this->insert_or_assign_async(first, last, stream);
+   stream.synchronize();
+@@ -465,7 +465,7 @@ template <class Key,
+           class Storage>
+ static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::size_type
+ static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::size(
+-  cuda_stream_ref stream) const noexcept
++  cuda_stream_ref stream) const
+ {
+   return impl_->size(stream);
+ }
+diff --git a/include/cuco/detail/static_multiset/static_multiset.inl b/include/cuco/detail/static_multiset/static_multiset.inl
+index 174f9bc..582926b 100644
+--- a/include/cuco/detail/static_multiset/static_multiset.inl
++++ b/include/cuco/detail/static_multiset/static_multiset.inl
+@@ -97,7 +97,7 @@ template <class Key,
+           class Allocator,
+           class Storage>
+ void static_multiset<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::clear(
+-  cuda_stream_ref stream) noexcept
++  cuda_stream_ref stream)
+ {
+   impl_->clear(stream);
+ }
+@@ -183,7 +183,7 @@ template <class Key,
+           class Storage>
+ static_multiset<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::size_type
+ static_multiset<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::size(
+-  cuda_stream_ref stream) const noexcept
++  cuda_stream_ref stream) const
+ {
+   return impl_->size(stream);
+ }
+diff --git a/include/cuco/detail/static_set/static_set.inl b/include/cuco/detail/static_set/static_set.inl
+index 645013f..d3cece0 100644
+--- a/include/cuco/detail/static_set/static_set.inl
++++ b/include/cuco/detail/static_set/static_set.inl
+@@ -98,7 +98,7 @@ template <class Key,
+           class Allocator,
+           class Storage>
+ void static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::clear(
+-  cuda_stream_ref stream) noexcept
++  cuda_stream_ref stream)
+ {
+   impl_->clear(stream);
+ }
+@@ -429,7 +429,7 @@ template <class Key,
+           class Storage>
+ static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::size_type
+ static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::size(
+-  cuda_stream_ref stream) const noexcept
++  cuda_stream_ref stream) const
+ {
+   return impl_->size(stream);
+ }
+diff --git a/include/cuco/detail/storage/aow_storage.inl b/include/cuco/detail/storage/aow_storage.inl
+index 3547f4c..94b7f98 100644
+--- a/include/cuco/detail/storage/aow_storage.inl
++++ b/include/cuco/detail/storage/aow_storage.inl
+@@ -32,8 +32,8 @@
+ namespace cuco {
+ 
+ template <typename T, int32_t WindowSize, typename Extent, typename Allocator>
+-constexpr aow_storage<T, WindowSize, Extent, Allocator>::aow_storage(
+-  Extent size, Allocator const& allocator) noexcept
++constexpr aow_storage<T, WindowSize, Extent, Allocator>::aow_storage(Extent size,
++                                                                     Allocator const& allocator)
+   : detail::aow_storage_base<T, WindowSize, Extent>{size},
+     allocator_{allocator},
+     window_deleter_{capacity(), allocator_},
+@@ -64,7 +64,7 @@ aow_storage<T, WindowSize, Extent, Allocator>::ref() const noexcept
+ 
+ template <typename T, int32_t WindowSize, typename Extent, typename Allocator>
+ void aow_storage<T, WindowSize, Extent, Allocator>::initialize(value_type key,
+-                                                               cuda_stream_ref stream) noexcept
++                                                               cuda_stream_ref stream)
+ {
+   this->initialize_async(key, stream);
+   stream.synchronize();
+diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh
+index c86e90c..95da423 100644
+--- a/include/cuco/static_map.cuh
++++ b/include/cuco/static_map.cuh
+@@ -269,7 +269,7 @@ class static_map {
+    *
+    * @param stream CUDA stream this operation is executed in
+    */
+-  void clear(cuda_stream_ref stream = {}) noexcept;
++  void clear(cuda_stream_ref stream = {});
+ 
+   /**
+    * @brief Asynchronously erases all elements from the container. After this call, `size()` returns
+@@ -387,7 +387,7 @@ class static_map {
+    * @param stream CUDA stream used for insert
+    */
+   template <typename InputIt>
+-  void insert_or_assign(InputIt first, InputIt last, cuda_stream_ref stream = {}) noexcept;
++  void insert_or_assign(InputIt first, InputIt last, cuda_stream_ref stream = {});
+ 
+   /**
+    * @brief For any key-value pair `{k, v}` in the range `[first, last)`, if a key equivalent to `k`
+@@ -690,7 +690,7 @@ class static_map {
+    * @param stream CUDA stream used to get the number of inserted elements
+    * @return The number of elements in the container
+    */
+-  [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const noexcept;
++  [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const;
+ 
+   /**
+    * @brief Gets the maximum number of elements the hash map can hold.
+diff --git a/include/cuco/static_multiset.cuh b/include/cuco/static_multiset.cuh
+index 0daf103..fbcbc9c 100644
+--- a/include/cuco/static_multiset.cuh
++++ b/include/cuco/static_multiset.cuh
+@@ -235,7 +235,7 @@ class static_multiset {
+    *
+    * @param stream CUDA stream this operation is executed in
+    */
+-  void clear(cuda_stream_ref stream = {}) noexcept;
++  void clear(cuda_stream_ref stream = {});
+ 
+   /**
+    * @brief Asynchronously erases all elements from the container. After this call, `size()` returns
+@@ -339,7 +339,7 @@ class static_multiset {
+    * @param stream CUDA stream used to get the number of inserted elements
+    * @return The number of elements in the container
+    */
+-  [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const noexcept;
++  [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const;
+ 
+   /**
+    * @brief Gets the maximum number of elements the multiset can hold.
+diff --git a/include/cuco/static_set.cuh b/include/cuco/static_set.cuh
+index a069939..3517f84 100644
+--- a/include/cuco/static_set.cuh
++++ b/include/cuco/static_set.cuh
+@@ -240,7 +240,7 @@ class static_set {
+    *
+    * @param stream CUDA stream this operation is executed in
+    */
+-  void clear(cuda_stream_ref stream = {}) noexcept;
++  void clear(cuda_stream_ref stream = {});
+ 
+   /**
+    * @brief Asynchronously erases all elements from the container. After this call, `size()` returns
+@@ -687,7 +687,7 @@ class static_set {
+    * @param stream CUDA stream used to get the number of inserted elements
+    * @return The number of elements in the container
+    */
+-  [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const noexcept;
++  [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const;
+ 
+   /**
+    * @brief Gets the maximum number of elements the hash set can hold.
diff --git a/cpp/cmake/thirdparty/patches/cuco_override.json b/cpp/cmake/thirdparty/patches/cuco_override.json
new file mode 100644
index 00000000000..ae0a9a4b4f0
--- /dev/null
+++ b/cpp/cmake/thirdparty/patches/cuco_override.json
@@ -0,0 +1,14 @@
+
+{
+  "packages" : {
+    "cuco" : {
+      "patches" : [
+        {
+          "file" : "${current_json_dir}/cuco_noexcept.diff",
+          "issue" : "Remove erroneous noexcept clauses on cuco functions that may throw [https://github.com/rapidsai/cudf/issues/16059]",
+          "fixed_in" : ""
+        }
+      ]
+    }
+  }
+}

From 4e34a20a31fae2546f9cfbaa520d7561b80563c7 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Mon, 1 Jul 2024 11:18:25 -0500
Subject: [PATCH 439/842] Backport: Fix segfault in conditional join (#16094)
 (#16100)

Backports #16094 to 24.06 for inclusion in a hotfix release.
---
 cpp/src/join/conditional_join.cu         | 13 +---
 cpp/tests/join/conditional_join_tests.cu | 92 +++++++++++++++++-------
 2 files changed, 70 insertions(+), 35 deletions(-)

diff --git a/cpp/src/join/conditional_join.cu b/cpp/src/join/conditional_join.cu
index f02dee5f7f5..97a06d5a923 100644
--- a/cpp/src/join/conditional_join.cu
+++ b/cpp/src/join/conditional_join.cu
@@ -48,8 +48,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> conditional_join_anti_semi(
 {
   if (right.num_rows() == 0) {
     switch (join_type) {
-      case join_kind::LEFT_ANTI_JOIN:
-        return std::make_unique<rmm::device_uvector<size_type>>(left.num_rows(), stream, mr);
+      case join_kind::LEFT_ANTI_JOIN: return get_trivial_left_join_indices(left, stream, mr).first;
       case join_kind::LEFT_SEMI_JOIN:
         return std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr);
       default: CUDF_FAIL("Invalid join kind."); break;
@@ -96,10 +95,6 @@ std::unique_ptr<rmm::device_uvector<size_type>> conditional_join_anti_semi(
     join_size = size.value(stream);
   }
 
-  if (left.num_rows() == 0) {
-    return std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr);
-  }
-
   rmm::device_scalar<size_type> write_index(0, stream);
 
   auto left_indices = std::make_unique<rmm::device_uvector<size_type>>(join_size, stream, mr);
@@ -149,8 +144,7 @@ conditional_join(table_view const& left,
       // with a corresponding NULL from the right.
       case join_kind::LEFT_JOIN:
       case join_kind::LEFT_ANTI_JOIN:
-      case join_kind::FULL_JOIN:
-        return get_trivial_left_join_indices(left, stream, rmm::mr::get_current_device_resource());
+      case join_kind::FULL_JOIN: return get_trivial_left_join_indices(left, stream, mr);
       // Inner and left semi joins return empty output because no matches can exist.
       case join_kind::INNER_JOIN:
       case join_kind::LEFT_SEMI_JOIN:
@@ -169,8 +163,7 @@ conditional_join(table_view const& left,
                          std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr));
       // Full joins need to return the trivial complement.
       case join_kind::FULL_JOIN: {
-        auto ret_flipped =
-          get_trivial_left_join_indices(right, stream, rmm::mr::get_current_device_resource());
+        auto ret_flipped = get_trivial_left_join_indices(right, stream, mr);
         return std::pair(std::move(ret_flipped.second), std::move(ret_flipped.first));
       }
       default: CUDF_FAIL("Invalid join kind."); break;
diff --git a/cpp/tests/join/conditional_join_tests.cu b/cpp/tests/join/conditional_join_tests.cu
index 79968bcd7f4..7ab4a2ea465 100644
--- a/cpp/tests/join/conditional_join_tests.cu
+++ b/cpp/tests/join/conditional_join_tests.cu
@@ -20,6 +20,7 @@
 
 #include <cudf/ast/expressions.hpp>
 #include <cudf/column/column_view.hpp>
+#include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/join.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
@@ -222,21 +223,25 @@ struct ConditionalJoinPairReturnTest : public ConditionalJoinTest<T> {
              std::vector<std::pair<cudf::size_type, cudf::size_type>> expected_outputs)
   {
     auto result_size = this->join_size(left, right, predicate);
-    EXPECT_TRUE(result_size == expected_outputs.size());
-
-    auto result = this->join(left, right, predicate);
-    std::vector<std::pair<cudf::size_type, cudf::size_type>> result_pairs;
-    for (size_t i = 0; i < result.first->size(); ++i) {
-      // Note: Not trying to be terribly efficient here since these tests are
-      // small, otherwise a batch copy to host before constructing the tuples
-      // would be important.
-      result_pairs.push_back({result.first->element(i, cudf::get_default_stream()),
-                              result.second->element(i, cudf::get_default_stream())});
-    }
+    EXPECT_EQ(result_size, expected_outputs.size());
+
+    auto result     = this->join(left, right, predicate);
+    auto lhs_result = cudf::detail::make_std_vector_sync(*result.first, cudf::get_default_stream());
+    auto rhs_result =
+      cudf::detail::make_std_vector_sync(*result.second, cudf::get_default_stream());
+    std::vector<std::pair<cudf::size_type, cudf::size_type>> result_pairs(lhs_result.size());
+    std::transform(lhs_result.begin(),
+                   lhs_result.end(),
+                   rhs_result.begin(),
+                   result_pairs.begin(),
+                   [](cudf::size_type lhs, cudf::size_type rhs) {
+                     return std::pair{lhs, rhs};
+                   });
     std::sort(result_pairs.begin(), result_pairs.end());
     std::sort(expected_outputs.begin(), expected_outputs.end());
 
-    EXPECT_TRUE(std::equal(expected_outputs.begin(), expected_outputs.end(), result_pairs.begin()));
+    EXPECT_TRUE(std::equal(
+      expected_outputs.begin(), expected_outputs.end(), result_pairs.begin(), result_pairs.end()));
   }
 
   /*
@@ -411,6 +416,11 @@ TYPED_TEST(ConditionalInnerJoinTest, TestOneColumnLeftEmpty)
   this->test({{}}, {{3, 4, 5}}, left_zero_eq_right_zero, {});
 };
 
+TYPED_TEST(ConditionalInnerJoinTest, TestOneColumnRightEmpty)
+{
+  this->test({{3, 4, 5}}, {{}}, left_zero_eq_right_zero, {});
+};
+
 TYPED_TEST(ConditionalInnerJoinTest, TestOneColumnTwoRowAllEqual)
 {
   this->test({{0, 1}}, {{0, 0}}, left_zero_eq_right_zero, {{0, 0}, {0, 1}});
@@ -600,6 +610,14 @@ TYPED_TEST(ConditionalLeftJoinTest, TestOneColumnLeftEmpty)
   this->test({{}}, {{3, 4, 5}}, left_zero_eq_right_zero, {});
 };
 
+TYPED_TEST(ConditionalLeftJoinTest, TestOneColumnRightEmpty)
+{
+  this->test({{3, 4, 5}},
+             {{}},
+             left_zero_eq_right_zero,
+             {{0, JoinNoneValue}, {1, JoinNoneValue}, {2, JoinNoneValue}});
+};
+
 TYPED_TEST(ConditionalLeftJoinTest, TestCompareRandomToHash)
 {
   auto [left, right] = gen_random_repeated_columns<TypeParam>();
@@ -666,6 +684,14 @@ TYPED_TEST(ConditionalFullJoinTest, TestOneColumnLeftEmpty)
              {{JoinNoneValue, 0}, {JoinNoneValue, 1}, {JoinNoneValue, 2}});
 };
 
+TYPED_TEST(ConditionalFullJoinTest, TestOneColumnRightEmpty)
+{
+  this->test({{3, 4, 5}},
+             {{}},
+             left_zero_eq_right_zero,
+             {{0, JoinNoneValue}, {1, JoinNoneValue}, {2, JoinNoneValue}});
+};
+
 TYPED_TEST(ConditionalFullJoinTest, TestTwoColumnThreeRowSomeEqual)
 {
   this->test({{0, 1, 2}, {10, 20, 30}},
@@ -705,20 +731,16 @@ struct ConditionalJoinSingleReturnTest : public ConditionalJoinTest<T> {
     auto [left_wrappers, right_wrappers, left_columns, right_columns, left, right] =
       this->parse_input(left_data, right_data);
     auto result_size = this->join_size(left, right, predicate);
-    EXPECT_TRUE(result_size == expected_outputs.size());
-
-    auto result = this->join(left, right, predicate);
-    std::vector<cudf::size_type> resulting_indices;
-    for (size_t i = 0; i < result->size(); ++i) {
-      // Note: Not trying to be terribly efficient here since these tests are
-      // small, otherwise a batch copy to host before constructing the tuples
-      // would be important.
-      resulting_indices.push_back(result->element(i, cudf::get_default_stream()));
-    }
-    std::sort(resulting_indices.begin(), resulting_indices.end());
+    EXPECT_EQ(result_size, expected_outputs.size());
+
+    auto result         = this->join(left, right, predicate);
+    auto result_indices = cudf::detail::make_std_vector_sync(*result, cudf::get_default_stream());
+    std::sort(result_indices.begin(), result_indices.end());
     std::sort(expected_outputs.begin(), expected_outputs.end());
-    EXPECT_TRUE(
-      std::equal(resulting_indices.begin(), resulting_indices.end(), expected_outputs.begin()));
+    EXPECT_TRUE(std::equal(result_indices.begin(),
+                           result_indices.end(),
+                           expected_outputs.begin(),
+                           expected_outputs.end()));
   }
 
   void _compare_to_hash_join(std::unique_ptr<rmm::device_uvector<cudf::size_type>> const& result,
@@ -826,6 +848,16 @@ struct ConditionalLeftSemiJoinTest : public ConditionalJoinSingleReturnTest<T> {
 
 TYPED_TEST_SUITE(ConditionalLeftSemiJoinTest, cudf::test::IntegralTypesNotBool);
 
+TYPED_TEST(ConditionalLeftSemiJoinTest, TestOneColumnLeftEmpty)
+{
+  this->test({{}}, {{3, 4, 5}}, left_zero_eq_right_zero, {});
+};
+
+TYPED_TEST(ConditionalLeftSemiJoinTest, TestOneColumnRightEmpty)
+{
+  this->test({{3, 4, 5}}, {{}}, left_zero_eq_right_zero, {});
+};
+
 TYPED_TEST(ConditionalLeftSemiJoinTest, TestTwoColumnThreeRowSomeEqual)
 {
   this->test({{0, 1, 2}, {10, 20, 30}}, {{0, 1, 3}, {30, 40, 50}}, left_zero_eq_right_zero, {0, 1});
@@ -873,6 +905,16 @@ struct ConditionalLeftAntiJoinTest : public ConditionalJoinSingleReturnTest<T> {
 
 TYPED_TEST_SUITE(ConditionalLeftAntiJoinTest, cudf::test::IntegralTypesNotBool);
 
+TYPED_TEST(ConditionalLeftAntiJoinTest, TestOneColumnLeftEmpty)
+{
+  this->test({{}}, {{3, 4, 5}}, left_zero_eq_right_zero, {});
+};
+
+TYPED_TEST(ConditionalLeftAntiJoinTest, TestOneColumnRightEmpty)
+{
+  this->test({{3, 4, 5}}, {{}}, left_zero_eq_right_zero, {0, 1, 2});
+};
+
 TYPED_TEST(ConditionalLeftAntiJoinTest, TestTwoColumnThreeRowSomeEqual)
 {
   this->test({{0, 1, 2}, {10, 20, 30}}, {{0, 1, 3}, {30, 40, 50}}, left_zero_eq_right_zero, {2});

From 5efd72f64e3b1e25337c30ba0ab246051d3fe396 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 1 Jul 2024 07:37:12 -1000
Subject: [PATCH 440/842] Ensure cudf objects can astype to any type when empty
 (#16106)

pandas allows objects to `astype` to any other type if the object is empty. The PR mirrors that behavior for cudf.

This PR also more consistently uses `astype` instead of `as_*_column` and fixes a bug in `IntervalDtype.__eq__` discovered when writing a unit test for this bug.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16106
---
 python/cudf/cudf/core/column/column.py    |  9 ++++++
 python/cudf/cudf/core/column/datetime.py  | 36 +++++++++++----------
 python/cudf/cudf/core/column/decimal.py   |  2 +-
 python/cudf/cudf/core/column/interval.py  | 26 +++++++--------
 python/cudf/cudf/core/column/timedelta.py | 34 +++++++++++---------
 python/cudf/cudf/core/dataframe.py        |  2 +-
 python/cudf/cudf/core/dtypes.py           |  2 +-
 python/cudf/cudf/core/frame.py            |  4 +--
 python/cudf/cudf/core/indexing_utils.py   |  2 +-
 python/cudf/cudf/core/series.py           |  8 +++--
 python/cudf/cudf/core/tools/numeric.py    | 14 ++++----
 python/cudf/cudf/tests/test_interval.py   |  6 ++++
 python/cudf/cudf/tests/test_series.py     | 39 +++++++++++++++++++++++
 13 files changed, 121 insertions(+), 63 deletions(-)

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 5db6fd904a9..e7a2863da8c 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -959,6 +959,15 @@ def can_cast_safely(self, to_dtype: Dtype) -> bool:
         raise NotImplementedError()
 
     def astype(self, dtype: Dtype, copy: bool = False) -> ColumnBase:
+        if len(self) == 0:
+            dtype = cudf.dtype(dtype)
+            if self.dtype == dtype:
+                if copy:
+                    return self.copy()
+                else:
+                    return self
+            else:
+                return column_empty(0, dtype=dtype, masked=self.nullable)
         if copy:
             col = self.copy()
         else:
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index 121076b69ce..c10aceba9f4 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -280,8 +280,8 @@ def __contains__(self, item: ScalarLike) -> bool:
             return False
         elif ts.tzinfo is not None:
             ts = ts.tz_convert(None)
-        return ts.to_numpy().astype("int64") in self.as_numerical_column(
-            "int64"
+        return ts.to_numpy().astype("int64") in cast(
+            "cudf.core.column.NumericalColumn", self.astype("int64")
         )
 
     @functools.cached_property
@@ -503,9 +503,9 @@ def mean(
         self, skipna=None, min_count: int = 0, dtype=np.float64
     ) -> ScalarLike:
         return pd.Timestamp(
-            self.as_numerical_column("int64").mean(
-                skipna=skipna, min_count=min_count, dtype=dtype
-            ),
+            cast(
+                "cudf.core.column.NumericalColumn", self.astype("int64")
+            ).mean(skipna=skipna, min_count=min_count, dtype=dtype),
             unit=self.time_unit,
         ).as_unit(self.time_unit)
 
@@ -517,7 +517,7 @@ def std(
         ddof: int = 1,
     ) -> pd.Timedelta:
         return pd.Timedelta(
-            self.as_numerical_column("int64").std(
+            cast("cudf.core.column.NumericalColumn", self.astype("int64")).std(
                 skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof
             )
             * _unit_to_nanoseconds_conversion[self.time_unit],
@@ -525,7 +525,9 @@ def std(
 
     def median(self, skipna: bool | None = None) -> pd.Timestamp:
         return pd.Timestamp(
-            self.as_numerical_column("int64").median(skipna=skipna),
+            cast(
+                "cudf.core.column.NumericalColumn", self.astype("int64")
+            ).median(skipna=skipna),
             unit=self.time_unit,
         ).as_unit(self.time_unit)
 
@@ -534,18 +536,18 @@ def cov(self, other: DatetimeColumn) -> float:
             raise TypeError(
                 f"cannot perform cov with types {self.dtype}, {other.dtype}"
             )
-        return self.as_numerical_column("int64").cov(
-            other.as_numerical_column("int64")
-        )
+        return cast(
+            "cudf.core.column.NumericalColumn", self.astype("int64")
+        ).cov(cast("cudf.core.column.NumericalColumn", other.astype("int64")))
 
     def corr(self, other: DatetimeColumn) -> float:
         if not isinstance(other, DatetimeColumn):
             raise TypeError(
                 f"cannot perform corr with types {self.dtype}, {other.dtype}"
             )
-        return self.as_numerical_column("int64").corr(
-            other.as_numerical_column("int64")
-        )
+        return cast(
+            "cudf.core.column.NumericalColumn", self.astype("int64")
+        ).corr(cast("cudf.core.column.NumericalColumn", other.astype("int64")))
 
     def quantile(
         self,
@@ -554,7 +556,7 @@ def quantile(
         exact: bool,
         return_scalar: bool,
     ) -> ColumnBase:
-        result = self.as_numerical_column("int64").quantile(
+        result = self.astype("int64").quantile(
             q=q,
             interpolation=interpolation,
             exact=exact,
@@ -645,12 +647,12 @@ def indices_of(
     ) -> cudf.core.column.NumericalColumn:
         value = column.as_column(
             pd.to_datetime(value), dtype=self.dtype
-        ).as_numerical_column("int64")
-        return self.as_numerical_column("int64").indices_of(value)
+        ).astype("int64")
+        return self.astype("int64").indices_of(value)
 
     @property
     def is_unique(self) -> bool:
-        return self.as_numerical_column("int64").is_unique
+        return self.astype("int64").is_unique
 
     def isin(self, values: Sequence) -> ColumnBase:
         return cudf.core.tools.datetimes._isin_datetimelike(self, values)
diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py
index d66908b5f94..3e238d65cff 100644
--- a/python/cudf/cudf/core/column/decimal.py
+++ b/python/cudf/cudf/core/column/decimal.py
@@ -157,7 +157,7 @@ def normalize_binop_value(self, other):
                         "Decimal columns only support binary operations with "
                         "integer numerical columns."
                     )
-                other = other.as_decimal_column(
+                other = other.astype(
                     self.dtype.__class__(self.dtype.__class__.MAX_PRECISION, 0)
                 )
             elif not isinstance(other, DecimalBaseColumn):
diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py
index f24ca3fdad1..d09a1f66539 100644
--- a/python/cudf/cudf/core/column/interval.py
+++ b/python/cudf/cudf/core/column/interval.py
@@ -4,7 +4,7 @@
 
 import cudf
 from cudf.core.column import StructColumn
-from cudf.core.dtypes import CategoricalDtype, IntervalDtype
+from cudf.core.dtypes import IntervalDtype
 
 
 class IntervalColumn(StructColumn):
@@ -87,20 +87,16 @@ def copy(self, deep=True):
 
     def as_interval_column(self, dtype):
         if isinstance(dtype, IntervalDtype):
-            if isinstance(self.dtype, CategoricalDtype):
-                new_struct = self._get_decategorized_column()
-                return IntervalColumn.from_struct_column(new_struct)
-            else:
-                return IntervalColumn(
-                    size=self.size,
-                    dtype=dtype,
-                    mask=self.mask,
-                    offset=self.offset,
-                    null_count=self.null_count,
-                    children=tuple(
-                        child.astype(dtype.subtype) for child in self.children
-                    ),
-                )
+            return IntervalColumn(
+                size=self.size,
+                dtype=dtype,
+                mask=self.mask,
+                offset=self.offset,
+                null_count=self.null_count,
+                children=tuple(
+                    child.astype(dtype.subtype) for child in self.children
+                ),
+            )
         else:
             raise ValueError("dtype must be IntervalDtype")
 
diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py
index 8f41bcb6422..5a0171bbbdc 100644
--- a/python/cudf/cudf/core/column/timedelta.py
+++ b/python/cudf/cudf/core/column/timedelta.py
@@ -107,7 +107,9 @@ def __contains__(self, item: DatetimeLikeScalar) -> bool:
             # np.timedelta64 raises ValueError, hence `item`
             # cannot exist in `self`.
             return False
-        return item.view("int64") in self.as_numerical_column("int64")
+        return item.view("int64") in cast(
+            "cudf.core.column.NumericalColumn", self.astype("int64")
+        )
 
     @property
     def values(self):
@@ -132,9 +134,7 @@ def to_arrow(self) -> pa.Array:
                 self.mask_array_view(mode="read").copy_to_host()
             )
         data = pa.py_buffer(
-            self.as_numerical_column("int64")
-            .data_array_view(mode="read")
-            .copy_to_host()
+            self.astype("int64").data_array_view(mode="read").copy_to_host()
         )
         pa_dtype = np_to_pa_dtype(self.dtype)
         return pa.Array.from_buffers(
@@ -295,13 +295,17 @@ def as_timedelta_column(
 
     def mean(self, skipna=None, dtype: Dtype = np.float64) -> pd.Timedelta:
         return pd.Timedelta(
-            self.as_numerical_column("int64").mean(skipna=skipna, dtype=dtype),
+            cast(
+                "cudf.core.column.NumericalColumn", self.astype("int64")
+            ).mean(skipna=skipna, dtype=dtype),
             unit=self.time_unit,
         ).as_unit(self.time_unit)
 
     def median(self, skipna: bool | None = None) -> pd.Timedelta:
         return pd.Timedelta(
-            self.as_numerical_column("int64").median(skipna=skipna),
+            cast(
+                "cudf.core.column.NumericalColumn", self.astype("int64")
+            ).median(skipna=skipna),
             unit=self.time_unit,
         ).as_unit(self.time_unit)
 
@@ -315,7 +319,7 @@ def quantile(
         exact: bool,
         return_scalar: bool,
     ) -> ColumnBase:
-        result = self.as_numerical_column("int64").quantile(
+        result = self.astype("int64").quantile(
             q=q,
             interpolation=interpolation,
             exact=exact,
@@ -337,7 +341,7 @@ def sum(
             # Since sum isn't overridden in Numerical[Base]Column, mypy only
             # sees the signature from Reducible (which doesn't have the extra
             # parameters from ColumnBase._reduce) so we have to ignore this.
-            self.as_numerical_column("int64").sum(  # type: ignore
+            self.astype("int64").sum(  # type: ignore
                 skipna=skipna, min_count=min_count, dtype=dtype
             ),
             unit=self.time_unit,
@@ -351,7 +355,7 @@ def std(
         ddof: int = 1,
     ) -> pd.Timedelta:
         return pd.Timedelta(
-            self.as_numerical_column("int64").std(
+            cast("cudf.core.column.NumericalColumn", self.astype("int64")).std(
                 skipna=skipna, min_count=min_count, ddof=ddof, dtype=dtype
             ),
             unit=self.time_unit,
@@ -362,18 +366,18 @@ def cov(self, other: TimeDeltaColumn) -> float:
             raise TypeError(
                 f"cannot perform cov with types {self.dtype}, {other.dtype}"
             )
-        return self.as_numerical_column("int64").cov(
-            other.as_numerical_column("int64")
-        )
+        return cast(
+            "cudf.core.column.NumericalColumn", self.astype("int64")
+        ).cov(cast("cudf.core.column.NumericalColumn", other.astype("int64")))
 
     def corr(self, other: TimeDeltaColumn) -> float:
         if not isinstance(other, TimeDeltaColumn):
             raise TypeError(
                 f"cannot perform corr with types {self.dtype}, {other.dtype}"
             )
-        return self.as_numerical_column("int64").corr(
-            other.as_numerical_column("int64")
-        )
+        return cast(
+            "cudf.core.column.NumericalColumn", self.astype("int64")
+        ).corr(cast("cudf.core.column.NumericalColumn", other.astype("int64")))
 
     def components(self) -> dict[str, ColumnBase]:
         """
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 4dfeb68b7ba..b249410c2e4 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -2404,7 +2404,7 @@ def scatter_by_map(
         if isinstance(map_index, cudf.core.column.StringColumn):
             cat_index = cast(
                 cudf.core.column.CategoricalColumn,
-                map_index.as_categorical_column("category"),
+                map_index.astype("category"),
             )
             map_index = cat_index.codes
             warnings.warn(
diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index 034849d0e71..de715191c08 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -937,7 +937,7 @@ def to_pandas(self) -> pd.IntervalDtype:
     def __eq__(self, other):
         if isinstance(other, str):
             # This means equality isn't transitive but mimics pandas
-            return other == self.name
+            return other in (self.name, str(self))
         return (
             type(self) == type(other)
             and self.subtype == other.subtype
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 9bac75dc6ac..253d200f7d4 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -927,7 +927,7 @@ def from_arrow(cls, data: pa.Table) -> Self:
                 # of column is 0 (i.e., empty) then we will have an
                 # int8 column in result._data[name] returned by libcudf,
                 # which needs to be type-casted to 'category' dtype.
-                result[name] = result[name].as_categorical_column("category")
+                result[name] = result[name].astype("category")
             elif (
                 pandas_dtypes.get(name) == "empty"
                 and np_dtypes.get(name) == "object"
@@ -936,7 +936,7 @@ def from_arrow(cls, data: pa.Table) -> Self:
                 # is specified as 'empty' and np_dtypes as 'object',
                 # hence handling this special case to type-cast the empty
                 # float column to str column.
-                result[name] = result[name].as_string_column(cudf.dtype("str"))
+                result[name] = result[name].astype(cudf.dtype("str"))
             elif name in data.column_names and isinstance(
                 data[name].type,
                 (
diff --git a/python/cudf/cudf/core/indexing_utils.py b/python/cudf/cudf/core/indexing_utils.py
index 73a1cd26367..a5fed02cbed 100644
--- a/python/cudf/cudf/core/indexing_utils.py
+++ b/python/cudf/cudf/core/indexing_utils.py
@@ -229,7 +229,7 @@ def parse_row_iloc_indexer(key: Any, n: int) -> IndexingSpec:
     else:
         key = cudf.core.column.as_column(key)
         if isinstance(key, cudf.core.column.CategoricalColumn):
-            key = key.as_numerical_column(key.codes.dtype)
+            key = key.astype(key.codes.dtype)
         if is_bool_dtype(key.dtype):
             return MaskIndexer(BooleanMask(key, n))
         elif len(key) == 0:
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 97b6bbec2d4..4a60470fafa 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -3107,10 +3107,12 @@ def value_counts(
         # Pandas returns an IntervalIndex as the index of res
         # this condition makes sure we do too if bins is given
         if bins is not None and len(res) == len(res.index.categories):
-            int_index = IntervalColumn.as_interval_column(
-                res.index._column, res.index.categories.dtype
+            interval_col = IntervalColumn.from_struct_column(
+                res.index._column._get_decategorized_column()
+            )
+            res.index = cudf.IntervalIndex._from_data(
+                {res.index.name: interval_col}
             )
-            res.index = int_index
         res.name = result_name
         return res
 
diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py
index 68b23f1e059..ef6b86a04a7 100644
--- a/python/cudf/cudf/core/tools/numeric.py
+++ b/python/cudf/cudf/core/tools/numeric.py
@@ -115,11 +115,11 @@ def to_numeric(arg, errors="raise", downcast=None):
     dtype = col.dtype
 
     if is_datetime_dtype(dtype) or is_timedelta_dtype(dtype):
-        col = col.as_numerical_column(cudf.dtype("int64"))
+        col = col.astype(cudf.dtype("int64"))
     elif isinstance(dtype, CategoricalDtype):
         cat_dtype = col.dtype.type
         if _is_non_decimal_numeric_dtype(cat_dtype):
-            col = col.as_numerical_column(cat_dtype)
+            col = col.astype(cat_dtype)
         else:
             try:
                 col = _convert_str_col(
@@ -146,8 +146,8 @@ def to_numeric(arg, errors="raise", downcast=None):
         raise ValueError("Unrecognized datatype")
 
     # str->float conversion may require lower precision
-    if col.dtype == cudf.dtype("f"):
-        col = col.as_numerical_column("d")
+    if col.dtype == cudf.dtype("float32"):
+        col = col.astype("float64")
 
     if downcast:
         if downcast == "float":
@@ -205,7 +205,7 @@ def _convert_str_col(col, errors, _downcast=None):
 
     is_integer = libstrings.is_integer(col)
     if is_integer.all():
-        return col.as_numerical_column(dtype=cudf.dtype("i8"))
+        return col.astype(dtype=cudf.dtype("i8"))
 
     col = _proc_inf_empty_strings(col)
 
@@ -218,9 +218,9 @@ def _convert_str_col(col, errors, _downcast=None):
                     "limited by float32 precision."
                 )
             )
-            return col.as_numerical_column(dtype=cudf.dtype("f"))
+            return col.astype(dtype=cudf.dtype("float32"))
         else:
-            return col.as_numerical_column(dtype=cudf.dtype("d"))
+            return col.astype(dtype=cudf.dtype("float64"))
     else:
         if errors == "coerce":
             col = libcudf.string_casting.stod(col)
diff --git a/python/cudf/cudf/tests/test_interval.py b/python/cudf/cudf/tests/test_interval.py
index 1b395c09ba8..5eeea87d8e0 100644
--- a/python/cudf/cudf/tests/test_interval.py
+++ b/python/cudf/cudf/tests/test_interval.py
@@ -188,3 +188,9 @@ def test_from_pandas_intervaldtype():
     result = cudf.from_pandas(dtype)
     expected = cudf.IntervalDtype("int64", closed="left")
     assert_eq(result, expected)
+
+
+def test_intervaldtype_eq_string_with_attributes():
+    dtype = cudf.IntervalDtype("int64", closed="left")
+    assert dtype == "interval"
+    assert dtype == "interval[int64, left]"
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index 467d0c46ae7..f2501041f25 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -2873,3 +2873,42 @@ def test_nunique_all_null(dropna):
     result = pd_ser.nunique(dropna=dropna)
     expected = cudf_ser.nunique(dropna=dropna)
     assert result == expected
+
+
+@pytest.mark.parametrize(
+    "type1",
+    [
+        "category",
+        "interval[int64, right]",
+        "int64",
+        "float64",
+        "str",
+        "datetime64[ns]",
+        "timedelta64[ns]",
+    ],
+)
+@pytest.mark.parametrize(
+    "type2",
+    [
+        "category",
+        "interval[int64, right]",
+        "int64",
+        "float64",
+        "str",
+        "datetime64[ns]",
+        "timedelta64[ns]",
+    ],
+)
+@pytest.mark.parametrize(
+    "as_dtype", [lambda x: x, cudf.dtype], ids=["string", "object"]
+)
+@pytest.mark.parametrize("copy", [True, False])
+def test_empty_astype_always_castable(type1, type2, as_dtype, copy):
+    ser = cudf.Series([], dtype=as_dtype(type1))
+    result = ser.astype(as_dtype(type2), copy=copy)
+    expected = cudf.Series([], dtype=as_dtype(type2))
+    assert_eq(result, expected)
+    if not copy and cudf.dtype(type1) == cudf.dtype(type2):
+        assert ser._column is result._column
+    else:
+        assert ser._column is not result._column

From e41242094092f9ed31fd4d04f8a30107c1ffb2ff Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Mon, 1 Jul 2024 11:24:52 -0700
Subject: [PATCH 441/842] Backport #16038 to 24.06 (#16101)

Backporting #16038 for a patch release.

---------

Co-authored-by: Paul Mattione <156858817+pmattione-nvidia@users.noreply.github.com>
---
 cpp/include/cudf/ast/detail/operators.hpp | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/cpp/include/cudf/ast/detail/operators.hpp b/cpp/include/cudf/ast/detail/operators.hpp
index b618f33a6e5..c483d459833 100644
--- a/cpp/include/cudf/ast/detail/operators.hpp
+++ b/cpp/include/cudf/ast/detail/operators.hpp
@@ -17,6 +17,7 @@
 
 #include <cudf/ast/expressions.hpp>
 #include <cudf/types.hpp>
+#include <cudf/unary.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
@@ -819,7 +820,17 @@ struct operator_functor<ast_operator::NOT, false> {
 template <typename To>
 struct cast {
   static constexpr auto arity{1};
-  template <typename From>
+  template <typename From, typename std::enable_if_t<is_fixed_point<From>()>* = nullptr>
+  __device__ inline auto operator()(From f) -> To
+  {
+    if constexpr (cuda::std::is_floating_point_v<To>) {
+      return convert_fixed_to_floating<To>(f);
+    } else {
+      return static_cast<To>(f);
+    }
+  }
+
+  template <typename From, typename cuda::std::enable_if_t<!is_fixed_point<From>()>* = nullptr>
   __device__ inline auto operator()(From f) -> decltype(static_cast<To>(f))
   {
     return static_cast<To>(f);

From b691b1c1cd99a5721230ac8db2afa8ad99835b9c Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Mon, 1 Jul 2024 14:25:11 -0400
Subject: [PATCH 442/842] Add stream parameter to
 cudf::io::text::multibyte_split (#16034)

Adds stream support the `cudf::io::text::multibyte_split` API.
Also adds a stream test and deprecates an overloaded API.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Mark Harris (https://github.com/harrism)
  - Karthikeyan (https://github.com/karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/16034
---
 cpp/include/cudf/io/text/byte_range_info.hpp  | 15 +++-
 .../cudf/io/text/data_chunk_source.hpp        | 10 ++-
 cpp/include/cudf/io/text/multibyte_split.hpp  | 27 ++++++-
 cpp/src/io/text/multibyte_split.cu            | 19 ++---
 cpp/tests/CMakeLists.txt                      |  1 +
 cpp/tests/io/text/multibyte_split_test.cpp    | 81 ++++++++++++-------
 cpp/tests/streams/io/multibyte_split_test.cpp | 36 +++++++++
 docs/cudf/source/conf.py                      |  2 +-
 8 files changed, 141 insertions(+), 50 deletions(-)
 create mode 100644 cpp/tests/streams/io/multibyte_split_test.cpp

diff --git a/cpp/include/cudf/io/text/byte_range_info.hpp b/cpp/include/cudf/io/text/byte_range_info.hpp
index 0086432d003..60ee867f058 100644
--- a/cpp/include/cudf/io/text/byte_range_info.hpp
+++ b/cpp/include/cudf/io/text/byte_range_info.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,17 +24,22 @@
 namespace cudf {
 namespace io {
 namespace text {
+/**
+ * @addtogroup io_readers
+ * @{
+ * @file
+ */
 
 /**
  * @brief stores offset and size used to indicate a byte range
  */
 class byte_range_info {
  private:
-  int64_t _offset;  ///< offset in bytes
-  int64_t _size;    ///< size in bytes
+  int64_t _offset{};  ///< offset in bytes
+  int64_t _size{};    ///< size in bytes
 
  public:
-  constexpr byte_range_info() noexcept : _offset(0), _size(0) {}
+  constexpr byte_range_info() = default;
   /**
    * @brief Constructs a byte_range_info object
    *
@@ -104,6 +109,8 @@ std::vector<byte_range_info> create_byte_range_infos_consecutive(int64_t total_b
  */
 byte_range_info create_byte_range_info_max();
 
+/** @} */  // end of group
+
 }  // namespace text
 }  // namespace io
 }  // namespace cudf
diff --git a/cpp/include/cudf/io/text/data_chunk_source.hpp b/cpp/include/cudf/io/text/data_chunk_source.hpp
index 28204c82780..13aff4b3b8f 100644
--- a/cpp/include/cudf/io/text/data_chunk_source.hpp
+++ b/cpp/include/cudf/io/text/data_chunk_source.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -25,6 +25,12 @@ namespace cudf {
 namespace io {
 namespace text {
 
+/**
+ * @addtogroup io_readers
+ * @{
+ * @file
+ */
+
 /**
  * @brief A contract guaranteeing stream-ordered memory access to the underlying device data.
  *
@@ -110,6 +116,8 @@ class data_chunk_source {
   [[nodiscard]] virtual std::unique_ptr<data_chunk_reader> create_reader() const = 0;
 };
 
+/** @} */  // end of group
+
 }  // namespace text
 }  // namespace io
 }  // namespace cudf
diff --git a/cpp/include/cudf/io/text/multibyte_split.hpp b/cpp/include/cudf/io/text/multibyte_split.hpp
index 7abae7c754b..e29ab78ae46 100644
--- a/cpp/include/cudf/io/text/multibyte_split.hpp
+++ b/cpp/include/cudf/io/text/multibyte_split.hpp
@@ -30,6 +30,11 @@
 namespace cudf {
 namespace io {
 namespace text {
+/**
+ * @addtogroup io_readers
+ * @{
+ * @file
+ */
 
 /**
  * @brief Parsing options for multibyte_split.
@@ -79,6 +84,7 @@ struct parse_options {
  * @param source The source string
  * @param delimiter UTF-8 encoded string for which to find offsets in the source
  * @param options the parsing options to use (including byte range)
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Memory resource to use for the device memory allocation
  * @return The strings found by splitting the source by the delimiter within the relevant byte
  * range.
@@ -87,17 +93,30 @@ std::unique_ptr<cudf::column> multibyte_split(
   data_chunk_source const& source,
   std::string const& delimiter,
   parse_options options             = {},
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
-std::unique_ptr<cudf::column> multibyte_split(
+/**
+ * @brief Splits the source text into a strings column using a multiple byte delimiter.
+ *
+ * @deprecated Since 24.08
+ *
+ * @param source The source input data encoded in UTF-8
+ * @param delimiter UTF-8 encoded string for which to find offsets in the source
+ * @param byte_range The position and size within `source` to produce the column from
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Memory resource to use for the device memory allocation
+ * @return The strings found by splitting the source by the delimiter within the relevant byte
+ * range.
+ */
+[[deprecated]] std::unique_ptr<cudf::column> multibyte_split(
   data_chunk_source const& source,
   std::string const& delimiter,
   std::optional<byte_range_info> byte_range,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
-std::unique_ptr<cudf::column> multibyte_split(data_chunk_source const& source,
-                                              std::string const& delimiter,
-                                              rmm::device_async_resource_ref mr);
+/** @} */  // end of group
 
 }  // namespace text
 }  // namespace io
diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu
index 9c406369068..51dc0ca90af 100644
--- a/cpp/src/io/text/multibyte_split.cu
+++ b/cpp/src/io/text/multibyte_split.cu
@@ -565,35 +565,32 @@ std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source
 
 }  // namespace detail
 
+// deprecated in 24.08
 std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source const& source,
                                               std::string const& delimiter,
                                               std::optional<byte_range_info> byte_range,
+                                              rmm::cuda_stream_view stream,
                                               rmm::device_async_resource_ref mr)
 {
-  return multibyte_split(
-    source, delimiter, parse_options{byte_range.value_or(create_byte_range_info_max())}, mr);
+  return multibyte_split(source,
+                         delimiter,
+                         parse_options{byte_range.value_or(create_byte_range_info_max())},
+                         stream,
+                         mr);
 }
 
 std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source const& source,
                                               std::string const& delimiter,
                                               parse_options options,
+                                              rmm::cuda_stream_view stream,
                                               rmm::device_async_resource_ref mr)
 {
-  auto stream = cudf::get_default_stream();
-
   auto result = detail::multibyte_split(
     source, delimiter, options.byte_range, options.strip_delimiters, stream, mr);
 
   return result;
 }
 
-std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source const& source,
-                                              std::string const& delimiter,
-                                              rmm::device_async_resource_ref mr)
-{
-  return multibyte_split(source, delimiter, parse_options{}, mr);
-}
-
 }  // namespace text
 }  // namespace io
 }  // namespace cudf
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index eef09954647..244bcb7d897 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -691,6 +691,7 @@ ConfigureTest(STREAM_INTEROP_TEST streams/interop_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_JSONIO_TEST streams/io/json_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_LABELING_BINS_TEST streams/labeling_bins_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_LISTS_TEST streams/lists_test.cpp STREAM_MODE testing)
+ConfigureTest(STREAM_MULTIBYTE_SPLIT_TEST streams/io/multibyte_split_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_NULL_MASK_TEST streams/null_mask_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_ORCIO_TEST streams/io/orc_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_PARQUETIO_TEST streams/io/parquet_test.cpp STREAM_MODE testing)
diff --git a/cpp/tests/io/text/multibyte_split_test.cpp b/cpp/tests/io/text/multibyte_split_test.cpp
index 36338253c9b..408d54bd5ff 100644
--- a/cpp/tests/io/text/multibyte_split_test.cpp
+++ b/cpp/tests/io/text/multibyte_split_test.cpp
@@ -97,10 +97,9 @@ TEST_F(MultibyteSplitTest, DelimiterAtEndByteRange)
   auto expected = strings_column_wrapper{"abcdefg:"};
 
   auto source = cudf::io::text::make_source(host_input);
-  auto out    = cudf::io::text::multibyte_split(
-    *source,
-    delimiter,
-    cudf::io::text::byte_range_info{0, static_cast<int64_t>(host_input.size())});
+  cudf::io::text::parse_options options{
+    cudf::io::text::byte_range_info{0, static_cast<int64_t>(host_input.size())}};
+  auto out = cudf::io::text::multibyte_split(*source, delimiter, options);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out);
 }
@@ -113,10 +112,9 @@ TEST_F(MultibyteSplitTest, DelimiterAtEndByteRange2)
   auto expected = strings_column_wrapper{"abcdefg:"};
 
   auto source = cudf::io::text::make_source(host_input);
-  auto out    = cudf::io::text::multibyte_split(
-    *source,
-    delimiter,
-    cudf::io::text::byte_range_info{0, static_cast<int64_t>(host_input.size() - 1)});
+  cudf::io::text::parse_options options{
+    cudf::io::text::byte_range_info{0, static_cast<int64_t>(host_input.size() - 1)}};
+  auto out = cudf::io::text::multibyte_split(*source, delimiter, options);
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out);
 }
@@ -277,9 +275,12 @@ TEST_F(MultibyteSplitTest, LargeInputMultipleRange)
   auto source    = cudf::io::text::make_source(host_input);
 
   auto byte_ranges = cudf::io::text::create_byte_range_infos_consecutive(host_input.size(), 3);
-  auto out0        = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[0]);
-  auto out1        = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[1]);
-  auto out2        = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[2]);
+  auto out0        = cudf::io::text::multibyte_split(
+    *source, delimiter, cudf::io::text::parse_options{byte_ranges[0]});
+  auto out1 = cudf::io::text::multibyte_split(
+    *source, delimiter, cudf::io::text::parse_options{byte_ranges[1]});
+  auto out2 = cudf::io::text::multibyte_split(
+    *source, delimiter, cudf::io::text::parse_options{byte_ranges[2]});
 
   auto out_views = std::vector<cudf::column_view>({out0->view(), out1->view(), out2->view()});
   auto out       = cudf::concatenate(out_views);
@@ -303,9 +304,12 @@ TEST_F(MultibyteSplitTest, LargeInputSparseMultipleRange)
   auto source                           = cudf::io::text::make_source(host_input);
 
   auto byte_ranges = cudf::io::text::create_byte_range_infos_consecutive(host_input.size(), 3);
-  auto out0        = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[0]);
-  auto out1        = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[1]);
-  auto out2        = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[2]);
+  auto out0        = cudf::io::text::multibyte_split(
+    *source, delimiter, cudf::io::text::parse_options{byte_ranges[0]});
+  auto out1 = cudf::io::text::multibyte_split(
+    *source, delimiter, cudf::io::text::parse_options{byte_ranges[1]});
+  auto out2 = cudf::io::text::multibyte_split(
+    *source, delimiter, cudf::io::text::parse_options{byte_ranges[2]});
 
   auto out_views = std::vector<cudf::column_view>({out0->view(), out1->view(), out2->view()});
   auto out       = cudf::concatenate(out_views);
@@ -327,9 +331,12 @@ TEST_F(MultibyteSplitTest, LargeInputMultipleRangeSingleByte)
   auto source    = cudf::io::text::make_source(host_input);
 
   auto byte_ranges = cudf::io::text::create_byte_range_infos_consecutive(host_input.size(), 3);
-  auto out0        = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[0]);
-  auto out1        = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[1]);
-  auto out2        = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[2]);
+  auto out0        = cudf::io::text::multibyte_split(
+    *source, delimiter, cudf::io::text::parse_options{byte_ranges[0]});
+  auto out1 = cudf::io::text::multibyte_split(
+    *source, delimiter, cudf::io::text::parse_options{byte_ranges[1]});
+  auto out2 = cudf::io::text::multibyte_split(
+    *source, delimiter, cudf::io::text::parse_options{byte_ranges[2]});
 
   auto out_views = std::vector<cudf::column_view>({out0->view(), out1->view(), out2->view()});
   auto out       = cudf::concatenate(out_views);
@@ -352,9 +359,12 @@ TEST_F(MultibyteSplitTest, LargeInputSparseMultipleRangeSingleByte)
   auto source                       = cudf::io::text::make_source(host_input);
 
   auto byte_ranges = cudf::io::text::create_byte_range_infos_consecutive(host_input.size(), 3);
-  auto out0        = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[0]);
-  auto out1        = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[1]);
-  auto out2        = cudf::io::text::multibyte_split(*source, delimiter, byte_ranges[2]);
+  auto out0        = cudf::io::text::multibyte_split(
+    *source, delimiter, cudf::io::text::parse_options{byte_ranges[0]});
+  auto out1 = cudf::io::text::multibyte_split(
+    *source, delimiter, cudf::io::text::parse_options{byte_ranges[1]});
+  auto out2 = cudf::io::text::multibyte_split(
+    *source, delimiter, cudf::io::text::parse_options{byte_ranges[2]});
 
   auto out_views = std::vector<cudf::column_view>({out0->view(), out1->view(), out2->view()});
   auto out       = cudf::concatenate(out_views);
@@ -383,9 +393,14 @@ TEST_F(MultibyteSplitTest, SmallInputAllPossibleRanges)
     SCOPED_TRACE(split1);
     for (int split2 = split1 + 1; split2 < size; split2++) {
       SCOPED_TRACE(split2);
-      auto out1 = multibyte_split(*source, delimiter, byte_range_info{0, split1});
-      auto out2 = multibyte_split(*source, delimiter, byte_range_info{split1, split2 - split1});
-      auto out3 = multibyte_split(*source, delimiter, byte_range_info{split2, size - split2});
+      auto out1 = multibyte_split(
+        *source, delimiter, cudf::io::text::parse_options{byte_range_info{0, split1}});
+      auto out2 =
+        multibyte_split(*source,
+                        delimiter,
+                        cudf::io::text::parse_options{byte_range_info{split1, split2 - split1}});
+      auto out3 = multibyte_split(
+        *source, delimiter, cudf::io::text::parse_options{byte_range_info{split2, size - split2}});
 
       auto out_views = std::vector<cudf::column_view>({out1->view(), out2->view(), out3->view()});
       auto out       = cudf::concatenate(out_views);
@@ -416,9 +431,14 @@ TEST_F(MultibyteSplitTest, SmallInputAllPossibleRangesSingleByte)
     SCOPED_TRACE(split1);
     for (int split2 = split1 + 1; split2 < size; split2++) {
       SCOPED_TRACE(split2);
-      auto out1 = multibyte_split(*source, delimiter, byte_range_info{0, split1});
-      auto out2 = multibyte_split(*source, delimiter, byte_range_info{split1, split2 - split1});
-      auto out3 = multibyte_split(*source, delimiter, byte_range_info{split2, size - split2});
+      auto out1 = multibyte_split(
+        *source, delimiter, cudf::io::text::parse_options{byte_range_info{0, split1}});
+      auto out2 =
+        multibyte_split(*source,
+                        delimiter,
+                        cudf::io::text::parse_options{byte_range_info{split1, split2 - split1}});
+      auto out3 = multibyte_split(
+        *source, delimiter, cudf::io::text::parse_options{byte_range_info{split2, size - split2}});
 
       auto out_views = std::vector<cudf::column_view>({out1->view(), out2->view(), out3->view()});
       auto out       = cudf::concatenate(out_views);
@@ -441,7 +461,8 @@ TEST_F(MultibyteSplitTest, SingletonRangeAtEnd)
   auto source     = make_source(host_input);
   auto expected   = strings_column_wrapper{};
 
-  auto out = multibyte_split(*source, delimiter, byte_range_info{5, 1});
+  auto out =
+    multibyte_split(*source, delimiter, cudf::io::text::parse_options{byte_range_info{5, 1}});
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, cudf::test::debug_output_level::ALL_ERRORS);
 }
@@ -480,7 +501,8 @@ TEST_F(MultibyteSplitTest, EmptyRange)
   auto source     = make_source(host_input);
   auto expected   = strings_column_wrapper{};
 
-  auto out = multibyte_split(*source, delimiter, byte_range_info{4, 0});
+  auto out =
+    multibyte_split(*source, delimiter, cudf::io::text::parse_options{byte_range_info{4, 0}});
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, cudf::test::debug_output_level::ALL_ERRORS);
 }
@@ -493,7 +515,8 @@ TEST_F(MultibyteSplitTest, EmptyRangeSingleByte)
   auto source     = make_source(host_input);
   auto expected   = strings_column_wrapper{};
 
-  auto out = multibyte_split(*source, delimiter, byte_range_info{3, 0});
+  auto out =
+    multibyte_split(*source, delimiter, cudf::io::text::parse_options{byte_range_info{3, 0}});
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *out, cudf::test::debug_output_level::ALL_ERRORS);
 }
diff --git a/cpp/tests/streams/io/multibyte_split_test.cpp b/cpp/tests/streams/io/multibyte_split_test.cpp
new file mode 100644
index 00000000000..b0eff1d3340
--- /dev/null
+++ b/cpp/tests/streams/io/multibyte_split_test.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/default_stream.hpp>
+
+#include <cudf/io/text/byte_range_info.hpp>
+#include <cudf/io/text/data_chunk_source_factories.hpp>
+#include <cudf/io/text/multibyte_split.hpp>
+
+#include <string>
+
+class MultibyteSplitTest : public cudf::test::BaseFixture {};
+
+TEST_F(MultibyteSplitTest, Reader)
+{
+  auto delimiter  = std::string(":");
+  auto host_input = std::string("abc:def");
+  auto source     = cudf::io::text::make_source(host_input);
+  cudf::io::text::parse_options options{};
+  auto result =
+    cudf::io::text::multibyte_split(*source, delimiter, options, cudf::test::get_default_stream());
+}
diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py
index 108f12bc099..c3c14ac8cad 100644
--- a/docs/cudf/source/conf.py
+++ b/docs/cudf/source/conf.py
@@ -372,7 +372,7 @@ def _generate_namespaces(namespaces):
 _all_namespaces = _generate_namespaces(
     {
         # Note that io::datasource is actually a nested class
-        "cudf": {"io", "io::datasource", "strings", "ast", "ast::expression"},
+        "cudf": {"io", "io::datasource", "strings", "ast", "ast::expression", "io::text"},
         "numeric": {},
         "nvtext": {},
     }

From 760c15cbd4231e4987149b3a5d68fdcd22654dce Mon Sep 17 00:00:00 2001
From: Kyle Edwards <kyedwards@nvidia.com>
Date: Mon, 1 Jul 2024 14:27:30 -0400
Subject: [PATCH 443/842] Use verify-alpha-spec hook (#16144)

With the deployment of rapids-build-backend, we need to make sure our dependencies have alpha specs.

Authors:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16144
---
 .pre-commit-config.yaml                          |  3 ++-
 conda/environments/all_cuda-118_arch-x86_64.yaml |  5 ++---
 conda/environments/all_cuda-122_arch-x86_64.yaml |  7 +++----
 dependencies.yaml                                | 10 +++++-----
 4 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index f8c4f4b9143..d0457d2c641 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -149,7 +149,7 @@ repos:
       - id: ruff-format
         files: python/.*$
   - repo: https://github.com/rapidsai/pre-commit-hooks
-    rev: v0.0.3
+    rev: v0.2.0
     hooks:
       - id: verify-copyright
         exclude: |
@@ -158,6 +158,7 @@ repos:
             cpp/src/io/parquet/ipc/Message_generated[.]h$|
             cpp/src/io/parquet/ipc/Schema_generated[.]h$
           )
+      - id: verify-alpha-spec
 
 default_language_version:
       python: python3
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 946e2d1cd32..cc9238ab80a 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -26,7 +26,6 @@ dependencies:
 - cupy>=12.0.0
 - cxx-compiler
 - cython>=3.0.3
-- dask-cuda==24.8.*
 - dask-cuda==24.8.*,>=0.0.0a0
 - dlpack>=0.8,<1.0
 - doxygen=1.9.1
@@ -44,10 +43,10 @@ dependencies:
 - libcufile=1.4.0.31
 - libcurand-dev=10.3.0.86
 - libcurand=10.3.0.86
-- libkvikio==24.8.*
+- libkvikio==24.8.*,>=0.0.0a0
 - libparquet==16.1.0.*
 - librdkafka>=1.9.0,<1.10.0a0
-- librmm==24.8.*
+- librmm==24.8.*,>=0.0.0a0
 - make
 - moto>=4.0.8
 - msgpack-python
diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml
index f069616ddbe..9fecd452248 100644
--- a/conda/environments/all_cuda-122_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-122_arch-x86_64.yaml
@@ -27,7 +27,6 @@ dependencies:
 - cupy>=12.0.0
 - cxx-compiler
 - cython>=3.0.3
-- dask-cuda==24.8.*
 - dask-cuda==24.8.*,>=0.0.0a0
 - dlpack>=0.8,<1.0
 - doxygen=1.9.1
@@ -43,10 +42,10 @@ dependencies:
 - libarrow==16.1.0.*
 - libcufile-dev
 - libcurand-dev
-- libkvikio==24.8.*
+- libkvikio==24.8.*,>=0.0.0a0
 - libparquet==16.1.0.*
 - librdkafka>=1.9.0,<1.10.0a0
-- librmm==24.8.*
+- librmm==24.8.*,>=0.0.0a0
 - make
 - moto>=4.0.8
 - msgpack-python
@@ -66,7 +65,7 @@ dependencies:
 - pre-commit
 - pyarrow==16.1.0.*
 - pydata-sphinx-theme!=0.14.2
-- pynvjitlink
+- pynvjitlink>=0.0.0a0
 - pytest-benchmark
 - pytest-cases>=3.8.2
 - pytest-cov
diff --git a/dependencies.yaml b/dependencies.yaml
index 38ec30a8033..9efbc47896c 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -287,8 +287,8 @@ dependencies:
       - output_types: conda
         packages:
           - fmt>=10.1.1,<11
-          - librmm==24.8.*
-          - libkvikio==24.8.*
+          - librmm==24.8.*,>=0.0.0a0
+          - libkvikio==24.8.*,>=0.0.0a0
           - librdkafka>=1.9.0,<1.10.0a0
           # Align nvcomp version with rapids-cmake
           - nvcomp==3.0.6
@@ -500,7 +500,7 @@ dependencies:
       - output_types: [conda]
         packages:
           - breathe>=4.35.0
-          - dask-cuda==24.8.*
+          - dask-cuda==24.8.*,>=0.0.0a0
           - *doxygen
           - make
           - myst-nb
@@ -582,7 +582,7 @@ dependencies:
         matrices:
           - matrix: {cuda: "12.*"}
             packages:
-              - pynvjitlink
+              - pynvjitlink>=0.0.0a0
           - matrix: {cuda: "11.*"}
             packages:
               - cubinlinker
@@ -592,7 +592,7 @@ dependencies:
           - matrix: {cuda: "12.*"}
             packages:
               - rmm-cu12==24.8.*,>=0.0.0a0
-              - pynvjitlink-cu12
+              - pynvjitlink-cu12>=0.0.0a0
           - matrix: {cuda: "11.*"}
             packages:
               - rmm-cu11==24.8.*,>=0.0.0a0

From dfab1b589e5907b324dc1688f6dab862d194012c Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Mon, 1 Jul 2024 15:33:42 -0500
Subject: [PATCH 444/842] Backport: Use size_t to allow large conditional joins
 (#16127) (#16133)

Backports #16127 to 24.06 for inclusion in a hotfix release.

---------

Co-authored-by: Vyas Ramasubramani <vyasr@nvidia.com>
---
 cpp/src/join/conditional_join.cu          |   5 +-
 cpp/src/join/conditional_join_kernels.cuh | 124 ++++++++++++++++++++--
 cpp/src/join/join_common_utils.cuh        |  95 -----------------
 3 files changed, 117 insertions(+), 107 deletions(-)

diff --git a/cpp/src/join/conditional_join.cu b/cpp/src/join/conditional_join.cu
index 97a06d5a923..d4ef2747c9d 100644
--- a/cpp/src/join/conditional_join.cu
+++ b/cpp/src/join/conditional_join.cu
@@ -95,7 +95,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> conditional_join_anti_semi(
     join_size = size.value(stream);
   }
 
-  rmm::device_scalar<size_type> write_index(0, stream);
+  rmm::device_scalar<std::size_t> write_index(0, stream);
 
   auto left_indices = std::make_unique<rmm::device_uvector<size_type>>(join_size, stream, mr);
 
@@ -232,13 +232,14 @@ conditional_join(table_view const& left,
                      std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr));
   }
 
-  rmm::device_scalar<size_type> write_index(0, stream);
+  rmm::device_scalar<std::size_t> write_index(0, stream);
 
   auto left_indices  = std::make_unique<rmm::device_uvector<size_type>>(join_size, stream, mr);
   auto right_indices = std::make_unique<rmm::device_uvector<size_type>>(join_size, stream, mr);
 
   auto const& join_output_l = left_indices->data();
   auto const& join_output_r = right_indices->data();
+
   if (has_nulls) {
     conditional_join<DEFAULT_JOIN_BLOCK_SIZE, DEFAULT_JOIN_CACHE_SIZE, true>
       <<<config.num_blocks, config.num_threads_per_block, shmem_size_per_block, stream.value()>>>(
diff --git a/cpp/src/join/conditional_join_kernels.cuh b/cpp/src/join/conditional_join_kernels.cuh
index 1e16c451f5a..62769862f54 100644
--- a/cpp/src/join/conditional_join_kernels.cuh
+++ b/cpp/src/join/conditional_join_kernels.cuh
@@ -29,6 +29,110 @@
 namespace cudf {
 namespace detail {
 
+/**
+ * @brief Adds a pair of indices to the shared memory cache
+ *
+ * @param[in] first The first index in the pair
+ * @param[in] second The second index in the pair
+ * @param[in,out] current_idx_shared Pointer to shared index that determines
+ * where in the shared memory cache the pair will be written
+ * @param[in] warp_id The ID of the warp of the calling the thread
+ * @param[out] joined_shared_l Pointer to the shared memory cache for left indices
+ * @param[out] joined_shared_r Pointer to the shared memory cache for right indices
+ */
+__inline__ __device__ void add_pair_to_cache(size_type const first,
+                                             size_type const second,
+                                             std::size_t* current_idx_shared,
+                                             int const warp_id,
+                                             size_type* joined_shared_l,
+                                             size_type* joined_shared_r)
+{
+  cuda::atomic_ref<std::size_t, cuda::thread_scope_block> ref{*(current_idx_shared + warp_id)};
+  std::size_t my_current_idx = ref.fetch_add(1, cuda::memory_order_relaxed);
+  // It's guaranteed to fit into the shared cache
+  joined_shared_l[my_current_idx] = first;
+  joined_shared_r[my_current_idx] = second;
+}
+
+__inline__ __device__ void add_left_to_cache(size_type const first,
+                                             std::size_t* current_idx_shared,
+                                             int const warp_id,
+                                             size_type* joined_shared_l)
+{
+  cuda::atomic_ref<std::size_t, cuda::thread_scope_block> ref{*(current_idx_shared + warp_id)};
+  std::size_t my_current_idx      = ref.fetch_add(1, cuda::memory_order_relaxed);
+  joined_shared_l[my_current_idx] = first;
+}
+
+template <int num_warps, cudf::size_type output_cache_size>
+__device__ void flush_output_cache(unsigned int const activemask,
+                                   std::size_t const max_size,
+                                   int const warp_id,
+                                   int const lane_id,
+                                   std::size_t* current_idx,
+                                   std::size_t current_idx_shared[num_warps],
+                                   size_type join_shared_l[num_warps][output_cache_size],
+                                   size_type join_shared_r[num_warps][output_cache_size],
+                                   size_type* join_output_l,
+                                   size_type* join_output_r)
+{
+  // count how many active threads participating here which could be less than warp_size
+  int const num_threads     = __popc(activemask);
+  std::size_t output_offset = 0;
+
+  if (0 == lane_id) {
+    cuda::atomic_ref<std::size_t, cuda::thread_scope_device> ref{*current_idx};
+    output_offset = ref.fetch_add(current_idx_shared[warp_id], cuda::memory_order_relaxed);
+  }
+
+  // No warp sync is necessary here because we are assuming that ShuffleIndex
+  // is internally using post-CUDA 9.0 synchronization-safe primitives
+  // (__shfl_sync instead of __shfl). __shfl is technically not guaranteed to
+  // be safe by the compiler because it is not required by the standard to
+  // converge divergent branches before executing.
+  output_offset = cub::ShuffleIndex<detail::warp_size>(output_offset, 0, activemask);
+
+  for (std::size_t shared_out_idx = static_cast<std::size_t>(lane_id);
+       shared_out_idx < current_idx_shared[warp_id];
+       shared_out_idx += num_threads) {
+    std::size_t thread_offset = output_offset + shared_out_idx;
+    if (thread_offset < max_size) {
+      join_output_l[thread_offset] = join_shared_l[warp_id][shared_out_idx];
+      join_output_r[thread_offset] = join_shared_r[warp_id][shared_out_idx];
+    }
+  }
+}
+
+template <int num_warps, cudf::size_type output_cache_size>
+__device__ void flush_output_cache(unsigned int const activemask,
+                                   std::size_t const max_size,
+                                   int const warp_id,
+                                   int const lane_id,
+                                   std::size_t* current_idx,
+                                   std::size_t current_idx_shared[num_warps],
+                                   size_type join_shared_l[num_warps][output_cache_size],
+                                   size_type* join_output_l)
+{
+  int const num_threads     = __popc(activemask);
+  std::size_t output_offset = 0;
+
+  if (0 == lane_id) {
+    cuda::atomic_ref<std::size_t, cuda::thread_scope_device> ref{*current_idx};
+    output_offset = ref.fetch_add(current_idx_shared[warp_id], cuda::memory_order_relaxed);
+  }
+
+  output_offset = cub::ShuffleIndex<detail::warp_size>(output_offset, 0, activemask);
+
+  for (std::size_t shared_out_idx = static_cast<std::size_t>(lane_id);
+       shared_out_idx < current_idx_shared[warp_id];
+       shared_out_idx += num_threads) {
+    std::size_t thread_offset = output_offset + shared_out_idx;
+    if (thread_offset < max_size) {
+      join_output_l[thread_offset] = join_shared_l[warp_id][shared_out_idx];
+    }
+  }
+}
+
 /**
  * @brief Computes the output size of joining the left table to the right table.
  *
@@ -103,14 +207,14 @@ CUDF_KERNEL void compute_conditional_join_output_size(
     }
   }
 
-  using BlockReduce = cub::BlockReduce<cudf::size_type, block_size>;
+  using BlockReduce = cub::BlockReduce<std::size_t, block_size>;
   __shared__ typename BlockReduce::TempStorage temp_storage;
   std::size_t block_counter = BlockReduce(temp_storage).Sum(thread_counter);
 
   // Add block counter to global counter
   if (threadIdx.x == 0) {
     cuda::atomic_ref<std::size_t, cuda::thread_scope_device> ref{*output_size};
-    ref.fetch_add(block_counter, cuda::std::memory_order_relaxed);
+    ref.fetch_add(block_counter, cuda::memory_order_relaxed);
   }
 }
 
@@ -143,13 +247,13 @@ CUDF_KERNEL void conditional_join(table_device_view left_table,
                                   join_kind join_type,
                                   cudf::size_type* join_output_l,
                                   cudf::size_type* join_output_r,
-                                  cudf::size_type* current_idx,
+                                  std::size_t* current_idx,
                                   cudf::ast::detail::expression_device_view device_expression_data,
-                                  cudf::size_type const max_size,
+                                  std::size_t const max_size,
                                   bool const swap_tables)
 {
   constexpr int num_warps = block_size / detail::warp_size;
-  __shared__ cudf::size_type current_idx_shared[num_warps];
+  __shared__ std::size_t current_idx_shared[num_warps];
   __shared__ cudf::size_type join_shared_l[num_warps][output_cache_size];
   __shared__ cudf::size_type join_shared_r[num_warps][output_cache_size];
 
@@ -183,7 +287,7 @@ CUDF_KERNEL void conditional_join(table_device_view left_table,
 
   if (outer_row_index < outer_num_rows) {
     bool found_match = false;
-    for (thread_index_type inner_row_index(0); inner_row_index < inner_num_rows;
+    for (cudf::thread_index_type inner_row_index(0); inner_row_index < inner_num_rows;
          ++inner_row_index) {
       auto output_dest           = cudf::ast::detail::value_expression_result<bool, has_nulls>();
       auto const left_row_index  = swap_tables ? inner_row_index : outer_row_index;
@@ -277,12 +381,12 @@ CUDF_KERNEL void conditional_join_anti_semi(
   table_device_view right_table,
   join_kind join_type,
   cudf::size_type* join_output_l,
-  cudf::size_type* current_idx,
+  std::size_t* current_idx,
   cudf::ast::detail::expression_device_view device_expression_data,
-  cudf::size_type const max_size)
+  std::size_t const max_size)
 {
   constexpr int num_warps = block_size / detail::warp_size;
-  __shared__ cudf::size_type current_idx_shared[num_warps];
+  __shared__ std::size_t current_idx_shared[num_warps];
   __shared__ cudf::size_type join_shared_l[num_warps][output_cache_size];
 
   extern __shared__ char raw_intermediate_storage[];
@@ -310,7 +414,7 @@ CUDF_KERNEL void conditional_join_anti_semi(
   for (cudf::thread_index_type outer_row_index = start_idx; outer_row_index < outer_num_rows;
        outer_row_index += stride) {
     bool found_match = false;
-    for (thread_index_type inner_row_index(0); inner_row_index < inner_num_rows;
+    for (cudf::thread_index_type inner_row_index(0); inner_row_index < inner_num_rows;
          ++inner_row_index) {
       auto output_dest = cudf::ast::detail::value_expression_result<bool, has_nulls>();
 
diff --git a/cpp/src/join/join_common_utils.cuh b/cpp/src/join/join_common_utils.cuh
index 31f267d5cfb..3d0f3e4340d 100644
--- a/cpp/src/join/join_common_utils.cuh
+++ b/cpp/src/join/join_common_utils.cuh
@@ -262,101 +262,6 @@ struct valid_range {
   }
 };
 
-/**
- * @brief Adds a pair of indices to the shared memory cache
- *
- * @param[in] first The first index in the pair
- * @param[in] second The second index in the pair
- * @param[in,out] current_idx_shared Pointer to shared index that determines
- * where in the shared memory cache the pair will be written
- * @param[in] warp_id The ID of the warp of the calling the thread
- * @param[out] joined_shared_l Pointer to the shared memory cache for left indices
- * @param[out] joined_shared_r Pointer to the shared memory cache for right indices
- */
-__inline__ __device__ void add_pair_to_cache(size_type const first,
-                                             size_type const second,
-                                             size_type* current_idx_shared,
-                                             int const warp_id,
-                                             size_type* joined_shared_l,
-                                             size_type* joined_shared_r)
-{
-  size_type my_current_idx{atomicAdd(current_idx_shared + warp_id, size_type(1))};
-  // its guaranteed to fit into the shared cache
-  joined_shared_l[my_current_idx] = first;
-  joined_shared_r[my_current_idx] = second;
-}
-
-__inline__ __device__ void add_left_to_cache(size_type const first,
-                                             size_type* current_idx_shared,
-                                             int const warp_id,
-                                             size_type* joined_shared_l)
-{
-  size_type my_current_idx{atomicAdd(current_idx_shared + warp_id, size_type(1))};
-
-  joined_shared_l[my_current_idx] = first;
-}
-
-template <int num_warps, cudf::size_type output_cache_size>
-__device__ void flush_output_cache(unsigned int const activemask,
-                                   cudf::size_type const max_size,
-                                   int const warp_id,
-                                   int const lane_id,
-                                   cudf::size_type* current_idx,
-                                   cudf::size_type current_idx_shared[num_warps],
-                                   size_type join_shared_l[num_warps][output_cache_size],
-                                   size_type join_shared_r[num_warps][output_cache_size],
-                                   size_type* join_output_l,
-                                   size_type* join_output_r)
-{
-  // count how many active threads participating here which could be less than warp_size
-  int const num_threads         = __popc(activemask);
-  cudf::size_type output_offset = 0;
-
-  if (0 == lane_id) { output_offset = atomicAdd(current_idx, current_idx_shared[warp_id]); }
-
-  // No warp sync is necessary here because we are assuming that ShuffleIndex
-  // is internally using post-CUDA 9.0 synchronization-safe primitives
-  // (__shfl_sync instead of __shfl). __shfl is technically not guaranteed to
-  // be safe by the compiler because it is not required by the standard to
-  // converge divergent branches before executing.
-  output_offset = cub::ShuffleIndex<detail::warp_size>(output_offset, 0, activemask);
-
-  for (int shared_out_idx = lane_id; shared_out_idx < current_idx_shared[warp_id];
-       shared_out_idx += num_threads) {
-    cudf::size_type thread_offset = output_offset + shared_out_idx;
-    if (thread_offset < max_size) {
-      join_output_l[thread_offset] = join_shared_l[warp_id][shared_out_idx];
-      join_output_r[thread_offset] = join_shared_r[warp_id][shared_out_idx];
-    }
-  }
-}
-
-template <int num_warps, cudf::size_type output_cache_size>
-__device__ void flush_output_cache(unsigned int const activemask,
-                                   cudf::size_type const max_size,
-                                   int const warp_id,
-                                   int const lane_id,
-                                   cudf::size_type* current_idx,
-                                   cudf::size_type current_idx_shared[num_warps],
-                                   size_type join_shared_l[num_warps][output_cache_size],
-                                   size_type* join_output_l)
-{
-  int const num_threads         = __popc(activemask);
-  cudf::size_type output_offset = 0;
-
-  if (0 == lane_id) { output_offset = atomicAdd(current_idx, current_idx_shared[warp_id]); }
-
-  output_offset = cub::ShuffleIndex<detail::warp_size>(output_offset, 0, activemask);
-
-  for (int shared_out_idx = lane_id; shared_out_idx < current_idx_shared[warp_id];
-       shared_out_idx += num_threads) {
-    cudf::size_type thread_offset = output_offset + shared_out_idx;
-    if (thread_offset < max_size) {
-      join_output_l[thread_offset] = join_shared_l[warp_id][shared_out_idx];
-    }
-  }
-}
-
 }  // namespace detail
 
 }  // namespace cudf

From 781794bb52448f617351ed96441a8e2fdb765dd7 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Mon, 1 Jul 2024 14:59:04 -0700
Subject: [PATCH 445/842] Backport #16045 to 24.06 (#16102)

Backporting #16045 for a patch release.

---------

Co-authored-by: Paul Mattione <156858817+pmattione-nvidia@users.noreply.github.com>
---
 cpp/tests/ast/transform_tests.cpp | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/cpp/tests/ast/transform_tests.cpp b/cpp/tests/ast/transform_tests.cpp
index ef1d09e5652..6b350c137d0 100644
--- a/cpp/tests/ast/transform_tests.cpp
+++ b/cpp/tests/ast/transform_tests.cpp
@@ -65,6 +65,22 @@ TEST_F(TransformTest, ColumnReference)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view(), verbosity);
 }
 
+TEST_F(TransformTest, BasicAdditionDoubleCast)
+{
+  auto c_0 = column_wrapper<double>{3, 20, 1, 50};
+  std::vector<__int128_t> data1{10, 7, 20, 0};
+  auto c_1 = cudf::test::fixed_point_column_wrapper<__int128_t>(
+    data1.begin(), data1.end(), numeric::scale_type{0});
+  auto table      = cudf::table_view{{c_0, c_1}};
+  auto col_ref_0  = cudf::ast::column_reference(0);
+  auto col_ref_1  = cudf::ast::column_reference(1);
+  auto cast       = cudf::ast::operation(cudf::ast::ast_operator::CAST_TO_FLOAT64, col_ref_1);
+  auto expression = cudf::ast::operation(cudf::ast::ast_operator::ADD, col_ref_0, cast);
+  auto expected   = column_wrapper<double>{13, 27, 21, 50};
+  auto result     = cudf::compute_column(table, expression);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result->view(), verbosity);
+}
+
 TEST_F(TransformTest, Literal)
 {
   auto c_0   = column_wrapper<int32_t>{3, 20, 1, 50};

From 08552f816ddf21288448997e4998c3e1e0e58f5f Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Tue, 2 Jul 2024 03:12:50 +0100
Subject: [PATCH 446/842] Update cudf-polars for v1 release of polars (#16149)

Minor changes to the IR, which we adapt to, and request `polars>=1.0` in dependencies.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)
  - Thomas Li (https://github.com/lithomas1)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16149
---
 ci/test_cudf_polars.sh                          |  4 +---
 dependencies.yaml                               |  2 +-
 python/cudf_polars/cudf_polars/dsl/expr.py      |  6 +++---
 python/cudf_polars/cudf_polars/dsl/ir.py        | 11 +++++++++--
 python/cudf_polars/cudf_polars/dsl/translate.py |  6 ++++--
 python/cudf_polars/pyproject.toml               |  2 +-
 6 files changed, 19 insertions(+), 12 deletions(-)

diff --git a/ci/test_cudf_polars.sh b/ci/test_cudf_polars.sh
index 669e049ab26..95fb4b431bf 100755
--- a/ci/test_cudf_polars.sh
+++ b/ci/test_cudf_polars.sh
@@ -28,10 +28,8 @@ rapids-logger "Install cudf wheel"
 # echo to expand wildcard before adding `[extra]` requires for pip
 python -m pip install $(echo ./dist/cudf*.whl)[test]
 
-rapids-logger "Install polars (allow pre-release versions)"
-python -m pip install 'polars>=1.0.0a0'
-
 rapids-logger "Install cudf_polars"
+python -m pip install 'polars>=1.0'
 python -m pip install --no-deps python/cudf_polars
 
 rapids-logger "Run cudf_polars tests"
diff --git a/dependencies.yaml b/dependencies.yaml
index 9efbc47896c..e3f8a72e76c 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -603,7 +603,7 @@ dependencies:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
-          - polars>=0.20.30
+          - polars>=1.0
   run_dask_cudf:
     common:
       - output_types: [conda, requirements, pyproject]
diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py
index 16cfd9b9749..fe859c8d958 100644
--- a/python/cudf_polars/cudf_polars/dsl/expr.py
+++ b/python/cudf_polars/cudf_polars/dsl/expr.py
@@ -978,15 +978,15 @@ def collect_agg(self, *, depth: int) -> AggInfo:
 class Agg(Expr):
     __slots__ = ("name", "options", "op", "request", "children")
     _non_child = ("dtype", "name", "options")
-    children: tuple[Expr]
+    children: tuple[Expr, ...]
 
     def __init__(
-        self, dtype: plc.DataType, name: str, options: Any, value: Expr
+        self, dtype: plc.DataType, name: str, options: Any, *children: Expr
     ) -> None:
         super().__init__(dtype)
         self.name = name
         self.options = options
-        self.children = (value,)
+        self.children = children
         if name not in Agg._SUPPORTED:
             raise NotImplementedError(
                 f"Unsupported aggregation {name=}"
diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py
index abe26b14a90..9b3096becd4 100644
--- a/python/cudf_polars/cudf_polars/dsl/ir.py
+++ b/python/cudf_polars/cudf_polars/dsl/ir.py
@@ -15,6 +15,7 @@
 
 import dataclasses
 import itertools
+import json
 import types
 from functools import cache
 from typing import TYPE_CHECKING, Any, Callable, ClassVar
@@ -180,8 +181,10 @@ def __post_init__(self):
 class Scan(IR):
     """Input from files."""
 
-    typ: Any
+    typ: str
     """What type of file are we reading? Parquet, CSV, etc..."""
+    options: tuple[Any, ...]
+    """Type specific options, as json-encoded strings."""
     paths: list[str]
     """List of paths to read from."""
     file_options: Any
@@ -211,17 +214,21 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         with_columns = options.with_columns
         row_index = options.row_index
         if self.typ == "csv":
+            opts, cloud_opts = map(json.loads, self.options)
             df = DataFrame.from_cudf(
                 cudf.concat(
                     [cudf.read_csv(p, usecols=with_columns) for p in self.paths]
                 )
             )
         elif self.typ == "parquet":
+            opts, cloud_opts = map(json.loads, self.options)
             cdf = cudf.read_parquet(self.paths, columns=with_columns)
             assert isinstance(cdf, cudf.DataFrame)
             df = DataFrame.from_cudf(cdf)
         else:
-            assert_never(self.typ)
+            raise NotImplementedError(
+                f"Unhandled scan type: {self.typ}"
+            )  # pragma: no cover; post init trips first
         if row_index is not None:
             name, offset = row_index
             dtype = self.schema[name]
diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py
index f4bf07ae1e0..a2fdb3c3d79 100644
--- a/python/cudf_polars/cudf_polars/dsl/translate.py
+++ b/python/cudf_polars/cudf_polars/dsl/translate.py
@@ -87,9 +87,11 @@ def _(
 def _(
     node: pl_ir.Scan, visitor: NodeTraverser, schema: dict[str, plc.DataType]
 ) -> ir.IR:
+    typ, *options = node.scan_type
     return ir.Scan(
         schema,
-        node.scan_type,
+        typ,
+        tuple(options),
         node.paths,
         node.file_options,
         translate_named_expr(visitor, n=node.predicate)
@@ -445,7 +447,7 @@ def _(node: pl_expr.Agg, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Ex
         dtype,
         node.name,
         node.options,
-        translate_expr(visitor, n=node.arguments),
+        *(translate_expr(visitor, n=n) for n in node.arguments),
     )
 
 
diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml
index effa4861e0c..bf4673fcc50 100644
--- a/python/cudf_polars/pyproject.toml
+++ b/python/cudf_polars/pyproject.toml
@@ -20,7 +20,7 @@ license = { text = "Apache 2.0" }
 requires-python = ">=3.9"
 dependencies = [
     "cudf==24.8.*,>=0.0.0a0",
-    "polars>=0.20.30",
+    "polars>=1.0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
     "Intended Audience :: Developers",

From a4be7bd1365ec7ede5191a4b5d74e7c514a2b5fe Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Tue, 2 Jul 2024 00:50:42 -0700
Subject: [PATCH 447/842] Use Arrow C Data Interface functions for Python
 interop (#15904)

This PR replaces the internals of `from_arrow` in pylibcudf with an implementation that uses the [Arrow C Data Interface](https://arrow.apache.org/docs/format/CDataInterface.html) using the [Python Capsule interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html). This allows us to decouple our Python builds from using pyarrow Cython (partially, we haven't replaced the `to_arrow` conversion yet) and it will also allow us to support any other Python package that is a producer of the data interface.

To support the above functionality, the following additional changes were needed in this PR:
- Added the ability to produce cudf tables from `ArrowArrayStream` objects since that is what `pyarrow.Table` produces. This function is a simple wrapper around the existing `from_arrrow(ArrowArray)` API.
- Added support for the large strings type, for which support has improved throughout cudf since the `from_arrow_host` API was added and for which we now require a basic overload for tests to pass. I did not add corresponding support for `from_arrow_device` to avoid ballooning the scope of this PR, so that work can be done in a follow-up.
- Proper handling of `type_id::EMPTY` in concatenate because the most natural implementation of the ArrowArrayStream processing is to run `from_arrow` on each chunk and then concatenate the outputs, and from the Python side we can produce chunks of all null arrays from arrow.

Contributes to #14926

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Robert Maynard (https://github.com/robertmaynard)
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/15904
---
 cpp/CMakeLists.txt                            |   1 +
 cpp/include/cudf/interop.hpp                  |  38 ++++-
 cpp/src/copying/concatenate.cu                |  28 +++-
 cpp/src/interop/arrow_utilities.cpp           |   3 +-
 cpp/src/interop/from_arrow_device.cu          |   3 +
 cpp/src/interop/from_arrow_host.cu            |  32 +++-
 cpp/src/interop/from_arrow_stream.cu          | 143 ++++++++++++++++++
 cpp/tests/CMakeLists.txt                      |   1 +
 cpp/tests/copying/concatenate_tests.cpp       |  60 ++++++++
 cpp/tests/interop/from_arrow_stream_test.cpp  | 121 +++++++++++++++
 cpp/tests/interop/nanoarrow_utils.hpp         |   3 +
 python/cudf/cudf/_lib/pylibcudf/interop.pyx   |  36 ++++-
 .../cudf/_lib/pylibcudf/libcudf/interop.pxd   |  20 +++
 python/cudf/cudf/tests/test_series.py         |   2 -
 14 files changed, 466 insertions(+), 25 deletions(-)
 create mode 100644 cpp/src/interop/from_arrow_stream.cu
 create mode 100644 cpp/tests/interop/from_arrow_stream_test.cpp

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 35cf90411f2..54070ab6f5a 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -365,6 +365,7 @@ add_library(
   src/interop/to_arrow_device.cu
   src/interop/from_arrow_device.cu
   src/interop/from_arrow_host.cu
+  src/interop/from_arrow_stream.cu
   src/interop/to_arrow_schema.cpp
   src/interop/detail/arrow_allocator.cpp
   src/io/avro/avro.cpp
diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp
index 56ec62fa6e1..502ffb9ba4f 100644
--- a/cpp/include/cudf/interop.hpp
+++ b/cpp/include/cudf/interop.hpp
@@ -50,6 +50,8 @@ struct ArrowSchema;
 
 struct ArrowArray;
 
+struct ArrowArrayStream;
+
 namespace cudf {
 /**
  * @addtogroup interop_dlpack
@@ -367,10 +369,11 @@ std::unique_ptr<cudf::scalar> from_arrow(
  * @param mr Device memory resource used to allocate `cudf::table`
  * @return cudf table generated from given arrow data
  */
-std::unique_ptr<cudf::table> from_arrow(ArrowSchema const* schema,
-                                        ArrowArray const* input,
-                                        rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr);
+std::unique_ptr<cudf::table> from_arrow(
+  ArrowSchema const* schema,
+  ArrowArray const* input,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Create `cudf::column` from a given ArrowArray and ArrowSchema input
@@ -385,10 +388,11 @@ std::unique_ptr<cudf::table> from_arrow(ArrowSchema const* schema,
  * @param mr Device memory resource used to allocate `cudf::column`
  * @return cudf column generated from given arrow data
  */
-std::unique_ptr<cudf::column> from_arrow_column(ArrowSchema const* schema,
-                                                ArrowArray const* input,
-                                                rmm::cuda_stream_view stream,
-                                                rmm::mr::device_memory_resource* mr);
+std::unique_ptr<cudf::column> from_arrow_column(
+  ArrowSchema const* schema,
+  ArrowArray const* input,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Create `cudf::table` from given ArrowDeviceArray input
@@ -414,6 +418,24 @@ std::unique_ptr<table> from_arrow_host(
   rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @brief Create `cudf::table` from given ArrowArrayStream input
+ *
+ * @throws std::invalid_argument if input is NULL
+ *
+ * The conversion WILL release the input ArrayArrayStream and its constituent
+ * arrays or schema since Arrow streams are not suitable for multiple reads.
+ *
+ * @param input `ArrowArrayStream` pointer to object that will produce ArrowArray data
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to perform cuda allocation
+ * @return cudf table generated from the given Arrow data
+ */
+std::unique_ptr<table> from_arrow_stream(
+  ArrowArrayStream* input,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 /**
  * @brief Create `cudf::column` from given ArrowDeviceArray input
  *
diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu
index 47e74a5cb48..6acbafd24fb 100644
--- a/cpp/src/copying/concatenate.cu
+++ b/cpp/src/copying/concatenate.cu
@@ -463,10 +463,6 @@ void traverse_children::operator()<cudf::list_view>(host_span<column_view const>
  */
 void bounds_and_type_check(host_span<column_view const> cols, rmm::cuda_stream_view stream)
 {
-  CUDF_EXPECTS(cudf::all_have_same_types(cols.begin(), cols.end()),
-               "Type mismatch in columns to concatenate.",
-               cudf::data_type_error);
-
   // total size of all concatenated rows
   size_t const total_row_count =
     std::accumulate(cols.begin(), cols.end(), std::size_t{}, [](size_t a, auto const& b) {
@@ -476,6 +472,21 @@ void bounds_and_type_check(host_span<column_view const> cols, rmm::cuda_stream_v
                "Total number of concatenated rows exceeds the column size limit",
                std::overflow_error);
 
+  if (std::any_of(cols.begin(), cols.end(), [](column_view const& c) {
+        return c.type().id() == cudf::type_id::EMPTY;
+      })) {
+    CUDF_EXPECTS(
+      std::all_of(cols.begin(),
+                  cols.end(),
+                  [](column_view const& c) { return c.type().id() == cudf::type_id::EMPTY; }),
+      "Mismatch in columns to concatenate.",
+      cudf::data_type_error);
+    return;
+  }
+  CUDF_EXPECTS(cudf::all_have_same_types(cols.begin(), cols.end()),
+               "Type mismatch in columns to concatenate.",
+               cudf::data_type_error);
+
   // traverse children
   cudf::type_dispatcher(cols.front().type(), traverse_children{}, cols, stream);
 }
@@ -498,6 +509,15 @@ std::unique_ptr<column> concatenate(host_span<column_view const> columns_to_conc
     return empty_like(columns_to_concat.front());
   }
 
+  // For empty columns, we can just create an EMPTY column of the appropriate length.
+  if (columns_to_concat.front().type().id() == cudf::type_id::EMPTY) {
+    auto length = std::accumulate(
+      columns_to_concat.begin(), columns_to_concat.end(), 0, [](auto a, auto const& b) {
+        return a + b.size();
+      });
+    return std::make_unique<column>(
+      data_type(type_id::EMPTY), length, rmm::device_buffer{}, rmm::device_buffer{}, length);
+  }
   return type_dispatcher<dispatch_storage_type>(
     columns_to_concat.front().type(), concatenate_dispatch{columns_to_concat, stream, mr});
 }
diff --git a/cpp/src/interop/arrow_utilities.cpp b/cpp/src/interop/arrow_utilities.cpp
index dd9e9600a87..605d813ed1e 100644
--- a/cpp/src/interop/arrow_utilities.cpp
+++ b/cpp/src/interop/arrow_utilities.cpp
@@ -39,7 +39,8 @@ data_type arrow_to_cudf_type(ArrowSchemaView const* arrow_view)
     case NANOARROW_TYPE_FLOAT: return data_type(type_id::FLOAT32);
     case NANOARROW_TYPE_DOUBLE: return data_type(type_id::FLOAT64);
     case NANOARROW_TYPE_DATE32: return data_type(type_id::TIMESTAMP_DAYS);
-    case NANOARROW_TYPE_STRING: return data_type(type_id::STRING);
+    case NANOARROW_TYPE_STRING:
+    case NANOARROW_TYPE_LARGE_STRING: return data_type(type_id::STRING);
     case NANOARROW_TYPE_LIST: return data_type(type_id::LIST);
     case NANOARROW_TYPE_DICTIONARY: return data_type(type_id::DICTIONARY32);
     case NANOARROW_TYPE_STRUCT: return data_type(type_id::STRUCT);
diff --git a/cpp/src/interop/from_arrow_device.cu b/cpp/src/interop/from_arrow_device.cu
index 002a8ec1f14..73c1a474310 100644
--- a/cpp/src/interop/from_arrow_device.cu
+++ b/cpp/src/interop/from_arrow_device.cu
@@ -143,6 +143,9 @@ dispatch_tuple_t dispatch_from_arrow_device::operator()<cudf::string_view>(
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr)
 {
+  CUDF_EXPECTS(schema->type != NANOARROW_TYPE_LARGE_STRING,
+               "Large strings are not yet supported in from_arrow_device",
+               cudf::data_type_error);
   if (input->length == 0) {
     return std::make_tuple<column_view, owned_columns_t>(
       {type,
diff --git a/cpp/src/interop/from_arrow_host.cu b/cpp/src/interop/from_arrow_host.cu
index 854a1d68fdc..b7e07056686 100644
--- a/cpp/src/interop/from_arrow_host.cu
+++ b/cpp/src/interop/from_arrow_host.cu
@@ -188,8 +188,16 @@ std::unique_ptr<column> dispatch_copy_from_arrow_host::operator()<cudf::string_v
 
   // chars_column does not contain any nulls, they are tracked by the parent string column
   // itself instead. So we pass nullptr for the validity bitmask.
-  size_type const char_data_length =
-    reinterpret_cast<int32_t const*>(offset_buffers[1])[input->length + input->offset];
+  int64_t const char_data_length = [&]() {
+    if (schema->type == NANOARROW_TYPE_LARGE_STRING) {
+      return reinterpret_cast<int64_t const*>(offset_buffers[1])[input->length + input->offset];
+    } else if (schema->type == NANOARROW_TYPE_STRING) {
+      return static_cast<int64_t>(
+        reinterpret_cast<int32_t const*>(offset_buffers[1])[input->length + input->offset]);
+    } else {
+      CUDF_FAIL("Unsupported string type", cudf::data_type_error);
+    }
+  }();
   void const* char_buffers[2] = {nullptr, input->buffers[2]};
   ArrowArray char_array       = {
           .length     = char_data_length,
@@ -210,15 +218,27 @@ std::unique_ptr<column> dispatch_copy_from_arrow_host::operator()<cudf::string_v
   // offset and char data columns for us.
   ArrowSchemaView view;
   NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, offset_schema.get(), nullptr));
-  auto offsets_column =
-    this->operator()<int32_t>(&view, &offsets_array, data_type(type_id::INT32), true);
+  auto offsets_column = [&]() {
+    if (schema->type == NANOARROW_TYPE_LARGE_STRING) {
+      return this->operator()<int64_t>(&view, &offsets_array, data_type(type_id::INT64), true);
+    } else if (schema->type == NANOARROW_TYPE_STRING) {
+      return this->operator()<int32_t>(&view, &offsets_array, data_type(type_id::INT32), true);
+    } else {
+      CUDF_FAIL("Unsupported string type", cudf::data_type_error);
+    }
+  }();
   NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&view, char_data_schema.get(), nullptr));
-  auto chars_column = this->operator()<int8_t>(&view, &char_array, data_type(type_id::INT8), true);
 
+  rmm::device_buffer chars(char_data_length, stream, mr);
+  CUDF_CUDA_TRY(cudaMemcpyAsync(chars.data(),
+                                reinterpret_cast<uint8_t const*>(char_array.buffers[1]),
+                                chars.size(),
+                                cudaMemcpyDefault,
+                                stream.value()));
   auto const num_rows = offsets_column->size() - 1;
   auto out_col        = make_strings_column(num_rows,
                                      std::move(offsets_column),
-                                     std::move(chars_column->release().data.release()[0]),
+                                     std::move(chars),
                                      input->null_count,
                                      std::move(*get_mask_buffer(input)));
 
diff --git a/cpp/src/interop/from_arrow_stream.cu b/cpp/src/interop/from_arrow_stream.cu
new file mode 100644
index 00000000000..0c85b561944
--- /dev/null
+++ b/cpp/src/interop/from_arrow_stream.cu
@@ -0,0 +1,143 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arrow_utilities.hpp"
+
+#include <cudf/column/column_factories.hpp>
+#include <cudf/detail/concatenate.hpp>
+#include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/interop.hpp>
+#include <cudf/table/table.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/mr/device/device_memory_resource.hpp>
+#include <rmm/mr/device/per_device_resource.hpp>
+
+#include <nanoarrow/nanoarrow.h>
+#include <nanoarrow/nanoarrow.hpp>
+
+#include <memory>
+#include <stdexcept>
+#include <utility>
+#include <vector>
+
+namespace cudf {
+namespace detail {
+
+namespace {
+
+std::unique_ptr<column> make_empty_column_from_schema(ArrowSchema const* schema,
+                                                      rmm::cuda_stream_view stream,
+                                                      rmm::mr::device_memory_resource* mr)
+{
+  ArrowSchemaView schema_view;
+  NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&schema_view, schema, nullptr));
+
+  auto const type{arrow_to_cudf_type(&schema_view)};
+  switch (type.id()) {
+    case type_id::EMPTY: {
+      return std::make_unique<column>(
+        data_type(type_id::EMPTY), 0, rmm::device_buffer{}, rmm::device_buffer{}, 0);
+    }
+    case type_id::LIST: {
+      return cudf::make_lists_column(0,
+                                     cudf::make_empty_column(data_type{type_id::INT32}),
+                                     make_empty_column_from_schema(schema->children[0], stream, mr),
+                                     0,
+                                     {},
+                                     stream,
+                                     mr);
+    }
+    case type_id::STRUCT: {
+      std::vector<std::unique_ptr<column>> child_columns;
+      child_columns.reserve(schema->n_children);
+      std::transform(
+        schema->children,
+        schema->children + schema->n_children,
+        std::back_inserter(child_columns),
+        [&](auto const& child) { return make_empty_column_from_schema(child, stream, mr); });
+      return cudf::make_structs_column(0, std::move(child_columns), 0, {}, stream, mr);
+    }
+    default: {
+      return cudf::make_empty_column(type);
+    }
+  }
+}
+
+}  // namespace
+
+std::unique_ptr<table> from_arrow_stream(ArrowArrayStream* input,
+                                         rmm::cuda_stream_view stream,
+                                         rmm::mr::device_memory_resource* mr)
+{
+  CUDF_EXPECTS(input != nullptr, "input ArrowArrayStream must not be NULL", std::invalid_argument);
+
+  // Potential future optimization: Since the from_arrow API accepts an
+  // ArrowSchema we're allocating one here instead of using a view, which we
+  // could avoid with a different underlying implementation.
+  ArrowSchema schema;
+  NANOARROW_THROW_NOT_OK(ArrowArrayStreamGetSchema(input, &schema, nullptr));
+
+  std::vector<std::unique_ptr<cudf::table>> chunks;
+  ArrowArray chunk;
+  while (true) {
+    NANOARROW_THROW_NOT_OK(ArrowArrayStreamGetNext(input, &chunk, nullptr));
+    if (chunk.release == nullptr) { break; }
+    chunks.push_back(from_arrow(&schema, &chunk, stream, mr));
+    chunk.release(&chunk);
+  }
+  input->release(input);
+
+  if (chunks.empty()) {
+    if (schema.n_children == 0) {
+      schema.release(&schema);
+      return std::make_unique<cudf::table>();
+    }
+
+    // If there are no chunks but the schema has children, we need to construct a suitable empty
+    // table.
+    std::vector<std::unique_ptr<cudf::column>> columns;
+    columns.reserve(chunks.size());
+    std::transform(
+      schema.children,
+      schema.children + schema.n_children,
+      std::back_inserter(columns),
+      [&](auto const& child) { return make_empty_column_from_schema(child, stream, mr); });
+    schema.release(&schema);
+    return std::make_unique<cudf::table>(std::move(columns));
+  }
+
+  schema.release(&schema);
+
+  auto chunk_views = std::vector<table_view>{};
+  chunk_views.reserve(chunks.size());
+  std::transform(
+    chunks.begin(), chunks.end(), std::back_inserter(chunk_views), [](auto const& chunk) {
+      return chunk->view();
+    });
+  return cudf::detail::concatenate(chunk_views, stream, mr);
+}
+
+}  // namespace detail
+
+std::unique_ptr<table> from_arrow_stream(ArrowArrayStream* input,
+                                         rmm::cuda_stream_view stream,
+                                         rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::from_arrow_stream(input, stream, mr);
+}
+}  // namespace cudf
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 244bcb7d897..0eab9ba61d8 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -273,6 +273,7 @@ ConfigureTest(
   interop/from_arrow_test.cpp
   interop/from_arrow_device_test.cpp
   interop/from_arrow_host_test.cpp
+  interop/from_arrow_stream_test.cpp
   interop/dlpack_test.cpp
   EXTRA_LIB
   nanoarrow
diff --git a/cpp/tests/copying/concatenate_tests.cpp b/cpp/tests/copying/concatenate_tests.cpp
index 054441788d0..18140c34abd 100644
--- a/cpp/tests/copying/concatenate_tests.cpp
+++ b/cpp/tests/copying/concatenate_tests.cpp
@@ -1667,3 +1667,63 @@ TEST_F(DictionaryConcatTest, ErrorsTest)
   std::vector<cudf::column_view> empty;
   EXPECT_THROW(cudf::concatenate(empty), cudf::logic_error);
 }
+
+struct EmptyColumnTest : public cudf::test::BaseFixture {};
+
+TEST_F(EmptyColumnTest, SimpleTest)
+{
+  std::vector<cudf::column> columns;
+  constexpr auto num_copies = 10;
+  constexpr auto num_rows   = 10;
+  for (auto i = 0; i < num_copies; ++i) {
+    columns.emplace_back(cudf::data_type(cudf::type_id::EMPTY),
+                         num_rows,
+                         rmm::device_buffer{},
+                         rmm::device_buffer{},
+                         0);
+  }
+
+  // Create views from columns
+  std::vector<cudf::column_view> views;
+  for (auto& col : columns) {
+    views.push_back(col.view());
+  }
+  auto result = cudf::concatenate(views);
+
+  ASSERT_EQ(result->size(), num_copies * num_rows);
+  ASSERT_EQ(result->type().id(), cudf::type_id::EMPTY);
+}
+
+struct TableOfEmptyColumnsTest : public cudf::test::BaseFixture {};
+
+TEST_F(TableOfEmptyColumnsTest, SimpleTest)
+{
+  std::vector<cudf::table> tables;
+  constexpr auto num_copies  = 10;
+  constexpr auto num_rows    = 10;
+  constexpr auto num_columns = 10;
+  for (auto i = 0; i < num_copies; ++i) {
+    std::vector<std::unique_ptr<cudf::column>> columns;
+    for (auto j = 0; j < num_columns; ++j) {
+      columns.push_back(std::make_unique<cudf::column>(cudf::data_type(cudf::type_id::EMPTY),
+                                                       num_rows,
+                                                       rmm::device_buffer{},
+                                                       rmm::device_buffer{},
+                                                       0));
+    }
+    tables.emplace_back(std::move(columns));
+  }
+
+  // Create views from columns
+  std::vector<cudf::table_view> views;
+  for (auto& tbl : tables) {
+    views.push_back(tbl.view());
+  }
+  auto result = cudf::concatenate(views);
+
+  ASSERT_EQ(result->num_rows(), num_copies * num_rows);
+  ASSERT_EQ(result->num_columns(), num_columns);
+  for (auto i = 0; i < num_columns; ++i) {
+    ASSERT_EQ(result->get_column(i).type().id(), cudf::type_id::EMPTY);
+  }
+}
diff --git a/cpp/tests/interop/from_arrow_stream_test.cpp b/cpp/tests/interop/from_arrow_stream_test.cpp
new file mode 100644
index 00000000000..418ec057303
--- /dev/null
+++ b/cpp/tests/interop/from_arrow_stream_test.cpp
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "nanoarrow_utils.hpp"
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_utilities.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/table_utilities.hpp>
+#include <cudf_test/testing_main.hpp>
+#include <cudf_test/type_lists.hpp>
+
+#include <cudf/column/column.hpp>
+#include <cudf/column/column_view.hpp>
+#include <cudf/concatenate.hpp>
+#include <cudf/copying.hpp>
+#include <cudf/dictionary/dictionary_column_view.hpp>
+#include <cudf/dictionary/dictionary_factories.hpp>
+#include <cudf/dictionary/encode.hpp>
+#include <cudf/interop.hpp>
+#include <cudf/table/table.hpp>
+#include <cudf/table/table_view.hpp>
+#include <cudf/types.hpp>
+#include <cudf/utilities/type_checks.hpp>
+
+#include <thrust/iterator/counting_iterator.h>
+
+struct VectorOfArrays {
+  std::vector<nanoarrow::UniqueArray> arrays;
+  nanoarrow::UniqueSchema schema;
+  size_t index{0};
+
+  static int get_schema(ArrowArrayStream* stream, ArrowSchema* out_schema)
+  {
+    auto private_data = static_cast<VectorOfArrays*>(stream->private_data);
+    ArrowSchemaDeepCopy(private_data->schema.get(), out_schema);
+    return 0;
+  }
+
+  static int get_next(ArrowArrayStream* stream, ArrowArray* out_array)
+  {
+    auto private_data = static_cast<VectorOfArrays*>(stream->private_data);
+    if (private_data->index >= private_data->arrays.size()) {
+      out_array->release = nullptr;
+      return 0;
+    }
+    ArrowArrayMove(private_data->arrays[private_data->index++].get(), out_array);
+    return 0;
+  }
+
+  static const char* get_last_error(ArrowArrayStream* stream) { return nullptr; }
+
+  static void release(ArrowArrayStream* stream)
+  {
+    delete static_cast<VectorOfArrays*>(stream->private_data);
+  }
+};
+
+struct FromArrowStreamTest : public cudf::test::BaseFixture {};
+
+void makeStreamFromArrays(std::vector<nanoarrow::UniqueArray> arrays,
+                          nanoarrow::UniqueSchema schema,
+                          ArrowArrayStream* out)
+{
+  auto* private_data  = new VectorOfArrays{std::move(arrays), std::move(schema)};
+  out->get_schema     = VectorOfArrays::get_schema;
+  out->get_next       = VectorOfArrays::get_next;
+  out->get_last_error = VectorOfArrays::get_last_error;
+  out->release        = VectorOfArrays::release;
+  out->private_data   = private_data;
+}
+
+TEST_F(FromArrowStreamTest, BasicTest)
+{
+  constexpr auto num_copies = 3;
+  std::vector<std::unique_ptr<cudf::table>> tables;
+  // The schema is unique across all tables.
+  nanoarrow::UniqueSchema schema;
+  std::vector<nanoarrow::UniqueArray> arrays;
+  for (auto i = 0; i < num_copies; ++i) {
+    auto [tbl, sch, arr] = get_nanoarrow_host_tables(0);
+    tables.push_back(std::move(tbl));
+    arrays.push_back(std::move(arr));
+    if (i == 0) { sch.move(schema.get()); }
+  }
+  std::vector<cudf::table_view> table_views;
+  for (auto const& table : tables) {
+    table_views.push_back(table->view());
+  }
+  auto expected = cudf::concatenate(table_views);
+
+  ArrowArrayStream stream;
+  makeStreamFromArrays(std::move(arrays), std::move(schema), &stream);
+  auto result = cudf::from_arrow_stream(&stream);
+  CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result->view());
+}
+
+TEST_F(FromArrowStreamTest, EmptyTest)
+{
+  auto [tbl, sch, arr] = get_nanoarrow_host_tables(0);
+  std::vector<cudf::table_view> table_views{tbl->view()};
+  auto expected = cudf::concatenate(table_views);
+
+  ArrowArrayStream stream;
+  makeStreamFromArrays({}, std::move(sch), &stream);
+  auto result = cudf::from_arrow_stream(&stream);
+  cudf::have_same_types(expected->view(), result->view());
+}
diff --git a/cpp/tests/interop/nanoarrow_utils.hpp b/cpp/tests/interop/nanoarrow_utils.hpp
index 94c4372e74a..4147728b2a6 100644
--- a/cpp/tests/interop/nanoarrow_utils.hpp
+++ b/cpp/tests/interop/nanoarrow_utils.hpp
@@ -375,3 +375,6 @@ nanoarrow::UniqueArray get_nanoarrow_list_array(std::initializer_list<T> data,
 
 std::tuple<std::unique_ptr<cudf::table>, nanoarrow::UniqueSchema, generated_test_data>
 get_nanoarrow_cudf_table(cudf::size_type length);
+
+std::tuple<std::unique_ptr<cudf::table>, nanoarrow::UniqueSchema, nanoarrow::UniqueArray>
+get_nanoarrow_host_tables(cudf::size_type length);
diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pyx b/python/cudf/cudf/_lib/pylibcudf/interop.pyx
index 07e9d1ead11..adf7e1fd7e8 100644
--- a/python/cudf/cudf/_lib/pylibcudf/interop.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/interop.pyx
@@ -1,5 +1,6 @@
 # Copyright (c) 2023-2024, NVIDIA CORPORATION.
 
+from cpython cimport pycapsule
 from cython.operator cimport dereference
 from libcpp.memory cimport shared_ptr, unique_ptr
 from libcpp.utility cimport move
@@ -11,9 +12,15 @@ from functools import singledispatch
 
 from pyarrow import lib as pa
 
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
 from cudf._lib.pylibcudf.libcudf.interop cimport (
+    ArrowArray,
+    ArrowArrayStream,
+    ArrowSchema,
     column_metadata,
     from_arrow as cpp_from_arrow,
+    from_arrow_column as cpp_from_arrow_column,
+    from_arrow_stream as cpp_from_arrow_stream,
     to_arrow as cpp_to_arrow,
 )
 from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport (
@@ -124,11 +131,15 @@ def _from_arrow_datatype(pyarrow_object):
 def _from_arrow_table(pyarrow_object, *, DataType data_type=None):
     if data_type is not None:
         raise ValueError("data_type may not be passed for tables")
-    cdef shared_ptr[pa.CTable] arrow_table = pa.pyarrow_unwrap_table(pyarrow_object)
+    stream = pyarrow_object.__arrow_c_stream__()
+    cdef ArrowArrayStream* c_stream = (
+        <ArrowArrayStream*>pycapsule.PyCapsule_GetPointer(stream, "arrow_array_stream")
+    )
 
     cdef unique_ptr[table] c_result
     with nogil:
-        c_result = move(cpp_from_arrow(dereference(arrow_table)))
+        # The libcudf function here will release the stream.
+        c_result = move(cpp_from_arrow_stream(c_stream))
 
     return Table.from_libcudf(move(c_result))
 
@@ -190,8 +201,25 @@ def _from_arrow_scalar(pyarrow_object, *, DataType data_type=None):
 def _from_arrow_column(pyarrow_object, *, DataType data_type=None):
     if data_type is not None:
         raise ValueError("data_type may not be passed for arrays")
-    pa_table = pa.table([pyarrow_object], [""])
-    return from_arrow(pa_table).columns()[0]
+
+    schema, array = pyarrow_object.__arrow_c_array__()
+    cdef ArrowSchema* c_schema = (
+        <ArrowSchema*>pycapsule.PyCapsule_GetPointer(schema, "arrow_schema")
+    )
+    cdef ArrowArray* c_array = (
+        <ArrowArray*>pycapsule.PyCapsule_GetPointer(array, "arrow_array")
+    )
+
+    cdef unique_ptr[column] c_result
+    with nogil:
+        c_result = move(cpp_from_arrow_column(c_schema, c_array))
+
+    # The capsule destructors should release automatically for us, but we
+    # choose to do it explicitly here for clarity.
+    c_schema.release(c_schema)
+    c_array.release(c_array)
+
+    return Column.from_libcudf(move(c_result))
 
 
 @singledispatch
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/interop.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/interop.pxd
index 471b78505fb..2151da28d4b 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/interop.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/interop.pxd
@@ -7,6 +7,7 @@ from pyarrow.lib cimport CScalar, CTable
 
 from cudf._lib.types import cudf_to_np_types, np_to_cudf_types
 
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
 from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
 from cudf._lib.pylibcudf.libcudf.table.table cimport table
 from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
@@ -16,6 +17,19 @@ cdef extern from "dlpack/dlpack.h" nogil:
     ctypedef struct DLManagedTensor:
         void(*deleter)(DLManagedTensor*) except +
 
+
+# The Arrow structs are not namespaced.
+cdef extern from "cudf/interop.hpp" nogil:
+    cdef struct ArrowSchema:
+        void (*release)(ArrowSchema*) noexcept nogil
+
+    cdef struct ArrowArray:
+        void (*release)(ArrowArray*) noexcept nogil
+
+    cdef struct ArrowArrayStream:
+        void (*release)(ArrowArrayStream*) noexcept nogil
+
+
 cdef extern from "cudf/interop.hpp" namespace "cudf" \
         nogil:
     cdef unique_ptr[table] from_dlpack(const DLManagedTensor* tensor
@@ -42,3 +56,9 @@ cdef extern from "cudf/interop.hpp" namespace "cudf" \
         const scalar& input,
         column_metadata metadata,
     ) except +
+
+    cdef unique_ptr[table] from_arrow_stream(ArrowArrayStream* input) except +
+    cdef unique_ptr[column] from_arrow_column(
+        const ArrowSchema* schema,
+        const ArrowArray* input
+    ) except +
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index f2501041f25..8ed78d804bf 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -2757,8 +2757,6 @@ def test_series_from_large_string(pa_type):
 
     assert_eq(expected, got)
 
-    assert pa_string_array.equals(got.to_arrow())
-
 
 @pytest.mark.parametrize(
     "scalar",

From a1447c78b8290277b7dbc680479de0c9f4ce0b19 Mon Sep 17 00:00:00 2001
From: Robert Maynard <rmaynard@nvidia.com>
Date: Tue, 2 Jul 2024 09:34:29 -0400
Subject: [PATCH 448/842] Promote has_nested_columns to cudf public API
 (#16131)

The `has_nested_columns` functionality is used in numerous tests. It looks like it should be part of our stable public API.

Authors:
  - Robert Maynard (https://github.com/robertmaynard)

Approvers:
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - Yunsong Wang (https://github.com/PointKernel)

URL: https://github.com/rapidsai/cudf/pull/16131
---
 .../cudf/table/experimental/row_operators.cuh | 12 +++----
 cpp/include/cudf/table/table_view.hpp         | 19 ++++++++--
 cpp/src/table/table_view.cpp                  |  9 ++---
 .../table/experimental_row_operator_tests.cu  | 36 +++++++++----------
 .../table/row_operator_tests_utilities.cu     |  4 +--
 .../table/row_operator_tests_utilities2.cu    |  2 +-
 6 files changed, 44 insertions(+), 38 deletions(-)

diff --git a/cpp/include/cudf/table/experimental/row_operators.cuh b/cpp/include/cudf/table/experimental/row_operators.cuh
index e9b81a525fc..c181ac7d402 100644
--- a/cpp/include/cudf/table/experimental/row_operators.cuh
+++ b/cpp/include/cudf/table/experimental/row_operators.cuh
@@ -252,7 +252,7 @@ using optional_dremel_view = thrust::optional<detail::dremel_device_view const>;
  *
  * @tparam has_nested_columns compile-time optimization for primitive types.
  *         This template parameter is to be used by the developer by querying
- *         `cudf::detail::has_nested_columns(input)`. `true` compiles operator
+ *         `cudf::has_nested_columns(input)`. `true` compiles operator
  *         overloads for nested types, while `false` only compiles operator
  *         overloads for primitive types.
  * @tparam Nullate A cudf::nullate type describing whether to check for nulls.
@@ -1014,7 +1014,7 @@ class self_comparator {
    *
    * @tparam has_nested_columns compile-time optimization for primitive types.
    *         This template parameter is to be used by the developer by querying
-   *         `cudf::detail::has_nested_columns(input)`. `true` compiles operator
+   *         `cudf::has_nested_columns(input)`. `true` compiles operator
    *         overloads for nested types, while `false` only compiles operator
    *         overloads for primitive types.
    * @tparam Nullate A cudf::nullate type describing whether to check for nulls.
@@ -1186,7 +1186,7 @@ class two_table_comparator {
    *
    * @tparam has_nested_columns compile-time optimization for primitive types.
    *         This template parameter is to be used by the developer by querying
-   *         `cudf::detail::has_nested_columns(input)`. `true` compiles operator
+   *         `cudf::has_nested_columns(input)`. `true` compiles operator
    *         overloads for nested types, while `false` only compiles operator
    *         overloads for primitive types.
    * @tparam Nullate A cudf::nullate type describing whether to check for nulls.
@@ -1326,7 +1326,7 @@ struct nan_equal_physical_equality_comparator {
  *
  * @tparam has_nested_columns compile-time optimization for primitive types.
  *         This template parameter is to be used by the developer by querying
- *         `cudf::detail::has_nested_columns(input)`. `true` compiles operator
+ *         `cudf::has_nested_columns(input)`. `true` compiles operator
  *         overloads for nested types, while `false` only compiles operator
  *         overloads for primitive types.
  * @tparam Nullate A cudf::nullate type describing whether to check for nulls.
@@ -1643,7 +1643,7 @@ class self_comparator {
    *
    * @tparam has_nested_columns compile-time optimization for primitive types.
    *         This template parameter is to be used by the developer by querying
-   *         `cudf::detail::has_nested_columns(input)`. `true` compiles operator
+   *         `cudf::has_nested_columns(input)`. `true` compiles operator
    *         overloads for nested types, while `false` only compiles operator
    *         overloads for primitive types.
    * @tparam Nullate A cudf::nullate type describing whether to check for nulls.
@@ -1757,7 +1757,7 @@ class two_table_comparator {
    *
    * @tparam has_nested_columns compile-time optimization for primitive types.
    *         This template parameter is to be used by the developer by querying
-   *         `cudf::detail::has_nested_columns(input)`. `true` compiles operator
+   *         `cudf::has_nested_columns(input)`. `true` compiles operator
    *         overloads for nested types, while `false` only compiles operator
    *         overloads for primitive types.
    * @tparam Nullate A cudf::nullate type describing whether to check for nulls.
diff --git a/cpp/include/cudf/table/table_view.hpp b/cpp/include/cudf/table/table_view.hpp
index a71e0558dec..4a990f67ce4 100644
--- a/cpp/include/cudf/table/table_view.hpp
+++ b/cpp/include/cudf/table/table_view.hpp
@@ -17,6 +17,7 @@
 
 #include <cudf/column/column_view.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <algorithm>
 #include <vector>
@@ -32,7 +33,7 @@
  * passed by value.
  */
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace detail {
 /**
  * @brief Base class for a table of `ColumnView`s
@@ -123,7 +124,10 @@ class table_view_base {
    * @param column_index The index of the desired column
    * @return A reference to the desired column
    */
-  [[nodiscard]] ColumnView const& column(size_type column_index) const;
+  [[nodiscard]] ColumnView const& column(size_type column_index) const
+  {
+    return _columns.at(column_index);
+  }
 
   /**
    * @brief Returns the number of columns
@@ -174,8 +178,17 @@ class table_view_base {
  * @return Whether nested columns exist in the input table
  */
 bool has_nested_columns(table_view const& table);
+
 }  // namespace detail
 
+/**
+ * @brief Determine if any nested columns exist in a given table.
+ *
+ * @param table The input table
+ * @return Whether nested columns exist in the input table
+ */
+bool has_nested_columns(table_view const& table);
+
 /**
  * @brief A set of cudf::column_view's of the same size.
  *
@@ -374,4 +387,4 @@ extern template bool is_relationally_comparable<mutable_table_view>(mutable_tabl
                                                                     mutable_table_view const& rhs);
 // @endcond
 }  // namespace detail
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/src/table/table_view.cpp b/cpp/src/table/table_view.cpp
index 13832b0d9dc..8a5340dc20d 100644
--- a/cpp/src/table/table_view.cpp
+++ b/cpp/src/table/table_view.cpp
@@ -52,12 +52,6 @@ auto concatenate_column_views(std::vector<ViewType> const& views)
   return concat_cols;
 }
 
-template <typename ColumnView>
-ColumnView const& table_view_base<ColumnView>::column(size_type column_index) const
-{
-  return _columns.at(column_index);
-}
-
 // Explicit instantiation for a table of `column_view`s
 template class table_view_base<column_view>;
 
@@ -172,6 +166,7 @@ bool has_nested_columns(table_view const& table)
   return std::any_of(
     table.begin(), table.end(), [](column_view const& col) { return is_nested(col.type()); });
 }
-
 }  // namespace detail
+
+bool has_nested_columns(table_view const& table) { return detail::has_nested_columns(table); }
 }  // namespace cudf
diff --git a/cpp/tests/table/experimental_row_operator_tests.cu b/cpp/tests/table/experimental_row_operator_tests.cu
index 896cc7a82d4..0d9e4e27f2c 100644
--- a/cpp/tests/table/experimental_row_operator_tests.cu
+++ b/cpp/tests/table/experimental_row_operator_tests.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -109,15 +109,14 @@ TYPED_TEST(TypedTableViewTest, TestSortSameTableFromTwoTables)
   auto const lhs       = cudf::table_view{{col1}};
   auto const empty_rhs = cudf::table_view{{col2}};
 
-  auto const stream    = cudf::get_default_stream();
-  auto const test_sort = [stream](auto const& preprocessed,
-                                  auto const& input,
-                                  auto const& comparator,
-                                  auto const& expected) {
-    auto const order = sorted_order(
-      preprocessed, input.num_rows(), cudf::detail::has_nested_columns(input), comparator, stream);
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, order->view());
-  };
+  auto const stream = cudf::get_default_stream();
+  auto const test_sort =
+    [stream](
+      auto const& preprocessed, auto const& input, auto const& comparator, auto const& expected) {
+      auto const order = sorted_order(
+        preprocessed, input.num_rows(), cudf::has_nested_columns(input), comparator, stream);
+      CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, order->view());
+    };
 
   auto const test_sort_two_tables = [&](auto const& preprocessed_lhs,
                                         auto const& preprocessed_empty_rhs) {
@@ -188,15 +187,14 @@ TYPED_TEST(TypedTableViewTest, TestSortSameTableFromTwoTablesWithListsOfStructs)
   auto const lhs          = cudf::table_view{{*col1}};
   auto const empty_rhs    = cudf::table_view{{*col2}};
 
-  auto const stream    = cudf::get_default_stream();
-  auto const test_sort = [stream](auto const& preprocessed,
-                                  auto const& input,
-                                  auto const& comparator,
-                                  auto const& expected) {
-    auto const order = sorted_order(
-      preprocessed, input.num_rows(), cudf::detail::has_nested_columns(input), comparator, stream);
-    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, order->view());
-  };
+  auto const stream = cudf::get_default_stream();
+  auto const test_sort =
+    [stream](
+      auto const& preprocessed, auto const& input, auto const& comparator, auto const& expected) {
+      auto const order = sorted_order(
+        preprocessed, input.num_rows(), cudf::has_nested_columns(input), comparator, stream);
+      CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, order->view());
+    };
 
   auto const test_sort_two_tables = [&](auto const& preprocessed_lhs,
                                         auto const& preprocessed_empty_rhs) {
diff --git a/cpp/tests/table/row_operator_tests_utilities.cu b/cpp/tests/table/row_operator_tests_utilities.cu
index cfffa1cdd54..6127864987d 100644
--- a/cpp/tests/table/row_operator_tests_utilities.cu
+++ b/cpp/tests/table/row_operator_tests_utilities.cu
@@ -42,7 +42,7 @@ std::unique_ptr<cudf::column> two_table_comparison(cudf::table_view lhs,
   auto output = cudf::make_numeric_column(
     cudf::data_type(cudf::type_id::BOOL8), lhs.num_rows(), cudf::mask_state::UNALLOCATED);
 
-  if (cudf::detail::has_nested_columns(lhs) || cudf::detail::has_nested_columns(rhs)) {
+  if (cudf::has_nested_columns(lhs) || cudf::has_nested_columns(rhs)) {
     thrust::transform(rmm::exec_policy(stream),
                       lhs_it,
                       lhs_it + lhs.num_rows(),
@@ -129,7 +129,7 @@ std::unique_ptr<cudf::column> two_table_equality(cudf::table_view lhs,
   auto output = cudf::make_numeric_column(
     cudf::data_type(cudf::type_id::BOOL8), lhs.num_rows(), cudf::mask_state::UNALLOCATED);
 
-  if (cudf::detail::has_nested_columns(lhs) or cudf::detail::has_nested_columns(rhs)) {
+  if (cudf::has_nested_columns(lhs) or cudf::has_nested_columns(rhs)) {
     auto const equal_comparator =
       table_comparator.equal_to<true>(cudf::nullate::NO{}, cudf::null_equality::EQUAL, comparator);
 
diff --git a/cpp/tests/table/row_operator_tests_utilities2.cu b/cpp/tests/table/row_operator_tests_utilities2.cu
index 057d9ee1004..17d274eba13 100644
--- a/cpp/tests/table/row_operator_tests_utilities2.cu
+++ b/cpp/tests/table/row_operator_tests_utilities2.cu
@@ -41,7 +41,7 @@ std::unique_ptr<cudf::column> self_comparison(cudf::table_view input,
   auto output = cudf::make_numeric_column(
     cudf::data_type(cudf::type_id::BOOL8), input.num_rows(), cudf::mask_state::UNALLOCATED);
 
-  if (cudf::detail::has_nested_columns(input)) {
+  if (cudf::has_nested_columns(input)) {
     thrust::transform(rmm::exec_policy(stream),
                       thrust::make_counting_iterator(0),
                       thrust::make_counting_iterator(input.num_rows()),

From 1a4c2aa38c6e7de8c6937b787a1263a4ccddadea Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Tue, 2 Jul 2024 07:38:18 -0700
Subject: [PATCH 449/842] Start migrating I/O writers to pylibcudf (starting
 with JSON) (#15952)

Switches the JSON writer to use pylibcudf.
xref #15162

Authors:
  - Thomas Li (https://github.com/lithomas1)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15952
---
 .../api_docs/pylibcudf/io/index.rst           |   1 +
 .../user_guide/api_docs/pylibcudf/io/json.rst |   6 +
 python/cudf/cudf/_lib/json.pyx                |  98 +++-----
 .../cudf/_lib/pylibcudf/io/CMakeLists.txt     |   6 +-
 .../cudf/cudf/_lib/pylibcudf/io/__init__.pxd  |   2 +-
 .../cudf/cudf/_lib/pylibcudf/io/__init__.py   |   4 +-
 python/cudf/cudf/_lib/pylibcudf/io/avro.pyx   |   4 +-
 python/cudf/cudf/_lib/pylibcudf/io/json.pxd   |  18 ++
 python/cudf/cudf/_lib/pylibcudf/io/json.pyx   |  68 ++++++
 python/cudf/cudf/_lib/pylibcudf/io/types.pxd  |  11 +
 python/cudf/cudf/_lib/pylibcudf/io/types.pyx  | 125 +++++++++-
 .../cudf/cudf/pylibcudf_tests/common/utils.py | 122 ++++++++--
 python/cudf/cudf/pylibcudf_tests/conftest.py  | 104 ++++++--
 .../pylibcudf_tests/{ => io}/test_avro.py     |   0
 .../cudf/cudf/pylibcudf_tests/io/test_json.py | 116 +++++++++
 .../test_source_sink_info.py}                 |  34 ++-
 .../cudf/cudf/pylibcudf_tests/test_copying.py | 226 +++++++++++++-----
 17 files changed, 768 insertions(+), 177 deletions(-)
 create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/io/json.rst
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/io/json.pxd
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/io/json.pyx
 rename python/cudf/cudf/pylibcudf_tests/{ => io}/test_avro.py (100%)
 create mode 100644 python/cudf/cudf/pylibcudf_tests/io/test_json.py
 rename python/cudf/cudf/pylibcudf_tests/{test_source_info.py => io/test_source_sink_info.py} (72%)

diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst
index 0d53ac92db9..bde6d8094ce 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst
@@ -16,3 +16,4 @@ I/O Functions
     :maxdepth: 1
 
     avro
+    json
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/json.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/json.rst
new file mode 100644
index 00000000000..6aeae1f322a
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/json.rst
@@ -0,0 +1,6 @@
+====
+JSON
+====
+
+.. automodule:: cudf._lib.pylibcudf.io.json
+   :members:
diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx
index a8fef907bad..22e34feb547 100644
--- a/python/cudf/cudf/_lib/json.pyx
+++ b/python/cudf/cudf/_lib/json.pyx
@@ -9,38 +9,27 @@ from cudf.core.buffer import acquire_spill_lock
 
 from libcpp cimport bool
 from libcpp.map cimport map
-from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
 
 cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types
-from cudf._lib.column cimport Column
-from cudf._lib.io.utils cimport (
-    make_sink_info,
-    make_source_info,
-    update_struct_field_names,
-)
-from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink
+from cudf._lib.io.utils cimport make_source_info, update_struct_field_names
 from cudf._lib.pylibcudf.libcudf.io.json cimport (
     json_reader_options,
     json_recovery_mode_t,
-    json_writer_options,
     read_json as libcudf_read_json,
     schema_element,
-    write_json as libcudf_write_json,
 )
 from cudf._lib.pylibcudf.libcudf.io.types cimport (
-    column_name_info,
     compression_type,
-    sink_info,
-    table_metadata,
     table_with_metadata,
 )
-from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
 from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type
 from cudf._lib.types cimport dtype_to_data_type
-from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table
+from cudf._lib.utils cimport data_from_unique_ptr
+
+import cudf._lib.pylibcudf as plc
 
 
 cdef json_recovery_mode_t _get_json_recovery_mode(object on_bad_lines):
@@ -175,45 +164,27 @@ def write_json(
     --------
     cudf.to_json
     """
-    cdef table_view input_table_view = table_view_from_table(
-        table, ignore_index=True
-    )
-
-    cdef unique_ptr[data_sink] data_sink_c
-    cdef sink_info sink_info_c = make_sink_info(path_or_buf, data_sink_c)
-    cdef string na_c = na_rep.encode()
-    cdef bool include_nulls_c = include_nulls
-    cdef bool lines_c = lines
-    cdef int rows_per_chunk_c = rows_per_chunk
-    cdef string true_value_c = 'true'.encode()
-    cdef string false_value_c = 'false'.encode()
-    cdef table_metadata tbl_meta
-
-    num_index_cols_meta = 0
-    cdef column_name_info child_info
-    for i, name in enumerate(table._column_names, num_index_cols_meta):
-        child_info.name = name.encode()
-        tbl_meta.schema_info.push_back(child_info)
-        _set_col_children_metadata(
-            table[name]._column,
-            tbl_meta.schema_info[i]
-        )
+    cdef list colnames = []
 
-    cdef json_writer_options options = move(
-        json_writer_options.builder(sink_info_c, input_table_view)
-        .metadata(tbl_meta)
-        .na_rep(na_c)
-        .include_nulls(include_nulls_c)
-        .lines(lines_c)
-        .rows_per_chunk(rows_per_chunk_c)
-        .true_value(true_value_c)
-        .false_value(false_value_c)
-        .build()
-    )
+    for name in table._column_names:
+        colnames.append((name, _dtype_to_names_list(table[name]._column)))
 
     try:
-        with nogil:
-            libcudf_write_json(options)
+        plc.io.json.write_json(
+            plc.io.SinkInfo([path_or_buf]),
+            plc.io.TableWithMetadata(
+                plc.Table([
+                    c.to_pylibcudf(mode="read") for c in table._columns
+                ]),
+                colnames
+            ),
+            na_rep,
+            include_nulls,
+            lines,
+            rows_per_chunk,
+            true_value="true",
+            false_value="false"
+        )
     except OverflowError:
         raise OverflowError(
             f"Writing JSON file with rows_per_chunk={rows_per_chunk} failed. "
@@ -254,23 +225,12 @@ cdef data_type _get_cudf_data_type_from_dtype(object dtype) except *:
         )
     return dtype_to_data_type(dtype)
 
-cdef _set_col_children_metadata(Column col,
-                                column_name_info& col_meta):
-    cdef column_name_info child_info
+
+def _dtype_to_names_list(col):
     if isinstance(col.dtype, cudf.StructDtype):
-        for i, (child_col, name) in enumerate(
-            zip(col.children, list(col.dtype.fields))
-        ):
-            child_info.name = name.encode()
-            col_meta.children.push_back(child_info)
-            _set_col_children_metadata(
-                child_col, col_meta.children[i]
-            )
+        return [(name, _dtype_to_names_list(child))
+                for name, child in zip(col.dtype.fields, col.children)]
     elif isinstance(col.dtype, cudf.ListDtype):
-        for i, child_col in enumerate(col.children):
-            col_meta.children.push_back(child_info)
-            _set_col_children_metadata(
-                child_col, col_meta.children[i]
-            )
-    else:
-        return
+        return [("", _dtype_to_names_list(child))
+                for child in col.children]
+    return []
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt
index 32f0f5543e4..084b341ec48 100644
--- a/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt
@@ -12,7 +12,7 @@
 # the License.
 # =============================================================================
 
-set(cython_sources avro.pyx datasource.pyx types.pyx)
+set(cython_sources avro.pyx datasource.pyx json.pyx types.pyx)
 
 set(linked_libraries cudf::cudf)
 rapids_cython_create_modules(
@@ -21,5 +21,7 @@ rapids_cython_create_modules(
   LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_io_ ASSOCIATED_TARGETS cudf
 )
 
-set(targets_using_arrow_headers pylibcudf_io_avro pylibcudf_io_datasource pylibcudf_io_types)
+set(targets_using_arrow_headers pylibcudf_io_avro pylibcudf_io_datasource pylibcudf_io_json
+                                pylibcudf_io_types
+)
 link_to_pyarrow_headers("${targets_using_arrow_headers}")
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd
index cfd6d2cd281..ef4c65b277e 100644
--- a/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd
@@ -1,4 +1,4 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from . cimport avro, datasource, types
+from . cimport avro, datasource, json, types
 from .types cimport SourceInfo, TableWithMetadata
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/__init__.py b/python/cudf/cudf/_lib/pylibcudf/io/__init__.py
index a54ba1834dc..fb4e4c7e4bb 100644
--- a/python/cudf/cudf/_lib/pylibcudf/io/__init__.py
+++ b/python/cudf/cudf/_lib/pylibcudf/io/__init__.py
@@ -1,4 +1,4 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from . import avro, datasource, types
-from .types import SourceInfo, TableWithMetadata
+from . import avro, datasource, json, types
+from .types import SinkInfo, SourceInfo, TableWithMetadata
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx b/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx
index 946e0896fc8..538bd8aa322 100644
--- a/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx
@@ -19,7 +19,7 @@ cpdef TableWithMetadata read_avro(
     size_type num_rows = -1
 ):
     """
-    Reads an Avro dataset into a set of columns.
+    Reads an Avro dataset into a :py:class:`~.types.TableWithMetadata`.
 
     Parameters
     ----------
@@ -36,7 +36,7 @@ cpdef TableWithMetadata read_avro(
     Returns
     -------
     TableWithMetadata
-        The Table and its corresponding metadata that was read in.
+        The Table and its corresponding metadata (column names) that were read in.
     """
     cdef vector[string] c_columns
     if columns is not None and len(columns) > 0:
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/json.pxd b/python/cudf/cudf/_lib/pylibcudf/io/json.pxd
new file mode 100644
index 00000000000..a91d574131f
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/io/json.pxd
@@ -0,0 +1,18 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp cimport bool
+
+from cudf._lib.pylibcudf.io.types cimport SinkInfo, TableWithMetadata
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
+
+
+cpdef void write_json(
+    SinkInfo sink_info,
+    TableWithMetadata tbl,
+    str na_rep = *,
+    bool include_nulls = *,
+    bool lines = *,
+    size_type rows_per_chunk = *,
+    str true_value = *,
+    str false_value = *
+)
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/json.pyx b/python/cudf/cudf/_lib/pylibcudf/io/json.pyx
new file mode 100644
index 00000000000..7530eba3803
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/io/json.pyx
@@ -0,0 +1,68 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp cimport bool
+from libcpp.limits cimport numeric_limits
+from libcpp.string cimport string
+
+from cudf._lib.pylibcudf.io.types cimport SinkInfo, TableWithMetadata
+from cudf._lib.pylibcudf.libcudf.io.json cimport (
+    json_writer_options,
+    write_json as cpp_write_json,
+)
+from cudf._lib.pylibcudf.libcudf.io.types cimport table_metadata
+from cudf._lib.pylibcudf.types cimport size_type
+
+
+cpdef void write_json(
+    SinkInfo sink_info,
+    TableWithMetadata table_w_meta,
+    str na_rep = "",
+    bool include_nulls = False,
+    bool lines = False,
+    size_type rows_per_chunk = numeric_limits[size_type].max(),
+    str true_value = "true",
+    str false_value = "false"
+):
+    """
+    Writes a :py:class:`~cudf._lib.pylibcudf.table.Table` to JSON format.
+
+    Parameters
+    ----------
+    sink_info: SinkInfo
+        The SinkInfo object to write the JSON to.
+    table_w_meta: TableWithMetadata
+        The TableWithMetadata object containing the Table to write
+    na_rep: str, default ""
+        The string representation for null values.
+    include_nulls: bool, default False
+        Enables/Disables output of nulls as 'null'.
+    lines: bool, default False
+        If `True`, write output in the JSON lines format.
+    rows_per_chunk: size_type, defaults to length of the input table
+        The maximum number of rows to write at a time.
+    true_value: str, default "true"
+        The string representation for values != 0 in INT8 types.
+    false_value: str, default "false"
+        The string representation for values == 0 in INT8 types.
+    """
+    cdef table_metadata tbl_meta = table_w_meta.metadata
+    cdef string na_rep_c = na_rep.encode()
+
+    cdef json_writer_options options = (
+        json_writer_options.builder(sink_info.c_obj, table_w_meta.tbl.view())
+        .metadata(tbl_meta)
+        .na_rep(na_rep_c)
+        .include_nulls(include_nulls)
+        .lines(lines)
+        .build()
+    )
+
+    if rows_per_chunk != numeric_limits[size_type].max():
+        options.set_rows_per_chunk(rows_per_chunk)
+    if true_value != "true":
+        options.set_true_value(<string>true_value.encode())
+    if false_value != "false":
+        options.set_false_value(<string>false_value.encode())
+
+    with nogil:
+        cpp_write_json(options)
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pxd b/python/cudf/cudf/_lib/pylibcudf/io/types.pxd
index aa846a47343..88daf54f33b 100644
--- a/python/cudf/cudf/_lib/pylibcudf/io/types.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/io/types.pxd
@@ -1,4 +1,8 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
+from libcpp.memory cimport unique_ptr
+from libcpp.vector cimport vector
+
+from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink
 from cudf._lib.pylibcudf.libcudf.io.types cimport (
     column_encoding,
     column_in_metadata,
@@ -22,8 +26,15 @@ cdef class TableWithMetadata:
     cdef public Table tbl
     cdef table_metadata metadata
 
+    cdef vector[column_name_info] _make_column_info(self, list column_names)
+
     @staticmethod
     cdef TableWithMetadata from_libcudf(table_with_metadata& tbl)
 
 cdef class SourceInfo:
     cdef source_info c_obj
+
+cdef class SinkInfo:
+    # This vector just exists to keep the unique_ptrs to the sinks alive
+    cdef vector[unique_ptr[data_sink]] sink_storage
+    cdef sink_info c_obj
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx
index ab3375da662..f94e20970a4 100644
--- a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx
@@ -1,17 +1,23 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
+from cpython.buffer cimport PyBUF_READ
+from cpython.memoryview cimport PyMemoryView_FromMemory
+from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
 
 from cudf._lib.pylibcudf.io.datasource cimport Datasource
+from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink
 from cudf._lib.pylibcudf.libcudf.io.datasource cimport datasource
 from cudf._lib.pylibcudf.libcudf.io.types cimport (
+    column_name_info,
     host_buffer,
     source_info,
     table_with_metadata,
 )
 
+import codecs
 import errno
 import io
 import os
@@ -22,7 +28,39 @@ cdef class TableWithMetadata:
     (e.g. column names)
 
     For details, see :cpp:class:`cudf::io::table_with_metadata`.
+
+    Parameters
+    ----------
+    tbl : Table
+        The input table.
+    column_names : list
+        A list of tuples each containing the name of each column
+        and the names of its child columns (in the same format).
+        e.g.
+        [("id", []), ("name", [("first", []), ("last", [])])]
+
     """
+    def __init__(self, Table tbl, list column_names):
+        self.tbl = tbl
+
+        self.metadata.schema_info = self._make_column_info(column_names)
+
+    cdef vector[column_name_info] _make_column_info(self, list column_names):
+        cdef vector[column_name_info] col_name_infos
+        cdef column_name_info info
+
+        col_name_infos.reserve(len(column_names))
+
+        for name, child_names in column_names:
+            if not isinstance(name, str):
+                raise ValueError("Column name must be a string!")
+
+            info.name = <string> name.encode()
+            info.children = self._make_column_info(child_names)
+
+            col_name_infos.push_back(info)
+
+        return col_name_infos
 
     @property
     def columns(self):
@@ -51,6 +89,7 @@ cdef class TableWithMetadata:
         out.metadata = tbl_with_meta.metadata
         return out
 
+
 cdef class SourceInfo:
     """A class containing details on a source to read from.
 
@@ -119,7 +158,87 @@ cdef class SourceInfo:
             raise ValueError("Sources must be a list of str/paths, "
                              "bytes, io.BytesIO, or a Datasource")
 
-        if empty_buffer is True:
-            c_host_buffers.push_back(host_buffer(<char*>NULL, 0))
+        self.c_obj = source_info(c_host_buffers)
+
+
+# Adapts a python io.IOBase object as a libcudf IO data_sink. This lets you
+# write from cudf to any python file-like object (File/BytesIO/SocketIO etc)
+cdef cppclass iobase_data_sink(data_sink):
+    object buf
+
+    iobase_data_sink(object buf_):
+        this.buf = buf_
+
+    void host_write(const void * data, size_t size) with gil:
+        if isinstance(buf, io.TextIOBase):
+            buf.write(PyMemoryView_FromMemory(<char*>data, size, PyBUF_READ)
+                      .tobytes().decode())
+        else:
+            buf.write(PyMemoryView_FromMemory(<char*>data, size, PyBUF_READ))
+
+    void flush() with gil:
+        buf.flush()
+
+    size_t bytes_written() with gil:
+        return buf.tell()
+
+
+cdef class SinkInfo:
+    """A class containing details on a source to read from.
+
+    For details, see :cpp:class:`cudf::io::sink_info`.
+
+    Parameters
+    ----------
+    sinks : list of str, PathLike, BytesIO, StringIO
+
+        A homogeneous list of sinks (this can be a string filename,
+        bytes, or one of the Python I/O classes) to read from.
+
+        Mixing different types of sinks will raise a `ValueError`.
+    """
+
+    def __init__(self, list sinks):
+        cdef vector[data_sink *] data_sinks
+        cdef vector[string] paths
+
+        if not sinks:
+            raise ValueError("Need to pass at least one sink")
+
+        if isinstance(sinks[0], os.PathLike):
+            sinks = [os.path.expanduser(s) for s in sinks]
+
+        cdef object initial_sink_cls = type(sinks[0])
+
+        if not all(isinstance(s, initial_sink_cls) for s in sinks):
+            raise ValueError("All sinks must be of the same type!")
+
+        if initial_sink_cls in {io.StringIO, io.BytesIO, io.TextIOBase}:
+            data_sinks.reserve(len(sinks))
+            if isinstance(sinks[0], (io.StringIO, io.BytesIO)):
+                for s in sinks:
+                    self.sink_storage.push_back(
+                        unique_ptr[data_sink](new iobase_data_sink(s))
+                    )
+            elif isinstance(sinks[0], io.TextIOBase):
+                for s in sinks:
+                    if codecs.lookup(s).name not in ('utf-8', 'ascii'):
+                        raise NotImplementedError(f"Unsupported encoding {s.encoding}")
+                    self.sink_storage.push_back(
+                        unique_ptr[data_sink](new iobase_data_sink(s.buffer))
+                    )
+            data_sinks.push_back(self.sink_storage.back().get())
+        elif initial_sink_cls is str:
+            paths.reserve(len(sinks))
+            for s in sinks:
+                paths.push_back(<string> s.encode())
+        else:
+            raise TypeError(
+                "Unrecognized input type: {}".format(type(sinks[0]))
+            )
 
-        self.c_obj = move(source_info(c_host_buffers))
+        if data_sinks.size() > 0:
+            self.c_obj = sink_info(data_sinks)
+        else:
+            # we don't have sinks so we must have paths to sinks
+            self.c_obj = sink_info(paths)
diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py
index bf927e661fe..f8bfe340ae5 100644
--- a/python/cudf/cudf/pylibcudf_tests/common/utils.py
+++ b/python/cudf/cudf/pylibcudf_tests/common/utils.py
@@ -1,24 +1,39 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 from __future__ import annotations
 
+import io
+import os
+
 import pyarrow as pa
 import pytest
 
 from cudf._lib import pylibcudf as plc
 
 
-def metadata_from_arrow_array(
-    pa_array: pa.Array,
+def metadata_from_arrow_type(
+    pa_type: pa.Array,
+    name: str = "",
 ) -> plc.interop.ColumnMetadata | None:
-    metadata = None
-    if pa.types.is_list(dtype := pa_array.type) or pa.types.is_struct(dtype):
+    metadata = plc.interop.ColumnMetadata(name)  # None
+    if pa.types.is_list(pa_type):
+        child_meta = [plc.interop.ColumnMetadata("offsets")]
+        for i in range(pa_type.num_fields):
+            field_meta = metadata_from_arrow_type(
+                pa_type.field(i).type, pa_type.field(i).name
+            )
+            child_meta.append(field_meta)
+        metadata = plc.interop.ColumnMetadata(name, child_meta)
+    elif pa.types.is_struct(pa_type):
+        child_meta = []
+        for i in range(pa_type.num_fields):
+            field_meta = metadata_from_arrow_type(
+                pa_type.field(i).type, pa_type.field(i).name
+            )
+            child_meta.append(field_meta)
         metadata = plc.interop.ColumnMetadata(
-            "",
+            name,
             # libcudf does not store field names, so just match pyarrow's.
-            [
-                plc.interop.ColumnMetadata(pa_array.type.field(i).name)
-                for i in range(pa_array.type.num_fields)
-            ],
+            child_meta,
         )
     return metadata
 
@@ -32,13 +47,13 @@ def assert_column_eq(
         rhs, plc.Column
     ):
         rhs = plc.interop.to_arrow(
-            rhs, metadata=metadata_from_arrow_array(lhs)
+            rhs, metadata=metadata_from_arrow_type(lhs.type)
         )
     elif isinstance(lhs, plc.Column) and isinstance(
         rhs, (pa.Array, pa.ChunkedArray)
     ):
         lhs = plc.interop.to_arrow(
-            lhs, metadata=metadata_from_arrow_array(rhs)
+            lhs, metadata=metadata_from_arrow_type(rhs.type)
         )
     else:
         raise ValueError(
@@ -94,21 +109,16 @@ def is_signed_integer(plc_dtype: plc.DataType):
     )
 
 
-def is_unsigned_integer(plc_dtype: plc.DataType):
-    return plc_dtype.id() in (
-        plc.TypeId.UINT8,
-        plc.TypeId.UINT16,
-        plc.TypeId.UINT32,
-        plc.TypeId.UINT64,
-    )
-
-
 def is_integer(plc_dtype: plc.DataType):
     return plc_dtype.id() in (
         plc.TypeId.INT8,
         plc.TypeId.INT16,
         plc.TypeId.INT32,
         plc.TypeId.INT64,
+        plc.TypeId.UINT8,
+        plc.TypeId.UINT16,
+        plc.TypeId.UINT32,
+        plc.TypeId.UINT64,
     )
 
 
@@ -135,8 +145,80 @@ def is_fixed_width(plc_dtype: plc.DataType):
     )
 
 
+def nesting_level(typ) -> tuple[int, int]:
+    """Return list and struct nesting of a pyarrow type."""
+    if isinstance(typ, pa.ListType):
+        list_, struct = nesting_level(typ.value_type)
+        return list_ + 1, struct
+    elif isinstance(typ, pa.StructType):
+        lists, structs = map(max, zip(*(nesting_level(t.type) for t in typ)))
+        return lists, structs + 1
+    else:
+        return 0, 0
+
+
+def is_nested_struct(typ):
+    return nesting_level(typ)[1] > 1
+
+
+def is_nested_list(typ):
+    return nesting_level(typ)[0] > 1
+
+
+def sink_to_str(sink):
+    """
+    Takes a sink (e.g. StringIO/BytesIO, filepath, etc.)
+    and reads in the contents into a string (str not bytes)
+    for comparison
+    """
+    if isinstance(sink, (str, os.PathLike)):
+        with open(sink, "r") as f:
+            str_result = f.read()
+    elif isinstance(sink, io.BytesIO):
+        sink.seek(0)
+        str_result = sink.read().decode()
+    else:
+        sink.seek(0)
+        str_result = sink.read()
+    return str_result
+
+
+NUMERIC_PA_TYPES = [pa.int64(), pa.float64(), pa.uint64()]
+STRING_PA_TYPES = [pa.string()]
+BOOL_PA_TYPES = [pa.bool_()]
+LIST_PA_TYPES = [
+    pa.list_(pa.int64()),
+    # Nested case
+    pa.list_(pa.list_(pa.int64())),
+]
+
 # We must explicitly specify this type via a field to ensure we don't include
 # nullability accidentally.
 DEFAULT_STRUCT_TESTING_TYPE = pa.struct(
     [pa.field("v", pa.int64(), nullable=False)]
 )
+NESTED_STRUCT_TESTING_TYPE = pa.struct(
+    [
+        pa.field("a", pa.int64(), nullable=False),
+        pa.field(
+            "b_struct",
+            pa.struct([pa.field("b", pa.float64(), nullable=False)]),
+            nullable=False,
+        ),
+    ]
+)
+
+DEFAULT_PA_STRUCT_TESTING_TYPES = [
+    DEFAULT_STRUCT_TESTING_TYPE,
+    NESTED_STRUCT_TESTING_TYPE,
+]
+
+DEFAULT_PA_TYPES = (
+    NUMERIC_PA_TYPES
+    + STRING_PA_TYPES
+    + BOOL_PA_TYPES
+    + LIST_PA_TYPES
+    + DEFAULT_PA_STRUCT_TESTING_TYPES
+)
+
+ALL_PA_TYPES = DEFAULT_PA_TYPES
diff --git a/python/cudf/cudf/pylibcudf_tests/conftest.py b/python/cudf/cudf/pylibcudf_tests/conftest.py
index b169bbdee5b..e4760ea7ac8 100644
--- a/python/cudf/cudf/pylibcudf_tests/conftest.py
+++ b/python/cudf/cudf/pylibcudf_tests/conftest.py
@@ -1,9 +1,12 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 # Tell ruff it's OK that some imports occur after the sys.path.insert
 # ruff: noqa: E402
+import io
 import os
+import pathlib
 import sys
 
+import numpy as np
 import pyarrow as pa
 import pytest
 
@@ -11,7 +14,7 @@
 
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), "common"))
 
-from utils import DEFAULT_STRUCT_TESTING_TYPE
+from utils import ALL_PA_TYPES, DEFAULT_PA_TYPES, NUMERIC_PA_TYPES
 
 
 # This fixture defines the standard set of types that all tests should default to
@@ -20,14 +23,7 @@
 # across modules. Otherwise it may be defined on a per-module basis.
 @pytest.fixture(
     scope="session",
-    params=[
-        pa.int64(),
-        pa.float64(),
-        pa.string(),
-        pa.bool_(),
-        pa.list_(pa.int64()),
-        DEFAULT_STRUCT_TESTING_TYPE,
-    ],
+    params=DEFAULT_PA_TYPES,
 )
 def pa_type(request):
     return request.param
@@ -35,16 +31,96 @@ def pa_type(request):
 
 @pytest.fixture(
     scope="session",
-    params=[
-        pa.int64(),
-        pa.float64(),
-        pa.uint64(),
-    ],
+    params=NUMERIC_PA_TYPES,
 )
 def numeric_pa_type(request):
     return request.param
 
 
+# TODO: Consider adding another fixture/adapting this
+# fixture to consider nullability
+@pytest.fixture(scope="session", params=[0, 100])
+def table_data(request):
+    """
+    Returns (TableWithMetadata, pa_table).
+
+    This is the default fixture you should be using for testing
+    pylibcudf I/O writers.
+
+    Contains one of each category (e.g. int, bool, list, struct)
+    of dtypes.
+    """
+    nrows = request.param
+
+    table_dict = {}
+    # Colnames in the format expected by
+    # plc.io.TableWithMetadata
+    colnames = []
+
+    np.random.seed(42)
+
+    for typ in ALL_PA_TYPES:
+        rand_vals = np.random.randint(0, nrows, nrows)
+        child_colnames = []
+
+        def _generate_nested_data(typ):
+            child_colnames = []
+
+            # recurse to get vals for children
+            rand_arrs = []
+            for i in range(typ.num_fields):
+                rand_arr, grandchild_colnames = _generate_nested_data(
+                    typ.field(i).type
+                )
+                rand_arrs.append(rand_arr)
+                child_colnames.append((typ.field(i).name, grandchild_colnames))
+
+            if isinstance(typ, pa.StructType):
+                pa_array = pa.StructArray.from_arrays(
+                    [rand_arr for rand_arr in rand_arrs],
+                    names=[typ.field(i).name for i in range(typ.num_fields)],
+                )
+            elif isinstance(typ, pa.ListType):
+                pa_array = pa.array(
+                    [list(row_vals) for row_vals in zip(rand_arrs[0])],
+                    type=typ,
+                )
+                child_colnames.append(("", grandchild_colnames))
+            else:
+                # typ is scalar type
+                pa_array = pa.array(rand_vals).cast(typ)
+            return pa_array, child_colnames
+
+        if isinstance(typ, (pa.ListType, pa.StructType)):
+            rand_arr, child_colnames = _generate_nested_data(typ)
+        else:
+            rand_arr = pa.array(rand_vals).cast(typ)
+
+        table_dict[f"col_{typ}"] = rand_arr
+        colnames.append((f"col_{typ}", child_colnames))
+
+    pa_table = pa.Table.from_pydict(table_dict)
+
+    return plc.io.TableWithMetadata(
+        plc.interop.from_arrow(pa_table), column_names=colnames
+    ), pa_table
+
+
+@pytest.fixture(
+    params=["a.txt", pathlib.Path("a.txt"), io.BytesIO, io.StringIO],
+)
+def source_or_sink(request, tmp_path):
+    fp_or_buf = request.param
+    if isinstance(fp_or_buf, str):
+        return f"{tmp_path}/{fp_or_buf}"
+    elif isinstance(fp_or_buf, os.PathLike):
+        return tmp_path.joinpath(fp_or_buf)
+    elif issubclass(fp_or_buf, io.IOBase):
+        # Must construct io.StringIO/io.BytesIO inside
+        # fixture, or we'll end up re-using it
+        return fp_or_buf()
+
+
 @pytest.fixture(
     scope="session", params=[opt for opt in plc.types.Interpolation]
 )
diff --git a/python/cudf/cudf/pylibcudf_tests/test_avro.py b/python/cudf/cudf/pylibcudf_tests/io/test_avro.py
similarity index 100%
rename from python/cudf/cudf/pylibcudf_tests/test_avro.py
rename to python/cudf/cudf/pylibcudf_tests/io/test_avro.py
diff --git a/python/cudf/cudf/pylibcudf_tests/io/test_json.py b/python/cudf/cudf/pylibcudf_tests/io/test_json.py
new file mode 100644
index 00000000000..d6b8bfa6976
--- /dev/null
+++ b/python/cudf/cudf/pylibcudf_tests/io/test_json.py
@@ -0,0 +1,116 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+import io
+
+import pyarrow as pa
+import pytest
+from utils import sink_to_str
+
+import cudf._lib.pylibcudf as plc
+
+
+@pytest.mark.parametrize("rows_per_chunk", [8, 100])
+@pytest.mark.parametrize("lines", [True, False])
+def test_write_json_basic(table_data, source_or_sink, lines, rows_per_chunk):
+    plc_table_w_meta, pa_table = table_data
+    sink = source_or_sink
+
+    plc.io.json.write_json(
+        plc.io.SinkInfo([sink]),
+        plc_table_w_meta,
+        lines=lines,
+        rows_per_chunk=rows_per_chunk,
+    )
+
+    exp = pa_table.to_pandas()
+
+    # Convert everything to string to make
+    # comparisons easier
+    str_result = sink_to_str(sink)
+
+    pd_result = exp.to_json(orient="records", lines=lines)
+
+    assert str_result == pd_result
+
+
+@pytest.mark.parametrize("include_nulls", [True, False])
+@pytest.mark.parametrize("na_rep", ["null", "awef", ""])
+def test_write_json_nulls(na_rep, include_nulls):
+    names = ["a", "b"]
+    pa_tbl = pa.Table.from_arrays(
+        [pa.array([1.0, 2.0, None]), pa.array([True, None, False])],
+        names=names,
+    )
+    plc_tbl = plc.interop.from_arrow(pa_tbl)
+    plc_tbl_w_meta = plc.io.types.TableWithMetadata(
+        plc_tbl, column_names=[(name, []) for name in names]
+    )
+
+    sink = io.StringIO()
+
+    plc.io.json.write_json(
+        plc.io.SinkInfo([sink]),
+        plc_tbl_w_meta,
+        na_rep=na_rep,
+        include_nulls=include_nulls,
+    )
+
+    exp = pa_tbl.to_pandas()
+
+    # Convert everything to string to make
+    # comparisons easier
+    str_result = sink_to_str(sink)
+    pd_result = exp.to_json(orient="records")
+
+    if not include_nulls:
+        # No equivalent in pandas, so we just
+        # sanity check by making sure na_rep
+        # doesn't appear in the output
+
+        # don't quote null
+        for name in names:
+            assert f'{{"{name}":{na_rep}}}' not in str_result
+        return
+
+    # pandas doesn't suppport na_rep
+    # let's just manually do str.replace
+    pd_result = pd_result.replace("null", na_rep)
+
+    assert str_result == pd_result
+
+
+@pytest.mark.parametrize("true_value", ["True", "correct"])
+@pytest.mark.parametrize("false_value", ["False", "wrong"])
+def test_write_json_bool_opts(true_value, false_value):
+    names = ["a"]
+    pa_tbl = pa.Table.from_arrays([pa.array([True, None, False])], names=names)
+    plc_tbl = plc.interop.from_arrow(pa_tbl)
+    plc_tbl_w_meta = plc.io.types.TableWithMetadata(
+        plc_tbl, column_names=[(name, []) for name in names]
+    )
+
+    sink = io.StringIO()
+
+    plc.io.json.write_json(
+        plc.io.SinkInfo([sink]),
+        plc_tbl_w_meta,
+        include_nulls=True,
+        na_rep="null",
+        true_value=true_value,
+        false_value=false_value,
+    )
+
+    exp = pa_tbl.to_pandas()
+
+    # Convert everything to string to make
+    # comparisons easier
+    str_result = sink_to_str(sink)
+    pd_result = exp.to_json(orient="records")
+
+    # pandas doesn't suppport na_rep
+    # let's just manually do str.replace
+    if true_value != "true":
+        pd_result = pd_result.replace("true", true_value)
+    if false_value != "false":
+        pd_result = pd_result.replace("false", false_value)
+
+    assert str_result == pd_result
diff --git a/python/cudf/cudf/pylibcudf_tests/test_source_info.py b/python/cudf/cudf/pylibcudf_tests/io/test_source_sink_info.py
similarity index 72%
rename from python/cudf/cudf/pylibcudf_tests/test_source_info.py
rename to python/cudf/cudf/pylibcudf_tests/io/test_source_sink_info.py
index 019321b7259..287dd8f21c8 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_source_info.py
+++ b/python/cudf/cudf/pylibcudf_tests/io/test_source_sink_info.py
@@ -9,6 +9,21 @@
 from cudf._lib.pylibcudf.io.datasource import NativeFileDatasource
 
 
+@pytest.fixture(params=[plc.io.SourceInfo, plc.io.SinkInfo])
+def io_class(request):
+    return request.param
+
+
+def _skip_invalid_sinks(io_class, sink):
+    """
+    Skip invalid sinks for SinkInfo
+    """
+    if io_class is plc.io.SinkInfo and isinstance(
+        sink, (bytes, NativeFileDatasource)
+    ):
+        pytest.skip(f"{sink} is not a valid input for SinkInfo")
+
+
 @pytest.mark.parametrize(
     "source",
     [
@@ -18,16 +33,15 @@
         NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r")),
     ],
 )
-def test_source_info_ctor(source, tmp_path):
+def test_source_info_ctor(io_class, source, tmp_path):
     if isinstance(source, str):
         file = tmp_path / source
         file.write_bytes("hello world".encode("utf-8"))
         source = str(file)
 
-    plc.io.SourceInfo([source])
+    _skip_invalid_sinks(io_class, source)
 
-    # TODO: test contents of source_info buffer is correct
-    # once buffers are exposed on python side
+    io_class([source])
 
 
 @pytest.mark.parametrize(
@@ -42,7 +56,7 @@ def test_source_info_ctor(source, tmp_path):
         ],
     ],
 )
-def test_source_info_ctor_multiple(sources, tmp_path):
+def test_source_info_ctor_multiple(io_class, sources, tmp_path):
     for i in range(len(sources)):
         source = sources[i]
         if isinstance(source, str):
@@ -50,10 +64,9 @@ def test_source_info_ctor_multiple(sources, tmp_path):
             file.write_bytes("hello world".encode("utf-8"))
             sources[i] = str(file)
 
-    plc.io.SourceInfo(sources)
+        _skip_invalid_sinks(io_class, source)
 
-    # TODO: test contents of source_info buffer is correct
-    # once buffers are exposed on python side
+    io_class(sources)
 
 
 @pytest.mark.parametrize(
@@ -73,7 +86,7 @@ def test_source_info_ctor_multiple(sources, tmp_path):
         ],
     ],
 )
-def test_source_info_ctor_mixing_invalid(sources, tmp_path):
+def test_source_info_ctor_mixing_invalid(io_class, sources, tmp_path):
     # Unlike the previous test
     # don't create files so that they are missing
     for i in range(len(sources)):
@@ -82,8 +95,9 @@ def test_source_info_ctor_mixing_invalid(sources, tmp_path):
             file = tmp_path / source
             file.write_bytes("hello world".encode("utf-8"))
             sources[i] = str(file)
+        _skip_invalid_sinks(io_class, source)
     with pytest.raises(ValueError):
-        plc.io.SourceInfo(sources)
+        io_class(sources)
 
 
 def test_source_info_invalid():
diff --git a/python/cudf/cudf/pylibcudf_tests/test_copying.py b/python/cudf/cudf/pylibcudf_tests/test_copying.py
index da3ca3a6d1e..0a6df198d46 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_copying.py
+++ b/python/cudf/cudf/pylibcudf_tests/test_copying.py
@@ -5,19 +5,24 @@
 import pytest
 from utils import (
     DEFAULT_STRUCT_TESTING_TYPE,
+    NESTED_STRUCT_TESTING_TYPE,
     assert_column_eq,
     assert_table_eq,
     cudf_raises,
     is_fixed_width,
     is_floating,
     is_integer,
+    is_nested_list,
+    is_nested_struct,
     is_string,
-    metadata_from_arrow_array,
+    metadata_from_arrow_type,
 )
 
 from cudf._lib import pylibcudf as plc
 
 
+# TODO: consider moving this to conftest and "pairing"
+# it with pa_type, so that they don't get out of sync
 # TODO: Test nullable data
 @pytest.fixture(scope="module")
 def input_column(pa_type):
@@ -28,10 +33,27 @@ def input_column(pa_type):
     elif pa.types.is_boolean(pa_type):
         pa_array = pa.array([True, True, False], type=pa_type)
     elif pa.types.is_list(pa_type):
-        # TODO: Add heterogenous sizes
-        pa_array = pa.array([[1], [2], [3]], type=pa_type)
+        if pa_type.value_type == pa.int64():
+            pa_array = pa.array([[1], [2, 3], [3]], type=pa_type)
+        elif (
+            isinstance(pa_type.value_type, pa.ListType)
+            and pa_type.value_type.value_type == pa.int64()
+        ):
+            pa_array = pa.array([[[1]], [[2, 3]], [[3]]], type=pa_type)
+        else:
+            raise ValueError("Unsupported type " + pa_type.value_type)
     elif pa.types.is_struct(pa_type):
-        pa_array = pa.array([{"v": 1}, {"v": 2}, {"v": 3}], type=pa_type)
+        if not is_nested_struct(pa_type):
+            pa_array = pa.array([{"v": 1}, {"v": 2}, {"v": 3}], type=pa_type)
+        else:
+            pa_array = pa.array(
+                [
+                    {"a": 1, "b_struct": {"b": 1.0}},
+                    {"a": 2, "b_struct": {"b": 2.0}},
+                    {"a": 3, "b_struct": {"b": 3.0}},
+                ],
+                type=pa_type,
+            )
     else:
         raise ValueError("Unsupported type")
     return pa_array, plc.interop.from_arrow(pa_array)
@@ -55,13 +77,37 @@ def target_column(pa_type):
             [False, True, True, False, True, False], type=pa_type
         )
     elif pa.types.is_list(pa_type):
-        # TODO: Add heterogenous sizes
-        pa_array = pa.array([[4], [5], [6], [7], [8], [9]], type=pa_type)
+        if pa_type.value_type == pa.int64():
+            pa_array = pa.array(
+                [[4], [5, 6], [7], [8], [9], [10]], type=pa_type
+            )
+        elif (
+            isinstance(pa_type.value_type, pa.ListType)
+            and pa_type.value_type.value_type == pa.int64()
+        ):
+            pa_array = pa.array(
+                [[[4]], [[5, 6]], [[7]], [[8]], [[9]], [[10]]], type=pa_type
+            )
+        else:
+            raise ValueError("Unsupported type")
     elif pa.types.is_struct(pa_type):
-        pa_array = pa.array(
-            [{"v": 4}, {"v": 5}, {"v": 6}, {"v": 7}, {"v": 8}, {"v": 9}],
-            type=pa_type,
-        )
+        if not is_nested_struct(pa_type):
+            pa_array = pa.array(
+                [{"v": 4}, {"v": 5}, {"v": 6}, {"v": 7}, {"v": 8}, {"v": 9}],
+                type=pa_type,
+            )
+        else:
+            pa_array = pa.array(
+                [
+                    {"a": 4, "b_struct": {"b": 4.0}},
+                    {"a": 5, "b_struct": {"b": 5.0}},
+                    {"a": 6, "b_struct": {"b": 6.0}},
+                    {"a": 7, "b_struct": {"b": 7.0}},
+                    {"a": 8, "b_struct": {"b": 8.0}},
+                    {"a": 9, "b_struct": {"b": 9.0}},
+                ],
+                type=pa_type,
+            )
     else:
         raise ValueError("Unsupported type")
     return pa_array, plc.interop.from_arrow(pa_array)
@@ -96,10 +142,22 @@ def source_scalar(pa_type):
     elif pa.types.is_boolean(pa_type):
         pa_scalar = pa.scalar(False, type=pa_type)
     elif pa.types.is_list(pa_type):
-        # TODO: Longer list?
-        pa_scalar = pa.scalar([1], type=pa_type)
+        if pa_type.value_type == pa.int64():
+            pa_scalar = pa.scalar([1, 2, 3, 4], type=pa_type)
+        elif (
+            isinstance(pa_type.value_type, pa.ListType)
+            and pa_type.value_type.value_type == pa.int64()
+        ):
+            pa_scalar = pa.scalar([[1, 2, 3, 4]], type=pa_type)
+        else:
+            raise ValueError("Unsupported type")
     elif pa.types.is_struct(pa_type):
-        pa_scalar = pa.scalar({"v": 1}, type=pa_type)
+        if not is_nested_struct(pa_type):
+            pa_scalar = pa.scalar({"v": 1}, type=pa_type)
+        else:
+            pa_scalar = pa.scalar(
+                {"a": 1, "b_struct": {"b": 1.0}}, type=pa_type
+            )
     else:
         raise ValueError("Unsupported type")
     return pa_scalar, plc.interop.from_arrow(pa_scalar)
@@ -196,27 +254,54 @@ def test_scatter_table(
             )
 
         if pa.types.is_list(dtype := pa_target_table[0].type):
-            expected = pa.table(
-                [pa.array([[4], [1], [2], [3], [8], [9]])] * 3, [""] * 3
-            )
+            if is_nested_list(dtype):
+                expected = pa.table(
+                    [pa.array([[[4]], [[1]], [[2, 3]], [[3]], [[9]], [[10]]])]
+                    * 3,
+                    [""] * 3,
+                )
+            else:
+                expected = pa.table(
+                    [pa.array([[4], [1], [2, 3], [3], [9], [10]])] * 3,
+                    [""] * 3,
+                )
         elif pa.types.is_struct(dtype):
-            expected = pa.table(
-                [
-                    pa.array(
-                        [
-                            {"v": 4},
-                            {"v": 1},
-                            {"v": 2},
-                            {"v": 3},
-                            {"v": 8},
-                            {"v": 9},
-                        ],
-                        type=DEFAULT_STRUCT_TESTING_TYPE,
-                    )
-                ]
-                * 3,
-                [""] * 3,
-            )
+            if is_nested_struct(dtype):
+                expected = pa.table(
+                    [
+                        pa.array(
+                            [
+                                {"a": 4, "b_struct": {"b": 4.0}},
+                                {"a": 1, "b_struct": {"b": 1.0}},
+                                {"a": 2, "b_struct": {"b": 2.0}},
+                                {"a": 3, "b_struct": {"b": 3.0}},
+                                {"a": 8, "b_struct": {"b": 8.0}},
+                                {"a": 9, "b_struct": {"b": 9.0}},
+                            ],
+                            type=NESTED_STRUCT_TESTING_TYPE,
+                        )
+                    ]
+                    * 3,
+                    [""] * 3,
+                )
+            else:
+                expected = pa.table(
+                    [
+                        pa.array(
+                            [
+                                {"v": 4},
+                                {"v": 1},
+                                {"v": 2},
+                                {"v": 3},
+                                {"v": 8},
+                                {"v": 9},
+                            ],
+                            type=DEFAULT_STRUCT_TESTING_TYPE,
+                        )
+                    ]
+                    * 3,
+                    [""] * 3,
+                )
     else:
         expected = _pyarrow_boolean_mask_scatter_table(
             pa_source_table,
@@ -627,6 +712,7 @@ def test_split_column_out_of_bounds(target_column):
 
 def test_split_table(target_table):
     pa_target_table, plc_target_table = target_table
+
     upper_bounds = [1, 3, 5]
     lower_bounds = [0] + upper_bounds[:-1]
     result = plc.copying.split(plc_target_table, upper_bounds)
@@ -718,6 +804,7 @@ def test_copy_if_else_column_scalar(
     pa_target_column, plc_target_column = target_column
     pa_source_scalar, plc_source_scalar = source_scalar
     pa_mask, plc_mask = mask
+
     args = (
         (plc_target_column, plc_source_scalar)
         if array_left
@@ -766,27 +853,58 @@ def test_boolean_mask_scatter_from_table(
             )
 
         if pa.types.is_list(dtype := pa_target_table[0].type):
-            expected = pa.table(
-                [pa.array([[1], [5], [2], [7], [3], [9]])] * 3, [""] * 3
-            )
+            if is_nested_list(dtype):
+                expected = pa.table(
+                    [
+                        pa.array(
+                            [[[1]], [[5, 6]], [[2, 3]], [[8]], [[3]], [[10]]]
+                        )
+                    ]
+                    * 3,
+                    [""] * 3,
+                )
+            else:
+                expected = pa.table(
+                    [pa.array([[1], [5, 6], [2, 3], [8], [3], [10]])] * 3,
+                    [""] * 3,
+                )
         elif pa.types.is_struct(dtype):
-            expected = pa.table(
-                [
-                    pa.array(
-                        [
-                            {"v": 1},
-                            {"v": 5},
-                            {"v": 2},
-                            {"v": 7},
-                            {"v": 3},
-                            {"v": 9},
-                        ],
-                        type=DEFAULT_STRUCT_TESTING_TYPE,
-                    )
-                ]
-                * 3,
-                [""] * 3,
-            )
+            if is_nested_struct(dtype):
+                expected = pa.table(
+                    [
+                        pa.array(
+                            [
+                                {"a": 1, "b_struct": {"b": 1.0}},
+                                {"a": 5, "b_struct": {"b": 5.0}},
+                                {"a": 2, "b_struct": {"b": 2.0}},
+                                {"a": 7, "b_struct": {"b": 7.0}},
+                                {"a": 3, "b_struct": {"b": 3.0}},
+                                {"a": 9, "b_struct": {"b": 9.0}},
+                            ],
+                            type=NESTED_STRUCT_TESTING_TYPE,
+                        )
+                    ]
+                    * 3,
+                    [""] * 3,
+                )
+            else:
+                expected = pa.table(
+                    [
+                        pa.array(
+                            [
+                                {"v": 1},
+                                {"v": 5},
+                                {"v": 2},
+                                {"v": 7},
+                                {"v": 3},
+                                {"v": 9},
+                            ],
+                            type=DEFAULT_STRUCT_TESTING_TYPE,
+                        )
+                    ]
+                    * 3,
+                    [""] * 3,
+                )
     else:
         expected = _pyarrow_boolean_mask_scatter_table(
             pa_source_table, pa_mask, pa_target_table
@@ -887,7 +1005,7 @@ def test_get_element(input_column):
 
     assert (
         plc.interop.to_arrow(
-            result, metadata_from_arrow_array(pa_input_column)
+            result, metadata_from_arrow_type(pa_input_column.type)
         ).as_py()
         == pa_input_column[index].as_py()
     )

From 64325a1bafeb97e8399e497cc9f4f6ffaee0fd14 Mon Sep 17 00:00:00 2001
From: Kyle Edwards <kyedwards@nvidia.com>
Date: Tue, 2 Jul 2024 11:52:02 -0400
Subject: [PATCH 450/842] Run DFG after verify-alpha-spec (#16151)

Because `verify-alpha-spec` potentially modifies `dependencies.yaml`, we want to run DFG after it. This should have been included in #16144 but was forgotten.

Authors:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)

Approvers:
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/cudf/pull/16151
---
 .pre-commit-config.yaml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index d0457d2c641..bbcd78d051f 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -136,11 +136,6 @@ repos:
             .*test.*|
             ^CHANGELOG.md$
           )
-  - repo: https://github.com/rapidsai/dependency-file-generator
-    rev: v1.13.11
-    hooks:
-      - id: rapids-dependency-file-generator
-        args: ["--clean"]
   - repo: https://github.com/astral-sh/ruff-pre-commit
     rev: v0.4.8
     hooks:
@@ -159,6 +154,11 @@ repos:
             cpp/src/io/parquet/ipc/Schema_generated[.]h$
           )
       - id: verify-alpha-spec
+  - repo: https://github.com/rapidsai/dependency-file-generator
+    rev: v1.13.11
+    hooks:
+      - id: rapids-dependency-file-generator
+        args: ["--clean"]
 
 default_language_version:
       python: python3

From 04e3aa9ffad64cf6682b5d1677d9df66a44d8f53 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Tue, 2 Jul 2024 09:55:13 -0700
Subject: [PATCH 451/842] Remove the (unused) implementation of
 `host_parse_nested_json` (#16135)

Follow-up for #15537 and #15813 to remove some missed code.

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/16135
---
 cpp/src/io/json/nested_json_gpu.cu | 125 -----------------------------
 1 file changed, 125 deletions(-)

diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu
index 031edfde4f6..a007754ef4f 100644
--- a/cpp/src/io/json/nested_json_gpu.cu
+++ b/cpp/src/io/json/nested_json_gpu.cu
@@ -2244,131 +2244,6 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> json_column_to
   return {};
 }
 
-table_with_metadata host_parse_nested_json(device_span<SymbolT const> d_input,
-                                           cudf::io::json_reader_options const& options,
-                                           rmm::cuda_stream_view stream,
-                                           rmm::device_async_resource_ref mr)
-{
-  // Range of orchestrating/encapsulating function
-  CUDF_FUNC_RANGE();
-
-  auto const h_input = cudf::detail::make_std_vector_async(d_input, stream);
-
-  auto const new_line_delimited_json = options.is_enabled_lines();
-
-  // Get internal JSON column
-  json_column root_column{};
-  std::stack<tree_node> data_path{};
-
-  constexpr uint32_t row_offset_zero            = 0;
-  constexpr uint32_t token_begin_offset_zero    = 0;
-  constexpr uint32_t token_end_offset_zero      = 0;
-  constexpr uint32_t node_init_child_count_zero = 0;
-
-  // Whether the tokenizer stage should keep quote characters for string values
-  // If the tokenizer keeps the quote characters, they may be stripped during type casting
-  constexpr bool include_quote_chars = true;
-
-  // We initialize the very root node and root column, which represent the JSON document being
-  // parsed. That root node is a list node and that root column is a list column. The column has the
-  // root node as its only row. The values parsed from the JSON input will be treated as follows:
-  // (1) For JSON lines: we expect to find a list of JSON values that all
-  // will be inserted into this root list column. (2) For regular JSON: we expect to have only a
-  // single value (list, struct, string, number, literal) that will be inserted into this root
-  // column.
-  root_column.append_row(
-    row_offset_zero, json_col_t::ListColumn, token_begin_offset_zero, token_end_offset_zero, 1);
-
-  // Push the root node onto the stack for the data path
-  data_path.push({&root_column, row_offset_zero, nullptr, node_init_child_count_zero});
-
-  make_json_column(
-    root_column, data_path, h_input, d_input, options, include_quote_chars, stream, mr);
-
-  // data_root refers to the root column of the data represented by the given JSON string
-  auto const& data_root =
-    new_line_delimited_json ? root_column : root_column.child_columns.begin()->second;
-
-  // Zero row entries
-  if (data_root.type == json_col_t::ListColumn && data_root.child_columns.empty()) {
-    return table_with_metadata{std::make_unique<table>(std::vector<std::unique_ptr<column>>{})};
-  }
-
-  // Verify that we were in fact given a list of structs (or in JSON speech: an array of objects)
-  auto constexpr single_child_col_count = 1;
-  CUDF_EXPECTS(data_root.type == json_col_t::ListColumn and
-                 data_root.child_columns.size() == single_child_col_count and
-                 data_root.child_columns.begin()->second.type == json_col_t::StructColumn,
-               "Currently the nested JSON parser only supports an array of (nested) objects");
-
-  // Slice off the root list column, which has only a single row that contains all the structs
-  auto const& root_struct_col = data_root.child_columns.begin()->second;
-
-  // Initialize meta data to be populated while recursing through the tree of columns
-  std::vector<std::unique_ptr<column>> out_columns;
-  std::vector<column_name_info> out_column_names;
-
-  // Iterate over the struct's child columns and convert to cudf column
-  size_type column_index = 0;
-  for (auto const& col_name : root_struct_col.column_order) {
-    auto const& json_col = root_struct_col.child_columns.find(col_name)->second;
-    // Insert this columns name into the schema
-    out_column_names.emplace_back(col_name);
-
-    std::optional<schema_element> child_schema_element = std::visit(
-      cudf::detail::visitor_overload{
-        [column_index](std::vector<data_type> const& user_dtypes) -> std::optional<schema_element> {
-          auto ret = (static_cast<std::size_t>(column_index) < user_dtypes.size())
-                       ? std::optional<schema_element>{{user_dtypes[column_index]}}
-                       : std::optional<schema_element>{};
-#ifdef NJP_DEBUG_PRINT
-          std::cout << "Column by index: #" << column_index << ", type id: "
-                    << (ret.has_value() ? std::to_string(static_cast<int>(ret->type.id())) : "n/a")
-                    << ", with " << (ret.has_value() ? ret->child_types.size() : 0) << " children"
-                    << "\n";
-#endif
-          return ret;
-        },
-        [col_name](
-          std::map<std::string, data_type> const& user_dtypes) -> std::optional<schema_element> {
-          auto ret = (user_dtypes.find(col_name) != std::end(user_dtypes))
-                       ? std::optional<schema_element>{{user_dtypes.find(col_name)->second}}
-                       : std::optional<schema_element>{};
-#ifdef NJP_DEBUG_PRINT
-          std::cout << "Column by flat name: '" << col_name << "', type id: "
-                    << (ret.has_value() ? std::to_string(static_cast<int>(ret->type.id())) : "n/a")
-                    << ", with " << (ret.has_value() ? ret->child_types.size() : 0) << " children"
-                    << "\n";
-#endif
-          return ret;
-        },
-        [col_name](std::map<std::string, schema_element> const& user_dtypes)
-          -> std::optional<schema_element> {
-          auto ret = (user_dtypes.find(col_name) != std::end(user_dtypes))
-                       ? user_dtypes.find(col_name)->second
-                       : std::optional<schema_element>{};
-#ifdef NJP_DEBUG_PRINT
-          std::cout << "Column by nested name: #" << col_name << ", type id: "
-                    << (ret.has_value() ? std::to_string(static_cast<int>(ret->type.id())) : "n/a")
-                    << ", with " << (ret.has_value() ? ret->child_types.size() : 0) << " children"
-                    << "\n";
-#endif
-          return ret;
-        }},
-      options.get_dtypes());
-
-    // Get this JSON column's cudf column and schema info
-    auto [cudf_col, col_name_info] =
-      json_column_to_cudf_column(json_col, d_input, options, child_schema_element, stream, mr);
-    out_column_names.back().children = std::move(col_name_info);
-    out_columns.emplace_back(std::move(cudf_col));
-
-    column_index++;
-  }
-
-  return table_with_metadata{std::make_unique<table>(std::move(out_columns)), {out_column_names}};
-}
-
 }  // namespace detail
 }  // namespace cudf::io::json
 

From 31ed9fd1eab1b2d4a5d0a839357ed53530daea97 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Tue, 2 Jul 2024 13:07:36 -0500
Subject: [PATCH 452/842] Use provided memory resource for allocating mixed
 join results. (#16153)

This PR fixes a few places where certain code paths for mixed joins are not using the user-provided memory resource.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Mark Harris (https://github.com/harrism)
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/16153
---
 cpp/src/join/mixed_join.cu      | 7 ++-----
 cpp/src/join/mixed_join_semi.cu | 4 +---
 2 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/cpp/src/join/mixed_join.cu b/cpp/src/join/mixed_join.cu
index 42e0e4f45ee..90748e6f322 100644
--- a/cpp/src/join/mixed_join.cu
+++ b/cpp/src/join/mixed_join.cu
@@ -82,9 +82,7 @@ mixed_join(
       // Left and full joins all return all the row indices from
       // left with a corresponding NULL from the right.
       case join_kind::LEFT_JOIN:
-      case join_kind::FULL_JOIN:
-        return get_trivial_left_join_indices(
-          left_conditional, stream, rmm::mr::get_current_device_resource());
+      case join_kind::FULL_JOIN: return get_trivial_left_join_indices(left_conditional, stream, mr);
       // Inner joins return empty output because no matches can exist.
       case join_kind::INNER_JOIN:
         return std::pair(std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr),
@@ -100,8 +98,7 @@ mixed_join(
                          std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr));
       // Full joins need to return the trivial complement.
       case join_kind::FULL_JOIN: {
-        auto ret_flipped = get_trivial_left_join_indices(
-          right_conditional, stream, rmm::mr::get_current_device_resource());
+        auto ret_flipped = get_trivial_left_join_indices(right_conditional, stream, mr);
         return std::pair(std::move(ret_flipped.second), std::move(ret_flipped.first));
       }
       default: CUDF_FAIL("Invalid join kind."); break;
diff --git a/cpp/src/join/mixed_join_semi.cu b/cpp/src/join/mixed_join_semi.cu
index 8500b248fcf..c147ea3c253 100644
--- a/cpp/src/join/mixed_join_semi.cu
+++ b/cpp/src/join/mixed_join_semi.cu
@@ -117,9 +117,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_join_semi(
       // Anti and semi return all the row indices from left
       // with a corresponding NULL from the right.
       case join_kind::LEFT_ANTI_JOIN:
-        return get_trivial_left_join_indices(
-                 left_conditional, stream, rmm::mr::get_current_device_resource())
-          .first;
+        return get_trivial_left_join_indices(left_conditional, stream, mr).first;
       // Inner and left semi joins return empty output because no matches can exist.
       case join_kind::LEFT_SEMI_JOIN:
         return std::make_unique<rmm::device_uvector<size_type>>(0, stream, mr);

From 3bd9975e867c9d2a077ed50fa339cecfd9bc8d9b Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 2 Jul 2024 15:20:03 -0400
Subject: [PATCH 453/842] Add compile option to enable large strings support
 (#16037)

Adds `CUDF_LARGE_STRINGS_DISABLED` compile-time option to disable large strings support.
The default is to now enable large strings support with this PR.

This changes the default behavior of the `LIBCUDF_LARGE_STRINGS_ENABLED` environment variable -- when the variable is not set. If the environment variable is not set, then the default behavior depends on the compile option.
If `CUDF_LARGE_STRINGS_DISABLED` is compiled `ON` then setting `LIBCUDF_LARGE_STRINGS_ENABLED=1` will turn it **on** at runtime.
If `CUDF_LARGE_STRINGS_DISABLED` is not compiled on then setting `LIBCUDF_LARGE_STRINGS_ENABLED=0` will turn it **off** at runtime.

This PR also sets `CUDF_LARGE_STRINGS_DISABLED=OFF` by default in the `build.sh`

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Karthikeyan (https://github.com/karthikeyann)
  - Robert Maynard (https://github.com/robertmaynard)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Jason Lowe (https://github.com/jlowe)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16037
---
 build.sh                              |  9 ++++++++-
 ci/test_java.sh                       |  3 +++
 cpp/CMakeLists.txt                    |  7 +++++++
 cpp/src/strings/utilities.cu          |  5 +++++
 python/cudf/cudf/tests/test_column.py | 11 -----------
 5 files changed, 23 insertions(+), 12 deletions(-)

diff --git a/build.sh b/build.sh
index 4291c88ea12..52bb1e64d16 100755
--- a/build.sh
+++ b/build.sh
@@ -17,7 +17,7 @@ ARGS=$*
 # script, and that this script resides in the repo dir!
 REPODIR=$(cd $(dirname $0); pwd)
 
-VALIDARGS="clean libcudf cudf cudfjar dask_cudf benchmarks tests libcudf_kafka cudf_kafka custreamz -v -g -n --pydevelop -l --allgpuarch --disable_nvtx --opensource_nvcomp  --show_depr_warn --ptds -h --build_metrics --incl_cache_stats"
+VALIDARGS="clean libcudf cudf cudfjar dask_cudf benchmarks tests libcudf_kafka cudf_kafka custreamz -v -g -n --pydevelop -l --allgpuarch --disable_nvtx --opensource_nvcomp  --show_depr_warn --ptds -h --build_metrics --incl_cache_stats --disable_large_strings"
 HELP="$0 [clean] [libcudf] [cudf] [cudfjar] [dask_cudf] [benchmarks] [tests] [libcudf_kafka] [cudf_kafka] [custreamz] [-v] [-g] [-n] [-h] [--cmake-args=\\\"<args>\\\"]
    clean                         - remove all existing build artifacts and configuration (start
                                    over)
@@ -39,6 +39,7 @@ HELP="$0 [clean] [libcudf] [cudf] [cudfjar] [dask_cudf] [benchmarks] [tests] [li
    --opensource_nvcomp           - disable use of proprietary nvcomp extensions
    --show_depr_warn              - show cmake deprecation warnings
    --ptds                        - enable per-thread default stream
+   --disable_large_strings       - disable large strings support
    --build_metrics               - generate build metrics report for libcudf
    --incl_cache_stats            - include cache statistics in build metrics report
    --cmake-args=\\\"<args>\\\"   - pass arbitrary list of CMake configuration options (escape all quotes in argument)
@@ -69,6 +70,7 @@ BUILD_DISABLE_DEPRECATION_WARNINGS=ON
 BUILD_PER_THREAD_DEFAULT_STREAM=OFF
 BUILD_REPORT_METRICS=OFF
 BUILD_REPORT_INCL_CACHE_STATS=OFF
+BUILD_DISABLE_LARGE_STRINGS=OFF
 USE_PROPRIETARY_NVCOMP=ON
 PYTHON_ARGS_FOR_INSTALL="-m pip install --no-build-isolation --no-deps --config-settings rapidsai.disable-cuda=true"
 
@@ -153,6 +155,7 @@ function buildLibCudfJniInDocker {
                 -DCUDF_ENABLE_ARROW_S3=OFF \
                 -DBUILD_TESTS=OFF \
                 -DCUDF_USE_PER_THREAD_DEFAULT_STREAM=ON \
+                -DCUDF_LARGE_STRINGS_DISABLED=ON \
                 -DRMM_LOGGING_LEVEL=OFF \
                 -DBUILD_SHARED_LIBS=OFF && \
              cmake --build . --parallel ${PARALLEL_LEVEL} && \
@@ -239,6 +242,9 @@ if [[ "${EXTRA_CMAKE_ARGS}" != *"DFIND_CUDF_CPP"* ]]; then
     EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS} -DFIND_CUDF_CPP=ON"
 fi
 
+if hasArg --disable_large_strings; then
+    BUILD_DISABLE_LARGE_STRINGS="ON"
+fi
 
 # If clean given, run it prior to any other steps
 if hasArg clean; then
@@ -292,6 +298,7 @@ if buildAll || hasArg libcudf; then
           -DBUILD_BENCHMARKS=${BUILD_BENCHMARKS} \
           -DDISABLE_DEPRECATION_WARNINGS=${BUILD_DISABLE_DEPRECATION_WARNINGS} \
           -DCUDF_USE_PER_THREAD_DEFAULT_STREAM=${BUILD_PER_THREAD_DEFAULT_STREAM} \
+          -DCUDF_LARGE_STRINGS_DISABLED=${BUILD_DISABLE_LARGE_STRINGS} \
           -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
           ${EXTRA_CMAKE_ARGS}
 
diff --git a/ci/test_java.sh b/ci/test_java.sh
index 9713eb192d2..629ad11014a 100755
--- a/ci/test_java.sh
+++ b/ci/test_java.sh
@@ -39,6 +39,9 @@ EXITCODE=0
 trap "EXITCODE=1" ERR
 set +e
 
+# disable large strings
+export LIBCUDF_LARGE_STRINGS_ENABLED=0
+
 rapids-logger "Run Java tests"
 pushd java
 mvn test -B -DCUDF_JNI_ENABLE_PROFILING=OFF
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 54070ab6f5a..2811711d58c 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -52,6 +52,8 @@ option(JITIFY_USE_CACHE "Use a file cache for JIT compiled kernels" ON)
 option(CUDF_BUILD_TESTUTIL "Whether to build the test utilities contained in libcudf" ON)
 mark_as_advanced(CUDF_BUILD_TESTUTIL)
 option(CUDF_USE_PROPRIETARY_NVCOMP "Download and use NVCOMP with proprietary extensions" ON)
+option(CUDF_LARGE_STRINGS_DISABLED "Build with large string support disabled" OFF)
+mark_as_advanced(CUDF_LARGE_STRINGS_DISABLED)
 option(CUDF_USE_ARROW_STATIC "Build and statically link Arrow libraries" OFF)
 option(CUDF_ENABLE_ARROW_ORC "Build the Arrow ORC adapter" OFF)
 option(CUDF_ENABLE_ARROW_PYTHON "Find (or build) Arrow with Python support" OFF)
@@ -783,6 +785,11 @@ if(NOT USE_NVTX)
   target_compile_definitions(cudf PUBLIC NVTX_DISABLE)
 endif()
 
+# Disable large strings support
+if(CUDF_LARGE_STRINGS_DISABLED)
+  target_compile_definitions(cudf PRIVATE CUDF_LARGE_STRINGS_DISABLED)
+endif()
+
 # Define RMM logging level
 target_compile_definitions(cudf PRIVATE "RMM_LOGGING_LEVEL=LIBCUDF_LOGGING_LEVEL")
 
diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu
index 101004a5d06..f70598f33be 100644
--- a/cpp/src/strings/utilities.cu
+++ b/cpp/src/strings/utilities.cu
@@ -158,8 +158,13 @@ int64_t get_offset64_threshold()
 
 bool is_large_strings_enabled()
 {
+  // default depends on compile-time switch but can be overridden by the environment variable
   auto const env = std::getenv("LIBCUDF_LARGE_STRINGS_ENABLED");
+#ifdef CUDF_LARGE_STRINGS_DISABLED
   return env != nullptr && std::string(env) == "1";
+#else
+  return env == nullptr || std::string(env) == "1";
+#endif
 }
 
 int64_t get_offset_value(cudf::column_view const& offsets,
diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py
index ea919c786b9..c288155112c 100644
--- a/python/cudf/cudf/tests/test_column.py
+++ b/python/cudf/cudf/tests/test_column.py
@@ -515,17 +515,6 @@ def test_build_series_from_nullable_pandas_dtype(pd_dtype, expect_dtype):
     np.testing.assert_array_equal(expect_mask, got_mask)
 
 
-def test_concatenate_large_column_strings():
-    num_strings = 1_000_000
-    string_scale_f = 100
-
-    s_1 = cudf.Series(["very long string " * string_scale_f] * num_strings)
-    s_2 = cudf.Series(["very long string " * string_scale_f] * num_strings)
-
-    with pytest.raises(OverflowError):
-        cudf.concat([s_1, s_2])
-
-
 @pytest.mark.parametrize(
     "alias,expect_dtype",
     [

From f534e2026a8437190be0b3ea441b1b622b72cef6 Mon Sep 17 00:00:00 2001
From: Robert Maynard <rmaynard@nvidia.com>
Date: Tue, 2 Jul 2024 16:20:03 -0400
Subject: [PATCH 454/842] cudf::merge public API now support passing a user
 stream (#16124)

Expands the `cudf::merge` function to support a user stream

Found as part of https://github.com/rapidsai/cudf/pull/15982 when building benchmarks

Authors:
  - Robert Maynard (https://github.com/robertmaynard)
  - Muhammad Haseeb (https://github.com/mhaseeb123)

Approvers:
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: https://github.com/rapidsai/cudf/pull/16124
---
 cpp/include/cudf/detail/merge.hpp |   1 +
 cpp/include/cudf/merge.hpp        |   3 +-
 cpp/src/merge/merge.cu            |   4 +-
 cpp/tests/CMakeLists.txt          |   1 +
 cpp/tests/streams/merge_test.cpp  | 137 ++++++++++++++++++++++++++++++
 5 files changed, 143 insertions(+), 3 deletions(-)
 create mode 100644 cpp/tests/streams/merge_test.cpp

diff --git a/cpp/include/cudf/detail/merge.hpp b/cpp/include/cudf/detail/merge.hpp
index 837eda0d7b5..56ac0554403 100644
--- a/cpp/include/cudf/detail/merge.hpp
+++ b/cpp/include/cudf/detail/merge.hpp
@@ -46,6 +46,7 @@ using index_vector = rmm::device_uvector<index_type>;
  *            std::vector<cudf::size_type> const& key_cols,
  *            std::vector<cudf::order> const& column_order,
  *            std::vector<cudf::null_order> const& null_precedence,
+ *            rmm::cuda_stream_view stream,
  *            rmm::device_async_resource_ref mr)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches
diff --git a/cpp/include/cudf/merge.hpp b/cpp/include/cudf/merge.hpp
index 29aa3ffe934..301e56c19b8 100644
--- a/cpp/include/cudf/merge.hpp
+++ b/cpp/include/cudf/merge.hpp
@@ -97,6 +97,7 @@ namespace cudf {
  * @param[in] column_order Sort order types of columns indexed by key_cols
  * @param[in] null_precedence Array indicating the order of nulls with respect
  * to non-nulls for the indexing columns (key_cols)
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned table's device memory
  *
  * @returns A table containing sorted data from all input tables
@@ -106,7 +107,7 @@ std::unique_ptr<cudf::table> merge(
   std::vector<cudf::size_type> const& key_cols,
   std::vector<cudf::order> const& column_order,
   std::vector<cudf::null_order> const& null_precedence = {},
+  rmm::cuda_stream_view stream                         = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr                    = rmm::mr::get_current_device_resource());
-
 /** @} */  // end of group
 }  // namespace cudf
diff --git a/cpp/src/merge/merge.cu b/cpp/src/merge/merge.cu
index 630cf328579..7ecaa0fba56 100644
--- a/cpp/src/merge/merge.cu
+++ b/cpp/src/merge/merge.cu
@@ -694,11 +694,11 @@ std::unique_ptr<cudf::table> merge(std::vector<table_view> const& tables_to_merg
                                    std::vector<cudf::size_type> const& key_cols,
                                    std::vector<cudf::order> const& column_order,
                                    std::vector<cudf::null_order> const& null_precedence,
+                                   rmm::cuda_stream_view stream,
                                    rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::merge(
-    tables_to_merge, key_cols, column_order, null_precedence, cudf::get_default_stream(), mr);
+  return detail::merge(tables_to_merge, key_cols, column_order, null_precedence, stream, mr);
 }
 
 }  // namespace cudf
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 0eab9ba61d8..8e2017ccb97 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -692,6 +692,7 @@ ConfigureTest(STREAM_INTEROP_TEST streams/interop_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_JSONIO_TEST streams/io/json_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_LABELING_BINS_TEST streams/labeling_bins_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_LISTS_TEST streams/lists_test.cpp STREAM_MODE testing)
+ConfigureTest(STREAM_MERGE_TEST streams/merge_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_MULTIBYTE_SPLIT_TEST streams/io/multibyte_split_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_NULL_MASK_TEST streams/null_mask_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_ORCIO_TEST streams/io/orc_test.cpp STREAM_MODE testing)
diff --git a/cpp/tests/streams/merge_test.cpp b/cpp/tests/streams/merge_test.cpp
new file mode 100644
index 00000000000..1dfe877878d
--- /dev/null
+++ b/cpp/tests/streams/merge_test.cpp
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_utilities.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/default_stream.hpp>
+#include <cudf_test/table_utilities.hpp>
+#include <cudf_test/testing_main.hpp>
+#include <cudf_test/type_lists.hpp>
+
+#include <cudf/merge.hpp>
+#include <cudf/sorting.hpp>
+#include <cudf/table/table.hpp>
+#include <cudf/types.hpp>
+
+#include <vector>
+
+template <typename T>
+class MergeTest_ : public cudf::test::BaseFixture {};
+
+TYPED_TEST_SUITE(MergeTest_, cudf::test::FixedWidthTypes);
+
+TYPED_TEST(MergeTest_, MergeIsZeroWhenShouldNotBeZero)
+{
+  using columnFactoryT = cudf::test::fixed_width_column_wrapper<TypeParam, int32_t>;
+
+  columnFactoryT leftColWrap1({1, 2, 3, 4, 5});
+  cudf::test::fixed_width_column_wrapper<TypeParam> rightColWrap1{};
+
+  std::vector<cudf::size_type> key_cols{0};
+  std::vector<cudf::order> column_order;
+  column_order.push_back(cudf::order::ASCENDING);
+  std::vector<cudf::null_order> null_precedence(column_order.size(), cudf::null_order::AFTER);
+
+  cudf::table_view left_view{{leftColWrap1}};
+  cudf::table_view right_view{{rightColWrap1}};
+  cudf::table_view expected{{leftColWrap1}};
+
+  auto result = cudf::merge({left_view, right_view},
+                            key_cols,
+                            column_order,
+                            null_precedence,
+                            cudf::test::get_default_stream());
+
+  int expected_len = 5;
+  ASSERT_EQ(result->num_rows(), expected_len);
+  CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result->view());
+}
+
+TYPED_TEST(MergeTest_, SingleTableInput)
+{
+  cudf::size_type inputRows = 40;
+
+  auto sequence = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i; });
+  cudf::test::fixed_width_column_wrapper<TypeParam, typename decltype(sequence)::value_type>
+    colWrap1(sequence, sequence + inputRows);
+
+  std::vector<cudf::size_type> key_cols{0};
+  std::vector<cudf::order> column_order{cudf::order::ASCENDING};
+  std::vector<cudf::null_order> null_precedence{};
+
+  cudf::table_view left_view{{colWrap1}};
+
+  std::unique_ptr<cudf::table> p_outputTable;
+  CUDF_EXPECT_NO_THROW(
+    p_outputTable = cudf::merge(
+      {left_view}, key_cols, column_order, null_precedence, cudf::test::get_default_stream()));
+
+  auto input_column_view{left_view.column(0)};
+  auto output_column_view{p_outputTable->view().column(0)};
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(input_column_view, output_column_view);
+}
+
+class MergeTest : public cudf::test::BaseFixture {};
+
+TEST_F(MergeTest, KeysWithNulls)
+{
+  cudf::size_type nrows = 13200;  // Ensures that thrust::merge uses more than one tile/block
+  auto data_iter        = thrust::make_counting_iterator<int32_t>(0);
+  auto valids1 =
+    cudf::detail::make_counting_transform_iterator(0, [](auto row) { return row % 10 != 0; });
+  cudf::test::fixed_width_column_wrapper<int32_t> data1(data_iter, data_iter + nrows, valids1);
+  auto valids2 =
+    cudf::detail::make_counting_transform_iterator(0, [](auto row) { return row % 15 != 0; });
+  cudf::test::fixed_width_column_wrapper<int32_t> data2(data_iter, data_iter + nrows, valids2);
+  auto all_data = cudf::concatenate(std::vector<cudf::column_view>{{data1, data2}},
+                                    cudf::test::get_default_stream());
+
+  std::vector<cudf::order> column_orders{cudf::order::ASCENDING, cudf::order::DESCENDING};
+  std::vector<cudf::null_order> null_precedences{cudf::null_order::AFTER, cudf::null_order::BEFORE};
+
+  for (auto co : column_orders)
+    for (auto np : null_precedences) {
+      std::vector<cudf::order> column_order{co};
+      std::vector<cudf::null_order> null_precedence{np};
+      auto sorted1 = cudf::sort(cudf::table_view({data1}),
+                                column_order,
+                                null_precedence,
+                                cudf::test::get_default_stream())
+                       ->release();
+      auto col1    = sorted1.front()->view();
+      auto sorted2 = cudf::sort(cudf::table_view({data2}),
+                                column_order,
+                                null_precedence,
+                                cudf::test::get_default_stream())
+                       ->release();
+      auto col2 = sorted2.front()->view();
+
+      auto result     = cudf::merge({cudf::table_view({col1}), cudf::table_view({col2})},
+                                    {0},
+                                column_order,
+                                null_precedence,
+                                cudf::test::get_default_stream());
+      auto sorted_all = cudf::sort(cudf::table_view({all_data->view()}),
+                                   column_order,
+                                   null_precedence,
+                                   cudf::test::get_default_stream());
+      CUDF_TEST_EXPECT_COLUMNS_EQUAL(sorted_all->view().column(0), result->view().column(0));
+    }
+}
+
+CUDF_TEST_PROGRAM_MAIN()

From 9b69d88866aca94b3a7eabbb2e6a82cce6f55e60 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 2 Jul 2024 18:00:30 -0400
Subject: [PATCH 455/842] Fix unused-return-value debug build error in
 from_arrow_stream_test.cpp (#16168)

Fixes a debug build error reporting an unused return value in `from_arrow_stream_test.cpp`
```
g++ -DFMT_HEADER_ONLY=1 -DLIBCUDACXX_ENABLE_EXPERIMENTAL_MEMORY_RESOURCE -DNANOARROW_DEBUG -DSPDLOG_ACTIVE_LEVEL=SPDLOG_LEVEL_INFO -DSPDLOG_FMT_EXTERNAL -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_CUDA -DTHRUST_DISABLE_ABI_NAMESPACE -DTHRUST_HOST_SYSTEM=THRUST_HOST_SYSTEM_CPP -DTHRUST_IGNORE_ABI_NAMESPACE_ERROR -I/cudf/cpp -I/cudf/cpp/src -I/cudf/cpp/build/_deps/dlpack-src/include -I/cudf/cpp/build/_deps/jitify-src -I/cudf/cpp/include -I/cudf/cpp/build/include -I/cudf/cpp/build/_deps/cccl-src/thrust/thrust/cmake/../.. -I/cudf/cpp/build/_deps/cccl-src/libcudacxx/lib/cmake/libcudacxx/../../../include -I/cudf/cpp/build/_deps/cccl-src/cub/cub/cmake/../.. -I/cudf/cpp/build/_deps/nanoarrow-src/src -I/cudf/cpp/build/_deps/nanoarrow-build/generated -isystem /cudf/cpp/build/_deps/gtest-src/googlemock/include -isystem /cudf/cpp/build/_deps/gtest-src/googlemock -isystem /cudf/cpp/build/_deps/gtest-src/googletest/include -isystem /cudf/cpp/build/_deps/gtest-src/googletest -fvisibility-inlines-hidden -fmessage-length=0 -march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /conda/envs/rapids/include -fdiagnostics-color=always  -I/conda/envs/rapids/targets/x86_64-linux/include  -L/conda/envs/rapids/targets/x86_64-linux/lib -L/conda/envs/rapids/targets/x86_64-linux/lib/stubs -g -std=gnu++17 -fPIE -Wall -Werror -Wno-unknown-pragmas -Wno-error=deprecated-declarations -pthread -MD -MT tests/CMakeFiles/INTEROP_TEST.dir/interop/from_arrow_stream_test.cpp.o -MF tests/CMakeFiles/INTEROP_TEST.dir/interop/from_arrow_stream_test.cpp.o.d -o tests/CMakeFiles/INTEROP_TEST.dir/interop/from_arrow_stream_test.cpp.o -c /cudf/cpp/tests/interop/from_arrow_stream_test.cpp
/cudf/cpp/tests/interop/from_arrow_stream_test.cpp: In static member function 'static int VectorOfArrays::get_schema(ArrowArrayStream*, ArrowSchema*)':
/cudf/cpp/tests/interop/from_arrow_stream_test.cpp:49:24: error: ignoring return value of 'ArrowErrorCode cudfArrowSchemaDeepCopy(const ArrowSchema*, ArrowSchema*)' declared with attribute 'warn_unused_result' [-Werror=unused-result]
   49 |     ArrowSchemaDeepCopy(private_data->schema.get(), out_schema);
cc1plus: all warnings being treated as errors

```
Adding a variable decorated with `[[maybe_unused]]` clears the error.
Error introduced in #15904

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Shruti Shivakumar (https://github.com/shrshi)

URL: https://github.com/rapidsai/cudf/pull/16168
---
 cpp/tests/interop/from_arrow_stream_test.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cpp/tests/interop/from_arrow_stream_test.cpp b/cpp/tests/interop/from_arrow_stream_test.cpp
index 418ec057303..80a2e4b2ffd 100644
--- a/cpp/tests/interop/from_arrow_stream_test.cpp
+++ b/cpp/tests/interop/from_arrow_stream_test.cpp
@@ -46,7 +46,8 @@ struct VectorOfArrays {
   static int get_schema(ArrowArrayStream* stream, ArrowSchema* out_schema)
   {
     auto private_data = static_cast<VectorOfArrays*>(stream->private_data);
-    ArrowSchemaDeepCopy(private_data->schema.get(), out_schema);
+
+    [[maybe_unused]] auto rc = ArrowSchemaDeepCopy(private_data->schema.get(), out_schema);
     return 0;
   }
 

From 25febbcade60d5eefb5568cdc036c845d29dc932 Mon Sep 17 00:00:00 2001
From: Jihoon Son <ghoonson@gmail.com>
Date: Tue, 2 Jul 2024 16:05:38 -0700
Subject: [PATCH 456/842] Add throughput metrics for
 REDUCTION_BENCH/REDUCTION_NVBENCH benchmarks (#16126)

This PR addresses https://github.com/rapidsai/cudf/issues/13735 for reduction benchmarks. There are 3 new utils added.

- `int64_t estimate_size(cudf::table_view)` returns a size estimate for the given table. https://github.com/rapidsai/cudf/pull/13984 was a previous attempt to add a similar utility, but this implementation uses `cudf::row_bit_count()` as suggested in https://github.com/rapidsai/cudf/pull/13984#issuecomment-2189916570 instead of manually estimating the size.
- `void set_items_processed(State& state, int64_t items_processed_per_iteration)` is a thin wrapper of `State.SetItemsProcessed()`. This wrapper takes `items_processed_per_iteration` as a parameter instead of `total_items_processed`. This could be useful to avoid repeating `State.iterations() * items_processed_per_iteration` in each benchmark class.
- `void set_throughputs(nvbench::state& state)` is added as a workaround for https://github.com/NVIDIA/nvbench/issues/175. We sometimes want to set throughput statistics after `state.exec()` calls especially when it is hard to estimate the result size upfront.

Here are snippets of reduction benchmarks after this change.

```
$ cpp/build/benchmarks/REDUCTION_BENCH
...
-----------------------------------------------------------------------------------------------------------------
Benchmark                                                       Time             CPU   Iterations UserCounters...
-----------------------------------------------------------------------------------------------------------------
Reduction/bool_all/10000/manual_time                        10257 ns        26845 ns        68185 bytes_per_second=929.907M/s items_per_second=975.078M/s
Reduction/bool_all/100000/manual_time                       11000 ns        27454 ns        63634 bytes_per_second=8.46642G/s items_per_second=9.09075G/s
Reduction/bool_all/1000000/manual_time                      12671 ns        28658 ns        55261 bytes_per_second=73.5018G/s items_per_second=78.922G/s
...

$ cpp/build/benchmarks/REDUCTION_NVBENCH
...
## rank_scan

### [0] NVIDIA RTX A5500

|        T        | null_probability | data_size | Samples |  CPU Time  | Noise  |  GPU Time  | Noise |  Elem/s  | GlobalMem BW |  BWUtil   |
|-----------------|------------------|-----------|---------|------------|--------|------------|-------|----------|--------------|-----------|
|             I32 |                0 |     10000 |  16992x |  33.544 us | 14.95% |  29.446 us | 5.58% |  82.321M |   5.596 TB/s |   728.54% |
|             I32 |              0.1 |     10000 |  16512x |  34.358 us | 13.66% |  30.292 us | 2.87% |  80.020M |   5.286 TB/s |   688.17% |
|             I32 |              0.5 |     10000 |  16736x |  34.058 us | 14.31% |  29.890 us | 3.40% |  81.097M |   5.430 TB/s |   706.89% |
...
```

Note that, when the data type is a 1-byte-width type in the google benchmark result summary, `bytes_per_second` appears to be smaller than `items_per_second`. This is because the former is a multiple of 1000 whereas the latter is a multiple of 1024. They are in fact the same number.

Implementation-wise, these are what I'm not sure if I made a best decision.
- Each of new utils above is declared and defined in different files. I did this because I could not find a good place to have them all, and they seem to belong to different utilities. Please let me know if there is a better place for them.
- All the new utils are defined in the global namespace since other util functions seem to have been defined in the same way. Please let me know if this is not the convention.

Authors:
  - Jihoon Son (https://github.com/jihoonson)

Approvers:
  - Mark Harris (https://github.com/harrism)
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/16126
---
 cpp/benchmarks/CMakeLists.txt                 |  9 ++-
 cpp/benchmarks/common/benchmark_utilities.cpp | 27 +++++++++
 cpp/benchmarks/common/benchmark_utilities.hpp | 41 +++++++++++++
 cpp/benchmarks/common/nvbench_utilities.cpp   | 60 +++++++++++++++++++
 cpp/benchmarks/common/nvbench_utilities.hpp   | 31 ++++++++++
 cpp/benchmarks/common/table_utilities.cpp     | 41 +++++++++++++
 cpp/benchmarks/common/table_utilities.hpp     | 41 +++++++++++++
 cpp/benchmarks/reduction/anyall.cpp           |  8 ++-
 cpp/benchmarks/reduction/dictionary.cpp       | 10 +++-
 cpp/benchmarks/reduction/minmax.cpp           | 13 +++-
 cpp/benchmarks/reduction/rank.cpp             | 13 +++-
 cpp/benchmarks/reduction/reduce.cpp           |  8 ++-
 cpp/benchmarks/reduction/scan.cpp             | 11 +++-
 cpp/benchmarks/reduction/scan_structs.cpp     | 16 ++++-
 14 files changed, 314 insertions(+), 15 deletions(-)
 create mode 100644 cpp/benchmarks/common/benchmark_utilities.cpp
 create mode 100644 cpp/benchmarks/common/benchmark_utilities.hpp
 create mode 100644 cpp/benchmarks/common/nvbench_utilities.cpp
 create mode 100644 cpp/benchmarks/common/nvbench_utilities.hpp
 create mode 100644 cpp/benchmarks/common/table_utilities.cpp
 create mode 100644 cpp/benchmarks/common/table_utilities.hpp

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 8a48126e195..a5b248135c1 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -40,8 +40,13 @@ target_include_directories(
 
 # Use an OBJECT library so we only compile these helper source files only once
 add_library(
-  cudf_benchmark_common OBJECT "${CUDF_SOURCE_DIR}/tests/utilities/random_seed.cpp"
-                               synchronization/synchronization.cpp io/cuio_common.cpp
+  cudf_benchmark_common OBJECT
+  "${CUDF_SOURCE_DIR}/tests/utilities/random_seed.cpp"
+  synchronization/synchronization.cpp
+  io/cuio_common.cpp
+  common/table_utilities.cpp
+  common/benchmark_utilities.cpp
+  common/nvbench_utilities.cpp
 )
 target_link_libraries(cudf_benchmark_common PRIVATE cudf_datagen $<TARGET_NAME_IF_EXISTS:conda_env>)
 add_custom_command(
diff --git a/cpp/benchmarks/common/benchmark_utilities.cpp b/cpp/benchmarks/common/benchmark_utilities.cpp
new file mode 100644
index 00000000000..0b9fc17e779
--- /dev/null
+++ b/cpp/benchmarks/common/benchmark_utilities.cpp
@@ -0,0 +1,27 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "benchmark_utilities.hpp"
+
+void set_items_processed(::benchmark::State& state, int64_t items_processed_per_iteration)
+{
+  state.SetItemsProcessed(state.iterations() * items_processed_per_iteration);
+}
+
+void set_bytes_processed(::benchmark::State& state, int64_t bytes_processed_per_iteration)
+{
+  state.SetBytesProcessed(state.iterations() * bytes_processed_per_iteration);
+}
diff --git a/cpp/benchmarks/common/benchmark_utilities.hpp b/cpp/benchmarks/common/benchmark_utilities.hpp
new file mode 100644
index 00000000000..c5c80e73674
--- /dev/null
+++ b/cpp/benchmarks/common/benchmark_utilities.hpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <benchmark/benchmark.h>
+
+/**
+ * @brief Sets the number of items processed during the benchmark.
+ *
+ * This function could be used instead of ::benchmark::State.SetItemsProcessed()
+ * to avoid repeatedly computing ::benchmark::State.iterations() * items_processed_per_iteration.
+ *
+ * @param state the benchmark state
+ * @param items_processed_per_iteration number of items processed per iteration
+ */
+void set_items_processed(::benchmark::State& state, int64_t items_processed_per_iteration);
+
+/**
+ * @brief Sets the number of bytes processed during the benchmark.
+ *
+ * This function could be used instead of ::benchmark::State.SetItemsProcessed()
+ * to avoid repeatedly computing ::benchmark::State.iterations() * bytes_processed_per_iteration.
+ *
+ * @param state the benchmark state
+ * @param bytes_processed_per_iteration number of bytes processed per iteration
+ */
+void set_bytes_processed(::benchmark::State& state, int64_t bytes_processed_per_iteration);
diff --git a/cpp/benchmarks/common/nvbench_utilities.cpp b/cpp/benchmarks/common/nvbench_utilities.cpp
new file mode 100644
index 00000000000..c740eaa52f4
--- /dev/null
+++ b/cpp/benchmarks/common/nvbench_utilities.cpp
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "nvbench_utilities.hpp"
+
+#include <nvbench/nvbench.cuh>
+
+// This function is copied over from
+// https://github.com/NVIDIA/nvbench/blob/a171514056e5d6a7f52a035dd6c812fa301d4f4f/nvbench/detail/measure_cold.cu#L190-L224.
+void set_throughputs(nvbench::state& state)
+{
+  double avg_cuda_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
+
+  if (const auto items = state.get_element_count(); items != 0) {
+    auto& summ = state.add_summary("nv/cold/bw/item_rate");
+    summ.set_string("name", "Elem/s");
+    summ.set_string("hint", "item_rate");
+    summ.set_string("description", "Number of input elements processed per second");
+    summ.set_float64("value", static_cast<double>(items) / avg_cuda_time);
+  }
+
+  if (const auto bytes = state.get_global_memory_rw_bytes(); bytes != 0) {
+    const auto avg_used_gmem_bw = static_cast<double>(bytes) / avg_cuda_time;
+    {
+      auto& summ = state.add_summary("nv/cold/bw/global/bytes_per_second");
+      summ.set_string("name", "GlobalMem BW");
+      summ.set_string("hint", "byte_rate");
+      summ.set_string("description",
+                      "Number of bytes read/written per second to the CUDA "
+                      "device's global memory");
+      summ.set_float64("value", avg_used_gmem_bw);
+    }
+
+    {
+      const auto peak_gmem_bw =
+        static_cast<double>(state.get_device()->get_global_memory_bus_bandwidth());
+
+      auto& summ = state.add_summary("nv/cold/bw/global/utilization");
+      summ.set_string("name", "BWUtil");
+      summ.set_string("hint", "percentage");
+      summ.set_string("description",
+                      "Global device memory utilization as a percentage of the "
+                      "device's peak bandwidth");
+      summ.set_float64("value", avg_used_gmem_bw / peak_gmem_bw);
+    }
+  }
+}
diff --git a/cpp/benchmarks/common/nvbench_utilities.hpp b/cpp/benchmarks/common/nvbench_utilities.hpp
new file mode 100644
index 00000000000..98d879efac5
--- /dev/null
+++ b/cpp/benchmarks/common/nvbench_utilities.hpp
@@ -0,0 +1,31 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+namespace nvbench {
+struct state;
+}
+
+/**
+ * @brief Sets throughput statistics, such as "Elem/s", "GlobalMem BW", and "BWUtil" for the
+ * nvbench results summary.
+ *
+ * This function could be used to work around a known issue that the throughput statistics
+ * should be added before the nvbench::state.exec() call, otherwise they will not be printed
+ * in the summary. See https://github.com/NVIDIA/nvbench/issues/175 for more details.
+ */
+void set_throughputs(nvbench::state& state);
diff --git a/cpp/benchmarks/common/table_utilities.cpp b/cpp/benchmarks/common/table_utilities.cpp
new file mode 100644
index 00000000000..a6fbdac9fb8
--- /dev/null
+++ b/cpp/benchmarks/common/table_utilities.cpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "table_utilities.hpp"
+
+#include <cudf/reduction.hpp>
+#include <cudf/transform.hpp>
+
+#include <cmath>
+
+int64_t estimate_size(cudf::column_view const& col)
+{
+  return estimate_size(cudf::table_view({col}));
+}
+
+int64_t estimate_size(cudf::table_view const& view)
+{
+  // Compute the size in bits for each row.
+  auto const row_sizes = cudf::row_bit_count(view);
+  // Accumulate the row sizes to compute a sum.
+  auto const agg = cudf::make_sum_aggregation<cudf::reduce_aggregation>();
+  cudf::data_type sum_dtype{cudf::type_id::INT64};
+  auto const total_size_scalar = cudf::reduce(*row_sizes, *agg, sum_dtype);
+  auto const total_size_in_bits =
+    static_cast<cudf::numeric_scalar<int64_t>*>(total_size_scalar.get())->value();
+  // Convert the size in bits to the size in bytes.
+  return static_cast<int64_t>(std::ceil(static_cast<double>(total_size_in_bits) / 8));
+}
diff --git a/cpp/benchmarks/common/table_utilities.hpp b/cpp/benchmarks/common/table_utilities.hpp
new file mode 100644
index 00000000000..04ee847d397
--- /dev/null
+++ b/cpp/benchmarks/common/table_utilities.hpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/table/table_view.hpp>
+
+/**
+ * @brief Estimates the column size in bytes.
+ *
+ * @remark As this function internally uses cudf::row_bit_count() to estimate each row size
+ * and accumulates them, the returned estimate may be an inexact approximation in some
+ * cases. See cudf::row_bit_count() for more details.
+ *
+ * @param view The column view to estimate its size
+ */
+int64_t estimate_size(cudf::column_view const& view);
+
+/**
+ * @brief Estimates the table size in bytes.
+ *
+ * @remark As this function internally uses cudf::row_bit_count() to estimate each row size
+ * and accumulates them, the returned estimate may be an inexact approximation in some
+ * cases. See cudf::row_bit_count() for more details.
+ *
+ * @param view The table view to estimate its size
+ */
+int64_t estimate_size(cudf::table_view const& view);
diff --git a/cpp/benchmarks/reduction/anyall.cpp b/cpp/benchmarks/reduction/anyall.cpp
index 8b1e71c1585..e9d23881764 100644
--- a/cpp/benchmarks/reduction/anyall.cpp
+++ b/cpp/benchmarks/reduction/anyall.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,9 @@
  * limitations under the License.
  */
 
+#include <benchmarks/common/benchmark_utilities.hpp>
 #include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/common/table_utilities.hpp>
 #include <benchmarks/fixture/benchmark_fixture.hpp>
 #include <benchmarks/synchronization/synchronization.hpp>
 
@@ -42,6 +44,10 @@ void BM_reduction_anyall(benchmark::State& state,
     cuda_event_timer timer(state, true);
     auto result = cudf::reduce(*values, *agg, output_dtype);
   }
+
+  // The benchmark takes a column and produces one scalar.
+  set_items_processed(state, column_size + 1);
+  set_bytes_processed(state, estimate_size(values->view()) + cudf::size_of(output_dtype));
 }
 
 #define concat(a, b, c) a##b##c
diff --git a/cpp/benchmarks/reduction/dictionary.cpp b/cpp/benchmarks/reduction/dictionary.cpp
index c1c44c919ac..5095337dbb3 100644
--- a/cpp/benchmarks/reduction/dictionary.cpp
+++ b/cpp/benchmarks/reduction/dictionary.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 
+#include <benchmarks/common/benchmark_utilities.hpp>
 #include <benchmarks/common/generate_input.hpp>
 #include <benchmarks/fixture/benchmark_fixture.hpp>
 #include <benchmarks/synchronization/synchronization.hpp>
@@ -52,6 +53,13 @@ void BM_reduction_dictionary(benchmark::State& state,
     cuda_event_timer timer(state, true);
     auto result = cudf::reduce(*values, *agg, output_dtype);
   }
+
+  // The benchmark takes a column and produces two scalars.
+  set_items_processed(state, column_size + 1);
+
+  // We don't set the metrics for the size read/written as row_bit_count() doesn't
+  // support the dictionary type yet (and so is estimate_size()).
+  // See https://github.com/rapidsai/cudf/issues/16121 for details.
 }
 
 #define concat(a, b, c) a##b##c
diff --git a/cpp/benchmarks/reduction/minmax.cpp b/cpp/benchmarks/reduction/minmax.cpp
index 963c26692e7..050f2887221 100644
--- a/cpp/benchmarks/reduction/minmax.cpp
+++ b/cpp/benchmarks/reduction/minmax.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,9 @@
  * limitations under the License.
  */
 
+#include <benchmarks/common/benchmark_utilities.hpp>
 #include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/common/table_utilities.hpp>
 #include <benchmarks/fixture/benchmark_fixture.hpp>
 #include <benchmarks/synchronization/synchronization.hpp>
 
@@ -28,14 +30,19 @@ template <typename type>
 void BM_reduction(benchmark::State& state)
 {
   cudf::size_type const column_size{(cudf::size_type)state.range(0)};
-  auto const dtype = cudf::type_to_id<type>();
+  auto const dtype_id = cudf::type_to_id<type>();
   auto const input_column =
-    create_random_column(dtype, row_count{column_size}, data_profile_builder().no_validity());
+    create_random_column(dtype_id, row_count{column_size}, data_profile_builder().no_validity());
 
   for (auto _ : state) {
     cuda_event_timer timer(state, true);
     auto result = cudf::minmax(*input_column);
   }
+
+  // The benchmark takes a column and produces two scalars.
+  set_items_processed(state, column_size + 2);
+  cudf::data_type dtype = cudf::data_type{dtype_id};
+  set_bytes_processed(state, estimate_size(input_column->view()) + 2 * cudf::size_of(dtype));
 }
 
 #define concat(a, b, c) a##b##c
diff --git a/cpp/benchmarks/reduction/rank.cpp b/cpp/benchmarks/reduction/rank.cpp
index e55f3b9e09f..14876c80d3e 100644
--- a/cpp/benchmarks/reduction/rank.cpp
+++ b/cpp/benchmarks/reduction/rank.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,6 +15,8 @@
  */
 
 #include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/common/nvbench_utilities.hpp>
+#include <benchmarks/common/table_utilities.hpp>
 
 #include <cudf/detail/scan.hpp>
 #include <cudf/filling.hpp>
@@ -39,11 +41,18 @@ static void nvbench_reduction_scan(nvbench::state& state, nvbench::type_list<typ
   auto const new_tbl = cudf::repeat(table->view(), 2);
   cudf::column_view input(new_tbl->view().column(0));
 
+  std::unique_ptr<cudf::column> result = nullptr;
   state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
     rmm::cuda_stream_view stream_view{launch.get_stream()};
-    auto result = cudf::detail::inclusive_dense_rank_scan(
+    result = cudf::detail::inclusive_dense_rank_scan(
       input, stream_view, rmm::mr::get_current_device_resource());
   });
+
+  state.add_element_count(input.size());
+  state.add_global_memory_reads(estimate_size(input));
+  state.add_global_memory_writes(estimate_size(result->view()));
+
+  set_throughputs(state);
 }
 
 using data_type = nvbench::type_list<int32_t, cudf::list_view>;
diff --git a/cpp/benchmarks/reduction/reduce.cpp b/cpp/benchmarks/reduction/reduce.cpp
index 5bd3e2e3bba..63c96f4fe9e 100644
--- a/cpp/benchmarks/reduction/reduce.cpp
+++ b/cpp/benchmarks/reduction/reduce.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,9 @@
  * limitations under the License.
  */
 
+#include <benchmarks/common/benchmark_utilities.hpp>
 #include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/common/table_utilities.hpp>
 #include <benchmarks/fixture/benchmark_fixture.hpp>
 #include <benchmarks/synchronization/synchronization.hpp>
 
@@ -46,6 +48,10 @@ void BM_reduction(benchmark::State& state, std::unique_ptr<cudf::reduce_aggregat
     cuda_event_timer timer(state, true);
     auto result = cudf::reduce(*input_column, *agg, output_dtype);
   }
+
+  // The benchmark takes a column and produces two scalars.
+  set_items_processed(state, column_size + 1);
+  set_bytes_processed(state, estimate_size(input_column->view()) + cudf::size_of(output_dtype));
 }
 
 #define concat(a, b, c) a##b##c
diff --git a/cpp/benchmarks/reduction/scan.cpp b/cpp/benchmarks/reduction/scan.cpp
index 8c9883ece9c..dc05aad9807 100644
--- a/cpp/benchmarks/reduction/scan.cpp
+++ b/cpp/benchmarks/reduction/scan.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -14,7 +14,9 @@
  * limitations under the License.
  */
 
+#include <benchmarks/common/benchmark_utilities.hpp>
 #include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/common/table_utilities.hpp>
 #include <benchmarks/fixture/benchmark_fixture.hpp>
 #include <benchmarks/synchronization/synchronization.hpp>
 
@@ -34,11 +36,16 @@ static void BM_reduction_scan(benchmark::State& state, bool include_nulls)
   auto const column = create_random_column(dtype, row_count{n_rows});
   if (!include_nulls) column->set_null_mask(rmm::device_buffer{}, 0);
 
+  std::unique_ptr<cudf::column> result = nullptr;
   for (auto _ : state) {
     cuda_event_timer timer(state, true);
-    auto result = cudf::scan(
+    result = cudf::scan(
       *column, *cudf::make_min_aggregation<cudf::scan_aggregation>(), cudf::scan_type::INCLUSIVE);
   }
+
+  // The benchmark takes a column and produces a new column of the same size as input.
+  set_items_processed(state, n_rows * 2);
+  set_bytes_processed(state, estimate_size(column->view()) + estimate_size(result->view()));
 }
 
 #define SCAN_BENCHMARK_DEFINE(name, type, nulls)                          \
diff --git a/cpp/benchmarks/reduction/scan_structs.cpp b/cpp/benchmarks/reduction/scan_structs.cpp
index ee97b54fbef..a781f75a314 100644
--- a/cpp/benchmarks/reduction/scan_structs.cpp
+++ b/cpp/benchmarks/reduction/scan_structs.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,6 +15,8 @@
  */
 
 #include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/common/nvbench_utilities.hpp>
+#include <benchmarks/common/table_utilities.hpp>
 
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/scan.hpp>
@@ -45,16 +47,24 @@ static void nvbench_structs_scan(nvbench::state& state)
   auto [null_mask, null_count] = create_random_null_mask(size, null_probability);
   auto const input             = cudf::make_structs_column(
     size, std::move(data_table->release()), null_count, std::move(null_mask));
+  auto input_view = input->view();
 
   auto const agg         = cudf::make_min_aggregation<cudf::scan_aggregation>();
   auto const null_policy = static_cast<cudf::null_policy>(state.get_int64("null_policy"));
   auto const stream      = cudf::get_default_stream();
 
   state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
+  std::unique_ptr<cudf::column> result = nullptr;
   state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
-    auto const result = cudf::detail::scan_inclusive(
-      *input, *agg, null_policy, stream, rmm::mr::get_current_device_resource());
+    result = cudf::detail::scan_inclusive(
+      input_view, *agg, null_policy, stream, rmm::mr::get_current_device_resource());
   });
+
+  state.add_element_count(input_view.size());
+  state.add_global_memory_reads(estimate_size(input_view));
+  state.add_global_memory_writes(estimate_size(result->view()));
+
+  set_throughputs(state);
 }
 
 NVBENCH_BENCH(nvbench_structs_scan)

From 3aedeeaaaa08bb99695bbbc34098a5660e4c94e0 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com>
Date: Wed, 3 Jul 2024 15:40:23 -0500
Subject: [PATCH 457/842] `cudf-polars` string slicing (#16082)

This PR plumbs the libcudf/pylibcudf `slice_strings` function through to cudf-polars. Depends on https://github.com/rapidsai/cudf/pull/15988

Authors:
  - https://github.com/brandon-b-miller
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/16082
---
 python/cudf_polars/cudf_polars/dsl/expr.py    | 36 +++++++++++++++
 .../tests/expressions/test_stringfunction.py  | 46 +++++++++++++++++++
 2 files changed, 82 insertions(+)

diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py
index fe859c8d958..cfc2947f8de 100644
--- a/python/cudf_polars/cudf_polars/dsl/expr.py
+++ b/python/cudf_polars/cudf_polars/dsl/expr.py
@@ -703,6 +703,7 @@ def _validate_input(self):
             pl_expr.StringFunction.EndsWith,
             pl_expr.StringFunction.StartsWith,
             pl_expr.StringFunction.Contains,
+            pl_expr.StringFunction.Slice,
         ):
             raise NotImplementedError(f"String function {self.name}")
         if self.name == pl_expr.StringFunction.Contains:
@@ -716,6 +717,11 @@ def _validate_input(self):
                     raise NotImplementedError(
                         "Regex contains only supports a scalar pattern"
                     )
+        elif self.name == pl_expr.StringFunction.Slice:
+            if not all(isinstance(child, Literal) for child in self.children[1:]):
+                raise NotImplementedError(
+                    "Slice only supports literal start and stop values"
+                )
 
     def do_evaluate(
         self,
@@ -744,6 +750,36 @@ def do_evaluate(
                 flags=plc.strings.regex_flags.RegexFlags.DEFAULT,
             )
             return Column(plc.strings.contains.contains_re(column.obj, prog))
+        elif self.name == pl_expr.StringFunction.Slice:
+            child, expr_offset, expr_length = self.children
+            assert isinstance(expr_offset, Literal)
+            assert isinstance(expr_length, Literal)
+
+            column = child.evaluate(df, context=context, mapping=mapping)
+            # libcudf slices via [start,stop).
+            # polars slices with offset + length where start == offset
+            # stop = start + length. Negative values for start look backward
+            # from the last element of the string. If the end index would be
+            # below zero, an empty string is returned.
+            # Do this maths on the host
+            start = expr_offset.value.as_py()
+            length = expr_length.value.as_py()
+
+            if length == 0:
+                stop = start
+            else:
+                # No length indicates a scan to the end
+                # The libcudf equivalent is a null stop
+                stop = start + length if length else None
+                if length and start < 0 and length >= -start:
+                    stop = None
+            return Column(
+                plc.strings.slice.slice_strings(
+                    column.obj,
+                    plc.interop.from_arrow(pa.scalar(start, type=pa.int32())),
+                    plc.interop.from_arrow(pa.scalar(stop, type=pa.int32())),
+                )
+            )
         columns = [
             child.evaluate(df, context=context, mapping=mapping)
             for child in self.children
diff --git a/python/cudf_polars/tests/expressions/test_stringfunction.py b/python/cudf_polars/tests/expressions/test_stringfunction.py
index 9729e765948..8cf65dd51ac 100644
--- a/python/cudf_polars/tests/expressions/test_stringfunction.py
+++ b/python/cudf_polars/tests/expressions/test_stringfunction.py
@@ -37,6 +37,30 @@ def ldf(with_nulls):
     return pl.LazyFrame({"a": a, "b": range(len(a))})
 
 
+slice_cases = [
+    (1, 3),
+    (0, 3),
+    (0, 0),
+    (-3, 1),
+    (-100, 5),
+    (1, 1),
+    (100, 100),
+    (-3, 4),
+    (-3, 3),
+]
+
+
+@pytest.fixture(params=slice_cases)
+def slice_column_data(ldf, request):
+    start, length = request.param
+    if length:
+        return ldf.with_columns(
+            pl.lit(start).alias("start"), pl.lit(length).alias("length")
+        )
+    else:
+        return ldf.with_columns(pl.lit(start).alias("start"))
+
+
 def test_supported_stringfunction_expression(ldf):
     query = ldf.select(
         pl.col("a").str.starts_with("Z"),
@@ -104,3 +128,25 @@ def test_contains_invalid(ldf):
         query.collect()
     with pytest.raises(pl.exceptions.ComputeError):
         query.collect(post_opt_callback=partial(execute_with_cudf, raise_on_fail=True))
+
+
+@pytest.mark.parametrize("offset", [1, -1, 0, 100, -100])
+def test_slice_scalars_offset(ldf, offset):
+    query = ldf.select(pl.col("a").str.slice(offset))
+    assert_gpu_result_equal(query)
+
+
+@pytest.mark.parametrize("offset,length", slice_cases)
+def test_slice_scalars_length_and_offset(ldf, offset, length):
+    query = ldf.select(pl.col("a").str.slice(offset, length))
+    assert_gpu_result_equal(query)
+
+
+def test_slice_column(slice_column_data):
+    if "length" in slice_column_data.collect_schema():
+        query = slice_column_data.select(
+            pl.col("a").str.slice(pl.col("start"), pl.col("length"))
+        )
+    else:
+        query = slice_column_data.select(pl.col("a").str.slice(pl.col("start")))
+    assert_ir_translation_raises(query, NotImplementedError)

From 39de5a2527b297ba79c625993a49b28c3baf5b00 Mon Sep 17 00:00:00 2001
From: Mark Harris <783069+harrism@users.noreply.github.com>
Date: Thu, 4 Jul 2024 06:49:06 +1000
Subject: [PATCH 458/842] Refactor from_arrow_device/host to use resource_ref
 (#16160)

Fixes #16159

Also fixes typos / leftovers in  dictionary `add_keys` copydocs.

Authors:
  - Mark Harris (https://github.com/harrism)

Approvers:
  - Paul Mattione (https://github.com/pmattione-nvidia)
  - David Wendt (https://github.com/davidwendt)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16160
---
 .../cudf/dictionary/detail/update_keys.hpp    | 10 +++----
 cpp/include/cudf/interop.hpp                  | 29 ++++++++++---------
 cpp/src/interop/from_arrow_device.cu          | 27 ++++++++---------
 cpp/src/interop/from_arrow_host.cu            | 19 ++++++------
 cpp/src/interop/from_arrow_stream.cu          |  6 ++--
 5 files changed, 47 insertions(+), 44 deletions(-)

diff --git a/cpp/include/cudf/dictionary/detail/update_keys.hpp b/cpp/include/cudf/dictionary/detail/update_keys.hpp
index e8486a80afc..9cdda773dbb 100644
--- a/cpp/include/cudf/dictionary/detail/update_keys.hpp
+++ b/cpp/include/cudf/dictionary/detail/update_keys.hpp
@@ -29,7 +29,7 @@ namespace dictionary {
 namespace detail {
 /**
  * @copydoc cudf::dictionary::add_keys(dictionary_column_view const&,column_view
- * const&,mm::mr::device_memory_resource*)
+ * const&,rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
@@ -40,7 +40,7 @@ std::unique_ptr<column> add_keys(dictionary_column_view const& dictionary_column
 
 /**
  * @copydoc cudf::dictionary::remove_keys(dictionary_column_view const&,column_view
- * const&,mm::mr::device_memory_resource*)
+ * const&,rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
@@ -51,7 +51,7 @@ std::unique_ptr<column> remove_keys(dictionary_column_view const& dictionary_col
 
 /**
  * @copydoc cudf::dictionary::remove_unused_keys(dictionary_column_view
- * const&,mm::mr::device_memory_resource*)
+ * const&,rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
@@ -61,7 +61,7 @@ std::unique_ptr<column> remove_unused_keys(dictionary_column_view const& diction
 
 /**
  * @copydoc cudf::dictionary::set_keys(dictionary_column_view
- * const&,mm::mr::device_memory_resource*)
+ * const&,rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
@@ -72,7 +72,7 @@ std::unique_ptr<column> set_keys(dictionary_column_view const& dictionary_column
 
 /**
  * @copydoc
- * cudf::dictionary::match_dictionaries(std::vector<cudf::dictionary_column_view>,mm::mr::device_memory_resource*)
+ * cudf::dictionary::match_dictionaries(std::vector<cudf::dictionary_column_view>,rmm::device_async_resource_ref)
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp
index 502ffb9ba4f..11f6ce2bad7 100644
--- a/cpp/include/cudf/interop.hpp
+++ b/cpp/include/cudf/interop.hpp
@@ -39,6 +39,7 @@
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <utility>
 
@@ -372,8 +373,8 @@ std::unique_ptr<cudf::scalar> from_arrow(
 std::unique_ptr<cudf::table> from_arrow(
   ArrowSchema const* schema,
   ArrowArray const* input,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Create `cudf::column` from a given ArrowArray and ArrowSchema input
@@ -391,8 +392,8 @@ std::unique_ptr<cudf::table> from_arrow(
 std::unique_ptr<cudf::column> from_arrow_column(
   ArrowSchema const* schema,
   ArrowArray const* input,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Create `cudf::table` from given ArrowDeviceArray input
@@ -415,8 +416,8 @@ std::unique_ptr<cudf::column> from_arrow_column(
 std::unique_ptr<table> from_arrow_host(
   ArrowSchema const* schema,
   ArrowDeviceArray const* input,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Create `cudf::table` from given ArrowArrayStream input
@@ -433,8 +434,8 @@ std::unique_ptr<table> from_arrow_host(
  */
 std::unique_ptr<table> from_arrow_stream(
   ArrowArrayStream* input,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Create `cudf::column` from given ArrowDeviceArray input
@@ -456,8 +457,8 @@ std::unique_ptr<table> from_arrow_stream(
 std::unique_ptr<column> from_arrow_host_column(
   ArrowSchema const* schema,
   ArrowDeviceArray const* input,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief typedef for a vector of owning columns, used for conversion from ArrowDeviceArray
@@ -537,8 +538,8 @@ using unique_table_view_t =
 unique_table_view_t from_arrow_device(
   ArrowSchema const* schema,
   ArrowDeviceArray const* input,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief typedef for a unique_ptr to a `cudf::column_view` with custom deleter
@@ -580,8 +581,8 @@ using unique_column_view_t =
 unique_column_view_t from_arrow_device_column(
   ArrowSchema const* schema,
   ArrowDeviceArray const* input,
-  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
 }  // namespace cudf
diff --git a/cpp/src/interop/from_arrow_device.cu b/cpp/src/interop/from_arrow_device.cu
index 73c1a474310..e1d289e67a3 100644
--- a/cpp/src/interop/from_arrow_device.cu
+++ b/cpp/src/interop/from_arrow_device.cu
@@ -35,6 +35,7 @@
 #include <rmm/cuda_device.hpp>
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <nanoarrow/nanoarrow.h>
 #include <nanoarrow/nanoarrow.hpp>
@@ -56,7 +57,7 @@ struct dispatch_from_arrow_device {
                               data_type,
                               bool,
                               rmm::cuda_stream_view,
-                              rmm::mr::device_memory_resource*)
+                              rmm::device_async_resource_ref)
   {
     CUDF_FAIL("Unsupported type in from_arrow_device", cudf::data_type_error);
   }
@@ -68,7 +69,7 @@ struct dispatch_from_arrow_device {
                               data_type type,
                               bool skip_mask,
                               rmm::cuda_stream_view,
-                              rmm::mr::device_memory_resource*)
+                              rmm::device_async_resource_ref mr)
   {
     size_type const num_rows   = input->length;
     size_type const offset     = input->offset;
@@ -90,7 +91,7 @@ dispatch_tuple_t get_column(ArrowSchemaView* schema,
                             data_type type,
                             bool skip_mask,
                             rmm::cuda_stream_view stream,
-                            rmm::mr::device_memory_resource* mr);
+                            rmm::device_async_resource_ref mr);
 
 template <>
 dispatch_tuple_t dispatch_from_arrow_device::operator()<bool>(ArrowSchemaView* schema,
@@ -98,7 +99,7 @@ dispatch_tuple_t dispatch_from_arrow_device::operator()<bool>(ArrowSchemaView* s
                                                               data_type type,
                                                               bool skip_mask,
                                                               rmm::cuda_stream_view stream,
-                                                              rmm::mr::device_memory_resource* mr)
+                                                              rmm::device_async_resource_ref mr)
 {
   if (input->length == 0) {
     return std::make_tuple<column_view, owned_columns_t>(
@@ -141,7 +142,7 @@ dispatch_tuple_t dispatch_from_arrow_device::operator()<cudf::string_view>(
   data_type type,
   bool skip_mask,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(schema->type != NANOARROW_TYPE_LARGE_STRING,
                "Large strings are not yet supported in from_arrow_device",
@@ -182,7 +183,7 @@ dispatch_tuple_t dispatch_from_arrow_device::operator()<cudf::dictionary32>(
   data_type type,
   bool skip_mask,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   ArrowSchemaView keys_schema_view;
   NANOARROW_THROW_NOT_OK(
@@ -238,7 +239,7 @@ dispatch_tuple_t dispatch_from_arrow_device::operator()<cudf::struct_view>(
   data_type type,
   bool skip_mask,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   std::vector<column_view> children;
   owned_columns_t out_owned_cols;
@@ -283,7 +284,7 @@ dispatch_tuple_t dispatch_from_arrow_device::operator()<cudf::list_view>(
   data_type type,
   bool skip_mask,
   rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   size_type const num_rows   = input->length;
   size_type const offset     = input->offset;
@@ -324,7 +325,7 @@ dispatch_tuple_t get_column(ArrowSchemaView* schema,
                             data_type type,
                             bool skip_mask,
                             rmm::cuda_stream_view stream,
-                            rmm::mr::device_memory_resource* mr)
+                            rmm::device_async_resource_ref mr)
 {
   return type.id() != type_id::EMPTY
            ? std::move(type_dispatcher(
@@ -342,7 +343,7 @@ dispatch_tuple_t get_column(ArrowSchemaView* schema,
 unique_table_view_t from_arrow_device(ArrowSchema const* schema,
                                       ArrowDeviceArray const* input,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr)
+                                      rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(schema != nullptr && input != nullptr,
                "input ArrowSchema and ArrowDeviceArray must not be NULL",
@@ -397,7 +398,7 @@ unique_table_view_t from_arrow_device(ArrowSchema const* schema,
 unique_column_view_t from_arrow_device_column(ArrowSchema const* schema,
                                               ArrowDeviceArray const* input,
                                               rmm::cuda_stream_view stream,
-                                              rmm::mr::device_memory_resource* mr)
+                                              rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(schema != nullptr && input != nullptr,
                "input ArrowSchema and ArrowDeviceArray must not be NULL",
@@ -429,7 +430,7 @@ unique_column_view_t from_arrow_device_column(ArrowSchema const* schema,
 unique_table_view_t from_arrow_device(ArrowSchema const* schema,
                                       ArrowDeviceArray const* input,
                                       rmm::cuda_stream_view stream,
-                                      rmm::mr::device_memory_resource* mr)
+                                      rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
 
@@ -439,7 +440,7 @@ unique_table_view_t from_arrow_device(ArrowSchema const* schema,
 unique_column_view_t from_arrow_device_column(ArrowSchema const* schema,
                                               ArrowDeviceArray const* input,
                                               rmm::cuda_stream_view stream,
-                                              rmm::mr::device_memory_resource* mr)
+                                              rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
 
diff --git a/cpp/src/interop/from_arrow_host.cu b/cpp/src/interop/from_arrow_host.cu
index b7e07056686..b3087dedf98 100644
--- a/cpp/src/interop/from_arrow_host.cu
+++ b/cpp/src/interop/from_arrow_host.cu
@@ -38,6 +38,7 @@
 #include <rmm/cuda_device.hpp>
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <nanoarrow/nanoarrow.h>
 #include <nanoarrow/nanoarrow.hpp>
@@ -49,7 +50,7 @@ namespace {
 
 struct dispatch_copy_from_arrow_host {
   rmm::cuda_stream_view stream;
-  rmm::mr::device_memory_resource* mr;
+  rmm::device_async_resource_ref mr;
 
   std::unique_ptr<rmm::device_buffer> get_mask_buffer(ArrowArray const* array)
   {
@@ -131,7 +132,7 @@ std::unique_ptr<column> get_column_copy(ArrowSchemaView* schema,
                                         data_type type,
                                         bool skip_mask,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr);
+                                        rmm::device_async_resource_ref mr);
 
 template <>
 std::unique_ptr<column> dispatch_copy_from_arrow_host::operator()<bool>(ArrowSchemaView* schema,
@@ -388,7 +389,7 @@ std::unique_ptr<column> get_column_copy(ArrowSchemaView* schema,
                                         data_type type,
                                         bool skip_mask,
                                         rmm::cuda_stream_view stream,
-                                        rmm::mr::device_memory_resource* mr)
+                                        rmm::device_async_resource_ref mr)
 {
   return type.id() != type_id::EMPTY
            ? std::move(type_dispatcher(
@@ -405,7 +406,7 @@ std::unique_ptr<column> get_column_copy(ArrowSchemaView* schema,
 std::unique_ptr<table> from_arrow_host(ArrowSchema const* schema,
                                        ArrowDeviceArray const* input,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr)
+                                       rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(schema != nullptr && input != nullptr,
                "input ArrowSchema and ArrowDeviceArray must not be NULL",
@@ -441,7 +442,7 @@ std::unique_ptr<table> from_arrow_host(ArrowSchema const* schema,
 std::unique_ptr<column> from_arrow_host_column(ArrowSchema const* schema,
                                                ArrowDeviceArray const* input,
                                                rmm::cuda_stream_view stream,
-                                               rmm::mr::device_memory_resource* mr)
+                                               rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(schema != nullptr && input != nullptr,
                "input ArrowSchema and ArrowDeviceArray must not be NULL",
@@ -462,7 +463,7 @@ std::unique_ptr<column> from_arrow_host_column(ArrowSchema const* schema,
 std::unique_ptr<table> from_arrow_host(ArrowSchema const* schema,
                                        ArrowDeviceArray const* input,
                                        rmm::cuda_stream_view stream,
-                                       rmm::mr::device_memory_resource* mr)
+                                       rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
 
@@ -472,7 +473,7 @@ std::unique_ptr<table> from_arrow_host(ArrowSchema const* schema,
 std::unique_ptr<column> from_arrow_host_column(ArrowSchema const* schema,
                                                ArrowDeviceArray const* input,
                                                rmm::cuda_stream_view stream,
-                                               rmm::mr::device_memory_resource* mr)
+                                               rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
 
@@ -482,7 +483,7 @@ std::unique_ptr<column> from_arrow_host_column(ArrowSchema const* schema,
 std::unique_ptr<table> from_arrow(ArrowSchema const* schema,
                                   ArrowArray const* input,
                                   rmm::cuda_stream_view stream,
-                                  rmm::mr::device_memory_resource* mr)
+                                  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
 
@@ -497,7 +498,7 @@ std::unique_ptr<table> from_arrow(ArrowSchema const* schema,
 std::unique_ptr<column> from_arrow_column(ArrowSchema const* schema,
                                           ArrowArray const* input,
                                           rmm::cuda_stream_view stream,
-                                          rmm::mr::device_memory_resource* mr)
+                                          rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
 
diff --git a/cpp/src/interop/from_arrow_stream.cu b/cpp/src/interop/from_arrow_stream.cu
index 0c85b561944..578105aa90a 100644
--- a/cpp/src/interop/from_arrow_stream.cu
+++ b/cpp/src/interop/from_arrow_stream.cu
@@ -41,7 +41,7 @@ namespace {
 
 std::unique_ptr<column> make_empty_column_from_schema(ArrowSchema const* schema,
                                                       rmm::cuda_stream_view stream,
-                                                      rmm::mr::device_memory_resource* mr)
+                                                      rmm::device_async_resource_ref mr)
 {
   ArrowSchemaView schema_view;
   NANOARROW_THROW_NOT_OK(ArrowSchemaViewInit(&schema_view, schema, nullptr));
@@ -81,7 +81,7 @@ std::unique_ptr<column> make_empty_column_from_schema(ArrowSchema const* schema,
 
 std::unique_ptr<table> from_arrow_stream(ArrowArrayStream* input,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr)
+                                         rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(input != nullptr, "input ArrowArrayStream must not be NULL", std::invalid_argument);
 
@@ -135,7 +135,7 @@ std::unique_ptr<table> from_arrow_stream(ArrowArrayStream* input,
 
 std::unique_ptr<table> from_arrow_stream(ArrowArrayStream* input,
                                          rmm::cuda_stream_view stream,
-                                         rmm::mr::device_memory_resource* mr)
+                                         rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::from_arrow_stream(input, stream, mr);

From dab6a447ca418073ec50c4e95aee5f0448fc95c2 Mon Sep 17 00:00:00 2001
From: Paul Taylor <178183+trxcllnt@users.noreply.github.com>
Date: Wed, 3 Jul 2024 15:30:24 -0700
Subject: [PATCH 459/842] Add environment-agnostic
 `ci/run_cudf_polars_pytest.sh` (#16178)

Adds environment-agnostic `ci/run_cudf_polars_pytest.sh` script, similar to the scripts added in https://github.com/rapidsai/cudf/pull/14992.

Authors:
  - Paul Taylor (https://github.com/trxcllnt)

Approvers:
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/cudf/pull/16178
---
 ci/run_cudf_polars_pytests.sh | 11 +++++++++++
 ci/test_cudf_polars.sh        |  6 ++----
 2 files changed, 13 insertions(+), 4 deletions(-)
 create mode 100755 ci/run_cudf_polars_pytests.sh

diff --git a/ci/run_cudf_polars_pytests.sh b/ci/run_cudf_polars_pytests.sh
new file mode 100755
index 00000000000..78683b057a5
--- /dev/null
+++ b/ci/run_cudf_polars_pytests.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+set -euo pipefail
+
+# It is essential to cd into python/cudf_polars as `pytest-xdist` + `coverage` seem to work only at this directory level.
+
+# Support invoking run_cudf_polars_pytests.sh outside the script directory
+cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cudf_polars/
+
+pytest --cache-clear "$@" tests
diff --git a/ci/test_cudf_polars.sh b/ci/test_cudf_polars.sh
index 95fb4b431bf..ca98c4dadb3 100755
--- a/ci/test_cudf_polars.sh
+++ b/ci/test_cudf_polars.sh
@@ -42,13 +42,11 @@ EXITCODE=0
 trap set_exitcode ERR
 set +e
 
-python -m pytest \
-       --cache-clear \
+./ci/run_cudf_polars_pytests.sh \
        --cov cudf_polars \
        --cov-fail-under=100 \
        --cov-config=python/cudf_polars/pyproject.toml \
-       --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf_polars.xml" \
-       python/cudf_polars/tests
+       --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf_polars.xml"
 
 trap ERR
 set -e

From 769e94ffcebaabe33ddec4ab8f178f6d1c7545aa Mon Sep 17 00:00:00 2001
From: Paul Taylor <178183+trxcllnt@users.noreply.github.com>
Date: Wed, 3 Jul 2024 15:31:28 -0700
Subject: [PATCH 460/842] Make `test_python_cudf_pandas` generate
 `requirements.txt` (#16181)

Authors:
  - Paul Taylor (https://github.com/trxcllnt)

Approvers:
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/cudf/pull/16181
---
 dependencies.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dependencies.yaml b/dependencies.yaml
index e3f8a72e76c..6d4ba0c38d1 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -755,7 +755,7 @@ dependencies:
           - {matrix: null, packages: *cupy_packages_cu11}
   test_python_pandas_cudf:
     common:
-      - output_types: pyproject
+      - output_types: [requirements, pyproject]
         packages:
           # dependencies to run pandas tests
           # https://github.com/pandas-dev/pandas/blob/main/environment.yml
@@ -766,7 +766,7 @@ dependencies:
           - pytest-reportlog
   test_python_cudf_pandas:
     common:
-      - output_types: pyproject
+      - output_types: [requirements, pyproject]
         packages:
           - ipython
           - openpyxl

From aa4033c5fe0be9e3d235d5722f1030c60b04e34d Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Thu, 4 Jul 2024 10:10:02 +0100
Subject: [PATCH 461/842] Cast count aggs to correct dtype in translation
 (#16192)

Polars default dtypes for some aggregations, particularly count, don't match ours, so insert casts.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/16192
---
 python/cudf_polars/cudf_polars/dsl/translate.py | 17 +++++++++++++----
 python/cudf_polars/tests/test_groupby.py        |  5 +----
 2 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py
index a2fdb3c3d79..0019b3aa98a 100644
--- a/python/cudf_polars/cudf_polars/dsl/translate.py
+++ b/python/cudf_polars/cudf_polars/dsl/translate.py
@@ -432,8 +432,11 @@ def _(node: pl_expr.Cast, visitor: NodeTraverser, dtype: plc.DataType) -> expr.E
     # Push casts into literals so we can handle Cast(Literal(Null))
     if isinstance(inner, expr.Literal):
         return expr.Literal(dtype, inner.value.cast(plc.interop.to_arrow(dtype)))
-    else:
-        return expr.Cast(dtype, inner)
+    elif isinstance(inner, expr.Cast):
+        # Translation of Len/Count-agg put in a cast, remove double
+        # casts if we have one.
+        (inner,) = inner.children
+    return expr.Cast(dtype, inner)
 
 
 @_translate_expr.register
@@ -443,12 +446,15 @@ def _(node: pl_expr.Column, visitor: NodeTraverser, dtype: plc.DataType) -> expr
 
 @_translate_expr.register
 def _(node: pl_expr.Agg, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
-    return expr.Agg(
+    value = expr.Agg(
         dtype,
         node.name,
         node.options,
         *(translate_expr(visitor, n=n) for n in node.arguments),
     )
+    if value.name == "count" and value.dtype.id() != plc.TypeId.INT32:
+        return expr.Cast(value.dtype, value)
+    return value
 
 
 @_translate_expr.register
@@ -475,7 +481,10 @@ def _(
 
 @_translate_expr.register
 def _(node: pl_expr.Len, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
-    return expr.Len(dtype)
+    value = expr.Len(dtype)
+    if dtype.id() != plc.TypeId.INT32:
+        return expr.Cast(dtype, value)
+    return value  # pragma: no cover; never reached since polars len has uint32 dtype
 
 
 def translate_expr(visitor: NodeTraverser, *, n: int) -> expr.Expr:
diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py
index aefad59eb91..8a6732b7063 100644
--- a/python/cudf_polars/tests/test_groupby.py
+++ b/python/cudf_polars/tests/test_groupby.py
@@ -83,10 +83,7 @@ def test_groupby(df: pl.LazyFrame, maintain_order, keys, exprs):
 def test_groupby_len(df, keys):
     q = df.group_by(*keys).agg(pl.len())
 
-    # TODO: polars returns UInt32, libcudf returns Int32
-    with pytest.raises(AssertionError):
-        assert_gpu_result_equal(q, check_row_order=False)
-    assert_gpu_result_equal(q, check_dtypes=False, check_row_order=False)
+    assert_gpu_result_equal(q, check_row_order=False)
 
 
 @pytest.mark.parametrize(

From 5f57bc9034311f5461981644dec86c9c2e3434c7 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Thu, 4 Jul 2024 11:55:36 +0100
Subject: [PATCH 462/842] Some small fixes in cudf-polars (#16191)

These catch a few more edge cases.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/16191
---
 python/cudf_polars/cudf_polars/callback.py          | 13 +++++++++++--
 .../cudf_polars/cudf_polars/containers/dataframe.py |  6 +++++-
 python/cudf_polars/cudf_polars/dsl/ir.py            |  2 ++
 python/cudf_polars/tests/test_union.py              |  9 +++++++++
 4 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py
index 979087d5273..764cdd3b3ca 100644
--- a/python/cudf_polars/cudf_polars/callback.py
+++ b/python/cudf_polars/cudf_polars/callback.py
@@ -34,7 +34,12 @@ def _callback(
         return ir.evaluate(cache={}).to_polars()
 
 
-def execute_with_cudf(nt: NodeTraverser, *, raise_on_fail: bool = False) -> None:
+def execute_with_cudf(
+    nt: NodeTraverser,
+    *,
+    raise_on_fail: bool = False,
+    exception: type[Exception] | tuple[type[Exception], ...] = Exception,
+) -> None:
     """
     A post optimization callback that attempts to execute the plan with cudf.
 
@@ -47,11 +52,15 @@ def execute_with_cudf(nt: NodeTraverser, *, raise_on_fail: bool = False) -> None
         Should conversion raise an exception rather than continuing
         without setting a callback.
 
+    exception
+        Optional exception, or tuple of exceptions, to catch during
+        translation. Defaults to ``Exception``.
+
     The NodeTraverser is mutated if the libcudf executor can handle the plan.
     """
     try:
         with nvtx.annotate(message="ConvertIR", domain="cudf_polars"):
             nt.set_udf(partial(_callback, translate_ir(nt)))
-    except NotImplementedError:
+    except exception:
         if raise_on_fail:
             raise
diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py
index ec8d00c3123..d86656578d7 100644
--- a/python/cudf_polars/cudf_polars/containers/dataframe.py
+++ b/python/cudf_polars/cudf_polars/containers/dataframe.py
@@ -5,6 +5,7 @@
 
 from __future__ import annotations
 
+import itertools
 from functools import cached_property
 from typing import TYPE_CHECKING, cast
 
@@ -160,7 +161,10 @@ def with_columns(self, columns: Sequence[NamedColumn]) -> Self:
         -----
         If column names overlap, newer names replace older ones.
         """
-        return type(self)([*self.columns, *columns])
+        columns = list(
+            {c.name: c for c in itertools.chain(self.columns, columns)}.values()
+        )
+        return type(self)(columns)
 
     def discard_columns(self, names: Set[str]) -> Self:
         """Drop columns by name."""
diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py
index 9b3096becd4..31a0be004ea 100644
--- a/python/cudf_polars/cudf_polars/dsl/ir.py
+++ b/python/cudf_polars/cudf_polars/dsl/ir.py
@@ -96,6 +96,8 @@ def broadcast(
     ``target_length`` is provided and not all columns are length-1
     (i.e. ``n != 1``), then ``target_length`` must be equal to ``n``.
     """
+    if len(columns) == 0:
+        return []
     lengths: set[int] = {column.obj.size() for column in columns}
     if lengths == {1}:
         if target_length is None:
diff --git a/python/cudf_polars/tests/test_union.py b/python/cudf_polars/tests/test_union.py
index b021d832910..865b95a7d91 100644
--- a/python/cudf_polars/tests/test_union.py
+++ b/python/cudf_polars/tests/test_union.py
@@ -46,3 +46,12 @@ def test_concat_vertical():
     q = pl.concat([ldf, ldf2], how="vertical")
 
     assert_gpu_result_equal(q)
+
+
+def test_concat_diagonal_empty():
+    df1 = pl.LazyFrame()
+    df2 = pl.LazyFrame({"a": [1, 2]})
+
+    q = pl.concat([df1, df2], how="diagonal_relaxed")
+
+    assert_gpu_result_equal(q, collect_kwargs={"no_optimization": True})

From c1c62f1c02cf3929fb7536d67d14a24a9e2950ea Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 4 Jul 2024 04:31:06 -1000
Subject: [PATCH 463/842] Fix `memory_usage` when calculating nested list
 column (#16193)

The offset column of a nested empty list column may be empty as discussed in https://github.com/rapidsai/cudf/issues/16164. `ListColumn.memory_usage` assumed that this column was non-empty

Unblocks https://github.com/rapidsai/cuspatial/pull/1400

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/16193
---
 python/cudf/cudf/core/column/lists.py | 11 ++++++++---
 python/cudf/cudf/tests/test_list.py   | 27 +++++++++++++++++++++++++++
 2 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py
index c548db67344..1992d471947 100644
--- a/python/cudf/cudf/core/column/lists.py
+++ b/python/cudf/cudf/core/column/lists.py
@@ -73,10 +73,15 @@ def memory_usage(self):
             child0_size = (
                 current_base_child.size + 1 - current_offset
             ) * current_base_child.base_children[0].dtype.itemsize
-            current_offset = current_base_child.base_children[
-                0
-            ].element_indexing(current_offset)
             n += child0_size
+            current_offset_col = current_base_child.base_children[0]
+            if not len(current_offset_col):
+                # See https://github.com/rapidsai/cudf/issues/16164 why
+                # offset column can be uninitialized
+                break
+            current_offset = current_offset_col.element_indexing(
+                current_offset
+            )
             current_base_child = current_base_child.base_children[1]
 
         n += (
diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py
index f76143cb381..ec9d7995b05 100644
--- a/python/cudf/cudf/tests/test_list.py
+++ b/python/cudf/cudf/tests/test_list.py
@@ -12,6 +12,7 @@
 from cudf import NA
 from cudf._lib.copying import get_element
 from cudf.api.types import is_scalar
+from cudf.core.column.column import column_empty
 from cudf.testing import assert_eq
 from cudf.testing._utils import DATETIME_TYPES, NUMERIC_TYPES, TIMEDELTA_TYPES
 
@@ -926,3 +927,29 @@ def test_list_iterate_error():
 def test_list_struct_list_memory_usage():
     df = cudf.DataFrame({"a": [[{"b": [1]}]]})
     assert df.memory_usage().sum() == 16
+
+
+def test_empty_nested_list_uninitialized_offsets_memory_usage():
+    col = column_empty(0, cudf.ListDtype(cudf.ListDtype("int64")))
+    nested_col = col.children[1]
+    empty_inner = type(nested_col)(
+        size=nested_col.size,
+        dtype=nested_col.dtype,
+        mask=nested_col.mask,
+        offset=nested_col.offset,
+        null_count=nested_col.null_count,
+        children=(
+            column_empty(0, nested_col.children[0].dtype),
+            nested_col.children[1],
+        ),
+    )
+    col_empty_offset = type(col)(
+        size=col.size,
+        dtype=col.dtype,
+        mask=col.mask,
+        offset=col.offset,
+        null_count=col.null_count,
+        children=(column_empty(0, col.children[0].dtype), empty_inner),
+    )
+    ser = cudf.Series._from_data({None: col_empty_offset})
+    assert ser.memory_usage() == 8

From f3a1216bb9bac07667b05cef01fe007fe6dc52ce Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Thu, 4 Jul 2024 12:49:10 -0400
Subject: [PATCH 464/842] Migrate lists/modifying to pylibcudf (#16185)

Apart of #15162

Authors:
  - Matthew Murray (https://github.com/Matt711)
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/16185
---
 .../_lib/pylibcudf/libcudf/lists/reverse.pxd  | 14 ++++++++++
 python/cudf/cudf/_lib/pylibcudf/lists.pxd     |  2 ++
 python/cudf/cudf/_lib/pylibcudf/lists.pyx     | 26 +++++++++++++++++++
 .../cudf/cudf/pylibcudf_tests/test_lists.py   | 12 +++++++++
 4 files changed, 54 insertions(+)
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/libcudf/lists/reverse.pxd

diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/reverse.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/reverse.pxd
new file mode 100644
index 00000000000..0382a5d42c3
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/reverse.pxd
@@ -0,0 +1,14 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport (
+    lists_column_view,
+)
+
+
+cdef extern from "cudf/lists/reverse.hpp" namespace "cudf::lists" nogil:
+    cdef unique_ptr[column] reverse(
+        const lists_column_view& lists_column,
+    ) except +
diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pxd b/python/cudf/cudf/_lib/pylibcudf/lists.pxd
index 2ccf0139e90..c9d0a84e8ac 100644
--- a/python/cudf/cudf/_lib/pylibcudf/lists.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/lists.pxd
@@ -23,3 +23,5 @@ cpdef Column contains(Column, ColumnOrScalar)
 cpdef Column contains_nulls(Column)
 
 cpdef Column index_of(Column, ColumnOrScalar, bool)
+
+cpdef Column reverse(Column)
diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pyx b/python/cudf/cudf/_lib/pylibcudf/lists.pyx
index a94d940accd..651f1346f88 100644
--- a/python/cudf/cudf/_lib/pylibcudf/lists.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/lists.pyx
@@ -9,6 +9,7 @@ from cudf._lib.pylibcudf.libcudf.column.column cimport column
 from cudf._lib.pylibcudf.libcudf.lists cimport (
     contains as cpp_contains,
     explode as cpp_explode,
+    reverse as cpp_reverse,
 )
 from cudf._lib.pylibcudf.libcudf.lists.combine cimport (
     concatenate_list_elements as cpp_concatenate_list_elements,
@@ -206,3 +207,28 @@ cpdef Column index_of(Column input, ColumnOrScalar search_key, bool find_first_o
             find_option,
         ))
     return Column.from_libcudf(move(c_result))
+
+
+cpdef Column reverse(Column input):
+    """Reverse the element order within each list of the input column.
+
+    For details, see :cpp:func:`reverse`.
+
+    Parameters
+    ----------
+    input : Column
+        The input column.
+
+    Returns
+    -------
+    Column
+        A new Column with reversed lists.
+    """
+    cdef unique_ptr[column] c_result
+    cdef ListColumnView list_view = input.list_view()
+
+    with nogil:
+        c_result = move(cpp_reverse.reverse(
+            list_view.view(),
+        ))
+    return Column.from_libcudf(move(c_result))
diff --git a/python/cudf/cudf/pylibcudf_tests/test_lists.py b/python/cudf/cudf/pylibcudf_tests/test_lists.py
index c781126e388..58a1dcf8d56 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_lists.py
+++ b/python/cudf/cudf/pylibcudf_tests/test_lists.py
@@ -134,3 +134,15 @@ def test_index_of_list_column(test_data, column):
     expect = pa.array(column[1], type=pa.int32())
 
     assert_column_eq(expect, res)
+
+
+def test_reverse(test_data):
+    list_column = test_data[0][0]
+    arr = pa.array(list_column)
+    plc_column = plc.interop.from_arrow(arr)
+
+    res = plc.lists.reverse(plc_column)
+
+    expect = pa.array([lst[::-1] for lst in list_column])
+
+    assert_column_eq(expect, res)

From ae422187743af5b9081028de7405b9ded73787b8 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Fri, 5 Jul 2024 12:27:50 +0100
Subject: [PATCH 465/842] Expose type traits to pylibcudf (#16197)

Rather than recreating the classification, OAOO by using the libcudf definitions.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16197
---
 .../user_guide/api_docs/pylibcudf/index.rst   |   7 +-
 .../user_guide/api_docs/pylibcudf/traits.rst  |   6 +
 .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt   |   1 +
 python/cudf/cudf/_lib/pylibcudf/__init__.pxd  |   3 +
 python/cudf/cudf/_lib/pylibcudf/__init__.py   |   4 +
 .../pylibcudf/libcudf/utilities/traits.pxd    |  27 ++++
 python/cudf/cudf/_lib/pylibcudf/traits.pxd    |  25 +++
 python/cudf/cudf/_lib/pylibcudf/traits.pyx    | 151 ++++++++++++++++++
 .../cudf/cudf/pylibcudf_tests/common/utils.py |  39 -----
 .../cudf/cudf/pylibcudf_tests/test_copying.py |  47 +++---
 .../cudf/cudf/pylibcudf_tests/test_traits.py  | 110 +++++++++++++
 python/cudf_polars/cudf_polars/dsl/expr.py    |   3 +-
 .../cudf_polars/cudf_polars/utils/dtypes.py   |  13 --
 13 files changed, 361 insertions(+), 75 deletions(-)
 create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/traits.rst
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/traits.pxd
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/traits.pxd
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/traits.pyx
 create mode 100644 python/cudf/cudf/pylibcudf_tests/test_traits.py

diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
index e9dad705cbf..bd6f0f77357 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
@@ -18,22 +18,22 @@ This page provides API documentation for pylibcudf.
     filling
     gpumemoryview
     groupby
-    io/index.rst
     interop
     join
     lists
     merge
     quantiles
     reduce
+    replace
     reshape
     rolling
     round
     scalar
     search
-    stream_compaction
     sorting
-    replace
+    stream_compaction
     table
+    traits
     types
     unary
 
@@ -41,4 +41,5 @@ This page provides API documentation for pylibcudf.
     :maxdepth: 2
     :caption: Subpackages
 
+    io/index.rst
     strings/index.rst
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/traits.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/traits.rst
new file mode 100644
index 00000000000..294ca8dc78c
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/traits.rst
@@ -0,0 +1,6 @@
+======
+traits
+======
+
+.. automodule:: cudf._lib.pylibcudf.traits
+   :members:
diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
index 0a198f431a7..d22096081af 100644
--- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
@@ -38,6 +38,7 @@ set(cython_sources
     stream_compaction.pyx
     sorting.pyx
     table.pyx
+    traits.pyx
     types.pyx
     unary.pyx
     utils.pyx
diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd
index 5131df9a5cd..d4d615cde34 100644
--- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd
@@ -23,6 +23,7 @@ from . cimport (
     sorting,
     stream_compaction,
     strings,
+    traits,
     types,
     unary,
 )
@@ -54,12 +55,14 @@ __all__ = [
     "quantiles",
     "reduce",
     "replace",
+    "reshape",
     "rolling",
     "round",
     "search",
     "stream_compaction",
     "strings",
     "sorting",
+    "traits",
     "types",
     "unary",
 ]
diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py
index 43a9e2aca31..91f8acaf682 100644
--- a/python/cudf/cudf/_lib/pylibcudf/__init__.py
+++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py
@@ -23,6 +23,7 @@
     sorting,
     stream_compaction,
     strings,
+    traits,
     types,
     unary,
 )
@@ -35,6 +36,7 @@
 __all__ = [
     "Column",
     "DataType",
+    "MaskState",
     "Scalar",
     "Table",
     "TypeId",
@@ -54,12 +56,14 @@
     "quantiles",
     "reduce",
     "replace",
+    "reshape",
     "rolling",
     "round",
     "search",
     "stream_compaction",
     "strings",
     "sorting",
+    "traits",
     "types",
     "unary",
 ]
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/traits.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/traits.pxd
new file mode 100644
index 00000000000..0cc58af735b
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/traits.pxd
@@ -0,0 +1,27 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp cimport bool
+from libcpp.vector cimport vector
+
+from cudf._lib.pylibcudf.libcudf.types cimport data_type
+
+
+cdef extern from "cudf/utilities/traits.hpp" namespace "cudf" nogil:
+    cdef bool is_relationally_comparable(data_type)
+    cdef bool is_equality_comparable(data_type)
+    cdef bool is_numeric(data_type)
+    cdef bool is_index_type(data_type)
+    cdef bool is_unsigned(data_type)
+    cdef bool is_integral(data_type)
+    cdef bool is_integral_not_bool(data_type)
+    cdef bool is_floating_point(data_type)
+    cdef bool is_boolean(data_type)
+    cdef bool is_timestamp(data_type)
+    cdef bool is_fixed_point(data_type)
+    cdef bool is_duration(data_type)
+    cdef bool is_chrono(data_type)
+    cdef bool is_dictionary(data_type)
+    cdef bool is_fixed_width(data_type)
+    cdef bool is_compound(data_type)
+    cdef bool is_nested(data_type)
+    cdef bool is_bit_castable(data_type, data_type)
diff --git a/python/cudf/cudf/_lib/pylibcudf/traits.pxd b/python/cudf/cudf/_lib/pylibcudf/traits.pxd
new file mode 100644
index 00000000000..668fa775202
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/traits.pxd
@@ -0,0 +1,25 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp cimport bool
+
+from .types cimport DataType
+
+
+cpdef bool is_relationally_comparable(DataType typ)
+cpdef bool is_equality_comparable(DataType typ)
+cpdef bool is_numeric(DataType typ)
+cpdef bool is_index_type(DataType typ)
+cpdef bool is_unsigned(DataType typ)
+cpdef bool is_integral(DataType typ)
+cpdef bool is_integral_not_bool(DataType typ)
+cpdef bool is_floating_point(DataType typ)
+cpdef bool is_boolean(DataType typ)
+cpdef bool is_timestamp(DataType typ)
+cpdef bool is_fixed_point(DataType typ)
+cpdef bool is_duration(DataType typ)
+cpdef bool is_chrono(DataType typ)
+cpdef bool is_dictionary(DataType typ)
+cpdef bool is_fixed_width(DataType typ)
+cpdef bool is_compound(DataType typ)
+cpdef bool is_nested(DataType typ)
+cpdef bool is_bit_castable(DataType source, DataType target)
diff --git a/python/cudf/cudf/_lib/pylibcudf/traits.pyx b/python/cudf/cudf/_lib/pylibcudf/traits.pyx
new file mode 100644
index 00000000000..d2370f8d641
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/traits.pyx
@@ -0,0 +1,151 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp cimport bool
+
+from cudf._lib.pylibcudf.libcudf.utilities cimport traits
+
+from .types cimport DataType
+
+
+cpdef bool is_relationally_comparable(DataType typ):
+    """Checks if the given data type supports relational comparisons.
+
+    For details, see :cpp:func:`is_relationally_comparable`.
+    """
+    return traits.is_relationally_comparable(typ.c_obj)
+
+
+cpdef bool is_equality_comparable(DataType typ):
+    """Checks if the given data type supports equality comparisons.
+
+    For details, see :cpp:func:`is_equality_comparable`.
+    """
+    return traits.is_equality_comparable(typ.c_obj)
+
+
+cpdef bool is_numeric(DataType typ):
+    """Checks if the given data type is numeric.
+
+    For details, see :cpp:func:`is_numeric`.
+    """
+    return traits.is_numeric(typ.c_obj)
+
+
+cpdef bool is_index_type(DataType typ):
+    """Checks if the given data type is an index type.
+
+    For details, see :cpp:func:`is_index_type`.
+    """
+    return traits.is_index_type(typ.c_obj)
+
+
+cpdef bool is_unsigned(DataType typ):
+    """Checks if the given data type is an unsigned type.
+
+    For details, see :cpp:func:`is_unsigned`.
+    """
+    return traits.is_unsigned(typ.c_obj)
+
+
+cpdef bool is_integral(DataType typ):
+    """Checks if the given data type is an integral type.
+
+    For details, see :cpp:func:`is_integral`.
+    """
+    return traits.is_integral(typ.c_obj)
+
+
+cpdef bool is_integral_not_bool(DataType typ):
+    """Checks if the given data type is an integral type excluding booleans.
+
+    For details, see :cpp:func:`is_integral_not_bool`.
+    """
+    return traits.is_integral_not_bool(typ.c_obj)
+
+
+cpdef bool is_floating_point(DataType typ):
+    """Checks if the given data type is a floating point type.
+
+    For details, see :cpp:func:`is_floating_point`.
+    """
+    return traits.is_floating_point(typ.c_obj)
+
+
+cpdef bool is_boolean(DataType typ):
+    """Checks if the given data type is a boolean type.
+
+    For details, see :cpp:func:`is_boolean`.
+    """
+    return traits.is_boolean(typ.c_obj)
+
+
+cpdef bool is_timestamp(DataType typ):
+    """Checks if the given data type is a timestamp type.
+
+    For details, see :cpp:func:`is_timestamp`.
+    """
+    return traits.is_timestamp(typ.c_obj)
+
+
+cpdef bool is_fixed_point(DataType typ):
+    """Checks if the given data type is a fixed point type.
+
+    For details, see :cpp:func:`is_fixed_point`.
+    """
+    return traits.is_fixed_point(typ.c_obj)
+
+
+cpdef bool is_duration(DataType typ):
+    """Checks if the given data type is a duration type.
+
+    For details, see :cpp:func:`is_duration`.
+    """
+    return traits.is_duration(typ.c_obj)
+
+
+cpdef bool is_chrono(DataType typ):
+    """Checks if the given data type is a chrono type.
+
+    For details, see :cpp:func:`is_chrono`.
+    """
+    return traits.is_chrono(typ.c_obj)
+
+
+cpdef bool is_dictionary(DataType typ):
+    """Checks if the given data type is a dictionary type.
+
+    For details, see :cpp:func:`is_dictionary`.
+    """
+    return traits.is_dictionary(typ.c_obj)
+
+
+cpdef bool is_fixed_width(DataType typ):
+    """Checks if the given data type is a fixed width type.
+
+    For details, see :cpp:func:`is_fixed_width`.
+    """
+    return traits.is_fixed_width(typ.c_obj)
+
+
+cpdef bool is_compound(DataType typ):
+    """Checks if the given data type is a compound type.
+
+    For details, see :cpp:func:`is_compound`.
+    """
+    return traits.is_compound(typ.c_obj)
+
+
+cpdef bool is_nested(DataType typ):
+    """Checks if the given data type is a nested type.
+
+    For details, see :cpp:func:`is_nested`.
+    """
+    return traits.is_nested(typ.c_obj)
+
+
+cpdef bool is_bit_castable(DataType source, DataType target):
+    """Checks if the source type is bit-castable to the target type.
+
+    For details, see :cpp:func:`is_bit_castable`.
+    """
+    return traits.is_bit_castable(source.c_obj, target.c_obj)
diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py
index f8bfe340ae5..d41e6c720bf 100644
--- a/python/cudf/cudf/pylibcudf_tests/common/utils.py
+++ b/python/cudf/cudf/pylibcudf_tests/common/utils.py
@@ -102,49 +102,10 @@ def cudf_raises(expected_exception: BaseException, *args, **kwargs):
     return pytest.raises(expected_exception, *args, **kwargs)
 
 
-# TODO: Consider moving these type utilities into pylibcudf.types itself.
-def is_signed_integer(plc_dtype: plc.DataType):
-    return (
-        plc.TypeId.INT8.value <= plc_dtype.id().value <= plc.TypeId.INT64.value
-    )
-
-
-def is_integer(plc_dtype: plc.DataType):
-    return plc_dtype.id() in (
-        plc.TypeId.INT8,
-        plc.TypeId.INT16,
-        plc.TypeId.INT32,
-        plc.TypeId.INT64,
-        plc.TypeId.UINT8,
-        plc.TypeId.UINT16,
-        plc.TypeId.UINT32,
-        plc.TypeId.UINT64,
-    )
-
-
-def is_floating(plc_dtype: plc.DataType):
-    return plc_dtype.id() in (
-        plc.TypeId.FLOAT32,
-        plc.TypeId.FLOAT64,
-    )
-
-
-def is_boolean(plc_dtype: plc.DataType):
-    return plc_dtype.id() == plc.TypeId.BOOL8
-
-
 def is_string(plc_dtype: plc.DataType):
     return plc_dtype.id() == plc.TypeId.STRING
 
 
-def is_fixed_width(plc_dtype: plc.DataType):
-    return (
-        is_integer(plc_dtype)
-        or is_floating(plc_dtype)
-        or is_boolean(plc_dtype)
-    )
-
-
 def nesting_level(typ) -> tuple[int, int]:
     """Return list and struct nesting of a pyarrow type."""
     if isinstance(typ, pa.ListType):
diff --git a/python/cudf/cudf/pylibcudf_tests/test_copying.py b/python/cudf/cudf/pylibcudf_tests/test_copying.py
index 0a6df198d46..f27fe4e942e 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_copying.py
+++ b/python/cudf/cudf/pylibcudf_tests/test_copying.py
@@ -9,9 +9,6 @@
     assert_column_eq,
     assert_table_eq,
     cudf_raises,
-    is_fixed_width,
-    is_floating,
-    is_integer,
     is_nested_list,
     is_nested_struct,
     is_string,
@@ -359,9 +356,9 @@ def test_scatter_table_type_mismatch(source_table, index_column, target_table):
     _, plc_index_column = index_column
     _, plc_target_table = target_table
     with cudf_raises(TypeError):
-        if is_integer(
+        if plc.traits.is_integral_not_bool(
             dtype := plc_target_table.columns()[0].type()
-        ) or is_floating(dtype):
+        ) or plc.traits.is_floating_point(dtype):
             pa_array = pa.array([True] * plc_source_table.num_rows())
         else:
             pa_array = pa.array([1] * plc_source_table.num_rows())
@@ -428,9 +425,9 @@ def test_scatter_scalars_type_mismatch(index_column, target_table):
     _, plc_index_column = index_column
     _, plc_target_table = target_table
     with cudf_raises(TypeError):
-        if is_integer(
+        if plc.traits.is_integral_not_bool(
             dtype := plc_target_table.columns()[0].type()
-        ) or is_floating(dtype):
+        ) or plc.traits.is_floating_point(dtype):
             plc_source_scalar = [plc.interop.from_arrow(pa.scalar(True))]
         else:
             plc_source_scalar = [plc.interop.from_arrow(pa.scalar(1))]
@@ -458,7 +455,7 @@ def test_empty_like_table(source_table):
 @pytest.mark.parametrize("size", [None, 10])
 def test_allocate_like(input_column, size):
     _, plc_input_column = input_column
-    if is_fixed_width(plc_input_column.type()):
+    if plc.traits.is_fixed_width(plc_input_column.type()):
         result = plc.copying.allocate_like(
             plc_input_column,
             plc.copying.MaskAllocationPolicy.RETAIN,
@@ -484,7 +481,7 @@ def test_copy_range_in_place(
 
     pa_target_column, _ = target_column
 
-    if not is_fixed_width(mutable_target_column.type()):
+    if not plc.traits.is_fixed_width(mutable_target_column.type()):
         with pytest.raises(TypeError):
             plc.copying.copy_range_in_place(
                 plc_input_column,
@@ -516,7 +513,7 @@ def test_copy_range_in_place_out_of_bounds(
 ):
     _, plc_input_column = input_column
 
-    if is_fixed_width(mutable_target_column.type()):
+    if plc.traits.is_fixed_width(mutable_target_column.type()):
         with cudf_raises(IndexError):
             plc.copying.copy_range_in_place(
                 plc_input_column,
@@ -528,7 +525,9 @@ def test_copy_range_in_place_out_of_bounds(
 
 
 def test_copy_range_in_place_different_types(mutable_target_column):
-    if is_integer(dtype := mutable_target_column.type()) or is_floating(dtype):
+    if plc.traits.is_integral_not_bool(
+        dtype := mutable_target_column.type()
+    ) or plc.traits.is_floating_point(dtype):
         plc_input_column = plc.interop.from_arrow(pa.array(["a", "b", "c"]))
     else:
         plc_input_column = plc.interop.from_arrow(pa.array([1, 2, 3]))
@@ -548,7 +547,7 @@ def test_copy_range_in_place_null_mismatch(
 ):
     pa_input_column, _ = input_column
 
-    if is_fixed_width(mutable_target_column.type()):
+    if plc.traits.is_fixed_width(mutable_target_column.type()):
         pa_input_column = pc.if_else(
             _pyarrow_index_to_mask([0], len(pa_input_column)),
             pa_input_column,
@@ -568,7 +567,9 @@ def test_copy_range_in_place_null_mismatch(
 def test_copy_range(input_column, target_column):
     pa_input_column, plc_input_column = input_column
     pa_target_column, plc_target_column = target_column
-    if is_fixed_width(dtype := plc_target_column.type()) or is_string(dtype):
+    if plc.traits.is_fixed_width(
+        dtype := plc_target_column.type()
+    ) or is_string(dtype):
         result = plc.copying.copy_range(
             plc_input_column,
             plc_target_column,
@@ -610,7 +611,9 @@ def test_copy_range_out_of_bounds(input_column, target_column):
 
 def test_copy_range_different_types(target_column):
     _, plc_target_column = target_column
-    if is_integer(dtype := plc_target_column.type()) or is_floating(dtype):
+    if plc.traits.is_integral_not_bool(
+        dtype := plc_target_column.type()
+    ) or plc.traits.is_floating_point(dtype):
         plc_input_column = plc.interop.from_arrow(pa.array(["a", "b", "c"]))
     else:
         plc_input_column = plc.interop.from_arrow(pa.array([1, 2, 3]))
@@ -629,7 +632,9 @@ def test_shift(target_column, source_scalar):
     pa_source_scalar, plc_source_scalar = source_scalar
     pa_target_column, plc_target_column = target_column
     shift = 2
-    if is_fixed_width(dtype := plc_target_column.type()) or is_string(dtype):
+    if plc.traits.is_fixed_width(
+        dtype := plc_target_column.type()
+    ) or is_string(dtype):
         result = plc.copying.shift(plc_target_column, shift, plc_source_scalar)
         expected = pa.concat_arrays(
             [pa.array([pa_source_scalar] * shift), pa_target_column[:-shift]]
@@ -642,7 +647,9 @@ def test_shift(target_column, source_scalar):
 
 def test_shift_type_mismatch(target_column):
     _, plc_target_column = target_column
-    if is_integer(dtype := plc_target_column.type()) or is_floating(dtype):
+    if plc.traits.is_integral_not_bool(
+        dtype := plc_target_column.type()
+    ) or plc.traits.is_floating_point(dtype):
         fill_value = plc.interop.from_arrow(pa.scalar("a"))
     else:
         fill_value = plc.interop.from_arrow(pa.scalar(1))
@@ -747,7 +754,9 @@ def test_copy_if_else_column_column(target_column, mask, source_scalar):
 def test_copy_if_else_wrong_type(target_column, mask):
     _, plc_target_column = target_column
     _, plc_mask = mask
-    if is_integer(dtype := plc_target_column.type()) or is_floating(dtype):
+    if plc.traits.is_integral_not_bool(
+        dtype := plc_target_column.type()
+    ) or plc.traits.is_floating_point(dtype):
         plc_input_column = plc.interop.from_arrow(
             pa.array(["a"] * plc_target_column.size())
         )
@@ -951,9 +960,9 @@ def test_boolean_mask_scatter_from_wrong_num_true(source_table, target_table):
 def test_boolean_mask_scatter_from_wrong_col_type(target_table, mask):
     _, plc_target_table = target_table
     _, plc_mask = mask
-    if is_integer(
+    if plc.traits.is_integral_not_bool(
         dtype := plc_target_table.columns()[0].type()
-    ) or is_floating(dtype):
+    ) or plc.traits.is_floating_point(dtype):
         input_column = plc.interop.from_arrow(pa.array(["a", "b", "c"]))
     else:
         input_column = plc.interop.from_arrow(pa.array([1, 2, 3]))
diff --git a/python/cudf/cudf/pylibcudf_tests/test_traits.py b/python/cudf/cudf/pylibcudf_tests/test_traits.py
new file mode 100644
index 00000000000..6c22cb02f21
--- /dev/null
+++ b/python/cudf/cudf/pylibcudf_tests/test_traits.py
@@ -0,0 +1,110 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from cudf._lib import pylibcudf as plc
+
+
+def test_is_relationally_comparable():
+    assert plc.traits.is_relationally_comparable(plc.DataType(plc.TypeId.INT8))
+    assert not plc.traits.is_relationally_comparable(
+        plc.DataType(plc.TypeId.LIST)
+    )
+
+
+def test_is_equality_comparable():
+    assert plc.traits.is_equality_comparable(plc.DataType(plc.TypeId.INT8))
+    assert not plc.traits.is_equality_comparable(plc.DataType(plc.TypeId.LIST))
+
+
+def test_is_numeric():
+    assert plc.traits.is_numeric(plc.DataType(plc.TypeId.FLOAT64))
+    assert not plc.traits.is_numeric(plc.DataType(plc.TypeId.LIST))
+
+
+def test_is_index_type():
+    assert plc.traits.is_index_type(plc.DataType(plc.TypeId.INT8))
+    assert not plc.traits.is_index_type(plc.DataType(plc.TypeId.BOOL8))
+
+
+def test_is_unsigned():
+    assert plc.traits.is_unsigned(plc.DataType(plc.TypeId.UINT8))
+    assert not plc.traits.is_unsigned(plc.DataType(plc.TypeId.INT8))
+
+
+def test_is_integral():
+    assert plc.traits.is_integral(plc.DataType(plc.TypeId.BOOL8))
+    assert not plc.traits.is_integral(plc.DataType(plc.TypeId.DECIMAL32))
+
+
+def test_is_integral_not_bool():
+    assert plc.traits.is_integral_not_bool(plc.DataType(plc.TypeId.INT8))
+    assert not plc.traits.is_integral_not_bool(plc.DataType(plc.TypeId.BOOL8))
+
+
+def test_is_floating_point():
+    assert plc.traits.is_floating_point(plc.DataType(plc.TypeId.FLOAT64))
+    assert not plc.traits.is_floating_point(plc.DataType(plc.TypeId.UINT8))
+
+
+def test_is_boolean():
+    assert plc.traits.is_boolean(plc.DataType(plc.TypeId.BOOL8))
+    assert not plc.traits.is_boolean(plc.DataType(plc.TypeId.UINT8))
+
+
+def test_is_timestamp():
+    assert plc.traits.is_timestamp(
+        plc.DataType(plc.TypeId.TIMESTAMP_MICROSECONDS)
+    )
+    assert not plc.traits.is_timestamp(
+        plc.DataType(plc.TypeId.DURATION_MICROSECONDS)
+    )
+
+
+def test_is_fixed_point():
+    assert plc.traits.is_fixed_point(plc.DataType(plc.TypeId.DECIMAL128))
+    assert not plc.traits.is_fixed_point(plc.DataType(plc.TypeId.FLOAT32))
+
+
+def test_is_duration():
+    assert plc.traits.is_duration(
+        plc.DataType(plc.TypeId.DURATION_MICROSECONDS)
+    )
+    assert not plc.traits.is_duration(
+        plc.DataType(plc.TypeId.TIMESTAMP_MICROSECONDS)
+    )
+
+
+def test_is_chrono():
+    assert plc.traits.is_chrono(plc.DataType(plc.TypeId.DURATION_MICROSECONDS))
+    assert plc.traits.is_chrono(
+        plc.DataType(plc.TypeId.TIMESTAMP_MICROSECONDS)
+    )
+    assert not plc.traits.is_chrono(plc.DataType(plc.TypeId.UINT8))
+
+
+def test_is_dictionary():
+    assert plc.traits.is_dictionary(plc.DataType(plc.TypeId.DICTIONARY32))
+    assert not plc.traits.is_dictionary(plc.DataType(plc.TypeId.UINT8))
+
+
+def test_is_fixed_width():
+    assert plc.traits.is_fixed_width(plc.DataType(plc.TypeId.INT8))
+    assert not plc.traits.is_fixed_width(plc.DataType(plc.TypeId.STRING))
+
+
+def test_is_compound():
+    assert plc.traits.is_compound(plc.DataType(plc.TypeId.STRUCT))
+    assert not plc.traits.is_compound(plc.DataType(plc.TypeId.UINT8))
+
+
+def test_is_nested():
+    assert plc.traits.is_nested(plc.DataType(plc.TypeId.STRUCT))
+    assert not plc.traits.is_nested(plc.DataType(plc.TypeId.STRING))
+
+
+def test_is_bit_castable():
+    assert plc.traits.is_bit_castable(
+        plc.DataType(plc.TypeId.INT8), plc.DataType(plc.TypeId.UINT8)
+    )
+    assert not plc.traits.is_bit_castable(
+        plc.DataType(plc.TypeId.UINT8), plc.DataType(plc.TypeId.UINT16)
+    )
diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py
index cfc2947f8de..69bc85b109d 100644
--- a/python/cudf_polars/cudf_polars/dsl/expr.py
+++ b/python/cudf_polars/cudf_polars/dsl/expr.py
@@ -1218,7 +1218,8 @@ def __init__(
         self.children = (left, right)
         if (
             op in (plc.binaryop.BinaryOperator.ADD, plc.binaryop.BinaryOperator.SUB)
-            and ({left.dtype.id(), right.dtype.id()}.issubset(dtypes.TIMELIKE_TYPES))
+            and plc.traits.is_chrono(left.dtype)
+            and plc.traits.is_chrono(right.dtype)
             and not dtypes.have_compatible_resolution(left.dtype.id(), right.dtype.id())
         ):
             raise NotImplementedError("Casting rules for timelike types")
diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py
index 507acb5d33a..918cd024fa2 100644
--- a/python/cudf_polars/cudf_polars/utils/dtypes.py
+++ b/python/cudf_polars/cudf_polars/utils/dtypes.py
@@ -17,19 +17,6 @@
 __all__ = ["from_polars", "downcast_arrow_lists", "have_compatible_resolution"]
 
 
-TIMELIKE_TYPES: frozenset[plc.TypeId] = frozenset(
-    [
-        plc.TypeId.TIMESTAMP_MILLISECONDS,
-        plc.TypeId.TIMESTAMP_MICROSECONDS,
-        plc.TypeId.TIMESTAMP_NANOSECONDS,
-        plc.TypeId.TIMESTAMP_DAYS,
-        plc.TypeId.DURATION_MILLISECONDS,
-        plc.TypeId.DURATION_MICROSECONDS,
-        plc.TypeId.DURATION_NANOSECONDS,
-    ]
-)
-
-
 def have_compatible_resolution(lid: plc.TypeId, rid: plc.TypeId):
     """
     Do two datetime typeids have matching resolution for a binop.

From 37defc6b943094921200146c5f6042a91e68c75a Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Fri, 5 Jul 2024 09:44:05 -0400
Subject: [PATCH 466/842] Use strings concatenate to support large strings in
 CSV writer (#16148)

Changes the CSV writer logic to use `cudf::strings::concatenate` instead of `cudf::strings::join_strings` when output size exceeds `join_strings` limit.

Closes #16137

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Yunsong Wang (https://github.com/PointKernel)

URL: https://github.com/rapidsai/cudf/pull/16148
---
 cpp/src/io/csv/writer_impl.cu | 38 ++++++++++++++++++++++++++---------
 1 file changed, 29 insertions(+), 9 deletions(-)

diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu
index 7c4d5711281..63eb0b03c5f 100644
--- a/cpp/src/io/csv/writer_impl.cu
+++ b/cpp/src/io/csv/writer_impl.cu
@@ -25,6 +25,7 @@
 
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/detail/copy.hpp>
+#include <cudf/detail/fill.hpp>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/io/data_sink.hpp>
 #include <cudf/io/detail/csv.hpp>
@@ -372,15 +373,33 @@ void write_chunked(data_sink* out_sink,
   CUDF_EXPECTS(str_column_view.size() > 0, "Unexpected empty strings column.");
 
   cudf::string_scalar newline{options.get_line_terminator(), true, stream};
-  auto p_str_col_w_nl = cudf::strings::detail::join_strings(str_column_view,
-                                                            newline,
-                                                            string_scalar{"", false, stream},
-                                                            stream,
-                                                            rmm::mr::get_current_device_resource());
-  strings_column_view strings_column{p_str_col_w_nl->view()};
 
-  auto total_num_bytes      = strings_column.chars_size(stream);
-  char const* ptr_all_bytes = strings_column.chars_begin(stream);
+  // use strings concatenate to build the final CSV output in device memory
+  auto contents_w_nl = [&] {
+    auto const total_size =
+      str_column_view.chars_size(stream) + (newline.size() * str_column_view.size());
+    auto const empty_str = string_scalar("", true, stream);
+    // use join_strings when the output will be less than 2GB
+    if (total_size < static_cast<int64_t>(std::numeric_limits<size_type>::max())) {
+      return cudf::strings::detail::join_strings(str_column_view, newline, empty_str, stream, mr)
+        ->release();
+    }
+    auto nl_col = cudf::make_column_from_scalar(newline, str_column_view.size(), stream);
+    // convert the last element into an empty string by resetting the last offset value
+    auto& offsets     = nl_col->child(strings_column_view::offsets_column_index);
+    auto offsets_view = offsets.mutable_view();
+    cudf::fill_in_place(offsets_view,
+                        offsets.size() - 1,  // set the last element with
+                        offsets.size(),      // the value from 2nd to last element
+                        *cudf::detail::get_element(offsets.view(), offsets.size() - 2, stream, mr),
+                        stream);
+    auto const nl_tbl = cudf::table_view({str_column_view.parent(), nl_col->view()});
+    return cudf::strings::detail::concatenate(
+             nl_tbl, empty_str, empty_str, strings::separator_on_nulls::NO, stream, mr)
+      ->release();
+  }();
+  auto const total_num_bytes = contents_w_nl.data->size();
+  auto const ptr_all_bytes   = static_cast<char const*>(contents_w_nl.data->data());
 
   if (out_sink->is_device_write_preferred(total_num_bytes)) {
     // Direct write from device memory
@@ -491,7 +510,8 @@ void write_csv(data_sink* out_sink,
           str_table_view.column(0), options_narep, stream, rmm::mr::get_current_device_resource());
       }();
 
-      write_chunked(out_sink, str_concat_col->view(), options, stream, mr);
+      write_chunked(
+        out_sink, str_concat_col->view(), options, stream, rmm::mr::get_current_device_resource());
     }
   }
 }

From 7dd69452bb72ca8cc440af52cb6ca8386950c264 Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Fri, 5 Jul 2024 07:58:30 -0700
Subject: [PATCH 467/842] CI: Build wheels for cudf-polars (#16156)

Authors:
  - Thomas Li (https://github.com/lithomas1)
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16156
---
 .github/workflows/pr.yaml                     | 16 +++++++++++++---
 ci/build_wheel_cudf_polars.sh                 | 11 +++++++++++
 ci/run_cudf_polars_pytests.sh                 |  2 +-
 ...df_polars.sh => test_wheel_cudf_polars.sh} | 19 +++++++------------
 ci/test_wheel_dask_cudf.sh                    |  2 +-
 python/cudf_polars/pyproject.toml             |  2 --
 6 files changed, 33 insertions(+), 19 deletions(-)
 create mode 100755 ci/build_wheel_cudf_polars.sh
 rename ci/{test_cudf_polars.sh => test_wheel_cudf_polars.sh} (70%)

diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index a35802f2ab0..ceee9074b93 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -25,7 +25,8 @@ jobs:
       - docs-build
       - wheel-build-cudf
       - wheel-tests-cudf
-      - test-cudf-polars
+      - wheel-build-cudf-polars
+      - wheel-tests-cudf-polars
       - wheel-build-dask-cudf
       - wheel-tests-dask-cudf
       - devcontainer
@@ -133,9 +134,18 @@ jobs:
     with:
       build_type: pull-request
       script: ci/test_wheel_cudf.sh
-  test-cudf-polars:
+  wheel-build-cudf-polars:
     needs: wheel-build-cudf
     secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08
+    with:
+      # This selects "ARCH=amd64 + the latest supported Python + CUDA".
+      matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
+      build_type: pull-request
+      script: "ci/build_wheel_cudf_polars.sh"
+  wheel-tests-cudf-polars:
+    needs: wheel-build-cudf-polars
+    secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
@@ -143,7 +153,7 @@ jobs:
       build_type: pull-request
       # This always runs, but only fails if this PR touches code in
       # pylibcudf or cudf_polars
-      script: "ci/test_cudf_polars.sh"
+      script: "ci/test_wheel_cudf_polars.sh"
   wheel-build-dask-cudf:
     needs: wheel-build-cudf
     secrets: inherit
diff --git a/ci/build_wheel_cudf_polars.sh b/ci/build_wheel_cudf_polars.sh
new file mode 100755
index 00000000000..9c945e11c00
--- /dev/null
+++ b/ci/build_wheel_cudf_polars.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+
+set -euo pipefail
+
+package_dir="python/cudf_polars"
+
+./ci/build_wheel.sh ${package_dir}
+
+RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
+RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-upload-wheels-to-s3 ${package_dir}/dist
diff --git a/ci/run_cudf_polars_pytests.sh b/ci/run_cudf_polars_pytests.sh
index 78683b057a5..c10612a065a 100755
--- a/ci/run_cudf_polars_pytests.sh
+++ b/ci/run_cudf_polars_pytests.sh
@@ -8,4 +8,4 @@ set -euo pipefail
 # Support invoking run_cudf_polars_pytests.sh outside the script directory
 cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/cudf_polars/
 
-pytest --cache-clear "$@" tests
+python -m pytest --cache-clear "$@" tests
diff --git a/ci/test_cudf_polars.sh b/ci/test_wheel_cudf_polars.sh
similarity index 70%
rename from ci/test_cudf_polars.sh
rename to ci/test_wheel_cudf_polars.sh
index ca98c4dadb3..900acd5d473 100755
--- a/ci/test_cudf_polars.sh
+++ b/ci/test_wheel_cudf_polars.sh
@@ -18,19 +18,14 @@ else
 fi
 
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
-RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist
+RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 ./dist
 
-RESULTS_DIR=${RAPIDS_TESTS_DIR:-"$(mktemp -d)"}
-RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${RESULTS_DIR}/test-results"}/
-mkdir -p "${RAPIDS_TESTS_DIR}"
-
-rapids-logger "Install cudf wheel"
-# echo to expand wildcard before adding `[extra]` requires for pip
-python -m pip install $(echo ./dist/cudf*.whl)[test]
+# Download the cudf built in the previous step
+RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep
+python -m pip install ./local-cudf-dep/cudf*.whl
 
 rapids-logger "Install cudf_polars"
-python -m pip install 'polars>=1.0'
-python -m pip install --no-deps python/cudf_polars
+python -m pip install $(echo ./dist/cudf_polars*.whl)[test]
 
 rapids-logger "Run cudf_polars tests"
 
@@ -45,8 +40,8 @@ set +e
 ./ci/run_cudf_polars_pytests.sh \
        --cov cudf_polars \
        --cov-fail-under=100 \
-       --cov-config=python/cudf_polars/pyproject.toml \
-       --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf_polars.xml"
+       --cov-config=./pyproject.toml \
+       --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf-polars.xml"
 
 trap ERR
 set -e
diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh
index 2b20b9d9ce4..c3800d3cc25 100755
--- a/ci/test_wheel_dask_cudf.sh
+++ b/ci/test_wheel_dask_cudf.sh
@@ -8,7 +8,7 @@ RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="
 
 # Download the cudf built in the previous step
 RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep
-python -m pip install --no-deps ./local-cudf-dep/cudf*.whl
+python -m pip install ./local-cudf-dep/cudf*.whl
 
 # echo to expand wildcard before adding `[extra]` requires for pip
 python -m pip install $(echo ./dist/dask_cudf*.whl)[test]
diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml
index bf4673fcc50..0b559f7a8e9 100644
--- a/python/cudf_polars/pyproject.toml
+++ b/python/cudf_polars/pyproject.toml
@@ -182,5 +182,3 @@ docstring-code-format = true
 [tool.rapids-build-backend]
 build-backend = "setuptools.build_meta"
 dependencies-file = "../../dependencies.yaml"
-# Pure python
-disable-cuda = true

From c978181a3a721ed75cf016c6f083648c65bd24cd Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Fri, 5 Jul 2024 16:11:07 +0100
Subject: [PATCH 468/842] Implement translation for some unary functions and a
 single datetime extraction (#16173)

- Closes #16169

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Thomas Li (https://github.com/lithomas1)

URL: https://github.com/rapidsai/cudf/pull/16173
---
 python/cudf_polars/cudf_polars/dsl/expr.py    | 124 ++++++++++++++++++
 python/cudf_polars/cudf_polars/dsl/ir.py      |   2 +-
 .../cudf_polars/cudf_polars/dsl/translate.py  |  19 ++-
 .../tests/expressions/test_datetime_basic.py  |  28 ++++
 .../tests/expressions/test_round.py           |  32 +++++
 .../tests/expressions/test_unique.py          |  24 ++++
 python/cudf_polars/tests/test_groupby.py      |   2 +
 7 files changed, 228 insertions(+), 3 deletions(-)
 create mode 100644 python/cudf_polars/tests/expressions/test_round.py
 create mode 100644 python/cudf_polars/tests/expressions/test_unique.py

diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py
index 69bc85b109d..93cb9db7cbd 100644
--- a/python/cudf_polars/cudf_polars/dsl/expr.py
+++ b/python/cudf_polars/cudf_polars/dsl/expr.py
@@ -44,6 +44,7 @@
     "Col",
     "BooleanFunction",
     "StringFunction",
+    "TemporalFunction",
     "Sort",
     "SortBy",
     "Gather",
@@ -815,6 +816,129 @@ def do_evaluate(
         )  # pragma: no cover; handled by init raising
 
 
+class TemporalFunction(Expr):
+    __slots__ = ("name", "options", "children")
+    _non_child = ("dtype", "name", "options")
+    children: tuple[Expr, ...]
+
+    def __init__(
+        self,
+        dtype: plc.DataType,
+        name: pl_expr.TemporalFunction,
+        options: tuple[Any, ...],
+        *children: Expr,
+    ) -> None:
+        super().__init__(dtype)
+        self.options = options
+        self.name = name
+        self.children = children
+        if self.name != pl_expr.TemporalFunction.Year:
+            raise NotImplementedError(f"String function {self.name}")
+
+    def do_evaluate(
+        self,
+        df: DataFrame,
+        *,
+        context: ExecutionContext = ExecutionContext.FRAME,
+        mapping: Mapping[Expr, Column] | None = None,
+    ) -> Column:
+        """Evaluate this expression given a dataframe for context."""
+        columns = [
+            child.evaluate(df, context=context, mapping=mapping)
+            for child in self.children
+        ]
+        if self.name == pl_expr.TemporalFunction.Year:
+            (column,) = columns
+            return Column(plc.datetime.extract_year(column.obj))
+        raise NotImplementedError(
+            f"TemporalFunction {self.name}"
+        )  # pragma: no cover; init trips first
+
+
+class UnaryFunction(Expr):
+    __slots__ = ("name", "options", "children")
+    _non_child = ("dtype", "name", "options")
+    children: tuple[Expr, ...]
+
+    def __init__(
+        self, dtype: plc.DataType, name: str, options: tuple[Any, ...], *children: Expr
+    ) -> None:
+        super().__init__(dtype)
+        self.name = name
+        self.options = options
+        self.children = children
+        if self.name not in ("round", "unique"):
+            raise NotImplementedError(f"Unary function {name=}")
+
+    def do_evaluate(
+        self,
+        df: DataFrame,
+        *,
+        context: ExecutionContext = ExecutionContext.FRAME,
+        mapping: Mapping[Expr, Column] | None = None,
+    ) -> Column:
+        """Evaluate this expression given a dataframe for context."""
+        if self.name == "round":
+            (decimal_places,) = self.options
+            (values,) = (
+                child.evaluate(df, context=context, mapping=mapping)
+                for child in self.children
+            )
+            return Column(
+                plc.round.round(
+                    values.obj, decimal_places, plc.round.RoundingMethod.HALF_UP
+                )
+            ).sorted_like(values)
+        elif self.name == "unique":
+            (maintain_order,) = self.options
+            (values,) = (
+                child.evaluate(df, context=context, mapping=mapping)
+                for child in self.children
+            )
+            # Only one column, so keep_any is the same as keep_first
+            # for stable distinct
+            keep = plc.stream_compaction.DuplicateKeepOption.KEEP_ANY
+            if values.is_sorted:
+                maintain_order = True
+                result = plc.stream_compaction.unique(
+                    plc.Table([values.obj]),
+                    [0],
+                    keep,
+                    plc.types.NullEquality.EQUAL,
+                )
+            else:
+                distinct = (
+                    plc.stream_compaction.stable_distinct
+                    if maintain_order
+                    else plc.stream_compaction.distinct
+                )
+                result = distinct(
+                    plc.Table([values.obj]),
+                    [0],
+                    keep,
+                    plc.types.NullEquality.EQUAL,
+                    plc.types.NanEquality.ALL_EQUAL,
+                )
+            (column,) = result.columns()
+            if maintain_order:
+                return Column(column).sorted_like(values)
+            return Column(column)
+        raise NotImplementedError(
+            f"Unimplemented unary function {self.name=}"
+        )  # pragma: no cover; init trips first
+
+    def collect_agg(self, *, depth: int) -> AggInfo:
+        """Collect information about aggregations in groupbys."""
+        if depth == 1:
+            # inside aggregation, need to pre-evaluate, groupby
+            # construction has checked that we don't have nested aggs,
+            # so stop the recursion and return ourselves for pre-eval
+            return AggInfo([(self, plc.aggregation.collect_list(), self)])
+        else:
+            (child,) = self.children
+            return child.collect_agg(depth=depth)
+
+
 class Sort(Expr):
     __slots__ = ("options", "children")
     _non_child = ("dtype", "options")
diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py
index 31a0be004ea..6b552642e88 100644
--- a/python/cudf_polars/cudf_polars/dsl/ir.py
+++ b/python/cudf_polars/cudf_polars/dsl/ir.py
@@ -433,7 +433,7 @@ def check_agg(agg: expr.Expr) -> int:
         NotImplementedError
             For unsupported expression nodes.
         """
-        if isinstance(agg, (expr.BinOp, expr.Cast)):
+        if isinstance(agg, (expr.BinOp, expr.Cast, expr.UnaryFunction)):
             return max(GroupBy.check_agg(child) for child in agg.children)
         elif isinstance(agg, expr.Agg):
             return 1 + max(GroupBy.check_agg(child) for child in agg.children)
diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py
index 0019b3aa98a..5a1e682abe7 100644
--- a/python/cudf_polars/cudf_polars/dsl/translate.py
+++ b/python/cudf_polars/cudf_polars/dsl/translate.py
@@ -361,8 +361,23 @@ def _(node: pl_expr.Function, visitor: NodeTraverser, dtype: plc.DataType) -> ex
             options,
             *(translate_expr(visitor, n=n) for n in node.input),
         )
-    else:
-        raise NotImplementedError(f"No handler for Expr function node with {name=}")
+    elif isinstance(name, pl_expr.TemporalFunction):
+        return expr.TemporalFunction(
+            dtype,
+            name,
+            options,
+            *(translate_expr(visitor, n=n) for n in node.input),
+        )
+    elif isinstance(name, str):
+        return expr.UnaryFunction(
+            dtype,
+            name,
+            options,
+            *(translate_expr(visitor, n=n) for n in node.input),
+        )
+    raise NotImplementedError(
+        f"No handler for Expr function node with {name=}"
+    )  # pragma: no cover; polars raises on the rust side for now
 
 
 @_translate_expr.register
diff --git a/python/cudf_polars/tests/expressions/test_datetime_basic.py b/python/cudf_polars/tests/expressions/test_datetime_basic.py
index 6ba2a1dce1e..218101bf87c 100644
--- a/python/cudf_polars/tests/expressions/test_datetime_basic.py
+++ b/python/cudf_polars/tests/expressions/test_datetime_basic.py
@@ -2,6 +2,9 @@
 # SPDX-License-Identifier: Apache-2.0
 from __future__ import annotations
 
+import datetime
+from operator import methodcaller
+
 import pytest
 
 import polars as pl
@@ -32,3 +35,28 @@ def test_datetime_dataframe_scan(dtype):
 
     query = ldf.select(pl.col("b"), pl.col("a"))
     assert_gpu_result_equal(query)
+
+
+@pytest.mark.parametrize(
+    "field",
+    [
+        methodcaller("year"),
+        pytest.param(
+            methodcaller("day"),
+            marks=pytest.mark.xfail(reason="day extraction not implemented"),
+        ),
+    ],
+)
+def test_datetime_extract(field):
+    ldf = pl.LazyFrame(
+        {"dates": [datetime.date(2024, 1, 1), datetime.date(2024, 10, 11)]}
+    )
+    q = ldf.select(field(pl.col("dates").dt))
+
+    with pytest.raises(AssertionError):
+        # polars produces int32, libcudf produces int16 for the year extraction
+        # libcudf can lose data here.
+        # https://github.com/rapidsai/cudf/issues/16196
+        assert_gpu_result_equal(q)
+
+    assert_gpu_result_equal(q, check_dtypes=False)
diff --git a/python/cudf_polars/tests/expressions/test_round.py b/python/cudf_polars/tests/expressions/test_round.py
new file mode 100644
index 00000000000..3af3a0ce6d1
--- /dev/null
+++ b/python/cudf_polars/tests/expressions/test_round.py
@@ -0,0 +1,32 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import math
+
+import pytest
+
+import polars as pl
+
+from cudf_polars.testing.asserts import assert_gpu_result_equal
+
+
+@pytest.fixture(params=[pl.Float32, pl.Float64])
+def dtype(request):
+    return request.param
+
+
+@pytest.fixture
+def df(dtype, with_nulls):
+    a = [-math.e, 10, 22.5, 1.5, 2.5, -1.5, math.pi, 8]
+    if with_nulls:
+        a[2] = None
+        a[-1] = None
+    return pl.LazyFrame({"a": a}, schema={"a": dtype})
+
+
+@pytest.mark.parametrize("decimals", [0, 2, 4])
+def test_round(df, decimals):
+    q = df.select(pl.col("a").round(decimals=decimals))
+
+    assert_gpu_result_equal(q, check_exact=False)
diff --git a/python/cudf_polars/tests/expressions/test_unique.py b/python/cudf_polars/tests/expressions/test_unique.py
new file mode 100644
index 00000000000..9b009a422c2
--- /dev/null
+++ b/python/cudf_polars/tests/expressions/test_unique.py
@@ -0,0 +1,24 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import pytest
+
+import polars as pl
+
+from cudf_polars.testing.asserts import assert_gpu_result_equal
+
+
+@pytest.mark.parametrize("maintain_order", [False, True], ids=["unstable", "stable"])
+@pytest.mark.parametrize("pre_sorted", [False, True], ids=["unsorted", "sorted"])
+def test_unique(maintain_order, pre_sorted):
+    ldf = pl.DataFrame(
+        {
+            "b": [1.5, 2.5, None, 1.5, 3, float("nan"), 3],
+        }
+    ).lazy()
+    if pre_sorted:
+        ldf = ldf.sort("b")
+
+    query = ldf.select(pl.col("b").unique(maintain_order=maintain_order))
+    assert_gpu_result_equal(query, check_row_order=maintain_order)
diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py
index 8a6732b7063..b84e2c16b43 100644
--- a/python/cudf_polars/tests/test_groupby.py
+++ b/python/cudf_polars/tests/test_groupby.py
@@ -47,6 +47,8 @@ def keys(request):
         [pl.col("float").max() - pl.col("int").min()],
         [pl.col("float").mean(), pl.col("int").std()],
         [(pl.col("float") - pl.lit(2)).max()],
+        [pl.col("float").sum().round(decimals=1)],
+        [pl.col("float").round(decimals=1).sum()],
     ],
     ids=lambda aggs: "-".join(map(str, aggs)),
 )

From a583c97ca977041e3cc3399739e29962982d6aad Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Fri, 5 Jul 2024 13:45:38 -0400
Subject: [PATCH 469/842] Fix cudf::strings::replace_multiple hang on empty
 target (#16167)

Fixes logic in `cudf::strings::replace_multiple` to ignore empty targets correctly in the `replace_multi_fn` functor.
Also updated the doxygen and added a gtest for this case.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16167
---
 cpp/include/cudf/strings/replace.hpp |  2 +-
 cpp/src/strings/replace/multi.cu     |  9 ++++-----
 cpp/tests/strings/replace_tests.cpp  | 17 +++++++++++++++++
 3 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/cpp/include/cudf/strings/replace.hpp b/cpp/include/cudf/strings/replace.hpp
index a19aa9be0c0..a714f762a19 100644
--- a/cpp/include/cudf/strings/replace.hpp
+++ b/cpp/include/cudf/strings/replace.hpp
@@ -122,7 +122,7 @@ std::unique_ptr<column> replace_slice(
  * If a target string is found, it is replaced by the corresponding entry in the repls column.
  * All occurrences found in each string are replaced.
  *
- * This does not use regex to match targets in the string.
+ * This does not use regex to match targets in the string. Empty string targets are ignored.
  *
  * Null string entries will return null output string entries.
  *
diff --git a/cpp/src/strings/replace/multi.cu b/cpp/src/strings/replace/multi.cu
index 43a3d69091a..2ca22f0e017 100644
--- a/cpp/src/strings/replace/multi.cu
+++ b/cpp/src/strings/replace/multi.cu
@@ -451,8 +451,8 @@ struct replace_multi_fn {
     while (spos < d_str.size_bytes()) {
       for (int tgt_idx = 0; tgt_idx < d_targets.size(); ++tgt_idx) {
         auto const d_tgt = d_targets.element<string_view>(tgt_idx);
-        if ((d_tgt.size_bytes() <= (d_str.size_bytes() - spos)) &&    // check fit
-            (d_tgt.compare(in_ptr + spos, d_tgt.size_bytes()) == 0))  // and match
+        if (!d_tgt.empty() && (d_tgt.size_bytes() <= (d_str.size_bytes() - spos)) &&  // check fit
+            (d_tgt.compare(in_ptr + spos, d_tgt.size_bytes()) == 0))                  // and match
         {
           auto const d_repl = (d_repls.size() == 1) ? d_repls.element<string_view>(0)
                                                     : d_repls.element<string_view>(tgt_idx);
@@ -468,9 +468,8 @@ struct replace_multi_fn {
       }
       ++spos;
     }
-    if (out_ptr)  // copy remainder
-    {
-      memcpy(out_ptr, in_ptr + lpos, d_str.size_bytes() - lpos);
+    if (out_ptr) {
+      memcpy(out_ptr, in_ptr + lpos, d_str.size_bytes() - lpos);  // copy remainder
     } else {
       d_sizes[idx] = bytes;
     }
diff --git a/cpp/tests/strings/replace_tests.cpp b/cpp/tests/strings/replace_tests.cpp
index 3aa7467d156..6c4afbb435a 100644
--- a/cpp/tests/strings/replace_tests.cpp
+++ b/cpp/tests/strings/replace_tests.cpp
@@ -532,6 +532,23 @@ TEST_F(StringsReplaceTest, ReplaceMultiLong)
   }
 }
 
+TEST_F(StringsReplaceTest, EmptyTarget)
+{
+  auto const input = cudf::test::strings_column_wrapper({"hello", "world", "", "accénted"});
+  auto const sv    = cudf::strings_column_view(input);
+
+  auto const targets = cudf::test::strings_column_wrapper({"e", "", "d"});
+  auto const tv      = cudf::strings_column_view(targets);
+
+  auto const repls = cudf::test::strings_column_wrapper({"E", "_", "D"});
+  auto const rv    = cudf::strings_column_view(repls);
+
+  // empty target should be ignored
+  auto results  = cudf::strings::replace_multiple(sv, tv, rv);
+  auto expected = cudf::test::strings_column_wrapper({"hEllo", "worlD", "", "accéntED"});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected);
+}
+
 TEST_F(StringsReplaceTest, EmptyStringsColumn)
 {
   auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view();

From f6b355d7761ee3ecc0b243f09dc0c1d3b214a7ad Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Fri, 5 Jul 2024 16:49:23 -0500
Subject: [PATCH 470/842] skip CMake 3.30.0 (#16202)

Contributes to https://github.com/rapidsai/build-planning/issues/80

Adds constraints to avoid pulling in CMake 3.30.0, for the reasons described in that issue.

Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16202
---
 conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +-
 conda/environments/all_cuda-122_arch-x86_64.yaml | 2 +-
 conda/recipes/cudf/conda_build_config.yaml       | 2 +-
 conda/recipes/cudf_kafka/conda_build_config.yaml | 2 +-
 conda/recipes/libcudf/conda_build_config.yaml    | 2 +-
 dependencies.yaml                                | 2 +-
 python/cudf/pyproject.toml                       | 2 +-
 python/cudf_kafka/pyproject.toml                 | 2 +-
 8 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index cc9238ab80a..b8d73a01f96 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -15,7 +15,7 @@ dependencies:
 - cachetools
 - clang-tools=16.0.6
 - clang==16.0.6
-- cmake>=3.26.4
+- cmake>=3.26.4,!=3.30.0
 - cramjam
 - cubinlinker
 - cuda-nvtx=11.8
diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml
index 9fecd452248..c32d21c5d36 100644
--- a/conda/environments/all_cuda-122_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-122_arch-x86_64.yaml
@@ -15,7 +15,7 @@ dependencies:
 - cachetools
 - clang-tools=16.0.6
 - clang==16.0.6
-- cmake>=3.26.4
+- cmake>=3.26.4,!=3.30.0
 - cramjam
 - cuda-cudart-dev
 - cuda-nvcc
diff --git a/conda/recipes/cudf/conda_build_config.yaml b/conda/recipes/cudf/conda_build_config.yaml
index d399e440edd..af894cccda0 100644
--- a/conda/recipes/cudf/conda_build_config.yaml
+++ b/conda/recipes/cudf/conda_build_config.yaml
@@ -11,7 +11,7 @@ c_stdlib_version:
   - "2.17"
 
 cmake_version:
-  - ">=3.26.4"
+  - ">=3.26.4,!=3.30.0"
 
 cuda_compiler:
   - cuda-nvcc
diff --git a/conda/recipes/cudf_kafka/conda_build_config.yaml b/conda/recipes/cudf_kafka/conda_build_config.yaml
index d399e440edd..af894cccda0 100644
--- a/conda/recipes/cudf_kafka/conda_build_config.yaml
+++ b/conda/recipes/cudf_kafka/conda_build_config.yaml
@@ -11,7 +11,7 @@ c_stdlib_version:
   - "2.17"
 
 cmake_version:
-  - ">=3.26.4"
+  - ">=3.26.4,!=3.30.0"
 
 cuda_compiler:
   - cuda-nvcc
diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml
index c01178bf732..4f99411e978 100644
--- a/conda/recipes/libcudf/conda_build_config.yaml
+++ b/conda/recipes/libcudf/conda_build_config.yaml
@@ -17,7 +17,7 @@ c_stdlib_version:
   - "2.17"
 
 cmake_version:
-  - ">=3.26.4"
+  - ">=3.26.4,!=3.30.0"
 
 libarrow_version:
   - "==16.1.0"
diff --git a/dependencies.yaml b/dependencies.yaml
index 6d4ba0c38d1..27621ff9a3f 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -243,7 +243,7 @@ dependencies:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
-          - &cmake_ver cmake>=3.26.4
+          - &cmake_ver cmake>=3.26.4,!=3.30.0
           - &ninja ninja
   build_all:
     common:
diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml
index 20b731624df..dcb33b1fc1a 100644
--- a/python/cudf/pyproject.toml
+++ b/python/cudf/pyproject.toml
@@ -121,7 +121,7 @@ skip = [
 build-backend = "scikit_build_core.build"
 dependencies-file = "../../dependencies.yaml"
 requires = [
-    "cmake>=3.26.4",
+    "cmake>=3.26.4,!=3.30.0",
     "cython>=3.0.3",
     "ninja",
     "numpy==1.23.*",
diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml
index 11e18cd4f32..badfdf06d15 100644
--- a/python/cudf_kafka/pyproject.toml
+++ b/python/cudf_kafka/pyproject.toml
@@ -101,7 +101,7 @@ regex = "(?P<value>.*)"
 build-backend = "scikit_build_core.build"
 dependencies-file = "../../dependencies.yaml"
 requires = [
-    "cmake>=3.26.4",
+    "cmake>=3.26.4,!=3.30.0",
     "cython>=3.0.3",
     "ninja",
     "numpy==1.23.*",

From d9a3728d37e0223afd9cfa525bd7ac8b43b39e63 Mon Sep 17 00:00:00 2001
From: Paul Taylor <178183+trxcllnt@users.noreply.github.com>
Date: Mon, 8 Jul 2024 09:18:30 -0700
Subject: [PATCH 471/842] Define PTDS for the stream hook libs (#16182)

We must define `CUDA_API_PER_THREAD_DEFAULT_STREAM` for the stream hook lib, since `cudaLaunchKernel` in CUDA 12.4+ is now a macro that expands to a different function when it's not defined.

Authors:
  - Paul Taylor (https://github.com/trxcllnt)

Approvers:
  - Robert Maynard (https://github.com/robertmaynard)

URL: https://github.com/rapidsai/cudf/pull/16182
---
 cpp/CMakeLists.txt | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 2811711d58c..7999ada9282 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -925,6 +925,11 @@ if(CUDF_BUILD_STREAMS_TEST_UTIL)
     add_library(
       ${_tgt} SHARED src/utilities/stacktrace.cpp tests/utilities/identify_stream_usage.cpp
     )
+    if(CUDF_USE_PER_THREAD_DEFAULT_STREAM)
+      target_compile_definitions(
+        ${_tgt} PUBLIC CUDA_API_PER_THREAD_DEFAULT_STREAM CUDF_USE_PER_THREAD_DEFAULT_STREAM
+      )
+    endif()
 
     set_target_properties(
       ${_tgt}

From 6169ee17d31669d8930576003bc3ebaadca8a1fa Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Mon, 8 Jul 2024 12:28:52 -0400
Subject: [PATCH 472/842] Add missing methods to lists/list_column_view.pxd in
 pylibcudf (#16175)

Apart of #15162

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16175
---
 .../cudf/_lib/pylibcudf/libcudf/lists/lists_column_view.pxd     | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/lists_column_view.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/lists_column_view.pxd
index fd21e7b334b..8917a6ac899 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/lists_column_view.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/lists_column_view.pxd
@@ -10,7 +10,9 @@ from cudf._lib.pylibcudf.libcudf.types cimport size_type
 cdef extern from "cudf/lists/lists_column_view.hpp" namespace "cudf" nogil:
     cdef cppclass lists_column_view(column_view):
         lists_column_view() except +
+        lists_column_view(const lists_column_view& lists_column) except +
         lists_column_view(const column_view& lists_column) except +
+        lists_column_view& operator=(const lists_column_view&) except +
         column_view parent() except +
         column_view offsets() except +
         column_view child() except +

From 036e0ef5b99fd6ea09061af45854d28e44d21212 Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Mon, 8 Jul 2024 10:06:13 -0700
Subject: [PATCH 473/842] Migrate JSON reader to pylibcudf (#15966)

Switches the JSON reader to use pylibcudf.
xref #15162

Authors:
  - Thomas Li (https://github.com/lithomas1)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15966
---
 python/cudf/cudf/_lib/io/utils.pxd            |   4 +
 python/cudf/cudf/_lib/io/utils.pyx            |  27 ++
 python/cudf/cudf/_lib/json.pyx                | 127 ++++----
 python/cudf/cudf/_lib/pylibcudf/io/json.pxd   |  23 +-
 python/cudf/cudf/_lib/pylibcudf/io/json.pyx   | 122 +++++++-
 python/cudf/cudf/_lib/pylibcudf/io/types.pxd  |   5 +
 python/cudf/cudf/_lib/pylibcudf/io/types.pyx  |  57 +++-
 .../_lib/pylibcudf/libcudf/CMakeLists.txt     |   1 +
 .../_lib/pylibcudf/libcudf/io/CMakeLists.txt  |  26 ++
 .../cudf/_lib/pylibcudf/libcudf/io/json.pxd   |   8 +-
 .../cudf/_lib/pylibcudf/libcudf/io/json.pyx   |   0
 .../cudf/_lib/pylibcudf/libcudf/io/types.pyx  |   0
 python/cudf/cudf/_lib/utils.pyx               |   2 +-
 .../cudf/cudf/pylibcudf_tests/common/utils.py |  84 +++++-
 python/cudf/cudf/pylibcudf_tests/conftest.py  |   5 +
 .../cudf/cudf/pylibcudf_tests/io/test_avro.py |   2 +-
 .../cudf/cudf/pylibcudf_tests/io/test_json.py | 275 +++++++++++++++++-
 python/cudf/cudf/tests/test_json.py           |   7 +-
 18 files changed, 674 insertions(+), 101 deletions(-)
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/libcudf/io/CMakeLists.txt
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pyx
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pyx

diff --git a/python/cudf/cudf/_lib/io/utils.pxd b/python/cudf/cudf/_lib/io/utils.pxd
index 252d986843a..680a87c789e 100644
--- a/python/cudf/cudf/_lib/io/utils.pxd
+++ b/python/cudf/cudf/_lib/io/utils.pxd
@@ -16,6 +16,10 @@ cdef source_info make_source_info(list src) except*
 cdef sink_info make_sinks_info(
     list src, vector[unique_ptr[data_sink]] & data) except*
 cdef sink_info make_sink_info(src, unique_ptr[data_sink] & data) except*
+cdef add_df_col_struct_names(
+    df,
+    child_names_dict
+)
 cdef update_struct_field_names(
     table,
     vector[column_name_info]& schema_info)
diff --git a/python/cudf/cudf/_lib/io/utils.pyx b/python/cudf/cudf/_lib/io/utils.pyx
index 1d7c56888d9..58956b9e9b7 100644
--- a/python/cudf/cudf/_lib/io/utils.pyx
+++ b/python/cudf/cudf/_lib/io/utils.pyx
@@ -147,10 +147,37 @@ cdef cppclass iobase_data_sink(data_sink):
         return buf.tell()
 
 
+cdef add_df_col_struct_names(df, child_names_dict):
+    for name, child_names in child_names_dict.items():
+        col = df._data[name]
+
+        df._data[name] = update_col_struct_field_names(col, child_names)
+
+
+cdef update_col_struct_field_names(Column col, child_names):
+    if col.children:
+        children = list(col.children)
+        for i, (child, names) in enumerate(zip(children, child_names.values())):
+            children[i] = update_col_struct_field_names(
+                child,
+                names
+            )
+        col.set_base_children(tuple(children))
+
+    if isinstance(col.dtype, StructDtype):
+        col = col._rename_fields(
+            child_names.keys()
+        )
+
+    return col
+
+
 cdef update_struct_field_names(
     table,
     vector[column_name_info]& schema_info
 ):
+    # Deprecated, remove in favor of add_col_struct_names
+    # when a reader is ported to pylibcudf
     for i, (name, col) in enumerate(table._data.items()):
         table._data[name] = update_column_struct_field_names(
             col, schema_info[i]
diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx
index 22e34feb547..9c646e3357b 100644
--- a/python/cudf/cudf/_lib/json.pyx
+++ b/python/cudf/cudf/_lib/json.pyx
@@ -8,26 +8,16 @@ import cudf
 from cudf.core.buffer import acquire_spill_lock
 
 from libcpp cimport bool
-from libcpp.map cimport map
-from libcpp.string cimport string
-from libcpp.utility cimport move
-from libcpp.vector cimport vector
 
 cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types
-from cudf._lib.io.utils cimport make_source_info, update_struct_field_names
-from cudf._lib.pylibcudf.libcudf.io.json cimport (
-    json_reader_options,
-    json_recovery_mode_t,
-    read_json as libcudf_read_json,
-    schema_element,
-)
-from cudf._lib.pylibcudf.libcudf.io.types cimport (
-    compression_type,
-    table_with_metadata,
-)
-from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type
+from cudf._lib.io.utils cimport add_df_col_struct_names
+from cudf._lib.pylibcudf.io.types cimport compression_type
+from cudf._lib.pylibcudf.libcudf.io.json cimport json_recovery_mode_t
+from cudf._lib.pylibcudf.libcudf.io.types cimport compression_type
+from cudf._lib.pylibcudf.libcudf.types cimport data_type, type_id
+from cudf._lib.pylibcudf.types cimport DataType
 from cudf._lib.types cimport dtype_to_data_type
-from cudf._lib.utils cimport data_from_unique_ptr
+from cudf._lib.utils cimport data_from_pylibcudf_io
 
 import cudf._lib.pylibcudf as plc
 
@@ -62,6 +52,7 @@ cpdef read_json(object filepaths_or_buffers,
     # If input data is a JSON string (or StringIO), hold a reference to
     # the encoded memoryview externally to ensure the encoded buffer
     # isn't destroyed before calling libcudf `read_json()`
+
     for idx in range(len(filepaths_or_buffers)):
         if isinstance(filepaths_or_buffers[idx], io.StringIO):
             filepaths_or_buffers[idx] = \
@@ -71,17 +62,7 @@ cpdef read_json(object filepaths_or_buffers,
             filepaths_or_buffers[idx] = filepaths_or_buffers[idx].encode()
 
     # Setup arguments
-    cdef vector[data_type] c_dtypes_list
-    cdef map[string, schema_element] c_dtypes_schema_map
     cdef cudf_io_types.compression_type c_compression
-    # Determine byte read offsets if applicable
-    cdef size_type c_range_offset = (
-        byte_range[0] if byte_range is not None else 0
-    )
-    cdef size_type c_range_size = (
-        byte_range[1] if byte_range is not None else 0
-    )
-    cdef bool c_lines = lines
 
     if compression is not None:
         if compression == 'gzip':
@@ -94,56 +75,50 @@ cpdef read_json(object filepaths_or_buffers,
             c_compression = cudf_io_types.compression_type.AUTO
     else:
         c_compression = cudf_io_types.compression_type.NONE
-    is_list_like_dtypes = False
+
+    processed_dtypes = None
+
     if dtype is False:
         raise ValueError("False value is unsupported for `dtype`")
     elif dtype is not True:
+        processed_dtypes = []
         if isinstance(dtype, abc.Mapping):
             for k, v in dtype.items():
-                c_dtypes_schema_map[str(k).encode()] = \
-                    _get_cudf_schema_element_from_dtype(v)
+                # Make sure keys are string
+                k = str(k)
+                lib_type, child_types = _get_cudf_schema_element_from_dtype(v)
+                processed_dtypes.append((k, lib_type, child_types))
         elif isinstance(dtype, abc.Collection):
-            is_list_like_dtypes = True
-            c_dtypes_list.reserve(len(dtype))
             for col_dtype in dtype:
-                c_dtypes_list.push_back(
-                    _get_cudf_data_type_from_dtype(
-                        col_dtype))
+                processed_dtypes.append(
+                    # Ignore child columns since we cannot specify their dtypes
+                    # when passing a list
+                    _get_cudf_schema_element_from_dtype(col_dtype)[0]
+                )
         else:
             raise TypeError("`dtype` must be 'list like' or 'dict'")
 
-    cdef json_reader_options opts = move(
-        json_reader_options.builder(make_source_info(filepaths_or_buffers))
-        .compression(c_compression)
-        .lines(c_lines)
-        .byte_range_offset(c_range_offset)
-        .byte_range_size(c_range_size)
-        .recovery_mode(_get_json_recovery_mode(on_bad_lines))
-        .build()
+    table_w_meta = plc.io.json.read_json(
+        plc.io.SourceInfo(filepaths_or_buffers),
+        processed_dtypes,
+        c_compression,
+        lines,
+        byte_range_offset = byte_range[0] if byte_range is not None else 0,
+        byte_range_size = byte_range[1] if byte_range is not None else 0,
+        keep_quotes = keep_quotes,
+        mixed_types_as_string = mixed_types_as_string,
+        prune_columns = prune_columns,
+        recovery_mode = _get_json_recovery_mode(on_bad_lines)
     )
-    if is_list_like_dtypes:
-        opts.set_dtypes(c_dtypes_list)
-    else:
-        opts.set_dtypes(c_dtypes_schema_map)
-
-    opts.enable_keep_quotes(keep_quotes)
-    opts.enable_mixed_types_as_string(mixed_types_as_string)
-    opts.enable_prune_columns(prune_columns)
-
-    # Read JSON
-    cdef cudf_io_types.table_with_metadata c_result
 
-    with nogil:
-        c_result = move(libcudf_read_json(opts))
-
-    meta_names = [info.name.decode() for info in c_result.metadata.schema_info]
-    df = cudf.DataFrame._from_data(*data_from_unique_ptr(
-        move(c_result.tbl),
-        column_names=meta_names
-    ))
-
-    update_struct_field_names(df, c_result.metadata.schema_info)
+    df = cudf.DataFrame._from_data(
+        *data_from_pylibcudf_io(
+            table_w_meta
+        )
+    )
 
+    # Post-processing to add in struct column names
+    add_df_col_struct_names(df, table_w_meta.child_names)
     return df
 
 
@@ -192,28 +167,32 @@ def write_json(
         )
 
 
-cdef schema_element _get_cudf_schema_element_from_dtype(object dtype) except *:
-    cdef schema_element s_element
-    cdef data_type lib_type
+cdef _get_cudf_schema_element_from_dtype(object dtype) except *:
     dtype = cudf.dtype(dtype)
     if isinstance(dtype, cudf.CategoricalDtype):
         raise NotImplementedError(
             "CategoricalDtype as dtype is not yet "
             "supported in JSON reader"
         )
-    lib_type = dtype_to_data_type(dtype)
-    s_element.type = lib_type
+
+    lib_type = DataType.from_libcudf(dtype_to_data_type(dtype))
+    child_types = []
+
     if isinstance(dtype, cudf.StructDtype):
         for name, child_type in dtype.fields.items():
-            s_element.child_types[name.encode()] = \
+            child_lib_type, grandchild_types = \
                 _get_cudf_schema_element_from_dtype(child_type)
+            child_types.append((name, child_lib_type, grandchild_types))
     elif isinstance(dtype, cudf.ListDtype):
-        s_element.child_types["offsets".encode()] = \
-            _get_cudf_schema_element_from_dtype(cudf.dtype("int32"))
-        s_element.child_types["element".encode()] = \
+        child_lib_type, grandchild_types = \
             _get_cudf_schema_element_from_dtype(dtype.element_type)
 
-    return s_element
+        child_types = [
+            ("offsets", DataType.from_libcudf(data_type(type_id.INT32)), []),
+            ("element", child_lib_type, grandchild_types)
+        ]
+
+    return lib_type, child_types
 
 
 cdef data_type _get_cudf_data_type_from_dtype(object dtype) except *:
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/json.pxd b/python/cudf/cudf/_lib/pylibcudf/io/json.pxd
index a91d574131f..f7f733a493d 100644
--- a/python/cudf/cudf/_lib/pylibcudf/io/json.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/io/json.pxd
@@ -1,11 +1,30 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
-
 from libcpp cimport bool
 
-from cudf._lib.pylibcudf.io.types cimport SinkInfo, TableWithMetadata
+from cudf._lib.pylibcudf.io.types cimport (
+    SinkInfo,
+    SourceInfo,
+    TableWithMetadata,
+    compression_type,
+)
+from cudf._lib.pylibcudf.libcudf.io.json cimport json_recovery_mode_t
 from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 
+cpdef TableWithMetadata read_json(
+    SourceInfo source_info,
+    list dtypes = *,
+    compression_type compression = *,
+    bool lines = *,
+    size_type byte_range_offset = *,
+    size_type byte_range_size = *,
+    bool keep_quotes = *,
+    bool mixed_types_as_string = *,
+    bool prune_columns = *,
+    json_recovery_mode_t recovery_mode = *,
+)
+
+
 cpdef void write_json(
     SinkInfo sink_info,
     TableWithMetadata tbl,
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/json.pyx b/python/cudf/cudf/_lib/pylibcudf/io/json.pyx
index 7530eba3803..354cb4981de 100644
--- a/python/cudf/cudf/_lib/pylibcudf/io/json.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/io/json.pyx
@@ -1,16 +1,130 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
-
 from libcpp cimport bool
 from libcpp.limits cimport numeric_limits
+from libcpp.map cimport map
 from libcpp.string cimport string
+from libcpp.utility cimport move
+from libcpp.vector cimport vector
 
-from cudf._lib.pylibcudf.io.types cimport SinkInfo, TableWithMetadata
+from cudf._lib.pylibcudf.io.types cimport (
+    SinkInfo,
+    SourceInfo,
+    TableWithMetadata,
+)
 from cudf._lib.pylibcudf.libcudf.io.json cimport (
+    json_reader_options,
+    json_recovery_mode_t,
     json_writer_options,
+    read_json as cpp_read_json,
+    schema_element,
     write_json as cpp_write_json,
 )
-from cudf._lib.pylibcudf.libcudf.io.types cimport table_metadata
-from cudf._lib.pylibcudf.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.io.types cimport (
+    compression_type,
+    table_metadata,
+    table_with_metadata,
+)
+from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type
+from cudf._lib.pylibcudf.types cimport DataType
+
+
+cdef map[string, schema_element] _generate_schema_map(list dtypes):
+    cdef map[string, schema_element] schema_map
+    cdef schema_element s_elem
+    cdef string c_name
+
+    for name, dtype, child_dtypes in dtypes:
+        if not (isinstance(name, str) and
+                isinstance(dtype, DataType) and
+                isinstance(child_dtypes, list)):
+
+            raise ValueError("Must pass a list of a tuple containing "
+                             "(column_name, column_dtype, list of child_dtypes)")
+
+        c_name = <str>name.encode()
+
+        s_elem.type = (<DataType>dtype).c_obj
+        s_elem.child_types = _generate_schema_map(child_dtypes)
+
+        schema_map[c_name] = s_elem
+    return schema_map
+
+
+cpdef TableWithMetadata read_json(
+    SourceInfo source_info,
+    list dtypes = None,
+    compression_type compression = compression_type.AUTO,
+    bool lines = False,
+    size_type byte_range_offset = 0,
+    size_type byte_range_size = 0,
+    bool keep_quotes = False,
+    bool mixed_types_as_string = False,
+    bool prune_columns = False,
+    json_recovery_mode_t recovery_mode = json_recovery_mode_t.FAIL,
+):
+    """Reads an JSON file into a :py:class:`~.types.TableWithMetadata`.
+
+    Parameters
+    ----------
+    source_info : SourceInfo
+        The SourceInfo object to read the JSON file from.
+    dtypes : list, default None
+        Set data types for the columns in the JSON file.
+
+        Each element of the list has the format
+        (column_name, column_dtype, list of child dtypes), where
+        the list of child dtypes is an empty list if the child is not
+        a nested type (list or struct dtype), and is of format
+        (column_child_name, column_child_type, list of grandchild dtypes).
+    compression_type: CompressionType, default CompressionType.AUTO
+        The compression format of the JSON source.
+    byte_range_offset : size_type, default 0
+        Number of bytes to skip from source start.
+    byte_range_size : size_type, default 0
+        Number of bytes to read. By default, will read all bytes.
+    keep_quotes : bool, default False
+        Whether the reader should keep quotes of string values.
+    prune_columns : bool, default False
+        Whether to only read columns specified in dtypes.
+    recover_mode : JSONRecoveryMode, default JSONRecoveryMode.FAIL
+        Whether to raise an error or set corresponding values to null
+        when encountering an invalid JSON line.
+
+    Returns
+    -------
+    TableWithMetadata
+        The Table and its corresponding metadata (column names) that were read in.
+    """
+    cdef vector[data_type] types_vec
+    cdef json_reader_options opts = move(
+        json_reader_options.builder(source_info.c_obj)
+        .compression(compression)
+        .lines(lines)
+        .byte_range_offset(byte_range_offset)
+        .byte_range_size(byte_range_size)
+        .recovery_mode(recovery_mode)
+        .build()
+    )
+
+    if dtypes is not None:
+        if isinstance(dtypes[0], tuple):
+            opts.set_dtypes(move(_generate_schema_map(dtypes)))
+        else:
+            for dtype in dtypes:
+                types_vec.push_back((<DataType>dtype).c_obj)
+            opts.set_dtypes(types_vec)
+
+    opts.enable_keep_quotes(keep_quotes)
+    opts.enable_mixed_types_as_string(mixed_types_as_string)
+    opts.enable_prune_columns(prune_columns)
+
+    # Read JSON
+    cdef table_with_metadata c_result
+
+    with nogil:
+        c_result = move(cpp_read_json(opts))
+
+    return TableWithMetadata.from_libcudf(c_result)
 
 
 cpdef void write_json(
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pxd b/python/cudf/cudf/_lib/pylibcudf/io/types.pxd
index 88daf54f33b..ab223c16a72 100644
--- a/python/cudf/cudf/_lib/pylibcudf/io/types.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/io/types.pxd
@@ -28,6 +28,11 @@ cdef class TableWithMetadata:
 
     cdef vector[column_name_info] _make_column_info(self, list column_names)
 
+    cdef list _make_columns_list(self, dict child_dict)
+
+    @staticmethod
+    cdef dict _parse_col_names(vector[column_name_info] infos)
+
     @staticmethod
     cdef TableWithMetadata from_libcudf(table_with_metadata& tbl)
 
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx
index f94e20970a4..df0b729b711 100644
--- a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx
@@ -22,6 +22,11 @@ import errno
 import io
 import os
 
+from cudf._lib.pylibcudf.libcudf.io.json import \
+    json_recovery_mode_t as JSONRecoveryMode  # no-cython-lint
+from cudf._lib.pylibcudf.libcudf.io.types import \
+    compression_type as CompressionType  # no-cython-lint
+
 
 cdef class TableWithMetadata:
     """A container holding a table and its associated metadata
@@ -69,16 +74,44 @@ cdef class TableWithMetadata:
         """
         return self.tbl.columns()
 
-    @property
-    def column_names(self):
+    cdef list _make_columns_list(self, dict child_dict):
+        cdef list names = []
+        for child in child_dict:
+            grandchildren = self._make_columns_list(child_dict[child])
+            names.append((child, grandchildren))
+        return names
+
+    def column_names(self, include_children=False):
         """
         Return a list containing the column names of the table
         """
         cdef list names = []
+        cdef str name
+        cdef dict child_names = self.child_names
         for col_info in self.metadata.schema_info:
-            # TODO: Handle nesting (columns with child columns)
-            assert col_info.children.size() == 0, "Child column names are not handled!"
-            names.append(col_info.name.decode())
+            name = col_info.name.decode()
+            if include_children:
+                children = self._make_columns_list(child_names[name])
+                names.append((name, children))
+            else:
+                names.append(name)
+        return names
+
+    @property
+    def child_names(self):
+        """
+        Return a dictionary mapping the names of columns with children
+        to the names of their child columns
+        """
+        return TableWithMetadata._parse_col_names(self.metadata.schema_info)
+
+    @staticmethod
+    cdef dict _parse_col_names(vector[column_name_info] infos):
+        cdef dict child_names = dict()
+        cdef dict names = dict()
+        for col_info in infos:
+            child_names = TableWithMetadata._parse_col_names(col_info.children)
+            names[col_info.name.decode()] = child_names
         return names
 
     @staticmethod
@@ -137,6 +170,15 @@ cdef class SourceInfo:
         cdef vector[host_buffer] c_host_buffers
         cdef const unsigned char[::1] c_buffer
         cdef bint empty_buffer = False
+        cdef list new_sources = []
+
+        if isinstance(sources[0], io.StringIO):
+            for buffer in sources:
+                if not isinstance(buffer, io.StringIO):
+                    raise ValueError("All sources must be of the same type!")
+                new_sources.append(buffer.read().encode())
+            sources = new_sources
+
         if isinstance(sources[0], bytes):
             empty_buffer = True
             for buffer in sources:
@@ -156,7 +198,10 @@ cdef class SourceInfo:
                                                      c_buffer.shape[0]))
         else:
             raise ValueError("Sources must be a list of str/paths, "
-                             "bytes, io.BytesIO, or a Datasource")
+                             "bytes, io.BytesIO, io.StringIO, or a Datasource")
+
+        if empty_buffer is True:
+            c_host_buffers.push_back(host_buffer(<char*>NULL, 0))
 
         self.c_obj = source_info(c_host_buffers)
 
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt
index 6c66d01ca57..699e85ce567 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt
@@ -22,4 +22,5 @@ rapids_cython_create_modules(
   SOURCE_FILES "${cython_sources}"
   LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cudf MODULE_PREFIX cpp
 )
+add_subdirectory(io)
 add_subdirectory(strings)
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/CMakeLists.txt
new file mode 100644
index 00000000000..6831063ecb9
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/CMakeLists.txt
@@ -0,0 +1,26 @@
+# =============================================================================
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+# =============================================================================
+
+set(cython_sources json.pyx types.pyx)
+
+set(linked_libraries cudf::cudf)
+
+rapids_cython_create_modules(
+  CXX
+  SOURCE_FILES "${cython_sources}"
+  LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cudf MODULE_PREFIX cpp_io_
+)
+
+set(targets_using_arrow_headers cpp_io_json cpp_io_types)
+link_to_pyarrow_headers("${targets_using_arrow_headers}")
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd
index 2e50cccd132..86621ae184f 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd
@@ -1,6 +1,6 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
-from libc.stdint cimport uint8_t
+from libc.stdint cimport int32_t, uint8_t
 from libcpp cimport bool
 from libcpp.map cimport map
 from libcpp.memory cimport shared_ptr, unique_ptr
@@ -19,9 +19,9 @@ cdef extern from "cudf/io/json.hpp" \
         data_type type
         map[string, schema_element] child_types
 
-    cdef enum json_recovery_mode_t:
-        FAIL "cudf::io::json_recovery_mode_t::FAIL"
-        RECOVER_WITH_NULL "cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL"
+    cpdef enum class json_recovery_mode_t(int32_t):
+        FAIL
+        RECOVER_WITH_NULL
 
     cdef cppclass json_reader_options:
         json_reader_options() except +
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pyx b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pyx
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pyx b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pyx
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx
index de6b9f690b6..f136cd997a7 100644
--- a/python/cudf/cudf/_lib/utils.pyx
+++ b/python/cudf/cudf/_lib/utils.pyx
@@ -322,7 +322,7 @@ cdef data_from_pylibcudf_io(tbl_with_meta):
     """
     return _data_from_columns(
         columns=[Column.from_pylibcudf(plc) for plc in tbl_with_meta.columns],
-        column_names=tbl_with_meta.column_names,
+        column_names=tbl_with_meta.column_names(include_children=False),
         index_names=None
     )
 
diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py
index d41e6c720bf..46603ff32b8 100644
--- a/python/cudf/cudf/pylibcudf_tests/common/utils.py
+++ b/python/cudf/cudf/pylibcudf_tests/common/utils.py
@@ -8,13 +8,14 @@
 import pytest
 
 from cudf._lib import pylibcudf as plc
+from cudf._lib.pylibcudf.io.types import CompressionType
 
 
 def metadata_from_arrow_type(
     pa_type: pa.Array,
     name: str = "",
 ) -> plc.interop.ColumnMetadata | None:
-    metadata = plc.interop.ColumnMetadata(name)  # None
+    metadata = plc.interop.ColumnMetadata(name)
     if pa.types.is_list(pa_type):
         child_meta = [plc.interop.ColumnMetadata("offsets")]
         for i in range(pa_type.num_fields):
@@ -39,9 +40,25 @@ def metadata_from_arrow_type(
 
 
 def assert_column_eq(
-    lhs: pa.Array | plc.Column, rhs: pa.Array | plc.Column
+    lhs: pa.Array | plc.Column,
+    rhs: pa.Array | plc.Column,
+    check_field_nullability=True,
 ) -> None:
-    """Verify that a pylibcudf array and PyArrow array are equal."""
+    """Verify that a pylibcudf array and PyArrow array are equal.
+
+    Parameters
+    ----------
+    lhs: Union[pa.Array, plc.Column]
+        The array with the expected values
+    rhs: Union[pa.Array, plc.Column]
+        The array to check
+    check_field_nullability:
+        For list/struct dtypes, whether to check if the nullable attributes
+        on child fields are equal.
+
+        Useful for checking roundtripping of lossy formats like JSON that may not
+        preserve this information.
+    """
     # Nested types require children metadata to be passed to the conversion function.
     if isinstance(lhs, (pa.Array, pa.ChunkedArray)) and isinstance(
         rhs, plc.Column
@@ -65,6 +82,33 @@ def assert_column_eq(
     if isinstance(rhs, pa.ChunkedArray):
         rhs = rhs.combine_chunks()
 
+    def _make_fields_nullable(typ):
+        new_fields = []
+        for i in range(typ.num_fields):
+            child_field = typ.field(i)
+            if not child_field.nullable:
+                child_type = child_field.type
+                if isinstance(child_field.type, (pa.StructType, pa.ListType)):
+                    child_type = _make_fields_nullable(child_type)
+                new_fields.append(
+                    pa.field(child_field.name, child_type, nullable=True)
+                )
+            else:
+                new_fields.append(child_field)
+
+        if isinstance(typ, pa.StructType):
+            return pa.struct(new_fields)
+        elif isinstance(typ, pa.ListType):
+            return pa.list_(new_fields[0])
+        return typ
+
+    if not check_field_nullability:
+        rhs_type = _make_fields_nullable(rhs.type)
+        rhs = rhs.cast(rhs_type)
+
+        lhs_type = _make_fields_nullable(lhs.type)
+        lhs = rhs.cast(lhs_type)
+
     assert lhs.equals(rhs)
 
 
@@ -78,20 +122,24 @@ def assert_table_eq(pa_table: pa.Table, plc_table: plc.Table) -> None:
 
 
 def assert_table_and_meta_eq(
-    plc_table_w_meta: plc.io.types.TableWithMetadata, pa_table: pa.Table
+    pa_table: pa.Table,
+    plc_table_w_meta: plc.io.types.TableWithMetadata,
+    check_field_nullability=True,
 ) -> None:
     """Verify that the pylibcudf TableWithMetadata and PyArrow table are equal"""
 
     plc_table = plc_table_w_meta.tbl
 
     plc_shape = (plc_table.num_rows(), plc_table.num_columns())
-    assert plc_shape == pa_table.shape
+    assert (
+        plc_shape == pa_table.shape
+    ), f"{plc_shape} is not equal to {pa_table.shape}"
 
     for plc_col, pa_col in zip(plc_table.columns(), pa_table.columns):
-        assert_column_eq(plc_col, pa_col)
+        assert_column_eq(pa_col, plc_col, check_field_nullability)
 
     # Check column name equality
-    assert plc_table_w_meta.column_names == pa_table.column_names
+    assert plc_table_w_meta.column_names() == pa_table.column_names
 
 
 def cudf_raises(expected_exception: BaseException, *args, **kwargs):
@@ -182,4 +230,26 @@ def sink_to_str(sink):
     + DEFAULT_PA_STRUCT_TESTING_TYPES
 )
 
+# Map pylibcudf compression types to pandas ones
+# Not all compression types map cleanly, read the comments to learn more!
+# If a compression type is unsupported, it maps to False.
+
+COMPRESSION_TYPE_TO_PANDAS = {
+    CompressionType.NONE: None,
+    # Users of this dict will have to special case
+    # AUTO
+    CompressionType.AUTO: None,
+    CompressionType.GZIP: "gzip",
+    CompressionType.BZIP2: "bz2",
+    CompressionType.ZIP: "zip",
+    CompressionType.XZ: "xz",
+    CompressionType.ZSTD: "zstd",
+    # Unsupported
+    CompressionType.ZLIB: False,
+    CompressionType.LZ4: False,
+    CompressionType.LZO: False,
+    # These only work for parquet
+    CompressionType.SNAPPY: "snappy",
+    CompressionType.BROTLI: "brotli",
+}
 ALL_PA_TYPES = DEFAULT_PA_TYPES
diff --git a/python/cudf/cudf/pylibcudf_tests/conftest.py b/python/cudf/cudf/pylibcudf_tests/conftest.py
index e4760ea7ac8..39832eb4bba 100644
--- a/python/cudf/cudf/pylibcudf_tests/conftest.py
+++ b/python/cudf/cudf/pylibcudf_tests/conftest.py
@@ -121,6 +121,11 @@ def source_or_sink(request, tmp_path):
         return fp_or_buf()
 
 
+@pytest.fixture(params=[opt for opt in plc.io.types.CompressionType])
+def compression_type(request):
+    return request.param
+
+
 @pytest.fixture(
     scope="session", params=[opt for opt in plc.types.Interpolation]
 )
diff --git a/python/cudf/cudf/pylibcudf_tests/io/test_avro.py b/python/cudf/cudf/pylibcudf_tests/io/test_avro.py
index d6cd86768cd..061d6792ce3 100644
--- a/python/cudf/cudf/pylibcudf_tests/io/test_avro.py
+++ b/python/cudf/cudf/pylibcudf_tests/io/test_avro.py
@@ -120,4 +120,4 @@ def test_read_avro(avro_dtypes, avro_dtype_data, row_opts, columns, nullable):
     if columns != []:
         expected = expected.select(columns)
 
-    assert_table_and_meta_eq(res, expected)
+    assert_table_and_meta_eq(expected, res)
diff --git a/python/cudf/cudf/pylibcudf_tests/io/test_json.py b/python/cudf/cudf/pylibcudf_tests/io/test_json.py
index d6b8bfa6976..c13eaf40625 100644
--- a/python/cudf/cudf/pylibcudf_tests/io/test_json.py
+++ b/python/cudf/cudf/pylibcudf_tests/io/test_json.py
@@ -1,11 +1,49 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 import io
 
+import pandas as pd
 import pyarrow as pa
 import pytest
-from utils import sink_to_str
+from utils import (
+    COMPRESSION_TYPE_TO_PANDAS,
+    assert_table_and_meta_eq,
+    sink_to_str,
+)
 
 import cudf._lib.pylibcudf as plc
+from cudf._lib.pylibcudf.io.types import CompressionType
+
+
+def make_json_source(path_or_buf, pa_table, **kwargs):
+    """
+    Uses pandas to write a pyarrow Table to a JSON file.
+
+    The caller is responsible for making sure that no arguments
+    unsupported by pandas are passed in.
+    """
+    df = pa_table.to_pandas()
+    if "compression" in kwargs:
+        kwargs["compression"] = COMPRESSION_TYPE_TO_PANDAS[
+            kwargs["compression"]
+        ]
+    df.to_json(path_or_buf, orient="records", **kwargs)
+    if isinstance(path_or_buf, io.IOBase):
+        path_or_buf.seek(0)
+    return path_or_buf
+
+
+def write_json_bytes(source, json_str):
+    """
+    Write a JSON string to the source
+    """
+    if not isinstance(source, io.IOBase):
+        with open(source, "w") as source_f:
+            source_f.write(json_str)
+    else:
+        if isinstance(source, io.BytesIO):
+            json_str = json_str.encode("utf-8")
+        source.write(json_str)
+        source.seek(0)
 
 
 @pytest.mark.parametrize("rows_per_chunk", [8, 100])
@@ -114,3 +152,238 @@ def test_write_json_bool_opts(true_value, false_value):
         pd_result = pd_result.replace("false", false_value)
 
     assert str_result == pd_result
+
+
+@pytest.mark.parametrize("lines", [True, False])
+def test_read_json_basic(
+    table_data, source_or_sink, lines, compression_type, request
+):
+    if compression_type in {
+        # Not supported by libcudf
+        CompressionType.SNAPPY,
+        CompressionType.XZ,
+        CompressionType.ZSTD,
+        # Not supported by pandas
+        # TODO: find a way to test these
+        CompressionType.BROTLI,
+        CompressionType.LZ4,
+        CompressionType.LZO,
+        CompressionType.ZLIB,
+    }:
+        pytest.skip("unsupported compression type by pandas/libcudf")
+
+    # can't compress non-binary data with pandas
+    if isinstance(source_or_sink, io.StringIO):
+        compression_type = CompressionType.NONE
+
+    _, pa_table = table_data
+
+    source = make_json_source(
+        source_or_sink, pa_table, lines=lines, compression=compression_type
+    )
+
+    request.applymarker(
+        pytest.mark.xfail(
+            condition=(
+                len(pa_table) > 0
+                and compression_type
+                not in {CompressionType.NONE, CompressionType.AUTO}
+            ),
+            # note: wasn't able to narrow down the specific types that were failing
+            # seems to be a little non-deterministic, but always fails with
+            # cudaErrorInvalidValue invalid argument
+            reason="libcudf json reader crashes on compressed non empty table_data",
+        )
+    )
+
+    if isinstance(source, io.IOBase):
+        source.seek(0)
+
+    res = plc.io.json.read_json(
+        plc.io.SourceInfo([source]),
+        compression=compression_type,
+        lines=lines,
+    )
+
+    # Adjustments to correct for the fact orient=records is lossy
+    #  and doesn't
+    # 1) preserve colnames when zero rows in table
+    # 2) preserve struct nullability
+    # 3) differentiate int64/uint64
+    if len(pa_table) == 0:
+        pa_table = pa.table([])
+
+    new_fields = []
+    for i in range(len(pa_table.schema)):
+        curr_field = pa_table.schema.field(i)
+        if curr_field.type == pa.uint64():
+            try:
+                curr_field = curr_field.with_type(pa.int64())
+            except OverflowError:
+                # There will be no confusion, values are too large
+                # for int64 anyways
+                pass
+        new_fields.append(curr_field)
+
+    pa_table = pa_table.cast(pa.schema(new_fields))
+
+    # Convert non-nullable struct fields to nullable fields
+    # since nullable=False cannot roundtrip through orient='records'
+    # JSON format
+    assert_table_and_meta_eq(pa_table, res, check_field_nullability=False)
+
+
+def test_read_json_dtypes(table_data, source_or_sink):
+    # Simple test for dtypes where we read in
+    # all numeric data as floats
+    _, pa_table = table_data
+    source = make_json_source(
+        source_or_sink,
+        pa_table,
+        lines=True,
+    )
+
+    dtypes = []
+    new_fields = []
+    for i in range(len(pa_table.schema)):
+        field = pa_table.schema.field(i)
+        child_types = []
+
+        def get_child_types(typ):
+            typ_child_types = []
+            for i in range(typ.num_fields):
+                curr_field = typ.field(i)
+                typ_child_types.append(
+                    (
+                        curr_field.name,
+                        curr_field.type,
+                        get_child_types(curr_field.type),
+                    )
+                )
+            return typ_child_types
+
+        plc_type = plc.interop.from_arrow(field.type)
+        if pa.types.is_integer(field.type) or pa.types.is_unsigned_integer(
+            field.type
+        ):
+            plc_type = plc.interop.from_arrow(pa.float64())
+            field = field.with_type(pa.float64())
+
+        dtypes.append((field.name, plc_type, child_types))
+
+        new_fields.append(field)
+
+    new_schema = pa.schema(new_fields)
+
+    res = plc.io.json.read_json(
+        plc.io.SourceInfo([source]), dtypes=dtypes, lines=True
+    )
+    new_table = pa_table.cast(new_schema)
+
+    # orient=records is lossy
+    # and doesn't preserve column names when there's zero rows in the table
+    if len(new_table) == 0:
+        new_table = pa.table([])
+
+    assert_table_and_meta_eq(new_table, res, check_field_nullability=False)
+
+
+@pytest.mark.parametrize("chunk_size", [10, 15, 20])
+def test_read_json_lines_byte_range(source_or_sink, chunk_size):
+    source = source_or_sink
+    if isinstance(source_or_sink, io.StringIO):
+        pytest.skip("byte_range doesn't work on StringIO")
+
+    json_str = "[1, 2, 3]\n[4, 5, 6]\n[7, 8, 9]\n"
+    write_json_bytes(source, json_str)
+
+    tbls_w_meta = []
+    for chunk_start in range(0, len(json_str.encode("utf-8")), chunk_size):
+        tbls_w_meta.append(
+            plc.io.json.read_json(
+                plc.io.SourceInfo([source]),
+                lines=True,
+                byte_range_offset=chunk_start,
+                byte_range_size=chunk_start + chunk_size,
+            )
+        )
+
+    if isinstance(source, io.IOBase):
+        source.seek(0)
+    exp = pd.read_json(source, orient="records", lines=True)
+
+    # TODO: can do this operation using pylibcudf
+    tbls = []
+    for tbl_w_meta in tbls_w_meta:
+        if tbl_w_meta.tbl.num_rows() > 0:
+            tbls.append(plc.interop.to_arrow(tbl_w_meta.tbl))
+    full_tbl = pa.concat_tables(tbls)
+
+    full_tbl_plc = plc.io.TableWithMetadata(
+        plc.interop.from_arrow(full_tbl),
+        tbls_w_meta[0].column_names(include_children=True),
+    )
+    assert_table_and_meta_eq(pa.Table.from_pandas(exp), full_tbl_plc)
+
+
+@pytest.mark.parametrize("keep_quotes", [True, False])
+def test_read_json_lines_keep_quotes(keep_quotes, source_or_sink):
+    source = source_or_sink
+
+    json_bytes = '["a", "b", "c"]\n'
+    write_json_bytes(source, json_bytes)
+
+    tbl_w_meta = plc.io.json.read_json(
+        plc.io.SourceInfo([source]), lines=True, keep_quotes=keep_quotes
+    )
+
+    template = "{0}"
+    if keep_quotes:
+        template = '"{0}"'
+
+    exp = pa.Table.from_arrays(
+        [
+            [template.format("a")],
+            [template.format("b")],
+            [template.format("c")],
+        ],
+        names=["0", "1", "2"],
+    )
+
+    assert_table_and_meta_eq(exp, tbl_w_meta)
+
+
+@pytest.mark.parametrize(
+    "recovery_mode", [opt for opt in plc.io.types.JSONRecoveryMode]
+)
+def test_read_json_lines_recovery_mode(recovery_mode, source_or_sink):
+    source = source_or_sink
+
+    json_bytes = '{"a":1,"b":10}\n{"a":2,"b":11}\nabc\n{"a":3,"b":12}\n'
+    write_json_bytes(source, json_bytes)
+
+    if recovery_mode == plc.io.types.JSONRecoveryMode.FAIL:
+        with pytest.raises(RuntimeError):
+            plc.io.json.read_json(
+                plc.io.SourceInfo([source]),
+                lines=True,
+                recovery_mode=recovery_mode,
+            )
+    else:
+        # Recover case (bad values replaced with nulls)
+        tbl_w_meta = plc.io.json.read_json(
+            plc.io.SourceInfo([source]),
+            lines=True,
+            recovery_mode=recovery_mode,
+        )
+        exp = pa.Table.from_arrays(
+            [[1, 2, None, 3], [10, 11, None, 12]], names=["a", "b"]
+        )
+        assert_table_and_meta_eq(exp, tbl_w_meta)
+
+
+# TODO: Add tests for these!
+# Tests were not added in the initial PR porting the JSON reader to pylibcudf
+# to save time (and since there are no existing tests for these in Python cuDF)
+# mixed_types_as_string = mixed_types_as_string,
+# prune_columns = prune_columns,
diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py
index 297040b6d95..9222f6d23db 100644
--- a/python/cudf/cudf/tests/test_json.py
+++ b/python/cudf/cudf/tests/test_json.py
@@ -1077,8 +1077,13 @@ def test_json_dtypes_nested_data():
     )
 
     pdf = pd.read_json(
-        StringIO(expected_json_str), orient="records", lines=True
+        StringIO(expected_json_str),
+        orient="records",
+        lines=True,
     )
+
+    assert_eq(df, pdf)
+
     pdf.columns = pdf.columns.astype("str")
     pa_table_pdf = pa.Table.from_pandas(
         pdf, schema=df.to_arrow().schema, safe=False

From 2664427d5eb427cb4c7682d51a37fde71f7c6c8f Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Mon, 8 Jul 2024 14:00:30 -0400
Subject: [PATCH 474/842] Add single offset to an empty ListArray in
 cudf::to_arrow (#16201)

Closes #16164

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Nghia Truong (https://github.com/ttnghia)
  - Jayjeet Chakraborty (https://github.com/JayjeetAtGithub)

URL: https://github.com/rapidsai/cudf/pull/16201
---
 cpp/src/interop/to_arrow.cu | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu
index 2b3aa2f08f1..62b85891adb 100644
--- a/cpp/src/interop/to_arrow.cu
+++ b/cpp/src/interop/to_arrow.cu
@@ -376,7 +376,12 @@ std::shared_ptr<arrow::Array> dispatch_to_arrow::operator()<cudf::list_view>(
     metadata.children_meta.empty() ? std::vector<column_metadata>{{}, {}} : metadata.children_meta;
   auto child_arrays = fetch_child_array(input_view, children_meta, ar_mr, stream);
   if (child_arrays.empty()) {
-    return std::make_shared<arrow::ListArray>(arrow::list(arrow::null()), 0, nullptr, nullptr);
+    // Empty list will have only one value in offset of 4 bytes
+    auto tmp_offset_buffer = allocate_arrow_buffer(sizeof(int32_t), ar_mr);
+    memset(tmp_offset_buffer->mutable_data(), 0, sizeof(int32_t));
+
+    return std::make_shared<arrow::ListArray>(
+      arrow::list(arrow::null()), 0, std::move(tmp_offset_buffer), nullptr);
   }
 
   auto offset_buffer = child_arrays[0]->data()->buffers[1];

From e9cb7dd7d3d9b810c4575cbdbead8148d85e990f Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 8 Jul 2024 08:36:34 -1000
Subject: [PATCH 475/842] Support at/iat indexers in cudf.pandas (#16177)

closes #16112

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16177
---
 python/cudf/cudf/core/dataframe.py            | 12 ++++++++++--
 python/cudf/cudf/pandas/_wrappers/pandas.py   | 12 ++++++++++++
 .../cudf_pandas_tests/test_cudf_pandas.py     | 19 +++++++++++++++++++
 3 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index b249410c2e4..3e5ff9c18b5 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -462,6 +462,10 @@ def _setitem_tuple_arg(self, key, value):
                             self._frame[col].loc[key[0]] = value[i]
 
 
+class _DataFrameAtIndexer(_DataFrameLocIndexer):
+    pass
+
+
 class _DataFrameIlocIndexer(_DataFrameIndexer):
     """
     For selection by index.
@@ -584,6 +588,10 @@ def _setitem_tuple_arg(self, key, value):
                         self._frame[col].iloc[key[0]] = value[i]
 
 
+class _DataFrameiAtIndexer(_DataFrameIlocIndexer):
+    pass
+
+
 class DataFrame(IndexedFrame, Serializable, GetAttrGetItemMixin):
     """
     A GPU Dataframe object.
@@ -2581,14 +2589,14 @@ def iat(self):
         """
         Alias for ``DataFrame.iloc``; provided for compatibility with Pandas.
         """
-        return self.iloc
+        return _DataFrameiAtIndexer(self)
 
     @property
     def at(self):
         """
         Alias for ``DataFrame.loc``; provided for compatibility with Pandas.
         """
-        return self.loc
+        return _DataFrameAtIndexer(self)
 
     @property  # type: ignore
     @_external_only_api(
diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py
index a64bf7772fe..dd6f6fe76ba 100644
--- a/python/cudf/cudf/pandas/_wrappers/pandas.py
+++ b/python/cudf/cudf/pandas/_wrappers/pandas.py
@@ -775,6 +775,18 @@ def Index__new__(cls, *args, **kwargs):
     pd.core.indexing._LocIndexer,
 )
 
+_AtIndexer = make_intermediate_proxy_type(
+    "_AtIndexer",
+    cudf.core.dataframe._DataFrameAtIndexer,
+    pd.core.indexing._AtIndexer,
+)
+
+_iAtIndexer = make_intermediate_proxy_type(
+    "_iAtIndexer",
+    cudf.core.dataframe._DataFrameiAtIndexer,
+    pd.core.indexing._iAtIndexer,
+)
+
 FixedForwardWindowIndexer = make_final_proxy_type(
     "FixedForwardWindowIndexer",
     _Unusable,
diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
index f51ce103677..b0aeaba3916 100644
--- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
+++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
@@ -1566,3 +1566,22 @@ def test_arrow_string_arrays():
     )
 
     tm.assert_equal(cu_arr, pd_arr)
+
+
+@pytest.mark.parametrize("indexer", ["at", "iat"])
+def test_at_iat(indexer):
+    df = xpd.DataFrame(range(3))
+    result = getattr(df, indexer)[0, 0]
+    assert result == 0
+
+    getattr(df, indexer)[0, 0] = 1
+    expected = pd.DataFrame([1, 1, 2])
+    tm.assert_frame_equal(df, expected)
+
+
+def test_at_setitem_empty():
+    df = xpd.DataFrame({"name": []}, dtype="float64")
+    df.at[0, "name"] = 1.0
+    df.at[0, "new"] = 2.0
+    expected = pd.DataFrame({"name": [1.0], "new": [2.0]})
+    tm.assert_frame_equal(df, expected)

From cc8c86857df92801561d2fa3311d8da85895ff33 Mon Sep 17 00:00:00 2001
From: Jason Lowe <jlowe@nvidia.com>
Date: Mon, 8 Jul 2024 16:54:45 -0500
Subject: [PATCH 476/842] Disable large string support for Java build (#16216)

Disables libcudf large string support for the Java bindings build. The Java bindings need to be updated to handle large strings which is tracked by #16215.

Closes #16199.

Authors:
  - Jason Lowe (https://github.com/jlowe)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/16216
---
 java/README.md             | 10 +++++++---
 java/ci/build-in-docker.sh |  3 ++-
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/java/README.md b/java/README.md
index 2d8e2190fee..0d9e060b7cd 100644
--- a/java/README.md
+++ b/java/README.md
@@ -51,9 +51,13 @@ CUDA 11.0:
 ## Build From Source
 
 Build [libcudf](../cpp) first, and make sure the JDK is installed and available. Specify
-the cmake option `-DCUDF_USE_ARROW_STATIC=ON -DCUDF_ENABLE_ARROW_S3=OFF` when building so
-that Apache Arrow is linked statically to libcudf, as this will help create a jar that
-does not require Arrow and its dependencies to be available in the runtime environment.
+the following cmake options to the libcudf build:
+```
+-DCUDF_LARGE_STRINGS_DISABLED=ON -DCUDF_USE_ARROW_STATIC=ON -DCUDF_ENABLE_ARROW_S3=OFF
+```
+These options:
+- Disable large string support, see https://github.com/rapidsai/cudf/issues/16215
+- Statically link Arrow to libcudf to remove Arrow as a runtime dependency.
 
 After building libcudf, the Java bindings can be built via Maven, e.g.:
 ```
diff --git a/java/ci/build-in-docker.sh b/java/ci/build-in-docker.sh
index 72b1742f7cb..5a429bdc739 100755
--- a/java/ci/build-in-docker.sh
+++ b/java/ci/build-in-docker.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 
 #
-# Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -58,6 +58,7 @@ cmake .. -G"${CMAKE_GENERATOR}" \
          -DCMAKE_INSTALL_PREFIX=$INSTALL_PREFIX \
          -DCUDA_STATIC_RUNTIME=$ENABLE_CUDA_STATIC_RUNTIME \
          -DUSE_NVTX=$ENABLE_NVTX \
+         -DCUDF_LARGE_STRINGS_DISABLED=ON \
          -DCUDF_USE_ARROW_STATIC=ON \
          -DCUDF_ENABLE_ARROW_S3=OFF \
          -DBUILD_TESTS=$BUILD_CPP_TESTS \

From 58b7dc9f186c1860d4f9df80188bf21214381b1b Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 8 Jul 2024 16:01:25 -1000
Subject: [PATCH 477/842] interpolate returns new column if no values are
 interpolated (#16158)

While cleaning up the `interpolate` implementation, I noticed that a interpolation no-op did not return a new column.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16158
---
 python/cudf/cudf/core/algorithms.py        | 61 ++++++++--------------
 python/cudf/cudf/core/indexed_frame.py     | 14 +++--
 python/cudf/cudf/core/multiindex.py        |  4 +-
 python/cudf/cudf/tests/test_interpolate.py |  6 +++
 4 files changed, 39 insertions(+), 46 deletions(-)

diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py
index e8b82ff60c2..6c69fbd2637 100644
--- a/python/cudf/cudf/core/algorithms.py
+++ b/python/cudf/cudf/core/algorithms.py
@@ -1,17 +1,22 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
+from __future__ import annotations
+
 import warnings
+from typing import TYPE_CHECKING
 
 import cupy as cp
 import numpy as np
 
 from cudf.core.column import as_column
-from cudf.core.copy_types import BooleanMask
 from cudf.core.index import RangeIndex, ensure_index
-from cudf.core.indexed_frame import IndexedFrame
 from cudf.core.scalar import Scalar
 from cudf.options import get_option
 from cudf.utils.dtypes import can_convert_to_column
 
+if TYPE_CHECKING:
+    from cudf.core.column.column import ColumnBase
+    from cudf.core.index import BaseIndex
+
 
 def factorize(values, sort=False, use_na_sentinel=True, size_hint=None):
     """Encode the input values as integer labels
@@ -110,55 +115,31 @@ def factorize(values, sort=False, use_na_sentinel=True, size_hint=None):
     return labels, cats.values if return_cupy_array else ensure_index(cats)
 
 
-def _linear_interpolation(column, index=None):
-    """
-    Interpolate over a float column. Implicitly assumes that values are
-    evenly spaced with respect to the x-axis, for example the data
-    [1.0, NaN, 3.0] will be interpolated assuming the NaN is half way
-    between the two valid values, yielding [1.0, 2.0, 3.0]
-    """
-
-    index = RangeIndex(start=0, stop=len(column), step=1)
-    return _index_or_values_interpolation(column, index=index)
-
-
-def _index_or_values_interpolation(column, index=None):
+def _interpolation(column: ColumnBase, index: BaseIndex) -> ColumnBase:
     """
     Interpolate over a float column. assumes a linear interpolation
     strategy using the index of the data to denote spacing of the x
     values. For example the data and index [1.0, NaN, 4.0], [1, 3, 4]
-    would result in [1.0, 3.0, 4.0]
+    would result in [1.0, 3.0, 4.0].
     """
     # figure out where the nans are
-    mask = cp.isnan(column)
+    mask = column.isnull()
 
     # trivial cases, all nan or no nans
-    num_nan = mask.sum()
-    if num_nan == 0 or num_nan == len(column):
-        return column
+    if not mask.any() or mask.all():
+        return column.copy()
 
-    to_interp = IndexedFrame(data={None: column}, index=index)
-    known_x_and_y = to_interp._apply_boolean_mask(
-        BooleanMask(~mask, len(to_interp))
-    )
-
-    known_x = known_x_and_y.index.to_cupy()
-    known_y = known_x_and_y._data.columns[0].values
+    valid_locs = ~mask
+    if isinstance(index, RangeIndex):
+        # Each point is evenly spaced, index values don't matter
+        known_x = cp.flatnonzero(valid_locs.values)
+    else:
+        known_x = index._column.apply_boolean_mask(valid_locs).values  # type: ignore[attr-defined]
+    known_y = column.apply_boolean_mask(valid_locs).values
 
     result = cp.interp(index.to_cupy(), known_x, known_y)
 
     # find the first nan
-    first_nan_idx = (mask == 0).argmax().item()
+    first_nan_idx = valid_locs.values.argmax().item()
     result[:first_nan_idx] = np.nan
-    return result
-
-
-def get_column_interpolator(method):
-    interpolator = {
-        "linear": _linear_interpolation,
-        "index": _index_or_values_interpolation,
-        "values": _index_or_values_interpolation,
-    }.get(method, None)
-    if not interpolator:
-        raise ValueError(f"Interpolation method `{method}` not found")
-    return interpolator
+    return as_column(result)
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index ff10051c52d..63fa96d0db0 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -26,6 +26,8 @@
 
 import cudf
 import cudf._lib as libcudf
+import cudf.core
+import cudf.core.algorithms
 from cudf.api.extensions import no_default
 from cudf.api.types import (
     _is_non_decimal_numeric_dtype,
@@ -1987,6 +1989,8 @@ def interpolate(
                 "Use obj.ffill() or obj.bfill() instead.",
                 FutureWarning,
             )
+        elif method not in {"linear", "values", "index"}:
+            raise ValueError(f"Interpolation method `{method}` not found")
 
         data = self
 
@@ -2000,7 +2004,10 @@ def interpolate(
                 )
             )
 
-        interpolator = cudf.core.algorithms.get_column_interpolator(method)
+        if method == "linear":
+            interp_index = RangeIndex(self._num_rows)
+        else:
+            interp_index = data.index
         columns = []
         for col in data._columns:
             if isinstance(col, cudf.core.column.StringColumn):
@@ -2012,8 +2019,9 @@ def interpolate(
             if col.nullable:
                 col = col.astype("float64").fillna(np.nan)
 
-            # Interpolation methods may or may not need the index
-            columns.append(interpolator(col, index=data.index))
+            columns.append(
+                cudf.core.algorithms._interpolation(col, index=interp_index)
+            )
 
         result = self._from_data_like_self(
             self._data._from_columns_like_self(columns)
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index 9cbe863142b..dbbd1eab6c8 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -23,6 +23,7 @@
 from cudf.api.types import is_integer, is_list_like, is_object_dtype
 from cudf.core import column
 from cudf.core._base_index import _return_get_indexer_result
+from cudf.core.algorithms import factorize
 from cudf.core.column_accessor import ColumnAccessor
 from cudf.core.frame import Frame
 from cudf.core.index import (
@@ -1373,9 +1374,6 @@ def from_arrays(
                     (2, 'blue')],
                    names=['number', 'color'])
         """
-        # Imported here due to circular import
-        from cudf.core.algorithms import factorize
-
         error_msg = "Input must be a list / sequence of array-likes."
         if not is_list_like(arrays):
             raise TypeError(error_msg)
diff --git a/python/cudf/cudf/tests/test_interpolate.py b/python/cudf/cudf/tests/test_interpolate.py
index 4a0dc331e1a..a4f0b9fc97e 100644
--- a/python/cudf/cudf/tests/test_interpolate.py
+++ b/python/cudf/cudf/tests/test_interpolate.py
@@ -135,3 +135,9 @@ def test_interpolate_dataframe_error_cases(data, kwargs):
         lfunc_args_and_kwargs=([], kwargs),
         rfunc_args_and_kwargs=([], kwargs),
     )
+
+
+def test_interpolate_noop_new_column():
+    ser = cudf.Series([1.0, 2.0, 3.0])
+    result = ser.interpolate()
+    assert ser._column is not result._column

From cf88f8e045b279cbe5caa2e19ffadc7c6400aa58 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 8 Jul 2024 16:04:51 -1000
Subject: [PATCH 478/842] Defer copying in Column.astype(copy=True) (#16095)

Avoids:

1. Copying `self` when the `astype` would already produce a new column with its own data
2. Copying `self` when the `astype` would raise an Exception

Also cleans up some `as_categorical_column` logic.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16095
---
 python/cudf/cudf/core/column/categorical.py | 20 ++---
 python/cudf/cudf/core/column/column.py      | 91 ++++++++++-----------
 2 files changed, 51 insertions(+), 60 deletions(-)

diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index 231af30c06d..cec7d5e6663 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -1113,24 +1113,18 @@ def is_monotonic_decreasing(self) -> bool:
     def as_categorical_column(self, dtype: Dtype) -> CategoricalColumn:
         if isinstance(dtype, str) and dtype == "category":
             return self
+        if isinstance(dtype, pd.CategoricalDtype):
+            dtype = cudf.CategoricalDtype.from_pandas(dtype)
         if (
-            isinstance(
-                dtype, (cudf.core.dtypes.CategoricalDtype, pd.CategoricalDtype)
-            )
-            and (dtype.categories is None)
-            and (dtype.ordered is None)
+            isinstance(dtype, cudf.CategoricalDtype)
+            and dtype.categories is None
+            and dtype.ordered is None
         ):
             return self
-
-        if isinstance(dtype, pd.CategoricalDtype):
-            dtype = CategoricalDtype(
-                categories=dtype.categories, ordered=dtype.ordered
-            )
-
-        if not isinstance(dtype, CategoricalDtype):
+        elif not isinstance(dtype, CategoricalDtype):
             raise ValueError("dtype must be CategoricalDtype")
 
-        if not isinstance(self.categories, type(dtype.categories._values)):
+        if not isinstance(self.categories, type(dtype.categories._column)):
             # If both categories are of different Column types,
             # return a column full of Nulls.
             return _create_empty_categorical_column(self, dtype)
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index e7a2863da8c..adc783c20c4 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -962,59 +962,59 @@ def astype(self, dtype: Dtype, copy: bool = False) -> ColumnBase:
         if len(self) == 0:
             dtype = cudf.dtype(dtype)
             if self.dtype == dtype:
-                if copy:
-                    return self.copy()
-                else:
-                    return self
+                result = self
             else:
-                return column_empty(0, dtype=dtype, masked=self.nullable)
-        if copy:
-            col = self.copy()
-        else:
-            col = self
-        if dtype == "category":
+                result = column_empty(0, dtype=dtype, masked=self.nullable)
+        elif dtype == "category":
             # TODO: Figure out why `cudf.dtype("category")`
             # astype's different than just the string
-            return col.as_categorical_column(dtype)
+            result = self.as_categorical_column(dtype)
         elif (
             isinstance(dtype, str)
             and dtype == "interval"
             and isinstance(self.dtype, cudf.IntervalDtype)
         ):
             # astype("interval") (the string only) should no-op
-            return col
-        was_object = dtype == object or dtype == np.dtype(object)
-        dtype = cudf.dtype(dtype)
-        if self.dtype == dtype:
-            return col
-        elif isinstance(dtype, CategoricalDtype):
-            return col.as_categorical_column(dtype)
-        elif isinstance(dtype, IntervalDtype):
-            return col.as_interval_column(dtype)
-        elif isinstance(dtype, (ListDtype, StructDtype)):
-            if not col.dtype == dtype:
-                raise NotImplementedError(
-                    f"Casting {self.dtype} columns not currently supported"
-                )
-            return col
-        elif isinstance(dtype, cudf.core.dtypes.DecimalDtype):
-            return col.as_decimal_column(dtype)
-        elif dtype.kind == "M":
-            return col.as_datetime_column(dtype)
-        elif dtype.kind == "m":
-            return col.as_timedelta_column(dtype)
-        elif dtype.kind == "O":
-            if cudf.get_option("mode.pandas_compatible") and was_object:
-                raise ValueError(
-                    f"Casting to {dtype} is not supported, use "
-                    "`.astype('str')` instead."
-                )
-            return col.as_string_column(dtype)
+            result = self
         else:
-            return col.as_numerical_column(dtype)
+            was_object = dtype == object or dtype == np.dtype(object)
+            dtype = cudf.dtype(dtype)
+            if self.dtype == dtype:
+                result = self
+            elif isinstance(dtype, CategoricalDtype):
+                result = self.as_categorical_column(dtype)
+            elif isinstance(dtype, IntervalDtype):
+                result = self.as_interval_column(dtype)
+            elif isinstance(dtype, (ListDtype, StructDtype)):
+                if not self.dtype == dtype:
+                    raise NotImplementedError(
+                        f"Casting {self.dtype} columns not currently supported"
+                    )
+                result = self
+            elif isinstance(dtype, cudf.core.dtypes.DecimalDtype):
+                result = self.as_decimal_column(dtype)
+            elif dtype.kind == "M":
+                result = self.as_datetime_column(dtype)
+            elif dtype.kind == "m":
+                result = self.as_timedelta_column(dtype)
+            elif dtype.kind == "O":
+                if cudf.get_option("mode.pandas_compatible") and was_object:
+                    raise ValueError(
+                        f"Casting to {dtype} is not supported, use "
+                        "`.astype('str')` instead."
+                    )
+                result = self.as_string_column(dtype)
+            else:
+                result = self.as_numerical_column(dtype)
+
+        if copy and result is self:
+            return result.copy()
+        return result
 
     def as_categorical_column(self, dtype) -> ColumnBase:
-        if isinstance(dtype, (cudf.CategoricalDtype, pd.CategoricalDtype)):
+        if isinstance(dtype, pd.CategoricalDtype):
+            dtype = cudf.CategoricalDtype.from_pandas(dtype)
+        if isinstance(dtype, cudf.CategoricalDtype):
             ordered = dtype.ordered
         else:
             ordered = False
@@ -1023,14 +1023,11 @@ def as_categorical_column(self, dtype) -> ColumnBase:
         if (
             isinstance(dtype, cudf.CategoricalDtype)
             and dtype._categories is not None
-        ) or (
-            isinstance(dtype, pd.CategoricalDtype)
-            and dtype.categories is not None
         ):
-            labels = self._label_encoding(cats=as_column(dtype.categories))
-
+            cat_col = dtype._categories
+            labels = self._label_encoding(cats=cat_col)
             return build_categorical_column(
-                categories=as_column(dtype.categories),
+                categories=cat_col,
                 codes=labels,
                 mask=self.mask,
                 ordered=dtype.ordered,

From 65e4e99d702aedbbfd489840d112faecfaeb43b9 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Mon, 8 Jul 2024 23:10:23 -0500
Subject: [PATCH 479/842] Remove CCCL patch for PR 211. (#16207)

While upgrading CCCL, we ran into a test failure in cuSpatial. We added a patch to revert some changes from CCCL but the root cause was a bug in cuSpatial. I have fixed that bug here: https://github.com/rapidsai/cuspatial/pull/1402

Once that PR is merged, we can remove this CCCL patch.

See also:
- rapids-cmake patch removal: https://github.com/rapidsai/rapids-cmake/pull/640
- Original rapids-cmake patch: https://github.com/rapidsai/rapids-cmake/pull/511
- CCCL epic to remove RAPIDS patches: https://github.com/NVIDIA/cccl/issues/1939

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Robert Maynard (https://github.com/robertmaynard)

URL: https://github.com/rapidsai/cudf/pull/16207
---
 cpp/cmake/thirdparty/patches/cccl_override.json | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/cpp/cmake/thirdparty/patches/cccl_override.json b/cpp/cmake/thirdparty/patches/cccl_override.json
index e61102dffac..2f29578f7ae 100644
--- a/cpp/cmake/thirdparty/patches/cccl_override.json
+++ b/cpp/cmake/thirdparty/patches/cccl_override.json
@@ -3,11 +3,6 @@
   "packages" : {
     "CCCL" : {
       "patches" : [
-        {
-          "file" : "cccl/revert_pr_211.diff",
-          "issue" : "thrust::copy introduced a change in behavior that causes failures with cudaErrorInvalidValue.",
-          "fixed_in" : ""
-        },
         {
           "file" : "${current_json_dir}/thrust_disable_64bit_dispatching.diff",
           "issue" : "Remove 64bit dispatching as not needed by libcudf and results in compiling twice as many kernels [https://github.com/rapidsai/cudf/pull/11437]",

From b693e79b1813276700f70c2cb251d6fef71851a1 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Tue, 9 Jul 2024 13:22:35 +0100
Subject: [PATCH 480/842] Handler csv reader options in cudf-polars (#16211)

Previously we were just relying on the default cudf read_csv options which doesn't do the right thing if the user has configured things.

Now that polars passes through the information to us, we can handle things properly, and raise for unsupported cases.

While here, update to new polars release and adapt tests to bug fixes that have been made upstream.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - James Lamb (https://github.com/jameslamb)
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/16211
---
 python/cudf/cudf/_lib/csv.pyx                 |   2 +-
 python/cudf_polars/cudf_polars/dsl/expr.py    |   4 +-
 python/cudf_polars/cudf_polars/dsl/ir.py      | 104 +++++++++++++++--
 .../cudf_polars/cudf_polars/dsl/translate.py  |  12 +-
 python/cudf_polars/tests/test_scan.py         | 107 ++++++++++++++++--
 5 files changed, 206 insertions(+), 23 deletions(-)

diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx
index c706351a683..9fecff5f5f6 100644
--- a/python/cudf/cudf/_lib/csv.pyx
+++ b/python/cudf/cudf/_lib/csv.pyx
@@ -450,7 +450,7 @@ def read_csv(
                     col_name = df._data.names[index]
                     df._data[col_name] = df._data[col_name].astype(col_dtype)
 
-    if names is not None and isinstance(names[0], (int)):
+    if names is not None and len(names) and isinstance(names[0], (int)):
         df.columns = [int(x) for x in df._data]
 
     # Set index if the index_col parameter is passed
diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py
index 93cb9db7cbd..f83d9e82d30 100644
--- a/python/cudf_polars/cudf_polars/dsl/expr.py
+++ b/python/cudf_polars/cudf_polars/dsl/expr.py
@@ -32,7 +32,7 @@
 if TYPE_CHECKING:
     from collections.abc import Mapping, Sequence
 
-    import polars.polars as plrs
+    import polars as pl
     import polars.type_aliases as pl_types
 
     from cudf_polars.containers import DataFrame
@@ -377,7 +377,7 @@ class LiteralColumn(Expr):
     value: pa.Array[Any, Any]
     children: tuple[()]
 
-    def __init__(self, dtype: plc.DataType, value: plrs.PySeries) -> None:
+    def __init__(self, dtype: plc.DataType, value: pl.Series) -> None:
         super().__init__(dtype)
         data = value.to_arrow()
         self.value = data.cast(dtypes.downcast_arrow_lists(data.type))
diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py
index 6b552642e88..b32fa9c273e 100644
--- a/python/cudf_polars/cudf_polars/dsl/ir.py
+++ b/python/cudf_polars/cudf_polars/dsl/ir.py
@@ -15,9 +15,9 @@
 
 import dataclasses
 import itertools
-import json
 import types
 from functools import cache
+from pathlib import Path
 from typing import TYPE_CHECKING, Any, Callable, ClassVar
 
 import pyarrow as pa
@@ -185,8 +185,10 @@ class Scan(IR):
 
     typ: str
     """What type of file are we reading? Parquet, CSV, etc..."""
-    options: tuple[Any, ...]
-    """Type specific options, as json-encoded strings."""
+    reader_options: dict[str, Any]
+    """Reader-specific options, as dictionary."""
+    cloud_options: dict[str, Any] | None
+    """Cloud-related authentication options, currently ignored."""
     paths: list[str]
     """List of paths to read from."""
     file_options: Any
@@ -206,9 +208,33 @@ def __post_init__(self) -> None:
         if self.file_options.n_rows is not None:
             raise NotImplementedError("row limit in scan")
         if self.typ not in ("csv", "parquet"):
+            raise NotImplementedError(f"Unhandled scan type: {self.typ}")
+        if self.cloud_options is not None and any(
+            self.cloud_options[k] is not None for k in ("aws", "azure", "gcp")
+        ):
             raise NotImplementedError(
-                f"Unhandled scan type: {self.typ}"
-            )  # pragma: no cover; polars raises on the rust side for now
+                "Read from cloud storage"
+            )  # pragma: no cover; no test yet
+        if self.typ == "csv":
+            if self.reader_options["skip_rows_after_header"] != 0:
+                raise NotImplementedError("Skipping rows after header in CSV reader")
+            parse_options = self.reader_options["parse_options"]
+            if (
+                null_values := parse_options["null_values"]
+            ) is not None and "Named" in null_values:
+                raise NotImplementedError(
+                    "Per column null value specification not supported for CSV reader"
+                )
+            if (
+                comment := parse_options["comment_prefix"]
+            ) is not None and "Multi" in comment:
+                raise NotImplementedError(
+                    "Multi-character comment prefix not supported for CSV reader"
+                )
+            if not self.reader_options["has_header"]:
+                # Need to do some file introspection to get the number
+                # of columns so that column projection works right.
+                raise NotImplementedError("Reading CSV without header")
 
     def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
@@ -216,14 +242,70 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         with_columns = options.with_columns
         row_index = options.row_index
         if self.typ == "csv":
-            opts, cloud_opts = map(json.loads, self.options)
-            df = DataFrame.from_cudf(
-                cudf.concat(
-                    [cudf.read_csv(p, usecols=with_columns) for p in self.paths]
+            dtype_map = {
+                name: cudf._lib.types.PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[typ.id()]
+                for name, typ in self.schema.items()
+            }
+            parse_options = self.reader_options["parse_options"]
+            sep = chr(parse_options["separator"])
+            quote = chr(parse_options["quote_char"])
+            eol = chr(parse_options["eol_char"])
+            if self.reader_options["schema"] is not None:
+                # Reader schema provides names
+                column_names = list(self.reader_options["schema"]["inner"].keys())
+            else:
+                # file provides column names
+                column_names = None
+            usecols = with_columns
+            # TODO: support has_header=False
+            header = 0
+
+            # polars defaults to no null recognition
+            null_values = [""]
+            if parse_options["null_values"] is not None:
+                ((typ, nulls),) = parse_options["null_values"].items()
+                if typ == "AllColumnsSingle":
+                    # Single value
+                    null_values.append(nulls)
+                else:
+                    # List of values
+                    null_values.extend(nulls)
+            if parse_options["comment_prefix"] is not None:
+                comment = chr(parse_options["comment_prefix"]["Single"])
+            else:
+                comment = None
+            decimal = "," if parse_options["decimal_comma"] else "."
+
+            # polars skips blank lines at the beginning of the file
+            pieces = []
+            for p in self.paths:
+                skiprows = self.reader_options["skip_rows"]
+                # TODO: read_csv expands globs which we should not do,
+                # because polars will already have handled them.
+                path = Path(p)
+                with path.open() as f:
+                    while f.readline() == "\n":
+                        skiprows += 1
+                pieces.append(
+                    cudf.read_csv(
+                        path,
+                        sep=sep,
+                        quotechar=quote,
+                        lineterminator=eol,
+                        names=column_names,
+                        header=header,
+                        usecols=usecols,
+                        na_filter=True,
+                        na_values=null_values,
+                        keep_default_na=False,
+                        skiprows=skiprows,
+                        comment=comment,
+                        decimal=decimal,
+                        dtype=dtype_map,
+                    )
                 )
-            )
+            df = DataFrame.from_cudf(cudf.concat(pieces))
         elif self.typ == "parquet":
-            opts, cloud_opts = map(json.loads, self.options)
             cdf = cudf.read_parquet(self.paths, columns=with_columns)
             assert isinstance(cdf, cudf.DataFrame)
             df = DataFrame.from_cudf(cdf)
diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py
index 5a1e682abe7..dec45679c75 100644
--- a/python/cudf_polars/cudf_polars/dsl/translate.py
+++ b/python/cudf_polars/cudf_polars/dsl/translate.py
@@ -5,6 +5,7 @@
 
 from __future__ import annotations
 
+import json
 from contextlib import AbstractContextManager, nullcontext
 from functools import singledispatch
 from typing import Any
@@ -12,6 +13,7 @@
 import pyarrow as pa
 from typing_extensions import assert_never
 
+import polars as pl
 import polars.polars as plrs
 from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir
 
@@ -88,10 +90,16 @@ def _(
     node: pl_ir.Scan, visitor: NodeTraverser, schema: dict[str, plc.DataType]
 ) -> ir.IR:
     typ, *options = node.scan_type
+    if typ == "ndjson":
+        (reader_options,) = map(json.loads, options)
+        cloud_options = None
+    else:
+        reader_options, cloud_options = map(json.loads, options)
     return ir.Scan(
         schema,
         typ,
-        tuple(options),
+        reader_options,
+        cloud_options,
         node.paths,
         node.file_options,
         translate_named_expr(visitor, n=node.predicate)
@@ -402,7 +410,7 @@ def _(node: pl_expr.Window, visitor: NodeTraverser, dtype: plc.DataType) -> expr
 @_translate_expr.register
 def _(node: pl_expr.Literal, visitor: NodeTraverser, dtype: plc.DataType) -> expr.Expr:
     if isinstance(node.value, plrs.PySeries):
-        return expr.LiteralColumn(dtype, node.value)
+        return expr.LiteralColumn(dtype, pl.Series._from_pyseries(node.value))
     value = pa.scalar(node.value, type=plc.interop.to_arrow(dtype))
     return expr.Literal(dtype, value)
 
diff --git a/python/cudf_polars/tests/test_scan.py b/python/cudf_polars/tests/test_scan.py
index f129cc7ca32..c41a94da14b 100644
--- a/python/cudf_polars/tests/test_scan.py
+++ b/python/cudf_polars/tests/test_scan.py
@@ -22,22 +22,22 @@ def row_index(request):
 
 @pytest.fixture(
     params=[
-        (None, 0),
+        None,
         pytest.param(
-            (2, 1), marks=pytest.mark.xfail(reason="No handling of row limit in scan")
+            2, marks=pytest.mark.xfail(reason="No handling of row limit in scan")
         ),
         pytest.param(
-            (3, 0), marks=pytest.mark.xfail(reason="No handling of row limit in scan")
+            3, marks=pytest.mark.xfail(reason="No handling of row limit in scan")
         ),
     ],
     ids=["all-rows", "n_rows-with-skip", "n_rows-no-skip"],
 )
-def n_rows_skip_rows(request):
+def n_rows(request):
     return request.param
 
 
 @pytest.fixture(params=["csv", "parquet"])
-def df(request, tmp_path, row_index, n_rows_skip_rows):
+def df(request, tmp_path, row_index, n_rows):
     df = pl.DataFrame(
         {
             "a": [1, 2, 3, None],
@@ -46,14 +46,12 @@ def df(request, tmp_path, row_index, n_rows_skip_rows):
         }
     )
     name, offset = row_index
-    n_rows, skip_rows = n_rows_skip_rows
     if request.param == "csv":
         df.write_csv(tmp_path / "file.csv")
         return pl.scan_csv(
             tmp_path / "file.csv",
             row_index_name=name,
             row_index_offset=offset,
-            skip_rows_after_header=skip_rows,
             n_rows=n_rows,
         )
     else:
@@ -97,3 +95,98 @@ def test_scan_unsupported_raises(tmp_path):
     df.write_ndjson(tmp_path / "df.json")
     q = pl.scan_ndjson(tmp_path / "df.json")
     assert_ir_translation_raises(q, NotImplementedError)
+
+
+def test_scan_row_index_projected_out(tmp_path):
+    df = pl.DataFrame({"a": [1, 2, 3]})
+
+    df.write_parquet(tmp_path / "df.pq")
+
+    q = pl.scan_parquet(tmp_path / "df.pq").with_row_index().select(pl.col("a"))
+
+    assert_gpu_result_equal(q)
+
+
+def test_scan_csv_column_renames_projection_schema(tmp_path):
+    with (tmp_path / "test.csv").open("w") as f:
+        f.write("""foo,bar,baz\n1,2\n3,4,5""")
+
+    q = pl.scan_csv(
+        tmp_path / "test.csv",
+        with_column_names=lambda names: [f"{n}_suffix" for n in names],
+        schema_overrides={
+            "foo_suffix": pl.String(),
+            "bar_suffix": pl.Int8(),
+            "baz_suffix": pl.UInt16(),
+        },
+    )
+
+    assert_gpu_result_equal(q)
+
+
+def test_scan_csv_skip_after_header_not_implemented(tmp_path):
+    with (tmp_path / "test.csv").open("w") as f:
+        f.write("""foo,bar,baz\n1,2,3\n3,4,5""")
+
+    q = pl.scan_csv(tmp_path / "test.csv", skip_rows_after_header=1)
+
+    assert_ir_translation_raises(q, NotImplementedError)
+
+
+def test_scan_csv_null_values_per_column_not_implemented(tmp_path):
+    with (tmp_path / "test.csv").open("w") as f:
+        f.write("""foo,bar,baz\n1,2,3\n3,4,5""")
+
+    q = pl.scan_csv(tmp_path / "test.csv", null_values={"foo": "1", "baz": "5"})
+
+    assert_ir_translation_raises(q, NotImplementedError)
+
+
+def test_scan_csv_comment_str_not_implemented(tmp_path):
+    with (tmp_path / "test.csv").open("w") as f:
+        f.write("""foo,bar,baz\n// 1,2,3\n3,4,5""")
+
+    q = pl.scan_csv(tmp_path / "test.csv", comment_prefix="// ")
+
+    assert_ir_translation_raises(q, NotImplementedError)
+
+
+def test_scan_csv_comment_char(tmp_path):
+    with (tmp_path / "test.csv").open("w") as f:
+        f.write("""foo,bar,baz\n# 1,2,3\n3,4,5""")
+
+    q = pl.scan_csv(tmp_path / "test.csv", comment_prefix="#")
+
+    assert_gpu_result_equal(q)
+
+
+@pytest.mark.parametrize("nulls", [None, "3", ["3", "5"]])
+def test_scan_csv_null_values(tmp_path, nulls):
+    with (tmp_path / "test.csv").open("w") as f:
+        f.write("""foo,bar,baz\n1,2,3\n3,4,5\n5,,2""")
+
+    q = pl.scan_csv(tmp_path / "test.csv", null_values=nulls)
+
+    assert_gpu_result_equal(q)
+
+
+def test_scan_csv_decimal_comma(tmp_path):
+    with (tmp_path / "test.csv").open("w") as f:
+        f.write("""foo|bar|baz\n1,23|2,34|3,56\n1""")
+
+    q = pl.scan_csv(tmp_path / "test.csv", separator="|", decimal_comma=True)
+
+    assert_gpu_result_equal(q)
+
+
+def test_scan_csv_skip_initial_empty_rows(tmp_path):
+    with (tmp_path / "test.csv").open("w") as f:
+        f.write("""\n\n\n\nfoo|bar|baz\n1|2|3\n1""")
+
+    q = pl.scan_csv(tmp_path / "test.csv", separator="|", skip_rows=1, has_header=False)
+
+    assert_ir_translation_raises(q, NotImplementedError)
+
+    q = pl.scan_csv(tmp_path / "test.csv", separator="|", skip_rows=1)
+
+    assert_gpu_result_equal(q)

From 75966deef548754a5a7f5fb49f1cf5b1be991363 Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Tue, 9 Jul 2024 06:59:56 -0700
Subject: [PATCH 481/842] Publish cudf-polars nightlies (#16213)

Publish nightlies for cudf-polars.

Authors:
  - Thomas Li (https://github.com/lithomas1)

Approvers:
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/cudf/pull/16213
---
 .github/workflows/build.yaml | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index c5679cc5141..2e5959338b0 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -108,6 +108,28 @@ jobs:
       sha: ${{ inputs.sha }}
       date: ${{ inputs.date }}
       package-name: dask_cudf
+  wheel-build-cudf-polars:
+    needs: wheel-publish-cudf
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08
+    with:
+      # This selects "ARCH=amd64 + the latest supported Python + CUDA".
+      matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
+      build_type: ${{ inputs.build_type || 'branch' }}
+      branch: ${{ inputs.branch }}
+      sha: ${{ inputs.sha }}
+      date: ${{ inputs.date }}
+      script: ci/build_wheel_cudf_polars.sh
+  wheel-publish-cudf-polars:
+    needs: wheel-build-cudf-polars
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08
+    with:
+      build_type: ${{ inputs.build_type || 'branch' }}
+      branch: ${{ inputs.branch }}
+      sha: ${{ inputs.sha }}
+      date: ${{ inputs.date }}
+      package-name: cudf_polars
   trigger-pandas-tests:
     if: inputs.build_type == 'nightly'
     needs: wheel-build-cudf

From 433e959deab26ccf1eb9b75b8ea3e21659da4f0a Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 9 Jul 2024 10:45:05 -0400
Subject: [PATCH 482/842] Free temp memory no longer needed in multibyte_split
 processing (#16091)

Updates the `multibyte_split` logic to free temporary memory once the chars and offsets have been resolved. This gives room to the remaining processing if more temp memory is required.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - https://github.com/nvdbaranec

URL: https://github.com/rapidsai/cudf/pull/16091
---
 cpp/src/io/text/multibyte_split.cu | 324 ++++++++++++++---------------
 1 file changed, 162 insertions(+), 162 deletions(-)

diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu
index 51dc0ca90af..be2e2b9a79c 100644
--- a/cpp/src/io/text/multibyte_split.cu
+++ b/cpp/src/io/text/multibyte_split.cu
@@ -55,6 +55,8 @@
 #include <numeric>
 #include <optional>
 
+namespace cudf::io::text {
+namespace detail {
 namespace {
 
 using cudf::io::text::detail::multistate;
@@ -299,11 +301,6 @@ CUDF_KERNEL __launch_bounds__(THREADS_PER_TILE) void byte_split_kernel(
 
 }  // namespace
 
-namespace cudf {
-namespace io {
-namespace text {
-namespace detail {
-
 std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source const& source,
                                               std::string const& delimiter,
                                               byte_range_info byte_range,
@@ -336,173 +333,181 @@ std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source
   CUDF_EXPECTS(delimiter.size() < multistate::max_segment_value,
                "delimiter contains too many total tokens to produce a deterministic result.");
 
-  auto const concurrency = 2;
-
-  // must be at least 32 when using warp-reduce on partials
-  // must be at least 1 more than max possible concurrent tiles
-  // best when at least 32 more than max possible concurrent tiles, due to rolling `invalid`s
-  auto num_tile_states = std::max(32, TILES_PER_CHUNK * concurrency + 32);
-  auto tile_multistates =
-    scan_tile_state<multistate>(num_tile_states, stream, rmm::mr::get_current_device_resource());
-  auto tile_offsets =
-    scan_tile_state<output_offset>(num_tile_states, stream, rmm::mr::get_current_device_resource());
-
-  multibyte_split_init_kernel<<<TILES_PER_CHUNK,
-                                THREADS_PER_TILE,
-                                0,
-                                stream.value()>>>(  //
-    -TILES_PER_CHUNK,
-    TILES_PER_CHUNK,
-    tile_multistates,
-    tile_offsets,
-    cudf::io::text::detail::scan_tile_status::oob);
-
-  auto multistate_seed = multistate();
-  multistate_seed.enqueue(0, 0);  // this represents the first state in the pattern.
-
-  // Seeding the tile state with an identity value allows the 0th tile to follow the same logic as
-  // the Nth tile, assuming it can look up an inclusive prefix. Without this seed, the 0th block
-  // would have to follow separate logic.
-  cudf::detail::device_single_thread(
-    [tm = scan_tile_state_view<multistate>(tile_multistates),
-     to = scan_tile_state_view<output_offset>(tile_offsets),
-     multistate_seed] __device__() mutable {
-      tm.set_inclusive_prefix(-1, multistate_seed);
-      to.set_inclusive_prefix(-1, 0);
-    },
-    stream);
-
-  auto reader               = source.create_reader();
-  auto chunk_offset         = std::max<byte_offset>(0, byte_range.offset() - delimiter.size());
-  auto const byte_range_end = byte_range.offset() + byte_range.size();
-  reader->skip_bytes(chunk_offset);
-  // amortize output chunk allocations over 8 worst-case outputs. This limits the overallocation
-  constexpr auto max_growth = 8;
-  output_builder<byte_offset> row_offset_storage(ITEMS_PER_CHUNK, max_growth, stream);
-  output_builder<char> char_storage(ITEMS_PER_CHUNK, max_growth, stream);
-
-  auto streams = cudf::detail::fork_streams(stream, concurrency);
-
-  cudaEvent_t last_launch_event;
-  CUDF_CUDA_TRY(cudaEventCreate(&last_launch_event));
-
-  auto& read_stream     = streams[0];
-  auto& scan_stream     = streams[1];
-  auto chunk            = reader->get_next_chunk(ITEMS_PER_CHUNK, read_stream);
-  int64_t base_tile_idx = 0;
+  auto chunk_offset = std::max<byte_offset>(0, byte_range.offset() - delimiter.size());
   std::optional<byte_offset> first_row_offset;
-  std::optional<byte_offset> last_row_offset;
-  bool found_last_offset = false;
   if (byte_range.offset() == 0) { first_row_offset = 0; }
-  std::swap(read_stream, scan_stream);
-
-  while (chunk->size() > 0) {
-    // if we found the last delimiter, or didn't find delimiters inside the byte range at all: abort
-    if (last_row_offset.has_value() or
-        (not first_row_offset.has_value() and chunk_offset >= byte_range_end)) {
-      break;
-    }
-
-    auto tiles_in_launch =
-      cudf::util::div_rounding_up_safe(chunk->size(), static_cast<std::size_t>(ITEMS_PER_TILE));
-
-    auto row_offsets = row_offset_storage.next_output(scan_stream);
+  std::optional<byte_offset> last_row_offset;
 
-    // reset the next chunk of tile state
-    multibyte_split_init_kernel<<<tiles_in_launch,
+  auto [global_offsets, chars] = [&] {
+    // must be at least 32 when using warp-reduce on partials
+    // must be at least 1 more than max possible concurrent tiles
+    // best when at least 32 more than max possible concurrent tiles, due to rolling `invalid`s
+    auto const concurrency = 2;
+    auto num_tile_states   = std::max(32, TILES_PER_CHUNK * concurrency + 32);
+    auto tile_multistates =
+      scan_tile_state<multistate>(num_tile_states, stream, rmm::mr::get_current_device_resource());
+    auto tile_offsets = scan_tile_state<output_offset>(
+      num_tile_states, stream, rmm::mr::get_current_device_resource());
+
+    multibyte_split_init_kernel<<<TILES_PER_CHUNK,
                                   THREADS_PER_TILE,
                                   0,
-                                  scan_stream.value()>>>(  //
-      base_tile_idx,
-      tiles_in_launch,
+                                  stream.value()>>>(  //
+      -TILES_PER_CHUNK,
+      TILES_PER_CHUNK,
       tile_multistates,
-      tile_offsets);
+      tile_offsets,
+      cudf::io::text::detail::scan_tile_status::oob);
 
-    CUDF_CUDA_TRY(cudaStreamWaitEvent(scan_stream.value(), last_launch_event));
+    auto multistate_seed = multistate();
+    multistate_seed.enqueue(0, 0);  // this represents the first state in the pattern.
 
-    if (delimiter.size() == 1) {
-      // the single-byte case allows for a much more efficient kernel, so we special-case it
-      byte_split_kernel<<<tiles_in_launch,
-                          THREADS_PER_TILE,
-                          0,
-                          scan_stream.value()>>>(  //
-        base_tile_idx,
-        chunk_offset,
-        row_offset_storage.size(),
-        tile_offsets,
-        delimiter[0],
-        *chunk,
-        row_offsets);
-    } else {
-      multibyte_split_kernel<<<tiles_in_launch,
-                               THREADS_PER_TILE,
-                               0,
-                               scan_stream.value()>>>(  //
+    // Seeding the tile state with an identity value allows the 0th tile to follow the same logic as
+    // the Nth tile, assuming it can look up an inclusive prefix. Without this seed, the 0th block
+    // would have to follow separate logic.
+    cudf::detail::device_single_thread(
+      [tm = scan_tile_state_view<multistate>(tile_multistates),
+       to = scan_tile_state_view<output_offset>(tile_offsets),
+       multistate_seed] __device__() mutable {
+        tm.set_inclusive_prefix(-1, multistate_seed);
+        to.set_inclusive_prefix(-1, 0);
+      },
+      stream);
+
+    auto reader               = source.create_reader();
+    auto const byte_range_end = byte_range.offset() + byte_range.size();
+    reader->skip_bytes(chunk_offset);
+    // amortize output chunk allocations over 8 worst-case outputs. This limits the overallocation
+    constexpr auto max_growth = 8;
+    output_builder<byte_offset> row_offset_storage(ITEMS_PER_CHUNK, max_growth, stream);
+    output_builder<char> char_storage(ITEMS_PER_CHUNK, max_growth, stream);
+
+    auto streams = cudf::detail::fork_streams(stream, concurrency);
+
+    cudaEvent_t last_launch_event;
+    CUDF_CUDA_TRY(cudaEventCreate(&last_launch_event));
+
+    auto& read_stream      = streams[0];
+    auto& scan_stream      = streams[1];
+    auto chunk             = reader->get_next_chunk(ITEMS_PER_CHUNK, read_stream);
+    int64_t base_tile_idx  = 0;
+    bool found_last_offset = false;
+    std::swap(read_stream, scan_stream);
+
+    while (chunk->size() > 0) {
+      // if we found the last delimiter, or didn't find delimiters inside the byte range at all:
+      // abort
+      if (last_row_offset.has_value() or
+          (not first_row_offset.has_value() and chunk_offset >= byte_range_end)) {
+        break;
+      }
+
+      auto tiles_in_launch =
+        cudf::util::div_rounding_up_safe(chunk->size(), static_cast<std::size_t>(ITEMS_PER_TILE));
+
+      auto row_offsets = row_offset_storage.next_output(scan_stream);
+
+      // reset the next chunk of tile state
+      multibyte_split_init_kernel<<<tiles_in_launch,
+                                    THREADS_PER_TILE,
+                                    0,
+                                    scan_stream.value()>>>(  //
         base_tile_idx,
-        chunk_offset,
-        row_offset_storage.size(),
+        tiles_in_launch,
         tile_multistates,
-        tile_offsets,
-        {device_delim.data(), static_cast<std::size_t>(device_delim.size())},
-        *chunk,
-        row_offsets);
-    }
+        tile_offsets);
+
+      CUDF_CUDA_TRY(cudaStreamWaitEvent(scan_stream.value(), last_launch_event));
+
+      if (delimiter.size() == 1) {
+        // the single-byte case allows for a much more efficient kernel, so we special-case it
+        byte_split_kernel<<<tiles_in_launch,
+                            THREADS_PER_TILE,
+                            0,
+                            scan_stream.value()>>>(  //
+          base_tile_idx,
+          chunk_offset,
+          row_offset_storage.size(),
+          tile_offsets,
+          delimiter[0],
+          *chunk,
+          row_offsets);
+      } else {
+        multibyte_split_kernel<<<tiles_in_launch,
+                                 THREADS_PER_TILE,
+                                 0,
+                                 scan_stream.value()>>>(  //
+          base_tile_idx,
+          chunk_offset,
+          row_offset_storage.size(),
+          tile_multistates,
+          tile_offsets,
+          {device_delim.data(), static_cast<std::size_t>(device_delim.size())},
+          *chunk,
+          row_offsets);
+      }
 
-    // load the next chunk
-    auto next_chunk = reader->get_next_chunk(ITEMS_PER_CHUNK, read_stream);
-    // while that is running, determine how many offsets we output (synchronizes)
-    auto const new_offsets = [&] {
-      auto const new_offsets_unclamped =
-        tile_offsets.get_inclusive_prefix(base_tile_idx + tiles_in_launch - 1, scan_stream) -
-        static_cast<output_offset>(row_offset_storage.size());
-      // if we are not in the last chunk, we can use all offsets
-      if (chunk_offset + static_cast<output_offset>(chunk->size()) < byte_range_end) {
-        return new_offsets_unclamped;
+      // load the next chunk
+      auto next_chunk = reader->get_next_chunk(ITEMS_PER_CHUNK, read_stream);
+      // while that is running, determine how many offsets we output (synchronizes)
+      auto const new_offsets = [&] {
+        auto const new_offsets_unclamped =
+          tile_offsets.get_inclusive_prefix(base_tile_idx + tiles_in_launch - 1, scan_stream) -
+          static_cast<output_offset>(row_offset_storage.size());
+        // if we are not in the last chunk, we can use all offsets
+        if (chunk_offset + static_cast<output_offset>(chunk->size()) < byte_range_end) {
+          return new_offsets_unclamped;
+        }
+        // if we are in the last chunk, we need to find the first out-of-bounds offset
+        auto const it = thrust::make_counting_iterator(output_offset{});
+        auto const end_loc =
+          *thrust::find_if(rmm::exec_policy_nosync(scan_stream),
+                           it,
+                           it + new_offsets_unclamped,
+                           [row_offsets, byte_range_end] __device__(output_offset i) {
+                             return row_offsets[i] >= byte_range_end;
+                           });
+        // if we had no out-of-bounds offset, we copy all offsets
+        if (end_loc == new_offsets_unclamped) { return end_loc; }
+        // otherwise we copy only up to (including) the first out-of-bounds delimiter
+        found_last_offset = true;
+        return end_loc + 1;
+      }();
+      row_offset_storage.advance_output(new_offsets, scan_stream);
+      // determine if we found the first or last field offset for the byte range
+      if (new_offsets > 0 and not first_row_offset) {
+        first_row_offset = row_offset_storage.front_element(scan_stream);
+      }
+      if (found_last_offset) { last_row_offset = row_offset_storage.back_element(scan_stream); }
+      // copy over the characters we need, if we already encountered the first field delimiter
+      if (first_row_offset.has_value()) {
+        auto const begin =
+          chunk->data() + std::max<byte_offset>(0, *first_row_offset - chunk_offset);
+        auto const sentinel = last_row_offset.value_or(std::numeric_limits<byte_offset>::max());
+        auto const end =
+          chunk->data() + std::min<byte_offset>(sentinel - chunk_offset, chunk->size());
+        auto const output_size = end - begin;
+        auto char_output       = char_storage.next_output(scan_stream);
+        thrust::copy(rmm::exec_policy_nosync(scan_stream), begin, end, char_output.begin());
+        char_storage.advance_output(output_size, scan_stream);
       }
-      // if we are in the last chunk, we need to find the first out-of-bounds offset
-      auto const it = thrust::make_counting_iterator(output_offset{});
-      auto const end_loc =
-        *thrust::find_if(rmm::exec_policy_nosync(scan_stream),
-                         it,
-                         it + new_offsets_unclamped,
-                         [row_offsets, byte_range_end] __device__(output_offset i) {
-                           return row_offsets[i] >= byte_range_end;
-                         });
-      // if we had no out-of-bounds offset, we copy all offsets
-      if (end_loc == new_offsets_unclamped) { return end_loc; }
-      // otherwise we copy only up to (including) the first out-of-bounds delimiter
-      found_last_offset = true;
-      return end_loc + 1;
-    }();
-    row_offset_storage.advance_output(new_offsets, scan_stream);
-    // determine if we found the first or last field offset for the byte range
-    if (new_offsets > 0 and not first_row_offset) {
-      first_row_offset = row_offset_storage.front_element(scan_stream);
-    }
-    if (found_last_offset) { last_row_offset = row_offset_storage.back_element(scan_stream); }
-    // copy over the characters we need, if we already encountered the first field delimiter
-    if (first_row_offset.has_value()) {
-      auto const begin = chunk->data() + std::max<byte_offset>(0, *first_row_offset - chunk_offset);
-      auto const sentinel = last_row_offset.value_or(std::numeric_limits<byte_offset>::max());
-      auto const end =
-        chunk->data() + std::min<byte_offset>(sentinel - chunk_offset, chunk->size());
-      auto const output_size = end - begin;
-      auto char_output       = char_storage.next_output(scan_stream);
-      thrust::copy(rmm::exec_policy_nosync(scan_stream), begin, end, char_output.begin());
-      char_storage.advance_output(output_size, scan_stream);
-    }
 
-    CUDF_CUDA_TRY(cudaEventRecord(last_launch_event, scan_stream.value()));
+      CUDF_CUDA_TRY(cudaEventRecord(last_launch_event, scan_stream.value()));
 
-    std::swap(read_stream, scan_stream);
-    base_tile_idx += tiles_in_launch;
-    chunk_offset += chunk->size();
-    chunk = std::move(next_chunk);
-  }
+      std::swap(read_stream, scan_stream);
+      base_tile_idx += tiles_in_launch;
+      chunk_offset += chunk->size();
+      chunk = std::move(next_chunk);
+    }
+
+    CUDF_CUDA_TRY(cudaEventDestroy(last_launch_event));
 
-  CUDF_CUDA_TRY(cudaEventDestroy(last_launch_event));
+    cudf::detail::join_streams(streams, stream);
 
-  cudf::detail::join_streams(streams, stream);
+    auto chars          = char_storage.gather(stream, mr);
+    auto global_offsets = row_offset_storage.gather(stream, mr);
+    return std::pair{std::move(global_offsets), std::move(chars)};
+  }();
 
   // if the input was empty, we didn't find a delimiter at all,
   // or the first delimiter was also the last: empty output
@@ -511,9 +516,6 @@ std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source
     return make_empty_column(type_id::STRING);
   }
 
-  auto chars          = char_storage.gather(stream, mr);
-  auto global_offsets = row_offset_storage.gather(stream, mr);
-
   // insert an offset at the beginning if we started at the beginning of the input
   bool const insert_begin = first_row_offset.value_or(0) == 0;
   // insert an offset at the end if we have not terminated the last row
@@ -591,6 +593,4 @@ std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source
   return result;
 }
 
-}  // namespace text
-}  // namespace io
-}  // namespace cudf
+}  // namespace cudf::io::text

From 341e014ed22e7da1e4b8db66a1d7b6fd5fba98e9 Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Tue, 9 Jul 2024 15:47:38 -0400
Subject: [PATCH 483/842] Support `pd.read_pickle` and `pd.to_pickle` in
 `cudf.pandas` (#16105)

Closes #15459

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Matthew Roeschke (https://github.com/mroeschke)
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/16105
---
 python/cudf/cudf/pandas/_wrappers/pandas.py       | 6 ++++++
 python/cudf/cudf_pandas_tests/test_cudf_pandas.py | 7 +++++++
 2 files changed, 13 insertions(+)

diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py
index dd6f6fe76ba..3f94fc18980 100644
--- a/python/cudf/cudf/pandas/_wrappers/pandas.py
+++ b/python/cudf/cudf/pandas/_wrappers/pandas.py
@@ -919,6 +919,12 @@ def Index__new__(cls, *args, **kwargs):
 
 _eval_func = _FunctionProxy(_Unusable(), pd.eval)
 
+register_proxy_func(pd.read_pickle)(
+    _FunctionProxy(_Unusable(), pd.read_pickle)
+)
+
+register_proxy_func(pd.to_pickle)(_FunctionProxy(_Unusable(), pd.to_pickle))
+
 
 def _get_eval_locals_and_globals(level, local_dict=None, global_dict=None):
     frame = sys._getframe(level + 3)
diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
index b0aeaba3916..bc864a48e9d 100644
--- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
+++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
@@ -1080,6 +1080,13 @@ def test_pickle(obj):
 
     tm.assert_equal(obj, copy)
 
+    with tempfile.TemporaryFile() as f:
+        xpd.to_pickle(obj, f)
+        f.seek(0)
+        copy = xpd.read_pickle(f)
+
+    tm.assert_equal(obj, copy)
+
 
 def test_dataframe_query():
     cudf_pandas_df = xpd.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]})

From 7cc01befa61d7957093bf32b99b4cac1364761f7 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Tue, 9 Jul 2024 14:05:11 -0700
Subject: [PATCH 484/842] Parallelize `gpuInitStringDescriptors` for fixed
 length byte array data (#16109)

Closes #14113

This PR parallelizes the `gpuInitStringDescriptors` function for the fixed length byte array (FLBA) data at either warp or thread block level via cooperative groups. The function continues to execute serially (thread rank 0 in the group) for variable length arrays.

CC: @etseidl

Authors:
  - Muhammad Haseeb (https://github.com/mhaseeb123)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Shruti Shivakumar (https://github.com/shrshi)

URL: https://github.com/rapidsai/cudf/pull/16109
---
 cpp/src/io/parquet/decode_preprocess.cu  |  5 +-
 cpp/src/io/parquet/page_data.cu          |  7 ++-
 cpp/src/io/parquet/page_decode.cuh       | 69 +++++++++++++++---------
 cpp/src/io/parquet/page_string_decode.cu | 10 +++-
 4 files changed, 60 insertions(+), 31 deletions(-)

diff --git a/cpp/src/io/parquet/decode_preprocess.cu b/cpp/src/io/parquet/decode_preprocess.cu
index e49801e6172..62f1ee88036 100644
--- a/cpp/src/io/parquet/decode_preprocess.cu
+++ b/cpp/src/io/parquet/decode_preprocess.cu
@@ -26,6 +26,8 @@
 
 namespace cudf::io::parquet::detail {
 
+namespace cg = cooperative_groups;
+
 namespace {
 
 // # of threads we're decoding with
@@ -163,7 +165,8 @@ __device__ size_type gpuDecodeTotalPageStringSize(page_state_s* s, int t)
       // For V1, the choice is an overestimate (s->dict_size), or an exact number that's
       // expensive to compute. For now we're going with the latter.
       else {
-        str_len = gpuInitStringDescriptors<true, unused_state_buf>(s, nullptr, target_pos, t);
+        str_len = gpuInitStringDescriptors<true, unused_state_buf>(
+          s, nullptr, target_pos, cg::this_thread_block());
       }
       break;
 
diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu
index 7207173b82f..e0d50d7ccf9 100644
--- a/cpp/src/io/parquet/page_data.cu
+++ b/cpp/src/io/parquet/page_data.cu
@@ -23,6 +23,8 @@
 
 namespace cudf::io::parquet::detail {
 
+namespace cg = cooperative_groups;
+
 namespace {
 
 constexpr int decode_block_size = 128;
@@ -277,6 +279,7 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size)
     }
     // this needs to be here to prevent warp 3 modifying src_pos before all threads have read it
     __syncthreads();
+    auto const tile_warp = cg::tiled_partition<cudf::detail::warp_size>(cg::this_thread_block());
     if (t < 32) {
       // decode repetition and definition levels.
       // - update validity vectors
@@ -298,9 +301,9 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size)
         src_target_pos = gpuDecodeRleBooleans(s, sb, src_target_pos, t & 0x1f);
       } else if (s->col.physical_type == BYTE_ARRAY or
                  s->col.physical_type == FIXED_LEN_BYTE_ARRAY) {
-        gpuInitStringDescriptors<false>(s, sb, src_target_pos, t & 0x1f);
+        gpuInitStringDescriptors<false>(s, sb, src_target_pos, tile_warp);
       }
-      if (t == 32) { s->dict_pos = src_target_pos; }
+      if (tile_warp.thread_rank() == 0) { s->dict_pos = src_target_pos; }
     } else {
       // WARP1..WARP3: Decode values
       int const dtype = s->col.physical_type;
diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh
index b1f8e6dd5fe..a3f91f6859b 100644
--- a/cpp/src/io/parquet/page_decode.cuh
+++ b/cpp/src/io/parquet/page_decode.cuh
@@ -21,6 +21,7 @@
 #include "parquet_gpu.hpp"
 #include "rle_stream.cuh"
 
+#include <cooperative_groups.h>
 #include <cuda/atomic>
 #include <cuda/std/tuple>
 
@@ -420,46 +421,62 @@ inline __device__ int gpuDecodeRleBooleans(page_state_s* s, state_buf* sb, int t
  * @param[in,out] s Page state input/output
  * @param[out] sb Page state buffer output
  * @param[in] target_pos Target output position
- * @param[in] t Thread ID
+ * @param[in] g Cooperative group (thread block or tile)
  * @tparam sizes_only True if only sizes are to be calculated
  * @tparam state_buf Typename of the `state_buf` (usually inferred)
+ * @tparam thread_group Typename of the cooperative group (inferred)
  *
  * @return Total length of strings processed
  */
-template <bool sizes_only, typename state_buf>
-__device__ size_type
-gpuInitStringDescriptors(page_state_s* s, [[maybe_unused]] state_buf* sb, int target_pos, int t)
+template <bool sizes_only, typename state_buf, typename thread_group>
+__device__ size_type gpuInitStringDescriptors(page_state_s* s,
+                                              [[maybe_unused]] state_buf* sb,
+                                              int target_pos,
+                                              thread_group const& g)
 {
-  int pos       = s->dict_pos;
-  int total_len = 0;
+  int const t         = g.thread_rank();
+  int const dict_size = s->dict_size;
+  int k               = s->dict_val;
+  int pos             = s->dict_pos;
+  int total_len       = 0;
+
+  // All group threads can participate for fixed len byte arrays.
+  if (s->col.physical_type == FIXED_LEN_BYTE_ARRAY) {
+    int const dtype_len_in = s->dtype_len_in;
+    total_len              = min((target_pos - pos) * dtype_len_in, dict_size - s->dict_val);
+    if constexpr (!sizes_only) {
+      for (pos += t, k += t * dtype_len_in; pos < target_pos; pos += g.size()) {
+        sb->str_len[rolling_index<state_buf::str_buf_size>(pos)] =
+          (k < dict_size) ? dtype_len_in : 0;
+        // dict_idx is upperbounded by dict_size.
+        sb->dict_idx[rolling_index<state_buf::dict_buf_size>(pos)] = k;
+        // Increment k if needed.
+        if (k < dict_size) { k = min(k + (g.size() * dtype_len_in), dict_size); }
+      }
+    }
+    // Only thread_rank = 0 updates the s->dict_val
+    if (!t) { s->dict_val += total_len; }
+  }
+  // This step is purely serial for byte arrays
+  else {
+    if (!t) {
+      uint8_t const* cur = s->data_start;
 
-  // This step is purely serial
-  if (!t) {
-    uint8_t const* cur = s->data_start;
-    int dict_size      = s->dict_size;
-    int k              = s->dict_val;
-
-    while (pos < target_pos) {
-      int len = 0;
-      if (s->col.physical_type == FIXED_LEN_BYTE_ARRAY) {
-        if (k < dict_size) { len = s->dtype_len_in; }
-      } else {
+      for (int len = 0; pos < target_pos; pos++, len = 0) {
         if (k + 4 <= dict_size) {
           len = (cur[k]) | (cur[k + 1] << 8) | (cur[k + 2] << 16) | (cur[k + 3] << 24);
           k += 4;
           if (k + len > dict_size) { len = 0; }
         }
+        if constexpr (!sizes_only) {
+          sb->dict_idx[rolling_index<state_buf::dict_buf_size>(pos)] = k;
+          sb->str_len[rolling_index<state_buf::str_buf_size>(pos)]   = len;
+        }
+        k += len;
+        total_len += len;
       }
-      if constexpr (!sizes_only) {
-        sb->dict_idx[rolling_index<state_buf::dict_buf_size>(pos)] = k;
-        sb->str_len[rolling_index<state_buf::str_buf_size>(pos)]   = len;
-      }
-      k += len;
-      total_len += len;
-      pos++;
+      s->dict_val = k;
     }
-    s->dict_val = k;
-    __threadfence_block();
   }
 
   return total_len;
diff --git a/cpp/src/io/parquet/page_string_decode.cu b/cpp/src/io/parquet/page_string_decode.cu
index 58e8a09d5b6..ca74a1c2ba0 100644
--- a/cpp/src/io/parquet/page_string_decode.cu
+++ b/cpp/src/io/parquet/page_string_decode.cu
@@ -31,6 +31,8 @@
 
 namespace cudf::io::parquet::detail {
 
+namespace cg = cooperative_groups;
+
 namespace {
 
 constexpr int preprocess_block_size    = 512;
@@ -1006,6 +1008,10 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size)
     }
     // this needs to be here to prevent warp 1/2 modifying src_pos before all threads have read it
     __syncthreads();
+
+    // Create a warp sized thread block tile
+    auto const tile_warp = cg::tiled_partition<cudf::detail::warp_size>(cg::this_thread_block());
+
     if (t < 32) {
       // decode repetition and definition levels.
       // - update validity vectors
@@ -1020,9 +1026,9 @@ CUDF_KERNEL void __launch_bounds__(decode_block_size)
       if (s->dict_base) {
         src_target_pos = gpuDecodeDictionaryIndices<false>(s, sb, src_target_pos, lane_id).first;
       } else {
-        gpuInitStringDescriptors<false>(s, sb, src_target_pos, lane_id);
+        gpuInitStringDescriptors<false>(s, sb, src_target_pos, tile_warp);
       }
-      if (t == 32) { s->dict_pos = src_target_pos; }
+      if (tile_warp.thread_rank() == 0) { s->dict_pos = src_target_pos; }
     } else {
       int const me = t - out_thread0;
 

From 248b2de61e6b9df5fec5a15019d6db4dc52cbc01 Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Tue, 9 Jul 2024 18:50:19 -0400
Subject: [PATCH 485/842] Migrate pylibcudf lists gathering (#16170)

Apart of #15162

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Thomas Li (https://github.com/lithomas1)

URL: https://github.com/rapidsai/cudf/pull/16170
---
 .../_lib/pylibcudf/libcudf/lists/gather.pxd   |  4 +--
 python/cudf/cudf/_lib/pylibcudf/lists.pxd     |  2 ++
 python/cudf/cudf/_lib/pylibcudf/lists.pyx     | 32 +++++++++++++++++++
 .../cudf/cudf/pylibcudf_tests/test_lists.py   | 14 ++++++++
 4 files changed, 50 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/gather.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/gather.pxd
index 17b4c1877a6..ab7ed141365 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/gather.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/gather.pxd
@@ -10,6 +10,6 @@ from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport (
 
 cdef extern from "cudf/lists/gather.hpp" namespace "cudf::lists" nogil:
     cdef unique_ptr[column] segmented_gather(
-        const lists_column_view source_column,
-        const lists_column_view gather_map_list
+        const lists_column_view& source_column,
+        const lists_column_view& gather_map_list
     ) except +
diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pxd b/python/cudf/cudf/_lib/pylibcudf/lists.pxd
index c9d0a84e8ac..c9c43751a43 100644
--- a/python/cudf/cudf/_lib/pylibcudf/lists.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/lists.pxd
@@ -25,3 +25,5 @@ cpdef Column contains_nulls(Column)
 cpdef Column index_of(Column, ColumnOrScalar, bool)
 
 cpdef Column reverse(Column)
+
+cpdef Column segmented_gather(Column, Column)
diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pyx b/python/cudf/cudf/_lib/pylibcudf/lists.pyx
index 651f1346f88..9c56f1139c6 100644
--- a/python/cudf/cudf/_lib/pylibcudf/lists.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/lists.pyx
@@ -9,6 +9,7 @@ from cudf._lib.pylibcudf.libcudf.column.column cimport column
 from cudf._lib.pylibcudf.libcudf.lists cimport (
     contains as cpp_contains,
     explode as cpp_explode,
+    gather as cpp_gather,
     reverse as cpp_reverse,
 )
 from cudf._lib.pylibcudf.libcudf.lists.combine cimport (
@@ -232,3 +233,34 @@ cpdef Column reverse(Column input):
             list_view.view(),
         ))
     return Column.from_libcudf(move(c_result))
+
+
+cpdef Column segmented_gather(Column input, Column gather_map_list):
+    """Create a column with elements gathered based on the indices in gather_map_list
+
+    For details, see :cpp:func:`segmented_gather`.
+
+    Parameters
+    ----------
+    input : Column
+        The input column.
+    gather_map_list : Column
+        The indices of the lists column to gather.
+
+    Returns
+    -------
+    Column
+        A new Column with elements in list of rows
+        gathered based on gather_map_list
+    """
+
+    cdef unique_ptr[column] c_result
+    cdef ListColumnView list_view1 = input.list_view()
+    cdef ListColumnView list_view2 = gather_map_list.list_view()
+
+    with nogil:
+        c_result = move(cpp_gather.segmented_gather(
+            list_view1.view(),
+            list_view2.view(),
+        ))
+    return Column.from_libcudf(move(c_result))
diff --git a/python/cudf/cudf/pylibcudf_tests/test_lists.py b/python/cudf/cudf/pylibcudf_tests/test_lists.py
index 58a1dcf8d56..0d95579acb3 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_lists.py
+++ b/python/cudf/cudf/pylibcudf_tests/test_lists.py
@@ -146,3 +146,17 @@ def test_reverse(test_data):
     expect = pa.array([lst[::-1] for lst in list_column])
 
     assert_column_eq(expect, res)
+
+
+def test_segmented_gather(test_data):
+    list_column1 = test_data[0][0]
+    list_column2 = test_data[0][1]
+
+    plc_column1 = plc.interop.from_arrow(pa.array(list_column1))
+    plc_column2 = plc.interop.from_arrow(pa.array(list_column2))
+
+    res = plc.lists.segmented_gather(plc_column2, plc_column1)
+
+    expect = pa.array([[8, 9], [14], [0], [0, 0]])
+
+    assert_column_eq(expect, res)

From 67bd3669947da33fc56eb2b397ebbdb66223119e Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Tue, 9 Jul 2024 16:29:08 -0700
Subject: [PATCH 486/842] Support `arrow:schema` in Parquet writer to
 faithfully roundtrip `duration` types with Arrow (#15875)

Closes #15847

This PR adds the support to construct and write base64-encoded serialized `arrow:schema`-type IPC message to parquet file footer to allow faithfully roundtrip with Arrow via Parquet for `duration` type.

### Answered
- [x] Only construct and write `arrow:schema` if  asked by the user via `store_schema` argument (cudf) or `write_arrow_schema` (libcudf). i.e. Default these variables to `false` otherwise.
- [x] The internal/libcudf variable name for `store_schema` can stay `write_arrow_schema` and it should be fine. This has been done to disambiguate which schema (arrow or parquet) we are talking about.
- [x] Separate PR: `int96_timestamps` cannot be deprecated/removed in cuDF as Spark is actively using it. #15901
- [x] cuDF Parquet writer supports `decimal32` and `decimal64` [fixed types](https://github.com/rapidsai/cudf/blob/branch-24.08/cpp/src/io/parquet/writer_impl.cu#L561). These are not directly supported by Arrow so we will [convert](https://github.com/rapidsai/cudf/blob/branch-24.08/cpp/src/interop/to_arrow.cu#L155) `decimal32/decimal64` columns to `decimal128`.
- [x] `is_col_nullable()` function moved to `writer_impl_helpers.cpp` along with some other helper functions.
- [x] A common `convert_data_to_decimal128` can be separated out and used in `writer_impl.cu` and `to_arrow.cu`. Tracking in a separate issue. #16194

CC @vuule @etseidl @nvdbaranec @GregoryKimball @galipremsagar for vis.

Authors:
  - Muhammad Haseeb (https://github.com/mhaseeb123)

Approvers:
  - Thomas Li (https://github.com/lithomas1)
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Vukasin Milovanovic (https://github.com/vuule)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/15875
---
 cpp/CMakeLists.txt                            |   2 +
 cpp/include/cudf/io/parquet.hpp               |  25 ++
 cpp/src/io/functions.cpp                      |  18 +
 cpp/src/io/parquet/arrow_schema_writer.cpp    | 388 ++++++++++++++++++
 cpp/src/io/parquet/arrow_schema_writer.hpp    |  53 +++
 cpp/src/io/parquet/parquet_common.hpp         |  10 +
 cpp/src/io/parquet/reader_impl_helpers.cpp    |  15 +-
 cpp/src/io/parquet/reader_impl_helpers.hpp    |   5 +-
 cpp/src/io/parquet/writer_impl.cu             | 337 +++++++++------
 cpp/src/io/parquet/writer_impl.hpp            |   1 +
 cpp/src/io/parquet/writer_impl_helpers.cpp    | 131 ++++++
 cpp/src/io/parquet/writer_impl_helpers.hpp    |  97 +++++
 cpp/tests/io/parquet_writer_test.cpp          |  89 +++-
 python/cudf/cudf/_lib/parquet.pyx             |  11 +-
 .../_lib/pylibcudf/libcudf/io/parquet.pxd     |   5 +
 python/cudf/cudf/io/parquet.py                |  11 +
 python/cudf/cudf/tests/test_parquet.py        | 379 ++++++++++++++---
 python/cudf/cudf/utils/ioutils.py             |   6 +
 18 files changed, 1386 insertions(+), 197 deletions(-)
 create mode 100644 cpp/src/io/parquet/arrow_schema_writer.cpp
 create mode 100644 cpp/src/io/parquet/arrow_schema_writer.hpp
 create mode 100644 cpp/src/io/parquet/writer_impl_helpers.cpp
 create mode 100644 cpp/src/io/parquet/writer_impl_helpers.hpp

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 7999ada9282..903cff27be4 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -409,6 +409,7 @@ add_library(
   src/io/orc/stripe_init.cu
   src/datetime/timezone.cpp
   src/io/orc/writer_impl.cu
+  src/io/parquet/arrow_schema_writer.cpp
   src/io/parquet/compact_protocol_reader.cpp
   src/io/parquet/compact_protocol_writer.cpp
   src/io/parquet/decode_preprocess.cu
@@ -425,6 +426,7 @@ add_library(
   src/io/parquet/reader_impl_helpers.cpp
   src/io/parquet/reader_impl_preprocess.cu
   src/io/parquet/writer_impl.cu
+  src/io/parquet/writer_impl_helpers.cpp
   src/io/parquet/decode_fixed.cu
   src/io/statistics/orc_column_statistics.cu
   src/io/statistics/parquet_column_statistics.cu
diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp
index 431f14af522..4d98cae73a7 100644
--- a/cpp/include/cudf/io/parquet.hpp
+++ b/cpp/include/cudf/io/parquet.hpp
@@ -597,6 +597,8 @@ class parquet_writer_options_base {
   // Parquet writer can write timestamps as UTC
   // Defaults to true because libcudf timestamps are implicitly UTC
   bool _write_timestamps_as_UTC = true;
+  // Whether to write ARROW schema
+  bool _write_arrow_schema = false;
   // Maximum size of each row group (unless smaller than a single page)
   size_t _row_group_size_bytes = default_row_group_size_bytes;
   // Maximum number of rows in row group (unless smaller than a single page)
@@ -689,6 +691,13 @@ class parquet_writer_options_base {
    */
   [[nodiscard]] auto is_enabled_utc_timestamps() const { return _write_timestamps_as_UTC; }
 
+  /**
+   * @brief Returns `true` if arrow schema will be written
+   *
+   * @return `true` if arrow schema will be written
+   */
+  [[nodiscard]] auto is_enabled_write_arrow_schema() const { return _write_arrow_schema; }
+
   /**
    * @brief Returns maximum row group size, in bytes.
    *
@@ -824,6 +833,13 @@ class parquet_writer_options_base {
    */
   void enable_utc_timestamps(bool val);
 
+  /**
+   * @brief Sets preference for writing arrow schema. Write arrow schema if set to `true`.
+   *
+   * @param val Boolean value to enable/disable writing of arrow schema.
+   */
+  void enable_write_arrow_schema(bool val);
+
   /**
    * @brief Sets the maximum row group size, in bytes.
    *
@@ -1084,6 +1100,15 @@ class parquet_writer_options_builder_base {
    * @return this for chaining
    */
   BuilderT& utc_timestamps(bool enabled);
+
+  /**
+   * @brief Set to true if arrow schema is to be written
+   *
+   * @param enabled Boolean value to enable/disable writing of arrow schema
+   * @return this for chaining
+   */
+  BuilderT& write_arrow_schema(bool enabled);
+
   /**
    * @brief Set to true if V2 page headers are to be written.
    *
diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp
index 5daa55d4552..b4ece9cec66 100644
--- a/cpp/src/io/functions.cpp
+++ b/cpp/src/io/functions.cpp
@@ -762,6 +762,9 @@ void parquet_writer_options_base::set_compression(compression_type compression)
 
 void parquet_writer_options_base::enable_int96_timestamps(bool req)
 {
+  CUDF_EXPECTS(not req or not is_enabled_write_arrow_schema(),
+               "INT96 timestamps and arrow schema cannot be simultaneously "
+               "enabled as INT96 timestamps are deprecated in Arrow.");
   _write_timestamps_as_int96 = req;
 }
 
@@ -770,6 +773,14 @@ void parquet_writer_options_base::enable_utc_timestamps(bool val)
   _write_timestamps_as_UTC = val;
 }
 
+void parquet_writer_options_base::enable_write_arrow_schema(bool val)
+{
+  CUDF_EXPECTS(not val or not is_enabled_int96_timestamps(),
+               "arrow schema and INT96 timestamps cannot be simultaneously "
+               "enabled as INT96 timestamps are deprecated in Arrow.");
+  _write_arrow_schema = val;
+}
+
 void parquet_writer_options_base::set_row_group_size_bytes(size_t size_bytes)
 {
   CUDF_EXPECTS(
@@ -974,6 +985,13 @@ BuilderT& parquet_writer_options_builder_base<BuilderT, OptionsT>::utc_timestamp
   return static_cast<BuilderT&>(*this);
 }
 
+template <class BuilderT, class OptionsT>
+BuilderT& parquet_writer_options_builder_base<BuilderT, OptionsT>::write_arrow_schema(bool enabled)
+{
+  _options.enable_write_arrow_schema(enabled);
+  return static_cast<BuilderT&>(*this);
+}
+
 template <class BuilderT, class OptionsT>
 BuilderT& parquet_writer_options_builder_base<BuilderT, OptionsT>::write_v2_headers(bool enabled)
 {
diff --git a/cpp/src/io/parquet/arrow_schema_writer.cpp b/cpp/src/io/parquet/arrow_schema_writer.cpp
new file mode 100644
index 00000000000..ddf65e9020f
--- /dev/null
+++ b/cpp/src/io/parquet/arrow_schema_writer.cpp
@@ -0,0 +1,388 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file arrow_schema_writer.cpp
+ * @brief Arrow IPC schema writer implementation
+ */
+
+#include "arrow_schema_writer.hpp"
+
+#include "io/parquet/parquet_common.hpp"
+#include "io/utilities/base64_utilities.hpp"
+#include "ipc/Message_generated.h"
+#include "ipc/Schema_generated.h"
+#include "writer_impl_helpers.hpp"
+
+#include <cudf/detail/utilities/integer_utils.hpp>
+#include <cudf/utilities/error.hpp>
+#include <cudf/utilities/type_dispatcher.hpp>
+
+namespace cudf::io::parquet::detail {
+
+using namespace cudf::io::detail;
+
+namespace {
+
+// Copied over from arrow source for better code readability
+namespace flatbuf       = cudf::io::parquet::flatbuf;
+using FlatBufferBuilder = flatbuffers::FlatBufferBuilder;
+using DictionaryOffset  = flatbuffers::Offset<flatbuf::DictionaryEncoding>;
+using FieldOffset       = flatbuffers::Offset<flatbuf::Field>;
+using Offset            = flatbuffers::Offset<void>;
+using FBString          = flatbuffers::Offset<flatbuffers::String>;
+
+/**
+ * @brief Recursively construct the arrow schema (fields) tree
+ *
+ * @param fbb The root flatbuffer builder object instance
+ * @param column A view of the column
+ * @param column_metadata Metadata of the column
+ * @param write_mode Flag to indicate that we are guaranteeing a single table write
+ * @param utc_timestamps Flag to indicate if timestamps are UTC
+ *
+ * @return Flatbuffer offset to the constructed field
+ */
+FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb,
+                                     cudf::detail::LinkedColPtr const& column,
+                                     column_in_metadata const& column_metadata,
+                                     single_write_mode const write_mode,
+                                     bool const utc_timestamps);
+
+/**
+ * @brief Functor to convert cudf column metadata to arrow schema field metadata
+ */
+struct dispatch_to_flatbuf {
+  FlatBufferBuilder& fbb;
+  cudf::detail::LinkedColPtr const& col;
+  column_in_metadata const& col_meta;
+  single_write_mode const write_mode;
+  bool const utc_timestamps;
+  Offset& field_offset;
+  flatbuf::Type& field_type_id;
+  std::vector<FieldOffset>& children;
+
+  template <typename T>
+  std::enable_if_t<std::is_same_v<T, bool>, void> operator()()
+  {
+    field_type_id = flatbuf::Type_Bool;
+    field_offset  = flatbuf::CreateBool(fbb).Union();
+  }
+
+  template <typename T>
+  std::enable_if_t<std::is_same_v<T, int8_t>, void> operator()()
+  {
+    field_type_id = flatbuf::Type_Int;
+    field_offset  = flatbuf::CreateInt(fbb, 8, std::numeric_limits<T>::is_signed).Union();
+  }
+
+  template <typename T>
+  std::enable_if_t<std::is_same_v<T, int16_t>, void> operator()()
+  {
+    field_type_id = flatbuf::Type_Int;
+    field_offset  = flatbuf::CreateInt(fbb, 16, std::numeric_limits<T>::is_signed).Union();
+  }
+
+  template <typename T>
+  std::enable_if_t<std::is_same_v<T, int32_t>, void> operator()()
+  {
+    field_type_id = flatbuf::Type_Int;
+    field_offset  = flatbuf::CreateInt(fbb, 32, std::numeric_limits<T>::is_signed).Union();
+  }
+
+  template <typename T>
+  std::enable_if_t<std::is_same_v<T, int64_t>, void> operator()()
+  {
+    field_type_id = flatbuf::Type_Int;
+    field_offset  = flatbuf::CreateInt(fbb, 64, std::numeric_limits<T>::is_signed).Union();
+  }
+
+  template <typename T>
+  std::enable_if_t<std::is_same_v<T, uint8_t>, void> operator()()
+  {
+    field_type_id = flatbuf::Type_Int;
+    field_offset  = flatbuf::CreateInt(fbb, 8, std::numeric_limits<T>::is_signed).Union();
+  }
+
+  template <typename T>
+  std::enable_if_t<std::is_same_v<T, uint16_t>, void> operator()()
+  {
+    field_type_id = flatbuf::Type_Int;
+    field_offset  = flatbuf::CreateInt(fbb, 16, std::numeric_limits<T>::is_signed).Union();
+  }
+
+  template <typename T>
+  std::enable_if_t<std::is_same_v<T, uint32_t>, void> operator()()
+  {
+    field_type_id = flatbuf::Type_Int;
+    field_offset  = flatbuf::CreateInt(fbb, 32, std::numeric_limits<T>::is_signed).Union();
+  }
+
+  template <typename T>
+  std::enable_if_t<std::is_same_v<T, uint64_t>, void> operator()()
+  {
+    field_type_id = flatbuf::Type_Int;
+    field_offset  = flatbuf::CreateInt(fbb, 64, std::numeric_limits<T>::is_signed).Union();
+  }
+
+  template <typename T>
+  std::enable_if_t<std::is_same_v<T, float>, void> operator()()
+  {
+    field_type_id = flatbuf::Type_FloatingPoint;
+    field_offset  = flatbuf::CreateFloatingPoint(fbb, flatbuf::Precision::Precision_SINGLE).Union();
+  }
+
+  template <typename T>
+  std::enable_if_t<std::is_same_v<T, double>, void> operator()()
+  {
+    field_type_id = flatbuf::Type_FloatingPoint;
+    field_offset  = flatbuf::CreateFloatingPoint(fbb, flatbuf::Precision::Precision_DOUBLE).Union();
+  }
+
+  template <typename T>
+  std::enable_if_t<std::is_same_v<T, cudf::string_view>, void> operator()()
+  {
+    field_type_id = flatbuf::Type_Utf8View;
+    field_offset  = flatbuf::CreateUtf8View(fbb).Union();
+  }
+
+  template <typename T>
+  std::enable_if_t<std::is_same_v<T, cudf::timestamp_D>, void> operator()()
+  {
+    field_type_id = flatbuf::Type_Date;
+    // Date type (Set unit type to DAY for arrows's Date32)
+    field_offset = flatbuf::CreateDate(fbb, flatbuf::DateUnit_DAY).Union();
+  }
+
+  template <typename T>
+  std::enable_if_t<std::is_same_v<T, cudf::timestamp_s>, void> operator()()
+  {
+    field_type_id = flatbuf::Type_Timestamp;
+    // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp
+    field_offset = flatbuf::CreateTimestamp(
+                     fbb, flatbuf::TimeUnit_SECOND, (utc_timestamps) ? fbb.CreateString("UTC") : 0)
+                     .Union();
+  }
+
+  template <typename T>
+  std::enable_if_t<std::is_same_v<T, cudf::timestamp_ms>, void> operator()()
+  {
+    field_type_id = flatbuf::Type_Timestamp;
+    // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp
+    field_offset =
+      flatbuf::CreateTimestamp(
+        fbb, flatbuf::TimeUnit_MILLISECOND, (utc_timestamps) ? fbb.CreateString("UTC") : 0)
+        .Union();
+  }
+
+  template <typename T>
+  std::enable_if_t<std::is_same_v<T, cudf::timestamp_us>, void> operator()()
+  {
+    field_type_id = flatbuf::Type_Timestamp;
+    // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp
+    field_offset =
+      flatbuf::CreateTimestamp(
+        fbb, flatbuf::TimeUnit_MICROSECOND, (utc_timestamps) ? fbb.CreateString("UTC") : 0)
+        .Union();
+  }
+
+  template <typename T>
+  std::enable_if_t<std::is_same_v<T, cudf::timestamp_ns>, void> operator()()
+  {
+    field_type_id = flatbuf::Type_Timestamp;
+    // Use one of the strings: "UTC", "Etc/UTC" or "+00:00" to indicate a native UTC timestamp
+    field_offset =
+      flatbuf::CreateTimestamp(
+        fbb, flatbuf::TimeUnit_NANOSECOND, (utc_timestamps) ? fbb.CreateString("UTC") : 0)
+        .Union();
+  }
+
+  template <typename T>
+  std::enable_if_t<std::is_same_v<T, cudf::duration_D>, void> operator()()
+  {
+    // `duration_D` is written as TimeType as `duration_D` is not a valid arrow type.
+    //  This also allows for easy and faithful roundtripping with cudf.
+    field_type_id = flatbuf::Type_Time;
+    field_offset  = flatbuf::CreateTime(fbb, flatbuf::TimeUnit_MILLISECOND).Union();
+  }
+
+  template <typename T>
+  std::enable_if_t<std::is_same_v<T, cudf::duration_s>, void> operator()()
+  {
+    field_type_id = flatbuf::Type_Duration;
+    field_offset  = flatbuf::CreateDuration(fbb, flatbuf::TimeUnit_SECOND).Union();
+  }
+
+  template <typename T>
+  std::enable_if_t<std::is_same_v<T, cudf::duration_ms>, void> operator()()
+  {
+    field_type_id = flatbuf::Type_Duration;
+    field_offset  = flatbuf::CreateDuration(fbb, flatbuf::TimeUnit_MILLISECOND).Union();
+  }
+
+  template <typename T>
+  std::enable_if_t<std::is_same_v<T, cudf::duration_us>, void> operator()()
+  {
+    field_type_id = flatbuf::Type_Duration;
+    field_offset  = flatbuf::CreateDuration(fbb, flatbuf::TimeUnit_MICROSECOND).Union();
+  }
+
+  template <typename T>
+  std::enable_if_t<std::is_same_v<T, cudf::duration_ns>, void> operator()()
+  {
+    field_type_id = flatbuf::Type_Duration;
+    field_offset  = flatbuf::CreateDuration(fbb, flatbuf::TimeUnit_NANOSECOND).Union();
+  }
+
+  template <typename T>
+  std::enable_if_t<cudf::is_fixed_point<T>(), void> operator()()
+  {
+    field_type_id = flatbuf::Type_Decimal;
+    field_offset  = flatbuf::CreateDecimal(fbb,
+                                          (col_meta.is_decimal_precision_set())
+                                             ? col_meta.get_decimal_precision()
+                                             : MAX_DECIMAL128_PRECISION,
+                                          col->type().scale(),
+                                          128)
+                     .Union();
+  }
+
+  template <typename T>
+  std::enable_if_t<cudf::is_nested<T>(), void> operator()()
+  {
+    // Lists are represented differently in arrow and cuDF.
+    // cuDF representation: List<int>: "col_name" : { "list", "element:int" } (2 children)
+    // arrow schema representation: List<int>: "col_name" : { "list<item:int>" } (1 child)
+    // Hence, we only need to process the second child of the list.
+    if constexpr (std::is_same_v<T, cudf::list_view>) {
+      children.emplace_back(make_arrow_schema_fields(
+        fbb, col->children[1], col_meta.child(1), write_mode, utc_timestamps));
+      field_type_id = flatbuf::Type_List;
+      field_offset  = flatbuf::CreateList(fbb).Union();
+    }
+
+    // Traverse the struct in DFS manner and process children fields.
+    else if constexpr (std::is_same_v<T, cudf::struct_view>) {
+      std::transform(thrust::make_counting_iterator(0UL),
+                     thrust::make_counting_iterator(col->children.size()),
+                     std::back_inserter(children),
+                     [&](auto const idx) {
+                       return make_arrow_schema_fields(
+                         fbb, col->children[idx], col_meta.child(idx), write_mode, utc_timestamps);
+                     });
+      field_type_id = flatbuf::Type_Struct_;
+      field_offset  = flatbuf::CreateStruct_(fbb).Union();
+    }
+  }
+
+  template <typename T>
+  std::enable_if_t<cudf::is_dictionary<T>(), void> operator()()
+  {
+    // `dictionary32` columns are not written to parquet by cudf.
+    CUDF_FAIL("Dictionary columns are not supported for writing");
+  }
+};
+
+FieldOffset make_arrow_schema_fields(FlatBufferBuilder& fbb,
+                                     cudf::detail::LinkedColPtr const& column,
+                                     column_in_metadata const& column_metadata,
+                                     single_write_mode const write_mode,
+                                     bool const utc_timestamps)
+{
+  // Variables to be set by the dispatch_to_flatbuf functor
+  Offset field_offset         = 0;
+  flatbuf::Type field_type_id = flatbuf::Type_NONE;
+  std::vector<FieldOffset> children;
+
+  cudf::type_dispatcher(column->type(),
+                        dispatch_to_flatbuf{fbb,
+                                            column,
+                                            column_metadata,
+                                            write_mode,
+                                            utc_timestamps,
+                                            field_offset,
+                                            field_type_id,
+                                            children});
+
+  // push to field offsets vector
+  return flatbuf::CreateField(
+    fbb,
+    fbb.CreateString(column_metadata.get_name()),                    // name
+    is_output_column_nullable(column, column_metadata, write_mode),  // nullable
+    field_type_id,                                                   // type id
+    field_offset,                                                    // field offset
+    {0},                                                             // DictionaryOffset
+    fbb.CreateVector(children.data(), children.size()));             // children vector
+}
+
+}  // namespace
+
+std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector const& linked_columns,
+                                               table_input_metadata const& metadata,
+                                               single_write_mode const write_mode,
+                                               bool const utc_timestamps)
+{
+  // Lambda function to convert int32 to a string of uint8 bytes
+  auto const convert_int32_to_byte_string = [&](int32_t const value) {
+    std::array<uint8_t, sizeof(int32_t)> buffer;
+    std::memcpy(buffer.data(), &value, sizeof(int32_t));
+    return std::string(reinterpret_cast<char*>(buffer.data()), buffer.size());
+  };
+
+  // Instantiate a flatbuffer builder
+  FlatBufferBuilder fbb;
+
+  // Create an empty field offset vector and reserve space for linked columns
+  std::vector<FieldOffset> field_offsets;
+  field_offsets.reserve(linked_columns.size());
+
+  // populate field offsets (aka schema fields)
+  std::transform(thrust::make_zip_iterator(
+                   thrust::make_tuple(linked_columns.begin(), metadata.column_metadata.begin())),
+                 thrust::make_zip_iterator(
+                   thrust::make_tuple(linked_columns.end(), metadata.column_metadata.end())),
+                 std::back_inserter(field_offsets),
+                 [&](auto const& elem) {
+                   return make_arrow_schema_fields(
+                     fbb, thrust::get<0>(elem), thrust::get<1>(elem), write_mode, utc_timestamps);
+                 });
+
+  // Build an arrow:schema flatbuffer using the field offset vector and use it as the header to
+  // create an ipc message flatbuffer
+  fbb.Finish(flatbuf::CreateMessage(
+    fbb,
+    flatbuf::MetadataVersion_V5,    // Metadata version V5 (latest)
+    flatbuf::MessageHeader_Schema,  // Schema type message header
+    flatbuf::CreateSchema(fbb,
+                          flatbuf::Endianness::Endianness_Little,
+                          fbb.CreateVector(field_offsets))
+      .Union(),                                // arrow:schema built from the field vector
+    SCHEMA_HEADER_TYPE_IPC_MESSAGE_BODYLENGTH  // Body length is zero for schema type ipc message
+    ));
+
+  // Construct the final string and store it here to use its view in base64_encode
+  std::string const ipc_message =
+    convert_int32_to_byte_string(IPC_CONTINUATION_TOKEN) +
+    // Since the schema type ipc message doesn't have a body, the flatbuffer size is equal to the
+    // ipc message's metadata length
+    convert_int32_to_byte_string(fbb.GetSize()) +
+    std::string(reinterpret_cast<char*>(fbb.GetBufferPointer()), fbb.GetSize());
+
+  // Encode the final ipc message string to base64 and return
+  return cudf::io::detail::base64_encode(ipc_message);
+}
+
+}  // namespace cudf::io::parquet::detail
diff --git a/cpp/src/io/parquet/arrow_schema_writer.hpp b/cpp/src/io/parquet/arrow_schema_writer.hpp
new file mode 100644
index 00000000000..9bc435bf6c8
--- /dev/null
+++ b/cpp/src/io/parquet/arrow_schema_writer.hpp
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file arrow_schema_writer.hpp
+ * @brief Arrow IPC schema writer implementation
+ */
+
+#pragma once
+
+#include <cudf/detail/utilities/linked_column.hpp>
+#include <cudf/io/data_sink.hpp>
+#include <cudf/io/detail/parquet.hpp>
+#include <cudf/strings/detail/utilities.hpp>
+#include <cudf/types.hpp>
+
+namespace cudf::io::parquet::detail {
+
+/**
+ * @brief Construct and return arrow schema from input parquet schema
+ *
+ * Recursively traverses through parquet schema to construct the arrow schema tree.
+ * Serializes the arrow schema tree and stores it as the header (or metadata) of
+ * an otherwise empty ipc message using flatbuffers. The ipc message is then prepended
+ * with header size (padded for 16 byte alignment) and a continuation string. The final
+ * string is base64 encoded and returned.
+ *
+ * @param linked_columns Vector of table column views
+ * @param metadata Metadata of the columns of the table
+ * @param write_mode Flag to indicate that we are guaranteeing a single table write
+ * @param utc_timestamps Flag to indicate if timestamps are UTC
+ *
+ * @return The constructed arrow ipc message string
+ */
+std::string construct_arrow_schema_ipc_message(cudf::detail::LinkedColVector const& linked_columns,
+                                               table_input_metadata const& metadata,
+                                               cudf::io::detail::single_write_mode const write_mode,
+                                               bool const utc_timestamps);
+
+}  // namespace cudf::io::parquet::detail
diff --git a/cpp/src/io/parquet/parquet_common.hpp b/cpp/src/io/parquet/parquet_common.hpp
index 8507eca047e..e42c259b1bf 100644
--- a/cpp/src/io/parquet/parquet_common.hpp
+++ b/cpp/src/io/parquet/parquet_common.hpp
@@ -17,6 +17,7 @@
 #pragma once
 
 #include <cstdint>
+#include <string>
 
 namespace cudf::io::parquet::detail {
 
@@ -26,6 +27,15 @@ auto constexpr MAX_DECIMAL32_PRECISION  = 9;
 auto constexpr MAX_DECIMAL64_PRECISION  = 18;
 auto constexpr MAX_DECIMAL128_PRECISION = 38;  // log10(2^(sizeof(int128_t) * 8 - 1) - 1)
 
+// Constants copied from arrow source and renamed to match the case
+int32_t constexpr MESSAGE_DECODER_NEXT_REQUIRED_SIZE_INITIAL         = sizeof(int32_t);
+int32_t constexpr MESSAGE_DECODER_NEXT_REQUIRED_SIZE_METADATA_LENGTH = sizeof(int32_t);
+int32_t constexpr IPC_CONTINUATION_TOKEN                             = -1;
+std::string const ARROW_SCHEMA_KEY                                   = "ARROW:schema";
+
+// Schema type ipc message has zero length body
+int64_t constexpr SCHEMA_HEADER_TYPE_IPC_MESSAGE_BODYLENGTH = 0;
+
 /**
  * @brief Basic data types in Parquet, determines how data is physically stored
  */
diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp
index ebd4affd099..d1e9a823d3b 100644
--- a/cpp/src/io/parquet/reader_impl_helpers.cpp
+++ b/cpp/src/io/parquet/reader_impl_helpers.cpp
@@ -564,14 +564,14 @@ aggregate_reader_metadata::aggregate_reader_metadata(
   // Collect and apply arrow:schema from Parquet's key value metadata section
   if (use_arrow_schema) { apply_arrow_schema(); }
 
-  // Erase "ARROW:schema" from the output pfm if exists
+  // Erase ARROW_SCHEMA_KEY from the output pfm if exists
   std::for_each(
-    keyval_maps.begin(), keyval_maps.end(), [](auto& pfm) { pfm.erase("ARROW:schema"); });
+    keyval_maps.begin(), keyval_maps.end(), [](auto& pfm) { pfm.erase(ARROW_SCHEMA_KEY); });
 }
 
 arrow_schema_data_types aggregate_reader_metadata::collect_arrow_schema() const
 {
-  // Check the key_value metadata for ARROW:schema, decode and walk it
+  // Check the key_value metadata for arrow schema, decode and walk it
   // Function to convert from flatbuf::duration type to cudf::type_id
   auto const duration_from_flatbuffer = [](flatbuf::Duration const* duration) {
     // TODO: we only need this for arrow::DurationType for now. Else, we can take in a
@@ -645,9 +645,7 @@ arrow_schema_data_types aggregate_reader_metadata::collect_arrow_schema() const
       return true;
     };
 
-  // TODO: Should we check if any file has the "ARROW:schema" key
-  // Or if all files have the same "ARROW:schema"?
-  auto const it = keyval_maps[0].find("ARROW:schema");
+  auto const it = keyval_maps[0].find(ARROW_SCHEMA_KEY);
   if (it == keyval_maps[0].end()) { return {}; }
 
   // Decode the base64 encoded ipc message string
@@ -788,11 +786,6 @@ void aggregate_reader_metadata::apply_arrow_schema()
 std::optional<std::string_view> aggregate_reader_metadata::decode_ipc_message(
   std::string_view const serialized_message) const
 {
-  // Constants copied from arrow source and renamed to match the case
-  constexpr int32_t MESSAGE_DECODER_NEXT_REQUIRED_SIZE_INITIAL         = sizeof(int32_t);
-  constexpr int32_t MESSAGE_DECODER_NEXT_REQUIRED_SIZE_METADATA_LENGTH = sizeof(int32_t);
-  constexpr int32_t IPC_CONTINUATION_TOKEN                             = -1;
-
   // message buffer
   auto message_buf = serialized_message.data();
   // current message (buffer) size
diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp
index 9aeb19a7723..6bfa8519c76 100644
--- a/cpp/src/io/parquet/reader_impl_helpers.hpp
+++ b/cpp/src/io/parquet/reader_impl_helpers.hpp
@@ -117,6 +117,9 @@ struct metadata : public FileMetaData {
   void sanitize_schema();
 };
 
+/**
+ * @brief Class to extract data types from arrow schema tree
+ */
 struct arrow_schema_data_types {
   std::vector<arrow_schema_data_types> children;
   data_type type{type_id::EMPTY};
@@ -142,7 +145,7 @@ class aggregate_reader_metadata {
     const;
 
   /**
-   * @brief Decodes and constructs the arrow schema from the "ARROW:schema" IPC message
+   * @brief Decodes and constructs the arrow schema from the ARROW_SCHEMA_KEY IPC message
    * in key value metadata section of Parquet file footer
    */
   [[nodiscard]] arrow_schema_data_types collect_arrow_schema() const;
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index bed4dbc5a66..66b4fce16fe 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -19,6 +19,7 @@
  * @brief cuDF-IO parquet writer class implementation
  */
 
+#include "arrow_schema_writer.hpp"
 #include "compact_protocol_reader.hpp"
 #include "compact_protocol_writer.hpp"
 #include "io/comp/nvcomp_adapter.hpp"
@@ -30,6 +31,7 @@
 #include "parquet_common.hpp"
 #include "parquet_gpu.cuh"
 #include "writer_impl.hpp"
+#include "writer_impl_helpers.hpp"
 
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/copying.hpp>
@@ -39,9 +41,6 @@
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/lists/detail/dremel.hpp>
 #include <cudf/lists/lists_column_view.hpp>
-#include <cudf/strings/detail/utilities.hpp>
-#include <cudf/strings/strings_column_view.hpp>
-#include <cudf/structs/structs_column_view.hpp>
 #include <cudf/table/table_device_view.cuh>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -70,7 +69,8 @@ struct aggregate_writer_metadata {
                             host_span<std::map<std::string, std::string> const> kv_md,
                             host_span<SchemaElement const> tbl_schema,
                             size_type num_columns,
-                            statistics_freq stats_granularity)
+                            statistics_freq stats_granularity,
+                            std::string const arrow_schema_ipc_message)
     : version(1),
       schema(std::vector<SchemaElement>(tbl_schema.begin(), tbl_schema.end())),
       files(partitions.size())
@@ -92,6 +92,13 @@ struct aggregate_writer_metadata {
                        return KeyValue{kv.first, kv.second};
                      });
     }
+
+    // Append arrow schema to the key-value metadata
+    if (not arrow_schema_ipc_message.empty()) {
+      std::for_each(this->files.begin(), this->files.end(), [&](auto& file) {
+        file.key_value_metadata.emplace_back(KeyValue{ARROW_SCHEMA_KEY, arrow_schema_ipc_message});
+      });
+    }
   }
 
   aggregate_writer_metadata(aggregate_writer_metadata const&) = default;
@@ -182,26 +189,6 @@ struct aggregate_writer_metadata {
 
 namespace {
 
-/**
- * @brief Function that translates GDF compression to parquet compression.
- *
- * @param compression The compression type
- * @return The supported Parquet compression
- */
-Compression to_parquet_compression(compression_type compression)
-{
-  switch (compression) {
-    case compression_type::AUTO:
-    case compression_type::SNAPPY: return Compression::SNAPPY;
-    case compression_type::ZSTD: return Compression::ZSTD;
-    case compression_type::LZ4:
-      // Parquet refers to LZ4 as "LZ4_RAW"; Parquet's "LZ4" is not standard LZ4
-      return Compression::LZ4_RAW;
-    case compression_type::NONE: return Compression::UNCOMPRESSED;
-    default: CUDF_FAIL("Unsupported compression type");
-  }
-}
-
 /**
  * @brief Convert a mask of encodings to a vector.
  *
@@ -326,6 +313,7 @@ struct leaf_schema_fn {
   column_in_metadata const& col_meta;
   bool timestamp_is_int96;
   bool timestamp_is_utc;
+  bool write_arrow_schema;
 
   template <typename T>
   std::enable_if_t<std::is_same_v<T, bool>, void> operator()()
@@ -493,10 +481,11 @@ struct leaf_schema_fn {
     }
   }
 
-  //  unsupported outside cudf for parquet 1.0.
   template <typename T>
   std::enable_if_t<std::is_same_v<T, cudf::duration_D>, void> operator()()
   {
+    // duration_D is based on int32_t and not a valid arrow duration type so simply convert to
+    // time32(ms).
     col_schema.type           = Type::INT32;
     col_schema.converted_type = ConvertedType::TIME_MILLIS;
     col_schema.stats_dtype    = statistics_dtype::dtype_int32;
@@ -507,62 +496,86 @@ struct leaf_schema_fn {
   template <typename T>
   std::enable_if_t<std::is_same_v<T, cudf::duration_s>, void> operator()()
   {
-    col_schema.type           = Type::INT32;
-    col_schema.converted_type = ConvertedType::TIME_MILLIS;
-    col_schema.stats_dtype    = statistics_dtype::dtype_int32;
-    col_schema.ts_scale       = 1000;
-    col_schema.logical_type   = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}};
+    // If writing arrow schema, no logical type nor converted type is necessary
+    if (write_arrow_schema) {
+      col_schema.type        = Type::INT64;
+      col_schema.stats_dtype = statistics_dtype::dtype_int64;
+    } else {
+      // Write as Time32 logical type otherwise. Parquet TIME_MILLIS annotates INT32
+      col_schema.type           = Type::INT32;
+      col_schema.stats_dtype    = statistics_dtype::dtype_int32;
+      col_schema.converted_type = ConvertedType::TIME_MILLIS;
+      col_schema.logical_type   = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}};
+      col_schema.ts_scale       = 1000;
+    }
   }
 
   template <typename T>
   std::enable_if_t<std::is_same_v<T, cudf::duration_ms>, void> operator()()
   {
-    col_schema.type           = Type::INT32;
-    col_schema.converted_type = ConvertedType::TIME_MILLIS;
-    col_schema.stats_dtype    = statistics_dtype::dtype_int32;
-    col_schema.logical_type   = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}};
+    // If writing arrow schema, no logical type nor converted type is necessary
+    if (write_arrow_schema) {
+      col_schema.type        = Type::INT64;
+      col_schema.stats_dtype = statistics_dtype::dtype_int64;
+    } else {
+      // Write as Time32 logical type otherwise. Parquet TIME_MILLIS annotates INT32
+      col_schema.type           = Type::INT32;
+      col_schema.stats_dtype    = statistics_dtype::dtype_int32;
+      col_schema.converted_type = ConvertedType::TIME_MILLIS;
+      col_schema.logical_type   = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MILLIS}};
+    }
   }
 
   template <typename T>
   std::enable_if_t<std::is_same_v<T, cudf::duration_us>, void> operator()()
   {
-    col_schema.type           = Type::INT64;
-    col_schema.converted_type = ConvertedType::TIME_MICROS;
-    col_schema.stats_dtype    = statistics_dtype::dtype_int64;
-    col_schema.logical_type   = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MICROS}};
+    col_schema.type        = Type::INT64;
+    col_schema.stats_dtype = statistics_dtype::dtype_int64;
+    // Only write as time64 logical type if not writing arrow schema
+    if (not write_arrow_schema) {
+      col_schema.converted_type = ConvertedType::TIME_MICROS;
+      col_schema.logical_type   = LogicalType{TimeType{timestamp_is_utc, TimeUnit::MICROS}};
+    }
   }
 
-  //  unsupported outside cudf for parquet 1.0.
   template <typename T>
   std::enable_if_t<std::is_same_v<T, cudf::duration_ns>, void> operator()()
   {
-    col_schema.type         = Type::INT64;
-    col_schema.stats_dtype  = statistics_dtype::dtype_int64;
-    col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::NANOS}};
+    col_schema.type        = Type::INT64;
+    col_schema.stats_dtype = statistics_dtype::dtype_int64;
+    // Only write as time64 logical type if not writing arrow schema
+    if (not write_arrow_schema) {
+      col_schema.logical_type = LogicalType{TimeType{timestamp_is_utc, TimeUnit::NANOS}};
+    }
   }
 
   template <typename T>
   std::enable_if_t<cudf::is_fixed_point<T>(), void> operator()()
   {
-    if (std::is_same_v<T, numeric::decimal32>) {
-      col_schema.type              = Type::INT32;
-      col_schema.stats_dtype       = statistics_dtype::dtype_int32;
-      col_schema.decimal_precision = MAX_DECIMAL32_PRECISION;
-      col_schema.logical_type      = LogicalType{DecimalType{0, MAX_DECIMAL32_PRECISION}};
-    } else if (std::is_same_v<T, numeric::decimal64>) {
-      col_schema.type              = Type::INT64;
-      col_schema.stats_dtype       = statistics_dtype::dtype_decimal64;
-      col_schema.decimal_precision = MAX_DECIMAL64_PRECISION;
-      col_schema.logical_type      = LogicalType{DecimalType{0, MAX_DECIMAL64_PRECISION}};
-    } else if (std::is_same_v<T, numeric::decimal128>) {
+    // If writing arrow schema, then convert d32 and d64 to d128
+    if (write_arrow_schema or std::is_same_v<T, numeric::decimal128>) {
       col_schema.type              = Type::FIXED_LEN_BYTE_ARRAY;
       col_schema.type_length       = sizeof(__int128_t);
       col_schema.stats_dtype       = statistics_dtype::dtype_decimal128;
       col_schema.decimal_precision = MAX_DECIMAL128_PRECISION;
       col_schema.logical_type      = LogicalType{DecimalType{0, MAX_DECIMAL128_PRECISION}};
     } else {
-      CUDF_FAIL("Unsupported fixed point type for parquet writer");
+      if (std::is_same_v<T, numeric::decimal32>) {
+        col_schema.type              = Type::INT32;
+        col_schema.stats_dtype       = statistics_dtype::dtype_int32;
+        col_schema.decimal_precision = MAX_DECIMAL32_PRECISION;
+        col_schema.logical_type      = LogicalType{DecimalType{0, MAX_DECIMAL32_PRECISION}};
+      } else if (std::is_same_v<T, numeric::decimal64>) {
+        col_schema.type              = Type::INT64;
+        col_schema.stats_dtype       = statistics_dtype::dtype_decimal64;
+        col_schema.decimal_precision = MAX_DECIMAL64_PRECISION;
+        col_schema.logical_type      = LogicalType{DecimalType{0, MAX_DECIMAL64_PRECISION}};
+      } else {
+        CUDF_FAIL("Unsupported fixed point type for parquet writer");
+      }
     }
+
+    // Write logical and converted types, decimal scale and precision
     col_schema.converted_type = ConvertedType::DECIMAL;
     col_schema.decimal_scale = -col->type().scale();  // parquet and cudf disagree about scale signs
     col_schema.logical_type->decimal_type->scale = -col->type().scale();
@@ -590,33 +603,19 @@ struct leaf_schema_fn {
   }
 };
 
-inline bool is_col_nullable(cudf::detail::LinkedColPtr const& col,
-                            column_in_metadata const& col_meta,
-                            single_write_mode write_mode)
-{
-  if (col_meta.is_nullability_defined()) {
-    CUDF_EXPECTS(col_meta.nullable() or col->null_count() == 0,
-                 "Mismatch in metadata prescribed nullability and input column. "
-                 "Metadata for input column with nulls cannot prescribe nullability = false");
-    return col_meta.nullable();
-  }
-  // For chunked write, when not provided nullability, we assume the worst case scenario
-  // that all columns are nullable.
-  return write_mode == single_write_mode::NO or col->nullable();
-}
-
 /**
  * @brief Construct schema from input columns and per-column input options
  *
  * Recursively traverses through linked_columns and corresponding metadata to construct schema tree.
  * The resulting schema tree is stored in a vector in pre-order traversal order.
  */
-std::vector<schema_tree_node> construct_schema_tree(
+std::vector<schema_tree_node> construct_parquet_schema_tree(
   cudf::detail::LinkedColVector const& linked_columns,
   table_input_metadata& metadata,
   single_write_mode write_mode,
   bool int96_timestamps,
-  bool utc_timestamps)
+  bool utc_timestamps,
+  bool write_arrow_schema)
 {
   std::vector<schema_tree_node> schema;
   schema_tree_node root{};
@@ -629,7 +628,7 @@ std::vector<schema_tree_node> construct_schema_tree(
 
   std::function<void(cudf::detail::LinkedColPtr const&, column_in_metadata&, size_t)> add_schema =
     [&](cudf::detail::LinkedColPtr const& col, column_in_metadata& col_meta, size_t parent_idx) {
-      bool const col_nullable = is_col_nullable(col, col_meta, write_mode);
+      bool const col_nullable = is_output_column_nullable(col, col_meta, write_mode);
 
       auto set_field_id = [&schema, parent_idx](schema_tree_node& s,
                                                 column_in_metadata const& col_meta) {
@@ -854,7 +853,7 @@ std::vector<schema_tree_node> construct_schema_tree(
         right_child_meta.set_name("value");
         // check the repetition type of key is required i.e. the col should be non-nullable
         auto key_col = col->children[lists_column_view::child_column_index]->children[0];
-        CUDF_EXPECTS(!is_col_nullable(key_col, left_child_meta, write_mode),
+        CUDF_EXPECTS(!is_output_column_nullable(key_col, left_child_meta, write_mode),
                      "key column cannot be nullable. For chunked writing, explicitly set the "
                      "nullability to false in metadata");
         // process key
@@ -886,7 +885,8 @@ std::vector<schema_tree_node> construct_schema_tree(
 
         cudf::type_dispatcher(
           col->type(),
-          leaf_schema_fn{col_schema, col, col_meta, timestamp_is_int96, utc_timestamps});
+          leaf_schema_fn{
+            col_schema, col, col_meta, timestamp_is_int96, utc_timestamps, write_arrow_schema});
 
         col_schema.repetition_type = col_nullable ? OPTIONAL : REQUIRED;
         col_schema.name = (schema[parent_idx].name == "list") ? "element" : col_meta.get_name();
@@ -1148,7 +1148,6 @@ void calculate_page_fragments(device_span<PageFragment> frag,
  *
  * @param frag_stats output statistics
  * @param frags Input page fragments
- * @param int96_timestamps Flag to indicate if timestamps will be written as INT96
  * @param stream CUDA stream used for device memory operations and kernel launches
  */
 void gather_fragment_statistics(device_span<statistics_chunk> frag_stats,
@@ -1164,32 +1163,6 @@ void gather_fragment_statistics(device_span<statistics_chunk> frag_stats,
   stream.synchronize();
 }
 
-auto to_nvcomp_compression_type(Compression codec)
-{
-  if (codec == Compression::SNAPPY) return nvcomp::compression_type::SNAPPY;
-  if (codec == Compression::ZSTD) return nvcomp::compression_type::ZSTD;
-  // Parquet refers to LZ4 as "LZ4_RAW"; Parquet's "LZ4" is not standard LZ4
-  if (codec == Compression::LZ4_RAW) return nvcomp::compression_type::LZ4;
-  CUDF_FAIL("Unsupported compression type");
-}
-
-auto page_alignment(Compression codec)
-{
-  if (codec == Compression::UNCOMPRESSED or
-      nvcomp::is_compression_disabled(to_nvcomp_compression_type(codec))) {
-    return 1u;
-  }
-
-  return 1u << nvcomp::compress_input_alignment_bits(to_nvcomp_compression_type(codec));
-}
-
-size_t max_compression_output_size(Compression codec, uint32_t compression_blocksize)
-{
-  if (codec == Compression::UNCOMPRESSED) return 0;
-
-  return compress_max_output_chunk_size(to_nvcomp_compression_type(codec), compression_blocksize);
-}
-
 auto init_page_sizes(hostdevice_2dvector<EncColumnChunk>& chunks,
                      device_span<parquet_column_device_view const> col_desc,
                      uint32_t num_columns,
@@ -1629,23 +1602,127 @@ size_t column_index_buffer_size(EncColumnChunk* ck,
 }
 
 /**
- * @brief Fill the table metadata with default column names.
+ * @brief Convert decimal32 and decimal64 data to decimal128 and return the device vector
  *
- * @param table_meta The table metadata to fill
+ * @tparam DecimalType to convert from
+ *
+ * @param column A view of the input columns
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ *
+ * @return A device vector containing the converted decimal128 data
  */
-void fill_table_meta(std::unique_ptr<table_input_metadata> const& table_meta)
+template <typename DecimalType>
+rmm::device_uvector<__int128_t> convert_data_to_decimal128(column_view const& column,
+                                                           rmm::cuda_stream_view stream)
 {
-  // Fill unnamed columns' names in table_meta
-  std::function<void(column_in_metadata&, std::string)> add_default_name =
-    [&](column_in_metadata& col_meta, std::string default_name) {
-      if (col_meta.get_name().empty()) col_meta.set_name(default_name);
-      for (size_type i = 0; i < col_meta.num_children(); ++i) {
-        add_default_name(col_meta.child(i), col_meta.get_name() + "_" + std::to_string(i));
-      }
-    };
-  for (size_t i = 0; i < table_meta->column_metadata.size(); ++i) {
-    add_default_name(table_meta->column_metadata[i], "_col" + std::to_string(i));
-  }
+  size_type constexpr BIT_WIDTH_RATIO = sizeof(__int128_t) / sizeof(DecimalType);
+
+  rmm::device_uvector<__int128_t> d128_buffer(column.size(), stream);
+
+  thrust::for_each(rmm::exec_policy_nosync(stream),
+                   thrust::make_counting_iterator(0),
+                   thrust::make_counting_iterator(column.size()),
+                   [in  = column.begin<DecimalType>(),
+                    out = reinterpret_cast<DecimalType*>(d128_buffer.data()),
+                    BIT_WIDTH_RATIO] __device__(auto in_idx) {
+                     auto const out_idx = in_idx * BIT_WIDTH_RATIO;
+                     // The lowest order bits are the value, the remainder
+                     // simply matches the sign bit to satisfy the two's
+                     // complement integer representation of negative numbers.
+                     out[out_idx] = in[in_idx];
+#pragma unroll BIT_WIDTH_RATIO - 1
+                     for (auto i = 1; i < BIT_WIDTH_RATIO; ++i) {
+                       out[out_idx + i] = in[in_idx] < 0 ? -1 : 0;
+                     }
+                   });
+
+  return d128_buffer;
+}
+
+/**
+ * @brief Function to convert decimal32 and decimal64 columns to decimal128 data,
+ *        update the input table metadata, and return a new vector of column views.
+ *
+ * @param[in,out] table_meta The table metadata
+ * @param[in,out] d128_vectors Vector containing the computed decimal128 data buffers.
+ * @param input The input table
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ *
+ * @return A device vector containing the converted decimal128 data
+ */
+std::vector<column_view> convert_decimal_columns_and_metadata(
+  table_input_metadata& table_meta,
+  std::vector<rmm::device_uvector<__int128_t>>& d128_vectors,
+  table_view const& table,
+  rmm::cuda_stream_view stream)
+{
+  // Lambda function to convert each decimal32/decimal64 column to decimal128.
+  std::function<column_view(column_view, column_in_metadata&)> convert_column =
+    [&](column_view column, column_in_metadata& metadata) -> column_view {
+    // Vector of passable-by-reference children column views
+    std::vector<column_view> converted_children;
+
+    // Process children column views first
+    std::transform(
+      thrust::make_counting_iterator(0),
+      thrust::make_counting_iterator(column.num_children()),
+      std::back_inserter(converted_children),
+      [&](auto const idx) { return convert_column(column.child(idx), metadata.child(idx)); });
+
+    // Process this column view. Only convert if decimal32 and decimal64 column.
+    switch (column.type().id()) {
+      case type_id::DECIMAL32:
+        // Convert data to decimal128 type
+        d128_vectors.emplace_back(convert_data_to_decimal128<int32_t>(column, stream));
+        // Update metadata
+        metadata.set_decimal_precision(MAX_DECIMAL32_PRECISION);
+        metadata.set_type_length(size_of(data_type{type_id::DECIMAL128, column.type().scale()}));
+        // Create a new column view from the d128 data vector
+        return {data_type{type_id::DECIMAL128, column.type().scale()},
+                column.size(),
+                d128_vectors.back().data(),
+                column.null_mask(),
+                column.null_count(),
+                column.offset(),
+                converted_children};
+      case type_id::DECIMAL64:
+        // Convert data to decimal128 type
+        d128_vectors.emplace_back(convert_data_to_decimal128<int64_t>(column, stream));
+        // Update metadata
+        metadata.set_decimal_precision(MAX_DECIMAL64_PRECISION);
+        metadata.set_type_length(size_of(data_type{type_id::DECIMAL128, column.type().scale()}));
+        // Create a new column view from the d128 data vector
+        return {data_type{type_id::DECIMAL128, column.type().scale()},
+                column.size(),
+                d128_vectors.back().data(),
+                column.null_mask(),
+                column.null_count(),
+                column.offset(),
+                converted_children};
+      default:
+        // Update the children vector keeping everything else the same
+        return {column.type(),
+                column.size(),
+                column.head(),
+                column.null_mask(),
+                column.null_count(),
+                column.offset(),
+                converted_children};
+    }
+  };
+
+  // Vector of converted column views
+  std::vector<column_view> converted_column_views;
+
+  // Convert each column view
+  std::transform(
+    thrust::make_zip_iterator(
+      thrust::make_tuple(table.begin(), table_meta.column_metadata.begin())),
+    thrust::make_zip_iterator(thrust::make_tuple(table.end(), table_meta.column_metadata.end())),
+    std::back_inserter(converted_column_views),
+    [&](auto elem) { return convert_column(thrust::get<0>(elem), thrust::get<1>(elem)); });
+
+  return converted_column_views;
 }
 
 /**
@@ -1698,12 +1775,22 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
                                    bool int96_timestamps,
                                    bool utc_timestamps,
                                    bool write_v2_headers,
+                                   bool write_arrow_schema,
                                    host_span<std::unique_ptr<data_sink> const> out_sink,
                                    rmm::cuda_stream_view stream)
 {
-  auto vec = table_to_linked_columns(input);
-  auto schema_tree =
-    construct_schema_tree(vec, table_meta, write_mode, int96_timestamps, utc_timestamps);
+  // Container to store decimal128 converted data if needed
+  std::vector<rmm::device_uvector<__int128_t>> d128_vectors;
+
+  // Convert decimal32/decimal64 data to decimal128 if writing arrow schema
+  // and initialize LinkedColVector
+  auto vec = table_to_linked_columns(
+    (write_arrow_schema)
+      ? table_view({convert_decimal_columns_and_metadata(table_meta, d128_vectors, input, stream)})
+      : input);
+
+  auto schema_tree = construct_parquet_schema_tree(
+    vec, table_meta, write_mode, int96_timestamps, utc_timestamps, write_arrow_schema);
   // Construct parquet_column_views from the schema tree leaf nodes.
   std::vector<parquet_column_view> parquet_columns;
 
@@ -1826,7 +1913,14 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
   std::unique_ptr<aggregate_writer_metadata> agg_meta;
   if (!curr_agg_meta) {
     agg_meta = std::make_unique<aggregate_writer_metadata>(
-      partitions, kv_meta, this_table_schema, num_columns, stats_granularity);
+      partitions,
+      kv_meta,
+      this_table_schema,
+      num_columns,
+      stats_granularity,
+      (write_arrow_schema)
+        ? construct_arrow_schema_ipc_message(vec, table_meta, write_mode, utc_timestamps)
+        : "");
   } else {
     agg_meta = std::make_unique<aggregate_writer_metadata>(*curr_agg_meta);
 
@@ -2307,6 +2401,7 @@ writer::impl::impl(std::vector<std::unique_ptr<data_sink>> sinks,
     _int96_timestamps(options.is_enabled_int96_timestamps()),
     _utc_timestamps(options.is_enabled_utc_timestamps()),
     _write_v2_headers(options.is_enabled_write_v2_headers()),
+    _write_arrow_schema(options.is_enabled_write_arrow_schema()),
     _sorting_columns(options.get_sorting_columns()),
     _column_index_truncate_length(options.get_column_index_truncate_length()),
     _kv_meta(options.get_key_value_metadata()),
@@ -2337,6 +2432,7 @@ writer::impl::impl(std::vector<std::unique_ptr<data_sink>> sinks,
     _int96_timestamps(options.is_enabled_int96_timestamps()),
     _utc_timestamps(options.is_enabled_utc_timestamps()),
     _write_v2_headers(options.is_enabled_write_v2_headers()),
+    _write_arrow_schema(options.is_enabled_write_arrow_schema()),
     _sorting_columns(options.get_sorting_columns()),
     _column_index_truncate_length(options.get_column_index_truncate_length()),
     _kv_meta(options.get_key_value_metadata()),
@@ -2378,7 +2474,7 @@ void writer::impl::write(table_view const& input, std::vector<partition_info> co
   CUDF_EXPECTS(not _closed, "Data has already been flushed to out and closed");
 
   if (not _table_meta) { _table_meta = std::make_unique<table_input_metadata>(input); }
-  fill_table_meta(_table_meta);
+  fill_table_meta(*_table_meta);
 
   // All kinds of memory allocation and data compressions/encoding are performed here.
   // If any error occurs, such as out-of-memory exception, the internal state of the current
@@ -2415,6 +2511,7 @@ void writer::impl::write(table_view const& input, std::vector<partition_info> co
                                            _int96_timestamps,
                                            _utc_timestamps,
                                            _write_v2_headers,
+                                           _write_arrow_schema,
                                            _out_sink,
                                            _stream);
     } catch (...) {  // catch any exception type
diff --git a/cpp/src/io/parquet/writer_impl.hpp b/cpp/src/io/parquet/writer_impl.hpp
index 784f78f06d5..63128faf993 100644
--- a/cpp/src/io/parquet/writer_impl.hpp
+++ b/cpp/src/io/parquet/writer_impl.hpp
@@ -156,6 +156,7 @@ class writer::impl {
   bool const _int96_timestamps;
   bool const _utc_timestamps;
   bool const _write_v2_headers;
+  bool const _write_arrow_schema;
   std::optional<std::vector<sorting_column>> _sorting_columns;
   int32_t const _column_index_truncate_length;
   std::vector<std::map<std::string, std::string>> const _kv_meta;  // Optional user metadata.
diff --git a/cpp/src/io/parquet/writer_impl_helpers.cpp b/cpp/src/io/parquet/writer_impl_helpers.cpp
new file mode 100644
index 00000000000..e2f09f872d3
--- /dev/null
+++ b/cpp/src/io/parquet/writer_impl_helpers.cpp
@@ -0,0 +1,131 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file writer_impl_helpers.cpp
+ * @brief Helper function implementation for Parquet writer
+ */
+
+#include "writer_impl_helpers.hpp"
+
+#include <cudf/lists/lists_column_view.hpp>
+#include <cudf/strings/detail/utilities.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf/structs/structs_column_view.hpp>
+
+namespace cudf::io::parquet::detail {
+
+using namespace cudf::io::detail;
+
+Compression to_parquet_compression(compression_type compression)
+{
+  switch (compression) {
+    case compression_type::AUTO:
+    case compression_type::SNAPPY: return Compression::SNAPPY;
+    case compression_type::ZSTD: return Compression::ZSTD;
+    case compression_type::LZ4:
+      // Parquet refers to LZ4 as "LZ4_RAW"; Parquet's "LZ4" is not standard LZ4
+      return Compression::LZ4_RAW;
+    case compression_type::NONE: return Compression::UNCOMPRESSED;
+    default: CUDF_FAIL("Unsupported compression type");
+  }
+}
+
+nvcomp::compression_type to_nvcomp_compression_type(Compression codec)
+{
+  switch (codec) {
+    case Compression::SNAPPY: return nvcomp::compression_type::SNAPPY;
+    case Compression::ZSTD: return nvcomp::compression_type::ZSTD;
+    // Parquet refers to LZ4 as "LZ4_RAW"; Parquet's "LZ4" is not standard LZ4
+    case Compression::LZ4_RAW: return nvcomp::compression_type::LZ4;
+    default: CUDF_FAIL("Unsupported compression type");
+  }
+}
+
+uint32_t page_alignment(Compression codec)
+{
+  if (codec == Compression::UNCOMPRESSED or
+      nvcomp::is_compression_disabled(to_nvcomp_compression_type(codec))) {
+    return 1u;
+  }
+
+  return 1u << nvcomp::compress_input_alignment_bits(to_nvcomp_compression_type(codec));
+}
+
+size_t max_compression_output_size(Compression codec, uint32_t compression_blocksize)
+{
+  if (codec == Compression::UNCOMPRESSED) return 0;
+
+  return compress_max_output_chunk_size(to_nvcomp_compression_type(codec), compression_blocksize);
+}
+
+void fill_table_meta(table_input_metadata& table_meta)
+{
+  // Fill unnamed columns' names in table_meta
+  std::function<void(column_in_metadata&, std::string)> add_default_name =
+    [&](column_in_metadata& col_meta, std::string default_name) {
+      if (col_meta.get_name().empty()) col_meta.set_name(default_name);
+      for (size_type i = 0; i < col_meta.num_children(); ++i) {
+        add_default_name(col_meta.child(i), col_meta.get_name() + "_" + std::to_string(i));
+      }
+    };
+  for (size_t i = 0; i < table_meta.column_metadata.size(); ++i) {
+    add_default_name(table_meta.column_metadata[i], "_col" + std::to_string(i));
+  }
+}
+
+[[nodiscard]] size_t column_size(column_view const& column, rmm::cuda_stream_view stream)
+{
+  if (column.is_empty()) { return 0; }
+
+  if (is_fixed_width(column.type())) {
+    return size_of(column.type()) * column.size();
+  } else if (column.type().id() == type_id::STRING) {
+    auto const scol = strings_column_view(column);
+    return cudf::strings::detail::get_offset_value(
+             scol.offsets(), column.size() + column.offset(), stream) -
+           cudf::strings::detail::get_offset_value(scol.offsets(), column.offset(), stream);
+  } else if (column.type().id() == type_id::STRUCT) {
+    auto const scol = structs_column_view(column);
+    size_t ret      = 0;
+    for (int i = 0; i < scol.num_children(); i++) {
+      ret += column_size(scol.get_sliced_child(i, stream), stream);
+    }
+    return ret;
+  } else if (column.type().id() == type_id::LIST) {
+    auto const lcol = lists_column_view(column);
+    return column_size(lcol.get_sliced_child(stream), stream);
+  }
+
+  CUDF_FAIL("Unexpected compound type");
+}
+
+[[nodiscard]] bool is_output_column_nullable(cudf::detail::LinkedColPtr const& column,
+                                             column_in_metadata const& column_metadata,
+                                             single_write_mode write_mode)
+{
+  if (column_metadata.is_nullability_defined()) {
+    CUDF_EXPECTS(column_metadata.nullable() or column->null_count() == 0,
+                 "Mismatch in metadata prescribed nullability and input column. "
+                 "Metadata for input column with nulls cannot prescribe nullability = false");
+    return column_metadata.nullable();
+  }
+  // For chunked write, when not provided nullability, we assume the worst case scenario
+  // that all columns are nullable.
+  return write_mode == single_write_mode::NO or column->nullable();
+}
+
+}  // namespace cudf::io::parquet::detail
diff --git a/cpp/src/io/parquet/writer_impl_helpers.hpp b/cpp/src/io/parquet/writer_impl_helpers.hpp
new file mode 100644
index 00000000000..a85411594e9
--- /dev/null
+++ b/cpp/src/io/parquet/writer_impl_helpers.hpp
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * @file writer_impl_helpers.hpp
+ * @brief Helper function implementation for Parquet writer
+ */
+
+#pragma once
+#include "io/comp/nvcomp_adapter.hpp"
+#include "parquet_common.hpp"
+
+#include <cudf/detail/utilities/linked_column.hpp>
+#include <cudf/io/detail/parquet.hpp>
+
+namespace cudf::io::parquet::detail {
+
+/**
+ * @brief Function that translates GDF compression to parquet compression.
+ *
+ * @param compression The compression type
+ * @return The supported Parquet compression
+ */
+Compression to_parquet_compression(compression_type compression);
+
+/**
+ * @brief Function that translates the given compression codec to nvcomp compression type.
+ *
+ * @param codec Compression codec
+ * @return Translated nvcomp compression type
+ */
+nvcomp::compression_type to_nvcomp_compression_type(Compression codec);
+
+/**
+ * @brief Function that computes input alignment requirements for the given compression type.
+ *
+ * @param codec Compression codec
+ * @return Required alignment
+ */
+uint32_t page_alignment(Compression codec);
+
+/**
+ * @brief Gets the maximum compressed chunk size for the largest chunk uncompressed chunk in the
+ *        batch.
+ *
+ * @param codec Compression codec
+ * @param compression_blocksize Size of the largest uncompressed chunk in the batch
+ * @return Maximum compressed chunk size
+ */
+size_t max_compression_output_size(Compression codec, uint32_t compression_blocksize);
+
+/**
+ * @brief Fill the table metadata with default column names.
+ *
+ * @param table_meta The table metadata to fill
+ */
+void fill_table_meta(table_input_metadata& table_meta);
+
+/**
+ * @brief Compute size (in bytes) of the data stored in the given column.
+ *
+ * @param column The input column
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @return The data size of the input
+ */
+[[nodiscard]] size_t column_size(column_view const& column, rmm::cuda_stream_view stream);
+
+/**
+ * @brief Indicates if the column should be marked as nullable in the output schema
+ *
+ * Returns `true` if the input column is nullable or if the write mode is not set to
+ * write the table all at once instead of chunked.
+ *
+ * @param column A view of the (linked) column
+ * @param column_metadata Metadata of the column
+ * @param write_mode Flag to indicate that we are guaranteeing a single table write
+ *
+ * @return Whether the column is nullable.
+ */
+[[nodiscard]] bool is_output_column_nullable(cudf::detail::LinkedColPtr const& column,
+                                             column_in_metadata const& column_metadata,
+                                             ::cudf::io::detail::single_write_mode write_mode);
+
+}  // namespace cudf::io::parquet::detail
diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp
index a1f4c7b81d8..e07ebe25322 100644
--- a/cpp/tests/io/parquet_writer_test.cpp
+++ b/cpp/tests/io/parquet_writer_test.cpp
@@ -35,7 +35,7 @@
 using cudf::test::iterators::no_nulls;
 
 template <typename mask_op_t>
-void test_durations(mask_op_t mask_op, bool use_byte_stream_split)
+void test_durations(mask_op_t mask_op, bool use_byte_stream_split, bool arrow_schema)
 {
   std::default_random_engine generator;
   std::uniform_int_distribution<int> distribution_d(0, 30);
@@ -76,20 +76,27 @@ void test_durations(mask_op_t mask_op, bool use_byte_stream_split)
 
   auto filepath = temp_env->get_temp_filepath("Durations.parquet");
   cudf::io::parquet_writer_options out_opts =
-    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected);
+    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected)
+      .write_arrow_schema(arrow_schema);
+
   cudf::io::write_parquet(out_opts);
 
   cudf::io::parquet_reader_options in_opts =
-    cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath});
+    cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath})
+      .use_arrow_schema(arrow_schema);
   auto result = cudf::io::read_parquet(in_opts);
 
   auto durations_d_got =
     cudf::cast(result.tbl->view().column(0), cudf::data_type{cudf::type_id::DURATION_DAYS});
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_d, durations_d_got->view());
 
-  auto durations_s_got =
-    cudf::cast(result.tbl->view().column(1), cudf::data_type{cudf::type_id::DURATION_SECONDS});
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_s, durations_s_got->view());
+  if (arrow_schema) {
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_s, result.tbl->view().column(1));
+  } else {
+    auto durations_s_got =
+      cudf::cast(result.tbl->view().column(1), cudf::data_type{cudf::type_id::DURATION_SECONDS});
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_s, durations_s_got->view());
+  }
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_ms, result.tbl->view().column(2));
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(durations_us, result.tbl->view().column(3));
@@ -98,10 +105,15 @@ void test_durations(mask_op_t mask_op, bool use_byte_stream_split)
 
 TEST_F(ParquetWriterTest, Durations)
 {
-  test_durations([](auto i) { return true; }, false);
-  test_durations([](auto i) { return (i % 2) != 0; }, false);
-  test_durations([](auto i) { return (i % 3) != 0; }, false);
-  test_durations([](auto i) { return false; }, false);
+  test_durations([](auto i) { return true; }, false, false);
+  test_durations([](auto i) { return (i % 2) != 0; }, false, false);
+  test_durations([](auto i) { return (i % 3) != 0; }, false, false);
+  test_durations([](auto i) { return false; }, false, false);
+
+  test_durations([](auto i) { return true; }, false, true);
+  test_durations([](auto i) { return (i % 2) != 0; }, false, true);
+  test_durations([](auto i) { return (i % 3) != 0; }, false, true);
+  test_durations([](auto i) { return false; }, false, true);
 }
 
 TEST_F(ParquetWriterTest, MultiIndex)
@@ -493,6 +505,50 @@ TEST_F(ParquetWriterTest, DecimalWrite)
   CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, table);
 }
 
+TEST_F(ParquetWriterTest, DecimalWriteWithArrowSchema)
+{
+  constexpr cudf::size_type num_rows = 500;
+  auto seq_col0                      = random_values<int32_t>(num_rows);
+  auto seq_col1                      = random_values<int64_t>(num_rows);
+
+  auto valids =
+    cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; });
+
+  auto col0 = cudf::test::fixed_point_column_wrapper<int32_t>{
+    seq_col0.begin(), seq_col0.end(), valids, numeric::scale_type{5}};
+  auto col1 = cudf::test::fixed_point_column_wrapper<int64_t>{
+    seq_col1.begin(), seq_col1.end(), valids, numeric::scale_type{-9}};
+
+  auto table = table_view({col0, col1});
+
+  auto filepath = temp_env->get_temp_filepath("DecimalWriteWithArrowSchema.parquet");
+  cudf::io::parquet_writer_options args =
+    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, table)
+      .write_arrow_schema(true);
+
+  cudf::io::table_input_metadata expected_metadata(table);
+  // verify success if equal precision is given
+  expected_metadata.column_metadata[0].set_decimal_precision(
+    cudf::io::parquet::detail::MAX_DECIMAL32_PRECISION);
+  expected_metadata.column_metadata[1].set_decimal_precision(
+    cudf::io::parquet::detail::MAX_DECIMAL64_PRECISION);
+  args.set_metadata(std::move(expected_metadata));
+  cudf::io::write_parquet(args);
+
+  auto expected_col0 = cudf::test::fixed_point_column_wrapper<__int128_t>{
+    seq_col0.begin(), seq_col0.end(), valids, numeric::scale_type{5}};
+  auto expected_col1 = cudf::test::fixed_point_column_wrapper<__int128_t>{
+    seq_col1.begin(), seq_col1.end(), valids, numeric::scale_type{-9}};
+
+  auto expected_table = table_view({expected_col0, expected_col1});
+
+  cudf::io::parquet_reader_options read_opts =
+    cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath});
+  auto result = cudf::io::read_parquet(read_opts);
+
+  CUDF_TEST_EXPECT_TABLES_EQUAL(*result.tbl, expected_table);
+}
+
 TEST_F(ParquetWriterTest, RowGroupSizeInvalid)
 {
   auto const unused_table = std::make_unique<table>();
@@ -1935,10 +1991,15 @@ TEST_F(ParquetWriterTest, DecimalByteStreamSplit)
 
 TEST_F(ParquetWriterTest, DurationByteStreamSplit)
 {
-  test_durations([](auto i) { return true; }, true);
-  test_durations([](auto i) { return (i % 2) != 0; }, true);
-  test_durations([](auto i) { return (i % 3) != 0; }, true);
-  test_durations([](auto i) { return false; }, true);
+  test_durations([](auto i) { return true; }, true, false);
+  test_durations([](auto i) { return (i % 2) != 0; }, true, false);
+  test_durations([](auto i) { return (i % 3) != 0; }, true, false);
+  test_durations([](auto i) { return false; }, true, false);
+
+  test_durations([](auto i) { return true; }, true, true);
+  test_durations([](auto i) { return (i % 2) != 0; }, true, true);
+  test_durations([](auto i) { return (i % 3) != 0; }, true, true);
+  test_durations([](auto i) { return false; }, true, true);
 }
 
 TEST_F(ParquetWriterTest, WriteFixedLenByteArray)
diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
index d1ec5be9e62..158fb6051c3 100644
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ b/python/cudf/cudf/_lib/parquet.pyx
@@ -440,6 +440,7 @@ def write_parquet(
     object column_encoding=None,
     object column_type_length=None,
     object output_as_binary=None,
+    write_arrow_schema=False,
 ):
     """
     Cython function to call into libcudf API, see `write_parquet`.
@@ -544,6 +545,7 @@ def write_parquet(
         .write_v2_headers(header_version == "2.0")
         .dictionary_policy(dict_policy)
         .utc_timestamps(False)
+        .write_arrow_schema(write_arrow_schema)
         .build()
     )
     if partitions_info is not None:
@@ -623,6 +625,9 @@ cdef class ParquetWriter:
         If ``True``, enable dictionary encoding for Parquet page data
         subject to ``max_dictionary_size`` constraints.
         If ``False``, disable dictionary encoding for Parquet page data.
+    store_schema : bool, default False
+        If ``True``, enable computing and writing arrow schema to Parquet
+        file footer's key-value metadata section for faithful round-tripping.
     See Also
     --------
     cudf.io.parquet.write_parquet
@@ -641,6 +646,7 @@ cdef class ParquetWriter:
     cdef size_type max_page_size_rows
     cdef size_t max_dictionary_size
     cdef cudf_io_types.dictionary_policy dict_policy
+    cdef bool write_arrow_schema
 
     def __cinit__(self, object filepath_or_buffer, object index=None,
                   object compression="snappy", str statistics="ROWGROUP",
@@ -649,7 +655,8 @@ cdef class ParquetWriter:
                   int max_page_size_bytes=524288,
                   int max_page_size_rows=20000,
                   int max_dictionary_size=1048576,
-                  bool use_dictionary=True):
+                  bool use_dictionary=True,
+                  bool store_schema=False):
         filepaths_or_buffers = (
             list(filepath_or_buffer)
             if is_list_like(filepath_or_buffer)
@@ -670,6 +677,7 @@ cdef class ParquetWriter:
             if use_dictionary
             else cudf_io_types.dictionary_policy.NEVER
         )
+        self.write_arrow_schema = store_schema
 
     def write_table(self, table, object partitions_info=None):
         """ Writes a single table to the file """
@@ -788,6 +796,7 @@ cdef class ParquetWriter:
                 .max_page_size_bytes(self.max_page_size_bytes)
                 .max_page_size_rows(self.max_page_size_rows)
                 .max_dictionary_size(self.max_dictionary_size)
+                .write_arrow_schema(self.write_arrow_schema)
                 .build()
             )
             args.set_dictionary_policy(self.dict_policy)
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd
index 0ef6553db56..c38f39f7749 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd
@@ -78,6 +78,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
         size_t get_max_page_size_bytes() except +
         size_type get_max_page_size_rows() except +
         size_t get_max_dictionary_size() except +
+        bool is_enabled_write_arrow_schema() except +
 
         void set_metadata(
             cudf_io_types.table_input_metadata m
@@ -103,6 +104,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
         void set_max_page_size_rows(size_type val) except +
         void set_max_dictionary_size(size_t val) except +
         void enable_write_v2_headers(bool val) except +
+        void enable_write_arrow_schema(bool val) except +
         void set_dictionary_policy(cudf_io_types.dictionary_policy policy) except +
 
     cdef cppclass parquet_writer_options(parquet_writer_options_base):
@@ -143,6 +145,9 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
         BuilderT& utc_timestamps(
             bool enabled
         ) except +
+        BuilderT& write_arrow_schema(
+            bool enabled
+        ) except +
         BuilderT& row_group_size_bytes(
             size_t val
         ) except +
diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
index 7733e770d99..fd0792b5edb 100644
--- a/python/cudf/cudf/io/parquet.py
+++ b/python/cudf/cudf/io/parquet.py
@@ -73,6 +73,7 @@ def _write_parquet(
     column_encoding=None,
     column_type_length=None,
     output_as_binary=None,
+    write_arrow_schema=True,
 ):
     if is_list_like(paths) and len(paths) > 1:
         if partitions_info is None:
@@ -110,6 +111,7 @@ def _write_parquet(
         "column_encoding": column_encoding,
         "column_type_length": column_type_length,
         "output_as_binary": output_as_binary,
+        "write_arrow_schema": write_arrow_schema,
     }
     if all(ioutils.is_fsspec_open_file(buf) for buf in paths_or_bufs):
         with ExitStack() as stack:
@@ -154,6 +156,7 @@ def write_to_dataset(
     column_encoding=None,
     column_type_length=None,
     output_as_binary=None,
+    store_schema=False,
 ):
     """Wraps `to_parquet` to write partitioned Parquet datasets.
     For each combination of partition group and value,
@@ -242,6 +245,9 @@ def write_to_dataset(
     output_as_binary : set, optional, default None
         If a column name is present in the set, that column will be output as
         unannotated binary, rather than the default 'UTF-8'.
+    store_schema : bool, default False
+        If ``True``, enable computing and writing arrow schema to Parquet
+        file footer's key-value metadata section for faithful round-tripping.
     """
 
     fs = ioutils._ensure_filesystem(fs, root_path, storage_options)
@@ -285,6 +291,7 @@ def write_to_dataset(
             column_encoding=column_encoding,
             column_type_length=column_type_length,
             output_as_binary=output_as_binary,
+            store_schema=store_schema,
         )
 
     else:
@@ -312,6 +319,7 @@ def write_to_dataset(
             column_encoding=column_encoding,
             column_type_length=column_type_length,
             output_as_binary=output_as_binary,
+            store_schema=store_schema,
         )
 
     return metadata
@@ -968,6 +976,7 @@ def to_parquet(
     column_encoding=None,
     column_type_length=None,
     output_as_binary=None,
+    store_schema=False,
     *args,
     **kwargs,
 ):
@@ -1023,6 +1032,7 @@ def to_parquet(
                 column_encoding=column_encoding,
                 column_type_length=column_type_length,
                 output_as_binary=output_as_binary,
+                store_schema=store_schema,
             )
 
         partition_info = (
@@ -1055,6 +1065,7 @@ def to_parquet(
             column_encoding=column_encoding,
             column_type_length=column_type_length,
             output_as_binary=output_as_binary,
+            write_arrow_schema=store_schema,
         )
 
     else:
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index 588bc87d268..ff0c9040737 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -1617,7 +1617,11 @@ def test_parquet_writer_int96_timestamps(tmpdir, pdf, gdf):
     assert_eq(pdf, gdf)
 
     # Write out the gdf using the GPU accelerated writer with INT96 timestamps
-    gdf.to_parquet(gdf_fname.strpath, index=None, int96_timestamps=True)
+    gdf.to_parquet(
+        gdf_fname.strpath,
+        index=None,
+        int96_timestamps=True,
+    )
 
     assert os.path.exists(gdf_fname)
 
@@ -1789,10 +1793,11 @@ def test_parquet_write_bytes_io(simple_gdf):
     assert_eq(cudf.read_parquet(output), simple_gdf)
 
 
-def test_parquet_writer_bytes_io(simple_gdf):
+@pytest.mark.parametrize("store_schema", [True, False])
+def test_parquet_writer_bytes_io(simple_gdf, store_schema):
     output = BytesIO()
 
-    writer = ParquetWriter(output)
+    writer = ParquetWriter(output, store_schema=store_schema)
     writer.write_table(simple_gdf)
     writer.write_table(simple_gdf)
     writer.close()
@@ -2124,7 +2129,8 @@ def test_parquet_writer_chunked_partitioned_context(tmpdir_factory):
 
 
 @pytest.mark.parametrize("cols", [None, ["b"]])
-def test_parquet_write_to_dataset(tmpdir_factory, cols):
+@pytest.mark.parametrize("store_schema", [True, False])
+def test_parquet_write_to_dataset(tmpdir_factory, cols, store_schema):
     dir1 = tmpdir_factory.mktemp("dir1")
     dir2 = tmpdir_factory.mktemp("dir2")
     if cols is None:
@@ -2140,7 +2146,7 @@ def test_parquet_write_to_dataset(tmpdir_factory, cols):
             "b": np.random.choice(np.arange(4), size=size),
         }
     )
-    gdf.to_parquet(dir1, partition_cols=cols)
+    gdf.to_parquet(dir1, partition_cols=cols, store_schema=store_schema)
     cudf.io.write_to_dataset(gdf, dir2, partition_cols=cols)
 
     # Read back with cudf
@@ -2156,7 +2162,7 @@ def test_parquet_write_to_dataset(tmpdir_factory, cols):
         }
     )
     with pytest.raises(ValueError):
-        gdf.to_parquet(dir1, partition_cols=cols)
+        gdf.to_parquet(dir1, partition_cols=cols, store_schema=store_schema)
 
 
 @pytest.mark.parametrize(
@@ -2386,7 +2392,8 @@ def test_parquet_writer_list_large_mixed(tmpdir):
     assert_eq(expect, got)
 
 
-def test_parquet_writer_list_chunked(tmpdir):
+@pytest.mark.parametrize("store_schema", [True, False])
+def test_parquet_writer_list_chunked(tmpdir, store_schema):
     table1 = cudf.DataFrame(
         {
             "a": list_gen(string_gen, 128, 80, 50),
@@ -2407,7 +2414,7 @@ def test_parquet_writer_list_chunked(tmpdir):
     expect = cudf.concat([table1, table2])
     expect = expect.reset_index(drop=True)
 
-    writer = ParquetWriter(fname)
+    writer = ParquetWriter(fname, store_schema=store_schema)
     writer.write_table(table1)
     writer.write_table(table2)
     writer.close()
@@ -2542,6 +2549,10 @@ def normalized_equals(value1, value2):
         value1 = None
     if value2 is pd.NA or value2 is pd.NaT:
         value2 = None
+    if isinstance(value1, np.datetime64):
+        value1 = pd.Timestamp(value1).to_pydatetime()
+    if isinstance(value2, np.datetime64):
+        value2 = pd.Timestamp(value2).to_pydatetime()
     if isinstance(value1, pd.Timestamp):
         value1 = value1.to_pydatetime()
     if isinstance(value2, pd.Timestamp):
@@ -2550,6 +2561,9 @@ def normalized_equals(value1, value2):
         value1 = value1.replace(tzinfo=None)
     if isinstance(value2, datetime.datetime):
         value2 = value2.replace(tzinfo=None)
+    if isinstance(value1, pd.Timedelta):
+        unit = "ms" if value1.unit == "s" else value1.unit
+        value2 = pd.Timedelta(value2, unit=unit)
 
     # if one is datetime then both values are datetimes now
     if isinstance(value1, datetime.datetime):
@@ -2563,7 +2577,8 @@ def normalized_equals(value1, value2):
 
 
 @pytest.mark.parametrize("add_nulls", [True, False])
-def test_parquet_writer_statistics(tmpdir, pdf, add_nulls):
+@pytest.mark.parametrize("store_schema", [True, False])
+def test_parquet_writer_statistics(tmpdir, pdf, add_nulls, store_schema):
     file_path = tmpdir.join("cudf.parquet")
     if "col_category" in pdf.columns:
         pdf = pdf.drop(columns=["col_category", "col_bool"])
@@ -2580,7 +2595,7 @@ def test_parquet_writer_statistics(tmpdir, pdf, add_nulls):
     if add_nulls:
         for col in gdf:
             set_random_null_mask_inplace(gdf[col])
-    gdf.to_parquet(file_path, index=False)
+    gdf.to_parquet(file_path, index=False, store_schema=store_schema)
 
     # Read back from pyarrow
     pq_file = pq.ParquetFile(file_path)
@@ -3205,7 +3220,8 @@ def test_parquet_writer_zstd():
         assert_eq(expected, got)
 
 
-def test_parquet_writer_time_delta_physical_type():
+@pytest.mark.parametrize("store_schema", [True, False])
+def test_parquet_writer_time_delta_physical_type(store_schema):
     df = cudf.DataFrame(
         {
             "s": cudf.Series([1], dtype="timedelta64[s]"),
@@ -3217,22 +3233,35 @@ def test_parquet_writer_time_delta_physical_type():
         }
     )
     buffer = BytesIO()
-    df.to_parquet(buffer)
+    df.to_parquet(buffer, store_schema=store_schema)
 
     got = pd.read_parquet(buffer)
-    expected = pd.DataFrame(
-        {
-            "s": ["00:00:01"],
-            "ms": ["00:00:00.002000"],
-            "us": ["00:00:00.000003"],
-            "ns": ["00:00:00.000004"],
-        },
-        dtype="str",
-    )
+
+    if store_schema:
+        expected = pd.DataFrame(
+            {
+                "s": ["0 days 00:00:01"],
+                "ms": ["0 days 00:00:00.002000"],
+                "us": ["0 days 00:00:00.000003"],
+                "ns": ["0 days 00:00:00.000004"],
+            },
+            dtype="str",
+        )
+    else:
+        expected = pd.DataFrame(
+            {
+                "s": ["00:00:01"],
+                "ms": ["00:00:00.002000"],
+                "us": ["00:00:00.000003"],
+                "ns": ["00:00:00.000004"],
+            },
+            dtype="str",
+        )
     assert_eq(got.astype("str"), expected)
 
 
-def test_parquet_roundtrip_time_delta():
+@pytest.mark.parametrize("store_schema", [True, False])
+def test_parquet_roundtrip_time_delta(store_schema):
     num_rows = 12345
     df = cudf.DataFrame(
         {
@@ -3255,10 +3284,11 @@ def test_parquet_roundtrip_time_delta():
         }
     )
     buffer = BytesIO()
-    df.to_parquet(buffer)
-    # TODO: Remove `check_dtype` once following issue is fixed in arrow:
-    # https://github.com/apache/arrow/issues/33321
+    df.to_parquet(buffer, store_schema=store_schema)
+    # `check_dtype` cannot be removed here as timedelta64[s] will change to `timedelta[ms]`
     assert_eq(df, cudf.read_parquet(buffer), check_dtype=False)
+    if store_schema:
+        assert_eq(df, pd.read_parquet(buffer))
 
 
 def test_parquet_reader_malformed_file(datadir):
@@ -3420,35 +3450,87 @@ def test_parquet_reader_roundtrip_with_arrow_schema():
     # Check results for reader with schema
     assert_eq(expected, got)
 
+    # Reset buffer
+    buffer = BytesIO()
 
-def test_parquet_reader_roundtrip_structs_with_arrow_schema():
-    # Ensure that the structs with duration types are faithfully being
-    # roundtripped across Parquet with arrow schema
-    pdf = pd.DataFrame(
-        {
-            "struct": {
-                "payload": {
-                    "Domain": {
-                        "Name": "abc",
-                        "Id": {"Name": "host", "Value": "127.0.0.8"},
-                        "Duration": datetime.timedelta(minutes=12),
-                    },
-                    "StreamId": "12345678",
-                    "Duration": datetime.timedelta(minutes=4),
-                    "Offset": None,
-                    "Resource": [
-                        {
-                            "Name": "ZoneName",
-                            "Value": "RAPIDS",
-                            "Duration": datetime.timedelta(seconds=1),
-                        }
-                    ],
+    # Write to buffer with cudf
+    expected.to_parquet(buffer, store_schema=True)
+
+    # Read parquet with arrow schema
+    got = cudf.read_parquet(buffer)
+    # Convert to cudf table for an apple to apple comparison
+    expected = cudf.from_pandas(pdf)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        # struct
+        [
+            {"a": 1, "b": 2},
+            {"a": 10, "b": 20},
+            {"a": None, "b": 22},
+            {"a": None, "b": None},
+            {"a": 15, "b": None},
+        ],
+        # struct-of-list
+        [
+            {"a": 1, "b": 2, "c": [1, 2, 3]},
+            {"a": 10, "b": 20, "c": [4, 5]},
+            {"a": None, "b": 22, "c": [6]},
+            {"a": None, "b": None, "c": None},
+            {"a": 15, "b": None, "c": [-1, -2]},
+            None,
+            {"a": 100, "b": 200, "c": [-10, None, -20]},
+        ],
+        # list-of-struct
+        [
+            [{"a": 1, "b": 2}, {"a": 2, "b": 3}, {"a": 4, "b": 5}],
+            None,
+            [{"a": 10, "b": 20}],
+            [{"a": 100, "b": 200}, {"a": None, "b": 300}, None],
+        ],
+        # struct-of-struct
+        [
+            {"a": 1, "b": {"inner_a": 10, "inner_b": 20}, "c": 2},
+            {"a": 3, "b": {"inner_a": 30, "inner_b": 40}, "c": 4},
+            {"a": 5, "b": {"inner_a": 50, "inner_b": None}, "c": 6},
+            {"a": 7, "b": None, "c": 8},
+            {"a": None, "b": {"inner_a": None, "inner_b": None}, "c": None},
+            None,
+            {"a": None, "b": {"inner_a": None, "inner_b": 100}, "c": 10},
+        ],
+        # struct-with-mixed-types
+        [
+            {
+                "struct": {
+                    "payload": {
+                        "Domain": {
+                            "Name": "abc",
+                            "Id": {"Name": "host", "Value": "127.0.0.8"},
+                            "Duration": datetime.timedelta(minutes=12),
+                        },
+                        "StreamId": "12345678",
+                        "Duration": datetime.timedelta(minutes=4),
+                        "Offset": None,
+                        "Resource": [
+                            {
+                                "Name": "ZoneName",
+                                "Value": "RAPIDS",
+                                "Duration": datetime.timedelta(seconds=1),
+                            }
+                        ],
+                    }
                 }
             }
-        }
-    )
+        ],
+    ],
+)
+def test_parquet_reader_roundtrip_structs_with_arrow_schema(tmpdir, data):
+    # Ensure that the structs with duration types are faithfully being
+    # roundtripped across Parquet with arrow schema
+    pdf = pd.DataFrame({"struct": pd.Series(data)})
 
-    # Reset the buffer and write parquet with arrow
     buffer = BytesIO()
     pdf.to_parquet(buffer, engine="pyarrow")
 
@@ -3460,6 +3542,203 @@ def test_parquet_reader_roundtrip_structs_with_arrow_schema():
     # Check results
     assert_eq(expected, got)
 
+    # Reset buffer
+    buffer = BytesIO()
+
+    # Write to buffer with cudf
+    expected.to_parquet(buffer, store_schema=True)
+
+    # Read parquet with arrow schema
+    got = cudf.read_parquet(buffer)
+    # Convert to cudf table for an apple to apple comparison
+    expected = cudf.from_pandas(pdf)
+
+    # Check results
+    assert_eq(expected, got)
+
+
+@pytest.mark.parametrize("index", [None, True, False])
+def test_parquet_writer_roundtrip_with_arrow_schema(index):
+    # Ensure that the concrete and nested types are faithfully being roundtripped
+    # across Parquet with arrow schema
+    expected = cudf.DataFrame(
+        {
+            "s": cudf.Series([None, None, None], dtype="timedelta64[s]"),
+            "us": cudf.Series([None, 3456, None], dtype="timedelta64[us]"),
+            "duration_list": list(
+                [
+                    [
+                        datetime.timedelta(minutes=7, seconds=4),
+                        datetime.timedelta(minutes=7),
+                    ],
+                    [
+                        None,
+                        None,
+                    ],
+                    [
+                        datetime.timedelta(minutes=7, seconds=4),
+                        None,
+                    ],
+                ]
+            ),
+            "int64": cudf.Series([-1234, 123, 4123], dtype="int64"),
+            "uint32": cudf.Series([1234, 123, 4123], dtype="uint32"),
+            "list": list([[1, 2], [1, 2], [1, 2]]),
+            "bool": cudf.Series([True, None, False], dtype=bool),
+            "fixed32": cudf.Series([0.00, 1.0, None]).astype(
+                cudf.Decimal32Dtype(7, 2)
+            ),
+            "fixed64": cudf.Series([0.00, 1.0, None]).astype(
+                cudf.Decimal64Dtype(7, 2)
+            ),
+            "fixed128": cudf.Series([0.00, 1.0, None]).astype(
+                cudf.Decimal128Dtype(7, 2)
+            ),
+            "datetime": cudf.Series([1234, 123, 4123], dtype="datetime64[ms]"),
+            "map": cudf.Series(["cat", "dog", "lion"]).map(
+                {"cat": "kitten", "dog": "puppy", "lion": "cub"}
+            ),
+        }
+    )
+
+    # Write to Parquet with arrow schema for faithful roundtrip
+    buffer = BytesIO()
+    expected.to_parquet(buffer, store_schema=True, index=index)
+
+    # Convert decimal types to d128
+    expected = expected.astype({"fixed32": cudf.Decimal128Dtype(9, 2)})
+    expected = expected.astype({"fixed64": cudf.Decimal128Dtype(18, 2)})
+
+    # Read parquet with pyarrow, pandas and cudf readers
+    got = cudf.DataFrame.from_arrow(pq.read_table(buffer))
+    got2 = cudf.DataFrame.from_pandas(pd.read_parquet(buffer))
+    got3 = cudf.read_parquet(buffer)
+
+    # drop the index column for comparison: __index_level_0__
+    if index:
+        got.drop(columns="__index_level_0__", inplace=True)
+        got2.drop(columns="__index_level_0__", inplace=True)
+
+    # Check results
+    assert_eq(expected, got)
+    assert_eq(expected, got2)
+    assert_eq(expected, got3)
+
+
+def test_parquet_writer_int96_timestamps_and_arrow_schema():
+    df = cudf.DataFrame(
+        {
+            "timestamp": cudf.Series(
+                [1234, 123, 4123], dtype="datetime64[ms]"
+            ),
+        }
+    )
+
+    # Output buffer
+    buffer = BytesIO()
+
+    # Writing out parquet with both INT96 timestamps and arrow_schema
+    # enabled should throw an exception.
+    with pytest.raises(RuntimeError):
+        df.to_parquet(buffer, int96_timestamps=True, store_schema=True)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        # struct
+        [
+            {"a": 1, "b": 2},
+            {"a": 10, "b": 20},
+            {"a": None, "b": 22},
+            {"a": None, "b": None},
+            {"a": 15, "b": None},
+        ],
+        # struct-of-list
+        [
+            {"a": 1, "b": 2, "c": [1, 2, 3]},
+            {"a": 10, "b": 20, "c": [4, 5]},
+            {"a": None, "b": 22, "c": [6]},
+            {"a": None, "b": None, "c": None},
+            {"a": 15, "b": None, "c": [-1, -2]},
+            None,
+            {"a": 100, "b": 200, "c": [-10, None, -20]},
+        ],
+        # list-of-struct
+        [
+            [{"a": 1, "b": 2}, {"a": 2, "b": 3}, {"a": 4, "b": 5}],
+            None,
+            [{"a": 10, "b": 20}],
+            [{"a": 100, "b": 200}, {"a": None, "b": 300}, None],
+        ],
+        # struct-of-struct
+        [
+            {"a": 1, "b": {"inner_a": 10, "inner_b": 20}, "c": 2},
+            {"a": 3, "b": {"inner_a": 30, "inner_b": 40}, "c": 4},
+            {"a": 5, "b": {"inner_a": 50, "inner_b": None}, "c": 6},
+            {"a": 7, "b": None, "c": 8},
+            {"a": None, "b": {"inner_a": None, "inner_b": None}, "c": None},
+            None,
+            {"a": None, "b": {"inner_a": None, "inner_b": 100}, "c": 10},
+        ],
+        # struct-with-mixed-types
+        [
+            {
+                "struct": {
+                    "payload": {
+                        "Domain": {
+                            "Name": "abc",
+                            "Id": {"Name": "host", "Value": "127.0.0.8"},
+                            "Duration": datetime.timedelta(minutes=12),
+                        },
+                        "StreamId": "12345678",
+                        "Duration": datetime.timedelta(minutes=4),
+                        "Offset": None,
+                        "Resource": [
+                            {
+                                "Name": "ZoneName",
+                                "Value": "RAPIDS",
+                                "Duration": datetime.timedelta(seconds=1),
+                            }
+                        ],
+                    }
+                }
+            }
+        ],
+    ],
+)
+@pytest.mark.parametrize("index", [None, True, False])
+def test_parquet_writer_roundtrip_structs_with_arrow_schema(
+    tmpdir, data, index
+):
+    # Ensure that the structs are faithfully being roundtripped across
+    # Parquet with arrow schema
+    pa_expected = pa.Table.from_pydict({"struct": data})
+
+    expected = cudf.DataFrame.from_arrow(pa_expected)
+
+    # Write expected data frame to Parquet with arrow schema
+    buffer = BytesIO()
+    expected.to_parquet(buffer, store_schema=True, index=index)
+
+    # Read Parquet with pyarrow
+    pa_got = pq.read_table(buffer)
+
+    # drop the index column for comparison: __index_level_0__
+    if index:
+        pa_got = pa_got.drop(columns="__index_level_0__")
+
+    # Check results
+    assert_eq(pa_expected, pa_got)
+
+    # Convert to cuDF table and also read Parquet with cuDF reader
+    got = cudf.DataFrame.from_arrow(pa_got)
+    got2 = cudf.read_parquet(buffer)
+
+    # Check results
+    assert_eq(expected, got)
+    assert_eq(expected, got2)
+
 
 @pytest.mark.parametrize("chunk_read_limit", [0, 240, 1024000000])
 @pytest.mark.parametrize("pass_read_limit", [0, 240, 1024000000])
diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
index 0209c692935..76c7f2bfdb8 100644
--- a/python/cudf/cudf/utils/ioutils.py
+++ b/python/cudf/cudf/utils/ioutils.py
@@ -322,6 +322,12 @@
 output_as_binary : set, optional, default None
     If a column name is present in the set, that column will be output as
     unannotated binary, rather than the default 'UTF-8'.
+store_schema : bool, default False
+    If ``True``, writes arrow schema to Parquet file footer's key-value
+    metadata section to faithfully round-trip ``duration`` types with arrow.
+    This cannot be used with ``int96_timestamps`` enabled as int96 timestamps
+    are deprecated in arrow. Also, all decimal32 and decimal64 columns will be
+    converted to decimal128 as arrow only supports decimal128 and decimal256 types.
 **kwargs
     Additional parameters will be passed to execution engines other
     than ``cudf``.

From f592e9c4bfcc2d8e887ad5f96e5167ee0ee2c73a Mon Sep 17 00:00:00 2001
From: Srinivas Yadav <43375352+srinivasyadav18@users.noreply.github.com>
Date: Tue, 9 Jul 2024 21:00:57 -0700
Subject: [PATCH 487/842] Add groupby_max multi-threaded benchmark (#16154)

This PR adds **groupby_max** multi-threaded benchmark. The benchmark runs multiple **max groupby aggregations** concurrently using one CUDA stream per host thread.

Closes #16134

Authors:
  - Srinivas Yadav (https://github.com/srinivasyadav18)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: https://github.com/rapidsai/cudf/pull/16154
---
 cpp/benchmarks/CMakeLists.txt                 |   4 +-
 cpp/benchmarks/groupby/group_max.cpp          |  16 ++-
 .../groupby/group_max_multithreaded.cpp       | 102 ++++++++++++++++++
 3 files changed, 115 insertions(+), 7 deletions(-)
 create mode 100644 cpp/benchmarks/groupby/group_max_multithreaded.cpp

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index a5b248135c1..ff431c7f260 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -231,8 +231,8 @@ ConfigureBench(
 )
 
 ConfigureNVBench(
-  GROUPBY_NVBENCH groupby/group_max.cpp groupby/group_nunique.cpp groupby/group_rank.cpp
-  groupby/group_struct_keys.cpp
+  GROUPBY_NVBENCH groupby/group_max.cpp groupby/group_max_multithreaded.cpp
+  groupby/group_nunique.cpp groupby/group_rank.cpp groupby/group_struct_keys.cpp
 )
 
 # ##################################################################################################
diff --git a/cpp/benchmarks/groupby/group_max.cpp b/cpp/benchmarks/groupby/group_max.cpp
index 01ca23ebbf8..f41285008c4 100644
--- a/cpp/benchmarks/groupby/group_max.cpp
+++ b/cpp/benchmarks/groupby/group_max.cpp
@@ -48,20 +48,25 @@ void groupby_max_helper(nvbench::state& state,
       cudf::type_to_id<Type>(), row_count{num_rows}, data_profile{builder});
   }();
 
+  auto const num_aggregations = state.get_int64("num_aggregations");
+
   auto keys_view = keys->view();
   auto gb_obj    = cudf::groupby::groupby(cudf::table_view({keys_view, keys_view, keys_view}));
 
   std::vector<cudf::groupby::aggregation_request> requests;
-  requests.emplace_back(cudf::groupby::aggregation_request());
-  requests[0].values = vals->view();
-  requests[0].aggregations.push_back(cudf::make_max_aggregation<cudf::groupby_aggregation>());
+  for (int64_t i = 0; i < num_aggregations; i++) {
+    requests.emplace_back(cudf::groupby::aggregation_request());
+    requests[i].values = vals->view();
+    requests[i].aggregations.push_back(cudf::make_max_aggregation<cudf::groupby_aggregation>());
+  }
 
   auto const mem_stats_logger = cudf::memory_stats_logger();
   state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
   state.exec(nvbench::exec_tag::sync,
              [&](nvbench::launch& launch) { auto const result = gb_obj.aggregate(requests); });
   auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
-  state.add_element_count(static_cast<double>(num_rows) / elapsed_time / 1'000'000., "Mrows/s");
+  state.add_element_count(
+    static_cast<double>(num_rows * num_aggregations) / elapsed_time / 1'000'000., "Mrows/s");
   state.add_buffer_size(
     mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
 }
@@ -91,7 +96,8 @@ NVBENCH_BENCH_TYPES(bench_groupby_max,
   .set_name("groupby_max")
   .add_int64_axis("cardinality", {0})
   .add_int64_power_of_two_axis("num_rows", {12, 18, 24})
-  .add_float64_axis("null_probability", {0, 0.1, 0.9});
+  .add_float64_axis("null_probability", {0, 0.1, 0.9})
+  .add_int64_axis("num_aggregations", {1, 2, 4, 8, 16, 32});
 
 NVBENCH_BENCH_TYPES(bench_groupby_max_cardinality, NVBENCH_TYPE_AXES(nvbench::type_list<int32_t>))
   .set_name("groupby_max_cardinality")
diff --git a/cpp/benchmarks/groupby/group_max_multithreaded.cpp b/cpp/benchmarks/groupby/group_max_multithreaded.cpp
new file mode 100644
index 00000000000..3b8faba618f
--- /dev/null
+++ b/cpp/benchmarks/groupby/group_max_multithreaded.cpp
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+
+#include <cudf/detail/utilities/stream_pool.hpp>
+#include <cudf/groupby.hpp>
+#include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/thread_pool.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+template <typename Type>
+void bench_groupby_max_multithreaded(nvbench::state& state, nvbench::type_list<Type>)
+{
+  auto const cardinality      = static_cast<cudf::size_type>(state.get_int64("cardinality"));
+  auto const num_rows         = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const null_probability = state.get_float64("null_probability");
+  auto const num_threads      = state.get_int64("num_threads");
+  auto const num_aggregations = state.get_int64("num_aggregations");
+
+  auto const keys = [&] {
+    data_profile const profile =
+      data_profile_builder()
+        .cardinality(cardinality)
+        .no_validity()
+        .distribution(cudf::type_to_id<int32_t>(), distribution_id::UNIFORM, 0, num_rows);
+    return create_random_column(cudf::type_to_id<int32_t>(), row_count{num_rows}, profile);
+  }();
+
+  auto const vals = [&] {
+    auto builder = data_profile_builder().cardinality(0).distribution(
+      cudf::type_to_id<Type>(), distribution_id::UNIFORM, 0, num_rows);
+    if (null_probability > 0) {
+      builder.null_probability(null_probability);
+    } else {
+      builder.no_validity();
+    }
+    return create_random_column(
+      cudf::type_to_id<Type>(), row_count{num_rows}, data_profile{builder});
+  }();
+
+  auto keys_view = keys->view();
+  auto gb_obj    = cudf::groupby::groupby(cudf::table_view({keys_view, keys_view, keys_view}));
+
+  auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads);
+  cudf::detail::thread_pool threads(num_threads);
+
+  std::vector<std::vector<cudf::groupby::aggregation_request>> requests(num_threads);
+  for (auto& thread_requests : requests) {
+    for (int64_t j = 0; j < num_aggregations; j++) {
+      thread_requests.emplace_back();
+      thread_requests.back().values = vals->view();
+      thread_requests.back().aggregations.push_back(
+        cudf::make_max_aggregation<cudf::groupby_aggregation>());
+    }
+  }
+
+  auto const mem_stats_logger = cudf::memory_stats_logger();
+  state.exec(
+    nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) {
+      auto perform_agg = [&](int64_t index) { gb_obj.aggregate(requests[index], streams[index]); };
+      timer.start();
+      for (int64_t i = 0; i < num_threads; ++i) {
+        threads.submit(perform_agg, i);
+      }
+      threads.wait_for_tasks();
+      cudf::detail::join_streams(streams, cudf::get_default_stream());
+      cudf::get_default_stream().synchronize();
+      timer.stop();
+    });
+
+  auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
+  state.add_element_count(
+    static_cast<double>(num_rows * num_threads * num_aggregations) / elapsed_time / 1'000'000.,
+    "Mrows/s");
+  state.add_buffer_size(
+    mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
+}
+
+NVBENCH_BENCH_TYPES(bench_groupby_max_multithreaded,
+                    NVBENCH_TYPE_AXES(nvbench::type_list<int32_t, int64_t, float, double>))
+  .set_name("groupby_max_multithreaded")
+  .add_int64_axis("cardinality", {0})
+  .add_int64_power_of_two_axis("num_rows", {12, 18})
+  .add_float64_axis("null_probability", {0, 0.1, 0.9})
+  .add_int64_axis("num_aggregations", {1})
+  .add_int64_axis("num_threads", {1, 2, 4, 8});

From 7b8169ab1b042b790622c0178bc41e3045a99305 Mon Sep 17 00:00:00 2001
From: nvdbaranec <56695930+nvdbaranec@users.noreply.github.com>
Date: Wed, 10 Jul 2024 10:57:41 -0500
Subject: [PATCH 488/842] Disable dict support for split-page kernel in the
 parquet reader. (#16128)

Dictionary support for this particular flavor of kernel was being compiled in. Harmless, but caused an unneeded increase in shared memory usage.  This PR disables it.

Authors:
  - https://github.com/nvdbaranec

Approvers:
  - Ed Seidl (https://github.com/etseidl)
  - Paul Mattione (https://github.com/pmattione-nvidia)
  - MithunR (https://github.com/mythrocks)

URL: https://github.com/rapidsai/cudf/pull/16128
---
 cpp/src/io/parquet/decode_fixed.cu | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/cpp/src/io/parquet/decode_fixed.cu b/cpp/src/io/parquet/decode_fixed.cu
index ea80ae73c2f..8a866141c4b 100644
--- a/cpp/src/io/parquet/decode_fixed.cu
+++ b/cpp/src/io/parquet/decode_fixed.cu
@@ -792,7 +792,7 @@ DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span<PageInfo> pages,
       gpuDecodePageDataGeneric<uint8_t,
                                decode_block_size,
                                decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED,
-                               true,
+                               false,
                                true,
                                decode_fixed_width_split_values_func>
         <<<dim_grid, dim_block, 0, stream.value()>>>(
@@ -801,7 +801,7 @@ DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span<PageInfo> pages,
       gpuDecodePageDataGeneric<uint8_t,
                                decode_block_size,
                                decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT,
-                               true,
+                               false,
                                false,
                                decode_fixed_width_split_values_func>
         <<<dim_grid, dim_block, 0, stream.value()>>>(
@@ -812,7 +812,7 @@ DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span<PageInfo> pages,
       gpuDecodePageDataGeneric<uint16_t,
                                decode_block_size,
                                decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_NESTED,
-                               true,
+                               false,
                                true,
                                decode_fixed_width_split_values_func>
         <<<dim_grid, dim_block, 0, stream.value()>>>(
@@ -821,7 +821,7 @@ DecodeSplitPageFixedWidthData(cudf::detail::hostdevice_span<PageInfo> pages,
       gpuDecodePageDataGeneric<uint16_t,
                                decode_block_size,
                                decode_kernel_mask::BYTE_STREAM_SPLIT_FIXED_WIDTH_FLAT,
-                               true,
+                               false,
                                false,
                                decode_fixed_width_split_values_func>
         <<<dim_grid, dim_block, 0, stream.value()>>>(

From 11a5174a4639f980e9502b1eccc39ee3b4587d11 Mon Sep 17 00:00:00 2001
From: Robert Maynard <robertjmaynard@gmail.com>
Date: Wed, 10 Jul 2024 14:10:24 -0400
Subject: [PATCH 489/842] Promote IO support queries to cudf API (#16125)

Promote the ability to query the status of cufile and nvcomp support to the public API. It seems like these kind of questions would want to be asked by external users.

Authors:
  - Robert Maynard (https://github.com/robertmaynard)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Muhammad Haseeb (https://github.com/mhaseeb123)

URL: https://github.com/rapidsai/cudf/pull/16125
---
 cpp/include/cudf/io/config_utils.hpp          |  53 +++++++++
 cpp/include/cudf/io/nvcomp_adapter.hpp        | 106 ++++++++++++++++++
 cpp/include/cudf/utilities/logger.hpp         |   8 +-
 cpp/src/io/comp/nvcomp_adapter.cpp            |   8 +-
 cpp/src/io/comp/nvcomp_adapter.hpp            |  67 +----------
 cpp/src/io/orc/reader_impl_decode.cu          |   2 +-
 cpp/src/io/orc/stripe_enc.cu                  |   2 +-
 cpp/src/io/orc/writer_impl.cu                 |   1 +
 cpp/src/io/parquet/reader_impl_chunking.cu    |   6 +-
 cpp/src/io/parquet/writer_impl.cu             |   2 +-
 cpp/src/io/text/bgzip_data_chunk_source.cu    |   2 +-
 cpp/src/io/utilities/config_utils.cpp         |   8 +-
 cpp/src/io/utilities/data_sink.cpp            |   5 +-
 cpp/src/io/utilities/datasource.cpp           |   7 +-
 cpp/src/io/utilities/file_io_utilities.cpp    |   4 +-
 .../{config_utils.hpp => getenv_or.hpp}       |  42 +------
 16 files changed, 199 insertions(+), 124 deletions(-)
 create mode 100644 cpp/include/cudf/io/config_utils.hpp
 create mode 100644 cpp/include/cudf/io/nvcomp_adapter.hpp
 rename cpp/src/io/utilities/{config_utils.hpp => getenv_or.hpp} (63%)

diff --git a/cpp/include/cudf/io/config_utils.hpp b/cpp/include/cudf/io/config_utils.hpp
new file mode 100644
index 00000000000..1827ba0e3e6
--- /dev/null
+++ b/cpp/include/cudf/io/config_utils.hpp
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cudf/utilities/export.hpp>
+
+namespace CUDF_EXPORT cudf {
+namespace io::cufile_integration {
+
+/**
+ * @brief Returns true if cuFile and its compatibility mode are enabled.
+ */
+bool is_always_enabled();
+
+/**
+ * @brief Returns true if only direct IO through cuFile is enabled (compatibility mode is disabled).
+ */
+bool is_gds_enabled();
+
+/**
+ * @brief Returns true if KvikIO is enabled.
+ */
+bool is_kvikio_enabled();
+
+}  // namespace io::cufile_integration
+
+namespace io::nvcomp_integration {
+
+/**
+ * @brief Returns true if all nvCOMP uses are enabled.
+ */
+bool is_all_enabled();
+
+/**
+ * @brief Returns true if stable nvCOMP use is enabled.
+ */
+bool is_stable_enabled();
+
+}  // namespace io::nvcomp_integration
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/io/nvcomp_adapter.hpp b/cpp/include/cudf/io/nvcomp_adapter.hpp
new file mode 100644
index 00000000000..f3260d0cb53
--- /dev/null
+++ b/cpp/include/cudf/io/nvcomp_adapter.hpp
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/utilities/export.hpp>
+
+#include <optional>
+#include <string>
+
+namespace CUDF_EXPORT cudf {
+namespace io::nvcomp {
+
+enum class compression_type { SNAPPY, ZSTD, DEFLATE, LZ4 };
+
+/**
+ * @brief Set of parameters that impact whether nvCOMP features are enabled.
+ *
+ */
+struct feature_status_parameters {
+  int lib_major_version;                 ///< major version
+  int lib_minor_version;                 ///< minor version
+  int lib_patch_version;                 ///< patch version
+  bool are_all_integrations_enabled;     ///< all integrations
+  bool are_stable_integrations_enabled;  ///< stable integrations
+  int compute_capability_major;          ///< cuda compute major version
+
+  /**
+   * @brief Default Constructor
+   */
+  feature_status_parameters();
+
+  /**
+   * @brief feature_status_parameters Constructor
+   *
+   * @param major positive integer representing major value of nvcomp
+   * @param minor positive integer representing minor value of nvcomp
+   * @param patch positive integer representing patch value of nvcomp
+   * @param all_enabled if all integrations are enabled
+   * @param stable_enabled if stable integrations are enabled
+   * @param cc_major CUDA compute capability
+   */
+  feature_status_parameters(
+    int major, int minor, int patch, bool all_enabled, bool stable_enabled, int cc_major)
+    : lib_major_version{major},
+      lib_minor_version{minor},
+      lib_patch_version{patch},
+      are_all_integrations_enabled{all_enabled},
+      are_stable_integrations_enabled{stable_enabled},
+      compute_capability_major{cc_major}
+  {
+  }
+};
+
+/**
+ * @brief Equality operator overload. Required to use `feature_status_parameters` as a map key.
+ */
+inline bool operator==(feature_status_parameters const& lhs, feature_status_parameters const& rhs)
+{
+  return lhs.lib_major_version == rhs.lib_major_version and
+         lhs.lib_minor_version == rhs.lib_minor_version and
+         lhs.lib_patch_version == rhs.lib_patch_version and
+         lhs.are_all_integrations_enabled == rhs.are_all_integrations_enabled and
+         lhs.are_stable_integrations_enabled == rhs.are_stable_integrations_enabled and
+         lhs.compute_capability_major == rhs.compute_capability_major;
+}
+
+/**
+ * @brief If a compression type is disabled through nvCOMP, returns the reason as a string.
+ *
+ * Result depends on nvCOMP version and environment variables.
+ *
+ * @param compression Compression type
+ * @param params Optional parameters to query status with different configurations
+ * @returns Reason for the feature disablement, `std::nullopt` if the feature is enabled
+ */
+[[nodiscard]] std::optional<std::string> is_compression_disabled(
+  compression_type compression, feature_status_parameters params = feature_status_parameters());
+
+/**
+ * @brief If a decompression type is disabled through nvCOMP, returns the reason as a string.
+ *
+ * Result depends on nvCOMP version and environment variables.
+ *
+ * @param compression Compression type
+ * @param params Optional parameters to query status with different configurations
+ * @returns Reason for the feature disablement, `std::nullopt` if the feature is enabled
+ */
+[[nodiscard]] std::optional<std::string> is_decompression_disabled(
+  compression_type compression, feature_status_parameters params = feature_status_parameters());
+
+}  // namespace io::nvcomp
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/utilities/logger.hpp b/cpp/include/cudf/utilities/logger.hpp
index a39df064f44..45d5d1b12e1 100644
--- a/cpp/include/cudf/utilities/logger.hpp
+++ b/cpp/include/cudf/utilities/logger.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,9 +16,11 @@
 
 #pragma once
 
+#include <cudf/utilities/export.hpp>
+
 #include <spdlog/spdlog.h>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 
 /**
  * @brief Returns the global logger.
@@ -43,4 +45,4 @@ namespace cudf {
  */
 spdlog::logger& logger();
 
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp
index f8920bf82c2..0e34c96debd 100644
--- a/cpp/src/io/comp/nvcomp_adapter.cpp
+++ b/cpp/src/io/comp/nvcomp_adapter.cpp
@@ -13,11 +13,13 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
 #include "nvcomp_adapter.hpp"
 
-#include "io/utilities/config_utils.hpp"
 #include "nvcomp_adapter.cuh"
 
+#include <cudf/detail/utilities/logger.hpp>
+#include <cudf/io/config_utils.hpp>
 #include <cudf/utilities/error.hpp>
 
 #include <nvcomp/lz4.h>
@@ -472,8 +474,8 @@ feature_status_parameters::feature_status_parameters()
   : lib_major_version{NVCOMP_MAJOR_VERSION},
     lib_minor_version{NVCOMP_MINOR_VERSION},
     lib_patch_version{NVCOMP_PATCH_VERSION},
-    are_all_integrations_enabled{detail::nvcomp_integration::is_all_enabled()},
-    are_stable_integrations_enabled{detail::nvcomp_integration::is_stable_enabled()}
+    are_all_integrations_enabled{nvcomp_integration::is_all_enabled()},
+    are_stable_integrations_enabled{nvcomp_integration::is_stable_enabled()}
 {
   int device;
   CUDF_CUDA_TRY(cudaGetDevice(&device));
diff --git a/cpp/src/io/comp/nvcomp_adapter.hpp b/cpp/src/io/comp/nvcomp_adapter.hpp
index 1a680a050fd..43c79e32375 100644
--- a/cpp/src/io/comp/nvcomp_adapter.hpp
+++ b/cpp/src/io/comp/nvcomp_adapter.hpp
@@ -17,8 +17,9 @@
 #pragma once
 
 #include "gpuinflate.hpp"
-#include "io/utilities/config_utils.hpp"
 
+#include <cudf/io/config_utils.hpp>
+#include <cudf/io/nvcomp_adapter.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/span.hpp>
 
@@ -27,70 +28,6 @@
 #include <optional>
 
 namespace cudf::io::nvcomp {
-
-enum class compression_type { SNAPPY, ZSTD, DEFLATE, LZ4 };
-
-/**
- * @brief Set of parameters that impact whether the use nvCOMP features is enabled.
- */
-struct feature_status_parameters {
-  int lib_major_version;
-  int lib_minor_version;
-  int lib_patch_version;
-  bool are_all_integrations_enabled;
-  bool are_stable_integrations_enabled;
-  int compute_capability_major;
-
-  feature_status_parameters();
-  feature_status_parameters(
-    int major, int minor, int patch, bool all_enabled, bool stable_enabled, int cc_major)
-    : lib_major_version{major},
-      lib_minor_version{minor},
-      lib_patch_version{patch},
-      are_all_integrations_enabled{all_enabled},
-      are_stable_integrations_enabled{stable_enabled},
-      compute_capability_major{cc_major}
-  {
-  }
-};
-
-/**
- * @brief Equality operator overload. Required to use `feature_status_parameters` as a map key.
- */
-inline bool operator==(feature_status_parameters const& lhs, feature_status_parameters const& rhs)
-{
-  return lhs.lib_major_version == rhs.lib_major_version and
-         lhs.lib_minor_version == rhs.lib_minor_version and
-         lhs.lib_patch_version == rhs.lib_patch_version and
-         lhs.are_all_integrations_enabled == rhs.are_all_integrations_enabled and
-         lhs.are_stable_integrations_enabled == rhs.are_stable_integrations_enabled and
-         lhs.compute_capability_major == rhs.compute_capability_major;
-}
-
-/**
- * @brief If a compression type is disabled through nvCOMP, returns the reason as a string.
- *
- * Result cab depend on nvCOMP version and environment variables.
- *
- * @param compression Compression type
- * @param params Optional parameters to query status with different configurations
- * @returns Reason for the feature disablement, `std::nullopt` if the feature is enabled
- */
-[[nodiscard]] std::optional<std::string> is_compression_disabled(
-  compression_type compression, feature_status_parameters params = feature_status_parameters());
-
-/**
- * @brief If a decompression type is disabled through nvCOMP, returns the reason as a string.
- *
- * Result can depend on nvCOMP version and environment variables.
- *
- * @param compression Compression type
- * @param params Optional parameters to query status with different configurations
- * @returns Reason for the feature disablement, `std::nullopt` if the feature is enabled
- */
-[[nodiscard]] std::optional<std::string> is_decompression_disabled(
-  compression_type compression, feature_status_parameters params = feature_status_parameters());
-
 /**
  * @brief Device batch decompression of given type.
  *
diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu
index 72eb41b1360..8e20505d3ff 100644
--- a/cpp/src/io/orc/reader_impl_decode.cu
+++ b/cpp/src/io/orc/reader_impl_decode.cu
@@ -19,13 +19,13 @@
 #include "io/orc/reader_impl.hpp"
 #include "io/orc/reader_impl_chunking.hpp"
 #include "io/orc/reader_impl_helpers.hpp"
-#include "io/utilities/config_utils.hpp"
 #include "io/utilities/hostdevice_span.hpp"
 
 #include <cudf/detail/copy.hpp>
 #include <cudf/detail/transform.hpp>
 #include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
+#include <cudf/io/config_utils.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/utilities/error.hpp>
 
diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu
index b6fc4e3510f..805959327ac 100644
--- a/cpp/src/io/orc/stripe_enc.cu
+++ b/cpp/src/io/orc/stripe_enc.cu
@@ -16,12 +16,12 @@
 
 #include "io/comp/nvcomp_adapter.hpp"
 #include "io/utilities/block_utils.cuh"
-#include "io/utilities/config_utils.hpp"
 #include "io/utilities/time_utils.cuh"
 #include "orc_gpu.hpp"
 
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/detail/utilities/integer_utils.hpp>
+#include <cudf/detail/utilities/logger.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/io/orc_types.hpp>
 #include <cudf/lists/lists_column_view.hpp>
diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu
index e9e031a407a..4cb20bb7518 100644
--- a/cpp/src/io/orc/writer_impl.cu
+++ b/cpp/src/io/orc/writer_impl.cu
@@ -27,6 +27,7 @@
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/utilities/cuda.cuh>
+#include <cudf/detail/utilities/logger.hpp>
 #include <cudf/detail/utilities/stream_pool.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/strings/strings_column_view.hpp>
diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu
index d371ef5de93..3da303e6928 100644
--- a/cpp/src/io/parquet/reader_impl_chunking.cu
+++ b/cpp/src/io/parquet/reader_impl_chunking.cu
@@ -16,7 +16,6 @@
 
 #include "compact_protocol_reader.hpp"
 #include "io/comp/nvcomp_adapter.hpp"
-#include "io/utilities/config_utils.hpp"
 #include "io/utilities/time_utils.cuh"
 #include "reader_impl.hpp"
 #include "reader_impl_chunking.hpp"
@@ -25,6 +24,7 @@
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
+#include <cudf/io/config_utils.hpp>
 
 #include <rmm/exec_policy.hpp>
 
@@ -862,7 +862,7 @@ std::vector<row_range> compute_page_splits_by_row(device_span<cumulative_page_in
         gpuinflate(d_comp_in, d_comp_out, d_comp_res_view, gzip_header_included::YES, stream);
         break;
       case SNAPPY:
-        if (cudf::io::detail::nvcomp_integration::is_stable_enabled()) {
+        if (cudf::io::nvcomp_integration::is_stable_enabled()) {
           nvcomp::batched_decompress(nvcomp::compression_type::SNAPPY,
                                      d_comp_in,
                                      d_comp_out,
@@ -1071,7 +1071,7 @@ struct get_decomp_scratch {
       case BROTLI: return get_gpu_debrotli_scratch_size(di.num_pages);
 
       case SNAPPY:
-        if (cudf::io::detail::nvcomp_integration::is_stable_enabled()) {
+        if (cudf::io::nvcomp_integration::is_stable_enabled()) {
           return cudf::io::nvcomp::batched_decompress_temp_size(
             cudf::io::nvcomp::compression_type::SNAPPY,
             di.num_pages,
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index 66b4fce16fe..8413e716224 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -27,7 +27,6 @@
 #include "io/parquet/parquet_gpu.hpp"
 #include "io/statistics/column_statistics.cuh"
 #include "io/utilities/column_utils.cuh"
-#include "io/utilities/config_utils.hpp"
 #include "parquet_common.hpp"
 #include "parquet_gpu.cuh"
 #include "writer_impl.hpp"
@@ -38,6 +37,7 @@
 #include <cudf/detail/get_value.cuh>
 #include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/detail/utilities/linked_column.hpp>
+#include <cudf/detail/utilities/logger.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/lists/detail/dremel.hpp>
 #include <cudf/lists/lists_column_view.hpp>
diff --git a/cpp/src/io/text/bgzip_data_chunk_source.cu b/cpp/src/io/text/bgzip_data_chunk_source.cu
index 0e3ce779089..badcd3f58f9 100644
--- a/cpp/src/io/text/bgzip_data_chunk_source.cu
+++ b/cpp/src/io/text/bgzip_data_chunk_source.cu
@@ -16,12 +16,12 @@
 
 #include "io/comp/nvcomp_adapter.hpp"
 #include "io/text/device_data_chunks.hpp"
-#include "io/utilities/config_utils.hpp"
 
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/utilities/host_vector.hpp>
 #include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
+#include <cudf/io/config_utils.hpp>
 #include <cudf/io/text/data_chunk_source_factories.hpp>
 #include <cudf/io/text/detail/bgzip_utils.hpp>
 #include <cudf/utilities/default_stream.hpp>
diff --git a/cpp/src/io/utilities/config_utils.cpp b/cpp/src/io/utilities/config_utils.cpp
index 20ac89b4d53..a3afbd52896 100644
--- a/cpp/src/io/utilities/config_utils.cpp
+++ b/cpp/src/io/utilities/config_utils.cpp
@@ -14,14 +14,16 @@
  * limitations under the License.
  */
 
-#include "config_utils.hpp"
+#include "getenv_or.hpp"
 
+#include <cudf/detail/utilities/logger.hpp>
 #include <cudf/utilities/error.hpp>
 
 #include <cstdlib>
+#include <sstream>
 #include <string>
 
-namespace cudf::io::detail {
+namespace cudf::io {
 
 namespace cufile_integration {
 
@@ -80,4 +82,4 @@ bool is_stable_enabled() { return is_all_enabled() or get_env_policy() == usage_
 
 }  // namespace nvcomp_integration
 
-}  // namespace cudf::io::detail
+}  // namespace cudf::io
diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp
index a6cbbcd84a6..1dbb9369115 100644
--- a/cpp/src/io/utilities/data_sink.cpp
+++ b/cpp/src/io/utilities/data_sink.cpp
@@ -15,8 +15,9 @@
  */
 
 #include "file_io_utilities.hpp"
-#include "io/utilities/config_utils.hpp"
 
+#include <cudf/detail/utilities/logger.hpp>
+#include <cudf/io/config_utils.hpp>
 #include <cudf/io/data_sink.hpp>
 #include <cudf/utilities/error.hpp>
 
@@ -40,7 +41,7 @@ class file_sink : public data_sink {
     _output_stream.open(filepath, std::ios::out | std::ios::binary | std::ios::trunc);
     if (!_output_stream.is_open()) { detail::throw_on_file_open_failure(filepath, true); }
 
-    if (detail::cufile_integration::is_kvikio_enabled()) {
+    if (cufile_integration::is_kvikio_enabled()) {
       _kvikio_file = kvikio::FileHandle(filepath, "w");
       CUDF_LOG_INFO("Writing a file using kvikIO, with compatibility mode {}.",
                     _kvikio_file.is_compat_mode_on() ? "on" : "off");
diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp
index ca8932322bf..c8a438fc40b 100644
--- a/cpp/src/io/utilities/datasource.cpp
+++ b/cpp/src/io/utilities/datasource.cpp
@@ -15,9 +15,10 @@
  */
 
 #include "file_io_utilities.hpp"
-#include "io/utilities/config_utils.hpp"
 
+#include <cudf/detail/utilities/logger.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
+#include <cudf/io/config_utils.hpp>
 #include <cudf/io/datasource.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/span.hpp>
@@ -44,7 +45,7 @@ class file_source : public datasource {
   explicit file_source(char const* filepath) : _file(filepath, O_RDONLY)
   {
     detail::force_init_cuda_context();
-    if (detail::cufile_integration::is_kvikio_enabled()) {
+    if (cufile_integration::is_kvikio_enabled()) {
       _kvikio_file = kvikio::FileHandle(filepath);
       CUDF_LOG_INFO("Reading a file using kvikIO, with compatibility mode {}.",
                     _kvikio_file.is_compat_mode_on() ? "on" : "off");
@@ -433,7 +434,7 @@ std::unique_ptr<datasource> datasource::create(std::string const& filepath,
                                                size_t size)
 {
 #ifdef CUFILE_FOUND
-  if (detail::cufile_integration::is_always_enabled()) {
+  if (cufile_integration::is_always_enabled()) {
     // avoid mmap as GDS is expected to be used for most reads
     return std::make_unique<direct_read_source>(filepath.c_str());
   }
diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp
index a9d4f19c848..9fe5959436d 100644
--- a/cpp/src/io/utilities/file_io_utilities.cpp
+++ b/cpp/src/io/utilities/file_io_utilities.cpp
@@ -16,9 +16,11 @@
 
 #include "file_io_utilities.hpp"
 
-#include "io/utilities/config_utils.hpp"
+#include "getenv_or.hpp"
 
 #include <cudf/detail/utilities/integer_utils.hpp>
+#include <cudf/detail/utilities/logger.hpp>
+#include <cudf/io/config_utils.hpp>
 
 #include <rmm/device_buffer.hpp>
 
diff --git a/cpp/src/io/utilities/config_utils.hpp b/cpp/src/io/utilities/getenv_or.hpp
similarity index 63%
rename from cpp/src/io/utilities/config_utils.hpp
rename to cpp/src/io/utilities/getenv_or.hpp
index 74df1375e6f..3fd97a00b61 100644
--- a/cpp/src/io/utilities/config_utils.hpp
+++ b/cpp/src/io/utilities/getenv_or.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -13,15 +13,16 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
 #pragma once
 
 #include <cudf/detail/utilities/logger.hpp>
 
+#include <cstdlib>
 #include <sstream>
 #include <string>
 
-namespace cudf::io::detail {
-
+namespace {
 /**
  * @brief Returns the value of the environment variable, or a default value if the variable is not
  * present.
@@ -45,37 +46,4 @@ T getenv_or(std::string_view env_var_name, T default_val)
   return converted_val;
 }
 
-namespace cufile_integration {
-
-/**
- * @brief Returns true if cuFile and its compatibility mode are enabled.
- */
-bool is_always_enabled();
-
-/**
- * @brief Returns true if only direct IO through cuFile is enabled (compatibility mode is disabled).
- */
-bool is_gds_enabled();
-
-/**
- * @brief Returns true if KvikIO is enabled.
- */
-bool is_kvikio_enabled();
-
-}  // namespace cufile_integration
-
-namespace nvcomp_integration {
-
-/**
- * @brief Returns true if all nvCOMP uses are enabled.
- */
-bool is_all_enabled();
-
-/**
- * @brief Returns true if stable nvCOMP use is enabled.
- */
-bool is_stable_enabled();
-
-}  // namespace nvcomp_integration
-
-}  // namespace cudf::io::detail
+}  // namespace

From 261f911958ee9ad76109953dfc920c08da4c6fe6 Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Wed, 10 Jul 2024 17:02:44 -0400
Subject: [PATCH 490/842] Migrate lists/extract to pylibcudf (#16071)

Apart of #15162

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Thomas Li (https://github.com/lithomas1)

URL: https://github.com/rapidsai/cudf/pull/16071
---
 python/cudf/cudf/_lib/lists.pyx               | 36 ++++++-------------
 .../_lib/pylibcudf/libcudf/lists/extract.pxd  |  6 ++--
 python/cudf/cudf/_lib/pylibcudf/lists.pxd     |  6 ++++
 python/cudf/cudf/_lib/pylibcudf/lists.pyx     | 31 +++++++++++++++-
 .../cudf/cudf/pylibcudf_tests/test_lists.py   | 21 +++++++++++
 5 files changed, 70 insertions(+), 30 deletions(-)

diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx
index 0ad09dba717..ceae1b148aa 100644
--- a/python/cudf/cudf/_lib/lists.pyx
+++ b/python/cudf/cudf/_lib/lists.pyx
@@ -8,11 +8,9 @@ from libcpp.utility cimport move
 
 from cudf._lib.column cimport Column
 from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
 from cudf._lib.pylibcudf.libcudf.lists.count_elements cimport (
     count_elements as cpp_count_elements,
 )
-from cudf._lib.pylibcudf.libcudf.lists.extract cimport extract_list_element
 from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport (
     lists_column_view,
 )
@@ -116,37 +114,23 @@ def sort_lists(Column col, bool ascending, str na_position):
 
 @acquire_spill_lock()
 def extract_element_scalar(Column col, size_type index):
-    # shared_ptr required because lists_column_view has no default
-    # ctor
-    cdef shared_ptr[lists_column_view] list_view = (
-        make_shared[lists_column_view](col.view())
+    return Column.from_pylibcudf(
+        pylibcudf.lists.extract_list_element(
+            col.to_pylibcudf(mode="read"),
+            index,
+        )
     )
 
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(extract_list_element(list_view.get()[0], index))
-
-    result = Column.from_unique_ptr(move(c_result))
-    return result
-
 
 @acquire_spill_lock()
 def extract_element_column(Column col, Column index):
-    cdef shared_ptr[lists_column_view] list_view = (
-        make_shared[lists_column_view](col.view())
+    return Column.from_pylibcudf(
+        pylibcudf.lists.extract_list_element(
+            col.to_pylibcudf(mode="read"),
+            index.to_pylibcudf(mode="read"),
+        )
     )
 
-    cdef column_view index_view = index.view()
-
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(extract_list_element(list_view.get()[0], index_view))
-
-    result = Column.from_unique_ptr(move(c_result))
-    return result
-
 
 @acquire_spill_lock()
 def contains_scalar(Column col, py_search_key):
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/extract.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/extract.pxd
index caa12f41914..53609ba8830 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/extract.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/extract.pxd
@@ -11,10 +11,10 @@ from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 cdef extern from "cudf/lists/extract.hpp" namespace "cudf::lists" nogil:
     cdef unique_ptr[column] extract_list_element(
-        const lists_column_view,
+        const lists_column_view&,
         size_type
     ) except +
     cdef unique_ptr[column] extract_list_element(
-        const lists_column_view,
-        column_view
+        const lists_column_view&,
+        const column_view&
     ) except +
diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pxd b/python/cudf/cudf/_lib/pylibcudf/lists.pxd
index c9c43751a43..38a479e4791 100644
--- a/python/cudf/cudf/_lib/pylibcudf/lists.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/lists.pxd
@@ -12,6 +12,10 @@ ctypedef fused ColumnOrScalar:
     Column
     Scalar
 
+ctypedef fused ColumnOrSizeType:
+    Column
+    size_type
+
 cpdef Table explode_outer(Table, size_type explode_column_idx)
 
 cpdef Column concatenate_rows(Table)
@@ -27,3 +31,5 @@ cpdef Column index_of(Column, ColumnOrScalar, bool)
 cpdef Column reverse(Column)
 
 cpdef Column segmented_gather(Column, Column)
+
+cpdef Column extract_list_element(Column, ColumnOrSizeType)
diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pyx b/python/cudf/cudf/_lib/pylibcudf/lists.pyx
index 9c56f1139c6..19c961aa014 100644
--- a/python/cudf/cudf/_lib/pylibcudf/lists.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/lists.pyx
@@ -17,9 +17,12 @@ from cudf._lib.pylibcudf.libcudf.lists.combine cimport (
     concatenate_null_policy,
     concatenate_rows as cpp_concatenate_rows,
 )
+from cudf._lib.pylibcudf.libcudf.lists.extract cimport (
+    extract_list_element as cpp_extract_list_element,
+)
 from cudf._lib.pylibcudf.libcudf.table.table cimport table
 from cudf._lib.pylibcudf.libcudf.types cimport size_type
-from cudf._lib.pylibcudf.lists cimport ColumnOrScalar
+from cudf._lib.pylibcudf.lists cimport ColumnOrScalar, ColumnOrSizeType
 
 from .column cimport Column, ListColumnView
 from .scalar cimport Scalar
@@ -264,3 +267,29 @@ cpdef Column segmented_gather(Column input, Column gather_map_list):
             list_view2.view(),
         ))
     return Column.from_libcudf(move(c_result))
+
+
+cpdef Column extract_list_element(Column input, ColumnOrSizeType index):
+    """Create a column of extracted list elements.
+
+    Parameters
+    ----------
+    input : Column
+        The input column.
+    index : Union[Column, size_type]
+        The selection index or indices.
+
+    Returns
+    -------
+    Column
+        A new Column with elements extracted.
+    """
+    cdef unique_ptr[column] c_result
+    cdef ListColumnView list_view = input.list_view()
+
+    with nogil:
+        c_result = move(cpp_extract_list_element(
+            list_view.view(),
+            index.view() if ColumnOrSizeType is Column else index,
+        ))
+    return Column.from_libcudf(move(c_result))
diff --git a/python/cudf/cudf/pylibcudf_tests/test_lists.py b/python/cudf/cudf/pylibcudf_tests/test_lists.py
index 0d95579acb3..07ecaed5012 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_lists.py
+++ b/python/cudf/cudf/pylibcudf_tests/test_lists.py
@@ -160,3 +160,24 @@ def test_segmented_gather(test_data):
     expect = pa.array([[8, 9], [14], [0], [0, 0]])
 
     assert_column_eq(expect, res)
+
+
+def test_extract_list_element_scalar(test_data):
+    arr = pa.array(test_data[0][0])
+    plc_column = plc.interop.from_arrow(arr)
+
+    res = plc.lists.extract_list_element(plc_column, 0)
+    expect = pa.compute.list_element(test_data[0][0], 0)
+
+    assert_column_eq(expect, res)
+
+
+def test_extract_list_element_column(test_data):
+    arr = pa.array(test_data[0][0])
+    plc_column = plc.interop.from_arrow(arr)
+    indices = plc.interop.from_arrow(pa.array([0, 1, -4, -1]))
+
+    res = plc.lists.extract_list_element(plc_column, indices)
+    expect = pa.array([0, None, None, 7])
+
+    assert_column_eq(expect, res)

From 64e3e8d4259eff85a4d2708333b0cfb43a3e79e3 Mon Sep 17 00:00:00 2001
From: Jayjeet Chakraborty <jc.github@rediffmail.com>
Date: Wed, 10 Jul 2024 14:23:00 -0700
Subject: [PATCH 491/842] Remove `mr` param from `write_csv` and `write_json`
 (#16231)

Fixes #16200

Authors:
  - Jayjeet Chakraborty (https://github.com/JayjeetAtGithub)
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16231
---
 cpp/include/cudf/io/csv.hpp         |  4 +-
 cpp/include/cudf/io/detail/csv.hpp  |  4 +-
 cpp/include/cudf/io/detail/json.hpp |  4 +-
 cpp/include/cudf/io/json.hpp        |  4 +-
 cpp/src/io/csv/writer_impl.cu       |  6 +--
 cpp/src/io/functions.cpp            | 14 ++-----
 cpp/src/io/json/write_json.cu       |  8 ++--
 cpp/tests/io/json_test.cpp          |  4 +-
 cpp/tests/io/json_writer.cpp        | 64 +++++++++--------------------
 9 files changed, 34 insertions(+), 78 deletions(-)

diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp
index 68bb7fba00e..cc361f0918e 100644
--- a/cpp/include/cudf/io/csv.hpp
+++ b/cpp/include/cudf/io/csv.hpp
@@ -1756,11 +1756,9 @@ class csv_writer_options_builder {
  *
  * @param options Settings for controlling writing behavior
  * @param stream CUDA stream used for device memory operations and kernel launches
- * @param mr Device memory resource to use for device memory allocation
  */
 void write_csv(csv_writer_options const& options,
-               rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-               rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+               rmm::cuda_stream_view stream = cudf::get_default_stream());
 
 /** @} */  // end of group
 }  // namespace io
diff --git a/cpp/include/cudf/io/detail/csv.hpp b/cpp/include/cudf/io/detail/csv.hpp
index 50c1a7c163d..2a70fa888f4 100644
--- a/cpp/include/cudf/io/detail/csv.hpp
+++ b/cpp/include/cudf/io/detail/csv.hpp
@@ -49,14 +49,12 @@ table_with_metadata read_csv(std::unique_ptr<cudf::io::datasource>&& source,
  * @param column_names Column names for the output CSV
  * @param options Settings for controlling behavior
  * @param stream CUDA stream used for device memory operations and kernel launches.
- * @param mr Device memory resource to use for device memory allocation
  */
 void write_csv(data_sink* sink,
                table_view const& table,
                host_span<std::string const> column_names,
                csv_writer_options const& options,
-               rmm::cuda_stream_view stream,
-               rmm::device_async_resource_ref mr);
+               rmm::cuda_stream_view stream);
 
 }  // namespace csv
 }  // namespace detail
diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp
index 540a584908d..6ff1c12831b 100644
--- a/cpp/include/cudf/io/detail/json.hpp
+++ b/cpp/include/cudf/io/detail/json.hpp
@@ -46,13 +46,11 @@ table_with_metadata read_json(host_span<std::unique_ptr<datasource>> sources,
  * @param table The set of columns
  * @param options Settings for controlling behavior
  * @param stream CUDA stream used for device memory operations and kernel launches.
- * @param mr Device memory resource to use for device memory allocation
  */
 void write_json(data_sink* sink,
                 table_view const& table,
                 json_writer_options const& options,
-                rmm::cuda_stream_view stream,
-                rmm::device_async_resource_ref mr);
+                rmm::cuda_stream_view stream);
 
 /**
  * @brief Normalize single quotes to double quotes using FST
diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
index 8de690482f9..7af90766ad0 100644
--- a/cpp/include/cudf/io/json.hpp
+++ b/cpp/include/cudf/io/json.hpp
@@ -1018,11 +1018,9 @@ class json_writer_options_builder {
  *
  * @param options Settings for controlling writing behavior
  * @param stream CUDA stream used for device memory operations and kernel launches
- * @param mr Device memory resource to use for device memory allocation
  */
 void write_json(json_writer_options const& options,
-                rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-                rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+                rmm::cuda_stream_view stream = cudf::get_default_stream());
 
 /** @} */  // end of group
 }  // namespace io
diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu
index 63eb0b03c5f..00a6dcb2286 100644
--- a/cpp/src/io/csv/writer_impl.cu
+++ b/cpp/src/io/csv/writer_impl.cu
@@ -430,13 +430,13 @@ void write_csv(data_sink* out_sink,
                table_view const& table,
                host_span<std::string const> user_column_names,
                csv_writer_options const& options,
-               rmm::cuda_stream_view stream,
-               rmm::device_async_resource_ref mr)
+               rmm::cuda_stream_view stream)
 {
   // write header: column names separated by delimiter:
   // (even for tables with no rows)
   //
-  write_chunked_begin(out_sink, table, user_column_names, options, stream, mr);
+  write_chunked_begin(
+    out_sink, table, user_column_names, options, stream, rmm::mr::get_current_device_resource());
 
   if (table.num_rows() > 0) {
     // no need to check same-size columns constraint; auto-enforced by table_view
diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp
index b4ece9cec66..6d2834206d4 100644
--- a/cpp/src/io/functions.cpp
+++ b/cpp/src/io/functions.cpp
@@ -215,9 +215,7 @@ table_with_metadata read_json(json_reader_options options,
   return json::detail::read_json(datasources, options, stream, mr);
 }
 
-void write_json(json_writer_options const& options,
-                rmm::cuda_stream_view stream,
-                rmm::device_async_resource_ref mr)
+void write_json(json_writer_options const& options, rmm::cuda_stream_view stream)
 {
   auto sinks = make_datasinks(options.get_sink());
   CUDF_EXPECTS(sinks.size() == 1, "Multiple sinks not supported for JSON writing");
@@ -226,8 +224,7 @@ void write_json(json_writer_options const& options,
     sinks[0].get(),
     options.get_table(),
     options,
-    stream,
-    mr);
+    stream);
 }
 
 table_with_metadata read_csv(csv_reader_options options,
@@ -252,9 +249,7 @@ table_with_metadata read_csv(csv_reader_options options,
 }
 
 // Freeform API wraps the detail writer class API
-void write_csv(csv_writer_options const& options,
-               rmm::cuda_stream_view stream,
-               rmm::device_async_resource_ref mr)
+void write_csv(csv_writer_options const& options, rmm::cuda_stream_view stream)
 {
   using namespace cudf::io::detail;
 
@@ -266,8 +261,7 @@ void write_csv(csv_writer_options const& options,
     options.get_table(),
     options.get_names(),
     options,
-    stream,
-    mr);
+    stream);
 }
 
 raw_orc_statistics read_raw_orc_statistics(source_info const& src_info,
diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu
index 997d6fd99f8..c688c809e04 100644
--- a/cpp/src/io/json/write_json.cu
+++ b/cpp/src/io/json/write_json.cu
@@ -805,8 +805,7 @@ void write_chunked(data_sink* out_sink,
                    strings_column_view const& str_column_view,
                    int const skip_last_chars,
                    json_writer_options const& options,
-                   rmm::cuda_stream_view stream,
-                   rmm::device_async_resource_ref mr)
+                   rmm::cuda_stream_view stream)
 {
   CUDF_FUNC_RANGE();
   CUDF_EXPECTS(str_column_view.size() > 0, "Unexpected empty strings column.");
@@ -829,8 +828,7 @@ void write_chunked(data_sink* out_sink,
 void write_json(data_sink* out_sink,
                 table_view const& table,
                 json_writer_options const& options,
-                rmm::cuda_stream_view stream,
-                rmm::device_async_resource_ref mr)
+                rmm::cuda_stream_view stream)
 {
   CUDF_FUNC_RANGE();
   std::vector<column_name_info> user_column_names = [&]() {
@@ -912,7 +910,7 @@ void write_json(data_sink* out_sink,
       bool const include_line_terminator =
         (&sub_view != &vector_views.back()) or options.is_enabled_lines();
       auto const skip_last_chars = (include_line_terminator ? 0 : line_terminator.size());
-      write_chunked(out_sink, str_concat_col->view(), skip_last_chars, options, stream, mr);
+      write_chunked(out_sink, str_concat_col->view(), skip_last_chars, options, stream);
     }
   } else {
     if (options.is_enabled_lines()) {
diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp
index 9c76c344157..993ab82f423 100644
--- a/cpp/tests/io/json_test.cpp
+++ b/cpp/tests/io/json_test.cpp
@@ -1400,9 +1400,7 @@ TEST_F(JsonReaderTest, JsonLongString)
                            .lines(true)
                            .na_rep("null");
 
-  cudf::io::write_json(options_builder.build(),
-                       cudf::test::get_default_stream(),
-                       rmm::mr::get_current_device_resource());
+  cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream());
 
   cudf::column_view int16_with_mask(repeat_times);
   cudf::column_view int16(
diff --git a/cpp/tests/io/json_writer.cpp b/cpp/tests/io/json_writer.cpp
index 946b939f456..2c4e29a01b9 100644
--- a/cpp/tests/io/json_writer.cpp
+++ b/cpp/tests/io/json_writer.cpp
@@ -51,16 +51,14 @@ TEST_F(JsonWriterTest, EmptyInput)
                        .build();
 
   // Empty columns in table
-  cudf::io::write_json(
-    out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource());
+  cudf::io::write_json(out_options, cudf::test::get_default_stream());
   std::string const expected = R"([])";
   EXPECT_EQ(expected, std::string(out_buffer.data(), out_buffer.size()));
 
   // Empty columns in table - JSON Lines
   out_buffer.clear();
   out_options.enable_lines(true);
-  cudf::io::write_json(
-    out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource());
+  cudf::io::write_json(out_options, cudf::test::get_default_stream());
   std::string const expected_lines = "\n";
   EXPECT_EQ(expected_lines, std::string(out_buffer.data(), out_buffer.size()));
 
@@ -68,8 +66,7 @@ TEST_F(JsonWriterTest, EmptyInput)
   cudf::table_view tbl_view2{};
   out_options.set_table(tbl_view2);
   out_buffer.clear();
-  cudf::io::write_json(
-    out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource());
+  cudf::io::write_json(out_options, cudf::test::get_default_stream());
   EXPECT_EQ(expected_lines, std::string(out_buffer.data(), out_buffer.size()));
 }
 
@@ -94,22 +91,17 @@ TEST_F(JsonWriterTest, ErrorCases)
                        .build();
 
   // not enough column names
-  EXPECT_THROW(
-    cudf::io::write_json(
-      out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()),
-    cudf::logic_error);
+  EXPECT_THROW(cudf::io::write_json(out_options, cudf::test::get_default_stream()),
+               cudf::logic_error);
 
   mt.schema_info.emplace_back("int16");
   out_options.set_metadata(mt);
-  EXPECT_NO_THROW(cudf::io::write_json(
-    out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()));
+  EXPECT_NO_THROW(cudf::io::write_json(out_options, cudf::test::get_default_stream()));
 
   // chunk_rows must be at least 8
   out_options.set_rows_per_chunk(0);
-  EXPECT_THROW(
-    cudf::io::write_json(
-      out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()),
-    cudf::logic_error);
+  EXPECT_THROW(cudf::io::write_json(out_options, cudf::test::get_default_stream()),
+               cudf::logic_error);
 }
 
 TEST_F(JsonWriterTest, PlainTable)
@@ -131,9 +123,7 @@ TEST_F(JsonWriterTest, PlainTable)
                            .lines(false)
                            .na_rep("null");
 
-  cudf::io::write_json(options_builder.build(),
-                       cudf::test::get_default_stream(),
-                       rmm::mr::get_current_device_resource());
+  cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream());
 
   std::string const expected =
     R"([{"col1":"a","col2":"d","int":1,"float":1.5,"int16":null},{"col1":"b","col2":"e","int":2,"float":2.5,"int16":2},{"col1":"c","col2":"f","int":3,"float":3.5,"int16":null}])";
@@ -163,9 +153,7 @@ TEST_F(JsonWriterTest, SimpleNested)
                            .lines(true)
                            .na_rep("null");
 
-  cudf::io::write_json(options_builder.build(),
-                       cudf::test::get_default_stream(),
-                       rmm::mr::get_current_device_resource());
+  cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream());
   std::string const expected = R"({"a":1,"b":2,"c":{"d":3},"f":5.5,"g":[1]}
 {"a":6,"b":7,"c":{"d":8},"f":10.5}
 {"a":1,"b":2,"c":{"e":4},"f":5.5,"g":[2,null]}
@@ -197,9 +185,7 @@ TEST_F(JsonWriterTest, MixedNested)
                            .lines(false)
                            .na_rep("null");
 
-  cudf::io::write_json(options_builder.build(),
-                       cudf::test::get_default_stream(),
-                       rmm::mr::get_current_device_resource());
+  cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream());
   std::string const expected =
     R"([{"a":1,"b":2,"c":{"d":[3]},"f":5.5,"g":[{"h":1}]},)"
     R"({"a":6,"b":7,"c":{"d":[8]},"f":10.5},)"
@@ -232,8 +218,7 @@ TEST_F(JsonWriterTest, WriteReadNested)
                        .na_rep("null")
                        .build();
 
-  cudf::io::write_json(
-    out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource());
+  cudf::io::write_json(out_options, cudf::test::get_default_stream());
   std::string const expected = R"({"a":1,"b":2,"c":{"d":3},"f":5.5,"g":[1]}
 {"a":6,"b":7,"c":{"d":8},"f":10.5}
 {"a":1,"b":2,"c":{"e":4},"f":5.5,"g":[2,null]}
@@ -308,8 +293,7 @@ TEST_F(JsonWriterTest, WriteReadNested)
   mt.schema_info[2].children.clear();
   out_options.set_metadata(mt);
   out_buffer.clear();
-  cudf::io::write_json(
-    out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource());
+  cudf::io::write_json(out_options, cudf::test::get_default_stream());
 
   in_options = cudf::io::json_reader_options::builder(
                  cudf::io::source_info{out_buffer.data(), out_buffer.size()})
@@ -332,8 +316,7 @@ TEST_F(JsonWriterTest, WriteReadNested)
   // without column names
   out_options.set_metadata(cudf::io::table_metadata{});
   out_buffer.clear();
-  cudf::io::write_json(
-    out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource());
+  cudf::io::write_json(out_options, cudf::test::get_default_stream());
   in_options = cudf::io::json_reader_options::builder(
                  cudf::io::source_info{out_buffer.data(), out_buffer.size()})
                  .lines(true)
@@ -371,8 +354,7 @@ TEST_F(JsonWriterTest, SpecialChars)
                        .na_rep("null")
                        .build();
 
-  cudf::io::write_json(
-    out_options, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource());
+  cudf::io::write_json(out_options, cudf::test::get_default_stream());
   std::string const expected = R"({"\"a\"":1,"'b'":"abcd"}
 {"\"a\"":6,"'b'":"b\b\f\n\r\t"}
 {"\"a\"":1,"'b'":"\"c\""}
@@ -405,9 +387,7 @@ TEST_F(JsonWriterTest, NullList)
                            .lines(true)
                            .na_rep("null");
 
-  cudf::io::write_json(options_builder.build(),
-                       cudf::test::get_default_stream(),
-                       rmm::mr::get_current_device_resource());
+  cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream());
   std::string const expected = R"({"a":[null],"b":[[1,2,3],[null],[null,null,null],[4,null,5]]}
 {"a":[2,null,null,3],"b":null}
 {"a":[null,null,4],"b":[[2,null],null]}
@@ -446,9 +426,7 @@ TEST_F(JsonWriterTest, ChunkedNested)
                            .na_rep("null")
                            .rows_per_chunk(8);
 
-  cudf::io::write_json(options_builder.build(),
-                       cudf::test::get_default_stream(),
-                       rmm::mr::get_current_device_resource());
+  cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream());
   std::string const expected =
     R"({"a":1,"b":-2,"c":{},"e":[{"f":1}]}
 {"a":2,"b":-2,"c":{}}
@@ -504,9 +482,7 @@ TEST_F(JsonWriterTest, StructAllNullCombinations)
                            .lines(true)
                            .na_rep("null");
 
-  cudf::io::write_json(options_builder.build(),
-                       cudf::test::get_default_stream(),
-                       rmm::mr::get_current_device_resource());
+  cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream());
   std::string const expected = R"({}
 {"e":1}
 {"d":1}
@@ -568,9 +544,7 @@ TEST_F(JsonWriterTest, Unicode)
                            .lines(true)
                            .na_rep("null");
 
-  cudf::io::write_json(options_builder.build(),
-                       cudf::test::get_default_stream(),
-                       rmm::mr::get_current_device_resource());
+  cudf::io::write_json(options_builder.build(), cudf::test::get_default_stream());
 
   std::string const expected =
     R"({"col1":"\"\\\/\b\f\n\r\t","col2":"C\u10ae\u226a\u31f3\u434f\u51f9\u6ca6\u738b\u8fbf\u9fb8\ua057\ubbdc\uc2a4\ud3f6\ue4fe\ufd20","int16":null}

From 3c83ce451446dfd556bd14ad8537b0189226a0e5 Mon Sep 17 00:00:00 2001
From: Paul Mattione <156858817+pmattione-nvidia@users.noreply.github.com>
Date: Wed, 10 Jul 2024 18:58:31 -0600
Subject: [PATCH 492/842] New Decimal <--> Floating conversion (#15905)

This PR contains the main algorithm for the new decimal <--> floating conversion code. This algorithm was written to address the precision issues described [here](https://github.com/rapidsai/cudf/issues/14169).

### Summary
* The new algorithm is more accurate than the previous code, but it is also far more complex.
* It can perform conversions that were not even possible in the old code due to overflow (decimal32/64/128 conversions only worked for scale factors up to 10^9/18/38, respectively). Now the entire floating-point range is convertible, including denormals.
* This new algorithm is significantly faster in some parts of the conversion phase-space, and in some parts slightly slower.

### Previous PR's
These contain the supporting parts of this work:
* [Explicit conversion PR](https://github.com/rapidsai/cudf/pull/15438)
* [Benchmarking PR](https://github.com/rapidsai/cudf/pull/15334)
* [Powers-of-10 PR](https://github.com/rapidsai/cudf/pull/15353)
* [Utilities PR](https://github.com/rapidsai/cudf/pull/15359). These utilities are updated here to support denormals.

### Algorithm Outline
We convert floating -> (integer) decimal by:
* Extract the floating-point mantissa (converted to integer) and power-of-2
* For float we use a uint64 to contain our data during the below shifting/scaling, for double uint128_t
* In this shifting integer, we alternately apply the extracted powers-of-2 (bit-shifts, until they're all used) and scale-factor powers-of-10 (multiply/divide) as needed to reach the desired scale factor.

Decimal -> floating is just the reverse operation.

### Supplemental Changes
* Testing: Add decimal128, add precise-conversion tests. Remove kludges due to inaccurate conversions. Add test for zeroes.
* Benchmarking: Enable regions of conversion phase-space for benchmarking that were not possible in the old algorithm.
* Unary: Cleanup by using CUDF_ENABLE_IF.  Call new conversion code for base-10 fixed-point.

### Performance for various conversions/input-ranges
* Note: F32/F64 is float/double

New algorithm is **FASTER** by:
* F64             --> decimal64:   60% for E8    --> E15
* F64             --> decimal128: 13% for E-8  --> E-15
* F64             --> decimal128: 22% for E8    --> E15
* F64             --> decimal128: 27% for E31  --> E38
* decimal32   --> F64:             18% for E-3   --> E4
* decimal64   --> F64:             27% for E-14 --> E-7
* decimal64   --> F64:             17% for E-3   --> E4
* decimal128 --> F64:             21% for E-14 --> E-7
* decimal128 --> F64:             11% for E-3   --> E4
* decimal128 --> F64:             13% for E31   --> E38

New algorithm is **SLOWER** by:
* F32             --> decimal32:     3% for E-3   --> E4
* F32             --> decimal64:     2% for E-14   --> E14
* F64             --> decimal32:     3% for E-3   --> E4
* decimal32   --> F32:               5% for E-3   --> E4
* decimal128 --> F64:             36% for E-37 --> E-30

Other kernels:
* The PYMOD binary-op benchmark is 7% slower.

### Performance discussion
* Many conversions have identical speed, indicating these algorithms are often fast and we are instead bottlenecked on overheads such as getting the input to the gpu in the first place.
* F64 conversions are often much faster than the old algorithm as the new algorithm completely avoids the FP64 pipeline. Other than the cast to double itself, all of the operations are on integers. Thus we don't have threads competing with each other and taking turns for access to the floating-point cores.
* The conversions are slightly slower for floats with powers-of-10 near zero.  Presumably this is due to code overhead for e.g., handling a large range of inputs, UB-checks for bit shifts, branches for denormals, etc.
* The conversion is slower for decimal128 conversions with very small exponents, which requires several large divisions (128bit divided by 64bit).
* The PYMOD kernel is slower due to register pressure from the introduction of the new division routines in the earlier PR. Even though this benchmark does not perform decimal <--> floating conversions, it gets hit because of inlined template code in the kernel increasing the code/register pressure.

Authors:
  - Paul Mattione (https://github.com/pmattione-nvidia)

Approvers:
  - Jason Lowe (https://github.com/jlowe)
  - Bradley Dice (https://github.com/bdice)
  - Mike Wilson (https://github.com/hyperbolic2346)

URL: https://github.com/rapidsai/cudf/pull/15905
---
 cpp/benchmarks/decimal/convert_floating.cpp   |  17 -
 .../cudf/fixed_point/floating_conversion.hpp  | 964 +++++++++++++++---
 cpp/include/cudf/unary.hpp                    |  35 +-
 cpp/tests/fixed_point/fixed_point_tests.cpp   | 129 ++-
 .../java/ai/rapids/cudf/ColumnVectorTest.java |   6 +-
 python/cudf/cudf/tests/test_decimal.py        |   2 +-
 6 files changed, 958 insertions(+), 195 deletions(-)

diff --git a/cpp/benchmarks/decimal/convert_floating.cpp b/cpp/benchmarks/decimal/convert_floating.cpp
index a367036c494..ac09c3400cb 100644
--- a/cpp/benchmarks/decimal/convert_floating.cpp
+++ b/cpp/benchmarks/decimal/convert_floating.cpp
@@ -32,8 +32,6 @@ void bench_cast_decimal(nvbench::state& state, nvbench::type_list<InputType, Out
 
   static constexpr bool is_double =
     std::is_same_v<InputType, double> || std::is_same_v<OutputType, double>;
-  static constexpr bool is_32bit =
-    std::is_same_v<InputType, numeric::decimal32> || std::is_same_v<OutputType, numeric::decimal32>;
   static constexpr bool is_128bit = std::is_same_v<InputType, numeric::decimal128> ||
                                     std::is_same_v<OutputType, numeric::decimal128>;
 
@@ -69,21 +67,6 @@ void bench_cast_decimal(nvbench::state& state, nvbench::type_list<InputType, Out
     return;
   }
 
-  // The current float <--> decimal conversion algorithm is limited
-  static constexpr bool is_64bit = !is_32bit && !is_128bit;
-  if (is_32bit && (exp_mode != 3)) {
-    state.skip("Decimal32 conversion only works up to scale factors of 10^9.");
-    return;
-  }
-  if (is_64bit && ((exp_mode < 2) || (exp_mode > 4))) {
-    state.skip("Decimal64 conversion only works up to scale factors of 10^18.");
-    return;
-  }
-  if (is_128bit && ((exp_mode == 0) || (exp_mode == 6))) {
-    state.skip("Decimal128 conversion only works up to scale factors of 10^38.");
-    return;
-  }
-
   // Type IDs
   auto const input_id  = cudf::type_to_id<InputType>();
   auto const output_id = cudf::type_to_id<OutputType>();
diff --git a/cpp/include/cudf/fixed_point/floating_conversion.hpp b/cpp/include/cudf/fixed_point/floating_conversion.hpp
index 2c3a5c5629d..c64ae8877d4 100644
--- a/cpp/include/cudf/fixed_point/floating_conversion.hpp
+++ b/cpp/include/cudf/fixed_point/floating_conversion.hpp
@@ -18,6 +18,7 @@
 
 #include <cudf/utilities/traits.hpp>
 
+#include <cuda/std/cmath>
 #include <cuda/std/limits>
 #include <cuda/std/type_traits>
 
@@ -34,6 +35,49 @@ namespace numeric {
 
 namespace detail {
 
+/**
+ * @brief Determine the number of significant bits in an integer
+ *
+ * @tparam T Type of input integer value. Must be either uint32_t, uint64_t, or __uint128_t
+ * @param value The integer whose bits are being counted
+ * @return The number of significant bits: the # of bits - # of leading zeroes
+ */
+template <typename T,
+          CUDF_ENABLE_IF(std::is_same_v<T, uint32_t> || std::is_same_v<T, uint64_t> ||
+                         std::is_same_v<T, __uint128_t>)>
+CUDF_HOST_DEVICE inline int count_significant_bits(T value)
+{
+#ifdef __CUDA_ARCH__
+  if constexpr (std::is_same_v<T, uint64_t>) {
+    return 64 - __clzll(static_cast<int64_t>(value));
+  } else if constexpr (std::is_same_v<T, uint32_t>) {
+    return 32 - __clz(static_cast<int32_t>(value));
+  } else if constexpr (std::is_same_v<T, __uint128_t>) {
+    // 128 bit type, must break up into high and low components
+    auto const high_bits = static_cast<int64_t>(value >> 64);
+    auto const low_bits  = static_cast<int64_t>(value);
+    return 128 - (__clzll(high_bits) + static_cast<int>(high_bits == 0) * __clzll(low_bits));
+  }
+#else
+  // Undefined behavior to call __builtin_clzll() with zero in gcc and clang
+  if (value == 0) { return 0; }
+
+  if constexpr (std::is_same_v<T, uint64_t>) {
+    return 64 - __builtin_clzll(value);
+  } else if constexpr (std::is_same_v<T, uint32_t>) {
+    return 32 - __builtin_clz(value);
+  } else if constexpr (std::is_same_v<T, __uint128_t>) {
+    // 128 bit type, must break up into high and low components
+    auto const high_bits = static_cast<uint64_t>(value >> 64);
+    if (high_bits == 0) {
+      return 64 - __builtin_clzll(static_cast<uint64_t>(value));
+    } else {
+      return 128 - __builtin_clzll(high_bits);
+    }
+  }
+#endif
+}
+
 /**
  * @brief Helper struct for getting and setting the components of a floating-point value
  *
@@ -62,27 +106,28 @@ struct floating_converter {
   // The low 23 / 52 bits (for float / double) are the mantissa.
   // The mantissa is normalized. There is an understood 1 bit to the left of the binary point.
   // The value of the mantissa is in the range [1, 2).
-  /// # mantissa bits (-1 for understood bit)
-  static constexpr int num_mantissa_bits = cuda::std::numeric_limits<FloatingType>::digits - 1;
+  /// # significand bits (includes understood bit)
+  static constexpr int num_significand_bits = cuda::std::numeric_limits<FloatingType>::digits;
+  /// # stored mantissa bits (-1 for understood bit)
+  static constexpr int num_stored_mantissa_bits = num_significand_bits - 1;
   /// The mask for the understood bit
-  static constexpr IntegralType understood_bit_mask = (IntegralType(1) << num_mantissa_bits);
+  static constexpr IntegralType understood_bit_mask = (IntegralType(1) << num_stored_mantissa_bits);
   /// The mask to select the mantissa
   static constexpr IntegralType mantissa_mask = understood_bit_mask - 1;
 
   // And in between are the bits used to store the biased power-of-2 exponent.
   /// # exponents bits (-1 for sign bit)
-  static constexpr int num_exponent_bits = num_floating_bits - num_mantissa_bits - 1;
+  static constexpr int num_exponent_bits = num_floating_bits - num_stored_mantissa_bits - 1;
   /// The mask for the exponents, unshifted
   static constexpr IntegralType unshifted_exponent_mask =
     (IntegralType(1) << num_exponent_bits) - 1;
   /// The mask to select the exponents
-  static constexpr IntegralType exponent_mask = unshifted_exponent_mask << num_mantissa_bits;
+  static constexpr IntegralType exponent_mask = unshifted_exponent_mask << num_stored_mantissa_bits;
 
   // To store positive and negative exponents as unsigned values, the stored value for
   // the power-of-2 is exponent + bias. The bias is 127 for floats and 1023 for doubles.
   /// 127 / 1023 for float / double
-  static constexpr IntegralType exponent_bias =
-    cuda::std::numeric_limits<FloatingType>::max_exponent - 1;
+  static constexpr int exponent_bias = cuda::std::numeric_limits<FloatingType>::max_exponent - 1;
 
   /**
    * @brief Reinterpret the bits of a floating-point value as an integer
@@ -113,15 +158,15 @@ struct floating_converter {
   }
 
   /**
-   * @brief Extracts the integral significand of a bit-casted floating-point number
+   * @brief Checks whether the bit-casted floating-point value is +/-0
    *
-   * @param integer_rep The bit-casted floating value to extract the exponent from
-   * @return The integral significand, bit-shifted to a (large) whole number
+   * @param integer_rep The bit-casted floating value to check if is +/-0
+   * @return True if is a zero, else false
    */
-  CUDF_HOST_DEVICE inline static IntegralType get_base2_value(IntegralType integer_rep)
+  CUDF_HOST_DEVICE inline static bool is_zero(IntegralType integer_rep)
   {
-    // Extract the significand, setting the high bit for the understood 1/2
-    return (integer_rep & mantissa_mask) | understood_bit_mask;
+    // It's a zero if every non-sign bit is zero
+    return ((integer_rep & ~sign_mask) == 0);
   }
 
   /**
@@ -137,40 +182,59 @@ struct floating_converter {
   }
 
   /**
-   * @brief Extracts the exponent of a bit-casted floating-point number
+   * @brief Extracts the significand and exponent of a bit-casted floating-point number,
+   * shifted for denormals.
    *
-   * @note This returns INT_MIN for +/-0, +/-inf, NaN's, and denormals
-   * For all of these cases, the decimal fixed_point number should be set to zero
+   * @note Zeros/inf/NaN not handled.
    *
    * @param integer_rep The bit-casted floating value to extract the exponent from
-   * @return The stored base-2 exponent, or INT_MIN for special values
+   * @return The stored base-2 exponent and significand, shifted for denormals
    */
-  CUDF_HOST_DEVICE inline static int get_exp2(IntegralType integer_rep)
+  CUDF_HOST_DEVICE inline static std::pair<IntegralType, int> get_significand_and_pow2(
+    IntegralType integer_rep)
   {
-    // First extract the exponent bits and handle its special values.
-    // To minimize branching, all of these special cases will return INT_MIN.
-    // For all of these cases, the decimal fixed_point number should be set to zero.
+    // Extract the significand
+    auto significand = (integer_rep & mantissa_mask);
+
+    // Extract the exponent bits.
     auto const exponent_bits = integer_rep & exponent_mask;
+
+    // Notes on special values of exponent_bits:
+    // bits = exponent_mask is +/-inf or NaN, but those are handled prior to input.
+    // bits = 0 is either a denormal (handled below) or a zero (handled earlier by caller).
+    int floating_pow2;
     if (exponent_bits == 0) {
-      // Because of the understood set-bit not stored in the mantissa, it is not possible
-      // to store the value zero directly. Instead both +/-0 and denormals are represented with
-      // the exponent bits set to zero.
-      // Thus it's fastest to just floor (generally unwanted) denormals to zero.
-      return INT_MIN;
-    } else if (exponent_bits == exponent_mask) {
-      //+/-inf and NaN values are stored with all of the exponent bits set.
-      // As none of these are representable by integers, we'll return the same value for all cases.
-      return INT_MIN;
+      // Denormal values are 2^(1 - exponent_bias) * Sum_i(B_i * 2^-i)
+      // Where i is the i-th mantissa bit (counting from the LEFT, starting at 1),
+      // and B_i is the value of that bit (0 or 1)
+      // So e.g. for the minimum denormal, only the lowest bit is set:
+      // FLT_TRUE_MIN = 2^(1 - 127) * 2^-23 = 2^-149
+      // DBL_TRUE_MIN = 2^(1 - 1023) * 2^-52 = 2^-1074
+      floating_pow2 = 1 - exponent_bias;
+
+      // Line-up denormal to same (understood) bit as normal numbers
+      // This is so bit-shifting starts at the same bit index
+      auto const lineup_shift = num_significand_bits - count_significant_bits(significand);
+      significand <<= lineup_shift;
+      floating_pow2 -= lineup_shift;
+    } else {
+      // Extract the exponent value: shift the bits down and subtract the bias.
+      auto const shifted_exponent_bits = exponent_bits >> num_stored_mantissa_bits;
+      floating_pow2                    = static_cast<int>(shifted_exponent_bits) - exponent_bias;
+
+      // Set the high bit for the understood 1/2
+      significand |= understood_bit_mask;
     }
 
-    // Extract the exponent value: shift the bits down and subtract the bias.
-    using SignedIntegralType                       = cuda::std::make_signed_t<IntegralType>;
-    SignedIntegralType const shifted_exponent_bits = exponent_bits >> num_mantissa_bits;
-    return shifted_exponent_bits - static_cast<SignedIntegralType>(exponent_bias);
+    // To convert the mantissa to an integer, we effectively applied #-mantissa-bits
+    // powers of 2 to convert the fractional value to an integer, so subtract them off here
+    int const pow2 = floating_pow2 - num_stored_mantissa_bits;
+
+    return {significand, pow2};
   }
 
   /**
-   * @brief Sets the sign bit of a positive floating-point number
+   * @brief Sets the sign bit of a floating-point number
    *
    * @param floating The floating-point value to set the sign of. Must be positive.
    * @param is_negative The sign bit to set for the floating-point number
@@ -192,83 +256,60 @@ struct floating_converter {
   /**
    * @brief Adds to the base-2 exponent of a floating-point number
    *
+   * @note The caller must guarantee that the input is a positive (> 0) whole number.
+   *
    * @param floating The floating value to add to the exponent of. Must be positive.
-   * @param exp2 The power-of-2 to add to the floating-point number
-   * @return The input floating-point value * 2^exp2
+   * @param pow2 The power-of-2 to add to the floating-point number
+   * @return The input floating-point value * 2^pow2
    */
-  CUDF_HOST_DEVICE inline static FloatingType add_exp2(FloatingType floating, int exp2)
+  CUDF_HOST_DEVICE inline static FloatingType add_pow2(FloatingType floating, int pow2)
   {
+    // Note that the input floating-point number is positive (& whole), so we don't have to
+    // worry about the sign here; the sign will be set later in set_is_negative()
+
     // Convert floating to integer
     auto integer_rep = bit_cast_to_integer(floating);
 
     // Extract the currently stored (biased) exponent
+    using SignedType   = std::make_signed_t<IntegralType>;
     auto exponent_bits = integer_rep & exponent_mask;
-    auto stored_exp2   = exponent_bits >> num_mantissa_bits;
+    auto stored_pow2   = static_cast<SignedType>(exponent_bits >> num_stored_mantissa_bits);
 
     // Add the additional power-of-2
-    stored_exp2 += exp2;
+    stored_pow2 += pow2;
 
     // Check for exponent over/under-flow.
-    // Note that the input floating-point number is always positive, so we don't have to
-    // worry about the sign here; the sign will be set later in set_is_negative()
-    if (stored_exp2 <= 0) {
-      return 0.0;
-    } else if (stored_exp2 >= unshifted_exponent_mask) {
+    if (stored_pow2 <= 0) {
+      // Denormal (zero handled prior to input)
+
+      // Early out if bit shift will zero it anyway.
+      // Note: We must handle this explicitly, as too-large a bit-shift is UB
+      auto const bit_shift = -stored_pow2 + 1;  //+1 due to understood bit set below
+      if (bit_shift > num_stored_mantissa_bits) { return 0.0; }
+
+      // Clear the exponent bits (zero means 2^-126/2^-1022 w/ no understood bit)
+      integer_rep &= (~exponent_mask);
+
+      // The input floating-point number has an "understood" bit that we need to set
+      // prior to bit-shifting. Set the understood bit.
+      integer_rep |= understood_bit_mask;
+
+      // Convert to denormal: bit shift off the low bits
+      integer_rep >>= bit_shift;
+    } else if (stored_pow2 >= static_cast<SignedType>(unshifted_exponent_mask)) {
+      // Overflow: Set infinity
       return cuda::std::numeric_limits<FloatingType>::infinity();
     } else {
-      // Clear existing exponent bits and set new ones
-      exponent_bits = stored_exp2 << num_mantissa_bits;
+      // Normal number: Clear existing exponent bits and set new ones
+      exponent_bits = static_cast<IntegralType>(stored_pow2) << num_stored_mantissa_bits;
       integer_rep &= (~exponent_mask);
       integer_rep |= exponent_bits;
-
-      // Convert back to float
-      return bit_cast_to_floating(integer_rep);
     }
-  }
-};
 
-/**
- * @brief Determine the number of significant bits in an integer
- *
- * @tparam T Type of input integer value. Must be either uint32_t, uint64_t, or __uint128_t
- * @param value The integer whose bits are being counted
- * @return The number of significant bits: the # of bits - # of leading zeroes
- */
-template <typename T,
-          CUDF_ENABLE_IF(std::is_same_v<T, uint32_t> || std::is_same_v<T, uint64_t> ||
-                         std::is_same_v<T, __uint128_t>)>
-CUDF_HOST_DEVICE inline int count_significant_bits(T value)
-{
-#ifdef __CUDA_ARCH__
-  if constexpr (std::is_same_v<T, uint64_t>) {
-    return 64 - __clzll(static_cast<int64_t>(value));
-  } else if constexpr (std::is_same_v<T, uint32_t>) {
-    return 32 - __clz(static_cast<int32_t>(value));
-  } else if constexpr (std::is_same_v<T, __uint128_t>) {
-    // 128 bit type, must break up into high and low components
-    auto const high_bits = static_cast<int64_t>(value >> 64);
-    auto const low_bits  = static_cast<int64_t>(value);
-    return 128 - (__clzll(high_bits) + static_cast<int>(high_bits == 0) * __clzll(low_bits));
-  }
-#else
-  // Undefined behavior to call __builtin_clzll() with zero in gcc and clang
-  if (value == 0) { return 0; }
-
-  if constexpr (std::is_same_v<T, uint64_t>) {
-    return 64 - __builtin_clzll(value);
-  } else if constexpr (std::is_same_v<T, uint32_t>) {
-    return 32 - __builtin_clz(value);
-  } else if constexpr (std::is_same_v<T, __uint128_t>) {
-    // 128 bit type, must break up into high and low components
-    auto const high_bits = static_cast<uint64_t>(value >> 64);
-    if (high_bits == 0) {
-      return 64 - __builtin_clzll(static_cast<uint64_t>(value));
-    } else {
-      return 128 - __builtin_clzll(high_bits);
-    }
+    // Convert back to float
+    return bit_cast_to_floating(integer_rep);
   }
-#endif
-}
+};
 
 /**
  * @brief Recursively calculate a signed large power of 10 (>= 10^19) that can only be stored in an
@@ -276,18 +317,18 @@ CUDF_HOST_DEVICE inline int count_significant_bits(T value)
  *
  * @note Intended to be run at compile time.
  *
- * @tparam Exp10 The power of 10 to calculate
- * @return Returns 10^Exp10
+ * @tparam Pow10 The power of 10 to calculate
+ * @return Returns 10^Pow10
  */
-template <int Exp10>
+template <int Pow10>
 constexpr __uint128_t large_power_of_10()
 {
   // Stop at 10^19 to speed up compilation; literals can be used for smaller powers of 10.
-  static_assert(Exp10 >= 19);
-  if constexpr (Exp10 == 19)
+  static_assert(Pow10 >= 19);
+  if constexpr (Pow10 == 19)
     return __uint128_t(10000000000000000000ULL);
   else
-    return large_power_of_10<Exp10 - 1>() * __uint128_t(10);
+    return large_power_of_10<Pow10 - 1>() * __uint128_t(10);
 }
 
 /**
@@ -295,11 +336,11 @@ constexpr __uint128_t large_power_of_10()
  *
  * @tparam T Type of value to be divided-from.
  * @param value The number to be divided-from.
- * @param exp10 The power-of-10 of the denominator, from 0 to 9 inclusive.
- * @return Returns value / 10^exp10
+ * @param pow10 The power-of-10 of the denominator, from 0 to 9 inclusive.
+ * @return Returns value / 10^pow10
  */
-template <typename T, typename cuda::std::enable_if_t<cuda::std::is_unsigned_v<T>>* = nullptr>
-CUDF_HOST_DEVICE inline T divide_power10_32bit(T value, int exp10)
+template <typename T, CUDF_ENABLE_IF(cuda::std::is_unsigned_v<T>)>
+CUDF_HOST_DEVICE inline T divide_power10_32bit(T value, int pow10)
 {
   // Computing division this way is much faster than the alternatives.
   // Division is not implemented in GPU hardware, and the compiler will often implement it as a
@@ -309,7 +350,7 @@ CUDF_HOST_DEVICE inline T divide_power10_32bit(T value, int exp10)
 
   // Instead, if the compiler can see exactly what number it is dividing by, it can
   // produce much more optimal assembly, doing bit shifting, multiplies by a constant, etc.
-  // For the compiler to see the value though, array lookup (with exp10 as the index)
+  // For the compiler to see the value though, array lookup (with pow10 as the index)
   // is not sufficient: We have to use a switch statement. Although this introduces a branch,
   // it is still much faster than doing the divide any other way.
   // Perhaps an array can be used in C++23 with the assume attribute?
@@ -325,7 +366,7 @@ CUDF_HOST_DEVICE inline T divide_power10_32bit(T value, int exp10)
   // introduces too much pressure on the kernels that use this code, slowing down their benchmarks.
   // It also dramatically slows down the compile time.
 
-  switch (exp10) {
+  switch (pow10) {
     case 0: return value;
     case 1: return value / 10U;
     case 2: return value / 100U;
@@ -345,14 +386,14 @@ CUDF_HOST_DEVICE inline T divide_power10_32bit(T value, int exp10)
  *
  * @tparam T Type of value to be divided-from.
  * @param value The number to be divided-from.
- * @param exp10 The power-of-10 of the denominator, from 0 to 19 inclusive.
- * @return Returns value / 10^exp10
+ * @param pow10 The power-of-10 of the denominator, from 0 to 19 inclusive.
+ * @return Returns value / 10^pow10
  */
-template <typename T, typename cuda::std::enable_if_t<cuda::std::is_unsigned_v<T>>* = nullptr>
-CUDF_HOST_DEVICE inline T divide_power10_64bit(T value, int exp10)
+template <typename T, CUDF_ENABLE_IF(cuda::std::is_unsigned_v<T>)>
+CUDF_HOST_DEVICE inline T divide_power10_64bit(T value, int pow10)
 {
   // See comments in divide_power10_32bit() for discussion.
-  switch (exp10) {
+  switch (pow10) {
     case 0: return value;
     case 1: return value / 10U;
     case 2: return value / 100U;
@@ -382,14 +423,14 @@ CUDF_HOST_DEVICE inline T divide_power10_64bit(T value, int exp10)
  *
  * @tparam T Type of value to be divided-from.
  * @param value The number to be divided-from.
- * @param exp10 The power-of-10 of the denominator, from 0 to 38 inclusive.
- * @return Returns value / 10^exp10.
+ * @param pow10 The power-of-10 of the denominator, from 0 to 38 inclusive.
+ * @return Returns value / 10^pow10.
  */
-template <typename T, typename cuda::std::enable_if_t<cuda::std::is_unsigned_v<T>>* = nullptr>
-CUDF_HOST_DEVICE inline constexpr T divide_power10_128bit(T value, int exp10)
+template <typename T, CUDF_ENABLE_IF(cuda::std::is_unsigned_v<T>)>
+CUDF_HOST_DEVICE inline constexpr T divide_power10_128bit(T value, int pow10)
 {
   // See comments in divide_power10_32bit() for an introduction.
-  switch (exp10) {
+  switch (pow10) {
     case 0: return value;
     case 1: return value / 10U;
     case 2: return value / 100U;
@@ -438,14 +479,14 @@ CUDF_HOST_DEVICE inline constexpr T divide_power10_128bit(T value, int exp10)
  *
  * @tparam T Type of value to be multiplied.
  * @param value The number to be multiplied.
- * @param exp10 The power-of-10 of the multiplier, from 0 to 9 inclusive.
- * @return Returns value * 10^exp10
+ * @param pow10 The power-of-10 of the multiplier, from 0 to 9 inclusive.
+ * @return Returns value * 10^pow10
  */
-template <typename T, typename cuda::std::enable_if_t<cuda::std::is_unsigned_v<T>>* = nullptr>
-CUDF_HOST_DEVICE inline constexpr T multiply_power10_32bit(T value, int exp10)
+template <typename T, CUDF_ENABLE_IF(cuda::std::is_unsigned_v<T>)>
+CUDF_HOST_DEVICE inline constexpr T multiply_power10_32bit(T value, int pow10)
 {
   // See comments in divide_power10_32bit() for discussion.
-  switch (exp10) {
+  switch (pow10) {
     case 0: return value;
     case 1: return value * 10U;
     case 2: return value * 100U;
@@ -465,14 +506,14 @@ CUDF_HOST_DEVICE inline constexpr T multiply_power10_32bit(T value, int exp10)
  *
  * @tparam T Type of value to be multiplied.
  * @param value The number to be multiplied.
- * @param exp10 The power-of-10 of the multiplier, from 0 to 19 inclusive.
- * @return Returns value * 10^exp10
+ * @param pow10 The power-of-10 of the multiplier, from 0 to 19 inclusive.
+ * @return Returns value * 10^pow10
  */
-template <typename T, typename cuda::std::enable_if_t<cuda::std::is_unsigned_v<T>>* = nullptr>
-CUDF_HOST_DEVICE inline constexpr T multiply_power10_64bit(T value, int exp10)
+template <typename T, CUDF_ENABLE_IF(cuda::std::is_unsigned_v<T>)>
+CUDF_HOST_DEVICE inline constexpr T multiply_power10_64bit(T value, int pow10)
 {
   // See comments in divide_power10_32bit() for discussion.
-  switch (exp10) {
+  switch (pow10) {
     case 0: return value;
     case 1: return value * 10U;
     case 2: return value * 100U;
@@ -502,14 +543,14 @@ CUDF_HOST_DEVICE inline constexpr T multiply_power10_64bit(T value, int exp10)
  *
  * @tparam T Type of value to be multiplied.
  * @param value The number to be multiplied.
- * @param exp10 The power-of-10 of the multiplier, from 0 to 38 inclusive.
- * @return Returns value * 10^exp10.
+ * @param pow10 The power-of-10 of the multiplier, from 0 to 38 inclusive.
+ * @return Returns value * 10^pow10.
  */
-template <typename T, typename cuda::std::enable_if_t<cuda::std::is_unsigned_v<T>>* = nullptr>
-CUDF_HOST_DEVICE inline constexpr T multiply_power10_128bit(T value, int exp10)
+template <typename T, CUDF_ENABLE_IF(cuda::std::is_unsigned_v<T>)>
+CUDF_HOST_DEVICE inline constexpr T multiply_power10_128bit(T value, int pow10)
 {
   // See comments in divide_power10_128bit() for discussion.
-  switch (exp10) {
+  switch (pow10) {
     case 0: return value;
     case 1: return value * 10U;
     case 2: return value * 100U;
@@ -556,59 +597,678 @@ CUDF_HOST_DEVICE inline constexpr T multiply_power10_128bit(T value, int exp10)
 /**
  * @brief Multiply an integer by a power of 10.
  *
- * @note Use this function if you have no a-priori knowledge of what exp10 might be.
+ * @note Use this function if you have no a-priori knowledge of what pow10 might be.
  * If you do, prefer calling the bit-size-specific versions
  *
  * @tparam Rep Representation type needed for integer exponentiation
  * @tparam T Integral type of value to be multiplied.
  * @param value The number to be multiplied.
- * @param exp10 The power-of-10 of the multiplier.
- * @return Returns value * 10^exp10
+ * @param pow10 The power-of-10 of the multiplier.
+ * @return Returns value * 10^pow10
  */
-template <typename Rep,
-          typename T,
-          typename cuda::std::enable_if_t<(cuda::std::is_unsigned_v<T>)>* = nullptr>
-CUDF_HOST_DEVICE inline constexpr T multiply_power10(T value, int exp10)
+template <typename Rep, typename T, CUDF_ENABLE_IF(cuda::std::is_unsigned_v<T>)>
+CUDF_HOST_DEVICE inline constexpr T multiply_power10(T value, int pow10)
 {
-  // Use this function if you have no knowledge of what exp10 might be
+  // Use this function if you have no knowledge of what pow10 might be
   // If you do, prefer calling the bit-size-specific versions
   if constexpr (sizeof(Rep) <= 4) {
-    return multiply_power10_32bit(value, exp10);
+    return multiply_power10_32bit(value, pow10);
   } else if constexpr (sizeof(Rep) <= 8) {
-    return multiply_power10_64bit(value, exp10);
+    return multiply_power10_64bit(value, pow10);
   } else {
-    return multiply_power10_128bit(value, exp10);
+    return multiply_power10_128bit(value, pow10);
   }
 }
 
 /**
  * @brief Divide an integer by a power of 10.
  *
- * @note Use this function if you have no a-priori knowledge of what exp10 might be.
+ * @note Use this function if you have no a-priori knowledge of what pow10 might be.
  * If you do, prefer calling the bit-size-specific versions
  *
  * @tparam Rep Representation type needed for integer exponentiation
  * @tparam T Integral type of value to be divided-from.
  * @param value The number to be divided-from.
- * @param exp10 The power-of-10 of the denominator.
- * @return Returns value / 10^exp10
+ * @param pow10 The power-of-10 of the denominator.
+ * @return Returns value / 10^pow10
  */
-template <typename Rep,
-          typename T,
-          typename cuda::std::enable_if_t<(cuda::std::is_unsigned_v<T>)>* = nullptr>
-CUDF_HOST_DEVICE inline constexpr T divide_power10(T value, int exp10)
+template <typename Rep, typename T, CUDF_ENABLE_IF(cuda::std::is_unsigned_v<T>)>
+CUDF_HOST_DEVICE inline constexpr T divide_power10(T value, int pow10)
 {
-  // Use this function if you have no knowledge of what exp10 might be
+  // Use this function if you have no knowledge of what pow10 might be
   // If you do, prefer calling the bit-size-specific versions
   if constexpr (sizeof(Rep) <= 4) {
-    return divide_power10_32bit(value, exp10);
+    return divide_power10_32bit(value, pow10);
   } else if constexpr (sizeof(Rep) <= 8) {
-    return divide_power10_64bit(value, exp10);
+    return divide_power10_64bit(value, pow10);
   } else {
-    return divide_power10_128bit(value, exp10);
+    return divide_power10_128bit(value, pow10);
   }
 }
 
+/**
+ * @brief Perform a bit-shift left, guarding against undefined behavior
+ *
+ * @tparam IntegerType Type of input unsigned integer value
+ * @param value The integer whose bits are being shifted
+ * @param bit_shift The number of bits to shift left
+ * @return The bit-shifted integer, except max value if UB would occur
+ */
+template <typename IntegerType, CUDF_ENABLE_IF(cuda::std::is_unsigned_v<IntegerType>)>
+CUDF_HOST_DEVICE inline IntegerType guarded_left_shift(IntegerType value, int bit_shift)
+{
+  // Bit shifts larger than this are undefined behavior
+  constexpr int max_safe_bit_shift = cuda::std::numeric_limits<IntegerType>::digits - 1;
+  return (bit_shift <= max_safe_bit_shift) ? value << bit_shift
+                                           : cuda::std::numeric_limits<IntegerType>::max();
+}
+
+/**
+ * @brief Perform a bit-shift right, guarding against undefined behavior
+ *
+ * @tparam IntegerType Type of input unsigned integer value
+ * @param value The integer whose bits are being shifted
+ * @param bit_shift The number of bits to shift right
+ * @return The bit-shifted integer, which is zero on underflow
+ */
+template <typename IntegerType, CUDF_ENABLE_IF(cuda::std::is_unsigned_v<IntegerType>)>
+CUDF_HOST_DEVICE inline IntegerType guarded_right_shift(IntegerType value, int bit_shift)
+{
+  // Bit shifts larger than this are undefined behavior
+  constexpr int max_safe_bit_shift = cuda::std::numeric_limits<IntegerType>::digits - 1;
+  return (bit_shift <= max_safe_bit_shift) ? value >> bit_shift : 0;
+}
+
+/**
+ * @brief Helper struct with common constants needed by the floating <--> decimal conversions
+ */
+template <typename FloatingType>
+struct shifting_constants {
+  /// Whether the type is double
+  static constexpr bool is_double = cuda::std::is_same_v<FloatingType, double>;
+
+  /// Integer type that can hold the value of the significand
+  using IntegerRep = std::conditional_t<is_double, uint64_t, uint32_t>;
+
+  /// Num bits needed to hold the significand
+  static constexpr auto num_significand_bits = cuda::std::numeric_limits<FloatingType>::digits;
+
+  /// Shift data back and forth in space of a type with 2x the starting bits, to give us enough room
+  using ShiftingRep = std::conditional_t<is_double, __uint128_t, uint64_t>;
+
+  // The significand of a float / double is 24 / 53 bits
+  // However, to uniquely represent each double / float as different #'s in decimal
+  // you need 17 / 9 digits (from std::numeric_limits<T>::max_digits10)
+  // To represent 10^17 / 10^9, you need 57 / 30 bits
+  // So we need to keep track of at least this # of bits during shifting to ensure no info is lost
+
+  // We will be alternately shifting our data back and forth by powers of 2 and 10 to convert
+  // between floating and decimal (see shifting functions for details).
+
+  // To iteratively shift back and forth, our 2's (bit-) and 10's (divide-/multiply-) shifts must
+  // be of nearly the same magnitude, or else we'll over-/under-flow our shifting integer
+
+  // 2^10 is approximately 10^3, so the largest shifts will have a 10/3 ratio
+  // The difference between 2^10 and 10^3 is 1024/1000: 2.4%
+  // So every time we shift by 10 bits and 3 decimal places, the 2s shift is an extra 2.4%
+
+  // This 2.4% error compounds each time we do an iteration.
+  // The min (normal) float is 2^-126.
+  // Min denormal: 2^-126 * 2^-23 (mantissa bits): 2^-149 = ~1.4E-45
+  // With our 10/3 shifting ratio, 149 (bit-shifts) * (3 / 10) = 44.7 (10s-shifts)
+  // 10^(-44.7) = 2E-45, which is off by ~1.4x from 1.4E-45
+
+  // Similarly, the min (normal) double is 2^-1022.
+  // Min denormal: 2^-1022 * 2^-52 (mantissa bits): 2^-1074 = 4.94E-324
+  // With our 10/3 shifting ratio, 1074 (bit-shifts) * (3 / 10) = 322.2 (10s-shifts)
+  // 10^(-322.2) = 6.4E-323, which is off by ~13.2x from 4.94E-324
+
+  // To account for this compounding error, we can either complicate our loop code (slow),
+  // or use extra bits (in the direction we're shifting the 2s!) to compensate:
+  // 4 extra bits for doubles (2^4 = 16 > 13.2x error), 1 extra for floats (2 > 1.4x error)
+  /// # buffer bits to account for shifting error
+  static constexpr int num_2s_shift_buffer_bits = is_double ? 4 : 1;
+
+  // How much room do we have for shifting?
+  // Float: 64-bit ShiftingRep - 31 (rep + buffer) = 33 bits. 2^33 = 8.6E9
+  // Double: 128-bit ShiftingRep - 61 (rep + buffer) = 67 bits. 2^67 = 1.5E20
+  // Thus for double / float we can shift up to 20 / 9 decimal places at once
+
+  // But, we need to stick to our 10-bits / 3-decimals shift ratio to not over/under-flow.
+  // To simplify our loop code, we'll keep to this ratio by instead shifting a max of
+  // 18 / 9 decimal places, for double / float (60 / 30 bits)
+  /// Max at-once decimal place shift
+  static constexpr int max_digits_shift = is_double ? 18 : 9;
+  /// Max at-once bit shift
+  static constexpr int max_bits_shift = max_digits_shift * 10 / 3;
+
+  // Pre-calculate 10^max_digits_shift. Note that 10^18 / 10^9 fits within IntegerRep
+  /// 10^max_digits_shift
+  static constexpr auto max_digits_shift_pow =
+    multiply_power10<IntegerRep>(IntegerRep(1), max_digits_shift);
+};
+
+/**
+ * @brief Add half a bit to integer rep of floating point if conversion causes truncation
+ *
+ * @note This fixes problems like 1.2 (value = 1.1999...) at scale -1 -> 11
+ *
+ * @tparam FloatingType Type of integer holding the floating-point significand
+ * @param floating The floating-point number to convert
+ * @param integer_rep The integer representation of the floating-point significand
+ * @param pow2 The power of 2 that needs to be applied to the significand
+ * @param pow10 The power of 10 that needs to be applied to the significand
+ * @return integer_rep, shifted 1 and ++'d if the conversion to decimal causes truncation
+ */
+template <typename FloatingType, CUDF_ENABLE_IF(cuda::std::is_floating_point_v<FloatingType>)>
+CUDF_HOST_DEVICE cuda::std::pair<typename floating_converter<FloatingType>::IntegralType, int>
+add_half_if_truncates(FloatingType floating,
+                      typename floating_converter<FloatingType>::IntegralType integer_rep,
+                      int pow2,
+                      int pow10)
+{
+  // The user-supplied scale may truncate information, so we need to talk about rounding.
+  // We have chosen not to round, so we want 1.23456f with scale -4 to be decimal 12345
+
+  // But if we don't round at all, 1.2 (double) with scale -1 is 11 instead of 12!
+  // Why? Because 1.2 (double) is actually stored as 1.1999999... which we truncate to 1.1
+  // While correct (given our choice to truncate), this is surprising and undesirable.
+  // This problem happens because 1.2 is not perfectly representable in floating point,
+  // and the value 1.199999... happened to be closer to 1.2 than the next value (1.2000...1...)
+
+  // If the scale truncates information (we didn't choose to keep exactly 1.1999...), how
+  // do we make sure we store 1.2?  We'll add half an ulp! (unit in the last place)
+  // Then 1.1999... becomes 1.2000...1... which truncates to 1.2.
+  // And if it had been 1.2000...1..., adding half an ulp still truncates to 1.2
+
+  // Why 1/2 an ulp? Because that's all that is needed. The reason we have this problem in the
+  // first place is because the compiler rounded (e.g.) 1.2 to the nearest floating point number.
+  // The distance of this rounding is at most 1/2 ulp, otherwise we'd have rounded the other way.
+
+  // How do we add 1/2 an ulp? Just shift the bits left (updating pow2) and add 1.
+  // We'll always shift up so every input to the conversion algorithm is aligned the same way.
+
+  // If we add a full ulp we run into issues where we add too much and get the wrong result.
+  // This is because (e.g.) 2^23 = 8.4E6 which is not quite 7 digits of precision.
+  // So if we want 7 digits, that may "barely" truncate information; adding a 1 ulp is overkill.
+
+  // So when does the user-supplied scale truncate info?
+  // For powers > 0: When the 10s (scale) shift is larger than the corresponding bit-shift.
+  // For powers < 0: When the 10s shift is less than the corresponding bit-shift.
+
+  // Corresponding bit-shift:
+  // 2^10 is approximately 10^3, but this is off by 1.024%
+  // 1.024^30 is 2.03704, so this is high by one bit for every 30*3 = 90 powers of 10
+  // So 10^N = 2^(10*N/3 - N/90) = 2^(299*N/90)
+  // Do comparison without dividing, which loses information:
+  // Note: if shift is "equal," still truncates if pow2 < 0 (shifting UP by 2s, 2^10 > 10^3)
+  int const pow2_term  = 90 * pow2;
+  int const pow10_term = 299 * pow10;
+  bool const conversion_truncates =
+    (pow10_term > pow2_term) || ((pow2_term == pow10_term) && (pow2 < 0));
+
+  // However, don't add a half-bit if the input is a whole number!
+  // This is only for errors introduced by rounding decimal fractions!
+  bool const is_whole_number = (cuda::std::floor(floating) == floating);
+  bool const add_half_bit    = conversion_truncates && !is_whole_number;
+
+  // Add half a bit on truncation (shift to make room and update pow2)
+  integer_rep <<= 1;
+  --pow2;
+  integer_rep += static_cast<decltype(integer_rep)>(add_half_bit);
+
+  return {integer_rep, pow2};
+}
+
+/**
+ * @brief Perform base-2 -> base-10 fixed-point conversion for pow10 > 0
+ *
+ * @tparam Rep The type of the storage for the decimal value
+ * @tparam FloatingType The type of the original floating-point value we are converting from
+ * @param base2_value The base-2 fixed-point value we are converting from
+ * @param pow2 The number of powers of 2 to apply to convert from base-2
+ * @param pow10 The number of powers of 10 to apply to reach the desired scale factor
+ * @return Magnitude of the converted-to decimal integer
+ */
+template <typename Rep,
+          typename FloatingType,
+          CUDF_ENABLE_IF(cuda::std::is_floating_point_v<FloatingType>)>
+CUDF_HOST_DEVICE inline cuda::std::make_unsigned_t<Rep> shift_to_decimal_pospow(
+  typename shifting_constants<FloatingType>::IntegerRep const base2_value, int pow2, int pow10)
+{
+  // To convert to decimal, we need to apply the input powers of 2 and 10
+  // The result will be (integer) base2_value * (2^pow2) / (10^pow10)
+  // Output type is ShiftingRep
+
+  // Here pow10 > 0 and pow2 > 0, so we need to shift left by 2s and divide by 10s.
+  // We'll iterate back and forth between them, shifting up by 2s
+  // and down by 10s until all of the powers have been applied.
+
+  // However the input base2_value type has virtually no spare room to shift our data
+  // without over- or under-flowing and losing precision.
+  // So we'll cast up to ShiftingRep: uint64 for float's, __uint128_t for double's
+  using Constants   = shifting_constants<FloatingType>;
+  using ShiftingRep = typename Constants::ShiftingRep;
+  auto shifting_rep = static_cast<ShiftingRep>(base2_value);
+
+  // We want to start with our significand bits at the top of the shifting range,
+  // so that we don't lose information we need on intermediary right-shifts.
+  // Note that since we're shifting 2s up, we need num_2s_shift_buffer_bits space on the high side,
+  // For all numbers this bit shift is a fixed distance, due to the understood 2^0 bit.
+  // Note that shift_from is +1 due to shift in add_half_if_truncates()
+  static constexpr int shift_up_to = sizeof(ShiftingRep) * 8 - Constants::num_2s_shift_buffer_bits;
+  static constexpr int shift_from  = Constants::num_significand_bits + 1;
+  static constexpr int max_init_shift = shift_up_to - shift_from;
+
+  // If our total bit shift is less than this, we don't need to iterate
+  using UnsignedRep = cuda::std::make_unsigned_t<Rep>;
+  if (pow2 <= max_init_shift) {
+    // Shift bits left, divide by 10s to apply the scale factor, and we're done.
+    shifting_rep = divide_power10<ShiftingRep>(shifting_rep << pow2, pow10);
+    // NOTE: Cast can overflow!
+    return static_cast<UnsignedRep>(shifting_rep);
+  }
+
+  // We need to iterate. Do the combined initial shift
+  shifting_rep <<= max_init_shift;
+  pow2 -= max_init_shift;
+
+  // Iterate, dividing by 10s and shifting up by 2s until we're almost done
+  while (pow10 > Constants::max_digits_shift) {
+    // More decimal places to shift than we have room: Divide the max number of 10s
+    shifting_rep /= Constants::max_digits_shift_pow;
+    pow10 -= Constants::max_digits_shift;
+
+    // If our remaining bit shift is less than the max, we're finished iterating
+    if (pow2 <= Constants::max_bits_shift) {
+      // Shift bits left, divide by 10s to apply the scale factor, and we're done.
+      shifting_rep = divide_power10<ShiftingRep>(shifting_rep << pow2, pow10);
+
+      // NOTE: Cast can overflow!
+      return static_cast<UnsignedRep>(shifting_rep);
+    }
+
+    // Shift the max number of bits left again
+    shifting_rep <<= Constants::max_bits_shift;
+    pow2 -= Constants::max_bits_shift;
+  }
+
+  // Last 10s-shift: Divide all remaining decimal places, shift all remaining bits, then bail
+  // Note: This divide result may not fit in the low half of the bit range
+  // But the divisor is less than the max-shift, and thus fits within 64 / 32 bits
+  if constexpr (Constants::is_double) {
+    shifting_rep = divide_power10_64bit(shifting_rep, pow10);
+  } else {
+    shifting_rep = divide_power10_32bit(shifting_rep, pow10);
+  }
+
+  // Final bit shift: Shift may be large, guard against UB
+  // NOTE: This can overflow (both cast and shift)!
+  return guarded_left_shift(static_cast<UnsignedRep>(shifting_rep), pow2);
+}
+
+/**
+ * @brief Perform base-2 -> base-10 fixed-point conversion for pow10 < 0
+ *
+ * @tparam Rep The type of the storage for the decimal value
+ * @tparam FloatingType The type of the original floating-point value we are converting from
+ * @param base2_value The base-2 fixed-point value we are converting from
+ * @param pow2 The number of powers of 2 to apply to convert from base-2
+ * @param pow10 The number of powers of 10 to apply to reach the desired scale factor
+ * @return Magnitude of the converted-to decimal integer
+ */
+template <typename Rep,
+          typename FloatingType,
+          CUDF_ENABLE_IF(cuda::std::is_floating_point_v<FloatingType>)>
+CUDF_HOST_DEVICE inline cuda::std::make_unsigned_t<Rep> shift_to_decimal_negpow(
+  typename shifting_constants<FloatingType>::IntegerRep base2_value, int pow2, int pow10)
+{
+  // This is similar to shift_to_decimal_pospow(), except pow10 < 0 & pow2 < 0
+  // See comments in that function for details.
+  // Instead here we need to multiply by 10s and shift right by 2s
+
+  // ShiftingRep: uint64 for float's, __uint128_t for double's
+  using Constants   = shifting_constants<FloatingType>;
+  using ShiftingRep = typename Constants::ShiftingRep;
+  auto shifting_rep = static_cast<ShiftingRep>(base2_value);
+
+  // Convert to using positive values so we don't have keep negating
+  int pow10_mag = -pow10;
+  int pow2_mag  = -pow2;
+
+  // For performing final 10s-shift
+  using UnsignedRep        = cuda::std::make_unsigned_t<Rep>;
+  auto final_shifts_low10s = [&]() {
+    // Last 10s-shift: multiply all remaining decimal places, shift all remaining bits, then bail
+    // The multiplier is less than the max-shift, and thus fits within 64 / 32 bits
+    if constexpr (Constants::is_double) {
+      shifting_rep = multiply_power10_64bit(shifting_rep, pow10_mag);
+    } else {
+      shifting_rep = multiply_power10_32bit(shifting_rep, pow10_mag);
+    }
+
+    // Final bit shifting: Shift may be large, guard against UB
+    return static_cast<UnsignedRep>(guarded_right_shift(shifting_rep, pow2_mag));
+  };
+
+  // If our total decimal shift is less than the max, we don't need to iterate
+  if (pow10_mag <= Constants::max_digits_shift) { return final_shifts_low10s(); }
+
+  // We want to start by lining up our bits to the top of the shifting range,
+  // except our first operation is a multiply, so not quite that far
+  // We are bit-shifting down, so we need extra bits on the low-side, which this has.
+  // Note that shift_from is +1 due to shift in add_half_if_truncates()
+  static constexpr int shift_up_to        = sizeof(ShiftingRep) * 8 - Constants::max_bits_shift;
+  static constexpr int shift_from         = Constants::num_significand_bits + 1;
+  static constexpr int num_init_bit_shift = shift_up_to - shift_from;
+
+  // Perform initial shift
+  shifting_rep <<= num_init_bit_shift;
+  pow2_mag += num_init_bit_shift;
+
+  // Iterate, multiplying by 10s and shifting down by 2s until we're almost done
+  do {
+    // More decimal places to shift than we have room: Multiply the max number of 10s
+    shifting_rep *= Constants::max_digits_shift_pow;
+    pow10_mag -= Constants::max_digits_shift;
+
+    // If our remaining bit shift is less than the max, we're finished iterating
+    if (pow2_mag <= Constants::max_bits_shift) {
+      // Last bit-shift: Shift all remaining bits, apply the remaining scale, then bail
+      shifting_rep >>= pow2_mag;
+
+      // We need to convert to the output rep for the final scale-factor multiply, because if (e.g.)
+      // float -> dec128 and some large pow10_mag, it might overflow the 64bit shifting rep.
+      // It's not needed for pow10 > 0 because we're dividing by 10s there instead of multiplying.
+      // NOTE: This can overflow! (Both multiply and cast)
+      return multiply_power10<UnsignedRep>(static_cast<UnsignedRep>(shifting_rep), pow10_mag);
+    }
+
+    // More bits to shift than we have room: Shift the max number of 2s
+    shifting_rep >>= Constants::max_bits_shift;
+    pow2_mag -= Constants::max_bits_shift;
+  } while (pow10_mag > Constants::max_digits_shift);
+
+  // Do our final shifts
+  return final_shifts_low10s();
+}
+
+/**
+ * @brief Perform base-2 -> base-10 fixed-point conversion
+ *
+ * @tparam Rep The type of integer we are converting to, to store the decimal value
+ * @tparam FloatingType The type of floating-point object we are converting from
+ * @param base2_value The base-2 fixed-point value we are converting from
+ * @param pow2 The number of powers of 2 to apply to convert from base-2
+ * @param pow10 The number of powers of 10 to apply to reach the desired scale factor
+ * @return Integer representation of the floating-point value, given the desired scale
+ */
+template <typename Rep,
+          typename FloatingType,
+          CUDF_ENABLE_IF(cuda::std::is_floating_point_v<FloatingType>)>
+CUDF_HOST_DEVICE inline cuda::std::make_unsigned_t<Rep> convert_floating_to_integral_shifting(
+  typename floating_converter<FloatingType>::IntegralType base2_value, int pow10, int pow2)
+{
+  // Apply the powers of 2 and 10 to convert to decimal.
+  // The result will be base2_value * (2^pow2) / (10^pow10)
+
+  // Note that while this code is branchy, the decimal scale factor is part of the
+  // column type itself, so every thread will take the same branches on pow10.
+  // Also data within a column tends to be similar, so they will often take the
+  // same branches on pow2 as well.
+
+  // NOTE: some returns here can overflow (e.g. ShiftingRep -> UnsignedRep)
+  using UnsignedRep = cuda::std::make_unsigned_t<Rep>;
+  if (pow10 == 0) {
+    // NOTE: Left Bit-shift can overflow! As can cast! (e.g. double -> decimal32)
+    // Bit shifts may be large, guard against UB
+    if (pow2 >= 0) {
+      return guarded_left_shift(static_cast<UnsignedRep>(base2_value), pow2);
+    } else {
+      return static_cast<UnsignedRep>(guarded_right_shift(base2_value, -pow2));
+    }
+  } else if (pow10 > 0) {
+    if (pow2 <= 0) {
+      // Power-2/10 shifts both downward: order doesn't matter, apply and bail.
+      // Guard against shift being undefined behavior
+      auto const shifted = guarded_right_shift(base2_value, -pow2);
+      return static_cast<UnsignedRep>(divide_power10<decltype(shifted)>(shifted, pow10));
+    }
+    return shift_to_decimal_pospow<Rep, FloatingType>(base2_value, pow2, pow10);
+  } else {  // pow10 < 0
+    if (pow2 >= 0) {
+      // Power-2/10 shifts both upward: order doesn't matter, apply and bail.
+      // NOTE: Either shift, multiply, or cast (e.g. double -> decimal32) can overflow!
+      auto const shifted = guarded_left_shift(static_cast<UnsignedRep>(base2_value), pow2);
+      return multiply_power10<UnsignedRep>(shifted, -pow10);
+    }
+    return shift_to_decimal_negpow<Rep, FloatingType>(base2_value, pow2, pow10);
+  }
+}
+
+/**
+ * @brief Perform floating-point -> integer decimal conversion
+ *
+ * @tparam Rep The type of integer we are converting to, to store the decimal value
+ * @tparam FloatingType The type of floating-point object we are converting from
+ * @param floating The floating point value to convert
+ * @param scale The desired base-10 scale factor: decimal value = returned value * 10^scale
+ * @return Integer representation of the floating-point value, given the desired scale
+ */
+template <typename Rep,
+          typename FloatingType,
+          CUDF_ENABLE_IF(cuda::std::is_floating_point_v<FloatingType>)>
+CUDF_HOST_DEVICE inline Rep convert_floating_to_integral(FloatingType const& floating,
+                                                         scale_type const& scale)
+{
+  // Extract components of the floating point number
+  using converter        = floating_converter<FloatingType>;
+  auto const integer_rep = converter::bit_cast_to_integer(floating);
+  if (converter::is_zero(integer_rep)) { return 0; }
+
+  // Note that the significand here is an unsigned integer with sizeof(FloatingType)
+  auto const is_negative                  = converter::get_is_negative(integer_rep);
+  auto const [significand, floating_pow2] = converter::get_significand_and_pow2(integer_rep);
+
+  // Add half a bit if truncating to yield expected value, see function for discussion.
+  auto const pow10 = static_cast<int>(scale);
+  auto const [base2_value, pow2] =
+    add_half_if_truncates(floating, significand, floating_pow2, pow10);
+
+  // Apply the powers of 2 and 10 to convert to decimal.
+  auto const magnitude =
+    convert_floating_to_integral_shifting<Rep, FloatingType>(base2_value, pow10, pow2);
+
+  // Reapply the sign and return
+  // NOTE: Cast can overflow!
+  auto const signed_magnitude = static_cast<Rep>(magnitude);
+  return is_negative ? -signed_magnitude : signed_magnitude;
+}
+
+/**
+ * @brief Perform base-10 -> base-2 fixed-point conversion for pow10 > 0
+ *
+ * @tparam DecimalRep The decimal integer type we are converting from
+ * @tparam FloatingType The type of floating point object we are converting to
+ * @param decimal_rep The decimal integer to convert
+ * @param pow10 The number of powers of 10 to apply to undo the scale factor
+ * @return A pair of the base-2 value and the remaining powers of 2 to be applied
+ */
+template <typename FloatingType,
+          typename DecimalRep,
+          CUDF_ENABLE_IF(cuda::std::is_floating_point_v<FloatingType>)>
+CUDF_HOST_DEVICE inline auto shift_to_binary_pospow(DecimalRep decimal_rep, int pow10)
+{
+  // This is the reverse of shift_to_decimal_pospow(), see that for more details.
+
+  // ShiftingRep: uint64 for float's, __uint128_t for double's
+  using Constants   = shifting_constants<FloatingType>;
+  using ShiftingRep = typename Constants::ShiftingRep;
+
+  // We want to start by lining up our bits to the top of the shifting range,
+  // except our first operation is a multiply, so not quite that far
+  // We are bit-shifting down, so we need extra bits on the low-side, which this has.
+  static constexpr int shift_up_to = sizeof(ShiftingRep) * 8 - Constants::max_bits_shift;
+  int const shift_from             = count_significant_bits(decimal_rep);
+  int const num_init_bit_shift     = shift_up_to - shift_from;
+  int pow2                         = -num_init_bit_shift;
+
+  // Perform the initial bit shift
+  ShiftingRep shifting_rep;
+  if constexpr (sizeof(ShiftingRep) < sizeof(DecimalRep)) {
+    // Shift within DecimalRep before dropping to the smaller ShiftingRep
+    decimal_rep  = (pow2 >= 0) ? (decimal_rep >> pow2) : (decimal_rep << -pow2);
+    shifting_rep = static_cast<ShiftingRep>(decimal_rep);
+  } else {
+    // Scale up to ShiftingRep before shifting
+    shifting_rep = static_cast<ShiftingRep>(decimal_rep);
+    shifting_rep = (pow2 >= 0) ? (shifting_rep >> pow2) : (shifting_rep << -pow2);
+  }
+
+  // Iterate, multiplying by 10s and shifting down by 2s until we're almost done
+  while (pow10 > Constants::max_digits_shift) {
+    // More decimal places to shift than we have room: Multiply the max number of 10s
+    shifting_rep *= Constants::max_digits_shift_pow;
+    pow10 -= Constants::max_digits_shift;
+
+    // Then make more room by bit shifting down by the max # of 2s
+    shifting_rep >>= Constants::max_bits_shift;
+    pow2 += Constants::max_bits_shift;
+  }
+
+  // Last 10s-shift: multiply all remaining decimal places
+  // The multiplier is less than the max-shift, and thus fits within 64 / 32 bits
+  if constexpr (Constants::is_double) {
+    shifting_rep = multiply_power10_64bit(shifting_rep, pow10);
+  } else {
+    shifting_rep = multiply_power10_32bit(shifting_rep, pow10);
+  }
+
+  // Our shifting_rep is now the integer mantissa, return it and the powers of 2
+  return std::pair{shifting_rep, pow2};
+}
+
+/**
+ * @brief Perform base-10 -> base-2 fixed-point conversion for pow10 < 0
+ *
+ * @tparam DecimalRep The decimal integer type we are converting from
+ * @tparam FloatingType The type of floating point object we are converting to
+ * @param decimal_rep The decimal integer to convert
+ * @param pow10 The number of powers of 10 to apply to undo the scale factor
+ * @return A pair of the base-2 value and the remaining powers of 2 to be applied
+ */
+template <typename FloatingType,
+          typename DecimalRep,
+          CUDF_ENABLE_IF(cuda::std::is_floating_point_v<FloatingType>)>
+CUDF_HOST_DEVICE inline auto shift_to_binary_negpow(DecimalRep decimal_rep, int const pow10)
+{
+  // This is the reverse of shift_to_decimal_negpow(), see that for more details.
+
+  // ShiftingRep: uint64 for float's, __uint128_t for double's
+  using Constants   = shifting_constants<FloatingType>;
+  using ShiftingRep = typename Constants::ShiftingRep;
+
+  // We want to start with our significand bits at the top of the shifting range,
+  // so that we lose minimal information we need on intermediary right-shifts.
+  // Note that since we're shifting 2s up, we need num_2s_shift_buffer_bits space on the high side
+  static constexpr int shift_up_to = sizeof(ShiftingRep) * 8 - Constants::num_2s_shift_buffer_bits;
+  int const shift_from             = count_significant_bits(decimal_rep);
+  int const num_init_bit_shift     = shift_up_to - shift_from;
+  int pow2                         = -num_init_bit_shift;
+
+  // Perform the initial bit shift
+  ShiftingRep shifting_rep;
+  if constexpr (sizeof(ShiftingRep) < sizeof(DecimalRep)) {
+    // Shift within DecimalRep before dropping to the smaller ShiftingRep
+    decimal_rep  = (pow2 >= 0) ? (decimal_rep >> pow2) : (decimal_rep << -pow2);
+    shifting_rep = static_cast<ShiftingRep>(decimal_rep);
+  } else {
+    // Scale up to ShiftingRep before shifting
+    shifting_rep = static_cast<ShiftingRep>(decimal_rep);
+    shifting_rep = (pow2 >= 0) ? (shifting_rep >> pow2) : (shifting_rep << -pow2);
+  }
+
+  // Convert to using positive values upfront, simpler than doing later.
+  int pow10_mag = -pow10;
+
+  // Iterate, dividing by 10s and shifting up by 2s until we're almost done
+  while (pow10_mag > Constants::max_digits_shift) {
+    // More decimal places to shift than we have room: Divide the max number of 10s
+    shifting_rep /= Constants::max_digits_shift_pow;
+    pow10_mag -= Constants::max_digits_shift;
+
+    // Then make more room by bit shifting up by the max # of 2s
+    shifting_rep <<= Constants::max_bits_shift;
+    pow2 -= Constants::max_bits_shift;
+  }
+
+  // Last 10s-shift: Divdie all remaining decimal places.
+  // This divide result may not fit in the low half of the bit range
+  // But the divisor is less than the max-shift, and thus fits within 64 / 32 bits
+  if constexpr (Constants::is_double) {
+    shifting_rep = divide_power10_64bit(shifting_rep, pow10_mag);
+  } else {
+    shifting_rep = divide_power10_32bit(shifting_rep, pow10_mag);
+  }
+
+  // Our shifting_rep is now the integer mantissa, return it and the powers of 2
+  return std::pair{shifting_rep, pow2};
+}
+
+/**
+ * @brief Perform integer decimal -> floating-point conversion
+ *
+ * @tparam FloatingType The type of floating-point object we are converting to
+ * @tparam Rep The decimal integer type we are converting from
+ * @param value The decimal integer to convert
+ * @param scale The base-10 scale factor for the input integer
+ * @return Floating-point representation of the scaled integral value
+ */
+template <typename FloatingType,
+          typename Rep,
+          CUDF_ENABLE_IF(cuda::std::is_floating_point_v<FloatingType>)>
+CUDF_HOST_DEVICE inline FloatingType convert_integral_to_floating(Rep const& value,
+                                                                  scale_type const& scale)
+{
+  // Check the sign of the input
+  bool const is_negative = (value < 0);
+
+  // Convert to unsigned for bit counting/shifting
+  using UnsignedType        = cuda::std::make_unsigned_t<Rep>;
+  auto const unsigned_value = [&]() -> UnsignedType {
+    // Must guard against minimum value, as we can't just negate it: not representable.
+    if (value == cuda::std::numeric_limits<Rep>::min()) { return static_cast<UnsignedType>(value); }
+
+    // No abs function for 128bit types, so have to do it manually.
+    if constexpr (cuda::std::is_same_v<Rep, __int128_t>) {
+      return static_cast<UnsignedType>(is_negative ? -value : value);
+    } else {
+      return cuda::std::abs(value);
+    }
+  }();
+
+  // Shift by powers of 2 and 10 to get our integer mantissa
+  auto const [mantissa, pow2] = [&]() {
+    auto const pow10 = static_cast<int32_t>(scale);
+    if (pow10 >= 0) {
+      return shift_to_binary_pospow<FloatingType>(unsigned_value, pow10);
+    } else {  // pow10 < 0
+      return shift_to_binary_negpow<FloatingType>(unsigned_value, pow10);
+    }
+  }();
+
+  // Zero has special exponent bits, just handle it here
+  if (mantissa == 0) { return FloatingType(0.0f); }
+
+  // Cast our integer mantissa to floating point
+  auto const floating = static_cast<FloatingType>(mantissa);  // IEEE-754 rounds to even
+
+  // Apply the sign and the remaining powers of 2
+  using converter      = floating_converter<FloatingType>;
+  auto const magnitude = converter::add_pow2(floating, pow2);
+  return converter::set_is_negative(magnitude, is_negative);
+}
+
 }  // namespace detail
 
 /** @} */  // end of group
diff --git a/cpp/include/cudf/unary.hpp b/cpp/include/cudf/unary.hpp
index 74c8bc67d3a..8a515335351 100644
--- a/cpp/include/cudf/unary.hpp
+++ b/cpp/include/cudf/unary.hpp
@@ -17,6 +17,7 @@
 #pragma once
 
 #include <cudf/fixed_point/fixed_point.hpp>
+#include <cudf/fixed_point/floating_conversion.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/traits.hpp>
@@ -50,14 +51,19 @@ namespace cudf {
  */
 template <typename Fixed,
           typename Floating,
-          typename cuda::std::enable_if_t<is_fixed_point<Fixed>() &&
-                                          cuda::std::is_floating_point_v<Floating>>* = nullptr>
+          CUDF_ENABLE_IF(cuda::std::is_floating_point_v<Floating>&& is_fixed_point<Fixed>())>
 CUDF_HOST_DEVICE Fixed convert_floating_to_fixed(Floating floating, numeric::scale_type scale)
 {
-  using Rep          = typename Fixed::rep;
-  auto const shifted = numeric::detail::shift<Rep, Fixed::rad>(floating, scale);
-  numeric::scaled_integer<Rep> scaled{static_cast<Rep>(shifted), scale};
-  return Fixed(scaled);
+  using Rep        = typename Fixed::rep;
+  auto const value = [&]() {
+    if constexpr (Fixed::rad == numeric::Radix::BASE_10) {
+      return numeric::detail::convert_floating_to_integral<Rep>(floating, scale);
+    } else {
+      return static_cast<Rep>(numeric::detail::shift<Rep, Fixed::rad>(floating, scale));
+    }
+  }();
+
+  return Fixed(numeric::scaled_integer<Rep>{value, scale});
 }
 
 /**
@@ -75,14 +81,17 @@ CUDF_HOST_DEVICE Fixed convert_floating_to_fixed(Floating floating, numeric::sca
  */
 template <typename Floating,
           typename Fixed,
-          typename cuda::std::enable_if_t<cuda::std::is_floating_point_v<Floating> &&
-                                          is_fixed_point<Fixed>()>* = nullptr>
+          CUDF_ENABLE_IF(cuda::std::is_floating_point_v<Floating>&& is_fixed_point<Fixed>())>
 CUDF_HOST_DEVICE Floating convert_fixed_to_floating(Fixed fixed)
 {
-  using Rep         = typename Fixed::rep;
-  auto const casted = static_cast<Floating>(fixed.value());
-  auto const scale  = numeric::scale_type{-fixed.scale()};
-  return numeric::detail::shift<Rep, Fixed::rad>(casted, scale);
+  using Rep = typename Fixed::rep;
+  if constexpr (Fixed::rad == numeric::Radix::BASE_10) {
+    return numeric::detail::convert_integral_to_floating<Floating>(fixed.value(), fixed.scale());
+  } else {
+    auto const casted = static_cast<Floating>(fixed.value());
+    auto const scale  = numeric::scale_type{-fixed.scale()};
+    return numeric::detail::shift<Rep, Fixed::rad>(casted, scale);
+  }
 }
 
 /**
@@ -95,7 +104,7 @@ CUDF_HOST_DEVICE Floating convert_fixed_to_floating(Fixed fixed)
  */
 template <typename Floating,
           typename Input,
-          typename cuda::std::enable_if_t<cuda::std::is_floating_point_v<Floating>>* = nullptr>
+          CUDF_ENABLE_IF(cuda::std::is_floating_point_v<Floating>)>
 CUDF_HOST_DEVICE Floating convert_to_floating(Input input)
 {
   if constexpr (is_fixed_point<Input>()) {
diff --git a/cpp/tests/fixed_point/fixed_point_tests.cpp b/cpp/tests/fixed_point/fixed_point_tests.cpp
index ab7984d4b03..a222289216d 100644
--- a/cpp/tests/fixed_point/fixed_point_tests.cpp
+++ b/cpp/tests/fixed_point/fixed_point_tests.cpp
@@ -38,7 +38,7 @@ struct FixedPointTest : public cudf::test::BaseFixture {};
 template <typename T>
 struct FixedPointTestAllReps : public cudf::test::BaseFixture {};
 
-using RepresentationTypes = ::testing::Types<int32_t, int64_t>;
+using RepresentationTypes = ::testing::Types<int32_t, int64_t, __int128_t>;
 
 TYPED_TEST_SUITE(FixedPointTestAllReps, RepresentationTypes);
 
@@ -53,6 +53,7 @@ TYPED_TEST(FixedPointTestAllReps, SimpleDecimalXXConstruction)
   auto num4 = cudf::convert_floating_to_fixed<decimalXX>(1.234567, scale_type(-4));
   auto num5 = cudf::convert_floating_to_fixed<decimalXX>(1.234567, scale_type(-5));
   auto num6 = cudf::convert_floating_to_fixed<decimalXX>(1.234567, scale_type(-6));
+  auto num7 = cudf::convert_floating_to_fixed<decimalXX>(0.0, scale_type(-4));
 
   EXPECT_EQ(1, cudf::convert_fixed_to_floating<double>(num0));
   EXPECT_EQ(1.2, cudf::convert_fixed_to_floating<double>(num1));
@@ -61,6 +62,7 @@ TYPED_TEST(FixedPointTestAllReps, SimpleDecimalXXConstruction)
   EXPECT_EQ(1.2345, cudf::convert_fixed_to_floating<double>(num4));
   EXPECT_EQ(1.23456, cudf::convert_fixed_to_floating<double>(num5));
   EXPECT_EQ(1.234567, cudf::convert_fixed_to_floating<double>(num6));
+  EXPECT_EQ(0.0, cudf::convert_fixed_to_floating<double>(num7));
 }
 
 TYPED_TEST(FixedPointTestAllReps, SimpleNegativeDecimalXXConstruction)
@@ -74,6 +76,7 @@ TYPED_TEST(FixedPointTestAllReps, SimpleNegativeDecimalXXConstruction)
   auto num4 = cudf::convert_floating_to_fixed<decimalXX>(-1.234567, scale_type(-4));
   auto num5 = cudf::convert_floating_to_fixed<decimalXX>(-1.234567, scale_type(-5));
   auto num6 = cudf::convert_floating_to_fixed<decimalXX>(-1.234567, scale_type(-6));
+  auto num7 = cudf::convert_floating_to_fixed<decimalXX>(-0.0, scale_type(-4));
 
   EXPECT_EQ(-1, cudf::convert_fixed_to_floating<double>(num0));
   EXPECT_EQ(-1.2, cudf::convert_fixed_to_floating<double>(num1));
@@ -82,6 +85,7 @@ TYPED_TEST(FixedPointTestAllReps, SimpleNegativeDecimalXXConstruction)
   EXPECT_EQ(-1.2345, cudf::convert_fixed_to_floating<double>(num4));
   EXPECT_EQ(-1.23456, cudf::convert_fixed_to_floating<double>(num5));
   EXPECT_EQ(-1.234567, cudf::convert_fixed_to_floating<double>(num6));
+  EXPECT_EQ(-0.0, cudf::convert_fixed_to_floating<double>(num7));
 }
 
 TYPED_TEST(FixedPointTestAllReps, PaddedDecimalXXConstruction)
@@ -99,14 +103,10 @@ TYPED_TEST(FixedPointTestAllReps, PaddedDecimalXXConstruction)
 
   EXPECT_EQ(1.1, cudf::convert_fixed_to_floating<double>(a));
   EXPECT_EQ(1.01, cudf::convert_fixed_to_floating<double>(b));
-  EXPECT_EQ(1,
-            cudf::convert_fixed_to_floating<double>(
-              c));  // intentional (inherited problem from floating point)
+  EXPECT_EQ(1.001, cudf::convert_fixed_to_floating<double>(c));
   EXPECT_EQ(1.0001, cudf::convert_fixed_to_floating<double>(d));
   EXPECT_EQ(1.00001, cudf::convert_fixed_to_floating<double>(e));
-  EXPECT_EQ(1,
-            cudf::convert_fixed_to_floating<double>(
-              f));  // intentional (inherited problem from floating point)
+  EXPECT_EQ(1.000001, cudf::convert_fixed_to_floating<double>(f));
 
   EXPECT_TRUE(1.000123 - cudf::convert_fixed_to_floating<double>(x) <
               std::numeric_limits<double>::epsilon());
@@ -153,6 +153,119 @@ TYPED_TEST(FixedPointTestAllReps, MoreSimpleBinaryFPConstruction)
   EXPECT_EQ(2.0625, cudf::convert_fixed_to_floating<double>(num1));
 }
 
+TEST_F(FixedPointTest, PreciseFloatDecimal64Construction)
+{
+  // Need 9 decimal digits to uniquely represent all floats (numeric_limits::max_digits10()).
+  // Precise conversion: set the scale factor to 9 less than the order-of-magnitude.
+  // But with -9 scale factor decimal32 can overflow: use decimal64 instead.
+
+  // Positive Exponent
+  {
+    auto num0 = cudf::convert_floating_to_fixed<decimal64>(3.141593E7f, scale_type(-2));
+    auto num1 = cudf::convert_floating_to_fixed<decimal64>(3.141593E12f, scale_type(3));
+    auto num2 = cudf::convert_floating_to_fixed<decimal64>(3.141593E17f, scale_type(8));
+    auto num3 = cudf::convert_floating_to_fixed<decimal64>(3.141593E22f, scale_type(13));
+    auto num4 = cudf::convert_floating_to_fixed<decimal64>(3.141593E27f, scale_type(18));
+    auto num5 = cudf::convert_floating_to_fixed<decimal64>(3.141593E32f, scale_type(23));
+    auto num6 = cudf::convert_floating_to_fixed<decimal64>(3.141593E37f, scale_type(28));
+
+    EXPECT_EQ(3.141593E7f, cudf::convert_fixed_to_floating<float>(num0));
+    EXPECT_EQ(3.141593E12f, cudf::convert_fixed_to_floating<float>(num1));
+    EXPECT_EQ(3.141593E17f, cudf::convert_fixed_to_floating<float>(num2));
+    EXPECT_EQ(3.141593E22f, cudf::convert_fixed_to_floating<float>(num3));
+    EXPECT_EQ(3.141593E27f, cudf::convert_fixed_to_floating<float>(num4));
+    EXPECT_EQ(3.141593E32f, cudf::convert_fixed_to_floating<float>(num5));
+    EXPECT_EQ(3.141593E37f, cudf::convert_fixed_to_floating<float>(num6));
+  }
+
+  // Negative Exponent
+  {
+    auto num0 = cudf::convert_floating_to_fixed<decimal64>(3.141593E-7f, scale_type(-16));
+    auto num1 = cudf::convert_floating_to_fixed<decimal64>(3.141593E-12f, scale_type(-21));
+    auto num2 = cudf::convert_floating_to_fixed<decimal64>(3.141593E-17f, scale_type(-26));
+    auto num3 = cudf::convert_floating_to_fixed<decimal64>(3.141593E-22f, scale_type(-31));
+    auto num4 = cudf::convert_floating_to_fixed<decimal64>(3.141593E-27f, scale_type(-36));
+    auto num5 = cudf::convert_floating_to_fixed<decimal64>(3.141593E-32f, scale_type(-41));
+    auto num6 = cudf::convert_floating_to_fixed<decimal64>(3.141593E-37f, scale_type(-47));
+
+    EXPECT_EQ(3.141593E-7f, cudf::convert_fixed_to_floating<float>(num0));
+    EXPECT_EQ(3.141593E-12f, cudf::convert_fixed_to_floating<float>(num1));
+    EXPECT_EQ(3.141593E-17f, cudf::convert_fixed_to_floating<float>(num2));
+    EXPECT_EQ(3.141593E-22f, cudf::convert_fixed_to_floating<float>(num3));
+    EXPECT_EQ(3.141593E-27f, cudf::convert_fixed_to_floating<float>(num4));
+    EXPECT_EQ(3.141593E-32f, cudf::convert_fixed_to_floating<float>(num5));
+    EXPECT_EQ(3.141593E-37f, cudf::convert_fixed_to_floating<float>(num6));
+
+    // Denormals
+    auto num7  = cudf::convert_floating_to_fixed<decimal64>(3.141593E-39f, scale_type(-48));
+    auto num8  = cudf::convert_floating_to_fixed<decimal64>(3.141593E-41f, scale_type(-50));
+    auto num9  = cudf::convert_floating_to_fixed<decimal64>(3.141593E-43f, scale_type(-52));
+    auto num10 = cudf::convert_floating_to_fixed<decimal64>(FLT_TRUE_MIN, scale_type(-54));
+
+    EXPECT_EQ(3.141593E-39f, cudf::convert_fixed_to_floating<float>(num7));
+    EXPECT_EQ(3.141593E-41f, cudf::convert_fixed_to_floating<float>(num8));
+    EXPECT_EQ(3.141593E-43f, cudf::convert_fixed_to_floating<float>(num9));
+    EXPECT_EQ(FLT_TRUE_MIN, cudf::convert_fixed_to_floating<float>(num10));
+  }
+}
+
+TEST_F(FixedPointTest, PreciseDoubleDecimal64Construction)
+{
+  // Need 17 decimal digits to uniquely represent all doubles (numeric_limits::max_digits10()).
+  // Precise conversion: set the scale factor to 17 less than the order-of-magnitude.
+
+  using decimal64 = fixed_point<int64_t, Radix::BASE_10>;
+
+  // Positive Exponent
+  {
+    auto num0 = cudf::convert_floating_to_fixed<decimal64>(3.141593E8, scale_type(-9));
+    auto num1 = cudf::convert_floating_to_fixed<decimal64>(3.141593E58, scale_type(41));
+    auto num2 = cudf::convert_floating_to_fixed<decimal64>(3.141593E108, scale_type(91));
+    auto num3 = cudf::convert_floating_to_fixed<decimal64>(3.141593E158, scale_type(141));
+    auto num4 = cudf::convert_floating_to_fixed<decimal64>(3.141593E208, scale_type(191));
+    auto num5 = cudf::convert_floating_to_fixed<decimal64>(3.141593E258, scale_type(241));
+    auto num6 = cudf::convert_floating_to_fixed<decimal64>(3.141593E307, scale_type(290));
+
+    EXPECT_EQ(3.141593E8, cudf::convert_fixed_to_floating<double>(num0));
+    EXPECT_EQ(3.141593E58, cudf::convert_fixed_to_floating<double>(num1));
+    EXPECT_EQ(3.141593E108, cudf::convert_fixed_to_floating<double>(num2));
+    EXPECT_EQ(3.141593E158, cudf::convert_fixed_to_floating<double>(num3));
+    EXPECT_EQ(3.141593E208, cudf::convert_fixed_to_floating<double>(num4));
+    EXPECT_EQ(3.141593E258, cudf::convert_fixed_to_floating<double>(num5));
+    EXPECT_EQ(3.141593E307, cudf::convert_fixed_to_floating<double>(num6));
+  }
+
+  // Negative Exponent
+  {
+    auto num0 = cudf::convert_floating_to_fixed<decimal64>(3.141593E-8, scale_type(-25));
+    auto num1 = cudf::convert_floating_to_fixed<decimal64>(3.141593E-58, scale_type(-75));
+    auto num2 = cudf::convert_floating_to_fixed<decimal64>(3.141593E-108, scale_type(-125));
+    auto num3 = cudf::convert_floating_to_fixed<decimal64>(3.141593E-158, scale_type(-175));
+    auto num4 = cudf::convert_floating_to_fixed<decimal64>(3.141593E-208, scale_type(-225));
+    auto num5 = cudf::convert_floating_to_fixed<decimal64>(3.141593E-258, scale_type(-275));
+    auto num6 = cudf::convert_floating_to_fixed<decimal64>(3.141593E-308, scale_type(-325));
+
+    EXPECT_EQ(3.141593E-8, cudf::convert_fixed_to_floating<double>(num0));
+    EXPECT_EQ(3.141593E-58, cudf::convert_fixed_to_floating<double>(num1));
+    EXPECT_EQ(3.141593E-108, cudf::convert_fixed_to_floating<double>(num2));
+    EXPECT_EQ(3.141593E-158, cudf::convert_fixed_to_floating<double>(num3));
+    EXPECT_EQ(3.141593E-208, cudf::convert_fixed_to_floating<double>(num4));
+    EXPECT_EQ(3.141593E-258, cudf::convert_fixed_to_floating<double>(num5));
+    EXPECT_EQ(3.141593E-308, cudf::convert_fixed_to_floating<double>(num6));
+
+    // Denormals
+    auto num7  = cudf::convert_floating_to_fixed<decimal64>(3.141593E-309, scale_type(-326));
+    auto num8  = cudf::convert_floating_to_fixed<decimal64>(3.141593E-314, scale_type(-331));
+    auto num9  = cudf::convert_floating_to_fixed<decimal64>(3.141593E-319, scale_type(-336));
+    auto num10 = cudf::convert_floating_to_fixed<decimal64>(DBL_TRUE_MIN, scale_type(-341));
+
+    EXPECT_EQ(3.141593E-309, cudf::convert_fixed_to_floating<double>(num7));
+    EXPECT_EQ(3.141593E-314, cudf::convert_fixed_to_floating<double>(num8));
+    EXPECT_EQ(3.141593E-319, cudf::convert_fixed_to_floating<double>(num9));
+    EXPECT_EQ(DBL_TRUE_MIN, cudf::convert_fixed_to_floating<double>(num10));
+  }
+}
+
 TYPED_TEST(FixedPointTestAllReps, SimpleDecimalXXMath)
 {
   using decimalXX = fixed_point<TypeParam, Radix::BASE_10>;
@@ -442,8 +555,6 @@ void float_vector_test(ValueType const initial_value,
                        int32_t const scale,
                        Binop binop)
 {
-  using decimal32 = fixed_point<int32_t, Radix::BASE_10>;
-
   std::vector<decimal32> vec1(size);
   std::vector<ValueType> vec2(size);
 
diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
index 1d6a3b3304a..7136b162c13 100644
--- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
+++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
@@ -3509,9 +3509,9 @@ void testCastFloatToDecimal() {
   @Test
   void testCastDoubleToDecimal() {
     testCastNumericToDecimalsAndBack(DType.FLOAT64, false, 0,
-        () -> ColumnVector.fromBoxedDoubles(1.0, 2.1, -3.23, null, 2.41281, (double) Long.MAX_VALUE),
-        () -> ColumnVector.fromBoxedDoubles(1.0, 2.0, -3.0, null, 2.0, (double) Long.MAX_VALUE),
-        new Long[]{1L, 2L, -3L, null, 2L, Long.MAX_VALUE}
+        () -> ColumnVector.fromBoxedDoubles(1.0, 2.1, -3.23, null, 2.41281, (double) Integer.MAX_VALUE),
+        () -> ColumnVector.fromBoxedDoubles(1.0, 2.0, -3.0, null, 2.0, (double) Integer.MAX_VALUE),
+        new Long[]{1L, 2L, -3L, null, 2L, (long) Integer.MAX_VALUE}
     );
     testCastNumericToDecimalsAndBack(DType.FLOAT64, false, -2,
         () -> ColumnVector.fromBoxedDoubles(1.0, 2.1, -3.23, null, 2.41281, -55.01999),
diff --git a/python/cudf/cudf/tests/test_decimal.py b/python/cudf/cudf/tests/test_decimal.py
index c41a938f6ea..65f739bc74a 100644
--- a/python/cudf/cudf/tests/test_decimal.py
+++ b/python/cudf/cudf/tests/test_decimal.py
@@ -97,7 +97,7 @@ def test_typecast_from_float_to_decimal(request, data, from_dtype, to_dtype):
         pytest.mark.xfail(
             condition=version.parse(pa.__version__) >= version.parse("13.0.0")
             and from_dtype == np.dtype("float32")
-            and to_dtype.precision > 7,
+            and to_dtype.precision > 12,
             reason="https://github.com/rapidsai/cudf/issues/14169",
         )
     )

From 2b2058de941289ca343cb1d3a3eb143a84998dfd Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Thu, 11 Jul 2024 07:19:12 -0400
Subject: [PATCH 493/842] Add custom name setter and getter for proxy objects
 in `cudf.pandas` (#16234)

Closes #14524

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16234
---
 python/cudf/cudf/pandas/_wrappers/pandas.py   | 50 +++++++++++++++++--
 .../cudf_pandas_tests/test_cudf_pandas.py     | 40 +++++++++++++++
 2 files changed, 87 insertions(+), 3 deletions(-)

diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py
index 3f94fc18980..d3a3488081a 100644
--- a/python/cudf/cudf/pandas/_wrappers/pandas.py
+++ b/python/cudf/cudf/pandas/_wrappers/pandas.py
@@ -260,6 +260,23 @@ def Index__new__(cls, *args, **kwargs):
     return self
 
 
+def name(self):
+    return self._fsproxy_wrapped._name
+
+
+def Index__setattr__(self, name, value):
+    if name.startswith("_"):
+        object.__setattr__(self, name, value)
+        return
+    if name == "name":
+        setattr(self._fsproxy_wrapped, "_name", value)
+    if name == "names":
+        setattr(self._fsproxy_wrapped, "_names", value)
+    return _FastSlowAttribute("__setattr__").__get__(self, type(self))(
+        name, value
+    )
+
+
 Index = make_final_proxy_type(
     "Index",
     cudf.Index,
@@ -277,11 +294,13 @@ def Index__new__(cls, *args, **kwargs):
         "__iter__": custom_iter,
         "__init__": _DELETE,
         "__new__": Index__new__,
+        "__setattr__": Index__setattr__,
         "_constructor": _FastSlowAttribute("_constructor"),
         "__array_ufunc__": _FastSlowAttribute("__array_ufunc__"),
         "_accessors": set(),
         "_data": _FastSlowAttribute("_data", private=True),
         "_mask": _FastSlowAttribute("_mask", private=True),
+        "name": property(name),
     },
 )
 
@@ -292,7 +311,11 @@ def Index__new__(cls, *args, **kwargs):
     fast_to_slow=lambda fast: fast.to_pandas(),
     slow_to_fast=cudf.from_pandas,
     bases=(Index,),
-    additional_attributes={"__init__": _DELETE},
+    additional_attributes={
+        "__init__": _DELETE,
+        "__setattr__": Index__setattr__,
+        "name": property(name),
+    },
 )
 
 SparseDtype = make_final_proxy_type(
@@ -319,7 +342,11 @@ def Index__new__(cls, *args, **kwargs):
     fast_to_slow=lambda fast: fast.to_pandas(),
     slow_to_fast=cudf.from_pandas,
     bases=(Index,),
-    additional_attributes={"__init__": _DELETE},
+    additional_attributes={
+        "__init__": _DELETE,
+        "__setattr__": Index__setattr__,
+        "name": property(name),
+    },
 )
 
 Categorical = make_final_proxy_type(
@@ -350,6 +377,8 @@ def Index__new__(cls, *args, **kwargs):
         "__init__": _DELETE,
         "_data": _FastSlowAttribute("_data", private=True),
         "_mask": _FastSlowAttribute("_mask", private=True),
+        "__setattr__": Index__setattr__,
+        "name": property(name),
     },
 )
 
@@ -385,6 +414,8 @@ def Index__new__(cls, *args, **kwargs):
         "__init__": _DELETE,
         "_data": _FastSlowAttribute("_data", private=True),
         "_mask": _FastSlowAttribute("_mask", private=True),
+        "__setattr__": Index__setattr__,
+        "name": property(name),
     },
 )
 
@@ -441,6 +472,8 @@ def Index__new__(cls, *args, **kwargs):
         "__init__": _DELETE,
         "_data": _FastSlowAttribute("_data", private=True),
         "_mask": _FastSlowAttribute("_mask", private=True),
+        "__setattr__": Index__setattr__,
+        "name": property(name),
     },
 )
 
@@ -474,6 +507,11 @@ def Index__new__(cls, *args, **kwargs):
     additional_attributes={"__hash__": _FastSlowAttribute("__hash__")},
 )
 
+
+def names(self):
+    return self._fsproxy_wrapped._names
+
+
 MultiIndex = make_final_proxy_type(
     "MultiIndex",
     cudf.MultiIndex,
@@ -481,7 +519,11 @@ def Index__new__(cls, *args, **kwargs):
     fast_to_slow=lambda fast: fast.to_pandas(),
     slow_to_fast=cudf.from_pandas,
     bases=(Index,),
-    additional_attributes={"__init__": _DELETE},
+    additional_attributes={
+        "__init__": _DELETE,
+        "__setattr__": Index__setattr__,
+        "name": property(names),
+    },
 )
 
 TimeGrouper = make_intermediate_proxy_type(
@@ -669,6 +711,8 @@ def Index__new__(cls, *args, **kwargs):
         "__init__": _DELETE,
         "_data": _FastSlowAttribute("_data", private=True),
         "_mask": _FastSlowAttribute("_mask", private=True),
+        "__setattr__": Index__setattr__,
+        "name": property(name),
     },
 )
 
diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
index bc864a48e9d..6292022d8e4 100644
--- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
+++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
@@ -1592,3 +1592,43 @@ def test_at_setitem_empty():
     df.at[0, "new"] = 2.0
     expected = pd.DataFrame({"name": [1.0], "new": [2.0]})
     tm.assert_frame_equal(df, expected)
+
+
+@pytest.mark.parametrize(
+    "index",
+    [
+        xpd.Index([1, 2, 3], name="foo"),
+        xpd.Index(["a", "b", "c"], name="foo"),
+        xpd.RangeIndex(start=0, stop=3, step=1, name="foo"),
+        xpd.CategoricalIndex(["a", "b", "a"], name="foo"),
+        xpd.DatetimeIndex(
+            ["2024-04-24", "2025-04-24", "2026-04-24"], name="foo"
+        ),
+        xpd.TimedeltaIndex(["1 days", "2 days", "3 days"], name="foo"),
+        xpd.PeriodIndex(
+            ["2024-06", "2023-06", "2022-06"], freq="M", name="foo"
+        ),
+        xpd.IntervalIndex.from_breaks([0, 1, 2, 3], name="foo"),
+        xpd.MultiIndex.from_tuples(
+            [(1, "a"), (2, "b"), (3, "c")], names=["foo1", "bar1"]
+        ),
+    ],
+)
+def test_change_index_name(index):
+    s = xpd.Series([1, 2, object()], index=index)
+    df = xpd.DataFrame({"values": [1, 2, object()]}, index=index)
+
+    if isinstance(index, xpd.MultiIndex):
+        names = ["foo2", "bar2"]
+        s.index.names = names
+        df.index.names = names
+
+        assert s.index.names == names
+        assert df.index.names == names
+    else:
+        name = "bar"
+        s.index.name = name
+        df.index.name = name
+
+        assert s.index.name == name
+        assert df.index.name == name

From b06d883486e8e1e1afeb9406eebb2d2429de96a1 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 11 Jul 2024 10:41:07 -0400
Subject: [PATCH 494/842] Allow only scale=0 fixed-point values in
 fixed_width_column_wrapper (#16120)

The `cudf::test::fixed_width_column_wrapper` supports all fixed-width type including fixed-point types. However, there is no mechanism to specify the fixed-point scale value which is common for the entire column and stored in the column's type.
This fixes the case by throwing an error if a non-zero scale is specified for the input values in a fixed-point `fixed_width_column_wrapper` instance.

Also fixed several tests that incorrectly specified a non-zero scale.

Closes #16092

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Paul Mattione (https://github.com/pmattione-nvidia)

URL: https://github.com/rapidsai/cudf/pull/16120
---
 cpp/include/cudf_test/column_wrapper.hpp      |  3 +
 cpp/tests/io/orc_test.cpp                     | 55 +++++--------------
 cpp/tests/io/parquet_v2_test.cpp              | 34 ++++--------
 .../reshape/interleave_columns_tests.cpp      | 17 +++---
 cpp/tests/streams/io/csv_test.cpp             | 24 +++-----
 cpp/tests/streams/io/orc_test.cpp             | 20 ++-----
 cpp/tests/streams/io/parquet_test.cpp         | 18 ++----
 7 files changed, 52 insertions(+), 119 deletions(-)

diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp
index 7363f965af8..2abd6f0abac 100644
--- a/cpp/include/cudf_test/column_wrapper.hpp
+++ b/cpp/include/cudf_test/column_wrapper.hpp
@@ -226,6 +226,9 @@ rmm::device_buffer make_elements(InputIterator begin, InputIterator end)
   using namespace numeric;
   using RepType = typename ElementTo::rep;
 
+  CUDF_EXPECTS(std::all_of(begin, end, [](ElementFrom v) { return v.scale() == 0; }),
+               "Only zero-scale fixed-point values are supported");
+
   auto to_rep            = [](ElementTo fp) { return fp.value(); };
   auto transformer_begin = thrust::make_transform_iterator(begin, to_rep);
   auto const size        = cudf::distance(begin, end);
diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp
index b5e080f3cc5..39ba62952b4 100644
--- a/cpp/tests/io/orc_test.cpp
+++ b/cpp/tests/io/orc_test.cpp
@@ -54,9 +54,9 @@ using int32_col   = column_wrapper<int32_t>;
 using int64_col   = column_wrapper<int64_t>;
 using float32_col = column_wrapper<float>;
 using float64_col = column_wrapper<double>;
-using dec32_col   = column_wrapper<numeric::decimal32>;
-using dec64_col   = column_wrapper<numeric::decimal64>;
-using dec128_col  = column_wrapper<numeric::decimal128>;
+using dec32_col   = cudf::test::fixed_point_column_wrapper<numeric::decimal32::rep>;
+using dec64_col   = cudf::test::fixed_point_column_wrapper<numeric::decimal64::rep>;
+using dec128_col  = cudf::test::fixed_point_column_wrapper<numeric::decimal128::rep>;
 using struct_col  = cudf::test::structs_column_wrapper;
 template <typename T>
 using list_col = cudf::test::lists_column_wrapper<T>;
@@ -355,12 +355,6 @@ TEST_F(OrcWriterTest, MultiColumn)
   auto col4_data = random_values<float>(num_rows);
   auto col5_data = random_values<double>(num_rows);
   auto col6_vals = random_values<int64_t>(num_rows);
-  auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
-    return numeric::decimal128{col6_vals[i], numeric::scale_type{12}};
-  });
-  auto col7_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
-    return numeric::decimal128{col6_vals[i], numeric::scale_type{-12}};
-  });
 
   bool_col col0(col0_data.begin(), col0_data.end());
   int8_col col1(col1_data.begin(), col1_data.end());
@@ -368,8 +362,8 @@ TEST_F(OrcWriterTest, MultiColumn)
   int32_col col3(col3_data.begin(), col3_data.end());
   float32_col col4(col4_data.begin(), col4_data.end());
   float64_col col5(col5_data.begin(), col5_data.end());
-  dec128_col col6(col6_data, col6_data + num_rows);
-  dec128_col col7(col7_data, col7_data + num_rows);
+  dec128_col col6{col6_vals.begin(), col6_vals.end(), numeric::scale_type{12}};
+  dec128_col col7{col6_vals.begin(), col6_vals.end(), numeric::scale_type{-12}};
 
   list_col<int64_t> col8{
     {9, 8}, {7, 6, 5}, {}, {4}, {3, 2, 1, 0}, {20, 21, 22, 23, 24}, {}, {66, 666}, {}, {-1, -2}};
@@ -416,9 +410,6 @@ TEST_F(OrcWriterTest, MultiColumnWithNulls)
   auto col4_data = random_values<float>(num_rows);
   auto col5_data = random_values<double>(num_rows);
   auto col6_vals = random_values<int32_t>(num_rows);
-  auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
-    return numeric::decimal64{col6_vals[i], numeric::scale_type{2}};
-  });
   auto col0_mask =
     cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i % 2); });
   auto col1_mask =
@@ -438,7 +429,7 @@ TEST_F(OrcWriterTest, MultiColumnWithNulls)
   int32_col col3{col3_data.begin(), col3_data.end(), col3_mask};
   float32_col col4{col4_data.begin(), col4_data.end(), col4_mask};
   float64_col col5{col5_data.begin(), col5_data.end(), col5_mask};
-  dec64_col col6{col6_data, col6_data + num_rows, col6_mask};
+  dec64_col col6{col6_vals.begin(), col6_vals.end(), col6_mask, numeric::scale_type{2}};
   list_col<int32_t> col7{
     {{9, 8}, {7, 6, 5}, {}, {4}, {3, 2, 1, 0}, {20, 21, 22, 23, 24}, {}, {66, 666}, {}, {-1, -2}},
     col0_mask};
@@ -541,14 +532,11 @@ TEST_F(OrcWriterTest, SlicedTable)
   auto seq_col0  = random_values<int32_t>(num_rows);
   auto seq_col2  = random_values<float>(num_rows);
   auto vals_col3 = random_values<int32_t>(num_rows);
-  auto seq_col3  = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
-    return numeric::decimal64{vals_col3[i], numeric::scale_type{2}};
-  });
 
   int32_col col0(seq_col0.begin(), seq_col0.end());
   str_col col1(strings.begin(), strings.end());
   float32_col col2(seq_col2.begin(), seq_col2.end());
-  dec64_col col3(seq_col3, seq_col3 + num_rows);
+  dec64_col col3{vals_col3.begin(), vals_col3.end(), numeric::scale_type{2}};
 
   list_col<int64_t> col4{
     {9, 8}, {7, 6, 5}, {}, {4}, {3, 2, 1, 0}, {20, 21, 22, 23, 24}, {}, {66, 666}};
@@ -1213,11 +1201,8 @@ TEST_P(OrcWriterTestDecimal, Decimal64)
 
   // Using int16_t because scale causes values to overflow if they already require 32 bits
   auto const vals = random_values<int32_t>(num_rows);
-  auto data       = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
-    return numeric::decimal64{vals[i], numeric::scale_type{scale}};
-  });
   auto mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 7 == 0; });
-  dec64_col col{data, data + num_rows, mask};
+  dec64_col col{vals.begin(), vals.end(), mask, numeric::scale_type{scale}};
   cudf::table_view tbl({static_cast<cudf::column_view>(col)});
 
   auto filepath = temp_env->get_temp_filepath("Decimal64.orc");
@@ -1244,11 +1229,8 @@ TEST_F(OrcWriterTest, Decimal32)
 
   // Using int16_t because scale causes values to overflow if they already require 32 bits
   auto const vals = random_values<int16_t>(num_rows);
-  auto data       = cudf::detail::make_counting_transform_iterator(0, [&vals](auto i) {
-    return numeric::decimal32{vals[i], numeric::scale_type{2}};
-  });
   auto mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 13; });
-  dec32_col col{data, data + num_rows, mask};
+  dec32_col col{vals.begin(), vals.end(), mask, numeric::scale_type{2}};
   cudf::table_view expected({col});
 
   auto filepath = temp_env->get_temp_filepath("Decimal32.orc");
@@ -1527,12 +1509,9 @@ TEST_F(OrcReaderTest, DecimalOptions)
 {
   constexpr auto num_rows = 10;
   auto col_vals           = random_values<int64_t>(num_rows);
-  auto col_data           = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
-    return numeric::decimal128{col_vals[i], numeric::scale_type{2}};
-  });
   auto mask = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 3 == 0; });
 
-  dec128_col col{col_data, col_data + num_rows, mask};
+  dec128_col col{col_vals.begin(), col_vals.end(), mask, numeric::scale_type{2}};
   table_view expected({col});
 
   cudf::io::table_input_metadata expected_metadata(expected);
@@ -1555,15 +1534,9 @@ TEST_F(OrcWriterTest, DecimalOptionsNested)
 {
   auto const num_rows = 100;
 
-  auto dec_vals  = random_values<int32_t>(num_rows);
-  auto dec1_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
-    return numeric::decimal64{dec_vals[i], numeric::scale_type{2}};
-  });
-  auto dec2_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
-    return numeric::decimal128{dec_vals[i], numeric::scale_type{2}};
-  });
-  dec64_col dec1_col(dec1_data, dec1_data + num_rows);
-  dec128_col dec2_col(dec2_data, dec2_data + num_rows);
+  auto dec_vals = random_values<int32_t>(num_rows);
+  dec64_col dec1_col{dec_vals.begin(), dec_vals.end(), numeric::scale_type{2}};
+  dec128_col dec2_col{dec_vals.begin(), dec_vals.end(), numeric::scale_type{2}};
   auto child_struct_col = cudf::test::structs_column_wrapper{dec1_col, dec2_col};
 
   auto int_vals = random_values<int32_t>(num_rows);
@@ -1974,7 +1947,7 @@ TEST_F(OrcStatisticsTest, Empty)
   int32_col col0{};
   float64_col col1{};
   str_col col2{};
-  dec64_col col3{};
+  dec64_col col3{{}, numeric::scale_type{0}};
   column_wrapper<cudf::timestamp_ns, cudf::timestamp_ns::rep> col4;
   bool_col col5{};
   table_view expected({col0, col1, col2, col3, col4, col5});
diff --git a/cpp/tests/io/parquet_v2_test.cpp b/cpp/tests/io/parquet_v2_test.cpp
index f106fd5a487..9e66fc9409f 100644
--- a/cpp/tests/io/parquet_v2_test.cpp
+++ b/cpp/tests/io/parquet_v2_test.cpp
@@ -47,15 +47,6 @@ TEST_P(ParquetV2Test, MultiColumn)
   auto col6_vals = random_values<int16_t>(num_rows);
   auto col7_vals = random_values<int32_t>(num_rows);
   auto col8_vals = random_values<int64_t>(num_rows);
-  auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&col6_vals](auto i) {
-    return numeric::decimal32{col6_vals[i], numeric::scale_type{5}};
-  });
-  auto col7_data = cudf::detail::make_counting_transform_iterator(0, [&col7_vals](auto i) {
-    return numeric::decimal64{col7_vals[i], numeric::scale_type{-5}};
-  });
-  auto col8_data = cudf::detail::make_counting_transform_iterator(0, [&col8_vals](auto i) {
-    return numeric::decimal128{col8_vals[i], numeric::scale_type{-6}};
-  });
 
   // column_wrapper<bool> col0{col0_data.begin(), col0_data.end(), no_nulls()};
   column_wrapper<int8_t> col1{col1_data.begin(), col1_data.end(), no_nulls()};
@@ -63,9 +54,13 @@ TEST_P(ParquetV2Test, MultiColumn)
   column_wrapper<int32_t> col3{col3_data.begin(), col3_data.end(), no_nulls()};
   column_wrapper<float> col4{col4_data.begin(), col4_data.end(), no_nulls()};
   column_wrapper<double> col5{col5_data.begin(), col5_data.end(), no_nulls()};
-  column_wrapper<numeric::decimal32> col6{col6_data, col6_data + num_rows, no_nulls()};
-  column_wrapper<numeric::decimal64> col7{col7_data, col7_data + num_rows, no_nulls()};
-  column_wrapper<numeric::decimal128> col8{col8_data, col8_data + num_rows, no_nulls()};
+
+  cudf::test::fixed_point_column_wrapper<numeric::decimal32::rep> col6(
+    col6_vals.begin(), col6_vals.end(), no_nulls(), numeric::scale_type{5});
+  cudf::test::fixed_point_column_wrapper<numeric::decimal64::rep> col7(
+    col7_vals.begin(), col7_vals.end(), no_nulls(), numeric::scale_type{-5});
+  cudf::test::fixed_point_column_wrapper<numeric::decimal128::rep> col8(
+    col8_vals.begin(), col8_vals.end(), no_nulls(), numeric::scale_type{-6});
 
   auto expected = table_view{{col1, col2, col3, col4, col5, col6, col7, col8}};
 
@@ -109,14 +104,6 @@ TEST_P(ParquetV2Test, MultiColumnWithNulls)
   auto col5_data = random_values<double>(num_rows);
   auto col6_vals = random_values<int32_t>(num_rows);
   auto col7_vals = random_values<int64_t>(num_rows);
-  auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&col6_vals](auto i) {
-    return numeric::decimal32{col6_vals[i], numeric::scale_type{-2}};
-  });
-  auto col7_data = cudf::detail::make_counting_transform_iterator(0, [&col7_vals](auto i) {
-    return numeric::decimal64{col7_vals[i], numeric::scale_type{-8}};
-  });
-  // auto col0_mask = cudf::detail::make_counting_transform_iterator(
-  //    0, [](auto i) { return (i % 2); });
   auto col1_mask =
     cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i < 10); });
   auto col2_mask = no_nulls();
@@ -138,8 +125,11 @@ TEST_P(ParquetV2Test, MultiColumnWithNulls)
   column_wrapper<int32_t> col3{col3_data.begin(), col3_data.end(), col3_mask};
   column_wrapper<float> col4{col4_data.begin(), col4_data.end(), col4_mask};
   column_wrapper<double> col5{col5_data.begin(), col5_data.end(), col5_mask};
-  column_wrapper<numeric::decimal32> col6{col6_data, col6_data + num_rows, col6_mask};
-  column_wrapper<numeric::decimal64> col7{col7_data, col7_data + num_rows, col7_mask};
+
+  cudf::test::fixed_point_column_wrapper<numeric::decimal32::rep> col6(
+    col6_vals.begin(), col6_vals.end(), col6_mask, numeric::scale_type{-2});
+  cudf::test::fixed_point_column_wrapper<numeric::decimal64::rep> col7(
+    col7_vals.begin(), col7_vals.end(), col7_mask, numeric::scale_type{-8});
 
   auto expected = table_view{{/*col0, */ col1, col2, col3, col4, col5, col6, col7}};
 
diff --git a/cpp/tests/reshape/interleave_columns_tests.cpp b/cpp/tests/reshape/interleave_columns_tests.cpp
index bc7488bbf9e..de155c35a5e 100644
--- a/cpp/tests/reshape/interleave_columns_tests.cpp
+++ b/cpp/tests/reshape/interleave_columns_tests.cpp
@@ -363,19 +363,16 @@ TYPED_TEST(FixedPointTestAllReps, FixedPointInterleave)
 {
   using namespace numeric;
   using decimalXX = TypeParam;
+  using RepType   = typename decimalXX::rep;
 
   for (int i = 0; i > -4; --i) {
-    auto const ONE  = decimalXX{1, scale_type{i}};
-    auto const TWO  = decimalXX{2, scale_type{i}};
-    auto const FOUR = decimalXX{4, scale_type{i}};
-    auto const FIVE = decimalXX{5, scale_type{i}};
+    auto const a = cudf::test::fixed_point_column_wrapper<RepType>({1, 4}, scale_type{i});
+    auto const b = cudf::test::fixed_point_column_wrapper<RepType>({2, 5}, scale_type{i});
 
-    auto const a = cudf::test::fixed_width_column_wrapper<decimalXX>({ONE, FOUR});
-    auto const b = cudf::test::fixed_width_column_wrapper<decimalXX>({TWO, FIVE});
-
-    auto const input    = cudf::table_view{std::vector<cudf::column_view>{a, b}};
-    auto const expected = cudf::test::fixed_width_column_wrapper<decimalXX>({ONE, TWO, FOUR, FIVE});
-    auto const actual   = cudf::interleave_columns(input);
+    auto const input = cudf::table_view{std::vector<cudf::column_view>{a, b}};
+    auto const expected =
+      cudf::test::fixed_point_column_wrapper<RepType>({1, 2, 4, 5}, scale_type{i});
+    auto const actual = cudf::interleave_columns(input);
 
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, actual->view());
   }
diff --git a/cpp/tests/streams/io/csv_test.cpp b/cpp/tests/streams/io/csv_test.cpp
index 6e27db02d56..42894a0ebcb 100644
--- a/cpp/tests/streams/io/csv_test.cpp
+++ b/cpp/tests/streams/io/csv_test.cpp
@@ -39,12 +39,6 @@ TEST_F(CSVTest, CSVWriter)
 
   std::vector<size_t> zeros(num_rows, 0);
   std::vector<size_t> ones(num_rows, 1);
-  auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
-    return numeric::decimal128{ones[i], numeric::scale_type{12}};
-  });
-  auto col7_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
-    return numeric::decimal128{ones[i], numeric::scale_type{-12}};
-  });
 
   cudf::test::fixed_width_column_wrapper<bool> col0(zeros.begin(), zeros.end());
   cudf::test::fixed_width_column_wrapper<int8_t> col1(zeros.begin(), zeros.end());
@@ -52,8 +46,10 @@ TEST_F(CSVTest, CSVWriter)
   cudf::test::fixed_width_column_wrapper<int32_t> col3(zeros.begin(), zeros.end());
   cudf::test::fixed_width_column_wrapper<float> col4(zeros.begin(), zeros.end());
   cudf::test::fixed_width_column_wrapper<double> col5(zeros.begin(), zeros.end());
-  cudf::test::fixed_width_column_wrapper<numeric::decimal128> col6(col6_data, col6_data + num_rows);
-  cudf::test::fixed_width_column_wrapper<numeric::decimal128> col7(col7_data, col7_data + num_rows);
+  cudf::test::fixed_point_column_wrapper<numeric::decimal128::rep> col6(
+    ones.begin(), ones.end(), numeric::scale_type{12});
+  cudf::test::fixed_point_column_wrapper<numeric::decimal128::rep> col7(
+    ones.begin(), ones.end(), numeric::scale_type{-12});
 
   std::vector<std::string> col8_data(num_rows, "rapids");
   cudf::test::strings_column_wrapper col8(col8_data.begin(), col8_data.end());
@@ -72,12 +68,6 @@ TEST_F(CSVTest, CSVReader)
 
   std::vector<size_t> zeros(num_rows, 0);
   std::vector<size_t> ones(num_rows, 1);
-  auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
-    return numeric::decimal128{ones[i], numeric::scale_type{12}};
-  });
-  auto col7_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
-    return numeric::decimal128{ones[i], numeric::scale_type{-12}};
-  });
 
   cudf::test::fixed_width_column_wrapper<bool> col0(zeros.begin(), zeros.end());
   cudf::test::fixed_width_column_wrapper<int8_t> col1(zeros.begin(), zeros.end());
@@ -85,8 +75,10 @@ TEST_F(CSVTest, CSVReader)
   cudf::test::fixed_width_column_wrapper<int32_t> col3(zeros.begin(), zeros.end());
   cudf::test::fixed_width_column_wrapper<float> col4(zeros.begin(), zeros.end());
   cudf::test::fixed_width_column_wrapper<double> col5(zeros.begin(), zeros.end());
-  cudf::test::fixed_width_column_wrapper<numeric::decimal128> col6(col6_data, col6_data + num_rows);
-  cudf::test::fixed_width_column_wrapper<numeric::decimal128> col7(col7_data, col7_data + num_rows);
+  cudf::test::fixed_point_column_wrapper<numeric::decimal128::rep> col6(
+    ones.begin(), ones.end(), numeric::scale_type{12});
+  cudf::test::fixed_point_column_wrapper<numeric::decimal128::rep> col7(
+    ones.begin(), ones.end(), numeric::scale_type{-12});
 
   std::vector<std::string> col8_data(num_rows, "rapids");
   cudf::test::strings_column_wrapper col8(col8_data.begin(), col8_data.end());
diff --git a/cpp/tests/streams/io/orc_test.cpp b/cpp/tests/streams/io/orc_test.cpp
index 401c7049381..cc43bf15b5d 100644
--- a/cpp/tests/streams/io/orc_test.cpp
+++ b/cpp/tests/streams/io/orc_test.cpp
@@ -59,22 +59,10 @@ cudf::table construct_table()
   cudf::test::fixed_width_column_wrapper<int32_t> col3(zeros_iterator, zeros_iterator + num_rows);
   cudf::test::fixed_width_column_wrapper<float> col4(zeros_iterator, zeros_iterator + num_rows);
   cudf::test::fixed_width_column_wrapper<double> col5(zeros_iterator, zeros_iterator + num_rows);
-
-  cudf::test::fixed_width_column_wrapper<numeric::decimal128> col6 = [&ones_iterator] {
-    auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
-      return numeric::decimal128{ones_iterator[i], numeric::scale_type{12}};
-    });
-    return cudf::test::fixed_width_column_wrapper<numeric::decimal128>(col6_data,
-                                                                       col6_data + num_rows);
-  }();
-
-  cudf::test::fixed_width_column_wrapper<numeric::decimal128> col7 = [&ones_iterator] {
-    auto col7_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
-      return numeric::decimal128{ones_iterator[i], numeric::scale_type{-12}};
-    });
-    return cudf::test::fixed_width_column_wrapper<numeric::decimal128>(col7_data,
-                                                                       col7_data + num_rows);
-  }();
+  cudf::test::fixed_point_column_wrapper<numeric::decimal128::rep> col6(
+    ones_iterator, ones_iterator + num_rows, numeric::scale_type{12});
+  cudf::test::fixed_point_column_wrapper<numeric::decimal128::rep> col7(
+    ones_iterator, ones_iterator + num_rows, numeric::scale_type{-12});
 
   cudf::test::lists_column_wrapper<int64_t> col8 = [] {
     auto col8_mask =
diff --git a/cpp/tests/streams/io/parquet_test.cpp b/cpp/tests/streams/io/parquet_test.cpp
index b277d184e3a..9d2dec2d697 100644
--- a/cpp/tests/streams/io/parquet_test.cpp
+++ b/cpp/tests/streams/io/parquet_test.cpp
@@ -55,20 +55,10 @@ cudf::table construct_table()
   cudf::test::fixed_width_column_wrapper<int32_t> col3(zeros.begin(), zeros.end());
   cudf::test::fixed_width_column_wrapper<float> col4(zeros.begin(), zeros.end());
   cudf::test::fixed_width_column_wrapper<double> col5(zeros.begin(), zeros.end());
-  cudf::test::fixed_width_column_wrapper<numeric::decimal128> col6 = [&ones] {
-    auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
-      return numeric::decimal128{ones[i], numeric::scale_type{12}};
-    });
-    return cudf::test::fixed_width_column_wrapper<numeric::decimal128>(col6_data,
-                                                                       col6_data + num_rows);
-  }();
-  cudf::test::fixed_width_column_wrapper<numeric::decimal128> col7 = [&ones] {
-    auto col7_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
-      return numeric::decimal128{ones[i], numeric::scale_type{-12}};
-    });
-    return cudf::test::fixed_width_column_wrapper<numeric::decimal128>(col7_data,
-                                                                       col7_data + num_rows);
-  }();
+  cudf::test::fixed_point_column_wrapper<numeric::decimal128::rep> col6(
+    ones.begin(), ones.end(), numeric::scale_type{12});
+  cudf::test::fixed_point_column_wrapper<numeric::decimal128::rep> col7(
+    ones.begin(), ones.end(), numeric::scale_type{-12});
 
   cudf::test::lists_column_wrapper<int64_t> col8{
     {1, 1}, {1, 1, 1}, {}, {1}, {1, 1, 1, 1}, {1, 1, 1, 1, 1}, {}, {1, -1}, {}, {-1, -1}};

From 53de73d3010ce4fa3b27ab53d14d58312d4793dc Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 11 Jul 2024 07:59:39 -1000
Subject: [PATCH 495/842] Add Column.strftime/strptime instead of overloading
 `as_string/datetime/timedelta_column` (#16243)

`Column.as_string/datetime/timedelta_column` had a `format` argument that was not used for columns that weren't these types or didn't require conversion to these types.

This PR introduces a `strftime` and `strptime` on the column that will handle this `format` argument.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16243
---
 python/cudf/cudf/core/column/categorical.py |  24 +---
 python/cudf/cudf/core/column/column.py      |  14 +--
 python/cudf/cudf/core/column/datetime.py    | 126 +++++++++-----------
 python/cudf/cudf/core/column/decimal.py     |   4 +-
 python/cudf/cudf/core/column/lists.py       |   8 +-
 python/cudf/cudf/core/column/numerical.py   |  12 +-
 python/cudf/cudf/core/column/string.py      |  91 +++++++-------
 python/cudf/cudf/core/column/timedelta.py   |  30 ++---
 python/cudf/cudf/core/series.py             |   4 +-
 python/cudf/cudf/core/tools/datetimes.py    |  38 +++---
 10 files changed, 150 insertions(+), 201 deletions(-)

diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index cec7d5e6663..f763d3b4b0c 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -1136,26 +1136,14 @@ def as_categorical_column(self, dtype: Dtype) -> CategoricalColumn:
     def as_numerical_column(self, dtype: Dtype) -> NumericalColumn:
         return self._get_decategorized_column().as_numerical_column(dtype)
 
-    def as_string_column(
-        self, dtype, format: str | None = None
-    ) -> StringColumn:
-        return self._get_decategorized_column().as_string_column(
-            dtype, format=format
-        )
+    def as_string_column(self) -> StringColumn:
+        return self._get_decategorized_column().as_string_column()
 
-    def as_datetime_column(
-        self, dtype, format: str | None = None
-    ) -> DatetimeColumn:
-        return self._get_decategorized_column().as_datetime_column(
-            dtype, format
-        )
+    def as_datetime_column(self, dtype: Dtype) -> DatetimeColumn:
+        return self._get_decategorized_column().as_datetime_column(dtype)
 
-    def as_timedelta_column(
-        self, dtype, format: str | None = None
-    ) -> TimeDeltaColumn:
-        return self._get_decategorized_column().as_timedelta_column(
-            dtype, format
-        )
+    def as_timedelta_column(self, dtype: Dtype) -> TimeDeltaColumn:
+        return self._get_decategorized_column().as_timedelta_column(dtype)
 
     def _get_decategorized_column(self) -> ColumnBase:
         if self.null_count == len(self):
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index adc783c20c4..f633d527681 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -1003,7 +1003,7 @@ def astype(self, dtype: Dtype, copy: bool = False) -> ColumnBase:
                         f"Casting to {dtype} is not supported, use "
                         "`.astype('str')` instead."
                     )
-                result = self.as_string_column(dtype)
+                result = self.as_string_column()
             else:
                 result = self.as_numerical_column(dtype)
 
@@ -1059,8 +1059,8 @@ def as_numerical_column(
         raise NotImplementedError
 
     def as_datetime_column(
-        self, dtype: Dtype, format: str | None = None
-    ) -> "cudf.core.column.DatetimeColumn":
+        self, dtype: Dtype
+    ) -> cudf.core.column.DatetimeColumn:
         raise NotImplementedError
 
     def as_interval_column(
@@ -1069,13 +1069,11 @@ def as_interval_column(
         raise NotImplementedError
 
     def as_timedelta_column(
-        self, dtype: Dtype, format: str | None = None
-    ) -> "cudf.core.column.TimeDeltaColumn":
+        self, dtype: Dtype
+    ) -> cudf.core.column.TimeDeltaColumn:
         raise NotImplementedError
 
-    def as_string_column(
-        self, dtype: Dtype, format: str | None = None
-    ) -> "cudf.core.column.StringColumn":
+    def as_string_column(self) -> cudf.core.column.StringColumn:
         raise NotImplementedError
 
     def as_decimal_column(
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index c10aceba9f4..214e84028d2 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -178,43 +178,6 @@ def _resolve_mixed_dtypes(
     return cudf.dtype(f"{base_type}[{units[max(lhs_unit, rhs_unit)]}]")
 
 
-def _get_datetime_format(col, dtype, time_unit):
-    format = _dtype_to_format_conversion.get(dtype.name, "%Y-%m-%d %H:%M:%S")
-    if format.endswith("f"):
-        sub_second_res_len = 3
-    else:
-        sub_second_res_len = 0
-
-    has_nanos = time_unit in {"ns"} and col.get_dt_field("nanosecond").any()
-    has_micros = (
-        time_unit in {"ns", "us"} and col.get_dt_field("microsecond").any()
-    )
-    has_millis = (
-        time_unit in {"ns", "us", "ms"}
-        and col.get_dt_field("millisecond").any()
-    )
-    has_seconds = col.get_dt_field("second").any()
-    has_minutes = col.get_dt_field("minute").any()
-    has_hours = col.get_dt_field("hour").any()
-    if sub_second_res_len:
-        if has_nanos:
-            # format should be intact and rest of the
-            # following conditions shouldn't execute.
-            pass
-        elif has_micros:
-            format = format[:-sub_second_res_len] + "%6f"
-        elif has_millis:
-            format = format[:-sub_second_res_len] + "%3f"
-        elif has_seconds or has_minutes or has_hours:
-            format = format[:-4]
-        else:
-            format = format.split(" ")[0]
-    else:
-        if not (has_seconds or has_minutes or has_hours):
-            format = format.split(" ")[0]
-    return format
-
-
 class DatetimeColumn(column.ColumnBase):
     """
     A Column implementation for Date-time types.
@@ -381,9 +344,7 @@ def round(self, freq: str) -> ColumnBase:
 
     def isocalendar(self) -> dict[str, ColumnBase]:
         return {
-            field: self.as_string_column("str", format=directive).astype(
-                "uint32"
-            )
+            field: self.strftime(format=directive).astype("uint32")
             for field, directive in zip(
                 ["year", "week", "day"], ["%G", "%V", "%u"]
             )
@@ -445,17 +406,12 @@ def normalize_binop_value(self, other: DatetimeLikeScalar) -> ScalarLike:
 
         return NotImplemented
 
-    def as_datetime_column(
-        self, dtype: Dtype, format: str | None = None
-    ) -> DatetimeColumn:
-        dtype = cudf.dtype(dtype)
+    def as_datetime_column(self, dtype: Dtype) -> DatetimeColumn:
         if dtype == self.dtype:
             return self
         return libcudf.unary.cast(self, dtype=dtype)
 
-    def as_timedelta_column(
-        self, dtype: Dtype, format: str | None = None
-    ) -> "cudf.core.column.TimeDeltaColumn":
+    def as_timedelta_column(self, dtype: Dtype) -> None:  # type: ignore[override]
         raise TypeError(
             f"cannot astype a datetimelike from {self.dtype} to {dtype}"
         )
@@ -472,32 +428,63 @@ def as_numerical_column(
         )
         return cast("cudf.core.column.NumericalColumn", col.astype(dtype))
 
-    def as_string_column(
-        self, dtype: Dtype, format: str | None = None
-    ) -> "cudf.core.column.StringColumn":
-        if format is None:
-            format = _dtype_to_format_conversion.get(
-                self.dtype.name, "%Y-%m-%d %H:%M:%S"
+    def strftime(self, format: str) -> cudf.core.column.StringColumn:
+        if len(self) == 0:
+            return cast(
+                cudf.core.column.StringColumn,
+                column.column_empty(0, dtype="object", masked=False),
             )
-            if cudf.get_option("mode.pandas_compatible"):
-                format = _get_datetime_format(
-                    self, dtype=self.dtype, time_unit=self.time_unit
-                )
         if format in _DATETIME_SPECIAL_FORMATS:
             names = as_column(_DATETIME_NAMES)
         else:
             names = cudf.core.column.column_empty(
                 0, dtype="object", masked=False
             )
-        if len(self) > 0:
-            return string._datetime_to_str_typecast_functions[
-                cudf.dtype(self.dtype)
-            ](self, format, names)
-        else:
-            return cast(
-                "cudf.core.column.StringColumn",
-                column.column_empty(0, dtype="object", masked=False),
+        return string._datetime_to_str_typecast_functions[self.dtype](
+            self, format, names
+        )
+
+    def as_string_column(self) -> cudf.core.column.StringColumn:
+        format = _dtype_to_format_conversion.get(
+            self.dtype.name, "%Y-%m-%d %H:%M:%S"
+        )
+        if cudf.get_option("mode.pandas_compatible"):
+            if format.endswith("f"):
+                sub_second_res_len = 3
+            else:
+                sub_second_res_len = 0
+
+            has_nanos = (
+                self.time_unit in {"ns"}
+                and self.get_dt_field("nanosecond").any()
             )
+            has_micros = (
+                self.time_unit in {"ns", "us"}
+                and self.get_dt_field("microsecond").any()
+            )
+            has_millis = (
+                self.time_unit in {"ns", "us", "ms"}
+                and self.get_dt_field("millisecond").any()
+            )
+            has_seconds = self.get_dt_field("second").any()
+            has_minutes = self.get_dt_field("minute").any()
+            has_hours = self.get_dt_field("hour").any()
+            if sub_second_res_len:
+                if has_nanos:
+                    # format should be intact and rest of the
+                    # following conditions shouldn't execute.
+                    pass
+                elif has_micros:
+                    format = format[:-sub_second_res_len] + "%6f"
+                elif has_millis:
+                    format = format[:-sub_second_res_len] + "%3f"
+                elif has_seconds or has_minutes or has_hours:
+                    format = format[:-4]
+                else:
+                    format = format.split(" ")[0]
+            elif not (has_seconds or has_minutes or has_hours):
+                format = format.split(" ")[0]
+        return self.strftime(format)
 
     def mean(
         self, skipna=None, min_count: int = 0, dtype=np.float64
@@ -872,10 +859,11 @@ def _local_time(self):
         offsets_from_utc = offsets.take(indices, nullify=True)
         return self + offsets_from_utc
 
-    def as_string_column(
-        self, dtype: Dtype, format: str | None = None
-    ) -> "cudf.core.column.StringColumn":
-        return self._local_time.as_string_column(dtype, format)
+    def strftime(self, format: str) -> cudf.core.column.StringColumn:
+        return self._local_time.strftime(format)
+
+    def as_string_column(self) -> cudf.core.column.StringColumn:
+        return self._local_time.as_string_column()
 
     def get_dt_field(self, field: str) -> ColumnBase:
         return libcudf.datetime.extract_datetime_component(
diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py
index 3e238d65cff..a63055ed527 100644
--- a/python/cudf/cudf/core/column/decimal.py
+++ b/python/cudf/cudf/core/column/decimal.py
@@ -62,9 +62,7 @@ def as_decimal_column(
             return self
         return libcudf.unary.cast(self, dtype)
 
-    def as_string_column(
-        self, dtype: Dtype, format: str | None = None
-    ) -> "cudf.core.column.StringColumn":
+    def as_string_column(self) -> cudf.core.column.StringColumn:
         if len(self) > 0:
             return cpp_from_decimal(self)
         else:
diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py
index 1992d471947..cc15e78314e 100644
--- a/python/cudf/cudf/core/column/lists.py
+++ b/python/cudf/cudf/core/column/lists.py
@@ -253,15 +253,11 @@ def from_sequences(
         )
         return res
 
-    def as_string_column(
-        self, dtype: Dtype, format: str | None = None
-    ) -> "cudf.core.column.StringColumn":
+    def as_string_column(self) -> cudf.core.column.StringColumn:
         """
         Create a strings column from a list column
         """
-        lc = self._transform_leaves(
-            lambda col, dtype: col.as_string_column(dtype), dtype
-        )
+        lc = self._transform_leaves(lambda col: col.as_string_column())
 
         # Separator strings to match the Python format
         separators = as_column([", ", "[", "]"])
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index 76c64e1aea0..a0550bff72b 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -331,9 +331,7 @@ def int2ip(self) -> "cudf.core.column.StringColumn":
 
         return libcudf.string_casting.int2ip(self)
 
-    def as_string_column(
-        self, dtype: Dtype, format: str | None = None
-    ) -> "cudf.core.column.StringColumn":
+    def as_string_column(self) -> cudf.core.column.StringColumn:
         if len(self) > 0:
             return string._numeric_to_str_typecast_functions[
                 cudf.dtype(self.dtype)
@@ -345,8 +343,8 @@ def as_string_column(
             )
 
     def as_datetime_column(
-        self, dtype: Dtype, format: str | None = None
-    ) -> "cudf.core.column.DatetimeColumn":
+        self, dtype: Dtype
+    ) -> cudf.core.column.DatetimeColumn:
         return cast(
             "cudf.core.column.DatetimeColumn",
             build_column(
@@ -359,8 +357,8 @@ def as_datetime_column(
         )
 
     def as_timedelta_column(
-        self, dtype: Dtype, format: str | None = None
-    ) -> "cudf.core.column.TimeDeltaColumn":
+        self, dtype: Dtype
+    ) -> cudf.core.column.TimeDeltaColumn:
         return cast(
             "cudf.core.column.TimeDeltaColumn",
             build_column(
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 936cd1eccb0..96f9cdfd655 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -5669,16 +5669,25 @@ def as_numerical_column(
         result_col = _str_to_numeric_typecast_functions[out_dtype](string_col)
         return result_col
 
-    def _as_datetime_or_timedelta_column(self, dtype, format):
-        if len(self) == 0:
-            return cudf.core.column.column_empty(0, dtype=dtype)
-
-        # Check for None strings
-        if (self == "None").any():
-            raise ValueError("Could not convert `None` value to datetime")
-
-        is_nat = self == "NaT"
-        if dtype.kind == "M":
+    def strptime(
+        self, dtype: Dtype, format: str
+    ) -> cudf.core.column.DatetimeColumn | cudf.core.column.TimeDeltaColumn:
+        if dtype.kind not in "Mm":  # type: ignore[union-attr]
+            raise ValueError(
+                f"dtype must be datetime or timedelta type, not {dtype}"
+            )
+        elif self.null_count == len(self):
+            return column.column_empty(len(self), dtype=dtype, masked=True)  # type: ignore[return-value]
+        elif (self == "None").any():
+            raise ValueError(
+                "Cannot convert `None` value to datetime or timedelta."
+            )
+        elif dtype.kind == "M":  # type: ignore[union-attr]
+            if format.endswith("%z"):
+                raise NotImplementedError(
+                    "cuDF does not yet support timezone-aware datetimes"
+                )
+            is_nat = self == "NaT"
             without_nat = self.apply_boolean_mask(is_nat.unary_operator("not"))
             all_same_length = (
                 libstrings.count_characters(without_nat).distinct_count(
@@ -5699,61 +5708,43 @@ def _as_datetime_or_timedelta_column(self, dtype, format):
             if not valid.all():
                 raise ValueError(f"Column contains invalid data for {format=}")
 
-        casting_func = (
-            str_cast.timestamp2int
-            if dtype.type == np.datetime64
-            else str_cast.timedelta2int
-        )
+            casting_func = str_cast.timestamp2int
+            add_back_nat = is_nat.any()
+        elif dtype.kind == "m":  # type: ignore[union-attr]
+            casting_func = str_cast.timedelta2int
+            add_back_nat = False
+
         result_col = casting_func(self, dtype, format)
 
-        if is_nat.any():
+        if add_back_nat:
             result_col[is_nat] = None
 
         return result_col
 
     def as_datetime_column(
-        self, dtype: Dtype, format: str | None = None
-    ) -> "cudf.core.column.DatetimeColumn":
-        out_dtype = cudf.api.types.dtype(dtype)
-
-        # infer on host from the first not na element
-        # or return all null column if all values
-        # are null in current column
-        if format is None:
-            if self.null_count == len(self):
-                return cast(
-                    "cudf.core.column.DatetimeColumn",
-                    column.column_empty(
-                        len(self), dtype=out_dtype, masked=True
-                    ),
-                )
-            else:
-                format = datetime.infer_format(
-                    self.apply_boolean_mask(self.notnull()).element_indexing(0)
-                )
-
-        if format.endswith("%z"):
-            raise NotImplementedError(
-                "cuDF does not yet support timezone-aware datetimes"
-            )
-        return self._as_datetime_or_timedelta_column(out_dtype, format)
+        self, dtype: Dtype
+    ) -> cudf.core.column.DatetimeColumn:
+        not_null = self.apply_boolean_mask(self.notnull())
+        if len(not_null) == 0:
+            # We should hit the self.null_count == len(self) condition
+            # so format doesn't matter
+            format = ""
+        else:
+            # infer on host from the first not na element
+            format = datetime.infer_format(not_null.element_indexing(0))
+        return self.strptime(dtype, format)  # type: ignore[return-value]
 
     def as_timedelta_column(
-        self, dtype: Dtype, format: str | None = None
-    ) -> "cudf.core.column.TimeDeltaColumn":
-        out_dtype = cudf.api.types.dtype(dtype)
-        if format is None:
-            format = "%D days %H:%M:%S"
-        return self._as_datetime_or_timedelta_column(out_dtype, format)
+        self, dtype: Dtype
+    ) -> cudf.core.column.TimeDeltaColumn:
+        return self.strptime(dtype, "%D days %H:%M:%S")  # type: ignore[return-value]
 
     def as_decimal_column(
         self, dtype: Dtype
     ) -> "cudf.core.column.DecimalBaseColumn":
         return libstrings.to_decimal(self, dtype)
 
-    def as_string_column(
-        self, dtype: Dtype, format: str | None = None
-    ) -> StringColumn:
+    def as_string_column(self) -> StringColumn:
         return self
 
     @property
diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py
index 5a0171bbbdc..2cbed9212de 100644
--- a/python/cudf/cudf/core/column/timedelta.py
+++ b/python/cudf/cudf/core/column/timedelta.py
@@ -263,32 +263,26 @@ def as_numerical_column(
         )
         return cast("cudf.core.column.NumericalColumn", col.astype(dtype))
 
-    def as_datetime_column(
-        self, dtype: Dtype, format: str | None = None
-    ) -> "cudf.core.column.DatetimeColumn":
+    def as_datetime_column(self, dtype: Dtype) -> None:  # type: ignore[override]
         raise TypeError(
             f"cannot astype a timedelta from {self.dtype} to {dtype}"
         )
 
-    def as_string_column(
-        self, dtype: Dtype, format: str | None = None
-    ) -> "cudf.core.column.StringColumn":
-        if format is None:
-            format = "%D days %H:%M:%S"
-        if len(self) > 0:
-            return string._timedelta_to_str_typecast_functions[
-                cudf.dtype(self.dtype)
-            ](self, format=format)
-        else:
+    def strftime(self, format: str) -> cudf.core.column.StringColumn:
+        if len(self) == 0:
             return cast(
-                "cudf.core.column.StringColumn",
+                cudf.core.column.StringColumn,
                 column.column_empty(0, dtype="object", masked=False),
             )
+        else:
+            return string._timedelta_to_str_typecast_functions[self.dtype](
+                self, format=format
+            )
 
-    def as_timedelta_column(
-        self, dtype: Dtype, format: str | None = None
-    ) -> TimeDeltaColumn:
-        dtype = cudf.dtype(dtype)
+    def as_string_column(self) -> cudf.core.column.StringColumn:
+        return self.strftime("%D days %H:%M:%S")
+
+    def as_timedelta_column(self, dtype: Dtype) -> TimeDeltaColumn:
         if dtype == self.dtype:
             return self
         return libcudf.unary.cast(self, dtype=dtype)
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 4a60470fafa..8c8fa75918c 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -4731,9 +4731,7 @@ def strftime(self, date_format: str, *args, **kwargs) -> Series:
                     f"for tracking purposes."
                 )
         return self._return_result_like_self(
-            self.series._column.as_string_column(
-                dtype="str", format=date_format
-            )
+            self.series._column.strftime(format=date_format)
         )
 
     @copy_docstring(DatetimeIndex.tz_localize)
diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py
index 397bfe1d472..064e8fc667d 100644
--- a/python/cudf/cudf/core/tools/datetimes.py
+++ b/python/cudf/cudf/core/tools/datetimes.py
@@ -216,25 +216,25 @@ def to_datetime(
                 + arg[unit_rev["day"]].astype("str").str.zfill(2)
             )
             format = "%Y-%m-%d"
-            col = new_series._column.as_datetime_column(
-                "datetime64[s]", format=format
-            )
-
             for u in ["h", "m", "s", "ms", "us", "ns"]:
                 value = unit_rev.get(u)
                 if value is not None and value in arg:
                     arg_col = arg._data[value]
-                    if arg_col.dtype.kind in ("f"):
-                        col = new_series._column.as_datetime_column(
-                            "datetime64[ns]", format=format
+                    if arg_col.dtype.kind == "f":
+                        col = new_series._column.strptime(
+                            cudf.dtype("datetime64[ns]"), format=format
                         )
                         break
-                    elif arg_col.dtype.kind in ("O"):
+                    elif arg_col.dtype.kind == "O":
                         if not cpp_is_integer(arg_col).all():
-                            col = new_series._column.as_datetime_column(
-                                "datetime64[ns]", format=format
+                            col = new_series._column.strptime(
+                                cudf.dtype("datetime64[ns]"), format=format
                             )
                             break
+            else:
+                col = new_series._column.strptime(
+                    cudf.dtype("datetime64[s]"), format=format
+                )
 
             times_column = None
             for u in ["h", "m", "s", "ms", "us", "ns"]:
@@ -334,15 +334,15 @@ def _process_col(
             col = (
                 col.astype("int")
                 .astype("str")
-                .as_datetime_column(
-                    dtype="datetime64[us]"
+                .strptime(
+                    dtype=cudf.dtype("datetime64[us]")
                     if "%f" in format
-                    else "datetime64[s]",
+                    else cudf.dtype("datetime64[s]"),
                     format=format,
                 )
             )
         else:
-            col = col.as_datetime_column(dtype="datetime64[ns]")
+            col = col.astype(dtype="datetime64[ns]")
 
     elif col.dtype.kind in "iu":
         if unit in ("D", "h", "m"):
@@ -353,11 +353,11 @@ def _process_col(
             col = col * factor
 
         if format is not None:
-            col = col.astype("str").as_datetime_column(
-                dtype=_unit_dtype_map[unit], format=format
+            col = col.astype("str").strptime(
+                dtype=cudf.dtype(_unit_dtype_map[unit]), format=format
             )
         else:
-            col = col.as_datetime_column(dtype=_unit_dtype_map[unit])
+            col = col.astype(dtype=cudf.dtype(_unit_dtype_map[unit]))
 
     elif col.dtype.kind == "O":
         if unit not in (None, "ns") or col.null_count == len(col):
@@ -384,8 +384,8 @@ def _process_col(
                     element=col.element_indexing(0),
                     dayfirst=dayfirst,
                 )
-            col = col.as_datetime_column(
-                dtype=_unit_dtype_map[unit],
+            col = col.strptime(
+                dtype=cudf.dtype(_unit_dtype_map[unit]),
                 format=format,
             )
     elif col.dtype.kind != "M":

From adee00aca95a749ecdf86975ae6d5b7fa1c01733 Mon Sep 17 00:00:00 2001
From: Paul Taylor <178183+trxcllnt@users.noreply.github.com>
Date: Thu, 11 Jul 2024 12:46:08 -0700
Subject: [PATCH 496/842] remove `cuco_noexcept.diff` (#16254)

This PR removes the cuDF `cuco_noexcept.diff` patch since it no longer applies after https://github.com/rapidsai/rapids-cmake/pull/628.

Authors:
  - Paul Taylor (https://github.com/trxcllnt)

Approvers:
  - Robert Maynard (https://github.com/robertmaynard)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16254
---
 cpp/cmake/thirdparty/get_cucollections.cmake  |   4 -
 .../thirdparty/patches/cuco_noexcept.diff     | 227 ------------------
 .../thirdparty/patches/cuco_override.json     |  14 --
 3 files changed, 245 deletions(-)
 delete mode 100644 cpp/cmake/thirdparty/patches/cuco_noexcept.diff
 delete mode 100644 cpp/cmake/thirdparty/patches/cuco_override.json

diff --git a/cpp/cmake/thirdparty/get_cucollections.cmake b/cpp/cmake/thirdparty/get_cucollections.cmake
index 6ec35ddcaf1..fb82b0f5ff3 100644
--- a/cpp/cmake/thirdparty/get_cucollections.cmake
+++ b/cpp/cmake/thirdparty/get_cucollections.cmake
@@ -15,10 +15,6 @@
 # This function finds cuCollections and performs any additional configuration.
 function(find_and_configure_cucollections)
   include(${rapids-cmake-dir}/cpm/cuco.cmake)
-  include(${rapids-cmake-dir}/cpm/package_override.cmake)
-
-  set(cudf_patch_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/patches")
-  rapids_cpm_package_override("${cudf_patch_dir}/cuco_override.json")
 
   if(BUILD_SHARED_LIBS)
     rapids_cpm_cuco(BUILD_EXPORT_SET cudf-exports)
diff --git a/cpp/cmake/thirdparty/patches/cuco_noexcept.diff b/cpp/cmake/thirdparty/patches/cuco_noexcept.diff
deleted file mode 100644
index 0f334c0e81f..00000000000
--- a/cpp/cmake/thirdparty/patches/cuco_noexcept.diff
+++ /dev/null
@@ -1,227 +0,0 @@
-diff --git a/include/cuco/aow_storage.cuh b/include/cuco/aow_storage.cuh
-index 7f9de01..5228193 100644
---- a/include/cuco/aow_storage.cuh
-+++ b/include/cuco/aow_storage.cuh
-@@ -81,7 +81,7 @@ class aow_storage : public detail::aow_storage_base<T, WindowSize, Extent> {
-    * @param size Number of windows to (de)allocate
-    * @param allocator Allocator used for (de)allocating device storage
-    */
--  explicit constexpr aow_storage(Extent size, Allocator const& allocator = {}) noexcept;
-+  explicit constexpr aow_storage(Extent size, Allocator const& allocator = {});
- 
-   aow_storage(aow_storage&&) = default;  ///< Move constructor
-   /**
-@@ -122,7 +122,7 @@ class aow_storage : public detail::aow_storage_base<T, WindowSize, Extent> {
-    * @param key Key to which all keys in `slots` are initialized
-    * @param stream Stream used for executing the kernel
-    */
--  void initialize(value_type key, cuda_stream_ref stream = {}) noexcept;
-+  void initialize(value_type key, cuda_stream_ref stream = {});
- 
-   /**
-    * @brief Asynchronously initializes each slot in the AoW storage to contain `key`.
-diff --git a/include/cuco/detail/open_addressing/open_addressing_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_impl.cuh
-index c2c9c14..8ac4236 100644
---- a/include/cuco/detail/open_addressing/open_addressing_impl.cuh
-+++ b/include/cuco/detail/open_addressing/open_addressing_impl.cuh
-@@ -125,7 +125,7 @@ class open_addressing_impl {
-                                  KeyEqual const& pred,
-                                  ProbingScheme const& probing_scheme,
-                                  Allocator const& alloc,
--                                 cuda_stream_ref stream) noexcept
-+                                 cuda_stream_ref stream)
-     : empty_slot_sentinel_{empty_slot_sentinel},
-       erased_key_sentinel_{this->extract_key(empty_slot_sentinel)},
-       predicate_{pred},
-@@ -233,7 +233,7 @@ class open_addressing_impl {
-    *
-    * @param stream CUDA stream this operation is executed in
-    */
--  void clear(cuda_stream_ref stream) noexcept { storage_.initialize(empty_slot_sentinel_, stream); }
-+  void clear(cuda_stream_ref stream) { storage_.initialize(empty_slot_sentinel_, stream); }
- 
-   /**
-    * @brief Asynchronously erases all elements from the container. After this call, `size()` returns
-@@ -599,7 +599,7 @@ class open_addressing_impl {
-    *
-    * @return The number of elements in the container
-    */
--  [[nodiscard]] size_type size(cuda_stream_ref stream) const noexcept
-+  [[nodiscard]] size_type size(cuda_stream_ref stream) const
-   {
-     auto counter =
-       detail::counter_storage<size_type, thread_scope, allocator_type>{this->allocator()};
-diff --git a/include/cuco/detail/static_map/static_map.inl b/include/cuco/detail/static_map/static_map.inl
-index e17a145..3fa1d02 100644
---- a/include/cuco/detail/static_map/static_map.inl
-+++ b/include/cuco/detail/static_map/static_map.inl
-@@ -123,7 +123,7 @@ template <class Key,
-           class Allocator,
-           class Storage>
- void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::clear(
--  cuda_stream_ref stream) noexcept
-+  cuda_stream_ref stream)
- {
-   impl_->clear(stream);
- }
-@@ -215,7 +215,7 @@ template <class Key,
-           class Storage>
- template <typename InputIt>
- void static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::
--  insert_or_assign(InputIt first, InputIt last, cuda_stream_ref stream) noexcept
-+  insert_or_assign(InputIt first, InputIt last, cuda_stream_ref stream)
- {
-   return this->insert_or_assign_async(first, last, stream);
-   stream.synchronize();
-@@ -465,7 +465,7 @@ template <class Key,
-           class Storage>
- static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::size_type
- static_map<Key, T, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::size(
--  cuda_stream_ref stream) const noexcept
-+  cuda_stream_ref stream) const
- {
-   return impl_->size(stream);
- }
-diff --git a/include/cuco/detail/static_multiset/static_multiset.inl b/include/cuco/detail/static_multiset/static_multiset.inl
-index 174f9bc..582926b 100644
---- a/include/cuco/detail/static_multiset/static_multiset.inl
-+++ b/include/cuco/detail/static_multiset/static_multiset.inl
-@@ -97,7 +97,7 @@ template <class Key,
-           class Allocator,
-           class Storage>
- void static_multiset<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::clear(
--  cuda_stream_ref stream) noexcept
-+  cuda_stream_ref stream)
- {
-   impl_->clear(stream);
- }
-@@ -183,7 +183,7 @@ template <class Key,
-           class Storage>
- static_multiset<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::size_type
- static_multiset<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::size(
--  cuda_stream_ref stream) const noexcept
-+  cuda_stream_ref stream) const
- {
-   return impl_->size(stream);
- }
-diff --git a/include/cuco/detail/static_set/static_set.inl b/include/cuco/detail/static_set/static_set.inl
-index 645013f..d3cece0 100644
---- a/include/cuco/detail/static_set/static_set.inl
-+++ b/include/cuco/detail/static_set/static_set.inl
-@@ -98,7 +98,7 @@ template <class Key,
-           class Allocator,
-           class Storage>
- void static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::clear(
--  cuda_stream_ref stream) noexcept
-+  cuda_stream_ref stream)
- {
-   impl_->clear(stream);
- }
-@@ -429,7 +429,7 @@ template <class Key,
-           class Storage>
- static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::size_type
- static_set<Key, Extent, Scope, KeyEqual, ProbingScheme, Allocator, Storage>::size(
--  cuda_stream_ref stream) const noexcept
-+  cuda_stream_ref stream) const
- {
-   return impl_->size(stream);
- }
-diff --git a/include/cuco/detail/storage/aow_storage.inl b/include/cuco/detail/storage/aow_storage.inl
-index 3547f4c..94b7f98 100644
---- a/include/cuco/detail/storage/aow_storage.inl
-+++ b/include/cuco/detail/storage/aow_storage.inl
-@@ -32,8 +32,8 @@
- namespace cuco {
- 
- template <typename T, int32_t WindowSize, typename Extent, typename Allocator>
--constexpr aow_storage<T, WindowSize, Extent, Allocator>::aow_storage(
--  Extent size, Allocator const& allocator) noexcept
-+constexpr aow_storage<T, WindowSize, Extent, Allocator>::aow_storage(Extent size,
-+                                                                     Allocator const& allocator)
-   : detail::aow_storage_base<T, WindowSize, Extent>{size},
-     allocator_{allocator},
-     window_deleter_{capacity(), allocator_},
-@@ -64,7 +64,7 @@ aow_storage<T, WindowSize, Extent, Allocator>::ref() const noexcept
- 
- template <typename T, int32_t WindowSize, typename Extent, typename Allocator>
- void aow_storage<T, WindowSize, Extent, Allocator>::initialize(value_type key,
--                                                               cuda_stream_ref stream) noexcept
-+                                                               cuda_stream_ref stream)
- {
-   this->initialize_async(key, stream);
-   stream.synchronize();
-diff --git a/include/cuco/static_map.cuh b/include/cuco/static_map.cuh
-index c86e90c..95da423 100644
---- a/include/cuco/static_map.cuh
-+++ b/include/cuco/static_map.cuh
-@@ -269,7 +269,7 @@ class static_map {
-    *
-    * @param stream CUDA stream this operation is executed in
-    */
--  void clear(cuda_stream_ref stream = {}) noexcept;
-+  void clear(cuda_stream_ref stream = {});
- 
-   /**
-    * @brief Asynchronously erases all elements from the container. After this call, `size()` returns
-@@ -387,7 +387,7 @@ class static_map {
-    * @param stream CUDA stream used for insert
-    */
-   template <typename InputIt>
--  void insert_or_assign(InputIt first, InputIt last, cuda_stream_ref stream = {}) noexcept;
-+  void insert_or_assign(InputIt first, InputIt last, cuda_stream_ref stream = {});
- 
-   /**
-    * @brief For any key-value pair `{k, v}` in the range `[first, last)`, if a key equivalent to `k`
-@@ -690,7 +690,7 @@ class static_map {
-    * @param stream CUDA stream used to get the number of inserted elements
-    * @return The number of elements in the container
-    */
--  [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const noexcept;
-+  [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const;
- 
-   /**
-    * @brief Gets the maximum number of elements the hash map can hold.
-diff --git a/include/cuco/static_multiset.cuh b/include/cuco/static_multiset.cuh
-index 0daf103..fbcbc9c 100644
---- a/include/cuco/static_multiset.cuh
-+++ b/include/cuco/static_multiset.cuh
-@@ -235,7 +235,7 @@ class static_multiset {
-    *
-    * @param stream CUDA stream this operation is executed in
-    */
--  void clear(cuda_stream_ref stream = {}) noexcept;
-+  void clear(cuda_stream_ref stream = {});
- 
-   /**
-    * @brief Asynchronously erases all elements from the container. After this call, `size()` returns
-@@ -339,7 +339,7 @@ class static_multiset {
-    * @param stream CUDA stream used to get the number of inserted elements
-    * @return The number of elements in the container
-    */
--  [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const noexcept;
-+  [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const;
- 
-   /**
-    * @brief Gets the maximum number of elements the multiset can hold.
-diff --git a/include/cuco/static_set.cuh b/include/cuco/static_set.cuh
-index a069939..3517f84 100644
---- a/include/cuco/static_set.cuh
-+++ b/include/cuco/static_set.cuh
-@@ -240,7 +240,7 @@ class static_set {
-    *
-    * @param stream CUDA stream this operation is executed in
-    */
--  void clear(cuda_stream_ref stream = {}) noexcept;
-+  void clear(cuda_stream_ref stream = {});
- 
-   /**
-    * @brief Asynchronously erases all elements from the container. After this call, `size()` returns
-@@ -687,7 +687,7 @@ class static_set {
-    * @param stream CUDA stream used to get the number of inserted elements
-    * @return The number of elements in the container
-    */
--  [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const noexcept;
-+  [[nodiscard]] size_type size(cuda_stream_ref stream = {}) const;
- 
-   /**
-    * @brief Gets the maximum number of elements the hash set can hold.
diff --git a/cpp/cmake/thirdparty/patches/cuco_override.json b/cpp/cmake/thirdparty/patches/cuco_override.json
deleted file mode 100644
index ae0a9a4b4f0..00000000000
--- a/cpp/cmake/thirdparty/patches/cuco_override.json
+++ /dev/null
@@ -1,14 +0,0 @@
-
-{
-  "packages" : {
-    "cuco" : {
-      "patches" : [
-        {
-          "file" : "${current_json_dir}/cuco_noexcept.diff",
-          "issue" : "Remove erroneous noexcept clauses on cuco functions that may throw [https://github.com/rapidsai/cudf/issues/16059]",
-          "fixed_in" : ""
-        }
-      ]
-    }
-  }
-}

From cd2d53b23fb37b8f68fe59571454f7b95ff98e2f Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Thu, 11 Jul 2024 21:25:35 +0100
Subject: [PATCH 497/842] Expose reflection to check if casting between two
 types is supported (#16239)

In cudf-polars we need to check if a cast between two datatypes is supported (and fallback, or generate different code if not).

Let's ask libcudf to be the source of truth for when a cast is supported.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Matthew Roeschke (https://github.com/mroeschke)
  - MithunR (https://github.com/mythrocks)

URL: https://github.com/rapidsai/cudf/pull/16239
---
 cpp/include/cudf/unary.hpp                    | 10 +++++++++
 cpp/src/unary/cast_ops.cu                     | 16 ++++++++++++++
 .../cudf/_lib/pylibcudf/libcudf/unary.pxd     |  2 ++
 python/cudf/cudf/_lib/pylibcudf/unary.pxd     |  4 ++++
 python/cudf/cudf/_lib/pylibcudf/unary.pyx     | 21 +++++++++++++++++++
 .../cudf/cudf/pylibcudf_tests/test_unary.py   | 19 +++++++++++++++++
 6 files changed, 72 insertions(+)
 create mode 100644 python/cudf/cudf/pylibcudf_tests/test_unary.py

diff --git a/cpp/include/cudf/unary.hpp b/cpp/include/cudf/unary.hpp
index 8a515335351..1609c72f175 100644
--- a/cpp/include/cudf/unary.hpp
+++ b/cpp/include/cudf/unary.hpp
@@ -211,6 +211,16 @@ std::unique_ptr<column> cast(
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @brief Check if a cast between two datatypes is supported.
+ *
+ * @param from source type
+ * @param to   target type
+ *
+ * @returns true if the cast is supported.
+ */
+bool is_supported_cast(data_type from, data_type to) noexcept;
+
 /**
  * @brief Creates a column of `type_id::BOOL8` elements indicating the presence of `NaN` values
  * in a column of floating point values.
diff --git a/cpp/src/unary/cast_ops.cu b/cpp/src/unary/cast_ops.cu
index 64427326d87..ec21813705a 100644
--- a/cpp/src/unary/cast_ops.cu
+++ b/cpp/src/unary/cast_ops.cu
@@ -28,6 +28,7 @@
 #include <cudf/unary.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/traits.hpp>
+#include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
@@ -459,6 +460,14 @@ std::unique_ptr<column> cast(column_view const& input,
   return type_dispatcher(input.type(), detail::dispatch_unary_cast_from{input}, type, stream, mr);
 }
 
+struct is_supported_cast_impl {
+  template <typename From, typename To>
+  bool operator()() const
+  {
+    return is_supported_cast<From, To>();
+  }
+};
+
 }  // namespace detail
 
 std::unique_ptr<column> cast(column_view const& input,
@@ -470,4 +479,11 @@ std::unique_ptr<column> cast(column_view const& input,
   return detail::cast(input, type, stream, mr);
 }
 
+bool is_supported_cast(data_type from, data_type to) noexcept
+{
+  // No matching detail API call/nvtx annotation, since this doesn't
+  // launch a kernel.
+  return double_type_dispatcher(from, to, detail::is_supported_cast_impl{});
+}
+
 }  // namespace cudf
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/unary.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/unary.pxd
index 7f8ae2b7617..2a1b189af51 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/unary.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/unary.pxd
@@ -1,6 +1,7 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libc.stdint cimport int32_t
+from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 
 from cudf._lib.pylibcudf.libcudf.column.column cimport column
@@ -43,5 +44,6 @@ cdef extern from "cudf/unary.hpp" namespace "cudf" nogil:
     cdef extern unique_ptr[column] cast(
         column_view input,
         data_type out_type) except +
+    cdef extern bool is_supported_cast(data_type from_, data_type to) noexcept
     cdef extern unique_ptr[column] is_nan(column_view input) except +
     cdef extern unique_ptr[column] is_not_nan(column_view input) except +
diff --git a/python/cudf/cudf/_lib/pylibcudf/unary.pxd b/python/cudf/cudf/_lib/pylibcudf/unary.pxd
index 4aa4543bb80..d07df838172 100644
--- a/python/cudf/cudf/_lib/pylibcudf/unary.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/unary.pxd
@@ -1,5 +1,7 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
+from libcpp cimport bool
+
 from cudf._lib.pylibcudf.libcudf.unary cimport unary_operator
 
 from .column cimport Column
@@ -17,3 +19,5 @@ cpdef Column cast(Column input, DataType data_type)
 cpdef Column is_nan(Column input)
 
 cpdef Column is_not_nan(Column input)
+
+cpdef bool is_supported_cast(DataType from_, DataType to)
diff --git a/python/cudf/cudf/_lib/pylibcudf/unary.pyx b/python/cudf/cudf/_lib/pylibcudf/unary.pyx
index 0879b501a49..8da46f0a832 100644
--- a/python/cudf/cudf/_lib/pylibcudf/unary.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/unary.pyx
@@ -1,5 +1,6 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
+from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 
@@ -154,3 +155,23 @@ cpdef Column is_not_nan(Column input):
         result = move(cpp_unary.is_not_nan(input.view()))
 
     return Column.from_libcudf(move(result))
+
+cpdef bool is_supported_cast(DataType from_, DataType to):
+    """Check if a cast between datatypes is supported.
+
+    For details, see :cpp:func:`is_supported_cast`.
+
+    Parameters
+    ----------
+    from_
+        The source datatype
+    to
+        The target datatype
+
+    Returns
+    -------
+    bool
+        True if the cast is supported.
+    """
+    with nogil:
+        return cpp_unary.is_supported_cast(from_.c_obj, to.c_obj)
diff --git a/python/cudf/cudf/pylibcudf_tests/test_unary.py b/python/cudf/cudf/pylibcudf_tests/test_unary.py
new file mode 100644
index 00000000000..b5e4f0cb0e8
--- /dev/null
+++ b/python/cudf/cudf/pylibcudf_tests/test_unary.py
@@ -0,0 +1,19 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from cudf._lib import pylibcudf as plc
+
+
+def test_is_supported_cast():
+    assert plc.unary.is_supported_cast(
+        plc.DataType(plc.TypeId.INT8), plc.DataType(plc.TypeId.UINT64)
+    )
+    assert plc.unary.is_supported_cast(
+        plc.DataType(plc.TypeId.DURATION_MILLISECONDS),
+        plc.DataType(plc.TypeId.UINT64),
+    )
+    assert not plc.unary.is_supported_cast(
+        plc.DataType(plc.TypeId.INT32), plc.DataType(plc.TypeId.TIMESTAMP_DAYS)
+    )
+    assert not plc.unary.is_supported_cast(
+        plc.DataType(plc.TypeId.INT32), plc.DataType(plc.TypeId.STRING)
+    )

From dddeb120d0cf8fc33f7f1a07149221fdb2a29e7a Mon Sep 17 00:00:00 2001
From: Matt Topol <zotthewizard@gmail.com>
Date: Thu, 11 Jul 2024 16:25:12 -0700
Subject: [PATCH 498/842] Fix ArrowDeviceArray interface to pass address of
 event (#16058)

the `sync_event` member of `ArrowDeviceArray` needs to be a pointer to a `cudaEvent_t`, currently we're returning the `cudaEvent_t` directly. We need to be passing the address of the event. Thankfully this is a single line change, plus adding a test to confirm.

Authors:
  - Matt Topol (https://github.com/zeroshade)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Mike Wilson (https://github.com/hyperbolic2346)

URL: https://github.com/rapidsai/cudf/pull/16058
---
 cpp/src/interop/to_arrow_device.cu         |  2 +-
 cpp/tests/interop/to_arrow_device_test.cpp | 26 ++++++++++++++++++++++
 2 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/cpp/src/interop/to_arrow_device.cu b/cpp/src/interop/to_arrow_device.cu
index ebfd6605977..b9d3a59e647 100644
--- a/cpp/src/interop/to_arrow_device.cu
+++ b/cpp/src/interop/to_arrow_device.cu
@@ -603,7 +603,7 @@ unique_device_array_t create_device_array(nanoarrow::UniqueArray&& out,
   });
   result->device_id          = rmm::get_current_cuda_device().value();
   result->device_type        = ARROW_DEVICE_CUDA;
-  result->sync_event         = private_data->sync_event;
+  result->sync_event         = &private_data->sync_event;
   result->array              = private_data->parent;  // makes a shallow copy
   result->array.private_data = private_data.release();
   result->array.release      = &detail::ArrowDeviceArrayRelease;
diff --git a/cpp/tests/interop/to_arrow_device_test.cpp b/cpp/tests/interop/to_arrow_device_test.cpp
index 860544b8606..8903f09b82b 100644
--- a/cpp/tests/interop/to_arrow_device_test.cpp
+++ b/cpp/tests/interop/to_arrow_device_test.cpp
@@ -352,11 +352,15 @@ TEST_F(ToArrowDeviceTest, EmptyTable)
   auto got_arrow_device = cudf::to_arrow_device(table->view());
   EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_device->device_id);
   EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_device->device_type);
+  ASSERT_CUDA_SUCCEEDED(
+    cudaEventSynchronize(*reinterpret_cast<cudaEvent_t*>(got_arrow_device->sync_event)));
   compare_arrays(schema.get(), arr.get(), &got_arrow_device->array);
 
   got_arrow_device = cudf::to_arrow_device(std::move(*table));
   EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_device->device_id);
   EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_device->device_type);
+  ASSERT_CUDA_SUCCEEDED(
+    cudaEventSynchronize(*reinterpret_cast<cudaEvent_t*>(got_arrow_device->sync_event)));
   compare_arrays(schema.get(), arr.get(), &got_arrow_device->array);
 }
 
@@ -386,6 +390,8 @@ TEST_F(ToArrowDeviceTest, DateTimeTable)
   auto got_arrow_array = cudf::to_arrow_device(input.view());
   EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id);
   EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type);
+  ASSERT_CUDA_SUCCEEDED(
+    cudaEventSynchronize(*reinterpret_cast<cudaEvent_t*>(got_arrow_array->sync_event)));
 
   EXPECT_EQ(data.size(), got_arrow_array->array.length);
   EXPECT_EQ(0, got_arrow_array->array.null_count);
@@ -402,6 +408,8 @@ TEST_F(ToArrowDeviceTest, DateTimeTable)
   got_arrow_array = cudf::to_arrow_device(std::move(input));
   EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id);
   EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type);
+  ASSERT_CUDA_SUCCEEDED(
+    cudaEventSynchronize(*reinterpret_cast<cudaEvent_t*>(got_arrow_array->sync_event)));
 
   EXPECT_EQ(data.size(), got_arrow_array->array.length);
   EXPECT_EQ(0, got_arrow_array->array.null_count);
@@ -456,6 +464,8 @@ TYPED_TEST(ToArrowDeviceTestDurationsTest, DurationTable)
   auto got_arrow_array = cudf::to_arrow_device(input.view());
   EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id);
   EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type);
+  ASSERT_CUDA_SUCCEEDED(
+    cudaEventSynchronize(*reinterpret_cast<cudaEvent_t*>(got_arrow_array->sync_event)));
 
   EXPECT_EQ(data.size(), got_arrow_array->array.length);
   EXPECT_EQ(0, got_arrow_array->array.null_count);
@@ -472,6 +482,8 @@ TYPED_TEST(ToArrowDeviceTestDurationsTest, DurationTable)
   got_arrow_array = cudf::to_arrow_device(std::move(input));
   EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id);
   EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type);
+  ASSERT_CUDA_SUCCEEDED(
+    cudaEventSynchronize(*reinterpret_cast<cudaEvent_t*>(got_arrow_array->sync_event)));
 
   EXPECT_EQ(data.size(), got_arrow_array->array.length);
   EXPECT_EQ(0, got_arrow_array->array.null_count);
@@ -538,6 +550,8 @@ TEST_F(ToArrowDeviceTest, NestedList)
   auto got_arrow_array = cudf::to_arrow_device(input.view());
   EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id);
   EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type);
+  ASSERT_CUDA_SUCCEEDED(
+    cudaEventSynchronize(*reinterpret_cast<cudaEvent_t*>(got_arrow_array->sync_event)));
   compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array);
 
   got_arrow_array = cudf::to_arrow_device(std::move(input));
@@ -682,11 +696,15 @@ TEST_F(ToArrowDeviceTest, StructColumn)
   auto got_arrow_array = cudf::to_arrow_device(input.view());
   EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id);
   EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type);
+  ASSERT_CUDA_SUCCEEDED(
+    cudaEventSynchronize(*reinterpret_cast<cudaEvent_t*>(got_arrow_array->sync_event)));
   compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array);
 
   got_arrow_array = cudf::to_arrow_device(std::move(input));
   EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id);
   EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type);
+  ASSERT_CUDA_SUCCEEDED(
+    cudaEventSynchronize(*reinterpret_cast<cudaEvent_t*>(got_arrow_array->sync_event)));
   compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array);
 }
 
@@ -755,11 +773,15 @@ TEST_F(ToArrowDeviceTest, FixedPoint64Table)
     auto got_arrow_array = cudf::to_arrow_device(input.view());
     ASSERT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id);
     ASSERT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type);
+    ASSERT_CUDA_SUCCEEDED(
+      cudaEventSynchronize(*reinterpret_cast<cudaEvent_t*>(got_arrow_array->sync_event)));
     compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array);
 
     got_arrow_array = cudf::to_arrow_device(std::move(input));
     ASSERT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id);
     ASSERT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type);
+    ASSERT_CUDA_SUCCEEDED(
+      cudaEventSynchronize(*reinterpret_cast<cudaEvent_t*>(got_arrow_array->sync_event)));
     compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array);
   }
 }
@@ -802,11 +824,15 @@ TEST_F(ToArrowDeviceTest, FixedPoint128Table)
     auto got_arrow_array = cudf::to_arrow_device(input.view());
     EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id);
     EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type);
+    ASSERT_CUDA_SUCCEEDED(
+      cudaEventSynchronize(*reinterpret_cast<cudaEvent_t*>(got_arrow_array->sync_event)));
     compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array);
 
     got_arrow_array = cudf::to_arrow_device(std::move(input));
     EXPECT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id);
     EXPECT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type);
+    ASSERT_CUDA_SUCCEEDED(
+      cudaEventSynchronize(*reinterpret_cast<cudaEvent_t*>(got_arrow_array->sync_event)));
     compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array);
   }
 }

From 30e3209894d78fe7d5927cde62b6c5975257958a Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Fri, 12 Jul 2024 14:59:27 +0100
Subject: [PATCH 499/842] Assert valid metadata is passed in to_arrow for
 list_view (#16198)

When converting a list column to arrow with metadata, one must provide metadata information for both the offset and value columns, or none at all. This is not completely obvious (perhaps we only need the metadata for the inner value column), so explicitly assert this case.

- Closes #16069

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - MithunR (https://github.com/mythrocks)
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/16198
---
 cpp/src/interop/to_arrow.cu | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu
index 62b85891adb..8c4be1b50a5 100644
--- a/cpp/src/interop/to_arrow.cu
+++ b/cpp/src/interop/to_arrow.cu
@@ -365,6 +365,9 @@ std::shared_ptr<arrow::Array> dispatch_to_arrow::operator()<cudf::list_view>(
   arrow::MemoryPool* ar_mr,
   rmm::cuda_stream_view stream)
 {
+  CUDF_EXPECTS(metadata.children_meta.empty() ||
+                 metadata.children_meta.size() == static_cast<std::size_t>(input.num_children()),
+               "Number of field names and number of children do not match\n");
   std::unique_ptr<column> tmp_column = nullptr;
   if ((input.offset() != 0) or
       ((input.num_children() == 2) and (input.child(0).size() - 1 != input.size()))) {

From 1ff74612a17336131ef2d1b00f83be177e1af128 Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Fri, 12 Jul 2024 08:12:18 -0700
Subject: [PATCH 500/842] Improve the test data for pylibcudf I/O tests
 (#16247)

Don't just use random integers for every data type.

Decided not to use hypothesis since I don't think there's a good way to re-use the table across calls
(and I would like to keep the runtime of pylibcudf tests down).

Authors:
  - Thomas Li (https://github.com/lithomas1)

Approvers:
  - https://github.com/brandon-b-miller

URL: https://github.com/rapidsai/cudf/pull/16247
---
 .../cudf/cudf/pylibcudf_tests/common/utils.py | 40 +++++++++
 python/cudf/cudf/pylibcudf_tests/conftest.py  | 70 ++++++++++++++-
 .../cudf/cudf/pylibcudf_tests/io/test_json.py | 85 ++++---------------
 3 files changed, 124 insertions(+), 71 deletions(-)

diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py
index 46603ff32b8..efb192b3251 100644
--- a/python/cudf/cudf/pylibcudf_tests/common/utils.py
+++ b/python/cudf/cudf/pylibcudf_tests/common/utils.py
@@ -174,6 +174,21 @@ def is_nested_list(typ):
     return nesting_level(typ)[0] > 1
 
 
+def write_source_str(source, input_str):
+    """
+    Write a string to the source
+    (useful for testing CSV/JSON I/O)
+    """
+    if not isinstance(source, io.IOBase):
+        with open(source, "w") as source_f:
+            source_f.write(input_str)
+    else:
+        if isinstance(source, io.BytesIO):
+            input_str = input_str.encode("utf-8")
+        source.write(input_str)
+        source.seek(0)
+
+
 def sink_to_str(sink):
     """
     Takes a sink (e.g. StringIO/BytesIO, filepath, etc.)
@@ -192,6 +207,31 @@ def sink_to_str(sink):
     return str_result
 
 
+def make_source(path_or_buf, pa_table, format, **kwargs):
+    """
+    Write a pyarrow Table to a specific format using pandas
+    by dispatching to the appropriate to_* call.
+    The caller is responsible for making sure that no arguments
+    unsupported by pandas are passed in.
+    """
+    df = pa_table.to_pandas()
+    mode = "w"
+    if "compression" in kwargs:
+        kwargs["compression"] = COMPRESSION_TYPE_TO_PANDAS[
+            kwargs["compression"]
+        ]
+        if kwargs["compression"] is not None and format != "json":
+            # pandas json method only supports mode="w"/"a"
+            mode = "wb"
+    if format == "json":
+        df.to_json(path_or_buf, mode=mode, **kwargs)
+    elif format == "csv":
+        df.to_csv(path_or_buf, mode=mode, **kwargs)
+    if isinstance(path_or_buf, io.IOBase):
+        path_or_buf.seek(0)
+    return path_or_buf
+
+
 NUMERIC_PA_TYPES = [pa.int64(), pa.float64(), pa.uint64()]
 STRING_PA_TYPES = [pa.string()]
 BOOL_PA_TYPES = [pa.bool_()]
diff --git a/python/cudf/cudf/pylibcudf_tests/conftest.py b/python/cudf/cudf/pylibcudf_tests/conftest.py
index 39832eb4bba..3ef1e40b630 100644
--- a/python/cudf/cudf/pylibcudf_tests/conftest.py
+++ b/python/cudf/cudf/pylibcudf_tests/conftest.py
@@ -11,6 +11,7 @@
 import pytest
 
 import cudf._lib.pylibcudf as plc
+from cudf._lib.pylibcudf.io.types import CompressionType
 
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), "common"))
 
@@ -37,6 +38,37 @@ def numeric_pa_type(request):
     return request.param
 
 
+def _get_vals_of_type(pa_type, length, seed):
+    """
+    Returns an list-like of random values of that type
+    """
+    rng = np.random.default_rng(seed=seed)
+    if pa_type == pa.int64():
+        half = length // 2
+        negs = rng.integers(-length, 0, half, dtype=np.int64)
+        pos = rng.integers(0, length, length - half, dtype=np.int64)
+        return np.concatenate([negs, pos])
+    elif pa_type == pa.uint64():
+        return rng.integers(0, length, length, dtype=np.uint64)
+    elif pa_type == pa.float64():
+        # Round to 6 decimal places or else we have problems comparing our
+        # output to pandas due to floating point/rounding differences
+        return rng.uniform(-length, length, length).round(6)
+    elif pa_type == pa.bool_():
+        return rng.integers(0, 2, length, dtype=bool)
+    elif pa_type == pa.string():
+        # Generate random ASCII strings
+        strs = []
+        for _ in range(length):
+            chrs = rng.integers(33, 128, length)
+            strs.append("".join(chr(x) for x in chrs))
+        return strs
+    else:
+        raise NotImplementedError(
+            f"random data generation not implemented for {pa_type}"
+        )
+
+
 # TODO: Consider adding another fixture/adapting this
 # fixture to consider nullability
 @pytest.fixture(scope="session", params=[0, 100])
@@ -57,10 +89,9 @@ def table_data(request):
     # plc.io.TableWithMetadata
     colnames = []
 
-    np.random.seed(42)
+    seed = 42
 
     for typ in ALL_PA_TYPES:
-        rand_vals = np.random.randint(0, nrows, nrows)
         child_colnames = []
 
         def _generate_nested_data(typ):
@@ -88,13 +119,17 @@ def _generate_nested_data(typ):
                 child_colnames.append(("", grandchild_colnames))
             else:
                 # typ is scalar type
-                pa_array = pa.array(rand_vals).cast(typ)
+                pa_array = pa.array(
+                    _get_vals_of_type(typ, nrows, seed=seed), type=typ
+                )
             return pa_array, child_colnames
 
         if isinstance(typ, (pa.ListType, pa.StructType)):
             rand_arr, child_colnames = _generate_nested_data(typ)
         else:
-            rand_arr = pa.array(rand_vals).cast(typ)
+            rand_arr = pa.array(
+                _get_vals_of_type(typ, nrows, seed=seed), type=typ
+            )
 
         table_dict[f"col_{typ}"] = rand_arr
         colnames.append((f"col_{typ}", child_colnames))
@@ -121,6 +156,33 @@ def source_or_sink(request, tmp_path):
         return fp_or_buf()
 
 
+unsupported_types = {
+    # Not supported by pandas
+    # TODO: find a way to test these
+    CompressionType.SNAPPY,
+    CompressionType.BROTLI,
+    CompressionType.LZ4,
+    CompressionType.LZO,
+    CompressionType.ZLIB,
+}
+
+unsupported_text_compression_types = unsupported_types.union(
+    {
+        # compressions not supported by libcudf
+        # for csv/json
+        CompressionType.XZ,
+        CompressionType.ZSTD,
+    }
+)
+
+
+@pytest.fixture(
+    params=set(CompressionType).difference(unsupported_text_compression_types)
+)
+def text_compression_type(request):
+    return request.param
+
+
 @pytest.fixture(params=[opt for opt in plc.io.types.CompressionType])
 def compression_type(request):
     return request.param
diff --git a/python/cudf/cudf/pylibcudf_tests/io/test_json.py b/python/cudf/cudf/pylibcudf_tests/io/test_json.py
index c13eaf40625..4239f2438bb 100644
--- a/python/cudf/cudf/pylibcudf_tests/io/test_json.py
+++ b/python/cudf/cudf/pylibcudf_tests/io/test_json.py
@@ -5,45 +5,17 @@
 import pyarrow as pa
 import pytest
 from utils import (
-    COMPRESSION_TYPE_TO_PANDAS,
     assert_table_and_meta_eq,
+    make_source,
     sink_to_str,
+    write_source_str,
 )
 
 import cudf._lib.pylibcudf as plc
 from cudf._lib.pylibcudf.io.types import CompressionType
 
-
-def make_json_source(path_or_buf, pa_table, **kwargs):
-    """
-    Uses pandas to write a pyarrow Table to a JSON file.
-
-    The caller is responsible for making sure that no arguments
-    unsupported by pandas are passed in.
-    """
-    df = pa_table.to_pandas()
-    if "compression" in kwargs:
-        kwargs["compression"] = COMPRESSION_TYPE_TO_PANDAS[
-            kwargs["compression"]
-        ]
-    df.to_json(path_or_buf, orient="records", **kwargs)
-    if isinstance(path_or_buf, io.IOBase):
-        path_or_buf.seek(0)
-    return path_or_buf
-
-
-def write_json_bytes(source, json_str):
-    """
-    Write a JSON string to the source
-    """
-    if not isinstance(source, io.IOBase):
-        with open(source, "w") as source_f:
-            source_f.write(json_str)
-    else:
-        if isinstance(source, io.BytesIO):
-            json_str = json_str.encode("utf-8")
-        source.write(json_str)
-        source.seek(0)
+# Shared kwargs to pass to make_source
+_COMMON_JSON_SOURCE_KWARGS = {"format": "json", "orient": "records"}
 
 
 @pytest.mark.parametrize("rows_per_chunk", [8, 100])
@@ -156,21 +128,9 @@ def test_write_json_bool_opts(true_value, false_value):
 
 @pytest.mark.parametrize("lines", [True, False])
 def test_read_json_basic(
-    table_data, source_or_sink, lines, compression_type, request
+    table_data, source_or_sink, lines, text_compression_type
 ):
-    if compression_type in {
-        # Not supported by libcudf
-        CompressionType.SNAPPY,
-        CompressionType.XZ,
-        CompressionType.ZSTD,
-        # Not supported by pandas
-        # TODO: find a way to test these
-        CompressionType.BROTLI,
-        CompressionType.LZ4,
-        CompressionType.LZO,
-        CompressionType.ZLIB,
-    }:
-        pytest.skip("unsupported compression type by pandas/libcudf")
+    compression_type = text_compression_type
 
     # can't compress non-binary data with pandas
     if isinstance(source_or_sink, io.StringIO):
@@ -178,22 +138,12 @@ def test_read_json_basic(
 
     _, pa_table = table_data
 
-    source = make_json_source(
-        source_or_sink, pa_table, lines=lines, compression=compression_type
-    )
-
-    request.applymarker(
-        pytest.mark.xfail(
-            condition=(
-                len(pa_table) > 0
-                and compression_type
-                not in {CompressionType.NONE, CompressionType.AUTO}
-            ),
-            # note: wasn't able to narrow down the specific types that were failing
-            # seems to be a little non-deterministic, but always fails with
-            # cudaErrorInvalidValue invalid argument
-            reason="libcudf json reader crashes on compressed non empty table_data",
-        )
+    source = make_source(
+        source_or_sink,
+        pa_table,
+        lines=lines,
+        compression=compression_type,
+        **_COMMON_JSON_SOURCE_KWARGS,
     )
 
     if isinstance(source, io.IOBase):
@@ -237,10 +187,11 @@ def test_read_json_dtypes(table_data, source_or_sink):
     # Simple test for dtypes where we read in
     # all numeric data as floats
     _, pa_table = table_data
-    source = make_json_source(
+    source = make_source(
         source_or_sink,
         pa_table,
         lines=True,
+        **_COMMON_JSON_SOURCE_KWARGS,
     )
 
     dtypes = []
@@ -295,7 +246,7 @@ def test_read_json_lines_byte_range(source_or_sink, chunk_size):
         pytest.skip("byte_range doesn't work on StringIO")
 
     json_str = "[1, 2, 3]\n[4, 5, 6]\n[7, 8, 9]\n"
-    write_json_bytes(source, json_str)
+    write_source_str(source, json_str)
 
     tbls_w_meta = []
     for chunk_start in range(0, len(json_str.encode("utf-8")), chunk_size):
@@ -331,7 +282,7 @@ def test_read_json_lines_keep_quotes(keep_quotes, source_or_sink):
     source = source_or_sink
 
     json_bytes = '["a", "b", "c"]\n'
-    write_json_bytes(source, json_bytes)
+    write_source_str(source, json_bytes)
 
     tbl_w_meta = plc.io.json.read_json(
         plc.io.SourceInfo([source]), lines=True, keep_quotes=keep_quotes
@@ -359,8 +310,8 @@ def test_read_json_lines_keep_quotes(keep_quotes, source_or_sink):
 def test_read_json_lines_recovery_mode(recovery_mode, source_or_sink):
     source = source_or_sink
 
-    json_bytes = '{"a":1,"b":10}\n{"a":2,"b":11}\nabc\n{"a":3,"b":12}\n'
-    write_json_bytes(source, json_bytes)
+    json_str = '{"a":1,"b":10}\n{"a":2,"b":11}\nabc\n{"a":3,"b":12}\n'
+    write_source_str(source, json_str)
 
     if recovery_mode == plc.io.types.JSONRecoveryMode.FAIL:
         with pytest.raises(RuntimeError):

From 4fc8e790bf0671bba85a94e29deb4f3bc511a416 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Fri, 12 Jul 2024 17:08:55 +0100
Subject: [PATCH 501/842] Handle nans in groupby-aggregations in polars
 executor (#16233)

Polars `min` and `max` by default ignore nans (treating them as nulls), to mimic this behaviour we must mask out nans before performing a min/max aggregation.

Do this by exposing `nans_to_nulls` in pylibcudf and implementing a `with_mask` method on pylibcudf Columns.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Thomas Li (https://github.com/lithomas1)

URL: https://github.com/rapidsai/cudf/pull/16233
---
 .../user_guide/api_docs/pylibcudf/index.rst   |  1 +
 .../api_docs/pylibcudf/transform.rst          |  6 +++
 .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt   |  1 +
 python/cudf/cudf/_lib/pylibcudf/__init__.pxd  |  2 +
 python/cudf/cudf/_lib/pylibcudf/__init__.py   |  2 +
 python/cudf/cudf/_lib/pylibcudf/column.pxd    |  1 +
 python/cudf/cudf/_lib/pylibcudf/column.pyx    | 32 +++++++++++--
 .../cudf/_lib/pylibcudf/gpumemoryview.pyx     |  1 +
 python/cudf/cudf/_lib/pylibcudf/transform.pxd |  7 +++
 python/cudf/cudf/_lib/pylibcudf/transform.pyx | 35 +++++++++++++++
 python/cudf/cudf/_lib/transform.pyx           | 17 +++----
 python/cudf/cudf/pylibcudf_tests/conftest.py  | 11 ++++-
 .../cudf/pylibcudf_tests/test_transform.py    | 32 +++++++++++++
 .../cudf_polars/containers/column.py          | 45 +++++++++++++------
 python/cudf_polars/cudf_polars/dsl/expr.py    | 12 ++++-
 .../tests/containers/test_column.py           | 20 ++++++---
 .../cudf_polars/tests/expressions/test_agg.py | 25 ++++++++---
 python/cudf_polars/tests/test_groupby.py      | 24 ++++++++++
 18 files changed, 230 insertions(+), 44 deletions(-)
 create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/transform.rst
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/transform.pxd
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/transform.pyx
 create mode 100644 python/cudf/cudf/pylibcudf_tests/test_transform.py

diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
index bd6f0f77357..5899d272160 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
@@ -34,6 +34,7 @@ This page provides API documentation for pylibcudf.
     stream_compaction
     table
     traits
+    transform
     types
     unary
 
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/transform.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/transform.rst
new file mode 100644
index 00000000000..ef04bbad7e6
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/transform.rst
@@ -0,0 +1,6 @@
+=========
+transform
+=========
+
+.. automodule:: cudf._lib.pylibcudf.transform
+   :members:
diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
index d22096081af..a2d11bbea6e 100644
--- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
@@ -39,6 +39,7 @@ set(cython_sources
     sorting.pyx
     table.pyx
     traits.pyx
+    transform.pyx
     types.pyx
     unary.pyx
     utils.pyx
diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd
index d4d615cde34..da2b7806203 100644
--- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd
@@ -24,6 +24,7 @@ from . cimport (
     stream_compaction,
     strings,
     traits,
+    transform,
     types,
     unary,
 )
@@ -63,6 +64,7 @@ __all__ = [
     "strings",
     "sorting",
     "traits",
+    "transform",
     "types",
     "unary",
 ]
diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py
index 91f8acaf682..acbc84d7177 100644
--- a/python/cudf/cudf/_lib/pylibcudf/__init__.py
+++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py
@@ -24,6 +24,7 @@
     stream_compaction,
     strings,
     traits,
+    transform,
     types,
     unary,
 )
@@ -64,6 +65,7 @@
     "strings",
     "sorting",
     "traits",
+    "transform",
     "types",
     "unary",
 ]
diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pxd b/python/cudf/cudf/_lib/pylibcudf/column.pxd
index d13791d95cf..13ee0a70681 100644
--- a/python/cudf/cudf/_lib/pylibcudf/column.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/column.pxd
@@ -50,6 +50,7 @@ cdef class Column:
     cpdef gpumemoryview null_mask(self)
     cpdef list children(self)
     cpdef Column copy(self)
+    cpdef Column with_mask(self, gpumemoryview, size_type)
 
     cpdef ListColumnView list_view(self)
 
diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pyx b/python/cudf/cudf/_lib/pylibcudf/column.pyx
index e0cf8b7ee32..cb96c1d9fce 100644
--- a/python/cudf/cudf/_lib/pylibcudf/column.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/column.pyx
@@ -175,6 +175,32 @@ cdef class Column:
             children,
         )
 
+    cpdef Column with_mask(self, gpumemoryview mask, size_type null_count):
+        """Augment this column with a new null mask.
+
+        Parameters
+        ----------
+        mask : gpumemoryview
+            New mask (or None to unset the mask)
+        null_count : int
+            New null count. If this is incorrect, bad things happen.
+
+        Returns
+        -------
+        New Column object sharing data with self (except for the mask which is new).
+        """
+        if mask is None and null_count > 0:
+            raise ValueError("Empty mask must have null count of zero")
+        return Column(
+            self._data_type,
+            self._size,
+            self._data,
+            mask,
+            null_count,
+            self._offset,
+            self._children,
+        )
+
     @staticmethod
     cdef Column from_column_view(const column_view& cv, Column owner):
         """Create a Column from a libcudf column_view.
@@ -250,7 +276,7 @@ cdef class Column:
         column is in use.
         """
         data = gpumemoryview(obj)
-        iface = data.__cuda_array_interface__()
+        iface = data.__cuda_array_interface__
         if iface.get('mask') is not None:
             raise ValueError("mask not yet supported.")
 
@@ -400,8 +426,8 @@ def is_c_contiguous(
     itemsize : int
         Size of an element in bytes.
 
-    Return
-    ------
+    Returns
+    -------
     bool
         The boolean answer.
     """
diff --git a/python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pyx b/python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pyx
index a2f5b2ac387..0904022a944 100644
--- a/python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pyx
@@ -22,5 +22,6 @@ cdef class gpumemoryview:
         # TODO: Need to respect readonly
         self.ptr = cai["data"][0]
 
+    @property
     def __cuda_array_interface__(self):
         return self.obj.__cuda_array_interface__
diff --git a/python/cudf/cudf/_lib/pylibcudf/transform.pxd b/python/cudf/cudf/_lib/pylibcudf/transform.pxd
new file mode 100644
index 00000000000..4b21feffe25
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/transform.pxd
@@ -0,0 +1,7 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from .column cimport Column
+from .gpumemoryview cimport gpumemoryview
+
+
+cpdef tuple[gpumemoryview, int] nans_to_nulls(Column input)
diff --git a/python/cudf/cudf/_lib/pylibcudf/transform.pyx b/python/cudf/cudf/_lib/pylibcudf/transform.pyx
new file mode 100644
index 00000000000..a734e71b820
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/transform.pyx
@@ -0,0 +1,35 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move, pair
+
+from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer
+
+from cudf._lib.pylibcudf.libcudf cimport transform as cpp_transform
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
+
+from .column cimport Column
+from .gpumemoryview cimport gpumemoryview
+
+
+cpdef tuple[gpumemoryview, int] nans_to_nulls(Column input):
+    """Create a null mask preserving existing nulls and converting nans to null.
+
+    Parameters
+    ----------
+    input : Column
+        Column to produce new mask from.
+
+    Returns
+    -------
+    Two-tuple of a gpumemoryview wrapping the null mask and the new null count.
+    """
+    cdef pair[unique_ptr[device_buffer], size_type] c_result
+
+    with nogil:
+        c_result = move(cpp_transform.nans_to_nulls(input.view()))
+
+    return (
+        gpumemoryview(DeviceBuffer.c_from_unique_ptr(move(c_result.first))),
+        c_result.second
+    )
diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx
index b325173f20d..86a4a60eef1 100644
--- a/python/cudf/cudf/_lib/transform.pyx
+++ b/python/cudf/cudf/_lib/transform.pyx
@@ -20,6 +20,7 @@ from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer
 cimport cudf._lib.pylibcudf.libcudf.transform as libcudf_transform
 from cudf._lib.column cimport Column
 from cudf._lib.expressions cimport Expression
+from cudf._lib.pylibcudf cimport transform as plc_transform
 from cudf._lib.pylibcudf.libcudf.column.column cimport column
 from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
 from cudf._lib.pylibcudf.libcudf.expressions cimport expression
@@ -82,18 +83,10 @@ def mask_to_bools(object mask_buffer, size_type begin_bit, size_type end_bit):
 
 @acquire_spill_lock()
 def nans_to_nulls(Column input):
-    cdef column_view c_input = input.view()
-    cdef pair[unique_ptr[device_buffer], size_type] c_output
-    cdef unique_ptr[device_buffer] c_buffer
-
-    with nogil:
-        c_output = move(libcudf_transform.nans_to_nulls(c_input))
-        c_buffer = move(c_output.first)
-
-    if c_output.second == 0:
-        return None
-
-    return as_buffer(DeviceBuffer.c_from_unique_ptr(move(c_buffer)))
+    (mask, _) = plc_transform.nans_to_nulls(
+        input.to_pylibcudf(mode="read")
+    )
+    return as_buffer(mask)
 
 
 @acquire_spill_lock()
diff --git a/python/cudf/cudf/pylibcudf_tests/conftest.py b/python/cudf/cudf/pylibcudf_tests/conftest.py
index 3ef1e40b630..53e207f29cb 100644
--- a/python/cudf/cudf/pylibcudf_tests/conftest.py
+++ b/python/cudf/cudf/pylibcudf_tests/conftest.py
@@ -203,6 +203,15 @@ def sorted_opt(request):
     return request.param
 
 
-@pytest.fixture(scope="session", params=[False, True])
+@pytest.fixture(
+    scope="session", params=[False, True], ids=["without_nulls", "with_nulls"]
+)
 def has_nulls(request):
     return request.param
+
+
+@pytest.fixture(
+    scope="session", params=[False, True], ids=["without_nans", "with_nans"]
+)
+def has_nans(request):
+    return request.param
diff --git a/python/cudf/cudf/pylibcudf_tests/test_transform.py b/python/cudf/cudf/pylibcudf_tests/test_transform.py
new file mode 100644
index 00000000000..312939888dd
--- /dev/null
+++ b/python/cudf/cudf/pylibcudf_tests/test_transform.py
@@ -0,0 +1,32 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import math
+
+import pyarrow as pa
+from utils import assert_column_eq
+
+from cudf._lib import pylibcudf as plc
+
+
+def test_nans_to_nulls(has_nans):
+    if has_nans:
+        values = [1, float("nan"), float("nan"), None, 3, None]
+    else:
+        values = [1, 4, 5, None, 3, None]
+
+    replaced = [
+        None if (v is None or (v is not None and math.isnan(v))) else v
+        for v in values
+    ]
+
+    h_input = pa.array(values, type=pa.float32())
+    input = plc.interop.from_arrow(h_input)
+    assert input.null_count() == h_input.null_count
+    expect = pa.array(replaced, type=pa.float32())
+
+    mask, null_count = plc.transform.nans_to_nulls(input)
+
+    assert null_count == expect.null_count
+    got = input.with_mask(mask, null_count)
+
+    assert_column_eq(expect, got)
diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py
index 28685f0c4ed..af67059844e 100644
--- a/python/cudf_polars/cudf_polars/containers/column.py
+++ b/python/cudf_polars/cudf_polars/containers/column.py
@@ -128,24 +128,29 @@ def copy(self) -> Self:
         )
 
     def mask_nans(self) -> Self:
-        """Return a copy of self with nans masked out."""
-        if self.nan_count > 0:
-            raise NotImplementedError("Need to port transform.hpp to pylibcudf")
+        """Return a shallow copy of self with nans masked out."""
+        if plc.traits.is_floating_point(self.obj.type()):
+            old_count = self.obj.null_count()
+            mask, new_count = plc.transform.nans_to_nulls(self.obj)
+            result = type(self)(self.obj.with_mask(mask, new_count))
+            if old_count == new_count:
+                return result.sorted_like(self)
+            return result
         return self.copy()
 
     @functools.cached_property
     def nan_count(self) -> int:
         """Return the number of NaN values in the column."""
-        if self.obj.type().id() not in (plc.TypeId.FLOAT32, plc.TypeId.FLOAT64):
-            return 0
-        return plc.interop.to_arrow(
-            plc.reduce.reduce(
-                plc.unary.is_nan(self.obj),
-                plc.aggregation.sum(),
-                # TODO: pylibcudf needs to have a SizeType DataType singleton
-                plc.DataType(plc.TypeId.INT32),
-            )
-        ).as_py()
+        if plc.traits.is_floating_point(self.obj.type()):
+            return plc.interop.to_arrow(
+                plc.reduce.reduce(
+                    plc.unary.is_nan(self.obj),
+                    plc.aggregation.sum(),
+                    # TODO: pylibcudf needs to have a SizeType DataType singleton
+                    plc.DataType(plc.TypeId.INT32),
+                )
+            ).as_py()
+        return 0
 
 
 class NamedColumn(Column):
@@ -187,3 +192,17 @@ def copy(self, *, new_name: str | None = None) -> Self:
             order=self.order,
             null_order=self.null_order,
         )
+
+    def mask_nans(self) -> Self:
+        """Return a shallow copy of self with nans masked out."""
+        # Annoying, the inheritance is not right (can't call the
+        # super-type mask_nans), but will sort that by refactoring
+        # later.
+        if plc.traits.is_floating_point(self.obj.type()):
+            old_count = self.obj.null_count()
+            mask, new_count = plc.transform.nans_to_nulls(self.obj)
+            result = type(self)(self.obj.with_mask(mask, new_count), self.name)
+            if old_count == new_count:
+                return result.sorted_like(self)
+            return result
+        return self.copy()
diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py
index f83d9e82d30..adf266bab81 100644
--- a/python/cudf_polars/cudf_polars/dsl/expr.py
+++ b/python/cudf_polars/cudf_polars/dsl/expr.py
@@ -867,7 +867,7 @@ def __init__(
         self.name = name
         self.options = options
         self.children = children
-        if self.name not in ("round", "unique"):
+        if self.name not in ("round", "unique", "mask_nans"):
             raise NotImplementedError(f"Unary function {name=}")
 
     def do_evaluate(
@@ -878,6 +878,9 @@ def do_evaluate(
         mapping: Mapping[Expr, Column] | None = None,
     ) -> Column:
         """Evaluate this expression given a dataframe for context."""
+        if self.name == "mask_nans":
+            (child,) = self.children
+            return child.evaluate(df, context=context, mapping=mapping).mask_nans()
         if self.name == "round":
             (decimal_places,) = self.options
             (values,) = (
@@ -1215,12 +1218,19 @@ def collect_agg(self, *, depth: int) -> AggInfo:
             raise NotImplementedError(
                 "Nested aggregations in groupby"
             )  # pragma: no cover; check_agg trips first
+        if (isminmax := self.name in {"min", "max"}) and self.options:
+            raise NotImplementedError("Nan propagation in groupby for min/max")
         (child,) = self.children
         ((expr, _, _),) = child.collect_agg(depth=depth + 1).requests
         if self.request is None:
             raise NotImplementedError(
                 f"Aggregation {self.name} in groupby"
             )  # pragma: no cover; __init__ trips first
+        if isminmax and plc.traits.is_floating_point(self.dtype):
+            assert expr is not None
+            # Ignore nans in these groupby aggs, do this by masking
+            # nans in the input
+            expr = UnaryFunction(self.dtype, "mask_nans", (), expr)
         return AggInfo([(expr, self.request, self)])
 
     def _reduce(
diff --git a/python/cudf_polars/tests/containers/test_column.py b/python/cudf_polars/tests/containers/test_column.py
index 3291d8db161..4f3c0de5975 100644
--- a/python/cudf_polars/tests/containers/test_column.py
+++ b/python/cudf_polars/tests/containers/test_column.py
@@ -3,12 +3,14 @@
 
 from __future__ import annotations
 
+from functools import partial
+
 import pyarrow
 import pytest
 
 import cudf._lib.pylibcudf as plc
 
-from cudf_polars.containers import Column
+from cudf_polars.containers import Column, NamedColumn
 
 
 def test_non_scalar_access_raises():
@@ -54,17 +56,21 @@ def test_shallow_copy():
 
 
 @pytest.mark.parametrize("typeid", [plc.TypeId.INT8, plc.TypeId.FLOAT32])
-def test_mask_nans(typeid):
+@pytest.mark.parametrize("constructor", [Column, partial(NamedColumn, name="name")])
+def test_mask_nans(typeid, constructor):
     dtype = plc.DataType(typeid)
     values = pyarrow.array([0, 0, 0], type=plc.interop.to_arrow(dtype))
-    column = Column(plc.interop.from_arrow(values))
+    column = constructor(plc.interop.from_arrow(values))
     masked = column.mask_nans()
-    assert column.obj is masked.obj
+    assert column.obj.null_count() == masked.obj.null_count()
 
 
-def test_mask_nans_float_with_nan_notimplemented():
+def test_mask_nans_float():
     dtype = plc.DataType(plc.TypeId.FLOAT32)
     values = pyarrow.array([0, 0, float("nan")], type=plc.interop.to_arrow(dtype))
     column = Column(plc.interop.from_arrow(values))
-    with pytest.raises(NotImplementedError):
-        _ = column.mask_nans()
+    masked = column.mask_nans()
+    expect = pyarrow.array([0, 0, None], type=plc.interop.to_arrow(dtype))
+    got = pyarrow.array(plc.interop.to_arrow(masked.obj))
+
+    assert expect == got
diff --git a/python/cudf_polars/tests/expressions/test_agg.py b/python/cudf_polars/tests/expressions/test_agg.py
index 267d0a99692..e53fd7f8615 100644
--- a/python/cudf_polars/tests/expressions/test_agg.py
+++ b/python/cudf_polars/tests/expressions/test_agg.py
@@ -59,14 +59,25 @@ def test_agg(df, agg):
 
 
 @pytest.mark.parametrize(
-    "propagate_nans",
-    [pytest.param(False, marks=pytest.mark.xfail(reason="Need to mask nans")), True],
-    ids=["mask_nans", "propagate_nans"],
+    "op", [pl.Expr.min, pl.Expr.nan_min, pl.Expr.max, pl.Expr.nan_max]
 )
-@pytest.mark.parametrize("op", ["min", "max"])
-def test_agg_float_with_nans(propagate_nans, op):
-    df = pl.LazyFrame({"a": pl.Series([1, 2, float("nan")], dtype=pl.Float64())})
-    op = getattr(pl.Expr, f"nan_{op}" if propagate_nans else op)
+def test_agg_float_with_nans(op):
+    df = pl.LazyFrame(
+        {
+            "a": pl.Series([1, 2, float("nan")], dtype=pl.Float64()),
+            "b": pl.Series([1, 2, None], dtype=pl.Int8()),
+        }
+    )
+    q = df.select(op(pl.col("a")), op(pl.col("b")))
+
+    assert_gpu_result_equal(q)
+
+
+@pytest.mark.xfail(reason="https://github.com/pola-rs/polars/issues/17513")
+@pytest.mark.parametrize("op", [pl.Expr.max, pl.Expr.min])
+def test_agg_singleton(op):
+    df = pl.LazyFrame({"a": pl.Series([float("nan")])})
+
     q = df.select(op(pl.col("a")))
 
     assert_gpu_result_equal(q)
diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py
index b84e2c16b43..81306397b9f 100644
--- a/python/cudf_polars/tests/test_groupby.py
+++ b/python/cudf_polars/tests/test_groupby.py
@@ -99,3 +99,27 @@ def test_groupby_unsupported(df, expr):
     q = df.group_by("key1").agg(expr)
 
     assert_ir_translation_raises(q, NotImplementedError)
+
+
+@pytest.mark.xfail(reason="https://github.com/pola-rs/polars/issues/17513")
+def test_groupby_minmax_with_nan():
+    df = pl.LazyFrame(
+        {"key": [1, 2, 2, 2], "value": [float("nan"), 1, -1, float("nan")]}
+    )
+
+    q = df.group_by("key").agg(
+        pl.col("value").max().alias("max"), pl.col("value").min().alias("min")
+    )
+
+    assert_gpu_result_equal(q)
+
+
+@pytest.mark.parametrize("op", [pl.Expr.nan_max, pl.Expr.nan_min])
+def test_groupby_nan_minmax_raises(op):
+    df = pl.LazyFrame(
+        {"key": [1, 2, 2, 2], "value": [float("nan"), 1, -1, float("nan")]}
+    )
+
+    q = df.group_by("key").agg(op(pl.col("value")))
+
+    assert_ir_translation_raises(q, NotImplementedError)

From f79ca04fe792107a69b5ccf18f41d65c44957dbd Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 12 Jul 2024 07:11:19 -1000
Subject: [PATCH 502/842] Add docstring for from_dataframe (#16260)

xref https://github.com/rapidsai/cudf/issues/16238

Mainly direct users to use `from_pandas` instead of `from_dataframe` if the user has a `pandas.DataFrame`

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16260
---
 python/cudf/cudf/core/dataframe.py   | 21 ++++++++++++++++++++-
 python/cudf/cudf/core/df_protocol.py |  2 +-
 2 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 3e5ff9c18b5..2be59f87483 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -7849,7 +7849,26 @@ def value_counts(
         return result
 
 
-def from_dataframe(df, allow_copy=False):
+def from_dataframe(df, allow_copy: bool = False) -> DataFrame:
+    """
+    Build a :class:`DataFrame` from an object supporting the dataframe interchange protocol.
+
+    .. note::
+
+        If you have a ``pandas.DataFrame``, use :func:`from_pandas` instead.
+
+    Parameters
+    ----------
+    df : DataFrameXchg
+        Object supporting the interchange protocol, i.e. ``__dataframe__`` method.
+    allow_copy : bool, default: True
+        Whether to allow copying the memory to perform the conversion
+        (if false then zero-copy approach is requested).
+
+    Returns
+    -------
+    :class:`DataFrame`
+    """
     return df_protocol.from_dataframe(df, allow_copy=allow_copy)
 
 
diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py
index 9cd573aceb9..a70a42c04af 100644
--- a/python/cudf/cudf/core/df_protocol.py
+++ b/python/cudf/cudf/core/df_protocol.py
@@ -648,7 +648,7 @@ def __dataframe__(
 
 def from_dataframe(
     df: DataFrameObject, allow_copy: bool = False
-) -> _CuDFDataFrame:
+) -> cudf.DataFrame:
     """
     Construct a ``DataFrame`` from ``df`` if it supports the
     dataframe interchange protocol (``__dataframe__``).

From 1737e70a006740b157624599b86929c01940fa3c Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Fri, 12 Jul 2024 19:33:00 +0100
Subject: [PATCH 503/842] Expose sorted groupby parameters to pylibcudf
 (#16240)

And plumb through to cudf-polars.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - https://github.com/brandon-b-miller

URL: https://github.com/rapidsai/cudf/pull/16240
---
 python/cudf/cudf/_lib/pylibcudf/groupby.pxd   |  4 ++
 python/cudf/cudf/_lib/pylibcudf/groupby.pyx   | 38 +++++++++++---
 .../cudf_polars/containers/column.py          | 41 ++++++++++++++-
 .../cudf_polars/containers/dataframe.py       | 45 ++++++++++++++--
 python/cudf_polars/cudf_polars/dsl/expr.py    | 29 ++++++++++-
 python/cudf_polars/cudf_polars/dsl/ir.py      | 29 ++++-------
 .../tests/containers/test_dataframe.py        | 51 +++++++++++++++++++
 .../cudf_polars/tests/expressions/test_agg.py |  8 +--
 .../tests/expressions/test_sort.py            | 31 +++++++++++
 python/cudf_polars/tests/test_groupby.py      | 39 ++++++++++++--
 10 files changed, 274 insertions(+), 41 deletions(-)

diff --git a/python/cudf/cudf/_lib/pylibcudf/groupby.pxd b/python/cudf/cudf/_lib/pylibcudf/groupby.pxd
index c6c146b0445..eaa05c26986 100644
--- a/python/cudf/cudf/_lib/pylibcudf/groupby.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/groupby.pxd
@@ -16,6 +16,7 @@ from cudf._lib.pylibcudf.libcudf.groupby cimport (
     scan_request,
 )
 from cudf._lib.pylibcudf.libcudf.table.table cimport table
+from cudf._lib.pylibcudf.libcudf.types cimport null_order, order
 
 from .column cimport Column
 from .table cimport Table
@@ -38,6 +39,9 @@ cdef class GroupByRequest:
 cdef class GroupBy:
     cdef unique_ptr[groupby] c_obj
     cdef Table _keys
+    cdef unique_ptr[vector[order]] _column_order
+    cdef unique_ptr[vector[null_order]] _null_precedence
+
     cpdef tuple aggregate(self, list requests)
     cpdef tuple scan(self, list requests)
     cpdef tuple shift(self, Table values, list offset, list fill_values)
diff --git a/python/cudf/cudf/_lib/pylibcudf/groupby.pyx b/python/cudf/cudf/_lib/pylibcudf/groupby.pyx
index 46fe61025ce..f5bb46ca6a2 100644
--- a/python/cudf/cudf/_lib/pylibcudf/groupby.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/groupby.pyx
@@ -2,7 +2,7 @@
 
 from cython.operator cimport dereference
 from libcpp.functional cimport reference_wrapper
-from libcpp.memory cimport unique_ptr
+from libcpp.memory cimport make_unique, unique_ptr
 from libcpp.pair cimport pair
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
@@ -22,7 +22,7 @@ from cudf._lib.pylibcudf.libcudf.types cimport size_type
 from .aggregation cimport Aggregation
 from .column cimport Column
 from .table cimport Table
-from .types cimport null_policy, sorted
+from .types cimport null_order, null_policy, order, sorted
 from .utils cimport _as_vector
 
 
@@ -87,17 +87,43 @@ cdef class GroupBy:
     keys : Table
         The columns to group by.
     null_handling : null_policy, optional
-        Whether or not to include null rows in ``keys``. Default is null_policy.EXCLUDE.
+        Whether or not to include null rows in `keys`.
+        Default is ``null_policy.EXCLUDE``.
     keys_are_sorted : sorted, optional
-        Whether the keys are already sorted. Default is sorted.NO.
+        Whether the keys are already sorted. Default is ``sorted.NO``.
+    column_order : list[order]
+        Indicates the order of each column. Default is ``order.ASCENDING``.
+        Ignored if `keys_are_sorted` is ``sorted.NO``.
+    null_precedence : list[null_order]
+        Indicates the ordering of null values in each column.
+        Default is ``null_order.AFTER``. Ignored if `keys_are_sorted` is ``sorted.NO``.
     """
     def __init__(
         self,
         Table keys,
         null_policy null_handling=null_policy.EXCLUDE,
-        sorted keys_are_sorted=sorted.NO
+        sorted keys_are_sorted=sorted.NO,
+        list column_order=None,
+        list null_precedence=None,
     ):
-        self.c_obj.reset(new groupby(keys.view(), null_handling, keys_are_sorted))
+        self._column_order = make_unique[vector[order]]()
+        self._null_precedence = make_unique[vector[null_order]]()
+        if column_order is not None:
+            for o in column_order:
+                dereference(self._column_order).push_back(<order?>o)
+        if null_precedence is not None:
+            for o in null_precedence:
+                dereference(self._null_precedence).push_back(<null_order?>o)
+
+        self.c_obj.reset(
+            new groupby(
+                keys.view(),
+                null_handling,
+                keys_are_sorted,
+                dereference(self._column_order.get()),
+                dereference(self._null_precedence.get()),
+            )
+        )
         # keep a reference to the keys table so it doesn't get
         # deallocated from under us:
         self._keys = keys
diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py
index af67059844e..42aba0fcdc0 100644
--- a/python/cudf_polars/cudf_polars/containers/column.py
+++ b/python/cudf_polars/cudf_polars/containers/column.py
@@ -13,6 +13,8 @@
 if TYPE_CHECKING:
     from typing_extensions import Self
 
+    import polars as pl
+
 __all__: list[str] = ["Column", "NamedColumn"]
 
 
@@ -76,12 +78,49 @@ def sorted_like(self, like: Column, /) -> Self:
 
         See Also
         --------
-        set_sorted
+        set_sorted, copy_metadata
         """
         return self.set_sorted(
             is_sorted=like.is_sorted, order=like.order, null_order=like.null_order
         )
 
+    def copy_metadata(self, from_: pl.Series, /) -> Self:
+        """
+        Copy metadata from a host series onto self.
+
+        Parameters
+        ----------
+        from_
+            Polars series to copy metadata from
+
+        Returns
+        -------
+        Self with metadata set.
+
+        See Also
+        --------
+        set_sorted, sorted_like
+        """
+        if len(from_) <= 1:
+            return self
+        ascending = from_.flags["SORTED_ASC"]
+        descending = from_.flags["SORTED_DESC"]
+        if ascending or descending:
+            has_null_first = from_.item(0) is None
+            has_null_last = from_.item(-1) is None
+            order = (
+                plc.types.Order.ASCENDING if ascending else plc.types.Order.DESCENDING
+            )
+            null_order = plc.types.NullOrder.BEFORE
+            if (descending and has_null_first) or (ascending and has_null_last):
+                null_order = plc.types.NullOrder.AFTER
+            return self.set_sorted(
+                is_sorted=plc.types.Sorted.YES,
+                order=order,
+                null_order=null_order,
+            )
+        return self
+
     def set_sorted(
         self,
         *,
diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py
index d86656578d7..cbeadf1426a 100644
--- a/python/cudf_polars/cudf_polars/containers/dataframe.py
+++ b/python/cudf_polars/cudf_polars/containers/dataframe.py
@@ -9,16 +9,18 @@
 from functools import cached_property
 from typing import TYPE_CHECKING, cast
 
+import pyarrow as pa
+
 import polars as pl
 
 import cudf._lib.pylibcudf as plc
 
 from cudf_polars.containers.column import NamedColumn
+from cudf_polars.utils import dtypes
 
 if TYPE_CHECKING:
     from collections.abc import Mapping, Sequence, Set
 
-    import pyarrow as pa
     from typing_extensions import Self
 
     import cudf
@@ -50,8 +52,16 @@ def to_polars(self) -> pl.DataFrame:
             self.table,
             [plc.interop.ColumnMetadata(name=c.name) for c in self.columns],
         )
-
-        return cast(pl.DataFrame, pl.from_arrow(table))
+        return cast(pl.DataFrame, pl.from_arrow(table)).with_columns(
+            *(
+                pl.col(c.name).set_sorted(
+                    descending=c.order == plc.types.Order.DESCENDING
+                )
+                if c.is_sorted
+                else pl.col(c.name)
+                for c in self.columns
+            )
+        )
 
     @cached_property
     def column_names_set(self) -> frozenset[str]:
@@ -83,6 +93,35 @@ def from_cudf(cls, df: cudf.DataFrame) -> Self:
             ]
         )
 
+    @classmethod
+    def from_polars(cls, df: pl.DataFrame) -> Self:
+        """
+        Create from a polars dataframe.
+
+        Parameters
+        ----------
+        df
+            Polars dataframe to convert
+
+        Returns
+        -------
+        New dataframe representing the input.
+        """
+        table = df.to_arrow()
+        schema = table.schema
+        for i, field in enumerate(schema):
+            schema = schema.set(
+                i, pa.field(field.name, dtypes.downcast_arrow_lists(field.type))
+            )
+        # No-op if the schema is unchanged.
+        d_table = plc.interop.from_arrow(table.cast(schema))
+        return cls(
+            [
+                NamedColumn(column, h_col.name).copy_metadata(h_col)
+                for column, h_col in zip(d_table.columns(), df.iter_columns())
+            ]
+        )
+
     @classmethod
     def from_table(cls, table: plc.Table, names: Sequence[str]) -> Self:
         """
diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py
index adf266bab81..f37cb3f475c 100644
--- a/python/cudf_polars/cudf_polars/dsl/expr.py
+++ b/python/cudf_polars/cudf_polars/dsl/expr.py
@@ -867,7 +867,7 @@ def __init__(
         self.name = name
         self.options = options
         self.children = children
-        if self.name not in ("round", "unique", "mask_nans"):
+        if self.name not in ("mask_nans", "round", "setsorted", "unique"):
             raise NotImplementedError(f"Unary function {name=}")
 
     def do_evaluate(
@@ -926,6 +926,33 @@ def do_evaluate(
             if maintain_order:
                 return Column(column).sorted_like(values)
             return Column(column)
+        elif self.name == "setsorted":
+            (column,) = (
+                child.evaluate(df, context=context, mapping=mapping)
+                for child in self.children
+            )
+            (asc,) = self.options
+            order = (
+                plc.types.Order.ASCENDING
+                if asc == "ascending"
+                else plc.types.Order.DESCENDING
+            )
+            null_order = plc.types.NullOrder.BEFORE
+            if column.obj.null_count() > 0 and (n := column.obj.size()) > 1:
+                # PERF: This invokes four stream synchronisations!
+                has_nulls_first = not plc.copying.get_element(column.obj, 0).is_valid()
+                has_nulls_last = not plc.copying.get_element(
+                    column.obj, n - 1
+                ).is_valid()
+                if (order == plc.types.Order.DESCENDING and has_nulls_first) or (
+                    order == plc.types.Order.ASCENDING and has_nulls_last
+                ):
+                    null_order = plc.types.NullOrder.AFTER
+            return column.set_sorted(
+                is_sorted=plc.types.Sorted.YES,
+                order=order,
+                null_order=null_order,
+            )
         raise NotImplementedError(
             f"Unimplemented unary function {self.name=}"
         )  # pragma: no cover; init trips first
diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py
index b32fa9c273e..5e6544ef77c 100644
--- a/python/cudf_polars/cudf_polars/dsl/ir.py
+++ b/python/cudf_polars/cudf_polars/dsl/ir.py
@@ -30,7 +30,7 @@
 
 import cudf_polars.dsl.expr as expr
 from cudf_polars.containers import DataFrame, NamedColumn
-from cudf_polars.utils import dtypes, sorting
+from cudf_polars.utils import sorting
 
 if TYPE_CHECKING:
     from collections.abc import MutableMapping
@@ -385,17 +385,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         pdf = pl.DataFrame._from_pydf(self.df)
         if self.projection is not None:
             pdf = pdf.select(self.projection)
-        table = pdf.to_arrow()
-        schema = table.schema
-        for i, field in enumerate(schema):
-            schema = schema.set(
-                i, pa.field(field.name, dtypes.downcast_arrow_lists(field.type))
-            )
-        # No-op if the schema is unchanged.
-        table = table.cast(schema)
-        df = DataFrame.from_table(
-            plc.interop.from_arrow(table), list(self.schema.keys())
-        )
+        df = DataFrame.from_polars(pdf)
         assert all(
             c.obj.type() == dtype for c, dtype in zip(df.columns, self.schema.values())
         )
@@ -542,16 +532,17 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         keys = broadcast(
             *(k.evaluate(df) for k in self.keys), target_length=df.num_rows
         )
-        # TODO: use sorted information, need to expose column_order
-        # and null_precedence in pylibcudf groupby constructor
-        # sorted = (
-        #     plc.types.Sorted.YES
-        #     if all(k.is_sorted for k in keys)
-        #     else plc.types.Sorted.NO
-        # )
+        sorted = (
+            plc.types.Sorted.YES
+            if all(k.is_sorted for k in keys)
+            else plc.types.Sorted.NO
+        )
         grouper = plc.groupby.GroupBy(
             plc.Table([k.obj for k in keys]),
             null_handling=plc.types.NullPolicy.INCLUDE,
+            keys_are_sorted=sorted,
+            column_order=[k.order for k in keys],
+            null_precedence=[k.null_order for k in keys],
         )
         # TODO: uniquify
         requests = []
diff --git a/python/cudf_polars/tests/containers/test_dataframe.py b/python/cudf_polars/tests/containers/test_dataframe.py
index 2e385e39eef..87508e17407 100644
--- a/python/cudf_polars/tests/containers/test_dataframe.py
+++ b/python/cudf_polars/tests/containers/test_dataframe.py
@@ -5,6 +5,8 @@
 
 import pytest
 
+import polars as pl
+
 import cudf._lib.pylibcudf as plc
 
 from cudf_polars.containers import DataFrame, NamedColumn
@@ -90,3 +92,52 @@ def test_shallow_copy():
     )
     assert df.columns[0].is_sorted == plc.types.Sorted.YES
     assert copy.columns[0].is_sorted == plc.types.Sorted.NO
+
+
+def test_sorted_flags_preserved_empty():
+    df = pl.DataFrame({"a": pl.Series([], dtype=pl.Int8())})
+    df.select(pl.col("a").sort())
+
+    gf = DataFrame.from_polars(df)
+
+    (a,) = gf.columns
+
+    assert a.is_sorted == plc.types.Sorted.YES
+
+    assert df.flags == gf.to_polars().flags
+
+
+@pytest.mark.parametrize("nulls_last", [True, False])
+def test_sorted_flags_preserved(with_nulls, nulls_last):
+    values = [1, 2, -1, 2, 4, 5]
+    if with_nulls:
+        values[4] = None
+    df = pl.DataFrame({"a": values, "b": values, "c": values})
+
+    df = df.select(
+        pl.col("a").sort(descending=False, nulls_last=nulls_last),
+        pl.col("b").sort(descending=True, nulls_last=nulls_last),
+        pl.col("c"),
+    )
+
+    gf = DataFrame.from_polars(df)
+
+    a_null_order = (
+        plc.types.NullOrder.AFTER
+        if nulls_last and with_nulls
+        else plc.types.NullOrder.BEFORE
+    )
+    b_null_order = (
+        plc.types.NullOrder.AFTER
+        if not nulls_last and with_nulls
+        else plc.types.NullOrder.BEFORE
+    )
+    a, b, c = gf.columns
+    assert a.is_sorted == plc.types.Sorted.YES
+    assert a.order == plc.types.Order.ASCENDING
+    assert a.null_order == a_null_order
+    assert b.is_sorted == plc.types.Sorted.YES
+    assert b.order == plc.types.Order.DESCENDING
+    assert b.null_order == b_null_order
+    assert c.is_sorted == plc.types.Sorted.NO
+    assert df.flags == gf.to_polars().flags
diff --git a/python/cudf_polars/tests/expressions/test_agg.py b/python/cudf_polars/tests/expressions/test_agg.py
index e53fd7f8615..245bde3acab 100644
--- a/python/cudf_polars/tests/expressions/test_agg.py
+++ b/python/cudf_polars/tests/expressions/test_agg.py
@@ -20,13 +20,7 @@ def dtype(request):
     return request.param
 
 
-@pytest.fixture(
-    params=[
-        False,
-        pytest.param(True, marks=pytest.mark.xfail(reason="No handler for set_sorted")),
-    ],
-    ids=["unsorted", "sorted"],
-)
+@pytest.fixture(params=[False, True], ids=["unsorted", "sorted"])
 def is_sorted(request):
     return request.param
 
diff --git a/python/cudf_polars/tests/expressions/test_sort.py b/python/cudf_polars/tests/expressions/test_sort.py
index 0195266f5c6..d46df92db94 100644
--- a/python/cudf_polars/tests/expressions/test_sort.py
+++ b/python/cudf_polars/tests/expressions/test_sort.py
@@ -8,6 +8,9 @@
 
 import polars as pl
 
+import cudf._lib.pylibcudf as plc
+
+from cudf_polars import translate_ir
 from cudf_polars.testing.asserts import assert_gpu_result_equal
 
 
@@ -51,3 +54,31 @@ def test_sort_by_expression(descending, nulls_last, maintain_order):
         )
     )
     assert_gpu_result_equal(query, check_row_order=maintain_order)
+
+
+@pytest.mark.parametrize("descending", [False, True])
+@pytest.mark.parametrize("nulls_last", [False, True])
+def test_setsorted(descending, nulls_last, with_nulls):
+    values = sorted([1, 2, 3, 4, 5, 6, -2], reverse=descending)
+    if with_nulls:
+        values[-1 if nulls_last else 0] = None
+    df = pl.LazyFrame({"a": values})
+
+    q = df.set_sorted("a", descending=descending)
+
+    assert_gpu_result_equal(q)
+
+    df = translate_ir(q._ldf.visit()).evaluate(cache={})
+
+    (a,) = df.columns
+
+    assert a.is_sorted == plc.types.Sorted.YES
+    null_order = (
+        plc.types.NullOrder.AFTER
+        if (descending ^ nulls_last) and with_nulls
+        else plc.types.NullOrder.BEFORE
+    )
+    assert a.null_order == null_order
+    assert a.order == (
+        plc.types.Order.DESCENDING if descending else plc.types.Order.ASCENDING
+    )
diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py
index 81306397b9f..50adca01950 100644
--- a/python/cudf_polars/tests/test_groupby.py
+++ b/python/cudf_polars/tests/test_groupby.py
@@ -2,6 +2,8 @@
 # SPDX-License-Identifier: Apache-2.0
 from __future__ import annotations
 
+import itertools
+
 import pytest
 
 import polars as pl
@@ -26,12 +28,12 @@ def df():
 
 @pytest.fixture(
     params=[
-        ["key1"],
-        ["key2"],
+        [pl.col("key1")],
+        [pl.col("key2")],
         [pl.col("key1") * pl.col("key2")],
-        ["key1", "key2"],
+        [pl.col("key1"), pl.col("key2")],
         [pl.col("key1") == pl.col("key2")],
-        ["key2", pl.col("key1") == pl.lit(1, dtype=pl.Int64)],
+        [pl.col("key2"), pl.col("key1") == pl.lit(1, dtype=pl.Int64)],
     ],
     ids=lambda keys: "-".join(map(str, keys)),
 )
@@ -82,6 +84,35 @@ def test_groupby(df: pl.LazyFrame, maintain_order, keys, exprs):
     assert_gpu_result_equal(q, check_exact=False)
 
 
+def test_groupby_sorted_keys(df: pl.LazyFrame, keys, exprs):
+    sorted_keys = [
+        key.sort(descending=descending)
+        for key, descending in zip(keys, itertools.cycle([False, True]))
+    ]
+
+    q = df.group_by(*sorted_keys).agg(*exprs)
+
+    schema = q.collect_schema()
+    sort_keys = list(schema.keys())[: len(keys)]
+    # Multiple keys don't do sorting
+    qsorted = q.sort(*sort_keys)
+    if len(keys) > 1:
+        with pytest.raises(AssertionError):
+            # https://github.com/pola-rs/polars/issues/17556
+            assert_gpu_result_equal(q, check_exact=False)
+        if schema[sort_keys[1]] == pl.Boolean():
+            # https://github.com/pola-rs/polars/issues/17557
+            with pytest.raises(AssertionError):
+                assert_gpu_result_equal(qsorted, check_exact=False)
+        else:
+            assert_gpu_result_equal(qsorted, check_exact=False)
+    elif schema[sort_keys[0]] == pl.Boolean():
+        # Boolean keys don't do sorting, so we get random order
+        assert_gpu_result_equal(qsorted, check_exact=False)
+    else:
+        assert_gpu_result_equal(q, check_exact=False)
+
+
 def test_groupby_len(df, keys):
     q = df.group_by(*keys).agg(pl.len())
 

From 1cbd9eb327f4290ef402234f0cb65b93df01ba0a Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Fri, 12 Jul 2024 15:01:24 -0400
Subject: [PATCH 504/842] Update contains_tests.cpp to use public cudf::slice
 (#16253)

Changes the `cpp/tests/lists/contains_test.cpp` to use `cudf::slice` instead of `cudf::detail::slice()`

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Robert Maynard (https://github.com/robertmaynard)
  - Bradley Dice (https://github.com/bdice)
  - Muhammad Haseeb (https://github.com/mhaseeb123)

URL: https://github.com/rapidsai/cudf/pull/16253
---
 cpp/tests/lists/contains_tests.cpp | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/cpp/tests/lists/contains_tests.cpp b/cpp/tests/lists/contains_tests.cpp
index 718ee83cf09..8fb2b403051 100644
--- a/cpp/tests/lists/contains_tests.cpp
+++ b/cpp/tests/lists/contains_tests.cpp
@@ -224,9 +224,8 @@ TYPED_TEST(TypedContainsTest, SlicedLists)
 
   {
     // First Slice.
-    auto sliced_column_1 =
-      cudf::detail::slice(search_space, {1, 8}, cudf::get_default_stream()).front();
-    auto search_key_one = create_scalar_search_key<T>(1);
+    auto sliced_column_1 = cudf::slice(search_space, {1, 8}, cudf::get_default_stream()).front();
+    auto search_key_one  = create_scalar_search_key<T>(1);
     {
       // CONTAINS
       auto result          = cudf::lists::contains(sliced_column_1, *search_key_one);
@@ -257,9 +256,8 @@ TYPED_TEST(TypedContainsTest, SlicedLists)
 
   {
     // Second Slice.
-    auto sliced_column_2 =
-      cudf::detail::slice(search_space, {3, 10}, cudf::get_default_stream()).front();
-    auto search_key_one = create_scalar_search_key<T>(1);
+    auto sliced_column_2 = cudf::slice(search_space, {3, 10}, cudf::get_default_stream()).front();
+    auto search_key_one  = create_scalar_search_key<T>(1);
     {
       // CONTAINS
       auto result          = cudf::lists::contains(sliced_column_2, *search_key_one);

From 99ad73d5c1b95374255a7abb20911a26eb292fa5 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Fri, 12 Jul 2024 12:38:37 -0700
Subject: [PATCH 505/842] Remove temporary functor overloads required by cuco
 version bump (#16242)

This is a follow-up of #15938. It removes the temporary workaround no longer needed after the cuco version bump.

Authors:
  - Yunsong Wang (https://github.com/PointKernel)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/16242
---
 .../cudf/detail/distinct_hash_join.cuh        | 20 -------------------
 cpp/src/search/contains_table.cu              | 19 ------------------
 cpp/src/text/bpe/byte_pair_encoding.cuh       | 13 ------------
 cpp/src/text/vocabulary_tokenize.cu           |  8 --------
 4 files changed, 60 deletions(-)

diff --git a/cpp/include/cudf/detail/distinct_hash_join.cuh b/cpp/include/cudf/detail/distinct_hash_join.cuh
index 1ef8b3b120a..c3bc3ad89fa 100644
--- a/cpp/include/cudf/detail/distinct_hash_join.cuh
+++ b/cpp/include/cudf/detail/distinct_hash_join.cuh
@@ -42,17 +42,6 @@ template <typename Equal>
 struct comparator_adapter {
   comparator_adapter(Equal const& d_equal) : _d_equal{d_equal} {}
 
-  // suppress "function was declared but never referenced warning"
-#pragma nv_diagnostic push
-#pragma nv_diag_suppress 177
-  __device__ constexpr auto operator()(
-    cuco::pair<hash_value_type, lhs_index_type> const&,
-    cuco::pair<hash_value_type, lhs_index_type> const&) const noexcept
-  {
-    // All build table keys are distinct thus `false` no matter what
-    return false;
-  }
-
   __device__ constexpr auto operator()(
     cuco::pair<hash_value_type, rhs_index_type> const&,
     cuco::pair<hash_value_type, rhs_index_type> const&) const noexcept
@@ -69,15 +58,6 @@ struct comparator_adapter {
     return _d_equal(lhs.second, rhs.second);
   }
 
-  __device__ constexpr auto operator()(
-    cuco::pair<hash_value_type, rhs_index_type> const& lhs,
-    cuco::pair<hash_value_type, lhs_index_type> const& rhs) const noexcept
-  {
-    if (lhs.first != rhs.first) { return false; }
-    return _d_equal(lhs.second, rhs.second);
-  }
-#pragma nv_diagnostic pop
-
  private:
   Equal _d_equal;
 };
diff --git a/cpp/src/search/contains_table.cu b/cpp/src/search/contains_table.cu
index fbb0f6cb0f5..4fb983dc5a6 100644
--- a/cpp/src/search/contains_table.cu
+++ b/cpp/src/search/contains_table.cu
@@ -76,18 +76,6 @@ struct comparator_adapter {
   {
   }
 
-  // suppress "function was declared but never referenced warning"
-#pragma nv_diagnostic push
-#pragma nv_diag_suppress 177
-  __device__ constexpr auto operator()(lhs_index_type lhs_index,
-                                       lhs_index_type rhs_index) const noexcept
-  {
-    auto const lhs = static_cast<size_type>(lhs_index);
-    auto const rhs = static_cast<size_type>(rhs_index);
-
-    return _self_equal(lhs, rhs);
-  }
-
   __device__ constexpr auto operator()(rhs_index_type lhs_index,
                                        rhs_index_type rhs_index) const noexcept
   {
@@ -103,13 +91,6 @@ struct comparator_adapter {
     return _two_table_equal(lhs_index, rhs_index);
   }
 
-  __device__ constexpr auto operator()(rhs_index_type lhs_index,
-                                       lhs_index_type rhs_index) const noexcept
-  {
-    return _two_table_equal(lhs_index, rhs_index);
-  }
-#pragma nv_diagnostic pop
-
  private:
   SelfEqual const _self_equal;
   TwoTableEqual const _two_table_equal;
diff --git a/cpp/src/text/bpe/byte_pair_encoding.cuh b/cpp/src/text/bpe/byte_pair_encoding.cuh
index 3bb574748b6..a2e441c3284 100644
--- a/cpp/src/text/bpe/byte_pair_encoding.cuh
+++ b/cpp/src/text/bpe/byte_pair_encoding.cuh
@@ -89,14 +89,6 @@ struct bpe_equal {
     return lhs == rhs;  // all rows are unique
   }
   // used by find
-  __device__ bool operator()(cudf::size_type lhs, merge_pair_type const& rhs) const noexcept
-  {
-    lhs *= 2;
-    auto const left  = d_strings.element<cudf::string_view>(lhs);
-    auto const right = d_strings.element<cudf::string_view>(lhs + 1);
-    return (left == rhs.first) && (right == rhs.second);
-  }
-  // used by find
   __device__ bool operator()(merge_pair_type const& lhs, cudf::size_type rhs) const noexcept
   {
     rhs *= 2;
@@ -157,11 +149,6 @@ struct mp_equal {
     return left == right;
   }
   // used by find
-  __device__ bool operator()(cudf::size_type lhs, cudf::string_view const& rhs) const noexcept
-  {
-    auto const left = d_strings.element<cudf::string_view>(lhs);
-    return left == rhs;
-  }
   __device__ bool operator()(cudf::string_view const& lhs, cudf::size_type rhs) const noexcept
   {
     auto const right = d_strings.element<cudf::string_view>(rhs);
diff --git a/cpp/src/text/vocabulary_tokenize.cu b/cpp/src/text/vocabulary_tokenize.cu
index ea09f5d17af..97abb1487d8 100644
--- a/cpp/src/text/vocabulary_tokenize.cu
+++ b/cpp/src/text/vocabulary_tokenize.cu
@@ -86,18 +86,10 @@ struct vocab_equal {
     return lhs == rhs;  // all rows are expected to be unique
   }
   // used by find
-  // suppress "function was declared but never referenced warning"
-#pragma nv_diagnostic push
-#pragma nv_diag_suppress 177
-  __device__ bool operator()(cudf::size_type lhs, cudf::string_view const& rhs) const noexcept
-  {
-    return d_strings.element<cudf::string_view>(lhs) == rhs;
-  }
   __device__ bool operator()(cudf::string_view const& lhs, cudf::size_type rhs) const noexcept
   {
     return d_strings.element<cudf::string_view>(rhs) == lhs;
   }
-#pragma nv_diagnostic pop
 };
 
 using probe_scheme        = cuco::linear_probing<1, vocab_hasher>;

From 390e6fec3c6b3c257c93d38c3a999f2e4c9706e1 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 12 Jul 2024 12:10:11 -1000
Subject: [PATCH 506/842] Clean up state variables in MultiIndex (#16203)

MultiIndex sets it's own state variables outside of `__init__` and allows some uninitialized private variables that may be called in other methods. This PR now ensures these state variables are always initialized in `__init__`, `_from_data` and `_simple_new`

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16203
---
 python/cudf/cudf/core/dataframe.py        |  22 +-
 python/cudf/cudf/core/multiindex.py       | 275 +++++++++++-----------
 python/cudf/cudf/tests/test_multiindex.py |  16 +-
 python/cudf/cudf/tests/test_repr.py       |  12 +-
 4 files changed, 152 insertions(+), 173 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 2be59f87483..f110b788789 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -3593,15 +3593,15 @@ def rename(
 
             if level is not None and isinstance(self.index, MultiIndex):
                 level = self.index._get_level_label(level)
-                out_index = self.index.copy(deep=copy)
-                level_values = out_index.get_level_values(level)
-                level_values.to_frame().replace(
+                level_values = self.index.get_level_values(level)
+                ca = self.index._data.copy(deep=copy)
+                ca[level] = level_values._column.find_and_replace(
                     to_replace=list(index.keys()),
-                    value=list(index.values()),
-                    inplace=True,
+                    replacement=list(index.values()),
+                )
+                out_index = type(self.index)._from_data(
+                    ca, name=self.index.name
                 )
-                out_index._data[level] = column.as_column(level_values)
-                out_index._compute_levels_and_codes()
             else:
                 to_replace = list(index.keys())
                 vals = list(index.values())
@@ -7058,12 +7058,8 @@ def stack(self, level=-1, dropna=no_default, future_stack=False):
         # Assemble the final index
         new_index_columns = [*repeated_index._columns, *tiled_index]
         index_names = [*self.index.names, *unique_named_levels.names]
-        new_index = MultiIndex.from_frame(
-            DataFrame._from_data(
-                dict(zip(range(0, len(new_index_columns)), new_index_columns))
-            ),
-            names=index_names,
-        )
+        new_index = MultiIndex._from_data(dict(enumerate(new_index_columns)))
+        new_index.names = index_names
 
         # Compute the column indices that serves as the input for
         # `interleave_columns`
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index dbbd1eab6c8..6503dae6ff5 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -7,9 +7,7 @@
 import operator
 import pickle
 import warnings
-from collections import abc
 from functools import cached_property
-from numbers import Integral
 from typing import TYPE_CHECKING, Any, MutableMapping
 
 import cupy as cp
@@ -20,7 +18,7 @@
 import cudf._lib as libcudf
 from cudf._lib.types import size_type_dtype
 from cudf.api.extensions import no_default
-from cudf.api.types import is_integer, is_list_like, is_object_dtype
+from cudf.api.types import is_integer, is_list_like, is_object_dtype, is_scalar
 from cudf.core import column
 from cudf.core._base_index import _return_get_indexer_result
 from cudf.core.algorithms import factorize
@@ -64,6 +62,20 @@ def _maybe_indices_to_slice(indices: cp.ndarray) -> slice | cp.ndarray:
     return indices
 
 
+def _compute_levels_and_codes(
+    data: MutableMapping,
+) -> tuple[list[cudf.Index], list[column.ColumnBase]]:
+    """Return MultiIndex level and codes from a ColumnAccessor-like mapping."""
+    levels = []
+    codes = []
+    for col in data.values():
+        code, cats = factorize(col)
+        codes.append(column.as_column(code.astype(np.int64)))
+        levels.append(cats)
+
+    return levels, codes
+
+
 class MultiIndex(Frame, BaseIndex, NotIterable):
     """A multi-level or hierarchical index.
 
@@ -146,50 +158,36 @@ def __init__(
             raise NotImplementedError(
                 "Use `names`, `name` is not yet supported"
             )
-        if len(levels) == 0:
-            raise ValueError("Must pass non-zero number of levels/codes")
-        if not isinstance(codes, cudf.DataFrame) and not isinstance(
-            codes[0], (abc.Sequence, np.ndarray, cp.ndarray)
-        ):
-            raise TypeError("Codes is not a Sequence of sequences")
-
-        if copy:
-            if isinstance(codes, cudf.DataFrame):
-                codes = codes.copy(deep=True)
-            if len(levels) > 0 and isinstance(
-                levels[0], (cudf.Index, cudf.Series)
-            ):
-                levels = [level.copy(deep=True) for level in levels]
-
-        if not isinstance(codes, cudf.DataFrame):
-            if len(levels) == len(codes):
-                codes = cudf.DataFrame._from_data(
-                    {
-                        i: column.as_column(code).astype(np.int64)
-                        for i, code in enumerate(codes)
-                    }
-                )
-            else:
-                raise ValueError(
-                    "MultiIndex has unequal number of levels and "
-                    "codes and is inconsistent!"
-                )
-
-        levels = [ensure_index(level) for level in levels]
-
-        if len(levels) != len(codes._data):
-            raise ValueError(
-                "MultiIndex has unequal number of levels and "
-                "codes and is inconsistent!"
-            )
-        if len({c.size for c in codes._data.columns}) != 1:
+        if levels is None or codes is None:
+            raise TypeError("Must pass both levels and codes")
+        elif not (is_list_like(levels) and len(levels) > 0):
+            raise ValueError("Must pass non-zero length sequence of levels")
+        elif not (is_list_like(codes) and len(codes) > 0):
+            raise ValueError("Must pass non-zero length sequence of codes")
+        elif len(codes) != len(levels):
             raise ValueError(
-                "MultiIndex length of codes does not match "
-                "and is inconsistent!"
+                f"levels must have the same length ({len(levels)}) "
+                f"as codes ({len(codes)})."
             )
 
+        new_levels = []
+        for level in levels:
+            new_level = ensure_index(level)
+            if copy and new_level is level:
+                new_level = new_level.copy(deep=True)
+            new_levels.append(new_level)
+
+        new_codes = []
+        for code in codes:
+            if not (is_list_like(code) or is_column_like(code)):
+                raise TypeError("Each code must be list-like")
+            new_code = column.as_column(code).astype("int64")
+            if copy and new_code is code:
+                new_code = new_code.copy(deep=True)
+            new_codes.append(new_code)
+
         source_data = {}
-        for (column_name, code), level in zip(codes._data.items(), levels):
+        for i, (code, level) in enumerate(zip(new_codes, new_levels)):
             if len(code):
                 lo, hi = libcudf.reduce.minmax(code)
                 if lo.value < -1 or hi.value > len(level) - 1:
@@ -202,13 +200,11 @@ def __init__(
             result_col = libcudf.copying.gather(
                 [level._column], code, nullify=True
             )
-            source_data[column_name] = result_col[0]._with_type_metadata(
-                level.dtype
-            )
+            source_data[i] = result_col[0]._with_type_metadata(level.dtype)
 
-        super().__init__(source_data)
-        self._levels = levels
-        self._codes = codes
+        super().__init__(ColumnAccessor(source_data))
+        self._levels = new_levels
+        self._codes = new_codes
         self._name = None
         self.names = names
 
@@ -350,10 +346,37 @@ def _from_data(
         data: MutableMapping,
         name: Any = None,
     ) -> MultiIndex:
-        obj = cls.from_frame(cudf.DataFrame._from_data(data=data))
-        if name is not None:
-            obj.name = name
-        return obj
+        """
+        Use when you have a ColumnAccessor-like mapping but no codes and levels.
+        """
+        levels, codes = _compute_levels_and_codes(data)
+        return cls._simple_new(
+            data=ColumnAccessor(data),
+            levels=levels,
+            codes=codes,
+            names=pd.core.indexes.frozen.FrozenList(data.keys()),
+            name=name,
+        )
+
+    @classmethod
+    def _simple_new(
+        cls,
+        data: ColumnAccessor,
+        levels: list[cudf.Index],
+        codes: list[column.ColumnBase],
+        names: pd.core.indexes.frozen.FrozenList,
+        name: Any = None,
+    ) -> Self:
+        """
+        Use when you have a ColumnAccessor-like mapping, codes, and levels.
+        """
+        mi = object.__new__(cls)
+        mi._data = data
+        mi._levels = levels
+        mi._codes = codes
+        mi._names = names
+        mi._name = name
+        return mi
 
     @property  # type: ignore
     @_performance_tracking
@@ -421,18 +444,17 @@ def copy(
         2020-08-28 AMZN  3401.80
                    MSFT   228.91
         """
-
-        mi = MultiIndex._from_data(self._data.copy(deep=deep))
-        if self._levels is not None:
-            mi._levels = [idx.copy(deep=deep) for idx in self._levels]
-        if self._codes is not None:
-            mi._codes = self._codes.copy(deep)
         if names is not None:
-            mi.names = names
-        elif self.names is not None:
-            mi.names = self.names.copy()
-
-        return mi
+            names = pd.core.indexes.frozen.FrozenList(names)
+        else:
+            names = self.names
+        return type(self)._simple_new(
+            data=self._data.copy(deep=deep),
+            levels=[idx.copy(deep=deep) for idx in self._levels],
+            codes=[code.copy(deep=deep) for code in self._codes],
+            names=names,
+            name=name,
+        )
 
     @_performance_tracking
     def __repr__(self):
@@ -478,14 +500,8 @@ def __repr__(self):
         data_output = "\n".join(lines)
         return output_prefix + data_output
 
-    @property
-    def _codes_frame(self):
-        if self._codes is None:
-            self._compute_levels_and_codes()
-        return self._codes
-
     @property  # type: ignore
-    @_external_only_api("Use ._codes_frame instead")
+    @_external_only_api("Use ._codes instead")
     @_performance_tracking
     def codes(self):
         """
@@ -505,7 +521,7 @@ def codes(self):
         FrozenList([[0, 1, 2], [0, 1, 2]])
         """
         return pd.core.indexes.frozen.FrozenList(
-            col.values for col in self._codes_frame._columns
+            col.values for col in self._codes
         )
 
     def get_slice_bound(self, label, side, kind=None):
@@ -519,13 +535,13 @@ def nlevels(self):
 
     @property  # type: ignore
     @_performance_tracking
-    def levels(self):
+    def levels(self) -> list[cudf.Index]:
         """
         Returns list of levels in the MultiIndex
 
         Returns
         -------
-        List of Series objects
+        List of Index objects
 
         Examples
         --------
@@ -545,9 +561,9 @@ def levels(self):
         >>> midx.levels
         [Index([1, 2, 3], dtype='int64', name='a'), Index([10, 11, 12], dtype='int64', name='b')]
         """  # noqa: E501
-        if self._levels is None:
-            self._compute_levels_and_codes()
-        return self._levels
+        return [
+            idx.rename(name) for idx, name in zip(self._levels, self.names)
+        ]
 
     @property  # type: ignore
     @_performance_tracking
@@ -566,11 +582,10 @@ def _get_level_label(self, level):
             else if level is index of the level, then level
             label will be returned as per the index.
         """
-
-        if level in self._data.names:
+        if level in self.names:
             return level
         else:
-            return self._data.names[level]
+            return self.names[level]
 
     @_performance_tracking
     def isin(self, values, level=None):
@@ -671,20 +686,6 @@ def where(self, cond, other=None, inplace=False):
             ".where is not supported for MultiIndex operations"
         )
 
-    @_performance_tracking
-    def _compute_levels_and_codes(self):
-        levels = []
-
-        codes = {}
-        for name, col in self._data.items():
-            code, cats = cudf.Series._from_data({None: col}).factorize()
-            cats.name = name
-            codes[name] = code.astype(np.int64)
-            levels.append(cats)
-
-        self._levels = levels
-        self._codes = cudf.DataFrame._from_data(codes)
-
     @_performance_tracking
     def _compute_validity_mask(self, index, row_tuple, max_length):
         """Computes the valid set of indices of values in the lookup"""
@@ -823,7 +824,7 @@ def _index_and_downcast(self, result, index, index_key):
                 result.names = index.names[size:]
             index = MultiIndex(
                 levels=index.levels[size:],
-                codes=index._codes_frame.iloc[:, size:],
+                codes=index._codes[size:],
                 names=index.names[size:],
             )
 
@@ -933,28 +934,29 @@ def deserialize(cls, header, frames):
     def __getitem__(self, index):
         flatten = isinstance(index, int)
 
-        if isinstance(index, (Integral, abc.Sequence)):
-            index = np.array(index)
-        elif isinstance(index, slice):
+        if isinstance(index, slice):
             start, stop, step = index.indices(len(self))
-            index = column.as_column(range(start, stop, step))
-        result = MultiIndex.from_frame(
-            self.to_frame(index=False, name=range(0, self.nlevels)).take(
-                index
-            ),
-            names=self.names,
+            idx = range(start, stop, step)
+        elif is_scalar(index):
+            idx = [index]
+        else:
+            idx = index
+
+        indexer = column.as_column(idx)
+        ca = self._data._from_columns_like_self(
+            (col.take(indexer) for col in self._columns), verify=False
+        )
+        codes = [code.take(indexer) for code in self._codes]
+        result = type(self)._simple_new(
+            data=ca, codes=codes, levels=self._levels, names=self.names
         )
 
         # we are indexing into a single row of the MultiIndex,
         # return that row as a tuple:
         if flatten:
             return result.to_pandas()[0]
-
-        if self._codes_frame is not None:
-            result._codes = self._codes_frame.take(index)
-        if self._levels is not None:
-            result._levels = self._levels
-        return result
+        else:
+            return result
 
     @_performance_tracking
     def to_frame(self, index=True, name=no_default, allow_duplicates=False):
@@ -1270,25 +1272,12 @@ def from_frame(cls, df: pd.DataFrame | cudf.DataFrame, names=None):
                     ('NJ', 'Precip')],
                    names=['state', 'observation'])
         """
-        obj = cls.__new__(cls)
-        super(cls, obj).__init__()
-
-        source_data = df.copy(deep=False)
-        source_data.reset_index(drop=True, inplace=True)
-        if isinstance(source_data, pd.DataFrame):
-            source_data = cudf.DataFrame.from_pandas(source_data)
-
-        names = names if names is not None else source_data._data.names
-        # if names are unique
-        # try using those as the source_data column names:
-        if len(dict.fromkeys(names)) == len(names):
-            source_data.columns = names
-        obj._name = None
-        obj._data = source_data._data
-        obj.names = names
-        obj._codes = None
-        obj._levels = None
-        return obj
+        if isinstance(df, pd.DataFrame):
+            source_data = cudf.DataFrame.from_pandas(df)
+        else:
+            source_data = df
+        names = names if names is not None else source_data._column_names
+        return cls.from_arrays(source_data._columns, names=names)
 
     @classmethod
     @_performance_tracking
@@ -1436,7 +1425,7 @@ def _poplevels(self, level):
 
         # update self
         self.names = names
-        self._compute_levels_and_codes()
+        self._levels, self._codes = _compute_levels_and_codes(self._data)
 
         return popped
 
@@ -1560,13 +1549,19 @@ def to_pandas(
     ) -> pd.MultiIndex:
         # cudf uses np.iinfo(size_type_dtype).min as missing code
         # pandas uses -1 as missing code
-        pd_codes = self._codes_frame.replace(np.iinfo(size_type_dtype).min, -1)
+        pd_codes = (
+            code.find_and_replace(
+                column.as_column(np.iinfo(size_type_dtype).min, length=1),
+                column.as_column(-1, length=1),
+            )
+            for code in self._codes
+        )
         return pd.MultiIndex(
             levels=[
                 level.to_pandas(nullable=nullable, arrow_type=arrow_type)
                 for level in self.levels
             ],
-            codes=[col.values_host for col in pd_codes._columns],
+            codes=[col.values_host for col in pd_codes],
             names=self.names,
         )
 
@@ -1741,13 +1736,9 @@ def _clean_nulls_from_index(self):
 
     @_performance_tracking
     def memory_usage(self, deep=False):
-        usage = sum(col.memory_usage for col in self._data.columns)
-        if self.levels:
-            for level in self.levels:
-                usage += level.memory_usage(deep=deep)
-        if self._codes_frame:
-            for col in self._codes_frame._data.columns:
-                usage += col.memory_usage
+        usage = sum(col.memory_usage for col in self._columns)
+        usage += sum(level.memory_usage(deep=deep) for level in self._levels)
+        usage += sum(code.memory_usage for code in self._codes)
         return usage
 
     @_performance_tracking
@@ -2043,7 +2034,7 @@ def _union(self, other, sort=None):
             ignore_index=True,
         )
 
-        midx = MultiIndex.from_frame(result_df.iloc[:, : self.nlevels])
+        midx = type(self)._from_data(result_df.iloc[:, : self.nlevels]._data)
         midx.names = self.names if self.names == other.names else None
         if sort in {None, True} and len(other):
             return midx.sort_values()
@@ -2067,7 +2058,8 @@ def _intersection(self, other, sort=None):
             self_df.columns = col_names
 
         result_df = cudf.merge(self_df, other_df, how="inner")
-        midx = self.__class__.from_frame(result_df, names=res_name)
+        midx = type(self)._from_data(result_df._data)
+        midx.names = res_name
         if sort in {None, True} and len(other):
             return midx.sort_values()
         return midx
@@ -2077,6 +2069,7 @@ def _copy_type_metadata(self: Self, other: Self) -> Self:
         res = super()._copy_type_metadata(other)
         if isinstance(other, MultiIndex):
             res._names = other._names
+        self._levels, self._codes = _compute_levels_and_codes(res._data)
         return res
 
     @_performance_tracking
diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py
index 07c2e9c3fcf..1941eec91eb 100644
--- a/python/cudf/cudf/tests/test_multiindex.py
+++ b/python/cudf/cudf/tests/test_multiindex.py
@@ -832,25 +832,17 @@ def test_multiindex_copy_deep(data, copy_on_write, deep):
 
         # Assert ._levels identity
         lptrs = [
-            lv._data._data[None].base_data.get_ptr(mode="read")
-            for lv in mi1._levels
+            lv._column.base_data.get_ptr(mode="read") for lv in mi1._levels
         ]
         rptrs = [
-            lv._data._data[None].base_data.get_ptr(mode="read")
-            for lv in mi2._levels
+            lv._column.base_data.get_ptr(mode="read") for lv in mi2._levels
         ]
 
         assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs))
 
         # Assert ._codes identity
-        lptrs = [
-            c.base_data.get_ptr(mode="read")
-            for _, c in mi1._codes._data.items()
-        ]
-        rptrs = [
-            c.base_data.get_ptr(mode="read")
-            for _, c in mi2._codes._data.items()
-        ]
+        lptrs = [c.base_data.get_ptr(mode="read") for c in mi1._codes]
+        rptrs = [c.base_data.get_ptr(mode="read") for c in mi2._codes]
 
         assert all((x == y) == same_ref for x, y in zip(lptrs, rptrs))
 
diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py
index 193d64a9e7f..a013745f71e 100644
--- a/python/cudf/cudf/tests/test_repr.py
+++ b/python/cudf/cudf/tests/test_repr.py
@@ -186,13 +186,11 @@ def test_MI():
         }
     )
     levels = [["a", "b", "c", "d"], ["w", "x", "y", "z"], ["m", "n"]]
-    codes = cudf.DataFrame(
-        {
-            "a": [0, 0, 0, 0, 1, 1, 2, 2, 3, 3],
-            "b": [0, 1, 2, 3, 0, 1, 2, 3, 0, 1],
-            "c": [0, 1, 0, 1, 0, 1, 0, 1, 0, 1],
-        }
-    )
+    codes = [
+        [0, 0, 0, 0, 1, 1, 2, 2, 3, 3],
+        [0, 1, 2, 3, 0, 1, 2, 3, 0, 1],
+        [0, 1, 0, 1, 0, 1, 0, 1, 0, 1],
+    ]
     pd.options.display.max_rows = 999
     pd.options.display.max_columns = 0
     gdf = gdf.set_index(cudf.MultiIndex(levels=levels, codes=codes))

From 954ce6d5a64190b7d71cc6f94e7fa4a87ae34598 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Fri, 12 Jul 2024 18:23:27 -0500
Subject: [PATCH 507/842] Add low memory JSON reader for `cudf.pandas` (#16204)

Fixes: #16122

This PR introduces low-memory JSON reading for `cudf.pandas` `read_json`.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - Shruti Shivakumar (https://github.com/shrshi)

URL: https://github.com/rapidsai/cudf/pull/16204
---
 cpp/src/io/json/read_json.cu                |   3 +-
 cpp/src/io/utilities/datasource.cpp         |   2 +-
 python/cudf/cudf/_lib/json.pyx              |  63 ++++---
 python/cudf/cudf/_lib/pylibcudf/io/json.pxd |  11 ++
 python/cudf/cudf/_lib/pylibcudf/io/json.pyx | 176 +++++++++++++++++---
 python/cudf/cudf/_lib/utils.pxd             |   1 +
 python/cudf/cudf/tests/test_csv.py          |   2 +-
 python/cudf/cudf/tests/test_json.py         |  16 ++
 8 files changed, 228 insertions(+), 46 deletions(-)

diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu
index 74001e5e01a..9cd39038348 100644
--- a/cpp/src/io/json/read_json.cu
+++ b/cpp/src/io/json/read_json.cu
@@ -193,7 +193,8 @@ datasource::owning_buffer<rmm::device_uvector<char>> get_record_range_raw_input(
   size_t chunk_size                         = reader_opts.get_byte_range_size();
 
   CUDF_EXPECTS(total_source_size ? chunk_offset < total_source_size : !chunk_offset,
-               "Invalid offsetting");
+               "Invalid offsetting",
+               std::invalid_argument);
   auto should_load_all_sources = !chunk_size || chunk_size >= total_source_size - chunk_offset;
   chunk_size = should_load_all_sources ? total_source_size - chunk_offset : chunk_size;
 
diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp
index c8a438fc40b..91be154e09d 100644
--- a/cpp/src/io/utilities/datasource.cpp
+++ b/cpp/src/io/utilities/datasource.cpp
@@ -217,7 +217,7 @@ class memory_mapped_source : public file_source {
 
   void map(int fd, size_t offset, size_t size)
   {
-    CUDF_EXPECTS(offset < _file.size(), "Offset is past end of file");
+    CUDF_EXPECTS(offset < _file.size(), "Offset is past end of file", std::overflow_error);
 
     // Offset for `mmap()` must be page aligned
     _map_offset = offset & ~(sysconf(_SC_PAGESIZE) - 1);
diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx
index 9c646e3357b..853dd431099 100644
--- a/python/cudf/cudf/_lib/json.pyx
+++ b/python/cudf/cudf/_lib/json.pyx
@@ -10,6 +10,7 @@ from cudf.core.buffer import acquire_spill_lock
 from libcpp cimport bool
 
 cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types
+from cudf._lib.column cimport Column
 from cudf._lib.io.utils cimport add_df_col_struct_names
 from cudf._lib.pylibcudf.io.types cimport compression_type
 from cudf._lib.pylibcudf.libcudf.io.json cimport json_recovery_mode_t
@@ -17,7 +18,7 @@ from cudf._lib.pylibcudf.libcudf.io.types cimport compression_type
 from cudf._lib.pylibcudf.libcudf.types cimport data_type, type_id
 from cudf._lib.pylibcudf.types cimport DataType
 from cudf._lib.types cimport dtype_to_data_type
-from cudf._lib.utils cimport data_from_pylibcudf_io
+from cudf._lib.utils cimport _data_from_columns, data_from_pylibcudf_io
 
 import cudf._lib.pylibcudf as plc
 
@@ -98,28 +99,48 @@ cpdef read_json(object filepaths_or_buffers,
         else:
             raise TypeError("`dtype` must be 'list like' or 'dict'")
 
-    table_w_meta = plc.io.json.read_json(
-        plc.io.SourceInfo(filepaths_or_buffers),
-        processed_dtypes,
-        c_compression,
-        lines,
-        byte_range_offset = byte_range[0] if byte_range is not None else 0,
-        byte_range_size = byte_range[1] if byte_range is not None else 0,
-        keep_quotes = keep_quotes,
-        mixed_types_as_string = mixed_types_as_string,
-        prune_columns = prune_columns,
-        recovery_mode = _get_json_recovery_mode(on_bad_lines)
-    )
-
-    df = cudf.DataFrame._from_data(
-        *data_from_pylibcudf_io(
-            table_w_meta
+    if cudf.get_option("mode.pandas_compatible") and lines:
+        res_cols, res_col_names, res_child_names = plc.io.json.chunked_read_json(
+            plc.io.SourceInfo(filepaths_or_buffers),
+            processed_dtypes,
+            c_compression,
+            keep_quotes = keep_quotes,
+            mixed_types_as_string = mixed_types_as_string,
+            prune_columns = prune_columns,
+            recovery_mode = _get_json_recovery_mode(on_bad_lines)
+        )
+        df = cudf.DataFrame._from_data(
+            *_data_from_columns(
+                columns=[Column.from_pylibcudf(plc) for plc in res_cols],
+                column_names=res_col_names,
+                index_names=None
+               )
+            )
+        add_df_col_struct_names(df, res_child_names)
+        return df
+    else:
+        table_w_meta = plc.io.json.read_json(
+            plc.io.SourceInfo(filepaths_or_buffers),
+            processed_dtypes,
+            c_compression,
+            lines,
+            byte_range_offset = byte_range[0] if byte_range is not None else 0,
+            byte_range_size = byte_range[1] if byte_range is not None else 0,
+            keep_quotes = keep_quotes,
+            mixed_types_as_string = mixed_types_as_string,
+            prune_columns = prune_columns,
+            recovery_mode = _get_json_recovery_mode(on_bad_lines)
+        )
+
+        df = cudf.DataFrame._from_data(
+            *data_from_pylibcudf_io(
+                table_w_meta
+            )
         )
-    )
 
-    # Post-processing to add in struct column names
-    add_df_col_struct_names(df, table_w_meta.child_names)
-    return df
+        # Post-processing to add in struct column names
+        add_df_col_struct_names(df, table_w_meta.child_names)
+        return df
 
 
 @acquire_spill_lock()
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/json.pxd b/python/cudf/cudf/_lib/pylibcudf/io/json.pxd
index f7f733a493d..2e0e92a054f 100644
--- a/python/cudf/cudf/_lib/pylibcudf/io/json.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/io/json.pxd
@@ -35,3 +35,14 @@ cpdef void write_json(
     str true_value = *,
     str false_value = *
 )
+
+cpdef tuple chunked_read_json(
+    SourceInfo source_info,
+    list dtypes = *,
+    compression_type compression = *,
+    bool keep_quotes = *,
+    bool mixed_types_as_string = *,
+    bool prune_columns = *,
+    json_recovery_mode_t recovery_mode = *,
+    int chunk_size= *,
+)
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/json.pyx b/python/cudf/cudf/_lib/pylibcudf/io/json.pyx
index 354cb4981de..2710ee60075 100644
--- a/python/cudf/cudf/_lib/pylibcudf/io/json.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/io/json.pyx
@@ -6,6 +6,7 @@ from libcpp.string cimport string
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
 
+from cudf._lib.pylibcudf.concatenate cimport concatenate
 from cudf._lib.pylibcudf.io.types cimport (
     SinkInfo,
     SourceInfo,
@@ -50,6 +51,144 @@ cdef map[string, schema_element] _generate_schema_map(list dtypes):
     return schema_map
 
 
+cdef json_reader_options _setup_json_reader_options(
+        SourceInfo source_info,
+        list dtypes,
+        compression_type compression,
+        bool lines,
+        size_type byte_range_offset,
+        size_type byte_range_size,
+        bool keep_quotes,
+        bool mixed_types_as_string,
+        bool prune_columns,
+        json_recovery_mode_t recovery_mode):
+
+    cdef vector[data_type] types_vec
+    cdef json_reader_options opts = move(
+        json_reader_options.builder(source_info.c_obj)
+        .compression(compression)
+        .lines(lines)
+        .byte_range_offset(byte_range_offset)
+        .byte_range_size(byte_range_size)
+        .recovery_mode(recovery_mode)
+        .build()
+    )
+
+    if dtypes is not None:
+        if isinstance(dtypes[0], tuple):
+            opts.set_dtypes(move(_generate_schema_map(dtypes)))
+        else:
+            for dtype in dtypes:
+                types_vec.push_back((<DataType>dtype).c_obj)
+            opts.set_dtypes(types_vec)
+
+    opts.enable_keep_quotes(keep_quotes)
+    opts.enable_mixed_types_as_string(mixed_types_as_string)
+    opts.enable_prune_columns(prune_columns)
+    return opts
+
+
+cpdef tuple chunked_read_json(
+    SourceInfo source_info,
+    list dtypes = None,
+    compression_type compression = compression_type.AUTO,
+    bool keep_quotes = False,
+    bool mixed_types_as_string = False,
+    bool prune_columns = False,
+    json_recovery_mode_t recovery_mode = json_recovery_mode_t.FAIL,
+    int chunk_size=100_000_000,
+):
+    """Reads an JSON file into a :py:class:`~.types.TableWithMetadata`.
+
+    Parameters
+    ----------
+    source_info : SourceInfo
+        The SourceInfo object to read the JSON file from.
+    dtypes : list, default None
+        Set data types for the columns in the JSON file.
+
+        Each element of the list has the format
+        (column_name, column_dtype, list of child dtypes), where
+        the list of child dtypes is an empty list if the child is not
+        a nested type (list or struct dtype), and is of format
+        (column_child_name, column_child_type, list of grandchild dtypes).
+    compression: CompressionType, default CompressionType.AUTO
+        The compression format of the JSON source.
+    keep_quotes : bool, default False
+        Whether the reader should keep quotes of string values.
+    mixed_types_as_string : bool, default False
+        If True, mixed type columns are returned as string columns.
+        If `False` parsing mixed type columns will thrown an error.
+    prune_columns : bool, default False
+        Whether to only read columns specified in dtypes.
+    recover_mode : JSONRecoveryMode, default JSONRecoveryMode.FAIL
+        Whether to raise an error or set corresponding values to null
+        when encountering an invalid JSON line.
+    chunk_size : int, default 100_000_000 bytes.
+        The number of bytes to be read in chunks.
+        The chunk_size should be set to at least row_size.
+
+    Returns
+    -------
+    tuple
+        A tuple of (columns, column_name, child_names)
+    """
+    cdef size_type c_range_size = (
+        chunk_size if chunk_size is not None else 0
+    )
+    cdef json_reader_options opts = _setup_json_reader_options(
+        source_info=source_info,
+        dtypes=dtypes,
+        compression=compression,
+        lines=True,
+        byte_range_offset=0,
+        byte_range_size=0,
+        keep_quotes=keep_quotes,
+        mixed_types_as_string=mixed_types_as_string,
+        prune_columns=prune_columns,
+        recovery_mode=recovery_mode,
+    )
+
+    # Read JSON
+    cdef table_with_metadata c_result
+
+    final_columns = []
+    meta_names = None
+    child_names = None
+    i = 0
+    while True:
+        opts.set_byte_range_offset(c_range_size * i)
+        opts.set_byte_range_size(c_range_size)
+
+        try:
+            with nogil:
+                c_result = move(cpp_read_json(opts))
+        except (ValueError, OverflowError):
+            break
+        if meta_names is None:
+            meta_names = [info.name.decode() for info in c_result.metadata.schema_info]
+        if child_names is None:
+            child_names = TableWithMetadata._parse_col_names(
+                c_result.metadata.schema_info
+            )
+        new_chunk = [
+            col for col in TableWithMetadata.from_libcudf(
+                c_result).columns
+        ]
+
+        if len(final_columns) == 0:
+            final_columns = new_chunk
+        else:
+            for col_idx in range(len(meta_names)):
+                final_columns[col_idx] = concatenate(
+                    [final_columns[col_idx], new_chunk[col_idx]]
+                )
+                # Must drop any residual GPU columns to save memory
+                new_chunk[col_idx] = None
+        i += 1
+    return (final_columns, meta_names, child_names)
+
+
 cpdef TableWithMetadata read_json(
     SourceInfo source_info,
     list dtypes = None,
@@ -76,7 +215,7 @@ cpdef TableWithMetadata read_json(
         the list of child dtypes is an empty list if the child is not
         a nested type (list or struct dtype), and is of format
         (column_child_name, column_child_type, list of grandchild dtypes).
-    compression_type: CompressionType, default CompressionType.AUTO
+    compression: CompressionType, default CompressionType.AUTO
         The compression format of the JSON source.
     byte_range_offset : size_type, default 0
         Number of bytes to skip from source start.
@@ -84,6 +223,9 @@ cpdef TableWithMetadata read_json(
         Number of bytes to read. By default, will read all bytes.
     keep_quotes : bool, default False
         Whether the reader should keep quotes of string values.
+    mixed_types_as_string : bool, default False
+        If True, mixed type columns are returned as string columns.
+        If `False` parsing mixed type columns will thrown an error.
     prune_columns : bool, default False
         Whether to only read columns specified in dtypes.
     recover_mode : JSONRecoveryMode, default JSONRecoveryMode.FAIL
@@ -95,29 +237,19 @@ cpdef TableWithMetadata read_json(
     TableWithMetadata
         The Table and its corresponding metadata (column names) that were read in.
     """
-    cdef vector[data_type] types_vec
-    cdef json_reader_options opts = move(
-        json_reader_options.builder(source_info.c_obj)
-        .compression(compression)
-        .lines(lines)
-        .byte_range_offset(byte_range_offset)
-        .byte_range_size(byte_range_size)
-        .recovery_mode(recovery_mode)
-        .build()
+    cdef json_reader_options opts = _setup_json_reader_options(
+        source_info=source_info,
+        dtypes=dtypes,
+        compression=compression,
+        lines=lines,
+        byte_range_offset=byte_range_offset,
+        byte_range_size=byte_range_size,
+        keep_quotes=keep_quotes,
+        mixed_types_as_string=mixed_types_as_string,
+        prune_columns=prune_columns,
+        recovery_mode=recovery_mode,
     )
 
-    if dtypes is not None:
-        if isinstance(dtypes[0], tuple):
-            opts.set_dtypes(move(_generate_schema_map(dtypes)))
-        else:
-            for dtype in dtypes:
-                types_vec.push_back((<DataType>dtype).c_obj)
-            opts.set_dtypes(types_vec)
-
-    opts.enable_keep_quotes(keep_quotes)
-    opts.enable_mixed_types_as_string(mixed_types_as_string)
-    opts.enable_prune_columns(prune_columns)
-
     # Read JSON
     cdef table_with_metadata c_result
 
diff --git a/python/cudf/cudf/_lib/utils.pxd b/python/cudf/cudf/_lib/utils.pxd
index 99850d549a1..1d55f7218dc 100644
--- a/python/cudf/cudf/_lib/utils.pxd
+++ b/python/cudf/cudf/_lib/utils.pxd
@@ -19,3 +19,4 @@ cdef table_view table_view_from_table(tbl, ignore_index=*) except*
 cdef columns_from_unique_ptr(unique_ptr[table] c_tbl)
 cdef columns_from_table_view(table_view tv, object owners)
 cdef columns_from_pylibcudf_table(tbl)
+cdef _data_from_columns(columns, column_names, index_names=*)
diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
index 09617306606..a22a627523f 100644
--- a/python/cudf/cudf/tests/test_csv.py
+++ b/python/cudf/cudf/tests/test_csv.py
@@ -1191,7 +1191,7 @@ def test_csv_reader_byte_range_type_corner_case(tmpdir):
     ).to_csv(fname, chunksize=100000)
 
     byte_range = (2_147_483_648, 0)
-    with pytest.raises(RuntimeError, match="Offset is past end of file"):
+    with pytest.raises(OverflowError, match="Offset is past end of file"):
         cudf.read_csv(fname, byte_range=byte_range, header=None)
 
 
diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py
index 9222f6d23db..7771afd692f 100644
--- a/python/cudf/cudf/tests/test_json.py
+++ b/python/cudf/cudf/tests/test_json.py
@@ -1428,3 +1428,19 @@ def test_json_reader_on_bad_lines(on_bad_lines):
                 orient="records",
                 on_bad_lines=on_bad_lines,
             )
+
+
+def test_chunked_json_reader():
+    df = cudf.DataFrame(
+        {
+            "a": ["aaaa"] * 9_00_00_00,
+            "b": list(range(0, 9_00_00_00)),
+        }
+    )
+    buf = BytesIO()
+    df.to_json(buf, lines=True, orient="records", engine="cudf")
+    buf.seek(0)
+    df = df.to_pandas()
+    with cudf.option_context("mode.pandas_compatible", True):
+        gdf = cudf.read_json(buf, lines=True)
+    assert_eq(df, gdf)

From c4ee4a7a8f7513dc31dd29124bbbf797f0d5c8fc Mon Sep 17 00:00:00 2001
From: "Richard (Rick) Zamora" <rzamora217@gmail.com>
Date: Mon, 15 Jul 2024 08:26:22 -0500
Subject: [PATCH 508/842] Add multi-file support to `dask_cudf.read_json`
 (#16057)

Dask cuDF often benefits from a larger partition sizes than pandas-backed Dask DataFrame. This motivates the ability to easily "aggregate" multiple json files into each partition using `dask_cudf.read_json`. This PR introduces the `aggregate_files` argument (defaults to `True`) to make it easier to accomplish multi-file DataFrame partitions.

Authors:
  - Richard (Rick) Zamora (https://github.com/rjzamora)

Approvers:
  - Peter Andreas Entschev (https://github.com/pentschev)

URL: https://github.com/rapidsai/cudf/pull/16057
---
 python/dask_cudf/dask_cudf/backends.py        |  15 +-
 python/dask_cudf/dask_cudf/io/json.py         | 146 +++++++++++++++++-
 .../dask_cudf/dask_cudf/io/tests/test_json.py |  29 ++++
 3 files changed, 173 insertions(+), 17 deletions(-)

diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py
index 1f55a59ea55..4bdb5d921ec 100644
--- a/python/dask_cudf/dask_cudf/backends.py
+++ b/python/dask_cudf/dask_cudf/backends.py
@@ -667,17 +667,10 @@ def from_dict(
         )
 
     @staticmethod
-    def read_json(*args, engine="auto", **kwargs):
-        return _default_backend(
-            dd.read_json,
-            *args,
-            engine=(
-                partial(cudf.read_json, engine=engine)
-                if isinstance(engine, str)
-                else engine
-            ),
-            **kwargs,
-        )
+    def read_json(*args, **kwargs):
+        from dask_cudf.io.json import read_json as read_json_impl
+
+        return read_json_impl(*args, **kwargs)
 
     @staticmethod
     def read_orc(*args, **kwargs):
diff --git a/python/dask_cudf/dask_cudf/io/json.py b/python/dask_cudf/dask_cudf/io/json.py
index 2a6ad603414..8705d98e9d6 100644
--- a/python/dask_cudf/dask_cudf/io/json.py
+++ b/python/dask_cudf/dask_cudf/io/json.py
@@ -1,15 +1,71 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 
 from functools import partial
 
+import numpy as np
+from fsspec.core import get_compression, get_fs_token_paths
+
 import dask
+from dask.utils import parse_bytes
 
 import cudf
+from cudf.core.column import as_column
+from cudf.utils.ioutils import _is_local_filesystem
 
 from dask_cudf.backends import _default_backend
 
 
-def read_json(url_path, engine="auto", **kwargs):
+def _read_json_partition(
+    paths,
+    fs=None,
+    include_path_column=False,
+    path_converter=None,
+    **kwargs,
+):
+    # Transfer all data up front for remote storage
+    sources = (
+        paths
+        if fs is None
+        else fs.cat_ranges(
+            paths,
+            [0] * len(paths),
+            fs.sizes(paths),
+        )
+    )
+
+    if include_path_column:
+        # Add "path" column.
+        # Must iterate over sources sequentially
+        if not isinstance(include_path_column, str):
+            include_path_column = "path"
+        converted_paths = (
+            paths
+            if path_converter is None
+            else [path_converter(path) for path in paths]
+        )
+        dfs = []
+        for i, source in enumerate(sources):
+            df = cudf.read_json(source, **kwargs)
+            df[include_path_column] = as_column(
+                converted_paths[i], length=len(df)
+            )
+            dfs.append(df)
+        return cudf.concat(dfs)
+    else:
+        # Pass sources directly to cudf
+        return cudf.read_json(sources, **kwargs)
+
+
+def read_json(
+    url_path,
+    engine="auto",
+    blocksize=None,
+    orient="records",
+    lines=None,
+    compression="infer",
+    aggregate_files=True,
+    **kwargs,
+):
     """Read JSON data into a :class:`.DataFrame`.
 
     This function wraps :func:`dask.dataframe.read_json`, and passes
@@ -30,7 +86,13 @@ def read_json(url_path, engine="auto", **kwargs):
         data. The default value is "auto", so that
         ``engine=partial(cudf.read_json, engine="auto")`` will be
         passed to :func:`dask.dataframe.read_json` by default.
-
+    aggregate_files : bool or int
+        Whether to map multiple files to each output partition. If True,
+        the `blocksize` argument will be used to determine the number of
+        files in each partition. If any one file is larger than `blocksize`,
+        the `aggregate_files` argument will be ignored. If an integer value
+        is specified, the `blocksize` argument will be ignored, and that
+        number of files will be mapped to each partition. Default is True.
     **kwargs :
         Key-word arguments to pass through to :func:`dask.dataframe.read_json`.
 
@@ -60,9 +122,77 @@ def read_json(url_path, engine="auto", **kwargs):
 
     """
 
-    # TODO: Add optimized code path to leverage the
-    # `byte_range` argument in `cudf.read_json` for
-    # local storage (see `dask_cudf.read_csv`)
+    if lines is None:
+        lines = orient == "records"
+    if orient != "records" and lines:
+        raise ValueError(
+            'Line-delimited JSON is only available with orient="records".'
+        )
+    if blocksize and (orient != "records" or not lines):
+        raise ValueError(
+            "JSON file chunking only allowed for JSON-lines"
+            "input (orient='records', lines=True)."
+        )
+
+    inputs = []
+    if aggregate_files and blocksize or int(aggregate_files) > 1:
+        # Attempt custom read if we are mapping multiple files
+        # to each output partition. Otherwise, upstream logic
+        # is sufficient.
+
+        storage_options = kwargs.get("storage_options", {})
+        fs, _, paths = get_fs_token_paths(
+            url_path, mode="rb", storage_options=storage_options
+        )
+        if isinstance(aggregate_files, int) and aggregate_files > 1:
+            # Map a static file count to each partition
+            inputs = [
+                paths[offset : offset + aggregate_files]
+                for offset in range(0, len(paths), aggregate_files)
+            ]
+        elif aggregate_files is True and blocksize:
+            # Map files dynamically (using blocksize)
+            file_sizes = fs.sizes(paths)  # NOTE: This can be slow
+            blocksize = parse_bytes(blocksize)
+            if all([file_size <= blocksize for file_size in file_sizes]):
+                counts = np.unique(
+                    np.floor(np.cumsum(file_sizes) / blocksize),
+                    return_counts=True,
+                )[1]
+                offsets = np.concatenate([[0], counts.cumsum()])
+                inputs = [
+                    paths[offsets[i] : offsets[i + 1]]
+                    for i in range(len(offsets) - 1)
+                ]
+
+    if inputs:
+        # Inputs were successfully populated.
+        # Use custom _read_json_partition function
+        # to generate each partition.
+
+        compression = get_compression(
+            url_path[0] if isinstance(url_path, list) else url_path,
+            compression,
+        )
+        _kwargs = dict(
+            orient=orient,
+            lines=lines,
+            compression=compression,
+            include_path_column=kwargs.get("include_path_column", False),
+            path_converter=kwargs.get("path_converter"),
+        )
+        if not _is_local_filesystem(fs):
+            _kwargs["fs"] = fs
+        # TODO: Generate meta more efficiently
+        meta = _read_json_partition(inputs[0][:1], **_kwargs)
+        return dask.dataframe.from_map(
+            _read_json_partition,
+            inputs,
+            meta=meta,
+            **_kwargs,
+        )
+
+    # Fall back to dask.dataframe.read_json
     return _default_backend(
         dask.dataframe.read_json,
         url_path,
@@ -71,5 +201,9 @@ def read_json(url_path, engine="auto", **kwargs):
             if isinstance(engine, str)
             else engine
         ),
+        blocksize=blocksize,
+        orient=orient,
+        lines=lines,
+        compression=compression,
         **kwargs,
     )
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_json.py b/python/dask_cudf/dask_cudf/io/tests/test_json.py
index dc780478794..abafbffd197 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_json.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_json.py
@@ -1,5 +1,6 @@
 # Copyright (c) 2019-2024, NVIDIA CORPORATION.
 
+import math
 import os
 
 import pandas as pd
@@ -97,3 +98,31 @@ def test_read_json_nested(tmp_path):
         # Ensure not passing kwargs also reads the file.
         actual = dask_cudf.read_json(f)
         dd.assert_eq(actual, actual_pd)
+
+
+def test_read_json_aggregate_files(tmp_path):
+    df1 = dask.datasets.timeseries(
+        dtypes={"x": int, "y": int}, freq="120s"
+    ).reset_index(drop=True)
+    json_path = str(tmp_path / "data-*.json")
+    df1.to_json(json_path)
+
+    df2 = dask_cudf.read_json(json_path, aggregate_files=2)
+    assert df2.npartitions == math.ceil(df1.npartitions / 2)
+    dd.assert_eq(df1, df2, check_index=False)
+
+    df2 = dask_cudf.read_json(
+        json_path, aggregate_files=True, blocksize="1GiB"
+    )
+    assert df2.npartitions == 1
+    dd.assert_eq(df1, df2, check_index=False)
+
+    for include_path_column, name in [(True, "path"), ("file", "file")]:
+        df2 = dask_cudf.read_json(
+            json_path,
+            aggregate_files=2,
+            include_path_column=include_path_column,
+        )
+        assert name in df2.columns
+        assert len(df2[name].compute().unique()) == df1.npartitions
+        dd.assert_eq(df1, df2.drop(columns=[name]), check_index=False)

From 1889c7c0f517c95143016a6e391275144a034f7a Mon Sep 17 00:00:00 2001
From: Sebastian Berg <sebastianb@nvidia.com>
Date: Mon, 15 Jul 2024 20:32:15 +0200
Subject: [PATCH 509/842] MAINT: Adapt to NumPy 2 promotion changes (#16141)

Splitting out the non API changes from gh-15897, the Scalar API change is required for the tests to pass with NumPy 2, but almost all changes should be relatively straight forward here on their own.

(I will add inline comments.)

---

This PR does not fix integer comparisons, there are currently no tests that run into these.

xref: https://github.com/rapidsai/build-planning/issues/38

Authors:
  - Sebastian Berg (https://github.com/seberg)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/16141
---
 python/cudf/cudf/core/_internals/where.py   | 24 +++++++++++-------
 python/cudf/cudf/core/column/categorical.py |  4 ++-
 python/cudf/cudf/core/column/numerical.py   | 27 ++++++++++++++++-----
 python/cudf/cudf/tests/test_binops.py       | 21 +++++++++++++---
 python/cudf/cudf/tests/test_doctests.py     | 13 +++++++++-
 python/cudf/cudf/tests/test_dtypes.py       |  1 -
 6 files changed, 69 insertions(+), 21 deletions(-)

diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py
index 44ce0ddef25..f3183e6029d 100644
--- a/python/cudf/cudf/core/_internals/where.py
+++ b/python/cudf/cudf/core/_internals/where.py
@@ -54,13 +54,17 @@ def _check_and_cast_columns_with_other(
 
     other_is_scalar = is_scalar(other)
     if other_is_scalar:
-        if (isinstance(other, float) and not np.isnan(other)) and (
-            source_dtype.type(other) != other
-        ):
-            raise TypeError(
-                f"Cannot safely cast non-equivalent "
-                f"{type(other).__name__} to {source_dtype.name}"
-            )
+        if isinstance(other, float) and not np.isnan(other):
+            try:
+                is_safe = source_dtype.type(other) == other
+            except OverflowError:
+                is_safe = False
+
+            if not is_safe:
+                raise TypeError(
+                    f"Cannot safely cast non-equivalent "
+                    f"{type(other).__name__} to {source_dtype.name}"
+                )
 
         if cudf.utils.utils.is_na_like(other):
             return _normalize_categorical(
@@ -84,8 +88,10 @@ def _check_and_cast_columns_with_other(
             )
         return _normalize_categorical(source_col, other.astype(source_dtype))
 
-    if _is_non_decimal_numeric_dtype(source_dtype) and _can_cast(
-        other, source_dtype
+    if (
+        _is_non_decimal_numeric_dtype(source_dtype)
+        and not other_is_scalar  # can-cast fails for Python scalars
+        and _can_cast(other, source_dtype)
     ):
         common_dtype = source_dtype
     elif (
diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index f763d3b4b0c..9aaccca349d 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -47,7 +47,9 @@
     )
 
 
-_DEFAULT_CATEGORICAL_VALUE = -1
+# Using np.int8(-1) to allow silent wrap-around when casting to uint
+# it may make sense to make this dtype specific or a function.
+_DEFAULT_CATEGORICAL_VALUE = np.int8(-1)
 
 
 class CategoricalAccessor(ColumnMethods):
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index a0550bff72b..b8fa00e9643 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -301,15 +301,28 @@ def normalize_binop_value(
         if isinstance(other, cudf.Scalar):
             if self.dtype == other.dtype:
                 return other
+
             # expensive device-host transfer just to
             # adjust the dtype
             other = other.value
+
+            # NumPy 2 needs a Python scalar to do weak promotion, but
+            # pandas forces weak promotion always
+            # TODO: We could use 0, 0.0, and 0j for promotion to avoid copies.
+            if other.dtype.kind in "ifc":
+                other = other.item()
+        elif not isinstance(other, (int, float, complex)):
+            # Go via NumPy to get the value
+            other = np.array(other)
+            if other.dtype.kind in "ifc":
+                other = other.item()
+
         # Try and match pandas and hence numpy. Deduce the common
-        # dtype via the _value_ of other, and the dtype of self. TODO:
-        # When NEP50 is accepted, this might want changed or
-        # simplified.
-        # This is not at all simple:
-        # np.result_type(np.int64(0), np.uint8)
+        # dtype via the _value_ of other, and the dtype of self on NumPy 1.x
+        # with NumPy 2, we force weak promotion even for our/NumPy scalars
+        # to match pandas 2.2.
+        # Weak promotion is not at all simple:
+        # np.result_type(0, np.uint8)
         #   => np.uint8
         # np.result_type(np.asarray([0], dtype=np.int64), np.uint8)
         #   => np.int64
@@ -626,7 +639,9 @@ def can_cast_safely(self, to_dtype: DtypeObj) -> bool:
             min_, max_ = iinfo.min, iinfo.max
 
             # best we can do is hope to catch it here and avoid compare
-            if (self.min() >= min_) and (self.max() <= max_):
+            # Use Python floats, which have precise comparison for float64.
+            # NOTE(seberg): it would make sense to limit to the mantissa range.
+            if (float(self.min()) >= min_) and (float(self.max()) <= max_):
                 filled = self.fillna(0)
                 return (cudf.Series(filled) % 1 == 0).all()
             else:
diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py
index 7d8c3b53115..5265278db4c 100644
--- a/python/cudf/cudf/tests/test_binops.py
+++ b/python/cudf/cudf/tests/test_binops.py
@@ -539,7 +539,14 @@ def test_series_reflected_ops_scalar(func, dtype, obj_class):
     if obj_class == "Index":
         gs = Index(gs)
 
-    gs_result = func(gs)
+    try:
+        gs_result = func(gs)
+    except OverflowError:
+        # An error is fine, if pandas raises the same error:
+        with pytest.raises(OverflowError):
+            func(random_series)
+
+        return
 
     # class typing
     if obj_class == "Index":
@@ -589,7 +596,14 @@ def test_series_reflected_ops_cudf_scalar(funcs, dtype, obj_class):
     if obj_class == "Index":
         gs = Index(gs)
 
-    gs_result = gpu_func(gs)
+    try:
+        gs_result = gpu_func(gs)
+    except OverflowError:
+        # An error is fine, if pandas raises the same error:
+        with pytest.raises(OverflowError):
+            cpu_func(random_series)
+
+        return
 
     # class typing
     if obj_class == "Index":
@@ -770,7 +784,8 @@ def test_operator_func_series_and_scalar(
         fill_value=fill_value,
     )
     pdf_series_result = getattr(pdf_series, func)(
-        scalar, fill_value=fill_value
+        np.array(scalar)[()] if use_cudf_scalar else scalar,
+        fill_value=fill_value,
     )
 
     assert_eq(pdf_series_result, gdf_series_result)
diff --git a/python/cudf/cudf/tests/test_doctests.py b/python/cudf/cudf/tests/test_doctests.py
index 0da5c6b04d6..794660cffcb 100644
--- a/python/cudf/cudf/tests/test_doctests.py
+++ b/python/cudf/cudf/tests/test_doctests.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 import contextlib
 import doctest
 import inspect
@@ -8,6 +8,7 @@
 
 import numpy as np
 import pytest
+from packaging import version
 
 import cudf
 
@@ -80,6 +81,16 @@ def chdir_to_tmp_path(cls, tmp_path):
         yield
         os.chdir(original_directory)
 
+    @pytest.fixture(autouse=True)
+    def prinoptions(cls):
+        # TODO: NumPy now prints scalars as `np.int8(1)`, etc. this should
+        #       be adapted evantually.
+        if version.parse(np.__version__) >= version.parse("2.0"):
+            with np.printoptions(legacy="1.25"):
+                yield
+        else:
+            yield
+
     @pytest.mark.parametrize(
         "docstring",
         itertools.chain(*[_find_doctests_in_obj(mod) for mod in tests]),
diff --git a/python/cudf/cudf/tests/test_dtypes.py b/python/cudf/cudf/tests/test_dtypes.py
index edb534a3618..c62b5889fdd 100644
--- a/python/cudf/cudf/tests/test_dtypes.py
+++ b/python/cudf/cudf/tests/test_dtypes.py
@@ -341,7 +341,6 @@ def test_dtype(in_dtype, expect):
         np.complex128,
         complex,
         "S",
-        "a",
         "V",
         "float16",
         np.float16,

From 128f0c917bbc3342f9eca12ca2bf714c88206256 Mon Sep 17 00:00:00 2001
From: Sebastian Berg <sebastianb@nvidia.com>
Date: Mon, 15 Jul 2024 20:34:14 +0200
Subject: [PATCH 510/842] API: Check for integer overflows when creating scalar
 form python int (#16140)

This aligns with NumPy, which deprecated this since a while and raises an error now on NumPy 2, for example for `Scalar(-1, dtype=np.uint8)`.

Since it aligns with NumPy, the DeprecationWarning of earlier NumPy versions is inherited for those.

This (or similar handling) is required to be compatible with NumPy 2/pandas, since the default needs to be to reject operation when values are out of bounds for e.g. `uint8_series + 1000`, the 1000 should not be silently cast to a `uint8`.

---

Split from gh-15897

xref: https://github.com/rapidsai/build-planning/issues/38

Authors:
  - Sebastian Berg (https://github.com/seberg)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/16140
---
 python/cudf/cudf/tests/test_scalar.py | 17 +++++++++++++++++
 python/cudf/cudf/tests/test_unaops.py |  5 ++++-
 python/cudf/cudf/utils/dtypes.py      | 14 ++++++++------
 3 files changed, 29 insertions(+), 7 deletions(-)

diff --git a/python/cudf/cudf/tests/test_scalar.py b/python/cudf/cudf/tests/test_scalar.py
index 05a91a8fea3..195231e9960 100644
--- a/python/cudf/cudf/tests/test_scalar.py
+++ b/python/cudf/cudf/tests/test_scalar.py
@@ -8,6 +8,7 @@
 import pandas as pd
 import pyarrow as pa
 import pytest
+from packaging import version
 
 import rmm
 
@@ -253,6 +254,22 @@ def test_generic_null_scalar_construction_fails(value):
         cudf.Scalar(value)
 
 
+@pytest.mark.parametrize(
+    "value, dtype", [(1000, "uint8"), (2**30, "int16"), (-1, "uint16")]
+)
+@pytest.mark.filterwarnings("ignore::DeprecationWarning")
+def test_scalar_out_of_bounds_pyint_fails(value, dtype):
+    # Test that we align with NumPy on scalar creation behavior from
+    # Python integers.
+    if version.parse(np.__version__) >= version.parse("2.0"):
+        with pytest.raises(OverflowError):
+            cudf.Scalar(value, dtype)
+    else:
+        # NumPy allowed this, but it gives a DeprecationWarning on newer
+        # versions (which cudf did not used to do).
+        assert cudf.Scalar(value, dtype).value == np.dtype(dtype).type(value)
+
+
 @pytest.mark.parametrize(
     "dtype", NUMERIC_TYPES + DATETIME_TYPES + TIMEDELTA_TYPES + ["object"]
 )
diff --git a/python/cudf/cudf/tests/test_unaops.py b/python/cudf/cudf/tests/test_unaops.py
index dbbf4fba3a6..5f5d79c1dce 100644
--- a/python/cudf/cudf/tests/test_unaops.py
+++ b/python/cudf/cudf/tests/test_unaops.py
@@ -81,7 +81,10 @@ def generate_valid_scalar_unaop_combos():
 @pytest.mark.parametrize("slr,dtype,op", generate_valid_scalar_unaop_combos())
 def test_scalar_unary_operations(slr, dtype, op):
     slr_host = np.array([slr])[0].astype(cudf.dtype(dtype))
-    slr_device = cudf.Scalar(slr, dtype=dtype)
+    # The scalar may be out of bounds, so go via array force-cast
+    # NOTE: This is a change in behavior
+    slr = np.array(slr).astype(dtype)[()]
+    slr_device = cudf.Scalar(slr)
 
     expect = op(slr_host)
     got = op(slr_device)
diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py
index 2aa3129ab30..0dec857ea96 100644
--- a/python/cudf/cudf/utils/dtypes.py
+++ b/python/cudf/cudf/utils/dtypes.py
@@ -253,16 +253,18 @@ def to_cudf_compatible_scalar(val, dtype=None):
     elif isinstance(val, datetime.timedelta):
         val = np.timedelta64(val)
 
-    val = _maybe_convert_to_default_type(
-        cudf.api.types.pandas_dtype(type(val))
-    ).type(val)
-
     if dtype is not None:
-        if isinstance(val, str) and np.dtype(dtype).kind == "M":
+        dtype = np.dtype(dtype)
+        if isinstance(val, str) and dtype.kind == "M":
             # pd.Timestamp can handle str, but not np.str_
             val = pd.Timestamp(str(val)).to_datetime64().astype(dtype)
         else:
-            val = val.astype(dtype)
+            # At least datetimes cannot be converted to scalar via dtype.type:
+            val = np.array(val, dtype)[()]
+    else:
+        val = _maybe_convert_to_default_type(
+            cudf.api.types.pandas_dtype(type(val))
+        ).type(val)
 
     if val.dtype.type is np.datetime64:
         time_unit, _ = np.datetime_data(val.dtype)

From ceb73d91c090882ec69642a78b7d791a1bf220fe Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Mon, 15 Jul 2024 12:45:51 -0700
Subject: [PATCH 511/842] Make nvcomp adapter compatible with new version
 macros (#16245)

New nvcomp version changed the names of the version macros. This PR adds "aliasing" to the old names so rest of the code is not affected.

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - MithunR (https://github.com/mythrocks)
  - Muhammad Haseeb (https://github.com/mhaseeb123)

URL: https://github.com/rapidsai/cudf/pull/16245
---
 cpp/src/io/comp/nvcomp_adapter.cpp | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp
index 0e34c96debd..5d0c6a8c83b 100644
--- a/cpp/src/io/comp/nvcomp_adapter.cpp
+++ b/cpp/src/io/comp/nvcomp_adapter.cpp
@@ -37,6 +37,13 @@
 #include NVCOMP_ZSTD_HEADER
 #endif
 
+// When building with nvcomp 4.0 or newer, map the new version macros to the old ones
+#ifndef NVCOMP_MAJOR_VERSION
+#define NVCOMP_MAJOR_VERSION NVCOMP_VER_MAJOR
+#define NVCOMP_MINOR_VERSION NVCOMP_VER_MINOR
+#define NVCOMP_PATCH_VERSION NVCOMP_VER_PATCH
+#endif
+
 #define NVCOMP_HAS_ZSTD_DECOMP(MAJOR, MINOR, PATCH) (MAJOR > 2 or (MAJOR == 2 and MINOR >= 3))
 
 #define NVCOMP_HAS_ZSTD_COMP(MAJOR, MINOR, PATCH) (MAJOR > 2 or (MAJOR == 2 and MINOR >= 4))

From 04330f2e9e73ac71a86666c55d0fe7248eaf8db6 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 15 Jul 2024 10:23:07 -1000
Subject: [PATCH 512/842] Fix convert_dtypes with
 convert_integer=False/convert_floating=True (#15964)

If `convert_integer=False`, there should be no attempt to convert to integer

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15964
---
 python/cudf/cudf/core/indexed_frame.py        | 34 +++++++++++--------
 .../cudf/cudf/tests/series/test_conversion.py | 13 +++++++
 2 files changed, 32 insertions(+), 15 deletions(-)

diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 63fa96d0db0..30b68574960 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -6235,13 +6235,13 @@ def rank(
 
     def convert_dtypes(
         self,
-        infer_objects=True,
-        convert_string=True,
-        convert_integer=True,
-        convert_boolean=True,
-        convert_floating=True,
+        infer_objects: bool = True,
+        convert_string: bool = True,
+        convert_integer: bool = True,
+        convert_boolean: bool = True,
+        convert_floating: bool = True,
         dtype_backend=None,
-    ):
+    ) -> Self:
         """
         Convert columns to the best possible nullable dtypes.
 
@@ -6252,17 +6252,21 @@ def convert_dtypes(
         All other dtypes are always returned as-is as all dtypes in
         cudf are nullable.
         """
-        result = self.copy()
-
-        if convert_floating:
-            # cast any floating columns to int64 if
-            # they are all integer data:
-            for name, col in result._data.items():
+        if not (convert_floating and convert_integer):
+            return self.copy()
+        else:
+            cols = []
+            for col in self._columns:
                 if col.dtype.kind == "f":
                     col = col.fillna(0)
-                    if cp.allclose(col, col.astype("int64")):
-                        result._data[name] = col.astype("int64")
-        return result
+                    as_int = col.astype("int64")
+                    if cp.allclose(col, as_int):
+                        cols.append(as_int)
+                        continue
+                cols.append(col)
+            return self._from_data_like_self(
+                self._data._from_columns_like_self(cols, verify=False)
+            )
 
     @_warn_no_dask_cudf
     def __dask_tokenize__(self):
diff --git a/python/cudf/cudf/tests/series/test_conversion.py b/python/cudf/cudf/tests/series/test_conversion.py
index e1dd359e1ba..1d680d7860d 100644
--- a/python/cudf/cudf/tests/series/test_conversion.py
+++ b/python/cudf/cudf/tests/series/test_conversion.py
@@ -31,5 +31,18 @@ def test_convert_dtypes(data, dtype):
     assert_eq(expect, got)
 
 
+def test_convert_integer_false_convert_floating_true():
+    data = [1.000000000000000000000000001, 1]
+    expected = pd.Series(data).convert_dtypes(
+        convert_integer=False, convert_floating=True
+    )
+    result = (
+        cudf.Series(data)
+        .convert_dtypes(convert_integer=False, convert_floating=True)
+        .to_pandas(nullable=True)
+    )
+    assert_eq(result, expected)
+
+
 # Now write the same test, but construct a DataFrame
 # as input instead of parametrizing:

From dba46e7a8957b8389b69e820485e319a1d314017 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 15 Jul 2024 15:21:50 -1000
Subject: [PATCH 513/842] Replace is_datetime/timedelta_dtype checks with .kind
 checks (#16262)

It appears this was called when we already had a dtype object so can instead just simply check the .kind attribute

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16262
---
 python/cudf/cudf/_fuzz_testing/utils.py   |  3 +-
 python/cudf/cudf/core/column/datetime.py  |  7 +--
 python/cudf/cudf/core/column/timedelta.py |  4 +-
 python/cudf/cudf/core/dataframe.py        |  7 ++-
 python/cudf/cudf/core/scalar.py           |  8 +---
 python/cudf/cudf/core/tools/numeric.py    |  9 +---
 python/cudf/cudf/tests/test_binops.py     |  7 +--
 python/cudf/cudf/tests/test_dataframe.py  |  4 +-
 python/cudf/cudf/tests/test_list.py       |  7 +--
 python/cudf/cudf/tests/test_scalar.py     | 11 +----
 python/cudf/cudf/utils/dtypes.py          | 56 ++++++++---------------
 11 files changed, 37 insertions(+), 86 deletions(-)

diff --git a/python/cudf/cudf/_fuzz_testing/utils.py b/python/cudf/cudf/_fuzz_testing/utils.py
index e6dfe2eae62..8ce92e1c0f6 100644
--- a/python/cudf/cudf/_fuzz_testing/utils.py
+++ b/python/cudf/cudf/_fuzz_testing/utils.py
@@ -192,8 +192,7 @@ def convert_nulls_to_none(records, df):
         col
         for col in df.columns
         if df[col].dtype in pandas_dtypes_to_np_dtypes
-        or pd.api.types.is_datetime64_dtype(df[col].dtype)
-        or pd.api.types.is_timedelta64_dtype(df[col].dtype)
+        or df[col].dtype.kind in "mM"
     ]
 
     for record in records:
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index 214e84028d2..409c44f6eee 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -18,7 +18,6 @@
 from cudf import _lib as libcudf
 from cudf._lib.labeling import label_bins
 from cudf._lib.search import search_sorted
-from cudf.api.types import is_datetime64_dtype, is_timedelta64_dtype
 from cudf.core._compat import PANDAS_GE_220
 from cudf.core._internals.timezones import (
     check_ambiguous_and_nonexistent,
@@ -565,10 +564,8 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
 
         # We check this on `other` before reflection since we already know the
         # dtype of `self`.
-        other_is_timedelta = is_timedelta64_dtype(other.dtype)
-        other_is_datetime64 = not other_is_timedelta and is_datetime64_dtype(
-            other.dtype
-        )
+        other_is_timedelta = other.dtype.kind == "m"
+        other_is_datetime64 = other.dtype.kind == "M"
         lhs, rhs = (other, self) if reflect else (self, other)
         out_dtype = None
 
diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py
index 2cbed9212de..36d7d9f9614 100644
--- a/python/cudf/cudf/core/column/timedelta.py
+++ b/python/cudf/cudf/core/column/timedelta.py
@@ -12,7 +12,7 @@
 
 import cudf
 from cudf import _lib as libcudf
-from cudf.api.types import is_scalar, is_timedelta64_dtype
+from cudf.api.types import is_scalar
 from cudf.core.buffer import Buffer, acquire_spill_lock
 from cudf.core.column import ColumnBase, column, string
 from cudf.utils.dtypes import np_to_pa_dtype
@@ -153,7 +153,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
         this: ColumnBinaryOperand = self
         out_dtype = None
 
-        if is_timedelta64_dtype(other.dtype):
+        if other.dtype.kind == "m":
             # TODO: pandas will allow these operators to work but return false
             # when comparing to non-timedelta dtypes. We should do the same.
             if op in {
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index f110b788789..2aa1b95e2d1 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -33,7 +33,6 @@
 from cudf.api.types import (
     _is_scalar_or_zero_d_array,
     is_bool_dtype,
-    is_datetime_dtype,
     is_dict_like,
     is_dtype_equal,
     is_list_like,
@@ -6113,7 +6112,7 @@ def _prepare_for_rowwise_op(self, method, skipna, numeric_only):
         else:
             filtered = self.copy(deep=False)
 
-        is_pure_dt = all(is_datetime_dtype(dt) for dt in filtered.dtypes)
+        is_pure_dt = all(dt.kind == "M" for dt in filtered.dtypes)
 
         common_dtype = find_common_type(filtered.dtypes)
         if (
@@ -6510,7 +6509,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs):
                         cudf.utils.dtypes.get_min_float_dtype(
                             prepared._data[col]
                         )
-                        if not is_datetime_dtype(common_dtype)
+                        if common_dtype.kind != "M"
                         else cudf.dtype("float64")
                     )
                     .fillna(np.nan)
@@ -6537,7 +6536,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs):
             result_dtype = (
                 common_dtype
                 if method in type_coerced_methods
-                or is_datetime_dtype(common_dtype)
+                or (common_dtype is not None and common_dtype.kind == "M")
                 else None
             )
             result = column.as_column(result, dtype=result_dtype)
diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py
index 29460d8c67e..f6331aa1f49 100644
--- a/python/cudf/cudf/core/scalar.py
+++ b/python/cudf/cudf/core/scalar.py
@@ -8,7 +8,7 @@
 import pyarrow as pa
 
 import cudf
-from cudf.api.types import is_datetime64_dtype, is_scalar, is_timedelta64_dtype
+from cudf.api.types import is_scalar
 from cudf.core.dtypes import ListDtype, StructDtype
 from cudf.core.missing import NA, NaT
 from cudf.core.mixins import BinaryOperand
@@ -245,11 +245,7 @@ def _preprocess_host_value(self, value, dtype):
             dtype = cudf.dtype(dtype)
 
         if not valid:
-            value = (
-                NaT
-                if is_datetime64_dtype(dtype) or is_timedelta64_dtype(dtype)
-                else NA
-            )
+            value = NaT if dtype.kind in "mM" else NA
 
         return value, dtype
 
diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py
index ef6b86a04a7..466d46f7dca 100644
--- a/python/cudf/cudf/core/tools/numeric.py
+++ b/python/cudf/cudf/core/tools/numeric.py
@@ -8,12 +8,7 @@
 import cudf
 from cudf import _lib as libcudf
 from cudf._lib import strings as libstrings
-from cudf.api.types import (
-    _is_non_decimal_numeric_dtype,
-    is_datetime_dtype,
-    is_string_dtype,
-    is_timedelta_dtype,
-)
+from cudf.api.types import _is_non_decimal_numeric_dtype, is_string_dtype
 from cudf.core.column import as_column
 from cudf.core.dtypes import CategoricalDtype
 from cudf.utils.dtypes import can_convert_to_column
@@ -114,7 +109,7 @@ def to_numeric(arg, errors="raise", downcast=None):
     col = as_column(arg)
     dtype = col.dtype
 
-    if is_datetime_dtype(dtype) or is_timedelta_dtype(dtype):
+    if dtype.kind in "mM":
         col = col.astype(cudf.dtype("int64"))
     elif isinstance(dtype, CategoricalDtype):
         cat_dtype = col.dtype.type
diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py
index 5265278db4c..503b1a975b4 100644
--- a/python/cudf/cudf/tests/test_binops.py
+++ b/python/cudf/cudf/tests/test_binops.py
@@ -1694,12 +1694,7 @@ def test_scalar_null_binops(op, dtype_l, dtype_r):
     rhs = cudf.Scalar(cudf.NA, dtype=dtype_r)
 
     result = op(lhs, rhs)
-    assert result.value is (
-        cudf.NaT
-        if cudf.api.types.is_datetime64_dtype(result.dtype)
-        or cudf.api.types.is_timedelta64_dtype(result.dtype)
-        else cudf.NA
-    )
+    assert result.value is (cudf.NaT if result.dtype.kind in "mM" else cudf.NA)
 
     # make sure dtype is the same as had there been a valid scalar
     valid_lhs = cudf.Scalar(1, dtype=dtype_l)
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index f40106a30f4..7ccf83e424c 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -5457,9 +5457,7 @@ def test_rowwise_ops_datetime_dtypes(data, op, skipna, numeric_only):
     gdf = cudf.DataFrame(data)
     pdf = gdf.to_pandas()
 
-    if not numeric_only and not all(
-        cudf.api.types.is_datetime64_dtype(dt) for dt in gdf.dtypes
-    ):
+    if not numeric_only and not all(dt.kind == "M" for dt in gdf.dtypes):
         with pytest.raises(TypeError):
             got = getattr(gdf, op)(
                 axis=1, skipna=skipna, numeric_only=numeric_only
diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py
index ec9d7995b05..36bcaa66d7d 100644
--- a/python/cudf/cudf/tests/test_list.py
+++ b/python/cudf/cudf/tests/test_list.py
@@ -694,12 +694,7 @@ def test_list_scalar_host_construction_null(elem_type, nesting_level):
         dtype = cudf.ListDtype(dtype)
 
     slr = cudf.Scalar(None, dtype=dtype)
-    assert slr.value is (
-        cudf.NaT
-        if cudf.api.types.is_datetime64_dtype(slr.dtype)
-        or cudf.api.types.is_timedelta64_dtype(slr.dtype)
-        else cudf.NA
-    )
+    assert slr.value is (cudf.NaT if slr.dtype.kind in "mM" else cudf.NA)
 
 
 @pytest.mark.parametrize(
diff --git a/python/cudf/cudf/tests/test_scalar.py b/python/cudf/cudf/tests/test_scalar.py
index 195231e9960..f2faf4343b6 100644
--- a/python/cudf/cudf/tests/test_scalar.py
+++ b/python/cudf/cudf/tests/test_scalar.py
@@ -212,9 +212,7 @@ def test_scalar_roundtrip(value):
 )
 def test_null_scalar(dtype):
     s = cudf.Scalar(None, dtype=dtype)
-    if cudf.api.types.is_datetime64_dtype(
-        dtype
-    ) or cudf.api.types.is_timedelta64_dtype(dtype):
+    if s.dtype.kind in "mM":
         assert s.value is cudf.NaT
     else:
         assert s.value is cudf.NA
@@ -369,12 +367,7 @@ def test_scalar_implicit_int_conversion(value):
 @pytest.mark.parametrize("dtype", sorted(set(ALL_TYPES) - {"category"}))
 def test_scalar_invalid_implicit_conversion(cls, dtype):
     try:
-        cls(
-            pd.NaT
-            if cudf.api.types.is_datetime64_dtype(dtype)
-            or cudf.api.types.is_timedelta64_dtype(dtype)
-            else pd.NA
-        )
+        cls(pd.NaT if cudf.dtype(dtype).kind in "mM" else pd.NA)
     except TypeError as e:
         with pytest.raises(TypeError, match=re.escape(str(e))):
             slr = cudf.Scalar(None, dtype=dtype)
diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py
index 0dec857ea96..59e5ec1df04 100644
--- a/python/cudf/cudf/utils/dtypes.py
+++ b/python/cudf/cudf/utils/dtypes.py
@@ -424,9 +424,7 @@ def get_time_unit(obj):
 
 def _get_nan_for_dtype(dtype):
     dtype = cudf.dtype(dtype)
-    if pd.api.types.is_datetime64_dtype(
-        dtype
-    ) or pd.api.types.is_timedelta64_dtype(dtype):
+    if dtype.kind in "mM":
         time_unit, _ = np.datetime_data(dtype)
         return dtype.type("nat", time_unit)
     elif dtype.kind == "f":
@@ -527,16 +525,14 @@ def find_common_type(dtypes):
             return cudf.dtype("O")
 
     # Aggregate same types
-    dtypes = set(dtypes)
+    dtypes = {cudf.dtype(dtype) for dtype in dtypes}
+    if len(dtypes) == 1:
+        return dtypes.pop()
 
     if any(
         isinstance(dtype, cudf.core.dtypes.DecimalDtype) for dtype in dtypes
     ):
-        if all(
-            cudf.api.types.is_decimal_dtype(dtype)
-            or cudf.api.types.is_numeric_dtype(dtype)
-            for dtype in dtypes
-        ):
+        if all(cudf.api.types.is_numeric_dtype(dtype) for dtype in dtypes):
             return _find_common_type_decimal(
                 [
                     dtype
@@ -546,40 +542,28 @@ def find_common_type(dtypes):
             )
         else:
             return cudf.dtype("O")
-    if any(isinstance(dtype, cudf.ListDtype) for dtype in dtypes):
-        if len(dtypes) == 1:
-            return dtypes.get(0)
-        else:
-            # TODO: As list dtypes allow casting
-            # to identical types, improve this logic of returning a
-            # common dtype, for example:
-            # ListDtype(int64) & ListDtype(int32) common
-            # dtype could be ListDtype(int64).
-            raise NotImplementedError(
-                "Finding a common type for `ListDtype` is currently "
-                "not supported"
-            )
-    if any(isinstance(dtype, cudf.StructDtype) for dtype in dtypes):
-        if len(dtypes) == 1:
-            return dtypes.get(0)
-        else:
-            raise NotImplementedError(
-                "Finding a common type for `StructDtype` is currently "
-                "not supported"
-            )
+    elif any(
+        isinstance(dtype, (cudf.ListDtype, cudf.StructDtype))
+        for dtype in dtypes
+    ):
+        # TODO: As list dtypes allow casting
+        # to identical types, improve this logic of returning a
+        # common dtype, for example:
+        # ListDtype(int64) & ListDtype(int32) common
+        # dtype could be ListDtype(int64).
+        raise NotImplementedError(
+            "Finding a common type for `ListDtype` or `StructDtype` is currently "
+            "not supported"
+        )
 
     # Corner case 1:
     # Resort to np.result_type to handle "M" and "m" types separately
-    dt_dtypes = set(
-        filter(lambda t: cudf.api.types.is_datetime_dtype(t), dtypes)
-    )
+    dt_dtypes = set(filter(lambda t: t.kind == "M", dtypes))
     if len(dt_dtypes) > 0:
         dtypes = dtypes - dt_dtypes
         dtypes.add(np.result_type(*dt_dtypes))
 
-    td_dtypes = set(
-        filter(lambda t: pd.api.types.is_timedelta64_dtype(t), dtypes)
-    )
+    td_dtypes = set(filter(lambda t: t.kind == "m", dtypes))
     if len(td_dtypes) > 0:
         dtypes = dtypes - td_dtypes
         dtypes.add(np.result_type(*td_dtypes))

From 47a0a87db454cc767ab5f74beb2198a480d6f2c0 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 15 Jul 2024 16:13:29 -1000
Subject: [PATCH 514/842] Type & reduce cupy usage (#16277)

There are some cupy usages that don't seem _strictly_ necessary (generating starting data, array type conversion) in some APIs. IMO we should prefer using CPU data/the existing data structure/Column ops over cupy when possible

closes https://github.com/rapidsai/cudf/issues/12133

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16277
---
 python/cudf/cudf/core/_base_index.py      |  4 ++-
 python/cudf/cudf/core/column/column.py    |  8 +++---
 python/cudf/cudf/core/column/datetime.py  |  6 ++--
 python/cudf/cudf/core/column/numerical.py | 10 ++-----
 python/cudf/cudf/core/cut.py              |  6 ++--
 python/cudf/cudf/core/dataframe.py        | 18 +++++++-----
 python/cudf/cudf/core/frame.py            |  6 ++--
 python/cudf/cudf/core/groupby/groupby.py  | 23 ++++++++-------
 python/cudf/cudf/core/index.py            | 34 ++++++++++++-----------
 python/cudf/cudf/core/multiindex.py       | 13 +++++----
 python/cudf/cudf/core/tools/datetimes.py  |  9 +++---
 python/cudf/cudf/tests/test_datetime.py   | 15 ++--------
 12 files changed, 74 insertions(+), 78 deletions(-)

diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py
index e160fa697ee..9ba2d161619 100644
--- a/python/cudf/cudf/core/_base_index.py
+++ b/python/cudf/cudf/core/_base_index.py
@@ -38,6 +38,8 @@
 if TYPE_CHECKING:
     from collections.abc import Generator
 
+    import cupy
+
     from cudf.core.column_accessor import ColumnAccessor
 
 
@@ -2001,7 +2003,7 @@ def drop_duplicates(
             self._column_names,
         )
 
-    def duplicated(self, keep="first"):
+    def duplicated(self, keep="first") -> cupy.ndarray:
         """
         Indicate duplicate index values.
 
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index f633d527681..fd3664ecac4 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -721,7 +721,7 @@ def notnull(self) -> ColumnBase:
         return result
 
     def indices_of(
-        self, value: ScalarLike | Self
+        self, value: ScalarLike
     ) -> cudf.core.column.NumericalColumn:
         """
         Find locations of value in the column
@@ -735,10 +735,10 @@ def indices_of(
         -------
         Column of indices that match value
         """
-        if not isinstance(value, ColumnBase):
-            value = as_column([value], dtype=self.dtype)
+        if not is_scalar(value):
+            raise ValueError("value must be a scalar")
         else:
-            assert len(value) == 1
+            value = as_column(value, dtype=self.dtype, length=1)
         mask = libcudf.search.contains(value, self)
         return apply_boolean_mask(
             [as_column(range(0, len(self)), dtype=size_type_dtype)], mask
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index 409c44f6eee..004a059af95 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -629,9 +629,9 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
     def indices_of(
         self, value: ScalarLike
     ) -> cudf.core.column.NumericalColumn:
-        value = column.as_column(
-            pd.to_datetime(value), dtype=self.dtype
-        ).astype("int64")
+        value = (
+            pd.to_datetime(value).to_numpy().astype(self.dtype).astype("int64")
+        )
         return self.astype("int64").indices_of(value)
 
     @property
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index b8fa00e9643..7f05a5f91a1 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -5,7 +5,6 @@
 import functools
 from typing import TYPE_CHECKING, Any, Callable, Sequence, cast
 
-import cupy as cp
 import numpy as np
 import pandas as pd
 from typing_extensions import Self
@@ -13,7 +12,6 @@
 import cudf
 from cudf import _lib as libcudf
 from cudf._lib import pylibcudf
-from cudf._lib.types import size_type_dtype
 from cudf.api.types import (
     is_bool_dtype,
     is_float_dtype,
@@ -131,12 +129,8 @@ def indices_of(self, value: ScalarLike) -> NumericalColumn:
             and self.dtype.kind in {"c", "f"}
             and np.isnan(value)
         ):
-            return column.as_column(
-                cp.argwhere(
-                    cp.isnan(self.data_array_view(mode="read"))
-                ).flatten(),
-                dtype=size_type_dtype,
-            )
+            nan_col = libcudf.unary.is_nan(self)
+            return nan_col.indices_of(True)
         else:
             return super().indices_of(value)
 
diff --git a/python/cudf/cudf/core/cut.py b/python/cudf/cudf/core/cut.py
index d9f62f51f92..197f46ee9fe 100644
--- a/python/cudf/cudf/core/cut.py
+++ b/python/cudf/cudf/core/cut.py
@@ -188,9 +188,6 @@ def cut(
         # adjust bin edges decimal precision
         int_label_bins = np.around(bins, precision)
 
-    # the inputs is a column of the values in the array x
-    input_arr = as_column(x)
-
     # checking for the correct inclusivity values
     if right:
         closed = "right"
@@ -242,6 +239,9 @@ def cut(
                 labels if len(set(labels)) == len(labels) else None
             )
 
+    # the inputs is a column of the values in the array x
+    input_arr = as_column(x)
+
     if isinstance(bins, pd.IntervalIndex):
         # get the left and right edges of the bins as columns
         # we cannot typecast an IntervalIndex, so we need to
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 2aa1b95e2d1..2121e623c1c 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -429,7 +429,7 @@ def _setitem_tuple_arg(self, key, value):
 
             else:
                 value = cupy.asarray(value)
-                if cupy.ndim(value) == 2:
+                if value.ndim == 2:
                     # If the inner dimension is 1, it's broadcastable to
                     # all columns of the dataframe.
                     indexed_shape = columns_df.loc[key[0]].shape
@@ -566,7 +566,7 @@ def _setitem_tuple_arg(self, key, value):
             # TODO: consolidate code path with identical counterpart
             # in `_DataFrameLocIndexer._setitem_tuple_arg`
             value = cupy.asarray(value)
-            if cupy.ndim(value) == 2:
+            if value.ndim == 2:
                 indexed_shape = columns_df.iloc[key[0]].shape
                 if value.shape[1] == 1:
                     if value.shape[0] != indexed_shape[0]:
@@ -2199,8 +2199,8 @@ def from_dict(
 
         orient = orient.lower()
         if orient == "index":
-            if len(data) > 0 and isinstance(
-                next(iter(data.values())), (cudf.Series, cupy.ndarray)
+            if isinstance(
+                next(iter(data.values()), None), (cudf.Series, cupy.ndarray)
             ):
                 result = cls(data).T
                 result.columns = (
@@ -5698,7 +5698,13 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False):
 
     @classmethod
     @_performance_tracking
-    def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False):
+    def _from_arrays(
+        cls,
+        data: np.ndarray | cupy.ndarray,
+        index=None,
+        columns=None,
+        nan_as_null=False,
+    ):
         """Convert a numpy/cupy array to DataFrame.
 
         Parameters
@@ -5716,8 +5722,6 @@ def _from_arrays(cls, data, index=None, columns=None, nan_as_null=False):
         -------
         DataFrame
         """
-
-        data = cupy.asarray(data)
         if data.ndim != 1 and data.ndim != 2:
             raise ValueError(
                 f"records dimension expected 1 or 2 but found: {data.ndim}"
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 253d200f7d4..802751e47ad 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -1189,7 +1189,7 @@ def searchsorted(
         side: Literal["left", "right"] = "left",
         ascending: bool = True,
         na_position: Literal["first", "last"] = "last",
-    ):
+    ) -> ScalarLike | cupy.ndarray:
         """Find indices where elements should be inserted to maintain order
 
         Parameters
@@ -1527,7 +1527,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
     @acquire_spill_lock()
     def _apply_cupy_ufunc_to_operands(
         self, ufunc, cupy_func, operands, **kwargs
-    ):
+    ) -> list[dict[Any, ColumnBase]]:
         # Note: There are some operations that may be supported by libcudf but
         # are not supported by pandas APIs. In particular, libcudf binary
         # operations support logical and/or operations as well as
@@ -1538,7 +1538,7 @@ def _apply_cupy_ufunc_to_operands(
         # without cupy.
 
         mask = None
-        data = [{} for _ in range(ufunc.nout)]
+        data: list[dict[Any, ColumnBase]] = [{} for _ in range(ufunc.nout)]
         for name, (left, right, _, _) in operands.items():
             cupy_inputs = []
             for inp in (left, right) if ufunc.nin == 2 else (left,):
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index eccb3acabf6..8659d7c2392 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -35,7 +35,12 @@
 from cudf.utils.utils import GetAttrGetItemMixin
 
 if TYPE_CHECKING:
-    from cudf._typing import AggType, DataFrameOrSeries, MultiColumnAggType
+    from cudf._typing import (
+        AggType,
+        DataFrameOrSeries,
+        MultiColumnAggType,
+        ScalarLike,
+    )
 
 
 def _deprecate_collect():
@@ -357,7 +362,7 @@ def groups(self):
         )
 
     @cached_property
-    def indices(self):
+    def indices(self) -> dict[ScalarLike, cp.ndarray]:
         """
         Dict {group name -> group indices}.
 
@@ -1015,18 +1020,16 @@ def ngroup(self, ascending=True):
 
         if ascending:
             # Count ascending from 0 to num_groups - 1
-            group_ids = cudf.Series._from_data({None: cp.arange(num_groups)})
+            groups = range(num_groups)
         elif has_null_group:
             # Count descending from num_groups - 1 to 0, but subtract one more
             # for the null group making it num_groups - 2 to -1.
-            group_ids = cudf.Series._from_data(
-                {None: cp.arange(num_groups - 2, -2, -1)}
-            )
+            groups = range(num_groups - 2, -2, -1)
         else:
             # Count descending from num_groups - 1 to 0
-            group_ids = cudf.Series._from_data(
-                {None: cp.arange(num_groups - 1, -1, -1)}
-            )
+            groups = range(num_groups - 1, -1, -1)
+
+        group_ids = cudf.Series._from_data({None: as_column(groups)})
 
         if has_null_group:
             group_ids.iloc[-1] = cudf.NA
@@ -1713,7 +1716,7 @@ def rolling_avg(val, avg):
         return grouped_values.apply_chunks(function, **kwargs)
 
     @_performance_tracking
-    def _broadcast(self, values):
+    def _broadcast(self, values: cudf.Series) -> cudf.Series:
         """
         Broadcast the results of an aggregation to the group
 
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index b398ee2343e..4164f981fca 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -103,7 +103,7 @@ def __subclasscheck__(self, subclass):
 
 def _lexsorted_equal_range(
     idx: Index | cudf.MultiIndex,
-    key_as_table: Frame,
+    keys: list[ColumnBase],
     is_sorted: bool,
 ) -> tuple[int, int, ColumnBase | None]:
     """Get equal range for key in lexicographically sorted index. If index
@@ -118,13 +118,13 @@ def _lexsorted_equal_range(
         sort_vals = idx
     lower_bound = search_sorted(
         [*sort_vals._data.columns],
-        [*key_as_table._columns],
+        keys,
         side="left",
         ascending=sort_vals.is_monotonic_increasing,
     ).element_indexing(0)
     upper_bound = search_sorted(
         [*sort_vals._data.columns],
-        [*key_as_table._columns],
+        keys,
         side="right",
         ascending=sort_vals.is_monotonic_increasing,
     ).element_indexing(0)
@@ -260,7 +260,9 @@ def searchsorted(
         ), "Invalid ascending flag"
         return search_range(value, self._range, side=side)
 
-    def factorize(self, sort: bool = False, use_na_sentinel: bool = True):
+    def factorize(
+        self, sort: bool = False, use_na_sentinel: bool = True
+    ) -> tuple[cupy.ndarray, Self]:
         if sort and self.step < 0:
             codes = cupy.arange(len(self) - 1, -1, -1)
             uniques = self[::-1]
@@ -753,15 +755,16 @@ def difference(self, other, sort=None):
             super().difference(other, sort=sort)
         )
 
-    def _try_reconstruct_range_index(self, index):
-        if isinstance(index, RangeIndex) or index.dtype.kind == "f":
+    def _try_reconstruct_range_index(
+        self, index: BaseIndex
+    ) -> Self | BaseIndex:
+        if isinstance(index, RangeIndex) or index.dtype.kind not in "iu":
             return index
         # Evenly spaced values can return a
         # RangeIndex instead of a materialized Index.
-        if not index._column.has_nulls():
+        if not index._column.has_nulls():  # type: ignore[attr-defined]
             uniques = cupy.unique(cupy.diff(index.values))
-            if len(uniques) == 1 and uniques[0].get() != 0:
-                diff = uniques[0].get()
+            if len(uniques) == 1 and (diff := uniques[0].get()) != 0:
                 new_range = range(index[0], index[-1] + diff, diff)
                 return type(self)(new_range, name=index.name)
         return index
@@ -1309,7 +1312,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
         return _return_get_indexer_result(result_series.to_cupy())
 
     @_performance_tracking
-    def get_loc(self, key):
+    def get_loc(self, key) -> int | slice | cupy.ndarray:
         if not is_scalar(key):
             raise TypeError("Should be a scalar-like")
 
@@ -1317,9 +1320,8 @@ def get_loc(self, key):
             self.is_monotonic_increasing or self.is_monotonic_decreasing
         )
 
-        target_as_table = cudf.core.frame.Frame({"None": as_column([key])})
         lower_bound, upper_bound, sort_inds = _lexsorted_equal_range(
-            self, target_as_table, is_sorted
+            self, [as_column([key])], is_sorted
         )
 
         if lower_bound == upper_bound:
@@ -1330,7 +1332,7 @@ def get_loc(self, key):
             return (
                 lower_bound
                 if is_sorted
-                else sort_inds.element_indexing(lower_bound)
+                else sort_inds.element_indexing(lower_bound)  # type: ignore[union-attr]
             )
 
         if is_sorted:
@@ -1339,8 +1341,8 @@ def get_loc(self, key):
             return slice(lower_bound, upper_bound)
 
         # Not sorted and not unique. Return a boolean mask
-        mask = cupy.full(self._data.nrows, False)
-        true_inds = sort_inds.slice(lower_bound, upper_bound).values
+        mask = cupy.full(len(self), False)
+        true_inds = sort_inds.slice(lower_bound, upper_bound).values  # type: ignore[union-attr]
         mask[true_inds] = True
         return mask
 
@@ -2076,7 +2078,7 @@ def day_of_year(self):
 
     @property  # type: ignore
     @_performance_tracking
-    def is_leap_year(self):
+    def is_leap_year(self) -> cupy.ndarray:
         """
         Boolean indicator if the date belongs to a leap year.
 
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index 6503dae6ff5..3ed72ff812a 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -1926,17 +1926,18 @@ def get_loc(self, key):
 
         # Handle partial key search. If length of `key` is less than `nlevels`,
         # Only search levels up to `len(key)` level.
-        key_as_table = cudf.core.frame.Frame(
-            {i: column.as_column(k, length=1) for i, k in enumerate(key)}
-        )
         partial_index = self.__class__._from_data(
-            data=self._data.select_by_index(slice(key_as_table._num_columns))
+            data=self._data.select_by_index(slice(len(key)))
         )
         (
             lower_bound,
             upper_bound,
             sort_inds,
-        ) = _lexsorted_equal_range(partial_index, key_as_table, is_sorted)
+        ) = _lexsorted_equal_range(
+            partial_index,
+            [column.as_column(k, length=1) for k in key],
+            is_sorted,
+        )
 
         if lower_bound == upper_bound:
             raise KeyError(key)
@@ -1961,7 +1962,7 @@ def get_loc(self, key):
             return true_inds
 
         # Not sorted and not unique. Return a boolean mask
-        mask = cp.full(self._data.nrows, False)
+        mask = cp.full(len(self), False)
         mask[true_inds] = True
         return mask
 
diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py
index 064e8fc667d..c6e2b5d10e1 100644
--- a/python/cudf/cudf/core/tools/datetimes.py
+++ b/python/cudf/cudf/core/tools/datetimes.py
@@ -6,7 +6,6 @@
 import warnings
 from typing import Literal, Sequence
 
-import cupy as cp
 import numpy as np
 import pandas as pd
 import pandas.tseries.offsets as pd_offset
@@ -894,7 +893,7 @@ def date_range(
         # integers and divide the number range evenly with `periods` elements.
         start = cudf.Scalar(start, dtype=dtype).value.astype("int64")
         end = cudf.Scalar(end, dtype=dtype).value.astype("int64")
-        arr = cp.linspace(start=start, stop=end, num=periods)
+        arr = np.linspace(start=start, stop=end, num=periods)
         result = cudf.core.column.as_column(arr).astype("datetime64[ns]")
         return cudf.DatetimeIndex._from_data({name: result}).tz_localize(tz)
 
@@ -991,8 +990,10 @@ def date_range(
         stop = end_estim.astype("int64")
         start = start.value.astype("int64")
         step = _offset_to_nanoseconds_lower_bound(offset)
-        arr = cp.arange(start=start, stop=stop, step=step, dtype="int64")
-        res = cudf.core.column.as_column(arr).astype("datetime64[ns]")
+        arr = range(int(start), int(stop), step)
+        res = cudf.core.column.as_column(arr, dtype="int64").astype(
+            "datetime64[ns]"
+        )
 
     return cudf.DatetimeIndex._from_data({name: res}, freq=freq).tz_localize(
         tz
diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py
index 092e9790c63..7ab9ff2ef23 100644
--- a/python/cudf/cudf/tests/test_datetime.py
+++ b/python/cudf/cudf/tests/test_datetime.py
@@ -1534,18 +1534,7 @@ def test_date_range_start_end_periods(start, end, periods):
     )
 
 
-def test_date_range_start_end_freq(request, start, end, freq):
-    request.applymarker(
-        pytest.mark.xfail(
-            condition=(
-                start == "1831-05-08 15:23:21"
-                and end == "1996-11-21 04:05:30"
-                and freq == "110546789ms"
-            ),
-            reason="https://github.com/rapidsai/cudf/issues/12133",
-        )
-    )
-
+def test_date_range_start_end_freq(start, end, freq):
     if isinstance(freq, str):
         _gfreq = _pfreq = freq
     else:
@@ -1561,7 +1550,7 @@ def test_date_range_start_end_freq(request, start, end, freq):
     )
 
 
-def test_date_range_start_freq_periods(request, start, freq, periods):
+def test_date_range_start_freq_periods(start, freq, periods):
     if isinstance(freq, str):
         _gfreq = _pfreq = freq
     else:

From beda22ed28030bbed2faaa5a49509255f11976aa Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 15 Jul 2024 16:29:05 -1000
Subject: [PATCH 515/842] Replace is_bool_type with checking .dtype.kind
 (#16255)

It appears this was called when we already had a dtype object so can instead just simply check the .kind attribute

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16255
---
 python/cudf/cudf/core/_base_index.py         |  9 +++------
 python/cudf/cudf/core/_internals/where.py    |  8 ++------
 python/cudf/cudf/core/column/column.py       |  7 +++----
 python/cudf/cudf/core/column/numerical.py    |  5 ++---
 python/cudf/cudf/core/dataframe.py           | 13 ++++++-------
 python/cudf/cudf/core/groupby/groupby.py     |  4 ++--
 python/cudf/cudf/core/indexing_utils.py      |  3 +--
 python/cudf/cudf/core/multiindex.py          |  4 ----
 python/cudf/cudf/core/series.py              | 11 +++++------
 python/cudf/cudf/core/single_column_frame.py |  3 +--
 python/cudf/cudf/tests/test_dataframe.py     |  2 +-
 python/cudf/cudf/tests/test_index.py         |  5 ++---
 12 files changed, 28 insertions(+), 46 deletions(-)

diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py
index 9ba2d161619..479f87bb78b 100644
--- a/python/cudf/cudf/core/_base_index.py
+++ b/python/cudf/cudf/core/_base_index.py
@@ -20,7 +20,6 @@
 from cudf._lib.types import size_type_dtype
 from cudf.api.extensions import no_default
 from cudf.api.types import (
-    is_bool_dtype,
     is_integer,
     is_integer_dtype,
     is_list_like,
@@ -610,10 +609,8 @@ def union(self, other, sort=None):
             )
 
         if cudf.get_option("mode.pandas_compatible"):
-            if (
-                is_bool_dtype(self.dtype) and not is_bool_dtype(other.dtype)
-            ) or (
-                not is_bool_dtype(self.dtype) and is_bool_dtype(other.dtype)
+            if (self.dtype.kind == "b" and other.dtype.kind != "b") or (
+                self.dtype.kind != "b" and other.dtype.kind == "b"
             ):
                 # Bools + other types will result in mixed type.
                 # This is not yet consistent in pandas and specific to APIs.
@@ -2154,7 +2151,7 @@ def _apply_boolean_mask(self, boolean_mask):
         Rows corresponding to `False` is dropped.
         """
         boolean_mask = cudf.core.column.as_column(boolean_mask)
-        if not is_bool_dtype(boolean_mask.dtype):
+        if boolean_mask.dtype.kind != "b":
             raise ValueError("boolean_mask is not boolean type.")
 
         return self._from_columns_like_self(
diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py
index f3183e6029d..4a36be76b6d 100644
--- a/python/cudf/cudf/core/_internals/where.py
+++ b/python/cudf/cudf/core/_internals/where.py
@@ -7,11 +7,7 @@
 import numpy as np
 
 import cudf
-from cudf.api.types import (
-    _is_non_decimal_numeric_dtype,
-    is_bool_dtype,
-    is_scalar,
-)
+from cudf.api.types import _is_non_decimal_numeric_dtype, is_scalar
 from cudf.core.dtypes import CategoricalDtype
 from cudf.utils.dtypes import (
     _can_cast,
@@ -112,7 +108,7 @@ def _check_and_cast_columns_with_other(
         other = cudf.Scalar(other)
 
     if is_mixed_with_object_dtype(other, source_col) or (
-        is_bool_dtype(source_dtype) and not is_bool_dtype(common_dtype)
+        source_dtype.kind == "b" and common_dtype.kind != "b"
     ):
         raise TypeError(mixed_err)
 
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index fd3664ecac4..dbdf501e022 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -41,7 +41,6 @@
     _is_non_decimal_numeric_dtype,
     _is_pandas_nullable_extension_dtype,
     infer_dtype,
-    is_bool_dtype,
     is_dtype_equal,
     is_scalar,
     is_string_dtype,
@@ -619,7 +618,7 @@ def _scatter_by_column(
         key: cudf.core.column.NumericalColumn,
         value: cudf.core.scalar.Scalar | ColumnBase,
     ) -> Self:
-        if is_bool_dtype(key.dtype):
+        if key.dtype.kind == "b":
             # `key` is boolean mask
             if len(key) != len(self):
                 raise ValueError(
@@ -644,7 +643,7 @@ def _scatter_by_column(
 
         self._check_scatter_key_length(num_keys, value)
 
-        if is_bool_dtype(key.dtype):
+        if key.dtype.kind == "b":
             return libcudf.copying.boolean_mask_scatter([value], [self], key)[
                 0
             ]._with_type_metadata(self.dtype)
@@ -1083,7 +1082,7 @@ def as_decimal_column(
 
     def apply_boolean_mask(self, mask) -> ColumnBase:
         mask = as_column(mask)
-        if not is_bool_dtype(mask.dtype):
+        if mask.dtype.kind != "b":
             raise ValueError("boolean_mask is not boolean type.")
 
         return apply_boolean_mask([self], mask)[0]._with_type_metadata(
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index 7f05a5f91a1..cea68c88c90 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -13,7 +13,6 @@
 from cudf import _lib as libcudf
 from cudf._lib import pylibcudf
 from cudf.api.types import (
-    is_bool_dtype,
     is_float_dtype,
     is_integer,
     is_integer_dtype,
@@ -159,7 +158,7 @@ def __setitem__(self, key: Any, value: Any):
             else as_column(value)
         )
 
-        if not is_bool_dtype(self.dtype) and is_bool_dtype(device_value.dtype):
+        if self.dtype.kind != "b" and device_value.dtype.kind == "b":
             raise TypeError(f"Invalid value {value} for dtype {self.dtype}")
         else:
             device_value = device_value.astype(self.dtype)
@@ -264,7 +263,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
                     f"{self.dtype.type.__name__} and "
                     f"{other.dtype.type.__name__}"
                 )
-            if is_bool_dtype(self.dtype) or is_bool_dtype(other.dtype):
+            if self.dtype.kind == "b" or other.dtype.kind == "b":
                 out_dtype = "bool"
 
         if (
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 2121e623c1c..b3d938829c9 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -32,7 +32,6 @@
 from cudf.api.extensions import no_default
 from cudf.api.types import (
     _is_scalar_or_zero_d_array,
-    is_bool_dtype,
     is_dict_like,
     is_dtype_equal,
     is_list_like,
@@ -171,7 +170,7 @@ def _can_downcast_to_series(self, df, arg):
             ):
                 return False
             else:
-                if is_bool_dtype(as_column(arg[0]).dtype) and not isinstance(
+                if as_column(arg[0]).dtype.kind == "b" and not isinstance(
                     arg[1], slice
                 ):
                     return True
@@ -320,7 +319,7 @@ def _getitem_tuple_arg(self, arg):
                     tmp_arg[1],
                 )
 
-                if is_bool_dtype(tmp_arg[0].dtype):
+                if tmp_arg[0].dtype.kind == "b":
                     df = columns_df._apply_boolean_mask(
                         BooleanMask(tmp_arg[0], len(columns_df))
                     )
@@ -3678,8 +3677,8 @@ def agg(self, aggs, axis=None):
         """
         dtypes = [self[col].dtype for col in self._column_names]
         common_dtype = find_common_type(dtypes)
-        if not is_bool_dtype(common_dtype) and any(
-            is_bool_dtype(dtype) for dtype in dtypes
+        if common_dtype.kind != "b" and any(
+            dtype.kind == "b" for dtype in dtypes
         ):
             raise MixedTypeError("Cannot create a column with mixed types")
 
@@ -6305,8 +6304,8 @@ def _reduce(
                     and any(
                         not is_object_dtype(dtype) for dtype in source_dtypes
                     )
-                    or not is_bool_dtype(common_dtype)
-                    and any(is_bool_dtype(dtype) for dtype in source_dtypes)
+                    or common_dtype.kind != "b"
+                    and any(dtype.kind == "b" for dtype in source_dtypes)
                 ):
                     raise TypeError(
                         "Columns must all have the same dtype to "
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index 8659d7c2392..d2c75715be2 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -22,7 +22,7 @@
 from cudf._lib.sort import segmented_sort_by_key
 from cudf._lib.types import size_type_dtype
 from cudf.api.extensions import no_default
-from cudf.api.types import is_bool_dtype, is_list_like, is_numeric_dtype
+from cudf.api.types import is_list_like, is_numeric_dtype
 from cudf.core._compat import PANDAS_LT_300
 from cudf.core.abc import Serializable
 from cudf.core.column.column import ColumnBase, StructDtype, as_column
@@ -1534,7 +1534,7 @@ def mult(df):
                 # For `sum` & `product`, boolean types
                 # will need to result in `int64` type.
                 for name, col in res._data.items():
-                    if is_bool_dtype(col.dtype):
+                    if col.dtype.kind == "b":
                         res._data[name] = col.astype("int")
             return res
 
diff --git a/python/cudf/cudf/core/indexing_utils.py b/python/cudf/cudf/core/indexing_utils.py
index a5fed02cbed..9c81b0eb607 100644
--- a/python/cudf/cudf/core/indexing_utils.py
+++ b/python/cudf/cudf/core/indexing_utils.py
@@ -10,7 +10,6 @@
 import cudf
 from cudf.api.types import (
     _is_scalar_or_zero_d_array,
-    is_bool_dtype,
     is_integer,
     is_integer_dtype,
 )
@@ -230,7 +229,7 @@ def parse_row_iloc_indexer(key: Any, n: int) -> IndexingSpec:
         key = cudf.core.column.as_column(key)
         if isinstance(key, cudf.core.column.CategoricalColumn):
             key = key.astype(key.codes.dtype)
-        if is_bool_dtype(key.dtype):
+        if key.dtype.kind == "b":
             return MaskIndexer(BooleanMask(key, n))
         elif len(key) == 0:
             return EmptyIndexer()
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index 3ed72ff812a..ff4b06c6334 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -841,10 +841,6 @@ def _get_row_major(
         | tuple[Any, ...]
         | list[tuple[Any, ...]],
     ) -> DataFrameOrSeries:
-        if pd.api.types.is_bool_dtype(
-            list(row_tuple) if isinstance(row_tuple, tuple) else row_tuple
-        ):
-            return df[row_tuple]
         if isinstance(row_tuple, slice):
             if row_tuple.start is None:
                 row_tuple = slice(self[0], row_tuple.stop, row_tuple.step)
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 8c8fa75918c..e12cc3d52fb 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -22,7 +22,6 @@
 from cudf.api.types import (
     _is_non_decimal_numeric_dtype,
     _is_scalar_or_zero_d_array,
-    is_bool_dtype,
     is_dict_like,
     is_integer,
     is_integer_dtype,
@@ -221,10 +220,10 @@ def __setitem__(self, key, value):
                     f"Cannot assign {value=} to "
                     f"non-float dtype={self._frame.dtype}"
                 )
-            elif (
-                self._frame.dtype.kind == "b"
-                and not is_bool_dtype(value)
-                and value not in {None, cudf.NA}
+            elif self._frame.dtype.kind == "b" and not (
+                value in {None, cudf.NA}
+                or isinstance(value, (np.bool_, bool))
+                or (isinstance(value, cudf.Scalar) and value.dtype.kind == "b")
             ):
                 raise MixedTypeError(
                     f"Cannot assign {value=} to "
@@ -3221,7 +3220,7 @@ def describe(
             percentiles = np.array([0.25, 0.5, 0.75])
 
         dtype = "str"
-        if is_bool_dtype(self.dtype):
+        if self.dtype.kind == "b":
             data = _describe_categorical(self, percentiles)
         elif isinstance(self._column, cudf.core.column.NumericalColumn):
             data = _describe_numeric(self, percentiles)
diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py
index f9555aee6a2..04c7db7a53c 100644
--- a/python/cudf/cudf/core/single_column_frame.py
+++ b/python/cudf/cudf/core/single_column_frame.py
@@ -11,7 +11,6 @@
 from cudf.api.extensions import no_default
 from cudf.api.types import (
     _is_scalar_or_zero_d_array,
-    is_bool_dtype,
     is_integer,
     is_integer_dtype,
     is_numeric_dtype,
@@ -361,7 +360,7 @@ def _get_elements_from_column(self, arg) -> ScalarLike | ColumnBase:
                 arg = cudf.core.column.column_empty(0, dtype="int32")
             if is_integer_dtype(arg.dtype):
                 return self._column.take(arg)
-            if is_bool_dtype(arg.dtype):
+            if arg.dtype.kind == "b":
                 if (bn := len(arg)) != (n := len(self)):
                     raise IndexError(
                         f"Boolean mask has wrong length: {bn} not {n}"
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 7ccf83e424c..2009fc49ce5 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -5234,7 +5234,7 @@ def test_rowwise_ops(data, op, skipna, numeric_only):
             else (pdf[column].notna().count() == 0)
         )
         or cudf.api.types.is_numeric_dtype(pdf[column].dtype)
-        or cudf.api.types.is_bool_dtype(pdf[column].dtype)
+        or pdf[column].dtype.kind == "b"
         for column in pdf
     ):
         with pytest.raises(TypeError):
diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py
index 05dcd85df6a..9eba6122d26 100644
--- a/python/cudf/cudf/tests/test_index.py
+++ b/python/cudf/cudf/tests/test_index.py
@@ -16,7 +16,6 @@
 
 import cudf
 from cudf.api.extensions import no_default
-from cudf.api.types import is_bool_dtype
 from cudf.core.index import CategoricalIndex, DatetimeIndex, Index, RangeIndex
 from cudf.testing import assert_eq
 from cudf.testing._utils import (
@@ -2397,8 +2396,8 @@ def test_intersection_index(idx1, idx2, sort, pandas_compatible):
             expected,
             actual,
             exact=False
-            if (is_bool_dtype(idx1.dtype) and not is_bool_dtype(idx2.dtype))
-            or (not is_bool_dtype(idx1.dtype) or is_bool_dtype(idx2.dtype))
+            if (idx1.dtype.kind == "b" and idx2.dtype.kind != "b")
+            or (idx1.dtype.kind != "b" or idx2.dtype.kind == "b")
             else True,
         )
 

From 669db3ea4a0c24a343c5619dd00904ad22ea215b Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Tue, 16 Jul 2024 14:24:58 +0100
Subject: [PATCH 516/842] Fix logic in to_arrow for empty list column (#16279)

An empty list column need not have empty children, it just needs to have zero length. In this case, the offsets array will have zero length, and we need to create a temporary buffer.

Now that this branch runs, fix two errors in the construction of the arrow array:

1. The element type, if there are children, should be taken from the child array;
2. If the child arrays are empty, we must make an empty null array, rather than passing a null pointer as the values array, otherwise we hit a segfault inside arrow.

The previous fix in #16201 correctly handled the empty children case (except for point two), but not the first case, which we do here.

Since we we're previously going down this code path (child_arrays was never empty), we never hit the latent segfault from point two.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - MithunR (https://github.com/mythrocks)

URL: https://github.com/rapidsai/cudf/pull/16279
---
 cpp/src/interop/to_arrow.cu | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu
index 8c4be1b50a5..622a3aba4bb 100644
--- a/cpp/src/interop/to_arrow.cu
+++ b/cpp/src/interop/to_arrow.cu
@@ -378,13 +378,11 @@ std::shared_ptr<arrow::Array> dispatch_to_arrow::operator()<cudf::list_view>(
   auto children_meta =
     metadata.children_meta.empty() ? std::vector<column_metadata>{{}, {}} : metadata.children_meta;
   auto child_arrays = fetch_child_array(input_view, children_meta, ar_mr, stream);
-  if (child_arrays.empty()) {
-    // Empty list will have only one value in offset of 4 bytes
-    auto tmp_offset_buffer = allocate_arrow_buffer(sizeof(int32_t), ar_mr);
-    memset(tmp_offset_buffer->mutable_data(), 0, sizeof(int32_t));
-
-    return std::make_shared<arrow::ListArray>(
-      arrow::list(arrow::null()), 0, std::move(tmp_offset_buffer), nullptr);
+  if (child_arrays.empty() || child_arrays[0]->data()->length == 0) {
+    auto element_type = child_arrays.empty() ? arrow::null() : child_arrays[1]->type();
+    auto result       = arrow::MakeEmptyArray(arrow::list(element_type), ar_mr);
+    CUDF_EXPECTS(result.ok(), "Failed to construct empty arrow list array\n");
+    return result.ValueUnsafe();
   }
 
   auto offset_buffer = child_arrays[0]->data()->buffers[1];

From a6de6cc23702ed71b80625f461a90e910a33642f Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Tue, 16 Jul 2024 15:42:12 +0100
Subject: [PATCH 517/842] Introduce version file so we can conditionally handle
 things in tests (#16280)

We decided we would attempt to support a range of versions back to 1.0. We'll test with oldest and newest versions we support. To facilitate, introduce some versioning constants.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Thomas Li (https://github.com/lithomas1)

URL: https://github.com/rapidsai/cudf/pull/16280
---
 python/cudf_polars/cudf_polars/dsl/ir.py      |  7 ++++-
 .../cudf_polars/cudf_polars/utils/versions.py | 28 +++++++++++++++++++
 python/cudf_polars/tests/test_scan.py         |  5 ++++
 3 files changed, 39 insertions(+), 1 deletion(-)
 create mode 100644 python/cudf_polars/cudf_polars/utils/versions.py

diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py
index 5e6544ef77c..cce0c4a3d94 100644
--- a/python/cudf_polars/cudf_polars/dsl/ir.py
+++ b/python/cudf_polars/cudf_polars/dsl/ir.py
@@ -313,7 +313,12 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
             raise NotImplementedError(
                 f"Unhandled scan type: {self.typ}"
             )  # pragma: no cover; post init trips first
-        if row_index is not None:
+        if (
+            row_index is not None
+            # TODO: remove condition when dropping support for polars 1.0
+            # https://github.com/pola-rs/polars/pull/17363
+            and row_index[0] in self.schema
+        ):
             name, offset = row_index
             dtype = self.schema[name]
             step = plc.interop.from_arrow(
diff --git a/python/cudf_polars/cudf_polars/utils/versions.py b/python/cudf_polars/cudf_polars/utils/versions.py
new file mode 100644
index 00000000000..a9ac14c25aa
--- /dev/null
+++ b/python/cudf_polars/cudf_polars/utils/versions.py
@@ -0,0 +1,28 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Version utilities so that cudf_polars supports a range of polars versions."""
+
+# ruff: noqa: SIM300
+from __future__ import annotations
+
+from packaging.version import parse
+
+from polars import __version__
+
+POLARS_VERSION = parse(__version__)
+
+POLARS_VERSION_GE_10 = POLARS_VERSION >= parse("1.0")
+POLARS_VERSION_GE_11 = POLARS_VERSION >= parse("1.1")
+POLARS_VERSION_GE_12 = POLARS_VERSION >= parse("1.2")
+POLARS_VERSION_GT_10 = POLARS_VERSION > parse("1.0")
+POLARS_VERSION_GT_11 = POLARS_VERSION > parse("1.1")
+POLARS_VERSION_GT_12 = POLARS_VERSION > parse("1.2")
+
+POLARS_VERSION_LE_12 = POLARS_VERSION <= parse("1.2")
+POLARS_VERSION_LE_11 = POLARS_VERSION <= parse("1.1")
+POLARS_VERSION_LT_12 = POLARS_VERSION < parse("1.2")
+POLARS_VERSION_LT_11 = POLARS_VERSION < parse("1.1")
+
+if POLARS_VERSION < parse("1.0"):  # pragma: no cover
+    raise ImportError("cudf_polars requires py-polars v1.0 or greater.")
diff --git a/python/cudf_polars/tests/test_scan.py b/python/cudf_polars/tests/test_scan.py
index c41a94da14b..d0c41090433 100644
--- a/python/cudf_polars/tests/test_scan.py
+++ b/python/cudf_polars/tests/test_scan.py
@@ -10,6 +10,7 @@
     assert_gpu_result_equal,
     assert_ir_translation_raises,
 )
+from cudf_polars.utils import versions
 
 
 @pytest.fixture(
@@ -97,6 +98,10 @@ def test_scan_unsupported_raises(tmp_path):
     assert_ir_translation_raises(q, NotImplementedError)
 
 
+@pytest.mark.xfail(
+    versions.POLARS_VERSION_LT_11,
+    reason="https://github.com/pola-rs/polars/issues/15730",
+)
 def test_scan_row_index_projected_out(tmp_path):
     df = pl.DataFrame({"a": [1, 2, 3]})
 

From 3418f915d1a1ff82a72918d978924dfad2645a5a Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Tue, 16 Jul 2024 12:38:47 -0500
Subject: [PATCH 518/842] Introduce dedicated options for low memory readers
 (#16289)

This PR disables low memory readers by default in `cudf.pandas` and instead gives a provision to enable them with dedicated options.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/16289
---
 python/cudf/cudf/_lib/json.pyx         |  2 +-
 python/cudf/cudf/io/parquet.py         |  2 +-
 python/cudf/cudf/options.py            | 26 ++++++++++++++++++++++++++
 python/cudf/cudf/tests/test_json.py    |  2 +-
 python/cudf/cudf/tests/test_parquet.py |  2 +-
 5 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx
index 853dd431099..03bf9ed8b75 100644
--- a/python/cudf/cudf/_lib/json.pyx
+++ b/python/cudf/cudf/_lib/json.pyx
@@ -99,7 +99,7 @@ cpdef read_json(object filepaths_or_buffers,
         else:
             raise TypeError("`dtype` must be 'list like' or 'dict'")
 
-    if cudf.get_option("mode.pandas_compatible") and lines:
+    if cudf.get_option("io.json.low_memory") and lines:
         res_cols, res_col_names, res_child_names = plc.io.json.chunked_read_json(
             plc.io.SourceInfo(filepaths_or_buffers),
             processed_dtypes,
diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
index fd0792b5edb..02b26ea1c01 100644
--- a/python/cudf/cudf/io/parquet.py
+++ b/python/cudf/cudf/io/parquet.py
@@ -916,7 +916,7 @@ def _read_parquet(
                 "cudf engine doesn't support the "
                 f"following positional arguments: {list(args)}"
             )
-        if cudf.get_option("mode.pandas_compatible"):
+        if cudf.get_option("io.parquet.low_memory"):
             return libparquet.ParquetReader(
                 filepaths_or_buffers,
                 columns=columns,
diff --git a/python/cudf/cudf/options.py b/python/cudf/cudf/options.py
index 1f539e7f266..94e73021cec 100644
--- a/python/cudf/cudf/options.py
+++ b/python/cudf/cudf/options.py
@@ -325,6 +325,32 @@ def _integer_and_none_validator(val):
     _make_contains_validator([False, True]),
 )
 
+_register_option(
+    "io.parquet.low_memory",
+    False,
+    textwrap.dedent(
+        """
+        If set to `False`, reads entire parquet in one go.
+        If set to `True`, reads parquet file in chunks.
+        \tValid values are True or False. Default is False.
+    """
+    ),
+    _make_contains_validator([False, True]),
+)
+
+_register_option(
+    "io.json.low_memory",
+    False,
+    textwrap.dedent(
+        """
+        If set to `False`, reads entire json in one go.
+        If set to `True`, reads json file in chunks.
+        \tValid values are True or False. Default is False.
+    """
+    ),
+    _make_contains_validator([False, True]),
+)
+
 
 class option_context(ContextDecorator):
     """
diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py
index 7771afd692f..c81c2d1d94b 100644
--- a/python/cudf/cudf/tests/test_json.py
+++ b/python/cudf/cudf/tests/test_json.py
@@ -1441,6 +1441,6 @@ def test_chunked_json_reader():
     df.to_json(buf, lines=True, orient="records", engine="cudf")
     buf.seek(0)
     df = df.to_pandas()
-    with cudf.option_context("mode.pandas_compatible", True):
+    with cudf.option_context("io.json.low_memory", True):
         gdf = cudf.read_json(buf, lines=True)
     assert_eq(df, gdf)
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index ff0c9040737..ecb7fd44422 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -3772,6 +3772,6 @@ def test_parquet_reader_pandas_compatibility():
     )
     buffer = BytesIO()
     df.to_parquet(buffer)
-    with cudf.option_context("mode.pandas_compatible", True):
+    with cudf.option_context("io.parquet.low_memory", True):
         expected = cudf.read_parquet(buffer)
     assert_eq(expected, df)

From e2b7e4370c8513811e9c72b30f499a5614b49f7c Mon Sep 17 00:00:00 2001
From: Kyle Edwards <kyedwards@nvidia.com>
Date: Tue, 16 Jul 2024 14:20:00 -0400
Subject: [PATCH 519/842] Build and test with CUDA 12.5.1 (#16259)

This PR updates the latest CUDA build/test version 12.2.2 to 12.5.1.

Contributes to https://github.com/rapidsai/build-planning/issues/73

Authors:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)

Approvers:
  - James Lamb (https://github.com/jameslamb)
  - https://github.com/jakirkham

URL: https://github.com/rapidsai/cudf/pull/16259
---
 .../cuda12.2-conda/devcontainer.json          |  8 ++--
 .devcontainer/cuda12.2-pip/devcontainer.json  | 10 ++--
 .github/workflows/build.yaml                  | 20 ++++----
 .github/workflows/pandas-tests.yaml           |  4 +-
 .github/workflows/pr.yaml                     | 48 +++++++++----------
 .../workflows/pr_issue_status_automation.yml  |  6 +--
 .github/workflows/test.yaml                   | 22 ++++-----
 CONTRIBUTING.md                               |  2 +-
 README.md                                     |  2 +-
 ..._64.yaml => all_cuda-125_arch-x86_64.yaml} |  4 +-
 dependencies.yaml                             |  6 ++-
 11 files changed, 68 insertions(+), 64 deletions(-)
 rename conda/environments/{all_cuda-122_arch-x86_64.yaml => all_cuda-125_arch-x86_64.yaml} (97%)

diff --git a/.devcontainer/cuda12.2-conda/devcontainer.json b/.devcontainer/cuda12.2-conda/devcontainer.json
index 05bf9173d25..fadce01d060 100644
--- a/.devcontainer/cuda12.2-conda/devcontainer.json
+++ b/.devcontainer/cuda12.2-conda/devcontainer.json
@@ -3,7 +3,7 @@
     "context": "${localWorkspaceFolder}/.devcontainer",
     "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile",
     "args": {
-      "CUDA": "12.2",
+      "CUDA": "12.5",
       "PYTHON_PACKAGE_MANAGER": "conda",
       "BASE": "rapidsai/devcontainers:24.08-cpp-mambaforge-ubuntu22.04"
     }
@@ -11,7 +11,7 @@
   "runArgs": [
     "--rm",
     "--name",
-    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.2-conda"
+    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.5-conda"
   ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
@@ -20,7 +20,7 @@
   "overrideFeatureInstallOrder": [
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
   ],
-  "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.2-envs}"],
+  "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config,conda/pkgs,conda/${localWorkspaceFolderBasename}-cuda12.5-envs}"],
   "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"],
   "workspaceFolder": "/home/coder",
   "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cudf,type=bind,consistency=consistent",
@@ -29,7 +29,7 @@
     "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
     "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent",
     "source=${localWorkspaceFolder}/../.conda/pkgs,target=/home/coder/.conda/pkgs,type=bind,consistency=consistent",
-    "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda12.2-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent"
+    "source=${localWorkspaceFolder}/../.conda/${localWorkspaceFolderBasename}-cuda12.5-envs,target=/home/coder/.conda/envs,type=bind,consistency=consistent"
   ],
   "customizations": {
     "vscode": {
diff --git a/.devcontainer/cuda12.2-pip/devcontainer.json b/.devcontainer/cuda12.2-pip/devcontainer.json
index 74420214726..026eb540952 100644
--- a/.devcontainer/cuda12.2-pip/devcontainer.json
+++ b/.devcontainer/cuda12.2-pip/devcontainer.json
@@ -3,15 +3,15 @@
     "context": "${localWorkspaceFolder}/.devcontainer",
     "dockerfile": "${localWorkspaceFolder}/.devcontainer/Dockerfile",
     "args": {
-      "CUDA": "12.2",
+      "CUDA": "12.5",
       "PYTHON_PACKAGE_MANAGER": "pip",
-      "BASE": "rapidsai/devcontainers:24.08-cpp-cuda12.2-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:24.08-cpp-cuda12.5-ubuntu22.04"
     }
   },
   "runArgs": [
     "--rm",
     "--name",
-    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.2-pip"
+    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.5-pip"
   ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
@@ -20,7 +20,7 @@
   "overrideFeatureInstallOrder": [
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
   ],
-  "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda12.2-venvs}"],
+  "initializeCommand": ["/bin/bash", "-c", "mkdir -m 0755 -p ${localWorkspaceFolder}/../.{aws,cache,config/pip,local/share/${localWorkspaceFolderBasename}-cuda12.5-venvs}"],
   "postAttachCommand": ["/bin/bash", "-c", "if [ ${CODESPACES:-false} = 'true' ]; then . devcontainer-utils-post-attach-command; . rapids-post-attach-command; fi"],
   "workspaceFolder": "/home/coder",
   "workspaceMount": "source=${localWorkspaceFolder},target=/home/coder/cudf,type=bind,consistency=consistent",
@@ -28,7 +28,7 @@
     "source=${localWorkspaceFolder}/../.aws,target=/home/coder/.aws,type=bind,consistency=consistent",
     "source=${localWorkspaceFolder}/../.cache,target=/home/coder/.cache,type=bind,consistency=consistent",
     "source=${localWorkspaceFolder}/../.config,target=/home/coder/.config,type=bind,consistency=consistent",
-    "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda12.2-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent"
+    "source=${localWorkspaceFolder}/../.local/share/${localWorkspaceFolderBasename}-cuda12.5-venvs,target=/home/coder/.local/share/venvs,type=bind,consistency=consistent"
   ],
   "customizations": {
     "vscode": {
diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 2e5959338b0..937080572ad 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -28,7 +28,7 @@ concurrency:
 jobs:
   cpp-build:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@cuda-12.5.1
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -37,7 +37,7 @@ jobs:
   python-build:
     needs: [cpp-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@cuda-12.5.1
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -46,7 +46,7 @@ jobs:
   upload-conda:
     needs: [cpp-build, python-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@cuda-12.5.1
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -57,7 +57,7 @@ jobs:
     if: github.ref_type == 'branch'
     needs: python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1
     with:
       arch: "amd64"
       branch: ${{ inputs.branch }}
@@ -69,7 +69,7 @@ jobs:
       sha: ${{ inputs.sha }}
   wheel-build-cudf:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -79,7 +79,7 @@ jobs:
   wheel-publish-cudf:
     needs: wheel-build-cudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda-12.5.1
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -89,7 +89,7 @@ jobs:
   wheel-build-dask-cudf:
     needs: wheel-publish-cudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
@@ -101,7 +101,7 @@ jobs:
   wheel-publish-dask-cudf:
     needs: wheel-build-dask-cudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda-12.5.1
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -111,7 +111,7 @@ jobs:
   wheel-build-cudf-polars:
     needs: wheel-publish-cudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
@@ -123,7 +123,7 @@ jobs:
   wheel-publish-cudf-polars:
     needs: wheel-build-cudf-polars
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda-12.5.1
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
diff --git a/.github/workflows/pandas-tests.yaml b/.github/workflows/pandas-tests.yaml
index a8643923a4d..1516cb09449 100644
--- a/.github/workflows/pandas-tests.yaml
+++ b/.github/workflows/pandas-tests.yaml
@@ -17,9 +17,9 @@ jobs:
   pandas-tests:
       # run the Pandas unit tests
       secrets: inherit
-      uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08
+      uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1
       with:
-        matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and .CUDA_VER == "12.2.2" ))
+        matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and (.CUDA_VER | startswith("12.5.")) ))
         build_type: nightly
         branch: ${{ inputs.branch }}
         date: ${{ inputs.date }}
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index ceee9074b93..1fe64e7f318 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -34,41 +34,41 @@ jobs:
       - pandas-tests
       - pandas-tests-diff
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@cuda-12.5.1
   checks:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@cuda-12.5.1
     with:
       enable_check_generated_files: false
   conda-cpp-build:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@cuda-12.5.1
     with:
       build_type: pull-request
   conda-cpp-checks:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@cuda-12.5.1
     with:
       build_type: pull-request
       enable_check_symbols: true
   conda-cpp-tests:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@cuda-12.5.1
     with:
       build_type: pull-request
   conda-python-build:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@cuda-12.5.1
     with:
       build_type: pull-request
   conda-python-cudf-tests:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.5.1
     with:
       build_type: pull-request
       script: "ci/test_python_cudf.sh"
@@ -76,14 +76,14 @@ jobs:
     # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.5.1
     with:
       build_type: pull-request
       script: "ci/test_python_other.sh"
   conda-java-tests:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1
     with:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
@@ -93,7 +93,7 @@ jobs:
   static-configure:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1
     with:
       build_type: pull-request
       # Use the wheel container so we can skip conda solves and since our
@@ -103,7 +103,7 @@ jobs:
   conda-notebook-tests:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1
     with:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
@@ -113,7 +113,7 @@ jobs:
   docs-build:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1
     with:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
@@ -123,21 +123,21 @@ jobs:
   wheel-build-cudf:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1
     with:
       build_type: pull-request
       script: "ci/build_wheel_cudf.sh"
   wheel-tests-cudf:
     needs: wheel-build-cudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1
     with:
       build_type: pull-request
       script: ci/test_wheel_cudf.sh
   wheel-build-cudf-polars:
     needs: wheel-build-cudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
@@ -146,7 +146,7 @@ jobs:
   wheel-tests-cudf-polars:
     needs: wheel-build-cudf-polars
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
@@ -157,7 +157,7 @@ jobs:
   wheel-build-dask-cudf:
     needs: wheel-build-cudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
@@ -166,7 +166,7 @@ jobs:
   wheel-tests-dask-cudf:
     needs: wheel-build-dask-cudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
@@ -174,10 +174,10 @@ jobs:
       script: ci/test_wheel_dask_cudf.sh
   devcontainer:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@cuda-12.5.1
     with:
       arch: '["amd64"]'
-      cuda: '["12.2"]'
+      cuda: '["12.5"]'
       build_command: |
         sccache -z;
         build-all -DBUILD_BENCHMARKS=ON --verbose;
@@ -185,7 +185,7 @@ jobs:
   unit-tests-cudf-pandas:
     needs: wheel-build-cudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1
     with:
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
       build_type: pull-request
@@ -194,9 +194,9 @@ jobs:
     # run the Pandas unit tests using PR branch
     needs: wheel-build-cudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1
     with:
-      matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and .CUDA_VER == "12.2.2" ))
+      matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and (.CUDA_VER | startswith("12.5.")) ))
       build_type: pull-request
       script: ci/cudf_pandas_scripts/pandas-tests/run.sh pr
       # Hide test failures because they exceed the GITHUB_STEP_SUMMARY output limit.
@@ -204,7 +204,7 @@ jobs:
   pandas-tests-diff:
     # diff the results of running the Pandas unit tests and publish a job summary
     needs: pandas-tests
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1
     with:
         node_type: cpu4
         build_type: pull-request
diff --git a/.github/workflows/pr_issue_status_automation.yml b/.github/workflows/pr_issue_status_automation.yml
index 8ca971dc28d..2a8ebd30993 100644
--- a/.github/workflows/pr_issue_status_automation.yml
+++ b/.github/workflows/pr_issue_status_automation.yml
@@ -23,7 +23,7 @@ on:
 
 jobs:
     get-project-id:
-      uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@branch-24.08
+      uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@cuda-12.5.1
       if: github.event.pull_request.state == 'open'
       secrets: inherit
       permissions:
@@ -34,7 +34,7 @@ jobs:
 
     update-status:
       # This job sets the PR and its linked issues to "In Progress" status
-      uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-24.08
+      uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@cuda-12.5.1
       if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }}
       needs: get-project-id
       with:
@@ -50,7 +50,7 @@ jobs:
 
     update-sprint:
       # This job sets the PR and its linked issues to the current "Weekly Sprint"
-      uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@branch-24.08
+      uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@cuda-12.5.1
       if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }}
       needs: get-project-id
       with:
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 36c9088d93c..73f8d726e77 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -16,7 +16,7 @@ on:
 jobs:
   conda-cpp-checks:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@cuda-12.5.1
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -25,7 +25,7 @@ jobs:
       enable_check_symbols: true
   conda-cpp-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@cuda-12.5.1
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -33,7 +33,7 @@ jobs:
       sha: ${{ inputs.sha }}
   conda-cpp-memcheck-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -45,7 +45,7 @@ jobs:
       run_script: "ci/test_cpp_memcheck.sh"
   static-configure:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1
     with:
       build_type: pull-request
       # Use the wheel container so we can skip conda solves and since our
@@ -54,7 +54,7 @@ jobs:
       run_script: "ci/configure_cpp_static.sh"
   conda-python-cudf-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.5.1
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -64,7 +64,7 @@ jobs:
   conda-python-other-tests:
     # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.5.1
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -73,7 +73,7 @@ jobs:
       script: "ci/test_python_other.sh"
   conda-java-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -85,7 +85,7 @@ jobs:
       run_script: "ci/test_java.sh"
   conda-notebook-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -97,7 +97,7 @@ jobs:
       run_script: "ci/test_notebooks.sh"
   wheel-tests-cudf:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -106,7 +106,7 @@ jobs:
       script: ci/test_wheel_cudf.sh
   wheel-tests-dask-cudf:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
@@ -117,7 +117,7 @@ jobs:
       script: ci/test_wheel_dask_cudf.sh
   unit-tests-cudf-pandas:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 4fbc28fa6e1..f9cdde7c2b7 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -104,7 +104,7 @@ Instructions for a minimal build environment without conda are included below.
 # create the conda environment (assuming in base `cudf` directory)
 # note: RAPIDS currently doesn't support `channel_priority: strict`;
 # use `channel_priority: flexible` instead
-conda env create --name cudf_dev --file conda/environments/all_cuda-122_arch-x86_64.yaml
+conda env create --name cudf_dev --file conda/environments/all_cuda-125_arch-x86_64.yaml
 # activate the environment
 conda activate cudf_dev
 ```
diff --git a/README.md b/README.md
index 17d2df9a936..1ab6a2d7457 100644
--- a/README.md
+++ b/README.md
@@ -83,7 +83,7 @@ cuDF can be installed with conda (via [miniconda](https://docs.conda.io/projects
 
 ```bash
 conda install -c rapidsai -c conda-forge -c nvidia \
-    cudf=24.08 python=3.11 cuda-version=12.2
+    cudf=24.08 python=3.11 cuda-version=12.5
 ```
 
 We also provide [nightly Conda packages](https://anaconda.org/rapidsai-nightly) built from the HEAD
diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
similarity index 97%
rename from conda/environments/all_cuda-122_arch-x86_64.yaml
rename to conda/environments/all_cuda-125_arch-x86_64.yaml
index c32d21c5d36..3f5fae49cbb 100644
--- a/conda/environments/all_cuda-122_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -23,7 +23,7 @@ dependencies:
 - cuda-nvtx-dev
 - cuda-python>=12.0,<13.0a0
 - cuda-sanitizer-api
-- cuda-version=12.2
+- cuda-version=12.5
 - cupy>=12.0.0
 - cxx-compiler
 - cython>=3.0.3
@@ -96,4 +96,4 @@ dependencies:
 - zlib>=1.2.13
 - pip:
   - git+https://github.com/python-streamz/streamz.git@master
-name: all_cuda-122_arch-x86_64
+name: all_cuda-125_arch-x86_64
diff --git a/dependencies.yaml b/dependencies.yaml
index 27621ff9a3f..67ed3773b44 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -3,7 +3,7 @@ files:
   all:
     output: conda
     matrix:
-      cuda: ["11.8", "12.2"]
+      cuda: ["11.8", "12.5"]
       arch: [x86_64]
     includes:
       - build_base
@@ -402,6 +402,10 @@ dependencies:
               cuda: "12.2"
             packages:
               - cuda-version=12.2
+          - matrix:
+              cuda: "12.5"
+            packages:
+              - cuda-version=12.5
   cuda:
     specific:
       - output_types: conda

From 05ea7c9cf6a0fd39384e2044b4c9b46f543d4ad0 Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Tue, 16 Jul 2024 12:51:20 -0700
Subject: [PATCH 520/842] Fix tests for polars 1.2 (#16292)

Authors:
  - Thomas Li (https://github.com/lithomas1)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/16292
---
 python/cudf_polars/tests/test_groupby.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py
index 50adca01950..b07d8e38217 100644
--- a/python/cudf_polars/tests/test_groupby.py
+++ b/python/cudf_polars/tests/test_groupby.py
@@ -12,6 +12,7 @@
     assert_gpu_result_equal,
     assert_ir_translation_raises,
 )
+from cudf_polars.utils import versions
 
 
 @pytest.fixture
@@ -100,7 +101,7 @@ def test_groupby_sorted_keys(df: pl.LazyFrame, keys, exprs):
         with pytest.raises(AssertionError):
             # https://github.com/pola-rs/polars/issues/17556
             assert_gpu_result_equal(q, check_exact=False)
-        if schema[sort_keys[1]] == pl.Boolean():
+        if versions.POLARS_VERSION_LT_12 and schema[sort_keys[1]] == pl.Boolean():
             # https://github.com/pola-rs/polars/issues/17557
             with pytest.raises(AssertionError):
                 assert_gpu_result_equal(qsorted, check_exact=False)

From 62191103032706371d76ce83c6ec59d13376b231 Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Tue, 16 Jul 2024 16:23:49 -0400
Subject: [PATCH 521/842] [BUG] Make name attr of Index fast slow attrs
 (#16270)

Debugging the spike in failures from #16234

Authors:
  - Matthew Murray (https://github.com/Matt711)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16270
---
 python/cudf/cudf/pandas/_wrappers/pandas.py | 36 ++++++++-------------
 1 file changed, 14 insertions(+), 22 deletions(-)

diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py
index d3a3488081a..59a243dd7c4 100644
--- a/python/cudf/cudf/pandas/_wrappers/pandas.py
+++ b/python/cudf/cudf/pandas/_wrappers/pandas.py
@@ -260,18 +260,14 @@ def Index__new__(cls, *args, **kwargs):
     return self
 
 
-def name(self):
-    return self._fsproxy_wrapped._name
-
-
 def Index__setattr__(self, name, value):
     if name.startswith("_"):
         object.__setattr__(self, name, value)
         return
     if name == "name":
-        setattr(self._fsproxy_wrapped, "_name", value)
+        setattr(self._fsproxy_wrapped, "name", value)
     if name == "names":
-        setattr(self._fsproxy_wrapped, "_names", value)
+        setattr(self._fsproxy_wrapped, "names", value)
     return _FastSlowAttribute("__setattr__").__get__(self, type(self))(
         name, value
     )
@@ -300,7 +296,7 @@ def Index__setattr__(self, name, value):
         "_accessors": set(),
         "_data": _FastSlowAttribute("_data", private=True),
         "_mask": _FastSlowAttribute("_mask", private=True),
-        "name": property(name),
+        "name": _FastSlowAttribute("name"),
     },
 )
 
@@ -314,7 +310,7 @@ def Index__setattr__(self, name, value):
     additional_attributes={
         "__init__": _DELETE,
         "__setattr__": Index__setattr__,
-        "name": property(name),
+        "name": _FastSlowAttribute("name"),
     },
 )
 
@@ -345,7 +341,7 @@ def Index__setattr__(self, name, value):
     additional_attributes={
         "__init__": _DELETE,
         "__setattr__": Index__setattr__,
-        "name": property(name),
+        "name": _FastSlowAttribute("name"),
     },
 )
 
@@ -375,10 +371,10 @@ def Index__setattr__(self, name, value):
     bases=(Index,),
     additional_attributes={
         "__init__": _DELETE,
+        "__setattr__": Index__setattr__,
         "_data": _FastSlowAttribute("_data", private=True),
         "_mask": _FastSlowAttribute("_mask", private=True),
-        "__setattr__": Index__setattr__,
-        "name": property(name),
+        "name": _FastSlowAttribute("name"),
     },
 )
 
@@ -412,10 +408,10 @@ def Index__setattr__(self, name, value):
     bases=(Index,),
     additional_attributes={
         "__init__": _DELETE,
+        "__setattr__": Index__setattr__,
         "_data": _FastSlowAttribute("_data", private=True),
         "_mask": _FastSlowAttribute("_mask", private=True),
-        "__setattr__": Index__setattr__,
-        "name": property(name),
+        "name": _FastSlowAttribute("name"),
     },
 )
 
@@ -470,10 +466,10 @@ def Index__setattr__(self, name, value):
     bases=(Index,),
     additional_attributes={
         "__init__": _DELETE,
+        "__setattr__": Index__setattr__,
         "_data": _FastSlowAttribute("_data", private=True),
         "_mask": _FastSlowAttribute("_mask", private=True),
-        "__setattr__": Index__setattr__,
-        "name": property(name),
+        "name": _FastSlowAttribute("name"),
     },
 )
 
@@ -508,10 +504,6 @@ def Index__setattr__(self, name, value):
 )
 
 
-def names(self):
-    return self._fsproxy_wrapped._names
-
-
 MultiIndex = make_final_proxy_type(
     "MultiIndex",
     cudf.MultiIndex,
@@ -522,7 +514,7 @@ def names(self):
     additional_attributes={
         "__init__": _DELETE,
         "__setattr__": Index__setattr__,
-        "name": property(names),
+        "names": _FastSlowAttribute("names"),
     },
 )
 
@@ -709,10 +701,10 @@ def names(self):
     bases=(Index,),
     additional_attributes={
         "__init__": _DELETE,
+        "__setattr__": Index__setattr__,
         "_data": _FastSlowAttribute("_data", private=True),
         "_mask": _FastSlowAttribute("_mask", private=True),
-        "__setattr__": Index__setattr__,
-        "name": property(name),
+        "name": _FastSlowAttribute("name"),
     },
 )
 

From 6a954e299d97f69a62fd184529fa7d5f29c0e09f Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Tue, 16 Jul 2024 15:02:20 -0700
Subject: [PATCH 522/842] Migrate expressions to pylibcudf (#16056)

xref #15162

Migrates expresions to use pylibcudf.

Authors:
  - Thomas Li (https://github.com/lithomas1)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/16056
---
 .../api_docs/pylibcudf/datetime.rst           |   6 +-
 .../api_docs/pylibcudf/expressions.rst        |   6 +
 .../user_guide/api_docs/pylibcudf/index.rst   |   1 +
 python/cudf/cudf/_lib/CMakeLists.txt          |   1 -
 python/cudf/cudf/_lib/__init__.py             |   3 +-
 python/cudf/cudf/_lib/expressions.pyx         | 156 --------------
 python/cudf/cudf/_lib/parquet.pyx             |   2 +-
 .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt   |   1 +
 python/cudf/cudf/_lib/pylibcudf/__init__.pxd  |   1 +
 python/cudf/cudf/_lib/pylibcudf/__init__.py   |   1 +
 .../cudf/_lib/{ => pylibcudf}/expressions.pxd |  29 ++-
 .../cudf/cudf/_lib/pylibcudf/expressions.pyx  | 195 ++++++++++++++++++
 .../_lib/pylibcudf/libcudf/CMakeLists.txt     |   4 +-
 .../_lib/pylibcudf/libcudf/expressions.pxd    | 103 ++++-----
 .../_lib/pylibcudf/libcudf/expressions.pyx    |   0
 python/cudf/cudf/_lib/transform.pyx           |   2 +-
 .../cudf/cudf/core/_internals/expressions.py  |  11 +-
 .../cudf/pylibcudf_tests/test_expressions.py  |  50 +++++
 18 files changed, 335 insertions(+), 237 deletions(-)
 create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/expressions.rst
 delete mode 100644 python/cudf/cudf/_lib/expressions.pyx
 rename python/cudf/cudf/_lib/{ => pylibcudf}/expressions.pxd (50%)
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/expressions.pyx
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/libcudf/expressions.pyx
 create mode 100644 python/cudf/cudf/pylibcudf_tests/test_expressions.py

diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/datetime.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/datetime.rst
index ebf5fab3052..558268ea495 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/datetime.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/datetime.rst
@@ -1,6 +1,6 @@
-=======
-copying
-=======
+========
+datetime
+========
 
 .. automodule:: cudf._lib.pylibcudf.datetime
    :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/expressions.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/expressions.rst
new file mode 100644
index 00000000000..03f769ee861
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/expressions.rst
@@ -0,0 +1,6 @@
+===========
+expressions
+===========
+
+.. automodule:: cudf._lib.pylibcudf.expressions
+   :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
index 5899d272160..505765bba0f 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
@@ -15,6 +15,7 @@ This page provides API documentation for pylibcudf.
     concatenate
     copying
     datetime
+    expressions
     filling
     gpumemoryview
     groupby
diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt
index 5a067e84f56..38b7e9ebe04 100644
--- a/python/cudf/cudf/_lib/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/CMakeLists.txt
@@ -21,7 +21,6 @@ set(cython_sources
     copying.pyx
     csv.pyx
     datetime.pyx
-    expressions.pyx
     filling.pyx
     groupby.pyx
     hash.pyx
diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py
index 18b95f5f2e1..34c0e29d0b1 100644
--- a/python/cudf/cudf/_lib/__init__.py
+++ b/python/cudf/cudf/_lib/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 import numpy as np
 
 from . import (
@@ -8,7 +8,6 @@
     copying,
     csv,
     datetime,
-    expressions,
     filling,
     groupby,
     hash,
diff --git a/python/cudf/cudf/_lib/expressions.pyx b/python/cudf/cudf/_lib/expressions.pyx
deleted file mode 100644
index 3fb29279ed7..00000000000
--- a/python/cudf/cudf/_lib/expressions.pyx
+++ /dev/null
@@ -1,156 +0,0 @@
-# Copyright (c) 2022-2024, NVIDIA CORPORATION.
-
-from enum import Enum
-
-import numpy as np
-
-from cython.operator cimport dereference
-from libc.stdint cimport int64_t
-from libcpp.memory cimport make_unique, unique_ptr
-from libcpp.string cimport string
-from libcpp.utility cimport move
-
-from cudf._lib.pylibcudf.libcudf cimport expressions as libcudf_exp
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
-from cudf._lib.pylibcudf.libcudf.wrappers.timestamps cimport (
-    timestamp_ms,
-    timestamp_us,
-)
-
-# Necessary for proper casting, see below.
-ctypedef int32_t underlying_type_ast_operator
-
-
-# Aliases for simplicity
-ctypedef unique_ptr[libcudf_exp.expression] expression_ptr
-
-
-class ASTOperator(Enum):
-    ADD = libcudf_exp.ast_operator.ADD
-    SUB = libcudf_exp.ast_operator.SUB
-    MUL = libcudf_exp.ast_operator.MUL
-    DIV = libcudf_exp.ast_operator.DIV
-    TRUE_DIV = libcudf_exp.ast_operator.TRUE_DIV
-    FLOOR_DIV = libcudf_exp.ast_operator.FLOOR_DIV
-    MOD = libcudf_exp.ast_operator.MOD
-    PYMOD = libcudf_exp.ast_operator.PYMOD
-    POW = libcudf_exp.ast_operator.POW
-    EQUAL = libcudf_exp.ast_operator.EQUAL
-    NULL_EQUAL = libcudf_exp.ast_operator.NULL_EQUAL
-    NOT_EQUAL = libcudf_exp.ast_operator.NOT_EQUAL
-    LESS = libcudf_exp.ast_operator.LESS
-    GREATER = libcudf_exp.ast_operator.GREATER
-    LESS_EQUAL = libcudf_exp.ast_operator.LESS_EQUAL
-    GREATER_EQUAL = libcudf_exp.ast_operator.GREATER_EQUAL
-    BITWISE_AND = libcudf_exp.ast_operator.BITWISE_AND
-    BITWISE_OR = libcudf_exp.ast_operator.BITWISE_OR
-    BITWISE_XOR = libcudf_exp.ast_operator.BITWISE_XOR
-    LOGICAL_AND = libcudf_exp.ast_operator.LOGICAL_AND
-    NULL_LOGICAL_AND = libcudf_exp.ast_operator.NULL_LOGICAL_AND
-    LOGICAL_OR = libcudf_exp.ast_operator.LOGICAL_OR
-    NULL_LOGICAL_OR = libcudf_exp.ast_operator.NULL_LOGICAL_OR
-    # Unary operators
-    IDENTITY = libcudf_exp.ast_operator.IDENTITY
-    IS_NULL = libcudf_exp.ast_operator.IS_NULL
-    SIN = libcudf_exp.ast_operator.SIN
-    COS = libcudf_exp.ast_operator.COS
-    TAN = libcudf_exp.ast_operator.TAN
-    ARCSIN = libcudf_exp.ast_operator.ARCSIN
-    ARCCOS = libcudf_exp.ast_operator.ARCCOS
-    ARCTAN = libcudf_exp.ast_operator.ARCTAN
-    SINH = libcudf_exp.ast_operator.SINH
-    COSH = libcudf_exp.ast_operator.COSH
-    TANH = libcudf_exp.ast_operator.TANH
-    ARCSINH = libcudf_exp.ast_operator.ARCSINH
-    ARCCOSH = libcudf_exp.ast_operator.ARCCOSH
-    ARCTANH = libcudf_exp.ast_operator.ARCTANH
-    EXP = libcudf_exp.ast_operator.EXP
-    LOG = libcudf_exp.ast_operator.LOG
-    SQRT = libcudf_exp.ast_operator.SQRT
-    CBRT = libcudf_exp.ast_operator.CBRT
-    CEIL = libcudf_exp.ast_operator.CEIL
-    FLOOR = libcudf_exp.ast_operator.FLOOR
-    ABS = libcudf_exp.ast_operator.ABS
-    RINT = libcudf_exp.ast_operator.RINT
-    BIT_INVERT = libcudf_exp.ast_operator.BIT_INVERT
-    NOT = libcudf_exp.ast_operator.NOT
-
-
-class TableReference(Enum):
-    LEFT = libcudf_exp.table_reference.LEFT
-    RIGHT = libcudf_exp.table_reference.RIGHT
-
-
-# Note that this function only currently supports numeric literals. libcudf
-# expressions don't really support other types yet though, so this isn't
-# restrictive at the moment.
-cdef class Literal(Expression):
-    def __cinit__(self, value):
-        if isinstance(value, int):
-            self.c_scalar.reset(new numeric_scalar[int64_t](value, True))
-            self.c_obj = <expression_ptr> move(make_unique[libcudf_exp.literal](
-                <numeric_scalar[int64_t] &>dereference(self.c_scalar)
-            ))
-        elif isinstance(value, float):
-            self.c_scalar.reset(new numeric_scalar[double](value, True))
-            self.c_obj = <expression_ptr> move(make_unique[libcudf_exp.literal](
-                <numeric_scalar[double] &>dereference(self.c_scalar)
-            ))
-        elif isinstance(value, str):
-            self.c_scalar.reset(new string_scalar(value.encode(), True))
-            self.c_obj = <expression_ptr> move(make_unique[libcudf_exp.literal](
-                <string_scalar &>dereference(self.c_scalar)
-            ))
-        elif isinstance(value, np.datetime64):
-            scale, _ = np.datetime_data(value.dtype)
-            int_value = value.astype(np.int64)
-            if scale == "ms":
-                self.c_scalar.reset(new timestamp_scalar[timestamp_ms](
-                    <int64_t>int_value, True)
-                )
-                self.c_obj = <expression_ptr> move(make_unique[libcudf_exp.literal](
-                    <timestamp_scalar[timestamp_ms] &>dereference(self.c_scalar)
-                ))
-            elif scale == "us":
-                self.c_scalar.reset(new timestamp_scalar[timestamp_us](
-                    <int64_t>int_value, True)
-                )
-                self.c_obj = <expression_ptr> move(make_unique[libcudf_exp.literal](
-                    <timestamp_scalar[timestamp_us] &>dereference(self.c_scalar)
-                ))
-            else:
-                raise NotImplementedError(
-                    f"Unhandled datetime scale {scale=}"
-                )
-        else:
-            raise NotImplementedError(
-                f"Don't know how to make literal with type {type(value)}"
-            )
-
-
-cdef class ColumnReference(Expression):
-    def __cinit__(self, size_type index):
-        self.c_obj = <expression_ptr>move(make_unique[libcudf_exp.column_reference](
-            index
-        ))
-
-
-cdef class Operation(Expression):
-    def __cinit__(self, op, Expression left, Expression right=None):
-        cdef libcudf_exp.ast_operator op_value = <libcudf_exp.ast_operator>(
-            <underlying_type_ast_operator> op.value
-        )
-
-        if right is None:
-            self.c_obj = <expression_ptr> move(make_unique[libcudf_exp.operation](
-                op_value, dereference(left.c_obj)
-            ))
-        else:
-            self.c_obj = <expression_ptr> move(make_unique[libcudf_exp.operation](
-                op_value, dereference(left.c_obj), dereference(right.c_obj)
-            ))
-
-cdef class ColumnNameReference(Expression):
-    def __cinit__(self, string name):
-        self.c_obj = <expression_ptr> \
-            move(make_unique[libcudf_exp.column_name_reference](name))
diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
index 158fb6051c3..e7959d21e01 100644
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ b/python/cudf/cudf/_lib/parquet.pyx
@@ -37,12 +37,12 @@ cimport cudf._lib.pylibcudf.libcudf.io.data_sink as cudf_io_data_sink
 cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types
 cimport cudf._lib.pylibcudf.libcudf.types as cudf_types
 from cudf._lib.column cimport Column
-from cudf._lib.expressions cimport Expression
 from cudf._lib.io.utils cimport (
     make_sinks_info,
     make_source_info,
     update_struct_field_names,
 )
+from cudf._lib.pylibcudf.expressions cimport Expression
 from cudf._lib.pylibcudf.io.datasource cimport NativeFileDatasource
 from cudf._lib.pylibcudf.libcudf.expressions cimport expression
 from cudf._lib.pylibcudf.libcudf.io.parquet cimport (
diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
index a2d11bbea6e..0800fa18e94 100644
--- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
@@ -20,6 +20,7 @@ set(cython_sources
     concatenate.pyx
     copying.pyx
     datetime.pyx
+    expressions.pyx
     filling.pyx
     gpumemoryview.pyx
     groupby.pyx
diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd
index da2b7806203..26e89b818d3 100644
--- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd
@@ -8,6 +8,7 @@ from . cimport (
     concatenate,
     copying,
     datetime,
+    expressions,
     filling,
     groupby,
     join,
diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py
index acbc84d7177..e89a5ed9f96 100644
--- a/python/cudf/cudf/_lib/pylibcudf/__init__.py
+++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py
@@ -7,6 +7,7 @@
     concatenate,
     copying,
     datetime,
+    expressions,
     filling,
     groupby,
     interop,
diff --git a/python/cudf/cudf/_lib/expressions.pxd b/python/cudf/cudf/_lib/pylibcudf/expressions.pxd
similarity index 50%
rename from python/cudf/cudf/_lib/expressions.pxd
rename to python/cudf/cudf/_lib/pylibcudf/expressions.pxd
index 4a20c5fc545..64825b89d9f 100644
--- a/python/cudf/cudf/_lib/expressions.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/expressions.pxd
@@ -1,36 +1,31 @@
-# Copyright (c) 2022-2024, NVIDIA CORPORATION.
-
-from libc.stdint cimport int32_t, int64_t
+# Copyright (c) 2024, NVIDIA CORPORATION.
 from libcpp.memory cimport unique_ptr
+from libcpp.string cimport string
 
 from cudf._lib.pylibcudf.libcudf.expressions cimport (
-    column_reference,
+    ast_operator,
     expression,
-    literal,
-    operation,
-)
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport (
-    numeric_scalar,
-    scalar,
-    string_scalar,
-    timestamp_scalar,
+    table_reference,
 )
 
+from .scalar cimport Scalar
+
 
 cdef class Expression:
     cdef unique_ptr[expression] c_obj
 
-
 cdef class Literal(Expression):
-    cdef unique_ptr[scalar] c_scalar
-
+    # Hold on to input scalar so it doesn't get gc'ed
+    cdef Scalar scalar
 
 cdef class ColumnReference(Expression):
     pass
 
-
 cdef class Operation(Expression):
-    pass
+    # Hold on to the input expressions so
+    # they don't get gc'ed
+    cdef Expression right
+    cdef Expression left
 
 cdef class ColumnNameReference(Expression):
     pass
diff --git a/python/cudf/cudf/_lib/pylibcudf/expressions.pyx b/python/cudf/cudf/_lib/pylibcudf/expressions.pyx
new file mode 100644
index 00000000000..38de11406ad
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/expressions.pyx
@@ -0,0 +1,195 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from cudf._lib.pylibcudf.libcudf.expressions import \
+    ast_operator as ASTOperator  # no-cython-lint
+from cudf._lib.pylibcudf.libcudf.expressions import \
+    table_reference as TableReference  # no-cython-lint
+
+from cython.operator cimport dereference
+from libc.stdint cimport int32_t, int64_t
+from libcpp.memory cimport make_unique, unique_ptr
+from libcpp.string cimport string
+from libcpp.utility cimport move
+
+from cudf._lib.pylibcudf.libcudf cimport expressions as libcudf_exp
+from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport (
+    duration_scalar,
+    numeric_scalar,
+    string_scalar,
+    timestamp_scalar,
+)
+from cudf._lib.pylibcudf.libcudf.types cimport size_type, type_id
+from cudf._lib.pylibcudf.libcudf.wrappers.durations cimport (
+    duration_ms,
+    duration_ns,
+    duration_s,
+    duration_us,
+)
+from cudf._lib.pylibcudf.libcudf.wrappers.timestamps cimport (
+    timestamp_ms,
+    timestamp_ns,
+    timestamp_s,
+    timestamp_us,
+)
+
+from .scalar cimport Scalar
+from .traits cimport is_chrono, is_numeric
+from .types cimport DataType
+
+# Aliases for simplicity
+ctypedef unique_ptr[libcudf_exp.expression] expression_ptr
+
+cdef class Literal(Expression):
+    """
+    A literal value used in an abstract syntax tree.
+
+    For details, see :cpp:class:`cudf::ast::literal`.
+
+    Parameters
+    ----------
+    value : Scalar
+        The Scalar value of the Literal.
+        Must be either numeric, string, or a timestamp/duration scalar.
+    """
+    def __cinit__(self, Scalar value):
+        self.scalar = value
+        cdef DataType typ = value.type()
+        cdef type_id tid = value.type().id()
+        if not (is_numeric(typ) or is_chrono(typ) or tid == type_id.STRING):
+            raise ValueError(
+                "Only numeric, string, or timestamp/duration scalars are accepted"
+            )
+        # TODO: Accept type-erased scalar in AST C++ code
+        # Then a lot of this code can be deleted
+        if tid == type_id.INT64:
+            self.c_obj = <expression_ptr> move(make_unique[libcudf_exp.literal](
+                <numeric_scalar[int64_t] &>dereference(self.scalar.c_obj)
+            ))
+        elif tid == type_id.INT32:
+            self.c_obj = <expression_ptr> move(make_unique[libcudf_exp.literal](
+                <numeric_scalar[int32_t] &>dereference(self.scalar.c_obj)
+            ))
+        elif tid == type_id.FLOAT64:
+            self.c_obj = <expression_ptr> move(make_unique[libcudf_exp.literal](
+                <numeric_scalar[double] &>dereference(self.scalar.c_obj)
+            ))
+        elif tid == type_id.FLOAT32:
+            self.c_obj = <expression_ptr> move(make_unique[libcudf_exp.literal](
+                <numeric_scalar[float] &>dereference(self.scalar.c_obj)
+            ))
+        elif tid == type_id.STRING:
+            self.c_obj = <expression_ptr> move(make_unique[libcudf_exp.literal](
+                <string_scalar &>dereference(self.scalar.c_obj)
+            ))
+        elif tid == type_id.TIMESTAMP_NANOSECONDS:
+            self.c_obj = <expression_ptr> move(make_unique[libcudf_exp.literal](
+                <timestamp_scalar[timestamp_ns] &>dereference(self.scalar.c_obj)
+            ))
+        elif tid == type_id.TIMESTAMP_MICROSECONDS:
+            self.c_obj = <expression_ptr> move(make_unique[libcudf_exp.literal](
+                <timestamp_scalar[timestamp_us] &>dereference(self.scalar.c_obj)
+            ))
+        elif tid == type_id.TIMESTAMP_MILLISECONDS:
+            self.c_obj = <expression_ptr> move(make_unique[libcudf_exp.literal](
+                <timestamp_scalar[timestamp_ms] &>dereference(self.scalar.c_obj)
+            ))
+        elif tid == type_id.TIMESTAMP_MILLISECONDS:
+            self.c_obj = <expression_ptr> move(make_unique[libcudf_exp.literal](
+                <timestamp_scalar[timestamp_ms] &>dereference(self.scalar.c_obj)
+            ))
+        elif tid == type_id.TIMESTAMP_SECONDS:
+            self.c_obj = <expression_ptr> move(make_unique[libcudf_exp.literal](
+                <timestamp_scalar[timestamp_s] &>dereference(self.scalar.c_obj)
+            ))
+        elif tid == type_id.DURATION_NANOSECONDS:
+            self.c_obj = <expression_ptr> move(make_unique[libcudf_exp.literal](
+                <duration_scalar[duration_ns] &>dereference(self.scalar.c_obj)
+            ))
+        elif tid == type_id.DURATION_MICROSECONDS:
+            self.c_obj = <expression_ptr> move(make_unique[libcudf_exp.literal](
+                <duration_scalar[duration_us] &>dereference(self.scalar.c_obj)
+            ))
+        elif tid == type_id.DURATION_MILLISECONDS:
+            self.c_obj = <expression_ptr> move(make_unique[libcudf_exp.literal](
+                <duration_scalar[duration_ms] &>dereference(self.scalar.c_obj)
+            ))
+        elif tid == type_id.DURATION_MILLISECONDS:
+            self.c_obj = <expression_ptr> move(make_unique[libcudf_exp.literal](
+                <duration_scalar[duration_ms] &>dereference(self.scalar.c_obj)
+            ))
+        elif tid == type_id.DURATION_SECONDS:
+            self.c_obj = <expression_ptr> move(make_unique[libcudf_exp.literal](
+                <duration_scalar[duration_s] &>dereference(self.scalar.c_obj)
+            ))
+        else:
+            raise NotImplementedError(
+                f"Don't know how to make literal with type id {tid}"
+            )
+
+cdef class ColumnReference(Expression):
+    """
+    An expression referring to data from a column in a table.
+
+    For details, see :cpp:class:`cudf::ast::column_reference`.
+
+    Parameters
+    ----------
+    index : size_type
+        The index of this column in the table
+        (provided when the expression is evaluated).
+    table_source : TableReference, default TableReferenece.LEFT
+        Which table to use in cases with two tables (e.g. joins)
+    """
+    def __cinit__(
+        self,
+        size_type index,
+        table_reference table_source=table_reference.LEFT
+    ):
+        self.c_obj = <expression_ptr>move(make_unique[libcudf_exp.column_reference](
+            index, table_source
+        ))
+
+
+cdef class Operation(Expression):
+    """
+    An operation expression holds an operator and zero or more operands.
+
+    For details, see :cpp:class:`cudf::ast::operation`.
+
+    Parameters
+    ----------
+    op : Operator
+    left : Expression
+        Left input expression (left operand)
+    right: Expression, default None
+        Right input expression (right operand).
+        You should only pass this if the input expression is a binary operation.
+    """
+    def __cinit__(self, ast_operator op, Expression left, Expression right=None):
+        self.left = left
+        self.right = right
+        if right is None:
+            self.c_obj = <expression_ptr> move(make_unique[libcudf_exp.operation](
+                op, dereference(left.c_obj)
+            ))
+        else:
+            self.c_obj = <expression_ptr> move(make_unique[libcudf_exp.operation](
+                op, dereference(left.c_obj), dereference(right.c_obj)
+            ))
+
+cdef class ColumnNameReference(Expression):
+    """
+    An expression referring to data from a column in a table.
+
+    For details, see :cpp:class:`cudf::ast::column_name_reference`.
+
+    Parameters
+    ----------
+    column_name : str
+        Name of this column in the table metadata
+        (provided when the expression is evaluated).
+    """
+    def __cinit__(self, str name):
+        self.c_obj = <expression_ptr> \
+            move(make_unique[libcudf_exp.column_name_reference](
+                <string>(name.encode("utf-8"))
+            ))
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt
index 699e85ce567..b04e94f1546 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt
@@ -12,8 +12,8 @@
 # the License.
 # =============================================================================
 
-set(cython_sources aggregation.pyx binaryop.pyx copying.pyx reduce.pyx replace.pyx round.pyx
-                   stream_compaction.pyx types.pyx unary.pyx
+set(cython_sources aggregation.pyx binaryop.pyx copying.pyx expressions.pyx reduce.pyx replace.pyx
+                   round.pyx stream_compaction.pyx types.pyx unary.pyx
 )
 
 set(linked_libraries cudf::cudf)
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/expressions.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/expressions.pxd
index 279d969db50..427e16d4ff8 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/expressions.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/expressions.pxd
@@ -1,5 +1,6 @@
 # Copyright (c) 2022-2024, NVIDIA CORPORATION.
 
+from libc.stdint cimport int32_t
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 
@@ -14,63 +15,63 @@ from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "cudf/ast/expressions.hpp" namespace "cudf::ast" nogil:
-    ctypedef enum ast_operator:
+    cpdef enum class ast_operator(int32_t):
         # Binary operators
-        ADD "cudf::ast::ast_operator::ADD"
-        SUB "cudf::ast::ast_operator::SUB"
-        MUL "cudf::ast::ast_operator::MUL"
-        DIV "cudf::ast::ast_operator::DIV"
-        TRUE_DIV "cudf::ast::ast_operator::TRUE_DIV"
-        FLOOR_DIV "cudf::ast::ast_operator::FLOOR_DIV"
-        MOD "cudf::ast::ast_operator::MOD"
-        PYMOD "cudf::ast::ast_operator::PYMOD"
-        POW "cudf::ast::ast_operator::POW"
-        EQUAL "cudf::ast::ast_operator::EQUAL"
-        NULL_EQUAL "cudf::ast::ast_operator::NULL_EQUAL"
-        NOT_EQUAL "cudf::ast::ast_operator::NOT_EQUAL"
-        LESS "cudf::ast::ast_operator::LESS"
-        GREATER "cudf::ast::ast_operator::GREATER"
-        LESS_EQUAL "cudf::ast::ast_operator::LESS_EQUAL"
-        GREATER_EQUAL "cudf::ast::ast_operator::GREATER_EQUAL"
-        BITWISE_AND "cudf::ast::ast_operator::BITWISE_AND"
-        BITWISE_OR "cudf::ast::ast_operator::BITWISE_OR"
-        BITWISE_XOR "cudf::ast::ast_operator::BITWISE_XOR"
-        NULL_LOGICAL_AND "cudf::ast::ast_operator::NULL_LOGICAL_AND"
-        LOGICAL_AND "cudf::ast::ast_operator::LOGICAL_AND"
-        NULL_LOGICAL_OR "cudf::ast::ast_operator::NULL_LOGICAL_OR"
-        LOGICAL_OR "cudf::ast::ast_operator::LOGICAL_OR"
+        ADD
+        SUB
+        MUL
+        DIV
+        TRUE_DIV
+        FLOOR_DIV
+        MOD
+        PYMOD
+        POW
+        EQUAL
+        NULL_EQUAL
+        NOT_EQUAL
+        LESS
+        GREATER
+        LESS_EQUAL
+        GREATER_EQUAL
+        BITWISE_AND
+        BITWISE_OR
+        BITWISE_XOR
+        NULL_LOGICAL_AND
+        LOGICAL_AND
+        NULL_LOGICAL_OR
+        LOGICAL_OR
         # Unary operators
-        IDENTITY "cudf::ast::ast_operator::IDENTITY"
-        IS_NULL "cudf::ast::ast_operator::IS_NULL"
-        SIN "cudf::ast::ast_operator::SIN"
-        COS "cudf::ast::ast_operator::COS"
-        TAN "cudf::ast::ast_operator::TAN"
-        ARCSIN "cudf::ast::ast_operator::ARCSIN"
-        ARCCOS "cudf::ast::ast_operator::ARCCOS"
-        ARCTAN "cudf::ast::ast_operator::ARCTAN"
-        SINH "cudf::ast::ast_operator::SINH"
-        COSH "cudf::ast::ast_operator::COSH"
-        TANH "cudf::ast::ast_operator::TANH"
-        ARCSINH "cudf::ast::ast_operator::ARCSINH"
-        ARCCOSH "cudf::ast::ast_operator::ARCCOSH"
-        ARCTANH "cudf::ast::ast_operator::ARCTANH"
-        EXP "cudf::ast::ast_operator::EXP"
-        LOG "cudf::ast::ast_operator::LOG"
-        SQRT "cudf::ast::ast_operator::SQRT"
-        CBRT "cudf::ast::ast_operator::CBRT"
-        CEIL "cudf::ast::ast_operator::CEIL"
-        FLOOR "cudf::ast::ast_operator::FLOOR"
-        ABS "cudf::ast::ast_operator::ABS"
-        RINT "cudf::ast::ast_operator::RINT"
-        BIT_INVERT "cudf::ast::ast_operator::BIT_INVERT"
-        NOT "cudf::ast::ast_operator::NOT"
+        IDENTITY
+        IS_NULL
+        SIN
+        COS
+        TAN
+        ARCSIN
+        ARCCOS
+        ARCTAN
+        SINH
+        COSH
+        TANH
+        ARCSINH
+        ARCCOSH
+        ARCTANH
+        EXP
+        LOG
+        SQRT
+        CBRT
+        CEIL
+        FLOOR
+        ABS
+        RINT
+        BIT_INVERT
+        NOT
 
     cdef cppclass expression:
         pass
 
-    ctypedef enum table_reference:
-        LEFT "cudf::ast::table_reference::LEFT"
-        RIGHT "cudf::ast::table_reference::RIGHT"
+    cpdef enum class table_reference(int32_t):
+        LEFT
+        RIGHT
 
     cdef cppclass literal(expression):
         # Due to https://github.com/cython/cython/issues/3198, we need to
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/expressions.pyx b/python/cudf/cudf/_lib/pylibcudf/libcudf/expressions.pyx
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx
index 86a4a60eef1..622725e06a3 100644
--- a/python/cudf/cudf/_lib/transform.pyx
+++ b/python/cudf/cudf/_lib/transform.pyx
@@ -19,8 +19,8 @@ from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer
 
 cimport cudf._lib.pylibcudf.libcudf.transform as libcudf_transform
 from cudf._lib.column cimport Column
-from cudf._lib.expressions cimport Expression
 from cudf._lib.pylibcudf cimport transform as plc_transform
+from cudf._lib.pylibcudf.expressions cimport Expression
 from cudf._lib.pylibcudf.libcudf.column.column cimport column
 from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
 from cudf._lib.pylibcudf.libcudf.expressions cimport expression
diff --git a/python/cudf/cudf/core/_internals/expressions.py b/python/cudf/cudf/core/_internals/expressions.py
index 393a68dd844..63714a78572 100644
--- a/python/cudf/cudf/core/_internals/expressions.py
+++ b/python/cudf/cudf/core/_internals/expressions.py
@@ -4,7 +4,10 @@
 import ast
 import functools
 
-from cudf._lib.expressions import (
+import pyarrow as pa
+
+import cudf._lib.pylibcudf as plc
+from cudf._lib.pylibcudf.expressions import (
     ASTOperator,
     ColumnReference,
     Expression,
@@ -122,7 +125,9 @@ def visit_Constant(self, node):
                 f"Unsupported literal {repr(node.value)} of type "
                 "{type(node.value).__name__}"
             )
-        self.stack.append(Literal(node.value))
+        self.stack.append(
+            Literal(plc.interop.from_arrow(pa.scalar(node.value)))
+        )
 
     def visit_UnaryOp(self, node):
         self.visit(node.operand)
@@ -132,7 +137,7 @@ def visit_UnaryOp(self, node):
             # operand, so there's no way to know whether this should be a float
             # or an int. We should maybe see what Spark does, and this will
             # probably require casting.
-            self.nodes.append(Literal(-1))
+            self.nodes.append(Literal(plc.interop.from_arrow(pa.scalar(-1))))
             op = ASTOperator.MUL
             self.stack.append(Operation(op, self.nodes[-1], self.nodes[-2]))
         elif isinstance(node.op, ast.UAdd):
diff --git a/python/cudf/cudf/pylibcudf_tests/test_expressions.py b/python/cudf/cudf/pylibcudf_tests/test_expressions.py
new file mode 100644
index 00000000000..f661512caad
--- /dev/null
+++ b/python/cudf/cudf/pylibcudf_tests/test_expressions.py
@@ -0,0 +1,50 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+import pyarrow as pa
+import pytest
+
+import cudf._lib.pylibcudf as plc
+
+# We can't really evaluate these expressions, so just make sure
+# construction works properly
+
+
+def test_literal_construction_invalid():
+    with pytest.raises(ValueError):
+        plc.expressions.Literal(
+            plc.interop.from_arrow(pa.scalar(None, type=pa.list_(pa.int64())))
+        )
+
+
+@pytest.mark.parametrize(
+    "tableref",
+    [
+        plc.expressions.TableReference.LEFT,
+        plc.expressions.TableReference.RIGHT,
+    ],
+)
+def test_columnref_construction(tableref):
+    plc.expressions.ColumnReference(1.0, tableref)
+
+
+def test_columnnameref_construction():
+    plc.expressions.ColumnNameReference("abc")
+
+
+@pytest.mark.parametrize(
+    "kwargs",
+    [
+        # Unary op
+        {
+            "op": plc.expressions.ASTOperator.IDENTITY,
+            "left": plc.expressions.ColumnReference(1),
+        },
+        # Binop
+        {
+            "op": plc.expressions.ASTOperator.ADD,
+            "left": plc.expressions.ColumnReference(1),
+            "right": plc.expressions.ColumnReference(2),
+        },
+    ],
+)
+def test_astoperation_construction(kwargs):
+    plc.expressions.Operation(**kwargs)

From 2f8d514b1687164a94bbe89da1dab8eb37682b35 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 16 Jul 2024 20:15:25 -0400
Subject: [PATCH 523/842] Remove xml from sort_ninja_log.py utility (#16274)

Removes xml support from the `sort_ninja_log.py` utility. The xml support was experimental for possible use with Jenkins reporting that never materialized.
This script is used in build.sh generally when running local builds.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Srinivas Yadav (https://github.com/srinivasyadav18)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16274
---
 cpp/scripts/sort_ninja_log.py | 58 ++++++-----------------------------
 1 file changed, 9 insertions(+), 49 deletions(-)

diff --git a/cpp/scripts/sort_ninja_log.py b/cpp/scripts/sort_ninja_log.py
index 3fe503f749e..42f84e4d0c7 100755
--- a/cpp/scripts/sort_ninja_log.py
+++ b/cpp/scripts/sort_ninja_log.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2021-2023, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 #
 import argparse
 import os
@@ -9,14 +9,12 @@
 from xml.dom import minidom
 
 parser = argparse.ArgumentParser()
-parser.add_argument(
-    "log_file", type=str, default=".ninja_log", help=".ninja_log file"
-)
+parser.add_argument("log_file", type=str, default=".ninja_log", help=".ninja_log file")
 parser.add_argument(
     "--fmt",
     type=str,
     default="csv",
-    choices=["csv", "xml", "html"],
+    choices=["csv", "html"],
     help="output format (to stdout)",
 )
 parser.add_argument(
@@ -37,6 +35,7 @@
 output_fmt = args.fmt
 cmp_file = args.cmp_log
 
+
 # build a map of the log entries
 def build_log_map(log_file):
     entries = {}
@@ -68,37 +67,6 @@ def build_log_map(log_file):
     return entries
 
 
-# output results in XML format
-def output_xml(entries, sorted_list, args):
-    root = ET.Element("testsuites")
-    testsuite = ET.Element(
-        "testsuite",
-        attrib={
-            "name": "build-time",
-            "tests": str(len(sorted_list)),
-            "failures": str(0),
-            "errors": str(0),
-        },
-    )
-    root.append(testsuite)
-    for name in sorted_list:
-        entry = entries[name]
-        build_time = float(entry[1] - entry[0]) / 1000
-        item = ET.Element(
-            "testcase",
-            attrib={
-                "classname": "BuildTime",
-                "name": name,
-                "time": str(build_time),
-            },
-        )
-        testsuite.append(item)
-
-    tree = ET.ElementTree(root)
-    xmlstr = minidom.parseString(ET.tostring(root)).toprettyxml(indent="   ")
-    print(xmlstr)
-
-
 # utility converts a millisecond value to a column width in pixels
 def time_to_width(value, end):
     # map a value from (0,end) to (0,1000)
@@ -282,9 +250,7 @@ def output_html(entries, sorted_list, cmp_entries, args):
 
     # output detail table in build-time descending order
     print("<table id='detail' bgcolor='#EEEEEE'>")
-    print(
-        "<tr><th>File</th>", "<th>Compile time</th>", "<th>Size</th>", sep=""
-    )
+    print("<tr><th>File</th>", "<th>Compile time</th>", "<th>Size</th>", sep="")
     if cmp_entries:
         print("<th>t-cmp</th>", sep="")
     print("</tr>")
@@ -303,9 +269,7 @@ def output_html(entries, sorted_list, cmp_entries, args):
         print("<td align='right'>", build_time_str, "</td>", sep="", end="")
         print("<td align='right'>", file_size_str, "</td>", sep="", end="")
         # output diff column
-        cmp_entry = (
-            cmp_entries[name] if cmp_entries and name in cmp_entries else None
-        )
+        cmp_entry = cmp_entries[name] if cmp_entries and name in cmp_entries else None
         if cmp_entry:
             diff_time = build_time - (cmp_entry[1] - cmp_entry[0])
             diff_time_str = format_build_time(diff_time)
@@ -353,7 +317,7 @@ def output_html(entries, sorted_list, cmp_entries, args):
         print(
             "<tr><td",
             white,
-            ">time change &lt; 20%% or build time &lt; 1 minute</td></tr>",
+            ">time change &lt; 20% or build time &lt; 1 minute</td></tr>",
         )
         print("</table>")
 
@@ -370,9 +334,7 @@ def output_csv(entries, sorted_list, cmp_entries, args):
         entry = entries[name]
         build_time = entry[1] - entry[0]
         file_size = entry[2]
-        cmp_entry = (
-            cmp_entries[name] if cmp_entries and name in cmp_entries else None
-        )
+        cmp_entry = cmp_entries[name] if cmp_entries and name in cmp_entries else None
         print(build_time, file_size, name, sep=",", end="")
         if cmp_entry:
             diff_time = build_time - (cmp_entry[1] - cmp_entry[0])
@@ -396,9 +358,7 @@ def output_csv(entries, sorted_list, cmp_entries, args):
 # load the comparison build log if available
 cmp_entries = build_log_map(cmp_file) if cmp_file else None
 
-if output_fmt == "xml":
-    output_xml(entries, sorted_list, args)
-elif output_fmt == "html":
+if output_fmt == "html":
     output_html(entries, sorted_list, cmp_entries, args)
 else:
     output_csv(entries, sorted_list, cmp_entries, args)

From 093bcc94ccf156a7e39339a7c4bb7e86543187de Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 16 Jul 2024 20:16:07 -0400
Subject: [PATCH 524/842] Update cudf::detail::grid_1d to use thread_index_type
 (#16276)

Updates the `cudf::detail::grid_1d` to use `thread_index_type` instead of `int` and `size_type` for the number threads and blocks.
This has become important for launching kernels with more threads than max `size_type` total bytes for warp-per-row and thread-per-byte algorithms.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/16276
---
 cpp/include/cudf/detail/utilities/cuda.cuh | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/cpp/include/cudf/detail/utilities/cuda.cuh b/cpp/include/cudf/detail/utilities/cuda.cuh
index f1775c6d6d7..5007af7f9f1 100644
--- a/cpp/include/cudf/detail/utilities/cuda.cuh
+++ b/cpp/include/cudf/detail/utilities/cuda.cuh
@@ -41,8 +41,8 @@ static constexpr size_type warp_size{32};
  */
 class grid_1d {
  public:
-  int const num_threads_per_block;
-  int const num_blocks;
+  thread_index_type const num_threads_per_block;
+  thread_index_type const num_blocks;
   /**
    * @param overall_num_elements The number of elements the kernel needs to
    * handle/process, in its main, one-dimensional/linear input (e.g. one or more
@@ -55,9 +55,9 @@ class grid_1d {
    * than a single element; this affects the number of threads the grid must
    * contain
    */
-  grid_1d(cudf::size_type overall_num_elements,
-          cudf::size_type num_threads_per_block,
-          cudf::size_type elements_per_thread = 1)
+  grid_1d(thread_index_type overall_num_elements,
+          thread_index_type num_threads_per_block,
+          thread_index_type elements_per_thread = 1)
     : num_threads_per_block(num_threads_per_block),
       num_blocks(util::div_rounding_up_safe(overall_num_elements,
                                             elements_per_thread * num_threads_per_block))

From aa466aaf91bc329cc4fced9b9a3426d79bfe7ffc Mon Sep 17 00:00:00 2001
From: Robert Maynard <robertjmaynard@gmail.com>
Date: Wed, 17 Jul 2024 10:48:17 -0400
Subject: [PATCH 525/842] Move kernel vis over to CUDF_HIDDEN (#16165)

Use CUDF_HIDDEN instead of the raw `__attribute__((visibility("hidden")))`  for symbol visibility controls on the CUDA kernels that we call from multiple TUs.  This is primarily a style change so that we have consistent visibility markup across the entire project

Authors:
  - Robert Maynard (https://github.com/robertmaynard)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/16165
---
 cpp/src/join/mixed_join_kernel.cuh      |  3 ++-
 cpp/src/join/mixed_join_kernels_semi.cu |  3 ++-
 cpp/src/join/mixed_join_size_kernel.cuh | 28 ++++++++++++-------------
 3 files changed, 18 insertions(+), 16 deletions(-)

diff --git a/cpp/src/join/mixed_join_kernel.cuh b/cpp/src/join/mixed_join_kernel.cuh
index 0fc1c3718b1..ea59f23c77f 100644
--- a/cpp/src/join/mixed_join_kernel.cuh
+++ b/cpp/src/join/mixed_join_kernel.cuh
@@ -24,6 +24,7 @@
 #include <cudf/ast/detail/expression_parser.hpp>
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/table/table_device_view.cuh>
+#include <cudf/utilities/export.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <cooperative_groups.h>
@@ -38,7 +39,7 @@ namespace cg = cooperative_groups;
 #pragma GCC diagnostic ignored "-Wattributes"
 
 template <cudf::size_type block_size, bool has_nulls>
-__attribute__((visibility("hidden"))) __launch_bounds__(block_size) __global__
+CUDF_HIDDEN __launch_bounds__(block_size) __global__
   void mixed_join(table_device_view left_table,
                   table_device_view right_table,
                   table_device_view probe,
diff --git a/cpp/src/join/mixed_join_kernels_semi.cu b/cpp/src/join/mixed_join_kernels_semi.cu
index 01e3fe09b38..1f31eaa7878 100644
--- a/cpp/src/join/mixed_join_kernels_semi.cu
+++ b/cpp/src/join/mixed_join_kernels_semi.cu
@@ -22,6 +22,7 @@
 #include <cudf/ast/detail/expression_parser.hpp>
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/table/table_device_view.cuh>
+#include <cudf/utilities/export.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <cub/cub.cuh>
@@ -34,7 +35,7 @@ namespace cg = cooperative_groups;
 #pragma GCC diagnostic ignored "-Wattributes"
 
 template <cudf::size_type block_size, bool has_nulls>
-__attribute__((visibility("hidden"))) __launch_bounds__(block_size) __global__
+CUDF_HIDDEN __launch_bounds__(block_size) __global__
   void mixed_join_semi(table_device_view left_table,
                        table_device_view right_table,
                        table_device_view probe,
diff --git a/cpp/src/join/mixed_join_size_kernel.cuh b/cpp/src/join/mixed_join_size_kernel.cuh
index 618e7a9082e..00a90f8273f 100644
--- a/cpp/src/join/mixed_join_size_kernel.cuh
+++ b/cpp/src/join/mixed_join_size_kernel.cuh
@@ -22,6 +22,7 @@
 #include <cudf/ast/detail/expression_parser.hpp>
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/table/table_device_view.cuh>
+#include <cudf/utilities/export.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <cooperative_groups.h>
@@ -35,20 +36,19 @@ namespace cg = cooperative_groups;
 #pragma GCC diagnostic ignored "-Wattributes"
 
 template <int block_size, bool has_nulls>
-__attribute__((visibility("hidden"))) __launch_bounds__(block_size) __global__
-  void compute_mixed_join_output_size(
-    table_device_view left_table,
-    table_device_view right_table,
-    table_device_view probe,
-    table_device_view build,
-    row_hash const hash_probe,
-    row_equality const equality_probe,
-    join_kind const join_type,
-    cudf::detail::mixed_multimap_type::device_view hash_table_view,
-    ast::detail::expression_device_view device_expression_data,
-    bool const swap_tables,
-    std::size_t* output_size,
-    cudf::device_span<cudf::size_type> matches_per_row)
+CUDF_HIDDEN __launch_bounds__(block_size) __global__ void compute_mixed_join_output_size(
+  table_device_view left_table,
+  table_device_view right_table,
+  table_device_view probe,
+  table_device_view build,
+  row_hash const hash_probe,
+  row_equality const equality_probe,
+  join_kind const join_type,
+  cudf::detail::mixed_multimap_type::device_view hash_table_view,
+  ast::detail::expression_device_view device_expression_data,
+  bool const swap_tables,
+  std::size_t* output_size,
+  cudf::device_span<cudf::size_type> matches_per_row)
 {
   // The (required) extern storage of the shared memory array leads to
   // conflicting declarations between different templates. The easiest

From 9db6723f2f2fe3451f0a5b81b7a43597358913ea Mon Sep 17 00:00:00 2001
From: jakirkham <jakirkham@gmail.com>
Date: Wed, 17 Jul 2024 09:54:09 -0700
Subject: [PATCH 526/842] Rename `.devcontainer`s for CUDA 12.5 (#16293)

Follow up to PR: https://github.com/rapidsai/cudf/pull/16259
Partially addresses issue: https://github.com/rapidsai/build-planning/issues/73

Renames the `.devcontainer`s for CUDA 12.5

Authors:
  - https://github.com/jakirkham

Approvers:
  - James Lamb (https://github.com/jameslamb)
  - Paul Taylor (https://github.com/trxcllnt)

URL: https://github.com/rapidsai/cudf/pull/16293
---
 .../{cuda12.2-conda => cuda12.5-conda}/devcontainer.json          | 0
 .devcontainer/{cuda12.2-pip => cuda12.5-pip}/devcontainer.json    | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 rename .devcontainer/{cuda12.2-conda => cuda12.5-conda}/devcontainer.json (100%)
 rename .devcontainer/{cuda12.2-pip => cuda12.5-pip}/devcontainer.json (100%)

diff --git a/.devcontainer/cuda12.2-conda/devcontainer.json b/.devcontainer/cuda12.5-conda/devcontainer.json
similarity index 100%
rename from .devcontainer/cuda12.2-conda/devcontainer.json
rename to .devcontainer/cuda12.5-conda/devcontainer.json
diff --git a/.devcontainer/cuda12.2-pip/devcontainer.json b/.devcontainer/cuda12.5-pip/devcontainer.json
similarity index 100%
rename from .devcontainer/cuda12.2-pip/devcontainer.json
rename to .devcontainer/cuda12.5-pip/devcontainer.json

From 1dd63ea8b28339c3b4a351b82dd81d425d985ba3 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 17 Jul 2024 08:47:32 -1000
Subject: [PATCH 527/842] Short circuit some Column methods (#16246)

Adds some short circuiting, possibly cached checks (e.g. all values unique, no-NAs, monotonicity), to `dropna`, `isnull`, `notnull`, `argsort`, `unique` and `sort_values` allowing these ops to just copy / return a "simplified" result

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16246
---
 python/cudf/cudf/_lib/column.pyx       | 12 ++++---
 python/cudf/cudf/core/column/column.py | 50 ++++++++++++++++++++------
 2 files changed, 46 insertions(+), 16 deletions(-)

diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx
index 7155017b7af..e030147fdd3 100644
--- a/python/cudf/cudf/_lib/column.pyx
+++ b/python/cudf/cudf/_lib/column.pyx
@@ -202,11 +202,13 @@ cdef class Column:
 
     def _clear_cache(self):
         self._distinct_count = {}
-        try:
-            del self.memory_usage
-        except AttributeError:
-            # `self.memory_usage` was never called before, So ignore.
-            pass
+        attrs = ("memory_usage", "is_monotonic_increasing", "is_monotonic_decreasing")
+        for attr in attrs:
+            try:
+                delattr(self, attr)
+            except AttributeError:
+                # attr was not called yet, so ignore.
+                pass
         self._null_count = None
 
     def set_mask(self, value):
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index dbdf501e022..9467bbeed15 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -274,7 +274,10 @@ def any(self, skipna: bool = True) -> bool:
         return libcudf.reduce.reduce("any", self, dtype=np.bool_)
 
     def dropna(self) -> Self:
-        return drop_nulls([self])[0]._with_type_metadata(self.dtype)
+        if self.has_nulls():
+            return drop_nulls([self])[0]._with_type_metadata(self.dtype)
+        else:
+            return self.copy()
 
     def to_arrow(self) -> pa.Array:
         """Convert to PyArrow Array
@@ -699,6 +702,9 @@ def fillna(
 
     def isnull(self) -> ColumnBase:
         """Identify missing values in a Column."""
+        if not self.has_nulls(include_nan=self.dtype.kind == "f"):
+            return as_column(False, length=len(self))
+
         result = libcudf.unary.is_null(self)
 
         if self.dtype.kind == "f":
@@ -710,6 +716,9 @@ def isnull(self) -> ColumnBase:
 
     def notnull(self) -> ColumnBase:
         """Identify non-missing values in a Column."""
+        if not self.has_nulls(include_nan=self.dtype.kind == "f"):
+            return as_column(True, length=len(self))
+
         result = libcudf.unary.is_valid(self)
 
         if self.dtype.kind == "f":
@@ -922,15 +931,16 @@ def as_mask(self) -> Buffer:
 
     @property
     def is_unique(self) -> bool:
+        # distinct_count might already be cached
         return self.distinct_count(dropna=False) == len(self)
 
-    @property
+    @cached_property
     def is_monotonic_increasing(self) -> bool:
         return not self.has_nulls(include_nan=True) and libcudf.sort.is_sorted(
             [self], [True], None
         )
 
-    @property
+    @cached_property
     def is_monotonic_decreasing(self) -> bool:
         return not self.has_nulls(include_nan=True) and libcudf.sort.is_sorted(
             [self], [False], None
@@ -941,6 +951,10 @@ def sort_values(
         ascending: bool = True,
         na_position: str = "last",
     ) -> ColumnBase:
+        if (not ascending and self.is_monotonic_decreasing) or (
+            ascending and self.is_monotonic_increasing
+        ):
+            return self.copy()
         return libcudf.sort.sort(
             [self], column_order=[ascending], null_precedence=[na_position]
         )[0]
@@ -1090,11 +1104,22 @@ def apply_boolean_mask(self, mask) -> ColumnBase:
         )
 
     def argsort(
-        self, ascending: bool = True, na_position: str = "last"
-    ) -> "cudf.core.column.NumericalColumn":
-        return libcudf.sort.order_by(
-            [self], [ascending], na_position, stable=True
-        )
+        self,
+        ascending: bool = True,
+        na_position: Literal["first", "last"] = "last",
+    ) -> cudf.core.column.NumericalColumn:
+        if (ascending and self.is_monotonic_increasing) or (
+            not ascending and self.is_monotonic_decreasing
+        ):
+            return as_column(range(len(self)))
+        elif (ascending and self.is_monotonic_decreasing) or (
+            not ascending and self.is_monotonic_increasing
+        ):
+            return as_column(range(len(self) - 1, -1, -1))
+        else:
+            return libcudf.sort.order_by(
+                [self], [ascending], na_position, stable=True
+            )
 
     def __arrow_array__(self, type=None):
         raise TypeError(
@@ -1157,9 +1182,12 @@ def unique(self) -> ColumnBase:
         """
         Get unique values in the data
         """
-        return drop_duplicates([self], keep="first")[0]._with_type_metadata(
-            self.dtype
-        )
+        if self.is_unique:
+            return self.copy()
+        else:
+            return drop_duplicates([self], keep="first")[
+                0
+            ]._with_type_metadata(self.dtype)
 
     def serialize(self) -> tuple[dict, list]:
         # data model:

From 8b767e5c237840e0a35848bff7ed479ec5c56bb1 Mon Sep 17 00:00:00 2001
From: Paul Mattione <156858817+pmattione-nvidia@users.noreply.github.com>
Date: Wed, 17 Jul 2024 13:33:45 -0600
Subject: [PATCH 528/842] Remove decimal/floating 64/128bit switches due to
 register pressure (#16287)

The decimal <--> floating conversion PR reduced the performance of some of the AST and BINARYOP kernels due to register pressure.  This removes the switches that are the primary source of the register pressure, falling back to the old ipow() method for 64bit and 128bit integers.

Authors:
  - Paul Mattione (https://github.com/pmattione-nvidia)

Approvers:
  - Jayjeet Chakraborty (https://github.com/JayjeetAtGithub)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/16287
---
 cpp/include/cudf/fixed_point/fixed_point.hpp  |   4 +-
 .../cudf/fixed_point/floating_conversion.hpp  | 138 +-----------------
 2 files changed, 6 insertions(+), 136 deletions(-)

diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp
index 6c3c3b4da07..c9cbc603226 100644
--- a/cpp/include/cudf/fixed_point/fixed_point.hpp
+++ b/cpp/include/cudf/fixed_point/fixed_point.hpp
@@ -84,8 +84,8 @@ template <typename Rep,
           Radix Base,
           typename T,
           typename cuda::std::enable_if_t<(cuda::std::is_same_v<int32_t, T> &&
-                                           is_supported_representation_type<Rep>())>* = nullptr>
-CUDF_HOST_DEVICE inline Rep ipow(T exponent)
+                                           cuda::std::is_integral_v<Rep>)>* = nullptr>
+CUDF_HOST_DEVICE inline constexpr Rep ipow(T exponent)
 {
   cudf_assert(exponent >= 0 && "integer exponentiation with negative exponent is not possible.");
 
diff --git a/cpp/include/cudf/fixed_point/floating_conversion.hpp b/cpp/include/cudf/fixed_point/floating_conversion.hpp
index c64ae8877d4..f12177c6a4b 100644
--- a/cpp/include/cudf/fixed_point/floating_conversion.hpp
+++ b/cpp/include/cudf/fixed_point/floating_conversion.hpp
@@ -392,30 +392,7 @@ CUDF_HOST_DEVICE inline T divide_power10_32bit(T value, int pow10)
 template <typename T, CUDF_ENABLE_IF(cuda::std::is_unsigned_v<T>)>
 CUDF_HOST_DEVICE inline T divide_power10_64bit(T value, int pow10)
 {
-  // See comments in divide_power10_32bit() for discussion.
-  switch (pow10) {
-    case 0: return value;
-    case 1: return value / 10U;
-    case 2: return value / 100U;
-    case 3: return value / 1000U;
-    case 4: return value / 10000U;
-    case 5: return value / 100000U;
-    case 6: return value / 1000000U;
-    case 7: return value / 10000000U;
-    case 8: return value / 100000000U;
-    case 9: return value / 1000000000U;
-    case 10: return value / 10000000000ULL;
-    case 11: return value / 100000000000ULL;
-    case 12: return value / 1000000000000ULL;
-    case 13: return value / 10000000000000ULL;
-    case 14: return value / 100000000000000ULL;
-    case 15: return value / 1000000000000000ULL;
-    case 16: return value / 10000000000000000ULL;
-    case 17: return value / 100000000000000000ULL;
-    case 18: return value / 1000000000000000000ULL;
-    case 19: return value / 10000000000000000000ULL;
-    default: return 0;
-  }
+  return value / ipow<uint64_t, Radix::BASE_10>(pow10);
 }
 
 /**
@@ -429,49 +406,7 @@ CUDF_HOST_DEVICE inline T divide_power10_64bit(T value, int pow10)
 template <typename T, CUDF_ENABLE_IF(cuda::std::is_unsigned_v<T>)>
 CUDF_HOST_DEVICE inline constexpr T divide_power10_128bit(T value, int pow10)
 {
-  // See comments in divide_power10_32bit() for an introduction.
-  switch (pow10) {
-    case 0: return value;
-    case 1: return value / 10U;
-    case 2: return value / 100U;
-    case 3: return value / 1000U;
-    case 4: return value / 10000U;
-    case 5: return value / 100000U;
-    case 6: return value / 1000000U;
-    case 7: return value / 10000000U;
-    case 8: return value / 100000000U;
-    case 9: return value / 1000000000U;
-    case 10: return value / 10000000000ULL;
-    case 11: return value / 100000000000ULL;
-    case 12: return value / 1000000000000ULL;
-    case 13: return value / 10000000000000ULL;
-    case 14: return value / 100000000000000ULL;
-    case 15: return value / 1000000000000000ULL;
-    case 16: return value / 10000000000000000ULL;
-    case 17: return value / 100000000000000000ULL;
-    case 18: return value / 1000000000000000000ULL;
-    case 19: return value / 10000000000000000000ULL;
-    case 20: return value / large_power_of_10<20>();
-    case 21: return value / large_power_of_10<21>();
-    case 22: return value / large_power_of_10<22>();
-    case 23: return value / large_power_of_10<23>();
-    case 24: return value / large_power_of_10<24>();
-    case 25: return value / large_power_of_10<25>();
-    case 26: return value / large_power_of_10<26>();
-    case 27: return value / large_power_of_10<27>();
-    case 28: return value / large_power_of_10<28>();
-    case 29: return value / large_power_of_10<29>();
-    case 30: return value / large_power_of_10<30>();
-    case 31: return value / large_power_of_10<31>();
-    case 32: return value / large_power_of_10<32>();
-    case 33: return value / large_power_of_10<33>();
-    case 34: return value / large_power_of_10<34>();
-    case 35: return value / large_power_of_10<35>();
-    case 36: return value / large_power_of_10<36>();
-    case 37: return value / large_power_of_10<37>();
-    case 38: return value / large_power_of_10<38>();
-    default: return 0;
-  }
+  return value / ipow<__uint128_t, Radix::BASE_10>(pow10);
 }
 
 /**
@@ -512,30 +447,7 @@ CUDF_HOST_DEVICE inline constexpr T multiply_power10_32bit(T value, int pow10)
 template <typename T, CUDF_ENABLE_IF(cuda::std::is_unsigned_v<T>)>
 CUDF_HOST_DEVICE inline constexpr T multiply_power10_64bit(T value, int pow10)
 {
-  // See comments in divide_power10_32bit() for discussion.
-  switch (pow10) {
-    case 0: return value;
-    case 1: return value * 10U;
-    case 2: return value * 100U;
-    case 3: return value * 1000U;
-    case 4: return value * 10000U;
-    case 5: return value * 100000U;
-    case 6: return value * 1000000U;
-    case 7: return value * 10000000U;
-    case 8: return value * 100000000U;
-    case 9: return value * 1000000000U;
-    case 10: return value * 10000000000ULL;
-    case 11: return value * 100000000000ULL;
-    case 12: return value * 1000000000000ULL;
-    case 13: return value * 10000000000000ULL;
-    case 14: return value * 100000000000000ULL;
-    case 15: return value * 1000000000000000ULL;
-    case 16: return value * 10000000000000000ULL;
-    case 17: return value * 100000000000000000ULL;
-    case 18: return value * 1000000000000000000ULL;
-    case 19: return value * 10000000000000000000ULL;
-    default: return 0;
-  }
+  return value * ipow<uint64_t, Radix::BASE_10>(pow10);
 }
 
 /**
@@ -549,49 +461,7 @@ CUDF_HOST_DEVICE inline constexpr T multiply_power10_64bit(T value, int pow10)
 template <typename T, CUDF_ENABLE_IF(cuda::std::is_unsigned_v<T>)>
 CUDF_HOST_DEVICE inline constexpr T multiply_power10_128bit(T value, int pow10)
 {
-  // See comments in divide_power10_128bit() for discussion.
-  switch (pow10) {
-    case 0: return value;
-    case 1: return value * 10U;
-    case 2: return value * 100U;
-    case 3: return value * 1000U;
-    case 4: return value * 10000U;
-    case 5: return value * 100000U;
-    case 6: return value * 1000000U;
-    case 7: return value * 10000000U;
-    case 8: return value * 100000000U;
-    case 9: return value * 1000000000U;
-    case 10: return value * 10000000000ULL;
-    case 11: return value * 100000000000ULL;
-    case 12: return value * 1000000000000ULL;
-    case 13: return value * 10000000000000ULL;
-    case 14: return value * 100000000000000ULL;
-    case 15: return value * 1000000000000000ULL;
-    case 16: return value * 10000000000000000ULL;
-    case 17: return value * 100000000000000000ULL;
-    case 18: return value * 1000000000000000000ULL;
-    case 19: return value * 10000000000000000000ULL;
-    case 20: return value * large_power_of_10<20>();
-    case 21: return value * large_power_of_10<21>();
-    case 22: return value * large_power_of_10<22>();
-    case 23: return value * large_power_of_10<23>();
-    case 24: return value * large_power_of_10<24>();
-    case 25: return value * large_power_of_10<25>();
-    case 26: return value * large_power_of_10<26>();
-    case 27: return value * large_power_of_10<27>();
-    case 28: return value * large_power_of_10<28>();
-    case 29: return value * large_power_of_10<29>();
-    case 30: return value * large_power_of_10<30>();
-    case 31: return value * large_power_of_10<31>();
-    case 32: return value * large_power_of_10<32>();
-    case 33: return value * large_power_of_10<33>();
-    case 34: return value * large_power_of_10<34>();
-    case 35: return value * large_power_of_10<35>();
-    case 36: return value * large_power_of_10<36>();
-    case 37: return value * large_power_of_10<37>();
-    case 38: return value * large_power_of_10<38>();
-    default: return 0;
-  }
+  return value * ipow<__uint128_t, Radix::BASE_10>(pow10);
 }
 
 /**

From 34dea6fe40fc20966b48257853865111df4a687f Mon Sep 17 00:00:00 2001
From: Jayjeet Chakraborty <jc.github@rediffmail.com>
Date: Wed, 17 Jul 2024 15:27:40 -0700
Subject: [PATCH 529/842] Add TPC-H inspired examples for Libcudf (#16088)

This PR adds a suite of `libcudf` examples with queries inspired from the TPC-H benchmarks. This PR also adds some reusable helper functions to perform operations such as joins, groubys, and orderbys for a cleaner and modular implementation of the queries.

# Queries implemented so far:
- [x] Query 1
- [X] Query 5
- [X] Query 6
- [X] Query 9

Authors:
  - Jayjeet Chakraborty (https://github.com/JayjeetAtGithub)
  - Muhammad Haseeb (https://github.com/mhaseeb123)

Approvers:
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - Karthikeyan (https://github.com/karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/16088
---
 cpp/examples/build.sh                  |   1 +
 cpp/examples/parquet_io/parquet_io.cpp |   4 +-
 cpp/examples/parquet_io/parquet_io.hpp |  31 --
 cpp/examples/tpch/CMakeLists.txt       |  32 ++
 cpp/examples/tpch/README.md            |  38 ++
 cpp/examples/tpch/q1.cpp               | 174 ++++++++++
 cpp/examples/tpch/q5.cpp               | 169 +++++++++
 cpp/examples/tpch/q6.cpp               | 137 ++++++++
 cpp/examples/tpch/q9.cpp               | 182 ++++++++++
 cpp/examples/tpch/utils.hpp            | 457 +++++++++++++++++++++++++
 cpp/examples/utilities/timer.hpp       |  54 +++
 11 files changed, 1247 insertions(+), 32 deletions(-)
 create mode 100644 cpp/examples/tpch/CMakeLists.txt
 create mode 100644 cpp/examples/tpch/README.md
 create mode 100644 cpp/examples/tpch/q1.cpp
 create mode 100644 cpp/examples/tpch/q5.cpp
 create mode 100644 cpp/examples/tpch/q6.cpp
 create mode 100644 cpp/examples/tpch/q9.cpp
 create mode 100644 cpp/examples/tpch/utils.hpp
 create mode 100644 cpp/examples/utilities/timer.hpp

diff --git a/cpp/examples/build.sh b/cpp/examples/build.sh
index bde6ef7d69c..dce81fb1677 100755
--- a/cpp/examples/build.sh
+++ b/cpp/examples/build.sh
@@ -57,6 +57,7 @@ build_example() {
 }
 
 build_example basic
+build_example tpch
 build_example strings
 build_example nested_types
 build_example parquet_io
diff --git a/cpp/examples/parquet_io/parquet_io.cpp b/cpp/examples/parquet_io/parquet_io.cpp
index 8be17db3781..274a2599189 100644
--- a/cpp/examples/parquet_io/parquet_io.cpp
+++ b/cpp/examples/parquet_io/parquet_io.cpp
@@ -16,6 +16,8 @@
 
 #include "parquet_io.hpp"
 
+#include "../utilities/timer.hpp"
+
 /**
  * @file parquet_io.cpp
  * @brief Demonstrates usage of the libcudf APIs to read and write
@@ -140,7 +142,7 @@ int main(int argc, char const** argv)
             << page_stat_string << ".." << std::endl;
 
   // `timer` is automatically started here
-  Timer timer;
+  cudf::examples::timer timer;
   write_parquet(input->view(), metadata, output_filepath, encoding, compression, page_stats);
   timer.print_elapsed_millis();
 
diff --git a/cpp/examples/parquet_io/parquet_io.hpp b/cpp/examples/parquet_io/parquet_io.hpp
index d2fc359a2fe..e27cbec4fce 100644
--- a/cpp/examples/parquet_io/parquet_io.hpp
+++ b/cpp/examples/parquet_io/parquet_io.hpp
@@ -124,34 +124,3 @@ std::shared_ptr<rmm::mr::device_memory_resource> create_memory_resource(bool is_
 
   return std::nullopt;
 }
-
-/**
- * @brief Light-weight timer for parquet reader and writer instrumentation
- *
- * Timer object constructed from std::chrono, instrumenting at microseconds
- * precision. Can display elapsed durations at milli and micro second
- * scales. Timer starts at object construction.
- */
-class Timer {
- public:
-  using micros = std::chrono::microseconds;
-  using millis = std::chrono::milliseconds;
-
-  Timer() { reset(); }
-  void reset() { start_time = std::chrono::high_resolution_clock::now(); }
-  auto elapsed() { return (std::chrono::high_resolution_clock::now() - start_time); }
-  void print_elapsed_micros()
-  {
-    std::cout << "Elapsed Time: " << std::chrono::duration_cast<micros>(elapsed()).count()
-              << "us\n\n";
-  }
-  void print_elapsed_millis()
-  {
-    std::cout << "Elapsed Time: " << std::chrono::duration_cast<millis>(elapsed()).count()
-              << "ms\n\n";
-  }
-
- private:
-  using time_point_t = std::chrono::time_point<std::chrono::high_resolution_clock>;
-  time_point_t start_time;
-};
diff --git a/cpp/examples/tpch/CMakeLists.txt b/cpp/examples/tpch/CMakeLists.txt
new file mode 100644
index 00000000000..1b91d07e148
--- /dev/null
+++ b/cpp/examples/tpch/CMakeLists.txt
@@ -0,0 +1,32 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+cmake_minimum_required(VERSION 3.26.4)
+
+include(../set_cuda_architecture.cmake)
+
+rapids_cuda_init_architectures(tpch_example)
+rapids_cuda_set_architectures(RAPIDS)
+
+project(
+  tpch_example
+  VERSION 0.0.1
+  LANGUAGES CXX CUDA
+)
+
+include(../fetch_dependencies.cmake)
+
+add_executable(tpch_q1 q1.cpp)
+target_link_libraries(tpch_q1 PRIVATE cudf::cudf)
+target_compile_features(tpch_q1 PRIVATE cxx_std_17)
+
+add_executable(tpch_q5 q5.cpp)
+target_link_libraries(tpch_q5 PRIVATE cudf::cudf)
+target_compile_features(tpch_q5 PRIVATE cxx_std_17)
+
+add_executable(tpch_q6 q6.cpp)
+target_link_libraries(tpch_q6 PRIVATE cudf::cudf)
+target_compile_features(tpch_q6 PRIVATE cxx_std_17)
+
+add_executable(tpch_q9 q9.cpp)
+target_link_libraries(tpch_q9 PRIVATE cudf::cudf)
+target_compile_features(tpch_q9 PRIVATE cxx_std_17)
diff --git a/cpp/examples/tpch/README.md b/cpp/examples/tpch/README.md
new file mode 100644
index 00000000000..1ea71ae9824
--- /dev/null
+++ b/cpp/examples/tpch/README.md
@@ -0,0 +1,38 @@
+# TPC-H Inspired Examples
+
+Implements TPC-H queries using `libcudf`. We leverage the data generator (wrapper around official TPC-H datagen) from [Apache Datafusion](https://github.com/apache/datafusion) for generating data in Parquet format.
+
+## Requirements
+
+- Rust
+
+## Generating the Dataset
+
+1. Clone the datafusion repository.
+```bash
+git clone git@github.com:apache/datafusion.git
+```
+
+2. Run the data generator. The data will be placed in a `data/` subdirectory.
+```bash
+cd datafusion/benchmarks/
+./bench.sh data tpch
+
+# for scale factor 10,
+./bench.sh data tpch10
+```
+
+## Running Queries
+
+1. Build the examples.
+```bash
+cd cpp/examples
+./build.sh
+```
+The TPC-H query binaries would be built inside `examples/tpch/build`.
+
+2. Execute the queries.
+```bash
+./tpch/build/tpch_q1
+```
+A parquet file named `q1.parquet` would be generated holding the results of the query.
diff --git a/cpp/examples/tpch/q1.cpp b/cpp/examples/tpch/q1.cpp
new file mode 100644
index 00000000000..1bdf039da4a
--- /dev/null
+++ b/cpp/examples/tpch/q1.cpp
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../utilities/timer.hpp"
+#include "utils.hpp"
+
+#include <cudf/ast/expressions.hpp>
+#include <cudf/column/column.hpp>
+#include <cudf/scalar/scalar.hpp>
+
+/**
+ * @file q1.cpp
+ * @brief Implement query 1 of the TPC-H benchmark.
+ *
+ * create view lineitem as select * from '/tables/scale-1/lineitem.parquet';
+ *
+ * select
+ *    l_returnflag,
+ *    l_linestatus,
+ *    sum(l_quantity) as sum_qty,
+ *    sum(l_extendedprice) as sum_base_price,
+ *    sum(l_extendedprice * (1 - l_discount)) as sum_disc_price,
+ *    sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge,
+ *    avg(l_quantity) as avg_qty,
+ *    avg(l_extendedprice) as avg_price,
+ *    avg(l_discount) as avg_disc,
+ *    count(*) as count_order
+ * from
+ *    lineitem
+ * where
+ *    l_shipdate <= date '1998-09-02'
+ * group by
+ *    l_returnflag,
+ *    l_linestatus
+ * order by
+ *    l_returnflag,
+ *    l_linestatus;
+ */
+
+/**
+ * @brief Calculate the discount price column
+ *
+ * @param discount The discount column
+ * @param extendedprice The extended price column
+ * @param stream The CUDA stream used for device memory operations and kernel launches.
+ * @param mr Device memory resource used to allocate the returned column's device memory.
+ */
+[[nodiscard]] std::unique_ptr<cudf::column> calc_disc_price(
+  cudf::column_view const& discount,
+  cudf::column_view const& extendedprice,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
+{
+  auto const one = cudf::numeric_scalar<double>(1);
+  auto const one_minus_discount =
+    cudf::binary_operation(one, discount, cudf::binary_operator::SUB, discount.type(), stream, mr);
+  auto const disc_price_type = cudf::data_type{cudf::type_id::FLOAT64};
+  auto disc_price            = cudf::binary_operation(extendedprice,
+                                           one_minus_discount->view(),
+                                           cudf::binary_operator::MUL,
+                                           disc_price_type,
+                                           stream,
+                                           mr);
+  return disc_price;
+}
+
+/**
+ * @brief Calculate the charge column
+ *
+ * @param tax The tax column
+ * @param disc_price The discount price column
+ * @param stream The CUDA stream used for device memory operations and kernel launches.
+ * @param mr Device memory resource used to allocate the returned column's device memory.
+ */
+[[nodiscard]] std::unique_ptr<cudf::column> calc_charge(
+  cudf::column_view const& tax,
+  cudf::column_view const& disc_price,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
+{
+  auto const one = cudf::numeric_scalar<double>(1);
+  auto const one_plus_tax =
+    cudf::binary_operation(one, tax, cudf::binary_operator::ADD, tax.type(), stream, mr);
+  auto const charge_type = cudf::data_type{cudf::type_id::FLOAT64};
+  auto charge            = cudf::binary_operation(
+    disc_price, one_plus_tax->view(), cudf::binary_operator::MUL, charge_type, stream, mr);
+  return charge;
+}
+
+int main(int argc, char const** argv)
+{
+  auto const args = parse_args(argc, argv);
+
+  // Use a memory pool
+  auto resource = create_memory_resource(args.memory_resource_type);
+  rmm::mr::set_current_device_resource(resource.get());
+
+  cudf::examples::timer timer;
+
+  // Define the column projections and filter predicate for `lineitem` table
+  std::vector<std::string> const lineitem_cols = {"l_returnflag",
+                                                  "l_linestatus",
+                                                  "l_quantity",
+                                                  "l_extendedprice",
+                                                  "l_discount",
+                                                  "l_shipdate",
+                                                  "l_orderkey",
+                                                  "l_tax"};
+  auto const shipdate_ref                      = cudf::ast::column_reference(std::distance(
+    lineitem_cols.begin(), std::find(lineitem_cols.begin(), lineitem_cols.end(), "l_shipdate")));
+  auto shipdate_upper =
+    cudf::timestamp_scalar<cudf::timestamp_D>(days_since_epoch(1998, 9, 2), true);
+  auto const shipdate_upper_literal = cudf::ast::literal(shipdate_upper);
+  auto lineitem_pred                = std::make_unique<cudf::ast::operation>(
+    cudf::ast::ast_operator::LESS_EQUAL, shipdate_ref, shipdate_upper_literal);
+
+  // Read out the `lineitem` table from parquet file
+  auto lineitem =
+    read_parquet(args.dataset_dir + "/lineitem.parquet", lineitem_cols, std::move(lineitem_pred));
+
+  // Calculate the discount price and charge columns and append to lineitem table
+  auto disc_price =
+    calc_disc_price(lineitem->column("l_discount"), lineitem->column("l_extendedprice"));
+  auto charge = calc_charge(lineitem->column("l_tax"), disc_price->view());
+  (*lineitem).append(disc_price, "disc_price").append(charge, "charge");
+
+  // Perform the group by operation
+  auto const groupedby_table = apply_groupby(
+    lineitem,
+    groupby_context_t{
+      {"l_returnflag", "l_linestatus"},
+      {
+        {"l_extendedprice",
+         {{cudf::aggregation::Kind::SUM, "sum_base_price"},
+          {cudf::aggregation::Kind::MEAN, "avg_price"}}},
+        {"l_quantity",
+         {{cudf::aggregation::Kind::SUM, "sum_qty"}, {cudf::aggregation::Kind::MEAN, "avg_qty"}}},
+        {"l_discount",
+         {
+           {cudf::aggregation::Kind::MEAN, "avg_disc"},
+         }},
+        {"disc_price",
+         {
+           {cudf::aggregation::Kind::SUM, "sum_disc_price"},
+         }},
+        {"charge",
+         {{cudf::aggregation::Kind::SUM, "sum_charge"},
+          {cudf::aggregation::Kind::COUNT_ALL, "count_order"}}},
+      }});
+
+  // Perform the order by operation
+  auto const orderedby_table = apply_orderby(groupedby_table,
+                                             {"l_returnflag", "l_linestatus"},
+                                             {cudf::order::ASCENDING, cudf::order::ASCENDING});
+
+  timer.print_elapsed_millis();
+
+  // Write query result to a parquet file
+  orderedby_table->to_parquet("q1.parquet");
+  return 0;
+}
diff --git a/cpp/examples/tpch/q5.cpp b/cpp/examples/tpch/q5.cpp
new file mode 100644
index 00000000000..e56850b94d6
--- /dev/null
+++ b/cpp/examples/tpch/q5.cpp
@@ -0,0 +1,169 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../utilities/timer.hpp"
+#include "utils.hpp"
+
+#include <cudf/ast/expressions.hpp>
+#include <cudf/column/column.hpp>
+#include <cudf/scalar/scalar.hpp>
+
+/**
+ * @file q5.cpp
+ * @brief Implement query 5 of the TPC-H benchmark.
+ *
+ * create view customer as select * from '/tables/scale-1/customer.parquet';
+ * create view orders as select * from '/tables/scale-1/orders.parquet';
+ * create view lineitem as select * from '/tables/scale-1/lineitem.parquet';
+ * create view supplier as select * from '/tables/scale-1/supplier.parquet';
+ * create view nation as select * from '/tables/scale-1/nation.parquet';
+ * create view region as select * from '/tables/scale-1/region.parquet';
+ *
+ * select
+ *    n_name,
+ *    sum(l_extendedprice * (1 - l_discount)) as revenue
+ * from
+ *    customer,
+ *    orders,
+ *    lineitem,
+ *    supplier,
+ *    nation,
+ *    region
+ * where
+ *     c_custkey = o_custkey
+ *    and l_orderkey = o_orderkey
+ *    and l_suppkey = s_suppkey
+ *    and c_nationkey = s_nationkey
+ *    and s_nationkey = n_nationkey
+ *    and n_regionkey = r_regionkey
+ *    and r_name = 'ASIA'
+ *    and o_orderdate >= date '1994-01-01'
+ *    and o_orderdate < date '1995-01-01'
+ * group by
+ *    n_name
+ * order by
+ *    revenue desc;
+ */
+
+/**
+ * @brief Calculate the revenue column
+ *
+ * @param extendedprice The extended price column
+ * @param discount The discount column
+ * @param stream The CUDA stream used for device memory operations and kernel launches.
+ * @param mr Device memory resource used to allocate the returned column's device memory.
+ */
+[[nodiscard]] std::unique_ptr<cudf::column> calc_revenue(
+  cudf::column_view const& extendedprice,
+  cudf::column_view const& discount,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
+{
+  auto const one = cudf::numeric_scalar<double>(1);
+  auto const one_minus_discount =
+    cudf::binary_operation(one, discount, cudf::binary_operator::SUB, discount.type(), stream, mr);
+  auto const revenue_type = cudf::data_type{cudf::type_id::FLOAT64};
+  auto revenue            = cudf::binary_operation(extendedprice,
+                                        one_minus_discount->view(),
+                                        cudf::binary_operator::MUL,
+                                        revenue_type,
+                                        stream,
+                                        mr);
+  return revenue;
+}
+
+int main(int argc, char const** argv)
+{
+  auto const args = parse_args(argc, argv);
+
+  // Use a memory pool
+  auto resource = create_memory_resource(args.memory_resource_type);
+  rmm::mr::set_current_device_resource(resource.get());
+
+  cudf::examples::timer timer;
+
+  // Define the column projection and filter predicate for the `orders` table
+  std::vector<std::string> const orders_cols = {"o_custkey", "o_orderkey", "o_orderdate"};
+  auto const o_orderdate_ref                 = cudf::ast::column_reference(std::distance(
+    orders_cols.begin(), std::find(orders_cols.begin(), orders_cols.end(), "o_orderdate")));
+  auto o_orderdate_lower =
+    cudf::timestamp_scalar<cudf::timestamp_D>(days_since_epoch(1994, 1, 1), true);
+  auto const o_orderdate_lower_limit = cudf::ast::literal(o_orderdate_lower);
+  auto const o_orderdate_pred_lower  = cudf::ast::operation(
+    cudf::ast::ast_operator::GREATER_EQUAL, o_orderdate_ref, o_orderdate_lower_limit);
+  auto o_orderdate_upper =
+    cudf::timestamp_scalar<cudf::timestamp_D>(days_since_epoch(1995, 1, 1), true);
+  auto const o_orderdate_upper_limit = cudf::ast::literal(o_orderdate_upper);
+  auto const o_orderdate_pred_upper =
+    cudf::ast::operation(cudf::ast::ast_operator::LESS, o_orderdate_ref, o_orderdate_upper_limit);
+  auto orders_pred = std::make_unique<cudf::ast::operation>(
+    cudf::ast::ast_operator::LOGICAL_AND, o_orderdate_pred_lower, o_orderdate_pred_upper);
+
+  // Define the column projection and filter predicate for the `region` table
+  std::vector<std::string> const region_cols = {"r_regionkey", "r_name"};
+  auto const r_name_ref                      = cudf::ast::column_reference(std::distance(
+    region_cols.begin(), std::find(region_cols.begin(), region_cols.end(), "r_name")));
+  auto r_name_value                          = cudf::string_scalar("ASIA");
+  auto const r_name_literal                  = cudf::ast::literal(r_name_value);
+  auto region_pred                           = std::make_unique<cudf::ast::operation>(
+    cudf::ast::ast_operator::EQUAL, r_name_ref, r_name_literal);
+
+  // Read out the tables from parquet files
+  // while pushing down the column projections and filter predicates
+  auto const customer =
+    read_parquet(args.dataset_dir + "/customer.parquet", {"c_custkey", "c_nationkey"});
+  auto const orders =
+    read_parquet(args.dataset_dir + "/orders.parquet", orders_cols, std::move(orders_pred));
+  auto const lineitem = read_parquet(args.dataset_dir + "/lineitem.parquet",
+                                     {"l_orderkey", "l_suppkey", "l_extendedprice", "l_discount"});
+  auto const supplier =
+    read_parquet(args.dataset_dir + "/supplier.parquet", {"s_suppkey", "s_nationkey"});
+  auto const nation =
+    read_parquet(args.dataset_dir + "/nation.parquet", {"n_nationkey", "n_regionkey", "n_name"});
+  auto const region =
+    read_parquet(args.dataset_dir + "/region.parquet", region_cols, std::move(region_pred));
+
+  // Perform the joins
+  auto const join_a = apply_inner_join(region, nation, {"r_regionkey"}, {"n_regionkey"});
+  auto const join_b = apply_inner_join(join_a, customer, {"n_nationkey"}, {"c_nationkey"});
+  auto const join_c = apply_inner_join(join_b, orders, {"c_custkey"}, {"o_custkey"});
+  auto const join_d = apply_inner_join(join_c, lineitem, {"o_orderkey"}, {"l_orderkey"});
+  auto joined_table =
+    apply_inner_join(supplier, join_d, {"s_suppkey", "s_nationkey"}, {"l_suppkey", "n_nationkey"});
+
+  // Calculate and append the `revenue` column
+  auto revenue =
+    calc_revenue(joined_table->column("l_extendedprice"), joined_table->column("l_discount"));
+  (*joined_table).append(revenue, "revenue");
+
+  // Perform the groupby operation
+  auto const groupedby_table =
+    apply_groupby(joined_table,
+                  groupby_context_t{{"n_name"},
+                                    {
+                                      {"revenue", {{cudf::aggregation::Kind::SUM, "revenue"}}},
+                                    }});
+
+  // Perform the order by operation
+  auto const orderedby_table =
+    apply_orderby(groupedby_table, {"revenue"}, {cudf::order::DESCENDING});
+
+  timer.print_elapsed_millis();
+
+  // Write query result to a parquet file
+  orderedby_table->to_parquet("q5.parquet");
+  return 0;
+}
diff --git a/cpp/examples/tpch/q6.cpp b/cpp/examples/tpch/q6.cpp
new file mode 100644
index 00000000000..f11b3d6ab3b
--- /dev/null
+++ b/cpp/examples/tpch/q6.cpp
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../utilities/timer.hpp"
+#include "utils.hpp"
+
+#include <cudf/ast/expressions.hpp>
+#include <cudf/column/column.hpp>
+#include <cudf/scalar/scalar.hpp>
+
+/**
+ * @file q6.cpp
+ * @brief Implement query 6 of the TPC-H benchmark.
+ *
+ * create view lineitem as select * from '/tables/scale-1/lineitem.parquet';
+ *
+ * select
+ *    sum(l_extendedprice * l_discount) as revenue
+ * from
+ *    lineitem
+ * where
+ *    l_shipdate >= date '1994-01-01'
+ *    and l_shipdate < date '1995-01-01'
+ *    and l_discount >= 0.05
+ *    and l_discount <= 0.07
+ *    and l_quantity < 24;
+ */
+
+/**
+ * @brief Calculate the revenue column
+ *
+ * @param extendedprice The extended price column
+ * @param discount The discount column
+ * @param stream The CUDA stream used for device memory operations and kernel launches.
+ * @param mr Device memory resource used to allocate the returned column's device memory.
+ */
+[[nodiscard]] std::unique_ptr<cudf::column> calc_revenue(
+  cudf::column_view const& extendedprice,
+  cudf::column_view const& discount,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
+{
+  auto const revenue_type = cudf::data_type{cudf::type_id::FLOAT64};
+  auto revenue            = cudf::binary_operation(
+    extendedprice, discount, cudf::binary_operator::MUL, revenue_type, stream, mr);
+  return revenue;
+}
+
+int main(int argc, char const** argv)
+{
+  auto const args = parse_args(argc, argv);
+
+  // Use a memory pool
+  auto resource = create_memory_resource(args.memory_resource_type);
+  rmm::mr::set_current_device_resource(resource.get());
+
+  cudf::examples::timer timer;
+
+  // Read out the `lineitem` table from parquet file
+  std::vector<std::string> const lineitem_cols = {
+    "l_extendedprice", "l_discount", "l_shipdate", "l_quantity"};
+  auto const shipdate_ref = cudf::ast::column_reference(std::distance(
+    lineitem_cols.begin(), std::find(lineitem_cols.begin(), lineitem_cols.end(), "l_shipdate")));
+  auto shipdate_lower =
+    cudf::timestamp_scalar<cudf::timestamp_D>(days_since_epoch(1994, 1, 1), true);
+  auto const shipdate_lower_literal = cudf::ast::literal(shipdate_lower);
+  auto shipdate_upper =
+    cudf::timestamp_scalar<cudf::timestamp_D>(days_since_epoch(1995, 1, 1), true);
+  auto const shipdate_upper_literal = cudf::ast::literal(shipdate_upper);
+  auto const shipdate_pred_a        = cudf::ast::operation(
+    cudf::ast::ast_operator::GREATER_EQUAL, shipdate_ref, shipdate_lower_literal);
+  auto const shipdate_pred_b =
+    cudf::ast::operation(cudf::ast::ast_operator::LESS, shipdate_ref, shipdate_upper_literal);
+  auto lineitem_pred = std::make_unique<cudf::ast::operation>(
+    cudf::ast::ast_operator::LOGICAL_AND, shipdate_pred_a, shipdate_pred_b);
+  auto lineitem =
+    read_parquet(args.dataset_dir + "/lineitem.parquet", lineitem_cols, std::move(lineitem_pred));
+
+  // Cast the discount and quantity columns to float32 and append to lineitem table
+  auto discout_float =
+    cudf::cast(lineitem->column("l_discount"), cudf::data_type{cudf::type_id::FLOAT32});
+  auto quantity_float =
+    cudf::cast(lineitem->column("l_quantity"), cudf::data_type{cudf::type_id::FLOAT32});
+
+  (*lineitem).append(discout_float, "l_discount_float").append(quantity_float, "l_quantity_float");
+
+  // Apply the filters
+  auto const discount_ref = cudf::ast::column_reference(lineitem->col_id("l_discount_float"));
+  auto const quantity_ref = cudf::ast::column_reference(lineitem->col_id("l_quantity_float"));
+
+  auto discount_lower               = cudf::numeric_scalar<float_t>(0.05);
+  auto const discount_lower_literal = cudf::ast::literal(discount_lower);
+  auto discount_upper               = cudf::numeric_scalar<float_t>(0.07);
+  auto const discount_upper_literal = cudf::ast::literal(discount_upper);
+  auto quantity_upper               = cudf::numeric_scalar<float_t>(24);
+  auto const quantity_upper_literal = cudf::ast::literal(quantity_upper);
+
+  auto const discount_pred_a = cudf::ast::operation(
+    cudf::ast::ast_operator::GREATER_EQUAL, discount_ref, discount_lower_literal);
+
+  auto const discount_pred_b =
+    cudf::ast::operation(cudf::ast::ast_operator::LESS_EQUAL, discount_ref, discount_upper_literal);
+  auto const discount_pred =
+    cudf::ast::operation(cudf::ast::ast_operator::LOGICAL_AND, discount_pred_a, discount_pred_b);
+  auto const quantity_pred =
+    cudf::ast::operation(cudf::ast::ast_operator::LESS, quantity_ref, quantity_upper_literal);
+  auto const discount_quantity_pred =
+    cudf::ast::operation(cudf::ast::ast_operator::LOGICAL_AND, discount_pred, quantity_pred);
+  auto const filtered_table = apply_filter(lineitem, discount_quantity_pred);
+
+  // Calculate the `revenue` column
+  auto revenue =
+    calc_revenue(filtered_table->column("l_extendedprice"), filtered_table->column("l_discount"));
+
+  // Sum the `revenue` column
+  auto const revenue_view = revenue->view();
+  auto const result_table = apply_reduction(revenue_view, cudf::aggregation::Kind::SUM, "revenue");
+
+  timer.print_elapsed_millis();
+
+  // Write query result to a parquet file
+  result_table->to_parquet("q6.parquet");
+  return 0;
+}
diff --git a/cpp/examples/tpch/q9.cpp b/cpp/examples/tpch/q9.cpp
new file mode 100644
index 00000000000..d3c218253f9
--- /dev/null
+++ b/cpp/examples/tpch/q9.cpp
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../utilities/timer.hpp"
+#include "utils.hpp"
+
+#include <cudf/column/column.hpp>
+#include <cudf/datetime.hpp>
+#include <cudf/scalar/scalar.hpp>
+#include <cudf/strings/contains.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+
+/**
+ * @file q9.cpp
+ * @brief Implement query 9 of the TPC-H benchmark.
+ *
+ * create view part as select * from '/tables/scale-1/part.parquet';
+ * create view supplier as select * from '/tables/scale-1/supplier.parquet';
+ * create view lineitem as select * from '/tables/scale-1/lineitem.parquet';
+ * create view partsupp as select * from '/tables/scale-1/partsupp.parquet';
+ * create view orders as select * from '/tables/scale-1/orders.parquet';
+ * create view nation as select * from '/tables/scale-1/nation.parquet';
+ *
+ * select
+ *    nation,
+ *    o_year,
+ *    sum(amount) as sum_profit
+ * from
+ *     (
+ *        select
+ *            n_name as nation,
+ *            extract(year from o_orderdate) as o_year,
+ *            l_extendedprice * (1 - l_discount) - ps_supplycost * l_quantity as amount
+ *        from
+ *            part,
+ *            supplier,
+ *            lineitem,
+ *            partsupp,
+ *            orders,
+ *            nation
+ *        where
+ *           s_suppkey = l_suppkey
+ *           and ps_suppkey = l_suppkey
+ *           and ps_partkey = l_partkey
+ *           and p_partkey = l_partkey
+ *           and o_orderkey = l_orderkey
+ *           and s_nationkey = n_nationkey
+ *           and p_name like '%green%'
+ *     ) as profit
+ * group by
+ *     nation,
+ *     o_year
+ * order by
+ *     nation,
+ *     o_year desc;
+ */
+
+/**
+ * @brief Calculate the amount column
+ *
+ * @param discount The discount column
+ * @param extendedprice The extended price column
+ * @param supplycost The supply cost column
+ * @param quantity The quantity column
+ * @param stream The CUDA stream used for device memory operations and kernel launches.
+ * @param mr Device memory resource used to allocate the returned column's device memory.
+ */
+[[nodiscard]] std::unique_ptr<cudf::column> calc_amount(
+  cudf::column_view const& discount,
+  cudf::column_view const& extendedprice,
+  cudf::column_view const& supplycost,
+  cudf::column_view const& quantity,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
+{
+  auto const one = cudf::numeric_scalar<double>(1);
+  auto const one_minus_discount =
+    cudf::binary_operation(one, discount, cudf::binary_operator::SUB, discount.type());
+  auto const extendedprice_discounted_type = cudf::data_type{cudf::type_id::FLOAT64};
+  auto const extendedprice_discounted      = cudf::binary_operation(extendedprice,
+                                                               one_minus_discount->view(),
+                                                               cudf::binary_operator::MUL,
+                                                               extendedprice_discounted_type,
+                                                               stream,
+                                                               mr);
+  auto const supplycost_quantity_type      = cudf::data_type{cudf::type_id::FLOAT64};
+  auto const supplycost_quantity           = cudf::binary_operation(
+    supplycost, quantity, cudf::binary_operator::MUL, supplycost_quantity_type);
+  auto amount = cudf::binary_operation(extendedprice_discounted->view(),
+                                       supplycost_quantity->view(),
+                                       cudf::binary_operator::SUB,
+                                       extendedprice_discounted->type(),
+                                       stream,
+                                       mr);
+  return amount;
+}
+
+int main(int argc, char const** argv)
+{
+  auto const args = parse_args(argc, argv);
+
+  // Use a memory pool
+  auto resource = create_memory_resource(args.memory_resource_type);
+  rmm::mr::set_current_device_resource(resource.get());
+
+  cudf::examples::timer timer;
+
+  // Read out the table from parquet files
+  auto const lineitem = read_parquet(
+    args.dataset_dir + "/lineitem.parquet",
+    {"l_suppkey", "l_partkey", "l_orderkey", "l_extendedprice", "l_discount", "l_quantity"});
+  auto const nation = read_parquet(args.dataset_dir + "/nation.parquet", {"n_nationkey", "n_name"});
+  auto const orders =
+    read_parquet(args.dataset_dir + "/orders.parquet", {"o_orderkey", "o_orderdate"});
+  auto const part     = read_parquet(args.dataset_dir + "/part.parquet", {"p_partkey", "p_name"});
+  auto const partsupp = read_parquet(args.dataset_dir + "/partsupp.parquet",
+                                     {"ps_suppkey", "ps_partkey", "ps_supplycost"});
+  auto const supplier =
+    read_parquet(args.dataset_dir + "/supplier.parquet", {"s_suppkey", "s_nationkey"});
+
+  // Generating the `profit` table
+  // Filter the part table using `p_name like '%green%'`
+  auto const p_name = part->table().column(1);
+  auto const mask =
+    cudf::strings::like(cudf::strings_column_view(p_name), cudf::string_scalar("%green%"));
+  auto const part_filtered = apply_mask(part, mask);
+
+  // Perform the joins
+  auto const join_a = apply_inner_join(supplier, nation, {"s_nationkey"}, {"n_nationkey"});
+  auto const join_b = apply_inner_join(partsupp, join_a, {"ps_suppkey"}, {"s_suppkey"});
+  auto const join_c = apply_inner_join(lineitem, part_filtered, {"l_partkey"}, {"p_partkey"});
+  auto const join_d = apply_inner_join(orders, join_c, {"o_orderkey"}, {"l_orderkey"});
+  auto const joined_table =
+    apply_inner_join(join_d, join_b, {"l_suppkey", "l_partkey"}, {"s_suppkey", "ps_partkey"});
+
+  // Calculate the `nation`, `o_year`, and `amount` columns
+  auto n_name = std::make_unique<cudf::column>(joined_table->column("n_name"));
+  auto o_year = cudf::datetime::extract_year(joined_table->column("o_orderdate"));
+  auto amount = calc_amount(joined_table->column("l_discount"),
+                            joined_table->column("l_extendedprice"),
+                            joined_table->column("ps_supplycost"),
+                            joined_table->column("l_quantity"));
+
+  // Put together the `profit` table
+  std::vector<std::unique_ptr<cudf::column>> profit_columns;
+  profit_columns.push_back(std::move(n_name));
+  profit_columns.push_back(std::move(o_year));
+  profit_columns.push_back(std::move(amount));
+
+  auto profit_table = std::make_unique<cudf::table>(std::move(profit_columns));
+  auto const profit = std::make_unique<table_with_names>(
+    std::move(profit_table), std::vector<std::string>{"nation", "o_year", "amount"});
+
+  // Perform the groupby operation
+  auto const groupedby_table = apply_groupby(
+    profit,
+    groupby_context_t{{"nation", "o_year"},
+                      {{"amount", {{cudf::groupby_aggregation::SUM, "sum_profit"}}}}});
+
+  // Perform the orderby operation
+  auto const orderedby_table = apply_orderby(
+    groupedby_table, {"nation", "o_year"}, {cudf::order::ASCENDING, cudf::order::DESCENDING});
+
+  timer.print_elapsed_millis();
+
+  // Write query result to a parquet file
+  orderedby_table->to_parquet("q9.parquet");
+  return 0;
+}
diff --git a/cpp/examples/tpch/utils.hpp b/cpp/examples/tpch/utils.hpp
new file mode 100644
index 00000000000..e586da2c802
--- /dev/null
+++ b/cpp/examples/tpch/utils.hpp
@@ -0,0 +1,457 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/binaryop.hpp>
+#include <cudf/column/column_factories.hpp>
+#include <cudf/copying.hpp>
+#include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/groupby.hpp>
+#include <cudf/io/parquet.hpp>
+#include <cudf/join.hpp>
+#include <cudf/reduction.hpp>
+#include <cudf/sorting.hpp>
+#include <cudf/stream_compaction.hpp>
+#include <cudf/table/table.hpp>
+#include <cudf/transform.hpp>
+#include <cudf/unary.hpp>
+
+#include <rmm/cuda_device.hpp>
+#include <rmm/mr/device/cuda_memory_resource.hpp>
+#include <rmm/mr/device/device_memory_resource.hpp>
+#include <rmm/mr/device/managed_memory_resource.hpp>
+#include <rmm/mr/device/owning_wrapper.hpp>
+#include <rmm/mr/device/pool_memory_resource.hpp>
+
+#include <ctime>
+
+// RMM memory resource creation utilities
+inline auto make_cuda() { return std::make_shared<rmm::mr::cuda_memory_resource>(); }
+inline auto make_pool()
+{
+  return rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>(
+    make_cuda(), rmm::percent_of_free_device_memory(50));
+}
+inline auto make_managed() { return std::make_shared<rmm::mr::managed_memory_resource>(); }
+inline auto make_managed_pool()
+{
+  return rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>(
+    make_managed(), rmm::percent_of_free_device_memory(50));
+}
+inline std::shared_ptr<rmm::mr::device_memory_resource> create_memory_resource(
+  std::string const& mode)
+{
+  if (mode == "cuda") return make_cuda();
+  if (mode == "pool") return make_pool();
+  if (mode == "managed") return make_managed();
+  if (mode == "managed_pool") return make_managed_pool();
+  CUDF_FAIL("Unknown rmm_mode parameter: " + mode +
+            "\nExpecting: cuda, pool, managed, or managed_pool");
+}
+
+/**
+ * @brief A class to represent a table with column names attached
+ */
+class table_with_names {
+ public:
+  table_with_names(std::unique_ptr<cudf::table> tbl, std::vector<std::string> col_names)
+    : tbl(std::move(tbl)), col_names(col_names)
+  {
+  }
+  /**
+   * @brief Return the table view
+   */
+  [[nodiscard]] cudf::table_view table() const { return tbl->view(); }
+  /**
+   * @brief Return the column view for a given column name
+   *
+   * @param col_name The name of the column
+   */
+  [[nodiscard]] cudf::column_view column(std::string const& col_name) const
+  {
+    return tbl->view().column(col_id(col_name));
+  }
+  /**
+   * @param Return the column names of the table
+   */
+  [[nodiscard]] std::vector<std::string> column_names() const { return col_names; }
+  /**
+   * @brief Translate a column name to a column index
+   *
+   * @param col_name The name of the column
+   */
+  [[nodiscard]] cudf::size_type col_id(std::string const& col_name) const
+  {
+    CUDF_FUNC_RANGE();
+    auto it = std::find(col_names.begin(), col_names.end(), col_name);
+    if (it == col_names.end()) { throw std::runtime_error("Column not found"); }
+    return std::distance(col_names.begin(), it);
+  }
+  /**
+   * @brief Append a column to the table
+   *
+   * @param col The column to append
+   * @param col_name The name of the appended column
+   */
+  table_with_names& append(std::unique_ptr<cudf::column>& col, std::string const& col_name)
+  {
+    CUDF_FUNC_RANGE();
+    auto cols = tbl->release();
+    cols.push_back(std::move(col));
+    tbl = std::make_unique<cudf::table>(std::move(cols));
+    col_names.push_back(col_name);
+    return (*this);
+  }
+  /**
+   * @brief Select a subset of columns from the table
+   *
+   * @param col_names The names of the columns to select
+   */
+  [[nodiscard]] cudf::table_view select(std::vector<std::string> const& col_names) const
+  {
+    CUDF_FUNC_RANGE();
+    std::vector<cudf::size_type> col_indices;
+    for (auto const& col_name : col_names) {
+      col_indices.push_back(col_id(col_name));
+    }
+    return tbl->select(col_indices);
+  }
+  /**
+   * @brief Write the table to a parquet file
+   *
+   * @param filepath The path to the parquet file
+   */
+  void to_parquet(std::string const& filepath) const
+  {
+    CUDF_FUNC_RANGE();
+    auto const sink_info = cudf::io::sink_info(filepath);
+    cudf::io::table_metadata metadata;
+    metadata.schema_info =
+      std::vector<cudf::io::column_name_info>(col_names.begin(), col_names.end());
+    auto const table_input_metadata = cudf::io::table_input_metadata{metadata};
+    auto builder = cudf::io::parquet_writer_options::builder(sink_info, tbl->view());
+    builder.metadata(table_input_metadata);
+    auto const options = builder.build();
+    cudf::io::write_parquet(options);
+  }
+
+ private:
+  std::unique_ptr<cudf::table> tbl;
+  std::vector<std::string> col_names;
+};
+
+/**
+ * @brief Concatenate two vectors
+ *
+ * @param lhs The left vector
+ * @param rhs The right vector
+ */
+template <typename T>
+std::vector<T> concat(std::vector<T> const& lhs, std::vector<T> const& rhs)
+{
+  std::vector<T> result;
+  result.reserve(lhs.size() + rhs.size());
+  std::copy(lhs.begin(), lhs.end(), std::back_inserter(result));
+  std::copy(rhs.begin(), rhs.end(), std::back_inserter(result));
+  return result;
+}
+
+/**
+ * @brief Inner join two tables and gather the result
+ *
+ * @param left_input The left input table
+ * @param right_input The right input table
+ * @param left_on The columns to join on in the left table
+ * @param right_on The columns to join on in the right table
+ * @param compare_nulls The null equality policy
+ */
+[[nodiscard]] std::unique_ptr<cudf::table> join_and_gather(
+  cudf::table_view const& left_input,
+  cudf::table_view const& right_input,
+  std::vector<cudf::size_type> const& left_on,
+  std::vector<cudf::size_type> const& right_on,
+  cudf::null_equality compare_nulls)
+{
+  CUDF_FUNC_RANGE();
+  constexpr auto oob_policy                          = cudf::out_of_bounds_policy::DONT_CHECK;
+  auto const left_selected                           = left_input.select(left_on);
+  auto const right_selected                          = right_input.select(right_on);
+  auto const [left_join_indices, right_join_indices] = cudf::inner_join(
+    left_selected, right_selected, compare_nulls, rmm::mr::get_current_device_resource());
+
+  auto const left_indices_span  = cudf::device_span<cudf::size_type const>{*left_join_indices};
+  auto const right_indices_span = cudf::device_span<cudf::size_type const>{*right_join_indices};
+
+  auto const left_indices_col  = cudf::column_view{left_indices_span};
+  auto const right_indices_col = cudf::column_view{right_indices_span};
+
+  auto const left_result  = cudf::gather(left_input, left_indices_col, oob_policy);
+  auto const right_result = cudf::gather(right_input, right_indices_col, oob_policy);
+
+  auto joined_cols = left_result->release();
+  auto right_cols  = right_result->release();
+  joined_cols.insert(joined_cols.end(),
+                     std::make_move_iterator(right_cols.begin()),
+                     std::make_move_iterator(right_cols.end()));
+  return std::make_unique<cudf::table>(std::move(joined_cols));
+}
+
+/**
+ * @brief Apply an inner join operation to two tables
+ *
+ * @param left_input The left input table
+ * @param right_input The right input table
+ * @param left_on The columns to join on in the left table
+ * @param right_on The columns to join on in the right table
+ * @param compare_nulls The null equality policy
+ */
+[[nodiscard]] std::unique_ptr<table_with_names> apply_inner_join(
+  std::unique_ptr<table_with_names> const& left_input,
+  std::unique_ptr<table_with_names> const& right_input,
+  std::vector<std::string> const& left_on,
+  std::vector<std::string> const& right_on,
+  cudf::null_equality compare_nulls = cudf::null_equality::EQUAL)
+{
+  CUDF_FUNC_RANGE();
+  std::vector<cudf::size_type> left_on_indices;
+  std::vector<cudf::size_type> right_on_indices;
+  std::transform(
+    left_on.begin(), left_on.end(), std::back_inserter(left_on_indices), [&](auto const& col_name) {
+      return left_input->col_id(col_name);
+    });
+  std::transform(right_on.begin(),
+                 right_on.end(),
+                 std::back_inserter(right_on_indices),
+                 [&](auto const& col_name) { return right_input->col_id(col_name); });
+  auto table = join_and_gather(
+    left_input->table(), right_input->table(), left_on_indices, right_on_indices, compare_nulls);
+  return std::make_unique<table_with_names>(
+    std::move(table), concat(left_input->column_names(), right_input->column_names()));
+}
+
+/**
+ * @brief Apply a filter predicated to a table
+ *
+ * @param table The input table
+ * @param predicate The filter predicate
+ */
+[[nodiscard]] std::unique_ptr<table_with_names> apply_filter(
+  std::unique_ptr<table_with_names> const& table, cudf::ast::operation const& predicate)
+{
+  CUDF_FUNC_RANGE();
+  auto const boolean_mask = cudf::compute_column(table->table(), predicate);
+  auto result_table       = cudf::apply_boolean_mask(table->table(), boolean_mask->view());
+  return std::make_unique<table_with_names>(std::move(result_table), table->column_names());
+}
+
+/**
+ * @brief Apply a boolean mask to a table
+ *
+ * @param table The input table
+ * @param mask The boolean mask
+ */
+[[nodiscard]] std::unique_ptr<table_with_names> apply_mask(
+  std::unique_ptr<table_with_names> const& table, std::unique_ptr<cudf::column> const& mask)
+{
+  CUDF_FUNC_RANGE();
+  auto result_table = cudf::apply_boolean_mask(table->table(), mask->view());
+  return std::make_unique<table_with_names>(std::move(result_table), table->column_names());
+}
+
+struct groupby_context_t {
+  std::vector<std::string> keys;
+  std::unordered_map<std::string, std::vector<std::pair<cudf::aggregation::Kind, std::string>>>
+    values;
+};
+
+/**
+ * @brief Apply a groupby operation to a table
+ *
+ * @param table The input table
+ * @param ctx The groupby context
+ */
+[[nodiscard]] std::unique_ptr<table_with_names> apply_groupby(
+  std::unique_ptr<table_with_names> const& table, groupby_context_t const& ctx)
+{
+  CUDF_FUNC_RANGE();
+  auto const keys = table->select(ctx.keys);
+  cudf::groupby::groupby groupby_obj(keys);
+  std::vector<std::string> result_column_names;
+  result_column_names.insert(result_column_names.end(), ctx.keys.begin(), ctx.keys.end());
+  std::vector<cudf::groupby::aggregation_request> requests;
+  for (auto& [value_col, aggregations] : ctx.values) {
+    requests.emplace_back(cudf::groupby::aggregation_request());
+    for (auto& agg : aggregations) {
+      if (agg.first == cudf::aggregation::Kind::SUM) {
+        requests.back().aggregations.push_back(
+          cudf::make_sum_aggregation<cudf::groupby_aggregation>());
+      } else if (agg.first == cudf::aggregation::Kind::MEAN) {
+        requests.back().aggregations.push_back(
+          cudf::make_mean_aggregation<cudf::groupby_aggregation>());
+      } else if (agg.first == cudf::aggregation::Kind::COUNT_ALL) {
+        requests.back().aggregations.push_back(
+          cudf::make_count_aggregation<cudf::groupby_aggregation>());
+      } else {
+        throw std::runtime_error("Unsupported aggregation");
+      }
+      result_column_names.push_back(agg.second);
+    }
+    requests.back().values = table->column(value_col);
+  }
+  auto agg_results = groupby_obj.aggregate(requests);
+  std::vector<std::unique_ptr<cudf::column>> result_columns;
+  for (size_t i = 0; i < agg_results.first->num_columns(); i++) {
+    auto col = std::make_unique<cudf::column>(agg_results.first->get_column(i));
+    result_columns.push_back(std::move(col));
+  }
+  for (size_t i = 0; i < agg_results.second.size(); i++) {
+    for (size_t j = 0; j < agg_results.second[i].results.size(); j++) {
+      result_columns.push_back(std::move(agg_results.second[i].results[j]));
+    }
+  }
+  auto result_table = std::make_unique<cudf::table>(std::move(result_columns));
+  return std::make_unique<table_with_names>(std::move(result_table), result_column_names);
+}
+
+/**
+ * @brief Apply an order by operation to a table
+ *
+ * @param table The input table
+ * @param sort_keys The sort keys
+ * @param sort_key_orders The sort key orders
+ */
+[[nodiscard]] std::unique_ptr<table_with_names> apply_orderby(
+  std::unique_ptr<table_with_names> const& table,
+  std::vector<std::string> const& sort_keys,
+  std::vector<cudf::order> const& sort_key_orders)
+{
+  CUDF_FUNC_RANGE();
+  std::vector<cudf::column_view> column_views;
+  for (auto& key : sort_keys) {
+    column_views.push_back(table->column(key));
+  }
+  auto result_table =
+    cudf::sort_by_key(table->table(), cudf::table_view{column_views}, sort_key_orders);
+  return std::make_unique<table_with_names>(std::move(result_table), table->column_names());
+}
+
+/**
+ * @brief Apply a reduction operation to a column
+ *
+ * @param column The input column
+ * @param agg_kind The aggregation kind
+ * @param col_name The name of the output column
+ */
+[[nodiscard]] std::unique_ptr<table_with_names> apply_reduction(
+  cudf::column_view const& column,
+  cudf::aggregation::Kind const& agg_kind,
+  std::string const& col_name)
+{
+  CUDF_FUNC_RANGE();
+  auto const agg            = cudf::make_sum_aggregation<cudf::reduce_aggregation>();
+  auto const result         = cudf::reduce(column, *agg, column.type());
+  cudf::size_type const len = 1;
+  auto col                  = cudf::make_column_from_scalar(*result, len);
+  std::vector<std::unique_ptr<cudf::column>> columns;
+  columns.push_back(std::move(col));
+  auto result_table                  = std::make_unique<cudf::table>(std::move(columns));
+  std::vector<std::string> col_names = {col_name};
+  return std::make_unique<table_with_names>(std::move(result_table), col_names);
+}
+
+/**
+ * @brief Read a parquet file into a table
+ *
+ * @param filename The path to the parquet file
+ * @param columns The columns to read
+ * @param predicate The filter predicate to pushdown
+ */
+[[nodiscard]] std::unique_ptr<table_with_names> read_parquet(
+  std::string const& filename,
+  std::vector<std::string> const& columns                = {},
+  std::unique_ptr<cudf::ast::operation> const& predicate = nullptr)
+{
+  CUDF_FUNC_RANGE();
+  auto const source = cudf::io::source_info(filename);
+  auto builder      = cudf::io::parquet_reader_options_builder(source);
+  if (!columns.empty()) { builder.columns(columns); }
+  if (predicate) { builder.filter(*predicate); }
+  auto const options       = builder.build();
+  auto table_with_metadata = cudf::io::read_parquet(options);
+  std::vector<std::string> column_names;
+  for (auto const& col_info : table_with_metadata.metadata.schema_info) {
+    column_names.push_back(col_info.name);
+  }
+  return std::make_unique<table_with_names>(std::move(table_with_metadata.tbl), column_names);
+}
+
+/**
+ * @brief Generate the `std::tm` structure from year, month, and day
+ *
+ * @param year The year
+ * @param month The month
+ * @param day The day
+ */
+std::tm make_tm(int year, int month, int day)
+{
+  std::tm tm{};
+  tm.tm_year = year - 1900;
+  tm.tm_mon  = month - 1;
+  tm.tm_mday = day;
+  return tm;
+}
+
+/**
+ * @brief Calculate the number of days since the UNIX epoch
+ *
+ * @param year The year
+ * @param month The month
+ * @param day The day
+ */
+int32_t days_since_epoch(int year, int month, int day)
+{
+  std::tm tm             = make_tm(year, month, day);
+  std::tm epoch          = make_tm(1970, 1, 1);
+  std::time_t time       = std::mktime(&tm);
+  std::time_t epoch_time = std::mktime(&epoch);
+  double diff            = std::difftime(time, epoch_time) / (60 * 60 * 24);
+  return static_cast<int32_t>(diff);
+}
+
+struct tpch_example_args {
+  std::string dataset_dir;
+  std::string memory_resource_type;
+};
+
+/**
+ * @brief Parse command line arguments into a struct
+ *
+ * @param argc The number of command line arguments
+ * @param argv The command line arguments
+ */
+tpch_example_args parse_args(int argc, char const** argv)
+{
+  if (argc < 3) {
+    std::string usage_message = "Usage: " + std::string(argv[0]) +
+                                " <dataset_dir> <memory_resource_type>\n The query result will be "
+                                "saved to a parquet file named q{query_no}.parquet in the current "
+                                "working directory ";
+    throw std::runtime_error(usage_message);
+  }
+  tpch_example_args args;
+  args.dataset_dir          = argv[1];
+  args.memory_resource_type = argv[2];
+  return args;
+}
diff --git a/cpp/examples/utilities/timer.hpp b/cpp/examples/utilities/timer.hpp
new file mode 100644
index 00000000000..65fa92e74cf
--- /dev/null
+++ b/cpp/examples/utilities/timer.hpp
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <chrono>
+#include <iostream>
+
+namespace cudf {
+namespace examples {
+/**
+ * @brief Light-weight timer for measuring elapsed time.
+ *
+ * A timer object constructed from std::chrono, instrumenting at microseconds
+ * precision. Can display elapsed durations at milli and micro second
+ * scales. The timer starts at object construction.
+ */
+class timer {
+ public:
+  using micros = std::chrono::microseconds;
+  using millis = std::chrono::milliseconds;
+
+  timer() { reset(); }
+  void reset() { start_time = std::chrono::high_resolution_clock::now(); }
+  auto elapsed() const { return (std::chrono::high_resolution_clock::now() - start_time); }
+  void print_elapsed_micros() const
+  {
+    std::cout << "Elapsed Time: " << std::chrono::duration_cast<micros>(elapsed()).count()
+              << "us\n\n";
+  }
+  void print_elapsed_millis() const
+  {
+    std::cout << "Elapsed Time: " << std::chrono::duration_cast<millis>(elapsed()).count()
+              << "ms\n\n";
+  }
+
+ private:
+  using time_point_t = std::chrono::time_point<std::chrono::high_resolution_clock>;
+  time_point_t start_time;
+};
+
+}  // namespace examples
+};  // namespace cudf

From c4471c4ee81ed967f1818bc03c5f7829b15cfe56 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 18 Jul 2024 10:44:04 -0400
Subject: [PATCH 530/842] Fix split_record for all empty strings column
 (#16291)

Fixes `cudf::strings::split_record` handling of an all empty strings column. This caused a kernel launch with no threads eventually reporting a CUDA error. A new gtest was added to check this condition and includes tests for `rsplit_record` as well.

Closes #16284

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Yunsong Wang (https://github.com/PointKernel)

URL: https://github.com/rapidsai/cudf/pull/16291
---
 cpp/src/strings/split/split.cuh   |  6 ++++++
 cpp/tests/strings/split_tests.cpp | 20 ++++++++++++++++++++
 2 files changed, 26 insertions(+)

diff --git a/cpp/src/strings/split/split.cuh b/cpp/src/strings/split/split.cuh
index 23614ac0733..4d7096c02ca 100644
--- a/cpp/src/strings/split/split.cuh
+++ b/cpp/src/strings/split/split.cuh
@@ -357,6 +357,12 @@ std::pair<std::unique_ptr<column>, rmm::device_uvector<string_index_pair>> split
   auto const chars_bytes =
     get_offset_value(input.offsets(), input.offset() + strings_count, stream) -
     get_offset_value(input.offsets(), input.offset(), stream);
+  if (chars_bytes == 0) {
+    auto offsets = cudf::make_column_from_scalar(
+      numeric_scalar<int32_t>(0, true, stream), strings_count + 1, stream, mr);
+    auto tokens = rmm::device_uvector<string_index_pair>(0, stream);
+    return std::pair{std::move(offsets), std::move(tokens)};
+  }
   auto const d_offsets =
     cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset());
 
diff --git a/cpp/tests/strings/split_tests.cpp b/cpp/tests/strings/split_tests.cpp
index d53c64ed539..4c020cb4c29 100644
--- a/cpp/tests/strings/split_tests.cpp
+++ b/cpp/tests/strings/split_tests.cpp
@@ -307,6 +307,26 @@ TEST_F(StringsSplitTest, SplitRecordWhitespaceWithMaxSplit)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected);
 }
 
+TEST_F(StringsSplitTest, SplitRecordAllEmpty)
+{
+  auto input     = cudf::test::strings_column_wrapper({"", "", "", ""});
+  auto sv        = cudf::strings_column_view(input);
+  auto delimiter = cudf::string_scalar("s");
+  auto empty     = cudf::string_scalar("");
+
+  using LCW = cudf::test::lists_column_wrapper<cudf::string_view>;
+  LCW expected({LCW{}, LCW{}, LCW{}, LCW{}});
+  auto result = cudf::strings::split_record(sv, delimiter);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected);
+  result = cudf::strings::split_record(sv, empty);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected);
+
+  result = cudf::strings::rsplit_record(sv, delimiter);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected);
+  result = cudf::strings::rsplit_record(sv, empty);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected);
+}
+
 TEST_F(StringsSplitTest, MultiByteDelimiters)
 {
   // Overlapping delimiters

From faddc8c3d37e5cf8ec69341118218c245e087c26 Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Thu, 18 Jul 2024 08:03:55 -0700
Subject: [PATCH 531/842] Migrate CSV reader to pylibcudf (#16011)

xref #15162

Authors:
  - Thomas Li (https://github.com/lithomas1)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16011
---
 .../user_guide/api_docs/pylibcudf/io/csv.rst  |   6 +
 .../api_docs/pylibcudf/io/index.rst           |   1 +
 python/cudf/cudf/_lib/csv.pyx                 | 436 ++++++------------
 .../cudf/_lib/pylibcudf/io/CMakeLists.txt     |   6 +-
 .../cudf/cudf/_lib/pylibcudf/io/__init__.pxd  |   1 +
 .../cudf/cudf/_lib/pylibcudf/io/__init__.py   |   2 +-
 python/cudf/cudf/_lib/pylibcudf/io/csv.pyx    | 264 +++++++++++
 python/cudf/cudf/_lib/pylibcudf/io/types.pxd  |   3 +
 python/cudf/cudf/_lib/pylibcudf/io/types.pyx  |   2 +-
 python/cudf/cudf/_lib/types.pyx               |   3 +
 .../cudf/cudf/pylibcudf_tests/common/utils.py |  43 +-
 python/cudf/cudf/pylibcudf_tests/conftest.py  |  14 +
 .../cudf/cudf/pylibcudf_tests/io/test_csv.py  | 280 +++++++++++
 13 files changed, 751 insertions(+), 310 deletions(-)
 create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/io/csv.rst
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/io/csv.pyx
 create mode 100644 python/cudf/cudf/pylibcudf_tests/io/test_csv.py

diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/csv.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/csv.rst
new file mode 100644
index 00000000000..5a2276f8b2d
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/csv.rst
@@ -0,0 +1,6 @@
+===
+CSV
+===
+
+.. automodule:: cudf._lib.pylibcudf.io.csv
+   :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst
index bde6d8094ce..697bce739de 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst
@@ -16,4 +16,5 @@ I/O Functions
     :maxdepth: 1
 
     avro
+    csv
     json
diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx
index 9fecff5f5f6..099b61d62ae 100644
--- a/python/cudf/cudf/_lib/csv.pyx
+++ b/python/cudf/cudf/_lib/csv.pyx
@@ -1,7 +1,6 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp cimport bool
-from libcpp.map cimport map
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 from libcpp.utility cimport move
@@ -9,8 +8,12 @@ from libcpp.vector cimport vector
 
 cimport cudf._lib.pylibcudf.libcudf.types as libcudf_types
 from cudf._lib.pylibcudf.io.datasource cimport Datasource, NativeFileDatasource
-from cudf._lib.pylibcudf.libcudf.types cimport data_type
-from cudf._lib.types cimport dtype_to_data_type
+from cudf._lib.types cimport dtype_to_pylibcudf_type
+
+import errno
+import os
+from collections import abc
+from io import BytesIO, StringIO
 
 import numpy as np
 import pandas as pd
@@ -18,65 +21,24 @@ import pandas as pd
 import cudf
 from cudf.core.buffer import acquire_spill_lock
 
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
-
-import errno
-import os
-from collections import abc
-from enum import IntEnum
-from io import BytesIO, StringIO
-
-from libc.stdint cimport int32_t
 from libcpp cimport bool
 
-from cudf._lib.io.utils cimport make_sink_info, make_source_info
+from cudf._lib.io.utils cimport make_sink_info
 from cudf._lib.pylibcudf.libcudf.io.csv cimport (
-    csv_reader_options,
     csv_writer_options,
-    read_csv as cpp_read_csv,
     write_csv as cpp_write_csv,
 )
 from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink
-from cudf._lib.pylibcudf.libcudf.io.types cimport (
-    compression_type,
-    quote_style,
-    sink_info,
-    source_info,
-    table_with_metadata,
-)
+from cudf._lib.pylibcudf.libcudf.io.types cimport compression_type, sink_info
 from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
-from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table
+from cudf._lib.utils cimport data_from_pylibcudf_io, table_view_from_table
 
 from pyarrow.lib import NativeFile
 
+import cudf._lib.pylibcudf as plc
 from cudf.api.types import is_hashable
 
-ctypedef int32_t underlying_type_t_compression
-
-
-class Compression(IntEnum):
-    INFER = (
-        <underlying_type_t_compression> compression_type.AUTO
-    )
-    SNAPPY = (
-        <underlying_type_t_compression> compression_type.SNAPPY
-    )
-    GZIP = (
-        <underlying_type_t_compression> compression_type.GZIP
-    )
-    BZ2 = (
-        <underlying_type_t_compression> compression_type.BZIP2
-    )
-    BROTLI = (
-        <underlying_type_t_compression> compression_type.BROTLI
-    )
-    ZIP = (
-        <underlying_type_t_compression> compression_type.ZIP
-    )
-    XZ = (
-        <underlying_type_t_compression> compression_type.XZ
-    )
-
+from cudf._lib.pylibcudf.types cimport DataType
 
 CSV_HEX_TYPE_MAP = {
     "hex": np.dtype("int64"),
@@ -84,234 +46,6 @@ CSV_HEX_TYPE_MAP = {
     "hex32": np.dtype("int32")
 }
 
-cdef csv_reader_options make_csv_reader_options(
-    object datasource,
-    object lineterminator,
-    object quotechar,
-    int quoting,
-    bool doublequote,
-    object header,
-    bool mangle_dupe_cols,
-    object usecols,
-    object delimiter,
-    bool delim_whitespace,
-    bool skipinitialspace,
-    object names,
-    object dtype,
-    int skipfooter,
-    int skiprows,
-    bool dayfirst,
-    object compression,
-    object thousands,
-    object decimal,
-    object true_values,
-    object false_values,
-    object nrows,
-    object byte_range,
-    bool skip_blank_lines,
-    object parse_dates,
-    object comment,
-    object na_values,
-    bool keep_default_na,
-    bool na_filter,
-    object prefix,
-    object index_col,
-) except *:
-    cdef source_info c_source_info = make_source_info([datasource])
-    cdef compression_type c_compression
-    cdef vector[string] c_names
-    cdef size_t c_byte_range_offset = (
-        byte_range[0] if byte_range is not None else 0
-    )
-    cdef size_t c_byte_range_size = (
-        byte_range[1] if byte_range is not None else 0
-    )
-    cdef vector[int] c_use_cols_indexes
-    cdef vector[string] c_use_cols_names
-    cdef size_type c_nrows = nrows if nrows is not None else -1
-    cdef quote_style c_quoting
-    cdef vector[string] c_parse_dates_names
-    cdef vector[int] c_parse_dates_indexes
-    cdef vector[string] c_hex_col_names
-    cdef vector[data_type] c_dtypes_list
-    cdef map[string, data_type] c_dtypes_map
-    cdef vector[int] c_hex_col_indexes
-    cdef vector[string] c_true_values
-    cdef vector[string] c_false_values
-    cdef vector[string] c_na_values
-
-    # Reader settings
-    if compression is None:
-        c_compression = compression_type.NONE
-    else:
-        compression = str(compression)
-        compression = Compression[compression.upper()]
-        c_compression = <compression_type> (
-            <underlying_type_t_compression> compression
-        )
-
-    if quoting == 1:
-        c_quoting = quote_style.ALL
-    elif quoting == 2:
-        c_quoting = quote_style.NONNUMERIC
-    elif quoting == 3:
-        c_quoting = quote_style.NONE
-    else:
-        # Default value
-        c_quoting = quote_style.MINIMAL
-
-    cdef csv_reader_options csv_reader_options_c = move(
-        csv_reader_options.builder(c_source_info)
-        .compression(c_compression)
-        .mangle_dupe_cols(mangle_dupe_cols)
-        .byte_range_offset(c_byte_range_offset)
-        .byte_range_size(c_byte_range_size)
-        .nrows(c_nrows)
-        .skiprows(skiprows)
-        .skipfooter(skipfooter)
-        .quoting(c_quoting)
-        .lineterminator(ord(lineterminator))
-        .quotechar(ord(quotechar))
-        .decimal(ord(decimal))
-        .delim_whitespace(delim_whitespace)
-        .skipinitialspace(skipinitialspace)
-        .skip_blank_lines(skip_blank_lines)
-        .doublequote(doublequote)
-        .keep_default_na(keep_default_na)
-        .na_filter(na_filter)
-        .dayfirst(dayfirst)
-        .build()
-    )
-
-    if names is not None:
-        # explicitly mentioned name, so don't check header
-        if header is None or header == 'infer':
-            csv_reader_options_c.set_header(-1)
-        else:
-            csv_reader_options_c.set_header(header)
-
-        c_names.reserve(len(names))
-        for name in names:
-            c_names.push_back(str(name).encode())
-        csv_reader_options_c.set_names(c_names)
-    else:
-        if header is None:
-            csv_reader_options_c.set_header(-1)
-        elif header == 'infer':
-            csv_reader_options_c.set_header(0)
-        else:
-            csv_reader_options_c.set_header(header)
-
-    if prefix is not None:
-        csv_reader_options_c.set_prefix(prefix.encode())
-
-    if usecols is not None:
-        all_int = all(isinstance(col, int) for col in usecols)
-        if all_int:
-            c_use_cols_indexes.reserve(len(usecols))
-            c_use_cols_indexes = usecols
-            csv_reader_options_c.set_use_cols_indexes(c_use_cols_indexes)
-        else:
-            c_use_cols_names.reserve(len(usecols))
-            for col_name in usecols:
-                c_use_cols_names.push_back(
-                    str(col_name).encode()
-                )
-            csv_reader_options_c.set_use_cols_names(c_use_cols_names)
-
-    if delimiter is not None:
-        csv_reader_options_c.set_delimiter(ord(delimiter))
-
-    if thousands is not None:
-        csv_reader_options_c.set_thousands(ord(thousands))
-
-    if comment is not None:
-        csv_reader_options_c.set_comment(ord(comment))
-
-    if parse_dates is not None:
-        if isinstance(parse_dates, abc.Mapping):
-            raise NotImplementedError(
-                "`parse_dates`: dictionaries are unsupported")
-        if not isinstance(parse_dates, abc.Iterable):
-            raise NotImplementedError(
-                "`parse_dates`: an iterable is required")
-        for col in parse_dates:
-            if isinstance(col, str):
-                c_parse_dates_names.push_back(str(col).encode())
-            elif isinstance(col, int):
-                c_parse_dates_indexes.push_back(col)
-            else:
-                raise NotImplementedError(
-                    "`parse_dates`: Nesting is unsupported")
-        csv_reader_options_c.set_parse_dates(c_parse_dates_names)
-        csv_reader_options_c.set_parse_dates(c_parse_dates_indexes)
-
-    if dtype is not None:
-        if isinstance(dtype, abc.Mapping):
-            for k, v in dtype.items():
-                col_type = v
-                if is_hashable(v) and v in CSV_HEX_TYPE_MAP:
-                    col_type = CSV_HEX_TYPE_MAP[v]
-                    c_hex_col_names.push_back(str(k).encode())
-
-                c_dtypes_map[str(k).encode()] = \
-                    _get_cudf_data_type_from_dtype(
-                        cudf.dtype(col_type))
-            csv_reader_options_c.set_dtypes(c_dtypes_map)
-            csv_reader_options_c.set_parse_hex(c_hex_col_names)
-        elif (
-            cudf.api.types.is_scalar(dtype) or
-            isinstance(dtype, (
-                np.dtype, pd.api.extensions.ExtensionDtype, type
-            ))
-        ):
-            c_dtypes_list.reserve(1)
-            if is_hashable(dtype) and dtype in CSV_HEX_TYPE_MAP:
-                dtype = CSV_HEX_TYPE_MAP[dtype]
-                c_hex_col_indexes.push_back(0)
-
-            c_dtypes_list.push_back(
-                _get_cudf_data_type_from_dtype(dtype)
-            )
-            csv_reader_options_c.set_dtypes(c_dtypes_list)
-            csv_reader_options_c.set_parse_hex(c_hex_col_indexes)
-        elif isinstance(dtype, abc.Collection):
-            c_dtypes_list.reserve(len(dtype))
-            for index, col_dtype in enumerate(dtype):
-                if is_hashable(col_dtype) and col_dtype in CSV_HEX_TYPE_MAP:
-                    col_dtype = CSV_HEX_TYPE_MAP[col_dtype]
-                    c_hex_col_indexes.push_back(index)
-
-                c_dtypes_list.push_back(
-                    _get_cudf_data_type_from_dtype(col_dtype)
-                )
-            csv_reader_options_c.set_dtypes(c_dtypes_list)
-            csv_reader_options_c.set_parse_hex(c_hex_col_indexes)
-        else:
-            raise ValueError(
-                "dtype should be a scalar/str/list-like/dict-like"
-            )
-
-    if true_values is not None:
-        c_true_values.reserve(len(true_values))
-        for tv in true_values:
-            c_true_values.push_back(tv.encode())
-        csv_reader_options_c.set_true_values(c_true_values)
-
-    if false_values is not None:
-        c_false_values.reserve(len(false_values))
-        for fv in false_values:
-            c_false_values.push_back(fv.encode())
-        csv_reader_options_c.set_false_values(c_false_values)
-
-    if na_values is not None:
-        c_na_values.reserve(len(na_values))
-        for nv in na_values:
-            c_na_values.push_back(nv.encode())
-        csv_reader_options_c.set_na_values(c_na_values)
-
-    return csv_reader_options_c
-
 
 def validate_args(
     object delimiter,
@@ -381,7 +115,6 @@ def read_csv(
     bool na_filter=True,
     object prefix=None,
     object index_col=None,
-    **kwargs,
 ):
     """
     Cython function to call into libcudf API, see `read_csv`.
@@ -413,23 +146,120 @@ def read_csv(
     if delimiter is None:
         delimiter = sep
 
-    cdef csv_reader_options read_csv_options_c = make_csv_reader_options(
-        datasource, lineterminator, quotechar, quoting, doublequote,
-        header, mangle_dupe_cols, usecols, delimiter, delim_whitespace,
-        skipinitialspace, names, dtype, skipfooter, skiprows, dayfirst,
-        compression, thousands, decimal, true_values, false_values, nrows,
-        byte_range, skip_blank_lines, parse_dates, comment, na_values,
-        keep_default_na, na_filter, prefix, index_col)
+    delimiter = str(delimiter)
+
+    if byte_range is None:
+        byte_range = (0, 0)
+
+    if compression is None:
+        c_compression = compression_type.NONE
+    else:
+        compression_map = {
+            "infer": compression_type.AUTO,
+            "gzip": compression_type.GZIP,
+            "bz2": compression_type.BZIP2,
+            "zip": compression_type.ZIP,
+        }
+        c_compression = compression_map[compression]
 
-    cdef table_with_metadata c_result
-    with nogil:
-        c_result = move(cpp_read_csv(read_csv_options_c))
+    # We need this later when setting index cols
+    orig_header = header
+
+    if names is not None:
+        # explicitly mentioned name, so don't check header
+        if header is None or header == 'infer':
+            header = -1
+        else:
+            header = header
+        names = list(names)
+    else:
+        if header is None:
+            header = -1
+        elif header == 'infer':
+            header = 0
 
-    meta_names = [info.name.decode() for info in c_result.metadata.schema_info]
-    df = cudf.DataFrame._from_data(*data_from_unique_ptr(
-        move(c_result.tbl),
-        column_names=meta_names
-    ))
+    hex_cols = []
+
+    new_dtypes = []
+    if dtype is not None:
+        if isinstance(dtype, abc.Mapping):
+            new_dtypes = dict()
+            for k, v in dtype.items():
+                col_type = v
+                if is_hashable(v) and v in CSV_HEX_TYPE_MAP:
+                    col_type = CSV_HEX_TYPE_MAP[v]
+                    hex_cols.append(str(k))
+
+                new_dtypes[k] = _get_plc_data_type_from_dtype(
+                    cudf.dtype(col_type)
+                )
+        elif (
+            cudf.api.types.is_scalar(dtype) or
+            isinstance(dtype, (
+                np.dtype, pd.api.extensions.ExtensionDtype, type
+            ))
+        ):
+            if is_hashable(dtype) and dtype in CSV_HEX_TYPE_MAP:
+                dtype = CSV_HEX_TYPE_MAP[dtype]
+                hex_cols.append(0)
+
+            new_dtypes.append(
+                _get_plc_data_type_from_dtype(dtype)
+            )
+        elif isinstance(dtype, abc.Collection):
+            for index, col_dtype in enumerate(dtype):
+                if is_hashable(col_dtype) and col_dtype in CSV_HEX_TYPE_MAP:
+                    col_dtype = CSV_HEX_TYPE_MAP[col_dtype]
+                    hex_cols.append(index)
+
+                new_dtypes.append(
+                    _get_plc_data_type_from_dtype(col_dtype)
+                )
+        else:
+            raise ValueError(
+                "dtype should be a scalar/str/list-like/dict-like"
+            )
+
+    lineterminator = str(lineterminator)
+
+    df = cudf.DataFrame._from_data(
+        *data_from_pylibcudf_io(
+            plc.io.csv.read_csv(
+                plc.io.SourceInfo([datasource]),
+                lineterminator=lineterminator,
+                quotechar = quotechar,
+                quoting = quoting,
+                doublequote = doublequote,
+                header = header,
+                mangle_dupe_cols = mangle_dupe_cols,
+                usecols = usecols,
+                delimiter = delimiter,
+                delim_whitespace = delim_whitespace,
+                skipinitialspace = skipinitialspace,
+                col_names = names,
+                dtypes = new_dtypes,
+                skipfooter = skipfooter,
+                skiprows = skiprows,
+                dayfirst = dayfirst,
+                compression = c_compression,
+                thousands = thousands,
+                decimal = decimal,
+                true_values = true_values,
+                false_values = false_values,
+                nrows = nrows if nrows is not None else -1,
+                byte_range_offset = byte_range[0],
+                byte_range_size = byte_range[1],
+                skip_blank_lines = skip_blank_lines,
+                parse_dates = parse_dates,
+                parse_hex = hex_cols,
+                comment = comment,
+                na_values = na_values,
+                keep_default_na = keep_default_na,
+                na_filter = na_filter,
+                prefix = prefix,
+            )
+        )
+    )
 
     if dtype is not None:
         if isinstance(dtype, abc.Mapping):
@@ -459,7 +289,7 @@ def read_csv(
             index_col_name = df._data.select_by_index(index_col).names[0]
             df = df.set_index(index_col_name)
             if isinstance(index_col_name, str) and \
-                    names is None and header in ("infer",):
+                    names is None and orig_header == "infer":
                 if index_col_name.startswith("Unnamed:"):
                     # TODO: Try to upstream it to libcudf
                     # csv reader in future
@@ -550,7 +380,7 @@ def write_csv(
         )
 
 
-cdef data_type _get_cudf_data_type_from_dtype(object dtype) except *:
+cdef DataType _get_plc_data_type_from_dtype(object dtype) except *:
     # TODO: Remove this work-around Dictionary types
     # in libcudf are fully mapped to categorical columns:
     # https://github.com/rapidsai/cudf/issues/3960
@@ -561,36 +391,36 @@ cdef data_type _get_cudf_data_type_from_dtype(object dtype) except *:
 
     if isinstance(dtype, str):
         if str(dtype) == "date32":
-            return libcudf_types.data_type(
+            return DataType(
                 libcudf_types.type_id.TIMESTAMP_DAYS
             )
         elif str(dtype) in ("date", "date64"):
-            return libcudf_types.data_type(
+            return DataType(
                 libcudf_types.type_id.TIMESTAMP_MILLISECONDS
             )
         elif str(dtype) == "timestamp":
-            return libcudf_types.data_type(
+            return DataType(
                 libcudf_types.type_id.TIMESTAMP_MILLISECONDS
             )
         elif str(dtype) == "timestamp[us]":
-            return libcudf_types.data_type(
+            return DataType(
                 libcudf_types.type_id.TIMESTAMP_MICROSECONDS
             )
         elif str(dtype) == "timestamp[s]":
-            return libcudf_types.data_type(
+            return DataType(
                 libcudf_types.type_id.TIMESTAMP_SECONDS
             )
         elif str(dtype) == "timestamp[ms]":
-            return libcudf_types.data_type(
+            return DataType(
                 libcudf_types.type_id.TIMESTAMP_MILLISECONDS
             )
         elif str(dtype) == "timestamp[ns]":
-            return libcudf_types.data_type(
+            return DataType(
                 libcudf_types.type_id.TIMESTAMP_NANOSECONDS
             )
 
     dtype = cudf.dtype(dtype)
-    return dtype_to_data_type(dtype)
+    return dtype_to_pylibcudf_type(dtype)
 
 
 def columns_apply_na_rep(column_names, na_rep):
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt
index 084b341ec48..8dd08d11dc8 100644
--- a/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt
@@ -12,7 +12,7 @@
 # the License.
 # =============================================================================
 
-set(cython_sources avro.pyx datasource.pyx json.pyx types.pyx)
+set(cython_sources avro.pyx csv.pyx datasource.pyx json.pyx types.pyx)
 
 set(linked_libraries cudf::cudf)
 rapids_cython_create_modules(
@@ -21,7 +21,7 @@ rapids_cython_create_modules(
   LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_io_ ASSOCIATED_TARGETS cudf
 )
 
-set(targets_using_arrow_headers pylibcudf_io_avro pylibcudf_io_datasource pylibcudf_io_json
-                                pylibcudf_io_types
+set(targets_using_arrow_headers pylibcudf_io_avro pylibcudf_io_csv pylibcudf_io_datasource
+                                pylibcudf_io_json pylibcudf_io_types
 )
 link_to_pyarrow_headers("${targets_using_arrow_headers}")
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd
index ef4c65b277e..5b3272d60e0 100644
--- a/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd
@@ -1,4 +1,5 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
+# CSV is removed since it is def not cpdef (to force kw-only arguments)
 from . cimport avro, datasource, json, types
 from .types cimport SourceInfo, TableWithMetadata
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/__init__.py b/python/cudf/cudf/_lib/pylibcudf/io/__init__.py
index fb4e4c7e4bb..e17deaa4663 100644
--- a/python/cudf/cudf/_lib/pylibcudf/io/__init__.py
+++ b/python/cudf/cudf/_lib/pylibcudf/io/__init__.py
@@ -1,4 +1,4 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from . import avro, datasource, json, types
+from . import avro, csv, datasource, json, types
 from .types import SinkInfo, SourceInfo, TableWithMetadata
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/csv.pyx b/python/cudf/cudf/_lib/pylibcudf/io/csv.pyx
new file mode 100644
index 00000000000..e9efb5befee
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/io/csv.pyx
@@ -0,0 +1,264 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp cimport bool
+from libcpp.map cimport map
+from libcpp.string cimport string
+from libcpp.utility cimport move
+from libcpp.vector cimport vector
+
+from cudf._lib.pylibcudf.io.types cimport SourceInfo, TableWithMetadata
+from cudf._lib.pylibcudf.libcudf.io.csv cimport (
+    csv_reader_options,
+    read_csv as cpp_read_csv,
+)
+from cudf._lib.pylibcudf.libcudf.io.types cimport (
+    compression_type,
+    quote_style,
+    table_with_metadata,
+)
+from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type
+from cudf._lib.pylibcudf.types cimport DataType
+
+
+cdef tuple _process_parse_dates_hex(list cols):
+    cdef vector[string] str_cols
+    cdef vector[int] int_cols
+    for col in cols:
+        if isinstance(col, str):
+            str_cols.push_back(col.encode())
+        else:
+            int_cols.push_back(col)
+    return str_cols, int_cols
+
+cdef vector[string] _make_str_vector(list vals):
+    cdef vector[string] res
+    for val in vals:
+        res.push_back((<str?>val).encode())
+    return res
+
+
+def read_csv(
+    SourceInfo source_info,
+    *,
+    compression_type compression = compression_type.AUTO,
+    size_t byte_range_offset = 0,
+    size_t byte_range_size = 0,
+    list col_names = None,
+    str prefix = "",
+    bool mangle_dupe_cols = True,
+    list usecols = None,
+    size_type nrows = -1,
+    size_type skiprows = 0,
+    size_type skipfooter = 0,
+    size_type header = 0,
+    str lineterminator = "\n",
+    str delimiter = None,
+    str thousands = None,
+    str decimal = ".",
+    str comment = None,
+    bool delim_whitespace = False,
+    bool skipinitialspace = False,
+    bool skip_blank_lines = True,
+    quote_style quoting = quote_style.MINIMAL,
+    str quotechar = '"',
+    bool doublequote = True,
+    list parse_dates = None,
+    list parse_hex = None,
+    # Technically this should be dict/list
+    # but using a fused type prevents using None as default
+    object dtypes = None,
+    list true_values = None,
+    list false_values = None,
+    list na_values = None,
+    bool keep_default_na = True,
+    bool na_filter = True,
+    bool dayfirst = False,
+    # Note: These options are supported by the libcudf reader
+    # but are not exposed here since there is no demand for them
+    # on the Python side yet.
+    # bool detect_whitespace_around_quotes = False,
+    # DataType timestamp_type = DataType(type_id.EMPTY),
+):
+    """Reads a CSV file into a :py:class:`~.types.TableWithMetadata`.
+
+    Parameters
+    ----------
+    source_info : SourceInfo
+        The SourceInfo to read the CSV file from.
+    compression : compression_type, default CompressionType.AUTO
+        The compression format of the CSV source.
+    byte_range_offset : size_type, default 0
+        Number of bytes to skip from source start.
+    byte_range_size : size_type, default 0
+        Number of bytes to read. By default, will read all bytes.
+    col_names : list, default None
+        The column names to use.
+    prefix : string, default ''
+        The prefix to apply to the column names.
+    mangle_dupe_cols : bool, default True
+        If True, rename duplicate column names.
+    usecols : list, default None
+        Specify the string column names/integer column indices of columns to be read.
+    nrows : size_type, default -1
+        The number of rows to read.
+    skiprows : size_type, default 0
+        The number of rows to skip from the start before reading
+    skipfooter : size_type, default 0
+        The number of rows to skip from the end
+    header : size_type, default 0
+        The index of the row that will be used for header names.
+        Pass -1 to use default column names.
+    lineterminator : str, default '\\n'
+        The character used to determine the end of a line.
+    delimiter : str, default ","
+        The character used to separate fields in a row.
+    thousands : str, default None
+        The character used as the thousands separator.
+        Cannot match delimiter.
+    decimal : str, default '.'
+        The character used as the decimal separator.
+        Cannot match delimiter.
+    comment : str, default None
+        The character used to identify the start of a comment line.
+        (which will be skipped by the reader)
+    delim_whitespace : bool, default False
+        If True, treat whitespace as the field delimiter.
+    skipinitialspace : bool, default False
+        If True, skip whitespace after the delimiter.
+    skip_blank_lines : bool, default True
+        If True, ignore empty lines (otherwise line values are parsed as null).
+    quoting : QuoteStyle, default QuoteStyle.MINIMAL
+        The quoting style used in the input CSV data. One of
+        { QuoteStyle.MINIMAL, QuoteStyle.ALL, QuoteStyle.NONNUMERIC, QuoteStyle.NONE }
+    quotechar : str, default '"'
+        The character used to indicate quoting.
+    doublequote : bool, default True
+        If True, a quote inside a value is double-quoted.
+    parse_dates : list, default None
+        A list of integer column indices/string column names
+        of columns to read as datetime.
+    parse_hex : list, default None
+        A list of integer column indices/string column names
+        of columns to read as hexadecimal.
+    dtypes : Union[Dict[str, DataType], List[DataType]], default None
+        A list of data types or a dictionary mapping column names
+        to a DataType.
+    true_values : List[str], default None
+        A list of additional values to recognize as True.
+    false_values : List[str], default None
+        A list of additional values to recognize as False.
+    na_values : List[str], default None
+        A list of additional values to recognize as null.
+    keep_default_na : bool, default True
+        Whether to keep the built-in default N/A values.
+    na_filter : bool, default True
+        Whether to detect missing values. If False, can
+        improve performance.
+    dayfirst : bool, default False
+        If True, interpret dates as being in the DD/MM format.
+
+    Returns
+    -------
+    TableWithMetadata
+        The Table and its corresponding metadata (column names) that were read in.
+    """
+    cdef vector[string] c_parse_dates_names
+    cdef vector[int] c_parse_dates_indexes
+    cdef vector[int] c_parse_hex_names
+    cdef vector[int] c_parse_hex_indexes
+    cdef vector[data_type] c_dtypes_list
+    cdef map[string, data_type] c_dtypes_map
+
+    cdef csv_reader_options options = move(
+        csv_reader_options.builder(source_info.c_obj)
+        .compression(compression)
+        .mangle_dupe_cols(mangle_dupe_cols)
+        .byte_range_offset(byte_range_offset)
+        .byte_range_size(byte_range_size)
+        .nrows(nrows)
+        .skiprows(skiprows)
+        .skipfooter(skipfooter)
+        .quoting(quoting)
+        .lineterminator(ord(lineterminator))
+        .quotechar(ord(quotechar))
+        .decimal(ord(decimal))
+        .delim_whitespace(delim_whitespace)
+        .skipinitialspace(skipinitialspace)
+        .skip_blank_lines(skip_blank_lines)
+        .doublequote(doublequote)
+        .keep_default_na(keep_default_na)
+        .na_filter(na_filter)
+        .dayfirst(dayfirst)
+        .build()
+    )
+
+    options.set_header(header)
+
+    if col_names is not None:
+        options.set_names([str(name).encode() for name in col_names])
+
+    if prefix is not None:
+        options.set_prefix(prefix.encode())
+
+    if usecols is not None:
+        if all([isinstance(col, int) for col in usecols]):
+            options.set_use_cols_indexes(list(usecols))
+        else:
+            options.set_use_cols_names([str(name).encode() for name in usecols])
+
+    if delimiter is not None:
+        options.set_delimiter(ord(delimiter))
+
+    if thousands is not None:
+        options.set_thousands(ord(thousands))
+
+    if comment is not None:
+        options.set_comment(ord(comment))
+
+    if parse_dates is not None:
+        if not all([isinstance(col, (str, int)) for col in parse_dates]):
+            raise NotImplementedError(
+                    "`parse_dates`: Must pass a list of column names/indices")
+
+        # Set both since users are allowed to mix column names and indices
+        c_parse_dates_names, c_parse_dates_indexes = \
+            _process_parse_dates_hex(parse_dates)
+        options.set_parse_dates(c_parse_dates_names)
+        options.set_parse_dates(c_parse_dates_indexes)
+
+    if parse_hex is not None:
+        if not all([isinstance(col, (str, int)) for col in parse_hex]):
+            raise NotImplementedError(
+                    "`parse_hex`: Must pass a list of column names/indices")
+
+        # Set both since users are allowed to mix column names and indices
+        c_parse_hex_names, c_parse_hex_indexes = _process_parse_dates_hex(parse_hex)
+        options.set_parse_hex(c_parse_hex_names)
+        options.set_parse_hex(c_parse_hex_indexes)
+
+    if isinstance(dtypes, list):
+        for dtype in dtypes:
+            c_dtypes_list.push_back((<DataType?>dtype).c_obj)
+        options.set_dtypes(c_dtypes_list)
+    elif isinstance(dtypes, dict):
+        # dtypes_t is dict
+        for k, v in dtypes.items():
+            c_dtypes_map[str(k).encode()] = (<DataType?>v).c_obj
+        options.set_dtypes(c_dtypes_map)
+    elif dtypes is not None:
+        raise TypeError("dtypes must either by a list/dict")
+
+    if true_values is not None:
+        options.set_true_values(_make_str_vector(true_values))
+
+    if false_values is not None:
+        options.set_false_values(_make_str_vector(false_values))
+
+    if na_values is not None:
+        options.set_na_values(_make_str_vector(na_values))
+
+    cdef table_with_metadata c_result
+    with nogil:
+        c_result = move(cpp_read_csv(options))
+
+    return TableWithMetadata.from_libcudf(c_result)
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pxd b/python/cudf/cudf/_lib/pylibcudf/io/types.pxd
index ab223c16a72..0094bf6032c 100644
--- a/python/cudf/cudf/_lib/pylibcudf/io/types.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/io/types.pxd
@@ -38,6 +38,9 @@ cdef class TableWithMetadata:
 
 cdef class SourceInfo:
     cdef source_info c_obj
+    # Keep the bytes converted from stringio alive
+    # (otherwise we end up with a use after free when they get gc'ed)
+    cdef list byte_sources
 
 cdef class SinkInfo:
     # This vector just exists to keep the unique_ptrs to the sinks alive
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx
index df0b729b711..68498ff88f4 100644
--- a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx
@@ -178,7 +178,7 @@ cdef class SourceInfo:
                     raise ValueError("All sources must be of the same type!")
                 new_sources.append(buffer.read().encode())
             sources = new_sources
-
+            self.byte_sources = sources
         if isinstance(sources[0], bytes):
             empty_buffer = True
             for buffer in sources:
diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx
index 895e1afc502..fc672caa574 100644
--- a/python/cudf/cudf/_lib/types.pyx
+++ b/python/cudf/cudf/_lib/types.pyx
@@ -239,6 +239,9 @@ cdef dtype_from_column_view(column_view cv):
         ]
 
 cdef libcudf_types.data_type dtype_to_data_type(dtype) except *:
+    # Note: This function is to be phased out in favor of
+    # dtype_to_pylibcudf_type which will return a pylibcudf
+    # DataType object
     cdef libcudf_types.type_id tid
     if isinstance(dtype, cudf.ListDtype):
         tid = libcudf_types.type_id.LIST
diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py
index efb192b3251..e029edfa2ed 100644
--- a/python/cudf/cudf/pylibcudf_tests/common/utils.py
+++ b/python/cudf/cudf/pylibcudf_tests/common/utils.py
@@ -4,6 +4,7 @@
 import io
 import os
 
+import numpy as np
 import pyarrow as pa
 import pytest
 
@@ -109,7 +110,10 @@ def _make_fields_nullable(typ):
         lhs_type = _make_fields_nullable(lhs.type)
         lhs = rhs.cast(lhs_type)
 
-    assert lhs.equals(rhs)
+    if pa.types.is_floating(lhs.type) and pa.types.is_floating(rhs.type):
+        np.testing.assert_array_almost_equal(lhs, rhs)
+    else:
+        assert lhs.equals(rhs)
 
 
 def assert_table_eq(pa_table: pa.Table, plc_table: plc.Table) -> None:
@@ -125,6 +129,8 @@ def assert_table_and_meta_eq(
     pa_table: pa.Table,
     plc_table_w_meta: plc.io.types.TableWithMetadata,
     check_field_nullability=True,
+    check_types_if_empty=True,
+    check_names=True,
 ) -> None:
     """Verify that the pylibcudf TableWithMetadata and PyArrow table are equal"""
 
@@ -135,11 +141,17 @@ def assert_table_and_meta_eq(
         plc_shape == pa_table.shape
     ), f"{plc_shape} is not equal to {pa_table.shape}"
 
+    if not check_types_if_empty and plc_table.num_rows() == 0:
+        return
+
     for plc_col, pa_col in zip(plc_table.columns(), pa_table.columns):
         assert_column_eq(pa_col, plc_col, check_field_nullability)
 
     # Check column name equality
-    assert plc_table_w_meta.column_names() == pa_table.column_names
+    if check_names:
+        assert (
+            plc_table_w_meta.column_names() == pa_table.column_names
+        ), f"{plc_table_w_meta.column_names()} != {pa_table.column_names}"
 
 
 def cudf_raises(expected_exception: BaseException, *args, **kwargs):
@@ -174,6 +186,33 @@ def is_nested_list(typ):
     return nesting_level(typ)[0] > 1
 
 
+def _convert_numeric_types_to_floating(pa_table):
+    """
+    Useful little helper for testing the
+    dtypes option in I/O readers.
+
+    Returns a tuple containing the pylibcudf dtypes
+    and the new pyarrow schema
+    """
+    dtypes = []
+    new_fields = []
+    for i in range(len(pa_table.schema)):
+        field = pa_table.schema.field(i)
+        child_types = []
+
+        plc_type = plc.interop.from_arrow(field.type)
+        if pa.types.is_integer(field.type) or pa.types.is_unsigned_integer(
+            field.type
+        ):
+            plc_type = plc.interop.from_arrow(pa.float64())
+            field = field.with_type(pa.float64())
+
+        dtypes.append((field.name, plc_type, child_types))
+
+        new_fields.append(field)
+    return dtypes, new_fields
+
+
 def write_source_str(source, input_str):
     """
     Write a string to the source
diff --git a/python/cudf/cudf/pylibcudf_tests/conftest.py b/python/cudf/cudf/pylibcudf_tests/conftest.py
index 53e207f29cb..4a7194a6d8d 100644
--- a/python/cudf/cudf/pylibcudf_tests/conftest.py
+++ b/python/cudf/cudf/pylibcudf_tests/conftest.py
@@ -141,6 +141,20 @@ def _generate_nested_data(typ):
     ), pa_table
 
 
+@pytest.fixture(params=[(0, 0), ("half", 0), (-1, "half")])
+def nrows_skiprows(table_data, request):
+    """
+    Parametrized nrows fixture that accompanies table_data
+    """
+    _, pa_table = table_data
+    nrows, skiprows = request.param
+    if nrows == "half":
+        nrows = len(pa_table) // 2
+    if skiprows == "half":
+        skiprows = (len(pa_table) - nrows) // 2
+    return nrows, skiprows
+
+
 @pytest.fixture(
     params=["a.txt", pathlib.Path("a.txt"), io.BytesIO, io.StringIO],
 )
diff --git a/python/cudf/cudf/pylibcudf_tests/io/test_csv.py b/python/cudf/cudf/pylibcudf_tests/io/test_csv.py
new file mode 100644
index 00000000000..95326a8b681
--- /dev/null
+++ b/python/cudf/cudf/pylibcudf_tests/io/test_csv.py
@@ -0,0 +1,280 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+import io
+import os
+from io import StringIO
+
+import pandas as pd
+import pyarrow as pa
+import pytest
+from utils import (
+    _convert_numeric_types_to_floating,
+    assert_table_and_meta_eq,
+    make_source,
+    write_source_str,
+)
+
+import cudf._lib.pylibcudf as plc
+from cudf._lib.pylibcudf.io.types import CompressionType
+
+# Shared kwargs to pass to make_source
+_COMMON_CSV_SOURCE_KWARGS = {
+    "format": "csv",
+    "index": False,
+}
+
+
+@pytest.fixture(scope="module")
+def csv_table_data(table_data):
+    """
+    Like the table_data but with nested types dropped
+    since the CSV reader can't handle that
+    uint64 is also dropped since it can get confused with int64
+    """
+    _, pa_table = table_data
+    pa_table = pa_table.drop_columns(
+        [
+            "col_uint64",
+            "col_list<item: int64>",
+            "col_list<item: list<item: int64>>",
+            "col_struct<v: int64 not null>",
+            "col_struct<a: int64 not null, b_struct: struct<b: double not null> not null>",
+        ]
+    )
+    return plc.interop.from_arrow(pa_table), pa_table
+
+
+@pytest.mark.parametrize("delimiter", [",", ";"])
+def test_read_csv_basic(
+    csv_table_data,
+    source_or_sink,
+    text_compression_type,
+    nrows_skiprows,
+    delimiter,
+):
+    _, pa_table = csv_table_data
+    compression_type = text_compression_type
+    nrows, skiprows = nrows_skiprows
+
+    # can't compress non-binary data with pandas
+    if isinstance(source_or_sink, io.StringIO):
+        compression_type = CompressionType.NONE
+
+    source = make_source(
+        source_or_sink,
+        pa_table,
+        compression=compression_type,
+        sep=delimiter,
+        **_COMMON_CSV_SOURCE_KWARGS,
+    )
+
+    # Rename the table (by reversing the names) to test names argument
+    pa_table = pa_table.rename_columns(pa_table.column_names[::-1])
+    column_names = pa_table.column_names
+
+    # Adapt to nrows/skiprows
+    pa_table = pa_table.slice(
+        offset=skiprows, length=nrows if nrows != -1 else None
+    )
+
+    res = plc.io.csv.read_csv(
+        plc.io.SourceInfo([source]),
+        delimiter=delimiter,
+        compression=compression_type,
+        col_names=column_names,
+        nrows=nrows,
+        skiprows=skiprows,
+    )
+
+    assert_table_and_meta_eq(
+        pa_table,
+        res,
+        check_types_if_empty=False,
+        check_names=False if skiprows > 0 and column_names is None else True,
+    )
+
+
+# Note: make sure chunk size is big enough so that dtype inference
+# infers correctly
+@pytest.mark.parametrize("chunk_size", [1000, 5999])
+def test_read_csv_byte_range(table_data, chunk_size, tmp_path):
+    _, pa_table = table_data
+    if len(pa_table) == 0:
+        # pandas writes nothing when we have empty table
+        # and header=None
+        pytest.skip("Don't test empty table case")
+    source = f"{tmp_path}/a.csv"
+    source = make_source(
+        source, pa_table, header=False, **_COMMON_CSV_SOURCE_KWARGS
+    )
+    file_size = os.stat(source).st_size
+    tbls_w_meta = []
+    for segment in range((file_size + chunk_size - 1) // chunk_size):
+        tbls_w_meta.append(
+            plc.io.csv.read_csv(
+                plc.io.SourceInfo([source]),
+                byte_range_offset=segment * chunk_size,
+                byte_range_size=chunk_size,
+                header=-1,
+                col_names=pa_table.column_names,
+            )
+        )
+    if isinstance(source, io.IOBase):
+        source.seek(0)
+    exp = pd.read_csv(source, names=pa_table.column_names, header=None)
+    tbls = []
+    for tbl_w_meta in tbls_w_meta:
+        if tbl_w_meta.tbl.num_rows() > 0:
+            tbls.append(plc.interop.to_arrow(tbl_w_meta.tbl))
+    full_tbl = pa.concat_tables(tbls)
+
+    full_tbl_plc = plc.io.TableWithMetadata(
+        plc.interop.from_arrow(full_tbl),
+        tbls_w_meta[0].column_names(include_children=True),
+    )
+    assert_table_and_meta_eq(pa.Table.from_pandas(exp), full_tbl_plc)
+
+
+@pytest.mark.parametrize("usecols", [None, ["col_int64", "col_bool"], [0, 1]])
+def test_read_csv_dtypes(csv_table_data, source_or_sink, usecols):
+    # Simple test for dtypes where we read in
+    # all numeric data as floats
+    _, pa_table = csv_table_data
+
+    source = make_source(
+        source_or_sink,
+        pa_table,
+        **_COMMON_CSV_SOURCE_KWARGS,
+    )
+    # Adjust table for usecols
+    if usecols is not None:
+        pa_table = pa_table.select(usecols)
+
+    dtypes, new_fields = _convert_numeric_types_to_floating(pa_table)
+    # Extract the dtype out of the (name, type, child_types) tuple
+    # (read_csv doesn't support this format since it doesn't support nested columns)
+    dtypes = {name: dtype for name, dtype, _ in dtypes}
+
+    new_schema = pa.schema(new_fields)
+
+    res = plc.io.csv.read_csv(
+        plc.io.SourceInfo([source]), dtypes=dtypes, usecols=usecols
+    )
+    new_table = pa_table.cast(new_schema)
+
+    assert_table_and_meta_eq(new_table, res)
+
+
+@pytest.mark.parametrize("skip_blanks", [True, False])
+@pytest.mark.parametrize("decimal, quotechar", [(".", "'"), ("_", '"')])
+@pytest.mark.parametrize("lineterminator", ["\n", "\r\n"])
+def test_read_csv_parse_options(
+    source_or_sink, decimal, quotechar, skip_blanks, lineterminator
+):
+    lines = [
+        "# first comment line",
+        "# third comment line",
+        "1,2,3,4_4,'z'",
+        '4,5,6,5_5,""',
+        "7,8,9,9_87,'123'",
+        "# last comment line",
+        "1,1,1,10_11,abc",
+    ]
+    buffer = lineterminator.join(lines)
+
+    write_source_str(source_or_sink, buffer)
+
+    plc_table_w_meta = plc.io.csv.read_csv(
+        plc.io.SourceInfo([source_or_sink]),
+        comment="#",
+        decimal=decimal,
+        skip_blank_lines=skip_blanks,
+        quotechar=quotechar,
+    )
+    df = pd.read_csv(
+        StringIO(buffer),
+        comment="#",
+        decimal=decimal,
+        skip_blank_lines=skip_blanks,
+        quotechar=quotechar,
+    )
+    assert_table_and_meta_eq(pa.Table.from_pandas(df), plc_table_w_meta)
+
+
+@pytest.mark.parametrize("na_filter", [True, False])
+@pytest.mark.parametrize("na_values", [["n/a"], ["NV_NAN"]])
+@pytest.mark.parametrize("keep_default_na", [True, False])
+def test_read_csv_na_values(
+    source_or_sink, na_filter, na_values, keep_default_na
+):
+    lines = ["a,b,c", "n/a,NaN,NV_NAN", "1.0,2.0,3.0"]
+    buffer = "\n".join(lines)
+
+    write_source_str(source_or_sink, buffer)
+
+    plc_table_w_meta = plc.io.csv.read_csv(
+        plc.io.SourceInfo([source_or_sink]),
+        na_filter=na_filter,
+        na_values=na_values if na_filter else None,
+        keep_default_na=keep_default_na,
+    )
+    df = pd.read_csv(
+        StringIO(buffer),
+        na_filter=na_filter,
+        na_values=na_values if na_filter else None,
+        keep_default_na=keep_default_na,
+    )
+    assert_table_and_meta_eq(pa.Table.from_pandas(df), plc_table_w_meta)
+
+
+@pytest.mark.parametrize("header", [0, 10, -1])
+def test_read_csv_header(csv_table_data, source_or_sink, header):
+    _, pa_table = csv_table_data
+
+    source = make_source(
+        source_or_sink,
+        pa_table,
+        **_COMMON_CSV_SOURCE_KWARGS,
+    )
+
+    plc_table_w_meta = plc.io.csv.read_csv(
+        plc.io.SourceInfo([source]), header=header
+    )
+    if header > 0:
+        if header < len(pa_table):
+            names_row = pa_table.take([header - 1]).to_pylist()[0].values()
+            pa_table = pa_table.slice(header)
+            col_names = [str(name) for name in names_row]
+            pa_table = pa_table.rename_columns(col_names)
+        else:
+            pa_table = pa.table([])
+    elif header < 0:
+        # neg header means use user-provided names (in this case nothing)
+        # (the original column names are now data)
+        tbl_dict = pa_table.to_pydict()
+        new_tbl_dict = {}
+        for i, (name, vals) in enumerate(tbl_dict.items()):
+            str_vals = [str(val) for val in vals]
+            new_tbl_dict[str(i)] = [name] + str_vals
+        pa_table = pa.table(new_tbl_dict)
+
+    assert_table_and_meta_eq(
+        pa_table,
+        plc_table_w_meta,
+        check_types_if_empty=False,
+    )
+
+
+# TODO: test these
+# str prefix = "",
+# bool mangle_dupe_cols = True,
+# size_type skipfooter = 0,
+# str thousands = None,
+# bool delim_whitespace = False,
+# bool skipinitialspace = False,
+# quote_style quoting = quote_style.MINIMAL,
+# bool doublequote = True,
+# bool detect_whitespace_around_quotes = False,
+# list parse_dates = None,
+# list true_values = None,
+# list false_values = None,
+# bool dayfirst = False,

From c6c21d7f9281f295e32ff72c95f95b600470df0e Mon Sep 17 00:00:00 2001
From: jakirkham <jakirkham@gmail.com>
Date: Thu, 18 Jul 2024 12:41:21 -0700
Subject: [PATCH 532/842] Drop `{{ pin_compatible('numpy', max_pin='x') }}`
 (#16301)

Part of issue: https://github.com/rapidsai/build-planning/issues/82

Drop `{{ pin_compatible('numpy', max_pin='x') }}` as it is no longer needed. `numpy` has its own `run_exports`, which constraints `numpy` to an API compatible version. More details in issue: https://github.com/orgs/rapidsai/projects/132

So `cudf` now uses that in its recipe builds. Also update `requirements/run` to set the `numpy` lower bound to `1.23` as required by us.

Lastly add todo comments for NumPy 2 update lines.

Authors:
  - https://github.com/jakirkham

Approvers:
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/cudf/pull/16301
---
 conda/recipes/cudf/meta.yaml | 4 +++-
 dependencies.yaml            | 2 ++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml
index 3cdc2050631..9137f099ad1 100644
--- a/conda/recipes/cudf/meta.yaml
+++ b/conda/recipes/cudf/meta.yaml
@@ -64,6 +64,7 @@ requirements:
     - rapids-build-backend >=0.3.0,<0.4.0.dev0
     - scikit-build-core >=0.7.0
     - dlpack >=0.8,<1.0
+    # TODO: Change to `2.0` for NumPy 2
     - numpy 1.23
     - pyarrow ==16.1.0.*
     - libcudf ={{ version }}
@@ -82,7 +83,8 @@ requirements:
     - pandas >=2.0,<2.2.3dev0
     - cupy >=12.0.0
     - numba >=0.57
-    - {{ pin_compatible('numpy', max_pin='x') }}
+    # TODO: Update `numpy` in `host` when dropping `<2.0a0`
+    - numpy >=1.23,<2.0a0
     - {{ pin_compatible('pyarrow', max_pin='x.x') }}
     - libcudf ={{ version }}
     - {{ pin_compatible('rmm', max_pin='x.x') }}
diff --git a/dependencies.yaml b/dependencies.yaml
index 67ed3773b44..a19574b7658 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -323,6 +323,7 @@ dependencies:
         packages:
           # Hard pin the patch version used during the build.
           # Sync with conda build constraint & wheel run constraint.
+          # TODO: Change to `2.0.*` for NumPy 2
           - numpy==1.23.*
   build_python_cudf:
     common:
@@ -551,6 +552,7 @@ dependencies:
       - output_types: [conda, requirements, pyproject]
         packages:
           - fsspec>=0.6.0
+          # TODO: Update `numpy` in `build_python_common` when dropping `<2.0a0`
           - numpy>=1.23,<2.0a0
           - pandas>=2.0,<2.2.3dev0
   run_cudf:

From aeef0a1f4159d4c87f987d20225401040973d10f Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 18 Jul 2024 16:56:30 -0400
Subject: [PATCH 533/842] Remove hash_character_ngrams dependency from
 jaccard_index (#16241)

Removes internal dependency of `nvtext::hash_character_ngrams` from `nvtext::jaccard_index`.
Works around the size-type limit imposed by `hash_character_ngrams` which returns a `list` column.
This also specializes the hashing logic for the jaccard calculation specifically.

The overall algorithm has not changed. Code has moved around a bit and internal list-columns have been replaced with just offsets and values vectors.

Closes #16157

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Mike Wilson (https://github.com/hyperbolic2346)

URL: https://github.com/rapidsai/cudf/pull/16241
---
 cpp/benchmarks/text/jaccard.cpp |   4 +-
 cpp/src/text/jaccard.cu         | 478 ++++++++++++++++++++++----------
 2 files changed, 339 insertions(+), 143 deletions(-)

diff --git a/cpp/benchmarks/text/jaccard.cpp b/cpp/benchmarks/text/jaccard.cpp
index d05c195d077..d5b74da6773 100644
--- a/cpp/benchmarks/text/jaccard.cpp
+++ b/cpp/benchmarks/text/jaccard.cpp
@@ -59,6 +59,6 @@ static void bench_jaccard(nvbench::state& state)
 
 NVBENCH_BENCH(bench_jaccard)
   .set_name("jaccard")
-  .add_int64_axis("num_rows", {1024, 4096, 8192, 16364, 32768, 262144})
-  .add_int64_axis("row_width", {128, 512, 2048})
+  .add_int64_axis("num_rows", {32768, 131072, 262144})
+  .add_int64_axis("row_width", {128, 512, 1024, 2048})
   .add_int64_axis("substring_width", {5, 10});
diff --git a/cpp/src/text/jaccard.cu b/cpp/src/text/jaccard.cu
index 9cf934165f6..e465fb79c89 100644
--- a/cpp/src/text/jaccard.cu
+++ b/cpp/src/text/jaccard.cu
@@ -19,16 +19,19 @@
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/detail/sizes_to_offsets_iterator.cuh>
 #include <cudf/detail/utilities/cuda.cuh>
-#include <cudf/lists/lists_column_view.hpp>
+#include <cudf/detail/utilities/integer_utils.hpp>
+#include <cudf/detail/utilities/vector_factories.hpp>
+#include <cudf/hashing/detail/murmurhash3_x86_32.cuh>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/span.hpp>
 
-#include <nvtext/detail/generate_ngrams.hpp>
 #include <nvtext/jaccard.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
 #include <rmm/resource_ref.hpp>
 
@@ -36,127 +39,375 @@
 #include <thrust/binary_search.h>
 #include <thrust/execution_policy.h>
 #include <thrust/iterator/counting_iterator.h>
+#include <thrust/reduce.h>
+#include <thrust/scan.h>
+#include <thrust/sequence.h>
 #include <thrust/transform.h>
 
 namespace nvtext {
 namespace detail {
 namespace {
 
+constexpr cudf::thread_index_type block_size       = 256;
+constexpr cudf::thread_index_type bytes_per_thread = 4;
+
 /**
  * @brief Retrieve the row data (span) for the given column/row-index
  *
- * @param d_input Input lists column
+ * @param values Flat vector of all values
+ * @param offsets Offsets identifying rows within values
  * @param idx Row index to retrieve
  * @return A device-span of the row values
  */
-__device__ auto get_row(cudf::column_device_view const& d_input, cudf::size_type idx)
+__device__ auto get_row(uint32_t const* values, int64_t const* offsets, cudf::size_type row_idx)
 {
-  auto const offsets =
-    d_input.child(cudf::lists_column_view::offsets_column_index).data<cudf::size_type>();
-  auto const offset = offsets[idx];
-  auto const size   = offsets[idx + 1] - offset;
-  auto const begin =
-    d_input.child(cudf::lists_column_view::child_column_index).data<uint32_t>() + offset;
+  auto const offset = offsets[row_idx];
+  auto const size   = offsets[row_idx + 1] - offset;
+  auto const begin  = values + offset;
   return cudf::device_span<uint32_t const>(begin, size);
 }
 
 /**
- * @brief Count the unique values within each row of the input column
+ * @brief Kernel to count the unique values within each row of the input column
+ *
+ * This is called with a warp per row.
  *
- * This is called with a warp per row
+ * @param d_values Sorted hash values to count uniqueness
+ * @param d_offsets Offsets to each set of row elements in d_values
+ * @param rows Number of rows in the output
+ * @param d_results Number of unique values in each row
  */
-struct sorted_unique_fn {
-  cudf::column_device_view const d_input;
-  cudf::size_type* d_results;
+CUDF_KERNEL void sorted_unique_fn(uint32_t const* d_values,
+                                  int64_t const* d_offsets,
+                                  cudf::size_type rows,
+                                  cudf::size_type* d_results)
+{
+  auto const idx = cudf::detail::grid_1d::global_thread_id();
+  if (idx >= (static_cast<cudf::thread_index_type>(rows) * cudf::detail::warp_size)) { return; }
 
-  // warp per row
-  __device__ void operator()(cudf::size_type idx) const
-  {
-    using warp_reduce = cub::WarpReduce<cudf::size_type>;
-    __shared__ typename warp_reduce::TempStorage temp_storage;
+  using warp_reduce = cub::WarpReduce<cudf::size_type>;
+  __shared__ typename warp_reduce::TempStorage temp_storage;
 
-    auto const row_idx  = idx / cudf::detail::warp_size;
-    auto const lane_idx = idx % cudf::detail::warp_size;
-    auto const row      = get_row(d_input, row_idx);
-    auto const begin    = row.begin();
+  auto const row_idx  = idx / cudf::detail::warp_size;
+  auto const lane_idx = idx % cudf::detail::warp_size;
+  auto const row      = get_row(d_values, d_offsets, row_idx);
+  auto const begin    = row.begin();
 
-    cudf::size_type count = 0;
-    for (auto itr = begin + lane_idx; itr < row.end(); itr += cudf::detail::warp_size) {
-      count += (itr == begin || *itr != *(itr - 1));
-    }
-    auto const result = warp_reduce(temp_storage).Sum(count);
-    if (lane_idx == 0) { d_results[row_idx] = result; }
+  cudf::size_type count = 0;
+  for (auto itr = begin + lane_idx; itr < row.end(); itr += cudf::detail::warp_size) {
+    count += (itr == begin || *itr != *(itr - 1));
   }
-};
+  auto const result = warp_reduce(temp_storage).Sum(count);
+  if (lane_idx == 0) { d_results[row_idx] = result; }
+}
 
-rmm::device_uvector<cudf::size_type> compute_unique_counts(cudf::column_view const& input,
+/**
+ * @brief Count the unique values within each row of the input column
+ *
+ * @param values Sorted hash values to count uniqueness
+ * @param offsets Offsets to each set of row elements in d_values
+ * @param rows Number of rows in the output
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @return Number of unique values
+ */
+rmm::device_uvector<cudf::size_type> compute_unique_counts(uint32_t const* values,
+                                                           int64_t const* offsets,
+                                                           cudf::size_type rows,
                                                            rmm::cuda_stream_view stream)
 {
-  auto const d_input = cudf::column_device_view::create(input, stream);
-  auto d_results     = rmm::device_uvector<cudf::size_type>(input.size(), stream);
-  sorted_unique_fn fn{*d_input, d_results.data()};
-  thrust::for_each_n(rmm::exec_policy(stream),
-                     thrust::counting_iterator<cudf::size_type>(0),
-                     input.size() * cudf::detail::warp_size,
-                     fn);
+  auto d_results        = rmm::device_uvector<cudf::size_type>(rows, stream);
+  auto const num_blocks = cudf::util::div_rounding_up_safe(
+    static_cast<cudf::thread_index_type>(rows) * cudf::detail::warp_size, block_size);
+  sorted_unique_fn<<<num_blocks, block_size, 0, stream.value()>>>(
+    values, offsets, rows, d_results.data());
   return d_results;
 }
 
+/**
+ * @brief Kernel to count the number of common values within each row of the 2 input columns
+ *
+ * This is called with a warp per row.
+ *
+ * @param d_values1 Sorted hash values to check against d_values2
+ * @param d_offsets1 Offsets to each set of row elements in d_values1
+ * @param d_values2 Sorted hash values to check against d_values1
+ * @param d_offsets2 Offsets to each set of row elements in d_values2
+ * @param rows Number of rows in the output
+ * @param d_results Number of common values in each row
+ */
+CUDF_KERNEL void sorted_intersect_fn(uint32_t const* d_values1,
+                                     int64_t const* d_offsets1,
+                                     uint32_t const* d_values2,
+                                     int64_t const* d_offsets2,
+                                     cudf::size_type rows,
+                                     cudf::size_type* d_results)
+{
+  auto const idx = cudf::detail::grid_1d::global_thread_id();
+  if (idx >= (static_cast<cudf::thread_index_type>(rows) * cudf::detail::warp_size)) { return; }
+
+  using warp_reduce = cub::WarpReduce<cudf::size_type>;
+  __shared__ typename warp_reduce::TempStorage temp_storage;
+
+  auto const row_idx  = idx / cudf::detail::warp_size;
+  auto const lane_idx = idx % cudf::detail::warp_size;
+
+  auto const needles  = get_row(d_values1, d_offsets1, row_idx);
+  auto const haystack = get_row(d_values2, d_offsets2, row_idx);
+
+  auto begin     = haystack.begin();
+  auto const end = haystack.end();
+
+  cudf::size_type count = 0;
+  for (auto itr = needles.begin() + lane_idx; itr < needles.end() && begin < end;
+       itr += cudf::detail::warp_size) {
+    if (itr != needles.begin() && *itr == *(itr - 1)) { continue; }  // skip duplicates
+    // search haystack for this needle (*itr)
+    auto const found = thrust::lower_bound(thrust::seq, begin, end, *itr);
+    count += (found != end) && (*found == *itr);  // increment if found;
+    begin = found;                                // shorten the next lower-bound range
+  }
+  // sum up the counts across this warp
+  auto const result = warp_reduce(temp_storage).Sum(count);
+  if (lane_idx == 0) { d_results[row_idx] = result; }
+}
+
 /**
  * @brief Count the number of common values within each row of the 2 input columns
  *
- * This is called with a warp per row
+ * @param d_values1 Sorted hash values to check against d_values2
+ * @param d_offsets1 Offsets to each set of row elements in d_values1
+ * @param d_values2 Sorted hash values to check against d_values1
+ * @param d_offsets2 Offsets to each set of row elements in d_values2
+ * @param rows Number of rows in the output
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @return Number of common values
  */
-struct sorted_intersect_fn {
-  cudf::column_device_view const d_input1;
-  cudf::column_device_view const d_input2;
-  cudf::size_type* d_results;
+rmm::device_uvector<cudf::size_type> compute_intersect_counts(uint32_t const* values1,
+                                                              int64_t const* offsets1,
+                                                              uint32_t const* values2,
+                                                              int64_t const* offsets2,
+                                                              cudf::size_type rows,
+                                                              rmm::cuda_stream_view stream)
+{
+  auto d_results        = rmm::device_uvector<cudf::size_type>(rows, stream);
+  auto const num_blocks = cudf::util::div_rounding_up_safe(
+    static_cast<cudf::thread_index_type>(rows) * cudf::detail::warp_size, block_size);
+  sorted_intersect_fn<<<num_blocks, block_size, 0, stream.value()>>>(
+    values1, offsets1, values2, offsets2, rows, d_results.data());
+  return d_results;
+}
 
-  // warp per row
-  __device__ void operator()(cudf::size_type idx) const
-  {
-    using warp_reduce = cub::WarpReduce<cudf::size_type>;
-    __shared__ typename warp_reduce::TempStorage temp_storage;
+/**
+ * @brief Counts the number of substrings in each row of the given strings column
+ *
+ * Each warp processes a single string.
+ * Formula is `count = max(1, str.length() - width + 1)`
+ * If a string has less than width characters (but not empty), the count is 1
+ * since the entire string is still hashed.
+ *
+ * @param d_strings Input column of strings
+ * @param width Substring size in characters
+ * @param d_counts Output number of substring per row of input
+ */
+CUDF_KERNEL void count_substrings_kernel(cudf::column_device_view const d_strings,
+                                         cudf::size_type width,
+                                         int64_t* d_counts)
+{
+  auto const idx = cudf::detail::grid_1d::global_thread_id();
+  if (idx >= (static_cast<cudf::thread_index_type>(d_strings.size()) * cudf::detail::warp_size)) {
+    return;
+  }
 
-    auto const row_idx  = idx / cudf::detail::warp_size;
-    auto const lane_idx = idx % cudf::detail::warp_size;
+  auto const str_idx = static_cast<cudf::size_type>(idx / cudf::detail::warp_size);
+  if (d_strings.is_null(str_idx)) {
+    d_counts[str_idx] = 0;
+    return;
+  }
 
-    auto const needles  = get_row(d_input1, row_idx);
-    auto const haystack = get_row(d_input2, row_idx);
+  auto const d_str = d_strings.element<cudf::string_view>(str_idx);
+  if (d_str.empty()) {
+    d_counts[str_idx] = 0;
+    return;
+  }
 
-    auto begin     = haystack.begin();
-    auto const end = haystack.end();
+  using warp_reduce = cub::WarpReduce<cudf::size_type>;
+  __shared__ typename warp_reduce::TempStorage temp_storage;
 
-    // TODO: investigate cuCollections device-side static-map to match row values
+  auto const end        = d_str.data() + d_str.size_bytes();
+  auto const lane_idx   = idx % cudf::detail::warp_size;
+  cudf::size_type count = 0;
+  for (auto itr = d_str.data() + (lane_idx * bytes_per_thread); itr < end;
+       itr += cudf::detail::warp_size * bytes_per_thread) {
+    for (auto s = itr; (s < (itr + bytes_per_thread)) && (s < end); ++s) {
+      count += static_cast<cudf::size_type>(cudf::strings::detail::is_begin_utf8_char(*s));
+    }
+  }
+  auto const char_count = warp_reduce(temp_storage).Sum(count);
+  if (lane_idx == 0) { d_counts[str_idx] = std::max(1, char_count - width + 1); }
+}
+
+/**
+ * @brief Kernel to hash the substrings for each input row
+ *
+ * Each warp processes a single string.
+ * Substrings of string "hello world" with width=4 produce:
+ *   "hell", "ello", "llo ", "lo w", "o wo", " wor", "worl", "orld"
+ * Each of these substrings is hashed and the hash stored in d_results
+ *
+ * @param d_strings Input column of strings
+ * @param width Substring size in characters
+ * @param d_output_offsets Offsets into d_results
+ * @param d_results Hash values for each substring
+ */
+CUDF_KERNEL void substring_hash_kernel(cudf::column_device_view const d_strings,
+                                       cudf::size_type width,
+                                       int64_t const* d_output_offsets,
+                                       uint32_t* d_results)
+{
+  auto const idx = cudf::detail::grid_1d::global_thread_id();
+  if (idx >= (static_cast<cudf::thread_index_type>(d_strings.size()) * cudf::detail::warp_size)) {
+    return;
+  }
 
-    cudf::size_type count = 0;
-    for (auto itr = needles.begin() + lane_idx; itr < needles.end() && begin < end;
-         itr += cudf::detail::warp_size) {
-      if (itr != needles.begin() && *itr == *(itr - 1)) { continue; }  // skip duplicates
-      // search haystack for this needle (*itr)
-      auto const found = thrust::lower_bound(thrust::seq, begin, end, *itr);
-      count += (found != end) && (*found == *itr);  // increment if found;
-      begin = found;                                // shorten the next lower-bound range
+  auto const str_idx  = idx / cudf::detail::warp_size;
+  auto const lane_idx = idx % cudf::detail::warp_size;
+
+  if (d_strings.is_null(str_idx)) { return; }
+  auto const d_str = d_strings.element<cudf::string_view>(str_idx);
+  if (d_str.empty()) { return; }
+
+  __shared__ uint32_t hvs[block_size];  // temp store for hash values
+
+  auto const hasher     = cudf::hashing::detail::MurmurHash3_x86_32<cudf::string_view>{0};
+  auto const end        = d_str.data() + d_str.size_bytes();
+  auto const warp_count = (d_str.size_bytes() / cudf::detail::warp_size) + 1;
+
+  auto d_hashes = d_results + d_output_offsets[str_idx];
+  auto itr      = d_str.data() + lane_idx;
+  for (auto i = 0; i < warp_count; ++i) {
+    uint32_t hash = 0;
+    if (itr < end && cudf::strings::detail::is_begin_utf8_char(*itr)) {
+      // resolve substring
+      auto const sub_str =
+        cudf::string_view(itr, static_cast<cudf::size_type>(thrust::distance(itr, end)));
+      auto const [bytes, left] = cudf::strings::detail::bytes_to_character_position(sub_str, width);
+      // hash only if we have the full width of characters or this is the beginning of the string
+      if ((left == 0) || (itr == d_str.data())) { hash = hasher(cudf::string_view(itr, bytes)); }
     }
-    // sum up the counts across this warp
-    auto const result = warp_reduce(temp_storage).Sum(count);
-    if (lane_idx == 0) { d_results[row_idx] = result; }
+    hvs[threadIdx.x] = hash;  // store hash into shared memory
+    __syncwarp();
+    if (lane_idx == 0) {
+      // copy valid hash values for this warp into d_hashes
+      auto const hashes     = &hvs[threadIdx.x];
+      auto const hashes_end = hashes + cudf::detail::warp_size;
+      d_hashes =
+        thrust::copy_if(thrust::seq, hashes, hashes_end, d_hashes, [](auto h) { return h != 0; });
+    }
+    __syncwarp();
+    itr += cudf::detail::warp_size;
   }
-};
+}
 
-rmm::device_uvector<cudf::size_type> compute_intersect_counts(cudf::column_view const& input1,
-                                                              cudf::column_view const& input2,
-                                                              rmm::cuda_stream_view stream)
+void segmented_sort(uint32_t const* input,
+                    uint32_t* output,
+                    int64_t items,
+                    cudf::size_type segments,
+                    int64_t const* offsets,
+                    rmm::cuda_stream_view stream)
 {
-  auto const d_input1 = cudf::column_device_view::create(input1, stream);
-  auto const d_input2 = cudf::column_device_view::create(input2, stream);
-  auto d_results      = rmm::device_uvector<cudf::size_type>(input1.size(), stream);
-  sorted_intersect_fn fn{*d_input1, *d_input2, d_results.data()};
-  thrust::for_each_n(rmm::exec_policy(stream),
-                     thrust::counting_iterator<cudf::size_type>(0),
-                     input1.size() * cudf::detail::warp_size,
-                     fn);
-  return d_results;
+  rmm::device_buffer temp;
+  std::size_t temp_bytes = 0;
+  cub::DeviceSegmentedSort::SortKeys(
+    temp.data(), temp_bytes, input, output, items, segments, offsets, offsets + 1, stream.value());
+  temp = rmm::device_buffer(temp_bytes, stream);
+  cub::DeviceSegmentedSort::SortKeys(
+    temp.data(), temp_bytes, input, output, items, segments, offsets, offsets + 1, stream.value());
+}
+
+/**
+ * @brief Create hashes for each substring
+ *
+ * The hashes are sorted using a segmented-sort as setup to
+ * perform the unique and intersect operations.
+ *
+ * @param input Input strings column to hash
+ * @param width Substring width in characters
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @return The sorted hash values and offsets to each row
+ */
+std::pair<rmm::device_uvector<uint32_t>, rmm::device_uvector<int64_t>> hash_substrings(
+  cudf::strings_column_view const& input, cudf::size_type width, rmm::cuda_stream_view stream)
+{
+  auto const d_strings = cudf::column_device_view::create(input.parent(), stream);
+
+  // count substrings
+  auto offsets          = rmm::device_uvector<int64_t>(input.size() + 1, stream);
+  auto const num_blocks = cudf::util::div_rounding_up_safe(
+    static_cast<cudf::thread_index_type>(input.size()) * cudf::detail::warp_size, block_size);
+  count_substrings_kernel<<<num_blocks, block_size, 0, stream.value()>>>(
+    *d_strings, width, offsets.data());
+  auto const total_hashes =
+    cudf::detail::sizes_to_offsets(offsets.begin(), offsets.end(), offsets.begin(), stream);
+
+  // hash substrings
+  rmm::device_uvector<uint32_t> hashes(total_hashes, stream);
+  substring_hash_kernel<<<num_blocks, block_size, 0, stream.value()>>>(
+    *d_strings, width, offsets.data(), hashes.data());
+
+  // sort hashes
+  rmm::device_uvector<uint32_t> sorted(total_hashes, stream);
+  if (total_hashes < static_cast<int64_t>(std::numeric_limits<int>::max())) {
+    segmented_sort(
+      hashes.begin(), sorted.begin(), sorted.size(), input.size(), offsets.begin(), stream);
+  } else {
+    // The CUB segmented sort can only handle max<int> total values
+    // so this code calls it in sections.
+    auto const section_size   = std::numeric_limits<int>::max() / 2L;
+    auto const sort_sections  = cudf::util::div_rounding_up_safe(total_hashes, section_size);
+    auto const offset_indices = [&] {
+      // build a set of indices that point to offsets subsections
+      auto sub_offsets = rmm::device_uvector<int64_t>(sort_sections + 1, stream);
+      thrust::sequence(
+        rmm::exec_policy(stream), sub_offsets.begin(), sub_offsets.end(), 0L, section_size);
+      auto indices = rmm::device_uvector<int64_t>(sub_offsets.size(), stream);
+      thrust::lower_bound(rmm::exec_policy(stream),
+                          offsets.begin(),
+                          offsets.end(),
+                          sub_offsets.begin(),
+                          sub_offsets.end(),
+                          indices.begin());
+      return cudf::detail::make_std_vector_sync(indices, stream);
+    }();
+
+    // Call segmented sort with the sort sections
+    for (auto i = 0L; i < sort_sections; ++i) {
+      auto const index1 = offset_indices[i];
+      auto const index2 = std::min(offset_indices[i + 1], static_cast<int64_t>(offsets.size() - 1));
+      auto const offset1 = offsets.element(index1, stream);
+      auto const offset2 = offsets.element(index2, stream);
+
+      auto const num_items    = offset2 - offset1;
+      auto const num_segments = index2 - index1;
+
+      // There is a bug in the CUB segmented sort and the workaround is to
+      // shift the offset values so the first offset is 0.
+      // This transform can be removed once the bug is fixed.
+      auto sort_offsets = rmm::device_uvector<int64_t>(num_segments + 1, stream);
+      thrust::transform(rmm::exec_policy(stream),
+                        offsets.begin() + index1,
+                        offsets.begin() + index2 + 1,
+                        sort_offsets.begin(),
+                        [offset1] __device__(auto const o) { return o - offset1; });
+
+      segmented_sort(hashes.begin() + offset1,
+                     sorted.begin() + offset1,
+                     num_items,
+                     num_segments,
+                     sort_offsets.begin(),
+                     stream);
+    }
+  }
+  return std::make_pair(std::move(sorted), std::move(offsets));
 }
 
 /**
@@ -186,62 +437,6 @@ struct jaccard_fn {
   }
 };
 
-/**
- * @brief Create hashes for each substring
- *
- * Uses the hash_character_ngrams to hash substrings of the input column.
- * This returns a lists column where each row is the hashes for the substrings
- * of the corresponding input string row.
- *
- * The hashes are then sorted using a segmented-sort as setup to
- * perform the unique and intersect operations.
- */
-std::unique_ptr<cudf::column> hash_substrings(cudf::strings_column_view const& col,
-                                              cudf::size_type width,
-                                              rmm::cuda_stream_view stream)
-{
-  auto hashes = hash_character_ngrams(col, width, stream, rmm::mr::get_current_device_resource());
-  auto const input   = cudf::lists_column_view(hashes->view());
-  auto const offsets = input.offsets_begin();
-  auto const data    = input.child().data<uint32_t>();
-
-  rmm::device_uvector<uint32_t> sorted(input.child().size(), stream);
-
-  // this is wicked fast and much faster than using cudf::lists::detail::sort_list
-  rmm::device_buffer d_temp_storage;
-  size_t temp_storage_bytes = 0;
-  cub::DeviceSegmentedSort::SortKeys(d_temp_storage.data(),
-                                     temp_storage_bytes,
-                                     data,
-                                     sorted.data(),
-                                     sorted.size(),
-                                     input.size(),
-                                     offsets,
-                                     offsets + 1,
-                                     stream.value());
-  d_temp_storage = rmm::device_buffer{temp_storage_bytes, stream};
-  cub::DeviceSegmentedSort::SortKeys(d_temp_storage.data(),
-                                     temp_storage_bytes,
-                                     data,
-                                     sorted.data(),
-                                     sorted.size(),
-                                     input.size(),
-                                     offsets,
-                                     offsets + 1,
-                                     stream.value());
-
-  auto contents = hashes->release();
-  // the offsets are taken from the hashes column since they are the same
-  // before and after the segmented-sort
-  return cudf::make_lists_column(
-    col.size(),
-    std::move(contents.children.front()),
-    std::make_unique<cudf::column>(std::move(sorted), rmm::device_buffer{}, 0),
-    0,
-    rmm::device_buffer{},
-    stream,
-    rmm::mr::get_current_device_resource());
-}
 }  // namespace
 
 std::unique_ptr<cudf::column> jaccard_index(cudf::strings_column_view const& input1,
@@ -261,13 +456,14 @@ std::unique_ptr<cudf::column> jaccard_index(cudf::strings_column_view const& inp
 
   auto const [d_uniques1, d_uniques2, d_intersects] = [&] {
     // build hashes of the substrings
-    auto const hash1 = hash_substrings(input1, width, stream);
-    auto const hash2 = hash_substrings(input2, width, stream);
+    auto const [hash1, offsets1] = hash_substrings(input1, width, stream);
+    auto const [hash2, offsets2] = hash_substrings(input2, width, stream);
 
     // compute the unique counts in each set and the intersection counts
-    auto d_uniques1   = compute_unique_counts(hash1->view(), stream);
-    auto d_uniques2   = compute_unique_counts(hash2->view(), stream);
-    auto d_intersects = compute_intersect_counts(hash1->view(), hash2->view(), stream);
+    auto d_uniques1   = compute_unique_counts(hash1.data(), offsets1.data(), input1.size(), stream);
+    auto d_uniques2   = compute_unique_counts(hash2.data(), offsets2.data(), input2.size(), stream);
+    auto d_intersects = compute_intersect_counts(
+      hash1.data(), offsets1.data(), hash2.data(), offsets2.data(), input1.size(), stream);
 
     return std::tuple{std::move(d_uniques1), std::move(d_uniques2), std::move(d_intersects)};
   }();

From 4acca4d57303f52907aa158a2ef996c9d42a73d6 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 18 Jul 2024 11:07:07 -1000
Subject: [PATCH 534/842] Use Column.can_cast_safely instead of some ad-hoc
 dtype functions in .where (#16303)

There were a couple of dedicated functions in `python/cudf/cudf/utils/dtypes.py` specific to `.where` that could be subsumed by `Column.can_cast_safely`.

The minor downside is that we need to cast where's argument to a Column first, but IMO it's probably OK given the deduplication

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16303
---
 python/cudf/cudf/core/_internals/where.py | 78 ++++++++++++++----
 python/cudf/cudf/utils/dtypes.py          | 96 +----------------------
 2 files changed, 62 insertions(+), 112 deletions(-)

diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py
index 4a36be76b6d..6003a0f6aea 100644
--- a/python/cudf/cudf/core/_internals/where.py
+++ b/python/cudf/cudf/core/_internals/where.py
@@ -9,12 +9,7 @@
 import cudf
 from cudf.api.types import _is_non_decimal_numeric_dtype, is_scalar
 from cudf.core.dtypes import CategoricalDtype
-from cudf.utils.dtypes import (
-    _can_cast,
-    _dtype_can_hold_element,
-    find_common_type,
-    is_mixed_with_object_dtype,
-)
+from cudf.utils.dtypes import find_common_type, is_mixed_with_object_dtype
 
 if TYPE_CHECKING:
     from cudf._typing import ScalarLike
@@ -44,6 +39,8 @@ def _check_and_cast_columns_with_other(
     inplace: bool,
 ) -> tuple[ColumnBase, ScalarLike | ColumnBase]:
     # Returns type-casted `source_col` & `other` based on `inplace`.
+    from cudf.core.column import as_column
+
     source_dtype = source_col.dtype
     if isinstance(source_dtype, CategoricalDtype):
         return _normalize_categorical(source_col, other)
@@ -84,17 +81,9 @@ def _check_and_cast_columns_with_other(
             )
         return _normalize_categorical(source_col, other.astype(source_dtype))
 
-    if (
-        _is_non_decimal_numeric_dtype(source_dtype)
-        and not other_is_scalar  # can-cast fails for Python scalars
-        and _can_cast(other, source_dtype)
-    ):
-        common_dtype = source_dtype
-    elif (
-        isinstance(source_col, cudf.core.column.NumericalColumn)
-        and other_is_scalar
-        and _dtype_can_hold_element(source_dtype, other)
-    ):
+    if _is_non_decimal_numeric_dtype(source_dtype) and as_column(
+        other
+    ).can_cast_safely(source_dtype):
         common_dtype = source_dtype
     else:
         common_dtype = find_common_type(
@@ -130,3 +119,58 @@ def _make_categorical_like(result, column):
             ordered=column.ordered,
         )
     return result
+
+
+def _can_cast(from_dtype, to_dtype):
+    """
+    Utility function to determine if we can cast
+    from `from_dtype` to `to_dtype`. This function primarily calls
+    `np.can_cast` but with some special handling around
+    cudf specific dtypes.
+    """
+    if cudf.utils.utils.is_na_like(from_dtype):
+        return True
+    if isinstance(from_dtype, type):
+        from_dtype = cudf.dtype(from_dtype)
+    if isinstance(to_dtype, type):
+        to_dtype = cudf.dtype(to_dtype)
+
+    # TODO : Add precision & scale checking for
+    # decimal types in future
+
+    if isinstance(from_dtype, cudf.core.dtypes.DecimalDtype):
+        if isinstance(to_dtype, cudf.core.dtypes.DecimalDtype):
+            return True
+        elif isinstance(to_dtype, np.dtype):
+            if to_dtype.kind in {"i", "f", "u", "U", "O"}:
+                return True
+            else:
+                return False
+    elif isinstance(from_dtype, np.dtype):
+        if isinstance(to_dtype, np.dtype):
+            return np.can_cast(from_dtype, to_dtype)
+        elif isinstance(to_dtype, cudf.core.dtypes.DecimalDtype):
+            if from_dtype.kind in {"i", "f", "u", "U", "O"}:
+                return True
+            else:
+                return False
+        elif isinstance(to_dtype, cudf.core.types.CategoricalDtype):
+            return True
+        else:
+            return False
+    elif isinstance(from_dtype, cudf.core.dtypes.ListDtype):
+        # TODO: Add level based checks too once casting of
+        # list columns is supported
+        if isinstance(to_dtype, cudf.core.dtypes.ListDtype):
+            return np.can_cast(from_dtype.leaf_type, to_dtype.leaf_type)
+        else:
+            return False
+    elif isinstance(from_dtype, cudf.core.dtypes.CategoricalDtype):
+        if isinstance(to_dtype, cudf.core.dtypes.CategoricalDtype):
+            return True
+        elif isinstance(to_dtype, np.dtype):
+            return np.can_cast(from_dtype._categories.dtype, to_dtype)
+        else:
+            return False
+    else:
+        return np.can_cast(from_dtype, to_dtype)
diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py
index 59e5ec1df04..af912bee342 100644
--- a/python/cudf/cudf/utils/dtypes.py
+++ b/python/cudf/cudf/utils/dtypes.py
@@ -10,8 +10,6 @@
 from pandas.core.dtypes.common import infer_dtype_from_object
 
 import cudf
-from cudf._typing import DtypeObj
-from cudf.api.types import is_bool, is_float, is_integer
 
 """Map numpy dtype to pyarrow types.
 Note that np.bool_ bitwidth (8) is different from pa.bool_ (1). Special
@@ -584,61 +582,6 @@ def _dtype_pandas_compatible(dtype):
     return dtype
 
 
-def _can_cast(from_dtype, to_dtype):
-    """
-    Utility function to determine if we can cast
-    from `from_dtype` to `to_dtype`. This function primarily calls
-    `np.can_cast` but with some special handling around
-    cudf specific dtypes.
-    """
-    if cudf.utils.utils.is_na_like(from_dtype):
-        return True
-    if isinstance(from_dtype, type):
-        from_dtype = cudf.dtype(from_dtype)
-    if isinstance(to_dtype, type):
-        to_dtype = cudf.dtype(to_dtype)
-
-    # TODO : Add precision & scale checking for
-    # decimal types in future
-
-    if isinstance(from_dtype, cudf.core.dtypes.DecimalDtype):
-        if isinstance(to_dtype, cudf.core.dtypes.DecimalDtype):
-            return True
-        elif isinstance(to_dtype, np.dtype):
-            if to_dtype.kind in {"i", "f", "u", "U", "O"}:
-                return True
-            else:
-                return False
-    elif isinstance(from_dtype, np.dtype):
-        if isinstance(to_dtype, np.dtype):
-            return np.can_cast(from_dtype, to_dtype)
-        elif isinstance(to_dtype, cudf.core.dtypes.DecimalDtype):
-            if from_dtype.kind in {"i", "f", "u", "U", "O"}:
-                return True
-            else:
-                return False
-        elif isinstance(to_dtype, cudf.core.types.CategoricalDtype):
-            return True
-        else:
-            return False
-    elif isinstance(from_dtype, cudf.core.dtypes.ListDtype):
-        # TODO: Add level based checks too once casting of
-        # list columns is supported
-        if isinstance(to_dtype, cudf.core.dtypes.ListDtype):
-            return np.can_cast(from_dtype.leaf_type, to_dtype.leaf_type)
-        else:
-            return False
-    elif isinstance(from_dtype, cudf.core.dtypes.CategoricalDtype):
-        if isinstance(to_dtype, cudf.core.dtypes.CategoricalDtype):
-            return True
-        elif isinstance(to_dtype, np.dtype):
-            return np.can_cast(from_dtype._categories.dtype, to_dtype)
-        else:
-            return False
-    else:
-        return np.can_cast(from_dtype, to_dtype)
-
-
 def _maybe_convert_to_default_type(dtype):
     """Convert `dtype` to default if specified by user.
 
@@ -661,44 +604,7 @@ def _maybe_convert_to_default_type(dtype):
     return dtype
 
 
-def _dtype_can_hold_range(rng: range, dtype: np.dtype) -> bool:
-    if not len(rng):
-        return True
-    return np.can_cast(rng[0], dtype) and np.can_cast(rng[-1], dtype)
-
-
-def _dtype_can_hold_element(dtype: np.dtype, element) -> bool:
-    if dtype.kind in {"i", "u"}:
-        if isinstance(element, range):
-            if _dtype_can_hold_range(element, dtype):
-                return True
-            return False
-
-        elif is_integer(element) or (
-            is_float(element) and element.is_integer()
-        ):
-            info = np.iinfo(dtype)
-            if info.min <= element <= info.max:
-                return True
-            return False
-
-    elif dtype.kind == "f":
-        if is_integer(element) or is_float(element):
-            casted = dtype.type(element)
-            if np.isnan(casted) or casted == element:
-                return True
-            # otherwise e.g. overflow see TestCoercionFloat32
-            return False
-
-    elif dtype.kind == "b":
-        if is_bool(element):
-            return True
-        return False
-
-    raise NotImplementedError(f"Unsupported dtype: {dtype}")
-
-
-def _get_base_dtype(dtype: DtypeObj) -> DtypeObj:
+def _get_base_dtype(dtype: pd.DatetimeTZDtype) -> np.dtype:
     # TODO: replace the use of this function with just `dtype.base`
     # when Pandas 2.1.0 is the minimum version we support:
     # https://github.com/pandas-dev/pandas/pull/52706

From debbef0bc12f523054740432983030dd0b24f9c4 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Fri, 19 Jul 2024 15:12:56 +0100
Subject: [PATCH 535/842] Update vendored thread_pool implementation (#16210)

Since we introduced the vendored thread_pool in #8752, upstream has introduced some new features, and particularly now uses condition variables/notification to handle when there are no tasks in the queue. This avoids the issue described in #16209 where the thread pool by default artificially introduces a delay of 1000microseconds to all tasks whenever the task queue is emptied.

- Closes #16209

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Robert Maynard (https://github.com/robertmaynard)

URL: https://github.com/rapidsai/cudf/pull/16210
---
 cpp/CMakeLists.txt                            |   4 +-
 .../groupby/group_max_multithreaded.cpp       |  10 +-
 .../io/orc/orc_reader_multithreaded.cpp       |  26 +-
 .../io/parquet/parquet_reader_multithread.cpp |  26 +-
 cpp/cmake/thirdparty/get_thread_pool.cmake    |  31 ++
 cpp/include/cudf/utilities/thread_pool.hpp    | 381 ------------------
 cpp/src/io/utilities/file_io_utilities.cpp    |   6 +-
 cpp/src/io/utilities/file_io_utilities.hpp    |   7 +-
 8 files changed, 66 insertions(+), 425 deletions(-)
 create mode 100644 cpp/cmake/thirdparty/get_thread_pool.cmake
 delete mode 100644 cpp/include/cudf/utilities/thread_pool.hpp

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 903cff27be4..65347bd6689 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -216,6 +216,8 @@ include(cmake/thirdparty/get_fmt.cmake)
 include(cmake/thirdparty/get_spdlog.cmake)
 # find nanoarrow
 include(cmake/thirdparty/get_nanoarrow.cmake)
+# find thread_pool
+include(cmake/thirdparty/get_thread_pool.cmake)
 
 # Workaround until https://github.com/rapidsai/rapids-cmake/issues/176 is resolved
 if(NOT BUILD_SHARED_LIBS)
@@ -804,7 +806,7 @@ add_dependencies(cudf jitify_preprocess_run)
 # Specify the target module library dependencies
 target_link_libraries(
   cudf
-  PUBLIC ${ARROW_LIBRARIES} CCCL::CCCL rmm::rmm
+  PUBLIC ${ARROW_LIBRARIES} CCCL::CCCL rmm::rmm $<BUILD_LOCAL_INTERFACE:BS_thread_pool>
   PRIVATE $<BUILD_LOCAL_INTERFACE:nvtx3::nvtx3-cpp> cuco::cuco ZLIB::ZLIB nvcomp::nvcomp
           kvikio::kvikio $<TARGET_NAME_IF_EXISTS:cuFile_interface> nanoarrow
 )
diff --git a/cpp/benchmarks/groupby/group_max_multithreaded.cpp b/cpp/benchmarks/groupby/group_max_multithreaded.cpp
index 3b8faba618f..bf1a1a5fcf7 100644
--- a/cpp/benchmarks/groupby/group_max_multithreaded.cpp
+++ b/cpp/benchmarks/groupby/group_max_multithreaded.cpp
@@ -20,8 +20,8 @@
 #include <cudf/detail/utilities/stream_pool.hpp>
 #include <cudf/groupby.hpp>
 #include <cudf/utilities/default_stream.hpp>
-#include <cudf/utilities/thread_pool.hpp>
 
+#include <BS_thread_pool.hpp>
 #include <nvbench/nvbench.cuh>
 
 template <typename Type>
@@ -58,7 +58,7 @@ void bench_groupby_max_multithreaded(nvbench::state& state, nvbench::type_list<T
   auto gb_obj    = cudf::groupby::groupby(cudf::table_view({keys_view, keys_view, keys_view}));
 
   auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads);
-  cudf::detail::thread_pool threads(num_threads);
+  BS::thread_pool threads(num_threads);
 
   std::vector<std::vector<cudf::groupby::aggregation_request>> requests(num_threads);
   for (auto& thread_requests : requests) {
@@ -75,10 +75,8 @@ void bench_groupby_max_multithreaded(nvbench::state& state, nvbench::type_list<T
     nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) {
       auto perform_agg = [&](int64_t index) { gb_obj.aggregate(requests[index], streams[index]); };
       timer.start();
-      for (int64_t i = 0; i < num_threads; ++i) {
-        threads.submit(perform_agg, i);
-      }
-      threads.wait_for_tasks();
+      threads.detach_sequence(decltype(num_threads){0}, num_threads, perform_agg);
+      threads.wait();
       cudf::detail::join_streams(streams, cudf::get_default_stream());
       cudf::get_default_stream().synchronize();
       timer.stop();
diff --git a/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp b/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp
index aa0ee39a179..e91bf06fdfa 100644
--- a/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp
+++ b/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp
@@ -24,8 +24,8 @@
 #include <cudf/io/orc.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/pinned_memory.hpp>
-#include <cudf/utilities/thread_pool.hpp>
 
+#include <BS_thread_pool.hpp>
 #include <nvbench/nvbench.cuh>
 
 #include <vector>
@@ -90,7 +90,7 @@ void BM_orc_multithreaded_read_common(nvbench::state& state,
   auto const num_threads = state.get_int64("num_threads");
 
   auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads);
-  cudf::detail::thread_pool threads(num_threads);
+  BS::thread_pool threads(num_threads);
 
   auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types);
   std::vector<cudf::io::source_info> source_info_vector;
@@ -112,13 +112,11 @@ void BM_orc_multithreaded_read_common(nvbench::state& state,
                    cudf::io::read_orc(read_opts, stream, rmm::mr::get_current_device_resource());
                  };
 
-                 threads.paused = true;
-                 for (size_t i = 0; i < num_files; ++i) {
-                   threads.submit(read_func, i);
-                 }
+                 threads.pause();
+                 threads.detach_sequence(decltype(num_files){0}, num_files, read_func);
                  timer.start();
-                 threads.paused = false;
-                 threads.wait_for_tasks();
+                 threads.unpause();
+                 threads.wait();
                  cudf::detail::join_streams(streams, cudf::get_default_stream());
                  timer.stop();
                });
@@ -170,7 +168,7 @@ void BM_orc_multithreaded_read_chunked_common(nvbench::state& state,
   size_t const output_limit = state.get_int64("output_limit");
 
   auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads);
-  cudf::detail::thread_pool threads(num_threads);
+  BS::thread_pool threads(num_threads);
   auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types);
   std::vector<cudf::io::source_info> source_info_vector;
   std::transform(source_sink_vector.begin(),
@@ -203,13 +201,11 @@ void BM_orc_multithreaded_read_chunked_common(nvbench::state& state,
                    } while (reader.has_next());
                  };
 
-                 threads.paused = true;
-                 for (size_t i = 0; i < num_files; ++i) {
-                   threads.submit(read_func, i);
-                 }
+                 threads.pause();
+                 threads.detach_sequence(decltype(num_files){0}, num_files, read_func);
                  timer.start();
-                 threads.paused = false;
-                 threads.wait_for_tasks();
+                 threads.unpause();
+                 threads.wait();
                  cudf::detail::join_streams(streams, cudf::get_default_stream());
                  timer.stop();
                });
diff --git a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp
index b4c8ed78ed8..9e76ebb71ab 100644
--- a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp
+++ b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp
@@ -23,10 +23,10 @@
 #include <cudf/io/parquet.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/pinned_memory.hpp>
-#include <cudf/utilities/thread_pool.hpp>
 
 #include <nvtx3/nvtx3.hpp>
 
+#include <BS_thread_pool.hpp>
 #include <nvbench/nvbench.cuh>
 
 #include <vector>
@@ -93,7 +93,7 @@ void BM_parquet_multithreaded_read_common(nvbench::state& state,
   auto const num_threads = state.get_int64("num_threads");
 
   auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads);
-  cudf::detail::thread_pool threads(num_threads);
+  BS::thread_pool threads(num_threads);
 
   auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types);
   std::vector<cudf::io::source_info> source_info_vector;
@@ -114,13 +114,11 @@ void BM_parquet_multithreaded_read_common(nvbench::state& state,
                  cudf::io::read_parquet(read_opts, stream, rmm::mr::get_current_device_resource());
                };
 
-               threads.paused = true;
-               for (size_t i = 0; i < num_files; ++i) {
-                 threads.submit(read_func, i);
-               }
+               threads.pause();
+               threads.detach_sequence(decltype(num_files){0}, num_files, read_func);
                timer.start();
-               threads.paused = false;
-               threads.wait_for_tasks();
+               threads.unpause();
+               threads.wait();
                cudf::detail::join_streams(streams, cudf::get_default_stream());
                timer.stop();
              });
@@ -176,7 +174,7 @@ void BM_parquet_multithreaded_read_chunked_common(nvbench::state& state,
   size_t const output_limit = state.get_int64("output_limit");
 
   auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads);
-  cudf::detail::thread_pool threads(num_threads);
+  BS::thread_pool threads(num_threads);
   auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types);
   std::vector<cudf::io::source_info> source_info_vector;
   std::transform(source_sink_vector.begin(),
@@ -207,13 +205,11 @@ void BM_parquet_multithreaded_read_chunked_common(nvbench::state& state,
                  } while (reader.has_next());
                };
 
-               threads.paused = true;
-               for (size_t i = 0; i < num_files; ++i) {
-                 threads.submit(read_func, i);
-               }
+               threads.pause();
+               threads.detach_sequence(decltype(num_files){0}, num_files, read_func);
                timer.start();
-               threads.paused = false;
-               threads.wait_for_tasks();
+               threads.unpause();
+               threads.wait();
                cudf::detail::join_streams(streams, cudf::get_default_stream());
                timer.stop();
              });
diff --git a/cpp/cmake/thirdparty/get_thread_pool.cmake b/cpp/cmake/thirdparty/get_thread_pool.cmake
new file mode 100644
index 00000000000..264257c7199
--- /dev/null
+++ b/cpp/cmake/thirdparty/get_thread_pool.cmake
@@ -0,0 +1,31 @@
+# =============================================================================
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+# =============================================================================
+
+# This function finds rmm and sets any additional necessary environment variables.
+function(find_and_configure_thread_pool)
+  rapids_cpm_find(
+    BS_thread_pool 4.1.0
+    CPM_ARGS
+    GIT_REPOSITORY https://github.com/bshoshany/thread-pool.git
+    GIT_TAG 097aa718f25d44315cadb80b407144ad455ee4f9
+    GIT_SHALLOW TRUE
+  )
+  if(NOT TARGET BS_thread_pool)
+    add_library(BS_thread_pool INTERFACE)
+    target_include_directories(BS_thread_pool INTERFACE ${BS_thread_pool_SOURCE_DIR}/include)
+    target_compile_definitions(BS_thread_pool INTERFACE "BS_THREAD_POOL_ENABLE_PAUSE=1")
+  endif()
+endfunction()
+
+find_and_configure_thread_pool()
diff --git a/cpp/include/cudf/utilities/thread_pool.hpp b/cpp/include/cudf/utilities/thread_pool.hpp
deleted file mode 100644
index c8c3eb097c4..00000000000
--- a/cpp/include/cudf/utilities/thread_pool.hpp
+++ /dev/null
@@ -1,381 +0,0 @@
-/*
- * Copyright (c) 2021-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-/**
- * Modified from https://github.com/bshoshany/thread-pool
- * @copyright Copyright (c) 2021 Barak Shoshany. Licensed under the MIT license.
- *            See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
- */
-
-#include <atomic>       // std::atomic
-#include <chrono>       // std::chrono
-#include <cstdint>      // std::int_fast64_t, std::uint_fast32_t
-#include <functional>   // std::function
-#include <future>       // std::future, std::promise
-#include <memory>       // std::shared_ptr, std::unique_ptr
-#include <mutex>        // std::mutex, std::scoped_lock
-#include <queue>        // std::queue
-#include <thread>       // std::this_thread, std::thread
-#include <type_traits>  // std::decay_t, std::enable_if_t, std::is_void_v, std::invoke_result_t
-#include <utility>      // std::move, std::swap
-
-namespace cudf {
-namespace detail {
-
-/**
- * @brief A C++17 thread pool class. The user submits tasks to be executed into a queue. Whenever a
- * thread becomes available, it pops a task from the queue and executes it. Each task is
- * automatically assigned a future, which can be used to wait for the task to finish executing
- * and/or obtain its eventual return value.
- */
-class thread_pool {
-  using ui32 = int;
-
- public:
-  /**
-   * @brief Construct a new thread pool.
-   *
-   * @param _thread_count The number of threads to use. The default value is the total number of
-   * hardware threads available, as reported by the implementation. With a hyperthreaded CPU, this
-   * will be twice the number of CPU cores. If the argument is zero, the default value will be used
-   * instead.
-   */
-  thread_pool(ui32 const& _thread_count = std::thread::hardware_concurrency())
-    : thread_count(_thread_count ? _thread_count : std::thread::hardware_concurrency()),
-      threads(new std::thread[_thread_count ? _thread_count : std::thread::hardware_concurrency()])
-  {
-    create_threads();
-  }
-
-  /**
-   * @brief Destruct the thread pool. Waits for all tasks to complete, then destroys all threads.
-   * Note that if the variable paused is set to true, then any tasks still in the queue will never
-   * be executed.
-   */
-  ~thread_pool()
-  {
-    wait_for_tasks();
-    running = false;
-    destroy_threads();
-  }
-
-  /**
-   * @brief Get the number of tasks currently waiting in the queue to be executed by the threads.
-   *
-   * @return The number of queued tasks.
-   */
-  [[nodiscard]] size_t get_tasks_queued() const
-  {
-    std::scoped_lock const lock(queue_mutex);
-    return tasks.size();
-  }
-
-  /**
-   * @brief Get the number of tasks currently being executed by the threads.
-   *
-   * @return The number of running tasks.
-   */
-  [[nodiscard]] ui32 get_tasks_running() const { return tasks_total - (ui32)get_tasks_queued(); }
-
-  /**
-   * @brief Get the total number of unfinished tasks - either still in the queue, or running in a
-   * thread.
-   *
-   * @return The total number of tasks.
-   */
-  [[nodiscard]] ui32 get_tasks_total() const { return tasks_total; }
-
-  /**
-   * @brief Get the number of threads in the pool.
-   *
-   * @return The number of threads.
-   */
-  [[nodiscard]] ui32 get_thread_count() const { return thread_count; }
-
-  /**
-   * @brief Parallelize a loop by splitting it into blocks, submitting each block separately to the
-   * thread pool, and waiting for all blocks to finish executing. The loop will be equivalent to:
-   * for (T i = first_index; i <= last_index; i++) loop(i);
-   *
-   * @tparam T The type of the loop index. Should be a signed or unsigned integer.
-   * @tparam F The type of the function to loop through.
-   * @param first_index The first index in the loop (inclusive).
-   * @param last_index The last index in the loop (inclusive).
-   * @param loop The function to loop through. Should take exactly one argument, the loop index.
-   * @param num_tasks The maximum number of tasks to split the loop into. The default is to use the
-   * number of threads in the pool.
-   */
-  template <typename T, typename F>
-  void parallelize_loop(T first_index, T last_index, F const& loop, ui32 num_tasks = 0)
-  {
-    if (num_tasks == 0) num_tasks = thread_count;
-    if (last_index < first_index) std::swap(last_index, first_index);
-    size_t total_size = last_index - first_index + 1;
-    size_t block_size = total_size / num_tasks;
-    if (block_size == 0) {
-      block_size = 1;
-      num_tasks  = (ui32)total_size > 1 ? (ui32)total_size : 1;
-    }
-    std::atomic<ui32> blocks_running = 0;
-    for (ui32 t = 0; t < num_tasks; t++) {
-      T start = (T)(t * block_size + first_index);
-      T end   = (t == num_tasks - 1) ? last_index : (T)((t + 1) * block_size + first_index - 1);
-      blocks_running++;
-      push_task([start, end, &loop, &blocks_running] {
-        for (T i = start; i <= end; i++)
-          loop(i);
-        blocks_running--;
-      });
-    }
-    while (blocks_running != 0) {
-      sleep_or_yield();
-    }
-  }
-
-  /**
-   * @brief Push a function with no arguments or return value into the task queue.
-   *
-   * @tparam F The type of the function.
-   * @param task The function to push.
-   */
-  template <typename F>
-  void push_task(F const& task)
-  {
-    tasks_total++;
-    {
-      std::scoped_lock const lock(queue_mutex);
-      tasks.push(std::function<void()>(task));
-    }
-  }
-
-  /**
-   * @brief Push a function with arguments, but no return value, into the task queue.
-   * @details The function is wrapped inside a lambda in order to hide the arguments, as the tasks
-   * in the queue must be of type std::function<void()>, so they cannot have any arguments or return
-   * value. If no arguments are provided, the other overload will be used, in order to avoid the
-   * (slight) overhead of using a lambda.
-   *
-   * @tparam F The type of the function.
-   * @tparam A The types of the arguments.
-   * @param task The function to push.
-   * @param args The arguments to pass to the function.
-   */
-  template <typename F, typename... A>
-  void push_task(F const& task, A const&... args)
-  {
-    push_task([task, args...] { task(args...); });
-  }
-
-  /**
-   * @brief Reset the number of threads in the pool. Waits for all currently running tasks to be
-   * completed, then destroys all threads in the pool and creates a new thread pool with the new
-   * number of threads. Any tasks that were waiting in the queue before the pool was reset will then
-   * be executed by the new threads. If the pool was paused before resetting it, the new pool will
-   * be paused as well.
-   *
-   * @param _thread_count The number of threads to use. The default value is the total number of
-   * hardware threads available, as reported by the implementation. With a hyperthreaded CPU, this
-   * will be twice the number of CPU cores. If the argument is zero, the default value will be used
-   * instead.
-   */
-  void reset(ui32 const& _thread_count = std::thread::hardware_concurrency())
-  {
-    bool was_paused = paused;
-    paused          = true;
-    wait_for_tasks();
-    running = false;
-    destroy_threads();
-    thread_count = _thread_count ? _thread_count : std::thread::hardware_concurrency();
-    threads      = std::make_unique<std::thread[]>(thread_count);
-    paused       = was_paused;
-    create_threads();
-    running = true;
-  }
-
-  /**
-   * @brief Submit a function with zero or more arguments and a return value into the task queue,
-   * and get a future for its eventual returned value.
-   *
-   * @tparam F The type of the function.
-   * @tparam A The types of the zero or more arguments to pass to the function.
-   * @tparam R The return type of the function.
-   * @param task The function to submit.
-   * @param args The zero or more arguments to pass to the function.
-   * @return A future to be used later to obtain the function's returned value, waiting for it to
-   * finish its execution if needed.
-   */
-  template <typename F,
-            typename... A,
-            typename R = std::invoke_result_t<std::decay_t<F>, std::decay_t<A>...>>
-  std::future<R> submit(F const& task, A const&... args)
-  {
-    std::shared_ptr<std::promise<R>> promise(new std::promise<R>);
-    std::future<R> future = promise->get_future();
-    push_task([task, args..., promise] {
-      try {
-        if constexpr (std::is_void_v<R>) {
-          task(args...);
-          promise->set_value();
-        } else {
-          promise->set_value(task(args...));
-        }
-      } catch (...) {
-        promise->set_exception(std::current_exception());
-      };
-    });
-    return future;
-  }
-
-  /**
-   * @brief Wait for tasks to be completed. Normally, this function waits for all tasks, both those
-   * that are currently running in the threads and those that are still waiting in the queue.
-   * However, if the variable paused is set to true, this function only waits for the currently
-   * running tasks (otherwise it would wait forever). To wait for a specific task, use submit()
-   * instead, and call the wait() member function of the generated future.
-   */
-  void wait_for_tasks()
-  {
-    while (true) {
-      if (!paused) {
-        if (tasks_total == 0) break;
-      } else {
-        if (get_tasks_running() == 0) break;
-      }
-      sleep_or_yield();
-    }
-  }
-
-  /**
-   * @brief An atomic variable indicating to the workers to pause. When set to true, the workers
-   * temporarily stop popping new tasks out of the queue, although any tasks already executed will
-   * keep running until they are done. Set to false again to resume popping tasks.
-   */
-  std::atomic<bool> paused = false;
-
-  /**
-   * @brief The duration, in microseconds, that the worker function should sleep for when it cannot
-   * find any tasks in the queue. If set to 0, then instead of sleeping, the worker function will
-   * execute std::this_thread::yield() if there are no tasks in the queue. The default value is
-   * 1000.
-   */
-  ui32 sleep_duration = 1000;
-
- private:
-  /**
-   * @brief Create the threads in the pool and assign a worker to each thread.
-   */
-  void create_threads()
-  {
-    for (ui32 i = 0; i < thread_count; i++) {
-      threads[i] = std::thread(&thread_pool::worker, this);
-    }
-  }
-
-  /**
-   * @brief Destroy the threads in the pool by joining them.
-   */
-  void destroy_threads()
-  {
-    for (ui32 i = 0; i < thread_count; i++) {
-      threads[i].join();
-    }
-  }
-
-  /**
-   * @brief Try to pop a new task out of the queue.
-   *
-   * @param task A reference to the task. Will be populated with a function if the queue is not
-   * empty.
-   * @return true if a task was found, false if the queue is empty.
-   */
-  bool pop_task(std::function<void()>& task)
-  {
-    std::scoped_lock const lock(queue_mutex);
-    if (tasks.empty())
-      return false;
-    else {
-      task = std::move(tasks.front());
-      tasks.pop();
-      return true;
-    }
-  }
-
-  /**
-   * @brief Sleep for sleep_duration microseconds. If that variable is set to zero, yield instead.
-   *
-   */
-  void sleep_or_yield()
-  {
-    if (sleep_duration)
-      std::this_thread::sleep_for(std::chrono::microseconds(sleep_duration));
-    else
-      std::this_thread::yield();
-  }
-
-  /**
-   * @brief A worker function to be assigned to each thread in the pool. Continuously pops tasks out
-   * of the queue and executes them, as long as the atomic variable running is set to true.
-   */
-  void worker()
-  {
-    while (running) {
-      std::function<void()> task;
-      if (!paused && pop_task(task)) {
-        task();
-        tasks_total--;
-      } else {
-        sleep_or_yield();
-      }
-    }
-  }
-
-  /**
-   * @brief A mutex to synchronize access to the task queue by different threads.
-   */
-  mutable std::mutex queue_mutex;
-
-  /**
-   * @brief An atomic variable indicating to the workers to keep running. When set to false, the
-   * workers permanently stop working.
-   */
-  std::atomic<bool> running = true;
-
-  /**
-   * @brief A queue of tasks to be executed by the threads.
-   */
-  std::queue<std::function<void()>> tasks;
-
-  /**
-   * @brief The number of threads in the pool.
-   */
-  ui32 thread_count;
-
-  /**
-   * @brief A smart pointer to manage the memory allocated for the threads.
-   */
-  std::unique_ptr<std::thread[]> threads;
-
-  /**
-   * @brief An atomic variable to keep track of the total number of unfinished tasks - either still
-   * in the queue, or running in a thread.
-   */
-  std::atomic<ui32> tasks_total = 0;
-};
-
-}  // namespace detail
-}  // namespace cudf
diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp
index 9fe5959436d..d7b54399f8d 100644
--- a/cpp/src/io/utilities/file_io_utilities.cpp
+++ b/cpp/src/io/utilities/file_io_utilities.cpp
@@ -223,7 +223,6 @@ cufile_input_impl::cufile_input_impl(std::string const& filepath)
     // The benefit from multithreaded read plateaus around 16 threads
     pool(getenv_or("LIBCUDF_CUFILE_THREAD_COUNT", 16))
 {
-  pool.sleep_duration = 10;
 }
 
 namespace {
@@ -232,14 +231,15 @@ template <typename DataT,
           typename F,
           typename ResultT = std::invoke_result_t<F, DataT*, size_t, size_t>>
 std::vector<std::future<ResultT>> make_sliced_tasks(
-  F function, DataT* ptr, size_t offset, size_t size, cudf::detail::thread_pool& pool)
+  F function, DataT* ptr, size_t offset, size_t size, BS::thread_pool& pool)
 {
   constexpr size_t default_max_slice_size = 4 * 1024 * 1024;
   static auto const max_slice_size = getenv_or("LIBCUDF_CUFILE_SLICE_SIZE", default_max_slice_size);
   auto const slices                = make_file_io_slices(size, max_slice_size);
   std::vector<std::future<ResultT>> slice_tasks;
   std::transform(slices.cbegin(), slices.cend(), std::back_inserter(slice_tasks), [&](auto& slice) {
-    return pool.submit(function, ptr + slice.offset, slice.size, offset + slice.offset);
+    return pool.submit_task(
+      [&] { return function(ptr + slice.offset, slice.size, offset + slice.offset); });
   });
   return slice_tasks;
 }
diff --git a/cpp/src/io/utilities/file_io_utilities.hpp b/cpp/src/io/utilities/file_io_utilities.hpp
index 91ef41fba6e..441bede200d 100644
--- a/cpp/src/io/utilities/file_io_utilities.hpp
+++ b/cpp/src/io/utilities/file_io_utilities.hpp
@@ -19,8 +19,7 @@
 #ifdef CUFILE_FOUND
 #include <cudf_test/file_utilities.hpp>
 
-#include <cudf/utilities/thread_pool.hpp>
-
+#include <BS_thread_pool.hpp>
 #include <cufile.h>
 #endif
 
@@ -150,7 +149,7 @@ class cufile_input_impl final : public cufile_input {
  private:
   cufile_shim const* shim = nullptr;
   cufile_registered_file const cf_file;
-  cudf::detail::thread_pool pool;
+  BS::thread_pool pool;
 };
 
 /**
@@ -167,7 +166,7 @@ class cufile_output_impl final : public cufile_output {
  private:
   cufile_shim const* shim = nullptr;
   cufile_registered_file const cf_file;
-  cudf::detail::thread_pool pool;
+  BS::thread_pool pool;
 };
 #else
 

From 8ff27ed5bcaf8fc5fc8d1f546dee30c59861c320 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Fri, 19 Jul 2024 15:15:20 +0100
Subject: [PATCH 536/842] Support Literals in groupby-agg (#16218)

To do this, we just need to collect the appropriate aggregation information, and broadcast literals to the correct size.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16218
---
 python/cudf_polars/cudf_polars/dsl/expr.py | 15 +++++++++++++++
 python/cudf_polars/cudf_polars/dsl/ir.py   |  4 ++--
 python/cudf_polars/tests/test_groupby.py   | 17 +++++++++++++++++
 3 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py
index f37cb3f475c..a034d55120a 100644
--- a/python/cudf_polars/cudf_polars/dsl/expr.py
+++ b/python/cudf_polars/cudf_polars/dsl/expr.py
@@ -370,6 +370,10 @@ def do_evaluate(
         # datatype of pyarrow scalar is correct by construction.
         return Column(plc.Column.from_scalar(plc.interop.from_arrow(self.value), 1))
 
+    def collect_agg(self, *, depth: int) -> AggInfo:
+        """Collect information about aggregations in groupbys."""
+        return AggInfo([])
+
 
 class LiteralColumn(Expr):
     __slots__ = ("value",)
@@ -382,6 +386,13 @@ def __init__(self, dtype: plc.DataType, value: pl.Series) -> None:
         data = value.to_arrow()
         self.value = data.cast(dtypes.downcast_arrow_lists(data.type))
 
+    def get_hash(self) -> int:
+        """Compute a hash of the column."""
+        # This is stricter than necessary, but we only need this hash
+        # for identity in groupby replacements so it's OK. And this
+        # way we avoid doing potentially expensive compute.
+        return hash((type(self), self.dtype, id(self.value)))
+
     def do_evaluate(
         self,
         df: DataFrame,
@@ -393,6 +404,10 @@ def do_evaluate(
         # datatype of pyarrow array is correct by construction.
         return Column(plc.interop.from_arrow(self.value))
 
+    def collect_agg(self, *, depth: int) -> AggInfo:
+        """Collect information about aggregations in groupbys."""
+        return AggInfo([])
+
 
 class Col(Expr):
     __slots__ = ("name",)
diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py
index cce0c4a3d94..01834ab75a5 100644
--- a/python/cudf_polars/cudf_polars/dsl/ir.py
+++ b/python/cudf_polars/cudf_polars/dsl/ir.py
@@ -514,7 +514,7 @@ def check_agg(agg: expr.Expr) -> int:
             return max(GroupBy.check_agg(child) for child in agg.children)
         elif isinstance(agg, expr.Agg):
             return 1 + max(GroupBy.check_agg(child) for child in agg.children)
-        elif isinstance(agg, (expr.Len, expr.Col, expr.Literal)):
+        elif isinstance(agg, (expr.Len, expr.Col, expr.Literal, expr.LiteralColumn)):
             return 0
         else:
             raise NotImplementedError(f"No handler for {agg=}")
@@ -574,7 +574,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         results = [
             req.evaluate(result_subs, mapping=mapping) for req in self.agg_requests
         ]
-        return DataFrame([*result_keys, *results]).slice(self.options.slice)
+        return DataFrame(broadcast(*result_keys, *results)).slice(self.options.slice)
 
 
 @dataclasses.dataclass
diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py
index b07d8e38217..b650fee5079 100644
--- a/python/cudf_polars/tests/test_groupby.py
+++ b/python/cudf_polars/tests/test_groupby.py
@@ -155,3 +155,20 @@ def test_groupby_nan_minmax_raises(op):
     q = df.group_by("key").agg(op(pl.col("value")))
 
     assert_ir_translation_raises(q, NotImplementedError)
+
+
+@pytest.mark.parametrize("key", [1, pl.col("key1")])
+@pytest.mark.parametrize(
+    "expr",
+    [
+        pl.lit(1).alias("value"),
+        pl.lit([[4, 5, 6]]).alias("value"),
+        pl.col("float") * (1 - pl.col("int")),
+        [pl.lit(2).alias("value"), pl.col("float") * 2],
+    ],
+)
+def test_groupby_literal_in_agg(df, key, expr):
+    # check_row_order=False doesn't work for list aggregations
+    # so just sort by the group key
+    q = df.group_by(key).agg(expr).sort(key, maintain_order=True)
+    assert_gpu_result_equal(q)

From 9a713e3adb8abb1f41de0445b8ea896fdb48c560 Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Fri, 19 Jul 2024 10:34:16 -0400
Subject: [PATCH 537/842] Migrate lists/count_elements to pylibcudf (#16072)

Apart of #15162

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Thomas Li (https://github.com/lithomas1)

URL: https://github.com/rapidsai/cudf/pull/16072
---
 python/cudf/cudf/_lib/lists.pyx               | 18 +++----------
 .../libcudf/lists/count_elements.pxd          |  2 +-
 python/cudf/cudf/_lib/pylibcudf/lists.pxd     |  2 ++
 python/cudf/cudf/_lib/pylibcudf/lists.pyx     | 27 +++++++++++++++++++
 .../cudf/cudf/pylibcudf_tests/test_lists.py   | 10 +++++++
 5 files changed, 43 insertions(+), 16 deletions(-)

diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx
index ceae1b148aa..76f37c3b845 100644
--- a/python/cudf/cudf/_lib/lists.pyx
+++ b/python/cudf/cudf/_lib/lists.pyx
@@ -8,9 +8,6 @@ from libcpp.utility cimport move
 
 from cudf._lib.column cimport Column
 from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.lists.count_elements cimport (
-    count_elements as cpp_count_elements,
-)
 from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport (
     lists_column_view,
 )
@@ -36,19 +33,10 @@ from cudf._lib.pylibcudf cimport Scalar
 
 @acquire_spill_lock()
 def count_elements(Column col):
-
-    # shared_ptr required because lists_column_view has no default
-    # ctor
-    cdef shared_ptr[lists_column_view] list_view = (
-        make_shared[lists_column_view](col.view())
+    return Column.from_pylibcudf(
+        pylibcudf.lists.count_elements(
+            col.to_pylibcudf(mode="read"))
     )
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(cpp_count_elements(list_view.get()[0]))
-
-    result = Column.from_unique_ptr(move(c_result))
-    return result
 
 
 @acquire_spill_lock()
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/count_elements.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/count_elements.pxd
index 38bdd4db0bb..ba57a839fbc 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/count_elements.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/count_elements.pxd
@@ -9,4 +9,4 @@ from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport (
 
 
 cdef extern from "cudf/lists/count_elements.hpp" namespace "cudf::lists" nogil:
-    cdef unique_ptr[column] count_elements(const lists_column_view) except +
+    cdef unique_ptr[column] count_elements(const lists_column_view&) except +
diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pxd b/python/cudf/cudf/_lib/pylibcudf/lists.pxd
index 38a479e4791..38eb575ee8d 100644
--- a/python/cudf/cudf/_lib/pylibcudf/lists.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/lists.pxd
@@ -33,3 +33,5 @@ cpdef Column reverse(Column)
 cpdef Column segmented_gather(Column, Column)
 
 cpdef Column extract_list_element(Column, ColumnOrSizeType)
+
+cpdef Column count_elements(Column)
diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pyx b/python/cudf/cudf/_lib/pylibcudf/lists.pyx
index 19c961aa014..ea469642dd5 100644
--- a/python/cudf/cudf/_lib/pylibcudf/lists.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/lists.pyx
@@ -17,6 +17,9 @@ from cudf._lib.pylibcudf.libcudf.lists.combine cimport (
     concatenate_null_policy,
     concatenate_rows as cpp_concatenate_rows,
 )
+from cudf._lib.pylibcudf.libcudf.lists.count_elements cimport (
+    count_elements as cpp_count_elements,
+)
 from cudf._lib.pylibcudf.libcudf.lists.extract cimport (
     extract_list_element as cpp_extract_list_element,
 )
@@ -293,3 +296,27 @@ cpdef Column extract_list_element(Column input, ColumnOrSizeType index):
             index.view() if ColumnOrSizeType is Column else index,
         ))
     return Column.from_libcudf(move(c_result))
+
+
+cpdef Column count_elements(Column input):
+    """Count the number of rows in each
+    list element in the given lists column.
+    For details, see :cpp:func:`count_elements`.
+
+    Parameters
+    ----------
+    input : Column
+        The input column
+
+    Returns
+    -------
+    Column
+        A new Column of the lengths of each list element
+    """
+    cdef ListColumnView list_view = input.list_view()
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(cpp_count_elements(list_view.view()))
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/cudf/cudf/pylibcudf_tests/test_lists.py b/python/cudf/cudf/pylibcudf_tests/test_lists.py
index 07ecaed5012..7cfed884f90 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_lists.py
+++ b/python/cudf/cudf/pylibcudf_tests/test_lists.py
@@ -181,3 +181,13 @@ def test_extract_list_element_column(test_data):
     expect = pa.array([0, None, None, 7])
 
     assert_column_eq(expect, res)
+
+
+def test_count_elements(test_data):
+    arr = pa.array(test_data[0][1])
+    plc_column = plc.interop.from_arrow(arr)
+    res = plc.lists.count_elements(plc_column)
+
+    expect = pa.array([1, 1, 0, 3], type=pa.int32())
+
+    assert_column_eq(expect, res)

From 2bbeee95ec338c30c0c876dc6a58376fbb0a5a06 Mon Sep 17 00:00:00 2001
From: Ray Bell <rayjohnbell0@gmail.com>
Date: Fri, 19 Jul 2024 12:43:49 -0400
Subject: [PATCH 538/842] DOC: use intersphinx mapping in pandas-compat ext
 (#15846)

~~If https://github.com/rapidsai/cudf/pull/15704 is merged~~

This PR changes the header in the admonition (pandas compat box) to be hyperlinked to the pandas docs instead of just text. See https://raybellwaves.github.io/compatsphinxext/compat.html which is the docs of a minimal repo where I have been testing

Authors:
  - Ray Bell (https://github.com/raybellwaves)
  - Bradley Dice (https://github.com/bdice)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15846
---
 .../source/developer_guide/documentation.md   |  2 +-
 python/cudf/cudf/core/column/lists.py         | 12 +++++-
 python/cudf/cudf/core/column/string.py        | 16 ++++----
 python/cudf/cudf/core/dataframe.py            | 37 ++++++++++---------
 python/cudf/cudf/core/frame.py                | 10 ++---
 python/cudf/cudf/core/groupby/groupby.py      |  9 +++--
 python/cudf/cudf/core/indexed_frame.py        | 28 +++++++-------
 python/cudf/cudf/core/series.py               | 14 +++----
 python/cudf/cudf/core/tools/numeric.py        |  2 +-
 python/cudf/cudf/core/window/ewm.py           |  2 +-
 10 files changed, 72 insertions(+), 60 deletions(-)

diff --git a/docs/cudf/source/developer_guide/documentation.md b/docs/cudf/source/developer_guide/documentation.md
index c8da689479c..4f5a57fec02 100644
--- a/docs/cudf/source/developer_guide/documentation.md
+++ b/docs/cudf/source/developer_guide/documentation.md
@@ -164,7 +164,7 @@ The directive should be used inside docstrings like so:
 Docstring body
 
 .. pandas-compat::
-    **$API_NAME**
+    :meth:`pandas.DataFrame.METHOD`
 
     Explanation of differences
 ```
diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py
index cc15e78314e..46b844413f7 100644
--- a/python/cudf/cudf/core/column/lists.py
+++ b/python/cudf/cudf/core/column/lists.py
@@ -646,9 +646,17 @@ def sort_values(
         dtype: list
 
         .. pandas-compat::
-            **ListMethods.sort_values**
+            `pandas.Series.list.sort_values`
 
-            The ``inplace`` and ``kind`` arguments are currently not supported.
+            This method does not exist in pandas but it can be run
+            as:
+
+            >>> import pandas as pd
+            >>> s = pd.Series([[3, 2, 1], [2, 4, 3]])
+            >>> print(s.apply(sorted))
+            0    [1, 2, 3]
+            1    [2, 3, 4]
+            dtype: object
         """
         if inplace:
             raise NotImplementedError("`inplace` not currently implemented.")
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 96f9cdfd655..ec95c50f455 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -612,7 +612,7 @@ def extract(
         dtype: object
 
         .. pandas-compat::
-            **StringMethods.extract**
+            :meth:`pandas.Series.str.extract`
 
             The `flags` parameter currently only supports re.DOTALL and
             re.MULTILINE.
@@ -738,7 +738,7 @@ def contains(
         dtype: bool
 
         .. pandas-compat::
-            **StringMethods.contains**
+            :meth:`pandas.Series.str.contains`
 
             The parameters `case` and `na` are not yet supported and will
             raise a NotImplementedError if anything other than the default
@@ -974,7 +974,7 @@ def replace(
         dtype: object
 
         .. pandas-compat::
-            **StringMethods.replace**
+            :meth:`pandas.Series.str.replace`
 
             The parameters `case` and `flags` are not yet supported and will
             raise a `NotImplementedError` if anything other than the default
@@ -2803,7 +2803,7 @@ def partition(self, sep: str = " ", expand: bool = True) -> SeriesOrIndex:
                    )
 
         .. pandas-compat::
-            **StringMethods.partition**
+            :meth:`pandas.Series.str.partition`
 
             The parameter `expand` is not yet supported and will raise a
             `NotImplementedError` if anything other than the default
@@ -3527,7 +3527,7 @@ def count(self, pat: str, flags: int = 0) -> SeriesOrIndex:
         Index([0, 0, 2, 1], dtype='int64')
 
         .. pandas-compat::
-            **StringMethods.count**
+            :meth:`pandas.Series.str.count`
 
             -   `flags` parameter currently only supports re.DOTALL
                 and re.MULTILINE.
@@ -3607,7 +3607,7 @@ def findall(self, pat: str, flags: int = 0) -> SeriesOrIndex:
         dtype: list
 
         .. pandas-compat::
-            **StringMethods.findall**
+            :meth:`pandas.Series.str.findall`
 
             The `flags` parameter currently only supports re.DOTALL and
             re.MULTILINE.
@@ -3811,7 +3811,7 @@ def endswith(self, pat: str) -> SeriesOrIndex:
         dtype: bool
 
         .. pandas-compat::
-            **StringMethods.endswith**
+            :meth:`pandas.Series.str.endswith`
 
             `na` parameter is not yet supported, as cudf uses
             native strings instead of Python objects.
@@ -4264,7 +4264,7 @@ def match(
         dtype: bool
 
         .. pandas-compat::
-            **StringMethods.match**
+            :meth:`pandas.Series.str.match`
 
             Parameters `case` and `na` are currently not supported.
             The `flags` parameter currently only supports re.DOTALL and
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index b3d938829c9..f06e45277e2 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -2750,7 +2750,7 @@ def reindex(
         Chrome                200          0.02
 
         .. pandas-compat::
-            **DataFrame.reindex**
+            :meth:`pandas.DataFrame.reindex`
 
             Note: One difference from Pandas is that ``NA`` is used for rows
             that do not match, rather than ``NaN``. One side effect of this is
@@ -3350,7 +3350,7 @@ def diff(self, periods=1, axis=0):
         5     2     5    20
 
         .. pandas-compat::
-            **DataFrame.diff**
+            :meth:`pandas.DataFrame.diff`
 
             Diff currently only supports numeric dtype columns.
         """
@@ -3555,7 +3555,7 @@ def rename(
         30  3  6
 
         .. pandas-compat::
-            **DataFrame.rename**
+            :meth:`pandas.DataFrame.rename`
 
             * Not Supporting: level
 
@@ -3670,7 +3670,7 @@ def agg(self, aggs, axis=None):
             ``DataFrame`` is returned.
 
         .. pandas-compat::
-            **DataFrame.agg**
+            :meth:`pandas.DataFrame.agg`
 
             * Not supporting: ``axis``, ``*args``, ``**kwargs``
 
@@ -3843,7 +3843,7 @@ def nlargest(self, n, columns, keep="first"):
         Brunei      434000    12128      BN
 
         .. pandas-compat::
-            **DataFrame.nlargest**
+            :meth:`pandas.DataFrame.nlargest`
 
             - Only a single column is supported in *columns*
         """
@@ -3915,7 +3915,7 @@ def nsmallest(self, n, columns, keep="first"):
         Nauru         337000  182      NR
 
         .. pandas-compat::
-            **DataFrame.nsmallest**
+            :meth:`pandas.DataFrame.nsmallest`
 
             - Only a single column is supported in *columns*
         """
@@ -3997,7 +3997,7 @@ def transpose(self):
         a new (ncol x nrow) dataframe. self is (nrow x ncol)
 
         .. pandas-compat::
-            **DataFrame.transpose, DataFrame.T**
+            :meth:`pandas.DataFrame.transpose`, :attr:`pandas.DataFrame.T`
 
             Not supporting *copy* because default and only behavior is
             copy=True
@@ -4188,7 +4188,7 @@ def merge(
         from both sides.
 
         .. pandas-compat::
-            **DataFrame.merge**
+            :meth:`pandas.DataFrame.merge`
 
             DataFrames merges in cuDF result in non-deterministic row
             ordering.
@@ -4263,7 +4263,7 @@ def join(
         joined : DataFrame
 
         .. pandas-compat::
-            **DataFrame.join**
+            :meth:`pandas.DataFrame.join`
 
             - *other* must be a single DataFrame for now.
             - *on* is not supported yet due to lack of multi-index support.
@@ -4385,7 +4385,7 @@ def query(self, expr, local_dict=None):
         1 2018-10-08
 
         .. pandas-compat::
-            **DataFrame.query**
+            :meth:`pandas.DataFrame.query`
 
             One difference from pandas is that ``query`` currently only
             supports numeric, datetime, timedelta, or bool dtypes.
@@ -5447,10 +5447,11 @@ def from_arrow(cls, table):
         2  3  6
 
         .. pandas-compat::
-            **DataFrame.from_arrow**
+            `pandas.DataFrame.from_arrow`
 
-            -   Does not support automatically setting index column(s) similar
-                to how ``to_pandas`` works for PyArrow Tables.
+            This method does not exist in pandas but it is similar to
+            how :meth:`pyarrow.Table.to_pandas` works for PyArrow Tables i.e.
+            it does not support automatically setting index column(s).
         """
         index_col = None
         col_index_names = None
@@ -5884,7 +5885,7 @@ def quantile(
         0.5  2.5  55.0
 
         .. pandas-compat::
-            **DataFrame.quantile**
+            :meth:`pandas.DataFrame.quantile`
 
             One notable difference from Pandas is when DataFrame is of
             non-numeric types and result is expected to be a Series in case of
@@ -6174,7 +6175,7 @@ def count(self, axis=0, numeric_only=False):
         dtype: int64
 
         .. pandas-compat::
-            **DataFrame.count**
+            :meth:`pandas.DataFrame.count`
 
             Parameters currently not supported are `axis` and `numeric_only`.
         """
@@ -6412,7 +6413,7 @@ def mode(self, axis=0, numeric_only=False, dropna=True):
         1  <NA>    2.0
 
         .. pandas-compat::
-            **DataFrame.mode**
+            :meth:`pandas.DataFrame.transpose`
 
             ``axis`` parameter is currently not supported.
         """
@@ -7594,7 +7595,7 @@ def interleave_columns(self):
         The interleaved columns as a single column
 
         .. pandas-compat::
-            **DataFrame.interleave_columns**
+            `pandas.DataFrame.interleave_columns`
 
             This method does not exist in pandas but it can be run
             as ``pd.Series(np.vstack(df.to_numpy()).reshape((-1,)))``.
@@ -7696,7 +7697,7 @@ def eval(self, expr: str, inplace: bool = False, **kwargs):
         4  5   2   7  3
 
         .. pandas-compat::
-            **DataFrame.eval**
+            :meth:`pandas.DataFrame.eval`
 
             * Additional kwargs are not supported.
             * Bitwise and logical operators are not dtype-dependent.
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 802751e47ad..111225a5fc2 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -591,7 +591,7 @@ def where(self, cond, other=None, inplace: bool = False) -> Self | None:
         dtype: int64
 
         .. pandas-compat::
-            **DataFrame.where, Series.where**
+            :meth:`pandas.DataFrame.where`, :meth:`pandas.Series.where`
 
             Note that ``where`` treats missing values as falsy,
             in parallel with pandas treatment of nullable data:
@@ -1641,7 +1641,7 @@ def min(
         1
 
         .. pandas-compat::
-            **DataFrame.min, Series.min**
+            :meth:`pandas.DataFrame.min`, :meth:`pandas.Series.min`
 
             Parameters currently not supported are `level`, `numeric_only`.
         """
@@ -1689,7 +1689,7 @@ def max(
         dtype: int64
 
         .. pandas-compat::
-            **DataFrame.max, Series.max**
+            :meth:`pandas.DataFrame.max`, :meth:`pandas.Series.max`
 
             Parameters currently not supported are `level`, `numeric_only`.
         """
@@ -1742,7 +1742,7 @@ def all(self, axis=0, skipna=True, **kwargs):
         dtype: bool
 
         .. pandas-compat::
-            **DataFrame.all, Series.all**
+            :meth:`pandas.DataFrame.all`, :meth:`pandas.Series.all`
 
             Parameters currently not supported are `axis`, `bool_only`,
             `level`.
@@ -1795,7 +1795,7 @@ def any(self, axis=0, skipna=True, **kwargs):
         dtype: bool
 
         .. pandas-compat::
-            **DataFrame.any, Series.any**
+            :meth:`pandas.DataFrame.any`, :meth:`pandas.Series.any`
 
             Parameters currently not supported are `axis`, `bool_only`,
             `level`.
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index d2c75715be2..3f91be71f29 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -744,7 +744,8 @@ def _reduce(
             Computed {op} of values within each group.
 
         .. pandas-compat::
-            **{cls}.{op}**
+            :meth:`pandas.core.groupby.DataFrameGroupBy.{op}`,
+             :meth:`pandas.core.groupby.SeriesGroupBy.{op}`
 
             The numeric_only, min_count
         """
@@ -1482,7 +1483,8 @@ def mult(df):
           6    2    6   12
 
         .. pandas-compat::
-            **GroupBy.apply**
+            :meth:`pandas.core.groupby.DataFrameGroupBy.apply`,
+             :meth:`pandas.core.groupby.SeriesGroupBy.apply`
 
             cuDF's ``groupby.apply`` is limited compared to pandas.
             In some situations, Pandas returns the grouped keys as part of
@@ -2358,7 +2360,8 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None):
             Object shifted within each group.
 
         .. pandas-compat::
-            **GroupBy.shift**
+            :meth:`pandas.core.groupby.DataFrameGroupBy.shift`,
+             :meth:`pandas.core.groupby.SeriesGroupBy.shift`
 
             Parameter ``freq`` is unsupported.
         """
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 30b68574960..77675edc0f0 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -497,7 +497,7 @@ def empty(self):
         True
 
         .. pandas-compat::
-            **DataFrame.empty, Series.empty**
+            :attr:`pandas.DataFrame.empty`, :attr:`pandas.Series.empty`
 
             If DataFrame/Series contains only `null` values, it is still not
             considered empty. See the example above.
@@ -831,7 +831,7 @@ def replace(
         4    4    9  e
 
         .. pandas-compat::
-            **DataFrame.replace, Series.replace**
+            :meth:`pandas.DataFrame.replace`, :meth:`pandas.Series.replace`
 
             Parameters that are currently not supported are: `limit`, `regex`,
             `method`
@@ -1372,7 +1372,7 @@ def sum(
         dtype: int64
 
         .. pandas-compat::
-            **DataFrame.sum, Series.sum**
+           :meth:`pandas.DataFrame.sum`, :meth:`pandas.Series.sum`
 
             Parameters currently not supported are `level`, `numeric_only`.
         """
@@ -1433,7 +1433,7 @@ def product(
         dtype: int64
 
         .. pandas-compat::
-            **DataFrame.product, Series.product**
+            :meth:`pandas.DataFrame.product`, :meth:`pandas.Series.product`
 
             Parameters currently not supported are level`, `numeric_only`.
         """
@@ -1530,7 +1530,7 @@ def median(
         17.0
 
         .. pandas-compat::
-            **DataFrame.median, Series.median**
+            :meth:`pandas.DataFrame.median`, :meth:`pandas.Series.median`
 
             Parameters currently not supported are `level` and `numeric_only`.
         """
@@ -1586,7 +1586,7 @@ def std(
         dtype: float64
 
         .. pandas-compat::
-            **DataFrame.std, Series.std**
+            :meth:`pandas.DataFrame.std`, :meth:`pandas.Series.std`
 
             Parameters currently not supported are `level` and
             `numeric_only`
@@ -1645,7 +1645,7 @@ def var(
         dtype: float64
 
         .. pandas-compat::
-            **DataFrame.var, Series.var**
+            :meth:`pandas.DataFrame.var`, :meth:`pandas.Series.var`
 
             Parameters currently not supported are `level` and
             `numeric_only`
@@ -1701,7 +1701,7 @@ def kurtosis(self, axis=0, skipna=True, numeric_only=False, **kwargs):
         dtype: float64
 
         .. pandas-compat::
-            **DataFrame.kurtosis**
+            :meth:`pandas.DataFrame.kurtosis`
 
             Parameters currently not supported are `level` and `numeric_only`
         """
@@ -1763,7 +1763,7 @@ def skew(self, axis=0, skipna=True, numeric_only=False, **kwargs):
         dtype: float64
 
         .. pandas-compat::
-            **DataFrame.skew, Series.skew, Frame.skew**
+            :meth:`pandas.DataFrame.skew`, :meth:`pandas.Series.skew`
 
             The `axis` parameter is not currently supported.
         """
@@ -2229,7 +2229,7 @@ def truncate(self, before=None, after=None, axis=0, copy=True):
         2021-01-01 23:45:27  1  2
 
         .. pandas-compat::
-            **DataFrame.truncate, Series.truncate**
+            :meth:`pandas.DataFrame.truncate`, :meth:`pandas.Series.truncate`
 
             The ``copy`` parameter is only present for API compatibility, but
             ``copy=False`` is not supported. This method always generates a
@@ -2665,7 +2665,7 @@ def sort_index(
         2  3  1
 
         .. pandas-compat::
-            **DataFrame.sort_index, Series.sort_index**
+            :meth:`pandas.DataFrame.sort_index`, :meth:`pandas.Series.sort_index`
 
             * Not supporting: kind, sort_remaining=False
         """
@@ -3497,7 +3497,7 @@ def sort_values(
         1  1  2
 
         .. pandas-compat::
-            **DataFrame.sort_values, Series.sort_values**
+            :meth:`pandas.DataFrame.sort_values`, :meth:`pandas.Series.sort_values`
 
             * Support axis='index' only.
             * Not supporting: inplace, kind
@@ -4008,7 +4008,7 @@ def resample(
 
 
         .. pandas-compat::
-            **DataFrame.resample, Series.resample**
+            :meth:`pandas.DataFrame.resample`, :meth:`pandas.Series.resample`
 
             Note that the dtype of the index (or the 'on' column if using
             'on=') in the result will be of a frequency closest to the
@@ -4564,7 +4564,7 @@ def sample(
         1  2  4
 
         .. pandas-compat::
-            **DataFrame.sample, Series.sample**
+            :meth:`pandas.DataFrame.sample`, :meth:`pandas.Series.sample`
 
             When sampling from ``axis=0/'index'``, ``random_state`` can be
             either a numpy random state (``numpy.random.RandomState``)
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index e12cc3d52fb..c9d24890d15 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -960,7 +960,7 @@ def reindex(self, *args, **kwargs):
         dtype: int64
 
         .. pandas-compat::
-            **Series.reindex**
+            :meth:`pandas.Series.reindex`
 
             Note: One difference from Pandas is that ``NA`` is used for rows
             that do not match, rather than ``NaN``. One side effect of this is
@@ -1243,7 +1243,7 @@ def map(self, arg, na_action=None) -> "Series":
         dtype: int64
 
         .. pandas-compat::
-            **Series.map**
+            :meth:`pandas.Series.map`
 
             Please note map currently only supports fixed-width numeric
             type functions.
@@ -2094,7 +2094,7 @@ def sort_values(
         dtype: int64
 
         .. pandas-compat::
-            **Series.sort_values**
+            :meth:`pandas.Series.sort_values`
 
             * Support axis='index' only.
             * The inplace and kind argument is currently unsupported
@@ -2550,7 +2550,7 @@ def count(self):
         5
 
         .. pandas-compat::
-            **Series.count**
+            :meth:`pandas.Series.count`
 
             Parameters currently not supported is `level`.
         """
@@ -2661,7 +2661,7 @@ def cov(self, other, min_periods=None):
         -0.015750000000000004
 
         .. pandas-compat::
-            **Series.cov**
+            :meth:`pandas.Series.cov`
 
             `min_periods` parameter is not yet supported.
         """
@@ -3422,7 +3422,7 @@ def rename(self, index=None, copy=True):
         'numeric_series'
 
         .. pandas-compat::
-            **Series.rename**
+            :meth:`pandas.Series.rename`
 
             - Supports scalar values only for changing name attribute
             - The ``inplace`` and ``level`` is not supported
@@ -4702,7 +4702,7 @@ def strftime(self, date_format: str, *args, **kwargs) -> Series:
         dtype: object
 
         .. pandas-compat::
-            **series.DatetimeProperties.strftime**
+            :meth:`pandas.DatetimeIndex.strftime`
 
             The following date format identifiers are not yet
             supported: ``%c``, ``%x``,``%X``
diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py
index 466d46f7dca..07158e4ee61 100644
--- a/python/cudf/cudf/core/tools/numeric.py
+++ b/python/cudf/cudf/core/tools/numeric.py
@@ -80,7 +80,7 @@ def to_numeric(arg, errors="raise", downcast=None):
     dtype: float64
 
     .. pandas-compat::
-        **cudf.to_numeric**
+        :func:`pandas.to_numeric`
 
         An important difference from pandas is that this function does not
         accept mixed numeric/non-numeric type sequences.
diff --git a/python/cudf/cudf/core/window/ewm.py b/python/cudf/cudf/core/window/ewm.py
index 21693e106bd..bb153d4b549 100644
--- a/python/cudf/cudf/core/window/ewm.py
+++ b/python/cudf/cudf/core/window/ewm.py
@@ -56,7 +56,7 @@ class ExponentialMovingWindow(_RollingBase):
     the equivalent pandas method.
 
     .. pandas-compat::
-        **cudf.core.window.ExponentialMovingWindow**
+        :meth:`pandas.DataFrame.ewm`
 
         The parameters ``min_periods``, ``ignore_na``, ``axis``, and ``times``
         are not yet supported. Behavior is defined only for data that begins

From d5ab48d4f2586d2e45234463c1bbe877ce76afe8 Mon Sep 17 00:00:00 2001
From: Kyle Edwards <kyedwards@nvidia.com>
Date: Fri, 19 Jul 2024 14:32:54 -0400
Subject: [PATCH 539/842] Use workflow branch 24.08 again (#16314)

After updating everything to CUDA 12.5.1, use `shared-workflows@branch-24.08` again.

Contributes to https://github.com/rapidsai/build-planning/issues/73

Authors:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)

Approvers:
  - James Lamb (https://github.com/jameslamb)
  - https://github.com/jakirkham

URL: https://github.com/rapidsai/cudf/pull/16314
---
 .github/workflows/build.yaml                  | 20 ++++-----
 .github/workflows/pandas-tests.yaml           |  2 +-
 .github/workflows/pr.yaml                     | 44 +++++++++----------
 .../workflows/pr_issue_status_automation.yml  |  6 +--
 .github/workflows/test.yaml                   | 22 +++++-----
 5 files changed, 47 insertions(+), 47 deletions(-)

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 937080572ad..2e5959338b0 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -28,7 +28,7 @@ concurrency:
 jobs:
   cpp-build:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.08
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -37,7 +37,7 @@ jobs:
   python-build:
     needs: [cpp-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -46,7 +46,7 @@ jobs:
   upload-conda:
     needs: [cpp-build, python-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.08
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -57,7 +57,7 @@ jobs:
     if: github.ref_type == 'branch'
     needs: python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08
     with:
       arch: "amd64"
       branch: ${{ inputs.branch }}
@@ -69,7 +69,7 @@ jobs:
       sha: ${{ inputs.sha }}
   wheel-build-cudf:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -79,7 +79,7 @@ jobs:
   wheel-publish-cudf:
     needs: wheel-build-cudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -89,7 +89,7 @@ jobs:
   wheel-build-dask-cudf:
     needs: wheel-publish-cudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
@@ -101,7 +101,7 @@ jobs:
   wheel-publish-dask-cudf:
     needs: wheel-build-dask-cudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -111,7 +111,7 @@ jobs:
   wheel-build-cudf-polars:
     needs: wheel-publish-cudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
@@ -123,7 +123,7 @@ jobs:
   wheel-publish-cudf-polars:
     needs: wheel-build-cudf-polars
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
diff --git a/.github/workflows/pandas-tests.yaml b/.github/workflows/pandas-tests.yaml
index 1516cb09449..5a937b2f362 100644
--- a/.github/workflows/pandas-tests.yaml
+++ b/.github/workflows/pandas-tests.yaml
@@ -17,7 +17,7 @@ jobs:
   pandas-tests:
       # run the Pandas unit tests
       secrets: inherit
-      uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1
+      uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08
       with:
         matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and (.CUDA_VER | startswith("12.5.")) ))
         build_type: nightly
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 1fe64e7f318..d5dfc9e1ff5 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -34,41 +34,41 @@ jobs:
       - pandas-tests
       - pandas-tests-diff
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.08
   checks:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.08
     with:
       enable_check_generated_files: false
   conda-cpp-build:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.08
     with:
       build_type: pull-request
   conda-cpp-checks:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.08
     with:
       build_type: pull-request
       enable_check_symbols: true
   conda-cpp-tests:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.08
     with:
       build_type: pull-request
   conda-python-build:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08
     with:
       build_type: pull-request
   conda-python-cudf-tests:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08
     with:
       build_type: pull-request
       script: "ci/test_python_cudf.sh"
@@ -76,14 +76,14 @@ jobs:
     # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08
     with:
       build_type: pull-request
       script: "ci/test_python_other.sh"
   conda-java-tests:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08
     with:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
@@ -93,7 +93,7 @@ jobs:
   static-configure:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08
     with:
       build_type: pull-request
       # Use the wheel container so we can skip conda solves and since our
@@ -103,7 +103,7 @@ jobs:
   conda-notebook-tests:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08
     with:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
@@ -113,7 +113,7 @@ jobs:
   docs-build:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08
     with:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
@@ -123,21 +123,21 @@ jobs:
   wheel-build-cudf:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08
     with:
       build_type: pull-request
       script: "ci/build_wheel_cudf.sh"
   wheel-tests-cudf:
     needs: wheel-build-cudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08
     with:
       build_type: pull-request
       script: ci/test_wheel_cudf.sh
   wheel-build-cudf-polars:
     needs: wheel-build-cudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
@@ -146,7 +146,7 @@ jobs:
   wheel-tests-cudf-polars:
     needs: wheel-build-cudf-polars
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
@@ -157,7 +157,7 @@ jobs:
   wheel-build-dask-cudf:
     needs: wheel-build-cudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
@@ -166,7 +166,7 @@ jobs:
   wheel-tests-dask-cudf:
     needs: wheel-build-dask-cudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
@@ -174,7 +174,7 @@ jobs:
       script: ci/test_wheel_dask_cudf.sh
   devcontainer:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.08
     with:
       arch: '["amd64"]'
       cuda: '["12.5"]'
@@ -185,7 +185,7 @@ jobs:
   unit-tests-cudf-pandas:
     needs: wheel-build-cudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08
     with:
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
       build_type: pull-request
@@ -194,7 +194,7 @@ jobs:
     # run the Pandas unit tests using PR branch
     needs: wheel-build-cudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08
     with:
       matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and (.CUDA_VER | startswith("12.5.")) ))
       build_type: pull-request
@@ -204,7 +204,7 @@ jobs:
   pandas-tests-diff:
     # diff the results of running the Pandas unit tests and publish a job summary
     needs: pandas-tests
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08
     with:
         node_type: cpu4
         build_type: pull-request
diff --git a/.github/workflows/pr_issue_status_automation.yml b/.github/workflows/pr_issue_status_automation.yml
index 2a8ebd30993..8ca971dc28d 100644
--- a/.github/workflows/pr_issue_status_automation.yml
+++ b/.github/workflows/pr_issue_status_automation.yml
@@ -23,7 +23,7 @@ on:
 
 jobs:
     get-project-id:
-      uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@cuda-12.5.1
+      uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@branch-24.08
       if: github.event.pull_request.state == 'open'
       secrets: inherit
       permissions:
@@ -34,7 +34,7 @@ jobs:
 
     update-status:
       # This job sets the PR and its linked issues to "In Progress" status
-      uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@cuda-12.5.1
+      uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-24.08
       if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }}
       needs: get-project-id
       with:
@@ -50,7 +50,7 @@ jobs:
 
     update-sprint:
       # This job sets the PR and its linked issues to the current "Weekly Sprint"
-      uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@cuda-12.5.1
+      uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@branch-24.08
       if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }}
       needs: get-project-id
       with:
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 73f8d726e77..36c9088d93c 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -16,7 +16,7 @@ on:
 jobs:
   conda-cpp-checks:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.08
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -25,7 +25,7 @@ jobs:
       enable_check_symbols: true
   conda-cpp-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.08
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -33,7 +33,7 @@ jobs:
       sha: ${{ inputs.sha }}
   conda-cpp-memcheck-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -45,7 +45,7 @@ jobs:
       run_script: "ci/test_cpp_memcheck.sh"
   static-configure:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08
     with:
       build_type: pull-request
       # Use the wheel container so we can skip conda solves and since our
@@ -54,7 +54,7 @@ jobs:
       run_script: "ci/configure_cpp_static.sh"
   conda-python-cudf-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -64,7 +64,7 @@ jobs:
   conda-python-other-tests:
     # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -73,7 +73,7 @@ jobs:
       script: "ci/test_python_other.sh"
   conda-java-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -85,7 +85,7 @@ jobs:
       run_script: "ci/test_java.sh"
   conda-notebook-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -97,7 +97,7 @@ jobs:
       run_script: "ci/test_notebooks.sh"
   wheel-tests-cudf:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -106,7 +106,7 @@ jobs:
       script: ci/test_wheel_cudf.sh
   wheel-tests-dask-cudf:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
@@ -117,7 +117,7 @@ jobs:
       script: ci/test_wheel_dask_cudf.sh
   unit-tests-cudf-pandas:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@cuda-12.5.1
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}

From f364fdcd44540b6d5403f1d08acbebfff4e78bd4 Mon Sep 17 00:00:00 2001
From: Ray Douglass <ray@raydouglass.com>
Date: Fri, 19 Jul 2024 14:56:13 -0400
Subject: [PATCH 540/842] DOC v24.10 Updates [skip ci]

---
 .../cuda11.8-conda/devcontainer.json          |  6 +--
 .devcontainer/cuda11.8-pip/devcontainer.json  |  6 +--
 .../cuda12.5-conda/devcontainer.json          |  6 +--
 .devcontainer/cuda12.5-pip/devcontainer.json  |  6 +--
 .github/workflows/build.yaml                  | 20 ++++-----
 .github/workflows/pandas-tests.yaml           |  2 +-
 .github/workflows/pr.yaml                     | 44 +++++++++----------
 .github/workflows/test.yaml                   | 22 +++++-----
 README.md                                     |  2 +-
 VERSION                                       |  2 +-
 ci/test_wheel_cudf_polars.sh                  |  2 +-
 .../all_cuda-118_arch-x86_64.yaml             | 10 ++---
 .../all_cuda-125_arch-x86_64.yaml             | 10 ++---
 cpp/examples/versions.cmake                   |  2 +-
 dependencies.yaml                             | 32 +++++++-------
 java/ci/README.md                             |  4 +-
 java/pom.xml                                  |  2 +-
 python/cudf/pyproject.toml                    |  4 +-
 python/cudf_kafka/pyproject.toml              |  2 +-
 python/cudf_polars/docs/overview.md           |  2 +-
 python/cudf_polars/pyproject.toml             |  2 +-
 python/custreamz/pyproject.toml               |  4 +-
 python/dask_cudf/pyproject.toml               |  6 +--
 23 files changed, 99 insertions(+), 99 deletions(-)

diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json
index 8423fe21c29..7a1361e52c5 100644
--- a/.devcontainer/cuda11.8-conda/devcontainer.json
+++ b/.devcontainer/cuda11.8-conda/devcontainer.json
@@ -5,17 +5,17 @@
     "args": {
       "CUDA": "11.8",
       "PYTHON_PACKAGE_MANAGER": "conda",
-      "BASE": "rapidsai/devcontainers:24.08-cpp-cuda11.8-mambaforge-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:24.10-cpp-cuda11.8-mambaforge-ubuntu22.04"
     }
   },
   "runArgs": [
     "--rm",
     "--name",
-    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda11.8-conda"
+    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda11.8-conda"
   ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
-    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.8": {}
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {}
   },
   "overrideFeatureInstallOrder": [
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json
index 4945d6cf753..64d7cd54130 100644
--- a/.devcontainer/cuda11.8-pip/devcontainer.json
+++ b/.devcontainer/cuda11.8-pip/devcontainer.json
@@ -5,17 +5,17 @@
     "args": {
       "CUDA": "11.8",
       "PYTHON_PACKAGE_MANAGER": "pip",
-      "BASE": "rapidsai/devcontainers:24.08-cpp-cuda11.8-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:24.10-cpp-cuda11.8-ubuntu22.04"
     }
   },
   "runArgs": [
     "--rm",
     "--name",
-    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda11.8-pip"
+    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda11.8-pip"
   ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
-    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.8": {}
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {}
   },
   "overrideFeatureInstallOrder": [
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
diff --git a/.devcontainer/cuda12.5-conda/devcontainer.json b/.devcontainer/cuda12.5-conda/devcontainer.json
index fadce01d060..c1924243506 100644
--- a/.devcontainer/cuda12.5-conda/devcontainer.json
+++ b/.devcontainer/cuda12.5-conda/devcontainer.json
@@ -5,17 +5,17 @@
     "args": {
       "CUDA": "12.5",
       "PYTHON_PACKAGE_MANAGER": "conda",
-      "BASE": "rapidsai/devcontainers:24.08-cpp-mambaforge-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:24.10-cpp-mambaforge-ubuntu22.04"
     }
   },
   "runArgs": [
     "--rm",
     "--name",
-    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.5-conda"
+    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda12.5-conda"
   ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
-    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.8": {}
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {}
   },
   "overrideFeatureInstallOrder": [
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
diff --git a/.devcontainer/cuda12.5-pip/devcontainer.json b/.devcontainer/cuda12.5-pip/devcontainer.json
index 026eb540952..beab2940176 100644
--- a/.devcontainer/cuda12.5-pip/devcontainer.json
+++ b/.devcontainer/cuda12.5-pip/devcontainer.json
@@ -5,17 +5,17 @@
     "args": {
       "CUDA": "12.5",
       "PYTHON_PACKAGE_MANAGER": "pip",
-      "BASE": "rapidsai/devcontainers:24.08-cpp-cuda12.5-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:24.10-cpp-cuda12.5-ubuntu22.04"
     }
   },
   "runArgs": [
     "--rm",
     "--name",
-    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.08-cuda12.5-pip"
+    "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-24.10-cuda12.5-pip"
   ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
-    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.8": {}
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.10": {}
   },
   "overrideFeatureInstallOrder": [
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 2e5959338b0..2fc39c06fad 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -28,7 +28,7 @@ concurrency:
 jobs:
   cpp-build:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -37,7 +37,7 @@ jobs:
   python-build:
     needs: [cpp-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -46,7 +46,7 @@ jobs:
   upload-conda:
     needs: [cpp-build, python-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.10
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -57,7 +57,7 @@ jobs:
     if: github.ref_type == 'branch'
     needs: python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
     with:
       arch: "amd64"
       branch: ${{ inputs.branch }}
@@ -69,7 +69,7 @@ jobs:
       sha: ${{ inputs.sha }}
   wheel-build-cudf:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -79,7 +79,7 @@ jobs:
   wheel-publish-cudf:
     needs: wheel-build-cudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -89,7 +89,7 @@ jobs:
   wheel-build-dask-cudf:
     needs: wheel-publish-cudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
@@ -101,7 +101,7 @@ jobs:
   wheel-publish-dask-cudf:
     needs: wheel-build-dask-cudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -111,7 +111,7 @@ jobs:
   wheel-build-cudf-polars:
     needs: wheel-publish-cudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
@@ -123,7 +123,7 @@ jobs:
   wheel-publish-cudf-polars:
     needs: wheel-build-cudf-polars
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
diff --git a/.github/workflows/pandas-tests.yaml b/.github/workflows/pandas-tests.yaml
index 5a937b2f362..cf0c2b377dd 100644
--- a/.github/workflows/pandas-tests.yaml
+++ b/.github/workflows/pandas-tests.yaml
@@ -17,7 +17,7 @@ jobs:
   pandas-tests:
       # run the Pandas unit tests
       secrets: inherit
-      uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08
+      uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
       with:
         matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and (.CUDA_VER | startswith("12.5.")) ))
         build_type: nightly
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index d5dfc9e1ff5..c2e7f64f952 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -34,41 +34,41 @@ jobs:
       - pandas-tests
       - pandas-tests-diff
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.10
   checks:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.10
     with:
       enable_check_generated_files: false
   conda-cpp-build:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10
     with:
       build_type: pull-request
   conda-cpp-checks:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.10
     with:
       build_type: pull-request
       enable_check_symbols: true
   conda-cpp-tests:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10
     with:
       build_type: pull-request
   conda-python-build:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10
     with:
       build_type: pull-request
   conda-python-cudf-tests:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10
     with:
       build_type: pull-request
       script: "ci/test_python_cudf.sh"
@@ -76,14 +76,14 @@ jobs:
     # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10
     with:
       build_type: pull-request
       script: "ci/test_python_other.sh"
   conda-java-tests:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
     with:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
@@ -93,7 +93,7 @@ jobs:
   static-configure:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
     with:
       build_type: pull-request
       # Use the wheel container so we can skip conda solves and since our
@@ -103,7 +103,7 @@ jobs:
   conda-notebook-tests:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
     with:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
@@ -113,7 +113,7 @@ jobs:
   docs-build:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
     with:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
@@ -123,21 +123,21 @@ jobs:
   wheel-build-cudf:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
     with:
       build_type: pull-request
       script: "ci/build_wheel_cudf.sh"
   wheel-tests-cudf:
     needs: wheel-build-cudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
     with:
       build_type: pull-request
       script: ci/test_wheel_cudf.sh
   wheel-build-cudf-polars:
     needs: wheel-build-cudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
@@ -146,7 +146,7 @@ jobs:
   wheel-tests-cudf-polars:
     needs: wheel-build-cudf-polars
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
@@ -157,7 +157,7 @@ jobs:
   wheel-build-dask-cudf:
     needs: wheel-build-cudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
@@ -166,7 +166,7 @@ jobs:
   wheel-tests-dask-cudf:
     needs: wheel-build-dask-cudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
@@ -174,7 +174,7 @@ jobs:
       script: ci/test_wheel_dask_cudf.sh
   devcontainer:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.10
     with:
       arch: '["amd64"]'
       cuda: '["12.5"]'
@@ -185,7 +185,7 @@ jobs:
   unit-tests-cudf-pandas:
     needs: wheel-build-cudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
     with:
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
       build_type: pull-request
@@ -194,7 +194,7 @@ jobs:
     # run the Pandas unit tests using PR branch
     needs: wheel-build-cudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
     with:
       matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and (.CUDA_VER | startswith("12.5.")) ))
       build_type: pull-request
@@ -204,7 +204,7 @@ jobs:
   pandas-tests-diff:
     # diff the results of running the Pandas unit tests and publish a job summary
     needs: pandas-tests
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
     with:
         node_type: cpu4
         build_type: pull-request
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 36c9088d93c..9feea050b19 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -16,7 +16,7 @@ on:
 jobs:
   conda-cpp-checks:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.10
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -25,7 +25,7 @@ jobs:
       enable_check_symbols: true
   conda-cpp-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -33,7 +33,7 @@ jobs:
       sha: ${{ inputs.sha }}
   conda-cpp-memcheck-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -45,7 +45,7 @@ jobs:
       run_script: "ci/test_cpp_memcheck.sh"
   static-configure:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
     with:
       build_type: pull-request
       # Use the wheel container so we can skip conda solves and since our
@@ -54,7 +54,7 @@ jobs:
       run_script: "ci/configure_cpp_static.sh"
   conda-python-cudf-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -64,7 +64,7 @@ jobs:
   conda-python-other-tests:
     # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -73,7 +73,7 @@ jobs:
       script: "ci/test_python_other.sh"
   conda-java-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -85,7 +85,7 @@ jobs:
       run_script: "ci/test_java.sh"
   conda-notebook-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -97,7 +97,7 @@ jobs:
       run_script: "ci/test_notebooks.sh"
   wheel-tests-cudf:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -106,7 +106,7 @@ jobs:
       script: ci/test_wheel_cudf.sh
   wheel-tests-dask-cudf:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
@@ -117,7 +117,7 @@ jobs:
       script: ci/test_wheel_dask_cudf.sh
   unit-tests-cudf-pandas:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.08
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
diff --git a/README.md b/README.md
index 1ab6a2d7457..fd8b0365807 100644
--- a/README.md
+++ b/README.md
@@ -83,7 +83,7 @@ cuDF can be installed with conda (via [miniconda](https://docs.conda.io/projects
 
 ```bash
 conda install -c rapidsai -c conda-forge -c nvidia \
-    cudf=24.08 python=3.11 cuda-version=12.5
+    cudf=24.10 python=3.11 cuda-version=12.5
 ```
 
 We also provide [nightly Conda packages](https://anaconda.org/rapidsai-nightly) built from the HEAD
diff --git a/VERSION b/VERSION
index ec8489fda92..7c7ba04436f 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-24.08.00
+24.10.00
diff --git a/ci/test_wheel_cudf_polars.sh b/ci/test_wheel_cudf_polars.sh
index 900acd5d473..cc9f5788685 100755
--- a/ci/test_wheel_cudf_polars.sh
+++ b/ci/test_wheel_cudf_polars.sh
@@ -10,7 +10,7 @@ set -eou pipefail
 # files in cudf_polars/pylibcudf", rather than "are there changes
 # between upstream and this branch which touch cudf_polars/pylibcudf"
 # TODO: is the target branch exposed anywhere in an environment variable?
-if [ -n "$(git diff --name-only origin/branch-24.08...HEAD -- python/cudf_polars/ python/cudf/cudf/_lib/pylibcudf/)" ];
+if [ -n "$(git diff --name-only origin/branch-24.10...HEAD -- python/cudf_polars/ python/cudf/cudf/_lib/pylibcudf/)" ];
 then
     HAS_CHANGES=1
 else
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index b8d73a01f96..b1a1cc3c68e 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -26,7 +26,7 @@ dependencies:
 - cupy>=12.0.0
 - cxx-compiler
 - cython>=3.0.3
-- dask-cuda==24.8.*,>=0.0.0a0
+- dask-cuda==24.10.*,>=0.0.0a0
 - dlpack>=0.8,<1.0
 - doxygen=1.9.1
 - fastavro>=0.22.9
@@ -43,10 +43,10 @@ dependencies:
 - libcufile=1.4.0.31
 - libcurand-dev=10.3.0.86
 - libcurand=10.3.0.86
-- libkvikio==24.8.*,>=0.0.0a0
+- libkvikio==24.10.*,>=0.0.0a0
 - libparquet==16.1.0.*
 - librdkafka>=1.9.0,<1.10.0a0
-- librmm==24.8.*,>=0.0.0a0
+- librmm==24.10.*,>=0.0.0a0
 - make
 - moto>=4.0.8
 - msgpack-python
@@ -77,9 +77,9 @@ dependencies:
 - python>=3.9,<3.12
 - pytorch>=2.1.0
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
-- rapids-dask-dependency==24.8.*,>=0.0.0a0
+- rapids-dask-dependency==24.10.*,>=0.0.0a0
 - rich
-- rmm==24.8.*,>=0.0.0a0
+- rmm==24.10.*,>=0.0.0a0
 - s3fs>=2022.3.0
 - scikit-build-core>=0.7.0
 - scipy
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index 3f5fae49cbb..1017b11779c 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -27,7 +27,7 @@ dependencies:
 - cupy>=12.0.0
 - cxx-compiler
 - cython>=3.0.3
-- dask-cuda==24.8.*,>=0.0.0a0
+- dask-cuda==24.10.*,>=0.0.0a0
 - dlpack>=0.8,<1.0
 - doxygen=1.9.1
 - fastavro>=0.22.9
@@ -42,10 +42,10 @@ dependencies:
 - libarrow==16.1.0.*
 - libcufile-dev
 - libcurand-dev
-- libkvikio==24.8.*,>=0.0.0a0
+- libkvikio==24.10.*,>=0.0.0a0
 - libparquet==16.1.0.*
 - librdkafka>=1.9.0,<1.10.0a0
-- librmm==24.8.*,>=0.0.0a0
+- librmm==24.10.*,>=0.0.0a0
 - make
 - moto>=4.0.8
 - msgpack-python
@@ -75,9 +75,9 @@ dependencies:
 - python>=3.9,<3.12
 - pytorch>=2.1.0
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
-- rapids-dask-dependency==24.8.*,>=0.0.0a0
+- rapids-dask-dependency==24.10.*,>=0.0.0a0
 - rich
-- rmm==24.8.*,>=0.0.0a0
+- rmm==24.10.*,>=0.0.0a0
 - s3fs>=2022.3.0
 - scikit-build-core>=0.7.0
 - scipy
diff --git a/cpp/examples/versions.cmake b/cpp/examples/versions.cmake
index 144b3d3721b..44493011673 100644
--- a/cpp/examples/versions.cmake
+++ b/cpp/examples/versions.cmake
@@ -12,4 +12,4 @@
 # the License.
 # =============================================================================
 
-set(CUDF_TAG branch-24.08)
+set(CUDF_TAG branch-24.10)
diff --git a/dependencies.yaml b/dependencies.yaml
index a19574b7658..a90ac64387b 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -287,8 +287,8 @@ dependencies:
       - output_types: conda
         packages:
           - fmt>=10.1.1,<11
-          - librmm==24.8.*,>=0.0.0a0
-          - libkvikio==24.8.*,>=0.0.0a0
+          - librmm==24.10.*,>=0.0.0a0
+          - libkvikio==24.10.*,>=0.0.0a0
           - librdkafka>=1.9.0,<1.10.0a0
           # Align nvcomp version with rapids-cmake
           - nvcomp==3.0.6
@@ -329,7 +329,7 @@ dependencies:
     common:
       - output_types: conda
         packages:
-          - &rmm_conda rmm==24.8.*,>=0.0.0a0
+          - &rmm_conda rmm==24.10.*,>=0.0.0a0
           - pip
           - pip:
               - git+https://github.com/python-streamz/streamz.git@master
@@ -345,10 +345,10 @@ dependencies:
         matrices:
           - matrix: {cuda: "12.*"}
             packages: &build_python_packages_cu12
-              - rmm-cu12==24.8.*,>=0.0.0a0
+              - rmm-cu12==24.10.*,>=0.0.0a0
           - matrix: {cuda: "11.*"}
             packages: &build_python_packages_cu11
-              - rmm-cu11==24.8.*,>=0.0.0a0
+              - rmm-cu11==24.10.*,>=0.0.0a0
           - {matrix: null, packages: [*rmm_conda] }
   libarrow_build:
     common:
@@ -505,7 +505,7 @@ dependencies:
       - output_types: [conda]
         packages:
           - breathe>=4.35.0
-          - dask-cuda==24.8.*,>=0.0.0a0
+          - dask-cuda==24.10.*,>=0.0.0a0
           - *doxygen
           - make
           - myst-nb
@@ -597,11 +597,11 @@ dependencies:
         matrices:
           - matrix: {cuda: "12.*"}
             packages:
-              - rmm-cu12==24.8.*,>=0.0.0a0
+              - rmm-cu12==24.10.*,>=0.0.0a0
               - pynvjitlink-cu12>=0.0.0a0
           - matrix: {cuda: "11.*"}
             packages:
-              - rmm-cu11==24.8.*,>=0.0.0a0
+              - rmm-cu11==24.10.*,>=0.0.0a0
               - cubinlinker-cu11
               - ptxcompiler-cu11
           - {matrix: null, packages: [cubinlinker, ptxcompiler, *rmm_conda]}
@@ -614,7 +614,7 @@ dependencies:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
-          - rapids-dask-dependency==24.8.*,>=0.0.0a0
+          - rapids-dask-dependency==24.10.*,>=0.0.0a0
   run_custreamz:
     common:
       - output_types: conda
@@ -700,13 +700,13 @@ dependencies:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
-          - dask-cuda==24.8.*,>=0.0.0a0
+          - dask-cuda==24.10.*,>=0.0.0a0
           - *numba
   depends_on_cudf:
     common:
       - output_types: conda
         packages:
-          - &cudf_conda cudf==24.8.*,>=0.0.0a0
+          - &cudf_conda cudf==24.10.*,>=0.0.0a0
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
@@ -718,16 +718,16 @@ dependencies:
         matrices:
           - matrix: {cuda: "12.*"}
             packages:
-              - cudf-cu12==24.8.*,>=0.0.0a0
+              - cudf-cu12==24.10.*,>=0.0.0a0
           - matrix: {cuda: "11.*"}
             packages:
-              - cudf-cu11==24.8.*,>=0.0.0a0
+              - cudf-cu11==24.10.*,>=0.0.0a0
           - {matrix: null, packages: [*cudf_conda]}
   depends_on_cudf_kafka:
     common:
       - output_types: conda
         packages:
-          - &cudf_kafka_conda cudf_kafka==24.8.*,>=0.0.0a0
+          - &cudf_kafka_conda cudf_kafka==24.10.*,>=0.0.0a0
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
@@ -739,10 +739,10 @@ dependencies:
         matrices:
           - matrix: {cuda: "12.*"}
             packages:
-              - cudf_kafka-cu12==24.8.*,>=0.0.0a0
+              - cudf_kafka-cu12==24.10.*,>=0.0.0a0
           - matrix: {cuda: "11.*"}
             packages:
-              - cudf_kafka-cu11==24.8.*,>=0.0.0a0
+              - cudf_kafka-cu11==24.10.*,>=0.0.0a0
           - {matrix: null, packages: [*cudf_kafka_conda]}
   depends_on_cupy:
     common:
diff --git a/java/ci/README.md b/java/ci/README.md
index 49481efab6b..ccb9efb50b6 100644
--- a/java/ci/README.md
+++ b/java/ci/README.md
@@ -34,7 +34,7 @@ nvidia-docker run -it cudf-build:11.8.0-devel-rocky8 bash
 You can download the cuDF repo in the docker container or you can mount it into the container.
 Here I choose to download again in the container.
 ```bash
-git clone --recursive https://github.com/rapidsai/cudf.git -b branch-24.08
+git clone --recursive https://github.com/rapidsai/cudf.git -b branch-24.10
 ```
 
 ### Build cuDF jar with devtoolset
@@ -47,4 +47,4 @@ scl enable gcc-toolset-11 "java/ci/build-in-docker.sh"
 
 ### The output
 
-You can find the cuDF jar in java/target/ like cudf-24.08.0-SNAPSHOT-cuda11.jar.
+You can find the cuDF jar in java/target/ like cudf-24.10.0-SNAPSHOT-cuda11.jar.
diff --git a/java/pom.xml b/java/pom.xml
index 70230e6bc71..9694e741f16 100644
--- a/java/pom.xml
+++ b/java/pom.xml
@@ -21,7 +21,7 @@
 
     <groupId>ai.rapids</groupId>
     <artifactId>cudf</artifactId>
-    <version>24.08.0-SNAPSHOT</version>
+    <version>24.10.0-SNAPSHOT</version>
 
     <name>cudfjni</name>
     <description>
diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml
index dcb33b1fc1a..da57622dec7 100644
--- a/python/cudf/pyproject.toml
+++ b/python/cudf/pyproject.toml
@@ -31,7 +31,7 @@ dependencies = [
     "ptxcompiler",
     "pyarrow>=16.1.0,<16.2.0a0",
     "rich",
-    "rmm==24.8.*,>=0.0.0a0",
+    "rmm==24.10.*,>=0.0.0a0",
     "typing_extensions>=4.0.0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
@@ -126,7 +126,7 @@ requires = [
     "ninja",
     "numpy==1.23.*",
     "pyarrow==16.1.0.*",
-    "rmm==24.8.*,>=0.0.0a0",
+    "rmm==24.10.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 
 [tool.scikit-build]
diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml
index badfdf06d15..bff1a9b8493 100644
--- a/python/cudf_kafka/pyproject.toml
+++ b/python/cudf_kafka/pyproject.toml
@@ -18,7 +18,7 @@ authors = [
 license = { text = "Apache 2.0" }
 requires-python = ">=3.9"
 dependencies = [
-    "cudf==24.8.*,>=0.0.0a0",
+    "cudf==24.10.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 
 [project.optional-dependencies]
diff --git a/python/cudf_polars/docs/overview.md b/python/cudf_polars/docs/overview.md
index 874bb849747..6cd36136bf8 100644
--- a/python/cudf_polars/docs/overview.md
+++ b/python/cudf_polars/docs/overview.md
@@ -8,7 +8,7 @@ You will need:
    preferred configuration. Or else, use
    [rustup](https://www.rust-lang.org/tools/install)
 2. A [cudf development
-   environment](https://github.com/rapidsai/cudf/blob/branch-24.08/CONTRIBUTING.md#setting-up-your-build-environment).
+   environment](https://github.com/rapidsai/cudf/blob/branch-24.10/CONTRIBUTING.md#setting-up-your-build-environment).
    The combined devcontainer works, or whatever your favourite approach is.
 
 > ![NOTE] These instructions will get simpler as we merge code in.
diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml
index 0b559f7a8e9..393a7510c89 100644
--- a/python/cudf_polars/pyproject.toml
+++ b/python/cudf_polars/pyproject.toml
@@ -19,7 +19,7 @@ authors = [
 license = { text = "Apache 2.0" }
 requires-python = ">=3.9"
 dependencies = [
-    "cudf==24.8.*,>=0.0.0a0",
+    "cudf==24.10.*,>=0.0.0a0",
     "polars>=1.0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
diff --git a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml
index 7b99e041b54..59ce15ac4ef 100644
--- a/python/custreamz/pyproject.toml
+++ b/python/custreamz/pyproject.toml
@@ -20,8 +20,8 @@ license = { text = "Apache 2.0" }
 requires-python = ">=3.9"
 dependencies = [
     "confluent-kafka>=1.9.0,<1.10.0a0",
-    "cudf==24.8.*,>=0.0.0a0",
-    "cudf_kafka==24.8.*,>=0.0.0a0",
+    "cudf==24.10.*,>=0.0.0a0",
+    "cudf_kafka==24.10.*,>=0.0.0a0",
     "streamz",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml
index 9b2e3a5a7b1..4968ff0b076 100644
--- a/python/dask_cudf/pyproject.toml
+++ b/python/dask_cudf/pyproject.toml
@@ -19,12 +19,12 @@ authors = [
 license = { text = "Apache 2.0" }
 requires-python = ">=3.9"
 dependencies = [
-    "cudf==24.8.*,>=0.0.0a0",
+    "cudf==24.10.*,>=0.0.0a0",
     "cupy-cuda11x>=12.0.0",
     "fsspec>=0.6.0",
     "numpy>=1.23,<2.0a0",
     "pandas>=2.0,<2.2.3dev0",
-    "rapids-dask-dependency==24.8.*,>=0.0.0a0",
+    "rapids-dask-dependency==24.10.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
     "Intended Audience :: Developers",
@@ -45,7 +45,7 @@ cudf = "dask_cudf.backends:CudfDXBackendEntrypoint"
 
 [project.optional-dependencies]
 test = [
-    "dask-cuda==24.8.*,>=0.0.0a0",
+    "dask-cuda==24.10.*,>=0.0.0a0",
     "numba>=0.57",
     "pytest-cov",
     "pytest-xdist",

From dc62177a64a5fb4d6521f346ff0f44c2ede740f6 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Fri, 19 Jul 2024 20:17:42 +0100
Subject: [PATCH 541/842] Preserve order in left join for cudf-polars (#16268)

Unlike all other joins, polars provides an ordering guarantee for left joins. By default libcudf does not, so we need to order the gather maps in this case.

While here, because it requires another hard-coding of `int32` for something that should be `size_type`, expose `type_to_id` in cython and plumb it through.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16268
---
 python/cudf/cudf/_lib/pylibcudf/join.pyx      | 15 +----
 .../libcudf/utilities/type_dispatcher.pxd     |  7 +++
 python/cudf/cudf/_lib/pylibcudf/types.pyx     |  7 ++-
 python/cudf/cudf/_lib/types.pyx               |  4 +-
 .../cudf_polars/containers/column.py          |  3 +-
 python/cudf_polars/cudf_polars/dsl/ir.py      | 58 +++++++++++++++++++
 python/cudf_polars/tests/test_join.py         |  2 +-
 7 files changed, 78 insertions(+), 18 deletions(-)
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/type_dispatcher.pxd

diff --git a/python/cudf/cudf/_lib/pylibcudf/join.pyx b/python/cudf/cudf/_lib/pylibcudf/join.pyx
index 308b1b39291..2ded84d84d1 100644
--- a/python/cudf/cudf/_lib/pylibcudf/join.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/join.pyx
@@ -10,12 +10,7 @@ from rmm._lib.device_buffer cimport device_buffer
 from cudf._lib.pylibcudf.libcudf cimport join as cpp_join
 from cudf._lib.pylibcudf.libcudf.column.column cimport column
 from cudf._lib.pylibcudf.libcudf.table.table cimport table
-from cudf._lib.pylibcudf.libcudf.types cimport (
-    data_type,
-    null_equality,
-    size_type,
-    type_id,
-)
+from cudf._lib.pylibcudf.libcudf.types cimport null_equality
 
 from .column cimport Column
 from .table cimport Table
@@ -23,15 +18,11 @@ from .table cimport Table
 
 cdef Column _column_from_gather_map(cpp_join.gather_map_type gather_map):
     # helper to convert a gather map to a Column
-    cdef device_buffer c_empty
-    cdef size_type size = dereference(gather_map.get()).size()
     return Column.from_libcudf(
         move(
             make_unique[column](
-                data_type(type_id.INT32),
-                size,
-                dereference(gather_map.get()).release(),
-                move(c_empty),
+                move(dereference(gather_map.get())),
+                device_buffer(),
                 0
             )
         )
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/type_dispatcher.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/type_dispatcher.pxd
new file mode 100644
index 00000000000..890fca3a662
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/type_dispatcher.pxd
@@ -0,0 +1,7 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from cudf._lib.pylibcudf.libcudf.types cimport type_id
+
+
+cdef extern from "cudf/utilities/type_dispatcher.hpp" namespace "cudf" nogil:
+    cdef type_id type_to_id[T]()
diff --git a/python/cudf/cudf/_lib/pylibcudf/types.pyx b/python/cudf/cudf/_lib/pylibcudf/types.pyx
index 6dbb287f3c4..c45c6071bb3 100644
--- a/python/cudf/cudf/_lib/pylibcudf/types.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/types.pyx
@@ -2,7 +2,8 @@
 
 from libc.stdint cimport int32_t
 
-from cudf._lib.pylibcudf.libcudf.types cimport data_type, type_id
+from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type, type_id
+from cudf._lib.pylibcudf.libcudf.utilities.type_dispatcher cimport type_to_id
 
 from cudf._lib.pylibcudf.libcudf.types import type_id as TypeId  # no-cython-lint, isort:skip
 from cudf._lib.pylibcudf.libcudf.types import nan_policy as NanPolicy  # no-cython-lint, isort:skip
@@ -67,3 +68,7 @@ cdef class DataType:
         cdef DataType ret = DataType.__new__(DataType, type_id.EMPTY)
         ret.c_obj = dt
         return ret
+
+
+SIZE_TYPE = DataType(type_to_id[size_type]())
+SIZE_TYPE_ID = SIZE_TYPE.id()
diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx
index fc672caa574..253fdf7b0d9 100644
--- a/python/cudf/cudf/_lib/types.pyx
+++ b/python/cudf/cudf/_lib/types.pyx
@@ -21,8 +21,6 @@ from cudf._lib.types cimport (
 import cudf
 from cudf._lib import pylibcudf
 
-size_type_dtype = np.dtype("int32")
-
 
 class TypeId(IntEnum):
     EMPTY = <underlying_type_t_type_id> libcudf_types.type_id.EMPTY
@@ -150,6 +148,8 @@ datetime_unit_map = {
     TypeId.TIMESTAMP_NANOSECONDS: "ns",
 }
 
+size_type_dtype = LIBCUDF_TO_SUPPORTED_NUMPY_TYPES[pylibcudf.types.SIZE_TYPE_ID]
+
 
 class Interpolation(IntEnum):
     LINEAR = (
diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py
index 42aba0fcdc0..02018548b2c 100644
--- a/python/cudf_polars/cudf_polars/containers/column.py
+++ b/python/cudf_polars/cudf_polars/containers/column.py
@@ -185,8 +185,7 @@ def nan_count(self) -> int:
                 plc.reduce.reduce(
                     plc.unary.is_nan(self.obj),
                     plc.aggregation.sum(),
-                    # TODO: pylibcudf needs to have a SizeType DataType singleton
-                    plc.DataType(plc.TypeId.INT32),
+                    plc.types.SIZE_TYPE,
                 )
             ).as_py()
         return 0
diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py
index 01834ab75a5..0b14530e0ed 100644
--- a/python/cudf_polars/cudf_polars/dsl/ir.py
+++ b/python/cudf_polars/cudf_polars/dsl/ir.py
@@ -653,6 +653,59 @@ def _joiners(
         else:
             assert_never(how)
 
+    def _reorder_maps(
+        self,
+        left_rows: int,
+        lg: plc.Column,
+        left_policy: plc.copying.OutOfBoundsPolicy,
+        right_rows: int,
+        rg: plc.Column,
+        right_policy: plc.copying.OutOfBoundsPolicy,
+    ) -> list[plc.Column]:
+        """
+        Reorder gather maps to satisfy polars join order restrictions.
+
+        Parameters
+        ----------
+        left_rows
+            Number of rows in left table
+        lg
+            Left gather map
+        left_policy
+            Nullify policy for left map
+        right_rows
+            Number of rows in right table
+        rg
+            Right gather map
+        right_policy
+            Nullify policy for right map
+
+        Returns
+        -------
+        list of reordered left and right gather maps.
+
+        Notes
+        -----
+        For a left join, the polars result preserves the order of the
+        left keys, and is stable wrt the right keys. For all other
+        joins, there is no order obligation.
+        """
+        dt = plc.interop.to_arrow(plc.types.SIZE_TYPE)
+        init = plc.interop.from_arrow(pa.scalar(0, type=dt))
+        step = plc.interop.from_arrow(pa.scalar(1, type=dt))
+        left_order = plc.copying.gather(
+            plc.Table([plc.filling.sequence(left_rows, init, step)]), lg, left_policy
+        )
+        right_order = plc.copying.gather(
+            plc.Table([plc.filling.sequence(right_rows, init, step)]), rg, right_policy
+        )
+        return plc.sorting.stable_sort_by_key(
+            plc.Table([lg, rg]),
+            plc.Table([*left_order.columns(), *right_order.columns()]),
+            [plc.types.Order.ASCENDING, plc.types.Order.ASCENDING],
+            [plc.types.NullOrder.AFTER, plc.types.NullOrder.AFTER],
+        ).columns()
+
     def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
         left = self.left.evaluate(cache=cache)
@@ -693,6 +746,11 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
             result = DataFrame.from_table(table, left.column_names)
         else:
             lg, rg = join_fn(left_on.table, right_on.table, null_equality)
+            if how == "left":
+                # Order of left table is preserved
+                lg, rg = self._reorder_maps(
+                    left.num_rows, lg, left_policy, right.num_rows, rg, right_policy
+                )
             if coalesce and how == "inner":
                 right = right.discard_columns(right_on.column_names_set)
             left = DataFrame.from_table(
diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py
index 89f6fd3455b..1ffbf3c0ef4 100644
--- a/python/cudf_polars/tests/test_join.py
+++ b/python/cudf_polars/tests/test_join.py
@@ -53,7 +53,7 @@ def test_join(how, coalesce, join_nulls, join_expr):
     query = left.join(
         right, on=join_expr, how=how, join_nulls=join_nulls, coalesce=coalesce
     )
-    assert_gpu_result_equal(query, check_row_order=False)
+    assert_gpu_result_equal(query, check_row_order=how == "left")
 
 
 def test_cross_join():

From cb570fe6d7dc7ebdd6c8c030916ba27bef277b5e Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 19 Jul 2024 10:45:30 -1000
Subject: [PATCH 542/842] Deprecate dtype= parameter in reduction methods
 (#16313)

In terms of pandas alignment, this argument doesn't exist in reduction ops. Additionally, the same result can be easily achieved by calling `astype` after the operation, and it appears libcudf does not support any arbitrary casting to an output type.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16313
---
 python/cudf/cudf/_lib/reduce.pyx               | 15 ++++++++++-----
 python/cudf/cudf/core/column/column.py         | 11 ++++++++---
 python/cudf/cudf/core/column/datetime.py       |  9 +++------
 python/cudf/cudf/core/column/numerical.py      | 17 +++++++++--------
 python/cudf/cudf/core/column/numerical_base.py | 11 +++--------
 python/cudf/cudf/core/column/timedelta.py      |  7 +++----
 python/cudf/cudf/tests/test_reductions.py      | 15 +++++++++------
 7 files changed, 45 insertions(+), 40 deletions(-)

diff --git a/python/cudf/cudf/_lib/reduce.pyx b/python/cudf/cudf/_lib/reduce.pyx
index 56bfa0ba332..64634b7a6f9 100644
--- a/python/cudf/cudf/_lib/reduce.pyx
+++ b/python/cudf/cudf/_lib/reduce.pyx
@@ -1,4 +1,5 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
+import warnings
 
 import cudf
 from cudf.core.buffer import acquire_spill_lock
@@ -26,11 +27,15 @@ def reduce(reduction_op, Column incol, dtype=None, **kwargs):
         A numpy data type to use for the output, defaults
         to the same type as the input column
     """
-
-    col_dtype = (
-        dtype if dtype is not None
-        else incol._reduction_result_dtype(reduction_op)
-    )
+    if dtype is not None:
+        warnings.warn(
+            "dtype is deprecated and will be remove in a future release. "
+            "Cast the result (e.g. .astype) after the operation instead.",
+            FutureWarning
+        )
+        col_dtype = dtype
+    else:
+        col_dtype = incol._reduction_result_dtype(reduction_op)
 
     # check empty case
     if len(incol) <= incol.null_count:
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 9467bbeed15..5e77aa87e4e 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -261,7 +261,7 @@ def all(self, skipna: bool = True) -> bool:
         if self.null_count == self.size:
             return True
 
-        return libcudf.reduce.reduce("all", self, dtype=np.bool_)
+        return libcudf.reduce.reduce("all", self)
 
     def any(self, skipna: bool = True) -> bool:
         # Early exit for fast cases.
@@ -271,7 +271,7 @@ def any(self, skipna: bool = True) -> bool:
         elif skipna and self.null_count == self.size:
             return False
 
-        return libcudf.reduce.reduce("any", self, dtype=np.bool_)
+        return libcudf.reduce.reduce("any", self)
 
     def dropna(self) -> Self:
         if self.has_nulls():
@@ -1305,7 +1305,10 @@ def _reduce(
             skipna=skipna, min_count=min_count
         )
         if isinstance(preprocessed, ColumnBase):
-            return libcudf.reduce.reduce(op, preprocessed, **kwargs)
+            dtype = kwargs.pop("dtype", None)
+            return libcudf.reduce.reduce(
+                op, preprocessed, dtype=dtype, **kwargs
+            )
         return preprocessed
 
     def _process_for_reduction(
@@ -1336,6 +1339,8 @@ def _reduction_result_dtype(self, reduction_op: str) -> Dtype:
         Determine the correct dtype to pass to libcudf based on
         the input dtype, data dtype, and specific reduction op
         """
+        if reduction_op in {"any", "all"}:
+            return np.dtype(np.bool_)
         return self.dtype
 
     def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase:
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index 004a059af95..a4538179415 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -485,13 +485,11 @@ def as_string_column(self) -> cudf.core.column.StringColumn:
                 format = format.split(" ")[0]
         return self.strftime(format)
 
-    def mean(
-        self, skipna=None, min_count: int = 0, dtype=np.float64
-    ) -> ScalarLike:
+    def mean(self, skipna=None, min_count: int = 0) -> ScalarLike:
         return pd.Timestamp(
             cast(
                 "cudf.core.column.NumericalColumn", self.astype("int64")
-            ).mean(skipna=skipna, min_count=min_count, dtype=dtype),
+            ).mean(skipna=skipna, min_count=min_count),
             unit=self.time_unit,
         ).as_unit(self.time_unit)
 
@@ -499,12 +497,11 @@ def std(
         self,
         skipna: bool | None = None,
         min_count: int = 0,
-        dtype: Dtype = np.float64,
         ddof: int = 1,
     ) -> pd.Timedelta:
         return pd.Timedelta(
             cast("cudf.core.column.NumericalColumn", self.astype("int64")).std(
-                skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof
+                skipna=skipna, min_count=min_count, ddof=ddof
             )
             * _unit_to_nanoseconds_conversion[self.time_unit],
         ).as_unit(self.time_unit)
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index cea68c88c90..ba080863722 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -395,7 +395,7 @@ def all(self, skipna: bool = True) -> bool:
         if result_col.null_count == result_col.size:
             return True
 
-        return libcudf.reduce.reduce("all", result_col, dtype=np.bool_)
+        return libcudf.reduce.reduce("all", result_col)
 
     def any(self, skipna: bool = True) -> bool:
         # Early exit for fast cases.
@@ -406,7 +406,7 @@ def any(self, skipna: bool = True) -> bool:
         elif skipna and result_col.null_count == result_col.size:
             return False
 
-        return libcudf.reduce.reduce("any", result_col, dtype=np.bool_)
+        return libcudf.reduce.reduce("any", result_col)
 
     @functools.cached_property
     def nan_count(self) -> int:
@@ -684,15 +684,16 @@ def to_pandas(
             return super().to_pandas(nullable=nullable, arrow_type=arrow_type)
 
     def _reduction_result_dtype(self, reduction_op: str) -> Dtype:
-        col_dtype = self.dtype
         if reduction_op in {"sum", "product"}:
-            col_dtype = (
-                col_dtype if col_dtype.kind == "f" else np.dtype("int64")
-            )
+            if self.dtype.kind == "f":
+                return self.dtype
+            return np.dtype("int64")
         elif reduction_op == "sum_of_squares":
-            col_dtype = np.result_dtype(col_dtype, np.dtype("uint64"))
+            return np.result_dtype(self.dtype, np.dtype("uint64"))
+        elif reduction_op in {"var", "std", "mean"}:
+            return np.dtype("float64")
 
-        return col_dtype
+        return super()._reduction_result_dtype(reduction_op)
 
 
 def _normalize_find_and_replace_input(
diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py
index 95c78c5efcb..f41010062c8 100644
--- a/python/cudf/cudf/core/column/numerical_base.py
+++ b/python/cudf/cudf/core/column/numerical_base.py
@@ -144,32 +144,27 @@ def mean(
         self,
         skipna: bool | None = None,
         min_count: int = 0,
-        dtype=np.float64,
     ):
-        return self._reduce(
-            "mean", skipna=skipna, min_count=min_count, dtype=dtype
-        )
+        return self._reduce("mean", skipna=skipna, min_count=min_count)
 
     def var(
         self,
         skipna: bool | None = None,
         min_count: int = 0,
-        dtype=np.float64,
         ddof=1,
     ):
         return self._reduce(
-            "var", skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof
+            "var", skipna=skipna, min_count=min_count, ddof=ddof
         )
 
     def std(
         self,
         skipna: bool | None = None,
         min_count: int = 0,
-        dtype=np.float64,
         ddof=1,
     ):
         return self._reduce(
-            "std", skipna=skipna, min_count=min_count, dtype=dtype, ddof=ddof
+            "std", skipna=skipna, min_count=min_count, ddof=ddof
         )
 
     def median(self, skipna: bool | None = None) -> NumericalBaseColumn:
diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py
index 36d7d9f9614..59ea1cc002c 100644
--- a/python/cudf/cudf/core/column/timedelta.py
+++ b/python/cudf/cudf/core/column/timedelta.py
@@ -287,11 +287,11 @@ def as_timedelta_column(self, dtype: Dtype) -> TimeDeltaColumn:
             return self
         return libcudf.unary.cast(self, dtype=dtype)
 
-    def mean(self, skipna=None, dtype: Dtype = np.float64) -> pd.Timedelta:
+    def mean(self, skipna=None) -> pd.Timedelta:
         return pd.Timedelta(
             cast(
                 "cudf.core.column.NumericalColumn", self.astype("int64")
-            ).mean(skipna=skipna, dtype=dtype),
+            ).mean(skipna=skipna),
             unit=self.time_unit,
         ).as_unit(self.time_unit)
 
@@ -345,12 +345,11 @@ def std(
         self,
         skipna: bool | None = None,
         min_count: int = 0,
-        dtype: Dtype = np.float64,
         ddof: int = 1,
     ) -> pd.Timedelta:
         return pd.Timedelta(
             cast("cudf.core.column.NumericalColumn", self.astype("int64")).std(
-                skipna=skipna, min_count=min_count, ddof=ddof, dtype=dtype
+                skipna=skipna, min_count=min_count, ddof=ddof
             ),
             unit=self.time_unit,
         ).as_unit(self.time_unit)
diff --git a/python/cudf/cudf/tests/test_reductions.py b/python/cudf/cudf/tests/test_reductions.py
index 1247fa362ce..8be6463c699 100644
--- a/python/cudf/cudf/tests/test_reductions.py
+++ b/python/cudf/cudf/tests/test_reductions.py
@@ -248,16 +248,11 @@ def test_sum_masked(nelem):
 
 def test_sum_boolean():
     s = Series(np.arange(100000))
-    got = (s > 1).sum(dtype=np.int32)
+    got = (s > 1).sum()
     expect = 99998
 
     assert expect == got
 
-    got = (s > 1).sum(dtype=np.bool_)
-    expect = True
-
-    assert expect == got
-
 
 def test_date_minmax():
     np_data = np.random.normal(size=10**3)
@@ -371,3 +366,11 @@ def test_reduction_column_multiindex():
     result = df.mean()
     expected = df.to_pandas().mean()
     assert_eq(result, expected)
+
+
+@pytest.mark.parametrize("op", ["sum", "product"])
+def test_dtype_deprecated(op):
+    ser = cudf.Series(range(5))
+    with pytest.warns(FutureWarning):
+        result = getattr(ser, op)(dtype=np.dtype(np.int8))
+    assert isinstance(result, np.int8)

From 3df4ac28423b99e4dd88570da8d55e2e5af2e1bc Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 19 Jul 2024 10:46:18 -1000
Subject: [PATCH 543/842] Remove squeeze argument from groupby (#16312)

In pandas, this argument was deprecated in pandas 1.x and removed in pandas 2.x. xref https://github.com/pandas-dev/pandas/pull/33218

Looks like in cudf this argument was never implemented, so to align with pandas, I think it should be OK to just remove this argument

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16312
---
 python/cudf/cudf/core/dataframe.py     | 2 --
 python/cudf/cudf/core/indexed_frame.py | 6 ------
 python/cudf/cudf/core/series.py        | 2 --
 3 files changed, 10 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index f06e45277e2..8f8baec0af4 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -4306,7 +4306,6 @@ def groupby(
         as_index=True,
         sort=no_default,
         group_keys=False,
-        squeeze=False,
         observed=True,
         dropna=True,
     ):
@@ -4317,7 +4316,6 @@ def groupby(
             as_index,
             sort,
             group_keys,
-            squeeze,
             observed,
             dropna,
         )
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 77675edc0f0..576596f6f7d 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -5249,7 +5249,6 @@ def groupby(
         as_index=True,
         sort=no_default,
         group_keys=False,
-        squeeze=False,
         observed=True,
         dropna=True,
     ):
@@ -5259,11 +5258,6 @@ def groupby(
         if axis not in (0, "index"):
             raise NotImplementedError("axis parameter is not yet implemented")
 
-        if squeeze is not False:
-            raise NotImplementedError(
-                "squeeze parameter is not yet implemented"
-            )
-
         if not observed:
             raise NotImplementedError(
                 "observed parameter is not yet implemented"
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index c9d24890d15..baaa2eb46a1 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -3368,7 +3368,6 @@ def groupby(
         as_index=True,
         sort=no_default,
         group_keys=False,
-        squeeze=False,
         observed=True,
         dropna=True,
     ):
@@ -3379,7 +3378,6 @@ def groupby(
             as_index,
             sort,
             group_keys,
-            squeeze,
             observed,
             dropna,
         )

From 18f5fe0010fd42f604a340cd025a9ca9e122c6f5 Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Fri, 19 Jul 2024 14:41:39 -0700
Subject: [PATCH 544/842] Fix polars for 1.2.1 (#16316)

I think Polars made a breaking change in a patch release.
At least the error we're getting looks like the error from
https://github.com/pola-rs/polars/pull/17606.

Authors:
  - Thomas Li (https://github.com/lithomas1)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16316
---
 python/cudf_polars/cudf_polars/utils/versions.py |  1 +
 python/cudf_polars/tests/test_groupby.py         | 13 ++++++++++++-
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/python/cudf_polars/cudf_polars/utils/versions.py b/python/cudf_polars/cudf_polars/utils/versions.py
index a9ac14c25aa..9807cffb384 100644
--- a/python/cudf_polars/cudf_polars/utils/versions.py
+++ b/python/cudf_polars/cudf_polars/utils/versions.py
@@ -15,6 +15,7 @@
 POLARS_VERSION_GE_10 = POLARS_VERSION >= parse("1.0")
 POLARS_VERSION_GE_11 = POLARS_VERSION >= parse("1.1")
 POLARS_VERSION_GE_12 = POLARS_VERSION >= parse("1.2")
+POLARS_VERSION_GE_121 = POLARS_VERSION >= parse("1.2.1")
 POLARS_VERSION_GT_10 = POLARS_VERSION > parse("1.0")
 POLARS_VERSION_GT_11 = POLARS_VERSION > parse("1.1")
 POLARS_VERSION_GT_12 = POLARS_VERSION > parse("1.2")
diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py
index b650fee5079..a75825ef3d3 100644
--- a/python/cudf_polars/tests/test_groupby.py
+++ b/python/cudf_polars/tests/test_groupby.py
@@ -157,7 +157,18 @@ def test_groupby_nan_minmax_raises(op):
     assert_ir_translation_raises(q, NotImplementedError)
 
 
-@pytest.mark.parametrize("key", [1, pl.col("key1")])
+@pytest.mark.parametrize(
+    "key",
+    [
+        pytest.param(
+            1,
+            marks=pytest.mark.xfail(
+                versions.POLARS_VERSION_GE_121, reason="polars 1.2.1 disallows this"
+            ),
+        ),
+        pl.col("key1"),
+    ],
+)
 @pytest.mark.parametrize(
     "expr",
     [

From fa0d89d9b4b4152b919999b5f01b1e68407469c5 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 19 Jul 2024 11:46:28 -1000
Subject: [PATCH 545/842] Clean unneeded/redudant dtype utils (#16309)

* Replace `min_scalar_type` with `min_signed_type` (the former just called the latter)
* Replace `numeric_normalize_types` with `find_common_dtype` followed by a column `astype`
* Removed `_NUMPY_SCTYPES` with just hardcoding the integer/floating types or using `np.integer`/`np.floating`

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16309
---
 python/cudf/cudf/core/column/column.py    |  6 +++---
 python/cudf/cudf/core/column/numerical.py | 12 +++++++----
 python/cudf/cudf/core/dataframe.py        | 22 ++++---------------
 python/cudf/cudf/core/index.py            | 22 +++++++++----------
 python/cudf/cudf/utils/dtypes.py          | 26 ++++++-----------------
 5 files changed, 32 insertions(+), 56 deletions(-)

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 5e77aa87e4e..89f0f79cb7c 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -71,7 +71,7 @@
     get_time_unit,
     is_column_like,
     is_mixed_with_object_dtype,
-    min_scalar_type,
+    min_signed_type,
     min_unsigned_type,
 )
 from cudf.utils.utils import _array_ufunc, mask_dtype
@@ -1356,7 +1356,7 @@ def _label_encoding(
         self,
         cats: ColumnBase,
         dtype: Dtype | None = None,
-        na_sentinel: ScalarLike | None = None,
+        na_sentinel: cudf.Scalar | None = None,
     ):
         """
         Convert each value in `self` into an integer code, with `cats`
@@ -1396,7 +1396,7 @@ def _return_sentinel_column():
             return as_column(na_sentinel, dtype=dtype, length=len(self))
 
         if dtype is None:
-            dtype = min_scalar_type(max(len(cats), na_sentinel), 8)
+            dtype = min_signed_type(max(len(cats), na_sentinel.value), 8)
 
         if is_mixed_with_object_dtype(self, cats):
             return _return_sentinel_column()
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index ba080863722..b55284f1aff 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -29,10 +29,10 @@
 from cudf.core.mixins import BinaryOperand
 from cudf.errors import MixedTypeError
 from cudf.utils.dtypes import (
+    find_common_type,
     min_column_type,
     min_signed_type,
     np_dtypes_to_pandas_dtypes,
-    numeric_normalize_types,
 )
 
 from .numerical_base import NumericalBaseColumn
@@ -517,11 +517,15 @@ def find_and_replace(
             )
         elif len(replacement_col) == 1 and len(to_replace_col) == 0:
             return self.copy()
-        to_replace_col, replacement_col, replaced = numeric_normalize_types(
-            to_replace_col, replacement_col, self
+        common_type = find_common_type(
+            (to_replace_col.dtype, replacement_col.dtype, self.dtype)
         )
+        replaced = self.astype(common_type)
         df = cudf.DataFrame._from_data(
-            {"old": to_replace_col, "new": replacement_col}
+            {
+                "old": to_replace_col.astype(common_type),
+                "new": replacement_col.astype(common_type),
+            }
         )
         df = df.drop_duplicates(subset=["old"], keep="last", ignore_index=True)
         if df._data["old"].null_count == 1:
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 8f8baec0af4..904bd4ccb2e 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -83,8 +83,7 @@
     cudf_dtype_from_pydata_dtype,
     find_common_type,
     is_column_like,
-    min_scalar_type,
-    numeric_normalize_types,
+    min_signed_type,
 )
 from cudf.utils.performance_tracking import _performance_tracking
 from cudf.utils.utils import GetAttrGetItemMixin, _external_only_api
@@ -103,20 +102,6 @@
     "var": "nanvar",
 }
 
-_numeric_reduction_ops = (
-    "mean",
-    "min",
-    "max",
-    "sum",
-    "product",
-    "prod",
-    "std",
-    "var",
-    "kurtosis",
-    "kurt",
-    "skew",
-)
-
 
 def _shape_mismatch_error(x, y):
     raise ValueError(
@@ -923,7 +908,8 @@ def _init_from_series_list(self, data, columns, index):
             final_index = ensure_index(index)
 
         series_lengths = list(map(len, data))
-        data = numeric_normalize_types(*data)
+        common_dtype = find_common_type([obj.dtype for obj in data])
+        data = [obj.astype(common_dtype) for obj in data]
         if series_lengths.count(series_lengths[0]) == len(series_lengths):
             # Calculating the final dataframe columns by
             # getting union of all `index` of the Series objects.
@@ -8304,7 +8290,7 @@ def _find_common_dtypes_and_categories(non_null_columns, dtypes):
             )._column.unique()
             # Set the column dtype to the codes' dtype. The categories
             # will be re-assigned at the end
-            dtypes[idx] = min_scalar_type(len(categories[idx]))
+            dtypes[idx] = min_signed_type(len(categories[idx]))
         # Otherwise raise an error if columns have different dtypes
         elif not all(is_dtype_equal(c.dtype, dtypes[idx]) for c in cols):
             raise ValueError("All columns must be the same type")
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 4164f981fca..cd52a34e35e 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -52,11 +52,9 @@
 from cudf.core.single_column_frame import SingleColumnFrame
 from cudf.utils.docutils import copy_docstring
 from cudf.utils.dtypes import (
-    _NUMPY_SCTYPES,
     _maybe_convert_to_default_type,
     find_common_type,
     is_mixed_with_object_dtype,
-    numeric_normalize_types,
 )
 from cudf.utils.performance_tracking import _performance_tracking
 from cudf.utils.utils import _warn_no_dask_cudf, search_range
@@ -357,12 +355,10 @@ def _data(self):
     @_performance_tracking
     def __contains__(self, item):
         hash(item)
-        if isinstance(item, bool) or not isinstance(
-            item,
-            tuple(
-                _NUMPY_SCTYPES["int"] + _NUMPY_SCTYPES["float"] + [int, float]
-            ),
-        ):
+        if not isinstance(item, (np.floating, np.integer, int, float)):
+            return False
+        elif isinstance(item, (np.timedelta64, np.datetime64, bool)):
+            # Cases that would pass the above check
             return False
         try:
             int_item = int(item)
@@ -1601,9 +1597,13 @@ def append(self, other):
                         f"either one of them to same dtypes."
                     )
 
-                if isinstance(self._values, cudf.core.column.NumericalColumn):
-                    if self.dtype != other.dtype:
-                        this, other = numeric_normalize_types(self, other)
+                if (
+                    isinstance(self._column, cudf.core.column.NumericalColumn)
+                    and self.dtype != other.dtype
+                ):
+                    common_type = find_common_type((self.dtype, other.dtype))
+                    this = this.astype(common_type)
+                    other = other.astype(common_type)
                 to_concat = [this, other]
 
         return self._concat(to_concat)
diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py
index af912bee342..69c268db149 100644
--- a/python/cudf/cudf/utils/dtypes.py
+++ b/python/cudf/cudf/utils/dtypes.py
@@ -89,10 +89,6 @@
 BOOL_TYPES = {"bool"}
 ALL_TYPES = NUMERIC_TYPES | DATETIME_TYPES | TIMEDELTA_TYPES | OTHER_TYPES
 
-# The NumPy scalar types are a bit of a mess as they align with the C types
-# so for now we use the `sctypes` dict (although it was made private in 2.0)
-_NUMPY_SCTYPES = np.sctypes if hasattr(np, "sctypes") else np._core.sctypes
-
 
 def np_to_pa_dtype(dtype):
     """Util to convert numpy dtype to PyArrow dtype."""
@@ -114,12 +110,6 @@ def np_to_pa_dtype(dtype):
     return _np_pa_dtypes[cudf.dtype(dtype).type]
 
 
-def numeric_normalize_types(*args):
-    """Cast all args to a common type using numpy promotion logic"""
-    dtype = np.result_type(*[a.dtype for a in args])
-    return [a.astype(dtype) for a in args]
-
-
 def _find_common_type_decimal(dtypes):
     # Find the largest scale and the largest difference between
     # precision and scale of the columns to be concatenated
@@ -330,32 +320,28 @@ def can_convert_to_column(obj):
     return is_column_like(obj) or cudf.api.types.is_list_like(obj)
 
 
-def min_scalar_type(a, min_size=8):
-    return min_signed_type(a, min_size=min_size)
-
-
-def min_signed_type(x, min_size=8):
+def min_signed_type(x: int, min_size: int = 8) -> np.dtype:
     """
     Return the smallest *signed* integer dtype
     that can represent the integer ``x``
     """
-    for int_dtype in _NUMPY_SCTYPES["int"]:
+    for int_dtype in (np.int8, np.int16, np.int32, np.int64):
         if (cudf.dtype(int_dtype).itemsize * 8) >= min_size:
             if np.iinfo(int_dtype).min <= x <= np.iinfo(int_dtype).max:
-                return int_dtype
+                return np.dtype(int_dtype)
     # resort to using `int64` and let numpy raise appropriate exception:
     return np.int64(x).dtype
 
 
-def min_unsigned_type(x, min_size=8):
+def min_unsigned_type(x: int, min_size: int = 8) -> np.dtype:
     """
     Return the smallest *unsigned* integer dtype
     that can represent the integer ``x``
     """
-    for int_dtype in _NUMPY_SCTYPES["uint"]:
+    for int_dtype in (np.uint8, np.uint16, np.uint32, np.uint64):
         if (cudf.dtype(int_dtype).itemsize * 8) >= min_size:
             if 0 <= x <= np.iinfo(int_dtype).max:
-                return int_dtype
+                return np.dtype(int_dtype)
     # resort to using `uint64` and let numpy raise appropriate exception:
     return np.uint64(x).dtype
 

From 910989eb8fb87b2e896aa032260705c27cce71e0 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Fri, 19 Jul 2024 15:48:37 -0600
Subject: [PATCH 546/842] Rename gather/scatter benchmarks to clarify coalesced
 behavior. (#16083)

The benchmark names `coalesce_x` and `coalesce_o` are not very clear. This PR renames them to `coalesced` and `shuffled`. This was discussed with @GregoryKimball.

Authors:
  - Bradley Dice (https://github.com/bdice)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Karthikeyan (https://github.com/karthikeyann)
  - Mike Wilson (https://github.com/hyperbolic2346)

URL: https://github.com/rapidsai/cudf/pull/16083
---
 cpp/benchmarks/copying/gather.cu              | 6 +++---
 cpp/benchmarks/copying/scatter.cu             | 6 +++---
 cpp/benchmarks/lists/copying/scatter_lists.cu | 6 +++---
 3 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/cpp/benchmarks/copying/gather.cu b/cpp/benchmarks/copying/gather.cu
index eeb0149fb3a..985166f7298 100644
--- a/cpp/benchmarks/copying/gather.cu
+++ b/cpp/benchmarks/copying/gather.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -71,5 +71,5 @@ void BM_gather(benchmark::State& state)
     ->Ranges({{1 << 10, 1 << 26}, {1, 8}})                     \
     ->UseManualTime();
 
-GBM_BENCHMARK_DEFINE(double_coalesce_x, double, true);
-GBM_BENCHMARK_DEFINE(double_coalesce_o, double, false);
+GBM_BENCHMARK_DEFINE(double_coalesced, double, true);
+GBM_BENCHMARK_DEFINE(double_shuffled, double, false);
diff --git a/cpp/benchmarks/copying/scatter.cu b/cpp/benchmarks/copying/scatter.cu
index a521dc82739..c27480b69f4 100644
--- a/cpp/benchmarks/copying/scatter.cu
+++ b/cpp/benchmarks/copying/scatter.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -74,5 +74,5 @@ void BM_scatter(benchmark::State& state)
     ->Ranges({{1 << 10, 1 << 25}, {1, 8}})                      \
     ->UseManualTime();
 
-SBM_BENCHMARK_DEFINE(double_coalesce_x, double, true);
-SBM_BENCHMARK_DEFINE(double_coalesce_o, double, false);
+SBM_BENCHMARK_DEFINE(double_coalesced, double, true);
+SBM_BENCHMARK_DEFINE(double_shuffled, double, false);
diff --git a/cpp/benchmarks/lists/copying/scatter_lists.cu b/cpp/benchmarks/lists/copying/scatter_lists.cu
index dbc3234dabf..570decf410f 100644
--- a/cpp/benchmarks/lists/copying/scatter_lists.cu
+++ b/cpp/benchmarks/lists/copying/scatter_lists.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -143,5 +143,5 @@ void BM_lists_scatter(::benchmark::State& state)
     ->Ranges({{1 << 10, 1 << 25}, {64, 2048}}) /* 1K-1B rows, 64-2048 elements */ \
     ->UseManualTime();
 
-SBM_BENCHMARK_DEFINE(double_type_colesce_o, double, true);
-SBM_BENCHMARK_DEFINE(double_type_colesce_x, double, false);
+SBM_BENCHMARK_DEFINE(double_coalesced, double, true);
+SBM_BENCHMARK_DEFINE(double_shuffled, double, false);

From 6e37afc7c9e177b307c41950e52453bd5906af44 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 19 Jul 2024 11:52:27 -1000
Subject: [PATCH 547/842] Make __bool__ raise for more cudf objects (#16311)

To match pandas, this PR makes `DataFrame`, `MultiIndex` and `RangeIndex` raise on `__bool__`.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16311
---
 python/cudf/cudf/core/_base_index.py         | 6 ++++++
 python/cudf/cudf/core/frame.py               | 6 ++++++
 python/cudf/cudf/core/single_column_frame.py | 6 ------
 python/cudf/cudf/tests/test_csv.py           | 2 +-
 python/cudf/cudf/tests/test_dataframe.py     | 9 +++++++++
 python/cudf/cudf/tests/test_index.py         | 9 +++++++++
 python/cudf/cudf/tests/test_multiindex.py    | 9 +++++++++
 7 files changed, 40 insertions(+), 7 deletions(-)

diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py
index 479f87bb78b..657acc41b18 100644
--- a/python/cudf/cudf/core/_base_index.py
+++ b/python/cudf/cudf/core/_base_index.py
@@ -62,6 +62,12 @@ def copy(self, deep: bool = True) -> Self:
     def __len__(self):
         raise NotImplementedError
 
+    def __bool__(self):
+        raise ValueError(
+            f"The truth value of a {type(self).__name__} is ambiguous. Use "
+            "a.empty, a.bool(), a.item(), a.any() or a.all()."
+        )
+
     @property
     def size(self):
         # The size of an index is always its length irrespective of dimension.
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 111225a5fc2..e3a2e840902 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -1587,6 +1587,12 @@ def __pos__(self):
     def __abs__(self):
         return self._unaryop("abs")
 
+    def __bool__(self):
+        raise ValueError(
+            f"The truth value of a {type(self).__name__} is ambiguous. Use "
+            "a.empty, a.bool(), a.item(), a.any() or a.all()."
+        )
+
     # Reductions
     @classmethod
     @_performance_tracking
diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py
index 04c7db7a53c..7efe13d9b45 100644
--- a/python/cudf/cudf/core/single_column_frame.py
+++ b/python/cudf/cudf/core/single_column_frame.py
@@ -91,12 +91,6 @@ def shape(self) -> tuple[int]:
         """Get a tuple representing the dimensionality of the Index."""
         return (len(self),)
 
-    def __bool__(self):
-        raise TypeError(
-            f"The truth value of a {type(self)} is ambiguous. Use "
-            "a.empty, a.bool(), a.item(), a.any() or a.all()."
-        )
-
     @property  # type: ignore
     @_performance_tracking
     def _num_columns(self) -> int:
diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
index a22a627523f..0525b02b698 100644
--- a/python/cudf/cudf/tests/test_csv.py
+++ b/python/cudf/cudf/tests/test_csv.py
@@ -1617,7 +1617,7 @@ def test_csv_reader_partial_dtype(dtype):
         StringIO('"A","B","C"\n0,1,2'), dtype=dtype, usecols=["A", "C"]
     )
 
-    assert names_df == header_df
+    assert_eq(names_df, header_df)
     assert all(names_df.dtypes == ["int16", "int64"])
 
 
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 2009fc49ce5..53ed5d728cb 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -11100,3 +11100,12 @@ def test_from_records_with_index_no_shallow_copy():
     data = np.array([(1.0, 2), (3.0, 4)], dtype=[("x", "<f8"), ("y", "<i8")])
     df = cudf.DataFrame(data.view(np.recarray), index=idx)
     assert df.index is idx
+
+
+def test_bool_raises():
+    assert_exceptions_equal(
+        lfunc=bool,
+        rfunc=bool,
+        lfunc_args_and_kwargs=[[cudf.DataFrame()]],
+        rfunc_args_and_kwargs=[[pd.DataFrame()]],
+    )
diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py
index 9eba6122d26..722a64cb553 100644
--- a/python/cudf/cudf/tests/test_index.py
+++ b/python/cudf/cudf/tests/test_index.py
@@ -3294,3 +3294,12 @@ def test_index_assignment_no_shallow_copy(index):
     df = cudf.DataFrame(range(1))
     df.index = index
     assert df.index is index
+
+
+def test_bool_rangeindex_raises():
+    assert_exceptions_equal(
+        lfunc=bool,
+        rfunc=bool,
+        lfunc_args_and_kwargs=[[pd.RangeIndex(0)]],
+        rfunc_args_and_kwargs=[[cudf.RangeIndex(0)]],
+    )
diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py
index 1941eec91eb..2c00d48266c 100644
--- a/python/cudf/cudf/tests/test_multiindex.py
+++ b/python/cudf/cudf/tests/test_multiindex.py
@@ -2161,3 +2161,12 @@ def test_nunique(array, dropna):
     result = gidx.nunique(dropna=dropna)
     expected = pidx.nunique(dropna=dropna)
     assert result == expected
+
+
+def test_bool_raises():
+    assert_exceptions_equal(
+        lfunc=bool,
+        rfunc=bool,
+        lfunc_args_and_kwargs=[[cudf.MultiIndex.from_arrays([range(1)])]],
+        rfunc_args_and_kwargs=[[pd.MultiIndex.from_arrays([range(1)])]],
+    )

From ecc27a1140c0c287091f6a1291dfaf7ccd82cb19 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 19 Jul 2024 11:55:40 -1000
Subject: [PATCH 548/842] Align more DataFrame APIs with pandas (#16310)

I have a script that did some signature comparisons between `pandas.DataFrame` and `cudf.DataFrame` API and it appears some signatures have changed between the pandas 1.x and 2.x release. The API changes in this PR are mostly adding implementations or adding missing keyword argument (although they might not be implemented). The APIs affected are:

* `__init__`
* `__array__`
* `__arrow_c_stream__`
* `to_dict`
* `where`
* `add_prefix`
* `join`
* `apply`
* `to_records`
* `from_records`
* `unstack`
* `pct_change`
* `sort_values`

Marking as breaking as I ensured some added keywords are in the same positions as pandas and therefore might break users who are using purely positional arguments.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16310
---
 python/cudf/cudf/core/dataframe.py     | 169 +++++++++++++++++++++++--
 python/cudf/cudf/core/frame.py         |   2 +-
 python/cudf/cudf/core/indexed_frame.py |  13 +-
 python/cudf/cudf/core/reshape.py       |   7 +-
 python/cudf/cudf/core/series.py        |  32 ++++-
 5 files changed, 202 insertions(+), 21 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 904bd4ccb2e..7e07078c95b 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -594,6 +594,9 @@ class DataFrame(IndexedFrame, Serializable, GetAttrGetItemMixin):
     dtype : dtype, default None
         Data type to force. Only a single dtype is allowed.
         If None, infer.
+    copy : bool or None, default None
+        Copy data from inputs.
+        Currently not implemented.
     nan_as_null : bool, Default True
         If ``None``/``True``, converts ``np.nan`` values to
         ``null`` values.
@@ -680,8 +683,11 @@ def __init__(
         index=None,
         columns=None,
         dtype=None,
+        copy=None,
         nan_as_null=no_default,
     ):
+        if copy is not None:
+            raise NotImplementedError("copy is not currently implemented.")
         super().__init__()
         if nan_as_null is no_default:
             nan_as_null = not cudf.get_option("mode.pandas_compatible")
@@ -1524,6 +1530,25 @@ def __array_function__(self, func, types, args, kwargs):
             pass
         return NotImplemented
 
+    def __arrow_c_stream__(self, requested_schema=None):
+        """
+        Export the cudf DataFrame as an Arrow C stream PyCapsule.
+
+        Parameters
+        ----------
+        requested_schema : PyCapsule, default None
+            The schema to which the dataframe should be casted, passed as a
+            PyCapsule containing a C ArrowSchema representation of the
+            requested schema. Currently not implemented.
+
+        Returns
+        -------
+        PyCapsule
+        """
+        if requested_schema is not None:
+            raise NotImplementedError("requested_schema is not supported")
+        return self.to_arrow().__arrow_c_stream__()
+
     # The _get_numeric_data method is necessary for dask compatibility.
     @_performance_tracking
     def _get_numeric_data(self):
@@ -2235,6 +2260,7 @@ def to_dict(
         self,
         orient: str = "dict",
         into: type[dict] = dict,
+        index: bool = True,
     ) -> dict | list[dict]:
         """
         Convert the DataFrame to a dictionary.
@@ -2268,6 +2294,13 @@ def to_dict(
             instance of the mapping type you want.  If you want a
             collections.defaultdict, you must pass it initialized.
 
+        index : bool, default True
+            Whether to include the index item (and index_names item if `orient`
+            is 'tight') in the returned dictionary. Can only be ``False``
+            when `orient` is 'split' or 'tight'. Note that when `orient` is
+            'records', this parameter does not take effect (index item always
+            not included).
+
         Returns
         -------
         dict, list or collections.abc.Mapping
@@ -2349,7 +2382,7 @@ def to_dict(
                 raise TypeError(f"unsupported type: {into}")
             return cons(self.items())  # type: ignore[misc]
 
-        return self.to_pandas().to_dict(orient=orient, into=into)
+        return self.to_pandas().to_dict(orient=orient, into=into, index=index)
 
     @_performance_tracking
     def scatter_by_map(
@@ -3004,7 +3037,12 @@ def fillna(
         )
 
     @_performance_tracking
-    def where(self, cond, other=None, inplace=False):
+    def where(self, cond, other=None, inplace=False, axis=None, level=None):
+        if axis is not None:
+            raise NotImplementedError("axis is not supported.")
+        elif level is not None:
+            raise NotImplementedError("level is not supported.")
+
         from cudf.core._internals.where import (
             _check_and_cast_columns_with_other,
             _make_categorical_like,
@@ -3614,7 +3652,9 @@ def rename(
         return result
 
     @_performance_tracking
-    def add_prefix(self, prefix):
+    def add_prefix(self, prefix, axis=None):
+        if axis is not None:
+            raise NotImplementedError("axis is currently not implemented.")
         # TODO: Change to deep=False when copy-on-write is default
         out = self.copy(deep=True)
         out.columns = [
@@ -4230,6 +4270,7 @@ def join(
         lsuffix="",
         rsuffix="",
         sort=False,
+        validate: str | None = None,
     ):
         """Join columns with other DataFrame on index or on a key column.
 
@@ -4243,6 +4284,16 @@ def join(
             column names when avoiding conflicts.
         sort : bool
             Set to True to ensure sorted ordering.
+        validate : str, optional
+            If specified, checks if join is of specified type.
+
+            * "one_to_one" or "1:1": check if join keys are unique in both left
+              and right datasets.
+            * "one_to_many" or "1:m": check if join keys are unique in left dataset.
+            * "many_to_one" or "m:1": check if join keys are unique in right dataset.
+            * "many_to_many" or "m:m": allowed, but does not result in checks.
+
+            Currently not supported.
 
         Returns
         -------
@@ -4256,6 +4307,10 @@ def join(
         """
         if on is not None:
             raise NotImplementedError("The on parameter is not yet supported")
+        elif validate is not None:
+            raise NotImplementedError(
+                "The validate parameter is not yet supported"
+            )
 
         df = self.merge(
             other,
@@ -4404,7 +4459,16 @@ def query(self, expr, local_dict=None):
 
     @_performance_tracking
     def apply(
-        self, func, axis=1, raw=False, result_type=None, args=(), **kwargs
+        self,
+        func,
+        axis=1,
+        raw=False,
+        result_type=None,
+        args=(),
+        by_row: Literal[False, "compat"] = "compat",
+        engine: Literal["python", "numba"] = "python",
+        engine_kwargs: dict[str, bool] | None = None,
+        **kwargs,
     ):
         """
         Apply a function along an axis of the DataFrame.
@@ -4432,6 +4496,25 @@ def apply(
             Not yet supported
         args: tuple
             Positional arguments to pass to func in addition to the dataframe.
+        by_row : False or "compat", default "compat"
+            Only has an effect when ``func`` is a listlike or dictlike of funcs
+            and the func isn't a string.
+            If "compat", will if possible first translate the func into pandas
+            methods (e.g. ``Series().apply(np.sum)`` will be translated to
+            ``Series().sum()``). If that doesn't work, will try call to apply again with
+            ``by_row=True`` and if that fails, will call apply again with
+            ``by_row=False`` (backward compatible).
+            If False, the funcs will be passed the whole Series at once.
+
+            Currently not supported.
+
+        engine : {'python', 'numba'}, default 'python'
+            Unused. Added for compatibility with pandas.
+        engine_kwargs : dict
+            Unused. Added for compatibility with pandas.
+        **kwargs
+            Additional keyword arguments to pass as keywords arguments to
+            `func`.
 
         Examples
         --------
@@ -4582,13 +4665,17 @@ def apply(
         <https://docs.rapids.ai/api/cudf/stable/user_guide/guide-to-udfs.html>
         """
         if axis != 1:
-            raise ValueError(
+            raise NotImplementedError(
                 "DataFrame.apply currently only supports row wise ops"
             )
         if raw:
-            raise ValueError("The `raw` kwarg is not yet supported.")
+            raise NotImplementedError("The `raw` kwarg is not yet supported.")
         if result_type is not None:
-            raise ValueError("The `result_type` kwarg is not yet supported.")
+            raise NotImplementedError(
+                "The `result_type` kwarg is not yet supported."
+            )
+        if by_row != "compat":
+            raise NotImplementedError("by_row is currently not supported.")
 
         return self._apply(func, _get_row_kernel, *args, **kwargs)
 
@@ -5489,7 +5576,7 @@ def from_arrow(cls, table):
         return out
 
     @_performance_tracking
-    def to_arrow(self, preserve_index=None):
+    def to_arrow(self, preserve_index=None) -> pa.Table:
         """
         Convert to a PyArrow Table.
 
@@ -5579,18 +5666,36 @@ def to_arrow(self, preserve_index=None):
         return out.replace_schema_metadata(metadata)
 
     @_performance_tracking
-    def to_records(self, index=True):
+    def to_records(self, index=True, column_dtypes=None, index_dtypes=None):
         """Convert to a numpy recarray
 
         Parameters
         ----------
         index : bool
             Whether to include the index in the output.
+        column_dtypes : str, type, dict, default None
+            If a string or type, the data type to store all columns. If
+            a dictionary, a mapping of column names and indices (zero-indexed)
+            to specific data types. Currently not supported.
+        index_dtypes : str, type, dict, default None
+            If a string or type, the data type to store all index levels. If
+            a dictionary, a mapping of index level names and indices
+            (zero-indexed) to specific data types.
+            This mapping is applied only if `index=True`.
+            Currently not supported.
 
         Returns
         -------
         numpy recarray
         """
+        if column_dtypes is not None:
+            raise NotImplementedError(
+                "column_dtypes is currently not supported."
+            )
+        elif index_dtypes is not None:
+            raise NotImplementedError(
+                "column_dtypes is currently not supported."
+            )
         members = [("index", self.index.dtype)] if index else []
         members += [(col, self[col].dtype) for col in self._data.names]
         dtype = np.dtype(members)
@@ -5603,7 +5708,16 @@ def to_records(self, index=True):
 
     @classmethod
     @_performance_tracking
-    def from_records(cls, data, index=None, columns=None, nan_as_null=False):
+    def from_records(
+        cls,
+        data,
+        index=None,
+        exclude=None,
+        columns=None,
+        coerce_float: bool = False,
+        nrows: int | None = None,
+        nan_as_null=False,
+    ):
         """
         Convert structured or record ndarray to DataFrame.
 
@@ -5613,13 +5727,32 @@ def from_records(cls, data, index=None, columns=None, nan_as_null=False):
         index : str, array-like
             The name of the index column in *data*.
             If None, the default index is used.
+        exclude : sequence, default None
+            Columns or fields to exclude.
+            Currently not implemented.
         columns : list of str
             List of column names to include.
+        coerce_float : bool, default False
+            Attempt to convert values of non-string, non-numeric objects (like
+            decimal.Decimal) to floating point, useful for SQL result sets.
+            Currently not implemented.
+        nrows : int, default None
+            Number of rows to read if data is an iterator.
+            Currently not implemented.
 
         Returns
         -------
         DataFrame
         """
+        if exclude is not None:
+            raise NotImplementedError("exclude is currently not supported.")
+        if coerce_float is not False:
+            raise NotImplementedError(
+                "coerce_float is currently not supported."
+            )
+        if nrows is not None:
+            raise NotImplementedError("nrows is currently not supported.")
+
         if data.ndim != 1 and data.ndim != 2:
             raise ValueError(
                 f"records dimension expected 1 or 2 but found {data.ndim}"
@@ -7344,9 +7477,9 @@ def pivot_table(
 
     @_performance_tracking
     @copy_docstring(reshape.unstack)
-    def unstack(self, level=-1, fill_value=None):
+    def unstack(self, level=-1, fill_value=None, sort: bool = True):
         return cudf.core.reshape.unstack(
-            self, level=level, fill_value=fill_value
+            self, level=level, fill_value=fill_value, sort=sort
         )
 
     @_performance_tracking
@@ -7392,7 +7525,12 @@ def explode(self, column, ignore_index=False):
         return super()._explode(column, ignore_index)
 
     def pct_change(
-        self, periods=1, fill_method=no_default, limit=no_default, freq=None
+        self,
+        periods=1,
+        fill_method=no_default,
+        limit=no_default,
+        freq=None,
+        **kwargs,
     ):
         """
         Calculates the percent change between sequential elements
@@ -7417,6 +7555,9 @@ def pct_change(
         freq : str, optional
             Increment to use from time series API.
             Not yet implemented.
+        **kwargs
+            Additional keyword arguments are passed into
+            `DataFrame.shift`.
 
         Returns
         -------
@@ -7462,7 +7603,7 @@ def pct_change(
             data = self.fillna(method=fill_method, limit=limit)
 
         return data.diff(periods=periods) / data.shift(
-            periods=periods, freq=freq
+            periods=periods, freq=freq, **kwargs
         )
 
     def __dataframe__(
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index e3a2e840902..c82e073d7b7 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -389,7 +389,7 @@ def values_host(self) -> np.ndarray:
         return self.to_numpy()
 
     @_performance_tracking
-    def __array__(self, dtype=None):
+    def __array__(self, dtype=None, copy=None):
         raise TypeError(
             "Implicit conversion to a host NumPy array via __array__ is not "
             "allowed, To explicitly construct a GPU matrix, consider using "
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 576596f6f7d..60cd142db4b 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -3302,7 +3302,7 @@ def pad(self, value=None, axis=None, inplace=None, limit=None):
         )
         return self.ffill(value=value, axis=axis, inplace=inplace, limit=limit)
 
-    def add_prefix(self, prefix):
+    def add_prefix(self, prefix, axis=None):
         """
         Prefix labels with string `prefix`.
 
@@ -3464,6 +3464,7 @@ def sort_values(
         kind="quicksort",
         na_position="last",
         ignore_index=False,
+        key=None,
     ):
         """Sort by the values along either axis.
 
@@ -3479,6 +3480,14 @@ def sort_values(
             'first' puts nulls at the beginning, 'last' puts nulls at the end
         ignore_index : bool, default False
             If True, index will not be sorted.
+        key : callable, optional
+            Apply the key function to the values
+            before sorting. This is similar to the ``key`` argument in the
+            builtin ``sorted`` function, with the notable difference that
+            this ``key`` function should be *vectorized*. It should expect a
+            ``Series`` and return a Series with the same shape as the input.
+            It will be applied to each column in `by` independently.
+            Currently not supported.
 
         Returns
         -------
@@ -3518,6 +3527,8 @@ def sort_values(
             )
         if axis != 0:
             raise NotImplementedError("`axis` not currently implemented.")
+        if key is not None:
+            raise NotImplementedError("key is not currently supported.")
 
         if len(self) == 0:
             return self
diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py
index 1120642947b..b538ae34b6f 100644
--- a/python/cudf/cudf/core/reshape.py
+++ b/python/cudf/cudf/core/reshape.py
@@ -1060,7 +1060,7 @@ def pivot(data, columns=None, index=no_default, values=no_default):
     return result
 
 
-def unstack(df, level, fill_value=None):
+def unstack(df, level, fill_value=None, sort: bool = True):
     """
     Pivot one or more levels of the (necessarily hierarchical) index labels.
 
@@ -1080,6 +1080,9 @@ def unstack(df, level, fill_value=None):
         levels of the index to pivot
     fill_value
         Non-functional argument provided for compatibility with Pandas.
+    sort : bool, default True
+        Sort the level(s) in the resulting MultiIndex columns.
+
 
     Returns
     -------
@@ -1156,6 +1159,8 @@ def unstack(df, level, fill_value=None):
 
     if fill_value is not None:
         raise NotImplementedError("fill_value is not supported.")
+    elif sort is False:
+        raise NotImplementedError(f"{sort=} is not supported.")
     if pd.api.types.is_list_like(level):
         if not level:
             return df
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index baaa2eb46a1..b1e63806934 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -2063,6 +2063,7 @@ def sort_values(
         kind="quicksort",
         na_position="last",
         ignore_index=False,
+        key=None,
     ):
         """Sort by the values along either axis.
 
@@ -2076,6 +2077,14 @@ def sort_values(
             'first' puts nulls at the beginning, 'last' puts nulls at the end
         ignore_index : bool, default False
             If True, index will not be sorted.
+        key : callable, optional
+            Apply the key function to the values
+            before sorting. This is similar to the ``key`` argument in the
+            builtin ``sorted`` function, with the notable difference that
+            this ``key`` function should be *vectorized*. It should expect a
+            ``Series`` and return a Series with the same shape as the input.
+            It will be applied to each column in `by` independently.
+            Currently not supported.
 
         Returns
         -------
@@ -2107,6 +2116,7 @@ def sort_values(
             kind=kind,
             na_position=na_position,
             ignore_index=ignore_index,
+            key=key,
         )
 
     @_performance_tracking
@@ -3429,7 +3439,9 @@ def rename(self, index=None, copy=True):
         return Series._from_data(out_data, self.index, name=index)
 
     @_performance_tracking
-    def add_prefix(self, prefix):
+    def add_prefix(self, prefix, axis=None):
+        if axis is not None:
+            raise NotImplementedError("axis is currently not implemented.")
         return Series._from_data(
             # TODO: Change to deep=False when copy-on-write is default
             data=self._data.copy(deep=True),
@@ -3527,7 +3539,12 @@ def explode(self, ignore_index=False):
 
     @_performance_tracking
     def pct_change(
-        self, periods=1, fill_method=no_default, limit=no_default, freq=None
+        self,
+        periods=1,
+        fill_method=no_default,
+        limit=no_default,
+        freq=None,
+        **kwargs,
     ):
         """
         Calculates the percent change between sequential elements
@@ -3552,6 +3569,9 @@ def pct_change(
         freq : str, optional
             Increment to use from time series API.
             Not yet implemented.
+        **kwargs
+            Additional keyword arguments are passed into
+            `Series.shift`.
 
         Returns
         -------
@@ -3596,11 +3616,15 @@ def pct_change(
             warnings.simplefilter("ignore")
             data = self.fillna(method=fill_method, limit=limit)
         diff = data.diff(periods=periods)
-        change = diff / data.shift(periods=periods, freq=freq)
+        change = diff / data.shift(periods=periods, freq=freq, **kwargs)
         return change
 
     @_performance_tracking
-    def where(self, cond, other=None, inplace=False):
+    def where(self, cond, other=None, inplace=False, axis=None, level=None):
+        if axis is not None:
+            raise NotImplementedError("axis is not supported.")
+        elif level is not None:
+            raise NotImplementedError("level is not supported.")
         result_col = super().where(cond, other, inplace)
         return self._mimic_inplace(
             self._from_data_like_self(

From 57ed7fce6742abc96a8fd65216f032bad5937a2f Mon Sep 17 00:00:00 2001
From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com>
Date: Fri, 19 Jul 2024 17:24:55 -0500
Subject: [PATCH 549/842] Add tests for `pylibcudf` binaryops (#15470)

This PR implements a more general approach to testing binaryops that originally came up in https://github.com/rapidsai/cudf/pull/15279. This PR can possibly supersede that one.

Authors:
  - https://github.com/brandon-b-miller

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15470
---
 cpp/include/cudf/binaryop.hpp                 |  11 +
 cpp/src/binaryop/binaryop.cpp                 |   7 +-
 .../binaryop/binop-verify-input-test.cpp      |   4 +-
 python/cudf/cudf/_lib/pylibcudf/binaryop.pxd  |   9 +
 python/cudf/cudf/_lib/pylibcudf/binaryop.pyx  |  35 +
 .../cudf/_lib/pylibcudf/libcudf/binaryop.pxd  |  39 +-
 .../cudf/cudf/pylibcudf_tests/common/utils.py |  10 +
 .../cudf/pylibcudf_tests/test_binaryops.py    | 786 ++++++++++++++++++
 8 files changed, 889 insertions(+), 12 deletions(-)
 create mode 100644 python/cudf/cudf/pylibcudf_tests/test_binaryops.py

diff --git a/cpp/include/cudf/binaryop.hpp b/cpp/include/cudf/binaryop.hpp
index 22dad11e109..c74c91e39c2 100644
--- a/cpp/include/cudf/binaryop.hpp
+++ b/cpp/include/cudf/binaryop.hpp
@@ -290,6 +290,17 @@ cudf::data_type binary_operation_fixed_point_output_type(binary_operator op,
 
 namespace binops {
 
+/**
+ * @brief Returns true if the binary operator is supported for the given input types.
+ *
+ * @param out The output data type
+ * @param lhs The left-hand cudf::data_type
+ * @param rhs The right-hand cudf::data_type
+ * @param op The binary operator
+ * @return true if the binary operator is supported for the given input types
+ */
+bool is_supported_operation(data_type out, data_type lhs, data_type rhs, binary_operator op);
+
 /**
  * @brief Computes output valid mask for op between a column and a scalar
  *
diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp
index 8ac1491547d..3ac8547baad 100644
--- a/cpp/src/binaryop/binaryop.cpp
+++ b/cpp/src/binaryop/binaryop.cpp
@@ -50,6 +50,11 @@
 namespace cudf {
 namespace binops {
 
+bool is_supported_operation(data_type out, data_type lhs, data_type rhs, binary_operator op)
+{
+  return cudf::binops::compiled::is_supported_operation(out, lhs, rhs, op);
+}
+
 /**
  * @brief Computes output valid mask for op between a column and a scalar
  */
@@ -194,7 +199,7 @@ std::unique_ptr<column> binary_operation(LhsType const& lhs,
                                          rmm::device_async_resource_ref mr)
 {
   if constexpr (std::is_same_v<LhsType, column_view> and std::is_same_v<RhsType, column_view>)
-    CUDF_EXPECTS(lhs.size() == rhs.size(), "Column sizes don't match");
+    CUDF_EXPECTS(lhs.size() == rhs.size(), "Column sizes don't match", std::invalid_argument);
 
   if (lhs.type().id() == type_id::STRING and rhs.type().id() == type_id::STRING and
       output_type.id() == type_id::STRING and
diff --git a/cpp/tests/binaryop/binop-verify-input-test.cpp b/cpp/tests/binaryop/binop-verify-input-test.cpp
index 1346dcd4666..def6e94452e 100644
--- a/cpp/tests/binaryop/binop-verify-input-test.cpp
+++ b/cpp/tests/binaryop/binop-verify-input-test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Copyright 2018-2019 BlazingDB, Inc.
  *     Copyright 2018 Christian Noboa Mardini <christian@blazingdb.com>
@@ -42,5 +42,5 @@ TEST_F(BinopVerifyInputTest, Vector_Vector_ErrorSecondOperandVectorZeroSize)
 
   EXPECT_THROW(cudf::binary_operation(
                  lhs, rhs, cudf::binary_operator::ADD, cudf::data_type(cudf::type_id::INT64)),
-               cudf::logic_error);
+               std::invalid_argument);
 }
diff --git a/python/cudf/cudf/_lib/pylibcudf/binaryop.pxd b/python/cudf/cudf/_lib/pylibcudf/binaryop.pxd
index 9a8c8e49dcf..2411e28ac66 100644
--- a/python/cudf/cudf/_lib/pylibcudf/binaryop.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/binaryop.pxd
@@ -1,5 +1,7 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
+from libcpp cimport bool
+
 from cudf._lib.pylibcudf.libcudf.binaryop cimport binary_operator
 
 from .column cimport Column
@@ -22,3 +24,10 @@ cpdef Column binary_operation(
     binary_operator op,
     DataType output_type
 )
+
+cpdef bool is_supported_operation(
+    DataType out,
+    DataType lhs,
+    DataType rhs,
+    binary_operator op
+)
diff --git a/python/cudf/cudf/_lib/pylibcudf/binaryop.pyx b/python/cudf/cudf/_lib/pylibcudf/binaryop.pyx
index c1d669c3c1c..44d9f4ad04a 100644
--- a/python/cudf/cudf/_lib/pylibcudf/binaryop.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/binaryop.pyx
@@ -2,6 +2,7 @@
 
 from cython.operator import dereference
 
+from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 
@@ -84,3 +85,37 @@ cpdef Column binary_operation(
         raise ValueError(f"Invalid arguments {lhs} and {rhs}")
 
     return Column.from_libcudf(move(result))
+
+
+cpdef bool is_supported_operation(
+    DataType out,
+    DataType lhs,
+    DataType rhs,
+    binary_operator op
+):
+    """Check if an operation is supported for the given data types.
+
+    For details, see :cpp:func::is_supported_operation`.
+
+    Parameters
+    ----------
+    out : DataType
+        The output data type.
+    lhs : DataType
+        The left hand side data type.
+    rhs : DataType
+        The right hand side data type.
+    op : BinaryOperator
+        The operation to check.
+    Returns
+    -------
+    bool
+        True if the operation is supported, False otherwise
+    """
+
+    return cpp_binaryop.is_supported_operation(
+        out.c_obj,
+        lhs.c_obj,
+        rhs.c_obj,
+        op
+    )
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/binaryop.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/binaryop.pxd
index 0eda7d34ff9..b34fea6a775 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/binaryop.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/binaryop.pxd
@@ -1,9 +1,11 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libc.stdint cimport int32_t
+from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 
+from cudf._lib.exception_handler cimport cudf_exception_handler
 from cudf._lib.pylibcudf.libcudf.column.column cimport column
 from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
 from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
@@ -19,9 +21,20 @@ cdef extern from "cudf/binaryop.hpp" namespace "cudf" nogil:
         TRUE_DIV
         FLOOR_DIV
         MOD
+        PMOD
         PYMOD
         POW
         INT_POW
+        LOG_BASE
+        ATAN2
+        SHIFT_LEFT
+        SHIFT_RIGHT
+        SHIFT_RIGHT_UNSIGNED
+        BITWISE_AND
+        BITWISE_OR
+        BITWISE_XOR
+        LOGICAL_AND
+        LOGICAL_OR
         EQUAL
         NOT_EQUAL
         LESS
@@ -29,38 +42,46 @@ cdef extern from "cudf/binaryop.hpp" namespace "cudf" nogil:
         LESS_EQUAL
         GREATER_EQUAL
         NULL_EQUALS
+        NULL_MAX
+        NULL_MIN
         NULL_NOT_EQUALS
-        BITWISE_AND
-        BITWISE_OR
-        BITWISE_XOR
-        LOGICAL_AND
-        LOGICAL_OR
         GENERIC_BINARY
+        NULL_LOGICAL_AND
+        NULL_LOGICAL_OR
+        INVALID_BINARY
 
     cdef unique_ptr[column] binary_operation (
         const scalar& lhs,
         const column_view& rhs,
         binary_operator op,
         data_type output_type
-    ) except +
+    ) except +cudf_exception_handler
 
     cdef unique_ptr[column] binary_operation (
         const column_view& lhs,
         const scalar& rhs,
         binary_operator op,
         data_type output_type
-    ) except +
+    ) except +cudf_exception_handler
 
     cdef unique_ptr[column] binary_operation (
         const column_view& lhs,
         const column_view& rhs,
         binary_operator op,
         data_type output_type
-    ) except +
+    ) except +cudf_exception_handler
 
     cdef unique_ptr[column] binary_operation (
         const column_view& lhs,
         const column_view& rhs,
         const string& op,
         data_type output_type
-    ) except +
+    ) except +cudf_exception_handler
+
+cdef extern from "cudf/binaryop.hpp" namespace "cudf::binops" nogil:
+    cdef bool is_supported_operation(
+        data_type output_type,
+        data_type lhs_type,
+        data_type rhs_type,
+        binary_operator op
+    ) except +cudf_exception_handler
diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py
index e029edfa2ed..ed2c5ca06c9 100644
--- a/python/cudf/cudf/pylibcudf_tests/common/utils.py
+++ b/python/cudf/cudf/pylibcudf_tests/common/utils.py
@@ -111,6 +111,16 @@ def _make_fields_nullable(typ):
         lhs = rhs.cast(lhs_type)
 
     if pa.types.is_floating(lhs.type) and pa.types.is_floating(rhs.type):
+        lhs_nans = pa.compute.is_nan(lhs)
+        rhs_nans = pa.compute.is_nan(rhs)
+        assert lhs_nans.equals(rhs_nans)
+
+        if pa.compute.any(lhs_nans) or pa.compute.any(rhs_nans):
+            # masks must be equal at this point
+            mask = pa.compute.fill_null(pa.compute.invert(lhs_nans), True)
+            lhs = lhs.filter(mask)
+            rhs = rhs.filter(mask)
+
         np.testing.assert_array_almost_equal(lhs, rhs)
     else:
         assert lhs.equals(rhs)
diff --git a/python/cudf/cudf/pylibcudf_tests/test_binaryops.py b/python/cudf/cudf/pylibcudf_tests/test_binaryops.py
new file mode 100644
index 00000000000..a83caf39ead
--- /dev/null
+++ b/python/cudf/cudf/pylibcudf_tests/test_binaryops.py
@@ -0,0 +1,786 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import math
+
+import numpy as np
+import pyarrow as pa
+import pytest
+from utils import assert_column_eq
+
+from cudf._lib import pylibcudf as plc
+
+
+def idfn(param):
+    ltype, rtype, outtype, plc_op, _ = param
+    params = (plc_op.name, ltype, rtype, outtype)
+    return "-".join(map(str, params))
+
+
+@pytest.fixture(params=[True, False], ids=["nulls", "no_nulls"])
+def nulls(request):
+    return request.param
+
+
+def make_col(dtype, nulls):
+    if dtype == "int64":
+        data = [1, 2, 3, 4, 5]
+        pa_type = pa.int64()
+    elif dtype == "uint64":
+        data = [1, 2, 3, 4, 5]
+        pa_type = pa.uint64()
+    elif dtype == "float64":
+        data = [1.0, 2.0, 3.0, 4.0, 5.0]
+        pa_type = pa.float64()
+    elif dtype == "bool":
+        data = [True, False, True, False, True]
+        pa_type = pa.bool_()
+    elif dtype == "timestamp64[ns]":
+        data = [
+            np.datetime64("2022-01-01"),
+            np.datetime64("2022-01-02"),
+            np.datetime64("2022-01-03"),
+            np.datetime64("2022-01-04"),
+            np.datetime64("2022-01-05"),
+        ]
+        pa_type = pa.timestamp("ns")
+    elif dtype == "timedelta64[ns]":
+        data = [
+            np.timedelta64(1, "ns"),
+            np.timedelta64(2, "ns"),
+            np.timedelta64(3, "ns"),
+            np.timedelta64(4, "ns"),
+            np.timedelta64(5, "ns"),
+        ]
+        pa_type = pa.duration("ns")
+    else:
+        raise ValueError("Unsupported dtype")
+
+    if nulls:
+        data[3] = None
+
+    return pa.array(data, type=pa_type)
+
+
+@pytest.fixture
+def pa_data(request, nulls):
+    ltype, rtype, outtype = request.param
+    values = make_col(ltype, nulls), make_col(rtype, nulls), outtype
+    return values
+
+
+@pytest.fixture
+def plc_data(pa_data):
+    lhs, rhs, outtype = pa_data
+    return (
+        plc.interop.from_arrow(lhs),
+        plc.interop.from_arrow(rhs),
+        plc.interop.from_arrow(pa.from_numpy_dtype(np.dtype(outtype))),
+    )
+
+
+@pytest.fixture
+def tests(request, nulls):
+    ltype, rtype, py_outtype, plc_op, py_op = request.param
+    pa_lhs, pa_rhs = make_col(ltype, nulls), make_col(rtype, nulls)
+    plc_lhs, plc_rhs = (
+        plc.interop.from_arrow(pa_lhs),
+        plc.interop.from_arrow(pa_rhs),
+    )
+    plc_dtype = plc.interop.from_arrow(
+        pa.from_numpy_dtype(np.dtype(py_outtype))
+    )
+    return (
+        pa_lhs,
+        pa_rhs,
+        py_outtype,
+        plc_lhs,
+        plc_rhs,
+        plc_dtype,
+        py_op,
+        plc_op,
+    )
+
+
+def custom_pyop(func):
+    def wrapper(x, y):
+        x = x.to_pylist()
+        y = y.to_pylist()
+
+        def inner(x, y):
+            if x is None or y is None:
+                return None
+            return func(x, y)
+
+        return pa.array([inner(x, y) for x, y in zip(x, y)])
+
+    return wrapper
+
+
+@custom_pyop
+def py_floordiv(x, y):
+    return x // y
+
+
+@custom_pyop
+def py_pmod(x, y):
+    return (x % y + y) % y
+
+
+@custom_pyop
+def py_mod(x, y):
+    return x % y
+
+
+@custom_pyop
+def py_atan2(x, y):
+    return math.atan2(x, y)
+
+
+@custom_pyop
+def py_shift_right_unsigned(x, y):
+    unsigned_x = np.uint32(x)
+    result = unsigned_x >> y
+    return result
+
+
+@pytest.mark.parametrize(
+    "tests",
+    [
+        (
+            "int64",
+            "int64",
+            "int64",
+            plc.binaryop.BinaryOperator.ADD,
+            pa.compute.add,
+        ),
+        (
+            "int64",
+            "float64",
+            "float64",
+            plc.binaryop.BinaryOperator.ADD,
+            pa.compute.add,
+        ),
+        (
+            "int64",
+            "int64",
+            "datetime64[ns]",
+            plc.binaryop.BinaryOperator.ADD,
+            pa.compute.add,
+        ),
+        (
+            "int64",
+            "int64",
+            "int64",
+            plc.binaryop.BinaryOperator.SUB,
+            pa.compute.subtract,
+        ),
+        (
+            "int64",
+            "float64",
+            "float64",
+            plc.binaryop.BinaryOperator.SUB,
+            pa.compute.subtract,
+        ),
+        (
+            "int64",
+            "int64",
+            "datetime64[ns]",
+            plc.binaryop.BinaryOperator.SUB,
+            pa.compute.subtract,
+        ),
+        (
+            "int64",
+            "int64",
+            "int64",
+            plc.binaryop.BinaryOperator.MUL,
+            pa.compute.multiply,
+        ),
+        (
+            "int64",
+            "float64",
+            "float64",
+            plc.binaryop.BinaryOperator.MUL,
+            pa.compute.multiply,
+        ),
+        (
+            "int64",
+            "int64",
+            "datetime64[ns]",
+            plc.binaryop.BinaryOperator.MUL,
+            pa.compute.multiply,
+        ),
+        (
+            "int64",
+            "int64",
+            "int64",
+            plc.binaryop.BinaryOperator.DIV,
+            pa.compute.divide,
+        ),
+        (
+            "int64",
+            "float64",
+            "float64",
+            plc.binaryop.BinaryOperator.DIV,
+            pa.compute.divide,
+        ),
+        (
+            "int64",
+            "int64",
+            "datetime64[ns]",
+            plc.binaryop.BinaryOperator.DIV,
+            pa.compute.divide,
+        ),
+        (
+            "int64",
+            "int64",
+            "int64",
+            plc.binaryop.BinaryOperator.TRUE_DIV,
+            pa.compute.divide,
+        ),
+        (
+            "int64",
+            "float64",
+            "float64",
+            plc.binaryop.BinaryOperator.TRUE_DIV,
+            pa.compute.divide,
+        ),
+        (
+            "int64",
+            "int64",
+            "timedelta64[ns]",
+            plc.binaryop.BinaryOperator.TRUE_DIV,
+            pa.compute.divide,
+        ),
+        (
+            "int64",
+            "int64",
+            "int64",
+            plc.binaryop.BinaryOperator.FLOOR_DIV,
+            py_floordiv,
+        ),
+        (
+            "int64",
+            "float64",
+            "float64",
+            plc.binaryop.BinaryOperator.FLOOR_DIV,
+            py_floordiv,
+        ),
+        (
+            "int64",
+            "int64",
+            "datetime64[ns]",
+            plc.binaryop.BinaryOperator.FLOOR_DIV,
+            py_floordiv,
+        ),
+        ("int64", "int64", "int64", plc.binaryop.BinaryOperator.MOD, py_mod),
+        (
+            "int64",
+            "float64",
+            "float64",
+            plc.binaryop.BinaryOperator.MOD,
+            py_mod,
+        ),
+        (
+            "int64",
+            "int64",
+            "datetime64[ns]",
+            plc.binaryop.BinaryOperator.MOD,
+            py_mod,
+        ),
+        ("int64", "int64", "int64", plc.binaryop.BinaryOperator.PMOD, py_pmod),
+        (
+            "int64",
+            "float64",
+            "float64",
+            plc.binaryop.BinaryOperator.PMOD,
+            py_pmod,
+        ),
+        (
+            "int64",
+            "int64",
+            "datetime64[ns]",
+            plc.binaryop.BinaryOperator.PMOD,
+            py_pmod,
+        ),
+        ("int64", "int64", "int64", plc.binaryop.BinaryOperator.PYMOD, py_mod),
+        (
+            "int64",
+            "float64",
+            "float64",
+            plc.binaryop.BinaryOperator.PYMOD,
+            py_mod,
+        ),
+        (
+            "int64",
+            "int64",
+            "datetime64[ns]",
+            plc.binaryop.BinaryOperator.PYMOD,
+            py_mod,
+        ),
+        (
+            "int64",
+            "int64",
+            "int64",
+            plc.binaryop.BinaryOperator.POW,
+            pa.compute.power,
+        ),
+        (
+            "int64",
+            "float64",
+            "float64",
+            plc.binaryop.BinaryOperator.POW,
+            pa.compute.power,
+        ),
+        (
+            "int64",
+            "int64",
+            "timedelta64[ns]",
+            plc.binaryop.BinaryOperator.POW,
+            pa.compute.power,
+        ),
+        (
+            "int64",
+            "int64",
+            "int64",
+            plc.binaryop.BinaryOperator.INT_POW,
+            pa.compute.power,
+        ),
+        (
+            "int64",
+            "float64",
+            "float64",
+            plc.binaryop.BinaryOperator.INT_POW,
+            pa.compute.power,
+        ),
+        (
+            "int64",
+            "int64",
+            "datetime64[ns]",
+            plc.binaryop.BinaryOperator.INT_POW,
+            pa.compute.power,
+        ),
+        (
+            "float64",
+            "float64",
+            "float64",
+            plc.binaryop.BinaryOperator.LOG_BASE,
+            pa.compute.logb,
+        ),
+        (
+            "int64",
+            "float64",
+            "float64",
+            plc.binaryop.BinaryOperator.LOG_BASE,
+            pa.compute.logb,
+        ),
+        (
+            "int64",
+            "int64",
+            "timedelta64[ns]",
+            plc.binaryop.BinaryOperator.LOG_BASE,
+            pa.compute.logb,
+        ),
+        (
+            "float64",
+            "float64",
+            "float64",
+            plc.binaryop.BinaryOperator.ATAN2,
+            py_atan2,
+        ),
+        (
+            "int64",
+            "float64",
+            "float64",
+            plc.binaryop.BinaryOperator.ATAN2,
+            py_atan2,
+        ),
+        (
+            "int64",
+            "int64",
+            "timedelta64[ns]",
+            plc.binaryop.BinaryOperator.ATAN2,
+            py_atan2,
+        ),
+        (
+            "int64",
+            "int64",
+            "int64",
+            plc.binaryop.BinaryOperator.SHIFT_LEFT,
+            pa.compute.shift_left,
+        ),
+        (
+            "int64",
+            "float64",
+            "float64",
+            plc.binaryop.BinaryOperator.SHIFT_LEFT,
+            pa.compute.shift_left,
+        ),
+        (
+            "int64",
+            "int64",
+            "datetime64[ns]",
+            plc.binaryop.BinaryOperator.SHIFT_LEFT,
+            pa.compute.shift_left,
+        ),
+        (
+            "int64",
+            "int64",
+            "int64",
+            plc.binaryop.BinaryOperator.SHIFT_RIGHT,
+            pa.compute.shift_right,
+        ),
+        (
+            "int64",
+            "float64",
+            "float64",
+            plc.binaryop.BinaryOperator.SHIFT_RIGHT,
+            pa.compute.shift_right,
+        ),
+        (
+            "int64",
+            "int64",
+            "datetime64[ns]",
+            plc.binaryop.BinaryOperator.SHIFT_RIGHT,
+            pa.compute.shift_right,
+        ),
+        (
+            "int64",
+            "int64",
+            "int64",
+            plc.binaryop.BinaryOperator.SHIFT_RIGHT_UNSIGNED,
+            py_shift_right_unsigned,
+        ),
+        (
+            "int64",
+            "float64",
+            "float64",
+            plc.binaryop.BinaryOperator.SHIFT_RIGHT_UNSIGNED,
+            py_shift_right_unsigned,
+        ),
+        (
+            "int64",
+            "int64",
+            "datetime64[ns]",
+            plc.binaryop.BinaryOperator.SHIFT_RIGHT_UNSIGNED,
+            py_shift_right_unsigned,
+        ),
+        (
+            "int64",
+            "int64",
+            "int64",
+            plc.binaryop.BinaryOperator.BITWISE_AND,
+            pa.compute.bit_wise_and,
+        ),
+        (
+            "int64",
+            "float64",
+            "float64",
+            plc.binaryop.BinaryOperator.BITWISE_AND,
+            pa.compute.bit_wise_and,
+        ),
+        (
+            "int64",
+            "int64",
+            "datetime64[ns]",
+            plc.binaryop.BinaryOperator.BITWISE_AND,
+            pa.compute.bit_wise_and,
+        ),
+        (
+            "int64",
+            "int64",
+            "int64",
+            plc.binaryop.BinaryOperator.BITWISE_OR,
+            pa.compute.bit_wise_or,
+        ),
+        (
+            "int64",
+            "float64",
+            "float64",
+            plc.binaryop.BinaryOperator.BITWISE_OR,
+            pa.compute.bit_wise_or,
+        ),
+        (
+            "int64",
+            "int64",
+            "datetime64[ns]",
+            plc.binaryop.BinaryOperator.BITWISE_OR,
+            pa.compute.bit_wise_or,
+        ),
+        (
+            "int64",
+            "int64",
+            "int64",
+            plc.binaryop.BinaryOperator.BITWISE_XOR,
+            pa.compute.bit_wise_xor,
+        ),
+        (
+            "int64",
+            "float64",
+            "float64",
+            plc.binaryop.BinaryOperator.BITWISE_XOR,
+            pa.compute.bit_wise_xor,
+        ),
+        (
+            "int64",
+            "int64",
+            "datetime64[ns]",
+            plc.binaryop.BinaryOperator.BITWISE_XOR,
+            pa.compute.bit_wise_xor,
+        ),
+        (
+            "int64",
+            "int64",
+            "int64",
+            plc.binaryop.BinaryOperator.LOGICAL_AND,
+            pa.compute.and_,
+        ),
+        (
+            "int64",
+            "float64",
+            "float64",
+            plc.binaryop.BinaryOperator.LOGICAL_AND,
+            pa.compute.and_,
+        ),
+        (
+            "int64",
+            "int64",
+            "int64",
+            plc.binaryop.BinaryOperator.LOGICAL_AND,
+            pa.compute.and_,
+        ),
+        (
+            "int64",
+            "int64",
+            "int64",
+            plc.binaryop.BinaryOperator.LOGICAL_OR,
+            pa.compute.or_,
+        ),
+        (
+            "int64",
+            "float64",
+            "float64",
+            plc.binaryop.BinaryOperator.LOGICAL_OR,
+            pa.compute.or_,
+        ),
+        (
+            "int64",
+            "int64",
+            "int64",
+            plc.binaryop.BinaryOperator.LOGICAL_OR,
+            pa.compute.or_,
+        ),
+        (
+            "int64",
+            "int64",
+            "bool",
+            plc.binaryop.BinaryOperator.EQUAL,
+            pa.compute.equal,
+        ),
+        (
+            "int64",
+            "float64",
+            "float64",
+            plc.binaryop.BinaryOperator.EQUAL,
+            pa.compute.equal,
+        ),
+        (
+            "int64",
+            "int64",
+            "bool",
+            plc.binaryop.BinaryOperator.NOT_EQUAL,
+            pa.compute.not_equal,
+        ),
+        (
+            "int64",
+            "float64",
+            "float64",
+            plc.binaryop.BinaryOperator.NOT_EQUAL,
+            pa.compute.not_equal,
+        ),
+        (
+            "int64",
+            "int64",
+            "bool",
+            plc.binaryop.BinaryOperator.LESS,
+            pa.compute.less,
+        ),
+        (
+            "int64",
+            "float64",
+            "float64",
+            plc.binaryop.BinaryOperator.LESS,
+            pa.compute.less,
+        ),
+        (
+            "int64",
+            "int64",
+            "bool",
+            plc.binaryop.BinaryOperator.GREATER,
+            pa.compute.greater,
+        ),
+        (
+            "int64",
+            "float64",
+            "float64",
+            plc.binaryop.BinaryOperator.GREATER,
+            pa.compute.greater,
+        ),
+        (
+            "int64",
+            "int64",
+            "bool",
+            plc.binaryop.BinaryOperator.LESS_EQUAL,
+            pa.compute.less_equal,
+        ),
+        (
+            "int64",
+            "float64",
+            "float64",
+            plc.binaryop.BinaryOperator.LESS_EQUAL,
+            pa.compute.less_equal,
+        ),
+        (
+            "int64",
+            "int64",
+            "bool",
+            plc.binaryop.BinaryOperator.GREATER_EQUAL,
+            pa.compute.greater_equal,
+        ),
+        (
+            "int64",
+            "float64",
+            "float64",
+            plc.binaryop.BinaryOperator.GREATER_EQUAL,
+            pa.compute.greater_equal,
+        ),
+        (
+            "int64",
+            "int64",
+            "int64",
+            plc.binaryop.BinaryOperator.NULL_EQUALS,
+            pa.compute.equal,
+        ),
+        (
+            "int64",
+            "float64",
+            "float64",
+            plc.binaryop.BinaryOperator.NULL_EQUALS,
+            pa.compute.equal,
+        ),
+        (
+            "int64",
+            "int64",
+            "datetime64[ns]",
+            plc.binaryop.BinaryOperator.NULL_MAX,
+            pa.compute.max_element_wise,
+        ),
+        (
+            "int64",
+            "float64",
+            "float64",
+            plc.binaryop.BinaryOperator.NULL_MAX,
+            pa.compute.max_element_wise,
+        ),
+        (
+            "int64",
+            "int64",
+            "datetime64[ns]",
+            plc.binaryop.BinaryOperator.NULL_MIN,
+            pa.compute.min_element_wise,
+        ),
+        (
+            "int64",
+            "float64",
+            "float64",
+            plc.binaryop.BinaryOperator.NULL_MIN,
+            pa.compute.min_element_wise,
+        ),
+        (
+            "int64",
+            "int64",
+            "int64",
+            plc.binaryop.BinaryOperator.NULL_NOT_EQUALS,
+            pa.compute.not_equal,
+        ),
+        (
+            "int64",
+            "float64",
+            "float64",
+            plc.binaryop.BinaryOperator.NULL_NOT_EQUALS,
+            pa.compute.not_equal,
+        ),
+        (
+            "int64",
+            "int64",
+            "int64",
+            plc.binaryop.BinaryOperator.NULL_LOGICAL_AND,
+            pa.compute.and_,
+        ),
+        (
+            "int64",
+            "float64",
+            "float64",
+            plc.binaryop.BinaryOperator.NULL_LOGICAL_AND,
+            pa.compute.and_,
+        ),
+        (
+            "int64",
+            "int64",
+            "int64",
+            plc.binaryop.BinaryOperator.NULL_LOGICAL_OR,
+            pa.compute.or_,
+        ),
+        (
+            "int64",
+            "float64",
+            "float64",
+            plc.binaryop.BinaryOperator.NULL_LOGICAL_OR,
+            pa.compute.or_,
+        ),
+        (
+            "int64",
+            "int64",
+            "int64",
+            plc.binaryop.BinaryOperator.GENERIC_BINARY,
+            None,
+        ),
+        (
+            "int64",
+            "int64",
+            "int64",
+            plc.binaryop.BinaryOperator.INVALID_BINARY,
+            None,
+        ),
+    ],
+    indirect=True,
+    ids=idfn,
+)
+def test_binaryops(tests):
+    (
+        pa_lhs,
+        pa_rhs,
+        py_outtype,
+        plc_lhs,
+        plc_rhs,
+        plc_outtype,
+        py_op,
+        plc_op,
+    ) = tests
+
+    def get_result():
+        return plc.binaryop.binary_operation(
+            plc_lhs,
+            plc_rhs,
+            plc_op,
+            plc_outtype,
+        )
+
+    if not plc.binaryop.is_supported_operation(
+        plc_outtype, plc_lhs.type(), plc_rhs.type(), plc_op
+    ):
+        with pytest.raises(TypeError):
+            get_result()
+    else:
+        expect = py_op(pa_lhs, pa_rhs).cast(py_outtype)
+        got = get_result()
+        assert_column_eq(expect, got)

From 7d3083254c0503b07f82af32188120f42acef860 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 19 Jul 2024 12:48:39 -1000
Subject: [PATCH 550/842] Replace np.isscalar/issubdtype checks with
 is_scalar/.kind checks (#16275)

* `is_scalar` also handles cudf.Scalars which should be handled internally
* `issubdtype` can largely be replaced by checking the `.kind` attribute on the dtype

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16275
---
 python/cudf/cudf/core/_internals/where.py   |  2 +-
 python/cudf/cudf/core/column/column.py      | 10 +++----
 python/cudf/cudf/core/column/datetime.py    |  2 +-
 python/cudf/cudf/core/column/lists.py       |  9 ++++---
 python/cudf/cudf/core/column/numerical.py   | 28 +++++++-------------
 python/cudf/cudf/core/join/_join_helpers.py | 29 ++++++---------------
 python/cudf/cudf/core/series.py             |  2 +-
 python/cudf/cudf/testing/testing.py         | 10 +++----
 python/cudf/cudf/utils/dtypes.py            |  4 +--
 9 files changed, 37 insertions(+), 59 deletions(-)

diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py
index 6003a0f6aea..18ab32d2c9e 100644
--- a/python/cudf/cudf/core/_internals/where.py
+++ b/python/cudf/cudf/core/_internals/where.py
@@ -47,7 +47,7 @@ def _check_and_cast_columns_with_other(
 
     other_is_scalar = is_scalar(other)
     if other_is_scalar:
-        if isinstance(other, float) and not np.isnan(other):
+        if isinstance(other, (float, np.floating)) and not np.isnan(other):
             try:
                 is_safe = source_dtype.type(other) == other
             except OverflowError:
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 89f0f79cb7c..da735c22c52 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -1458,9 +1458,10 @@ def column_empty_like(
     return column_empty(row_count, dtype, masked)
 
 
-def _has_any_nan(arbitrary):
+def _has_any_nan(arbitrary: pd.Series | np.ndarray) -> bool:
+    """Check if an object dtype Series or array contains NaN."""
     return any(
-        ((isinstance(x, float) or isinstance(x, np.floating)) and np.isnan(x))
+        isinstance(x, (float, np.floating)) and np.isnan(x)
         for x in np.asarray(arbitrary)
     )
 
@@ -2312,9 +2313,8 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase:
     # Notice, we can always cast pure null columns
     not_null_col_dtypes = [o.dtype for o in objs if o.null_count != len(o)]
     if len(not_null_col_dtypes) and all(
-        _is_non_decimal_numeric_dtype(dtyp)
-        and np.issubdtype(dtyp, np.datetime64)
-        for dtyp in not_null_col_dtypes
+        _is_non_decimal_numeric_dtype(dtype) and dtype.kind == "M"
+        for dtype in not_null_col_dtypes
     ):
         common_dtype = find_common_type(not_null_col_dtypes)
         # Cast all columns to the common dtype
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index a4538179415..73902789c11 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -639,7 +639,7 @@ def isin(self, values: Sequence) -> ColumnBase:
         return cudf.core.tools.datetimes._isin_datetimelike(self, values)
 
     def can_cast_safely(self, to_dtype: Dtype) -> bool:
-        if np.issubdtype(to_dtype, np.datetime64):
+        if to_dtype.kind == "M":  # type: ignore[union-attr]
             to_res, _ = np.datetime_data(to_dtype)
             self_res, _ = np.datetime_data(self.dtype)
 
diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py
index 46b844413f7..1b7cd95b3d0 100644
--- a/python/cudf/cudf/core/column/lists.py
+++ b/python/cudf/cudf/core/column/lists.py
@@ -564,10 +564,11 @@ def take(self, lists_indices: ColumnLike) -> ParentType:
             raise ValueError(
                 "lists_indices and list column is of different " "size."
             )
-        if not _is_non_decimal_numeric_dtype(
-            lists_indices_col.children[1].dtype
-        ) or not np.issubdtype(
-            lists_indices_col.children[1].dtype, np.integer
+        if (
+            not _is_non_decimal_numeric_dtype(
+                lists_indices_col.children[1].dtype
+            )
+            or lists_indices_col.children[1].dtype.kind not in "iu"
         ):
             raise TypeError(
                 "lists_indices should be column of values of index types."
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index b55284f1aff..5e07bbab40c 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -225,25 +225,17 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
                 tmp = self if reflect else other
                 # Guard against division by zero for integers.
                 if (
-                    (tmp.dtype.type in int_float_dtype_mapping)
-                    and (tmp.dtype.type != np.bool_)
-                    and (
-                        (
-                            (
-                                np.isscalar(tmp)
-                                or (
-                                    isinstance(tmp, cudf.Scalar)
-                                    # host to device copy
-                                    and tmp.is_valid()
-                                )
-                            )
-                            and (0 == tmp)
-                        )
-                        or ((isinstance(tmp, NumericalColumn)) and (0 in tmp))
-                    )
+                    tmp.dtype.type in int_float_dtype_mapping
+                    and tmp.dtype.kind != "b"
                 ):
-                    out_dtype = cudf.dtype("float64")
-
+                    if isinstance(tmp, NumericalColumn) and 0 in tmp:
+                        out_dtype = cudf.dtype("float64")
+                    elif isinstance(tmp, cudf.Scalar):
+                        if tmp.is_valid() and tmp == 0:
+                            # tmp == 0 can return NA
+                            out_dtype = cudf.dtype("float64")
+                    elif is_scalar(tmp) and tmp == 0:
+                        out_dtype = cudf.dtype("float64")
         if op in {
             "__lt__",
             "__gt__",
diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py
index dd0a4f666a1..32c84763401 100644
--- a/python/cudf/cudf/core/join/_join_helpers.py
+++ b/python/cudf/cudf/core/join/_join_helpers.py
@@ -9,7 +9,7 @@
 import numpy as np
 
 import cudf
-from cudf.api.types import is_decimal_dtype, is_dtype_equal
+from cudf.api.types import is_decimal_dtype, is_dtype_equal, is_numeric_dtype
 from cudf.core.column import CategoricalColumn
 from cudf.core.dtypes import CategoricalDtype
 
@@ -88,38 +88,25 @@ def _match_join_keys(
         )
 
     if (
-        np.issubdtype(ltype, np.number)
-        and np.issubdtype(rtype, np.number)
-        and not (
-            np.issubdtype(ltype, np.timedelta64)
-            or np.issubdtype(rtype, np.timedelta64)
-        )
+        is_numeric_dtype(ltype)
+        and is_numeric_dtype(rtype)
+        and not (ltype.kind == "m" or rtype.kind == "m")
     ):
         common_type = (
             max(ltype, rtype)
             if ltype.kind == rtype.kind
             else np.result_type(ltype, rtype)
         )
-    elif (
-        np.issubdtype(ltype, np.datetime64)
-        and np.issubdtype(rtype, np.datetime64)
-    ) or (
-        np.issubdtype(ltype, np.timedelta64)
-        and np.issubdtype(rtype, np.timedelta64)
+    elif (ltype.kind == "M" and rtype.kind == "M") or (
+        ltype.kind == "m" and rtype.kind == "m"
     ):
         common_type = max(ltype, rtype)
-    elif (
-        np.issubdtype(ltype, np.datetime64)
-        or np.issubdtype(ltype, np.timedelta64)
-    ) and not rcol.fillna(0).can_cast_safely(ltype):
+    elif ltype.kind in "mM" and not rcol.fillna(0).can_cast_safely(ltype):
         raise TypeError(
             f"Cannot join between {ltype} and {rtype}, please type-cast both "
             "columns to the same type."
         )
-    elif (
-        np.issubdtype(rtype, np.datetime64)
-        or np.issubdtype(rtype, np.timedelta64)
-    ) and not lcol.fillna(0).can_cast_safely(rtype):
+    elif rtype.kind in "mM" and not lcol.fillna(0).can_cast_safely(rtype):
         raise TypeError(
             f"Cannot join between {rtype} and {ltype}, please type-cast both "
             "columns to the same type."
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index b1e63806934..eb077179562 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -213,7 +213,7 @@ def __setitem__(self, key, value):
                         and self._frame.dtype.categories.dtype.kind == "f"
                     )
                 )
-                and isinstance(value, (np.float32, np.float64))
+                and isinstance(value, np.floating)
                 and np.isnan(value)
             ):
                 raise MixedTypeError(
diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py
index e56c8d867cb..c2072d90e98 100644
--- a/python/cudf/cudf/testing/testing.py
+++ b/python/cudf/cudf/testing/testing.py
@@ -158,12 +158,12 @@ def assert_column_equal(
             return True
 
     if check_datetimelike_compat:
-        if np.issubdtype(left.dtype, np.datetime64):
+        if left.dtype.kind == "M":
             right = right.astype(left.dtype)
-        elif np.issubdtype(right.dtype, np.datetime64):
+        elif right.dtype.kind == "M":
             left = left.astype(right.dtype)
 
-        if np.issubdtype(left.dtype, np.datetime64):
+        if left.dtype.kind == "M":
             if not left.equals(right):
                 raise AssertionError(
                     f"[datetimelike_compat=True] {left.values} "
@@ -779,9 +779,7 @@ def assert_eq(left, right, **kwargs):
                 tm.assert_index_equal(left, right, **kwargs)
 
     elif isinstance(left, np.ndarray) and isinstance(right, np.ndarray):
-        if np.issubdtype(left.dtype, np.floating) and np.issubdtype(
-            right.dtype, np.floating
-        ):
+        if left.dtype.kind == "f" and right.dtype.kind == "f":
             assert np.allclose(left, right, equal_nan=True)
         else:
             assert np.array_equal(left, right)
diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py
index 69c268db149..c0de5274742 100644
--- a/python/cudf/cudf/utils/dtypes.py
+++ b/python/cudf/cudf/utils/dtypes.py
@@ -359,10 +359,10 @@ def min_column_type(x, expected_type):
     if x.null_count == len(x):
         return x.dtype
 
-    if np.issubdtype(x.dtype, np.floating):
+    if x.dtype.kind == "f":
         return get_min_float_dtype(x)
 
-    elif np.issubdtype(expected_type, np.integer):
+    elif cudf.dtype(expected_type).kind in "iu":
         max_bound_dtype = np.min_scalar_type(x.max())
         min_bound_dtype = np.min_scalar_type(x.min())
         result_type = np.promote_types(max_bound_dtype, min_bound_dtype)

From 4c46628eaf7ba16a2a181ceb3311f315cd4932dc Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 19 Jul 2024 12:51:07 -1000
Subject: [PATCH 551/842] Mark cudf._typing as a typing module in ruff (#16318)

Additionally breaks up the prior, single-line of `select` rules that are enabled.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Thomas Li (https://github.com/lithomas1)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16318
---
 pyproject.toml                    | 64 ++++++++++++++++++++++++++++++-
 python/cudf/cudf/core/resample.py |  6 ++-
 2 files changed, 68 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 2f59864894b..e15cb7b3cdd 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -26,7 +26,69 @@ quiet-level = 3
 line-length = 79
 
 [tool.ruff.lint]
-select = ["E", "F", "W", "D201", "D204", "D206", "D207", "D208", "D209", "D210", "D211", "D214", "D215", "D300", "D301", "D403", "D405", "D406", "D407", "D408", "D409", "D410", "D411", "D412", "D414", "D418", "TCH", "FA", "UP006", "UP007"]
+typing-modules = ["cudf._typing"]
+select = [
+    # pycodestyle Error
+    "E",
+    # Pyflakes
+    "F",
+    # pycodestyle Warning
+    "W",
+    # no-blank-line-before-function
+    "D201",
+    # one-blank-line-after-class
+    "D204",
+    # indent-with-spaces
+    "D206",
+    # under-indentation
+    "D207",
+    # over-indentation
+    "D208",
+    # new-line-after-last-paragraph
+    "D209",
+    # surrounding-whitespace
+    "D210",
+    # blank-line-before-class
+    "D211",
+    # section-not-over-indented
+    "D214",
+    # section-underline-not-over-indented
+    "D215",
+    # triple-single-quotes
+    "D300",
+    # escape-sequence-in-docstring
+    "D301",
+    # first-line-capitalized
+    "D403",
+    # capitalize-section-name
+    "D405",
+    # new-line-after-section-name
+    "D406",
+    # dashed-underline-after-section
+    "D407",
+    # section-underline-after-name
+    "D408",
+    # section-underline-matches-section-length
+    "D409",
+    # no-blank-line-after-section
+    "D410",
+    # no-blank-line-before-section
+    "D411",
+    # blank-lines-between-header-and-content
+    "D412",
+    # empty-docstring-section
+    "D414",
+    # overload-with-docstring
+    "D418",
+    # flake8-type-checking
+    "TCH",
+    # flake8-future-annotations
+    "FA",
+    # non-pep585-annotation
+    "UP006",
+    # non-pep604-annotation
+    "UP007"
+]
 ignore = [
     # whitespace before :
     "E203",
diff --git a/python/cudf/cudf/core/resample.py b/python/cudf/cudf/core/resample.py
index cdd4ec6f8e5..4e0c5bd86b9 100644
--- a/python/cudf/cudf/core/resample.py
+++ b/python/cudf/cudf/core/resample.py
@@ -13,9 +13,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import annotations
 
 import pickle
 import warnings
+from typing import TYPE_CHECKING
 
 import numpy as np
 import pandas as pd
@@ -23,7 +25,6 @@
 import cudf
 import cudf._lib.labeling
 import cudf.core.index
-from cudf._typing import DataFrameOrSeries
 from cudf.core.groupby.groupby import (
     DataFrameGroupBy,
     GroupBy,
@@ -31,6 +32,9 @@
     _Grouping,
 )
 
+if TYPE_CHECKING:
+    from cudf._typing import DataFrameOrSeries
+
 
 class _Resampler(GroupBy):
     grouping: "_ResampleGrouping"

From 5dde41d7f7533180ecd355bac248a7ed18adcc10 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 19 Jul 2024 13:08:36 -1000
Subject: [PATCH 552/842] Replace is_float/integer_dtype checks with .kind
 checks (#16261)

It appears this was called when we already had a dtype object so can instead just simply check the .kind attribute

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16261
---
 python/cudf/cudf/api/types.py                |  2 +-
 python/cudf/cudf/core/_base_index.py         | 19 +++----------
 python/cudf/cudf/core/column/column.py       | 29 ++++++++++----------
 python/cudf/cudf/core/column/decimal.py      |  4 +--
 python/cudf/cudf/core/column/numerical.py    | 13 +++------
 python/cudf/cudf/core/index.py               | 13 +++++----
 python/cudf/cudf/core/indexing_utils.py      |  8 ++----
 python/cudf/cudf/core/series.py              |  7 ++---
 python/cudf/cudf/core/single_column_frame.py |  3 +-
 python/cudf/cudf/tests/test_dataframe.py     |  2 +-
 python/cudf/cudf/utils/dtypes.py             | 28 +++++++++----------
 11 files changed, 52 insertions(+), 76 deletions(-)

diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py
index d97e9c815b6..294ae2fd985 100644
--- a/python/cudf/cudf/api/types.py
+++ b/python/cudf/cudf/api/types.py
@@ -90,7 +90,7 @@ def is_integer(obj):
     bool
     """
     if isinstance(obj, cudf.Scalar):
-        return pd.api.types.is_integer_dtype(obj.dtype)
+        return obj.dtype.kind in "iu"
     return pd.api.types.is_integer(obj)
 
 
diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py
index 657acc41b18..c38352009de 100644
--- a/python/cudf/cudf/core/_base_index.py
+++ b/python/cudf/cudf/core/_base_index.py
@@ -19,14 +19,7 @@
 )
 from cudf._lib.types import size_type_dtype
 from cudf.api.extensions import no_default
-from cudf.api.types import (
-    is_integer,
-    is_integer_dtype,
-    is_list_like,
-    is_scalar,
-    is_signed_integer_dtype,
-    is_unsigned_integer_dtype,
-)
+from cudf.api.types import is_integer, is_list_like, is_scalar
 from cudf.core.abc import Serializable
 from cudf.core.column import ColumnBase, column
 from cudf.errors import MixedTypeError
@@ -621,12 +614,8 @@ def union(self, other, sort=None):
                 # Bools + other types will result in mixed type.
                 # This is not yet consistent in pandas and specific to APIs.
                 raise MixedTypeError("Cannot perform union with mixed types")
-            if (
-                is_signed_integer_dtype(self.dtype)
-                and is_unsigned_integer_dtype(other.dtype)
-            ) or (
-                is_unsigned_integer_dtype(self.dtype)
-                and is_signed_integer_dtype(other.dtype)
+            if (self.dtype.kind == "i" and other.dtype.kind == "u") or (
+                self.dtype.kind == "u" and other.dtype.kind == "i"
             ):
                 # signed + unsigned types will result in
                 # mixed type for union in pandas.
@@ -2103,7 +2092,7 @@ def _gather(self, gather_map, nullify=False, check_bounds=True):
 
         # TODO: For performance, the check and conversion of gather map should
         # be done by the caller. This check will be removed in future release.
-        if not is_integer_dtype(gather_map.dtype):
+        if gather_map.dtype.kind not in "iu":
             gather_map = gather_map.astype(size_type_dtype)
 
         if not _gather_map_is_valid(
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index da735c22c52..32e6aade65b 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -2219,25 +2219,26 @@ def as_column(
                 and arbitrary.null_count > 0
             ):
                 arbitrary = arbitrary.cast(pa.float64())
-            if cudf.get_option(
-                "default_integer_bitwidth"
-            ) and pa.types.is_integer(arbitrary.type):
-                dtype = _maybe_convert_to_default_type("int")
-            elif cudf.get_option(
-                "default_float_bitwidth"
-            ) and pa.types.is_floating(arbitrary.type):
-                dtype = _maybe_convert_to_default_type("float")
+            if (
+                cudf.get_option("default_integer_bitwidth")
+                and pa.types.is_integer(arbitrary.type)
+            ) or (
+                cudf.get_option("default_float_bitwidth")
+                and pa.types.is_floating(arbitrary.type)
+            ):
+                dtype = _maybe_convert_to_default_type(
+                    cudf.dtype(arbitrary.type.to_pandas_dtype())
+                )
         except (pa.ArrowInvalid, pa.ArrowTypeError, TypeError):
             arbitrary = pd.Series(arbitrary)
-            if cudf.get_option(
-                "default_integer_bitwidth"
-            ) and arbitrary.dtype.kind in set("iu"):
-                dtype = _maybe_convert_to_default_type("int")
-            elif (
+            if (
+                cudf.get_option("default_integer_bitwidth")
+                and arbitrary.dtype.kind in set("iu")
+            ) or (
                 cudf.get_option("default_float_bitwidth")
                 and arbitrary.dtype.kind == "f"
             ):
-                dtype = _maybe_convert_to_default_type("float")
+                dtype = _maybe_convert_to_default_type(arbitrary.dtype)
         return as_column(arbitrary, nan_as_null=nan_as_null, dtype=dtype)
 
 
diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py
index a63055ed527..6a7f338b065 100644
--- a/python/cudf/cudf/core/column/decimal.py
+++ b/python/cudf/cudf/core/column/decimal.py
@@ -15,7 +15,7 @@
 from cudf._lib.strings.convert.convert_fixed_point import (
     from_decimal as cpp_from_decimal,
 )
-from cudf.api.types import is_integer_dtype, is_scalar
+from cudf.api.types import is_scalar
 from cudf.core.buffer import as_buffer
 from cudf.core.column import ColumnBase
 from cudf.core.dtypes import (
@@ -150,7 +150,7 @@ def _validate_fillna_value(
     def normalize_binop_value(self, other):
         if isinstance(other, ColumnBase):
             if isinstance(other, cudf.core.column.NumericalColumn):
-                if not is_integer_dtype(other.dtype):
+                if other.dtype.kind not in "iu":
                     raise TypeError(
                         "Decimal columns only support binary operations with "
                         "integer numerical columns."
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index 5e07bbab40c..f9404eb3b40 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -12,12 +12,7 @@
 import cudf
 from cudf import _lib as libcudf
 from cudf._lib import pylibcudf
-from cudf.api.types import (
-    is_float_dtype,
-    is_integer,
-    is_integer_dtype,
-    is_scalar,
-)
+from cudf.api.types import is_integer, is_scalar
 from cudf.core.column import (
     ColumnBase,
     as_column,
@@ -249,7 +244,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
             out_dtype = "bool"
 
         if op in {"__and__", "__or__", "__xor__"}:
-            if is_float_dtype(self.dtype) or is_float_dtype(other.dtype):
+            if self.dtype.kind == "f" or other.dtype.kind == "f":
                 raise TypeError(
                     f"Operation 'bitwise {op[2:-2]}' not supported between "
                     f"{self.dtype.type.__name__} and "
@@ -260,8 +255,8 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
 
         if (
             op == "__pow__"
-            and is_integer_dtype(self.dtype)
-            and (is_integer(other) or is_integer_dtype(other.dtype))
+            and self.dtype.kind in "iu"
+            and (is_integer(other) or other.dtype.kind in "iu")
         ):
             op = "INT_POW"
 
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index cd52a34e35e..ae20fcd5d9c 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -1456,18 +1456,19 @@ def notna(self):
     notnull = notna
 
     def _is_numeric(self):
-        return isinstance(
-            self._values, cudf.core.column.NumericalColumn
-        ) and self.dtype != cudf.dtype("bool")
+        return (
+            isinstance(self._values, cudf.core.column.NumericalColumn)
+            and self.dtype.kind != "b"
+        )
 
     def _is_boolean(self):
-        return self.dtype == cudf.dtype("bool")
+        return self.dtype.kind == "b"
 
     def _is_integer(self):
-        return cudf.api.types.is_integer_dtype(self.dtype)
+        return self.dtype.kind in "iu"
 
     def _is_floating(self):
-        return cudf.api.types.is_float_dtype(self.dtype)
+        return self.dtype.kind == "f"
 
     def _is_object(self):
         return isinstance(self._values, cudf.core.column.StringColumn)
diff --git a/python/cudf/cudf/core/indexing_utils.py b/python/cudf/cudf/core/indexing_utils.py
index 9c81b0eb607..a0089242909 100644
--- a/python/cudf/cudf/core/indexing_utils.py
+++ b/python/cudf/cudf/core/indexing_utils.py
@@ -8,11 +8,7 @@
 from typing_extensions import TypeAlias
 
 import cudf
-from cudf.api.types import (
-    _is_scalar_or_zero_d_array,
-    is_integer,
-    is_integer_dtype,
-)
+from cudf.api.types import _is_scalar_or_zero_d_array, is_integer
 from cudf.core.copy_types import BooleanMask, GatherMap
 
 
@@ -233,7 +229,7 @@ def parse_row_iloc_indexer(key: Any, n: int) -> IndexingSpec:
             return MaskIndexer(BooleanMask(key, n))
         elif len(key) == 0:
             return EmptyIndexer()
-        elif is_integer_dtype(key.dtype):
+        elif key.dtype.kind in "iu":
             return MapIndexer(GatherMap(key, n, nullify=False))
         else:
             raise TypeError(
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index eb077179562..d8dbaa897e7 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -24,7 +24,6 @@
     _is_scalar_or_zero_d_array,
     is_dict_like,
     is_integer,
-    is_integer_dtype,
     is_scalar,
 )
 from cudf.core import indexing_utils
@@ -356,12 +355,10 @@ def _loc_to_iloc(self, arg):
             )
             if not _is_non_decimal_numeric_dtype(index_dtype) and not (
                 isinstance(index_dtype, cudf.CategoricalDtype)
-                and is_integer_dtype(index_dtype.categories.dtype)
+                and index_dtype.categories.dtype.kind in "iu"
             ):
                 # TODO: switch to cudf.utils.dtypes.is_integer(arg)
-                if isinstance(arg, cudf.Scalar) and is_integer_dtype(
-                    arg.dtype
-                ):
+                if isinstance(arg, cudf.Scalar) and arg.dtype.kind in "iu":
                     # Do not remove until pandas 3.0 support is added.
                     assert (
                         PANDAS_LT_300
diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py
index 7efe13d9b45..b93528f9693 100644
--- a/python/cudf/cudf/core/single_column_frame.py
+++ b/python/cudf/cudf/core/single_column_frame.py
@@ -12,7 +12,6 @@
 from cudf.api.types import (
     _is_scalar_or_zero_d_array,
     is_integer,
-    is_integer_dtype,
     is_numeric_dtype,
 )
 from cudf.core.column import ColumnBase, as_column
@@ -352,7 +351,7 @@ def _get_elements_from_column(self, arg) -> ScalarLike | ColumnBase:
             arg = as_column(arg)
             if len(arg) == 0:
                 arg = cudf.core.column.column_empty(0, dtype="int32")
-            if is_integer_dtype(arg.dtype):
+            if arg.dtype.kind in "iu":
                 return self._column.take(arg)
             if arg.dtype.kind == "b":
                 if (bn := len(arg)) != (n := len(self)):
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 53ed5d728cb..e2ce5c03b70 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -10833,7 +10833,7 @@ def test_dataframe_contains(name, contains, other_names):
         expectation = contains is cudf.NA and name is cudf.NA
         assert (contains in pdf) == expectation
         assert (contains in gdf) == expectation
-    elif pd.api.types.is_float_dtype(gdf.columns.dtype):
+    elif gdf.columns.dtype.kind == "f":
         # In some cases, the columns are converted to an Index[float] based on
         # the other column names. That casts name values from None to np.nan.
         expectation = contains is np.nan and (name is None or name is np.nan)
diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py
index c0de5274742..b0788bcc0fc 100644
--- a/python/cudf/cudf/utils/dtypes.py
+++ b/python/cudf/cudf/utils/dtypes.py
@@ -1,7 +1,9 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
+from __future__ import annotations
 
 import datetime
 from decimal import Decimal
+from typing import TYPE_CHECKING
 
 import cupy as cp
 import numpy as np
@@ -11,6 +13,9 @@
 
 import cudf
 
+if TYPE_CHECKING:
+    from cudf._typing import DtypeObj
+
 """Map numpy dtype to pyarrow types.
 Note that np.bool_ bitwidth (8) is different from pa.bool_ (1). Special
 handling is required when converting a Boolean column into arrow.
@@ -568,25 +573,18 @@ def _dtype_pandas_compatible(dtype):
     return dtype
 
 
-def _maybe_convert_to_default_type(dtype):
+def _maybe_convert_to_default_type(dtype: DtypeObj) -> DtypeObj:
     """Convert `dtype` to default if specified by user.
 
     If not specified, return as is.
     """
-    if cudf.get_option("default_integer_bitwidth"):
-        if cudf.api.types.is_signed_integer_dtype(dtype):
-            return cudf.dtype(
-                f'i{cudf.get_option("default_integer_bitwidth")//8}'
-            )
-        elif cudf.api.types.is_unsigned_integer_dtype(dtype):
-            return cudf.dtype(
-                f'u{cudf.get_option("default_integer_bitwidth")//8}'
-            )
-    if cudf.get_option(
-        "default_float_bitwidth"
-    ) and cudf.api.types.is_float_dtype(dtype):
-        return cudf.dtype(f'f{cudf.get_option("default_float_bitwidth")//8}')
-
+    if ib := cudf.get_option("default_integer_bitwidth"):
+        if dtype.kind == "i":
+            return cudf.dtype(f"i{ib//8}")
+        elif dtype.kind == "u":
+            return cudf.dtype(f"u{ib//8}")
+    if (fb := cudf.get_option("default_float_bitwidth")) and dtype.kind == "f":
+        return cudf.dtype(f"f{fb//8}")
     return dtype
 
 
From e169e8e4273e4d317e3f27c810c5b137dd75adb3 Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Fri, 19 Jul 2024 16:36:03 -0700
Subject: [PATCH 553/842] Implement read_csv in cudf-polars using pylibcudf
 (#16307)

Replace cudf-classic with pylibcudf for CSV reading in cudf-polars

Authors:
  - Thomas Li (https://github.com/lithomas1)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/16307
---
 python/cudf_polars/cudf_polars/dsl/ir.py | 50 ++++++++++++------------
 python/cudf_polars/tests/test_scan.py    | 38 ++++++++++++++++++
 2 files changed, 64 insertions(+), 24 deletions(-)

diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py
index 0b14530e0ed..a84fe73810e 100644
--- a/python/cudf_polars/cudf_polars/dsl/ir.py
+++ b/python/cudf_polars/cudf_polars/dsl/ir.py
@@ -242,10 +242,6 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         with_columns = options.with_columns
         row_index = options.row_index
         if self.typ == "csv":
-            dtype_map = {
-                name: cudf._lib.types.PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[typ.id()]
-                for name, typ in self.schema.items()
-            }
             parse_options = self.reader_options["parse_options"]
             sep = chr(parse_options["separator"])
             quote = chr(parse_options["quote_char"])
@@ -280,31 +276,37 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
             pieces = []
             for p in self.paths:
                 skiprows = self.reader_options["skip_rows"]
-                # TODO: read_csv expands globs which we should not do,
-                # because polars will already have handled them.
                 path = Path(p)
                 with path.open() as f:
                     while f.readline() == "\n":
                         skiprows += 1
-                pieces.append(
-                    cudf.read_csv(
-                        path,
-                        sep=sep,
-                        quotechar=quote,
-                        lineterminator=eol,
-                        names=column_names,
-                        header=header,
-                        usecols=usecols,
-                        na_filter=True,
-                        na_values=null_values,
-                        keep_default_na=False,
-                        skiprows=skiprows,
-                        comment=comment,
-                        decimal=decimal,
-                        dtype=dtype_map,
-                    )
+                tbl_w_meta = plc.io.csv.read_csv(
+                    plc.io.SourceInfo([path]),
+                    delimiter=sep,
+                    quotechar=quote,
+                    lineterminator=eol,
+                    col_names=column_names,
+                    header=header,
+                    usecols=usecols,
+                    na_filter=True,
+                    na_values=null_values,
+                    keep_default_na=False,
+                    skiprows=skiprows,
+                    comment=comment,
+                    decimal=decimal,
+                    dtypes=self.schema,
+                )
+                pieces.append(tbl_w_meta)
+            tables, colnames = zip(
+                *(
+                    (piece.tbl, piece.column_names(include_children=False))
+                    for piece in pieces
                 )
-            df = DataFrame.from_cudf(cudf.concat(pieces))
+            )
+            df = DataFrame.from_table(
+                plc.concatenate.concatenate(list(tables)),
+                colnames[0],
+            )
         elif self.typ == "parquet":
             cdf = cudf.read_parquet(self.paths, columns=with_columns)
             assert isinstance(cdf, cudf.DataFrame)
diff --git a/python/cudf_polars/tests/test_scan.py b/python/cudf_polars/tests/test_scan.py
index d0c41090433..0981a96a34a 100644
--- a/python/cudf_polars/tests/test_scan.py
+++ b/python/cudf_polars/tests/test_scan.py
@@ -2,6 +2,8 @@
 # SPDX-License-Identifier: Apache-2.0
 from __future__ import annotations
 
+import os
+
 import pytest
 
 import polars as pl
@@ -129,6 +131,42 @@ def test_scan_csv_column_renames_projection_schema(tmp_path):
     assert_gpu_result_equal(q)
 
 
+@pytest.mark.parametrize(
+    "filename,glob",
+    [
+        (["test1.csv", "test2.csv"], True),
+        ("test*.csv", True),
+        # Make sure we don't expand glob when
+        # trying to read a file like test*.csv
+        # when glob=False
+        ("test*.csv", False),
+    ],
+)
+def test_scan_csv_multi(tmp_path, filename, glob):
+    with (tmp_path / "test1.csv").open("w") as f:
+        f.write("""foo,bar,baz\n1,2\n3,4,5""")
+    with (tmp_path / "test2.csv").open("w") as f:
+        f.write("""foo,bar,baz\n1,2\n3,4,5""")
+    with (tmp_path / "test*.csv").open("w") as f:
+        f.write("""foo,bar,baz\n1,2\n3,4,5""")
+    os.chdir(tmp_path)
+    q = pl.scan_csv(filename, glob=glob)
+
+    assert_gpu_result_equal(q)
+
+
+def test_scan_csv_multi_differing_colnames(tmp_path):
+    with (tmp_path / "test1.csv").open("w") as f:
+        f.write("""foo,bar,baz\n1,2\n3,4,5""")
+    with (tmp_path / "test2.csv").open("w") as f:
+        f.write("""abc,def,ghi\n1,2\n3,4,5""")
+    q = pl.scan_csv(
+        [tmp_path / "test1.csv", tmp_path / "test2.csv"],
+    )
+    with pytest.raises(pl.exceptions.ComputeError):
+        q.explain()
+
+
 def test_scan_csv_skip_after_header_not_implemented(tmp_path):
     with (tmp_path / "test.csv").open("w") as f:
         f.write("""foo,bar,baz\n1,2,3\n3,4,5""")

From 535db9b26ed1a57e4275f4a6f11b04ebeee21248 Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Fri, 19 Jul 2024 17:28:14 -0700
Subject: [PATCH 554/842] Deprecate Arrow support in I/O (#16132)

Contributes to https://github.com/rapidsai/cudf/issues/15193

Authors:
  - Thomas Li (https://github.com/lithomas1)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Richard (Rick) Zamora (https://github.com/rjzamora)
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/16132
---
 .../cudf/_lib/pylibcudf/io/datasource.pyx     |  10 +-
 python/cudf/cudf/io/csv.py                    |   2 +-
 python/cudf/cudf/io/orc.py                    |  33 +++--
 python/cudf/cudf/io/parquet.py                |  40 ++++--
 .../io/test_source_sink_info.py               |  21 +--
 python/cudf/cudf/tests/test_csv.py            |   5 +-
 python/cudf/cudf/tests/test_gcs.py            |   3 +-
 python/cudf/cudf/tests/test_parquet.py        |  19 +--
 python/cudf/cudf/tests/test_s3.py             | 136 ++++++++++--------
 python/cudf/cudf/utils/ioutils.py             |  78 ++++++++--
 python/cudf/cudf/utils/utils.py               |  26 ++++
 .../dask_cudf/dask_cudf/io/tests/test_s3.py   |   6 +-
 12 files changed, 247 insertions(+), 132 deletions(-)

diff --git a/python/cudf/cudf/_lib/pylibcudf/io/datasource.pyx b/python/cudf/cudf/_lib/pylibcudf/io/datasource.pyx
index aa7fa0efdaf..8f265f585de 100644
--- a/python/cudf/cudf/_lib/pylibcudf/io/datasource.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/io/datasource.pyx
@@ -7,6 +7,8 @@ from pyarrow.lib cimport NativeFile
 from cudf._lib.pylibcudf.libcudf.io.arrow_io_source cimport arrow_io_source
 from cudf._lib.pylibcudf.libcudf.io.datasource cimport datasource
 
+import warnings
+
 
 cdef class Datasource:
     cdef datasource* get_datasource(self) except * nogil:
@@ -16,10 +18,16 @@ cdef class Datasource:
 
 cdef class NativeFileDatasource(Datasource):
 
-    def __cinit__(self, NativeFile native_file,):
+    def __cinit__(self, NativeFile native_file):
 
         cdef shared_ptr[CRandomAccessFile] ra_src
 
+        warnings.warn(
+            "Support for reading pyarrow's NativeFile is deprecated "
+            "and will be removed in a future release of cudf.",
+            FutureWarning,
+        )
+
         ra_src = native_file.get_random_access_file()
         self.c_datasource.reset(new arrow_io_source(ra_src))
 
diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py
index e909d96309e..0f2820a01e9 100644
--- a/python/cudf/cudf/io/csv.py
+++ b/python/cudf/cudf/io/csv.py
@@ -50,7 +50,7 @@ def read_csv(
     comment=None,
     delim_whitespace=False,
     byte_range=None,
-    use_python_file_object=True,
+    use_python_file_object=None,
     storage_options=None,
     bytes_per_thread=None,
 ):
diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py
index 7082a85237a..289292b5182 100644
--- a/python/cudf/cudf/io/orc.py
+++ b/python/cudf/cudf/io/orc.py
@@ -10,6 +10,7 @@
 from cudf._lib import orc as liborc
 from cudf.api.types import is_list_like
 from cudf.utils import ioutils
+from cudf.utils.utils import maybe_filter_deprecation
 
 
 def _make_empty_df(filepath_or_buffer, columns):
@@ -280,7 +281,7 @@ def read_orc(
     num_rows=None,
     use_index=True,
     timestamp_type=None,
-    use_python_file_object=True,
+    use_python_file_object=None,
     storage_options=None,
     bytes_per_thread=None,
 ):
@@ -320,6 +321,9 @@ def read_orc(
             )
 
     filepaths_or_buffers = []
+    have_nativefile = any(
+        isinstance(source, pa.NativeFile) for source in filepath_or_buffer
+    )
     for source in filepath_or_buffer:
         if ioutils.is_directory(
             path_or_data=source, storage_options=storage_options
@@ -360,17 +364,24 @@ def read_orc(
             stripes = selected_stripes
 
     if engine == "cudf":
-        return DataFrame._from_data(
-            *liborc.read_orc(
-                filepaths_or_buffers,
-                columns,
-                stripes,
-                skiprows,
-                num_rows,
-                use_index,
-                timestamp_type,
+        # Don't want to warn if use_python_file_object causes us to get
+        # a NativeFile (there is a separate deprecation warning for that)
+        with maybe_filter_deprecation(
+            not have_nativefile,
+            message="Support for reading pyarrow's NativeFile is deprecated",
+            category=FutureWarning,
+        ):
+            return DataFrame._from_data(
+                *liborc.read_orc(
+                    filepaths_or_buffers,
+                    columns,
+                    stripes,
+                    skiprows,
+                    num_rows,
+                    use_index,
+                    timestamp_type,
+                )
             )
-        )
     else:
         from pyarrow import orc
 
diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
index 02b26ea1c01..0f0a240b5d0 100644
--- a/python/cudf/cudf/io/parquet.py
+++ b/python/cudf/cudf/io/parquet.py
@@ -15,6 +15,7 @@
 
 import numpy as np
 import pandas as pd
+import pyarrow as pa
 from pyarrow import dataset as ds
 
 import cudf
@@ -23,6 +24,7 @@
 from cudf.core.column import as_column, build_categorical_column, column_empty
 from cudf.utils import ioutils
 from cudf.utils.performance_tracking import _performance_tracking
+from cudf.utils.utils import maybe_filter_deprecation
 
 BYTE_SIZES = {
     "kb": 1000,
@@ -350,7 +352,7 @@ def read_parquet_metadata(filepath_or_buffer):
             path_or_data=source,
             compression=None,
             fs=fs,
-            use_python_file_object=True,
+            use_python_file_object=None,
             open_file_options=None,
             storage_options=None,
             bytes_per_thread=None,
@@ -532,7 +534,7 @@ def read_parquet(
     filters=None,
     row_groups=None,
     use_pandas_metadata=True,
-    use_python_file_object=True,
+    use_python_file_object=None,
     categorical_partitions=True,
     open_file_options=None,
     bytes_per_thread=None,
@@ -615,6 +617,9 @@ def read_parquet(
             row_groups=row_groups,
             fs=fs,
         )
+    have_nativefile = any(
+        isinstance(source, pa.NativeFile) for source in filepath_or_buffer
+    )
     for source in filepath_or_buffer:
         tmp_source, compression = ioutils.get_reader_filepath_or_buffer(
             path_or_data=source,
@@ -662,19 +667,26 @@ def read_parquet(
         )
 
     # Convert parquet data to a cudf.DataFrame
-    df = _parquet_to_frame(
-        filepaths_or_buffers,
-        engine,
-        *args,
-        columns=columns,
-        row_groups=row_groups,
-        use_pandas_metadata=use_pandas_metadata,
-        partition_keys=partition_keys,
-        partition_categories=partition_categories,
-        dataset_kwargs=dataset_kwargs,
-        **kwargs,
-    )
 
+    # Don't want to warn if use_python_file_object causes us to get
+    # a NativeFile (there is a separate deprecation warning for that)
+    with maybe_filter_deprecation(
+        not have_nativefile,
+        message="Support for reading pyarrow's NativeFile is deprecated",
+        category=FutureWarning,
+    ):
+        df = _parquet_to_frame(
+            filepaths_or_buffers,
+            engine,
+            *args,
+            columns=columns,
+            row_groups=row_groups,
+            use_pandas_metadata=use_pandas_metadata,
+            partition_keys=partition_keys,
+            partition_categories=partition_categories,
+            dataset_kwargs=dataset_kwargs,
+            **kwargs,
+        )
     # Apply filters row-wise (if any are defined), and return
     df = _apply_post_filters(df, filters)
     if projected_columns:
diff --git a/python/cudf/cudf/pylibcudf_tests/io/test_source_sink_info.py b/python/cudf/cudf/pylibcudf_tests/io/test_source_sink_info.py
index 287dd8f21c8..438c482b77a 100644
--- a/python/cudf/cudf/pylibcudf_tests/io/test_source_sink_info.py
+++ b/python/cudf/cudf/pylibcudf_tests/io/test_source_sink_info.py
@@ -2,11 +2,9 @@
 
 import io
 
-import pyarrow as pa
 import pytest
 
 import cudf._lib.pylibcudf as plc
-from cudf._lib.pylibcudf.io.datasource import NativeFileDatasource
 
 
 @pytest.fixture(params=[plc.io.SourceInfo, plc.io.SinkInfo])
@@ -18,10 +16,8 @@ def _skip_invalid_sinks(io_class, sink):
     """
     Skip invalid sinks for SinkInfo
     """
-    if io_class is plc.io.SinkInfo and isinstance(
-        sink, (bytes, NativeFileDatasource)
-    ):
-        pytest.skip(f"{sink} is not a valid input for SinkInfo")
+    if io_class is plc.io.SinkInfo and isinstance(sink, bytes):
+        pytest.skip("bytes is not a valid input for SinkInfo")
 
 
 @pytest.mark.parametrize(
@@ -30,7 +26,6 @@ def _skip_invalid_sinks(io_class, sink):
         "a.txt",
         b"hello world",
         io.BytesIO(b"hello world"),
-        NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r")),
     ],
 )
 def test_source_info_ctor(io_class, source, tmp_path):
@@ -47,13 +42,12 @@ def test_source_info_ctor(io_class, source, tmp_path):
 @pytest.mark.parametrize(
     "sources",
     [
+        ["a.txt"],
+        [b"hello world"],
+        [io.BytesIO(b"hello world")],
         ["a.txt", "a.txt"],
         [b"hello world", b"hello there"],
         [io.BytesIO(b"hello world"), io.BytesIO(b"hello there")],
-        [
-            NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r")),
-            NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r")),
-        ],
     ],
 )
 def test_source_info_ctor_multiple(io_class, sources, tmp_path):
@@ -79,11 +73,6 @@ def test_source_info_ctor_multiple(io_class, sources, tmp_path):
             io.BytesIO(b"hello there"),
             b"hello world",
         ],
-        [
-            NativeFileDatasource(pa.PythonFile(io.BytesIO(), mode="r")),
-            "awef.txt",
-            b"hello world",
-        ],
     ],
 )
 def test_source_info_ctor_mixing_invalid(io_class, sources, tmp_path):
diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
index 0525b02b698..6a21cb1b9d7 100644
--- a/python/cudf/cudf/tests/test_csv.py
+++ b/python/cudf/cudf/tests/test_csv.py
@@ -1085,8 +1085,9 @@ def test_csv_reader_arrow_nativefile(path_or_buf):
     # Arrow FileSystem interface
     expect = cudf.read_csv(path_or_buf("filepath"))
     fs, path = pa_fs.FileSystem.from_uri(path_or_buf("filepath"))
-    with fs.open_input_file(path) as fil:
-        got = cudf.read_csv(fil)
+    with pytest.warns(FutureWarning):
+        with fs.open_input_file(path) as fil:
+            got = cudf.read_csv(fil)
 
     assert_eq(expect, got)
 
diff --git a/python/cudf/cudf/tests/test_gcs.py b/python/cudf/cudf/tests/test_gcs.py
index fc22d8bc0ea..28fdfb5c2f1 100644
--- a/python/cudf/cudf/tests/test_gcs.py
+++ b/python/cudf/cudf/tests/test_gcs.py
@@ -46,7 +46,8 @@ def mock_size(*args):
     # use_python_file_object=True, because the pyarrow
     # `open_input_file` command will fail (since it doesn't
     # use the monkey-patched `open` definition)
-    got = cudf.read_csv(f"gcs://{fpath}", use_python_file_object=False)
+    with pytest.warns(FutureWarning):
+        got = cudf.read_csv(f"gcs://{fpath}", use_python_file_object=False)
     assert_eq(pdf, got)
 
     # AbstractBufferedFile -> PythonFile conversion
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index ecb7fd44422..f2820d9c112 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -711,7 +711,8 @@ def test_parquet_reader_arrow_nativefile(parquet_path_or_buf):
     expect = cudf.read_parquet(parquet_path_or_buf("filepath"))
     fs, path = pa_fs.FileSystem.from_uri(parquet_path_or_buf("filepath"))
     with fs.open_input_file(path) as fil:
-        got = cudf.read_parquet(fil)
+        with pytest.warns(FutureWarning):
+            got = cudf.read_parquet(fil)
 
     assert_eq(expect, got)
 
@@ -726,16 +727,18 @@ def test_parquet_reader_use_python_file_object(
     fs, _, paths = get_fs_token_paths(parquet_path_or_buf("filepath"))
 
     # Pass open fsspec file
-    with fs.open(paths[0], mode="rb") as fil:
-        got1 = cudf.read_parquet(
-            fil, use_python_file_object=use_python_file_object
-        )
+    with pytest.warns(FutureWarning):
+        with fs.open(paths[0], mode="rb") as fil:
+            got1 = cudf.read_parquet(
+                fil, use_python_file_object=use_python_file_object
+            )
     assert_eq(expect, got1)
 
     # Pass path only
-    got2 = cudf.read_parquet(
-        paths[0], use_python_file_object=use_python_file_object
-    )
+    with pytest.warns(FutureWarning):
+        got2 = cudf.read_parquet(
+            paths[0], use_python_file_object=use_python_file_object
+        )
     assert_eq(expect, got2)
 
 
diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py
index a44bf791767..3ae318d3bf5 100644
--- a/python/cudf/cudf/tests/test_s3.py
+++ b/python/cudf/cudf/tests/test_s3.py
@@ -138,22 +138,24 @@ def test_read_csv(s3_base, s3so, pdf, bytes_per_thread):
     buffer = pdf.to_csv(index=False)
 
     # Use fsspec file object
-    with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}):
-        got = cudf.read_csv(
-            f"s3://{bucket}/{fname}",
-            storage_options=s3so,
-            bytes_per_thread=bytes_per_thread,
-            use_python_file_object=False,
-        )
+    with pytest.warns(FutureWarning):
+        with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}):
+            got = cudf.read_csv(
+                f"s3://{bucket}/{fname}",
+                storage_options=s3so,
+                bytes_per_thread=bytes_per_thread,
+                use_python_file_object=False,
+            )
     assert_eq(pdf, got)
 
     # Use Arrow PythonFile object
-    with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}):
-        got = cudf.read_csv(
-            f"s3://{bucket}/{fname}",
-            storage_options=s3so,
-            use_python_file_object=True,
-        )
+    with pytest.warns(FutureWarning):
+        with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}):
+            got = cudf.read_csv(
+                f"s3://{bucket}/{fname}",
+                storage_options=s3so,
+                use_python_file_object=True,
+            )
     assert_eq(pdf, got)
 
 
@@ -166,8 +168,9 @@ def test_read_csv_arrow_nativefile(s3_base, s3so, pdf):
         fs = pa_fs.S3FileSystem(
             endpoint_override=s3so["client_kwargs"]["endpoint_url"],
         )
-        with fs.open_input_file(f"{bucket}/{fname}") as fil:
-            got = cudf.read_csv(fil)
+        with pytest.warns(FutureWarning):
+            with fs.open_input_file(f"{bucket}/{fname}") as fil:
+                got = cudf.read_csv(fil)
 
     assert_eq(pdf, got)
 
@@ -184,17 +187,18 @@ def test_read_csv_byte_range(
 
     # Use fsspec file object
     with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}):
-        got = cudf.read_csv(
-            f"s3://{bucket}/{fname}",
-            storage_options=s3so,
-            byte_range=(74, 73),
-            bytes_per_thread=bytes_per_thread
-            if not use_python_file_object
-            else None,
-            header=None,
-            names=["Integer", "Float", "Integer2", "String", "Boolean"],
-            use_python_file_object=use_python_file_object,
-        )
+        with pytest.warns(FutureWarning):
+            got = cudf.read_csv(
+                f"s3://{bucket}/{fname}",
+                storage_options=s3so,
+                byte_range=(74, 73),
+                bytes_per_thread=bytes_per_thread
+                if not use_python_file_object
+                else None,
+                header=None,
+                names=["Integer", "Float", "Integer2", "String", "Boolean"],
+                use_python_file_object=use_python_file_object,
+            )
 
     assert_eq(pdf.iloc[-2:].reset_index(drop=True), got)
 
@@ -241,18 +245,19 @@ def test_read_parquet(
     # Check direct path handling
     buffer.seek(0)
     with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}):
-        got1 = cudf.read_parquet(
-            f"s3://{bucket}/{fname}",
-            open_file_options=(
-                {"precache_options": {"method": precache}}
-                if use_python_file_object
-                else None
-            ),
-            storage_options=s3so,
-            bytes_per_thread=bytes_per_thread,
-            columns=columns,
-            use_python_file_object=use_python_file_object,
-        )
+        with pytest.warns(FutureWarning):
+            got1 = cudf.read_parquet(
+                f"s3://{bucket}/{fname}",
+                open_file_options=(
+                    {"precache_options": {"method": precache}}
+                    if use_python_file_object
+                    else None
+                ),
+                storage_options=s3so,
+                bytes_per_thread=bytes_per_thread,
+                columns=columns,
+                use_python_file_object=use_python_file_object,
+            )
     expect = pdf[columns] if columns else pdf
     assert_eq(expect, got1)
 
@@ -263,12 +268,13 @@ def test_read_parquet(
             f"s3://{bucket}/{fname}", storage_options=s3so
         )[0]
         with fs.open(f"s3://{bucket}/{fname}", mode="rb") as f:
-            got2 = cudf.read_parquet(
-                f,
-                bytes_per_thread=bytes_per_thread,
-                columns=columns,
-                use_python_file_object=use_python_file_object,
-            )
+            with pytest.warns(FutureWarning):
+                got2 = cudf.read_parquet(
+                    f,
+                    bytes_per_thread=bytes_per_thread,
+                    columns=columns,
+                    use_python_file_object=use_python_file_object,
+                )
     assert_eq(expect, got2)
 
 
@@ -353,11 +359,12 @@ def test_read_parquet_arrow_nativefile(s3_base, s3so, pdf, columns):
     pdf.to_parquet(path=buffer)
     buffer.seek(0)
     with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}):
-        fs = pa_fs.S3FileSystem(
-            endpoint_override=s3so["client_kwargs"]["endpoint_url"],
-        )
-        with fs.open_input_file(f"{bucket}/{fname}") as fil:
-            got = cudf.read_parquet(fil, columns=columns)
+        with pytest.warns(FutureWarning):
+            fs = pa_fs.S3FileSystem(
+                endpoint_override=s3so["client_kwargs"]["endpoint_url"],
+            )
+            with fs.open_input_file(f"{bucket}/{fname}") as fil:
+                got = cudf.read_parquet(fil, columns=columns)
 
     expect = pdf[columns] if columns else pdf
     assert_eq(expect, got)
@@ -372,12 +379,13 @@ def test_read_parquet_filters(s3_base, s3so, pdf_ext, precache):
     buffer.seek(0)
     filters = [("String", "==", "Omega")]
     with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}):
-        got = cudf.read_parquet(
-            f"s3://{bucket}/{fname}",
-            storage_options=s3so,
-            filters=filters,
-            open_file_options={"precache_options": {"method": precache}},
-        )
+        with pytest.warns(FutureWarning):
+            got = cudf.read_parquet(
+                f"s3://{bucket}/{fname}",
+                storage_options=s3so,
+                filters=filters,
+                open_file_options={"precache_options": {"method": precache}},
+            )
 
     # All row-groups should be filtered out
     assert_eq(pdf_ext.iloc[:0], got.reset_index(drop=True))
@@ -449,12 +457,13 @@ def test_read_orc(s3_base, s3so, datadir, use_python_file_object, columns):
         buffer = f.read()
 
     with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}):
-        got = cudf.read_orc(
-            f"s3://{bucket}/{fname}",
-            columns=columns,
-            storage_options=s3so,
-            use_python_file_object=use_python_file_object,
-        )
+        with pytest.warns(FutureWarning):
+            got = cudf.read_orc(
+                f"s3://{bucket}/{fname}",
+                columns=columns,
+                storage_options=s3so,
+                use_python_file_object=use_python_file_object,
+            )
 
     if columns:
         expect = expect[columns]
@@ -475,8 +484,9 @@ def test_read_orc_arrow_nativefile(s3_base, s3so, datadir, columns):
         fs = pa_fs.S3FileSystem(
             endpoint_override=s3so["client_kwargs"]["endpoint_url"],
         )
-        with fs.open_input_file(f"{bucket}/{fname}") as fil:
-            got = cudf.read_orc(fil, columns=columns)
+        with pytest.warns(FutureWarning):
+            with fs.open_input_file(f"{bucket}/{fname}") as fil:
+                got = cudf.read_orc(fil, columns=columns)
 
     if columns:
         expect = expect[columns]
diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
index 76c7f2bfdb8..80555750b3a 100644
--- a/python/cudf/cudf/utils/ioutils.py
+++ b/python/cudf/cudf/utils/ioutils.py
@@ -6,6 +6,7 @@
 import warnings
 from io import BufferedWriter, BytesIO, IOBase, TextIOWrapper
 from threading import Thread
+from typing import Callable
 
 import fsspec
 import fsspec.implementations.local
@@ -15,6 +16,7 @@
 from pyarrow import PythonFile as ArrowPythonFile
 from pyarrow.lib import NativeFile
 
+from cudf.api.extensions import no_default
 from cudf.core._compat import PANDAS_LT_300
 from cudf.utils.docutils import docfmt_partial
 
@@ -24,7 +26,6 @@
 except ImportError:
     fsspec_parquet = None
 
-
 _BYTES_PER_THREAD_DEFAULT = 256 * 1024 * 1024
 _ROW_GROUP_SIZE_BYTES_DEFAULT = 128 * 1024 * 1024
 
@@ -86,7 +87,7 @@
 1       20  rapids
 2       30      ai
 """.format(remote_data_sources=_docstring_remote_sources)
-doc_read_avro = docfmt_partial(docstring=_docstring_read_avro)
+doc_read_avro: Callable = docfmt_partial(docstring=_docstring_read_avro)
 
 _docstring_read_parquet_metadata = """
 Read a Parquet file's metadata and schema
@@ -174,15 +175,23 @@
     columns are also loaded.
 use_python_file_object : boolean, default True
     If True, Arrow-backed PythonFile objects will be used in place of fsspec
-    AbstractBufferedFile objects at IO time. Setting this argument to `False`
-    will require the entire file to be copied to host memory, and is highly
-    discouraged.
+    AbstractBufferedFile objects at IO time.
+
+    .. deprecated:: 24.08
+        `use_python_file_object` is deprecated and will be removed in a future
+        version of cudf, as PyArrow NativeFiles will no longer be accepted as
+        input/output in cudf readers/writers in the future.
 open_file_options : dict, optional
     Dictionary of key-value pairs to pass to the function used to open remote
     files. By default, this will be `fsspec.parquet.open_parquet_file`. To
     deactivate optimized precaching, set the "method" to `None` under the
     "precache_options" key. Note that the `open_file_func` key can also be
     used to specify a custom file-open function.
+
+    .. deprecated:: 24.08
+        `open_file_options` is deprecated as it was intended for
+        pyarrow file inputs, which will no longer be accepted as
+        input/output cudf readers/writers in the future.
 bytes_per_thread : int, default None
     Determines the number of bytes to be allocated per thread to read the
     files in parallel. When there is a file of large size, we get slightly
@@ -468,8 +477,12 @@
     If True, use row index if available for faster seeking.
 use_python_file_object : boolean, default True
     If True, Arrow-backed PythonFile objects will be used in place of fsspec
-    AbstractBufferedFile objects at IO time. This option is likely to improve
-    performance when making small reads from larger ORC files.
+    AbstractBufferedFile objects at IO time.
+
+    .. deprecated:: 24.08
+        `use_python_file_object` is deprecated and will be removed in a future
+        version of cudf, as PyArrow NativeFiles will no longer be accepted as
+        input/output in cudf readers/writers in the future.
 storage_options : dict, optional, default None
     Extra options that make sense for a particular storage connection,
     e.g. host, port, username, password, etc. For HTTP(S) URLs the key-value
@@ -934,7 +947,7 @@
 --------
 cudf.DataFrame.to_hdf : Write a HDF file from a DataFrame.
 """
-doc_read_hdf = docfmt_partial(docstring=_docstring_read_hdf)
+doc_read_hdf: Callable = docfmt_partial(docstring=_docstring_read_hdf)
 
 _docstring_to_hdf = """
 Write the contained data to an HDF5 file using HDFStore.
@@ -1006,7 +1019,7 @@
 cudf.DataFrame.to_parquet : Write a DataFrame to the binary parquet format.
 cudf.DataFrame.to_feather : Write out feather-format for DataFrames.
 """
-doc_to_hdf = docfmt_partial(docstring=_docstring_to_hdf)
+doc_to_hdf: Callable = docfmt_partial(docstring=_docstring_to_hdf)
 
 _docstring_read_feather = """
 Load an feather object from the file path, returning a DataFrame.
@@ -1188,8 +1201,12 @@
     the end of the range.
 use_python_file_object : boolean, default True
     If True, Arrow-backed PythonFile objects will be used in place of fsspec
-    AbstractBufferedFile objects at IO time. This option is likely to improve
-    performance when making small reads from larger CSV files.
+    AbstractBufferedFile objects at IO time.
+
+    .. deprecated:: 24.08
+        `use_python_file_object` is deprecated and will be removed in a future
+        version of cudf, as PyArrow NativeFiles will no longer be accepted as
+        input/output in cudf readers/writers in the future.
 storage_options : dict, optional, default None
     Extra options that make sense for a particular storage connection,
     e.g. host, port, username, password, etc. For HTTP(S) URLs the key-value
@@ -1409,7 +1426,7 @@
 result : Series
 
 """
-doc_read_text = docfmt_partial(docstring=_docstring_text_datasource)
+doc_read_text: Callable = docfmt_partial(docstring=_docstring_text_datasource)
 
 
 _docstring_get_reader_filepath_or_buffer = """
@@ -1430,9 +1447,19 @@
 use_python_file_object : boolean, default False
     If True, Arrow-backed PythonFile objects will be used in place
     of fsspec AbstractBufferedFile objects.
+
+    .. deprecated:: 24.08
+        `use_python_file_object` is deprecated and will be removed in a future
+        version of cudf, as PyArrow NativeFiles will no longer be accepted as
+        input/output in cudf readers/writers.
 open_file_options : dict, optional
     Optional dictionary of keyword arguments to pass to
     `_open_remote_files` (used for remote storage only).
+
+    .. deprecated:: 24.08
+        `open_file_options` is deprecated as it was intended for
+        pyarrow file inputs, which will no longer be accepted as
+        input/output cudf readers/writers in the future.
 allow_raw_text_input : boolean, default False
     If True, this indicates the input `path_or_data` could be a raw text
     input and will not check for its existence in the filesystem. If False,
@@ -1708,7 +1735,8 @@ def get_reader_filepath_or_buffer(
     mode="rb",
     fs=None,
     iotypes=(BytesIO, NativeFile),
-    use_python_file_object=False,
+    # no_default aliases to False
+    use_python_file_object=no_default,
     open_file_options=None,
     allow_raw_text_input=False,
     storage_options=None,
@@ -1720,6 +1748,30 @@ def get_reader_filepath_or_buffer(
 
     path_or_data = stringify_pathlike(path_or_data)
 
+    if use_python_file_object is no_default:
+        use_python_file_object = False
+    elif use_python_file_object is not None:
+        warnings.warn(
+            "The 'use_python_file_object' keyword is deprecated and "
+            "will be removed in a future version.",
+            FutureWarning,
+        )
+    else:
+        # Preserve the readers (e.g. read_csv) default of True
+        # if no use_python_file_object option is specified by the user
+        # for now (note: this is different from the default for this
+        # function of False)
+        # TODO: when non-pyarrow file reading perf is good enough
+        # we can default this to False
+        use_python_file_object = True
+
+    if open_file_options is not None:
+        warnings.warn(
+            "The 'open_file_options' keyword is deprecated and "
+            "will be removed in a future version.",
+            FutureWarning,
+        )
+
     if isinstance(path_or_data, str):
         # Get a filesystem object if one isn't already available
         paths = [path_or_data]
diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py
index 7347ec7866a..c9b343e0f9f 100644
--- a/python/cudf/cudf/utils/utils.py
+++ b/python/cudf/cudf/utils/utils.py
@@ -6,6 +6,7 @@
 import os
 import traceback
 import warnings
+from contextlib import contextmanager
 
 import numpy as np
 import pandas as pd
@@ -403,3 +404,28 @@ def _all_bools_with_nulls(lhs, rhs, bool_fill_value):
     if result_mask is not None:
         result_col = result_col.set_mask(result_mask.as_mask())
     return result_col
+
+
+@contextmanager
+def maybe_filter_deprecation(
+    condition: bool, message: str, category: type[Warning]
+):
+    """Conditionally filter a warning category.
+
+    Parameters
+    ----------
+    condition
+        If true, filter the warning
+    message
+        Message to match, passed to :func:`warnings.filterwarnings`
+    category
+        Category of warning, passed to :func:`warnings.filterwarnings`
+    """
+    with warnings.catch_warnings():
+        if condition:
+            warnings.filterwarnings(
+                "ignore",
+                message,
+                category=category,
+            )
+        yield
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_s3.py b/python/dask_cudf/dask_cudf/io/tests/test_s3.py
index a67404da4fe..3947c69aaa5 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_s3.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_s3.py
@@ -138,5 +138,7 @@ def test_read_parquet(s3_base, s3so, open_file_options):
             storage_options=s3so,
             open_file_options=open_file_options,
         )
-        assert df.a.sum().compute() == 10
-        assert df.b.sum().compute() == 9
+        with pytest.warns(FutureWarning):
+            assert df.a.sum().compute() == 10
+        with pytest.warns(FutureWarning):
+            assert df.b.sum().compute() == 9

From 75335f6af51bde6be68c1fb0a6caa8030b9eda3e Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Fri, 19 Jul 2024 18:21:27 -0700
Subject: [PATCH 555/842] Report number of rows per file read by PQ reader when
 no row selection and fix segfault in chunked PQ reader when skip_rows > 0
 (#16195)

Closes #15389
Closes #16186

This PR adds the capability to calculate and report the number of rows read from each data source into the table returned by the Parquet reader (both chunked and normal). The returned vector of counts is only valid (non-empty) when row selection (AST filter) is not being used.

This PR also fixes a segfault in chunked parquet reader when skip_rows > 0 and the number of passes > 1. This segfault was being caused by a couple of arithmetic errors when computing the (start_row, num_row)  for row_group_info, pass, column chunk descriptor structs.

Both changes were added to this PR as changes and the gtests from the former work were needed to implement the segfault fix.

Authors:
  - Muhammad Haseeb (https://github.com/mhaseeb123)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: https://github.com/rapidsai/cudf/pull/16195
---
 cpp/include/cudf/io/types.hpp                 |   3 +
 cpp/src/io/parquet/reader_impl.cpp            |  86 +++-
 cpp/src/io/parquet/reader_impl.hpp            |  31 +-
 cpp/src/io/parquet/reader_impl_chunking.cu    |  53 ++-
 cpp/src/io/parquet/reader_impl_chunking.hpp   |   6 +
 cpp/src/io/parquet/reader_impl_helpers.cpp    |  32 +-
 cpp/src/io/parquet/reader_impl_helpers.hpp    |  20 +-
 cpp/src/io/parquet/reader_impl_preprocess.cu  |  19 +-
 cpp/tests/io/parquet_chunked_reader_test.cu   | 385 ++++++++++++++++++
 cpp/tests/io/parquet_reader_test.cpp          | 203 +++++++++
 .../cudf/_lib/pylibcudf/libcudf/io/types.pxd  |   1 +
 11 files changed, 796 insertions(+), 43 deletions(-)

diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp
index 0c96268f6c7..431a5e7be83 100644
--- a/cpp/include/cudf/io/types.hpp
+++ b/cpp/include/cudf/io/types.hpp
@@ -277,6 +277,9 @@ struct column_name_info {
 struct table_metadata {
   std::vector<column_name_info>
     schema_info;  //!< Detailed name information for the entire output hierarchy
+  std::vector<size_t> num_rows_per_source;  //!< Number of rows read from each data source.
+                                            //!< Currently only computed for Parquet readers if no
+                                            //!< AST filters being used. Empty vector otherwise.
   std::map<std::string, std::string> user_data;  //!< Format-dependent metadata of the first input
                                                  //!< file as key-values pairs (deprecated)
   std::vector<std::unordered_map<std::string, std::string>>
diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp
index f705f6626e7..68ec61ead0a 100644
--- a/cpp/src/io/parquet/reader_impl.cpp
+++ b/cpp/src/io/parquet/reader_impl.cpp
@@ -26,6 +26,7 @@
 
 #include <rmm/resource_ref.hpp>
 
+#include <thrust/binary_search.h>
 #include <thrust/iterator/counting_iterator.h>
 
 #include <bitset>
@@ -549,7 +550,17 @@ table_with_metadata reader::impl::read_chunk_internal(read_mode mode)
   out_columns.reserve(_output_buffers.size());
 
   // no work to do (this can happen on the first pass if we have no rows to read)
-  if (!has_more_work()) { return finalize_output(out_metadata, out_columns); }
+  if (!has_more_work()) {
+    // Check if number of rows per source should be included in output metadata.
+    if (include_output_num_rows_per_source()) {
+      // Empty dataframe case: Simply initialize to a list of zeros
+      out_metadata.num_rows_per_source =
+        std::vector<size_t>(_file_itm_data.num_rows_per_source.size(), 0);
+    }
+
+    // Finalize output
+    return finalize_output(mode, out_metadata, out_columns);
+  }
 
   auto& pass            = *_pass_itm_data;
   auto& subpass         = *pass.subpass;
@@ -585,11 +596,80 @@ table_with_metadata reader::impl::read_chunk_internal(read_mode mode)
     }
   }
 
+  // Check if number of rows per source should be included in output metadata.
+  if (include_output_num_rows_per_source()) {
+    // For chunked reading, compute the output number of rows per source
+    if (mode == read_mode::CHUNKED_READ) {
+      out_metadata.num_rows_per_source =
+        calculate_output_num_rows_per_source(read_info.skip_rows, read_info.num_rows);
+    }
+    // Simply move the number of rows per file if reading all at once
+    else {
+      // Move is okay here as we are reading in one go.
+      out_metadata.num_rows_per_source = std::move(_file_itm_data.num_rows_per_source);
+    }
+  }
+
   // Add empty columns if needed. Filter output columns based on filter.
-  return finalize_output(out_metadata, out_columns);
+  return finalize_output(mode, out_metadata, out_columns);
+}
+
+std::vector<size_t> reader::impl::calculate_output_num_rows_per_source(size_t const chunk_start_row,
+                                                                       size_t const chunk_num_rows)
+{
+  // Handle base cases.
+  if (_file_itm_data.num_rows_per_source.size() == 0) {
+    return {};
+  } else if (_file_itm_data.num_rows_per_source.size() == 1) {
+    return {chunk_num_rows};
+  }
+
+  std::vector<size_t> num_rows_per_source(_file_itm_data.num_rows_per_source.size(), 0);
+
+  // Subtract global skip rows from the start_row as we took care of that when computing
+  // _file_itm_data.num_rows_per_source
+  auto const start_row = chunk_start_row - _file_itm_data.global_skip_rows;
+  auto const end_row   = start_row + chunk_num_rows;
+  CUDF_EXPECTS(start_row <= end_row and end_row <= _file_itm_data.global_num_rows,
+               "Encountered invalid output chunk row bounds.");
+
+  // Copy reference to a const local variable for better readability
+  auto const& partial_sum_nrows_source = _file_itm_data.exclusive_sum_num_rows_per_source;
+
+  // Binary search start_row and end_row in exclusive_sum_num_rows_per_source vector
+  auto const start_iter =
+    std::upper_bound(partial_sum_nrows_source.cbegin(), partial_sum_nrows_source.cend(), start_row);
+  auto const end_iter =
+    (end_row == _file_itm_data.global_skip_rows + _file_itm_data.global_num_rows)
+      ? partial_sum_nrows_source.cend() - 1
+      : std::upper_bound(start_iter, partial_sum_nrows_source.cend(), end_row);
+
+  // Compute the array offset index for both iterators
+  auto const start_idx = std::distance(partial_sum_nrows_source.cbegin(), start_iter);
+  auto const end_idx   = std::distance(partial_sum_nrows_source.cbegin(), end_iter);
+
+  CUDF_EXPECTS(start_idx <= end_idx,
+               "Encountered invalid source files indexes for output chunk row bounds");
+
+  // If the entire chunk is from the same source file, then the count is simply num_rows
+  if (start_idx == end_idx) {
+    num_rows_per_source[start_idx] = chunk_num_rows;
+  } else {
+    // Compute the number of rows from the first source file
+    num_rows_per_source[start_idx] = partial_sum_nrows_source[start_idx] - start_row;
+    // Compute the number of rows from the last source file
+    num_rows_per_source[end_idx] = end_row - partial_sum_nrows_source[end_idx - 1];
+    // Simply copy the number of rows for each source in range: (start_idx, end_idx)
+    std::copy(_file_itm_data.num_rows_per_source.cbegin() + start_idx + 1,
+              _file_itm_data.num_rows_per_source.cbegin() + end_idx,
+              num_rows_per_source.begin() + start_idx + 1);
+  }
+
+  return num_rows_per_source;
 }
 
-table_with_metadata reader::impl::finalize_output(table_metadata& out_metadata,
+table_with_metadata reader::impl::finalize_output(read_mode mode,
+                                                  table_metadata& out_metadata,
                                                   std::vector<std::unique_ptr<column>>& out_columns)
 {
   // Create empty columns as needed (this can happen if we've ended up with no actual data to read)
diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp
index 3b8e80a29e6..5e3cc4301f9 100644
--- a/cpp/src/io/parquet/reader_impl.hpp
+++ b/cpp/src/io/parquet/reader_impl.hpp
@@ -262,11 +262,13 @@ class reader::impl {
    * @brief Finalize the output table by adding empty columns for the non-selected columns in
    * schema.
    *
+   * @param read_mode Value indicating if the data sources are read all at once or chunk by chunk
    * @param out_metadata The output table metadata
    * @param out_columns The columns for building the output table
    * @return The output table along with columns' metadata
    */
-  table_with_metadata finalize_output(table_metadata& out_metadata,
+  table_with_metadata finalize_output(read_mode mode,
+                                      table_metadata& out_metadata,
                                       std::vector<std::unique_ptr<column>>& out_columns);
 
   /**
@@ -336,11 +338,36 @@ class reader::impl {
              : true;
   }
 
+  /**
+   * @brief Check if this is the first output chunk
+   *
+   * @return True if this is the first output chunk
+   */
   [[nodiscard]] bool is_first_output_chunk() const
   {
     return _file_itm_data._output_chunk_count == 0;
   }
 
+  /**
+   * @brief Check if number of rows per source should be included in output metadata.
+   *
+   * @return True if AST filter is not present
+   */
+  [[nodiscard]] bool include_output_num_rows_per_source() const
+  {
+    return not _expr_conv.get_converted_expr().has_value();
+  }
+
+  /**
+   * @brief Calculate the number of rows read from each source in the output chunk
+   *
+   * @param chunk_start_row The offset of the first row in the output chunk
+   * @param chunk_num_rows The number of rows in the the output chunk
+   * @return Vector of number of rows from each respective data source in the output chunk
+   */
+  [[nodiscard]] std::vector<size_t> calculate_output_num_rows_per_source(size_t chunk_start_row,
+                                                                         size_t chunk_num_rows);
+
   rmm::cuda_stream_view _stream;
   rmm::device_async_resource_ref _mr{rmm::mr::get_current_device_resource()};
 
@@ -387,7 +414,7 @@ class reader::impl {
 
   // chunked reading happens in 2 parts:
   //
-  // At the top level, the entire file is divided up into "passes" omn which we try and limit the
+  // At the top level, the entire file is divided up into "passes" on which we try and limit the
   // total amount of temporary memory (compressed data, decompressed data) in use
   // via _input_pass_read_limit.
   //
diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu
index 3da303e6928..05e0d8c0111 100644
--- a/cpp/src/io/parquet/reader_impl_chunking.cu
+++ b/cpp/src/io/parquet/reader_impl_chunking.cu
@@ -1232,22 +1232,22 @@ void reader::impl::setup_next_pass(read_mode mode)
       pass.skip_rows = _file_itm_data.global_skip_rows;
       pass.num_rows  = _file_itm_data.global_num_rows;
     } else {
-      auto const global_start_row = _file_itm_data.global_skip_rows;
-      auto const global_end_row   = global_start_row + _file_itm_data.global_num_rows;
-      auto const start_row =
-        std::max(_file_itm_data.input_pass_start_row_count[_file_itm_data._current_input_pass],
-                 global_start_row);
-      auto const end_row =
-        std::min(_file_itm_data.input_pass_start_row_count[_file_itm_data._current_input_pass + 1],
-                 global_end_row);
-
-      // skip_rows is always global in the sense that it is relative to the first row of
-      // everything we will be reading, regardless of what pass we are on.
-      // num_rows is how many rows we are reading this pass.
-      pass.skip_rows =
-        global_start_row +
+      // pass_start_row and pass_end_row are computed from the selected row groups relative to the
+      // global_skip_rows.
+      auto const pass_start_row =
         _file_itm_data.input_pass_start_row_count[_file_itm_data._current_input_pass];
-      pass.num_rows = end_row - start_row;
+      auto const pass_end_row =
+        std::min(_file_itm_data.input_pass_start_row_count[_file_itm_data._current_input_pass + 1],
+                 _file_itm_data.global_num_rows);
+
+      // pass.skip_rows is always global in the sense that it is relative to the first row of
+      // the data source (global row number 0), regardless of what pass we are on. Therefore,
+      // we must re-add global_skip_rows to the pass_start_row which is relative to the
+      // global_skip_rows.
+      pass.skip_rows = _file_itm_data.global_skip_rows + pass_start_row;
+      // num_rows is how many rows we are reading this pass. Since this is a difference, adding
+      // global_skip_rows to both variables is redundant.
+      pass.num_rows = pass_end_row - pass_start_row;
     }
 
     // load page information for the chunk. this retrieves the compressed bytes for all the
@@ -1509,6 +1509,7 @@ void reader::impl::create_global_chunk_info()
 
   // Initialize column chunk information
   auto remaining_rows = num_rows;
+  auto skip_rows      = _file_itm_data.global_skip_rows;
   for (auto const& rg : row_groups_info) {
     auto const& row_group      = _metadata->get_row_group(rg.index, rg.source_index);
     auto const row_group_start = rg.start_row;
@@ -1561,7 +1562,12 @@ void reader::impl::create_global_chunk_info()
                                        schema.type == BYTE_ARRAY and _strings_to_categorical));
     }
 
-    remaining_rows -= row_group_rows;
+    // Adjust for skip_rows when updating the remaining rows after the first group
+    remaining_rows -=
+      (skip_rows) ? std::min<int>(rg.start_row + row_group.num_rows - skip_rows, remaining_rows)
+                  : row_group_rows;
+    // Set skip_rows = 0 as it is no longer needed for subsequent row_groups
+    skip_rows = 0;
   }
 }
 
@@ -1598,6 +1604,9 @@ void reader::impl::compute_input_passes()
   _file_itm_data.input_pass_row_group_offsets.push_back(0);
   _file_itm_data.input_pass_start_row_count.push_back(0);
 
+  // To handle global_skip_rows when computing input passes
+  int skip_rows = _file_itm_data.global_skip_rows;
+
   for (size_t cur_rg_index = 0; cur_rg_index < row_groups_info.size(); cur_rg_index++) {
     auto const& rgi       = row_groups_info[cur_rg_index];
     auto const& row_group = _metadata->get_row_group(rgi.index, rgi.source_index);
@@ -1606,6 +1615,14 @@ void reader::impl::compute_input_passes()
     auto const [compressed_rg_size, _ /*compressed + uncompressed*/] =
       get_row_group_size(row_group);
 
+    // We must use the effective size of the first row group we are reading to accurately calculate
+    // the first non-zero input_pass_start_row_count.
+    auto const row_group_rows =
+      (skip_rows) ? rgi.start_row + row_group.num_rows - skip_rows : row_group.num_rows;
+
+    //  Set skip_rows = 0 as it is no longer needed for subsequent row_groups
+    skip_rows = 0;
+
     // can we add this row group
     if (cur_pass_byte_size + compressed_rg_size >= comp_read_limit) {
       // A single row group (the current one) is larger than the read limit:
@@ -1613,7 +1630,7 @@ void reader::impl::compute_input_passes()
       // row group
       if (cur_rg_start == cur_rg_index) {
         _file_itm_data.input_pass_row_group_offsets.push_back(cur_rg_index + 1);
-        _file_itm_data.input_pass_start_row_count.push_back(cur_row_count + row_group.num_rows);
+        _file_itm_data.input_pass_start_row_count.push_back(cur_row_count + row_group_rows);
         cur_rg_start       = cur_rg_index + 1;
         cur_pass_byte_size = 0;
       }
@@ -1627,7 +1644,7 @@ void reader::impl::compute_input_passes()
     } else {
       cur_pass_byte_size += compressed_rg_size;
     }
-    cur_row_count += row_group.num_rows;
+    cur_row_count += row_group_rows;
   }
 
   // add the last pass if necessary
diff --git a/cpp/src/io/parquet/reader_impl_chunking.hpp b/cpp/src/io/parquet/reader_impl_chunking.hpp
index b959c793011..3a3cdd34a58 100644
--- a/cpp/src/io/parquet/reader_impl_chunking.hpp
+++ b/cpp/src/io/parquet/reader_impl_chunking.hpp
@@ -41,6 +41,12 @@ struct file_intermediate_data {
   // is not capped by global_skip_rows and global_num_rows.
   std::vector<std::size_t> input_pass_start_row_count{};
 
+  // number of rows to be read from each data source
+  std::vector<std::size_t> num_rows_per_source{};
+
+  // partial sum of the number of rows per data source
+  std::vector<std::size_t> exclusive_sum_num_rows_per_source{};
+
   size_t _current_input_pass{0};  // current input pass index
   size_t _output_chunk_count{0};  // how many output chunks we have produced
 
diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp
index d1e9a823d3b..581c44d024b 100644
--- a/cpp/src/io/parquet/reader_impl_helpers.cpp
+++ b/cpp/src/io/parquet/reader_impl_helpers.cpp
@@ -945,7 +945,7 @@ std::vector<std::string> aggregate_reader_metadata::get_pandas_index_names() con
   return names;
 }
 
-std::tuple<int64_t, size_type, std::vector<row_group_info>>
+std::tuple<int64_t, size_type, std::vector<row_group_info>, std::vector<size_t>>
 aggregate_reader_metadata::select_row_groups(
   host_span<std::vector<size_type> const> row_group_indices,
   int64_t skip_rows_opt,
@@ -976,6 +976,9 @@ aggregate_reader_metadata::select_row_groups(
                      static_cast<size_type>(from_opts.second)};
   }();
 
+  // Get number of rows in each data source
+  std::vector<size_t> num_rows_per_source(per_file_metadata.size(), 0);
+
   if (!row_group_indices.empty()) {
     CUDF_EXPECTS(row_group_indices.size() == per_file_metadata.size(),
                  "Must specify row groups for each source");
@@ -989,28 +992,45 @@ aggregate_reader_metadata::select_row_groups(
         selection.emplace_back(rowgroup_idx, rows_to_read, src_idx);
         // if page-level indexes are present, then collect extra chunk and page info.
         column_info_for_row_group(selection.back(), 0);
-        rows_to_read += get_row_group(rowgroup_idx, src_idx).num_rows;
+        auto const rows_this_rg = get_row_group(rowgroup_idx, src_idx).num_rows;
+        rows_to_read += rows_this_rg;
+        num_rows_per_source[src_idx] += rows_this_rg;
       }
     }
   } else {
     size_type count = 0;
     for (size_t src_idx = 0; src_idx < per_file_metadata.size(); ++src_idx) {
       auto const& fmd = per_file_metadata[src_idx];
-      for (size_t rg_idx = 0; rg_idx < fmd.row_groups.size(); ++rg_idx) {
+      for (size_t rg_idx = 0;
+           rg_idx < fmd.row_groups.size() and count < rows_to_skip + rows_to_read;
+           ++rg_idx) {
         auto const& rg             = fmd.row_groups[rg_idx];
         auto const chunk_start_row = count;
         count += rg.num_rows;
         if (count > rows_to_skip || count == 0) {
+          // start row of this row group adjusted with rows_to_skip
+          num_rows_per_source[src_idx] += count;
+          num_rows_per_source[src_idx] -=
+            (chunk_start_row <= rows_to_skip) ? rows_to_skip : chunk_start_row;
+
+          // We need the unadjusted start index of this row group to correctly initialize
+          // ColumnChunkDesc for this row group in create_global_chunk_info() and calculate
+          // the row offset for the first pass in compute_input_passes().
           selection.emplace_back(rg_idx, chunk_start_row, src_idx);
-          // if page-level indexes are present, then collect extra chunk and page info.
+
+          // If page-level indexes are present, then collect extra chunk and page info.
+          // The page indexes rely on absolute row numbers, not adjusted for skip_rows.
           column_info_for_row_group(selection.back(), chunk_start_row);
         }
-        if (count >= rows_to_skip + rows_to_read) { break; }
+        // Adjust the number of rows for the last source file.
+        if (count >= rows_to_skip + rows_to_read) {
+          num_rows_per_source[src_idx] -= count - rows_to_skip - rows_to_read;
+        }
       }
     }
   }
 
-  return {rows_to_skip, rows_to_read, std::move(selection)};
+  return {rows_to_skip, rows_to_read, std::move(selection), std::move(num_rows_per_source)};
 }
 
 std::tuple<std::vector<input_column_info>,
diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp
index 6bfa8519c76..309132a5347 100644
--- a/cpp/src/io/parquet/reader_impl_helpers.hpp
+++ b/cpp/src/io/parquet/reader_impl_helpers.hpp
@@ -282,17 +282,17 @@ class aggregate_reader_metadata {
    * @param output_column_schemas schema indices of output columns
    * @param filter Optional AST expression to filter row groups based on Column chunk statistics
    * @param stream CUDA stream used for device memory operations and kernel launches
-   * @return A tuple of corrected row_start, row_count and list of row group indexes and its
-   *         starting row
+   * @return A tuple of corrected row_start, row_count, list of row group indexes and its
+   *         starting row, and list of number of rows per source.
    */
-  [[nodiscard]] std::tuple<int64_t, size_type, std::vector<row_group_info>> select_row_groups(
-    host_span<std::vector<size_type> const> row_group_indices,
-    int64_t row_start,
-    std::optional<size_type> const& row_count,
-    host_span<data_type const> output_dtypes,
-    host_span<int const> output_column_schemas,
-    std::optional<std::reference_wrapper<ast::expression const>> filter,
-    rmm::cuda_stream_view stream) const;
+  [[nodiscard]] std::tuple<int64_t, size_type, std::vector<row_group_info>, std::vector<size_t>>
+  select_row_groups(host_span<std::vector<size_type> const> row_group_indices,
+                    int64_t row_start,
+                    std::optional<size_type> const& row_count,
+                    host_span<data_type const> output_dtypes,
+                    host_span<int const> output_column_schemas,
+                    std::optional<std::reference_wrapper<ast::expression const>> filter,
+                    rmm::cuda_stream_view stream) const;
 
   /**
    * @brief Filters and reduces down to a selection of columns
diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu
index f28a7311ccb..ff47dfc4cf3 100644
--- a/cpp/src/io/parquet/reader_impl_preprocess.cu
+++ b/cpp/src/io/parquet/reader_impl_preprocess.cu
@@ -1235,8 +1235,10 @@ void reader::impl::preprocess_file(read_mode mode)
                    [](auto const& col) { return col.type; });
   }
 
-  std::tie(
-    _file_itm_data.global_skip_rows, _file_itm_data.global_num_rows, _file_itm_data.row_groups) =
+  std::tie(_file_itm_data.global_skip_rows,
+           _file_itm_data.global_num_rows,
+           _file_itm_data.row_groups,
+           _file_itm_data.num_rows_per_source) =
     _metadata->select_row_groups(_options.row_group_indices,
                                  _options.skip_rows,
                                  _options.num_rows,
@@ -1245,9 +1247,18 @@ void reader::impl::preprocess_file(read_mode mode)
                                  _expr_conv.get_converted_expr(),
                                  _stream);
 
+  // Inclusive scan the number of rows per source
+  if (not _expr_conv.get_converted_expr().has_value() and mode == read_mode::CHUNKED_READ) {
+    _file_itm_data.exclusive_sum_num_rows_per_source.resize(
+      _file_itm_data.num_rows_per_source.size());
+    thrust::inclusive_scan(_file_itm_data.num_rows_per_source.cbegin(),
+                           _file_itm_data.num_rows_per_source.cend(),
+                           _file_itm_data.exclusive_sum_num_rows_per_source.begin());
+  }
+
   // check for page indexes
-  _has_page_index = std::all_of(_file_itm_data.row_groups.begin(),
-                                _file_itm_data.row_groups.end(),
+  _has_page_index = std::all_of(_file_itm_data.row_groups.cbegin(),
+                                _file_itm_data.row_groups.cend(),
                                 [](auto const& row_group) { return row_group.has_page_index(); });
 
   if (_file_itm_data.global_num_rows > 0 && not _file_itm_data.row_groups.empty() &&
diff --git a/cpp/tests/io/parquet_chunked_reader_test.cu b/cpp/tests/io/parquet_chunked_reader_test.cu
index cff85647725..2917852235c 100644
--- a/cpp/tests/io/parquet_chunked_reader_test.cu
+++ b/cpp/tests/io/parquet_chunked_reader_test.cu
@@ -149,6 +149,33 @@ auto chunked_read(std::string const& filepath,
   return chunked_read(vpath, output_limit, input_limit);
 }
 
+auto const read_table_and_nrows_per_source(cudf::io::chunked_parquet_reader const& reader)
+{
+  auto out_tables       = std::vector<std::unique_ptr<cudf::table>>{};
+  int num_chunks        = 0;
+  auto nrows_per_source = std::vector<size_t>{};
+  while (reader.has_next()) {
+    auto chunk = reader.read_chunk();
+    out_tables.emplace_back(std::move(chunk.tbl));
+    num_chunks++;
+    if (nrows_per_source.empty()) {
+      nrows_per_source = std::move(chunk.metadata.num_rows_per_source);
+    } else {
+      std::transform(chunk.metadata.num_rows_per_source.cbegin(),
+                     chunk.metadata.num_rows_per_source.cend(),
+                     nrows_per_source.begin(),
+                     nrows_per_source.begin(),
+                     std::plus<size_t>());
+    }
+  }
+  auto out_tviews = std::vector<cudf::table_view>{};
+  for (auto const& tbl : out_tables) {
+    out_tviews.emplace_back(tbl->view());
+  }
+
+  return std::tuple(cudf::concatenate(out_tviews), num_chunks, nrows_per_source);
+}
+
 }  // namespace
 
 struct ParquetChunkedReaderTest : public cudf::test::BaseFixture {};
@@ -1477,3 +1504,361 @@ TEST_F(ParquetChunkedReaderTest, TestChunkedReadOutOfBoundChunks)
     CUDF_TEST_EXPECT_TABLES_EQUAL(*expected, *result);
   }
 }
+
+TEST_F(ParquetChunkedReaderTest, TestNumRowsPerSource)
+{
+  constexpr int num_rows          = 10'723;  // A prime number
+  constexpr int rows_in_row_group = 500;
+
+  // Table with single col of random int64 values
+  auto const int64_data = random_values<int64_t>(num_rows);
+  auto int64_col        = int64s_col(int64_data.begin(), int64_data.end()).release();
+
+  std::vector<std::unique_ptr<cudf::column>> input_columns;
+  input_columns.emplace_back(std::move(int64_col));
+
+  // Write to Parquet
+  auto const [expected, filepath] = write_file(input_columns,
+                                               "num_rows_per_source",
+                                               false,
+                                               false,
+                                               cudf::io::default_max_page_size_bytes,
+                                               rows_in_row_group);
+
+  // Chunked-read single data source entirely
+  {
+    auto constexpr output_read_limit = 1'500;
+    auto constexpr pass_read_limit   = 3'500;
+
+    auto const options =
+      cudf::io::parquet_reader_options_builder(cudf::io::source_info{filepath}).build();
+    auto const reader = cudf::io::chunked_parquet_reader(
+      output_read_limit, pass_read_limit, options, cudf::get_default_stream());
+
+    auto const [result, num_chunks, num_rows_per_source] = read_table_and_nrows_per_source(reader);
+
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected->view(), result->view());
+    EXPECT_EQ(num_rows_per_source.size(), 1);
+    EXPECT_EQ(num_rows_per_source[0], num_rows);
+  }
+
+  // Chunked-read rows_to_read rows skipping rows_to_skip from single data source
+  {
+    auto const rows_to_skip          = 1'237;
+    auto const rows_to_read          = 7'232;
+    auto constexpr output_read_limit = 1'500;
+    auto constexpr pass_read_limit   = 3'500;
+
+    auto const options = cudf::io::parquet_reader_options_builder(cudf::io::source_info{filepath})
+                           .skip_rows(rows_to_skip)
+                           .num_rows(rows_to_read)
+                           .build();
+    auto const reader = cudf::io::chunked_parquet_reader(
+      output_read_limit, pass_read_limit, options, cudf::get_default_stream());
+
+    auto const [result, num_chunks, num_rows_per_source] = read_table_and_nrows_per_source(reader);
+
+    auto int64_col_selected = int64s_col(int64_data.begin() + rows_to_skip,
+                                         int64_data.begin() + rows_to_skip + rows_to_read)
+                                .release();
+
+    cudf::table_view const expected_selected({int64_col_selected->view()});
+
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected_selected, result->view());
+    EXPECT_EQ(num_rows_per_source.size(), 1);
+    EXPECT_EQ(num_rows_per_source[0], rows_to_read);
+  }
+
+  // Chunked-read two data sources skipping the first entire file completely
+  {
+    auto constexpr rows_to_skip      = 15'723;
+    auto constexpr output_read_limit = 1'024'000;
+    auto constexpr pass_read_limit   = 1'024'000;
+
+    auto constexpr nsources = 2;
+    std::vector<std::string> const datasources(nsources, filepath);
+
+    auto const options =
+      cudf::io::parquet_reader_options_builder(cudf::io::source_info{datasources})
+        .skip_rows(rows_to_skip)
+        .build();
+
+    auto const reader = cudf::io::chunked_parquet_reader(
+      output_read_limit, pass_read_limit, options, cudf::get_default_stream());
+
+    auto const [result, num_chunks, num_rows_per_source] = read_table_and_nrows_per_source(reader);
+
+    auto int64_col_selected =
+      int64s_col(int64_data.begin() + rows_to_skip - num_rows, int64_data.end()).release();
+
+    cudf::table_view const expected_selected({int64_col_selected->view()});
+
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected_selected, result->view());
+    EXPECT_EQ(num_rows_per_source.size(), 2);
+    EXPECT_EQ(num_rows_per_source[0], 0);
+    EXPECT_EQ(num_rows_per_source[1], nsources * num_rows - rows_to_skip);
+  }
+
+  // Chunked-read from single data source skipping rows_to_skip
+  {
+    auto const rows_to_skip          = 1'237;
+    auto constexpr output_read_limit = 1'500;
+    auto constexpr pass_read_limit   = 1'800;
+
+    auto const options = cudf::io::parquet_reader_options_builder(cudf::io::source_info{filepath})
+                           .skip_rows(rows_to_skip)
+                           .build();
+    auto const reader = cudf::io::chunked_parquet_reader(
+      output_read_limit, pass_read_limit, options, cudf::get_default_stream());
+
+    auto const [result, num_chunks, num_rows_per_source] = read_table_and_nrows_per_source(reader);
+
+    auto int64_col_selected =
+      int64s_col(int64_data.begin() + rows_to_skip, int64_data.end()).release();
+
+    cudf::table_view const expected_selected({int64_col_selected->view()});
+
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected_selected, result->view());
+    EXPECT_EQ(num_rows_per_source.size(), 1);
+    EXPECT_EQ(num_rows_per_source[0], num_rows - rows_to_skip);
+  }
+
+  // Filtered chunked-read from single data source
+  {
+    int64_t const max_value          = int64_data[int64_data.size() / 2];
+    auto constexpr output_read_limit = 1'500;
+    auto constexpr pass_read_limit   = 3'500;
+    auto literal_value               = cudf::numeric_scalar<int64_t>{max_value};
+    auto literal                     = cudf::ast::literal{literal_value};
+    auto col_ref                     = cudf::ast::column_reference(0);
+    auto filter_expression =
+      cudf::ast::operation(cudf::ast::ast_operator::LESS_EQUAL, col_ref, literal);
+
+    auto const options = cudf::io::parquet_reader_options_builder(cudf::io::source_info{filepath})
+                           .filter(filter_expression)
+                           .build();
+    auto const reader = cudf::io::chunked_parquet_reader(
+      output_read_limit, pass_read_limit, options, cudf::get_default_stream());
+
+    auto const [result, num_chunks, num_rows_per_source] = read_table_and_nrows_per_source(reader);
+
+    std::vector<int64_t> int64_data_filtered;
+    int64_data_filtered.reserve(num_rows);
+    std::copy_if(
+      int64_data.begin(), int64_data.end(), std::back_inserter(int64_data_filtered), [=](auto val) {
+        return val <= max_value;
+      });
+
+    auto int64_col_filtered =
+      int64s_col(int64_data_filtered.begin(), int64_data_filtered.end()).release();
+
+    cudf::table_view expected_filtered({int64_col_filtered->view()});
+
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected_filtered, result->view());
+    EXPECT_TRUE(num_rows_per_source.empty());
+  }
+}
+
+TEST_F(ParquetChunkedReaderTest, TestNumRowsPerSourceMultipleSources)
+{
+  constexpr int num_rows          = 10'723;  // A prime number
+  constexpr int rows_in_row_group = 500;
+
+  // Table with single col of random int64 values
+  auto const int64_data = random_values<int64_t>(num_rows);
+  auto int64_col        = int64s_col(int64_data.begin(), int64_data.end()).release();
+
+  std::vector<std::unique_ptr<cudf::column>> input_columns;
+  input_columns.emplace_back(std::move(int64_col));
+
+  // Write to Parquet
+  auto const [expected, filepath] = write_file(input_columns,
+                                               "num_rows_per_source",
+                                               false,
+                                               false,
+                                               cudf::io::default_max_page_size_bytes,
+                                               rows_in_row_group);
+
+  // Function to initialize a vector of expected counts per source
+  auto initialize_expected_counts =
+    [](int const nsources, int const num_rows, int const rows_to_skip, int const rows_to_read) {
+      // Initialize expected_counts
+      std::vector<size_t> expected_counts(nsources, num_rows);
+
+      // Adjust expected_counts for rows_to_skip
+      int64_t counter = 0;
+      for (auto& nrows : expected_counts) {
+        if (counter < rows_to_skip) {
+          counter += nrows;
+          nrows = (counter >= rows_to_skip) ? counter - rows_to_skip : 0;
+        } else {
+          break;
+        }
+      }
+
+      // Reset the counter
+      counter = 0;
+
+      // Adjust expected_counts for rows_to_read
+      for (auto& nrows : expected_counts) {
+        if (counter < rows_to_read) {
+          counter += nrows;
+          nrows = (counter >= rows_to_read) ? rows_to_read - counter + nrows : nrows;
+        } else if (counter > rows_to_read) {
+          nrows = 0;
+        }
+      }
+
+      return expected_counts;
+    };
+
+  // Chunked-read six data sources entirely
+  {
+    auto const nsources              = 6;
+    auto constexpr output_read_limit = 15'000;
+    auto constexpr pass_read_limit   = 35'000;
+    std::vector<std::string> const datasources(nsources, filepath);
+
+    auto const options =
+      cudf::io::parquet_reader_options_builder(cudf::io::source_info{datasources}).build();
+    auto const reader = cudf::io::chunked_parquet_reader(
+      output_read_limit, pass_read_limit, options, cudf::get_default_stream());
+
+    auto const [result, num_chunks, num_rows_per_source] = read_table_and_nrows_per_source(reader);
+
+    // Initialize expected_counts
+    std::vector<size_t> const expected_counts(nsources, num_rows);
+
+    EXPECT_EQ(num_rows_per_source.size(), nsources);
+    EXPECT_TRUE(
+      std::equal(expected_counts.cbegin(), expected_counts.cend(), num_rows_per_source.cbegin()));
+  }
+
+  // Chunked-read rows_to_read rows skipping rows_to_skip from eight data sources
+  {
+    auto const rows_to_skip          = 25'571;
+    auto const rows_to_read          = 41'232;
+    auto constexpr output_read_limit = 15'000;
+    auto constexpr pass_read_limit   = 35'000;
+    auto const nsources              = 8;
+    std::vector<int64_t> int64_selected_data{};
+    int64_selected_data.reserve(nsources * num_rows);
+
+    std::for_each(
+      thrust::make_counting_iterator(0),
+      thrust::make_counting_iterator(nsources),
+      [&](auto const i) {
+        std::copy(int64_data.begin(), int64_data.end(), std::back_inserter(int64_selected_data));
+      });
+
+    std::vector<std::string> const datasources(nsources, filepath);
+
+    auto const options =
+      cudf::io::parquet_reader_options_builder(cudf::io::source_info{datasources})
+        .skip_rows(rows_to_skip)
+        .num_rows(rows_to_read)
+        .build();
+    auto const reader = cudf::io::chunked_parquet_reader(
+      output_read_limit, pass_read_limit, options, cudf::get_default_stream());
+
+    auto const [result, num_chunks, num_rows_per_source] = read_table_and_nrows_per_source(reader);
+
+    // Initialize expected_counts
+    auto const expected_counts =
+      initialize_expected_counts(nsources, num_rows, rows_to_skip, rows_to_read);
+
+    // Initialize expected table
+    auto int64_col_selected = int64s_col(int64_selected_data.begin() + rows_to_skip,
+                                         int64_selected_data.begin() + +rows_to_skip + rows_to_read)
+                                .release();
+
+    cudf::table_view const expected_selected({int64_col_selected->view()});
+
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected_selected, result->view());
+    EXPECT_EQ(num_rows_per_source.size(), nsources);
+    EXPECT_TRUE(
+      std::equal(expected_counts.cbegin(), expected_counts.cend(), num_rows_per_source.cbegin()));
+  }
+
+  // Chunked-read four data sources skipping three files completely
+  {
+    auto const nsources              = 4;
+    int constexpr rows_to_skip       = num_rows * 3 + 1;
+    auto constexpr output_read_limit = 15'000;
+    auto constexpr pass_read_limit   = 35'000;
+    std::vector<int64_t> int64_selected_data{};
+    int64_selected_data.reserve(nsources * num_rows);
+
+    std::for_each(
+      thrust::make_counting_iterator(0),
+      thrust::make_counting_iterator(nsources),
+      [&](auto const i) {
+        std::copy(int64_data.begin(), int64_data.end(), std::back_inserter(int64_selected_data));
+      });
+
+    std::vector<std::string> const datasources(nsources, filepath);
+    auto const options =
+      cudf::io::parquet_reader_options_builder(cudf::io::source_info{datasources})
+        .skip_rows(rows_to_skip)
+        .build();
+    auto const reader = cudf::io::chunked_parquet_reader(
+      output_read_limit, pass_read_limit, options, cudf::get_default_stream());
+
+    auto const [result, num_chunks, num_rows_per_source] = read_table_and_nrows_per_source(reader);
+
+    // Initialize expected_counts
+    auto const expected_counts =
+      initialize_expected_counts(nsources, num_rows, rows_to_skip, num_rows * nsources);
+
+    // Initialize expected table
+    auto int64_col_selected =
+      int64s_col(int64_selected_data.begin() + rows_to_skip, int64_selected_data.end()).release();
+
+    cudf::table_view const expected_selected({int64_col_selected->view()});
+
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected_selected, result->view());
+    EXPECT_EQ(num_rows_per_source.size(), nsources);
+    EXPECT_TRUE(
+      std::equal(expected_counts.cbegin(), expected_counts.cend(), num_rows_per_source.cbegin()));
+  }
+}
+
+TEST_F(ParquetChunkedReaderTest, TestNumRowsPerSourceEmptyTable)
+{
+  auto constexpr output_read_limit = 4'500;
+  auto constexpr pass_read_limit   = 8'500;
+  auto const nsources              = 10;
+
+  // Table with single col of random int64 values
+  auto int64_empty_col = int64s_col{}.release();
+
+  std::vector<std::unique_ptr<cudf::column>> input_empty_columns;
+  input_empty_columns.emplace_back(std::move(int64_empty_col));
+
+  // Write to Parquet
+  auto const [expected_empty, filepath_empty] = write_file(input_empty_columns,
+                                                           "num_rows_per_source_empty",
+                                                           false,
+                                                           false,
+                                                           cudf::io::default_max_page_size_bytes,
+                                                           500);
+
+  std::vector<std::string> const datasources(nsources, filepath_empty);
+
+  auto const options =
+    cudf::io::parquet_reader_options_builder(cudf::io::source_info{datasources}).build();
+  auto const reader = cudf::io::chunked_parquet_reader(
+    output_read_limit, pass_read_limit, options, cudf::get_default_stream());
+
+  auto const [result, num_chunks, num_rows_per_source] = read_table_and_nrows_per_source(reader);
+
+  // Initialize expected_counts
+  std::vector<size_t> const expected_counts(nsources, 0);
+
+  CUDF_TEST_EXPECT_TABLES_EQUAL(expected_empty->view(), result->view());
+
+  EXPECT_EQ(num_chunks, 1);
+  EXPECT_EQ(num_rows_per_source.size(), nsources);
+  EXPECT_TRUE(
+    std::equal(expected_counts.cbegin(), expected_counts.cend(), num_rows_per_source.cbegin()));
+}
diff --git a/cpp/tests/io/parquet_reader_test.cpp b/cpp/tests/io/parquet_reader_test.cpp
index 2edf9e0aee6..6c61535359f 100644
--- a/cpp/tests/io/parquet_reader_test.cpp
+++ b/cpp/tests/io/parquet_reader_test.cpp
@@ -2243,6 +2243,209 @@ TEST_F(ParquetReaderTest, StringsWithPageStats)
   }
 }
 
+TEST_F(ParquetReaderTest, NumRowsPerSource)
+{
+  int constexpr num_rows          = 10'723;  // A prime number
+  int constexpr rows_in_row_group = 500;
+
+  // Table with single col of random int64 values
+  auto const int64_data = random_values<int64_t>(num_rows);
+  column_wrapper<int64_t> const int64_col{
+    int64_data.begin(), int64_data.end(), cudf::test::iterators::no_nulls()};
+  cudf::table_view const expected({int64_col});
+
+  // Write to Parquet
+  auto const filepath = temp_env->get_temp_filepath("NumRowsPerSource.parquet");
+  auto const out_opts =
+    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath}, expected)
+      .row_group_size_rows(rows_in_row_group)
+      .build();
+  cudf::io::write_parquet(out_opts);
+
+  // Read single data source entirely
+  {
+    auto const in_opts =
+      cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath}).build();
+    auto const result = cudf::io::read_parquet(in_opts);
+
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, result.tbl->view());
+    EXPECT_EQ(result.metadata.num_rows_per_source.size(), 1);
+    EXPECT_EQ(result.metadata.num_rows_per_source[0], num_rows);
+  }
+
+  // Read rows_to_read rows skipping rows_to_skip from single data source
+  {
+    auto constexpr rows_to_skip = 557;  // a prime number != rows_in_row_group
+    auto constexpr rows_to_read = 7'232;
+    auto const in_opts = cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath})
+                           .skip_rows(rows_to_skip)
+                           .num_rows(rows_to_read)
+                           .build();
+    auto const result = cudf::io::read_parquet(in_opts);
+    column_wrapper<int64_t> int64_col_selected{int64_data.begin() + rows_to_skip,
+                                               int64_data.begin() + rows_to_skip + rows_to_read,
+                                               cudf::test::iterators::no_nulls()};
+
+    cudf::table_view const expected_selected({int64_col_selected});
+
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected_selected, result.tbl->view());
+    EXPECT_EQ(result.metadata.num_rows_per_source.size(), 1);
+    EXPECT_EQ(result.metadata.num_rows_per_source[0], rows_to_read);
+  }
+
+  // Filtered read from single data source
+  {
+    auto constexpr max_value = 100;
+    auto literal_value       = cudf::numeric_scalar<int64_t>{max_value};
+    auto literal             = cudf::ast::literal{literal_value};
+    auto col_ref             = cudf::ast::column_reference(0);
+    auto filter_expression =
+      cudf::ast::operation(cudf::ast::ast_operator::LESS_EQUAL, col_ref, literal);
+
+    auto const in_opts = cudf::io::parquet_reader_options::builder(cudf::io::source_info{filepath})
+                           .filter(filter_expression)
+                           .build();
+
+    std::vector<int64_t> int64_data_filtered;
+    int64_data_filtered.reserve(num_rows);
+    std::copy_if(
+      int64_data.begin(), int64_data.end(), std::back_inserter(int64_data_filtered), [=](auto val) {
+        return val <= max_value;
+      });
+    column_wrapper<int64_t> int64_col_filtered{
+      int64_data_filtered.begin(), int64_data_filtered.end(), cudf::test::iterators::no_nulls()};
+
+    cudf::table_view expected_filtered({int64_col_filtered});
+
+    auto const result = cudf::io::read_parquet(in_opts);
+
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected_filtered, result.tbl->view());
+    EXPECT_EQ(result.metadata.num_rows_per_source.size(), 0);
+  }
+
+  // Read two data sources skipping the first entire file completely
+  {
+    auto constexpr rows_to_skip = 15'723;
+    auto constexpr nsources     = 2;
+    std::vector<std::string> const datasources(nsources, filepath);
+
+    auto const in_opts =
+      cudf::io::parquet_reader_options::builder(cudf::io::source_info{datasources})
+        .skip_rows(rows_to_skip)
+        .build();
+
+    auto const result = cudf::io::read_parquet(in_opts);
+
+    column_wrapper<int64_t> int64_col_selected{int64_data.begin() + rows_to_skip - num_rows,
+                                               int64_data.end(),
+                                               cudf::test::iterators::no_nulls()};
+
+    cudf::table_view const expected_selected({int64_col_selected});
+
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected_selected, result.tbl->view());
+    EXPECT_EQ(result.metadata.num_rows_per_source.size(), 2);
+    EXPECT_EQ(result.metadata.num_rows_per_source[0], 0);
+    EXPECT_EQ(result.metadata.num_rows_per_source[1], nsources * num_rows - rows_to_skip);
+  }
+
+  // Read ten data sources entirely
+  {
+    auto constexpr nsources = 10;
+    std::vector<std::string> const datasources(nsources, filepath);
+
+    auto const in_opts =
+      cudf::io::parquet_reader_options::builder(cudf::io::source_info{datasources}).build();
+    auto const result = cudf::io::read_parquet(in_opts);
+
+    // Initialize expected_counts
+    std::vector<size_t> const expected_counts(nsources, num_rows);
+
+    EXPECT_EQ(result.metadata.num_rows_per_source.size(), nsources);
+    EXPECT_TRUE(std::equal(expected_counts.cbegin(),
+                           expected_counts.cend(),
+                           result.metadata.num_rows_per_source.cbegin()));
+  }
+
+  // Read rows_to_read rows skipping rows_to_skip (> two sources) from ten data sources
+  {
+    auto constexpr rows_to_skip = 25'999;
+    auto constexpr rows_to_read = 47'232;
+
+    auto constexpr nsources = 10;
+    std::vector<std::string> const datasources(nsources, filepath);
+
+    auto const in_opts =
+      cudf::io::parquet_reader_options::builder(cudf::io::source_info{datasources})
+        .skip_rows(rows_to_skip)
+        .num_rows(rows_to_read)
+        .build();
+
+    auto const result = cudf::io::read_parquet(in_opts);
+
+    // Initialize expected_counts
+    std::vector<size_t> expected_counts(nsources, num_rows);
+
+    // Adjust expected_counts for rows_to_skip
+    int64_t counter = 0;
+    for (auto& nrows : expected_counts) {
+      if (counter < rows_to_skip) {
+        counter += nrows;
+        nrows = (counter >= rows_to_skip) ? counter - rows_to_skip : 0;
+      } else {
+        break;
+      }
+    }
+
+    // Reset the counter
+    counter = 0;
+
+    // Adjust expected_counts for rows_to_read
+    for (auto& nrows : expected_counts) {
+      if (counter < rows_to_read) {
+        counter += nrows;
+        nrows = (counter >= rows_to_read) ? rows_to_read - counter + nrows : nrows;
+      } else if (counter > rows_to_read) {
+        nrows = 0;
+      }
+    }
+
+    EXPECT_EQ(result.metadata.num_rows_per_source.size(), nsources);
+    EXPECT_TRUE(std::equal(expected_counts.cbegin(),
+                           expected_counts.cend(),
+                           result.metadata.num_rows_per_source.cbegin()));
+  }
+}
+
+TEST_F(ParquetReaderTest, NumRowsPerSourceEmptyTable)
+{
+  auto const nsources = 10;
+
+  column_wrapper<int64_t> const int64_empty_col{};
+  cudf::table_view const expected_empty({int64_empty_col});
+
+  // Write to Parquet
+  auto const filepath_empty = temp_env->get_temp_filepath("NumRowsPerSourceEmpty.parquet");
+  auto const out_opts =
+    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{filepath_empty}, expected_empty)
+      .build();
+  cudf::io::write_parquet(out_opts);
+
+  // Read from Parquet
+  std::vector<std::string> const datasources(nsources, filepath_empty);
+
+  auto const in_opts =
+    cudf::io::parquet_reader_options::builder(cudf::io::source_info{datasources}).build();
+  auto const result = cudf::io::read_parquet(in_opts);
+
+  // Initialize expected_counts
+  std::vector<size_t> const expected_counts(nsources, 0);
+
+  EXPECT_EQ(result.metadata.num_rows_per_source.size(), nsources);
+  EXPECT_TRUE(std::equal(expected_counts.cbegin(),
+                         expected_counts.cend(),
+                         result.metadata.num_rows_per_source.cbegin()));
+}
+
 ///////////////////
 // metadata tests
 
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd
index 8d87deb1472..0a6bddcd907 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd
@@ -81,6 +81,7 @@ cdef extern from "cudf/io/types.hpp" \
         map[string, string] user_data
         vector[unordered_map[string, string]] per_file_user_data
         vector[column_name_info] schema_info
+        vector[size_t] num_rows_per_source
 
     cdef cppclass table_with_metadata:
         unique_ptr[table] tbl

From 26a3799d2ff9ffb2aa72d63bb388b4bee70b3440 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 19 Jul 2024 16:49:01 -1000
Subject: [PATCH 556/842] Make ColumnAccessor strictly require a mapping of
 columns (#16285)

`ColumnAccessor` had a default `data=None` argument and initialized an empty dict in the `__init__` if `data` was not passed. This PR now makes `data` a required argument.

Additionally if `verify=True`, the `__init__` would call `as_column` on each `data.values()` allowing non-`ColumnBase` inputs. This PR now avoids this call and makes the caller responsible for ensuring the inputs are `ColumnBase`s

Also, adds a few `verify=False` internally where we know we are passing columns from a libcudf op or reconstructing from another `ColumnAccessor`

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16285
---
 python/cudf/cudf/core/_base_index.py          |   4 +-
 python/cudf/cudf/core/column_accessor.py      |  64 +++---
 python/cudf/cudf/core/dataframe.py            |  24 ++-
 python/cudf/cudf/core/frame.py                |   2 +-
 python/cudf/cudf/core/groupby/groupby.py      |   4 +-
 python/cudf/cudf/core/index.py                |   4 +-
 python/cudf/cudf/core/indexed_frame.py        |   1 +
 python/cudf/cudf/core/reshape.py              |  12 +-
 python/cudf/cudf/core/series.py               |  27 ++-
 .../cudf/cudf/tests/test_column_accessor.py   | 190 ++++++++++++------
 10 files changed, 211 insertions(+), 121 deletions(-)

diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py
index c38352009de..8fad82c5c46 100644
--- a/python/cudf/cudf/core/_base_index.py
+++ b/python/cudf/cudf/core/_base_index.py
@@ -98,7 +98,7 @@ def astype(self, dtype, copy: bool = True):
         """
         raise NotImplementedError
 
-    def argsort(self, *args, **kwargs):
+    def argsort(self, *args, **kwargs) -> cupy.ndarray:
         """Return the integer indices that would sort the index.
 
         Parameters vary by subclass.
@@ -1520,7 +1520,7 @@ def sort_values(
         ascending=True,
         na_position="last",
         key=None,
-    ):
+    ) -> Self | tuple[Self, cupy.ndarray]:
         """
         Return a sorted copy of the index, and optionally return the indices
         that sorted the index itself.
diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py
index f30a557efb0..819d351b2c4 100644
--- a/python/cudf/cudf/core/column_accessor.py
+++ b/python/cudf/cudf/core/column_accessor.py
@@ -16,6 +16,8 @@
 from cudf.core import column
 
 if TYPE_CHECKING:
+    from typing_extensions import Self
+
     from cudf._typing import Dtype
     from cudf.core.column import ColumnBase
 
@@ -86,58 +88,58 @@ class ColumnAccessor(abc.MutableMapping):
         (default=None).
     verify : bool, optional
         For non ColumnAccessor inputs, whether to verify
-        column length and type
+        column length and data.values() are all Columns
     """
 
-    _data: "dict[Any, ColumnBase]"
-    multiindex: bool
+    _data: dict[Any, ColumnBase]
     _level_names: tuple[Any, ...]
 
     def __init__(
         self,
-        data: abc.MutableMapping | ColumnAccessor | None = None,
+        data: abc.MutableMapping[Any, ColumnBase] | Self,
         multiindex: bool = False,
         level_names=None,
         rangeindex: bool = False,
         label_dtype: Dtype | None = None,
         verify: bool = True,
     ):
-        self.rangeindex = rangeindex
-        self.label_dtype = label_dtype
-        if data is None:
-            data = {}
-        # TODO: we should validate the keys of `data`
         if isinstance(data, ColumnAccessor):
-            multiindex = multiindex or data.multiindex
-            level_names = level_names or data.level_names
             self._data = data._data
-            self.multiindex = multiindex
-            self._level_names = level_names
-            self.rangeindex = data.rangeindex
-            self.label_dtype = data.label_dtype
-        else:
+            self._level_names = data.level_names
+            self.multiindex: bool = data.multiindex
+            self.rangeindex: bool = data.rangeindex
+            self.label_dtype: Dtype | None = data.label_dtype
+        elif isinstance(data, abc.MutableMapping):
             # This code path is performance-critical for copies and should be
             # modified with care.
-            data = dict(data)
             if data and verify:
-                result = {}
                 # Faster than next(iter(data.values()))
                 column_length = len(data[next(iter(data))])
-                for k, v in data.items():
-                    # Much faster to avoid the function call if possible; the
-                    # extra isinstance is negligible if we do have to make a
-                    # column from something else.
-                    if not isinstance(v, column.ColumnBase):
-                        v = column.as_column(v)
-                    if len(v) != column_length:
+                # TODO: we should validate the keys of `data`
+                for col in data.values():
+                    if not isinstance(col, column.ColumnBase):
+                        raise ValueError(
+                            f"All data.values() must be Column, not {type(col).__name__}"
+                        )
+                    if len(col) != column_length:
                         raise ValueError("All columns must be of equal length")
-                    result[k] = v
-                self._data = result
-            else:
-                self._data = data
 
+            if not isinstance(data, dict):
+                data = dict(data)
+            self._data = data
+
+            if rangeindex and multiindex:
+                raise ValueError(
+                    f"{rangeindex=} and {multiindex=} cannot both be True."
+                )
+            self.rangeindex = rangeindex
             self.multiindex = multiindex
+            self.label_dtype = label_dtype
             self._level_names = level_names
+        else:
+            raise ValueError(
+                f"data must be a ColumnAccessor or MutableMapping, not {type(data).__name__}"
+            )
 
     def __iter__(self):
         return iter(self._data)
@@ -161,7 +163,9 @@ def __repr__(self) -> str:
         type_info = (
             f"{self.__class__.__name__}("
             f"multiindex={self.multiindex}, "
-            f"level_names={self.level_names})"
+            f"level_names={self.level_names}, "
+            f"rangeindex={self.rangeindex}, "
+            f"label_dtype={self.label_dtype})"
         )
         column_info = "\n".join(
             [f"{name}: {col.dtype}" for name, col in self.items()]
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 7e07078c95b..dbc7f10b569 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -475,6 +475,7 @@ def __getitem__(self, arg):
                     {key: ca._data[key] for key in column_names},
                     multiindex=ca.multiindex,
                     level_names=ca.level_names,
+                    verify=False,
                 ),
                 index=index,
             )
@@ -485,6 +486,7 @@ def __getitem__(self, arg):
                     {key: ca._data[key] for key in column_names},
                     multiindex=ca.multiindex,
                     level_names=ca.level_names,
+                    verify=False,
                 ),
                 index=index,
             )
@@ -771,6 +773,7 @@ def __init__(
                     else None,
                     rangeindex=rangeindex,
                     label_dtype=label_dtype,
+                    verify=False,
                 )
         elif isinstance(data, ColumnAccessor):
             raise TypeError(
@@ -931,7 +934,7 @@ def _init_from_series_list(self, data, columns, index):
                     )
                 if not series.index.equals(final_columns):
                     series = series.reindex(final_columns)
-                self._data[idx] = column.as_column(series._column)
+                self._data[idx] = series._column
 
             # Setting `final_columns` to self._index so
             # that the resulting `transpose` will be have
@@ -2958,7 +2961,7 @@ def set_index(
             # label-like
             if is_scalar(col) or isinstance(col, tuple):
                 if col in self._column_names:
-                    data_to_add.append(self[col])
+                    data_to_add.append(self[col]._column)
                     names.append(col)
                     if drop:
                         to_drop.append(col)
@@ -2973,7 +2976,7 @@ def set_index(
             elif isinstance(
                 col, (cudf.Series, cudf.Index, pd.Series, pd.Index)
             ):
-                data_to_add.append(col)
+                data_to_add.append(as_column(col))
                 names.append(col.name)
             else:
                 try:
@@ -4769,7 +4772,7 @@ def _func(x):  # pragma: no cover
         result = {}
         for name, col in self._data.items():
             apply_sr = Series._from_data({None: col})
-            result[name] = apply_sr.apply(_func)
+            result[name] = apply_sr.apply(_func)._column
 
         return DataFrame._from_data(result, index=self.index)
 
@@ -5806,6 +5809,7 @@ def from_records(
                 ),
                 level_names=level_names,
                 label_dtype=getattr(columns, "dtype", None),
+                verify=False,
             ),
             index=new_index,
         )
@@ -5892,6 +5896,7 @@ def _from_arrays(
                 ),
                 level_names=level_names,
                 label_dtype=getattr(columns, "dtype", None),
+                verify=False,
             ),
             index=index,
         )
@@ -6302,10 +6307,9 @@ def count(self, axis=0, numeric_only=False):
         length = len(self)
         return Series._from_data(
             {
-                None: [
-                    length - self._data[col].null_count
-                    for col in self._data.names
-                ]
+                None: as_column(
+                    [length - col.null_count for col in self._columns]
+                )
             },
             cudf.Index(self._data.names),
         )
@@ -7374,7 +7378,9 @@ def to_struct(self, name=None):
             offset=0,
         )
         return cudf.Series._from_data(
-            cudf.core.column_accessor.ColumnAccessor({name: col}),
+            cudf.core.column_accessor.ColumnAccessor(
+                {name: col}, verify=False
+            ),
             index=self.index,
             name=name,
         )
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index c82e073d7b7..04ecae4ba85 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -1305,7 +1305,7 @@ def argsort(
         order=None,
         ascending=True,
         na_position="last",
-    ):
+    ) -> cupy.ndarray:
         """Return the integer indices that would sort the Series values.
 
         Parameters
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index 3f91be71f29..1646c5042fd 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -1360,7 +1360,9 @@ def _post_process_chunk_results(
         if isinstance(chunk_results, ColumnBase) or cudf.api.types.is_scalar(
             chunk_results[0]
         ):
-            data = {None: chunk_results}
+            data = ColumnAccessor(
+                {None: as_column(chunk_results)}, verify=False
+            )
             ty = cudf.Series if self._as_index else cudf.DataFrame
             result = ty._from_data(data, index=group_names)
             result.index.names = self.grouping.names
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index ae20fcd5d9c..73b7298410a 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -349,7 +349,7 @@ def hasnans(self) -> bool:
     @_performance_tracking
     def _data(self):
         return cudf.core.column_accessor.ColumnAccessor(
-            {self.name: self._values}
+            {self.name: self._values}, verify=False
         )
 
     @_performance_tracking
@@ -1492,7 +1492,7 @@ def argsort(
         order=None,
         ascending=True,
         na_position="last",
-    ):
+    ) -> cupy.ndarray:
         """Return the integer indices that would sort the index.
 
         Parameters
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 60cd142db4b..e75b51e0d43 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -6229,6 +6229,7 @@ def rank(
                     multiindex=self._data.multiindex,
                     level_names=self._data.level_names,
                     label_dtype=self._data.label_dtype,
+                    verify=False,
                 ),
             )
         else:
diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py
index b538ae34b6f..a542c5f5969 100644
--- a/python/cudf/cudf/core/reshape.py
+++ b/python/cudf/cudf/core/reshape.py
@@ -932,14 +932,10 @@ def _pivot(df, index, columns):
     index_labels, index_idx = index._encode()
     column_labels = columns_labels.to_pandas().to_flat_index()
 
-    # the result of pivot always has a multicolumn
-    result = cudf.core.column_accessor.ColumnAccessor(
-        multiindex=True, level_names=(None,) + columns._data.names
-    )
-
     def as_tuple(x):
         return x if isinstance(x, tuple) else (x,)
 
+    result = {}
     for v in df:
         names = [as_tuple(v) + as_tuple(name) for name in column_labels]
         nrows = len(index_labels)
@@ -964,8 +960,12 @@ def as_tuple(x):
                 }
             )
 
+    # the result of pivot always has a multicolumn
+    ca = cudf.core.column_accessor.ColumnAccessor(
+        result, multiindex=True, level_names=(None,) + columns._data.names
+    )
     return cudf.DataFrame._from_data(
-        result, index=cudf.Index(index_labels, name=index.name)
+        ca, index=cudf.Index(index_labels, name=index.name)
     )
 
 
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index d8dbaa897e7..94c33eed37a 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -2263,20 +2263,19 @@ def argsort(
         order=None,
         ascending=True,
         na_position="last",
-    ):
-        obj = self.__class__._from_data(
-            {
-                None: super().argsort(
-                    axis=axis,
-                    kind=kind,
-                    order=order,
-                    ascending=ascending,
-                    na_position=na_position,
-                )
-            }
+    ) -> Self:
+        col = as_column(
+            super().argsort(
+                axis=axis,
+                kind=kind,
+                order=order,
+                ascending=ascending,
+                na_position=na_position,
+            )
+        )
+        return self._from_data_like_self(
+            self._data._from_columns_like_self([col])
         )
-        obj.name = self.name
-        return obj
 
     @_performance_tracking
     def replace(self, to_replace=None, value=no_default, *args, **kwargs):
@@ -2631,7 +2630,7 @@ def mode(self, dropna=True):
             val_counts = val_counts[val_counts == val_counts.iloc[0]]
 
         return Series._from_data(
-            {self.name: val_counts.index.sort_values()}, name=self.name
+            {self.name: val_counts.index.sort_values()._column}, name=self.name
         )
 
     @_performance_tracking
diff --git a/python/cudf/cudf/tests/test_column_accessor.py b/python/cudf/cudf/tests/test_column_accessor.py
index f3343c37d1d..e84e1433c10 100644
--- a/python/cudf/cudf/tests/test_column_accessor.py
+++ b/python/cudf/cudf/tests/test_column_accessor.py
@@ -5,28 +5,35 @@
 import pytest
 
 import cudf
+from cudf.core.column import as_column
 from cudf.core.column_accessor import ColumnAccessor
 from cudf.testing import assert_eq
 
 simple_test_data = [
     {},
-    {"a": []},
-    {"a": [1]},
-    {"a": ["a"]},
-    {"a": [1, 2, 3], "b": ["a", "b", "c"]},
+    {"a": as_column([])},
+    {"a": as_column([1])},
+    {"a": as_column(["a"])},
+    {"a": as_column([1, 2, 3]), "b": as_column(["a", "b", "c"])},
 ]
 
 mi_test_data = [
-    {("a", "b"): [1, 2, 4], ("a", "c"): [2, 3, 4]},
-    {("a", "b"): [1, 2, 3], ("a", ""): [2, 3, 4]},
-    {("a", "b"): [1, 2, 4], ("c", "d"): [2, 3, 4]},
-    {("a", "b"): [1, 2, 3], ("a", "c"): [2, 3, 4], ("b", ""): [4, 5, 6]},
+    {("a", "b"): as_column([1, 2, 4]), ("a", "c"): as_column([2, 3, 4])},
+    {("a", "b"): as_column([1, 2, 3]), ("a", ""): as_column([2, 3, 4])},
+    {("a", "b"): as_column([1, 2, 4]), ("c", "d"): as_column([2, 3, 4])},
+    {
+        ("a", "b"): as_column([1, 2, 3]),
+        ("a", "c"): as_column([2, 3, 4]),
+        ("b", ""): as_column([4, 5, 6]),
+    },
 ]
 
 
 def check_ca_equal(lhs, rhs):
     assert lhs.level_names == rhs.level_names
     assert lhs.multiindex == rhs.multiindex
+    assert lhs.rangeindex == rhs.rangeindex
+    assert lhs.label_dtype == rhs.label_dtype
     for l_key, r_key in zip(lhs, rhs):
         assert l_key == r_key
         assert_eq(lhs[l_key], rhs[r_key])
@@ -58,19 +65,26 @@ def test_to_pandas_simple(simple_data):
     # to ignore this `inferred_type` comparison, we pass exact=False.
     assert_eq(
         ca.to_pandas_index(),
-        pd.DataFrame(simple_data).columns,
+        pd.DataFrame(
+            {key: value.values_host for key, value in simple_data.items()}
+        ).columns,
         exact=False,
     )
 
 
 def test_to_pandas_multiindex(mi_data):
     ca = ColumnAccessor(mi_data, multiindex=True)
-    assert_eq(ca.to_pandas_index(), pd.DataFrame(mi_data).columns)
+    assert_eq(
+        ca.to_pandas_index(),
+        pd.DataFrame(
+            {key: value.values_host for key, value in mi_data.items()}
+        ).columns,
+    )
 
 
 def test_to_pandas_multiindex_names():
     ca = ColumnAccessor(
-        {("a", "b"): [1, 2, 3], ("c", "d"): [3, 4, 5]},
+        {("a", "b"): as_column([1, 2, 3]), ("c", "d"): as_column([3, 4, 5])},
         multiindex=True,
         level_names=("foo", "bar"),
     )
@@ -108,16 +122,20 @@ def test_column_size_mismatch():
     differing sizes throws an error.
     """
     with pytest.raises(ValueError):
-        ColumnAccessor({"a": [1], "b": [1, 2]})
+        ColumnAccessor({"a": as_column([1]), "b": as_column([1, 2])})
 
 
 def test_select_by_label_simple():
     """
     Test getting a column by label
     """
-    ca = ColumnAccessor({"a": [1, 2, 3], "b": [2, 3, 4]})
-    check_ca_equal(ca.select_by_label("a"), ColumnAccessor({"a": [1, 2, 3]}))
-    check_ca_equal(ca.select_by_label("b"), ColumnAccessor({"b": [2, 3, 4]}))
+    ca = ColumnAccessor({"a": as_column([1, 2, 3]), "b": as_column([2, 3, 4])})
+    check_ca_equal(
+        ca.select_by_label("a"), ColumnAccessor({"a": as_column([1, 2, 3])})
+    )
+    check_ca_equal(
+        ca.select_by_label("b"), ColumnAccessor({"b": as_column([2, 3, 4])})
+    )
 
 
 def test_select_by_label_multiindex():
@@ -126,40 +144,62 @@ def test_select_by_label_multiindex():
     """
     ca = ColumnAccessor(
         {
-            ("a", "b", "c"): [1, 2, 3],
-            ("a", "b", "e"): [2, 3, 4],
-            ("b", "x", ""): [4, 5, 6],
-            ("a", "d", "e"): [3, 4, 5],
+            ("a", "b", "c"): as_column([1, 2, 3]),
+            ("a", "b", "e"): as_column([2, 3, 4]),
+            ("b", "x", ""): as_column([4, 5, 6]),
+            ("a", "d", "e"): as_column([3, 4, 5]),
         },
         multiindex=True,
     )
 
     expect = ColumnAccessor(
-        {("b", "c"): [1, 2, 3], ("b", "e"): [2, 3, 4], ("d", "e"): [3, 4, 5]},
+        {
+            ("b", "c"): as_column([1, 2, 3]),
+            ("b", "e"): as_column([2, 3, 4]),
+            ("d", "e"): as_column([3, 4, 5]),
+        },
         multiindex=True,
     )
     got = ca.select_by_label("a")
     check_ca_equal(expect, got)
 
-    expect = ColumnAccessor({"c": [1, 2, 3], "e": [2, 3, 4]}, multiindex=False)
+    expect = ColumnAccessor(
+        {"c": as_column([1, 2, 3]), "e": as_column([2, 3, 4])},
+        multiindex=False,
+    )
     got = ca.select_by_label(("a", "b"))
     check_ca_equal(expect, got)
 
     expect = ColumnAccessor(
-        {("b", "c"): [1, 2, 3], ("b", "e"): [2, 3, 4], ("d", "e"): [3, 4, 5]},
+        {
+            ("b", "c"): as_column([1, 2, 3]),
+            ("b", "e"): as_column([2, 3, 4]),
+            ("d", "e"): as_column([3, 4, 5]),
+        },
         multiindex=True,
     )
     got = ca.select_by_label("a")
     check_ca_equal(expect, got)
 
-    expect = ColumnAccessor({"c": [1, 2, 3], "e": [2, 3, 4]}, multiindex=False)
+    expect = ColumnAccessor(
+        {"c": as_column([1, 2, 3]), "e": as_column([2, 3, 4])},
+        multiindex=False,
+    )
     got = ca.select_by_label(("a", "b"))
     check_ca_equal(expect, got)
 
 
 def test_select_by_label_simple_slice():
-    ca = ColumnAccessor({"a": [1, 2, 3], "b": [2, 3, 4], "c": [3, 4, 5]})
-    expect = ColumnAccessor({"b": [2, 3, 4], "c": [3, 4, 5]})
+    ca = ColumnAccessor(
+        {
+            "a": as_column([1, 2, 3]),
+            "b": as_column([2, 3, 4]),
+            "c": as_column([3, 4, 5]),
+        }
+    )
+    expect = ColumnAccessor(
+        {"b": as_column([2, 3, 4]), "c": as_column([3, 4, 5])}
+    )
     got = ca.select_by_label(slice("b", "c"))
     check_ca_equal(expect, got)
 
@@ -167,10 +207,10 @@ def test_select_by_label_simple_slice():
 def test_select_by_label_multiindex_slice():
     ca = ColumnAccessor(
         {
-            ("a", "b", "c"): [1, 2, 3],
-            ("a", "b", "e"): [2, 3, 4],
-            ("a", "d", "e"): [3, 4, 5],
-            ("b", "x", ""): [4, 5, 6],
+            ("a", "b", "c"): as_column([1, 2, 3]),
+            ("a", "b", "e"): as_column([2, 3, 4]),
+            ("a", "d", "e"): as_column([3, 4, 5]),
+            ("b", "x", ""): as_column([4, 5, 6]),
         },
         multiindex=True,
     )  # pandas needs columns to be sorted to do slicing with multiindex
@@ -180,9 +220,9 @@ def test_select_by_label_multiindex_slice():
 
     expect = ColumnAccessor(
         {
-            ("a", "b", "e"): [2, 3, 4],
-            ("a", "d", "e"): [3, 4, 5],
-            ("b", "x", ""): [4, 5, 6],
+            ("a", "b", "e"): as_column([2, 3, 4]),
+            ("a", "d", "e"): as_column([3, 4, 5]),
+            ("b", "x", ""): as_column([4, 5, 6]),
         },
         multiindex=True,
     )
@@ -191,8 +231,16 @@ def test_select_by_label_multiindex_slice():
 
 
 def test_by_label_list():
-    ca = ColumnAccessor({"a": [1, 2, 3], "b": [2, 3, 4], "c": [3, 4, 5]})
-    expect = ColumnAccessor({"b": [2, 3, 4], "c": [3, 4, 5]})
+    ca = ColumnAccessor(
+        {
+            "a": as_column([1, 2, 3]),
+            "b": as_column([2, 3, 4]),
+            "c": as_column([3, 4, 5]),
+        }
+    )
+    expect = ColumnAccessor(
+        {"b": as_column([2, 3, 4]), "c": as_column([3, 4, 5])}
+    )
     got = ca.select_by_label(["b", "c"])
     check_ca_equal(expect, got)
 
@@ -201,9 +249,13 @@ def test_select_by_index_simple():
     """
     Test getting a column by label
     """
-    ca = ColumnAccessor({"a": [1, 2, 3], "b": [2, 3, 4]})
-    check_ca_equal(ca.select_by_index(0), ColumnAccessor({"a": [1, 2, 3]}))
-    check_ca_equal(ca.select_by_index(1), ColumnAccessor({"b": [2, 3, 4]}))
+    ca = ColumnAccessor({"a": as_column([1, 2, 3]), "b": as_column([2, 3, 4])})
+    check_ca_equal(
+        ca.select_by_index(0), ColumnAccessor({"a": as_column([1, 2, 3])})
+    )
+    check_ca_equal(
+        ca.select_by_index(1), ColumnAccessor({"b": as_column([2, 3, 4])})
+    )
     check_ca_equal(ca.select_by_index([0, 1]), ca)
     check_ca_equal(ca.select_by_index(slice(0, None)), ca)
 
@@ -214,19 +266,19 @@ def test_select_by_index_multiindex():
     """
     ca = ColumnAccessor(
         {
-            ("a", "b", "c"): [1, 2, 3],
-            ("a", "b", "e"): [2, 3, 4],
-            ("b", "x", ""): [4, 5, 6],
-            ("a", "d", "e"): [3, 4, 5],
+            ("a", "b", "c"): as_column([1, 2, 3]),
+            ("a", "b", "e"): as_column([2, 3, 4]),
+            ("b", "x", ""): as_column([4, 5, 6]),
+            ("a", "d", "e"): as_column([3, 4, 5]),
         },
         multiindex=True,
     )
 
     expect = ColumnAccessor(
         {
-            ("a", "b", "c"): [1, 2, 3],
-            ("a", "b", "e"): [2, 3, 4],
-            ("b", "x", ""): [4, 5, 6],
+            ("a", "b", "c"): as_column([1, 2, 3]),
+            ("a", "b", "e"): as_column([2, 3, 4]),
+            ("b", "x", ""): as_column([4, 5, 6]),
         },
         multiindex=True,
     )
@@ -235,9 +287,9 @@ def test_select_by_index_multiindex():
 
     expect = ColumnAccessor(
         {
-            ("a", "b", "c"): [1, 2, 3],
-            ("a", "b", "e"): [2, 3, 4],
-            ("a", "d", "e"): [3, 4, 5],
+            ("a", "b", "c"): as_column([1, 2, 3]),
+            ("a", "b", "e"): as_column([2, 3, 4]),
+            ("a", "d", "e"): as_column([3, 4, 5]),
         },
         multiindex=True,
     )
@@ -248,10 +300,10 @@ def test_select_by_index_multiindex():
 def test_select_by_index_empty():
     ca = ColumnAccessor(
         {
-            ("a", "b", "c"): [1, 2, 3],
-            ("a", "b", "e"): [2, 3, 4],
-            ("b", "x", ""): [4, 5, 6],
-            ("a", "d", "e"): [3, 4, 5],
+            ("a", "b", "c"): as_column([1, 2, 3]),
+            ("a", "b", "e"): as_column([2, 3, 4]),
+            ("b", "x", ""): as_column([4, 5, 6]),
+            ("a", "d", "e"): as_column([3, 4, 5]),
         },
         multiindex=True,
     )
@@ -267,12 +319,20 @@ def test_select_by_index_empty():
 
 def test_replace_level_values_RangeIndex():
     ca = ColumnAccessor(
-        {("a"): [1, 2, 3], ("b"): [2, 3, 4], ("c"): [3, 4, 5]},
+        {
+            ("a"): as_column([1, 2, 3]),
+            ("b"): as_column([2, 3, 4]),
+            ("c"): as_column([3, 4, 5]),
+        },
         multiindex=False,
     )
 
     expect = ColumnAccessor(
-        {("f"): [1, 2, 3], ("b"): [2, 3, 4], ("c"): [3, 4, 5]},
+        {
+            ("f"): as_column([1, 2, 3]),
+            ("b"): as_column([2, 3, 4]),
+            ("c"): as_column([3, 4, 5]),
+        },
         multiindex=False,
     )
 
@@ -282,12 +342,20 @@ def test_replace_level_values_RangeIndex():
 
 def test_replace_level_values_MultiColumn():
     ca = ColumnAccessor(
-        {("a", 1): [1, 2, 3], ("a", 2): [2, 3, 4], ("b", 1): [3, 4, 5]},
+        {
+            ("a", 1): as_column([1, 2, 3]),
+            ("a", 2): as_column([2, 3, 4]),
+            ("b", 1): as_column([3, 4, 5]),
+        },
         multiindex=True,
     )
 
     expect = ColumnAccessor(
-        {("f", 1): [1, 2, 3], ("f", 2): [2, 3, 4], ("b", 1): [3, 4, 5]},
+        {
+            ("f", 1): as_column([1, 2, 3]),
+            ("f", 2): as_column([2, 3, 4]),
+            ("b", 1): as_column([3, 4, 5]),
+        },
         multiindex=True,
     )
 
@@ -303,7 +371,17 @@ def test_clear_nrows_empty_before():
 
 
 def test_clear_nrows_empty_after():
-    ca = ColumnAccessor({"new": [1]})
+    ca = ColumnAccessor({"new": as_column([1])})
     assert ca.nrows == 1
     del ca["new"]
     assert ca.nrows == 0
+
+
+def test_not_rangeindex_and_multiindex():
+    with pytest.raises(ValueError):
+        ColumnAccessor({}, multiindex=True, rangeindex=True)
+
+
+def test_data_values_not_column_raises():
+    with pytest.raises(ValueError):
+        ColumnAccessor({"a": [1]})

From c5b96003cef00b2635923d03edcd48a13821a61e Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Fri, 19 Jul 2024 20:04:19 -0700
Subject: [PATCH 557/842] Migrate Parquet reader to pylibcudf (#16078)

xref #15162

Migrates the parquet reader (and chunked parquet reader) to pylibcudf.

(Does not migrate the writers or the metadata reader yet).

Authors:
  - Thomas Li (https://github.com/lithomas1)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/16078
---
 .../api_docs/pylibcudf/io/index.rst           |   1 +
 .../api_docs/pylibcudf/io/parquet.rst         |   6 +
 python/cudf/cudf/_lib/parquet.pyx             | 312 ++++++------------
 .../cudf/cudf/_lib/pylibcudf/expressions.pyx  |  11 +
 .../cudf/_lib/pylibcudf/io/CMakeLists.txt     |   4 +-
 .../cudf/cudf/_lib/pylibcudf/io/__init__.pxd  |   2 +-
 .../cudf/cudf/_lib/pylibcudf/io/__init__.py   |   2 +-
 .../cudf/cudf/_lib/pylibcudf/io/parquet.pxd   |  35 ++
 .../cudf/cudf/_lib/pylibcudf/io/parquet.pyx   | 204 ++++++++++++
 python/cudf/cudf/_lib/pylibcudf/io/types.pyx  |   8 +
 .../_lib/pylibcudf/libcudf/io/parquet.pxd     |   8 +-
 python/cudf/cudf/io/parquet.py                |   4 +-
 .../cudf/cudf/pylibcudf_tests/common/utils.py |  80 ++++-
 python/cudf/cudf/pylibcudf_tests/conftest.py  |  15 +
 .../cudf/pylibcudf_tests/io/test_parquet.py   | 109 ++++++
 python/cudf/cudf/tests/test_parquet.py        |   5 +-
 16 files changed, 581 insertions(+), 225 deletions(-)
 create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/io/parquet.rst
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/io/parquet.pxd
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/io/parquet.pyx
 create mode 100644 python/cudf/cudf/pylibcudf_tests/io/test_parquet.py

diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst
index 697bce739de..e2d342ffe47 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst
@@ -18,3 +18,4 @@ I/O Functions
     avro
     csv
     json
+    parquet
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/parquet.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/parquet.rst
new file mode 100644
index 00000000000..9dfbadfa216
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/parquet.rst
@@ -0,0 +1,6 @@
+=======
+Parquet
+=======
+
+.. automodule:: cudf._lib.pylibcudf.io.parquet
+   :members:
diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
index e7959d21e01..a2eed94bb3c 100644
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ b/python/cudf/cudf/_lib/parquet.pyx
@@ -18,16 +18,14 @@ from cython.operator cimport dereference
 
 from cudf.api.types import is_list_like
 
-from cudf._lib.utils cimport data_from_unique_ptr
+from cudf._lib.utils cimport _data_from_columns, data_from_pylibcudf_io
 
-from cudf._lib import pylibcudf
 from cudf._lib.utils import _index_level_name, generate_pandas_metadata
 
 from libc.stdint cimport uint8_t
 from libcpp cimport bool
 from libcpp.map cimport map
 from libcpp.memory cimport make_unique, unique_ptr
-from libcpp.pair cimport pair
 from libcpp.string cimport string
 from libcpp.unordered_map cimport unordered_map
 from libcpp.utility cimport move
@@ -35,25 +33,20 @@ from libcpp.vector cimport vector
 
 cimport cudf._lib.pylibcudf.libcudf.io.data_sink as cudf_io_data_sink
 cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types
-cimport cudf._lib.pylibcudf.libcudf.types as cudf_types
 from cudf._lib.column cimport Column
 from cudf._lib.io.utils cimport (
+    add_df_col_struct_names,
     make_sinks_info,
     make_source_info,
-    update_struct_field_names,
 )
 from cudf._lib.pylibcudf.expressions cimport Expression
 from cudf._lib.pylibcudf.io.datasource cimport NativeFileDatasource
-from cudf._lib.pylibcudf.libcudf.expressions cimport expression
+from cudf._lib.pylibcudf.io.parquet cimport ChunkedParquetReader
 from cudf._lib.pylibcudf.libcudf.io.parquet cimport (
-    chunked_parquet_reader as cpp_chunked_parquet_reader,
     chunked_parquet_writer_options,
     merge_row_group_metadata as parquet_merge_metadata,
     parquet_chunked_writer as cpp_parquet_chunked_writer,
-    parquet_reader_options,
-    parquet_reader_options_builder,
     parquet_writer_options,
-    read_parquet as parquet_reader,
     write_parquet as parquet_writer,
 )
 from cudf._lib.pylibcudf.libcudf.io.parquet_metadata cimport (
@@ -63,19 +56,17 @@ from cudf._lib.pylibcudf.libcudf.io.parquet_metadata cimport (
 from cudf._lib.pylibcudf.libcudf.io.types cimport (
     column_in_metadata,
     table_input_metadata,
-    table_metadata,
 )
 from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
-from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
 from cudf._lib.utils cimport table_view_from_table
 
 from pyarrow.lib import NativeFile
 
-from cudf._lib.concat import concat_columns
+import cudf._lib.pylibcudf as plc
+from cudf._lib.pylibcudf cimport Table
 from cudf.utils.ioutils import _ROW_GROUP_SIZE_BYTES_DEFAULT
 
-from cudf._lib.utils cimport data_from_pylibcudf_table
-
 
 cdef class BufferArrayFromVector:
     cdef Py_ssize_t length
@@ -133,71 +124,37 @@ def _parse_metadata(meta):
     return file_is_range_index, file_index_cols, file_column_dtype
 
 
-cdef pair[parquet_reader_options, bool] _setup_parquet_reader_options(
-     cudf_io_types.source_info source,
-     vector[vector[size_type]] row_groups,
-     bool use_pandas_metadata,
-     Expression filters,
-     object columns):
-
-    cdef parquet_reader_options args
-    cdef parquet_reader_options_builder builder
-    cdef data_type cpp_timestamp_type = cudf_types.data_type(
-        cudf_types.type_id.EMPTY
-    )
-    builder = (
-        parquet_reader_options.builder(source)
-        .row_groups(row_groups)
-        .use_pandas_metadata(use_pandas_metadata)
-        .use_arrow_schema(True)
-        .timestamp_type(cpp_timestamp_type)
-    )
-    if filters is not None:
-        builder = builder.filter(<expression &>dereference(filters.c_obj.get()))
-
-    args = move(builder.build())
-    cdef vector[string] cpp_columns
-    allow_range_index = True
-    if columns is not None:
-        cpp_columns.reserve(len(columns))
-        allow_range_index = len(columns) > 0
-        for col in columns:
-            cpp_columns.push_back(str(col).encode())
-        args.set_columns(cpp_columns)
-    allow_range_index &= filters is None
-
-    return pair[parquet_reader_options, bool](args, allow_range_index)
-
 cdef object _process_metadata(object df,
-                              table_metadata table_meta,
                               list names,
+                              dict child_names,
+                              list per_file_user_data,
                               object row_groups,
                               object filepaths_or_buffers,
                               list pa_buffers,
                               bool allow_range_index,
                               bool use_pandas_metadata):
-    update_struct_field_names(df, table_meta.schema_info)
+
+    add_df_col_struct_names(df, child_names)
     index_col = None
     is_range_index = True
     column_index_type = None
     index_col_names = None
     meta = None
-    cdef vector[unordered_map[string, string]] per_file_user_data = \
-        table_meta.per_file_user_data
     for single_file in per_file_user_data:
+        if b'pandas' not in single_file:
+            continue
         json_str = single_file[b'pandas'].decode('utf-8')
-        if json_str != "":
-            meta = json.loads(json_str)
-            file_is_range_index, index_col, column_index_type = _parse_metadata(meta)
-            is_range_index &= file_is_range_index
-
-            if not file_is_range_index and index_col is not None \
-                    and index_col_names is None:
-                index_col_names = {}
-                for idx_col in index_col:
-                    for c in meta['columns']:
-                        if c['field_name'] == idx_col:
-                            index_col_names[idx_col] = c['name']
+        meta = json.loads(json_str)
+        file_is_range_index, index_col, column_index_type = _parse_metadata(meta)
+        is_range_index &= file_is_range_index
+
+        if not file_is_range_index and index_col is not None \
+                and index_col_names is None:
+            index_col_names = {}
+            for idx_col in index_col:
+                for c in meta['columns']:
+                    if c['field_name'] == idx_col:
+                        index_col_names[idx_col] = c['name']
 
     if meta is not None:
         # Book keep each column metadata as the order
@@ -297,6 +254,76 @@ cdef object _process_metadata(object df,
     return df
 
 
+def read_parquet_chunked(
+    filepaths_or_buffers,
+    columns=None,
+    row_groups=None,
+    use_pandas_metadata=True,
+    size_t chunk_read_limit=0,
+    size_t pass_read_limit=1024000000
+):
+    # Convert NativeFile buffers to NativeFileDatasource,
+    # but save original buffers in case we need to use
+    # pyarrow for metadata processing
+    # (See: https://github.com/rapidsai/cudf/issues/9599)
+
+    pa_buffers = []
+
+    new_bufs = []
+    for i, datasource in enumerate(filepaths_or_buffers):
+        if isinstance(datasource, NativeFile):
+            new_bufs.append(NativeFileDatasource(datasource))
+        else:
+            new_bufs.append(datasource)
+
+    # Note: If this function ever takes accepts filters
+    # allow_range_index needs to be False when a filter is passed
+    # (see read_parquet)
+    allow_range_index = columns is not None and len(columns) != 0
+
+    reader = ChunkedParquetReader(
+        plc.io.SourceInfo(new_bufs),
+        columns,
+        row_groups,
+        use_pandas_metadata,
+        chunk_read_limit=chunk_read_limit,
+        pass_read_limit=pass_read_limit
+    )
+
+    tbl_w_meta = reader.read_chunk()
+    column_names = tbl_w_meta.column_names(include_children=False)
+    child_names = tbl_w_meta.child_names
+    per_file_user_data = tbl_w_meta.per_file_user_data
+    concatenated_columns = tbl_w_meta.tbl.columns()
+
+    # save memory
+    del tbl_w_meta
+
+    cdef Table tbl
+    while reader.has_next():
+        tbl = reader.read_chunk().tbl
+
+        for i in range(tbl.num_columns()):
+            concatenated_columns[i] = plc.concatenate.concatenate(
+                [concatenated_columns[i], tbl._columns[i]]
+            )
+            # Drop residual columns to save memory
+            tbl._columns[i] = None
+
+    df = cudf.DataFrame._from_data(
+        *_data_from_columns(
+            columns=[Column.from_pylibcudf(plc) for plc in concatenated_columns],
+            column_names=column_names,
+            index_names=None
+        )
+    )
+    df = _process_metadata(df, column_names, child_names,
+                           per_file_user_data, row_groups,
+                           filepaths_or_buffers, pa_buffers,
+                           allow_range_index, use_pandas_metadata)
+    return df
+
+
 cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
                    use_pandas_metadata=True,
                    Expression filters=None):
@@ -322,33 +349,28 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
             pa_buffers.append(datasource)
             filepaths_or_buffers[i] = NativeFileDatasource(datasource)
 
-    cdef cudf_io_types.source_info source = make_source_info(
-        filepaths_or_buffers)
-
-    cdef vector[vector[size_type]] cpp_row_groups
-    if row_groups is not None:
-        cpp_row_groups = row_groups
-
-    # Setup parquet reader arguments
-    cdef parquet_reader_options args
-    cdef pair[parquet_reader_options, bool] c_res = _setup_parquet_reader_options(
-            source, cpp_row_groups, use_pandas_metadata, filters, columns)
-    args, allow_range_index = c_res.first, c_res.second
+    allow_range_index = True
+    if columns is not None and len(columns) == 0 or filters:
+        allow_range_index = False
 
     # Read Parquet
-    cdef cudf_io_types.table_with_metadata c_result
 
-    with nogil:
-        c_result = move(parquet_reader(args))
+    tbl_w_meta = plc.io.parquet.read_parquet(
+        plc.io.SourceInfo(filepaths_or_buffers),
+        columns,
+        row_groups,
+        filters,
+        convert_strings_to_categories = False,
+        use_pandas_metadata = use_pandas_metadata,
+    )
 
-    names = [info.name.decode() for info in c_result.metadata.schema_info]
+    df = cudf.DataFrame._from_data(
+        *data_from_pylibcudf_io(tbl_w_meta)
+    )
 
-    df = cudf.DataFrame._from_data(*data_from_unique_ptr(
-        move(c_result.tbl),
-        column_names=names
-    ))
-    df = _process_metadata(df, c_result.metadata, names, row_groups,
-                           filepaths_or_buffers, pa_buffers,
+    df = _process_metadata(df, tbl_w_meta.column_names(include_children=False),
+                           tbl_w_meta.child_names, tbl_w_meta.per_file_user_data,
+                           row_groups, filepaths_or_buffers, pa_buffers,
                            allow_range_index, use_pandas_metadata)
     return df
 
@@ -804,120 +826,6 @@ cdef class ParquetWriter:
         self.initialized = True
 
 
-cdef class ParquetReader:
-    cdef bool initialized
-    cdef unique_ptr[cpp_chunked_parquet_reader] reader
-    cdef size_t chunk_read_limit
-    cdef size_t pass_read_limit
-    cdef size_t row_group_size_bytes
-    cdef table_metadata result_meta
-    cdef vector[unordered_map[string, string]] per_file_user_data
-    cdef object pandas_meta
-    cdef list pa_buffers
-    cdef bool allow_range_index
-    cdef object row_groups
-    cdef object filepaths_or_buffers
-    cdef object names
-    cdef object column_index_type
-    cdef object index_col_names
-    cdef bool is_range_index
-    cdef object index_col
-    cdef bool cpp_use_pandas_metadata
-
-    def __cinit__(self, filepaths_or_buffers, columns=None, row_groups=None,
-                  use_pandas_metadata=True,
-                  size_t chunk_read_limit=0,
-                  size_t pass_read_limit=1024000000):
-
-        # Convert NativeFile buffers to NativeFileDatasource,
-        # but save original buffers in case we need to use
-        # pyarrow for metadata processing
-        # (See: https://github.com/rapidsai/cudf/issues/9599)
-
-        pa_buffers = []
-        for i, datasource in enumerate(filepaths_or_buffers):
-            if isinstance(datasource, NativeFile):
-                pa_buffers.append(datasource)
-                filepaths_or_buffers[i] = NativeFileDatasource(datasource)
-        self.pa_buffers = pa_buffers
-        cdef cudf_io_types.source_info source = make_source_info(
-            filepaths_or_buffers)
-
-        self.cpp_use_pandas_metadata = use_pandas_metadata
-
-        cdef vector[vector[size_type]] cpp_row_groups
-        if row_groups is not None:
-            cpp_row_groups = row_groups
-        cdef parquet_reader_options args
-        cdef pair[parquet_reader_options, bool] c_res = _setup_parquet_reader_options(
-            source, cpp_row_groups, use_pandas_metadata, None, columns)
-        args, self.allow_range_index = c_res.first, c_res.second
-
-        with nogil:
-            self.reader.reset(
-                new cpp_chunked_parquet_reader(
-                    chunk_read_limit,
-                    pass_read_limit,
-                    args
-                )
-            )
-        self.initialized = False
-        self.row_groups = row_groups
-        self.filepaths_or_buffers = filepaths_or_buffers
-
-    def _has_next(self):
-        cdef bool res
-        with nogil:
-            res = self.reader.get()[0].has_next()
-        return res
-
-    def _read_chunk(self):
-        # Read Parquet
-        cdef cudf_io_types.table_with_metadata c_result
-
-        with nogil:
-            c_result = move(self.reader.get()[0].read_chunk())
-
-        if not self.initialized:
-            self.names = [info.name.decode() for info in c_result.metadata.schema_info]
-            self.result_meta = c_result.metadata
-
-        df = cudf.DataFrame._from_data(*data_from_unique_ptr(
-            move(c_result.tbl),
-            column_names=self.names,
-        ))
-
-        self.initialized = True
-        return df
-
-    def read(self):
-        dfs = self._read_chunk()
-        column_names = dfs._column_names
-        concatenated_columns = list(dfs._columns)
-        del dfs
-        while self._has_next():
-            new_chunk = list(self._read_chunk()._columns)
-            for i in range(len(column_names)):
-                concatenated_columns[i] = concat_columns(
-                    [concatenated_columns[i], new_chunk[i]]
-                )
-                # Must drop any residual GPU columns to save memory
-                new_chunk[i] = None
-
-        dfs = cudf.DataFrame._from_data(
-            *data_from_pylibcudf_table(
-                pylibcudf.Table(
-                    [col.to_pylibcudf(mode="read") for col in concatenated_columns]
-                ),
-                column_names=column_names,
-                index_names=None
-                )
-            )
-
-        return _process_metadata(dfs, self.result_meta, self.names, self.row_groups,
-                                 self.filepaths_or_buffers, self.pa_buffers,
-                                 self.allow_range_index, self.cpp_use_pandas_metadata)
-
 cpdef merge_filemetadata(object filemetadata_list):
     """
     Cython function to call into libcudf API, see `merge_row_group_metadata`.
diff --git a/python/cudf/cudf/_lib/pylibcudf/expressions.pyx b/python/cudf/cudf/_lib/pylibcudf/expressions.pyx
index 38de11406ad..b983a617533 100644
--- a/python/cudf/cudf/_lib/pylibcudf/expressions.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/expressions.pyx
@@ -38,6 +38,17 @@ from .types cimport DataType
 # Aliases for simplicity
 ctypedef unique_ptr[libcudf_exp.expression] expression_ptr
 
+# Define this class just to have a docstring for it
+cdef class Expression:
+    """
+    The base class for all expression types.
+    This class cannot be instantiated directly, please
+    instantiate one of its child classes instead.
+
+    For details, see :cpp:class:`cudf::ast::expression`.
+    """
+    pass
+
 cdef class Literal(Expression):
     """
     A literal value used in an abstract syntax tree.
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt
index 8dd08d11dc8..55bea4fc262 100644
--- a/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt
@@ -12,7 +12,7 @@
 # the License.
 # =============================================================================
 
-set(cython_sources avro.pyx csv.pyx datasource.pyx json.pyx types.pyx)
+set(cython_sources avro.pyx csv.pyx datasource.pyx json.pyx parquet.pyx types.pyx)
 
 set(linked_libraries cudf::cudf)
 rapids_cython_create_modules(
@@ -22,6 +22,6 @@ rapids_cython_create_modules(
 )
 
 set(targets_using_arrow_headers pylibcudf_io_avro pylibcudf_io_csv pylibcudf_io_datasource
-                                pylibcudf_io_json pylibcudf_io_types
+                                pylibcudf_io_json pylibcudf_io_parquet pylibcudf_io_types
 )
 link_to_pyarrow_headers("${targets_using_arrow_headers}")
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd
index 5b3272d60e0..62820048584 100644
--- a/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd
@@ -1,5 +1,5 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 # CSV is removed since it is def not cpdef (to force kw-only arguments)
-from . cimport avro, datasource, json, types
+from . cimport avro, datasource, json, parquet, types
 from .types cimport SourceInfo, TableWithMetadata
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/__init__.py b/python/cudf/cudf/_lib/pylibcudf/io/__init__.py
index e17deaa4663..27640f7d955 100644
--- a/python/cudf/cudf/_lib/pylibcudf/io/__init__.py
+++ b/python/cudf/cudf/_lib/pylibcudf/io/__init__.py
@@ -1,4 +1,4 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from . import avro, csv, datasource, json, types
+from . import avro, csv, datasource, json, parquet, types
 from .types import SinkInfo, SourceInfo, TableWithMetadata
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/parquet.pxd b/python/cudf/cudf/_lib/pylibcudf/io/parquet.pxd
new file mode 100644
index 00000000000..027f215fb91
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/io/parquet.pxd
@@ -0,0 +1,35 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libc.stdint cimport int64_t
+from libcpp cimport bool
+from libcpp.memory cimport unique_ptr
+
+from cudf._lib.pylibcudf.expressions cimport Expression
+from cudf._lib.pylibcudf.io.types cimport SourceInfo, TableWithMetadata
+from cudf._lib.pylibcudf.libcudf.io.parquet cimport (
+    chunked_parquet_reader as cpp_chunked_parquet_reader,
+)
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from cudf._lib.pylibcudf.types cimport DataType
+
+
+cdef class ChunkedParquetReader:
+    cdef unique_ptr[cpp_chunked_parquet_reader] reader
+
+    cpdef bool has_next(self)
+    cpdef TableWithMetadata read_chunk(self)
+
+
+cpdef read_parquet(
+    SourceInfo source_info,
+    list columns = *,
+    list row_groups = *,
+    Expression filters = *,
+    bool convert_strings_to_categories = *,
+    bool use_pandas_metadata = *,
+    int64_t skip_rows = *,
+    size_type num_rows = *,
+    # disabled see comment in parquet.pyx for more
+    # ReaderColumnSchema reader_column_schema = *,
+    # DataType timestamp_type = *
+)
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/parquet.pyx b/python/cudf/cudf/_lib/pylibcudf/io/parquet.pyx
new file mode 100644
index 00000000000..96119e1b714
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/io/parquet.pyx
@@ -0,0 +1,204 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from cython.operator cimport dereference
+from libc.stdint cimport int64_t
+from libcpp cimport bool
+from libcpp.string cimport string
+from libcpp.utility cimport move
+from libcpp.vector cimport vector
+
+from cudf._lib.pylibcudf.expressions cimport Expression
+from cudf._lib.pylibcudf.io.types cimport SourceInfo, TableWithMetadata
+from cudf._lib.pylibcudf.libcudf.expressions cimport expression
+from cudf._lib.pylibcudf.libcudf.io.parquet cimport (
+    chunked_parquet_reader as cpp_chunked_parquet_reader,
+    parquet_reader_options,
+    read_parquet as cpp_read_parquet,
+)
+from cudf._lib.pylibcudf.libcudf.io.types cimport table_with_metadata
+from cudf._lib.pylibcudf.libcudf.types cimport size_type
+
+
+cdef parquet_reader_options _setup_parquet_reader_options(
+    SourceInfo source_info,
+    list columns = None,
+    list row_groups = None,
+    Expression filters = None,
+    bool convert_strings_to_categories = False,
+    bool use_pandas_metadata = True,
+    int64_t skip_rows = 0,
+    size_type num_rows = -1,
+    # ReaderColumnSchema reader_column_schema = None,
+    # DataType timestamp_type = DataType(type_id.EMPTY)
+):
+    cdef vector[string] col_vec
+    cdef parquet_reader_options opts = (
+        parquet_reader_options.builder(source_info.c_obj)
+        .convert_strings_to_categories(convert_strings_to_categories)
+        .use_pandas_metadata(use_pandas_metadata)
+        .use_arrow_schema(True)
+        .build()
+    )
+    if row_groups is not None:
+        opts.set_row_groups(row_groups)
+    if num_rows != -1:
+        opts.set_num_rows(num_rows)
+    if skip_rows != 0:
+        opts.set_skip_rows(skip_rows)
+    if columns is not None:
+        col_vec.reserve(len(columns))
+        for col in columns:
+            col_vec.push_back(<string>str(col).encode())
+        opts.set_columns(col_vec)
+    if filters is not None:
+        opts.set_filter(<expression &>dereference(filters.c_obj.get()))
+    return opts
+
+
+cdef class ChunkedParquetReader:
+    """
+    Reads chunks of a Parquet file into a :py:class:`~.types.TableWithMetadata`.
+
+    Parameters
+    ----------
+    source_info : SourceInfo
+        The SourceInfo object to read the Parquet file from.
+    columns : list, default None
+        The names of the columns to be read
+    row_groups : list[list[size_type]], default None
+        List of row groups to be read.
+    use_pandas_metadata : bool, default True
+        If True, return metadata about the index column in
+        the per-file user metadata of the ``TableWithMetadata``
+    convert_strings_to_categories : bool, default False
+        Whether to convert string columns to the category type
+    skip_rows : int64_t, default 0
+        The number of rows to skip from the start of the file.
+    num_rows : size_type, default -1
+        The number of rows to read. By default, read the entire file.
+    chunk_read_limit : size_t, default 0
+        Limit on total number of bytes to be returned per read,
+        or 0 if there is no limit.
+    pass_read_limit : size_t, default 1024000000
+        Limit on the amount of memory used for reading and decompressing data
+        or 0 if there is no limit.
+    """
+    def __init__(
+        self,
+        SourceInfo source_info,
+        list columns=None,
+        list row_groups=None,
+        bool use_pandas_metadata=True,
+        bool convert_strings_to_categories=False,
+        int64_t skip_rows = 0,
+        size_type num_rows = -1,
+        size_t chunk_read_limit=0,
+        size_t pass_read_limit=1024000000
+    ):
+
+        cdef parquet_reader_options opts = _setup_parquet_reader_options(
+            source_info,
+            columns,
+            row_groups,
+            filters=None,
+            convert_strings_to_categories=convert_strings_to_categories,
+            use_pandas_metadata=use_pandas_metadata,
+            skip_rows=skip_rows,
+            num_rows=num_rows,
+        )
+
+        with nogil:
+            self.reader.reset(
+                new cpp_chunked_parquet_reader(
+                    chunk_read_limit,
+                    pass_read_limit,
+                    opts
+                )
+            )
+
+    cpdef bool has_next(self):
+        """
+        Returns True if there is another chunk in the Parquet file
+        to be read.
+
+        Returns
+        -------
+        True if we have not finished reading the file.
+        """
+        with nogil:
+            return self.reader.get()[0].has_next()
+
+    cpdef TableWithMetadata read_chunk(self):
+        """
+        Read the next chunk into a :py:class:`~.types.TableWithMetadata`
+
+        Returns
+        -------
+        TableWithMetadata
+            The Table and its corresponding metadata (column names) that were read in.
+        """
+        # Read Parquet
+        cdef table_with_metadata c_result
+
+        with nogil:
+            c_result = move(self.reader.get()[0].read_chunk())
+
+        return TableWithMetadata.from_libcudf(c_result)
+
+cpdef read_parquet(
+    SourceInfo source_info,
+    list columns = None,
+    list row_groups = None,
+    Expression filters = None,
+    bool convert_strings_to_categories = False,
+    bool use_pandas_metadata = True,
+    int64_t skip_rows = 0,
+    size_type num_rows = -1,
+    # Disabled, these aren't used by cudf-python
+    # we should only add them back in if there's user demand
+    # ReaderColumnSchema reader_column_schema = None,
+    # DataType timestamp_type = DataType(type_id.EMPTY)
+):
+    """Reads an Parquet file into a :py:class:`~.types.TableWithMetadata`.
+
+    Parameters
+    ----------
+    source_info : SourceInfo
+        The SourceInfo object to read the Parquet file from.
+    columns : list, default None
+        The string names of the columns to be read.
+    row_groups : list[list[size_type]], default None
+        List of row groups to be read.
+    filters : Expression, default None
+        An AST :py:class:`cudf._lib.pylibcudf.expressions.Expression`
+        to use for predicate pushdown.
+    convert_strings_to_categories : bool, default False
+        Whether to convert string columns to the category type
+    use_pandas_metadata : bool, default True
+        If True, return metadata about the index column in
+        the per-file user metadata of the ``TableWithMetadata``
+    skip_rows : int64_t, default 0
+        The number of rows to skip from the start of the file.
+    num_rows : size_type, default -1
+        The number of rows to read. By default, read the entire file.
+
+    Returns
+    -------
+    TableWithMetadata
+        The Table and its corresponding metadata (column names) that were read in.
+    """
+    cdef table_with_metadata c_result
+    cdef parquet_reader_options opts = _setup_parquet_reader_options(
+        source_info,
+        columns,
+        row_groups,
+        filters,
+        convert_strings_to_categories,
+        use_pandas_metadata,
+        skip_rows,
+        num_rows,
+    )
+
+    with nogil:
+        c_result = move(cpp_read_parquet(opts))
+
+    return TableWithMetadata.from_libcudf(c_result)
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx
index 68498ff88f4..95fa7d4c2ee 100644
--- a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx
@@ -122,6 +122,14 @@ cdef class TableWithMetadata:
         out.metadata = tbl_with_meta.metadata
         return out
 
+    @property
+    def per_file_user_data(self):
+        """
+        Returns a list containing a dict
+        containing file-format specific metadata,
+        for each file being read in.
+        """
+        return self.metadata.per_file_user_data
 
 cdef class SourceInfo:
     """A class containing details on a source to read from.
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd
index c38f39f7749..d86915c7da9 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd
@@ -1,6 +1,6 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
-from libc.stdint cimport uint8_t
+from libc.stdint cimport int64_t, uint8_t
 from libcpp cimport bool
 from libcpp.functional cimport reference_wrapper
 from libcpp.map cimport map
@@ -27,8 +27,11 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
 
         # setter
 
+        void set_filter(expression &filter) except +
         void set_columns(vector[string] col_names) except +
+        void set_num_rows(size_type val) except +
         void set_row_groups(vector[vector[size_type]] row_grp) except +
+        void set_skip_rows(int64_t val) except +
         void enable_use_arrow_schema(bool val) except +
         void enable_use_pandas_metadata(bool val) except +
         void set_timestamp_type(data_type type) except +
@@ -49,6 +52,9 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
         parquet_reader_options_builder& row_groups(
             vector[vector[size_type]] row_grp
         ) except +
+        parquet_reader_options_builder& convert_strings_to_categories(
+            bool val
+        ) except +
         parquet_reader_options_builder& use_pandas_metadata(
             bool val
         ) except +
diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
index 0f0a240b5d0..7dab2f20100 100644
--- a/python/cudf/cudf/io/parquet.py
+++ b/python/cudf/cudf/io/parquet.py
@@ -929,12 +929,12 @@ def _read_parquet(
                 f"following positional arguments: {list(args)}"
             )
         if cudf.get_option("io.parquet.low_memory"):
-            return libparquet.ParquetReader(
+            return libparquet.read_parquet_chunked(
                 filepaths_or_buffers,
                 columns=columns,
                 row_groups=row_groups,
                 use_pandas_metadata=use_pandas_metadata,
-            ).read()
+            )
         else:
             return libparquet.read_parquet(
                 filepaths_or_buffers,
diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py
index ed2c5ca06c9..e19ff58927f 100644
--- a/python/cudf/cudf/pylibcudf_tests/common/utils.py
+++ b/python/cudf/cudf/pylibcudf_tests/common/utils.py
@@ -7,6 +7,7 @@
 import numpy as np
 import pyarrow as pa
 import pytest
+from pyarrow.parquet import write_table as pq_write_table
 
 from cudf._lib import pylibcudf as plc
 from cudf._lib.pylibcudf.io.types import CompressionType
@@ -103,25 +104,68 @@ def _make_fields_nullable(typ):
             return pa.list_(new_fields[0])
         return typ
 
+    def _contains_type(parent_typ, typ_checker):
+        """
+        Check whether the parent or one of the children
+        satisfies the typ_checker.
+        """
+        if typ_checker(parent_typ):
+            return True
+        if pa.types.is_nested(parent_typ):
+            for i in range(parent_typ.num_fields):
+                if _contains_type(parent_typ.field(i).type, typ_checker):
+                    return True
+        return False
+
     if not check_field_nullability:
         rhs_type = _make_fields_nullable(rhs.type)
         rhs = rhs.cast(rhs_type)
 
         lhs_type = _make_fields_nullable(lhs.type)
-        lhs = rhs.cast(lhs_type)
-
-    if pa.types.is_floating(lhs.type) and pa.types.is_floating(rhs.type):
-        lhs_nans = pa.compute.is_nan(lhs)
-        rhs_nans = pa.compute.is_nan(rhs)
-        assert lhs_nans.equals(rhs_nans)
-
-        if pa.compute.any(lhs_nans) or pa.compute.any(rhs_nans):
-            # masks must be equal at this point
-            mask = pa.compute.fill_null(pa.compute.invert(lhs_nans), True)
-            lhs = lhs.filter(mask)
-            rhs = rhs.filter(mask)
+        lhs = lhs.cast(lhs_type)
 
-        np.testing.assert_array_almost_equal(lhs, rhs)
+    assert lhs.type == rhs.type, f"{lhs.type} != {rhs.type}"
+    if _contains_type(lhs.type, pa.types.is_floating) and _contains_type(
+        rhs.type, pa.types.is_floating
+    ):
+        # Flatten nested arrays to liststo do comparisons if nested
+        # This is so we can do approximate comparisons
+        # for floats in numpy
+        def _flatten_arrays(arr):
+            if pa.types.is_nested(arr.type):
+                flattened = arr.flatten()
+                flat_arrs = []
+                if isinstance(flattened, list):
+                    for flat_arr in flattened:
+                        flat_arrs += _flatten_arrays(flat_arr)
+                else:
+                    flat_arrs = [flattened]
+            else:
+                flat_arrs = [arr]
+            return flat_arrs
+
+        if isinstance(lhs, (pa.ListArray, pa.StructArray)):
+            lhs = _flatten_arrays(lhs)
+            rhs = _flatten_arrays(rhs)
+        else:
+            # Just a regular doublearray
+            lhs = [lhs]
+            rhs = [rhs]
+
+        for lh_arr, rh_arr in zip(lhs, rhs):
+            # Check NaNs positions match
+            # and then filter out nans
+            lhs_nans = pa.compute.is_nan(lh_arr)
+            rhs_nans = pa.compute.is_nan(rh_arr)
+            assert lhs_nans.equals(rhs_nans)
+
+            if pa.compute.any(lhs_nans) or pa.compute.any(rhs_nans):
+                # masks must be equal at this point
+                mask = pa.compute.fill_null(pa.compute.invert(lhs_nans), True)
+                lh_arr = lh_arr.filter(mask)
+                rh_arr = rh_arr.filter(mask)
+
+            np.testing.assert_array_almost_equal(lh_arr, rh_arr)
     else:
         assert lhs.equals(rhs)
 
@@ -276,6 +320,16 @@ def make_source(path_or_buf, pa_table, format, **kwargs):
         df.to_json(path_or_buf, mode=mode, **kwargs)
     elif format == "csv":
         df.to_csv(path_or_buf, mode=mode, **kwargs)
+    elif format == "parquet":
+        # The conversion to pandas is lossy (doesn't preserve
+        # nested types) so we
+        # will just use pyarrow directly to write this
+        pq_write_table(
+            pa_table,
+            pa.PythonFile(path_or_buf)
+            if isinstance(path_or_buf, io.IOBase)
+            else path_or_buf,
+        )
     if isinstance(path_or_buf, io.IOBase):
         path_or_buf.seek(0)
     return path_or_buf
diff --git a/python/cudf/cudf/pylibcudf_tests/conftest.py b/python/cudf/cudf/pylibcudf_tests/conftest.py
index 4a7194a6d8d..945e1689229 100644
--- a/python/cudf/cudf/pylibcudf_tests/conftest.py
+++ b/python/cudf/cudf/pylibcudf_tests/conftest.py
@@ -170,6 +170,21 @@ def source_or_sink(request, tmp_path):
         return fp_or_buf()
 
 
+@pytest.fixture(
+    params=["a.txt", pathlib.Path("a.txt"), io.BytesIO],
+)
+def binary_source_or_sink(request, tmp_path):
+    fp_or_buf = request.param
+    if isinstance(fp_or_buf, str):
+        return f"{tmp_path}/{fp_or_buf}"
+    elif isinstance(fp_or_buf, os.PathLike):
+        return tmp_path.joinpath(fp_or_buf)
+    elif issubclass(fp_or_buf, io.IOBase):
+        # Must construct io.StringIO/io.BytesIO inside
+        # fixture, or we'll end up re-using it
+        return fp_or_buf()
+
+
 unsupported_types = {
     # Not supported by pandas
     # TODO: find a way to test these
diff --git a/python/cudf/cudf/pylibcudf_tests/io/test_parquet.py b/python/cudf/cudf/pylibcudf_tests/io/test_parquet.py
new file mode 100644
index 00000000000..07d2ab3d69a
--- /dev/null
+++ b/python/cudf/cudf/pylibcudf_tests/io/test_parquet.py
@@ -0,0 +1,109 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+import pyarrow as pa
+import pyarrow.compute as pc
+import pytest
+from pyarrow.parquet import read_table
+from utils import assert_table_and_meta_eq, make_source
+
+import cudf._lib.pylibcudf as plc
+from cudf._lib.pylibcudf.expressions import (
+    ASTOperator,
+    ColumnNameReference,
+    ColumnReference,
+    Literal,
+    Operation,
+)
+
+# Shared kwargs to pass to make_source
+_COMMON_PARQUET_SOURCE_KWARGS = {"format": "parquet"}
+
+
+@pytest.mark.parametrize("columns", [None, ["col_int64", "col_bool"]])
+def test_read_parquet_basic(
+    table_data, binary_source_or_sink, nrows_skiprows, columns
+):
+    _, pa_table = table_data
+    nrows, skiprows = nrows_skiprows
+
+    source = make_source(
+        binary_source_or_sink, pa_table, **_COMMON_PARQUET_SOURCE_KWARGS
+    )
+
+    res = plc.io.parquet.read_parquet(
+        plc.io.SourceInfo([source]),
+        num_rows=nrows,
+        skip_rows=skiprows,
+        columns=columns,
+    )
+
+    if columns is not None:
+        pa_table = pa_table.select(columns)
+
+    # Adapt to nrows/skiprows
+    pa_table = pa_table.slice(
+        offset=skiprows, length=nrows if nrows != -1 else None
+    )
+
+    assert_table_and_meta_eq(pa_table, res, check_field_nullability=False)
+
+
+@pytest.mark.parametrize(
+    "pa_filters,plc_filters",
+    [
+        (
+            pc.field("col_int64") >= 10,
+            Operation(
+                ASTOperator.GREATER_EQUAL,
+                ColumnNameReference("col_int64"),
+                Literal(plc.interop.from_arrow(pa.scalar(10))),
+            ),
+        ),
+        (
+            (pc.field("col_int64") >= 10) & (pc.field("col_double") < 0),
+            Operation(
+                ASTOperator.LOGICAL_AND,
+                Operation(
+                    ASTOperator.GREATER_EQUAL,
+                    ColumnNameReference("col_int64"),
+                    Literal(plc.interop.from_arrow(pa.scalar(10))),
+                ),
+                Operation(
+                    ASTOperator.LESS,
+                    ColumnNameReference("col_double"),
+                    Literal(plc.interop.from_arrow(pa.scalar(0.0))),
+                ),
+            ),
+        ),
+        (
+            (pc.field(0) == 10),
+            Operation(
+                ASTOperator.EQUAL,
+                ColumnReference(0),
+                Literal(plc.interop.from_arrow(pa.scalar(10))),
+            ),
+        ),
+    ],
+)
+def test_read_parquet_filters(
+    table_data, binary_source_or_sink, pa_filters, plc_filters
+):
+    _, pa_table = table_data
+
+    source = make_source(
+        binary_source_or_sink, pa_table, **_COMMON_PARQUET_SOURCE_KWARGS
+    )
+
+    plc_table_w_meta = plc.io.parquet.read_parquet(
+        plc.io.SourceInfo([source]), filters=plc_filters
+    )
+    exp = read_table(source, filters=pa_filters)
+    assert_table_and_meta_eq(
+        exp, plc_table_w_meta, check_field_nullability=False
+    )
+
+
+# TODO: Test these options
+# list row_groups = None,
+# ^^^ This one is not tested since it's not in pyarrow/pandas, deprecate?
+# bool convert_strings_to_categories = False,
+# bool use_pandas_metadata = True
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index f2820d9c112..3806b901b10 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -22,7 +22,7 @@
 from pyarrow import fs as pa_fs, parquet as pq
 
 import cudf
-from cudf._lib.parquet import ParquetReader
+from cudf._lib.parquet import read_parquet_chunked
 from cudf.io.parquet import (
     ParquetDatasetWriter,
     ParquetWriter,
@@ -3755,7 +3755,7 @@ def test_parquet_chunked_reader(
     )
     buffer = BytesIO()
     df.to_parquet(buffer)
-    reader = ParquetReader(
+    actual = read_parquet_chunked(
         [buffer],
         chunk_read_limit=chunk_read_limit,
         pass_read_limit=pass_read_limit,
@@ -3765,7 +3765,6 @@ def test_parquet_chunked_reader(
     expected = cudf.read_parquet(
         buffer, use_pandas_metadata=use_pandas_metadata, row_groups=row_groups
     )
-    actual = reader.read()
     assert_eq(expected, actual)
 
 
From e6537de7474c91b4153542e6611c8a4e33a58caa Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Fri, 19 Jul 2024 20:10:40 -0700
Subject: [PATCH 558/842] Experimental support for configurable prefetching
 (#16020)

This PR adds experimental support for prefetching managed memory at a select few points in libcudf. A new configuration object is introduced for handling whether prefetching is enabled or disabled, and whether to print debug information about pointers being prefetched. Prefetching control is managed on a per API basis to enable profiling of the effects of prefetching different classes of data in different contexts. Prefetching in this PR always occurs on the default stream, so it will trigger synchronization with any blocking streams that the user has created. Turning on prefetching and then passing non-blocking to any libcudf APIs will trigger undefined behavior.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)
  - Thomas Li (https://github.com/lithomas1)
  - Muhammad Haseeb (https://github.com/mhaseeb123)

URL: https://github.com/rapidsai/cudf/pull/16020
---
 cpp/CMakeLists.txt                            |   1 +
 cpp/include/cudf/column/column_view.hpp       |  54 ++++--
 cpp/include/cudf/detail/join.hpp              |   3 -
 cpp/include/cudf/strings/detail/gather.cuh    |   7 +-
 .../cudf/strings/detail/strings_children.cuh  |   2 +
 cpp/include/cudf/utilities/prefetch.hpp       | 155 ++++++++++++++++++
 cpp/src/column/column_view.cpp                |  42 +++++
 cpp/src/join/hash_join.cu                     |   2 +
 cpp/src/utilities/prefetch.cpp                |  89 ++++++++++
 .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt   |   1 +
 python/cudf/cudf/_lib/pylibcudf/__init__.pxd  |   3 +
 python/cudf/cudf/_lib/pylibcudf/__init__.py   |   3 +
 .../cudf/cudf/_lib/pylibcudf/experimental.pxd |  10 ++
 .../cudf/cudf/_lib/pylibcudf/experimental.pyx |  43 +++++
 .../_lib/pylibcudf/libcudf/experimental.pxd   |  16 ++
 15 files changed, 416 insertions(+), 15 deletions(-)
 create mode 100644 cpp/include/cudf/utilities/prefetch.hpp
 create mode 100644 cpp/src/utilities/prefetch.cpp
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/experimental.pxd
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/experimental.pyx
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/libcudf/experimental.pxd

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 65347bd6689..5e79204a558 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -674,6 +674,7 @@ add_library(
   src/utilities/linked_column.cpp
   src/utilities/logger.cpp
   src/utilities/pinned_memory.cpp
+  src/utilities/prefetch.cpp
   src/utilities/stacktrace.cpp
   src/utilities/stream_pool.cpp
   src/utilities/traits.cpp
diff --git a/cpp/include/cudf/column/column_view.hpp b/cpp/include/cudf/column/column_view.hpp
index 134e835911f..03352fdce13 100644
--- a/cpp/include/cudf/column/column_view.hpp
+++ b/cpp/include/cudf/column/column_view.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,7 +16,9 @@
 #pragma once
 
 #include <cudf/types.hpp>
+#include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/prefetch.hpp>
 #include <cudf/utilities/span.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
@@ -72,7 +74,7 @@ class column_view_base {
             CUDF_ENABLE_IF(std::is_same_v<T, void> or is_rep_layout_compatible<T>())>
   T const* head() const noexcept
   {
-    return static_cast<T const*>(_data);
+    return static_cast<T const*>(get_data());
   }
 
   /**
@@ -225,6 +227,17 @@ class column_view_base {
   [[nodiscard]] size_type offset() const noexcept { return _offset; }
 
  protected:
+  /**
+   * @brief Returns pointer to the base device memory allocation.
+   *
+   * The primary purpose of this function is to allow derived classes to
+   * override the fundamental properties of memory accesses without needing to
+   * change all of the different accessors for the underlying pointer.
+   *
+   * @return Typed pointer to underlying data
+   */
+  virtual void const* get_data() const noexcept { return _data; }
+
   data_type _type{type_id::EMPTY};   ///< Element type
   size_type _size{};                 ///< Number of elements
   void const* _data{};               ///< Pointer to device memory containing elements
@@ -236,7 +249,7 @@ class column_view_base {
                                      ///< Enables zero-copy slicing
 
   column_view_base()                        = default;
-  ~column_view_base()                       = default;
+  virtual ~column_view_base()               = default;
   column_view_base(column_view_base const&) = default;  ///< Copy constructor
   column_view_base(column_view_base&&)      = default;  ///< Move constructor
   /**
@@ -283,11 +296,6 @@ class column_view_base {
                    size_type null_count,
                    size_type offset = 0);
 };
-
-class mutable_column_view_base : public column_view_base {
- public:
- protected:
-};
 }  // namespace detail
 
 /**
@@ -323,7 +331,7 @@ class column_view : public detail::column_view_base {
 #ifdef __CUDACC__
 #pragma nv_exec_check_disable
 #endif
-  ~column_view() = default;
+  ~column_view() override = default;
 #ifdef __CUDACC__
 #pragma nv_exec_check_disable
 #endif
@@ -447,6 +455,18 @@ class column_view : public detail::column_view_base {
     return device_span<T const>(data<T>(), size());
   }
 
+ protected:
+  /**
+   * @brief Returns pointer to the base device memory allocation.
+   *
+   * The primary purpose of this function is to allow derived classes to
+   * override the fundamental properties of memory accesses without needing to
+   * change all of the different accessors for the underlying pointer.
+   *
+   * @return Typed pointer to underlying data
+   */
+  void const* get_data() const noexcept override;
+
  private:
   friend column_view bit_cast(column_view const& input, data_type type);
 
@@ -478,7 +498,7 @@ class mutable_column_view : public detail::column_view_base {
  public:
   mutable_column_view() = default;
 
-  ~mutable_column_view(){
+  ~mutable_column_view() override{
     // Needed so that the first instance of the implicit destructor for any TU isn't 'constructed'
     // from a host+device function marking the implicit version also as host+device
   };
@@ -572,7 +592,7 @@ class mutable_column_view : public detail::column_view_base {
   }
 
   /**
-   * @brief Return first element (accounting for offset) when underlying data is
+   * @brief Return first element (accounting for offset) after underlying data is
    * casted to the specified type.
    *
    * This function does not participate in overload resolution if `is_rep_layout_compatible<T>` is
@@ -665,6 +685,18 @@ class mutable_column_view : public detail::column_view_base {
    */
   operator column_view() const;
 
+ protected:
+  /**
+   * @brief Returns pointer to the base device memory allocation.
+   *
+   * The primary purpose of this function is to allow derived classes to
+   * override the fundamental properties of memory accesses without needing to
+   * change all of the different accessors for the underlying pointer.
+   *
+   * @return Typed pointer to underlying data
+   */
+  void const* get_data() const noexcept override;
+
  private:
   friend mutable_column_view bit_cast(mutable_column_view const& input, data_type type);
 
diff --git a/cpp/include/cudf/detail/join.hpp b/cpp/include/cudf/detail/join.hpp
index aabfff746ea..b4ec5f2cc69 100644
--- a/cpp/include/cudf/detail/join.hpp
+++ b/cpp/include/cudf/detail/join.hpp
@@ -40,9 +40,6 @@ class preprocessed_table;
 namespace cudf {
 namespace detail {
 
-// Forward declaration
-class cuco_allocator;
-
 constexpr int DEFAULT_JOIN_CG_SIZE = 2;
 
 enum class join_kind { INNER_JOIN, LEFT_JOIN, FULL_JOIN, LEFT_SEMI_JOIN, LEFT_ANTI_JOIN };
diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh
index fcd74bebfe8..4369de317b3 100644
--- a/cpp/include/cudf/strings/detail/gather.cuh
+++ b/cpp/include/cudf/strings/detail/gather.cuh
@@ -18,11 +18,13 @@
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
+#include <cudf/copying.hpp>
 #include <cudf/detail/offsets_iterator_factory.cuh>
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/detail/utilities.hpp>
 #include <cudf/strings/strings_column_view.hpp>
+#include <cudf/utilities/prefetch.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
@@ -230,7 +232,8 @@ rmm::device_uvector<char> gather_chars(StringIterator strings_begin,
   if (output_count == 0) return rmm::device_uvector<char>(0, stream, mr);
 
   auto chars_data = rmm::device_uvector<char>(chars_bytes, stream, mr);
-  auto d_chars    = chars_data.data();
+  cudf::experimental::prefetch::detail::prefetch("gather", chars_data, stream);
+  auto d_chars = chars_data.data();
 
   constexpr int warps_per_threadblock = 4;
   // String parallel strategy will be used if average string length is above this threshold.
@@ -312,6 +315,8 @@ std::unique_ptr<cudf::column> gather(strings_column_view const& strings,
   // build chars column
   auto const offsets_view =
     cudf::detail::offsetalator_factory::make_input_iterator(out_offsets_column->view());
+  cudf::experimental::prefetch::detail::prefetch(
+    "gather", strings.chars_begin(stream), strings.chars_size(stream), stream);
   auto out_chars_data = gather_chars(
     d_strings->begin<string_view>(), begin, end, offsets_view, total_bytes, stream, mr);
 
diff --git a/cpp/include/cudf/strings/detail/strings_children.cuh b/cpp/include/cudf/strings/detail/strings_children.cuh
index f5f3982a5d6..55b59dd4ff2 100644
--- a/cpp/include/cudf/strings/detail/strings_children.cuh
+++ b/cpp/include/cudf/strings/detail/strings_children.cuh
@@ -23,6 +23,7 @@
 #include <cudf/strings/detail/utilities.hpp>
 #include <cudf/strings/utilities.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/prefetch.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
@@ -186,6 +187,7 @@ auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn,
 
   // Now build the chars column
   rmm::device_uvector<char> chars(bytes, stream, mr);
+  cudf::experimental::prefetch::detail::prefetch("gather", chars, stream);
   size_and_exec_fn.d_chars = chars.data();
 
   // Execute the function fn again to fill in the chars data.
diff --git a/cpp/include/cudf/utilities/prefetch.hpp b/cpp/include/cudf/utilities/prefetch.hpp
new file mode 100644
index 00000000000..5ca6fd6f4b0
--- /dev/null
+++ b/cpp/include/cudf/utilities/prefetch.hpp
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/utilities/default_stream.hpp>
+
+#include <rmm/device_uvector.hpp>
+
+#include <map>
+#include <string>
+#include <string_view>
+
+namespace cudf::experimental::prefetch {
+
+namespace detail {
+
+/**
+ * @brief A singleton class that manages the prefetching configuration.
+ */
+class PrefetchConfig {
+ public:
+  PrefetchConfig& operator=(const PrefetchConfig&) = delete;
+  PrefetchConfig(const PrefetchConfig&)            = delete;
+
+  /**
+   * @brief Get the singleton instance of the prefetching configuration.
+   *
+   * @return The singleton instance of the prefetching configuration.
+   */
+  static PrefetchConfig& instance();
+
+  /**
+   * @brief Get the value of a configuration key.
+   *
+   * @param key The configuration key.
+   * @return The value of the configuration key.
+   */
+  bool get(std::string_view key);
+  /**
+   * @brief Set the value of a configuration key.
+   *
+   * @param key The configuration key.
+   * @param value The value to set.
+   */
+  void set(std::string_view key, bool value);
+  /**
+   * @brief Enable or disable debug mode.
+   *
+   * In debug mode, the pointers being prefetched are printed to stderr.
+   */
+  bool debug{false};
+
+ private:
+  PrefetchConfig() = default;                 //< Private constructor to enforce singleton pattern
+  std::map<std::string, bool> config_values;  //< Map of configuration keys to values
+};
+
+/**
+ * @brief Enable prefetching for a particular structure or algorithm.
+ *
+ * @param key The key to enable prefetching for.
+ * @param ptr The pointer to prefetch.
+ * @param size The size of the memory region to prefetch.
+ * @param stream The stream to prefetch on.
+ * @param device_id The device to prefetch on.
+ */
+void prefetch(std::string_view key,
+              void const* ptr,
+              std::size_t size,
+              rmm::cuda_stream_view stream,
+              rmm::cuda_device_id device_id = rmm::get_current_cuda_device());
+
+/**
+ * @brief Enable prefetching for a particular structure or algorithm.
+ *
+ * @note This function will not throw exceptions, so it is safe to call in
+ * noexcept contexts. If an error occurs, the error code is returned. This
+ * function primarily exists for [mutable_]column_view::get_data and should be
+ * removed once an method for stream-ordered data pointer access is added to
+ * those data structures.
+ *
+ * @param key The key to enable prefetching for.
+ * @param ptr The pointer to prefetch.
+ * @param size The size of the memory region to prefetch.
+ * @param stream The stream to prefetch on.
+ * @param device_id The device to prefetch on.
+ */
+cudaError_t prefetch_noexcept(
+  std::string_view key,
+  void const* ptr,
+  std::size_t size,
+  rmm::cuda_stream_view stream,
+  rmm::cuda_device_id device_id = rmm::get_current_cuda_device()) noexcept;
+
+/**
+ * @brief Prefetch the data in a device_uvector.
+ *
+ * @note At present this function does not support stream-ordered execution. Prefetching always
+ * occurs on the default stream.
+ *
+ * @param key The key to enable prefetching for.
+ * @param v The device_uvector to prefetch.
+ * @param stream The stream to prefetch on.
+ * @param device_id The device to prefetch on.
+ */
+template <typename T>
+void prefetch(std::string_view key,
+              rmm::device_uvector<T> const& v,
+              rmm::cuda_stream_view stream,
+              rmm::cuda_device_id device_id = rmm::get_current_cuda_device())
+{
+  if (v.is_empty()) { return; }
+  prefetch(key, v.data(), v.size(), stream, device_id);
+}
+
+}  // namespace detail
+
+/**
+ * @brief Enable prefetching for a particular structure or algorithm.
+ *
+ * @param key The key to enable prefetching for.
+ */
+void enable_prefetching(std::string_view key);
+
+/**
+ * @brief Disable prefetching for a particular structure or algorithm.
+ *
+ * @param key The key to disable prefetching for.
+ */
+void disable_prefetching(std::string_view key);
+
+/**
+ * @brief Enable or disable debug mode.
+ *
+ * In debug mode, the pointers being prefetched are printed to stderr.
+ *
+ * @param enable Whether to enable or disable debug mode.
+ */
+void prefetch_debugging(bool enable);
+
+}  // namespace cudf::experimental::prefetch
diff --git a/cpp/src/column/column_view.cpp b/cpp/src/column/column_view.cpp
index 4d16298c605..a9605efb362 100644
--- a/cpp/src/column/column_view.cpp
+++ b/cpp/src/column/column_view.cpp
@@ -15,8 +15,10 @@
  */
 
 #include <cudf/column/column_view.hpp>
+#include <cudf/detail/get_value.cuh>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/hashing/detail/hashing.hpp>
+#include <cudf/strings/strings_column_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
@@ -27,10 +29,37 @@
 #include <algorithm>
 #include <exception>
 #include <numeric>
+#include <string>
 #include <vector>
 
 namespace cudf {
 namespace detail {
+namespace {
+
+template <typename ColumnView>
+void prefetch_col_data(ColumnView& col, void const* data_ptr, std::string_view key) noexcept
+{
+  if (cudf::experimental::prefetch::detail::PrefetchConfig::instance().get(key)) {
+    if (cudf::is_fixed_width(col.type())) {
+      cudf::experimental::prefetch::detail::prefetch_noexcept(
+        key, data_ptr, col.size() * size_of(col.type()), cudf::get_default_stream());
+    } else if (col.type().id() == type_id::STRING) {
+      strings_column_view scv{col};
+
+      cudf::experimental::prefetch::detail::prefetch_noexcept(
+        key,
+        data_ptr,
+        scv.chars_size(cudf::get_default_stream()) * sizeof(char),
+        cudf::get_default_stream());
+    } else {
+      std::cout << key << ": Unsupported type: " << static_cast<int32_t>(col.type().id())
+                << std::endl;
+    }
+  }
+}
+
+}  // namespace
+
 column_view_base::column_view_base(data_type type,
                                    size_type size,
                                    void const* data,
@@ -126,6 +155,7 @@ bool is_shallow_equivalent(column_view const& lhs, column_view const& rhs)
 {
   return shallow_equivalent_impl(lhs, rhs);
 }
+
 }  // namespace detail
 
 // Immutable view constructor
@@ -175,6 +205,18 @@ mutable_column_view::operator column_view() const
   return column_view{_type, _size, _data, _null_mask, _null_count, _offset, std::move(child_views)};
 }
 
+void const* column_view::get_data() const noexcept
+{
+  detail::prefetch_col_data(*this, _data, "column_view::get_data");
+  return _data;
+}
+
+void const* mutable_column_view::get_data() const noexcept
+{
+  detail::prefetch_col_data(*this, _data, "mutable_column_view::get_data");
+  return _data;
+}
+
 size_type count_descendants(column_view parent)
 {
   auto descendants = [](auto const& child) { return count_descendants(child); };
diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu
index b0184ff6a86..eb9b687630b 100644
--- a/cpp/src/join/hash_join.cu
+++ b/cpp/src/join/hash_join.cu
@@ -185,6 +185,8 @@ probe_join_hash_table(
 
   auto left_indices  = std::make_unique<rmm::device_uvector<size_type>>(join_size, stream, mr);
   auto right_indices = std::make_unique<rmm::device_uvector<size_type>>(join_size, stream, mr);
+  cudf::experimental::prefetch::detail::prefetch("hash_join", *left_indices, stream);
+  cudf::experimental::prefetch::detail::prefetch("hash_join", *right_indices, stream);
 
   auto const probe_nulls = cudf::nullate::DYNAMIC{has_nulls};
 
diff --git a/cpp/src/utilities/prefetch.cpp b/cpp/src/utilities/prefetch.cpp
new file mode 100644
index 00000000000..21f2e40c82a
--- /dev/null
+++ b/cpp/src/utilities/prefetch.cpp
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/error.hpp>
+#include <cudf/utilities/prefetch.hpp>
+
+#include <rmm/cuda_device.hpp>
+
+#include <iostream>
+
+namespace cudf::experimental::prefetch {
+
+namespace detail {
+
+PrefetchConfig& PrefetchConfig::instance()
+{
+  static PrefetchConfig instance;
+  return instance;
+}
+
+bool PrefetchConfig::get(std::string_view key)
+{
+  // Default to not prefetching
+  if (config_values.find(key.data()) == config_values.end()) {
+    return (config_values[key.data()] = false);
+  }
+  return config_values[key.data()];
+}
+void PrefetchConfig::set(std::string_view key, bool value) { config_values[key.data()] = value; }
+
+cudaError_t prefetch_noexcept(std::string_view key,
+                              void const* ptr,
+                              std::size_t size,
+                              rmm::cuda_stream_view stream,
+                              rmm::cuda_device_id device_id) noexcept
+{
+  if (PrefetchConfig::instance().get(key)) {
+    if (PrefetchConfig::instance().debug) {
+      std::cerr << "Prefetching " << size << " bytes for key " << key << " at location " << ptr
+                << std::endl;
+    }
+    auto result = cudaMemPrefetchAsync(ptr, size, device_id.value(), stream.value());
+    // Need to flush the CUDA error so that the context is not corrupted.
+    if (result == cudaErrorInvalidValue) { cudaGetLastError(); }
+    return result;
+  }
+  return cudaSuccess;
+}
+
+void prefetch(std::string_view key,
+              void const* ptr,
+              std::size_t size,
+              rmm::cuda_stream_view stream,
+              rmm::cuda_device_id device_id)
+{
+  auto result = prefetch_noexcept(key, ptr, size, stream, device_id);
+  // Ignore cudaErrorInvalidValue because that will be raised if prefetching is
+  // attempted on unmanaged memory.
+  if ((result != cudaErrorInvalidValue) && (result != cudaSuccess)) {
+    std::cerr << "Prefetch failed" << std::endl;
+    CUDF_CUDA_TRY(result);
+  }
+}
+
+}  // namespace detail
+
+void enable_prefetching(std::string_view key) { detail::PrefetchConfig::instance().set(key, true); }
+
+void disable_prefetching(std::string_view key)
+{
+  detail::PrefetchConfig::instance().set(key, false);
+}
+
+void prefetch_debugging(bool enable) { detail::PrefetchConfig::instance().debug = enable; }
+}  // namespace cudf::experimental::prefetch
diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
index 0800fa18e94..df4591baa71 100644
--- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
@@ -20,6 +20,7 @@ set(cython_sources
     concatenate.pyx
     copying.pyx
     datetime.pyx
+    experimental.pyx
     expressions.pyx
     filling.pyx
     gpumemoryview.pyx
diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd
index 26e89b818d3..71f523fc3cd 100644
--- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd
@@ -8,6 +8,7 @@ from . cimport (
     concatenate,
     copying,
     datetime,
+    experimental,
     expressions,
     filling,
     groupby,
@@ -48,6 +49,8 @@ __all__ = [
     "concatenate",
     "copying",
     "datetime",
+    "experimental",
+    "expressions",
     "filling",
     "gpumemoryview",
     "groupby",
diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py
index e89a5ed9f96..9705eba84b1 100644
--- a/python/cudf/cudf/_lib/pylibcudf/__init__.py
+++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py
@@ -7,6 +7,7 @@
     concatenate,
     copying,
     datetime,
+    experimental,
     expressions,
     filling,
     groupby,
@@ -48,6 +49,8 @@
     "concatenate",
     "copying",
     "datetime",
+    "experimental",
+    "expressions",
     "filling",
     "gpumemoryview",
     "groupby",
diff --git a/python/cudf/cudf/_lib/pylibcudf/experimental.pxd b/python/cudf/cudf/_lib/pylibcudf/experimental.pxd
new file mode 100644
index 00000000000..107c91c8365
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/experimental.pxd
@@ -0,0 +1,10 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp cimport bool
+
+
+cpdef enable_prefetching(str key)
+
+cpdef disable_prefetching(str key)
+
+cpdef prefetch_debugging(bool enable)
diff --git a/python/cudf/cudf/_lib/pylibcudf/experimental.pyx b/python/cudf/cudf/_lib/pylibcudf/experimental.pyx
new file mode 100644
index 00000000000..1e2a682d879
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/experimental.pyx
@@ -0,0 +1,43 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp cimport bool
+from libcpp.string cimport string
+
+from cudf._lib.pylibcudf.libcudf cimport experimental as cpp_experimental
+
+
+cpdef enable_prefetching(str key):
+    """Turn on prefetch instructions for the given key.
+
+    Parameters
+    ----------
+    key : str
+        The key to enable prefetching for.
+    """
+    cdef string c_key = key.encode("utf-8")
+    cpp_experimental.enable_prefetching(c_key)
+
+
+cpdef disable_prefetching(str key):
+    """Turn off prefetch instructions for the given key.
+
+    Parameters
+    ----------
+    key : str
+        The key to disable prefetching for.
+    """
+    cdef string c_key = key.encode("utf-8")
+    cpp_experimental.disable_prefetching(c_key)
+
+
+cpdef prefetch_debugging(bool enable):
+    """Enable or disable prefetch debugging.
+
+    When enabled, any prefetch instructions will be logged to the console.
+
+    Parameters
+    ----------
+    enable : bool
+        Whether to enable or disable prefetch debugging.
+    """
+    cpp_experimental.prefetch_debugging(enable)
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/experimental.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/experimental.pxd
new file mode 100644
index 00000000000..f280a382a04
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/experimental.pxd
@@ -0,0 +1,16 @@
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
+
+from libcpp cimport bool
+from libcpp.string cimport string
+
+
+cdef extern from "cudf/utilities/prefetch.hpp" \
+        namespace "cudf::experimental::prefetch" nogil:
+    # Not technically the right signature, but it's good enough to let Cython
+    # generate valid C++ code. It just means we'll be copying a host string
+    # extra, but that's OK. If we care we could generate string_view bindings,
+    # but there's no real rush so if we go that route we might as well
+    # contribute them upstream to Cython itself.
+    void enable_prefetching(string key)
+    void disable_prefetching(string key)
+    void prefetch_debugging(bool enable)

From 852b151002dc76e9f09d3529c80e4b589f1df9fc Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Mon, 22 Jul 2024 14:48:18 +0100
Subject: [PATCH 559/842] Fix issue in horizontal concat implementation in
 cudf-polars (#16271)

Shorter tables must be extended to the same length as the longest table.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16271
---
 python/cudf/cudf/_lib/pylibcudf/column.pyx    | 22 +++++
 .../libcudf/scalar/scalar_factories.pxd       |  3 +
 python/cudf/cudf/_lib/pylibcudf/scalar.pxd    |  4 +
 python/cudf/cudf/_lib/pylibcudf/scalar.pyx    | 20 ++++
 python/cudf_polars/cudf_polars/dsl/ir.py      | 39 ++++++++
 .../cudf_polars/cudf_polars/utils/dtypes.py   |  3 +-
 python/cudf_polars/tests/test_hconcat.py      |  9 ++
 python/cudf_polars/tests/test_join.py         | 93 ++++++++++---------
 python/cudf_polars/tests/utils/test_dtypes.py |  1 +
 9 files changed, 147 insertions(+), 47 deletions(-)

diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pyx b/python/cudf/cudf/_lib/pylibcudf/column.pyx
index cb96c1d9fce..a61e0629292 100644
--- a/python/cudf/cudf/_lib/pylibcudf/column.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/column.pyx
@@ -255,6 +255,28 @@ cdef class Column:
             c_result = move(make_column_from_scalar(dereference(c_scalar), size))
         return Column.from_libcudf(move(c_result))
 
+    @staticmethod
+    def all_null_like(Column like, size_type size):
+        """Create an all null column from a template.
+
+        Parameters
+        ----------
+        like : Column
+            Column whose type we should mimic
+        size : int
+            Number of rows in the resulting column.
+
+        Returns
+        -------
+        Column
+            An all-null column of `size` rows and type matching `like`.
+        """
+        cdef Scalar slr = Scalar.empty_like(like)
+        cdef unique_ptr[column] c_result
+        with nogil:
+            c_result = move(make_column_from_scalar(dereference(slr.get()), size))
+        return Column.from_libcudf(move(c_result))
+
     @staticmethod
     def from_cuda_array_interface_obj(object obj):
         """Create a Column from an object with a CUDA array interface.
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/scalar_factories.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/scalar_factories.pxd
index c8220df8938..8092c3d637d 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/scalar_factories.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/scalar_factories.pxd
@@ -3,9 +3,12 @@
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
 from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
 
 
 cdef extern from "cudf/scalar/scalar_factories.hpp" namespace "cudf" nogil:
     cdef unique_ptr[scalar] make_string_scalar(const string & _string) except +
     cdef unique_ptr[scalar] make_fixed_width_scalar[T](T value) except +
+
+    cdef unique_ptr[scalar] make_empty_scalar_like(const column_view &) except +
diff --git a/python/cudf/cudf/_lib/pylibcudf/scalar.pxd b/python/cudf/cudf/_lib/pylibcudf/scalar.pxd
index 3de86d93519..e6c9db2f1ac 100644
--- a/python/cudf/cudf/_lib/pylibcudf/scalar.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/scalar.pxd
@@ -7,6 +7,7 @@ from rmm._lib.memory_resource cimport DeviceMemoryResource
 
 from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
 
+from .column cimport Column
 from .types cimport DataType
 
 
@@ -24,5 +25,8 @@ cdef class Scalar:
     cpdef DataType type(self)
     cpdef bool is_valid(self)
 
+    @staticmethod
+    cdef Scalar empty_like(Column column)
+
     @staticmethod
     cdef Scalar from_libcudf(unique_ptr[scalar] libcudf_scalar, dtype=*)
diff --git a/python/cudf/cudf/_lib/pylibcudf/scalar.pyx b/python/cudf/cudf/_lib/pylibcudf/scalar.pyx
index 6799c37cea2..67730be07d8 100644
--- a/python/cudf/cudf/_lib/pylibcudf/scalar.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/scalar.pyx
@@ -2,11 +2,16 @@
 
 from cython cimport no_gc_clear
 from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
 
 from rmm._lib.memory_resource cimport get_current_device_resource
 
 from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
+from cudf._lib.pylibcudf.libcudf.scalar.scalar_factories cimport (
+    make_empty_scalar_like,
+)
 
+from .column cimport Column
 from .types cimport DataType
 
 
@@ -46,6 +51,21 @@ cdef class Scalar:
         """True if the scalar is valid, false if not"""
         return self.get().is_valid()
 
+    @staticmethod
+    cdef Scalar empty_like(Column column):
+        """Construct a null scalar with the same type as column.
+
+        Parameters
+        ----------
+        column
+            Column to take type from
+
+        Returns
+        -------
+        New empty (null) scalar of the given type.
+        """
+        return Scalar.from_libcudf(move(make_empty_scalar_like(column.view())))
+
     @staticmethod
     cdef Scalar from_libcudf(unique_ptr[scalar] libcudf_scalar, dtype=None):
         """Construct a Scalar object from a libcudf scalar.
diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py
index a84fe73810e..b934869ffef 100644
--- a/python/cudf_polars/cudf_polars/dsl/ir.py
+++ b/python/cudf_polars/cudf_polars/dsl/ir.py
@@ -1101,9 +1101,48 @@ class HConcat(IR):
     dfs: list[IR]
     """List of inputs."""
 
+    @staticmethod
+    def _extend_with_nulls(table: plc.Table, *, nrows: int) -> plc.Table:
+        """
+        Extend a table with nulls.
+
+        Parameters
+        ----------
+        table
+            Table to extend
+        nrows
+            Number of additional rows
+
+        Returns
+        -------
+        New pylibcudf table.
+        """
+        return plc.concatenate.concatenate(
+            [
+                table,
+                plc.Table(
+                    [
+                        plc.Column.all_null_like(column, nrows)
+                        for column in table.columns()
+                    ]
+                ),
+            ]
+        )
+
     def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
         dfs = [df.evaluate(cache=cache) for df in self.dfs]
+        max_rows = max(df.num_rows for df in dfs)
+        # Horizontal concatenation extends shorter tables with nulls
+        dfs = [
+            df
+            if df.num_rows == max_rows
+            else DataFrame.from_table(
+                self._extend_with_nulls(df.table, nrows=max_rows - df.num_rows),
+                df.column_names,
+            )
+            for df in dfs
+        ]
         return DataFrame(
             list(itertools.chain.from_iterable(df.columns for df in dfs)),
         )
diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py
index 918cd024fa2..1279fe91d48 100644
--- a/python/cudf_polars/cudf_polars/utils/dtypes.py
+++ b/python/cudf_polars/cudf_polars/utils/dtypes.py
@@ -153,7 +153,8 @@ def from_polars(dtype: pl.DataType) -> plc.DataType:
         # TODO: Hopefully
         return plc.DataType(plc.TypeId.EMPTY)
     elif isinstance(dtype, pl.List):
-        # TODO: This doesn't consider the value type.
+        # Recurse to catch unsupported inner types
+        _ = from_polars(dtype.inner)
         return plc.DataType(plc.TypeId.LIST)
     else:
         raise NotImplementedError(f"{dtype=} conversion not supported")
diff --git a/python/cudf_polars/tests/test_hconcat.py b/python/cudf_polars/tests/test_hconcat.py
index 46cbb21b25a..4737aa18028 100644
--- a/python/cudf_polars/tests/test_hconcat.py
+++ b/python/cudf_polars/tests/test_hconcat.py
@@ -17,3 +17,12 @@ def test_hconcat():
     ldf2 = ldf.select((pl.col("a") + pl.col("b")).alias("c"))
     query = pl.concat([ldf, ldf2], how="horizontal")
     assert_gpu_result_equal(query)
+
+
+def test_hconcat_different_heights():
+    left = pl.LazyFrame({"a": [1, 2, 3, 4]})
+
+    right = pl.LazyFrame({"b": [[1], [2]], "c": ["a", "bcde"]})
+
+    q = pl.concat([left, right], how="horizontal")
+    assert_gpu_result_equal(q)
diff --git a/python/cudf_polars/tests/test_join.py b/python/cudf_polars/tests/test_join.py
index 1ffbf3c0ef4..1e880cdc6de 100644
--- a/python/cudf_polars/tests/test_join.py
+++ b/python/cudf_polars/tests/test_join.py
@@ -12,65 +12,68 @@
 )
 
 
-@pytest.mark.parametrize(
-    "how",
-    [
-        "inner",
-        "left",
-        "semi",
-        "anti",
-        "full",
-    ],
-)
-@pytest.mark.parametrize("coalesce", [False, True])
-@pytest.mark.parametrize(
-    "join_nulls", [False, True], ids=["nulls_not_equal", "nulls_equal"]
-)
-@pytest.mark.parametrize(
-    "join_expr",
-    [
-        pl.col("a"),
-        pl.col("a") * 2,
-        [pl.col("a"), pl.col("c") + 1],
-        ["c", "a"],
-    ],
-)
-def test_join(how, coalesce, join_nulls, join_expr):
-    left = pl.DataFrame(
+@pytest.fixture(params=[False, True], ids=["nulls_not_equal", "nulls_equal"])
+def join_nulls(request):
+    return request.param
+
+
+@pytest.fixture(params=["inner", "left", "semi", "anti", "full"])
+def how(request):
+    return request.param
+
+
+@pytest.fixture
+def left():
+    return pl.LazyFrame(
         {
             "a": [1, 2, 3, 1, None],
             "b": [1, 2, 3, 4, 5],
             "c": [2, 3, 4, 5, 6],
         }
-    ).lazy()
-    right = pl.DataFrame(
+    )
+
+
+@pytest.fixture
+def right():
+    return pl.LazyFrame(
         {
             "a": [1, 4, 3, 7, None, None],
             "c": [2, 3, 4, 5, 6, 7],
         }
-    ).lazy()
+    )
 
+
+@pytest.mark.parametrize(
+    "join_expr",
+    [
+        pl.col("a"),
+        pl.col("a") * 2,
+        [pl.col("a"), pl.col("c") + 1],
+        ["c", "a"],
+    ],
+)
+def test_non_coalesce_join(left, right, how, join_nulls, join_expr):
     query = left.join(
-        right, on=join_expr, how=how, join_nulls=join_nulls, coalesce=coalesce
+        right, on=join_expr, how=how, join_nulls=join_nulls, coalesce=False
     )
     assert_gpu_result_equal(query, check_row_order=how == "left")
 
 
-def test_cross_join():
-    left = pl.DataFrame(
-        {
-            "a": [1, 2, 3, 1, None],
-            "b": [1, 2, 3, 4, 5],
-            "c": [2, 3, 4, 5, 6],
-        }
-    ).lazy()
-    right = pl.DataFrame(
-        {
-            "a": [1, 4, 3, 7, None, None],
-            "c": [2, 3, 4, 5, 6, 7],
-        }
-    ).lazy()
+@pytest.mark.parametrize(
+    "join_expr",
+    [
+        pl.col("a"),
+        ["c", "a"],
+    ],
+)
+def test_coalesce_join(left, right, how, join_nulls, join_expr):
+    query = left.join(
+        right, on=join_expr, how=how, join_nulls=join_nulls, coalesce=True
+    )
+    assert_gpu_result_equal(query, check_row_order=False)
+
 
+def test_cross_join(left, right):
     q = left.join(right, how="cross")
 
     assert_gpu_result_equal(q)
@@ -79,9 +82,7 @@ def test_cross_join():
 @pytest.mark.parametrize(
     "left_on,right_on", [(pl.col("a"), pl.lit(2)), (pl.lit(2), pl.col("a"))]
 )
-def test_join_literal_key_unsupported(left_on, right_on):
-    left = pl.LazyFrame({"a": [1, 2, 3], "b": [3, 4, 5]})
-    right = pl.LazyFrame({"a": [1, 2, 3], "b": [5, 6, 7]})
+def test_join_literal_key_unsupported(left, right, left_on, right_on):
     q = left.join(right, left_on=left_on, right_on=right_on, how="inner")
 
     assert_ir_translation_raises(q, NotImplementedError)
diff --git a/python/cudf_polars/tests/utils/test_dtypes.py b/python/cudf_polars/tests/utils/test_dtypes.py
index 535fdd846a0..bbdb4faa256 100644
--- a/python/cudf_polars/tests/utils/test_dtypes.py
+++ b/python/cudf_polars/tests/utils/test_dtypes.py
@@ -16,6 +16,7 @@
         pl.Time(),
         pl.Struct({"a": pl.Int8, "b": pl.Float32}),
         pl.Datetime("ms", time_zone="US/Pacific"),
+        pl.List(pl.Datetime("ms", time_zone="US/Pacific")),
         pl.Array(pl.Int8, 2),
         pl.Binary(),
         pl.Categorical(),

From 135c99512e5f7a2d38f6a870ad6883ccb39a3cce Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 22 Jul 2024 04:13:32 -1000
Subject: [PATCH 560/842] Align Series APIs with pandas 2.x (#16333)

Similar to https://github.com/rapidsai/cudf/pull/16310, the follow APIs have been modified to adjust/add parameters

* `reindex`
* `reset_index`
* `add_suffix`
* `searchsorted`
* `clip`
* `mask`
* `shift`
* `dropna`
* `rename`
* `cov`
* `apply`
* `replace`

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16333
---
 python/cudf/cudf/core/dataframe.py     |  19 ++-
 python/cudf/cudf/core/frame.py         |   9 +-
 python/cudf/cudf/core/indexed_frame.py |  87 +++++++++++--
 python/cudf/cudf/core/series.py        | 164 +++++++++++++++++++++----
 4 files changed, 240 insertions(+), 39 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index dbc7f10b569..288bdfd39b3 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -2844,6 +2844,10 @@ def reindex(
             index=index,
             inplace=False,
             fill_value=fill_value,
+            level=level,
+            method=method,
+            limit=limit,
+            tolerance=tolerance,
         )
 
     @_performance_tracking
@@ -3187,7 +3191,14 @@ class  speed  type
         )
     )
     def reset_index(
-        self, level=None, drop=False, inplace=False, col_level=0, col_fill=""
+        self,
+        level=None,
+        drop=False,
+        inplace=False,
+        col_level=0,
+        col_fill="",
+        allow_duplicates: bool = False,
+        names: abc.Hashable | abc.Sequence[abc.Hashable] | None = None,
     ):
         return self._mimic_inplace(
             DataFrame._from_data(
@@ -3196,6 +3207,8 @@ def reset_index(
                     drop=drop,
                     col_level=col_level,
                     col_fill=col_fill,
+                    allow_duplicates=allow_duplicates,
+                    names=names,
                 )
             ),
             inplace=inplace,
@@ -3666,7 +3679,9 @@ def add_prefix(self, prefix, axis=None):
         return out
 
     @_performance_tracking
-    def add_suffix(self, suffix):
+    def add_suffix(self, suffix, axis=None):
+        if axis is not None:
+            raise NotImplementedError("axis is currently not implemented.")
         # TODO: Change to deep=False when copy-on-write is default
         out = self.copy(deep=True)
         out.columns = [
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 04ecae4ba85..32c313e42d3 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -1187,6 +1187,7 @@ def searchsorted(
         self,
         values,
         side: Literal["left", "right"] = "left",
+        sorter=None,
         ascending: bool = True,
         na_position: Literal["first", "last"] = "last",
     ) -> ScalarLike | cupy.ndarray:
@@ -1199,6 +1200,10 @@ def searchsorted(
         side : str {'left', 'right'} optional, default 'left'
             If 'left', the index of the first suitable location found is given
             If 'right', return the last such index
+        sorter : 1-D array-like, optional
+            Optional array of integer indices that sort `self` into ascending
+            order. They are typically the result of ``np.argsort``.
+            Currently not supported.
         ascending : bool optional, default True
             Sorted Frame is in ascending order (otherwise descending)
         na_position : str {'last', 'first'} optional, default 'last'
@@ -1245,10 +1250,12 @@ def searchsorted(
         >>> df.searchsorted(values_df, ascending=False)
         array([4, 4, 4, 0], dtype=int32)
         """
-        # Call libcudf search_sorted primitive
+        # Note: pandas.DataFrame does not support searchsorted
 
         if na_position not in {"first", "last"}:
             raise ValueError(f"invalid na_position: {na_position}")
+        elif sorter is not None:
+            raise NotImplementedError("sorter is currently not supported.")
 
         scalar_flag = None
         if is_scalar(values):
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index e75b51e0d43..e14f8923c25 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -84,6 +84,9 @@
 {argument}
         inplace : bool, default False
             Modify the DataFrame in place (do not create a new object).
+        allow_duplicates : bool, default False
+            Allow duplicate column labels to be created.
+            Currently not supported.
 
         Returns
         -------
@@ -902,7 +905,7 @@ def replace(
         return self._mimic_inplace(result, inplace=inplace)
 
     @_performance_tracking
-    def clip(self, lower=None, upper=None, inplace=False, axis=1):
+    def clip(self, lower=None, upper=None, axis=1, inplace=False):
         """
         Trim values at input threshold(s).
 
@@ -1779,7 +1782,14 @@ def skew(self, axis=0, skipna=True, numeric_only=False, **kwargs):
         )
 
     @_performance_tracking
-    def mask(self, cond, other=None, inplace: bool = False) -> Self | None:
+    def mask(
+        self,
+        cond,
+        other=None,
+        inplace: bool = False,
+        axis=None,
+        level=None,
+    ) -> Self | None:
         """
         Replace values where the condition is True.
 
@@ -1831,6 +1841,10 @@ def mask(self, cond, other=None, inplace: bool = False) -> Self | None:
         4       0
         dtype: int64
         """
+        if axis is not None:
+            raise NotImplementedError("axis is not supported.")
+        elif level is not None:
+            raise NotImplementedError("level is not supported.")
 
         if not hasattr(cond, "__invert__"):
             # We Invert `cond` below and call `where`, so
@@ -2042,13 +2056,26 @@ def interpolate(
         )
 
     @_performance_tracking
-    def shift(self, periods=1, freq=None, axis=0, fill_value=None):
+    def shift(
+        self,
+        periods=1,
+        freq=None,
+        axis=0,
+        fill_value=None,
+        suffix: str | None = None,
+    ):
         """Shift values by `periods` positions."""
         axis = self._get_axis_from_axis_arg(axis)
         if axis != 0:
-            raise ValueError("Only axis=0 is supported.")
+            raise NotImplementedError("Only axis=0 is supported.")
         if freq is not None:
-            raise ValueError("The freq argument is not yet supported.")
+            raise NotImplementedError(
+                "The freq argument is not yet supported."
+            )
+        if suffix is not None:
+            raise NotImplementedError(
+                "The suffix argument is not yet supported."
+            )
 
         data_columns = (
             col.shift(periods, fill_value) for col in self._columns
@@ -3225,7 +3252,9 @@ def _split(self, splits, keep_index=True):
         ]
 
     @_performance_tracking
-    def bfill(self, value=None, axis=None, inplace=None, limit=None):
+    def bfill(
+        self, value=None, axis=None, inplace=None, limit=None, limit_area=None
+    ):
         """
         Synonym for :meth:`Series.fillna` with ``method='bfill'``.
 
@@ -3233,6 +3262,9 @@ def bfill(self, value=None, axis=None, inplace=None, limit=None):
         -------
             Object with missing values filled or None if ``inplace=True``.
         """
+        if limit_area is not None:
+            raise NotImplementedError("limit_area is currently not supported.")
+
         with warnings.catch_warnings():
             warnings.simplefilter("ignore", FutureWarning)
             return self.fillna(
@@ -3264,7 +3296,14 @@ def backfill(self, value=None, axis=None, inplace=None, limit=None):
         return self.bfill(value=value, axis=axis, inplace=inplace, limit=limit)
 
     @_performance_tracking
-    def ffill(self, value=None, axis=None, inplace=None, limit=None):
+    def ffill(
+        self,
+        value=None,
+        axis=None,
+        inplace=None,
+        limit=None,
+        limit_area: Literal["inside", "outside", None] = None,
+    ):
         """
         Synonym for :meth:`Series.fillna` with ``method='ffill'``.
 
@@ -3272,6 +3311,9 @@ def ffill(self, value=None, axis=None, inplace=None, limit=None):
         -------
             Object with missing values filled or None if ``inplace=True``.
         """
+        if limit_area is not None:
+            raise NotImplementedError("limit_area is currently not supported.")
+
         with warnings.catch_warnings():
             warnings.simplefilter("ignore", FutureWarning)
             return self.fillna(
@@ -3363,7 +3405,7 @@ def add_prefix(self, prefix, axis=None):
                 Use `Series.add_prefix` or `DataFrame.add_prefix`"
         )
 
-    def add_suffix(self, suffix):
+    def add_suffix(self, suffix, axis=None):
         """
         Suffix labels with string `suffix`.
 
@@ -3653,6 +3695,10 @@ def _reindex(
         index=None,
         inplace=False,
         fill_value=NA,
+        level=None,
+        method=None,
+        limit=None,
+        tolerance=None,
     ):
         """
         Helper for `.reindex`
@@ -3677,6 +3723,15 @@ def _reindex(
         -------
         Series or DataFrame
         """
+        if method is not None:
+            raise NotImplementedError("method is not currently supported.")
+        if level is not None:
+            raise NotImplementedError("level is not currently supported.")
+        if limit is not None:
+            raise NotImplementedError("limit is not currently supported.")
+        if tolerance is not None:
+            raise NotImplementedError("tolerance is not currently supported.")
+
         if dtypes is None:
             dtypes = {}
 
@@ -4303,8 +4358,22 @@ def take(self, indices, axis=0):
 
         return self._gather(GatherMap(indices, len(self), nullify=False))
 
-    def _reset_index(self, level, drop, col_level=0, col_fill=""):
+    def _reset_index(
+        self,
+        level,
+        drop,
+        col_level=0,
+        col_fill="",
+        allow_duplicates: bool = False,
+        names: abc.Hashable | abc.Sequence[abc.Hashable] | None = None,
+    ):
         """Shared path for DataFrame.reset_index and Series.reset_index."""
+        if allow_duplicates is not False:
+            raise NotImplementedError(
+                "allow_duplicates is not currently supported."
+            )
+        elif names is not None:
+            raise NotImplementedError("names is not currently supported.")
         if level is not None:
             if (
                 isinstance(level, int)
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 94c33eed37a..8277ccf68fc 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -918,7 +918,18 @@ def to_dict(self, into: type[dict] = dict) -> dict:
         return self.to_pandas().to_dict(into=into)
 
     @_performance_tracking
-    def reindex(self, *args, **kwargs):
+    def reindex(
+        self,
+        index=None,
+        *,
+        axis=None,
+        method: str | None = None,
+        copy: bool = True,
+        level=None,
+        fill_value: ScalarLike | None = None,
+        limit: int | None = None,
+        tolerance=None,
+    ):
         """
         Conform Series to new index.
 
@@ -927,6 +938,8 @@ def reindex(self, *args, **kwargs):
         index : Index, Series-convertible, default None
             New labels / index to conform to,
             should be specified using keywords.
+        axis: int, default None
+            Unused.
         method: Not Supported
         copy : boolean, default True
         level: Not Supported
@@ -965,27 +978,23 @@ def reindex(self, *args, **kwargs):
             where it is cast to float in Pandas.
 
         """
-        if len(args) > 1:
-            raise TypeError(
-                "Only one positional argument ('index') is allowed"
-            )
-        if args:
-            (index,) = args
-            if "index" in kwargs:
-                raise TypeError(
-                    "'index' passed as both positional and keyword argument"
-                )
-        else:
-            index = kwargs.get("index", self.index)
+        if index is None:
+            index = self.index
+        if fill_value is None:
+            fill_value = cudf.NA
 
         name = self.name or 0
         series = self._reindex(
-            deep=kwargs.get("copy", True),
+            deep=copy,
             dtypes={name: self.dtype},
             index=index,
             column_names=[name],
             inplace=False,
-            fill_value=kwargs.get("fill_value", cudf.NA),
+            fill_value=fill_value,
+            level=level,
+            method=method,
+            limit=limit,
+            tolerance=tolerance,
         )
         series.name = self.name
         return series
@@ -1054,14 +1063,21 @@ def reindex(self, *args, **kwargs):
         )
     )
     def reset_index(
-        self, level=None, drop=False, name=no_default, inplace=False
+        self,
+        level=None,
+        drop=False,
+        name=no_default,
+        inplace=False,
+        allow_duplicates=False,
     ):
         if not drop and inplace:
             raise TypeError(
                 "Cannot reset_index inplace on a Series "
                 "to create a DataFrame"
             )
-        data, index = self._reset_index(level=level, drop=drop)
+        data, index = self._reset_index(
+            level=level, drop=drop, allow_duplicates=allow_duplicates
+        )
         if not drop:
             if name is no_default:
                 name = 0 if self.name is None else self.name
@@ -1632,7 +1648,9 @@ def has_nulls(self):
         return self._column.has_nulls()
 
     @_performance_tracking
-    def dropna(self, axis=0, inplace=False, how=None):
+    def dropna(
+        self, axis=0, inplace=False, how=None, ignore_index: bool = False
+    ):
         """
         Return a Series with null values removed.
 
@@ -1644,6 +1662,8 @@ def dropna(self, axis=0, inplace=False, how=None):
             If True, do operation inplace and return None.
         how : str, optional
             Not in use. Kept for compatibility.
+        ignore_index : bool, default ``False``
+            If ``True``, the resulting axis will be labeled 0, 1, …, n - 1.
 
         Returns
         -------
@@ -1709,6 +1729,9 @@ def dropna(self, axis=0, inplace=False, how=None):
 
         result = super().dropna(axis=axis)
 
+        if ignore_index:
+            result.index = RangeIndex(len(result))
+
         return self._mimic_inplace(result, inplace=inplace)
 
     @_performance_tracking
@@ -2046,10 +2069,31 @@ def astype(
         return super().astype(dtype, copy, errors)
 
     @_performance_tracking
-    def sort_index(self, axis=0, *args, **kwargs):
+    def sort_index(
+        self,
+        axis=0,
+        level=None,
+        ascending=True,
+        inplace=False,
+        kind=None,
+        na_position="last",
+        sort_remaining=True,
+        ignore_index=False,
+        key=None,
+    ):
         if axis not in (0, "index"):
             raise ValueError("Only axis=0 is valid for Series.")
-        return super().sort_index(axis=axis, *args, **kwargs)
+        return super().sort_index(
+            axis=axis,
+            level=level,
+            ascending=ascending,
+            inplace=inplace,
+            kind=kind,
+            na_position=na_position,
+            sort_remaining=sort_remaining,
+            ignore_index=ignore_index,
+            key=key,
+        )
 
     @_performance_tracking
     def sort_values(
@@ -2278,14 +2322,29 @@ def argsort(
         )
 
     @_performance_tracking
-    def replace(self, to_replace=None, value=no_default, *args, **kwargs):
+    def replace(
+        self,
+        to_replace=None,
+        value=no_default,
+        inplace=False,
+        limit=None,
+        regex=False,
+        method=no_default,
+    ):
         if is_dict_like(to_replace) and value not in {None, no_default}:
             raise ValueError(
                 "Series.replace cannot use dict-like to_replace and non-None "
                 "value"
             )
 
-        return super().replace(to_replace, value, *args, **kwargs)
+        return super().replace(
+            to_replace,
+            value,
+            inplace=inplace,
+            limit=limit,
+            regex=regex,
+            method=method,
+        )
 
     @_performance_tracking
     def update(self, other):
@@ -2394,7 +2453,14 @@ def update(self, other):
 
     # UDF related
     @_performance_tracking
-    def apply(self, func, convert_dtype=True, args=(), **kwargs):
+    def apply(
+        self,
+        func,
+        convert_dtype=True,
+        args=(),
+        by_row: Literal[False, "compat"] = "compat",
+        **kwargs,
+    ):
         """
         Apply a scalar function to the values of a Series.
         Similar to ``pandas.Series.apply``.
@@ -2421,6 +2487,18 @@ def apply(self, func, convert_dtype=True, args=(), **kwargs):
             See examples for details.
         args : tuple
             Positional arguments passed to func after the series value.
+        by_row : False or "compat", default "compat"
+            If ``"compat"`` and func is a callable, func will be passed each element of
+            the Series, like ``Series.map``. If func is a list or dict of
+            callables, will first try to translate each func into pandas methods. If
+            that doesn't work, will try call to apply again with ``by_row="compat"``
+            and if that fails, will call apply again with ``by_row=False``
+            (backward compatible).
+            If False, the func will be passed the whole Series at once.
+
+            ``by_row`` has no effect when ``func`` is a string.
+
+            Currently not implemented.
         **kwargs
             Not supported
 
@@ -2530,6 +2608,8 @@ def apply(self, func, convert_dtype=True, args=(), **kwargs):
         """
         if convert_dtype is not True:
             raise ValueError("Series.apply only supports convert_dtype=True")
+        elif by_row != "compat":
+            raise NotImplementedError("by_row is currently not supported.")
 
         result = self._apply(func, _get_scalar_kernel, *args, **kwargs)
         result.name = self.name
@@ -2643,7 +2723,7 @@ def round(self, decimals=0, how="half_even"):
         return super().round(decimals, how)
 
     @_performance_tracking
-    def cov(self, other, min_periods=None):
+    def cov(self, other, min_periods=None, ddof: int | None = None):
         """
         Compute covariance with Series, excluding missing values.
 
@@ -2676,6 +2756,8 @@ def cov(self, other, min_periods=None):
             raise NotImplementedError(
                 "min_periods parameter is not implemented yet"
             )
+        if ddof is not None:
+            raise NotImplementedError("ddof parameter is not implemented yet")
 
         if self.empty or other.empty:
             return cudf.utils.dtypes._get_nan_for_dtype(self.dtype)
@@ -3389,7 +3471,15 @@ def groupby(
         )
 
     @_performance_tracking
-    def rename(self, index=None, copy=True):
+    def rename(
+        self,
+        index=None,
+        axis=None,
+        copy: bool = True,
+        inplace: bool = False,
+        level=None,
+        errors: Literal["ignore", "raise"] = "ignore",
+    ):
         """
         Alter Series name
 
@@ -3399,8 +3489,21 @@ def rename(self, index=None, copy=True):
         ----------
         index : Scalar, optional
             Scalar to alter the Series.name attribute
+        axis : {0 or 'index'}
+            Unused. Parameter needed for compatibility with DataFrame.
         copy : boolean, default True
             Also copy underlying data
+        inplace : bool, default False
+            Whether to return a new Series. If True the value of copy is ignored.
+            Currently not supported.
+        level : int or level name, default None
+            In case of MultiIndex, only rename labels in the specified level.
+            Currently not supported.
+        errors : {'ignore', 'raise'}, default 'ignore'
+            If 'raise', raise `KeyError` when a `dict-like mapper` or
+            `index` contains labels that are not present in the index being transformed.
+            If 'ignore', existing keys will be renamed and extra keys will be ignored.
+            Currently not supported.
 
         Returns
         -------
@@ -3429,8 +3532,13 @@ def rename(self, index=None, copy=True):
             :meth:`pandas.Series.rename`
 
             - Supports scalar values only for changing name attribute
-            - The ``inplace`` and ``level`` is not supported
         """
+        if inplace is not False:
+            raise NotImplementedError("inplace is currently not supported.")
+        if level is not None:
+            raise NotImplementedError("level is currently not supported.")
+        if errors != "ignore":
+            raise NotImplementedError("errors is currently not supported.")
         out_data = self._data.copy(deep=copy)
         return Series._from_data(out_data, self.index, name=index)
 
@@ -3445,7 +3553,9 @@ def add_prefix(self, prefix, axis=None):
         )
 
     @_performance_tracking
-    def add_suffix(self, suffix):
+    def add_suffix(self, suffix, axis=None):
+        if axis is not None:
+            raise NotImplementedError("axis is currently not implemented.")
         return Series._from_data(
             # TODO: Change to deep=False when copy-on-write is default
             data=self._data.copy(deep=True),

From 3053f42351b04e22d873f78f5bc49f8b20ff17ac Mon Sep 17 00:00:00 2001
From: Jayjeet Chakraborty <jc.github@rediffmail.com>
Date: Mon, 22 Jul 2024 10:56:39 -0700
Subject: [PATCH 561/842] Add missing `stream` param to dictionary factory APIs
 (#16319)

Add `stream` param to dictionary column factory functions. Partially solves #13744

Authors:
  - Jayjeet Chakraborty (https://github.com/JayjeetAtGithub)

Approvers:
  - Mark Harris (https://github.com/harrism)
  - Yunsong Wang (https://github.com/PointKernel)

URL: https://github.com/rapidsai/cudf/pull/16319
---
 .../cudf/dictionary/dictionary_factories.hpp  | 13 ++++--
 cpp/src/dictionary/dictionary_factories.cu    | 13 ++++--
 cpp/tests/streams/dictionary_test.cpp         | 46 +++++++++++++++++++
 3 files changed, 64 insertions(+), 8 deletions(-)

diff --git a/cpp/include/cudf/dictionary/dictionary_factories.hpp b/cpp/include/cudf/dictionary/dictionary_factories.hpp
index 7cdfa3bf9e5..21f593e1aec 100644
--- a/cpp/include/cudf/dictionary/dictionary_factories.hpp
+++ b/cpp/include/cudf/dictionary/dictionary_factories.hpp
@@ -87,12 +87,17 @@ std::unique_ptr<column> make_dictionary_column(
  * @param indices_column Indices to use for the new dictionary column.
  * @param null_mask Null mask for the output column.
  * @param null_count Number of nulls for the output column.
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ * @param mr Device memory resource used to allocate the returned column's device memory.
  * @return New dictionary column.
  */
-std::unique_ptr<column> make_dictionary_column(std::unique_ptr<column> keys_column,
-                                               std::unique_ptr<column> indices_column,
-                                               rmm::device_buffer&& null_mask,
-                                               size_type null_count);
+std::unique_ptr<column> make_dictionary_column(
+  std::unique_ptr<column> keys_column,
+  std::unique_ptr<column> indices_column,
+  rmm::device_buffer&& null_mask,
+  size_type null_count,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Construct a dictionary column by taking ownership of the provided keys
diff --git a/cpp/src/dictionary/dictionary_factories.cu b/cpp/src/dictionary/dictionary_factories.cu
index 37f8fa7a05b..0617d71fa51 100644
--- a/cpp/src/dictionary/dictionary_factories.cu
+++ b/cpp/src/dictionary/dictionary_factories.cu
@@ -77,7 +77,9 @@ std::unique_ptr<column> make_dictionary_column(column_view const& keys_column,
 std::unique_ptr<column> make_dictionary_column(std::unique_ptr<column> keys_column,
                                                std::unique_ptr<column> indices_column,
                                                rmm::device_buffer&& null_mask,
-                                               size_type null_count)
+                                               size_type null_count,
+                                               rmm::cuda_stream_view stream,
+                                               rmm::device_async_resource_ref mr)
 {
   CUDF_EXPECTS(!keys_column->has_nulls(), "keys column must not have nulls");
   CUDF_EXPECTS(!indices_column->has_nulls(), "indices column must not have nulls");
@@ -89,7 +91,7 @@ std::unique_ptr<column> make_dictionary_column(std::unique_ptr<column> keys_colu
   children.emplace_back(std::move(keys_column));
   return std::make_unique<column>(data_type{type_id::DICTIONARY32},
                                   count,
-                                  rmm::device_buffer{},
+                                  rmm::device_buffer{0, stream, mr},
                                   std::move(null_mask),
                                   null_count,
                                   std::move(children));
@@ -134,8 +136,11 @@ std::unique_ptr<column> make_dictionary_column(std::unique_ptr<column> keys,
   auto indices_column = [&] {
     // If the types match, then just commandeer the column's data buffer.
     if (new_type.id() == indices_type) {
-      return std::make_unique<column>(
-        new_type, indices_size, std::move(*(contents.data.release())), rmm::device_buffer{}, 0);
+      return std::make_unique<column>(new_type,
+                                      indices_size,
+                                      std::move(*(contents.data.release())),
+                                      rmm::device_buffer{0, stream, mr},
+                                      0);
     }
     // If the new type does not match, then convert the data.
     cudf::column_view cast_view{
diff --git a/cpp/tests/streams/dictionary_test.cpp b/cpp/tests/streams/dictionary_test.cpp
index 9e81c8574b8..03e4cf47470 100644
--- a/cpp/tests/streams/dictionary_test.cpp
+++ b/cpp/tests/streams/dictionary_test.cpp
@@ -26,6 +26,52 @@
 
 class DictionaryTest : public cudf::test::BaseFixture {};
 
+TEST_F(DictionaryTest, FactoryColumnViews)
+{
+  cudf::test::strings_column_wrapper keys({"aaa", "ccc", "ddd", "www"});
+  cudf::test::fixed_width_column_wrapper<uint8_t> values{2, 0, 3, 1, 2, 2, 2, 3, 0};
+
+  auto dictionary = cudf::make_dictionary_column(keys, values, cudf::test::get_default_stream());
+  cudf::dictionary_column_view view(dictionary->view());
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.keys(), keys);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.indices(), values);
+}
+
+TEST_F(DictionaryTest, FactoryColumns)
+{
+  std::vector<std::string> h_keys{"aaa", "ccc", "ddd", "www"};
+  cudf::test::strings_column_wrapper keys(h_keys.begin(), h_keys.end());
+  std::vector<uint8_t> h_values{2, 0, 3, 1, 2, 2, 2, 3, 0};
+  cudf::test::fixed_width_column_wrapper<uint8_t> values(h_values.begin(), h_values.end());
+
+  auto dictionary = cudf::make_dictionary_column(
+    keys.release(), values.release(), cudf::test::get_default_stream());
+  cudf::dictionary_column_view view(dictionary->view());
+
+  cudf::test::strings_column_wrapper keys_expected(h_keys.begin(), h_keys.end());
+  cudf::test::fixed_width_column_wrapper<uint8_t> values_expected(h_values.begin(), h_values.end());
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.keys(), keys_expected);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.indices(), values_expected);
+}
+
+TEST_F(DictionaryTest, FactoryColumnsNullMaskCount)
+{
+  std::vector<std::string> h_keys{"aaa", "ccc", "ddd", "www"};
+  cudf::test::strings_column_wrapper keys(h_keys.begin(), h_keys.end());
+  std::vector<uint8_t> h_values{2, 0, 3, 1, 2, 2, 2, 3, 0};
+  cudf::test::fixed_width_column_wrapper<uint8_t> values(h_values.begin(), h_values.end());
+
+  auto dictionary = cudf::make_dictionary_column(
+    keys.release(), values.release(), rmm::device_buffer{}, 0, cudf::test::get_default_stream());
+  cudf::dictionary_column_view view(dictionary->view());
+
+  cudf::test::strings_column_wrapper keys_expected(h_keys.begin(), h_keys.end());
+  cudf::test::fixed_width_column_wrapper<uint8_t> values_expected(h_values.begin(), h_values.end());
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.keys(), keys_expected);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(view.indices(), values_expected);
+}
+
 TEST_F(DictionaryTest, Encode)
 {
   cudf::test::fixed_width_column_wrapper<int> col({1, 2, 3, 4, 5});

From e54b82c9f3499b35e7e789d41d2042a5d5a80810 Mon Sep 17 00:00:00 2001
From: Mark Harris <783069+harrism@users.noreply.github.com>
Date: Tue, 23 Jul 2024 05:03:04 +1000
Subject: [PATCH 562/842] Use resource_ref for upstream in
 stream_checking_resource_adaptor (#16187)

As we move toward replacing all `device_memory_resource` pointers with `resource_ref`s, there are some places that changes can be made ahead of RMM to simplify required changes as RMM is refactored.

In this PR I eliminate the unnecessary `Upstream` template parameter from `cudf_test::stream_checking_resource_adaptor`, and use a `device_async_resource` for the upstream resource.   A similar change will be made to all RMM resource adaptors, but this one can be done without deprecations since it is just a test utility.

Authors:
  - Mark Harris (https://github.com/harrism)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16187
---
 .../stream_checking_resource_adaptor.hpp      | 33 +++++++++----------
 cpp/include/cudf_test/testing_main.hpp        | 10 +++---
 2 files changed, 19 insertions(+), 24 deletions(-)

diff --git a/cpp/include/cudf_test/stream_checking_resource_adaptor.hpp b/cpp/include/cudf_test/stream_checking_resource_adaptor.hpp
index 5a077e86a0f..4f3c723d195 100644
--- a/cpp/include/cudf_test/stream_checking_resource_adaptor.hpp
+++ b/cpp/include/cudf_test/stream_checking_resource_adaptor.hpp
@@ -24,13 +24,11 @@
 
 #include <iostream>
 
+namespace cudf::test {
+
 /**
  * @brief Resource that verifies that the default stream is not used in any allocation.
- *
- * @tparam Upstream Type of the upstream resource used for
- * allocation/deallocation.
  */
-template <typename Upstream>
 class stream_checking_resource_adaptor final : public rmm::mr::device_memory_resource {
  public:
   /**
@@ -40,14 +38,13 @@ class stream_checking_resource_adaptor final : public rmm::mr::device_memory_res
    *
    * @param upstream The resource used for allocating/deallocating device memory
    */
-  stream_checking_resource_adaptor(Upstream* upstream,
+  stream_checking_resource_adaptor(rmm::device_async_resource_ref upstream,
                                    bool error_on_invalid_stream,
                                    bool check_default_stream)
     : upstream_{upstream},
       error_on_invalid_stream_{error_on_invalid_stream},
       check_default_stream_{check_default_stream}
   {
-    CUDF_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer.");
   }
 
   stream_checking_resource_adaptor()                                                   = delete;
@@ -86,7 +83,7 @@ class stream_checking_resource_adaptor final : public rmm::mr::device_memory_res
   void* do_allocate(std::size_t bytes, rmm::cuda_stream_view stream) override
   {
     verify_stream(stream);
-    return upstream_->allocate(bytes, stream);
+    return upstream_.allocate_async(bytes, rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
   }
 
   /**
@@ -101,7 +98,7 @@ class stream_checking_resource_adaptor final : public rmm::mr::device_memory_res
   void do_deallocate(void* ptr, std::size_t bytes, rmm::cuda_stream_view stream) override
   {
     verify_stream(stream);
-    upstream_->deallocate(ptr, bytes, stream);
+    upstream_.deallocate_async(ptr, bytes, rmm::CUDA_ALLOCATION_ALIGNMENT, stream);
   }
 
   /**
@@ -113,8 +110,8 @@ class stream_checking_resource_adaptor final : public rmm::mr::device_memory_res
   [[nodiscard]] bool do_is_equal(device_memory_resource const& other) const noexcept override
   {
     if (this == &other) { return true; }
-    auto cast = dynamic_cast<stream_checking_resource_adaptor<Upstream> const*>(&other);
-    if (cast == nullptr) { return upstream_->is_equal(other); }
+    auto cast = dynamic_cast<stream_checking_resource_adaptor const*>(&other);
+    if (cast == nullptr) { return false; }
     return get_upstream_resource() == cast->get_upstream_resource();
   }
 
@@ -150,7 +147,8 @@ class stream_checking_resource_adaptor final : public rmm::mr::device_memory_res
     }
   }
 
-  Upstream* upstream_;            // the upstream resource used for satisfying allocation requests
+  rmm::device_async_resource_ref
+    upstream_;                    // the upstream resource used for satisfying allocation requests
   bool error_on_invalid_stream_;  // If true, throw an exception when the wrong stream is detected.
                                   // If false, simply print to stdout.
   bool check_default_stream_;  // If true, throw an exception when the default stream is observed.
@@ -162,13 +160,12 @@ class stream_checking_resource_adaptor final : public rmm::mr::device_memory_res
  * @brief Convenience factory to return a `stream_checking_resource_adaptor` around the
  * upstream resource `upstream`.
  *
- * @tparam Upstream Type of the upstream `device_memory_resource`.
- * @param upstream Pointer to the upstream resource
+ * @param upstream Reference to the upstream resource
  */
-template <typename Upstream>
-stream_checking_resource_adaptor<Upstream> make_stream_checking_resource_adaptor(
-  Upstream* upstream, bool error_on_invalid_stream, bool check_default_stream)
+inline stream_checking_resource_adaptor make_stream_checking_resource_adaptor(
+  rmm::device_async_resource_ref upstream, bool error_on_invalid_stream, bool check_default_stream)
 {
-  return stream_checking_resource_adaptor<Upstream>{
-    upstream, error_on_invalid_stream, check_default_stream};
+  return stream_checking_resource_adaptor{upstream, error_on_invalid_stream, check_default_stream};
 }
+
+}  // namespace cudf::test
diff --git a/cpp/include/cudf_test/testing_main.hpp b/cpp/include/cudf_test/testing_main.hpp
index 66b831b917f..3ad4b127f80 100644
--- a/cpp/include/cudf_test/testing_main.hpp
+++ b/cpp/include/cudf_test/testing_main.hpp
@@ -32,8 +32,7 @@
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/mr/device/pool_memory_resource.hpp>
 
-namespace cudf {
-namespace test {
+namespace cudf::test {
 
 /// MR factory functions
 inline auto make_cuda() { return std::make_shared<rmm::mr::cuda_memory_resource>(); }
@@ -91,8 +90,7 @@ inline std::shared_ptr<rmm::mr::device_memory_resource> create_memory_resource(
   CUDF_FAIL("Invalid RMM allocation mode: " + allocation_mode);
 }
 
-}  // namespace test
-}  // namespace cudf
+}  // namespace cudf::test
 
 /**
  * @brief Parses the cuDF test command line options.
@@ -182,8 +180,8 @@ inline auto make_stream_mode_adaptor(cxxopts::ParseResult const& cmd_opts)
   auto const stream_error_mode       = cmd_opts["stream_error_mode"].as<std::string>();
   auto const error_on_invalid_stream = (stream_error_mode == "error");
   auto const check_default_stream    = (stream_mode == "new_cudf_default");
-  auto adaptor =
-    make_stream_checking_resource_adaptor(resource, error_on_invalid_stream, check_default_stream);
+  auto adaptor                       = cudf::test::make_stream_checking_resource_adaptor(
+    resource, error_on_invalid_stream, check_default_stream);
   if ((stream_mode == "new_cudf_default") || (stream_mode == "new_testing_default")) {
     rmm::mr::set_current_device_resource(&adaptor);
   }

From e0a00c1fcb4b72b7abd29debe5b2f6b38081d39a Mon Sep 17 00:00:00 2001
From: Jayjeet Chakraborty <jc.github@rediffmail.com>
Date: Mon, 22 Jul 2024 12:03:24 -0700
Subject: [PATCH 563/842] Add `stream` param to list explode APIs (#16317)

Add `stream` param to list `explode*` APIs. Partially fixes https://github.com/rapidsai/cudf/issues/13744

Authors:
  - Jayjeet Chakraborty (https://github.com/JayjeetAtGithub)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16317
---
 cpp/include/cudf/lists/explode.hpp        |  8 ++++
 cpp/include/cudf/lists/set_operations.hpp |  2 +-
 cpp/src/lists/explode.cu                  | 29 +++++++-----
 cpp/tests/streams/lists_test.cpp          | 57 ++++++++++++++++++++++-
 4 files changed, 82 insertions(+), 14 deletions(-)

diff --git a/cpp/include/cudf/lists/explode.hpp b/cpp/include/cudf/lists/explode.hpp
index 81d82dcfa09..303f182ce8c 100644
--- a/cpp/include/cudf/lists/explode.hpp
+++ b/cpp/include/cudf/lists/explode.hpp
@@ -66,6 +66,7 @@ namespace cudf {
  *
  * @param input_table Table to explode.
  * @param explode_column_idx Column index to explode inside the table.
+ * @param stream CUDA stream used for device memory operations and kernel launches.
  * @param mr Device memory resource used to allocate the returned column's device memory.
  *
  * @return A new table with explode_col exploded.
@@ -73,6 +74,7 @@ namespace cudf {
 std::unique_ptr<table> explode(
   table_view const& input_table,
   size_type explode_column_idx,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -109,6 +111,7 @@ std::unique_ptr<table> explode(
  *
  * @param input_table Table to explode.
  * @param explode_column_idx Column index to explode inside the table.
+ * @param stream CUDA stream used for device memory operations and kernel launches.
  * @param mr Device memory resource used to allocate the returned column's device memory.
  *
  * @return A new table with exploded value and position. The column order of return table is
@@ -117,6 +120,7 @@ std::unique_ptr<table> explode(
 std::unique_ptr<table> explode_position(
   table_view const& input_table,
   size_type explode_column_idx,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -152,6 +156,7 @@ std::unique_ptr<table> explode_position(
  *
  * @param input_table Table to explode.
  * @param explode_column_idx Column index to explode inside the table.
+ * @param stream CUDA stream used for device memory operations and kernel launches.
  * @param mr Device memory resource used to allocate the returned column's device memory.
  *
  * @return A new table with explode_col exploded.
@@ -159,6 +164,7 @@ std::unique_ptr<table> explode_position(
 std::unique_ptr<table> explode_outer(
   table_view const& input_table,
   size_type explode_column_idx,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -196,6 +202,7 @@ std::unique_ptr<table> explode_outer(
  *
  * @param input_table Table to explode.
  * @param explode_column_idx Column index to explode inside the table.
+ * @param stream CUDA stream used for device memory operations and kernel launches.
  * @param mr Device memory resource used to allocate the returned column's device memory.
  *
  * @return A new table with explode_col exploded.
@@ -203,6 +210,7 @@ std::unique_ptr<table> explode_outer(
 std::unique_ptr<table> explode_outer_position(
   table_view const& input_table,
   size_type explode_column_idx,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
diff --git a/cpp/include/cudf/lists/set_operations.hpp b/cpp/include/cudf/lists/set_operations.hpp
index b8abfd62461..871e66b2d83 100644
--- a/cpp/include/cudf/lists/set_operations.hpp
+++ b/cpp/include/cudf/lists/set_operations.hpp
@@ -53,8 +53,8 @@ namespace cudf::lists {
  * @param nulls_equal Flag to specify whether null elements should be considered as equal, default
  *        to be `UNEQUAL` which means only non-null elements are checked for overlapping
  * @param nans_equal Flag to specify whether floating-point NaNs should be considered as equal
- * @param mr Device memory resource used to allocate the returned object
  * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned object
  * @return A column of type BOOL containing the check results
  */
 std::unique_ptr<column> have_overlap(
diff --git a/cpp/src/lists/explode.cu b/cpp/src/lists/explode.cu
index 370d7480578..46c4fc78a6f 100644
--- a/cpp/src/lists/explode.cu
+++ b/cpp/src/lists/explode.cu
@@ -229,8 +229,8 @@ std::unique_ptr<table> explode_outer(table_view const& input_table,
   if (null_or_empty_count == 0) {
     // performance penalty to run the below loop if there are no nulls or empty lists.
     // run simple explode instead
-    return include_position ? explode_position(input_table, explode_column_idx, stream, mr)
-                            : explode(input_table, explode_column_idx, stream, mr);
+    return include_position ? detail::explode_position(input_table, explode_column_idx, stream, mr)
+                            : detail::explode(input_table, explode_column_idx, stream, mr);
   }
 
   auto gather_map_size = sliced_child.size() + null_or_empty_count;
@@ -300,58 +300,63 @@ std::unique_ptr<table> explode_outer(table_view const& input_table,
 }  // namespace detail
 
 /**
- * @copydoc cudf::explode(table_view const&, size_type, rmm::device_async_resource_ref)
+ * @copydoc cudf::explode(table_view const&, size_type, rmm::cuda_stream_view,
+ * rmm::device_async_resource_ref)
  */
 std::unique_ptr<table> explode(table_view const& input_table,
                                size_type explode_column_idx,
+                               rmm::cuda_stream_view stream,
                                rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   CUDF_EXPECTS(input_table.column(explode_column_idx).type().id() == type_id::LIST,
                "Unsupported non-list column");
-  return detail::explode(input_table, explode_column_idx, cudf::get_default_stream(), mr);
+  return detail::explode(input_table, explode_column_idx, stream, mr);
 }
 
 /**
- * @copydoc cudf::explode_position(table_view const&, size_type, rmm::device_async_resource_ref)
+ * @copydoc cudf::explode_position(table_view const&, size_type, rmm::cuda_stream_view,
+ * rmm::device_async_resource_ref)
  */
 std::unique_ptr<table> explode_position(table_view const& input_table,
                                         size_type explode_column_idx,
+                                        rmm::cuda_stream_view stream,
                                         rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   CUDF_EXPECTS(input_table.column(explode_column_idx).type().id() == type_id::LIST,
                "Unsupported non-list column");
-  return detail::explode_position(input_table, explode_column_idx, cudf::get_default_stream(), mr);
+  return detail::explode_position(input_table, explode_column_idx, stream, mr);
 }
 
 /**
- * @copydoc cudf::explode_outer(table_view const&, size_type, rmm::device_async_resource_ref)
+ * @copydoc cudf::explode_outer(table_view const&, size_type, rmm::cuda_stream_view,
+ * rmm::device_async_resource_ref)
  */
 std::unique_ptr<table> explode_outer(table_view const& input_table,
                                      size_type explode_column_idx,
+                                     rmm::cuda_stream_view stream,
                                      rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   CUDF_EXPECTS(input_table.column(explode_column_idx).type().id() == type_id::LIST,
                "Unsupported non-list column");
-  return detail::explode_outer(
-    input_table, explode_column_idx, false, cudf::get_default_stream(), mr);
+  return detail::explode_outer(input_table, explode_column_idx, false, stream, mr);
 }
 
 /**
  * @copydoc cudf::explode_outer_position(table_view const&, size_type,
- * rmm::device_async_resource_ref)
+ * rmm::cuda_stream_view, rmm::device_async_resource_ref)
  */
 std::unique_ptr<table> explode_outer_position(table_view const& input_table,
                                               size_type explode_column_idx,
+                                              rmm::cuda_stream_view stream,
                                               rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   CUDF_EXPECTS(input_table.column(explode_column_idx).type().id() == type_id::LIST,
                "Unsupported non-list column");
-  return detail::explode_outer(
-    input_table, explode_column_idx, true, cudf::get_default_stream(), mr);
+  return detail::explode_outer(input_table, explode_column_idx, true, stream, mr);
 }
 
 }  // namespace cudf
diff --git a/cpp/tests/streams/lists_test.cpp b/cpp/tests/streams/lists_test.cpp
index 711e20e4b17..7963dced292 100644
--- a/cpp/tests/streams/lists_test.cpp
+++ b/cpp/tests/streams/lists_test.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,7 @@
 #include <cudf/lists/combine.hpp>
 #include <cudf/lists/contains.hpp>
 #include <cudf/lists/count_elements.hpp>
+#include <cudf/lists/explode.hpp>
 #include <cudf/lists/extract.hpp>
 #include <cudf/lists/filling.hpp>
 #include <cudf/lists/gather.hpp>
@@ -212,3 +213,57 @@ TEST_F(ListTest, HaveOverlap)
                             cudf::nan_equality::ALL_EQUAL,
                             cudf::test::get_default_stream());
 }
+
+TEST_F(ListTest, Explode)
+{
+  cudf::test::fixed_width_column_wrapper<int32_t> list_col_a{100, 200, 300};
+  cudf::test::lists_column_wrapper<int32_t> list_col_b{
+    cudf::test::lists_column_wrapper<int32_t>{1, 2, 7},
+    cudf::test::lists_column_wrapper<int32_t>{5, 6},
+    cudf::test::lists_column_wrapper<int32_t>{0, 3}};
+  cudf::test::strings_column_wrapper list_col_c{"string0", "string1", "string2"};
+  cudf::table_view lists_table({list_col_a, list_col_b, list_col_c});
+  cudf::explode(lists_table, 1, cudf::test::get_default_stream());
+}
+
+TEST_F(ListTest, ExplodePosition)
+{
+  cudf::test::fixed_width_column_wrapper<int32_t> list_col_a{100, 200, 300};
+  cudf::test::lists_column_wrapper<int32_t> list_col_b{
+    cudf::test::lists_column_wrapper<int32_t>{1, 2, 7},
+    cudf::test::lists_column_wrapper<int32_t>{5, 6},
+    cudf::test::lists_column_wrapper<int32_t>{0, 3}};
+  cudf::test::strings_column_wrapper list_col_c{"string0", "string1", "string2"};
+  cudf::table_view lists_table({list_col_a, list_col_b, list_col_c});
+  cudf::explode_position(lists_table, 1, cudf::test::get_default_stream());
+}
+
+TEST_F(ListTest, ExplodeOuter)
+{
+  constexpr auto null = 0;
+  auto valids =
+    cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; });
+  cudf::test::lists_column_wrapper<int32_t> list_col_a{
+    cudf::test::lists_column_wrapper<int32_t>({1, null, 7}, valids),
+    cudf::test::lists_column_wrapper<int32_t>({5, null, 0, null}, valids),
+    cudf::test::lists_column_wrapper<int32_t>{},
+    cudf::test::lists_column_wrapper<int32_t>({0, null, 8}, valids)};
+  cudf::test::fixed_width_column_wrapper<int32_t> list_col_b{100, 200, 300, 400};
+  cudf::table_view lists_table({list_col_a, list_col_b});
+  cudf::explode_outer(lists_table, 0, cudf::test::get_default_stream());
+}
+
+TEST_F(ListTest, ExplodeOuterPosition)
+{
+  constexpr auto null = 0;
+  auto valids =
+    cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 2 == 0; });
+  cudf::test::lists_column_wrapper<int32_t> list_col_a{
+    cudf::test::lists_column_wrapper<int32_t>({1, null, 7}, valids),
+    cudf::test::lists_column_wrapper<int32_t>({5, null, 0, null}, valids),
+    cudf::test::lists_column_wrapper<int32_t>{},
+    cudf::test::lists_column_wrapper<int32_t>({0, null, 8}, valids)};
+  cudf::test::fixed_width_column_wrapper<int32_t> list_col_b{100, 200, 300, 400};
+  cudf::table_view lists_table({list_col_a, list_col_b});
+  cudf::explode_outer_position(lists_table, 0, cudf::test::get_default_stream());
+}

From c14c8bf59fd1e97fe94c8dfd2db6df7f9a6c65ad Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Mon, 22 Jul 2024 12:03:56 -0700
Subject: [PATCH 564/842] Implement parquet reading using pylibcudf in
 cudf-polars (#16346)

Replace cudf-classic with pylibcudf for parquet reading in cudf-polars.

Authors:
  - Thomas Li (https://github.com/lithomas1)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16346
---
 .../cudf_polars/containers/dataframe.py       | 12 ---------
 python/cudf_polars/cudf_polars/dsl/ir.py      | 26 +++++++++----------
 python/cudf_polars/tests/test_scan.py         | 10 +------
 3 files changed, 14 insertions(+), 34 deletions(-)

diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py
index cbeadf1426a..dba76855329 100644
--- a/python/cudf_polars/cudf_polars/containers/dataframe.py
+++ b/python/cudf_polars/cudf_polars/containers/dataframe.py
@@ -23,8 +23,6 @@
 
     from typing_extensions import Self
 
-    import cudf
-
     from cudf_polars.containers import Column
 
 
@@ -83,16 +81,6 @@ def num_rows(self) -> int:
         """Number of rows."""
         return 0 if len(self.columns) == 0 else self.table.num_rows()
 
-    @classmethod
-    def from_cudf(cls, df: cudf.DataFrame) -> Self:
-        """Create from a cudf dataframe."""
-        return cls(
-            [
-                NamedColumn(c.to_pylibcudf(mode="read"), name)
-                for name, c in df._data.items()
-            ]
-        )
-
     @classmethod
     def from_polars(cls, df: pl.DataFrame) -> Self:
         """
diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py
index b934869ffef..e5691cba7dd 100644
--- a/python/cudf_polars/cudf_polars/dsl/ir.py
+++ b/python/cudf_polars/cudf_polars/dsl/ir.py
@@ -25,7 +25,6 @@
 
 import polars as pl
 
-import cudf
 import cudf._lib.pylibcudf as plc
 
 import cudf_polars.dsl.expr as expr
@@ -205,8 +204,6 @@ class Scan(IR):
 
     def __post_init__(self) -> None:
         """Validate preconditions."""
-        if self.file_options.n_rows is not None:
-            raise NotImplementedError("row limit in scan")
         if self.typ not in ("csv", "parquet"):
             raise NotImplementedError(f"Unhandled scan type: {self.typ}")
         if self.cloud_options is not None and any(
@@ -241,6 +238,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         options = self.file_options
         with_columns = options.with_columns
         row_index = options.row_index
+        nrows = self.file_options.n_rows if self.file_options.n_rows is not None else -1
         if self.typ == "csv":
             parse_options = self.reader_options["parse_options"]
             sep = chr(parse_options["separator"])
@@ -295,6 +293,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
                     comment=comment,
                     decimal=decimal,
                     dtypes=self.schema,
+                    nrows=nrows,
                 )
                 pieces.append(tbl_w_meta)
             tables, colnames = zip(
@@ -308,9 +307,16 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
                 colnames[0],
             )
         elif self.typ == "parquet":
-            cdf = cudf.read_parquet(self.paths, columns=with_columns)
-            assert isinstance(cdf, cudf.DataFrame)
-            df = DataFrame.from_cudf(cdf)
+            tbl_w_meta = plc.io.parquet.read_parquet(
+                plc.io.SourceInfo(self.paths),
+                columns=with_columns,
+                num_rows=nrows,
+            )
+            df = DataFrame.from_table(
+                tbl_w_meta.tbl,
+                # TODO: consider nested column names?
+                tbl_w_meta.column_names(include_children=False),
+            )
         else:
             raise NotImplementedError(
                 f"Unhandled scan type: {self.typ}"
@@ -337,13 +343,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
                 null_order=plc.types.NullOrder.AFTER,
             )
             df = DataFrame([index, *df.columns])
-        # TODO: should be true, but not the case until we get
-        # cudf-classic out of the loop for IO since it converts date32
-        # to datetime.
-        # assert all(
-        #     c.obj.type() == dtype
-        #     for c, dtype in zip(df.columns, self.schema.values())
-        # )
+        assert all(c.obj.type() == self.schema[c.name] for c in df.columns)
         if self.predicate is None:
             return df
         else:
diff --git a/python/cudf_polars/tests/test_scan.py b/python/cudf_polars/tests/test_scan.py
index 0981a96a34a..642b6ae8a37 100644
--- a/python/cudf_polars/tests/test_scan.py
+++ b/python/cudf_polars/tests/test_scan.py
@@ -24,15 +24,7 @@ def row_index(request):
 
 
 @pytest.fixture(
-    params=[
-        None,
-        pytest.param(
-            2, marks=pytest.mark.xfail(reason="No handling of row limit in scan")
-        ),
-        pytest.param(
-            3, marks=pytest.mark.xfail(reason="No handling of row limit in scan")
-        ),
-    ],
+    params=[None, 2, 3],
     ids=["all-rows", "n_rows-with-skip", "n_rows-no-skip"],
 )
 def n_rows(request):

From 996cb8d870b7b6153802bde670435e8cd3b8775d Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Mon, 22 Jul 2024 16:15:16 -0400
Subject: [PATCH 565/842] Migrate lists/sorting to pylibcudf (#16179)

Apart of #15162

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16179
---
 python/cudf/cudf/_lib/lists.pyx               | 28 +++------
 .../_lib/pylibcudf/libcudf/lists/sorting.pxd  |  6 ++
 python/cudf/cudf/_lib/pylibcudf/lists.pxd     |  4 +-
 python/cudf/cudf/_lib/pylibcudf/lists.pyx     | 57 ++++++++++++++++++-
 .../cudf/cudf/pylibcudf_tests/test_lists.py   | 46 +++++++++++++++
 5 files changed, 118 insertions(+), 23 deletions(-)

diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx
index 76f37c3b845..50061f6e468 100644
--- a/python/cudf/cudf/_lib/lists.pyx
+++ b/python/cudf/cudf/_lib/lists.pyx
@@ -11,9 +11,6 @@ from cudf._lib.pylibcudf.libcudf.column.column cimport column
 from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport (
     lists_column_view,
 )
-from cudf._lib.pylibcudf.libcudf.lists.sorting cimport (
-    sort_lists as cpp_sort_lists,
-)
 from cudf._lib.pylibcudf.libcudf.lists.stream_compaction cimport (
     distinct as cpp_distinct,
 )
@@ -21,7 +18,6 @@ from cudf._lib.pylibcudf.libcudf.types cimport (
     nan_equality,
     null_equality,
     null_order,
-    order,
     size_type,
 )
 from cudf._lib.utils cimport columns_from_pylibcudf_table
@@ -80,24 +76,14 @@ def distinct(Column col, bool nulls_equal, bool nans_all_equal):
 
 @acquire_spill_lock()
 def sort_lists(Column col, bool ascending, str na_position):
-    cdef shared_ptr[lists_column_view] list_view = (
-        make_shared[lists_column_view](col.view())
-    )
-    cdef order c_sort_order = (
-        order.ASCENDING if ascending else order.DESCENDING
-    )
-    cdef null_order c_null_prec = (
-        null_order.BEFORE if na_position == "first" else null_order.AFTER
-    )
-
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(
-            cpp_sort_lists(list_view.get()[0], c_sort_order, c_null_prec)
+    return Column.from_pylibcudf(
+        pylibcudf.lists.sort_lists(
+            col.to_pylibcudf(mode="read"),
+            ascending,
+            null_order.BEFORE if na_position == "first" else null_order.AFTER,
+            False,
         )
-
-    return Column.from_unique_ptr(move(c_result))
+    )
 
 
 @acquire_spill_lock()
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/sorting.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/sorting.pxd
index 145ab41302f..337ac73908b 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/sorting.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/sorting.pxd
@@ -15,3 +15,9 @@ cdef extern from "cudf/lists/sorting.hpp" namespace "cudf::lists" nogil:
         order column_order,
         null_order null_precedence
     ) except +
+
+    cdef unique_ptr[column] stable_sort_lists(
+        const lists_column_view source_column,
+        order column_order,
+        null_order null_precedence
+    ) except +
diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pxd b/python/cudf/cudf/_lib/pylibcudf/lists.pxd
index 38eb575ee8d..cacecae6010 100644
--- a/python/cudf/cudf/_lib/pylibcudf/lists.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/lists.pxd
@@ -2,7 +2,7 @@
 
 from libcpp cimport bool
 
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.types cimport null_order, size_type
 
 from .column cimport Column
 from .scalar cimport Scalar
@@ -35,3 +35,5 @@ cpdef Column segmented_gather(Column, Column)
 cpdef Column extract_list_element(Column, ColumnOrSizeType)
 
 cpdef Column count_elements(Column)
+
+cpdef Column sort_lists(Column, bool, null_order, bool stable = *)
diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pyx b/python/cudf/cudf/_lib/pylibcudf/lists.pyx
index ea469642dd5..b5661a3e634 100644
--- a/python/cudf/cudf/_lib/pylibcudf/lists.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/lists.pyx
@@ -23,8 +23,12 @@ from cudf._lib.pylibcudf.libcudf.lists.count_elements cimport (
 from cudf._lib.pylibcudf.libcudf.lists.extract cimport (
     extract_list_element as cpp_extract_list_element,
 )
+from cudf._lib.pylibcudf.libcudf.lists.sorting cimport (
+    sort_lists as cpp_sort_lists,
+    stable_sort_lists as cpp_stable_sort_lists,
+)
 from cudf._lib.pylibcudf.libcudf.table.table cimport table
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from cudf._lib.pylibcudf.libcudf.types cimport null_order, order, size_type
 from cudf._lib.pylibcudf.lists cimport ColumnOrScalar, ColumnOrSizeType
 
 from .column cimport Column, ListColumnView
@@ -320,3 +324,54 @@ cpdef Column count_elements(Column input):
         c_result = move(cpp_count_elements(list_view.view()))
 
     return Column.from_libcudf(move(c_result))
+
+
+cpdef Column sort_lists(
+    Column input,
+    bool ascending,
+    null_order na_position,
+    bool stable = False
+):
+    """Sort the elements within a list in each row of a list column.
+
+    For details, see :cpp:func:`sort_lists`.
+
+    Parameters
+    ----------
+    input : Column
+        The input column.
+    ascending : bool
+        If true, the sort order is ascending. Otherwise, the sort order is descending.
+    na_position : NullOrder
+        If na_position equals NullOrder.FIRST, then the null values in the output
+        column are placed first. Otherwise, they are be placed after.
+    stable: bool
+        If true :cpp:func:`stable_sort_lists` is used, Otherwise,
+        :cpp:func:`sort_lists` is used.
+
+    Returns
+    -------
+    Column
+        A new Column with elements in each list sorted.
+    """
+    cdef unique_ptr[column] c_result
+    cdef ListColumnView list_view = input.list_view()
+
+    cdef order c_sort_order = (
+        order.ASCENDING if ascending else order.DESCENDING
+    )
+
+    with nogil:
+        if stable:
+            c_result = move(cpp_stable_sort_lists(
+                    list_view.view(),
+                    c_sort_order,
+                    na_position,
+            ))
+        else:
+            c_result = move(cpp_sort_lists(
+                    list_view.view(),
+                    c_sort_order,
+                    na_position,
+            ))
+    return Column.from_libcudf(move(c_result))
diff --git a/python/cudf/cudf/pylibcudf_tests/test_lists.py b/python/cudf/cudf/pylibcudf_tests/test_lists.py
index 7cfed884f90..87472f6d59b 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_lists.py
+++ b/python/cudf/cudf/pylibcudf_tests/test_lists.py
@@ -22,6 +22,11 @@ def column():
     return pa.array([3, 2, 5, 6]), pa.array([-1, 0, 0, 0], type=pa.int32())
 
 
+@pytest.fixture
+def lists_column():
+    return [[4, 2, 3, 1], [1, 2, None, 4], [-10, 10, 10, 0]]
+
+
 def test_concatenate_rows(test_data):
     arrow_tbl = pa.Table.from_arrays(test_data[0], names=["a", "b"])
     plc_tbl = plc.interop.from_arrow(arrow_tbl)
@@ -191,3 +196,44 @@ def test_count_elements(test_data):
     expect = pa.array([1, 1, 0, 3], type=pa.int32())
 
     assert_column_eq(expect, res)
+
+
+@pytest.mark.parametrize(
+    "ascending,na_position,expected",
+    [
+        (
+            True,
+            plc.types.NullOrder.BEFORE,
+            [[1, 2, 3, 4], [None, 1, 2, 4], [-10, 0, 10, 10]],
+        ),
+        (
+            True,
+            plc.types.NullOrder.AFTER,
+            [[1, 2, 3, 4], [1, 2, 4, None], [-10, 0, 10, 10]],
+        ),
+        (
+            False,
+            plc.types.NullOrder.BEFORE,
+            [[4, 3, 2, 1], [4, 2, 1, None], [10, 10, 0, -10]],
+        ),
+        (
+            False,
+            plc.types.NullOrder.AFTER,
+            [[4, 3, 2, 1], [None, 4, 2, 1], [10, 10, 0, -10]],
+        ),
+        (
+            False,
+            plc.types.NullOrder.AFTER,
+            [[4, 3, 2, 1], [None, 4, 2, 1], [10, 10, 0, -10]],
+        ),
+    ],
+)
+def test_sort_lists(lists_column, ascending, na_position, expected):
+    plc_column = plc.interop.from_arrow(pa.array(lists_column))
+    res = plc.lists.sort_lists(plc_column, ascending, na_position, False)
+    res_stable = plc.lists.sort_lists(plc_column, ascending, na_position, True)
+
+    expect = pa.array(expected)
+
+    assert_column_eq(expect, res)
+    assert_column_eq(expect, res_stable)

From 81e65ee312af5133ca2b98d52efaeb29c274a825 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Mon, 22 Jul 2024 15:18:40 -0500
Subject: [PATCH 566/842] Fix docstring of `DataFrame.apply` (#16351)

This PR fixes docstring of `DataFrame.apply`

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/16351
---
 python/cudf/cudf/core/dataframe.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 288bdfd39b3..1d7136e61e3 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -4525,7 +4525,6 @@ def apply(
             If False, the funcs will be passed the whole Series at once.
 
             Currently not supported.
-
         engine : {'python', 'numba'}, default 'python'
             Unused. Added for compatibility with pandas.
         engine_kwargs : dict

From 0cac2a9d68341a38721be16132ead14cf4a0d70b Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar <shruti.shivakumar@gmail.com>
Date: Mon, 22 Jul 2024 14:18:21 -0700
Subject: [PATCH 567/842] Remove size constraints on source files in batched
 JSON reading (#16162)

Addresses https://github.com/rapidsai/cudf/issues/16138
The batched multi-source JSON reader fails when the size of any of the input source buffers exceeds `INT_MAX` bytes.
The goal of this PR is to remove this constraint by modifying the batching behavior of the reader.  Instead of constructing batches that include entire source files, the batches are now constructed at the granularity of byte ranges of size at most `INT_MAX` bytes,

Authors:
  - Shruti Shivakumar (https://github.com/shrshi)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Karthikeyan (https://github.com/karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/16162
---
 cpp/include/cudf/io/json.hpp                  |   4 +-
 cpp/src/io/json/read_json.cu                  | 139 +++++++++---------
 cpp/src/io/json/read_json.hpp                 |  18 ++-
 cpp/tests/CMakeLists.txt                      |  14 +-
 .../json_chunked_reader.cu}                   |  81 ++--------
 .../json_quote_normalization_test.cpp         |   0
 cpp/tests/io/{ => json}/json_test.cpp         |   0
 cpp/tests/io/{ => json}/json_tree.cpp         |   0
 .../io/{ => json}/json_type_cast_test.cu      |   0
 cpp/tests/io/json/json_utils.cuh              | 105 +++++++++++++
 .../json_whitespace_normalization_test.cu     |   0
 cpp/tests/io/{ => json}/json_writer.cpp       |   0
 cpp/tests/io/{ => json}/nested_json_test.cpp  |   0
 .../{json_tests.cpp => json_tests.cu}         |  45 +++++-
 14 files changed, 242 insertions(+), 164 deletions(-)
 rename cpp/tests/io/{json_chunked_reader.cpp => json/json_chunked_reader.cu} (64%)
 rename cpp/tests/io/{ => json}/json_quote_normalization_test.cpp (100%)
 rename cpp/tests/io/{ => json}/json_test.cpp (100%)
 rename cpp/tests/io/{ => json}/json_tree.cpp (100%)
 rename cpp/tests/io/{ => json}/json_type_cast_test.cu (100%)
 create mode 100644 cpp/tests/io/json/json_utils.cuh
 rename cpp/tests/io/{ => json}/json_whitespace_normalization_test.cu (100%)
 rename cpp/tests/io/{ => json}/json_writer.cpp (100%)
 rename cpp/tests/io/{ => json}/nested_json_test.cpp (100%)
 rename cpp/tests/large_strings/{json_tests.cpp => json_tests.cu} (50%)

diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
index 7af90766ad0..d47266fdd12 100644
--- a/cpp/include/cudf/io/json.hpp
+++ b/cpp/include/cudf/io/json.hpp
@@ -333,14 +333,14 @@ class json_reader_options {
    *
    * @param offset Number of bytes of offset
    */
-  void set_byte_range_offset(size_type offset) { _byte_range_offset = offset; }
+  void set_byte_range_offset(size_t offset) { _byte_range_offset = offset; }
 
   /**
    * @brief Set number of bytes to read.
    *
    * @param size Number of bytes to read
    */
-  void set_byte_range_size(size_type size) { _byte_range_size = size; }
+  void set_byte_range_size(size_t size) { _byte_range_size = size; }
 
   /**
    * @brief Set delimiter separating records in JSON lines
diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu
index 9cd39038348..0ba4dedfc34 100644
--- a/cpp/src/io/json/read_json.cu
+++ b/cpp/src/io/json/read_json.cu
@@ -148,20 +148,12 @@ device_span<char> ingest_raw_input(device_span<char> buffer,
   return buffer.first(uncomp_data.size());
 }
 
-size_type find_first_delimiter_in_chunk(host_span<std::unique_ptr<cudf::io::datasource>> sources,
-                                        json_reader_options const& reader_opts,
-                                        char const delimiter,
-                                        rmm::cuda_stream_view stream)
+size_t estimate_size_per_subchunk(size_t chunk_size)
 {
-  auto total_source_size = sources_size(sources, 0, 0) + (sources.size() - 1);
-  rmm::device_uvector<char> buffer(total_source_size, stream);
-  auto readbufspan = ingest_raw_input(buffer,
-                                      sources,
-                                      reader_opts.get_compression(),
-                                      reader_opts.get_byte_range_offset(),
-                                      reader_opts.get_byte_range_size(),
-                                      stream);
-  return find_first_delimiter(readbufspan, '\n', stream);
+  auto geometric_mean = [](double a, double b) { return std::sqrt(a * b); };
+  // NOTE: heuristic for choosing subchunk size: geometric mean of minimum subchunk size (set to
+  // 10kb) and the byte range size
+  return geometric_mean(std::ceil((double)chunk_size / num_subchunks), min_subchunk_size);
 }
 
 /**
@@ -183,7 +175,6 @@ datasource::owning_buffer<rmm::device_uvector<char>> get_record_range_raw_input(
   rmm::cuda_stream_view stream)
 {
   CUDF_FUNC_RANGE();
-  auto geometric_mean = [](double a, double b) { return std::sqrt(a * b); };
 
   size_t const total_source_size            = sources_size(sources, 0, 0);
   auto constexpr num_delimiter_chars        = 1;
@@ -198,17 +189,8 @@ datasource::owning_buffer<rmm::device_uvector<char>> get_record_range_raw_input(
   auto should_load_all_sources = !chunk_size || chunk_size >= total_source_size - chunk_offset;
   chunk_size = should_load_all_sources ? total_source_size - chunk_offset : chunk_size;
 
-  // Some magic numbers
-  constexpr int num_subchunks               = 10;  // per chunk_size
-  constexpr size_t min_subchunk_size        = 10000;
-  int const num_subchunks_prealloced        = should_load_all_sources ? 0 : 3;
-  constexpr int estimated_compression_ratio = 4;
-
-  // NOTE: heuristic for choosing subchunk size: geometric mean of minimum subchunk size (set to
-  // 10kb) and the byte range size
-
-  size_t const size_per_subchunk =
-    geometric_mean(std::ceil((double)chunk_size / num_subchunks), min_subchunk_size);
+  int const num_subchunks_prealloced = should_load_all_sources ? 0 : max_subchunks_prealloced;
+  size_t const size_per_subchunk     = estimate_size_per_subchunk(chunk_size);
 
   // The allocation for single source compressed input is estimated by assuming a ~4:1
   // compression ratio. For uncompressed inputs, we can getter a better estimate using the idea
@@ -308,67 +290,78 @@ table_with_metadata read_json(host_span<std::unique_ptr<datasource>> sources,
                  "Multiple inputs are supported only for JSON Lines format");
   }
 
-  std::for_each(sources.begin(), sources.end(), [](auto const& source) {
-    CUDF_EXPECTS(source->size() < std::numeric_limits<int>::max(),
-                 "The size of each source file must be less than INT_MAX bytes");
-  });
-
-  constexpr size_t batch_size_ub = std::numeric_limits<int>::max();
-  size_t const chunk_offset      = reader_opts.get_byte_range_offset();
+  /*
+   * The batched JSON reader enforces that the size of each batch is at most INT_MAX
+   * bytes (~2.14GB). Batches are defined to be byte range chunks - characterized by
+   * chunk offset and chunk size - that may span across multiple source files.
+   * Note that the batched reader does not work for compressed inputs or for regular
+   * JSON inputs.
+   */
+  size_t const total_source_size = sources_size(sources, 0, 0);
+  size_t chunk_offset            = reader_opts.get_byte_range_offset();
   size_t chunk_size              = reader_opts.get_byte_range_size();
-  chunk_size                     = !chunk_size ? sources_size(sources, 0, 0) : chunk_size;
-
-  // Identify the position of starting source file from which to begin batching based on
-  // byte range offset. If the offset is larger than the sum of all source
-  // sizes, then start_source is total number of source files i.e. no file is read
-  size_t const start_source = [&]() {
-    size_t sum = 0;
+  chunk_size                     = !chunk_size ? total_source_size - chunk_offset
+                                               : std::min(chunk_size, total_source_size - chunk_offset);
+
+  size_t const size_per_subchunk = estimate_size_per_subchunk(chunk_size);
+  size_t const batch_size_ub =
+    std::numeric_limits<int>::max() - (max_subchunks_prealloced * size_per_subchunk);
+
+  /*
+   * Identify the position (zero-indexed) of starting source file from which to begin
+   * batching based on byte range offset. If the offset is larger than the sum of all
+   * source sizes, then start_source is total number of source files i.e. no file is
+   * read
+   */
+
+  // Prefix sum of source file sizes
+  size_t pref_source_size = 0;
+  // Starting source file from which to being batching evaluated using byte range offset
+  size_t const start_source = [chunk_offset, &sources, &pref_source_size]() {
     for (size_t src_idx = 0; src_idx < sources.size(); ++src_idx) {
-      if (sum + sources[src_idx]->size() > chunk_offset) return src_idx;
-      sum += sources[src_idx]->size();
+      if (pref_source_size + sources[src_idx]->size() > chunk_offset) { return src_idx; }
+      pref_source_size += sources[src_idx]->size();
     }
     return sources.size();
   }();
-
-  // Construct batches of source files, with starting position of batches indicated by
-  // batch_positions. The size of each batch i.e. the sum of sizes of the source files in the batch
-  // is capped at INT_MAX bytes.
-  size_t cur_size = 0;
-  std::vector<size_t> batch_positions;
-  std::vector<size_t> batch_sizes;
-  batch_positions.push_back(0);
-  for (size_t i = start_source; i < sources.size(); i++) {
-    cur_size += sources[i]->size();
-    if (cur_size >= batch_size_ub) {
-      batch_positions.push_back(i);
-      batch_sizes.push_back(cur_size - sources[i]->size());
-      cur_size = sources[i]->size();
+  /*
+   * Construct batches of byte ranges spanning source files, with the starting position of batches
+   * indicated by `batch_offsets`. `pref_bytes_size` gives the bytes position from which the current
+   * batch begins, and `end_bytes_size` gives the terminal bytes position after which reading
+   * stops.
+   */
+  size_t pref_bytes_size = chunk_offset;
+  size_t end_bytes_size  = chunk_offset + chunk_size;
+  std::vector<size_t> batch_offsets{pref_bytes_size};
+  for (size_t i = start_source; i < sources.size() && pref_bytes_size < end_bytes_size;) {
+    pref_source_size += sources[i]->size();
+    // If the current source file can subsume multiple batches, we split the file until the
+    // boundary of the last batch exceeds the end of the file (indexed by `pref_source_size`)
+    while (pref_bytes_size < end_bytes_size &&
+           pref_source_size >= std::min(pref_bytes_size + batch_size_ub, end_bytes_size)) {
+      auto next_batch_size = std::min(batch_size_ub, end_bytes_size - pref_bytes_size);
+      batch_offsets.push_back(batch_offsets.back() + next_batch_size);
+      pref_bytes_size += next_batch_size;
     }
+    i++;
   }
-  batch_positions.push_back(sources.size());
-  batch_sizes.push_back(cur_size);
-
-  // If there is a single batch, then we can directly return the table without the
-  // unnecessary concatenate
-  if (batch_sizes.size() == 1) return read_batch(sources, reader_opts, stream, mr);
+  /*
+   * If there is a single batch, then we can directly return the table without the
+   * unnecessary concatenate. The size of batch_offsets is 1 if all sources are empty,
+   * or if end_bytes_size is larger than total_source_size.
+   */
+  if (batch_offsets.size() <= 2) return read_batch(sources, reader_opts, stream, mr);
 
   std::vector<cudf::io::table_with_metadata> partial_tables;
   json_reader_options batched_reader_opts{reader_opts};
-
   // Dispatch individual batches to read_batch and push the resulting table into
   // partial_tables array. Note that the reader options need to be updated for each
   // batch to adjust byte range offset and byte range size.
-  for (size_t i = 0; i < batch_sizes.size(); i++) {
-    batched_reader_opts.set_byte_range_size(std::min(batch_sizes[i], chunk_size));
-    partial_tables.emplace_back(read_batch(
-      host_span<std::unique_ptr<datasource>>(sources.begin() + batch_positions[i],
-                                             batch_positions[i + 1] - batch_positions[i]),
-      batched_reader_opts,
-      stream,
-      rmm::mr::get_current_device_resource()));
-    if (chunk_size <= batch_sizes[i]) break;
-    chunk_size -= batch_sizes[i];
-    batched_reader_opts.set_byte_range_offset(0);
+  for (size_t i = 0; i < batch_offsets.size() - 1; i++) {
+    batched_reader_opts.set_byte_range_offset(batch_offsets[i]);
+    batched_reader_opts.set_byte_range_size(batch_offsets[i + 1] - batch_offsets[i]);
+    partial_tables.emplace_back(
+      read_batch(sources, batched_reader_opts, stream, rmm::mr::get_current_device_resource()));
   }
 
   auto expects_schema_equality =
diff --git a/cpp/src/io/json/read_json.hpp b/cpp/src/io/json/read_json.hpp
index 0c30b4cad46..ff69f9b7627 100644
--- a/cpp/src/io/json/read_json.hpp
+++ b/cpp/src/io/json/read_json.hpp
@@ -29,6 +29,19 @@
 
 namespace cudf::io::json::detail {
 
+// Some magic numbers
+constexpr int num_subchunks               = 10;  // per chunk_size
+constexpr size_t min_subchunk_size        = 10000;
+constexpr int estimated_compression_ratio = 4;
+constexpr int max_subchunks_prealloced    = 3;
+
+device_span<char> ingest_raw_input(device_span<char> buffer,
+                                   host_span<std::unique_ptr<datasource>> sources,
+                                   compression_type compression,
+                                   size_t range_offset,
+                                   size_t range_size,
+                                   rmm::cuda_stream_view stream);
+
 table_with_metadata read_json(host_span<std::unique_ptr<datasource>> sources,
                               json_reader_options const& reader_opts,
                               rmm::cuda_stream_view stream,
@@ -38,9 +51,4 @@ size_type find_first_delimiter(device_span<char const> d_data,
                                char const delimiter,
                                rmm::cuda_stream_view stream);
 
-size_type find_first_delimiter_in_chunk(host_span<std::unique_ptr<cudf::io::datasource>> sources,
-                                        json_reader_options const& reader_opts,
-                                        char const delimiter,
-                                        rmm::cuda_stream_view stream);
-
 }  // namespace cudf::io::json::detail
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 8e2017ccb97..05e9759632f 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -313,17 +313,17 @@ ConfigureTest(
   PERCENT 30
 )
 ConfigureTest(
-  JSON_TEST io/json_test.cpp io/json_chunked_reader.cpp
+  JSON_TEST io/json/json_test.cpp io/json/json_chunked_reader.cu
   GPUS 1
   PERCENT 30
 )
-ConfigureTest(JSON_WRITER_TEST io/json_writer.cpp)
-ConfigureTest(JSON_TYPE_CAST_TEST io/json_type_cast_test.cu)
-ConfigureTest(NESTED_JSON_TEST io/nested_json_test.cpp io/json_tree.cpp)
+ConfigureTest(JSON_WRITER_TEST io/json/json_writer.cpp)
+ConfigureTest(JSON_TYPE_CAST_TEST io/json/json_type_cast_test.cu)
+ConfigureTest(NESTED_JSON_TEST io/json/nested_json_test.cpp io/json/json_tree.cpp)
 ConfigureTest(ARROW_IO_SOURCE_TEST io/arrow_io_source_test.cpp)
 ConfigureTest(MULTIBYTE_SPLIT_TEST io/text/multibyte_split_test.cpp)
-ConfigureTest(JSON_QUOTE_NORMALIZATION io/json_quote_normalization_test.cpp)
-ConfigureTest(JSON_WHITESPACE_NORMALIZATION io/json_whitespace_normalization_test.cu)
+ConfigureTest(JSON_QUOTE_NORMALIZATION io/json/json_quote_normalization_test.cpp)
+ConfigureTest(JSON_WHITESPACE_NORMALIZATION io/json/json_whitespace_normalization_test.cu)
 ConfigureTest(
   DATA_CHUNK_SOURCE_TEST io/text/data_chunk_source_test.cpp
   GPUS 1
@@ -572,7 +572,7 @@ ConfigureTest(
   LARGE_STRINGS_TEST
   large_strings/concatenate_tests.cpp
   large_strings/case_tests.cpp
-  large_strings/json_tests.cpp
+  large_strings/json_tests.cu
   large_strings/large_strings_fixture.cpp
   large_strings/merge_tests.cpp
   large_strings/parquet_tests.cpp
diff --git a/cpp/tests/io/json_chunked_reader.cpp b/cpp/tests/io/json/json_chunked_reader.cu
similarity index 64%
rename from cpp/tests/io/json_chunked_reader.cpp
rename to cpp/tests/io/json/json_chunked_reader.cu
index 23d54f7263c..b9dee54752c 100644
--- a/cpp/tests/io/json_chunked_reader.cpp
+++ b/cpp/tests/io/json/json_chunked_reader.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "io/json/read_json.hpp"
+#include "json_utils.cuh"
 
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
@@ -37,65 +37,6 @@ cudf::test::TempDirTestEnvironment* const temp_env =
   static_cast<cudf::test::TempDirTestEnvironment*>(
     ::testing::AddGlobalTestEnvironment(new cudf::test::TempDirTestEnvironment));
 
-// function to extract first delimiter in the string in each chunk,
-// collate together and form byte_range for each chunk,
-// parse separately.
-std::vector<cudf::io::table_with_metadata> skeleton_for_parellel_chunk_reader(
-  cudf::host_span<std::unique_ptr<cudf::io::datasource>> sources,
-  cudf::io::json_reader_options const& reader_opts,
-  int32_t chunk_size,
-  rmm::cuda_stream_view stream,
-  rmm::device_async_resource_ref mr)
-{
-  using namespace cudf::io::json::detail;
-  using cudf::size_type;
-  size_t total_source_size = 0;
-  for (auto const& source : sources) {
-    total_source_size += source->size();
-  }
-  size_t num_chunks                = (total_source_size + chunk_size - 1) / chunk_size;
-  constexpr size_type no_min_value = -1;
-
-  // Get the first delimiter in each chunk.
-  std::vector<size_type> first_delimiter_index(num_chunks);
-  auto reader_opts_chunk = reader_opts;
-  for (size_t i = 0; i < num_chunks; i++) {
-    auto const chunk_start = i * chunk_size;
-    reader_opts_chunk.set_byte_range_offset(chunk_start);
-    reader_opts_chunk.set_byte_range_size(chunk_size);
-    first_delimiter_index[i] =
-      find_first_delimiter_in_chunk(sources, reader_opts_chunk, '\n', stream);
-    if (first_delimiter_index[i] != no_min_value) { first_delimiter_index[i] += chunk_start; }
-  }
-
-  // Process and allocate record start, end for each worker.
-  using record_range = std::pair<size_type, size_type>;
-  std::vector<record_range> record_ranges;
-  record_ranges.reserve(num_chunks);
-  first_delimiter_index[0] = 0;
-  auto prev                = first_delimiter_index[0];
-  for (size_t i = 1; i < num_chunks; i++) {
-    if (first_delimiter_index[i] == no_min_value) continue;
-    record_ranges.emplace_back(prev, first_delimiter_index[i]);
-    prev = first_delimiter_index[i];
-  }
-  record_ranges.emplace_back(prev, total_source_size);
-
-  std::vector<cudf::io::table_with_metadata> tables;
-  // Process each chunk in parallel.
-  for (auto const& [chunk_start, chunk_end] : record_ranges) {
-    if (chunk_start == -1 or chunk_end == -1 or
-        static_cast<size_t>(chunk_start) >= total_source_size)
-      continue;
-    reader_opts_chunk.set_byte_range_offset(chunk_start);
-    reader_opts_chunk.set_byte_range_size(chunk_end - chunk_start);
-    tables.push_back(read_json(sources, reader_opts_chunk, stream, mr));
-  }
-  // assume all records have same number of columns, and inferred same type. (or schema is passed)
-  // TODO a step before to merge all columns, types and infer final schema.
-  return tables;
-}
-
 TEST_F(JsonReaderTest, ByteRange_SingleSource)
 {
   std::string const json_string = R"(
@@ -118,11 +59,11 @@ TEST_F(JsonReaderTest, ByteRange_SingleSource)
 
   // Test for different chunk sizes
   for (auto chunk_size : {7, 10, 15, 20, 40, 50, 100, 200, 500}) {
-    auto const tables = skeleton_for_parellel_chunk_reader(datasources,
-                                                           json_lines_options,
-                                                           chunk_size,
-                                                           cudf::get_default_stream(),
-                                                           rmm::mr::get_current_device_resource());
+    auto const tables = split_byte_range_reading(datasources,
+                                                 json_lines_options,
+                                                 chunk_size,
+                                                 cudf::get_default_stream(),
+                                                 rmm::mr::get_current_device_resource());
 
     auto table_views = std::vector<cudf::table_view>(tables.size());
     std::transform(tables.begin(), tables.end(), table_views.begin(), [](auto& table) {
@@ -213,11 +154,11 @@ TEST_F(JsonReaderTest, ByteRange_MultiSource)
 
   // Test for different chunk sizes
   for (auto chunk_size : {7, 10, 15, 20, 40, 50, 100, 200, 500, 1000, 2000}) {
-    auto const tables = skeleton_for_parellel_chunk_reader(datasources,
-                                                           json_lines_options,
-                                                           chunk_size,
-                                                           cudf::get_default_stream(),
-                                                           rmm::mr::get_current_device_resource());
+    auto const tables = split_byte_range_reading(datasources,
+                                                 json_lines_options,
+                                                 chunk_size,
+                                                 cudf::get_default_stream(),
+                                                 rmm::mr::get_current_device_resource());
 
     auto table_views = std::vector<cudf::table_view>(tables.size());
     std::transform(tables.begin(), tables.end(), table_views.begin(), [](auto& table) {
diff --git a/cpp/tests/io/json_quote_normalization_test.cpp b/cpp/tests/io/json/json_quote_normalization_test.cpp
similarity index 100%
rename from cpp/tests/io/json_quote_normalization_test.cpp
rename to cpp/tests/io/json/json_quote_normalization_test.cpp
diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json/json_test.cpp
similarity index 100%
rename from cpp/tests/io/json_test.cpp
rename to cpp/tests/io/json/json_test.cpp
diff --git a/cpp/tests/io/json_tree.cpp b/cpp/tests/io/json/json_tree.cpp
similarity index 100%
rename from cpp/tests/io/json_tree.cpp
rename to cpp/tests/io/json/json_tree.cpp
diff --git a/cpp/tests/io/json_type_cast_test.cu b/cpp/tests/io/json/json_type_cast_test.cu
similarity index 100%
rename from cpp/tests/io/json_type_cast_test.cu
rename to cpp/tests/io/json/json_type_cast_test.cu
diff --git a/cpp/tests/io/json/json_utils.cuh b/cpp/tests/io/json/json_utils.cuh
new file mode 100644
index 00000000000..9383797d91b
--- /dev/null
+++ b/cpp/tests/io/json/json_utils.cuh
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "io/json/read_json.hpp"
+
+#include <cudf/io/datasource.hpp>
+#include <cudf/io/json.hpp>
+#include <cudf/types.hpp>
+#include <cudf/utilities/span.hpp>
+
+#include <rmm/exec_policy.hpp>
+
+#include <numeric>
+
+// Helper function to test correctness of JSON byte range reading.
+// We split the input source files into a set of byte range chunks each of size
+// `chunk_size` and return an array of partial tables constructed from each chunk
+template <typename IndexType = std::int32_t>
+std::vector<cudf::io::table_with_metadata> split_byte_range_reading(
+  cudf::host_span<std::unique_ptr<cudf::io::datasource>> sources,
+  cudf::io::json_reader_options const& reader_opts,
+  IndexType chunk_size,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr)
+{
+  auto total_source_size = [&sources]() {
+    return std::accumulate(sources.begin(), sources.end(), 0ul, [=](size_t sum, auto& source) {
+      auto const size = source->size();
+      return sum + size;
+    });
+  }();
+  auto find_first_delimiter_in_chunk =
+    [total_source_size, &sources, &stream](
+      cudf::io::json_reader_options const& reader_opts) -> IndexType {
+    rmm::device_uvector<char> buffer(total_source_size, stream);
+    auto readbufspan = cudf::io::json::detail::ingest_raw_input(buffer,
+                                                                sources,
+                                                                reader_opts.get_compression(),
+                                                                reader_opts.get_byte_range_offset(),
+                                                                reader_opts.get_byte_range_size(),
+                                                                stream);
+    // Note: we cannot reuse cudf::io::json::detail::find_first_delimiter since the
+    // return type of that function is size_type. However, when the chunk_size is
+    // larger than INT_MAX, the position of the delimiter can also be larger than
+    // INT_MAX. We do not encounter this overflow error in the detail function
+    // since the batched JSON reader splits the byte_range_size into chunk_sizes
+    // smaller than INT_MAX bytes
+    auto const first_delimiter_position_it =
+      thrust::find(rmm::exec_policy(stream), readbufspan.begin(), readbufspan.end(), '\n');
+    return first_delimiter_position_it != readbufspan.end()
+             ? thrust::distance(readbufspan.begin(), first_delimiter_position_it)
+             : -1;
+  };
+  size_t num_chunks                = (total_source_size + chunk_size - 1) / chunk_size;
+  constexpr IndexType no_min_value = -1;
+
+  // Get the first delimiter in each chunk.
+  std::vector<IndexType> first_delimiter_index(num_chunks);
+  auto reader_opts_chunk = reader_opts;
+  for (size_t i = 0; i < num_chunks; i++) {
+    auto const chunk_start = i * chunk_size;
+    // We are updating reader_opt_chunks to store offset and size information for the current chunk
+    reader_opts_chunk.set_byte_range_offset(chunk_start);
+    reader_opts_chunk.set_byte_range_size(chunk_size);
+    first_delimiter_index[i] = find_first_delimiter_in_chunk(reader_opts_chunk);
+  }
+
+  // Process and allocate record start, end for each worker.
+  using record_range = std::pair<size_t, size_t>;
+  std::vector<record_range> record_ranges;
+  record_ranges.reserve(num_chunks);
+  size_t prev = 0;
+  for (size_t i = 1; i < num_chunks; i++) {
+    // In the case where chunk_size is smaller than row size, the chunk needs to be skipped
+    if (first_delimiter_index[i] == no_min_value) continue;
+    size_t next = static_cast<size_t>(first_delimiter_index[i]) + (i * chunk_size);
+    record_ranges.emplace_back(prev, next);
+    prev = next;
+  }
+  record_ranges.emplace_back(prev, total_source_size);
+
+  std::vector<cudf::io::table_with_metadata> tables;
+  for (auto const& [chunk_start, chunk_end] : record_ranges) {
+    reader_opts_chunk.set_byte_range_offset(chunk_start);
+    reader_opts_chunk.set_byte_range_size(chunk_end - chunk_start);
+    tables.push_back(cudf::io::json::detail::read_json(sources, reader_opts_chunk, stream, mr));
+  }
+  // assume all records have same number of columns, and inferred same type. (or schema is passed)
+  // TODO a step before to merge all columns, types and infer final schema.
+  return tables;
+}
diff --git a/cpp/tests/io/json_whitespace_normalization_test.cu b/cpp/tests/io/json/json_whitespace_normalization_test.cu
similarity index 100%
rename from cpp/tests/io/json_whitespace_normalization_test.cu
rename to cpp/tests/io/json/json_whitespace_normalization_test.cu
diff --git a/cpp/tests/io/json_writer.cpp b/cpp/tests/io/json/json_writer.cpp
similarity index 100%
rename from cpp/tests/io/json_writer.cpp
rename to cpp/tests/io/json/json_writer.cpp
diff --git a/cpp/tests/io/nested_json_test.cpp b/cpp/tests/io/json/nested_json_test.cpp
similarity index 100%
rename from cpp/tests/io/nested_json_test.cpp
rename to cpp/tests/io/json/nested_json_test.cpp
diff --git a/cpp/tests/large_strings/json_tests.cpp b/cpp/tests/large_strings/json_tests.cu
similarity index 50%
rename from cpp/tests/large_strings/json_tests.cpp
rename to cpp/tests/large_strings/json_tests.cu
index bf16d131ba7..49abf7b484d 100644
--- a/cpp/tests/large_strings/json_tests.cpp
+++ b/cpp/tests/large_strings/json_tests.cu
@@ -14,8 +14,13 @@
  * limitations under the License.
  */
 
+#include "../io/json/json_utils.cuh"
 #include "large_strings_fixture.hpp"
 
+#include <cudf_test/table_utilities.hpp>
+
+#include <cudf/concatenate.hpp>
+#include <cudf/io/datasource.hpp>
 #include <cudf/io/json.hpp>
 #include <cudf/utilities/span.hpp>
 
@@ -28,31 +33,57 @@ TEST_F(JsonLargeReaderTest, MultiBatch)
     { "a": { "y" : 6}, "b" : [4, 5   ], "c": 12 }
     { "a": { "y" : 6}, "b" : [6      ], "c": 13 }
     { "a": { "y" : 6}, "b" : [7      ], "c": 14 })";
-  constexpr size_t expected_file_size = std::numeric_limits<int>::max() / 2;
+  constexpr size_t batch_size_ub      = std::numeric_limits<int>::max();
+  constexpr size_t expected_file_size = 1.5 * static_cast<double>(batch_size_ub);
   std::size_t const log_repetitions =
     static_cast<std::size_t>(std::ceil(std::log2(expected_file_size / json_string.size())));
 
   json_string.reserve(json_string.size() * (1UL << log_repetitions));
-  std::size_t numrows = 4;
   for (std::size_t i = 0; i < log_repetitions; i++) {
     json_string += json_string;
-    numrows <<= 1;
   }
 
   constexpr int num_sources = 2;
-  std::vector<cudf::host_span<char>> hostbufs(
-    num_sources, cudf::host_span<char>(json_string.data(), json_string.size()));
+  std::vector<cudf::host_span<std::byte>> hostbufs(
+    num_sources,
+    cudf::host_span<std::byte>(reinterpret_cast<std::byte*>(json_string.data()),
+                               json_string.size()));
 
   // Initialize parsing options (reading json lines)
   cudf::io::json_reader_options json_lines_options =
     cudf::io::json_reader_options::builder(
       cudf::io::source_info{
-        cudf::host_span<cudf::host_span<char>>(hostbufs.data(), hostbufs.size())})
+        cudf::host_span<cudf::host_span<std::byte>>(hostbufs.data(), hostbufs.size())})
       .lines(true)
       .compression(cudf::io::compression_type::NONE)
       .recovery_mode(cudf::io::json_recovery_mode_t::FAIL);
 
   // Read full test data via existing, nested JSON lines reader
   cudf::io::table_with_metadata current_reader_table = cudf::io::read_json(json_lines_options);
-  ASSERT_EQ(current_reader_table.tbl->num_rows(), numrows * num_sources);
+
+  std::vector<std::unique_ptr<cudf::io::datasource>> datasources;
+  for (auto& hb : hostbufs) {
+    datasources.emplace_back(cudf::io::datasource::create(hb));
+  }
+  // Test for different chunk sizes
+  std::vector<size_t> chunk_sizes{
+    batch_size_ub / 4, batch_size_ub / 2, batch_size_ub, static_cast<size_t>(batch_size_ub * 2)};
+  for (auto chunk_size : chunk_sizes) {
+    auto const tables =
+      split_byte_range_reading<std::int64_t>(datasources,
+                                             json_lines_options,
+                                             chunk_size,
+                                             cudf::get_default_stream(),
+                                             rmm::mr::get_current_device_resource());
+
+    auto table_views = std::vector<cudf::table_view>(tables.size());
+    std::transform(tables.begin(), tables.end(), table_views.begin(), [](auto& table) {
+      return table.tbl->view();
+    });
+    auto result = cudf::concatenate(table_views);
+
+    // Verify that the data read via chunked reader matches the data read via nested JSON reader
+    // cannot use EQUAL due to concatenate removing null mask
+    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(current_reader_table.tbl->view(), result->view());
+  }
 }

From c7b28ceeb46d2b921e30f081a9ed97745c91ff9e Mon Sep 17 00:00:00 2001
From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com>
Date: Tue, 23 Jul 2024 05:28:13 -0500
Subject: [PATCH 568/842] Add `drop_nulls` in `cudf-polars` (#16290)

Closes https://github.com/rapidsai/cudf/issues/16219

Authors:
  - https://github.com/brandon-b-miller

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/16290
---
 python/cudf_polars/cudf_polars/dsl/expr.py  | 30 +++++++++-
 python/cudf_polars/tests/test_drop_nulls.py | 65 +++++++++++++++++++++
 2 files changed, 94 insertions(+), 1 deletion(-)
 create mode 100644 python/cudf_polars/tests/test_drop_nulls.py

diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py
index a034d55120a..8322d6bd6fb 100644
--- a/python/cudf_polars/cudf_polars/dsl/expr.py
+++ b/python/cudf_polars/cudf_polars/dsl/expr.py
@@ -882,7 +882,14 @@ def __init__(
         self.name = name
         self.options = options
         self.children = children
-        if self.name not in ("mask_nans", "round", "setsorted", "unique"):
+        if self.name not in (
+            "mask_nans",
+            "round",
+            "setsorted",
+            "unique",
+            "dropnull",
+            "fill_null",
+        ):
             raise NotImplementedError(f"Unary function {name=}")
 
     def do_evaluate(
@@ -968,6 +975,27 @@ def do_evaluate(
                 order=order,
                 null_order=null_order,
             )
+        elif self.name == "dropnull":
+            (column,) = (
+                child.evaluate(df, context=context, mapping=mapping)
+                for child in self.children
+            )
+            return Column(
+                plc.stream_compaction.drop_nulls(
+                    plc.Table([column.obj]), [0], 1
+                ).columns()[0]
+            )
+        elif self.name == "fill_null":
+            column = self.children[0].evaluate(df, context=context, mapping=mapping)
+            if isinstance(self.children[1], Literal):
+                arg = plc.interop.from_arrow(self.children[1].value)
+            else:
+                evaluated = self.children[1].evaluate(
+                    df, context=context, mapping=mapping
+                )
+                arg = evaluated.obj_scalar if evaluated.is_scalar else evaluated.obj
+            return Column(plc.replace.replace_nulls(column.obj, arg))
+
         raise NotImplementedError(
             f"Unimplemented unary function {self.name=}"
         )  # pragma: no cover; init trips first
diff --git a/python/cudf_polars/tests/test_drop_nulls.py b/python/cudf_polars/tests/test_drop_nulls.py
new file mode 100644
index 00000000000..5dfe9f66a97
--- /dev/null
+++ b/python/cudf_polars/tests/test_drop_nulls.py
@@ -0,0 +1,65 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import pytest
+
+import polars as pl
+
+from cudf_polars.testing.asserts import (
+    assert_gpu_result_equal,
+    assert_ir_translation_raises,
+)
+
+
+@pytest.fixture(
+    params=[
+        [1, 2, 1, 3, 5, None, None],
+        [1.5, 2.5, None, 1.5, 3, float("nan"), 3],
+        [],
+        [None, None],
+        [1, 2, 3, 4, 5],
+    ]
+)
+def null_data(request):
+    is_empty = pl.Series(request.param).dtype == pl.Null
+    return pl.DataFrame(
+        {
+            "a": pl.Series(request.param, dtype=pl.Float64 if is_empty else None),
+            "b": pl.Series(request.param, dtype=pl.Float64 if is_empty else None),
+        }
+    ).lazy()
+
+
+def test_drop_null(null_data):
+    q = null_data.select(pl.col("a").drop_nulls())
+    assert_gpu_result_equal(q)
+
+
+@pytest.mark.parametrize(
+    "value",
+    [0, pl.col("a").mean(), pl.col("b")],
+    ids=["scalar", "aggregation", "column_expression"],
+)
+def test_fill_null(null_data, value):
+    q = null_data.select(pl.col("a").fill_null(value))
+    assert_gpu_result_equal(q)
+
+
+@pytest.mark.parametrize(
+    "strategy", ["forward", "backward", "min", "max", "mean", "zero", "one"]
+)
+def test_fill_null_with_strategy(null_data, strategy):
+    q = null_data.select(pl.col("a").fill_null(strategy=strategy))
+
+    # Not yet exposed to python from rust
+    assert_ir_translation_raises(q, NotImplementedError)
+
+
+@pytest.mark.parametrize("strategy", ["forward", "backward"])
+@pytest.mark.parametrize("limit", [0, 1, 2])
+def test_fill_null_with_limit(null_data, strategy, limit):
+    q = null_data.select(pl.col("a").fill_null(strategy=strategy, limit=limit))
+
+    # Not yet exposed to python from rust
+    assert_ir_translation_raises(q, NotImplementedError)

From e6d412cba7c23df7ee500c28257ed9281cea49b9 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com>
Date: Tue, 23 Jul 2024 06:03:28 -0500
Subject: [PATCH 569/842] Fall back when casting a timestamp to numeric in
 cudf-polars (#16232)

This PR adds logic that falls back to CPU when a cudf-polars query would cast a timestamp column to a numeric type, an unsupported operation in libcudf, which should fix a few polars tests. It could be cleaned up a bit with some of the utilities that will be added in https://github.com/rapidsai/cudf/pull/16150.

Authors:
  - https://github.com/brandon-b-miller

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/16232
---
 python/cudf_polars/cudf_polars/dsl/expr.py    |  4 ++
 .../tests/expressions/test_casting.py         | 52 +++++++++++++++++++
 2 files changed, 56 insertions(+)
 create mode 100644 python/cudf_polars/tests/expressions/test_casting.py

diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py
index 8322d6bd6fb..9835e6f8461 100644
--- a/python/cudf_polars/cudf_polars/dsl/expr.py
+++ b/python/cudf_polars/cudf_polars/dsl/expr.py
@@ -1188,6 +1188,10 @@ class Cast(Expr):
     def __init__(self, dtype: plc.DataType, value: Expr) -> None:
         super().__init__(dtype)
         self.children = (value,)
+        if not plc.unary.is_supported_cast(self.dtype, value.dtype):
+            raise NotImplementedError(
+                f"Can't cast {self.dtype.id().name} to {value.dtype.id().name}"
+            )
 
     def do_evaluate(
         self,
diff --git a/python/cudf_polars/tests/expressions/test_casting.py b/python/cudf_polars/tests/expressions/test_casting.py
new file mode 100644
index 00000000000..3e003054338
--- /dev/null
+++ b/python/cudf_polars/tests/expressions/test_casting.py
@@ -0,0 +1,52 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import pytest
+
+import polars as pl
+
+from cudf_polars.testing.asserts import (
+    assert_gpu_result_equal,
+    assert_ir_translation_raises,
+)
+
+_supported_dtypes = [(pl.Int8(), pl.Int64())]
+
+_unsupported_dtypes = [
+    (pl.String(), pl.Int64()),
+]
+
+
+@pytest.fixture
+def dtypes(request):
+    return request.param
+
+
+@pytest.fixture
+def tests(dtypes):
+    fromtype, totype = dtypes
+    if fromtype == pl.String():
+        data = ["a", "b", "c"]
+    else:
+        data = [1, 2, 3]
+    return pl.DataFrame(
+        {
+            "a": pl.Series(data, dtype=fromtype),
+        }
+    ).lazy(), totype
+
+
+@pytest.mark.parametrize("dtypes", _supported_dtypes, indirect=True)
+def test_cast_supported(tests):
+    df, totype = tests
+    q = df.select(pl.col("a").cast(totype))
+    assert_gpu_result_equal(q)
+
+
+@pytest.mark.parametrize("dtypes", _unsupported_dtypes, indirect=True)
+def test_cast_unsupported(tests):
+    df, totype = tests
+    assert_ir_translation_raises(
+        df.select(pl.col("a").cast(totype)), NotImplementedError
+    )

From ff30c0211109e14b1f6918fcc6c2e2b98f863a1f Mon Sep 17 00:00:00 2001
From: Nghia Truong <7416935+ttnghia@users.noreply.github.com>
Date: Tue, 23 Jul 2024 12:03:55 -0700
Subject: [PATCH 570/842] Fix compile warnings with `jni_utils.hpp` (#16336)

This fixes the compiler warnings with `jni_utils.hpp`, removing some `const` qualifiers that are redundant.

Closes https://github.com/rapidsai/cudf/issues/16335.

Authors:
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - Jason Lowe (https://github.com/jlowe)

URL: https://github.com/rapidsai/cudf/pull/16336
---
 java/src/main/native/include/jni_utils.hpp | 20 ++++++++------------
 1 file changed, 8 insertions(+), 12 deletions(-)

diff --git a/java/src/main/native/include/jni_utils.hpp b/java/src/main/native/include/jni_utils.hpp
index ea04c1cda83..a3b4bfcb63e 100644
--- a/java/src/main/native/include/jni_utils.hpp
+++ b/java/src/main/native/include/jni_utils.hpp
@@ -284,7 +284,7 @@ class native_jArray {
     return data()[index];
   }
 
-  const N_TYPE* const data() const
+  N_TYPE const* data() const
   {
     init_data_ptr();
     return data_ptr;
@@ -296,17 +296,15 @@ class native_jArray {
     return data_ptr;
   }
 
-  const N_TYPE* const begin() const { return data(); }
+  N_TYPE const* begin() const { return data(); }
 
   N_TYPE* begin() { return data(); }
 
-  const N_TYPE* const end() const { return data() + size(); }
+  N_TYPE const* end() const { return data() + size(); }
 
   N_TYPE* end() { return data() + size(); }
 
-  const J_ARRAY_TYPE get_jArray() const { return orig; }
-
-  J_ARRAY_TYPE get_jArray() { return orig; }
+  J_ARRAY_TYPE get_jArray() const { return orig; }
 
   /**
    * @brief Conversion to std::vector
@@ -430,9 +428,7 @@ class native_jpointerArray {
   T* const* begin() const { return data(); }
   T* const* end() const { return data() + size(); }
 
-  const jlongArray get_jArray() const { return wrapped.get_jArray(); }
-
-  jlongArray get_jArray() { return wrapped.get_jArray(); }
+  jlongArray get_jArray() const { return wrapped.get_jArray(); }
 
   void assert_no_nulls() const
   {
@@ -624,7 +620,7 @@ class native_jstring {
     return true;
   }
 
-  const jstring get_jstring() const { return orig; }
+  jstring get_jstring() const { return orig; }
 
   ~native_jstring()
   {
@@ -753,13 +749,13 @@ class native_jstringArray {
     return cache[index];
   }
 
-  char const** const as_c_array() const
+  char const** as_c_array() const
   {
     init_c_cache();
     return c_cache.data();
   }
 
-  const std::vector<std::string> as_cpp_vector() const
+  std::vector<std::string> as_cpp_vector() const
   {
     init_cpp_cache();
     return cpp_cache;

From cd711913d2312ba158e34f5c03784a7b07f1583a Mon Sep 17 00:00:00 2001
From: Elias Stehle <3958403+elstehle@users.noreply.github.com>
Date: Wed, 24 Jul 2024 00:24:19 +0200
Subject: [PATCH 571/842] Adds write-coalescing code path optimization to FST
 (#16143)

This PR adds an optimized code path to the finite-state transducer (FST) that will use a shared memory-backed write buffer for the translated output and translated output indexes, if the the write buffer does not require allocating excessive amounts of shared memory (i.e., current heuristic is 24 KB/CTA). Writes are first buffered in shared memory and then collaboratively written out using coalesced writes to global memory.

## Benchmark results

Numbers are for libcudf's FST_NVBENCH for a 1.073 GB input. FST outputs one token per input symbol. Benchmarks run on V100 with 900 GB/s theoretical peak BW.
We compare the current FST implementation (old) to an FST implementaation that uses write-coalescing to gmem (new).

|                  | OLD throughput  (GB/s) | NEW throughput  (GB/s) | relative performance |   | 1st kernel, per byte: bytes read/written | 2nd kernel, per byte: bytes read/written | expected SOL (GB/s) | achieved SOL (old) | achieved SOL (new) |
|------------------|------------------------|------------------------|----------------------|---|------------------------------------------|------------------------------------------|---------------------|--------------------|--------------------|
| full             |                   15.7 |                  74.74 |                 476% |   |                                        1 |                                        6 |              102.86 |             15.26% |             72.66% |
| no out-indexes   |                 39.123 |                  105.8 |                 270% |   |                                        1 |                                        2 |              240.00 |             16.30% |             44.08% |
| no-output        |                 229.27 |                 178.92 |                  78% |   |                                        1 |                                        1 |              360.00 |             63.69% |             49.70% |
| out-indexes-only |                  24.95 |                   85.2 |                 341% |   |                                        1 |                                        5 |              120.00 |             20.79% |             71.00% |

Authors:
  - Elias Stehle (https://github.com/elstehle)

Approvers:
  - Shruti Shivakumar (https://github.com/shrshi)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: https://github.com/rapidsai/cudf/pull/16143
---
 cpp/benchmarks/io/fst.cu              |  16 +-
 cpp/src/io/fst/agent_dfa.cuh          | 371 ++++++++++++++++++++++----
 cpp/src/io/fst/dispatch_dfa.cuh       |   7 +-
 cpp/src/io/fst/lookup_tables.cuh      |  70 +++--
 cpp/src/io/json/json_normalization.cu |  26 +-
 cpp/src/io/json/nested_json_gpu.cu    |  25 +-
 cpp/tests/io/fst/common.hpp           |   4 +-
 cpp/tests/io/fst/fst_test.cu          |   4 +-
 8 files changed, 425 insertions(+), 98 deletions(-)

diff --git a/cpp/benchmarks/io/fst.cu b/cpp/benchmarks/io/fst.cu
index ad19bdfdfcb..31f1bf8e70f 100644
--- a/cpp/benchmarks/io/fst.cu
+++ b/cpp/benchmarks/io/fst.cu
@@ -95,7 +95,9 @@ void BM_FST_JSON(nvbench::state& state)
   auto parser = cudf::io::fst::detail::make_fst(
     cudf::io::fst::detail::make_symbol_group_lut(pda_sgs),
     cudf::io::fst::detail::make_transition_table(pda_state_tt),
-    cudf::io::fst::detail::make_translation_table<max_translation_table_size>(pda_out_tt),
+    cudf::io::fst::detail::make_translation_table<max_translation_table_size,
+                                                  min_translated_out,
+                                                  max_translated_out>(pda_out_tt),
     stream);
 
   state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
@@ -134,7 +136,9 @@ void BM_FST_JSON_no_outidx(nvbench::state& state)
   auto parser = cudf::io::fst::detail::make_fst(
     cudf::io::fst::detail::make_symbol_group_lut(pda_sgs),
     cudf::io::fst::detail::make_transition_table(pda_state_tt),
-    cudf::io::fst::detail::make_translation_table<max_translation_table_size>(pda_out_tt),
+    cudf::io::fst::detail::make_translation_table<max_translation_table_size,
+                                                  min_translated_out,
+                                                  max_translated_out>(pda_out_tt),
     stream);
 
   state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
@@ -171,7 +175,9 @@ void BM_FST_JSON_no_out(nvbench::state& state)
   auto parser = cudf::io::fst::detail::make_fst(
     cudf::io::fst::detail::make_symbol_group_lut(pda_sgs),
     cudf::io::fst::detail::make_transition_table(pda_state_tt),
-    cudf::io::fst::detail::make_translation_table<max_translation_table_size>(pda_out_tt),
+    cudf::io::fst::detail::make_translation_table<max_translation_table_size,
+                                                  min_translated_out,
+                                                  max_translated_out>(pda_out_tt),
     stream);
 
   state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
@@ -209,7 +215,9 @@ void BM_FST_JSON_no_str(nvbench::state& state)
   auto parser = cudf::io::fst::detail::make_fst(
     cudf::io::fst::detail::make_symbol_group_lut(pda_sgs),
     cudf::io::fst::detail::make_transition_table(pda_state_tt),
-    cudf::io::fst::detail::make_translation_table<max_translation_table_size>(pda_out_tt),
+    cudf::io::fst::detail::make_translation_table<max_translation_table_size,
+                                                  min_translated_out,
+                                                  max_translated_out>(pda_out_tt),
     stream);
 
   state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
diff --git a/cpp/src/io/fst/agent_dfa.cuh b/cpp/src/io/fst/agent_dfa.cuh
index 2171764decd..bc5b94e2718 100644
--- a/cpp/src/io/fst/agent_dfa.cuh
+++ b/cpp/src/io/fst/agent_dfa.cuh
@@ -18,7 +18,9 @@
 #include "in_reg_array.cuh"
 
 #include <cub/cub.cuh>
+#include <cuda/std/type_traits>
 #include <thrust/execution_policy.h>
+#include <thrust/iterator/discard_iterator.h>
 #include <thrust/sequence.h>
 
 namespace cudf::io::fst::detail {
@@ -44,9 +46,10 @@ using StateIndexT = uint32_t;
 template <int32_t NUM_ITEMS>
 struct VectorCompositeOp {
   template <typename VectorT>
-  __host__ __device__ __forceinline__ VectorT operator()(VectorT const& lhs, VectorT const& rhs)
+  __device__ __forceinline__ VectorT operator()(VectorT const& lhs, VectorT const& rhs)
   {
     VectorT res{};
+#pragma unroll
     for (int32_t i = 0; i < NUM_ITEMS; ++i) {
       res.Set(i, rhs.Get(lhs.Get(i)));
     }
@@ -57,61 +60,275 @@ struct VectorCompositeOp {
 /**
  * @brief A class whose ReadSymbol member function is invoked for each symbol being read from the
  * input tape. The wrapper class looks up whether a state transition caused by a symbol is supposed
- * to emit any output symbol (the "transduced" output) and, if so, keeps track of how many symbols
- * it intends to write out and writing out such symbols to the given output iterators.
+ * to emit any output symbol (the "transduced" output) and, if so, keeps track of *how many* symbols
+ * it intends to write out.
+ */
+template <typename TransducerTableT>
+class DFACountCallbackWrapper {
+ public:
+  __device__ __forceinline__ DFACountCallbackWrapper(TransducerTableT transducer_table)
+    : transducer_table(transducer_table)
+  {
+  }
+
+  template <typename OffsetT>
+  __device__ __forceinline__ void Init(OffsetT const&)
+  {
+    out_count = 0;
+  }
+
+  template <typename CharIndexT, typename StateIndexT, typename SymbolIndexT, typename SymbolT>
+  __device__ __forceinline__ void ReadSymbol(CharIndexT const character_index,
+                                             StateIndexT const old_state,
+                                             StateIndexT const new_state,
+                                             SymbolIndexT const symbol_id,
+                                             SymbolT const read_symbol)
+  {
+    uint32_t const count = transducer_table(old_state, symbol_id, read_symbol);
+    out_count += count;
+  }
+
+  __device__ __forceinline__ void TearDown() {}
+  TransducerTableT const transducer_table;
+  uint32_t out_count{};
+};
+
+/**
+ * @brief A class whose ReadSymbol member function is invoked for each symbol being read from the
+ * input tape. The wrapper class looks up whether a state transition caused by a symbol is supposed
+ * to emit any output symbol (the "transduced" output) and, if so, writes out such symbols to the
+ * given output iterators.
  *
+ * @tparam MaxTranslatedOutChars The maximum number of symbols that are written on a any given state
+ * transition
  * @tparam TransducerTableT The type implementing a transducer table that can be used for looking up
  * the symbols that are supposed to be emitted on a given state transition.
- * @tparam TransducedOutItT A Random-access output iterator type to which symbols returned by the
+ * @tparam TransducedOutItT A random-access output iterator type to which symbols returned by the
  * transducer table are assignable.
- * @tparam TransducedIndexOutItT A Random-access output iterator type to which indexes are written.
+ * @tparam TransducedIndexOutItT A random-access output iterator type to which indexes are written.
  */
-template <typename TransducerTableT, typename TransducedOutItT, typename TransducedIndexOutItT>
-class DFASimulationCallbackWrapper {
+template <int MaxTranslatedOutChars,
+          typename TransducerTableT,
+          typename TransducedOutItT,
+          typename TransducedIndexOutItT>
+class DFAWriteCallbackWrapper {
  public:
-  __host__ __device__ __forceinline__ DFASimulationCallbackWrapper(
-    TransducerTableT transducer_table, TransducedOutItT out_it, TransducedIndexOutItT out_idx_it)
-    : transducer_table(transducer_table), out_it(out_it), out_idx_it(out_idx_it), write(false)
+  __device__ __forceinline__ DFAWriteCallbackWrapper(TransducerTableT transducer_table,
+                                                     TransducedOutItT out_it,
+                                                     TransducedIndexOutItT out_idx_it,
+                                                     uint32_t out_offset,
+                                                     uint32_t /*tile_out_offset*/,
+                                                     uint32_t /*tile_in_offset*/,
+                                                     uint32_t /*tile_out_count*/)
+    : transducer_table(transducer_table),
+      out_it(out_it),
+      out_idx_it(out_idx_it),
+      out_offset(out_offset)
   {
   }
 
   template <typename OffsetT>
-  __host__ __device__ __forceinline__ void Init(OffsetT const& offset)
+  __device__ __forceinline__ void Init(OffsetT const& in_offset)
+  {
+    this->in_offset = in_offset;
+  }
+
+  template <typename CharIndexT,
+            typename StateIndexT,
+            typename SymbolIndexT,
+            typename SymbolT,
+            int MaxTranslatedOutChars_>
+  __device__ __forceinline__
+    typename ::cuda::std::enable_if<(MaxTranslatedOutChars_ <= 2), void>::type
+    ReadSymbol(CharIndexT const character_index,
+               StateIndexT const old_state,
+               StateIndexT const new_state,
+               SymbolIndexT const symbol_id,
+               SymbolT const read_symbol,
+               cub::Int2Type<MaxTranslatedOutChars_> /*MaxTranslatedOutChars*/)
+  {
+    uint32_t const count = transducer_table(old_state, symbol_id, read_symbol);
+
+#pragma unroll
+    for (uint32_t out_char = 0; out_char < MaxTranslatedOutChars_; out_char++) {
+      if (out_char < count) {
+        out_it[out_offset + out_char] =
+          transducer_table(old_state, symbol_id, out_char, read_symbol);
+        out_idx_it[out_offset + out_char] = in_offset + character_index;
+      }
+    }
+    out_offset += count;
+  }
+
+  template <typename CharIndexT,
+            typename StateIndexT,
+            typename SymbolIndexT,
+            typename SymbolT,
+            int MaxTranslatedOutChars_>
+  __device__ __forceinline__
+    typename ::cuda::std::enable_if<(MaxTranslatedOutChars_ > 2), void>::type
+    ReadSymbol(CharIndexT const character_index,
+               StateIndexT const old_state,
+               StateIndexT const new_state,
+               SymbolIndexT const symbol_id,
+               SymbolT const read_symbol,
+               cub::Int2Type<MaxTranslatedOutChars_>)
   {
-    this->offset = offset;
-    if (!write) out_count = 0;
+    uint32_t const count = transducer_table(old_state, symbol_id, read_symbol);
+
+    for (uint32_t out_char = 0; out_char < count; out_char++) {
+      out_it[out_offset + out_char] = transducer_table(old_state, symbol_id, out_char, read_symbol);
+      out_idx_it[out_offset + out_char] = in_offset + character_index;
+    }
+    out_offset += count;
   }
 
   template <typename CharIndexT, typename StateIndexT, typename SymbolIndexT, typename SymbolT>
-  __host__ __device__ __forceinline__ void ReadSymbol(CharIndexT const character_index,
-                                                      StateIndexT const old_state,
-                                                      StateIndexT const new_state,
-                                                      SymbolIndexT const symbol_id,
-                                                      SymbolT const read_symbol)
+  __device__ __forceinline__ void ReadSymbol(CharIndexT const character_index,
+                                             StateIndexT const old_state,
+                                             StateIndexT const new_state,
+                                             SymbolIndexT const symbol_id,
+                                             SymbolT const read_symbol)
+  {
+    ReadSymbol(character_index,
+               old_state,
+               new_state,
+               symbol_id,
+               read_symbol,
+               cub::Int2Type<MaxTranslatedOutChars>{});
+  }
+
+  __device__ __forceinline__ void TearDown() {}
+
+ public:
+  TransducerTableT const transducer_table;
+  TransducedOutItT out_it;
+  TransducedIndexOutItT out_idx_it;
+  uint32_t out_offset;
+  uint32_t in_offset;
+};
+
+/**
+ * @brief A class whose ReadSymbol member function is invoked for each symbol being read from the
+ * input tape. The wrapper class looks up whether a state transition caused by a symbol is supposed
+ * to emit any output symbol (the "transduced" output) and, if so, writes out such symbols to the
+ * given output iterators. This class uses a shared memory-backed write buffer to coalesce writes to
+ * global memory.
+ *
+ * @tparam DiscardIndexOutput Whether to discard the indexes instead of writing them to the given
+ * output iterator
+ * @tparam DiscardTranslatedOutput Whether to discard the translated output symbols instead of
+ * writing them to the given output iterator
+ * @tparam NumWriteBufferItems The number of items to allocate in shared memory for the write
+ * buffer.
+ * @tparam OutputT The type of the translated items
+ * @tparam TransducerTableT The type implementing a transducer table that can be used for looking up
+ * the symbols that are supposed to be emitted on a given state transition.
+ * @tparam TransducedOutItT A random-access output iterator type to which symbols returned by the
+ * transducer table are assignable.
+ * @tparam TransducedIndexOutItT A random-access output iterator type to which indexes are written.
+ */
+template <bool DiscardIndexOutput,
+          bool DiscardTranslatedOutput,
+          int NumWriteBufferItems,
+          typename OutputT,
+          typename TransducerTableT,
+          typename TransducedOutItT,
+          typename TransducedIndexOutItT>
+class WriteCoalescingCallbackWrapper {
+  struct TempStorage_Offsets {
+    uint16_t compacted_offset[NumWriteBufferItems];
+  };
+  struct TempStorage_Symbols {
+    OutputT compacted_symbols[NumWriteBufferItems];
+  };
+  using offset_cache_t =
+    ::cuda::std::conditional_t<DiscardIndexOutput, cub::NullType, TempStorage_Offsets>;
+  using symbol_cache_t = ::cuda::std::
+    conditional_t<DiscardTranslatedOutput, cub::Uninitialized<cub::NullType>, TempStorage_Symbols>;
+  struct TempStorage_ : offset_cache_t, symbol_cache_t {};
+
+  __device__ __forceinline__ TempStorage_& PrivateStorage()
+  {
+    __shared__ TempStorage private_storage;
+    return private_storage.Alias();
+  }
+  TempStorage_& temp_storage;
+
+ public:
+  struct TempStorage : cub::Uninitialized<TempStorage_> {};
+
+  __device__ __forceinline__ WriteCoalescingCallbackWrapper(TransducerTableT transducer_table,
+                                                            TransducedOutItT out_it,
+                                                            TransducedIndexOutItT out_idx_it,
+                                                            uint32_t thread_out_offset,
+                                                            uint32_t tile_out_offset,
+                                                            uint32_t tile_in_offset,
+                                                            uint32_t tile_out_count)
+    : temp_storage(PrivateStorage()),
+      transducer_table(transducer_table),
+      out_it(out_it),
+      out_idx_it(out_idx_it),
+      thread_out_offset(thread_out_offset),
+      tile_out_offset(tile_out_offset),
+      tile_in_offset(tile_in_offset),
+      tile_out_count(tile_out_count)
+  {
+  }
+
+  template <typename OffsetT>
+  __device__ __forceinline__ void Init(OffsetT const& offset)
+  {
+    this->in_offset = offset;
+  }
+
+  template <typename CharIndexT, typename StateIndexT, typename SymbolIndexT, typename SymbolT>
+  __device__ __forceinline__ void ReadSymbol(CharIndexT const character_index,
+                                             StateIndexT const old_state,
+                                             StateIndexT const new_state,
+                                             SymbolIndexT const symbol_id,
+                                             SymbolT const read_symbol)
   {
     uint32_t const count = transducer_table(old_state, symbol_id, read_symbol);
-    if (write) {
-#if defined(__CUDA_ARCH__)
-#pragma unroll 1
-#endif
-      for (uint32_t out_char = 0; out_char < count; out_char++) {
-        out_it[out_count + out_char] =
+    for (uint32_t out_char = 0; out_char < count; out_char++) {
+      if constexpr (!DiscardIndexOutput) {
+        temp_storage.compacted_offset[thread_out_offset + out_char - tile_out_offset] =
+          in_offset + character_index - tile_in_offset;
+      }
+      if constexpr (!DiscardTranslatedOutput) {
+        temp_storage.compacted_symbols[thread_out_offset + out_char - tile_out_offset] =
           transducer_table(old_state, symbol_id, out_char, read_symbol);
-        out_idx_it[out_count + out_char] = offset + character_index;
       }
     }
-    out_count += count;
+    thread_out_offset += count;
   }
 
-  __host__ __device__ __forceinline__ void TearDown() {}
+  __device__ __forceinline__ void TearDown()
+  {
+    __syncthreads();
+    if constexpr (!DiscardTranslatedOutput) {
+      for (uint32_t out_char = threadIdx.x; out_char < tile_out_count; out_char += blockDim.x) {
+        out_it[tile_out_offset + out_char] = temp_storage.compacted_symbols[out_char];
+      }
+    }
+    if constexpr (!DiscardIndexOutput) {
+      for (uint32_t out_char = threadIdx.x; out_char < tile_out_count; out_char += blockDim.x) {
+        out_idx_it[tile_out_offset + out_char] =
+          temp_storage.compacted_offset[out_char] + tile_in_offset;
+      }
+    }
+    __syncthreads();
+  }
 
  public:
   TransducerTableT const transducer_table;
   TransducedOutItT out_it;
   TransducedIndexOutItT out_idx_it;
-  uint32_t out_count;
-  uint32_t offset;
-  bool write;
+  uint32_t thread_out_offset;
+  uint32_t tile_out_offset;
+  uint32_t tile_in_offset;
+  uint32_t in_offset;
+  uint32_t tile_out_count;
 };
 
 /**
@@ -125,17 +342,18 @@ class DFASimulationCallbackWrapper {
 template <int32_t NUM_INSTANCES, typename TransitionTableT>
 class StateVectorTransitionOp {
  public:
-  __host__ __device__ __forceinline__ StateVectorTransitionOp(
+  __device__ __forceinline__ StateVectorTransitionOp(
     TransitionTableT const& transition_table, std::array<StateIndexT, NUM_INSTANCES>& state_vector)
     : transition_table(transition_table), state_vector(state_vector)
   {
   }
 
   template <typename CharIndexT, typename SymbolIndexT, typename SymbolT>
-  __host__ __device__ __forceinline__ void ReadSymbol(CharIndexT const& character_index,
-                                                      SymbolIndexT const& read_symbol_id,
-                                                      SymbolT const& read_symbol) const
+  __device__ __forceinline__ void ReadSymbol(CharIndexT const& character_index,
+                                             SymbolIndexT const& read_symbol_id,
+                                             SymbolT const& read_symbol) const
   {
+#pragma unroll
     for (int32_t i = 0; i < NUM_INSTANCES; ++i) {
       state_vector[i] = transition_table(state_vector[i], read_symbol_id);
     }
@@ -152,17 +370,17 @@ struct StateTransitionOp {
   TransitionTableT const& transition_table;
   CallbackOpT& callback_op;
 
-  __host__ __device__ __forceinline__ StateTransitionOp(TransitionTableT const& transition_table,
-                                                        StateIndexT state,
-                                                        CallbackOpT& callback_op)
+  __device__ __forceinline__ StateTransitionOp(TransitionTableT const& transition_table,
+                                               StateIndexT state,
+                                               CallbackOpT& callback_op)
     : transition_table(transition_table), state(state), callback_op(callback_op)
   {
   }
 
   template <typename CharIndexT, typename SymbolIndexT, typename SymbolT>
-  __host__ __device__ __forceinline__ void ReadSymbol(CharIndexT const& character_index,
-                                                      SymbolIndexT const& read_symbol_id,
-                                                      SymbolT const& read_symbol)
+  __device__ __forceinline__ void ReadSymbol(CharIndexT const& character_index,
+                                             SymbolIndexT const& read_symbol_id,
+                                             SymbolT const& read_symbol)
   {
     // Remember what state we were in before we made the transition
     StateIndexT previous_state = state;
@@ -420,7 +638,7 @@ struct AgentDFA {
     __syncthreads();
 
     // Thread's symbols
-    CharT* t_chars = &temp_storage.chars[threadIdx.x * SYMBOLS_PER_THREAD];
+    CharT const* t_chars = &temp_storage.chars[threadIdx.x * SYMBOLS_PER_THREAD];
 
     // Parse thread's symbols and transition the state-vector
     if (is_full_block) {
@@ -538,6 +756,43 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) CUDF_KERNEL
   // The state transition vector passed on to the second stage of the algorithm
   StateVectorT out_state_vector;
 
+  using OutSymbolT = typename DfaT::OutSymbolT;
+  // static constexpr int32_t MIN_TRANSLATED_OUT = DfaT::MIN_TRANSLATED_OUT;
+  static constexpr int32_t num_max_translated_out = DfaT::MAX_TRANSLATED_OUT;
+  static constexpr bool discard_out_index =
+    ::cuda::std::is_same<TransducedIndexOutItT, thrust::discard_iterator<>>::value;
+  static constexpr bool discard_out_it =
+    ::cuda::std::is_same<TransducedOutItT, thrust::discard_iterator<>>::value;
+  using NonWriteCoalescingT =
+    DFAWriteCallbackWrapper<num_max_translated_out,
+                            decltype(dfa.InitTranslationTable(transducer_table_storage)),
+                            TransducedOutItT,
+                            TransducedIndexOutItT>;
+
+  using WriteCoalescingT =
+    WriteCoalescingCallbackWrapper<discard_out_index,
+                                   discard_out_it,
+                                   num_max_translated_out * SYMBOLS_PER_BLOCK,
+                                   OutSymbolT,
+                                   decltype(dfa.InitTranslationTable(transducer_table_storage)),
+                                   TransducedOutItT,
+                                   TransducedIndexOutItT>;
+
+  static constexpr bool is_translation_pass = (!IS_TRANS_VECTOR_PASS) || IS_SINGLE_PASS;
+
+  // Use write-coalescing only if the worst-case output size per tile fits into shared memory
+  static constexpr bool can_use_smem_cache =
+    (sizeof(typename WriteCoalescingT::TempStorage) + sizeof(typename AgentDfaSimT::TempStorage) +
+     sizeof(typename DfaT::SymbolGroupStorageT) + sizeof(typename DfaT::TransitionTableStorageT) +
+     sizeof(typename DfaT::TranslationTableStorageT)) < (48 * 1024);
+  static constexpr bool use_smem_cache =
+    is_translation_pass and
+    (sizeof(typename WriteCoalescingT::TempStorage) <= AgentDFAPolicy::SMEM_THRESHOLD) and
+    can_use_smem_cache;
+
+  using DFASimulationCallbackWrapperT =
+    typename cub::If<use_smem_cache, WriteCoalescingT, NonWriteCoalescingT>::Type;
+
   // Stage 1: Compute the state-transition vector
   if (IS_TRANS_VECTOR_PASS || IS_SINGLE_PASS) {
     // Keeping track of the state for each of the <NUM_STATES> state machines
@@ -576,7 +831,7 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) CUDF_KERNEL
     // -> first block/tile: write out block aggregate as the "tile's" inclusive (i.e., the one that
     // incorporates all preceding blocks/tiles results)
     //------------------------------------------------------------------------------
-    if (IS_SINGLE_PASS) {
+    if constexpr (IS_SINGLE_PASS) {
       uint32_t tile_idx             = blockIdx.x;
       using StateVectorCompositeOpT = VectorCompositeOp<NUM_STATES>;
 
@@ -623,10 +878,7 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) CUDF_KERNEL
     }
 
     // Perform finite-state machine simulation, computing size of transduced output
-    DFASimulationCallbackWrapper<decltype(dfa.InitTranslationTable(transducer_table_storage)),
-                                 TransducedOutItT,
-                                 TransducedIndexOutItT>
-      callback_wrapper(transducer_table, transduced_out_it, transduced_out_idx_it);
+    DFACountCallbackWrapper count_chars_callback_op{transducer_table};
 
     StateIndexT t_start_state = state;
     agent_dfa.GetThreadStateTransitions(symbol_matcher,
@@ -635,7 +887,7 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) CUDF_KERNEL
                                         blockIdx.x * SYMBOLS_PER_BLOCK,
                                         num_chars,
                                         state,
-                                        callback_wrapper,
+                                        count_chars_callback_op,
                                         cub::Int2Type<IS_SINGLE_PASS>());
 
     __syncthreads();
@@ -650,15 +902,18 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) CUDF_KERNEL
     __shared__ typename OffsetPrefixScanCallbackOpT_::TempStorage prefix_callback_temp_storage;
 
     uint32_t tile_idx = blockIdx.x;
+    uint32_t tile_out_offset{};
+    uint32_t tile_out_count{};
+    uint32_t thread_out_offset{};
     if (tile_idx == 0) {
       OffsetT block_aggregate = 0;
       OutOffsetBlockScan(scan_temp_storage)
-        .ExclusiveScan(callback_wrapper.out_count,
-                       callback_wrapper.out_count,
+        .ExclusiveScan(count_chars_callback_op.out_count,
+                       thread_out_offset,
                        static_cast<OffsetT>(0),
                        cub::Sum{},
                        block_aggregate);
-
+      tile_out_count = block_aggregate;
       if (threadIdx.x == 0 /*and not IS_LAST_TILE*/) {
         offset_tile_state.SetInclusive(0, block_aggregate);
       }
@@ -671,22 +926,28 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) CUDF_KERNEL
         offset_tile_state, prefix_callback_temp_storage, cub::Sum{}, tile_idx);
 
       OutOffsetBlockScan(scan_temp_storage)
-        .ExclusiveScan(
-          callback_wrapper.out_count, callback_wrapper.out_count, cub::Sum{}, prefix_op);
-
+        .ExclusiveScan(count_chars_callback_op.out_count, thread_out_offset, cub::Sum{}, prefix_op);
+      tile_out_offset = prefix_op.GetExclusivePrefix();
+      tile_out_count  = prefix_op.GetBlockAggregate();
       if (tile_idx == gridDim.x - 1 && threadIdx.x == 0) {
         *d_num_transduced_out_it = prefix_op.GetInclusivePrefix();
       }
     }
 
-    callback_wrapper.write = true;
+    DFASimulationCallbackWrapperT write_translated_callback_op{transducer_table,
+                                                               transduced_out_it,
+                                                               transduced_out_idx_it,
+                                                               thread_out_offset,
+                                                               tile_out_offset,
+                                                               blockIdx.x * SYMBOLS_PER_BLOCK,
+                                                               tile_out_count};
     agent_dfa.GetThreadStateTransitions(symbol_matcher,
                                         transition_table,
                                         d_chars,
                                         blockIdx.x * SYMBOLS_PER_BLOCK,
                                         num_chars,
                                         t_start_state,
-                                        callback_wrapper,
+                                        write_translated_callback_op,
                                         cub::Int2Type<true>());
   }
 }
diff --git a/cpp/src/io/fst/dispatch_dfa.cuh b/cpp/src/io/fst/dispatch_dfa.cuh
index be63ec6539f..ef5e9c8a78f 100644
--- a/cpp/src/io/fst/dispatch_dfa.cuh
+++ b/cpp/src/io/fst/dispatch_dfa.cuh
@@ -37,6 +37,11 @@ struct AgentDFAPolicy {
 
   // The number of symbols processed by each thread
   static constexpr int32_t ITEMS_PER_THREAD = _ITEMS_PER_THREAD;
+
+  // If the shared memory-backed write buffer exceeds this threshold, the FST will skip buffering
+  // the output in a write buffer and instead immediately write out to global memory, potentially
+  // resulting in non-coalesced writes
+  static constexpr std::size_t SMEM_THRESHOLD = 24 * 1024;
 };
 
 /**
@@ -49,7 +54,7 @@ struct DeviceFSMPolicy {
   struct Policy900 : cub::ChainedPolicy<900, Policy900, Policy900> {
     enum {
       BLOCK_THREADS    = 128,
-      ITEMS_PER_THREAD = 32,
+      ITEMS_PER_THREAD = 16,
     };
 
     using AgentDFAPolicy = AgentDFAPolicy<BLOCK_THREADS, ITEMS_PER_THREAD>;
diff --git a/cpp/src/io/fst/lookup_tables.cuh b/cpp/src/io/fst/lookup_tables.cuh
index 5532a7f994b..ae1f81fd541 100644
--- a/cpp/src/io/fst/lookup_tables.cuh
+++ b/cpp/src/io/fst/lookup_tables.cuh
@@ -367,18 +367,18 @@ class TransitionTable {
 
   template <typename StateIdT>
   static KernelParameter InitDeviceTransitionTable(
-    std::array<std::array<StateIdT, MAX_NUM_SYMBOLS>, MAX_NUM_STATES> const& translation_table)
+    std::array<std::array<StateIdT, MAX_NUM_SYMBOLS>, MAX_NUM_STATES> const& transition_table)
   {
     KernelParameter init_data{};
-    // translation_table[state][symbol] -> new state
-    for (std::size_t state = 0; state < translation_table.size(); ++state) {
-      for (std::size_t symbol = 0; symbol < translation_table[state].size(); ++symbol) {
+    // transition_table[state][symbol] -> new state
+    for (std::size_t state = 0; state < transition_table.size(); ++state) {
+      for (std::size_t symbol = 0; symbol < transition_table[state].size(); ++symbol) {
         CUDF_EXPECTS(
-          static_cast<int64_t>(translation_table[state][symbol]) <=
+          static_cast<int64_t>(transition_table[state][symbol]) <=
             std::numeric_limits<ItemT>::max(),
           "Target state index value exceeds value representable by the transition table's type");
         init_data.transitions[symbol * MAX_NUM_STATES + state] =
-          static_cast<ItemT>(translation_table[state][symbol]);
+          static_cast<ItemT>(transition_table[state][symbol]);
       }
     }
 
@@ -494,6 +494,10 @@ class dfa_device_view {
   // This is a value queried by the DFA simulation algorithm
   static constexpr int32_t MAX_NUM_STATES = NUM_STATES;
 
+  using OutSymbolT                            = typename TranslationTableT::OutSymbolT;
+  static constexpr int32_t MIN_TRANSLATED_OUT = TranslationTableT::MIN_TRANSLATED_OUT;
+  static constexpr int32_t MAX_TRANSLATED_OUT = TranslationTableT::MAX_TRANSLATED_OUT;
+
   using SymbolGroupStorageT      = std::conditional_t<is_complex_op<SymbolGroupIdLookupT>::value,
                                                  typename SymbolGroupIdLookupT::TempStorage,
                                                  typename cub::NullType>;
@@ -542,24 +546,33 @@ class dfa_device_view {
  * @tparam OutSymbolT The symbol type being output
  * @tparam OutSymbolOffsetT Type sufficiently large to index into the lookup table of output
  * symbols
- * @tparam MAX_NUM_SYMBOLS The maximum number of symbols being output by a single state transition
+ * @tparam MAX_NUM_SYMBOLS The maximum number of symbol groups supported by this lookup table
  * @tparam MAX_NUM_STATES The maximum number of states that this lookup table shall support
+ * @tparam MIN_TRANSLATED_OUT_ The minimum number of symbols being output by a single state
+ * transition
+ * @tparam MAX_TRANSLATED_OUT_ The maximum number of symbols being output by a single state
+ * transition
  * @tparam MAX_TABLE_SIZE The maximum number of items in the lookup table of output symbols
- * be used.
  */
-template <typename OutSymbolT,
+template <typename OutSymbolT_,
           typename OutSymbolOffsetT,
           int32_t MAX_NUM_SYMBOLS,
           int32_t MAX_NUM_STATES,
+          int32_t MIN_TRANSLATED_OUT_,
+          int32_t MAX_TRANSLATED_OUT_,
           int32_t MAX_TABLE_SIZE = (MAX_NUM_SYMBOLS * MAX_NUM_STATES)>
 class TransducerLookupTable {
  private:
   struct _TempStorage {
     OutSymbolOffsetT out_offset[MAX_NUM_STATES * MAX_NUM_SYMBOLS + 1];
-    OutSymbolT out_symbols[MAX_TABLE_SIZE];
+    OutSymbolT_ out_symbols[MAX_TABLE_SIZE];
   };
 
  public:
+  using OutSymbolT                            = OutSymbolT_;
+  static constexpr int32_t MIN_TRANSLATED_OUT = MIN_TRANSLATED_OUT_;
+  static constexpr int32_t MAX_TRANSLATED_OUT = MAX_TRANSLATED_OUT_;
+
   using TempStorage = cub::Uninitialized<_TempStorage>;
 
   struct KernelParameter {
@@ -567,6 +580,8 @@ class TransducerLookupTable {
                                                OutSymbolOffsetT,
                                                MAX_NUM_SYMBOLS,
                                                MAX_NUM_STATES,
+                                               MIN_TRANSLATED_OUT,
+                                               MAX_TRANSLATED_OUT,
                                                MAX_TABLE_SIZE>;
 
     OutSymbolOffsetT d_out_offsets[MAX_NUM_STATES * MAX_NUM_SYMBOLS + 1];
@@ -686,14 +701,19 @@ class TransducerLookupTable {
  * sequence of symbols that the finite-state transducer is supposed to output for each transition.
  *
  * @tparam MAX_TABLE_SIZE The maximum number of items in the lookup table of output symbols
- * be used
+ * @tparam MIN_TRANSLATED_OUT The minimum number of symbols being output by a single state
+ * transition
+ * @tparam MAX_TRANSLATED_OUT The maximum number of symbols being output by a single state
+ * transition
  * @tparam OutSymbolT The symbol type being output
- * @tparam MAX_NUM_SYMBOLS The maximum number of symbols being output by a single state transition
+ * @tparam MAX_NUM_SYMBOLS The maximum number of symbol groups supported by this lookup table
  * @tparam MAX_NUM_STATES The maximum number of states that this lookup table shall support
  * @param translation_table The translation table
  * @return A translation table of type `TransducerLookupTable`.
  */
 template <std::size_t MAX_TABLE_SIZE,
+          std::size_t MIN_TRANSLATED_OUT,
+          std::size_t MAX_TRANSLATED_OUT,
           typename OutSymbolT,
           std::size_t MAX_NUM_SYMBOLS,
           std::size_t MAX_NUM_STATES>
@@ -705,20 +725,30 @@ auto make_translation_table(std::array<std::array<std::vector<OutSymbolT>, MAX_N
                                                     OutSymbolOffsetT,
                                                     MAX_NUM_SYMBOLS,
                                                     MAX_NUM_STATES,
+                                                    MIN_TRANSLATED_OUT,
+                                                    MAX_TRANSLATED_OUT,
                                                     MAX_TABLE_SIZE>;
   return translation_table_t::InitDeviceTranslationTable(translation_table);
 }
 
-template <typename TranslationOpT>
+template <typename TranslationOpT,
+          typename OutSymbolT_,
+          std::int32_t MIN_TRANSLATED_OUT_,
+          std::int32_t MAX_TRANSLATED_OUT_>
 class TranslationOp {
  private:
   struct _TempStorage {};
 
  public:
+  using OutSymbolT                            = OutSymbolT_;
+  static constexpr int32_t MIN_TRANSLATED_OUT = MIN_TRANSLATED_OUT_;
+  static constexpr int32_t MAX_TRANSLATED_OUT = MAX_TRANSLATED_OUT_;
+
   using TempStorage = cub::Uninitialized<_TempStorage>;
 
   struct KernelParameter {
-    using LookupTableT = TranslationOp<TranslationOpT>;
+    using LookupTableT =
+      TranslationOp<TranslationOpT, OutSymbolT, MIN_TRANSLATED_OUT, MAX_TRANSLATED_OUT>;
     TranslationOpT translation_op;
   };
 
@@ -772,6 +802,10 @@ class TranslationOp {
  *
  * @tparam FunctorT A function object type that must implement two signatures: (1) with `(state_id,
  * match_id, read_symbol)` and (2) with `(state_id, match_id, relative_offset, read_symbol)`
+ * @tparam MIN_TRANSLATED_SYMBOLS The minimum number of translated output symbols for any given
+ * input symbol
+ * @tparam MAX_TRANSLATED_SYMBOLS The maximum number of translated output symbols for any given
+ * input symbol
  * @param map_op A function object that must implement two signatures: (1) with `(state_id,
  * match_id, read_symbol)` and (2) with `(state_id, match_id, relative_offset, read_symbol)`.
  * Invocations of the first signature, (1), must return the number of symbols that are emitted for
@@ -779,10 +813,14 @@ class TranslationOp {
  * that transition, where `i` corresponds to `relative_offse`
  * @return A translation table of type `TranslationO`
  */
-template <typename FunctorT>
+template <typename OutSymbolT,
+          std::size_t MIN_TRANSLATED_OUT,
+          std::size_t MAX_TRANSLATED_OUT,
+          typename FunctorT>
 auto make_translation_functor(FunctorT map_op)
 {
-  return TranslationOp<FunctorT>::InitDeviceTranslationTable(map_op);
+  return TranslationOp<FunctorT, OutSymbolT, MIN_TRANSLATED_OUT, MAX_TRANSLATED_OUT>::
+    InitDeviceTranslationTable(map_op);
 }
 
 /**
diff --git a/cpp/src/io/json/json_normalization.cu b/cpp/src/io/json/json_normalization.cu
index ca56a12eb36..760b2214365 100644
--- a/cpp/src/io/json/json_normalization.cu
+++ b/cpp/src/io/json/json_normalization.cu
@@ -302,11 +302,14 @@ void normalize_single_quotes(datasource::owning_buffer<rmm::device_uvector<Symbo
                              rmm::cuda_stream_view stream,
                              rmm::device_async_resource_ref mr)
 {
-  auto parser = fst::detail::make_fst(
-    fst::detail::make_symbol_group_lut(normalize_quotes::qna_sgs),
-    fst::detail::make_transition_table(normalize_quotes::qna_state_tt),
-    fst::detail::make_translation_functor(normalize_quotes::TransduceToNormalizedQuotes{}),
-    stream);
+  static constexpr std::int32_t min_out = 0;
+  static constexpr std::int32_t max_out = 2;
+  auto parser =
+    fst::detail::make_fst(fst::detail::make_symbol_group_lut(normalize_quotes::qna_sgs),
+                          fst::detail::make_transition_table(normalize_quotes::qna_state_tt),
+                          fst::detail::make_translation_functor<SymbolT, min_out, max_out>(
+                            normalize_quotes::TransduceToNormalizedQuotes{}),
+                          stream);
 
   rmm::device_uvector<SymbolT> outbuf(indata.size() * 2, stream, mr);
   rmm::device_scalar<SymbolOffsetT> outbuf_size(stream, mr);
@@ -327,11 +330,14 @@ void normalize_whitespace(datasource::owning_buffer<rmm::device_uvector<SymbolT>
                           rmm::cuda_stream_view stream,
                           rmm::device_async_resource_ref mr)
 {
-  auto parser = fst::detail::make_fst(
-    fst::detail::make_symbol_group_lut(normalize_whitespace::wna_sgs),
-    fst::detail::make_transition_table(normalize_whitespace::wna_state_tt),
-    fst::detail::make_translation_functor(normalize_whitespace::TransduceToNormalizedWS{}),
-    stream);
+  static constexpr std::int32_t min_out = 0;
+  static constexpr std::int32_t max_out = 2;
+  auto parser =
+    fst::detail::make_fst(fst::detail::make_symbol_group_lut(normalize_whitespace::wna_sgs),
+                          fst::detail::make_transition_table(normalize_whitespace::wna_state_tt),
+                          fst::detail::make_translation_functor<SymbolT, min_out, max_out>(
+                            normalize_whitespace::TransduceToNormalizedWS{}),
+                          stream);
 
   rmm::device_uvector<SymbolT> outbuf(indata.size(), stream, mr);
   rmm::device_scalar<SymbolOffsetT> outbuf_size(stream, mr);
diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu
index a007754ef4f..8decaf034f3 100644
--- a/cpp/src/io/json/nested_json_gpu.cu
+++ b/cpp/src/io/json/nested_json_gpu.cu
@@ -1455,11 +1455,14 @@ void get_stack_context(device_span<SymbolT const> json_in,
   constexpr auto max_translation_table_size =
     to_stack_op::NUM_SYMBOL_GROUPS * to_stack_op::TT_NUM_STATES;
 
-  auto json_to_stack_ops_fst = fst::detail::make_fst(
+  static constexpr auto min_translated_out = 0;
+  static constexpr auto max_translated_out = 1;
+  auto json_to_stack_ops_fst               = fst::detail::make_fst(
     fst::detail::make_symbol_group_lut(to_stack_op::get_sgid_lut(delimiter)),
     fst::detail::make_transition_table(to_stack_op::get_transition_table(stack_behavior)),
-    fst::detail::make_translation_table<max_translation_table_size>(
-      to_stack_op::get_translation_table(stack_behavior)),
+    fst::detail::
+      make_translation_table<max_translation_table_size, min_translated_out, max_translated_out>(
+        to_stack_op::get_translation_table(stack_behavior)),
     stream);
 
   // "Search" for relevant occurrence of brackets and braces that indicate the beginning/end
@@ -1507,11 +1510,12 @@ std::pair<rmm::device_uvector<PdaTokenT>, rmm::device_uvector<SymbolOffsetT>> pr
   // Instantiate FST for post-processing the token stream to remove all tokens that belong to an
   // invalid JSON line
   token_filter::UnwrapTokenFromSymbolOp sgid_op{};
-  auto filter_fst =
-    fst::detail::make_fst(fst::detail::make_symbol_group_lut(token_filter::symbol_groups, sgid_op),
-                          fst::detail::make_transition_table(token_filter::transition_table),
-                          fst::detail::make_translation_functor(token_filter::TransduceToken{}),
-                          stream);
+  using symbol_t  = thrust::tuple<PdaTokenT, SymbolOffsetT>;
+  auto filter_fst = fst::detail::make_fst(
+    fst::detail::make_symbol_group_lut(token_filter::symbol_groups, sgid_op),
+    fst::detail::make_transition_table(token_filter::transition_table),
+    fst::detail::make_translation_functor<symbol_t, 0, 2>(token_filter::TransduceToken{}),
+    stream);
 
   auto const mr = rmm::mr::get_current_device_resource();
   rmm::device_scalar<SymbolOffsetT> d_num_selected_tokens(stream, mr);
@@ -1598,7 +1602,8 @@ std::pair<rmm::device_uvector<PdaTokenT>, rmm::device_uvector<SymbolOffsetT>> ge
       fst::detail::make_symbol_group_lookup_op(
         fix_stack_of_excess_chars::SymbolPairToSymbolGroupId{delimiter}),
       fst::detail::make_transition_table(fix_stack_of_excess_chars::transition_table),
-      fst::detail::make_translation_functor(fix_stack_of_excess_chars::TransduceInputOp{}),
+      fst::detail::make_translation_functor<StackSymbolT, 1, 1>(
+        fix_stack_of_excess_chars::TransduceInputOp{}),
       stream);
     fix_stack_of_excess_chars.Transduce(zip_in,
                                         static_cast<SymbolOffsetT>(json_in.size()),
@@ -1619,7 +1624,7 @@ std::pair<rmm::device_uvector<PdaTokenT>, rmm::device_uvector<SymbolOffsetT>> ge
   auto json_to_tokens_fst = fst::detail::make_fst(
     fst::detail::make_symbol_group_lookup_op(tokenizer_pda::PdaSymbolToSymbolGroupId{delimiter}),
     fst::detail::make_transition_table(tokenizer_pda::get_transition_table(format)),
-    fst::detail::make_translation_table<max_translation_table_size>(
+    fst::detail::make_translation_table<max_translation_table_size, 0, 3>(
       tokenizer_pda::get_translation_table(recover_from_error)),
     stream);
 
diff --git a/cpp/tests/io/fst/common.hpp b/cpp/tests/io/fst/common.hpp
index 382d21fabb8..0177300eda9 100644
--- a/cpp/tests/io/fst/common.hpp
+++ b/cpp/tests/io/fst/common.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -69,6 +69,8 @@ std::array<std::array<dfa_states, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const pda_s
    /* TT_ESC    */ {{TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR}}}};
 
 // Translation table (i.e., for each transition, what are the symbols that we output)
+static constexpr auto min_translated_out = 1;
+static constexpr auto max_translated_out = 1;
 std::array<std::array<std::vector<char>, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const pda_out_tt{
   {/* IN_STATE         {      [      }      ]      "      \    OTHER */
    /* TT_OOS    */ {{{'{'}, {'['}, {'}'}, {']'}, {'x'}, {'x'}, {'x'}}},
diff --git a/cpp/tests/io/fst/fst_test.cu b/cpp/tests/io/fst/fst_test.cu
index 4df0d3ae04d..8a8d3d39e0f 100644
--- a/cpp/tests/io/fst/fst_test.cu
+++ b/cpp/tests/io/fst/fst_test.cu
@@ -169,7 +169,9 @@ TEST_F(FstTest, GroundTruth)
   auto parser = cudf::io::fst::detail::make_fst(
     cudf::io::fst::detail::make_symbol_group_lut(pda_sgs),
     cudf::io::fst::detail::make_transition_table(pda_state_tt),
-    cudf::io::fst::detail::make_translation_table<TT_NUM_STATES * NUM_SYMBOL_GROUPS>(pda_out_tt),
+    cudf::io::fst::detail::make_translation_table<TT_NUM_STATES * NUM_SYMBOL_GROUPS,
+                                                  min_translated_out,
+                                                  max_translated_out>(pda_out_tt),
     stream);
 
   // Allocate device-side temporary storage & run algorithm

From 39f256c3397afc9c495cb819636abddb23f81dc0 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com>
Date: Tue, 23 Jul 2024 19:03:16 -0500
Subject: [PATCH 572/842] Fall back to CPU for unsupported libcudf binaryops in
 cudf-polars (#16188)

This PR adds logic that should trigger CPU fallback unsupported binary ops.

Authors:
  - https://github.com/brandon-b-miller
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/16188
---
 python/cudf_polars/cudf_polars/dsl/expr.py    | 13 ++++---
 .../cudf_polars/cudf_polars/utils/dtypes.py   | 38 +------------------
 .../tests/expressions/test_literal.py         | 18 ++++++---
 3 files changed, 21 insertions(+), 48 deletions(-)

diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py
index 9835e6f8461..6325feced94 100644
--- a/python/cudf_polars/cudf_polars/dsl/expr.py
+++ b/python/cudf_polars/cudf_polars/dsl/expr.py
@@ -1424,13 +1424,14 @@ def __init__(
         super().__init__(dtype)
         self.op = op
         self.children = (left, right)
-        if (
-            op in (plc.binaryop.BinaryOperator.ADD, plc.binaryop.BinaryOperator.SUB)
-            and plc.traits.is_chrono(left.dtype)
-            and plc.traits.is_chrono(right.dtype)
-            and not dtypes.have_compatible_resolution(left.dtype.id(), right.dtype.id())
+        if not plc.binaryop.is_supported_operation(
+            self.dtype, left.dtype, right.dtype, op
         ):
-            raise NotImplementedError("Casting rules for timelike types")
+            raise NotImplementedError(
+                f"Operation {op.name} not supported "
+                f"for types {left.dtype.id().name} and {right.dtype.id().name} "
+                f"with output type {self.dtype.id().name}"
+            )
 
     _MAPPING: ClassVar[dict[pl_expr.Operator, plc.binaryop.BinaryOperator]] = {
         pl_expr.Operator.Eq: plc.binaryop.BinaryOperator.EQUAL,
diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py
index 1279fe91d48..cd68d021286 100644
--- a/python/cudf_polars/cudf_polars/utils/dtypes.py
+++ b/python/cudf_polars/cudf_polars/utils/dtypes.py
@@ -14,43 +14,7 @@
 
 import cudf._lib.pylibcudf as plc
 
-__all__ = ["from_polars", "downcast_arrow_lists", "have_compatible_resolution"]
-
-
-def have_compatible_resolution(lid: plc.TypeId, rid: plc.TypeId):
-    """
-    Do two datetime typeids have matching resolution for a binop.
-
-    Parameters
-    ----------
-    lid
-       Left type id
-    rid
-       Right type id
-
-    Returns
-    -------
-    True if resolutions are compatible, False otherwise.
-
-    Notes
-    -----
-    Polars has different casting rules for combining
-    datetimes/durations than libcudf, and while we don't encode the
-    casting rules fully, just reject things we can't handle.
-
-    Precondition for correctness: both lid and rid are timelike.
-    """
-    if lid == rid:
-        return True
-    # Timestamps are smaller than durations in the libcudf enum.
-    lid, rid = sorted([lid, rid])
-    if lid == plc.TypeId.TIMESTAMP_MILLISECONDS:
-        return rid == plc.TypeId.DURATION_MILLISECONDS
-    elif lid == plc.TypeId.TIMESTAMP_MICROSECONDS:
-        return rid == plc.TypeId.DURATION_MICROSECONDS
-    elif lid == plc.TypeId.TIMESTAMP_NANOSECONDS:
-        return rid == plc.TypeId.DURATION_NANOSECONDS
-    return False
+__all__ = ["from_polars", "downcast_arrow_lists"]
 
 
 def downcast_arrow_lists(typ: pa.DataType) -> pa.DataType:
diff --git a/python/cudf_polars/tests/expressions/test_literal.py b/python/cudf_polars/tests/expressions/test_literal.py
index 55e688428bd..5bd3131d1d7 100644
--- a/python/cudf_polars/tests/expressions/test_literal.py
+++ b/python/cudf_polars/tests/expressions/test_literal.py
@@ -6,6 +6,8 @@
 
 import polars as pl
 
+import cudf._lib.pylibcudf as plc
+
 from cudf_polars.testing.asserts import (
     assert_gpu_result_equal,
     assert_ir_translation_raises,
@@ -64,11 +66,17 @@ def test_timelike_literal(timestamp, timedelta):
         adjusted=timestamp + timedelta,
         two_delta=timedelta + timedelta,
     )
-    schema = q.collect_schema()
-    time_type = schema["time"]
-    delta_type = schema["delta"]
-    if dtypes.have_compatible_resolution(
-        dtypes.from_polars(time_type).id(), dtypes.from_polars(delta_type).id()
+    schema = {k: dtypes.from_polars(v) for k, v in q.collect_schema().items()}
+    if plc.binaryop.is_supported_operation(
+        schema["adjusted"],
+        schema["time"],
+        schema["delta"],
+        plc.binaryop.BinaryOperator.ADD,
+    ) and plc.binaryop.is_supported_operation(
+        schema["two_delta"],
+        schema["delta"],
+        schema["delta"],
+        plc.binaryop.BinaryOperator.ADD,
     ):
         assert_gpu_result_equal(q)
     else:

From f0efc8b36a8f43cfa027966265dcea052bb5c45d Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Tue, 23 Jul 2024 17:17:05 -0700
Subject: [PATCH 573/842] Modify `make_host_vector` and `make_device_uvector`
 factories to optionally use pinned memory and kernel copy (#16206)

Issue #15616

Modified `make_host_vector` functions to return `cudf::detail::host_vector`, which can use a pinned or a pageable memory resource. When pinned memory is used, the D2H copy is potentially done using a CUDA kernel.

Also added factories to create `host_vector`s without device data. These are useful to replace uses of `std::vector` and `thrust::host_vector` when the data eventually gets copied to the GPU.

Added `is_device_accessible` to `host_span`. With this, `make_device_uvector` can optionally use the kernel for the H2D copy.

Modified `cudf::detail::host_vector` to be derived from `thrust::host_vector`, to avoid issues with implicit conversion from `std::vector`.

Used `cudf::detail::host_vector` and its new factory functions wherever data ends up copied to the GPU.

Stopped using `thrust::copy_n` for the kernel copy path in `cuda_memcpy` because of an optimization that allows it to fall back to `cudaMemCpyAsync`. We now call a simple local kernel.

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Robert Maynard (https://github.com/robertmaynard)
  - Yunsong Wang (https://github.com/PointKernel)
  - Nghia Truong (https://github.com/ttnghia)
  - Alessandro Bellina (https://github.com/abellina)

URL: https://github.com/rapidsai/cudf/pull/16206
---
 cpp/CMakeLists.txt                            |   2 +-
 cpp/include/cudf/detail/gather.cuh            |   2 +-
 cpp/include/cudf/detail/null_mask.cuh         |   4 +-
 .../cudf/detail/utilities/host_memory.hpp     |  51 +++++++++
 .../cudf/detail/utilities/host_vector.hpp     |  24 +++-
 .../detail/utilities/vector_factories.hpp     | 106 ++++++++++++------
 cpp/include/cudf/io/text/detail/trie.hpp      |   4 +-
 cpp/include/cudf/lists/detail/dremel.hpp      |  10 +-
 cpp/include/cudf/utilities/pinned_memory.hpp  |  16 +++
 cpp/include/cudf/utilities/span.hpp           |  32 ++++++
 cpp/src/copying/concatenate.cu                |   6 +-
 cpp/src/copying/contiguous_split.cu           |   3 +-
 cpp/src/datetime/timezone.cpp                 |   6 +-
 cpp/src/dictionary/detail/concatenate.cu      |   2 +-
 cpp/src/io/avro/reader_impl.cu                |   8 +-
 cpp/src/io/csv/reader_impl.cu                 |  44 +++++---
 cpp/src/io/json/json_column.cu                |   4 +-
 cpp/src/io/json/nested_json_gpu.cu            |   6 +-
 cpp/src/io/json/read_json.cu                  |   3 +-
 cpp/src/io/orc/reader_impl_decode.cu          |  10 +-
 cpp/src/io/orc/stripe_enc.cu                  |   4 +-
 cpp/src/io/orc/writer_impl.cu                 |  50 +++++----
 cpp/src/io/orc/writer_impl.hpp                |   9 +-
 cpp/src/io/parquet/predicate_pushdown.cpp     |  20 ++--
 cpp/src/io/parquet/reader_impl_chunking.cu    |  78 +++++++------
 cpp/src/io/parquet/reader_impl_preprocess.cu  |  10 +-
 cpp/src/io/parquet/writer_impl.cu             |   7 +-
 cpp/src/lists/dremel.cu                       |   6 +-
 cpp/src/strings/combine/join.cu               |   6 +-
 cpp/src/strings/convert/convert_datetime.cu   |   2 +-
 cpp/src/strings/copying/concatenate.cu        |   2 +-
 cpp/src/strings/filter_chars.cu               |   2 +-
 cpp/src/strings/replace/multi_re.cu           |   2 +-
 cpp/src/strings/translate.cu                  |   2 +-
 cpp/src/table/row_operators.cu                |   5 +-
 cpp/src/utilities/cuda_memcpy.cu              |  20 +++-
 .../{pinned_memory.cpp => host_memory.cpp}    |  86 +++++++++++++-
 cpp/tests/io/json/json_tree.cpp               |   6 +-
 cpp/tests/strings/integers_tests.cpp          |   4 +-
 .../utilities_tests/pinned_memory_tests.cpp   |  67 ++++++++++-
 40 files changed, 539 insertions(+), 192 deletions(-)
 create mode 100644 cpp/include/cudf/detail/utilities/host_memory.hpp
 rename cpp/src/utilities/{pinned_memory.cpp => host_memory.cpp} (73%)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 5e79204a558..a2c2dd3af4c 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -671,9 +671,9 @@ add_library(
   src/unary/null_ops.cu
   src/utilities/cuda_memcpy.cu
   src/utilities/default_stream.cpp
+  src/utilities/host_memory.cpp
   src/utilities/linked_column.cpp
   src/utilities/logger.cpp
-  src/utilities/pinned_memory.cpp
   src/utilities/prefetch.cpp
   src/utilities/stacktrace.cpp
   src/utilities/stream_pool.cpp
diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh
index 5977c7341c1..d3e9fc4974d 100644
--- a/cpp/include/cudf/detail/gather.cuh
+++ b/cpp/include/cudf/detail/gather.cuh
@@ -577,7 +577,7 @@ void gather_bitmask(table_view const& source,
   }
 
   // Make device array of target bitmask pointers
-  std::vector<bitmask_type*> target_masks(target.size());
+  auto target_masks = make_host_vector<bitmask_type*>(target.size(), stream);
   std::transform(target.begin(), target.end(), target_masks.begin(), [](auto const& col) {
     return col->mutable_view().null_mask();
   });
diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh
index e62675cbc8c..ae6db5409cc 100644
--- a/cpp/include/cudf/detail/null_mask.cuh
+++ b/cpp/include/cudf/detail/null_mask.cuh
@@ -430,7 +430,9 @@ std::vector<size_type> segmented_count_bits(bitmask_type const* bitmask,
   if (num_segments == 0) { return std::vector<size_type>{}; }
 
   // Construct a contiguous host buffer of indices and copy to device.
-  auto const h_indices = std::vector<size_type>(indices_begin, indices_end);
+  auto h_indices = make_empty_host_vector<typename std::iterator_traits<IndexIterator>::value_type>(
+    std::distance(indices_begin, indices_end), stream);
+  std::copy(indices_begin, indices_end, std::back_inserter(h_indices));
   auto const d_indices =
     make_device_uvector_async(h_indices, stream, rmm::mr::get_current_device_resource());
 
diff --git a/cpp/include/cudf/detail/utilities/host_memory.hpp b/cpp/include/cudf/detail/utilities/host_memory.hpp
new file mode 100644
index 00000000000..c6775a950c9
--- /dev/null
+++ b/cpp/include/cudf/detail/utilities/host_memory.hpp
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/detail/utilities/host_vector.hpp>
+#include <cudf/utilities/export.hpp>
+#include <cudf/utilities/pinned_memory.hpp>
+
+#include <rmm/resource_ref.hpp>
+
+#include <cstddef>
+
+namespace cudf::detail {
+/**
+ * @brief Get the memory resource to be used for pageable memory allocations.
+ *
+ * @return Reference to the pageable memory resource
+ */
+CUDF_EXPORT rmm::host_async_resource_ref get_pageable_memory_resource();
+
+/**
+ * @brief Get the allocator to be used for the host memory allocation.
+ *
+ * @param size The number of elements of type T to allocate
+ * @param stream The stream to use for the allocation
+ * @return The allocator to be used for the host memory allocation
+ */
+template <typename T>
+rmm_host_allocator<T> get_host_allocator(std::size_t size, rmm::cuda_stream_view stream)
+{
+  if (size * sizeof(T) <= get_allocate_host_as_pinned_threshold()) {
+    return {get_pinned_memory_resource(), stream};
+  }
+  return {get_pageable_memory_resource(), stream};
+}
+
+}  // namespace cudf::detail
diff --git a/cpp/include/cudf/detail/utilities/host_vector.hpp b/cpp/include/cudf/detail/utilities/host_vector.hpp
index 2d14d0306cd..f4e5f718da4 100644
--- a/cpp/include/cudf/detail/utilities/host_vector.hpp
+++ b/cpp/include/cudf/detail/utilities/host_vector.hpp
@@ -61,6 +61,10 @@ class rmm_host_allocator<void> {
   };
 };
 
+template <class DesiredProperty, class... Properties>
+inline constexpr bool contains_property =
+  (cuda::std::is_same_v<DesiredProperty, Properties> || ... || false);
+
 /*! \p rmm_host_allocator is a CUDA-specific host memory allocator
  *  that employs \c `rmm::host_async_resource_ref` for allocation.
  *
@@ -100,8 +104,12 @@ class rmm_host_allocator {
   /**
    * @brief Construct from a `cudf::host_async_resource_ref`
    */
-  rmm_host_allocator(rmm::host_async_resource_ref _mr, rmm::cuda_stream_view _stream)
-    : mr(_mr), stream(_stream)
+  template <class... Properties>
+  rmm_host_allocator(cuda::mr::async_resource_ref<cuda::mr::host_accessible, Properties...> _mr,
+                     rmm::cuda_stream_view _stream)
+    : mr(_mr),
+      stream(_stream),
+      _is_device_accessible{contains_property<cuda::mr::device_accessible, Properties...>}
   {
   }
 
@@ -173,15 +181,25 @@ class rmm_host_allocator {
    */
   inline bool operator!=(rmm_host_allocator const& x) const { return !operator==(x); }
 
+  bool is_device_accessible() const { return _is_device_accessible; }
+
  private:
   rmm::host_async_resource_ref mr;
   rmm::cuda_stream_view stream;
+  bool _is_device_accessible;
 };
 
 /**
  * @brief A vector class with rmm host memory allocator
  */
 template <typename T>
-using host_vector = thrust::host_vector<T, rmm_host_allocator<T>>;
+class host_vector : public thrust::host_vector<T, rmm_host_allocator<T>> {
+ public:
+  using base = thrust::host_vector<T, rmm_host_allocator<T>>;
+
+  host_vector(rmm_host_allocator<T> const& alloc) : base(alloc) {}
+
+  host_vector(size_t size, rmm_host_allocator<T> const& alloc) : base(size, alloc) {}
+};
 
 }  // namespace cudf::detail
diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp
index 20cb55bb1c7..45dc839c9bd 100644
--- a/cpp/include/cudf/detail/utilities/vector_factories.hpp
+++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp
@@ -21,6 +21,8 @@
  * @file vector_factories.hpp
  */
 
+#include <cudf/detail/utilities/cuda_memcpy.hpp>
+#include <cudf/detail/utilities/host_memory.hpp>
 #include <cudf/detail/utilities/host_vector.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
@@ -32,8 +34,6 @@
 #include <rmm/mr/device/device_memory_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-#include <thrust/host_vector.h>
-
 #include <vector>
 
 namespace cudf {
@@ -100,11 +100,12 @@ rmm::device_uvector<T> make_device_uvector_async(host_span<T const> source_data,
                                                  rmm::device_async_resource_ref mr)
 {
   rmm::device_uvector<T> ret(source_data.size(), stream, mr);
-  CUDF_CUDA_TRY(cudaMemcpyAsync(ret.data(),
-                                source_data.data(),
-                                source_data.size() * sizeof(T),
-                                cudaMemcpyDefault,
-                                stream.value()));
+  auto const is_pinned = source_data.is_device_accessible();
+  cuda_memcpy_async(ret.data(),
+                    source_data.data(),
+                    source_data.size() * sizeof(T),
+                    is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE,
+                    stream);
   return ret;
 }
 
@@ -271,21 +272,11 @@ rmm::device_uvector<typename Container::value_type> make_device_uvector_sync(
   return make_device_uvector_sync(device_span<typename Container::value_type const>{c}, stream, mr);
 }
 
-// Utility function template to allow copying to either a thrust::host_vector or std::vector
-template <typename T, typename OutContainer>
-OutContainer make_vector_async(device_span<T const> v, rmm::cuda_stream_view stream)
-{
-  OutContainer result(v.size());
-  CUDF_CUDA_TRY(cudaMemcpyAsync(
-    result.data(), v.data(), v.size() * sizeof(T), cudaMemcpyDefault, stream.value()));
-  return result;
-}
-
 /**
  * @brief Asynchronously construct a `std::vector` containing a copy of data from a
  * `device_span`
  *
- * @note This function does not synchronize `stream`.
+ * @note This function does not synchronize `stream` after the copy.
  *
  * @tparam T The type of the data to copy
  * @param source_data The device data to copy
@@ -295,14 +286,17 @@ OutContainer make_vector_async(device_span<T const> v, rmm::cuda_stream_view str
 template <typename T>
 std::vector<T> make_std_vector_async(device_span<T const> v, rmm::cuda_stream_view stream)
 {
-  return make_vector_async<T, std::vector<T>>(v, stream);
+  std::vector<T> result(v.size());
+  CUDF_CUDA_TRY(cudaMemcpyAsync(
+    result.data(), v.data(), v.size() * sizeof(T), cudaMemcpyDefault, stream.value()));
+  return result;
 }
 
 /**
  * @brief Asynchronously construct a `std::vector` containing a copy of data from a device
  * container
  *
- * @note This function synchronizes `stream`.
+ * @note This function synchronizes `stream` after the copy.
  *
  * @tparam Container The type of the container to copy from
  * @tparam T The type of the data to copy
@@ -324,7 +318,7 @@ std::vector<typename Container::value_type> make_std_vector_async(Container cons
  * @brief Synchronously construct a `std::vector` containing a copy of data from a
  * `device_span`
  *
- * @note This function does a synchronize on `stream`.
+ * @note This function does a synchronize on `stream` after the copy.
  *
  * @tparam T The type of the data to copy
  * @param source_data The device data to copy
@@ -361,11 +355,46 @@ std::vector<typename Container::value_type> make_std_vector_sync(Container const
   return make_std_vector_sync(device_span<typename Container::value_type const>{c}, stream);
 }
 
+/**
+ * @brief Construct a `cudf::detail::host_vector` of the given size.
+ *
+ * @note The returned vector may be using a pinned memory resource.
+ *
+ * @tparam T The type of the vector data
+ * @param size The number of elements in the created vector
+ * @param stream The stream on which to allocate memory
+ * @return A host_vector of the given size
+ */
+template <typename T>
+host_vector<T> make_host_vector(size_t size, rmm::cuda_stream_view stream)
+{
+  return host_vector<T>(size, get_host_allocator<T>(size, stream));
+}
+
+/**
+ * @brief Construct an empty `cudf::detail::host_vector` with the given capacity.
+ *
+ * @note The returned vector may be using a pinned memory resource.
+ *
+ * @tparam T The type of the vector data
+ * @param capacity Initial capacity of the vector
+ * @param stream The stream on which to allocate memory
+ * @return A host_vector with the given capacity
+ */
+template <typename T>
+host_vector<T> make_empty_host_vector(size_t capacity, rmm::cuda_stream_view stream)
+{
+  auto result = host_vector<T>(get_host_allocator<T>(capacity, stream));
+  result.reserve(capacity);
+  return result;
+}
+
 /**
  * @brief Asynchronously construct a `thrust::host_vector` containing a copy of data from a
  * `device_span`
  *
- * @note This function does not synchronize `stream`.
+ * @note This function does not synchronize `stream` after the copy. The returned vector may be
+ * using a pinned memory resource.
  *
  * @tparam T The type of the data to copy
  * @param source_data The device data to copy
@@ -373,16 +402,24 @@ std::vector<typename Container::value_type> make_std_vector_sync(Container const
  * @return The data copied to the host
  */
 template <typename T>
-thrust::host_vector<T> make_host_vector_async(device_span<T const> v, rmm::cuda_stream_view stream)
+host_vector<T> make_host_vector_async(device_span<T const> v, rmm::cuda_stream_view stream)
 {
-  return make_vector_async<T, thrust::host_vector<T>>(v, stream);
+  auto result          = make_host_vector<T>(v.size(), stream);
+  auto const is_pinned = result.get_allocator().is_device_accessible();
+  cuda_memcpy_async(result.data(),
+                    v.data(),
+                    v.size() * sizeof(T),
+                    is_pinned ? host_memory_kind::PINNED : host_memory_kind::PAGEABLE,
+                    stream);
+  return result;
 }
 
 /**
  * @brief Asynchronously construct a `std::vector` containing a copy of data from a device
  * container
  *
- * @note This function does not synchronize `stream`.
+ * @note This function does not synchronize `stream` after the copy. The returned vector may be
+ * using a pinned memory resource.
  *
  * @tparam Container The type of the container to copy from
  * @tparam T The type of the data to copy
@@ -394,8 +431,8 @@ template <
   typename Container,
   std::enable_if_t<
     std::is_convertible_v<Container, device_span<typename Container::value_type const>>>* = nullptr>
-thrust::host_vector<typename Container::value_type> make_host_vector_async(
-  Container const& c, rmm::cuda_stream_view stream)
+host_vector<typename Container::value_type> make_host_vector_async(Container const& c,
+                                                                   rmm::cuda_stream_view stream)
 {
   return make_host_vector_async(device_span<typename Container::value_type const>{c}, stream);
 }
@@ -404,7 +441,8 @@ thrust::host_vector<typename Container::value_type> make_host_vector_async(
  * @brief Synchronously construct a `thrust::host_vector` containing a copy of data from a
  * `device_span`
  *
- * @note This function does a synchronize on `stream`.
+ * @note This function does a synchronize on `stream` after the copy. The returned vector may be
+ * using a pinned memory resource.
  *
  * @tparam T The type of the data to copy
  * @param source_data The device data to copy
@@ -412,7 +450,7 @@ thrust::host_vector<typename Container::value_type> make_host_vector_async(
  * @return The data copied to the host
  */
 template <typename T>
-thrust::host_vector<T> make_host_vector_sync(device_span<T const> v, rmm::cuda_stream_view stream)
+host_vector<T> make_host_vector_sync(device_span<T const> v, rmm::cuda_stream_view stream)
 {
   auto result = make_host_vector_async(v, stream);
   stream.synchronize();
@@ -423,7 +461,7 @@ thrust::host_vector<T> make_host_vector_sync(device_span<T const> v, rmm::cuda_s
  * @brief Synchronously construct a `thrust::host_vector` containing a copy of data from a device
  * container
  *
- * @note This function synchronizes `stream`.
+ * @note This function synchronizes `stream` after the copy.
  *
  * @tparam Container The type of the container to copy from
  * @tparam T The type of the data to copy
@@ -435,8 +473,8 @@ template <
   typename Container,
   std::enable_if_t<
     std::is_convertible_v<Container, device_span<typename Container::value_type const>>>* = nullptr>
-thrust::host_vector<typename Container::value_type> make_host_vector_sync(
-  Container const& c, rmm::cuda_stream_view stream)
+host_vector<typename Container::value_type> make_host_vector_sync(Container const& c,
+                                                                  rmm::cuda_stream_view stream)
 {
   return make_host_vector_sync(device_span<typename Container::value_type const>{c}, stream);
 }
@@ -444,7 +482,7 @@ thrust::host_vector<typename Container::value_type> make_host_vector_sync(
 /**
  * @brief Asynchronously construct a pinned `cudf::detail::host_vector` of the given size
  *
- * @note This function may not synchronize `stream`.
+ * @note This function may not synchronize `stream` after the copy.
  *
  * @tparam T The type of the vector data
  * @param size The number of elements in the created vector
@@ -460,7 +498,7 @@ host_vector<T> make_pinned_vector_async(size_t size, rmm::cuda_stream_view strea
 /**
  * @brief Synchronously construct a pinned `cudf::detail::host_vector` of the given size
  *
- * @note This function synchronizes `stream`.
+ * @note This function synchronizes `stream` after the copy.
  *
  * @tparam T The type of the vector data
  * @param size The number of elements in the created vector
diff --git a/cpp/include/cudf/io/text/detail/trie.hpp b/cpp/include/cudf/io/text/detail/trie.hpp
index e0b9c7635e3..28862d97ede 100644
--- a/cpp/include/cudf/io/text/detail/trie.hpp
+++ b/cpp/include/cudf/io/text/detail/trie.hpp
@@ -223,11 +223,11 @@ struct trie {
 
     match_length.emplace_back(0);
 
-    std::vector<trie_node> trie_nodes;
     auto token_counts = std::unordered_map<cudf::size_type, int32_t>();
+    auto trie_nodes   = cudf::detail::make_empty_host_vector<trie_node>(tokens.size(), stream);
 
     for (uint32_t i = 0; i < tokens.size(); i++) {
-      trie_nodes.emplace_back(trie_node{tokens[i], match_length[i], transitions[i]});
+      trie_nodes.push_back(trie_node{tokens[i], match_length[i], transitions[i]});
       token_counts[tokens[i]]++;
     }
 
diff --git a/cpp/include/cudf/lists/detail/dremel.hpp b/cpp/include/cudf/lists/detail/dremel.hpp
index d36a4091947..53448424827 100644
--- a/cpp/include/cudf/lists/detail/dremel.hpp
+++ b/cpp/include/cudf/lists/detail/dremel.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -31,8 +31,8 @@ struct dremel_device_view {
   size_type const* offsets;
   uint8_t const* rep_levels;
   uint8_t const* def_levels;
-  size_type const leaf_data_size;
-  uint8_t const max_def_level;
+  size_type leaf_data_size;
+  uint8_t max_def_level;
 };
 
 /**
@@ -45,8 +45,8 @@ struct dremel_data {
   rmm::device_uvector<uint8_t> rep_level;
   rmm::device_uvector<uint8_t> def_level;
 
-  size_type const leaf_data_size;
-  uint8_t const max_def_level;
+  size_type leaf_data_size;
+  uint8_t max_def_level;
 
   operator dremel_device_view() const
   {
diff --git a/cpp/include/cudf/utilities/pinned_memory.hpp b/cpp/include/cudf/utilities/pinned_memory.hpp
index 3e2fa43cb50..fa7e1b35327 100644
--- a/cpp/include/cudf/utilities/pinned_memory.hpp
+++ b/cpp/include/cudf/utilities/pinned_memory.hpp
@@ -71,4 +71,20 @@ void set_kernel_pinned_copy_threshold(size_t threshold);
  */
 size_t get_kernel_pinned_copy_threshold();
 
+/**
+ * @brief Set the threshold size for allocating host memory as pinned memory.
+ *
+ * @param threshold The threshold size in bytes. If the size of the allocation is less or equal to
+ * this threshold, the memory will be allocated as pinned memory. If the size is greater than this
+ * threshold, the memory will be allocated as pageable memory.
+ */
+void set_allocate_host_as_pinned_threshold(size_t threshold);
+
+/**
+ * @brief Get the threshold size for allocating host memory as pinned memory.
+ *
+ * @return The threshold size in bytes.
+ */
+size_t get_allocate_host_as_pinned_threshold();
+
 }  // namespace cudf
diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp
index 3b35e60e034..c5054c733a7 100644
--- a/cpp/include/cudf/utilities/span.hpp
+++ b/cpp/include/cudf/utilities/span.hpp
@@ -16,6 +16,8 @@
 
 #pragma once
 
+#include <cudf/detail/utilities/host_vector.hpp>
+
 #include <rmm/device_buffer.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/device_vector.hpp>
@@ -257,6 +259,26 @@ struct host_span : public cudf::detail::span_base<T, Extent, host_span<T, Extent
   {
   }
 
+  /// Constructor from a host_vector
+  /// @param in The host_vector to construct the span from
+  template <typename OtherT,
+            // Only supported containers of types convertible to T
+            std::enable_if_t<std::is_convertible_v<OtherT (*)[], T (*)[]>>* = nullptr>
+  constexpr host_span(cudf::detail::host_vector<OtherT>& in)
+    : base(in.data(), in.size()), _is_device_accessible{in.get_allocator().is_device_accessible()}
+  {
+  }
+
+  /// Constructor from a const host_vector
+  /// @param in The host_vector to construct the span from
+  template <typename OtherT,
+            // Only supported containers of types convertible to T
+            std::enable_if_t<std::is_convertible_v<OtherT (*)[], T (*)[]>>* = nullptr>
+  constexpr host_span(cudf::detail::host_vector<OtherT> const& in)
+    : base(in.data(), in.size()), _is_device_accessible{in.get_allocator().is_device_accessible()}
+  {
+  }
+
   // Copy construction to support const conversion
   /// @param other The span to copy
   template <typename OtherT,
@@ -268,6 +290,16 @@ struct host_span : public cudf::detail::span_base<T, Extent, host_span<T, Extent
     : base(other.data(), other.size())
   {
   }
+
+  /**
+   * @brief Returns whether the data is device accessible (e.g. pinned memory)
+   *
+   * @return true if the data is device accessible
+   */
+  [[nodiscard]] bool is_device_accessible() const { return _is_device_accessible; }
+
+ private:
+  bool _is_device_accessible{false};
 };
 
 // ===== device_span ===============================================================================
diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu
index 6acbafd24fb..4be3054b3dc 100644
--- a/cpp/src/copying/concatenate.cu
+++ b/cpp/src/copying/concatenate.cu
@@ -73,8 +73,8 @@ auto create_device_views(host_span<column_view const> views, rmm::cuda_stream_vi
   });
 
   // Assemble contiguous array of device views
-  auto device_views = thrust::host_vector<column_device_view>();
-  device_views.reserve(views.size());
+  auto device_views =
+    cudf::detail::make_empty_host_vector<column_device_view>(views.size(), stream);
   std::transform(device_view_owners.cbegin(),
                  device_view_owners.cend(),
                  std::back_inserter(device_views),
@@ -84,7 +84,7 @@ auto create_device_views(host_span<column_view const> views, rmm::cuda_stream_vi
     make_device_uvector_async(device_views, stream, rmm::mr::get_current_device_resource());
 
   // Compute the partition offsets
-  auto offsets = thrust::host_vector<size_t>(views.size() + 1);
+  auto offsets = cudf::detail::make_host_vector<size_t>(views.size() + 1, stream);
   thrust::transform_inclusive_scan(
     thrust::host,
     device_views.cbegin(),
diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu
index 37db2c74790..95544742fb7 100644
--- a/cpp/src/copying/contiguous_split.cu
+++ b/cpp/src/copying/contiguous_split.cu
@@ -1539,7 +1539,8 @@ std::unique_ptr<chunk_iteration_state> chunk_iteration_state::create(
 
     std::vector<std::size_t> num_batches_per_iteration;
     std::vector<std::size_t> size_of_batches_per_iteration;
-    std::vector<std::size_t> accum_size_per_iteration;
+    auto accum_size_per_iteration =
+      cudf::detail::make_empty_host_vector<std::size_t>(h_offsets.size(), stream);
     std::size_t accum_size = 0;
     {
       auto current_offset_it = h_offsets.begin();
diff --git a/cpp/src/datetime/timezone.cpp b/cpp/src/datetime/timezone.cpp
index 1b0d201501b..7ca1b51df98 100644
--- a/cpp/src/datetime/timezone.cpp
+++ b/cpp/src/datetime/timezone.cpp
@@ -485,14 +485,12 @@ std::unique_ptr<table> make_timezone_transition_table(std::optional<std::string_
   CUDF_EXPECTS(transition_times.size() == offsets.size(),
                "Error reading TZif file for timezone " + std::string{timezone_name});
 
-  std::vector<timestamp_s> ttimes_typed;
-  ttimes_typed.reserve(transition_times.size());
+  auto ttimes_typed = make_empty_host_vector<timestamp_s>(transition_times.size(), stream);
   std::transform(transition_times.cbegin(),
                  transition_times.cend(),
                  std::back_inserter(ttimes_typed),
                  [](auto ts) { return timestamp_s{duration_s{ts}}; });
-  std::vector<duration_s> offsets_typed;
-  offsets_typed.reserve(offsets.size());
+  auto offsets_typed = make_empty_host_vector<duration_s>(offsets.size(), stream);
   std::transform(offsets.cbegin(), offsets.cend(), std::back_inserter(offsets_typed), [](auto ts) {
     return duration_s{ts};
   });
diff --git a/cpp/src/dictionary/detail/concatenate.cu b/cpp/src/dictionary/detail/concatenate.cu
index fdc3d9d0ecf..72828309425 100644
--- a/cpp/src/dictionary/detail/concatenate.cu
+++ b/cpp/src/dictionary/detail/concatenate.cu
@@ -105,7 +105,7 @@ struct compute_children_offsets_fn {
    */
   rmm::device_uvector<offsets_pair> create_children_offsets(rmm::cuda_stream_view stream)
   {
-    std::vector<offsets_pair> offsets(columns_ptrs.size());
+    auto offsets = cudf::detail::make_host_vector<offsets_pair>(columns_ptrs.size(), stream);
     thrust::transform_exclusive_scan(
       thrust::host,
       columns_ptrs.begin(),
diff --git a/cpp/src/io/avro/reader_impl.cu b/cpp/src/io/avro/reader_impl.cu
index 814efe2b5a1..69a0e982a5b 100644
--- a/cpp/src/io/avro/reader_impl.cu
+++ b/cpp/src/io/avro/reader_impl.cu
@@ -554,9 +554,11 @@ table_with_metadata read_avro(std::unique_ptr<cudf::io::datasource>&& source,
       auto d_global_dict_data = rmm::device_uvector<char>(0, stream);
 
       if (total_dictionary_entries > 0) {
-        auto h_global_dict      = std::vector<string_index_pair>(total_dictionary_entries);
-        auto h_global_dict_data = std::vector<char>(dictionary_data_size);
-        size_t dict_pos         = 0;
+        auto h_global_dict =
+          cudf::detail::make_host_vector<string_index_pair>(total_dictionary_entries, stream);
+        auto h_global_dict_data =
+          cudf::detail::make_host_vector<char>(dictionary_data_size, stream);
+        size_t dict_pos = 0;
 
         for (size_t i = 0; i < column_types.size(); ++i) {
           auto const col_idx          = selected_columns[i].first;
diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu
index 05faded651d..40d4372ae9d 100644
--- a/cpp/src/io/csv/reader_impl.cu
+++ b/cpp/src/io/csv/reader_impl.cu
@@ -567,7 +567,7 @@ void infer_column_types(parse_options const& parse_opts,
 }
 
 std::vector<column_buffer> decode_data(parse_options const& parse_opts,
-                                       std::vector<column_parse::flags> const& column_flags,
+                                       host_span<column_parse::flags const> column_flags,
                                        std::vector<std::string> const& column_names,
                                        device_span<char const> data,
                                        device_span<uint64_t const> row_offsets,
@@ -592,8 +592,8 @@ std::vector<column_buffer> decode_data(parse_options const& parse_opts,
     }
   }
 
-  thrust::host_vector<void*> h_data(num_active_columns);
-  thrust::host_vector<bitmask_type*> h_valid(num_active_columns);
+  auto h_data  = cudf::detail::make_host_vector<void*>(num_active_columns, stream);
+  auto h_valid = cudf::detail::make_host_vector<bitmask_type*>(num_active_columns, stream);
 
   for (int i = 0; i < num_active_columns; ++i) {
     h_data[i]  = out_buffers[i].data();
@@ -622,14 +622,16 @@ std::vector<column_buffer> decode_data(parse_options const& parse_opts,
   return out_buffers;
 }
 
-std::vector<data_type> determine_column_types(csv_reader_options const& reader_opts,
-                                              parse_options const& parse_opts,
-                                              host_span<std::string const> column_names,
-                                              device_span<char const> data,
-                                              device_span<uint64_t const> row_offsets,
-                                              int32_t num_records,
-                                              host_span<column_parse::flags> column_flags,
-                                              rmm::cuda_stream_view stream)
+cudf::detail::host_vector<data_type> determine_column_types(
+  csv_reader_options const& reader_opts,
+  parse_options const& parse_opts,
+  host_span<std::string const> column_names,
+  device_span<char const> data,
+  device_span<uint64_t const> row_offsets,
+  int32_t num_records,
+  host_span<column_parse::flags> column_flags,
+  cudf::size_type num_active_columns,
+  rmm::cuda_stream_view stream)
 {
   std::vector<data_type> column_types(column_flags.size());
 
@@ -653,7 +655,8 @@ std::vector<data_type> determine_column_types(csv_reader_options const& reader_o
                      stream);
 
   // compact column_types to only include active columns
-  std::vector<data_type> active_col_types;
+  auto active_col_types =
+    cudf::detail::make_empty_host_vector<data_type>(num_active_columns, stream);
   std::copy_if(column_types.cbegin(),
                column_types.cend(),
                std::back_inserter(active_col_types),
@@ -697,8 +700,10 @@ table_with_metadata read_csv(cudf::io::datasource* source,
 
   auto const num_actual_columns = static_cast<int32_t>(column_names.size());
   auto num_active_columns       = num_actual_columns;
-  auto column_flags             = std::vector<column_parse::flags>(
-    num_actual_columns, column_parse::enabled | column_parse::inferred);
+  auto column_flags =
+    cudf::detail::make_host_vector<column_parse::flags>(num_actual_columns, stream);
+  std::fill(
+    column_flags.begin(), column_flags.end(), column_parse::enabled | column_parse::inferred);
 
   // User did not pass column names to override names in the file
   // Process names from the file to remove empty and duplicated strings
@@ -842,8 +847,15 @@ table_with_metadata read_csv(cudf::io::datasource* source,
 
   // Exclude the end-of-data row from number of rows with actual data
   auto const num_records  = std::max(row_offsets.size(), 1ul) - 1;
-  auto const column_types = determine_column_types(
-    reader_opts, parse_opts, column_names, data, row_offsets, num_records, column_flags, stream);
+  auto const column_types = determine_column_types(reader_opts,
+                                                   parse_opts,
+                                                   column_names,
+                                                   data,
+                                                   row_offsets,
+                                                   num_records,
+                                                   column_flags,
+                                                   num_active_columns,
+                                                   stream);
 
   auto metadata    = table_metadata{};
   auto out_columns = std::vector<std::unique_ptr<cudf::column>>();
diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu
index 3e587768b11..17fa7abdffe 100644
--- a/cpp/src/io/json/json_column.cu
+++ b/cpp/src/io/json/json_column.cu
@@ -622,7 +622,7 @@ void make_device_json_column(device_span<SymbolT const> input,
   // map{parent_col_id, child_col_name}> = child_col_id, used for null value column tracking
   std::map<std::pair<NodeIndexT, std::string>, NodeIndexT> mapped_columns;
   // find column_ids which are values, but should be ignored in validity
-  std::vector<uint8_t> ignore_vals(num_columns, 0);
+  auto ignore_vals = cudf::detail::make_host_vector<uint8_t>(num_columns, stream);
   std::vector<uint8_t> is_mixed_type_column(num_columns, 0);
   std::vector<uint8_t> is_pruned(num_columns, 0);
   columns.try_emplace(parent_node_sentinel, std::ref(root));
@@ -812,7 +812,7 @@ void make_device_json_column(device_span<SymbolT const> input,
     return thrust::get<1>(a) < thrust::get<1>(b);
   });
   // move columns data to device.
-  std::vector<json_column_data> columns_data(num_columns);
+  auto columns_data = cudf::detail::make_host_vector<json_column_data>(num_columns, stream);
   for (auto& [col_id, col_ref] : columns) {
     if (col_id == parent_node_sentinel) continue;
     auto& col            = col_ref.get();
diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu
index 8decaf034f3..1e484d74679 100644
--- a/cpp/src/io/json/nested_json_gpu.cu
+++ b/cpp/src/io/json/nested_json_gpu.cu
@@ -1703,10 +1703,8 @@ void make_json_column(json_column& root_column,
   auto const [d_tokens_gpu, d_token_indices_gpu] = get_token_stream(d_input, options, stream, mr);
 
   // Copy the JSON tokens to the host
-  thrust::host_vector<PdaTokenT> tokens =
-    cudf::detail::make_host_vector_async(d_tokens_gpu, stream);
-  thrust::host_vector<SymbolOffsetT> token_indices_gpu =
-    cudf::detail::make_host_vector_async(d_token_indices_gpu, stream);
+  auto tokens            = cudf::detail::make_host_vector_async(d_tokens_gpu, stream);
+  auto token_indices_gpu = cudf::detail::make_host_vector_async(d_token_indices_gpu, stream);
 
   // Make sure tokens have been copied to the host
   stream.synchronize();
diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu
index 0ba4dedfc34..590f70864b1 100644
--- a/cpp/src/io/json/read_json.cu
+++ b/cpp/src/io/json/read_json.cu
@@ -78,10 +78,9 @@ device_span<char> ingest_raw_input(device_span<char> buffer,
   auto constexpr num_delimiter_chars = 1;
 
   if (compression == compression_type::NONE) {
-    std::vector<size_t> delimiter_map{};
+    auto delimiter_map = cudf::detail::make_empty_host_vector<size_t>(sources.size(), stream);
     std::vector<size_t> prefsum_source_sizes(sources.size());
     std::vector<std::unique_ptr<datasource::buffer>> h_buffers;
-    delimiter_map.reserve(sources.size());
     size_t bytes_read = 0;
     std::transform_inclusive_scan(sources.begin(),
                                   sources.end(),
diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu
index 8e20505d3ff..e3b9a048be8 100644
--- a/cpp/src/io/orc/reader_impl_decode.cu
+++ b/cpp/src/io/orc/reader_impl_decode.cu
@@ -492,11 +492,17 @@ void scan_null_counts(cudf::detail::hostdevice_2dvector<gpu::ColumnDesc> const&
   if (num_stripes == 0) return;
 
   auto const num_columns = chunks.size().second;
-  std::vector<thrust::pair<size_type, uint32_t*>> prefix_sums_to_update;
+  auto const num_struct_cols =
+    std::count_if(chunks[0].begin(), chunks[0].end(), [](auto const& chunk) {
+      return chunk.type_kind == STRUCT;
+    });
+  auto prefix_sums_to_update =
+    cudf::detail::make_empty_host_vector<thrust::pair<size_type, uint32_t*>>(num_struct_cols,
+                                                                             stream);
   for (auto col_idx = 0ul; col_idx < num_columns; ++col_idx) {
     // Null counts sums are only needed for children of struct columns
     if (chunks[0][col_idx].type_kind == STRUCT) {
-      prefix_sums_to_update.emplace_back(col_idx, d_prefix_sums + num_stripes * col_idx);
+      prefix_sums_to_update.push_back({col_idx, d_prefix_sums + num_stripes * col_idx});
     }
   }
   auto const d_prefix_sums_to_update = cudf::detail::make_device_uvector_async(
diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu
index 805959327ac..80f32512b98 100644
--- a/cpp/src/io/orc/stripe_enc.cu
+++ b/cpp/src/io/orc/stripe_enc.cu
@@ -1417,8 +1417,8 @@ void decimal_sizes_to_offsets(device_2dspan<rowgroup_rows const> rg_bounds,
   if (rg_bounds.count() == 0) return;
 
   // Convert map to a vector of views of the `elem_sizes` device buffers
-  std::vector<decimal_column_element_sizes> h_sizes;
-  h_sizes.reserve(elem_sizes.size());
+  auto h_sizes =
+    cudf::detail::make_empty_host_vector<decimal_column_element_sizes>(elem_sizes.size(), stream);
   std::transform(elem_sizes.begin(), elem_sizes.end(), std::back_inserter(h_sizes), [](auto& p) {
     return decimal_column_element_sizes{p.first, p.second};
   });
diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu
index 4cb20bb7518..f3b8cfbc836 100644
--- a/cpp/src/io/orc/writer_impl.cu
+++ b/cpp/src/io/orc/writer_impl.cu
@@ -444,14 +444,17 @@ namespace {
  */
 file_segmentation calculate_segmentation(host_span<orc_column_view const> columns,
                                          hostdevice_2dvector<rowgroup_rows>&& rowgroup_bounds,
-                                         stripe_size_limits max_stripe_size)
+                                         stripe_size_limits max_stripe_size,
+                                         rmm::cuda_stream_view stream)
 {
-  std::vector<stripe_rowgroups> infos;
-  auto const num_rowgroups = rowgroup_bounds.size().first;
-  size_t stripe_start      = 0;
-  size_t stripe_bytes      = 0;
-  size_type stripe_rows    = 0;
-  for (size_t rg_idx = 0; rg_idx < num_rowgroups; ++rg_idx) {
+  // Number of stripes is not known in advance. Only reserve a single element to use pinned memory
+  // resource if at all enabled.
+  auto infos                    = cudf::detail::make_empty_host_vector<stripe_rowgroups>(1, stream);
+  size_type const num_rowgroups = rowgroup_bounds.size().first;
+  size_type stripe_start        = 0;
+  size_t stripe_bytes           = 0;
+  size_type stripe_rows         = 0;
+  for (size_type rg_idx = 0; rg_idx < num_rowgroups; ++rg_idx) {
     auto const rowgroup_total_bytes =
       std::accumulate(columns.begin(), columns.end(), 0ul, [&](size_t total_size, auto const& col) {
         auto const rows = rowgroup_bounds[rg_idx][col.index()].size();
@@ -470,7 +473,9 @@ file_segmentation calculate_segmentation(host_span<orc_column_view const> column
     // Check if adding the current rowgroup to the stripe will make the stripe too large or long
     if ((rg_idx > stripe_start) && (stripe_bytes + rowgroup_total_bytes > max_stripe_size.bytes ||
                                     stripe_rows + rowgroup_rows_max > max_stripe_size.rows)) {
-      infos.emplace_back(infos.size(), stripe_start, rg_idx - stripe_start);
+      infos.push_back(stripe_rowgroups{static_cast<size_type>(infos.size()),
+                                       stripe_start,
+                                       static_cast<size_type>(rg_idx - stripe_start)});
       stripe_start = rg_idx;
       stripe_bytes = 0;
       stripe_rows  = 0;
@@ -479,7 +484,9 @@ file_segmentation calculate_segmentation(host_span<orc_column_view const> column
     stripe_bytes += rowgroup_total_bytes;
     stripe_rows += rowgroup_rows_max;
     if (rg_idx + 1 == num_rowgroups) {
-      infos.emplace_back(infos.size(), stripe_start, num_rowgroups - stripe_start);
+      infos.push_back(stripe_rowgroups{static_cast<size_type>(infos.size()),
+                                       stripe_start,
+                                       static_cast<size_type>(num_rowgroups - stripe_start)});
     }
   }
 
@@ -1336,7 +1343,7 @@ encoded_footer_statistics finish_statistic_blobs(Footer const& footer,
     if (num_file_blobs == 0) { return {}; }
 
     // Create empty file stats and merge groups
-    std::vector<statistics_chunk> h_stat_chunks(num_file_blobs);
+    auto h_stat_chunks = cudf::detail::make_host_vector<statistics_chunk>(num_file_blobs, stream);
     cudf::detail::hostdevice_vector<statistics_merge_group> stats_merge(num_file_blobs, stream);
     // Fill in stats_merge and stat_chunks on the host
     for (auto i = 0u; i < num_file_blobs; ++i) {
@@ -1677,39 +1684,39 @@ struct pushdown_null_masks {
   // Owning vector for masks in device memory
   std::vector<rmm::device_uvector<bitmask_type>> data;
   // Pointers to pushdown masks in device memory. Can be same for multiple columns.
-  std::vector<bitmask_type const*> masks;
+  cudf::detail::host_vector<bitmask_type const*> masks;
 };
 
 pushdown_null_masks init_pushdown_null_masks(orc_table_view& orc_table,
                                              rmm::cuda_stream_view stream)
 {
-  std::vector<bitmask_type const*> mask_ptrs;
-  mask_ptrs.reserve(orc_table.num_columns());
+  auto mask_ptrs =
+    cudf::detail::make_empty_host_vector<bitmask_type const*>(orc_table.num_columns(), stream);
   std::vector<rmm::device_uvector<bitmask_type>> pd_masks;
   for (auto const& col : orc_table.columns) {
     // Leaf columns don't need pushdown masks
     if (col.num_children() == 0) {
-      mask_ptrs.emplace_back(nullptr);
+      mask_ptrs.push_back({nullptr});
       continue;
     }
     auto const parent_pd_mask = col.is_child() ? mask_ptrs[col.parent_index()] : nullptr;
     auto const null_mask      = col.null_mask();
 
     if (null_mask == nullptr and parent_pd_mask == nullptr) {
-      mask_ptrs.emplace_back(nullptr);
+      mask_ptrs.push_back({nullptr});
       continue;
     }
     if (col.orc_kind() == STRUCT) {
       if (null_mask != nullptr and parent_pd_mask == nullptr) {
         // Reuse own null mask
-        mask_ptrs.emplace_back(null_mask);
+        mask_ptrs.push_back(null_mask);
       } else if (null_mask == nullptr and parent_pd_mask != nullptr) {
         // Reuse parent's pushdown mask
-        mask_ptrs.emplace_back(parent_pd_mask);
+        mask_ptrs.push_back(parent_pd_mask);
       } else {
         // Both are nullable, allocate new pushdown mask
         pd_masks.emplace_back(num_bitmask_words(col.size()), stream);
-        mask_ptrs.emplace_back(pd_masks.back().data());
+        mask_ptrs.push_back({pd_masks.back().data()});
 
         thrust::transform(rmm::exec_policy(stream),
                           null_mask,
@@ -1724,7 +1731,7 @@ pushdown_null_masks init_pushdown_null_masks(orc_table_view& orc_table,
       auto const child_col = orc_table.column(col.child_begin()[0]);
       // pushdown mask applies to child column(s); use the child column size
       pd_masks.emplace_back(num_bitmask_words(child_col.size()), stream);
-      mask_ptrs.emplace_back(pd_masks.back().data());
+      mask_ptrs.push_back({pd_masks.back().data()});
       pushdown_lists_null_mask(col, orc_table.d_columns, parent_pd_mask, pd_masks.back(), stream);
     }
   }
@@ -1815,8 +1822,7 @@ orc_table_view make_orc_table_view(table_view const& table,
     append_orc_column(table.column(col_idx), nullptr, table_meta.column_metadata[col_idx]);
   }
 
-  std::vector<TypeKind> type_kinds;
-  type_kinds.reserve(orc_columns.size());
+  auto type_kinds = cudf::detail::make_empty_host_vector<TypeKind>(orc_columns.size(), stream);
   std::transform(
     orc_columns.cbegin(), orc_columns.cend(), std::back_inserter(type_kinds), [](auto& orc_column) {
       return orc_column.orc_kind();
@@ -2299,7 +2305,7 @@ auto convert_table_to_orc_data(table_view const& input,
 
   // Decide stripe boundaries based on rowgroups and char counts
   auto segmentation =
-    calculate_segmentation(orc_table.columns, std::move(rowgroup_bounds), max_stripe_size);
+    calculate_segmentation(orc_table.columns, std::move(rowgroup_bounds), max_stripe_size, stream);
 
   auto stripe_dicts    = build_dictionaries(orc_table, segmentation, sort_dictionaries, stream);
   auto dec_chunk_sizes = decimal_chunk_sizes(orc_table, segmentation, stream);
diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp
index bd082befe0c..f5f8b3cfed9 100644
--- a/cpp/src/io/orc/writer_impl.hpp
+++ b/cpp/src/io/orc/writer_impl.hpp
@@ -78,10 +78,9 @@ struct orc_table_view {
  * Provides a container-like interface to iterate over rowgroup indices.
  */
 struct stripe_rowgroups {
-  uint32_t id;     // stripe id
-  uint32_t first;  // first rowgroup in the stripe
-  uint32_t size;   // number of rowgroups in the stripe
-  stripe_rowgroups(uint32_t id, uint32_t first, uint32_t size) : id{id}, first{first}, size{size} {}
+  size_type id;     // stripe id
+  size_type first;  // first rowgroup in the stripe
+  size_type size;   // number of rowgroups in the stripe
   [[nodiscard]] auto cbegin() const { return thrust::make_counting_iterator(first); }
   [[nodiscard]] auto cend() const { return thrust::make_counting_iterator(first + size); }
 };
@@ -125,7 +124,7 @@ class orc_streams {
  */
 struct file_segmentation {
   hostdevice_2dvector<rowgroup_rows> rowgroups;
-  std::vector<stripe_rowgroups> stripes;
+  cudf::detail::host_vector<stripe_rowgroups> stripes;
 
   auto num_rowgroups() const noexcept { return rowgroups.size().first; }
   auto num_stripes() const noexcept { return stripes.size(); }
diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp
index 11f4a00ee8b..481c1e9fcdd 100644
--- a/cpp/src/io/parquet/predicate_pushdown.cpp
+++ b/cpp/src/io/parquet/predicate_pushdown.cpp
@@ -141,11 +141,11 @@ struct stats_caster {
       // Local struct to hold host columns
       struct host_column {
         // using thrust::host_vector because std::vector<bool> uses bitmap instead of byte per bool.
-        thrust::host_vector<T> val;
+        cudf::detail::host_vector<T> val;
         std::vector<bitmask_type> null_mask;
         cudf::size_type null_count = 0;
-        host_column(size_type total_row_groups)
-          : val(total_row_groups),
+        host_column(size_type total_row_groups, rmm::cuda_stream_view stream)
+          : val{cudf::detail::make_host_vector<T>(total_row_groups, stream)},
             null_mask(
               cudf::util::div_rounding_up_safe<size_type>(
                 cudf::bitmask_allocation_size_bytes(total_row_groups), sizeof(bitmask_type)),
@@ -170,8 +170,14 @@ struct stats_caster {
                                           rmm::cuda_stream_view stream,
                                           rmm::device_async_resource_ref mr)
         {
-          std::vector<char> chars{};
-          std::vector<cudf::size_type> offsets(1, 0);
+          auto const total_char_count = std::accumulate(
+            host_strings.begin(), host_strings.end(), 0, [](auto sum, auto const& str) {
+              return sum + str.size_bytes();
+            });
+          auto chars = cudf::detail::make_empty_host_vector<char>(total_char_count, stream);
+          auto offsets =
+            cudf::detail::make_empty_host_vector<cudf::size_type>(host_strings.size() + 1, stream);
+          offsets.push_back(0);
           for (auto const& str : host_strings) {
             auto tmp =
               str.empty() ? std::string_view{} : std::string_view(str.data(), str.size_bytes());
@@ -206,8 +212,8 @@ struct stats_caster {
             null_count);
         }
       };  // local struct host_column
-      host_column min(total_row_groups);
-      host_column max(total_row_groups);
+      host_column min(total_row_groups, stream);
+      host_column max(total_row_groups, stream);
       size_type stats_idx = 0;
       for (size_t src_idx = 0; src_idx < row_group_indices.size(); ++src_idx) {
         for (auto const rg_idx : row_group_indices[src_idx]) {
diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu
index 05e0d8c0111..794750ab6d2 100644
--- a/cpp/src/io/parquet/reader_impl_chunking.cu
+++ b/cpp/src/io/parquet/reader_impl_chunking.cu
@@ -804,16 +804,16 @@ std::vector<row_range> compute_page_splits_by_row(device_span<cumulative_page_in
   rmm::device_buffer decomp_pages(
     cudf::util::round_up_safe(total_decomp_size, BUFFER_PADDING_MULTIPLE), stream);
 
-  std::vector<device_span<uint8_t const>> comp_in;
-  comp_in.reserve(num_comp_pages);
-  std::vector<device_span<uint8_t>> comp_out;
-  comp_out.reserve(num_comp_pages);
+  auto comp_in =
+    cudf::detail::make_empty_host_vector<device_span<uint8_t const>>(num_comp_pages, stream);
+  auto comp_out =
+    cudf::detail::make_empty_host_vector<device_span<uint8_t>>(num_comp_pages, stream);
 
   // vectors to save v2 def and rep level data, if any
-  std::vector<device_span<uint8_t const>> copy_in;
-  copy_in.reserve(num_comp_pages);
-  std::vector<device_span<uint8_t>> copy_out;
-  copy_out.reserve(num_comp_pages);
+  auto copy_in =
+    cudf::detail::make_empty_host_vector<device_span<uint8_t const>>(num_comp_pages, stream);
+  auto copy_out =
+    cudf::detail::make_empty_host_vector<device_span<uint8_t>>(num_comp_pages, stream);
 
   rmm::device_uvector<compression_result> comp_res(num_comp_pages, stream);
   thrust::fill(rmm::exec_policy_nosync(stream),
@@ -822,7 +822,6 @@ std::vector<row_range> compute_page_splits_by_row(device_span<cumulative_page_in
                compression_result{0, compression_status::FAILURE});
 
   size_t decomp_offset = 0;
-  int32_t start_pos    = 0;
   for (auto const& codec : codecs) {
     if (codec.num_pages == 0) { continue; }
 
@@ -836,56 +835,64 @@ std::vector<row_range> compute_page_splits_by_row(device_span<cumulative_page_in
       // input and output buffers. otherwise we'd have to keep both the compressed
       // and decompressed data.
       if (offset != 0) {
-        copy_in.emplace_back(page.page_data, offset);
-        copy_out.emplace_back(dst_base, offset);
+        copy_in.push_back({page.page_data, static_cast<size_t>(offset)});
+        copy_out.push_back({dst_base, static_cast<size_t>(offset)});
       }
-      comp_in.emplace_back(page.page_data + offset,
-                           static_cast<size_t>(page.compressed_page_size - offset));
-      comp_out.emplace_back(dst_base + offset,
-                            static_cast<size_t>(page.uncompressed_page_size - offset));
+      comp_in.push_back(
+        {page.page_data + offset, static_cast<size_t>(page.compressed_page_size - offset)});
+      comp_out.push_back(
+        {dst_base + offset, static_cast<size_t>(page.uncompressed_page_size - offset)});
       page.page_data = dst_base;
       decomp_offset += page.uncompressed_page_size;
     });
+  }
+  auto d_comp_in = cudf::detail::make_device_uvector_async(
+    comp_in, stream, rmm::mr::get_current_device_resource());
+  auto d_comp_out = cudf::detail::make_device_uvector_async(
+    comp_out, stream, rmm::mr::get_current_device_resource());
+
+  int32_t start_pos = 0;
+  for (auto const& codec : codecs) {
+    if (codec.num_pages == 0) { continue; }
+
+    device_span<device_span<uint8_t const> const> d_comp_in_view{d_comp_in.data() + start_pos,
+                                                                 codec.num_pages};
+
+    device_span<device_span<uint8_t> const> d_comp_out_view(d_comp_out.data() + start_pos,
+                                                            codec.num_pages);
 
-    host_span<device_span<uint8_t const> const> comp_in_view{comp_in.data() + start_pos,
-                                                             codec.num_pages};
-    auto const d_comp_in = cudf::detail::make_device_uvector_async(
-      comp_in_view, stream, rmm::mr::get_current_device_resource());
-    host_span<device_span<uint8_t> const> comp_out_view(comp_out.data() + start_pos,
-                                                        codec.num_pages);
-    auto const d_comp_out = cudf::detail::make_device_uvector_async(
-      comp_out_view, stream, rmm::mr::get_current_device_resource());
     device_span<compression_result> d_comp_res_view(comp_res.data() + start_pos, codec.num_pages);
 
     switch (codec.compression_type) {
       case GZIP:
-        gpuinflate(d_comp_in, d_comp_out, d_comp_res_view, gzip_header_included::YES, stream);
+        gpuinflate(
+          d_comp_in_view, d_comp_out_view, d_comp_res_view, gzip_header_included::YES, stream);
         break;
       case SNAPPY:
         if (cudf::io::nvcomp_integration::is_stable_enabled()) {
           nvcomp::batched_decompress(nvcomp::compression_type::SNAPPY,
-                                     d_comp_in,
-                                     d_comp_out,
+                                     d_comp_in_view,
+                                     d_comp_out_view,
                                      d_comp_res_view,
                                      codec.max_decompressed_size,
                                      codec.total_decomp_size,
                                      stream);
         } else {
-          gpu_unsnap(d_comp_in, d_comp_out, d_comp_res_view, stream);
+          gpu_unsnap(d_comp_in_view, d_comp_out, d_comp_res_view, stream);
         }
         break;
       case ZSTD:
         nvcomp::batched_decompress(nvcomp::compression_type::ZSTD,
-                                   d_comp_in,
-                                   d_comp_out,
+                                   d_comp_in_view,
+                                   d_comp_out_view,
                                    d_comp_res_view,
                                    codec.max_decompressed_size,
                                    codec.total_decomp_size,
                                    stream);
         break;
       case BROTLI:
-        gpu_debrotli(d_comp_in,
-                     d_comp_out,
+        gpu_debrotli(d_comp_in_view,
+                     d_comp_out_view,
                      d_comp_res_view,
                      debrotli_scratch.data(),
                      debrotli_scratch.size(),
@@ -893,8 +900,8 @@ std::vector<row_range> compute_page_splits_by_row(device_span<cumulative_page_in
         break;
       case LZ4_RAW:
         nvcomp::batched_decompress(nvcomp::compression_type::LZ4,
-                                   d_comp_in,
-                                   d_comp_out,
+                                   d_comp_in_view,
+                                   d_comp_out_view,
                                    d_comp_res_view,
                                    codec.max_decompressed_size,
                                    codec.total_decomp_size,
@@ -1127,9 +1134,8 @@ void include_decompression_scratch_size(device_span<ColumnChunkDesc const> chunk
                                 decomp_sum{});
 
   // retrieve to host so we can call nvcomp to get compression scratch sizes
-  std::vector<decompression_info> h_decomp_info =
-    cudf::detail::make_std_vector_sync(decomp_info, stream);
-  std::vector<size_t> temp_cost(pages.size());
+  auto h_decomp_info = cudf::detail::make_host_vector_sync(decomp_info, stream);
+  auto temp_cost     = cudf::detail::make_host_vector<size_t>(pages.size(), stream);
   thrust::transform(thrust::host,
                     h_decomp_info.begin(),
                     h_decomp_info.end(),
diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu
index ff47dfc4cf3..e006cc7d714 100644
--- a/cpp/src/io/parquet/reader_impl_preprocess.cu
+++ b/cpp/src/io/parquet/reader_impl_preprocess.cu
@@ -370,7 +370,7 @@ void fill_in_page_info(host_span<ColumnChunkDesc> chunks,
                        rmm::cuda_stream_view stream)
 {
   auto const num_pages = pages.size();
-  std::vector<page_index_info> page_indexes(num_pages);
+  auto page_indexes    = cudf::detail::make_host_vector<page_index_info>(num_pages, stream);
 
   for (size_t c = 0, page_count = 0; c < chunks.size(); c++) {
     auto const& chunk = chunks[c];
@@ -1031,8 +1031,8 @@ struct get_page_num_rows {
 };
 
 struct input_col_info {
-  int const schema_idx;
-  size_type const nesting_depth;
+  int schema_idx;
+  size_type nesting_depth;
 };
 
 /**
@@ -1523,8 +1523,8 @@ void reader::impl::allocate_columns(read_mode mode, size_t skip_rows, size_t num
 
   // compute output column sizes by examining the pages of the -input- columns
   if (has_lists) {
-    std::vector<input_col_info> h_cols_info;
-    h_cols_info.reserve(_input_columns.size());
+    auto h_cols_info =
+      cudf::detail::make_empty_host_vector<input_col_info>(_input_columns.size(), _stream);
     std::transform(_input_columns.cbegin(),
                    _input_columns.cend(),
                    std::back_inserter(h_cols_info),
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index 8413e716224..2df71b77301 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -1824,7 +1824,8 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
   size_type max_page_fragment_size =
     max_page_fragment_size_opt.value_or(default_max_page_fragment_size);
 
-  std::vector<size_type> column_frag_size(num_columns, max_page_fragment_size);
+  auto column_frag_size = cudf::detail::make_host_vector<size_type>(num_columns, stream);
+  std::fill(column_frag_size.begin(), column_frag_size.end(), max_page_fragment_size);
 
   if (input.num_rows() > 0 && not max_page_fragment_size_opt.has_value()) {
     std::vector<size_t> column_sizes;
@@ -1880,7 +1881,9 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
 
   size_type num_fragments = std::reduce(num_frag_in_part.begin(), num_frag_in_part.end());
 
-  std::vector<int> part_frag_offset;  // Store the idx of the first fragment in each partition
+  auto part_frag_offset =
+    cudf::detail::make_empty_host_vector<int>(num_frag_in_part.size() + 1, stream);
+  // Store the idx of the first fragment in each partition
   std::exclusive_scan(
     num_frag_in_part.begin(), num_frag_in_part.end(), std::back_inserter(part_frag_offset), 0);
   part_frag_offset.push_back(part_frag_offset.back() + num_frag_in_part.back());
diff --git a/cpp/src/lists/dremel.cu b/cpp/src/lists/dremel.cu
index 5625e1bf05c..50f40924478 100644
--- a/cpp/src/lists/dremel.cu
+++ b/cpp/src/lists/dremel.cu
@@ -257,10 +257,8 @@ dremel_data get_encoding(column_view h_col,
     },
     stream);
 
-  thrust::host_vector<size_type> column_offsets =
-    cudf::detail::make_host_vector_async(d_column_offsets, stream);
-  thrust::host_vector<size_type> column_ends =
-    cudf::detail::make_host_vector_async(d_column_ends, stream);
+  auto column_offsets = cudf::detail::make_host_vector_async(d_column_offsets, stream);
+  auto column_ends    = cudf::detail::make_host_vector_async(d_column_ends, stream);
   stream.synchronize();
 
   size_t max_vals_size = 0;
diff --git a/cpp/src/strings/combine/join.cu b/cpp/src/strings/combine/join.cu
index c4cc0dbe09d..b534e9b2e5b 100644
--- a/cpp/src/strings/combine/join.cu
+++ b/cpp/src/strings/combine/join.cu
@@ -169,8 +169,10 @@ std::unique_ptr<column> join_strings(strings_column_view const& input,
 
   // build the offsets: single string output has offsets [0,chars-size]
   auto offsets_column = [&] {
-    auto offsets = cudf::detail::make_device_uvector_async(
-      std::vector<size_type>({0, static_cast<size_type>(chars.size())}), stream, mr);
+    auto h_offsets = cudf::detail::make_host_vector<size_type>(2, stream);
+    h_offsets[0]   = 0;
+    h_offsets[1]   = chars.size();
+    auto offsets   = cudf::detail::make_device_uvector_async(h_offsets, stream, mr);
     return std::make_unique<column>(std::move(offsets), rmm::device_buffer{}, 0);
   }();
 
diff --git a/cpp/src/strings/convert/convert_datetime.cu b/cpp/src/strings/convert/convert_datetime.cu
index 2f4ebf97264..64a2107e17a 100644
--- a/cpp/src/strings/convert/convert_datetime.cu
+++ b/cpp/src/strings/convert/convert_datetime.cu
@@ -123,7 +123,7 @@ struct format_compiler {
     : format(fmt), d_items(0, stream)
   {
     specifiers.insert(extra_specifiers.begin(), extra_specifiers.end());
-    std::vector<format_item> items;
+    auto items  = cudf::detail::make_empty_host_vector<format_item>(format.length(), stream);
     auto str    = format.data();
     auto length = format.length();
     while (length > 0) {
diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu
index 7622e39e735..352e0f9f41a 100644
--- a/cpp/src/strings/copying/concatenate.cu
+++ b/cpp/src/strings/copying/concatenate.cu
@@ -79,7 +79,7 @@ auto create_strings_device_views(host_span<column_view const> views, rmm::cuda_s
 
   // Compute the partition offsets and size of offset column
   // Note: Using 64-bit size_t so we can detect overflow of 32-bit size_type
-  auto input_offsets = std::vector<size_t>(views.size() + 1);
+  auto input_offsets = cudf::detail::make_host_vector<size_t>(views.size() + 1, stream);
   auto offset_it     = std::next(input_offsets.begin());
   thrust::transform(
     thrust::host, views.begin(), views.end(), offset_it, [](auto const& col) -> size_t {
diff --git a/cpp/src/strings/filter_chars.cu b/cpp/src/strings/filter_chars.cu
index a34828fa97e..48620af8cad 100644
--- a/cpp/src/strings/filter_chars.cu
+++ b/cpp/src/strings/filter_chars.cu
@@ -129,7 +129,7 @@ std::unique_ptr<column> filter_characters(
 
   // convert input table for copy to device memory
   size_type table_size = static_cast<size_type>(characters_to_filter.size());
-  thrust::host_vector<char_range> htable(table_size);
+  auto htable          = cudf::detail::make_host_vector<char_range>(table_size, stream);
   std::transform(
     characters_to_filter.begin(), characters_to_filter.end(), htable.begin(), [](auto entry) {
       return char_range{entry.first, entry.second};
diff --git a/cpp/src/strings/replace/multi_re.cu b/cpp/src/strings/replace/multi_re.cu
index cd60a4296b9..31234ea42ec 100644
--- a/cpp/src/strings/replace/multi_re.cu
+++ b/cpp/src/strings/replace/multi_re.cu
@@ -171,7 +171,7 @@ std::unique_ptr<column> replace_re(strings_column_view const& input,
   auto d_buffer          = rmm::device_buffer(buffer_size, stream);
 
   // copy all the reprog_device instances to a device memory array
-  std::vector<reprog_device> progs;
+  auto progs = cudf::detail::make_empty_host_vector<reprog_device>(h_progs.size(), stream);
   std::transform(h_progs.begin(),
                  h_progs.end(),
                  std::back_inserter(progs),
diff --git a/cpp/src/strings/translate.cu b/cpp/src/strings/translate.cu
index 16b22d0de4c..a242b008a54 100644
--- a/cpp/src/strings/translate.cu
+++ b/cpp/src/strings/translate.cu
@@ -97,7 +97,7 @@ std::unique_ptr<column> translate(strings_column_view const& strings,
 
   size_type table_size = static_cast<size_type>(chars_table.size());
   // convert input table
-  thrust::host_vector<translate_table> htable(table_size);
+  auto htable = cudf::detail::make_host_vector<translate_table>(table_size, stream);
   std::transform(chars_table.begin(), chars_table.end(), htable.begin(), [](auto entry) {
     return translate_table{entry.first, entry.second};
   });
diff --git a/cpp/src/table/row_operators.cu b/cpp/src/table/row_operators.cu
index 13c31e8ae4c..2969557c78f 100644
--- a/cpp/src/table/row_operators.cu
+++ b/cpp/src/table/row_operators.cu
@@ -308,7 +308,10 @@ auto decompose_structs(table_view table,
 auto list_lex_preprocess(table_view const& table, rmm::cuda_stream_view stream)
 {
   std::vector<detail::dremel_data> dremel_data;
-  std::vector<detail::dremel_device_view> dremel_device_views;
+  auto const num_list_columns = std::count_if(
+    table.begin(), table.end(), [](auto const& col) { return col.type().id() == type_id::LIST; });
+  auto dremel_device_views =
+    cudf::detail::make_empty_host_vector<detail::dremel_device_view>(num_list_columns, stream);
   for (auto const& col : table) {
     if (col.type().id() == type_id::LIST) {
       dremel_data.push_back(detail::get_comparator_data(col, {}, false, stream));
diff --git a/cpp/src/utilities/cuda_memcpy.cu b/cpp/src/utilities/cuda_memcpy.cu
index 3d0822d8545..0efb881eb3e 100644
--- a/cpp/src/utilities/cuda_memcpy.cu
+++ b/cpp/src/utilities/cuda_memcpy.cu
@@ -14,6 +14,9 @@
  * limitations under the License.
  */
 
+#include "cudf/detail/utilities/integer_utils.hpp"
+
+#include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/detail/utilities/cuda_memcpy.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/pinned_memory.hpp>
@@ -26,15 +29,24 @@ namespace cudf::detail {
 
 namespace {
 
+// Simple kernel to copy between device buffers
+CUDF_KERNEL void copy_kernel(char const* src, char* dst, size_t n)
+{
+  auto const idx = cudf::detail::grid_1d::global_thread_id();
+  if (idx < n) { dst[idx] = src[idx]; }
+}
+
 void copy_pinned(void* dst, void const* src, std::size_t size, rmm::cuda_stream_view stream)
 {
   if (size == 0) return;
 
   if (size < get_kernel_pinned_copy_threshold()) {
-    thrust::copy_n(rmm::exec_policy_nosync(stream),
-                   static_cast<const char*>(src),
-                   size,
-                   static_cast<char*>(dst));
+    const int block_size = 256;
+    auto const grid_size = cudf::util::div_rounding_up_safe<size_t>(size, block_size);
+    // We are explicitly launching the kernel here instead of calling a thrust function because the
+    // thrust function can potentially call cudaMemcpyAsync instead of using a kernel
+    copy_kernel<<<grid_size, block_size, 0, stream.value()>>>(
+      static_cast<char const*>(src), static_cast<char*>(dst), size);
   } else {
     CUDF_CUDA_TRY(cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, stream));
   }
diff --git a/cpp/src/utilities/pinned_memory.cpp b/cpp/src/utilities/host_memory.cpp
similarity index 73%
rename from cpp/src/utilities/pinned_memory.cpp
rename to cpp/src/utilities/host_memory.cpp
index 3ea4293fc60..7c3cea42023 100644
--- a/cpp/src/utilities/pinned_memory.cpp
+++ b/cpp/src/utilities/host_memory.cpp
@@ -83,7 +83,7 @@ class fixed_pinned_pool_memory_resource {
   void deallocate_async(void* ptr,
                         std::size_t bytes,
                         std::size_t alignment,
-                        cuda::stream_ref stream) noexcept
+                        cuda::stream_ref stream)
   {
     if (bytes <= pool_size_ && ptr >= pool_begin_ && ptr < pool_end_) {
       pool_->deallocate_async(ptr, bytes, alignment, stream);
@@ -92,14 +92,14 @@ class fixed_pinned_pool_memory_resource {
     }
   }
 
-  void deallocate_async(void* ptr, std::size_t bytes, cuda::stream_ref stream) noexcept
+  void deallocate_async(void* ptr, std::size_t bytes, cuda::stream_ref stream)
   {
     return deallocate_async(ptr, bytes, rmm::RMM_DEFAULT_HOST_ALIGNMENT, stream);
   }
 
   void deallocate(void* ptr,
                   std::size_t bytes,
-                  std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT) noexcept
+                  std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT)
   {
     deallocate_async(ptr, bytes, alignment, stream_);
     stream_.wait();
@@ -186,6 +186,61 @@ CUDF_EXPORT rmm::host_device_async_resource_ref& host_mr()
   return mr_ref;
 }
 
+class new_delete_memory_resource {
+ public:
+  void* allocate(std::size_t bytes, std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT)
+  {
+    try {
+      return rmm::detail::aligned_host_allocate(
+        bytes, alignment, [](std::size_t size) { return ::operator new(size); });
+    } catch (std::bad_alloc const& e) {
+      CUDF_FAIL("Failed to allocate memory: " + std::string{e.what()}, rmm::out_of_memory);
+    }
+  }
+
+  void* allocate_async(std::size_t bytes, [[maybe_unused]] cuda::stream_ref stream)
+  {
+    return allocate(bytes, rmm::RMM_DEFAULT_HOST_ALIGNMENT);
+  }
+
+  void* allocate_async(std::size_t bytes,
+                       std::size_t alignment,
+                       [[maybe_unused]] cuda::stream_ref stream)
+  {
+    return allocate(bytes, alignment);
+  }
+
+  void deallocate(void* ptr,
+                  std::size_t bytes,
+                  std::size_t alignment = rmm::RMM_DEFAULT_HOST_ALIGNMENT)
+  {
+    rmm::detail::aligned_host_deallocate(
+      ptr, bytes, alignment, [](void* ptr) { ::operator delete(ptr); });
+  }
+
+  void deallocate_async(void* ptr,
+                        std::size_t bytes,
+                        std::size_t alignment,
+                        [[maybe_unused]] cuda::stream_ref stream)
+  {
+    deallocate(ptr, bytes, alignment);
+  }
+
+  void deallocate_async(void* ptr, std::size_t bytes, cuda::stream_ref stream)
+  {
+    deallocate(ptr, bytes, rmm::RMM_DEFAULT_HOST_ALIGNMENT);
+  }
+
+  bool operator==(new_delete_memory_resource const& other) const { return true; }
+
+  bool operator!=(new_delete_memory_resource const& other) const { return !operator==(other); }
+
+  friend void get_property(new_delete_memory_resource const&, cuda::mr::host_accessible) noexcept {}
+};
+
+static_assert(cuda::mr::resource_with<new_delete_memory_resource, cuda::mr::host_accessible>,
+              "Pageable pool mr must be accessible from the host");
+
 }  // namespace
 
 rmm::host_device_async_resource_ref set_pinned_memory_resource(
@@ -225,4 +280,29 @@ void set_kernel_pinned_copy_threshold(size_t threshold)
 
 size_t get_kernel_pinned_copy_threshold() { return kernel_pinned_copy_threshold(); }
 
+CUDF_EXPORT auto& allocate_host_as_pinned_threshold()
+{
+  // use pageable memory for all host allocations
+  static std::atomic<size_t> threshold = 0;
+  return threshold;
+}
+
+void set_allocate_host_as_pinned_threshold(size_t threshold)
+{
+  allocate_host_as_pinned_threshold() = threshold;
+}
+
+size_t get_allocate_host_as_pinned_threshold() { return allocate_host_as_pinned_threshold(); }
+
+namespace detail {
+
+CUDF_EXPORT rmm::host_async_resource_ref get_pageable_memory_resource()
+{
+  static new_delete_memory_resource mr{};
+  static rmm::host_async_resource_ref mr_ref{mr};
+  return mr_ref;
+}
+
+}  // namespace detail
+
 }  // namespace cudf
diff --git a/cpp/tests/io/json/json_tree.cpp b/cpp/tests/io/json/json_tree.cpp
index 7a72b77e1fb..8bcd5790e99 100644
--- a/cpp/tests/io/json/json_tree.cpp
+++ b/cpp/tests/io/json/json_tree.cpp
@@ -235,10 +235,8 @@ tree_meta_t2 get_tree_representation_cpu(
 {
   constexpr bool include_quote_char = true;
   // Copy the JSON tokens to the host
-  thrust::host_vector<cuio_json::PdaTokenT> tokens =
-    cudf::detail::make_host_vector_async(tokens_gpu, stream);
-  thrust::host_vector<cuio_json::SymbolOffsetT> token_indices =
-    cudf::detail::make_host_vector_async(token_indices_gpu1, stream);
+  auto tokens        = cudf::detail::make_host_vector_async(tokens_gpu, stream);
+  auto token_indices = cudf::detail::make_host_vector_async(token_indices_gpu1, stream);
 
   // Make sure tokens have been copied to the host
   stream.synchronize();
diff --git a/cpp/tests/strings/integers_tests.cpp b/cpp/tests/strings/integers_tests.cpp
index 51e9b3bd0a0..7a038fa6d75 100644
--- a/cpp/tests/strings/integers_tests.cpp
+++ b/cpp/tests/strings/integers_tests.cpp
@@ -294,7 +294,7 @@ TYPED_TEST(StringsIntegerConvertTest, FromToInteger)
   std::iota(h_integers.begin(), h_integers.end(), -(TypeParam)(h_integers.size() / 2));
   h_integers.push_back(std::numeric_limits<TypeParam>::min());
   h_integers.push_back(std::numeric_limits<TypeParam>::max());
-  auto d_integers = cudf::detail::make_device_uvector_sync(
+  auto const d_integers = cudf::detail::make_device_uvector_sync(
     h_integers, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
   auto integers      = cudf::make_numeric_column(cudf::data_type{cudf::type_to_id<TypeParam>()},
                                             (cudf::size_type)d_integers.size());
@@ -308,8 +308,6 @@ TYPED_TEST(StringsIntegerConvertTest, FromToInteger)
   // convert to strings
   auto results_strings = cudf::strings::from_integers(integers->view());
 
-  // copy back to host
-  h_integers = cudf::detail::make_host_vector_sync(d_integers, cudf::get_default_stream());
   std::vector<std::string> h_strings;
   for (auto itr = h_integers.begin(); itr != h_integers.end(); ++itr)
     h_strings.push_back(std::to_string(*itr));
diff --git a/cpp/tests/utilities_tests/pinned_memory_tests.cpp b/cpp/tests/utilities_tests/pinned_memory_tests.cpp
index df9103640f4..93259fd63ee 100644
--- a/cpp/tests/utilities_tests/pinned_memory_tests.cpp
+++ b/cpp/tests/utilities_tests/pinned_memory_tests.cpp
@@ -18,16 +18,33 @@
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/table_utilities.hpp>
 
+#include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/io/parquet.hpp>
+#include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/pinned_memory.hpp>
 
 #include <rmm/mr/device/pool_memory_resource.hpp>
 #include <rmm/mr/pinned_host_memory_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-class PinnedMemoryTest : public cudf::test::BaseFixture {};
+class PinnedMemoryTest : public cudf::test::BaseFixture {
+  size_t prev_copy_threshold;
+  size_t prev_alloc_threshold;
 
-TEST(PinnedMemoryTest, MemoryResourceGetAndSet)
+ public:
+  PinnedMemoryTest()
+    : prev_copy_threshold{cudf::get_kernel_pinned_copy_threshold()},
+      prev_alloc_threshold{cudf::get_allocate_host_as_pinned_threshold()}
+  {
+  }
+  ~PinnedMemoryTest() override
+  {
+    cudf::set_kernel_pinned_copy_threshold(prev_copy_threshold);
+    cudf::set_allocate_host_as_pinned_threshold(prev_alloc_threshold);
+  }
+};
+
+TEST_F(PinnedMemoryTest, MemoryResourceGetAndSet)
 {
   // Global environment for temporary files
   auto const temp_env = static_cast<cudf::test::TempDirTestEnvironment*>(
@@ -63,3 +80,49 @@ TEST(PinnedMemoryTest, MemoryResourceGetAndSet)
   // reset memory resource back
   cudf::set_pinned_memory_resource(last_mr);
 }
+
+TEST_F(PinnedMemoryTest, KernelCopyThresholdGetAndSet)
+{
+  cudf::set_kernel_pinned_copy_threshold(12345);
+  EXPECT_EQ(cudf::get_kernel_pinned_copy_threshold(), 12345);
+}
+
+TEST_F(PinnedMemoryTest, HostAsPinnedThresholdGetAndSet)
+{
+  cudf::set_allocate_host_as_pinned_threshold(12345);
+  EXPECT_EQ(cudf::get_allocate_host_as_pinned_threshold(), 12345);
+}
+
+TEST_F(PinnedMemoryTest, MakePinnedVector)
+{
+  cudf::set_allocate_host_as_pinned_threshold(0);
+
+  // should always use pinned memory
+  {
+    auto const vec = cudf::detail::make_pinned_vector_async<char>(1, cudf::get_default_stream());
+    EXPECT_TRUE(vec.get_allocator().is_device_accessible());
+  }
+}
+
+TEST_F(PinnedMemoryTest, MakeHostVector)
+{
+  cudf::set_allocate_host_as_pinned_threshold(7);
+
+  // allocate smaller than the threshold
+  {
+    auto const vec = cudf::detail::make_host_vector<int32_t>(1, cudf::get_default_stream());
+    EXPECT_TRUE(vec.get_allocator().is_device_accessible());
+  }
+
+  // allocate the same size as the threshold
+  {
+    auto const vec = cudf::detail::make_host_vector<char>(7, cudf::get_default_stream());
+    EXPECT_TRUE(vec.get_allocator().is_device_accessible());
+  }
+
+  // allocate larger than the threshold
+  {
+    auto const vec = cudf::detail::make_host_vector<int32_t>(2, cudf::get_default_stream());
+    EXPECT_FALSE(vec.get_allocator().is_device_accessible());
+  }
+}

From 75289c58f3d9ca11a51396e4adadfbd5f51856f5 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Tue, 23 Jul 2024 23:45:59 -0500
Subject: [PATCH 574/842] Rename PrefetchConfig to prefetch_config. (#16358)

This PR addresses a comment requesting a rename of `PrefetchConfig` to `prefetch_config`.

See: https://github.com/rapidsai/cudf/pull/16020#discussion_r1686284151

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Shruti Shivakumar (https://github.com/shrshi)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/16358
---
 cpp/include/cudf/utilities/prefetch.hpp | 10 +++++-----
 cpp/src/column/column_view.cpp          |  2 +-
 cpp/src/utilities/prefetch.cpp          | 21 ++++++++++++---------
 3 files changed, 18 insertions(+), 15 deletions(-)

diff --git a/cpp/include/cudf/utilities/prefetch.hpp b/cpp/include/cudf/utilities/prefetch.hpp
index 5ca6fd6f4b0..88c634a7cc7 100644
--- a/cpp/include/cudf/utilities/prefetch.hpp
+++ b/cpp/include/cudf/utilities/prefetch.hpp
@@ -31,17 +31,17 @@ namespace detail {
 /**
  * @brief A singleton class that manages the prefetching configuration.
  */
-class PrefetchConfig {
+class prefetch_config {
  public:
-  PrefetchConfig& operator=(const PrefetchConfig&) = delete;
-  PrefetchConfig(const PrefetchConfig&)            = delete;
+  prefetch_config& operator=(const prefetch_config&) = delete;
+  prefetch_config(const prefetch_config&)            = delete;
 
   /**
    * @brief Get the singleton instance of the prefetching configuration.
    *
    * @return The singleton instance of the prefetching configuration.
    */
-  static PrefetchConfig& instance();
+  static prefetch_config& instance();
 
   /**
    * @brief Get the value of a configuration key.
@@ -65,7 +65,7 @@ class PrefetchConfig {
   bool debug{false};
 
  private:
-  PrefetchConfig() = default;                 //< Private constructor to enforce singleton pattern
+  prefetch_config() = default;                //< Private constructor to enforce singleton pattern
   std::map<std::string, bool> config_values;  //< Map of configuration keys to values
 };
 
diff --git a/cpp/src/column/column_view.cpp b/cpp/src/column/column_view.cpp
index a9605efb362..b0f9e9f0e74 100644
--- a/cpp/src/column/column_view.cpp
+++ b/cpp/src/column/column_view.cpp
@@ -39,7 +39,7 @@ namespace {
 template <typename ColumnView>
 void prefetch_col_data(ColumnView& col, void const* data_ptr, std::string_view key) noexcept
 {
-  if (cudf::experimental::prefetch::detail::PrefetchConfig::instance().get(key)) {
+  if (cudf::experimental::prefetch::detail::prefetch_config::instance().get(key)) {
     if (cudf::is_fixed_width(col.type())) {
       cudf::experimental::prefetch::detail::prefetch_noexcept(
         key, data_ptr, col.size() * size_of(col.type()), cudf::get_default_stream());
diff --git a/cpp/src/utilities/prefetch.cpp b/cpp/src/utilities/prefetch.cpp
index 21f2e40c82a..16f2c3a1202 100644
--- a/cpp/src/utilities/prefetch.cpp
+++ b/cpp/src/utilities/prefetch.cpp
@@ -26,13 +26,13 @@ namespace cudf::experimental::prefetch {
 
 namespace detail {
 
-PrefetchConfig& PrefetchConfig::instance()
+prefetch_config& prefetch_config::instance()
 {
-  static PrefetchConfig instance;
+  static prefetch_config instance;
   return instance;
 }
 
-bool PrefetchConfig::get(std::string_view key)
+bool prefetch_config::get(std::string_view key)
 {
   // Default to not prefetching
   if (config_values.find(key.data()) == config_values.end()) {
@@ -40,7 +40,7 @@ bool PrefetchConfig::get(std::string_view key)
   }
   return config_values[key.data()];
 }
-void PrefetchConfig::set(std::string_view key, bool value) { config_values[key.data()] = value; }
+void prefetch_config::set(std::string_view key, bool value) { config_values[key.data()] = value; }
 
 cudaError_t prefetch_noexcept(std::string_view key,
                               void const* ptr,
@@ -48,8 +48,8 @@ cudaError_t prefetch_noexcept(std::string_view key,
                               rmm::cuda_stream_view stream,
                               rmm::cuda_device_id device_id) noexcept
 {
-  if (PrefetchConfig::instance().get(key)) {
-    if (PrefetchConfig::instance().debug) {
+  if (prefetch_config::instance().get(key)) {
+    if (prefetch_config::instance().debug) {
       std::cerr << "Prefetching " << size << " bytes for key " << key << " at location " << ptr
                 << std::endl;
     }
@@ -78,12 +78,15 @@ void prefetch(std::string_view key,
 
 }  // namespace detail
 
-void enable_prefetching(std::string_view key) { detail::PrefetchConfig::instance().set(key, true); }
+void enable_prefetching(std::string_view key)
+{
+  detail::prefetch_config::instance().set(key, true);
+}
 
 void disable_prefetching(std::string_view key)
 {
-  detail::PrefetchConfig::instance().set(key, false);
+  detail::prefetch_config::instance().set(key, false);
 }
 
-void prefetch_debugging(bool enable) { detail::PrefetchConfig::instance().debug = enable; }
+void prefetch_debugging(bool enable) { detail::prefetch_config::instance().debug = enable; }
 }  // namespace cudf::experimental::prefetch

From 8c1749b40eaa983966ed3bece6bdd29a4316d18a Mon Sep 17 00:00:00 2001
From: Kyle Edwards <kyedwards@nvidia.com>
Date: Wed, 24 Jul 2024 01:19:10 -0400
Subject: [PATCH 575/842] Use rapids_cpm_bs_thread_pool() (#16360)

Authors:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16360
---
 cpp/CMakeLists.txt                         |  2 +-
 cpp/cmake/thirdparty/get_thread_pool.cmake | 20 +++++++-------------
 2 files changed, 8 insertions(+), 14 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index a2c2dd3af4c..b044545bb08 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -807,7 +807,7 @@ add_dependencies(cudf jitify_preprocess_run)
 # Specify the target module library dependencies
 target_link_libraries(
   cudf
-  PUBLIC ${ARROW_LIBRARIES} CCCL::CCCL rmm::rmm $<BUILD_LOCAL_INTERFACE:BS_thread_pool>
+  PUBLIC ${ARROW_LIBRARIES} CCCL::CCCL rmm::rmm $<BUILD_LOCAL_INTERFACE:BS::thread_pool>
   PRIVATE $<BUILD_LOCAL_INTERFACE:nvtx3::nvtx3-cpp> cuco::cuco ZLIB::ZLIB nvcomp::nvcomp
           kvikio::kvikio $<TARGET_NAME_IF_EXISTS:cuFile_interface> nanoarrow
 )
diff --git a/cpp/cmake/thirdparty/get_thread_pool.cmake b/cpp/cmake/thirdparty/get_thread_pool.cmake
index 264257c7199..235bf409058 100644
--- a/cpp/cmake/thirdparty/get_thread_pool.cmake
+++ b/cpp/cmake/thirdparty/get_thread_pool.cmake
@@ -12,20 +12,14 @@
 # the License.
 # =============================================================================
 
-# This function finds rmm and sets any additional necessary environment variables.
+# Need to call rapids_cpm_bs_thread_pool to get support for an installed version of thread-pool and
+# to support installing it ourselves
 function(find_and_configure_thread_pool)
-  rapids_cpm_find(
-    BS_thread_pool 4.1.0
-    CPM_ARGS
-    GIT_REPOSITORY https://github.com/bshoshany/thread-pool.git
-    GIT_TAG 097aa718f25d44315cadb80b407144ad455ee4f9
-    GIT_SHALLOW TRUE
-  )
-  if(NOT TARGET BS_thread_pool)
-    add_library(BS_thread_pool INTERFACE)
-    target_include_directories(BS_thread_pool INTERFACE ${BS_thread_pool_SOURCE_DIR}/include)
-    target_compile_definitions(BS_thread_pool INTERFACE "BS_THREAD_POOL_ENABLE_PAUSE=1")
-  endif()
+  include(${rapids-cmake-dir}/cpm/bs_thread_pool.cmake)
+
+  # Find or install thread-pool
+  rapids_cpm_bs_thread_pool(BUILD_EXPORT_SET cudf-exports INSTALL_EXPORT_SET cudf-exports)
+
 endfunction()
 
 find_and_configure_thread_pool()

From 62625f1bfcdb980186a1afbec41e420fdb4a7075 Mon Sep 17 00:00:00 2001
From: Matt Topol <zotthewizard@gmail.com>
Date: Wed, 24 Jul 2024 03:42:03 -0400
Subject: [PATCH 576/842] Host implementation of `to_arrow` using nanoarrow
 (#16297)

Adds the corresponding `to_arrow_host` functions for interop using `ArrowDeviceArray`. This includes updating the version of nanoarrow in use to pick up some bug fixes and features.

Authors:
  - Matt Topol (https://github.com/zeroshade)
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16297
---
 cpp/CMakeLists.txt                         |    1 +
 cpp/cmake/thirdparty/get_nanoarrow.cmake   |    4 +-
 cpp/include/cudf/interop.hpp               |   80 +-
 cpp/include/cudf/interop/detail/arrow.hpp  |   53 -
 cpp/src/interop/arrow_utilities.cpp        |   31 +
 cpp/src/interop/arrow_utilities.hpp        |   43 +-
 cpp/src/interop/from_arrow_device.cu       |   10 +-
 cpp/src/interop/from_arrow_host.cu         |    2 +-
 cpp/src/interop/to_arrow.cu                |   33 +-
 cpp/src/interop/to_arrow_device.cu         |  101 +-
 cpp/src/interop/to_arrow_host.cu           |  428 ++++++++
 cpp/src/interop/to_arrow_schema.cpp        |    7 +-
 cpp/tests/CMakeLists.txt                   |    1 +
 cpp/tests/interop/nanoarrow_utils.hpp      |    9 +-
 cpp/tests/interop/to_arrow_device_test.cpp |    1 -
 cpp/tests/interop/to_arrow_host_test.cpp   | 1117 ++++++++++++++++++++
 16 files changed, 1760 insertions(+), 161 deletions(-)
 delete mode 100644 cpp/include/cudf/interop/detail/arrow.hpp
 create mode 100644 cpp/src/interop/to_arrow_host.cu
 create mode 100644 cpp/tests/interop/to_arrow_host_test.cpp

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index b044545bb08..24b683a930b 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -367,6 +367,7 @@ add_library(
   src/interop/arrow_utilities.cpp
   src/interop/to_arrow.cu
   src/interop/to_arrow_device.cu
+  src/interop/to_arrow_host.cu
   src/interop/from_arrow_device.cu
   src/interop/from_arrow_host.cu
   src/interop/from_arrow_stream.cu
diff --git a/cpp/cmake/thirdparty/get_nanoarrow.cmake b/cpp/cmake/thirdparty/get_nanoarrow.cmake
index 025bff7d8f0..8df1b431095 100644
--- a/cpp/cmake/thirdparty/get_nanoarrow.cmake
+++ b/cpp/cmake/thirdparty/get_nanoarrow.cmake
@@ -17,11 +17,11 @@ function(find_and_configure_nanoarrow)
   # Currently we need to always build nanoarrow so we don't pickup a previous installed version
   set(CPM_DOWNLOAD_nanoarrow ON)
   rapids_cpm_find(
-    nanoarrow 0.5.0
+    nanoarrow 0.6.0.dev
     GLOBAL_TARGETS nanoarrow
     CPM_ARGS
     GIT_REPOSITORY https://github.com/apache/arrow-nanoarrow.git
-    GIT_TAG 11e73a8c85b45e3d49c8c541b4e1497a649fe03c
+    GIT_TAG 1e2664a70ec14907409cadcceb14d79b9670bcdb
     GIT_SHALLOW FALSE
     OPTIONS "BUILD_SHARED_LIBS OFF" "NANOARROW_NAMESPACE cudf"
   )
diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp
index 11f6ce2bad7..61f7d72a467 100644
--- a/cpp/include/cudf/interop.hpp
+++ b/cpp/include/cudf/interop.hpp
@@ -136,6 +136,8 @@ struct column_metadata {
  * Converts the `cudf::table_view` to `arrow::Table` with the provided
  * metadata `column_names`.
  *
+ * @deprecated Since 24.08. Use cudf::to_arrow_host instead.
+ *
  * @throws cudf::logic_error if `column_names` size doesn't match with number of columns.
  *
  * @param input table_view that needs to be converted to arrow Table
@@ -150,16 +152,19 @@ struct column_metadata {
  * 9 which is the maximum precision for 32-bit types. Similarly, numeric::decimal128 will be
  * converted to Arrow decimal128 of the precision 38.
  */
-std::shared_ptr<arrow::Table> to_arrow(table_view input,
-                                       std::vector<column_metadata> const& metadata = {},
-                                       rmm::cuda_stream_view stream = cudf::get_default_stream(),
-                                       arrow::MemoryPool* ar_mr     = arrow::default_memory_pool());
+[[deprecated]] std::shared_ptr<arrow::Table> to_arrow(
+  table_view input,
+  std::vector<column_metadata> const& metadata = {},
+  rmm::cuda_stream_view stream                 = cudf::get_default_stream(),
+  arrow::MemoryPool* ar_mr                     = arrow::default_memory_pool());
 
 /**
  * @brief Create `arrow::Scalar` from cudf scalar `input`
  *
  * Converts the `cudf::scalar` to `arrow::Scalar`.
  *
+ * @deprecated Since 24.08.
+ *
  * @param input scalar that needs to be converted to arrow Scalar
  * @param metadata Contains hierarchy of names of columns and children
  * @param stream CUDA stream used for device memory operations and kernel launches
@@ -172,10 +177,11 @@ std::shared_ptr<arrow::Table> to_arrow(table_view input,
  * 9 which is the maximum precision for 32-bit types. Similarly, numeric::decimal128 will be
  * converted to Arrow decimal128 of the precision 38.
  */
-std::shared_ptr<arrow::Scalar> to_arrow(cudf::scalar const& input,
-                                        column_metadata const& metadata = {},
-                                        rmm::cuda_stream_view stream = cudf::get_default_stream(),
-                                        arrow::MemoryPool* ar_mr = arrow::default_memory_pool());
+[[deprecated]] std::shared_ptr<arrow::Scalar> to_arrow(
+  cudf::scalar const& input,
+  column_metadata const& metadata = {},
+  rmm::cuda_stream_view stream    = cudf::get_default_stream(),
+  arrow::MemoryPool* ar_mr        = arrow::default_memory_pool());
 
 /**
  * @brief typedef for a unique_ptr to an ArrowSchema with custom deleter
@@ -329,15 +335,67 @@ unique_device_array_t to_arrow_device(
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @brief Copy table view data to host and create `ArrowDeviceArray` for it
+ *
+ * Populates the C struct ArrowDeviceArray, copying the cudf data to the host. The
+ * returned ArrowDeviceArray will have a device_type of CPU and will have no ties
+ * to the memory referenced by the table view passed in. The deleter for the
+ * returned unique_ptr will call the release callback on the ArrowDeviceArray
+ * automatically.
+ *
+ * @note For decimals, since the precision is not stored for them in libcudf, it will
+ * be converted to an Arrow decimal128 that has the widest-precision the cudf decimal type
+ * supports. For example, numeric::decimal32 will be converted to Arrow decimal128 of the precision
+ * 9 which is the maximum precision for 32-bit types. Similarly, numeric::decimal128 will be
+ * converted to Arrow decimal128 of precision 38.
+ *
+ * @param table Input table
+ * @param stream CUDA stream used for the device memory operations and kernel launches
+ * @param mr Device memory resource used for any allocations during conversion
+ * @return ArrowDeviceArray generated from input table
+ */
+unique_device_array_t to_arrow_host(
+  cudf::table_view const& table,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Copy column view data to host and create `ArrowDeviceArray` for it
+ *
+ * Populates the C struct ArrowDeviceArray, copying the cudf data to the host. The
+ * returned ArrowDeviceArray will have a device_type of CPU and will have no ties
+ * to the memory referenced by the column view passed in. The deleter for the
+ * returned unique_ptr will call the release callback on the ArrowDeviceArray
+ * automatically.
+ *
+ * @note For decimals, since the precision is not stored for them in libcudf, it will
+ * be converted to an Arrow decimal128 that has the widest-precision the cudf decimal type
+ * supports. For example, numeric::decimal32 will be converted to Arrow decimal128 of the precision
+ * 9 which is the maximum precision for 32-bit types. Similarly, numeric::decimal128 will be
+ * converted to Arrow decimal128 of precision 38.
+ *
+ * @param col Input column
+ * @param stream CUDA stream used for the device memory operations and kernel launches
+ * @param mr Device memory resource used for any allocations during conversion
+ * @return ArrowDeviceArray generated from input column
+ */
+unique_device_array_t to_arrow_host(
+  cudf::column_view const& col,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+
 /**
  * @brief Create `cudf::table` from given arrow Table input
  *
+ * @deprecated Since 24.08. Use cudf::from_arrow_host instead.
+ *
  * @param input arrow:Table that needs to be converted to `cudf::table`
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr    Device memory resource used to allocate `cudf::table`
  * @return cudf table generated from given arrow Table
  */
-std::unique_ptr<table> from_arrow(
+[[deprecated]] std::unique_ptr<table> from_arrow(
   arrow::Table const& input,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
@@ -345,12 +403,14 @@ std::unique_ptr<table> from_arrow(
 /**
  * @brief Create `cudf::scalar` from given arrow Scalar input
  *
+ * @deprecated Since 24.08.
+ *
  * @param input `arrow::Scalar` that needs to be converted to `cudf::scalar`
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr    Device memory resource used to allocate `cudf::scalar`
  * @return cudf scalar generated from given arrow Scalar
  */
-std::unique_ptr<cudf::scalar> from_arrow(
+[[deprecated]] std::unique_ptr<cudf::scalar> from_arrow(
   arrow::Scalar const& input,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
diff --git a/cpp/include/cudf/interop/detail/arrow.hpp b/cpp/include/cudf/interop/detail/arrow.hpp
deleted file mode 100644
index 906d48f636b..00000000000
--- a/cpp/include/cudf/interop/detail/arrow.hpp
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <nanoarrow/nanoarrow.hpp>
-
-// from Arrow C Device Data Interface
-// https://arrow.apache.org/docs/format/CDeviceDataInterface.html
-#ifndef ARROW_C_DEVICE_DATA_INTERFACE
-#define ARROW_C_DEVICE_DATA_INTERFACE
-
-// Device type for the allocated memory
-using ArrowDeviceType = int32_t;
-
-// The Arrow spec specifies using macros rather than enums here to avoid being
-// susceptible to changes in the underlying type chosen by the compiler, but
-// clang-tidy doesn't like this.
-// NOLINTBEGIN
-// CPU device, same as using ArrowArray directly
-#define ARROW_DEVICE_CPU 1
-// CUDA GPU Device
-#define ARROW_DEVICE_CUDA 2
-// Pinned CUDA CPU memory by cudaMallocHost
-#define ARROW_DEVICE_CUDA_HOST 3
-// CUDA managed/unified memory allocated by cudaMallocManaged
-#define ARROW_DEVICE_CUDA_MANAGED 13
-// NOLINTEND
-
-struct ArrowDeviceArray {
-  struct ArrowArray array;
-  int64_t device_id;
-  ArrowDeviceType device_type;
-  void* sync_event;
-
-  // reserved bytes for future expansion
-  int64_t reserved[3];
-};
-
-#endif  // ARROW_C_DEVICE_DATA_INTERFACE
diff --git a/cpp/src/interop/arrow_utilities.cpp b/cpp/src/interop/arrow_utilities.cpp
index 605d813ed1e..4292552a800 100644
--- a/cpp/src/interop/arrow_utilities.cpp
+++ b/cpp/src/interop/arrow_utilities.cpp
@@ -16,9 +16,16 @@
 
 #include "arrow_utilities.hpp"
 
+#include <cudf/column/column_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/error.hpp>
 
+#include <rmm/exec_policy.hpp>
+#include <rmm/resource_ref.hpp>
+
+#include <thrust/for_each.h>
+#include <thrust/iterator/counting_iterator.h>
+
 #include <nanoarrow/nanoarrow.h>
 
 namespace cudf {
@@ -83,9 +90,33 @@ ArrowType id_to_arrow_type(cudf::type_id id)
     case cudf::type_id::FLOAT32: return NANOARROW_TYPE_FLOAT;
     case cudf::type_id::FLOAT64: return NANOARROW_TYPE_DOUBLE;
     case cudf::type_id::TIMESTAMP_DAYS: return NANOARROW_TYPE_DATE32;
+    case cudf::type_id::DECIMAL128: return NANOARROW_TYPE_DECIMAL128;
     default: CUDF_FAIL("Unsupported type_id conversion to arrow type", cudf::data_type_error);
   }
 }
 
+ArrowType id_to_arrow_storage_type(cudf::type_id id)
+{
+  switch (id) {
+    case cudf::type_id::TIMESTAMP_SECONDS:
+    case cudf::type_id::TIMESTAMP_MILLISECONDS:
+    case cudf::type_id::TIMESTAMP_MICROSECONDS:
+    case cudf::type_id::TIMESTAMP_NANOSECONDS: return NANOARROW_TYPE_INT64;
+    case cudf::type_id::DURATION_SECONDS:
+    case cudf::type_id::DURATION_MILLISECONDS:
+    case cudf::type_id::DURATION_MICROSECONDS:
+    case cudf::type_id::DURATION_NANOSECONDS: return NANOARROW_TYPE_INT64;
+    default: return id_to_arrow_type(id);
+  }
+}
+
+int initialize_array(ArrowArray* arr, ArrowType storage_type, cudf::column_view column)
+{
+  NANOARROW_RETURN_NOT_OK(ArrowArrayInitFromType(arr, storage_type));
+  arr->length     = column.size();
+  arr->null_count = column.null_count();
+  return NANOARROW_OK;
+}
+
 }  // namespace detail
 }  // namespace cudf
diff --git a/cpp/src/interop/arrow_utilities.hpp b/cpp/src/interop/arrow_utilities.hpp
index 4e2628ab689..1cee3071fcb 100644
--- a/cpp/src/interop/arrow_utilities.hpp
+++ b/cpp/src/interop/arrow_utilities.hpp
@@ -18,8 +18,12 @@
 
 #include <cudf/types.hpp>
 
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_buffer.hpp>
+#include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
+
 #include <nanoarrow/nanoarrow.h>
-#include <nanoarrow/nanoarrow_types.h>
 
 namespace cudf {
 namespace detail {
@@ -47,5 +51,42 @@ data_type arrow_to_cudf_type(ArrowSchemaView const* arrow_view);
  */
 ArrowType id_to_arrow_type(cudf::type_id id);
 
+/**
+ * @brief Map cudf column type id to the storage type for Arrow
+ *
+ * Specifically this is for handling the underlying storage type of
+ * timestamps and durations.
+ *
+ * @param id column type id
+ * @return ArrowType storage type
+ */
+ArrowType id_to_arrow_storage_type(cudf::type_id id);
+
+/**
+ * @brief Helper to initialize ArrowArray struct
+ *
+ * @param arr Pointer to ArrowArray to initialize
+ * @param storage_type The type to initialize with
+ * @param column view for column to get the length and null count from
+ * @return nanoarrow status code, should be NANOARROW_OK if there are no errors
+ */
+int initialize_array(ArrowArray* arr, ArrowType storage_type, cudf::column_view column);
+
+/**
+ * @brief Helper to convert decimal values to 128-bit versions for Arrow compatibility
+ *
+ * The template parameter should be the underlying type of the data (e.g. int32_t for
+ * 32-bit decimal and int64_t for 64-bit decimal).
+ *
+ * @param input column_view of the data
+ * @param stream cuda stream to perform the operations on
+ * @param mr memory resource to allocate the returned device_uvector with
+ * @return unique_ptr to a device_buffer containing the upcasted data
+ */
+template <typename DeviceType>
+std::unique_ptr<rmm::device_buffer> decimals_to_arrow(cudf::column_view input,
+                                                      rmm::cuda_stream_view stream,
+                                                      rmm::device_async_resource_ref mr);
+
 }  // namespace detail
 }  // namespace cudf
diff --git a/cpp/src/interop/from_arrow_device.cu b/cpp/src/interop/from_arrow_device.cu
index e1d289e67a3..440df571de0 100644
--- a/cpp/src/interop/from_arrow_device.cu
+++ b/cpp/src/interop/from_arrow_device.cu
@@ -25,7 +25,6 @@
 #include <cudf/detail/transform.hpp>
 #include <cudf/detail/unary.hpp>
 #include <cudf/interop.hpp>
-#include <cudf/interop/detail/arrow.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
@@ -39,6 +38,7 @@
 
 #include <nanoarrow/nanoarrow.h>
 #include <nanoarrow/nanoarrow.hpp>
+#include <nanoarrow/nanoarrow_device.h>
 
 namespace cudf {
 
@@ -144,9 +144,6 @@ dispatch_tuple_t dispatch_from_arrow_device::operator()<cudf::string_view>(
   rmm::cuda_stream_view stream,
   rmm::device_async_resource_ref mr)
 {
-  CUDF_EXPECTS(schema->type != NANOARROW_TYPE_LARGE_STRING,
-               "Large strings are not yet supported in from_arrow_device",
-               cudf::data_type_error);
   if (input->length == 0) {
     return std::make_tuple<column_view, owned_columns_t>(
       {type,
@@ -158,12 +155,15 @@ dispatch_tuple_t dispatch_from_arrow_device::operator()<cudf::string_view>(
       {});
   }
 
-  auto offsets_view = column_view{data_type(type_id::INT32),
+  data_type offsets_type(type_id::INT32);
+  if (schema->type == NANOARROW_TYPE_LARGE_STRING) { offsets_type = data_type(type_id::INT64); }
+  auto offsets_view = column_view{offsets_type,
                                   static_cast<size_type>(input->offset + input->length) + 1,
                                   input->buffers[fixed_width_data_buffer_idx],
                                   nullptr,
                                   0,
                                   0};
+
   return std::make_tuple<column_view, owned_columns_t>(
     {type,
      static_cast<size_type>(input->length),
diff --git a/cpp/src/interop/from_arrow_host.cu b/cpp/src/interop/from_arrow_host.cu
index b3087dedf98..efde8f2a463 100644
--- a/cpp/src/interop/from_arrow_host.cu
+++ b/cpp/src/interop/from_arrow_host.cu
@@ -28,7 +28,6 @@
 #include <cudf/detail/unary.hpp>
 #include <cudf/dictionary/dictionary_factories.hpp>
 #include <cudf/interop.hpp>
-#include <cudf/interop/detail/arrow.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
@@ -42,6 +41,7 @@
 
 #include <nanoarrow/nanoarrow.h>
 #include <nanoarrow/nanoarrow.hpp>
+#include <nanoarrow/nanoarrow_device.h>
 
 namespace cudf {
 namespace detail {
diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu
index 622a3aba4bb..e89ecedc218 100644
--- a/cpp/src/interop/to_arrow.cu
+++ b/cpp/src/interop/to_arrow.cu
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 
+#include "arrow_utilities.hpp"
 #include "detail/arrow_allocator.hpp"
 
 #include <cudf/column/column.hpp>
@@ -157,33 +158,17 @@ std::shared_ptr<arrow::Array> unsupported_decimals_to_arrow(column_view input,
                                                             arrow::MemoryPool* ar_mr,
                                                             rmm::cuda_stream_view stream)
 {
-  constexpr size_type BIT_WIDTH_RATIO = sizeof(__int128_t) / sizeof(DeviceType);
-
-  rmm::device_uvector<DeviceType> buf(input.size() * BIT_WIDTH_RATIO, stream);
-
-  auto count = thrust::make_counting_iterator(0);
-
-  thrust::for_each(
-    rmm::exec_policy(cudf::get_default_stream()),
-    count,
-    count + input.size(),
-    [in = input.begin<DeviceType>(), out = buf.data(), BIT_WIDTH_RATIO] __device__(auto in_idx) {
-      auto const out_idx = in_idx * BIT_WIDTH_RATIO;
-      // The lowest order bits are the value, the remainder
-      // simply matches the sign bit to satisfy the two's
-      // complement integer representation of negative numbers.
-      out[out_idx] = in[in_idx];
-#pragma unroll BIT_WIDTH_RATIO - 1
-      for (auto i = 1; i < BIT_WIDTH_RATIO; ++i) {
-        out[out_idx + i] = in[in_idx] < 0 ? -1 : 0;
-      }
-    });
+  auto buf =
+    detail::decimals_to_arrow<DeviceType>(input, stream, rmm::mr::get_current_device_resource());
 
-  auto const buf_size_in_bytes = buf.size() * sizeof(DeviceType);
+  auto const buf_size_in_bytes = buf->size();
   auto data_buffer             = allocate_arrow_buffer(buf_size_in_bytes, ar_mr);
 
-  CUDF_CUDA_TRY(cudaMemcpyAsync(
-    data_buffer->mutable_data(), buf.data(), buf_size_in_bytes, cudaMemcpyDefault, stream.value()));
+  CUDF_CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(),
+                                buf->data(),
+                                buf_size_in_bytes,
+                                cudaMemcpyDefault,
+                                stream.value()));
 
   auto type    = arrow::decimal(precision, -input.type().scale());
   auto mask    = fetch_mask_buffer(input, ar_mr, stream);
diff --git a/cpp/src/interop/to_arrow_device.cu b/cpp/src/interop/to_arrow_device.cu
index b9d3a59e647..2eb9b912054 100644
--- a/cpp/src/interop/to_arrow_device.cu
+++ b/cpp/src/interop/to_arrow_device.cu
@@ -24,7 +24,6 @@
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/interop.hpp>
-#include <cudf/interop/detail/arrow.hpp>
 #include <cudf/lists/lists_column_view.hpp>
 #include <cudf/null_mask.hpp>
 #include <cudf/strings/strings_column_view.hpp>
@@ -44,6 +43,7 @@
 
 #include <nanoarrow/nanoarrow.h>
 #include <nanoarrow/nanoarrow.hpp>
+#include <nanoarrow/nanoarrow_device.h>
 
 namespace cudf {
 namespace detail {
@@ -56,14 +56,6 @@ void device_buffer_finalize(ArrowBufferAllocator* allocator, uint8_t*, int64_t)
   delete unique_buffer;
 }
 
-int initialize_array(ArrowArray* arr, ArrowType storage_type, cudf::column_view column)
-{
-  NANOARROW_RETURN_NOT_OK(ArrowArrayInitFromType(arr, storage_type));
-  arr->length     = column.size();
-  arr->null_count = column.null_count();
-  return NANOARROW_OK;
-}
-
 template <typename>
 struct is_device_scalar : public std::false_type {};
 
@@ -99,21 +91,6 @@ int set_buffer(std::unique_ptr<T> device_buf, int64_t i, ArrowArray* out)
   return NANOARROW_OK;
 }
 
-ArrowType id_to_arrow_storage_type(cudf::type_id id)
-{
-  switch (id) {
-    case cudf::type_id::TIMESTAMP_SECONDS:
-    case cudf::type_id::TIMESTAMP_MILLISECONDS:
-    case cudf::type_id::TIMESTAMP_MICROSECONDS:
-    case cudf::type_id::TIMESTAMP_NANOSECONDS: return NANOARROW_TYPE_INT64;
-    case cudf::type_id::DURATION_SECONDS:
-    case cudf::type_id::DURATION_MILLISECONDS:
-    case cudf::type_id::DURATION_MICROSECONDS:
-    case cudf::type_id::DURATION_NANOSECONDS: return NANOARROW_TYPE_INT64;
-    default: return id_to_arrow_type(id);
-  }
-}
-
 struct dispatch_to_arrow_device {
   template <typename T, CUDF_ENABLE_IF(not is_rep_layout_compatible<T>())>
   int operator()(cudf::column&&, rmm::cuda_stream_view, rmm::device_async_resource_ref, ArrowArray*)
@@ -156,35 +133,15 @@ struct dispatch_to_arrow_device {
 };
 
 template <typename DeviceType>
-int decimals_to_arrow(cudf::column_view input,
-                      rmm::cuda_stream_view stream,
-                      rmm::device_async_resource_ref mr,
-                      ArrowArray* out)
+int construct_decimals(cudf::column_view input,
+                       rmm::cuda_stream_view stream,
+                       rmm::device_async_resource_ref mr,
+                       ArrowArray* out)
 {
   nanoarrow::UniqueArray tmp;
   NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_DECIMAL128, input));
 
-  constexpr size_type BIT_WIDTH_RATIO = sizeof(__int128_t) / sizeof(DeviceType);
-  auto buf =
-    std::make_unique<rmm::device_uvector<DeviceType>>(input.size() * BIT_WIDTH_RATIO, stream, mr);
-
-  auto count = thrust::counting_iterator<size_type>(0);
-
-  thrust::for_each(
-    rmm::exec_policy(stream, mr),
-    count,
-    count + input.size(),
-    [in = input.begin<DeviceType>(), out = buf->data(), BIT_WIDTH_RATIO] __device__(auto in_idx) {
-      auto const out_idx = in_idx * BIT_WIDTH_RATIO;
-      // the lowest order bits are the value, the remainder
-      // simply matches the sign bit to satisfy the two's
-      // complement integer representation of negative numbers.
-      out[out_idx] = in[in_idx];
-#pragma unroll BIT_WIDTH_RATIO - 1
-      for (auto i = 1; i < BIT_WIDTH_RATIO; ++i) {
-        out[out_idx + i] = in[in_idx] < 0 ? -1 : 0;
-      }
-    });
+  auto buf = detail::decimals_to_arrow<DeviceType>(input, stream, mr);
   NANOARROW_RETURN_NOT_OK(set_buffer(std::move(buf), fixed_width_data_buffer_idx, tmp.get()));
 
   ArrowArrayMove(tmp.get(), out);
@@ -198,7 +155,7 @@ int dispatch_to_arrow_device::operator()<numeric::decimal32>(cudf::column&& colu
                                                              ArrowArray* out)
 {
   using DeviceType = int32_t;
-  NANOARROW_RETURN_NOT_OK(decimals_to_arrow<DeviceType>(column.view(), stream, mr, out));
+  NANOARROW_RETURN_NOT_OK(construct_decimals<DeviceType>(column.view(), stream, mr, out));
   auto contents = column.release();
   NANOARROW_RETURN_NOT_OK(set_null_mask(contents, out));
   return NANOARROW_OK;
@@ -211,7 +168,7 @@ int dispatch_to_arrow_device::operator()<numeric::decimal64>(cudf::column&& colu
                                                              ArrowArray* out)
 {
   using DeviceType = int64_t;
-  NANOARROW_RETURN_NOT_OK(decimals_to_arrow<DeviceType>(column.view(), stream, mr, out));
+  NANOARROW_RETURN_NOT_OK(construct_decimals<DeviceType>(column.view(), stream, mr, out));
   auto contents = column.release();
   NANOARROW_RETURN_NOT_OK(set_null_mask(contents, out));
   return NANOARROW_OK;
@@ -256,8 +213,15 @@ int dispatch_to_arrow_device::operator()<cudf::string_view>(cudf::column&& colum
                                                             rmm::device_async_resource_ref mr,
                                                             ArrowArray* out)
 {
+  ArrowType nanoarrow_type = NANOARROW_TYPE_STRING;
+  if (column.num_children() > 0 &&
+      column.child(cudf::strings_column_view::offsets_column_index).type().id() ==
+        cudf::type_id::INT64) {
+    nanoarrow_type = NANOARROW_TYPE_LARGE_STRING;
+  }
+
   nanoarrow::UniqueArray tmp;
-  NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_STRING, column));
+  NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), nanoarrow_type, column));
 
   if (column.size() == 0) {
     // the scalar zero here is necessary because the spec for string arrays states
@@ -265,8 +229,14 @@ int dispatch_to_arrow_device::operator()<cudf::string_view>(cudf::column&& colum
     // the case of a 0 length string array, there should be exactly 1 value, zero,
     // in the offsets buffer. While some arrow implementations may accept a zero-sized
     // offsets buffer, best practices would be to allocate the buffer with the single value.
-    auto zero = std::make_unique<rmm::device_scalar<int32_t>>(0, stream, mr);
-    NANOARROW_RETURN_NOT_OK(set_buffer(std::move(zero), fixed_width_data_buffer_idx, tmp.get()));
+    if (nanoarrow_type == NANOARROW_TYPE_STRING) {
+      auto zero = std::make_unique<rmm::device_scalar<int32_t>>(0, stream, mr);
+      NANOARROW_RETURN_NOT_OK(set_buffer(std::move(zero), fixed_width_data_buffer_idx, tmp.get()));
+    } else {
+      auto zero = std::make_unique<rmm::device_scalar<int64_t>>(0, stream, mr);
+      NANOARROW_RETURN_NOT_OK(set_buffer(std::move(zero), fixed_width_data_buffer_idx, tmp.get()));
+    }
+
     ArrowArrayMove(tmp.get(), out);
     return NANOARROW_OK;
   }
@@ -436,7 +406,7 @@ template <>
 int dispatch_to_arrow_device_view::operator()<numeric::decimal32>(ArrowArray* out) const
 {
   using DeviceType = int32_t;
-  NANOARROW_RETURN_NOT_OK(decimals_to_arrow<DeviceType>(column, stream, mr, out));
+  NANOARROW_RETURN_NOT_OK(construct_decimals<DeviceType>(column, stream, mr, out));
   NANOARROW_RETURN_NOT_OK(set_null_mask(column, out));
   return NANOARROW_OK;
 }
@@ -445,7 +415,7 @@ template <>
 int dispatch_to_arrow_device_view::operator()<numeric::decimal64>(ArrowArray* out) const
 {
   using DeviceType = int64_t;
-  NANOARROW_RETURN_NOT_OK(decimals_to_arrow<DeviceType>(column, stream, mr, out));
+  NANOARROW_RETURN_NOT_OK(construct_decimals<DeviceType>(column, stream, mr, out));
   NANOARROW_RETURN_NOT_OK(set_null_mask(column, out));
   return NANOARROW_OK;
 }
@@ -481,13 +451,26 @@ int dispatch_to_arrow_device_view::operator()<bool>(ArrowArray* out) const
 template <>
 int dispatch_to_arrow_device_view::operator()<cudf::string_view>(ArrowArray* out) const
 {
+  ArrowType nanoarrow_type = NANOARROW_TYPE_STRING;
+  if (column.num_children() > 0 &&
+      column.child(cudf::strings_column_view::offsets_column_index).type().id() ==
+        cudf::type_id::INT64) {
+    nanoarrow_type = NANOARROW_TYPE_LARGE_STRING;
+  }
+
   nanoarrow::UniqueArray tmp;
-  NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_STRING, column));
+  NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), nanoarrow_type, column));
 
   if (column.size() == 0) {
     // https://github.com/rapidsai/cudf/pull/15047#discussion_r1546528552
-    auto zero = std::make_unique<rmm::device_scalar<int32_t>>(0, stream, mr);
-    NANOARROW_RETURN_NOT_OK(set_buffer(std::move(zero), fixed_width_data_buffer_idx, tmp.get()));
+    if (nanoarrow_type == NANOARROW_TYPE_LARGE_STRING) {
+      auto zero = std::make_unique<rmm::device_scalar<int64_t>>(0, stream, mr);
+      NANOARROW_RETURN_NOT_OK(set_buffer(std::move(zero), fixed_width_data_buffer_idx, tmp.get()));
+    } else {
+      auto zero = std::make_unique<rmm::device_scalar<int32_t>>(0, stream, mr);
+      NANOARROW_RETURN_NOT_OK(set_buffer(std::move(zero), fixed_width_data_buffer_idx, tmp.get()));
+    }
+
     ArrowArrayMove(tmp.get(), out);
     return NANOARROW_OK;
   }
diff --git a/cpp/src/interop/to_arrow_host.cu b/cpp/src/interop/to_arrow_host.cu
new file mode 100644
index 00000000000..c9e53ebaab7
--- /dev/null
+++ b/cpp/src/interop/to_arrow_host.cu
@@ -0,0 +1,428 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arrow_utilities.hpp"
+
+#include <cudf/column/column_view.hpp>
+#include <cudf/detail/interop.hpp>
+#include <cudf/detail/iterator.cuh>
+#include <cudf/detail/null_mask.hpp>
+#include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/dictionary/dictionary_column_view.hpp>
+#include <cudf/interop.hpp>
+#include <cudf/lists/lists_column_view.hpp>
+#include <cudf/null_mask.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf/structs/structs_column_view.hpp>
+#include <cudf/table/table_view.hpp>
+#include <cudf/types.hpp>
+#include <cudf/utilities/traits.hpp>
+#include <cudf/utilities/type_dispatcher.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_buffer.hpp>
+#include <rmm/exec_policy.hpp>
+#include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
+
+#include <thrust/for_each.h>
+#include <thrust/iterator/counting_iterator.h>
+
+#include <nanoarrow/nanoarrow.h>
+#include <nanoarrow/nanoarrow.hpp>
+#include <nanoarrow/nanoarrow_device.h>
+
+#include <iostream>
+
+namespace cudf {
+namespace detail {
+
+template <typename DeviceType>
+std::unique_ptr<rmm::device_buffer> decimals_to_arrow(cudf::column_view input,
+                                                      rmm::cuda_stream_view stream,
+                                                      rmm::device_async_resource_ref mr)
+{
+  constexpr size_type BIT_WIDTH_RATIO = sizeof(__int128_t) / sizeof(DeviceType);
+  auto buf = std::make_unique<rmm::device_buffer>(input.size() * sizeof(__int128_t), stream, mr);
+
+  auto count = thrust::counting_iterator<size_type>(0);
+  thrust::for_each(rmm::exec_policy(stream, mr),
+                   count,
+                   count + input.size(),
+                   [in  = input.begin<DeviceType>(),
+                    out = reinterpret_cast<DeviceType*>(buf->data()),
+                    BIT_WIDTH_RATIO] __device__(auto in_idx) {
+                     auto const out_idx = in_idx * BIT_WIDTH_RATIO;
+                     // the lowest order bits are the value, the remainder
+                     // simply matches the sign bit to satisfy the two's
+                     // complement integer representation of negative numbers.
+                     out[out_idx] = in[in_idx];
+#pragma unroll BIT_WIDTH_RATIO - 1
+                     for (auto i = 1; i < BIT_WIDTH_RATIO; ++i) {
+                       out[out_idx + i] = in[in_idx] < 0 ? -1 : 0;
+                     }
+                   });
+
+  return buf;
+}
+
+template std::unique_ptr<rmm::device_buffer> decimals_to_arrow<int32_t>(
+  cudf::column_view input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr);
+
+template std::unique_ptr<rmm::device_buffer> decimals_to_arrow<int64_t>(
+  cudf::column_view input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr);
+
+namespace {
+
+struct dispatch_to_arrow_host {
+  cudf::column_view column;
+  rmm::cuda_stream_view stream;
+  rmm::device_async_resource_ref mr;
+
+  int populate_validity_bitmap(ArrowBitmap* bitmap) const
+  {
+    if (!column.has_nulls()) { return NANOARROW_OK; }
+
+    NANOARROW_RETURN_NOT_OK(ArrowBitmapResize(bitmap, static_cast<int64_t>(column.size()), 0));
+    CUDF_CUDA_TRY(cudaMemcpyAsync(bitmap->buffer.data,
+                                  (column.offset() > 0)
+                                    ? cudf::detail::copy_bitmask(column, stream, mr).data()
+                                    : column.null_mask(),
+                                  bitmap->buffer.size_bytes,
+                                  cudaMemcpyDefault,
+                                  stream.value()));
+    return NANOARROW_OK;
+  }
+
+  template <typename T>
+  int populate_data_buffer(device_span<T const> input, ArrowBuffer* buffer) const
+  {
+    NANOARROW_RETURN_NOT_OK(ArrowBufferResize(buffer, input.size_bytes(), 1));
+    CUDF_CUDA_TRY(cudaMemcpyAsync(
+      buffer->data, input.data(), input.size_bytes(), cudaMemcpyDefault, stream.value()));
+    return NANOARROW_OK;
+  }
+
+  template <typename T,
+            CUDF_ENABLE_IF(!is_rep_layout_compatible<T>() && !cudf::is_fixed_point<T>())>
+  int operator()(ArrowArray*) const
+  {
+    CUDF_FAIL("Unsupported type for to_arrow_host", cudf::data_type_error);
+  }
+
+  template <typename T,
+            CUDF_ENABLE_IF(is_rep_layout_compatible<T>() || std::is_same_v<T, numeric::decimal128>)>
+  int operator()(ArrowArray* out) const
+  {
+    nanoarrow::UniqueArray tmp;
+
+    auto const storage_type = id_to_arrow_storage_type(column.type().id());
+    NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), storage_type, column));
+
+    NANOARROW_RETURN_NOT_OK(populate_validity_bitmap(ArrowArrayValidityBitmap(tmp.get())));
+    using DataType = std::conditional_t<std::is_same_v<T, numeric::decimal128>, __int128_t, T>;
+    NANOARROW_RETURN_NOT_OK(
+      populate_data_buffer(device_span<DataType const>(column.data<DataType>(), column.size()),
+                           ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx)));
+
+    ArrowArrayMove(tmp.get(), out);
+    return NANOARROW_OK;
+  }
+
+  // convert decimal types from libcudf to arrow where those types are not directly
+  // supported by Arrow. These types must be fit into 128 bits, the smallest
+  // decimal resolution supported by Arrow
+  template <typename T,
+            CUDF_ENABLE_IF(!is_rep_layout_compatible<T>() &&
+                           (std::is_same_v<T, numeric::decimal32> ||
+                            std::is_same_v<T, numeric::decimal64>))>
+  int operator()(ArrowArray* out) const
+  {
+    using DeviceType = std::conditional_t<std::is_same_v<T, numeric::decimal32>, int32_t, int64_t>;
+    nanoarrow::UniqueArray tmp;
+    NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_DECIMAL128, column));
+
+    NANOARROW_RETURN_NOT_OK(populate_validity_bitmap(ArrowArrayValidityBitmap(tmp.get())));
+    auto buf = detail::decimals_to_arrow<DeviceType>(column, stream, mr);
+    NANOARROW_RETURN_NOT_OK(
+      populate_data_buffer(device_span<__int128_t const>(
+                             reinterpret_cast<const __int128_t*>(buf->data()), column.size()),
+                           ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx)));
+
+    ArrowArrayMove(tmp.get(), out);
+    return NANOARROW_OK;
+  }
+};
+
+int get_column(cudf::column_view column,
+               rmm::cuda_stream_view stream,
+               rmm::device_async_resource_ref mr,
+               ArrowArray* out);
+
+template <>
+int dispatch_to_arrow_host::operator()<bool>(ArrowArray* out) const
+{
+  nanoarrow::UniqueArray tmp;
+  NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_BOOL, column));
+
+  NANOARROW_RETURN_NOT_OK(populate_validity_bitmap(ArrowArrayValidityBitmap(tmp.get())));
+  auto bitmask = bools_to_mask(column, stream, mr);
+  NANOARROW_RETURN_NOT_OK(populate_data_buffer(
+    device_span<uint8_t const>(reinterpret_cast<const uint8_t*>(bitmask.first->data()),
+                               bitmask.first->size()),
+    ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx)));
+
+  ArrowArrayMove(tmp.get(), out);
+  return NANOARROW_OK;
+}
+
+template <>
+int dispatch_to_arrow_host::operator()<cudf::string_view>(ArrowArray* out) const
+{
+  ArrowType nanoarrow_type = NANOARROW_TYPE_STRING;
+  if (column.num_children() > 0 &&
+      column.child(cudf::strings_column_view::offsets_column_index).type().id() ==
+        cudf::type_id::INT64) {
+    nanoarrow_type = NANOARROW_TYPE_LARGE_STRING;
+  }
+
+  nanoarrow::UniqueArray tmp;
+  NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), nanoarrow_type, column));
+
+  if (column.size() == 0) {
+    // initialize the offset buffer with a single zero by convention
+    if (nanoarrow_type == NANOARROW_TYPE_LARGE_STRING) {
+      NANOARROW_RETURN_NOT_OK(
+        ArrowBufferAppendInt64(ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx), 0));
+    } else {
+      NANOARROW_RETURN_NOT_OK(
+        ArrowBufferAppendInt32(ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx), 0));
+    }
+
+    ArrowArrayMove(tmp.get(), out);
+    return NANOARROW_OK;
+  }
+
+  NANOARROW_RETURN_NOT_OK(populate_validity_bitmap(ArrowArrayValidityBitmap(tmp.get())));
+
+  auto const scv     = cudf::strings_column_view(column);
+  auto const offsets = scv.offsets();
+  if (offsets.type().id() == cudf::type_id::INT64) {
+    NANOARROW_RETURN_NOT_OK(populate_data_buffer(
+      device_span<int64_t const>(offsets.data<int64_t>() + scv.offset(), scv.size() + 1),
+      ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx)));
+  } else {
+    NANOARROW_RETURN_NOT_OK(populate_data_buffer(
+      device_span<int32_t const>(offsets.data<int32_t>() + scv.offset(), scv.size() + 1),
+      ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx)));
+  }
+
+  NANOARROW_RETURN_NOT_OK(
+    populate_data_buffer(device_span<char const>(scv.chars_begin(stream), scv.chars_size(stream)),
+                         ArrowArrayBuffer(tmp.get(), 2)));
+
+  ArrowArrayMove(tmp.get(), out);
+  return NANOARROW_OK;
+}
+
+template <>
+int dispatch_to_arrow_host::operator()<cudf::list_view>(ArrowArray* out) const
+{
+  nanoarrow::UniqueArray tmp;
+  NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_LIST, column));
+  NANOARROW_RETURN_NOT_OK(ArrowArrayAllocateChildren(tmp.get(), 1));
+
+  NANOARROW_RETURN_NOT_OK(populate_validity_bitmap(ArrowArrayValidityBitmap(tmp.get())));
+  auto const lcv = cudf::lists_column_view(column);
+
+  if (column.size() == 0) {
+    // initialize the offsets buffer with a single zero by convention for 0 length
+    NANOARROW_RETURN_NOT_OK(
+      ArrowBufferAppendInt32(ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx), 0));
+  } else {
+    NANOARROW_RETURN_NOT_OK(
+      populate_data_buffer(device_span<int32_t const>(lcv.offsets_begin(), (column.size() + 1)),
+                           ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx)));
+  }
+
+  NANOARROW_RETURN_NOT_OK(get_column(lcv.child(), stream, mr, tmp->children[0]));
+
+  ArrowArrayMove(tmp.get(), out);
+  return NANOARROW_OK;
+}
+
+template <>
+int dispatch_to_arrow_host::operator()<cudf::dictionary32>(ArrowArray* out) const
+{
+  nanoarrow::UniqueArray tmp;
+  NANOARROW_RETURN_NOT_OK(initialize_array(
+    tmp.get(),
+    id_to_arrow_type(column.child(cudf::dictionary_column_view::indices_column_index).type().id()),
+    column));
+  NANOARROW_RETURN_NOT_OK(ArrowArrayAllocateDictionary(tmp.get()));
+
+  NANOARROW_RETURN_NOT_OK(populate_validity_bitmap(ArrowArrayValidityBitmap(tmp.get())));
+  auto dcv          = cudf::dictionary_column_view(column);
+  auto dict_indices = dcv.get_indices_annotated();
+  switch (dict_indices.type().id()) {
+    case type_id::INT8:
+    case type_id::UINT8:
+      NANOARROW_RETURN_NOT_OK(populate_data_buffer(
+        device_span<int8_t const>(dict_indices.data<int8_t>(), dict_indices.size()),
+        ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx)));
+      break;
+    case type_id::INT16:
+    case type_id::UINT16:
+      NANOARROW_RETURN_NOT_OK(populate_data_buffer(
+        device_span<int16_t const>(dict_indices.data<int16_t>(), dict_indices.size()),
+        ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx)));
+      break;
+    case type_id::INT32:
+    case type_id::UINT32:
+      NANOARROW_RETURN_NOT_OK(populate_data_buffer(
+        device_span<int32_t const>(dict_indices.data<int32_t>(), dict_indices.size()),
+        ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx)));
+      break;
+    case type_id::INT64:
+    case type_id::UINT64:
+      NANOARROW_RETURN_NOT_OK(populate_data_buffer(
+        device_span<int64_t const>(dict_indices.data<int64_t>(), dict_indices.size()),
+        ArrowArrayBuffer(tmp.get(), fixed_width_data_buffer_idx)));
+      break;
+    default: CUDF_FAIL("unsupported type for dictionary indices");
+  }
+
+  NANOARROW_RETURN_NOT_OK(get_column(dcv.keys(), stream, mr, tmp->dictionary));
+
+  ArrowArrayMove(tmp.get(), out);
+  return NANOARROW_OK;
+}
+
+template <>
+int dispatch_to_arrow_host::operator()<cudf::struct_view>(ArrowArray* out) const
+{
+  nanoarrow::UniqueArray tmp;
+
+  NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_STRUCT, column));
+  NANOARROW_RETURN_NOT_OK(ArrowArrayAllocateChildren(tmp.get(), column.num_children()));
+  NANOARROW_RETURN_NOT_OK(populate_validity_bitmap(ArrowArrayValidityBitmap(tmp.get())));
+
+  auto const scv = cudf::structs_column_view(column);
+
+  for (size_t i = 0; i < size_t(tmp->n_children); ++i) {
+    ArrowArray* child_ptr = tmp->children[i];
+    auto const child      = scv.get_sliced_child(i, stream);
+    NANOARROW_RETURN_NOT_OK(get_column(child, stream, mr, child_ptr));
+  }
+
+  ArrowArrayMove(tmp.get(), out);
+  return NANOARROW_OK;
+}
+
+int get_column(cudf::column_view column,
+               rmm::cuda_stream_view stream,
+               rmm::device_async_resource_ref mr,
+               ArrowArray* out)
+{
+  return column.type().id() != type_id::EMPTY
+           ? type_dispatcher(column.type(), dispatch_to_arrow_host{column, stream, mr}, out)
+           : initialize_array(out, NANOARROW_TYPE_NA, column);
+}
+
+unique_device_array_t create_device_array(nanoarrow::UniqueArray&& out)
+{
+  ArrowError err;
+  if (ArrowArrayFinishBuildingDefault(out.get(), &err) != NANOARROW_OK) {
+    std::cerr << err.message << std::endl;
+    CUDF_FAIL("failed to build");
+  }
+
+  unique_device_array_t result(new ArrowDeviceArray, [](ArrowDeviceArray* arr) {
+    if (arr->array.release != nullptr) { ArrowArrayRelease(&arr->array); }
+    delete arr;
+  });
+
+  result->device_id   = -1;
+  result->device_type = ARROW_DEVICE_CPU;
+  result->sync_event  = nullptr;
+  ArrowArrayMove(out.get(), &result->array);
+  return result;
+}
+
+}  // namespace
+
+unique_device_array_t to_arrow_host(cudf::table_view const& table,
+                                    rmm::cuda_stream_view stream,
+                                    rmm::device_async_resource_ref mr)
+{
+  nanoarrow::UniqueArray tmp;
+  NANOARROW_THROW_NOT_OK(ArrowArrayInitFromType(tmp.get(), NANOARROW_TYPE_STRUCT));
+
+  NANOARROW_THROW_NOT_OK(ArrowArrayAllocateChildren(tmp.get(), table.num_columns()));
+  tmp->length     = table.num_rows();
+  tmp->null_count = 0;
+
+  for (cudf::size_type i = 0; i < table.num_columns(); ++i) {
+    auto child = tmp->children[i];
+    auto col   = table.column(i);
+    NANOARROW_THROW_NOT_OK(
+      cudf::type_dispatcher(col.type(), detail::dispatch_to_arrow_host{col, stream, mr}, child));
+  }
+
+  // wait for all the stream operations to complete before we return.
+  // this ensures that the host memory that we're returning will be populated
+  // before we return from this function.
+  stream.synchronize();
+
+  return create_device_array(std::move(tmp));
+}
+
+unique_device_array_t to_arrow_host(cudf::column_view const& col,
+                                    rmm::cuda_stream_view stream,
+                                    rmm::device_async_resource_ref mr)
+{
+  nanoarrow::UniqueArray tmp;
+
+  NANOARROW_THROW_NOT_OK(
+    cudf::type_dispatcher(col.type(), detail::dispatch_to_arrow_host{col, stream, mr}, tmp.get()));
+
+  // wait for all the stream operations to complete before we return.
+  // this ensures that the host memory that we're returning will be populated
+  // before we return from this function.
+  stream.synchronize();
+
+  return create_device_array(std::move(tmp));
+}
+
+}  // namespace detail
+
+unique_device_array_t to_arrow_host(cudf::column_view const& col,
+                                    rmm::cuda_stream_view stream,
+                                    rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::to_arrow_host(col, stream, mr);
+}
+
+unique_device_array_t to_arrow_host(cudf::table_view const& table,
+                                    rmm::cuda_stream_view stream,
+                                    rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::to_arrow_host(table, stream, mr);
+}
+
+}  // namespace cudf
diff --git a/cpp/src/interop/to_arrow_schema.cpp b/cpp/src/interop/to_arrow_schema.cpp
index 19915464236..b98ca8a7bed 100644
--- a/cpp/src/interop/to_arrow_schema.cpp
+++ b/cpp/src/interop/to_arrow_schema.cpp
@@ -20,7 +20,6 @@
 #include <cudf/detail/interop.hpp>
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/interop.hpp>
-#include <cudf/interop/detail/arrow.hpp>
 #include <cudf/lists/lists_column_view.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/table/table_view.hpp>
@@ -120,7 +119,11 @@ int dispatch_to_arrow_type::operator()<cudf::string_view>(column_view input,
                                                           column_metadata const&,
                                                           ArrowSchema* out)
 {
-  return ArrowSchemaSetType(out, NANOARROW_TYPE_STRING);
+  return ((input.num_children() == 0 ||
+           input.child(cudf::strings_column_view::offsets_column_index).type().id() ==
+             type_id::INT32))
+           ? ArrowSchemaSetType(out, NANOARROW_TYPE_STRING)
+           : ArrowSchemaSetType(out, NANOARROW_TYPE_LARGE_STRING);
 }
 
 // these forward declarations are needed due to the recursive calls to them
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 05e9759632f..88187623930 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -270,6 +270,7 @@ ConfigureTest(
   INTEROP_TEST
   interop/to_arrow_device_test.cpp
   interop/to_arrow_test.cpp
+  interop/to_arrow_host_test.cpp
   interop/from_arrow_test.cpp
   interop/from_arrow_device_test.cpp
   interop/from_arrow_host_test.cpp
diff --git a/cpp/tests/interop/nanoarrow_utils.hpp b/cpp/tests/interop/nanoarrow_utils.hpp
index 4147728b2a6..a961f73d955 100644
--- a/cpp/tests/interop/nanoarrow_utils.hpp
+++ b/cpp/tests/interop/nanoarrow_utils.hpp
@@ -18,7 +18,6 @@
 
 #include <cudf/column/column_view.hpp>
 #include <cudf/dictionary/dictionary_column_view.hpp>
-#include <cudf/interop/detail/arrow.hpp>
 #include <cudf/lists/lists_column_view.hpp>
 #include <cudf/null_mask.hpp>
 #include <cudf/strings/strings_column_view.hpp>
@@ -29,6 +28,7 @@
 #include <cudf/wrappers/durations.hpp>
 
 #include <nanoarrow/nanoarrow.hpp>
+#include <nanoarrow/nanoarrow_device.h>
 
 struct generated_test_data {
   generated_test_data(cudf::size_type length)
@@ -211,6 +211,7 @@ DEFINE_NANOARROW_STORAGE(cudf::duration_us, INT64);
 DEFINE_NANOARROW_STORAGE(cudf::duration_ns, INT64);
 DEFINE_NANOARROW_STORAGE(uint8_t, UINT8);
 DEFINE_NANOARROW_STORAGE(int32_t, INT32);
+DEFINE_NANOARROW_STORAGE(__int128_t, DECIMAL128);
 
 #undef DEFINE_NANOARROW_STORAGE
 
@@ -255,8 +256,7 @@ std::enable_if_t<std::is_same_v<T, bool>, nanoarrow::UniqueArray> get_nanoarrow_
     ArrowBitmap out;
     ArrowBitmapInit(&out);
     NANOARROW_THROW_NOT_OK(ArrowBitmapResize(&out, b.size(), 1));
-    out.buffer.size_bytes = (b.size() >> 3) + ((b.size() & 7) != 0);
-    out.size_bits         = b.size();
+    std::memset(out.buffer.data, 0, out.buffer.size_bytes);
 
     for (size_t i = 0; i < b.size(); ++i) {
       ArrowBitSetTo(out.buffer.data, i, static_cast<uint8_t>(b[i]));
@@ -296,6 +296,7 @@ std::enable_if_t<std::is_same_v<T, cudf::string_view>, nanoarrow::UniqueArray> g
 {
   nanoarrow::UniqueArray tmp;
   NANOARROW_THROW_NOT_OK(ArrowArrayInitFromType(tmp.get(), NANOARROW_TYPE_STRING));
+  NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(ArrowArrayValidityBitmap(tmp.get()), mask.size()));
   NANOARROW_THROW_NOT_OK(ArrowArrayStartAppending(tmp.get()));
   NANOARROW_THROW_NOT_OK(ArrowArrayReserve(tmp.get(), data.size()));
 
@@ -378,3 +379,5 @@ get_nanoarrow_cudf_table(cudf::size_type length);
 
 std::tuple<std::unique_ptr<cudf::table>, nanoarrow::UniqueSchema, nanoarrow::UniqueArray>
 get_nanoarrow_host_tables(cudf::size_type length);
+
+void slice_host_nanoarrow(ArrowArray* arr, int64_t start, int64_t end);
diff --git a/cpp/tests/interop/to_arrow_device_test.cpp b/cpp/tests/interop/to_arrow_device_test.cpp
index 8903f09b82b..77da4039103 100644
--- a/cpp/tests/interop/to_arrow_device_test.cpp
+++ b/cpp/tests/interop/to_arrow_device_test.cpp
@@ -31,7 +31,6 @@
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/dictionary/encode.hpp>
 #include <cudf/interop.hpp>
-#include <cudf/interop/detail/arrow.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/types.hpp>
diff --git a/cpp/tests/interop/to_arrow_host_test.cpp b/cpp/tests/interop/to_arrow_host_test.cpp
new file mode 100644
index 00000000000..fc0ed6c9352
--- /dev/null
+++ b/cpp/tests/interop/to_arrow_host_test.cpp
@@ -0,0 +1,1117 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "nanoarrow_utils.hpp"
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_utilities.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/table_utilities.hpp>
+#include <cudf_test/testing_main.hpp>
+#include <cudf_test/type_lists.hpp>
+
+#include <cudf/column/column.hpp>
+#include <cudf/column/column_view.hpp>
+#include <cudf/copying.hpp>
+#include <cudf/detail/copy.hpp>
+#include <cudf/detail/interop.hpp>
+#include <cudf/detail/iterator.cuh>
+#include <cudf/dictionary/dictionary_column_view.hpp>
+#include <cudf/dictionary/encode.hpp>
+#include <cudf/interop.hpp>
+#include <cudf/table/table.hpp>
+#include <cudf/table/table_view.hpp>
+#include <cudf/types.hpp>
+
+#include <thrust/iterator/counting_iterator.h>
+
+#include <numeric>
+
+using vector_of_columns = std::vector<std::unique_ptr<cudf::column>>;
+
+struct BaseToArrowHostFixture : public cudf::test::BaseFixture {
+  template <typename T>
+  std::enable_if_t<cudf::is_fixed_width<T>() and !std::is_same_v<T, bool>, void> compare_subset(
+    ArrowArrayView const* expected,
+    int64_t start_offset_expected,
+    ArrowArrayView const* actual,
+    int64_t start_offset_actual,
+    int64_t length)
+  {
+    for (int64_t i = 0; i < length; ++i) {
+      const bool is_null = ArrowArrayViewIsNull(expected, start_offset_expected + i);
+      EXPECT_EQ(is_null, ArrowArrayViewIsNull(actual, start_offset_actual + i));
+      if (is_null) continue;
+
+      const auto expected_val = ArrowArrayViewGetIntUnsafe(expected, start_offset_expected + i);
+      const auto actual_val   = ArrowArrayViewGetIntUnsafe(actual, start_offset_actual + i);
+
+      EXPECT_EQ(expected_val, actual_val);
+    }
+  }
+
+  template <typename T>
+  std::enable_if_t<std::is_same_v<T, cudf::string_view>, void> compare_subset(
+    ArrowArrayView const* expected,
+    int64_t start_offset_expected,
+    ArrowArrayView const* actual,
+    int64_t start_offset_actual,
+    int64_t length)
+  {
+    for (int64_t i = 0; i < length; ++i) {
+      const bool is_null = ArrowArrayViewIsNull(expected, start_offset_expected + i);
+      EXPECT_EQ(is_null, ArrowArrayViewIsNull(actual, start_offset_actual + i));
+      if (is_null) continue;
+
+      const auto expected_view = ArrowArrayViewGetBytesUnsafe(expected, start_offset_expected + i);
+      const auto actual_view   = ArrowArrayViewGetBytesUnsafe(actual, start_offset_actual + i);
+
+      EXPECT_EQ(expected_view.size_bytes, actual_view.size_bytes);
+      EXPECT_TRUE(
+        0 == std::memcmp(expected_view.data.data, actual_view.data.data, expected_view.size_bytes));
+    }
+  }
+
+  void compare_child_subset(ArrowArrayView const* expected,
+                            int64_t exp_start_offset,
+                            ArrowArrayView const* actual,
+                            int64_t act_start_offset,
+                            int64_t length)
+  {
+    EXPECT_EQ(expected->storage_type, actual->storage_type);
+    EXPECT_EQ(expected->n_children, actual->n_children);
+
+    switch (expected->storage_type) {
+      case NANOARROW_TYPE_LIST:
+        for (int64_t i = 0; i < length; ++i) {
+          const auto expected_start = exp_start_offset + i;
+          const auto actual_start   = act_start_offset + i;
+
+          // ArrowArrayViewIsNull accounts for the array offset, so we can properly
+          // compare the validity of indexes
+          const bool is_null = ArrowArrayViewIsNull(expected, expected_start);
+          EXPECT_EQ(is_null, ArrowArrayViewIsNull(actual, actual_start));
+          if (is_null) continue;
+
+          // ArrowArrayViewListChildOffset does not account for array offset, so we need
+          // to add the offset to the index in order to get the correct offset into the list
+          const int64_t start_offset_expected =
+            ArrowArrayViewListChildOffset(expected, expected->offset + expected_start);
+          const int64_t start_offset_actual =
+            ArrowArrayViewListChildOffset(actual, actual->offset + actual_start);
+
+          const int64_t end_offset_expected =
+            ArrowArrayViewListChildOffset(expected, expected->offset + expected_start + 1);
+          const int64_t end_offset_actual =
+            ArrowArrayViewListChildOffset(actual, actual->offset + actual_start + 1);
+
+          // verify the list lengths are the same
+          EXPECT_EQ(end_offset_expected - start_offset_expected,
+                    end_offset_actual - start_offset_actual);
+          // compare the list values
+          compare_child_subset(expected->children[0],
+                               start_offset_expected,
+                               actual->children[0],
+                               start_offset_actual,
+                               end_offset_expected - start_offset_expected);
+        }
+        break;
+      case NANOARROW_TYPE_STRUCT:
+        for (int64_t i = 0; i < length; ++i) {
+          SCOPED_TRACE("idx: " + std::to_string(i));
+          const auto expected_start = exp_start_offset + i;
+          const auto actual_start   = act_start_offset + i;
+
+          const bool is_null = ArrowArrayViewIsNull(expected, expected_start);
+          EXPECT_EQ(is_null, ArrowArrayViewIsNull(actual, actual_start));
+          if (is_null) continue;
+
+          for (int64_t child = 0; child < expected->n_children; ++child) {
+            SCOPED_TRACE("child: " + std::to_string(child));
+            compare_child_subset(expected->children[child],
+                                 expected_start + expected->offset,
+                                 actual->children[child],
+                                 actual_start + actual->offset,
+                                 1);
+          }
+        }
+        break;
+      case NANOARROW_TYPE_STRING:
+      case NANOARROW_TYPE_LARGE_STRING:
+      case NANOARROW_TYPE_BINARY:
+      case NANOARROW_TYPE_LARGE_BINARY:
+        compare_subset<cudf::string_view>(
+          expected, exp_start_offset, actual, act_start_offset, length);
+        break;
+      default:
+        compare_subset<int64_t>(expected, exp_start_offset, actual, act_start_offset, length);
+        break;
+    }
+  }
+
+  void compare_arrays(ArrowArrayView const* expected, ArrowArrayView const* actual)
+  {
+    EXPECT_EQ(expected->length, actual->length);
+    EXPECT_EQ(expected->null_count, actual->null_count);
+    EXPECT_EQ(expected->offset, actual->offset);
+    EXPECT_EQ(expected->n_children, actual->n_children);
+    EXPECT_EQ(expected->storage_type, actual->storage_type);
+
+    // cudf automatically pushes down nulls and purges non-empty, non-zero nulls
+    // from the children columns. So while we can memcmp the buffers for top
+    // level arrays, we need to do an "equivalence" comparison for nested
+    // arrays (lists and structs) by checking each index for null and skipping
+    // comparisons for children if null.
+    switch (expected->storage_type) {
+      case NANOARROW_TYPE_STRUCT:
+        // if we're a struct with no children, then we just skip
+        // attempting to compare the children
+        if (expected->n_children == 0) {
+          EXPECT_EQ(nullptr, actual->children);
+          break;
+        }
+        // otherwise we can fallthrough and do the same thing we do for lists
+      case NANOARROW_TYPE_LIST:
+        compare_child_subset(expected, 0, actual, 0, expected->length);
+        break;
+      default:
+        for (int64_t i = 0; i < actual->array->n_buffers; ++i) {
+          SCOPED_TRACE("buffer " + std::to_string(i));
+          auto expected_buf = expected->buffer_views[i];
+          auto actual_buf   = actual->buffer_views[i];
+
+          EXPECT_TRUE(0 == std::memcmp(expected_buf.data.data,
+                                       actual_buf.data.data,
+                                       expected_buf.size_bytes));
+        }
+    }
+
+    if (expected->dictionary != nullptr) {
+      EXPECT_NE(nullptr, actual->dictionary);
+      SCOPED_TRACE("dictionary");
+      compare_arrays(expected->dictionary, actual->dictionary);
+    } else {
+      EXPECT_EQ(nullptr, actual->dictionary);
+    }
+  }
+};
+
+struct ToArrowHostDeviceTest : public BaseToArrowHostFixture {};
+template <typename T>
+struct ToArrowHostDeviceTestDurationsTest : public BaseToArrowHostFixture {};
+
+TYPED_TEST_SUITE(ToArrowHostDeviceTestDurationsTest, cudf::test::DurationTypes);
+
+TEST_F(ToArrowHostDeviceTest, EmptyTable)
+{
+  auto [tbl, schema, arr] = get_nanoarrow_host_tables(0);
+
+  auto got_arrow_host = cudf::to_arrow_host(tbl->view());
+  EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type);
+  EXPECT_EQ(-1, got_arrow_host->device_id);
+  EXPECT_EQ(nullptr, got_arrow_host->sync_event);
+
+  ArrowArrayView expected, actual;
+  NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, schema.get(), nullptr));
+  NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&expected, arr.get(), nullptr));
+
+  NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, schema.get(), nullptr));
+  NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr));
+  compare_arrays(&expected, &actual);
+
+  ArrowArrayViewReset(&expected);
+  ArrowArrayViewReset(&actual);
+}
+
+TEST_F(ToArrowHostDeviceTest, DateTimeTable)
+{
+  auto data = std::initializer_list<int64_t>{1, 2, 3, 4, 5, 6};
+  auto col =
+    cudf::test::fixed_width_column_wrapper<cudf::timestamp_ms, cudf::timestamp_ms::rep>(data);
+  cudf::table_view input_view({col});
+
+  nanoarrow::UniqueSchema expected_schema;
+  ArrowSchemaInit(expected_schema.get());
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1));
+  ArrowSchemaInit(expected_schema->children[0]);
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDateTime(
+    expected_schema->children[0], NANOARROW_TYPE_TIMESTAMP, NANOARROW_TIME_UNIT_MILLI, nullptr));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a"));
+  expected_schema->children[0]->flags = 0;
+
+  auto got_arrow_host = cudf::to_arrow_host(input_view);
+  EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type);
+  EXPECT_EQ(-1, got_arrow_host->device_id);
+  EXPECT_EQ(nullptr, got_arrow_host->sync_event);
+
+  ArrowArrayView expected, actual;
+  NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, expected_schema.get(), nullptr));
+  expected.length              = data.size();
+  expected.children[0]->length = data.size();
+  ArrowArrayViewSetLength(expected.children[0], data.size());
+  expected.children[0]->buffer_views[0].data.data  = nullptr;
+  expected.children[0]->buffer_views[0].size_bytes = 0;
+  expected.children[0]->buffer_views[1].data.data  = data.begin();
+
+  NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, expected_schema.get(), nullptr));
+  NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr));
+  compare_arrays(&expected, &actual);
+  ArrowArrayViewReset(&actual);
+
+  got_arrow_host = cudf::to_arrow_host(input_view.column(0));
+  NANOARROW_THROW_NOT_OK(
+    ArrowArrayViewInitFromSchema(&actual, expected_schema->children[0], nullptr));
+  NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr));
+  BaseToArrowHostFixture::compare_arrays(expected.children[0], &actual);
+  ArrowArrayViewReset(&actual);
+
+  ArrowArrayViewReset(&expected);
+  ArrowArrayViewReset(&actual);
+}
+
+TYPED_TEST(ToArrowHostDeviceTestDurationsTest, DurationTable)
+{
+  using T = TypeParam;
+
+  if (cudf::type_to_id<TypeParam>() == cudf::type_id::DURATION_DAYS) { return; }
+
+  auto data = {T{1}, T{2}, T{3}, T{4}, T{5}, T{6}};
+  auto col  = cudf::test::fixed_width_column_wrapper<T>(data);
+
+  cudf::table_view input_view({col});
+
+  nanoarrow::UniqueSchema expected_schema;
+  ArrowSchemaInit(expected_schema.get());
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1));
+
+  ArrowSchemaInit(expected_schema->children[0]);
+  const ArrowTimeUnit arrow_unit = [&] {
+    switch (cudf::type_to_id<TypeParam>()) {
+      case cudf::type_id::DURATION_SECONDS: return NANOARROW_TIME_UNIT_SECOND;
+      case cudf::type_id::DURATION_MILLISECONDS: return NANOARROW_TIME_UNIT_MILLI;
+      case cudf::type_id::DURATION_MICROSECONDS: return NANOARROW_TIME_UNIT_MICRO;
+      case cudf::type_id::DURATION_NANOSECONDS: return NANOARROW_TIME_UNIT_NANO;
+      default: CUDF_FAIL("Unsupported duration unit in arrow");
+    }
+  }();
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDateTime(
+    expected_schema->children[0], NANOARROW_TYPE_DURATION, arrow_unit, nullptr));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a"));
+  expected_schema->children[0]->flags = 0;
+
+  auto got_arrow_host = cudf::to_arrow_host(input_view);
+  EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type);
+  EXPECT_EQ(-1, got_arrow_host->device_id);
+  EXPECT_EQ(nullptr, got_arrow_host->sync_event);
+
+  ArrowArrayView expected, actual;
+  NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, expected_schema.get(), nullptr));
+
+  expected.length              = data.size();
+  expected.children[0]->length = data.size();
+  ArrowArrayViewSetLength(expected.children[0], data.size());
+  expected.children[0]->buffer_views[0].data.data  = nullptr;
+  expected.children[0]->buffer_views[0].size_bytes = 0;
+  expected.children[0]->buffer_views[1].data.data  = data.begin();
+
+  NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, expected_schema.get(), nullptr));
+  NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr));
+  BaseToArrowHostFixture::compare_arrays(&expected, &actual);
+  ArrowArrayViewReset(&actual);
+
+  got_arrow_host = cudf::to_arrow_host(input_view.column(0));
+  NANOARROW_THROW_NOT_OK(
+    ArrowArrayViewInitFromSchema(&actual, expected_schema->children[0], nullptr));
+  NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr));
+  BaseToArrowHostFixture::compare_arrays(expected.children[0], &actual);
+  ArrowArrayViewReset(&actual);
+
+  ArrowArrayViewReset(&expected);
+}
+
+TEST_F(ToArrowHostDeviceTest, NestedList)
+{
+  auto valids =
+    cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 3 != 0; });
+  auto col = cudf::test::lists_column_wrapper<int64_t>(
+    {{{{{1, 2}, valids}, {{3, 4}, valids}, {5}}, {{6}, {{7, 8, 9}, valids}}}, valids});
+  cudf::table_view input_view({col});
+
+  nanoarrow::UniqueSchema expected_schema;
+  ArrowSchemaInit(expected_schema.get());
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1));
+
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaInitFromType(expected_schema->children[0], NANOARROW_TYPE_LIST));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a"));
+  expected_schema->children[0]->flags = ARROW_FLAG_NULLABLE;
+
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaInitFromType(expected_schema->children[0]->children[0], NANOARROW_TYPE_LIST));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0]->children[0], "element"));
+  expected_schema->children[0]->children[0]->flags = 0;
+
+  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(
+    expected_schema->children[0]->children[0]->children[0], NANOARROW_TYPE_INT64));
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaSetName(expected_schema->children[0]->children[0]->children[0], "element"));
+  expected_schema->children[0]->children[0]->children[0]->flags = ARROW_FLAG_NULLABLE;
+
+  auto got_arrow_host = cudf::to_arrow_host(input_view);
+  EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type);
+  EXPECT_EQ(-1, got_arrow_host->device_id);
+  EXPECT_EQ(nullptr, got_arrow_host->sync_event);
+
+  auto list_arr = get_nanoarrow_list_array<int64_t>({6, 7, 8, 9}, {0, 1, 4}, {1, 0, 1, 1});
+  std::vector<int32_t> offset{0, 0, 2};
+
+  ArrowBitmap mask;
+  ArrowBitmapInit(&mask);
+  NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(&mask, 2));
+  NANOARROW_THROW_NOT_OK(ArrowBitmapAppend(&mask, 0, 1));
+  NANOARROW_THROW_NOT_OK(ArrowBitmapAppend(&mask, 1, 1));
+
+  nanoarrow::UniqueArray expected_arr;
+  EXPECT_EQ(NANOARROW_OK,
+            ArrowArrayInitFromSchema(expected_arr.get(), expected_schema.get(), nullptr));
+  expected_arr->length     = input_view.num_rows();
+  expected_arr->null_count = 0;
+
+  ArrowArraySetValidityBitmap(expected_arr->children[0], &mask);
+  expected_arr->children[0]->length     = input_view.num_rows();
+  expected_arr->children[0]->null_count = 1;
+  auto offset_buf                       = ArrowArrayBuffer(expected_arr->children[0], 1);
+  EXPECT_EQ(
+    NANOARROW_OK,
+    ArrowBufferAppend(
+      offset_buf, reinterpret_cast<void const*>(offset.data()), offset.size() * sizeof(int32_t)));
+  list_arr.move(expected_arr->children[0]->children[0]);
+  NANOARROW_THROW_NOT_OK(ArrowArrayFinishBuildingDefault(expected_arr.get(), nullptr));
+
+  ArrowArrayView expected, actual;
+  NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, expected_schema.get(), nullptr));
+  NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&expected, expected_arr.get(), nullptr));
+
+  NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, expected_schema.get(), nullptr));
+  NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr));
+  compare_arrays(&expected, &actual);
+  ArrowArrayViewReset(&actual);
+
+  got_arrow_host = cudf::to_arrow_host(input_view.column(0));
+  NANOARROW_THROW_NOT_OK(
+    ArrowArrayViewInitFromSchema(&actual, expected_schema->children[0], nullptr));
+  NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr));
+  compare_arrays(expected.children[0], &actual);
+  ArrowArrayViewReset(&actual);
+
+  ArrowArrayViewReset(&expected);
+}
+
+TEST_F(ToArrowHostDeviceTest, StructColumn)
+{
+  // Create cudf table
+  auto nested_type_field_names =
+    std::vector<std::vector<std::string>>{{"string", "integral", "bool", "nested_list", "struct"}};
+  auto str_col =
+    cudf::test::strings_column_wrapper{
+      "Samuel Vimes", "Carrot Ironfoundersson", "Angua von Überwald"}
+      .release();
+  auto str_col2 =
+    cudf::test::strings_column_wrapper{{"CUDF", "ROCKS", "EVERYWHERE"}, {0, 1, 0}}.release();
+  int num_rows{str_col->size()};
+  auto int_col = cudf::test::fixed_width_column_wrapper<int32_t, int32_t>{{48, 27, 25}}.release();
+  auto int_col2 =
+    cudf::test::fixed_width_column_wrapper<int32_t, int32_t>{{12, 24, 47}, {1, 0, 1}}.release();
+  auto bool_col = cudf::test::fixed_width_column_wrapper<bool>{{true, true, false}}.release();
+  auto list_col =
+    cudf::test::lists_column_wrapper<int64_t>({{{1, 2}, {3, 4}, {5}}, {{{6}}}, {{7}, {8, 9}}})
+      .release();
+  vector_of_columns cols2;
+  cols2.push_back(std::move(str_col2));
+  cols2.push_back(std::move(int_col2));
+  auto [null_mask, null_count] =
+    cudf::bools_to_mask(cudf::test::fixed_width_column_wrapper<bool>{{true, true, false}});
+  auto sub_struct_col =
+    cudf::make_structs_column(num_rows, std::move(cols2), null_count, std::move(*null_mask));
+  vector_of_columns cols;
+  cols.push_back(std::move(str_col));
+  cols.push_back(std::move(int_col));
+  cols.push_back(std::move(bool_col));
+  cols.push_back(std::move(list_col));
+  cols.push_back(std::move(sub_struct_col));
+
+  auto struct_col = cudf::make_structs_column(num_rows, std::move(cols), 0, {});
+  cudf::table_view input_view({struct_col->view()});
+
+  nanoarrow::UniqueSchema expected_schema;
+  ArrowSchemaInit(expected_schema.get());
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1));
+
+  ArrowSchemaInit(expected_schema->children[0]);
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema->children[0], 5));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a"));
+  expected_schema->children[0]->flags = 0;
+
+  auto child = expected_schema->children[0];
+  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(child->children[0], NANOARROW_TYPE_STRING));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[0], "string"));
+  child->children[0]->flags = 0;
+
+  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(child->children[1], NANOARROW_TYPE_INT32));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[1], "integral"));
+  child->children[1]->flags = 0;
+
+  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(child->children[2], NANOARROW_TYPE_BOOL));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[2], "bool"));
+  child->children[2]->flags = 0;
+
+  NANOARROW_THROW_NOT_OK(ArrowSchemaInitFromType(child->children[3], NANOARROW_TYPE_LIST));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[3], "nested_list"));
+  child->children[3]->flags = 0;
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaInitFromType(child->children[3]->children[0], NANOARROW_TYPE_LIST));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[3]->children[0], "element"));
+  child->children[3]->children[0]->flags = 0;
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaInitFromType(child->children[3]->children[0]->children[0], NANOARROW_TYPE_INT64));
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaSetName(child->children[3]->children[0]->children[0], "element"));
+  child->children[3]->children[0]->children[0]->flags = 0;
+
+  ArrowSchemaInit(child->children[4]);
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(child->children[4], 2));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[4], "struct"));
+
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaInitFromType(child->children[4]->children[0], NANOARROW_TYPE_STRING));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[4]->children[0], "string2"));
+  NANOARROW_THROW_NOT_OK(
+    ArrowSchemaInitFromType(child->children[4]->children[1], NANOARROW_TYPE_INT32));
+  NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(child->children[4]->children[1], "integral2"));
+
+  // create nanoarrow table
+  // first our underlying arrays
+  std::vector<std::string> str{"Samuel Vimes", "Carrot Ironfoundersson", "Angua von Überwald"};
+  std::vector<std::string> str2{"CUDF", "ROCKS", "EVERYWHERE"};
+  auto str_array  = get_nanoarrow_array<cudf::string_view>(str);
+  auto int_array  = get_nanoarrow_array<int32_t>({48, 27, 25});
+  auto str2_array = get_nanoarrow_array<cudf::string_view>(str2, {0, 1, 0});
+  // struct null will get pushed down and superimposed on this array
+  auto int2_array = get_nanoarrow_array<int32_t, uint8_t>({12, 24, 47}, {1, 0, 0});
+  auto bool_array = get_nanoarrow_array<bool>({true, true, false});
+  auto list_arr =
+    get_nanoarrow_list_array<int64_t>({1, 2, 3, 4, 5, 6, 7, 8, 9}, {0, 2, 4, 5, 6, 7, 9});
+  std::vector<int32_t> offset{0, 3, 4, 6};
+
+  nanoarrow::UniqueArray expected_arr;
+  NANOARROW_THROW_NOT_OK(
+    ArrowArrayInitFromSchema(expected_arr.get(), expected_schema.get(), nullptr));
+  expected_arr->length = input_view.num_rows();
+
+  auto array_a        = expected_arr->children[0];
+  auto view_a         = input_view.column(0);
+  array_a->length     = view_a.size();
+  array_a->null_count = view_a.null_count();
+
+  str_array.move(array_a->children[0]);
+  int_array.move(array_a->children[1]);
+  bool_array.move(array_a->children[2]);
+
+  array_a->children[3]->length     = input_view.num_rows();
+  array_a->children[3]->null_count = 0;
+
+  auto offset_buf = ArrowArrayBuffer(array_a->children[3], 1);
+  EXPECT_EQ(
+    NANOARROW_OK,
+    ArrowBufferAppend(
+      offset_buf, reinterpret_cast<void const*>(offset.data()), offset.size() * sizeof(int32_t)));
+  list_arr.move(array_a->children[3]->children[0]);
+
+  ArrowBitmap mask;
+  ArrowBitmapInit(&mask);
+  NANOARROW_THROW_NOT_OK(ArrowBitmapReserve(&mask, 3));
+  NANOARROW_THROW_NOT_OK(ArrowBitmapAppend(&mask, 1, 2));
+  NANOARROW_THROW_NOT_OK(ArrowBitmapAppend(&mask, 0, 1));
+
+  auto array_struct = array_a->children[4];
+  auto view_struct  = view_a.child(4);
+  ArrowArraySetValidityBitmap(array_struct, &mask);
+  array_struct->null_count = view_struct.null_count();
+  array_struct->length     = view_struct.size();
+
+  str2_array.move(array_struct->children[0]);
+  int2_array.move(array_struct->children[1]);
+
+  NANOARROW_THROW_NOT_OK(ArrowArrayFinishBuildingDefault(expected_arr.get(), nullptr));
+
+  auto got_arrow_host = cudf::to_arrow_host(input_view);
+  EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type);
+  EXPECT_EQ(-1, got_arrow_host->device_id);
+  EXPECT_EQ(nullptr, got_arrow_host->sync_event);
+
+  ArrowArrayView expected, actual;
+  NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, expected_schema.get(), nullptr));
+  NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&expected, expected_arr.get(), nullptr));
+
+  NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, expected_schema.get(), nullptr));
+  NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr));
+  compare_arrays(&expected, &actual);
+  ArrowArrayViewReset(&actual);
+
+  got_arrow_host = cudf::to_arrow_host(input_view.column(0));
+  NANOARROW_THROW_NOT_OK(
+    ArrowArrayViewInitFromSchema(&actual, expected_schema->children[0], nullptr));
+  NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr));
+  compare_arrays(expected.children[0], &actual);
+  ArrowArrayViewReset(&actual);
+
+  ArrowArrayViewReset(&expected);
+}
+
+template <typename T>
+using fp_wrapper = cudf::test::fixed_point_column_wrapper<T>;
+
+TEST_F(ToArrowHostDeviceTest, FixedPoint32Table)
+{
+  using namespace numeric;
+
+  for (auto const scale : {3, 2, 1, 0, -1, -2, -3}) {
+    auto const col   = fp_wrapper<int32_t>({-1, 2, 3, 4, 5, 6}, scale_type{scale});
+    auto const input = cudf::table_view({col});
+
+    auto const data = std::vector<__int128_t>{-1, 2, 3, 4, 5, 6};
+    nanoarrow::UniqueSchema expected_schema;
+    ArrowSchemaInit(expected_schema.get());
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1));
+    ArrowSchemaInit(expected_schema->children[0]);
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDecimal(expected_schema->children[0],
+                                                     NANOARROW_TYPE_DECIMAL128,
+                                                     cudf::detail::max_precision<int32_t>(),
+                                                     -scale));
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a"));
+    expected_schema->children[0]->flags = 0;
+
+    nanoarrow::UniqueArray expected_array;
+    NANOARROW_THROW_NOT_OK(
+      ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr));
+    expected_array->length = input.num_rows();
+
+    get_nanoarrow_array<__int128_t>(data).move(expected_array->children[0]);
+    NANOARROW_THROW_NOT_OK(ArrowArrayFinishBuildingDefault(expected_array.get(), nullptr));
+
+    auto got_arrow_host = cudf::to_arrow_host(input);
+    EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type);
+    EXPECT_EQ(-1, got_arrow_host->device_id);
+    EXPECT_EQ(nullptr, got_arrow_host->sync_event);
+
+    ArrowArrayView expected, actual;
+    NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, expected_schema.get(), nullptr));
+    NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&expected, expected_array.get(), nullptr));
+
+    NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, expected_schema.get(), nullptr));
+    NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr));
+    compare_arrays(&expected, &actual);
+    ArrowArrayViewReset(&actual);
+
+    got_arrow_host = cudf::to_arrow_host(input.column(0));
+    NANOARROW_THROW_NOT_OK(
+      ArrowArrayViewInitFromSchema(&actual, expected_schema->children[0], nullptr));
+    NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr));
+    compare_arrays(expected.children[0], &actual);
+    ArrowArrayViewReset(&actual);
+
+    ArrowArrayViewReset(&expected);
+  }
+}
+
+TEST_F(ToArrowHostDeviceTest, FixedPoint64Table)
+{
+  using namespace numeric;
+
+  for (auto const scale : {3, 2, 1, 0, -1, -2, -3}) {
+    auto const col   = fp_wrapper<int64_t>({-1, 2, 3, 4, 5, 6}, scale_type{scale});
+    auto const input = cudf::table_view({col});
+
+    auto const data = std::vector<__int128_t>{-1, 2, 3, 4, 5, 6};
+    nanoarrow::UniqueSchema expected_schema;
+    ArrowSchemaInit(expected_schema.get());
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1));
+    ArrowSchemaInit(expected_schema->children[0]);
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDecimal(expected_schema->children[0],
+                                                     NANOARROW_TYPE_DECIMAL128,
+                                                     cudf::detail::max_precision<int64_t>(),
+                                                     -scale));
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a"));
+    expected_schema->children[0]->flags = 0;
+
+    nanoarrow::UniqueArray expected_array;
+    NANOARROW_THROW_NOT_OK(
+      ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr));
+    expected_array->length = input.num_rows();
+
+    get_nanoarrow_array<__int128_t>(data).move(expected_array->children[0]);
+    NANOARROW_THROW_NOT_OK(ArrowArrayFinishBuildingDefault(expected_array.get(), nullptr));
+
+    auto got_arrow_host = cudf::to_arrow_host(input);
+    EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type);
+    EXPECT_EQ(-1, got_arrow_host->device_id);
+    EXPECT_EQ(nullptr, got_arrow_host->sync_event);
+
+    ArrowArrayView expected, actual;
+    NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, expected_schema.get(), nullptr));
+    NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&expected, expected_array.get(), nullptr));
+
+    NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, expected_schema.get(), nullptr));
+    NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr));
+    compare_arrays(&expected, &actual);
+    ArrowArrayViewReset(&actual);
+
+    got_arrow_host = cudf::to_arrow_host(input.column(0));
+    NANOARROW_THROW_NOT_OK(
+      ArrowArrayViewInitFromSchema(&actual, expected_schema->children[0], nullptr));
+    NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr));
+    compare_arrays(expected.children[0], &actual);
+    ArrowArrayViewReset(&actual);
+
+    ArrowArrayViewReset(&expected);
+  }
+}
+
+TEST_F(ToArrowHostDeviceTest, FixedPoint128Table)
+{
+  using namespace numeric;
+
+  for (auto const scale : {3, 2, 1, 0, -1, -2, -3}) {
+    auto const col   = fp_wrapper<__int128_t>({-1, 2, 3, 4, 5, 6}, scale_type{scale});
+    auto const input = cudf::table_view({col});
+
+    auto const data = std::vector<__int128_t>{-1, 2, 3, 4, 5, 6};
+
+    nanoarrow::UniqueSchema expected_schema;
+    ArrowSchemaInit(expected_schema.get());
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1));
+    ArrowSchemaInit(expected_schema->children[0]);
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDecimal(expected_schema->children[0],
+                                                     NANOARROW_TYPE_DECIMAL128,
+                                                     cudf::detail::max_precision<__int128_t>(),
+                                                     -scale));
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a"));
+    expected_schema->children[0]->flags = 0;
+
+    nanoarrow::UniqueArray expected_array;
+    NANOARROW_THROW_NOT_OK(
+      ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr));
+    expected_array->length = input.num_rows();
+
+    get_nanoarrow_array<__int128_t>(data).move(expected_array->children[0]);
+    NANOARROW_THROW_NOT_OK(ArrowArrayFinishBuildingDefault(expected_array.get(), nullptr));
+
+    auto got_arrow_host = cudf::to_arrow_host(input);
+    EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type);
+    EXPECT_EQ(-1, got_arrow_host->device_id);
+    EXPECT_EQ(nullptr, got_arrow_host->sync_event);
+
+    ArrowArrayView expected, actual;
+    NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, expected_schema.get(), nullptr));
+    NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&expected, expected_array.get(), nullptr));
+
+    NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, expected_schema.get(), nullptr));
+    NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr));
+    compare_arrays(&expected, &actual);
+    ArrowArrayViewReset(&actual);
+
+    got_arrow_host = cudf::to_arrow_host(input.column(0));
+    NANOARROW_THROW_NOT_OK(
+      ArrowArrayViewInitFromSchema(&actual, expected_schema->children[0], nullptr));
+    NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr));
+    compare_arrays(expected.children[0], &actual);
+    ArrowArrayViewReset(&actual);
+
+    ArrowArrayViewReset(&expected);
+  }
+}
+
+TEST_F(ToArrowHostDeviceTest, FixedPoint32TableLarge)
+{
+  using namespace numeric;
+  auto constexpr NUM_ELEMENTS = 1000;
+
+  for (auto const scale : {3, 2, 1, 0, -1, -2, -3}) {
+    auto const iota  = thrust::make_counting_iterator(1);
+    auto const col   = fp_wrapper<int32_t>(iota, iota + NUM_ELEMENTS, scale_type{scale});
+    auto const input = cudf::table_view({col});
+
+    auto expect_data = std::vector<__int128_t>(NUM_ELEMENTS);
+    std::iota(expect_data.begin(), expect_data.end(), 1);
+
+    nanoarrow::UniqueSchema expected_schema;
+    ArrowSchemaInit(expected_schema.get());
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1));
+    ArrowSchemaInit(expected_schema->children[0]);
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDecimal(expected_schema->children[0],
+                                                     NANOARROW_TYPE_DECIMAL128,
+                                                     cudf::detail::max_precision<int32_t>(),
+                                                     -scale));
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a"));
+    expected_schema->children[0]->flags = 0;
+
+    nanoarrow::UniqueArray expected_array;
+    NANOARROW_THROW_NOT_OK(
+      ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr));
+    expected_array->length = input.num_rows();
+
+    get_nanoarrow_array<__int128_t>(expect_data).move(expected_array->children[0]);
+    NANOARROW_THROW_NOT_OK(ArrowArrayFinishBuildingDefault(expected_array.get(), nullptr));
+
+    auto got_arrow_host = cudf::to_arrow_host(input);
+    EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type);
+    EXPECT_EQ(-1, got_arrow_host->device_id);
+    EXPECT_EQ(nullptr, got_arrow_host->sync_event);
+
+    ArrowArrayView expected, actual;
+    NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, expected_schema.get(), nullptr));
+    NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&expected, expected_array.get(), nullptr));
+
+    NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, expected_schema.get(), nullptr));
+    NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr));
+    compare_arrays(&expected, &actual);
+    ArrowArrayViewReset(&actual);
+
+    got_arrow_host = cudf::to_arrow_host(input.column(0));
+    NANOARROW_THROW_NOT_OK(
+      ArrowArrayViewInitFromSchema(&actual, expected_schema->children[0], nullptr));
+    NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr));
+    compare_arrays(expected.children[0], &actual);
+    ArrowArrayViewReset(&actual);
+
+    ArrowArrayViewReset(&expected);
+  }
+}
+
+TEST_F(ToArrowHostDeviceTest, FixedPoint64TableLarge)
+{
+  using namespace numeric;
+  auto constexpr NUM_ELEMENTS = 1000;
+
+  for (auto const scale : {3, 2, 1, 0, -1, -2, -3}) {
+    auto const iota  = thrust::make_counting_iterator(1);
+    auto const col   = fp_wrapper<int64_t>(iota, iota + NUM_ELEMENTS, scale_type{scale});
+    auto const input = cudf::table_view({col});
+
+    auto expect_data = std::vector<__int128_t>(NUM_ELEMENTS);
+    std::iota(expect_data.begin(), expect_data.end(), 1);
+
+    nanoarrow::UniqueSchema expected_schema;
+    ArrowSchemaInit(expected_schema.get());
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1));
+    ArrowSchemaInit(expected_schema->children[0]);
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDecimal(expected_schema->children[0],
+                                                     NANOARROW_TYPE_DECIMAL128,
+                                                     cudf::detail::max_precision<int64_t>(),
+                                                     -scale));
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a"));
+    expected_schema->children[0]->flags = 0;
+
+    nanoarrow::UniqueArray expected_array;
+    NANOARROW_THROW_NOT_OK(
+      ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr));
+    expected_array->length = input.num_rows();
+
+    get_nanoarrow_array<__int128_t>(expect_data).move(expected_array->children[0]);
+    NANOARROW_THROW_NOT_OK(ArrowArrayFinishBuildingDefault(expected_array.get(), nullptr));
+
+    auto got_arrow_host = cudf::to_arrow_host(input);
+    EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type);
+    EXPECT_EQ(-1, got_arrow_host->device_id);
+    EXPECT_EQ(nullptr, got_arrow_host->sync_event);
+
+    ArrowArrayView expected, actual;
+    NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, expected_schema.get(), nullptr));
+    NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&expected, expected_array.get(), nullptr));
+
+    NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, expected_schema.get(), nullptr));
+    NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr));
+    compare_arrays(&expected, &actual);
+    ArrowArrayViewReset(&actual);
+
+    got_arrow_host = cudf::to_arrow_host(input.column(0));
+    NANOARROW_THROW_NOT_OK(
+      ArrowArrayViewInitFromSchema(&actual, expected_schema->children[0], nullptr));
+    NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr));
+    compare_arrays(expected.children[0], &actual);
+    ArrowArrayViewReset(&actual);
+
+    ArrowArrayViewReset(&expected);
+  }
+}
+
+TEST_F(ToArrowHostDeviceTest, FixedPoint128TableLarge)
+{
+  using namespace numeric;
+  auto constexpr NUM_ELEMENTS = 1000;
+
+  for (auto const scale : {3, 2, 1, 0, -1, -2, -3}) {
+    auto const iota  = thrust::make_counting_iterator(1);
+    auto const col   = fp_wrapper<__int128_t>(iota, iota + NUM_ELEMENTS, scale_type{scale});
+    auto const input = cudf::table_view({col});
+
+    auto expect_data = std::vector<__int128_t>(NUM_ELEMENTS);
+    std::iota(expect_data.begin(), expect_data.end(), 1);
+
+    nanoarrow::UniqueSchema expected_schema;
+    ArrowSchemaInit(expected_schema.get());
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1));
+    ArrowSchemaInit(expected_schema->children[0]);
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDecimal(expected_schema->children[0],
+                                                     NANOARROW_TYPE_DECIMAL128,
+                                                     cudf::detail::max_precision<__int128_t>(),
+                                                     -scale));
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a"));
+    expected_schema->children[0]->flags = 0;
+
+    nanoarrow::UniqueArray expected_array;
+    NANOARROW_THROW_NOT_OK(
+      ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr));
+    expected_array->length = input.num_rows();
+
+    get_nanoarrow_array<__int128_t>(expect_data).move(expected_array->children[0]);
+    NANOARROW_THROW_NOT_OK(ArrowArrayFinishBuildingDefault(expected_array.get(), nullptr));
+
+    auto got_arrow_host = cudf::to_arrow_host(input);
+    EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type);
+    EXPECT_EQ(-1, got_arrow_host->device_id);
+    EXPECT_EQ(nullptr, got_arrow_host->sync_event);
+
+    ArrowArrayView expected, actual;
+    NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, expected_schema.get(), nullptr));
+    NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&expected, expected_array.get(), nullptr));
+
+    NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, expected_schema.get(), nullptr));
+    NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr));
+    compare_arrays(&expected, &actual);
+    ArrowArrayViewReset(&actual);
+
+    got_arrow_host = cudf::to_arrow_host(input.column(0));
+    NANOARROW_THROW_NOT_OK(
+      ArrowArrayViewInitFromSchema(&actual, expected_schema->children[0], nullptr));
+    NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr));
+    compare_arrays(expected.children[0], &actual);
+    ArrowArrayViewReset(&actual);
+
+    ArrowArrayViewReset(&expected);
+  }
+}
+
+TEST_F(ToArrowHostDeviceTest, FixedPoint32TableNullsSimple)
+{
+  using namespace numeric;
+
+  for (auto const scale : {3, 2, 1, 0, -1, -2, -3}) {
+    auto const data     = std::vector<__int128_t>{1, 2, 3, 4, 5, 6, 0, 0};
+    auto const validity = std::vector<uint8_t>{1, 1, 1, 1, 1, 1, 0, 0};
+    auto const col =
+      fp_wrapper<int32_t>({1, 2, 3, 4, 5, 6, 0, 0}, {1, 1, 1, 1, 1, 1, 0, 0}, scale_type{scale});
+    auto const input = cudf::table_view({col});
+
+    nanoarrow::UniqueSchema expected_schema;
+    ArrowSchemaInit(expected_schema.get());
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1));
+    ArrowSchemaInit(expected_schema->children[0]);
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDecimal(expected_schema->children[0],
+                                                     NANOARROW_TYPE_DECIMAL128,
+                                                     cudf::detail::max_precision<int32_t>(),
+                                                     -scale));
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a"));
+    expected_schema->children[0]->flags = 0;
+
+    nanoarrow::UniqueArray expected_array;
+    NANOARROW_THROW_NOT_OK(
+      ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr));
+    expected_array->length = input.num_rows();
+
+    get_nanoarrow_array<__int128_t>(data, validity).move(expected_array->children[0]);
+    NANOARROW_THROW_NOT_OK(ArrowArrayFinishBuildingDefault(expected_array.get(), nullptr));
+
+    auto got_arrow_host = cudf::to_arrow_host(input);
+    EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type);
+    EXPECT_EQ(-1, got_arrow_host->device_id);
+    EXPECT_EQ(nullptr, got_arrow_host->sync_event);
+
+    ArrowArrayView expected, actual;
+    NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, expected_schema.get(), nullptr));
+    NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&expected, expected_array.get(), nullptr));
+
+    NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, expected_schema.get(), nullptr));
+    NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr));
+    compare_arrays(&expected, &actual);
+    ArrowArrayViewReset(&actual);
+
+    got_arrow_host = cudf::to_arrow_host(input.column(0));
+    NANOARROW_THROW_NOT_OK(
+      ArrowArrayViewInitFromSchema(&actual, expected_schema->children[0], nullptr));
+    NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr));
+    compare_arrays(expected.children[0], &actual);
+    ArrowArrayViewReset(&actual);
+
+    ArrowArrayViewReset(&expected);
+  }
+}
+
+TEST_F(ToArrowHostDeviceTest, FixedPoint64TableNullsSimple)
+{
+  using namespace numeric;
+
+  for (auto const scale : {3, 2, 1, 0, -1, -2, -3}) {
+    auto const data     = std::vector<__int128_t>{1, 2, 3, 4, 5, 6, 0, 0};
+    auto const validity = std::vector<uint8_t>{1, 1, 1, 1, 1, 1, 0, 0};
+    auto const col =
+      fp_wrapper<int64_t>({1, 2, 3, 4, 5, 6, 0, 0}, {1, 1, 1, 1, 1, 1, 0, 0}, scale_type{scale});
+    auto const input = cudf::table_view({col});
+
+    nanoarrow::UniqueSchema expected_schema;
+    ArrowSchemaInit(expected_schema.get());
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1));
+    ArrowSchemaInit(expected_schema->children[0]);
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDecimal(expected_schema->children[0],
+                                                     NANOARROW_TYPE_DECIMAL128,
+                                                     cudf::detail::max_precision<int64_t>(),
+                                                     -scale));
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a"));
+    expected_schema->children[0]->flags = 0;
+
+    nanoarrow::UniqueArray expected_array;
+    NANOARROW_THROW_NOT_OK(
+      ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr));
+    expected_array->length = input.num_rows();
+
+    get_nanoarrow_array<__int128_t>(data, validity).move(expected_array->children[0]);
+    NANOARROW_THROW_NOT_OK(ArrowArrayFinishBuildingDefault(expected_array.get(), nullptr));
+
+    auto got_arrow_host = cudf::to_arrow_host(input);
+    EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type);
+    EXPECT_EQ(-1, got_arrow_host->device_id);
+    EXPECT_EQ(nullptr, got_arrow_host->sync_event);
+
+    ArrowArrayView expected, actual;
+    NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, expected_schema.get(), nullptr));
+    NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&expected, expected_array.get(), nullptr));
+
+    NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, expected_schema.get(), nullptr));
+    NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr));
+    compare_arrays(&expected, &actual);
+    ArrowArrayViewReset(&actual);
+
+    got_arrow_host = cudf::to_arrow_host(input.column(0));
+    NANOARROW_THROW_NOT_OK(
+      ArrowArrayViewInitFromSchema(&actual, expected_schema->children[0], nullptr));
+    NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr));
+    compare_arrays(expected.children[0], &actual);
+    ArrowArrayViewReset(&actual);
+
+    ArrowArrayViewReset(&expected);
+  }
+}
+
+TEST_F(ToArrowHostDeviceTest, FixedPoint128TableNullsSimple)
+{
+  using namespace numeric;
+
+  for (auto const scale : {3, 2, 1, 0, -1, -2, -3}) {
+    auto const data     = std::vector<__int128_t>{1, 2, 3, 4, 5, 6, 0, 0};
+    auto const validity = std::vector<uint8_t>{1, 1, 1, 1, 1, 1, 0, 0};
+    auto const col =
+      fp_wrapper<__int128_t>({1, 2, 3, 4, 5, 6, 0, 0}, {1, 1, 1, 1, 1, 1, 0, 0}, scale_type{scale});
+    auto const input = cudf::table_view({col});
+
+    nanoarrow::UniqueSchema expected_schema;
+    ArrowSchemaInit(expected_schema.get());
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1));
+    ArrowSchemaInit(expected_schema->children[0]);
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDecimal(expected_schema->children[0],
+                                                     NANOARROW_TYPE_DECIMAL128,
+                                                     cudf::detail::max_precision<__int128_t>(),
+                                                     -scale));
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a"));
+    expected_schema->children[0]->flags = 0;
+
+    nanoarrow::UniqueArray expected_array;
+    NANOARROW_THROW_NOT_OK(
+      ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr));
+    expected_array->length = input.num_rows();
+
+    get_nanoarrow_array<__int128_t>(data, validity).move(expected_array->children[0]);
+    NANOARROW_THROW_NOT_OK(ArrowArrayFinishBuildingDefault(expected_array.get(), nullptr));
+
+    auto got_arrow_host = cudf::to_arrow_host(input);
+    EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type);
+    EXPECT_EQ(-1, got_arrow_host->device_id);
+    EXPECT_EQ(nullptr, got_arrow_host->sync_event);
+
+    ArrowArrayView expected, actual;
+    NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, expected_schema.get(), nullptr));
+    NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&expected, expected_array.get(), nullptr));
+
+    NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, expected_schema.get(), nullptr));
+    NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr));
+    compare_arrays(&expected, &actual);
+    ArrowArrayViewReset(&actual);
+
+    got_arrow_host = cudf::to_arrow_host(input.column(0));
+    NANOARROW_THROW_NOT_OK(
+      ArrowArrayViewInitFromSchema(&actual, expected_schema->children[0], nullptr));
+    NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr));
+    compare_arrays(expected.children[0], &actual);
+    ArrowArrayViewReset(&actual);
+
+    ArrowArrayViewReset(&expected);
+  }
+}
+
+struct ToArrowHostDeviceTestSlice
+  : public ToArrowHostDeviceTest,
+    public ::testing::WithParamInterface<std::tuple<cudf::size_type, cudf::size_type>> {};
+
+TEST_P(ToArrowHostDeviceTestSlice, SliceTest)
+{
+  auto [table, expected_schema, expected_array] = get_nanoarrow_host_tables(10000);
+  auto cudf_table_view                          = table->view();
+  auto const [start, end]                       = GetParam();
+
+  slice_host_nanoarrow(expected_array.get(), start, end);
+  auto sliced_cudf_table = cudf::slice(cudf_table_view, {start, end})[0];
+  auto got_arrow_host    = cudf::to_arrow_host(sliced_cudf_table);
+  EXPECT_EQ(ARROW_DEVICE_CPU, got_arrow_host->device_type);
+  EXPECT_EQ(-1, got_arrow_host->device_id);
+  EXPECT_EQ(nullptr, got_arrow_host->sync_event);
+
+  ArrowArrayView expected, actual;
+  NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&expected, expected_schema.get(), nullptr));
+  NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&expected, expected_array.get(), nullptr));
+
+  NANOARROW_THROW_NOT_OK(ArrowArrayViewInitFromSchema(&actual, expected_schema.get(), nullptr));
+  NANOARROW_THROW_NOT_OK(ArrowArrayViewSetArray(&actual, &got_arrow_host->array, nullptr));
+  compare_arrays(&expected, &actual);
+  ArrowArrayViewReset(&actual);
+
+  ArrowArrayViewReset(&expected);
+}
+
+INSTANTIATE_TEST_CASE_P(ToArrowHostDeviceTest,
+                        ToArrowHostDeviceTestSlice,
+                        ::testing::Values(std::make_tuple(0, 10000),
+                                          std::make_tuple(100, 3000),
+                                          std::make_tuple(0, 0),
+                                          std::make_tuple(0, 3000)));

From 743264f6ac924fdbec58fad666f989b14b901a98 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com>
Date: Wed, 24 Jul 2024 05:32:31 -0500
Subject: [PATCH 577/842] Warn on cuDF failure when `POLARS_VERBOSE` is true
 (#16308)

Just something quick to get us started here

Closes https://github.com/rapidsai/cudf/issues/16256

Authors:
  - https://github.com/brandon-b-miller
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/16308
---
 python/cudf_polars/cudf_polars/callback.py | 12 +++++++-
 python/cudf_polars/tests/test_config.py    | 34 ++++++++++++++++++++++
 2 files changed, 45 insertions(+), 1 deletion(-)
 create mode 100644 python/cudf_polars/tests/test_config.py

diff --git a/python/cudf_polars/cudf_polars/callback.py b/python/cudf_polars/cudf_polars/callback.py
index 764cdd3b3ca..f31193aa938 100644
--- a/python/cudf_polars/cudf_polars/callback.py
+++ b/python/cudf_polars/cudf_polars/callback.py
@@ -5,11 +5,15 @@
 
 from __future__ import annotations
 
+import os
+import warnings
 from functools import partial
 from typing import TYPE_CHECKING
 
 import nvtx
 
+from polars.exceptions import PerformanceWarning
+
 from cudf_polars.dsl.translate import translate_ir
 
 if TYPE_CHECKING:
@@ -61,6 +65,12 @@ def execute_with_cudf(
     try:
         with nvtx.annotate(message="ConvertIR", domain="cudf_polars"):
             nt.set_udf(partial(_callback, translate_ir(nt)))
-    except exception:
+    except exception as e:
+        if bool(int(os.environ.get("POLARS_VERBOSE", 0))):
+            warnings.warn(
+                f"Query execution with GPU not supported, reason: {type(e)}: {e}",
+                PerformanceWarning,
+                stacklevel=2,
+            )
         if raise_on_fail:
             raise
diff --git a/python/cudf_polars/tests/test_config.py b/python/cudf_polars/tests/test_config.py
new file mode 100644
index 00000000000..5b4bba55552
--- /dev/null
+++ b/python/cudf_polars/tests/test_config.py
@@ -0,0 +1,34 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+import pytest
+
+import polars as pl
+
+from cudf_polars.dsl.ir import IR
+from cudf_polars.testing.asserts import (
+    assert_gpu_result_equal,
+    assert_ir_translation_raises,
+)
+
+
+def test_polars_verbose_warns(monkeypatch):
+    def raise_unimplemented(self):
+        raise NotImplementedError("We don't support this")
+
+    monkeypatch.setattr(IR, "__post_init__", raise_unimplemented)
+    q = pl.LazyFrame({})
+    # Ensure that things raise
+    assert_ir_translation_raises(q, NotImplementedError)
+    with (
+        pl.Config(verbose=True),
+        pytest.raises(pl.exceptions.ComputeError),
+        pytest.warns(
+            pl.exceptions.PerformanceWarning,
+            match="Query execution with GPU not supported",
+        ),
+    ):
+        # And ensure that collecting issues the correct warning.
+        assert_gpu_result_equal(q)

From 7191b74ce244518f17ef65e701f5a262f1c5cf8a Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 24 Jul 2024 03:55:48 -1000
Subject: [PATCH 578/842] Align Index __init__ APIs with pandas 2.x (#16362)

* It would be nice to have `Index`'s constructor to not go through `IndexMeta.__call__`, but I think that would be a separate effort
* There were a couple `verify_integrity` keyword arguments added that don't raise a `NotImplementedError` since there's not support, but I don't think it's worth making this case falling back in `cudf.pandas` as it's just a validation and won't affect further behavior with the object

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16362
---
 docs/cudf/source/conf.py            |  1 +
 python/cudf/cudf/core/index.py      | 48 ++++++++++++++++++++++-------
 python/cudf/cudf/core/multiindex.py |  2 +-
 3 files changed, 39 insertions(+), 12 deletions(-)

diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py
index c3c14ac8cad..f544536fb31 100644
--- a/docs/cudf/source/conf.py
+++ b/docs/cudf/source/conf.py
@@ -556,6 +556,7 @@ def on_missing_reference(app, env, node, contnode):
     ("py:class", "Dtype"),
     # The following are erroneously warned due to
     # https://github.com/sphinx-doc/sphinx/issues/11225
+    ("py:obj", "cudf.Index.values_host"),
     ("py:class", "pa.Array"),
     ("py:class", "ScalarLike"),
     ("py:class", "ParentType"),
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 73b7298410a..1c48b8f4f2d 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -78,6 +78,11 @@ class IndexMeta(type):
     """Custom metaclass for Index that overrides instance/subclass tests."""
 
     def __call__(cls, data, *args, **kwargs):
+        if kwargs.get("tupleize_cols", True) is not True:
+            raise NotImplementedError(
+                "tupleize_cols is currently not supported."
+            )
+
         if cls is Index:
             return as_index(
                 arbitrary=data,
@@ -997,21 +1002,23 @@ def __dask_tokenize__(self):
 
 class Index(SingleColumnFrame, BaseIndex, metaclass=IndexMeta):
     """
-    An array of orderable values that represent the indices of another Column
+    Immutable sequence used for indexing and alignment.
 
-    Attributes
-    ----------
-    _values: A Column object
-    name: A string
+    The basic object storing axis labels for all pandas objects.
 
     Parameters
     ----------
-    data : Column
-        The Column of data for this index
-    name : str optional
-        The name of the Index. If not provided, the Index adopts the value
-        Column's name. Otherwise if this name is different from the value
-        Column's, the data Column will be cloned to adopt this name.
+    data : array-like (1-dimensional)
+    dtype : str, numpy.dtype, or ExtensionDtype, optional
+        Data type for the output Index. If not specified, this will be
+        inferred from `data`.
+    copy : bool, default False
+        Copy input data.
+    name : object
+        Name to be stored in the index.
+    tupleize_cols : bool (default: True)
+        When True, attempt to create a MultiIndex if possible.
+        Currently not supported.
     """
 
     @_performance_tracking
@@ -1735,8 +1742,18 @@ def __init__(
         if tz is not None:
             raise NotImplementedError("tz is not yet supported")
         if normalize is not False:
+            warnings.warn(
+                "The 'normalize' keyword is "
+                "deprecated and will be removed in a future version. ",
+                FutureWarning,
+            )
             raise NotImplementedError("normalize == True is not yet supported")
         if closed is not None:
+            warnings.warn(
+                "The 'closed' keyword is "
+                "deprecated and will be removed in a future version. ",
+                FutureWarning,
+            )
             raise NotImplementedError("closed is not yet supported")
         if ambiguous != "raise":
             raise NotImplementedError("ambiguous is not yet supported")
@@ -2480,6 +2497,14 @@ def __init__(
         if freq is not None:
             raise NotImplementedError("freq is not yet supported")
 
+        if closed is not None:
+            warnings.warn(
+                "The 'closed' keyword is "
+                "deprecated and will be removed in a future version. ",
+                FutureWarning,
+            )
+            raise NotImplementedError("closed is not yet supported")
+
         if unit is not None:
             warnings.warn(
                 "The 'unit' keyword is "
@@ -2863,6 +2888,7 @@ def __init__(
         dtype=None,
         copy: bool = False,
         name=None,
+        verify_integrity: bool = True,
     ):
         name = _getdefault_name(data, name=name)
 
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index ff4b06c6334..dfc596bf279 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -150,7 +150,7 @@ def __init__(
         dtype=None,
         copy=False,
         name=None,
-        **kwargs,
+        verify_integrity=True,
     ):
         if sortorder is not None:
             raise NotImplementedError("sortorder is not yet supported")

From 8fcf72a787acb0168c97d11b8ab9130146e9b37e Mon Sep 17 00:00:00 2001
From: Alessandro Bellina <abellina@nvidia.com>
Date: Wed, 24 Jul 2024 12:06:29 -0500
Subject: [PATCH 579/842] [JNI] Add setKernelPinnedCopyThreshold and
 setPinnedAllocationThreshold (#16288)

In 24.08 two new cuDF methods are being added, and the second method is still in flight (see: https://github.com/rapidsai/cudf/pull/16206):

```
cudf::set_kernel_pinned_copy_threshold
cudf::set_allocate_host_as_pinned_threshold
```

We'd like to expose these methods in our JNI layer. I created a Cudf.java with the two static methods, and put the definitions in CudfJni.cpp.

Marked as draft since I need https://github.com/rapidsai/cudf/pull/16206 to merge, and we are still testing it.

Authors:
  - Alessandro Bellina (https://github.com/abellina)
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - Robert (Bobby) Evans (https://github.com/revans2)
  - Jason Lowe (https://github.com/jlowe)

URL: https://github.com/rapidsai/cudf/pull/16288
---
 java/src/main/java/ai/rapids/cudf/Cudf.java | 36 +++++++++++++++++++++
 java/src/main/native/src/CudfJni.cpp        | 25 ++++++++++++++
 2 files changed, 61 insertions(+)
 create mode 100644 java/src/main/java/ai/rapids/cudf/Cudf.java

diff --git a/java/src/main/java/ai/rapids/cudf/Cudf.java b/java/src/main/java/ai/rapids/cudf/Cudf.java
new file mode 100644
index 00000000000..d09e2f87ed4
--- /dev/null
+++ b/java/src/main/java/ai/rapids/cudf/Cudf.java
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package ai.rapids.cudf;
+
+public class Cudf {
+
+  static {
+    NativeDepsLoader.loadNativeDeps();
+  }
+
+  /**
+   * cuDF copies that are smaller than the threshold will use a kernel to copy, instead
+   * of cudaMemcpyAsync.
+   */
+  public static native void setKernelPinnedCopyThreshold(long kernelPinnedCopyThreshold);
+
+  /**
+   * cudf allocations that are smaller than the threshold will use the pinned host
+   * memory resource.
+   */
+  public static native void setPinnedAllocationThreshold(long pinnedAllocationThreshold);
+}
diff --git a/java/src/main/native/src/CudfJni.cpp b/java/src/main/native/src/CudfJni.cpp
index 698a8f6ff02..2860dc2e4b2 100644
--- a/java/src/main/native/src/CudfJni.cpp
+++ b/java/src/main/native/src/CudfJni.cpp
@@ -18,6 +18,7 @@
 
 #include <cudf/copying.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/pinned_memory.hpp>
 
 #include <sstream>
 
@@ -201,4 +202,28 @@ JNIEXPORT jboolean JNICALL Java_ai_rapids_cudf_Cuda_isPtdsEnabled(JNIEnv* env, j
   return cudf::jni::is_ptds_enabled;
 }
 
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cudf_setKernelPinnedCopyThreshold(JNIEnv* env,
+                                                                             jclass clazz,
+                                                                             jlong jthreshold)
+{
+  try {
+    cudf::jni::auto_set_device(env);
+    auto threshold = static_cast<std::size_t>(jthreshold);
+    cudf::set_kernel_pinned_copy_threshold(threshold);
+  }
+  CATCH_STD(env, )
+}
+
+JNIEXPORT void JNICALL Java_ai_rapids_cudf_Cudf_setPinnedAllocationThreshold(JNIEnv* env,
+                                                                             jclass clazz,
+                                                                             jlong jthreshold)
+{
+  try {
+    cudf::jni::auto_set_device(env);
+    auto threshold = static_cast<std::size_t>(jthreshold);
+    cudf::set_allocate_host_as_pinned_threshold(threshold);
+  }
+  CATCH_STD(env, )
+}
+
 }  // extern "C"

From 73937fbabaeea76665663ed23688b1cac61b7ee9 Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Wed, 24 Jul 2024 16:42:00 -0400
Subject: [PATCH 580/842] Migrate lists/filling to pylibcudf (#16189)

Apart of #15162

Authors:
  - Matthew Murray (https://github.com/Matt711)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Thomas Li (https://github.com/lithomas1)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16189
---
 .../_lib/pylibcudf/libcudf/lists/filling.pxd  | 19 ++++++++++
 python/cudf/cudf/_lib/pylibcudf/lists.pxd     |  2 +
 python/cudf/cudf/_lib/pylibcudf/lists.pyx     | 38 +++++++++++++++++++
 .../cudf/cudf/pylibcudf_tests/test_lists.py   | 16 ++++++++
 4 files changed, 75 insertions(+)
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/libcudf/lists/filling.pxd

diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/filling.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/filling.pxd
new file mode 100644
index 00000000000..8403fd179f7
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/filling.pxd
@@ -0,0 +1,19 @@
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+
+
+cdef extern from "cudf/lists/filling.hpp" namespace "cudf::lists" nogil:
+    cdef unique_ptr[column] sequences(
+        const column_view& starts,
+        const column_view& sizes,
+    ) except +
+
+    cdef unique_ptr[column] sequences(
+        const column_view& starts,
+        const column_view& steps,
+        const column_view& sizes,
+    ) except +
diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pxd b/python/cudf/cudf/_lib/pylibcudf/lists.pxd
index cacecae6010..6e9bd5ff76b 100644
--- a/python/cudf/cudf/_lib/pylibcudf/lists.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/lists.pxd
@@ -36,4 +36,6 @@ cpdef Column extract_list_element(Column, ColumnOrSizeType)
 
 cpdef Column count_elements(Column)
 
+cpdef Column sequences(Column, Column, Column steps = *)
+
 cpdef Column sort_lists(Column, bool, null_order, bool stable = *)
diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pyx b/python/cudf/cudf/_lib/pylibcudf/lists.pyx
index b5661a3e634..3837eaaca78 100644
--- a/python/cudf/cudf/_lib/pylibcudf/lists.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/lists.pyx
@@ -9,6 +9,7 @@ from cudf._lib.pylibcudf.libcudf.column.column cimport column
 from cudf._lib.pylibcudf.libcudf.lists cimport (
     contains as cpp_contains,
     explode as cpp_explode,
+    filling as cpp_filling,
     gather as cpp_gather,
     reverse as cpp_reverse,
 )
@@ -326,6 +327,43 @@ cpdef Column count_elements(Column input):
     return Column.from_libcudf(move(c_result))
 
 
+cpdef Column sequences(Column starts, Column sizes, Column steps = None):
+    """Create a lists column in which each row contains a sequence of
+    values specified by a tuple of (start, step, size) parameters.
+
+    For details, see :cpp:func:`sequences`.
+
+    Parameters
+    ----------
+    starts : Column
+        First values in the result sequences.
+    sizes : Column
+        Numbers of values in the result sequences.
+    steps : Optional[Column]
+        Increment values for the result sequences.
+
+    Returns
+    -------
+    Column
+        The result column containing generated sequences.
+    """
+    cdef unique_ptr[column] c_result
+
+    if steps is not None:
+        with nogil:
+            c_result = move(cpp_filling.sequences(
+                starts.view(),
+                steps.view(),
+                sizes.view(),
+            ))
+    else:
+        with nogil:
+            c_result = move(cpp_filling.sequences(
+                starts.view(),
+                sizes.view(),
+            ))
+    return Column.from_libcudf(move(c_result))
+
 cpdef Column sort_lists(
     Column input,
     bool ascending,
diff --git a/python/cudf/cudf/pylibcudf_tests/test_lists.py b/python/cudf/cudf/pylibcudf_tests/test_lists.py
index 87472f6d59b..0b2e0e00ce8 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_lists.py
+++ b/python/cudf/cudf/pylibcudf_tests/test_lists.py
@@ -198,6 +198,22 @@ def test_count_elements(test_data):
     assert_column_eq(expect, res)
 
 
+def test_sequences():
+    starts = plc.interop.from_arrow(pa.array([0, 1, 2, 3, 4]))
+    steps = plc.interop.from_arrow(pa.array([2, 1, 1, 1, -3]))
+    sizes = plc.interop.from_arrow(pa.array([0, 2, 2, 1, 3]))
+
+    res1 = plc.lists.sequences(starts, sizes, steps)
+    res2 = plc.lists.sequences(starts, sizes)
+
+    expect1 = pa.array([[], [1, 2], [2, 3], [3], [4, 1, -2]])
+    expect2 = pa.array([[], [1, 2], [2, 3], [3], [4, 5, 6]])
+
+    assert_column_eq(expect1, res1)
+
+    assert_column_eq(expect2, res2)
+
+
 @pytest.mark.parametrize(
     "ascending,na_position,expected",
     [

From 8bba6dfad239b4fd69a82acbc5dd7707ba576cce Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Wed, 24 Jul 2024 18:16:03 -0400
Subject: [PATCH 581/842] Migrate lists/set_operations to pylibcudf (#16190)

Apart of #15162

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Thomas Li (https://github.com/lithomas1)

URL: https://github.com/rapidsai/cudf/pull/16190
---
 .../libcudf/lists/set_operations.pxd          |  39 ++++
 python/cudf/cudf/_lib/pylibcudf/lists.pxd     |   8 +
 python/cudf/cudf/_lib/pylibcudf/lists.pyx     | 203 +++++++++++++++++-
 .../cudf/cudf/pylibcudf_tests/test_lists.py   |  90 ++++++++
 4 files changed, 339 insertions(+), 1 deletion(-)
 create mode 100644 python/cudf/cudf/_lib/pylibcudf/libcudf/lists/set_operations.pxd

diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/set_operations.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/set_operations.pxd
new file mode 100644
index 00000000000..eb796897f87
--- /dev/null
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/set_operations.pxd
@@ -0,0 +1,39 @@
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+
+from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport (
+    lists_column_view,
+)
+from cudf._lib.pylibcudf.libcudf.types cimport nan_equality, null_equality
+
+
+cdef extern from "cudf/lists/set_operations.hpp" namespace "cudf::lists" nogil:
+    cdef unique_ptr[column] difference_distinct(
+        const lists_column_view& lhs,
+        const lists_column_view& rhs,
+        null_equality nulls_equal,
+        nan_equality nans_equal
+    ) except +
+
+    cdef unique_ptr[column] have_overlap(
+        const lists_column_view& lhs,
+        const lists_column_view& rhs,
+        null_equality nulls_equal,
+        nan_equality nans_equal
+    ) except +
+
+    cdef unique_ptr[column] intersect_distinct(
+        const lists_column_view& lhs,
+        const lists_column_view& rhs,
+        null_equality nulls_equal,
+        nan_equality nans_equal
+    ) except +
+
+    cdef unique_ptr[column] union_distinct(
+        const lists_column_view& lhs,
+        const lists_column_view& rhs,
+        null_equality nulls_equal,
+        nan_equality nans_equal
+    ) except +
diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pxd b/python/cudf/cudf/_lib/pylibcudf/lists.pxd
index 6e9bd5ff76b..4e2406c2aea 100644
--- a/python/cudf/cudf/_lib/pylibcudf/lists.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/lists.pxd
@@ -39,3 +39,11 @@ cpdef Column count_elements(Column)
 cpdef Column sequences(Column, Column, Column steps = *)
 
 cpdef Column sort_lists(Column, bool, null_order, bool stable = *)
+
+cpdef Column difference_distinct(Column, Column, bool nulls_equal=*, bool nans_equal=*)
+
+cpdef Column have_overlap(Column, Column, bool nulls_equal=*, bool nans_equal=*)
+
+cpdef Column intersect_distinct(Column, Column, bool nulls_equal=*, bool nans_equal=*)
+
+cpdef Column union_distinct(Column, Column, bool nulls_equal=*, bool nans_equal=*)
diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pyx b/python/cudf/cudf/_lib/pylibcudf/lists.pyx
index 3837eaaca78..7555c8c6970 100644
--- a/python/cudf/cudf/_lib/pylibcudf/lists.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/lists.pyx
@@ -12,6 +12,7 @@ from cudf._lib.pylibcudf.libcudf.lists cimport (
     filling as cpp_filling,
     gather as cpp_gather,
     reverse as cpp_reverse,
+    set_operations as cpp_set_operations,
 )
 from cudf._lib.pylibcudf.libcudf.lists.combine cimport (
     concatenate_list_elements as cpp_concatenate_list_elements,
@@ -29,7 +30,13 @@ from cudf._lib.pylibcudf.libcudf.lists.sorting cimport (
     stable_sort_lists as cpp_stable_sort_lists,
 )
 from cudf._lib.pylibcudf.libcudf.table.table cimport table
-from cudf._lib.pylibcudf.libcudf.types cimport null_order, order, size_type
+from cudf._lib.pylibcudf.libcudf.types cimport (
+    nan_equality,
+    null_equality,
+    null_order,
+    order,
+    size_type,
+)
 from cudf._lib.pylibcudf.lists cimport ColumnOrScalar, ColumnOrSizeType
 
 from .column cimport Column, ListColumnView
@@ -413,3 +420,197 @@ cpdef Column sort_lists(
                     na_position,
             ))
     return Column.from_libcudf(move(c_result))
+
+
+cpdef Column difference_distinct(
+    Column lhs,
+    Column rhs,
+    bool nulls_equal=True,
+    bool nans_equal=True
+):
+    """Create a column of index values indicating the position of a search
+    key row within the corresponding list row in the lists column.
+
+    For details, see :cpp:func:`difference_distinct`.
+
+    Parameters
+    ----------
+    lhs : Column
+        The input lists column of elements that may be included.
+    rhs : Column
+        The input lists column of elements to exclude.
+    nulls_equal : bool, default True
+        If true, null elements are considered equal. Otherwise, unequal.
+    nans_equal : bool, default True
+        If true, libcudf will treat nan elements from {-nan, +nan}
+        as equal. Otherwise, unequal. Otherwise, unequal.
+
+    Returns
+    -------
+    Column
+        A lists column containing the difference results.
+    """
+    cdef unique_ptr[column] c_result
+    cdef ListColumnView lhs_view = lhs.list_view()
+    cdef ListColumnView rhs_view = rhs.list_view()
+
+    cdef null_equality c_nulls_equal = (
+        null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL
+    )
+    cdef nan_equality c_nans_equal = (
+        nan_equality.ALL_EQUAL if nans_equal else nan_equality.UNEQUAL
+    )
+
+    with nogil:
+        c_result = move(cpp_set_operations.difference_distinct(
+            lhs_view.view(),
+            rhs_view.view(),
+            c_nulls_equal,
+            c_nans_equal,
+        ))
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column have_overlap(
+    Column lhs,
+    Column rhs,
+    bool nulls_equal=True,
+    bool nans_equal=True
+):
+    """Check if lists at each row of the given lists columns overlap.
+
+    For details, see :cpp:func:`have_overlap`.
+
+    Parameters
+    ----------
+    lhs : Column
+        The input lists column for one side.
+    rhs : Column
+        The input lists column for the other side.
+    nulls_equal : bool, default True
+        If true, null elements are considered equal. Otherwise, unequal.
+    nans_equal : bool, default True
+        If true, libcudf will treat nan elements from {-nan, +nan}
+        as equal. Otherwise, unequal. Otherwise, unequal.
+
+    Returns
+    -------
+    Column
+        A column containing the check results.
+    """
+    cdef unique_ptr[column] c_result
+    cdef ListColumnView lhs_view = lhs.list_view()
+    cdef ListColumnView rhs_view = rhs.list_view()
+
+    cdef null_equality c_nulls_equal = (
+        null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL
+    )
+    cdef nan_equality c_nans_equal = (
+        nan_equality.ALL_EQUAL if nans_equal else nan_equality.UNEQUAL
+    )
+
+    with nogil:
+        c_result = move(cpp_set_operations.have_overlap(
+            lhs_view.view(),
+            rhs_view.view(),
+            c_nulls_equal,
+            c_nans_equal,
+        ))
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column intersect_distinct(
+    Column lhs,
+    Column rhs,
+    bool nulls_equal=True,
+    bool nans_equal=True
+):
+    """Create a lists column of distinct elements common to two input lists columns.
+
+    For details, see :cpp:func:`intersect_distinct`.
+
+    Parameters
+    ----------
+    lhs : Column
+        The input lists column of elements that may be included.
+    rhs : Column
+        The input lists column of elements to exclude.
+    nulls_equal : bool, default True
+        If true, null elements are considered equal. Otherwise, unequal.
+    nans_equal : bool, default True
+        If true, libcudf will treat nan elements from {-nan, +nan}
+        as equal. Otherwise, unequal. Otherwise, unequal.
+
+    Returns
+    -------
+    Column
+        A lists column containing the intersection results.
+    """
+    cdef unique_ptr[column] c_result
+    cdef ListColumnView lhs_view = lhs.list_view()
+    cdef ListColumnView rhs_view = rhs.list_view()
+
+    cdef null_equality c_nulls_equal = (
+        null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL
+    )
+    cdef nan_equality c_nans_equal = (
+        nan_equality.ALL_EQUAL if nans_equal else nan_equality.UNEQUAL
+    )
+
+    with nogil:
+        c_result = move(cpp_set_operations.intersect_distinct(
+            lhs_view.view(),
+            rhs_view.view(),
+            c_nulls_equal,
+            c_nans_equal,
+        ))
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column union_distinct(
+    Column lhs,
+    Column rhs,
+    bool nulls_equal=True,
+    bool nans_equal=True
+):
+    """Create a lists column of distinct elements found in
+    either of two input lists columns.
+
+    For details, see :cpp:func:`union_distinct`.
+
+    Parameters
+    ----------
+    lhs : Column
+        The input lists column of elements that may be included.
+    rhs : Column
+        The input lists column of elements to exclude.
+    nulls_equal : bool, default True
+        If true, null elements are considered equal. Otherwise, unequal.
+    nans_equal : bool, default True
+        If true, libcudf will treat nan elements from {-nan, +nan}
+        as equal. Otherwise, unequal. Otherwise, unequal.
+
+    Returns
+    -------
+    Column
+        A lists column containing the union results.
+    """
+    cdef unique_ptr[column] c_result
+    cdef ListColumnView lhs_view = lhs.list_view()
+    cdef ListColumnView rhs_view = rhs.list_view()
+
+    cdef null_equality c_nulls_equal = (
+        null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL
+    )
+    cdef nan_equality c_nans_equal = (
+        nan_equality.ALL_EQUAL if nans_equal else nan_equality.UNEQUAL
+    )
+
+    with nogil:
+        c_result = move(cpp_set_operations.union_distinct(
+            lhs_view.view(),
+            rhs_view.view(),
+            c_nulls_equal,
+            c_nans_equal,
+        ))
+    return Column.from_libcudf(move(c_result))
diff --git a/python/cudf/cudf/pylibcudf_tests/test_lists.py b/python/cudf/cudf/pylibcudf_tests/test_lists.py
index 0b2e0e00ce8..f135ab4ccff 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_lists.py
+++ b/python/cudf/cudf/pylibcudf_tests/test_lists.py
@@ -1,5 +1,6 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
+import numpy as np
 import pyarrow as pa
 import pytest
 from utils import assert_column_eq
@@ -22,6 +23,13 @@ def column():
     return pa.array([3, 2, 5, 6]), pa.array([-1, 0, 0, 0], type=pa.int32())
 
 
+@pytest.fixture
+def set_lists_column():
+    lhs = [[np.nan, np.nan, 2, 1, 2], [1, 2, 3], None, [4, None, 5]]
+    rhs = [[np.nan, 1, 2, 3], [4, 5], [None, 7, 8], [None, None]]
+    return lhs, rhs
+
+
 @pytest.fixture
 def lists_column():
     return [[4, 2, 3, 1], [1, 2, None, 4], [-10, 10, 10, 0]]
@@ -253,3 +261,85 @@ def test_sort_lists(lists_column, ascending, na_position, expected):
 
     assert_column_eq(expect, res)
     assert_column_eq(expect, res_stable)
+
+
+@pytest.mark.parametrize(
+    "set_operation,nans_equal,nulls_equal,expected",
+    [
+        (
+            plc.lists.difference_distinct,
+            True,
+            True,
+            [[], [1, 2, 3], None, [4, 5]],
+        ),
+        (
+            plc.lists.difference_distinct,
+            False,
+            True,
+            [[], [1, 2, 3], None, [4, None, 5]],
+        ),
+        (
+            plc.lists.have_overlap,
+            True,
+            True,
+            [True, False, None, True],
+        ),
+        (
+            plc.lists.have_overlap,
+            False,
+            False,
+            [True, False, None, False],
+        ),
+        (
+            plc.lists.intersect_distinct,
+            True,
+            True,
+            [[np.nan, 1, 2], [], None, [None]],
+        ),
+        (
+            plc.lists.intersect_distinct,
+            True,
+            False,
+            [[1, 2], [], None, [None]],
+        ),
+        (
+            plc.lists.union_distinct,
+            False,
+            True,
+            [
+                [np.nan, 2, 1, 3],
+                [1, 2, 3, 4, 5],
+                None,
+                [4, None, 5, None, None],
+            ],
+        ),
+        (
+            plc.lists.union_distinct,
+            False,
+            False,
+            [
+                [np.nan, np.nan, 2, 1, np.nan, 3],
+                [1, 2, 3, 4, 5],
+                None,
+                [4, None, 5, None, None],
+            ],
+        ),
+    ],
+)
+def test_set_operations(
+    set_lists_column, set_operation, nans_equal, nulls_equal, expected
+):
+    lhs, rhs = set_lists_column
+
+    res = set_operation(
+        plc.interop.from_arrow(pa.array(lhs)),
+        plc.interop.from_arrow(pa.array(rhs)),
+        nans_equal,
+        nulls_equal,
+    )
+
+    if set_operation != plc.lists.have_overlap:
+        expect = pa.array(expected, type=pa.list_(pa.float64()))
+    else:
+        expect = pa.array(expected)
+    assert_column_eq(expect, res)

From 59f65843b80d967f743841aee8489b6ae63b269a Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Wed, 24 Jul 2024 16:10:28 -0700
Subject: [PATCH 582/842] Gracefully CUDF_FAIL when `skip_rows > 0` in Chunked
 Parquet reader (#16385)

This PR must merge in cudf 24.08 to avoid unhandled expections.

Gracefully CUDF_FAIL in chunked parquet reader when `skip_rows>0` which may result in runtime exceptions like segfaults or an infinite loop. See #16186 for more information.

Authors:
  - Muhammad Haseeb (https://github.com/mhaseeb123)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Bradley Dice (https://github.com/bdice)
  - Karthikeyan (https://github.com/karthikeyann)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/16385
---
 cpp/src/io/parquet/reader.cpp               |  5 ++++
 cpp/tests/io/parquet_chunked_reader_test.cu | 29 ++++++++++++++-------
 2 files changed, 24 insertions(+), 10 deletions(-)

diff --git a/cpp/src/io/parquet/reader.cpp b/cpp/src/io/parquet/reader.cpp
index 8dfd68cd9b8..65dafb568c0 100644
--- a/cpp/src/io/parquet/reader.cpp
+++ b/cpp/src/io/parquet/reader.cpp
@@ -41,6 +41,11 @@ chunked_reader::chunked_reader(std::size_t chunk_read_limit,
                                rmm::cuda_stream_view stream,
                                rmm::device_async_resource_ref mr)
 {
+  // TODO: skip_rows not currently supported in chunked parquet reader until
+  // https://github.com/rapidsai/cudf/issues/16186 is closed
+  CUDF_EXPECTS(options.get_skip_rows() == 0,
+               "skip_rows > 0 is not currently supported in the Chunked Parquet reader.");
+
   _impl = std::make_unique<impl>(
     chunk_read_limit, pass_read_limit, std::move(sources), options, stream, mr);
 }
diff --git a/cpp/tests/io/parquet_chunked_reader_test.cu b/cpp/tests/io/parquet_chunked_reader_test.cu
index 2917852235c..66b36aeed63 100644
--- a/cpp/tests/io/parquet_chunked_reader_test.cu
+++ b/cpp/tests/io/parquet_chunked_reader_test.cu
@@ -1544,7 +1544,8 @@ TEST_F(ParquetChunkedReaderTest, TestNumRowsPerSource)
 
   // Chunked-read rows_to_read rows skipping rows_to_skip from single data source
   {
-    auto const rows_to_skip          = 1'237;
+    // TODO: rows_to_skip = 0 until https://github.com/rapidsai/cudf/issues/16186 is resolved
+    auto const rows_to_skip          = 0;  // 1'237
     auto const rows_to_read          = 7'232;
     auto constexpr output_read_limit = 1'500;
     auto constexpr pass_read_limit   = 3'500;
@@ -1571,7 +1572,8 @@ TEST_F(ParquetChunkedReaderTest, TestNumRowsPerSource)
 
   // Chunked-read two data sources skipping the first entire file completely
   {
-    auto constexpr rows_to_skip      = 15'723;
+    // TODO: rows_to_skip = 0 until https://github.com/rapidsai/cudf/issues/16186 is resolved
+    auto constexpr rows_to_skip      = 0;  // 15'723;
     auto constexpr output_read_limit = 1'024'000;
     auto constexpr pass_read_limit   = 1'024'000;
 
@@ -1588,20 +1590,25 @@ TEST_F(ParquetChunkedReaderTest, TestNumRowsPerSource)
 
     auto const [result, num_chunks, num_rows_per_source] = read_table_and_nrows_per_source(reader);
 
+    // TODO: Enable code inside /* */ when https://github.com/rapidsai/cudf/issues/16186 is resolved
     auto int64_col_selected =
-      int64s_col(int64_data.begin() + rows_to_skip - num_rows, int64_data.end()).release();
+      int64s_col(int64_data.begin() /* + rows_to_skip - num_rows */, int64_data.end()).release();
 
     cudf::table_view const expected_selected({int64_col_selected->view()});
 
-    CUDF_TEST_EXPECT_TABLES_EQUAL(expected_selected, result->view());
+    // TODO: Enable the following check when https://github.com/rapidsai/cudf/issues/16186
+    // is resolved
+    // CUDF_TEST_EXPECT_TABLES_EQUAL(expected_selected, result->view());
+
     EXPECT_EQ(num_rows_per_source.size(), 2);
-    EXPECT_EQ(num_rows_per_source[0], 0);
-    EXPECT_EQ(num_rows_per_source[1], nsources * num_rows - rows_to_skip);
+    EXPECT_EQ(num_rows_per_source[0], num_rows /* 0 */);
+    EXPECT_EQ(num_rows_per_source[1], num_rows /* nsources * num_rows - rows_to_skip */);
   }
 
   // Chunked-read from single data source skipping rows_to_skip
   {
-    auto const rows_to_skip          = 1'237;
+    // TODO: rows_to_skip = 0 until https://github.com/rapidsai/cudf/issues/16186 is resolved
+    auto const rows_to_skip          = 0;  // 1'237;
     auto constexpr output_read_limit = 1'500;
     auto constexpr pass_read_limit   = 1'800;
 
@@ -1736,7 +1743,8 @@ TEST_F(ParquetChunkedReaderTest, TestNumRowsPerSourceMultipleSources)
 
   // Chunked-read rows_to_read rows skipping rows_to_skip from eight data sources
   {
-    auto const rows_to_skip          = 25'571;
+    // TODO: rows_to_skip = 0 until https://github.com/rapidsai/cudf/issues/16186 is resolved
+    auto const rows_to_skip          = 0;  // 25'571;
     auto const rows_to_read          = 41'232;
     auto constexpr output_read_limit = 15'000;
     auto constexpr pass_read_limit   = 35'000;
@@ -1782,8 +1790,9 @@ TEST_F(ParquetChunkedReaderTest, TestNumRowsPerSourceMultipleSources)
 
   // Chunked-read four data sources skipping three files completely
   {
-    auto const nsources              = 4;
-    int constexpr rows_to_skip       = num_rows * 3 + 1;
+    auto const nsources = 4;
+    // TODO: rows_to_skip = 0 until https://github.com/rapidsai/cudf/issues/16186 is resolved
+    int constexpr rows_to_skip       = 0;  // num_rows * 3 + 1;
     auto constexpr output_read_limit = 15'000;
     auto constexpr pass_read_limit   = 35'000;
     std::vector<int64_t> int64_selected_data{};

From 29ce5c529ea9ea18edc32ab905f1ef076f266008 Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Thu, 25 Jul 2024 01:29:41 +0200
Subject: [PATCH 583/842] Fix some issues with deprecated / removed cccl
 facilities (#16377)

`cub::If` has been deprecated and should not be used. There is a better alternative in `cuda::std::conditional_t`

`thrust::{binary, unary}_function` has been deprecated and does not serve a purpose similar to the removed `std::{binary, unary}_function`

Rather than relying on the type aliases one should use the `std::invoke` machinery

Authors:
  - Michael Schellenberger Costa (https://github.com/miscco)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Nghia Truong (https://github.com/ttnghia)
  - Bernhard Manfred Gruber (https://github.com/bernhardmgruber)

URL: https://github.com/rapidsai/cudf/pull/16377
---
 cpp/benchmarks/common/generate_input.cu          | 2 +-
 cpp/include/cudf/detail/gather.cuh               | 2 +-
 cpp/src/io/fst/agent_dfa.cuh                     | 2 +-
 cpp/src/reductions/minmax.cu                     | 3 +--
 java/src/main/native/src/aggregation128_utils.cu | 2 +-
 5 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/cpp/benchmarks/common/generate_input.cu b/cpp/benchmarks/common/generate_input.cu
index 6df2cb44adc..0970003deb2 100644
--- a/cpp/benchmarks/common/generate_input.cu
+++ b/cpp/benchmarks/common/generate_input.cu
@@ -718,7 +718,7 @@ std::unique_ptr<cudf::column> create_random_column<cudf::struct_view>(data_profi
 }
 
 template <typename T>
-struct clamp_down : public thrust::unary_function<T, T> {
+struct clamp_down {
   T max;
   clamp_down(T max) : max(max) {}
   __host__ __device__ T operator()(T x) const { return min(x, max); }
diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh
index d3e9fc4974d..e8e95380815 100644
--- a/cpp/include/cudf/detail/gather.cuh
+++ b/cpp/include/cudf/detail/gather.cuh
@@ -518,7 +518,7 @@ struct column_gatherer_impl<struct_view> {
  * Positive indices are unchanged by this transformation.
  */
 template <typename map_type>
-struct index_converter : public thrust::unary_function<map_type, map_type> {
+struct index_converter {
   index_converter(size_type n_rows) : n_rows(n_rows) {}
 
   __device__ map_type operator()(map_type in) const { return ((in % n_rows) + n_rows) % n_rows; }
diff --git a/cpp/src/io/fst/agent_dfa.cuh b/cpp/src/io/fst/agent_dfa.cuh
index bc5b94e2718..0e70984b39c 100644
--- a/cpp/src/io/fst/agent_dfa.cuh
+++ b/cpp/src/io/fst/agent_dfa.cuh
@@ -791,7 +791,7 @@ __launch_bounds__(int32_t(AgentDFAPolicy::BLOCK_THREADS)) CUDF_KERNEL
     can_use_smem_cache;
 
   using DFASimulationCallbackWrapperT =
-    typename cub::If<use_smem_cache, WriteCoalescingT, NonWriteCoalescingT>::Type;
+    cuda::std::conditional_t<use_smem_cache, WriteCoalescingT, NonWriteCoalescingT>;
 
   // Stage 1: Compute the state-transition vector
   if (IS_TRANS_VECTOR_PASS || IS_SINGLE_PASS) {
diff --git a/cpp/src/reductions/minmax.cu b/cpp/src/reductions/minmax.cu
index 2c1181972c5..6cb58786971 100644
--- a/cpp/src/reductions/minmax.cu
+++ b/cpp/src/reductions/minmax.cu
@@ -107,8 +107,7 @@ rmm::device_scalar<OutputType> reduce_device(InputIterator d_in,
  * respectively of the minimums and maximums of the input pairs.
  */
 template <typename T>
-struct minmax_binary_op
-  : public thrust::binary_function<minmax_pair<T>, minmax_pair<T>, minmax_pair<T>> {
+struct minmax_binary_op {
   __device__ minmax_pair<T> operator()(minmax_pair<T> const& lhs, minmax_pair<T> const& rhs) const
   {
     return minmax_pair<T>{thrust::min(lhs.min_val, rhs.min_val),
diff --git a/java/src/main/native/src/aggregation128_utils.cu b/java/src/main/native/src/aggregation128_utils.cu
index a32e7d27085..631df58b017 100644
--- a/java/src/main/native/src/aggregation128_utils.cu
+++ b/java/src/main/native/src/aggregation128_utils.cu
@@ -34,7 +34,7 @@
 namespace {
 
 // Functor to reassemble a 128-bit value from four 64-bit chunks with overflow detection.
-class chunk_assembler : public thrust::unary_function<cudf::size_type, __int128_t> {
+class chunk_assembler {
  public:
   chunk_assembler(bool* overflows,
                   uint64_t const* chunks0,

From ae4c7e3ce4fe100eb919ca00fa34461e44078ba9 Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Wed, 24 Jul 2024 18:30:53 -0500
Subject: [PATCH 584/842] split up CUDA-suffixed dependencies in
 dependencies.yaml (#16183)

Contributes to https://github.com/rapidsai/build-planning/issues/31

Follow-up to #15245

RAPIDS DLFW builds prefer to build all RAPIDS packages together without CUDA suffixes, leading to the following set of requirements for `cudf` wheels built there:

* project name must be `cudf` (not `cudf-cu12`)
* all dependencies must be unsuffixed (e.g. `rmm` not `rmm-cu12`)
* the correct set of dependencies based on CUDA version must be expressed in the wheel metadata (e.g. `cubinlinker` and `ptxcompiler` on CUDA 11, `pynvjitlink` on CUDA 12)

To meet all 3 of those, this proposes decomposing CUDA-suffixed dependencies in `dependencies.yaml` into two lists... `cuda_suffixed="true"` and `cuda_suffixed="false"`.

That'd allow DLFW builds to do the following to meet its requirements:

```shell
pip wheel \
  -C rapidsai.disable-cuda=true \
  -C rapidsai.matrix-entry="cuda=12.5;cuda_suffixed=false" \
  .
```

Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16183
---
 ci/release/update-version.sh      |  5 ++-
 dependencies.yaml                 | 71 ++++++++++++++++++++++---------
 python/cudf/pyproject.toml        |  1 +
 python/cudf_kafka/pyproject.toml  |  1 +
 python/cudf_polars/pyproject.toml |  1 +
 python/custreamz/pyproject.toml   |  1 +
 python/dask_cudf/pyproject.toml   |  1 +
 7 files changed, 59 insertions(+), 22 deletions(-)

diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index f629de64905..ad96aff3930 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -68,15 +68,18 @@ done
 # README.md update
 sed_runner "s/version == ${CURRENT_SHORT_TAG}/version == ${NEXT_SHORT_TAG}/g" README.md
 sed_runner "s/cudf=${CURRENT_SHORT_TAG}/cudf=${NEXT_SHORT_TAG}/g" README.md
+sed_runner "s/cudf=${CURRENT_SHORT_TAG}/cudf=${NEXT_SHORT_TAG}/g" python/cudf_polars/docs/overview.md
+sed_runner "s/branch-${CURRENT_SHORT_TAG}/branch-${NEXT_SHORT_TAG}/g" python/cudf_polars/docs/overview.md
 
 # Libcudf examples update
 sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_TAG}/" cpp/examples/versions.cmake
 
 # CI files
-for FILE in .github/workflows/*.yaml; do
+for FILE in .github/workflows/*.yaml .github/workflows/*.yml; do
   sed_runner "/shared-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}"
   sed_runner "s/dask-cuda.git@branch-[^\"\s]\+/dask-cuda.git@branch-${NEXT_SHORT_TAG}/g" ${FILE};
 done
+sed_runner "s/branch-[0-9]+\.[0-9]+/branch-${NEXT_SHORT_TAG}/g" ci/test_wheel_cudf_polars.sh
 
 # Java files
 NEXT_FULL_JAVA_TAG="${NEXT_SHORT_TAG}.${PATCH_PEP440}-SNAPSHOT"
diff --git a/dependencies.yaml b/dependencies.yaml
index a19574b7658..48433d8e5c1 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -329,7 +329,7 @@ dependencies:
     common:
       - output_types: conda
         packages:
-          - &rmm_conda rmm==24.8.*,>=0.0.0a0
+          - &rmm_unsuffixed rmm==24.8.*,>=0.0.0a0
           - pip
           - pip:
               - git+https://github.com/python-streamz/streamz.git@master
@@ -343,13 +343,17 @@ dependencies:
     specific:
       - output_types: [requirements, pyproject]
         matrices:
-          - matrix: {cuda: "12.*"}
-            packages: &build_python_packages_cu12
+          - matrix:
+              cuda: "12.*"
+              cuda_suffixed: "true"
+            packages:
               - rmm-cu12==24.8.*,>=0.0.0a0
-          - matrix: {cuda: "11.*"}
-            packages: &build_python_packages_cu11
+          - matrix:
+              cuda: "11.*"
+              cuda_suffixed: "true"
+            packages:
               - rmm-cu11==24.8.*,>=0.0.0a0
-          - {matrix: null, packages: [*rmm_conda] }
+          - {matrix: null, packages: [*rmm_unsuffixed]}
   libarrow_build:
     common:
       - output_types: conda
@@ -567,7 +571,7 @@ dependencies:
           - typing_extensions>=4.0.0
       - output_types: conda
         packages:
-          - *rmm_conda
+          - *rmm_unsuffixed
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
@@ -588,23 +592,40 @@ dependencies:
         matrices:
           - matrix: {cuda: "12.*"}
             packages:
-              - pynvjitlink>=0.0.0a0
+              - &pynvjitlink_unsuffixed pynvjitlink>=0.0.0a0
           - matrix: {cuda: "11.*"}
             packages:
-              - cubinlinker
-              - ptxcompiler
+              - &cubinlinker_unsuffixed cubinlinker
+              - &ptxcompiler_unsuffixed ptxcompiler
       - output_types: [requirements, pyproject]
         matrices:
-          - matrix: {cuda: "12.*"}
+          - matrix:
+              cuda: "12.*"
+              cuda_suffixed: "true"
             packages:
               - rmm-cu12==24.8.*,>=0.0.0a0
               - pynvjitlink-cu12>=0.0.0a0
-          - matrix: {cuda: "11.*"}
+          - matrix:
+              cuda: "12.*"
+              cuda_suffixed: "false"
+            packages:
+              - *rmm_unsuffixed
+              - *pynvjitlink_unsuffixed
+          - matrix:
+              cuda: "11.*"
+              cuda_suffixed: "true"
             packages:
               - rmm-cu11==24.8.*,>=0.0.0a0
               - cubinlinker-cu11
               - ptxcompiler-cu11
-          - {matrix: null, packages: [cubinlinker, ptxcompiler, *rmm_conda]}
+          - matrix:
+              cuda: "11.*"
+              cuda_suffixed: "false"
+            packages: &run_cudf_cu11_unsuffixed
+              - *cubinlinker_unsuffixed
+              - *ptxcompiler_unsuffixed
+              - *rmm_unsuffixed
+          - {matrix: null, packages: *run_cudf_cu11_unsuffixed}
   run_cudf_polars:
     common:
       - output_types: [conda, requirements, pyproject]
@@ -706,7 +727,7 @@ dependencies:
     common:
       - output_types: conda
         packages:
-          - &cudf_conda cudf==24.8.*,>=0.0.0a0
+          - &cudf_unsuffixed cudf==24.8.*,>=0.0.0a0
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
@@ -716,18 +737,22 @@ dependencies:
     specific:
       - output_types: [requirements, pyproject]
         matrices:
-          - matrix: {cuda: "12.*"}
+          - matrix:
+              cuda: "12.*"
+              cuda_suffixed: "true"
             packages:
               - cudf-cu12==24.8.*,>=0.0.0a0
-          - matrix: {cuda: "11.*"}
+          - matrix:
+              cuda: "11.*"
+              cuda_suffixed: "true"
             packages:
               - cudf-cu11==24.8.*,>=0.0.0a0
-          - {matrix: null, packages: [*cudf_conda]}
+          - {matrix: null, packages: [*cudf_unsuffixed]}
   depends_on_cudf_kafka:
     common:
       - output_types: conda
         packages:
-          - &cudf_kafka_conda cudf_kafka==24.8.*,>=0.0.0a0
+          - &cudf_kafka_unsuffixed cudf_kafka==24.8.*,>=0.0.0a0
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
@@ -737,13 +762,17 @@ dependencies:
     specific:
       - output_types: [requirements, pyproject]
         matrices:
-          - matrix: {cuda: "12.*"}
+          - matrix:
+              cuda: "12.*"
+              cuda_suffixed: "true"
             packages:
               - cudf_kafka-cu12==24.8.*,>=0.0.0a0
-          - matrix: {cuda: "11.*"}
+          - matrix:
+              cuda: "11.*"
+              cuda_suffixed: "true"
             packages:
               - cudf_kafka-cu11==24.8.*,>=0.0.0a0
-          - {matrix: null, packages: [*cudf_kafka_conda]}
+          - {matrix: null, packages: [*cudf_kafka_unsuffixed]}
   depends_on_cupy:
     common:
       - output_types: conda
diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml
index dcb33b1fc1a..30b0f6249f9 100644
--- a/python/cudf/pyproject.toml
+++ b/python/cudf/pyproject.toml
@@ -120,6 +120,7 @@ skip = [
 [tool.rapids-build-backend]
 build-backend = "scikit_build_core.build"
 dependencies-file = "../../dependencies.yaml"
+matrix-entry = "cuda_suffixed=true"
 requires = [
     "cmake>=3.26.4,!=3.30.0",
     "cython>=3.0.3",
diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml
index badfdf06d15..eba4e808a89 100644
--- a/python/cudf_kafka/pyproject.toml
+++ b/python/cudf_kafka/pyproject.toml
@@ -100,6 +100,7 @@ regex = "(?P<value>.*)"
 [tool.rapids-build-backend]
 build-backend = "scikit_build_core.build"
 dependencies-file = "../../dependencies.yaml"
+matrix-entry = "cuda_suffixed=true"
 requires = [
     "cmake>=3.26.4,!=3.30.0",
     "cython>=3.0.3",
diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml
index 0b559f7a8e9..def1d086cc1 100644
--- a/python/cudf_polars/pyproject.toml
+++ b/python/cudf_polars/pyproject.toml
@@ -182,3 +182,4 @@ docstring-code-format = true
 [tool.rapids-build-backend]
 build-backend = "setuptools.build_meta"
 dependencies-file = "../../dependencies.yaml"
+matrix-entry = "cuda_suffixed=true"
diff --git a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml
index 7b99e041b54..70d16c4b07f 100644
--- a/python/custreamz/pyproject.toml
+++ b/python/custreamz/pyproject.toml
@@ -49,6 +49,7 @@ Homepage = "https://github.com/rapidsai/cudf"
 [tool.rapids-build-backend]
 build-backend = "setuptools.build_meta"
 dependencies-file = "../../dependencies.yaml"
+matrix-entry = "cuda_suffixed=true"
 
 [tool.setuptools]
 license-files = ["LICENSE"]
diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml
index 9b2e3a5a7b1..16e07428d6b 100644
--- a/python/dask_cudf/pyproject.toml
+++ b/python/dask_cudf/pyproject.toml
@@ -58,6 +58,7 @@ Homepage = "https://github.com/rapidsai/cudf"
 [tool.rapids-build-backend]
 build-backend = "setuptools.build_meta"
 dependencies-file = "../../dependencies.yaml"
+matrix-entry = "cuda_suffixed=true"
 
 [tool.setuptools]
 license-files = ["LICENSE"]

From a36dacb66325e03d3264482d35a5cf7e0b6c7a37 Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Thu, 25 Jul 2024 00:31:40 +0100
Subject: [PATCH 585/842] Make C++ compilation warning free after #16297
 (#16379)

In https://github.com/rapidsai/cudf/pull/16297, we deprecated the use of `to_arrow` in favour of `to_arrow_host` and `to_arrow_device`. However, the scalar detail overload of `to_arrow` used the public table overload. So we get a warning when compiling internal libcudf code. Fix this by using the detail API, and fix a bug along the way where we were not passing through the arrow memory resource.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Michael Schellenberger Costa (https://github.com/miscco)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Karthikeyan (https://github.com/karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/16379
---
 cpp/include/cudf/interop.hpp          | 13 ++++++++-----
 cpp/src/interop/to_arrow.cu           |  2 +-
 cpp/tests/interop/from_arrow_test.cpp |  9 +++++++++
 cpp/tests/interop/to_arrow_test.cpp   | 10 ++++++++++
 cpp/tests/streams/interop_test.cpp    |  9 +++++++++
 5 files changed, 37 insertions(+), 6 deletions(-)

diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp
index 61f7d72a467..73bc205a095 100644
--- a/cpp/include/cudf/interop.hpp
+++ b/cpp/include/cudf/interop.hpp
@@ -152,7 +152,7 @@ struct column_metadata {
  * 9 which is the maximum precision for 32-bit types. Similarly, numeric::decimal128 will be
  * converted to Arrow decimal128 of the precision 38.
  */
-[[deprecated]] std::shared_ptr<arrow::Table> to_arrow(
+[[deprecated("Use cudf::to_arrow_host")]] std::shared_ptr<arrow::Table> to_arrow(
   table_view input,
   std::vector<column_metadata> const& metadata = {},
   rmm::cuda_stream_view stream                 = cudf::get_default_stream(),
@@ -177,7 +177,7 @@ struct column_metadata {
  * 9 which is the maximum precision for 32-bit types. Similarly, numeric::decimal128 will be
  * converted to Arrow decimal128 of the precision 38.
  */
-[[deprecated]] std::shared_ptr<arrow::Scalar> to_arrow(
+[[deprecated("Use cudf::to_arrow_host")]] std::shared_ptr<arrow::Scalar> to_arrow(
   cudf::scalar const& input,
   column_metadata const& metadata = {},
   rmm::cuda_stream_view stream    = cudf::get_default_stream(),
@@ -395,7 +395,7 @@ unique_device_array_t to_arrow_host(
  * @param mr    Device memory resource used to allocate `cudf::table`
  * @return cudf table generated from given arrow Table
  */
-[[deprecated]] std::unique_ptr<table> from_arrow(
+[[deprecated("Use cudf::from_arrow_host")]] std::unique_ptr<table> from_arrow(
   arrow::Table const& input,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
@@ -403,14 +403,17 @@ unique_device_array_t to_arrow_host(
 /**
  * @brief Create `cudf::scalar` from given arrow Scalar input
  *
- * @deprecated Since 24.08.
+ * @deprecated Since 24.08. Use arrow's `MakeArrayFromScalar` on the
+ * input, followed by `ExportArray` to obtain something that can be
+ * consumed by `from_arrow_host`. Then use `cudf::get_element` to
+ * extract a device scalar from the column.
  *
  * @param input `arrow::Scalar` that needs to be converted to `cudf::scalar`
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr    Device memory resource used to allocate `cudf::scalar`
  * @return cudf scalar generated from given arrow Scalar
  */
-[[deprecated]] std::unique_ptr<cudf::scalar> from_arrow(
+[[deprecated("See docstring for migration strategies")]] std::unique_ptr<cudf::scalar> from_arrow(
   arrow::Scalar const& input,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu
index e89ecedc218..6b163e3441e 100644
--- a/cpp/src/interop/to_arrow.cu
+++ b/cpp/src/interop/to_arrow.cu
@@ -458,7 +458,7 @@ std::shared_ptr<arrow::Scalar> to_arrow(cudf::scalar const& input,
 {
   auto const column = cudf::make_column_from_scalar(input, 1, stream);
   cudf::table_view const tv{{column->view()}};
-  auto const arrow_table  = cudf::to_arrow(tv, {metadata}, stream);
+  auto const arrow_table  = detail::to_arrow(tv, {metadata}, stream, ar_mr);
   auto const ac           = arrow_table->column(0);
   auto const maybe_scalar = ac->GetScalar(0);
   if (!maybe_scalar.ok()) { CUDF_FAIL("Failed to produce a scalar"); }
diff --git a/cpp/tests/interop/from_arrow_test.cpp b/cpp/tests/interop/from_arrow_test.cpp
index 6eaa1a07e08..733e5814425 100644
--- a/cpp/tests/interop/from_arrow_test.cpp
+++ b/cpp/tests/interop/from_arrow_test.cpp
@@ -14,6 +14,13 @@
  * limitations under the License.
  */
 
+// These interop functions are deprecated. We keep the code in this
+// test and will migrate the tests to export the arrow C data
+// interface which we consume with from_arrow_host. For now, the tests
+// are commented out.
+
+#if 0
+
 #include <tests/interop/arrow_utils.hpp>
 
 #include <cudf_test/base_fixture.hpp>
@@ -595,3 +602,5 @@ TEST_F(FromArrowStructScalarTest, Basic)
 
   CUDF_TEST_EXPECT_TABLES_EQUAL(lhs, cudf_struct_scalar->view());
 }
+
+#endif
diff --git a/cpp/tests/interop/to_arrow_test.cpp b/cpp/tests/interop/to_arrow_test.cpp
index a1ece0ce0f1..328ba210a3f 100644
--- a/cpp/tests/interop/to_arrow_test.cpp
+++ b/cpp/tests/interop/to_arrow_test.cpp
@@ -14,6 +14,13 @@
  * limitations under the License.
  */
 
+// These interop functions are deprecated. We keep the code in this
+// test and will migrate the tests to export via the arrow C data
+// interface with to_arrow_host which arrow can consume. For now, the
+// test is commented out.
+
+#if 0
+
 #include <tests/interop/arrow_utils.hpp>
 
 #include <cudf_test/base_fixture.hpp>
@@ -196,6 +203,7 @@ TEST_F(ToArrowTest, DateTimeTable)
   std::vector<std::shared_ptr<arrow::Field>> schema_vector({arrow::field("a", arr->type())});
   auto schema = std::make_shared<arrow::Schema>(schema_vector);
 
+
   auto expected_arrow_table = arrow::Table::Make(schema, {arr});
 
   auto got_arrow_table = cudf::to_arrow(input_view, {{"a"}});
@@ -685,3 +693,5 @@ TEST_F(ToArrowStructScalarTest, Basic)
 }
 
 CUDF_TEST_PROGRAM_MAIN()
+
+#endif
diff --git a/cpp/tests/streams/interop_test.cpp b/cpp/tests/streams/interop_test.cpp
index 9e4ee5a4a93..9ba862585d0 100644
--- a/cpp/tests/streams/interop_test.cpp
+++ b/cpp/tests/streams/interop_test.cpp
@@ -14,6 +14,13 @@
  * limitations under the License.
  */
 
+// These interop functions are deprecated. We keep the code in this
+// test and will migrate the tests to export via the arrow C data
+// interface with to_arrow_host which arrow can consume. For now, the
+// test is commented out.
+
+#if 0
+
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_wrapper.hpp>
 #include <cudf_test/default_stream.hpp>
@@ -67,3 +74,5 @@ TEST_F(ArrowTest, FromArrowScalar)
   auto arrow_scalar = arrow::MakeScalar(value);
   cudf::from_arrow(*arrow_scalar, cudf::test::get_default_stream());
 }
+
+#endif

From 5a3399bec868f44d13c003f172c665919096d8e8 Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Wed, 24 Jul 2024 19:26:12 -0500
Subject: [PATCH 586/842] fix [tool.setuptools] reference in custreamz config
 (#16365)

Noticed this warning in logs from #16183

> _/python3.10/site-packages/setuptools/config/pyprojecttoml.py:70: _ToolsTypoInMetadata: Ignoring [tools.setuptools] in pyproject.toml, did you mean [tool.setuptools]?_

This fixes that.

## Notes for Reviewers

Intentionally targeting this at 24.10.

This misconfiguration has been in `custreamz` since the 23.04 release ([git blame link](https://github.com/rapidsai/cudf/blame/e6d412cba7c23df7ee500c28257ed9281cea49b9/python/custreamz/pyproject.toml#L60)).

I think the only effect might be that some test files are included in wheels when we don't want to.

I don't think the fix for it needs to be rushed into 24.08.

I searched across RAPIDS in case this was copied from somewhere else... don't see any other instances of this typo that need to be fixed.

Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16365
---
 python/custreamz/pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml
index 59ce15ac4ef..4be94aa3368 100644
--- a/python/custreamz/pyproject.toml
+++ b/python/custreamz/pyproject.toml
@@ -57,7 +57,7 @@ zip-safe = false
 [tool.setuptools.dynamic]
 version = {file = "custreamz/VERSION"}
 
-[tools.setuptools.packages.find]
+[tool.setuptools.packages.find]
 include = [
     "custreamz",
     "custreamz.*",

From a33f520b370d048a22de031294311c241ab23858 Mon Sep 17 00:00:00 2001
From: David Gardner <96306125+dagardner-nv@users.noreply.github.com>
Date: Wed, 24 Jul 2024 18:42:16 -0700
Subject: [PATCH 587/842] Fix inconsistent usage of 'results' and 'records' in
 read-json.md (#15766)

* Fix inconsistent usage of 'results' and 'records' in `docs/cudf/source/user_guide/io/read-json.md`

Authors:
  - David Gardner (https://github.com/dagardner-nv)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/15766
---
 docs/cudf/source/user_guide/io/read-json.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/cudf/source/user_guide/io/read-json.md b/docs/cudf/source/user_guide/io/read-json.md
index 7049c75d1c1..d2bb021a5b5 100644
--- a/docs/cudf/source/user_guide/io/read-json.md
+++ b/docs/cudf/source/user_guide/io/read-json.md
@@ -218,11 +218,11 @@ reads a JSON object as a single line and then extracts the
 # first read the JSON object with line=True
 >>> df = cudf.read_json(j, lines=True)
 >>> df
-             metadata                                            records
+             metadata                                            results
 0  {'vehicle': 'car'}  [{'id': 0, 'distance': 1.2}, {'id': 1, 'distan...
 
-# then explode the 'records' column
->>> df = df['records'].explode().struct.explode()
+# then explode the 'results' column
+>>> df = df['results'].explode().struct.explode()
 >>> df
    id  distance
 0   0       1.2

From 6486bb928dfb0e1817b0604572e2f5789d05c596 Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Wed, 24 Jul 2024 22:24:46 -0400
Subject: [PATCH 588/842] Migrate lists/filtering to pylibcudf (#16184)

Apart of #15162

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16184
---
 python/cudf/cudf/_lib/lists.pyx               | 46 ++-------
 .../libcudf/lists/stream_compaction.pxd       |  7 +-
 python/cudf/cudf/_lib/pylibcudf/lists.pxd     |  4 +
 python/cudf/cudf/_lib/pylibcudf/lists.pyx     | 71 ++++++++++++++
 .../cudf/cudf/pylibcudf_tests/test_lists.py   | 94 ++++++++++++++-----
 5 files changed, 158 insertions(+), 64 deletions(-)

diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx
index 50061f6e468..f6d9c8c404c 100644
--- a/python/cudf/cudf/_lib/lists.pyx
+++ b/python/cudf/cudf/_lib/lists.pyx
@@ -3,23 +3,9 @@
 from cudf.core.buffer import acquire_spill_lock
 
 from libcpp cimport bool
-from libcpp.memory cimport make_shared, shared_ptr, unique_ptr
-from libcpp.utility cimport move
 
 from cudf._lib.column cimport Column
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport (
-    lists_column_view,
-)
-from cudf._lib.pylibcudf.libcudf.lists.stream_compaction cimport (
-    distinct as cpp_distinct,
-)
-from cudf._lib.pylibcudf.libcudf.types cimport (
-    nan_equality,
-    null_equality,
-    null_order,
-    size_type,
-)
+from cudf._lib.pylibcudf.libcudf.types cimport null_order, size_type
 from cudf._lib.utils cimport columns_from_pylibcudf_table
 
 from cudf._lib import pylibcudf
@@ -47,31 +33,13 @@ def explode_outer(list source_columns, int explode_column_idx):
 
 @acquire_spill_lock()
 def distinct(Column col, bool nulls_equal, bool nans_all_equal):
-    """
-    nulls_equal == True indicates that libcudf should treat any two nulls as
-    equal, and as unequal otherwise.
-    nans_all_equal == True indicates that libcudf should treat any two
-    elements from {-nan, +nan} as equal, and as unequal otherwise.
-    """
-    cdef shared_ptr[lists_column_view] list_view = (
-        make_shared[lists_column_view](col.view())
-    )
-    cdef null_equality c_nulls_equal = (
-        null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL
-    )
-    cdef nan_equality c_nans_equal = (
-        nan_equality.ALL_EQUAL if nans_all_equal else nan_equality.UNEQUAL
-    )
-
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(
-            cpp_distinct(list_view.get()[0],
-                         c_nulls_equal,
-                         c_nans_equal)
+    return Column.from_pylibcudf(
+        pylibcudf.lists.distinct(
+            col.to_pylibcudf(mode="read"),
+            nulls_equal,
+            nans_all_equal,
         )
-    return Column.from_unique_ptr(move(c_result))
+    )
 
 
 @acquire_spill_lock()
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/stream_compaction.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/stream_compaction.pxd
index 22b91df7192..b1fcf7800b0 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/stream_compaction.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/stream_compaction.pxd
@@ -11,8 +11,13 @@ from cudf._lib.pylibcudf.libcudf.types cimport nan_equality, null_equality
 
 cdef extern from "cudf/lists/stream_compaction.hpp" \
         namespace "cudf::lists" nogil:
+    cdef unique_ptr[column] apply_boolean_mask(
+        const lists_column_view& lists_column,
+        const lists_column_view& boolean_mask,
+    ) except +
+
     cdef unique_ptr[column] distinct(
-        const lists_column_view lists_column,
+        const lists_column_view& lists_column,
         null_equality nulls_equal,
         nan_equality nans_equal
     ) except +
diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pxd b/python/cudf/cudf/_lib/pylibcudf/lists.pxd
index 4e2406c2aea..17619b489d2 100644
--- a/python/cudf/cudf/_lib/pylibcudf/lists.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/lists.pxd
@@ -47,3 +47,7 @@ cpdef Column have_overlap(Column, Column, bool nulls_equal=*, bool nans_equal=*)
 cpdef Column intersect_distinct(Column, Column, bool nulls_equal=*, bool nans_equal=*)
 
 cpdef Column union_distinct(Column, Column, bool nulls_equal=*, bool nans_equal=*)
+
+cpdef Column apply_boolean_mask(Column, Column)
+
+cpdef Column distinct(Column, bool, bool)
diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pyx b/python/cudf/cudf/_lib/pylibcudf/lists.pyx
index 7555c8c6970..c944fc35800 100644
--- a/python/cudf/cudf/_lib/pylibcudf/lists.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/lists.pyx
@@ -29,6 +29,10 @@ from cudf._lib.pylibcudf.libcudf.lists.sorting cimport (
     sort_lists as cpp_sort_lists,
     stable_sort_lists as cpp_stable_sort_lists,
 )
+from cudf._lib.pylibcudf.libcudf.lists.stream_compaction cimport (
+    apply_boolean_mask as cpp_apply_boolean_mask,
+    distinct as cpp_distinct,
+)
 from cudf._lib.pylibcudf.libcudf.table.table cimport table
 from cudf._lib.pylibcudf.libcudf.types cimport (
     nan_equality,
@@ -614,3 +618,70 @@ cpdef Column union_distinct(
             c_nans_equal,
         ))
     return Column.from_libcudf(move(c_result))
+
+
+cpdef Column apply_boolean_mask(Column input, Column boolean_mask):
+    """Filters elements in each row of the input lists column using a boolean mask
+
+    For details, see :cpp:func:`apply_boolean_mask`.
+
+    Parameters
+    ----------
+    input : Column
+        The input column.
+    boolean_mask : Column
+        The boolean mask.
+
+    Returns
+    -------
+    Column
+        A Column of filtered elements based upon the boolean mask.
+    """
+    cdef unique_ptr[column] c_result
+    cdef ListColumnView list_view = input.list_view()
+    cdef ListColumnView mask_view = boolean_mask.list_view()
+    with nogil:
+        c_result = move(cpp_apply_boolean_mask(
+            list_view.view(),
+            mask_view.view(),
+        ))
+    return Column.from_libcudf(move(c_result))
+
+
+cpdef Column distinct(Column input, bool nulls_equal, bool nans_equal):
+    """Create a new list column without duplicate elements in each list.
+
+    For details, see :cpp:func:`distinct`.
+
+    Parameters
+    ----------
+    input : Column
+        The input column.
+    nulls_equal : bool
+        If true, null elements are considered equal. Otherwise, unequal.
+    nans_equal : bool
+        If true, libcudf will treat nan elements from {-nan, +nan}
+        as equal. Otherwise, unequal. Otherwise, unequal.
+
+    Returns
+    -------
+    Column
+        A new list column without duplicate elements in each list.
+    """
+    cdef unique_ptr[column] c_result
+    cdef ListColumnView list_view = input.list_view()
+
+    cdef null_equality c_nulls_equal = (
+        null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL
+    )
+    cdef nan_equality c_nans_equal = (
+        nan_equality.ALL_EQUAL if nans_equal else nan_equality.UNEQUAL
+    )
+
+    with nogil:
+        c_result = move(cpp_distinct(
+            list_view.view(),
+            c_nulls_equal,
+            c_nans_equal,
+        ))
+    return Column.from_libcudf(move(c_result))
diff --git a/python/cudf/cudf/pylibcudf_tests/test_lists.py b/python/cudf/cudf/pylibcudf_tests/test_lists.py
index f135ab4ccff..33f95a7d364 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_lists.py
+++ b/python/cudf/cudf/pylibcudf_tests/test_lists.py
@@ -13,16 +13,26 @@ def test_data():
     return [[[[0, 1], [2], [5], [6, 7]], [[8], [9], [], [13, 14, 15]]]]
 
 
+@pytest.fixture
+def list_column():
+    return [[0, 1], [2], [5], [6, 7]]
+
+
 @pytest.fixture
 def scalar():
     return pa.scalar(1)
 
 
 @pytest.fixture
-def column():
+def search_key_column():
     return pa.array([3, 2, 5, 6]), pa.array([-1, 0, 0, 0], type=pa.int32())
 
 
+@pytest.fixture
+def bool_column():
+    return pa.array([[False, True], [True], [True], [True, True]])
+
+
 @pytest.fixture
 def set_lists_column():
     lhs = [[np.nan, np.nan, 2, 1, 2], [1, 2, 3], None, [4, None, 5]]
@@ -72,8 +82,7 @@ def test_concatenate_list_elements(test_data, dropna, expected):
     assert_column_eq(expect, res)
 
 
-def test_contains_scalar(test_data, scalar):
-    list_column = test_data[0][0]
+def test_contains_scalar(list_column, scalar):
     arr = pa.array(list_column)
 
     plc_column = plc.interop.from_arrow(arr)
@@ -85,9 +94,9 @@ def test_contains_scalar(test_data, scalar):
     assert_column_eq(expect, res)
 
 
-def test_contains_list_column(test_data):
-    list_column1 = test_data[0][0]
-    list_column2 = [1, 3, 5, 1]
+def test_contains_list_column(list_column, search_key_column):
+    list_column1 = list_column
+    list_column2, _ = search_key_column
     arr1 = pa.array(list_column1)
     arr2 = pa.array(list_column2)
 
@@ -95,7 +104,7 @@ def test_contains_list_column(test_data):
     plc_column2 = plc.interop.from_arrow(arr2)
     res = plc.lists.contains(plc_column1, plc_column2)
 
-    expect = pa.array([True, False, True, False])
+    expect = pa.array([False, True, True, True])
 
     assert_column_eq(expect, res)
 
@@ -123,8 +132,7 @@ def test_contains_nulls(list_column, expected):
     assert_column_eq(expect, res)
 
 
-def test_index_of_scalar(test_data, scalar):
-    list_column = test_data[0][0]
+def test_index_of_scalar(list_column, scalar):
     arr = pa.array(list_column)
 
     plc_column = plc.interop.from_arrow(arr)
@@ -136,21 +144,19 @@ def test_index_of_scalar(test_data, scalar):
     assert_column_eq(expect, res)
 
 
-def test_index_of_list_column(test_data, column):
-    list_column = test_data[0][0]
+def test_index_of_list_column(list_column, search_key_column):
     arr1 = pa.array(list_column)
-    arr2, expect = column
+    arr2, expect = search_key_column
     plc_column1 = plc.interop.from_arrow(arr1)
     plc_column2 = plc.interop.from_arrow(arr2)
     res = plc.lists.index_of(plc_column1, plc_column2, True)
 
-    expect = pa.array(column[1], type=pa.int32())
+    expect = pa.array(search_key_column[1], type=pa.int32())
 
     assert_column_eq(expect, res)
 
 
-def test_reverse(test_data):
-    list_column = test_data[0][0]
+def test_reverse(list_column):
     arr = pa.array(list_column)
     plc_column = plc.interop.from_arrow(arr)
 
@@ -162,8 +168,7 @@ def test_reverse(test_data):
 
 
 def test_segmented_gather(test_data):
-    list_column1 = test_data[0][0]
-    list_column2 = test_data[0][1]
+    list_column1, list_column2 = test_data[0]
 
     plc_column1 = plc.interop.from_arrow(pa.array(list_column1))
     plc_column2 = plc.interop.from_arrow(pa.array(list_column2))
@@ -175,19 +180,17 @@ def test_segmented_gather(test_data):
     assert_column_eq(expect, res)
 
 
-def test_extract_list_element_scalar(test_data):
-    arr = pa.array(test_data[0][0])
-    plc_column = plc.interop.from_arrow(arr)
+def test_extract_list_element_scalar(list_column):
+    plc_column = plc.interop.from_arrow(pa.array(list_column))
 
     res = plc.lists.extract_list_element(plc_column, 0)
-    expect = pa.compute.list_element(test_data[0][0], 0)
+    expect = pa.compute.list_element(list_column, 0)
 
     assert_column_eq(expect, res)
 
 
-def test_extract_list_element_column(test_data):
-    arr = pa.array(test_data[0][0])
-    plc_column = plc.interop.from_arrow(arr)
+def test_extract_list_element_column(list_column):
+    plc_column = plc.interop.from_arrow(pa.array(list_column))
     indices = plc.interop.from_arrow(pa.array([0, 1, -4, -1]))
 
     res = plc.lists.extract_list_element(plc_column, indices)
@@ -343,3 +346,46 @@ def test_set_operations(
     else:
         expect = pa.array(expected)
     assert_column_eq(expect, res)
+
+
+@pytest.mark.parametrize(
+    "nans_equal,nulls_equal,expected",
+    [
+        (True, True, [[np.nan, 0, 1, 2, 3], [3, 1, 2], None, [4, None, 5]]),
+        (
+            False,
+            True,
+            [[np.nan, 0, 1, 2, 3], [3, 1, 2], None, [4, None, None, 5]],
+        ),
+        (
+            True,
+            False,
+            [[np.nan, np.nan, 0, 1, 2, 3], [3, 1, 2], None, [4, None, 5]],
+        ),
+        (
+            False,
+            False,
+            [
+                [np.nan, np.nan, 0, 1, 2, 3],
+                [3, 1, 2],
+                None,
+                [4, None, None, 5],
+            ],
+        ),
+    ],
+)
+def test_distinct(list_column, nans_equal, nulls_equal, expected):
+    list_column = [
+        [np.nan, np.nan, 0, 1, 2, 3, 2],
+        [3, 1, 2],
+        None,
+        [4, None, None, 5],
+    ]
+    arr = pa.array(list_column)
+    plc_column = plc.interop.from_arrow(arr)
+
+    res = plc.lists.distinct(plc_column, nans_equal, nulls_equal)
+
+    expect = pa.array(expected)
+
+    assert_column_eq(expect, res)

From 4aefcc7b2988346166b9a757fc837e93f6f0a3bb Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Wed, 24 Jul 2024 22:30:35 -0500
Subject: [PATCH 589/842] Add ability to prefetch in `cudf.pandas` and change
 default to managed pool (#16296)

This PR adds ability to prefetch in `cudf.pandas` based off of: https://github.com/rapidsai/rmm/pull/1608/

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Mark Harris (https://github.com/harrism)

URL: https://github.com/rapidsai/cudf/pull/16296
---
 ci/cudf_pandas_scripts/pandas-tests/run.sh    |  2 +-
 python/cudf/cudf/pandas/__init__.py           | 60 +++++++++----------
 python/cudf/cudf/pandas/__main__.py           | 15 ++++-
 .../cudf/pandas/scripts/run-pandas-tests.sh   |  2 +-
 4 files changed, 44 insertions(+), 35 deletions(-)

diff --git a/ci/cudf_pandas_scripts/pandas-tests/run.sh b/ci/cudf_pandas_scripts/pandas-tests/run.sh
index abde5e5d160..48ee4a05628 100755
--- a/ci/cudf_pandas_scripts/pandas-tests/run.sh
+++ b/ci/cudf_pandas_scripts/pandas-tests/run.sh
@@ -19,7 +19,7 @@ RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${RESULTS_DIR}/test-results"}/
 mkdir -p "${RAPIDS_TESTS_DIR}"
 
 bash python/cudf/cudf/pandas/scripts/run-pandas-tests.sh \
-  -n 10 \
+  -n 5 \
   --tb=no \
   -m "not slow" \
   --max-worker-restart=3 \
diff --git a/python/cudf/cudf/pandas/__init__.py b/python/cudf/cudf/pandas/__init__.py
index ff445a63f74..bf88c950385 100644
--- a/python/cudf/cudf/pandas/__init__.py
+++ b/python/cudf/cudf/pandas/__init__.py
@@ -25,41 +25,39 @@ def install():
     global LOADED
     LOADED = loader is not None
 
-    if (rmm_mode := os.getenv("CUDF_PANDAS_RMM_MODE", None)) is not None:
-        # Check if a non-default memory resource is set
-        current_mr = rmm.mr.get_current_device_resource()
-        if not isinstance(current_mr, rmm.mr.CudaMemoryResource):
-            warnings.warn(
-                f"cudf.pandas detected an already configured memory resource, ignoring 'CUDF_PANDAS_RMM_MODE'={str(rmm_mode)}",
-                UserWarning,
-            )
-        free_memory, _ = rmm.mr.available_device_memory()
-        free_memory = int(round(float(free_memory) * 0.80 / 256) * 256)
+    rmm_mode = os.getenv("CUDF_PANDAS_RMM_MODE", "managed_pool")
+    # Check if a non-default memory resource is set
+    current_mr = rmm.mr.get_current_device_resource()
+    if not isinstance(current_mr, rmm.mr.CudaMemoryResource):
+        warnings.warn(
+            f"cudf.pandas detected an already configured memory resource, ignoring 'CUDF_PANDAS_RMM_MODE'={str(rmm_mode)}",
+            UserWarning,
+        )
+        return rmm_mode
 
-        if rmm_mode == "cuda":
-            mr = rmm.mr.CudaMemoryResource()
-            rmm.mr.set_current_device_resource(mr)
-        elif rmm_mode == "pool":
-            rmm.mr.set_current_device_resource(
-                rmm.mr.PoolMemoryResource(
-                    rmm.mr.get_current_device_resource(),
-                    initial_pool_size=free_memory,
-                )
-            )
-        elif rmm_mode == "async":
-            mr = rmm.mr.CudaAsyncMemoryResource(initial_pool_size=free_memory)
-            rmm.mr.set_current_device_resource(mr)
-        elif rmm_mode == "managed":
-            mr = rmm.mr.ManagedMemoryResource()
-            rmm.mr.set_current_device_resource(mr)
-        elif rmm_mode == "managed_pool":
-            mr = rmm.mr.PoolMemoryResource(
+    free_memory, _ = rmm.mr.available_device_memory()
+    free_memory = int(round(float(free_memory) * 0.80 / 256) * 256)
+    new_mr = current_mr
+    if rmm_mode == "pool":
+        new_mr = rmm.mr.PoolMemoryResource(
+            current_mr,
+            initial_pool_size=free_memory,
+        )
+    elif rmm_mode == "async":
+        new_mr = rmm.mr.CudaAsyncMemoryResource(initial_pool_size=free_memory)
+    elif rmm_mode == "managed":
+        new_mr = rmm.mr.PrefetchResourceAdaptor(rmm.mr.ManagedMemoryResource())
+    elif rmm_mode == "managed_pool":
+        new_mr = rmm.mr.PrefetchResourceAdaptor(
+            rmm.mr.PoolMemoryResource(
                 rmm.mr.ManagedMemoryResource(),
                 initial_pool_size=free_memory,
             )
-            rmm.mr.set_current_device_resource(mr)
-        else:
-            raise ValueError(f"Unsupported rmm mode: {rmm_mode}")
+        )
+    elif rmm_mode != "cuda":
+        raise ValueError(f"Unsupported {rmm_mode=}")
+    rmm.mr.set_current_device_resource(new_mr)
+    return rmm_mode
 
 
 def pytest_load_initial_conftests(early_config, parser, args):
diff --git a/python/cudf/cudf/pandas/__main__.py b/python/cudf/cudf/pandas/__main__.py
index fb8569fa1d0..d4cb42d4c0b 100644
--- a/python/cudf/cudf/pandas/__main__.py
+++ b/python/cudf/cudf/pandas/__main__.py
@@ -1,4 +1,4 @@
-# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES.
 # All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
@@ -72,7 +72,7 @@ def main():
 
     args = parser.parse_args()
 
-    install()
+    rmm_mode = install()
     with profile(args.profile, args.line_profile, args.args[0]) as fn:
         args.args[0] = fn
         if args.module:
@@ -86,6 +86,17 @@ def main():
             sys.argv[:] = args.args
             runpy.run_path(args.args[0], run_name="__main__")
 
+    if "managed" in rmm_mode:
+        for key in {
+            "column_view::get_data",
+            "mutable_column_view::get_data",
+            "gather",
+            "hash_join",
+        }:
+            from cudf._lib import pylibcudf
+
+            pylibcudf.experimental.enable_prefetching(key)
+
 
 if __name__ == "__main__":
     main()
diff --git a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
index a66f63c09b3..9c65b74d081 100755
--- a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
+++ b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
@@ -137,7 +137,7 @@ and not test_eof_states \
 and not test_array_tz"
 
 # TODO: Remove "not db" once a postgres & mysql container is set up on the CI
-PANDAS_CI="1" timeout 30m python -m pytest -p cudf.pandas \
+PANDAS_CI="1" timeout 60m python -m pytest -p cudf.pandas \
     -v -m "not single_cpu and not db" \
     -k "$TEST_THAT_NEED_MOTO_SERVER and $TEST_THAT_CRASH_PYTEST_WORKERS and not test_groupby_raises_category_on_category and not test_constructor_no_pandas_array and not test_is_monotonic_na and not test_index_contains and not test_index_contains and not test_frame_op_subclass_nonclass_constructor and not test_round_trip_current" \
     --import-mode=importlib \

From d953676e9281125a5b8bd9be739c997611471771 Mon Sep 17 00:00:00 2001
From: Robert Maynard <rmaynard@nvidia.com>
Date: Thu, 25 Jul 2024 04:49:12 -0400
Subject: [PATCH 590/842] Hide visibility of non public symbols (#15982)

Converts cudf over to a system of explicit markup of what symbols should be used by consumers. This is done by compiling with `-fvisibility=hidden` and explicit markup via `CUDF_EXPORT` of components we want usable.

Due to issues with tests a portion of `include/` detail functions had to be marked as public API.

More concernning are that the tests leverage functions from `cpp/` that are never part of the installed headers. That set of files can be found at https://github.com/rapidsai/cudf/commit/16b365635ab0f86bb1cc6db5f036564e8290f3b1 and we should discuss how we should restructure cudf to remove these.

Authors:
  - Robert Maynard (https://github.com/robertmaynard)
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/15982
---
 cpp/CMakeLists.txt                            |   4 +
 .../thirdparty/patches/cccl_override.json     |   5 +
 .../patches/cccl_symbol_visibility.diff       |  27 ++
 .../developer_guide/DEVELOPER_GUIDE.md        |  27 +-
 cpp/doxygen/developer_guide/DOCUMENTATION.md  |   6 +-
 cpp/include/cudf/aggregation.hpp              |   5 +-
 .../cudf/ast/detail/expression_parser.hpp     |  11 +-
 .../ast/detail/expression_transformer.hpp     |  10 +-
 cpp/include/cudf/ast/detail/operators.hpp     |   4 +-
 cpp/include/cudf/ast/expressions.hpp          |   4 +-
 cpp/include/cudf/binaryop.hpp                 |  20 +-
 cpp/include/cudf/column/column.hpp            |   4 +-
 .../cudf/column/column_device_view.cuh        |   4 +-
 cpp/include/cudf/column/column_factories.hpp  |   4 +-
 cpp/include/cudf/column/column_view.hpp       |   7 +-
 cpp/include/cudf/concatenate.hpp              |   5 +-
 cpp/include/cudf/contiguous_split.hpp         |  13 +-
 cpp/include/cudf/copying.hpp                  |   9 +-
 cpp/include/cudf/datetime.hpp                 |   5 +-
 .../cudf/detail/aggregation/aggregation.hpp   |   4 +-
 .../cudf/detail/aggregation/result_cache.hpp  |   6 +-
 cpp/include/cudf/detail/binaryop.hpp          |   5 +-
 cpp/include/cudf/detail/concatenate.hpp       |   5 +-
 cpp/include/cudf/detail/concatenate_masks.hpp |   5 +-
 cpp/include/cudf/detail/contiguous_split.hpp  |   4 +-
 cpp/include/cudf/detail/copy.hpp              |   4 +-
 cpp/include/cudf/detail/datetime.hpp          |   4 +-
 cpp/include/cudf/detail/fill.hpp              |   4 +-
 cpp/include/cudf/detail/gather.cuh            |   2 +-
 cpp/include/cudf/detail/gather.hpp            |   5 +-
 cpp/include/cudf/detail/groupby.hpp           |  13 +-
 .../detail/groupby/group_replace_nulls.hpp    |   4 +-
 .../cudf/detail/groupby/sort_helper.hpp       |  12 +-
 cpp/include/cudf/detail/interop.hpp           |   5 +-
 cpp/include/cudf/detail/is_element_valid.hpp  |   7 +-
 cpp/include/cudf/detail/join.hpp              |   6 +-
 cpp/include/cudf/detail/label_bins.hpp        |   4 +-
 cpp/include/cudf/detail/merge.hpp             |   6 +-
 cpp/include/cudf/detail/null_mask.hpp         |  12 +-
 cpp/include/cudf/detail/quantiles.hpp         |   5 +-
 cpp/include/cudf/detail/repeat.hpp            |   4 +-
 cpp/include/cudf/detail/replace.hpp           |   4 +-
 cpp/include/cudf/detail/reshape.hpp           |   4 +-
 cpp/include/cudf/detail/rolling.hpp           |   4 +-
 cpp/include/cudf/detail/round.hpp             |   4 +-
 cpp/include/cudf/detail/scan.hpp              |   7 +-
 cpp/include/cudf/detail/scatter.hpp           |   5 +-
 cpp/include/cudf/detail/search.hpp            |   8 +-
 cpp/include/cudf/detail/sequence.hpp          |   4 +-
 cpp/include/cudf/detail/sorting.hpp           |   4 +-
 cpp/include/cudf/detail/stream_compaction.hpp |   4 +-
 cpp/include/cudf/detail/structs/utilities.hpp |   8 +-
 cpp/include/cudf/detail/tdigest/tdigest.hpp   |  12 +-
 cpp/include/cudf/detail/timezone.hpp          |   7 +-
 cpp/include/cudf/detail/transform.hpp         |   5 +-
 cpp/include/cudf/detail/transpose.hpp         |   5 +-
 cpp/include/cudf/detail/unary.hpp             |   5 +-
 .../cudf/detail/utilities/alignment.hpp       |   6 +-
 .../cudf/detail/utilities/cuda_memcpy.hpp     |   8 +-
 .../cudf/detail/utilities/default_stream.hpp  |   8 +-
 .../cudf/detail/utilities/host_vector.hpp     |   7 +-
 .../cudf/detail/utilities/linked_column.hpp   |   9 +-
 .../cudf/detail/utilities/stacktrace.hpp      |  10 +-
 .../cudf/detail/utilities/stream_pool.hpp     |   7 +-
 .../detail/utilities/vector_factories.hpp     |   5 +-
 cpp/include/cudf/detail/valid_if.cuh          |   2 +-
 .../cudf/dictionary/detail/concatenate.hpp    |  10 +-
 cpp/include/cudf/dictionary/detail/encode.hpp |  10 +-
 cpp/include/cudf/dictionary/detail/merge.hpp  |  10 +-
 .../cudf/dictionary/detail/replace.hpp        |  10 +-
 cpp/include/cudf/dictionary/detail/search.hpp |   5 +-
 .../cudf/dictionary/detail/update_keys.hpp    |  10 +-
 .../dictionary/dictionary_column_view.hpp     |   6 +-
 .../cudf/dictionary/dictionary_factories.hpp  |   4 +-
 cpp/include/cudf/dictionary/encode.hpp        |   4 +-
 cpp/include/cudf/dictionary/search.hpp        |   4 +-
 cpp/include/cudf/dictionary/update_keys.hpp   |   4 +-
 cpp/include/cudf/filling.hpp                  |   5 +-
 cpp/include/cudf/fixed_point/fixed_point.hpp  |   4 +-
 .../cudf/fixed_point/floating_conversion.hpp  |   5 +-
 cpp/include/cudf/fixed_point/temporary.hpp    |   4 +-
 cpp/include/cudf/groupby.hpp                  |   5 +-
 cpp/include/cudf/hashing.hpp                  |   5 +-
 cpp/include/cudf/hashing/detail/hashing.hpp   |  10 +-
 cpp/include/cudf/interop.hpp                  |   5 +-
 cpp/include/cudf/io/arrow_io_source.hpp       |   8 +-
 cpp/include/cudf/io/avro.hpp                  |   4 +-
 cpp/include/cudf/io/csv.hpp                   |   4 +-
 cpp/include/cudf/io/data_sink.hpp             |   6 +-
 cpp/include/cudf/io/datasource.hpp            |   7 +-
 cpp/include/cudf/io/detail/avro.hpp           |  13 +-
 cpp/include/cudf/io/detail/csv.hpp            |  13 +-
 cpp/include/cudf/io/detail/json.hpp           |   7 +-
 cpp/include/cudf/io/detail/orc.hpp            |  13 +-
 cpp/include/cudf/io/detail/parquet.hpp        |  13 +-
 cpp/include/cudf/io/detail/tokenize_json.hpp  |   5 +-
 cpp/include/cudf/io/detail/utils.hpp          |  15 +-
 cpp/include/cudf/io/json.hpp                  |   4 +-
 cpp/include/cudf/io/orc.hpp                   |  14 +-
 cpp/include/cudf/io/orc_metadata.hpp          |   5 +-
 cpp/include/cudf/io/orc_types.hpp             |  10 +-
 cpp/include/cudf/io/parquet.hpp               |  18 +-
 cpp/include/cudf/io/parquet_metadata.hpp      |   5 +-
 cpp/include/cudf/io/text/byte_range_info.hpp  |   5 +-
 .../cudf/io/text/data_chunk_source.hpp        |   5 +-
 .../io/text/data_chunk_source_factories.hpp   |   9 +-
 .../cudf/io/text/detail/bgzip_utils.hpp       |   7 +-
 .../cudf/io/text/detail/multistate.hpp        |   8 +-
 .../cudf/io/text/detail/tile_state.hpp        |   6 +-
 cpp/include/cudf/io/text/detail/trie.hpp      |   5 +-
 cpp/include/cudf/io/text/multibyte_split.hpp  |   4 +-
 cpp/include/cudf/io/types.hpp                 |   8 +-
 cpp/include/cudf/join.hpp                     |  16 +-
 cpp/include/cudf/json/json.hpp                |   5 +-
 cpp/include/cudf/labeling/label_bins.hpp      |   4 +-
 cpp/include/cudf/lists/combine.hpp            |   5 +-
 cpp/include/cudf/lists/contains.hpp           |   5 +-
 cpp/include/cudf/lists/count_elements.hpp     |   5 +-
 cpp/include/cudf/lists/detail/combine.hpp     |  10 +-
 cpp/include/cudf/lists/detail/concatenate.hpp |  10 +-
 cpp/include/cudf/lists/detail/contains.hpp    |  10 +-
 cpp/include/cudf/lists/detail/copying.hpp     |  10 +-
 cpp/include/cudf/lists/detail/dremel.hpp      |   7 +-
 cpp/include/cudf/lists/detail/extract.hpp     |  10 +-
 cpp/include/cudf/lists/detail/gather.cuh      |   3 +
 .../cudf/lists/detail/interleave_columns.hpp  |  10 +-
 .../lists/detail/lists_column_factories.hpp   |  10 +-
 cpp/include/cudf/lists/detail/reverse.hpp     |   7 +-
 cpp/include/cudf/lists/detail/scatter.cuh     |  10 +-
 .../cudf/lists/detail/set_operations.hpp      |   6 +-
 cpp/include/cudf/lists/detail/sorting.hpp     |  10 +-
 .../cudf/lists/detail/stream_compaction.hpp   |   7 +-
 cpp/include/cudf/lists/explode.hpp            |   4 +-
 cpp/include/cudf/lists/extract.hpp            |   5 +-
 cpp/include/cudf/lists/filling.hpp            |   6 +-
 cpp/include/cudf/lists/gather.hpp             |   5 +-
 cpp/include/cudf/lists/list_device_view.cuh   |   4 +-
 cpp/include/cudf/lists/list_view.hpp          |   8 +-
 .../cudf/lists/lists_column_device_view.cuh   |   8 +-
 cpp/include/cudf/lists/lists_column_view.hpp  |   5 +-
 cpp/include/cudf/lists/reverse.hpp            |   7 +-
 cpp/include/cudf/lists/set_operations.hpp     |   6 +-
 cpp/include/cudf/lists/sorting.hpp            |   5 +-
 cpp/include/cudf/lists/stream_compaction.hpp  |   7 +-
 cpp/include/cudf/merge.hpp                    |   5 +-
 cpp/include/cudf/null_mask.hpp                |   5 +-
 cpp/include/cudf/partitioning.hpp             |   5 +-
 cpp/include/cudf/quantiles.hpp                |   5 +-
 cpp/include/cudf/reduction.hpp                |   5 +-
 .../cudf/reduction/detail/histogram.hpp       |   7 +-
 .../cudf/reduction/detail/reduction.hpp       |   7 +-
 .../reduction/detail/reduction_functions.hpp  |  11 +-
 .../detail/segmented_reduction_functions.hpp  |  11 +-
 cpp/include/cudf/replace.hpp                  |   5 +-
 cpp/include/cudf/reshape.hpp                  |   5 +-
 cpp/include/cudf/rolling.hpp                  |   5 +-
 .../cudf/rolling/range_window_bounds.hpp      |   5 +-
 cpp/include/cudf/round.hpp                    |   5 +-
 cpp/include/cudf/scalar/scalar.hpp            |   4 +-
 .../cudf/scalar/scalar_device_view.cuh        |   6 +-
 cpp/include/cudf/scalar/scalar_factories.hpp  |   4 +-
 cpp/include/cudf/search.hpp                   |   5 +-
 cpp/include/cudf/sorting.hpp                  |   5 +-
 cpp/include/cudf/stream_compaction.hpp        |   5 +-
 cpp/include/cudf/strings/attributes.hpp       |   4 +-
 cpp/include/cudf/strings/capitalize.hpp       |   4 +-
 cpp/include/cudf/strings/case.hpp             |   4 +-
 .../cudf/strings/char_types/char_cases.hpp    |   8 +-
 .../cudf/strings/char_types/char_types.hpp    |   4 +-
 .../strings/char_types/char_types_enum.hpp    |   6 +-
 cpp/include/cudf/strings/combine.hpp          |   4 +-
 cpp/include/cudf/strings/contains.hpp         |   4 +-
 .../cudf/strings/convert/convert_booleans.hpp |   4 +-
 .../cudf/strings/convert/convert_datetime.hpp |   4 +-
 .../strings/convert/convert_durations.hpp     |   4 +-
 .../strings/convert/convert_fixed_point.hpp   |   4 +-
 .../cudf/strings/convert/convert_floats.hpp   |   4 +-
 .../cudf/strings/convert/convert_integers.hpp |   4 +-
 .../cudf/strings/convert/convert_ipv4.hpp     |   4 +-
 .../cudf/strings/convert/convert_lists.hpp    |   4 +-
 .../cudf/strings/convert/convert_urls.hpp     |   4 +-
 .../cudf/strings/detail/char_tables.hpp       |  14 +-
 cpp/include/cudf/strings/detail/combine.hpp   |  11 +-
 .../cudf/strings/detail/concatenate.hpp       |  11 +-
 .../cudf/strings/detail/converters.hpp        |  11 +-
 .../cudf/strings/detail/copy_range.hpp        |  10 +-
 cpp/include/cudf/strings/detail/copying.hpp   |  11 +-
 cpp/include/cudf/strings/detail/fill.hpp      |  11 +-
 cpp/include/cudf/strings/detail/merge.hpp     |   7 +-
 cpp/include/cudf/strings/detail/replace.hpp   |  11 +-
 cpp/include/cudf/strings/detail/scan.hpp      |  10 +-
 cpp/include/cudf/strings/detail/utf8.hpp      |  10 +-
 cpp/include/cudf/strings/detail/utilities.hpp |  11 +-
 cpp/include/cudf/strings/extract.hpp          |   4 +-
 cpp/include/cudf/strings/find.hpp             |   4 +-
 cpp/include/cudf/strings/find_multiple.hpp    |   4 +-
 cpp/include/cudf/strings/findall.hpp          |   4 +-
 cpp/include/cudf/strings/padding.hpp          |   4 +-
 cpp/include/cudf/strings/regex/flags.hpp      |   8 +-
 .../cudf/strings/regex/regex_program.hpp      |   4 +-
 cpp/include/cudf/strings/repeat_strings.hpp   |   4 +-
 cpp/include/cudf/strings/replace.hpp          |   4 +-
 cpp/include/cudf/strings/replace_re.hpp       |   4 +-
 cpp/include/cudf/strings/reverse.hpp          |   4 +-
 cpp/include/cudf/strings/side_type.hpp        |   8 +-
 cpp/include/cudf/strings/slice.hpp            |   4 +-
 cpp/include/cudf/strings/split/partition.hpp  |   4 +-
 cpp/include/cudf/strings/split/split.hpp      |   4 +-
 cpp/include/cudf/strings/split/split_re.hpp   |   4 +-
 cpp/include/cudf/strings/string_view.cuh      |   5 +-
 cpp/include/cudf/strings/string_view.hpp      |   6 +-
 .../cudf/strings/strings_column_view.hpp      |   5 +-
 cpp/include/cudf/strings/strip.hpp            |   4 +-
 cpp/include/cudf/strings/translate.hpp        |   4 +-
 cpp/include/cudf/strings/wrap.hpp             |   4 +-
 .../cudf/structs/detail/concatenate.hpp       |  11 +-
 cpp/include/cudf/structs/detail/scan.hpp      |  11 +-
 cpp/include/cudf/structs/struct_view.hpp      |   6 +-
 .../structs/structs_column_device_view.cuh    |   6 +-
 .../cudf/structs/structs_column_view.hpp      |   6 +-
 .../cudf/table/experimental/row_operators.cuh |   4 +-
 cpp/include/cudf/table/row_operators.cuh      |   4 +-
 cpp/include/cudf/table/table.hpp              |   4 +-
 cpp/include/cudf/table/table_device_view.cuh  |   6 +-
 .../cudf/tdigest/tdigest_column_view.hpp      |   6 +-
 cpp/include/cudf/timezone.hpp                 |   6 +-
 cpp/include/cudf/transform.hpp                |   5 +-
 cpp/include/cudf/transpose.hpp                |   5 +-
 cpp/include/cudf/types.hpp                    |   6 +-
 cpp/include/cudf/unary.hpp                    |   5 +-
 cpp/include/cudf/utilities/bit.hpp            |   4 +-
 cpp/include/cudf/utilities/default_stream.hpp |   7 +-
 cpp/include/cudf/utilities/error.hpp          |   9 +-
 cpp/include/cudf/utilities/pinned_memory.hpp  |   6 +-
 cpp/include/cudf/utilities/prefetch.hpp       |   6 +-
 cpp/include/cudf/utilities/span.hpp           |   5 +-
 cpp/include/cudf/utilities/traits.cuh         |   6 +-
 cpp/include/cudf/utilities/traits.hpp         |   4 +-
 cpp/include/cudf/utilities/type_checks.hpp    |   4 +-
 .../cudf/utilities/type_dispatcher.hpp        |   6 +-
 cpp/include/cudf/wrappers/dictionary.hpp      |   4 +-
 cpp/include/cudf/wrappers/durations.hpp       |   6 +-
 cpp/include/cudf/wrappers/timestamps.hpp      |   5 +-
 cpp/include/cudf_test/base_fixture.hpp        |   5 +-
 cpp/include/cudf_test/column_utilities.hpp    |  12 +-
 cpp/include/cudf_test/column_wrapper.hpp      |   8 +-
 cpp/include/cudf_test/debug_utilities.hpp     |   9 +-
 cpp/include/cudf_test/default_stream.hpp      |   8 +-
 cpp/include/cudf_test/file_utilities.hpp      |   3 +-
 .../cudf_test/io_metadata_utilities.hpp       |   9 +-
 cpp/include/cudf_test/iterator_utilities.hpp  |   7 +-
 cpp/include/cudf_test/print_utilities.cuh     |   7 +-
 cpp/include/cudf_test/random.hpp              |   5 +-
 cpp/include/cudf_test/table_utilities.hpp     |   9 +-
 cpp/include/cudf_test/tdigest_utilities.cuh   |   7 +-
 cpp/include/cudf_test/testing_main.hpp        |   7 +-
 cpp/include/cudf_test/timestamp_utilities.cuh |   5 +-
 cpp/include/cudf_test/type_list_utilities.hpp |   8 +-
 cpp/include/cudf_test/type_lists.hpp          |   5 +-
 cpp/include/nvtext/byte_pair_encoding.hpp     |   5 +-
 cpp/include/nvtext/detail/generate_ngrams.hpp |   4 +-
 cpp/include/nvtext/detail/load_hash_file.hpp  |   4 +-
 cpp/include/nvtext/detail/tokenize.hpp        |   4 +-
 cpp/include/nvtext/edit_distance.hpp          |   5 +-
 cpp/include/nvtext/generate_ngrams.hpp        |   5 +-
 cpp/include/nvtext/jaccard.hpp                |   5 +-
 cpp/include/nvtext/minhash.hpp                |   5 +-
 cpp/include/nvtext/ngrams_tokenize.hpp        |   5 +-
 cpp/include/nvtext/normalize.hpp              |   5 +-
 cpp/include/nvtext/replace.hpp                |   5 +-
 cpp/include/nvtext/stemmer.hpp                |   5 +-
 cpp/include/nvtext/subword_tokenize.hpp       |   5 +-
 cpp/include/nvtext/tokenize.hpp               |   5 +-
 cpp/src/aggregation/aggregation.cpp           | 350 ++++++++++--------
 cpp/src/binaryop/compiled/binary_ops.cu       |   1 +
 cpp/src/bitmask/is_element_valid.cpp          |   5 +-
 cpp/src/copying/concatenate.cu                |   1 +
 cpp/src/copying/purge_nonempty_nulls.cu       |   1 +
 cpp/src/dictionary/set_keys.cu                |   1 +
 cpp/src/filling/calendrical_month_sequence.cu |   1 +
 cpp/src/io/comp/gpuinflate.hpp                |   7 +-
 cpp/src/io/functions.cpp                      |  13 +
 cpp/src/io/json/nested_json.hpp               |  18 +-
 cpp/src/io/json/read_json.hpp                 |   7 +-
 .../io/parquet/compact_protocol_reader.hpp    |   8 +-
 cpp/src/io/utilities/base64_utilities.hpp     |   8 +-
 cpp/src/io/utilities/data_casting.cu          |   4 +-
 cpp/src/io/utilities/file_io_utilities.hpp    |   5 +-
 cpp/src/io/utilities/row_selection.hpp        |   6 +-
 cpp/src/io/utilities/string_parsing.hpp       |   5 +-
 cpp/src/io/utilities/trie.cuh                 |   6 +-
 cpp/src/jit/parser.hpp                        |   6 +-
 cpp/src/lists/contains.cu                     |   1 +
 cpp/src/lists/copying/concatenate.cu          |   1 +
 cpp/src/lists/copying/segmented_gather.cu     |   1 +
 cpp/src/lists/set_operations.cu               |   1 +
 cpp/src/lists/stream_compaction/distinct.cu   |   1 +
 cpp/src/merge/merge.cu                        |   1 +
 cpp/src/partitioning/round_robin.cu           |   5 +-
 cpp/src/quantiles/quantile.cu                 |   1 +
 cpp/src/quantiles/quantiles.cu                |   1 +
 cpp/src/quantiles/tdigest/tdigest.cu          |   1 +
 cpp/src/reductions/scan/rank_scan.cu          |   1 +
 cpp/src/reductions/scan/scan_inclusive.cu     |   1 +
 cpp/src/reductions/segmented/reductions.cpp   |   2 +-
 cpp/src/reshape/interleave_columns.cu         |   1 +
 cpp/src/reshape/tile.cu                       |   1 +
 cpp/src/rolling/rolling.cu                    |   1 +
 cpp/src/scalar/scalar.cpp                     |   4 +-
 cpp/src/search/contains_column.cu             |   1 +
 cpp/src/search/contains_scalar.cu             |   2 +
 cpp/src/search/contains_table.cu              |   1 +
 cpp/src/search/search_ordered.cu              |   1 +
 cpp/src/strings/convert/convert_durations.cu  |   1 +
 cpp/src/strings/strings_scalar_factories.cpp  |   1 +
 cpp/src/strings/utilities.cu                  |   1 +
 cpp/src/transform/one_hot_encode.cu           |   1 +
 cpp/src/transform/row_bit_count.cu            |   1 +
 cpp/tests/utilities/random_seed.cpp           |   4 +-
 java/src/main/native/CMakeLists.txt           |   1 +
 java/src/main/native/src/TableJni.cpp         |   5 +-
 321 files changed, 1326 insertions(+), 956 deletions(-)
 create mode 100644 cpp/cmake/thirdparty/patches/cccl_symbol_visibility.diff

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 24b683a930b..95c509efc5b 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -711,8 +711,10 @@ set_target_properties(
              CXX_STANDARD_REQUIRED ON
              # For std:: support of __int128_t. Can be removed once using cuda::std
              CXX_EXTENSIONS ON
+             CXX_VISIBILITY_PRESET hidden
              CUDA_STANDARD 17
              CUDA_STANDARD_REQUIRED ON
+             CUDA_VISIBILITY_PRESET hidden
              POSITION_INDEPENDENT_CODE ON
              INTERFACE_POSITION_INDEPENDENT_CODE ON
 )
@@ -887,8 +889,10 @@ if(CUDF_BUILD_TESTUTIL)
                # set target compile options
                CXX_STANDARD 17
                CXX_STANDARD_REQUIRED ON
+               CXX_VISIBILITY_PRESET hidden
                CUDA_STANDARD 17
                CUDA_STANDARD_REQUIRED ON
+               CUDA_VISIBILITY_PRESET hidden
                POSITION_INDEPENDENT_CODE ON
                INTERFACE_POSITION_INDEPENDENT_CODE ON
   )
diff --git a/cpp/cmake/thirdparty/patches/cccl_override.json b/cpp/cmake/thirdparty/patches/cccl_override.json
index 2f29578f7ae..dcf9c1139f9 100644
--- a/cpp/cmake/thirdparty/patches/cccl_override.json
+++ b/cpp/cmake/thirdparty/patches/cccl_override.json
@@ -3,6 +3,11 @@
   "packages" : {
     "CCCL" : {
       "patches" : [
+        {
+          "file" : "${current_json_dir}/cccl_symbol_visibility.diff",
+          "issue" : "Correct symbol visibility issues in libcudacxx [https://github.com/NVIDIA/cccl/pull/1832/]",
+          "fixed_in" : "2.6"
+        },
         {
           "file" : "${current_json_dir}/thrust_disable_64bit_dispatching.diff",
           "issue" : "Remove 64bit dispatching as not needed by libcudf and results in compiling twice as many kernels [https://github.com/rapidsai/cudf/pull/11437]",
diff --git a/cpp/cmake/thirdparty/patches/cccl_symbol_visibility.diff b/cpp/cmake/thirdparty/patches/cccl_symbol_visibility.diff
new file mode 100644
index 00000000000..f745d5fa314
--- /dev/null
+++ b/cpp/cmake/thirdparty/patches/cccl_symbol_visibility.diff
@@ -0,0 +1,27 @@
+diff --git a/libcudacxx/include/cuda/std/detail/libcxx/include/__config b/libcudacxx/include/cuda/std/detail/libcxx/include/__config
+index e7c62c031b..5db861853a 100644
+--- a/libcudacxx/include/cuda/std/detail/libcxx/include/__config
++++ b/libcudacxx/include/cuda/std/detail/libcxx/include/__config
+@@ -1049,7 +1049,6 @@ typedef __char32_t char32_t;
+ #      define _LIBCUDACXX_EXPORTED_FROM_ABI __declspec(dllimport)
+ #    endif
+ 
+-#    define _LIBCUDACXX_TYPE_VIS      _LIBCUDACXX_DLL_VIS
+ #    define _LIBCUDACXX_FUNC_VIS      _LIBCUDACXX_DLL_VIS
+ #    define _LIBCUDACXX_EXCEPTION_ABI _LIBCUDACXX_DLL_VIS
+ #    define _LIBCUDACXX_HIDDEN
+@@ -1448,14 +1447,6 @@ __sanitizer_annotate_contiguous_container(const void*, const void*, const void*,
+ #    define _LIBCUDACXX_WEAK __attribute__((__weak__))
+ #  endif
+ 
+-// Redefine some macros for internal use
+-#  if defined(__cuda_std__)
+-#    undef _LIBCUDACXX_FUNC_VIS
+-#    define _LIBCUDACXX_FUNC_VIS _LIBCUDACXX_INLINE_VISIBILITY
+-#    undef _LIBCUDACXX_TYPE_VIS
+-#    define _LIBCUDACXX_TYPE_VIS
+-#  endif // __cuda_std__
+-
+ // Thread API
+ #  ifndef _LIBCUDACXX_HAS_THREAD_API_EXTERNAL
+ #    if defined(_CCCL_COMPILER_NVRTC) || defined(__EMSCRIPTEN__)
diff --git a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md
index 0d097541692..aa054ba93e9 100644
--- a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md
+++ b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md
@@ -52,15 +52,36 @@ header file in `cudf/cpp/include/cudf/`. For example, `cudf/cpp/include/cudf/cop
 contains the APIs for functions related to copying from one column to another. Note the `.hpp`
 file extension used to indicate a C++ header file.
 
-Header files should use the `#pragma once` include guard.
+External/public libcudf C++ API header files need to mark all symbols inside of them with `CUDF_EXPORT`.
+This is done by placing the macro on the `namespace cudf` as seen below. Markup on namespace
+require them not to be nested, so the `cudf` namespace must be kept by itself.
+
+```c++
+
+#pragma once
+
+namespace CUDF_EXPORT cudf {
+namespace lists {
+
+...
+
+
+} // namespace lists
+} // namespace CUDF_EXPORT cudf
+
+```
+
 
 The naming of external API headers should be consistent with the name of the folder that contains
 the source files that implement the API. For example, the implementation of the APIs found in
 `cudf/cpp/include/cudf/copying.hpp` are located in `cudf/src/copying`. Likewise, the unit tests for
 the APIs reside in `cudf/tests/copying/`.
 
-Internal API headers containing `detail` namespace definitions that are used across translation
-units inside libcudf should be placed in `include/cudf/detail`.
+Internal API headers containing `detail` namespace definitions that are either used across translation
+units inside libcudf should be placed in `include/cudf/detail`. Just like the public C++ API headers, any
+internal C++ API header requires `CUDF_EXPORT` markup on the `cudf` namespace so that the functions can be tested.
+
+All headers in cudf should use `#pragma once` for include guards.
 
 ## File extensions
 
diff --git a/cpp/doxygen/developer_guide/DOCUMENTATION.md b/cpp/doxygen/developer_guide/DOCUMENTATION.md
index b86f7db82b0..89376223baf 100644
--- a/cpp/doxygen/developer_guide/DOCUMENTATION.md
+++ b/cpp/doxygen/developer_guide/DOCUMENTATION.md
@@ -363,7 +363,7 @@ Here is an example of a doxygen description comment for a namespace declaration.
      *
      * This is the top-level namespace which contains all cuDF functions and types.
      */
-    namespace cudf {
+    namespace CUDF_EXPORT cudf {
 
 A description comment should be included only once for each unique namespace declaration.
 Otherwise, if more than one description is found, doxygen aggregates the descriptions in an arbitrary order in the output pages.
@@ -385,7 +385,7 @@ The existing groups have been carefully structured and named, so new groups shou
 
 When creating a new API, specify its group using the [\@ingroup](https://www.doxygen.nl/manual/commands.html#cmdingroup) tag and the group reference id from the [doxygen_groups.h](../include/doxygen_groups.h) file.
 
-    namespace cudf {
+    namespace CUDF_EXPORT cudf {
 
     /**
      * @brief ...
@@ -401,7 +401,7 @@ When creating a new API, specify its group using the [\@ingroup](https://www.dox
 
 You can also use the \@addtogroup with a `@{ ... @}` pair to automatically include doxygen comment blocks as part of a group.
 
-    namespace cudf {
+    namespace CUDF_EXPORT cudf {
     /**
      * @addtogroup transformation_fill
      * @{
diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp
index 3c1023017be..f5f514d26d9 100644
--- a/cpp/include/cudf/aggregation.hpp
+++ b/cpp/include/cudf/aggregation.hpp
@@ -17,6 +17,7 @@
 #pragma once
 
 #include <cudf/types.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <functional>
 #include <memory>
@@ -31,7 +32,7 @@
  * individual function documentation to see what aggregations are supported.
  */
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 /**
  * @addtogroup aggregation_factories
  * @{
@@ -770,4 +771,4 @@ template <typename Base>
 std::unique_ptr<Base> make_merge_tdigest_aggregation(int max_centroids = 1000);
 
 /** @} */  // end of group
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/ast/detail/expression_parser.hpp b/cpp/include/cudf/ast/detail/expression_parser.hpp
index 38f7ac5291f..da552d95421 100644
--- a/cpp/include/cudf/ast/detail/expression_parser.hpp
+++ b/cpp/include/cudf/ast/detail/expression_parser.hpp
@@ -29,9 +29,8 @@
 #include <numeric>
 #include <optional>
 
-namespace cudf {
-namespace ast {
-namespace detail {
+namespace CUDF_EXPORT cudf {
+namespace ast::detail {
 
 /**
  * @brief Node data reference types.
@@ -328,8 +327,6 @@ class expression_parser {
   std::vector<generic_scalar_device_view> _literals;
 };
 
-}  // namespace detail
+}  // namespace ast::detail
 
-}  // namespace ast
-
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/ast/detail/expression_transformer.hpp b/cpp/include/cudf/ast/detail/expression_transformer.hpp
index a6529c338e6..3af1663abf8 100644
--- a/cpp/include/cudf/ast/detail/expression_transformer.hpp
+++ b/cpp/include/cudf/ast/detail/expression_transformer.hpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,7 +18,8 @@
 
 #include <cudf/ast/expressions.hpp>
 
-namespace cudf::ast::detail {
+namespace CUDF_EXPORT cudf {
+namespace ast::detail {
 /**
  * @brief Base "visitor" pattern class with the `expression` class for expression transformer.
  *
@@ -61,4 +62,7 @@ class expression_transformer {
 
   virtual ~expression_transformer() {}
 };
-}  // namespace cudf::ast::detail
+
+}  // namespace ast::detail
+
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/ast/detail/operators.hpp b/cpp/include/cudf/ast/detail/operators.hpp
index c483d459833..46507700e21 100644
--- a/cpp/include/cudf/ast/detail/operators.hpp
+++ b/cpp/include/cudf/ast/detail/operators.hpp
@@ -29,7 +29,7 @@
 #include <utility>
 #include <vector>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 
 namespace ast {
 
@@ -1233,4 +1233,4 @@ CUDF_HOST_DEVICE inline cudf::size_type ast_operator_arity(ast_operator op)
 
 }  // namespace ast
 
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/ast/expressions.hpp b/cpp/include/cudf/ast/expressions.hpp
index 918271e3e4f..4299ee5f20f 100644
--- a/cpp/include/cudf/ast/expressions.hpp
+++ b/cpp/include/cudf/ast/expressions.hpp
@@ -23,7 +23,7 @@
 
 #include <cstdint>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace ast {
 /**
  * @addtogroup expressions
@@ -555,4 +555,4 @@ class column_name_reference : public expression {
 /** @} */  // end of group
 }  // namespace ast
 
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/binaryop.hpp b/cpp/include/cudf/binaryop.hpp
index c74c91e39c2..51199bb5792 100644
--- a/cpp/include/cudf/binaryop.hpp
+++ b/cpp/include/cudf/binaryop.hpp
@@ -18,13 +18,14 @@
 
 #include <cudf/column/column.hpp>
 #include <cudf/scalar/scalar.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
 #include <memory>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 
 /**
  * @addtogroup transformation_binaryops
@@ -316,8 +317,13 @@ std::pair<rmm::device_buffer, size_type> scalar_col_valid_mask_and(
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
-namespace compiled {
-namespace detail {
+}  // namespace binops
+
+/** @} */  // end of group
+}  // namespace CUDF_EXPORT cudf
+
+namespace CUDF_EXPORT cudf {
+namespace binops::compiled::detail {
 
 /**
  * @brief struct binary operation using `NaN` aware sorting physical element comparators
@@ -337,9 +343,5 @@ void apply_sorting_struct_binary_op(mutable_column_view& out,
                                     bool is_rhs_scalar,
                                     binary_operator op,
                                     rmm::cuda_stream_view stream);
-}  // namespace detail
-}  // namespace compiled
-}  // namespace binops
-
-/** @} */  // end of group
-}  // namespace cudf
+}  // namespace binops::compiled::detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/column/column.hpp b/cpp/include/cudf/column/column.hpp
index 22db25bdc83..5d1d74c3f28 100644
--- a/cpp/include/cudf/column/column.hpp
+++ b/cpp/include/cudf/column/column.hpp
@@ -36,7 +36,7 @@
  * @brief Class definition for cudf::column
  */
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 
 /**
  * @brief A container of nullable device data as a column of elements.
@@ -332,4 +332,4 @@ class column {
 };
 
 /** @} */  // end of group
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh
index 787e9c2c479..89fe59bfeaa 100644
--- a/cpp/include/cudf/column/column_device_view.cuh
+++ b/cpp/include/cudf/column/column_device_view.cuh
@@ -44,7 +44,7 @@
  * @brief Column device view class definitions
  */
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 
 /**
  * @brief Indicates the presence of nulls at compile-time or runtime.
@@ -1527,4 +1527,4 @@ ColumnDeviceView* child_columns_to_device_array(ColumnViewIterator child_begin,
 }
 
 }  // namespace detail
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/column/column_factories.hpp b/cpp/include/cudf/column/column_factories.hpp
index dc4700576e6..c1f295b7ea8 100644
--- a/cpp/include/cudf/column/column_factories.hpp
+++ b/cpp/include/cudf/column/column_factories.hpp
@@ -27,7 +27,7 @@
 
 #include <thrust/pair.h>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 /**
  * @addtogroup column_factories
  * @{
@@ -571,4 +571,4 @@ std::unique_ptr<column> make_dictionary_from_scalar(
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/column/column_view.hpp b/cpp/include/cudf/column/column_view.hpp
index 03352fdce13..3ef7bafe727 100644
--- a/cpp/include/cudf/column/column_view.hpp
+++ b/cpp/include/cudf/column/column_view.hpp
@@ -31,8 +31,7 @@
  * @file column_view.hpp
  * @brief column view class definitions
  */
-
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace detail {
 /**
  * @brief A non-owning, immutable view of device data as a column of elements,
@@ -296,6 +295,7 @@ class column_view_base {
                    size_type null_count,
                    size_type offset = 0);
 };
+
 }  // namespace detail
 
 /**
@@ -797,5 +797,6 @@ std::size_t shallow_hash(column_view const& input);
  * @return If `lhs` and `rhs` have equivalent shallow state
  */
 bool is_shallow_equivalent(column_view const& lhs, column_view const& rhs);
+
 }  // namespace detail
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/concatenate.hpp b/cpp/include/cudf/concatenate.hpp
index e7b55a2e6d0..0935bdf7def 100644
--- a/cpp/include/cudf/concatenate.hpp
+++ b/cpp/include/cudf/concatenate.hpp
@@ -18,6 +18,7 @@
 #include <cudf/column/column_view.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/export.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
@@ -25,7 +26,7 @@
 
 #include <memory>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 /**
  * @addtogroup copy_concatenate
  * @{
@@ -97,4 +98,4 @@ std::unique_ptr<table> concatenate(
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/contiguous_split.hpp b/cpp/include/cudf/contiguous_split.hpp
index 0d4f20d1ef2..195dac25268 100644
--- a/cpp/include/cudf/contiguous_split.hpp
+++ b/cpp/include/cudf/contiguous_split.hpp
@@ -18,13 +18,14 @@
 
 #include <cudf/table/table.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/resource_ref.hpp>
 
 #include <memory>
 #include <vector>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 
 /**
  * @addtogroup copy_split
@@ -124,8 +125,14 @@ std::vector<packed_table> contiguous_split(
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 namespace detail {
+
+/**
+ * @brief A helper struct containing the state of contiguous_split, whether the caller
+ * is using the single-pass contiguous_split or chunked_pack.
+ *
+ */
 struct contiguous_split_state;
-};
+}  // namespace detail
 
 /**
  * @brief Perform a chunked "pack" operation of the input `table_view` using a user provided
@@ -338,4 +345,4 @@ table_view unpack(packed_columns const& input);
 table_view unpack(uint8_t const* metadata, uint8_t const* gpu_data);
 
 /** @} */
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/copying.hpp b/cpp/include/cudf/copying.hpp
index b17cafb05ab..3c44ff48fdf 100644
--- a/cpp/include/cudf/copying.hpp
+++ b/cpp/include/cudf/copying.hpp
@@ -23,6 +23,7 @@
 #include <cudf/structs/structs_column_view.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
@@ -30,7 +31,7 @@
 #include <memory>
 #include <vector>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 
 /**
  * @addtogroup column_copy
@@ -913,7 +914,7 @@ bool may_have_nonempty_nulls(column_view const& input);
  *
  * @code{.pseudo}
  * auto const lists   = lists_column_wrapper<int32_t>{ {0,1}, {2,3}, {4,5} }.release();
- * cudf::detail::set_null_mask(lists->null_mask(), 1, 2, false);
+ * cudf::set_null_mask(lists->null_mask(), 1, 2, false);
  *
  * lists[1] is now null, but the lists child column still stores `{2,3}`.
  * The lists column contents will be:
@@ -929,7 +930,7 @@ bool may_have_nonempty_nulls(column_view const& input);
  *
  * @code{.pseudo}
  * auto const strings = strings_column_wrapper{ "AB", "CD", "EF" }.release();
- * cudf::detail::set_null_mask(strings->null_mask(), 1, 2, false);
+ * cudf::set_null_mask(strings->null_mask(), 1, 2, false);
  *
  * strings[1] is now null, but the strings column still stores `"CD"`.
  * The lists column contents will be:
@@ -972,4 +973,4 @@ std::unique_ptr<column> purge_nonempty_nulls(
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/datetime.hpp b/cpp/include/cudf/datetime.hpp
index 06b7d24f6cd..f7bed8bdc7e 100644
--- a/cpp/include/cudf/datetime.hpp
+++ b/cpp/include/cudf/datetime.hpp
@@ -17,6 +17,7 @@
 #pragma once
 
 #include <cudf/types.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
@@ -28,7 +29,7 @@
  * @brief DateTime column APIs.
  */
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace datetime {
 /**
  * @addtogroup datetime_extract
@@ -401,4 +402,4 @@ std::unique_ptr<cudf::column> round_datetimes(
 /** @} */  // end of group
 
 }  // namespace datetime
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/detail/aggregation/aggregation.hpp b/cpp/include/cudf/detail/aggregation/aggregation.hpp
index 843414817e3..b257eef1e9e 100644
--- a/cpp/include/cudf/detail/aggregation/aggregation.hpp
+++ b/cpp/include/cudf/detail/aggregation/aggregation.hpp
@@ -26,7 +26,7 @@
 #include <numeric>
 #include <utility>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace detail {
 
 // Visitor pattern
@@ -1674,4 +1674,4 @@ constexpr inline bool is_valid_aggregation()
 bool is_valid_aggregation(data_type source, aggregation::Kind k);
 
 }  // namespace detail
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/detail/aggregation/result_cache.hpp b/cpp/include/cudf/detail/aggregation/result_cache.hpp
index 41eec156c47..ec5a511bb7c 100644
--- a/cpp/include/cudf/detail/aggregation/result_cache.hpp
+++ b/cpp/include/cudf/detail/aggregation/result_cache.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,7 +23,7 @@
 
 #include <unordered_map>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace detail {
 struct pair_column_aggregation_equal_to {
   bool operator()(std::pair<column_view, aggregation const&> const& lhs,
@@ -66,4 +66,4 @@ class result_cache {
 };
 
 }  // namespace detail
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/detail/binaryop.hpp b/cpp/include/cudf/detail/binaryop.hpp
index de1fde8bc96..fe739327a08 100644
--- a/cpp/include/cudf/detail/binaryop.hpp
+++ b/cpp/include/cudf/detail/binaryop.hpp
@@ -17,11 +17,12 @@
 
 #include <cudf/binaryop.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 //! Inner interfaces and implementations
 namespace detail {
 
@@ -77,4 +78,4 @@ std::unique_ptr<column> binary_operation(column_view const& lhs,
                                          rmm::cuda_stream_view stream,
                                          rmm::device_async_resource_ref mr);
 }  // namespace detail
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/detail/concatenate.hpp b/cpp/include/cudf/detail/concatenate.hpp
index 3e039175542..1be269710b2 100644
--- a/cpp/include/cudf/detail/concatenate.hpp
+++ b/cpp/include/cudf/detail/concatenate.hpp
@@ -19,6 +19,7 @@
 #include <cudf/concatenate.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/export.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -26,7 +27,7 @@
 
 #include <vector>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 //! Inner interfaces and implementations
 namespace detail {
 /**
@@ -48,4 +49,4 @@ std::unique_ptr<table> concatenate(host_span<table_view const> tables_to_concat,
                                    rmm::device_async_resource_ref mr);
 
 }  // namespace detail
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/detail/concatenate_masks.hpp b/cpp/include/cudf/detail/concatenate_masks.hpp
index dd2fb471a7d..fc829361fde 100644
--- a/cpp/include/cudf/detail/concatenate_masks.hpp
+++ b/cpp/include/cudf/detail/concatenate_masks.hpp
@@ -17,6 +17,7 @@
 
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_view.hpp>
+#include <cudf/utilities/export.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -24,7 +25,7 @@
 #include <rmm/mr/device/device_memory_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 //! Inner interfaces and implementations
 namespace detail {
 
@@ -69,4 +70,4 @@ rmm::device_buffer concatenate_masks(host_span<column_view const> views,
                                      rmm::device_async_resource_ref mr);
 
 }  // namespace detail
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/detail/contiguous_split.hpp b/cpp/include/cudf/detail/contiguous_split.hpp
index 1467ed1aa67..52c51daa917 100644
--- a/cpp/include/cudf/detail/contiguous_split.hpp
+++ b/cpp/include/cudf/detail/contiguous_split.hpp
@@ -23,7 +23,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace detail {
 
 /**
@@ -125,4 +125,4 @@ std::vector<uint8_t> pack_metadata(table_view const& table,
                                    metadata_builder& builder);
 
 }  // namespace detail
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/detail/copy.hpp b/cpp/include/cudf/detail/copy.hpp
index f7430eb090d..2be432c0825 100644
--- a/cpp/include/cudf/detail/copy.hpp
+++ b/cpp/include/cudf/detail/copy.hpp
@@ -28,7 +28,7 @@
 
 #include <initializer_list>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace detail {
 /**
  * @brief Constructs a zero-copy `column_view`/`mutable_column_view` of the
@@ -280,4 +280,4 @@ std::unique_ptr<column> purge_nonempty_nulls(column_view const& input,
                                              rmm::device_async_resource_ref mr);
 
 }  // namespace detail
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/detail/datetime.hpp b/cpp/include/cudf/detail/datetime.hpp
index a93c06d4371..95469de8ae6 100644
--- a/cpp/include/cudf/detail/datetime.hpp
+++ b/cpp/include/cudf/detail/datetime.hpp
@@ -23,7 +23,7 @@
 
 #include <memory>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace datetime {
 namespace detail {
 /**
@@ -174,4 +174,4 @@ std::unique_ptr<cudf::column> extract_quarter(cudf::column_view const& column,
 
 }  // namespace detail
 }  // namespace datetime
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/detail/fill.hpp b/cpp/include/cudf/detail/fill.hpp
index 6996cda6974..82c6af8b611 100644
--- a/cpp/include/cudf/detail/fill.hpp
+++ b/cpp/include/cudf/detail/fill.hpp
@@ -25,7 +25,7 @@
 
 #include <memory>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace detail {
 
 /**
@@ -52,4 +52,4 @@ std::unique_ptr<column> fill(column_view const& input,
                              rmm::device_async_resource_ref mr);
 
 }  // namespace detail
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh
index d3e9fc4974d..073c37ccb77 100644
--- a/cpp/include/cudf/detail/gather.cuh
+++ b/cpp/include/cudf/detail/gather.cuh
@@ -571,7 +571,7 @@ void gather_bitmask(table_view const& source,
         not target[i]->nullable()) {
       auto const state =
         op == gather_bitmask_op::PASSTHROUGH ? mask_state::ALL_VALID : mask_state::UNINITIALIZED;
-      auto mask = detail::create_null_mask(target[i]->size(), state, stream, mr);
+      auto mask = cudf::create_null_mask(target[i]->size(), state, stream, mr);
       target[i]->set_null_mask(std::move(mask), 0);
     }
   }
diff --git a/cpp/include/cudf/detail/gather.hpp b/cpp/include/cudf/detail/gather.hpp
index 36824f56895..39cd43934e3 100644
--- a/cpp/include/cudf/detail/gather.hpp
+++ b/cpp/include/cudf/detail/gather.hpp
@@ -20,6 +20,7 @@
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/export.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -27,7 +28,7 @@
 
 #include <memory>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 
 namespace detail {
 
@@ -84,4 +85,4 @@ std::unique_ptr<table> gather(table_view const& source_table,
                               rmm::device_async_resource_ref mr);
 
 }  // namespace detail
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/detail/groupby.hpp b/cpp/include/cudf/detail/groupby.hpp
index 5a8c9b0a27f..36eae05ce39 100644
--- a/cpp/include/cudf/detail/groupby.hpp
+++ b/cpp/include/cudf/detail/groupby.hpp
@@ -25,10 +25,8 @@
 #include <memory>
 #include <utility>
 
-namespace cudf {
-namespace groupby {
-namespace detail {
-namespace hash {
+namespace CUDF_EXPORT cudf {
+namespace groupby::detail::hash {
 /**
  * @brief Indicates if a set of aggregation requests can be satisfied with a
  * hash-based groupby implementation.
@@ -47,8 +45,5 @@ std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby(
   null_policy include_null_keys,
   rmm::cuda_stream_view stream,
   rmm::device_async_resource_ref mr);
-}  // namespace hash
-
-}  // namespace detail
-}  // namespace groupby
-}  // namespace cudf
+}  // namespace groupby::detail::hash
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/detail/groupby/group_replace_nulls.hpp b/cpp/include/cudf/detail/groupby/group_replace_nulls.hpp
index 389c7952875..c0910b4d5ae 100644
--- a/cpp/include/cudf/detail/groupby/group_replace_nulls.hpp
+++ b/cpp/include/cudf/detail/groupby/group_replace_nulls.hpp
@@ -24,7 +24,7 @@
 
 #include <rmm/exec_policy.hpp>
 #include <rmm/resource_ref.hpp>
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace groupby {
 namespace detail {
 
@@ -45,4 +45,4 @@ std::unique_ptr<column> group_replace_nulls(cudf::column_view const& grouped_val
 
 }  // namespace detail
 }  // namespace groupby
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/detail/groupby/sort_helper.hpp b/cpp/include/cudf/detail/groupby/sort_helper.hpp
index 567efedb9b2..a411a890622 100644
--- a/cpp/include/cudf/detail/groupby/sort_helper.hpp
+++ b/cpp/include/cudf/detail/groupby/sort_helper.hpp
@@ -25,10 +25,8 @@
 #include <rmm/device_uvector.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
-namespace groupby {
-namespace detail {
-namespace sort {
+namespace CUDF_EXPORT cudf {
+namespace groupby::detail::sort {
 /**
  * @brief Helper class for computing sort-based groupby
  *
@@ -229,7 +227,5 @@ struct sort_groupby_helper {
   std::vector<null_order> _null_precedence;  ///< How to sort NULLs
 };
 
-}  // namespace sort
-}  // namespace detail
-}  // namespace groupby
-}  // namespace cudf
+}  // namespace groupby::detail::sort
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/detail/interop.hpp b/cpp/include/cudf/detail/interop.hpp
index 5b2b9b5e69d..0b9319ba663 100644
--- a/cpp/include/cudf/detail/interop.hpp
+++ b/cpp/include/cudf/detail/interop.hpp
@@ -34,12 +34,13 @@
 #include <cudf/interop.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 
 #include <string>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace detail {
 
 /**
@@ -156,4 +157,4 @@ constexpr std::size_t max_precision()
 }
 
 }  // namespace detail
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/detail/is_element_valid.hpp b/cpp/include/cudf/detail/is_element_valid.hpp
index 72a85d42eb3..4b74d12f306 100644
--- a/cpp/include/cudf/detail/is_element_valid.hpp
+++ b/cpp/include/cudf/detail/is_element_valid.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,10 +18,11 @@
 
 #include <cudf/column/column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace detail {
 
 /**
@@ -44,4 +45,4 @@ bool is_element_valid_sync(column_view const& col_view,
                            rmm::cuda_stream_view stream);
 
 }  // namespace detail
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/detail/join.hpp b/cpp/include/cudf/detail/join.hpp
index b4ec5f2cc69..ff7da4462a2 100644
--- a/cpp/include/cudf/detail/join.hpp
+++ b/cpp/include/cudf/detail/join.hpp
@@ -34,10 +34,10 @@
 
 // Forward declaration
 namespace cudf::experimental::row::equality {
-class preprocessed_table;
+class CUDF_EXPORT preprocessed_table;
 }
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace detail {
 
 constexpr int DEFAULT_JOIN_CG_SIZE = 2;
@@ -185,4 +185,4 @@ struct hash_join {
                     rmm::device_async_resource_ref mr) const;
 };
 }  // namespace detail
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/detail/label_bins.hpp b/cpp/include/cudf/detail/label_bins.hpp
index 9f6dcce448d..92a417b0132 100644
--- a/cpp/include/cudf/detail/label_bins.hpp
+++ b/cpp/include/cudf/detail/label_bins.hpp
@@ -27,7 +27,7 @@
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 
 namespace detail {
 
@@ -55,4 +55,4 @@ std::unique_ptr<column> label_bins(column_view const& input,
 
 /** @} */  // end of group
 }  // namespace detail
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/detail/merge.hpp b/cpp/include/cudf/detail/merge.hpp
index 56ac0554403..72e34b76158 100644
--- a/cpp/include/cudf/detail/merge.hpp
+++ b/cpp/include/cudf/detail/merge.hpp
@@ -16,12 +16,14 @@
 
 #pragma once
 
+#include <cudf/utilities/export.hpp>
+
 #include <rmm/device_uvector.hpp>
 #include <rmm/resource_ref.hpp>
 
 #include <thrust/pair.h>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace detail {
 
 /**
@@ -59,4 +61,4 @@ std::unique_ptr<cudf::table> merge(std::vector<table_view> const& tables_to_merg
                                    rmm::device_async_resource_ref mr);
 
 }  // namespace detail
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/detail/null_mask.hpp b/cpp/include/cudf/detail/null_mask.hpp
index 04d8d663acb..67e3617d873 100644
--- a/cpp/include/cudf/detail/null_mask.hpp
+++ b/cpp/include/cudf/detail/null_mask.hpp
@@ -25,7 +25,7 @@
 
 #include <vector>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace detail {
 
 /**
@@ -63,6 +63,7 @@ void set_null_mask(bitmask_type* bitmask,
  * @param stream CUDA stream used for device memory operations and kernel launches.
  * @return The number of non-zero bits in the specified range.
  */
+CUDF_EXPORT
 cudf::size_type count_set_bits(bitmask_type const* bitmask,
                                size_type start,
                                size_type stop,
@@ -82,6 +83,7 @@ cudf::size_type count_set_bits(bitmask_type const* bitmask,
  * @param stream CUDA stream used for device memory operations and kernel launches.
  * @return The number of zero bits in the specified range.
  */
+CUDF_EXPORT
 cudf::size_type count_unset_bits(bitmask_type const* bitmask,
                                  size_type start,
                                  size_type stop,
@@ -100,6 +102,7 @@ cudf::size_type count_unset_bits(bitmask_type const* bitmask,
  * @param[in] stream CUDA stream used for device memory operations and kernel launches.
  * @return A vector storing the number of non-zero bits in the specified ranges.
  */
+CUDF_EXPORT
 std::vector<size_type> segmented_count_set_bits(bitmask_type const* bitmask,
                                                 host_span<size_type const> indices,
                                                 rmm::cuda_stream_view stream);
@@ -117,6 +120,7 @@ std::vector<size_type> segmented_count_set_bits(bitmask_type const* bitmask,
  * @param[in] stream CUDA stream used for device memory operations and kernel launches.
  * @return A vector storing the number of zero bits in the specified ranges.
  */
+CUDF_EXPORT
 std::vector<size_type> segmented_count_unset_bits(bitmask_type const* bitmask,
                                                   host_span<size_type const> indices,
                                                   rmm::cuda_stream_view stream);
@@ -137,6 +141,7 @@ std::vector<size_type> segmented_count_unset_bits(bitmask_type const* bitmask,
  * @param[in] stream CUDA stream used for device memory operations and kernel launches.
  * @return The number of valid elements in the specified range.
  */
+CUDF_EXPORT
 cudf::size_type valid_count(bitmask_type const* bitmask,
                             size_type start,
                             size_type stop,
@@ -169,6 +174,7 @@ cudf::size_type null_count(bitmask_type const* bitmask,
  * @param[in] stream CUDA stream used for device memory operations and kernel launches.
  * @return A vector storing the number of valid elements in each specified range.
  */
+CUDF_EXPORT
 std::vector<size_type> segmented_valid_count(bitmask_type const* bitmask,
                                              host_span<size_type const> indices,
                                              rmm::cuda_stream_view stream);
@@ -189,6 +195,7 @@ std::vector<size_type> segmented_valid_count(bitmask_type const* bitmask,
  * @param[in] stream CUDA stream used for device memory operations and kernel launches.
  * @return A vector storing the number of null elements in each specified range.
  */
+CUDF_EXPORT
 std::vector<size_type> segmented_null_count(bitmask_type const* bitmask,
                                             host_span<size_type const> indices,
                                             rmm::cuda_stream_view stream);
@@ -220,6 +227,7 @@ rmm::device_buffer copy_bitmask(column_view const& view,
  *
  * @param stream CUDA stream used for device memory operations and kernel launches
  */
+CUDF_EXPORT
 std::pair<rmm::device_buffer, size_type> bitmask_and(host_span<bitmask_type const* const> masks,
                                                      host_span<size_type const> masks_begin_bits,
                                                      size_type mask_size_bits,
@@ -279,4 +287,4 @@ void set_all_valid_null_masks(column_view const& input,
 
 }  // namespace detail
 
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/detail/quantiles.hpp b/cpp/include/cudf/detail/quantiles.hpp
index 6c188d2ca68..23d5fb73ba3 100644
--- a/cpp/include/cudf/detail/quantiles.hpp
+++ b/cpp/include/cudf/detail/quantiles.hpp
@@ -18,11 +18,12 @@
 #include <cudf/quantiles.hpp>
 #include <cudf/tdigest/tdigest_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace detail {
 
 /**
@@ -64,4 +65,4 @@ std::unique_ptr<column> percentile_approx(tdigest::tdigest_column_view const& in
                                           rmm::device_async_resource_ref mr);
 
 }  // namespace detail
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/detail/repeat.hpp b/cpp/include/cudf/detail/repeat.hpp
index abb9e45a95c..e17f1b7c5fd 100644
--- a/cpp/include/cudf/detail/repeat.hpp
+++ b/cpp/include/cudf/detail/repeat.hpp
@@ -24,7 +24,7 @@
 
 #include <memory>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace detail {
 
 /**
@@ -51,4 +51,4 @@ std::unique_ptr<table> repeat(table_view const& input_table,
                               rmm::device_async_resource_ref mr);
 
 }  // namespace detail
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/detail/replace.hpp b/cpp/include/cudf/detail/replace.hpp
index 46203bdf2f0..e2bd729861b 100644
--- a/cpp/include/cudf/detail/replace.hpp
+++ b/cpp/include/cudf/detail/replace.hpp
@@ -24,7 +24,7 @@
 
 #include <memory>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace detail {
 /**
  * @copydoc cudf::replace_nulls(column_view const&, column_view const&,
@@ -102,4 +102,4 @@ std::unique_ptr<column> normalize_nans_and_zeros(column_view const& input,
                                                  rmm::device_async_resource_ref mr);
 
 }  // namespace detail
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/detail/reshape.hpp b/cpp/include/cudf/detail/reshape.hpp
index 7a1c3d6c4f0..30f8b88b116 100644
--- a/cpp/include/cudf/detail/reshape.hpp
+++ b/cpp/include/cudf/detail/reshape.hpp
@@ -24,7 +24,7 @@
 
 #include <memory>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace detail {
 /**
  * @copydoc cudf::tile
@@ -46,4 +46,4 @@ std::unique_ptr<column> interleave_columns(table_view const& input,
                                            rmm::device_async_resource_ref mr);
 
 }  // namespace detail
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/detail/rolling.hpp b/cpp/include/cudf/detail/rolling.hpp
index ea6f38c421c..5bfa5679531 100644
--- a/cpp/include/cudf/detail/rolling.hpp
+++ b/cpp/include/cudf/detail/rolling.hpp
@@ -26,7 +26,7 @@
 
 #include <memory>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace detail {
 
 /**
@@ -49,4 +49,4 @@ std::unique_ptr<column> rolling_window(column_view const& input,
                                        rmm::device_async_resource_ref mr);
 
 }  // namespace detail
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/detail/round.hpp b/cpp/include/cudf/detail/round.hpp
index 1a9c5c82c65..ba3ef1c1ce7 100644
--- a/cpp/include/cudf/detail/round.hpp
+++ b/cpp/include/cudf/detail/round.hpp
@@ -22,7 +22,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 //! Inner interfaces and implementations
 namespace detail {
 
@@ -39,4 +39,4 @@ std::unique_ptr<column> round(column_view const& input,
                               rmm::device_async_resource_ref mr);
 
 }  // namespace detail
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/detail/scan.hpp b/cpp/include/cudf/detail/scan.hpp
index 54c25d0157c..bd60309c5c3 100644
--- a/cpp/include/cudf/detail/scan.hpp
+++ b/cpp/include/cudf/detail/scan.hpp
@@ -17,11 +17,12 @@
 
 #include <cudf/column/column_view.hpp>
 #include <cudf/detail/aggregation/aggregation.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace detail {
 
 /**
@@ -73,6 +74,7 @@ std::unique_ptr<column> scan_exclusive(column_view const& input,
  * @param mr Device memory resource used to allocate the returned scalar's device memory.
  * @returns Column with scan results.
  */
+CUDF_EXPORT
 std::unique_ptr<column> scan_inclusive(column_view const& input,
                                        scan_aggregation const& agg,
                                        null_policy null_handling,
@@ -99,6 +101,7 @@ std::unique_ptr<column> inclusive_rank_scan(column_view const& order_by,
  * @param mr Device memory resource used to allocate the returned column's device memory.
  * @return rank values.
  */
+CUDF_EXPORT
 std::unique_ptr<column> inclusive_dense_rank_scan(column_view const& order_by,
                                                   rmm::cuda_stream_view stream,
                                                   rmm::device_async_resource_ref mr);
@@ -117,4 +120,4 @@ std::unique_ptr<column> inclusive_one_normalized_percent_rank_scan(
   column_view const& order_by, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr);
 
 }  // namespace detail
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/detail/scatter.hpp b/cpp/include/cudf/detail/scatter.hpp
index 95ed6af8c3c..6691ddc5c09 100644
--- a/cpp/include/cudf/detail/scatter.hpp
+++ b/cpp/include/cudf/detail/scatter.hpp
@@ -19,6 +19,7 @@
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/export.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -26,7 +27,7 @@
 
 #include <memory>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace detail {
 /**
  * @brief Scatters the rows of the source table into a copy of the target table
@@ -144,4 +145,4 @@ std::unique_ptr<table> boolean_mask_scatter(
   rmm::device_async_resource_ref mr);
 
 }  // namespace detail
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/detail/search.hpp b/cpp/include/cudf/detail/search.hpp
index e60b18f4c8d..72e2cf074bc 100644
--- a/cpp/include/cudf/detail/search.hpp
+++ b/cpp/include/cudf/detail/search.hpp
@@ -25,7 +25,9 @@
 #include <rmm/device_uvector.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf::detail {
+namespace CUDF_EXPORT cudf {
+namespace detail {
+
 /**
  * @copydoc cudf::lower_bound
  *
@@ -92,6 +94,7 @@ std::unique_ptr<column> contains(column_view const& haystack,
  * @param mr Device memory resource used to allocate the returned vector
  * @return A vector of bools indicating if each row in `needles` has matching rows in `haystack`
  */
+CUDF_EXPORT
 rmm::device_uvector<bool> contains(table_view const& haystack,
                                    table_view const& needles,
                                    null_equality compare_nulls,
@@ -99,4 +102,5 @@ rmm::device_uvector<bool> contains(table_view const& haystack,
                                    rmm::cuda_stream_view stream,
                                    rmm::device_async_resource_ref mr);
 
-}  // namespace cudf::detail
+}  // namespace detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/detail/sequence.hpp b/cpp/include/cudf/detail/sequence.hpp
index a18a9d3b200..a08010a610f 100644
--- a/cpp/include/cudf/detail/sequence.hpp
+++ b/cpp/include/cudf/detail/sequence.hpp
@@ -23,7 +23,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace detail {
 /**
  * @copydoc cudf::sequence(size_type size, scalar const& init, scalar const& step,
@@ -65,4 +65,4 @@ std::unique_ptr<cudf::column> calendrical_month_sequence(size_type size,
                                                          rmm::device_async_resource_ref mr);
 
 }  // namespace detail
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/detail/sorting.hpp b/cpp/include/cudf/detail/sorting.hpp
index 4ddba38a7e9..08cf329f199 100644
--- a/cpp/include/cudf/detail/sorting.hpp
+++ b/cpp/include/cudf/detail/sorting.hpp
@@ -26,7 +26,7 @@
 #include <memory>
 #include <vector>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace detail {
 
 /**
@@ -163,4 +163,4 @@ std::unique_ptr<table> stable_sort(table_view const& values,
                                    rmm::device_async_resource_ref mr);
 
 }  // namespace detail
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/detail/stream_compaction.hpp b/cpp/include/cudf/detail/stream_compaction.hpp
index e3ef4190fd2..05194148a70 100644
--- a/cpp/include/cudf/detail/stream_compaction.hpp
+++ b/cpp/include/cudf/detail/stream_compaction.hpp
@@ -25,7 +25,7 @@
 #include <rmm/device_uvector.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace detail {
 /**
  * @copydoc cudf::drop_nulls(table_view const&, std::vector<size_type> const&,
@@ -148,4 +148,4 @@ cudf::size_type distinct_count(table_view const& input,
                                rmm::cuda_stream_view stream);
 
 }  // namespace detail
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/detail/structs/utilities.hpp b/cpp/include/cudf/detail/structs/utilities.hpp
index beedc009c84..7de68035b19 100644
--- a/cpp/include/cudf/detail/structs/utilities.hpp
+++ b/cpp/include/cudf/detail/structs/utilities.hpp
@@ -25,9 +25,8 @@
 #include <rmm/device_buffer.hpp>
 #include <rmm/resource_ref.hpp>
 
-#include <utility>
-
-namespace cudf::structs::detail {
+namespace CUDF_EXPORT cudf {
+namespace structs::detail {
 
 enum class column_nullability {
   MATCH_INCOMING,  ///< generate a null column if the incoming column has nulls
@@ -268,4 +267,5 @@ class flattened_table {
  */
 bool contains_null_structs(column_view const& col);
 
-}  // namespace cudf::structs::detail
+}  // namespace structs::detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/detail/tdigest/tdigest.hpp b/cpp/include/cudf/detail/tdigest/tdigest.hpp
index bfd12c18fff..10eb3d389c7 100644
--- a/cpp/include/cudf/detail/tdigest/tdigest.hpp
+++ b/cpp/include/cudf/detail/tdigest/tdigest.hpp
@@ -18,14 +18,14 @@
 
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/export.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
-namespace tdigest {
-namespace detail {
+namespace CUDF_EXPORT cudf {
+namespace tdigest::detail {
 
 /**
  * @brief Generate a tdigest column from a grouped, sorted set of numeric input values.
@@ -152,6 +152,7 @@ std::unique_ptr<column> make_tdigest_column(size_type num_rows,
  *
  * @returns An empty tdigest column.
  */
+CUDF_EXPORT
 std::unique_ptr<column> make_empty_tdigest_column(rmm::cuda_stream_view stream,
                                                   rmm::device_async_resource_ref mr);
 
@@ -236,6 +237,5 @@ std::unique_ptr<scalar> reduce_merge_tdigest(column_view const& input,
                                              rmm::cuda_stream_view stream,
                                              rmm::device_async_resource_ref mr);
 
-}  // namespace detail
-}  // namespace tdigest
-}  // namespace cudf
+}  // namespace tdigest::detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/detail/timezone.hpp b/cpp/include/cudf/detail/timezone.hpp
index 037164aa297..c7798ff60ed 100644
--- a/cpp/include/cudf/detail/timezone.hpp
+++ b/cpp/include/cudf/detail/timezone.hpp
@@ -16,11 +16,13 @@
 #pragma once
 
 #include <cudf/timezone.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf::detail {
+namespace CUDF_EXPORT cudf {
+namespace detail {
 
 /**
  * @copydoc cudf::make_timezone_transition_table(std::optional<std::string_view>, std::string_view,
@@ -34,4 +36,5 @@ std::unique_ptr<table> make_timezone_transition_table(
   rmm::cuda_stream_view stream,
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
-}  // namespace cudf::detail
+}  // namespace detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/detail/transform.hpp b/cpp/include/cudf/detail/transform.hpp
index 47e13fa2e5e..02849ef023c 100644
--- a/cpp/include/cudf/detail/transform.hpp
+++ b/cpp/include/cudf/detail/transform.hpp
@@ -19,11 +19,12 @@
 #include <cudf/ast/expressions.hpp>
 #include <cudf/transform.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace detail {
 /**
  * @copydoc cudf::transform
@@ -112,4 +113,4 @@ std::unique_ptr<column> segmented_row_bit_count(table_view const& t,
                                                 rmm::device_async_resource_ref mr);
 
 }  // namespace detail
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/detail/transpose.hpp b/cpp/include/cudf/detail/transpose.hpp
index 1f8effc8103..559b2c32996 100644
--- a/cpp/include/cudf/detail/transpose.hpp
+++ b/cpp/include/cudf/detail/transpose.hpp
@@ -18,11 +18,12 @@
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace detail {
 /**
  * @copydoc cudf::transpose
@@ -34,4 +35,4 @@ std::pair<std::unique_ptr<column>, table_view> transpose(table_view const& input
                                                          rmm::device_async_resource_ref mr);
 
 }  // namespace detail
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/detail/unary.hpp b/cpp/include/cudf/detail/unary.hpp
index 5245cfdf079..bb05138bc8c 100644
--- a/cpp/include/cudf/detail/unary.hpp
+++ b/cpp/include/cudf/detail/unary.hpp
@@ -19,6 +19,7 @@
 #include <cudf/column/column_factories.hpp>
 #include <cudf/unary.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
@@ -26,7 +27,7 @@
 
 #include <thrust/transform.h>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace detail {
 /**
  * @brief Creates a column of `type_id::BOOL8` elements by applying a predicate to every element
@@ -101,4 +102,4 @@ std::unique_ptr<column> is_not_nan(cudf::column_view const& input,
                                    rmm::device_async_resource_ref mr);
 
 }  // namespace detail
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/detail/utilities/alignment.hpp b/cpp/include/cudf/detail/utilities/alignment.hpp
index e52032fe104..2677eca34db 100644
--- a/cpp/include/cudf/detail/utilities/alignment.hpp
+++ b/cpp/include/cudf/detail/utilities/alignment.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,7 +18,7 @@
 
 #include <memory>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace detail {
 
 /**
@@ -43,4 +43,4 @@ T* align_ptr_for_type(void* destination)
 }
 
 }  // namespace detail
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp
index b66c461ab12..632d5a732ec 100644
--- a/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp
+++ b/cpp/include/cudf/detail/utilities/cuda_memcpy.hpp
@@ -16,9 +16,12 @@
 
 #pragma once
 
+#include <cudf/utilities/export.hpp>
+
 #include <rmm/cuda_stream_view.hpp>
 
-namespace cudf::detail {
+namespace CUDF_EXPORT cudf {
+namespace detail {
 
 enum class host_memory_kind : uint8_t { PINNED, PAGEABLE };
 
@@ -50,4 +53,5 @@ void cuda_memcpy_async(
 void cuda_memcpy(
   void* dst, void const* src, size_t size, host_memory_kind kind, rmm::cuda_stream_view stream);
 
-}  // namespace cudf::detail
+}  // namespace detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/detail/utilities/default_stream.hpp b/cpp/include/cudf/detail/utilities/default_stream.hpp
index fa438f142b7..f988355e6e0 100644
--- a/cpp/include/cudf/detail/utilities/default_stream.hpp
+++ b/cpp/include/cudf/detail/utilities/default_stream.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,10 +16,12 @@
 
 #pragma once
 
+#include <cudf/utilities/export.hpp>
+
 #include <rmm/cuda_stream.hpp>
 #include <rmm/cuda_stream_view.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 
 namespace detail {
 
@@ -33,4 +35,4 @@ extern rmm::cuda_stream_view const default_stream_value;
 
 }  // namespace detail
 
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/detail/utilities/host_vector.hpp b/cpp/include/cudf/detail/utilities/host_vector.hpp
index f4e5f718da4..d4dd7b0d626 100644
--- a/cpp/include/cudf/detail/utilities/host_vector.hpp
+++ b/cpp/include/cudf/detail/utilities/host_vector.hpp
@@ -18,6 +18,7 @@
 
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/aligned.hpp>
 #include <rmm/resource_ref.hpp>
@@ -28,7 +29,8 @@
 #include <limits>
 #include <new>  // for bad_alloc
 
-namespace cudf::detail {
+namespace CUDF_EXPORT cudf {
+namespace detail {
 
 /*! \p rmm_host_allocator is a CUDA-specific host memory allocator
  *  that employs \c a `rmm::host_async_resource_ref` for allocation.
@@ -202,4 +204,5 @@ class host_vector : public thrust::host_vector<T, rmm_host_allocator<T>> {
   host_vector(size_t size, rmm_host_allocator<T> const& alloc) : base(size, alloc) {}
 };
 
-}  // namespace cudf::detail
+}  // namespace detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/detail/utilities/linked_column.hpp b/cpp/include/cudf/detail/utilities/linked_column.hpp
index 0feef0f1a44..0b388938754 100644
--- a/cpp/include/cudf/detail/utilities/linked_column.hpp
+++ b/cpp/include/cudf/detail/utilities/linked_column.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,11 +18,13 @@
 
 #include <cudf/column/column_view.hpp>
 #include <cudf/table/table_view.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <memory>
 #include <vector>
 
-namespace cudf::detail {
+namespace CUDF_EXPORT cudf {
+namespace detail {
 
 struct linked_column_view;
 
@@ -68,4 +70,5 @@ struct linked_column_view : public column_view_base {
  */
 LinkedColVector table_to_linked_columns(table_view const& table);
 
-}  // namespace cudf::detail
+}  // namespace detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/detail/utilities/stacktrace.hpp b/cpp/include/cudf/detail/utilities/stacktrace.hpp
index c3ec9ce7a52..f54f5f3579a 100644
--- a/cpp/include/cudf/detail/utilities/stacktrace.hpp
+++ b/cpp/include/cudf/detail/utilities/stacktrace.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,9 +16,12 @@
 
 #pragma once
 
+#include <cudf/utilities/export.hpp>
+
 #include <string>
 
-namespace cudf::detail {
+namespace CUDF_EXPORT cudf {
+namespace detail {
 /**
  * @addtogroup utility_stacktrace
  * @{
@@ -44,4 +47,5 @@ std::string get_stacktrace(capture_last_stackframe capture_last_frame);
 
 /** @} */  // end of group
 
-}  // namespace cudf::detail
+}  // namespace detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/detail/utilities/stream_pool.hpp b/cpp/include/cudf/detail/utilities/stream_pool.hpp
index 64c1d4ae514..dfe028bc5b7 100644
--- a/cpp/include/cudf/detail/utilities/stream_pool.hpp
+++ b/cpp/include/cudf/detail/utilities/stream_pool.hpp
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include <cudf/utilities/export.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -23,7 +24,8 @@
 #include <cstddef>
 #include <vector>
 
-namespace cudf::detail {
+namespace CUDF_EXPORT cudf {
+namespace detail {
 
 class cuda_stream_pool {
  public:
@@ -122,4 +124,5 @@ cuda_stream_pool& global_cuda_stream_pool();
  */
 void join_streams(host_span<rmm::cuda_stream_view const> streams, rmm::cuda_stream_view stream);
 
-}  // namespace cudf::detail
+}  // namespace detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp
index 45dc839c9bd..a9d91cdeee1 100644
--- a/cpp/include/cudf/detail/utilities/vector_factories.hpp
+++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp
@@ -26,6 +26,7 @@
 #include <cudf/detail/utilities/host_vector.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/export.hpp>
 #include <cudf/utilities/pinned_memory.hpp>
 #include <cudf/utilities/span.hpp>
 
@@ -36,7 +37,7 @@
 
 #include <vector>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace detail {
 
 /**
@@ -515,4 +516,4 @@ host_vector<T> make_pinned_vector_sync(size_t size, rmm::cuda_stream_view stream
 
 }  // namespace detail
 
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/detail/valid_if.cuh b/cpp/include/cudf/detail/valid_if.cuh
index 64a3c4edf78..56a2c76b741 100644
--- a/cpp/include/cudf/detail/valid_if.cuh
+++ b/cpp/include/cudf/detail/valid_if.cuh
@@ -97,7 +97,7 @@ std::pair<rmm::device_buffer, size_type> valid_if(InputIterator begin,
 
   size_type size = thrust::distance(begin, end);
 
-  auto null_mask = detail::create_null_mask(size, mask_state::UNINITIALIZED, stream, mr);
+  auto null_mask = cudf::create_null_mask(size, mask_state::UNINITIALIZED, stream, mr);
 
   size_type null_count{0};
   if (size > 0) {
diff --git a/cpp/include/cudf/dictionary/detail/concatenate.hpp b/cpp/include/cudf/dictionary/detail/concatenate.hpp
index 55f3825b3ec..0eb17aa06f4 100644
--- a/cpp/include/cudf/dictionary/detail/concatenate.hpp
+++ b/cpp/include/cudf/dictionary/detail/concatenate.hpp
@@ -23,9 +23,8 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
-namespace dictionary {
-namespace detail {
+namespace CUDF_EXPORT cudf {
+namespace dictionary::detail {
 /**
  * @brief Returns a single column by vertically concatenating the given vector of
  * dictionary columns.
@@ -42,6 +41,5 @@ std::unique_ptr<column> concatenate(host_span<column_view const> columns,
                                     rmm::cuda_stream_view stream,
                                     rmm::device_async_resource_ref mr);
 
-}  // namespace detail
-}  // namespace dictionary
-}  // namespace cudf
+}  // namespace dictionary::detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/dictionary/detail/encode.hpp b/cpp/include/cudf/dictionary/detail/encode.hpp
index 3b5a3bbab56..cc7ffbd397f 100644
--- a/cpp/include/cudf/dictionary/detail/encode.hpp
+++ b/cpp/include/cudf/dictionary/detail/encode.hpp
@@ -23,9 +23,8 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
-namespace dictionary {
-namespace detail {
+namespace CUDF_EXPORT cudf {
+namespace dictionary::detail {
 /**
  * @brief Construct a dictionary column by dictionary encoding an existing column.
  *
@@ -84,6 +83,5 @@ std::unique_ptr<column> decode(dictionary_column_view const& dictionary_column,
  */
 data_type get_indices_type_for_size(size_type keys_size);
 
-}  // namespace detail
-}  // namespace dictionary
-}  // namespace cudf
+}  // namespace dictionary::detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/dictionary/detail/merge.hpp b/cpp/include/cudf/dictionary/detail/merge.hpp
index c4229690ff5..a1777d412fe 100644
--- a/cpp/include/cudf/dictionary/detail/merge.hpp
+++ b/cpp/include/cudf/dictionary/detail/merge.hpp
@@ -22,9 +22,8 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
-namespace dictionary {
-namespace detail {
+namespace CUDF_EXPORT cudf {
+namespace dictionary::detail {
 
 /**
  * @brief Merges two dictionary columns.
@@ -47,6 +46,5 @@ std::unique_ptr<column> merge(dictionary_column_view const& lcol,
                               rmm::cuda_stream_view stream,
                               rmm::device_async_resource_ref mr);
 
-}  // namespace detail
-}  // namespace dictionary
-}  // namespace cudf
+}  // namespace dictionary::detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/dictionary/detail/replace.hpp b/cpp/include/cudf/dictionary/detail/replace.hpp
index 81a91d57169..1e1ee182fc5 100644
--- a/cpp/include/cudf/dictionary/detail/replace.hpp
+++ b/cpp/include/cudf/dictionary/detail/replace.hpp
@@ -23,9 +23,8 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
-namespace dictionary {
-namespace detail {
+namespace CUDF_EXPORT cudf {
+namespace dictionary::detail {
 
 /**
  * @brief Create a new dictionary column by replacing nulls with values
@@ -62,6 +61,5 @@ std::unique_ptr<column> replace_nulls(dictionary_column_view const& input,
                                       rmm::cuda_stream_view stream,
                                       rmm::device_async_resource_ref mr);
 
-}  // namespace detail
-}  // namespace dictionary
-}  // namespace cudf
+}  // namespace dictionary::detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/dictionary/detail/search.hpp b/cpp/include/cudf/dictionary/detail/search.hpp
index 2563b96b214..921acc258a9 100644
--- a/cpp/include/cudf/dictionary/detail/search.hpp
+++ b/cpp/include/cudf/dictionary/detail/search.hpp
@@ -18,11 +18,12 @@
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace dictionary {
 namespace detail {
 
@@ -63,4 +64,4 @@ std::unique_ptr<scalar> get_insert_index(dictionary_column_view const& dictionar
 
 }  // namespace detail
 }  // namespace dictionary
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/dictionary/detail/update_keys.hpp b/cpp/include/cudf/dictionary/detail/update_keys.hpp
index 9cdda773dbb..9eb812eb8ee 100644
--- a/cpp/include/cudf/dictionary/detail/update_keys.hpp
+++ b/cpp/include/cudf/dictionary/detail/update_keys.hpp
@@ -24,9 +24,8 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
-namespace dictionary {
-namespace detail {
+namespace CUDF_EXPORT cudf {
+namespace dictionary::detail {
 /**
  * @copydoc cudf::dictionary::add_keys(dictionary_column_view const&,column_view
  * const&,rmm::device_async_resource_ref)
@@ -103,6 +102,5 @@ std::vector<std::unique_ptr<column>> match_dictionaries(
 std::pair<std::vector<std::unique_ptr<column>>, std::vector<table_view>> match_dictionaries(
   std::vector<table_view> tables, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr);
 
-}  // namespace detail
-}  // namespace dictionary
-}  // namespace cudf
+}  // namespace dictionary::detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/dictionary/dictionary_column_view.hpp b/cpp/include/cudf/dictionary/dictionary_column_view.hpp
index 9f2bc90c0b2..dc822fee38b 100644
--- a/cpp/include/cudf/dictionary/dictionary_column_view.hpp
+++ b/cpp/include/cudf/dictionary/dictionary_column_view.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,7 +23,7 @@
  * @brief Class definition for cudf::dictionary_column_view
  */
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 /**
  * @addtogroup dictionary_classes
  * @{
@@ -124,4 +124,4 @@ class dictionary_column_view : private column_view {
 namespace dictionary {  // defined here for doxygen output
 }
 
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/dictionary/dictionary_factories.hpp b/cpp/include/cudf/dictionary/dictionary_factories.hpp
index 21f593e1aec..2f663c4af61 100644
--- a/cpp/include/cudf/dictionary/dictionary_factories.hpp
+++ b/cpp/include/cudf/dictionary/dictionary_factories.hpp
@@ -23,7 +23,7 @@
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 /**
  * @addtogroup column_factories Factories
  * @{
@@ -127,4 +127,4 @@ std::unique_ptr<column> make_dictionary_column(
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/dictionary/encode.hpp b/cpp/include/cudf/dictionary/encode.hpp
index 768e2be2b0d..9e68c947793 100644
--- a/cpp/include/cudf/dictionary/encode.hpp
+++ b/cpp/include/cudf/dictionary/encode.hpp
@@ -22,7 +22,7 @@
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace dictionary {
 /**
  * @addtogroup dictionary_encode
@@ -86,4 +86,4 @@ std::unique_ptr<column> decode(
 
 /** @} */  // end of group
 }  // namespace dictionary
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/dictionary/search.hpp b/cpp/include/cudf/dictionary/search.hpp
index 1dff6dc1d5d..66275de33e9 100644
--- a/cpp/include/cudf/dictionary/search.hpp
+++ b/cpp/include/cudf/dictionary/search.hpp
@@ -21,7 +21,7 @@
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace dictionary {
 /**
  * @addtogroup dictionary_search
@@ -50,4 +50,4 @@ std::unique_ptr<scalar> get_index(
 
 /** @} */  // end of group
 }  // namespace dictionary
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/dictionary/update_keys.hpp b/cpp/include/cudf/dictionary/update_keys.hpp
index ce7057359a1..c02e91f8d78 100644
--- a/cpp/include/cudf/dictionary/update_keys.hpp
+++ b/cpp/include/cudf/dictionary/update_keys.hpp
@@ -22,7 +22,7 @@
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace dictionary {
 /**
  * @addtogroup dictionary_update
@@ -169,4 +169,4 @@ std::vector<std::unique_ptr<column>> match_dictionaries(
 
 /** @} */  // end of group
 }  // namespace dictionary
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/filling.hpp b/cpp/include/cudf/filling.hpp
index 90139e8634a..054f1e859f4 100644
--- a/cpp/include/cudf/filling.hpp
+++ b/cpp/include/cudf/filling.hpp
@@ -18,13 +18,14 @@
 
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
 #include <memory>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 /**
  * @addtogroup transformation_fill
  * @{
@@ -244,4 +245,4 @@ std::unique_ptr<cudf::column> calendrical_month_sequence(
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp
index c9cbc603226..ea2f5d4b6ca 100644
--- a/cpp/include/cudf/fixed_point/fixed_point.hpp
+++ b/cpp/include/cudf/fixed_point/fixed_point.hpp
@@ -30,7 +30,7 @@
 #include <string>
 
 /// `fixed_point` and supporting types
-namespace numeric {
+namespace CUDF_EXPORT numeric {
 
 /**
  * @addtogroup fixed_point_classes
@@ -799,4 +799,4 @@ using decimal64  = fixed_point<int64_t, Radix::BASE_10>;     ///<  64-bit decima
 using decimal128 = fixed_point<__int128_t, Radix::BASE_10>;  ///< 128-bit decimal fixed point
 
 /** @} */  // end of group
-}  // namespace numeric
+}  // namespace CUDF_EXPORT numeric
diff --git a/cpp/include/cudf/fixed_point/floating_conversion.hpp b/cpp/include/cudf/fixed_point/floating_conversion.hpp
index f12177c6a4b..f0d50edccd1 100644
--- a/cpp/include/cudf/fixed_point/floating_conversion.hpp
+++ b/cpp/include/cudf/fixed_point/floating_conversion.hpp
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include <cudf/utilities/export.hpp>
 #include <cudf/utilities/traits.hpp>
 
 #include <cuda/std/cmath>
@@ -24,7 +25,7 @@
 
 #include <cstring>
 
-namespace numeric {
+namespace CUDF_EXPORT numeric {
 
 /**
  * @addtogroup floating_conversion
@@ -1142,4 +1143,4 @@ CUDF_HOST_DEVICE inline FloatingType convert_integral_to_floating(Rep const& val
 }  // namespace detail
 
 /** @} */  // end of group
-}  // namespace numeric
+}  // namespace CUDF_EXPORT numeric
diff --git a/cpp/include/cudf/fixed_point/temporary.hpp b/cpp/include/cudf/fixed_point/temporary.hpp
index 17dba6c2452..2bafe235058 100644
--- a/cpp/include/cudf/fixed_point/temporary.hpp
+++ b/cpp/include/cudf/fixed_point/temporary.hpp
@@ -24,7 +24,7 @@
 #include <algorithm>
 #include <string>
 
-namespace numeric {
+namespace CUDF_EXPORT numeric {
 namespace detail {
 
 template <typename T>
@@ -81,4 +81,4 @@ constexpr auto exp10(int32_t exponent)
 }
 
 }  // namespace detail
-}  // namespace numeric
+}  // namespace CUDF_EXPORT numeric
diff --git a/cpp/include/cudf/groupby.hpp b/cpp/include/cudf/groupby.hpp
index 831ef68ed15..f7df9c1aa9b 100644
--- a/cpp/include/cudf/groupby.hpp
+++ b/cpp/include/cudf/groupby.hpp
@@ -21,6 +21,7 @@
 #include <cudf/replace.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/export.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -31,7 +32,7 @@
 #include <utility>
 #include <vector>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 //! `groupby` APIs
 namespace groupby {
 namespace detail {
@@ -420,4 +421,4 @@ class groupby {
 };
 /** @} */
 }  // namespace groupby
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/hashing.hpp b/cpp/include/cudf/hashing.hpp
index 3c2f6dfe0d5..b8be2af6967 100644
--- a/cpp/include/cudf/hashing.hpp
+++ b/cpp/include/cudf/hashing.hpp
@@ -17,11 +17,12 @@
 
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 
 /**
  * @addtogroup column_hash
@@ -187,4 +188,4 @@ std::unique_ptr<column> xxhash_64(
 }  // namespace hashing
 
 /** @} */  // end of group
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/hashing/detail/hashing.hpp b/cpp/include/cudf/hashing/detail/hashing.hpp
index 77266ceb48f..1a459430346 100644
--- a/cpp/include/cudf/hashing/detail/hashing.hpp
+++ b/cpp/include/cudf/hashing/detail/hashing.hpp
@@ -24,9 +24,8 @@
 #include <cstddef>
 #include <functional>
 
-namespace cudf {
-namespace hashing {
-namespace detail {
+namespace CUDF_EXPORT cudf {
+namespace hashing::detail {
 
 std::unique_ptr<column> murmurhash3_x86_32(table_view const& input,
                                            uint32_t seed,
@@ -109,9 +108,8 @@ constexpr std::size_t hash_combine(std::size_t lhs, std::size_t rhs)
   return lhs ^ (rhs + 0x9e37'79b9'7f4a'7c15 + (lhs << 6) + (lhs >> 2));
 }
 
-}  // namespace detail
-}  // namespace hashing
-}  // namespace cudf
+}  // namespace hashing::detail
+}  // namespace CUDF_EXPORT cudf
 
 // specialization of std::hash for cudf::data_type
 namespace std {
diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp
index 73bc205a095..9a8f87b4a46 100644
--- a/cpp/include/cudf/interop.hpp
+++ b/cpp/include/cudf/interop.hpp
@@ -36,6 +36,7 @@
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/export.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
@@ -53,7 +54,7 @@ struct ArrowArray;
 
 struct ArrowArrayStream;
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 /**
  * @addtogroup interop_dlpack
  * @{
@@ -648,4 +649,4 @@ unique_column_view_t from_arrow_device_column(
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/io/arrow_io_source.hpp b/cpp/include/cudf/io/arrow_io_source.hpp
index d7a48c34e12..ed5c839cbb4 100644
--- a/cpp/include/cudf/io/arrow_io_source.hpp
+++ b/cpp/include/cudf/io/arrow_io_source.hpp
@@ -18,6 +18,8 @@
 
 #include "datasource.hpp"
 
+#include <cudf/utilities/export.hpp>
+
 #include <arrow/filesystem/filesystem.h>
 #include <arrow/io/interfaces.h>
 
@@ -25,7 +27,8 @@
 #include <string>
 #include <utility>
 
-namespace cudf::io {
+namespace CUDF_EXPORT cudf {
+namespace io {
 /**
  * @addtogroup io_datasources
  * @{
@@ -86,4 +89,5 @@ class arrow_io_source : public datasource {
 };
 
 /** @} */  // end of group
-}  // namespace cudf::io
+}  // namespace io
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/io/avro.hpp b/cpp/include/cudf/io/avro.hpp
index 8bc74eb574c..63f9ea3a624 100644
--- a/cpp/include/cudf/io/avro.hpp
+++ b/cpp/include/cudf/io/avro.hpp
@@ -28,7 +28,7 @@
 #include <string>
 #include <vector>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace io {
 /**
  * @addtogroup io_readers
@@ -221,4 +221,4 @@ table_with_metadata read_avro(
 
 /** @} */  // end of group
 }  // namespace io
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp
index cc361f0918e..bbb4636a5a3 100644
--- a/cpp/include/cudf/io/csv.hpp
+++ b/cpp/include/cudf/io/csv.hpp
@@ -31,7 +31,7 @@
 #include <variant>
 #include <vector>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace io {
 
 /**
@@ -1762,4 +1762,4 @@ void write_csv(csv_writer_options const& options,
 
 /** @} */  // end of group
 }  // namespace io
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/io/data_sink.hpp b/cpp/include/cudf/io/data_sink.hpp
index 69d8a388d45..e1eb9c042c7 100644
--- a/cpp/include/cudf/io/data_sink.hpp
+++ b/cpp/include/cudf/io/data_sink.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -27,7 +27,7 @@
 #include <string>
 #include <vector>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 //! IO interfaces
 namespace io {
 
@@ -209,4 +209,4 @@ class data_sink {
 
 /** @} */  // end of group
 }  // namespace io
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/io/datasource.hpp b/cpp/include/cudf/io/datasource.hpp
index 28263d466f3..b12fbe39a57 100644
--- a/cpp/include/cudf/io/datasource.hpp
+++ b/cpp/include/cudf/io/datasource.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,6 +18,7 @@
 
 #include <cudf/io/types.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/export.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -25,7 +26,7 @@
 #include <future>
 #include <memory>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 //! IO interfaces
 namespace io {
 
@@ -376,4 +377,4 @@ class datasource {
 
 /** @} */  // end of group
 }  // namespace io
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/io/detail/avro.hpp b/cpp/include/cudf/io/detail/avro.hpp
index fe9f935d2cc..13f695d6866 100644
--- a/cpp/include/cudf/io/detail/avro.hpp
+++ b/cpp/include/cudf/io/detail/avro.hpp
@@ -18,14 +18,13 @@
 
 #include <cudf/io/avro.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
-namespace io {
-namespace detail {
-namespace avro {
+namespace CUDF_EXPORT cudf {
+namespace io::detail::avro {
 
 /**
  * @brief Reads the entire dataset.
@@ -42,7 +41,5 @@ table_with_metadata read_avro(std::unique_ptr<cudf::io::datasource>&& source,
                               rmm::cuda_stream_view stream,
                               rmm::device_async_resource_ref mr);
 
-}  // namespace avro
-}  // namespace detail
-}  // namespace io
-}  // namespace cudf
+}  // namespace io::detail::avro
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/io/detail/csv.hpp b/cpp/include/cudf/io/detail/csv.hpp
index 2a70fa888f4..d4cad2f70fd 100644
--- a/cpp/include/cudf/io/detail/csv.hpp
+++ b/cpp/include/cudf/io/detail/csv.hpp
@@ -17,14 +17,13 @@
 #pragma once
 
 #include <cudf/io/csv.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
-namespace io {
-namespace detail {
-namespace csv {
+namespace CUDF_EXPORT cudf {
+namespace io::detail::csv {
 
 /**
  * @brief Reads the entire dataset.
@@ -56,7 +55,5 @@ void write_csv(data_sink* sink,
                csv_writer_options const& options,
                rmm::cuda_stream_view stream);
 
-}  // namespace csv
-}  // namespace detail
-}  // namespace io
-}  // namespace cudf
+}  // namespace io::detail::csv
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp
index 6ff1c12831b..42b10a78ce8 100644
--- a/cpp/include/cudf/io/detail/json.hpp
+++ b/cpp/include/cudf/io/detail/json.hpp
@@ -18,11 +18,13 @@
 
 #include <cudf/io/datasource.hpp>
 #include <cudf/io/json.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf::io::json::detail {
+namespace CUDF_EXPORT cudf {
+namespace io::json::detail {
 
 /**
  * @brief Reads and returns the entire data set.
@@ -73,4 +75,5 @@ void normalize_single_quotes(datasource::owning_buffer<rmm::device_uvector<char>
 void normalize_whitespace(datasource::owning_buffer<rmm::device_uvector<char>>& indata,
                           rmm::cuda_stream_view stream,
                           rmm::device_async_resource_ref mr);
-}  // namespace cudf::io::json::detail
+}  // namespace io::json::detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/io/detail/orc.hpp b/cpp/include/cudf/io/detail/orc.hpp
index 597ddd9cf0a..7538cf7d29c 100644
--- a/cpp/include/cudf/io/detail/orc.hpp
+++ b/cpp/include/cudf/io/detail/orc.hpp
@@ -21,6 +21,7 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
@@ -29,12 +30,13 @@
 #include <string>
 #include <vector>
 
-namespace cudf::io {
+namespace CUDF_EXPORT cudf {
+namespace io {
 
 // Forward declaration
-class orc_reader_options;
-class orc_writer_options;
-class chunked_orc_writer_options;
+class CUDF_EXPORT orc_reader_options;
+class CUDF_EXPORT orc_writer_options;
+class CUDF_EXPORT chunked_orc_writer_options;
 
 namespace orc::detail {
 
@@ -183,4 +185,5 @@ class writer {
 };
 
 }  // namespace orc::detail
-}  // namespace cudf::io
+}  // namespace io
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/io/detail/parquet.hpp b/cpp/include/cudf/io/detail/parquet.hpp
index 21c870cb75e..a6945e0b7ab 100644
--- a/cpp/include/cudf/io/detail/parquet.hpp
+++ b/cpp/include/cudf/io/detail/parquet.hpp
@@ -24,6 +24,7 @@
 #include <cudf/io/parquet_metadata.hpp>
 #include <cudf/io/types.hpp>
 #include <cudf/table/table_view.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
@@ -32,12 +33,13 @@
 #include <string>
 #include <vector>
 
-namespace cudf::io {
+namespace CUDF_EXPORT cudf {
+namespace io {
 
 // Forward declaration
-class parquet_reader_options;
-class parquet_writer_options;
-class chunked_parquet_writer_options;
+class CUDF_EXPORT parquet_reader_options;
+class CUDF_EXPORT parquet_writer_options;
+class CUDF_EXPORT chunked_parquet_writer_options;
 
 namespace parquet::detail {
 
@@ -257,4 +259,5 @@ class writer {
  */
 parquet_metadata read_parquet_metadata(host_span<std::unique_ptr<datasource> const> sources);
 }  // namespace parquet::detail
-}  // namespace cudf::io
+}  // namespace io
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/io/detail/tokenize_json.hpp b/cpp/include/cudf/io/detail/tokenize_json.hpp
index d08c4e7c65a..715eb855daa 100644
--- a/cpp/include/cudf/io/detail/tokenize_json.hpp
+++ b/cpp/include/cudf/io/detail/tokenize_json.hpp
@@ -17,6 +17,7 @@
 #pragma once
 
 #include <cudf/io/json.hpp>
+#include <cudf/utilities/export.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -117,7 +118,7 @@ enum token_t : PdaTokenT {
   NUM_TOKENS
 };
 
-namespace detail {
+namespace CUDF_EXPORT detail {
 
 /**
  * @brief Parses the given JSON string and emits a sequence of tokens that demarcate relevant
@@ -136,6 +137,6 @@ std::pair<rmm::device_uvector<PdaTokenT>, rmm::device_uvector<SymbolOffsetT>> ge
   rmm::cuda_stream_view stream,
   rmm::device_async_resource_ref mr);
 
-}  // namespace detail
+}  // namespace CUDF_EXPORT detail
 
 }  // namespace cudf::io::json
diff --git a/cpp/include/cudf/io/detail/utils.hpp b/cpp/include/cudf/io/detail/utils.hpp
index 7bbda21858d..d0da9b410ce 100644
--- a/cpp/include/cudf/io/detail/utils.hpp
+++ b/cpp/include/cudf/io/detail/utils.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,13 +16,14 @@
 
 #pragma once
 
-namespace cudf {
-namespace io {
-namespace detail {
+#include <cudf/utilities/export.hpp>
+
+namespace CUDF_EXPORT cudf {
+namespace io::detail {
 /**
  * @brief Whether writer writes in chunks or all at once
  */
 enum class single_write_mode : bool { YES, NO };
-}  // namespace detail
-}  // namespace io
-}  // namespace cudf
+
+}  // namespace io::detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
index d47266fdd12..0cb39d15cd5 100644
--- a/cpp/include/cudf/io/json.hpp
+++ b/cpp/include/cudf/io/json.hpp
@@ -30,7 +30,7 @@
 #include <variant>
 #include <vector>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace io {
 /**
  * @addtogroup io_readers
@@ -1024,4 +1024,4 @@ void write_json(json_writer_options const& options,
 
 /** @} */  // end of group
 }  // namespace io
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp
index 623c1d9fc72..8d484b15872 100644
--- a/cpp/include/cudf/io/orc.hpp
+++ b/cpp/include/cudf/io/orc.hpp
@@ -20,6 +20,7 @@
 #include <cudf/io/types.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
@@ -31,7 +32,7 @@
 #include <utility>
 #include <vector>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace io {
 /**
  * @addtogroup io_readers
@@ -426,7 +427,7 @@ class chunked_orc_reader {
    *
    * This is added just to satisfy cython.
    */
-  chunked_orc_reader() = default;
+  chunked_orc_reader();
 
   /**
    * @brief Construct the reader from input/output size limits, output row granularity, along with
@@ -1429,7 +1430,12 @@ class orc_chunked_writer {
    * @brief Default constructor, this should never be used.
    *        This is added just to satisfy cython.
    */
-  orc_chunked_writer() = default;
+  orc_chunked_writer();
+
+  /**
+   * @brief virtual destructor, Added so we don't leak detail types.
+   */
+  ~orc_chunked_writer();
 
   /**
    * @brief Constructor with chunked writer options
@@ -1459,4 +1465,4 @@ class orc_chunked_writer {
 
 /** @} */  // end of group
 }  // namespace io
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/io/orc_metadata.hpp b/cpp/include/cudf/io/orc_metadata.hpp
index 35196a19349..3c6194bb721 100644
--- a/cpp/include/cudf/io/orc_metadata.hpp
+++ b/cpp/include/cudf/io/orc_metadata.hpp
@@ -23,12 +23,13 @@
 
 #include <cudf/io/orc_types.hpp>
 #include <cudf/io/types.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <optional>
 #include <variant>
 #include <vector>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace io {
 /**
  * @addtogroup io_types
@@ -387,4 +388,4 @@ orc_metadata read_orc_metadata(source_info const& src_info,
 
 /** @} */  // end of group
 }  // namespace io
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/io/orc_types.hpp b/cpp/include/cudf/io/orc_types.hpp
index abd81d76579..f6c03814c9b 100644
--- a/cpp/include/cudf/io/orc_types.hpp
+++ b/cpp/include/cudf/io/orc_types.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,9 +16,12 @@
 
 #pragma once
 
+#include <cudf/utilities/export.hpp>
+
 #include <cstdint>
 
-namespace cudf::io::orc {
+namespace CUDF_EXPORT cudf {
+namespace io::orc {
 /**
  * @addtogroup io_types
  * @{
@@ -104,4 +107,5 @@ enum ProtofType : uint8_t {
 };
 
 /** @} */  // end of group
-}  // namespace cudf::io::orc
+}  // namespace io::orc
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp
index 4d98cae73a7..12897ac77ef 100644
--- a/cpp/include/cudf/io/parquet.hpp
+++ b/cpp/include/cudf/io/parquet.hpp
@@ -21,6 +21,7 @@
 #include <cudf/io/types.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
@@ -32,7 +33,8 @@
 #include <utility>
 #include <vector>
 
-namespace cudf::io {
+namespace CUDF_EXPORT cudf {
+namespace io {
 /**
  * @addtogroup io_readers
  * @{
@@ -480,8 +482,9 @@ class chunked_parquet_reader {
    * @brief Default constructor, this should never be used.
    *
    * This is added just to satisfy cython.
+   * This is added to not leak detail API
    */
-  chunked_parquet_reader() = default;
+  chunked_parquet_reader();
 
   /**
    * @brief Constructor for chunked reader.
@@ -1380,8 +1383,9 @@ class parquet_chunked_writer {
   /**
    * @brief Default constructor, this should never be used.
    *        This is added just to satisfy cython.
+   *        This is added to not leak detail API
    */
-  parquet_chunked_writer() = default;
+  parquet_chunked_writer();
 
   /**
    * @brief Constructor with chunked writer options
@@ -1391,6 +1395,11 @@ class parquet_chunked_writer {
    */
   parquet_chunked_writer(chunked_parquet_writer_options const& options,
                          rmm::cuda_stream_view stream = cudf::get_default_stream());
+  /**
+   * @brief Default destructor.
+   *        This is added to not leak detail API
+   */
+  ~parquet_chunked_writer();
 
   /**
    * @brief Writes table to output.
@@ -1423,4 +1432,5 @@ class parquet_chunked_writer {
 
 /** @} */  // end of group
 
-}  // namespace cudf::io
+}  // namespace io
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/io/parquet_metadata.hpp b/cpp/include/cudf/io/parquet_metadata.hpp
index e0c406c180c..dbb1fd03dca 100644
--- a/cpp/include/cudf/io/parquet_metadata.hpp
+++ b/cpp/include/cudf/io/parquet_metadata.hpp
@@ -22,13 +22,14 @@
 #pragma once
 
 #include <cudf/io/types.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <optional>
 #include <string_view>
 #include <variant>
 #include <vector>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace io {
 /**
  * @addtogroup io_types
@@ -270,4 +271,4 @@ parquet_metadata read_parquet_metadata(source_info const& src_info);
 
 /** @} */  // end of group
 }  // namespace io
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/io/text/byte_range_info.hpp b/cpp/include/cudf/io/text/byte_range_info.hpp
index 60ee867f058..7e9256be1d3 100644
--- a/cpp/include/cudf/io/text/byte_range_info.hpp
+++ b/cpp/include/cudf/io/text/byte_range_info.hpp
@@ -17,11 +17,12 @@
 #pragma once
 
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <cstdint>
 #include <vector>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace io {
 namespace text {
 /**
@@ -113,4 +114,4 @@ byte_range_info create_byte_range_info_max();
 
 }  // namespace text
 }  // namespace io
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/io/text/data_chunk_source.hpp b/cpp/include/cudf/io/text/data_chunk_source.hpp
index 13aff4b3b8f..dd1d2331c1f 100644
--- a/cpp/include/cudf/io/text/data_chunk_source.hpp
+++ b/cpp/include/cudf/io/text/data_chunk_source.hpp
@@ -16,12 +16,13 @@
 
 #pragma once
 
+#include <cudf/utilities/export.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_pool.hpp>
 #include <rmm/device_buffer.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace io {
 namespace text {
 
@@ -120,4 +121,4 @@ class data_chunk_source {
 
 }  // namespace text
 }  // namespace io
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/io/text/data_chunk_source_factories.hpp b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp
index 046994d33cc..42d0540b386 100644
--- a/cpp/include/cudf/io/text/data_chunk_source_factories.hpp
+++ b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,12 +19,14 @@
 #include <cudf/io/datasource.hpp>
 #include <cudf/io/text/data_chunk_source.hpp>
 #include <cudf/scalar/scalar.hpp>
+#include <cudf/utilities/export.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <memory>
 #include <string>
 
-namespace cudf::io::text {
+namespace CUDF_EXPORT cudf {
+namespace io::text {
 
 /**
  * @brief Creates a data source capable of producing device-buffered views of a datasource.
@@ -84,4 +86,5 @@ std::unique_ptr<data_chunk_source> make_source_from_bgzip_file(std::string_view
  */
 std::unique_ptr<data_chunk_source> make_source(cudf::string_scalar& data);
 
-}  // namespace cudf::io::text
+}  // namespace io::text
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/io/text/detail/bgzip_utils.hpp b/cpp/include/cudf/io/text/detail/bgzip_utils.hpp
index 515bcf16de2..11eb4518210 100644
--- a/cpp/include/cudf/io/text/detail/bgzip_utils.hpp
+++ b/cpp/include/cudf/io/text/detail/bgzip_utils.hpp
@@ -17,6 +17,7 @@
 #pragma once
 
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/export.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <zlib.h>
@@ -26,7 +27,8 @@
 #include <fstream>
 #include <limits>
 
-namespace cudf::io::text::detail::bgzip {
+namespace CUDF_EXPORT cudf {
+namespace io::text::detail::bgzip {
 
 struct header {
   int block_size;
@@ -109,4 +111,5 @@ void write_compressed_block(std::ostream& output_stream,
                             host_span<char const> pre_size_subfields  = {},
                             host_span<char const> post_size_subfields = {});
 
-}  // namespace cudf::io::text::detail::bgzip
+}  // namespace io::text::detail::bgzip
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/io/text/detail/multistate.hpp b/cpp/include/cudf/io/text/detail/multistate.hpp
index e4e47d8f010..32187b43d34 100644
--- a/cpp/include/cudf/io/text/detail/multistate.hpp
+++ b/cpp/include/cudf/io/text/detail/multistate.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,9 +16,11 @@
 
 #pragma once
 
+#include <cudf/utilities/export.hpp>
+
 #include <cstdint>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace io {
 namespace text {
 namespace detail {
@@ -125,4 +127,4 @@ constexpr multistate operator+(multistate const& lhs, multistate const& rhs)
 }  // namespace detail
 }  // namespace text
 }  // namespace io
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/io/text/detail/tile_state.hpp b/cpp/include/cudf/io/text/detail/tile_state.hpp
index aa9185b4983..3980a7fac02 100644
--- a/cpp/include/cudf/io/text/detail/tile_state.hpp
+++ b/cpp/include/cudf/io/text/detail/tile_state.hpp
@@ -16,12 +16,14 @@
 
 #pragma once
 
+#include <cudf/utilities/export.hpp>
+
 #include <rmm/resource_ref.hpp>
 
 #include <cub/block/block_scan.cuh>
 #include <cuda/atomic>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace io {
 namespace text {
 namespace detail {
@@ -147,4 +149,4 @@ struct scan_tile_state_callback {
 }  // namespace detail
 }  // namespace text
 }  // namespace io
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/io/text/detail/trie.hpp b/cpp/include/cudf/io/text/detail/trie.hpp
index 28862d97ede..eee3fefc79f 100644
--- a/cpp/include/cudf/io/text/detail/trie.hpp
+++ b/cpp/include/cudf/io/text/detail/trie.hpp
@@ -18,6 +18,7 @@
 
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/io/text/detail/multistate.hpp>
+#include <cudf/utilities/export.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -30,7 +31,7 @@
 #include <unordered_map>
 #include <vector>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace io {
 namespace text {
 namespace detail {
@@ -248,4 +249,4 @@ struct trie {
 }  // namespace detail
 }  // namespace text
 }  // namespace io
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/io/text/multibyte_split.hpp b/cpp/include/cudf/io/text/multibyte_split.hpp
index e29ab78ae46..8624a386d0f 100644
--- a/cpp/include/cudf/io/text/multibyte_split.hpp
+++ b/cpp/include/cudf/io/text/multibyte_split.hpp
@@ -27,7 +27,7 @@
 #include <memory>
 #include <optional>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace io {
 namespace text {
 /**
@@ -120,4 +120,4 @@ std::unique_ptr<cudf::column> multibyte_split(
 
 }  // namespace text
 }  // namespace io
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp
index 431a5e7be83..3df737413fa 100644
--- a/cpp/include/cudf/io/types.hpp
+++ b/cpp/include/cudf/io/types.hpp
@@ -33,16 +33,16 @@
 #include <utility>
 #include <vector>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 //! IO interfaces
 namespace io {
 class data_sink;
 class datasource;
 }  // namespace io
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
 
 //! cuDF interfaces
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 //! IO interfaces
 namespace io {
 /**
@@ -1089,4 +1089,4 @@ class reader_column_schema {
 
 /** @} */  // end of group
 }  // namespace io
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp
index ba485bd6372..f4139721475 100644
--- a/cpp/include/cudf/join.hpp
+++ b/cpp/include/cudf/join.hpp
@@ -21,6 +21,7 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/export.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -32,7 +33,7 @@
 #include <utility>
 #include <vector>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 
 /**
  * @brief Enum to indicate whether the distinct join table has nested columns or not
@@ -43,13 +44,24 @@ enum class has_nested : bool { YES, NO };
 
 // forward declaration
 namespace hashing::detail {
+
+/**
+ * @brief Forward declaration for our Murmur Hash 3 implementation
+ */
 template <typename T>
 class MurmurHash3_x86_32;
 }  // namespace hashing::detail
 namespace detail {
+
+/**
+ * @brief Forward declaration for our hash join
+ */
 template <typename T>
 class hash_join;
 
+/**
+ * @brief Forward declaration for our distinct hash join
+ */
 template <cudf::has_nested HasNested>
 class distinct_hash_join;
 }  // namespace detail
@@ -1179,4 +1191,4 @@ std::size_t conditional_left_anti_join_size(
   ast::expression const& binary_predicate,
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 /** @} */  // end of group
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/json/json.hpp b/cpp/include/cudf/json/json.hpp
index 385e8e54bdc..48d5dcf7727 100644
--- a/cpp/include/cudf/json/json.hpp
+++ b/cpp/include/cudf/json/json.hpp
@@ -17,13 +17,14 @@
 
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
 #include <thrust/optional.h>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 
 /**
  * @addtogroup json_object
@@ -173,4 +174,4 @@ std::unique_ptr<cudf::column> get_json_object(
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of doxygen group
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/labeling/label_bins.hpp b/cpp/include/cudf/labeling/label_bins.hpp
index 9091e31a9ea..7eb25134ca5 100644
--- a/cpp/include/cudf/labeling/label_bins.hpp
+++ b/cpp/include/cudf/labeling/label_bins.hpp
@@ -24,7 +24,7 @@
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 
 /**
  * @addtogroup label_bins
@@ -79,4 +79,4 @@ std::unique_ptr<column> label_bins(
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/lists/combine.hpp b/cpp/include/cudf/lists/combine.hpp
index 853562acfff..5a310e6651f 100644
--- a/cpp/include/cudf/lists/combine.hpp
+++ b/cpp/include/cudf/lists/combine.hpp
@@ -17,11 +17,12 @@
 
 #include <cudf/column/column.hpp>
 #include <cudf/lists/lists_column_view.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 
 //! Lists column APIs
 namespace lists {
@@ -102,4 +103,4 @@ std::unique_ptr<column> concatenate_list_elements(
 
 /** @} */  // end of group
 }  // namespace lists
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/lists/contains.hpp b/cpp/include/cudf/lists/contains.hpp
index 060882555aa..cd0a216488c 100644
--- a/cpp/include/cudf/lists/contains.hpp
+++ b/cpp/include/cudf/lists/contains.hpp
@@ -17,11 +17,12 @@
 
 #include <cudf/column/column.hpp>
 #include <cudf/lists/lists_column_view.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace lists {
 /**
  * @addtogroup lists_contains
@@ -182,4 +183,4 @@ std::unique_ptr<column> index_of(
 
 /** @} */  // end of group
 }  // namespace lists
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/lists/count_elements.hpp b/cpp/include/cudf/lists/count_elements.hpp
index 2b9f5aa5607..a6f2ea6e68a 100644
--- a/cpp/include/cudf/lists/count_elements.hpp
+++ b/cpp/include/cudf/lists/count_elements.hpp
@@ -17,11 +17,12 @@
 
 #include <cudf/column/column.hpp>
 #include <cudf/lists/lists_column_view.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace lists {
 /**
  * @addtogroup lists_elements
@@ -58,4 +59,4 @@ std::unique_ptr<column> count_elements(
 /** @} */  // end of lists_elements group
 
 }  // namespace lists
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/lists/detail/combine.hpp b/cpp/include/cudf/lists/detail/combine.hpp
index bd4c01bbb4b..07309da2814 100644
--- a/cpp/include/cudf/lists/detail/combine.hpp
+++ b/cpp/include/cudf/lists/detail/combine.hpp
@@ -21,9 +21,8 @@
 
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
-namespace lists {
-namespace detail {
+namespace CUDF_EXPORT cudf {
+namespace lists::detail {
 /**
  * @copydoc cudf::lists::concatenate_rows
  *
@@ -44,6 +43,5 @@ std::unique_ptr<column> concatenate_list_elements(column_view const& input,
                                                   rmm::cuda_stream_view stream,
                                                   rmm::device_async_resource_ref mr);
 
-}  // namespace detail
-}  // namespace lists
-}  // namespace cudf
+}  // namespace lists::detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/lists/detail/concatenate.hpp b/cpp/include/cudf/lists/detail/concatenate.hpp
index d67958ef260..edfa3355dcd 100644
--- a/cpp/include/cudf/lists/detail/concatenate.hpp
+++ b/cpp/include/cudf/lists/detail/concatenate.hpp
@@ -24,9 +24,8 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
-namespace lists {
-namespace detail {
+namespace CUDF_EXPORT cudf {
+namespace lists::detail {
 
 /**
  * @brief Returns a single column by concatenating the given vector of
@@ -48,6 +47,5 @@ std::unique_ptr<column> concatenate(host_span<column_view const> columns,
                                     rmm::cuda_stream_view stream,
                                     rmm::device_async_resource_ref mr);
 
-}  // namespace detail
-}  // namespace lists
-}  // namespace cudf
+}  // namespace lists::detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/lists/detail/contains.hpp b/cpp/include/cudf/lists/detail/contains.hpp
index 638cc7afb81..1ca3651b55a 100644
--- a/cpp/include/cudf/lists/detail/contains.hpp
+++ b/cpp/include/cudf/lists/detail/contains.hpp
@@ -20,9 +20,8 @@
 
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
-namespace lists {
-namespace detail {
+namespace CUDF_EXPORT cudf {
+namespace lists::detail {
 
 /**
  * @copydoc cudf::lists::index_of(cudf::lists_column_view const&,
@@ -71,6 +70,5 @@ std::unique_ptr<column> contains(cudf::lists_column_view const& lists,
                                  cudf::column_view const& search_keys,
                                  rmm::cuda_stream_view stream,
                                  rmm::device_async_resource_ref mr);
-}  // namespace detail
-}  // namespace lists
-}  // namespace cudf
+}  // namespace lists::detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/lists/detail/copying.hpp b/cpp/include/cudf/lists/detail/copying.hpp
index 18a70bba5e9..76154ae7064 100644
--- a/cpp/include/cudf/lists/detail/copying.hpp
+++ b/cpp/include/cudf/lists/detail/copying.hpp
@@ -20,9 +20,8 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
-namespace lists {
-namespace detail {
+namespace CUDF_EXPORT cudf {
+namespace lists::detail {
 
 /**
  * @brief Returns a new lists column created from a subset of the
@@ -49,6 +48,5 @@ std::unique_ptr<cudf::column> copy_slice(lists_column_view const& lists,
                                          rmm::cuda_stream_view stream,
                                          rmm::device_async_resource_ref mr);
 
-}  // namespace detail
-}  // namespace lists
-}  // namespace cudf
+}  // namespace lists::detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/lists/detail/dremel.hpp b/cpp/include/cudf/lists/detail/dremel.hpp
index 53448424827..96ee30dd261 100644
--- a/cpp/include/cudf/lists/detail/dremel.hpp
+++ b/cpp/include/cudf/lists/detail/dremel.hpp
@@ -17,10 +17,12 @@
 #pragma once
 
 #include <cudf/column/column.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/device_uvector.hpp>
 
-namespace cudf::detail {
+namespace CUDF_EXPORT cudf {
+namespace detail {
 
 /**
  * @brief Device view for `dremel_data`.
@@ -213,4 +215,5 @@ dremel_data get_comparator_data(column_view input,
                                 std::vector<uint8_t> nullability,
                                 bool output_as_byte_array,
                                 rmm::cuda_stream_view stream);
-}  // namespace cudf::detail
+}  // namespace detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/lists/detail/extract.hpp b/cpp/include/cudf/lists/detail/extract.hpp
index 6f983d44bc9..e14b93ff912 100644
--- a/cpp/include/cudf/lists/detail/extract.hpp
+++ b/cpp/include/cudf/lists/detail/extract.hpp
@@ -20,9 +20,8 @@
 
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
-namespace lists {
-namespace detail {
+namespace CUDF_EXPORT cudf {
+namespace lists::detail {
 
 /**
  * @copydoc cudf::lists::extract_list_element(lists_column_view, size_type,
@@ -44,6 +43,5 @@ std::unique_ptr<column> extract_list_element(lists_column_view lists_column,
                                              rmm::cuda_stream_view stream,
                                              rmm::device_async_resource_ref mr);
 
-}  // namespace detail
-}  // namespace lists
-}  // namespace cudf
+}  // namespace lists::detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/lists/detail/gather.cuh b/cpp/include/cudf/lists/detail/gather.cuh
index 0cd77556f33..294282d7caa 100644
--- a/cpp/include/cudf/lists/detail/gather.cuh
+++ b/cpp/include/cudf/lists/detail/gather.cuh
@@ -21,6 +21,7 @@
 #include <cudf/lists/lists_column_view.hpp>
 #include <cudf/utilities/bit.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
@@ -276,6 +277,7 @@ gather_data make_gather_data(cudf::lists_column_view const& source_column,
  *
  * @returns column with elements gathered based on `gather_data`
  */
+CUDF_EXPORT
 std::unique_ptr<column> gather_list_nested(lists_column_view const& list,
                                            gather_data& gd,
                                            rmm::cuda_stream_view stream,
@@ -293,6 +295,7 @@ std::unique_ptr<column> gather_list_nested(lists_column_view const& list,
  *
  * @returns column with elements gathered based on `gather_data`
  */
+CUDF_EXPORT
 std::unique_ptr<column> gather_list_leaf(column_view const& column,
                                          gather_data const& gd,
                                          rmm::cuda_stream_view stream,
diff --git a/cpp/include/cudf/lists/detail/interleave_columns.hpp b/cpp/include/cudf/lists/detail/interleave_columns.hpp
index 3aff93840a9..ae8caa853f3 100644
--- a/cpp/include/cudf/lists/detail/interleave_columns.hpp
+++ b/cpp/include/cudf/lists/detail/interleave_columns.hpp
@@ -21,9 +21,8 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
-namespace lists {
-namespace detail {
+namespace CUDF_EXPORT cudf {
+namespace lists::detail {
 
 /**
  * @brief Returns a single column by interleaving rows of the given table of list elements.
@@ -50,6 +49,5 @@ std::unique_ptr<column> interleave_columns(table_view const& input,
                                            rmm::cuda_stream_view stream,
                                            rmm::device_async_resource_ref mr);
 
-}  // namespace detail
-}  // namespace lists
-}  // namespace cudf
+}  // namespace lists::detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/lists/detail/lists_column_factories.hpp b/cpp/include/cudf/lists/detail/lists_column_factories.hpp
index 192aee8d811..18d66f15b1e 100644
--- a/cpp/include/cudf/lists/detail/lists_column_factories.hpp
+++ b/cpp/include/cudf/lists/detail/lists_column_factories.hpp
@@ -23,9 +23,8 @@
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
-namespace lists {
-namespace detail {
+namespace CUDF_EXPORT cudf {
+namespace lists::detail {
 
 /**
  * @brief Internal API to construct a lists column from a `list_scalar`, for public
@@ -67,6 +66,5 @@ std::unique_ptr<column> make_all_nulls_lists_column(size_type size,
                                                     rmm::cuda_stream_view stream,
                                                     rmm::device_async_resource_ref mr);
 
-}  // namespace detail
-}  // namespace lists
-}  // namespace cudf
+}  // namespace lists::detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/lists/detail/reverse.hpp b/cpp/include/cudf/lists/detail/reverse.hpp
index d099a0708b9..d10d7784e6c 100644
--- a/cpp/include/cudf/lists/detail/reverse.hpp
+++ b/cpp/include/cudf/lists/detail/reverse.hpp
@@ -16,10 +16,12 @@
 #pragma once
 
 #include <cudf/lists/reverse.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/resource_ref.hpp>
 
-namespace cudf::lists::detail {
+namespace CUDF_EXPORT cudf {
+namespace lists::detail {
 
 /**
  * @copydoc cudf::lists::reverse
@@ -29,4 +31,5 @@ std::unique_ptr<column> reverse(lists_column_view const& input,
                                 rmm::cuda_stream_view stream,
                                 rmm::device_async_resource_ref mr);
 
-}  // namespace cudf::lists::detail
+}  // namespace lists::detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/lists/detail/scatter.cuh b/cpp/include/cudf/lists/detail/scatter.cuh
index c550ad5b94f..be76e456900 100644
--- a/cpp/include/cudf/lists/detail/scatter.cuh
+++ b/cpp/include/cudf/lists/detail/scatter.cuh
@@ -239,11 +239,11 @@ std::unique_ptr<column> scatter(scalar const& slr,
   auto const num_rows = target.size();
   if (num_rows == 0) { return cudf::empty_like(target); }
 
-  auto lv        = static_cast<list_scalar const*>(&slr);
-  bool slr_valid = slr.is_valid(stream);
-  rmm::device_buffer null_mask =
-    slr_valid ? cudf::detail::create_null_mask(1, mask_state::UNALLOCATED, stream, mr)
-              : cudf::detail::create_null_mask(1, mask_state::ALL_NULL, stream, mr);
+  auto lv                      = static_cast<list_scalar const*>(&slr);
+  bool slr_valid               = slr.is_valid(stream);
+  rmm::device_buffer null_mask = slr_valid
+                                   ? cudf::create_null_mask(1, mask_state::UNALLOCATED, stream, mr)
+                                   : cudf::create_null_mask(1, mask_state::ALL_NULL, stream, mr);
   auto offset_column =
     make_numeric_column(data_type{type_to_id<size_type>()}, 2, mask_state::UNALLOCATED, stream, mr);
   thrust::sequence(rmm::exec_policy_nosync(stream),
diff --git a/cpp/include/cudf/lists/detail/set_operations.hpp b/cpp/include/cudf/lists/detail/set_operations.hpp
index 8746b1ba62a..abfcef72d47 100644
--- a/cpp/include/cudf/lists/detail/set_operations.hpp
+++ b/cpp/include/cudf/lists/detail/set_operations.hpp
@@ -24,7 +24,8 @@
 #include <rmm/mr/device/device_memory_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf::lists::detail {
+namespace CUDF_EXPORT cudf {
+namespace lists::detail {
 
 /**
  * @copydoc cudf::list::have_overlap
@@ -75,4 +76,5 @@ std::unique_ptr<column> difference_distinct(lists_column_view const& lhs,
                                             rmm::device_async_resource_ref mr);
 
 /** @} */  // end of group
-}  // namespace cudf::lists::detail
+}  // namespace lists::detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/lists/detail/sorting.hpp b/cpp/include/cudf/lists/detail/sorting.hpp
index e428ea84ce6..8cbfbbae769 100644
--- a/cpp/include/cudf/lists/detail/sorting.hpp
+++ b/cpp/include/cudf/lists/detail/sorting.hpp
@@ -20,9 +20,8 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
-namespace lists {
-namespace detail {
+namespace CUDF_EXPORT cudf {
+namespace lists::detail {
 
 /**
  * @copydoc cudf::lists::sort_lists
@@ -46,6 +45,5 @@ std::unique_ptr<column> stable_sort_lists(lists_column_view const& input,
                                           rmm::cuda_stream_view stream,
                                           rmm::device_async_resource_ref mr);
 
-}  // namespace detail
-}  // namespace lists
-}  // namespace cudf
+}  // namespace lists::detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/lists/detail/stream_compaction.hpp b/cpp/include/cudf/lists/detail/stream_compaction.hpp
index f5e5b29bc8f..c11e07cd190 100644
--- a/cpp/include/cudf/lists/detail/stream_compaction.hpp
+++ b/cpp/include/cudf/lists/detail/stream_compaction.hpp
@@ -17,11 +17,13 @@
 
 #include <cudf/column/column.hpp>
 #include <cudf/lists/lists_column_view.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/mr/device/device_memory_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf::lists::detail {
+namespace CUDF_EXPORT cudf {
+namespace lists::detail {
 
 /**
  * @copydoc cudf::lists::apply_boolean_mask(lists_column_view const&, lists_column_view const&,
@@ -45,4 +47,5 @@ std::unique_ptr<column> distinct(lists_column_view const& input,
                                  rmm::cuda_stream_view stream,
                                  rmm::device_async_resource_ref mr);
 
-}  // namespace cudf::lists::detail
+}  // namespace lists::detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/lists/explode.hpp b/cpp/include/cudf/lists/explode.hpp
index 303f182ce8c..a3375887815 100644
--- a/cpp/include/cudf/lists/explode.hpp
+++ b/cpp/include/cudf/lists/explode.hpp
@@ -25,7 +25,7 @@
 
 #include <memory>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 /**
  * @addtogroup column_reshape
  * @{
@@ -215,4 +215,4 @@ std::unique_ptr<table> explode_outer_position(
 
 /** @} */  // end of group
 
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/lists/extract.hpp b/cpp/include/cudf/lists/extract.hpp
index 096d276fcfb..29a02308c66 100644
--- a/cpp/include/cudf/lists/extract.hpp
+++ b/cpp/include/cudf/lists/extract.hpp
@@ -18,11 +18,12 @@
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_view.hpp>
 #include <cudf/lists/lists_column_view.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace lists {
 /**
  * @addtogroup lists_extract
@@ -113,4 +114,4 @@ std::unique_ptr<column> extract_list_element(
 
 /** @} */  // end of group
 }  // namespace lists
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/lists/filling.hpp b/cpp/include/cudf/lists/filling.hpp
index 1d840c76bf8..a1f3c37ad9e 100644
--- a/cpp/include/cudf/lists/filling.hpp
+++ b/cpp/include/cudf/lists/filling.hpp
@@ -25,7 +25,8 @@
 
 #include <memory>
 
-namespace cudf::lists {
+namespace CUDF_EXPORT cudf {
+namespace lists {
 /**
  * @addtogroup lists_filling
  * @{
@@ -113,4 +114,5 @@ std::unique_ptr<column> sequences(
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
-}  // namespace cudf::lists
+}  // namespace lists
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/lists/gather.hpp b/cpp/include/cudf/lists/gather.hpp
index a0d79c05098..6359e0488c9 100644
--- a/cpp/include/cudf/lists/gather.hpp
+++ b/cpp/include/cudf/lists/gather.hpp
@@ -19,11 +19,12 @@
 #include <cudf/column/column_view.hpp>
 #include <cudf/copying.hpp>
 #include <cudf/lists/lists_column_view.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace lists {
 /**
  * @addtogroup lists_gather
@@ -80,4 +81,4 @@ std::unique_ptr<column> segmented_gather(
 
 /** @} */  // end of group
 }  // namespace lists
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/lists/list_device_view.cuh b/cpp/include/cudf/lists/list_device_view.cuh
index 170a20bd7f5..29b81135d64 100644
--- a/cpp/include/cudf/lists/list_device_view.cuh
+++ b/cpp/include/cudf/lists/list_device_view.cuh
@@ -25,7 +25,7 @@
 #include <thrust/iterator/transform_iterator.h>
 #include <thrust/pair.h>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 
 /**
  * @brief A non-owning, immutable view of device data that represents
@@ -377,4 +377,4 @@ CUDF_HOST_DEVICE auto inline make_list_size_iterator(detail::lists_column_device
   return detail::make_counting_transform_iterator(0, list_size_functor{c});
 }
 
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/lists/list_view.hpp b/cpp/include/cudf/lists/list_view.hpp
index a3f36a9330f..59ad9c9bcee 100644
--- a/cpp/include/cudf/lists/list_view.hpp
+++ b/cpp/include/cudf/lists/list_view.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -16,12 +16,14 @@
  */
 #pragma once
 
+#include <cudf/utilities/export.hpp>
+
 /**
  * @file list_view.hpp
  * @brief Class definition for cudf::list_view.
  */
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 
 /**
  * @brief A non-owning, immutable view of device data that represents
@@ -29,4 +31,4 @@ namespace cudf {
  */
 class list_view {};
 
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/lists/lists_column_device_view.cuh b/cpp/include/cudf/lists/lists_column_device_view.cuh
index 4d12ee1cab4..b3ec18a7913 100644
--- a/cpp/include/cudf/lists/lists_column_device_view.cuh
+++ b/cpp/include/cudf/lists/lists_column_device_view.cuh
@@ -21,9 +21,7 @@
 
 #include <cuda_runtime.h>
 
-namespace cudf {
-
-namespace detail {
+namespace cudf::detail {
 
 /**
  * @brief Given a column_device_view, an instance of this class provides a
@@ -116,6 +114,4 @@ class lists_column_device_view : private column_device_view {
   }
 };
 
-}  // namespace detail
-
-}  // namespace cudf
+}  // namespace cudf::detail
diff --git a/cpp/include/cudf/lists/lists_column_view.hpp b/cpp/include/cudf/lists/lists_column_view.hpp
index 3397cb0ca1d..b117a871b64 100644
--- a/cpp/include/cudf/lists/lists_column_view.hpp
+++ b/cpp/include/cudf/lists/lists_column_view.hpp
@@ -17,6 +17,7 @@
 
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_view.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 
@@ -25,7 +26,7 @@
  * @brief Class definition for cudf::lists_column_view
  */
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 
 /**
  * @addtogroup lists_classes
@@ -137,4 +138,4 @@ class lists_column_view : private column_view {
   }
 };
 /** @} */  // end of group
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/lists/reverse.hpp b/cpp/include/cudf/lists/reverse.hpp
index 34c40c5a3ba..f00e6e5117a 100644
--- a/cpp/include/cudf/lists/reverse.hpp
+++ b/cpp/include/cudf/lists/reverse.hpp
@@ -17,13 +17,15 @@
 
 #include <cudf/column/column.hpp>
 #include <cudf/lists/lists_column_view.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
 #include <memory>
 
-namespace cudf::lists {
+namespace CUDF_EXPORT cudf {
+namespace lists {
 /**
  * @addtogroup lists_modify
  * @{
@@ -54,4 +56,5 @@ std::unique_ptr<column> reverse(
 
 /** @} */  // end of doxygen group
 
-}  // namespace cudf::lists
+}  // namespace lists
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/lists/set_operations.hpp b/cpp/include/cudf/lists/set_operations.hpp
index 871e66b2d83..55b1591fc44 100644
--- a/cpp/include/cudf/lists/set_operations.hpp
+++ b/cpp/include/cudf/lists/set_operations.hpp
@@ -23,7 +23,8 @@
 #include <rmm/mr/device/device_memory_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf::lists {
+namespace CUDF_EXPORT cudf {
+namespace lists {
 /**
  * @addtogroup set_operations
  * @{
@@ -177,4 +178,5 @@ std::unique_ptr<column> difference_distinct(
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
-}  // namespace cudf::lists
+}  // namespace lists
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/lists/sorting.hpp b/cpp/include/cudf/lists/sorting.hpp
index 78cea191bc5..39c71f6e9fa 100644
--- a/cpp/include/cudf/lists/sorting.hpp
+++ b/cpp/include/cudf/lists/sorting.hpp
@@ -18,11 +18,12 @@
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_view.hpp>
 #include <cudf/lists/lists_column_view.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace lists {
 /**
  * @addtogroup lists_sort
@@ -74,4 +75,4 @@ std::unique_ptr<column> stable_sort_lists(
 
 /** @} */  // end of group
 }  // namespace lists
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/lists/stream_compaction.hpp b/cpp/include/cudf/lists/stream_compaction.hpp
index 31f09d37560..28ef13cd870 100644
--- a/cpp/include/cudf/lists/stream_compaction.hpp
+++ b/cpp/include/cudf/lists/stream_compaction.hpp
@@ -17,12 +17,14 @@
 
 #include <cudf/column/column.hpp>
 #include <cudf/lists/lists_column_view.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/mr/device/device_memory_resource.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf::lists {
+namespace CUDF_EXPORT cudf {
+namespace lists {
 
 /**
  * @addtogroup lists_filtering
@@ -94,4 +96,5 @@ std::unique_ptr<column> distinct(
 
 /** @} */  // end of group
 
-}  // namespace cudf::lists
+}  // namespace lists
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/merge.hpp b/cpp/include/cudf/merge.hpp
index 301e56c19b8..83c6ff04500 100644
--- a/cpp/include/cudf/merge.hpp
+++ b/cpp/include/cudf/merge.hpp
@@ -17,6 +17,7 @@
 #pragma once
 
 #include <cudf/types.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
@@ -24,7 +25,7 @@
 #include <memory>
 #include <vector>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 /**
  * @addtogroup column_merge
  * @{
@@ -110,4 +111,4 @@ std::unique_ptr<cudf::table> merge(
   rmm::cuda_stream_view stream                         = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr                    = rmm::mr::get_current_device_resource());
 /** @} */  // end of group
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/null_mask.hpp b/cpp/include/cudf/null_mask.hpp
index 9e375df140b..70ca6aa29c5 100644
--- a/cpp/include/cudf/null_mask.hpp
+++ b/cpp/include/cudf/null_mask.hpp
@@ -17,6 +17,7 @@
 
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/export.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/device_buffer.hpp>
@@ -25,7 +26,7 @@
 
 #include <vector>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 
 /**
  * @addtogroup column_nullmask
@@ -208,4 +209,4 @@ cudf::size_type null_count(bitmask_type const* bitmask,
                            size_type stop,
                            rmm::cuda_stream_view stream = cudf::get_default_stream());
 /** @} */  // end of group
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/partitioning.hpp b/cpp/include/cudf/partitioning.hpp
index 9ed56297908..6a53553063e 100644
--- a/cpp/include/cudf/partitioning.hpp
+++ b/cpp/include/cudf/partitioning.hpp
@@ -18,6 +18,7 @@
 
 #include <cudf/hashing.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
@@ -26,7 +27,7 @@
 #include <memory>
 #include <vector>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 /**
  * @addtogroup reorder_partition
  * @{
@@ -254,4 +255,4 @@ std::pair<std::unique_ptr<cudf::table>, std::vector<cudf::size_type>> round_robi
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/quantiles.hpp b/cpp/include/cudf/quantiles.hpp
index a1c98ee4e9d..47eac2e72f9 100644
--- a/cpp/include/cudf/quantiles.hpp
+++ b/cpp/include/cudf/quantiles.hpp
@@ -20,11 +20,12 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/tdigest/tdigest_column_view.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 /**
  * @addtogroup column_quantiles
  * @{
@@ -129,4 +130,4 @@ std::unique_ptr<column> percentile_approx(
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/reduction.hpp b/cpp/include/cudf/reduction.hpp
index 52f39925a2d..e42ff5df15d 100644
--- a/cpp/include/cudf/reduction.hpp
+++ b/cpp/include/cudf/reduction.hpp
@@ -18,13 +18,14 @@
 
 #include <cudf/aggregation.hpp>
 #include <cudf/scalar/scalar.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
 #include <optional>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 /**
  * @addtogroup aggregation_reduction
  * @{
@@ -232,4 +233,4 @@ std::pair<std::unique_ptr<scalar>, std::unique_ptr<scalar>> minmax(
 
 /** @} */  // end of group
 
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/reduction/detail/histogram.hpp b/cpp/include/cudf/reduction/detail/histogram.hpp
index f23c5a14e33..5b17df47ec7 100644
--- a/cpp/include/cudf/reduction/detail/histogram.hpp
+++ b/cpp/include/cudf/reduction/detail/histogram.hpp
@@ -19,6 +19,7 @@
 #include <cudf/column/column_view.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/table/table_view.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
@@ -27,7 +28,8 @@
 #include <memory>
 #include <optional>
 
-namespace cudf::reduction::detail {
+namespace CUDF_EXPORT cudf {
+namespace reduction::detail {
 
 /**
  * @brief Compute the frequency for each distinct row in the input table.
@@ -55,4 +57,5 @@ compute_row_frequencies(table_view const& input,
  */
 [[nodiscard]] std::unique_ptr<column> make_empty_histogram_like(column_view const& values);
 
-}  // namespace cudf::reduction::detail
+}  // namespace reduction::detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/reduction/detail/reduction.hpp b/cpp/include/cudf/reduction/detail/reduction.hpp
index 78f90a1e2c9..a15783fb460 100644
--- a/cpp/include/cudf/reduction/detail/reduction.hpp
+++ b/cpp/include/cudf/reduction/detail/reduction.hpp
@@ -19,12 +19,14 @@
 #include <cudf/aggregation.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/resource_ref.hpp>
 
 #include <optional>
 
-namespace cudf::reduction::detail {
+namespace CUDF_EXPORT cudf {
+namespace reduction::detail {
 
 /**
  * @copydoc cudf::reduce(column_view const&, reduce_aggregation const&, data_type,
@@ -39,4 +41,5 @@ std::unique_ptr<scalar> reduce(column_view const& col,
                                rmm::cuda_stream_view stream,
                                rmm::device_async_resource_ref mr);
 
-}  // namespace cudf::reduction::detail
+}  // namespace reduction::detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/reduction/detail/reduction_functions.hpp b/cpp/include/cudf/reduction/detail/reduction_functions.hpp
index 31d465619b9..fa21dc87e64 100644
--- a/cpp/include/cudf/reduction/detail/reduction_functions.hpp
+++ b/cpp/include/cudf/reduction/detail/reduction_functions.hpp
@@ -20,15 +20,15 @@
 #include <cudf/lists/lists_column_view.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
 
 #include <optional>
 
-namespace cudf {
-namespace reduction {
-namespace detail {
+namespace CUDF_EXPORT cudf {
+namespace reduction::detail {
 /**
  * @brief Computes sum of elements in input column
  *
@@ -352,6 +352,5 @@ std::unique_ptr<scalar> merge_sets(lists_column_view const& col,
                                    rmm::cuda_stream_view stream,
                                    rmm::device_async_resource_ref mr);
 
-}  // namespace detail
-}  // namespace reduction
-}  // namespace cudf
+}  // namespace reduction::detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/reduction/detail/segmented_reduction_functions.hpp b/cpp/include/cudf/reduction/detail/segmented_reduction_functions.hpp
index 770ac6580ef..1c55b387454 100644
--- a/cpp/include/cudf/reduction/detail/segmented_reduction_functions.hpp
+++ b/cpp/include/cudf/reduction/detail/segmented_reduction_functions.hpp
@@ -20,15 +20,15 @@
 #include <cudf/column/column_view.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
 
 #include <optional>
 
-namespace cudf {
-namespace reduction {
-namespace detail {
+namespace CUDF_EXPORT cudf {
+namespace reduction::detail {
 
 /**
  * @brief Compute sum of each segment in the input column
@@ -354,6 +354,5 @@ std::unique_ptr<column> segmented_nunique(column_view const& col,
                                           rmm::cuda_stream_view stream,
                                           rmm::device_async_resource_ref mr);
 
-}  // namespace detail
-}  // namespace reduction
-}  // namespace cudf
+}  // namespace reduction::detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/replace.hpp b/cpp/include/cudf/replace.hpp
index ae20e72f023..43aabd6c6c6 100644
--- a/cpp/include/cudf/replace.hpp
+++ b/cpp/include/cudf/replace.hpp
@@ -18,13 +18,14 @@
 
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
 #include <memory>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 /**
  * @addtogroup transformation_replace
  * @{
@@ -308,4 +309,4 @@ void normalize_nans_and_zeros(mutable_column_view& in_out,
                               rmm::cuda_stream_view stream = cudf::get_default_stream());
 
 /** @} */  // end of group
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/reshape.hpp b/cpp/include/cudf/reshape.hpp
index 26316be7fd4..a0a7fe694bb 100644
--- a/cpp/include/cudf/reshape.hpp
+++ b/cpp/include/cudf/reshape.hpp
@@ -19,13 +19,14 @@
 #include <cudf/column/column.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
 #include <memory>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 /**
  * @addtogroup column_reshape
  * @{
@@ -105,4 +106,4 @@ std::unique_ptr<column> byte_cast(
 
 /** @} */  // end of group
 
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/rolling.hpp b/cpp/include/cudf/rolling.hpp
index d55322dd3e8..5a8c454d8fc 100644
--- a/cpp/include/cudf/rolling.hpp
+++ b/cpp/include/cudf/rolling.hpp
@@ -18,13 +18,14 @@
 
 #include <cudf/rolling/range_window_bounds.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
 #include <memory>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 /**
  * @addtogroup aggregation_rolling
  * @{
@@ -615,4 +616,4 @@ std::unique_ptr<column> rolling_window(
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/rolling/range_window_bounds.hpp b/cpp/include/cudf/rolling/range_window_bounds.hpp
index a9ee12cea27..21be609cbe6 100644
--- a/cpp/include/cudf/rolling/range_window_bounds.hpp
+++ b/cpp/include/cudf/rolling/range_window_bounds.hpp
@@ -17,8 +17,9 @@
 #pragma once
 
 #include <cudf/scalar/scalar.hpp>
+#include <cudf/utilities/export.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 /**
  * @addtogroup aggregation_rolling
  * @{
@@ -119,4 +120,4 @@ struct range_window_bounds {
 };
 
 /** @} */  // end of group
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/round.hpp b/cpp/include/cudf/round.hpp
index 85935f8f05c..ef144b328f7 100644
--- a/cpp/include/cudf/round.hpp
+++ b/cpp/include/cudf/round.hpp
@@ -17,11 +17,12 @@
 #pragma once
 
 #include <cudf/column/column.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 
 /**
  * @addtogroup transformation_unaryops
@@ -78,4 +79,4 @@ std::unique_ptr<column> round(
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/scalar/scalar.hpp b/cpp/include/cudf/scalar/scalar.hpp
index d78907b473a..2c5cc60fc70 100644
--- a/cpp/include/cudf/scalar/scalar.hpp
+++ b/cpp/include/cudf/scalar/scalar.hpp
@@ -32,7 +32,7 @@
  * @brief Class definitions for cudf::scalar
  */
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 /**
  * @addtogroup scalar_classes
  * @{
@@ -894,4 +894,4 @@ class struct_scalar : public scalar {
 };
 
 /** @} */  // end of group
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/scalar/scalar_device_view.cuh b/cpp/include/cudf/scalar/scalar_device_view.cuh
index 846da0bbe10..cbd3e9175ac 100644
--- a/cpp/include/cudf/scalar/scalar_device_view.cuh
+++ b/cpp/include/cudf/scalar/scalar_device_view.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,7 +24,7 @@
  * @brief Scalar device view class definitions
  */
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace detail {
 /**
  * @brief A non-owning view of scalar from device that is trivially copyable
@@ -440,4 +440,4 @@ auto get_scalar_device_view(fixed_point_scalar<T>& s)
   return fixed_point_scalar_device_view<T>(s.type(), s.data(), s.validity_data());
 }
 
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/scalar/scalar_factories.hpp b/cpp/include/cudf/scalar/scalar_factories.hpp
index 7dd4674a2fd..a422c3bfbe9 100644
--- a/cpp/include/cudf/scalar/scalar_factories.hpp
+++ b/cpp/include/cudf/scalar/scalar_factories.hpp
@@ -22,7 +22,7 @@
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 /**
  * @addtogroup scalar_factories
  * @{
@@ -227,4 +227,4 @@ std::unique_ptr<scalar> make_struct_scalar(
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/search.hpp b/cpp/include/cudf/search.hpp
index 2e50ba2d687..ad170ec726b 100644
--- a/cpp/include/cudf/search.hpp
+++ b/cpp/include/cudf/search.hpp
@@ -20,13 +20,14 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
 #include <vector>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 /**
  * @addtogroup column_search
  * @{
@@ -168,4 +169,4 @@ std::unique_ptr<column> contains(
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/sorting.hpp b/cpp/include/cudf/sorting.hpp
index 79a00cbce42..4cb265a2a0b 100644
--- a/cpp/include/cudf/sorting.hpp
+++ b/cpp/include/cudf/sorting.hpp
@@ -19,6 +19,7 @@
 #include <cudf/aggregation.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
@@ -26,7 +27,7 @@
 #include <memory>
 #include <vector>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 
 /**
  * @addtogroup column_sort
@@ -346,4 +347,4 @@ std::unique_ptr<table> stable_segmented_sort_by_key(
   rmm::device_async_resource_ref mr              = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/stream_compaction.hpp b/cpp/include/cudf/stream_compaction.hpp
index 181af11adb8..cfe404ff6ab 100644
--- a/cpp/include/cudf/stream_compaction.hpp
+++ b/cpp/include/cudf/stream_compaction.hpp
@@ -18,6 +18,7 @@
 
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
@@ -25,7 +26,7 @@
 #include <memory>
 #include <vector>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 /**
  * @addtogroup reorder_compact
  * @{
@@ -401,4 +402,4 @@ cudf::size_type distinct_count(table_view const& input,
                                null_equality nulls_equal = null_equality::EQUAL);
 
 /** @} */
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/strings/attributes.hpp b/cpp/include/cudf/strings/attributes.hpp
index 26f906b3102..323290e907c 100644
--- a/cpp/include/cudf/strings/attributes.hpp
+++ b/cpp/include/cudf/strings/attributes.hpp
@@ -21,7 +21,7 @@
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 
 //! Strings column APIs
 namespace strings {
@@ -91,4 +91,4 @@ std::unique_ptr<column> code_points(
 /** @} */  // end of strings_apis group
 
 }  // namespace strings
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/strings/capitalize.hpp b/cpp/include/cudf/strings/capitalize.hpp
index f8cbdc09748..420b46a05b2 100644
--- a/cpp/include/cudf/strings/capitalize.hpp
+++ b/cpp/include/cudf/strings/capitalize.hpp
@@ -23,7 +23,7 @@
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace strings {
 /**
  * @addtogroup strings_case
@@ -129,4 +129,4 @@ std::unique_ptr<column> is_title(
 
 /** @} */  // end of doxygen group
 }  // namespace strings
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/strings/case.hpp b/cpp/include/cudf/strings/case.hpp
index 5403fa8db7e..45f56a681a6 100644
--- a/cpp/include/cudf/strings/case.hpp
+++ b/cpp/include/cudf/strings/case.hpp
@@ -21,7 +21,7 @@
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace strings {
 /**
  * @addtogroup strings_case
@@ -89,4 +89,4 @@ std::unique_ptr<column> swapcase(
 
 /** @} */  // end of doxygen group
 }  // namespace strings
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/strings/char_types/char_cases.hpp b/cpp/include/cudf/strings/char_types/char_cases.hpp
index 9eb63f71a2f..e5e619b8a50 100644
--- a/cpp/include/cudf/strings/char_types/char_cases.hpp
+++ b/cpp/include/cudf/strings/char_types/char_cases.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,7 +15,9 @@
  */
 #pragma once
 
-namespace cudf {
+#include <cudf/utilities/export.hpp>
+
+namespace CUDF_EXPORT cudf {
 namespace strings {
 namespace detail {
 /**
@@ -31,4 +33,4 @@ void generate_special_mapping_hash_table();
 
 }  // namespace detail
 }  // namespace strings
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/strings/char_types/char_types.hpp b/cpp/include/cudf/strings/char_types/char_types.hpp
index da7a238a400..a6af681eec6 100644
--- a/cpp/include/cudf/strings/char_types/char_types.hpp
+++ b/cpp/include/cudf/strings/char_types/char_types.hpp
@@ -23,7 +23,7 @@
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace strings {
 /**
  * @addtogroup strings_types
@@ -119,4 +119,4 @@ std::unique_ptr<column> filter_characters_of_type(
 
 /** @} */  // end of doxygen group
 }  // namespace strings
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/strings/char_types/char_types_enum.hpp b/cpp/include/cudf/strings/char_types/char_types_enum.hpp
index 8d248cb2ebf..a9142fdbda6 100644
--- a/cpp/include/cudf/strings/char_types/char_types_enum.hpp
+++ b/cpp/include/cudf/strings/char_types/char_types_enum.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,7 +18,7 @@
 #include <cstdint>
 #include <type_traits>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace strings {
 /**
  * @addtogroup strings_types
@@ -80,4 +80,4 @@ constexpr string_character_types& operator|=(string_character_types& lhs,
 
 /** @} */  // end of doxygen group
 }  // namespace strings
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/strings/combine.hpp b/cpp/include/cudf/strings/combine.hpp
index 8cc735831b8..2cade813d78 100644
--- a/cpp/include/cudf/strings/combine.hpp
+++ b/cpp/include/cudf/strings/combine.hpp
@@ -24,7 +24,7 @@
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace strings {
 /**
  * @addtogroup strings_combine
@@ -334,4 +334,4 @@ std::unique_ptr<column> join_list_elements(
 
 /** @} */  // end of doxygen group
 }  // namespace strings
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/strings/contains.hpp b/cpp/include/cudf/strings/contains.hpp
index f79a0f19e9c..59c9b2dea40 100644
--- a/cpp/include/cudf/strings/contains.hpp
+++ b/cpp/include/cudf/strings/contains.hpp
@@ -23,7 +23,7 @@
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace strings {
 
 struct regex_program;
@@ -209,4 +209,4 @@ std::unique_ptr<column> like(
 
 /** @} */  // end of doxygen group
 }  // namespace strings
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/strings/convert/convert_booleans.hpp b/cpp/include/cudf/strings/convert/convert_booleans.hpp
index 9c922361914..d79dd4a80ea 100644
--- a/cpp/include/cudf/strings/convert/convert_booleans.hpp
+++ b/cpp/include/cudf/strings/convert/convert_booleans.hpp
@@ -22,7 +22,7 @@
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace strings {
 /**
  * @addtogroup strings_convert
@@ -72,4 +72,4 @@ std::unique_ptr<column> from_booleans(
 
 /** @} */  // end of doxygen group
 }  // namespace strings
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/strings/convert/convert_datetime.hpp b/cpp/include/cudf/strings/convert/convert_datetime.hpp
index b89384d718b..c3b3c91ab35 100644
--- a/cpp/include/cudf/strings/convert/convert_datetime.hpp
+++ b/cpp/include/cudf/strings/convert/convert_datetime.hpp
@@ -24,7 +24,7 @@
 #include <string>
 #include <vector>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace strings {
 /**
  * @addtogroup strings_convert
@@ -255,4 +255,4 @@ std::unique_ptr<column> from_timestamps(
 
 /** @} */  // end of doxygen group
 }  // namespace strings
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/strings/convert/convert_durations.hpp b/cpp/include/cudf/strings/convert/convert_durations.hpp
index 2db719a4f1f..8b69968a609 100644
--- a/cpp/include/cudf/strings/convert/convert_durations.hpp
+++ b/cpp/include/cudf/strings/convert/convert_durations.hpp
@@ -21,7 +21,7 @@
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace strings {
 /**
  * @addtogroup strings_convert
@@ -133,4 +133,4 @@ std::unique_ptr<column> from_durations(
 
 /** @} */  // end of doxygen group
 }  // namespace strings
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/strings/convert/convert_fixed_point.hpp b/cpp/include/cudf/strings/convert/convert_fixed_point.hpp
index 9911bea1948..a9c5aea6343 100644
--- a/cpp/include/cudf/strings/convert/convert_fixed_point.hpp
+++ b/cpp/include/cudf/strings/convert/convert_fixed_point.hpp
@@ -21,7 +21,7 @@
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace strings {
 /**
  * @addtogroup strings_convert
@@ -130,4 +130,4 @@ std::unique_ptr<column> is_fixed_point(
 
 /** @} */  // end of doxygen group
 }  // namespace strings
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/strings/convert/convert_floats.hpp b/cpp/include/cudf/strings/convert/convert_floats.hpp
index feb5b528686..64e9bb776f4 100644
--- a/cpp/include/cudf/strings/convert/convert_floats.hpp
+++ b/cpp/include/cudf/strings/convert/convert_floats.hpp
@@ -21,7 +21,7 @@
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace strings {
 /**
  * @addtogroup strings_convert
@@ -103,4 +103,4 @@ std::unique_ptr<column> is_float(
 
 /** @} */  // end of doxygen group
 }  // namespace strings
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/strings/convert/convert_integers.hpp b/cpp/include/cudf/strings/convert/convert_integers.hpp
index 82696811fdc..62eb1fdda4d 100644
--- a/cpp/include/cudf/strings/convert/convert_integers.hpp
+++ b/cpp/include/cudf/strings/convert/convert_integers.hpp
@@ -21,7 +21,7 @@
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace strings {
 /**
  * @addtogroup strings_convert
@@ -235,4 +235,4 @@ std::unique_ptr<column> integers_to_hex(
 
 /** @} */  // end of doxygen group
 }  // namespace strings
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/strings/convert/convert_ipv4.hpp b/cpp/include/cudf/strings/convert/convert_ipv4.hpp
index 64f8a412ce9..04a04907c12 100644
--- a/cpp/include/cudf/strings/convert/convert_ipv4.hpp
+++ b/cpp/include/cudf/strings/convert/convert_ipv4.hpp
@@ -21,7 +21,7 @@
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace strings {
 /**
  * @addtogroup strings_convert
@@ -113,4 +113,4 @@ std::unique_ptr<column> is_ipv4(
 
 /** @} */  // end of doxygen group
 }  // namespace strings
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/strings/convert/convert_lists.hpp b/cpp/include/cudf/strings/convert/convert_lists.hpp
index a88bbe99492..85b67907228 100644
--- a/cpp/include/cudf/strings/convert/convert_lists.hpp
+++ b/cpp/include/cudf/strings/convert/convert_lists.hpp
@@ -23,7 +23,7 @@
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace strings {
 /**
  * @addtogroup strings_convert
@@ -68,4 +68,4 @@ std::unique_ptr<column> format_list_column(
 
 /** @} */  // end of doxygen group
 }  // namespace strings
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/strings/convert/convert_urls.hpp b/cpp/include/cudf/strings/convert/convert_urls.hpp
index 30988d2ff0a..a42a5cd2407 100644
--- a/cpp/include/cudf/strings/convert/convert_urls.hpp
+++ b/cpp/include/cudf/strings/convert/convert_urls.hpp
@@ -21,7 +21,7 @@
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace strings {
 /**
  * @addtogroup strings_convert
@@ -75,4 +75,4 @@ std::unique_ptr<column> url_decode(
 
 /** @} */  // end of doxygen group
 }  // namespace strings
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/strings/detail/char_tables.hpp b/cpp/include/cudf/strings/detail/char_tables.hpp
index 0901076c835..5d6aff28826 100644
--- a/cpp/include/cudf/strings/detail/char_tables.hpp
+++ b/cpp/include/cudf/strings/detail/char_tables.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,11 +15,12 @@
  */
 #pragma once
 
+#include <cudf/utilities/export.hpp>
+
 #include <cstdint>
 
-namespace cudf {
-namespace strings {
-namespace detail {
+namespace CUDF_EXPORT cudf {
+namespace strings::detail {
 // Type for the character flags table.
 using character_flags_table_type = std::uint8_t;
 
@@ -101,6 +102,5 @@ constexpr uint16_t get_special_case_hash_index(uint32_t code_point)
   return static_cast<uint16_t>(code_point % special_case_prime);
 }
 
-}  // namespace detail
-}  // namespace strings
-}  // namespace cudf
+}  // namespace strings::detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/strings/detail/combine.hpp b/cpp/include/cudf/strings/detail/combine.hpp
index 25214055787..962191eae6a 100644
--- a/cpp/include/cudf/strings/detail/combine.hpp
+++ b/cpp/include/cudf/strings/detail/combine.hpp
@@ -21,13 +21,13 @@
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
-namespace strings {
-namespace detail {
+namespace CUDF_EXPORT cudf {
+namespace strings::detail {
 
 /**
  * @copydoc concatenate(table_view const&,string_scalar const&,string_scalar
@@ -68,6 +68,5 @@ std::unique_ptr<column> join_list_elements(lists_column_view const& lists_string
                                            rmm::cuda_stream_view stream,
                                            rmm::device_async_resource_ref mr);
 
-}  // namespace detail
-}  // namespace strings
-}  // namespace cudf
+}  // namespace strings::detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/strings/detail/concatenate.hpp b/cpp/include/cudf/strings/detail/concatenate.hpp
index b5dd5b9516a..e038102ab1f 100644
--- a/cpp/include/cudf/strings/detail/concatenate.hpp
+++ b/cpp/include/cudf/strings/detail/concatenate.hpp
@@ -19,14 +19,14 @@
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/export.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
-namespace strings {
-namespace detail {
+namespace CUDF_EXPORT cudf {
+namespace strings::detail {
 /**
  * @brief Returns a single column by vertically concatenating the given vector of
  * strings columns.
@@ -47,6 +47,5 @@ std::unique_ptr<column> concatenate(host_span<column_view const> columns,
                                     rmm::cuda_stream_view stream,
                                     rmm::device_async_resource_ref mr);
 
-}  // namespace detail
-}  // namespace strings
-}  // namespace cudf
+}  // namespace strings::detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/strings/detail/converters.hpp b/cpp/include/cudf/strings/detail/converters.hpp
index d212239264b..73a97499293 100644
--- a/cpp/include/cudf/strings/detail/converters.hpp
+++ b/cpp/include/cudf/strings/detail/converters.hpp
@@ -18,13 +18,13 @@
 #include <cudf/column/column.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/strings_column_view.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
-namespace strings {
-namespace detail {
+namespace CUDF_EXPORT cudf {
+namespace strings::detail {
 
 /**
  * @copydoc to_integers(strings_column_view const&,data_type,rmm::device_async_resource_ref)
@@ -153,6 +153,5 @@ std::unique_ptr<column> from_fixed_point(column_view const& integers,
                                          rmm::cuda_stream_view stream,
                                          rmm::device_async_resource_ref mr);
 
-}  // namespace detail
-}  // namespace strings
-}  // namespace cudf
+}  // namespace strings::detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/strings/detail/copy_range.hpp b/cpp/include/cudf/strings/detail/copy_range.hpp
index 192c5b833c6..71dcf9edaf3 100644
--- a/cpp/include/cudf/strings/detail/copy_range.hpp
+++ b/cpp/include/cudf/strings/detail/copy_range.hpp
@@ -21,9 +21,8 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
-namespace strings {
-namespace detail {
+namespace CUDF_EXPORT cudf {
+namespace strings::detail {
 
 /**
  * @brief Internal API to copy a range of string elements out-of-place from
@@ -56,6 +55,5 @@ std::unique_ptr<column> copy_range(strings_column_view const& source,
                                    rmm::cuda_stream_view stream,
                                    rmm::device_async_resource_ref mr);
 
-}  // namespace detail
-}  // namespace strings
-}  // namespace cudf
+}  // namespace strings::detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/strings/detail/copying.hpp b/cpp/include/cudf/strings/detail/copying.hpp
index 240cac17188..b4d3362359d 100644
--- a/cpp/include/cudf/strings/detail/copying.hpp
+++ b/cpp/include/cudf/strings/detail/copying.hpp
@@ -19,13 +19,13 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
-namespace strings {
-namespace detail {
+namespace CUDF_EXPORT cudf {
+namespace strings::detail {
 /**
  * @brief Returns a new strings column created from a subset of
  * of the strings column.
@@ -83,6 +83,5 @@ std::unique_ptr<column> shift(strings_column_view const& input,
                               rmm::cuda_stream_view stream,
                               rmm::device_async_resource_ref mr);
 
-}  // namespace detail
-}  // namespace strings
-}  // namespace cudf
+}  // namespace strings::detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/strings/detail/fill.hpp b/cpp/include/cudf/strings/detail/fill.hpp
index c5d005fbf75..1a3ff2c9166 100644
--- a/cpp/include/cudf/strings/detail/fill.hpp
+++ b/cpp/include/cudf/strings/detail/fill.hpp
@@ -19,13 +19,13 @@
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
-namespace strings {
-namespace detail {
+namespace CUDF_EXPORT cudf {
+namespace strings::detail {
 /**
  * @brief Returns a strings column replacing a range of rows
  * with the specified string.
@@ -50,6 +50,5 @@ std::unique_ptr<column> fill(strings_column_view const& strings,
                              rmm::cuda_stream_view stream,
                              rmm::device_async_resource_ref mr);
 
-}  // namespace detail
-}  // namespace strings
-}  // namespace cudf
+}  // namespace strings::detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/strings/detail/merge.hpp b/cpp/include/cudf/strings/detail/merge.hpp
index 35fd9c0593d..0aa5c0c2899 100644
--- a/cpp/include/cudf/strings/detail/merge.hpp
+++ b/cpp/include/cudf/strings/detail/merge.hpp
@@ -18,10 +18,12 @@
 #include <cudf/column/column.hpp>
 #include <cudf/detail/merge.hpp>
 #include <cudf/strings/strings_column_view.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 
-namespace cudf ::strings ::detail {
+namespace CUDF_EXPORT cudf {
+namespace strings::detail {
 /**
  * @brief Merges two strings columns
  *
@@ -38,4 +40,5 @@ std::unique_ptr<column> merge(strings_column_view const& lhs,
                               rmm::cuda_stream_view stream,
                               rmm::device_async_resource_ref mr);
 
-}  // namespace cudf::strings::detail
+}  // namespace strings::detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/strings/detail/replace.hpp b/cpp/include/cudf/strings/detail/replace.hpp
index 481d00f1bce..ab092555c48 100644
--- a/cpp/include/cudf/strings/detail/replace.hpp
+++ b/cpp/include/cudf/strings/detail/replace.hpp
@@ -19,13 +19,13 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
-namespace strings {
-namespace detail {
+namespace CUDF_EXPORT cudf {
+namespace strings::detail {
 
 /**
  * @copydoc cudf::strings::replace(strings_column_view const&, string_scalar const&,
@@ -100,6 +100,5 @@ std::unique_ptr<cudf::column> find_and_replace_all(
   rmm::cuda_stream_view stream,
   rmm::device_async_resource_ref mr);
 
-}  // namespace detail
-}  // namespace strings
-}  // namespace cudf
+}  // namespace strings::detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/strings/detail/scan.hpp b/cpp/include/cudf/strings/detail/scan.hpp
index f32afa64a72..4991fd633d5 100644
--- a/cpp/include/cudf/strings/detail/scan.hpp
+++ b/cpp/include/cudf/strings/detail/scan.hpp
@@ -21,9 +21,8 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
-namespace strings {
-namespace detail {
+namespace CUDF_EXPORT cudf {
+namespace strings::detail {
 /**
  * @brief Scan function for strings
  *
@@ -43,6 +42,5 @@ std::unique_ptr<column> scan_inclusive(column_view const& input,
                                        rmm::cuda_stream_view stream,
                                        rmm::device_async_resource_ref mr);
 
-}  // namespace detail
-}  // namespace strings
-}  // namespace cudf
+}  // namespace strings::detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/strings/detail/utf8.hpp b/cpp/include/cudf/strings/detail/utf8.hpp
index 5587597cb51..85349a421b1 100644
--- a/cpp/include/cudf/strings/detail/utf8.hpp
+++ b/cpp/include/cudf/strings/detail/utf8.hpp
@@ -22,9 +22,8 @@
  * @brief Standalone string functions.
  */
 
-namespace cudf {
-namespace strings {
-namespace detail {
+namespace CUDF_EXPORT cudf {
+namespace strings::detail {
 
 /**
  * @brief This will return true if passed a continuation byte of a UTF-8 character.
@@ -206,6 +205,5 @@ constexpr cudf::char_utf8 codepoint_to_utf8(uint32_t unchr)
   return utf8;
 }
 
-}  // namespace detail
-}  // namespace strings
-}  // namespace cudf
+}  // namespace strings::detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/strings/detail/utilities.hpp b/cpp/include/cudf/strings/detail/utilities.hpp
index 4467a9d0023..1fa505501d8 100644
--- a/cpp/include/cudf/strings/detail/utilities.hpp
+++ b/cpp/include/cudf/strings/detail/utilities.hpp
@@ -18,15 +18,15 @@
 #include <cudf/column/column.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/export.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
-namespace strings {
-namespace detail {
+namespace CUDF_EXPORT cudf {
+namespace strings::detail {
 
 /**
  * @brief Create an offsets column to be a child of a strings column
@@ -96,6 +96,5 @@ int64_t get_offset_value(cudf::column_view const& offsets,
                          size_type index,
                          rmm::cuda_stream_view stream);
 
-}  // namespace detail
-}  // namespace strings
-}  // namespace cudf
+}  // namespace strings::detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/strings/extract.hpp b/cpp/include/cudf/strings/extract.hpp
index 4138e1e59d5..2ef7308b802 100644
--- a/cpp/include/cudf/strings/extract.hpp
+++ b/cpp/include/cudf/strings/extract.hpp
@@ -22,7 +22,7 @@
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace strings {
 
 struct regex_program;
@@ -104,4 +104,4 @@ std::unique_ptr<column> extract_all_record(
 
 /** @} */  // end of doxygen group
 }  // namespace strings
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/strings/find.hpp b/cpp/include/cudf/strings/find.hpp
index c116dbc2fe1..efba6da9454 100644
--- a/cpp/include/cudf/strings/find.hpp
+++ b/cpp/include/cudf/strings/find.hpp
@@ -22,7 +22,7 @@
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace strings {
 /**
  * @addtogroup strings_find
@@ -262,4 +262,4 @@ std::unique_ptr<column> ends_with(
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 /** @} */  // end of doxygen group
 }  // namespace strings
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/strings/find_multiple.hpp b/cpp/include/cudf/strings/find_multiple.hpp
index c2e82aa6f1a..dea08308ff0 100644
--- a/cpp/include/cudf/strings/find_multiple.hpp
+++ b/cpp/include/cudf/strings/find_multiple.hpp
@@ -21,7 +21,7 @@
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace strings {
 /**
  * @addtogroup strings_find
@@ -63,4 +63,4 @@ std::unique_ptr<column> find_multiple(
 
 /** @} */  // end of doxygen group
 }  // namespace strings
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/strings/findall.hpp b/cpp/include/cudf/strings/findall.hpp
index abc1d28ee4c..26249b6842c 100644
--- a/cpp/include/cudf/strings/findall.hpp
+++ b/cpp/include/cudf/strings/findall.hpp
@@ -22,7 +22,7 @@
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace strings {
 
 struct regex_program;
@@ -70,4 +70,4 @@ std::unique_ptr<column> findall(
 
 /** @} */  // end of doxygen group
 }  // namespace strings
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/strings/padding.hpp b/cpp/include/cudf/strings/padding.hpp
index f1382d6ea29..11e35f717ae 100644
--- a/cpp/include/cudf/strings/padding.hpp
+++ b/cpp/include/cudf/strings/padding.hpp
@@ -23,7 +23,7 @@
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace strings {
 /**
  * @addtogroup strings_modify
@@ -96,4 +96,4 @@ std::unique_ptr<column> zfill(
 
 /** @} */  // end of doxygen group
 }  // namespace strings
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/strings/regex/flags.hpp b/cpp/include/cudf/strings/regex/flags.hpp
index 44ca68439e7..f7108129dee 100644
--- a/cpp/include/cudf/strings/regex/flags.hpp
+++ b/cpp/include/cudf/strings/regex/flags.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,9 +15,11 @@
  */
 #pragma once
 
+#include <cudf/utilities/export.hpp>
+
 #include <cstdint>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace strings {
 
 /**
@@ -86,4 +88,4 @@ enum class capture_groups : uint32_t {
 
 /** @} */  // end of doxygen group
 }  // namespace strings
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/strings/regex/regex_program.hpp b/cpp/include/cudf/strings/regex/regex_program.hpp
index 95c86ae0f8a..9da859d9c87 100644
--- a/cpp/include/cudf/strings/regex/regex_program.hpp
+++ b/cpp/include/cudf/strings/regex/regex_program.hpp
@@ -21,7 +21,7 @@
 #include <memory>
 #include <string>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace strings {
 
 /**
@@ -135,4 +135,4 @@ struct regex_program {
 
 /** @} */  // end of doxygen group
 }  // namespace strings
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/strings/repeat_strings.hpp b/cpp/include/cudf/strings/repeat_strings.hpp
index cbf1edc8331..e160f75390b 100644
--- a/cpp/include/cudf/strings/repeat_strings.hpp
+++ b/cpp/include/cudf/strings/repeat_strings.hpp
@@ -21,7 +21,7 @@
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace strings {
 /**
  * @addtogroup strings_copy
@@ -133,4 +133,4 @@ std::unique_ptr<column> repeat_strings(
 
 /** @} */  // end of doxygen group
 }  // namespace strings
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/strings/replace.hpp b/cpp/include/cudf/strings/replace.hpp
index a714f762a19..5b4ffb98f99 100644
--- a/cpp/include/cudf/strings/replace.hpp
+++ b/cpp/include/cudf/strings/replace.hpp
@@ -22,7 +22,7 @@
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace strings {
 /**
  * @addtogroup strings_replace
@@ -174,4 +174,4 @@ std::unique_ptr<column> replace_multiple(
 
 /** @} */  // end of doxygen group
 }  // namespace strings
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/strings/replace_re.hpp b/cpp/include/cudf/strings/replace_re.hpp
index f61f9585144..6b487072cb2 100644
--- a/cpp/include/cudf/strings/replace_re.hpp
+++ b/cpp/include/cudf/strings/replace_re.hpp
@@ -25,7 +25,7 @@
 
 #include <optional>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace strings {
 
 struct regex_program;
@@ -112,4 +112,4 @@ std::unique_ptr<column> replace_with_backrefs(
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 }  // namespace strings
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/strings/reverse.hpp b/cpp/include/cudf/strings/reverse.hpp
index 86656693c8b..fbda2e5fe7c 100644
--- a/cpp/include/cudf/strings/reverse.hpp
+++ b/cpp/include/cudf/strings/reverse.hpp
@@ -21,7 +21,7 @@
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace strings {
 /**
  * @addtogroup strings_modify
@@ -53,4 +53,4 @@ std::unique_ptr<column> reverse(
 
 /** @} */  // end of doxygen group
 }  // namespace strings
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/strings/side_type.hpp b/cpp/include/cudf/strings/side_type.hpp
index 5905e087deb..5b794261ad9 100644
--- a/cpp/include/cudf/strings/side_type.hpp
+++ b/cpp/include/cudf/strings/side_type.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,7 +15,9 @@
  */
 #pragma once
 
-namespace cudf {
+#include <cudf/utilities/export.hpp>
+
+namespace CUDF_EXPORT cudf {
 namespace strings {
 /**
  * @addtogroup strings_modify
@@ -34,4 +36,4 @@ enum class side_type {
 
 /** @} */  // end of doxygen group
 }  // namespace strings
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/strings/slice.hpp b/cpp/include/cudf/strings/slice.hpp
index e2be6abd344..b0da6976207 100644
--- a/cpp/include/cudf/strings/slice.hpp
+++ b/cpp/include/cudf/strings/slice.hpp
@@ -22,7 +22,7 @@
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace strings {
 /**
  * @addtogroup strings_slice
@@ -114,4 +114,4 @@ std::unique_ptr<column> slice_strings(
 
 /** @} */  // end of doxygen group
 }  // namespace strings
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/strings/split/partition.hpp b/cpp/include/cudf/strings/split/partition.hpp
index 0a837034ba1..8f5ae752417 100644
--- a/cpp/include/cudf/strings/split/partition.hpp
+++ b/cpp/include/cudf/strings/split/partition.hpp
@@ -22,7 +22,7 @@
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace strings {
 /**
  * @addtogroup strings_split
@@ -101,4 +101,4 @@ std::unique_ptr<table> rpartition(
 
 /** @} */  // end of doxygen group
 }  // namespace strings
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/strings/split/split.hpp b/cpp/include/cudf/strings/split/split.hpp
index d5c44406ca7..ca371d7abd1 100644
--- a/cpp/include/cudf/strings/split/split.hpp
+++ b/cpp/include/cudf/strings/split/split.hpp
@@ -22,7 +22,7 @@
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace strings {
 /**
  * @addtogroup strings_split
@@ -245,4 +245,4 @@ std::unique_ptr<column> rsplit_record(
 
 /** @} */  // end of doxygen group
 }  // namespace strings
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/strings/split/split_re.hpp b/cpp/include/cudf/strings/split/split_re.hpp
index 81595fa7ed4..96ef0b6e830 100644
--- a/cpp/include/cudf/strings/split/split_re.hpp
+++ b/cpp/include/cudf/strings/split/split_re.hpp
@@ -22,7 +22,7 @@
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace strings {
 
 struct regex_program;
@@ -263,4 +263,4 @@ std::unique_ptr<column> rsplit_record_re(
 
 /** @} */  // end of doxygen group
 }  // namespace strings
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh
index 93cc787683b..abb26d7ccb4 100644
--- a/cpp/include/cudf/strings/string_view.cuh
+++ b/cpp/include/cudf/strings/string_view.cuh
@@ -18,6 +18,7 @@
 
 #include <cudf/strings/detail/utf8.hpp>
 #include <cudf/strings/string_view.hpp>
+#include <cudf/utilities/export.hpp>
 
 #ifndef __CUDA_ARCH__
 #include <cudf/utilities/error.hpp>
@@ -35,7 +36,7 @@
 // This file should only include device code logic.
 // Host-only or host/device code should be defined in the string_view.hpp header file.
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace strings {
 namespace detail {
 
@@ -448,4 +449,4 @@ __device__ inline size_type string_view::character_offset(size_type bytepos) con
   return strings::detail::characters_in_string(data(), bytepos);
 }
 
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/strings/string_view.hpp b/cpp/include/cudf/strings/string_view.hpp
index afc7e027a4b..504c31057ae 100644
--- a/cpp/include/cudf/strings/string_view.hpp
+++ b/cpp/include/cudf/strings/string_view.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -26,7 +26,7 @@
  * @brief Class definition for cudf::string_view.
  */
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 
 using char_utf8 = uint32_t;  ///< UTF-8 characters are 1-4 bytes
 
@@ -406,4 +406,4 @@ class string_view {
                                         size_type count) const;
 };
 
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/strings/strings_column_view.hpp b/cpp/include/cudf/strings/strings_column_view.hpp
index 1e9e73cef4c..4a2512eb7c5 100644
--- a/cpp/include/cudf/strings/strings_column_view.hpp
+++ b/cpp/include/cudf/strings/strings_column_view.hpp
@@ -17,13 +17,14 @@
 
 #include <cudf/column/column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/export.hpp>
 
 /**
  * @file
  * @brief Class definition for cudf::strings_column_view
  */
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 
 /**
  * @addtogroup strings_classes
@@ -126,4 +127,4 @@ namespace strings {
 }  // namespace strings
 
 /** @} */  // end of group
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/strings/strip.hpp b/cpp/include/cudf/strings/strip.hpp
index 6fb9bbc45e6..4cfba59c72c 100644
--- a/cpp/include/cudf/strings/strip.hpp
+++ b/cpp/include/cudf/strings/strip.hpp
@@ -23,7 +23,7 @@
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace strings {
 /**
  * @addtogroup strings_modify
@@ -71,4 +71,4 @@ std::unique_ptr<column> strip(
 
 /** @} */  // end of doxygen group
 }  // namespace strings
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/strings/translate.hpp b/cpp/include/cudf/strings/translate.hpp
index 9cd6b7d5974..531753f4a8c 100644
--- a/cpp/include/cudf/strings/translate.hpp
+++ b/cpp/include/cudf/strings/translate.hpp
@@ -25,7 +25,7 @@
 
 #include <vector>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace strings {
 /**
  * @addtogroup strings_modify
@@ -109,4 +109,4 @@ std::unique_ptr<column> filter_characters(
 
 /** @} */  // end of doxygen group
 }  // namespace strings
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/strings/wrap.hpp b/cpp/include/cudf/strings/wrap.hpp
index c05c33fbac8..465a9d15d00 100644
--- a/cpp/include/cudf/strings/wrap.hpp
+++ b/cpp/include/cudf/strings/wrap.hpp
@@ -21,7 +21,7 @@
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace strings {
 /**
  * @addtogroup strings_modify
@@ -72,4 +72,4 @@ std::unique_ptr<column> wrap(
 
 /** @} */  // end of doxygen group
 }  // namespace strings
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/structs/detail/concatenate.hpp b/cpp/include/cudf/structs/detail/concatenate.hpp
index 5dc3169c0c4..16be868af52 100644
--- a/cpp/include/cudf/structs/detail/concatenate.hpp
+++ b/cpp/include/cudf/structs/detail/concatenate.hpp
@@ -18,13 +18,13 @@
 #include <cudf/column/column.hpp>
 #include <cudf/structs/structs_column_view.hpp>
 #include <cudf/table/table_view.hpp>
+#include <cudf/utilities/export.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
-namespace structs {
-namespace detail {
+namespace CUDF_EXPORT cudf {
+namespace structs::detail {
 
 /**
  * @brief Returns a single column by concatenating the given vector of structs columns.
@@ -54,6 +54,5 @@ std::unique_ptr<column> concatenate(host_span<column_view const> columns,
                                     rmm::cuda_stream_view stream,
                                     rmm::device_async_resource_ref mr);
 
-}  // namespace detail
-}  // namespace structs
-}  // namespace cudf
+}  // namespace structs::detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/structs/detail/scan.hpp b/cpp/include/cudf/structs/detail/scan.hpp
index c97a8452ecd..6121f63d42f 100644
--- a/cpp/include/cudf/structs/detail/scan.hpp
+++ b/cpp/include/cudf/structs/detail/scan.hpp
@@ -17,13 +17,13 @@
 
 #include <cudf/column/column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
-namespace structs {
-namespace detail {
+namespace CUDF_EXPORT cudf {
+namespace structs::detail {
 /**
  * @brief Scan function for struct column type
  *
@@ -41,6 +41,5 @@ std::unique_ptr<column> scan_inclusive(column_view const& input,
                                        rmm::cuda_stream_view stream,
                                        rmm::device_async_resource_ref mr);
 
-}  // namespace detail
-}  // namespace structs
-}  // namespace cudf
+}  // namespace structs::detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/structs/struct_view.hpp b/cpp/include/cudf/structs/struct_view.hpp
index 75483709867..65fd3f78d1a 100644
--- a/cpp/include/cudf/structs/struct_view.hpp
+++ b/cpp/include/cudf/structs/struct_view.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,7 +20,7 @@
  * @brief Class definition for cudf::struct_view.
  */
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 
 /**
  * @brief A non-owning, immutable view of device data that represents
@@ -29,4 +29,4 @@ namespace cudf {
  */
 class struct_view {};
 
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/structs/structs_column_device_view.cuh b/cpp/include/cudf/structs/structs_column_device_view.cuh
index 7580582631f..cf71ba87a20 100644
--- a/cpp/include/cudf/structs/structs_column_device_view.cuh
+++ b/cpp/include/cudf/structs/structs_column_device_view.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,7 +18,7 @@
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/types.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 
 namespace detail {
 
@@ -84,4 +84,4 @@ class structs_column_device_view : private column_device_view {
 
 }  // namespace detail
 
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/structs/structs_column_view.hpp b/cpp/include/cudf/structs/structs_column_view.hpp
index 4a50488ef00..19798f51656 100644
--- a/cpp/include/cudf/structs/structs_column_view.hpp
+++ b/cpp/include/cudf/structs/structs_column_view.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -26,7 +26,7 @@
  * @brief Class definition for cudf::structs_column_view.
  */
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 
 /**
  * @addtogroup structs_classes
@@ -98,4 +98,4 @@ class structs_column_view : public column_view {
     int index, rmm::cuda_stream_view stream = cudf::get_default_stream()) const;
 };         // class structs_column_view;
 /** @} */  // end of group
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/table/experimental/row_operators.cuh b/cpp/include/cudf/table/experimental/row_operators.cuh
index c181ac7d402..f05e5f4ca5c 100644
--- a/cpp/include/cudf/table/experimental/row_operators.cuh
+++ b/cpp/include/cudf/table/experimental/row_operators.cuh
@@ -54,7 +54,7 @@
 #include <type_traits>
 #include <utility>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 
 namespace experimental {
 
@@ -2026,4 +2026,4 @@ class row_hasher {
 }  // namespace row
 
 }  // namespace experimental
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/table/row_operators.cuh b/cpp/include/cudf/table/row_operators.cuh
index 0e57d24f4b3..e3b65d77b4a 100644
--- a/cpp/include/cudf/table/row_operators.cuh
+++ b/cpp/include/cudf/table/row_operators.cuh
@@ -30,7 +30,7 @@
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/transform_reduce.h>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 
 /**
  * @brief Result type of the `element_relational_comparator` function object.
@@ -635,4 +635,4 @@ class row_hasher {
   uint32_t _seed{DEFAULT_HASH_SEED};
 };
 
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/table/table.hpp b/cpp/include/cudf/table/table.hpp
index c4f14af53fb..be2af7ac653 100644
--- a/cpp/include/cudf/table/table.hpp
+++ b/cpp/include/cudf/table/table.hpp
@@ -31,7 +31,7 @@
  * @brief Class definition for cudf::table
  */
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 
 /**
  * @brief A set of cudf::column's of the same size.
@@ -194,4 +194,4 @@ class table {
   size_type _num_rows{};
 };
 
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/table/table_device_view.cuh b/cpp/include/cudf/table/table_device_view.cuh
index 511013b585d..16d532ea2b8 100644
--- a/cpp/include/cudf/table/table_device_view.cuh
+++ b/cpp/include/cudf/table/table_device_view.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -32,7 +32,7 @@
  * @brief Table device view class definitions
  */
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace detail {
 
 /**
@@ -271,4 +271,4 @@ auto contiguous_copy_column_device_views(HostTableView source_view, rmm::cuda_st
   return std::make_tuple(std::move(descendant_storage), d_columns);
 }
 
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/tdigest/tdigest_column_view.hpp b/cpp/include/cudf/tdigest/tdigest_column_view.hpp
index b2eb341df86..2f19efa5630 100644
--- a/cpp/include/cudf/tdigest/tdigest_column_view.hpp
+++ b/cpp/include/cudf/tdigest/tdigest_column_view.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,7 +18,7 @@
 #include <cudf/column/column_view.hpp>
 #include <cudf/lists/lists_column_view.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 //! Tdigest interfaces
 namespace tdigest {
 /**
@@ -132,4 +132,4 @@ class tdigest_column_view : private column_view {
 
 /** @} */  // end of group
 }  // namespace tdigest
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/timezone.hpp b/cpp/include/cudf/timezone.hpp
index 7f65128526e..8329c64e24f 100644
--- a/cpp/include/cudf/timezone.hpp
+++ b/cpp/include/cudf/timezone.hpp
@@ -15,6 +15,8 @@
  */
 #pragma once
 
+#include <cudf/utilities/export.hpp>
+
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
@@ -22,7 +24,7 @@
 #include <optional>
 #include <string>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 class table;
 
 // Cycle in which the time offsets repeat in Gregorian calendar
@@ -52,4 +54,4 @@ std::unique_ptr<table> make_timezone_transition_table(
   std::string_view timezone_name,
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/transform.hpp b/cpp/include/cudf/transform.hpp
index 7bb9fb7a42e..adc5bdb2af8 100644
--- a/cpp/include/cudf/transform.hpp
+++ b/cpp/include/cudf/transform.hpp
@@ -18,13 +18,14 @@
 
 #include <cudf/ast/expressions.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
 #include <memory>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 /**
  * @addtogroup transformation_transform
  * @{
@@ -248,4 +249,4 @@ std::unique_ptr<column> segmented_row_bit_count(
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/transpose.hpp b/cpp/include/cudf/transpose.hpp
index c01a04afe87..f4433c46a06 100644
--- a/cpp/include/cudf/transpose.hpp
+++ b/cpp/include/cudf/transpose.hpp
@@ -17,11 +17,12 @@
 
 #include <cudf/column/column.hpp>
 #include <cudf/table/table_view.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 /**
  * @addtogroup reshape_transpose
  * @{
@@ -48,4 +49,4 @@ std::pair<std::unique_ptr<column>, table_view> transpose(
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/types.hpp b/cpp/include/cudf/types.hpp
index baf07fa3db6..409b8c825bb 100644
--- a/cpp/include/cudf/types.hpp
+++ b/cpp/include/cudf/types.hpp
@@ -36,6 +36,8 @@
 #define CUDF_KERNEL static
 #endif
 
+#include <cudf/utilities/export.hpp>
+
 #include <cassert>
 #include <cstddef>
 #include <cstdint>
@@ -54,7 +56,7 @@ class device_buffer;
 
 }  // namespace rmm
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 // Forward declaration
 class column;
 class column_view;
@@ -344,4 +346,4 @@ inline bool operator!=(data_type const& lhs, data_type const& rhs) { return !(lh
 std::size_t size_of(data_type t);
 
 /** @} */
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/unary.hpp b/cpp/include/cudf/unary.hpp
index 1609c72f175..55f4c1f5a23 100644
--- a/cpp/include/cudf/unary.hpp
+++ b/cpp/include/cudf/unary.hpp
@@ -20,6 +20,7 @@
 #include <cudf/fixed_point/floating_conversion.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/export.hpp>
 #include <cudf/utilities/traits.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
@@ -27,7 +28,7 @@
 
 #include <memory>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 /**
  * @addtogroup transformation_unaryops
  * @{
@@ -259,4 +260,4 @@ std::unique_ptr<column> is_not_nan(
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/utilities/bit.hpp b/cpp/include/cudf/utilities/bit.hpp
index 9bdc372419f..736796e610a 100644
--- a/cpp/include/cudf/utilities/bit.hpp
+++ b/cpp/include/cudf/utilities/bit.hpp
@@ -27,7 +27,7 @@
  * @brief Utilities for bit and bitmask operations.
  */
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace detail {
 // @cond
 // Work around a bug in NVRTC that fails to compile assert() in constexpr
@@ -217,4 +217,4 @@ __device__ inline void clear_bit(bitmask_type* bitmask, size_type bit_index)
 }
 #endif
 /** @} */  // end of group
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/utilities/default_stream.hpp b/cpp/include/cudf/utilities/default_stream.hpp
index aacab996e8a..97a42243250 100644
--- a/cpp/include/cudf/utilities/default_stream.hpp
+++ b/cpp/include/cudf/utilities/default_stream.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,11 +17,12 @@
 #pragma once
 
 #include <cudf/detail/utilities/default_stream.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/cuda_stream.hpp>
 #include <rmm/cuda_stream_view.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 /**
  * @addtogroup default_stream
  * @{
@@ -43,4 +44,4 @@ rmm::cuda_stream_view const get_default_stream();
 bool is_ptds_enabled();
 
 /** @} */  // end of group
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/utilities/error.hpp b/cpp/include/cudf/utilities/error.hpp
index f019f516b84..f847ce0f66a 100644
--- a/cpp/include/cudf/utilities/error.hpp
+++ b/cpp/include/cudf/utilities/error.hpp
@@ -17,6 +17,7 @@
 #pragma once
 
 #include <cudf/detail/utilities/stacktrace.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <cuda.h>
 #include <cuda_runtime_api.h>
@@ -25,7 +26,7 @@
 #include <string>
 #include <type_traits>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 /**
  * @addtogroup utility_error
  * @{
@@ -140,7 +141,7 @@ struct data_type_error : public std::invalid_argument, public stacktrace_recorde
 };
 /** @} */
 
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
 
 #define STRINGIFY_DETAIL(x) #x                   ///< Stringify a macro argument
 #define CUDF_STRINGIFY(x)   STRINGIFY_DETAIL(x)  ///< Stringify a macro argument
@@ -229,7 +230,7 @@ struct data_type_error : public std::invalid_argument, public stacktrace_recorde
 
 /// @endcond
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace detail {
 // @cond
 inline void throw_cuda_error(cudaError_t error, char const* file, unsigned int line)
@@ -251,7 +252,7 @@ inline void throw_cuda_error(cudaError_t error, char const* file, unsigned int l
 }
 // @endcond
 }  // namespace detail
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
 
 /**
  * @brief Error checking macro for CUDA runtime API functions.
diff --git a/cpp/include/cudf/utilities/pinned_memory.hpp b/cpp/include/cudf/utilities/pinned_memory.hpp
index fa7e1b35327..623a033698f 100644
--- a/cpp/include/cudf/utilities/pinned_memory.hpp
+++ b/cpp/include/cudf/utilities/pinned_memory.hpp
@@ -16,11 +16,13 @@
 
 #pragma once
 
+#include <cudf/utilities/export.hpp>
+
 #include <rmm/resource_ref.hpp>
 
 #include <optional>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 
 /**
  * @brief Set the rmm resource to be used for pinned memory allocations.
@@ -87,4 +89,4 @@ void set_allocate_host_as_pinned_threshold(size_t threshold);
  */
 size_t get_allocate_host_as_pinned_threshold();
 
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/utilities/prefetch.hpp b/cpp/include/cudf/utilities/prefetch.hpp
index 88c634a7cc7..49fca73a2c8 100644
--- a/cpp/include/cudf/utilities/prefetch.hpp
+++ b/cpp/include/cudf/utilities/prefetch.hpp
@@ -24,7 +24,8 @@
 #include <string>
 #include <string_view>
 
-namespace cudf::experimental::prefetch {
+namespace CUDF_EXPORT cudf {
+namespace experimental::prefetch {
 
 namespace detail {
 
@@ -152,4 +153,5 @@ void disable_prefetching(std::string_view key);
  */
 void prefetch_debugging(bool enable);
 
-}  // namespace cudf::experimental::prefetch
+}  // namespace experimental::prefetch
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp
index c5054c733a7..0daebc0dd8d 100644
--- a/cpp/include/cudf/utilities/span.hpp
+++ b/cpp/include/cudf/utilities/span.hpp
@@ -17,6 +17,7 @@
 #pragma once
 
 #include <cudf/detail/utilities/host_vector.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/device_buffer.hpp>
 #include <rmm/device_uvector.hpp>
@@ -32,7 +33,7 @@
 #include <type_traits>
 #include <utility>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 /**
  * @addtogroup utility_span
  * @{
@@ -539,4 +540,4 @@ template <class T>
 using device_2dspan = base_2dspan<T, device_span>;
 
 }  // namespace detail
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/utilities/traits.cuh b/cpp/include/cudf/utilities/traits.cuh
index 43587ffa583..5e52e9a9cd9 100644
--- a/cpp/include/cudf/utilities/traits.cuh
+++ b/cpp/include/cudf/utilities/traits.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,7 +21,7 @@
 
 #include <cuda/std/atomic>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 
 /**
  * @addtogroup utility_types
@@ -64,4 +64,4 @@ constexpr inline bool has_atomic_support(data_type type)
 
 /** @} */
 
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp
index d191e44228a..3f37ae02151 100644
--- a/cpp/include/cudf/utilities/traits.hpp
+++ b/cpp/include/cudf/utilities/traits.hpp
@@ -24,7 +24,7 @@
 
 #include <cuda/std/type_traits>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 
 /**
  * @addtogroup utility_types
@@ -622,4 +622,4 @@ struct is_convertible<cudf::detail::timestamp<Duration1>, cudf::detail::timestam
 
 /** @} */
 
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/utilities/type_checks.hpp b/cpp/include/cudf/utilities/type_checks.hpp
index fd3b0581c11..4fcbca09d17 100644
--- a/cpp/include/cudf/utilities/type_checks.hpp
+++ b/cpp/include/cudf/utilities/type_checks.hpp
@@ -20,7 +20,7 @@
 
 #include <algorithm>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 
 /**
  * @brief Compare the types of two `column_view`s
@@ -147,4 +147,4 @@ inline bool all_have_same_types(ForwardIt first, ForwardIt last)
          });
 }
 
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/utilities/type_dispatcher.hpp b/cpp/include/cudf/utilities/type_dispatcher.hpp
index 1aad197b1e3..15b5f921c1b 100644
--- a/cpp/include/cudf/utilities/type_dispatcher.hpp
+++ b/cpp/include/cudf/utilities/type_dispatcher.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -31,7 +31,7 @@
  * @brief Defines the mapping between `cudf::type_id` runtime type information
  * and concrete C++ types.
  */
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 /**
  * @addtogroup utility_dispatcher
  * @{
@@ -626,4 +626,4 @@ CUDF_HOST_DEVICE __forceinline__ constexpr decltype(auto) double_type_dispatcher
 std::string type_to_name(data_type type);
 
 /** @} */  // end of group
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/wrappers/dictionary.hpp b/cpp/include/cudf/wrappers/dictionary.hpp
index 95f4ac00a53..3b1958e7d4f 100644
--- a/cpp/include/cudf/wrappers/dictionary.hpp
+++ b/cpp/include/cudf/wrappers/dictionary.hpp
@@ -27,7 +27,7 @@
  * @brief Concrete type definition for dictionary columns.
  */
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 /**
  * @addtogroup dictionary_classes
  * @{
@@ -217,4 +217,4 @@ CUDF_HOST_DEVICE inline bool operator>(dictionary_wrapper<Integer> const& lhs,
 using dictionary32 = dictionary_wrapper<int32_t>;  ///< 32-bit integer indexed dictionary wrapper
 
 /** @} */  // end of group
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/wrappers/durations.hpp b/cpp/include/cudf/wrappers/durations.hpp
index 840dba4f4ba..8c321cba34a 100644
--- a/cpp/include/cudf/wrappers/durations.hpp
+++ b/cpp/include/cudf/wrappers/durations.hpp
@@ -16,9 +16,11 @@
 
 #pragma once
 
+#include <cudf/utilities/export.hpp>
+
 #include <cuda/std/chrono>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 
 /**
  * @addtogroup timestamp_classes Timestamp
@@ -65,4 +67,4 @@ static_assert(sizeof(duration_us) == sizeof(typename duration_us::rep));
 static_assert(sizeof(duration_ns) == sizeof(typename duration_ns::rep));
 
 /** @} */  // end of group
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/wrappers/timestamps.hpp b/cpp/include/cudf/wrappers/timestamps.hpp
index 5194a3e8f96..1f5d54c6119 100644
--- a/cpp/include/cudf/wrappers/timestamps.hpp
+++ b/cpp/include/cudf/wrappers/timestamps.hpp
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include <cudf/utilities/export.hpp>
 #include <cudf/wrappers/durations.hpp>
 
 /**
@@ -23,7 +24,7 @@
  * @brief Concrete type definitions for int32_t and int64_t timestamps in
  * varying resolutions as durations since the UNIX epoch.
  */
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace detail {
 // TODO: Use chrono::utc_clock when available in libcu++?
 template <class Duration>
@@ -82,4 +83,4 @@ static_assert(sizeof(timestamp_us) == sizeof(typename timestamp_us::rep));
 static_assert(sizeof(timestamp_ns) == sizeof(typename timestamp_ns::rep));
 
 /** @} */  // end of group
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf_test/base_fixture.hpp b/cpp/include/cudf_test/base_fixture.hpp
index 0e35ff64af4..04bd51e9aa3 100644
--- a/cpp/include/cudf_test/base_fixture.hpp
+++ b/cpp/include/cudf_test/base_fixture.hpp
@@ -19,13 +19,14 @@
 #include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/file_utilities.hpp>
 
+#include <cudf/utilities/export.hpp>
 #include <cudf/utilities/traits.hpp>
 
 #include <rmm/mr/device/device_memory_resource.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace test {
 
 /**
@@ -99,4 +100,4 @@ class TempDirTestEnvironment : public ::testing::Environment {
 };
 
 }  // namespace test
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf_test/column_utilities.hpp b/cpp/include/cudf_test/column_utilities.hpp
index c83599a8072..944c6195afb 100644
--- a/cpp/include/cudf_test/column_utilities.hpp
+++ b/cpp/include/cudf_test/column_utilities.hpp
@@ -24,11 +24,13 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <thrust/host_vector.h>
 #include <thrust/iterator/transform_iterator.h>
 
-namespace cudf::test {
+namespace CUDF_EXPORT cudf {
+namespace test {
 
 /**
  * @brief Verbosity level of output from column and table comparison functions.
@@ -194,7 +196,7 @@ std::pair<thrust::host_vector<T>, std::vector<bitmask_type>> to_host(column_view
  *  `column_view`'s data, and second is the column's bitmask.
  */
 template <typename T, std::enable_if_t<cudf::is_fixed_point<T>()>* = nullptr>
-std::pair<thrust::host_vector<T>, std::vector<bitmask_type>> to_host(column_view c);
+CUDF_EXPORT std::pair<thrust::host_vector<T>, std::vector<bitmask_type>> to_host(column_view c);
 
 /**
  * @brief Copies the data and bitmask of a `column_view` of strings
@@ -207,7 +209,8 @@ std::pair<thrust::host_vector<T>, std::vector<bitmask_type>> to_host(column_view
  * and second is the column's bitmask.
  */
 template <>
-std::pair<thrust::host_vector<std::string>, std::vector<bitmask_type>> to_host(column_view c);
+CUDF_EXPORT std::pair<thrust::host_vector<std::string>, std::vector<bitmask_type>> to_host(
+  column_view c);
 //! @endcond
 
 /**
@@ -233,7 +236,8 @@ struct large_strings_enabler {
   void disable();
 };
 
-}  // namespace cudf::test
+}  // namespace test
+}  // namespace CUDF_EXPORT cudf
 
 // Macros for showing line of failure.
 #define CUDF_TEST_EXPECT_COLUMN_PROPERTIES_EQUAL(lhs, rhs)        \
diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp
index 2abd6f0abac..4e504ec1d30 100644
--- a/cpp/include/cudf_test/column_wrapper.hpp
+++ b/cpp/include/cudf_test/column_wrapper.hpp
@@ -24,7 +24,6 @@
 #include <cudf/copying.hpp>
 #include <cudf/detail/concatenate.hpp>
 #include <cudf/detail/iterator.cuh>
-#include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/dictionary/encode.hpp>
 #include <cudf/fixed_point/fixed_point.hpp>
@@ -33,6 +32,7 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/bit.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/export.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
@@ -51,7 +51,7 @@
 #include <memory>
 #include <numeric>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace test {
 namespace detail {
 /**
@@ -1755,7 +1755,7 @@ class lists_column_wrapper : public detail::column_wrapper {
       normalize_column(lists_column_view(col).child(),
                        lists_column_view(expected_hierarchy).child()),
       col.null_count(),
-      cudf::detail::copy_bitmask(
+      cudf::copy_bitmask(
         col, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()),
       cudf::test::get_default_stream());
   }
@@ -1970,4 +1970,4 @@ class structs_column_wrapper : public detail::column_wrapper {
 };
 
 }  // namespace test
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf_test/debug_utilities.hpp b/cpp/include/cudf_test/debug_utilities.hpp
index a0881490b82..049b4579316 100644
--- a/cpp/include/cudf_test/debug_utilities.hpp
+++ b/cpp/include/cudf_test/debug_utilities.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,8 +18,10 @@
 
 #include <cudf/column/column_view.hpp>
 #include <cudf/null_mask.hpp>
+#include <cudf/utilities/export.hpp>
 
-namespace cudf::test {
+namespace CUDF_EXPORT cudf {
+namespace test {
 
 /**
  * @brief Formats a column view as a string
@@ -44,4 +46,5 @@ std::vector<std::string> to_strings(cudf::column_view const& col);
  */
 void print(cudf::column_view const& col, std::ostream& os = std::cout);
 
-}  // namespace cudf::test
+}  // namespace test
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf_test/default_stream.hpp b/cpp/include/cudf_test/default_stream.hpp
index 1da97d71f44..4f63add3071 100644
--- a/cpp/include/cudf_test/default_stream.hpp
+++ b/cpp/include/cudf_test/default_stream.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,9 +16,11 @@
 
 #pragma once
 
+#include <cudf/utilities/export.hpp>
+
 #include <rmm/cuda_stream_view.hpp>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace test {
 
 /**
@@ -38,4 +40,4 @@ namespace test {
 rmm::cuda_stream_view const get_default_stream();
 
 }  // namespace test
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf_test/file_utilities.hpp b/cpp/include/cudf_test/file_utilities.hpp
index defc6f95823..37347e563cd 100644
--- a/cpp/include/cudf_test/file_utilities.hpp
+++ b/cpp/include/cudf_test/file_utilities.hpp
@@ -17,6 +17,7 @@
 #pragma once
 
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <ftw.h>
 
@@ -29,7 +30,7 @@
  * @brief RAII class for creating a temporary directory.
  *
  */
-class temp_directory {
+class CUDF_EXPORT temp_directory {
   std::string _path;
 
  public:
diff --git a/cpp/include/cudf_test/io_metadata_utilities.hpp b/cpp/include/cudf_test/io_metadata_utilities.hpp
index 6fd1a52239c..c18d427d905 100644
--- a/cpp/include/cudf_test/io_metadata_utilities.hpp
+++ b/cpp/include/cudf_test/io_metadata_utilities.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,8 +16,10 @@
 #pragma once
 
 #include <cudf/io/types.hpp>
+#include <cudf/utilities/export.hpp>
 
-namespace cudf::test {
+namespace CUDF_EXPORT cudf {
+namespace test {
 
 void expect_metadata_equal(cudf::io::table_input_metadata in_meta,
                            cudf::io::table_metadata out_meta);
@@ -28,4 +30,5 @@ void expect_metadata_equal(cudf::io::table_input_metadata in_meta,
  */
 void expect_metadata_equal(cudf::io::table_metadata lhs_meta, cudf::io::table_metadata rhs_meta);
 
-}  // namespace cudf::test
+}  // namespace test
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf_test/iterator_utilities.hpp b/cpp/include/cudf_test/iterator_utilities.hpp
index 10f6e77d889..8db0275d2f4 100644
--- a/cpp/include/cudf_test/iterator_utilities.hpp
+++ b/cpp/include/cudf_test/iterator_utilities.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,13 +18,14 @@
 
 #include <cudf/detail/iterator.cuh>
 #include <cudf/types.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <thrust/iterator/constant_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
 
 #include <iterator>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace test {
 namespace iterators {
 /**
@@ -136,4 +137,4 @@ template <class T>
 
 }  // namespace iterators
 }  // namespace test
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf_test/print_utilities.cuh b/cpp/include/cudf_test/print_utilities.cuh
index ae6c8cef029..828188e65c3 100644
--- a/cpp/include/cudf_test/print_utilities.cuh
+++ b/cpp/include/cudf_test/print_utilities.cuh
@@ -17,6 +17,7 @@
 #pragma once
 
 #include <cudf/types.hpp>
+#include <cudf/utilities/export.hpp>
 #include <cudf/utilities/traits.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -25,7 +26,8 @@
 
 #include <type_traits>
 
-namespace cudf::test::print {
+namespace CUDF_EXPORT cudf {
+namespace test::print {
 
 constexpr int32_t hex_tag = 0;
 
@@ -137,4 +139,5 @@ void print_array(std::size_t count, rmm::cuda_stream_view stream, Ts... args)
   }
 }
 
-}  // namespace cudf::test::print
+}  // namespace test::print
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf_test/random.hpp b/cpp/include/cudf_test/random.hpp
index f4d539ecffe..fe1fb0a14bf 100644
--- a/cpp/include/cudf_test/random.hpp
+++ b/cpp/include/cudf_test/random.hpp
@@ -16,11 +16,12 @@
 
 #pragma once
 
+#include <cudf/utilities/export.hpp>
 #include <cudf/utilities/traits.hpp>
 
 #include <random>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace test {
 
 template <typename T, typename Enable = void>
@@ -170,4 +171,4 @@ class UniformRandomGenerator {
 };
 
 }  // namespace test
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf_test/table_utilities.hpp b/cpp/include/cudf_test/table_utilities.hpp
index 79229df4cd9..5e60419d679 100644
--- a/cpp/include/cudf_test/table_utilities.hpp
+++ b/cpp/include/cudf_test/table_utilities.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,8 +18,10 @@
 
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/export.hpp>
 
-namespace cudf::test::detail {
+namespace CUDF_EXPORT cudf {
+namespace test::detail {
 /**
  * @brief Verifies the property equality of two tables.
  *
@@ -57,7 +59,8 @@ void expect_tables_equal(cudf::table_view lhs, cudf::table_view rhs);
  */
 void expect_tables_equivalent(cudf::table_view lhs, cudf::table_view rhs);
 
-}  // namespace cudf::test::detail
+}  // namespace test::detail
+}  // namespace CUDF_EXPORT cudf
 
 // Macros for showing line of failure.
 #define CUDF_TEST_EXPECT_TABLE_PROPERTIES_EQUAL(lhs, rhs)        \
diff --git a/cpp/include/cudf_test/tdigest_utilities.cuh b/cpp/include/cudf_test/tdigest_utilities.cuh
index 742cd764a1f..5fd2403b0f2 100644
--- a/cpp/include/cudf_test/tdigest_utilities.cuh
+++ b/cpp/include/cudf_test/tdigest_utilities.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,6 +23,7 @@
 #include <cudf/groupby.hpp>
 #include <cudf/tdigest/tdigest_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/exec_policy.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
@@ -37,7 +38,7 @@
 
 // for use with groupby and reduction aggregation tests.
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace test {
 
 using expected_value = thrust::tuple<size_type, double, double>;
@@ -583,4 +584,4 @@ void tdigest_merge_empty(MergeFunc merge_op)
 }
 
 }  // namespace test
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf_test/testing_main.hpp b/cpp/include/cudf_test/testing_main.hpp
index 3ad4b127f80..9866253a9f8 100644
--- a/cpp/include/cudf_test/testing_main.hpp
+++ b/cpp/include/cudf_test/testing_main.hpp
@@ -20,6 +20,7 @@
 #include <cudf_test/stream_checking_resource_adaptor.hpp>
 
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/aligned.hpp>
 #include <rmm/cuda_stream_view.hpp>
@@ -32,7 +33,8 @@
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/mr/device/pool_memory_resource.hpp>
 
-namespace cudf::test {
+namespace CUDF_EXPORT cudf {
+namespace test {
 
 /// MR factory functions
 inline auto make_cuda() { return std::make_shared<rmm::mr::cuda_memory_resource>(); }
@@ -90,7 +92,8 @@ inline std::shared_ptr<rmm::mr::device_memory_resource> create_memory_resource(
   CUDF_FAIL("Invalid RMM allocation mode: " + allocation_mode);
 }
 
-}  // namespace cudf::test
+}  // namespace test
+}  // namespace CUDF_EXPORT cudf
 
 /**
  * @brief Parses the cuDF test command line options.
diff --git a/cpp/include/cudf_test/timestamp_utilities.cuh b/cpp/include/cudf_test/timestamp_utilities.cuh
index ebd93862151..e0789210bf9 100644
--- a/cpp/include/cudf_test/timestamp_utilities.cuh
+++ b/cpp/include/cudf_test/timestamp_utilities.cuh
@@ -19,12 +19,13 @@
 #include <cudf_test/column_wrapper.hpp>
 
 #include <cudf/detail/iterator.cuh>
+#include <cudf/utilities/export.hpp>
 #include <cudf/wrappers/timestamps.hpp>
 
 #include <thrust/logical.h>
 #include <thrust/sequence.h>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace test {
 using time_point_ms =
   cuda::std::chrono::time_point<cuda::std::chrono::system_clock, cuda::std::chrono::milliseconds>;
@@ -75,4 +76,4 @@ inline cudf::test::fixed_width_column_wrapper<T, int64_t> generate_timestamps(in
 }
 
 }  // namespace test
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf_test/type_list_utilities.hpp b/cpp/include/cudf_test/type_list_utilities.hpp
index b069a34afb8..1793a8ecce0 100644
--- a/cpp/include/cudf_test/type_list_utilities.hpp
+++ b/cpp/include/cudf_test/type_list_utilities.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,6 +17,8 @@
 
 #include "cudf_gtest.hpp"
 
+#include <cudf/utilities/export.hpp>
+
 /**
  * @file type_list_utilities.hpp
  * @brief Utilities for creating type lists for typed tests in Google Test
@@ -68,7 +70,7 @@
  * increased compile-times. Use responsibly.
  */
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace test {
 // Utilities for creating parameters for typed tests on GoogleTest
 //
@@ -627,4 +629,4 @@ using Unique = typename UniqueImpl<TYPES>::type;
 
 }  // namespace test
 
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf_test/type_lists.hpp b/cpp/include/cudf_test/type_lists.hpp
index bbff45e2102..4cd01a09187 100644
--- a/cpp/include/cudf_test/type_lists.hpp
+++ b/cpp/include/cudf_test/type_lists.hpp
@@ -21,6 +21,7 @@
 #include <cudf/fixed_point/fixed_point.hpp>
 #include <cudf/strings/string_view.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/export.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 #include <cudf/wrappers/durations.hpp>
@@ -40,7 +41,7 @@
  * These lists should be used for consistency across tests as well as
  * future-proofing against the addition of any new types in the future.
  */
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace test {
 namespace detail {
 template <typename TYPES, std::size_t... Indices>
@@ -433,4 +434,4 @@ static constexpr std::array<cudf::type_id, 2> non_fixed_width_type_ids{cudf::typ
                                                                        cudf::type_id::STRING};
 
 }  // namespace test
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/nvtext/byte_pair_encoding.hpp b/cpp/include/nvtext/byte_pair_encoding.hpp
index 375d44e367a..6559933f696 100644
--- a/cpp/include/nvtext/byte_pair_encoding.hpp
+++ b/cpp/include/nvtext/byte_pair_encoding.hpp
@@ -20,10 +20,11 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/resource_ref.hpp>
 
-namespace nvtext {
+namespace CUDF_EXPORT nvtext {
 
 /**
  * @addtogroup nvtext_tokenize
@@ -132,4 +133,4 @@ std::unique_ptr<cudf::column> byte_pair_encoding(
   rmm::device_async_resource_ref mr    = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
-}  // namespace nvtext
+}  // namespace CUDF_EXPORT nvtext
diff --git a/cpp/include/nvtext/detail/generate_ngrams.hpp b/cpp/include/nvtext/detail/generate_ngrams.hpp
index c4b89b6d495..7c49421560d 100644
--- a/cpp/include/nvtext/detail/generate_ngrams.hpp
+++ b/cpp/include/nvtext/detail/generate_ngrams.hpp
@@ -20,7 +20,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace nvtext {
+namespace CUDF_EXPORT nvtext {
 namespace detail {
 
 /**
@@ -35,4 +35,4 @@ std::unique_ptr<cudf::column> hash_character_ngrams(cudf::strings_column_view co
                                                     rmm::device_async_resource_ref mr);
 
 }  // namespace detail
-}  // namespace nvtext
+}  // namespace CUDF_EXPORT nvtext
diff --git a/cpp/include/nvtext/detail/load_hash_file.hpp b/cpp/include/nvtext/detail/load_hash_file.hpp
index 0c27981f80b..438a4a9afdd 100644
--- a/cpp/include/nvtext/detail/load_hash_file.hpp
+++ b/cpp/include/nvtext/detail/load_hash_file.hpp
@@ -25,7 +25,7 @@
 #include <cstdint>
 #include <cstring>
 
-namespace nvtext {
+namespace CUDF_EXPORT nvtext {
 namespace detail {
 
 /**
@@ -47,4 +47,4 @@ std::unique_ptr<hashed_vocabulary> load_vocabulary_file(
   rmm::device_async_resource_ref mr);
 
 }  // namespace detail
-}  // namespace nvtext
+}  // namespace CUDF_EXPORT nvtext
diff --git a/cpp/include/nvtext/detail/tokenize.hpp b/cpp/include/nvtext/detail/tokenize.hpp
index d48027e4631..57ad008f1a9 100644
--- a/cpp/include/nvtext/detail/tokenize.hpp
+++ b/cpp/include/nvtext/detail/tokenize.hpp
@@ -23,7 +23,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
 
-namespace nvtext {
+namespace CUDF_EXPORT nvtext {
 namespace detail {
 /**
  * @copydoc nvtext::tokenize(strings_column_view const&,string_scalar
@@ -70,4 +70,4 @@ std::unique_ptr<cudf::column> count_tokens(cudf::strings_column_view const& stri
                                            rmm::device_async_resource_ref mr);
 
 }  // namespace detail
-}  // namespace nvtext
+}  // namespace CUDF_EXPORT nvtext
diff --git a/cpp/include/nvtext/edit_distance.hpp b/cpp/include/nvtext/edit_distance.hpp
index bfdfb4d1a1c..102f2cffa18 100644
--- a/cpp/include/nvtext/edit_distance.hpp
+++ b/cpp/include/nvtext/edit_distance.hpp
@@ -18,11 +18,12 @@
 #include <cudf/column/column.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/strings_column_view.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/resource_ref.hpp>
 
 //! NVText APIs
-namespace nvtext {
+namespace CUDF_EXPORT nvtext {
 /**
  * @addtogroup nvtext_edit_distance
  * @{
@@ -104,4 +105,4 @@ std::unique_ptr<cudf::column> edit_distance_matrix(
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
-}  // namespace nvtext
+}  // namespace CUDF_EXPORT nvtext
diff --git a/cpp/include/nvtext/generate_ngrams.hpp b/cpp/include/nvtext/generate_ngrams.hpp
index bebe2e46023..ce79d985a49 100644
--- a/cpp/include/nvtext/generate_ngrams.hpp
+++ b/cpp/include/nvtext/generate_ngrams.hpp
@@ -18,10 +18,11 @@
 #include <cudf/column/column.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/strings_column_view.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/resource_ref.hpp>
 
-namespace nvtext {
+namespace CUDF_EXPORT nvtext {
 /**
  * @addtogroup nvtext_ngrams
  * @{
@@ -128,4 +129,4 @@ std::unique_ptr<cudf::column> hash_character_ngrams(
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
-}  // namespace nvtext
+}  // namespace CUDF_EXPORT nvtext
diff --git a/cpp/include/nvtext/jaccard.hpp b/cpp/include/nvtext/jaccard.hpp
index 649c17f0b1c..3c3486c079e 100644
--- a/cpp/include/nvtext/jaccard.hpp
+++ b/cpp/include/nvtext/jaccard.hpp
@@ -17,10 +17,11 @@
 
 #include <cudf/column/column.hpp>
 #include <cudf/strings/strings_column_view.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/resource_ref.hpp>
 
-namespace nvtext {
+namespace CUDF_EXPORT nvtext {
 /**
  * @addtogroup nvtext_jaccard
  * @{
@@ -78,4 +79,4 @@ std::unique_ptr<cudf::column> jaccard_index(
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
-}  // namespace nvtext
+}  // namespace CUDF_EXPORT nvtext
diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp
index 7d3f6059454..fc28ecfb199 100644
--- a/cpp/include/nvtext/minhash.hpp
+++ b/cpp/include/nvtext/minhash.hpp
@@ -19,11 +19,12 @@
 #include <cudf/hashing.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/strings_column_view.hpp>
+#include <cudf/utilities/export.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/resource_ref.hpp>
 
-namespace nvtext {
+namespace CUDF_EXPORT nvtext {
 /**
  * @addtogroup nvtext_minhash
  * @{
@@ -151,4 +152,4 @@ std::unique_ptr<cudf::column> minhash64(
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
-}  // namespace nvtext
+}  // namespace CUDF_EXPORT nvtext
diff --git a/cpp/include/nvtext/ngrams_tokenize.hpp b/cpp/include/nvtext/ngrams_tokenize.hpp
index 09ce323a7ae..1048cd4abad 100644
--- a/cpp/include/nvtext/ngrams_tokenize.hpp
+++ b/cpp/include/nvtext/ngrams_tokenize.hpp
@@ -18,10 +18,11 @@
 #include <cudf/column/column.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/strings_column_view.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/resource_ref.hpp>
 
-namespace nvtext {
+namespace CUDF_EXPORT nvtext {
 /**
  * @addtogroup nvtext_ngrams
  * @{
@@ -86,4 +87,4 @@ std::unique_ptr<cudf::column> ngrams_tokenize(
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
-}  // namespace nvtext
+}  // namespace CUDF_EXPORT nvtext
diff --git a/cpp/include/nvtext/normalize.hpp b/cpp/include/nvtext/normalize.hpp
index e5967e78318..ec0b8981f8f 100644
--- a/cpp/include/nvtext/normalize.hpp
+++ b/cpp/include/nvtext/normalize.hpp
@@ -17,11 +17,12 @@
 
 #include <cudf/column/column.hpp>
 #include <cudf/strings/strings_column_view.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/resource_ref.hpp>
 
 //! NVText APIs
-namespace nvtext {
+namespace CUDF_EXPORT nvtext {
 /**
  * @addtogroup nvtext_normalize
  * @{
@@ -108,4 +109,4 @@ std::unique_ptr<cudf::column> normalize_characters(
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
-}  // namespace nvtext
+}  // namespace CUDF_EXPORT nvtext
diff --git a/cpp/include/nvtext/replace.hpp b/cpp/include/nvtext/replace.hpp
index aac21346c72..eedcd3976ca 100644
--- a/cpp/include/nvtext/replace.hpp
+++ b/cpp/include/nvtext/replace.hpp
@@ -18,11 +18,12 @@
 #include <cudf/column/column.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/strings_column_view.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/resource_ref.hpp>
 
 //! NVText APIs
-namespace nvtext {
+namespace CUDF_EXPORT nvtext {
 /**
  * @addtogroup nvtext_replace
  * @{
@@ -142,4 +143,4 @@ std::unique_ptr<cudf::column> filter_tokens(
   rmm::device_async_resource_ref mr      = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
-}  // namespace nvtext
+}  // namespace CUDF_EXPORT nvtext
diff --git a/cpp/include/nvtext/stemmer.hpp b/cpp/include/nvtext/stemmer.hpp
index 20b81aba661..4607c42ceed 100644
--- a/cpp/include/nvtext/stemmer.hpp
+++ b/cpp/include/nvtext/stemmer.hpp
@@ -18,10 +18,11 @@
 #include <cudf/column/column.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/strings_column_view.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/resource_ref.hpp>
 
-namespace nvtext {
+namespace CUDF_EXPORT nvtext {
 /**
  * @addtogroup nvtext_stemmer
  * @{
@@ -172,4 +173,4 @@ std::unique_ptr<cudf::column> porter_stemmer_measure(
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
-}  // namespace nvtext
+}  // namespace CUDF_EXPORT nvtext
diff --git a/cpp/include/nvtext/subword_tokenize.hpp b/cpp/include/nvtext/subword_tokenize.hpp
index a4e06495a1d..b5636c8401b 100644
--- a/cpp/include/nvtext/subword_tokenize.hpp
+++ b/cpp/include/nvtext/subword_tokenize.hpp
@@ -18,10 +18,11 @@
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_view.hpp>
 #include <cudf/strings/strings_column_view.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/resource_ref.hpp>
 
-namespace nvtext {
+namespace CUDF_EXPORT nvtext {
 
 /**
  * @addtogroup nvtext_tokenize
@@ -160,4 +161,4 @@ tokenizer_result subword_tokenize(
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
-}  // namespace nvtext
+}  // namespace CUDF_EXPORT nvtext
diff --git a/cpp/include/nvtext/tokenize.hpp b/cpp/include/nvtext/tokenize.hpp
index 29fed0759c7..833b53efcde 100644
--- a/cpp/include/nvtext/tokenize.hpp
+++ b/cpp/include/nvtext/tokenize.hpp
@@ -18,10 +18,11 @@
 #include <cudf/column/column.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/strings_column_view.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/resource_ref.hpp>
 
-namespace nvtext {
+namespace CUDF_EXPORT nvtext {
 /**
  * @addtogroup nvtext_tokenize
  * @{
@@ -309,4 +310,4 @@ std::unique_ptr<cudf::column> tokenize_with_vocabulary(
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of tokenize group
-}  // namespace nvtext
+}  // namespace CUDF_EXPORT nvtext
diff --git a/cpp/src/aggregation/aggregation.cpp b/cpp/src/aggregation/aggregation.cpp
index 5422304c5cb..a60a7f63882 100644
--- a/cpp/src/aggregation/aggregation.cpp
+++ b/cpp/src/aggregation/aggregation.cpp
@@ -16,6 +16,7 @@
 
 #include <cudf/aggregation.hpp>
 #include <cudf/detail/aggregation/aggregation.hpp>
+#include <cudf/utilities/export.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <memory>
@@ -423,13 +424,16 @@ std::unique_ptr<Base> make_sum_aggregation()
 {
   return std::make_unique<detail::sum_aggregation>();
 }
-template std::unique_ptr<aggregation> make_sum_aggregation<aggregation>();
-template std::unique_ptr<rolling_aggregation> make_sum_aggregation<rolling_aggregation>();
-template std::unique_ptr<groupby_aggregation> make_sum_aggregation<groupby_aggregation>();
-template std::unique_ptr<groupby_scan_aggregation> make_sum_aggregation<groupby_scan_aggregation>();
-template std::unique_ptr<reduce_aggregation> make_sum_aggregation<reduce_aggregation>();
-template std::unique_ptr<scan_aggregation> make_sum_aggregation<scan_aggregation>();
-template std::unique_ptr<segmented_reduce_aggregation>
+template CUDF_EXPORT std::unique_ptr<aggregation> make_sum_aggregation<aggregation>();
+template CUDF_EXPORT std::unique_ptr<rolling_aggregation>
+make_sum_aggregation<rolling_aggregation>();
+template CUDF_EXPORT std::unique_ptr<groupby_aggregation>
+make_sum_aggregation<groupby_aggregation>();
+template CUDF_EXPORT std::unique_ptr<groupby_scan_aggregation>
+make_sum_aggregation<groupby_scan_aggregation>();
+template CUDF_EXPORT std::unique_ptr<reduce_aggregation> make_sum_aggregation<reduce_aggregation>();
+template CUDF_EXPORT std::unique_ptr<scan_aggregation> make_sum_aggregation<scan_aggregation>();
+template CUDF_EXPORT std::unique_ptr<segmented_reduce_aggregation>
 make_sum_aggregation<segmented_reduce_aggregation>();
 
 /// Factory to create a PRODUCT aggregation
@@ -438,13 +442,15 @@ std::unique_ptr<Base> make_product_aggregation()
 {
   return std::make_unique<detail::product_aggregation>();
 }
-template std::unique_ptr<aggregation> make_product_aggregation<aggregation>();
-template std::unique_ptr<groupby_aggregation> make_product_aggregation<groupby_aggregation>();
-template std::unique_ptr<groupby_scan_aggregation>
+template CUDF_EXPORT std::unique_ptr<aggregation> make_product_aggregation<aggregation>();
+template CUDF_EXPORT std::unique_ptr<groupby_aggregation>
+make_product_aggregation<groupby_aggregation>();
+template CUDF_EXPORT std::unique_ptr<groupby_scan_aggregation>
 make_product_aggregation<groupby_scan_aggregation>();
-template std::unique_ptr<reduce_aggregation> make_product_aggregation<reduce_aggregation>();
-template std::unique_ptr<scan_aggregation> make_product_aggregation<scan_aggregation>();
-template std::unique_ptr<segmented_reduce_aggregation>
+template CUDF_EXPORT std::unique_ptr<reduce_aggregation>
+make_product_aggregation<reduce_aggregation>();
+template CUDF_EXPORT std::unique_ptr<scan_aggregation> make_product_aggregation<scan_aggregation>();
+template CUDF_EXPORT std::unique_ptr<segmented_reduce_aggregation>
 make_product_aggregation<segmented_reduce_aggregation>();
 
 /// Factory to create a MIN aggregation
@@ -453,13 +459,16 @@ std::unique_ptr<Base> make_min_aggregation()
 {
   return std::make_unique<detail::min_aggregation>();
 }
-template std::unique_ptr<aggregation> make_min_aggregation<aggregation>();
-template std::unique_ptr<rolling_aggregation> make_min_aggregation<rolling_aggregation>();
-template std::unique_ptr<groupby_aggregation> make_min_aggregation<groupby_aggregation>();
-template std::unique_ptr<groupby_scan_aggregation> make_min_aggregation<groupby_scan_aggregation>();
-template std::unique_ptr<reduce_aggregation> make_min_aggregation<reduce_aggregation>();
-template std::unique_ptr<scan_aggregation> make_min_aggregation<scan_aggregation>();
-template std::unique_ptr<segmented_reduce_aggregation>
+template CUDF_EXPORT std::unique_ptr<aggregation> make_min_aggregation<aggregation>();
+template CUDF_EXPORT std::unique_ptr<rolling_aggregation>
+make_min_aggregation<rolling_aggregation>();
+template CUDF_EXPORT std::unique_ptr<groupby_aggregation>
+make_min_aggregation<groupby_aggregation>();
+template CUDF_EXPORT std::unique_ptr<groupby_scan_aggregation>
+make_min_aggregation<groupby_scan_aggregation>();
+template CUDF_EXPORT std::unique_ptr<reduce_aggregation> make_min_aggregation<reduce_aggregation>();
+template CUDF_EXPORT std::unique_ptr<scan_aggregation> make_min_aggregation<scan_aggregation>();
+template CUDF_EXPORT std::unique_ptr<segmented_reduce_aggregation>
 make_min_aggregation<segmented_reduce_aggregation>();
 
 /// Factory to create a MAX aggregation
@@ -468,13 +477,16 @@ std::unique_ptr<Base> make_max_aggregation()
 {
   return std::make_unique<detail::max_aggregation>();
 }
-template std::unique_ptr<aggregation> make_max_aggregation<aggregation>();
-template std::unique_ptr<rolling_aggregation> make_max_aggregation<rolling_aggregation>();
-template std::unique_ptr<groupby_aggregation> make_max_aggregation<groupby_aggregation>();
-template std::unique_ptr<groupby_scan_aggregation> make_max_aggregation<groupby_scan_aggregation>();
-template std::unique_ptr<reduce_aggregation> make_max_aggregation<reduce_aggregation>();
-template std::unique_ptr<scan_aggregation> make_max_aggregation<scan_aggregation>();
-template std::unique_ptr<segmented_reduce_aggregation>
+template CUDF_EXPORT std::unique_ptr<aggregation> make_max_aggregation<aggregation>();
+template CUDF_EXPORT std::unique_ptr<rolling_aggregation>
+make_max_aggregation<rolling_aggregation>();
+template CUDF_EXPORT std::unique_ptr<groupby_aggregation>
+make_max_aggregation<groupby_aggregation>();
+template CUDF_EXPORT std::unique_ptr<groupby_scan_aggregation>
+make_max_aggregation<groupby_scan_aggregation>();
+template CUDF_EXPORT std::unique_ptr<reduce_aggregation> make_max_aggregation<reduce_aggregation>();
+template CUDF_EXPORT std::unique_ptr<scan_aggregation> make_max_aggregation<scan_aggregation>();
+template CUDF_EXPORT std::unique_ptr<segmented_reduce_aggregation>
 make_max_aggregation<segmented_reduce_aggregation>();
 
 /// Factory to create a COUNT aggregation
@@ -485,14 +497,14 @@ std::unique_ptr<Base> make_count_aggregation(null_policy null_handling)
     (null_handling == null_policy::INCLUDE) ? aggregation::COUNT_ALL : aggregation::COUNT_VALID;
   return std::make_unique<detail::count_aggregation>(kind);
 }
-template std::unique_ptr<aggregation> make_count_aggregation<aggregation>(
-  null_policy null_handling);
-template std::unique_ptr<rolling_aggregation> make_count_aggregation<rolling_aggregation>(
-  null_policy null_handling);
-template std::unique_ptr<groupby_aggregation> make_count_aggregation<groupby_aggregation>(
-  null_policy null_handling);
-template std::unique_ptr<groupby_scan_aggregation> make_count_aggregation<groupby_scan_aggregation>(
+template CUDF_EXPORT std::unique_ptr<aggregation> make_count_aggregation<aggregation>(
   null_policy null_handling);
+template CUDF_EXPORT std::unique_ptr<rolling_aggregation>
+make_count_aggregation<rolling_aggregation>(null_policy null_handling);
+template CUDF_EXPORT std::unique_ptr<groupby_aggregation>
+make_count_aggregation<groupby_aggregation>(null_policy null_handling);
+template CUDF_EXPORT std::unique_ptr<groupby_scan_aggregation>
+make_count_aggregation<groupby_scan_aggregation>(null_policy null_handling);
 
 /// Factory to create a HISTOGRAM aggregation
 template <typename Base>
@@ -500,9 +512,11 @@ std::unique_ptr<Base> make_histogram_aggregation()
 {
   return std::make_unique<detail::histogram_aggregation>();
 }
-template std::unique_ptr<aggregation> make_histogram_aggregation<aggregation>();
-template std::unique_ptr<groupby_aggregation> make_histogram_aggregation<groupby_aggregation>();
-template std::unique_ptr<reduce_aggregation> make_histogram_aggregation<reduce_aggregation>();
+template CUDF_EXPORT std::unique_ptr<aggregation> make_histogram_aggregation<aggregation>();
+template CUDF_EXPORT std::unique_ptr<groupby_aggregation>
+make_histogram_aggregation<groupby_aggregation>();
+template CUDF_EXPORT std::unique_ptr<reduce_aggregation>
+make_histogram_aggregation<reduce_aggregation>();
 
 /// Factory to create a ANY aggregation
 template <typename Base>
@@ -510,9 +524,9 @@ std::unique_ptr<Base> make_any_aggregation()
 {
   return std::make_unique<detail::any_aggregation>();
 }
-template std::unique_ptr<aggregation> make_any_aggregation<aggregation>();
-template std::unique_ptr<reduce_aggregation> make_any_aggregation<reduce_aggregation>();
-template std::unique_ptr<segmented_reduce_aggregation>
+template CUDF_EXPORT std::unique_ptr<aggregation> make_any_aggregation<aggregation>();
+template CUDF_EXPORT std::unique_ptr<reduce_aggregation> make_any_aggregation<reduce_aggregation>();
+template CUDF_EXPORT std::unique_ptr<segmented_reduce_aggregation>
 make_any_aggregation<segmented_reduce_aggregation>();
 
 /// Factory to create a ALL aggregation
@@ -521,9 +535,9 @@ std::unique_ptr<Base> make_all_aggregation()
 {
   return std::make_unique<detail::all_aggregation>();
 }
-template std::unique_ptr<aggregation> make_all_aggregation<aggregation>();
-template std::unique_ptr<reduce_aggregation> make_all_aggregation<reduce_aggregation>();
-template std::unique_ptr<segmented_reduce_aggregation>
+template CUDF_EXPORT std::unique_ptr<aggregation> make_all_aggregation<aggregation>();
+template CUDF_EXPORT std::unique_ptr<reduce_aggregation> make_all_aggregation<reduce_aggregation>();
+template CUDF_EXPORT std::unique_ptr<segmented_reduce_aggregation>
 make_all_aggregation<segmented_reduce_aggregation>();
 
 /// Factory to create a SUM_OF_SQUARES aggregation
@@ -532,11 +546,12 @@ std::unique_ptr<Base> make_sum_of_squares_aggregation()
 {
   return std::make_unique<detail::sum_of_squares_aggregation>();
 }
-template std::unique_ptr<aggregation> make_sum_of_squares_aggregation<aggregation>();
-template std::unique_ptr<groupby_aggregation>
+template CUDF_EXPORT std::unique_ptr<aggregation> make_sum_of_squares_aggregation<aggregation>();
+template CUDF_EXPORT std::unique_ptr<groupby_aggregation>
 make_sum_of_squares_aggregation<groupby_aggregation>();
-template std::unique_ptr<reduce_aggregation> make_sum_of_squares_aggregation<reduce_aggregation>();
-template std::unique_ptr<segmented_reduce_aggregation>
+template CUDF_EXPORT std::unique_ptr<reduce_aggregation>
+make_sum_of_squares_aggregation<reduce_aggregation>();
+template CUDF_EXPORT std::unique_ptr<segmented_reduce_aggregation>
 make_sum_of_squares_aggregation<segmented_reduce_aggregation>();
 
 /// Factory to create a MEAN aggregation
@@ -545,11 +560,14 @@ std::unique_ptr<Base> make_mean_aggregation()
 {
   return std::make_unique<detail::mean_aggregation>();
 }
-template std::unique_ptr<aggregation> make_mean_aggregation<aggregation>();
-template std::unique_ptr<rolling_aggregation> make_mean_aggregation<rolling_aggregation>();
-template std::unique_ptr<groupby_aggregation> make_mean_aggregation<groupby_aggregation>();
-template std::unique_ptr<reduce_aggregation> make_mean_aggregation<reduce_aggregation>();
-template std::unique_ptr<segmented_reduce_aggregation>
+template CUDF_EXPORT std::unique_ptr<aggregation> make_mean_aggregation<aggregation>();
+template CUDF_EXPORT std::unique_ptr<rolling_aggregation>
+make_mean_aggregation<rolling_aggregation>();
+template CUDF_EXPORT std::unique_ptr<groupby_aggregation>
+make_mean_aggregation<groupby_aggregation>();
+template CUDF_EXPORT std::unique_ptr<reduce_aggregation>
+make_mean_aggregation<reduce_aggregation>();
+template CUDF_EXPORT std::unique_ptr<segmented_reduce_aggregation>
 make_mean_aggregation<segmented_reduce_aggregation>();
 
 /// Factory to create a M2 aggregation
@@ -558,8 +576,9 @@ std::unique_ptr<Base> make_m2_aggregation()
 {
   return std::make_unique<detail::m2_aggregation>();
 }
-template std::unique_ptr<aggregation> make_m2_aggregation<aggregation>();
-template std::unique_ptr<groupby_aggregation> make_m2_aggregation<groupby_aggregation>();
+template CUDF_EXPORT std::unique_ptr<aggregation> make_m2_aggregation<aggregation>();
+template CUDF_EXPORT std::unique_ptr<groupby_aggregation>
+make_m2_aggregation<groupby_aggregation>();
 
 /// Factory to create a VARIANCE aggregation
 template <typename Base>
@@ -567,14 +586,15 @@ std::unique_ptr<Base> make_variance_aggregation(size_type ddof)
 {
   return std::make_unique<detail::var_aggregation>(ddof);
 }
-template std::unique_ptr<aggregation> make_variance_aggregation<aggregation>(size_type ddof);
-template std::unique_ptr<rolling_aggregation> make_variance_aggregation<rolling_aggregation>(
-  size_type ddof);
-template std::unique_ptr<groupby_aggregation> make_variance_aggregation<groupby_aggregation>(
+template CUDF_EXPORT std::unique_ptr<aggregation> make_variance_aggregation<aggregation>(
   size_type ddof);
-template std::unique_ptr<reduce_aggregation> make_variance_aggregation<reduce_aggregation>(
-  size_type ddof);
-template std::unique_ptr<segmented_reduce_aggregation>
+template CUDF_EXPORT std::unique_ptr<rolling_aggregation>
+make_variance_aggregation<rolling_aggregation>(size_type ddof);
+template CUDF_EXPORT std::unique_ptr<groupby_aggregation>
+make_variance_aggregation<groupby_aggregation>(size_type ddof);
+template CUDF_EXPORT std::unique_ptr<reduce_aggregation>
+make_variance_aggregation<reduce_aggregation>(size_type ddof);
+template CUDF_EXPORT std::unique_ptr<segmented_reduce_aggregation>
 make_variance_aggregation<segmented_reduce_aggregation>(size_type ddof);
 
 /// Factory to create a STD aggregation
@@ -583,14 +603,14 @@ std::unique_ptr<Base> make_std_aggregation(size_type ddof)
 {
   return std::make_unique<detail::std_aggregation>(ddof);
 }
-template std::unique_ptr<aggregation> make_std_aggregation<aggregation>(size_type ddof);
-template std::unique_ptr<rolling_aggregation> make_std_aggregation<rolling_aggregation>(
+template CUDF_EXPORT std::unique_ptr<aggregation> make_std_aggregation<aggregation>(size_type ddof);
+template CUDF_EXPORT std::unique_ptr<rolling_aggregation> make_std_aggregation<rolling_aggregation>(
   size_type ddof);
-template std::unique_ptr<groupby_aggregation> make_std_aggregation<groupby_aggregation>(
+template CUDF_EXPORT std::unique_ptr<groupby_aggregation> make_std_aggregation<groupby_aggregation>(
   size_type ddof);
-template std::unique_ptr<reduce_aggregation> make_std_aggregation<reduce_aggregation>(
+template CUDF_EXPORT std::unique_ptr<reduce_aggregation> make_std_aggregation<reduce_aggregation>(
   size_type ddof);
-template std::unique_ptr<segmented_reduce_aggregation>
+template CUDF_EXPORT std::unique_ptr<segmented_reduce_aggregation>
 make_std_aggregation<segmented_reduce_aggregation>(size_type ddof);
 
 /// Factory to create a MEDIAN aggregation
@@ -599,9 +619,11 @@ std::unique_ptr<Base> make_median_aggregation()
 {
   return std::make_unique<detail::median_aggregation>();
 }
-template std::unique_ptr<aggregation> make_median_aggregation<aggregation>();
-template std::unique_ptr<groupby_aggregation> make_median_aggregation<groupby_aggregation>();
-template std::unique_ptr<reduce_aggregation> make_median_aggregation<reduce_aggregation>();
+template CUDF_EXPORT std::unique_ptr<aggregation> make_median_aggregation<aggregation>();
+template CUDF_EXPORT std::unique_ptr<groupby_aggregation>
+make_median_aggregation<groupby_aggregation>();
+template CUDF_EXPORT std::unique_ptr<reduce_aggregation>
+make_median_aggregation<reduce_aggregation>();
 
 /// Factory to create a QUANTILE aggregation
 template <typename Base>
@@ -610,12 +632,14 @@ std::unique_ptr<Base> make_quantile_aggregation(std::vector<double> const& quant
 {
   return std::make_unique<detail::quantile_aggregation>(quantiles, interp);
 }
-template std::unique_ptr<aggregation> make_quantile_aggregation<aggregation>(
-  std::vector<double> const& quantiles, interpolation interp);
-template std::unique_ptr<groupby_aggregation> make_quantile_aggregation<groupby_aggregation>(
-  std::vector<double> const& quantiles, interpolation interp);
-template std::unique_ptr<reduce_aggregation> make_quantile_aggregation<reduce_aggregation>(
+template CUDF_EXPORT std::unique_ptr<aggregation> make_quantile_aggregation<aggregation>(
   std::vector<double> const& quantiles, interpolation interp);
+template CUDF_EXPORT std::unique_ptr<groupby_aggregation>
+make_quantile_aggregation<groupby_aggregation>(std::vector<double> const& quantiles,
+                                               interpolation interp);
+template CUDF_EXPORT std::unique_ptr<reduce_aggregation>
+make_quantile_aggregation<reduce_aggregation>(std::vector<double> const& quantiles,
+                                              interpolation interp);
 
 /// Factory to create an ARGMAX aggregation
 template <typename Base>
@@ -623,9 +647,11 @@ std::unique_ptr<Base> make_argmax_aggregation()
 {
   return std::make_unique<detail::argmax_aggregation>();
 }
-template std::unique_ptr<aggregation> make_argmax_aggregation<aggregation>();
-template std::unique_ptr<rolling_aggregation> make_argmax_aggregation<rolling_aggregation>();
-template std::unique_ptr<groupby_aggregation> make_argmax_aggregation<groupby_aggregation>();
+template CUDF_EXPORT std::unique_ptr<aggregation> make_argmax_aggregation<aggregation>();
+template CUDF_EXPORT std::unique_ptr<rolling_aggregation>
+make_argmax_aggregation<rolling_aggregation>();
+template CUDF_EXPORT std::unique_ptr<groupby_aggregation>
+make_argmax_aggregation<groupby_aggregation>();
 
 /// Factory to create an ARGMIN aggregation
 template <typename Base>
@@ -633,9 +659,11 @@ std::unique_ptr<Base> make_argmin_aggregation()
 {
   return std::make_unique<detail::argmin_aggregation>();
 }
-template std::unique_ptr<aggregation> make_argmin_aggregation<aggregation>();
-template std::unique_ptr<rolling_aggregation> make_argmin_aggregation<rolling_aggregation>();
-template std::unique_ptr<groupby_aggregation> make_argmin_aggregation<groupby_aggregation>();
+template CUDF_EXPORT std::unique_ptr<aggregation> make_argmin_aggregation<aggregation>();
+template CUDF_EXPORT std::unique_ptr<rolling_aggregation>
+make_argmin_aggregation<rolling_aggregation>();
+template CUDF_EXPORT std::unique_ptr<groupby_aggregation>
+make_argmin_aggregation<groupby_aggregation>();
 
 /// Factory to create an NUNIQUE aggregation
 template <typename Base>
@@ -643,13 +671,13 @@ std::unique_ptr<Base> make_nunique_aggregation(null_policy null_handling)
 {
   return std::make_unique<detail::nunique_aggregation>(null_handling);
 }
-template std::unique_ptr<aggregation> make_nunique_aggregation<aggregation>(
-  null_policy null_handling);
-template std::unique_ptr<groupby_aggregation> make_nunique_aggregation<groupby_aggregation>(
+template CUDF_EXPORT std::unique_ptr<aggregation> make_nunique_aggregation<aggregation>(
   null_policy null_handling);
-template std::unique_ptr<reduce_aggregation> make_nunique_aggregation<reduce_aggregation>(
-  null_policy null_handling);
-template std::unique_ptr<segmented_reduce_aggregation>
+template CUDF_EXPORT std::unique_ptr<groupby_aggregation>
+make_nunique_aggregation<groupby_aggregation>(null_policy null_handling);
+template CUDF_EXPORT std::unique_ptr<reduce_aggregation>
+make_nunique_aggregation<reduce_aggregation>(null_policy null_handling);
+template CUDF_EXPORT std::unique_ptr<segmented_reduce_aggregation>
 make_nunique_aggregation<segmented_reduce_aggregation>(null_policy null_handling);
 
 /// Factory to create an NTH_ELEMENT aggregation
@@ -658,14 +686,14 @@ std::unique_ptr<Base> make_nth_element_aggregation(size_type n, null_policy null
 {
   return std::make_unique<detail::nth_element_aggregation>(n, null_handling);
 }
-template std::unique_ptr<aggregation> make_nth_element_aggregation<aggregation>(
-  size_type n, null_policy null_handling);
-template std::unique_ptr<groupby_aggregation> make_nth_element_aggregation<groupby_aggregation>(
-  size_type n, null_policy null_handling);
-template std::unique_ptr<reduce_aggregation> make_nth_element_aggregation<reduce_aggregation>(
-  size_type n, null_policy null_handling);
-template std::unique_ptr<rolling_aggregation> make_nth_element_aggregation<rolling_aggregation>(
+template CUDF_EXPORT std::unique_ptr<aggregation> make_nth_element_aggregation<aggregation>(
   size_type n, null_policy null_handling);
+template CUDF_EXPORT std::unique_ptr<groupby_aggregation>
+make_nth_element_aggregation<groupby_aggregation>(size_type n, null_policy null_handling);
+template CUDF_EXPORT std::unique_ptr<reduce_aggregation>
+make_nth_element_aggregation<reduce_aggregation>(size_type n, null_policy null_handling);
+template CUDF_EXPORT std::unique_ptr<rolling_aggregation>
+make_nth_element_aggregation<rolling_aggregation>(size_type n, null_policy null_handling);
 
 /// Factory to create a ROW_NUMBER aggregation
 template <typename Base>
@@ -673,8 +701,9 @@ std::unique_ptr<Base> make_row_number_aggregation()
 {
   return std::make_unique<detail::row_number_aggregation>();
 }
-template std::unique_ptr<aggregation> make_row_number_aggregation<aggregation>();
-template std::unique_ptr<rolling_aggregation> make_row_number_aggregation<rolling_aggregation>();
+template CUDF_EXPORT std::unique_ptr<aggregation> make_row_number_aggregation<aggregation>();
+template CUDF_EXPORT std::unique_ptr<rolling_aggregation>
+make_row_number_aggregation<rolling_aggregation>();
 
 /// Factory to create an EWMA aggregation
 template <typename Base>
@@ -682,9 +711,9 @@ std::unique_ptr<Base> make_ewma_aggregation(double const com, cudf::ewm_history
 {
   return std::make_unique<detail::ewma_aggregation>(com, history);
 }
-template std::unique_ptr<aggregation> make_ewma_aggregation<aggregation>(double const com,
-                                                                         cudf::ewm_history history);
-template std::unique_ptr<scan_aggregation> make_ewma_aggregation<scan_aggregation>(
+template CUDF_EXPORT std::unique_ptr<aggregation> make_ewma_aggregation<aggregation>(
+  double const com, cudf::ewm_history history);
+template CUDF_EXPORT std::unique_ptr<scan_aggregation> make_ewma_aggregation<scan_aggregation>(
   double const com, cudf::ewm_history history);
 
 /// Factory to create a RANK aggregation
@@ -698,19 +727,19 @@ std::unique_ptr<Base> make_rank_aggregation(rank_method method,
   return std::make_unique<detail::rank_aggregation>(
     method, column_order, null_handling, null_precedence, percentage);
 }
-template std::unique_ptr<aggregation> make_rank_aggregation<aggregation>(
+template CUDF_EXPORT std::unique_ptr<aggregation> make_rank_aggregation<aggregation>(
   rank_method method,
   order column_order,
   null_policy null_handling,
   null_order null_precedence,
   rank_percentage percentage);
-template std::unique_ptr<groupby_scan_aggregation> make_rank_aggregation<groupby_scan_aggregation>(
-  rank_method method,
-  order column_order,
-  null_policy null_handling,
-  null_order null_precedence,
-  rank_percentage percentage);
-template std::unique_ptr<scan_aggregation> make_rank_aggregation<scan_aggregation>(
+template CUDF_EXPORT std::unique_ptr<groupby_scan_aggregation>
+make_rank_aggregation<groupby_scan_aggregation>(rank_method method,
+                                                order column_order,
+                                                null_policy null_handling,
+                                                null_order null_precedence,
+                                                rank_percentage percentage);
+template CUDF_EXPORT std::unique_ptr<scan_aggregation> make_rank_aggregation<scan_aggregation>(
   rank_method method,
   order column_order,
   null_policy null_handling,
@@ -723,14 +752,14 @@ std::unique_ptr<Base> make_collect_list_aggregation(null_policy null_handling)
 {
   return std::make_unique<detail::collect_list_aggregation>(null_handling);
 }
-template std::unique_ptr<aggregation> make_collect_list_aggregation<aggregation>(
-  null_policy null_handling);
-template std::unique_ptr<rolling_aggregation> make_collect_list_aggregation<rolling_aggregation>(
-  null_policy null_handling);
-template std::unique_ptr<groupby_aggregation> make_collect_list_aggregation<groupby_aggregation>(
-  null_policy null_handling);
-template std::unique_ptr<reduce_aggregation> make_collect_list_aggregation<reduce_aggregation>(
+template CUDF_EXPORT std::unique_ptr<aggregation> make_collect_list_aggregation<aggregation>(
   null_policy null_handling);
+template CUDF_EXPORT std::unique_ptr<rolling_aggregation>
+make_collect_list_aggregation<rolling_aggregation>(null_policy null_handling);
+template CUDF_EXPORT std::unique_ptr<groupby_aggregation>
+make_collect_list_aggregation<groupby_aggregation>(null_policy null_handling);
+template CUDF_EXPORT std::unique_ptr<reduce_aggregation>
+make_collect_list_aggregation<reduce_aggregation>(null_policy null_handling);
 
 /// Factory to create a COLLECT_SET aggregation
 template <typename Base>
@@ -740,14 +769,20 @@ std::unique_ptr<Base> make_collect_set_aggregation(null_policy null_handling,
 {
   return std::make_unique<detail::collect_set_aggregation>(null_handling, nulls_equal, nans_equal);
 }
-template std::unique_ptr<aggregation> make_collect_set_aggregation<aggregation>(
-  null_policy null_handling, null_equality nulls_equal, nan_equality nans_equal);
-template std::unique_ptr<rolling_aggregation> make_collect_set_aggregation<rolling_aggregation>(
-  null_policy null_handling, null_equality nulls_equal, nan_equality nans_equal);
-template std::unique_ptr<groupby_aggregation> make_collect_set_aggregation<groupby_aggregation>(
-  null_policy null_handling, null_equality nulls_equal, nan_equality nans_equal);
-template std::unique_ptr<reduce_aggregation> make_collect_set_aggregation<reduce_aggregation>(
+template CUDF_EXPORT std::unique_ptr<aggregation> make_collect_set_aggregation<aggregation>(
   null_policy null_handling, null_equality nulls_equal, nan_equality nans_equal);
+template CUDF_EXPORT std::unique_ptr<rolling_aggregation>
+make_collect_set_aggregation<rolling_aggregation>(null_policy null_handling,
+                                                  null_equality nulls_equal,
+                                                  nan_equality nans_equal);
+template CUDF_EXPORT std::unique_ptr<groupby_aggregation>
+make_collect_set_aggregation<groupby_aggregation>(null_policy null_handling,
+                                                  null_equality nulls_equal,
+                                                  nan_equality nans_equal);
+template CUDF_EXPORT std::unique_ptr<reduce_aggregation>
+make_collect_set_aggregation<reduce_aggregation>(null_policy null_handling,
+                                                 null_equality nulls_equal,
+                                                 nan_equality nans_equal);
 
 /// Factory to create a LAG aggregation
 template <typename Base>
@@ -755,8 +790,9 @@ std::unique_ptr<Base> make_lag_aggregation(size_type offset)
 {
   return std::make_unique<detail::lead_lag_aggregation>(aggregation::LAG, offset);
 }
-template std::unique_ptr<aggregation> make_lag_aggregation<aggregation>(size_type offset);
-template std::unique_ptr<rolling_aggregation> make_lag_aggregation<rolling_aggregation>(
+template CUDF_EXPORT std::unique_ptr<aggregation> make_lag_aggregation<aggregation>(
+  size_type offset);
+template CUDF_EXPORT std::unique_ptr<rolling_aggregation> make_lag_aggregation<rolling_aggregation>(
   size_type offset);
 
 /// Factory to create a LEAD aggregation
@@ -765,9 +801,10 @@ std::unique_ptr<Base> make_lead_aggregation(size_type offset)
 {
   return std::make_unique<detail::lead_lag_aggregation>(aggregation::LEAD, offset);
 }
-template std::unique_ptr<aggregation> make_lead_aggregation<aggregation>(size_type offset);
-template std::unique_ptr<rolling_aggregation> make_lead_aggregation<rolling_aggregation>(
+template CUDF_EXPORT std::unique_ptr<aggregation> make_lead_aggregation<aggregation>(
   size_type offset);
+template CUDF_EXPORT std::unique_ptr<rolling_aggregation>
+make_lead_aggregation<rolling_aggregation>(size_type offset);
 
 /// Factory to create a UDF aggregation
 template <typename Base>
@@ -781,9 +818,9 @@ std::unique_ptr<Base> make_udf_aggregation(udf_type type,
                                 output_type};
   return std::unique_ptr<detail::udf_aggregation>(a);
 }
-template std::unique_ptr<aggregation> make_udf_aggregation<aggregation>(
+template CUDF_EXPORT std::unique_ptr<aggregation> make_udf_aggregation<aggregation>(
   udf_type type, std::string const& user_defined_aggregator, data_type output_type);
-template std::unique_ptr<rolling_aggregation> make_udf_aggregation<rolling_aggregation>(
+template CUDF_EXPORT std::unique_ptr<rolling_aggregation> make_udf_aggregation<rolling_aggregation>(
   udf_type type, std::string const& user_defined_aggregator, data_type output_type);
 
 /// Factory to create a MERGE_LISTS aggregation
@@ -792,9 +829,11 @@ std::unique_ptr<Base> make_merge_lists_aggregation()
 {
   return std::make_unique<detail::merge_lists_aggregation>();
 }
-template std::unique_ptr<aggregation> make_merge_lists_aggregation<aggregation>();
-template std::unique_ptr<groupby_aggregation> make_merge_lists_aggregation<groupby_aggregation>();
-template std::unique_ptr<reduce_aggregation> make_merge_lists_aggregation<reduce_aggregation>();
+template CUDF_EXPORT std::unique_ptr<aggregation> make_merge_lists_aggregation<aggregation>();
+template CUDF_EXPORT std::unique_ptr<groupby_aggregation>
+make_merge_lists_aggregation<groupby_aggregation>();
+template CUDF_EXPORT std::unique_ptr<reduce_aggregation>
+make_merge_lists_aggregation<reduce_aggregation>();
 
 /// Factory to create a MERGE_SETS aggregation
 template <typename Base>
@@ -803,12 +842,12 @@ std::unique_ptr<Base> make_merge_sets_aggregation(null_equality nulls_equal,
 {
   return std::make_unique<detail::merge_sets_aggregation>(nulls_equal, nans_equal);
 }
-template std::unique_ptr<aggregation> make_merge_sets_aggregation<aggregation>(null_equality,
-                                                                               nan_equality);
-template std::unique_ptr<groupby_aggregation> make_merge_sets_aggregation<groupby_aggregation>(
-  null_equality, nan_equality);
-template std::unique_ptr<reduce_aggregation> make_merge_sets_aggregation<reduce_aggregation>(
+template CUDF_EXPORT std::unique_ptr<aggregation> make_merge_sets_aggregation<aggregation>(
   null_equality, nan_equality);
+template CUDF_EXPORT std::unique_ptr<groupby_aggregation>
+  make_merge_sets_aggregation<groupby_aggregation>(null_equality, nan_equality);
+template CUDF_EXPORT std::unique_ptr<reduce_aggregation>
+  make_merge_sets_aggregation<reduce_aggregation>(null_equality, nan_equality);
 
 /// Factory to create a MERGE_M2 aggregation
 template <typename Base>
@@ -816,8 +855,9 @@ std::unique_ptr<Base> make_merge_m2_aggregation()
 {
   return std::make_unique<detail::merge_m2_aggregation>();
 }
-template std::unique_ptr<aggregation> make_merge_m2_aggregation<aggregation>();
-template std::unique_ptr<groupby_aggregation> make_merge_m2_aggregation<groupby_aggregation>();
+template CUDF_EXPORT std::unique_ptr<aggregation> make_merge_m2_aggregation<aggregation>();
+template CUDF_EXPORT std::unique_ptr<groupby_aggregation>
+make_merge_m2_aggregation<groupby_aggregation>();
 
 /// Factory to create a MERGE_HISTOGRAM aggregation
 template <typename Base>
@@ -825,10 +865,11 @@ std::unique_ptr<Base> make_merge_histogram_aggregation()
 {
   return std::make_unique<detail::merge_histogram_aggregation>();
 }
-template std::unique_ptr<aggregation> make_merge_histogram_aggregation<aggregation>();
-template std::unique_ptr<groupby_aggregation>
+template CUDF_EXPORT std::unique_ptr<aggregation> make_merge_histogram_aggregation<aggregation>();
+template CUDF_EXPORT std::unique_ptr<groupby_aggregation>
 make_merge_histogram_aggregation<groupby_aggregation>();
-template std::unique_ptr<reduce_aggregation> make_merge_histogram_aggregation<reduce_aggregation>();
+template CUDF_EXPORT std::unique_ptr<reduce_aggregation>
+make_merge_histogram_aggregation<reduce_aggregation>();
 
 /// Factory to create a COVARIANCE aggregation
 template <typename Base>
@@ -836,10 +877,10 @@ std::unique_ptr<Base> make_covariance_aggregation(size_type min_periods, size_ty
 {
   return std::make_unique<detail::covariance_aggregation>(min_periods, ddof);
 }
-template std::unique_ptr<aggregation> make_covariance_aggregation<aggregation>(
-  size_type min_periods, size_type ddof);
-template std::unique_ptr<groupby_aggregation> make_covariance_aggregation<groupby_aggregation>(
+template CUDF_EXPORT std::unique_ptr<aggregation> make_covariance_aggregation<aggregation>(
   size_type min_periods, size_type ddof);
+template CUDF_EXPORT std::unique_ptr<groupby_aggregation>
+make_covariance_aggregation<groupby_aggregation>(size_type min_periods, size_type ddof);
 
 /// Factory to create a CORRELATION aggregation
 template <typename Base>
@@ -847,33 +888,34 @@ std::unique_ptr<Base> make_correlation_aggregation(correlation_type type, size_t
 {
   return std::make_unique<detail::correlation_aggregation>(type, min_periods);
 }
-template std::unique_ptr<aggregation> make_correlation_aggregation<aggregation>(
-  correlation_type type, size_type min_periods);
-template std::unique_ptr<groupby_aggregation> make_correlation_aggregation<groupby_aggregation>(
+template CUDF_EXPORT std::unique_ptr<aggregation> make_correlation_aggregation<aggregation>(
   correlation_type type, size_type min_periods);
+template CUDF_EXPORT std::unique_ptr<groupby_aggregation>
+make_correlation_aggregation<groupby_aggregation>(correlation_type type, size_type min_periods);
 
 template <typename Base>
 std::unique_ptr<Base> make_tdigest_aggregation(int max_centroids)
 {
   return std::make_unique<detail::tdigest_aggregation>(max_centroids);
 }
-template std::unique_ptr<aggregation> make_tdigest_aggregation<aggregation>(int max_centroids);
-template std::unique_ptr<groupby_aggregation> make_tdigest_aggregation<groupby_aggregation>(
-  int max_centroids);
-template std::unique_ptr<reduce_aggregation> make_tdigest_aggregation<reduce_aggregation>(
+template CUDF_EXPORT std::unique_ptr<aggregation> make_tdigest_aggregation<aggregation>(
   int max_centroids);
+template CUDF_EXPORT std::unique_ptr<groupby_aggregation>
+make_tdigest_aggregation<groupby_aggregation>(int max_centroids);
+template CUDF_EXPORT std::unique_ptr<reduce_aggregation>
+make_tdigest_aggregation<reduce_aggregation>(int max_centroids);
 
 template <typename Base>
 std::unique_ptr<Base> make_merge_tdigest_aggregation(int max_centroids)
 {
   return std::make_unique<detail::merge_tdigest_aggregation>(max_centroids);
 }
-template std::unique_ptr<aggregation> make_merge_tdigest_aggregation<aggregation>(
-  int max_centroids);
-template std::unique_ptr<groupby_aggregation> make_merge_tdigest_aggregation<groupby_aggregation>(
-  int max_centroids);
-template std::unique_ptr<reduce_aggregation> make_merge_tdigest_aggregation<reduce_aggregation>(
+template CUDF_EXPORT std::unique_ptr<aggregation> make_merge_tdigest_aggregation<aggregation>(
   int max_centroids);
+template CUDF_EXPORT std::unique_ptr<groupby_aggregation>
+make_merge_tdigest_aggregation<groupby_aggregation>(int max_centroids);
+template CUDF_EXPORT std::unique_ptr<reduce_aggregation>
+make_merge_tdigest_aggregation<reduce_aggregation>(int max_centroids);
 
 namespace detail {
 namespace {
diff --git a/cpp/src/binaryop/compiled/binary_ops.cu b/cpp/src/binaryop/compiled/binary_ops.cu
index ba0253ec853..7a0bc312434 100644
--- a/cpp/src/binaryop/compiled/binary_ops.cu
+++ b/cpp/src/binaryop/compiled/binary_ops.cu
@@ -18,6 +18,7 @@
 #include "operation.cuh"
 #include "struct_binary_ops.cuh"
 
+#include <cudf/binaryop.hpp>
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/structs/utilities.hpp>
diff --git a/cpp/src/bitmask/is_element_valid.cpp b/cpp/src/bitmask/is_element_valid.cpp
index e0f0ccdc861..4806c7a94e8 100644
--- a/cpp/src/bitmask/is_element_valid.cpp
+++ b/cpp/src/bitmask/is_element_valid.cpp
@@ -1,6 +1,5 @@
-
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,7 +14,7 @@
  * limitations under the License.
  */
 
-#include <cudf/column/column_view.hpp>
+#include <cudf/detail/is_element_valid.hpp>
 #include <cudf/utilities/bit.hpp>
 #include <cudf/utilities/error.hpp>
 
diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu
index 4be3054b3dc..ac9931335ff 100644
--- a/cpp/src/copying/concatenate.cu
+++ b/cpp/src/copying/concatenate.cu
@@ -16,6 +16,7 @@
 
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_device_view.cuh>
+#include <cudf/concatenate.hpp>
 #include <cudf/detail/concatenate_masks.hpp>
 #include <cudf/detail/copy.hpp>
 #include <cudf/detail/get_value.cuh>
diff --git a/cpp/src/copying/purge_nonempty_nulls.cu b/cpp/src/copying/purge_nonempty_nulls.cu
index d69d214a881..581d0a00924 100644
--- a/cpp/src/copying/purge_nonempty_nulls.cu
+++ b/cpp/src/copying/purge_nonempty_nulls.cu
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 #include <cudf/copying.hpp>
+#include <cudf/detail/copy.hpp>
 #include <cudf/detail/gather.cuh>
 #include <cudf/detail/offsets_iterator_factory.cuh>
 #include <cudf/utilities/default_stream.hpp>
diff --git a/cpp/src/dictionary/set_keys.cu b/cpp/src/dictionary/set_keys.cu
index 08a33d40abe..cf40fda5971 100644
--- a/cpp/src/dictionary/set_keys.cu
+++ b/cpp/src/dictionary/set_keys.cu
@@ -27,6 +27,7 @@
 #include <cudf/dictionary/detail/iterator.cuh>
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/dictionary/dictionary_factories.hpp>
+#include <cudf/dictionary/update_keys.hpp>
 #include <cudf/stream_compaction.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
diff --git a/cpp/src/filling/calendrical_month_sequence.cu b/cpp/src/filling/calendrical_month_sequence.cu
index 3e6d693dde5..f984f307ddd 100644
--- a/cpp/src/filling/calendrical_month_sequence.cu
+++ b/cpp/src/filling/calendrical_month_sequence.cu
@@ -17,6 +17,7 @@
 #include <cudf/column/column.hpp>
 #include <cudf/detail/calendrical_month_sequence.cuh>
 #include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/filling.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
diff --git a/cpp/src/io/comp/gpuinflate.hpp b/cpp/src/io/comp/gpuinflate.hpp
index 5908b77c98b..8bfca2b30df 100644
--- a/cpp/src/io/comp/gpuinflate.hpp
+++ b/cpp/src/io/comp/gpuinflate.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,6 +17,7 @@
 #pragma once
 
 #include <cudf/io/types.hpp>
+#include <cudf/utilities/export.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -73,6 +74,7 @@ constexpr std::size_t BUFFER_PADDING_MULTIPLE{8};
  * @param[in] parse_hdr Whether or not to parse GZIP header
  * @param[in] stream CUDA stream to use
  */
+CUDF_EXPORT
 void gpuinflate(device_span<device_span<uint8_t const> const> inputs,
                 device_span<device_span<uint8_t> const> outputs,
                 device_span<compression_result> results,
@@ -101,6 +103,7 @@ void gpu_copy_uncompressed_blocks(device_span<device_span<uint8_t const> const>
  * @param[out] results List of output status structures
  * @param[in] stream CUDA stream to use
  */
+CUDF_EXPORT
 void gpu_unsnap(device_span<device_span<uint8_t const> const> inputs,
                 device_span<device_span<uint8_t> const> outputs,
                 device_span<compression_result> results,
@@ -113,6 +116,7 @@ void gpu_unsnap(device_span<device_span<uint8_t const> const> inputs,
  *
  * @return The size in bytes of required temporary memory
  */
+CUDF_EXPORT
 size_t get_gpu_debrotli_scratch_size(int max_num_inputs = 0);
 
 /**
@@ -128,6 +132,7 @@ size_t get_gpu_debrotli_scratch_size(int max_num_inputs = 0);
  * @param[in] scratch_size Size in bytes of the temporary memory
  * @param[in] stream CUDA stream to use
  */
+CUDF_EXPORT
 void gpu_debrotli(device_span<device_span<uint8_t const> const> inputs,
                   device_span<device_span<uint8_t> const> outputs,
                   device_span<compression_result> results,
diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp
index 6d2834206d4..62c3c5cd245 100644
--- a/cpp/src/io/functions.cpp
+++ b/cpp/src/io/functions.cpp
@@ -41,6 +41,7 @@
 #include <algorithm>
 
 namespace cudf::io {
+
 // Returns builder for csv_reader_options
 csv_reader_options_builder csv_reader_options::builder(source_info src)
 {
@@ -472,6 +473,8 @@ chunked_orc_reader::chunked_orc_reader(std::size_t chunk_read_limit,
 {
 }
 
+chunked_orc_reader::chunked_orc_reader() = default;
+
 // This destructor destroys the internal reader instance.
 // Since the declaration of the internal `reader` object does not exist in the header, this
 // destructor needs to be defined in a separate source file which can access to that object's
@@ -492,6 +495,10 @@ table_with_metadata chunked_orc_reader::read_chunk() const
   return reader->read_chunk();
 }
 
+orc_chunked_writer::orc_chunked_writer() = default;
+
+orc_chunked_writer::~orc_chunked_writer() = default;
+
 /**
  * @copydoc cudf::io::orc_chunked_writer::orc_chunked_writer
  */
@@ -618,6 +625,8 @@ std::unique_ptr<std::vector<uint8_t>> write_parquet(parquet_writer_options const
   return writer->close(options.get_column_chunks_file_paths());
 }
 
+chunked_parquet_reader::chunked_parquet_reader() = default;
+
 /**
  * @copydoc cudf::io::chunked_parquet_reader::chunked_parquet_reader
  */
@@ -672,6 +681,8 @@ table_with_metadata chunked_parquet_reader::read_chunk() const
   return reader->read_chunk();
 }
 
+parquet_chunked_writer::parquet_chunked_writer() = default;
+
 /**
  * @copydoc cudf::io::parquet_chunked_writer::parquet_chunked_writer
  */
@@ -686,6 +697,8 @@ parquet_chunked_writer::parquet_chunked_writer(chunked_parquet_writer_options co
     std::move(sinks), options, io_detail::single_write_mode::NO, stream);
 }
 
+parquet_chunked_writer::~parquet_chunked_writer() = default;
+
 /**
  * @copydoc cudf::io::parquet_chunked_writer::write
  */
diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp
index e12892a2d50..20c143f66c7 100644
--- a/cpp/src/io/json/nested_json.hpp
+++ b/cpp/src/io/json/nested_json.hpp
@@ -21,6 +21,7 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/bit.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/resource_ref.hpp>
 
@@ -28,10 +29,12 @@
 #include <vector>
 
 // Forward declaration of parse_options from parsing_utils.cuh
-namespace cudf::io {
+namespace cudf {
+namespace io {
+
 struct parse_options;
-}
-namespace cudf::io::json {
+
+namespace json {
 
 /**
  * @brief Struct that encapsulate all information of a columnar tree representation.
@@ -201,6 +204,7 @@ namespace detail {
  * @param[in] delimiter Specifies the delimiter to use as separator for JSON lines input
  * @param[in] stream The cuda stream to dispatch GPU kernels to
  */
+CUDF_EXPORT
 void get_stack_context(device_span<SymbolT const> json_in,
                        SymbolT* d_top_of_stack,
                        stack_behavior_t stack_behavior,
@@ -216,6 +220,7 @@ void get_stack_context(device_span<SymbolT const> json_in,
  * @param stream The cuda stream to dispatch GPU kernels to
  * @return Returns the post-processed token stream
  */
+CUDF_EXPORT
 std::pair<rmm::device_uvector<PdaTokenT>, rmm::device_uvector<SymbolOffsetT>> process_token_stream(
   device_span<PdaTokenT const> tokens,
   device_span<SymbolOffsetT const> token_indices,
@@ -232,6 +237,7 @@ std::pair<rmm::device_uvector<PdaTokenT>, rmm::device_uvector<SymbolOffsetT>> pr
  * @return A tree representation of the input JSON string as vectors of node type, parent index,
  * level, begin index, and end index in the input JSON string
  */
+CUDF_EXPORT
 tree_meta_t get_tree_representation(device_span<PdaTokenT const> tokens,
                                     device_span<SymbolOffsetT const> token_indices,
                                     bool is_strict_nested_boundaries,
@@ -251,6 +257,7 @@ tree_meta_t get_tree_representation(device_span<PdaTokenT const> tokens,
  * @param mr Optional, resource with which to allocate
  * @return A tuple of the output column indices and the row offsets within each column for each node
  */
+CUDF_EXPORT
 std::tuple<rmm::device_uvector<NodeIndexT>, rmm::device_uvector<size_type>>
 records_orient_tree_traversal(device_span<SymbolT const> d_input,
                               tree_meta_t const& d_tree,
@@ -315,6 +322,7 @@ cudf::io::parse_options parsing_options(cudf::io::json_reader_options const& opt
  * @param mr Optional, resource with which to allocate
  * @return The data parsed from the given JSON input
  */
+CUDF_EXPORT
 table_with_metadata device_parse_nested_json(device_span<SymbolT const> input,
                                              cudf::io::json_reader_options const& options,
                                              rmm::cuda_stream_view stream,
@@ -348,4 +356,6 @@ struct path_from_tree {
 
 }  // namespace detail
 
-}  // namespace cudf::io::json
+}  // namespace json
+}  // namespace io
+}  // namespace cudf
diff --git a/cpp/src/io/json/read_json.hpp b/cpp/src/io/json/read_json.hpp
index ff69f9b7627..32de4ebabfa 100644
--- a/cpp/src/io/json/read_json.hpp
+++ b/cpp/src/io/json/read_json.hpp
@@ -19,6 +19,7 @@
 #include <cudf/io/datasource.hpp>
 #include <cudf/io/json.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/export.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -27,7 +28,8 @@
 
 #include <memory>
 
-namespace cudf::io::json::detail {
+namespace CUDF_EXPORT cudf {
+namespace io::json::detail {
 
 // Some magic numbers
 constexpr int num_subchunks               = 10;  // per chunk_size
@@ -51,4 +53,5 @@ size_type find_first_delimiter(device_span<char const> d_data,
                                char const delimiter,
                                rmm::cuda_stream_view stream);
 
-}  // namespace cudf::io::json::detail
+}  // namespace io::json::detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/src/io/parquet/compact_protocol_reader.hpp b/cpp/src/io/parquet/compact_protocol_reader.hpp
index bcc9adfc8c0..12c24e2b848 100644
--- a/cpp/src/io/parquet/compact_protocol_reader.hpp
+++ b/cpp/src/io/parquet/compact_protocol_reader.hpp
@@ -18,6 +18,8 @@
 
 #include "parquet.hpp"
 
+#include <cudf/utilities/export.hpp>
+
 #include <algorithm>
 #include <cstddef>
 #include <optional>
@@ -25,7 +27,8 @@
 #include <utility>
 #include <vector>
 
-namespace cudf::io::parquet::detail {
+namespace CUDF_EXPORT cudf {
+namespace io::parquet::detail {
 
 /**
  * @brief Class for parsing Parquet's Thrift Compact Protocol encoded metadata
@@ -149,4 +152,5 @@ class CompactProtocolReader {
   friend class parquet_field_struct_blob;
 };
 
-}  // namespace cudf::io::parquet::detail
+}  // namespace io::parquet::detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/src/io/utilities/base64_utilities.hpp b/cpp/src/io/utilities/base64_utilities.hpp
index 537d9c96d6b..b1eb120c47f 100644
--- a/cpp/src/io/utilities/base64_utilities.hpp
+++ b/cpp/src/io/utilities/base64_utilities.hpp
@@ -61,10 +61,13 @@
 // altered: applying clang-format for libcudf on this file.
 
 // altered: include required headers
+#include <cudf/utilities/export.hpp>
+
 #include <string>
 
 // altered: use cudf namespaces
-namespace cudf::io::detail {
+namespace CUDF_EXPORT cudf {
+namespace io::detail {
 
 /**
  * @brief Encodes input string to base64 and returns it
@@ -84,4 +87,5 @@ std::string base64_encode(std::string_view string_to_encode);
  */
 std::string base64_decode(std::string_view encoded_string);
 
-}  // namespace cudf::io::detail
+}  // namespace io::detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu
index aa1b29a101f..73362334e26 100644
--- a/cpp/src/io/utilities/data_casting.cu
+++ b/cpp/src/io/utilities/data_casting.cu
@@ -20,11 +20,11 @@
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
-#include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/offsets_iterator_factory.cuh>
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/detail/utilities/integer_utils.hpp>
+#include <cudf/null_mask.hpp>
 #include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/detail/utf8.hpp>
 #include <cudf/types.hpp>
@@ -933,7 +933,7 @@ std::unique_ptr<column> parse_data(
   auto d_null_count    = rmm::device_scalar<size_type>(null_count, stream);
   auto null_count_data = d_null_count.data();
   if (null_mask.is_empty()) {
-    null_mask = cudf::detail::create_null_mask(col_size, mask_state::ALL_VALID, stream, mr);
+    null_mask = cudf::create_null_mask(col_size, mask_state::ALL_VALID, stream, mr);
   }
 
   // Prepare iterator that returns (string_ptr, string_length)-pairs needed by type conversion
diff --git a/cpp/src/io/utilities/file_io_utilities.hpp b/cpp/src/io/utilities/file_io_utilities.hpp
index 441bede200d..7e47b5b3d10 100644
--- a/cpp/src/io/utilities/file_io_utilities.hpp
+++ b/cpp/src/io/utilities/file_io_utilities.hpp
@@ -25,6 +25,7 @@
 
 #include <cudf/io/datasource.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/export.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 
@@ -211,7 +212,7 @@ std::unique_ptr<cufile_output_impl> make_cufile_output(std::string const& filepa
 /**
  * @brief Byte range to be read/written in a single operation.
  */
-struct file_io_slice {
+CUDF_EXPORT struct file_io_slice {
   size_t offset;
   size_t size;
 };
@@ -221,7 +222,7 @@ struct file_io_slice {
  *
  * If `max_slice_size` is below 1024, 1024 will be used instead to prevent potential misuse.
  */
-std::vector<file_io_slice> make_file_io_slices(size_t size, size_t max_slice_size);
+CUDF_EXPORT std::vector<file_io_slice> make_file_io_slices(size_t size, size_t max_slice_size);
 
 }  // namespace detail
 }  // namespace io
diff --git a/cpp/src/io/utilities/row_selection.hpp b/cpp/src/io/utilities/row_selection.hpp
index 7fdcc65d77b..7c607099cdc 100644
--- a/cpp/src/io/utilities/row_selection.hpp
+++ b/cpp/src/io/utilities/row_selection.hpp
@@ -21,7 +21,8 @@
 #include <optional>
 #include <utility>
 
-namespace cudf::io::detail {
+namespace CUDF_EXPORT cudf {
+namespace io::detail {
 
 /**
  * @brief Adjusts the input skip_rows and num_rows options to the actual number of rows to
@@ -38,4 +39,5 @@ std::pair<int64_t, int64_t> skip_rows_num_rows_from_options(int64_t skip_rows,
                                                             std::optional<int64_t> const& num_rows,
                                                             int64_t num_source_rows);
 
-}  // namespace cudf::io::detail
+}  // namespace io::detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/src/io/utilities/string_parsing.hpp b/cpp/src/io/utilities/string_parsing.hpp
index 3e6f57f2896..0d9e7e40e4e 100644
--- a/cpp/src/io/utilities/string_parsing.hpp
+++ b/cpp/src/io/utilities/string_parsing.hpp
@@ -18,6 +18,7 @@
 #include "io/utilities/parsing_utils.cuh"
 
 #include <cudf/types.hpp>
+#include <cudf/utilities/export.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -43,7 +44,7 @@ namespace detail {
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @return The inferred data type
  */
-cudf::data_type infer_data_type(
+CUDF_EXPORT cudf::data_type infer_data_type(
   cudf::io::json_inference_options_view const& options,
   device_span<char const> data,
   thrust::zip_iterator<thrust::tuple<size_type const*, size_type const*>> offset_length_begin,
@@ -66,7 +67,7 @@ namespace json::detail {
  * @param mr The resource to be used for device memory allocation
  * @return The column that contains the parsed data
  */
-std::unique_ptr<column> parse_data(
+CUDF_EXPORT std::unique_ptr<column> parse_data(
   char const* data,
   thrust::zip_iterator<thrust::tuple<size_type const*, size_type const*>> offset_length_begin,
   size_type col_size,
diff --git a/cpp/src/io/utilities/trie.cuh b/cpp/src/io/utilities/trie.cuh
index 677743d77d0..caea8dabb88 100644
--- a/cpp/src/io/utilities/trie.cuh
+++ b/cpp/src/io/utilities/trie.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,6 +21,7 @@
 
 #pragma once
 
+#include <cudf/utilities/export.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <optional>
@@ -67,7 +68,8 @@ inline trie_view make_trie_view(optional_trie const& t)
  *
  * @return A host vector of nodes representing the serialized trie
  */
-trie create_serialized_trie(std::vector<std::string> const& keys, rmm::cuda_stream_view stream);
+CUDF_EXPORT trie create_serialized_trie(std::vector<std::string> const& keys,
+                                        rmm::cuda_stream_view stream);
 
 /*
  * @brief Searches for a string in a serialized trie.
diff --git a/cpp/src/jit/parser.hpp b/cpp/src/jit/parser.hpp
index 55528bed6cf..85c8d63192f 100644
--- a/cpp/src/jit/parser.hpp
+++ b/cpp/src/jit/parser.hpp
@@ -16,12 +16,14 @@
 
 #pragma once
 
+#include <cudf/utilities/export.hpp>
+
 #include <map>
 #include <set>
 #include <string>
 #include <vector>
 
-namespace cudf {
+namespace CUDF_EXPORT cudf {
 namespace jit {
 /**
  * @brief Parse and transform a piece of PTX code that contains the implementation
@@ -239,4 +241,4 @@ inline std::string parse_single_function_ptx(std::string const& src,
 std::string parse_single_function_cuda(std::string const& src, std::string const& function_name);
 
 }  // namespace jit
-}  // namespace cudf
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/src/lists/contains.cu b/cpp/src/lists/contains.cu
index f03d394d6d7..30c03a8cd68 100644
--- a/cpp/src/lists/contains.cu
+++ b/cpp/src/lists/contains.cu
@@ -18,6 +18,7 @@
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/valid_if.cuh>
+#include <cudf/lists/contains.hpp>
 #include <cudf/lists/detail/contains.hpp>
 #include <cudf/lists/detail/lists_column_factories.hpp>
 #include <cudf/lists/list_device_view.cuh>
diff --git a/cpp/src/lists/copying/concatenate.cu b/cpp/src/lists/copying/concatenate.cu
index 3d609a262b9..8cd58e7eff2 100644
--- a/cpp/src/lists/copying/concatenate.cu
+++ b/cpp/src/lists/copying/concatenate.cu
@@ -23,6 +23,7 @@
 #include <cudf/detail/get_value.cuh>
 #include <cudf/detail/null_mask.cuh>
 #include <cudf/detail/null_mask.hpp>
+#include <cudf/lists/detail/concatenate.hpp>
 #include <cudf/lists/lists_column_view.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
diff --git a/cpp/src/lists/copying/segmented_gather.cu b/cpp/src/lists/copying/segmented_gather.cu
index 779eca438db..90f7994b21d 100644
--- a/cpp/src/lists/copying/segmented_gather.cu
+++ b/cpp/src/lists/copying/segmented_gather.cu
@@ -20,6 +20,7 @@
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/lists/detail/gather.cuh>
+#include <cudf/lists/gather.hpp>
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
diff --git a/cpp/src/lists/set_operations.cu b/cpp/src/lists/set_operations.cu
index 1d18b8c677c..5c7ab68d64b 100644
--- a/cpp/src/lists/set_operations.cu
+++ b/cpp/src/lists/set_operations.cu
@@ -26,6 +26,7 @@
 #include <cudf/lists/detail/combine.hpp>
 #include <cudf/lists/detail/set_operations.hpp>
 #include <cudf/lists/detail/stream_compaction.hpp>
+#include <cudf/lists/set_operations.hpp>
 #include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
diff --git a/cpp/src/lists/stream_compaction/distinct.cu b/cpp/src/lists/stream_compaction/distinct.cu
index 40dee010bd5..cdcb4aa957f 100644
--- a/cpp/src/lists/stream_compaction/distinct.cu
+++ b/cpp/src/lists/stream_compaction/distinct.cu
@@ -22,6 +22,7 @@
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/stream_compaction.hpp>
 #include <cudf/lists/lists_column_view.hpp>
+#include <cudf/lists/stream_compaction.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 
diff --git a/cpp/src/merge/merge.cu b/cpp/src/merge/merge.cu
index 7ecaa0fba56..e2c8d49a4ab 100644
--- a/cpp/src/merge/merge.cu
+++ b/cpp/src/merge/merge.cu
@@ -27,6 +27,7 @@
 #include <cudf/dictionary/detail/update_keys.hpp>
 #include <cudf/lists/detail/concatenate.hpp>
 #include <cudf/lists/lists_column_view.hpp>
+#include <cudf/merge.hpp>
 #include <cudf/strings/detail/merge.hpp>
 #include <cudf/structs/structs_column_view.hpp>
 #include <cudf/table/experimental/row_operators.cuh>
diff --git a/cpp/src/partitioning/round_robin.cu b/cpp/src/partitioning/round_robin.cu
index 82b169c78ed..9810373b751 100644
--- a/cpp/src/partitioning/round_robin.cu
+++ b/cpp/src/partitioning/round_robin.cu
@@ -20,6 +20,7 @@
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/null_mask.hpp>
+#include <cudf/partitioning.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_device_view.cuh>
 #include <cudf/types.hpp>
@@ -271,8 +272,8 @@ std::pair<std::unique_ptr<table>, std::vector<cudf::size_type>> round_robin_part
 std::pair<std::unique_ptr<cudf::table>, std::vector<cudf::size_type>> round_robin_partition(
   table_view const& input,
   cudf::size_type num_partitions,
-  cudf::size_type start_partition   = 0,
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
+  cudf::size_type start_partition,
+  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
   return detail::round_robin_partition(
diff --git a/cpp/src/quantiles/quantile.cu b/cpp/src/quantiles/quantile.cu
index b25254cfe49..5d748de0019 100644
--- a/cpp/src/quantiles/quantile.cu
+++ b/cpp/src/quantiles/quantile.cu
@@ -25,6 +25,7 @@
 #include <cudf/detail/valid_if.cuh>
 #include <cudf/dictionary/detail/iterator.cuh>
 #include <cudf/dictionary/dictionary_column_view.hpp>
+#include <cudf/quantiles.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
diff --git a/cpp/src/quantiles/quantiles.cu b/cpp/src/quantiles/quantiles.cu
index af3bda2e62e..0b0e6701304 100644
--- a/cpp/src/quantiles/quantiles.cu
+++ b/cpp/src/quantiles/quantiles.cu
@@ -21,6 +21,7 @@
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/sorting.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
+#include <cudf/quantiles.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
diff --git a/cpp/src/quantiles/tdigest/tdigest.cu b/cpp/src/quantiles/tdigest/tdigest.cu
index da36b7ab1da..421ed26e26d 100644
--- a/cpp/src/quantiles/tdigest/tdigest.cu
+++ b/cpp/src/quantiles/tdigest/tdigest.cu
@@ -22,6 +22,7 @@
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/detail/valid_if.cuh>
 #include <cudf/lists/lists_column_view.hpp>
+#include <cudf/quantiles.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 
diff --git a/cpp/src/reductions/scan/rank_scan.cu b/cpp/src/reductions/scan/rank_scan.cu
index 0befb6ac7d7..0dbfc271a25 100644
--- a/cpp/src/reductions/scan/rank_scan.cu
+++ b/cpp/src/reductions/scan/rank_scan.cu
@@ -17,6 +17,7 @@
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/aggregation/aggregation.hpp>
+#include <cudf/detail/scan.hpp>
 #include <cudf/detail/structs/utilities.hpp>
 #include <cudf/detail/utilities/device_operators.cuh>
 #include <cudf/table/experimental/row_operators.cuh>
diff --git a/cpp/src/reductions/scan/scan_inclusive.cu b/cpp/src/reductions/scan/scan_inclusive.cu
index 7c02a8d1b99..ee35d716d6e 100644
--- a/cpp/src/reductions/scan/scan_inclusive.cu
+++ b/cpp/src/reductions/scan/scan_inclusive.cu
@@ -22,6 +22,7 @@
 #include <cudf/detail/gather.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/null_mask.hpp>
+#include <cudf/detail/scan.hpp>
 #include <cudf/detail/structs/utilities.hpp>
 #include <cudf/detail/utilities/cast_functor.cuh>
 #include <cudf/reduction.hpp>
diff --git a/cpp/src/reductions/segmented/reductions.cpp b/cpp/src/reductions/segmented/reductions.cpp
index 48ab5963a29..e6de065dabb 100644
--- a/cpp/src/reductions/segmented/reductions.cpp
+++ b/cpp/src/reductions/segmented/reductions.cpp
@@ -13,11 +13,11 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-
 #include <cudf/column/column.hpp>
 #include <cudf/copying.hpp>
 #include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/reduction.hpp>
 #include <cudf/reduction/detail/segmented_reduction_functions.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
diff --git a/cpp/src/reshape/interleave_columns.cu b/cpp/src/reshape/interleave_columns.cu
index 580db0e24c5..79124508b11 100644
--- a/cpp/src/reshape/interleave_columns.cu
+++ b/cpp/src/reshape/interleave_columns.cu
@@ -21,6 +21,7 @@
 #include <cudf/detail/reshape.hpp>
 #include <cudf/detail/valid_if.cuh>
 #include <cudf/lists/detail/interleave_columns.hpp>
+#include <cudf/reshape.hpp>
 #include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/detail/strings_column_factories.cuh>
 #include <cudf/strings/detail/utilities.cuh>
diff --git a/cpp/src/reshape/tile.cu b/cpp/src/reshape/tile.cu
index 1c4019b2c73..29996aa2152 100644
--- a/cpp/src/reshape/tile.cu
+++ b/cpp/src/reshape/tile.cu
@@ -19,6 +19,7 @@
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/reshape.hpp>
+#include <cudf/reshape.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
diff --git a/cpp/src/rolling/rolling.cu b/cpp/src/rolling/rolling.cu
index e612bd01118..5dff40a3396 100644
--- a/cpp/src/rolling/rolling.cu
+++ b/cpp/src/rolling/rolling.cu
@@ -18,6 +18,7 @@
 
 #include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/rolling.hpp>
 #include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/resource_ref.hpp>
diff --git a/cpp/src/scalar/scalar.cpp b/cpp/src/scalar/scalar.cpp
index 07425a92413..83209c55c8a 100644
--- a/cpp/src/scalar/scalar.cpp
+++ b/cpp/src/scalar/scalar.cpp
@@ -216,7 +216,7 @@ template class fixed_point_scalar<numeric::decimal32>;
 template class fixed_point_scalar<numeric::decimal64>;
 template class fixed_point_scalar<numeric::decimal128>;
 
-namespace detail {
+namespace CUDF_HIDDEN detail {
 
 template <typename T>
 fixed_width_scalar<T>::fixed_width_scalar(T value,
@@ -306,7 +306,7 @@ template class fixed_width_scalar<duration_ms>;
 template class fixed_width_scalar<duration_us>;
 template class fixed_width_scalar<duration_ns>;
 
-}  // namespace detail
+}  // namespace CUDF_HIDDEN detail
 
 template <typename T>
 numeric_scalar<T>::numeric_scalar(T value,
diff --git a/cpp/src/search/contains_column.cu b/cpp/src/search/contains_column.cu
index 8f05196a71c..57f2c59de40 100644
--- a/cpp/src/search/contains_column.cu
+++ b/cpp/src/search/contains_column.cu
@@ -19,6 +19,7 @@
 #include <cudf/detail/search.hpp>
 #include <cudf/dictionary/detail/search.hpp>
 #include <cudf/dictionary/detail/update_keys.hpp>
+#include <cudf/search.hpp>
 #include <cudf/table/table_view.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
diff --git a/cpp/src/search/contains_scalar.cu b/cpp/src/search/contains_scalar.cu
index e88acf68e28..2aa9e24174b 100644
--- a/cpp/src/search/contains_scalar.cu
+++ b/cpp/src/search/contains_scalar.cu
@@ -17,10 +17,12 @@
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/detail/search.hpp>
 #include <cudf/dictionary/detail/search.hpp>
 #include <cudf/dictionary/detail/update_keys.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/scalar/scalar_device_view.cuh>
+#include <cudf/search.hpp>
 #include <cudf/table/experimental/row_operators.cuh>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
diff --git a/cpp/src/search/contains_table.cu b/cpp/src/search/contains_table.cu
index 4fb983dc5a6..81227cb9a2d 100644
--- a/cpp/src/search/contains_table.cu
+++ b/cpp/src/search/contains_table.cu
@@ -18,6 +18,7 @@
 
 #include <cudf/detail/cuco_helpers.hpp>
 #include <cudf/detail/null_mask.hpp>
+#include <cudf/detail/search.hpp>
 #include <cudf/hashing/detail/helper_functions.cuh>
 #include <cudf/table/experimental/row_operators.cuh>
 #include <cudf/table/table_view.hpp>
diff --git a/cpp/src/search/search_ordered.cu b/cpp/src/search/search_ordered.cu
index 328d3f0cee4..80651a4ec44 100644
--- a/cpp/src/search/search_ordered.cu
+++ b/cpp/src/search/search_ordered.cu
@@ -18,6 +18,7 @@
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/dictionary/detail/update_keys.hpp>
+#include <cudf/search.hpp>
 #include <cudf/table/experimental/row_operators.cuh>
 #include <cudf/table/table_device_view.cuh>
 #include <cudf/table/table_view.hpp>
diff --git a/cpp/src/strings/convert/convert_durations.cu b/cpp/src/strings/convert/convert_durations.cu
index 2e4a776d3c0..514ab965fc5 100644
--- a/cpp/src/strings/convert/convert_durations.cu
+++ b/cpp/src/strings/convert/convert_durations.cu
@@ -16,6 +16,7 @@
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/strings/convert/convert_durations.hpp>
 #include <cudf/strings/detail/convert/int_to_string.cuh>
 #include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/types.hpp>
diff --git a/cpp/src/strings/strings_scalar_factories.cpp b/cpp/src/strings/strings_scalar_factories.cpp
index 233fee14694..cf973638cc4 100644
--- a/cpp/src/strings/strings_scalar_factories.cpp
+++ b/cpp/src/strings/strings_scalar_factories.cpp
@@ -15,6 +15,7 @@
  */
 
 #include <cudf/scalar/scalar.hpp>
+#include <cudf/scalar/scalar_factories.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu
index f70598f33be..068d89a52dc 100644
--- a/cpp/src/strings/utilities.cu
+++ b/cpp/src/strings/utilities.cu
@@ -13,6 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+
 #include "strings/char_types/char_cases.h"
 #include "strings/char_types/char_flags.h"
 
diff --git a/cpp/src/transform/one_hot_encode.cu b/cpp/src/transform/one_hot_encode.cu
index 723c306da1d..808f2d1b284 100644
--- a/cpp/src/transform/one_hot_encode.cu
+++ b/cpp/src/transform/one_hot_encode.cu
@@ -20,6 +20,7 @@
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/table/experimental/row_operators.cuh>
+#include <cudf/transform.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
diff --git a/cpp/src/transform/row_bit_count.cu b/cpp/src/transform/row_bit_count.cu
index bfac7ab586e..12a15eb7e34 100644
--- a/cpp/src/transform/row_bit_count.cu
+++ b/cpp/src/transform/row_bit_count.cu
@@ -25,6 +25,7 @@
 #include <cudf/lists/lists_column_view.hpp>
 #include <cudf/structs/structs_column_view.hpp>
 #include <cudf/table/table_device_view.cuh>
+#include <cudf/transform.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 
diff --git a/cpp/tests/utilities/random_seed.cpp b/cpp/tests/utilities/random_seed.cpp
index 4d5035e5a22..ab5a31ce161 100644
--- a/cpp/tests/utilities/random_seed.cpp
+++ b/cpp/tests/utilities/random_seed.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,7 +23,7 @@ namespace detail {
 /**
  * @copydoc cudf::test::detail::random_generator_incrementing_seed()
  */
-uint64_t random_generator_incrementing_seed()
+CUDF_EXPORT uint64_t random_generator_incrementing_seed()
 {
   static uint64_t seed = 0;
   return ++seed;
diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt
index 56f8f9d0472..22059c5bc7f 100644
--- a/java/src/main/native/CMakeLists.txt
+++ b/java/src/main/native/CMakeLists.txt
@@ -210,6 +210,7 @@ target_compile_definitions(
   cudfjni PUBLIC "$<$<COMPILE_LANGUAGE:CXX>:${CUDF_CXX_DEFINITIONS}>"
                  "$<$<COMPILE_LANGUAGE:CUDA>:${CUDF_CUDA_DEFINITIONS}>"
 )
+target_link_options(cudfjni PRIVATE "-Wl,--no-undefined")
 
 if(USE_GDS)
   add_library(cufilejni src/CuFileJni.cpp)
diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
index c58cd732b39..a9ace1398e4 100644
--- a/java/src/main/native/src/TableJni.cpp
+++ b/java/src/main/native/src/TableJni.cpp
@@ -45,6 +45,7 @@
 #include <cudf/search.hpp>
 #include <cudf/sorting.hpp>
 #include <cudf/stream_compaction.hpp>
+#include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/span.hpp>
 
@@ -2789,7 +2790,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftDistinctJoinGatherMap
       auto has_nulls = cudf::has_nested_nulls(left) || cudf::has_nested_nulls(right)
                          ? cudf::nullable_join::YES
                          : cudf::nullable_join::NO;
-      if (cudf::detail::has_nested_columns(right)) {
+      if (cudf::has_nested_columns(right)) {
         cudf::distinct_hash_join<cudf::has_nested::YES> hash(right, left, has_nulls, nulleq);
         return hash.left_join();
       } else {
@@ -3010,7 +3011,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_innerDistinctJoinGatherMa
       std::pair<std::unique_ptr<rmm::device_uvector<cudf::size_type>>,
                 std::unique_ptr<rmm::device_uvector<cudf::size_type>>>
         maps;
-      if (cudf::detail::has_nested_columns(right)) {
+      if (cudf::has_nested_columns(right)) {
         cudf::distinct_hash_join<cudf::has_nested::YES> hash(right, left, has_nulls, nulleq);
         maps = hash.inner_join();
       } else {

From f756e01a3c5ff83421b1afb44460d9e5147a410e Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Thu, 25 Jul 2024 07:04:47 -0700
Subject: [PATCH 591/842] Implement support for scan_ndjson in cudf-polars
 (#16263)

Implement support for scan_ndjson in cudf-polars.

Authors:
  - Thomas Li (https://github.com/lithomas1)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/16263
---
 python/cudf_polars/cudf_polars/dsl/ir.py      |  37 +++++-
 .../cudf_polars/testing/asserts.py            |  34 ++++--
 python/cudf_polars/tests/test_scan.py         | 115 +++++++++++++-----
 3 files changed, 146 insertions(+), 40 deletions(-)

diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py
index e5691cba7dd..7f62dff4389 100644
--- a/python/cudf_polars/cudf_polars/dsl/ir.py
+++ b/python/cudf_polars/cudf_polars/dsl/ir.py
@@ -204,10 +204,14 @@ class Scan(IR):
 
     def __post_init__(self) -> None:
         """Validate preconditions."""
-        if self.typ not in ("csv", "parquet"):
+        if self.typ not in ("csv", "parquet", "ndjson"):  # pragma: no cover
+            # This line is unhittable ATM since IPC/Anonymous scan raise
+            # on the polars side
             raise NotImplementedError(f"Unhandled scan type: {self.typ}")
+        if self.typ == "ndjson" and self.file_options.n_rows is not None:
+            raise NotImplementedError("row limit in scan")
         if self.cloud_options is not None and any(
-            self.cloud_options[k] is not None for k in ("aws", "azure", "gcp")
+            self.cloud_options.get(k) is not None for k in ("aws", "azure", "gcp")
         ):
             raise NotImplementedError(
                 "Read from cloud storage"
@@ -232,6 +236,13 @@ def __post_init__(self) -> None:
                 # Need to do some file introspection to get the number
                 # of columns so that column projection works right.
                 raise NotImplementedError("Reading CSV without header")
+        elif self.typ == "ndjson":
+            # TODO: consider handling the low memory option here
+            # (maybe use chunked JSON reader)
+            if self.reader_options["ignore_errors"]:
+                raise NotImplementedError(
+                    "ignore_errors is not supported in the JSON reader"
+                )
 
     def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         """Evaluate and return a dataframe."""
@@ -317,6 +328,28 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
                 # TODO: consider nested column names?
                 tbl_w_meta.column_names(include_children=False),
             )
+        elif self.typ == "ndjson":
+            json_schema: list[tuple[str, str, list]] = [
+                (name, typ, []) for name, typ in self.schema.items()
+            ]
+            plc_tbl_w_meta = plc.io.json.read_json(
+                plc.io.SourceInfo(self.paths),
+                lines=True,
+                dtypes=json_schema,
+                prune_columns=True,
+            )
+            # TODO: I don't think cudf-polars supports nested types in general right now
+            # (but when it does, we should pass child column names from nested columns in)
+            df = DataFrame.from_table(
+                plc_tbl_w_meta.tbl, plc_tbl_w_meta.column_names(include_children=False)
+            )
+            col_order = list(self.schema.keys())
+            # TODO: remove condition when dropping support for polars 1.0
+            # https://github.com/pola-rs/polars/pull/17363
+            if row_index is not None and row_index[0] in self.schema:
+                col_order.remove(row_index[0])
+            if col_order is not None:
+                df = df.select(col_order)
         else:
             raise NotImplementedError(
                 f"Unhandled scan type: {self.typ}"
diff --git a/python/cudf_polars/cudf_polars/testing/asserts.py b/python/cudf_polars/cudf_polars/testing/asserts.py
index a9a4ae5f0a6..d37c96a15de 100644
--- a/python/cudf_polars/cudf_polars/testing/asserts.py
+++ b/python/cudf_polars/cudf_polars/testing/asserts.py
@@ -14,8 +14,6 @@
 from cudf_polars.dsl.translate import translate_ir
 
 if TYPE_CHECKING:
-    from collections.abc import Mapping
-
     import polars as pl
 
     from cudf_polars.typing import OptimizationArgs
@@ -26,7 +24,9 @@
 def assert_gpu_result_equal(
     lazydf: pl.LazyFrame,
     *,
-    collect_kwargs: Mapping[OptimizationArgs, bool] | None = None,
+    collect_kwargs: dict[OptimizationArgs, bool] | None = None,
+    polars_collect_kwargs: dict[OptimizationArgs, bool] | None = None,
+    cudf_collect_kwargs: dict[OptimizationArgs, bool] | None = None,
     check_row_order: bool = True,
     check_column_order: bool = True,
     check_dtypes: bool = True,
@@ -43,8 +43,17 @@ def assert_gpu_result_equal(
     lazydf
         frame to collect.
     collect_kwargs
-        Keyword arguments to pass to collect. Useful for controlling
-        optimization settings.
+        Common keyword arguments to pass to collect for both polars CPU and
+        cudf-polars.
+        Useful for controlling optimization settings.
+    polars_collect_kwargs
+        Keyword arguments to pass to collect for execution on polars CPU.
+        Overrides kwargs in collect_kwargs.
+        Useful for controlling optimization settings.
+    cudf_collect_kwargs
+        Keyword arguments to pass to collect for execution on cudf-polars.
+        Overrides kwargs in collect_kwargs.
+        Useful for controlling optimization settings.
     check_row_order
         Expect rows to be in same order
     check_column_order
@@ -68,10 +77,19 @@ def assert_gpu_result_equal(
     NotImplementedError
         If GPU collection failed in some way.
     """
-    collect_kwargs = {} if collect_kwargs is None else collect_kwargs
-    expect = lazydf.collect(**collect_kwargs)
+    if collect_kwargs is None:
+        collect_kwargs = {}
+    final_polars_collect_kwargs = collect_kwargs.copy()
+    final_cudf_collect_kwargs = collect_kwargs.copy()
+    if polars_collect_kwargs is not None:
+        final_polars_collect_kwargs.update(polars_collect_kwargs)
+    if cudf_collect_kwargs is not None:  # pragma: no cover
+        # exclude from coverage since not used ATM
+        # but this is probably still useful
+        final_cudf_collect_kwargs.update(cudf_collect_kwargs)
+    expect = lazydf.collect(**final_polars_collect_kwargs)
     got = lazydf.collect(
-        **collect_kwargs,
+        **final_cudf_collect_kwargs,
         post_opt_callback=partial(execute_with_cudf, raise_on_fail=True),
     )
     assert_frame_equal(
diff --git a/python/cudf_polars/tests/test_scan.py b/python/cudf_polars/tests/test_scan.py
index 642b6ae8a37..64acbb076ed 100644
--- a/python/cudf_polars/tests/test_scan.py
+++ b/python/cudf_polars/tests/test_scan.py
@@ -31,33 +31,16 @@ def n_rows(request):
     return request.param
 
 
-@pytest.fixture(params=["csv", "parquet"])
-def df(request, tmp_path, row_index, n_rows):
-    df = pl.DataFrame(
+@pytest.fixture(scope="module")
+def df():
+    # TODO: more dtypes
+    return pl.DataFrame(
         {
-            "a": [1, 2, 3, None],
-            "b": ["ẅ", "x", "y", "z"],
-            "c": [None, None, 4, 5],
+            "a": [1, 2, 3, None, 4, 5],
+            "b": ["ẅ", "x", "y", "z", "123", "abcd"],
+            "c": [None, None, 4, 5, -1, 0],
         }
     )
-    name, offset = row_index
-    if request.param == "csv":
-        df.write_csv(tmp_path / "file.csv")
-        return pl.scan_csv(
-            tmp_path / "file.csv",
-            row_index_name=name,
-            row_index_offset=offset,
-            n_rows=n_rows,
-        )
-    else:
-        df.write_parquet(tmp_path / "file.pq")
-        # parquet doesn't have skip_rows argument
-        return pl.scan_parquet(
-            tmp_path / "file.pq",
-            row_index_name=name,
-            row_index_offset=offset,
-            n_rows=n_rows,
-        )
 
 
 @pytest.fixture(params=[None, ["a"], ["b", "a"]], ids=["all", "subset", "reordered"])
@@ -75,20 +58,72 @@ def mask(request):
     return request.param
 
 
-def test_scan(df, columns, mask):
-    q = df
+def make_source(df, path, format):
+    """
+    Writes the passed polars df to a file of
+    the desired format
+    """
+    if format == "csv":
+        df.write_csv(path)
+    elif format == "ndjson":
+        df.write_ndjson(path)
+    else:
+        df.write_parquet(path)
+
+
+@pytest.mark.parametrize(
+    "format, scan_fn",
+    [
+        ("csv", pl.scan_csv),
+        ("ndjson", pl.scan_ndjson),
+        ("parquet", pl.scan_parquet),
+    ],
+)
+def test_scan(tmp_path, df, format, scan_fn, row_index, n_rows, columns, mask, request):
+    name, offset = row_index
+    make_source(df, tmp_path / "file", format)
+    request.applymarker(
+        pytest.mark.xfail(
+            condition=(n_rows is not None and scan_fn is pl.scan_ndjson),
+            reason="libcudf does not support n_rows",
+        )
+    )
+    q = scan_fn(
+        tmp_path / "file",
+        row_index_name=name,
+        row_index_offset=offset,
+        n_rows=n_rows,
+    )
     if mask is not None:
         q = q.filter(mask)
     if columns is not None:
-        q = df.select(*columns)
-    assert_gpu_result_equal(q)
+        q = q.select(*columns)
+    polars_collect_kwargs = {}
+    if versions.POLARS_VERSION_LT_12:
+        # https://github.com/pola-rs/polars/issues/17553
+        polars_collect_kwargs = {"projection_pushdown": False}
+    assert_gpu_result_equal(
+        q,
+        polars_collect_kwargs=polars_collect_kwargs,
+        # This doesn't work in polars < 1.2 since the row-index
+        # is in the wrong order in previous polars releases
+        check_column_order=versions.POLARS_VERSION_LT_12,
+    )
 
 
 def test_scan_unsupported_raises(tmp_path):
     df = pl.DataFrame({"a": [1, 2, 3]})
 
-    df.write_ndjson(tmp_path / "df.json")
-    q = pl.scan_ndjson(tmp_path / "df.json")
+    df.write_ipc(tmp_path / "df.ipc")
+    q = pl.scan_ipc(tmp_path / "df.ipc")
+    assert_ir_translation_raises(q, NotImplementedError)
+
+
+def test_scan_ndjson_nrows_notimplemented(tmp_path, df):
+    df = pl.DataFrame({"a": [1, 2, 3]})
+
+    df.write_ndjson(tmp_path / "df.jsonl")
+    q = pl.scan_ndjson(tmp_path / "df.jsonl", n_rows=1)
     assert_ir_translation_raises(q, NotImplementedError)
 
 
@@ -225,3 +260,23 @@ def test_scan_csv_skip_initial_empty_rows(tmp_path):
     q = pl.scan_csv(tmp_path / "test.csv", separator="|", skip_rows=1)
 
     assert_gpu_result_equal(q)
+
+
+@pytest.mark.parametrize(
+    "schema",
+    [
+        # List of colnames (basicaly like names param in CSV)
+        {"b": pl.String, "a": pl.Float32},
+        {"a": pl.UInt64},
+    ],
+)
+def test_scan_ndjson_schema(df, tmp_path, schema):
+    make_source(df, tmp_path / "file", "ndjson")
+    q = pl.scan_ndjson(tmp_path / "file", schema=schema)
+    assert_gpu_result_equal(q)
+
+
+def test_scan_ndjson_unsupported(df, tmp_path):
+    make_source(df, tmp_path / "file", "ndjson")
+    q = pl.scan_ndjson(tmp_path / "file", ignore_errors=True)
+    assert_ir_translation_raises(q, NotImplementedError)

From e553295cfaf2f5bd1f539ee78d9a3a064e00e5f0 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com>
Date: Thu, 25 Jul 2024 11:14:47 -0500
Subject: [PATCH 592/842] Require fixed width types for casting in
 `cudf-polars` (#16381)

Fixes a bug where numeric <-> string casts are not being properly rejected at the cudf-polars level.

Authors:
  - https://github.com/brandon-b-miller

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16381
---
 python/cudf_polars/cudf_polars/dsl/expr.py         |  6 +++++-
 .../tests/expressions/test_numeric_binops.py       | 14 +++++++++++++-
 .../tests/expressions/test_stringfunction.py       |  6 ++++--
 3 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py
index 6325feced94..9e0fca3f52f 100644
--- a/python/cudf_polars/cudf_polars/dsl/expr.py
+++ b/python/cudf_polars/cudf_polars/dsl/expr.py
@@ -1188,7 +1188,11 @@ class Cast(Expr):
     def __init__(self, dtype: plc.DataType, value: Expr) -> None:
         super().__init__(dtype)
         self.children = (value,)
-        if not plc.unary.is_supported_cast(self.dtype, value.dtype):
+        if not (
+            plc.traits.is_fixed_width(self.dtype)
+            and plc.traits.is_fixed_width(value.dtype)
+            and plc.unary.is_supported_cast(value.dtype, self.dtype)
+        ):
             raise NotImplementedError(
                 f"Can't cast {self.dtype.id().name} to {value.dtype.id().name}"
             )
diff --git a/python/cudf_polars/tests/expressions/test_numeric_binops.py b/python/cudf_polars/tests/expressions/test_numeric_binops.py
index b6bcd0026fa..8f68bbc460c 100644
--- a/python/cudf_polars/tests/expressions/test_numeric_binops.py
+++ b/python/cudf_polars/tests/expressions/test_numeric_binops.py
@@ -6,7 +6,10 @@
 
 import polars as pl
 
-from cudf_polars.testing.asserts import assert_gpu_result_equal
+from cudf_polars.testing.asserts import (
+    assert_gpu_result_equal,
+    assert_ir_translation_raises,
+)
 
 dtypes = [
     pl.Int8,
@@ -111,3 +114,12 @@ def test_binop_with_scalar(left_scalar, right_scalar):
     q = df.select(lop / rop)
 
     assert_gpu_result_equal(q)
+
+
+def test_numeric_to_string_cast_fails():
+    df = pl.DataFrame(
+        {"a": [1, 1, 2, 3, 3, 4, 1], "b": [None, 2, 3, 4, 5, 6, 7]}
+    ).lazy()
+    q = df.select(pl.col("a").cast(pl.String))
+
+    assert_ir_translation_raises(q, NotImplementedError)
diff --git a/python/cudf_polars/tests/expressions/test_stringfunction.py b/python/cudf_polars/tests/expressions/test_stringfunction.py
index 8cf65dd51ac..df08e15baa4 100644
--- a/python/cudf_polars/tests/expressions/test_stringfunction.py
+++ b/python/cudf_polars/tests/expressions/test_stringfunction.py
@@ -34,7 +34,9 @@ def ldf(with_nulls):
     if with_nulls:
         a[4] = None
         a[-3] = None
-    return pl.LazyFrame({"a": a, "b": range(len(a))})
+    return pl.LazyFrame(
+        {"a": a, "b": range(len(a)), "c": [str(i) for i in range(len(a))]}
+    )
 
 
 slice_cases = [
@@ -84,7 +86,7 @@ def test_contains_re_non_strict_raises(ldf):
 
 
 def test_contains_re_non_literal_raises(ldf):
-    q = ldf.select(pl.col("a").str.contains(pl.col("b"), literal=False))
+    q = ldf.select(pl.col("a").str.contains(pl.col("c"), literal=False))
 
     assert_ir_translation_raises(q, NotImplementedError)
 

From 1cea1eaf6c1e87e65729897dd9bbedc4bdc5e7ab Mon Sep 17 00:00:00 2001
From: Kyle Edwards <kyedwards@nvidia.com>
Date: Thu, 25 Jul 2024 16:26:34 -0400
Subject: [PATCH 593/842] Don't export bs_thread_pool (#16398)

## Description
cudf does not currently export any headers that depend on
bs_thread_pool, and having it as a dependency is currently causing
problems for consumers. Avoid exporting it since it's not needed.

## Checklist
- [ ] I am familiar with the [Contributing
Guidelines](https://github.com/rapidsai/cudf/blob/HEAD/CONTRIBUTING.md).
- [ ] New or existing tests cover these changes.
- [ ] The documentation is up to date with these changes.
---
 cpp/cmake/thirdparty/get_thread_pool.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/cmake/thirdparty/get_thread_pool.cmake b/cpp/cmake/thirdparty/get_thread_pool.cmake
index 235bf409058..777e16d9a4f 100644
--- a/cpp/cmake/thirdparty/get_thread_pool.cmake
+++ b/cpp/cmake/thirdparty/get_thread_pool.cmake
@@ -18,7 +18,7 @@ function(find_and_configure_thread_pool)
   include(${rapids-cmake-dir}/cpm/bs_thread_pool.cmake)
 
   # Find or install thread-pool
-  rapids_cpm_bs_thread_pool(BUILD_EXPORT_SET cudf-exports INSTALL_EXPORT_SET cudf-exports)
+  rapids_cpm_bs_thread_pool()
 
 endfunction()
 

From cd762b4eb1fd55a0bc5079ed69bfc04426f10e60 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 26 Jul 2024 08:08:01 -1000
Subject: [PATCH 594/842] Gate ArrowStringArrayNumpySemantics cudf.pandas proxy
 behind version check (#16401)

## Description
`ArrowStringArrayNumpySemantics` was newly added in 2.1:
https://github.com/pandas-dev/pandas/blob/2.1.x/pandas/core/arrays/string_arrow.py#L488,
so putting the proxy wrapper behind a version check for pandas 2.0.x
compat

```ipython
In [1]: %load_ext cudf.pandas

In [2]: import pandas as pd

In [3]: pd.__version__
Out[3]: '2.0.0'
```

## Checklist
- [ ] I am familiar with the [Contributing
Guidelines](https://github.com/rapidsai/cudf/blob/HEAD/CONTRIBUTING.md).
- [ ] New or existing tests cover these changes.
- [ ] The documentation is up to date with these changes.
---
 python/cudf/cudf/pandas/_wrappers/pandas.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py
index 59a243dd7c4..478108f36f1 100644
--- a/python/cudf/cudf/pandas/_wrappers/pandas.py
+++ b/python/cudf/cudf/pandas/_wrappers/pandas.py
@@ -26,6 +26,7 @@
 )
 
 import cudf
+import cudf.core._compat
 
 from ..annotation import nvtx
 from ..fast_slow_proxy import (
@@ -556,13 +557,14 @@ def Index__setattr__(self, name, value):
     },
 )
 
-ArrowStringArrayNumpySemantics = make_final_proxy_type(
-    "ArrowStringArrayNumpySemantics",
-    _Unusable,
-    pd.core.arrays.string_arrow.ArrowStringArrayNumpySemantics,
-    fast_to_slow=_Unusable(),
-    slow_to_fast=_Unusable(),
-)
+if cudf.core._compat.PANDAS_GE_210:
+    ArrowStringArrayNumpySemantics = make_final_proxy_type(
+        "ArrowStringArrayNumpySemantics",
+        _Unusable,
+        pd.core.arrays.string_arrow.ArrowStringArrayNumpySemantics,
+        fast_to_slow=_Unusable(),
+        slow_to_fast=_Unusable(),
+    )
 
 ArrowStringArray = make_final_proxy_type(
     "ArrowStringArray",

From 5dd3efba5b7e0c22dce87cf20aecb1b198677d2e Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Fri, 26 Jul 2024 16:47:49 -0400
Subject: [PATCH 595/842] Fix nightly memcheck error for empty
 STREAM_INTEROP_TEST (#16406)

## Description
The `STREAM_INTEROP_TEST` code was commented out in #16379 so the
`compute-sanitizer` returns an error for this test in the nightly
cpp-memcheck tests.
https://github.com/rapidsai/cudf/actions/runs/10107041505/job/27950193878#step:9:62177

This PR comments out the empty test so it is not built. The test will be
re-enabled in a future release when the deprecated functions are
replaced.

## Checklist
- [x] I am familiar with the [Contributing
Guidelines](https://github.com/rapidsai/cudf/blob/HEAD/CONTRIBUTING.md).
- [x] New or existing tests cover these changes.
- [x] The documentation is up to date with these changes.
---
 cpp/tests/CMakeLists.txt | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 88187623930..22827484f9a 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -689,7 +689,10 @@ ConfigureTest(STREAM_DICTIONARY_TEST streams/dictionary_test.cpp STREAM_MODE tes
 ConfigureTest(STREAM_FILLING_TEST streams/filling_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_GROUPBY_TEST streams/groupby_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_HASHING_TEST streams/hash_test.cpp STREAM_MODE testing)
-ConfigureTest(STREAM_INTEROP_TEST streams/interop_test.cpp STREAM_MODE testing)
+# Deprecation from 16297 and fixes in 16379 caused this test to be empty This will be reenabled once
+# the deprecated APIs have been replaced in 24.10.
+#
+# ConfigureTest(STREAM_INTEROP_TEST streams/interop_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_JSONIO_TEST streams/io/json_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_LABELING_BINS_TEST streams/labeling_bins_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_LISTS_TEST streams/lists_test.cpp STREAM_MODE testing)

From 473dec55abd1a3d9d540c541443f831d18ebb532 Mon Sep 17 00:00:00 2001
From: Jayjeet Chakraborty <jc.github@rediffmail.com>
Date: Fri, 26 Jul 2024 14:45:12 -0700
Subject: [PATCH 596/842] Add query 10 to the TPC-H suite (#16392)

Adds Q10 to the TPC-H benchmark suite

Authors:
  - Jayjeet Chakraborty (https://github.com/JayjeetAtGithub)

Approvers:
  - Mike Wilson (https://github.com/hyperbolic2346)
  - Yunsong Wang (https://github.com/PointKernel)

URL: https://github.com/rapidsai/cudf/pull/16392
---
 cpp/examples/tpch/CMakeLists.txt |   4 +
 cpp/examples/tpch/q1.cpp         |   2 +-
 cpp/examples/tpch/q10.cpp        | 166 +++++++++++++++++++++++++++++++
 cpp/examples/tpch/q5.cpp         |  20 ++--
 cpp/examples/tpch/q6.cpp         |   2 +-
 5 files changed, 182 insertions(+), 12 deletions(-)
 create mode 100644 cpp/examples/tpch/q10.cpp

diff --git a/cpp/examples/tpch/CMakeLists.txt b/cpp/examples/tpch/CMakeLists.txt
index 1b91d07e148..373a6d72d56 100644
--- a/cpp/examples/tpch/CMakeLists.txt
+++ b/cpp/examples/tpch/CMakeLists.txt
@@ -30,3 +30,7 @@ target_compile_features(tpch_q6 PRIVATE cxx_std_17)
 add_executable(tpch_q9 q9.cpp)
 target_link_libraries(tpch_q9 PRIVATE cudf::cudf)
 target_compile_features(tpch_q9 PRIVATE cxx_std_17)
+
+add_executable(tpch_q10 q10.cpp)
+target_link_libraries(tpch_q10 PRIVATE cudf::cudf)
+target_compile_features(tpch_q10 PRIVATE cxx_std_17)
diff --git a/cpp/examples/tpch/q1.cpp b/cpp/examples/tpch/q1.cpp
index 1bdf039da4a..fe03320b888 100644
--- a/cpp/examples/tpch/q1.cpp
+++ b/cpp/examples/tpch/q1.cpp
@@ -124,7 +124,7 @@ int main(int argc, char const** argv)
   auto shipdate_upper =
     cudf::timestamp_scalar<cudf::timestamp_D>(days_since_epoch(1998, 9, 2), true);
   auto const shipdate_upper_literal = cudf::ast::literal(shipdate_upper);
-  auto lineitem_pred                = std::make_unique<cudf::ast::operation>(
+  auto const lineitem_pred          = std::make_unique<cudf::ast::operation>(
     cudf::ast::ast_operator::LESS_EQUAL, shipdate_ref, shipdate_upper_literal);
 
   // Read out the `lineitem` table from parquet file
diff --git a/cpp/examples/tpch/q10.cpp b/cpp/examples/tpch/q10.cpp
new file mode 100644
index 00000000000..94da46f6930
--- /dev/null
+++ b/cpp/examples/tpch/q10.cpp
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../utilities/timer.hpp"
+#include "utils.hpp"
+
+#include <cudf/ast/expressions.hpp>
+#include <cudf/column/column.hpp>
+#include <cudf/scalar/scalar.hpp>
+
+/**
+ * @file q10.cpp
+ * @brief Implement query 10 of the TPC-H benchmark.
+ *
+ * create view customer as select * from '/tables/scale-1/customer.parquet';
+ * create view orders as select * from '/tables/scale-1/orders.parquet';
+ * create view lineitem as select * from '/tables/scale-1/lineitem.parquet';
+ * create view nation as select * from '/tables/scale-1/nation.parquet';
+ *
+ * select
+ *    c_custkey,
+ *    c_name,
+ *    sum(l_extendedprice * (1 - l_discount)) as revenue,
+ *    c_acctbal,
+ *    n_name,
+ *    c_address,
+ *    c_phone,
+ *    c_comment
+ * from
+ *    customer,
+ *    orders,
+ *    lineitem,
+ *    nation
+ * where
+ *     c_custkey = o_custkey
+ *     and l_orderkey = o_orderkey
+ *     and o_orderdate >= date '1993-10-01'
+ *     and o_orderdate < date '1994-01-01'
+ *     and l_returnflag = 'R'
+ *     and c_nationkey = n_nationkey
+ * group by
+ *     c_custkey,
+ *     c_name,
+ *     c_acctbal,
+ *     c_phone,
+ *     n_name,
+ *     c_address,
+ *     c_comment
+ * order by
+ *     revenue desc;
+ */
+
+/**
+ * @brief Calculate the revenue column
+ *
+ * @param extendedprice The extended price column
+ * @param discount The discount column
+ * @param stream The CUDA stream used for device memory operations and kernel launches.
+ * @param mr Device memory resource used to allocate the returned column's device memory.
+ */
+[[nodiscard]] std::unique_ptr<cudf::column> calc_revenue(
+  cudf::column_view const& extendedprice,
+  cudf::column_view const& discount,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
+{
+  auto const one = cudf::numeric_scalar<double>(1);
+  auto const one_minus_discount =
+    cudf::binary_operation(one, discount, cudf::binary_operator::SUB, discount.type(), stream, mr);
+  auto const revenue_type = cudf::data_type{cudf::type_id::FLOAT64};
+  auto revenue            = cudf::binary_operation(extendedprice,
+                                        one_minus_discount->view(),
+                                        cudf::binary_operator::MUL,
+                                        revenue_type,
+                                        stream,
+                                        mr);
+  return revenue;
+}
+int main(int argc, char const** argv)
+{
+  auto const args = parse_args(argc, argv);
+
+  // Use a memory pool
+  auto resource = create_memory_resource(args.memory_resource_type);
+  rmm::mr::set_current_device_resource(resource.get());
+
+  cudf::examples::timer timer;
+
+  // Define the column projection and filter predicate for the `orders` table
+  std::vector<std::string> const orders_cols = {"o_custkey", "o_orderkey", "o_orderdate"};
+  auto const o_orderdate_ref                 = cudf::ast::column_reference(std::distance(
+    orders_cols.begin(), std::find(orders_cols.begin(), orders_cols.end(), "o_orderdate")));
+  auto o_orderdate_lower =
+    cudf::timestamp_scalar<cudf::timestamp_D>(days_since_epoch(1993, 10, 1), true);
+  auto const o_orderdate_lower_limit = cudf::ast::literal(o_orderdate_lower);
+  auto const o_orderdate_pred_lower  = cudf::ast::operation(
+    cudf::ast::ast_operator::GREATER_EQUAL, o_orderdate_ref, o_orderdate_lower_limit);
+  auto o_orderdate_upper =
+    cudf::timestamp_scalar<cudf::timestamp_D>(days_since_epoch(1994, 1, 1), true);
+  auto const o_orderdate_upper_limit = cudf::ast::literal(o_orderdate_upper);
+  auto const o_orderdate_pred_upper =
+    cudf::ast::operation(cudf::ast::ast_operator::LESS, o_orderdate_ref, o_orderdate_upper_limit);
+  auto const orders_pred = std::make_unique<cudf::ast::operation>(
+    cudf::ast::ast_operator::LOGICAL_AND, o_orderdate_pred_lower, o_orderdate_pred_upper);
+
+  auto const l_returnflag_ref = cudf::ast::column_reference(3);
+  auto r_scalar               = cudf::string_scalar("R");
+  auto const r_literal        = cudf::ast::literal(r_scalar);
+  auto const lineitem_pred    = std::make_unique<cudf::ast::operation>(
+    cudf::ast::ast_operator::EQUAL, l_returnflag_ref, r_literal);
+
+  // Read out the tables from parquet files
+  // while pushing down the column projections and filter predicates
+  auto const customer = read_parquet(
+    args.dataset_dir + "/customer.parquet",
+    {"c_custkey", "c_name", "c_nationkey", "c_acctbal", "c_address", "c_phone", "c_comment"});
+  auto const orders =
+    read_parquet(args.dataset_dir + "/orders.parquet", orders_cols, std::move(orders_pred));
+  auto const lineitem =
+    read_parquet(args.dataset_dir + "/lineitem.parquet",
+                 {"l_extendedprice", "l_discount", "l_orderkey", "l_returnflag"},
+                 std::move(lineitem_pred));
+  auto const nation = read_parquet(args.dataset_dir + "/nation.parquet", {"n_name", "n_nationkey"});
+
+  // Perform the joins
+  auto const join_a       = apply_inner_join(customer, nation, {"c_nationkey"}, {"n_nationkey"});
+  auto const join_b       = apply_inner_join(lineitem, orders, {"l_orderkey"}, {"o_orderkey"});
+  auto const joined_table = apply_inner_join(join_a, join_b, {"c_custkey"}, {"o_custkey"});
+
+  // Calculate and append the `revenue` column
+  auto revenue =
+    calc_revenue(joined_table->column("l_extendedprice"), joined_table->column("l_discount"));
+  (*joined_table).append(revenue, "revenue");
+
+  // Perform the groupby operation
+  auto const groupedby_table = apply_groupby(
+    joined_table,
+    groupby_context_t{
+      {"c_custkey", "c_name", "c_acctbal", "c_phone", "n_name", "c_address", "c_comment"},
+      {
+        {"revenue", {{cudf::aggregation::Kind::SUM, "revenue"}}},
+      }});
+
+  // Perform the order by operation
+  auto const orderedby_table =
+    apply_orderby(groupedby_table, {"revenue"}, {cudf::order::DESCENDING});
+
+  timer.print_elapsed_millis();
+
+  // Write query result to a parquet file
+  orderedby_table->to_parquet("q10.parquet");
+  return 0;
+}
diff --git a/cpp/examples/tpch/q5.cpp b/cpp/examples/tpch/q5.cpp
index e56850b94d6..89396a6c968 100644
--- a/cpp/examples/tpch/q5.cpp
+++ b/cpp/examples/tpch/q5.cpp
@@ -44,14 +44,14 @@
  *    region
  * where
  *     c_custkey = o_custkey
- *    and l_orderkey = o_orderkey
- *    and l_suppkey = s_suppkey
- *    and c_nationkey = s_nationkey
- *    and s_nationkey = n_nationkey
- *    and n_regionkey = r_regionkey
- *    and r_name = 'ASIA'
- *    and o_orderdate >= date '1994-01-01'
- *    and o_orderdate < date '1995-01-01'
+ *     and l_orderkey = o_orderkey
+ *     and l_suppkey = s_suppkey
+ *     and c_nationkey = s_nationkey
+ *     and s_nationkey = n_nationkey
+ *     and n_regionkey = r_regionkey
+ *     and r_name = 'ASIA'
+ *     and o_orderdate >= date '1994-01-01'
+ *     and o_orderdate < date '1995-01-01'
  * group by
  *    n_name
  * order by
@@ -109,7 +109,7 @@ int main(int argc, char const** argv)
   auto const o_orderdate_upper_limit = cudf::ast::literal(o_orderdate_upper);
   auto const o_orderdate_pred_upper =
     cudf::ast::operation(cudf::ast::ast_operator::LESS, o_orderdate_ref, o_orderdate_upper_limit);
-  auto orders_pred = std::make_unique<cudf::ast::operation>(
+  auto const orders_pred = std::make_unique<cudf::ast::operation>(
     cudf::ast::ast_operator::LOGICAL_AND, o_orderdate_pred_lower, o_orderdate_pred_upper);
 
   // Define the column projection and filter predicate for the `region` table
@@ -118,7 +118,7 @@ int main(int argc, char const** argv)
     region_cols.begin(), std::find(region_cols.begin(), region_cols.end(), "r_name")));
   auto r_name_value                          = cudf::string_scalar("ASIA");
   auto const r_name_literal                  = cudf::ast::literal(r_name_value);
-  auto region_pred                           = std::make_unique<cudf::ast::operation>(
+  auto const region_pred                     = std::make_unique<cudf::ast::operation>(
     cudf::ast::ast_operator::EQUAL, r_name_ref, r_name_literal);
 
   // Read out the tables from parquet files
diff --git a/cpp/examples/tpch/q6.cpp b/cpp/examples/tpch/q6.cpp
index f11b3d6ab3b..405b2ac73ca 100644
--- a/cpp/examples/tpch/q6.cpp
+++ b/cpp/examples/tpch/q6.cpp
@@ -84,7 +84,7 @@ int main(int argc, char const** argv)
     cudf::ast::ast_operator::GREATER_EQUAL, shipdate_ref, shipdate_lower_literal);
   auto const shipdate_pred_b =
     cudf::ast::operation(cudf::ast::ast_operator::LESS, shipdate_ref, shipdate_upper_literal);
-  auto lineitem_pred = std::make_unique<cudf::ast::operation>(
+  auto const lineitem_pred = std::make_unique<cudf::ast::operation>(
     cudf::ast::ast_operator::LOGICAL_AND, shipdate_pred_a, shipdate_pred_b);
   auto lineitem =
     read_parquet(args.dataset_dir + "/lineitem.parquet", lineitem_cols, std::move(lineitem_pred));

From 24997fda194d5b8af34048a8bf275830cabbff8c Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Fri, 26 Jul 2024 18:37:30 -0700
Subject: [PATCH 597/842] Deduplicate decimal32/decimal64 to decimal128
 conversion function (#16236)

Closes #16194

This PR deduplicates the `convert_data_to_decimal128` function from `to_arrow.cu`, `writer_impl.cu` and `to_arrow_device.cu` to a common location.

Authors:
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Nghia Truong (https://github.com/ttnghia)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16236
---
 cpp/CMakeLists.txt                            |  1 +
 .../interop/decimal_conversion_utilities.cu   | 70 +++++++++++++++++
 .../interop/decimal_conversion_utilities.cuh  | 44 +++++++++++
 cpp/src/interop/to_arrow.cu                   |  8 +-
 cpp/src/interop/to_arrow_device.cu            |  5 +-
 cpp/src/interop/to_arrow_host.cu              | 40 +---------
 cpp/src/io/parquet/writer_impl.cu             | 60 ++++-----------
 cpp/tests/interop/to_arrow_device_test.cpp    | 77 +++++++++++++++++++
 8 files changed, 220 insertions(+), 85 deletions(-)
 create mode 100644 cpp/src/interop/decimal_conversion_utilities.cu
 create mode 100644 cpp/src/interop/decimal_conversion_utilities.cuh

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 95c509efc5b..310bc99b279 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -365,6 +365,7 @@ add_library(
   src/interop/dlpack.cpp
   src/interop/from_arrow.cu
   src/interop/arrow_utilities.cpp
+  src/interop/decimal_conversion_utilities.cu
   src/interop/to_arrow.cu
   src/interop/to_arrow_device.cu
   src/interop/to_arrow_host.cu
diff --git a/cpp/src/interop/decimal_conversion_utilities.cu b/cpp/src/interop/decimal_conversion_utilities.cu
new file mode 100644
index 00000000000..2f81c754a30
--- /dev/null
+++ b/cpp/src/interop/decimal_conversion_utilities.cu
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "decimal_conversion_utilities.cuh"
+
+#include <cudf/detail/utilities/integer_utils.hpp>
+#include <cudf/detail/utilities/linked_column.hpp>
+#include <cudf/fixed_point/fixed_point.hpp>
+
+#include <rmm/exec_policy.hpp>
+
+#include <thrust/for_each.h>
+
+#include <type_traits>
+
+namespace cudf {
+namespace detail {
+
+template <typename DecimalType>
+std::unique_ptr<rmm::device_buffer> convert_decimals_to_decimal128(
+  cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)
+{
+  static_assert(std::is_same_v<DecimalType, int32_t> or std::is_same_v<DecimalType, int64_t>,
+                "Only int32 and int64 decimal types can be converted to decimal128.");
+
+  constexpr size_type BIT_WIDTH_RATIO = sizeof(__int128_t) / sizeof(DecimalType);
+  auto buf = std::make_unique<rmm::device_buffer>(column.size() * sizeof(__int128_t), stream, mr);
+
+  thrust::for_each(rmm::exec_policy_nosync(stream, mr),
+                   thrust::make_counting_iterator(0),
+                   thrust::make_counting_iterator(column.size()),
+                   [in  = column.begin<DecimalType>(),
+                    out = reinterpret_cast<DecimalType*>(buf->data()),
+                    BIT_WIDTH_RATIO] __device__(auto in_idx) {
+                     auto const out_idx = in_idx * BIT_WIDTH_RATIO;
+                     // the lowest order bits are the value, the remainder
+                     // simply matches the sign bit to satisfy the two's
+                     // complement integer representation of negative numbers.
+                     out[out_idx] = in[in_idx];
+#pragma unroll BIT_WIDTH_RATIO - 1
+                     for (auto i = 1; i < BIT_WIDTH_RATIO; ++i) {
+                       out[out_idx + i] = in[in_idx] < 0 ? -1 : 0;
+                     }
+                   });
+
+  return buf;
+}
+
+// Instantiate templates for int32_t and int64_t decimal types
+template std::unique_ptr<rmm::device_buffer> convert_decimals_to_decimal128<int32_t>(
+  cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr);
+
+template std::unique_ptr<rmm::device_buffer> convert_decimals_to_decimal128<int64_t>(
+  cudf::column_view const& column, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr);
+
+}  // namespace detail
+}  // namespace cudf
diff --git a/cpp/src/interop/decimal_conversion_utilities.cuh b/cpp/src/interop/decimal_conversion_utilities.cuh
new file mode 100644
index 00000000000..41263147404
--- /dev/null
+++ b/cpp/src/interop/decimal_conversion_utilities.cuh
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/column/column_view.hpp>
+#include <cudf/types.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
+
+#include <type_traits>
+
+namespace cudf::detail {
+
+/**
+ * @brief Convert decimal32 and decimal64 numeric data to decimal128 and return the device vector
+ *
+ * @tparam DecimalType to convert from
+ *
+ * @param column A view of the input columns
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource to use for device memory allocation
+ *
+ * @return A device vector containing the converted decimal128 data
+ */
+template <typename DecimalType>
+std::unique_ptr<rmm::device_buffer> convert_decimals_to_decimal128(
+  cudf::column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr);
+
+}  // namespace cudf::detail
diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu
index 6b163e3441e..3d41f856f4f 100644
--- a/cpp/src/interop/to_arrow.cu
+++ b/cpp/src/interop/to_arrow.cu
@@ -15,6 +15,7 @@
  */
 
 #include "arrow_utilities.hpp"
+#include "decimal_conversion_utilities.cuh"
 #include "detail/arrow_allocator.hpp"
 
 #include <cudf/column/column.hpp>
@@ -158,8 +159,11 @@ std::shared_ptr<arrow::Array> unsupported_decimals_to_arrow(column_view input,
                                                             arrow::MemoryPool* ar_mr,
                                                             rmm::cuda_stream_view stream)
 {
-  auto buf =
-    detail::decimals_to_arrow<DeviceType>(input, stream, rmm::mr::get_current_device_resource());
+  auto buf = detail::convert_decimals_to_decimal128<DeviceType>(
+    input, stream, rmm::mr::get_current_device_resource());
+
+  // Synchronize stream here to ensure the decimal128 buffer is ready.
+  stream.synchronize();
 
   auto const buf_size_in_bytes = buf->size();
   auto data_buffer             = allocate_arrow_buffer(buf_size_in_bytes, ar_mr);
diff --git a/cpp/src/interop/to_arrow_device.cu b/cpp/src/interop/to_arrow_device.cu
index 2eb9b912054..cea7cdebcba 100644
--- a/cpp/src/interop/to_arrow_device.cu
+++ b/cpp/src/interop/to_arrow_device.cu
@@ -15,6 +15,7 @@
  */
 
 #include "arrow_utilities.hpp"
+#include "decimal_conversion_utilities.cuh"
 
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_view.hpp>
@@ -141,7 +142,9 @@ int construct_decimals(cudf::column_view input,
   nanoarrow::UniqueArray tmp;
   NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_DECIMAL128, input));
 
-  auto buf = detail::decimals_to_arrow<DeviceType>(input, stream, mr);
+  auto buf = detail::convert_decimals_to_decimal128<DeviceType>(input, stream, mr);
+  // Synchronize stream here to ensure the decimal128 buffer is ready.
+  stream.synchronize();
   NANOARROW_RETURN_NOT_OK(set_buffer(std::move(buf), fixed_width_data_buffer_idx, tmp.get()));
 
   ArrowArrayMove(tmp.get(), out);
diff --git a/cpp/src/interop/to_arrow_host.cu b/cpp/src/interop/to_arrow_host.cu
index c9e53ebaab7..193b3a3b5a2 100644
--- a/cpp/src/interop/to_arrow_host.cu
+++ b/cpp/src/interop/to_arrow_host.cu
@@ -15,6 +15,7 @@
  */
 
 #include "arrow_utilities.hpp"
+#include "decimal_conversion_utilities.cuh"
 
 #include <cudf/column/column_view.hpp>
 #include <cudf/detail/interop.hpp>
@@ -50,41 +51,6 @@
 namespace cudf {
 namespace detail {
 
-template <typename DeviceType>
-std::unique_ptr<rmm::device_buffer> decimals_to_arrow(cudf::column_view input,
-                                                      rmm::cuda_stream_view stream,
-                                                      rmm::device_async_resource_ref mr)
-{
-  constexpr size_type BIT_WIDTH_RATIO = sizeof(__int128_t) / sizeof(DeviceType);
-  auto buf = std::make_unique<rmm::device_buffer>(input.size() * sizeof(__int128_t), stream, mr);
-
-  auto count = thrust::counting_iterator<size_type>(0);
-  thrust::for_each(rmm::exec_policy(stream, mr),
-                   count,
-                   count + input.size(),
-                   [in  = input.begin<DeviceType>(),
-                    out = reinterpret_cast<DeviceType*>(buf->data()),
-                    BIT_WIDTH_RATIO] __device__(auto in_idx) {
-                     auto const out_idx = in_idx * BIT_WIDTH_RATIO;
-                     // the lowest order bits are the value, the remainder
-                     // simply matches the sign bit to satisfy the two's
-                     // complement integer representation of negative numbers.
-                     out[out_idx] = in[in_idx];
-#pragma unroll BIT_WIDTH_RATIO - 1
-                     for (auto i = 1; i < BIT_WIDTH_RATIO; ++i) {
-                       out[out_idx + i] = in[in_idx] < 0 ? -1 : 0;
-                     }
-                   });
-
-  return buf;
-}
-
-template std::unique_ptr<rmm::device_buffer> decimals_to_arrow<int32_t>(
-  cudf::column_view input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr);
-
-template std::unique_ptr<rmm::device_buffer> decimals_to_arrow<int64_t>(
-  cudf::column_view input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr);
-
 namespace {
 
 struct dispatch_to_arrow_host {
@@ -156,7 +122,9 @@ struct dispatch_to_arrow_host {
     NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_DECIMAL128, column));
 
     NANOARROW_RETURN_NOT_OK(populate_validity_bitmap(ArrowArrayValidityBitmap(tmp.get())));
-    auto buf = detail::decimals_to_arrow<DeviceType>(column, stream, mr);
+    auto buf = detail::convert_decimals_to_decimal128<DeviceType>(column, stream, mr);
+    // No need to synchronize stream here as populate_data_buffer uses the same stream to copy data
+    // to host.
     NANOARROW_RETURN_NOT_OK(
       populate_data_buffer(device_span<__int128_t const>(
                              reinterpret_cast<const __int128_t*>(buf->data()), column.size()),
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index 2df71b77301..36a1d8377bf 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -22,6 +22,7 @@
 #include "arrow_schema_writer.hpp"
 #include "compact_protocol_reader.hpp"
 #include "compact_protocol_writer.hpp"
+#include "interop/decimal_conversion_utilities.cuh"
 #include "io/comp/nvcomp_adapter.hpp"
 #include "io/parquet/parquet.hpp"
 #include "io/parquet/parquet_gpu.hpp"
@@ -1601,50 +1602,12 @@ size_t column_index_buffer_size(EncColumnChunk* ck,
   return ck->ck_stat_size * num_pages + column_index_truncate_length + padding + size_struct_size;
 }
 
-/**
- * @brief Convert decimal32 and decimal64 data to decimal128 and return the device vector
- *
- * @tparam DecimalType to convert from
- *
- * @param column A view of the input columns
- * @param stream CUDA stream used for device memory operations and kernel launches
- *
- * @return A device vector containing the converted decimal128 data
- */
-template <typename DecimalType>
-rmm::device_uvector<__int128_t> convert_data_to_decimal128(column_view const& column,
-                                                           rmm::cuda_stream_view stream)
-{
-  size_type constexpr BIT_WIDTH_RATIO = sizeof(__int128_t) / sizeof(DecimalType);
-
-  rmm::device_uvector<__int128_t> d128_buffer(column.size(), stream);
-
-  thrust::for_each(rmm::exec_policy_nosync(stream),
-                   thrust::make_counting_iterator(0),
-                   thrust::make_counting_iterator(column.size()),
-                   [in  = column.begin<DecimalType>(),
-                    out = reinterpret_cast<DecimalType*>(d128_buffer.data()),
-                    BIT_WIDTH_RATIO] __device__(auto in_idx) {
-                     auto const out_idx = in_idx * BIT_WIDTH_RATIO;
-                     // The lowest order bits are the value, the remainder
-                     // simply matches the sign bit to satisfy the two's
-                     // complement integer representation of negative numbers.
-                     out[out_idx] = in[in_idx];
-#pragma unroll BIT_WIDTH_RATIO - 1
-                     for (auto i = 1; i < BIT_WIDTH_RATIO; ++i) {
-                       out[out_idx + i] = in[in_idx] < 0 ? -1 : 0;
-                     }
-                   });
-
-  return d128_buffer;
-}
-
 /**
  * @brief Function to convert decimal32 and decimal64 columns to decimal128 data,
  *        update the input table metadata, and return a new vector of column views.
  *
  * @param[in,out] table_meta The table metadata
- * @param[in,out] d128_vectors Vector containing the computed decimal128 data buffers.
+ * @param[in,out] d128_buffers Buffers containing the converted decimal128 data.
  * @param input The input table
  * @param stream CUDA stream used for device memory operations and kernel launches
  *
@@ -1652,7 +1615,7 @@ rmm::device_uvector<__int128_t> convert_data_to_decimal128(column_view const& co
  */
 std::vector<column_view> convert_decimal_columns_and_metadata(
   table_input_metadata& table_meta,
-  std::vector<rmm::device_uvector<__int128_t>>& d128_vectors,
+  std::vector<std::unique_ptr<rmm::device_buffer>>& d128_buffers,
   table_view const& table,
   rmm::cuda_stream_view stream)
 {
@@ -1673,28 +1636,30 @@ std::vector<column_view> convert_decimal_columns_and_metadata(
     switch (column.type().id()) {
       case type_id::DECIMAL32:
         // Convert data to decimal128 type
-        d128_vectors.emplace_back(convert_data_to_decimal128<int32_t>(column, stream));
+        d128_buffers.emplace_back(cudf::detail::convert_decimals_to_decimal128<int32_t>(
+          column, stream, rmm::mr::get_current_device_resource()));
         // Update metadata
         metadata.set_decimal_precision(MAX_DECIMAL32_PRECISION);
         metadata.set_type_length(size_of(data_type{type_id::DECIMAL128, column.type().scale()}));
         // Create a new column view from the d128 data vector
         return {data_type{type_id::DECIMAL128, column.type().scale()},
                 column.size(),
-                d128_vectors.back().data(),
+                d128_buffers.back()->data(),
                 column.null_mask(),
                 column.null_count(),
                 column.offset(),
                 converted_children};
       case type_id::DECIMAL64:
         // Convert data to decimal128 type
-        d128_vectors.emplace_back(convert_data_to_decimal128<int64_t>(column, stream));
+        d128_buffers.emplace_back(cudf::detail::convert_decimals_to_decimal128<int64_t>(
+          column, stream, rmm::mr::get_current_device_resource()));
         // Update metadata
         metadata.set_decimal_precision(MAX_DECIMAL64_PRECISION);
         metadata.set_type_length(size_of(data_type{type_id::DECIMAL128, column.type().scale()}));
         // Create a new column view from the d128 data vector
         return {data_type{type_id::DECIMAL128, column.type().scale()},
                 column.size(),
-                d128_vectors.back().data(),
+                d128_buffers.back()->data(),
                 column.null_mask(),
                 column.null_count(),
                 column.offset(),
@@ -1722,6 +1687,9 @@ std::vector<column_view> convert_decimal_columns_and_metadata(
     std::back_inserter(converted_column_views),
     [&](auto elem) { return convert_column(thrust::get<0>(elem), thrust::get<1>(elem)); });
 
+  // Synchronize stream here to ensure all decimal128 buffers are ready.
+  stream.synchronize();
+
   return converted_column_views;
 }
 
@@ -1780,13 +1748,13 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
                                    rmm::cuda_stream_view stream)
 {
   // Container to store decimal128 converted data if needed
-  std::vector<rmm::device_uvector<__int128_t>> d128_vectors;
+  std::vector<std::unique_ptr<rmm::device_buffer>> d128_buffers;
 
   // Convert decimal32/decimal64 data to decimal128 if writing arrow schema
   // and initialize LinkedColVector
   auto vec = table_to_linked_columns(
     (write_arrow_schema)
-      ? table_view({convert_decimal_columns_and_metadata(table_meta, d128_vectors, input, stream)})
+      ? table_view({convert_decimal_columns_and_metadata(table_meta, d128_buffers, input, stream)})
       : input);
 
   auto schema_tree = construct_parquet_schema_tree(
diff --git a/cpp/tests/interop/to_arrow_device_test.cpp b/cpp/tests/interop/to_arrow_device_test.cpp
index 77da4039103..51216a8512c 100644
--- a/cpp/tests/interop/to_arrow_device_test.cpp
+++ b/cpp/tests/interop/to_arrow_device_test.cpp
@@ -710,6 +710,83 @@ TEST_F(ToArrowDeviceTest, StructColumn)
 template <typename T>
 using fp_wrapper = cudf::test::fixed_point_column_wrapper<T>;
 
+TEST_F(ToArrowDeviceTest, FixedPoint32Table)
+{
+  using namespace numeric;
+
+  for (auto const scale : {6, 4, 2, 0, -1, -3, -5}) {
+    auto const expect_data =
+      std::vector<int32_t>{-1000, -1, -1, -1, 2400, 0, 0, 0, -3456, -1, -1, -1,
+                           4650,  0,  0,  0,  5154, 0, 0, 0, 6800,  0,  0,  0};
+    auto col = fp_wrapper<int32_t>({-1000, 2400, -3456, 4650, 5154, 6800}, scale_type{scale});
+    std::vector<std::unique_ptr<cudf::column>> table_cols;
+    table_cols.emplace_back(col.release());
+    auto input = cudf::table(std::move(table_cols));
+
+    nanoarrow::UniqueSchema expected_schema;
+    ArrowSchemaInit(expected_schema.get());
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeStruct(expected_schema.get(), 1));
+    ArrowSchemaInit(expected_schema->children[0]);
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetTypeDecimal(expected_schema->children[0],
+                                                     NANOARROW_TYPE_DECIMAL128,
+                                                     cudf::detail::max_precision<int32_t>(),
+                                                     -scale));
+    NANOARROW_THROW_NOT_OK(ArrowSchemaSetName(expected_schema->children[0], "a"));
+    expected_schema->children[0]->flags = 0;
+
+    auto got_arrow_schema =
+      cudf::to_arrow_schema(input.view(), std::vector<cudf::column_metadata>{{"a"}});
+    compare_schemas(expected_schema.get(), got_arrow_schema.get());
+
+    auto result_dev_data = std::make_unique<rmm::device_uvector<int32_t>>(
+      expect_data.size(), cudf::get_default_stream());
+    cudaMemcpy(result_dev_data->data(),
+               expect_data.data(),
+               sizeof(int32_t) * expect_data.size(),
+               cudaMemcpyHostToDevice);
+
+    cudf::get_default_stream().synchronize();
+    nanoarrow::UniqueArray expected_array;
+    NANOARROW_THROW_NOT_OK(
+      ArrowArrayInitFromSchema(expected_array.get(), expected_schema.get(), nullptr));
+    expected_array->length = input.num_rows();
+
+    expected_array->children[0]->length = input.num_rows();
+    NANOARROW_THROW_NOT_OK(
+      ArrowBufferSetAllocator(ArrowArrayBuffer(expected_array->children[0], 0), noop_alloc));
+    ArrowArrayValidityBitmap(expected_array->children[0])->buffer.data =
+      const_cast<uint8_t*>(reinterpret_cast<uint8_t const*>(input.view().column(0).null_mask()));
+
+    auto data_ptr = reinterpret_cast<uint8_t*>(result_dev_data->data());
+    NANOARROW_THROW_NOT_OK(ArrowBufferSetAllocator(
+      ArrowArrayBuffer(expected_array->children[0], 1),
+      ArrowBufferDeallocator(
+        [](ArrowBufferAllocator* alloc, uint8_t*, int64_t) {
+          auto buf =
+            reinterpret_cast<std::unique_ptr<rmm::device_uvector<int32_t>>*>(alloc->private_data);
+          delete buf;
+        },
+        new std::unique_ptr<rmm::device_uvector<int32_t>>(std::move(result_dev_data)))));
+    ArrowArrayBuffer(expected_array->children[0], 1)->data = data_ptr;
+    NANOARROW_THROW_NOT_OK(
+      ArrowArrayFinishBuilding(expected_array.get(), NANOARROW_VALIDATION_LEVEL_NONE, nullptr));
+
+    auto got_arrow_array = cudf::to_arrow_device(input.view());
+    ASSERT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id);
+    ASSERT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type);
+    ASSERT_CUDA_SUCCEEDED(
+      cudaEventSynchronize(*reinterpret_cast<cudaEvent_t*>(got_arrow_array->sync_event)));
+    compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array);
+
+    got_arrow_array = cudf::to_arrow_device(std::move(input));
+    ASSERT_EQ(rmm::get_current_cuda_device().value(), got_arrow_array->device_id);
+    ASSERT_EQ(ARROW_DEVICE_CUDA, got_arrow_array->device_type);
+    ASSERT_CUDA_SUCCEEDED(
+      cudaEventSynchronize(*reinterpret_cast<cudaEvent_t*>(got_arrow_array->sync_event)));
+    compare_arrays(expected_schema.get(), expected_array.get(), &got_arrow_array->array);
+  }
+}
+
 TEST_F(ToArrowDeviceTest, FixedPoint64Table)
 {
   using namespace numeric;

From a51964ed8b00c3c88d463e329af7ec8378642343 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Mon, 29 Jul 2024 08:42:27 -0500
Subject: [PATCH 598/842] Fix a `pandas-2.0` missing attribute error (#16416)

`NumpyEADtype` is a 2.1.0+ change, this PR handles the missing attribute
error in pandas-2.0
---
 python/cudf/cudf/core/dtypes.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index de715191c08..27afec18b4e 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -17,10 +17,15 @@
 from pandas.core.arrays.arrow.extension_types import ArrowIntervalType
 
 import cudf
-from cudf.core._compat import PANDAS_LT_300
+from cudf.core._compat import PANDAS_GE_210, PANDAS_LT_300
 from cudf.core.abc import Serializable
 from cudf.utils.docutils import doc_apply
 
+if PANDAS_GE_210:
+    PANDAS_NUMPY_DTYPE = pd.core.dtypes.dtypes.NumpyEADtype
+else:
+    PANDAS_NUMPY_DTYPE = pd.core.dtypes.dtypes.PandasDtype
+
 if TYPE_CHECKING:
     from cudf._typing import Dtype
     from cudf.core.buffer import Buffer
@@ -72,7 +77,7 @@ def dtype(arbitrary):
             return np.dtype("object")
         else:
             return dtype(pd_dtype.numpy_dtype)
-    elif isinstance(pd_dtype, pd.core.dtypes.dtypes.NumpyEADtype):
+    elif isinstance(pd_dtype, PANDAS_NUMPY_DTYPE):
         return dtype(pd_dtype.numpy_dtype)
     elif isinstance(pd_dtype, pd.CategoricalDtype):
         return cudf.CategoricalDtype.from_pandas(pd_dtype)

From 18c1465b597284d8b558964cc0ca48de7da60a17 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 29 Jul 2024 06:06:07 -1000
Subject: [PATCH 599/842] Align ewm APIs with pandas 2.x (#16413)

These all currently are not implemented and raise a `NotImplementedError`

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16413
---
 python/cudf/cudf/core/window/ewm.py | 52 ++++++++++++++++++++++++-----
 1 file changed, 43 insertions(+), 9 deletions(-)

diff --git a/python/cudf/cudf/core/window/ewm.py b/python/cudf/cudf/core/window/ewm.py
index bb153d4b549..1203a840076 100644
--- a/python/cudf/cudf/core/window/ewm.py
+++ b/python/cudf/cudf/core/window/ewm.py
@@ -114,23 +114,57 @@ def __init__(
         self.adjust = adjust
         self.com = get_center_of_mass(com, span, halflife, alpha)
 
-    def mean(self):
+    def online(self, engine: str = "numba", engine_kwargs=None):
+        """
+        Return an ``OnlineExponentialMovingWindow`` object to calculate
+        exponentially moving window aggregations in an online method.
+
+        Currently not supported.
+        """
+        raise NotImplementedError("online is currently not supported.")
+
+    def mean(
+        self, numeric_only: bool = False, engine=None, engine_kwargs=None
+    ):
         """
         Calculate the ewm (exponential weighted moment) mean.
         """
+        if numeric_only is not False:
+            raise NotImplementedError(
+                "numeric_only is currently not supported."
+            )
+        if engine is not None:
+            raise NotImplementedError(
+                "engine is non-functional and added for compatibility with pandas."
+            )
+        if engine_kwargs is not None:
+            raise NotImplementedError(
+                "engine_kwargs is non-functional and added for compatibility with pandas."
+            )
         return self._apply_agg("ewma")
 
-    def var(self, bias):
-        raise NotImplementedError("ewmvar not yet supported.")
+    def sum(self, numeric_only: bool = False, engine=None, engine_kwargs=None):
+        raise NotImplementedError("sum not yet supported.")
 
-    def std(self, bias):
-        raise NotImplementedError("ewmstd not yet supported.")
+    def var(self, bias: bool = False, numeric_only: bool = False):
+        raise NotImplementedError("var not yet supported.")
 
-    def corr(self, other):
-        raise NotImplementedError("ewmcorr not yet supported.")
+    def std(self, bias: bool = False, numeric_only: bool = False):
+        raise NotImplementedError("std not yet supported.")
 
-    def cov(self, other):
-        raise NotImplementedError("ewmcov not yet supported.")
+    def corr(
+        self, other, pairwise: bool | None = None, numeric_only: bool = False
+    ):
+        raise NotImplementedError("corr not yet supported.")
+
+    def cov(
+        self,
+        other,
+        pairwise: bool | None = None,
+        bias: bool = False,
+        numeric_only: bool = False,
+    ):
+        raise NotImplementedError("cov not yet supported.")
 
     def _apply_agg_series(self, sr, agg_name):
         if not is_numeric_dtype(sr.dtype):

From 58f47242fe04b1e25fd42e1e45e8c15417140777 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 29 Jul 2024 06:09:21 -1000
Subject: [PATCH 600/842] Align groupby APIs with pandas 2.x (#16403)

The following breaking APIs are affected:

* `apply`
* `transform`
* `describe`

The rest of the APIs are non-breaking and generally will raise a `NotImplementedError`

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16403
---
 .../source/user_guide/api_docs/groupby.rst    |   3 +-
 python/cudf/cudf/core/groupby/groupby.py      | 629 ++++++++++++++----
 python/cudf/cudf/core/resample.py             |   6 +-
 python/cudf/cudf/tests/test_groupby.py        |  25 +
 4 files changed, 514 insertions(+), 149 deletions(-)

diff --git a/docs/cudf/source/user_guide/api_docs/groupby.rst b/docs/cudf/source/user_guide/api_docs/groupby.rst
index 80811efa33f..ca29087cbf9 100644
--- a/docs/cudf/source/user_guide/api_docs/groupby.rst
+++ b/docs/cudf/source/user_guide/api_docs/groupby.rst
@@ -68,7 +68,6 @@ Computations / descriptive stats
    GroupBy.std
    GroupBy.sum
    GroupBy.var
-   GroupBy.corr
    GroupBy.cov
 
 The following methods are available in both ``SeriesGroupBy`` and
@@ -81,6 +80,7 @@ application to columns of a specific data type.
    :toctree: api/
 
    DataFrameGroupBy.bfill
+   DataFrameGroupBy.corr
    DataFrameGroupBy.count
    DataFrameGroupBy.cumcount
    DataFrameGroupBy.cummax
@@ -102,5 +102,6 @@ The following methods are available only for ``SeriesGroupBy`` objects.
 .. autosummary::
    :toctree: api/
 
+   SeriesGroupBy.corr
    SeriesGroupBy.nunique
    SeriesGroupBy.unique
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index 1646c5042fd..3cfbd1d736a 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -8,7 +8,7 @@
 import warnings
 from collections import abc
 from functools import cached_property
-from typing import TYPE_CHECKING, Any, Iterable
+from typing import TYPE_CHECKING, Any, Iterable, Literal
 
 import cupy as cp
 import numpy as np
@@ -306,6 +306,18 @@ def __iter__(self):
                 grouped_values[offsets[i] : offsets[i + 1]],
             )
 
+    def __len__(self) -> int:
+        return self.ngroups
+
+    @property
+    def ngroups(self) -> int:
+        _, offsets, _, _ = self._grouped()
+        return len(offsets) - 1
+
+    @property
+    def ndim(self) -> int:
+        return self.obj.ndim
+
     @property
     def dtypes(self):
         """
@@ -457,10 +469,20 @@ def size(self):
         )
 
     @_performance_tracking
-    def cumcount(self):
+    def cumcount(self, ascending: bool = True):
         """
         Return the cumulative count of keys in each group.
+
+        Parameters
+        ----------
+        ascending : bool, default True
+            If False, number in reverse, from length of group - 1 to 0.
+            Currently not supported
         """
+        if ascending is not True:
+            raise NotImplementedError(
+                "ascending is currently not implemented."
+            )
         return (
             cudf.Series(
                 cudf.core.column.column_empty(
@@ -527,7 +549,7 @@ def _groupby(self):
         )
 
     @_performance_tracking
-    def agg(self, func):
+    def agg(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
         """
         Apply aggregation(s) to the groups.
 
@@ -615,6 +637,22 @@ def agg(self, func):
         1  1.5  1.75  2.0   2.0
         2  3.0  3.00  1.0   1.0
         """
+        if engine is not None:
+            raise NotImplementedError(
+                "engine is non-functional and added for compatibility with pandas"
+            )
+        if engine_kwargs is not None:
+            raise NotImplementedError(
+                "engine_kwargs is non-functional added for compatibility with pandas"
+            )
+        if args:
+            raise NotImplementedError(
+                "Passing args to func is currently not supported."
+            )
+        if kwargs:
+            raise NotImplementedError(
+                "Passing kwargs to func is currently not supported."
+            )
         column_names, columns, normalized_aggs = self._normalize_aggs(func)
         orig_dtypes = tuple(c.dtype for c in columns)
 
@@ -935,12 +973,13 @@ def tail(self, n: int = 5, *, preserve_order: bool = True):
         )
 
     @_performance_tracking
-    def nth(self, n):
+    def nth(self, n, dropna: Literal["any", "all", None] = None):
         """
         Return the nth row from each group.
         """
-
-        self.obj["__groupbynth_order__"] = range(0, len(self.obj))
+        if dropna is not None:
+            raise NotImplementedError("dropna is not currently supported.")
+        self.obj["__groupbynth_order__"] = range(0, len(self.obj))  # type: ignore[index]
         # We perform another groupby here to have the grouping columns
         # be a part of dataframe columns.
         result = self.obj.groupby(self.grouping.keys).agg(lambda x: x.nth(n))
@@ -1423,13 +1462,13 @@ def _post_process_chunk_results(
 
     @_performance_tracking
     def apply(
-        self, function, *args, engine="auto", include_groups: bool = True
+        self, func, *args, engine="auto", include_groups: bool = True, **kwargs
     ):
         """Apply a python transformation function over the grouped chunk.
 
         Parameters
         ----------
-        function : callable
+        func : callable
           The python transformation function that will be applied
           on the grouped chunk.
         args : tuple
@@ -1452,6 +1491,9 @@ def apply(
             When True, will attempt to apply ``func`` to the groupings in
             the case that they are columns of the DataFrame. In the future,
             this will default to ``False``.
+        kwargs : dict
+            Optional keyword arguments to pass to the function.
+            Currently not supported
 
         Examples
         --------
@@ -1528,13 +1570,17 @@ def mult(df):
         dtype: int64
 
         """
+        if kwargs:
+            raise NotImplementedError(
+                "Passing kwargs to func is currently not supported."
+            )
         if self.obj.empty:
-            if function in {"count", "size", "idxmin", "idxmax"}:
+            if func in {"count", "size", "idxmin", "idxmax"}:
                 res = cudf.Series([], dtype="int64")
             else:
                 res = self.obj.copy(deep=True)
             res.index = self.grouping.keys
-            if function in {"sum", "product"}:
+            if func in {"sum", "product"}:
                 # For `sum` & `product`, boolean types
                 # will need to result in `int64` type.
                 for name, col in res._data.items():
@@ -1542,20 +1588,20 @@ def mult(df):
                         res._data[name] = col.astype("int")
             return res
 
-        if not callable(function):
-            raise TypeError(f"type {type(function)} is not callable")
+        if not callable(func):
+            raise TypeError(f"type {type(func)} is not callable")
         group_names, offsets, group_keys, grouped_values = self._grouped(
             include_groups=include_groups
         )
 
         if engine == "auto":
-            if _can_be_jitted(grouped_values, function, args):
+            if _can_be_jitted(grouped_values, func, args):
                 engine = "jit"
             else:
                 engine = "cudf"
         if engine == "jit":
             result = self._jit_groupby_apply(
-                function,
+                func,
                 group_names,
                 offsets,
                 group_keys,
@@ -1564,7 +1610,7 @@ def mult(df):
             )
         elif engine == "cudf":
             result = self._iterative_groupby_apply(
-                function,
+                func,
                 group_names,
                 offsets,
                 group_keys,
@@ -1744,12 +1790,14 @@ def _broadcast(self, values: cudf.Series) -> cudf.Series:
         return values
 
     @_performance_tracking
-    def transform(self, function):
+    def transform(
+        self, func, *args, engine=None, engine_kwargs=None, **kwargs
+    ):
         """Apply an aggregation, then broadcast the result to the group size.
 
         Parameters
         ----------
-        function: str or callable
+        func: str or callable
             Aggregation to apply to each group. Note that the set of
             operations currently supported by `transform` is identical
             to that supported by the `agg` method.
@@ -1778,18 +1826,35 @@ def transform(self, function):
         --------
         agg
         """
-        if not (isinstance(function, str) or callable(function)):
+        if engine is not None:
+            raise NotImplementedError(
+                "engine is non-functional and added for compatibility with pandas"
+            )
+        if engine_kwargs is not None:
+            raise NotImplementedError(
+                "engine_kwargs is non-functional added for compatibility with pandas"
+            )
+        if args:
+            raise NotImplementedError(
+                "Passing args to func is currently not supported."
+            )
+        if kwargs:
+            raise NotImplementedError(
+                "Passing kwargs to func is currently not supported."
+            )
+
+        if not (isinstance(func, str) or callable(func)):
             raise TypeError(
                 "Aggregation must be a named aggregation or a callable"
             )
         try:
-            result = self.agg(function)
+            result = self.agg(func)
         except TypeError as e:
             raise NotImplementedError(
                 "Currently, `transform()` supports only aggregations."
             ) from e
         # If the aggregation is a scan, don't broadcast
-        if libgroupby._is_all_scan_aggregate([[function]]):
+        if libgroupby._is_all_scan_aggregate([[func]]):
             if len(result) != len(self.obj):
                 raise AssertionError(
                     "Unexpected result length for scan transform"
@@ -1824,7 +1889,7 @@ def func(x):
         return self.agg(func)
 
     @_performance_tracking
-    def describe(self, include=None, exclude=None):
+    def describe(self, percentiles=None, include=None, exclude=None):
         """
         Generate descriptive statistics that summarizes the central tendency,
         dispersion and shape of a dataset's distribution, excluding NaN values.
@@ -1833,6 +1898,10 @@ def describe(self, include=None, exclude=None):
 
         Parameters
         ----------
+        percentiles : list-like of numbers, optional
+            The percentiles to include in the output.
+            Currently not supported.
+
         include: 'all', list-like of dtypes or None (default), optional
             list of data types to include in the result.
             Ignored for Series.
@@ -1869,8 +1938,12 @@ def describe(self, include=None, exclude=None):
         90        1   24.0  <NA>   24.0   24.0   24.0   24.0   24.0
 
         """
-        if exclude is not None and include is not None:
-            raise NotImplementedError
+        if percentiles is not None:
+            raise NotImplementedError("percentiles is currently not supported")
+        if exclude is not None:
+            raise NotImplementedError("exclude is currently not supported")
+        if include is not None:
+            raise NotImplementedError("include is currently not supported")
 
         res = self.agg(
             [
@@ -1896,69 +1969,7 @@ def describe(self, include=None, exclude=None):
         return res
 
     @_performance_tracking
-    def corr(self, method="pearson", min_periods=1):
-        """
-        Compute pairwise correlation of columns, excluding NA/null values.
-
-        Parameters
-        ----------
-        method: {"pearson", "kendall", "spearman"} or callable,
-            default "pearson". Currently only the pearson correlation
-            coefficient is supported.
-
-        min_periods: int, optional
-            Minimum number of observations required per pair of columns
-            to have a valid result.
-
-        Returns
-        -------
-        DataFrame
-            Correlation matrix.
-
-        Examples
-        --------
-        >>> import cudf
-        >>> gdf = cudf.DataFrame({
-        ...             "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"],
-        ...             "val1": [5, 4, 6, 4, 8, 7, 4, 5, 2],
-        ...             "val2": [4, 5, 6, 1, 2, 9, 8, 5, 1],
-        ...             "val3": [4, 5, 6, 1, 2, 9, 8, 5, 1]})
-        >>> gdf
-           id  val1  val2  val3
-        0  a     5     4     4
-        1  a     4     5     5
-        2  a     6     6     6
-        3  b     4     1     1
-        4  b     8     2     2
-        5  b     7     9     9
-        6  c     4     8     8
-        7  c     5     5     5
-        8  c     2     1     1
-        >>> gdf.groupby("id").corr(method="pearson")
-                    val1      val2      val3
-        id
-        a   val1  1.000000  0.500000  0.500000
-            val2  0.500000  1.000000  1.000000
-            val3  0.500000  1.000000  1.000000
-        b   val1  1.000000  0.385727  0.385727
-            val2  0.385727  1.000000  1.000000
-            val3  0.385727  1.000000  1.000000
-        c   val1  1.000000  0.714575  0.714575
-            val2  0.714575  1.000000  1.000000
-            val3  0.714575  1.000000  1.000000
-        """
-
-        if method.lower() not in ("pearson",):
-            raise NotImplementedError(
-                "Only pearson correlation is currently supported"
-            )
-
-        return self._cov_or_corr(
-            lambda x: x.corr(method, min_periods), "Correlation"
-        )
-
-    @_performance_tracking
-    def cov(self, min_periods=0, ddof=1):
+    def cov(self, min_periods=0, ddof=1, numeric_only: bool = False):
         """
         Compute the pairwise covariance among the columns of a DataFrame,
         excluding NA/null values.
@@ -2042,6 +2053,10 @@ def cov(self, min_periods=0, ddof=1):
            val2  3.833333  12.333333  12.333333
            val3  3.833333  12.333333  12.333333
         """
+        if numeric_only is not False:
+            raise NotImplementedError(
+                "numeric_only is currently not supported."
+            )
 
         return self._cov_or_corr(
             lambda x: x.cov(min_periods, ddof), "Covariance"
@@ -2137,7 +2152,13 @@ def _cov_or_corr(self, func, method_name):
         return res
 
     @_performance_tracking
-    def var(self, ddof=1):
+    def var(
+        self,
+        ddof=1,
+        engine=None,
+        engine_kwargs=None,
+        numeric_only: bool = False,
+    ):
         """Compute the column-wise variance of the values in each group.
 
         Parameters
@@ -2146,6 +2167,18 @@ def var(self, ddof=1):
             The delta degrees of freedom. N - ddof is the divisor used to
             normalize the variance.
         """
+        if engine is not None:
+            raise NotImplementedError(
+                "engine is non-functional and added for compatibility with pandas"
+            )
+        if engine_kwargs is not None:
+            raise NotImplementedError(
+                "engine_kwargs is non-functional added for compatibility with pandas"
+            )
+        if numeric_only is not False:
+            raise NotImplementedError(
+                "numeric_only is currently not supported."
+            )
 
         def func(x):
             return getattr(x, "var")(ddof=ddof)
@@ -2153,7 +2186,13 @@ def func(x):
         return self.agg(func)
 
     @_performance_tracking
-    def std(self, ddof=1):
+    def std(
+        self,
+        ddof=1,
+        engine=None,
+        engine_kwargs=None,
+        numeric_only: bool = False,
+    ):
         """Compute the column-wise std of the values in each group.
 
         Parameters
@@ -2162,6 +2201,18 @@ def std(self, ddof=1):
             The delta degrees of freedom. N - ddof is the divisor used to
             normalize the standard deviation.
         """
+        if engine is not None:
+            raise NotImplementedError(
+                "engine is non-functional and added for compatibility with pandas"
+            )
+        if engine_kwargs is not None:
+            raise NotImplementedError(
+                "engine_kwargs is non-functional added for compatibility with pandas"
+            )
+        if numeric_only is not False:
+            raise NotImplementedError(
+                "numeric_only is currently not supported."
+            )
 
         def func(x):
             return getattr(x, "std")(ddof=ddof)
@@ -2169,7 +2220,9 @@ def func(x):
         return self.agg(func)
 
     @_performance_tracking
-    def quantile(self, q=0.5, interpolation="linear"):
+    def quantile(
+        self, q=0.5, interpolation="linear", numeric_only: bool = False
+    ):
         """Compute the column-wise quantiles of the values in each group.
 
         Parameters
@@ -2179,7 +2232,14 @@ def quantile(self, q=0.5, interpolation="linear"):
         interpolation : {"linear", "lower", "higher", "midpoint", "nearest"}
             The interpolation method to use when the desired quantile lies
             between two data points. Defaults to "linear".
+        numeric_only : bool, default False
+            Include only `float`, `int` or `boolean` data.
+            Currently not supported
         """
+        if numeric_only is not False:
+            raise NotImplementedError(
+                "numeric_only is not currently supported."
+            )
 
         def func(x):
             return getattr(x, "quantile")(q=q, interpolation=interpolation)
@@ -2333,7 +2393,14 @@ def fillna(
         )
 
     @_performance_tracking
-    def shift(self, periods=1, freq=None, axis=0, fill_value=None):
+    def shift(
+        self,
+        periods=1,
+        freq=None,
+        axis=0,
+        fill_value=None,
+        suffix: str | None = None,
+    ):
         """
         Shift each group by ``periods`` positions.
 
@@ -2355,6 +2422,10 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None):
               the list. The length of the list should match the number of
               columns shifted. Each value should match the data type of the
               column to fill.
+        suffix : str, optional
+            A string to add to each shifted column if there are multiple periods.
+            Ignored otherwise.
+            Currently not supported.
 
         Returns
         -------
@@ -2374,6 +2445,9 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None):
         if not axis == 0:
             raise NotImplementedError("Only axis=0 is supported.")
 
+        if suffix is not None:
+            raise NotImplementedError("shift is not currently supported.")
+
         values = self.grouping.values
         if is_list_like(fill_value):
             if len(fill_value) != len(values._data):
@@ -2473,6 +2547,142 @@ def pct_change(
         shifted = fill_grp.shift(periods=periods, freq=freq)
         return (filled / shifted) - 1
 
+    def _mimic_pandas_order(
+        self, result: DataFrameOrSeries
+    ) -> DataFrameOrSeries:
+        """Given a groupby result from libcudf, reconstruct the row orders
+        matching that of pandas. This also adds appropriate indices.
+        """
+        # TODO: copy metadata after this method is a common pattern, should
+        # merge in this method.
+
+        # This function is used to reorder the results of scan-based
+        # groupbys which have the same output size as input size.
+        # However, if the grouping key has NAs and dropna=True, the
+        # result coming back from libcudf has null_count few rows than
+        # the input, so we must produce an ordering from the full
+        # input range.
+        _, _, (ordering,) = self._groupby.groups(
+            [as_column(range(0, len(self.obj)))]
+        )
+        if self._dropna and any(
+            c.has_nulls(include_nan=True) > 0
+            for c in self.grouping._key_columns
+        ):
+            # Scan aggregations with null/nan keys put nulls in the
+            # corresponding output rows in pandas, to do that here
+            # expand the result by reindexing.
+            ri = cudf.RangeIndex(0, len(self.obj))
+            result.index = cudf.Index(ordering)
+            # This reorders and expands
+            result = result.reindex(ri)
+        else:
+            # Just reorder according to the groupings
+            result = result.take(ordering.argsort())
+        # Now produce the actual index we first thought of
+        result.index = self.obj.index
+        return result
+
+    def ohlc(self):
+        """
+        Compute open, high, low and close values of a group, excluding missing values.
+
+        Currently not implemented.
+        """
+        raise NotImplementedError("ohlc is currently not implemented")
+
+    @property
+    def plot(self):
+        """
+        Make plots of a grouped Series or DataFrame.
+
+        Currently not implemented.
+        """
+        raise NotImplementedError("plot is currently not implemented")
+
+    def resample(self, rule, *args, include_groups: bool = True, **kwargs):
+        """
+        Provide resampling when using a TimeGrouper.
+
+        Currently not implemented.
+        """
+        raise NotImplementedError("resample is currently not implemented")
+
+    def take(self, indices):
+        """
+        Return the elements in the given *positional* indices in each group.
+
+        Currently not implemented.
+        """
+        raise NotImplementedError("take is currently not implemented")
+
+    def filter(self, func, dropna: bool = True, *args, **kwargs):
+        """
+        Filter elements from groups that don't satisfy a criterion.
+
+        Currently not implemented.
+        """
+        raise NotImplementedError("filter is currently not implemented")
+
+    def expanding(self, *args, **kwargs):
+        """
+        Return an expanding grouper, providing expanding
+        functionality per group.
+
+        Currently not implemented.
+        """
+        raise NotImplementedError("expanding is currently not implemented")
+
+    def ewm(self, *args, **kwargs):
+        """
+        Return an ewm grouper, providing ewm functionality per group.
+
+        Currently not implemented.
+        """
+        raise NotImplementedError("expanding is currently not implemented")
+
+    def any(self, skipna: bool = True):
+        """
+        Return True if any value in the group is truthful, else False.
+
+        Currently not implemented.
+        """
+        raise NotImplementedError("any is currently not implemented")
+
+    def all(self, skipna: bool = True):
+        """
+        Return True if all values in the group are truthful, else False.
+
+        Currently not implemented.
+        """
+        raise NotImplementedError("all is currently not implemented")
+
+
+class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin):
+    obj: "cudf.core.dataframe.DataFrame"
+
+    _PROTECTED_KEYS = frozenset(("obj",))
+
+    def _reduce_numeric_only(self, op: str):
+        columns = list(
+            name
+            for name in self.obj._data.names
+            if (
+                is_numeric_dtype(self.obj._data[name].dtype)
+                and name not in self.grouping.names
+            )
+        )
+        return self[columns].agg(op)
+
+    def __getitem__(self, key):
+        return self.obj[key].groupby(
+            by=self.grouping.keys,
+            dropna=self._dropna,
+            sort=self._sort,
+            group_keys=self._group_keys,
+            as_index=self._as_index,
+        )
+
     def value_counts(
         self,
         subset=None,
@@ -2637,68 +2847,112 @@ def value_counts(
 
         return result
 
-    def _mimic_pandas_order(
-        self, result: DataFrameOrSeries
-    ) -> DataFrameOrSeries:
-        """Given a groupby result from libcudf, reconstruct the row orders
-        matching that of pandas. This also adds appropriate indices.
+    @_performance_tracking
+    def corr(
+        self, method="pearson", min_periods=1, numeric_only: bool = False
+    ):
         """
-        # TODO: copy metadata after this method is a common pattern, should
-        # merge in this method.
+        Compute pairwise correlation of columns, excluding NA/null values.
 
-        # This function is used to reorder the results of scan-based
-        # groupbys which have the same output size as input size.
-        # However, if the grouping key has NAs and dropna=True, the
-        # result coming back from libcudf has null_count few rows than
-        # the input, so we must produce an ordering from the full
-        # input range.
-        _, _, (ordering,) = self._groupby.groups(
-            [as_column(range(0, len(self.obj)))]
-        )
-        if self._dropna and any(
-            c.has_nulls(include_nan=True) > 0
-            for c in self.grouping._key_columns
-        ):
-            # Scan aggregations with null/nan keys put nulls in the
-            # corresponding output rows in pandas, to do that here
-            # expand the result by reindexing.
-            ri = cudf.RangeIndex(0, len(self.obj))
-            result.index = cudf.Index(ordering)
-            # This reorders and expands
-            result = result.reindex(ri)
-        else:
-            # Just reorder according to the groupings
-            result = result.take(ordering.argsort())
-        # Now produce the actual index we first thought of
-        result.index = self.obj.index
-        return result
+        Parameters
+        ----------
+        method: {"pearson", "kendall", "spearman"} or callable,
+            default "pearson". Currently only the pearson correlation
+            coefficient is supported.
 
+        min_periods: int, optional
+            Minimum number of observations required per pair of columns
+            to have a valid result.
 
-class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin):
-    obj: "cudf.core.dataframe.DataFrame"
+        Returns
+        -------
+        DataFrame
+            Correlation matrix.
 
-    _PROTECTED_KEYS = frozenset(("obj",))
+        Examples
+        --------
+        >>> import cudf
+        >>> gdf = cudf.DataFrame({
+        ...             "id": ["a", "a", "a", "b", "b", "b", "c", "c", "c"],
+        ...             "val1": [5, 4, 6, 4, 8, 7, 4, 5, 2],
+        ...             "val2": [4, 5, 6, 1, 2, 9, 8, 5, 1],
+        ...             "val3": [4, 5, 6, 1, 2, 9, 8, 5, 1]})
+        >>> gdf
+           id  val1  val2  val3
+        0  a     5     4     4
+        1  a     4     5     5
+        2  a     6     6     6
+        3  b     4     1     1
+        4  b     8     2     2
+        5  b     7     9     9
+        6  c     4     8     8
+        7  c     5     5     5
+        8  c     2     1     1
+        >>> gdf.groupby("id").corr(method="pearson")
+                    val1      val2      val3
+        id
+        a   val1  1.000000  0.500000  0.500000
+            val2  0.500000  1.000000  1.000000
+            val3  0.500000  1.000000  1.000000
+        b   val1  1.000000  0.385727  0.385727
+            val2  0.385727  1.000000  1.000000
+            val3  0.385727  1.000000  1.000000
+        c   val1  1.000000  0.714575  0.714575
+            val2  0.714575  1.000000  1.000000
+            val3  0.714575  1.000000  1.000000
+        """
 
-    def _reduce_numeric_only(self, op: str):
-        columns = list(
-            name
-            for name in self.obj._data.names
-            if (
-                is_numeric_dtype(self.obj._data[name].dtype)
-                and name not in self.grouping.names
+        if method != "pearson":
+            raise NotImplementedError(
+                "Only pearson correlation is currently supported"
+            )
+        if numeric_only is not False:
+            raise NotImplementedError(
+                "numeric_only is currently not supported."
             )
-        )
-        return self[columns].agg(op)
 
-    def __getitem__(self, key):
-        return self.obj[key].groupby(
-            by=self.grouping.keys,
-            dropna=self._dropna,
-            sort=self._sort,
-            group_keys=self._group_keys,
-            as_index=self._as_index,
+        return self._cov_or_corr(
+            lambda x: x.corr(method, min_periods), "Correlation"
         )
 
+    def hist(
+        self,
+        column=None,
+        by=None,
+        grid: bool = True,
+        xlabelsize: int | None = None,
+        xrot: float | None = None,
+        ylabelsize: int | None = None,
+        yrot: float | None = None,
+        ax=None,
+        sharex: bool = False,
+        sharey: bool = False,
+        figsize: tuple[float, float] | None = None,
+        layout: tuple[int, int] | None = None,
+        bins: int | abc.Sequence[int] = 10,
+        backend: str | None = None,
+        legend: bool = False,
+        **kwargs,
+    ):
+        raise NotImplementedError("hist is not currently implemented")
+
+    def boxplot(
+        self,
+        subplots: bool = True,
+        column=None,
+        fontsize: int | None = None,
+        rot: int = 0,
+        grid: bool = True,
+        ax=None,
+        figsize: tuple[float, float] | None = None,
+        layout=None,
+        sharex: bool = False,
+        sharey: bool = True,
+        backend=None,
+        **kwargs,
+    ):
+        raise NotImplementedError("boxplot is not currently implemented")
+
 
 DataFrameGroupBy.__doc__ = groupby_doc_template.format(ret="")
 
@@ -2706,8 +2960,10 @@ def __getitem__(self, key):
 class SeriesGroupBy(GroupBy):
     obj: "cudf.core.series.Series"
 
-    def agg(self, func):
-        result = super().agg(func)
+    def agg(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
+        result = super().agg(
+            func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs
+        )
 
         # downcast the result to a Series:
         if len(result._data):
@@ -2722,14 +2978,95 @@ def agg(self, func):
 
     aggregate = agg
 
-    def apply(self, func, *args):
-        result = super().apply(func, *args)
+    def apply(self, func, *args, **kwargs):
+        result = super().apply(func, *args, **kwargs)
 
         # apply Series name to result
         result.name = self.obj.name
 
         return result
 
+    @property
+    def dtype(self) -> pd.Series:
+        raise NotImplementedError("dtype is currently not implemented.")
+
+    def hist(
+        self,
+        by=None,
+        ax=None,
+        grid: bool = True,
+        xlabelsize: int | None = None,
+        xrot: float | None = None,
+        ylabelsize: int | None = None,
+        yrot: float | None = None,
+        figsize: tuple[float, float] | None = None,
+        bins: int | abc.Sequence[int] = 10,
+        backend: str | None = None,
+        legend: bool = False,
+        **kwargs,
+    ):
+        raise NotImplementedError("hist is currently not implemented.")
+
+    @property
+    def is_monotonic_increasing(self) -> cudf.Series:
+        """
+        Return whether each group's values are monotonically increasing.
+
+        Currently not implemented
+        """
+        raise NotImplementedError(
+            "is_monotonic_increasing is currently not implemented."
+        )
+
+    @property
+    def is_monotonic_decreasing(self) -> cudf.Series:
+        """
+        Return whether each group's values are monotonically decreasing.
+
+        Currently not implemented
+        """
+        raise NotImplementedError(
+            "is_monotonic_decreasing is currently not implemented."
+        )
+
+    def nlargest(
+        self, n: int = 5, keep: Literal["first", "last", "all"] = "first"
+    ) -> cudf.Series:
+        """
+        Return the largest n elements.
+
+        Currently not implemented
+        """
+        raise NotImplementedError("nlargest is currently not implemented.")
+
+    def nsmallest(
+        self, n: int = 5, keep: Literal["first", "last", "all"] = "first"
+    ) -> cudf.Series:
+        """
+        Return the smallest n elements.
+
+        Currently not implemented
+        """
+        raise NotImplementedError("nsmallest is currently not implemented.")
+
+    def value_counts(
+        self,
+        normalize: bool = False,
+        sort: bool = True,
+        ascending: bool = False,
+        bins=None,
+        dropna: bool = True,
+    ) -> cudf.Series | cudf.DataFrame:
+        raise NotImplementedError("value_counts is currently not implemented.")
+
+    def corr(
+        self,
+        other: cudf.Series,
+        method: str = "pearson",
+        min_periods: int | None = None,
+    ) -> cudf.Series:
+        raise NotImplementedError("corr is currently not implemented.")
+
 
 SeriesGroupBy.__doc__ = groupby_doc_template.format(ret="")
 
diff --git a/python/cudf/cudf/core/resample.py b/python/cudf/cudf/core/resample.py
index 4e0c5bd86b9..715bbf89b15 100644
--- a/python/cudf/cudf/core/resample.py
+++ b/python/cudf/cudf/core/resample.py
@@ -43,8 +43,10 @@ def __init__(self, obj, by, axis=None, kind=None):
         by = _ResampleGrouping(obj, by)
         super().__init__(obj, by=by)
 
-    def agg(self, func):
-        result = super().agg(func)
+    def agg(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
+        result = super().agg(
+            func, *args, engine=engine, engine_kwargs=engine_kwargs, **kwargs
+        )
         if len(self.grouping.bin_labels) != len(result):
             index = cudf.core.index.Index(
                 self.grouping.bin_labels, name=self.grouping.names[0]
diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py
index 826a0e52f57..74f04c0584f 100644
--- a/python/cudf/cudf/tests/test_groupby.py
+++ b/python/cudf/cudf/tests/test_groupby.py
@@ -3885,3 +3885,28 @@ def test_group_by_raises_category_error(op):
 
     with pytest.raises(TypeError):
         df.groupby(df.a).agg(op)
+
+
+def test_ngroups():
+    pdf = pd.DataFrame({"a": [1, 1, 3], "b": range(3)})
+    gdf = cudf.DataFrame.from_pandas(pdf)
+
+    pgb = pdf.groupby("a")
+    ggb = gdf.groupby("a")
+    assert pgb.ngroups == ggb.ngroups
+    assert len(pgb) == len(ggb)
+
+
+def test_ndim():
+    pdf = pd.DataFrame({"a": [1, 1, 3], "b": range(3)})
+    gdf = cudf.DataFrame.from_pandas(pdf)
+
+    pgb = pdf.groupby("a")
+    ggb = gdf.groupby("a")
+    assert pgb.ndim == ggb.ndim
+
+    pser = pd.Series(range(3))
+    gser = cudf.Series.from_pandas(pser)
+    pgb = pser.groupby([0, 0, 1])
+    ggb = gser.groupby(cudf.Series([0, 0, 1]))
+    assert pgb.ndim == ggb.ndim

From 6e7624d6b31c93b0547590929ac63ed8e3a48d24 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Mon, 29 Jul 2024 14:06:51 -0400
Subject: [PATCH 601/842] Add stream parameter to reshape APIs (#16410)

Adds `stream` parameter to reshape APIs:
- `cudf::interleave_columns`
- `cudf::tile`
- `cudf::byte_cast`

Found while working #15983

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/16410
---
 cpp/include/cudf/detail/reshape.hpp   |  4 ---
 cpp/include/cudf/reshape.hpp          | 17 ++++++----
 cpp/src/reshape/byte_cast.cu          | 11 ++-----
 cpp/src/reshape/interleave_columns.cu |  3 +-
 cpp/src/reshape/tile.cu               |  3 +-
 cpp/tests/CMakeLists.txt              |  1 +
 cpp/tests/streams/reshape_test.cpp    | 47 +++++++++++++++++++++++++++
 7 files changed, 65 insertions(+), 21 deletions(-)
 create mode 100644 cpp/tests/streams/reshape_test.cpp

diff --git a/cpp/include/cudf/detail/reshape.hpp b/cpp/include/cudf/detail/reshape.hpp
index 30f8b88b116..68a856373bf 100644
--- a/cpp/include/cudf/detail/reshape.hpp
+++ b/cpp/include/cudf/detail/reshape.hpp
@@ -28,8 +28,6 @@ namespace CUDF_EXPORT cudf {
 namespace detail {
 /**
  * @copydoc cudf::tile
- *
- * @param stream CUDA stream used for device memory operations and kernel launches
  */
 std::unique_ptr<table> tile(table_view const& input,
                             size_type count,
@@ -38,8 +36,6 @@ std::unique_ptr<table> tile(table_view const& input,
 
 /**
  * @copydoc cudf::interleave_columns
- *
- * @param stream CUDA stream used for device memory operations and kernel launches
  */
 std::unique_ptr<column> interleave_columns(table_view const& input,
                                            rmm::cuda_stream_view,
diff --git a/cpp/include/cudf/reshape.hpp b/cpp/include/cudf/reshape.hpp
index a0a7fe694bb..07aaf6488ad 100644
--- a/cpp/include/cudf/reshape.hpp
+++ b/cpp/include/cudf/reshape.hpp
@@ -47,13 +47,14 @@ namespace CUDF_EXPORT cudf {
  * @throws cudf::logic_error if input contains no columns.
  * @throws cudf::logic_error if input columns dtypes are not identical.
  *
- * @param[in] input Table containing columns to interleave
- * @param[in] mr Device memory resource used to allocate the returned column's device memory
- *
+ * @param input Table containing columns to interleave
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
  * @return The interleaved columns as a single column
  */
 std::unique_ptr<column> interleave_columns(
   table_view const& input,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -68,15 +69,17 @@ std::unique_ptr<column> interleave_columns(
  * return = [[8, 4, 7, 8, 4, 7], [5, 2, 3, 5, 2, 3]]
  * ```
  *
- * @param[in] input Table containing rows to be repeated
- * @param[in] count Number of times to tile "rows". Must be non-negative
- * @param[in] mr Device memory resource used to allocate the returned table's device memory
+ * @param input Table containing rows to be repeated
+ * @param count Number of times to tile "rows". Must be non-negative
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned table's device memory
  *
  * @return The table containing the tiled "rows"
  */
 std::unique_ptr<table> tile(
   table_view const& input,
   size_type count,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -95,6 +98,7 @@ enum class flip_endianness : bool { NO, YES };
  *
  * @param input_column Column to be converted to lists of bytes
  * @param endian_configuration Whether to retain or flip the endianness of the elements
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned column's device memory
  *
  * @return The column containing the lists of bytes
@@ -102,6 +106,7 @@ enum class flip_endianness : bool { NO, YES };
 std::unique_ptr<column> byte_cast(
   column_view const& input_column,
   flip_endianness endian_configuration,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
diff --git a/cpp/src/reshape/byte_cast.cu b/cpp/src/reshape/byte_cast.cu
index 3dfa0b65814..2a03a5504c1 100644
--- a/cpp/src/reshape/byte_cast.cu
+++ b/cpp/src/reshape/byte_cast.cu
@@ -167,11 +167,6 @@ struct byte_list_conversion_fn<T, std::enable_if_t<std::is_same_v<T, cudf::strin
 
 }  // namespace
 
-/**
- * @copydoc cudf::byte_cast(column_view const&, flip_endianness, rmm::device_async_resource_ref)
- *
- * @param stream CUDA stream used for device memory operations and kernel launches.
- */
 std::unique_ptr<column> byte_cast(column_view const& input,
                                   flip_endianness endian_configuration,
                                   rmm::cuda_stream_view stream,
@@ -183,15 +178,13 @@ std::unique_ptr<column> byte_cast(column_view const& input,
 
 }  // namespace detail
 
-/**
- * @copydoc cudf::byte_cast(column_view const&, flip_endianness, rmm::device_async_resource_ref)
- */
 std::unique_ptr<column> byte_cast(column_view const& input,
                                   flip_endianness endian_configuration,
+                                  rmm::cuda_stream_view stream,
                                   rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::byte_cast(input, endian_configuration, cudf::get_default_stream(), mr);
+  return detail::byte_cast(input, endian_configuration, stream, mr);
 }
 
 }  // namespace cudf
diff --git a/cpp/src/reshape/interleave_columns.cu b/cpp/src/reshape/interleave_columns.cu
index 79124508b11..7473b6045af 100644
--- a/cpp/src/reshape/interleave_columns.cu
+++ b/cpp/src/reshape/interleave_columns.cu
@@ -264,10 +264,11 @@ std::unique_ptr<column> interleave_columns(table_view const& input,
 }  // namespace detail
 
 std::unique_ptr<column> interleave_columns(table_view const& input,
+                                           rmm::cuda_stream_view stream,
                                            rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::interleave_columns(input, cudf::get_default_stream(), mr);
+  return detail::interleave_columns(input, stream, mr);
 }
 
 }  // namespace cudf
diff --git a/cpp/src/reshape/tile.cu b/cpp/src/reshape/tile.cu
index 29996aa2152..3d4fb73c000 100644
--- a/cpp/src/reshape/tile.cu
+++ b/cpp/src/reshape/tile.cu
@@ -64,10 +64,11 @@ std::unique_ptr<table> tile(table_view const& in,
 
 std::unique_ptr<table> tile(table_view const& in,
                             size_type count,
+                            rmm::cuda_stream_view stream,
                             rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::tile(in, count, cudf::get_default_stream(), mr);
+  return detail::tile(in, count, stream, mr);
 }
 
 }  // namespace cudf
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 22827484f9a..4dffcb41ba2 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -704,6 +704,7 @@ ConfigureTest(STREAM_PARQUETIO_TEST streams/io/parquet_test.cpp STREAM_MODE test
 ConfigureTest(STREAM_POOL_TEST streams/pool_test.cu STREAM_MODE testing)
 ConfigureTest(STREAM_REDUCTION_TEST streams/reduction_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing)
+ConfigureTest(STREAM_RESHAPE_TEST streams/reshape_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_ROLLING_TEST streams/rolling_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_SORTING_TEST streams/sorting_test.cpp STREAM_MODE testing)
diff --git a/cpp/tests/streams/reshape_test.cpp b/cpp/tests/streams/reshape_test.cpp
new file mode 100644
index 00000000000..d7c5da91bca
--- /dev/null
+++ b/cpp/tests/streams/reshape_test.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/default_stream.hpp>
+
+#include <cudf/column/column_view.hpp>
+#include <cudf/reshape.hpp>
+
+class ReshapeTest : public cudf::test::BaseFixture {};
+
+TEST_F(ReshapeTest, InterleaveColumns)
+{
+  auto a = cudf::test::fixed_width_column_wrapper<int32_t>({0, 3, 6});
+  auto b = cudf::test::fixed_width_column_wrapper<int32_t>({1, 4, 7});
+  auto c = cudf::test::fixed_width_column_wrapper<int32_t>({2, 5, 8});
+  cudf::table_view in(std::vector<cudf::column_view>{a, b, c});
+  cudf::interleave_columns(in, cudf::test::get_default_stream());
+}
+
+TEST_F(ReshapeTest, Tile)
+{
+  auto a = cudf::test::fixed_width_column_wrapper<int32_t>({-1, 0, 1});
+  cudf::table_view in(std::vector<cudf::column_view>{a});
+  cudf::tile(in, 2, cudf::test::get_default_stream());
+}
+
+TEST_F(ReshapeTest, ByteCast)
+{
+  auto a = cudf::test::fixed_width_column_wrapper<int32_t>({0, 100, -100, 1000, 1000});
+  cudf::byte_cast(a, cudf::flip_endianness::YES, cudf::test::get_default_stream());
+  cudf::byte_cast(a, cudf::flip_endianness::NO, cudf::test::get_default_stream());
+}

From 35796057b64e258713d4d89ba368837d30a1a9c5 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 29 Jul 2024 08:33:23 -1000
Subject: [PATCH 602/842] Align misc DataFrame and MultiIndex methods with
 pandas 2.x (#16402)

The API changes in this PR are mostly adding implementations or adding missing keyword argument (although they might not be implemented). The APIs affected are:

* `DataFrame.insert`
* `DataFrame.melt`
* `DataFrame.merge`
* `DataFrame.quantile`
* `DataFrame.cov`
* `DataFrame.corr`
* `DataFrame.median`
* `DataFrame.rolling`
* `DataFrame.resample`
* `DataFrame.dropna`
* `MultiIndex.from_tuple`
* `MultiIndex.from_frame`
* `MultiIndex.from_product`

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16402
---
 python/cudf/cudf/core/dataframe.py      | 106 +++++++++++++++++-------
 python/cudf/cudf/core/indexed_frame.py  |  81 +++++++++++-------
 python/cudf/cudf/core/multiindex.py     |  38 +++++++--
 python/cudf/cudf/core/reshape.py        |   3 +
 python/cudf/cudf/core/window/ewm.py     |  23 +++--
 python/cudf/cudf/core/window/rolling.py |  27 +++++-
 python/cudf/cudf/tests/test_dropna.py   |   9 ++
 7 files changed, 211 insertions(+), 76 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 1d7136e61e3..6ea11fe9f64 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -3215,26 +3215,37 @@ def reset_index(
         )
 
     @_performance_tracking
-    def insert(self, loc, name, value, nan_as_null=no_default):
+    def insert(
+        self,
+        loc,
+        column,
+        value,
+        allow_duplicates: bool = False,
+        nan_as_null=no_default,
+    ):
         """Add a column to DataFrame at the index specified by loc.
 
         Parameters
         ----------
         loc : int
             location to insert by index, cannot be greater then num columns + 1
-        name : number or string
-            name or label of column to be inserted
+        column : number or string
+            column or label of column to be inserted
         value : Series or array-like
         nan_as_null : bool, Default None
             If ``None``/``True``, converts ``np.nan`` values to
             ``null`` values.
             If ``False``, leaves ``np.nan`` values as is.
         """
+        if allow_duplicates is not False:
+            raise NotImplementedError(
+                "allow_duplicates is currently not implemented."
+            )
         if nan_as_null is no_default:
             nan_as_null = not cudf.get_option("mode.pandas_compatible")
         return self._insert(
             loc=loc,
-            name=name,
+            name=column,
             value=value,
             nan_as_null=nan_as_null,
             ignore_index=False,
@@ -4097,7 +4108,15 @@ def transpose(self):
     T = property(transpose, doc=transpose.__doc__)
 
     @_performance_tracking
-    def melt(self, **kwargs):
+    def melt(
+        self,
+        id_vars=None,
+        value_vars=None,
+        var_name=None,
+        value_name="value",
+        col_level=None,
+        ignore_index: bool = True,
+    ):
         """Unpivots a DataFrame from wide format to long format,
         optionally leaving identifier variables set.
 
@@ -4124,23 +4143,30 @@ def melt(self, **kwargs):
         """
         from cudf.core.reshape import melt
 
-        return melt(self, **kwargs)
+        return melt(
+            self,
+            id_vars=id_vars,
+            value_vars=value_vars,
+            var_name=var_name,
+            value_name=value_name,
+            col_level=col_level,
+            ignore_index=ignore_index,
+        )
 
     @_performance_tracking
     def merge(
         self,
         right,
+        how="inner",
         on=None,
         left_on=None,
         right_on=None,
         left_index=False,
         right_index=False,
-        how="inner",
         sort=False,
-        lsuffix=None,
-        rsuffix=None,
-        indicator=False,
         suffixes=("_x", "_y"),
+        indicator=False,
+        validate=None,
     ):
         """Merge GPU DataFrame objects by performing a database-style join
         operation by columns or indexes.
@@ -4241,17 +4267,8 @@ def merge(
             raise NotImplementedError(
                 "Only indicator=False is currently supported"
             )
-
-        if lsuffix or rsuffix:
-            raise ValueError(
-                "The lsuffix and rsuffix keywords have been replaced with the "
-                "``suffixes=`` keyword.  "
-                "Please provide the following instead: \n\n"
-                "    suffixes=('%s', '%s')"
-                % (lsuffix or "_x", rsuffix or "_y")
-            )
-        else:
-            lsuffix, rsuffix = suffixes
+        if validate is not None:
+            raise NotImplementedError("validate is currently not supported.")
 
         lhs, rhs = self, right
         merge_cls = Merge
@@ -5952,9 +5969,9 @@ def quantile(
         axis=0,
         numeric_only=True,
         interpolation=None,
+        method="single",
         columns=None,
         exact=True,
-        method="single",
     ):
         """
         Return values at the given quantile.
@@ -5980,14 +5997,14 @@ def quantile(
                 * higher: `j`.
                 * nearest: `i` or `j` whichever is nearest.
                 * midpoint: (`i` + `j`) / 2.
-        columns : list of str
-            List of column names to include.
-        exact : boolean
-            Whether to use approximate or exact quantile algorithm.
         method : {'single', 'table'}, default `'single'`
             Whether to compute quantiles per-column ('single') or over all
             columns ('table'). When 'table', the only allowed interpolation
             methods are 'nearest', 'lower', and 'higher'.
+        columns : list of str
+            List of column names to include.
+        exact : boolean
+            Whether to use approximate or exact quantile algorithm.
 
         Returns
         -------
@@ -7309,25 +7326,47 @@ def unnamed_group_generator():
             return result
 
     @_performance_tracking
-    def cov(self, **kwargs):
+    def cov(self, min_periods=None, ddof: int = 1, numeric_only: bool = False):
         """Compute the covariance matrix of a DataFrame.
 
         Parameters
         ----------
-        **kwargs
-            Keyword arguments to be passed to cupy.cov
+        min_periods : int, optional
+            Minimum number of observations required per pair of columns to
+            have a valid result.
+            Currently not supported.
+
+        ddof : int, default 1
+            Delta degrees of freedom.  The divisor used in calculations
+            is ``N - ddof``, where ``N`` represents the number of elements.
+
+        numeric_only : bool, default False
+            Include only `float`, `int` or `boolean` data.
+            Currently not supported.
 
         Returns
         -------
         cov : DataFrame
         """
-        cov = cupy.cov(self.values, rowvar=False)
+        if min_periods is not None:
+            raise NotImplementedError(
+                "min_periods is currently not supported."
+            )
+
+        if numeric_only is not False:
+            raise NotImplementedError(
+                "numeric_only is currently not supported."
+            )
+
+        cov = cupy.cov(self.values, ddof=ddof, rowvar=False)
         cols = self._data.to_pandas_index()
         df = DataFrame(cupy.asfortranarray(cov)).set_index(cols)
         df._set_columns_like(self._data)
         return df
 
-    def corr(self, method="pearson", min_periods=None):
+    def corr(
+        self, method="pearson", min_periods=None, numeric_only: bool = False
+    ):
         """Compute the correlation matrix of a DataFrame.
 
         Parameters
@@ -7357,6 +7396,11 @@ def corr(self, method="pearson", min_periods=None):
         if min_periods is not None:
             raise NotImplementedError("Unsupported argument 'min_periods'")
 
+        if numeric_only is not False:
+            raise NotImplementedError(
+                "numeric_only is currently not supported."
+            )
+
         corr = cupy.corrcoef(values, rowvar=False)
         cols = self._data.to_pandas_index()
         df = DataFrame(cupy.asfortranarray(corr)).set_index(cols)
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index e14f8923c25..0678ebfdd81 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -1495,9 +1495,7 @@ def mean(self, axis=0, skipna=True, numeric_only=False, **kwargs):
             **kwargs,
         )
 
-    def median(
-        self, axis=None, skipna=True, level=None, numeric_only=None, **kwargs
-    ):
+    def median(self, axis=None, skipna=True, numeric_only=None, **kwargs):
         """
         Return the median of the values for the requested axis.
 
@@ -1857,7 +1855,16 @@ def mask(
     @_performance_tracking
     @copy_docstring(Rolling)
     def rolling(
-        self, window, min_periods=None, center=False, axis=0, win_type=None
+        self,
+        window,
+        min_periods=None,
+        center: bool = False,
+        win_type: str | None = None,
+        on=None,
+        axis=0,
+        closed: str | None = None,
+        step: int | None = None,
+        method: str = "single",
     ):
         return Rolling(
             self,
@@ -1865,7 +1872,11 @@ def rolling(
             min_periods=min_periods,
             center=center,
             axis=axis,
+            on=on,
             win_type=win_type,
+            closed=closed,
+            step=step,
+            method=method,
         )
 
     @copy_docstring(ExponentialMovingWindow)
@@ -1880,6 +1891,7 @@ def ewm(
         ignore_na: bool = False,
         axis: int = 0,
         times: str | np.ndarray | None = None,
+        method: Literal["single", "table"] = "single",
     ):
         return ExponentialMovingWindow(
             self,
@@ -1892,6 +1904,7 @@ def ewm(
             ignore_na=ignore_na,
             axis=axis,
             times=times,
+            method=method,
         )
 
     @_performance_tracking
@@ -3943,16 +3956,15 @@ def resample(
         self,
         rule,
         axis=0,
-        closed=None,
-        label=None,
-        convention="start",
+        closed: Literal["right", "left"] | None = None,
+        label: Literal["right", "left"] | None = None,
+        convention: Literal["start", "end", "s", "e"] = "start",
         kind=None,
-        loffset=None,
-        base=None,
         on=None,
         level=None,
         origin="start_day",
         offset=None,
+        group_keys: bool = False,
     ):
         """
         Convert the frequency of ("resample") the given time series data.
@@ -4090,26 +4102,27 @@ def resample(
                 "deprecated and will be removed in a future version. ",
                 FutureWarning,
             )
-        if (axis, convention, kind, loffset, base, origin, offset) != (
-            0,
-            "start",
-            None,
-            None,
-            None,
-            "start_day",
-            None,
-        ):
-            raise NotImplementedError(
-                "The following arguments are not "
-                "currently supported by resample:\n\n"
-                "- axis\n"
-                "- convention\n"
-                "- kind\n"
-                "- loffset\n"
-                "- base\n"
-                "- origin\n"
-                "- offset"
+            raise NotImplementedError("kind is currently not supported.")
+        if axis != 0:
+            warnings.warn(
+                "The 'axis' keyword in is "
+                "deprecated and will be removed in a future version. ",
+                FutureWarning,
             )
+            raise NotImplementedError("axis is currently not supported.")
+        if convention != "start":
+            warnings.warn(
+                "The 'convention' keyword in is "
+                "deprecated and will be removed in a future version. ",
+                FutureWarning,
+            )
+            raise NotImplementedError("convention is currently not supported.")
+        if origin != "start_day":
+            raise NotImplementedError("origin is currently not supported.")
+        if offset is not None:
+            raise NotImplementedError("offset is currently not supported.")
+        if group_keys is not False:
+            raise NotImplementedError("group_keys is currently not supported.")
         by = cudf.Grouper(
             key=on, freq=rule, closed=closed, label=label, level=level
         )
@@ -4120,7 +4133,13 @@ def resample(
         )
 
     def dropna(
-        self, axis=0, how="any", thresh=None, subset=None, inplace=False
+        self,
+        axis=0,
+        how="any",
+        thresh=None,
+        subset=None,
+        inplace=False,
+        ignore_index: bool = False,
     ):
         """
         Drop rows (or columns) containing nulls from a Column.
@@ -4144,6 +4163,8 @@ def dropna(
             columns, subset is a list of rows to consider.
         inplace : bool, default False
             If True, do operation inplace and return None.
+        ignore_index : bool, default ``False``
+            If ``True``, the resulting axis will be labeled 0, 1, …, n - 1.
 
         Returns
         -------
@@ -4220,6 +4241,8 @@ def dropna(
         """
         if axis == 0:
             result = self._drop_na_rows(how=how, subset=subset, thresh=thresh)
+            if ignore_index:
+                result.index = RangeIndex(len(result))
         else:
             result = self._drop_na_columns(
                 how=how, subset=subset, thresh=thresh
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index dfc596bf279..0e1fddd7ed5 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -524,8 +524,10 @@ def codes(self):
             col.values for col in self._codes
         )
 
-    def get_slice_bound(self, label, side, kind=None):
-        raise NotImplementedError()
+    def get_slice_bound(self, label, side):
+        raise NotImplementedError(
+            "get_slice_bound is not currently implemented."
+        )
 
     @property  # type: ignore
     @_performance_tracking
@@ -1108,7 +1110,7 @@ def _concat(cls, objs):
 
     @classmethod
     @_performance_tracking
-    def from_tuples(cls, tuples, names=None):
+    def from_tuples(cls, tuples, sortorder: int | None = None, names=None):
         """
         Convert list of tuples to MultiIndex.
 
@@ -1116,6 +1118,9 @@ def from_tuples(cls, tuples, names=None):
         ----------
         tuples : list / sequence of tuple-likes
             Each tuple is the index of one row/column.
+        sortorder : int or None
+            Level of sortedness (must be lexicographically sorted by that
+            level).
         names : list / sequence of str, optional
             Names for the levels in the index.
 
@@ -1142,7 +1147,9 @@ def from_tuples(cls, tuples, names=None):
                    names=['number', 'color'])
         """
         # Use Pandas for handling Python host objects
-        pdi = pd.MultiIndex.from_tuples(tuples, names=names)
+        pdi = pd.MultiIndex.from_tuples(
+            tuples, sortorder=sortorder, names=names
+        )
         return cls.from_pandas(pdi)
 
     @_performance_tracking
@@ -1215,7 +1222,12 @@ def values(self):
 
     @classmethod
     @_performance_tracking
-    def from_frame(cls, df: pd.DataFrame | cudf.DataFrame, names=None):
+    def from_frame(
+        cls,
+        df: pd.DataFrame | cudf.DataFrame,
+        sortorder: int | None = None,
+        names=None,
+    ):
         """
         Make a MultiIndex from a DataFrame.
 
@@ -1223,6 +1235,9 @@ def from_frame(cls, df: pd.DataFrame | cudf.DataFrame, names=None):
         ----------
         df : DataFrame
             DataFrame to be converted to MultiIndex.
+        sortorder : int, optional
+            Level of sortedness (must be lexicographically sorted by that
+            level).
         names : list-like, optional
             If no names are provided, use the column names, or tuple of column
             names if the columns is a MultiIndex. If a sequence, overwrite
@@ -1273,11 +1288,13 @@ def from_frame(cls, df: pd.DataFrame | cudf.DataFrame, names=None):
         else:
             source_data = df
         names = names if names is not None else source_data._column_names
-        return cls.from_arrays(source_data._columns, names=names)
+        return cls.from_arrays(
+            source_data._columns, sortorder=sortorder, names=names
+        )
 
     @classmethod
     @_performance_tracking
-    def from_product(cls, arrays, names=None):
+    def from_product(cls, iterables, sortorder: int | None = None, names=None):
         """
         Make a MultiIndex from the cartesian product of multiple iterables.
 
@@ -1285,6 +1302,9 @@ def from_product(cls, arrays, names=None):
         ----------
         iterables : list / sequence of iterables
             Each iterable has unique labels for each level of the index.
+        sortorder : int or None
+            Level of sortedness (must be lexicographically sorted by that
+            level).
         names : list / sequence of str, optional
             Names for the levels in the index.
             If not explicitly provided, names will be inferred from the
@@ -1314,7 +1334,9 @@ def from_product(cls, arrays, names=None):
                    names=['number', 'color'])
         """
         # Use Pandas for handling Python host objects
-        pdi = pd.MultiIndex.from_product(arrays, names=names)
+        pdi = pd.MultiIndex.from_product(
+            iterables, sortorder=sortorder, names=names
+        )
         return cls.from_pandas(pdi)
 
     @classmethod
diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py
index a542c5f5969..e7248977b1d 100644
--- a/python/cudf/cudf/core/reshape.py
+++ b/python/cudf/cudf/core/reshape.py
@@ -502,6 +502,7 @@ def melt(
     var_name=None,
     value_name="value",
     col_level=None,
+    ignore_index: bool = True,
 ):
     """Unpivots a DataFrame from wide format to long format,
     optionally leaving identifier variables set.
@@ -566,6 +567,8 @@ def melt(
     """
     if col_level is not None:
         raise NotImplementedError("col_level != None is not supported yet.")
+    if ignore_index is not True:
+        raise NotImplementedError("ignore_index is currently not supported.")
 
     # Arg cleaning
 
diff --git a/python/cudf/cudf/core/window/ewm.py b/python/cudf/cudf/core/window/ewm.py
index 1203a840076..ef0f6958aeb 100644
--- a/python/cudf/cudf/core/window/ewm.py
+++ b/python/cudf/cudf/core/window/ewm.py
@@ -1,7 +1,9 @@
 # Copyright (c) 2022-2024, NVIDIA CORPORATION.
-
 from __future__ import annotations
 
+import warnings
+from typing import Literal
+
 import numpy as np
 
 from cudf._lib.reduce import scan
@@ -103,13 +105,24 @@ def __init__(
         ignore_na: bool = False,
         axis: int = 0,
         times: str | np.ndarray | None = None,
+        method: Literal["single", "table"] = "single",
     ):
-        if (min_periods, ignore_na, axis, times) != (0, False, 0, None):
+        if min_periods != 0:
             raise NotImplementedError(
-                "The parameters `min_periods`, `ignore_na`, "
-                "`axis`, and `times` are not yet supported."
+                "min_periods is currently not supported."
             )
-
+        if ignore_na is not False:
+            raise NotImplementedError("ignore_na is currently not supported.")
+        if axis != 0:
+            warnings.warn(
+                "axis is deprecated with will be removed in a future version. "
+                "Transpose the DataFrame first instead."
+            )
+            raise NotImplementedError("axis is currently not supported.")
+        if times is not None:
+            raise NotImplementedError("times is currently not supported.")
+        if method != "single":
+            raise NotImplementedError("method is currently not supported.")
         self.obj = obj
         self.adjust = adjust
         self.com = get_center_of_mass(com, span, halflife, alpha)
diff --git a/python/cudf/cudf/core/window/rolling.py b/python/cudf/cudf/core/window/rolling.py
index 29391c68471..043a41145e5 100644
--- a/python/cudf/cudf/core/window/rolling.py
+++ b/python/cudf/cudf/core/window/rolling.py
@@ -1,4 +1,7 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION
+from __future__ import annotations
+
+import warnings
 
 import numba
 import pandas as pd
@@ -196,17 +199,26 @@ def __init__(
         obj,
         window,
         min_periods=None,
-        center=False,
+        center: bool = False,
+        win_type: str | None = None,
+        on=None,
         axis=0,
-        win_type=None,
+        closed: str | None = None,
+        step: int | None = None,
+        method: str = "single",
     ):
         self.obj = obj
         self.window = window
         self.min_periods = min_periods
         self.center = center
         self._normalize()
-        self.agg_params = {}
+        # for var & std only?
+        self.agg_params: dict[str, int] = {}
         if axis != 0:
+            warnings.warn(
+                "axis is deprecated with will be removed in a future version. "
+                "Transpose the DataFrame first instead."
+            )
             raise NotImplementedError("axis != 0 is not supported yet.")
         self.axis = axis
 
@@ -217,6 +229,15 @@ def __init__(
                 )
         self.win_type = win_type
 
+        if on is not None:
+            raise NotImplementedError("on is currently not supported")
+        if closed not in (None, "right"):
+            raise NotImplementedError("closed is currently not supported")
+        if step is not None:
+            raise NotImplementedError("step is currently not supported")
+        if method != "single":
+            raise NotImplementedError("method is currently not supported")
+
     def __getitem__(self, arg):
         if isinstance(arg, tuple):
             arg = list(arg)
diff --git a/python/cudf/cudf/tests/test_dropna.py b/python/cudf/cudf/tests/test_dropna.py
index ed0cf0053ea..5b1ee0ffac6 100644
--- a/python/cudf/cudf/tests/test_dropna.py
+++ b/python/cudf/cudf/tests/test_dropna.py
@@ -284,3 +284,12 @@ def test_dropna_multiindex_2(data, how):
     got = gi.dropna(how)
 
     assert_eq(expect, got)
+
+
+def test_ignore_index():
+    pser = pd.Series([1, 2, np.nan], index=[2, 4, 1])
+    gser = cudf.from_pandas(pser)
+
+    result = pser.dropna(ignore_index=True)
+    expected = gser.dropna(ignore_index=True)
+    assert_eq(result, expected)

From 743e16426c564d0ed0d7e3d9be5f67e4605c4f32 Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Mon, 29 Jul 2024 14:19:43 -0500
Subject: [PATCH 603/842] update some branch references in GitHub Actions
 configs (#16397)

Fixes some lingering references to `branch-24.08` in the `pr_issue_status_automation` CI workflow.

This was missed when new branches were cut because that file ends in `.yml` and `update-version.sh` was only modifying files ending in `.yaml`. The corresponding `update-version.sh` changes were made in #16183 and are already on 24.10 thanks to forward mergers.

https://github.com/rapidsai/cudf/blob/dc05a01f3fc0742c5fbbddd86a0f2007bfdc2050/ci/release/update-version.sh#L78

## Notes for Reviewers

I checked like this, and don't see any other missed references:

```shell
git grep -E '24\.8|24\.08|0\.39'
```

Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)

URL: https://github.com/rapidsai/cudf/pull/16397
---
 .github/workflows/pr_issue_status_automation.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/pr_issue_status_automation.yml b/.github/workflows/pr_issue_status_automation.yml
index 8ca971dc28d..45e5191eb54 100644
--- a/.github/workflows/pr_issue_status_automation.yml
+++ b/.github/workflows/pr_issue_status_automation.yml
@@ -23,7 +23,7 @@ on:
 
 jobs:
     get-project-id:
-      uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@branch-24.08
+      uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@branch-24.10
       if: github.event.pull_request.state == 'open'
       secrets: inherit
       permissions:
@@ -34,7 +34,7 @@ jobs:
 
     update-status:
       # This job sets the PR and its linked issues to "In Progress" status
-      uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-24.08
+      uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-24.10
       if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }}
       needs: get-project-id
       with:
@@ -50,7 +50,7 @@ jobs:
 
     update-sprint:
       # This job sets the PR and its linked issues to the current "Weekly Sprint"
-      uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@branch-24.08
+      uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@branch-24.10
       if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }}
       needs: get-project-id
       with:

From f8eb63e499f94d583d715f5c1f5e6f234589be57 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 29 Jul 2024 12:39:19 -1000
Subject: [PATCH 604/842] Align Index APIs with pandas 2.x (#16361)

Similar to https://github.com/rapidsai/cudf/pull/16310, the follow APIs have been modified to adjust/add parameters

* `to_flat_index`
* `isin`
* `unique`
* `transpose`

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16361
---
 docs/cudf/source/conf.py                     |  5 ++++
 python/cudf/cudf/core/_base_index.py         | 25 ++++++++++++++++++--
 python/cudf/cudf/core/index.py               | 24 +++++++++++++++----
 python/cudf/cudf/core/multiindex.py          | 16 +++++++++++--
 python/cudf/cudf/core/series.py              |  8 -------
 python/cudf/cudf/core/single_column_frame.py |  7 ++++++
 python/cudf/cudf/tests/test_multiindex.py    |  9 +++++++
 7 files changed, 78 insertions(+), 16 deletions(-)

diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py
index f544536fb31..7421d9be298 100644
--- a/docs/cudf/source/conf.py
+++ b/docs/cudf/source/conf.py
@@ -561,6 +561,11 @@ def on_missing_reference(app, env, node, contnode):
     ("py:class", "ScalarLike"),
     ("py:class", "ParentType"),
     ("py:class", "ColumnLike"),
+    ("py:class", "ColumnLike"),
+    ("py:obj", "cudf.Index.transpose"),
+    ("py:obj", "cudf.Index.T"),
+    ("py:obj", "cudf.Index.to_flat_index"),
+    ("py:obj", "cudf.MultiIndex.to_flat_index"),
     # TODO: Remove this when we figure out why typing_extensions doesn't seem
     # to map types correctly for intersphinx
     ("py:class", "typing_extensions.Self"),
diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py
index 8fad82c5c46..c91514202c5 100644
--- a/python/cudf/cudf/core/_base_index.py
+++ b/python/cudf/cudf/core/_base_index.py
@@ -868,6 +868,24 @@ def to_numpy(self):
         """Convert to a numpy array."""
         raise NotImplementedError
 
+    def to_flat_index(self) -> Self:
+        """
+        Identity method.
+
+        This is implemented for compatibility with subclass implementations
+        when chaining.
+
+        Returns
+        -------
+        pd.Index
+            Caller.
+
+        See Also
+        --------
+        MultiIndex.to_flat_index : Subclass implementation.
+        """
+        return self
+
     def any(self):
         """
         Return whether any elements is True in Index.
@@ -945,7 +963,7 @@ def to_pandas(self, *, nullable: bool = False, arrow_type: bool = False):
         """
         raise NotImplementedError
 
-    def isin(self, values):
+    def isin(self, values, level=None):
         """Return a boolean array where the index values are in values.
 
         Compute boolean array of whether each index value is found in
@@ -956,6 +974,9 @@ def isin(self, values):
         ----------
         values : set, list-like, Index
             Sought values.
+        level : str or int, optional
+            Name or position of the index level to use (if the index is a
+            `MultiIndex`).
 
         Returns
         -------
@@ -979,7 +1000,7 @@ def isin(self, values):
         # ColumnBase.isin).
         raise NotImplementedError
 
-    def unique(self):
+    def unique(self, level: int | None = None):
         """
         Return unique values in the index.
 
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 1c48b8f4f2d..156cb973a9a 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -540,8 +540,12 @@ def memory_usage(self, deep: bool = False) -> int:
             )
         return 0
 
-    def unique(self) -> Self:
+    def unique(self, level: int | None = None) -> Self:
         # RangeIndex always has unique values
+        if level is not None and level > 0:
+            raise IndexError(
+                f"Too many levels: Index has only 1 level, not {level + 1}"
+            )
         return self.copy()
 
     @_performance_tracking
@@ -964,7 +968,11 @@ def _indices_of(self, value) -> cudf.core.column.NumericalColumn:
             i = []
         return as_column(i, dtype=size_type_dtype)
 
-    def isin(self, values):
+    def isin(self, values, level=None):
+        if level is not None and level > 0:
+            raise IndexError(
+                f"Too many levels: Index has only 1 level, not {level + 1}"
+            )
         if is_scalar(values):
             raise TypeError(
                 "only list-like objects are allowed to be passed "
@@ -1616,12 +1624,20 @@ def append(self, other):
 
         return self._concat(to_concat)
 
-    def unique(self):
+    def unique(self, level: int | None = None) -> Self:
+        if level is not None and level > 0:
+            raise IndexError(
+                f"Too many levels: Index has only 1 level, not {level + 1}"
+            )
         return cudf.core.index._index_from_data(
             {self.name: self._values.unique()}, name=self.name
         )
 
-    def isin(self, values):
+    def isin(self, values, level=None):
+        if level is not None and level > 0:
+            raise IndexError(
+                f"Too many levels: Index has only 1 level, not {level + 1}"
+            )
         if is_scalar(values):
             raise TypeError(
                 "only list-like objects are allowed to be passed "
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index 0e1fddd7ed5..2788455aebf 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -1156,6 +1156,15 @@ def from_tuples(cls, tuples, sortorder: int | None = None, names=None):
     def to_numpy(self):
         return self.values_host
 
+    def to_flat_index(self):
+        """
+        Convert a MultiIndex to an Index of Tuples containing the level values.
+
+        This is not currently implemented
+        """
+        # TODO: Could implement as Index of ListDtype?
+        raise NotImplementedError("to_flat_index is not currently supported.")
+
     @property  # type: ignore
     @_performance_tracking
     def values_host(self):
@@ -1734,8 +1743,11 @@ def fillna(self, value):
         return super().fillna(value=value)
 
     @_performance_tracking
-    def unique(self):
-        return self.drop_duplicates(keep="first")
+    def unique(self, level: int | None = None) -> Self | cudf.Index:
+        if level is None:
+            return self.drop_duplicates(keep="first")
+        else:
+            return self.get_level_values(level).unique()
 
     @_performance_tracking
     def nunique(self, dropna: bool = True) -> int:
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 8277ccf68fc..10ac1fdfc1e 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -2775,14 +2775,6 @@ def cov(self, other, min_periods=None, ddof: int | None = None):
                 f"{other.dtype}"
             )
 
-    @_performance_tracking
-    def transpose(self):
-        """Return the transpose, which is by definition self."""
-
-        return self
-
-    T = property(transpose, doc=transpose.__doc__)
-
     @_performance_tracking
     def duplicated(self, keep="first"):
         """
diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py
index b93528f9693..a5ff1223791 100644
--- a/python/cudf/cudf/core/single_column_frame.py
+++ b/python/cudf/cudf/core/single_column_frame.py
@@ -389,3 +389,10 @@ def where(self, cond, other=None, inplace=False):
         result = cudf._lib.copying.copy_if_else(input_col, other, cond)
 
         return _make_categorical_like(result, self_column)
+
+    @_performance_tracking
+    def transpose(self):
+        """Return the transpose, which is by definition self."""
+        return self
+
+    T = property(transpose, doc=transpose.__doc__)
diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py
index 2c00d48266c..b7314a36e73 100644
--- a/python/cudf/cudf/tests/test_multiindex.py
+++ b/python/cudf/cudf/tests/test_multiindex.py
@@ -2170,3 +2170,12 @@ def test_bool_raises():
         lfunc_args_and_kwargs=[[cudf.MultiIndex.from_arrays([range(1)])]],
         rfunc_args_and_kwargs=[[pd.MultiIndex.from_arrays([range(1)])]],
     )
+
+
+def test_unique_level():
+    pd_mi = pd.MultiIndex.from_arrays([[1, 1, 2], [3, 3, 2]])
+    cudf_mi = cudf.MultiIndex.from_pandas(pd_mi)
+
+    result = pd_mi.unique(level=1)
+    expected = cudf_mi.unique(level=1)
+    assert_eq(result, expected)

From bd302d773c50552531bc7f11f782f8ed876e8fab Mon Sep 17 00:00:00 2001
From: Nghia Truong <7416935+ttnghia@users.noreply.github.com>
Date: Mon, 29 Jul 2024 17:07:33 -0700
Subject: [PATCH 605/842] Support thread-safe for `prefetch_config::get` and
 `prefetch_config::set` (#16425)

This adds muti-thread support for `prefetch_config` getter and setter
functions. This avoid the issue that the config map is corrupted in
multi-thread environments.

Closes https://github.com/rapidsai/cudf/issues/16426.

---------

Signed-off-by: Nghia Truong <nghiat@nvidia.com>
---
 cpp/include/cudf/utilities/prefetch.hpp |  6 ++++++
 cpp/src/utilities/prefetch.cpp          | 15 +++++++++------
 2 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/cpp/include/cudf/utilities/prefetch.hpp b/cpp/include/cudf/utilities/prefetch.hpp
index 49fca73a2c8..3384181fc37 100644
--- a/cpp/include/cudf/utilities/prefetch.hpp
+++ b/cpp/include/cudf/utilities/prefetch.hpp
@@ -21,6 +21,7 @@
 #include <rmm/device_uvector.hpp>
 
 #include <map>
+#include <shared_mutex>
 #include <string>
 #include <string_view>
 
@@ -47,6 +48,8 @@ class prefetch_config {
   /**
    * @brief Get the value of a configuration key.
    *
+   * If the key does not exist, a `false` value will be returned.
+   *
    * @param key The configuration key.
    * @return The value of the configuration key.
    */
@@ -54,6 +57,8 @@ class prefetch_config {
   /**
    * @brief Set the value of a configuration key.
    *
+   * This is a thread-safe operation.
+   *
    * @param key The configuration key.
    * @param value The value to set.
    */
@@ -68,6 +73,7 @@ class prefetch_config {
  private:
   prefetch_config() = default;                //< Private constructor to enforce singleton pattern
   std::map<std::string, bool> config_values;  //< Map of configuration keys to values
+  std::shared_mutex config_mtx;               //< Mutex for thread-safe config access
 };
 
 /**
diff --git a/cpp/src/utilities/prefetch.cpp b/cpp/src/utilities/prefetch.cpp
index 16f2c3a1202..86d6cc00764 100644
--- a/cpp/src/utilities/prefetch.cpp
+++ b/cpp/src/utilities/prefetch.cpp
@@ -34,13 +34,16 @@ prefetch_config& prefetch_config::instance()
 
 bool prefetch_config::get(std::string_view key)
 {
-  // Default to not prefetching
-  if (config_values.find(key.data()) == config_values.end()) {
-    return (config_values[key.data()] = false);
-  }
-  return config_values[key.data()];
+  std::shared_lock<std::shared_mutex> lock(config_mtx);
+  auto const it = config_values.find(key.data());
+  return it == config_values.end() ? false : it->second;  // default to not prefetching
+}
+
+void prefetch_config::set(std::string_view key, bool value)
+{
+  std::lock_guard<std::shared_mutex> lock(config_mtx);
+  config_values[key.data()] = value;
 }
-void prefetch_config::set(std::string_view key, bool value) { config_values[key.data()] = value; }
 
 cudaError_t prefetch_noexcept(std::string_view key,
                               void const* ptr,

From 368a34ca9fd7db1b6cfb6e7817978e3e4fcfb00b Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Mon, 29 Jul 2024 20:05:17 -0500
Subject: [PATCH 606/842] Use RMM adaptor constructors instead of factories.
 (#16414)

This PR uses RMM memory resource adaptor constructors instead of factory functions. With CTAD, we do not need the factory and can use the constructor directly. The factory will be deprecated in https://github.com/rapidsai/rmm/pull/1626.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Jayjeet Chakraborty (https://github.com/JayjeetAtGithub)

URL: https://github.com/rapidsai/cudf/pull/16414
---
 cpp/benchmarks/fixture/benchmark_fixture.hpp         |  2 +-
 .../cudf_test/stream_checking_resource_adaptor.hpp   | 12 ------------
 cpp/include/cudf_test/testing_main.hpp               |  2 +-
 java/src/main/native/src/RmmJni.cpp                  |  7 -------
 4 files changed, 2 insertions(+), 21 deletions(-)

diff --git a/cpp/benchmarks/fixture/benchmark_fixture.hpp b/cpp/benchmarks/fixture/benchmark_fixture.hpp
index 8c8d6756b00..8900899f9be 100644
--- a/cpp/benchmarks/fixture/benchmark_fixture.hpp
+++ b/cpp/benchmarks/fixture/benchmark_fixture.hpp
@@ -107,7 +107,7 @@ class memory_stats_logger {
  public:
   memory_stats_logger()
     : existing_mr(rmm::mr::get_current_device_resource()),
-      statistics_mr(rmm::mr::make_statistics_adaptor(existing_mr))
+      statistics_mr(rmm::mr::statistics_resource_adaptor(existing_mr))
   {
     rmm::mr::set_current_device_resource(&statistics_mr);
   }
diff --git a/cpp/include/cudf_test/stream_checking_resource_adaptor.hpp b/cpp/include/cudf_test/stream_checking_resource_adaptor.hpp
index 4f3c723d195..417bbb3d9ab 100644
--- a/cpp/include/cudf_test/stream_checking_resource_adaptor.hpp
+++ b/cpp/include/cudf_test/stream_checking_resource_adaptor.hpp
@@ -156,16 +156,4 @@ class stream_checking_resource_adaptor final : public rmm::mr::device_memory_res
                                // cudf::test::get_default_stream() is observed.
 };
 
-/**
- * @brief Convenience factory to return a `stream_checking_resource_adaptor` around the
- * upstream resource `upstream`.
- *
- * @param upstream Reference to the upstream resource
- */
-inline stream_checking_resource_adaptor make_stream_checking_resource_adaptor(
-  rmm::device_async_resource_ref upstream, bool error_on_invalid_stream, bool check_default_stream)
-{
-  return stream_checking_resource_adaptor{upstream, error_on_invalid_stream, check_default_stream};
-}
-
 }  // namespace cudf::test
diff --git a/cpp/include/cudf_test/testing_main.hpp b/cpp/include/cudf_test/testing_main.hpp
index 9866253a9f8..ed83ddabb00 100644
--- a/cpp/include/cudf_test/testing_main.hpp
+++ b/cpp/include/cudf_test/testing_main.hpp
@@ -183,7 +183,7 @@ inline auto make_stream_mode_adaptor(cxxopts::ParseResult const& cmd_opts)
   auto const stream_error_mode       = cmd_opts["stream_error_mode"].as<std::string>();
   auto const error_on_invalid_stream = (stream_error_mode == "error");
   auto const check_default_stream    = (stream_mode == "new_cudf_default");
-  auto adaptor                       = cudf::test::make_stream_checking_resource_adaptor(
+  auto adaptor                       = cudf::test::stream_checking_resource_adaptor(
     resource, error_on_invalid_stream, check_default_stream);
   if ((stream_mode == "new_cudf_default") || (stream_mode == "new_testing_default")) {
     rmm::mr::set_current_device_resource(&adaptor);
diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp
index 5842a980fc4..09c04a77590 100644
--- a/java/src/main/native/src/RmmJni.cpp
+++ b/java/src/main/native/src/RmmJni.cpp
@@ -154,13 +154,6 @@ class tracking_resource_adaptor final : public base_tracking_resource_adaptor {
   }
 };
 
-template <typename Upstream>
-tracking_resource_adaptor<Upstream>* make_tracking_adaptor(Upstream* upstream,
-                                                           std::size_t size_alignment)
-{
-  return new tracking_resource_adaptor<Upstream>{upstream, size_alignment};
-}
-
 /**
  * @brief An RMM device memory resource adaptor that delegates to the wrapped resource
  * for most operations but will call Java to handle certain situations (e.g.: allocation failure).

From d1be0b6dc06fddd0b69fb69731281b16894cb132 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 29 Jul 2024 15:12:38 -1000
Subject: [PATCH 607/842] Align CategoricalIndex APIs with pandas 2.x (#16369)

Mostly exposing methods that were available on the CategoricalColumn

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16369
---
 python/cudf/cudf/core/column/categorical.py | 130 +++++++++++---------
 python/cudf/cudf/core/index.py              | 116 +++++++++++++++++
 python/cudf/cudf/tests/test_categorical.py  |  56 +++++++++
 3 files changed, 247 insertions(+), 55 deletions(-)

diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index 9aaccca349d..9433a91b9c6 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -262,37 +262,10 @@ def add_categories(self, new_categories: Any) -> SeriesOrIndex | None:
         dtype: category
         Categories (2, int64): [1, 2]
         """
-        old_categories = self._column.categories
-        new_categories = column.as_column(
-            new_categories,
-            dtype=old_categories.dtype if len(new_categories) == 0 else None,
-        )
-
-        if is_mixed_with_object_dtype(old_categories, new_categories):
-            raise TypeError(
-                f"cudf does not support adding categories with existing "
-                f"categories of dtype `{old_categories.dtype}` and new "
-                f"categories of dtype `{new_categories.dtype}`, please "
-                f"type-cast new_categories to the same type as "
-                f"existing categories."
-            )
-        common_dtype = find_common_type(
-            [old_categories.dtype, new_categories.dtype]
+        return self._return_or_inplace(
+            self._column.add_categories(new_categories=new_categories)
         )
 
-        new_categories = new_categories.astype(common_dtype)
-        old_categories = old_categories.astype(common_dtype)
-
-        if old_categories.isin(new_categories).any():
-            raise ValueError("new categories must not include old categories")
-
-        new_categories = old_categories.append(new_categories)
-        out_col = self._column
-        if not out_col._categories_equal(new_categories):
-            out_col = out_col._set_categories(new_categories)
-
-        return self._return_or_inplace(out_col)
-
     def remove_categories(
         self,
         removals: Any,
@@ -349,23 +322,9 @@ def remove_categories(
         dtype: category
         Categories (3, int64): [1, 2, 10]
         """
-
-        cats = self.categories.to_series()
-        removals = cudf.Series(removals, dtype=cats.dtype)
-        removals_mask = removals.isin(cats)
-
-        # ensure all the removals are in the current categories
-        # list. If not, raise an error to match Pandas behavior
-        if not removals_mask.all():
-            vals = removals[~removals_mask].to_numpy()
-            raise ValueError(f"removals must all be in old categories: {vals}")
-
-        new_categories = cats[~cats.isin(removals)]._column
-        out_col = self._column
-        if not out_col._categories_equal(new_categories):
-            out_col = out_col._set_categories(new_categories)
-
-        return self._return_or_inplace(out_col)
+        return self._return_or_inplace(
+            self._column.remove_categories(removals=removals)
+        )
 
     def set_categories(
         self,
@@ -1319,7 +1278,7 @@ def _set_categories(
         new_categories: Any,
         is_unique: bool = False,
         ordered: bool = False,
-    ) -> CategoricalColumn:
+    ) -> Self:
         """Returns a new CategoricalColumn with the categories set to the
         specified *new_categories*.
 
@@ -1376,17 +1335,68 @@ def _set_categories(
         new_codes = df._data["new_codes"]
 
         # codes can't have masks, so take mask out before moving in
-        return column.build_categorical_column(
-            categories=new_cats,
-            codes=column.build_column(
-                new_codes.base_data, dtype=new_codes.dtype
+        return cast(
+            Self,
+            column.build_categorical_column(
+                categories=new_cats,
+                codes=column.build_column(
+                    new_codes.base_data, dtype=new_codes.dtype
+                ),
+                mask=new_codes.base_mask,
+                size=new_codes.size,
+                offset=new_codes.offset,
+                ordered=ordered,
             ),
-            mask=new_codes.base_mask,
-            size=new_codes.size,
-            offset=new_codes.offset,
-            ordered=ordered,
         )
 
+    def add_categories(self, new_categories: Any) -> Self:
+        old_categories = self.categories
+        new_categories = column.as_column(
+            new_categories,
+            dtype=old_categories.dtype if len(new_categories) == 0 else None,
+        )
+        if is_mixed_with_object_dtype(old_categories, new_categories):
+            raise TypeError(
+                f"cudf does not support adding categories with existing "
+                f"categories of dtype `{old_categories.dtype}` and new "
+                f"categories of dtype `{new_categories.dtype}`, please "
+                f"type-cast new_categories to the same type as "
+                f"existing categories."
+            )
+        common_dtype = find_common_type(
+            [old_categories.dtype, new_categories.dtype]
+        )
+
+        new_categories = new_categories.astype(common_dtype)
+        old_categories = old_categories.astype(common_dtype)
+
+        if old_categories.isin(new_categories).any():
+            raise ValueError("new categories must not include old categories")
+
+        new_categories = old_categories.append(new_categories)
+        if not self._categories_equal(new_categories):
+            return self._set_categories(new_categories)
+        return self
+
+    def remove_categories(
+        self,
+        removals: Any,
+    ) -> Self:
+        removals = column.as_column(removals).astype(self.categories.dtype)
+        removals_mask = removals.isin(self.categories)
+
+        # ensure all the removals are in the current categories
+        # list. If not, raise an error to match Pandas behavior
+        if not removals_mask.all():
+            raise ValueError("removals must all be in old categories")
+
+        new_categories = self.categories.apply_boolean_mask(
+            self.categories.isin(removals).unary_operator("not")
+        )
+        if not self._categories_equal(new_categories):
+            return self._set_categories(new_categories)
+        return self
+
     def reorder_categories(
         self,
         new_categories: Any,
@@ -1404,6 +1414,16 @@ def reorder_categories(
             )
         return self._set_categories(new_categories, ordered=ordered)
 
+    def rename_categories(self, new_categories) -> CategoricalColumn:
+        raise NotImplementedError(
+            "rename_categories is currently not supported."
+        )
+
+    def remove_unused_categories(self) -> Self:
+        raise NotImplementedError(
+            "remove_unused_categories is currently not supported."
+        )
+
     def as_ordered(self, ordered: bool):
         if self.dtype.ordered == ordered:
             return self
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 156cb973a9a..8c3b091abec 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -2721,6 +2721,10 @@ def __init__(
             data = data.as_ordered(ordered=False)
         super().__init__(data, name=name)
 
+    @property
+    def ordered(self) -> bool:
+        return self._column.ordered
+
     @property  # type: ignore
     @_performance_tracking
     def codes(self):
@@ -2743,6 +2747,118 @@ def _is_boolean(self):
     def _is_categorical(self):
         return True
 
+    def add_categories(self, new_categories) -> Self:
+        """
+        Add new categories.
+
+        `new_categories` will be included at the last/highest place in the
+        categories and will be unused directly after this call.
+        """
+        return type(self)._from_data(
+            {self.name: self._column.add_categories(new_categories)}
+        )
+
+    def as_ordered(self) -> Self:
+        """
+        Set the Categorical to be ordered.
+        """
+        return type(self)._from_data(
+            {self.name: self._column.as_ordered(ordered=True)}
+        )
+
+    def as_unordered(self) -> Self:
+        """
+        Set the Categorical to be unordered.
+        """
+        return type(self)._from_data(
+            {self.name: self._column.as_ordered(ordered=False)}
+        )
+
+    def remove_categories(self, removals) -> Self:
+        """
+        Remove the specified categories.
+
+        `removals` must be included in the old categories.
+
+        Parameters
+        ----------
+        removals : category or list of categories
+           The categories which should be removed.
+        """
+        return type(self)._from_data(
+            {self.name: self._column.remove_categories(removals)}
+        )
+
+    def remove_unused_categories(self) -> Self:
+        """
+        Remove categories which are not used.
+
+        This method is currently not supported.
+        """
+        return type(self)._from_data(
+            {self.name: self._column.remove_unused_categories()}
+        )
+
+    def rename_categories(self, new_categories) -> Self:
+        """
+        Rename categories.
+
+        This method is currently not supported.
+        """
+        return type(self)._from_data(
+            {self.name: self._column.rename_categories(new_categories)}
+        )
+
+    def reorder_categories(self, new_categories, ordered=None) -> Self:
+        """
+        Reorder categories as specified in new_categories.
+
+        ``new_categories`` need to include all old categories and no new category
+        items.
+
+        Parameters
+        ----------
+        new_categories : Index-like
+           The categories in new order.
+        ordered : bool, optional
+           Whether or not the categorical is treated as a ordered categorical.
+           If not given, do not change the ordered information.
+        """
+        return type(self)._from_data(
+            {
+                self.name: self._column.reorder_categories(
+                    new_categories, ordered=ordered
+                )
+            }
+        )
+
+    def set_categories(
+        self, new_categories, ordered=None, rename: bool = False
+    ) -> Self:
+        """
+        Set the categories to the specified new_categories.
+
+        Parameters
+        ----------
+        new_categories : list-like
+            The categories in new order.
+        ordered : bool, default None
+            Whether or not the categorical is treated as
+            a ordered categorical. If not given, do
+            not change the ordered information.
+        rename : bool, default False
+            Whether or not the `new_categories` should be
+            considered as a rename of the old categories
+            or as reordered categories.
+        """
+        return type(self)._from_data(
+            {
+                self.name: self._column.set_categories(
+                    new_categories, ordered=ordered, rename=rename
+                )
+            }
+        )
+
 
 @_performance_tracking
 def interval_range(
diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py
index 9b6029582ce..ae58af8ebce 100644
--- a/python/cudf/cudf/tests/test_categorical.py
+++ b/python/cudf/cudf/tests/test_categorical.py
@@ -891,3 +891,59 @@ def test_categorical_maxima(op):
     result = getattr(ser.cat.as_ordered(), op)()
     result_pd = getattr(ser_pd.cat.as_ordered(), op)()
     assert_eq(result, result_pd)
+
+
+@pytest.mark.parametrize("ordered", [True, False])
+def test_index_ordered(ordered):
+    pd_ci = pd.CategoricalIndex([1, 2, 3], ordered=ordered)
+    cudf_ci = cudf.from_pandas(pd_ci)
+    assert pd_ci.ordered == cudf_ci.ordered
+
+
+@pytest.mark.parametrize("method", ["as_ordered", "as_unordered"])
+@pytest.mark.parametrize("ordered", [True, False])
+def test_index_as_ordered(method, ordered):
+    pd_ci = pd.CategoricalIndex([1, 2, 3], ordered=ordered)
+    cudf_ci = cudf.from_pandas(pd_ci)
+
+    expected = getattr(pd_ci, method)()
+    result = getattr(cudf_ci, method)()
+    assert_eq(result, expected)
+
+
+def test_index_add_categories():
+    pd_ci = pd.CategoricalIndex([1, 2, 3])
+    cudf_ci = cudf.from_pandas(pd_ci)
+
+    expected = pd_ci.add_categories([4])
+    result = cudf_ci.add_categories([4])
+    assert_eq(result, expected)
+
+
+def test_index_remove_categories():
+    pd_ci = pd.CategoricalIndex([1, 2, 3], categories=[1, 2, 3, 4])
+    cudf_ci = cudf.from_pandas(pd_ci)
+
+    expected = pd_ci.remove_categories([4])
+    result = cudf_ci.remove_categories([4])
+    assert_eq(result, expected)
+
+
+@pytest.mark.parametrize("ordered", [True, False])
+def test_index_reorder_categories(ordered):
+    pd_ci = pd.CategoricalIndex([1, 2, 3], categories=[1, 3, 2, 4])
+    cudf_ci = cudf.from_pandas(pd_ci)
+
+    expected = pd_ci.reorder_categories([1, 2, 3, 4], ordered=ordered)
+    result = cudf_ci.reorder_categories([1, 2, 3, 4], ordered=ordered)
+    assert_eq(result, expected)
+
+
+@pytest.mark.parametrize("ordered", [True, False])
+def test_index_set_categories(ordered):
+    pd_ci = pd.CategoricalIndex([1, 2, 3])
+    cudf_ci = cudf.from_pandas(pd_ci)
+
+    expected = pd_ci.set_categories([1, 2, 3, 4], ordered=ordered)
+    result = cudf_ci.set_categories([1, 2, 3, 4], ordered=ordered)
+    assert_eq(result, expected)

From 5feeaf3827bfd20755cdd0516ef0c6ba484a600c Mon Sep 17 00:00:00 2001
From: "Richard (Rick) Zamora" <rzamora217@gmail.com>
Date: Tue, 30 Jul 2024 08:02:01 -0500
Subject: [PATCH 608/842] [Bug] Remove loud `NativeFile` deprecation noise for
 `read_parquet` from S3 (#16415)

Important follow-up to https://github.com/rapidsai/cudf/pull/16132

Without this PR, using `dask_cudf.read_parquet("s3://...", ...)` will
result in loud deprecation warnings after `compute`/`persist` is called.
This is because dask will always pass `NativeFile` objects down to cudf.

My fault for missing this earlier!
---
 python/dask_cudf/dask_cudf/io/parquet.py      | 76 +++++++++-------
 .../dask_cudf/dask_cudf/io/tests/test_s3.py   | 86 ++++++++++++++++++-
 2 files changed, 128 insertions(+), 34 deletions(-)

diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py
index 810a804e428..f0cab953458 100644
--- a/python/dask_cudf/dask_cudf/io/parquet.py
+++ b/python/dask_cudf/dask_cudf/io/parquet.py
@@ -33,6 +33,7 @@
     _is_local_filesystem,
     _open_remote_files,
 )
+from cudf.utils.utils import maybe_filter_deprecation
 
 
 class CudfEngine(ArrowDatasetEngine):
@@ -110,39 +111,50 @@ def _read_paths(
                     ),
                 )
 
-            # Use cudf to read in data
-            try:
-                df = cudf.read_parquet(
-                    paths_or_fobs,
-                    engine="cudf",
-                    columns=columns,
-                    row_groups=row_groups if row_groups else None,
-                    dataset_kwargs=dataset_kwargs,
-                    categorical_partitions=False,
-                    **kwargs,
-                )
-            except RuntimeError as err:
-                # TODO: Remove try/except after null-schema issue is resolved
-                # (See: https://github.com/rapidsai/cudf/issues/12702)
-                if len(paths_or_fobs) > 1:
-                    df = cudf.concat(
-                        [
-                            cudf.read_parquet(
-                                pof,
-                                engine="cudf",
-                                columns=columns,
-                                row_groups=row_groups[i]
-                                if row_groups
-                                else None,
-                                dataset_kwargs=dataset_kwargs,
-                                categorical_partitions=False,
-                                **kwargs,
-                            )
-                            for i, pof in enumerate(paths_or_fobs)
-                        ]
+            # Filter out deprecation warning unless the user
+            # specifies open_file_options and/or use_python_file_object.
+            # Otherwise, the FutureWarning is out of their control.
+            with maybe_filter_deprecation(
+                (
+                    not open_file_options
+                    and "use_python_file_object" not in kwargs
+                ),
+                message="Support for reading pyarrow's NativeFile is deprecated",
+                category=FutureWarning,
+            ):
+                # Use cudf to read in data
+                try:
+                    df = cudf.read_parquet(
+                        paths_or_fobs,
+                        engine="cudf",
+                        columns=columns,
+                        row_groups=row_groups if row_groups else None,
+                        dataset_kwargs=dataset_kwargs,
+                        categorical_partitions=False,
+                        **kwargs,
                     )
-                else:
-                    raise err
+                except RuntimeError as err:
+                    # TODO: Remove try/except after null-schema issue is resolved
+                    # (See: https://github.com/rapidsai/cudf/issues/12702)
+                    if len(paths_or_fobs) > 1:
+                        df = cudf.concat(
+                            [
+                                cudf.read_parquet(
+                                    pof,
+                                    engine="cudf",
+                                    columns=columns,
+                                    row_groups=row_groups[i]
+                                    if row_groups
+                                    else None,
+                                    dataset_kwargs=dataset_kwargs,
+                                    categorical_partitions=False,
+                                    **kwargs,
+                                )
+                                for i, pof in enumerate(paths_or_fobs)
+                            ]
+                        )
+                    else:
+                        raise err
 
         # Apply filters (if any are defined)
         df = _apply_post_filters(df, filters)
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_s3.py b/python/dask_cudf/dask_cudf/io/tests/test_s3.py
index 3947c69aaa5..ac3245b3748 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_s3.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_s3.py
@@ -9,6 +9,8 @@
 import pyarrow.fs as pa_fs
 import pytest
 
+from dask.dataframe import assert_eq
+
 import dask_cudf
 
 moto = pytest.importorskip("moto", minversion="3.1.6")
@@ -102,6 +104,11 @@ def s3_context(s3_base, bucket, files=None):
                 pass
 
 
+@pytest.fixture
+def pdf(scope="module"):
+    return pd.DataFrame({"a": [1, 2, 3, 4], "b": [2.1, 2.2, 2.3, 2.4]})
+
+
 def test_read_csv(s3_base, s3so):
     with s3_context(
         s3_base=s3_base, bucket="daskcsv", files={"a.csv": b"a,b\n1,2\n3,4\n"}
@@ -112,6 +119,22 @@ def test_read_csv(s3_base, s3so):
         assert df.a.sum().compute() == 4
 
 
+def test_read_csv_warns(s3_base, s3so):
+    with s3_context(
+        s3_base=s3_base,
+        bucket="daskcsv_warns",
+        files={"a.csv": b"a,b\n1,2\n3,4\n"},
+    ):
+        with pytest.warns(FutureWarning):
+            df = dask_cudf.read_csv(
+                "s3://daskcsv_warns/*.csv",
+                blocksize="50 B",
+                storage_options=s3so,
+                use_python_file_object=True,
+            )
+            assert df.a.sum().compute() == 4
+
+
 @pytest.mark.parametrize(
     "open_file_options",
     [
@@ -120,8 +143,7 @@ def test_read_csv(s3_base, s3so):
         {"open_file_func": None},
     ],
 )
-def test_read_parquet(s3_base, s3so, open_file_options):
-    pdf = pd.DataFrame({"a": [1, 2, 3, 4], "b": [2.1, 2.2, 2.3, 2.4]})
+def test_read_parquet_open_file_options(s3_base, s3so, open_file_options, pdf):
     buffer = BytesIO()
     pdf.to_parquet(path=buffer)
     buffer.seek(0)
@@ -142,3 +164,63 @@ def test_read_parquet(s3_base, s3so, open_file_options):
             assert df.a.sum().compute() == 10
         with pytest.warns(FutureWarning):
             assert df.b.sum().compute() == 9
+
+
+def test_read_parquet(s3_base, s3so, pdf):
+    fname = "test_parquet_reader_dask.parquet"
+    bucket = "parquet"
+    buffer = BytesIO()
+    pdf.to_parquet(path=buffer)
+    buffer.seek(0)
+    with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}):
+        got = dask_cudf.read_parquet(
+            f"s3://{bucket}/{fname}",
+            storage_options=s3so,
+        )
+        assert_eq(pdf, got)
+
+
+def test_read_parquet_use_python_file_object(s3_base, s3so, pdf):
+    fname = "test_parquet_use_python_file_object.parquet"
+    bucket = "parquet"
+    buffer = BytesIO()
+    pdf.to_parquet(path=buffer)
+    buffer.seek(0)
+    with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}):
+        with pytest.warns(FutureWarning):
+            got = dask_cudf.read_parquet(
+                f"s3://{bucket}/{fname}",
+                storage_options=s3so,
+                read={"use_python_file_object": True},
+            ).head()
+            assert_eq(pdf, got)
+
+
+def test_read_orc(s3_base, s3so, pdf):
+    fname = "test_orc_reader_dask.orc"
+    bucket = "orc"
+    buffer = BytesIO()
+    pdf.to_orc(path=buffer)
+    buffer.seek(0)
+    with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}):
+        got = dask_cudf.read_orc(
+            f"s3://{bucket}/{fname}",
+            storage_options=s3so,
+        )
+        assert_eq(pdf, got)
+
+
+def test_read_orc_use_python_file_object(s3_base, s3so, pdf):
+    fname = "test_orc_use_python_file_object.orc"
+    bucket = "orc"
+    buffer = BytesIO()
+    pdf.to_orc(path=buffer)
+    buffer.seek(0)
+    with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}):
+        with pytest.warns(FutureWarning):
+            got = dask_cudf.read_orc(
+                f"s3://{bucket}/{fname}",
+                storage_options=s3so,
+                use_python_file_object=True,
+            ).head()
+            assert_eq(pdf, got)

From 0f07b0bb5e2cc89ca66e9d9639ff6ac961ec0471 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Tue, 30 Jul 2024 08:02:21 -0500
Subject: [PATCH 609/842] Enable prefetching before `runpy` (#16427)

This PR enables prefetching before we execute the `runpy` module and
script code.
---
 python/cudf/cudf/pandas/__main__.py | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/python/cudf/cudf/pandas/__main__.py b/python/cudf/cudf/pandas/__main__.py
index d4cb42d4c0b..591744ce793 100644
--- a/python/cudf/cudf/pandas/__main__.py
+++ b/python/cudf/cudf/pandas/__main__.py
@@ -73,6 +73,16 @@ def main():
     args = parser.parse_args()
 
     rmm_mode = install()
+    if "managed" in rmm_mode:
+        for key in {
+            "column_view::get_data",
+            "mutable_column_view::get_data",
+            "gather",
+            "hash_join",
+        }:
+            from cudf._lib import pylibcudf
+
+            pylibcudf.experimental.enable_prefetching(key)
     with profile(args.profile, args.line_profile, args.args[0]) as fn:
         args.args[0] = fn
         if args.module:
@@ -86,17 +96,6 @@ def main():
             sys.argv[:] = args.args
             runpy.run_path(args.args[0], run_name="__main__")
 
-    if "managed" in rmm_mode:
-        for key in {
-            "column_view::get_data",
-            "mutable_column_view::get_data",
-            "gather",
-            "hash_join",
-        }:
-            from cudf._lib import pylibcudf
-
-            pylibcudf.experimental.enable_prefetching(key)
-
 
 if __name__ == "__main__":
     main()

From dbf4bd02a8fdccd1891edbc2d049c3ddddb234b3 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Tue, 30 Jul 2024 12:14:14 -0500
Subject: [PATCH 610/842] Add about rmm modes in `cudf.pandas` docs (#16404)

This PR adds user facing docs for rmm memory modes and prefetching.

---------

Co-authored-by: Mark Harris <783069+harrism@users.noreply.github.com>
Co-authored-by: Bradley Dice <bdice@bradleydice.com>
---
 docs/cudf/source/cudf_pandas/how-it-works.md | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/docs/cudf/source/cudf_pandas/how-it-works.md b/docs/cudf/source/cudf_pandas/how-it-works.md
index 75f57742ac9..8efd9d7e063 100644
--- a/docs/cudf/source/cudf_pandas/how-it-works.md
+++ b/docs/cudf/source/cudf_pandas/how-it-works.md
@@ -36,3 +36,19 @@ transfers.
 When using `cudf.pandas`, cuDF's [pandas compatibility
 mode](api.options) is automatically enabled, ensuring consistency with
 pandas-specific semantics like default sort ordering.
+
+`cudf.pandas` uses a managed memory pool by default. This allows `cudf.pandas` to process datasets larger than the memory of the GPU it is running on. Managed memory prefetching is also enabled by default to improve memory access performance. For more information on CUDA Unified Memory (managed memory), performance, and prefetching, see [this NVIDIA Developer blog post](https://developer.nvidia.com/blog/improving-gpu-memory-oversubscription-performance/).
+
+Pool allocators improve allocation performance. Without using one, memory
+allocation may be a bottleneck depending on the workload. Managed memory
+enables oversubscribing GPU memory. This allows cudf.pandas to process
+data larger than GPU memory in many cases, without CPU (Pandas) fallback.
+
+Other memory allocators can be used by changing the environment
+variable `CUDF_PANDAS_RMM_MODE` to one of the following.
+
+1. "managed_pool" (default): CUDA Unified Memory (managed memory) with RMM's asynchronous pool allocator.
+2. "managed": CUDA Unified Memory, (managed memory) with no pool allocator.
+3. "async": CUDA's built-in pool asynchronous pool allocator with normal CUDA device memory.
+4. "pool": RMM's asynchronous pool allocator with normal CUDA device memory.
+5. "cuda": normal CUDA device memory with no pool allocator.

From 8def2ec1acac6a538002db011d977bb22cfbda82 Mon Sep 17 00:00:00 2001
From: Jason Lowe <jlowe@nvidia.com>
Date: Tue, 30 Jul 2024 14:34:59 -0500
Subject: [PATCH 611/842] Add Java APIs to copy column data to host
 asynchronously (#16429)

Adds Java methods to ColumnView to allow copying of column data to host memory asynchronously.  This can be used to avoid many unnecessary stream synchronization when copying many columns to the host.

Authors:
  - Jason Lowe (https://github.com/jlowe)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Robert (Bobby) Evans (https://github.com/revans2)

URL: https://github.com/rapidsai/cudf/pull/16429
---
 .../main/java/ai/rapids/cudf/ColumnView.java  | 52 +++++++++++++------
 .../java/ai/rapids/cudf/HostColumnVector.java |  4 ++
 .../ai/rapids/cudf/HostColumnVectorCore.java  |  4 +-
 .../ai/rapids/cudf/JCudfSerialization.java    |  5 +-
 4 files changed, 45 insertions(+), 20 deletions(-)

diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java
index 997ff77bae3..8ff2f0f0a73 100644
--- a/java/src/main/java/ai/rapids/cudf/ColumnView.java
+++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java
@@ -1,6 +1,6 @@
 /*
  *
- *  Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ *  Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -5034,8 +5034,8 @@ private static NestedColumnVector createNestedColumnVector(DType type, long rows
   // DATA MOVEMENT
   /////////////////////////////////////////////////////////////////////////////
 
-  private static HostColumnVectorCore copyToHostNestedHelper(
-      ColumnView deviceCvPointer, HostMemoryAllocator hostMemoryAllocator) {
+  private static HostColumnVectorCore copyToHostAsyncNestedHelper(
+      Cuda.Stream stream, ColumnView deviceCvPointer, HostMemoryAllocator hostMemoryAllocator) {
     if (deviceCvPointer == null) {
       return null;
     }
@@ -5056,20 +5056,20 @@ private static HostColumnVectorCore copyToHostNestedHelper(
       currValidity = deviceCvPointer.getValid();
       if (currData != null) {
         hostData = hostMemoryAllocator.allocate(currData.length);
-        hostData.copyFromDeviceBuffer(currData);
+        hostData.copyFromDeviceBufferAsync(currData, stream);
       }
       if (currValidity != null) {
         hostValid = hostMemoryAllocator.allocate(currValidity.length);
-        hostValid.copyFromDeviceBuffer(currValidity);
+        hostValid.copyFromDeviceBufferAsync(currValidity, stream);
       }
       if (currOffsets != null) {
         hostOffsets = hostMemoryAllocator.allocate(currOffsets.length);
-        hostOffsets.copyFromDeviceBuffer(currOffsets);
+        hostOffsets.copyFromDeviceBufferAsync(currOffsets, stream);
       }
       int numChildren = deviceCvPointer.getNumChildren();
       for (int i = 0; i < numChildren; i++) {
         try(ColumnView childDevPtr = deviceCvPointer.getChildColumnView(i)) {
-          children.add(copyToHostNestedHelper(childDevPtr, hostMemoryAllocator));
+          children.add(copyToHostAsyncNestedHelper(stream, childDevPtr, hostMemoryAllocator));
         }
       }
       currNullCount = deviceCvPointer.getNullCount();
@@ -5103,11 +5103,20 @@ private static HostColumnVectorCore copyToHostNestedHelper(
     }
   }
 
+  /** Copy the data to the host synchronously. */
+  public HostColumnVector copyToHost(HostMemoryAllocator hostMemoryAllocator) {
+    HostColumnVector result = copyToHostAsync(Cuda.DEFAULT_STREAM, hostMemoryAllocator);
+    Cuda.DEFAULT_STREAM.sync();
+    return result;
+  }
+
   /**
-   * Copy the data to the host.
+   * Copy the data to the host asynchronously. The caller MUST synchronize on the stream
+   * before examining the result.
    */
-  public HostColumnVector copyToHost(HostMemoryAllocator hostMemoryAllocator) {
-    try (NvtxRange toHost = new NvtxRange("ensureOnHost", NvtxColor.BLUE)) {
+  public HostColumnVector copyToHostAsync(Cuda.Stream stream,
+                                          HostMemoryAllocator hostMemoryAllocator) {
+    try (NvtxRange toHost = new NvtxRange("toHostAsync", NvtxColor.BLUE)) {
       HostMemoryBuffer hostDataBuffer = null;
       HostMemoryBuffer hostValidityBuffer = null;
       HostMemoryBuffer hostOffsetsBuffer = null;
@@ -5127,16 +5136,16 @@ public HostColumnVector copyToHost(HostMemoryAllocator hostMemoryAllocator) {
         if (!type.isNestedType()) {
           if (valid != null) {
             hostValidityBuffer = hostMemoryAllocator.allocate(valid.getLength());
-            hostValidityBuffer.copyFromDeviceBuffer(valid);
+            hostValidityBuffer.copyFromDeviceBufferAsync(valid, stream);
           }
           if (offsets != null) {
             hostOffsetsBuffer = hostMemoryAllocator.allocate(offsets.length);
-            hostOffsetsBuffer.copyFromDeviceBuffer(offsets);
+            hostOffsetsBuffer.copyFromDeviceBufferAsync(offsets, stream);
           }
           // If a strings column is all null values there is no data buffer allocated
           if (data != null) {
             hostDataBuffer = hostMemoryAllocator.allocate(data.length);
-            hostDataBuffer.copyFromDeviceBuffer(data);
+            hostDataBuffer.copyFromDeviceBufferAsync(data, stream);
           }
           HostColumnVector ret = new HostColumnVector(type, rows, Optional.of(nullCount),
               hostDataBuffer, hostValidityBuffer, hostOffsetsBuffer);
@@ -5145,21 +5154,21 @@ public HostColumnVector copyToHost(HostMemoryAllocator hostMemoryAllocator) {
         } else {
           if (data != null) {
             hostDataBuffer = hostMemoryAllocator.allocate(data.length);
-            hostDataBuffer.copyFromDeviceBuffer(data);
+            hostDataBuffer.copyFromDeviceBufferAsync(data, stream);
           }
 
           if (valid != null) {
             hostValidityBuffer = hostMemoryAllocator.allocate(valid.getLength());
-            hostValidityBuffer.copyFromDeviceBuffer(valid);
+            hostValidityBuffer.copyFromDeviceBufferAsync(valid, stream);
           }
           if (offsets != null) {
             hostOffsetsBuffer = hostMemoryAllocator.allocate(offsets.getLength());
-            hostOffsetsBuffer.copyFromDeviceBuffer(offsets);
+            hostOffsetsBuffer.copyFromDeviceBufferAsync(offsets, stream);
           }
           List<HostColumnVectorCore> children = new ArrayList<>();
           for (int i = 0; i < getNumChildren(); i++) {
             try (ColumnView childDevPtr = getChildColumnView(i)) {
-              children.add(copyToHostNestedHelper(childDevPtr, hostMemoryAllocator));
+              children.add(copyToHostAsyncNestedHelper(stream, childDevPtr, hostMemoryAllocator));
             }
           }
           HostColumnVector ret = new HostColumnVector(type, rows, Optional.of(nullCount),
@@ -5192,10 +5201,19 @@ public HostColumnVector copyToHost(HostMemoryAllocator hostMemoryAllocator) {
     }
   }
 
+  /** Copy the data to host memory synchronously */
   public HostColumnVector copyToHost() {
     return copyToHost(DefaultHostMemoryAllocator.get());
   }
 
+  /**
+   * Copy the data to the host asynchronously. The caller MUST synchronize on the stream
+   * before examining the result.
+   */
+  public HostColumnVector copyToHostAsync(Cuda.Stream stream) {
+    return copyToHostAsync(stream, DefaultHostMemoryAllocator.get());
+  }
+
   /**
    * Calculate the total space required to copy the data to the host. This should be padded to
    * the alignment that the CPU requires.
diff --git a/java/src/main/java/ai/rapids/cudf/HostColumnVector.java b/java/src/main/java/ai/rapids/cudf/HostColumnVector.java
index 6b41d10fee3..61b11673957 100644
--- a/java/src/main/java/ai/rapids/cudf/HostColumnVector.java
+++ b/java/src/main/java/ai/rapids/cudf/HostColumnVector.java
@@ -92,6 +92,8 @@ public interface EventHandler {
   public HostColumnVector(DType type, long rows, Optional<Long> nullCount,
                    HostMemoryBuffer hostDataBuffer, HostMemoryBuffer hostValidityBuffer,
                    HostMemoryBuffer offsetBuffer, List<HostColumnVectorCore> nestedHcv) {
+    // NOTE: This constructor MUST NOT examine the contents of any host buffers, as they may be
+    //       asynchronously written by the device.
     super(type, rows, nullCount, hostDataBuffer, hostValidityBuffer, offsetBuffer, nestedHcv);
     refCount = 0;
     incRefCountInternal(true);
@@ -100,6 +102,8 @@ public HostColumnVector(DType type, long rows, Optional<Long> nullCount,
   HostColumnVector(DType type, long rows, Optional<Long> nullCount,
                    HostMemoryBuffer hostDataBuffer, HostMemoryBuffer hostValidityBuffer,
                    HostMemoryBuffer offsetBuffer) {
+    // NOTE: This constructor MUST NOT examine the contents of any host buffers, as they may be
+    //       asynchronously written by the device.
     super(type, rows, nullCount, hostDataBuffer, hostValidityBuffer, offsetBuffer, new ArrayList<>());
     assert !type.equals(DType.LIST) : "This constructor should not be used for list type";
     if (nullCount.isPresent() && nullCount.get() > 0 && hostValidityBuffer == null) {
diff --git a/java/src/main/java/ai/rapids/cudf/HostColumnVectorCore.java b/java/src/main/java/ai/rapids/cudf/HostColumnVectorCore.java
index 95d209c0984..a225fbf34e1 100644
--- a/java/src/main/java/ai/rapids/cudf/HostColumnVectorCore.java
+++ b/java/src/main/java/ai/rapids/cudf/HostColumnVectorCore.java
@@ -1,6 +1,6 @@
 /*
  *
- *  Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ *  Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -47,6 +47,8 @@ public class HostColumnVectorCore implements AutoCloseable {
   public HostColumnVectorCore(DType type, long rows,
                               Optional<Long> nullCount, HostMemoryBuffer data, HostMemoryBuffer validity,
                               HostMemoryBuffer offsets, List<HostColumnVectorCore> nestedChildren) {
+    // NOTE: This constructor MUST NOT examine the contents of any host buffers, as they may be
+    //       asynchronously written by the device.
     this.offHeap = new OffHeapState(data, validity,  offsets);
     MemoryCleaner.register(this, offHeap);
     this.type = type;
diff --git a/java/src/main/java/ai/rapids/cudf/JCudfSerialization.java b/java/src/main/java/ai/rapids/cudf/JCudfSerialization.java
index 666a8864003..89f363d2b29 100644
--- a/java/src/main/java/ai/rapids/cudf/JCudfSerialization.java
+++ b/java/src/main/java/ai/rapids/cudf/JCudfSerialization.java
@@ -1,6 +1,6 @@
 /*
  *
- *  Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ *  Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -907,8 +907,9 @@ private static ColumnBufferProvider[] providersFrom(ColumnVector[] columns) {
     boolean success = false;
     try {
       for (int i = 0; i < columns.length; i++) {
-        onHost[i] = columns[i].copyToHost();
+        onHost[i] = columns[i].copyToHostAsync(Cuda.DEFAULT_STREAM);
       }
+      Cuda.DEFAULT_STREAM.sync();
       ColumnBufferProvider[] ret = providersFrom(onHost, true);
       success = true;
       return ret;

From 1f7aae05a23d6f1d650400f8de7892743113a5e3 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Wed, 31 Jul 2024 09:27:43 -0500
Subject: [PATCH 612/842] Enable prefetching in cudf.pandas.install() (#16439)

This PR enables `cudf.pandas` managed memory prefetching in
`cudf.pandas.install()`, to ensure that prefetching is enabled for all
methods of enabling `cudf.pandas`.

I also fixed a bug in libcudf's prefetching logic, where it tried to
compute the number of characters in a strings column view even if the
string column view's data is `nullptr`. This errors, so we must avoid
the `chars_size()` call and stop the prefetch attempt early.
---
 cpp/src/column/column_view.cpp      |  5 ++++-
 cpp/src/utilities/prefetch.cpp      | 14 ++++++++++++++
 python/cudf/cudf/pandas/__init__.py | 19 +++++++++++++++++--
 python/cudf/cudf/pandas/__main__.py | 12 +-----------
 4 files changed, 36 insertions(+), 14 deletions(-)

diff --git a/cpp/src/column/column_view.cpp b/cpp/src/column/column_view.cpp
index b0f9e9f0e74..386c5ebe478 100644
--- a/cpp/src/column/column_view.cpp
+++ b/cpp/src/column/column_view.cpp
@@ -45,7 +45,10 @@ void prefetch_col_data(ColumnView& col, void const* data_ptr, std::string_view k
         key, data_ptr, col.size() * size_of(col.type()), cudf::get_default_stream());
     } else if (col.type().id() == type_id::STRING) {
       strings_column_view scv{col};
-
+      if (data_ptr == nullptr) {
+        // Do not call chars_size if the data_ptr is nullptr.
+        return;
+      }
       cudf::experimental::prefetch::detail::prefetch_noexcept(
         key,
         data_ptr,
diff --git a/cpp/src/utilities/prefetch.cpp b/cpp/src/utilities/prefetch.cpp
index 86d6cc00764..58971552758 100644
--- a/cpp/src/utilities/prefetch.cpp
+++ b/cpp/src/utilities/prefetch.cpp
@@ -51,6 +51,20 @@ cudaError_t prefetch_noexcept(std::string_view key,
                               rmm::cuda_stream_view stream,
                               rmm::cuda_device_id device_id) noexcept
 {
+  // Don't try to prefetch nullptrs or empty data. Sometimes libcudf has column
+  // views that use nullptrs with a nonzero size as an optimization.
+  if (ptr == nullptr) {
+    if (prefetch_config::instance().debug) {
+      std::cerr << "Skipping prefetch of nullptr" << std::endl;
+    }
+    return cudaSuccess;
+  }
+  if (size == 0) {
+    if (prefetch_config::instance().debug) {
+      std::cerr << "Skipping prefetch of size 0" << std::endl;
+    }
+    return cudaSuccess;
+  }
   if (prefetch_config::instance().get(key)) {
     if (prefetch_config::instance().debug) {
       std::cerr << "Prefetching " << size << " bytes for key " << key << " at location " << ptr
diff --git a/python/cudf/cudf/pandas/__init__.py b/python/cudf/cudf/pandas/__init__.py
index bf88c950385..a6667a7bcd9 100644
--- a/python/cudf/cudf/pandas/__init__.py
+++ b/python/cudf/cudf/pandas/__init__.py
@@ -7,6 +7,8 @@
 
 import rmm.mr
 
+from cudf._lib import pylibcudf
+
 from .fast_slow_proxy import is_proxy_object
 from .magics import load_ipython_extension
 from .profiler import Profiler
@@ -16,6 +18,19 @@
 
 LOADED = False
 
+_SUPPORTED_PREFETCHES = {
+    "column_view::get_data",
+    "mutable_column_view::get_data",
+    "gather",
+    "hash_join",
+}
+
+
+def _enable_managed_prefetching(rmm_mode):
+    if "managed" in rmm_mode:
+        for key in _SUPPORTED_PREFETCHES:
+            pylibcudf.experimental.enable_prefetching(key)
+
 
 def install():
     """Enable Pandas Accelerator Mode."""
@@ -33,7 +48,7 @@ def install():
             f"cudf.pandas detected an already configured memory resource, ignoring 'CUDF_PANDAS_RMM_MODE'={str(rmm_mode)}",
             UserWarning,
         )
-        return rmm_mode
+        return
 
     free_memory, _ = rmm.mr.available_device_memory()
     free_memory = int(round(float(free_memory) * 0.80 / 256) * 256)
@@ -57,7 +72,7 @@ def install():
     elif rmm_mode != "cuda":
         raise ValueError(f"Unsupported {rmm_mode=}")
     rmm.mr.set_current_device_resource(new_mr)
-    return rmm_mode
+    _enable_managed_prefetching(rmm_mode)
 
 
 def pytest_load_initial_conftests(early_config, parser, args):
diff --git a/python/cudf/cudf/pandas/__main__.py b/python/cudf/cudf/pandas/__main__.py
index 591744ce793..3a82829eb7a 100644
--- a/python/cudf/cudf/pandas/__main__.py
+++ b/python/cudf/cudf/pandas/__main__.py
@@ -72,17 +72,7 @@ def main():
 
     args = parser.parse_args()
 
-    rmm_mode = install()
-    if "managed" in rmm_mode:
-        for key in {
-            "column_view::get_data",
-            "mutable_column_view::get_data",
-            "gather",
-            "hash_join",
-        }:
-            from cudf._lib import pylibcudf
-
-            pylibcudf.experimental.enable_prefetching(key)
+    install()
     with profile(args.profile, args.line_profile, args.args[0]) as fn:
         args.args[0] = fn
         if args.module:

From 79a1eed785fccbca2c20ff5cc844ec1a9e741ee5 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Wed, 31 Jul 2024 11:00:30 -0400
Subject: [PATCH 613/842] Remove checking for specific tests in memcheck script
 (#16412)

Removes the checking for specific gtests in the `run_cudf_memcheck_ctests.sh` script. Each of those tests can check the `LIBCUDF_MEMCHECK_ENABLED` environment variable themselves.
This simplifies the script logic and may help with replacing this with ctest logic in the future.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Mark Harris (https://github.com/harrism)

URL: https://github.com/rapidsai/cudf/pull/16412
---
 ci/run_cudf_memcheck_ctests.sh                                | 3 ---
 cpp/tests/error/error_handling_test.cu                        | 4 ++++
 .../test_default_stream_identification.cu                     | 1 +
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/ci/run_cudf_memcheck_ctests.sh b/ci/run_cudf_memcheck_ctests.sh
index aacd93e3b96..653829db419 100755
--- a/ci/run_cudf_memcheck_ctests.sh
+++ b/ci/run_cudf_memcheck_ctests.sh
@@ -15,9 +15,6 @@ export LIBCUDF_MEMCHECK_ENABLED=1
 for gt in ./*_TEST ; do
   test_name=$(basename ${gt})
   # Run gtests with compute-sanitizer
-  if [[ "$test_name" == "ERROR_TEST" ]] || [[ "$test_name" == "STREAM_IDENTIFICATION_TEST" ]]; then
-    continue
-  fi
   echo "Running compute-sanitizer on $test_name"
   compute-sanitizer --tool memcheck ${gt} "$@"
 done
diff --git a/cpp/tests/error/error_handling_test.cu b/cpp/tests/error/error_handling_test.cu
index 46d01ec14ff..1dfe45556c4 100644
--- a/cpp/tests/error/error_handling_test.cu
+++ b/cpp/tests/error/error_handling_test.cu
@@ -50,6 +50,8 @@ CUDF_KERNEL void test_kernel(int* data) { data[threadIdx.x] = threadIdx.x; }
 // calls.
 TEST(StreamCheck, FailedKernel)
 {
+  if (getenv("LIBCUDF_MEMCHECK_ENABLED")) { GTEST_SKIP(); }
+
   rmm::cuda_stream stream;
   int a;
   test_kernel<<<0, 0, 0, stream.value()>>>(&a);
@@ -61,6 +63,8 @@ TEST(StreamCheck, FailedKernel)
 
 TEST(StreamCheck, CatchFailedKernel)
 {
+  if (getenv("LIBCUDF_MEMCHECK_ENABLED")) { GTEST_SKIP(); }
+
   rmm::cuda_stream stream;
   int a;
   test_kernel<<<0, 0, 0, stream.value()>>>(&a);
diff --git a/cpp/tests/identify_stream_usage/test_default_stream_identification.cu b/cpp/tests/identify_stream_usage/test_default_stream_identification.cu
index 268c7b37c81..c5fb75a7a8e 100644
--- a/cpp/tests/identify_stream_usage/test_default_stream_identification.cu
+++ b/cpp/tests/identify_stream_usage/test_default_stream_identification.cu
@@ -33,6 +33,7 @@ void test_cudaLaunchKernel()
   } catch (std::runtime_error&) {
     return;
   }
+  if (getenv("LIBCUDF_MEMCHECK_ENABLED")) { return; }
   throw std::runtime_error("No exception raised for kernel on default stream!");
 }
 

From 9336c172b1f61408e2392cbbd953e7f7e6e9ae3d Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Wed, 31 Jul 2024 16:27:26 +0100
Subject: [PATCH 614/842] Add upper bound pin for polars (#16442)

This aligns the polars dependency with the most modern version supported by cudf-polars in this branch.

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/cudf/pull/16442
---
 dependencies.yaml                 | 2 +-
 python/cudf_polars/pyproject.toml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/dependencies.yaml b/dependencies.yaml
index 0fa32404156..aeb030313ed 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -630,7 +630,7 @@ dependencies:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
-          - polars>=1.0
+          - polars>=1.0,<1.3
   run_dask_cudf:
     common:
       - output_types: [conda, requirements, pyproject]
diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml
index f8a1973bdbf..424c83a5199 100644
--- a/python/cudf_polars/pyproject.toml
+++ b/python/cudf_polars/pyproject.toml
@@ -20,7 +20,7 @@ license = { text = "Apache 2.0" }
 requires-python = ">=3.9"
 dependencies = [
     "cudf==24.10.*,>=0.0.0a0",
-    "polars>=1.0",
+    "polars>=1.0,<1.3",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
     "Intended Audience :: Developers",

From 0f3b3808348debca8458bf73575745770b494ddc Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 31 Jul 2024 07:38:56 -1000
Subject: [PATCH 615/842] Add environment variable to log cudf.pandas fallback
 calls (#16161)

Introduces a new environment variable `LOG_FAST_FALLBACK` which will create a structured log of the call that failed.

An example of the log is

```
INFO:root:{"debug_type": "LOG_FAST_FALLBACK", "failed_call": "pandas._libs.interval.Interval(0,1)", "exception": "Exception", "exception_message": "Cannot transform _Unusable", "pandas_object": "pandas._libs.interval.Interval", "passed_args": "0,1,", "passed_kwargs": {}}
```

I could turn this into a warning instead, but I imagine we would want to first utilize this to parse the failures and see generalized failures in aggregate

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/16161
---
 python/cudf/cudf/pandas/_logger.py         | 80 ++++++++++++++++++++++
 python/cudf/cudf/pandas/fast_slow_proxy.py |  6 +-
 2 files changed, 85 insertions(+), 1 deletion(-)
 create mode 100644 python/cudf/cudf/pandas/_logger.py

diff --git a/python/cudf/cudf/pandas/_logger.py b/python/cudf/cudf/pandas/_logger.py
new file mode 100644
index 00000000000..68923c3e35c
--- /dev/null
+++ b/python/cudf/cudf/pandas/_logger.py
@@ -0,0 +1,80 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+import json
+import logging
+
+logging.basicConfig(
+    filename="cudf_pandas_unit_tests_debug.log", level=logging.INFO
+)
+logger = logging.getLogger()
+
+
+class StructuredMessage:
+    # https://docs.python.org/3/howto/logging-cookbook.html#implementing-structured-logging
+    def __init__(self, debug_type: str, /, **kwargs) -> None:
+        self.debug_type = debug_type
+        self.kwargs = kwargs
+
+    def __str__(self) -> str:
+        log = {"debug_type": self.debug_type}
+        return json.dumps({**log, **self.kwargs})
+
+
+def reprify(arg) -> str:
+    """Attempt to return arg's repr for logging."""
+    try:
+        return repr(arg)
+    except Exception:
+        return "<REPR FAILED>"
+
+
+def log_fallback(
+    slow_args: tuple, slow_kwargs: dict, exception: Exception
+) -> None:
+    """Log when a fast call falls back to the slow path."""
+    caller = slow_args[0]
+    module = getattr(caller, "__module__", "")
+    obj_name = getattr(caller, "__qualname__", type(caller).__qualname__)
+    if module:
+        slow_object = f"{module}.{obj_name}"
+    else:
+        slow_object = obj_name
+    # TODO: Maybe use inspect.signature to map called args and kwargs
+    # to their keyword names, but a user calling an API incorrectly would
+    # break this.
+    caller_args = slow_args[1]
+    args_passed = ", ".join((reprify(arg) for arg in caller_args))
+    args_types_passed = ", ".join((type(arg).__name__ for arg in caller_args))
+    kwargs_passed = {}
+    kwargs_types_passed = ""
+    if len(slow_args) == 3:
+        caller_kwargs = slow_args[2]
+        if caller_kwargs:
+            fmt_kwargs = ", ".join(
+                f"{kwarg}={reprify(value)}"
+                for kwarg, value in caller_kwargs.items()
+            )
+            kwargs_types_passed = ", ".join(
+                f"{kwarg}={type(value).__name__}"
+                for kwarg, value in caller_kwargs.items()
+            )
+            args_passed = f"{args_passed}, {fmt_kwargs}"
+            kwargs_passed = {
+                kwarg: reprify(value) for kwarg, value in caller_kwargs.items()
+            }
+    message = StructuredMessage(
+        "LOG_FAST_FALLBACK",
+        failed_call=f"{slow_object}({args_passed})",
+        exception=type(exception).__name__,
+        exception_message=str(exception),
+        slow_object=slow_object,
+        args_passed=args_passed,
+        kwargs_passed=kwargs_passed,
+        args_types_passed=args_types_passed,
+        kwargs_types_passed=kwargs_types_passed,
+    )
+    logger.info(message)
diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py
index dfb729cae6b..bb678fd1efe 100644
--- a/python/cudf/cudf/pandas/fast_slow_proxy.py
+++ b/python/cudf/cudf/pandas/fast_slow_proxy.py
@@ -930,13 +930,17 @@ def _fast_slow_function_call(
                             "Pandas debugging mode failed. "
                             f"The exception was {e}."
                         )
-    except Exception:
+    except Exception as err:
         with nvtx.annotate(
             "EXECUTE_SLOW",
             color=_CUDF_PANDAS_NVTX_COLORS["EXECUTE_SLOW"],
             domain="cudf_pandas",
         ):
             slow_args, slow_kwargs = _slow_arg(args), _slow_arg(kwargs)
+            if _env_get_bool("LOG_FAST_FALLBACK", False):
+                from ._logger import log_fallback
+
+                log_fallback(slow_args, slow_kwargs, err)
             with disable_module_accelerator():
                 result = func(*slow_args, **slow_kwargs)
     return _maybe_wrap_result(result, func, *args, **kwargs), fast

From 9f5e4a353508c1638e1d2d46f7bceab240294797 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Wed, 31 Jul 2024 12:39:29 -0500
Subject: [PATCH 616/842] Add `flatbuffers` to `libcudf` build (#16446)

## Description
Without `flatbuffers` being added to the conda environment `libcudf` is
being built in is causing the following build failures:
```
In file included from /nvme/0/pgali/cudf/cpp/src/io/parquet/arrow_schema_writer.cpp:26:
/nvme/0/pgali/cudf/cpp/src/io/parquet/ipc/Message_generated.h:6:10: fatal error: flatbuffers/flatbuffers.h: No such file or directory
    6 | #include <flatbuffers/flatbuffers.h>
      |          ^~~~~~~~~~~~~~~~~~~~~~~~~~~
compilation terminated.
```

## Checklist
- [x] I am familiar with the [Contributing
Guidelines](https://github.com/rapidsai/cudf/blob/HEAD/CONTRIBUTING.md).
- [ ] New or existing tests cover these changes.
- [x] The documentation is up to date with these changes.
---
 conda/environments/all_cuda-118_arch-x86_64.yaml | 1 +
 conda/environments/all_cuda-125_arch-x86_64.yaml | 1 +
 conda/recipes/libcudf/conda_build_config.yaml    | 3 +++
 conda/recipes/libcudf/meta.yaml                  | 1 +
 dependencies.yaml                                | 1 +
 5 files changed, 7 insertions(+)

diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index b8d73a01f96..4b2e25140d7 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -30,6 +30,7 @@ dependencies:
 - dlpack>=0.8,<1.0
 - doxygen=1.9.1
 - fastavro>=0.22.9
+- flatbuffers==24.3.25
 - fmt>=10.1.1,<11
 - fsspec>=0.6.0
 - gcc_linux-64=11.*
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index 3f5fae49cbb..c2ae05d0072 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -31,6 +31,7 @@ dependencies:
 - dlpack>=0.8,<1.0
 - doxygen=1.9.1
 - fastavro>=0.22.9
+- flatbuffers==24.3.25
 - fmt>=10.1.1,<11
 - fsspec>=0.6.0
 - gcc_linux-64=11.*
diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml
index 4f99411e978..ff7458caf82 100644
--- a/conda/recipes/libcudf/conda_build_config.yaml
+++ b/conda/recipes/libcudf/conda_build_config.yaml
@@ -31,6 +31,9 @@ librdkafka_version:
 fmt_version:
   - ">=10.1.1,<11"
 
+flatbuffers_version:
+  - "=24.3.25"
+
 spdlog_version:
   - ">=1.12.0,<1.13"
 
diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml
index 76115362b6c..aa1c94a4bca 100644
--- a/conda/recipes/libcudf/meta.yaml
+++ b/conda/recipes/libcudf/meta.yaml
@@ -68,6 +68,7 @@ requirements:
     - dlpack {{ dlpack_version }}
     - librdkafka {{ librdkafka_version }}
     - fmt {{ fmt_version }}
+    - flatbuffers {{ flatbuffers_version }}
     - spdlog {{ spdlog_version }}
     - zlib {{ zlib_version }}
 
diff --git a/dependencies.yaml b/dependencies.yaml
index 48433d8e5c1..7ecce362101 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -287,6 +287,7 @@ dependencies:
       - output_types: conda
         packages:
           - fmt>=10.1.1,<11
+          - flatbuffers==24.3.25
           - librmm==24.8.*,>=0.0.0a0
           - libkvikio==24.8.*,>=0.0.0a0
           - librdkafka>=1.9.0,<1.10.0a0

From ed5e4aa3923279965f733a263d92f4dabf9b434d Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Wed, 31 Jul 2024 13:44:26 -0400
Subject: [PATCH 617/842] Fix parquet_field_list read_func lambda capture
 invalid this pointer (#16440)

## Description
Fixes internal parquet_field_list subclass constructors capturing
invalid this pointer when passing objects to std::make_tuple. The
std::make_tuple usage creates a parameter object that is constructed,
moved, and destroyed. The this pointer is captured during constructor
call. The move constructor is called which creates its own separate this
pointer (all member data is moved/copied appropriately). The original
this pointer is invalidated by the following destructor. The lambda that
was captured in the constructor no longer contains a valid this value in
the final moved object.

This PR removes the dependency on the this pointer in the lambda and
captures the vector reference instead which is preserved correctly in
the object move. The ctor, move, dtor pattern occurs because of how
std::make_tuple is implemented by the standard library.

Closes https://github.com/rapidsai/cudf/issues/16408

## Checklist
- [x] I am familiar with the [Contributing
Guidelines](https://github.com/rapidsai/cudf/blob/HEAD/CONTRIBUTING.md).
- [x] New or existing tests cover these changes.
- [x] The documentation is up to date with these changes.
---
 .../io/parquet/compact_protocol_reader.cpp    | 26 +++++++++----------
 1 file changed, 12 insertions(+), 14 deletions(-)

diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp
index 192833507b0..e13ed5e85e5 100644
--- a/cpp/src/io/parquet/compact_protocol_reader.cpp
+++ b/cpp/src/io/parquet/compact_protocol_reader.cpp
@@ -137,10 +137,10 @@ class parquet_field_bool : public parquet_field {
 struct parquet_field_bool_list : public parquet_field_list<bool, FieldType::BOOLEAN_TRUE> {
   parquet_field_bool_list(int f, std::vector<bool>& v) : parquet_field_list(f, v)
   {
-    auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) {
+    auto const read_value = [&val = v](uint32_t i, CompactProtocolReader* cpr) {
       auto const current_byte = cpr->getb();
       assert_bool_field_type(current_byte);
-      this->val[i] = current_byte == static_cast<int>(FieldType::BOOLEAN_TRUE);
+      val[i] = current_byte == static_cast<int>(FieldType::BOOLEAN_TRUE);
     };
     bind_read_func(read_value);
   }
@@ -188,8 +188,8 @@ template <typename T, FieldType EXPECTED_TYPE>
 struct parquet_field_int_list : public parquet_field_list<T, EXPECTED_TYPE> {
   parquet_field_int_list(int f, std::vector<T>& v) : parquet_field_list<T, EXPECTED_TYPE>(f, v)
   {
-    auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) {
-      this->val[i] = cpr->get_zigzag<T>();
+    auto const read_value = [&val = v](uint32_t i, CompactProtocolReader* cpr) {
+      val[i] = cpr->get_zigzag<T>();
     };
     this->bind_read_func(read_value);
   }
@@ -229,11 +229,11 @@ class parquet_field_string : public parquet_field {
 struct parquet_field_string_list : public parquet_field_list<std::string, FieldType::BINARY> {
   parquet_field_string_list(int f, std::vector<std::string>& v) : parquet_field_list(f, v)
   {
-    auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) {
+    auto const read_value = [&val = v](uint32_t i, CompactProtocolReader* cpr) {
       auto const l = cpr->get_u32();
       CUDF_EXPECTS(l < static_cast<size_t>(cpr->m_end - cpr->m_cur), "string length mismatch");
 
-      this->val[i].assign(reinterpret_cast<char const*>(cpr->m_cur), l);
+      val[i].assign(reinterpret_cast<char const*>(cpr->m_cur), l);
       cpr->m_cur += l;
     };
     bind_read_func(read_value);
@@ -269,8 +269,8 @@ struct parquet_field_enum_list : public parquet_field_list<Enum, FieldType::I32>
   parquet_field_enum_list(int f, std::vector<Enum>& v)
     : parquet_field_list<Enum, FieldType::I32>(f, v)
   {
-    auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) {
-      this->val[i] = static_cast<Enum>(cpr->get_i32());
+    auto const read_value = [&val = v](uint32_t i, CompactProtocolReader* cpr) {
+      val[i] = static_cast<Enum>(cpr->get_i32());
     };
     this->bind_read_func(read_value);
   }
@@ -354,8 +354,8 @@ struct parquet_field_struct_list : public parquet_field_list<T, FieldType::STRUC
   parquet_field_struct_list(int f, std::vector<T>& v)
     : parquet_field_list<T, FieldType::STRUCT>(f, v)
   {
-    auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) {
-      cpr->read(&this->val[i]);
+    auto const read_value = [&val = v](uint32_t i, CompactProtocolReader* cpr) {
+      cpr->read(&val[i]);
     };
     this->bind_read_func(read_value);
   }
@@ -395,7 +395,7 @@ struct parquet_field_binary_list
   : public parquet_field_list<std::vector<uint8_t>, FieldType::BINARY> {
   parquet_field_binary_list(int f, std::vector<std::vector<uint8_t>>& v) : parquet_field_list(f, v)
   {
-    auto const read_value = [this](uint32_t i, CompactProtocolReader* cpr) {
+    auto const read_value = [&val = v](uint32_t i, CompactProtocolReader* cpr) {
       auto const l = cpr->get_u32();
       CUDF_EXPECTS(l <= static_cast<size_t>(cpr->m_end - cpr->m_cur), "binary length mismatch");
 
@@ -482,9 +482,7 @@ void CompactProtocolReader::skip_struct_field(int t, int depth)
         skip_struct_field(t, depth + 1);
       }
       break;
-    default:
-      // printf("unsupported skip for type %d\n", t);
-      break;
+    default: break;
   }
 }
 

From 5bcd8e062369a7d15222fa6d0bcc0b310553edbf Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 31 Jul 2024 10:34:37 -1000
Subject: [PATCH 618/842] Align DatetimeIndex APIs with pandas 2.x (#16367)

Mostly transferring methods that were defined on `Series.dt` methods to `DatetimeColumn` so it could be reused in `DatetimeIndex`

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16367
---
 docs/cudf/source/conf.py                 |   2 +
 python/cudf/cudf/core/column/datetime.py |  56 ++++++
 python/cudf/cudf/core/index.py           | 211 ++++++++++++++++++++++-
 python/cudf/cudf/core/series.py          |  43 ++---
 python/cudf/cudf/tests/test_datetime.py  | 107 ++++++++++++
 5 files changed, 385 insertions(+), 34 deletions(-)

diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py
index 7421d9be298..7ebafc0da95 100644
--- a/docs/cudf/source/conf.py
+++ b/docs/cudf/source/conf.py
@@ -556,6 +556,8 @@ def on_missing_reference(app, env, node, contnode):
     ("py:class", "Dtype"),
     # The following are erroneously warned due to
     # https://github.com/sphinx-doc/sphinx/issues/11225
+    ("py:obj", "cudf.DatetimeIndex.time"),
+    ("py:obj", "cudf.DatetimeIndex.date"),
     ("py:obj", "cudf.Index.values_host"),
     ("py:class", "pa.Array"),
     ("py:class", "ScalarLike"),
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index 73902789c11..81fbb914842 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -286,6 +286,62 @@ def dayofyear(self) -> ColumnBase:
     def day_of_year(self) -> ColumnBase:
         return self.get_dt_field("day_of_year")
 
+    @property
+    def is_month_start(self) -> ColumnBase:
+        return (self.day == 1).fillna(False)
+
+    @property
+    def is_month_end(self) -> ColumnBase:
+        last_day_col = libcudf.datetime.last_day_of_month(self)
+        return (self.day == last_day_col.day).fillna(False)
+
+    @property
+    def is_quarter_end(self) -> ColumnBase:
+        last_month = self.month.isin([3, 6, 9, 12])
+        return (self.is_month_end & last_month).fillna(False)
+
+    @property
+    def is_quarter_start(self) -> ColumnBase:
+        first_month = self.month.isin([1, 4, 7, 10])
+        return (self.is_month_start & first_month).fillna(False)
+
+    @property
+    def is_year_end(self) -> ColumnBase:
+        day_of_year = self.day_of_year
+        leap_dates = libcudf.datetime.is_leap_year(self)
+
+        leap = day_of_year == cudf.Scalar(366)
+        non_leap = day_of_year == cudf.Scalar(365)
+        return libcudf.copying.copy_if_else(leap, non_leap, leap_dates).fillna(
+            False
+        )
+
+    @property
+    def is_year_start(self) -> ColumnBase:
+        return (self.day_of_year == 1).fillna(False)
+
+    @property
+    def days_in_month(self) -> ColumnBase:
+        return libcudf.datetime.days_in_month(self)
+
+    @property
+    def day_of_week(self) -> ColumnBase:
+        raise NotImplementedError("day_of_week is currently not implemented.")
+
+    @property
+    def is_normalized(self) -> bool:
+        raise NotImplementedError(
+            "is_normalized is currently not implemented."
+        )
+
+    def to_julian_date(self) -> ColumnBase:
+        raise NotImplementedError(
+            "to_julian_date is currently not implemented."
+        )
+
+    def normalize(self) -> ColumnBase:
+        raise NotImplementedError("normalize is currently not implemented.")
+
     @property
     def values(self):
         """
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 8c3b091abec..40a5d9ff259 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -61,6 +61,7 @@
 
 if TYPE_CHECKING:
     from collections.abc import Generator, Iterable
+    from datetime import tzinfo
 
 
 def ensure_index(index_like: Any) -> BaseIndex:
@@ -1680,7 +1681,7 @@ class DatetimeIndex(Index):
     copy : bool
         Make a copy of input.
     freq : str, optional
-        This is not yet supported
+        Frequency of the DatetimeIndex
     tz : pytz.timezone or dateutil.tz.tzfile
         This is not yet supported
     ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise'
@@ -1847,6 +1848,210 @@ def searchsorted(
             value, side=side, ascending=ascending, na_position=na_position
         )
 
+    def as_unit(self, unit: str, round_ok: bool = True) -> Self:
+        """
+        Convert to a dtype with the given unit resolution.
+
+        Currently not implemented.
+
+        Parameters
+        ----------
+        unit : {'s', 'ms', 'us', 'ns'}
+        round_ok : bool, default True
+            If False and the conversion requires rounding, raise ValueError.
+        """
+        raise NotImplementedError("as_unit is currently not implemented")
+
+    def mean(self, *, skipna: bool = True, axis: int | None = 0):
+        return self._column.mean(skipna=skipna)
+
+    def std(self, *, skipna: bool = True, axis: int | None = 0, ddof: int = 1):
+        return self._column.std(skipna=skipna, ddof=ddof)
+
+    def strftime(self, date_format: str) -> Index:
+        """
+        Convert to Index using specified date_format.
+
+        Return an Index of formatted strings specified by date_format, which
+        supports the same string format as the python standard library.
+
+        Parameters
+        ----------
+        date_format : str
+            Date format string (e.g. "%Y-%m-%d").
+        """
+        return Index._from_data(
+            {self.name: self._column.strftime(date_format)}
+        )
+
+    @property
+    def asi8(self) -> cupy.ndarray:
+        return self._column.astype("int64").values
+
+    @property
+    def inferred_freq(self) -> cudf.DateOffset | None:
+        raise NotImplementedError("inferred_freq is currently not implemented")
+
+    @property
+    def freq(self) -> cudf.DateOffset | None:
+        return self._freq
+
+    @freq.setter
+    def freq(self) -> None:
+        raise NotImplementedError("Setting freq is currently not supported.")
+
+    @property
+    def freqstr(self) -> str:
+        raise NotImplementedError("freqstr is currently not implemented")
+
+    @property
+    def resolution(self) -> str:
+        """
+        Returns day, hour, minute, second, millisecond or microsecond
+        """
+        raise NotImplementedError("resolution is currently not implemented")
+
+    @property
+    def unit(self) -> str:
+        return self._column.time_unit
+
+    @property
+    def tz(self) -> tzinfo | None:
+        """
+        Return the timezone.
+
+        Returns
+        -------
+        datetime.tzinfo or None
+            Returns None when the array is tz-naive.
+        """
+        return getattr(self.dtype, "tz", None)
+
+    @property
+    def tzinfo(self) -> tzinfo | None:
+        """
+        Alias for tz attribute
+        """
+        return self.tz
+
+    def to_pydatetime(self) -> np.ndarray:
+        """
+        Return an ndarray of ``datetime.datetime`` objects.
+
+        Returns
+        -------
+        numpy.ndarray
+            An ndarray of ``datetime.datetime`` objects.
+        """
+        return self.to_pandas().to_pydatetime()
+
+    def to_julian_date(self) -> Index:
+        return Index._from_data({self.name: self._column.to_julian_date()})
+
+    def to_period(self, freq) -> pd.PeriodIndex:
+        return self.to_pandas().to_period(freq=freq)
+
+    def normalize(self) -> Self:
+        """
+        Convert times to midnight.
+
+        Currently not implemented.
+        """
+        return type(self)._from_data({self.name: self._column.normalize()})
+
+    @property
+    def time(self) -> np.ndarray:
+        """
+        Returns numpy array of ``datetime.time`` objects.
+
+        The time part of the Timestamps.
+        """
+        return self.to_pandas().time
+
+    @property
+    def timetz(self) -> np.ndarray:
+        """
+        Returns numpy array of ``datetime.time`` objects with timezones.
+
+        The time part of the Timestamps.
+        """
+        return self.to_pandas().timetz
+
+    @property
+    def date(self) -> np.ndarray:
+        """
+        Returns numpy array of python ``datetime.date`` objects.
+
+        Namely, the date part of Timestamps without time and
+        timezone information.
+        """
+        return self.to_pandas().date
+
+    @property
+    def is_month_start(self) -> cupy.ndarray:
+        """
+        Booleans indicating if dates are the first day of the month.
+        """
+        return self._column.is_month_start.values
+
+    @property
+    def is_month_end(self) -> cupy.ndarray:
+        """
+        Booleans indicating if dates are the last day of the month.
+        """
+        return self._column.is_month_end.values
+
+    @property
+    def is_quarter_end(self) -> cupy.ndarray:
+        """
+        Booleans indicating if dates are the last day of the quarter.
+        """
+        return self._column.is_quarter_end.values
+
+    @property
+    def is_quarter_start(self) -> cupy.ndarray:
+        """
+        Booleans indicating if dates are the start day of the quarter.
+        """
+        return self._column.is_quarter_start.values
+
+    @property
+    def is_year_end(self) -> cupy.ndarray:
+        """
+        Booleans indicating if dates are the last day of the year.
+        """
+        return self._column.is_year_end.values
+
+    @property
+    def is_year_start(self) -> cupy.ndarray:
+        """
+        Booleans indicating if dates are the first day of the year.
+        """
+        return self._column.is_year_start.values
+
+    @property
+    def is_normalized(self) -> bool:
+        """
+        Returns True if all of the dates are at midnight ("no time")
+        """
+        return self._column.is_normalized
+
+    @property
+    def days_in_month(self) -> Index:
+        """
+        Get the total number of days in the month that the date falls on.
+        """
+        return Index._from_data({self.name: self._column.days_in_month})
+
+    daysinmonth = days_in_month
+
+    @property
+    def day_of_week(self) -> Index:
+        """
+        Get the day of week that the date falls on.
+        """
+        return Index._from_data({self.name: self._column.day_of_week})
+
     @property  # type: ignore
     @_performance_tracking
     def year(self):
@@ -3391,9 +3596,11 @@ def _get_nearest_indexer(
     return indexer
 
 
-def _validate_freq(freq: Any) -> cudf.DateOffset:
+def _validate_freq(freq: Any) -> cudf.DateOffset | None:
     if isinstance(freq, str):
         return cudf.DateOffset._from_freqstr(freq)
+    elif freq is None:
+        return freq
     elif freq is not None and not isinstance(freq, cudf.DateOffset):
         raise ValueError(f"Invalid frequency: {freq}")
     return cast(cudf.DateOffset, freq)
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 10ac1fdfc1e..929af5cd981 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -4415,7 +4415,9 @@ def is_month_start(self) -> Series:
         """
         Booleans indicating if dates are the first day of the month.
         """
-        return (self.day == 1).fillna(False)
+        return self._return_result_like_self(
+            self.series._column.is_month_start
+        )
 
     @property  # type: ignore
     @_performance_tracking
@@ -4462,9 +4464,7 @@ def days_in_month(self) -> Series:
         11    31
         dtype: int16
         """
-        return self._return_result_like_self(
-            libcudf.datetime.days_in_month(self.series._column)
-        )
+        return self._return_result_like_self(self.series._column.days_in_month)
 
     @property  # type: ignore
     @_performance_tracking
@@ -4505,9 +4505,7 @@ def is_month_end(self) -> Series:
         8    False
         dtype: bool
         """  # noqa: E501
-        last_day_col = libcudf.datetime.last_day_of_month(self.series._column)
-        last_day = self._return_result_like_self(last_day_col)
-        return (self.day == last_day.dt.day).fillna(False)
+        return self._return_result_like_self(self.series._column.is_month_end)
 
     @property  # type: ignore
     @_performance_tracking
@@ -4546,14 +4544,10 @@ def is_quarter_start(self) -> Series:
         7    False
         dtype: bool
         """
-        day = self.series._column.get_dt_field("day")
-        first_month = self.series._column.get_dt_field("month").isin(
-            [1, 4, 7, 10]
+        return self._return_result_like_self(
+            self.series._column.is_quarter_start
         )
 
-        result = ((day == cudf.Scalar(1)) & first_month).fillna(False)
-        return self._return_result_like_self(result)
-
     @property  # type: ignore
     @_performance_tracking
     def is_quarter_end(self) -> Series:
@@ -4591,16 +4585,10 @@ def is_quarter_end(self) -> Series:
         7    False
         dtype: bool
         """
-        day = self.series._column.get_dt_field("day")
-        last_day = libcudf.datetime.last_day_of_month(self.series._column)
-        last_day = last_day.get_dt_field("day")
-        last_month = self.series._column.get_dt_field("month").isin(
-            [3, 6, 9, 12]
+        return self._return_result_like_self(
+            self.series._column.is_quarter_end
         )
 
-        result = ((day == last_day) & last_month).fillna(False)
-        return self._return_result_like_self(result)
-
     @property  # type: ignore
     @_performance_tracking
     def is_year_start(self) -> Series:
@@ -4627,10 +4615,7 @@ def is_year_start(self) -> Series:
         2    True
         dtype: bool
         """
-        outcol = self.series._column.get_dt_field(
-            "day_of_year"
-        ) == cudf.Scalar(1)
-        return self._return_result_like_self(outcol.fillna(False))
+        return self._return_result_like_self(self.series._column.is_year_start)
 
     @property  # type: ignore
     @_performance_tracking
@@ -4658,13 +4643,7 @@ def is_year_end(self) -> Series:
         2    False
         dtype: bool
         """
-        day_of_year = self.series._column.get_dt_field("day_of_year")
-        leap_dates = libcudf.datetime.is_leap_year(self.series._column)
-
-        leap = day_of_year == cudf.Scalar(366)
-        non_leap = day_of_year == cudf.Scalar(365)
-        result = cudf._lib.copying.copy_if_else(leap, non_leap, leap_dates)
-        return self._return_result_like_self(result.fillna(False))
+        return self._return_result_like_self(self.series._column.is_year_end)
 
     @_performance_tracking
     def _get_dt_field(self, field: str) -> Series:
diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py
index 7ab9ff2ef23..6bc775d2a2c 100644
--- a/python/cudf/cudf/tests/test_datetime.py
+++ b/python/cudf/cudf/tests/test_datetime.py
@@ -7,6 +7,7 @@
 import cupy as cp
 import numpy as np
 import pandas as pd
+import pandas._testing as tm
 import pyarrow as pa
 import pytest
 
@@ -2429,3 +2430,109 @@ def test_day_month_name_locale_not_implemented(meth, klass):
         obj = obj.dt
     with pytest.raises(NotImplementedError):
         getattr(obj, meth)(locale="pt_BR.utf8")
+
+
+@pytest.mark.parametrize(
+    "attr",
+    [
+        "is_month_start",
+        "is_month_end",
+        "is_quarter_end",
+        "is_quarter_start",
+        "is_year_end",
+        "is_year_start",
+        "days_in_month",
+        "timetz",
+        "time",
+        "date",
+    ],
+)
+def test_dti_datetime_attributes(attr):
+    data = [
+        "2020-01-01",
+        "2020-01-31",
+        "2020-03-01",
+        "2020-03-31",
+        "2020-03-31",
+        "2020-12-31",
+        None,
+    ]
+    pd_dti = pd.DatetimeIndex(data, name="foo")
+    cudf_dti = cudf.from_pandas(pd_dti)
+
+    result = getattr(cudf_dti, attr)
+    expected = getattr(pd_dti, attr)
+    if isinstance(result, np.ndarray):
+        # numpy doesn't assert object arrays with NaT correctly
+        tm.assert_numpy_array_equal(result, expected)
+    else:
+        assert_eq(result, expected)
+
+
+@pytest.mark.parametrize("attr", ["freq", "unit"])
+def test_dti_properties(attr):
+    pd_dti = pd.DatetimeIndex(
+        ["2020-01-01", "2020-01-02"], dtype="datetime64[ns]"
+    )
+    cudf_dti = cudf.DatetimeIndex(
+        ["2020-01-01", "2020-01-02"], dtype="datetime64[ns]"
+    )
+
+    result = getattr(cudf_dti, attr)
+    expected = getattr(pd_dti, attr)
+    assert result == expected
+
+
+def test_dti_asi8():
+    pd_dti = pd.DatetimeIndex(["2020-01-01", "2020-12-31"], name="foo")
+    cudf_dti = cudf.from_pandas(pd_dti)
+
+    result = pd_dti.asi8
+    expected = cudf_dti.asi8
+    assert_eq(result, expected)
+
+
+@pytest.mark.parametrize(
+    "method, kwargs",
+    [
+        ["mean", {}],
+        pytest.param(
+            "std",
+            {},
+            marks=pytest.mark.xfail(
+                reason="https://github.com/rapidsai/cudf/issues/16444"
+            ),
+        ),
+        pytest.param(
+            "std",
+            {"ddof": 0},
+            marks=pytest.mark.xfail(
+                reason="https://github.com/rapidsai/cudf/issues/16444"
+            ),
+        ),
+    ],
+)
+def test_dti_reduction(method, kwargs):
+    pd_dti = pd.DatetimeIndex(["2020-01-01", "2020-12-31"], name="foo")
+    cudf_dti = cudf.from_pandas(pd_dti)
+
+    result = getattr(cudf_dti, method)(**kwargs)
+    expected = getattr(pd_dti, method)(**kwargs)
+    assert result == expected
+
+
+@pytest.mark.parametrize(
+    "method, kwargs",
+    [
+        ["to_pydatetime", {}],
+        ["to_period", {"freq": "D"}],
+        ["strftime", {"date_format": "%Y-%m-%d"}],
+    ],
+)
+def test_dti_methods(method, kwargs):
+    pd_dti = pd.DatetimeIndex(["2020-01-01", "2020-12-31"], name="foo")
+    cudf_dti = cudf.from_pandas(pd_dti)
+
+    result = getattr(cudf_dti, method)(**kwargs)
+    expected = getattr(pd_dti, method)(**kwargs)
+    assert_eq(result, expected)

From e2d45d6f24adbeb3a21081e078a6c2776d550a06 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 31 Jul 2024 10:36:08 -1000
Subject: [PATCH 619/842] Align TimedeltaIndex APIs with pandas 2.x (#16368)

Mostly exposing methods that were available on the `TimedeltaColumn`

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16368
---
 python/cudf/cudf/core/column/timedelta.py | 12 +++
 python/cudf/cudf/core/index.py            | 92 +++++++++++++++++++++++
 python/cudf/cudf/tests/test_timedelta.py  | 39 ++++++++++
 3 files changed, 143 insertions(+)

diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py
index 59ea1cc002c..47c8ed6fd95 100644
--- a/python/cudf/cudf/core/column/timedelta.py
+++ b/python/cudf/cudf/core/column/timedelta.py
@@ -251,6 +251,18 @@ def normalize_binop_value(self, other) -> ColumnBinaryOperand:
     def time_unit(self) -> str:
         return np.datetime_data(self.dtype)[0]
 
+    def total_seconds(self) -> ColumnBase:
+        raise NotImplementedError("total_seconds is currently not implemented")
+
+    def ceil(self, freq: str) -> ColumnBase:
+        raise NotImplementedError("ceil is currently not implemented")
+
+    def floor(self, freq: str) -> ColumnBase:
+        raise NotImplementedError("floor is currently not implemented")
+
+    def round(self, freq: str) -> ColumnBase:
+        raise NotImplementedError("round is currently not implemented")
+
     def as_numerical_column(
         self, dtype: Dtype
     ) -> "cudf.core.column.NumericalColumn":
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 40a5d9ff259..888ea25cdae 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -2759,6 +2759,98 @@ def __getitem__(self, index):
             return pd.Timedelta(value)
         return value
 
+    def as_unit(self, unit: str, round_ok: bool = True) -> Self:
+        """
+        Convert to a dtype with the given unit resolution.
+
+        Currently not implemented.
+
+        Parameters
+        ----------
+        unit : {'s', 'ms', 'us', 'ns'}
+        round_ok : bool, default True
+            If False and the conversion requires rounding, raise ValueError.
+        """
+        raise NotImplementedError("as_unit is currently not implemented")
+
+    @property
+    def freq(self) -> cudf.DateOffset | None:
+        raise NotImplementedError("freq is currently not implemented")
+
+    @property
+    def freqstr(self) -> str:
+        raise NotImplementedError("freqstr is currently not implemented")
+
+    @property
+    def resolution(self) -> str:
+        """
+        Returns day, hour, minute, second, millisecond or microsecond
+        """
+        raise NotImplementedError("resolution is currently not implemented")
+
+    @property
+    def unit(self) -> str:
+        return self._column.time_unit
+
+    def to_pytimedelta(self) -> np.ndarray:
+        """
+        Return an ndarray of ``datetime.timedelta`` objects.
+
+        Returns
+        -------
+        numpy.ndarray
+            An ndarray of ``datetime.timedelta`` objects.
+        """
+        return self.to_pandas().to_pytimedelta()
+
+    @property
+    def asi8(self) -> cupy.ndarray:
+        return self._column.astype("int64").values
+
+    def sum(self, *, skipna: bool = True, axis: int | None = 0):
+        return self._column.sum(skipna=skipna)
+
+    def mean(self, *, skipna: bool = True, axis: int | None = 0):
+        return self._column.mean(skipna=skipna)
+
+    def median(self, *, skipna: bool = True, axis: int | None = 0):
+        return self._column.median(skipna=skipna)
+
+    def std(self, *, skipna: bool = True, axis: int | None = 0, ddof: int = 1):
+        return self._column.std(skipna=skipna, ddof=ddof)
+
+    def total_seconds(self) -> cupy.ndarray:
+        """
+        Return total duration of each element expressed in seconds.
+
+        This method is currently not implemented.
+        """
+        return self._column.total_seconds().values
+
+    def ceil(self, freq: str) -> Self:
+        """
+        Ceil to the specified resolution.
+
+        This method is currently not implemented.
+        """
+        return type(self)._from_data({self.name: self._column.ceil(freq)})
+
+    def floor(self, freq: str) -> Self:
+        """
+        Floor to the specified resolution.
+
+        This method is currently not implemented.
+        """
+        return type(self)._from_data({self.name: self._column.floor(freq)})
+
+    def round(self, freq: str) -> Self:
+        """
+        Round to the specified resolution.
+
+        This method is currently not implemented.
+        """
+        return type(self)._from_data({self.name: self._column.round(freq)})
+
     @property  # type: ignore
     @_performance_tracking
     def days(self):
diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py
index c4a2349f535..d622ff6b94e 100644
--- a/python/cudf/cudf/tests/test_timedelta.py
+++ b/python/cudf/cudf/tests/test_timedelta.py
@@ -1467,3 +1467,42 @@ def test_timedelta_series_cmpops_pandas_compatibility(data1, data2, op):
         got = op(gsr1, gsr2)
 
     assert_eq(expect, got)
+
+
+@pytest.mark.parametrize(
+    "method, kwargs",
+    [
+        ["sum", {}],
+        ["mean", {}],
+        ["median", {}],
+        ["std", {}],
+        ["std", {"ddof": 0}],
+    ],
+)
+def test_tdi_reductions(method, kwargs):
+    pd_tdi = pd.TimedeltaIndex(["1 day", "2 days", "3 days"])
+    cudf_tdi = cudf.from_pandas(pd_tdi)
+
+    result = getattr(pd_tdi, method)(**kwargs)
+    expected = getattr(cudf_tdi, method)(**kwargs)
+    assert result == expected
+
+
+def test_tdi_asi8():
+    pd_tdi = pd.TimedeltaIndex(["1 day", "2 days", "3 days"])
+    cudf_tdi = cudf.from_pandas(pd_tdi)
+
+    result = pd_tdi.asi8
+    expected = cudf_tdi.asi8
+    assert_eq(result, expected)
+
+
+def test_tdi_unit():
+    pd_tdi = pd.TimedeltaIndex(
+        ["1 day", "2 days", "3 days"], dtype="timedelta64[ns]"
+    )
+    cudf_tdi = cudf.from_pandas(pd_tdi)
+
+    result = pd_tdi.unit
+    expected = cudf_tdi.unit
+    assert result == expected

From dab8660df7ba823dcef8cb8276a3867c2bb27cc7 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 31 Jul 2024 10:37:48 -1000
Subject: [PATCH 620/842] Align IntervalIndex APIs with pandas 2.x (#16371)

Implemented the relatively straightforward, missing APIs and raised `NotImplementedError` for the others

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16371
---
 docs/cudf/source/conf.py                      |  15 ++-
 python/cudf/cudf/core/column/interval.py      |  64 ++++++++-
 python/cudf/cudf/core/index.py                | 123 ++++++++++++++++++
 .../cudf/cudf/tests/indexes/test_interval.py  |  33 +++++
 4 files changed, 229 insertions(+), 6 deletions(-)

diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py
index 7ebafc0da95..43e2d6031bc 100644
--- a/docs/cudf/source/conf.py
+++ b/docs/cudf/source/conf.py
@@ -559,15 +559,20 @@ def on_missing_reference(app, env, node, contnode):
     ("py:obj", "cudf.DatetimeIndex.time"),
     ("py:obj", "cudf.DatetimeIndex.date"),
     ("py:obj", "cudf.Index.values_host"),
-    ("py:class", "pa.Array"),
-    ("py:class", "ScalarLike"),
-    ("py:class", "ParentType"),
-    ("py:class", "ColumnLike"),
-    ("py:class", "ColumnLike"),
     ("py:obj", "cudf.Index.transpose"),
     ("py:obj", "cudf.Index.T"),
     ("py:obj", "cudf.Index.to_flat_index"),
     ("py:obj", "cudf.MultiIndex.to_flat_index"),
+    ("py:meth", "pyarrow.Table.to_pandas"),
+    ("py:class", "pa.Array"),
+    ("py:class", "ScalarLike"),
+    ("py:class", "ParentType"),
+    ("py:class", "pyarrow.lib.DataType"),
+    ("py:class", "pyarrow.lib.Table"),
+    ("py:class", "pyarrow.lib.Scalar"),
+    ("py:class", "pyarrow.lib.ChunkedArray"),
+    ("py:class", "pyarrow.lib.Array"),
+    ("py:class", "ColumnLike"),
     # TODO: Remove this when we figure out why typing_extensions doesn't seem
     # to map types correctly for intersphinx
     ("py:class", "typing_extensions.Self"),
diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py
index d09a1f66539..b2f79ef0c65 100644
--- a/python/cudf/cudf/core/column/interval.py
+++ b/python/cudf/cudf/core/column/interval.py
@@ -1,11 +1,18 @@
 # Copyright (c) 2018-2024, NVIDIA CORPORATION.
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Literal
+
 import pandas as pd
 import pyarrow as pa
 
 import cudf
-from cudf.core.column import StructColumn
+from cudf.core.column import StructColumn, as_column
 from cudf.core.dtypes import IntervalDtype
 
+if TYPE_CHECKING:
+    from cudf.core.column import ColumnBase
+
 
 class IntervalColumn(StructColumn):
     def __init__(
@@ -85,6 +92,61 @@ def copy(self, deep=True):
             children=struct_copy.base_children,
         )
 
+    @property
+    def is_empty(self) -> ColumnBase:
+        left_equals_right = (self.right == self.left).fillna(False)
+        not_closed_both = as_column(
+            self.dtype.closed != "both", length=len(self)
+        )
+        return left_equals_right & not_closed_both
+
+    @property
+    def is_non_overlapping_monotonic(self) -> bool:
+        raise NotImplementedError(
+            "is_overlapping is currently not implemented."
+        )
+
+    @property
+    def is_overlapping(self) -> bool:
+        raise NotImplementedError(
+            "is_overlapping is currently not implemented."
+        )
+
+    @property
+    def length(self) -> ColumnBase:
+        return self.right - self.left
+
+    @property
+    def left(self) -> ColumnBase:
+        return self.children[0]
+
+    @property
+    def mid(self) -> ColumnBase:
+        try:
+            return 0.5 * (self.left + self.right)
+        except TypeError:
+            # datetime safe version
+            return self.left + 0.5 * self.length
+
+    @property
+    def right(self) -> ColumnBase:
+        return self.children[1]
+
+    def overlaps(other) -> ColumnBase:
+        raise NotImplementedError("overlaps is not currently implemented.")
+
+    def set_closed(
+        self, closed: Literal["left", "right", "both", "neither"]
+    ) -> IntervalColumn:
+        return IntervalColumn(
+            size=self.size,
+            dtype=IntervalDtype(self.dtype.fields["left"], closed),
+            mask=self.base_mask,
+            offset=self.offset,
+            null_count=self.null_count,
+            children=self.base_children,
+        )
+
     def as_interval_column(self, dtype):
         if isinstance(dtype, IntervalDtype):
             return IntervalColumn(
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 888ea25cdae..cd879d559cd 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -3429,6 +3429,31 @@ def from_breaks(
         )
         return IntervalIndex(interval_col, name=name, closed=closed)
 
+    @classmethod
+    def from_arrays(
+        cls,
+        left,
+        right,
+        closed: Literal["left", "right", "both", "neither"] = "right",
+        copy: bool = False,
+        dtype=None,
+    ) -> Self:
+        raise NotImplementedError("from_arrays is currently not supported.")
+
+    @classmethod
+    def from_tuples(
+        cls,
+        data,
+        closed: Literal["left", "right", "both", "neither"] = "right",
+        name=None,
+        copy: bool = False,
+        dtype=None,
+    ) -> IntervalIndex:
+        piidx = pd.IntervalIndex.from_tuples(
+            data, closed=closed, name=name, copy=copy, dtype=dtype
+        )
+        return cls.from_pandas(piidx)
+
     def __getitem__(self, index):
         raise NotImplementedError(
             "Getting a scalar from an IntervalIndex is not yet supported"
@@ -3443,6 +3468,104 @@ def _is_boolean(self):
     def _clean_nulls_from_index(self):
         return self
 
+    @property
+    def is_empty(self) -> cupy.ndarray:
+        """
+        Indicates if an interval is empty, meaning it contains no points.
+        """
+        return self._column.is_empty.values
+
+    @property
+    def is_non_overlapping_monotonic(self) -> bool:
+        """
+        Return a True if the IntervalIndex is non-overlapping and monotonic.
+        """
+        return self._column.is_non_overlapping_monotonic
+
+    @property
+    def is_overlapping(self) -> bool:
+        """
+        Return True if the IntervalIndex has overlapping intervals, else False.
+
+        Currently not implemented
+        """
+        return self._column.is_overlapping
+
+    @property
+    def length(self) -> Index:
+        """
+        Return an Index with entries denoting the length of each Interval.
+        """
+        return _index_from_data({None: self._column.length})
+
+    @property
+    def left(self) -> Index:
+        """
+        Return left bounds of the intervals in the IntervalIndex.
+
+        The left bounds of each interval in the IntervalIndex are
+        returned as an Index. The datatype of the left bounds is the
+        same as the datatype of the endpoints of the intervals.
+        """
+        return _index_from_data({None: self._column.left})
+
+    @property
+    def mid(self) -> Index:
+        """
+        Return the midpoint of each interval in the IntervalIndex as an Index.
+
+        Each midpoint is calculated as the average of the left and right bounds
+        of each interval.
+        """
+        return _index_from_data({None: self._column.mid})
+
+    @property
+    def right(self) -> Index:
+        """
+        Return right bounds of the intervals in the IntervalIndex.
+
+        The right bounds of each interval in the IntervalIndex are
+        returned as an Index. The datatype of the right bounds is the
+        same as the datatype of the endpoints of the intervals.
+        """
+        return _index_from_data({None: self._column.right})
+
+    def overlaps(self, other) -> cupy.ndarray:
+        """
+        Check elementwise if an Interval overlaps the values in the IntervalIndex.
+
+        Currently not supported.
+        """
+        return self._column.overlaps(other).values
+
+    def set_closed(
+        self, closed: Literal["left", "right", "both", "neither"]
+    ) -> Self:
+        """
+        Return an identical IntervalArray closed on the specified side.
+
+        Parameters
+        ----------
+        closed : {'left', 'right', 'both', 'neither'}
+            Whether the intervals are closed on the left-side, right-side, both
+            or neither.
+        """
+        return type(self)._from_data(
+            {self.name: self._column.set_closed(closed)}
+        )
+
+    def to_tuples(self, na_tuple: bool = True) -> pd.Index:
+        """
+        Return an Index of tuples of the form (left, right).
+
+        Parameters
+        ----------
+        na_tuple : bool, default True
+            If ``True``, return ``NA`` as a tuple ``(nan, nan)``. If ``False``,
+            just return ``NA`` as ``nan``.
+        """
+        return self.to_pandas().to_tuples(na_tuple=na_tuple)
+
 
 @_performance_tracking
 def as_index(
diff --git a/python/cudf/cudf/tests/indexes/test_interval.py b/python/cudf/cudf/tests/indexes/test_interval.py
index 87b76ab7609..3b3a9f96543 100644
--- a/python/cudf/cudf/tests/indexes/test_interval.py
+++ b/python/cudf/cudf/tests/indexes/test_interval.py
@@ -368,3 +368,36 @@ def test_intervalindex_conflicting_closed():
 def test_intervalindex_invalid_data():
     with pytest.raises(TypeError):
         cudf.IntervalIndex([1, 2])
+
+
+@pytest.mark.parametrize(
+    "attr",
+    [
+        "is_empty",
+        "length",
+        "left",
+        "right",
+        "mid",
+    ],
+)
+def test_intervalindex_properties(attr):
+    pd_ii = pd.IntervalIndex.from_arrays([0, 1], [0, 2])
+    cudf_ii = cudf.from_pandas(pd_ii)
+
+    result = getattr(cudf_ii, attr)
+    expected = getattr(pd_ii, attr)
+    assert_eq(result, expected)
+
+
+def test_set_closed():
+    data = [pd.Interval(0, 1)]
+    result = cudf.IntervalIndex(data).set_closed("both")
+    expected = pd.IntervalIndex(data).set_closed("both")
+    assert_eq(result, expected)
+
+
+def test_from_tuples():
+    data = [(1, 2), (10, 20)]
+    result = cudf.IntervalIndex.from_tuples(data, closed="left", name="a")
+    expected = pd.IntervalIndex.from_tuples(data, closed="left", name="a")
+    assert_eq(result, expected)

From be842259a835f4f7a5b9f7ff6fad1507d33c13cd Mon Sep 17 00:00:00 2001
From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com>
Date: Wed, 31 Jul 2024 17:53:13 -0500
Subject: [PATCH 621/842] Remove cuDF dependency from pylibcudf column
 from_device tests (#16441)

This removes the need to `import cudf` in `test_column_from_device` and removes a runtime dependency on numpy in the associated pylibcudf column method.

Authors:
  - https://github.com/brandon-b-miller
  - Thomas Li (https://github.com/lithomas1)

Approvers:
  - Thomas Li (https://github.com/lithomas1)
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/16441
---
 python/cudf/cudf/_lib/pylibcudf/column.pyx    |  9 ++---
 .../cudf/_lib/pylibcudf/libcudf/types.pxd     |  2 +
 python/cudf/cudf/_lib/pylibcudf/types.pxd     |  2 +
 python/cudf/cudf/_lib/pylibcudf/types.pyx     | 16 +++++++-
 .../test_column_from_device.py                | 39 +++++++++++++++----
 5 files changed, 54 insertions(+), 14 deletions(-)

diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pyx b/python/cudf/cudf/_lib/pylibcudf/column.pyx
index a61e0629292..1d9902b0374 100644
--- a/python/cudf/cudf/_lib/pylibcudf/column.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/column.pyx
@@ -15,13 +15,11 @@ from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 from .gpumemoryview cimport gpumemoryview
 from .scalar cimport Scalar
-from .types cimport DataType, type_id
+from .types cimport DataType, size_of, type_id
 from .utils cimport int_to_bitmask_ptr, int_to_void_ptr
 
 import functools
 
-import numpy as np
-
 
 cdef class Column:
     """A container of nullable device data as a column of elements.
@@ -303,14 +301,15 @@ cdef class Column:
             raise ValueError("mask not yet supported.")
 
         typestr = iface['typestr'][1:]
+        data_type = _datatype_from_dtype_desc(typestr)
+
         if not is_c_contiguous(
             iface['shape'],
             iface['strides'],
-            np.dtype(typestr).itemsize
+            size_of(data_type)
         ):
             raise ValueError("Data must be C-contiguous")
 
-        data_type = _datatype_from_dtype_desc(typestr)
         size = iface['shape'][0]
         return Column(
             data_type,
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/types.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/types.pxd
index 8e94ec296cf..eabae68bc90 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/types.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/types.pxd
@@ -98,3 +98,5 @@ cdef extern from "cudf/types.hpp" namespace "cudf" nogil:
         HIGHER
         MIDPOINT
         NEAREST
+
+    cdef size_type size_of(data_type t) except +
diff --git a/python/cudf/cudf/_lib/pylibcudf/types.pxd b/python/cudf/cudf/_lib/pylibcudf/types.pxd
index 7d3ddca14a1..1f3e1aa2fbb 100644
--- a/python/cudf/cudf/_lib/pylibcudf/types.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/types.pxd
@@ -27,3 +27,5 @@ cdef class DataType:
 
     @staticmethod
     cdef DataType from_libcudf(data_type dt)
+
+cpdef size_type size_of(DataType t)
diff --git a/python/cudf/cudf/_lib/pylibcudf/types.pyx b/python/cudf/cudf/_lib/pylibcudf/types.pyx
index c45c6071bb3..311f9ce4046 100644
--- a/python/cudf/cudf/_lib/pylibcudf/types.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/types.pyx
@@ -2,7 +2,12 @@
 
 from libc.stdint cimport int32_t
 
-from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type, type_id
+from cudf._lib.pylibcudf.libcudf.types cimport (
+    data_type,
+    size_of as cpp_size_of,
+    size_type,
+    type_id,
+)
 from cudf._lib.pylibcudf.libcudf.utilities.type_dispatcher cimport type_to_id
 
 from cudf._lib.pylibcudf.libcudf.types import type_id as TypeId  # no-cython-lint, isort:skip
@@ -69,6 +74,15 @@ cdef class DataType:
         ret.c_obj = dt
         return ret
 
+cpdef size_type size_of(DataType t):
+    """Returns the size in bytes of elements of the specified data_type.
+
+    Only fixed-width types are supported.
+
+    For details, see :cpp:func:`size_of`.
+    """
+    with nogil:
+        return cpp_size_of(t.c_obj)
 
 SIZE_TYPE = DataType(type_to_id[size_type]())
 SIZE_TYPE_ID = SIZE_TYPE.id()
diff --git a/python/cudf/cudf/pylibcudf_tests/test_column_from_device.py b/python/cudf/cudf/pylibcudf_tests/test_column_from_device.py
index c4ff7bb43a5..78ee2cb100e 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_column_from_device.py
+++ b/python/cudf/cudf/pylibcudf_tests/test_column_from_device.py
@@ -4,7 +4,8 @@
 import pytest
 from utils import assert_column_eq
 
-import cudf
+import rmm
+
 from cudf._lib import pylibcudf as plc
 
 VALID_TYPES = [
@@ -35,17 +36,39 @@ def valid_type(request):
     return request.param
 
 
+class DataBuffer:
+    def __init__(self, obj, dtype):
+        self.obj = rmm.DeviceBuffer.to_device(obj)
+        self.dtype = dtype
+        self.shape = (int(len(self.obj) / self.dtype.itemsize),)
+        self.strides = (self.dtype.itemsize,)
+        self.typestr = self.dtype.str
+
+    @property
+    def __cuda_array_interface__(self):
+        return {
+            "data": self.obj.__cuda_array_interface__["data"],
+            "shape": self.shape,
+            "strides": self.strides,
+            "typestr": self.typestr,
+            "version": 0,
+        }
+
+
 @pytest.fixture
-def valid_column(valid_type):
+def input_column(valid_type):
     if valid_type == pa.bool_():
         return pa.array([True, False, True], type=valid_type)
     return pa.array([1, 2, 3], type=valid_type)
 
 
-def test_from_cuda_array_interface(valid_column):
-    col = plc.column.Column.from_cuda_array_interface_obj(
-        cudf.Series(valid_column)
-    )
-    expect = valid_column
+@pytest.fixture
+def iface_obj(input_column):
+    data = input_column.to_numpy(zero_copy_only=False)
+    return DataBuffer(data.view("uint8"), data.dtype)
+
+
+def test_from_cuda_array_interface(input_column, iface_obj):
+    col = plc.column.Column.from_cuda_array_interface_obj(iface_obj)
 
-    assert_column_eq(expect, col)
+    assert_column_eq(input_column, col)

From 445a75fca4d8d12d2230fef507dbfb696b6968fb Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 1 Aug 2024 02:45:30 -1000
Subject: [PATCH 622/842] Ensure objects with __interface__ are converted to
 cupy/numpy arrays (#16436)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

https://github.com/rapidsai/cudf/pull/16277 removed a universal cast to
a `cupy.array` in `_from_array`. Although the typing suggested this
method should only accept `np.ndarray` or `cupy.ndarray`, this method is
called on any object implementing the `__cuda_array_inferface__` or
`__array_interface__` (e.g. `numba.DeviceArray`) which caused a
performance regression in cuspatial
https://github.com/rapidsai/cuspatial/issues/1413

closes #16434


```python
In [1]: import cupy, numba.cuda

In [2]: import cudf

In [3]: cupy_array = cupy.ones((10_000, 100))

In [4]: %timeit cudf.DataFrame(cupy_array)
3.88 ms ± 52 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)

In [5]: %timeit cudf.DataFrame(numba.cuda.to_device(cupy_array))
3.99 ms ± 35.4 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
```

---------

Co-authored-by: Bradley Dice <bdice@bradleydice.com>
---
 python/cudf/benchmarks/API/bench_dataframe.py |  7 ++++
 python/cudf/cudf/core/column/column.py        |  3 +-
 python/cudf/cudf/core/dataframe.py            | 34 ++++++++++++-------
 3 files changed, 30 insertions(+), 14 deletions(-)

diff --git a/python/cudf/benchmarks/API/bench_dataframe.py b/python/cudf/benchmarks/API/bench_dataframe.py
index 59d73015962..ba243eb6a7c 100644
--- a/python/cudf/benchmarks/API/bench_dataframe.py
+++ b/python/cudf/benchmarks/API/bench_dataframe.py
@@ -4,6 +4,7 @@
 
 import string
 
+import numba.cuda
 import numpy
 import pytest
 import pytest_cases
@@ -16,6 +17,12 @@ def bench_construction(benchmark, N):
     benchmark(cudf.DataFrame, {None: cupy.random.rand(N)})
 
 
+@pytest.mark.parametrize("N", [100, 100_000])
+@pytest.mark.pandas_incompatible
+def bench_construction_numba_device_array(benchmark, N):
+    benchmark(cudf.DataFrame, numba.cuda.to_device(numpy.ones((100, N))))
+
+
 @benchmark_with_object(cls="dataframe", dtype="float", cols=6)
 @pytest.mark.parametrize(
     "expr", ["a+b", "a+b+c+d+e", "a / (sin(a) + cos(b)) * tanh(d*e*f)"]
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 32e6aade65b..7e0d8ced595 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -1843,8 +1843,7 @@ def as_column(
         else:
             mask = None
 
-        arbitrary = cupy.asarray(arbitrary)
-        arbitrary = cupy.ascontiguousarray(arbitrary)
+        arbitrary = cupy.asarray(arbitrary, order="C")
 
         data = as_buffer(arbitrary, exposed=cudf.get_option("copy_on_write"))
         col = build_column(data, dtype=arbitrary.dtype, mask=mask)
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 1d7136e61e3..dca0c0b821a 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -782,7 +782,6 @@ def __init__(
             )
         elif hasattr(data, "__cuda_array_interface__"):
             arr_interface = data.__cuda_array_interface__
-
             # descr is an optional field of the _cuda_ary_iface_
             if "descr" in arr_interface:
                 if len(arr_interface["descr"]) == 1:
@@ -5835,17 +5834,18 @@ def from_records(
     @_performance_tracking
     def _from_arrays(
         cls,
-        data: np.ndarray | cupy.ndarray,
+        data,
         index=None,
         columns=None,
         nan_as_null=False,
     ):
-        """Convert a numpy/cupy array to DataFrame.
+        """
+        Convert an object implementing an array interface to DataFrame.
 
         Parameters
         ----------
-        data : numpy/cupy array of ndim 1 or 2,
-            dimensions greater than 2 are not supported yet.
+        data : object of ndim 1 or 2,
+            Object implementing ``__array_interface__`` or ``__cuda_array_interface__``
         index : Index or array-like
             Index to use for resulting frame. Will default to
             RangeIndex if no indexing information part of input data and
@@ -5857,13 +5857,23 @@ def _from_arrays(
         -------
         DataFrame
         """
-        if data.ndim != 1 and data.ndim != 2:
+        array_data: np.ndarray | cupy.ndarray
+        if hasattr(data, "__cuda_array_interface__"):
+            array_data = cupy.asarray(data, order="F")
+        elif hasattr(data, "__array_interface__"):
+            array_data = np.asarray(data, order="F")
+        else:
             raise ValueError(
-                f"records dimension expected 1 or 2 but found: {data.ndim}"
+                "data must be an object implementing __cuda_array_interface__ or __array_interface__"
+            )
+
+        if array_data.ndim not in {1, 2}:
+            raise ValueError(
+                f"records dimension expected 1 or 2 but found: {array_data.ndim}"
             )
 
         if data.ndim == 2:
-            num_cols = data.shape[1]
+            num_cols = array_data.shape[1]
         else:
             # Since we validate ndim to be either 1 or 2 above,
             # this case can be assumed to be ndim == 1.
@@ -5881,14 +5891,14 @@ def _from_arrays(
                 raise ValueError("Duplicate column names are not allowed")
             names = columns
 
-        if data.ndim == 2:
+        if array_data.ndim == 2:
             ca_data = {
-                k: column.as_column(data[:, i], nan_as_null=nan_as_null)
+                k: column.as_column(array_data[:, i], nan_as_null=nan_as_null)
                 for i, k in enumerate(names)
             }
-        elif data.ndim == 1:
+        elif array_data.ndim == 1:
             ca_data = {
-                names[0]: column.as_column(data, nan_as_null=nan_as_null)
+                names[0]: column.as_column(array_data, nan_as_null=nan_as_null)
             }
 
         if index is not None:

From 9d0c57a64d63d52182bd1c1e930180bf62404f1a Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Thu, 1 Aug 2024 10:59:27 -0700
Subject: [PATCH 623/842] Add skiprows and nrows to parquet reader (#16214)

closes #15144

Authors:
  - Thomas Li (https://github.com/lithomas1)
  - Muhammad Haseeb (https://github.com/mhaseeb123)

Approvers:
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/16214
---
 python/cudf/cudf/_lib/parquet.pyx             | 35 ++++++++++++-----
 .../cudf/cudf/_lib/pylibcudf/io/parquet.pxd   |  2 +-
 .../cudf/cudf/_lib/pylibcudf/io/parquet.pyx   | 18 ++++-----
 python/cudf/cudf/io/parquet.py                | 23 +++++++++++
 .../cudf/pylibcudf_tests/io/test_parquet.py   |  2 +-
 python/cudf/cudf/tests/test_parquet.py        | 39 +++++++++++++++++++
 python/cudf/cudf/utils/ioutils.py             | 10 +++++
 python/cudf_polars/cudf_polars/dsl/ir.py      |  2 +-
 8 files changed, 110 insertions(+), 21 deletions(-)

diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
index a2eed94bb3c..4a4b13b0b31 100644
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ b/python/cudf/cudf/_lib/parquet.pyx
@@ -22,7 +22,7 @@ from cudf._lib.utils cimport _data_from_columns, data_from_pylibcudf_io
 
 from cudf._lib.utils import _index_level_name, generate_pandas_metadata
 
-from libc.stdint cimport uint8_t
+from libc.stdint cimport int64_t, uint8_t
 from libcpp cimport bool
 from libcpp.map cimport map
 from libcpp.memory cimport make_unique, unique_ptr
@@ -132,7 +132,10 @@ cdef object _process_metadata(object df,
                               object filepaths_or_buffers,
                               list pa_buffers,
                               bool allow_range_index,
-                              bool use_pandas_metadata):
+                              bool use_pandas_metadata,
+                              size_type nrows=-1,
+                              int64_t skip_rows=0,
+                              ):
 
     add_df_col_struct_names(df, child_names)
     index_col = None
@@ -221,9 +224,13 @@ cdef object _process_metadata(object df,
                 else:
                     idx = cudf.Index(cudf.core.column.column_empty(0))
             else:
+                start = range_index_meta["start"] + skip_rows
+                stop = range_index_meta["stop"]
+                if nrows != -1:
+                    stop = start + nrows
                 idx = cudf.RangeIndex(
-                    start=range_index_meta['start'],
-                    stop=range_index_meta['stop'],
+                    start=start,
+                    stop=stop,
                     step=range_index_meta['step'],
                     name=range_index_meta['name']
                 )
@@ -260,7 +267,9 @@ def read_parquet_chunked(
     row_groups=None,
     use_pandas_metadata=True,
     size_t chunk_read_limit=0,
-    size_t pass_read_limit=1024000000
+    size_t pass_read_limit=1024000000,
+    size_type nrows=-1,
+    int64_t skip_rows=0
 ):
     # Convert NativeFile buffers to NativeFileDatasource,
     # but save original buffers in case we need to use
@@ -287,7 +296,9 @@ def read_parquet_chunked(
         row_groups,
         use_pandas_metadata,
         chunk_read_limit=chunk_read_limit,
-        pass_read_limit=pass_read_limit
+        pass_read_limit=pass_read_limit,
+        skip_rows=skip_rows,
+        nrows=nrows,
     )
 
     tbl_w_meta = reader.read_chunk()
@@ -320,13 +331,16 @@ def read_parquet_chunked(
     df = _process_metadata(df, column_names, child_names,
                            per_file_user_data, row_groups,
                            filepaths_or_buffers, pa_buffers,
-                           allow_range_index, use_pandas_metadata)
+                           allow_range_index, use_pandas_metadata,
+                           nrows=nrows, skip_rows=skip_rows)
     return df
 
 
 cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
                    use_pandas_metadata=True,
-                   Expression filters=None):
+                   Expression filters=None,
+                   size_type nrows=-1,
+                   int64_t skip_rows=0):
     """
     Cython function to call into libcudf API, see `read_parquet`.
 
@@ -362,6 +376,8 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
         filters,
         convert_strings_to_categories = False,
         use_pandas_metadata = use_pandas_metadata,
+        skip_rows = skip_rows,
+        nrows = nrows,
     )
 
     df = cudf.DataFrame._from_data(
@@ -371,7 +387,8 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
     df = _process_metadata(df, tbl_w_meta.column_names(include_children=False),
                            tbl_w_meta.child_names, tbl_w_meta.per_file_user_data,
                            row_groups, filepaths_or_buffers, pa_buffers,
-                           allow_range_index, use_pandas_metadata)
+                           allow_range_index, use_pandas_metadata,
+                           nrows=nrows, skip_rows=skip_rows)
     return df
 
 cpdef read_parquet_metadata(filepaths_or_buffers):
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/parquet.pxd b/python/cudf/cudf/_lib/pylibcudf/io/parquet.pxd
index 027f215fb91..93ef849b813 100644
--- a/python/cudf/cudf/_lib/pylibcudf/io/parquet.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/io/parquet.pxd
@@ -28,7 +28,7 @@ cpdef read_parquet(
     bool convert_strings_to_categories = *,
     bool use_pandas_metadata = *,
     int64_t skip_rows = *,
-    size_type num_rows = *,
+    size_type nrows = *,
     # disabled see comment in parquet.pyx for more
     # ReaderColumnSchema reader_column_schema = *,
     # DataType timestamp_type = *
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/parquet.pyx b/python/cudf/cudf/_lib/pylibcudf/io/parquet.pyx
index 96119e1b714..84a79f9565f 100644
--- a/python/cudf/cudf/_lib/pylibcudf/io/parquet.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/io/parquet.pyx
@@ -26,7 +26,7 @@ cdef parquet_reader_options _setup_parquet_reader_options(
     bool convert_strings_to_categories = False,
     bool use_pandas_metadata = True,
     int64_t skip_rows = 0,
-    size_type num_rows = -1,
+    size_type nrows = -1,
     # ReaderColumnSchema reader_column_schema = None,
     # DataType timestamp_type = DataType(type_id.EMPTY)
 ):
@@ -40,8 +40,8 @@ cdef parquet_reader_options _setup_parquet_reader_options(
     )
     if row_groups is not None:
         opts.set_row_groups(row_groups)
-    if num_rows != -1:
-        opts.set_num_rows(num_rows)
+    if nrows != -1:
+        opts.set_num_rows(nrows)
     if skip_rows != 0:
         opts.set_skip_rows(skip_rows)
     if columns is not None:
@@ -73,7 +73,7 @@ cdef class ChunkedParquetReader:
         Whether to convert string columns to the category type
     skip_rows : int64_t, default 0
         The number of rows to skip from the start of the file.
-    num_rows : size_type, default -1
+    nrows : size_type, default -1
         The number of rows to read. By default, read the entire file.
     chunk_read_limit : size_t, default 0
         Limit on total number of bytes to be returned per read,
@@ -90,7 +90,7 @@ cdef class ChunkedParquetReader:
         bool use_pandas_metadata=True,
         bool convert_strings_to_categories=False,
         int64_t skip_rows = 0,
-        size_type num_rows = -1,
+        size_type nrows = -1,
         size_t chunk_read_limit=0,
         size_t pass_read_limit=1024000000
     ):
@@ -103,7 +103,7 @@ cdef class ChunkedParquetReader:
             convert_strings_to_categories=convert_strings_to_categories,
             use_pandas_metadata=use_pandas_metadata,
             skip_rows=skip_rows,
-            num_rows=num_rows,
+            nrows=nrows,
         )
 
         with nogil:
@@ -152,7 +152,7 @@ cpdef read_parquet(
     bool convert_strings_to_categories = False,
     bool use_pandas_metadata = True,
     int64_t skip_rows = 0,
-    size_type num_rows = -1,
+    size_type nrows = -1,
     # Disabled, these aren't used by cudf-python
     # we should only add them back in if there's user demand
     # ReaderColumnSchema reader_column_schema = None,
@@ -178,7 +178,7 @@ cpdef read_parquet(
         the per-file user metadata of the ``TableWithMetadata``
     skip_rows : int64_t, default 0
         The number of rows to skip from the start of the file.
-    num_rows : size_type, default -1
+    nrows : size_type, default -1
         The number of rows to read. By default, read the entire file.
 
     Returns
@@ -195,7 +195,7 @@ cpdef read_parquet(
         convert_strings_to_categories,
         use_pandas_metadata,
         skip_rows,
-        num_rows,
+        nrows,
     )
 
     with nogil:
diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
index 7dab2f20100..4a419a2fbb6 100644
--- a/python/cudf/cudf/io/parquet.py
+++ b/python/cudf/cudf/io/parquet.py
@@ -539,6 +539,8 @@ def read_parquet(
     open_file_options=None,
     bytes_per_thread=None,
     dataset_kwargs=None,
+    nrows=None,
+    skip_rows=None,
     *args,
     **kwargs,
 ):
@@ -685,6 +687,8 @@ def read_parquet(
             partition_keys=partition_keys,
             partition_categories=partition_categories,
             dataset_kwargs=dataset_kwargs,
+            nrows=nrows,
+            skip_rows=skip_rows,
             **kwargs,
         )
     # Apply filters row-wise (if any are defined), and return
@@ -813,6 +817,8 @@ def _parquet_to_frame(
     partition_keys=None,
     partition_categories=None,
     dataset_kwargs=None,
+    nrows=None,
+    skip_rows=None,
     **kwargs,
 ):
     # If this is not a partitioned read, only need
@@ -820,11 +826,18 @@ def _parquet_to_frame(
     if not partition_keys:
         return _read_parquet(
             paths_or_buffers,
+            nrows=nrows,
+            skip_rows=skip_rows,
             *args,
             row_groups=row_groups,
             **kwargs,
         )
 
+    if nrows is not None or skip_rows is not None:
+        raise NotImplementedError(
+            "nrows/skip_rows is not supported when reading a partitioned parquet dataset"
+        )
+
     partition_meta = None
     partitioning = (dataset_kwargs or {}).get("partitioning", None)
     if hasattr(partitioning, "schema"):
@@ -912,6 +925,8 @@ def _read_parquet(
     columns=None,
     row_groups=None,
     use_pandas_metadata=None,
+    nrows=None,
+    skip_rows=None,
     *args,
     **kwargs,
 ):
@@ -934,13 +949,21 @@ def _read_parquet(
                 columns=columns,
                 row_groups=row_groups,
                 use_pandas_metadata=use_pandas_metadata,
+                nrows=nrows if nrows is not None else -1,
+                skip_rows=skip_rows if skip_rows is not None else 0,
             )
         else:
+            if nrows is None:
+                nrows = -1
+            if skip_rows is None:
+                skip_rows = 0
             return libparquet.read_parquet(
                 filepaths_or_buffers,
                 columns=columns,
                 row_groups=row_groups,
                 use_pandas_metadata=use_pandas_metadata,
+                nrows=nrows,
+                skip_rows=skip_rows,
             )
     else:
         if (
diff --git a/python/cudf/cudf/pylibcudf_tests/io/test_parquet.py b/python/cudf/cudf/pylibcudf_tests/io/test_parquet.py
index 07d2ab3d69a..dbd20cd473e 100644
--- a/python/cudf/cudf/pylibcudf_tests/io/test_parquet.py
+++ b/python/cudf/cudf/pylibcudf_tests/io/test_parquet.py
@@ -31,7 +31,7 @@ def test_read_parquet_basic(
 
     res = plc.io.parquet.read_parquet(
         plc.io.SourceInfo([source]),
-        num_rows=nrows,
+        nrows=nrows,
         skip_rows=skiprows,
         columns=columns,
     )
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index 3806b901b10..879a2c50db7 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -1978,6 +1978,25 @@ def test_parquet_partitioned(tmpdir_factory, cols, filename):
                 assert fn == filename
 
 
+@pytest.mark.parametrize("kwargs", [{"nrows": 1}, {"skip_rows": 1}])
+def test_parquet_partitioned_notimplemented(tmpdir_factory, kwargs):
+    # Checks that write_to_dataset is wrapping to_parquet
+    # as expected
+    pdf_dir = str(tmpdir_factory.mktemp("pdf_dir"))
+    size = 100
+    pdf = pd.DataFrame(
+        {
+            "a": np.arange(0, stop=size, dtype="int64"),
+            "b": np.random.choice(list("abcd"), size=size),
+            "c": np.random.choice(np.arange(4), size=size),
+        }
+    )
+    pdf.to_parquet(pdf_dir, index=False, partition_cols=["b"])
+
+    with pytest.raises(NotImplementedError):
+        cudf.read_parquet(pdf_dir, **kwargs)
+
+
 @pytest.mark.parametrize("return_meta", [True, False])
 def test_parquet_writer_chunked_partitioned(tmpdir_factory, return_meta):
     pdf_dir = str(tmpdir_factory.mktemp("pdf_dir"))
@@ -3768,6 +3787,26 @@ def test_parquet_chunked_reader(
     assert_eq(expected, actual)
 
 
+@pytest.mark.parametrize(
+    "nrows,skip_rows",
+    [
+        (0, 0),
+        (1000, 0),
+        (0, 1000),
+        (1000, 10000),
+    ],
+)
+def test_parquet_reader_nrows_skiprows(nrows, skip_rows):
+    df = pd.DataFrame(
+        {"a": [1, 2, 3, 4] * 100000, "b": ["av", "qw", "hi", "xyz"] * 100000}
+    )
+    expected = df[skip_rows : skip_rows + nrows]
+    buffer = BytesIO()
+    df.to_parquet(buffer)
+    got = cudf.read_parquet(buffer, nrows=nrows, skip_rows=skip_rows)
+    assert_eq(expected, got)
+
+
 def test_parquet_reader_pandas_compatibility():
     df = pd.DataFrame(
         {"a": [1, 2, 3, 4] * 10000, "b": ["av", "qw", "hi", "xyz"] * 10000}
diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
index 80555750b3a..448a815fe1b 100644
--- a/python/cudf/cudf/utils/ioutils.py
+++ b/python/cudf/cudf/utils/ioutils.py
@@ -199,6 +199,16 @@
     in parallel (using a python thread pool). Default allocation is
     {bytes_per_thread} bytes.
     This parameter is functional only when `use_python_file_object=False`.
+skiprows : int, default None
+    If not None, the number of rows to skip from the start of the file.
+
+    .. note::
+       This option is not supported when the low-memory mode is on.
+nrows : int, default None
+    If not None, the total number of rows to read.
+
+    .. note:
+       This option is not supported when the low-memory mode is on.
 
 Returns
 -------
diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py
index 7f62dff4389..3754addeb11 100644
--- a/python/cudf_polars/cudf_polars/dsl/ir.py
+++ b/python/cudf_polars/cudf_polars/dsl/ir.py
@@ -321,7 +321,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
             tbl_w_meta = plc.io.parquet.read_parquet(
                 plc.io.SourceInfo(self.paths),
                 columns=with_columns,
-                num_rows=nrows,
+                nrows=nrows,
             )
             df = DataFrame.from_table(
                 tbl_w_meta.tbl,

From 05745d04e08ea494a50d12bad977af7e71aaf27b Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 1 Aug 2024 17:00:19 -0400
Subject: [PATCH 624/842] Improve performance of hash_character_ngrams using
 warp-per-string kernel (#16212)

Improves the performance of `nvtext::hash_character_ngrams` using a warp-per-string kernel instead of a string per thread.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - Muhammad Haseeb (https://github.com/mhaseeb123)

URL: https://github.com/rapidsai/cudf/pull/16212
---
 cpp/src/text/generate_ngrams.cu | 161 ++++++++++++++++++++++----------
 1 file changed, 113 insertions(+), 48 deletions(-)

diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu
index 724f3603f29..6f700f84ec4 100644
--- a/cpp/src/text/generate_ngrams.cu
+++ b/cpp/src/text/generate_ngrams.cu
@@ -36,10 +36,12 @@
 #include <rmm/exec_policy.hpp>
 #include <rmm/resource_ref.hpp>
 
+#include <cooperative_groups.h>
+#include <cooperative_groups/reduce.h>
 #include <cuda/functional>
+#include <thrust/copy.h>
 #include <thrust/functional.h>
 #include <thrust/iterator/counting_iterator.h>
-#include <thrust/transform_scan.h>
 
 #include <stdexcept>
 
@@ -165,6 +167,47 @@ std::unique_ptr<cudf::column> generate_ngrams(cudf::strings_column_view const& s
 namespace detail {
 namespace {
 
+constexpr cudf::thread_index_type block_size       = 256;
+constexpr cudf::thread_index_type bytes_per_thread = 4;
+
+/**
+ * @brief Counts the number of ngrams in each row of the given strings column
+ *
+ * Each warp processes a single string.
+ * Formula is `count = max(0,str.length() - ngrams + 1)`
+ * If a string has less than ngrams characters, its count is 0.
+ */
+CUDF_KERNEL void count_char_ngrams_kernel(cudf::column_device_view const d_strings,
+                                          cudf::size_type ngrams,
+                                          cudf::size_type* d_counts)
+{
+  auto const idx = cudf::detail::grid_1d::global_thread_id();
+
+  auto const str_idx = idx / cudf::detail::warp_size;
+  if (str_idx >= d_strings.size()) { return; }
+  if (d_strings.is_null(str_idx)) {
+    d_counts[str_idx] = 0;
+    return;
+  }
+
+  namespace cg    = cooperative_groups;
+  auto const warp = cg::tiled_partition<cudf::detail::warp_size>(cg::this_thread_block());
+
+  auto const d_str = d_strings.element<cudf::string_view>(str_idx);
+  auto const end   = d_str.data() + d_str.size_bytes();
+
+  auto const lane_idx   = warp.thread_rank();
+  cudf::size_type count = 0;
+  for (auto itr = d_str.data() + (lane_idx * bytes_per_thread); itr < end;
+       itr += cudf::detail::warp_size * bytes_per_thread) {
+    for (auto s = itr; (s < (itr + bytes_per_thread)) && (s < end); ++s) {
+      count += static_cast<cudf::size_type>(cudf::strings::detail::is_begin_utf8_char(*s));
+    }
+  }
+  auto const char_count = cg::reduce(warp, count, cg::plus<int>());
+  if (lane_idx == 0) { d_counts[str_idx] = cuda::std::max(0, char_count - ngrams + 1); }
+}
+
 /**
  * @brief Generate character ngrams for each string
  *
@@ -220,17 +263,16 @@ std::unique_ptr<cudf::column> generate_character_ngrams(cudf::strings_column_vie
 
   auto const d_strings = cudf::column_device_view::create(input.parent(), stream);
 
-  auto sizes_itr = cudf::detail::make_counting_transform_iterator(
-    0,
-    cuda::proclaim_return_type<cudf::size_type>(
-      [d_strings = *d_strings, ngrams] __device__(auto idx) {
-        if (d_strings.is_null(idx)) { return 0; }
-        auto const length = d_strings.element<cudf::string_view>(idx).length();
-        return std::max(0, static_cast<cudf::size_type>(length + 1 - ngrams));
-      }));
-  auto [offsets, total_ngrams] =
-    cudf::detail::make_offsets_child_column(sizes_itr, sizes_itr + input.size(), stream, mr);
+  auto [offsets, total_ngrams] = [&] {
+    auto counts           = rmm::device_uvector<cudf::size_type>(input.size(), stream);
+    auto const num_blocks = cudf::util::div_rounding_up_safe(
+      static_cast<cudf::thread_index_type>(input.size()) * cudf::detail::warp_size, block_size);
+    count_char_ngrams_kernel<<<num_blocks, block_size, 0, stream.value()>>>(
+      *d_strings, ngrams, counts.data());
+    return cudf::detail::make_offsets_child_column(counts.begin(), counts.end(), stream, mr);
+  }();
   auto d_offsets = offsets->view().data<cudf::size_type>();
+
   CUDF_EXPECTS(total_ngrams > 0,
                "Insufficient number of characters in each string to generate ngrams");
 
@@ -246,36 +288,64 @@ std::unique_ptr<cudf::column> generate_character_ngrams(cudf::strings_column_vie
 }
 
 namespace {
+
 /**
  * @brief Computes the hash of each character ngram
  *
- * Each thread processes a single string. Substrings are resolved for every character
+ * Each warp processes a single string. Substrings are resolved for every character
  * of the string and hashed.
  */
-struct character_ngram_hash_fn {
-  cudf::column_device_view const d_strings;
-  cudf::size_type ngrams;
-  cudf::size_type const* d_ngram_offsets;
-  cudf::hash_value_type* d_results;
+CUDF_KERNEL void character_ngram_hash_kernel(cudf::column_device_view const d_strings,
+                                             cudf::size_type ngrams,
+                                             cudf::size_type const* d_ngram_offsets,
+                                             cudf::hash_value_type* d_results)
+{
+  auto const idx = cudf::detail::grid_1d::global_thread_id();
+  if (idx >= (static_cast<cudf::thread_index_type>(d_strings.size()) * cudf::detail::warp_size)) {
+    return;
+  }
 
-  __device__ void operator()(cudf::size_type idx) const
-  {
-    if (d_strings.is_null(idx)) return;
-    auto const d_str = d_strings.element<cudf::string_view>(idx);
-    if (d_str.empty()) return;
-    auto itr                = d_str.begin();
-    auto const ngram_offset = d_ngram_offsets[idx];
-    auto const ngram_count  = d_ngram_offsets[idx + 1] - ngram_offset;
-    auto const hasher       = cudf::hashing::detail::MurmurHash3_x86_32<cudf::string_view>{0};
-    auto d_hashes           = d_results + ngram_offset;
-    for (cudf::size_type n = 0; n < ngram_count; ++n, ++itr) {
-      auto const begin = itr.byte_offset();
-      auto const end   = (itr + ngrams).byte_offset();
-      auto const ngram = cudf::string_view(d_str.data() + begin, end - begin);
-      *d_hashes++      = hasher(ngram);
+  auto const str_idx = idx / cudf::detail::warp_size;
+
+  if (d_strings.is_null(str_idx)) { return; }
+  auto const d_str = d_strings.element<cudf::string_view>(str_idx);
+  if (d_str.empty()) { return; }
+
+  __shared__ cudf::hash_value_type hvs[block_size];  // temp store for hash values
+
+  auto const ngram_offset = d_ngram_offsets[str_idx];
+  auto const hasher       = cudf::hashing::detail::MurmurHash3_x86_32<cudf::string_view>{0};
+
+  auto const end        = d_str.data() + d_str.size_bytes();
+  auto const warp_count = (d_str.size_bytes() / cudf::detail::warp_size) + 1;
+  auto const lane_idx   = idx % cudf::detail::warp_size;
+
+  auto d_hashes = d_results + ngram_offset;
+  auto itr      = d_str.data() + lane_idx;
+  for (auto i = 0; i < warp_count; ++i) {
+    cudf::hash_value_type hash = 0;
+    if (itr < end && cudf::strings::detail::is_begin_utf8_char(*itr)) {
+      // resolve ngram substring
+      auto const sub_str =
+        cudf::string_view(itr, static_cast<cudf::size_type>(thrust::distance(itr, end)));
+      auto const [bytes, left] =
+        cudf::strings::detail::bytes_to_character_position(sub_str, ngrams);
+      if (left == 0) { hash = hasher(cudf::string_view(itr, bytes)); }
+    }
+    hvs[threadIdx.x] = hash;  // store hash into shared memory
+    __syncwarp();
+    if (lane_idx == 0) {
+      // copy valid hash values into d_hashes
+      auto const hashes = &hvs[threadIdx.x];
+      d_hashes          = thrust::copy_if(
+        thrust::seq, hashes, hashes + cudf::detail::warp_size, d_hashes, [](auto h) {
+          return h != 0;
+        });
     }
+    __syncwarp();
+    itr += cudf::detail::warp_size;
   }
-};
+}
 }  // namespace
 
 std::unique_ptr<cudf::column> hash_character_ngrams(cudf::strings_column_view const& input,
@@ -291,18 +361,16 @@ std::unique_ptr<cudf::column> hash_character_ngrams(cudf::strings_column_view co
   if (input.is_empty()) { return cudf::make_empty_column(output_type); }
 
   auto const d_strings = cudf::column_device_view::create(input.parent(), stream);
+  auto const grid      = cudf::detail::grid_1d(
+    static_cast<cudf::thread_index_type>(input.size()) * cudf::detail::warp_size, block_size);
 
   // build offsets column by computing the number of ngrams per string
-  auto sizes_itr = cudf::detail::make_counting_transform_iterator(
-    0,
-    cuda::proclaim_return_type<cudf::size_type>(
-      [d_strings = *d_strings, ngrams] __device__(auto idx) {
-        if (d_strings.is_null(idx)) { return 0; }
-        auto const length = d_strings.element<cudf::string_view>(idx).length();
-        return std::max(0, static_cast<cudf::size_type>(length + 1 - ngrams));
-      }));
-  auto [offsets, total_ngrams] =
-    cudf::detail::make_offsets_child_column(sizes_itr, sizes_itr + input.size(), stream, mr);
+  auto [offsets, total_ngrams] = [&] {
+    auto counts = rmm::device_uvector<cudf::size_type>(input.size(), stream);
+    count_char_ngrams_kernel<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
+      *d_strings, ngrams, counts.data());
+    return cudf::detail::make_offsets_child_column(counts.begin(), counts.end(), stream, mr);
+  }();
   auto d_offsets = offsets->view().data<cudf::size_type>();
 
   CUDF_EXPECTS(total_ngrams > 0,
@@ -313,11 +381,8 @@ std::unique_ptr<cudf::column> hash_character_ngrams(cudf::strings_column_view co
     cudf::make_numeric_column(output_type, total_ngrams, cudf::mask_state::UNALLOCATED, stream, mr);
   auto d_hashes = hashes->mutable_view().data<cudf::hash_value_type>();
 
-  character_ngram_hash_fn generator{*d_strings, ngrams, d_offsets, d_hashes};
-  thrust::for_each_n(rmm::exec_policy(stream),
-                     thrust::counting_iterator<cudf::size_type>(0),
-                     input.size(),
-                     generator);
+  character_ngram_hash_kernel<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
+    *d_strings, ngrams, d_offsets, d_hashes);
 
   return make_lists_column(
     input.size(), std::move(offsets), std::move(hashes), 0, rmm::device_buffer{}, stream, mr);

From a8a367009ff64478d78eb916fc9dc65b77b89aac Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Thu, 1 Aug 2024 16:45:01 -0700
Subject: [PATCH 625/842] Move exception handler into pylibcudf from cudf
 (#16468)

PR to help prepare for the splitting out of pylibcudf.

Authors:
  - Thomas Li (https://github.com/lithomas1)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16468
---
 docs/cudf/source/developer_guide/pylibcudf.md |  2 +-
 .../{ => pylibcudf}/exception_handler.pxd     |  6 +--
 .../cudf/_lib/pylibcudf/libcudf/binaryop.pxd  | 12 ++---
 .../cudf/_lib/pylibcudf/libcudf/copying.pxd   | 44 +++++++++----------
 .../_lib/pylibcudf/libcudf/lists/contains.pxd | 12 ++---
 5 files changed, 38 insertions(+), 38 deletions(-)
 rename python/cudf/cudf/_lib/{ => pylibcudf}/exception_handler.pxd (95%)

diff --git a/docs/cudf/source/developer_guide/pylibcudf.md b/docs/cudf/source/developer_guide/pylibcudf.md
index 0b881b2b057..2ae545a4955 100644
--- a/docs/cudf/source/developer_guide/pylibcudf.md
+++ b/docs/cudf/source/developer_guide/pylibcudf.md
@@ -149,7 +149,7 @@ Some guidelines on what should be tested:
   - Exception: In special cases where constructing suitable large tests is difficult in C++ (such as creating suitable input data for I/O testing), tests may be added to pylibcudf instead.
 - Nullable data should always be tested.
 - Expected exceptions should be tested. Tests should be written from the user's perspective in mind, and if the API is not currently throwing the appropriate exception it should be updated.
-  - Important note: If the exception should be produced by libcudf, the underlying libcudf API should be updated to throw the desired exception in C++. Such changes may require consultation with libcudf devs in nontrivial cases. [This issue](https://github.com/rapidsai/cudf/issues/12885) provides an overview and an indication of acceptable exception types that should cover most use cases. In rare cases a new C++ exception may need to be introduced in [`error.hpp`](https://github.com/rapidsai/cudf/blob/branch-24.04/cpp/include/cudf/utilities/error.hpp). If so, this exception will also need to be mapped to a suitable Python exception in [`exception_handler.pxd`](https://github.com/rapidsai/cudf/blob/branch-24.04/python/cudf/cudf/_lib/exception_handler.pxd).
+  - Important note: If the exception should be produced by libcudf, the underlying libcudf API should be updated to throw the desired exception in C++. Such changes may require consultation with libcudf devs in nontrivial cases. [This issue](https://github.com/rapidsai/cudf/issues/12885) provides an overview and an indication of acceptable exception types that should cover most use cases. In rare cases a new C++ exception may need to be introduced in [`error.hpp`](https://github.com/rapidsai/cudf/blob/branch-24.04/cpp/include/cudf/utilities/error.hpp). If so, this exception will also need to be mapped to a suitable Python exception in `exception_handler.pxd`.
 
 Some guidelines on how best to use pytests.
 - By default, fixtures producing device data containers should be of module scope and treated as immutable by tests. Allocating data on the GPU is expensive and slows tests. Almost all pylibcudf operations are out of place operations, so module-scoped fixtures should not typically be problematic to work with. Session-scoped fixtures would also work, but they are harder to reason about since they live in a different module, and if they need to change for any reason they could affect an arbitrarily large number of tests. Module scope is a good balance.
diff --git a/python/cudf/cudf/_lib/exception_handler.pxd b/python/cudf/cudf/_lib/pylibcudf/exception_handler.pxd
similarity index 95%
rename from python/cudf/cudf/_lib/exception_handler.pxd
rename to python/cudf/cudf/_lib/pylibcudf/exception_handler.pxd
index 4337d8db285..6abcd0a1c0f 100644
--- a/python/cudf/cudf/_lib/exception_handler.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/exception_handler.pxd
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 
 
 # See
@@ -24,7 +24,7 @@ cdef extern from *:
      * Since this function interoperates with Python's exception state, it
      * does not throw any C++ exceptions.
      */
-    void cudf_exception_handler()
+    void libcudf_exception_handler()
     {
       // Catch a handful of different errors here and turn them into the
       // equivalent Python errors.
@@ -66,4 +66,4 @@ cdef extern from *:
 
     }  // anonymous namespace
     """
-    cdef void cudf_exception_handler()
+    cdef void libcudf_exception_handler()
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/binaryop.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/binaryop.pxd
index b34fea6a775..78da5980db4 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/binaryop.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/binaryop.pxd
@@ -5,7 +5,7 @@ from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 
-from cudf._lib.exception_handler cimport cudf_exception_handler
+from cudf._lib.pylibcudf.exception_handler cimport libcudf_exception_handler
 from cudf._lib.pylibcudf.libcudf.column.column cimport column
 from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
 from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
@@ -55,28 +55,28 @@ cdef extern from "cudf/binaryop.hpp" namespace "cudf" nogil:
         const column_view& rhs,
         binary_operator op,
         data_type output_type
-    ) except +cudf_exception_handler
+    ) except +libcudf_exception_handler
 
     cdef unique_ptr[column] binary_operation (
         const column_view& lhs,
         const scalar& rhs,
         binary_operator op,
         data_type output_type
-    ) except +cudf_exception_handler
+    ) except +libcudf_exception_handler
 
     cdef unique_ptr[column] binary_operation (
         const column_view& lhs,
         const column_view& rhs,
         binary_operator op,
         data_type output_type
-    ) except +cudf_exception_handler
+    ) except +libcudf_exception_handler
 
     cdef unique_ptr[column] binary_operation (
         const column_view& lhs,
         const column_view& rhs,
         const string& op,
         data_type output_type
-    ) except +cudf_exception_handler
+    ) except +libcudf_exception_handler
 
 cdef extern from "cudf/binaryop.hpp" namespace "cudf::binops" nogil:
     cdef bool is_supported_operation(
@@ -84,4 +84,4 @@ cdef extern from "cudf/binaryop.hpp" namespace "cudf::binops" nogil:
         data_type lhs_type,
         data_type rhs_type,
         binary_operator op
-    ) except +cudf_exception_handler
+    ) except +libcudf_exception_handler
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/copying.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/copying.pxd
index 001489d69bf..af3a16ad01b 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/copying.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/copying.pxd
@@ -8,7 +8,7 @@ from libcpp.vector cimport vector
 
 from rmm._lib.device_buffer cimport device_buffer
 
-from cudf._lib.exception_handler cimport cudf_exception_handler
+from cudf._lib.pylibcudf.exception_handler cimport libcudf_exception_handler
 from cudf._lib.pylibcudf.libcudf.column.column cimport column
 from cudf._lib.pylibcudf.libcudf.column.column_view cimport (
     column_view,
@@ -30,25 +30,25 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil:
         const table_view& source_table,
         const column_view& gather_map,
         out_of_bounds_policy policy
-    ) except +cudf_exception_handler
+    ) except +libcudf_exception_handler
 
     cdef unique_ptr[column] shift(
         const column_view& input,
         size_type offset,
         const scalar& fill_values
-    ) except +cudf_exception_handler
+    ) except +libcudf_exception_handler
 
     cdef unique_ptr[table] scatter (
         const table_view& source_table,
         const column_view& scatter_map,
         const table_view& target_table,
-    ) except +cudf_exception_handler
+    ) except +libcudf_exception_handler
 
     cdef unique_ptr[table] scatter (
         const vector[reference_wrapper[constscalar]]& source_scalars,
         const column_view& indices,
         const table_view& target,
-    ) except +cudf_exception_handler
+    ) except +libcudf_exception_handler
 
     cpdef enum class mask_allocation_policy(int32_t):
         NEVER
@@ -57,22 +57,22 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil:
 
     cdef unique_ptr[column] empty_like (
         const column_view& input_column
-    ) except +cudf_exception_handler
+    ) except +libcudf_exception_handler
 
     cdef unique_ptr[column] allocate_like (
         const column_view& input_column,
         mask_allocation_policy policy
-    ) except +cudf_exception_handler
+    ) except +libcudf_exception_handler
 
     cdef unique_ptr[column] allocate_like (
         const column_view& input_column,
         size_type size,
         mask_allocation_policy policy
-    ) except +cudf_exception_handler
+    ) except +libcudf_exception_handler
 
     cdef unique_ptr[table] empty_like (
         const table_view& input_table
-    ) except +cudf_exception_handler
+    ) except +libcudf_exception_handler
 
     cdef void copy_range_in_place (
         const column_view& input_column,
@@ -80,7 +80,7 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil:
         size_type input_begin,
         size_type input_end,
         size_type target_begin
-    ) except +cudf_exception_handler
+    ) except +libcudf_exception_handler
 
     cdef unique_ptr[column] copy_range (
         const column_view& input_column,
@@ -88,68 +88,68 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil:
         size_type input_begin,
         size_type input_end,
         size_type target_begin
-    ) except +cudf_exception_handler
+    ) except +libcudf_exception_handler
 
     cdef vector[column_view] slice (
         const column_view& input_column,
         vector[size_type] indices
-    ) except +cudf_exception_handler
+    ) except +libcudf_exception_handler
 
     cdef vector[table_view] slice (
         const table_view& input_table,
         vector[size_type] indices
-    ) except +cudf_exception_handler
+    ) except +libcudf_exception_handler
 
     cdef vector[column_view] split (
         const column_view& input_column,
         vector[size_type] splits
-    ) except +cudf_exception_handler
+    ) except +libcudf_exception_handler
 
     cdef vector[table_view] split (
         const table_view& input_table,
         vector[size_type] splits
-    ) except +cudf_exception_handler
+    ) except +libcudf_exception_handler
 
     cdef unique_ptr[column] copy_if_else (
         const column_view& lhs,
         const column_view& rhs,
         const column_view& boolean_mask
-    ) except +cudf_exception_handler
+    ) except +libcudf_exception_handler
 
     cdef unique_ptr[column] copy_if_else (
         const scalar& lhs,
         const column_view& rhs,
         const column_view& boolean_mask
-    ) except +cudf_exception_handler
+    ) except +libcudf_exception_handler
 
     cdef unique_ptr[column] copy_if_else (
         const column_view& lhs,
         const scalar& rhs,
         const column_view boolean_mask
-    ) except +cudf_exception_handler
+    ) except +libcudf_exception_handler
 
     cdef unique_ptr[column] copy_if_else (
         const scalar& lhs,
         const scalar& rhs,
         const column_view boolean_mask
-    ) except +cudf_exception_handler
+    ) except +libcudf_exception_handler
 
     cdef unique_ptr[table] boolean_mask_scatter (
         const table_view& input,
         const table_view& target,
         const column_view& boolean_mask
-    ) except +cudf_exception_handler
+    ) except +libcudf_exception_handler
 
     cdef unique_ptr[table] boolean_mask_scatter (
         const vector[reference_wrapper[constscalar]]& input,
         const table_view& target,
         const column_view& boolean_mask
-    ) except +cudf_exception_handler
+    ) except +libcudf_exception_handler
 
     cdef unique_ptr[scalar] get_element (
         const column_view& input,
         size_type index
-    ) except +cudf_exception_handler
+    ) except +libcudf_exception_handler
 
     cpdef enum class sample_with_replacement(bool):
         FALSE
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/contains.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/contains.pxd
index 82aed7d70a0..40bb2e78970 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/contains.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/contains.pxd
@@ -3,7 +3,7 @@
 from libc.stdint cimport int32_t
 from libcpp.memory cimport unique_ptr
 
-from cudf._lib.exception_handler cimport cudf_exception_handler
+from cudf._lib.pylibcudf.exception_handler cimport libcudf_exception_handler
 from cudf._lib.pylibcudf.libcudf.column.column cimport column
 from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
 from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport (
@@ -21,25 +21,25 @@ cdef extern from "cudf/lists/contains.hpp" namespace "cudf::lists" nogil:
     cdef unique_ptr[column] contains(
         const lists_column_view& lists,
         const scalar& search_key,
-    ) except +cudf_exception_handler
+    ) except +libcudf_exception_handler
 
     cdef unique_ptr[column] contains(
         const lists_column_view& lists,
         const column_view& search_keys,
-    ) except +cudf_exception_handler
+    ) except +libcudf_exception_handler
 
     cdef unique_ptr[column] contains_nulls(
         const lists_column_view& lists,
-    ) except +cudf_exception_handler
+    ) except +libcudf_exception_handler
 
     cdef unique_ptr[column] index_of(
         const lists_column_view& lists,
         const scalar& search_key,
         duplicate_find_option find_option,
-    ) except +cudf_exception_handler
+    ) except +libcudf_exception_handler
 
     cdef unique_ptr[column] index_of(
         const lists_column_view& lists,
         const column_view& search_keys,
         duplicate_find_option find_option,
-    ) except +cudf_exception_handler
+    ) except +libcudf_exception_handler

From cc19d8a7b424abbc87f7767e3bc60c54390dc9e3 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 2 Aug 2024 09:34:27 -1000
Subject: [PATCH 626/842] Use explicit construction of column subclass instead
 of `build_column` when type is known (#16470)

When we need to construct a column with a specific type, we do not need to go through the indirection of `build_column`, which matches a column subclass to a passed type, and instead construct directly from the class instead

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Thomas Li (https://github.com/lithomas1)

URL: https://github.com/rapidsai/cudf/pull/16470
---
 python/cudf/cudf/core/_internals/where.py   |  2 +-
 python/cudf/cudf/core/column/categorical.py | 46 +++++++++++++--------
 python/cudf/cudf/core/column/column.py      |  2 +-
 python/cudf/cudf/core/column/datetime.py    | 10 ++---
 python/cudf/cudf/core/column/numerical.py   | 43 ++++++++-----------
 python/cudf/cudf/core/column/string.py      |  6 +--
 python/cudf/cudf/core/column/timedelta.py   |  8 ++--
 python/cudf/cudf/core/dataframe.py          |  4 +-
 python/cudf/cudf/core/index.py              |  8 ++--
 9 files changed, 64 insertions(+), 65 deletions(-)

diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py
index 18ab32d2c9e..9f36499586b 100644
--- a/python/cudf/cudf/core/_internals/where.py
+++ b/python/cudf/cudf/core/_internals/where.py
@@ -110,7 +110,7 @@ def _make_categorical_like(result, column):
     if isinstance(column, cudf.core.column.CategoricalColumn):
         result = cudf.core.column.build_categorical_column(
             categories=column.categories,
-            codes=cudf.core.column.build_column(
+            codes=cudf.core.column.NumericalColumn(
                 result.base_data, dtype=result.dtype
             ),
             mask=result.base_mask,
diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index 9433a91b9c6..55bfae30470 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -572,13 +572,10 @@ def children(self) -> tuple[NumericalColumn]:
             codes_column = self.base_children[0]
             start = self.offset * codes_column.dtype.itemsize
             end = start + self.size * codes_column.dtype.itemsize
-            codes_column = cast(
-                cudf.core.column.NumericalColumn,
-                column.build_column(
-                    data=codes_column.base_data[start:end],
-                    dtype=codes_column.dtype,
-                    size=self.size,
-                ),
+            codes_column = cudf.core.column.NumericalColumn(
+                data=codes_column.base_data[start:end],
+                dtype=codes_column.dtype,
+                size=self.size,
             )
             self._children = (codes_column,)
         return self._children
@@ -660,8 +657,9 @@ def slice(self, start: int, stop: int, stride: int | None = None) -> Self:
             Self,
             cudf.core.column.build_categorical_column(
                 categories=self.categories,
-                codes=cudf.core.column.build_column(
-                    codes.base_data, dtype=codes.dtype
+                codes=cudf.core.column.NumericalColumn(
+                    codes.base_data,  # type: ignore[arg-type]
+                    dtype=codes.dtype,
                 ),
                 mask=codes.base_mask,
                 ordered=self.ordered,
@@ -734,7 +732,10 @@ def sort_values(
         codes = self.codes.sort_values(ascending, na_position)
         col = column.build_categorical_column(
             categories=self.dtype.categories._values,
-            codes=column.build_column(codes.base_data, dtype=codes.dtype),
+            codes=cudf.core.column.NumericalColumn(
+                codes.base_data,  # type: ignore[arg-type]
+                dtype=codes.dtype,
+            ),
             mask=codes.base_mask,
             size=codes.size,
             ordered=self.dtype.ordered,
@@ -842,7 +843,10 @@ def unique(self) -> CategoricalColumn:
         codes = self.codes.unique()
         return column.build_categorical_column(
             categories=self.categories,
-            codes=column.build_column(codes.base_data, dtype=codes.dtype),
+            codes=cudf.core.column.NumericalColumn(
+                codes.base_data,  # type: ignore[arg-type]
+                dtype=codes.dtype,
+            ),
             mask=codes.base_mask,
             offset=codes.offset,
             size=codes.size,
@@ -980,7 +984,9 @@ def find_and_replace(
 
         result = column.build_categorical_column(
             categories=new_cats["cats"],
-            codes=column.build_column(output.base_data, dtype=output.dtype),
+            codes=cudf.core.column.NumericalColumn(
+                output.base_data, dtype=output.dtype
+            ),
             mask=output.base_mask,
             offset=output.offset,
             size=output.size,
@@ -1176,8 +1182,9 @@ def _concat(
 
         return column.build_categorical_column(
             categories=column.as_column(cats),
-            codes=column.build_column(
-                codes_col.base_data, dtype=codes_col.dtype
+            codes=cudf.core.column.NumericalColumn(
+                codes_col.base_data,  # type: ignore[arg-type]
+                dtype=codes_col.dtype,
             ),
             mask=codes_col.base_mask,
             size=codes_col.size,
@@ -1190,8 +1197,9 @@ def _with_type_metadata(
         if isinstance(dtype, CategoricalDtype):
             return column.build_categorical_column(
                 categories=dtype.categories._values,
-                codes=column.build_column(
-                    self.codes.base_data, dtype=self.codes.dtype
+                codes=cudf.core.column.NumericalColumn(
+                    self.codes.base_data,  # type: ignore[arg-type]
+                    dtype=self.codes.dtype,
                 ),
                 mask=self.codes.base_mask,
                 ordered=dtype.ordered,
@@ -1339,7 +1347,7 @@ def _set_categories(
             Self,
             column.build_categorical_column(
                 categories=new_cats,
-                codes=column.build_column(
+                codes=cudf.core.column.NumericalColumn(
                     new_codes.base_data, dtype=new_codes.dtype
                 ),
                 mask=new_codes.base_mask,
@@ -1472,7 +1480,9 @@ def pandas_categorical_as_column(
 
     return column.build_categorical_column(
         categories=categorical.categories,
-        codes=column.build_column(codes.base_data, codes.dtype),
+        codes=cudf.core.column.NumericalColumn(
+            codes.base_data, dtype=codes.dtype
+        ),
         size=codes.size,
         mask=mask,
         ordered=categorical.ordered,
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 7e0d8ced595..a7d2cb441dd 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -1506,7 +1506,7 @@ def column_empty(
     elif isinstance(dtype, CategoricalDtype):
         data = None
         children = (
-            build_column(
+            cudf.core.column.NumericalColumn(
                 data=as_buffer(
                     rmm.DeviceBuffer(
                         size=row_count
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index 81fbb914842..ce67ce81e6b 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -473,15 +473,15 @@ def as_timedelta_column(self, dtype: Dtype) -> None:  # type: ignore[override]
 
     def as_numerical_column(
         self, dtype: Dtype
-    ) -> "cudf.core.column.NumericalColumn":
-        col = column.build_column(
-            data=self.base_data,
-            dtype=np.int64,
+    ) -> cudf.core.column.NumericalColumn:
+        col = cudf.core.column.NumericalColumn(
+            data=self.base_data,  # type: ignore[arg-type]
+            dtype=np.dtype(np.int64),
             mask=self.base_mask,
             offset=self.offset,
             size=self.size,
         )
-        return cast("cudf.core.column.NumericalColumn", col.astype(dtype))
+        return cast(cudf.core.column.NumericalColumn, col.astype(dtype))
 
     def strftime(self, format: str) -> cudf.core.column.StringColumn:
         if len(self) == 0:
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index f9404eb3b40..c326a10c844 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -13,13 +13,7 @@
 from cudf import _lib as libcudf
 from cudf._lib import pylibcudf
 from cudf.api.types import is_integer, is_scalar
-from cudf.core.column import (
-    ColumnBase,
-    as_column,
-    build_column,
-    column,
-    string,
-)
+from cudf.core.column import ColumnBase, as_column, column, string
 from cudf.core.dtypes import CategoricalDtype
 from cudf.core.mixins import BinaryOperand
 from cudf.errors import MixedTypeError
@@ -338,29 +332,23 @@ def as_string_column(self) -> cudf.core.column.StringColumn:
     def as_datetime_column(
         self, dtype: Dtype
     ) -> cudf.core.column.DatetimeColumn:
-        return cast(
-            "cudf.core.column.DatetimeColumn",
-            build_column(
-                data=self.astype("int64").base_data,
-                dtype=dtype,
-                mask=self.base_mask,
-                offset=self.offset,
-                size=self.size,
-            ),
+        return cudf.core.column.DatetimeColumn(
+            data=self.astype("int64").base_data,  # type: ignore[arg-type]
+            dtype=dtype,
+            mask=self.base_mask,
+            offset=self.offset,
+            size=self.size,
         )
 
     def as_timedelta_column(
         self, dtype: Dtype
     ) -> cudf.core.column.TimeDeltaColumn:
-        return cast(
-            "cudf.core.column.TimeDeltaColumn",
-            build_column(
-                data=self.astype("int64").base_data,
-                dtype=dtype,
-                mask=self.base_mask,
-                offset=self.offset,
-                size=self.size,
-            ),
+        return cudf.core.column.TimeDeltaColumn(
+            data=self.astype("int64").base_data,  # type: ignore[arg-type]
+            dtype=dtype,
+            mask=self.base_mask,
+            offset=self.offset,
+            size=self.size,
         )
 
     def as_decimal_column(
@@ -637,7 +625,10 @@ def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase:
         if isinstance(dtype, CategoricalDtype):
             return column.build_categorical_column(
                 categories=dtype.categories._values,
-                codes=build_column(self.base_data, dtype=self.dtype),
+                codes=cudf.core.column.NumericalColumn(
+                    self.base_data,  # type: ignore[arg-type]
+                    dtype=self.dtype,
+                ),
                 mask=self.base_mask,
                 ordered=dtype.ordered,
                 size=self.size,
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index ec95c50f455..b422ff86b17 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -5934,9 +5934,9 @@ def view(self, dtype) -> "cudf.core.column.ColumnBase":
 
         n_bytes_to_view = str_end_byte_offset - str_byte_offset
 
-        to_view = column.build_column(
-            self.base_data,
-            dtype=cudf.api.types.dtype("int8"),
+        to_view = cudf.core.column.NumericalColumn(
+            self.base_data,  # type: ignore[arg-type]
+            dtype=np.dtype(np.int8),
             offset=str_byte_offset,
             size=n_bytes_to_view,
         )
diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py
index 47c8ed6fd95..ba0dc4779bb 100644
--- a/python/cudf/cudf/core/column/timedelta.py
+++ b/python/cudf/cudf/core/column/timedelta.py
@@ -265,10 +265,10 @@ def round(self, freq: str) -> ColumnBase:
 
     def as_numerical_column(
         self, dtype: Dtype
-    ) -> "cudf.core.column.NumericalColumn":
-        col = column.build_column(
-            data=self.base_data,
-            dtype=np.int64,
+    ) -> cudf.core.column.NumericalColumn:
+        col = cudf.core.column.NumericalColumn(
+            data=self.base_data,  # type: ignore[arg-type]
+            dtype=np.dtype(np.int64),
             mask=self.base_mask,
             offset=self.offset,
             size=self.size,
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 52dc29974bf..865d2706ca3 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -46,10 +46,10 @@
 from cudf.core.column import (
     CategoricalColumn,
     ColumnBase,
+    NumericalColumn,
     StructColumn,
     as_column,
     build_categorical_column,
-    build_column,
     column_empty,
     concat_columns,
 )
@@ -8543,7 +8543,7 @@ def _reassign_categories(categories, cols, col_idxs):
         if idx in categories:
             cols[name] = build_categorical_column(
                 categories=categories[idx],
-                codes=build_column(
+                codes=NumericalColumn(
                     cols[name].base_data, dtype=cols[name].dtype
                 ),
                 mask=cols[name].base_mask,
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index cd879d559cd..0d29ef07e7d 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -2434,12 +2434,10 @@ def to_pandas(
         return result
 
     @_performance_tracking
-    def _get_dt_field(self, field):
+    def _get_dt_field(self, field: str) -> Index:
+        """Return an Index of a numerical component of the DatetimeIndex."""
         out_column = self._values.get_dt_field(field)
-        # column.column_empty_like always returns a Column object
-        # but we need a NumericalColumn for Index..
-        # how should this be handled?
-        out_column = column.build_column(
+        out_column = NumericalColumn(
             data=out_column.base_data,
             dtype=out_column.dtype,
             mask=out_column.base_mask,

From e0d1ac1efa9153f0a084bd72b7d4c300f9640196 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Fri, 2 Aug 2024 17:44:45 -0500
Subject: [PATCH 627/842] Fix typo in dispatch_row_equal. (#16473)

This PR fixes a small typo in the C++ code.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/16473
---
 cpp/src/stream_compaction/distinct.cu | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cpp/src/stream_compaction/distinct.cu b/cpp/src/stream_compaction/distinct.cu
index e5cf29f3ebf..e2c5aba6802 100644
--- a/cpp/src/stream_compaction/distinct.cu
+++ b/cpp/src/stream_compaction/distinct.cu
@@ -51,7 +51,7 @@ namespace {
  * @param func The input functor to invoke
  */
 template <bool HasNested, typename Func>
-rmm::device_uvector<cudf::size_type> dipatch_row_equal(
+rmm::device_uvector<cudf::size_type> dispatch_row_equal(
   null_equality compare_nulls,
   nan_equality compare_nans,
   bool has_nulls,
@@ -110,9 +110,9 @@ rmm::device_uvector<size_type> distinct_indices(table_view const& input,
   };
 
   if (cudf::detail::has_nested_columns(input)) {
-    return dipatch_row_equal<true>(nulls_equal, nans_equal, has_nulls, row_equal, helper_func);
+    return dispatch_row_equal<true>(nulls_equal, nans_equal, has_nulls, row_equal, helper_func);
   } else {
-    return dipatch_row_equal<false>(nulls_equal, nans_equal, has_nulls, row_equal, helper_func);
+    return dispatch_row_equal<false>(nulls_equal, nans_equal, has_nulls, row_equal, helper_func);
   }
 }
 

From af57286536fc21b47b80e45be222773b751600c9 Mon Sep 17 00:00:00 2001
From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com>
Date: Mon, 5 Aug 2024 07:16:34 -0500
Subject: [PATCH 628/842] Add missing pylibcudf strings docs (#16471)

Noticed a few missing pylibcudf string docs that were missed, added them here.

Authors:
  - https://github.com/brandon-b-miller
  - Thomas Li (https://github.com/lithomas1)

Approvers:
  - Thomas Li (https://github.com/lithomas1)

URL: https://github.com/rapidsai/cudf/pull/16471
---
 .../api_docs/pylibcudf/strings/capitalize.rst |  6 +++
 .../api_docs/pylibcudf/strings/char_types.rst |  6 +++
 .../api_docs/pylibcudf/strings/find.rst       |  6 +++
 .../api_docs/pylibcudf/strings/index.rst      |  5 ++
 .../pylibcudf/strings/regex_flags.rst         |  6 +++
 .../pylibcudf/strings/regex_program.rst       |  6 +++
 .../_lib/pylibcudf/strings/capitalize.pyx     | 48 ++++++++++++++++++-
 .../_lib/pylibcudf/strings/regex_program.pyx  | 19 ++++++++
 8 files changed, 101 insertions(+), 1 deletion(-)
 create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/capitalize.rst
 create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/char_types.rst
 create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find.rst
 create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/regex_flags.rst
 create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/strings/regex_program.rst

diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/capitalize.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/capitalize.rst
new file mode 100644
index 00000000000..578b2b75e37
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/capitalize.rst
@@ -0,0 +1,6 @@
+==========
+capitalize
+==========
+
+.. automodule:: cudf._lib.pylibcudf.strings.capitalize
+   :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/char_types.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/char_types.rst
new file mode 100644
index 00000000000..577ec34915b
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/char_types.rst
@@ -0,0 +1,6 @@
+==========
+char_types
+==========
+
+.. automodule:: cudf._lib.pylibcudf.strings.char_types
+   :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find.rst
new file mode 100644
index 00000000000..61d4079e9a3
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find.rst
@@ -0,0 +1,6 @@
+====
+find
+====
+
+.. automodule:: cudf._lib.pylibcudf.strings.find
+   :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst
index cecf1ccc9bb..462a756a092 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/index.rst
@@ -4,6 +4,11 @@ strings
 .. toctree::
     :maxdepth: 1
 
+    capitalize
+    char_types
     contains
+    find
+    regex_flags
+    regex_program
     replace
     slice
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/regex_flags.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/regex_flags.rst
new file mode 100644
index 00000000000..0126b6a3706
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/regex_flags.rst
@@ -0,0 +1,6 @@
+===========
+regex_flags
+===========
+
+.. automodule:: cudf._lib.pylibcudf.strings.regex_flags
+   :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/regex_program.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/regex_program.rst
new file mode 100644
index 00000000000..2f398186d51
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/regex_program.rst
@@ -0,0 +1,6 @@
+=============
+regex_program
+=============
+
+.. automodule:: cudf._lib.pylibcudf.strings.regex_program
+   :members:
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx
index d3f79088018..ccf84d25572 100644
--- a/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx
@@ -22,7 +22,22 @@ cpdef Column capitalize(
     # TODO: default scalar values
     # https://github.com/rapidsai/cudf/issues/15505
 ):
-
+    """Returns a column of capitalized strings.
+
+    For details, see :cpp:func:`cudf::strings::capitalize`.
+
+    Parameters
+    ----------
+    input : Column
+        String column
+    delimiters : Scalar, default None
+        Characters for identifying words to capitalize
+
+    Returns
+    -------
+    pylibcudf.Column
+        Column of strings capitalized from the input column
+    """
     cdef unique_ptr[column] c_result
 
     if delimiters is None:
@@ -47,6 +62,23 @@ cpdef Column title(
     Column input,
     string_character_types sequence_type=string_character_types.ALPHA
 ):
+    """Modifies first character of each word to upper-case and lower-cases
+    the rest.
+
+    For details, see :cpp:func:`cudf::strings::title`.
+
+    Parameters
+    ----------
+    input : Column
+        String column
+    sequence_type : string_character_types, default string_character_types.ALPHA
+        The character type that is used when identifying words
+
+    Returns
+    -------
+    pylibcudf.Column
+        Column of titled strings
+    """
     cdef unique_ptr[column] c_result
     with nogil:
         c_result = cpp_capitalize.title(input.view(), sequence_type)
@@ -55,6 +87,20 @@ cpdef Column title(
 
 
 cpdef Column is_title(Column input):
+    """Checks if the strings in the input column are title formatted.
+
+    For details, see :cpp:func:`cudf::strings::is_title`.
+
+    Parameters
+    ----------
+    input : Column
+        String column
+
+    Returns
+    -------
+    pylibcudf.Column
+        Column of type BOOL8
+    """
     cdef unique_ptr[column] c_result
     with nogil:
         c_result = cpp_capitalize.is_title(input.view())
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pyx b/python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pyx
index d605b0aba02..5f0b8868452 100644
--- a/python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pyx
@@ -13,12 +13,31 @@ from cudf._lib.pylibcudf.strings.regex_flags cimport regex_flags
 
 
 cdef class RegexProgram:
+    """Regex program class.
 
+    This is the Cython representation of
+    :cpp:class:`cudf::strings::regex_program`.
+
+    Do not instantiate this class directly, use the `create` method.
+
+    """
     def __init__(self, *args, **kwargs):
         raise ValueError("Do not instantiate RegexProgram directly, use create")
 
     @staticmethod
     def create(str pattern, int flags):
+        """Create a program from a pattern.
+
+        For detils, see :cpp:func:`cudf::strings::regex_program::create`.
+
+        Parameters
+        ----------
+        pattern : str
+            Regex pattern
+        flags : Uniont[int, RegexFlags]
+            Regex flags for interpreting special characters in the pattern
+
+        """
         cdef unique_ptr[regex_program] c_prog
         cdef regex_flags c_flags
         cdef string c_pattern = pattern.encode()

From 837dfe51a2f4d0268d6976464eed637645f524ff Mon Sep 17 00:00:00 2001
From: Rahul Prabhu <100436830+sdrp713@users.noreply.github.com>
Date: Mon, 5 Aug 2024 14:14:41 -0700
Subject: [PATCH 629/842] Added batch memset to memset data and validity
 buffers in parquet reader (#16281)

Under some situations in the Parquet reader (particularly the case with tables containing many columns or deeply nested column) we burn a decent amount of time doing cudaMemset() operations on output buffers. A good amount of this overhead seems to stem from the fact that we're simply launching many tiny kernels. This PR adds a batched memset kernel that takes a list of device spans as a single input and does all the work under a single kernel launch. This PR addresses issue #15773

## Improvements
Using out performance cluster, improvements of 2.39% were shown on running the overall NDS queries
Additionally, benchmarks were added showing big improvements(around 20%) especially on fixed width data types which can be shown below

data_type | num_cols | cardinality | run_length | bytes_per_second_before_this_pr | bytes_per_second_after_this_pr | speedup
--- | --- | --- | --- | --- | --- | ---
INTEGRAL | 1000 | 0 | 1 | 36514934834 | 42756531566 | 1.170932709
INTEGRAL | 1000 | 1000 | 1 | 35364061247 | 39112512476 | 1.105996062
INTEGRAL | 1000 | 0 | 32 | 37349112510 | 39641370858 | 1.061373837
INTEGRAL | 1000 | 1000 | 32 | 39167079622 | 43740824957 | 1.116775245
FLOAT | 1000 | 0 | 1 | 51877322003 | 64083898838 | 1.235296973
FLOAT | 1000 | 1000 | 1 | 48983612272 | 58705522023 | 1.198472699
FLOAT | 1000 | 0 | 32 | 46544977658 | 53715018581 | 1.154045426
FLOAT | 1000 | 1000 | 32 | 54493432148 | 66617609904 | 1.22248879
DECIMAL | 1000 | 0 | 1 | 47616412888 | 57952310685 | 1.217065864
DECIMAL | 1000 | 1000 | 1 | 47166138095 | 54283772484 | 1.1509056
DECIMAL | 1000 | 0 | 32 | 45266163387 | 53770390830 | 1.18787162
DECIMAL | 1000 | 1000 | 32 | 52292176603 | 58847723569 | 1.125363819
TIMESTAMP | 1000 | 0 | 1 | 50245415328 | 60797982330 | 1.210020495
TIMESTAMP | 1000 | 1000 | 1 | 50300238706 | 60810368331 | 1.208947908
TIMESTAMP | 1000 | 0 | 32 | 55338354243 | 66786275739 | 1.206871376
TIMESTAMP | 1000 | 1000 | 32 | 55680028082 | 69029227374 | 1.23974843
DURATION | 1000 | 0 | 1 | 54680007758 | 66855201896 | 1.222662626
DURATION | 1000 | 1000 | 1 | 54305832171 | 66602436269 | 1.226432477
DURATION | 1000 | 0 | 32 | 60040760815 | 72663056969 | 1.210228784
DURATION | 1000 | 1000 | 32 | 60212221703 | 75646396131 | 1.256329595
STRING | 1000 | 0 | 1 | 29691707753 | 33388700976 | 1.12451265
STRING | 1000 | 1000 | 1 | 31411129876 | 35407241037 | 1.127219593
STRING | 1000 | 0 | 32 | 29680479388 | 33382478907 | 1.124728427
STRING | 1000 | 1000 | 32 | 35476213777 | 40478389269 | 1.141000827
LIST | 1000 | 0 | 1 | 6874253484 | 7370835717 | 1.072237987
LIST | 1000 | 1000 | 1 | 6763426009 | 7253762966 | 1.07249831
LIST | 1000 | 0 | 32 | 6981508808 | 7502741115 | 1.074658977
LIST | 1000 | 1000 | 32 | 6989374761 | 7506418252 | 1.073975643
STRUCT | 1000 | 0 | 1 | 2137525922 | 2189495762 | 1.024313081
STRUCT | 1000 | 1000 | 1 | 1057923939 | 1078475980 | 1.019426766
STRUCT | 1000 | 0 | 32 | 1637342446 | 1698913790 | 1.037604439
STRUCT | 1000 | 1000 | 32 | 1057587701 | 1082539399 | 1.02359303

Authors:
  - Rahul Prabhu (https://github.com/sdrp713)
  - Muhammad Haseeb (https://github.com/mhaseeb123)

Approvers:
  - https://github.com/nvdbaranec
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16281
---
 cpp/benchmarks/CMakeLists.txt                 |   5 +
 .../io/utilities/batched_memset_bench.cpp     | 101 ++++++++++++++++++
 cpp/include/cudf/io/detail/batched_memset.hpp |  82 ++++++++++++++
 cpp/src/io/parquet/reader_impl_preprocess.cu  |  29 ++++-
 cpp/src/io/utilities/column_buffer.cpp        |  29 +++--
 cpp/src/io/utilities/column_buffer.hpp        |  23 +++-
 cpp/tests/CMakeLists.txt                      |   1 +
 .../utilities_tests/batched_memset_tests.cu   |  97 +++++++++++++++++
 8 files changed, 353 insertions(+), 14 deletions(-)
 create mode 100644 cpp/benchmarks/io/utilities/batched_memset_bench.cpp
 create mode 100644 cpp/include/cudf/io/detail/batched_memset.hpp
 create mode 100644 cpp/tests/utilities_tests/batched_memset_tests.cu

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index ff431c7f260..7be456ddfba 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -353,6 +353,11 @@ ConfigureNVBench(JSON_READER_NVBENCH io/json/nested_json.cpp io/json/json_reader
 ConfigureNVBench(JSON_READER_OPTION_NVBENCH io/json/json_reader_option.cpp)
 ConfigureNVBench(JSON_WRITER_NVBENCH io/json/json_writer.cpp)
 
+# ##################################################################################################
+# * multi buffer memset benchmark
+# ----------------------------------------------------------------------
+ConfigureNVBench(BATCHED_MEMSET_BENCH io/utilities/batched_memset_bench.cpp)
+
 # ##################################################################################################
 # * io benchmark ---------------------------------------------------------------------
 ConfigureNVBench(MULTIBYTE_SPLIT_NVBENCH io/text/multibyte_split.cpp)
diff --git a/cpp/benchmarks/io/utilities/batched_memset_bench.cpp b/cpp/benchmarks/io/utilities/batched_memset_bench.cpp
new file mode 100644
index 00000000000..2905895a63b
--- /dev/null
+++ b/cpp/benchmarks/io/utilities/batched_memset_bench.cpp
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+#include <benchmarks/io/cuio_common.hpp>
+#include <benchmarks/io/nvbench_helpers.hpp>
+
+#include <cudf/io/parquet.hpp>
+#include <cudf/utilities/default_stream.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to
+// run on most GPUs, but large enough to allow highest throughput
+constexpr size_t data_size = 512 << 20;
+
+void parquet_read_common(cudf::size_type num_rows_to_read,
+                         cudf::size_type num_cols_to_read,
+                         cuio_source_sink_pair& source_sink,
+                         nvbench::state& state)
+{
+  cudf::io::parquet_reader_options read_opts =
+    cudf::io::parquet_reader_options::builder(source_sink.make_source_info());
+
+  auto mem_stats_logger = cudf::memory_stats_logger();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
+  state.exec(
+    nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) {
+      try_drop_l3_cache();
+
+      timer.start();
+      auto const result = cudf::io::read_parquet(read_opts);
+      timer.stop();
+
+      CUDF_EXPECTS(result.tbl->num_columns() == num_cols_to_read, "Unexpected number of columns");
+      CUDF_EXPECTS(result.tbl->num_rows() == num_rows_to_read, "Unexpected number of rows");
+    });
+
+  auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
+  state.add_element_count(static_cast<double>(data_size) / time, "bytes_per_second");
+  state.add_buffer_size(
+    mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
+  state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size");
+}
+
+template <data_type DataType>
+void bench_batched_memset(nvbench::state& state, nvbench::type_list<nvbench::enum_type<DataType>>)
+{
+  auto const d_type      = get_type_or_group(static_cast<int32_t>(DataType));
+  auto const num_cols    = static_cast<cudf::size_type>(state.get_int64("num_cols"));
+  auto const cardinality = static_cast<cudf::size_type>(state.get_int64("cardinality"));
+  auto const run_length  = static_cast<cudf::size_type>(state.get_int64("run_length"));
+  auto const source_type = retrieve_io_type_enum(state.get_string("io_type"));
+  auto const compression = cudf::io::compression_type::NONE;
+  cuio_source_sink_pair source_sink(source_type);
+  auto const tbl =
+    create_random_table(cycle_dtypes(d_type, num_cols),
+                        table_size_bytes{data_size},
+                        data_profile_builder().cardinality(cardinality).avg_run_length(run_length));
+  auto const view = tbl->view();
+
+  cudf::io::parquet_writer_options write_opts =
+    cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view)
+      .compression(compression);
+  cudf::io::write_parquet(write_opts);
+  auto const num_rows = view.num_rows();
+
+  parquet_read_common(num_rows, num_cols, source_sink, state);
+}
+
+using d_type_list = nvbench::enum_type_list<data_type::INTEGRAL,
+                                            data_type::FLOAT,
+                                            data_type::DECIMAL,
+                                            data_type::TIMESTAMP,
+                                            data_type::DURATION,
+                                            data_type::STRING,
+                                            data_type::LIST,
+                                            data_type::STRUCT>;
+
+NVBENCH_BENCH_TYPES(bench_batched_memset, NVBENCH_TYPE_AXES(d_type_list))
+  .set_name("batched_memset")
+  .set_type_axes_names({"data_type"})
+  .add_int64_axis("num_cols", {1000})
+  .add_string_axis("io_type", {"DEVICE_BUFFER"})
+  .set_min_samples(4)
+  .add_int64_axis("cardinality", {0, 1000})
+  .add_int64_axis("run_length", {1, 32});
diff --git a/cpp/include/cudf/io/detail/batched_memset.hpp b/cpp/include/cudf/io/detail/batched_memset.hpp
new file mode 100644
index 00000000000..d0922cc64ee
--- /dev/null
+++ b/cpp/include/cudf/io/detail/batched_memset.hpp
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/detail/iterator.cuh>
+#include <cudf/detail/utilities/vector_factories.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_buffer.hpp>
+#include <rmm/resource_ref.hpp>
+
+#include <cub/device/device_copy.cuh>
+#include <cuda/functional>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/transform.h>
+
+namespace CUDF_EXPORT cudf {
+namespace io::detail {
+
+/**
+ * @brief A helper function that takes in a vector of device spans and memsets them to the
+ * value provided using batches sent to the GPU.
+ *
+ * @param bufs Vector with device spans of data
+ * @param value Value to memset all device spans to
+ * @param _stream Stream used for device memory operations and kernel launches
+ *
+ * @return The data in device spans all set to value
+ */
+template <typename T>
+void batched_memset(std::vector<cudf::device_span<T>> const& bufs,
+                    T const value,
+                    rmm::cuda_stream_view stream)
+{
+  // define task and bytes parameters
+  auto const num_bufs = bufs.size();
+
+  // copy bufs into device memory and then get sizes
+  auto gpu_bufs =
+    cudf::detail::make_device_uvector_async(bufs, stream, rmm::mr::get_current_device_resource());
+
+  // get a vector with the sizes of all buffers
+  auto sizes = cudf::detail::make_counting_transform_iterator(
+    static_cast<std::size_t>(0),
+    cuda::proclaim_return_type<std::size_t>(
+      [gpu_bufs = gpu_bufs.data()] __device__(std::size_t i) { return gpu_bufs[i].size(); }));
+
+  // get an iterator with a constant value to memset
+  auto iter_in = thrust::make_constant_iterator(thrust::make_constant_iterator(value));
+
+  // get an iterator pointing to each device span
+  auto iter_out = thrust::make_transform_iterator(
+    thrust::counting_iterator<std::size_t>(0),
+    cuda::proclaim_return_type<T*>(
+      [gpu_bufs = gpu_bufs.data()] __device__(std::size_t i) { return gpu_bufs[i].data(); }));
+
+  size_t temp_storage_bytes = 0;
+
+  cub::DeviceCopy::Batched(nullptr, temp_storage_bytes, iter_in, iter_out, sizes, num_bufs, stream);
+
+  rmm::device_buffer d_temp_storage(
+    temp_storage_bytes, stream, rmm::mr::get_current_device_resource());
+
+  cub::DeviceCopy::Batched(
+    d_temp_storage.data(), temp_storage_bytes, iter_in, iter_out, sizes, num_bufs, stream);
+}
+
+}  // namespace io::detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu
index e006cc7d714..557b1a45c1f 100644
--- a/cpp/src/io/parquet/reader_impl_preprocess.cu
+++ b/cpp/src/io/parquet/reader_impl_preprocess.cu
@@ -21,6 +21,7 @@
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
+#include <cudf/io/detail/batched_memset.hpp>
 
 #include <rmm/exec_policy.hpp>
 
@@ -1494,6 +1495,11 @@ void reader::impl::allocate_columns(read_mode mode, size_t skip_rows, size_t num
   // buffers if they are not part of a list hierarchy. mark down
   // if we have any list columns that need further processing.
   bool has_lists = false;
+  // Casting to std::byte since data buffer pointer is void *
+  std::vector<cudf::device_span<std::byte>> memset_bufs;
+  // Validity Buffer is a uint32_t pointer
+  std::vector<cudf::device_span<cudf::bitmask_type>> nullmask_bufs;
+
   for (size_t idx = 0; idx < _input_columns.size(); idx++) {
     auto const& input_col  = _input_columns[idx];
     size_t const max_depth = input_col.nesting_depth();
@@ -1514,13 +1520,19 @@ void reader::impl::allocate_columns(read_mode mode, size_t skip_rows, size_t num
         // we're going to start null mask as all valid and then turn bits off if necessary
         out_buf.create_with_mask(
           out_buf.type.id() == type_id::LIST && l_idx < max_depth ? num_rows + 1 : num_rows,
-          cudf::mask_state::ALL_VALID,
+          cudf::mask_state::UNINITIALIZED,
+          false,
           _stream,
           _mr);
+        memset_bufs.push_back(cudf::device_span<std::byte>(static_cast<std::byte*>(out_buf.data()),
+                                                           out_buf.data_size()));
+        nullmask_bufs.push_back(cudf::device_span<cudf::bitmask_type>(
+          out_buf.null_mask(),
+          cudf::util::round_up_safe(out_buf.null_mask_size(), sizeof(cudf::bitmask_type)) /
+            sizeof(cudf::bitmask_type)));
       }
     }
   }
-
   // compute output column sizes by examining the pages of the -input- columns
   if (has_lists) {
     auto h_cols_info =
@@ -1593,11 +1605,22 @@ void reader::impl::allocate_columns(read_mode mode, size_t skip_rows, size_t num
 
           // allocate
           // we're going to start null mask as all valid and then turn bits off if necessary
-          out_buf.create_with_mask(size, cudf::mask_state::ALL_VALID, _stream, _mr);
+          out_buf.create_with_mask(size, cudf::mask_state::UNINITIALIZED, false, _stream, _mr);
+          memset_bufs.push_back(cudf::device_span<std::byte>(
+            static_cast<std::byte*>(out_buf.data()), out_buf.data_size()));
+          nullmask_bufs.push_back(cudf::device_span<cudf::bitmask_type>(
+            out_buf.null_mask(),
+            cudf::util::round_up_safe(out_buf.null_mask_size(), sizeof(cudf::bitmask_type)) /
+              sizeof(cudf::bitmask_type)));
         }
       }
     }
   }
+
+  cudf::io::detail::batched_memset(memset_bufs, static_cast<std::byte>(0), _stream);
+  // Need to set null mask bufs to all high bits
+  cudf::io::detail::batched_memset(
+    nullmask_bufs, std::numeric_limits<cudf::bitmask_type>::max(), _stream);
 }
 
 std::vector<size_t> reader::impl::calculate_page_string_offsets()
diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp
index 2f4272b0367..8abfb000b94 100644
--- a/cpp/src/io/utilities/column_buffer.cpp
+++ b/cpp/src/io/utilities/column_buffer.cpp
@@ -33,7 +33,7 @@
 
 namespace cudf::io::detail {
 
-void gather_column_buffer::allocate_strings_data(rmm::cuda_stream_view stream)
+void gather_column_buffer::allocate_strings_data(bool memset_data, rmm::cuda_stream_view stream)
 {
   CUDF_EXPECTS(type.id() == type_id::STRING, "allocate_strings_data called for non-string column");
   // The contents of _strings will never be directly returned to the user.
@@ -56,11 +56,12 @@ std::unique_ptr<column> gather_column_buffer::make_string_column_impl(rmm::cuda_
   return make_strings_column(*_strings, stream, _mr);
 }
 
-void cudf::io::detail::inline_column_buffer::allocate_strings_data(rmm::cuda_stream_view stream)
+void cudf::io::detail::inline_column_buffer::allocate_strings_data(bool memset_data,
+                                                                   rmm::cuda_stream_view stream)
 {
   CUDF_EXPECTS(type.id() == type_id::STRING, "allocate_strings_data called for non-string column");
   // size + 1 for final offset. _string_data will be initialized later.
-  _data = create_data(data_type{type_id::INT32}, size + 1, stream, _mr);
+  _data = create_data(data_type{type_to_id<size_type>()}, size + 1, memset_data, stream, _mr);
 }
 
 void cudf::io::detail::inline_column_buffer::create_string_data(size_t num_bytes,
@@ -93,6 +94,7 @@ void copy_buffer_data(string_policy const& buff, string_policy& new_buff)
 template <class string_policy>
 void column_buffer_base<string_policy>::create_with_mask(size_type _size,
                                                          cudf::mask_state null_mask_state,
+                                                         bool memset_data,
                                                          rmm::cuda_stream_view stream,
                                                          rmm::device_async_resource_ref mr)
 {
@@ -100,16 +102,20 @@ void column_buffer_base<string_policy>::create_with_mask(size_type _size,
   _mr  = mr;
 
   switch (type.id()) {
-    case type_id::STRING: static_cast<string_policy*>(this)->allocate_strings_data(stream); break;
+    case type_id::STRING:
+      static_cast<string_policy*>(this)->allocate_strings_data(memset_data, stream);
+      break;
 
     // list columns store a buffer of int32's as offsets to represent
     // their individual rows
-    case type_id::LIST: _data = create_data(data_type{type_id::INT32}, size, stream, _mr); break;
+    case type_id::LIST:
+      _data = create_data(data_type{type_to_id<size_type>()}, size, memset_data, stream, _mr);
+      break;
 
     // struct columns store no data themselves.  just validity and children.
     case type_id::STRUCT: break;
 
-    default: _data = create_data(type, size, stream, _mr); break;
+    default: _data = create_data(type, size, memset_data, stream, _mr); break;
   }
   if (is_nullable) {
     _null_mask =
@@ -117,12 +123,21 @@ void column_buffer_base<string_policy>::create_with_mask(size_type _size,
   }
 }
 
+template <class string_policy>
+void column_buffer_base<string_policy>::create(size_type _size,
+                                               bool memset_data,
+                                               rmm::cuda_stream_view stream,
+                                               rmm::device_async_resource_ref mr)
+{
+  create_with_mask(_size, mask_state::ALL_NULL, memset_data, stream, mr);
+}
+
 template <class string_policy>
 void column_buffer_base<string_policy>::create(size_type _size,
                                                rmm::cuda_stream_view stream,
                                                rmm::device_async_resource_ref mr)
 {
-  create_with_mask(_size, mask_state::ALL_NULL, stream, mr);
+  create_with_mask(_size, mask_state::ALL_NULL, true, stream, mr);
 }
 
 template <class string_policy>
diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp
index ed6bb8bbdca..b2290965bb9 100644
--- a/cpp/src/io/utilities/column_buffer.hpp
+++ b/cpp/src/io/utilities/column_buffer.hpp
@@ -44,6 +44,7 @@ namespace detail {
  *
  * @param type The intended data type to populate
  * @param size The number of elements to be represented by the mask
+ * @param memset_data Defines whether data should be memset to 0
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned device_buffer
  *
@@ -51,17 +52,25 @@ namespace detail {
  */
 inline rmm::device_buffer create_data(data_type type,
                                       size_type size,
+                                      bool memset_data,
                                       rmm::cuda_stream_view stream,
                                       rmm::device_async_resource_ref mr)
 {
   std::size_t data_size = size_of(type) * size;
 
   rmm::device_buffer data(data_size, stream, mr);
-  CUDF_CUDA_TRY(cudaMemsetAsync(data.data(), 0, data_size, stream.value()));
-
+  if (memset_data) { CUDF_CUDA_TRY(cudaMemsetAsync(data.data(), 0, data_size, stream.value())); }
   return data;
 }
 
+inline rmm::device_buffer create_data(data_type type,
+                                      size_type size,
+                                      rmm::cuda_stream_view stream,
+                                      rmm::device_async_resource_ref mr)
+{
+  return create_data(type, size, true, stream, mr);
+}
+
 using string_index_pair = thrust::pair<char const*, size_type>;
 
 // forward declare friend functions
@@ -113,12 +122,18 @@ class column_buffer_base {
 
   // instantiate a column of known type with a specified size.  Allows deferred creation for
   // preprocessing steps such as in the Parquet reader
+  void create(size_type _size,
+              bool memset_data,
+              rmm::cuda_stream_view stream,
+              rmm::device_async_resource_ref mr);
+
   void create(size_type _size, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr);
 
   // like create(), but also takes a `cudf::mask_state` to allow initializing the null mask as
   // something other than `ALL_NULL`
   void create_with_mask(size_type _size,
                         cudf::mask_state null_mask_state,
+                        bool memset_data,
                         rmm::cuda_stream_view stream,
                         rmm::device_async_resource_ref mr);
 
@@ -192,7 +207,7 @@ class gather_column_buffer : public column_buffer_base<gather_column_buffer> {
     create(_size, stream, mr);
   }
 
-  void allocate_strings_data(rmm::cuda_stream_view stream);
+  void allocate_strings_data(bool memset_data, rmm::cuda_stream_view stream);
 
   [[nodiscard]] void* data_impl() { return _strings ? _strings->data() : _data.data(); }
   [[nodiscard]] void const* data_impl() const { return _strings ? _strings->data() : _data.data(); }
@@ -226,7 +241,7 @@ class inline_column_buffer : public column_buffer_base<inline_column_buffer> {
     create(_size, stream, mr);
   }
 
-  void allocate_strings_data(rmm::cuda_stream_view stream);
+  void allocate_strings_data(bool memset_data, rmm::cuda_stream_view stream);
 
   void* data_impl() { return _data.data(); }
   [[nodiscard]] void const* data_impl() const { return _data.data(); }
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 4dffcb41ba2..5e85b3e8adf 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -393,6 +393,7 @@ ConfigureTest(
   utilities_tests/pinned_memory_tests.cpp
   utilities_tests/type_check_tests.cpp
   utilities_tests/type_list_tests.cpp
+  utilities_tests/batched_memset_tests.cu
 )
 
 # ##################################################################################################
diff --git a/cpp/tests/utilities_tests/batched_memset_tests.cu b/cpp/tests/utilities_tests/batched_memset_tests.cu
new file mode 100644
index 00000000000..9fc5baeec97
--- /dev/null
+++ b/cpp/tests/utilities_tests/batched_memset_tests.cu
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/cudf_gtest.hpp>
+#include <cudf_test/type_lists.hpp>
+
+#include <cudf/detail/utilities/vector_factories.hpp>
+#include <cudf/io/detail/batched_memset.hpp>
+#include <cudf/io/parquet.hpp>
+#include <cudf/utilities/span.hpp>
+
+#include <rmm/device_uvector.hpp>
+
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/tuple.h>
+
+#include <type_traits>
+
+template <typename T>
+struct MultiBufferTestIntegral : public cudf::test::BaseFixture {};
+
+TEST(MultiBufferTestIntegral, BasicTest1)
+{
+  std::vector<size_t> const BUF_SIZES{
+    50000, 4, 1000, 0, 250000, 1, 100, 8000, 0, 1, 100, 1000, 10000, 100000, 0, 1, 100000};
+
+  // Device init
+  auto stream = cudf::get_default_stream();
+  auto mr     = rmm::mr::get_current_device_resource();
+
+  // Creating base vector for data and setting it to all 0xFF
+  std::vector<std::vector<uint64_t>> expected;
+  std::transform(BUF_SIZES.begin(), BUF_SIZES.end(), std::back_inserter(expected), [](auto size) {
+    return std::vector<uint64_t>(size + 2000, std::numeric_limits<uint64_t>::max());
+  });
+
+  // set buffer region to other value
+  std::for_each(thrust::make_zip_iterator(thrust::make_tuple(expected.begin(), BUF_SIZES.begin())),
+                thrust::make_zip_iterator(thrust::make_tuple(expected.end(), BUF_SIZES.end())),
+                [](auto elem) {
+                  std::fill_n(
+                    thrust::get<0>(elem).begin() + 1000, thrust::get<1>(elem), 0xEEEEEEEEEEEEEEEE);
+                });
+
+  // Copy host vector data to device
+  std::vector<rmm::device_uvector<uint64_t>> device_bufs;
+  std::transform(expected.begin(),
+                 expected.end(),
+                 std::back_inserter(device_bufs),
+                 [stream, mr](auto const& vec) {
+                   return cudf::detail::make_device_uvector_async(vec, stream, mr);
+                 });
+
+  // Initialize device buffers for memset
+  std::vector<cudf::device_span<uint64_t>> memset_bufs;
+  std::transform(
+    thrust::make_zip_iterator(thrust::make_tuple(device_bufs.begin(), BUF_SIZES.begin())),
+    thrust::make_zip_iterator(thrust::make_tuple(device_bufs.end(), BUF_SIZES.end())),
+    std::back_inserter(memset_bufs),
+    [](auto const& elem) {
+      return cudf::device_span<uint64_t>(thrust::get<0>(elem).data() + 1000, thrust::get<1>(elem));
+    });
+
+  // Function Call
+  cudf::io::detail::batched_memset(memset_bufs, uint64_t{0}, stream);
+
+  // Set all buffer regions to 0 for expected comparison
+  std::for_each(
+    thrust::make_zip_iterator(thrust::make_tuple(expected.begin(), BUF_SIZES.begin())),
+    thrust::make_zip_iterator(thrust::make_tuple(expected.end(), BUF_SIZES.end())),
+    [](auto elem) { std::fill_n(thrust::get<0>(elem).begin() + 1000, thrust::get<1>(elem), 0UL); });
+
+  // Compare to see that only given buffers are zeroed out
+  std::for_each(
+    thrust::make_zip_iterator(thrust::make_tuple(device_bufs.begin(), expected.begin())),
+    thrust::make_zip_iterator(thrust::make_tuple(device_bufs.end(), expected.end())),
+    [stream](auto const& elem) {
+      auto after_memset = cudf::detail::make_std_vector_async(thrust::get<0>(elem), stream);
+      EXPECT_TRUE(
+        std::equal(thrust::get<1>(elem).begin(), thrust::get<1>(elem).end(), after_memset.begin()));
+    });
+}

From 8068a2d616b6647bcd80720a2c24af858cbffd2d Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Mon, 5 Aug 2024 14:48:33 -0700
Subject: [PATCH 630/842] Fix build failures with GCC 13 (#16488)

Closes #16395

This PR resolves two types of compilation errors, allowing for successful builds with GCC 13:

- replacing the `cuco_allocator` strong type with an alias to fix a new build time check with GCC 13
- removing `std::move` when returning a temporary

Authors:
  - Yunsong Wang (https://github.com/PointKernel)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Mark Harris (https://github.com/harrism)

URL: https://github.com/rapidsai/cudf/pull/16488
---
 cpp/include/cudf/detail/cuco_helpers.hpp      | 17 ++----
 .../cudf/detail/distinct_hash_join.cuh        |  2 +-
 .../cudf/detail/hash_reduce_by_row.cuh        |  2 +-
 cpp/include/cudf/detail/join.hpp              |  2 +-
 cpp/include/cudf_test/column_wrapper.hpp      | 14 ++---
 cpp/src/groupby/hash/groupby.cu               | 19 +++----
 cpp/src/io/json/json_tree.cu                  | 35 +++++++------
 cpp/src/io/json/write_json.cu                 |  2 +-
 cpp/src/join/conditional_join.cu              | 52 +++++++++----------
 cpp/src/join/distinct_hash_join.cu            |  2 +-
 cpp/src/join/hash_join.cu                     |  2 +-
 cpp/src/join/join_common_utils.hpp            |  8 +--
 cpp/src/join/mixed_join.cu                    | 22 ++++----
 cpp/src/join/mixed_join_semi.cu               | 11 ++--
 cpp/src/reductions/histogram.cu               | 12 +++--
 cpp/src/search/contains_table.cu              | 17 +++---
 cpp/src/stream_compaction/distinct.cu         | 19 +++----
 cpp/src/stream_compaction/distinct_count.cu   | 17 +++---
 .../stream_compaction/distinct_helpers.hpp    |  2 +-
 cpp/src/text/bpe/byte_pair_encoding.cuh       |  4 +-
 cpp/src/text/bpe/load_merge_pairs.cu          | 39 +++++++-------
 cpp/src/text/vocabulary_tokenize.cu           |  4 +-
 cpp/tests/copying/gather_tests.cpp            | 14 ++---
 cpp/tests/reshape/byte_cast_tests.cpp         | 22 ++++----
 cpp/tests/structs/structs_column_tests.cpp    | 48 ++++++++---------
 25 files changed, 195 insertions(+), 193 deletions(-)

diff --git a/cpp/include/cudf/detail/cuco_helpers.hpp b/cpp/include/cudf/detail/cuco_helpers.hpp
index dca5a39bece..926df921715 100644
--- a/cpp/include/cudf/detail/cuco_helpers.hpp
+++ b/cpp/include/cudf/detail/cuco_helpers.hpp
@@ -36,19 +36,10 @@ static double constexpr CUCO_DESIRED_LOAD_FACTOR = 0.5;
  * later expects a standard C++ `Allocator` interface. This allocator helper provides a simple way
  * to handle cuco memory allocation/deallocation with the given `stream` and the rmm default memory
  * resource.
+ *
+ * @tparam T The allocator's value type.
  */
-class cuco_allocator
-  : public rmm::mr::stream_allocator_adaptor<rmm::mr::polymorphic_allocator<char>> {
-  /// Default stream-ordered allocator type
-  using default_allocator = rmm::mr::polymorphic_allocator<char>;
-  /// The base allocator adaptor type
-  using base_type = rmm::mr::stream_allocator_adaptor<default_allocator>;
-
- public:
-  /**
-   * @brief Constructs the allocator adaptor with the given `stream`
-   */
-  cuco_allocator(rmm::cuda_stream_view stream) : base_type{default_allocator{}, stream} {}
-};
+template <typename T>
+using cuco_allocator = rmm::mr::stream_allocator_adaptor<rmm::mr::polymorphic_allocator<T>>;
 
 }  // namespace cudf::detail
diff --git a/cpp/include/cudf/detail/distinct_hash_join.cuh b/cpp/include/cudf/detail/distinct_hash_join.cuh
index c3bc3ad89fa..0b3d7ac58bf 100644
--- a/cpp/include/cudf/detail/distinct_hash_join.cuh
+++ b/cpp/include/cudf/detail/distinct_hash_join.cuh
@@ -99,7 +99,7 @@ struct distinct_hash_join {
                                            cuda::thread_scope_device,
                                            comparator_adapter<d_equal_type>,
                                            probing_scheme_type,
-                                           cudf::detail::cuco_allocator,
+                                           cudf::detail::cuco_allocator<char>,
                                            cuco_storage_type>;
 
   bool _has_nulls;  ///< true if nulls are present in either build table or probe table
diff --git a/cpp/include/cudf/detail/hash_reduce_by_row.cuh b/cpp/include/cudf/detail/hash_reduce_by_row.cuh
index dfe79646167..7a1e38eefe0 100644
--- a/cpp/include/cudf/detail/hash_reduce_by_row.cuh
+++ b/cpp/include/cudf/detail/hash_reduce_by_row.cuh
@@ -32,7 +32,7 @@
 namespace cudf::detail {
 
 using hash_map_type = cuco::legacy::
-  static_map<size_type, size_type, cuda::thread_scope_device, cudf::detail::cuco_allocator>;
+  static_map<size_type, size_type, cuda::thread_scope_device, cudf::detail::cuco_allocator<char>>;
 
 /**
  * @brief The base struct for customized reduction functor to perform reduce-by-key with keys are
diff --git a/cpp/include/cudf/detail/join.hpp b/cpp/include/cudf/detail/join.hpp
index ff7da4462a2..af46dd79cdb 100644
--- a/cpp/include/cudf/detail/join.hpp
+++ b/cpp/include/cudf/detail/join.hpp
@@ -59,7 +59,7 @@ struct hash_join {
     cuco::static_multimap<hash_value_type,
                           cudf::size_type,
                           cuda::thread_scope_device,
-                          cudf::detail::cuco_allocator,
+                          cudf::detail::cuco_allocator<char>,
                           cuco::legacy::double_hashing<DEFAULT_JOIN_CG_SIZE, Hasher, Hasher>>;
 
   hash_join()                            = delete;
diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp
index 4e504ec1d30..d00db222b62 100644
--- a/cpp/include/cudf_test/column_wrapper.hpp
+++ b/cpp/include/cudf_test/column_wrapper.hpp
@@ -1337,7 +1337,7 @@ class lists_column_wrapper : public detail::column_wrapper {
   lists_column_wrapper(std::initializer_list<SourceElementT> elements) : column_wrapper{}
   {
     build_from_non_nested(
-      std::move(cudf::test::fixed_width_column_wrapper<T, SourceElementT>(elements).release()));
+      cudf::test::fixed_width_column_wrapper<T, SourceElementT>(elements).release());
   }
 
   /**
@@ -1361,7 +1361,7 @@ class lists_column_wrapper : public detail::column_wrapper {
   lists_column_wrapper(InputIterator begin, InputIterator end) : column_wrapper{}
   {
     build_from_non_nested(
-      std::move(cudf::test::fixed_width_column_wrapper<T, SourceElementT>(begin, end).release()));
+      cudf::test::fixed_width_column_wrapper<T, SourceElementT>(begin, end).release());
   }
 
   /**
@@ -1386,7 +1386,7 @@ class lists_column_wrapper : public detail::column_wrapper {
     : column_wrapper{}
   {
     build_from_non_nested(
-      std::move(cudf::test::fixed_width_column_wrapper<T, SourceElementT>(elements, v).release()));
+      cudf::test::fixed_width_column_wrapper<T, SourceElementT>(elements, v).release());
   }
 
   /**
@@ -1413,8 +1413,8 @@ class lists_column_wrapper : public detail::column_wrapper {
   lists_column_wrapper(InputIterator begin, InputIterator end, ValidityIterator v)
     : column_wrapper{}
   {
-    build_from_non_nested(std::move(
-      cudf::test::fixed_width_column_wrapper<T, SourceElementT>(begin, end, v).release()));
+    build_from_non_nested(
+      cudf::test::fixed_width_column_wrapper<T, SourceElementT>(begin, end, v).release());
   }
 
   /**
@@ -1435,7 +1435,7 @@ class lists_column_wrapper : public detail::column_wrapper {
   lists_column_wrapper(std::initializer_list<std::string> elements) : column_wrapper{}
   {
     build_from_non_nested(
-      std::move(cudf::test::strings_column_wrapper(elements.begin(), elements.end()).release()));
+      cudf::test::strings_column_wrapper(elements.begin(), elements.end()).release());
   }
 
   /**
@@ -1460,7 +1460,7 @@ class lists_column_wrapper : public detail::column_wrapper {
     : column_wrapper{}
   {
     build_from_non_nested(
-      std::move(cudf::test::strings_column_wrapper(elements.begin(), elements.end(), v).release()));
+      cudf::test::strings_column_wrapper(elements.begin(), elements.end(), v).release());
   }
 
   /**
diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu
index 5fe4a5eb30f..35161eada28 100644
--- a/cpp/src/groupby/hash/groupby.cu
+++ b/cpp/src/groupby/hash/groupby.cu
@@ -568,15 +568,16 @@ std::unique_ptr<table> groupby(table_view const& keys,
   cudf::detail::result_cache sparse_results(requests.size());
 
   auto const comparator_helper = [&](auto const d_key_equal) {
-    auto const set = cuco::static_set{num_keys,
-                                      0.5,  // desired load factor
-                                      cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL},
-                                      d_key_equal,
-                                      probing_scheme_type{d_row_hash},
-                                      cuco::thread_scope_device,
-                                      cuco::storage<1>{},
-                                      cudf::detail::cuco_allocator{stream},
-                                      stream.value()};
+    auto const set = cuco::static_set{
+      num_keys,
+      0.5,  // desired load factor
+      cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL},
+      d_key_equal,
+      probing_scheme_type{d_row_hash},
+      cuco::thread_scope_device,
+      cuco::storage<1>{},
+      cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
+      stream.value()};
 
     // Compute all single pass aggs first
     compute_single_pass_aggs(keys,
diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu
index ad807b57766..ee6bc0b9f4b 100644
--- a/cpp/src/io/json/json_tree.cu
+++ b/cpp/src/io/json/json_tree.cu
@@ -545,15 +545,15 @@ rmm::device_uvector<size_type> hash_node_type_with_field_name(device_span<Symbol
 
   using hasher_type                             = decltype(d_hasher);
   constexpr size_type empty_node_index_sentinel = -1;
-  auto key_set =
-    cuco::static_set{cuco::extent{compute_hash_table_size(num_fields, 40)},  // 40% occupancy
-                     cuco::empty_key{empty_node_index_sentinel},
-                     d_equal,
-                     cuco::linear_probing<1, hasher_type>{d_hasher},
-                     {},
-                     {},
-                     cudf::detail::cuco_allocator{stream},
-                     stream.value()};
+  auto key_set                                  = cuco::static_set{
+    cuco::extent{compute_hash_table_size(num_fields, 40)},  // 40% occupancy
+    cuco::empty_key{empty_node_index_sentinel},
+    d_equal,
+    cuco::linear_probing<1, hasher_type>{d_hasher},
+                                     {},
+                                     {},
+    cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
+    stream.value()};
   key_set.insert_if_async(iter,
                           iter + num_nodes,
                           thrust::counting_iterator<size_type>(0),  // stencil
@@ -734,14 +734,15 @@ std::pair<rmm::device_uvector<size_type>, rmm::device_uvector<size_type>> hash_n
   constexpr size_type empty_node_index_sentinel = -1;
   using hasher_type                             = decltype(d_hashed_cache);
 
-  auto key_set = cuco::static_set{cuco::extent{compute_hash_table_size(num_nodes)},
-                                  cuco::empty_key<cudf::size_type>{empty_node_index_sentinel},
-                                  d_equal,
-                                  cuco::linear_probing<1, hasher_type>{d_hashed_cache},
-                                  {},
-                                  {},
-                                  cudf::detail::cuco_allocator{stream},
-                                  stream.value()};
+  auto key_set = cuco::static_set{
+    cuco::extent{compute_hash_table_size(num_nodes)},
+    cuco::empty_key<cudf::size_type>{empty_node_index_sentinel},
+    d_equal,
+    cuco::linear_probing<1, hasher_type>{d_hashed_cache},
+    {},
+    {},
+    cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
+    stream.value()};
 
   // insert and convert node ids to unique set ids
   auto nodes_itr         = thrust::make_counting_iterator<size_type>(0);
diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu
index c688c809e04..60bb2366e87 100644
--- a/cpp/src/io/json/write_json.cu
+++ b/cpp/src/io/json/write_json.cu
@@ -649,7 +649,7 @@ struct column_to_strings_fn {
     auto const list_child_string = make_lists_column(
       column.size(),
       std::move(new_offsets),
-      std::move(child_string_with_null()),
+      child_string_with_null(),
       column.null_count(),
       cudf::detail::copy_bitmask(column, stream_, rmm::mr::get_current_device_resource()),
       stream_);
diff --git a/cpp/src/join/conditional_join.cu b/cpp/src/join/conditional_join.cu
index d4ef2747c9d..789702ce538 100644
--- a/cpp/src/join/conditional_join.cu
+++ b/cpp/src/join/conditional_join.cu
@@ -432,13 +432,13 @@ std::unique_ptr<rmm::device_uvector<size_type>> conditional_left_semi_join(
   rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return std::move(detail::conditional_join_anti_semi(left,
-                                                      right,
-                                                      binary_predicate,
-                                                      detail::join_kind::LEFT_SEMI_JOIN,
-                                                      output_size,
-                                                      cudf::get_default_stream(),
-                                                      mr));
+  return detail::conditional_join_anti_semi(left,
+                                            right,
+                                            binary_predicate,
+                                            detail::join_kind::LEFT_SEMI_JOIN,
+                                            output_size,
+                                            cudf::get_default_stream(),
+                                            mr);
 }
 
 std::unique_ptr<rmm::device_uvector<size_type>> conditional_left_anti_join(
@@ -449,13 +449,13 @@ std::unique_ptr<rmm::device_uvector<size_type>> conditional_left_anti_join(
   rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return std::move(detail::conditional_join_anti_semi(left,
-                                                      right,
-                                                      binary_predicate,
-                                                      detail::join_kind::LEFT_ANTI_JOIN,
-                                                      output_size,
-                                                      cudf::get_default_stream(),
-                                                      mr));
+  return detail::conditional_join_anti_semi(left,
+                                            right,
+                                            binary_predicate,
+                                            detail::join_kind::LEFT_ANTI_JOIN,
+                                            output_size,
+                                            cudf::get_default_stream(),
+                                            mr);
 }
 
 std::size_t conditional_inner_join_size(table_view const& left,
@@ -484,12 +484,12 @@ std::size_t conditional_left_semi_join_size(table_view const& left,
                                             rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return std::move(detail::compute_conditional_join_output_size(left,
-                                                                right,
-                                                                binary_predicate,
-                                                                detail::join_kind::LEFT_SEMI_JOIN,
-                                                                cudf::get_default_stream(),
-                                                                mr));
+  return detail::compute_conditional_join_output_size(left,
+                                                      right,
+                                                      binary_predicate,
+                                                      detail::join_kind::LEFT_SEMI_JOIN,
+                                                      cudf::get_default_stream(),
+                                                      mr);
 }
 
 std::size_t conditional_left_anti_join_size(table_view const& left,
@@ -498,12 +498,12 @@ std::size_t conditional_left_anti_join_size(table_view const& left,
                                             rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return std::move(detail::compute_conditional_join_output_size(left,
-                                                                right,
-                                                                binary_predicate,
-                                                                detail::join_kind::LEFT_ANTI_JOIN,
-                                                                cudf::get_default_stream(),
-                                                                mr));
+  return detail::compute_conditional_join_output_size(left,
+                                                      right,
+                                                      binary_predicate,
+                                                      detail::join_kind::LEFT_ANTI_JOIN,
+                                                      cudf::get_default_stream(),
+                                                      mr);
 }
 
 }  // namespace cudf
diff --git a/cpp/src/join/distinct_hash_join.cu b/cpp/src/join/distinct_hash_join.cu
index daa1bf17c0d..3d95b0c5a5c 100644
--- a/cpp/src/join/distinct_hash_join.cu
+++ b/cpp/src/join/distinct_hash_join.cu
@@ -119,7 +119,7 @@ distinct_hash_join<HasNested>::distinct_hash_join(cudf::table_view const& build,
                 {},
                 cuco::thread_scope_device,
                 cuco_storage_type{},
-                cudf::detail::cuco_allocator{stream},
+                cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
                 stream.value()}
 {
   CUDF_FUNC_RANGE();
diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu
index eb9b687630b..5d01482f44a 100644
--- a/cpp/src/join/hash_join.cu
+++ b/cpp/src/join/hash_join.cu
@@ -374,7 +374,7 @@ hash_join<Hasher>::hash_join(cudf::table_view const& build,
                 cuco::empty_key{std::numeric_limits<hash_value_type>::max()},
                 cuco::empty_value{cudf::detail::JoinNoneValue},
                 stream.value(),
-                cudf::detail::cuco_allocator{stream}},
+                cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream}},
     _build{build},
     _preprocessed_build{
       cudf::experimental::row::equality::preprocessed_table::create(_build, stream)}
diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp
index 4157100b67e..86402a0e7de 100644
--- a/cpp/src/join/join_common_utils.hpp
+++ b/cpp/src/join/join_common_utils.hpp
@@ -48,11 +48,13 @@ using mixed_multimap_type =
   cuco::static_multimap<hash_value_type,
                         size_type,
                         cuda::thread_scope_device,
-                        cudf::detail::cuco_allocator,
+                        cudf::detail::cuco_allocator<char>,
                         cuco::legacy::double_hashing<1, hash_type, hash_type>>;
 
-using semi_map_type = cuco::legacy::
-  static_map<hash_value_type, size_type, cuda::thread_scope_device, cudf::detail::cuco_allocator>;
+using semi_map_type = cuco::legacy::static_map<hash_value_type,
+                                               size_type,
+                                               cuda::thread_scope_device,
+                                               cudf::detail::cuco_allocator<char>>;
 
 using row_hash_legacy =
   cudf::row_hasher<cudf::hashing::detail::default_hash, cudf::nullate::DYNAMIC>;
diff --git a/cpp/src/join/mixed_join.cu b/cpp/src/join/mixed_join.cu
index 90748e6f322..48b94c777de 100644
--- a/cpp/src/join/mixed_join.cu
+++ b/cpp/src/join/mixed_join.cu
@@ -126,11 +126,12 @@ mixed_join(
   auto build_view = table_device_view::create(build, stream);
 
   // Don't use multimap_type because we want a CG size of 1.
-  mixed_multimap_type hash_table{compute_hash_table_size(build.num_rows()),
-                                 cuco::empty_key{std::numeric_limits<hash_value_type>::max()},
-                                 cuco::empty_value{cudf::detail::JoinNoneValue},
-                                 stream.value(),
-                                 cudf::detail::cuco_allocator{stream}};
+  mixed_multimap_type hash_table{
+    compute_hash_table_size(build.num_rows()),
+    cuco::empty_key{std::numeric_limits<hash_value_type>::max()},
+    cuco::empty_value{cudf::detail::JoinNoneValue},
+    stream.value(),
+    cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream}};
 
   // TODO: To add support for nested columns we will need to flatten in many
   // places. However, this probably isn't worth adding any time soon since we
@@ -391,11 +392,12 @@ compute_mixed_join_output_size(table_view const& left_equality,
   auto build_view = table_device_view::create(build, stream);
 
   // Don't use multimap_type because we want a CG size of 1.
-  mixed_multimap_type hash_table{compute_hash_table_size(build.num_rows()),
-                                 cuco::empty_key{std::numeric_limits<hash_value_type>::max()},
-                                 cuco::empty_value{cudf::detail::JoinNoneValue},
-                                 stream.value(),
-                                 cudf::detail::cuco_allocator{stream}};
+  mixed_multimap_type hash_table{
+    compute_hash_table_size(build.num_rows()),
+    cuco::empty_key{std::numeric_limits<hash_value_type>::max()},
+    cuco::empty_value{cudf::detail::JoinNoneValue},
+    stream.value(),
+    cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream}};
 
   // TODO: To add support for nested columns we will need to flatten in many
   // places. However, this probably isn't worth adding any time soon since we
diff --git a/cpp/src/join/mixed_join_semi.cu b/cpp/src/join/mixed_join_semi.cu
index c147ea3c253..3e4188a0fbd 100644
--- a/cpp/src/join/mixed_join_semi.cu
+++ b/cpp/src/join/mixed_join_semi.cu
@@ -163,11 +163,12 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_join_semi(
     cudf::experimental::row::equality::two_table_comparator{preprocessed_probe, preprocessed_build};
   auto const equality_probe = row_comparator.equal_to<false>(has_nulls, compare_nulls);
 
-  semi_map_type hash_table{compute_hash_table_size(build.num_rows()),
-                           cuco::empty_key{std::numeric_limits<hash_value_type>::max()},
-                           cuco::empty_value{cudf::detail::JoinNoneValue},
-                           cudf::detail::cuco_allocator{stream},
-                           stream.value()};
+  semi_map_type hash_table{
+    compute_hash_table_size(build.num_rows()),
+    cuco::empty_key{std::numeric_limits<hash_value_type>::max()},
+    cuco::empty_value{cudf::detail::JoinNoneValue},
+    cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
+    stream.value()};
 
   // Create hash table containing all keys found in right table
   // TODO: To add support for nested columns we will need to flatten in many
diff --git a/cpp/src/reductions/histogram.cu b/cpp/src/reductions/histogram.cu
index bebb9d14923..d49c0c6f0d2 100644
--- a/cpp/src/reductions/histogram.cu
+++ b/cpp/src/reductions/histogram.cu
@@ -164,11 +164,13 @@ compute_row_frequencies(table_view const& input,
                "Nested types are not yet supported in histogram aggregation.",
                std::invalid_argument);
 
-  auto map = cudf::detail::hash_map_type{compute_hash_table_size(input.num_rows()),
-                                         cuco::empty_key{-1},
-                                         cuco::empty_value{std::numeric_limits<size_type>::min()},
-                                         cudf::detail::cuco_allocator{stream},
-                                         stream.value()};
+  auto map = cudf::detail::hash_map_type{
+    compute_hash_table_size(input.num_rows()),
+    cuco::empty_key{-1},
+    cuco::empty_value{std::numeric_limits<size_type>::min()},
+
+    cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
+    stream.value()};
 
   auto const preprocessed_input =
     cudf::experimental::row::hash::preprocessed_table::create(input, stream);
diff --git a/cpp/src/search/contains_table.cu b/cpp/src/search/contains_table.cu
index 81227cb9a2d..66cefd0aa2f 100644
--- a/cpp/src/search/contains_table.cu
+++ b/cpp/src/search/contains_table.cu
@@ -229,14 +229,15 @@ rmm::device_uvector<bool> contains(table_view const& haystack,
     [&](auto const& d_self_equal, auto const& d_two_table_equal, auto const& probing_scheme) {
       auto const d_equal = comparator_adapter{d_self_equal, d_two_table_equal};
 
-      auto set = cuco::static_set{cuco::extent{compute_hash_table_size(haystack.num_rows())},
-                                  cuco::empty_key{rhs_index_type{-1}},
-                                  d_equal,
-                                  probing_scheme,
-                                  {},
-                                  {},
-                                  cudf::detail::cuco_allocator{stream},
-                                  stream.value()};
+      auto set = cuco::static_set{
+        cuco::extent{compute_hash_table_size(haystack.num_rows())},
+        cuco::empty_key{rhs_index_type{-1}},
+        d_equal,
+        probing_scheme,
+        {},
+        {},
+        cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
+        stream.value()};
 
       if (haystack_has_nulls && compare_nulls == null_equality::UNEQUAL) {
         auto const bitmask_buffer_and_ptr = build_row_bitmask(haystack, stream);
diff --git a/cpp/src/stream_compaction/distinct.cu b/cpp/src/stream_compaction/distinct.cu
index e2c5aba6802..6afd6e34c50 100644
--- a/cpp/src/stream_compaction/distinct.cu
+++ b/cpp/src/stream_compaction/distinct.cu
@@ -97,15 +97,16 @@ rmm::device_uvector<size_type> distinct_indices(table_view const& input,
 
   auto const helper_func = [&](auto const& d_equal) {
     using RowHasher = std::decay_t<decltype(d_equal)>;
-    auto set        = hash_set_type<RowHasher>{num_rows,
-                                               0.5,  // desired load factor
-                                               cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL},
-                                               d_equal,
-                                               {row_hash.device_hasher(has_nulls)},
-                                               {},
-                                               {},
-                                               cudf::detail::cuco_allocator{stream},
-                                               stream.value()};
+    auto set        = hash_set_type<RowHasher>{
+      num_rows,
+      0.5,  // desired load factor
+      cuco::empty_key{cudf::detail::CUDF_SIZE_TYPE_SENTINEL},
+      d_equal,
+      {row_hash.device_hasher(has_nulls)},
+      {},
+      {},
+      cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
+      stream.value()};
     return detail::reduce_by_row(set, num_rows, keep, stream, mr);
   };
 
diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu
index 9843bb889f4..cdf9faddf31 100644
--- a/cpp/src/stream_compaction/distinct_count.cu
+++ b/cpp/src/stream_compaction/distinct_count.cu
@@ -141,14 +141,15 @@ cudf::size_type distinct_count(table_view const& keys,
 
   auto const comparator_helper = [&](auto const row_equal) {
     using hasher_type = decltype(hash_key);
-    auto key_set      = cuco::static_set{cuco::extent{compute_hash_table_size(num_rows)},
-                                    cuco::empty_key<cudf::size_type>{-1},
-                                    row_equal,
-                                    cuco::linear_probing<1, hasher_type>{hash_key},
-                                         {},
-                                         {},
-                                    cudf::detail::cuco_allocator{stream},
-                                    stream.value()};
+    auto key_set      = cuco::static_set{
+      cuco::extent{compute_hash_table_size(num_rows)},
+      cuco::empty_key<cudf::size_type>{-1},
+      row_equal,
+      cuco::linear_probing<1, hasher_type>{hash_key},
+           {},
+           {},
+      cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
+      stream.value()};
 
     auto const iter = thrust::counting_iterator<cudf::size_type>(0);
     // when nulls are equal, we skip hashing any row that has a null
diff --git a/cpp/src/stream_compaction/distinct_helpers.hpp b/cpp/src/stream_compaction/distinct_helpers.hpp
index fca67c98873..bea02e3dbe8 100644
--- a/cpp/src/stream_compaction/distinct_helpers.hpp
+++ b/cpp/src/stream_compaction/distinct_helpers.hpp
@@ -57,7 +57,7 @@ using hash_set_type =
                                         cudf::experimental::row::hash::device_row_hasher<
                                           cudf::hashing::detail::default_hash,
                                           cudf::nullate::DYNAMIC>>,
-                   cudf::detail::cuco_allocator,
+                   cudf::detail::cuco_allocator<char>,
                    cuco::storage<1>>;
 
 /**
diff --git a/cpp/src/text/bpe/byte_pair_encoding.cuh b/cpp/src/text/bpe/byte_pair_encoding.cuh
index a2e441c3284..69c77224eb7 100644
--- a/cpp/src/text/bpe/byte_pair_encoding.cuh
+++ b/cpp/src/text/bpe/byte_pair_encoding.cuh
@@ -106,7 +106,7 @@ using merge_pairs_map_type = cuco::static_map<cudf::size_type,
                                               cuda::thread_scope_device,
                                               bpe_equal,
                                               bpe_probe_scheme,
-                                              cudf::detail::cuco_allocator,
+                                              cudf::detail::cuco_allocator<char>,
                                               cuco_storage>;
 
 /**
@@ -164,7 +164,7 @@ using mp_table_map_type = cuco::static_map<cudf::size_type,
                                            cuda::thread_scope_device,
                                            mp_equal,
                                            mp_probe_scheme,
-                                           cudf::detail::cuco_allocator,
+                                           cudf::detail::cuco_allocator<char>,
                                            cuco_storage>;
 
 }  // namespace detail
diff --git a/cpp/src/text/bpe/load_merge_pairs.cu b/cpp/src/text/bpe/load_merge_pairs.cu
index f34c5c4f7f6..9fb86aecce3 100644
--- a/cpp/src/text/bpe/load_merge_pairs.cu
+++ b/cpp/src/text/bpe/load_merge_pairs.cu
@@ -43,16 +43,16 @@ namespace {
 std::unique_ptr<detail::merge_pairs_map_type> initialize_merge_pairs_map(
   cudf::column_device_view const& input, rmm::cuda_stream_view stream)
 {
-  auto merge_pairs_map =
-    std::make_unique<merge_pairs_map_type>(static_cast<size_t>(input.size()),
-                                           cuco::empty_key{-1},
-                                           cuco::empty_value{-1},
-                                           bpe_equal{input},
-                                           bpe_probe_scheme{bpe_hasher{input}},
-                                           cuco::thread_scope_device,
-                                           cuco_storage{},
-                                           cudf::detail::cuco_allocator{stream},
-                                           stream.value());
+  auto merge_pairs_map = std::make_unique<merge_pairs_map_type>(
+    static_cast<size_t>(input.size()),
+    cuco::empty_key{-1},
+    cuco::empty_value{-1},
+    bpe_equal{input},
+    bpe_probe_scheme{bpe_hasher{input}},
+    cuco::thread_scope_device,
+    cuco_storage{},
+    cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
+    stream.value());
 
   auto iter = cudf::detail::make_counting_transform_iterator(
     0,
@@ -67,15 +67,16 @@ std::unique_ptr<detail::merge_pairs_map_type> initialize_merge_pairs_map(
 std::unique_ptr<detail::mp_table_map_type> initialize_mp_table_map(
   cudf::column_device_view const& input, rmm::cuda_stream_view stream)
 {
-  auto mp_table_map = std::make_unique<mp_table_map_type>(static_cast<size_t>(input.size()),
-                                                          cuco::empty_key{-1},
-                                                          cuco::empty_value{-1},
-                                                          mp_equal{input},
-                                                          mp_probe_scheme{mp_hasher{input}},
-                                                          cuco::thread_scope_device,
-                                                          cuco_storage{},
-                                                          cudf::detail::cuco_allocator{stream},
-                                                          stream.value());
+  auto mp_table_map = std::make_unique<mp_table_map_type>(
+    static_cast<size_t>(input.size()),
+    cuco::empty_key{-1},
+    cuco::empty_value{-1},
+    mp_equal{input},
+    mp_probe_scheme{mp_hasher{input}},
+    cuco::thread_scope_device,
+    cuco_storage{},
+    cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
+    stream.value());
 
   auto iter = cudf::detail::make_counting_transform_iterator(
     0,
diff --git a/cpp/src/text/vocabulary_tokenize.cu b/cpp/src/text/vocabulary_tokenize.cu
index 97abb1487d8..5945921ed9d 100644
--- a/cpp/src/text/vocabulary_tokenize.cu
+++ b/cpp/src/text/vocabulary_tokenize.cu
@@ -100,7 +100,7 @@ using vocabulary_map_type = cuco::static_map<cudf::size_type,
                                              cuda::thread_scope_device,
                                              vocab_equal,
                                              probe_scheme,
-                                             cudf::detail::cuco_allocator,
+                                             cudf::detail::cuco_allocator<char>,
                                              cuco_storage>;
 }  // namespace
 }  // namespace detail
@@ -152,7 +152,7 @@ tokenize_vocabulary::tokenize_vocabulary(cudf::strings_column_view const& input,
     detail::probe_scheme{detail::vocab_hasher{*d_vocabulary}},
     cuco::thread_scope_device,
     detail::cuco_storage{},
-    cudf::detail::cuco_allocator{stream},
+    cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
     stream.value());
 
   // the row index is the token id (value for each key in the map)
diff --git a/cpp/tests/copying/gather_tests.cpp b/cpp/tests/copying/gather_tests.cpp
index 284b6c4c50c..07ce672b14d 100644
--- a/cpp/tests/copying/gather_tests.cpp
+++ b/cpp/tests/copying/gather_tests.cpp
@@ -43,7 +43,7 @@ TYPED_TEST(GatherTest, IdentityTest)
 
   cudf::table_view source_table({source_column});
 
-  std::unique_ptr<cudf::table> result = std::move(cudf::gather(source_table, gather_map));
+  std::unique_ptr<cudf::table> result = cudf::gather(source_table, gather_map);
 
   for (auto i = 0; i < source_table.num_columns(); ++i) {
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(source_table.column(i), result->view().column(i));
@@ -66,7 +66,7 @@ TYPED_TEST(GatherTest, ReverseIdentityTest)
 
   cudf::table_view source_table({source_column});
 
-  std::unique_ptr<cudf::table> result = std::move(cudf::gather(source_table, gather_map));
+  std::unique_ptr<cudf::table> result = cudf::gather(source_table, gather_map);
   cudf::test::fixed_width_column_wrapper<TypeParam> expect_column(reversed_data,
                                                                   reversed_data + source_size);
 
@@ -94,7 +94,7 @@ TYPED_TEST(GatherTest, EveryOtherNullOdds)
 
   cudf::table_view source_table({source_column});
 
-  std::unique_ptr<cudf::table> result = std::move(cudf::gather(source_table, gather_map));
+  std::unique_ptr<cudf::table> result = cudf::gather(source_table, gather_map);
 
   auto expect_data  = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return 0; });
   auto expect_valid = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return 0; });
@@ -126,7 +126,7 @@ TYPED_TEST(GatherTest, EveryOtherNullEvens)
 
   cudf::table_view source_table({source_column});
 
-  std::unique_ptr<cudf::table> result = std::move(cudf::gather(source_table, gather_map));
+  std::unique_ptr<cudf::table> result = cudf::gather(source_table, gather_map);
 
   auto expect_data =
     cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i * 2 + 1; });
@@ -160,7 +160,7 @@ TYPED_TEST(GatherTest, AllNull)
 
   cudf::table_view source_table({source_column});
 
-  std::unique_ptr<cudf::table> result = std::move(cudf::gather(source_table, gather_map));
+  std::unique_ptr<cudf::table> result = cudf::gather(source_table, gather_map);
 
   // Check that the result is also all invalid
   CUDF_TEST_EXPECT_TABLES_EQUAL(source_table, result->view());
@@ -190,7 +190,7 @@ TYPED_TEST(GatherTest, MultiColReverseIdentityTest)
 
   cudf::table_view source_table{source_columns};
 
-  std::unique_ptr<cudf::table> result = std::move(cudf::gather(source_table, gather_map));
+  std::unique_ptr<cudf::table> result = cudf::gather(source_table, gather_map);
 
   cudf::test::fixed_width_column_wrapper<TypeParam> expect_column(reversed_data,
                                                                   reversed_data + source_size);
@@ -228,7 +228,7 @@ TYPED_TEST(GatherTest, MultiColNulls)
 
   cudf::table_view source_table{source_columns};
 
-  std::unique_ptr<cudf::table> result = std::move(cudf::gather(source_table, gather_map));
+  std::unique_ptr<cudf::table> result = cudf::gather(source_table, gather_map);
 
   // Expected data
   auto expect_data =
diff --git a/cpp/tests/reshape/byte_cast_tests.cpp b/cpp/tests/reshape/byte_cast_tests.cpp
index cd280302677..b3d9b2e2f5f 100644
--- a/cpp/tests/reshape/byte_cast_tests.cpp
+++ b/cpp/tests/reshape/byte_cast_tests.cpp
@@ -61,8 +61,8 @@ TEST_F(ByteCastTest, int16ValuesWithNulls)
   auto [null_mask, null_count] = cudf::test::detail::make_null_mask(odd_validity, odd_validity + 5);
   auto int16_expected          = cudf::make_lists_column(
     5,
-    std::move(cudf::test::fixed_width_column_wrapper<cudf::size_type>{0, 0, 2, 2, 4, 4}.release()),
-    std::move(int16_data.release()),
+    cudf::test::fixed_width_column_wrapper<cudf::size_type>{0, 0, 2, 2, 4, 4}.release(),
+    int16_data.release(),
     null_count,
     std::move(null_mask));
 
@@ -109,8 +109,8 @@ TEST_F(ByteCastTest, int32ValuesWithNulls)
 
   auto int32_expected = cudf::make_lists_column(
     5,
-    std::move(cudf::test::fixed_width_column_wrapper<cudf::size_type>{0, 4, 4, 8, 8, 12}.release()),
-    std::move(int32_data.release()),
+    cudf::test::fixed_width_column_wrapper<cudf::size_type>{0, 4, 4, 8, 8, 12}.release(),
+    int32_data.release(),
     null_count,
     std::move(null_mask));
 
@@ -163,9 +163,8 @@ TEST_F(ByteCastTest, int64ValuesWithNulls)
   auto [null_mask, null_count] = cudf::test::detail::make_null_mask(odd_validity, odd_validity + 5);
   auto int64_expected          = cudf::make_lists_column(
     5,
-    std::move(
-      cudf::test::fixed_width_column_wrapper<cudf::size_type>{0, 0, 8, 8, 16, 16}.release()),
-    std::move(int64_data.release()),
+    cudf::test::fixed_width_column_wrapper<cudf::size_type>{0, 0, 8, 8, 16, 16}.release(),
+    int64_data.release(),
     null_count,
     std::move(null_mask));
 
@@ -226,8 +225,8 @@ TEST_F(ByteCastTest, fp32ValuesWithNulls)
     cudf::test::detail::make_null_mask(even_validity, even_validity + 5);
   auto fp32_expected = cudf::make_lists_column(
     5,
-    std::move(cudf::test::fixed_width_column_wrapper<cudf::size_type>{0, 4, 4, 8, 8, 12}.release()),
-    std::move(fp32_data.release()),
+    cudf::test::fixed_width_column_wrapper<cudf::size_type>{0, 4, 4, 8, 8, 12}.release(),
+    fp32_data.release(),
     null_count,
     std::move(null_mask));
 
@@ -297,9 +296,8 @@ TEST_F(ByteCastTest, fp64ValuesWithNulls)
   auto [null_mask, null_count] = cudf::test::detail::make_null_mask(odd_validity, odd_validity + 5);
   auto fp64_expected           = cudf::make_lists_column(
     5,
-    std::move(
-      cudf::test::fixed_width_column_wrapper<cudf::size_type>{0, 0, 8, 8, 16, 16}.release()),
-    std::move(fp64_data.release()),
+    cudf::test::fixed_width_column_wrapper<cudf::size_type>{0, 0, 8, 8, 16, 16}.release(),
+    fp64_data.release(),
     null_count,
     std::move(null_mask));
 
diff --git a/cpp/tests/structs/structs_column_tests.cpp b/cpp/tests/structs/structs_column_tests.cpp
index df005dfa1dc..f0010fc1ed9 100644
--- a/cpp/tests/structs/structs_column_tests.cpp
+++ b/cpp/tests/structs/structs_column_tests.cpp
@@ -448,12 +448,12 @@ TYPED_TEST(TypedStructColumnWrapperTest, ListOfStructOfList)
     cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 3; });
   auto [null_mask, null_count] =
     detail::make_null_mask(list_of_struct_of_list_validity, list_of_struct_of_list_validity + 5);
-  auto list_of_struct_of_list = cudf::make_lists_column(
-    5,
-    std::move(fixed_width_column_wrapper<size_type>{0, 2, 4, 6, 8, 10}.release()),
-    std::move(struct_of_lists_col),
-    null_count,
-    std::move(null_mask));
+  auto list_of_struct_of_list =
+    cudf::make_lists_column(5,
+                            fixed_width_column_wrapper<size_type>{0, 2, 4, 6, 8, 10}.release(),
+                            std::move(struct_of_lists_col),
+                            null_count,
+                            std::move(null_mask));
 
   // Compare with expected values.
 
@@ -468,12 +468,12 @@ TYPED_TEST(TypedStructColumnWrapperTest, ListOfStructOfList)
 
   std::tie(null_mask, null_count) =
     detail::make_null_mask(list_of_struct_of_list_validity, list_of_struct_of_list_validity + 5);
-  auto expected_level3_list = cudf::make_lists_column(
-    5,
-    std::move(fixed_width_column_wrapper<size_type>{0, 0, 2, 4, 4, 6}.release()),
-    std::move(expected_level2_struct),
-    null_count,
-    std::move(null_mask));
+  auto expected_level3_list =
+    cudf::make_lists_column(5,
+                            fixed_width_column_wrapper<size_type>{0, 0, 2, 4, 4, 6}.release(),
+                            std::move(expected_level2_struct),
+                            null_count,
+                            std::move(null_mask));
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*list_of_struct_of_list, *expected_level3_list);
 }
@@ -498,12 +498,12 @@ TYPED_TEST(TypedStructColumnWrapperTest, StructOfListOfStruct)
     cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i % 3; });
   auto [null_mask, null_count] = detail::make_null_mask(list_validity, list_validity + 5);
 
-  auto lists_col = cudf::make_lists_column(
-    5,
-    std::move(fixed_width_column_wrapper<size_type>{0, 2, 4, 6, 8, 10}.release()),
-    std::move(structs_col),
-    null_count,
-    std::move(null_mask));
+  auto lists_col =
+    cudf::make_lists_column(5,
+                            fixed_width_column_wrapper<size_type>{0, 2, 4, 6, 8, 10}.release(),
+                            std::move(structs_col),
+                            null_count,
+                            std::move(null_mask));
 
   std::vector<std::unique_ptr<cudf::column>> cols;
   cols.push_back(std::move(lists_col));
@@ -519,12 +519,12 @@ TYPED_TEST(TypedStructColumnWrapperTest, StructOfListOfStruct)
 
   std::tie(null_mask, null_count) = detail::make_null_mask(list_validity, list_validity + 5);
 
-  auto expected_lists_col = cudf::make_lists_column(
-    5,
-    std::move(fixed_width_column_wrapper<size_type>{0, 2, 4, 6, 8, 10}.release()),
-    std::move(expected_structs_col),
-    null_count,
-    std::move(null_mask));
+  auto expected_lists_col =
+    cudf::make_lists_column(5,
+                            fixed_width_column_wrapper<size_type>{0, 2, 4, 6, 8, 10}.release(),
+                            std::move(expected_structs_col),
+                            null_count,
+                            std::move(null_mask));
 
   // Test that the lists child column is as expected.
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*expected_lists_col, struct_of_list_of_struct->child(0));

From e8156d42163fb02aa90baba9be20ab89bc9ebef1 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 6 Aug 2024 17:03:10 -0400
Subject: [PATCH 631/842] Fix segmented-sort overlapped input/output indices
 (#16463)

Fixes call to CUB `DeviceSegmentedSort::SortPairs` where the input and output indices pointed to the same temp memory. The documentation from https://nvidia.github.io/cccl/cub/api/structcub_1_1DeviceSegmentedSort.html#id8 indicates the `d_values_in` and `d_values_out` memory must not overlap so using the same pointer for both created invalid output in certain conditions. The internal function was implemented to expect the input values to be updated in-place. The fix uses separate device memory for the input and output indices.

Closes #16455

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Muhammad Haseeb (https://github.com/mhaseeb123)

URL: https://github.com/rapidsai/cudf/pull/16463
---
 cpp/src/sort/segmented_sort_impl.cuh    |  4 +++-
 cpp/tests/sort/segmented_sort_tests.cpp | 26 ++++++++++++++++++++++++-
 2 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/cpp/src/sort/segmented_sort_impl.cuh b/cpp/src/sort/segmented_sort_impl.cuh
index 6d472925b30..281fdfa6b8f 100644
--- a/cpp/src/sort/segmented_sort_impl.cuh
+++ b/cpp/src/sort/segmented_sort_impl.cuh
@@ -79,6 +79,8 @@ struct column_fast_sort_fn {
                                                 stream,
                                                 rmm::mr::get_current_device_resource());
     mutable_column_view output_view = temp_col->mutable_view();
+    auto temp_indices               = cudf::column(
+      cudf::column_view(indices.type(), indices.size(), indices.head(), nullptr, 0), stream);
 
     // DeviceSegmentedSort is faster than DeviceSegmentedRadixSort at this time
     auto fast_sort_impl = [stream](bool ascending, [[maybe_unused]] auto&&... args) {
@@ -118,7 +120,7 @@ struct column_fast_sort_fn {
     fast_sort_impl(ascending,
                    input.begin<T>(),
                    output_view.begin<T>(),
-                   indices.begin<size_type>(),
+                   temp_indices.view().begin<size_type>(),
                    indices.begin<size_type>(),
                    input.size(),
                    segment_offsets.size() - 1,
diff --git a/cpp/tests/sort/segmented_sort_tests.cpp b/cpp/tests/sort/segmented_sort_tests.cpp
index da9666cbc74..f4fe2c5956a 100644
--- a/cpp/tests/sort/segmented_sort_tests.cpp
+++ b/cpp/tests/sort/segmented_sort_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,7 +21,9 @@
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/copying.hpp>
+#include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/sorting.hpp>
+#include <cudf/utilities/span.hpp>
 
 #include <type_traits>
 #include <vector>
@@ -338,3 +340,25 @@ TEST_F(SegmentedSortInt, Bool)
   result = cudf::stable_segmented_sorted_order(cudf::table_view({test_col}), segments);
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected);
 }
+
+// Specific test for fix in https://github.com/rapidsai/cudf/pull/16463
+TEST_F(SegmentedSortInt, UnbalancedOffsets)
+{
+  auto h_input = std::vector<int64_t>(3535);
+  std::iota(h_input.begin(), h_input.end(), 1);
+  std::sort(h_input.begin(), h_input.end(), std::greater<int64_t>{});
+  std::fill_n(h_input.begin(), 4, 0);
+  std::fill(h_input.begin() + 3533, h_input.end(), 10000);
+  auto d_input = cudf::detail::make_device_uvector_sync(
+    h_input, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+  auto input    = cudf::column_view(cudf::device_span<int64_t const>(d_input));
+  auto segments = cudf::test::fixed_width_column_wrapper<int32_t>({0, 4, 3533, 3535});
+  // full sort should match handcrafted input data here
+  auto expected = cudf::sort(cudf::table_view({input}));
+
+  auto input_view = cudf::table_view({input});
+  auto result     = cudf::segmented_sort_by_key(input_view, input_view, segments);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view().column(0), expected->view().column(0));
+  result = cudf::stable_segmented_sort_by_key(input_view, input_view, segments);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view().column(0), expected->view().column(0));
+}

From 6b0bff4b096ea87cd3436dba86146ed75af0f81e Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 6 Aug 2024 14:48:16 -1000
Subject: [PATCH 632/842] Disallow cudf.Series to accept column in favor of
 `._from_column` (#16454)

`cudf.Series` is a public constructor that happens to accept a private `ColumnBase` object. Many ops return Columns and is natural to want to reconstruct a `Series`.

This PR adds a `SingleColumnFrame._from_column` classmethod for instances where we need to wrap a new column in an `Index` or `Series`. This constructor also passes some unneeded validation in `ColumnAccessor` and `Series`

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16454
---
 python/cudf/cudf/core/byte_pair_encoding.py  |   6 +-
 python/cudf/cudf/core/column/categorical.py  |  16 +--
 python/cudf/cudf/core/column/methods.py      |  15 ++-
 python/cudf/cudf/core/column/numerical.py    |  12 +-
 python/cudf/cudf/core/column/string.py       |  23 ++--
 python/cudf/cudf/core/dataframe.py           | 116 +++++++++----------
 python/cudf/cudf/core/groupby/groupby.py     |  13 +--
 python/cudf/cudf/core/index.py               |  44 ++++++-
 python/cudf/cudf/core/indexed_frame.py       |  18 +--
 python/cudf/cudf/core/multiindex.py          |  19 ++-
 python/cudf/cudf/core/reshape.py             |   8 +-
 python/cudf/cudf/core/series.py              | 101 ++++++++++++----
 python/cudf/cudf/core/single_column_frame.py |  41 +++----
 python/cudf/cudf/core/tokenize_vocabulary.py |   8 +-
 python/cudf/cudf/core/tools/datetimes.py     |  11 +-
 python/cudf/cudf/core/tools/numeric.py       |  29 ++---
 python/cudf/cudf/datasets.py                 |   5 +-
 python/cudf/cudf/io/dlpack.py                |   2 +-
 python/cudf/cudf/tests/test_apply_rows.py    |   8 +-
 python/cudf/cudf/tests/test_column.py        |  44 ++++---
 python/cudf/cudf/tests/test_dataframe.py     |  26 +++--
 python/cudf/cudf/tests/test_decimal.py       |  10 +-
 python/cudf/cudf/tests/test_df_protocol.py   |   6 +-
 python/cudf/cudf/tests/test_list.py          |   2 +-
 python/cudf/cudf/tests/test_pickling.py      |   4 +-
 python/cudf/cudf/tests/test_replace.py       |   6 +-
 python/cudf/cudf/tests/test_series.py        |  10 +-
 python/cudf/cudf/tests/test_setitem.py       |  10 +-
 python/cudf/cudf/tests/test_string.py        |   2 +-
 python/cudf/cudf/tests/test_string_udfs.py   |   4 +-
 python/dask_cudf/dask_cudf/backends.py       |   7 +-
 python/dask_cudf/dask_cudf/core.py           |   2 +-
 32 files changed, 360 insertions(+), 268 deletions(-)

diff --git a/python/cudf/cudf/core/byte_pair_encoding.py b/python/cudf/cudf/core/byte_pair_encoding.py
index 4c881022ecf..6ca64a0a2be 100644
--- a/python/cudf/cudf/core/byte_pair_encoding.py
+++ b/python/cudf/cudf/core/byte_pair_encoding.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 
 from __future__ import annotations
 
@@ -27,7 +27,7 @@ class BytePairEncoder:
     def __init__(self, merges_pair: "cudf.Series"):
         self.merge_pairs = cpp_merge_pairs(merges_pair._column)
 
-    def __call__(self, text, separator: str = " "):
+    def __call__(self, text, separator: str = " ") -> cudf.Series:
         """
 
         Parameters
@@ -56,4 +56,4 @@ def __call__(self, text, separator: str = " "):
         sep = cudf.Scalar(separator, dtype="str")
         result = cpp_byte_pair_encoding(text._column, self.merge_pairs, sep)
 
-        return cudf.Series(result)
+        return cudf.Series._from_column(result)
diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index 55bfae30470..6fa69eb9cc1 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -123,7 +123,7 @@ def categories(self) -> "cudf.core.index.Index":
         return self._column.dtype.categories
 
     @property
-    def codes(self) -> "cudf.Series":
+    def codes(self) -> cudf.Series:
         """
         Return Series of codes as well as the index.
         """
@@ -132,7 +132,7 @@ def codes(self) -> "cudf.Series":
             if isinstance(self._parent, cudf.Series)
             else None
         )
-        return cudf.Series(self._column.codes, index=index)
+        return cudf.Series._from_column(self._column.codes, index=index)
 
     @property
     def ordered(self) -> bool:
@@ -918,7 +918,7 @@ def find_and_replace(
             )
             cur_categories = replaced.categories
             new_categories = cur_categories.apply_boolean_mask(
-                ~cudf.Series(cur_categories.isin(drop_values))
+                cur_categories.isin(drop_values).unary_operator("not")
             )
             replaced = replaced._set_categories(new_categories)
             df = df.dropna(subset=["new"])
@@ -943,7 +943,7 @@ def find_and_replace(
         # If a category is being replaced by an existing one, we
         # want to map it to None. If it's totally new, we want to
         # map it to the new label it is to be replaced by
-        dtype_replace = cudf.Series._from_data({None: replacement_col})
+        dtype_replace = cudf.Series._from_column(replacement_col)
         dtype_replace[dtype_replace.isin(cats_col)] = None
         new_cats_col = cats_col.find_and_replace(
             to_replace_col, dtype_replace._column
@@ -1273,12 +1273,8 @@ def _categories_equal(
             return False
         # if order doesn't matter, sort before the equals call below
         if not ordered:
-            cur_categories = cudf.Series(cur_categories).sort_values(
-                ignore_index=True
-            )
-            new_categories = cudf.Series(new_categories).sort_values(
-                ignore_index=True
-            )
+            cur_categories = cur_categories.sort_values()
+            new_categories = new_categories.sort_values()
         return cur_categories.equals(new_categories)
 
     def _set_categories(
diff --git a/python/cudf/cudf/core/column/methods.py b/python/cudf/cudf/core/column/methods.py
index 7c6f4e05577..8c46d238057 100644
--- a/python/cudf/cudf/core/column/methods.py
+++ b/python/cudf/cudf/core/column/methods.py
@@ -7,6 +7,8 @@
 from typing_extensions import Literal
 
 import cudf
+import cudf.core.column
+import cudf.core.column_accessor
 from cudf.utils.utils import NotIterable
 
 ParentType = Union["cudf.Series", "cudf.core.index.Index"]
@@ -84,14 +86,11 @@ def _return_or_inplace(
                         data=table, index=self._parent.index
                     )
             elif isinstance(self._parent, cudf.Series):
-                if retain_index:
-                    return cudf.Series(
-                        new_col,
-                        name=self._parent.name,
-                        index=self._parent.index,
-                    )
-                else:
-                    return cudf.Series(new_col, name=self._parent.name)
+                return cudf.Series._from_column(
+                    new_col,
+                    name=self._parent.name,
+                    index=self._parent.index if retain_index else None,
+                )
             elif isinstance(self._parent, cudf.BaseIndex):
                 return cudf.Index(new_col, name=self._parent.name)
             else:
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index c326a10c844..df27134d458 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -555,11 +555,8 @@ def can_cast_safely(self, to_dtype: DtypeObj) -> bool:
 
                 if self.dtype.kind == "f":
                     # Exclude 'np.inf', '-np.inf'
-                    s = cudf.Series(self)
-                    # TODO: replace np.inf with cudf scalar when
-                    # https://github.com/rapidsai/cudf/pull/6297 merges
-                    non_infs = s[~((s == np.inf) | (s == -np.inf))]
-                    col = non_infs._column
+                    not_inf = (self != np.inf) & (self != -np.inf)
+                    col = self.apply_boolean_mask(not_inf)
                 else:
                     col = self
 
@@ -599,8 +596,7 @@ def can_cast_safely(self, to_dtype: DtypeObj) -> bool:
             else:
                 filled = self.fillna(0)
                 return (
-                    cudf.Series(filled).astype(to_dtype).astype(filled.dtype)
-                    == cudf.Series(filled)
+                    filled.astype(to_dtype).astype(filled.dtype) == filled
                 ).all()
 
         # want to cast float to int:
@@ -615,7 +611,7 @@ def can_cast_safely(self, to_dtype: DtypeObj) -> bool:
             # NOTE(seberg): it would make sense to limit to the mantissa range.
             if (float(self.min()) >= min_) and (float(self.max()) <= max_):
                 filled = self.fillna(0)
-                return (cudf.Series(filled) % 1 == 0).all()
+                return (filled % 1 == 0).all()
             else:
                 return False
 
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index b422ff86b17..1a4b558749d 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -358,7 +358,7 @@ def cat(self, others=None, sep=None, na_rep=None):
             )
 
         if len(data) == 1 and data.null_count == 1:
-            data = [""]
+            data = cudf.core.column.as_column("", length=len(data))
         # We only want to keep the index if we are adding something to each
         # row, not if we are joining all the rows into a single string.
         out = self._return_or_inplace(data, retain_index=others is not None)
@@ -3623,7 +3623,7 @@ def findall(self, pat: str, flags: int = 0) -> SeriesOrIndex:
         data = libstrings.findall(self._column, pat, flags)
         return self._return_or_inplace(data)
 
-    def find_multiple(self, patterns: SeriesOrIndex) -> "cudf.Series":
+    def find_multiple(self, patterns: SeriesOrIndex) -> cudf.Series:
         """
         Find all first occurrences of patterns in the Series/Index.
 
@@ -3679,12 +3679,12 @@ def find_multiple(self, patterns: SeriesOrIndex) -> "cudf.Series":
                 f"got: {patterns_column.dtype}"
             )
 
-        return cudf.Series(
+        return cudf.Series._from_column(
             libstrings.find_multiple(self._column, patterns_column),
+            name=self._parent.name,
             index=self._parent.index
             if isinstance(self._parent, cudf.Series)
             else self._parent,
-            name=self._parent.name,
         )
 
     def isempty(self) -> SeriesOrIndex:
@@ -4376,14 +4376,9 @@ def code_points(self) -> SeriesOrIndex:
         2    99
         dtype: int32
         """
-
-        new_col = libstrings.code_points(self._column)
-        if isinstance(self._parent, cudf.Series):
-            return cudf.Series(new_col, name=self._parent.name)
-        elif isinstance(self._parent, cudf.BaseIndex):
-            return cudf.Index(new_col, name=self._parent.name)
-        else:
-            return new_col
+        return self._return_or_inplace(
+            libstrings.code_points(self._column), retain_index=False
+        )
 
     def translate(self, table: dict) -> SeriesOrIndex:
         """
@@ -4694,7 +4689,9 @@ def character_tokenize(self) -> SeriesOrIndex:
         if isinstance(self._parent, cudf.Series):
             lengths = self.len().fillna(0)
             index = self._parent.index.repeat(lengths)
-            return cudf.Series(result_col, name=self._parent.name, index=index)
+            return cudf.Series._from_column(
+                result_col, name=self._parent.name, index=index
+            )
         elif isinstance(self._parent, cudf.BaseIndex):
             return cudf.Index(result_col, name=self._parent.name)
         else:
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 865d2706ca3..a53c7bcc63c 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -382,7 +382,10 @@ def _setitem_tuple_arg(self, key, value):
                 length = len(idx) if idx is not None else 1
                 value = as_column(value, length=length)
 
-            new_col = cudf.Series(value, index=idx)
+            if isinstance(value, ColumnBase):
+                new_col = cudf.Series._from_column(value, index=idx)
+            else:
+                new_col = cudf.Series(value, index=idx)
             if len(self._frame.index) != 0:
                 new_col = new_col._align_to_index(
                     self._frame.index, how="right"
@@ -500,28 +503,33 @@ def __getitem__(self, arg):
             return frame._slice(row_spec.key)
         elif isinstance(row_spec, indexing_utils.ScalarIndexer):
             result = frame._gather(row_spec.key, keep_index=True)
+            new_name = result.index[0]
+            new_index = ensure_index(result.keys())
             # Attempt to turn into series.
-            try:
-                # Behaviour difference from pandas, which will merrily
-                # turn any heterogeneous set of columns into a series if
-                # you only ask for one row.
-                new_name = result.index[0]
-                result = Series._concat(
-                    [result[name] for name in column_names],
-                    index=result.keys(),
-                )
-                result.name = new_name
-                return result
-            except TypeError:
-                # Couldn't find a common type, Hence:
-                # Raise in pandas compatibility mode,
-                # or just return a 1xN dataframe otherwise
-                if cudf.get_option("mode.pandas_compatible"):
-                    raise TypeError(
-                        "All columns need to be of same type, please "
-                        "typecast to common dtype."
+            if len(column_names) == 0:
+                return Series([], index=new_index, name=new_name)
+            else:
+                try:
+                    # Behaviour difference from pandas, which will merrily
+                    # turn any heterogeneous set of columns into a series if
+                    # you only ask for one row.
+                    ser = Series._concat(
+                        [result[name] for name in column_names],
                     )
-                return result
+                except TypeError as err:
+                    # Couldn't find a common type, Hence:
+                    # Raise in pandas compatibility mode,
+                    # or just return a 1xN dataframe otherwise
+                    if cudf.get_option("mode.pandas_compatible"):
+                        raise TypeError(
+                            "All columns need to be of same type, please "
+                            "typecast to common dtype."
+                        ) from err
+                    return result
+                else:
+                    ser.index = new_index
+                    ser.name = new_name
+                    return ser
         elif isinstance(row_spec, indexing_utils.EmptyIndexer):
             return frame._empty_like(keep_index=True)
         assert_never(row_spec)
@@ -1488,14 +1496,14 @@ def __delitem__(self, name):
         self._drop_column(name)
 
     @_performance_tracking
-    def memory_usage(self, index=True, deep=False):
+    def memory_usage(self, index=True, deep=False) -> cudf.Series:
         mem_usage = [col.memory_usage for col in self._data.columns]
         names = [str(name) for name in self._data.names]
         if index:
             mem_usage.append(self.index.memory_usage())
             names.append("Index")
-        return Series._from_data(
-            data={None: as_column(mem_usage)},
+        return Series._from_column(
+            as_column(mem_usage),
             index=cudf.Index(names),
         )
 
@@ -1752,7 +1760,7 @@ def _concat(
             if 1 == first_data_column_position:
                 table_index = cudf.Index(cols[0])
             elif first_data_column_position > 1:
-                table_index = DataFrame._from_data(
+                table_index = cudf.MultiIndex._from_data(
                     data=dict(
                         zip(
                             indices[:first_data_column_position],
@@ -3803,7 +3811,9 @@ def agg(self, aggs, axis=None):
                     col_empty = column_empty(
                         len(idxs), dtype=col.dtype, masked=True
                     )
-                    ans = cudf.Series(data=col_empty, index=idxs)
+                    ans = cudf.Series._from_column(
+                        col_empty, index=cudf.Index(idxs)
+                    )
                     if isinstance(aggs.get(key), abc.Iterable):
                         # TODO : Allow simultaneous pass for multi-aggregation
                         # as a future optimization
@@ -4801,7 +4811,7 @@ def _func(x):  # pragma: no cover
         # this could be written as a single kernel
         result = {}
         for name, col in self._data.items():
-            apply_sr = Series._from_data({None: col})
+            apply_sr = Series._from_column(col)
             result[name] = apply_sr.apply(_func)._column
 
         return DataFrame._from_data(result, index=self.index)
@@ -6083,8 +6093,8 @@ def quantile(
 
             if q_is_number:
                 result = result.transpose()
-                return Series(
-                    data=result._columns[0], index=result.index, name=q
+                return Series._from_column(
+                    result._columns[0], name=q, index=result.index
                 )
         else:
             # Ensure that qs is non-scalar so that we always get a column back.
@@ -6346,13 +6356,9 @@ def count(self, axis=0, numeric_only=False):
         if axis != 0:
             raise NotImplementedError("Only axis=0 is currently supported.")
         length = len(self)
-        return Series._from_data(
-            {
-                None: as_column(
-                    [length - col.null_count for col in self._columns]
-                )
-            },
-            cudf.Index(self._data.names),
+        return Series._from_column(
+            as_column([length - col.null_count for col in self._columns]),
+            index=cudf.Index(self._data.names),
         )
 
     _SUPPORT_AXIS_LOOKUP = {
@@ -6480,7 +6486,7 @@ def _reduce(
                     )
                 else:
                     idx = cudf.Index(source._data.names)
-                return Series._from_data({None: as_column(result)}, idx)
+                return Series._from_column(as_column(result), index=idx)
         elif axis == 1:
             return source._apply_cupy_method_axis_1(op, **kwargs)
         else:
@@ -6710,11 +6716,7 @@ def _apply_cupy_method_axis_1(self, method, *args, **kwargs):
                 result = result.set_mask(
                     cudf._lib.transform.bools_to_mask(mask._column)
                 )
-            return Series(
-                result,
-                index=self.index,
-                dtype=result_dtype,
-            )
+            return Series._from_column(result, index=self.index)
         else:
             result_df = DataFrame(result).set_index(self.index)
             result_df._set_columns_like(prepared._data)
@@ -7302,9 +7304,7 @@ def unnamed_group_generator():
 
         # Construct the resulting dataframe / series
         if not has_unnamed_levels:
-            result = Series._from_data(
-                data={None: stacked[0]}, index=new_index
-            )
+            result = Series._from_column(stacked[0], index=new_index)
         else:
             if unnamed_level_values.nlevels == 1:
                 unnamed_level_values = unnamed_level_values.get_level_values(0)
@@ -7445,10 +7445,8 @@ def to_struct(self, name=None):
             size=len(self),
             offset=0,
         )
-        return cudf.Series._from_data(
-            cudf.core.column_accessor.ColumnAccessor(
-                {name: col}, verify=False
-            ),
+        return cudf.Series._from_column(
+            col,
             index=self.index,
             name=name,
         )
@@ -7935,12 +7933,10 @@ def eval(self, expr: str, inplace: bool = False, **kwargs):
                 raise ValueError(
                     "Cannot operate inplace if there is no assignment"
                 )
-            return Series._from_data(
-                {
-                    None: libcudf.transform.compute_column(
-                        [*self._columns], self._column_names, statements[0]
-                    )
-                }
+            return Series._from_column(
+                libcudf.transform.compute_column(
+                    [*self._columns], self._column_names, statements[0]
+                )
             )
 
         targets = []
@@ -8484,7 +8480,9 @@ def _get_non_null_cols_and_dtypes(col_idxs, list_of_columns):
     return non_null_columns, dtypes
 
 
-def _find_common_dtypes_and_categories(non_null_columns, dtypes):
+def _find_common_dtypes_and_categories(
+    non_null_columns, dtypes
+) -> dict[Any, ColumnBase]:
     # A mapping of {idx: categories}, where `categories` is a
     # column of all the unique categorical values from each
     # categorical column across all input frames
@@ -8500,9 +8498,9 @@ def _find_common_dtypes_and_categories(non_null_columns, dtypes):
             isinstance(col, cudf.core.column.CategoricalColumn) for col in cols
         ):
             # Combine and de-dupe the categories
-            categories[idx] = cudf.Series(
-                concat_columns([col.categories for col in cols])
-            )._column.unique()
+            categories[idx] = concat_columns(
+                [col.categories for col in cols]
+            ).unique()
             # Set the column dtype to the codes' dtype. The categories
             # will be re-assigned at the end
             dtypes[idx] = min_signed_type(len(categories[idx]))
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index 3cfbd1d736a..92c4b73ceaa 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -458,12 +458,11 @@ def size(self):
         """
         Return the size of each group.
         """
+        col = cudf.core.column.column_empty(
+            len(self.obj), "int8", masked=False
+        )
         return (
-            cudf.Series(
-                cudf.core.column.column_empty(
-                    len(self.obj), "int8", masked=False
-                )
-            )
+            cudf.Series._from_column(col)
             .groupby(self.grouping, sort=self._sort, dropna=self._dropna)
             .agg("size")
         )
@@ -484,7 +483,7 @@ def cumcount(self, ascending: bool = True):
                 "ascending is currently not implemented."
             )
         return (
-            cudf.Series(
+            cudf.Series._from_column(
                 cudf.core.column.column_empty(
                     len(self.obj), "int8", masked=False
                 ),
@@ -1069,7 +1068,7 @@ def ngroup(self, ascending=True):
             # Count descending from num_groups - 1 to 0
             groups = range(num_groups - 1, -1, -1)
 
-        group_ids = cudf.Series._from_data({None: as_column(groups)})
+        group_ids = cudf.Series._from_column(as_column(groups))
 
         if has_null_group:
             group_ids.iloc[-1] = cudf.NA
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 0d29ef07e7d..094da09ab08 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -60,7 +60,7 @@
 from cudf.utils.utils import _warn_no_dask_cudf, search_range
 
 if TYPE_CHECKING:
-    from collections.abc import Generator, Iterable
+    from collections.abc import Generator, Hashable, Iterable
     from datetime import tzinfo
 
 
@@ -1071,6 +1071,16 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
 
         return NotImplemented
 
+    @classmethod
+    @_performance_tracking
+    def _from_column(
+        cls, column: ColumnBase, *, name: Hashable = None
+    ) -> Self:
+        ca = cudf.core.column_accessor.ColumnAccessor(
+            {name: column}, verify=False
+        )
+        return _index_from_data(ca)
+
     @classmethod
     @_performance_tracking
     def _from_data(cls, data: MutableMapping, name: Any = no_default) -> Self:
@@ -1092,8 +1102,30 @@ def _from_data_like_self(
     @classmethod
     @_performance_tracking
     def from_arrow(cls, obj):
+        """Create from PyArrow Array/ChunkedArray.
+
+        Parameters
+        ----------
+        array : PyArrow Array/ChunkedArray
+            PyArrow Object which has to be converted.
+
+        Raises
+        ------
+        TypeError for invalid input type.
+
+        Returns
+        -------
+        SingleColumnFrame
+
+        Examples
+        --------
+        >>> import cudf
+        >>> import pyarrow as pa
+        >>> cudf.Index.from_arrow(pa.array(["a", "b", None]))
+        Index(['a', 'b', <NA>], dtype='object')
+        """
         try:
-            return cls(ColumnBase.from_arrow(obj))
+            return cls._from_column(ColumnBase.from_arrow(obj))
         except TypeError:
             # Try interpreting object as a MultiIndex before failing.
             return cudf.MultiIndex.from_arrow(obj)
@@ -1297,22 +1329,22 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
             return _return_get_indexer_result(result.values)
 
         scatter_map, indices = libcudf.join.join([lcol], [rcol], how="inner")
-        (result,) = libcudf.copying.scatter([indices], scatter_map, [result])
-        result_series = cudf.Series(result)
+        result = libcudf.copying.scatter([indices], scatter_map, [result])[0]
+        result_series = cudf.Series._from_column(result)
 
         if method in {"ffill", "bfill", "pad", "backfill"}:
             result_series = _get_indexer_basic(
                 index=self,
                 positions=result_series,
                 method=method,
-                target_col=cudf.Series(needle),
+                target_col=cudf.Series._from_column(needle),
                 tolerance=tolerance,
             )
         elif method == "nearest":
             result_series = _get_nearest_indexer(
                 index=self,
                 positions=result_series,
-                target_col=cudf.Series(needle),
+                target_col=cudf.Series._from_column(needle),
                 tolerance=tolerance,
             )
         elif method is not None:
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 0678ebfdd81..24d947a574a 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -35,6 +35,7 @@
     is_list_like,
     is_scalar,
 )
+from cudf.core._base_index import BaseIndex
 from cudf.core._compat import PANDAS_LT_300
 from cudf.core.buffer import acquire_spill_lock
 from cudf.core.column import ColumnBase, as_column
@@ -67,7 +68,6 @@
         Dtype,
         NotImplementedType,
     )
-    from cudf.core._base_index import BaseIndex
 
 
 doc_reset_index_template = """
@@ -304,6 +304,10 @@ def _from_data(
         index: BaseIndex | None = None,
     ):
         out = super()._from_data(data)
+        if not (index is None or isinstance(index, BaseIndex)):
+            raise ValueError(
+                f"index must be None or a cudf.Index not {type(index).__name__}"
+            )
         out._index = RangeIndex(out._data.nrows) if index is None else index
         return out
 
@@ -2934,8 +2938,8 @@ def hash_values(self, method="murmur3", seed=None):
         # Note that both Series and DataFrame return Series objects from this
         # calculation, necessitating the unfortunate circular reference to the
         # child class here.
-        return cudf.Series._from_data(
-            {None: libcudf.hash.hash([*self._columns], method, seed)},
+        return cudf.Series._from_column(
+            libcudf.hash.hash([*self._columns], method, seed),
             index=self.index,
         )
 
@@ -3219,13 +3223,13 @@ def duplicated(self, subset=None, keep="first"):
         distinct = libcudf.stream_compaction.distinct_indices(
             columns, keep=keep
         )
-        (result,) = libcudf.copying.scatter(
+        result = libcudf.copying.scatter(
             [cudf.Scalar(False, dtype=bool)],
             distinct,
             [as_column(True, length=len(self), dtype=bool)],
             bounds_check=False,
-        )
-        return cudf.Series(result, index=self.index)
+        )[0]
+        return cudf.Series._from_column(result, index=self.index)
 
     @_performance_tracking
     def _empty_like(self, keep_index=True) -> Self:
@@ -3506,7 +3510,7 @@ def _apply(self, func, kernel_getter, *args, **kwargs):
         col = _post_process_output_col(ans_col, retty)
 
         col.set_base_mask(libcudf.transform.bools_to_mask(ans_mask))
-        result = cudf.Series._from_data({None: col}, self.index)
+        result = cudf.Series._from_column(col, index=self.index)
 
         return result
 
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index 2788455aebf..9646b34830f 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -702,12 +702,8 @@ def _compute_validity_mask(self, index, row_tuple, max_length):
             data_table = cudf.concat(
                 [
                     frame,
-                    cudf.DataFrame(
-                        {
-                            "idx": cudf.Series(
-                                column.as_column(range(len(frame)))
-                            )
-                        }
+                    cudf.DataFrame._from_data(
+                        {"idx": column.as_column(range(len(frame)))}
                     ),
                 ],
                 axis=1,
@@ -786,7 +782,7 @@ def _index_and_downcast(self, result, index, index_key):
             out_index.insert(
                 out_index._num_columns,
                 k,
-                cudf.Series._from_data({None: index._data.columns[k]}),
+                cudf.Series._from_column(index._data.columns[k]),
             )
 
         # determine if we should downcast from a DataFrame to a Series
@@ -852,7 +848,10 @@ def _get_row_major(
         valid_indices = self._get_valid_indices_by_tuple(
             df.index, row_tuple, len(df.index)
         )
-        indices = cudf.Series(valid_indices)
+        if isinstance(valid_indices, column.ColumnBase):
+            indices = cudf.Series._from_column(valid_indices)
+        else:
+            indices = cudf.Series(valid_indices)
         result = df.take(indices)
         final = self._index_and_downcast(result, result.index, row_tuple)
         return final
@@ -1925,8 +1924,8 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None):
             *join_keys,
             how="inner",
         )
-        (result,) = libcudf.copying.scatter([indices], scatter_map, [result])
-        result_series = cudf.Series(result)
+        result = libcudf.copying.scatter([indices], scatter_map, [result])[0]
+        result_series = cudf.Series._from_column(result)
 
         if method in {"ffill", "bfill", "pad", "backfill"}:
             result_series = _get_indexer_basic(
diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py
index e7248977b1d..52a55760d4a 100644
--- a/python/cudf/cudf/core/reshape.py
+++ b/python/cudf/cudf/core/reshape.py
@@ -484,9 +484,7 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
         if len(new_objs) == 1 and not ignore_index:
             return new_objs[0]
         else:
-            return cudf.Series._concat(
-                objs, axis=axis, index=None if ignore_index else True
-            )
+            return cudf.Series._concat(objs, axis=axis, index=not ignore_index)
     elif typ is cudf.MultiIndex:
         return cudf.MultiIndex._concat(objs)
     elif issubclass(typ, cudf.Index):
@@ -632,7 +630,7 @@ def melt(
     def _tile(A, reps):
         series_list = [A] * reps
         if reps > 0:
-            return cudf.Series._concat(objs=series_list, index=None)
+            return cudf.Series._concat(objs=series_list, index=False)
         else:
             return cudf.Series([], dtype=A.dtype)
 
@@ -661,7 +659,7 @@ def _tile(A, reps):
 
     # Step 3: add values
     mdata[value_name] = cudf.Series._concat(
-        objs=[frame[val] for val in value_vars], index=None
+        objs=[frame[val] for val in value_vars], index=False
     )
 
     return cudf.DataFrame(mdata)
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 929af5cd981..de57ac5f290 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -69,6 +69,8 @@
 from cudf.utils.performance_tracking import _performance_tracking
 
 if TYPE_CHECKING:
+    import pyarrow as pa
+
     from cudf._typing import (
         ColumnLike,
         DataFrameOrSeries,
@@ -294,8 +296,8 @@ def __getitem__(self, arg: Any) -> ScalarLike | DataFrameOrSeries:
             return result
         try:
             arg = self._loc_to_iloc(arg)
-        except (TypeError, KeyError, IndexError, ValueError):
-            raise KeyError(arg)
+        except (TypeError, KeyError, IndexError, ValueError) as err:
+            raise KeyError(arg) from err
 
         return self._frame.iloc[arg]
 
@@ -394,8 +396,10 @@ def _loc_to_iloc(self, arg):
             return _indices_from_labels(self._frame, arg)
 
         else:
-            arg = cudf.core.series.Series(cudf.core.column.as_column(arg))
-            if arg.dtype in (bool, np.bool_):
+            arg = cudf.core.series.Series._from_column(
+                cudf.core.column.as_column(arg)
+            )
+            if arg.dtype.kind == "b":
                 return arg
             else:
                 indices = _indices_from_labels(self._frame, arg)
@@ -510,7 +514,37 @@ def from_categorical(cls, categorical, codes=None):
         col = cudf.core.column.categorical.pandas_categorical_as_column(
             categorical, codes=codes
         )
-        return Series(data=col)
+        return Series._from_column(col)
+
+    @classmethod
+    @_performance_tracking
+    def from_arrow(cls, array: pa.Array):
+        """Create from PyArrow Array/ChunkedArray.
+
+        Parameters
+        ----------
+        array : PyArrow Array/ChunkedArray
+            PyArrow Object which has to be converted.
+
+        Raises
+        ------
+        TypeError for invalid input type.
+
+        Returns
+        -------
+        SingleColumnFrame
+
+        Examples
+        --------
+        >>> import cudf
+        >>> import pyarrow as pa
+        >>> cudf.Series.from_arrow(pa.array(["a", "b", None]))
+        0       a
+        1       b
+        2    <NA>
+        dtype: object
+        """
+        return cls._from_column(ColumnBase.from_arrow(array))
 
     @classmethod
     @_performance_tracking
@@ -560,7 +594,8 @@ def from_masked_array(cls, data, mask, null_count=None):
         dtype: int64
         """
         col = as_column(data).set_mask(mask)
-        return cls(data=col)
+        ca = ColumnAccessor({None: col}, verify=False)
+        return cls._from_data(ca)
 
     @_performance_tracking
     def __init__(
@@ -586,10 +621,10 @@ def __init__(
             column = as_column(data, nan_as_null=nan_as_null, dtype=dtype)
             if isinstance(data, (pd.Series, Series)):
                 index_from_data = ensure_index(data.index)
-        elif isinstance(data, ColumnAccessor):
+        elif isinstance(data, (ColumnAccessor, ColumnBase)):
             raise TypeError(
                 "Use cudf.Series._from_data for constructing a Series from "
-                "ColumnAccessor"
+                "ColumnAccessor or a ColumnBase"
             )
         elif isinstance(data, dict):
             if not data:
@@ -656,6 +691,18 @@ def __init__(
             self._index = second_index
         self._check_data_index_length_match()
 
+    @classmethod
+    @_performance_tracking
+    def _from_column(
+        cls,
+        column: ColumnBase,
+        *,
+        name: abc.Hashable = None,
+        index: BaseIndex | None = None,
+    ) -> Self:
+        ca = ColumnAccessor({name: column}, verify=False)
+        return cls._from_data(ca, index=index)
+
     @classmethod
     @_performance_tracking
     def _from_data(
@@ -1535,17 +1582,21 @@ def dtype(self):
 
     @classmethod
     @_performance_tracking
-    def _concat(cls, objs, axis=0, index=True):
+    def _concat(cls, objs, axis=0, index: bool = True):
         # Concatenate index if not provided
         if index is True:
             if isinstance(objs[0].index, cudf.MultiIndex):
-                index = cudf.MultiIndex._concat([o.index for o in objs])
+                result_index = cudf.MultiIndex._concat([o.index for o in objs])
             else:
                 with warnings.catch_warnings():
                     warnings.simplefilter("ignore", FutureWarning)
-                    index = cudf.core.index.Index._concat(
+                    result_index = cudf.core.index.Index._concat(
                         [o.index for o in objs]
                     )
+        elif index is False:
+            result_index = None
+        else:
+            raise ValueError(f"{index=} must be a bool")
 
         names = {obj.name for obj in objs}
         if len(names) == 1:
@@ -1597,7 +1648,9 @@ def _concat(cls, objs, axis=0, index=True):
         if len(objs):
             col = col._with_type_metadata(objs[0].dtype)
 
-        return cls(data=col, index=index, name=name)
+        return cls._from_data(
+            ColumnAccessor({name: col}, verify=False), index=result_index
+        )
 
     @property  # type: ignore
     @_performance_tracking
@@ -2709,8 +2762,8 @@ def mode(self, dropna=True):
         if len(val_counts) > 0:
             val_counts = val_counts[val_counts == val_counts.iloc[0]]
 
-        return Series._from_data(
-            {self.name: val_counts.index.sort_values()._column}, name=self.name
+        return Series._from_column(
+            val_counts.index.sort_values()._column, name=self.name
         )
 
     @_performance_tracking
@@ -2999,8 +3052,8 @@ def isin(self, values):
                 f"to isin(), you passed a [{type(values).__name__}]"
             )
 
-        return Series._from_data(
-            {self.name: self._column.isin(values)}, index=self.index
+        return Series._from_column(
+            self._column.isin(values), name=self.name, index=self.index
         )
 
     @_performance_tracking
@@ -3036,7 +3089,7 @@ def unique(self):
         res = self._column.unique()
         if cudf.get_option("mode.pandas_compatible"):
             return res.values
-        return Series(res, name=self.name)
+        return Series._from_column(res, name=self.name)
 
     @_performance_tracking
     def value_counts(
@@ -3268,8 +3321,9 @@ def quantile(
         if return_scalar:
             return result
 
-        return Series._from_data(
-            data={self.name: result},
+        return Series._from_column(
+            result,
+            name=self.name,
             index=cudf.Index(np_array_q) if quant_index else None,
         )
 
@@ -3351,8 +3405,9 @@ def digitize(self, bins, right=False):
         3    2
         dtype: int32
         """
-        return Series(
-            cudf.core.column.numerical.digitize(self._column, bins, right)
+        return Series._from_column(
+            cudf.core.column.numerical.digitize(self._column, bins, right),
+            name=self.name,
         )
 
     @_performance_tracking
@@ -5293,10 +5348,10 @@ def isclose(a, b, rtol=1e-05, atol=1e-08, equal_nan=False):
     elif b_col.null_count:
         null_values = b_col.isnull()
     else:
-        return Series(result_col, index=index)
+        return Series._from_column(result_col, index=index)
 
     result_col[null_values] = False
     if equal_nan is True and a_col.null_count and b_col.null_count:
         result_col[equal_nulls] = True
 
-    return Series(result_col, index=index)
+    return Series._from_column(result_col, index=index)
diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py
index a5ff1223791..eb6714029cf 100644
--- a/python/cudf/cudf/core/single_column_frame.py
+++ b/python/cudf/cudf/core/single_column_frame.py
@@ -15,11 +15,14 @@
     is_numeric_dtype,
 )
 from cudf.core.column import ColumnBase, as_column
+from cudf.core.column_accessor import ColumnAccessor
 from cudf.core.frame import Frame
 from cudf.utils.performance_tracking import _performance_tracking
 from cudf.utils.utils import NotIterable
 
 if TYPE_CHECKING:
+    from collections.abc import Hashable
+
     import cupy
     import numpy
     import pyarrow as pa
@@ -112,35 +115,17 @@ def values_host(self) -> numpy.ndarray:  # noqa: D102
 
     @classmethod
     @_performance_tracking
-    def from_arrow(cls, array) -> Self:
-        """Create from PyArrow Array/ChunkedArray.
-
-        Parameters
-        ----------
-        array : PyArrow Array/ChunkedArray
-            PyArrow Object which has to be converted.
-
-        Raises
-        ------
-        TypeError for invalid input type.
-
-        Returns
-        -------
-        SingleColumnFrame
+    def _from_column(
+        cls, column: ColumnBase, *, name: Hashable = None
+    ) -> Self:
+        """Constructor for a single Column."""
+        ca = ColumnAccessor({name: column}, verify=False)
+        return cls._from_data(ca)
 
-        Examples
-        --------
-        >>> import cudf
-        >>> import pyarrow as pa
-        >>> cudf.Index.from_arrow(pa.array(["a", "b", None]))
-        Index(['a', 'b', None], dtype='object')
-        >>> cudf.Series.from_arrow(pa.array(["a", "b", None]))
-        0       a
-        1       b
-        2    <NA>
-        dtype: object
-        """
-        return cls(ColumnBase.from_arrow(array))
+    @classmethod
+    @_performance_tracking
+    def from_arrow(cls, array) -> Self:
+        raise NotImplementedError
 
     @_performance_tracking
     def to_arrow(self) -> pa.Array:
diff --git a/python/cudf/cudf/core/tokenize_vocabulary.py b/python/cudf/cudf/core/tokenize_vocabulary.py
index afb3496311b..99d85c0c5c0 100644
--- a/python/cudf/cudf/core/tokenize_vocabulary.py
+++ b/python/cudf/cudf/core/tokenize_vocabulary.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 
 from __future__ import annotations
 
@@ -22,7 +22,9 @@ class TokenizeVocabulary:
     def __init__(self, vocabulary: "cudf.Series"):
         self.vocabulary = cpp_tokenize_vocabulary(vocabulary._column)
 
-    def tokenize(self, text, delimiter: str = "", default_id: int = -1):
+    def tokenize(
+        self, text, delimiter: str = "", default_id: int = -1
+    ) -> cudf.Series:
         """
         Parameters
         ----------
@@ -45,4 +47,4 @@ def tokenize(self, text, delimiter: str = "", default_id: int = -1):
             text._column, self.vocabulary, delim, default_id
         )
 
-        return cudf.Series(result)
+        return cudf.Series._from_column(result)
diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py
index c6e2b5d10e1..2f77778116f 100644
--- a/python/cudf/cudf/core/tools/datetimes.py
+++ b/python/cudf/cudf/core/tools/datetimes.py
@@ -18,6 +18,8 @@
 )
 from cudf.api.types import is_integer, is_scalar
 from cudf.core import column
+from cudf.core.column_accessor import ColumnAccessor
+from cudf.core.index import ensure_index
 
 # https://github.com/pandas-dev/pandas/blob/2.2.x/pandas/core/tools/datetimes.py#L1112
 _unit_map = {
@@ -275,7 +277,7 @@ def to_datetime(
                 format=format,
                 utc=utc,
             )
-            return cudf.Series(col, index=arg.index)
+            return cudf.Series._from_column(col, index=arg.index)
         else:
             col = _process_col(
                 col=column.as_column(arg),
@@ -286,9 +288,12 @@ def to_datetime(
                 utc=utc,
             )
             if isinstance(arg, (cudf.BaseIndex, pd.Index)):
-                return cudf.Index(col, name=arg.name)
+                ca = ColumnAccessor({arg.name: col}, verify=False)
+                return cudf.DatetimeIndex._from_data(ca)
             elif isinstance(arg, (cudf.Series, pd.Series)):
-                return cudf.Series(col, index=arg.index, name=arg.name)
+                return cudf.Series._from_column(
+                    col, name=arg.name, index=ensure_index(arg.index)
+                )
             elif is_scalar(arg):
                 return col.element_indexing(0)
             else:
diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py
index 07158e4ee61..8b95f6f6a04 100644
--- a/python/cudf/cudf/core/tools/numeric.py
+++ b/python/cudf/cudf/core/tools/numeric.py
@@ -1,6 +1,8 @@
 # Copyright (c) 2018-2024, NVIDIA CORPORATION.
+from __future__ import annotations
 
 import warnings
+from typing import TYPE_CHECKING
 
 import numpy as np
 import pandas as pd
@@ -11,8 +13,12 @@
 from cudf.api.types import _is_non_decimal_numeric_dtype, is_string_dtype
 from cudf.core.column import as_column
 from cudf.core.dtypes import CategoricalDtype
+from cudf.core.index import ensure_index
 from cudf.utils.dtypes import can_convert_to_column
 
+if TYPE_CHECKING:
+    from cudf.core.column import ColumnBase
+
 
 def to_numeric(arg, errors="raise", downcast=None):
     """
@@ -164,7 +170,9 @@ def to_numeric(arg, errors="raise", downcast=None):
                     break
 
     if isinstance(arg, (cudf.Series, pd.Series)):
-        return cudf.Series(col, index=arg.index, name=arg.name)
+        return cudf.Series._from_column(
+            col, name=arg.name, index=ensure_index(arg.index)
+        )
     else:
         if col.has_nulls():
             # To match pandas, always return a floating type filled with nan.
@@ -226,25 +234,10 @@ def _convert_str_col(col, errors, _downcast=None):
             raise ValueError("Unable to convert some strings to numerics.")
 
 
-def _proc_inf_empty_strings(col):
+def _proc_inf_empty_strings(col: ColumnBase) -> ColumnBase:
     """Handles empty and infinity strings"""
     col = libstrings.to_lower(col)
-    col = _proc_empty_strings(col)
-    col = _proc_inf_strings(col)
-    return col
-
-
-def _proc_empty_strings(col):
-    """Replaces empty strings with NaN"""
-    s = cudf.Series(col)
-    s = s.where(s != "", "NaN")
-    return s._column
-
-
-def _proc_inf_strings(col):
-    """Convert "inf/infinity" strings into "Inf", the native string
-    representing infinity in libcudf
-    """
+    col = col.find_and_replace(as_column([""]), as_column(["NaN"]))
     # TODO: This can be handled by libcudf in
     # future see StringColumn.as_numerical_column
     col = libstrings.replace_multi(
diff --git a/python/cudf/cudf/datasets.py b/python/cudf/cudf/datasets.py
index 7b183d5f1a3..dbabaacf6b5 100644
--- a/python/cudf/cudf/datasets.py
+++ b/python/cudf/cudf/datasets.py
@@ -5,7 +5,6 @@
 
 import cudf
 from cudf._lib.transform import bools_to_mask
-from cudf.core.column_accessor import ColumnAccessor
 
 __all__ = ["timeseries", "randomdata"]
 
@@ -73,9 +72,7 @@ def timeseries(
         )
         mask_buf = bools_to_mask(cudf.core.column.as_column(mask))
         masked_col = gdf[col]._column.set_mask(mask_buf)
-        gdf[col] = cudf.Series._from_data(
-            ColumnAccessor({None: masked_col}), index=gdf.index
-        )
+        gdf[col] = cudf.Series._from_column(masked_col, index=gdf.index)
 
     return gdf
 
diff --git a/python/cudf/cudf/io/dlpack.py b/python/cudf/cudf/io/dlpack.py
index d3d99aab0cd..1347b2cc38f 100644
--- a/python/cudf/cudf/io/dlpack.py
+++ b/python/cudf/cudf/io/dlpack.py
@@ -71,7 +71,7 @@ def to_dlpack(cudf_obj):
     if isinstance(cudf_obj, (cudf.DataFrame, cudf.Series, cudf.BaseIndex)):
         gdf = cudf_obj
     elif isinstance(cudf_obj, ColumnBase):
-        gdf = cudf.Series._from_data({None: cudf_obj})
+        gdf = cudf.Series._from_column(cudf_obj)
     else:
         raise TypeError(
             f"Input of type {type(cudf_obj)} cannot be converted "
diff --git a/python/cudf/cudf/tests/test_apply_rows.py b/python/cudf/cudf/tests/test_apply_rows.py
index a11022c1a17..f9b0d9c1e78 100644
--- a/python/cudf/cudf/tests/test_apply_rows.py
+++ b/python/cudf/cudf/tests/test_apply_rows.py
@@ -27,8 +27,12 @@ def test_dataframe_apply_rows(dtype, has_nulls, pessimistic):
         gdf_series_expected = gdf_series_a * gdf_series_b
     else:
         # optimistically ignore the null masks
-        a = cudf.Series(column.build_column(gdf_series_a.data, dtype))
-        b = cudf.Series(column.build_column(gdf_series_b.data, dtype))
+        a = cudf.Series._from_column(
+            column.build_column(gdf_series_a.data, dtype)
+        )
+        b = cudf.Series._from_column(
+            column.build_column(gdf_series_b.data, dtype)
+        )
         gdf_series_expected = a * b
 
     df_expected = cudf.DataFrame(
diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py
index c288155112c..4aa7fb27c9b 100644
--- a/python/cudf/cudf/tests/test_column.py
+++ b/python/cudf/cudf/tests/test_column.py
@@ -95,7 +95,7 @@ def test_column_offset_and_size(pandas_input, offset, size):
     else:
         assert col.size == (col.data.size / col.dtype.itemsize)
 
-    got = cudf.Series(col)
+    got = cudf.Series._from_column(col)
 
     if offset is None:
         offset = 0
@@ -112,8 +112,8 @@ def test_column_offset_and_size(pandas_input, offset, size):
 
 def column_slicing_test(col, offset, size, cast_to_float=False):
     col_slice = col.slice(offset, offset + size)
-    series = cudf.Series(col)
-    sliced_series = cudf.Series(col_slice)
+    series = cudf.Series._from_column(col)
+    sliced_series = cudf.Series._from_column(col_slice)
 
     if cast_to_float:
         pd_series = series.astype(float).to_pandas()
@@ -208,7 +208,9 @@ def test_as_column_scalar_with_nan(nan_as_null, scalar, size):
     )
 
     got = (
-        cudf.Series(as_column(scalar, length=size, nan_as_null=nan_as_null))
+        cudf.Series._from_column(
+            as_column(scalar, length=size, nan_as_null=nan_as_null)
+        )
         .dropna()
         .to_numpy()
     )
@@ -250,12 +252,18 @@ def test_column_chunked_array_creation():
     actual_column = cudf.core.column.as_column(chunked_array, dtype="float")
     expected_column = cudf.core.column.as_column(pyarrow_array, dtype="float")
 
-    assert_eq(cudf.Series(actual_column), cudf.Series(expected_column))
+    assert_eq(
+        cudf.Series._from_column(actual_column),
+        cudf.Series._from_column(expected_column),
+    )
 
     actual_column = cudf.core.column.as_column(chunked_array)
     expected_column = cudf.core.column.as_column(pyarrow_array)
 
-    assert_eq(cudf.Series(actual_column), cudf.Series(expected_column))
+    assert_eq(
+        cudf.Series._from_column(actual_column),
+        cudf.Series._from_column(expected_column),
+    )
 
 
 @pytest.mark.parametrize(
@@ -287,7 +295,7 @@ def test_column_view_valid_numeric_to_numeric(data, from_dtype, to_dtype):
     gpu_data_view = gpu_data.view(to_dtype)
 
     expect = pd.Series(cpu_data_view, dtype=cpu_data_view.dtype)
-    got = cudf.Series(gpu_data_view, dtype=gpu_data_view.dtype)
+    got = cudf.Series._from_column(gpu_data_view).astype(gpu_data_view.dtype)
 
     gpu_ptr = gpu_data.data.get_ptr(mode="read")
     assert gpu_ptr == got._column.data.get_ptr(mode="read")
@@ -327,7 +335,7 @@ def test_column_view_invalid_numeric_to_numeric(data, from_dtype, to_dtype):
     ],
 )
 def test_column_view_valid_string_to_numeric(data, to_dtype):
-    expect = cudf.Series(cudf.Series(data)._column.view(to_dtype))
+    expect = cudf.Series._from_column(cudf.Series(data)._column.view(to_dtype))
     got = cudf.Series(str_host_view(data, to_dtype))
 
     assert_eq(expect, got)
@@ -342,7 +350,7 @@ def test_column_view_nulls_widths_even():
 
     sr = cudf.Series(data, dtype="int32")
     expect = cudf.Series(expect_data, dtype="float32")
-    got = cudf.Series(sr._column.view("float32"))
+    got = cudf.Series._from_column(sr._column.view("float32"))
 
     assert_eq(expect, got)
 
@@ -354,7 +362,7 @@ def test_column_view_nulls_widths_even():
 
     sr = cudf.Series(data, dtype="float64")
     expect = cudf.Series(expect_data, dtype="int64")
-    got = cudf.Series(sr._column.view("int64"))
+    got = cudf.Series._from_column(sr._column.view("int64"))
 
     assert_eq(expect, got)
 
@@ -365,7 +373,9 @@ def test_column_view_numeric_slice(slc):
     sr = cudf.Series(data)
 
     expect = cudf.Series(data[slc].view("int64"))
-    got = cudf.Series(sr._column.slice(slc.start, slc.stop).view("int64"))
+    got = cudf.Series._from_column(
+        sr._column.slice(slc.start, slc.stop).view("int64")
+    )
 
     assert_eq(expect, got)
 
@@ -376,7 +386,7 @@ def test_column_view_numeric_slice(slc):
 def test_column_view_string_slice(slc):
     data = ["a", "bcde", "cd", "efg", "h"]
 
-    expect = cudf.Series(
+    expect = cudf.Series._from_column(
         cudf.Series(data)._column.slice(slc.start, slc.stop).view("int8")
     )
     got = cudf.Series(str_host_view(data[slc], "int8"))
@@ -409,7 +419,10 @@ def test_as_column_buffer(data, expected):
     actual_column = cudf.core.column.as_column(
         cudf.core.buffer.as_buffer(data), dtype=data.dtype
     )
-    assert_eq(cudf.Series(actual_column), cudf.Series(expected))
+    assert_eq(
+        cudf.Series._from_column(actual_column),
+        cudf.Series._from_column(expected),
+    )
 
 
 @pytest.mark.parametrize(
@@ -436,7 +449,10 @@ def test_as_column_arrow_array(data, pyarrow_kwargs, cudf_kwargs):
     pyarrow_data = pa.array(data, **pyarrow_kwargs)
     cudf_from_pyarrow = as_column(pyarrow_data)
     expected = as_column(data, **cudf_kwargs)
-    assert_eq(cudf.Series(cudf_from_pyarrow), cudf.Series(expected))
+    assert_eq(
+        cudf.Series._from_column(cudf_from_pyarrow),
+        cudf.Series._from_column(expected),
+    )
 
 
 @pytest.mark.parametrize(
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index e2ce5c03b70..2c59253d500 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -4264,34 +4264,36 @@ def test_empty_dataframe_describe():
 def test_as_column_types():
     col = column.as_column(cudf.Series([], dtype="float64"))
     assert_eq(col.dtype, np.dtype("float64"))
-    gds = cudf.Series(col)
+    gds = cudf.Series._from_column(col)
     pds = pd.Series(pd.Series([], dtype="float64"))
 
     assert_eq(pds, gds)
 
     col = column.as_column(cudf.Series([], dtype="float64"), dtype="float32")
     assert_eq(col.dtype, np.dtype("float32"))
-    gds = cudf.Series(col)
+    gds = cudf.Series._from_column(col)
     pds = pd.Series(pd.Series([], dtype="float32"))
 
     assert_eq(pds, gds)
 
     col = column.as_column(cudf.Series([], dtype="float64"), dtype="str")
     assert_eq(col.dtype, np.dtype("object"))
-    gds = cudf.Series(col)
+    gds = cudf.Series._from_column(col)
     pds = pd.Series(pd.Series([], dtype="str"))
 
     assert_eq(pds, gds)
 
     col = column.as_column(cudf.Series([], dtype="float64"), dtype="object")
     assert_eq(col.dtype, np.dtype("object"))
-    gds = cudf.Series(col)
+    gds = cudf.Series._from_column(col)
     pds = pd.Series(pd.Series([], dtype="object"))
 
     assert_eq(pds, gds)
 
     pds = pd.Series(np.array([1, 2, 3]), dtype="float32")
-    gds = cudf.Series(column.as_column(np.array([1, 2, 3]), dtype="float32"))
+    gds = cudf.Series._from_column(
+        column.as_column(np.array([1, 2, 3]), dtype="float32")
+    )
 
     assert_eq(pds, gds)
 
@@ -4301,23 +4303,25 @@ def test_as_column_types():
     assert_eq(pds, gds)
 
     pds = pd.Series([], dtype="float64")
-    gds = cudf.Series(column.as_column(pds))
+    gds = cudf.Series._from_column(column.as_column(pds))
     assert_eq(pds, gds)
 
     pds = pd.Series([1, 2, 4], dtype="int64")
-    gds = cudf.Series(column.as_column(cudf.Series([1, 2, 4]), dtype="int64"))
+    gds = cudf.Series._from_column(
+        column.as_column(cudf.Series([1, 2, 4]), dtype="int64")
+    )
 
     assert_eq(pds, gds)
 
     pds = pd.Series([1.2, 18.0, 9.0], dtype="float32")
-    gds = cudf.Series(
+    gds = cudf.Series._from_column(
         column.as_column(cudf.Series([1.2, 18.0, 9.0]), dtype="float32")
     )
 
     assert_eq(pds, gds)
 
     pds = pd.Series([1.2, 18.0, 9.0], dtype="str")
-    gds = cudf.Series(
+    gds = cudf.Series._from_column(
         column.as_column(cudf.Series([1.2, 18.0, 9.0]), dtype="str")
     )
 
@@ -6521,7 +6525,9 @@ def test_from_pandas_for_series_nan_as_null(nan_as_null):
     data = [np.nan, 2.0, 3.0]
     psr = pd.Series(data)
 
-    expected = cudf.Series(column.as_column(data, nan_as_null=nan_as_null))
+    expected = cudf.Series._from_column(
+        column.as_column(data, nan_as_null=nan_as_null)
+    )
     got = cudf.from_pandas(psr, nan_as_null=nan_as_null)
 
     assert_eq(expected, got)
diff --git a/python/cudf/cudf/tests/test_decimal.py b/python/cudf/cudf/tests/test_decimal.py
index 65f739bc74a..b63788d20b7 100644
--- a/python/cudf/cudf/tests/test_decimal.py
+++ b/python/cudf/cudf/tests/test_decimal.py
@@ -106,7 +106,7 @@ def test_typecast_from_float_to_decimal(request, data, from_dtype, to_dtype):
     pa_arr = got.to_arrow().cast(
         pa.decimal128(to_dtype.precision, to_dtype.scale)
     )
-    expected = cudf.Series(Decimal64Column.from_arrow(pa_arr))
+    expected = cudf.Series._from_column(Decimal64Column.from_arrow(pa_arr))
 
     got = got.astype(to_dtype)
 
@@ -146,7 +146,7 @@ def test_typecast_from_int_to_decimal(data, from_dtype, to_dtype):
         .cast("float64")
         .cast(pa.decimal128(to_dtype.precision, to_dtype.scale))
     )
-    expected = cudf.Series(Decimal64Column.from_arrow(pa_arr))
+    expected = cudf.Series._from_column(Decimal64Column.from_arrow(pa_arr))
 
     got = got.astype(to_dtype)
 
@@ -206,9 +206,9 @@ def test_typecast_to_from_decimal(data, from_dtype, to_dtype):
         pa.decimal128(to_dtype.precision, to_dtype.scale), safe=False
     )
     if isinstance(to_dtype, Decimal32Dtype):
-        expected = cudf.Series(Decimal32Column.from_arrow(pa_arr))
+        expected = cudf.Series._from_column(Decimal32Column.from_arrow(pa_arr))
     elif isinstance(to_dtype, Decimal64Dtype):
-        expected = cudf.Series(Decimal64Column.from_arrow(pa_arr))
+        expected = cudf.Series._from_column(Decimal64Column.from_arrow(pa_arr))
 
     with expect_warning_if(to_dtype.scale < s.dtype.scale, UserWarning):
         got = s.astype(to_dtype)
@@ -245,7 +245,7 @@ def test_typecast_from_decimal(data, from_dtype, to_dtype):
     pa_arr = got.to_arrow().cast(to_dtype, safe=False)
 
     got = got.astype(to_dtype)
-    expected = cudf.Series(NumericalColumn.from_arrow(pa_arr))
+    expected = cudf.Series._from_column(NumericalColumn.from_arrow(pa_arr))
 
     assert_eq(got, expected)
     assert_eq(got.dtype, expected.dtype)
diff --git a/python/cudf/cudf/tests/test_df_protocol.py b/python/cudf/cudf/tests/test_df_protocol.py
index 7f48e414180..44270d20d59 100644
--- a/python/cudf/cudf/tests/test_df_protocol.py
+++ b/python/cudf/cudf/tests/test_df_protocol.py
@@ -78,7 +78,7 @@ def assert_buffer_equal(buffer_and_dtype: tuple[_CuDFBuffer, Any], cudfcol):
     # FIXME: In gh-10202 some minimal fixes were added to unblock CI. But
     # currently only non-null values are compared, null positions are
     # unchecked.
-    non_null_idxs = ~cudf.Series(cudfcol).isna()
+    non_null_idxs = cudfcol.notnull()
     assert_eq(
         col_from_buf.apply_boolean_mask(non_null_idxs),
         cudfcol.apply_boolean_mask(non_null_idxs),
@@ -86,8 +86,8 @@ def assert_buffer_equal(buffer_and_dtype: tuple[_CuDFBuffer, Any], cudfcol):
     array_from_dlpack = cp.from_dlpack(buf.__dlpack__()).get()
     col_array = cp.asarray(cudfcol.data_array_view(mode="read")).get()
     assert_eq(
-        array_from_dlpack[non_null_idxs.to_numpy()].flatten(),
-        col_array[non_null_idxs.to_numpy()].flatten(),
+        array_from_dlpack[non_null_idxs.values_host].flatten(),
+        col_array[non_null_idxs.values_host].flatten(),
     )
 
 
diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py
index 36bcaa66d7d..c4c883ca9f9 100644
--- a/python/cudf/cudf/tests/test_list.py
+++ b/python/cudf/cudf/tests/test_list.py
@@ -946,5 +946,5 @@ def test_empty_nested_list_uninitialized_offsets_memory_usage():
         null_count=col.null_count,
         children=(column_empty(0, col.children[0].dtype), empty_inner),
     )
-    ser = cudf.Series._from_data({None: col_empty_offset})
+    ser = cudf.Series._from_column(col_empty_offset)
     assert ser.memory_usage() == 8
diff --git a/python/cudf/cudf/tests/test_pickling.py b/python/cudf/cudf/tests/test_pickling.py
index 719e8a33285..0f13a9e173a 100644
--- a/python/cudf/cudf/tests/test_pickling.py
+++ b/python/cudf/cudf/tests/test_pickling.py
@@ -127,7 +127,7 @@ def test_pickle_categorical_column(slices):
     pickled = pickle.dumps(input_col)
     out = pickle.loads(pickled)
 
-    assert_eq(Series(out), Series(input_col))
+    assert_eq(Series._from_column(out), Series._from_column(input_col))
 
 
 @pytest.mark.parametrize(
@@ -148,4 +148,4 @@ def test_pickle_string_column(slices):
     pickled = pickle.dumps(input_col)
     out = pickle.loads(pickled)
 
-    assert_eq(Series(out), Series(input_col))
+    assert_eq(Series._from_column(out), Series._from_column(input_col))
diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py
index d4fe5ff3bb5..1973fe6fb41 100644
--- a/python/cudf/cudf/tests/test_replace.py
+++ b/python/cudf/cudf/tests/test_replace.py
@@ -817,12 +817,12 @@ def test_fillna_string(ps_data, fill_value, inplace):
 def test_series_fillna_invalid_dtype(data_dtype):
     gdf = cudf.Series([1, 2, None, 3], dtype=data_dtype)
     fill_value = 2.5
-    with pytest.raises(TypeError) as raises:
-        gdf.fillna(fill_value)
-    raises.match(
+    msg = (
         f"Cannot safely cast non-equivalent"
         f" {type(fill_value).__name__} to {gdf.dtype.type.__name__}"
     )
+    with pytest.raises(TypeError, match=msg):
+        gdf.fillna(fill_value)
 
 
 @pytest.mark.parametrize("data_dtype", NUMERIC_TYPES)
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index 8ed78d804bf..6a1887afb1f 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -2041,7 +2041,7 @@ def test_series_ordered_dedup():
     sr = cudf.Series(np.random.randint(0, 100, 1000))
     # pandas unique() preserves order
     expect = pd.Series(sr.to_pandas().unique())
-    got = cudf.Series(sr._column.unique())
+    got = cudf.Series._from_column(sr._column.unique())
     assert_eq(expect.values, got.values)
 
 
@@ -2697,7 +2697,9 @@ def test_series_duplicate_index_reindex():
 def test_list_category_like_maintains_dtype():
     dtype = cudf.CategoricalDtype(categories=[1, 2, 3, 4], ordered=True)
     data = [1, 2, 3]
-    result = cudf.Series(cudf.core.column.as_column(data, dtype=dtype))
+    result = cudf.Series._from_column(
+        cudf.core.column.as_column(data, dtype=dtype)
+    )
     expected = pd.Series(data, dtype=dtype.to_pandas())
     assert_eq(result, expected)
 
@@ -2705,7 +2707,9 @@ def test_list_category_like_maintains_dtype():
 def test_list_interval_like_maintains_dtype():
     dtype = cudf.IntervalDtype(subtype=np.int8)
     data = [pd.Interval(1, 2)]
-    result = cudf.Series(cudf.core.column.as_column(data, dtype=dtype))
+    result = cudf.Series._from_column(
+        cudf.core.column.as_column(data, dtype=dtype)
+    )
     expected = pd.Series(data, dtype=dtype.to_pandas())
     assert_eq(result, expected)
 
diff --git a/python/cudf/cudf/tests/test_setitem.py b/python/cudf/cudf/tests/test_setitem.py
index 69122cdbafa..5406836ba61 100644
--- a/python/cudf/cudf/tests/test_setitem.py
+++ b/python/cudf/cudf/tests/test_setitem.py
@@ -178,13 +178,19 @@ def test_column_set_equal_length_object_by_mask():
     bool_col = cudf.Series([True, True, True, True, True])._column
 
     data[bool_col] = replace_data
-    assert_eq(cudf.Series(data), cudf.Series(replace_data))
+    assert_eq(
+        cudf.Series._from_column(data),
+        cudf.Series._from_column(replace_data),
+    )
 
     data = cudf.Series([0, 0, 1, 1, 1])._column
     bool_col = cudf.Series([True, False, True, False, True])._column
     data[bool_col] = replace_data
 
-    assert_eq(cudf.Series(data), cudf.Series([100, 0, 300, 1, 500]))
+    assert_eq(
+        cudf.Series._from_column(data),
+        cudf.Series([100, 0, 300, 1, 500]),
+    )
 
 
 def test_column_set_unequal_length_object_by_mask():
diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py
index f447759d010..4bd084a3938 100644
--- a/python/cudf/cudf/tests/test_string.py
+++ b/python/cudf/cudf/tests/test_string.py
@@ -2677,7 +2677,7 @@ def test_string_int_to_ipv4():
         ["0.0.0.0", None, "0.0.0.0", "41.168.0.1", "127.0.0.1", "41.197.0.1"]
     )
 
-    got = cudf.Series(gsr._column.int2ip())
+    got = cudf.Series._from_column(gsr._column.int2ip())
 
     assert_eq(expected, got)
 
diff --git a/python/cudf/cudf/tests/test_string_udfs.py b/python/cudf/cudf/tests/test_string_udfs.py
index 4432d2afc8e..69876d97aad 100644
--- a/python/cudf/cudf/tests/test_string_udfs.py
+++ b/python/cudf/cudf/tests/test_string_udfs.py
@@ -96,7 +96,7 @@ def run_udf_test(data, func, dtype):
     else:
         result = output
 
-    got = cudf.Series(result, dtype=dtype)
+    got = cudf.Series._from_column(result.astype(dtype))
     assert_eq(expect, got, check_dtype=False)
     with _CUDFNumbaConfig():
         udf_str_kernel.forall(len(data))(str_views, output)
@@ -105,7 +105,7 @@ def run_udf_test(data, func, dtype):
     else:
         result = output
 
-    got = cudf.Series(result, dtype=dtype)
+    got = cudf.Series._from_column(result.astype(dtype))
     assert_eq(expect, got, check_dtype=False)
 
 
diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py
index 4bdb5d921ec..2b1f745fc04 100644
--- a/python/dask_cudf/dask_cudf/backends.py
+++ b/python/dask_cudf/dask_cudf/backends.py
@@ -102,6 +102,7 @@ def _nest_list_data(data, leaf_type):
 
 @_dask_cudf_performance_tracking
 def _get_non_empty_data(s):
+    """Return a non empty column as metadata."""
     if isinstance(s, cudf.core.column.CategoricalColumn):
         categories = (
             s.categories if len(s.categories) else [UNKNOWN_CATEGORIES]
@@ -128,7 +129,7 @@ def _get_non_empty_data(s):
         data = [{key: None for key in struct_dtype.fields.keys()}] * 2
         data = cudf.core.column.as_column(data, dtype=s.dtype)
     elif is_string_dtype(s.dtype):
-        data = pa.array(["cat", "dog"])
+        data = cudf.core.column.as_column(pa.array(["cat", "dog"]))
     elif isinstance(s.dtype, pd.DatetimeTZDtype):
         from cudf.utils.dtypes import get_time_unit
 
@@ -153,7 +154,7 @@ def _nonempty_series(s, idx=None):
         idx = _nonempty_index(s.index)
     data = _get_non_empty_data(s._column)
 
-    return cudf.Series(data, name=s.name, index=idx)
+    return cudf.Series._from_column(data, name=s.name, index=idx)
 
 
 @meta_nonempty.register(cudf.DataFrame)
@@ -424,7 +425,7 @@ def hash_object_cudf_index(ind, index=None):
         return ind.to_frame(index=False).hash_values()
 
     col = cudf.core.column.as_column(ind)
-    return cudf.Series(col).hash_values()
+    return cudf.Series._from_column(col).hash_values()
 
 
 @group_split_dispatch.register((cudf.Series, cudf.DataFrame))
diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py
index aab56e3a1b0..3181c8d69ec 100644
--- a/python/dask_cudf/dask_cudf/core.py
+++ b/python/dask_cudf/dask_cudf/core.py
@@ -342,7 +342,7 @@ def groupby(self, by=None, **kwargs):
 def sum_of_squares(x):
     x = x.astype("f8")._column
     outcol = libcudf.reduce.reduce("sum_of_squares", x)
-    return cudf.Series(outcol)
+    return cudf.Series._from_column(outcol)
 
 
 @_dask_cudf_performance_tracking

From 87b957690f02c8983ff77e7b95aa6a5504a590e3 Mon Sep 17 00:00:00 2001
From: Ray Douglass <ray@raydouglass.com>
Date: Wed, 7 Aug 2024 10:40:28 -0400
Subject: [PATCH 633/842] Update Changelog [skip ci]

---
 CHANGELOG.md | 376 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 376 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index a5efe4eb9e5..f2a7c337675 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,379 @@
+# cudf 24.08.00 (7 Aug 2024)
+
+## 🚨 Breaking Changes
+
+- Align Index __init__ APIs with pandas 2.x ([#16362](https://github.com/rapidsai/cudf/pull/16362)) [@mroeschke](https://github.com/mroeschke)
+- Align Series APIs with pandas 2.x ([#16333](https://github.com/rapidsai/cudf/pull/16333)) [@mroeschke](https://github.com/mroeschke)
+- Add missing `stream` param to dictionary factory APIs ([#16319](https://github.com/rapidsai/cudf/pull/16319)) [@JayjeetAtGithub](https://github.com/JayjeetAtGithub)
+- Deprecate dtype= parameter in reduction methods ([#16313](https://github.com/rapidsai/cudf/pull/16313)) [@mroeschke](https://github.com/mroeschke)
+- Remove squeeze argument from groupby ([#16312](https://github.com/rapidsai/cudf/pull/16312)) [@mroeschke](https://github.com/mroeschke)
+- Align more DataFrame APIs with pandas ([#16310](https://github.com/rapidsai/cudf/pull/16310)) [@mroeschke](https://github.com/mroeschke)
+- Remove `mr` param from `write_csv` and `write_json` ([#16231](https://github.com/rapidsai/cudf/pull/16231)) [@JayjeetAtGithub](https://github.com/JayjeetAtGithub)
+- Report number of rows per file read by PQ reader when no row selection and fix segfault in chunked PQ reader when skip_rows &gt; 0 ([#16195](https://github.com/rapidsai/cudf/pull/16195)) [@mhaseeb123](https://github.com/mhaseeb123)
+- Refactor from_arrow_device/host to use resource_ref ([#16160](https://github.com/rapidsai/cudf/pull/16160)) [@harrism](https://github.com/harrism)
+- Deprecate Arrow support in I/O ([#16132](https://github.com/rapidsai/cudf/pull/16132)) [@lithomas1](https://github.com/lithomas1)
+- Return `FrozenList` for `Index.names` ([#16047](https://github.com/rapidsai/cudf/pull/16047)) [@galipremsagar](https://github.com/galipremsagar)
+- Add compile option to enable large strings support ([#16037](https://github.com/rapidsai/cudf/pull/16037)) [@davidwendt](https://github.com/davidwendt)
+- Hide visibility of non public symbols ([#15982](https://github.com/rapidsai/cudf/pull/15982)) [@robertmaynard](https://github.com/robertmaynard)
+- Rename strings multiple target replace API ([#15898](https://github.com/rapidsai/cudf/pull/15898)) [@davidwendt](https://github.com/davidwendt)
+- Pinned vector factory that uses the global pool ([#15895](https://github.com/rapidsai/cudf/pull/15895)) [@vuule](https://github.com/vuule)
+- Apply clang-tidy autofixes ([#15894](https://github.com/rapidsai/cudf/pull/15894)) [@vyasr](https://github.com/vyasr)
+- Support `arrow:schema` in Parquet writer to faithfully roundtrip `duration` types with Arrow ([#15875](https://github.com/rapidsai/cudf/pull/15875)) [@mhaseeb123](https://github.com/mhaseeb123)
+- Expose stream parameter to public rolling APIs ([#15865](https://github.com/rapidsai/cudf/pull/15865)) [@srinivasyadav18](https://github.com/srinivasyadav18)
+- Fix large strings handling in nvtext::character_tokenize ([#15829](https://github.com/rapidsai/cudf/pull/15829)) [@davidwendt](https://github.com/davidwendt)
+- Remove legacy JSON reader and concurrent_unordered_map.cuh. ([#15813](https://github.com/rapidsai/cudf/pull/15813)) [@bdice](https://github.com/bdice)
+
+## 🐛 Bug Fixes
+
+- Add `flatbuffers` to `libcudf` build ([#16446](https://github.com/rapidsai/cudf/pull/16446)) [@galipremsagar](https://github.com/galipremsagar)
+- Fix parquet_field_list read_func lambda capture invalid this pointer ([#16440](https://github.com/rapidsai/cudf/pull/16440)) [@davidwendt](https://github.com/davidwendt)
+- Enable prefetching in cudf.pandas.install() ([#16439](https://github.com/rapidsai/cudf/pull/16439)) [@bdice](https://github.com/bdice)
+- Enable prefetching before `runpy` ([#16427](https://github.com/rapidsai/cudf/pull/16427)) [@galipremsagar](https://github.com/galipremsagar)
+- Support thread-safe for `prefetch_config::get` and `prefetch_config::set` ([#16425](https://github.com/rapidsai/cudf/pull/16425)) [@ttnghia](https://github.com/ttnghia)
+- Fix a `pandas-2.0` missing attribute error ([#16416](https://github.com/rapidsai/cudf/pull/16416)) [@galipremsagar](https://github.com/galipremsagar)
+- [Bug] Remove loud `NativeFile` deprecation noise for `read_parquet` from S3 ([#16415](https://github.com/rapidsai/cudf/pull/16415)) [@rjzamora](https://github.com/rjzamora)
+- Fix nightly memcheck error for empty STREAM_INTEROP_TEST ([#16406](https://github.com/rapidsai/cudf/pull/16406)) [@davidwendt](https://github.com/davidwendt)
+- Gate ArrowStringArrayNumpySemantics cudf.pandas proxy behind version check ([#16401](https://github.com/rapidsai/cudf/pull/16401)) [@mroeschke](https://github.com/mroeschke)
+- Don&#39;t export bs_thread_pool ([#16398](https://github.com/rapidsai/cudf/pull/16398)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Require fixed width types for casting in `cudf-polars` ([#16381](https://github.com/rapidsai/cudf/pull/16381)) [@brandon-b-miller](https://github.com/brandon-b-miller)
+- Fix docstring of `DataFrame.apply` ([#16351](https://github.com/rapidsai/cudf/pull/16351)) [@galipremsagar](https://github.com/galipremsagar)
+- Make __bool__ raise for more cudf objects ([#16311](https://github.com/rapidsai/cudf/pull/16311)) [@mroeschke](https://github.com/mroeschke)
+- Rename `.devcontainer`s for CUDA 12.5 ([#16293](https://github.com/rapidsai/cudf/pull/16293)) [@jakirkham](https://github.com/jakirkham)
+- Fix split_record for all empty strings column ([#16291](https://github.com/rapidsai/cudf/pull/16291)) [@davidwendt](https://github.com/davidwendt)
+- Fix logic in to_arrow for empty list column ([#16279](https://github.com/rapidsai/cudf/pull/16279)) [@wence-](https://github.com/wence-)
+- [BUG] Make name attr of Index fast slow attrs ([#16270](https://github.com/rapidsai/cudf/pull/16270)) [@Matt711](https://github.com/Matt711)
+- Add custom name setter and getter for proxy objects in `cudf.pandas` ([#16234](https://github.com/rapidsai/cudf/pull/16234)) [@Matt711](https://github.com/Matt711)
+- Fall back when casting a timestamp to numeric in cudf-polars ([#16232](https://github.com/rapidsai/cudf/pull/16232)) [@brandon-b-miller](https://github.com/brandon-b-miller)
+- Disable large string support for Java build ([#16216](https://github.com/rapidsai/cudf/pull/16216)) [@jlowe](https://github.com/jlowe)
+- Remove CCCL patch for PR 211. ([#16207](https://github.com/rapidsai/cudf/pull/16207)) [@bdice](https://github.com/bdice)
+- Add single offset to an empty ListArray in cudf::to_arrow ([#16201](https://github.com/rapidsai/cudf/pull/16201)) [@davidwendt](https://github.com/davidwendt)
+- Fix `memory_usage` when calculating nested list column ([#16193](https://github.com/rapidsai/cudf/pull/16193)) [@mroeschke](https://github.com/mroeschke)
+- Support at/iat indexers in cudf.pandas ([#16177](https://github.com/rapidsai/cudf/pull/16177)) [@mroeschke](https://github.com/mroeschke)
+- Fix unused-return-value debug build error in from_arrow_stream_test.cpp ([#16168](https://github.com/rapidsai/cudf/pull/16168)) [@davidwendt](https://github.com/davidwendt)
+- Fix cudf::strings::replace_multiple hang on empty target ([#16167](https://github.com/rapidsai/cudf/pull/16167)) [@davidwendt](https://github.com/davidwendt)
+- Refactor from_arrow_device/host to use resource_ref ([#16160](https://github.com/rapidsai/cudf/pull/16160)) [@harrism](https://github.com/harrism)
+- interpolate returns new column if no values are interpolated ([#16158](https://github.com/rapidsai/cudf/pull/16158)) [@mroeschke](https://github.com/mroeschke)
+- Use provided memory resource for allocating mixed join results. ([#16153](https://github.com/rapidsai/cudf/pull/16153)) [@bdice](https://github.com/bdice)
+- Run DFG after verify-alpha-spec ([#16151](https://github.com/rapidsai/cudf/pull/16151)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Use size_t to allow large conditional joins ([#16127](https://github.com/rapidsai/cudf/pull/16127)) [@bdice](https://github.com/bdice)
+- Allow only scale=0 fixed-point values in fixed_width_column_wrapper ([#16120](https://github.com/rapidsai/cudf/pull/16120)) [@davidwendt](https://github.com/davidwendt)
+- Fix pylibcudf Table.num_rows for 0 columns case and add interop to docs ([#16108](https://github.com/rapidsai/cudf/pull/16108)) [@lithomas1](https://github.com/lithomas1)
+- Add support for proxy `np.flatiter` objects ([#16107](https://github.com/rapidsai/cudf/pull/16107)) [@Matt711](https://github.com/Matt711)
+- Ensure cudf objects can astype to any type when empty ([#16106](https://github.com/rapidsai/cudf/pull/16106)) [@mroeschke](https://github.com/mroeschke)
+- Support `pd.read_pickle` and `pd.to_pickle` in `cudf.pandas` ([#16105](https://github.com/rapidsai/cudf/pull/16105)) [@Matt711](https://github.com/Matt711)
+- Fix unnecessarily strict check in parquet chunked reader for choosing split locations. ([#16099](https://github.com/rapidsai/cudf/pull/16099)) [@nvdbaranec](https://github.com/nvdbaranec)
+- Fix `is_monotonic_*` APIs to include `nan&#39;s` ([#16085](https://github.com/rapidsai/cudf/pull/16085)) [@galipremsagar](https://github.com/galipremsagar)
+- More safely parse CUDA versions when subprocess output is contaminated ([#16067](https://github.com/rapidsai/cudf/pull/16067)) [@brandon-b-miller](https://github.com/brandon-b-miller)
+- fast_slow_proxy: Don&#39;t import assert_eq at top-level ([#16063](https://github.com/rapidsai/cudf/pull/16063)) [@wence-](https://github.com/wence-)
+- Prevent bad ColumnAccessor state after .sort_index(axis=1, ignore_index=True) ([#16061](https://github.com/rapidsai/cudf/pull/16061)) [@mroeschke](https://github.com/mroeschke)
+- Fix ArrowDeviceArray interface to pass address of event ([#16058](https://github.com/rapidsai/cudf/pull/16058)) [@zeroshade](https://github.com/zeroshade)
+- Fix a size overflow bug in hash groupby ([#16053](https://github.com/rapidsai/cudf/pull/16053)) [@PointKernel](https://github.com/PointKernel)
+- Fix `atomic_ref` scope when multiple blocks are updating the same output ([#16051](https://github.com/rapidsai/cudf/pull/16051)) [@vuule](https://github.com/vuule)
+- Fix initialization error in to_arrow for empty string views ([#16033](https://github.com/rapidsai/cudf/pull/16033)) [@wence-](https://github.com/wence-)
+- Fix the int32 overflow when computing page fragment sizes for large string columns ([#16028](https://github.com/rapidsai/cudf/pull/16028)) [@mhaseeb123](https://github.com/mhaseeb123)
+- Fix the pool size alignment issue ([#16024](https://github.com/rapidsai/cudf/pull/16024)) [@PointKernel](https://github.com/PointKernel)
+- Improve multibyte-split byte-range performance ([#16019](https://github.com/rapidsai/cudf/pull/16019)) [@davidwendt](https://github.com/davidwendt)
+- Fix target counting in strings char-parallel replace ([#16017](https://github.com/rapidsai/cudf/pull/16017)) [@davidwendt](https://github.com/davidwendt)
+- Support IntervalDtype in cudf.from_pandas ([#16014](https://github.com/rapidsai/cudf/pull/16014)) [@mroeschke](https://github.com/mroeschke)
+- Fix memory size in create_byte_range_infos_consecutive ([#16012](https://github.com/rapidsai/cudf/pull/16012)) [@davidwendt](https://github.com/davidwendt)
+- Hide visibility of non public symbols ([#15982](https://github.com/rapidsai/cudf/pull/15982)) [@robertmaynard](https://github.com/robertmaynard)
+- Fix Cython typo preventing proper inheritance ([#15978](https://github.com/rapidsai/cudf/pull/15978)) [@vyasr](https://github.com/vyasr)
+- Fix convert_dtypes with convert_integer=False/convert_floating=True ([#15964](https://github.com/rapidsai/cudf/pull/15964)) [@mroeschke](https://github.com/mroeschke)
+- Fix nunique for `MultiIndex`, `DataFrame`, and all NA case with `dropna=False` ([#15962](https://github.com/rapidsai/cudf/pull/15962)) [@mroeschke](https://github.com/mroeschke)
+- Explicitly build for all GPU architectures ([#15959](https://github.com/rapidsai/cudf/pull/15959)) [@vyasr](https://github.com/vyasr)
+- Preserve column type and class information in more DataFrame operations ([#15949](https://github.com/rapidsai/cudf/pull/15949)) [@mroeschke](https://github.com/mroeschke)
+- Add __array_interface__ to cudf.pandas numpy.ndarray proxy ([#15936](https://github.com/rapidsai/cudf/pull/15936)) [@mroeschke](https://github.com/mroeschke)
+- Allow tests to be built when stream util is disabled ([#15933](https://github.com/rapidsai/cudf/pull/15933)) [@robertmaynard](https://github.com/robertmaynard)
+- Fix JSON multi-source reading when total source size exceeds `INT_MAX` bytes ([#15930](https://github.com/rapidsai/cudf/pull/15930)) [@shrshi](https://github.com/shrshi)
+- Fix `dask_cudf.read_parquet` regression for legacy timestamp data ([#15929](https://github.com/rapidsai/cudf/pull/15929)) [@rjzamora](https://github.com/rjzamora)
+- Fix offsetalator when accessing over 268 million rows ([#15921](https://github.com/rapidsai/cudf/pull/15921)) [@davidwendt](https://github.com/davidwendt)
+- Fix debug assert in rowgroup_char_counts_kernel ([#15902](https://github.com/rapidsai/cudf/pull/15902)) [@davidwendt](https://github.com/davidwendt)
+- Fix categorical conversion from chunked arrow arrays ([#15886](https://github.com/rapidsai/cudf/pull/15886)) [@vyasr](https://github.com/vyasr)
+- Handling for `NaN` and `inf` when converting floating point to fixed point types ([#15885](https://github.com/rapidsai/cudf/pull/15885)) [@ttnghia](https://github.com/ttnghia)
+- Manual merge of Branch 24.08 from 24.06 ([#15869](https://github.com/rapidsai/cudf/pull/15869)) [@galipremsagar](https://github.com/galipremsagar)
+- Avoid unnecessary `Index` cast in `IndexedFrame.index` setter ([#15843](https://github.com/rapidsai/cudf/pull/15843)) [@charlesbluca](https://github.com/charlesbluca)
+- Fix large strings handling in nvtext::character_tokenize ([#15829](https://github.com/rapidsai/cudf/pull/15829)) [@davidwendt](https://github.com/davidwendt)
+- Fix multi-replace target count logic for large strings ([#15807](https://github.com/rapidsai/cudf/pull/15807)) [@davidwendt](https://github.com/davidwendt)
+- Fix JSON parsing memory corruption - Fix Mixed types nested children removal ([#15798](https://github.com/rapidsai/cudf/pull/15798)) [@karthikeyann](https://github.com/karthikeyann)
+- Allow anonymous user in devcontainer name. ([#15784](https://github.com/rapidsai/cudf/pull/15784)) [@bdice](https://github.com/bdice)
+- Add support for additional metaclasses of proxies and use for ExcelWriter ([#15399](https://github.com/rapidsai/cudf/pull/15399)) [@vyasr](https://github.com/vyasr)
+
+## 📖 Documentation
+
+- Add docstring for from_dataframe ([#16260](https://github.com/rapidsai/cudf/pull/16260)) [@mroeschke](https://github.com/mroeschke)
+- Update libcudf compiler requirements in contributing doc ([#16103](https://github.com/rapidsai/cudf/pull/16103)) [@davidwendt](https://github.com/davidwendt)
+- Add libcudf public/detail API pattern to developer guide ([#16086](https://github.com/rapidsai/cudf/pull/16086)) [@davidwendt](https://github.com/davidwendt)
+- Explain line profiler and how to know which functions are GPU-accelerated. ([#16079](https://github.com/rapidsai/cudf/pull/16079)) [@bdice](https://github.com/bdice)
+- cudf.pandas documentation improvement ([#15948](https://github.com/rapidsai/cudf/pull/15948)) [@Matt711](https://github.com/Matt711)
+- Reland &quot;Fix docs for IO readers and strings_convert&quot; ([#15872)&quot; (#15941](https://github.com/rapidsai/cudf/pull/15872)&quot; (#15941)) [@lithomas1](https://github.com/lithomas1)
+- Document how to use cudf.pandas in tandem with multiprocessing ([#15940](https://github.com/rapidsai/cudf/pull/15940)) [@wence-](https://github.com/wence-)
+- DOC: Add documentation for cudf.pandas in the Developer Guide ([#15889](https://github.com/rapidsai/cudf/pull/15889)) [@Matt711](https://github.com/Matt711)
+- Improve options docs ([#15888](https://github.com/rapidsai/cudf/pull/15888)) [@bdice](https://github.com/bdice)
+- DOC: add linkcode to docs ([#15860](https://github.com/rapidsai/cudf/pull/15860)) [@raybellwaves](https://github.com/raybellwaves)
+- DOC: use intersphinx mapping in pandas-compat ext ([#15846](https://github.com/rapidsai/cudf/pull/15846)) [@raybellwaves](https://github.com/raybellwaves)
+- Fix inconsistent usage of &#39;results&#39; and &#39;records&#39; in read-json.md ([#15766](https://github.com/rapidsai/cudf/pull/15766)) [@dagardner-nv](https://github.com/dagardner-nv)
+- Update PandasCompat.py to resolve references ([#15704](https://github.com/rapidsai/cudf/pull/15704)) [@raybellwaves](https://github.com/raybellwaves)
+
+## 🚀 New Features
+
+- Warn on cuDF failure when `POLARS_VERBOSE` is true ([#16308](https://github.com/rapidsai/cudf/pull/16308)) [@brandon-b-miller](https://github.com/brandon-b-miller)
+- Add `drop_nulls` in `cudf-polars` ([#16290](https://github.com/rapidsai/cudf/pull/16290)) [@brandon-b-miller](https://github.com/brandon-b-miller)
+- [JNI] Add setKernelPinnedCopyThreshold and setPinnedAllocationThreshold ([#16288](https://github.com/rapidsai/cudf/pull/16288)) [@abellina](https://github.com/abellina)
+- Implement support for scan_ndjson in cudf-polars ([#16263](https://github.com/rapidsai/cudf/pull/16263)) [@lithomas1](https://github.com/lithomas1)
+- Publish cudf-polars nightlies ([#16213](https://github.com/rapidsai/cudf/pull/16213)) [@lithomas1](https://github.com/lithomas1)
+- Modify `make_host_vector` and `make_device_uvector` factories to optionally use pinned memory and kernel copy ([#16206](https://github.com/rapidsai/cudf/pull/16206)) [@vuule](https://github.com/vuule)
+- Migrate lists/set_operations to pylibcudf ([#16190](https://github.com/rapidsai/cudf/pull/16190)) [@Matt711](https://github.com/Matt711)
+- Migrate lists/filling to pylibcudf ([#16189](https://github.com/rapidsai/cudf/pull/16189)) [@Matt711](https://github.com/Matt711)
+- Fall back to CPU for unsupported libcudf binaryops in cudf-polars ([#16188](https://github.com/rapidsai/cudf/pull/16188)) [@brandon-b-miller](https://github.com/brandon-b-miller)
+- Use resource_ref for upstream in stream_checking_resource_adaptor ([#16187](https://github.com/rapidsai/cudf/pull/16187)) [@harrism](https://github.com/harrism)
+- Migrate lists/modifying to pylibcudf ([#16185](https://github.com/rapidsai/cudf/pull/16185)) [@Matt711](https://github.com/Matt711)
+- Migrate lists/filtering to pylibcudf ([#16184](https://github.com/rapidsai/cudf/pull/16184)) [@Matt711](https://github.com/Matt711)
+- Migrate lists/sorting to pylibcudf ([#16179](https://github.com/rapidsai/cudf/pull/16179)) [@Matt711](https://github.com/Matt711)
+- Add missing methods to lists/list_column_view.pxd in pylibcudf ([#16175](https://github.com/rapidsai/cudf/pull/16175)) [@Matt711](https://github.com/Matt711)
+- Migrate pylibcudf lists gathering ([#16170](https://github.com/rapidsai/cudf/pull/16170)) [@Matt711](https://github.com/Matt711)
+- Move kernel vis over to CUDF_HIDDEN ([#16165](https://github.com/rapidsai/cudf/pull/16165)) [@robertmaynard](https://github.com/robertmaynard)
+- Add groupby_max multi-threaded benchmark ([#16154](https://github.com/rapidsai/cudf/pull/16154)) [@srinivasyadav18](https://github.com/srinivasyadav18)
+- Promote has_nested_columns to cudf public API ([#16131](https://github.com/rapidsai/cudf/pull/16131)) [@robertmaynard](https://github.com/robertmaynard)
+- Promote IO support queries to cudf API ([#16125](https://github.com/rapidsai/cudf/pull/16125)) [@robertmaynard](https://github.com/robertmaynard)
+- cudf::merge public API now support passing a user stream ([#16124](https://github.com/rapidsai/cudf/pull/16124)) [@robertmaynard](https://github.com/robertmaynard)
+- Add TPC-H inspired examples for Libcudf ([#16088](https://github.com/rapidsai/cudf/pull/16088)) [@JayjeetAtGithub](https://github.com/JayjeetAtGithub)
+- Installed cudf header use cudf::allocate_like ([#16087](https://github.com/rapidsai/cudf/pull/16087)) [@robertmaynard](https://github.com/robertmaynard)
+- `cudf-polars` string slicing ([#16082](https://github.com/rapidsai/cudf/pull/16082)) [@brandon-b-miller](https://github.com/brandon-b-miller)
+- Migrate Parquet reader to pylibcudf ([#16078](https://github.com/rapidsai/cudf/pull/16078)) [@lithomas1](https://github.com/lithomas1)
+- Migrate lists/count_elements to pylibcudf ([#16072](https://github.com/rapidsai/cudf/pull/16072)) [@Matt711](https://github.com/Matt711)
+- Migrate lists/extract to pylibcudf ([#16071](https://github.com/rapidsai/cudf/pull/16071)) [@Matt711](https://github.com/Matt711)
+- Move common string utilities to public api ([#16070](https://github.com/rapidsai/cudf/pull/16070)) [@robertmaynard](https://github.com/robertmaynard)
+- stable_distinct public api now has a stream parameter ([#16068](https://github.com/rapidsai/cudf/pull/16068)) [@robertmaynard](https://github.com/robertmaynard)
+- Migrate expressions to pylibcudf ([#16056](https://github.com/rapidsai/cudf/pull/16056)) [@lithomas1](https://github.com/lithomas1)
+- Add support to ArrowDataSource in SourceInfo ([#16050](https://github.com/rapidsai/cudf/pull/16050)) [@lithomas1](https://github.com/lithomas1)
+- Experimental support for configurable prefetching ([#16020](https://github.com/rapidsai/cudf/pull/16020)) [@vyasr](https://github.com/vyasr)
+- Migrate CSV reader to pylibcudf ([#16011](https://github.com/rapidsai/cudf/pull/16011)) [@lithomas1](https://github.com/lithomas1)
+- Migrate string `slice` APIs to `pylibcudf` ([#15988](https://github.com/rapidsai/cudf/pull/15988)) [@brandon-b-miller](https://github.com/brandon-b-miller)
+- Migrate lists/contains to pylibcudf ([#15981](https://github.com/rapidsai/cudf/pull/15981)) [@Matt711](https://github.com/Matt711)
+- Remove CCCL 2.2 patches as we now always use 2.5+ ([#15969](https://github.com/rapidsai/cudf/pull/15969)) [@robertmaynard](https://github.com/robertmaynard)
+- Migrate JSON reader to pylibcudf ([#15966](https://github.com/rapidsai/cudf/pull/15966)) [@lithomas1](https://github.com/lithomas1)
+- Add a developer check for proxy objects ([#15956](https://github.com/rapidsai/cudf/pull/15956)) [@Matt711](https://github.com/Matt711)
+- Start migrating I/O writers to pylibcudf (starting with JSON) ([#15952](https://github.com/rapidsai/cudf/pull/15952)) [@lithomas1](https://github.com/lithomas1)
+- Kernel copy for pinned memory ([#15934](https://github.com/rapidsai/cudf/pull/15934)) [@vuule](https://github.com/vuule)
+- Migrate left join and conditional join benchmarks to use nvbench ([#15931](https://github.com/rapidsai/cudf/pull/15931)) [@srinivasyadav18](https://github.com/srinivasyadav18)
+- Migrate lists/combine to pylibcudf ([#15928](https://github.com/rapidsai/cudf/pull/15928)) [@Matt711](https://github.com/Matt711)
+- Plumb pylibcudf strings `contains_re` through cudf_polars ([#15918](https://github.com/rapidsai/cudf/pull/15918)) [@brandon-b-miller](https://github.com/brandon-b-miller)
+- Start migrating I/O to pylibcudf ([#15899](https://github.com/rapidsai/cudf/pull/15899)) [@lithomas1](https://github.com/lithomas1)
+- Pinned vector factory that uses the global pool ([#15895](https://github.com/rapidsai/cudf/pull/15895)) [@vuule](https://github.com/vuule)
+- Migrate strings `contains` operations to `pylibcudf` ([#15880](https://github.com/rapidsai/cudf/pull/15880)) [@brandon-b-miller](https://github.com/brandon-b-miller)
+- Migrate quantile.pxd to pylibcudf ([#15874](https://github.com/rapidsai/cudf/pull/15874)) [@lithomas1](https://github.com/lithomas1)
+- Migrate round to pylibcudf ([#15863](https://github.com/rapidsai/cudf/pull/15863)) [@lithomas1](https://github.com/lithomas1)
+- Migrate string replace.pxd to pylibcudf ([#15839](https://github.com/rapidsai/cudf/pull/15839)) [@lithomas1](https://github.com/lithomas1)
+- Add an Environment Variable for debugging the fast path in cudf.pandas ([#15837](https://github.com/rapidsai/cudf/pull/15837)) [@Matt711](https://github.com/Matt711)
+- Add an option to run cuIO benchmarks with pinned buffers as input ([#15830](https://github.com/rapidsai/cudf/pull/15830)) [@vuule](https://github.com/vuule)
+- Update `pylibcudf` testing utilities ([#15772](https://github.com/rapidsai/cudf/pull/15772)) [@brandon-b-miller](https://github.com/brandon-b-miller)
+- Migrate string `capitalize` APIs to `pylibcudf` ([#15503](https://github.com/rapidsai/cudf/pull/15503)) [@brandon-b-miller](https://github.com/brandon-b-miller)
+- Add tests for `pylibcudf` binaryops ([#15470](https://github.com/rapidsai/cudf/pull/15470)) [@brandon-b-miller](https://github.com/brandon-b-miller)
+- Migrate column factories to pylibcudf ([#15257](https://github.com/rapidsai/cudf/pull/15257)) [@brandon-b-miller](https://github.com/brandon-b-miller)
+- cuDF/libcudf exponentially weighted moving averages ([#9027](https://github.com/rapidsai/cudf/pull/9027)) [@brandon-b-miller](https://github.com/brandon-b-miller)
+
+## 🛠️ Improvements
+
+- Ensure objects with __interface__ are converted to cupy/numpy arrays ([#16436](https://github.com/rapidsai/cudf/pull/16436)) [@mroeschke](https://github.com/mroeschke)
+- Add about rmm modes in `cudf.pandas` docs ([#16404](https://github.com/rapidsai/cudf/pull/16404)) [@galipremsagar](https://github.com/galipremsagar)
+- Gracefully CUDF_FAIL when `skip_rows &gt; 0` in Chunked Parquet reader ([#16385](https://github.com/rapidsai/cudf/pull/16385)) [@mhaseeb123](https://github.com/mhaseeb123)
+- Make C++ compilation warning free after #16297 ([#16379](https://github.com/rapidsai/cudf/pull/16379)) [@wence-](https://github.com/wence-)
+- Align Index __init__ APIs with pandas 2.x ([#16362](https://github.com/rapidsai/cudf/pull/16362)) [@mroeschke](https://github.com/mroeschke)
+- Use rapids_cpm_bs_thread_pool() ([#16360](https://github.com/rapidsai/cudf/pull/16360)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Rename PrefetchConfig to prefetch_config. ([#16358](https://github.com/rapidsai/cudf/pull/16358)) [@bdice](https://github.com/bdice)
+- Implement parquet reading using pylibcudf in cudf-polars ([#16346](https://github.com/rapidsai/cudf/pull/16346)) [@lithomas1](https://github.com/lithomas1)
+- Fix compile warnings with `jni_utils.hpp` ([#16336](https://github.com/rapidsai/cudf/pull/16336)) [@ttnghia](https://github.com/ttnghia)
+- Align Series APIs with pandas 2.x ([#16333](https://github.com/rapidsai/cudf/pull/16333)) [@mroeschke](https://github.com/mroeschke)
+- Add missing `stream` param to dictionary factory APIs ([#16319](https://github.com/rapidsai/cudf/pull/16319)) [@JayjeetAtGithub](https://github.com/JayjeetAtGithub)
+- Mark cudf._typing as a typing module in ruff ([#16318](https://github.com/rapidsai/cudf/pull/16318)) [@mroeschke](https://github.com/mroeschke)
+- Add `stream` param to list explode APIs ([#16317](https://github.com/rapidsai/cudf/pull/16317)) [@JayjeetAtGithub](https://github.com/JayjeetAtGithub)
+- Fix polars for 1.2.1 ([#16316](https://github.com/rapidsai/cudf/pull/16316)) [@lithomas1](https://github.com/lithomas1)
+- Use workflow branch 24.08 again ([#16314](https://github.com/rapidsai/cudf/pull/16314)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Deprecate dtype= parameter in reduction methods ([#16313](https://github.com/rapidsai/cudf/pull/16313)) [@mroeschke](https://github.com/mroeschke)
+- Remove squeeze argument from groupby ([#16312](https://github.com/rapidsai/cudf/pull/16312)) [@mroeschke](https://github.com/mroeschke)
+- Align more DataFrame APIs with pandas ([#16310](https://github.com/rapidsai/cudf/pull/16310)) [@mroeschke](https://github.com/mroeschke)
+- Clean unneeded/redudant dtype utils ([#16309](https://github.com/rapidsai/cudf/pull/16309)) [@mroeschke](https://github.com/mroeschke)
+- Implement read_csv in cudf-polars using pylibcudf ([#16307](https://github.com/rapidsai/cudf/pull/16307)) [@lithomas1](https://github.com/lithomas1)
+- Use Column.can_cast_safely instead of some ad-hoc dtype functions in .where ([#16303](https://github.com/rapidsai/cudf/pull/16303)) [@mroeschke](https://github.com/mroeschke)
+- Drop `{{ pin_compatible(&#39;numpy&#39;, max_pin=&#39;x&#39;) }}` ([#16301](https://github.com/rapidsai/cudf/pull/16301)) [@jakirkham](https://github.com/jakirkham)
+- Host implementation of `to_arrow` using nanoarrow ([#16297](https://github.com/rapidsai/cudf/pull/16297)) [@zeroshade](https://github.com/zeroshade)
+- Add ability to prefetch in `cudf.pandas` and change default to managed pool ([#16296](https://github.com/rapidsai/cudf/pull/16296)) [@galipremsagar](https://github.com/galipremsagar)
+- Fix tests for polars 1.2 ([#16292](https://github.com/rapidsai/cudf/pull/16292)) [@lithomas1](https://github.com/lithomas1)
+- Introduce dedicated options for low memory readers ([#16289](https://github.com/rapidsai/cudf/pull/16289)) [@galipremsagar](https://github.com/galipremsagar)
+- Remove decimal/floating 64/128bit switches due to register pressure ([#16287](https://github.com/rapidsai/cudf/pull/16287)) [@pmattione-nvidia](https://github.com/pmattione-nvidia)
+- Make ColumnAccessor strictly require a mapping of columns ([#16285](https://github.com/rapidsai/cudf/pull/16285)) [@mroeschke](https://github.com/mroeschke)
+- Introduce version file so we can conditionally handle things in tests ([#16280](https://github.com/rapidsai/cudf/pull/16280)) [@wence-](https://github.com/wence-)
+- Type &amp; reduce cupy usage ([#16277](https://github.com/rapidsai/cudf/pull/16277)) [@mroeschke](https://github.com/mroeschke)
+- Update cudf::detail::grid_1d to use thread_index_type ([#16276](https://github.com/rapidsai/cudf/pull/16276)) [@davidwendt](https://github.com/davidwendt)
+- Replace np.isscalar/issubdtype checks with is_scalar/.kind checks ([#16275](https://github.com/rapidsai/cudf/pull/16275)) [@mroeschke](https://github.com/mroeschke)
+- Remove xml from sort_ninja_log.py utility ([#16274](https://github.com/rapidsai/cudf/pull/16274)) [@davidwendt](https://github.com/davidwendt)
+- Fix issue in horizontal concat implementation in cudf-polars ([#16271](https://github.com/rapidsai/cudf/pull/16271)) [@wence-](https://github.com/wence-)
+- Preserve order in left join for cudf-polars ([#16268](https://github.com/rapidsai/cudf/pull/16268)) [@wence-](https://github.com/wence-)
+- Replace is_datetime/timedelta_dtype checks with .kind checks ([#16262](https://github.com/rapidsai/cudf/pull/16262)) [@mroeschke](https://github.com/mroeschke)
+- Replace is_float/integer_dtype checks with .kind checks ([#16261](https://github.com/rapidsai/cudf/pull/16261)) [@mroeschke](https://github.com/mroeschke)
+- Build and test with CUDA 12.5.1 ([#16259](https://github.com/rapidsai/cudf/pull/16259)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Replace is_bool_type with checking .dtype.kind ([#16255](https://github.com/rapidsai/cudf/pull/16255)) [@mroeschke](https://github.com/mroeschke)
+- remove `cuco_noexcept.diff` ([#16254](https://github.com/rapidsai/cudf/pull/16254)) [@trxcllnt](https://github.com/trxcllnt)
+- Update contains_tests.cpp to use public cudf::slice ([#16253](https://github.com/rapidsai/cudf/pull/16253)) [@davidwendt](https://github.com/davidwendt)
+- Improve the test data for pylibcudf I/O tests ([#16247](https://github.com/rapidsai/cudf/pull/16247)) [@lithomas1](https://github.com/lithomas1)
+- Short circuit some Column methods ([#16246](https://github.com/rapidsai/cudf/pull/16246)) [@mroeschke](https://github.com/mroeschke)
+- Make nvcomp adapter compatible with new version macros ([#16245](https://github.com/rapidsai/cudf/pull/16245)) [@vuule](https://github.com/vuule)
+- Add Column.strftime/strptime instead of overloading `as_string/datetime/timedelta_column` ([#16243](https://github.com/rapidsai/cudf/pull/16243)) [@mroeschke](https://github.com/mroeschke)
+- Remove temporary functor overloads required by cuco version bump ([#16242](https://github.com/rapidsai/cudf/pull/16242)) [@PointKernel](https://github.com/PointKernel)
+- Remove hash_character_ngrams dependency from jaccard_index ([#16241](https://github.com/rapidsai/cudf/pull/16241)) [@davidwendt](https://github.com/davidwendt)
+- Expose sorted groupby parameters to pylibcudf ([#16240](https://github.com/rapidsai/cudf/pull/16240)) [@wence-](https://github.com/wence-)
+- Expose reflection to check if casting between two types is supported ([#16239](https://github.com/rapidsai/cudf/pull/16239)) [@wence-](https://github.com/wence-)
+- Handle nans in groupby-aggregations in polars executor ([#16233](https://github.com/rapidsai/cudf/pull/16233)) [@wence-](https://github.com/wence-)
+- Remove `mr` param from `write_csv` and `write_json` ([#16231](https://github.com/rapidsai/cudf/pull/16231)) [@JayjeetAtGithub](https://github.com/JayjeetAtGithub)
+- Support Literals in groupby-agg ([#16218](https://github.com/rapidsai/cudf/pull/16218)) [@wence-](https://github.com/wence-)
+- Handler csv reader options in cudf-polars ([#16211](https://github.com/rapidsai/cudf/pull/16211)) [@wence-](https://github.com/wence-)
+- Update vendored thread_pool implementation ([#16210](https://github.com/rapidsai/cudf/pull/16210)) [@wence-](https://github.com/wence-)
+- Add low memory JSON reader for `cudf.pandas` ([#16204](https://github.com/rapidsai/cudf/pull/16204)) [@galipremsagar](https://github.com/galipremsagar)
+- Clean up state variables in MultiIndex ([#16203](https://github.com/rapidsai/cudf/pull/16203)) [@mroeschke](https://github.com/mroeschke)
+- skip CMake 3.30.0 ([#16202](https://github.com/rapidsai/cudf/pull/16202)) [@jameslamb](https://github.com/jameslamb)
+- Assert valid metadata is passed in to_arrow for list_view ([#16198](https://github.com/rapidsai/cudf/pull/16198)) [@wence-](https://github.com/wence-)
+- Expose type traits to pylibcudf ([#16197](https://github.com/rapidsai/cudf/pull/16197)) [@wence-](https://github.com/wence-)
+- Report number of rows per file read by PQ reader when no row selection and fix segfault in chunked PQ reader when skip_rows &gt; 0 ([#16195](https://github.com/rapidsai/cudf/pull/16195)) [@mhaseeb123](https://github.com/mhaseeb123)
+- Cast count aggs to correct dtype in translation ([#16192](https://github.com/rapidsai/cudf/pull/16192)) [@wence-](https://github.com/wence-)
+- Some small fixes in cudf-polars ([#16191](https://github.com/rapidsai/cudf/pull/16191)) [@wence-](https://github.com/wence-)
+- split up CUDA-suffixed dependencies in dependencies.yaml ([#16183](https://github.com/rapidsai/cudf/pull/16183)) [@jameslamb](https://github.com/jameslamb)
+- Define PTDS for the stream hook libs ([#16182](https://github.com/rapidsai/cudf/pull/16182)) [@trxcllnt](https://github.com/trxcllnt)
+- Make `test_python_cudf_pandas` generate `requirements.txt` ([#16181](https://github.com/rapidsai/cudf/pull/16181)) [@trxcllnt](https://github.com/trxcllnt)
+- Add environment-agnostic `ci/run_cudf_polars_pytest.sh` ([#16178](https://github.com/rapidsai/cudf/pull/16178)) [@trxcllnt](https://github.com/trxcllnt)
+- Implement translation for some unary functions and a single datetime extraction ([#16173](https://github.com/rapidsai/cudf/pull/16173)) [@wence-](https://github.com/wence-)
+- Remove size constraints on source files in batched JSON reading ([#16162](https://github.com/rapidsai/cudf/pull/16162)) [@shrshi](https://github.com/shrshi)
+- CI: Build wheels for cudf-polars ([#16156](https://github.com/rapidsai/cudf/pull/16156)) [@lithomas1](https://github.com/lithomas1)
+- Update cudf-polars for v1 release of polars ([#16149](https://github.com/rapidsai/cudf/pull/16149)) [@wence-](https://github.com/wence-)
+- Use strings concatenate to support large strings in CSV writer ([#16148](https://github.com/rapidsai/cudf/pull/16148)) [@davidwendt](https://github.com/davidwendt)
+- Use verify-alpha-spec hook ([#16144](https://github.com/rapidsai/cudf/pull/16144)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Adds write-coalescing code path optimization to FST ([#16143](https://github.com/rapidsai/cudf/pull/16143)) [@elstehle](https://github.com/elstehle)
+- MAINT: Adapt to NumPy 2 promotion changes ([#16141](https://github.com/rapidsai/cudf/pull/16141)) [@seberg](https://github.com/seberg)
+- API: Check for integer overflows when creating scalar form python int ([#16140](https://github.com/rapidsai/cudf/pull/16140)) [@seberg](https://github.com/seberg)
+- Remove the (unused) implementation of `host_parse_nested_json` ([#16135](https://github.com/rapidsai/cudf/pull/16135)) [@vuule](https://github.com/vuule)
+- Deprecate Arrow support in I/O ([#16132](https://github.com/rapidsai/cudf/pull/16132)) [@lithomas1](https://github.com/lithomas1)
+- Disable dict support for split-page kernel in the parquet reader. ([#16128](https://github.com/rapidsai/cudf/pull/16128)) [@nvdbaranec](https://github.com/nvdbaranec)
+- Add throughput metrics for REDUCTION_BENCH/REDUCTION_NVBENCH benchmarks ([#16126](https://github.com/rapidsai/cudf/pull/16126)) [@jihoonson](https://github.com/jihoonson)
+- Add ensure_index to not unnecessarily shallow copy cudf.Index ([#16117](https://github.com/rapidsai/cudf/pull/16117)) [@mroeschke](https://github.com/mroeschke)
+- Make binary operators work between fixed-point and floating args ([#16116](https://github.com/rapidsai/cudf/pull/16116)) [@pmattione-nvidia](https://github.com/pmattione-nvidia)
+- Implement Ternary copy_if_else ([#16114](https://github.com/rapidsai/cudf/pull/16114)) [@wence-](https://github.com/wence-)
+- Implement handlers for series literal in cudf-polars ([#16113](https://github.com/rapidsai/cudf/pull/16113)) [@wence-](https://github.com/wence-)
+- Fix dtype errors in `StringArrays` ([#16111](https://github.com/rapidsai/cudf/pull/16111)) [@galipremsagar](https://github.com/galipremsagar)
+- Ensure MultiIndex.to_frame deep copies columns ([#16110](https://github.com/rapidsai/cudf/pull/16110)) [@mroeschke](https://github.com/mroeschke)
+- Parallelize `gpuInitStringDescriptors` for fixed length byte array data ([#16109](https://github.com/rapidsai/cudf/pull/16109)) [@mhaseeb123](https://github.com/mhaseeb123)
+- Finish implementation of cudf-polars boolean function handlers ([#16098](https://github.com/rapidsai/cudf/pull/16098)) [@wence-](https://github.com/wence-)
+- Expose and then implement support for cross joins in cudf-polars ([#16097](https://github.com/rapidsai/cudf/pull/16097)) [@wence-](https://github.com/wence-)
+- Defer copying in Column.astype(copy=True) ([#16095](https://github.com/rapidsai/cudf/pull/16095)) [@mroeschke](https://github.com/mroeschke)
+- Fix segfault in conditional join ([#16094](https://github.com/rapidsai/cudf/pull/16094)) [@bdice](https://github.com/bdice)
+- Free temp memory no longer needed in multibyte_split processing ([#16091](https://github.com/rapidsai/cudf/pull/16091)) [@davidwendt](https://github.com/davidwendt)
+- Rename gather/scatter benchmarks to clarify coalesced behavior. ([#16083](https://github.com/rapidsai/cudf/pull/16083)) [@bdice](https://github.com/bdice)
+- Adapt to polars upstream changes and turn on CI testing ([#16081](https://github.com/rapidsai/cudf/pull/16081)) [@wence-](https://github.com/wence-)
+- Reduce/clean copy usage in Series, reshaping ([#16080](https://github.com/rapidsai/cudf/pull/16080)) [@mroeschke](https://github.com/mroeschke)
+- Account for FIXED_LEN_BYTE_ARRAY when calculating fragment sizes in Parquet writer ([#16064](https://github.com/rapidsai/cudf/pull/16064)) [@etseidl](https://github.com/etseidl)
+- Reduce (shallow) copies in DataFrame ops ([#16060](https://github.com/rapidsai/cudf/pull/16060)) [@mroeschke](https://github.com/mroeschke)
+- Add multi-file support to `dask_cudf.read_json` ([#16057](https://github.com/rapidsai/cudf/pull/16057)) [@rjzamora](https://github.com/rjzamora)
+- Reduce deep copies in Index ops ([#16054](https://github.com/rapidsai/cudf/pull/16054)) [@mroeschke](https://github.com/mroeschke)
+- Implement chunked column wise concat in chunked parquet reader ([#16052](https://github.com/rapidsai/cudf/pull/16052)) [@galipremsagar](https://github.com/galipremsagar)
+- Add exception when trying to create large strings with cudf::test::strings_column_wrapper ([#16049](https://github.com/rapidsai/cudf/pull/16049)) [@davidwendt](https://github.com/davidwendt)
+- Return `FrozenList` for `Index.names` ([#16047](https://github.com/rapidsai/cudf/pull/16047)) [@galipremsagar](https://github.com/galipremsagar)
+- Add ast cast test ([#16045](https://github.com/rapidsai/cudf/pull/16045)) [@pmattione-nvidia](https://github.com/pmattione-nvidia)
+- Remove `override_dtypes` and `include_index` from `Frame._copy_type_metadata` ([#16043](https://github.com/rapidsai/cudf/pull/16043)) [@mroeschke](https://github.com/mroeschke)
+- Add ruff rules to avoid importing from typing ([#16040](https://github.com/rapidsai/cudf/pull/16040)) [@mroeschke](https://github.com/mroeschke)
+- Fix decimal -&gt; float cast in ast code ([#16038](https://github.com/rapidsai/cudf/pull/16038)) [@pmattione-nvidia](https://github.com/pmattione-nvidia)
+- Add compile option to enable large strings support ([#16037](https://github.com/rapidsai/cudf/pull/16037)) [@davidwendt](https://github.com/davidwendt)
+- Reduce conditional_join nvbench configurations ([#16036](https://github.com/rapidsai/cudf/pull/16036)) [@srinivasyadav18](https://github.com/srinivasyadav18)
+- Project automation update: skip if not in project ([#16035](https://github.com/rapidsai/cudf/pull/16035)) [@jarmak-nv](https://github.com/jarmak-nv)
+- Add stream parameter to cudf::io::text::multibyte_split ([#16034](https://github.com/rapidsai/cudf/pull/16034)) [@davidwendt](https://github.com/davidwendt)
+- Delete unused code from stringfunction evaluator ([#16032](https://github.com/rapidsai/cudf/pull/16032)) [@wence-](https://github.com/wence-)
+- Fix exclude regex in pre-commit clang-format hook ([#16030](https://github.com/rapidsai/cudf/pull/16030)) [@wence-](https://github.com/wence-)
+- Refactor rmm usage in `cudf.pandas` ([#16021](https://github.com/rapidsai/cudf/pull/16021)) [@galipremsagar](https://github.com/galipremsagar)
+- Enable ruff TCH: typing imports under if TYPE_CHECKING ([#16015](https://github.com/rapidsai/cudf/pull/16015)) [@mroeschke](https://github.com/mroeschke)
+- Restrict the allowed pandas timezone objects in cudf ([#16013](https://github.com/rapidsai/cudf/pull/16013)) [@mroeschke](https://github.com/mroeschke)
+- orc multithreaded benchmark ([#16009](https://github.com/rapidsai/cudf/pull/16009)) [@zpuller](https://github.com/zpuller)
+- Add tests of expression-based sort and sort-by ([#16008](https://github.com/rapidsai/cudf/pull/16008)) [@wence-](https://github.com/wence-)
+- Add tests of implemented StringFunctions ([#16007](https://github.com/rapidsai/cudf/pull/16007)) [@wence-](https://github.com/wence-)
+- Add test that diagonal concat with mismatching schemas raises ([#16006](https://github.com/rapidsai/cudf/pull/16006)) [@wence-](https://github.com/wence-)
+- Add coverage selecting len from a dataframe (number of rows) ([#16005](https://github.com/rapidsai/cudf/pull/16005)) [@wence-](https://github.com/wence-)
+- Add basic tests of dataframe scan ([#16003](https://github.com/rapidsai/cudf/pull/16003)) [@wence-](https://github.com/wence-)
+- Add coverage for both expression and dataframe filter ([#16002](https://github.com/rapidsai/cudf/pull/16002)) [@wence-](https://github.com/wence-)
+- Remove deprecated ExtContext node ([#16001](https://github.com/rapidsai/cudf/pull/16001)) [@wence-](https://github.com/wence-)
+- Fix typo bug in gather implementation ([#16000](https://github.com/rapidsai/cudf/pull/16000)) [@wence-](https://github.com/wence-)
+- Extend coverage of groupby and rolling window nodes ([#15999](https://github.com/rapidsai/cudf/pull/15999)) [@wence-](https://github.com/wence-)
+- Coverage of binops where one or both operands are a scalar ([#15998](https://github.com/rapidsai/cudf/pull/15998)) [@wence-](https://github.com/wence-)
+- Add full coverage for whole-frame Agg expressions ([#15997](https://github.com/rapidsai/cudf/pull/15997)) [@wence-](https://github.com/wence-)
+- Add tests covering magic methods of Expr objects ([#15996](https://github.com/rapidsai/cudf/pull/15996)) [@wence-](https://github.com/wence-)
+- Add full coverage of utility functions ([#15995](https://github.com/rapidsai/cudf/pull/15995)) [@wence-](https://github.com/wence-)
+- Test behaviour of containers ([#15994](https://github.com/rapidsai/cudf/pull/15994)) [@wence-](https://github.com/wence-)
+- Fix implemention of any, all, and isbetween ([#15993](https://github.com/rapidsai/cudf/pull/15993)) [@wence-](https://github.com/wence-)
+- Raise early on unhandled PythonScan node ([#15992](https://github.com/rapidsai/cudf/pull/15992)) [@wence-](https://github.com/wence-)
+- Remove mapfunction nodes that don&#39;t exist/aren&#39;t supported ([#15991](https://github.com/rapidsai/cudf/pull/15991)) [@wence-](https://github.com/wence-)
+- Add test coverage for slicing with &quot;out of bounds&quot; negative indices ([#15990](https://github.com/rapidsai/cudf/pull/15990)) [@wence-](https://github.com/wence-)
+- Standardize and type `Series.dt` methods ([#15987](https://github.com/rapidsai/cudf/pull/15987)) [@mroeschke](https://github.com/mroeschke)
+- Refactor distinct with hashset-based algorithms ([#15984](https://github.com/rapidsai/cudf/pull/15984)) [@srinivasyadav18](https://github.com/srinivasyadav18)
+- resolve dependency-file-generator warning, remove unnecessary rapids-build-backend configuration ([#15980](https://github.com/rapidsai/cudf/pull/15980)) [@jameslamb](https://github.com/jameslamb)
+- Project automation bug fixes ([#15971](https://github.com/rapidsai/cudf/pull/15971)) [@jarmak-nv](https://github.com/jarmak-nv)
+- Add typing to single_column_frame ([#15965](https://github.com/rapidsai/cudf/pull/15965)) [@mroeschke](https://github.com/mroeschke)
+- Move some misc Frame methods to appropriate locations ([#15963](https://github.com/rapidsai/cudf/pull/15963)) [@mroeschke](https://github.com/mroeschke)
+- Condense pylibcudf data fixtures ([#15958](https://github.com/rapidsai/cudf/pull/15958)) [@lithomas1](https://github.com/lithomas1)
+- Refactor fillna logic to push specifics toward Frame subclasses and Column subclasses ([#15957](https://github.com/rapidsai/cudf/pull/15957)) [@mroeschke](https://github.com/mroeschke)
+- Remove unused parsing utilities ([#15955](https://github.com/rapidsai/cudf/pull/15955)) [@vuule](https://github.com/vuule)
+- Remove `Scalar` container type from polars interpreter ([#15953](https://github.com/rapidsai/cudf/pull/15953)) [@wence-](https://github.com/wence-)
+- Support arbitrary CUDA versions in UDF code ([#15950](https://github.com/rapidsai/cudf/pull/15950)) [@bdice](https://github.com/bdice)
+- Support large strings in cudf::io::text::multibyte_split ([#15947](https://github.com/rapidsai/cudf/pull/15947)) [@davidwendt](https://github.com/davidwendt)
+- Add external issue label and project automation ([#15945](https://github.com/rapidsai/cudf/pull/15945)) [@jarmak-nv](https://github.com/jarmak-nv)
+- Enable round-tripping of large strings in `cudf` ([#15944](https://github.com/rapidsai/cudf/pull/15944)) [@galipremsagar](https://github.com/galipremsagar)
+- Add more complete type annotations in polars interpreter ([#15942](https://github.com/rapidsai/cudf/pull/15942)) [@wence-](https://github.com/wence-)
+- Update implementations to build with the latest cuco ([#15938](https://github.com/rapidsai/cudf/pull/15938)) [@PointKernel](https://github.com/PointKernel)
+- Support timezone aware pandas inputs in cudf ([#15935](https://github.com/rapidsai/cudf/pull/15935)) [@mroeschke](https://github.com/mroeschke)
+- Define Column.nan_as_null to return self ([#15923](https://github.com/rapidsai/cudf/pull/15923)) [@mroeschke](https://github.com/mroeschke)
+- Make Frame._dtype an iterator instead of a dict ([#15920](https://github.com/rapidsai/cudf/pull/15920)) [@mroeschke](https://github.com/mroeschke)
+- Port start of datetime.hpp to pylibcudf ([#15916](https://github.com/rapidsai/cudf/pull/15916)) [@wence-](https://github.com/wence-)
+- Introduce `NamedColumn` concept in cudf-polars ([#15914](https://github.com/rapidsai/cudf/pull/15914)) [@wence-](https://github.com/wence-)
+- Avoid redefining Frame._get_columns_by_label in subclasses ([#15912](https://github.com/rapidsai/cudf/pull/15912)) [@mroeschke](https://github.com/mroeschke)
+- Templatization of fixed-width parquet decoding kernels. ([#15911](https://github.com/rapidsai/cudf/pull/15911)) [@nvdbaranec](https://github.com/nvdbaranec)
+- New Decimal &lt;--&gt; Floating conversion ([#15905](https://github.com/rapidsai/cudf/pull/15905)) [@pmattione-nvidia](https://github.com/pmattione-nvidia)
+- Use Arrow C Data Interface functions for Python interop ([#15904](https://github.com/rapidsai/cudf/pull/15904)) [@vyasr](https://github.com/vyasr)
+- Use offsetalator in cudf::io::json::detail::parse_string ([#15900](https://github.com/rapidsai/cudf/pull/15900)) [@davidwendt](https://github.com/davidwendt)
+- Rename strings multiple target replace API ([#15898](https://github.com/rapidsai/cudf/pull/15898)) [@davidwendt](https://github.com/davidwendt)
+- Apply clang-tidy autofixes ([#15894](https://github.com/rapidsai/cudf/pull/15894)) [@vyasr](https://github.com/vyasr)
+- Update Python labels and remove unnecessary ones ([#15893](https://github.com/rapidsai/cudf/pull/15893)) [@vyasr](https://github.com/vyasr)
+- Clean up pylibcudf test assertations ([#15892](https://github.com/rapidsai/cudf/pull/15892)) [@lithomas1](https://github.com/lithomas1)
+- Use offsetalator in orc rowgroup_char_counts_kernel ([#15891](https://github.com/rapidsai/cudf/pull/15891)) [@davidwendt](https://github.com/davidwendt)
+- Ensure literals have correct dtype ([#15890](https://github.com/rapidsai/cudf/pull/15890)) [@wence-](https://github.com/wence-)
+- Add overflow check when converting large strings to lists columns ([#15887](https://github.com/rapidsai/cudf/pull/15887)) [@davidwendt](https://github.com/davidwendt)
+- Use offsetalator in nvtext::tokenize_with_vocabulary ([#15878](https://github.com/rapidsai/cudf/pull/15878)) [@davidwendt](https://github.com/davidwendt)
+- Update interleave lists column for large strings ([#15877](https://github.com/rapidsai/cudf/pull/15877)) [@davidwendt](https://github.com/davidwendt)
+- Simple NumPy 2 fixes that are clearly no behavior change ([#15876](https://github.com/rapidsai/cudf/pull/15876)) [@seberg](https://github.com/seberg)
+- Support `arrow:schema` in Parquet writer to faithfully roundtrip `duration` types with Arrow ([#15875](https://github.com/rapidsai/cudf/pull/15875)) [@mhaseeb123](https://github.com/mhaseeb123)
+- Refactor join benchmarks to target public APIs with the default stream ([#15873](https://github.com/rapidsai/cudf/pull/15873)) [@PointKernel](https://github.com/PointKernel)
+- Fix url-decode benchmark to use offsetalator ([#15871](https://github.com/rapidsai/cudf/pull/15871)) [@davidwendt](https://github.com/davidwendt)
+- Use offsetalator in strings shift functor ([#15870](https://github.com/rapidsai/cudf/pull/15870)) [@davidwendt](https://github.com/davidwendt)
+- Memory Profiling ([#15866](https://github.com/rapidsai/cudf/pull/15866)) [@madsbk](https://github.com/madsbk)
+- Expose stream parameter to public rolling APIs ([#15865](https://github.com/rapidsai/cudf/pull/15865)) [@srinivasyadav18](https://github.com/srinivasyadav18)
+- Make Frame.astype return Self instead of a ColumnAccessor ([#15861](https://github.com/rapidsai/cudf/pull/15861)) [@mroeschke](https://github.com/mroeschke)
+- Use ColumnAccessor row and column length attributes more consistently ([#15857](https://github.com/rapidsai/cudf/pull/15857)) [@mroeschke](https://github.com/mroeschke)
+- add unit test setup for cudf_kafka ([#15853](https://github.com/rapidsai/cudf/pull/15853)) [@jameslamb](https://github.com/jameslamb)
+- Remove internal usage of core.index.as_index in favor of cudf.Index ([#15851](https://github.com/rapidsai/cudf/pull/15851)) [@mroeschke](https://github.com/mroeschke)
+- Ensure cudf.Series(cudf.Series(...)) creates a reference to the same index ([#15845](https://github.com/rapidsai/cudf/pull/15845)) [@mroeschke](https://github.com/mroeschke)
+- Remove benchmark-specific use of pinned-pooled memory in Parquet multithreaded benchmark. ([#15838](https://github.com/rapidsai/cudf/pull/15838)) [@nvdbaranec](https://github.com/nvdbaranec)
+- Implement `on_bad_lines` in json reader ([#15834](https://github.com/rapidsai/cudf/pull/15834)) [@galipremsagar](https://github.com/galipremsagar)
+- Make Column.to_pandas return Index instead of Series ([#15833](https://github.com/rapidsai/cudf/pull/15833)) [@mroeschke](https://github.com/mroeschke)
+- Add test of interoperability of cuDF and arrow BYTE_STREAM_SPLIT encoders ([#15832](https://github.com/rapidsai/cudf/pull/15832)) [@etseidl](https://github.com/etseidl)
+- Refactor Parquet writer options and builders ([#15831](https://github.com/rapidsai/cudf/pull/15831)) [@etseidl](https://github.com/etseidl)
+- Migrate reshape.pxd to pylibcudf ([#15827](https://github.com/rapidsai/cudf/pull/15827)) [@lithomas1](https://github.com/lithomas1)
+- Remove legacy JSON reader and concurrent_unordered_map.cuh. ([#15813](https://github.com/rapidsai/cudf/pull/15813)) [@bdice](https://github.com/bdice)
+- Switch cuIO benchmarks to use pinned-pool host allocations by default. ([#15805](https://github.com/rapidsai/cudf/pull/15805)) [@nvdbaranec](https://github.com/nvdbaranec)
+- Change thrust::count_if call to raw kernel in strings split APIs ([#15762](https://github.com/rapidsai/cudf/pull/15762)) [@davidwendt](https://github.com/davidwendt)
+- Improve performance for long strings for nvtext::replace_tokens ([#15756](https://github.com/rapidsai/cudf/pull/15756)) [@davidwendt](https://github.com/davidwendt)
+- Implement chunked parquet reader in cudf-python ([#15728](https://github.com/rapidsai/cudf/pull/15728)) [@galipremsagar](https://github.com/galipremsagar)
+- Add `from_arrow_host` functions for cudf interop with nanoarrow ([#15645](https://github.com/rapidsai/cudf/pull/15645)) [@zeroshade](https://github.com/zeroshade)
+- Add ability to enable rmm pool on `cudf.pandas` import ([#15628](https://github.com/rapidsai/cudf/pull/15628)) [@galipremsagar](https://github.com/galipremsagar)
+- Executor for polars logical plans ([#15504](https://github.com/rapidsai/cudf/pull/15504)) [@wence-](https://github.com/wence-)
+- Implement day_name and month_name to match pandas ([#15479](https://github.com/rapidsai/cudf/pull/15479)) [@btepera](https://github.com/btepera)
+- Utilities for decimal &lt;--&gt; floating conversion ([#15359](https://github.com/rapidsai/cudf/pull/15359)) [@pmattione-nvidia](https://github.com/pmattione-nvidia)
+- For powers of 10, replace ipow with switch ([#15353](https://github.com/rapidsai/cudf/pull/15353)) [@pmattione-nvidia](https://github.com/pmattione-nvidia)
+- Use rapids-build-backend. ([#15245](https://github.com/rapidsai/cudf/pull/15245)) [@vyasr](https://github.com/vyasr)
+- Add `codecov` coverage for `pandas_tests` ([#14513](https://github.com/rapidsai/cudf/pull/14513)) [@galipremsagar](https://github.com/galipremsagar)
+
 # cudf 24.06.00 (5 Jun 2024)
 
 ## 🚨 Breaking Changes

From 3fd8783e49246f4ae61351375201d616d5ab6b55 Mon Sep 17 00:00:00 2001
From: Jayjeet Chakraborty <jc.github@rediffmail.com>
Date: Wed, 7 Aug 2024 13:00:09 -0700
Subject: [PATCH 634/842] Add `stream` param to stream compaction APIs (#16295)

Add `stream` param to a bunch of stream compaction APIs.

Authors:
  - Jayjeet Chakraborty (https://github.com/JayjeetAtGithub)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Mark Harris (https://github.com/harrism)
  - Karthikeyan (https://github.com/karthikeyann)
  - Mike Wilson (https://github.com/hyperbolic2346)

URL: https://github.com/rapidsai/cudf/pull/16295
---
 cpp/include/cudf/detail/stream_compaction.hpp |  30 +-
 .../cudf/lists/detail/stream_compaction.hpp   |   9 +-
 cpp/include/cudf/stream_compaction.hpp        |  30 +-
 .../stream_compaction/apply_boolean_mask.cu   |   3 +-
 cpp/src/stream_compaction/distinct.cu         |   4 +-
 cpp/src/stream_compaction/distinct_count.cu   |  11 +-
 cpp/src/stream_compaction/drop_nans.cu        |   6 +-
 cpp/src/stream_compaction/drop_nulls.cu       |   6 +-
 cpp/src/stream_compaction/unique.cu           |   3 +-
 cpp/src/stream_compaction/unique_count.cu     |   8 +-
 .../stream_compaction/unique_count_column.cu  |   7 +-
 cpp/tests/streams/stream_compaction_test.cpp  | 365 ++++++++++++++----
 java/src/main/native/src/TableJni.cpp         |   1 +
 13 files changed, 362 insertions(+), 121 deletions(-)

diff --git a/cpp/include/cudf/detail/stream_compaction.hpp b/cpp/include/cudf/detail/stream_compaction.hpp
index 05194148a70..85d2ee9790f 100644
--- a/cpp/include/cudf/detail/stream_compaction.hpp
+++ b/cpp/include/cudf/detail/stream_compaction.hpp
@@ -29,9 +29,7 @@ namespace CUDF_EXPORT cudf {
 namespace detail {
 /**
  * @copydoc cudf::drop_nulls(table_view const&, std::vector<size_type> const&,
- *                           cudf::size_type, rmm::device_async_resource_ref)
- *
- * @param[in] stream CUDA stream used for device memory operations and kernel launches.
+ *                           cudf::size_type, rmm::cuda_stream_view, rmm::device_async_resource_ref)
  */
 std::unique_ptr<table> drop_nulls(table_view const& input,
                                   std::vector<size_type> const& keys,
@@ -41,9 +39,7 @@ std::unique_ptr<table> drop_nulls(table_view const& input,
 
 /**
  * @copydoc cudf::drop_nans(table_view const&, std::vector<size_type> const&,
- *                          cudf::size_type, rmm::device_async_resource_ref)
- *
- * @param[in] stream CUDA stream used for device memory operations and kernel launches.
+ *                          cudf::size_type, rmm::cuda_stream_view, rmm::device_async_resource_ref)
  */
 std::unique_ptr<table> drop_nans(table_view const& input,
                                  std::vector<size_type> const& keys,
@@ -53,8 +49,6 @@ std::unique_ptr<table> drop_nans(table_view const& input,
 
 /**
  * @copydoc cudf::apply_boolean_mask
- *
- * @param[in] stream CUDA stream used for device memory operations and kernel launches.
  */
 std::unique_ptr<table> apply_boolean_mask(table_view const& input,
                                           column_view const& boolean_mask,
@@ -63,8 +57,6 @@ std::unique_ptr<table> apply_boolean_mask(table_view const& input,
 
 /**
  * @copydoc cudf::unique
- *
- * @param[in] stream CUDA stream used for device memory operations and kernel launches.
  */
 std::unique_ptr<table> unique(table_view const& input,
                               std::vector<size_type> const& keys,
@@ -75,8 +67,6 @@ std::unique_ptr<table> unique(table_view const& input,
 
 /**
  * @copydoc cudf::distinct
- *
- * @param[in] stream CUDA stream used for device memory operations and kernel launches.
  */
 std::unique_ptr<table> distinct(table_view const& input,
                                 std::vector<size_type> const& keys,
@@ -110,9 +100,7 @@ rmm::device_uvector<size_type> distinct_indices(table_view const& input,
                                                 rmm::device_async_resource_ref mr);
 
 /**
- * @copydoc cudf::unique_count(column_view const&, null_policy, nan_policy)
- *
- * @param[in] stream CUDA stream used for device memory operations and kernel launches.
+ * @copydoc cudf::unique_count(column_view const&, null_policy, nan_policy, rmm::cuda_stream_view)
  */
 cudf::size_type unique_count(column_view const& input,
                              null_policy null_handling,
@@ -120,18 +108,14 @@ cudf::size_type unique_count(column_view const& input,
                              rmm::cuda_stream_view stream);
 
 /**
- * @copydoc cudf::unique_count(table_view const&, null_equality)
- *
- * @param[in] stream CUDA stream used for device memory operations and kernel launches.
+ * @copydoc cudf::unique_count(table_view const&, null_equality, rmm::cuda_stream_view)
  */
 cudf::size_type unique_count(table_view const& input,
                              null_equality nulls_equal,
                              rmm::cuda_stream_view stream);
 
 /**
- * @copydoc cudf::distinct_count(column_view const&, null_policy, nan_policy)
- *
- * @param[in] stream CUDA stream used for device memory operations and kernel launches.
+ * @copydoc cudf::distinct_count(column_view const&, null_policy, nan_policy, rmm::cuda_stream_view)
  */
 cudf::size_type distinct_count(column_view const& input,
                                null_policy null_handling,
@@ -139,9 +123,7 @@ cudf::size_type distinct_count(column_view const& input,
                                rmm::cuda_stream_view stream);
 
 /**
- * @copydoc cudf::distinct_count(table_view const&, null_equality)
- *
- * @param[in] stream CUDA stream used for device memory operations and kernel launches.
+ * @copydoc cudf::distinct_count(table_view const&, null_equality, rmm::cuda_stream_view)
  */
 cudf::size_type distinct_count(table_view const& input,
                                null_equality nulls_equal,
diff --git a/cpp/include/cudf/lists/detail/stream_compaction.hpp b/cpp/include/cudf/lists/detail/stream_compaction.hpp
index c11e07cd190..be0bd27083c 100644
--- a/cpp/include/cudf/lists/detail/stream_compaction.hpp
+++ b/cpp/include/cudf/lists/detail/stream_compaction.hpp
@@ -26,10 +26,7 @@ namespace CUDF_EXPORT cudf {
 namespace lists::detail {
 
 /**
- * @copydoc cudf::lists::apply_boolean_mask(lists_column_view const&, lists_column_view const&,
- * rmm::device_async_resource_ref)
- *
- * @param stream CUDA stream used for device memory operations and kernel launches
+ * @copydoc cudf::lists::apply_boolean_mask
  */
 std::unique_ptr<column> apply_boolean_mask(lists_column_view const& input,
                                            lists_column_view const& boolean_mask,
@@ -37,9 +34,7 @@ std::unique_ptr<column> apply_boolean_mask(lists_column_view const& input,
                                            rmm::device_async_resource_ref mr);
 
 /**
- * @copydoc cudf::list::distinct
- *
- * @param stream CUDA stream used for device memory operations and kernel launches.
+ * @copydoc cudf::lists::distinct
  */
 std::unique_ptr<column> distinct(lists_column_view const& input,
                                  null_equality nulls_equal,
diff --git a/cpp/include/cudf/stream_compaction.hpp b/cpp/include/cudf/stream_compaction.hpp
index cfe404ff6ab..ced8d5849d0 100644
--- a/cpp/include/cudf/stream_compaction.hpp
+++ b/cpp/include/cudf/stream_compaction.hpp
@@ -67,6 +67,7 @@ namespace CUDF_EXPORT cudf {
  * @param[in] keys  vector of indices representing key columns from `input`
  * @param[in] keep_threshold The minimum number of non-null fields in a row
  *                           required to keep the row.
+ * @param[in] stream CUDA stream used for device memory operations and kernel launches
  * @param[in] mr Device memory resource used to allocate the returned table's device memory
  * @return Table containing all rows of the `input` with at least @p
  * keep_threshold non-null fields in @p keys.
@@ -75,6 +76,7 @@ std::unique_ptr<table> drop_nulls(
   table_view const& input,
   std::vector<size_type> const& keys,
   cudf::size_type keep_threshold,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -99,6 +101,7 @@ std::unique_ptr<table> drop_nulls(
  *
  * @param[in] input The input `table_view` to filter
  * @param[in] keys  vector of indices representing key columns from `input`
+ * @param[in] stream CUDA stream used for device memory operations and kernel launches
  * @param[in] mr Device memory resource used to allocate the returned table's device memory
  * @return Table containing all rows of the `input` without nulls in the columns
  * of @p keys.
@@ -106,6 +109,7 @@ std::unique_ptr<table> drop_nulls(
 std::unique_ptr<table> drop_nulls(
   table_view const& input,
   std::vector<size_type> const& keys,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -141,6 +145,7 @@ std::unique_ptr<table> drop_nulls(
  * @param[in] keys  vector of indices representing key columns from `input`
  * @param[in] keep_threshold The minimum number of non-NAN elements in a row
  *                           required to keep the row.
+ * @param[in] stream CUDA stream used for device memory operations and kernel launches
  * @param[in] mr Device memory resource used to allocate the returned table's device memory
  * @return Table containing all rows of the `input` with at least @p
  * keep_threshold non-NAN elements in @p keys.
@@ -149,6 +154,7 @@ std::unique_ptr<table> drop_nans(
   table_view const& input,
   std::vector<size_type> const& keys,
   cudf::size_type keep_threshold,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -174,6 +180,7 @@ std::unique_ptr<table> drop_nans(
  *
  * @param[in] input The input `table_view` to filter
  * @param[in] keys  vector of indices representing key columns from `input`
+ * @param[in] stream CUDA stream used for device memory operations and kernel launches
  * @param[in] mr Device memory resource used to allocate the returned table's device memory
  * @return Table containing all rows of the `input` without NANs in the columns
  * of @p keys.
@@ -181,6 +188,7 @@ std::unique_ptr<table> drop_nans(
 std::unique_ptr<table> drop_nans(
   table_view const& input,
   std::vector<size_type> const& keys,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -200,6 +208,7 @@ std::unique_ptr<table> drop_nans(
  * @param[in] input The input table_view to filter
  * @param[in] boolean_mask A nullable column_view of type type_id::BOOL8 used
  * as a mask to filter the `input`.
+ * @param[in] stream CUDA stream used for device memory operations and kernel launches
  * @param[in] mr Device memory resource used to allocate the returned table's device memory
  * @return Table containing copy of all rows of @p input passing
  * the filter defined by @p boolean_mask.
@@ -207,6 +216,7 @@ std::unique_ptr<table> drop_nans(
 std::unique_ptr<table> apply_boolean_mask(
   table_view const& input,
   column_view const& boolean_mask,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -241,6 +251,7 @@ enum class duplicate_keep_option {
  * @param[in] keep            keep any, first, last, or none of the found duplicates
  * @param[in] nulls_equal     flag to denote nulls are equal if null_equality::EQUAL, nulls are not
  *                            equal if null_equality::UNEQUAL
+ * @param[in] stream          CUDA stream used for device memory operations and kernel launches
  * @param[in] mr              Device memory resource used to allocate the returned table's device
  *                            memory
  *
@@ -251,6 +262,7 @@ std::unique_ptr<table> unique(
   std::vector<size_type> const& keys,
   duplicate_keep_option keep,
   null_equality nulls_equal         = null_equality::EQUAL,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -269,6 +281,7 @@ std::unique_ptr<table> unique(
  * @param keep Copy any, first, last, or none of the found duplicates
  * @param nulls_equal Flag to specify whether null elements should be considered as equal
  * @param nans_equal Flag to specify whether NaN elements should be considered as equal
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned table
  * @return Table with distinct rows in an unspecified order
  */
@@ -278,6 +291,7 @@ std::unique_ptr<table> distinct(
   duplicate_keep_option keep        = duplicate_keep_option::KEEP_ANY,
   null_equality nulls_equal         = null_equality::EQUAL,
   nan_equality nans_equal           = nan_equality::ALL_EQUAL,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -346,12 +360,14 @@ std::unique_ptr<table> stable_distinct(
  * @param[in] input The column_view whose consecutive groups of equivalent rows will be counted
  * @param[in] null_handling flag to include or ignore `null` while counting
  * @param[in] nan_handling flag to consider `NaN==null` or not
+ * @param[in] stream CUDA stream used for device memory operations and kernel launches
  *
  * @return number of consecutive groups of equivalent rows in the column
  */
 cudf::size_type unique_count(column_view const& input,
                              null_policy null_handling,
-                             nan_policy nan_handling);
+                             nan_policy nan_handling,
+                             rmm::cuda_stream_view stream = cudf::get_default_stream());
 
 /**
  * @brief Count the number of consecutive groups of equivalent rows in a table.
@@ -359,11 +375,13 @@ cudf::size_type unique_count(column_view const& input,
  * @param[in] input Table whose consecutive groups of equivalent rows will be counted
  * @param[in] nulls_equal flag to denote if null elements should be considered equal
  *            nulls are not equal if null_equality::UNEQUAL.
+ * @param[in] stream CUDA stream used for device memory operations and kernel launches
  *
  * @return number of consecutive groups of equivalent rows in the column
  */
 cudf::size_type unique_count(table_view const& input,
-                             null_equality nulls_equal = null_equality::EQUAL);
+                             null_equality nulls_equal    = null_equality::EQUAL,
+                             rmm::cuda_stream_view stream = cudf::get_default_stream());
 
 /**
  * @brief Count the distinct elements in the column_view.
@@ -382,12 +400,14 @@ cudf::size_type unique_count(table_view const& input,
  * @param[in] input The column_view whose distinct elements will be counted
  * @param[in] null_handling flag to include or ignore `null` while counting
  * @param[in] nan_handling flag to consider `NaN==null` or not
+ * @param[in] stream CUDA stream used for device memory operations and kernel launches
  *
  * @return number of distinct rows in the table
  */
 cudf::size_type distinct_count(column_view const& input,
                                null_policy null_handling,
-                               nan_policy nan_handling);
+                               nan_policy nan_handling,
+                               rmm::cuda_stream_view stream = cudf::get_default_stream());
 
 /**
  * @brief Count the distinct rows in a table.
@@ -395,11 +415,13 @@ cudf::size_type distinct_count(column_view const& input,
  * @param[in] input Table whose distinct rows will be counted
  * @param[in] nulls_equal flag to denote if null elements should be considered equal.
  *            nulls are not equal if null_equality::UNEQUAL.
+ * @param[in] stream CUDA stream used for device memory operations and kernel launches
  *
  * @return number of distinct rows in the table
  */
 cudf::size_type distinct_count(table_view const& input,
-                               null_equality nulls_equal = null_equality::EQUAL);
+                               null_equality nulls_equal    = null_equality::EQUAL,
+                               rmm::cuda_stream_view stream = cudf::get_default_stream());
 
 /** @} */
 }  // namespace CUDF_EXPORT cudf
diff --git a/cpp/src/stream_compaction/apply_boolean_mask.cu b/cpp/src/stream_compaction/apply_boolean_mask.cu
index cdca9517d94..9812f4ffbd7 100644
--- a/cpp/src/stream_compaction/apply_boolean_mask.cu
+++ b/cpp/src/stream_compaction/apply_boolean_mask.cu
@@ -91,9 +91,10 @@ std::unique_ptr<table> apply_boolean_mask(table_view const& input,
  */
 std::unique_ptr<table> apply_boolean_mask(table_view const& input,
                                           column_view const& boolean_mask,
+                                          rmm::cuda_stream_view stream,
                                           rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::apply_boolean_mask(input, boolean_mask, cudf::get_default_stream(), mr);
+  return detail::apply_boolean_mask(input, boolean_mask, stream, mr);
 }
 }  // namespace cudf
diff --git a/cpp/src/stream_compaction/distinct.cu b/cpp/src/stream_compaction/distinct.cu
index 6afd6e34c50..24e2692cb6f 100644
--- a/cpp/src/stream_compaction/distinct.cu
+++ b/cpp/src/stream_compaction/distinct.cu
@@ -150,11 +150,11 @@ std::unique_ptr<table> distinct(table_view const& input,
                                 duplicate_keep_option keep,
                                 null_equality nulls_equal,
                                 nan_equality nans_equal,
+                                rmm::cuda_stream_view stream,
                                 rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::distinct(
-    input, keys, keep, nulls_equal, nans_equal, cudf::get_default_stream(), mr);
+  return detail::distinct(input, keys, keep, nulls_equal, nans_equal, stream, mr);
 }
 
 std::unique_ptr<column> distinct_indices(table_view const& input,
diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu
index cdf9faddf31..78eb0fa5212 100644
--- a/cpp/src/stream_compaction/distinct_count.cu
+++ b/cpp/src/stream_compaction/distinct_count.cu
@@ -218,15 +218,18 @@ cudf::size_type distinct_count(column_view const& input,
 
 cudf::size_type distinct_count(column_view const& input,
                                null_policy null_handling,
-                               nan_policy nan_handling)
+                               nan_policy nan_handling,
+                               rmm::cuda_stream_view stream)
 {
   CUDF_FUNC_RANGE();
-  return detail::distinct_count(input, null_handling, nan_handling, cudf::get_default_stream());
+  return detail::distinct_count(input, null_handling, nan_handling, stream);
 }
 
-cudf::size_type distinct_count(table_view const& input, null_equality nulls_equal)
+cudf::size_type distinct_count(table_view const& input,
+                               null_equality nulls_equal,
+                               rmm::cuda_stream_view stream)
 {
   CUDF_FUNC_RANGE();
-  return detail::distinct_count(input, nulls_equal, cudf::get_default_stream());
+  return detail::distinct_count(input, nulls_equal, stream);
 }
 }  // namespace cudf
diff --git a/cpp/src/stream_compaction/drop_nans.cu b/cpp/src/stream_compaction/drop_nans.cu
index b46381c8ff6..b98ebbc2ecc 100644
--- a/cpp/src/stream_compaction/drop_nans.cu
+++ b/cpp/src/stream_compaction/drop_nans.cu
@@ -117,20 +117,22 @@ std::unique_ptr<table> drop_nans(table_view const& input,
 std::unique_ptr<table> drop_nans(table_view const& input,
                                  std::vector<size_type> const& keys,
                                  cudf::size_type keep_threshold,
+                                 rmm::cuda_stream_view stream,
                                  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::drop_nans(input, keys, keep_threshold, cudf::get_default_stream(), mr);
+  return detail::drop_nans(input, keys, keep_threshold, stream, mr);
 }
 /*
  * Filters a table to remove nan elements.
  */
 std::unique_ptr<table> drop_nans(table_view const& input,
                                  std::vector<size_type> const& keys,
+                                 rmm::cuda_stream_view stream,
                                  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::drop_nans(input, keys, keys.size(), cudf::get_default_stream(), mr);
+  return detail::drop_nans(input, keys, keys.size(), stream, mr);
 }
 
 }  // namespace cudf
diff --git a/cpp/src/stream_compaction/drop_nulls.cu b/cpp/src/stream_compaction/drop_nulls.cu
index cb7cd61bf02..2497e4e5065 100644
--- a/cpp/src/stream_compaction/drop_nulls.cu
+++ b/cpp/src/stream_compaction/drop_nulls.cu
@@ -90,20 +90,22 @@ std::unique_ptr<table> drop_nulls(table_view const& input,
 std::unique_ptr<table> drop_nulls(table_view const& input,
                                   std::vector<size_type> const& keys,
                                   cudf::size_type keep_threshold,
+                                  rmm::cuda_stream_view stream,
                                   rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::drop_nulls(input, keys, keep_threshold, cudf::get_default_stream(), mr);
+  return detail::drop_nulls(input, keys, keep_threshold, stream, mr);
 }
 /*
  * Filters a table to remove null elements.
  */
 std::unique_ptr<table> drop_nulls(table_view const& input,
                                   std::vector<size_type> const& keys,
+                                  rmm::cuda_stream_view stream,
                                   rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::drop_nulls(input, keys, keys.size(), cudf::get_default_stream(), mr);
+  return detail::drop_nulls(input, keys, keys.size(), stream, mr);
 }
 
 }  // namespace cudf
diff --git a/cpp/src/stream_compaction/unique.cu b/cpp/src/stream_compaction/unique.cu
index edb47984d13..93de0e60b6d 100644
--- a/cpp/src/stream_compaction/unique.cu
+++ b/cpp/src/stream_compaction/unique.cu
@@ -119,10 +119,11 @@ std::unique_ptr<table> unique(table_view const& input,
                               std::vector<size_type> const& keys,
                               duplicate_keep_option const keep,
                               null_equality nulls_equal,
+                              rmm::cuda_stream_view stream,
                               rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::unique(input, keys, keep, nulls_equal, cudf::get_default_stream(), mr);
+  return detail::unique(input, keys, keep, nulls_equal, stream, mr);
 }
 
 }  // namespace cudf
diff --git a/cpp/src/stream_compaction/unique_count.cu b/cpp/src/stream_compaction/unique_count.cu
index 19607fe8105..d842f63cd7b 100644
--- a/cpp/src/stream_compaction/unique_count.cu
+++ b/cpp/src/stream_compaction/unique_count.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -67,10 +67,12 @@ cudf::size_type unique_count(table_view const& keys,
 
 }  // namespace detail
 
-cudf::size_type unique_count(table_view const& input, null_equality nulls_equal)
+cudf::size_type unique_count(table_view const& input,
+                             null_equality nulls_equal,
+                             rmm::cuda_stream_view stream)
 {
   CUDF_FUNC_RANGE();
-  return detail::unique_count(input, nulls_equal, cudf::get_default_stream());
+  return detail::unique_count(input, nulls_equal, stream);
 }
 
 }  // namespace cudf
diff --git a/cpp/src/stream_compaction/unique_count_column.cu b/cpp/src/stream_compaction/unique_count_column.cu
index 16758b6e3a7..89ce2391a7b 100644
--- a/cpp/src/stream_compaction/unique_count_column.cu
+++ b/cpp/src/stream_compaction/unique_count_column.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -101,10 +101,11 @@ cudf::size_type unique_count(column_view const& input,
 
 cudf::size_type unique_count(column_view const& input,
                              null_policy null_handling,
-                             nan_policy nan_handling)
+                             nan_policy nan_handling,
+                             rmm::cuda_stream_view stream)
 {
   CUDF_FUNC_RANGE();
-  return detail::unique_count(input, null_handling, nan_handling, cudf::get_default_stream());
+  return detail::unique_count(input, null_handling, nan_handling, stream);
 }
 
 }  // namespace cudf
diff --git a/cpp/tests/streams/stream_compaction_test.cpp b/cpp/tests/streams/stream_compaction_test.cpp
index 56443870602..443f4548b2c 100644
--- a/cpp/tests/streams/stream_compaction_test.cpp
+++ b/cpp/tests/streams/stream_compaction_test.cpp
@@ -41,6 +41,7 @@ auto constexpr NULL_UNEQUAL = cudf::null_equality::UNEQUAL;
 auto constexpr NAN_EQUAL    = cudf::nan_equality::ALL_EQUAL;
 auto constexpr NAN_UNEQUAL  = cudf::nan_equality::UNEQUAL;
 
+using int16s_col = cudf::test::fixed_width_column_wrapper<int16_t>;
 using int32s_col = cudf::test::fixed_width_column_wrapper<int32_t>;
 using floats_col = cudf::test::fixed_width_column_wrapper<float>;
 
@@ -51,50 +52,9 @@ using cudf::test::iterators::no_nulls;
 using cudf::test::iterators::null_at;
 using cudf::test::iterators::nulls_at;
 
-struct StableDistinctKeepAny : public cudf::test::BaseFixture {};
+struct StreamCompactionTest : public cudf::test::BaseFixture {};
 
-struct StableDistinctKeepFirstLastNone : public cudf::test::BaseFixture {};
-
-TEST_F(StableDistinctKeepAny, NoNullsTableWithNaNs)
-{
-  // Column(s) used to test KEEP_ANY needs to have same rows in contiguous
-  // groups for equivalent keys because KEEP_ANY is nondeterministic.
-  auto const col1  = int32s_col{6, 6, 6, 1, 1, 1, 3, 5, 8, 5};
-  auto const col2  = floats_col{6, 6, 6, 1, 1, 1, 3, 4, 9, 4};
-  auto const keys1 = int32s_col{20, 20, 20, 15, 15, 15, 20, 19, 21, 9};
-  auto const keys2 = floats_col{19., 19., 19., NaN, NaN, NaN, 20., 20., 9., 21.};
-
-  auto const input   = cudf::table_view{{col1, col2, keys1, keys2}};
-  auto const key_idx = std::vector<cudf::size_type>{2, 3};
-
-  // NaNs are unequal.
-  {
-    auto const exp_col1  = int32s_col{6, 1, 1, 1, 3, 5, 8, 5};
-    auto const exp_col2  = floats_col{6, 1, 1, 1, 3, 4, 9, 4};
-    auto const exp_keys1 = int32s_col{20, 15, 15, 15, 20, 19, 21, 9};
-    auto const exp_keys2 = floats_col{19., NaN, NaN, NaN, 20., 20., 9., 21.};
-    auto const expected  = cudf::table_view{{exp_col1, exp_col2, exp_keys1, exp_keys2}};
-
-    auto const result = cudf::stable_distinct(
-      input, key_idx, KEEP_ANY, NULL_EQUAL, NAN_UNEQUAL, cudf::test::get_default_stream());
-    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result);
-  }
-
-  // NaNs are equal.
-  {
-    auto const exp_col1  = int32s_col{6, 1, 3, 5, 8, 5};
-    auto const exp_col2  = floats_col{6, 1, 3, 4, 9, 4};
-    auto const exp_keys1 = int32s_col{20, 15, 20, 19, 21, 9};
-    auto const exp_keys2 = floats_col{19., NaN, 20., 20., 9., 21.};
-    auto const expected  = cudf::table_view{{exp_col1, exp_col2, exp_keys1, exp_keys2}};
-
-    auto const result = cudf::stable_distinct(
-      input, key_idx, KEEP_ANY, NULL_EQUAL, NAN_EQUAL, cudf::test::get_default_stream());
-    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result);
-  }
-}
-
-TEST_F(StableDistinctKeepAny, InputWithNullsAndNaNs)
+TEST_F(StreamCompactionTest, StableDistinctKeepAny)
 {
   auto constexpr null{0.0};  // shadow the global `null` variable of type int
 
@@ -150,7 +110,7 @@ TEST_F(StableDistinctKeepAny, InputWithNullsAndNaNs)
   }
 }
 
-TEST_F(StableDistinctKeepFirstLastNone, InputWithNaNsEqual)
+TEST_F(StreamCompactionTest, StableDistinctKeepFirstLastNone)
 {
   // Column(s) used to test needs to have different rows for the same keys.
   auto const col     = int32s_col{0, 1, 2, 3, 4, 5, 6};
@@ -192,44 +152,313 @@ TEST_F(StableDistinctKeepFirstLastNone, InputWithNaNsEqual)
   }
 }
 
-TEST_F(StableDistinctKeepFirstLastNone, InputWithNaNsUnequal)
+TEST_F(StreamCompactionTest, DropNaNs)
 {
-  // Column(s) used to test needs to have different rows for the same keys.
-  auto const col     = int32s_col{0, 1, 2, 3, 4, 5, 6, 7};
-  auto const keys    = floats_col{20., NaN, NaN, 19., 21., 19., 22., 20.};
-  auto const input   = cudf::table_view{{col, keys}};
-  auto const key_idx = std::vector<cudf::size_type>{1};
+  auto const col1 = floats_col{{1., 2., NaN, NaN, 5., 6.}, nulls_at({2, 5})};
+  auto const col2 = int32s_col{{10, 40, 70, 5, 2, 10}, nulls_at({2, 5})};
+  auto const col3 = floats_col{{NaN, 40., 70., NaN, 2., 10.}, nulls_at({2, 5})};
+  cudf::table_view input{{col1, col2, col3}};
+
+  std::vector<cudf::size_type> keys{0, 2};
 
-  // KEEP_FIRST
   {
-    auto const exp_col  = int32s_col{0, 1, 2, 3, 4, 6};
-    auto const exp_keys = floats_col{20., NaN, NaN, 19., 21., 22.};
-    auto const expected = cudf::table_view{{exp_col, exp_keys}};
+    // With keep_threshold
+    auto const col1_expected = floats_col{{1., 2., 3., 5., 6.}, nulls_at({2, 4})};
+    auto const col2_expected = int32s_col{{10, 40, 70, 2, 10}, nulls_at({2, 4})};
+    auto const col3_expected = floats_col{{NaN, 40., 70., 2., 10.}, nulls_at({2, 4})};
+    cudf::table_view expected{{col1_expected, col2_expected, col3_expected}};
+
+    auto result = cudf::drop_nans(input, keys, keys.size() - 1, cudf::test::get_default_stream());
 
-    auto const result = cudf::stable_distinct(
-      input, key_idx, KEEP_FIRST, NULL_UNEQUAL, NAN_UNEQUAL, cudf::test::get_default_stream());
     CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result);
   }
 
-  // KEEP_LAST
   {
-    auto const exp_col  = int32s_col{1, 2, 4, 5, 6, 7};
-    auto const exp_keys = floats_col{NaN, NaN, 21., 19., 22., 20.};
-    auto const expected = cudf::table_view{{exp_col, exp_keys}};
+    // Without keep_threshold
+    auto const col1_expected = floats_col{{2., 3., 5., 6.}, nulls_at({1, 3})};
+    auto const col2_expected = int32s_col{{40, 70, 2, 10}, nulls_at({1, 3})};
+    auto const col3_expected = floats_col{{40., 70., 2., 10.}, nulls_at({1, 3})};
+    cudf::table_view expected{{col1_expected, col2_expected, col3_expected}};
+
+    auto result = cudf::drop_nans(input, keys, cudf::test::get_default_stream());
 
-    auto const result = cudf::stable_distinct(
-      input, key_idx, KEEP_LAST, NULL_UNEQUAL, NAN_UNEQUAL, cudf::test::get_default_stream());
     CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result);
   }
+}
+
+TEST_F(StreamCompactionTest, DropNulls)
+{
+  auto const col1 = int16s_col{{1, 0, 1, 0, 1, 0}, nulls_at({2, 5})};
+  auto const col2 = int32s_col{{10, 40, 70, 5, 2, 10}, nulls_at({2})};
+  auto const col3 = floats_col{{10., 40., 70., 5., 2., 10.}, no_nulls()};
+  cudf::table_view input{{col1, col2, col3}};
+  std::vector<cudf::size_type> keys{0, 1, 2};
 
-  // KEEP_NONE
   {
-    auto const exp_col  = int32s_col{1, 2, 4, 6};
-    auto const exp_keys = floats_col{NaN, NaN, 21., 22.};
-    auto const expected = cudf::table_view{{exp_col, exp_keys}};
+    // With keep_threshold
+    auto const col1_expected = int16s_col{{1, 0, 0, 1, 0}, null_at(4)};
+    auto const col2_expected = int32s_col{{10, 40, 5, 2, 10}, no_nulls()};
+    auto const col3_expected = floats_col{{10., 40., 5., 2., 10.}, no_nulls()};
+    cudf::table_view expected{{col1_expected, col2_expected, col3_expected}};
+
+    auto result = cudf::drop_nulls(input, keys, keys.size() - 1, cudf::test::get_default_stream());
+
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result);
+  }
+
+  {
+    // Without keep_threshold
+    auto const col1_expected = int16s_col{{1, 0, 0, 1}, no_nulls()};
+    auto const col2_expected = int32s_col{{10, 40, 5, 2}, no_nulls()};
+    auto const col3_expected = floats_col{{10., 40., 5., 2.}, no_nulls()};
+    cudf::table_view expected{{col1_expected, col2_expected, col3_expected}};
+
+    auto result = cudf::drop_nulls(input, keys, cudf::test::get_default_stream());
 
-    auto const result = cudf::stable_distinct(
-      input, key_idx, KEEP_NONE, NULL_UNEQUAL, NAN_UNEQUAL, cudf::test::get_default_stream());
     CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result);
   }
 }
+
+TEST_F(StreamCompactionTest, Unique)
+{
+  auto const col1     = int32s_col{5, 4, 3, 5, 8, 5};
+  auto const col2     = floats_col{4., 5., 3., 4., 9., 4.};
+  auto const col1_key = int32s_col{20, 20, 20, 19, 21, 9};
+  auto const col2_key = int32s_col{19, 19, 20, 20, 9, 21};
+
+  cudf::table_view input{{col1, col2, col1_key, col2_key}};
+  std::vector<cudf::size_type> keys = {2, 3};
+
+  {
+    // KEEP_FIRST
+    auto const exp_col1_first     = int32s_col{5, 3, 5, 8, 5};
+    auto const exp_col2_first     = floats_col{4., 3., 4., 9., 4.};
+    auto const exp_col1_key_first = int32s_col{20, 20, 19, 21, 9};
+    auto const exp_col2_key_first = int32s_col{19, 20, 20, 9, 21};
+    cudf::table_view expected_first{
+      {exp_col1_first, exp_col2_first, exp_col1_key_first, exp_col2_key_first}};
+
+    auto const result = cudf::unique(input,
+                                     keys,
+                                     cudf::duplicate_keep_option::KEEP_FIRST,
+                                     cudf::null_equality::EQUAL,
+                                     cudf::test::get_default_stream());
+
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected_first, *result);
+  }
+
+  {
+    // KEEP_LAST
+    auto const exp_col1_last     = int32s_col{4, 3, 5, 8, 5};
+    auto const exp_col2_last     = floats_col{5., 3., 4., 9., 4.};
+    auto const exp_col1_key_last = int32s_col{20, 20, 19, 21, 9};
+    auto const exp_col2_key_last = int32s_col{19, 20, 20, 9, 21};
+    cudf::table_view expected_last{
+      {exp_col1_last, exp_col2_last, exp_col1_key_last, exp_col2_key_last}};
+
+    auto const result = cudf::unique(input,
+                                     keys,
+                                     cudf::duplicate_keep_option::KEEP_LAST,
+                                     cudf::null_equality::EQUAL,
+                                     cudf::test::get_default_stream());
+
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected_last, *result);
+  }
+
+  {
+    // KEEP_NONE
+    auto const exp_col1_unique     = int32s_col{3, 5, 8, 5};
+    auto const exp_col2_unique     = floats_col{3., 4., 9., 4.};
+    auto const exp_col1_key_unique = int32s_col{20, 19, 21, 9};
+    auto const exp_col2_key_unique = int32s_col{20, 20, 9, 21};
+    cudf::table_view expected_unique{
+      {exp_col1_unique, exp_col2_unique, exp_col1_key_unique, exp_col2_key_unique}};
+
+    auto const result = cudf::unique(input,
+                                     keys,
+                                     cudf::duplicate_keep_option::KEEP_NONE,
+                                     cudf::null_equality::EQUAL,
+                                     cudf::test::get_default_stream());
+
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected_unique, *result);
+  }
+}
+
+TEST_F(StreamCompactionTest, Distinct)
+{
+  // Column(s) used to test needs to have different rows for the same keys.
+  auto const col1  = int32s_col{0, 1, 2, 3, 4, 5, 6};
+  auto const col2  = floats_col{10, 11, 12, 13, 14, 15, 16};
+  auto const keys1 = int32s_col{20, 20, 20, 20, 19, 21, 9};
+  auto const keys2 = int32s_col{19, 19, 19, 20, 20, 9, 21};
+
+  auto const input   = cudf::table_view{{col1, col2, keys1, keys2}};
+  auto const key_idx = std::vector<cudf::size_type>{2, 3};
+
+  // KEEP_FIRST
+  {
+    auto const exp_col1_sort  = int32s_col{6, 4, 0, 3, 5};
+    auto const exp_col2_sort  = floats_col{16, 14, 10, 13, 15};
+    auto const exp_keys1_sort = int32s_col{9, 19, 20, 20, 21};
+    auto const exp_keys2_sort = int32s_col{21, 20, 19, 20, 9};
+    auto const expected_sort =
+      cudf::table_view{{exp_col1_sort, exp_col2_sort, exp_keys1_sort, exp_keys2_sort}};
+
+    auto const result = cudf::distinct(input,
+                                       key_idx,
+                                       cudf::duplicate_keep_option::KEEP_FIRST,
+                                       cudf::null_equality::EQUAL,
+                                       cudf::nan_equality::ALL_EQUAL,
+                                       cudf::test::get_default_stream());
+    auto const result_sort =
+      cudf::sort_by_key(*result, result->select(key_idx), {}, {}, cudf::test::get_default_stream());
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected_sort, *result_sort);
+  }
+
+  // KEEP_LAST
+  {
+    auto const exp_col1_sort  = int32s_col{6, 4, 2, 3, 5};
+    auto const exp_col2_sort  = floats_col{16, 14, 12, 13, 15};
+    auto const exp_keys1_sort = int32s_col{9, 19, 20, 20, 21};
+    auto const exp_keys2_sort = int32s_col{21, 20, 19, 20, 9};
+    auto const expected_sort =
+      cudf::table_view{{exp_col1_sort, exp_col2_sort, exp_keys1_sort, exp_keys2_sort}};
+
+    auto const result = cudf::distinct(input,
+                                       key_idx,
+                                       cudf::duplicate_keep_option::KEEP_LAST,
+                                       cudf::null_equality::EQUAL,
+                                       cudf::nan_equality::ALL_EQUAL,
+                                       cudf::test::get_default_stream());
+    auto const result_sort =
+      cudf::sort_by_key(*result, result->select(key_idx), {}, {}, cudf::test::get_default_stream());
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected_sort, *result_sort);
+  }
+
+  // KEEP_NONE
+  {
+    auto const exp_col1_sort  = int32s_col{6, 4, 3, 5};
+    auto const exp_col2_sort  = floats_col{16, 14, 13, 15};
+    auto const exp_keys1_sort = int32s_col{9, 19, 20, 21};
+    auto const exp_keys2_sort = int32s_col{21, 20, 20, 9};
+    auto const expected_sort =
+      cudf::table_view{{exp_col1_sort, exp_col2_sort, exp_keys1_sort, exp_keys2_sort}};
+
+    auto const result = cudf::distinct(input,
+                                       key_idx,
+                                       cudf::duplicate_keep_option::KEEP_NONE,
+                                       cudf::null_equality::EQUAL,
+                                       cudf::nan_equality::ALL_EQUAL,
+                                       cudf::test::get_default_stream());
+    auto const result_sort =
+      cudf::sort_by_key(*result, result->select(key_idx), {}, {}, cudf::test::get_default_stream());
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected_sort, *result_sort);
+  }
+}
+
+TEST_F(StreamCompactionTest, ApplyBooleanMask)
+{
+  auto const col = int32s_col{
+    9668, 9590, 9526, 9205, 9434, 9347, 9160, 9569, 9143, 9807, 9606, 9446, 9279, 9822, 9691};
+  cudf::test::fixed_width_column_wrapper<bool> mask({false,
+                                                     false,
+                                                     true,
+                                                     false,
+                                                     false,
+                                                     true,
+                                                     false,
+                                                     true,
+                                                     false,
+                                                     true,
+                                                     false,
+                                                     false,
+                                                     true,
+                                                     false,
+                                                     true});
+  cudf::table_view input({col});
+  auto const col_expected = int32s_col{9526, 9347, 9569, 9807, 9279, 9691};
+  cudf::table_view expected({col_expected});
+  auto const result = cudf::apply_boolean_mask(input, mask, cudf::test::get_default_stream());
+  CUDF_TEST_EXPECT_TABLES_EQUAL(expected, *result);
+}
+
+TEST_F(StreamCompactionTest, UniqueCountColumn)
+{
+  std::vector<int32_t> const input = {1, 3,  3,  4,  31, 1, 8,  2, 0, 4, 1,
+                                      4, 10, 40, 31, 42, 0, 42, 8, 5, 4};
+
+  cudf::test::fixed_width_column_wrapper<int32_t> input_col(input.begin(), input.end());
+  std::vector<double> input_data(input.begin(), input.end());
+
+  auto const new_end  = std::unique(input_data.begin(), input_data.end());
+  auto const expected = std::distance(input_data.begin(), new_end);
+  EXPECT_EQ(
+    expected,
+    cudf::unique_count(
+      input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID, cudf::test::get_default_stream()));
+}
+
+TEST_F(StreamCompactionTest, UniqueCountTable)
+{
+  std::vector<int32_t> const input1 = {1, 3, 3,  3,  4,  31, 1, 8,  2, 0, 4,
+                                       1, 4, 10, 40, 31, 42, 0, 42, 8, 5, 4};
+  std::vector<int32_t> const input2 = {3, 3,  3,  4,  31, 1, 8,  5, 0, 4, 1,
+                                       4, 10, 40, 31, 42, 0, 42, 8, 5, 4, 1};
+
+  std::vector<std::pair<int32_t, int32_t>> pair_input;
+  std::transform(input1.begin(),
+                 input1.end(),
+                 input2.begin(),
+                 std::back_inserter(pair_input),
+                 [](int32_t a, int32_t b) { return std::pair(a, b); });
+
+  cudf::test::fixed_width_column_wrapper<int32_t> input_col1(input1.begin(), input1.end());
+  cudf::test::fixed_width_column_wrapper<int32_t> input_col2(input2.begin(), input2.end());
+  cudf::table_view input_table({input_col1, input_col2});
+
+  auto const new_end = std::unique(pair_input.begin(), pair_input.end());
+  auto const result  = std::distance(pair_input.begin(), new_end);
+  EXPECT_EQ(
+    result,
+    cudf::unique_count(input_table, null_equality::EQUAL, cudf::test::get_default_stream()));
+}
+
+TEST_F(StreamCompactionTest, DistinctCountColumn)
+{
+  std::vector<int32_t> const input = {1, 3,  3,  4,  31, 1, 8,  2, 0, 4, 1,
+                                      4, 10, 40, 31, 42, 0, 42, 8, 5, 4};
+
+  cudf::test::fixed_width_column_wrapper<int32_t> input_col(input.begin(), input.end());
+
+  auto const expected =
+    static_cast<cudf::size_type>(std::set<double>(input.begin(), input.end()).size());
+  EXPECT_EQ(
+    expected,
+    cudf::distinct_count(
+      input_col, null_policy::INCLUDE, nan_policy::NAN_IS_VALID, cudf::test::get_default_stream()));
+}
+
+TEST_F(StreamCompactionTest, DistinctCountTable)
+{
+  std::vector<int32_t> const input1 = {1, 3, 3,  3,  4,  31, 1, 8,  2, 0, 4,
+                                       1, 4, 10, 40, 31, 42, 0, 42, 8, 5, 4};
+  std::vector<int32_t> const input2 = {3, 3,  3,  4,  31, 1, 8,  5, 0, 4, 1,
+                                       4, 10, 40, 31, 42, 0, 42, 8, 5, 4, 1};
+
+  std::vector<std::pair<int32_t, int32_t>> pair_input;
+  std::transform(input1.begin(),
+                 input1.end(),
+                 input2.begin(),
+                 std::back_inserter(pair_input),
+                 [](int32_t a, int32_t b) { return std::pair(a, b); });
+
+  cudf::test::fixed_width_column_wrapper<int32_t> input_col1(input1.begin(), input1.end());
+  cudf::test::fixed_width_column_wrapper<int32_t> input_col2(input2.begin(), input2.end());
+  cudf::table_view input_table({input_col1, input_col2});
+
+  auto const expected = static_cast<cudf::size_type>(
+    std::set<std::pair<int32_t, int32_t>>(pair_input.begin(), pair_input.end()).size());
+  EXPECT_EQ(
+    expected,
+    cudf::distinct_count(input_table, null_equality::EQUAL, cudf::test::get_default_stream()));
+}
diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
index a9ace1398e4..76ca8c533ce 100644
--- a/java/src/main/native/src/TableJni.cpp
+++ b/java/src/main/native/src/TableJni.cpp
@@ -3919,6 +3919,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_dropDuplicates(
                      keep_option,
                      nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL,
                      cudf::nan_equality::ALL_EQUAL,
+                     cudf::get_default_stream(),
                      rmm::mr::get_current_device_resource());
     return convert_table_for_return(env, result);
   }

From b933b54858a84082980f20522738fda4969a1318 Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Wed, 7 Aug 2024 20:07:42 -0500
Subject: [PATCH 635/842] Use tool.scikit-build.cmake.version, set
 scikit-build-core minimum-version (#16503)

Contributes to https://github.com/rapidsai/build-planning/issues/58.

`scikit-build-core==0.10.0` was released today (https://github.com/scikit-build/scikit-build-core/releases/tag/v0.10.0), and wheel-building configurations across RAPIDS are incompatible with it.

This proposes upgrading to that version and fixing configuration here in a way that:

* is compatible with that new `scikit-build-core` version
* takes advantage of the forward-compatibility mechanism (`minimum-version`) that `scikit-build-core` provides, to reduce the risk of needing to do this again in the future

Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - https://github.com/jakirkham

URL: https://github.com/rapidsai/cudf/pull/16503
---
 conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +-
 conda/environments/all_cuda-125_arch-x86_64.yaml | 2 +-
 conda/recipes/cudf/meta.yaml                     | 2 +-
 conda/recipes/cudf_kafka/meta.yaml               | 2 +-
 dependencies.yaml                                | 4 ++--
 python/cudf/pyproject.toml                       | 5 +++--
 python/cudf_kafka/pyproject.toml                 | 5 +++--
 7 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index d04804cafaf..8d5fc2e31d9 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -82,7 +82,7 @@ dependencies:
 - rich
 - rmm==24.10.*,>=0.0.0a0
 - s3fs>=2022.3.0
-- scikit-build-core>=0.7.0
+- scikit-build-core>=0.10.0
 - scipy
 - spdlog>=1.12.0,<1.13
 - sphinx
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index e2c3558030d..7b0485d7f29 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -80,7 +80,7 @@ dependencies:
 - rich
 - rmm==24.10.*,>=0.0.0a0
 - s3fs>=2022.3.0
-- scikit-build-core>=0.7.0
+- scikit-build-core>=0.10.0
 - scipy
 - spdlog>=1.12.0,<1.13
 - sphinx
diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml
index 9137f099ad1..8d7ef63715b 100644
--- a/conda/recipes/cudf/meta.yaml
+++ b/conda/recipes/cudf/meta.yaml
@@ -62,7 +62,7 @@ requirements:
     - python
     - cython >=3.0.3
     - rapids-build-backend >=0.3.0,<0.4.0.dev0
-    - scikit-build-core >=0.7.0
+    - scikit-build-core >=0.10.0
     - dlpack >=0.8,<1.0
     # TODO: Change to `2.0` for NumPy 2
     - numpy 1.23
diff --git a/conda/recipes/cudf_kafka/meta.yaml b/conda/recipes/cudf_kafka/meta.yaml
index 1b0e0e2c236..748a32e5518 100644
--- a/conda/recipes/cudf_kafka/meta.yaml
+++ b/conda/recipes/cudf_kafka/meta.yaml
@@ -61,7 +61,7 @@ requirements:
     - cudf ={{ version }}
     - libcudf_kafka ={{ version }}
     - rapids-build-backend >=0.3.0,<0.4.0.dev0
-    - scikit-build-core >=0.7.0
+    - scikit-build-core >=0.10.0
     {% if cuda_major != "11" %}
     - cuda-cudart-dev
     {% endif %}
diff --git a/dependencies.yaml b/dependencies.yaml
index abb55a5e011..b0d62a9fb0d 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -301,10 +301,10 @@ dependencies:
           - &rapids_build_backend rapids-build-backend>=0.3.0,<0.4.0.dev0
       - output_types: conda
         packages:
-          - scikit-build-core>=0.7.0
+          - scikit-build-core>=0.10.0
       - output_types: [requirements, pyproject]
         packages:
-          - scikit-build-core[pyproject]>=0.7.0
+          - scikit-build-core[pyproject]>=0.10.0
   rapids_build_setuptools:
     common:
       - output_types: [requirements, pyproject]
diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml
index b2ddb06d8c9..60ac171f3d7 100644
--- a/python/cudf/pyproject.toml
+++ b/python/cudf/pyproject.toml
@@ -4,7 +4,7 @@
 build-backend = "rapids_build_backend.build"
 requires = [
     "rapids-build-backend>=0.3.0,<0.4.0.dev0",
-    "scikit-build-core[pyproject]>=0.7.0",
+    "scikit-build-core[pyproject]>=0.10.0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 
 [project]
@@ -133,7 +133,8 @@ requires = [
 [tool.scikit-build]
 build-dir = "build/{wheel_tag}"
 cmake.build-type = "Release"
-cmake.minimum-version = "3.26.4"
+cmake.version = "CMakeLists.txt"
+minimum-version = "build-system.requires"
 ninja.make-fallback = true
 sdist.exclude = ["*tests*"]
 sdist.reproducible = true
diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml
index a9b60133f42..63c5b07c5f3 100644
--- a/python/cudf_kafka/pyproject.toml
+++ b/python/cudf_kafka/pyproject.toml
@@ -4,7 +4,7 @@
 build-backend = "rapids_build_backend.build"
 requires = [
     "rapids-build-backend>=0.3.0,<0.4.0.dev0",
-    "scikit-build-core[pyproject]>=0.7.0",
+    "scikit-build-core[pyproject]>=0.10.0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 
 [project]
@@ -86,7 +86,8 @@ filterwarnings = [
 [tool.scikit-build]
 build-dir = "build/{wheel_tag}"
 cmake.build-type = "Release"
-cmake.minimum-version = "3.26.4"
+cmake.version = "CMakeLists.txt"
+minimum-version = "build-system.requires"
 ninja.make-fallback = true
 sdist.exclude = ["*tests*"]
 sdist.reproducible = true

From c146eed6f36e7c82052a3288e1bf6ab8c2216637 Mon Sep 17 00:00:00 2001
From: Jayjeet Chakraborty <jc.github@rediffmail.com>
Date: Wed, 7 Aug 2024 22:19:46 -0700
Subject: [PATCH 636/842] Expose `stream` param in transform APIs (#16452)

Exposes the `stream` param in transform APIs

Authors:
  - Jayjeet Chakraborty (https://github.com/JayjeetAtGithub)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Karthikeyan (https://github.com/karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/16452
---
 cpp/include/cudf/transform.hpp       |  21 +++-
 cpp/src/interop/to_arrow.cu          |   2 +-
 cpp/src/interop/to_arrow_device.cu   |   4 +-
 cpp/src/interop/to_arrow_host.cu     |   2 +-
 cpp/src/transform/bools_to_mask.cu   |   4 +-
 cpp/src/transform/compute_column.cu  |   3 +-
 cpp/src/transform/encode.cu          |   4 +-
 cpp/src/transform/mask_to_bools.cu   |   3 +-
 cpp/src/transform/nans_to_nulls.cu   |   4 +-
 cpp/src/transform/one_hot_encode.cu  |   3 +-
 cpp/src/transform/row_bit_count.cu   |  11 +-
 cpp/src/transform/transform.cpp      |   3 +-
 cpp/tests/CMakeLists.txt             |   1 +
 cpp/tests/streams/transform_test.cpp | 164 +++++++++++++++++++++++++++
 14 files changed, 210 insertions(+), 19 deletions(-)
 create mode 100644 cpp/tests/streams/transform_test.cpp

diff --git a/cpp/include/cudf/transform.hpp b/cpp/include/cudf/transform.hpp
index adc5bdb2af8..f16214260f7 100644
--- a/cpp/include/cudf/transform.hpp
+++ b/cpp/include/cudf/transform.hpp
@@ -47,6 +47,7 @@ namespace CUDF_EXPORT cudf {
  * @param unary_udf     The PTX/CUDA string of the unary function to apply
  * @param output_type   The output type that is compatible with the output type in the UDF
  * @param is_ptx        true: the UDF is treated as PTX code; false: the UDF is treated as CUDA code
+ * @param stream        CUDA stream used for device memory operations and kernel launches
  * @param mr            Device memory resource used to allocate the returned column's device memory
  * @return              The column resulting from applying the unary function to
  *                      every element of the input
@@ -56,6 +57,7 @@ std::unique_ptr<column> transform(
   std::string const& unary_udf,
   data_type output_type,
   bool is_ptx,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -65,12 +67,14 @@ std::unique_ptr<column> transform(
  * @throws cudf::logic_error if `input.type()` is a non-floating type
  *
  * @param input         An immutable view of the input column of floating-point type
+ * @param stream        CUDA stream used for device memory operations and kernel launches
  * @param mr            Device memory resource used to allocate the returned bitmask
  * @return A pair containing a `device_buffer` with the new bitmask and it's
  * null count obtained by replacing `NaN` in `input` with null.
  */
 std::pair<std::unique_ptr<rmm::device_buffer>, size_type> nans_to_nulls(
   column_view const& input,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -83,12 +87,14 @@ std::pair<std::unique_ptr<rmm::device_buffer>, size_type> nans_to_nulls(
  *
  * @param table The table used for expression evaluation
  * @param expr The root of the expression tree
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource
  * @return Output column
  */
 std::unique_ptr<column> compute_column(
   table_view const& table,
   ast::expression const& expr,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -101,6 +107,7 @@ std::unique_ptr<column> compute_column(
  * @throws cudf::logic_error if `input.type()` is a non-boolean type
  *
  * @param input        Boolean elements to convert to a bitmask
+ * @param stream       CUDA stream used for device memory operations and kernel launches
  * @param mr           Device memory resource used to allocate the returned bitmask
  * @return A pair containing a `device_buffer` with the new bitmask and it's
  * null count obtained from input considering `true` represent `valid`/`1` and
@@ -108,6 +115,7 @@ std::unique_ptr<column> compute_column(
  */
 std::pair<std::unique_ptr<rmm::device_buffer>, cudf::size_type> bools_to_mask(
   column_view const& input,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -130,12 +138,14 @@ std::pair<std::unique_ptr<rmm::device_buffer>, cudf::size_type> bools_to_mask(
  * @endcode
  *
  * @param input Table containing values to be encoded
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned table's device memory
  * @return A pair containing the distinct row of the input table in sorter order,
  * and a column of integer indices representing the encoded rows.
  */
 std::pair<std::unique_ptr<cudf::table>, std::unique_ptr<cudf::column>> encode(
   cudf::table_view const& input,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -162,12 +172,14 @@ std::pair<std::unique_ptr<cudf::table>, std::unique_ptr<cudf::column>> encode(
  *
  * @param input Column containing values to be encoded
  * @param categories Column containing categories
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned table's device memory
  * @return A pair containing the owner to all encoded data and a table view into the data
  */
 std::pair<std::unique_ptr<column>, table_view> one_hot_encode(
   column_view const& input,
   column_view const& categories,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -188,6 +200,7 @@ std::pair<std::unique_ptr<column>, table_view> one_hot_encode(
  * @param bitmask A device pointer to the bitmask which needs to be converted
  * @param begin_bit position of the bit from which the conversion should start
  * @param end_bit position of the bit before which the conversion should stop
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned columns' device memory
  * @return A boolean column representing the given mask from [begin_bit, end_bit)
  */
@@ -195,6 +208,7 @@ std::unique_ptr<column> mask_to_bools(
   bitmask_type const* bitmask,
   size_type begin_bit,
   size_type end_bit,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -219,11 +233,14 @@ std::unique_ptr<column> mask_to_bools(
  * row_bit_count(column(x)) >= row_bit_count(gather(column(x)))
  *
  * @param t The table view to perform the computation on
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned columns' device memory
  * @return A 32-bit integer column containing the per-row bit counts
  */
 std::unique_ptr<column> row_bit_count(
-  table_view const& t, rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  table_view const& t,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Returns an approximate cumulative size in bits of all columns in the `table_view` for
@@ -240,12 +257,14 @@ std::unique_ptr<column> row_bit_count(
  *
  * @param t The table view to perform the computation on
  * @param segment_length The number of rows in each segment for which the total size is computed
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned columns' device memory
  * @return A 32-bit integer column containing the bit counts for each segment of rows
  */
 std::unique_ptr<column> segmented_row_bit_count(
   table_view const& t,
   size_type segment_length,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu
index 3d41f856f4f..a867d4adfa1 100644
--- a/cpp/src/interop/to_arrow.cu
+++ b/cpp/src/interop/to_arrow.cu
@@ -247,7 +247,7 @@ std::shared_ptr<arrow::Array> dispatch_to_arrow::operator()<bool>(column_view in
                                                                   arrow::MemoryPool* ar_mr,
                                                                   rmm::cuda_stream_view stream)
 {
-  auto bitmask = bools_to_mask(input, stream, rmm::mr::get_current_device_resource());
+  auto bitmask = detail::bools_to_mask(input, stream, rmm::mr::get_current_device_resource());
 
   auto data_buffer = allocate_arrow_buffer(static_cast<int64_t>(bitmask.first->size()), ar_mr);
 
diff --git a/cpp/src/interop/to_arrow_device.cu b/cpp/src/interop/to_arrow_device.cu
index cea7cdebcba..a5f3f9d87f5 100644
--- a/cpp/src/interop/to_arrow_device.cu
+++ b/cpp/src/interop/to_arrow_device.cu
@@ -200,7 +200,7 @@ int dispatch_to_arrow_device::operator()<bool>(cudf::column&& column,
   nanoarrow::UniqueArray tmp;
   NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_BOOL, column));
 
-  auto bitmask  = bools_to_mask(column.view(), stream, mr);
+  auto bitmask  = detail::bools_to_mask(column.view(), stream, mr);
   auto contents = column.release();
   NANOARROW_RETURN_NOT_OK(set_null_mask(contents, tmp.get()));
   NANOARROW_RETURN_NOT_OK(
@@ -442,7 +442,7 @@ int dispatch_to_arrow_device_view::operator()<bool>(ArrowArray* out) const
   nanoarrow::UniqueArray tmp;
   NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_BOOL, column));
 
-  auto bitmask = bools_to_mask(column, stream, mr);
+  auto bitmask = detail::bools_to_mask(column, stream, mr);
   NANOARROW_RETURN_NOT_OK(
     set_buffer(std::move(bitmask.first), fixed_width_data_buffer_idx, tmp.get()));
   NANOARROW_RETURN_NOT_OK(set_null_mask(column, tmp.get()));
diff --git a/cpp/src/interop/to_arrow_host.cu b/cpp/src/interop/to_arrow_host.cu
index 193b3a3b5a2..26f7c7e6e53 100644
--- a/cpp/src/interop/to_arrow_host.cu
+++ b/cpp/src/interop/to_arrow_host.cu
@@ -147,7 +147,7 @@ int dispatch_to_arrow_host::operator()<bool>(ArrowArray* out) const
   NANOARROW_RETURN_NOT_OK(initialize_array(tmp.get(), NANOARROW_TYPE_BOOL, column));
 
   NANOARROW_RETURN_NOT_OK(populate_validity_bitmap(ArrowArrayValidityBitmap(tmp.get())));
-  auto bitmask = bools_to_mask(column, stream, mr);
+  auto bitmask = detail::bools_to_mask(column, stream, mr);
   NANOARROW_RETURN_NOT_OK(populate_data_buffer(
     device_span<uint8_t const>(reinterpret_cast<const uint8_t*>(bitmask.first->data()),
                                bitmask.first->size()),
diff --git a/cpp/src/transform/bools_to_mask.cu b/cpp/src/transform/bools_to_mask.cu
index c12f65deb46..452aebf4428 100644
--- a/cpp/src/transform/bools_to_mask.cu
+++ b/cpp/src/transform/bools_to_mask.cu
@@ -59,10 +59,10 @@ std::pair<std::unique_ptr<rmm::device_buffer>, cudf::size_type> bools_to_mask(
 }  // namespace detail
 
 std::pair<std::unique_ptr<rmm::device_buffer>, cudf::size_type> bools_to_mask(
-  column_view const& input, rmm::device_async_resource_ref mr)
+  column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::bools_to_mask(input, cudf::get_default_stream(), mr);
+  return detail::bools_to_mask(input, stream, mr);
 }
 
 }  // namespace cudf
diff --git a/cpp/src/transform/compute_column.cu b/cpp/src/transform/compute_column.cu
index 7960731f3a1..c4fc8d58552 100644
--- a/cpp/src/transform/compute_column.cu
+++ b/cpp/src/transform/compute_column.cu
@@ -138,10 +138,11 @@ std::unique_ptr<column> compute_column(table_view const& table,
 
 std::unique_ptr<column> compute_column(table_view const& table,
                                        ast::expression const& expr,
+                                       rmm::cuda_stream_view stream,
                                        rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::compute_column(table, expr, cudf::get_default_stream(), mr);
+  return detail::compute_column(table, expr, stream, mr);
 }
 
 }  // namespace cudf
diff --git a/cpp/src/transform/encode.cu b/cpp/src/transform/encode.cu
index 7a044b9f6f7..1c9d52bce1b 100644
--- a/cpp/src/transform/encode.cu
+++ b/cpp/src/transform/encode.cu
@@ -72,10 +72,10 @@ std::pair<std::unique_ptr<table>, std::unique_ptr<column>> encode(table_view con
 }  // namespace detail
 
 std::pair<std::unique_ptr<cudf::table>, std::unique_ptr<cudf::column>> encode(
-  cudf::table_view const& input, rmm::device_async_resource_ref mr)
+  cudf::table_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::encode(input, cudf::get_default_stream(), mr);
+  return detail::encode(input, stream, mr);
 }
 
 }  // namespace cudf
diff --git a/cpp/src/transform/mask_to_bools.cu b/cpp/src/transform/mask_to_bools.cu
index adf5db02d9c..be0b80a2633 100644
--- a/cpp/src/transform/mask_to_bools.cu
+++ b/cpp/src/transform/mask_to_bools.cu
@@ -62,9 +62,10 @@ std::unique_ptr<column> mask_to_bools(bitmask_type const* bitmask,
 std::unique_ptr<column> mask_to_bools(bitmask_type const* bitmask,
                                       size_type begin_bit,
                                       size_type end_bit,
+                                      rmm::cuda_stream_view stream,
                                       rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::mask_to_bools(bitmask, begin_bit, end_bit, cudf::get_default_stream(), mr);
+  return detail::mask_to_bools(bitmask, begin_bit, end_bit, stream, mr);
 }
 }  // namespace cudf
diff --git a/cpp/src/transform/nans_to_nulls.cu b/cpp/src/transform/nans_to_nulls.cu
index fd4f33c594c..a24ba304004 100644
--- a/cpp/src/transform/nans_to_nulls.cu
+++ b/cpp/src/transform/nans_to_nulls.cu
@@ -93,10 +93,10 @@ std::pair<std::unique_ptr<rmm::device_buffer>, cudf::size_type> nans_to_nulls(
 }  // namespace detail
 
 std::pair<std::unique_ptr<rmm::device_buffer>, cudf::size_type> nans_to_nulls(
-  column_view const& input, rmm::device_async_resource_ref mr)
+  column_view const& input, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::nans_to_nulls(input, cudf::get_default_stream(), mr);
+  return detail::nans_to_nulls(input, stream, mr);
 }
 
 }  // namespace cudf
diff --git a/cpp/src/transform/one_hot_encode.cu b/cpp/src/transform/one_hot_encode.cu
index 808f2d1b284..46e6e55b0b7 100644
--- a/cpp/src/transform/one_hot_encode.cu
+++ b/cpp/src/transform/one_hot_encode.cu
@@ -115,9 +115,10 @@ std::pair<std::unique_ptr<column>, table_view> one_hot_encode(column_view const&
 
 std::pair<std::unique_ptr<column>, table_view> one_hot_encode(column_view const& input,
                                                               column_view const& categories,
+                                                              rmm::cuda_stream_view stream,
                                                               rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::one_hot_encode(input, categories, cudf::get_default_stream(), mr);
+  return detail::one_hot_encode(input, categories, stream, mr);
 }
 }  // namespace cudf
diff --git a/cpp/src/transform/row_bit_count.cu b/cpp/src/transform/row_bit_count.cu
index 12a15eb7e34..4530fabf889 100644
--- a/cpp/src/transform/row_bit_count.cu
+++ b/cpp/src/transform/row_bit_count.cu
@@ -561,23 +561,26 @@ std::unique_ptr<column> row_bit_count(table_view const& t,
                                       rmm::cuda_stream_view stream,
                                       rmm::device_async_resource_ref mr)
 {
-  return segmented_row_bit_count(t, 1, stream, mr);
+  return detail::segmented_row_bit_count(t, 1, stream, mr);
 }
 
 }  // namespace detail
 
 std::unique_ptr<column> segmented_row_bit_count(table_view const& t,
                                                 size_type segment_length,
+                                                rmm::cuda_stream_view stream,
                                                 rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::segmented_row_bit_count(t, segment_length, cudf::get_default_stream(), mr);
+  return detail::segmented_row_bit_count(t, segment_length, stream, mr);
 }
 
-std::unique_ptr<column> row_bit_count(table_view const& t, rmm::device_async_resource_ref mr)
+std::unique_ptr<column> row_bit_count(table_view const& t,
+                                      rmm::cuda_stream_view stream,
+                                      rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::row_bit_count(t, cudf::get_default_stream(), mr);
+  return detail::row_bit_count(t, stream, mr);
 }
 
 }  // namespace cudf
diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp
index 98ec44758b9..f5e9048fa0a 100644
--- a/cpp/src/transform/transform.cpp
+++ b/cpp/src/transform/transform.cpp
@@ -97,10 +97,11 @@ std::unique_ptr<column> transform(column_view const& input,
                                   std::string const& unary_udf,
                                   data_type output_type,
                                   bool is_ptx,
+                                  rmm::cuda_stream_view stream,
                                   rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::transform(input, unary_udf, output_type, is_ptx, cudf::get_default_stream(), mr);
+  return detail::transform(input, unary_udf, output_type, is_ptx, stream, mr);
 }
 
 }  // namespace cudf
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 5e85b3e8adf..8c4b0f1e367 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -736,6 +736,7 @@ ConfigureTest(
   STREAM_MODE
   testing
 )
+ConfigureTest(STREAM_TRANSFORM_TEST streams/transform_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_UNARY_TEST streams/unary_test.cpp STREAM_MODE testing)
 
 # ##################################################################################################
diff --git a/cpp/tests/streams/transform_test.cpp b/cpp/tests/streams/transform_test.cpp
new file mode 100644
index 00000000000..9187672221c
--- /dev/null
+++ b/cpp/tests/streams/transform_test.cpp
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_utilities.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/default_stream.hpp>
+#include <cudf_test/table_utilities.hpp>
+#include <cudf_test/type_lists.hpp>
+
+#include <cudf/ast/expressions.hpp>
+#include <cudf/column/column.hpp>
+#include <cudf/column/column_factories.hpp>
+#include <cudf/column/column_view.hpp>
+#include <cudf/detail/utilities/integer_utils.hpp>
+#include <cudf/transform.hpp>
+#include <cudf/types.hpp>
+
+class TransformTest : public cudf::test::BaseFixture {};
+
+template <class dtype, class Data>
+void test_udf(char const udf[], Data data_init, cudf::size_type size, bool is_ptx)
+{
+  auto all_valid = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return true; });
+  auto data_iter = cudf::detail::make_counting_transform_iterator(0, data_init);
+  cudf::test::fixed_width_column_wrapper<dtype, typename decltype(data_iter)::value_type> in(
+    data_iter, data_iter + size, all_valid);
+  cudf::transform(
+    in, udf, cudf::data_type(cudf::type_to_id<dtype>()), is_ptx, cudf::test::get_default_stream());
+}
+
+TEST_F(TransformTest, Transform)
+{
+  char const* cuda =
+    R"***(
+__device__ inline void    fdsf   (
+       float* C,
+       float a
+)
+{
+  *C = a*a*a*a;
+}
+)***";
+
+  char const* ptx =
+    R"***(
+//
+// Generated by NVIDIA NVVM Compiler
+//
+// Compiler Build ID: CL-24817639
+// Cuda compilation tools, release 10.0, V10.0.130
+// Based on LLVM 3.4svn
+//
+
+.version 6.3
+.target sm_70
+.address_size 64
+
+	// .globl	_ZN8__main__7add$241Ef
+.common .global .align 8 .u64 _ZN08NumbaEnv8__main__7add$241Ef;
+.common .global .align 8 .u64 _ZN08NumbaEnv5numba7targets7numbers14int_power_impl12$3clocals$3e13int_power$242Efx;
+
+.visible .func  (.param .b32 func_retval0) _ZN8__main__7add$241Ef(
+	.param .b64 _ZN8__main__7add$241Ef_param_0,
+	.param .b32 _ZN8__main__7add$241Ef_param_1
+)
+{
+	.reg .f32 	%f<4>;
+	.reg .b32 	%r<2>;
+	.reg .b64 	%rd<2>;
+
+
+	ld.param.u64 	%rd1, [_ZN8__main__7add$241Ef_param_0];
+	ld.param.f32 	%f1, [_ZN8__main__7add$241Ef_param_1];
+	mul.f32 	%f2, %f1, %f1;
+	mul.f32 	%f3, %f2, %f2;
+	st.f32 	[%rd1], %f3;
+	mov.u32 	%r1, 0;
+	st.param.b32	[func_retval0+0], %r1;
+	ret;
+}
+)***";
+
+  auto data_init = [](cudf::size_type row) { return row % 3; };
+  test_udf<float>(cuda, data_init, 500, false);
+  test_udf<float>(ptx, data_init, 500, true);
+}
+
+TEST_F(TransformTest, ComputeColumn)
+{
+  auto c_0        = cudf::test::fixed_width_column_wrapper<cudf::size_type>{3, 20, 1, 50};
+  auto c_1        = cudf::test::fixed_width_column_wrapper<cudf::size_type>{10, 7, 20, 0};
+  auto table      = cudf::table_view{{c_0, c_1}};
+  auto col_ref_0  = cudf::ast::column_reference(0);
+  auto col_ref_1  = cudf::ast::column_reference(1);
+  auto expression = cudf::ast::operation(cudf::ast::ast_operator::ADD, col_ref_0, col_ref_1);
+  cudf::compute_column(table, expression, cudf::test::get_default_stream());
+}
+
+TEST_F(TransformTest, BoolsToMask)
+{
+  std::vector<bool> input({1, 0, 1, 0, 1, 0, 1, 0});
+  cudf::test::fixed_width_column_wrapper<bool> input_column(input.begin(), input.end());
+  cudf::bools_to_mask(input_column, cudf::test::get_default_stream());
+}
+
+TEST_F(TransformTest, MaskToBools)
+{
+  cudf::mask_to_bools(nullptr, 0, 0, cudf::test::get_default_stream());
+}
+
+TEST_F(TransformTest, Encode)
+{
+  cudf::test::fixed_width_column_wrapper<cudf::size_type> input{{1, 2, 3, 2, 3, 2, 1}};
+  cudf::encode(cudf::table_view({input}), cudf::test::get_default_stream());
+}
+
+TEST_F(TransformTest, OneHotEncode)
+{
+  auto input    = cudf::test::fixed_width_column_wrapper<cudf::size_type>{8, 8, 8, 9, 9};
+  auto category = cudf::test::fixed_width_column_wrapper<cudf::size_type>{8, 9};
+  cudf::one_hot_encode(input, category, cudf::test::get_default_stream());
+}
+
+TEST_F(TransformTest, NaNsToNulls)
+{
+  std::vector<float> input = {1, 2, 3, 4, 5};
+  std::vector<bool> mask   = {true, true, true, true, false, false};
+  auto input_column =
+    cudf::test::fixed_width_column_wrapper<float>(input.begin(), input.end(), mask.begin());
+  cudf::nans_to_nulls(input_column, cudf::test::get_default_stream());
+}
+
+TEST_F(TransformTest, RowBitCount)
+{
+  std::vector<std::string> strings{"abc", "ï", "", "z", "bananas", "warp", "", "zing"};
+  cudf::test::strings_column_wrapper col(strings.begin(), strings.end());
+  cudf::row_bit_count(cudf::table_view({col}), cudf::test::get_default_stream());
+}
+
+TEST_F(TransformTest, SegmentedRowBitCount)
+{
+  // clang-format off
+  std::vector<std::string> const strings { "daïs", "def", "", "z", "bananas", "warp", "", "zing" };
+  std::vector<bool>        const valids  {  1,      0,    0,  1,   0,          1,      1,  1 };
+  // clang-format on
+  cudf::test::strings_column_wrapper const col(strings.begin(), strings.end(), valids.begin());
+  auto const input              = cudf::table_view({col});
+  auto constexpr segment_length = 2;
+  cudf::segmented_row_bit_count(input, segment_length, cudf::test::get_default_stream());
+}

From a94512a568bd0351fd20b0c2cbcd6067fd4d504b Mon Sep 17 00:00:00 2001
From: Jayjeet Chakraborty <jc.github@rediffmail.com>
Date: Wed, 7 Aug 2024 22:20:57 -0700
Subject: [PATCH 637/842] Add interop example for `arrow::StringViewArray` to
 `cudf::column` (#16498)

Demonstrates the conversion from an `arrow:StringViewArray` to a `cudf::column`

Authors:
  - Jayjeet Chakraborty (https://github.com/JayjeetAtGithub)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/16498
---
 cpp/examples/build.sh               |   1 +
 cpp/examples/interop/CMakeLists.txt |  20 ++++
 cpp/examples/interop/interop.cpp    | 176 ++++++++++++++++++++++++++++
 3 files changed, 197 insertions(+)
 create mode 100644 cpp/examples/interop/CMakeLists.txt
 create mode 100644 cpp/examples/interop/interop.cpp

diff --git a/cpp/examples/build.sh b/cpp/examples/build.sh
index dce81fb1677..2d6f6f316c7 100755
--- a/cpp/examples/build.sh
+++ b/cpp/examples/build.sh
@@ -61,3 +61,4 @@ build_example tpch
 build_example strings
 build_example nested_types
 build_example parquet_io
+build_example interop
diff --git a/cpp/examples/interop/CMakeLists.txt b/cpp/examples/interop/CMakeLists.txt
new file mode 100644
index 00000000000..a1f99c1d2fd
--- /dev/null
+++ b/cpp/examples/interop/CMakeLists.txt
@@ -0,0 +1,20 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+cmake_minimum_required(VERSION 3.26.4)
+
+include(../set_cuda_architecture.cmake)
+
+rapids_cuda_init_architectures(interop_example)
+rapids_cuda_set_architectures(RAPIDS)
+
+project(
+  interop_example
+  VERSION 0.0.1
+  LANGUAGES CXX CUDA
+)
+
+include(../fetch_dependencies.cmake)
+
+add_executable(interop interop.cpp)
+target_link_libraries(interop PRIVATE cudf::cudf)
+target_compile_features(interop PRIVATE cxx_std_17)
diff --git a/cpp/examples/interop/interop.cpp b/cpp/examples/interop/interop.cpp
new file mode 100644
index 00000000000..8271c3836e4
--- /dev/null
+++ b/cpp/examples/interop/interop.cpp
@@ -0,0 +1,176 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/column/column_factories.hpp>
+#include <cudf/interop.hpp>
+#include <cudf/io/csv.hpp>
+
+#include <rmm/mr/device/device_memory_resource.hpp>
+
+#include <arrow/array/array_binary.h>
+#include <arrow/type.h>
+
+// Helper functuons to create StringViews
+inline arrow::StringViewType::c_type to_inline_string_view(const void* data, int32_t const& size)
+{
+  arrow::StringViewType::c_type out;
+  out.inlined = {size, {}};
+  memcpy(&out.inlined.data, data, size);
+  return out;
+}
+inline arrow::StringViewType::c_type to_inline_string_view(std::string_view const& v)
+{
+  return to_inline_string_view(v.data(), static_cast<int32_t>(v.size()));
+}
+inline arrow::StringViewType::c_type to_string_view(const void* data,
+                                                    int32_t const& size,
+                                                    int32_t const& buffer_index,
+                                                    int32_t const& offset)
+{
+  if (size <= arrow::StringViewType::kInlineSize) { return to_inline_string_view(data, size); }
+  arrow::StringViewType::c_type out;
+  out.ref = {size, {}, buffer_index, offset};
+  memcpy(&out.ref.prefix, data, sizeof(out.ref.prefix));
+  return out;
+}
+inline arrow::StringViewType::c_type to_string_view(std::string_view const& v,
+                                                    int32_t const& buffer_index,
+                                                    int32_t const& offset)
+{
+  return to_string_view(v.data(), static_cast<int32_t>(v.size()), buffer_index, offset);
+}
+
+/**
+ * @brief Create a StringViewArray
+ *
+ * @param data_buffers The data buffers
+ * @param views The string views
+ * @param validate Whether to validate the array
+ */
+arrow::Result<std::shared_ptr<arrow::StringViewArray>> make_string_view_array(
+  arrow::BufferVector const& data_buffers,
+  std::vector<arrow::StringViewType::c_type> const& views,
+  bool validate = true)
+{
+  auto const length = static_cast<int64_t>(views.size());
+  auto const arr    = std::make_shared<arrow::StringViewArray>(
+    arrow::utf8_view(), length, arrow::Buffer::FromVector(views), std::move(data_buffers));
+  if (validate) { RETURN_NOT_OK(arr->ValidateFull()); }
+  return arr;
+}
+
+/**
+ * @brief Convert a vector of strings into a vector of the
+ * constituent chars and a vector of offsets.
+ *
+ * @param strings The vector of strings
+ */
+auto make_chars_and_offsets(std::vector<std::string> const& strings)
+{
+  std::vector<char> chars{};
+  std::vector<cudf::size_type> offsets(1, 0);
+  for (auto& str : strings) {
+    chars.insert(chars.end(), std::cbegin(str), std::cend(str));
+    auto const last_offset = static_cast<std::size_t>(offsets.back());
+    auto const next_offset = last_offset + str.length();
+    CUDF_EXPECTS(
+      next_offset < static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max()),
+      "Cannot use arrow_string_view_to_cudf_column to build a large strings column");
+    offsets.push_back(static_cast<cudf::size_type>(next_offset));
+  }
+  return std::make_tuple(std::move(chars), std::move(offsets));
+};
+
+/**
+ * @brief Convert an Arrow StringViewArray to a cudf::column
+ *
+ * @param array The Arrow StringViewArray
+ * @param stream The CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+std::unique_ptr<cudf::column> arrow_string_view_to_cudf_column(
+  std::shared_ptr<arrow::StringViewArray> const& array,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
+{
+  // Convert the string views into chars and offsets
+  std::vector<std::string> strings;
+  for (auto i = 0; i < array->length(); i++) {
+    strings.push_back(array->GetString(i));
+  }
+  auto const [chars, offsets] = make_chars_and_offsets(strings);
+
+  // Copy the chars vector to the device
+  rmm::device_uvector<char> d_chars(chars.size(), stream, mr);
+  CUDF_CUDA_TRY(cudaMemcpyAsync(
+    d_chars.data(), chars.data(), chars.size() * sizeof(char), cudaMemcpyDefault, stream.value()));
+
+  // Copy the offsets vector to the device
+  // and wrap it in a cudf::column
+  rmm::device_uvector<cudf::size_type> d_offsets(offsets.size(), stream, mr);
+  CUDF_CUDA_TRY(cudaMemcpyAsync(d_offsets.data(),
+                                offsets.data(),
+                                offsets.size() * sizeof(cudf::size_type),
+                                cudaMemcpyDefault,
+                                stream.value()));
+  auto offsets_col =
+    std::make_unique<cudf::column>(std::move(d_offsets), rmm::device_buffer{0, stream, mr}, 0);
+
+  // Create a string column out of the chars and offsets
+  return cudf::make_strings_column(array->length(),
+                                   std::move(offsets_col),
+                                   d_chars.release(),
+                                   0,
+                                   rmm::device_buffer{0, stream, mr});
+}
+
+int main(int argc, char** argv)
+{
+  std::vector<std::shared_ptr<arrow::Buffer>> data_buffers;
+  std::vector<arrow::StringViewType::c_type> views;
+
+  // Define the data buffers and string views
+  auto const buffer_a =
+    arrow::Buffer::FromString("hello rapids teamapache arrow interopnvidiacudf");
+  data_buffers.push_back(buffer_a);
+  views.push_back(to_string_view("hello rapid steam", 0, 0));
+  views.push_back(to_string_view("apache arrow interop", 0, 17));
+  views.push_back(to_inline_string_view("nvidia"));
+  views.push_back(to_inline_string_view("cudf"));
+
+  // Create a StringViewArray
+  auto const string_view_col = make_string_view_array(data_buffers, views, true).ValueOrDie();
+  std::cout << string_view_col->ToString() << std::endl;
+
+  // Convert the StringViewArray to a cudf::column
+  auto const cudf_col = arrow_string_view_to_cudf_column(string_view_col);
+
+  // Write the cudf::column as CSV
+  auto const tbl_view                  = cudf::table_view({cudf_col->view()});
+  std::vector<std::string> const names = {"col_a"};
+
+  std::vector<char> h_buffer;
+  cudf::io::csv_writer_options writer_options =
+    cudf::io::csv_writer_options::builder(cudf::io::sink_info(&h_buffer), tbl_view)
+      .include_header(not names.empty())
+      .names(names);
+
+  cudf::io::write_csv(writer_options);
+  auto const result = std::string(h_buffer.data(), h_buffer.size());
+  std::cout << result << std::endl;
+
+  return 0;
+}

From cc75b05b426920e6522c49527f8b684f780f38e3 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 8 Aug 2024 10:00:22 -0400
Subject: [PATCH 638/842] Change IPv4 convert APIs to support UINT32 instead of
 INT64 (#16489)

Changes the integer type for `cudf::strings::ipv4_to_integers` and `cudf::strings::integers_to_ipv4` to use UINT32 types instead of INT64. The INT64 type was originally chosen because libcudf did not support unsigned types at the time.
This is a breaking change since the basic input/output type is changed.

Closes #16324

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)
  - https://github.com/brandon-b-miller
  - Karthikeyan (https://github.com/karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/16489
---
 cpp/include/cudf/strings/convert/convert_ipv4.hpp | 11 +++--------
 cpp/src/strings/convert/convert_ipv4.cu           | 14 +++++++-------
 cpp/tests/strings/ipv4_tests.cpp                  |  8 ++++----
 python/cudf/cudf/core/column/numerical.py         |  4 ++--
 python/cudf/cudf/tests/test_string.py             |  6 ++++--
 5 files changed, 20 insertions(+), 23 deletions(-)

diff --git a/cpp/include/cudf/strings/convert/convert_ipv4.hpp b/cpp/include/cudf/strings/convert/convert_ipv4.hpp
index 04a04907c12..97d1dfee017 100644
--- a/cpp/include/cudf/strings/convert/convert_ipv4.hpp
+++ b/cpp/include/cudf/strings/convert/convert_ipv4.hpp
@@ -44,15 +44,12 @@ namespace strings {
  * No checking is done on the format. If a string is not in IPv4 format, the resulting
  * integer is undefined.
  *
- * The resulting 32-bit integer is placed in an int64_t to avoid setting the sign-bit
- * in an int32_t type. This could be changed if cudf supported a UINT32 type in the future.
- *
  * Any null entries will result in corresponding null entries in the output column.
  *
  * @param input Strings instance for this operation
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned column's device memory
- * @return New INT64 column converted from strings
+ * @return New UINT32 column converted from strings
  */
 std::unique_ptr<column> ipv4_to_integers(
   strings_column_view const& input,
@@ -68,13 +65,11 @@ std::unique_ptr<column> ipv4_to_integers(
  * Each input integer is dissected into four integers by dividing the input into 8-bit sections.
  * These sub-integers are then converted into [0-9] characters and placed between '.' characters.
  *
- * No checking is done on the input integer value. Only the lower 32-bits are used.
- *
  * Any null entries will result in corresponding null entries in the output column.
  *
- * @throw cudf::logic_error if the input column is not INT64 type.
+ * @throw cudf::logic_error if the input column is not UINT32 type.
  *
- * @param integers Integer (INT64) column to convert
+ * @param integers Integer (UINT32) column to convert
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned column's device memory
  * @return New strings column
diff --git a/cpp/src/strings/convert/convert_ipv4.cu b/cpp/src/strings/convert/convert_ipv4.cu
index 68a24e000ae..13d6e9bc3ba 100644
--- a/cpp/src/strings/convert/convert_ipv4.cu
+++ b/cpp/src/strings/convert/convert_ipv4.cu
@@ -46,7 +46,7 @@ namespace {
 struct ipv4_to_integers_fn {
   column_device_view const d_strings;
 
-  __device__ int64_t operator()(size_type idx)
+  __device__ uint32_t operator()(size_type idx)
   {
     if (d_strings.is_null(idx)) return 0;
     string_view d_str  = d_strings.element<string_view>(idx);
@@ -66,7 +66,7 @@ struct ipv4_to_integers_fn {
       }
     }
     uint32_t result = (ipvals[0] << 24) + (ipvals[1] << 16) + (ipvals[2] << 8) + ipvals[3];
-    return static_cast<int64_t>(result);
+    return result;
   }
 };
 
@@ -79,18 +79,18 @@ std::unique_ptr<column> ipv4_to_integers(strings_column_view const& input,
 {
   size_type strings_count = input.size();
   if (strings_count == 0) {
-    return make_numeric_column(data_type{type_id::INT64}, 0, mask_state::UNALLOCATED, stream);
+    return make_numeric_column(data_type{type_id::UINT32}, 0, mask_state::UNALLOCATED, stream);
   }
 
   auto strings_column = column_device_view::create(input.parent(), stream);
   // create output column copying the strings' null-mask
-  auto results   = make_numeric_column(data_type{type_id::INT64},
+  auto results   = make_numeric_column(data_type{type_id::UINT32},
                                      strings_count,
                                      cudf::detail::copy_bitmask(input.parent(), stream, mr),
                                      input.null_count(),
                                      stream,
                                      mr);
-  auto d_results = results->mutable_view().data<int64_t>();
+  auto d_results = results->mutable_view().data<uint32_t>();
   // fill output column with ipv4 integers
   thrust::transform(rmm::exec_policy(stream),
                     thrust::make_counting_iterator<size_type>(0),
@@ -135,7 +135,7 @@ struct integers_to_ipv4_fn {
       return;
     }
 
-    auto const ip_number = d_column.element<int64_t>(idx);
+    auto const ip_number = d_column.element<uint32_t>(idx);
 
     char* out_ptr   = d_chars ? d_chars + d_offsets[idx] : nullptr;
     int shift_bits  = 24;
@@ -165,7 +165,7 @@ std::unique_ptr<column> integers_to_ipv4(column_view const& integers,
 {
   if (integers.is_empty()) return make_empty_column(type_id::STRING);
 
-  CUDF_EXPECTS(integers.type().id() == type_id::INT64, "Input column must be type_id::INT64 type");
+  CUDF_EXPECTS(integers.type().id() == type_id::UINT32, "Input column must be UINT32 type");
 
   auto d_column = column_device_view::create(integers, stream);
   auto [offsets_column, chars] =
diff --git a/cpp/tests/strings/ipv4_tests.cpp b/cpp/tests/strings/ipv4_tests.cpp
index 3bfe0f9727e..ea3ac439e62 100644
--- a/cpp/tests/strings/ipv4_tests.cpp
+++ b/cpp/tests/strings/ipv4_tests.cpp
@@ -40,8 +40,8 @@ TEST_F(StringsConvertTest, IPv4ToIntegers)
   auto strings_view = cudf::strings_column_view(strings);
   auto results      = cudf::strings::ipv4_to_integers(strings_view);
 
-  std::vector<int64_t> h_expected{0, 0, 0, 698875905, 2130706433, 700776449, 3232235521};
-  cudf::test::fixed_width_column_wrapper<int64_t> expected(
+  std::vector<uint32_t> h_expected{0, 0, 0, 698875905, 2130706433, 700776449, 3232235521};
+  cudf::test::fixed_width_column_wrapper<uint32_t> expected(
     h_expected.cbegin(),
     h_expected.cend(),
     thrust::make_transform_iterator(h_strings.begin(),
@@ -59,8 +59,8 @@ TEST_F(StringsConvertTest, IntegersToIPv4)
     thrust::make_transform_iterator(h_strings.begin(),
                                     [](auto const str) { return str != nullptr; }));
 
-  std::vector<int64_t> h_column{3232235521, 167772161, 0, 0, 700055553, 700776449};
-  cudf::test::fixed_width_column_wrapper<int64_t> column(
+  std::vector<uint32_t> h_column{3232235521, 167772161, 0, 0, 700055553, 700776449};
+  cudf::test::fixed_width_column_wrapper<uint32_t> column(
     h_column.cbegin(),
     h_column.cend(),
     thrust::make_transform_iterator(h_strings.begin(),
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index df27134d458..b83d7600c82 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -313,8 +313,8 @@ def normalize_binop_value(
             return NotImplemented
 
     def int2ip(self) -> "cudf.core.column.StringColumn":
-        if self.dtype != cudf.dtype("int64"):
-            raise TypeError("Only int64 type can be converted to ip")
+        if self.dtype != cudf.dtype("uint32"):
+            raise TypeError("Only uint32 type can be converted to ip")
 
         return libcudf.string_casting.int2ip(self)
 
diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py
index 4bd084a3938..a2a3e874c91 100644
--- a/python/cudf/cudf/tests/test_string.py
+++ b/python/cudf/cudf/tests/test_string.py
@@ -2672,7 +2672,9 @@ def test_string_ip4_to_int():
 
 
 def test_string_int_to_ipv4():
-    gsr = cudf.Series([0, None, 0, 698875905, 2130706433, 700776449])
+    gsr = cudf.Series([0, None, 0, 698875905, 2130706433, 700776449]).astype(
+        "uint32"
+    )
     expected = cudf.Series(
         ["0.0.0.0", None, "0.0.0.0", "41.168.0.1", "127.0.0.1", "41.197.0.1"]
     )
@@ -2718,7 +2720,7 @@ def test_string_isipv4():
 
 
 @pytest.mark.parametrize(
-    "dtype", sorted(list(dtypeutils.NUMERIC_TYPES - {"int64", "uint64"}))
+    "dtype", sorted(list(dtypeutils.NUMERIC_TYPES - {"uint32"}))
 )
 def test_string_int_to_ipv4_dtype_fail(dtype):
     gsr = cudf.Series([1, 2, 3, 4, 5]).astype(dtype)

From da51cad6c25f54ab344b0aa25e3dc1e4adb4550a Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Thu, 8 Aug 2024 10:25:11 -0500
Subject: [PATCH 639/842] Improve update-version.sh (#16506)

A few small tweaks to `update-version.sh` for alignment across RAPIDS.

The `UCX_PY` curl call is unused.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/cudf/pull/16506
---
 ci/release/update-version.sh | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index ad96aff3930..132e58249e6 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -18,18 +18,16 @@ CURRENT_MINOR=$(echo $CURRENT_TAG | awk '{split($0, a, "."); print a[2]}')
 CURRENT_PATCH=$(echo $CURRENT_TAG | awk '{split($0, a, "."); print a[3]}')
 CURRENT_SHORT_TAG=${CURRENT_MAJOR}.${CURRENT_MINOR}
 
-#Get <major>.<minor> for next version
+# Get <major>.<minor> for next version
 NEXT_MAJOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[1]}')
 NEXT_MINOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[2]}')
 NEXT_PATCH=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[3]}')
 NEXT_SHORT_TAG=${NEXT_MAJOR}.${NEXT_MINOR}
-NEXT_UCX_PY_VERSION="$(curl -sL https://version.gpuci.io/rapids/${NEXT_SHORT_TAG}).*"
 
 # Need to distutils-normalize the versions for some use cases
 CURRENT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${CURRENT_SHORT_TAG}'))")
 NEXT_SHORT_TAG_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_SHORT_TAG}'))")
 PATCH_PEP440=$(python -c "from setuptools.extern import packaging; print(packaging.version.Version('${NEXT_PATCH}'))")
-echo "current is ${CURRENT_SHORT_TAG_PEP440}, next is ${NEXT_SHORT_TAG_PEP440}"
 
 echo "Preparing release $CURRENT_TAG => $NEXT_FULL_TAG"
 
@@ -61,7 +59,7 @@ for DEP in "${DEPENDENCIES[@]}"; do
     sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0/g" "${FILE}"
   done
   for FILE in python/*/pyproject.toml; do
-    sed_runner "/\"${DEP}==/ s/==.*\"/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0\"/g" ${FILE}
+    sed_runner "/\"${DEP}==/ s/==.*\"/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0\"/g" "${FILE}"
   done
 done
 
@@ -77,7 +75,7 @@ sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_
 # CI files
 for FILE in .github/workflows/*.yaml .github/workflows/*.yml; do
   sed_runner "/shared-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}"
-  sed_runner "s/dask-cuda.git@branch-[^\"\s]\+/dask-cuda.git@branch-${NEXT_SHORT_TAG}/g" ${FILE};
+  sed_runner "s/dask-cuda.git@branch-[^\"\s]\+/dask-cuda.git@branch-${NEXT_SHORT_TAG}/g" "${FILE}"
 done
 sed_runner "s/branch-[0-9]+\.[0-9]+/branch-${NEXT_SHORT_TAG}/g" ci/test_wheel_cudf_polars.sh
 

From 792dd0686f4970c70f9bdba62c54a3de0a495fd5 Mon Sep 17 00:00:00 2001
From: Kyle Edwards <kyedwards@nvidia.com>
Date: Thu, 8 Aug 2024 12:56:36 -0400
Subject: [PATCH 640/842] Update pre-commit hooks (#16510)

This PR updates pre-commit hooks to the latest versions that are supported without causing style check errors.

Authors:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)

Approvers:
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/cudf/pull/16510
---
 .pre-commit-config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index bbcd78d051f..1b17eae0842 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -144,7 +144,7 @@ repos:
       - id: ruff-format
         files: python/.*$
   - repo: https://github.com/rapidsai/pre-commit-hooks
-    rev: v0.2.0
+    rev: v0.3.1
     hooks:
       - id: verify-copyright
         exclude: |

From 1bbe440ee7ddbc021f945e4156220f9bc270a443 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Thu, 8 Aug 2024 12:25:29 -0500
Subject: [PATCH 641/842] Add keep option to distinct nvbench (#16497)

This PR adopts some work from @srinivasyadav18 with additional modifications. This is meant to complement #16484.

Authors:
  - Bradley Dice (https://github.com/bdice)
  - Srinivas Yadav (https://github.com/srinivasyadav18)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - Srinivas Yadav (https://github.com/srinivasyadav18)

URL: https://github.com/rapidsai/cudf/pull/16497
---
 cpp/benchmarks/CMakeLists.txt                 |  1 +
 cpp/benchmarks/stream_compaction/distinct.cpp | 45 ++++++++++++-------
 .../stream_compaction/stable_distinct.cpp     | 45 ++++++++++++-------
 .../stream_compaction_common.cpp              | 35 +++++++++++++++
 .../stream_compaction_common.hpp              | 19 ++++++++
 5 files changed, 113 insertions(+), 32 deletions(-)
 create mode 100644 cpp/benchmarks/stream_compaction/stream_compaction_common.cpp
 create mode 100644 cpp/benchmarks/stream_compaction/stream_compaction_common.hpp

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 7be456ddfba..483b7b0a539 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -162,6 +162,7 @@ ConfigureNVBench(
   stream_compaction/distinct.cpp
   stream_compaction/distinct_count.cpp
   stream_compaction/stable_distinct.cpp
+  stream_compaction/stream_compaction_common.cpp
   stream_compaction/unique.cpp
   stream_compaction/unique_count.cpp
 )
diff --git a/cpp/benchmarks/stream_compaction/distinct.cpp b/cpp/benchmarks/stream_compaction/distinct.cpp
index c04b6516903..d7deebca89a 100644
--- a/cpp/benchmarks/stream_compaction/distinct.cpp
+++ b/cpp/benchmarks/stream_compaction/distinct.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,6 +15,7 @@
  */
 
 #include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/stream_compaction/stream_compaction_common.hpp>
 
 #include <cudf/column/column_view.hpp>
 #include <cudf/lists/list_view.hpp>
@@ -23,15 +24,29 @@
 
 #include <nvbench/nvbench.cuh>
 
+#include <limits>
+
 NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_ms, "cudf::timestamp_ms", "cudf::timestamp_ms");
 
 template <typename Type>
 void nvbench_distinct(nvbench::state& state, nvbench::type_list<Type>)
 {
-  cudf::size_type const num_rows = state.get_int64("NumRows");
+  cudf::size_type const num_rows    = state.get_int64("NumRows");
+  auto const keep                   = get_keep(state.get_string("keep"));
+  cudf::size_type const cardinality = state.get_int64("cardinality");
+
+  if (cardinality > num_rows) {
+    state.skip("cardinality > num_rows");
+    return;
+  }
 
-  data_profile profile = data_profile_builder().cardinality(0).null_probability(0.01).distribution(
-    cudf::type_to_id<Type>(), distribution_id::UNIFORM, 0, 100);
+  data_profile profile = data_profile_builder()
+                           .cardinality(cardinality)
+                           .null_probability(0.01)
+                           .distribution(cudf::type_to_id<Type>(),
+                                         distribution_id::UNIFORM,
+                                         static_cast<Type>(0),
+                                         std::numeric_limits<Type>::max());
 
   auto source_column = create_random_column(cudf::type_to_id<Type>(), row_count{num_rows}, profile);
 
@@ -40,20 +55,19 @@ void nvbench_distinct(nvbench::state& state, nvbench::type_list<Type>)
 
   state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
   state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
-    auto result = cudf::distinct(input_table,
-                                 {0},
-                                 cudf::duplicate_keep_option::KEEP_ANY,
-                                 cudf::null_equality::EQUAL,
-                                 cudf::nan_equality::ALL_EQUAL);
+    auto result = cudf::distinct(
+      input_table, {0}, keep, cudf::null_equality::EQUAL, cudf::nan_equality::ALL_EQUAL);
   });
 }
 
-using data_type = nvbench::type_list<bool, int8_t, int32_t, int64_t, float, cudf::timestamp_ms>;
+using data_type = nvbench::type_list<int32_t, int64_t>;
 
 NVBENCH_BENCH_TYPES(nvbench_distinct, NVBENCH_TYPE_AXES(data_type))
   .set_name("distinct")
   .set_type_axes_names({"Type"})
-  .add_int64_axis("NumRows", {10'000, 100'000, 1'000'000, 10'000'000});
+  .add_string_axis("keep", {"any", "first", "last", "none"})
+  .add_int64_axis("cardinality", {100, 100'000, 10'000'000, 1'000'000'000})
+  .add_int64_axis("NumRows", {100, 100'000, 10'000'000, 1'000'000'000});
 
 template <typename Type>
 void nvbench_distinct_list(nvbench::state& state, nvbench::type_list<Type>)
@@ -61,6 +75,7 @@ void nvbench_distinct_list(nvbench::state& state, nvbench::type_list<Type>)
   auto const size               = state.get_int64("ColumnSize");
   auto const dtype              = cudf::type_to_id<Type>();
   double const null_probability = state.get_float64("null_probability");
+  auto const keep               = get_keep(state.get_string("keep"));
 
   auto builder = data_profile_builder().null_probability(null_probability);
   if (dtype == cudf::type_id::LIST) {
@@ -80,11 +95,8 @@ void nvbench_distinct_list(nvbench::state& state, nvbench::type_list<Type>)
 
   state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
   state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
-    auto result = cudf::distinct(*table,
-                                 {0},
-                                 cudf::duplicate_keep_option::KEEP_ANY,
-                                 cudf::null_equality::EQUAL,
-                                 cudf::nan_equality::ALL_EQUAL);
+    auto result =
+      cudf::distinct(*table, {0}, keep, cudf::null_equality::EQUAL, cudf::nan_equality::ALL_EQUAL);
   });
 }
 
@@ -92,5 +104,6 @@ NVBENCH_BENCH_TYPES(nvbench_distinct_list,
                     NVBENCH_TYPE_AXES(nvbench::type_list<int32_t, cudf::list_view>))
   .set_name("distinct_list")
   .set_type_axes_names({"Type"})
+  .add_string_axis("keep", {"any", "first", "last", "none"})
   .add_float64_axis("null_probability", {0.0, 0.1})
   .add_int64_axis("ColumnSize", {100'000'000});
diff --git a/cpp/benchmarks/stream_compaction/stable_distinct.cpp b/cpp/benchmarks/stream_compaction/stable_distinct.cpp
index bcee3048013..0a8836c0583 100644
--- a/cpp/benchmarks/stream_compaction/stable_distinct.cpp
+++ b/cpp/benchmarks/stream_compaction/stable_distinct.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,6 +15,7 @@
  */
 
 #include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/stream_compaction/stream_compaction_common.hpp>
 
 #include <cudf/column/column_view.hpp>
 #include <cudf/lists/list_view.hpp>
@@ -23,15 +24,29 @@
 
 #include <nvbench/nvbench.cuh>
 
+#include <limits>
+
 NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_ms, "cudf::timestamp_ms", "cudf::timestamp_ms");
 
 template <typename Type>
 void nvbench_stable_distinct(nvbench::state& state, nvbench::type_list<Type>)
 {
-  cudf::size_type const num_rows = state.get_int64("NumRows");
+  cudf::size_type const num_rows    = state.get_int64("NumRows");
+  auto const keep                   = get_keep(state.get_string("keep"));
+  cudf::size_type const cardinality = state.get_int64("cardinality");
+
+  if (cardinality > num_rows) {
+    state.skip("cardinality > num_rows");
+    return;
+  }
 
-  data_profile profile = data_profile_builder().cardinality(0).null_probability(0.01).distribution(
-    cudf::type_to_id<Type>(), distribution_id::UNIFORM, 0, 100);
+  data_profile profile = data_profile_builder()
+                           .cardinality(cardinality)
+                           .null_probability(0.01)
+                           .distribution(cudf::type_to_id<Type>(),
+                                         distribution_id::UNIFORM,
+                                         static_cast<Type>(0),
+                                         std::numeric_limits<Type>::max());
 
   auto source_column = create_random_column(cudf::type_to_id<Type>(), row_count{num_rows}, profile);
 
@@ -40,20 +55,19 @@ void nvbench_stable_distinct(nvbench::state& state, nvbench::type_list<Type>)
 
   state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
   state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
-    auto result = cudf::stable_distinct(input_table,
-                                        {0},
-                                        cudf::duplicate_keep_option::KEEP_ANY,
-                                        cudf::null_equality::EQUAL,
-                                        cudf::nan_equality::ALL_EQUAL);
+    auto result = cudf::stable_distinct(
+      input_table, {0}, keep, cudf::null_equality::EQUAL, cudf::nan_equality::ALL_EQUAL);
   });
 }
 
-using data_type = nvbench::type_list<bool, int8_t, int32_t, int64_t, float, cudf::timestamp_ms>;
+using data_type = nvbench::type_list<int32_t, int64_t>;
 
 NVBENCH_BENCH_TYPES(nvbench_stable_distinct, NVBENCH_TYPE_AXES(data_type))
   .set_name("stable_distinct")
   .set_type_axes_names({"Type"})
-  .add_int64_axis("NumRows", {10'000, 100'000, 1'000'000, 10'000'000});
+  .add_string_axis("keep", {"any", "first", "last", "none"})
+  .add_int64_axis("cardinality", {100, 100'000, 10'000'000, 1'000'000'000})
+  .add_int64_axis("NumRows", {100, 100'000, 10'000'000, 1'000'000'000});
 
 template <typename Type>
 void nvbench_stable_distinct_list(nvbench::state& state, nvbench::type_list<Type>)
@@ -61,6 +75,7 @@ void nvbench_stable_distinct_list(nvbench::state& state, nvbench::type_list<Type
   auto const size               = state.get_int64("ColumnSize");
   auto const dtype              = cudf::type_to_id<Type>();
   double const null_probability = state.get_float64("null_probability");
+  auto const keep               = get_keep(state.get_string("keep"));
 
   auto builder = data_profile_builder().null_probability(null_probability);
   if (dtype == cudf::type_id::LIST) {
@@ -80,11 +95,8 @@ void nvbench_stable_distinct_list(nvbench::state& state, nvbench::type_list<Type
 
   state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
   state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
-    auto result = cudf::stable_distinct(*table,
-                                        {0},
-                                        cudf::duplicate_keep_option::KEEP_ANY,
-                                        cudf::null_equality::EQUAL,
-                                        cudf::nan_equality::ALL_EQUAL);
+    auto result = cudf::stable_distinct(
+      *table, {0}, keep, cudf::null_equality::EQUAL, cudf::nan_equality::ALL_EQUAL);
   });
 }
 
@@ -92,5 +104,6 @@ NVBENCH_BENCH_TYPES(nvbench_stable_distinct_list,
                     NVBENCH_TYPE_AXES(nvbench::type_list<int32_t, cudf::list_view>))
   .set_name("stable_distinct_list")
   .set_type_axes_names({"Type"})
+  .add_string_axis("keep", {"any", "first", "last", "none"})
   .add_float64_axis("null_probability", {0.0, 0.1})
   .add_int64_axis("ColumnSize", {100'000'000});
diff --git a/cpp/benchmarks/stream_compaction/stream_compaction_common.cpp b/cpp/benchmarks/stream_compaction/stream_compaction_common.cpp
new file mode 100644
index 00000000000..8cbb2956777
--- /dev/null
+++ b/cpp/benchmarks/stream_compaction/stream_compaction_common.cpp
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/stream_compaction/stream_compaction_common.hpp>
+
+#include <cudf/stream_compaction.hpp>
+#include <cudf/utilities/error.hpp>
+
+cudf::duplicate_keep_option get_keep(std::string const& keep_str)
+{
+  if (keep_str == "any") {
+    return cudf::duplicate_keep_option::KEEP_ANY;
+  } else if (keep_str == "first") {
+    return cudf::duplicate_keep_option::KEEP_FIRST;
+  } else if (keep_str == "last") {
+    return cudf::duplicate_keep_option::KEEP_LAST;
+  } else if (keep_str == "none") {
+    return cudf::duplicate_keep_option::KEEP_NONE;
+  } else {
+    CUDF_FAIL("Unsupported keep option.");
+  }
+}
diff --git a/cpp/benchmarks/stream_compaction/stream_compaction_common.hpp b/cpp/benchmarks/stream_compaction/stream_compaction_common.hpp
new file mode 100644
index 00000000000..d1ef2b10f41
--- /dev/null
+++ b/cpp/benchmarks/stream_compaction/stream_compaction_common.hpp
@@ -0,0 +1,19 @@
+/*
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/stream_compaction.hpp>
+
+cudf::duplicate_keep_option get_keep(std::string const& keep_str);

From 2c8de625b69bf5f7f3315c45a34bdf9ba45315a9 Mon Sep 17 00:00:00 2001
From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com>
Date: Fri, 9 Aug 2024 08:25:58 -0500
Subject: [PATCH 642/842] enable list to be forced as string in JSON reader.
 (#16472)

closes #15278

This PR allows list type also forced as string when mixed type as string is enabled and a user given schema specifies a column as string, in JSON reader.

Authors:
  - Karthikeyan (https://github.com/karthikeyann)
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Shruti Shivakumar (https://github.com/shrshi)

URL: https://github.com/rapidsai/cudf/pull/16472
---
 cpp/src/io/json/json_column.cu  |  22 ++++---
 cpp/tests/io/json/json_test.cpp | 113 ++++++++++++++++++++++----------
 2 files changed, 90 insertions(+), 45 deletions(-)

diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu
index 17fa7abdffe..e5e21e054a6 100644
--- a/cpp/src/io/json/json_column.cu
+++ b/cpp/src/io/json/json_column.cu
@@ -567,22 +567,22 @@ void make_device_json_column(device_span<SymbolT const> input,
     thrust::uninitialized_fill(rmm::exec_policy_nosync(stream), v.begin(), v.end(), 0);
   };
 
-  auto initialize_json_columns = [&](auto i, auto& col) {
-    if (column_categories[i] == NC_ERR || column_categories[i] == NC_FN) {
+  auto initialize_json_columns = [&](auto i, auto& col, auto column_category) {
+    if (column_category == NC_ERR || column_category == NC_FN) {
       return;
-    } else if (column_categories[i] == NC_VAL || column_categories[i] == NC_STR) {
+    } else if (column_category == NC_VAL || column_category == NC_STR) {
       col.string_offsets.resize(max_row_offsets[i] + 1, stream);
       col.string_lengths.resize(max_row_offsets[i] + 1, stream);
       init_to_zero(col.string_offsets);
       init_to_zero(col.string_lengths);
-    } else if (column_categories[i] == NC_LIST) {
+    } else if (column_category == NC_LIST) {
       col.child_offsets.resize(max_row_offsets[i] + 2, stream);
       init_to_zero(col.child_offsets);
     }
     col.num_rows = max_row_offsets[i] + 1;
     col.validity =
       cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr);
-    col.type = to_json_col_type(column_categories[i]);
+    col.type = to_json_col_type(column_category);
   };
 
   auto reinitialize_as_string = [&](auto i, auto& col) {
@@ -764,21 +764,23 @@ void make_device_json_column(device_span<SymbolT const> input,
       }
     }
 
+    auto this_column_category = column_categories[this_col_id];
     if (is_enabled_mixed_types_as_string) {
-      // get path of this column, check if it is a struct forced as string, and enforce it
+      // get path of this column, check if it is a struct/list forced as string, and enforce it
       auto const nt                             = tree_path.get_path(this_col_id);
       std::optional<data_type> const user_dtype = get_path_data_type(nt, options);
-      if (column_categories[this_col_id] == NC_STRUCT and user_dtype.has_value() and
-          user_dtype.value().id() == type_id::STRING) {
+      if ((column_categories[this_col_id] == NC_STRUCT or
+           column_categories[this_col_id] == NC_LIST) and
+          user_dtype.has_value() and user_dtype.value().id() == type_id::STRING) {
         is_mixed_type_column[this_col_id] = 1;
-        column_categories[this_col_id]    = NC_STR;
+        this_column_category              = NC_STR;
       }
     }
 
     CUDF_EXPECTS(parent_col.child_columns.count(name) == 0, "duplicate column name: " + name);
     // move into parent
     device_json_column col(stream, mr);
-    initialize_json_columns(this_col_id, col);
+    initialize_json_columns(this_col_id, col, this_column_category);
     auto inserted = parent_col.child_columns.try_emplace(name, std::move(col)).second;
     CUDF_EXPECTS(inserted, "child column insertion failed, duplicate column name in the parent");
     if (not replaced) parent_col.column_order.push_back(name);
diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp
index 993ab82f423..0a485e26b71 100644
--- a/cpp/tests/io/json/json_test.cpp
+++ b/cpp/tests/io/json/json_test.cpp
@@ -2351,7 +2351,7 @@ TEST_F(JsonReaderTest, MapTypes)
   // Testing function for mixed types in JSON (for spark json reader)
   auto test_fn = [](std::string_view json_string, bool lines, std::vector<type_id> types) {
     std::map<std::string, cudf::io::schema_element> dtype_schema{
-      {"foo1", {data_type{type_id::STRING}}},  // list won't be a string
+      {"foo1", {data_type{type_id::STRING}}},  // list forced as a string
       {"foo2", {data_type{type_id::STRING}}},  // struct forced as a string
       {"1", {data_type{type_id::STRING}}},
       {"2", {data_type{type_id::STRING}}},
@@ -2378,17 +2378,17 @@ TEST_F(JsonReaderTest, MapTypes)
   test_fn(R"([{ "foo1": [1,2,3], "bar": 123 },
               { "foo2": { "a": 1 }, "bar": 456 }])",
           false,
-          {type_id::LIST, type_id::INT32, type_id::STRING});
+          {type_id::STRING, type_id::INT32, type_id::STRING});
   // jsonl
   test_fn(R"( { "foo1": [1,2,3], "bar": 123 }
               { "foo2": { "a": 1 }, "bar": 456 })",
           true,
-          {type_id::LIST, type_id::INT32, type_id::STRING});
+          {type_id::STRING, type_id::INT32, type_id::STRING});
   // jsonl-array
   test_fn(R"([123, [1,2,3]]
               [456, null,  { "a": 1 }])",
           true,
-          {type_id::INT64, type_id::LIST, type_id::STRING});
+          {type_id::INT64, type_id::STRING, type_id::STRING});
   // json-array
   test_fn(R"([[[1,2,3], null, 123],
               [null, { "a": 1 }, 456 ]])",
@@ -2678,38 +2678,81 @@ TEST_F(JsonReaderTest, JsonNestedDtypeFilter)
 
 TEST_F(JsonReaderTest, JSONMixedTypeChildren)
 {
-  std::string const json_str = R"(
-{ "Root": { "Key": [ { "EE": "A" } ] } }
-{ "Root": { "Key": {  } } }
-{ "Root": { "Key": [{ "YY": 1}] } }
-)";
-  // Column "EE" is created and destroyed
-  // Column "YY" should not be created
-
-  cudf::io::json_reader_options options =
-    cudf::io::json_reader_options::builder(cudf::io::source_info{json_str.c_str(), json_str.size()})
-      .lines(true)
-      .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL)
-      .normalize_single_quotes(true)
-      .normalize_whitespace(false)
-      .mixed_types_as_string(true)
-      .keep_quotes(true);
-
-  auto result = cudf::io::read_json(options);
+  // struct mixed.
+  {
+    std::string const json_str = R"(
+  { "Root": { "Key": [ { "EE": "A" } ] } }
+  { "Root": { "Key": {  } } }
+  { "Root": { "Key": [{ "YY": 1}] } }
+  )";
+    // Column "EE" is created and destroyed
+    // Column "YY" should not be created
+
+    cudf::io::json_reader_options options =
+      cudf::io::json_reader_options::builder(
+        cudf::io::source_info{json_str.c_str(), json_str.size()})
+        .lines(true)
+        .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL)
+        .normalize_single_quotes(true)
+        .normalize_whitespace(false)
+        .mixed_types_as_string(true)
+        .keep_quotes(true);
+
+    auto result = cudf::io::read_json(options);
+
+    ASSERT_EQ(result.tbl->num_columns(), 1);
+    ASSERT_EQ(result.metadata.schema_info.size(), 1);
+    EXPECT_EQ(result.metadata.schema_info[0].name, "Root");
+    ASSERT_EQ(result.metadata.schema_info[0].children.size(), 1);
+    EXPECT_EQ(result.metadata.schema_info[0].children[0].name, "Key");
+    ASSERT_EQ(result.metadata.schema_info[0].children[0].children.size(), 2);
+    EXPECT_EQ(result.metadata.schema_info[0].children[0].children[0].name, "offsets");
+    // types
+    EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRUCT);
+    EXPECT_EQ(result.tbl->get_column(0).child(0).type().id(), cudf::type_id::STRING);
+    cudf::test::strings_column_wrapper expected(
+      {R"([ { "EE": "A" } ])", "{  }", R"([{ "YY": 1}])"});
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result.tbl->get_column(0).child(0));
+  }
 
-  ASSERT_EQ(result.tbl->num_columns(), 1);
-  ASSERT_EQ(result.metadata.schema_info.size(), 1);
-  EXPECT_EQ(result.metadata.schema_info[0].name, "Root");
-  ASSERT_EQ(result.metadata.schema_info[0].children.size(), 1);
-  EXPECT_EQ(result.metadata.schema_info[0].children[0].name, "Key");
-  ASSERT_EQ(result.metadata.schema_info[0].children[0].children.size(), 2);
-  EXPECT_EQ(result.metadata.schema_info[0].children[0].children[0].name, "offsets");
-  // types
-  EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRUCT);
-  EXPECT_EQ(result.tbl->get_column(0).child(0).type().id(), cudf::type_id::STRING);
-  cudf::test::strings_column_wrapper expected({R"([ { "EE": "A" } ])", "{  }", R"([{ "YY": 1}])"});
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result.tbl->get_column(0).child(0));
+  // list mixed.
+  {
+    std::string const json_str = R"(
+  { "Root": { "Key": [ { "EE": "A" } ] } }
+  { "Root": { "Key": "abc" } }
+  { "Root": { "Key": [{ "YY": 1}] } }
+  )";
+    // Column "EE" is created and destroyed
+    // Column "YY" should not be created
+
+    cudf::io::json_reader_options options =
+      cudf::io::json_reader_options::builder(
+        cudf::io::source_info{json_str.c_str(), json_str.size()})
+        .lines(true)
+        .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL)
+        .normalize_single_quotes(true)
+        .normalize_whitespace(false)
+        .mixed_types_as_string(true)
+        .keep_quotes(true);
+
+    auto result = cudf::io::read_json(options);
+
+    ASSERT_EQ(result.tbl->num_columns(), 1);
+    ASSERT_EQ(result.metadata.schema_info.size(), 1);
+    EXPECT_EQ(result.metadata.schema_info[0].name, "Root");
+    ASSERT_EQ(result.metadata.schema_info[0].children.size(), 1);
+    EXPECT_EQ(result.metadata.schema_info[0].children[0].name, "Key");
+    ASSERT_EQ(result.metadata.schema_info[0].children[0].children.size(), 2);
+    EXPECT_EQ(result.metadata.schema_info[0].children[0].children[0].name, "offsets");
+    // types
+    EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRUCT);
+    EXPECT_EQ(result.tbl->get_column(0).child(0).type().id(), cudf::type_id::STRING);
+    cudf::test::strings_column_wrapper expected(
+      {R"([ { "EE": "A" } ])", "\"abc\"", R"([{ "YY": 1}])"});
+
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, result.tbl->get_column(0).child(0));
+  }
 }
 
 CUDF_TEST_PROGRAM_MAIN()

From 9ec34ad81152a4d7889bdf1f5b92032000b09b8f Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Fri, 9 Aug 2024 10:24:31 -0400
Subject: [PATCH 643/842] Remove a deprecated multibyte_split API (#16501)

Removes overloaded `cudf::io::text::multibyte_split` API deprecated in 24.08 and is no longer needed.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16501
---
 cpp/include/cudf/io/text/multibyte_split.hpp | 20 --------------------
 cpp/src/io/text/multibyte_split.cu           | 14 --------------
 2 files changed, 34 deletions(-)

diff --git a/cpp/include/cudf/io/text/multibyte_split.hpp b/cpp/include/cudf/io/text/multibyte_split.hpp
index 8624a386d0f..3a1f9611324 100644
--- a/cpp/include/cudf/io/text/multibyte_split.hpp
+++ b/cpp/include/cudf/io/text/multibyte_split.hpp
@@ -96,26 +96,6 @@ std::unique_ptr<cudf::column> multibyte_split(
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
-/**
- * @brief Splits the source text into a strings column using a multiple byte delimiter.
- *
- * @deprecated Since 24.08
- *
- * @param source The source input data encoded in UTF-8
- * @param delimiter UTF-8 encoded string for which to find offsets in the source
- * @param byte_range The position and size within `source` to produce the column from
- * @param stream CUDA stream used for device memory operations and kernel launches
- * @param mr Memory resource to use for the device memory allocation
- * @return The strings found by splitting the source by the delimiter within the relevant byte
- * range.
- */
-[[deprecated]] std::unique_ptr<cudf::column> multibyte_split(
-  data_chunk_source const& source,
-  std::string const& delimiter,
-  std::optional<byte_range_info> byte_range,
-  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
-
 /** @} */  // end of group
 
 }  // namespace text
diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu
index be2e2b9a79c..97729a091fb 100644
--- a/cpp/src/io/text/multibyte_split.cu
+++ b/cpp/src/io/text/multibyte_split.cu
@@ -567,20 +567,6 @@ std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source
 
 }  // namespace detail
 
-// deprecated in 24.08
-std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source const& source,
-                                              std::string const& delimiter,
-                                              std::optional<byte_range_info> byte_range,
-                                              rmm::cuda_stream_view stream,
-                                              rmm::device_async_resource_ref mr)
-{
-  return multibyte_split(source,
-                         delimiter,
-                         parse_options{byte_range.value_or(create_byte_range_info_max())},
-                         stream,
-                         mr);
-}
-
 std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source const& source,
                                               std::string const& delimiter,
                                               parse_options options,

From 8009dc800bf79ba5fbacc9658235a212590640ba Mon Sep 17 00:00:00 2001
From: Jayjeet Chakraborty <jc.github@rediffmail.com>
Date: Fri, 9 Aug 2024 09:07:47 -0700
Subject: [PATCH 644/842] Update docs of the TPC-H derived examples (#16423)

Authors:
  - Jayjeet Chakraborty (https://github.com/JayjeetAtGithub)

Approvers:
  - Karthikeyan (https://github.com/karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/16423
---
 .gitignore                                    |  2 +
 cpp/examples/tpch/README.md                   | 37 ++++++------
 .../tpch/datagen/correct_datatypes.py         | 60 +++++++++++++++++++
 cpp/examples/tpch/datagen/datagen.sh          | 31 ++++++++++
 cpp/examples/tpch/datagen/tpch.patch          | 33 ++++++++++
 5 files changed, 145 insertions(+), 18 deletions(-)
 create mode 100644 cpp/examples/tpch/datagen/correct_datatypes.py
 create mode 100755 cpp/examples/tpch/datagen/datagen.sh
 create mode 100644 cpp/examples/tpch/datagen/tpch.patch

diff --git a/.gitignore b/.gitignore
index c89fb49697a..153c7f59744 100644
--- a/.gitignore
+++ b/.gitignore
@@ -79,6 +79,8 @@ Debug
 build/
 cpp/build/
 cpp/examples/*/install/
+cpp/examples/*/build/
+cpp/examples/tpch/datagen/datafusion
 cpp/include/cudf/ipc_generated/*.h
 cpp/thirdparty/googletest/
 
diff --git a/cpp/examples/tpch/README.md b/cpp/examples/tpch/README.md
index 1ea71ae9824..8c046c3f1e8 100644
--- a/cpp/examples/tpch/README.md
+++ b/cpp/examples/tpch/README.md
@@ -1,38 +1,39 @@
-# TPC-H Inspired Examples
+# TPC-H Derived Examples
 
 Implements TPC-H queries using `libcudf`. We leverage the data generator (wrapper around official TPC-H datagen) from [Apache Datafusion](https://github.com/apache/datafusion) for generating data in Parquet format.
 
 ## Requirements
 
 - Rust
+- [libcudf](https://github.com/rapidsai/cudf/blob/branch-24.08/CONTRIBUTING.md#setting-up-your-build-environment)
 
-## Generating the Dataset
+## Running Queries
 
-1. Clone the datafusion repository.
+1. Build the `libcudf` examples.
 ```bash
-git clone git@github.com:apache/datafusion.git
+cd cudf/cpp/examples
+./build.sh
 ```
+The TPC-H query binaries would be built inside `tpch/build`.
 
-2. Run the data generator. The data will be placed in a `data/` subdirectory.
+2. Generate the dataset.
 ```bash
-cd datafusion/benchmarks/
-./bench.sh data tpch
-
-# for scale factor 10,
-./bench.sh data tpch10
+cd tpch/datagen
+./datagen.sh [scale factor (1/10)]
 ```
 
-## Running Queries
+The parquet files will be generated in `tpch/datagen/datafusion/benchmarks/data/tpch_sf[scale factor]`.
 
-1. Build the examples.
+3. Set these environment variables for optimized runtimes.
 ```bash
-cd cpp/examples
-./build.sh
+export KVIKIO_COMPAT_MODE="on"
+export LIBCUDF_CUFILE_POLICY="KVIKIO"
+export CUDA_MODULE_LOADING="EAGER"
 ```
-The TPC-H query binaries would be built inside `examples/tpch/build`.
 
-2. Execute the queries.
+4. Execute the queries.
 ```bash
-./tpch/build/tpch_q1
+./tpch/build/tpch_q[query no] [path to dataset] [memory resource type (cuda/pool/managed/managed_pool)]
 ```
-A parquet file named `q1.parquet` would be generated holding the results of the query.
+
+A parquet file named `q[query no].parquet` would be generated containing the results of the query.
diff --git a/cpp/examples/tpch/datagen/correct_datatypes.py b/cpp/examples/tpch/datagen/correct_datatypes.py
new file mode 100644
index 00000000000..8564774647b
--- /dev/null
+++ b/cpp/examples/tpch/datagen/correct_datatypes.py
@@ -0,0 +1,60 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import os
+import sys
+
+import pyarrow as pa
+import pyarrow.parquet as pq
+import pandas as pd
+
+if __name__ == "__main__":
+    dataset_path = str(sys.argv[1])
+    tables = ["lineitem", "part", "partsupp", "orders", "supplier", "customer", "nation", "region"]
+    for table in tables:
+        filepath = os.path.join(dataset_path, f"{table}.parquet")
+        print("Reading file ", filepath)
+
+        if filepath.endswith("lineitem.parquet"):
+            df = pd.read_parquet(filepath)
+            df["l_linenumber"] = df["l_linenumber"].astype("int64")
+            df["l_quantity"] = df["l_quantity"].astype("int64")
+            df["l_extendedprice"] = df["l_extendedprice"].astype("float64")
+            df["l_discount"] = df["l_discount"].astype("float64")
+            df["l_tax"] = df["l_tax"].astype("float64")
+            pq.write_table(pa.Table.from_pandas(df), filepath, compression="snappy")
+
+        elif filepath.endswith("part.parquet"):
+            df = pd.read_parquet(filepath)
+            df["p_size"] = df["p_size"].astype("int64")
+            df["p_retailprice"] = df["p_retailprice"].astype("float64")
+            pq.write_table(pa.Table.from_pandas(df), filepath, compression="snappy")
+
+        elif filepath.endswith("partsupp.parquet"):
+            df = pd.read_parquet(filepath)
+            df["ps_availqty"] = df["ps_availqty"].astype("int64")
+            df["ps_supplycost"] = df["ps_supplycost"].astype("float64")
+            pq.write_table(pa.Table.from_pandas(df), filepath, compression="snappy")
+
+        elif filepath.endswith("orders.parquet"):
+            df = pd.read_parquet(filepath)
+            df["o_totalprice"] = df["o_totalprice"].astype("float64")
+            df["o_shippriority"] = df["o_shippriority"].astype("int64")
+            pq.write_table(pa.Table.from_pandas(df), filepath, compression="snappy")
+
+        elif filepath.endswith("supplier.parquet"):
+            df = pd.read_parquet(filepath)
+            df["s_acctbal"] = df["s_acctbal"].astype("float64")
+            pq.write_table(pa.Table.from_pandas(df), filepath, compression="snappy")
+
+        elif filepath.endswith("customer.parquet"):
+            df = pd.read_parquet(filepath)
+            df["c_acctbal"] = df["c_acctbal"].astype("float64")
+            pq.write_table(pa.Table.from_pandas(df), filepath, compression="snappy")
+
+        elif filepath.endswith("nation.parquet"):
+            df = pd.read_parquet(filepath)
+            pq.write_table(pa.Table.from_pandas(df), filepath, compression="snappy")
+
+        elif filepath.endswith("region.parquet"):
+            df = pd.read_parquet(filepath)
+            pq.write_table(pa.Table.from_pandas(df), filepath, compression="snappy")
diff --git a/cpp/examples/tpch/datagen/datagen.sh b/cpp/examples/tpch/datagen/datagen.sh
new file mode 100755
index 00000000000..0b03753daea
--- /dev/null
+++ b/cpp/examples/tpch/datagen/datagen.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+set -e
+
+scale_factor=$1
+script_dir=$(pwd)
+
+# Clone the datafusion repository and apply a patch
+# for single threaded data generation so that a
+# single parquet file is generated for each table
+rm -rf datafusion
+git clone https://github.com/apache/datafusion.git datafusion
+cd datafusion/
+git checkout 679a85f
+git apply ${script_dir}/tpch.patch
+cd benchmarks/
+
+# Generate the data
+# Currently, we support only scale factor 1 and 10
+if [ ${scale_factor} -eq 1 ]; then
+    ./bench.sh data tpch
+elif [ ${scale_factor} -eq 10 ]; then
+    ./bench.sh data tpch10
+else
+    echo "Unsupported scale factor"
+    exit 1
+fi
+
+# Correct the datatypes of the parquet files
+python3 ${script_dir}/correct_datatypes.py data/tpch_sf${scale_factor}
diff --git a/cpp/examples/tpch/datagen/tpch.patch b/cpp/examples/tpch/datagen/tpch.patch
new file mode 100644
index 00000000000..42727aa9904
--- /dev/null
+++ b/cpp/examples/tpch/datagen/tpch.patch
@@ -0,0 +1,33 @@
+diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh
+index 3b854f6dc..f000f09c0 100755
+--- a/benchmarks/bench.sh
++++ b/benchmarks/bench.sh
+@@ -311,6 +311,15 @@ data_tpch() {
+         $CARGO_COMMAND --bin tpch -- convert --input "${TPCH_DIR}" --output "${TPCH_DIR}" --format parquet
+         popd > /dev/null
+     fi
++
++    cp ${TPCH_DIR}/lineitem/part-0.parquet ${TPCH_DIR}/lineitem.parquet
++    cp ${TPCH_DIR}/orders/part-0.parquet ${TPCH_DIR}/orders.parquet
++    cp ${TPCH_DIR}/part/part-0.parquet ${TPCH_DIR}/part.parquet
++    cp ${TPCH_DIR}/partsupp/part-0.parquet ${TPCH_DIR}/partsupp.parquet
++    cp ${TPCH_DIR}/customer/part-0.parquet ${TPCH_DIR}/customer.parquet
++    cp ${TPCH_DIR}/supplier/part-0.parquet ${TPCH_DIR}/supplier.parquet
++    cp ${TPCH_DIR}/nation/part-0.parquet ${TPCH_DIR}/nation.parquet
++    cp ${TPCH_DIR}/region/part-0.parquet ${TPCH_DIR}/region.parquet
+ }
+
+ # Runs the tpch benchmark
+diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs
+index b5204b343..84fd2e78d 100644
+--- a/datafusion/common/src/config.rs
++++ b/datafusion/common/src/config.rs
+@@ -250,7 +250,7 @@ config_namespace! {
+         /// concurrency.
+         ///
+         /// Defaults to the number of CPU cores on the system
+-        pub target_partitions: usize, default = num_cpus::get()
++        pub target_partitions: usize, default = 1
+
+         /// The default time zone
+         ///

From 4446cf0188c03b82cbec28493aa131027f25dffa Mon Sep 17 00:00:00 2001
From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com>
Date: Fri, 9 Aug 2024 12:43:23 -0500
Subject: [PATCH 645/842] Update json normalization to take device_buffer
 (#16520)

This change updates json normalization calls (quote and whitespace normalization) to take owning buffer of device_buffer as input rather than device_uvector. It makes it easy to hand over a string_column's char buffer to normalization calls.

Authors:
  - Karthikeyan (https://github.com/karthikeyann)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Shruti Shivakumar (https://github.com/shrshi)

URL: https://github.com/rapidsai/cudf/pull/16520
---
 cpp/include/cudf/io/detail/json.hpp           |  4 ++--
 cpp/src/io/json/json_normalization.cu         | 20 +++++++++----------
 cpp/src/io/json/read_json.cu                  | 16 +++++++--------
 .../io/json/json_quote_normalization_test.cpp |  9 ++++-----
 .../json_whitespace_normalization_test.cu     |  7 +++----
 5 files changed, 27 insertions(+), 29 deletions(-)

diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp
index 42b10a78ce8..38ba4f675c3 100644
--- a/cpp/include/cudf/io/detail/json.hpp
+++ b/cpp/include/cudf/io/detail/json.hpp
@@ -61,7 +61,7 @@ void write_json(data_sink* sink,
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource to use for device memory allocation
  */
-void normalize_single_quotes(datasource::owning_buffer<rmm::device_uvector<char>>& indata,
+void normalize_single_quotes(datasource::owning_buffer<rmm::device_buffer>& indata,
                              rmm::cuda_stream_view stream,
                              rmm::device_async_resource_ref mr);
 
@@ -72,7 +72,7 @@ void normalize_single_quotes(datasource::owning_buffer<rmm::device_uvector<char>
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource to use for device memory allocation
  */
-void normalize_whitespace(datasource::owning_buffer<rmm::device_uvector<char>>& indata,
+void normalize_whitespace(datasource::owning_buffer<rmm::device_buffer>& indata,
                           rmm::cuda_stream_view stream,
                           rmm::device_async_resource_ref mr);
 }  // namespace io::json::detail
diff --git a/cpp/src/io/json/json_normalization.cu b/cpp/src/io/json/json_normalization.cu
index 760b2214365..cb8b4e97ebb 100644
--- a/cpp/src/io/json/json_normalization.cu
+++ b/cpp/src/io/json/json_normalization.cu
@@ -298,7 +298,7 @@ struct TransduceToNormalizedWS {
 
 namespace detail {
 
-void normalize_single_quotes(datasource::owning_buffer<rmm::device_uvector<SymbolT>>& indata,
+void normalize_single_quotes(datasource::owning_buffer<rmm::device_buffer>& indata,
                              rmm::cuda_stream_view stream,
                              rmm::device_async_resource_ref mr)
 {
@@ -311,22 +311,22 @@ void normalize_single_quotes(datasource::owning_buffer<rmm::device_uvector<Symbo
                             normalize_quotes::TransduceToNormalizedQuotes{}),
                           stream);
 
-  rmm::device_uvector<SymbolT> outbuf(indata.size() * 2, stream, mr);
+  rmm::device_buffer outbuf(indata.size() * 2, stream, mr);
   rmm::device_scalar<SymbolOffsetT> outbuf_size(stream, mr);
-  parser.Transduce(indata.data(),
+  parser.Transduce(reinterpret_cast<SymbolT const*>(indata.data()),
                    static_cast<SymbolOffsetT>(indata.size()),
-                   outbuf.data(),
+                   static_cast<SymbolT*>(outbuf.data()),
                    thrust::make_discard_iterator(),
                    outbuf_size.data(),
                    normalize_quotes::start_state,
                    stream);
 
   outbuf.resize(outbuf_size.value(stream), stream);
-  datasource::owning_buffer<rmm::device_uvector<SymbolT>> outdata(std::move(outbuf));
+  datasource::owning_buffer<rmm::device_buffer> outdata(std::move(outbuf));
   std::swap(indata, outdata);
 }
 
-void normalize_whitespace(datasource::owning_buffer<rmm::device_uvector<SymbolT>>& indata,
+void normalize_whitespace(datasource::owning_buffer<rmm::device_buffer>& indata,
                           rmm::cuda_stream_view stream,
                           rmm::device_async_resource_ref mr)
 {
@@ -339,18 +339,18 @@ void normalize_whitespace(datasource::owning_buffer<rmm::device_uvector<SymbolT>
                             normalize_whitespace::TransduceToNormalizedWS{}),
                           stream);
 
-  rmm::device_uvector<SymbolT> outbuf(indata.size(), stream, mr);
+  rmm::device_buffer outbuf(indata.size(), stream, mr);
   rmm::device_scalar<SymbolOffsetT> outbuf_size(stream, mr);
-  parser.Transduce(indata.data(),
+  parser.Transduce(reinterpret_cast<SymbolT const*>(indata.data()),
                    static_cast<SymbolOffsetT>(indata.size()),
-                   outbuf.data(),
+                   static_cast<SymbolT*>(outbuf.data()),
                    thrust::make_discard_iterator(),
                    outbuf_size.data(),
                    normalize_whitespace::start_state,
                    stream);
 
   outbuf.resize(outbuf_size.value(stream), stream);
-  datasource::owning_buffer<rmm::device_uvector<SymbolT>> outdata(std::move(outbuf));
+  datasource::owning_buffer<rmm::device_buffer> outdata(std::move(outbuf));
   std::swap(indata, outdata);
 }
 
diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu
index 590f70864b1..e0d0497e0a2 100644
--- a/cpp/src/io/json/read_json.cu
+++ b/cpp/src/io/json/read_json.cu
@@ -168,7 +168,7 @@ size_t estimate_size_per_subchunk(size_t chunk_size)
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @returns Data source owning buffer enclosing the bytes read
  */
-datasource::owning_buffer<rmm::device_uvector<char>> get_record_range_raw_input(
+datasource::owning_buffer<rmm::device_buffer> get_record_range_raw_input(
   host_span<std::unique_ptr<datasource>> sources,
   json_reader_options const& reader_opts,
   rmm::cuda_stream_view stream)
@@ -200,8 +200,8 @@ datasource::owning_buffer<rmm::device_uvector<char>> get_record_range_raw_input(
       ? total_source_size * estimated_compression_ratio + header_size
       : std::min(total_source_size, chunk_size + num_subchunks_prealloced * size_per_subchunk) +
           num_extra_delimiters;
-  rmm::device_uvector<char> buffer(buffer_size, stream);
-  device_span<char> bufspan(buffer);
+  rmm::device_buffer buffer(buffer_size, stream);
+  device_span<char> bufspan(reinterpret_cast<char*>(buffer.data()), buffer.size());
 
   // Offset within buffer indicating first read position
   std::int64_t buffer_offset = 0;
@@ -213,8 +213,8 @@ datasource::owning_buffer<rmm::device_uvector<char>> get_record_range_raw_input(
     chunk_offset == 0 ? 0 : find_first_delimiter(readbufspan, '\n', stream);
   if (first_delim_pos == -1) {
     // return empty owning datasource buffer
-    auto empty_buf = rmm::device_uvector<char>(0, stream);
-    return datasource::owning_buffer<rmm::device_uvector<char>>(std::move(empty_buf));
+    auto empty_buf = rmm::device_buffer(0, stream);
+    return datasource::owning_buffer<rmm::device_buffer>(std::move(empty_buf));
   } else if (!should_load_all_sources) {
     // Find next delimiter
     std::int64_t next_delim_pos = -1;
@@ -232,12 +232,12 @@ datasource::owning_buffer<rmm::device_uvector<char>> get_record_range_raw_input(
     }
     if (next_delim_pos < buffer_offset) next_delim_pos = buffer_offset + readbufspan.size();
 
-    return datasource::owning_buffer<rmm::device_uvector<char>>(
+    return datasource::owning_buffer<rmm::device_buffer>(
       std::move(buffer),
       reinterpret_cast<uint8_t*>(buffer.data()) + first_delim_pos + shift_for_nonzero_offset,
       next_delim_pos - first_delim_pos - shift_for_nonzero_offset);
   }
-  return datasource::owning_buffer<rmm::device_uvector<char>>(
+  return datasource::owning_buffer<rmm::device_buffer>(
     std::move(buffer),
     reinterpret_cast<uint8_t*>(buffer.data()) + first_delim_pos + shift_for_nonzero_offset,
     readbufspan.size() - first_delim_pos - shift_for_nonzero_offset);
@@ -249,7 +249,7 @@ table_with_metadata read_batch(host_span<std::unique_ptr<datasource>> sources,
                                rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  datasource::owning_buffer<rmm::device_uvector<char>> bufview =
+  datasource::owning_buffer<rmm::device_buffer> bufview =
     get_record_range_raw_input(sources, reader_opts, stream);
 
   // If input JSON buffer has single quotes and option to normalize single quotes is enabled,
diff --git a/cpp/tests/io/json/json_quote_normalization_test.cpp b/cpp/tests/io/json/json_quote_normalization_test.cpp
index 55ad0afe499..3a9ba8d9f3b 100644
--- a/cpp/tests/io/json/json_quote_normalization_test.cpp
+++ b/cpp/tests/io/json/json_quote_normalization_test.cpp
@@ -26,7 +26,7 @@
 #include <cudf/io/json.hpp>
 #include <cudf/io/types.hpp>
 
-#include <rmm/device_uvector.hpp>
+#include <rmm/device_buffer.hpp>
 #include <rmm/mr/device/cuda_memory_resource.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
 
@@ -42,12 +42,11 @@ void run_test(std::string const& host_input, std::string const& expected_host_ou
     std::make_shared<rmm::mr::cuda_memory_resource>();
 
   auto stream_view  = cudf::test::get_default_stream();
-  auto device_input = cudf::detail::make_device_uvector_async(
-    host_input, stream_view, rmm::mr::get_current_device_resource());
+  auto device_input = rmm::device_buffer(
+    host_input.c_str(), host_input.size(), stream_view, rmm::mr::get_current_device_resource());
 
   // Preprocessing FST
-  cudf::io::datasource::owning_buffer<rmm::device_uvector<char>> device_data(
-    std::move(device_input));
+  cudf::io::datasource::owning_buffer<rmm::device_buffer> device_data(std::move(device_input));
   cudf::io::json::detail::normalize_single_quotes(device_data, stream_view, rsc.get());
 
   std::string preprocessed_host_output(device_data.size(), 0);
diff --git a/cpp/tests/io/json/json_whitespace_normalization_test.cu b/cpp/tests/io/json/json_whitespace_normalization_test.cu
index 8ed5fa81b12..01dd17fab98 100644
--- a/cpp/tests/io/json/json_whitespace_normalization_test.cu
+++ b/cpp/tests/io/json/json_whitespace_normalization_test.cu
@@ -38,12 +38,11 @@ void run_test(std::string const& host_input, std::string const& expected_host_ou
   // Prepare cuda stream for data transfers & kernels
   auto stream_view = cudf::test::get_default_stream();
 
-  auto device_input = cudf::detail::make_device_uvector_async(
-    host_input, stream_view, rmm::mr::get_current_device_resource());
+  auto device_input = rmm::device_buffer(
+    host_input.c_str(), host_input.size(), stream_view, rmm::mr::get_current_device_resource());
 
   // Preprocessing FST
-  cudf::io::datasource::owning_buffer<rmm::device_uvector<char>> device_data(
-    std::move(device_input));
+  cudf::io::datasource::owning_buffer<rmm::device_buffer> device_data(std::move(device_input));
   cudf::io::json::detail::normalize_whitespace(
     device_data, stream_view, rmm::mr::get_current_device_resource());
 

From 16aa0eaa54d00d88f897766d91f9e531f64b0070 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 9 Aug 2024 09:33:19 -1000
Subject: [PATCH 646/842] Allow DataFrame.sort_values(by=) to select an index
 level (#16519)

closes #14794

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Matthew Murray (https://github.com/Matt711)

URL: https://github.com/rapidsai/cudf/pull/16519
---
 python/cudf/cudf/core/index.py         | 13 ++++++++++++-
 python/cudf/cudf/core/indexed_frame.py | 26 +++++++++++++++++++++++++-
 python/cudf/cudf/tests/test_sorting.py | 20 ++++++++++++++++++++
 3 files changed, 57 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 094da09ab08..7f40428c1b8 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -5,6 +5,7 @@
 import operator
 import pickle
 import warnings
+from collections.abc import Hashable
 from functools import cache, cached_property
 from numbers import Number
 from typing import TYPE_CHECKING, Any, Literal, MutableMapping, cast
@@ -60,7 +61,7 @@
 from cudf.utils.utils import _warn_no_dask_cudf, search_range
 
 if TYPE_CHECKING:
-    from collections.abc import Generator, Hashable, Iterable
+    from collections.abc import Generator, Iterable
     from datetime import tzinfo
 
 
@@ -450,6 +451,16 @@ def __getitem__(self, index):
             return self.start + index * self.step
         return self._as_int_index()[index]
 
+    def _get_columns_by_label(self, labels) -> Index:
+        # used in .sort_values
+        if isinstance(labels, Hashable):
+            if labels == self.name:
+                return self._as_int_index()
+        elif is_list_like(labels):
+            if list(self.names) == list(labels):
+                return self._as_int_index()
+        raise KeyError(labels)
+
     @_performance_tracking
     def equals(self, other) -> bool:
         if isinstance(other, RangeIndex):
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 24d947a574a..3b44a0f5864 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -3592,10 +3592,34 @@ def sort_values(
         if len(self) == 0:
             return self
 
+        try:
+            by_in_columns = self._get_columns_by_label(by)
+        except KeyError:
+            by_in_columns = None
+        if self.ndim == 1:
+            # For Series case, we're never selecting an index level.
+            by_in_index = None
+        else:
+            try:
+                by_in_index = self.index._get_columns_by_label(by)
+            except KeyError:
+                by_in_index = None
+
+        if by_in_columns is not None and by_in_index is not None:
+            raise ValueError(
+                f"{by=} appears in the {type(self).__name__} columns "
+                "and as an index level which is ambiguous."
+            )
+        elif by_in_columns is not None:
+            by_columns = by_in_columns
+        elif by_in_index is not None:
+            by_columns = by_in_index
+        else:
+            raise KeyError(by)
         # argsort the `by` column
         out = self._gather(
             GatherMap.from_column_unchecked(
-                self._get_columns_by_label(by)._get_sorted_inds(
+                by_columns._get_sorted_inds(
                     ascending=ascending, na_position=na_position
                 ),
                 len(self),
diff --git a/python/cudf/cudf/tests/test_sorting.py b/python/cudf/cudf/tests/test_sorting.py
index a8ffce6e88b..2cf2259d9ec 100644
--- a/python/cudf/cudf/tests/test_sorting.py
+++ b/python/cudf/cudf/tests/test_sorting.py
@@ -405,3 +405,23 @@ def test_dataframe_scatter_by_map_empty():
     df = DataFrame({"a": [], "b": []}, dtype="float64")
     scattered = df.scatter_by_map(df["a"])
     assert len(scattered) == 0
+
+
+def test_sort_values_by_index_level():
+    df = pd.DataFrame({"a": [1, 3, 2]}, index=pd.Index([1, 3, 2], name="b"))
+    cudf_df = DataFrame.from_pandas(df)
+    result = cudf_df.sort_values("b")
+    expected = df.sort_values("b")
+    assert_eq(result, expected)
+
+
+def test_sort_values_by_ambiguous():
+    df = pd.DataFrame({"a": [1, 3, 2]}, index=pd.Index([1, 3, 2], name="a"))
+    cudf_df = DataFrame.from_pandas(df)
+
+    assert_exceptions_equal(
+        lfunc=df.sort_values,
+        rfunc=cudf_df.sort_values,
+        lfunc_args_and_kwargs=(["a"], {}),
+        rfunc_args_and_kwargs=(["a"], {}),
+    )

From 4cd87d3fdb0de6154504f8486ed49b685a9dceec Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 9 Aug 2024 09:33:53 -1000
Subject: [PATCH 647/842] Fix `date_range(start, end, freq)` when end-start is
 divisible by freq (#16516)

xref https://github.com/rapidsai/cudf/issues/16507

`date_range` generates its dates via `range`, and the end of this range was calculated via `math.ceil((end - start) / freq)`. If `(end - start) / freq` did not produce a remainder, `math.ceil` would not correctly increment this value by `1` to capture the last date.

Instead, this PR uses `math.floor((end - start) / freq) + 1` to always ensure the last date is captured

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16516
---
 python/cudf/cudf/core/index.py           | 6 ++++--
 python/cudf/cudf/core/series.py          | 3 +++
 python/cudf/cudf/core/tools/datetimes.py | 9 +++++----
 python/cudf/cudf/tests/test_datetime.py  | 6 ++++++
 4 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 7f40428c1b8..3eab27bd165 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -2414,11 +2414,13 @@ def day_name(self, locale: str | None = None) -> Index:
         >>> datetime_index = cudf.date_range("2016-12-31", "2017-01-08", freq="D")
         >>> datetime_index
         DatetimeIndex(['2016-12-31', '2017-01-01', '2017-01-02', '2017-01-03',
-                       '2017-01-04', '2017-01-05', '2017-01-06', '2017-01-07'],
+                       '2017-01-04', '2017-01-05', '2017-01-06', '2017-01-07',
+                       '2017-01-08'],
                       dtype='datetime64[ns]', freq='D')
         >>> datetime_index.day_name()
         Index(['Saturday', 'Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday',
-               'Friday', 'Saturday'], dtype='object')
+               'Friday', 'Saturday', 'Sunday'],
+              dtype='object')
         """
         day_names = self._column.get_day_names(locale)
         return Index._from_data({self.name: day_names})
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index de57ac5f290..53675d339ac 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -801,14 +801,17 @@ def dt(self):
         >>> s.dt.hour
         0    12
         1    13
+        2    14
         dtype: int16
         >>> s.dt.second
         0    0
         1    0
+        2    0
         dtype: int16
         >>> s.dt.day
         0    3
         1    3
+        2    3
         dtype: int16
 
         Returns
diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py
index 2f77778116f..c50a36b68b5 100644
--- a/python/cudf/cudf/core/tools/datetimes.py
+++ b/python/cudf/cudf/core/tools/datetimes.py
@@ -951,7 +951,7 @@ def date_range(
         end = cudf.Scalar(end, dtype=dtype)
         _is_increment_sequence = end >= start
 
-        periods = math.ceil(
+        periods = math.floor(
             int(end - start) / _offset_to_nanoseconds_lower_bound(offset)
         )
 
@@ -959,9 +959,10 @@ def date_range(
             # Mismatched sign between (end-start) and offset, return empty
             # column
             periods = 0
-        elif periods == 0:
-            # end == start, return exactly 1 timestamp (start)
-            periods = 1
+        else:
+            # If end == start, periods == 0 and we return exactly 1 timestamp (start).
+            # Otherwise, since closed="both", we ensure the end point is included.
+            periods += 1
 
     # We compute `end_estim` (the estimated upper bound of the date
     # range) below, but don't always use it.  We do this to ensure
diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py
index 6bc775d2a2c..7be4faa42c3 100644
--- a/python/cudf/cudf/tests/test_datetime.py
+++ b/python/cudf/cudf/tests/test_datetime.py
@@ -2536,3 +2536,9 @@ def test_dti_methods(method, kwargs):
     result = getattr(cudf_dti, method)(**kwargs)
     expected = getattr(pd_dti, method)(**kwargs)
     assert_eq(result, expected)
+
+
+def test_date_range_start_end_divisible_by_freq():
+    result = cudf.date_range("2011-01-01", "2011-01-02", freq="h")
+    expected = pd.date_range("2011-01-01", "2011-01-02", freq="h")
+    assert_eq(result, expected)

From 45b20d135a290d5f14e291316e94674653f71737 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 9 Aug 2024 12:22:15 -1000
Subject: [PATCH 648/842] Preserve array name in MultiIndex.from_arrays
 (#16515)

xref https://github.com/rapidsai/cudf/issues/16507

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Matthew Murray (https://github.com/Matt711)

URL: https://github.com/rapidsai/cudf/pull/16515
---
 python/cudf/cudf/core/multiindex.py       |  4 ++++
 python/cudf/cudf/tests/test_multiindex.py | 10 ++++++++++
 2 files changed, 14 insertions(+)

diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index 9646b34830f..ab88b191570 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -1394,12 +1394,16 @@ def from_arrays(
             raise TypeError(error_msg)
         codes = []
         levels = []
+        names_from_arrays = []
         for array in arrays:
             if not (is_list_like(array) or is_column_like(array)):
                 raise TypeError(error_msg)
             code, level = factorize(array, sort=True)
             codes.append(code)
             levels.append(level)
+            names_from_arrays.append(getattr(array, "name", None))
+        if names is None:
+            names = names_from_arrays
         return cls(
             codes=codes, levels=levels, sortorder=sortorder, names=names
         )
diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py
index b7314a36e73..a68f4574da3 100644
--- a/python/cudf/cudf/tests/test_multiindex.py
+++ b/python/cudf/cudf/tests/test_multiindex.py
@@ -2179,3 +2179,13 @@ def test_unique_level():
     result = pd_mi.unique(level=1)
     expected = cudf_mi.unique(level=1)
     assert_eq(result, expected)
+
+
+@pytest.mark.parametrize(
+    "idx", [pd.Index, pd.CategoricalIndex, pd.DatetimeIndex, pd.TimedeltaIndex]
+)
+def test_from_arrays_infer_names(idx):
+    arrays = [idx([1], name="foo"), idx([2], name="bar")]
+    expected = pd.MultiIndex.from_arrays(arrays)
+    result = cudf.MultiIndex.from_arrays(arrays)
+    assert_eq(result, expected)

From a3dc14fcea938729c7c9468bd6a64331395b2f78 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 12 Aug 2024 07:56:48 -1000
Subject: [PATCH 649/842] Disallow indexing by selecting duplicate labels
 (#16514)

xref https://github.com/rapidsai/cudf/issues/16507

I would say this was a bug before because we would silently return a new DataFrame with just `len(set(column_labels))` when selecting by column. Now this operation raises since duplicate column labels are generally not supported.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - https://github.com/brandon-b-miller

URL: https://github.com/rapidsai/cudf/pull/16514
---
 python/cudf/cudf/core/column_accessor.py | 4 ++++
 python/cudf/cudf/tests/test_indexing.py  | 8 ++++++++
 2 files changed, 12 insertions(+)

diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py
index 819d351b2c4..83596704672 100644
--- a/python/cudf/cudf/core/column_accessor.py
+++ b/python/cudf/cudf/core/column_accessor.py
@@ -530,6 +530,10 @@ def _select_by_label_list_like(self, key: Any) -> ColumnAccessor:
             )
         else:
             data = {k: self._grouped_data[k] for k in key}
+            if len(data) != len(key):
+                raise ValueError(
+                    "Selecting duplicate column labels is not supported."
+                )
         if self.multiindex:
             data = dict(_to_flat_dict_inner(data))
         return self.__class__(
diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py
index 7005cbc6834..716b4dc6acd 100644
--- a/python/cudf/cudf/tests/test_indexing.py
+++ b/python/cudf/cudf/tests/test_indexing.py
@@ -2361,3 +2361,11 @@ def test_sliced_categorical_as_ordered():
         name="a",
     )
     assert_eq(result, expected)
+
+
+def test_duplicate_labels_raises():
+    df = cudf.DataFrame([[1, 2]], columns=["a", "b"])
+    with pytest.raises(ValueError):
+        df[["a", "a"]]
+    with pytest.raises(ValueError):
+        df.loc[:, ["a", "a"]]

From 091cb72294a394deb176600e74c7cb115cfff05a Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Mon, 12 Aug 2024 14:48:02 -0400
Subject: [PATCH 650/842] Remove deprecated public APIs from libcudf (#16524)

Removing some more deprecated public libcudf APIs.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Karthikeyan (https://github.com/karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/16524
---
 cpp/include/cudf/strings/replace.hpp       | 12 ------------
 cpp/include/cudf/utilities/type_checks.hpp | 19 -------------------
 cpp/src/strings/replace/multi.cu           | 11 -----------
 cpp/src/utilities/type_checks.cpp          |  5 -----
 4 files changed, 47 deletions(-)

diff --git a/cpp/include/cudf/strings/replace.hpp b/cpp/include/cudf/strings/replace.hpp
index 5b4ffb98f99..f450b77ad7a 100644
--- a/cpp/include/cudf/strings/replace.hpp
+++ b/cpp/include/cudf/strings/replace.hpp
@@ -160,18 +160,6 @@ std::unique_ptr<column> replace_multiple(
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
-/**
- * @copydoc cudf::strings::replace_multiple
- *
- * @deprecated since 24.08
- */
-[[deprecated]] std::unique_ptr<column> replace(
-  strings_column_view const& input,
-  strings_column_view const& targets,
-  strings_column_view const& repls,
-  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
-
 /** @} */  // end of doxygen group
 }  // namespace strings
 }  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/utilities/type_checks.hpp b/cpp/include/cudf/utilities/type_checks.hpp
index 4fcbca09d17..aeb5db57830 100644
--- a/cpp/include/cudf/utilities/type_checks.hpp
+++ b/cpp/include/cudf/utilities/type_checks.hpp
@@ -22,25 +22,6 @@
 
 namespace CUDF_EXPORT cudf {
 
-/**
- * @brief Compare the types of two `column_view`s
- *
- * @deprecated Since 24.06. Use cudf::have_same_types instead.
- *
- * This function returns true if the type of `lhs` equals that of `rhs`.
- * - For fixed point types, the scale is compared.
- * - For dictionary types, the type of the keys are compared if both are
- *   non-empty columns.
- * - For lists types, the type of child columns are compared recursively.
- * - For struct types, the type of each field are compared in order.
- * - For all other types, the `id` of `data_type` is compared.
- *
- * @param lhs The first `column_view` to compare
- * @param rhs The second `column_view` to compare
- * @return true if column types match
- */
-[[deprecated]] bool column_types_equal(column_view const& lhs, column_view const& rhs);
-
 /**
  * @brief Compare the type IDs of two `column_view`s
  *
diff --git a/cpp/src/strings/replace/multi.cu b/cpp/src/strings/replace/multi.cu
index 2ca22f0e017..b5248700d53 100644
--- a/cpp/src/strings/replace/multi.cu
+++ b/cpp/src/strings/replace/multi.cu
@@ -533,16 +533,5 @@ std::unique_ptr<column> replace_multiple(strings_column_view const& strings,
   return detail::replace_multiple(strings, targets, repls, stream, mr);
 }
 
-// deprecated in 24.08
-std::unique_ptr<column> replace(strings_column_view const& strings,
-                                strings_column_view const& targets,
-                                strings_column_view const& repls,
-                                rmm::cuda_stream_view stream,
-                                rmm::device_async_resource_ref mr)
-{
-  CUDF_FUNC_RANGE();
-  return detail::replace_multiple(strings, targets, repls, stream, mr);
-}
-
 }  // namespace strings
 }  // namespace cudf
diff --git a/cpp/src/utilities/type_checks.cpp b/cpp/src/utilities/type_checks.cpp
index dac981fb532..3095b342748 100644
--- a/cpp/src/utilities/type_checks.cpp
+++ b/cpp/src/utilities/type_checks.cpp
@@ -139,11 +139,6 @@ bool have_same_types(column_view const& lhs, column_view const& rhs)
   return type_dispatcher(lhs.type(), columns_equal_fn{}, lhs, rhs);
 }
 
-bool column_types_equal(column_view const& lhs, column_view const& rhs)
-{
-  return have_same_types(lhs, rhs);
-}
-
 bool have_same_types(column_view const& lhs, scalar const& rhs)
 {
   return type_dispatcher(lhs.type(), column_scalar_equal_fn{}, lhs, rhs);

From cce00c00b0ae374ee72332aaea5fcd1cc121e85a Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar <shruti.shivakumar@gmail.com>
Date: Mon, 12 Aug 2024 14:38:37 -0700
Subject: [PATCH 651/842] Pass batch size to JSON reader using environment
 variable (#16502)

The JSON reader set the batch size to `INT_MAX` bytes since the motivation for implementing a batched JSON reader was to parse source files whose total size is larger than `INT_MAX` (#16138, #16162). However, we can use a much smaller batch size to evaluate the correctness of the reader and speed up tests significantly.
This PR focuses on reducing runtime of the batched reader test by setting the batch size to be used by the reader as an environment variable.
The runtime of `JsonLargeReaderTest.MultiBatch` in `LARGE_STRINGS_TEST` gtest  drops from ~52s to ~3s.

Authors:
  - Shruti Shivakumar (https://github.com/shrshi)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - David Wendt (https://github.com/davidwendt)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16502
---
 cpp/CMakeLists.txt                    |   1 -
 cpp/src/io/json/byte_range_info.cu    |  37 ----
 cpp/src/io/json/read_json.cu          | 291 +++++++++++++++-----------
 cpp/src/io/json/read_json.hpp         |  28 ++-
 cpp/tests/large_strings/json_tests.cu |  20 +-
 5 files changed, 204 insertions(+), 173 deletions(-)
 delete mode 100644 cpp/src/io/json/byte_range_info.cu

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 310bc99b279..eeafc411874 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -392,7 +392,6 @@ add_library(
   src/io/csv/reader_impl.cu
   src/io/csv/writer_impl.cu
   src/io/functions.cpp
-  src/io/json/byte_range_info.cu
   src/io/json/json_column.cu
   src/io/json/json_normalization.cu
   src/io/json/json_tree.cu
diff --git a/cpp/src/io/json/byte_range_info.cu b/cpp/src/io/json/byte_range_info.cu
deleted file mode 100644
index 258a40b0dd3..00000000000
--- a/cpp/src/io/json/byte_range_info.cu
+++ /dev/null
@@ -1,37 +0,0 @@
-/*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cudf/types.hpp>
-#include <cudf/utilities/span.hpp>
-
-#include <rmm/cuda_stream_view.hpp>
-#include <rmm/exec_policy.hpp>
-
-#include <thrust/find.h>
-
-namespace cudf::io::json::detail {
-
-// Extract the first character position in the string.
-size_type find_first_delimiter(device_span<char const> d_data,
-                               char const delimiter,
-                               rmm::cuda_stream_view stream)
-{
-  auto const first_delimiter_position =
-    thrust::find(rmm::exec_policy(stream), d_data.begin(), d_data.end(), delimiter);
-  return first_delimiter_position != d_data.end() ? first_delimiter_position - d_data.begin() : -1;
-}
-
-}  // namespace cudf::io::json::detail
diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu
index e0d0497e0a2..2658cbbed2f 100644
--- a/cpp/src/io/json/read_json.cu
+++ b/cpp/src/io/json/read_json.cu
@@ -31,6 +31,7 @@
 #include <rmm/exec_policy.hpp>
 #include <rmm/resource_ref.hpp>
 
+#include <thrust/distance.h>
 #include <thrust/iterator/constant_iterator.h>
 #include <thrust/scatter.h>
 
@@ -38,11 +39,14 @@
 
 namespace cudf::io::json::detail {
 
-size_t sources_size(host_span<std::unique_ptr<datasource>> const sources,
-                    size_t range_offset,
-                    size_t range_size)
+namespace {
+
+// Return total size of sources enclosing the passed range
+std::size_t sources_size(host_span<std::unique_ptr<datasource>> const sources,
+                         std::size_t range_offset,
+                         std::size_t range_size)
 {
-  return std::accumulate(sources.begin(), sources.end(), 0ul, [=](size_t sum, auto& source) {
+  return std::accumulate(sources.begin(), sources.end(), 0ul, [=](std::size_t sum, auto& source) {
     auto const size = source->size();
     // TODO take care of 0, 0, or *, 0 case.
     return sum +
@@ -50,109 +54,55 @@ size_t sources_size(host_span<std::unique_ptr<datasource>> const sources,
   });
 }
 
+// Return estimated size of subchunk using a heuristic involving the byte range size and the minimum
+// subchunk size
+std::size_t estimate_size_per_subchunk(std::size_t chunk_size)
+{
+  auto geometric_mean = [](double a, double b) { return std::sqrt(a * b); };
+  // NOTE: heuristic for choosing subchunk size: geometric mean of minimum subchunk size (set to
+  // 10kb) and the byte range size
+  return geometric_mean(std::ceil(static_cast<double>(chunk_size) / num_subchunks),
+                        min_subchunk_size);
+}
+
 /**
- * @brief Read from array of data sources into RMM buffer. The size of the returned device span
-          can be larger than the number of bytes requested from the list of sources when
-          the range to be read spans across multiple sources. This is due to the delimiter
-          characters inserted after the end of each accessed source.
+ * @brief Return the upper bound on the batch size for the JSON reader.
  *
- * @param buffer Device span buffer to which data is read
- * @param sources Array of data sources
- * @param compression Compression format of source
- * @param range_offset Number of bytes to skip from source start
- * @param range_size Number of bytes to read from source
- * @param stream CUDA stream used for device memory operations and kernel launches
- * @returns A subspan of the input device span containing data read
+ * The datasources passed to the JSON reader are split into batches demarcated by byte range
+ * offsets and read iteratively. The batch size is capped at INT_MAX bytes, which is the
+ * default value returned by the function. This value can be overridden at runtime using the
+ * environment variable LIBCUDF_JSON_BATCH_SIZE
+ *
+ * @return size in bytes
  */
-device_span<char> ingest_raw_input(device_span<char> buffer,
-                                   host_span<std::unique_ptr<datasource>> sources,
-                                   compression_type compression,
-                                   size_t range_offset,
-                                   size_t range_size,
-                                   rmm::cuda_stream_view stream)
+std::size_t get_batch_size_upper_bound()
 {
-  CUDF_FUNC_RANGE();
-  // We append a line delimiter between two files to make sure the last line of file i and the first
-  // line of file i+1 don't end up on the same JSON line, if file i does not already end with a line
-  // delimiter.
-  auto constexpr num_delimiter_chars = 1;
-
-  if (compression == compression_type::NONE) {
-    auto delimiter_map = cudf::detail::make_empty_host_vector<size_t>(sources.size(), stream);
-    std::vector<size_t> prefsum_source_sizes(sources.size());
-    std::vector<std::unique_ptr<datasource::buffer>> h_buffers;
-    size_t bytes_read = 0;
-    std::transform_inclusive_scan(sources.begin(),
-                                  sources.end(),
-                                  prefsum_source_sizes.begin(),
-                                  std::plus<size_t>{},
-                                  [](std::unique_ptr<datasource> const& s) { return s->size(); });
-    auto upper =
-      std::upper_bound(prefsum_source_sizes.begin(), prefsum_source_sizes.end(), range_offset);
-    size_t start_source = std::distance(prefsum_source_sizes.begin(), upper);
-
-    auto const total_bytes_to_read =
-      std::min(range_size, prefsum_source_sizes.back() - range_offset);
-    range_offset -= start_source ? prefsum_source_sizes[start_source - 1] : 0;
-    for (size_t i = start_source; i < sources.size() && bytes_read < total_bytes_to_read; i++) {
-      if (sources[i]->is_empty()) continue;
-      auto data_size =
-        std::min(sources[i]->size() - range_offset, total_bytes_to_read - bytes_read);
-      auto destination = reinterpret_cast<uint8_t*>(buffer.data()) + bytes_read +
-                         (num_delimiter_chars * delimiter_map.size());
-      if (sources[i]->is_device_read_preferred(data_size)) {
-        bytes_read += sources[i]->device_read(range_offset, data_size, destination, stream);
-      } else {
-        h_buffers.emplace_back(sources[i]->host_read(range_offset, data_size));
-        auto const& h_buffer = h_buffers.back();
-        CUDF_CUDA_TRY(cudaMemcpyAsync(
-          destination, h_buffer->data(), h_buffer->size(), cudaMemcpyHostToDevice, stream.value()));
-        bytes_read += h_buffer->size();
-      }
-      range_offset = 0;
-      delimiter_map.push_back(bytes_read + (num_delimiter_chars * delimiter_map.size()));
-    }
-    // Removing delimiter inserted after last non-empty source is read
-    if (!delimiter_map.empty()) { delimiter_map.pop_back(); }
-
-    // If this is a multi-file source, we scatter the JSON line delimiters between files
-    if (sources.size() > 1) {
-      static_assert(num_delimiter_chars == 1,
-                    "Currently only single-character delimiters are supported");
-      auto const delimiter_source = thrust::make_constant_iterator('\n');
-      auto const d_delimiter_map  = cudf::detail::make_device_uvector_async(
-        delimiter_map, stream, rmm::mr::get_current_device_resource());
-      thrust::scatter(rmm::exec_policy_nosync(stream),
-                      delimiter_source,
-                      delimiter_source + d_delimiter_map.size(),
-                      d_delimiter_map.data(),
-                      buffer.data());
-    }
-    stream.synchronize();
-    return buffer.first(bytes_read + (delimiter_map.size() * num_delimiter_chars));
-  }
-  // TODO: allow byte range reading from multiple compressed files.
-  auto remaining_bytes_to_read = std::min(range_size, sources[0]->size() - range_offset);
-  auto hbuffer                 = std::vector<uint8_t>(remaining_bytes_to_read);
-  // Single read because only a single compressed source is supported
-  // Reading to host because decompression of a single block is much faster on the CPU
-  sources[0]->host_read(range_offset, remaining_bytes_to_read, hbuffer.data());
-  auto uncomp_data = decompress(compression, hbuffer);
-  CUDF_CUDA_TRY(cudaMemcpyAsync(buffer.data(),
-                                reinterpret_cast<char*>(uncomp_data.data()),
-                                uncomp_data.size() * sizeof(char),
-                                cudaMemcpyHostToDevice,
-                                stream.value()));
-  stream.synchronize();
-  return buffer.first(uncomp_data.size());
+  auto const batch_size_str         = std::getenv("LIBCUDF_JSON_BATCH_SIZE");
+  int64_t const batch_size          = batch_size_str != nullptr ? std::atol(batch_size_str) : 0L;
+  auto const batch_limit            = static_cast<int64_t>(std::numeric_limits<int32_t>::max());
+  auto const batch_size_upper_bound = static_cast<std::size_t>(
+    (batch_size > 0 && batch_size < batch_limit) ? batch_size : batch_limit);
+  return batch_size_upper_bound;
 }
 
-size_t estimate_size_per_subchunk(size_t chunk_size)
+/**
+ * @brief Extract the first delimiter character position in the string
+ *
+ * @param d_data Device span in which to search for delimiter character
+ * @param delimiter Delimiter character to search for
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ *
+ * @return Position of first delimiter character in device array
+ */
+size_type find_first_delimiter(device_span<char const> d_data,
+                               char const delimiter,
+                               rmm::cuda_stream_view stream)
 {
-  auto geometric_mean = [](double a, double b) { return std::sqrt(a * b); };
-  // NOTE: heuristic for choosing subchunk size: geometric mean of minimum subchunk size (set to
-  // 10kb) and the byte range size
-  return geometric_mean(std::ceil((double)chunk_size / num_subchunks), min_subchunk_size);
+  auto const first_delimiter_position =
+    thrust::find(rmm::exec_policy(stream), d_data.begin(), d_data.end(), delimiter);
+  return first_delimiter_position != d_data.end()
+           ? static_cast<size_type>(thrust::distance(d_data.begin(), first_delimiter_position))
+           : -1;
 }
 
 /**
@@ -175,12 +125,12 @@ datasource::owning_buffer<rmm::device_buffer> get_record_range_raw_input(
 {
   CUDF_FUNC_RANGE();
 
-  size_t const total_source_size            = sources_size(sources, 0, 0);
+  std::size_t const total_source_size       = sources_size(sources, 0, 0);
   auto constexpr num_delimiter_chars        = 1;
   auto const num_extra_delimiters           = num_delimiter_chars * (sources.size() - 1);
   compression_type const reader_compression = reader_opts.get_compression();
-  size_t const chunk_offset                 = reader_opts.get_byte_range_offset();
-  size_t chunk_size                         = reader_opts.get_byte_range_size();
+  std::size_t const chunk_offset            = reader_opts.get_byte_range_offset();
+  std::size_t chunk_size                    = reader_opts.get_byte_range_size();
 
   CUDF_EXPECTS(total_source_size ? chunk_offset < total_source_size : !chunk_offset,
                "Invalid offsetting",
@@ -188,14 +138,14 @@ datasource::owning_buffer<rmm::device_buffer> get_record_range_raw_input(
   auto should_load_all_sources = !chunk_size || chunk_size >= total_source_size - chunk_offset;
   chunk_size = should_load_all_sources ? total_source_size - chunk_offset : chunk_size;
 
-  int const num_subchunks_prealloced = should_load_all_sources ? 0 : max_subchunks_prealloced;
-  size_t const size_per_subchunk     = estimate_size_per_subchunk(chunk_size);
+  int const num_subchunks_prealloced  = should_load_all_sources ? 0 : max_subchunks_prealloced;
+  std::size_t const size_per_subchunk = estimate_size_per_subchunk(chunk_size);
 
   // The allocation for single source compressed input is estimated by assuming a ~4:1
   // compression ratio. For uncompressed inputs, we can getter a better estimate using the idea
   // of subchunks.
   auto constexpr header_size = 4096;
-  size_t const buffer_size =
+  std::size_t const buffer_size =
     reader_compression != compression_type::NONE
       ? total_source_size * estimated_compression_ratio + header_size
       : std::min(total_source_size, chunk_size + num_subchunks_prealloced * size_per_subchunk) +
@@ -217,8 +167,8 @@ datasource::owning_buffer<rmm::device_buffer> get_record_range_raw_input(
     return datasource::owning_buffer<rmm::device_buffer>(std::move(empty_buf));
   } else if (!should_load_all_sources) {
     // Find next delimiter
-    std::int64_t next_delim_pos = -1;
-    size_t next_subchunk_start  = chunk_offset + chunk_size;
+    std::int64_t next_delim_pos     = -1;
+    std::size_t next_subchunk_start = chunk_offset + chunk_size;
     while (next_subchunk_start < total_source_size && next_delim_pos < buffer_offset) {
       buffer_offset += readbufspan.size();
       readbufspan    = ingest_raw_input(bufspan.last(buffer_size - buffer_offset),
@@ -243,6 +193,8 @@ datasource::owning_buffer<rmm::device_buffer> get_record_range_raw_input(
     readbufspan.size() - first_delim_pos - shift_for_nonzero_offset);
 }
 
+// Helper function to read the current batch using byte range offsets and size
+// passed
 table_with_metadata read_batch(host_span<std::unique_ptr<datasource>> sources,
                                json_reader_options const& reader_opts,
                                rmm::cuda_stream_view stream,
@@ -270,6 +222,92 @@ table_with_metadata read_batch(host_span<std::unique_ptr<datasource>> sources,
   return device_parse_nested_json(buffer, reader_opts, stream, mr);
 }
 
+}  // anonymous namespace
+
+device_span<char> ingest_raw_input(device_span<char> buffer,
+                                   host_span<std::unique_ptr<datasource>> sources,
+                                   compression_type compression,
+                                   std::size_t range_offset,
+                                   std::size_t range_size,
+                                   rmm::cuda_stream_view stream)
+{
+  CUDF_FUNC_RANGE();
+  // We append a line delimiter between two files to make sure the last line of file i and the first
+  // line of file i+1 don't end up on the same JSON line, if file i does not already end with a line
+  // delimiter.
+  auto constexpr num_delimiter_chars = 1;
+
+  if (compression == compression_type::NONE) {
+    auto delimiter_map = cudf::detail::make_empty_host_vector<std::size_t>(sources.size(), stream);
+    std::vector<std::size_t> prefsum_source_sizes(sources.size());
+    std::vector<std::unique_ptr<datasource::buffer>> h_buffers;
+    std::size_t bytes_read = 0;
+    std::transform_inclusive_scan(sources.begin(),
+                                  sources.end(),
+                                  prefsum_source_sizes.begin(),
+                                  std::plus<std::size_t>{},
+                                  [](std::unique_ptr<datasource> const& s) { return s->size(); });
+    auto upper =
+      std::upper_bound(prefsum_source_sizes.begin(), prefsum_source_sizes.end(), range_offset);
+    std::size_t start_source = std::distance(prefsum_source_sizes.begin(), upper);
+
+    auto const total_bytes_to_read =
+      std::min(range_size, prefsum_source_sizes.back() - range_offset);
+    range_offset -= start_source ? prefsum_source_sizes[start_source - 1] : 0;
+    for (std::size_t i = start_source; i < sources.size() && bytes_read < total_bytes_to_read;
+         i++) {
+      if (sources[i]->is_empty()) continue;
+      auto data_size =
+        std::min(sources[i]->size() - range_offset, total_bytes_to_read - bytes_read);
+      auto destination = reinterpret_cast<uint8_t*>(buffer.data()) + bytes_read +
+                         (num_delimiter_chars * delimiter_map.size());
+      if (sources[i]->is_device_read_preferred(data_size)) {
+        bytes_read += sources[i]->device_read(range_offset, data_size, destination, stream);
+      } else {
+        h_buffers.emplace_back(sources[i]->host_read(range_offset, data_size));
+        auto const& h_buffer = h_buffers.back();
+        CUDF_CUDA_TRY(cudaMemcpyAsync(
+          destination, h_buffer->data(), h_buffer->size(), cudaMemcpyHostToDevice, stream.value()));
+        bytes_read += h_buffer->size();
+      }
+      range_offset = 0;
+      delimiter_map.push_back(bytes_read + (num_delimiter_chars * delimiter_map.size()));
+    }
+    // Removing delimiter inserted after last non-empty source is read
+    if (!delimiter_map.empty()) { delimiter_map.pop_back(); }
+
+    // If this is a multi-file source, we scatter the JSON line delimiters between files
+    if (sources.size() > 1) {
+      static_assert(num_delimiter_chars == 1,
+                    "Currently only single-character delimiters are supported");
+      auto const delimiter_source = thrust::make_constant_iterator('\n');
+      auto const d_delimiter_map  = cudf::detail::make_device_uvector_async(
+        delimiter_map, stream, rmm::mr::get_current_device_resource());
+      thrust::scatter(rmm::exec_policy_nosync(stream),
+                      delimiter_source,
+                      delimiter_source + d_delimiter_map.size(),
+                      d_delimiter_map.data(),
+                      buffer.data());
+    }
+    stream.synchronize();
+    return buffer.first(bytes_read + (delimiter_map.size() * num_delimiter_chars));
+  }
+  // TODO: allow byte range reading from multiple compressed files.
+  auto remaining_bytes_to_read = std::min(range_size, sources[0]->size() - range_offset);
+  auto hbuffer                 = std::vector<uint8_t>(remaining_bytes_to_read);
+  // Single read because only a single compressed source is supported
+  // Reading to host because decompression of a single block is much faster on the CPU
+  sources[0]->host_read(range_offset, remaining_bytes_to_read, hbuffer.data());
+  auto uncomp_data = decompress(compression, hbuffer);
+  CUDF_CUDA_TRY(cudaMemcpyAsync(buffer.data(),
+                                reinterpret_cast<char*>(uncomp_data.data()),
+                                uncomp_data.size() * sizeof(char),
+                                cudaMemcpyHostToDevice,
+                                stream.value()));
+  stream.synchronize();
+  return buffer.first(uncomp_data.size());
+}
+
 table_with_metadata read_json(host_span<std::unique_ptr<datasource>> sources,
                               json_reader_options const& reader_opts,
                               rmm::cuda_stream_view stream,
@@ -296,15 +334,16 @@ table_with_metadata read_json(host_span<std::unique_ptr<datasource>> sources,
    * Note that the batched reader does not work for compressed inputs or for regular
    * JSON inputs.
    */
-  size_t const total_source_size = sources_size(sources, 0, 0);
-  size_t chunk_offset            = reader_opts.get_byte_range_offset();
-  size_t chunk_size              = reader_opts.get_byte_range_size();
-  chunk_size                     = !chunk_size ? total_source_size - chunk_offset
-                                               : std::min(chunk_size, total_source_size - chunk_offset);
+  std::size_t const total_source_size = sources_size(sources, 0, 0);
+  std::size_t chunk_offset            = reader_opts.get_byte_range_offset();
+  std::size_t chunk_size              = reader_opts.get_byte_range_size();
+  chunk_size                          = !chunk_size ? total_source_size - chunk_offset
+                                                    : std::min(chunk_size, total_source_size - chunk_offset);
 
-  size_t const size_per_subchunk = estimate_size_per_subchunk(chunk_size);
-  size_t const batch_size_ub =
-    std::numeric_limits<int>::max() - (max_subchunks_prealloced * size_per_subchunk);
+  std::size_t const size_per_subchunk      = estimate_size_per_subchunk(chunk_size);
+  std::size_t const batch_size_upper_bound = get_batch_size_upper_bound();
+  std::size_t const batch_size =
+    batch_size_upper_bound - (max_subchunks_prealloced * size_per_subchunk);
 
   /*
    * Identify the position (zero-indexed) of starting source file from which to begin
@@ -314,10 +353,10 @@ table_with_metadata read_json(host_span<std::unique_ptr<datasource>> sources,
    */
 
   // Prefix sum of source file sizes
-  size_t pref_source_size = 0;
+  std::size_t pref_source_size = 0;
   // Starting source file from which to being batching evaluated using byte range offset
-  size_t const start_source = [chunk_offset, &sources, &pref_source_size]() {
-    for (size_t src_idx = 0; src_idx < sources.size(); ++src_idx) {
+  std::size_t const start_source = [chunk_offset, &sources, &pref_source_size]() {
+    for (std::size_t src_idx = 0; src_idx < sources.size(); ++src_idx) {
       if (pref_source_size + sources[src_idx]->size() > chunk_offset) { return src_idx; }
       pref_source_size += sources[src_idx]->size();
     }
@@ -329,16 +368,16 @@ table_with_metadata read_json(host_span<std::unique_ptr<datasource>> sources,
    * batch begins, and `end_bytes_size` gives the terminal bytes position after which reading
    * stops.
    */
-  size_t pref_bytes_size = chunk_offset;
-  size_t end_bytes_size  = chunk_offset + chunk_size;
-  std::vector<size_t> batch_offsets{pref_bytes_size};
-  for (size_t i = start_source; i < sources.size() && pref_bytes_size < end_bytes_size;) {
+  std::size_t pref_bytes_size = chunk_offset;
+  std::size_t end_bytes_size  = chunk_offset + chunk_size;
+  std::vector<std::size_t> batch_offsets{pref_bytes_size};
+  for (std::size_t i = start_source; i < sources.size() && pref_bytes_size < end_bytes_size;) {
     pref_source_size += sources[i]->size();
     // If the current source file can subsume multiple batches, we split the file until the
     // boundary of the last batch exceeds the end of the file (indexed by `pref_source_size`)
     while (pref_bytes_size < end_bytes_size &&
-           pref_source_size >= std::min(pref_bytes_size + batch_size_ub, end_bytes_size)) {
-      auto next_batch_size = std::min(batch_size_ub, end_bytes_size - pref_bytes_size);
+           pref_source_size >= std::min(pref_bytes_size + batch_size, end_bytes_size)) {
+      auto next_batch_size = std::min(batch_size, end_bytes_size - pref_bytes_size);
       batch_offsets.push_back(batch_offsets.back() + next_batch_size);
       pref_bytes_size += next_batch_size;
     }
@@ -356,7 +395,7 @@ table_with_metadata read_json(host_span<std::unique_ptr<datasource>> sources,
   // Dispatch individual batches to read_batch and push the resulting table into
   // partial_tables array. Note that the reader options need to be updated for each
   // batch to adjust byte range offset and byte range size.
-  for (size_t i = 0; i < batch_offsets.size() - 1; i++) {
+  for (std::size_t i = 0; i < batch_offsets.size() - 1; i++) {
     batched_reader_opts.set_byte_range_offset(batch_offsets[i]);
     batched_reader_opts.set_byte_range_size(batch_offsets[i + 1] - batch_offsets[i]);
     partial_tables.emplace_back(
diff --git a/cpp/src/io/json/read_json.hpp b/cpp/src/io/json/read_json.hpp
index 32de4ebabfa..7e3a920f00d 100644
--- a/cpp/src/io/json/read_json.hpp
+++ b/cpp/src/io/json/read_json.hpp
@@ -37,6 +37,20 @@ constexpr size_t min_subchunk_size        = 10000;
 constexpr int estimated_compression_ratio = 4;
 constexpr int max_subchunks_prealloced    = 3;
 
+/**
+ * @brief Read from array of data sources into RMM buffer. The size of the returned device span
+          can be larger than the number of bytes requested from the list of sources when
+          the range to be read spans across multiple sources. This is due to the delimiter
+          characters inserted after the end of each accessed source.
+ *
+ * @param buffer Device span buffer to which data is read
+ * @param sources Array of data sources
+ * @param compression Compression format of source
+ * @param range_offset Number of bytes to skip from source start
+ * @param range_size Number of bytes to read from source
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @returns A subspan of the input device span containing data read
+ */
 device_span<char> ingest_raw_input(device_span<char> buffer,
                                    host_span<std::unique_ptr<datasource>> sources,
                                    compression_type compression,
@@ -44,14 +58,20 @@ device_span<char> ingest_raw_input(device_span<char> buffer,
                                    size_t range_size,
                                    rmm::cuda_stream_view stream);
 
+/**
+ * @brief Reads and returns the entire data set in batches.
+ *
+ * @param sources Input `datasource` objects to read the dataset from
+ * @param reader_opts Settings for controlling reading behavior
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource to use for device memory allocation
+ *
+ * @return cudf::table object that contains the array of cudf::column.
+ */
 table_with_metadata read_json(host_span<std::unique_ptr<datasource>> sources,
                               json_reader_options const& reader_opts,
                               rmm::cuda_stream_view stream,
                               rmm::device_async_resource_ref mr);
 
-size_type find_first_delimiter(device_span<char const> d_data,
-                               char const delimiter,
-                               rmm::cuda_stream_view stream);
-
 }  // namespace io::json::detail
 }  // namespace CUDF_EXPORT cudf
diff --git a/cpp/tests/large_strings/json_tests.cu b/cpp/tests/large_strings/json_tests.cu
index 49abf7b484d..e34ab991c11 100644
--- a/cpp/tests/large_strings/json_tests.cu
+++ b/cpp/tests/large_strings/json_tests.cu
@@ -28,13 +28,17 @@ struct JsonLargeReaderTest : public cudf::test::StringsLargeTest {};
 
 TEST_F(JsonLargeReaderTest, MultiBatch)
 {
-  std::string json_string             = R"(
+  std::string json_string = R"(
     { "a": { "y" : 6}, "b" : [1, 2, 3], "c": 11 }
     { "a": { "y" : 6}, "b" : [4, 5   ], "c": 12 }
     { "a": { "y" : 6}, "b" : [6      ], "c": 13 }
     { "a": { "y" : 6}, "b" : [7      ], "c": 14 })";
-  constexpr size_t batch_size_ub      = std::numeric_limits<int>::max();
-  constexpr size_t expected_file_size = 1.5 * static_cast<double>(batch_size_ub);
+
+  std::size_t const batch_size_upper_bound = std::numeric_limits<int32_t>::max() / 16;
+  // set smaller batch_size to reduce file size and execution time
+  setenv("LIBCUDF_JSON_BATCH_SIZE", std::to_string(batch_size_upper_bound).c_str(), 1);
+
+  constexpr std::size_t expected_file_size = 1.5 * static_cast<double>(batch_size_upper_bound);
   std::size_t const log_repetitions =
     static_cast<std::size_t>(std::ceil(std::log2(expected_file_size / json_string.size())));
 
@@ -66,8 +70,11 @@ TEST_F(JsonLargeReaderTest, MultiBatch)
     datasources.emplace_back(cudf::io::datasource::create(hb));
   }
   // Test for different chunk sizes
-  std::vector<size_t> chunk_sizes{
-    batch_size_ub / 4, batch_size_ub / 2, batch_size_ub, static_cast<size_t>(batch_size_ub * 2)};
+  std::vector<std::size_t> chunk_sizes{batch_size_upper_bound / 4,
+                                       batch_size_upper_bound / 2,
+                                       batch_size_upper_bound,
+                                       static_cast<std::size_t>(batch_size_upper_bound * 2)};
+
   for (auto chunk_size : chunk_sizes) {
     auto const tables =
       split_byte_range_reading<std::int64_t>(datasources,
@@ -86,4 +93,7 @@ TEST_F(JsonLargeReaderTest, MultiBatch)
     // cannot use EQUAL due to concatenate removing null mask
     CUDF_TEST_EXPECT_TABLES_EQUIVALENT(current_reader_table.tbl->view(), result->view());
   }
+
+  // go back to normal batch_size
+  unsetenv("LIBCUDF_LARGE_STRINGS_THRESHOLD");
 }

From e5f8dd33d78a2c964f8d6bac895deb73a9be7aa6 Mon Sep 17 00:00:00 2001
From: "Robert (Bobby) Evans" <bobby@apache.org>
Date: Mon, 12 Aug 2024 16:52:52 -0500
Subject: [PATCH 652/842] Update the java code to properly deal with lists
 being returned as strings (#16536)

Recently some JSON parsing was updated so lists could be returned as strings. This updates the java code so that when cleaning up the results to match the desired schema that it can handle corner cases associated with lists and structs properly.

Tests are covered in the Spark plugin, but I am happy to add some here if we really want to validate that part of this.

Authors:
  - Robert (Bobby) Evans (https://github.com/revans2)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/16536
---
 java/src/main/java/ai/rapids/cudf/Table.java | 29 +++++++++++++++++---
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java
index 4e737451ed6..36e342cae13 100644
--- a/java/src/main/java/ai/rapids/cudf/Table.java
+++ b/java/src/main/java/ai/rapids/cudf/Table.java
@@ -1084,7 +1084,12 @@ private static DidViewChange gatherJSONColumns(Schema schema, TableWithMeta.Nest
         // The types don't match so just return the input unchanged...
         return DidViewChange.no();
       } else {
-        String[] foundNames = children.getNames();
+        String[] foundNames;
+        if (children == null) {
+          foundNames = new String[0];
+        } else {
+          foundNames = children.getNames();
+        }
         HashMap<String, Integer> indices = new HashMap<>();
         for (int i = 0; i < foundNames.length; i++) {
           indices.put(foundNames[i], i);
@@ -1101,8 +1106,9 @@ private static DidViewChange gatherJSONColumns(Schema schema, TableWithMeta.Nest
           for (int i = 0; i < columns.length; i++) {
             String neededColumnName = neededNames[i];
             Integer index = indices.get(neededColumnName);
+            Schema childSchema = schema.getChild(i);
             if (index != null) {
-              if (schema.getChild(i).isStructOrHasStructDescendant()) {
+              if (childSchema.isStructOrHasStructDescendant()) {
                 ColumnView child = cv.getChildColumnView(index);
                 boolean shouldCloseChild = true;
                 try {
@@ -1131,8 +1137,23 @@ private static DidViewChange gatherJSONColumns(Schema schema, TableWithMeta.Nest
               }
             } else {
               somethingChanged = true;
-              try (Scalar s = Scalar.fromNull(types[i])) {
-                columns[i] = ColumnVector.fromScalar(s, (int) cv.getRowCount());
+              if (types[i] == DType.LIST) {
+                try (Scalar s = Scalar.listFromNull(childSchema.getChild(0).asHostDataType())) {
+                  columns[i] = ColumnVector.fromScalar(s, (int) cv.getRowCount());
+                }
+              } else if (types[i] == DType.STRUCT) {
+                int numStructChildren = childSchema.getNumChildren();
+                HostColumnVector.DataType[] structChildren = new HostColumnVector.DataType[numStructChildren];
+                for (int structChildIndex = 0; structChildIndex < numStructChildren; structChildIndex++) {
+                  structChildren[structChildIndex] = childSchema.getChild(structChildIndex).asHostDataType();
+                }
+                try (Scalar s = Scalar.structFromNull(structChildren)) {
+                  columns[i] = ColumnVector.fromScalar(s, (int) cv.getRowCount());
+                }
+              } else {
+                try (Scalar s = Scalar.fromNull(types[i])) {
+                  columns[i] = ColumnVector.fromScalar(s, (int) cv.getRowCount());
+                }
               }
             }
           }

From 7178bf2eb34334db909a151926d8112c441b3b09 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 13 Aug 2024 08:45:44 -0400
Subject: [PATCH 653/842] Rework cudf::io::text::byte_range_info class member
 functions (#16518)

Adds `const` declarations to appropriate member functions in class `cudf::io::text::byte_range_info` and moves the ctor implementation to .cpp file.
This helps with using the `byte_range_info` objects in `const` variables and inside of `const` functions.

Found while working on #15983

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16518
---
 cpp/include/cudf/io/text/byte_range_info.hpp | 21 ++++++++------------
 cpp/src/io/text/byte_range_info.cpp          |  7 +++++++
 cpp/src/io/text/multibyte_split.cu           |  2 +-
 3 files changed, 16 insertions(+), 14 deletions(-)

diff --git a/cpp/include/cudf/io/text/byte_range_info.hpp b/cpp/include/cudf/io/text/byte_range_info.hpp
index 7e9256be1d3..5f3c91dc99c 100644
--- a/cpp/include/cudf/io/text/byte_range_info.hpp
+++ b/cpp/include/cudf/io/text/byte_range_info.hpp
@@ -16,7 +16,6 @@
 
 #pragma once
 
-#include <cudf/utilities/error.hpp>
 #include <cudf/utilities/export.hpp>
 
 #include <cstdint>
@@ -40,53 +39,49 @@ class byte_range_info {
   int64_t _size{};    ///< size in bytes
 
  public:
-  constexpr byte_range_info() = default;
+  byte_range_info() = default;
   /**
    * @brief Constructs a byte_range_info object
    *
    * @param offset offset in bytes
    * @param size size in bytes
    */
-  constexpr byte_range_info(int64_t offset, int64_t size) : _offset(offset), _size(size)
-  {
-    CUDF_EXPECTS(offset >= 0, "offset must be non-negative");
-    CUDF_EXPECTS(size >= 0, "size must be non-negative");
-  }
+  byte_range_info(int64_t offset, int64_t size);
 
   /**
    * @brief Copy constructor
    *
    * @param other byte_range_info object to copy
    */
-  constexpr byte_range_info(byte_range_info const& other) noexcept = default;
+  byte_range_info(byte_range_info const& other) noexcept = default;
   /**
    * @brief  Copy assignment operator
    *
    * @param other byte_range_info object to copy
    * @return this object after copying
    */
-  constexpr byte_range_info& operator=(byte_range_info const& other) noexcept = default;
+  byte_range_info& operator=(byte_range_info const& other) noexcept = default;
 
   /**
    * @brief Get the offset in bytes
    *
    * @return Offset in bytes
    */
-  [[nodiscard]] constexpr int64_t offset() { return _offset; }
+  [[nodiscard]] int64_t offset() const { return _offset; }
 
   /**
    * @brief Get the size in bytes
    *
    * @return Size in bytes
    */
-  [[nodiscard]] constexpr int64_t size() { return _size; }
+  [[nodiscard]] int64_t size() const { return _size; }
 
   /**
    * @brief Returns whether the span is empty.
    *
-   * @return true iff the span is empty, i.e. `size() == 0`
+   * @return true iff the range is empty, i.e. `size() == 0`
    */
-  [[nodiscard]] constexpr bool empty() { return size() == 0; }
+  [[nodiscard]] bool is_empty() const { return size() == 0; }
 };
 
 /**
diff --git a/cpp/src/io/text/byte_range_info.cpp b/cpp/src/io/text/byte_range_info.cpp
index 6a7836ed4e1..fe811739b97 100644
--- a/cpp/src/io/text/byte_range_info.cpp
+++ b/cpp/src/io/text/byte_range_info.cpp
@@ -16,6 +16,7 @@
 
 #include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/io/text/byte_range_info.hpp>
+#include <cudf/utilities/error.hpp>
 
 #include <limits>
 
@@ -23,6 +24,12 @@ namespace cudf {
 namespace io {
 namespace text {
 
+byte_range_info::byte_range_info(int64_t offset, int64_t size) : _offset(offset), _size(size)
+{
+  CUDF_EXPECTS(offset >= 0, "offset must be non-negative");
+  CUDF_EXPECTS(size >= 0, "size must be non-negative");
+}
+
 byte_range_info create_byte_range_info_max() { return {0, std::numeric_limits<int64_t>::max()}; }
 
 std::vector<byte_range_info> create_byte_range_infos_consecutive(int64_t total_bytes,
diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu
index 97729a091fb..e3435a24b18 100644
--- a/cpp/src/io/text/multibyte_split.cu
+++ b/cpp/src/io/text/multibyte_split.cu
@@ -310,7 +310,7 @@ std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source
 {
   CUDF_FUNC_RANGE();
 
-  if (byte_range.empty()) { return make_empty_column(type_id::STRING); }
+  if (byte_range.is_empty()) { return make_empty_column(type_id::STRING); }
 
   auto device_delim = cudf::string_scalar(delimiter, true, stream, mr);
 

From 419fb99fa9ac471ae00ebe7787543b8e9cc154b5 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 13 Aug 2024 08:52:30 -0400
Subject: [PATCH 654/842] Fix all-empty input column for strings split APIs
 (#16466)

Fixes specialized behavior for all empty input column on the strings split APIs.
Verifying behavior with Pandas `str.split( pat, expand, regex )`
`pat=None     -- whitespace`
`expand=False -- record APIs`
`regex=True   -- re APIs`

- [x] `split`
- [x] `split` - whitespace
- [x] `rsplit`
- [x] `rsplit` - whitespace
- [x] `split_record`
- [x] `split_record` - whitespace
- [x] `rsplit_record`
- [x] `rsplit_record` - whitespace
- [x] `split_re`
- [x] `rsplit_re`
- [x] `split_record_re`
- [x] `rsplit_record_re`

Closes #16453

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Mark Harris (https://github.com/harrism)
  - Bradley Dice (https://github.com/bdice)
  - Mike Wilson (https://github.com/hyperbolic2346)

URL: https://github.com/rapidsai/cudf/pull/16466
---
 cpp/src/strings/split/split.cuh       | 24 ++++++--------
 cpp/src/strings/split/split_re.cu     |  4 +++
 cpp/tests/strings/split_tests.cpp     | 47 ++++++++++++++++++++++++---
 python/cudf/cudf/tests/test_string.py | 16 +++++++++
 4 files changed, 73 insertions(+), 18 deletions(-)

diff --git a/cpp/src/strings/split/split.cuh b/cpp/src/strings/split/split.cuh
index 4d7096c02ca..af70367678e 100644
--- a/cpp/src/strings/split/split.cuh
+++ b/cpp/src/strings/split/split.cuh
@@ -142,7 +142,7 @@ struct base_split_tokenizer {
 
     // max_tokens already included in token counts
     if (d_tokens.size() == 1) {
-      d_tokens[0] = string_index_pair{d_str.data(), d_str.size_bytes()};
+      d_tokens[0] = string_index_pair{(d_str.empty() ? "" : d_str.data()), d_str.size_bytes()};
       return;
     }
 
@@ -357,24 +357,20 @@ std::pair<std::unique_ptr<column>, rmm::device_uvector<string_index_pair>> split
   auto const chars_bytes =
     get_offset_value(input.offsets(), input.offset() + strings_count, stream) -
     get_offset_value(input.offsets(), input.offset(), stream);
-  if (chars_bytes == 0) {
-    auto offsets = cudf::make_column_from_scalar(
-      numeric_scalar<int32_t>(0, true, stream), strings_count + 1, stream, mr);
-    auto tokens = rmm::device_uvector<string_index_pair>(0, stream);
-    return std::pair{std::move(offsets), std::move(tokens)};
-  }
   auto const d_offsets =
     cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset());
 
   // count the number of delimiters in the entire column
   rmm::device_scalar<int64_t> d_count(0, stream);
-  constexpr int64_t block_size         = 512;
-  constexpr size_type bytes_per_thread = 4;
-  auto const num_blocks                = util::div_rounding_up_safe(
-    util::div_rounding_up_safe(chars_bytes, static_cast<int64_t>(bytes_per_thread)), block_size);
-  count_delimiters_kernel<Tokenizer, block_size, bytes_per_thread>
-    <<<num_blocks, block_size, 0, stream.value()>>>(
-      tokenizer, d_offsets, chars_bytes, d_count.data());
+  if (chars_bytes > 0) {
+    constexpr int64_t block_size         = 512;
+    constexpr size_type bytes_per_thread = 4;
+    auto const num_blocks                = util::div_rounding_up_safe(
+      util::div_rounding_up_safe(chars_bytes, static_cast<int64_t>(bytes_per_thread)), block_size);
+    count_delimiters_kernel<Tokenizer, block_size, bytes_per_thread>
+      <<<num_blocks, block_size, 0, stream.value()>>>(
+        tokenizer, d_offsets, chars_bytes, d_count.data());
+  }
 
   // Create a vector of every delimiter position in the chars column.
   // These may include overlapping or otherwise out-of-bounds delimiters which
diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu
index d72ec1085b5..e0aacf07ef0 100644
--- a/cpp/src/strings/split/split_re.cu
+++ b/cpp/src/strings/split/split_re.cu
@@ -71,6 +71,10 @@ struct token_reader_fn {
     auto const token_offset = d_token_offsets[idx];
     auto const token_count  = d_token_offsets[idx + 1] - token_offset;
     auto const d_result     = d_tokens + token_offset;  // store tokens here
+    if (nchars == 0) {
+      d_result[0] = string_index_pair{"", 0};
+      return;
+    }
 
     int64_t token_idx = 0;
     auto itr          = d_str.begin();
diff --git a/cpp/tests/strings/split_tests.cpp b/cpp/tests/strings/split_tests.cpp
index 4c020cb4c29..7ece08b19f2 100644
--- a/cpp/tests/strings/split_tests.cpp
+++ b/cpp/tests/strings/split_tests.cpp
@@ -307,24 +307,46 @@ TEST_F(StringsSplitTest, SplitRecordWhitespaceWithMaxSplit)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected);
 }
 
-TEST_F(StringsSplitTest, SplitRecordAllEmpty)
+TEST_F(StringsSplitTest, SplitAllEmpty)
 {
   auto input     = cudf::test::strings_column_wrapper({"", "", "", ""});
   auto sv        = cudf::strings_column_view(input);
+  auto empty     = cudf::string_scalar("");
   auto delimiter = cudf::string_scalar("s");
+
+  auto result = cudf::strings::split(sv, delimiter);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view().column(0), input);
+  result = cudf::strings::rsplit(sv, delimiter);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view().column(0), input);
+
+  // whitespace hits a special case where nothing matches returns an all-null column
+  auto expected = cudf::test::strings_column_wrapper({"", "", "", ""}, {0, 0, 0, 0});
+  result        = cudf::strings::split(sv, empty);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view().column(0), expected);
+  result = cudf::strings::rsplit(sv, empty);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view().column(0), expected);
+}
+
+TEST_F(StringsSplitTest, SplitRecordAllEmpty)
+{
+  auto input     = cudf::test::strings_column_wrapper({"", "", "", ""});
+  auto sv        = cudf::strings_column_view(input);
   auto empty     = cudf::string_scalar("");
+  auto delimiter = cudf::string_scalar("s");
 
   using LCW = cudf::test::lists_column_wrapper<cudf::string_view>;
-  LCW expected({LCW{}, LCW{}, LCW{}, LCW{}});
+  LCW expected({LCW{""}, LCW{""}, LCW{""}, LCW{""}});
+  LCW expected_empty({LCW{}, LCW{}, LCW{}, LCW{}});
+
   auto result = cudf::strings::split_record(sv, delimiter);
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected);
   result = cudf::strings::split_record(sv, empty);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected_empty);
 
   result = cudf::strings::rsplit_record(sv, delimiter);
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected);
   result = cudf::strings::rsplit_record(sv, empty);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), expected_empty);
 }
 
 TEST_F(StringsSplitTest, MultiByteDelimiters)
@@ -575,6 +597,23 @@ TEST_F(StringsSplitTest, SplitRegexWordBoundary)
   }
 }
 
+TEST_F(StringsSplitTest, SplitRegexAllEmpty)
+{
+  auto input = cudf::test::strings_column_wrapper({"", "", "", ""});
+  auto sv    = cudf::strings_column_view(input);
+  auto prog  = cudf::strings::regex_program::create("[ _]");
+
+  auto result = cudf::strings::split_re(sv, *prog);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view().column(0), input);
+  result = cudf::strings::rsplit_re(sv, *prog);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view().column(0), input);
+
+  auto rec_result = cudf::strings::split_record_re(sv, *prog);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view().column(0), input);
+  rec_result = cudf::strings::rsplit_record_re(sv, *prog);
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view().column(0), input);
+}
+
 TEST_F(StringsSplitTest, RSplitRecord)
 {
   std::vector<char const*> h_strings{
diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py
index a2a3e874c91..30880f074c0 100644
--- a/python/cudf/cudf/tests/test_string.py
+++ b/python/cudf/cudf/tests/test_string.py
@@ -978,6 +978,22 @@ def test_string_split_re(data, pat, n, expand):
     assert_eq(expect, got)
 
 
+@pytest.mark.parametrize("pat", [None, "\\s+"])
+@pytest.mark.parametrize("regex", [False, True])
+@pytest.mark.parametrize("expand", [False, True])
+def test_string_split_all_empty(pat, regex, expand):
+    ps = pd.Series(["", "", "", ""], dtype="str")
+    gs = cudf.Series(["", "", "", ""], dtype="str")
+
+    expect = ps.str.split(pat=pat, expand=expand, regex=regex)
+    got = gs.str.split(pat=pat, expand=expand, regex=regex)
+
+    if isinstance(got, cudf.DataFrame):
+        assert_eq(expect, got, check_column_type=False)
+    else:
+        assert_eq(expect, got)
+
+
 @pytest.mark.parametrize(
     "str_data", [[], ["a", "b", "c", "d", "e"], [None, None, None, None, None]]
 )

From 3a791cb8a83ca2cf446a910cb94d5a4e3edf2b9f Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 13 Aug 2024 08:56:43 -0400
Subject: [PATCH 655/842] Remove unneeded pair-iterator benchmark (#16511)

Removes the pair-iterator benchmark logic. The remaining benchmarks use the null-replacement-iterator which uses the libcudf pair-iterator internally. There is no need for benchmarking this unique iterator pattern that is not used by libcudf.

The `cpp/benchmarks/iterator/iterator.cu` failed to compile with gcc 12 because the sum-reduce function cannot resolve adding `thrust::pair` objects together likely due to some recent changes in CCCL. Regardless, adding `thrust::pair` objects is not something we need to benchmark. The existing benchmark benchmarks libcudf's usage of the internal pair-iterator correctly.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16511
---
 cpp/benchmarks/iterator/iterator.cu | 77 -----------------------------
 1 file changed, 77 deletions(-)

diff --git a/cpp/benchmarks/iterator/iterator.cu b/cpp/benchmarks/iterator/iterator.cu
index ada7a9bd73d..fd0cebb12ea 100644
--- a/cpp/benchmarks/iterator/iterator.cu
+++ b/cpp/benchmarks/iterator/iterator.cu
@@ -30,7 +30,6 @@
 #include <thrust/execution_policy.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
-#include <thrust/pair.h>
 #include <thrust/reduce.h>
 
 #include <random>
@@ -161,68 +160,6 @@ void BM_iterator(benchmark::State& state)
                           sizeof(TypeParam));
 }
 
-// operator+ defined for pair iterator reduction
-template <typename T>
-__device__ thrust::pair<T, bool> operator+(thrust::pair<T, bool> lhs, thrust::pair<T, bool> rhs)
-{
-  return thrust::pair<T, bool>{lhs.first * lhs.second + rhs.first * rhs.second,
-                               lhs.second + rhs.second};
-}
-// -----------------------------------------------------------------------------
-template <typename T, bool has_null>
-void pair_iterator_bench_cub(cudf::column_view& col,
-                             rmm::device_uvector<thrust::pair<T, bool>>& result)
-{
-  thrust::pair<T, bool> init{0, false};
-  auto d_col    = cudf::column_device_view::create(col);
-  int num_items = col.size();
-  auto begin    = d_col->pair_begin<T, has_null>();
-  reduce_by_cub(result.begin(), begin, num_items, init);
-}
-
-template <typename T, bool has_null>
-void pair_iterator_bench_thrust(cudf::column_view& col,
-                                rmm::device_uvector<thrust::pair<T, bool>>& result)
-{
-  thrust::pair<T, bool> init{0, false};
-  auto d_col = cudf::column_device_view::create(col);
-  auto d_in  = d_col->pair_begin<T, has_null>();
-  auto d_end = d_in + col.size();
-  thrust::reduce(thrust::device, d_in, d_end, init, cudf::DeviceSum{});
-}
-
-template <class TypeParam, bool cub_or_thrust>
-void BM_pair_iterator(benchmark::State& state)
-{
-  cudf::size_type const column_size{(cudf::size_type)state.range(0)};
-  using T      = TypeParam;
-  auto num_gen = thrust::counting_iterator<cudf::size_type>(0);
-  auto null_gen =
-    thrust::make_transform_iterator(num_gen, [](cudf::size_type row) { return row % 2 == 0; });
-
-  cudf::test::fixed_width_column_wrapper<T> wrap_hasnull_F(num_gen, num_gen + column_size);
-  cudf::test::fixed_width_column_wrapper<T> wrap_hasnull_T(
-    num_gen, num_gen + column_size, null_gen);
-  cudf::column_view hasnull_F = wrap_hasnull_F;
-  cudf::column_view hasnull_T = wrap_hasnull_T;
-
-  // Initialize dev_result to false
-  auto dev_result = cudf::detail::make_zeroed_device_uvector_sync<thrust::pair<T, bool>>(
-    1, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
-  for (auto _ : state) {
-    cuda_event_timer raii(state, true);  // flush_l2_cache = true, stream = 0
-    if (cub_or_thrust) {
-      pair_iterator_bench_cub<T, false>(hasnull_T,
-                                        dev_result);  // driven by pair iterator with nulls
-    } else {
-      pair_iterator_bench_thrust<T, false>(hasnull_T,
-                                           dev_result);  // driven by pair iterator with nulls
-    }
-  }
-  state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * column_size *
-                          sizeof(TypeParam));
-}
-
 #define ITER_BM_BENCHMARK_DEFINE(name, type, cub_or_thrust, raw_or_iterator) \
   BENCHMARK_DEFINE_F(Iterator, name)(::benchmark::State & state)             \
   {                                                                          \
@@ -238,17 +175,3 @@ ITER_BM_BENCHMARK_DEFINE(double_cub_raw, double, true, true);
 ITER_BM_BENCHMARK_DEFINE(double_cub_iter, double, true, false);
 ITER_BM_BENCHMARK_DEFINE(double_thrust_raw, double, false, true);
 ITER_BM_BENCHMARK_DEFINE(double_thrust_iter, double, false, false);
-
-#define PAIRITER_BM_BENCHMARK_DEFINE(name, type, cub_or_thrust)  \
-  BENCHMARK_DEFINE_F(Iterator, name)(::benchmark::State & state) \
-  {                                                              \
-    BM_pair_iterator<type, cub_or_thrust>(state);                \
-  }                                                              \
-  BENCHMARK_REGISTER_F(Iterator, name)                           \
-    ->RangeMultiplier(10)                                        \
-    ->Range(1000, 10000000)                                      \
-    ->UseManualTime()                                            \
-    ->Unit(benchmark::kMillisecond);
-
-PAIRITER_BM_BENCHMARK_DEFINE(double_cub_pair, double, true);
-PAIRITER_BM_BENCHMARK_DEFINE(double_thrust_pair, double, false);

From 3801f811ab7713e4cb9cc3bb34d282f8a04e71e4 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Tue, 13 Aug 2024 12:40:40 -0500
Subject: [PATCH 656/842] Remove hardcoded versions from workflows. (#16540)

This PR removes hardcoded Python versions from CI workflows. It is a prerequisite for dropping Python 3.9. See https://github.com/rapidsai/build-planning/issues/88.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/cudf/pull/16540
---
 .github/workflows/pandas-tests.yaml | 3 ++-
 .github/workflows/pr.yaml           | 4 +++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/pandas-tests.yaml b/.github/workflows/pandas-tests.yaml
index cf0c2b377dd..10c803f7921 100644
--- a/.github/workflows/pandas-tests.yaml
+++ b/.github/workflows/pandas-tests.yaml
@@ -19,7 +19,8 @@ jobs:
       secrets: inherit
       uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
       with:
-        matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and (.CUDA_VER | startswith("12.5.")) ))
+        # This selects "ARCH=amd64 + the latest supported Python + CUDA".
+        matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
         build_type: nightly
         branch: ${{ inputs.branch }}
         date: ${{ inputs.date }}
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index c2e7f64f952..ea8a1762b2c 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -187,6 +187,7 @@ jobs:
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
     with:
+      # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
       build_type: pull-request
       script: ci/cudf_pandas_scripts/run_tests.sh
@@ -196,7 +197,8 @@ jobs:
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
     with:
-      matrix_filter: map(select(.ARCH == "amd64" and .PY_VER == "3.9" and (.CUDA_VER | startswith("12.5.")) ))
+      # This selects "ARCH=amd64 + the latest supported Python + CUDA".
+      matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
       build_type: pull-request
       script: ci/cudf_pandas_scripts/pandas-tests/run.sh pr
       # Hide test failures because they exceed the GITHUB_STEP_SUMMARY output limit.

From 5780c4d8fb5afac2e04988a2ff5531f94c22d3a3 Mon Sep 17 00:00:00 2001
From: "Richard (Rick) Zamora" <rzamora217@gmail.com>
Date: Tue, 13 Aug 2024 13:46:31 -0700
Subject: [PATCH 657/842] Register `read_parquet` and `read_csv` with dask-expr
 (#16535)

After https://github.com/dask/dask-expr/pull/1114, Dask cuDF must register specific `read_parquet` and `read_csv` functions to be used when query-planning is enabled (the default).

**This PR is required for CI to pass with dask>2024.8.0**

**NOTE**: It probably doesn't make sense to add specific tests for this change. Once the 2014.7.1 dask pin is removed, all `dask_cudf` tests using `read_parquet` and  `read_csv` will fail without this change...

Authors:
  - Richard (Rick) Zamora (https://github.com/rjzamora)

Approvers:
  - Mads R. B. Kristensen (https://github.com/madsbk)
  - Benjamin Zaitlen (https://github.com/quasiben)

URL: https://github.com/rapidsai/cudf/pull/16535
---
 python/dask_cudf/dask_cudf/backends.py | 35 ++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py
index 2b1f745fc04..01bab30190a 100644
--- a/python/dask_cudf/dask_cudf/backends.py
+++ b/python/dask_cudf/dask_cudf/backends.py
@@ -667,6 +667,41 @@ def from_dict(
             constructor=constructor,
         )
 
+    @staticmethod
+    def read_parquet(*args, engine=None, **kwargs):
+        import dask_expr as dx
+
+        from dask_cudf.io.parquet import CudfEngine
+
+        return _default_backend(
+            dx.read_parquet, *args, engine=CudfEngine, **kwargs
+        )
+
+    @staticmethod
+    def read_csv(
+        path,
+        *args,
+        header="infer",
+        dtype_backend=None,
+        storage_options=None,
+        **kwargs,
+    ):
+        import dask_expr as dx
+        from fsspec.utils import stringify_path
+
+        if not isinstance(path, str):
+            path = stringify_path(path)
+        return dx.new_collection(
+            dx.io.csv.ReadCSV(
+                path,
+                dtype_backend=dtype_backend,
+                storage_options=storage_options,
+                kwargs=kwargs,
+                header=header,
+                dataframe_backend="cudf",
+            )
+        )
+
     @staticmethod
     def read_json(*args, **kwargs):
         from dask_cudf.io.json import read_json as read_json_impl

From cf3fabf7d090dcd983080e3c844002ebb7280e77 Mon Sep 17 00:00:00 2001
From: Sebastian Berg <sebastianb@nvidia.com>
Date: Tue, 13 Aug 2024 22:59:47 +0200
Subject: [PATCH 658/842] Ensure comparisons with pyints and integer series
 always succeed (#16532)

When Python integers are compared to a series of integers, the result can always be correctly defined no matter the values of the Python integer.

This was always a very mild issue.  But with NumPy 2 behavior not upcasting the computation result type based on the value anymore, even things like:
```
cudf.Series([1, 2, 3], dtype="int8") < 1000
```
would fail.
(Similar paths could be taken for other integer scalars, but there would be mostly nice for performance.)

N.B. NumPy/pandas also support exact comparisons when mixing e.g. uint64 and int64.  This is another rare exception that cudf currently does not support.

Closes gh-16282

Authors:
  - Sebastian Berg (https://github.com/seberg)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/16532
---
 python/cudf/cudf/core/column/numerical.py | 54 +++++++++++++++++------
 python/cudf/cudf/tests/test_binops.py     | 41 +++++++++++++++++
 2 files changed, 81 insertions(+), 14 deletions(-)

diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index b83d7600c82..bbc74ef349e 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -199,16 +199,53 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
             np.bool_: np.float32,
         }
 
+        out_dtype = None
         if op in {"__truediv__", "__rtruediv__"}:
             # Division with integer types results in a suitable float.
             if truediv_type := int_float_dtype_mapping.get(self.dtype.type):
                 return self.astype(truediv_type)._binaryop(other, op)
+        elif op in {
+            "__lt__",
+            "__gt__",
+            "__le__",
+            "__ge__",
+            "__eq__",
+            "__ne__",
+        }:
+            out_dtype = "bool"
+
+            # If `other` is a Python integer and it is out-of-bounds
+            # promotion could fail but we can trivially define the result
+            # in terms of `notnull` or `NULL_NOT_EQUALS`.
+            if type(other) is int and self.dtype.kind in "iu":  # noqa: E721
+                truthiness = None
+                iinfo = np.iinfo(self.dtype)
+                if iinfo.min > other:
+                    truthiness = op in {"__ne__", "__gt__", "__ge__"}
+                elif iinfo.max < other:
+                    truthiness = op in {"__ne__", "__lt__", "__le__"}
+
+                # Compare with minimum value so that the result is true/false
+                if truthiness is True:
+                    other = iinfo.min
+                    op = "__ge__"
+                elif truthiness is False:
+                    other = iinfo.min
+                    op = "__lt__"
+
+        elif op in {"NULL_EQUALS", "NULL_NOT_EQUALS"}:
+            out_dtype = "bool"
 
         reflect, op = self._check_reflected_op(op)
         if (other := self._wrap_binop_normalization(other)) is NotImplemented:
             return NotImplemented
-        out_dtype = self.dtype
-        if other is not None:
+
+        if out_dtype is not None:
+            pass  # out_dtype was already set to bool
+        if other is None:
+            # not a binary operator, so no need to promote
+            out_dtype = self.dtype
+        elif out_dtype is None:
             out_dtype = np.result_type(self.dtype, other.dtype)
             if op in {"__mod__", "__floordiv__"}:
                 tmp = self if reflect else other
@@ -225,17 +262,6 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
                             out_dtype = cudf.dtype("float64")
                     elif is_scalar(tmp) and tmp == 0:
                         out_dtype = cudf.dtype("float64")
-        if op in {
-            "__lt__",
-            "__gt__",
-            "__le__",
-            "__ge__",
-            "__eq__",
-            "__ne__",
-            "NULL_EQUALS",
-            "NULL_NOT_EQUALS",
-        }:
-            out_dtype = "bool"
 
         if op in {"__and__", "__or__", "__xor__"}:
             if self.dtype.kind == "f" or other.dtype.kind == "f":
@@ -247,7 +273,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
             if self.dtype.kind == "b" or other.dtype.kind == "b":
                 out_dtype = "bool"
 
-        if (
+        elif (
             op == "__pow__"
             and self.dtype.kind in "iu"
             and (is_integer(other) or other.dtype.kind in "iu")
diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py
index 503b1a975b4..4256ec872e6 100644
--- a/python/cudf/cudf/tests/test_binops.py
+++ b/python/cudf/cudf/tests/test_binops.py
@@ -290,6 +290,47 @@ def test_series_compare(cmpop, obj_class, dtype):
     np.testing.assert_equal(result3.to_numpy(), cmpop(arr1, arr2))
 
 
+@pytest.mark.parametrize(
+    "dtype,val",
+    [("int8", 200), ("int32", 2**32), ("uint8", -128), ("uint64", -1)],
+)
+@pytest.mark.parametrize(
+    "op",
+    [
+        operator.eq,
+        operator.ne,
+        operator.lt,
+        operator.le,
+        operator.gt,
+        operator.ge,
+    ],
+)
+@pytest.mark.parametrize("reverse", [False, True])
+def test_series_compare_integer(dtype, val, op, reverse):
+    # Tests that these actually work, even though they are out of bound.
+    force_cast_val = np.array(val).astype(dtype)
+    sr = Series(
+        [np.iinfo(dtype).min, np.iinfo(dtype).max, force_cast_val, None],
+        dtype=dtype,
+    )
+
+    if reverse:
+        _op = op
+
+        def op(x, y):
+            return _op(y, x)
+
+    # We expect the same result as comparing to a value within range (e.g. 0)
+    # except that a NULL value evaluates to False
+    if op(0, val):
+        expected = Series([True, True, True, None])
+    else:
+        expected = Series([False, False, False, None])
+
+    res = op(sr, val)
+    assert_eq(res, expected)
+
+
 def _series_compare_nulls_typegen():
     return [
         *combinations_with_replacement(DATETIME_TYPES, 2),

From 1f0d0c93f315f64698ffcc80082926896facf13a Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Wed, 14 Aug 2024 09:07:22 -0400
Subject: [PATCH 659/842] Change cudf::empty_like to not include offsets for
 empty strings columns (#16529)

Fixes `cudf::empty_like` to only create empty child columns for nested types. The empty child columns are needed to store the types for consistency with `cudf::make_empty_column`.

Closes #16490

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Mike Wilson (https://github.com/hyperbolic2346)
  - Mark Harris (https://github.com/harrism)

URL: https://github.com/rapidsai/cudf/pull/16529
---
 cpp/src/copying/copy.cpp                  | 6 ++++++
 cpp/tests/copying/pack_tests.cpp          | 6 ++++--
 cpp/tests/replace/replace_nulls_tests.cpp | 2 +-
 3 files changed, 11 insertions(+), 3 deletions(-)

diff --git a/cpp/src/copying/copy.cpp b/cpp/src/copying/copy.cpp
index 98ee6aa8f68..bac8dbe5d95 100644
--- a/cpp/src/copying/copy.cpp
+++ b/cpp/src/copying/copy.cpp
@@ -143,6 +143,12 @@ std::unique_ptr<column> empty_like(column_view const& input)
 {
   CUDF_FUNC_RANGE();
 
+  // test_dataframe.py passes an EMPTY column type here;
+  // this causes is_nested to throw an error since it uses the type-dispatcher
+  if ((input.type().id() == type_id::EMPTY) || !cudf::is_nested(input.type())) {
+    return make_empty_column(input.type());
+  }
+
   std::vector<std::unique_ptr<column>> children;
   std::transform(input.child_begin(),
                  input.child_end(),
diff --git a/cpp/tests/copying/pack_tests.cpp b/cpp/tests/copying/pack_tests.cpp
index ea4408efa6a..8a50e071cb9 100644
--- a/cpp/tests/copying/pack_tests.cpp
+++ b/cpp/tests/copying/pack_tests.cpp
@@ -573,6 +573,8 @@ TEST_F(PackUnpackTest, SlicedEmpty)
 
   cudf::table_view t({a, b, c, d});
 
-  auto sliced = cudf::split(t, {0});
-  this->run_test(sliced[0]);
+  auto sliced   = cudf::split(t, {0});
+  auto packed   = cudf::pack(t);
+  auto unpacked = cudf::unpack(packed);
+  CUDF_TEST_EXPECT_TABLES_EQUIVALENT(t, unpacked);
 }
diff --git a/cpp/tests/replace/replace_nulls_tests.cpp b/cpp/tests/replace/replace_nulls_tests.cpp
index 9603ea44a76..fcee27305f2 100644
--- a/cpp/tests/replace/replace_nulls_tests.cpp
+++ b/cpp/tests/replace/replace_nulls_tests.cpp
@@ -674,7 +674,7 @@ TEST_F(ReplaceDictionaryTest, ReplaceNullsEmpty)
   cudf::test::fixed_width_column_wrapper<int64_t> input_empty_w({});
   auto input_empty = cudf::dictionary::encode(input_empty_w);
   auto result      = cudf::replace_nulls(input_empty->view(), input_empty->view());
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result->view(), input_empty->view());
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), input_empty->view());
 }
 
 TEST_F(ReplaceDictionaryTest, ReplaceNullsNoNulls)

From c20d6b3a3588c70d985e0d737fed844a9c0c6426 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Wed, 14 Aug 2024 09:07:51 -0400
Subject: [PATCH 660/842] Remove unneeded output size parameter from internal
 count_matches utility (#16531)

Removes `output_size` parameter from `cudf::strings::detail::count_matches` utility since the output size should equal the input size from the first parameter. This also removes an unnecessary `assert()` call. The parameter became unnecessary as part of the large strings work.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Shruti Shivakumar (https://github.com/shrshi)

URL: https://github.com/rapidsai/cudf/pull/16531
---
 cpp/src/strings/contains.cu            | 2 +-
 cpp/src/strings/count_matches.cu       | 9 +++------
 cpp/src/strings/count_matches.hpp      | 2 --
 cpp/src/strings/extract/extract_all.cu | 2 +-
 cpp/src/strings/search/findall.cu      | 2 +-
 cpp/src/strings/split/split_re.cu      | 6 +++---
 6 files changed, 9 insertions(+), 14 deletions(-)

diff --git a/cpp/src/strings/contains.cu b/cpp/src/strings/contains.cu
index 718ac41e36c..79d241205df 100644
--- a/cpp/src/strings/contains.cu
+++ b/cpp/src/strings/contains.cu
@@ -112,7 +112,7 @@ std::unique_ptr<column> count_re(strings_column_view const& input,
 
   auto const d_strings = column_device_view::create(input.parent(), stream);
 
-  auto result = count_matches(*d_strings, *d_prog, input.size(), stream, mr);
+  auto result = count_matches(*d_strings, *d_prog, stream, mr);
   if (input.has_nulls()) {
     result->set_null_mask(cudf::detail::copy_bitmask(input.parent(), stream, mr),
                           input.null_count());
diff --git a/cpp/src/strings/count_matches.cu b/cpp/src/strings/count_matches.cu
index e8672ea5335..4ad3a75baf7 100644
--- a/cpp/src/strings/count_matches.cu
+++ b/cpp/src/strings/count_matches.cu
@@ -60,18 +60,15 @@ struct count_fn {
 
 std::unique_ptr<column> count_matches(column_device_view const& d_strings,
                                       reprog_device& d_prog,
-                                      size_type output_size,
                                       rmm::cuda_stream_view stream,
                                       rmm::device_async_resource_ref mr)
 {
-  assert(output_size >= d_strings.size() and "Unexpected output size");
-
   auto results = make_numeric_column(
-    data_type{type_to_id<size_type>()}, output_size, mask_state::UNALLOCATED, stream, mr);
+    data_type{type_to_id<size_type>()}, d_strings.size(), mask_state::UNALLOCATED, stream, mr);
 
-  if (d_strings.size() == 0) return results;
+  if (d_strings.size() == 0) { return results; }
 
-  auto d_results = results->mutable_view().data<int32_t>();
+  auto d_results = results->mutable_view().data<cudf::size_type>();
 
   launch_transform_kernel(count_fn{d_strings}, d_prog, d_results, d_strings.size(), stream);
 
diff --git a/cpp/src/strings/count_matches.hpp b/cpp/src/strings/count_matches.hpp
index 4a5efac37fd..eab9863b975 100644
--- a/cpp/src/strings/count_matches.hpp
+++ b/cpp/src/strings/count_matches.hpp
@@ -37,14 +37,12 @@ class reprog_device;
  *
  * @param d_strings Device view of the input strings column.
  * @param d_prog Regex instance to evaluate on each string.
- * @param output_size Number of rows for the output column.
  * @param stream CUDA stream used for device memory operations and kernel launches.
  * @param mr Device memory resource used to allocate the returned column's device memory.
  * @return Integer column of match counts
  */
 std::unique_ptr<column> count_matches(column_device_view const& d_strings,
                                       reprog_device& d_prog,
-                                      size_type output_size,
                                       rmm::cuda_stream_view stream,
                                       rmm::device_async_resource_ref mr);
 
diff --git a/cpp/src/strings/extract/extract_all.cu b/cpp/src/strings/extract/extract_all.cu
index 27691068d5a..897eba58833 100644
--- a/cpp/src/strings/extract/extract_all.cu
+++ b/cpp/src/strings/extract/extract_all.cu
@@ -119,7 +119,7 @@ std::unique_ptr<column> extract_all_record(strings_column_view const& input,
 
   // Get the match counts for each string.
   // This column will become the output lists child offsets column.
-  auto counts   = count_matches(*d_strings, *d_prog, strings_count, stream, mr);
+  auto counts   = count_matches(*d_strings, *d_prog, stream, mr);
   auto d_counts = counts->mutable_view().data<size_type>();
 
   // Compute null output rows
diff --git a/cpp/src/strings/search/findall.cu b/cpp/src/strings/search/findall.cu
index 0d0962258cf..2f7e7352458 100644
--- a/cpp/src/strings/search/findall.cu
+++ b/cpp/src/strings/search/findall.cu
@@ -104,7 +104,7 @@ std::unique_ptr<column> findall(strings_column_view const& input,
   auto d_prog = regex_device_builder::create_prog_device(prog, stream);
 
   // Create lists offsets column
-  auto const sizes              = count_matches(*d_strings, *d_prog, strings_count, stream, mr);
+  auto const sizes              = count_matches(*d_strings, *d_prog, stream, mr);
   auto [offsets, total_matches] = cudf::detail::make_offsets_child_column(
     sizes->view().begin<size_type>(), sizes->view().end<size_type>(), stream, mr);
   auto const d_offsets = offsets->view().data<size_type>();
diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu
index e0aacf07ef0..d273c93ec12 100644
--- a/cpp/src/strings/split/split_re.cu
+++ b/cpp/src/strings/split/split_re.cu
@@ -210,8 +210,8 @@ std::unique_ptr<table> split_re(strings_column_view const& input,
   auto d_strings = column_device_view::create(input.parent(), stream);
 
   // count the number of delimiters matched in each string
-  auto const counts = count_matches(
-    *d_strings, *d_prog, strings_count, stream, rmm::mr::get_current_device_resource());
+  auto const counts =
+    count_matches(*d_strings, *d_prog, stream, rmm::mr::get_current_device_resource());
 
   // get the split tokens from the input column; this also converts the counts into offsets
   auto [tokens, offsets] =
@@ -275,7 +275,7 @@ std::unique_ptr<column> split_record_re(strings_column_view const& input,
   auto d_strings = column_device_view::create(input.parent(), stream);
 
   // count the number of delimiters matched in each string
-  auto counts = count_matches(*d_strings, *d_prog, strings_count, stream, mr);
+  auto counts = count_matches(*d_strings, *d_prog, stream, mr);
 
   // get the split tokens from the input column; this also converts the counts into offsets
   auto [tokens, offsets] =

From bf3372b1aa02939db32b2df62ab816a0eb9abdde Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Wed, 14 Aug 2024 12:06:29 -0500
Subject: [PATCH 661/842] Switch python version to `3.10` in `cudf.pandas`
 pandas test scripts (#16559)

python 3.9 support was recently dropped in rapids, hence changing the python version to 3.10

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16559
---
 ci/cudf_pandas_scripts/pandas-tests/diff.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/cudf_pandas_scripts/pandas-tests/diff.sh b/ci/cudf_pandas_scripts/pandas-tests/diff.sh
index 6cf70a2347f..5dbb4ba991c 100755
--- a/ci/cudf_pandas_scripts/pandas-tests/diff.sh
+++ b/ci/cudf_pandas_scripts/pandas-tests/diff.sh
@@ -12,7 +12,7 @@ RAPIDS_FULL_VERSION=$(<./VERSION)
 rapids-logger "Github job name: ${GH_JOB_NAME}"
 rapids-logger "Rapids version: ${RAPIDS_FULL_VERSION}"
 
-PY_VER="39"
+PY_VER="310"
 MAIN_ARTIFACT=$(rapids-s3-path)cuda12_$(arch)_py${PY_VER}.main-${RAPIDS_FULL_VERSION}-results.json
 PR_ARTIFACT=$(rapids-s3-path)cuda12_$(arch)_py${PY_VER}.pr-${RAPIDS_FULL_VERSION}-results.json
 

From 496151225aaf90318c089939d3a74e6ccee4e28d Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Wed, 14 Aug 2024 17:32:28 -0500
Subject: [PATCH 662/842] Ensure managed memory is supported in cudf.pandas.
 (#16552)

Currently, WSL users of `cudf.pandas` can try to enable UVM (managed
memory) but it is not supported by the driver. This PR detects whether
UVM is supported before enabling a managed memory pool or prefetching.

Closes https://github.com/rapidsai/cudf/issues/16551.

---------

Co-authored-by: Vyas Ramasubramani <vyas.ramasubramani@gmail.com>
Co-authored-by: Lawrence Mitchell <lmitchell@nvidia.com>
---
 dependencies.yaml                            |  2 +-
 docs/cudf/source/cudf_pandas/how-it-works.md | 21 ++++++++++++-----
 python/cudf/cudf/_lib/pylibcudf/utils.pyx    | 22 ++++++++++++++++++
 python/cudf/cudf/pandas/__init__.py          | 24 ++++++++++++++++----
 python/cudf_polars/pyproject.toml            |  2 +-
 5 files changed, 59 insertions(+), 12 deletions(-)

diff --git a/dependencies.yaml b/dependencies.yaml
index 7ecce362101..4c93ef60dd3 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -631,7 +631,7 @@ dependencies:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
-          - polars>=1.0
+          - polars>=1.0,<1.3
   run_dask_cudf:
     common:
       - output_types: [conda, requirements, pyproject]
diff --git a/docs/cudf/source/cudf_pandas/how-it-works.md b/docs/cudf/source/cudf_pandas/how-it-works.md
index 8efd9d7e063..0bb87f60afe 100644
--- a/docs/cudf/source/cudf_pandas/how-it-works.md
+++ b/docs/cudf/source/cudf_pandas/how-it-works.md
@@ -44,11 +44,20 @@ allocation may be a bottleneck depending on the workload. Managed memory
 enables oversubscribing GPU memory. This allows cudf.pandas to process
 data larger than GPU memory in many cases, without CPU (Pandas) fallback.
 
+```{note}
+CUDA Managed Memory on Windows, and more specifically Windows Subsystem for
+Linux (WSL2), [does not support oversubscription](
+https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#system-requirements-for-unified-memory),
+only unified addressing. Furthermore, managed memory on WSL2 has undesirable
+performance characteristics. Therefore, `cudf.pandas` uses a non-managed pool
+allocator on WSL2, so `cudf.pandas` is limited to the physical size of GPU memory.
+```
+
 Other memory allocators can be used by changing the environment
-variable `CUDF_PANDAS_RMM_MODE` to one of the following.
+variable `CUDF_PANDAS_RMM_MODE` to one of the following:
 
-1. "managed_pool" (default): CUDA Unified Memory (managed memory) with RMM's asynchronous pool allocator.
-2. "managed": CUDA Unified Memory, (managed memory) with no pool allocator.
-3. "async": CUDA's built-in pool asynchronous pool allocator with normal CUDA device memory.
-4. "pool": RMM's asynchronous pool allocator with normal CUDA device memory.
-5. "cuda": normal CUDA device memory with no pool allocator.
+1. `"managed_pool"` (default, if supported): CUDA Unified Memory (managed memory) with RMM's asynchronous pool allocator.
+2. `"managed"`: CUDA Unified Memory, (managed memory) with no pool allocator.
+3. `"async"`: CUDA's built-in pool asynchronous pool allocator with normal CUDA device memory.
+4. `"pool"` (default if `"managed_pool"` is not supported): RMM's asynchronous pool allocator with normal CUDA device memory.
+5. `"cuda"`: normal CUDA device memory with no pool allocator.
diff --git a/python/cudf/cudf/_lib/pylibcudf/utils.pyx b/python/cudf/cudf/_lib/pylibcudf/utils.pyx
index b4427e8ecff..42e3575ed44 100644
--- a/python/cudf/cudf/_lib/pylibcudf/utils.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/utils.pyx
@@ -6,6 +6,8 @@ from libc.stdint cimport uintptr_t
 from libcpp.functional cimport reference_wrapper
 from libcpp.vector cimport vector
 
+from cuda import cudart
+
 from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
 from cudf._lib.pylibcudf.libcudf.types cimport bitmask_type
 
@@ -34,3 +36,23 @@ cdef vector[reference_wrapper[const scalar]] _as_vector(list source):
         c_scalars.push_back(
             reference_wrapper[constscalar](dereference((<Scalar?>slr).c_obj)))
     return c_scalars
+
+
+def _is_concurrent_managed_access_supported():
+    """Check the availability of concurrent managed access (UVM).
+
+    Note that WSL2 does not support managed memory.
+    """
+
+    # Ensure CUDA is initialized before checking cudaDevAttrConcurrentManagedAccess
+    cudart.cudaFree(0)
+
+    device_id = 0
+    err, supports_managed_access = cudart.cudaDeviceGetAttribute(
+        cudart.cudaDeviceAttr.cudaDevAttrConcurrentManagedAccess, device_id
+    )
+    if err != cudart.cudaError_t.cudaSuccess:
+        raise RuntimeError(
+            f"Failed to check cudaDevAttrConcurrentManagedAccess with error {err}"
+        )
+    return supports_managed_access != 0
diff --git a/python/cudf/cudf/pandas/__init__.py b/python/cudf/cudf/pandas/__init__.py
index a6667a7bcd9..e88e795671e 100644
--- a/python/cudf/cudf/pandas/__init__.py
+++ b/python/cudf/cudf/pandas/__init__.py
@@ -26,8 +26,8 @@
 }
 
 
-def _enable_managed_prefetching(rmm_mode):
-    if "managed" in rmm_mode:
+def _enable_managed_prefetching(rmm_mode, managed_memory_is_supported):
+    if managed_memory_is_supported and "managed" in rmm_mode:
         for key in _SUPPORTED_PREFETCHES:
             pylibcudf.experimental.enable_prefetching(key)
 
@@ -40,7 +40,20 @@ def install():
     global LOADED
     LOADED = loader is not None
 
-    rmm_mode = os.getenv("CUDF_PANDAS_RMM_MODE", "managed_pool")
+    # The default mode is "managed_pool" if UVM is supported, otherwise "pool"
+    managed_memory_is_supported = (
+        pylibcudf.utils._is_concurrent_managed_access_supported()
+    )
+    default_rmm_mode = (
+        "managed_pool" if managed_memory_is_supported else "pool"
+    )
+    rmm_mode = os.getenv("CUDF_PANDAS_RMM_MODE", default_rmm_mode)
+
+    if "managed" in rmm_mode and not managed_memory_is_supported:
+        raise ValueError(
+            f"Managed memory is not supported on this system, so the requested {rmm_mode=} is invalid."
+        )
+
     # Check if a non-default memory resource is set
     current_mr = rmm.mr.get_current_device_resource()
     if not isinstance(current_mr, rmm.mr.CudaMemoryResource):
@@ -53,6 +66,7 @@ def install():
     free_memory, _ = rmm.mr.available_device_memory()
     free_memory = int(round(float(free_memory) * 0.80 / 256) * 256)
     new_mr = current_mr
+
     if rmm_mode == "pool":
         new_mr = rmm.mr.PoolMemoryResource(
             current_mr,
@@ -71,8 +85,10 @@ def install():
         )
     elif rmm_mode != "cuda":
         raise ValueError(f"Unsupported {rmm_mode=}")
+
     rmm.mr.set_current_device_resource(new_mr)
-    _enable_managed_prefetching(rmm_mode)
+
+    _enable_managed_prefetching(rmm_mode, managed_memory_is_supported)
 
 
 def pytest_load_initial_conftests(early_config, parser, args):
diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml
index def1d086cc1..7b29ad3373d 100644
--- a/python/cudf_polars/pyproject.toml
+++ b/python/cudf_polars/pyproject.toml
@@ -20,7 +20,7 @@ license = { text = "Apache 2.0" }
 requires-python = ">=3.9"
 dependencies = [
     "cudf==24.8.*,>=0.0.0a0",
-    "polars>=1.0",
+    "polars>=1.0,<1.3",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
     "Intended Audience :: Developers",

From d684ae0e80d179d4d711c00278d00b5f66625303 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 14 Aug 2024 12:36:51 -1000
Subject: [PATCH 663/842] Raise NotImplementedError for Series.rename that's
 not a scalar (#16525)

xref https://github.com/rapidsai/cudf/issues/16507

Raising a `NotImplementedError` gives a chance for this work in `cudf.pandas`

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16525
---
 python/cudf/cudf/core/series.py       | 4 ++++
 python/cudf/cudf/tests/test_series.py | 7 +++++++
 2 files changed, 11 insertions(+)

diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 53675d339ac..822b966364f 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -3589,6 +3589,10 @@ def rename(
             raise NotImplementedError("level is currently not supported.")
         if errors != "ignore":
             raise NotImplementedError("errors is currently not supported.")
+        if not is_scalar(index):
+            raise NotImplementedError(
+                ".rename does not currently support relabeling the index."
+            )
         out_data = self._data.copy(deep=copy)
         return Series._from_data(out_data, self.index, name=index)
 
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index 6a1887afb1f..c7aea563535 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -2289,6 +2289,13 @@ def test_series_rename(initial_name, name):
     assert_eq(actual, expected)
 
 
+@pytest.mark.parametrize("index", [lambda x: x * 2, {1: 2}])
+def test_rename_index_not_supported(index):
+    ser = cudf.Series(range(2))
+    with pytest.raises(NotImplementedError):
+        ser.rename(index=index)
+
+
 @pytest.mark.parametrize(
     "data",
     [

From 0253e976ede25d954c607663da61b445e213523f Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Wed, 14 Aug 2024 21:27:52 -0400
Subject: [PATCH 664/842] [FEA] Support named aggregations in
 `df.groupby().agg()` (#16528)

Closes #15967

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/16528
---
 python/cudf/cudf/core/column_accessor.py      |  7 +---
 python/cudf/cudf/core/groupby/groupby.py      | 41 ++++++++++++-------
 python/cudf/cudf/tests/groupby/test_agg.py    | 30 ++++++++++++++
 .../cudf/cudf/tests/test_column_accessor.py   |  4 ++
 python/cudf/cudf/tests/test_dataframe.py      |  1 -
 5 files changed, 62 insertions(+), 21 deletions(-)

diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py
index 83596704672..48bc84070b1 100644
--- a/python/cudf/cudf/core/column_accessor.py
+++ b/python/cudf/cudf/core/column_accessor.py
@@ -610,7 +610,7 @@ def _pad_key(self, key: Any, pad_value="") -> Any:
         return key + (pad_value,) * (self.nlevels - len(key))
 
     def rename_levels(
-        self, mapper: Mapping[Any, Any] | Callable, level: int | None
+        self, mapper: Mapping[Any, Any] | Callable, level: int | None = None
     ) -> ColumnAccessor:
         """
         Rename the specified levels of the given ColumnAccessor
@@ -653,10 +653,7 @@ def rename_column(x):
                 return x
 
             if level is None:
-                raise NotImplementedError(
-                    "Renaming columns with a MultiIndex and level=None is"
-                    "not supported"
-                )
+                level = 0
             new_col_names = (rename_column(k) for k in self.keys())
 
         else:
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index 92c4b73ceaa..9b71ea57f1f 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -548,7 +548,7 @@ def _groupby(self):
         )
 
     @_performance_tracking
-    def agg(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
+    def agg(self, func=None, *args, engine=None, engine_kwargs=None, **kwargs):
         """
         Apply aggregation(s) to the groups.
 
@@ -648,11 +648,10 @@ def agg(self, func, *args, engine=None, engine_kwargs=None, **kwargs):
             raise NotImplementedError(
                 "Passing args to func is currently not supported."
             )
-        if kwargs:
-            raise NotImplementedError(
-                "Passing kwargs to func is currently not supported."
-            )
-        column_names, columns, normalized_aggs = self._normalize_aggs(func)
+
+        column_names, columns, normalized_aggs = self._normalize_aggs(
+            func, **kwargs
+        )
         orig_dtypes = tuple(c.dtype for c in columns)
 
         # Note: When there are no key columns, the below produces
@@ -1266,11 +1265,11 @@ def _grouped(self, *, include_groups: bool = True):
         return (group_names, offsets, grouped_keys, grouped_values)
 
     def _normalize_aggs(
-        self, aggs: MultiColumnAggType
+        self, aggs: MultiColumnAggType, **kwargs
     ) -> tuple[Iterable[Any], tuple[ColumnBase, ...], list[list[AggType]]]:
         """
         Normalize aggs to a list of list of aggregations, where `out[i]`
-        is a list of aggregations for column `self.obj[i]`. We support three
+        is a list of aggregations for column `self.obj[i]`. We support four
         different form of `aggs` input here:
         - A single agg, such as "sum". This agg is applied to all value
         columns.
@@ -1279,18 +1278,30 @@ def _normalize_aggs(
         - A mapping of column name to aggs, such as
         {"a": ["sum"], "b": ["mean"]}, the aggs are applied to specified
         column.
+        - Pairs of column name and agg tuples passed as kwargs
+        eg. col1=("a", "sum"), col2=("b", "prod"). The output column names are
+        the keys. The aggs are applied to the corresponding column in the tuple.
         Each agg can be string or lambda functions.
         """
 
         aggs_per_column: Iterable[AggType | Iterable[AggType]]
-        if isinstance(aggs, dict):
-            column_names, aggs_per_column = aggs.keys(), aggs.values()
-            columns = tuple(self.obj._data[col] for col in column_names)
+        # TODO: Remove isinstance condition when the legacy dask_cudf API is removed.
+        # See https://github.com/rapidsai/cudf/pull/16528#discussion_r1715482302 for information.
+        if aggs or isinstance(aggs, dict):
+            if isinstance(aggs, dict):
+                column_names, aggs_per_column = aggs.keys(), aggs.values()
+                columns = tuple(self.obj._data[col] for col in column_names)
+            else:
+                values = self.grouping.values
+                column_names = values._column_names
+                columns = values._columns
+                aggs_per_column = (aggs,) * len(columns)
+        elif not aggs and kwargs:
+            column_names, aggs_per_column = kwargs.keys(), kwargs.values()
+            columns = tuple(self.obj._data[x[0]] for x in kwargs.values())
+            aggs_per_column = tuple(x[1] for x in kwargs.values())
         else:
-            values = self.grouping.values
-            column_names = values._column_names
-            columns = values._columns
-            aggs_per_column = (aggs,) * len(columns)
+            raise TypeError("Must provide at least one aggregation function.")
 
         # is_list_like performs type narrowing but type-checkers don't
         # know it. One could add a TypeGuard annotation to
diff --git a/python/cudf/cudf/tests/groupby/test_agg.py b/python/cudf/cudf/tests/groupby/test_agg.py
index f8847f02d5a..99e7523031b 100644
--- a/python/cudf/cudf/tests/groupby/test_agg.py
+++ b/python/cudf/cudf/tests/groupby/test_agg.py
@@ -3,6 +3,7 @@
 import pytest
 
 import cudf
+from cudf.testing import assert_eq
 
 
 @pytest.mark.parametrize(
@@ -26,3 +27,32 @@ def test_series_agg(attr):
     pd_agg = getattr(pdf.groupby(["a"])["a"], attr)("count")
 
     assert agg.ndim == pd_agg.ndim
+
+
+@pytest.mark.parametrize("func", ["sum", "prod", "mean", "count"])
+@pytest.mark.parametrize("attr", ["agg", "aggregate"])
+def test_dataframe_agg(attr, func):
+    df = cudf.DataFrame({"a": [1, 2, 1, 2], "b": [0, 0, 0, 0]})
+    pdf = df.to_pandas()
+
+    agg = getattr(df.groupby("a"), attr)(func)
+    pd_agg = getattr(pdf.groupby(["a"]), attr)(func)
+
+    assert_eq(agg, pd_agg)
+
+    agg = getattr(df.groupby("a"), attr)({"b": func})
+    pd_agg = getattr(pdf.groupby(["a"]), attr)({"b": func})
+
+    assert_eq(agg, pd_agg)
+
+    agg = getattr(df.groupby("a"), attr)([func])
+    pd_agg = getattr(pdf.groupby(["a"]), attr)([func])
+
+    assert_eq(agg, pd_agg)
+
+    agg = getattr(df.groupby("a"), attr)(foo=("b", func), bar=("a", func))
+    pd_agg = getattr(pdf.groupby(["a"]), attr)(
+        foo=("b", func), bar=("a", func)
+    )
+
+    assert_eq(agg, pd_agg)
diff --git a/python/cudf/cudf/tests/test_column_accessor.py b/python/cudf/cudf/tests/test_column_accessor.py
index e84e1433c10..2d7bc809d4d 100644
--- a/python/cudf/cudf/tests/test_column_accessor.py
+++ b/python/cudf/cudf/tests/test_column_accessor.py
@@ -362,6 +362,10 @@ def test_replace_level_values_MultiColumn():
     got = ca.rename_levels(mapper={"a": "f"}, level=0)
     check_ca_equal(expect, got)
 
+    # passing without level kwarg assumes level=0
+    got = ca.rename_levels(mapper={"a": "f"})
+    check_ca_equal(expect, got)
+
 
 def test_clear_nrows_empty_before():
     ca = ColumnAccessor({})
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 2c59253d500..89eb5a12c71 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -9409,7 +9409,6 @@ def test_rename_for_level_RangeIndex_dataframe():
     assert_eq(expect, got)
 
 
-@pytest_xfail(reason="level=None not implemented yet")
 def test_rename_for_level_is_None_MC():
     gdf = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]})
     gdf.columns = pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1)])

From 19846b6c0ac40fc91ad28573af04ac7403754acb Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 14 Aug 2024 17:15:03 -1000
Subject: [PATCH 665/842] Disallow cudf.Index accepting column in favor of
 ._from_column (#16549)

Similar to https://github.com/rapidsai/cudf/pull/16454, this PR disallows the public `cudf.Index` accepting a private `ColumnBase` object in favor of `_from_column` (which was added in the linked PR)

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16549
---
 python/cudf/cudf/_lib/parquet.pyx           |   4 +-
 python/cudf/cudf/_lib/utils.pyx             |   6 +-
 python/cudf/cudf/api/types.py               |   2 +-
 python/cudf/cudf/core/_base_index.py        |   2 +-
 python/cudf/cudf/core/algorithms.py         |   6 +-
 python/cudf/cudf/core/column/categorical.py |   8 +-
 python/cudf/cudf/core/column/datetime.py    |  10 +-
 python/cudf/cudf/core/column/methods.py     |   6 +-
 python/cudf/cudf/core/column/string.py      |   2 +-
 python/cudf/cudf/core/cut.py                |   2 +-
 python/cudf/cudf/core/dataframe.py          |   8 +-
 python/cudf/cudf/core/dtypes.py             |  14 +-
 python/cudf/cudf/core/groupby/groupby.py    |   9 +-
 python/cudf/cudf/core/index.py              | 238 ++++++++++++--------
 python/cudf/cudf/core/indexed_frame.py      |  24 +-
 python/cudf/cudf/core/multiindex.py         |   7 +-
 python/cudf/cudf/core/resample.py           |   4 +-
 python/cudf/cudf/core/series.py             |   4 +-
 python/cudf/cudf/core/tools/datetimes.py    |  16 +-
 python/cudf/cudf/testing/testing.py         |   8 +-
 python/cudf/cudf/tests/test_multiindex.py   |   4 +-
 python/cudf/cudf/tests/test_string.py       |   2 +-
 22 files changed, 232 insertions(+), 154 deletions(-)

diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
index 4a4b13b0b31..0fffb6ade58 100644
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ b/python/cudf/cudf/_lib/parquet.pyx
@@ -222,7 +222,7 @@ cdef object _process_metadata(object df,
                 if len(filtered_idx) > 0:
                     idx = cudf.concat(filtered_idx)
                 else:
-                    idx = cudf.Index(cudf.core.column.column_empty(0))
+                    idx = cudf.Index._from_column(cudf.core.column.column_empty(0))
             else:
                 start = range_index_meta["start"] + skip_rows
                 stop = range_index_meta["stop"]
@@ -240,7 +240,7 @@ cdef object _process_metadata(object df,
             index_data = df[index_col]
             actual_index_names = list(index_col_names.values())
             if len(index_data._data) == 1:
-                idx = cudf.Index(
+                idx = cudf.Index._from_column(
                     index_data._data.columns[0],
                     name=actual_index_names[0]
                 )
diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx
index f136cd997a7..267432a0182 100644
--- a/python/cudf/cudf/_lib/utils.pyx
+++ b/python/cudf/cudf/_lib/utils.pyx
@@ -93,12 +93,12 @@ cpdef generate_pandas_metadata(table, index):
     materialize_index = False
     if index is not False:
         for level, name in enumerate(table._index.names):
-            if isinstance(table._index, cudf.core.multiindex.MultiIndex):
+            if isinstance(table._index, cudf.MultiIndex):
                 idx = table.index.get_level_values(level)
             else:
                 idx = table.index
 
-            if isinstance(idx, cudf.core.index.RangeIndex):
+            if isinstance(idx, cudf.RangeIndex):
                 if index is None:
                     descr = {
                         "kind": "range",
@@ -110,7 +110,7 @@ cpdef generate_pandas_metadata(table, index):
                 else:
                     materialize_index = True
                     # When `index=True`, RangeIndex needs to be materialized.
-                    materialized_idx = cudf.Index(idx._values, name=idx.name)
+                    materialized_idx = idx._as_int_index()
                     descr = _index_level_name(
                         index_name=materialized_idx.name,
                         level=level,
diff --git a/python/cudf/cudf/api/types.py b/python/cudf/cudf/api/types.py
index 294ae2fd985..9c436dfad18 100644
--- a/python/cudf/cudf/api/types.py
+++ b/python/cudf/cudf/api/types.py
@@ -249,7 +249,7 @@ def _union_categoricals(
             new_categories=sorted_categories
         )
 
-    return cudf.Index(result_col)
+    return cudf.CategoricalIndex._from_column(result_col)
 
 
 def is_bool_dtype(arr_or_dtype):
diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py
index c91514202c5..d13351c49dd 100644
--- a/python/cudf/cudf/core/_base_index.py
+++ b/python/cudf/cudf/core/_base_index.py
@@ -1979,7 +1979,7 @@ def from_pandas(cls, index: pd.Index, nan_as_null=no_default):
                 name=index.name,
             )
         else:
-            return cudf.Index(
+            return cudf.Index._from_column(
                 column.as_column(index, nan_as_null=nan_as_null),
                 name=index.name,
             )
diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py
index 6c69fbd2637..e27d6ec8d3e 100644
--- a/python/cudf/cudf/core/algorithms.py
+++ b/python/cudf/cudf/core/algorithms.py
@@ -8,7 +8,7 @@
 import numpy as np
 
 from cudf.core.column import as_column
-from cudf.core.index import RangeIndex, ensure_index
+from cudf.core.index import Index, RangeIndex
 from cudf.core.scalar import Scalar
 from cudf.options import get_option
 from cudf.utils.dtypes import can_convert_to_column
@@ -112,7 +112,9 @@ def factorize(values, sort=False, use_na_sentinel=True, size_hint=None):
         dtype="int64" if get_option("mode.pandas_compatible") else None,
     ).values
 
-    return labels, cats.values if return_cupy_array else ensure_index(cats)
+    return labels, cats.values if return_cupy_array else Index._from_column(
+        cats
+    )
 
 
 def _interpolation(column: ColumnBase, index: BaseIndex) -> ColumnBase:
diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index 6fa69eb9cc1..d25983842f9 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -601,11 +601,13 @@ def __setitem__(self, key, value):
             to_add_categories = 0
         else:
             if cudf.api.types.is_scalar(value):
-                arr = [value]
+                arr = column.as_column(value, length=1, nan_as_null=False)
             else:
-                arr = value
+                arr = column.as_column(value, nan_as_null=False)
             to_add_categories = len(
-                cudf.Index(arr, nan_as_null=False).difference(self.categories)
+                cudf.Index._from_column(arr).difference(
+                    cudf.Index._from_column(self.categories)
+                )
             )
 
         if to_add_categories > 0:
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index ce67ce81e6b..1dbc94384d3 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -250,6 +250,10 @@ def __contains__(self, item: ScalarLike) -> bool:
     def time_unit(self) -> str:
         return np.datetime_data(self.dtype)[0]
 
+    @property
+    def quarter(self) -> ColumnBase:
+        return libcudf.datetime.extract_quarter(self)
+
     @property
     def year(self) -> ColumnBase:
         return self.get_dt_field("year")
@@ -308,7 +312,7 @@ def is_quarter_start(self) -> ColumnBase:
     @property
     def is_year_end(self) -> ColumnBase:
         day_of_year = self.day_of_year
-        leap_dates = libcudf.datetime.is_leap_year(self)
+        leap_dates = self.is_leap_year
 
         leap = day_of_year == cudf.Scalar(366)
         non_leap = day_of_year == cudf.Scalar(365)
@@ -316,6 +320,10 @@ def is_year_end(self) -> ColumnBase:
             False
         )
 
+    @property
+    def is_leap_year(self) -> ColumnBase:
+        return libcudf.datetime.is_leap_year(self)
+
     @property
     def is_year_start(self) -> ColumnBase:
         return (self.day_of_year == 1).fillna(False)
diff --git a/python/cudf/cudf/core/column/methods.py b/python/cudf/cudf/core/column/methods.py
index 8c46d238057..05a0ab2e09a 100644
--- a/python/cudf/cudf/core/column/methods.py
+++ b/python/cudf/cudf/core/column/methods.py
@@ -65,8 +65,8 @@ def _return_or_inplace(
         """
         if inplace:
             self._parent._mimic_inplace(
-                self._parent.__class__._from_data(
-                    {self._parent.name: new_col}
+                type(self._parent)._from_column(
+                    new_col, name=self._parent.name
                 ),
                 inplace=True,
             )
@@ -92,6 +92,6 @@ def _return_or_inplace(
                     index=self._parent.index if retain_index else None,
                 )
             elif isinstance(self._parent, cudf.BaseIndex):
-                return cudf.Index(new_col, name=self._parent.name)
+                return cudf.Index._from_column(new_col, name=self._parent.name)
             else:
                 return self._parent._mimic_inplace(new_col, inplace=False)
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 1a4b558749d..a710a9f46c2 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -4693,7 +4693,7 @@ def character_tokenize(self) -> SeriesOrIndex:
                 result_col, name=self._parent.name, index=index
             )
         elif isinstance(self._parent, cudf.BaseIndex):
-            return cudf.Index(result_col, name=self._parent.name)
+            return cudf.Index._from_column(result_col, name=self._parent.name)
         else:
             return result_col
 
diff --git a/python/cudf/cudf/core/cut.py b/python/cudf/cudf/core/cut.py
index 197f46ee9fe..a4ceea266b4 100644
--- a/python/cudf/cudf/core/cut.py
+++ b/python/cudf/cudf/core/cut.py
@@ -292,7 +292,7 @@ def cut(
     )
 
     # we return a categorical index, as we don't have a Categorical method
-    categorical_index = cudf.CategoricalIndex._from_data({None: col})
+    categorical_index = cudf.CategoricalIndex._from_column(col)
 
     if isinstance(orig_x, (pd.Series, cudf.Series)):
         # if we have a series input we return a series output
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index a53c7bcc63c..3033abd53f5 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -326,7 +326,7 @@ def _getitem_tuple_arg(self, arg):
                                 range(len(tmp_arg[0]))
                             )
                         },
-                        index=cudf.Index(tmp_arg[0]),
+                        index=cudf.Index._from_column(tmp_arg[0]),
                     )
                     columns_df[cantor_name] = column.as_column(
                         range(len(columns_df))
@@ -1758,7 +1758,7 @@ def _concat(
         for cols in columns:
             table_index = None
             if 1 == first_data_column_position:
-                table_index = cudf.Index(cols[0])
+                table_index = cudf.Index._from_column(cols[0])
             elif first_data_column_position > 1:
                 table_index = cudf.MultiIndex._from_data(
                     data=dict(
@@ -1810,7 +1810,7 @@ def _concat(
             if not isinstance(out.index, MultiIndex) and isinstance(
                 out.index.dtype, cudf.CategoricalDtype
             ):
-                out = out.set_index(cudf.Index(out.index._values))
+                out = out.set_index(out.index)
         for name, col in out._data.items():
             out._data[name] = col._with_type_metadata(
                 tables[0]._data[name].dtype
@@ -3007,7 +3007,7 @@ def set_index(
             and not isinstance(keys[0], (cudf.MultiIndex, pd.MultiIndex))
         ):
             # Don't turn single level MultiIndex into an Index
-            idx = cudf.Index(data_to_add[0], name=names[0])
+            idx = cudf.Index._from_column(data_to_add[0], name=names[0])
         else:
             idx = MultiIndex._from_data(dict(enumerate(data_to_add)))
             idx.names = names
diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index 27afec18b4e..6d532e01cba 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -182,7 +182,7 @@ def __init__(self, categories=None, ordered: bool = False) -> None:
         self._ordered = ordered
 
     @property
-    def categories(self) -> "cudf.core.index.Index":
+    def categories(self) -> cudf.Index:
         """
         An ``Index`` containing the unique categories allowed.
 
@@ -194,10 +194,12 @@ def categories(self) -> "cudf.core.index.Index":
         Index(['b', 'a'], dtype='object')
         """
         if self._categories is None:
-            return cudf.Index(
-                cudf.core.column.column_empty(0, dtype="object", masked=False)
+            col = cudf.core.column.column_empty(
+                0, dtype="object", masked=False
             )
-        return cudf.Index(self._categories, copy=False)
+        else:
+            col = self._categories
+        return cudf.Index._from_column(col)
 
     @property
     def type(self):
@@ -259,7 +261,9 @@ def to_pandas(self) -> pd.CategoricalDtype:
             categories = self._categories.to_pandas()
         return pd.CategoricalDtype(categories=categories, ordered=self.ordered)
 
-    def _init_categories(self, categories: Any):
+    def _init_categories(
+        self, categories: Any
+    ) -> cudf.core.column.ColumnBase | None:
         if categories is None:
             return categories
         if len(categories) == 0 and not isinstance(
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index 9b71ea57f1f..4f283d41b17 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -403,8 +403,7 @@ def indices(self) -> dict[ScalarLike, cp.ndarray]:
         if len(group_keys) > 1:
             index = cudf.MultiIndex.from_arrays(group_keys)
         else:
-            (group_keys,) = group_keys
-            index = cudf.Index(group_keys)
+            index = cudf.Index._from_column(group_keys[0])
         return dict(
             zip(index.to_pandas(), cp.split(indices.values, offsets[1:-1]))
         )
@@ -2583,7 +2582,7 @@ def _mimic_pandas_order(
             # corresponding output rows in pandas, to do that here
             # expand the result by reindexing.
             ri = cudf.RangeIndex(0, len(self.obj))
-            result.index = cudf.Index(ordering)
+            result.index = cudf.Index._from_column(ordering)
             # This reorders and expands
             result = result.reindex(ri)
         else:
@@ -3154,7 +3153,9 @@ def keys(self):
                 dict(zip(range(nkeys), self._key_columns))
             )._set_names(self.names)
         else:
-            return cudf.Index(self._key_columns[0], name=self.names[0])
+            return cudf.Index._from_column(
+                self._key_columns[0], name=self.names[0]
+            )
 
     @property
     def values(self) -> cudf.core.frame.Frame:
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 3eab27bd165..c55f86d48e1 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -18,7 +18,6 @@
 
 import cudf
 from cudf import _lib as libcudf
-from cudf._lib.datetime import extract_quarter, is_leap_year
 from cudf._lib.filling import sequence
 from cudf._lib.search import search_sorted
 from cudf._lib.types import size_type_dtype
@@ -819,22 +818,23 @@ def sort_values(
     @_performance_tracking
     def _gather(self, gather_map, nullify=False, check_bounds=True):
         gather_map = cudf.core.column.as_column(gather_map)
-        return cudf.Index._from_data(
-            {self.name: self._values.take(gather_map, nullify, check_bounds)}
+        return cudf.Index._from_column(
+            self._column.take(gather_map, nullify, check_bounds),
+            name=self.name,
         )
 
     @_performance_tracking
     def _apply_boolean_mask(self, boolean_mask):
-        return cudf.Index._from_data(
-            {self.name: self._values.apply_boolean_mask(boolean_mask)}
+        return cudf.Index._from_column(
+            self._column.apply_boolean_mask(boolean_mask), name=self.name
         )
 
     def repeat(self, repeats, axis=None):
         return self._as_int_index().repeat(repeats, axis)
 
     def _split(self, splits):
-        return cudf.Index._from_data(
-            {self.name: self._as_int_index()._split(splits)}
+        return cudf.Index._from_column(
+            self._as_int_index()._split(splits), name=self.name
         )
 
     def _binaryop(self, other, op: str):
@@ -1087,10 +1087,13 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
     def _from_column(
         cls, column: ColumnBase, *, name: Hashable = None
     ) -> Self:
-        ca = cudf.core.column_accessor.ColumnAccessor(
-            {name: column}, verify=False
-        )
-        return _index_from_data(ca)
+        if cls is Index:
+            ca = cudf.core.column_accessor.ColumnAccessor(
+                {name: column}, verify=False
+            )
+            return _index_from_data(ca)
+        else:
+            return super()._from_column(column, name=name)
 
     @classmethod
     @_performance_tracking
@@ -1223,8 +1226,8 @@ def _concat(cls, objs):
         if all(isinstance(obj, RangeIndex) for obj in non_empties):
             result = _concat_range_index(non_empties)
         else:
-            data = concat_columns([o._values for o in non_empties])
-            result = Index(data)
+            data = concat_columns([o._column for o in non_empties])
+            result = Index._from_column(data)
 
         names = {obj.name for obj in objs}
         if len(names) == 1:
@@ -1491,7 +1494,7 @@ def __repr__(self):
     def __getitem__(self, index):
         res = self._get_elements_from_column(index)
         if isinstance(res, ColumnBase):
-            res = Index(res, name=self.name)
+            res = Index._from_column(res, name=self.name)
         return res
 
     @property  # type: ignore
@@ -1610,8 +1613,8 @@ def _clean_nulls_from_index(self):
                 if isinstance(self, (DatetimeIndex, TimedeltaIndex))
                 else str(cudf.NA)
             )
-            return cudf.Index(
-                self._values.astype("str").fillna(fill_value),
+            return cudf.Index._from_column(
+                self._column.astype("str").fillna(fill_value),
                 name=self.name,
             )
 
@@ -1866,6 +1869,17 @@ def _from_data(
         result._freq = _validate_freq(freq)
         return result
 
+    @classmethod
+    @_performance_tracking
+    def _from_column(
+        cls, column: ColumnBase, *, name: Hashable = None, freq: Any = None
+    ) -> Self:
+        if column.dtype.kind != "M":
+            raise ValueError("column must have a datetime type.")
+        result = super()._from_column(column, name=name)
+        result._freq = _validate_freq(freq)
+        return result
+
     def __getitem__(self, index):
         value = super().__getitem__(index)
         if cudf.get_option("mode.pandas_compatible") and isinstance(
@@ -1923,8 +1937,8 @@ def strftime(self, date_format: str) -> Index:
         date_format : str
             Date format string (e.g. "%Y-%m-%d").
         """
-        return Index._from_data(
-            {self.name: self._column.strftime(date_format)}
+        return Index._from_column(
+            self._column.strftime(date_format), name=self.name
         )
 
     @property
@@ -1989,7 +2003,9 @@ def to_pydatetime(self) -> np.ndarray:
         return self.to_pandas().to_pydatetime()
 
     def to_julian_date(self) -> Index:
-        return Index._from_data({self.name: self._column.to_julian_date()})
+        return Index._from_column(
+            self._column.to_julian_date(), name=self.name
+        )
 
     def to_period(self, freq) -> pd.PeriodIndex:
         return self.to_pandas().to_period(freq=freq)
@@ -2000,7 +2016,9 @@ def normalize(self) -> Self:
 
         Currently not implemented.
         """
-        return type(self)._from_data({self.name: self._column.normalize()})
+        return type(self)._from_column(
+            self._column.normalize(), name=self.name
+        )
 
     @property
     def time(self) -> np.ndarray:
@@ -2084,7 +2102,7 @@ def days_in_month(self) -> Index:
         """
         Get the total number of days in the month that the date falls on.
         """
-        return Index._from_data({self.name: self._column.days_in_month})
+        return Index._from_column(self._column.days_in_month, name=self.name)
 
     daysinmonth = days_in_month
 
@@ -2093,7 +2111,7 @@ def day_of_week(self) -> Index:
         """
         Get the day of week that the date falls on.
         """
-        return Index._from_data({self.name: self._column.day_of_week})
+        return Index._from_column(self._column.day_of_week, name=self.name)
 
     @property  # type: ignore
     @_performance_tracking
@@ -2234,15 +2252,15 @@ def microsecond(self):
         >>> datetime_index.microsecond
         Index([0, 1, 2], dtype='int32')
         """  # noqa: E501
-        return Index(
+        return Index._from_column(
             (
                 # Need to manually promote column to int32 because
                 # pandas-matching binop behaviour requires that this
                 # __mul__ returns an int16 column.
-                self._values.get_dt_field("millisecond").astype("int32")
+                self._column.get_dt_field("millisecond").astype("int32")
                 * cudf.Scalar(1000, dtype="int32")
             )
-            + self._values.get_dt_field("microsecond"),
+            + self._column.get_dt_field("microsecond"),
             name=self.name,
         )
 
@@ -2374,7 +2392,7 @@ def is_leap_year(self) -> cupy.ndarray:
         ndarray
         Booleans indicating if dates belong to a leap year.
         """
-        res = is_leap_year(self._values).fillna(False)
+        res = self._column.is_leap_year.fillna(False)
         return cupy.asarray(res)
 
     @property  # type: ignore
@@ -2400,8 +2418,7 @@ def quarter(self):
         >>> gIndex.quarter
         Index([2, 4], dtype='int8')
         """
-        res = extract_quarter(self._values)
-        return Index(res, dtype="int8")
+        return Index._from_column(self._column.quarter.astype("int8"))
 
     @_performance_tracking
     def day_name(self, locale: str | None = None) -> Index:
@@ -2423,7 +2440,7 @@ def day_name(self, locale: str | None = None) -> Index:
               dtype='object')
         """
         day_names = self._column.get_day_names(locale)
-        return Index._from_data({self.name: day_names})
+        return Index._from_column(day_names, name=self.name)
 
     @_performance_tracking
     def month_name(self, locale: str | None = None) -> Index:
@@ -2442,7 +2459,7 @@ def month_name(self, locale: str | None = None) -> Index:
         Index(['December', 'January', 'January', 'January', 'January', 'February'], dtype='object')
         """
         month_names = self._column.get_month_names(locale)
-        return Index._from_data({self.name: month_names})
+        return Index._from_column(month_names, name=self.name)
 
     @_performance_tracking
     def isocalendar(self) -> cudf.DataFrame:
@@ -2481,14 +2498,14 @@ def to_pandas(
     @_performance_tracking
     def _get_dt_field(self, field: str) -> Index:
         """Return an Index of a numerical component of the DatetimeIndex."""
-        out_column = self._values.get_dt_field(field)
+        out_column = self._column.get_dt_field(field)
         out_column = NumericalColumn(
             data=out_column.base_data,
             dtype=out_column.dtype,
             mask=out_column.base_mask,
             offset=out_column.offset,
         )
-        return Index(out_column, name=self.name)
+        return Index._from_column(out_column, name=self.name)
 
     def _is_boolean(self):
         return False
@@ -2522,9 +2539,7 @@ def ceil(self, freq):
         >>> gIndex.ceil("T")
         DatetimeIndex(['2020-05-31 08:06:00', '1999-12-31 18:41:00'], dtype='datetime64[ns]')
         """  # noqa: E501
-        out_column = self._values.ceil(freq)
-
-        return self.__class__._from_data({self.name: out_column})
+        return type(self)._from_column(self._column.ceil(freq), name=self.name)
 
     @_performance_tracking
     def floor(self, freq):
@@ -2555,9 +2570,9 @@ def floor(self, freq):
         >>> gIndex.floor("T")
         DatetimeIndex(['2020-05-31 08:59:00', '1999-12-31 18:44:00'], dtype='datetime64[ns]')
         """  # noqa: E501
-        out_column = self._values.floor(freq)
-
-        return self.__class__._from_data({self.name: out_column})
+        return type(self)._from_column(
+            self._column.floor(freq), name=self.name
+        )
 
     @_performance_tracking
     def round(self, freq):
@@ -2595,9 +2610,9 @@ def round(self, freq):
         >>> dt_idx.round('T')
         DatetimeIndex(['2001-01-01 00:05:00', '2001-01-01 00:05:00', '2001-01-01 00:05:00'], dtype='datetime64[ns]')
         """  # noqa: E501
-        out_column = self._values.round(freq)
-
-        return self.__class__._from_data({self.name: out_column})
+        return type(self)._from_column(
+            self._column.round(freq), name=self.name
+        )
 
     def tz_localize(
         self,
@@ -2647,8 +2662,8 @@ def tz_localize(
         to 'NaT'.
         """  # noqa: E501
         result_col = self._column.tz_localize(tz, ambiguous, nonexistent)
-        return DatetimeIndex._from_data(
-            {self.name: result_col}, freq=self._freq
+        return DatetimeIndex._from_column(
+            result_col, name=self.name, freq=self._freq
         )
 
     def tz_convert(self, tz: str | None):
@@ -2684,7 +2699,7 @@ def tz_convert(self, tz: str | None):
                       dtype='datetime64[ns, Europe/London]')
         """  # noqa: E501
         result_col = self._column.tz_convert(tz)
-        return DatetimeIndex._from_data({self.name: result_col})
+        return DatetimeIndex._from_column(result_col, name=self.name)
 
     def repeat(self, repeats, axis=None):
         res = super().repeat(repeats, axis=axis)
@@ -2794,6 +2809,15 @@ def __init__(
 
         super().__init__(data, name=name)
 
+    @classmethod
+    @_performance_tracking
+    def _from_column(
+        cls, column: ColumnBase, *, name: Hashable = None, freq: Any = None
+    ) -> Self:
+        if column.dtype.kind != "m":
+            raise ValueError("column must have a timedelta type.")
+        return super()._from_column(column, name=name)
+
     def __getitem__(self, index):
         value = super().__getitem__(index)
         if cudf.get_option("mode.pandas_compatible") and isinstance(
@@ -2876,7 +2900,7 @@ def ceil(self, freq: str) -> Self:
 
         This method is currently not implemented.
         """
-        return type(self)._from_data({self.name: self._column.ceil(freq)})
+        return type(self)._from_column(self._column.ceil(freq), name=self.name)
 
     def floor(self, freq: str) -> Self:
         """
@@ -2884,7 +2908,9 @@ def floor(self, freq: str) -> Self:
 
         This method is currently not implemented.
         """
-        return type(self)._from_data({self.name: self._column.floor(freq)})
+        return type(self)._from_column(
+            self._column.floor(freq), name=self.name
+        )
 
     def round(self, freq: str) -> Self:
         """
@@ -2892,41 +2918,51 @@ def round(self, freq: str) -> Self:
 
         This method is currently not implemented.
         """
-        return type(self)._from_data({self.name: self._column.round(freq)})
+        return type(self)._from_column(
+            self._column.round(freq), name=self.name
+        )
 
     @property  # type: ignore
     @_performance_tracking
-    def days(self):
+    def days(self) -> cudf.Index:
         """
         Number of days for each element.
         """
         # Need to specifically return `int64` to avoid overflow.
-        return Index(self._values.days, name=self.name, dtype="int64")
+        return Index._from_column(
+            self._column.days.astype("int64"), name=self.name
+        )
 
     @property  # type: ignore
     @_performance_tracking
-    def seconds(self):
+    def seconds(self) -> cudf.Index:
         """
         Number of seconds (>= 0 and less than 1 day) for each element.
         """
-        return Index(self._values.seconds, name=self.name, dtype="int32")
+        return Index._from_column(
+            self._column.seconds.astype("int32"), name=self.name
+        )
 
     @property  # type: ignore
     @_performance_tracking
-    def microseconds(self):
+    def microseconds(self) -> cudf.Index:
         """
         Number of microseconds (>= 0 and less than 1 second) for each element.
         """
-        return Index(self._values.microseconds, name=self.name, dtype="int32")
+        return Index._from_column(
+            self._column.microseconds.astype("int32"), name=self.name
+        )
 
     @property  # type: ignore
     @_performance_tracking
-    def nanoseconds(self):
+    def nanoseconds(self) -> cudf.Index:
         """
         Number of nanoseconds (>= 0 and less than 1 microsecond) for each
         element.
         """
-        return Index(self._values.nanoseconds, name=self.name, dtype="int32")
+        return Index._from_column(
+            self._column.nanoseconds.astype("int32"), name=self.name
+        )
 
     @property  # type: ignore
     @_performance_tracking
@@ -3061,17 +3097,26 @@ def __init__(
             data = data.as_ordered(ordered=False)
         super().__init__(data, name=name)
 
+    @classmethod
+    @_performance_tracking
+    def _from_column(
+        cls, column: ColumnBase, *, name: Hashable = None, freq: Any = None
+    ) -> Self:
+        if not isinstance(column.dtype, cudf.CategoricalDtype):
+            raise ValueError("column must have a categorial type.")
+        return super()._from_column(column, name=name)
+
     @property
     def ordered(self) -> bool:
         return self._column.ordered
 
     @property  # type: ignore
     @_performance_tracking
-    def codes(self):
+    def codes(self) -> cudf.Index:
         """
         The category codes of this categorical.
         """
-        return Index(self._values.codes)
+        return Index._from_column(self._column.codes)
 
     @property  # type: ignore
     @_performance_tracking
@@ -3094,24 +3139,24 @@ def add_categories(self, new_categories) -> Self:
         `new_categories` will be included at the last/highest place in the
         categories and will be unused directly after this call.
         """
-        return type(self)._from_data(
-            {self.name: self._column.add_categories(new_categories)}
+        return type(self)._from_column(
+            self._column.add_categories(new_categories), name=self.name
         )
 
     def as_ordered(self) -> Self:
         """
         Set the Categorical to be ordered.
         """
-        return type(self)._from_data(
-            {self.name: self._column.as_ordered(ordered=True)}
+        return type(self)._from_column(
+            self._column.as_ordered(ordered=True), name=self.name
         )
 
     def as_unordered(self) -> Self:
         """
         Set the Categorical to be unordered.
         """
-        return type(self)._from_data(
-            {self.name: self._column.as_ordered(ordered=False)}
+        return type(self)._from_column(
+            self._column.as_ordered(ordered=False), name=self.name
         )
 
     def remove_categories(self, removals) -> Self:
@@ -3125,8 +3170,8 @@ def remove_categories(self, removals) -> Self:
         removals : category or list of categories
            The categories which should be removed.
         """
-        return type(self)._from_data(
-            {self.name: self._column.remove_categories(removals)}
+        return type(self)._from_column(
+            self._column.remove_categories(removals), name=self.name
         )
 
     def remove_unused_categories(self) -> Self:
@@ -3135,8 +3180,8 @@ def remove_unused_categories(self) -> Self:
 
         This method is currently not supported.
         """
-        return type(self)._from_data(
-            {self.name: self._column.remove_unused_categories()}
+        return type(self)._from_column(
+            self._column.remove_unused_categories(), name=self.name
         )
 
     def rename_categories(self, new_categories) -> Self:
@@ -3145,8 +3190,8 @@ def rename_categories(self, new_categories) -> Self:
 
         This method is currently not supported.
         """
-        return type(self)._from_data(
-            {self.name: self._column.rename_categories(new_categories)}
+        return type(self)._from_column(
+            self._column.rename_categories(new_categories), name=self.name
         )
 
     def reorder_categories(self, new_categories, ordered=None) -> Self:
@@ -3164,12 +3209,9 @@ def reorder_categories(self, new_categories, ordered=None) -> Self:
            Whether or not the categorical is treated as a ordered categorical.
            If not given, do not change the ordered information.
         """
-        return type(self)._from_data(
-            {
-                self.name: self._column.reorder_categories(
-                    new_categories, ordered=ordered
-                )
-            }
+        return type(self)._from_column(
+            self._column.reorder_categories(new_categories, ordered=ordered),
+            name=self.name,
         )
 
     def set_categories(
@@ -3191,12 +3233,11 @@ def set_categories(
             considered as a rename of the old categories
             or as reordered categories.
         """
-        return type(self)._from_data(
-            {
-                self.name: self._column.set_categories(
-                    new_categories, ordered=ordered, rename=rename
-                )
-            }
+        return type(self)._from_column(
+            self._column.set_categories(
+                new_categories, ordered=ordered, rename=rename
+            ),
+            name=self.name,
         )
 
 
@@ -3411,6 +3452,15 @@ def __init__(
     def closed(self):
         return self.dtype.closed
 
+    @classmethod
+    @_performance_tracking
+    def _from_column(
+        cls, column: ColumnBase, *, name: Hashable = None, freq: Any = None
+    ) -> Self:
+        if not isinstance(column.dtype, cudf.IntervalDtype):
+            raise ValueError("column must have a interval type.")
+        return super()._from_column(column, name=name)
+
     @classmethod
     @_performance_tracking
     def from_breaks(
@@ -3593,8 +3643,8 @@ def set_closed(
             Whether the intervals are closed on the left-side, right-side, both
             or neither.
         """
-        return type(self)._from_data(
-            {self.name: self._column.set_closed(closed)}
+        return type(self)._from_column(
+            self._column.set_closed(closed), name=self.name
         )
 
     def to_tuples(self, na_tuple: bool = True) -> pd.Index:
@@ -3680,15 +3730,7 @@ def as_index(
     elif isinstance(arbitrary, BaseIndex):
         idx = arbitrary.copy(deep=copy).rename(name)
     elif isinstance(arbitrary, ColumnBase):
-        idx = _index_from_data({name: arbitrary})
-    elif isinstance(arbitrary, cudf.Series):
-        return as_index(
-            arbitrary._column,
-            nan_as_null=nan_as_null,
-            copy=copy,
-            name=name,
-            dtype=dtype,
-        )
+        raise ValueError("Use cudf.Index._from_column instead.")
     elif isinstance(arbitrary, (pd.RangeIndex, range)):
         idx = RangeIndex(
             start=arbitrary.start,
@@ -3708,11 +3750,9 @@ def as_index(
     elif isinstance(arbitrary, cudf.DataFrame) or is_scalar(arbitrary):
         raise ValueError("Index data must be 1-dimensional and list-like")
     else:
-        return as_index(
+        return Index._from_column(
             column.as_column(arbitrary, dtype=dtype, nan_as_null=nan_as_null),
-            copy=copy,
             name=name,
-            dtype=dtype,
         )
     if dtype is not None:
         idx = idx.astype(dtype)
@@ -3749,7 +3789,9 @@ def _concat_range_index(indexes: list[RangeIndex]) -> BaseIndex:
         elif step is None:
             # First non-empty index had only one element
             if obj.start == start:
-                result = Index(concat_columns([x._values for x in indexes]))
+                result = Index._from_column(
+                    concat_columns([x._column for x in indexes])
+                )
                 return result
             step = obj.start - start
 
@@ -3757,7 +3799,9 @@ def _concat_range_index(indexes: list[RangeIndex]) -> BaseIndex:
             next_ is not None and obj.start != next_
         )
         if non_consecutive:
-            result = Index(concat_columns([x._values for x in indexes]))
+            result = Index._from_column(
+                concat_columns([x._column for x in indexes])
+            )
             return result
         if step is not None:
             next_ = obj[-1] + step
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 3b44a0f5864..8be9f0ad78e 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -182,11 +182,16 @@ def _indices_from_labels(obj, labels):
             )
         else:
             labels = labels.astype(obj.index.dtype)
+        idx_labels = cudf.Index._from_column(labels)
+    else:
+        idx_labels = labels
 
     # join is not guaranteed to maintain the index ordering
     # so we will sort it with its initial ordering which is stored
     # in column "__"
-    lhs = cudf.DataFrame({"__": as_column(range(len(labels)))}, index=labels)
+    lhs = cudf.DataFrame(
+        {"__": as_column(range(len(idx_labels)))}, index=idx_labels
+    )
     rhs = cudf.DataFrame({"_": as_column(range(len(obj)))}, index=obj.index)
     return lhs.join(rhs).sort_values(by=["__", "_"])["_"]
 
@@ -6642,7 +6647,11 @@ def _drop_rows_by_labels(
         # 3. Use "leftanti" join to drop
         # TODO: use internal API with "leftanti" and specify left and right
         # join keys to bypass logic check
-        to_join = cudf.DataFrame(index=cudf.Index(labels, name=level))
+        if isinstance(labels, ColumnBase):
+            join_index = cudf.Index._from_column(labels, name=level)
+        else:
+            join_index = cudf.Index(labels, name=level)
+        to_join = cudf.DataFrame(index=join_index)
         join_res = working_df.join(to_join, how="leftanti")
 
         # 4. Reconstruct original layout, and rename
@@ -6669,12 +6678,11 @@ def _drop_rows_by_labels(
         if errors == "raise" and not labels.isin(obj.index).all():
             raise KeyError("One or more values not found in axis")
 
-        key_df = cudf.DataFrame._from_data(
-            data={},
-            index=cudf.Index(
-                labels, name=getattr(labels, "name", obj.index.name)
-            ),
-        )
+        if isinstance(labels, ColumnBase):
+            idx = cudf.Index._from_column(labels, name=obj.index.name)
+        else:
+            idx = cudf.Index(labels, name=labels.name)
+        key_df = cudf.DataFrame._from_data(data={}, index=idx)
         if isinstance(obj, cudf.DataFrame):
             res = obj.join(key_df, how="leftanti")
         else:
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index ab88b191570..a66e2936e3b 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -811,8 +811,9 @@ def _index_and_downcast(self, result, index, index_key):
             # it into an Index and name the final index values according
             # to that column's name.
             *_, last_column = index._data.columns
-            out_index = cudf.Index(last_column)
-            out_index.name = index.names[-1]
+            out_index = cudf.Index._from_column(
+                last_column, name=index.names[-1]
+            )
             index = out_index
         elif out_index._num_columns > 1:
             # Otherwise pop the leftmost levels, names, and codes from the
@@ -1061,7 +1062,7 @@ def get_level_values(self, level):
                 raise KeyError(f"Level not found: '{level}'")
         else:
             level_idx = colnames.index(level)
-        level_values = cudf.Index(
+        level_values = cudf.Index._from_column(
             self._data[level], name=self.names[level_idx]
         )
         return level_values
diff --git a/python/cudf/cudf/core/resample.py b/python/cudf/cudf/core/resample.py
index 715bbf89b15..e0aee28bfeb 100644
--- a/python/cudf/cudf/core/resample.py
+++ b/python/cudf/cudf/core/resample.py
@@ -145,7 +145,9 @@ def copy(self, deep=True):
     def keys(self):
         index = super().keys
         if self._freq is not None and isinstance(index, cudf.DatetimeIndex):
-            return cudf.DatetimeIndex._from_data(index._data, freq=self._freq)
+            return cudf.DatetimeIndex._from_column(
+                index._column, name=index.name, freq=self._freq
+            )
         return index
 
     def serialize(self):
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 822b966364f..2fb4fde6552 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -3245,8 +3245,8 @@ def value_counts(
             interval_col = IntervalColumn.from_struct_column(
                 res.index._column._get_decategorized_column()
             )
-            res.index = cudf.IntervalIndex._from_data(
-                {res.index.name: interval_col}
+            res.index = cudf.IntervalIndex._from_column(
+                interval_col, name=res.index.name
             )
         res.name = result_name
         return res
diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py
index c50a36b68b5..a92bf420147 100644
--- a/python/cudf/cudf/core/tools/datetimes.py
+++ b/python/cudf/cudf/core/tools/datetimes.py
@@ -18,7 +18,6 @@
 )
 from cudf.api.types import is_integer, is_scalar
 from cudf.core import column
-from cudf.core.column_accessor import ColumnAccessor
 from cudf.core.index import ensure_index
 
 # https://github.com/pandas-dev/pandas/blob/2.2.x/pandas/core/tools/datetimes.py#L1112
@@ -288,8 +287,7 @@ def to_datetime(
                 utc=utc,
             )
             if isinstance(arg, (cudf.BaseIndex, pd.Index)):
-                ca = ColumnAccessor({arg.name: col}, verify=False)
-                return cudf.DatetimeIndex._from_data(ca)
+                return cudf.DatetimeIndex._from_column(col, name=arg.name)
             elif isinstance(arg, (cudf.Series, pd.Series)):
                 return cudf.Series._from_column(
                     col, name=arg.name, index=ensure_index(arg.index)
@@ -297,7 +295,7 @@ def to_datetime(
             elif is_scalar(arg):
                 return col.element_indexing(0)
             else:
-                return cudf.Index(col)
+                return cudf.Index._from_column(col)
     except Exception as e:
         if errors == "raise":
             raise e
@@ -900,7 +898,9 @@ def date_range(
         end = cudf.Scalar(end, dtype=dtype).value.astype("int64")
         arr = np.linspace(start=start, stop=end, num=periods)
         result = cudf.core.column.as_column(arr).astype("datetime64[ns]")
-        return cudf.DatetimeIndex._from_data({name: result}).tz_localize(tz)
+        return cudf.DatetimeIndex._from_column(result, name=name).tz_localize(
+            tz
+        )
 
     # The code logic below assumes `freq` is defined. It is first normalized
     # into `DateOffset` for further computation with timestamps.
@@ -1001,9 +1001,9 @@ def date_range(
             "datetime64[ns]"
         )
 
-    return cudf.DatetimeIndex._from_data({name: res}, freq=freq).tz_localize(
-        tz
-    )
+    return cudf.DatetimeIndex._from_column(
+        res, name=name, freq=freq
+    ).tz_localize(tz)
 
 
 def _has_fixed_frequency(freq: DateOffset) -> bool:
diff --git a/python/cudf/cudf/testing/testing.py b/python/cudf/cudf/testing/testing.py
index c2072d90e98..31ad24a4664 100644
--- a/python/cudf/cudf/testing/testing.py
+++ b/python/cudf/cudf/testing/testing.py
@@ -398,8 +398,12 @@ def assert_index_equal(
             )
 
         for level in range(left.nlevels):
-            llevel = cudf.Index(left._columns[level], name=left.names[level])
-            rlevel = cudf.Index(right._columns[level], name=right.names[level])
+            llevel = cudf.Index._from_column(
+                left._columns[level], name=left.names[level]
+            )
+            rlevel = cudf.Index._from_column(
+                right._columns[level], name=right.names[level]
+            )
             mul_obj = f"MultiIndex level [{level}]"
             assert_index_equal(
                 llevel,
diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py
index a68f4574da3..b1e095e8853 100644
--- a/python/cudf/cudf/tests/test_multiindex.py
+++ b/python/cudf/cudf/tests/test_multiindex.py
@@ -167,7 +167,9 @@ def test_string_index():
     pdf.index = stringIndex.to_pandas()
     gdf.index = stringIndex
     assert_eq(pdf, gdf)
-    stringIndex = cudf.Index(as_column(["a", "b", "c", "d", "e"]), name="name")
+    stringIndex = cudf.Index._from_column(
+        as_column(["a", "b", "c", "d", "e"]), name="name"
+    )
     pdf.index = stringIndex.to_pandas()
     gdf.index = stringIndex
     assert_eq(pdf, gdf)
diff --git a/python/cudf/cudf/tests/test_string.py b/python/cudf/cudf/tests/test_string.py
index 30880f074c0..cc88cc79769 100644
--- a/python/cudf/cudf/tests/test_string.py
+++ b/python/cudf/cudf/tests/test_string.py
@@ -1092,7 +1092,7 @@ def test_string_index():
     pdf.index = stringIndex.to_pandas()
     gdf.index = stringIndex
     assert_eq(pdf, gdf)
-    stringIndex = cudf.Index(
+    stringIndex = cudf.Index._from_column(
         cudf.core.column.as_column(["a", "b", "c", "d", "e"]), name="name"
     )
     pdf.index = stringIndex.to_pandas()

From 89863a3b791250a2285b90d2c13f51f009638f44 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 14 Aug 2024 17:22:31 -1000
Subject: [PATCH 666/842] Align public utility function signatures  with pandas
 2.x (#16565)

The following function signatures have a breaking change

* `concat`
* `get_dummies`
* `date_range`

Additionally deprecates the `cat` argument in `get_dummies` (doesn't exist in pandas and not tested), and fixes a bug in `interval_range` where `names` was not being respected

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16565
---
 python/cudf/cudf/__init__.py                  |  2 +
 python/cudf/cudf/core/index.py                |  4 +-
 python/cudf/cudf/core/reshape.py              | 74 ++++++++++++++-----
 python/cudf/cudf/core/tools/datetimes.py      | 12 +--
 python/cudf/cudf/core/tools/numeric.py        |  9 ++-
 .../cudf/cudf/tests/indexes/test_interval.py  |  6 ++
 python/cudf/cudf/tests/test_onehot.py         |  6 ++
 7 files changed, 84 insertions(+), 29 deletions(-)

diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py
index e14815a1b0d..77ae0791b81 100644
--- a/python/cudf/cudf/__init__.py
+++ b/python/cudf/cudf/__init__.py
@@ -97,6 +97,7 @@
     "DatetimeIndex",
     "Decimal32Dtype",
     "Decimal64Dtype",
+    "Decimal128Dtype",
     "Grouper",
     "Index",
     "IntervalDtype",
@@ -126,6 +127,7 @@
     "isclose",
     "melt",
     "merge",
+    "option_context",
     "pivot",
     "pivot_table",
     "read_avro",
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index c55f86d48e1..d02633a97fa 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -3350,14 +3350,14 @@ def interval_range(
     if len(right_col) == 0 or len(left_col) == 0:
         dtype = IntervalDtype("int64", closed)
         data = column.column_empty_like_same_mask(left_col, dtype)
-        return IntervalIndex(data, closed=closed)
+        return IntervalIndex(data, closed=closed, name=name)
 
     interval_col = IntervalColumn(
         dtype=IntervalDtype(left_col.dtype, closed),
         size=len(left_col),
         children=(left_col, right_col),
     )
-    return IntervalIndex(interval_col, closed=closed)
+    return IntervalIndex(interval_col, closed=closed, name=name)
 
 
 class IntervalIndex(Index):
diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py
index 52a55760d4a..df471692702 100644
--- a/python/cudf/cudf/core/reshape.py
+++ b/python/cudf/cudf/core/reshape.py
@@ -118,7 +118,17 @@ def _normalize_series_and_dataframe(objs, axis):
             objs[idx] = obj.to_frame(name=name)
 
 
-def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
+def concat(
+    objs,
+    axis=0,
+    join="outer",
+    ignore_index=False,
+    keys=None,
+    levels=None,
+    names=None,
+    verify_integrity=False,
+    sort=None,
+):
     """Concatenate DataFrames, Series, or Indices row-wise.
 
     Parameters
@@ -132,6 +142,21 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
     ignore_index : bool, default False
         Set True to ignore the index of the *objs* and provide a
         default range index instead.
+    keys : sequence, default None
+        If multiple levels passed, should contain tuples. Construct
+        hierarchical index using the passed keys as the outermost level.
+        Currently not supported.
+    levels : list of sequences, default None
+        Specific levels (unique values) to use for constructing a
+        MultiIndex. Otherwise they will be inferred from the keys.
+        Currently not supported.
+    names : list, default None
+        Names for the levels in the resulting hierarchical index.
+        Currently not supported.
+    verify_integrity : bool, default False
+        Check whether the new concatenated axis contains duplicates. This can
+        be very expensive relative to the actual data concatenation.
+        Currently not supported.
     sort : bool, default False
         Sort non-concatenation axis if it is not already aligned.
 
@@ -243,6 +268,12 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
     0      a       1       c       3
     1      b       2       d       4
     """
+    if keys is not None:
+        raise NotImplementedError("keys is currently not supported")
+    if levels is not None:
+        raise NotImplementedError("levels is currently not supported")
+    if names is not None:
+        raise NotImplementedError("names is currently not supported")
     # TODO: Do we really need to have different error messages for an empty
     # list and a list of None?
     if not objs:
@@ -260,7 +291,7 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
                 f"Can only concatenate dictionary input along axis=1, not {axis}"
             )
         objs = {k: obj for k, obj in objs.items() if obj is not None}
-        keys = list(objs)
+        keys_objs = list(objs)
         objs = list(objs.values())
         if any(isinstance(o, cudf.BaseIndex) for o in objs):
             raise TypeError(
@@ -268,7 +299,7 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
             )
     else:
         objs = [obj for obj in objs if obj is not None]
-        keys = None
+        keys_objs = None
 
     if not objs:
         raise ValueError("All objects passed were None")
@@ -317,8 +348,8 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
                 result = obj.to_frame()
             else:
                 result = obj.copy(deep=True)
-            if keys is not None and isinstance(result, cudf.DataFrame):
-                k = keys[0]
+            if keys_objs is not None and isinstance(result, cudf.DataFrame):
+                k = keys_objs[0]
                 result.columns = cudf.MultiIndex.from_tuples(
                     [
                         (k, *c) if isinstance(c, tuple) else (k, c)
@@ -370,7 +401,7 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
             objs = _align_objs(objs, how=join, sort=sort)
             df.index = objs[0].index
 
-        if keys is None:
+        if keys_objs is None:
             for o in objs:
                 for name, col in o._data.items():
                     if name in df._data:
@@ -408,9 +439,9 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
                     "label types in cuDF at this time. You must convert "
                     "the labels to the same type."
                 )
-            for k, o in zip(keys, objs):
+            for k, o in zip(keys_objs, objs):
                 for name, col in o._data.items():
-                    # if only series, then only keep keys as column labels
+                    # if only series, then only keep keys_objs as column labels
                     # if the existing column is multiindex, prepend it
                     # to handle cases where dfs and srs are concatenated
                     if only_series:
@@ -426,7 +457,7 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None):
                     else:
                         df[col_label] = col
 
-        if keys is None:
+        if keys_objs is None:
             df.columns = result_columns.unique()
             if ignore_index:
                 df.columns = cudf.RangeIndex(len(result_columns.unique()))
@@ -666,7 +697,7 @@ def _tile(A, reps):
 
 
 def get_dummies(
-    df,
+    data,
     prefix=None,
     prefix_sep="_",
     dummy_na=False,
@@ -681,7 +712,7 @@ def get_dummies(
 
     Parameters
     ----------
-    df : array-like, Series, or DataFrame
+    data : array-like, Series, or DataFrame
         Data of which to get dummy indicators.
     prefix : str, dict, or sequence, optional
         Prefix to append. Either a str (to apply a constant prefix), dict
@@ -759,17 +790,22 @@ def get_dummies(
 
     if cats is None:
         cats = {}
+    else:
+        warnings.warn(
+            "cats is deprecated and will be removed in a future version.",
+            FutureWarning,
+        )
     if sparse:
         raise NotImplementedError("sparse is not supported yet")
 
     if drop_first:
         raise NotImplementedError("drop_first is not supported yet")
 
-    if isinstance(df, cudf.DataFrame):
+    if isinstance(data, cudf.DataFrame):
         encode_fallback_dtypes = ["object", "category"]
 
         if columns is None or len(columns) == 0:
-            columns = df.select_dtypes(
+            columns = data.select_dtypes(
                 include=encode_fallback_dtypes
             )._column_names
 
@@ -796,33 +832,33 @@ def get_dummies(
         # If we have no columns to encode, we need to drop
         # fallback columns(if any)
         if len(columns) == 0:
-            return df.select_dtypes(exclude=encode_fallback_dtypes)
+            return data.select_dtypes(exclude=encode_fallback_dtypes)
         else:
             result_data = {
                 col_name: col
-                for col_name, col in df._data.items()
+                for col_name, col in data._data.items()
                 if col_name not in columns
             }
 
             for name in columns:
                 if name not in cats:
                     unique = _get_unique(
-                        column=df._data[name], dummy_na=dummy_na
+                        column=data._data[name], dummy_na=dummy_na
                     )
                 else:
                     unique = as_column(cats[name])
 
                 col_enc_data = _one_hot_encode_column(
-                    column=df._data[name],
+                    column=data._data[name],
                     categories=unique,
                     prefix=prefix_map.get(name, prefix),
                     prefix_sep=prefix_sep_map.get(name, prefix_sep),
                     dtype=dtype,
                 )
                 result_data.update(col_enc_data)
-            return cudf.DataFrame._from_data(result_data, index=df.index)
+            return cudf.DataFrame._from_data(result_data, index=data.index)
     else:
-        ser = cudf.Series(df)
+        ser = cudf.Series(data)
         unique = _get_unique(column=ser._column, dummy_na=dummy_na)
         data = _one_hot_encode_column(
             column=ser._column,
diff --git a/python/cudf/cudf/core/tools/datetimes.py b/python/cudf/cudf/core/tools/datetimes.py
index a92bf420147..7197560b5a4 100644
--- a/python/cudf/cudf/core/tools/datetimes.py
+++ b/python/cudf/cudf/core/tools/datetimes.py
@@ -785,7 +785,7 @@ def date_range(
     tz=None,
     normalize: bool = False,
     name=None,
-    closed: Literal["left", "right", "both", "neither"] = "both",
+    inclusive: Literal["left", "right", "both", "neither"] = "both",
     *,
     unit: str | None = None,
 ):
@@ -823,7 +823,7 @@ def date_range(
     name : str, default None
         Name of the resulting DatetimeIndex
 
-    closed : {"left", "right", "both", "neither"}, default "both"
+    inclusive : {"left", "right", "both", "neither"}, default "both"
         Whether to set each bound as closed or open.
         Currently only "both" is supported
 
@@ -839,7 +839,7 @@ def date_range(
     -----
     Of the four parameters `start`, `end`, `periods`, and `freq`, exactly three
     must be specified. If `freq` is omitted, the resulting DatetimeIndex will
-    have periods linearly spaced elements between start and end (closed on both
+    have periods linearly spaced elements between start and end (inclusive on both
     sides).
 
     cudf supports `freq` specified with either fixed-frequency offset
@@ -866,8 +866,8 @@ def date_range(
                 '2026-04-23 08:00:00'],
                 dtype='datetime64[ns]')
     """
-    if closed != "both":
-        raise NotImplementedError(f"{closed=} is currently unsupported.")
+    if inclusive != "both":
+        raise NotImplementedError(f"{inclusive=} is currently unsupported.")
     if unit is not None:
         raise NotImplementedError(f"{unit=} is currently unsupported.")
     if normalize is not False:
@@ -961,7 +961,7 @@ def date_range(
             periods = 0
         else:
             # If end == start, periods == 0 and we return exactly 1 timestamp (start).
-            # Otherwise, since closed="both", we ensure the end point is included.
+            # Otherwise, since inclusive="both", we ensure the end point is included.
             periods += 1
 
     # We compute `end_estim` (the estimated upper bound of the date
diff --git a/python/cudf/cudf/core/tools/numeric.py b/python/cudf/cudf/core/tools/numeric.py
index 8b95f6f6a04..6cecf3fa170 100644
--- a/python/cudf/cudf/core/tools/numeric.py
+++ b/python/cudf/cudf/core/tools/numeric.py
@@ -20,7 +20,7 @@
     from cudf.core.column import ColumnBase
 
 
-def to_numeric(arg, errors="raise", downcast=None):
+def to_numeric(arg, errors="raise", downcast=None, dtype_backend=None):
     """
     Convert argument into numerical types.
 
@@ -48,6 +48,8 @@ def to_numeric(arg, errors="raise", downcast=None):
         Note that downcast behavior is decoupled from parsing. Errors
         encountered during downcast is raised regardless of ``errors``
         parameter.
+    dtype_backend : None
+        Not implemented.
 
     Returns
     -------
@@ -93,7 +95,10 @@ def to_numeric(arg, errors="raise", downcast=None):
         For example ``[1, 'a']``. A ``TypeError`` will be raised when such
         input is received, regardless of ``errors`` parameter.
     """
-
+    if dtype_backend is not None:
+        raise NotImplementedError(
+            "dtype_backend is not currently implemented."
+        )
     if errors not in {"raise", "ignore", "coerce"}:
         raise ValueError("invalid error value specified")
     elif errors == "ignore":
diff --git a/python/cudf/cudf/tests/indexes/test_interval.py b/python/cudf/cudf/tests/indexes/test_interval.py
index 3b3a9f96543..a567c27f584 100644
--- a/python/cudf/cudf/tests/indexes/test_interval.py
+++ b/python/cudf/cudf/tests/indexes/test_interval.py
@@ -401,3 +401,9 @@ def test_from_tuples():
     result = cudf.IntervalIndex.from_tuples(data, closed="left", name="a")
     expected = pd.IntervalIndex.from_tuples(data, closed="left", name="a")
     assert_eq(result, expected)
+
+
+def test_interval_range_name():
+    expected = pd.interval_range(start=0, periods=5, freq=2, name="foo")
+    result = cudf.interval_range(start=0, periods=5, freq=2, name="foo")
+    assert_eq(result, expected)
diff --git a/python/cudf/cudf/tests/test_onehot.py b/python/cudf/cudf/tests/test_onehot.py
index 154e1e19072..cc17dc46e0a 100644
--- a/python/cudf/cudf/tests/test_onehot.py
+++ b/python/cudf/cudf/tests/test_onehot.py
@@ -155,3 +155,9 @@ def test_get_dummies_array_like_with_nan():
     actual = cudf.get_dummies(ser, dummy_na=True, prefix="a", prefix_sep="_")
 
     assert_eq(expected, actual)
+
+
+def test_get_dummies_cats_deprecated():
+    df = cudf.DataFrame(range(3))
+    with pytest.warns(FutureWarning):
+        cudf.get_dummies(df, cats={0: [0, 1, 2]})

From 2bcb7ecd2c077b3989ced1b8be8727e1b71f93b1 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 14 Aug 2024 17:24:48 -1000
Subject: [PATCH 667/842] Fix `.replace(Index, Index)` raising a TypeError
 (#16513)

Since `cudf.Index` is list-like, passing this to `.replace` should act like replacing a list of values with a corresponding list of values.

Discovered while working on https://github.com/rapidsai/cuml/pull/6019

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16513
---
 python/cudf/cudf/core/indexed_frame.py | 14 +++++++-------
 python/cudf/cudf/tests/test_replace.py |  6 ++++++
 2 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 8be9f0ad78e..ae7369c80d1 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -6469,7 +6469,7 @@ def _get_replacement_values_for_columns(
         to_replace_columns = {col: [to_replace] for col in columns_dtype_map}
         values_columns = {col: [value] for col in columns_dtype_map}
     elif cudf.api.types.is_list_like(to_replace) or isinstance(
-        to_replace, ColumnBase
+        to_replace, (ColumnBase, BaseIndex)
     ):
         if is_scalar(value):
             to_replace_columns = {col: to_replace for col in columns_dtype_map}
@@ -6483,7 +6483,9 @@ def _get_replacement_values_for_columns(
                 )
                 for col in columns_dtype_map
             }
-        elif cudf.api.types.is_list_like(value):
+        elif cudf.api.types.is_list_like(
+            value
+        ) or cudf.utils.dtypes.is_column_like(value):
             if len(to_replace) != len(value):
                 raise ValueError(
                     f"Replacement lists must be "
@@ -6495,9 +6497,6 @@ def _get_replacement_values_for_columns(
                     col: to_replace for col in columns_dtype_map
                 }
                 values_columns = {col: value for col in columns_dtype_map}
-        elif cudf.utils.dtypes.is_column_like(value):
-            to_replace_columns = {col: to_replace for col in columns_dtype_map}
-            values_columns = {col: value for col in columns_dtype_map}
         else:
             raise TypeError(
                 "value argument must be scalar, list-like or Series"
@@ -6592,12 +6591,13 @@ def _get_replacement_values_for_columns(
     return all_na_columns, to_replace_columns, values_columns
 
 
-def _is_series(obj):
+def _is_series(obj: Any) -> bool:
     """
     Checks if the `obj` is of type `cudf.Series`
     instead of checking for isinstance(obj, cudf.Series)
+    to avoid circular imports.
     """
-    return isinstance(obj, Frame) and obj.ndim == 1 and obj.index is not None
+    return isinstance(obj, IndexedFrame) and obj.ndim == 1
 
 
 @_performance_tracking
diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py
index 1973fe6fb41..e5ee0127a74 100644
--- a/python/cudf/cudf/tests/test_replace.py
+++ b/python/cudf/cudf/tests/test_replace.py
@@ -1378,3 +1378,9 @@ def test_fillna_nan_and_null():
     result = ser.fillna(2.2)
     expected = cudf.Series([2.2, 2.2, 1.1])
     assert_eq(result, expected)
+
+
+def test_replace_with_index_objects():
+    result = cudf.Series([1, 2]).replace(cudf.Index([1]), cudf.Index([2]))
+    expected = pd.Series([1, 2]).replace(pd.Index([1]), pd.Index([2]))
+    assert_eq(result, expected)

From ac42bc870a65d807784cae63e25b9e9ca788eb23 Mon Sep 17 00:00:00 2001
From: Robert Maynard <rmaynard@nvidia.com>
Date: Thu, 15 Aug 2024 09:37:43 -0400
Subject: [PATCH 668/842] Hide all gtest symbols in cudftestutil (#16546)

By hiding the gtest symbols in cudftestutil it allows consumers of the library to build with a differing version of gtest without issue.

Authors:
  - Robert Maynard (https://github.com/robertmaynard)

Approvers:
  - Marcus D. Hanwell (https://github.com/cryos)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16546
---
 cpp/cmake/thirdparty/get_gtest.cmake | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/cpp/cmake/thirdparty/get_gtest.cmake b/cpp/cmake/thirdparty/get_gtest.cmake
index 10e6b026d9a..ec8cbd8c568 100644
--- a/cpp/cmake/thirdparty/get_gtest.cmake
+++ b/cpp/cmake/thirdparty/get_gtest.cmake
@@ -16,9 +16,18 @@
 function(find_and_configure_gtest)
   include(${rapids-cmake-dir}/cpm/gtest.cmake)
 
+  # Mark all the non explicit googletest symbols as hidden. This ensures that libcudftestutil can be
+  # used by consumers with a different shared gtest.
+  set(gtest_hide_internal_symbols ON)
+
   # Find or install GoogleTest
   rapids_cpm_gtest(BUILD_STATIC)
 
+  # Mark all the explicit googletest symbols as hidden. This ensures that libcudftestutil can be
+  # used by consumers with a different shared gtest.
+  if(TARGET gtest)
+    target_compile_definitions(gtest PUBLIC "$<BUILD_LOCAL_INTERFACE:GTEST_API_=>")
+  endif()
 endfunction()
 
 find_and_configure_gtest()

From f4a9b1c5016e254ebf2de55ac9946af6420ebff5 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 15 Aug 2024 11:14:06 -1000
Subject: [PATCH 669/842] Use more idomatic cudf APIs in dask_cudf meta
 generation (#16487)

Namely:

* Avoiding `cudf.core` imports by checking public column `.dtype`s
* Using more straightforward cudf APIs to construct meta objects

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16487
---
 python/dask_cudf/dask_cudf/backends.py | 124 ++++++++++++-------------
 1 file changed, 58 insertions(+), 66 deletions(-)

diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py
index 01bab30190a..82ea2ac033a 100644
--- a/python/dask_cudf/dask_cudf/backends.py
+++ b/python/dask_cudf/dask_cudf/backends.py
@@ -55,37 +55,31 @@
 @meta_nonempty.register(cudf.BaseIndex)
 @_dask_cudf_performance_tracking
 def _nonempty_index(idx):
-    if isinstance(idx, cudf.core.index.RangeIndex):
-        return cudf.core.index.RangeIndex(2, name=idx.name)
-    elif isinstance(idx, cudf.core.index.DatetimeIndex):
-        start = "1970-01-01"
-        data = np.array([start, "1970-01-02"], dtype=idx.dtype)
+    """Return a non-empty cudf.Index as metadata."""
+    # TODO: IntervalIndex, TimedeltaIndex?
+    if isinstance(idx, cudf.RangeIndex):
+        return cudf.RangeIndex(2, name=idx.name)
+    elif isinstance(idx, cudf.DatetimeIndex):
+        data = np.array(["1970-01-01", "1970-01-02"], dtype=idx.dtype)
         values = cudf.core.column.as_column(data)
-        return cudf.core.index.DatetimeIndex(values, name=idx.name)
-    elif isinstance(idx, cudf.core.index.CategoricalIndex):
-        key = tuple(idx._data.keys())
-        assert len(key) == 1
-        categories = idx._data[key[0]].categories
-        codes = [0, 0]
-        ordered = idx._data[key[0]].ordered
+        return cudf.DatetimeIndex(values, name=idx.name)
+    elif isinstance(idx, cudf.CategoricalIndex):
         values = cudf.core.column.build_categorical_column(
-            categories=categories, codes=codes, ordered=ordered
+            categories=idx.categories, codes=[0, 0], ordered=idx.ordered
         )
-        return cudf.core.index.CategoricalIndex(values, name=idx.name)
-    elif isinstance(idx, cudf.core.multiindex.MultiIndex):
+        return cudf.CategoricalIndex(values, name=idx.name)
+    elif isinstance(idx, cudf.MultiIndex):
         levels = [meta_nonempty(lev) for lev in idx.levels]
-        codes = [[0, 0] for i in idx.levels]
-        return cudf.core.multiindex.MultiIndex(
-            levels=levels, codes=codes, names=idx.names
-        )
-    elif isinstance(idx._column, cudf.core.column.StringColumn):
+        codes = [[0, 0]] * idx.nlevels
+        return cudf.MultiIndex(levels=levels, codes=codes, names=idx.names)
+    elif is_string_dtype(idx.dtype):
         return cudf.Index(["cat", "dog"], name=idx.name)
-    elif isinstance(idx, cudf.core.index.Index):
-        return cudf.core.index.Index(
-            np.arange(2, dtype=idx.dtype), name=idx.name
-        )
+    elif isinstance(idx, cudf.Index):
+        return cudf.Index(np.arange(2, dtype=idx.dtype), name=idx.name)
 
-    raise TypeError(f"Don't know how to handle index of type {type(idx)}")
+    raise TypeError(
+        f"Don't know how to handle index of type {type(idx).__name__}"
+    )
 
 
 def _nest_list_data(data, leaf_type):
@@ -101,50 +95,49 @@ def _nest_list_data(data, leaf_type):
 
 
 @_dask_cudf_performance_tracking
-def _get_non_empty_data(s):
-    """Return a non empty column as metadata."""
-    if isinstance(s, cudf.core.column.CategoricalColumn):
+def _get_non_empty_data(
+    s: cudf.core.column.ColumnBase,
+) -> cudf.core.column.ColumnBase:
+    """Return a non-empty column as metadata from a column."""
+    if isinstance(s.dtype, cudf.CategoricalDtype):
         categories = (
-            s.categories if len(s.categories) else [UNKNOWN_CATEGORIES]
+            s.categories if len(s.categories) else [UNKNOWN_CATEGORIES]  # type: ignore[attr-defined]
         )
         codes = cudf.core.column.as_column(
             0,
             dtype=cudf._lib.types.size_type_dtype,
             length=2,
         )
-        ordered = s.ordered
-        data = cudf.core.column.build_categorical_column(
+        ordered = s.ordered  # type: ignore[attr-defined]
+        return cudf.core.column.build_categorical_column(
             categories=categories, codes=codes, ordered=ordered
         )
-    elif isinstance(s, cudf.core.column.ListColumn):
+    elif isinstance(s.dtype, cudf.ListDtype):
         leaf_type = s.dtype.leaf_type
         if is_string_dtype(leaf_type):
             data = ["cat", "dog"]
         else:
             data = np.array([0, 1], dtype=leaf_type).tolist()
         data = _nest_list_data(data, s.dtype) * 2
-        data = cudf.core.column.as_column(data, dtype=s.dtype)
-    elif isinstance(s, cudf.core.column.StructColumn):
+        return cudf.core.column.as_column(data, dtype=s.dtype)
+    elif isinstance(s.dtype, cudf.StructDtype):
+        # Handles IntervalColumn
         struct_dtype = s.dtype
-        data = [{key: None for key in struct_dtype.fields.keys()}] * 2
-        data = cudf.core.column.as_column(data, dtype=s.dtype)
+        struct_data = [{key: None for key in struct_dtype.fields.keys()}] * 2
+        return cudf.core.column.as_column(struct_data, dtype=s.dtype)
     elif is_string_dtype(s.dtype):
-        data = cudf.core.column.as_column(pa.array(["cat", "dog"]))
+        return cudf.core.column.as_column(pa.array(["cat", "dog"]))
     elif isinstance(s.dtype, pd.DatetimeTZDtype):
-        from cudf.utils.dtypes import get_time_unit
-
-        data = cudf.date_range("2001-01-01", periods=2, freq=get_time_unit(s))
-        data = data.tz_localize(str(s.dtype.tz))._column
+        date_data = cudf.date_range("2001-01-01", periods=2, freq=s.time_unit)  # type: ignore[attr-defined]
+        return date_data.tz_localize(str(s.dtype.tz))._column
+    elif s.dtype.kind in "fiubmM":
+        return cudf.core.column.as_column(
+            np.arange(start=0, stop=2, dtype=s.dtype)
+        )
     else:
-        if pd.api.types.is_numeric_dtype(s.dtype):
-            data = cudf.core.column.as_column(
-                cp.arange(start=0, stop=2, dtype=s.dtype)
-            )
-        else:
-            data = cudf.core.column.as_column(
-                cp.arange(start=0, stop=2, dtype="int64")
-            ).astype(s.dtype)
-    return data
+        raise TypeError(
+            f"Don't know how to handle column of type {type(s).__name__}"
+        )
 
 
 @meta_nonempty.register(cudf.Series)
@@ -162,24 +155,25 @@ def _nonempty_series(s, idx=None):
 def meta_nonempty_cudf(x):
     idx = meta_nonempty(x.index)
     columns_with_dtype = dict()
-    res = cudf.DataFrame(index=idx)
-    for col in x._data.names:
-        dtype = str(x._data[col].dtype)
-        if dtype in ("list", "struct", "category"):
+    res = {}
+    for col_label, col in x._data.items():
+        dtype = col.dtype
+        if isinstance(
+            dtype,
+            (cudf.ListDtype, cudf.StructDtype, cudf.CategoricalDtype),
+        ):
             # 1. Not possible to hash and store list & struct types
             #    as they can contain different levels of nesting or
             #    fields.
-            # 2. Not possible to has `category` types as
+            # 2. Not possible to hash `category` types as
             #    they often contain an underlying types to them.
-            res._data[col] = _get_non_empty_data(x._data[col])
+            res[col_label] = _get_non_empty_data(col)
         else:
             if dtype not in columns_with_dtype:
-                columns_with_dtype[dtype] = cudf.core.column.as_column(
-                    _get_non_empty_data(x._data[col])
-                )
-            res._data[col] = columns_with_dtype[dtype]
+                columns_with_dtype[dtype] = _get_non_empty_data(col)
+            res[col_label] = columns_with_dtype[dtype]
 
-    return res
+    return cudf.DataFrame._from_data(res, index=idx)
 
 
 @make_meta_dispatch.register((cudf.Series, cudf.DataFrame))
@@ -197,9 +191,7 @@ def make_meta_cudf_index(x, index=None):
 @_dask_cudf_performance_tracking
 def _empty_series(name, dtype, index=None):
     if isinstance(dtype, str) and dtype == "category":
-        return cudf.Series(
-            [UNKNOWN_CATEGORIES], dtype=dtype, name=name, index=index
-        ).iloc[:0]
+        dtype = cudf.CategoricalDtype(categories=[UNKNOWN_CATEGORIES])
     return cudf.Series([], dtype=dtype, name=name, index=index)
 
 
@@ -337,7 +329,7 @@ def percentile_cudf(a, q, interpolation="linear"):
     if isinstance(q, Iterator):
         q = list(q)
 
-    if cudf.api.types._is_categorical_dtype(a.dtype):
+    if isinstance(a.dtype, cudf.CategoricalDtype):
         result = cp.percentile(a.cat.codes, q, interpolation=interpolation)
 
         return (
@@ -346,7 +338,7 @@ def percentile_cudf(a, q, interpolation="linear"):
             ),
             n,
         )
-    if np.issubdtype(a.dtype, np.datetime64):
+    if a.dtype.kind == "M":
         result = a.quantile(
             [i / 100.0 for i in q], interpolation=interpolation
         )

From 1e220b708582c73d128c53f3279d4588167a310f Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 15 Aug 2024 13:58:45 -1000
Subject: [PATCH 670/842] Return Interval object in pandas compat mode for
 IntervalIndex reductions (#16523)

xref https://github.com/rapidsai/cudf/issues/16507

In non pandas compat mode, I think this still makes sense to return a `dict` since that's the "scalar" type of a cudf struct/interval type, but in pandas compat mode we should match pandas and return an Interval.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16523
---
 python/cudf/cudf/_lib/reduce.pyx         |  6 +++++-
 python/cudf/cudf/core/column/interval.py | 14 ++++++++++++++
 python/cudf/cudf/tests/test_interval.py  | 11 +++++++++++
 3 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/_lib/reduce.pyx b/python/cudf/cudf/_lib/reduce.pyx
index 64634b7a6f9..511bba20ef5 100644
--- a/python/cudf/cudf/_lib/reduce.pyx
+++ b/python/cudf/cudf/_lib/reduce.pyx
@@ -61,7 +61,11 @@ def reduce(reduction_op, Column incol, dtype=None, **kwargs):
             result,
             dtype=col_dtype.__class__(precision, scale),
         ).value
-    return DeviceScalar.from_pylibcudf(result).value
+    scalar = DeviceScalar.from_pylibcudf(result).value
+    if isinstance(col_dtype, cudf.StructDtype):
+        # TODO: Utilize column_metadata in libcudf to maintain field labels
+        return dict(zip(col_dtype.fields.keys(), scalar.values()))
+    return scalar
 
 
 @acquire_spill_lock()
diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py
index b2f79ef0c65..d9fc96a9f3e 100644
--- a/python/cudf/cudf/core/column/interval.py
+++ b/python/cudf/cudf/core/column/interval.py
@@ -11,6 +11,7 @@
 from cudf.core.dtypes import IntervalDtype
 
 if TYPE_CHECKING:
+    from cudf._typing import ScalarLike
     from cudf.core.column import ColumnBase
 
 
@@ -186,3 +187,16 @@ def element_indexing(self, index: int):
         if cudf.get_option("mode.pandas_compatible"):
             return pd.Interval(**result, closed=self.dtype.closed)
         return result
+
+    def _reduce(
+        self,
+        op: str,
+        skipna: bool | None = None,
+        min_count: int = 0,
+        *args,
+        **kwargs,
+    ) -> ScalarLike:
+        result = super()._reduce(op, skipna, min_count, *args, **kwargs)
+        if cudf.get_option("mode.pandas_compatible"):
+            return pd.Interval(**result, closed=self.dtype.closed)
+        return result
diff --git a/python/cudf/cudf/tests/test_interval.py b/python/cudf/cudf/tests/test_interval.py
index 5eeea87d8e0..2d194107658 100644
--- a/python/cudf/cudf/tests/test_interval.py
+++ b/python/cudf/cudf/tests/test_interval.py
@@ -194,3 +194,14 @@ def test_intervaldtype_eq_string_with_attributes():
     dtype = cudf.IntervalDtype("int64", closed="left")
     assert dtype == "interval"
     assert dtype == "interval[int64, left]"
+
+
+def test_reduction_return_interval_pandas_compatible():
+    ii = pd.IntervalIndex.from_tuples(
+        [("2017-01-03", "2017-01-04")], dtype="interval[datetime64[ns], right]"
+    )
+    cudf_ii = cudf.IntervalIndex.from_pandas(ii)
+    with cudf.option_context("mode.pandas_compatible", True):
+        result = cudf_ii.min()
+    expected = ii.min()
+    assert result == expected

From 50841355812685e0e48d1577b8384399cdad5a0f Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 15 Aug 2024 13:59:58 -1000
Subject: [PATCH 671/842] Make NumericalColumn.__init__ strict (#16457)

This PR makes `NumericalBaseColumn.__init__` and its subclasses strict putting restrictions on `data`, `dtype`, `size` and `children` so these columns cannot be constructed into to an invalid state. It also aligns the signature with the base class.

xref https://github.com/rapidsai/cudf/issues/16469

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16457
---
 python/cudf/cudf/_lib/column.pyx              |  2 +
 python/cudf/cudf/core/column/column.py        | 22 ++---
 python/cudf/cudf/core/column/decimal.py       | 92 ++++++++++++++++++-
 python/cudf/cudf/core/column/numerical.py     | 13 ++-
 .../cudf/cudf/core/column/numerical_base.py   | 29 +++++-
 5 files changed, 134 insertions(+), 24 deletions(-)

diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx
index e030147fdd3..f0c07dfbc1b 100644
--- a/python/cudf/cudf/_lib/column.pyx
+++ b/python/cudf/cudf/_lib/column.pyx
@@ -88,6 +88,8 @@ cdef class Column:
         object null_count=None,
         object children=()
     ):
+        if size < 0:
+            raise ValueError("size must be >=0")
         self._size = size
         self._distinct_count = {}
         self._dtype = dtype
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index a7d2cb441dd..9785c3e5517 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -1652,23 +1652,19 @@ def build_column(
             null_count=null_count,
         )
     elif isinstance(dtype, StructDtype):
-        if size is None:
-            raise TypeError("Must specify size")
         return cudf.core.column.StructColumn(
             data=data,
             dtype=dtype,
-            size=size,
+            size=size,  # type: ignore[arg-type]
             offset=offset,
             mask=mask,
             null_count=null_count,
             children=children,
         )
     elif isinstance(dtype, cudf.Decimal64Dtype):
-        if size is None:
-            raise TypeError("Must specify size")
         return cudf.core.column.Decimal64Column(
-            data=data,
-            size=size,
+            data=data,  # type: ignore[arg-type]
+            size=size,  # type: ignore[arg-type]
             offset=offset,
             dtype=dtype,
             mask=mask,
@@ -1676,11 +1672,9 @@ def build_column(
             children=children,
         )
     elif isinstance(dtype, cudf.Decimal32Dtype):
-        if size is None:
-            raise TypeError("Must specify size")
         return cudf.core.column.Decimal32Column(
-            data=data,
-            size=size,
+            data=data,  # type: ignore[arg-type]
+            size=size,  # type: ignore[arg-type]
             offset=offset,
             dtype=dtype,
             mask=mask,
@@ -1688,11 +1682,9 @@ def build_column(
             children=children,
         )
     elif isinstance(dtype, cudf.Decimal128Dtype):
-        if size is None:
-            raise TypeError("Must specify size")
         return cudf.core.column.Decimal128Column(
-            data=data,
-            size=size,
+            data=data,  # type: ignore[arg-type]
+            size=size,  # type: ignore[arg-type]
             offset=offset,
             dtype=dtype,
             mask=mask,
diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py
index 6a7f338b065..3b979ef2e97 100644
--- a/python/cudf/cudf/core/column/decimal.py
+++ b/python/cudf/cudf/core/column/decimal.py
@@ -31,14 +31,38 @@
 
 if TYPE_CHECKING:
     from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike
+    from cudf.core.buffer import Buffer
 
 
 class DecimalBaseColumn(NumericalBaseColumn):
     """Base column for decimal32, decimal64 or decimal128 columns"""
 
-    dtype: DecimalDtype
     _VALID_BINARY_OPERATIONS = BinaryOperand._SUPPORTED_BINARY_OPERATIONS
 
+    def __init__(
+        self,
+        data: Buffer,
+        size: int,
+        dtype: DecimalDtype,
+        mask: Buffer | None = None,
+        offset: int = 0,
+        null_count: int | None = None,
+        children: tuple = (),
+    ):
+        if not isinstance(size, int):
+            raise ValueError("Must specify an integer size")
+        if not isinstance(dtype, DecimalDtype):
+            raise ValueError(f"{dtype=} must be a DecimalDtype instance")
+        super().__init__(
+            data=data,
+            size=size,
+            dtype=dtype,
+            mask=mask,
+            offset=offset,
+            null_count=null_count,
+            children=children,
+        )
+
     @property
     def __cuda_array_interface__(self):
         raise NotImplementedError(
@@ -205,7 +229,27 @@ def as_numerical_column(
 
 
 class Decimal32Column(DecimalBaseColumn):
-    dtype: Decimal32Dtype
+    def __init__(
+        self,
+        data: Buffer,
+        size: int,
+        dtype: Decimal32Dtype,
+        mask: Buffer | None = None,
+        offset: int = 0,
+        null_count: int | None = None,
+        children: tuple = (),
+    ):
+        if not isinstance(dtype, Decimal32Dtype):
+            raise ValueError(f"{dtype=} must be a Decimal32Dtype instance")
+        super().__init__(
+            data=data,
+            size=size,
+            dtype=dtype,
+            mask=mask,
+            offset=offset,
+            null_count=null_count,
+            children=children,
+        )
 
     @classmethod
     def from_arrow(cls, data: pa.Array):
@@ -266,7 +310,27 @@ def _with_type_metadata(
 
 
 class Decimal128Column(DecimalBaseColumn):
-    dtype: Decimal128Dtype
+    def __init__(
+        self,
+        data: Buffer,
+        size: int,
+        dtype: Decimal128Dtype,
+        mask: Buffer | None = None,
+        offset: int = 0,
+        null_count: int | None = None,
+        children: tuple = (),
+    ):
+        if not isinstance(dtype, Decimal128Dtype):
+            raise ValueError(f"{dtype=} must be a Decimal128Dtype instance")
+        super().__init__(
+            data=data,
+            size=size,
+            dtype=dtype,
+            mask=mask,
+            offset=offset,
+            null_count=null_count,
+            children=children,
+        )
 
     @classmethod
     def from_arrow(cls, data: pa.Array):
@@ -287,7 +351,27 @@ def _with_type_metadata(
 
 
 class Decimal64Column(DecimalBaseColumn):
-    dtype: Decimal64Dtype
+    def __init__(
+        self,
+        data: Buffer,
+        size: int,
+        dtype: Decimal64Dtype,
+        mask: Buffer | None = None,
+        offset: int = 0,
+        null_count: int | None = None,
+        children: tuple = (),
+    ):
+        if not isinstance(dtype, Decimal64Dtype):
+            raise ValueError(f"{dtype=} must be a Decimal64Dtype instance")
+        super().__init__(
+            data=data,
+            size=size,
+            dtype=dtype,
+            mask=mask,
+            offset=offset,
+            null_count=null_count,
+            children=children,
+        )
 
     def __setitem__(self, key, value):
         if isinstance(value, np.integer):
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index bbc74ef349e..16e78ef35ef 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -61,25 +61,30 @@ class NumericalColumn(NumericalBaseColumn):
     def __init__(
         self,
         data: Buffer,
-        dtype: DtypeObj,
+        size: int | None,
+        dtype: np.dtype,
         mask: Buffer | None = None,
-        size: int | None = None,  # TODO: make this non-optional
         offset: int = 0,
         null_count: int | None = None,
+        children: tuple = (),
     ):
-        dtype = cudf.dtype(dtype)
+        if not (isinstance(dtype, np.dtype) and dtype.kind in "iufb"):
+            raise ValueError(
+                "dtype must be a floating, integer or boolean numpy dtype."
+            )
 
         if data.size % dtype.itemsize:
             raise ValueError("Buffer size must be divisible by element size")
         if size is None:
             size = (data.size // dtype.itemsize) - offset
         super().__init__(
-            data,
+            data=data,
             size=size,
             dtype=dtype,
             mask=mask,
             offset=offset,
             null_count=null_count,
+            children=children,
         )
 
     def _clear_cache(self):
diff --git a/python/cudf/cudf/core/column/numerical_base.py b/python/cudf/cudf/core/column/numerical_base.py
index f41010062c8..3b8dd05c13a 100644
--- a/python/cudf/cudf/core/column/numerical_base.py
+++ b/python/cudf/cudf/core/column/numerical_base.py
@@ -9,16 +9,19 @@
 
 import cudf
 from cudf import _lib as libcudf
+from cudf.core.buffer import Buffer
 from cudf.core.column import ColumnBase
 from cudf.core.missing import NA
 from cudf.core.mixins import Scannable
 
 if TYPE_CHECKING:
     from cudf._typing import ScalarLike
+    from cudf.core.column.decimal import DecimalDtype
 
 
 class NumericalBaseColumn(ColumnBase, Scannable):
-    """A column composed of numerical data.
+    """
+    A column composed of numerical (bool, integer, float, decimal) data.
 
     This class encodes a standard interface for different types of columns
     containing numerical types of data. In particular, mathematical operations
@@ -42,6 +45,30 @@ class NumericalBaseColumn(ColumnBase, Scannable):
         "cummax",
     }
 
+    def __init__(
+        self,
+        data: Buffer,
+        size: int,
+        dtype: DecimalDtype | np.dtype,
+        mask: Buffer | None = None,
+        offset: int = 0,
+        null_count: int | None = None,
+        children: tuple = (),
+    ):
+        if not isinstance(data, Buffer):
+            raise ValueError("data must be a Buffer instance.")
+        if len(children) != 0:
+            raise ValueError(f"{type(self).__name__} must have no children.")
+        super().__init__(
+            data=data,
+            size=size,
+            dtype=dtype,
+            mask=mask,
+            offset=offset,
+            null_count=null_count,
+            children=children,
+        )
+
     def _can_return_nan(self, skipna: bool | None = None) -> bool:
         return not skipna and self.has_nulls()
 

From 155eddedc0e2b68d203cfbc318172396f4293d98 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 15 Aug 2024 14:00:57 -1000
Subject: [PATCH 672/842] Make Timedelta/DatetimeColumn.__init__ strict
 (#16464)

This PR makes Datetime/TimedeltaColumn.__init__ and its subclasses strict putting restrictions on data, dtype, size and children so these columns cannot be constructed into to an invalid state. It also aligns the signature with the base class.

xref https://github.com/rapidsai/cudf/issues/16469

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16464
---
 python/cudf/cudf/core/column/column.py    | 12 ++-----
 python/cudf/cudf/core/column/datetime.py  | 43 ++++++++++++++++-------
 python/cudf/cudf/core/column/timedelta.py | 17 +++++----
 3 files changed, 44 insertions(+), 28 deletions(-)

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 9785c3e5517..b0e33e8b9ce 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -1592,10 +1592,8 @@ def build_column(
             children=children,
         )
     elif dtype.type is np.datetime64:
-        if data is None:
-            raise TypeError("Must specify data buffer")
         return cudf.core.column.DatetimeColumn(
-            data=data,
+            data=data,  # type: ignore[arg-type]
             dtype=dtype,
             mask=mask,
             size=size,
@@ -1603,10 +1601,8 @@ def build_column(
             null_count=null_count,
         )
     elif isinstance(dtype, pd.DatetimeTZDtype):
-        if data is None:
-            raise TypeError("Must specify data buffer")
         return cudf.core.column.datetime.DatetimeTZColumn(
-            data=data,
+            data=data,  # type: ignore[arg-type]
             dtype=dtype,
             mask=mask,
             size=size,
@@ -1614,10 +1610,8 @@ def build_column(
             null_count=null_count,
         )
     elif dtype.type is np.timedelta64:
-        if data is None:
-            raise TypeError("Must specify data buffer")
         return cudf.core.column.TimeDeltaColumn(
-            data=data,
+            data=data,  # type: ignore[arg-type]
             dtype=dtype,
             mask=mask,
             size=size,
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
index 1dbc94384d3..d0ea4612a1b 100644
--- a/python/cudf/cudf/core/column/datetime.py
+++ b/python/cudf/cudf/core/column/datetime.py
@@ -24,6 +24,7 @@
     get_compatible_timezone,
     get_tz_data,
 )
+from cudf.core.buffer import Buffer
 from cudf.core.column import ColumnBase, as_column, column, string
 from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion
 from cudf.utils.dtypes import _get_base_dtype
@@ -34,10 +35,8 @@
         ColumnBinaryOperand,
         DatetimeLikeScalar,
         Dtype,
-        DtypeObj,
         ScalarLike,
     )
-    from cudf.core.buffer import Buffer
     from cudf.core.column.numerical import NumericalColumn
 
 if PANDAS_GE_220:
@@ -207,30 +206,39 @@ class DatetimeColumn(column.ColumnBase):
     def __init__(
         self,
         data: Buffer,
-        dtype: DtypeObj,
+        size: int | None,
+        dtype: np.dtype | pd.DatetimeTZDtype,
         mask: Buffer | None = None,
-        size: int | None = None,  # TODO: make non-optional
         offset: int = 0,
         null_count: int | None = None,
+        children: tuple = (),
     ):
-        dtype = cudf.dtype(dtype)
-        if dtype.kind != "M":
-            raise TypeError(f"{self.dtype} is not a supported datetime type")
-
+        if not isinstance(data, Buffer):
+            raise ValueError("data must be a Buffer.")
+        dtype = self._validate_dtype_instance(dtype)
         if data.size % dtype.itemsize:
             raise ValueError("Buffer size must be divisible by element size")
         if size is None:
             size = data.size // dtype.itemsize
             size = size - offset
+        if len(children) != 0:
+            raise ValueError(f"{type(self).__name__} must have no children.")
         super().__init__(
-            data,
+            data=data,
             size=size,
             dtype=dtype,
             mask=mask,
             offset=offset,
             null_count=null_count,
+            children=children,
         )
 
+    @staticmethod
+    def _validate_dtype_instance(dtype: np.dtype) -> np.dtype:
+        if not (isinstance(dtype, np.dtype) and dtype.kind == "M"):
+            raise ValueError("dtype must be a datetime, numpy dtype")
+        return dtype
+
     def __contains__(self, item: ScalarLike) -> bool:
         try:
             ts = pd.Timestamp(item).as_unit(self.time_unit)
@@ -858,21 +866,30 @@ class DatetimeTZColumn(DatetimeColumn):
     def __init__(
         self,
         data: Buffer,
+        size: int | None,
         dtype: pd.DatetimeTZDtype,
         mask: Buffer | None = None,
-        size: int | None = None,
         offset: int = 0,
         null_count: int | None = None,
+        children: tuple = (),
     ):
         super().__init__(
             data=data,
-            dtype=_get_base_dtype(dtype),
-            mask=mask,
             size=size,
+            dtype=dtype,
+            mask=mask,
             offset=offset,
             null_count=null_count,
+            children=children,
         )
-        self._dtype = get_compatible_timezone(dtype)
+
+    @staticmethod
+    def _validate_dtype_instance(
+        dtype: pd.DatetimeTZDtype,
+    ) -> pd.DatetimeTZDtype:
+        if not isinstance(dtype, pd.DatetimeTZDtype):
+            raise ValueError("dtype must be a pandas.DatetimeTZDtype")
+        return get_compatible_timezone(dtype)
 
     def to_pandas(
         self,
diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py
index ba0dc4779bb..6b6f3e517a8 100644
--- a/python/cudf/cudf/core/column/timedelta.py
+++ b/python/cudf/cudf/core/column/timedelta.py
@@ -75,28 +75,33 @@ class TimeDeltaColumn(ColumnBase):
     def __init__(
         self,
         data: Buffer,
-        dtype: Dtype,
-        size: int | None = None,  # TODO: make non-optional
+        size: int | None,
+        dtype: np.dtype,
         mask: Buffer | None = None,
         offset: int = 0,
         null_count: int | None = None,
+        children: tuple = (),
     ):
-        dtype = cudf.dtype(dtype)
-        if dtype.kind != "m":
-            raise TypeError(f"{self.dtype} is not a supported duration type")
+        if not isinstance(data, Buffer):
+            raise ValueError("data must be a Buffer.")
+        if not (isinstance(dtype, np.dtype) and dtype.kind == "m"):
+            raise ValueError("dtype must be a timedelta numpy dtype.")
 
         if data.size % dtype.itemsize:
             raise ValueError("Buffer size must be divisible by element size")
         if size is None:
             size = data.size // dtype.itemsize
             size = size - offset
+        if len(children) != 0:
+            raise ValueError("TimedeltaColumn must have no children.")
         super().__init__(
-            data,
+            data=data,
             size=size,
             dtype=dtype,
             mask=mask,
             offset=offset,
             null_count=null_count,
+            children=children,
         )
 
     def __contains__(self, item: DatetimeLikeScalar) -> bool:

From f955dd76b47779d4f527efe25de417b1acbff4a7 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Thu, 15 Aug 2024 17:13:58 -0700
Subject: [PATCH 673/842] Rewrite remaining Python Arrow interop conversions
 using the C Data Interface (#16548)

This PR rewrites all remaining parts of the Python interop code previously using Arrow C++ types to instead use the C Data Interface. With this change, we no longer require pyarrow in that part of the Cython code. There are further improvements that we should make to streamline the internals, but I would like to keep this changeset minimal since getting it merged unblocks progress on multiple fronts so that we can progress further in parallel.

Contributes to #15193

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Yunsong Wang (https://github.com/PointKernel)

URL: https://github.com/rapidsai/cudf/pull/16548
---
 cpp/src/interop/arrow_utilities.cpp           |   1 +
 cpp/src/interop/to_arrow_schema.cpp           |   5 +-
 python/cudf/cudf/_lib/CMakeLists.txt          |   6 +-
 .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt   |   5 +-
 python/cudf/cudf/_lib/pylibcudf/interop.pyx   | 188 +++++++++---------
 .../cudf/_lib/pylibcudf/libcudf/interop.pxd   |  53 +++--
 .../cudf/cudf/pylibcudf_tests/common/utils.py |   6 +-
 7 files changed, 146 insertions(+), 118 deletions(-)

diff --git a/cpp/src/interop/arrow_utilities.cpp b/cpp/src/interop/arrow_utilities.cpp
index 4292552a800..3776daf41aa 100644
--- a/cpp/src/interop/arrow_utilities.cpp
+++ b/cpp/src/interop/arrow_utilities.cpp
@@ -98,6 +98,7 @@ ArrowType id_to_arrow_type(cudf::type_id id)
 ArrowType id_to_arrow_storage_type(cudf::type_id id)
 {
   switch (id) {
+    case cudf::type_id::TIMESTAMP_DAYS: return NANOARROW_TYPE_INT32;
     case cudf::type_id::TIMESTAMP_SECONDS:
     case cudf::type_id::TIMESTAMP_MILLISECONDS:
     case cudf::type_id::TIMESTAMP_MICROSECONDS:
diff --git a/cpp/src/interop/to_arrow_schema.cpp b/cpp/src/interop/to_arrow_schema.cpp
index b98ca8a7bed..5afed772656 100644
--- a/cpp/src/interop/to_arrow_schema.cpp
+++ b/cpp/src/interop/to_arrow_schema.cpp
@@ -170,8 +170,9 @@ int dispatch_to_arrow_type::operator()<cudf::list_view>(column_view input,
   NANOARROW_RETURN_NOT_OK(ArrowSchemaSetType(out, NANOARROW_TYPE_LIST));
   auto child = input.child(cudf::lists_column_view::child_column_index);
   ArrowSchemaInit(out->children[0]);
-  auto child_meta =
-    metadata.children_meta.empty() ? column_metadata{"element"} : metadata.children_meta[0];
+  auto child_meta = metadata.children_meta.empty()
+                      ? column_metadata{"element"}
+                      : metadata.children_meta[cudf::lists_column_view::child_column_index];
 
   out->flags = input.has_nulls() ? ARROW_FLAG_NULLABLE : 0;
   NANOARROW_RETURN_NOT_OK(ArrowSchemaSetName(out->children[0], child_meta.name.c_str()));
diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt
index 38b7e9ebe04..d32a2d8e3f8 100644
--- a/python/cudf/cudf/_lib/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/CMakeLists.txt
@@ -64,9 +64,13 @@ rapids_cython_create_modules(
 
 target_link_libraries(strings_udf PUBLIC cudf_strings_udf)
 
-set(targets_using_arrow_headers interop avro csv orc json parquet)
+set(targets_using_arrow_headers avro csv orc json parquet)
 link_to_pyarrow_headers("${targets_using_arrow_headers}")
 
+include(${rapids-cmake-dir}/export/find_package_root.cmake)
+include(../../../../cpp/cmake/thirdparty/get_nanoarrow.cmake)
+target_link_libraries(interop PUBLIC nanoarrow)
+
 add_subdirectory(io)
 add_subdirectory(nvtext)
 add_subdirectory(pylibcudf)
diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
index df4591baa71..da32d530928 100644
--- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
@@ -52,7 +52,10 @@ rapids_cython_create_modules(
   SOURCE_FILES "${cython_sources}"
   LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_ ASSOCIATED_TARGETS cudf
 )
-link_to_pyarrow_headers(pylibcudf_interop)
+
+include(${rapids-cmake-dir}/export/find_package_root.cmake)
+include(../../../../../cpp/cmake/thirdparty/get_nanoarrow.cmake)
+target_link_libraries(pylibcudf_interop PUBLIC nanoarrow)
 
 add_subdirectory(libcudf)
 add_subdirectory(strings)
diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pyx b/python/cudf/cudf/_lib/pylibcudf/interop.pyx
index adf7e1fd7e8..caa19724786 100644
--- a/python/cudf/cudf/_lib/pylibcudf/interop.pyx
+++ b/python/cudf/cudf/_lib/pylibcudf/interop.pyx
@@ -1,11 +1,10 @@
 # Copyright (c) 2023-2024, NVIDIA CORPORATION.
 
-from cpython cimport pycapsule
-from cython.operator cimport dereference
-from libcpp.memory cimport shared_ptr, unique_ptr
+from cpython.pycapsule cimport PyCapsule_GetPointer, PyCapsule_New
+from libc.stdlib cimport free
+from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
-from pyarrow cimport lib as pa
 
 from dataclasses import dataclass, field
 from functools import singledispatch
@@ -18,23 +17,14 @@ from cudf._lib.pylibcudf.libcudf.interop cimport (
     ArrowArrayStream,
     ArrowSchema,
     column_metadata,
-    from_arrow as cpp_from_arrow,
     from_arrow_column as cpp_from_arrow_column,
     from_arrow_stream as cpp_from_arrow_stream,
-    to_arrow as cpp_to_arrow,
-)
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport (
-    fixed_point_scalar,
-    scalar,
+    to_arrow_host_raw,
+    to_arrow_schema_raw,
 )
 from cudf._lib.pylibcudf.libcudf.table.table cimport table
-from cudf._lib.pylibcudf.libcudf.wrappers.decimals cimport (
-    decimal32,
-    decimal64,
-    decimal128,
-    scale_type,
-)
 
+from . cimport copying
 from .column cimport Column
 from .scalar cimport Scalar
 from .table cimport Table
@@ -109,7 +99,9 @@ def from_arrow(pyarrow_object, *, DataType data_type=None):
     Union[Table, Scalar]
         The converted object of type corresponding to the input type in cudf.
     """
-    raise TypeError("from_arrow only accepts Table and Scalar objects")
+    raise TypeError(
+        f"Unsupported type {type(pyarrow_object)} for conversion from arrow"
+    )
 
 
 @from_arrow.register(pa.DataType)
@@ -133,7 +125,7 @@ def _from_arrow_table(pyarrow_object, *, DataType data_type=None):
         raise ValueError("data_type may not be passed for tables")
     stream = pyarrow_object.__arrow_c_stream__()
     cdef ArrowArrayStream* c_stream = (
-        <ArrowArrayStream*>pycapsule.PyCapsule_GetPointer(stream, "arrow_array_stream")
+        <ArrowArrayStream*>PyCapsule_GetPointer(stream, "arrow_array_stream")
     )
 
     cdef unique_ptr[table] c_result
@@ -146,54 +138,17 @@ def _from_arrow_table(pyarrow_object, *, DataType data_type=None):
 
 @from_arrow.register(pa.Scalar)
 def _from_arrow_scalar(pyarrow_object, *, DataType data_type=None):
-    cdef shared_ptr[pa.CScalar] arrow_scalar = pa.pyarrow_unwrap_scalar(pyarrow_object)
-
-    cdef unique_ptr[scalar] c_result
-    with nogil:
-        c_result = move(cpp_from_arrow(dereference(arrow_scalar)))
-
-    cdef Scalar result = Scalar.from_libcudf(move(c_result))
-
-    if result.type().id() != type_id.DECIMAL128:
-        if data_type is not None:
-            raise ValueError(
-                "dtype may not be passed for non-decimal types"
-            )
-        return result
-
-    if data_type is None:
-        raise ValueError(
-            "Decimal scalars must be constructed with a dtype"
-        )
-
-    cdef type_id tid = data_type.id()
-
-    if tid == type_id.DECIMAL32:
-        result.c_obj.reset(
-            new fixed_point_scalar[decimal32](
-                (
-                    <fixed_point_scalar[decimal128]*> result.c_obj.get()
-                ).value(),
-                scale_type(-pyarrow_object.type.scale),
-                result.c_obj.get().is_valid()
-            )
-        )
-    elif tid == type_id.DECIMAL64:
-        result.c_obj.reset(
-            new fixed_point_scalar[decimal64](
-                (
-                    <fixed_point_scalar[decimal128]*> result.c_obj.get()
-                ).value(),
-                scale_type(-pyarrow_object.type.scale),
-                result.c_obj.get().is_valid()
-            )
-        )
-    elif tid != type_id.DECIMAL128:
-        raise ValueError(
-            "Decimal scalars may only be cast to decimals"
-        )
-
-    return result
+    if isinstance(pyarrow_object.type, pa.ListType) and pyarrow_object.as_py() is None:
+        # pyarrow doesn't correctly handle None values for list types, so
+        # we have to create this one manually.
+        # https://github.com/apache/arrow/issues/40319
+        pa_array = pa.array([None], type=pyarrow_object.type)
+    else:
+        pa_array = pa.array([pyarrow_object])
+    return copying.get_element(
+        from_arrow(pa_array, data_type=data_type),
+        0,
+    )
 
 
 @from_arrow.register(pa.Array)
@@ -204,10 +159,10 @@ def _from_arrow_column(pyarrow_object, *, DataType data_type=None):
 
     schema, array = pyarrow_object.__arrow_c_array__()
     cdef ArrowSchema* c_schema = (
-        <ArrowSchema*>pycapsule.PyCapsule_GetPointer(schema, "arrow_schema")
+        <ArrowSchema*>PyCapsule_GetPointer(schema, "arrow_schema")
     )
     cdef ArrowArray* c_array = (
-        <ArrowArray*>pycapsule.PyCapsule_GetPointer(array, "arrow_array")
+        <ArrowArray*>PyCapsule_GetPointer(array, "arrow_array")
     )
 
     cdef unique_ptr[column] c_result
@@ -238,7 +193,7 @@ def to_arrow(cudf_object, metadata=None):
     Union[pyarrow.Array, pyarrow.Table, pyarrow.Scalar]
         The converted object of type corresponding to the input type in PyArrow.
     """
-    raise TypeError("to_arrow only accepts Table and Scalar objects")
+    raise TypeError(f"Unsupported type {type(cudf_object)} for conversion to arrow")
 
 
 @to_arrow.register(DataType)
@@ -281,46 +236,83 @@ def _to_arrow_datatype(cudf_object, **kwargs):
             )
 
 
-@to_arrow.register(Table)
-def _to_arrow_table(cudf_object, metadata=None):
+cdef void _release_schema(object schema_capsule) noexcept:
+    """Release the ArrowSchema object stored in a PyCapsule."""
+    cdef ArrowSchema* schema = <ArrowSchema*>PyCapsule_GetPointer(
+        schema_capsule, 'arrow_schema'
+    )
+    if schema.release != NULL:
+        schema.release(schema)
+
+    free(schema)
+
+
+cdef void _release_array(object array_capsule) noexcept:
+    """Release the ArrowArray object stored in a PyCapsule."""
+    cdef ArrowArray* array = <ArrowArray*>PyCapsule_GetPointer(
+        array_capsule, 'arrow_array'
+    )
+    if array.release != NULL:
+        array.release(array)
+
+    free(array)
+
+
+def _table_to_schema(Table tbl, metadata):
     if metadata is None:
-        metadata = [ColumnMetadata() for _ in range(len(cudf_object.columns()))]
+        metadata = [ColumnMetadata() for _ in range(len(tbl.columns()))]
     metadata = [ColumnMetadata(m) if isinstance(m, str) else m for m in metadata]
-    cdef vector[column_metadata] c_table_metadata
-    cdef shared_ptr[pa.CTable] c_table_result
+
+    cdef vector[column_metadata] c_metadata
+    c_metadata.reserve(len(metadata))
     for meta in metadata:
-        c_table_metadata.push_back(_metadata_to_libcudf(meta))
+        c_metadata.push_back(_metadata_to_libcudf(meta))
+
+    cdef ArrowSchema* raw_schema_ptr
     with nogil:
-        c_table_result = move(
-            cpp_to_arrow((<Table> cudf_object).view(), c_table_metadata)
-        )
+        raw_schema_ptr = to_arrow_schema_raw(tbl.view(), c_metadata)
 
-    return pa.pyarrow_wrap_table(c_table_result)
+    return PyCapsule_New(<void*>raw_schema_ptr, 'arrow_schema', _release_schema)
 
 
-@to_arrow.register(Scalar)
-def _to_arrow_scalar(cudf_object, metadata=None):
-    # Note that metadata for scalars is primarily important for preserving
-    # information on nested types since names are otherwise irrelevant.
-    if metadata is None:
-        metadata = ColumnMetadata()
-    metadata = ColumnMetadata(metadata) if isinstance(metadata, str) else metadata
-    cdef column_metadata c_scalar_metadata = _metadata_to_libcudf(metadata)
-    cdef shared_ptr[pa.CScalar] c_scalar_result
+def _table_to_host_array(Table tbl):
+    cdef ArrowArray* raw_host_array_ptr
     with nogil:
-        c_scalar_result = move(
-            cpp_to_arrow(
-                dereference((<Scalar> cudf_object).c_obj), c_scalar_metadata
-            )
-        )
+        raw_host_array_ptr = to_arrow_host_raw(tbl.view())
+
+    return PyCapsule_New(<void*>raw_host_array_ptr, "arrow_array", _release_array)
+
+
+class _TableWithArrowMetadata:
+    def __init__(self, tbl, metadata=None):
+        self.tbl = tbl
+        self.metadata = metadata
 
-    return pa.pyarrow_wrap_scalar(c_scalar_result)
+    def __arrow_c_array__(self, requested_schema=None):
+        return _table_to_schema(self.tbl, self.metadata), _table_to_host_array(self.tbl)
+
+
+# TODO: In the long run we should get rid of the `to_arrow` functions in favor of using
+# the protocols directly via `pa.table(cudf_object, schema=...)` directly. We can do the
+# same for columns. We cannot do this for scalars since there is no corresponding
+# protocol. Since this will require broader changes throughout the codebase, the current
+# approach is to leverage the protocol internally but to continue exposing `to_arrow`.
+@to_arrow.register(Table)
+def _to_arrow_table(cudf_object, metadata=None):
+    test_table = _TableWithArrowMetadata(cudf_object, metadata)
+    return pa.table(test_table)
 
 
 @to_arrow.register(Column)
 def _to_arrow_array(cudf_object, metadata=None):
     """Create a PyArrow array from a pylibcudf column."""
-    if metadata is None:
-        metadata = ColumnMetadata()
-    metadata = ColumnMetadata(metadata) if isinstance(metadata, str) else metadata
-    return to_arrow(Table([cudf_object]), [metadata])[0]
+    if metadata is not None:
+        metadata = [metadata]
+    return to_arrow(Table([cudf_object]), metadata)[0]
+
+
+@to_arrow.register(Scalar)
+def _to_arrow_scalar(cudf_object, metadata=None):
+    # Note that metadata for scalars is primarily important for preserving
+    # information on nested types since names are otherwise irrelevant.
+    return to_arrow(Column.from_scalar(cudf_object, 1), metadata=metadata)[0]
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/interop.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/interop.pxd
index 2151da28d4b..24d96b602dc 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/interop.pxd
+++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/interop.pxd
@@ -3,11 +3,11 @@
 from libcpp.memory cimport shared_ptr, unique_ptr
 from libcpp.string cimport string
 from libcpp.vector cimport vector
-from pyarrow.lib cimport CScalar, CTable
 
 from cudf._lib.types import cudf_to_np_types, np_to_cudf_types
 
 from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
 from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
 from cudf._lib.pylibcudf.libcudf.table.table cimport table
 from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
@@ -29,6 +29,9 @@ cdef extern from "cudf/interop.hpp" nogil:
     cdef struct ArrowArrayStream:
         void (*release)(ArrowArrayStream*) noexcept nogil
 
+    cdef struct ArrowDeviceArray:
+        ArrowArray array
+
 
 cdef extern from "cudf/interop.hpp" namespace "cudf" \
         nogil:
@@ -38,27 +41,49 @@ cdef extern from "cudf/interop.hpp" namespace "cudf" \
     DLManagedTensor* to_dlpack(table_view input_table
                                ) except +
 
-    cdef unique_ptr[table] from_arrow(CTable input) except +
-    cdef unique_ptr[scalar] from_arrow(CScalar input) except +
-
     cdef cppclass column_metadata:
         column_metadata() except +
         column_metadata(string name_) except +
         string name
         vector[column_metadata] children_meta
 
-    cdef shared_ptr[CTable] to_arrow(
-        table_view input,
-        vector[column_metadata] metadata,
-    ) except +
-
-    cdef shared_ptr[CScalar] to_arrow(
-        const scalar& input,
-        column_metadata metadata,
-    ) except +
-
     cdef unique_ptr[table] from_arrow_stream(ArrowArrayStream* input) except +
     cdef unique_ptr[column] from_arrow_column(
         const ArrowSchema* schema,
         const ArrowArray* input
     ) except +
+
+
+cdef extern from *:
+    # Rather than exporting the underlying functions directly to Cython, we expose
+    # these wrappers that handle the release to avoid needing to teach Cython how
+    # to handle unique_ptrs with custom deleters that aren't default constructible.
+    # This will go away once we introduce cudf::arrow_column (need a
+    # cudf::arrow_schema as well), see
+    # https://github.com/rapidsai/cudf/issues/16104.
+    """
+    #include <nanoarrow/nanoarrow.h>
+    #include <nanoarrow/nanoarrow_device.h>
+
+    ArrowSchema* to_arrow_schema_raw(
+      cudf::table_view const& input,
+      cudf::host_span<cudf::column_metadata const> metadata) {
+      return to_arrow_schema(input, metadata).release();
+    }
+
+    ArrowArray* to_arrow_host_raw(
+      cudf::table_view const& tbl,
+      rmm::cuda_stream_view stream        = cudf::get_default_stream(),
+      rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) {
+      // Assumes the sync event is null and the data is already on the host.
+      ArrowArray *arr = new ArrowArray();
+      auto device_arr = cudf::to_arrow_host(tbl, stream, mr);
+      ArrowArrayMove(&device_arr->array, arr);
+      return arr;
+    }
+    """
+    cdef ArrowSchema *to_arrow_schema_raw(
+        const table_view& tbl,
+        const vector[column_metadata]& metadata,
+    ) except + nogil
+    cdef ArrowArray* to_arrow_host_raw(const table_view& tbl) except + nogil
diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py
index e19ff58927f..acb2b5be85c 100644
--- a/python/cudf/cudf/pylibcudf_tests/common/utils.py
+++ b/python/cudf/cudf/pylibcudf_tests/common/utils.py
@@ -44,7 +44,7 @@ def metadata_from_arrow_type(
 def assert_column_eq(
     lhs: pa.Array | plc.Column,
     rhs: pa.Array | plc.Column,
-    check_field_nullability=True,
+    check_field_nullability=False,
 ) -> None:
     """Verify that a pylibcudf array and PyArrow array are equal.
 
@@ -59,7 +59,9 @@ def assert_column_eq(
         on child fields are equal.
 
         Useful for checking roundtripping of lossy formats like JSON that may not
-        preserve this information.
+        preserve this information. Also, our Arrow interop functions make different
+        choices by default than pyarrow field constructors since the interop functions
+        may make data-dependent choices.
     """
     # Nested types require children metadata to be passed to the conversion function.
     if isinstance(lhs, (pa.Array, pa.ChunkedArray)) and isinstance(

From 1c63e1ee31a07fb4999d7356919280ba3d528741 Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Thu, 15 Aug 2024 21:51:47 -0400
Subject: [PATCH 674/842] Initial investigation into NumPy proxying in
 `cudf.pandas` (#16286)

Apart of #15397. Closes #14537. Creates `ProxyNDarray` which inherits from `np.ndarray`.

Authors:
  - Matthew Murray (https://github.com/Matt711)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/16286
---
 python/cudf/cudf/pandas/_wrappers/numpy.py    |  3 +++
 python/cudf/cudf/pandas/fast_slow_proxy.py    | 20 +++++++++++++++-
 python/cudf/cudf/pandas/proxy_base.py         | 23 +++++++++++++++++++
 .../cudf_pandas_tests/test_cudf_pandas.py     |  8 +++++++
 4 files changed, 53 insertions(+), 1 deletion(-)
 create mode 100644 python/cudf/cudf/pandas/proxy_base.py

diff --git a/python/cudf/cudf/pandas/_wrappers/numpy.py b/python/cudf/cudf/pandas/_wrappers/numpy.py
index 3b012169676..eabea9713f1 100644
--- a/python/cudf/cudf/pandas/_wrappers/numpy.py
+++ b/python/cudf/cudf/pandas/_wrappers/numpy.py
@@ -14,6 +14,7 @@
     make_final_proxy_type,
     make_intermediate_proxy_type,
 )
+from ..proxy_base import ProxyNDarrayBase
 from .common import (
     array_interface,
     array_method,
@@ -111,12 +112,14 @@ def wrap_ndarray(cls, arr: cupy.ndarray | numpy.ndarray, constructor):
     numpy.ndarray,
     fast_to_slow=cupy.ndarray.get,
     slow_to_fast=cupy.asarray,
+    bases=(ProxyNDarrayBase,),
     additional_attributes={
         "__array__": array_method,
         # So that pa.array(wrapped-numpy-array) works
         "__arrow_array__": arrow_array_method,
         "__cuda_array_interface__": cuda_array_interface,
         "__array_interface__": array_interface,
+        "__array_ufunc__": _FastSlowAttribute("__array_ufunc__"),
         # ndarrays are unhashable
         "__hash__": None,
         # iter(cupy-array) produces an iterable of zero-dim device
diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py
index bb678fd1efe..61aa6310082 100644
--- a/python/cudf/cudf/pandas/fast_slow_proxy.py
+++ b/python/cudf/cudf/pandas/fast_slow_proxy.py
@@ -19,6 +19,7 @@
 from ..options import _env_get_bool
 from ..testing import assert_eq
 from .annotation import nvtx
+from .proxy_base import ProxyNDarrayBase
 
 
 def call_operator(fn, args, kwargs):
@@ -564,7 +565,11 @@ def _fsproxy_wrap(cls, value, func):
         _FinalProxy subclasses can override this classmethod if they
         need particular behaviour when wrapped up.
         """
-        proxy = object.__new__(cls)
+        base_class = _get_proxy_base_class(cls)
+        if base_class is object:
+            proxy = base_class.__new__(cls)
+        else:
+            proxy = base_class.__new__(cls, value)
         proxy._fsproxy_wrapped = value
         return proxy
 
@@ -1193,6 +1198,19 @@ def is_proxy_object(obj: Any) -> bool:
     return False
 
 
+def _get_proxy_base_class(cls):
+    """Returns the proxy base class if one exists"""
+    for proxy_class in PROXY_BASE_CLASSES:
+        if proxy_class in cls.__mro__:
+            return proxy_class
+    return object
+
+
+PROXY_BASE_CLASSES: set[type] = {
+    ProxyNDarrayBase,
+}
+
+
 NUMPY_TYPES: set[str] = set(np.sctypeDict.values())
 
 
diff --git a/python/cudf/cudf/pandas/proxy_base.py b/python/cudf/cudf/pandas/proxy_base.py
new file mode 100644
index 00000000000..61d9cde127c
--- /dev/null
+++ b/python/cudf/cudf/pandas/proxy_base.py
@@ -0,0 +1,23 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import cupy as cp
+import numpy as np
+
+
+class ProxyNDarrayBase(np.ndarray):
+    def __new__(cls, arr):
+        if isinstance(arr, cp.ndarray):
+            obj = np.asarray(arr.get()).view(cls)
+            return obj
+        elif isinstance(arr, np.ndarray):
+            obj = np.asarray(arr).view(cls)
+            return obj
+        else:
+            raise TypeError(
+                "Unsupported array type. Must be numpy.ndarray or cupy.ndarray"
+            )
+
+    def __array_finalize__(self, obj):
+        self._fsproxy_wrapped = getattr(obj, "_fsproxy_wrapped", None)
diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
index 6292022d8e4..e5483fff913 100644
--- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
+++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
@@ -1632,3 +1632,11 @@ def test_change_index_name(index):
 
         assert s.index.name == name
         assert df.index.name == name
+
+
+def test_numpy_ndarray_isinstancecheck(series):
+    s1, s2 = series
+    arr1 = s1.values
+    arr2 = s2.values
+    assert isinstance(arr1, np.ndarray)
+    assert isinstance(arr2, np.ndarray)

From e690d9d25b4fadbd553f7ef14ac4918e95d98b0e Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 15 Aug 2024 16:48:49 -1000
Subject: [PATCH 675/842] Ensure size is always passed to NumericalColumn
 (#16576)

https://github.com/rapidsai/cudf/pull/16457 requires `NumericalColumn` to be constructed with `size`. It appears another PR got in after this PR was created so there are currently a few usages where `size` isn't passed in.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16576
---
 python/cudf/cudf/core/_internals/where.py   |  4 +--
 python/cudf/cudf/core/column/categorical.py | 37 +++++----------------
 python/cudf/cudf/core/column/column.py      |  1 +
 python/cudf/cudf/core/column/numerical.py   |  1 +
 python/cudf/cudf/core/dataframe.py          |  5 +--
 python/cudf/cudf/core/index.py              |  1 +
 6 files changed, 13 insertions(+), 36 deletions(-)

diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py
index 9f36499586b..0c754317185 100644
--- a/python/cudf/cudf/core/_internals/where.py
+++ b/python/cudf/cudf/core/_internals/where.py
@@ -110,9 +110,7 @@ def _make_categorical_like(result, column):
     if isinstance(column, cudf.core.column.CategoricalColumn):
         result = cudf.core.column.build_categorical_column(
             categories=column.categories,
-            codes=cudf.core.column.NumericalColumn(
-                result.base_data, dtype=result.dtype
-            ),
+            codes=result,
             mask=result.base_mask,
             size=result.size,
             offset=result.offset,
diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index d25983842f9..66aed38bffd 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -659,10 +659,7 @@ def slice(self, start: int, stop: int, stride: int | None = None) -> Self:
             Self,
             cudf.core.column.build_categorical_column(
                 categories=self.categories,
-                codes=cudf.core.column.NumericalColumn(
-                    codes.base_data,  # type: ignore[arg-type]
-                    dtype=codes.dtype,
-                ),
+                codes=codes,
                 mask=codes.base_mask,
                 ordered=self.ordered,
                 size=codes.size,
@@ -734,10 +731,7 @@ def sort_values(
         codes = self.codes.sort_values(ascending, na_position)
         col = column.build_categorical_column(
             categories=self.dtype.categories._values,
-            codes=cudf.core.column.NumericalColumn(
-                codes.base_data,  # type: ignore[arg-type]
-                dtype=codes.dtype,
-            ),
+            codes=codes,
             mask=codes.base_mask,
             size=codes.size,
             ordered=self.dtype.ordered,
@@ -845,10 +839,7 @@ def unique(self) -> CategoricalColumn:
         codes = self.codes.unique()
         return column.build_categorical_column(
             categories=self.categories,
-            codes=cudf.core.column.NumericalColumn(
-                codes.base_data,  # type: ignore[arg-type]
-                dtype=codes.dtype,
-            ),
+            codes=codes,
             mask=codes.base_mask,
             offset=codes.offset,
             size=codes.size,
@@ -986,9 +977,7 @@ def find_and_replace(
 
         result = column.build_categorical_column(
             categories=new_cats["cats"],
-            codes=cudf.core.column.NumericalColumn(
-                output.base_data, dtype=output.dtype
-            ),
+            codes=output,
             mask=output.base_mask,
             offset=output.offset,
             size=output.size,
@@ -1184,10 +1173,7 @@ def _concat(
 
         return column.build_categorical_column(
             categories=column.as_column(cats),
-            codes=cudf.core.column.NumericalColumn(
-                codes_col.base_data,  # type: ignore[arg-type]
-                dtype=codes_col.dtype,
-            ),
+            codes=codes_col,
             mask=codes_col.base_mask,
             size=codes_col.size,
             offset=codes_col.offset,
@@ -1199,10 +1185,7 @@ def _with_type_metadata(
         if isinstance(dtype, CategoricalDtype):
             return column.build_categorical_column(
                 categories=dtype.categories._values,
-                codes=cudf.core.column.NumericalColumn(
-                    self.codes.base_data,  # type: ignore[arg-type]
-                    dtype=self.codes.dtype,
-                ),
+                codes=self.codes,
                 mask=self.codes.base_mask,
                 ordered=dtype.ordered,
                 size=self.codes.size,
@@ -1345,9 +1328,7 @@ def _set_categories(
             Self,
             column.build_categorical_column(
                 categories=new_cats,
-                codes=cudf.core.column.NumericalColumn(
-                    new_codes.base_data, dtype=new_codes.dtype
-                ),
+                codes=new_codes,
                 mask=new_codes.base_mask,
                 size=new_codes.size,
                 offset=new_codes.offset,
@@ -1478,9 +1459,7 @@ def pandas_categorical_as_column(
 
     return column.build_categorical_column(
         categories=categorical.categories,
-        codes=cudf.core.column.NumericalColumn(
-            codes.base_data, dtype=codes.dtype
-        ),
+        codes=codes,
         size=codes.size,
         mask=mask,
         ordered=categorical.ordered,
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index b0e33e8b9ce..090c02da990 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -1513,6 +1513,7 @@ def column_empty(
                         * cudf.dtype(libcudf.types.size_type_dtype).itemsize
                     )
                 ),
+                size=None,
                 dtype=libcudf.types.size_type_dtype,
             ),
         )
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index 16e78ef35ef..ac36813202a 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -654,6 +654,7 @@ def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase:
                 categories=dtype.categories._values,
                 codes=cudf.core.column.NumericalColumn(
                     self.base_data,  # type: ignore[arg-type]
+                    self.size,
                     dtype=self.dtype,
                 ),
                 mask=self.base_mask,
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 3033abd53f5..f935217f4f9 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -46,7 +46,6 @@
 from cudf.core.column import (
     CategoricalColumn,
     ColumnBase,
-    NumericalColumn,
     StructColumn,
     as_column,
     build_categorical_column,
@@ -8541,9 +8540,7 @@ def _reassign_categories(categories, cols, col_idxs):
         if idx in categories:
             cols[name] = build_categorical_column(
                 categories=categories[idx],
-                codes=NumericalColumn(
-                    cols[name].base_data, dtype=cols[name].dtype
-                ),
+                codes=cols[name],
                 mask=cols[name].base_mask,
                 offset=cols[name].offset,
                 size=cols[name].size,
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index d02633a97fa..ee2f0317f8d 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -2501,6 +2501,7 @@ def _get_dt_field(self, field: str) -> Index:
         out_column = self._column.get_dt_field(field)
         out_column = NumericalColumn(
             data=out_column.base_data,
+            size=out_column.size,
             dtype=out_column.dtype,
             mask=out_column.base_mask,
             offset=out_column.offset,

From e197d72f2daafb2f4804f823019b1ca7810ed560 Mon Sep 17 00:00:00 2001
From: "Richard (Rick) Zamora" <rzamora217@gmail.com>
Date: Fri, 16 Aug 2024 09:45:30 -0700
Subject: [PATCH 676/842] Replace `NativeFile` dependency in dask-cudf Parquet
 reader (#16569)

Replaces `read_parquet` logic that currently depends on `NativeFile` for remote-storage access.

**NOTE**: ~It is possible to remove `NativeFile` usage without adding the new `_prefetch_remote_buffers` logic.~ ~However, I'd like to replace the cudf data-transfer logic soon anyway.~

Authors:
  - Richard (Rick) Zamora (https://github.com/rjzamora)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Mads R. B. Kristensen (https://github.com/madsbk)

URL: https://github.com/rapidsai/cudf/pull/16569
---
 python/dask_cudf/dask_cudf/backends.py        |  21 ++++
 python/dask_cudf/dask_cudf/io/parquet.py      | 102 +++++++-----------
 .../dask_cudf/dask_cudf/io/tests/test_s3.py   |  64 +++++++----
 3 files changed, 101 insertions(+), 86 deletions(-)

diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py
index 82ea2ac033a..a65ae819b44 100644
--- a/python/dask_cudf/dask_cudf/backends.py
+++ b/python/dask_cudf/dask_cudf/backends.py
@@ -498,6 +498,25 @@ def _unsupported_kwargs(old, new, kwargs):
         )
 
 
+def _raise_unsupported_parquet_kwargs(
+    open_file_options=None, filesystem=None, **kwargs
+):
+    import fsspec
+
+    if open_file_options is not None:
+        raise ValueError(
+            "The open_file_options argument is no longer supported "
+            "by the 'cudf' backend."
+        )
+
+    if filesystem not in ("fsspec", None) and not isinstance(
+        filesystem, fsspec.AbstractFileSystem
+    ):
+        raise ValueError(
+            f"filesystem={filesystem} is not supported by the 'cudf' backend."
+        )
+
+
 # Register cudf->pandas
 to_pandas_dispatch = PandasBackendEntrypoint.to_backend_dispatch()
 
@@ -573,6 +592,7 @@ def from_dict(
     def read_parquet(*args, engine=None, **kwargs):
         from dask_cudf.io.parquet import CudfEngine
 
+        _raise_unsupported_parquet_kwargs(**kwargs)
         return _default_backend(
             dd.read_parquet,
             *args,
@@ -665,6 +685,7 @@ def read_parquet(*args, engine=None, **kwargs):
 
         from dask_cudf.io.parquet import CudfEngine
 
+        _raise_unsupported_parquet_kwargs(**kwargs)
         return _default_backend(
             dx.read_parquet, *args, engine=CudfEngine, **kwargs
         )
diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py
index f0cab953458..8f52fce7818 100644
--- a/python/dask_cudf/dask_cudf/io/parquet.py
+++ b/python/dask_cudf/dask_cudf/io/parquet.py
@@ -1,7 +1,6 @@
 # Copyright (c) 2019-2024, NVIDIA CORPORATION.
 import itertools
 import warnings
-from contextlib import ExitStack
 from functools import partial
 from io import BufferedWriter, BytesIO, IOBase
 
@@ -22,18 +21,13 @@
 import cudf
 from cudf.core.column import as_column, build_categorical_column
 from cudf.io import write_to_dataset
-from cudf.io.parquet import (
-    _apply_post_filters,
-    _default_open_file_options,
-    _normalize_filters,
-)
+from cudf.io.parquet import _apply_post_filters, _normalize_filters
 from cudf.utils.dtypes import cudf_dtype_from_pa_type
 from cudf.utils.ioutils import (
     _ROW_GROUP_SIZE_BYTES_DEFAULT,
+    _fsspec_data_transfer,
     _is_local_filesystem,
-    _open_remote_files,
 )
-from cudf.utils.utils import maybe_filter_deprecation
 
 
 class CudfEngine(ArrowDatasetEngine):
@@ -98,63 +92,45 @@ def _read_paths(
 
         dataset_kwargs = dataset_kwargs or {}
         dataset_kwargs["partitioning"] = partitioning or "hive"
-        with ExitStack() as stack:
-            # Non-local filesystem handling
-            paths_or_fobs = paths
-            if not _is_local_filesystem(fs):
-                paths_or_fobs = _open_remote_files(
-                    paths_or_fobs,
-                    fs,
-                    context_stack=stack,
-                    **_default_open_file_options(
-                        open_file_options, columns, row_groups
-                    ),
-                )
 
-            # Filter out deprecation warning unless the user
-            # specifies open_file_options and/or use_python_file_object.
-            # Otherwise, the FutureWarning is out of their control.
-            with maybe_filter_deprecation(
-                (
-                    not open_file_options
-                    and "use_python_file_object" not in kwargs
-                ),
-                message="Support for reading pyarrow's NativeFile is deprecated",
-                category=FutureWarning,
-            ):
-                # Use cudf to read in data
-                try:
-                    df = cudf.read_parquet(
-                        paths_or_fobs,
-                        engine="cudf",
-                        columns=columns,
-                        row_groups=row_groups if row_groups else None,
-                        dataset_kwargs=dataset_kwargs,
-                        categorical_partitions=False,
-                        **kwargs,
-                    )
-                except RuntimeError as err:
-                    # TODO: Remove try/except after null-schema issue is resolved
-                    # (See: https://github.com/rapidsai/cudf/issues/12702)
-                    if len(paths_or_fobs) > 1:
-                        df = cudf.concat(
-                            [
-                                cudf.read_parquet(
-                                    pof,
-                                    engine="cudf",
-                                    columns=columns,
-                                    row_groups=row_groups[i]
-                                    if row_groups
-                                    else None,
-                                    dataset_kwargs=dataset_kwargs,
-                                    categorical_partitions=False,
-                                    **kwargs,
-                                )
-                                for i, pof in enumerate(paths_or_fobs)
-                            ]
+        # Non-local filesystem handling
+        paths_or_fobs = paths
+        if not _is_local_filesystem(fs):
+            paths_or_fobs = [
+                _fsspec_data_transfer(fpath, fs=fs) for fpath in paths
+            ]
+
+        # Use cudf to read in data
+        try:
+            df = cudf.read_parquet(
+                paths_or_fobs,
+                engine="cudf",
+                columns=columns,
+                row_groups=row_groups if row_groups else None,
+                dataset_kwargs=dataset_kwargs,
+                categorical_partitions=False,
+                **kwargs,
+            )
+        except RuntimeError as err:
+            # TODO: Remove try/except after null-schema issue is resolved
+            # (See: https://github.com/rapidsai/cudf/issues/12702)
+            if len(paths_or_fobs) > 1:
+                df = cudf.concat(
+                    [
+                        cudf.read_parquet(
+                            pof,
+                            engine="cudf",
+                            columns=columns,
+                            row_groups=row_groups[i] if row_groups else None,
+                            dataset_kwargs=dataset_kwargs,
+                            categorical_partitions=False,
+                            **kwargs,
                         )
-                    else:
-                        raise err
+                        for i, pof in enumerate(paths_or_fobs)
+                    ]
+                )
+            else:
+                raise err
 
         # Apply filters (if any are defined)
         df = _apply_post_filters(df, filters)
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_s3.py b/python/dask_cudf/dask_cudf/io/tests/test_s3.py
index ac3245b3748..99f19917424 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_s3.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_s3.py
@@ -5,8 +5,8 @@
 from contextlib import contextmanager
 from io import BytesIO
 
+import fsspec
 import pandas as pd
-import pyarrow.fs as pa_fs
 import pytest
 
 from dask.dataframe import assert_eq
@@ -135,35 +135,53 @@ def test_read_csv_warns(s3_base, s3so):
             assert df.a.sum().compute() == 4
 
 
-@pytest.mark.parametrize(
-    "open_file_options",
-    [
-        {"precache_options": {"method": None}},
-        {"precache_options": {"method": "parquet"}},
-        {"open_file_func": None},
-    ],
-)
-def test_read_parquet_open_file_options(s3_base, s3so, open_file_options, pdf):
+def test_read_parquet_open_file_options_raises():
+    with pytest.raises(ValueError):
+        dask_cudf.read_parquet(
+            "s3://my/path",
+            open_file_options={"precache_options": {"method": "parquet"}},
+        )
+
+
+def test_read_parquet_filesystem(s3_base, s3so, pdf):
+    fname = "test_parquet_filesystem.parquet"
+    bucket = "parquet"
     buffer = BytesIO()
     pdf.to_parquet(path=buffer)
     buffer.seek(0)
-    with s3_context(
-        s3_base=s3_base, bucket="daskparquet", files={"file.parq": buffer}
-    ):
-        if "open_file_func" in open_file_options:
-            fs = pa_fs.S3FileSystem(
-                endpoint_override=s3so["client_kwargs"]["endpoint_url"],
+    with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}):
+        path = f"s3://{bucket}/{fname}"
+
+        # Cannot pass filesystem="arrow"
+        with pytest.raises(ValueError):
+            dask_cudf.read_parquet(
+                path,
+                storage_options=s3so,
+                filesystem="arrow",
             )
-            open_file_options["open_file_func"] = fs.open_input_file
+
+        # Can pass filesystem="fsspec"
         df = dask_cudf.read_parquet(
-            "s3://daskparquet/*.parq",
+            path,
             storage_options=s3so,
-            open_file_options=open_file_options,
+            filesystem="fsspec",
         )
-        with pytest.warns(FutureWarning):
-            assert df.a.sum().compute() == 10
-        with pytest.warns(FutureWarning):
-            assert df.b.sum().compute() == 9
+        assert df.b.sum().compute() == 9
+
+
+def test_read_parquet_filesystem_explicit(s3_base, s3so, pdf):
+    fname = "test_parquet_filesystem_explicit.parquet"
+    bucket = "parquet"
+    buffer = BytesIO()
+    pdf.to_parquet(path=buffer)
+    buffer.seek(0)
+    with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}):
+        path = f"s3://{bucket}/{fname}"
+        fs = fsspec.core.get_fs_token_paths(
+            path, mode="rb", storage_options=s3so
+        )[0]
+        df = dask_cudf.read_parquet(path, filesystem=fs)
+        assert df.b.sum().compute() == 9
 
 
 def test_read_parquet(s3_base, s3so, pdf):

From 623dfceb42eb3e73b352b295898ff3e6cfe7c865 Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Fri, 16 Aug 2024 12:50:23 -0400
Subject: [PATCH 677/842] [FEA] Add support for `cudf.unique` (#16554)

closes #16460

Authors:
  - Matthew Murray (https://github.com/Matt711)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/16554
---
 python/cudf/cudf/__init__.py          |   2 +-
 python/cudf/cudf/core/algorithms.py   | 122 ++++++++++++++++++++++++++
 python/cudf/cudf/tests/test_unique.py | 117 ++++++++++++++++++++++++
 3 files changed, 240 insertions(+), 1 deletion(-)
 create mode 100644 python/cudf/cudf/tests/test_unique.py

diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py
index 77ae0791b81..ccc45413de4 100644
--- a/python/cudf/cudf/__init__.py
+++ b/python/cudf/cudf/__init__.py
@@ -24,7 +24,7 @@
     register_series_accessor,
 )
 from cudf.api.types import dtype
-from cudf.core.algorithms import factorize
+from cudf.core.algorithms import factorize, unique
 from cudf.core.cut import cut
 from cudf.core.dataframe import DataFrame, from_dataframe, from_pandas, merge
 from cudf.core.dtypes import (
diff --git a/python/cudf/cudf/core/algorithms.py b/python/cudf/cudf/core/algorithms.py
index e27d6ec8d3e..b28fce6d343 100644
--- a/python/cudf/cudf/core/algorithms.py
+++ b/python/cudf/cudf/core/algorithms.py
@@ -7,6 +7,7 @@
 import cupy as cp
 import numpy as np
 
+import cudf
 from cudf.core.column import as_column
 from cudf.core.index import Index, RangeIndex
 from cudf.core.scalar import Scalar
@@ -145,3 +146,124 @@ def _interpolation(column: ColumnBase, index: BaseIndex) -> ColumnBase:
     first_nan_idx = valid_locs.values.argmax().item()
     result[:first_nan_idx] = np.nan
     return as_column(result)
+
+
+def unique(values):
+    """
+    Return unique values from array-like
+
+    Parameters
+    ----------
+    values : 1d array-like
+
+    Returns
+    -------
+    cudf.Series,
+
+        The return can be:
+
+        * Index : when the input is an Index
+        * cudf.Series : when the input is a Series
+        * cupy.ndarray : when the input is a cupy.ndarray
+
+        Return cudf.Series, cudf.Index, or cupy.ndarray.
+
+    See Also
+    --------
+    Index.unique : Return unique values from an Index.
+    Series.unique : Return unique values of Series object.
+
+    Examples
+    --------
+    >>> cudf.unique(cudf.Series([2, 1, 3, 3]))
+    0    2
+    1    1
+    2    3
+    dtype: int64
+
+    >>> cudf.unique(cudf.Series([2] + [1] * 5))
+    0    2
+    1    1
+    dtype: int64
+
+    >>> cudf.unique(cudf.Series([pd.Timestamp("20160101"), pd.Timestamp("20160101")]))
+    0   2016-01-01
+    dtype: datetime64[ns]
+
+    >>> cudf.unique(
+    ...     cudf.Series(
+    ...         [
+    ...             pd.Timestamp("20160101", tz="US/Eastern"),
+    ...             pd.Timestamp("20160101", tz="US/Eastern"),
+    ...             pd.Timestamp("20160103", tz="US/Eastern"),
+    ...         ]
+    ...     )
+    ... )
+    0   2016-01-01 00:00:00-05:00
+    1   2016-01-03 00:00:00-05:00
+    dtype: datetime64[ns, US/Eastern]
+
+    >>> cudf.unique(
+    ...     cudf.Index(
+    ...         [
+    ...             pd.Timestamp("20160101", tz="US/Eastern"),
+    ...             pd.Timestamp("20160101", tz="US/Eastern"),
+    ...             pd.Timestamp("20160103", tz="US/Eastern"),
+    ...         ]
+    ...     )
+    ... )
+    DatetimeIndex(['2016-01-01 00:00:00-05:00', '2016-01-03 00:00:00-05:00'],dtype='datetime64[ns, US/Eastern]')
+
+    An unordered Categorical will return categories in the
+    order of appearance.
+
+    >>> cudf.unique(cudf.Series(pd.Categorical(list("baabc"))))
+    0    b
+    1    a
+    2    c
+    dtype: category
+    Categories (3, object): ['a', 'b', 'c']
+
+    >>> cudf.unique(cudf.Series(pd.Categorical(list("baabc"), categories=list("abc"))))
+    0    b
+    1    a
+    2    c
+    dtype: category
+    Categories (3, object): ['a', 'b', 'c']
+
+    An ordered Categorical preserves the category ordering.
+
+    >>> pd.unique(
+    ...     pd.Series(
+    ...         pd.Categorical(list("baabc"), categories=list("abc"), ordered=True)
+    ...     )
+    ... )
+    0    b
+    1    a
+    2    c
+    dtype: category
+    Categories (3, object): ['a' < 'b' < 'c']
+
+    An array of tuples
+
+    >>> pd.unique(pd.Series([("a", "b"), ("b", "a"), ("a", "c"), ("b", "a")]).values)
+    array([('a', 'b'), ('b', 'a'), ('a', 'c')], dtype=object)
+    """
+    if not isinstance(values, (cudf.Series, cudf.Index, cp.ndarray)):
+        raise ValueError(
+            "Must pass cudf.Series, cudf.Index, or cupy.ndarray object"
+        )
+    if isinstance(values, cp.ndarray):
+        # pandas.unique will not sort the values in the result
+        # while cupy.unique documents it will, so we pass cupy.ndarray
+        # through cudf.Index to maintain the original order.
+        return cp.asarray(cudf.Index(values).unique())
+    if isinstance(values, cudf.Series):
+        if get_option("mode.pandas_compatible"):
+            if isinstance(values.dtype, cudf.CategoricalDtype):
+                raise NotImplementedError(
+                    "cudf.Categorical is not implemented"
+                )
+            else:
+                return cp.asarray(values.unique())
+    return values.unique()
diff --git a/python/cudf/cudf/tests/test_unique.py b/python/cudf/cudf/tests/test_unique.py
new file mode 100644
index 00000000000..699b3340521
--- /dev/null
+++ b/python/cudf/cudf/tests/test_unique.py
@@ -0,0 +1,117 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import cupy as cp
+import numpy as np
+import pandas as pd
+import pytest
+
+import cudf
+from cudf.testing import assert_eq
+
+
+@pytest.fixture
+def df():
+    df = cudf.DataFrame()
+    np.random.seed(0)
+
+    arr = np.random.randint(2, size=10, dtype=np.int64)
+    df["foo"] = arr
+    df["bar"] = cudf.Series([pd.Timestamp(x) for x in arr])
+
+    return df
+
+
+@pytest.fixture(params=["foo", "bar"])
+def series_test_vals(request, df):
+    actual = cudf.unique(df[request.param])
+    expected = pd.unique(df[request.param].to_pandas())
+    return actual, expected
+
+
+def test_unique_series_obj(series_test_vals):
+    actual, expected = series_test_vals
+
+    assert isinstance(expected, np.ndarray)
+    assert isinstance(actual, cudf.Series)
+    assert_eq(actual, pd.Series(expected, name=actual.name))
+
+
+@pytest.mark.parametrize(
+    "index",
+    [
+        (cudf.Index, pd.Index),
+        (cudf.MultiIndex, pd.MultiIndex),
+        (cudf.DatetimeIndex, pd.DatetimeIndex),
+        (cudf.CategoricalIndex, pd.CategoricalIndex),
+    ],
+)
+@pytest.mark.parametrize("col", ["foo", "bar"])
+def test_unique_index_obj(index, col, df):
+    if index[0] == cudf.MultiIndex:
+        df.index = cudf.MultiIndex.from_arrays([df[col], df[col]])
+    else:
+        df.index = index[0](df[col])
+    actual = cudf.unique(df.index)
+    expected = pd.unique(df.index.to_pandas())
+
+    isinstance(expected, np.ndarray)
+    assert isinstance(actual, index[0])
+
+    if index[0] == cudf.MultiIndex:
+        expect = index[1].from_arrays(
+            [
+                [x[0] for x in expected],
+                [x[1] for x in expected],
+            ],
+            names=actual.names,
+        )
+        assert_eq(actual, expect)
+    else:
+        assert_eq(actual, index[1](expected, name=actual.name))
+
+
+def test_unique_cupy_ndarray(df):
+    arr = np.asarray(df["foo"].to_pandas())
+    garr = cp.asarray(df["foo"])
+
+    expected = pd.unique(arr)
+    actual = cudf.unique(garr)
+
+    isinstance(expected, np.ndarray)
+    isinstance(actual, cp.ndarray)
+    assert_eq(actual, expected)
+
+
+@pytest.mark.parametrize(
+    "data",
+    [
+        ["abc", "def", "abc", "a", "def", None],
+        [10, 20, 100, -10, 0, 1, None, 10, 100],
+    ],
+)
+def test_category_dtype_unique(data):
+    gs = cudf.Series(data, dtype="category")
+    ps = gs.to_pandas()
+
+    actual = cudf.unique(gs)
+    expected = pd.unique(ps)
+
+    assert isinstance(expected, pd.Categorical)
+    assert isinstance(actual, cudf.Series)
+    assert_eq(actual, pd.Series(expected))
+
+
+def test_unique_fails_value_error(df):
+    with pytest.raises(
+        ValueError,
+        match="Must pass cudf.Series, cudf.Index, or cupy.ndarray object",
+    ):
+        cudf.unique(df)
+
+
+def test_unique_fails_not_implemented_error(df):
+    with cudf.option_context("mode.pandas_compatible", True):
+        with pytest.raises(
+            NotImplementedError, match="cudf.Categorical is not implemented"
+        ):
+            cudf.unique(cudf.Series(["foo", "foo"], dtype="category"))

From e16c2f2493d316259dc2472b448e61b6e717b7dd Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 16 Aug 2024 07:17:40 -1000
Subject: [PATCH 678/842] Make (Indexed)Frame.__init__ require data (and index)
 (#16430)

This PR makes `data` and `Index` required arguments of `Frame` and `IndexedFrame` where relevant so we can gradually move towards ensuring `data` and `index` are valid mapping of columns and a cudf Index respectively

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16430
---
 python/cudf/cudf/core/dataframe.py     |  2 +-
 python/cudf/cudf/core/frame.py         |  8 ++------
 python/cudf/cudf/core/indexed_frame.py | 16 +++++++++-------
 python/cudf/cudf/core/reshape.py       |  2 +-
 python/cudf/cudf/core/series.py        |  2 +-
 5 files changed, 14 insertions(+), 16 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index f935217f4f9..3d805881c5a 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -697,7 +697,7 @@ def __init__(
     ):
         if copy is not None:
             raise NotImplementedError("copy is not currently implemented.")
-        super().__init__()
+        super().__init__({}, index=cudf.Index([]))
         if nan_as_null is no_default:
             nan_as_null = not cudf.get_option("mode.pandas_compatible")
 
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 32c313e42d3..ce23d671a6c 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -53,14 +53,10 @@ class Frame(BinaryOperand, Scannable):
         A Frame representing the (optional) index columns.
     """
 
-    _data: "ColumnAccessor"
-
     _VALID_BINARY_OPERATIONS = BinaryOperand._SUPPORTED_BINARY_OPERATIONS
 
-    def __init__(self, data=None):
-        if data is None:
-            data = {}
-        self._data = cudf.core.column_accessor.ColumnAccessor(data)
+    def __init__(self, data: ColumnAccessor | MutableMapping[Any, ColumnBase]):
+        self._data = ColumnAccessor(data)
 
     @property
     def _num_columns(self) -> int:
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index ae7369c80d1..8eb6de79bce 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -265,7 +265,6 @@ class IndexedFrame(Frame):
     # mypy can't handle bound type variables as class members
     _loc_indexer_type: type[_LocIndexerClass]  # type: ignore
     _iloc_indexer_type: type[_IlocIndexerClass]  # type: ignore
-    _index: cudf.core.index.BaseIndex
     _groupby = GroupBy
     _resampler = _Resampler
 
@@ -284,18 +283,21 @@ class IndexedFrame(Frame):
         "cummax": {"op_name": "cumulative max"},
     }
 
-    def __init__(self, data=None, index=None):
+    def __init__(
+        self,
+        data: ColumnAccessor | MutableMapping[Any, ColumnBase],
+        index: BaseIndex,
+    ):
         super().__init__(data=data)
-        # TODO: Right now it is possible to initialize an IndexedFrame without
-        # an index. The code's correctness relies on the subclass constructors
-        # assigning the attribute after the fact. We should restructure those
-        # to ensure that this constructor is always invoked with an index.
+        if not isinstance(index, cudf.core._base_index.BaseIndex):
+            raise ValueError(
+                f"index must be a cudf index not {type(index).__name__}"
+            )
         self._index = index
 
     @property
     def _num_rows(self) -> int:
         # Important to use the index because the data may be empty.
-        # TODO: Remove once DataFrame.__init__ is cleaned up
         return len(self.index)
 
     @property
diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py
index df471692702..703a239bea2 100644
--- a/python/cudf/cudf/core/reshape.py
+++ b/python/cudf/cudf/core/reshape.py
@@ -490,7 +490,7 @@ def concat(
         elif len(objs) == 1:
             obj = objs[0]
             result = cudf.DataFrame._from_data(
-                data=None if join == "inner" else obj._data.copy(deep=True),
+                data={} if join == "inner" else obj._data.copy(deep=True),
                 index=cudf.RangeIndex(len(obj))
                 if ignore_index
                 else obj.index.copy(deep=True),
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 2fb4fde6552..4be10752651 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -518,7 +518,7 @@ def from_categorical(cls, categorical, codes=None):
 
     @classmethod
     @_performance_tracking
-    def from_arrow(cls, array: pa.Array):
+    def from_arrow(cls, array: pa.Array) -> Self:
         """Create from PyArrow Array/ChunkedArray.
 
         Parameters

From 30011c58ed2444f0a6ba9f80c17766e591a610a1 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 16 Aug 2024 07:19:54 -1000
Subject: [PATCH 679/842] Clean up reshaping ops (#16553)

Uses some more "idiomatic" cudf patterns such as

* Checking `isinstance(column.dtype, ...)` instead of `isinstance(column, ...)` (to avoid importing the column objects)
* Using `DataFrame._from_data(dict)` instead of creating an empty `DataFrame` and adding columns one by one

Also avoids some column materialization in `DataFrame.columns = `:

* For `RangeIndex`, avoid materializing to a column to get a distinct count
* For `MultiIndex`, avoid creating a `cudf.MultiIndex` with columns as it's converted to a CPU object to get column labels for the `ColumnAccessor`

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16553
---
 python/cudf/cudf/core/dataframe.py |   8 +-
 python/cudf/cudf/core/reshape.py   | 141 ++++++++++++++++-------------
 2 files changed, 82 insertions(+), 67 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 3d805881c5a..6ee3d69441f 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -2654,8 +2654,12 @@ def columns(self, columns):
         elif isinstance(columns, (cudf.BaseIndex, ColumnBase, Series)):
             level_names = (getattr(columns, "name", None),)
             rangeindex = isinstance(columns, cudf.RangeIndex)
-            columns = as_column(columns)
-            if columns.distinct_count(dropna=False) != len(columns):
+            if rangeindex:
+                unique_count = len(columns)
+            else:
+                columns = as_column(columns)
+                unique_count = columns.distinct_count(dropna=False)
+            if unique_count != len(columns):
                 raise ValueError("Duplicate column names are not allowed")
             pd_columns = pd.Index(columns.to_pandas())
             label_dtype = pd_columns.dtype
diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py
index 703a239bea2..3d205957126 100644
--- a/python/cudf/cudf/core/reshape.py
+++ b/python/cudf/cudf/core/reshape.py
@@ -3,7 +3,7 @@
 
 import itertools
 import warnings
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Literal
 
 import numpy as np
 import pandas as pd
@@ -14,7 +14,7 @@
 from cudf.api.extensions import no_default
 from cudf.core._compat import PANDAS_LT_300
 from cudf.core.column import ColumnBase, as_column, column_empty_like
-from cudf.core.column.categorical import CategoricalColumn
+from cudf.core.column_accessor import ColumnAccessor
 from cudf.utils.dtypes import min_unsigned_type
 
 if TYPE_CHECKING:
@@ -101,7 +101,9 @@ def _get_combined_index(indexes, intersect: bool = False, sort=None):
     return index
 
 
-def _normalize_series_and_dataframe(objs, axis):
+def _normalize_series_and_dataframe(
+    objs: list[cudf.Series | cudf.DataFrame], axis: Literal[0, 1]
+) -> None:
     """Convert any cudf.Series objects in objs to DataFrames in place."""
     # Default to naming series by a numerical id if they are not named.
     sr_name = 0
@@ -335,7 +337,7 @@ def concat(
                     result = obj.to_frame()
                 else:
                     result = obj.copy(deep=True)
-                result.columns = pd.RangeIndex(len(result._data))
+                result.columns = cudf.RangeIndex(len(result._data))
             else:
                 result = type(obj)._from_data(
                     data=obj._data.copy(deep=True),
@@ -350,7 +352,7 @@ def concat(
                 result = obj.copy(deep=True)
             if keys_objs is not None and isinstance(result, cudf.DataFrame):
                 k = keys_objs[0]
-                result.columns = cudf.MultiIndex.from_tuples(
+                result.columns = pd.MultiIndex.from_tuples(
                     [
                         (k, *c) if isinstance(c, tuple) else (k, c)
                         for c in result._column_names
@@ -369,7 +371,6 @@ def concat(
             raise TypeError(
                 "Can only concatenate Series and DataFrame objects when axis=1"
             )
-        df = cudf.DataFrame()
         _normalize_series_and_dataframe(objs, axis=axis)
 
         any_empty = any(obj.empty for obj in objs)
@@ -393,18 +394,23 @@ def concat(
         objs = [obj for obj in objs if obj.shape != (0, 0)]
 
         if len(objs) == 0:
-            return df
+            # TODO: https://github.com/rapidsai/cudf/issues/16550
+            return cudf.DataFrame()
 
         # Don't need to align indices of all `objs` since we
         # would anyway return an empty dataframe below
         if not empty_inner:
             objs = _align_objs(objs, how=join, sort=sort)
-            df.index = objs[0].index
+            result_index = objs[0].index
+        else:
+            result_index = None
 
+        result_data = {}
+        result_columns = None
         if keys_objs is None:
             for o in objs:
                 for name, col in o._data.items():
-                    if name in df._data:
+                    if name in result_data:
                         raise NotImplementedError(
                             f"A Column with duplicate name found: {name}, cuDF "
                             f"doesn't support having multiple columns with "
@@ -414,11 +420,11 @@ def concat(
                         # if join is inner and it contains an empty df
                         # we return an empty df, hence creating an empty
                         # column with dtype metadata retained.
-                        df[name] = cudf.core.column.column_empty_like(
+                        result_data[name] = cudf.core.column.column_empty_like(
                             col, newsize=0
                         )
                     else:
-                        df[name] = col
+                        result_data[name] = col
 
             result_columns = (
                 objs[0]
@@ -451,21 +457,21 @@ def concat(
                     else:
                         col_label = (k, name)
                     if empty_inner:
-                        df[col_label] = cudf.core.column.column_empty_like(
-                            col, newsize=0
+                        result_data[col_label] = (
+                            cudf.core.column.column_empty_like(col, newsize=0)
                         )
                     else:
-                        df[col_label] = col
+                        result_data[col_label] = col
 
-        if keys_objs is None:
-            df.columns = result_columns.unique()
-            if ignore_index:
-                df.columns = cudf.RangeIndex(len(result_columns.unique()))
-        elif ignore_index:
-            # with ignore_index the column names change to numbers
-            df.columns = cudf.RangeIndex(len(result_columns))
+        df = cudf.DataFrame._from_data(
+            ColumnAccessor(result_data, verify=False), index=result_index
+        )
+        if ignore_index:
+            df.columns = cudf.RangeIndex(df._num_columns)
+        elif result_columns is not None:
+            df.columns = result_columns
         elif not only_series:
-            df.columns = cudf.MultiIndex.from_tuples(df._column_names)
+            df.columns = pd.MultiIndex.from_tuples(df._column_names)
 
         if empty_inner:
             # if join is inner and it contains an empty df
@@ -486,6 +492,7 @@ def concat(
         if len(objs) == 0:
             # If objs is empty, that indicates all of
             # objs are empty dataframes.
+            # TODO: https://github.com/rapidsai/cudf/issues/16550
             return cudf.DataFrame()
         elif len(objs) == 1:
             obj = objs[0]
@@ -519,7 +526,7 @@ def concat(
     elif typ is cudf.MultiIndex:
         return cudf.MultiIndex._concat(objs)
     elif issubclass(typ, cudf.Index):
-        return cudf.core.index.Index._concat(objs)
+        return cudf.Index._concat(objs)
     else:
         raise TypeError(f"cannot concatenate object of type {typ}")
 
@@ -632,18 +639,19 @@ def melt(
         value_vars = [c for c in frame._column_names if c not in unique_id]
 
     # Error for unimplemented support for datatype
-    dtypes = [frame[col].dtype for col in id_vars + value_vars]
-    if any(isinstance(typ, cudf.CategoricalDtype) for typ in dtypes):
+    if any(
+        isinstance(frame[col].dtype, cudf.CategoricalDtype)
+        for col in id_vars + value_vars
+    ):
         raise NotImplementedError(
             "Categorical columns are not yet supported for function"
         )
 
     # Check dtype homogeneity in value_var
     # Because heterogeneous concat is unimplemented
-    dtypes = [frame[col].dtype for col in value_vars]
-    if len(dtypes) > 0:
-        dtype = dtypes[0]
-        if any(t != dtype for t in dtypes):
+    if len(value_vars) > 1:
+        dtype = frame[value_vars[0]].dtype
+        if any(frame[col].dtype != dtype for col in value_vars):
             raise ValueError("all cols in value_vars must have the same dtype")
 
     # overlap
@@ -969,37 +977,39 @@ def _pivot(df, index, columns):
     index_labels, index_idx = index._encode()
     column_labels = columns_labels.to_pandas().to_flat_index()
 
-    def as_tuple(x):
-        return x if isinstance(x, tuple) else (x,)
-
     result = {}
-    for v in df:
-        names = [as_tuple(v) + as_tuple(name) for name in column_labels]
+    if len(index_labels) != 0 and len(columns_labels) != 0:
+
+        def as_tuple(x):
+            return x if isinstance(x, tuple) else (x,)
+
         nrows = len(index_labels)
-        ncols = len(names)
-        num_elements = nrows * ncols
-        if num_elements > 0:
-            col = df._data[v]
+        for col_label, col in df._data.items():
+            names = [
+                as_tuple(col_label) + as_tuple(name) for name in column_labels
+            ]
+            new_size = nrows * len(names)
             scatter_map = (columns_idx * np.int32(nrows)) + index_idx
-            target = cudf.DataFrame._from_data(
-                {
-                    None: cudf.core.column.column_empty_like(
-                        col, masked=True, newsize=nrows * ncols
-                    )
-                }
+            target_col = cudf.core.column.column_empty_like(
+                col, masked=True, newsize=new_size
             )
-            target._data[None][scatter_map] = col
-            result_frames = target._split(range(nrows, nrows * ncols, nrows))
+            target_col[scatter_map] = col
+            target = cudf.Index._from_column(target_col)
             result.update(
                 {
-                    name: next(iter(f._columns))
-                    for name, f in zip(names, result_frames)
+                    name: idx._column
+                    for name, idx in zip(
+                        names, target._split(range(nrows, new_size, nrows))
+                    )
                 }
             )
 
     # the result of pivot always has a multicolumn
-    ca = cudf.core.column_accessor.ColumnAccessor(
-        result, multiindex=True, level_names=(None,) + columns._data.names
+    ca = ColumnAccessor(
+        result,
+        multiindex=True,
+        level_names=(None,) + columns._data.names,
+        verify=False,
     )
     return cudf.DataFrame._from_data(
         ca, index=cudf.Index(index_labels, name=index.name)
@@ -1070,19 +1080,20 @@ def pivot(data, columns=None, index=no_default, values=no_default):
     if index is no_default:
         index = df.index
     else:
-        index = cudf.core.index.Index(df.loc[:, index])
+        index = cudf.Index(df.loc[:, index])
     columns = cudf.Index(df.loc[:, columns])
 
     # Create a DataFrame composed of columns from both
     # columns and index
-    columns_index = {}
-    columns_index = {
-        i: col
-        for i, col in enumerate(
-            itertools.chain(index._data.columns, columns._data.columns)
-        )
-    }
-    columns_index = cudf.DataFrame(columns_index)
+    ca = ColumnAccessor(
+        dict(
+            enumerate(
+                itertools.chain(index._data.columns, columns._data.columns)
+            )
+        ),
+        verify=False,
+    )
+    columns_index = cudf.DataFrame._from_data(ca)
 
     # Check that each row is unique:
     if len(columns_index) != len(columns_index.drop_duplicates()):
@@ -1225,13 +1236,13 @@ def unstack(df, level, fill_value=None, sort: bool = True):
     return result
 
 
-def _get_unique(column, dummy_na):
+def _get_unique(column: ColumnBase, dummy_na: bool) -> ColumnBase:
     """
     Returns unique values in a column, if
     dummy_na is False, nan's are also dropped.
     """
-    if isinstance(column, cudf.core.column.CategoricalColumn):
-        unique = column.categories
+    if isinstance(column.dtype, cudf.CategoricalDtype):
+        unique = column.categories  # type: ignore[attr-defined]
     else:
         unique = column.unique().sort_values()
     if not dummy_na:
@@ -1251,11 +1262,11 @@ def _one_hot_encode_column(
     `prefix`, separated with category name with `prefix_sep`. The encoding
     columns maybe coerced into `dtype`.
     """
-    if isinstance(column, CategoricalColumn):
+    if isinstance(column.dtype, cudf.CategoricalDtype):
         if column.size == column.null_count:
             column = column_empty_like(categories, newsize=column.size)
         else:
-            column = column._get_decategorized_column()
+            column = column._get_decategorized_column()  # type: ignore[attr-defined]
 
     if column.size * categories.size >= np.iinfo(size_type_dtype).max:
         raise ValueError(
@@ -1536,7 +1547,7 @@ def pivot_table(
         table_columns = tuple(
             map(lambda column: column[1:], table._data.names)
         )
-        table.columns = cudf.MultiIndex.from_tuples(
+        table.columns = pd.MultiIndex.from_tuples(
             tuples=table_columns, names=column_names
         )
 

From bc8ca9befdd77d3f4a270a64dae178b2ef355181 Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Fri, 16 Aug 2024 12:02:21 -0700
Subject: [PATCH 680/842] Setup pylibcudf package (#16299)

Migrates cudf._lib.pylibcudf to a new pylibcudf package

Authors:
  - Thomas Li (https://github.com/lithomas1)
  - Matthew Murray (https://github.com/Matt711)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/cudf/pull/16299
---
 .github/labeler.yml                           |   2 +-
 .github/workflows/pr.yaml                     |  12 +-
 build.sh                                      |  15 +-
 ci/build_docs.sh                              |   2 +-
 ci/build_python.sh                            |   7 +
 ci/build_wheel_cudf.sh                        |   8 +-
 ci/build_wheel_pylibcudf.sh                   |  16 ++
 ci/cudf_pandas_scripts/pandas-tests/run.sh    |   2 +
 ci/cudf_pandas_scripts/run_tests.sh           |   2 +
 ci/test_python_cudf.sh                        |   2 +-
 ci/test_wheel_cudf.sh                         |  10 +-
 ci/test_wheel_cudf_polars.sh                  |   6 +-
 ci/test_wheel_dask_cudf.sh                    |   5 +-
 .../all_cuda-118_arch-x86_64.yaml             |   2 +
 .../all_cuda-125_arch-x86_64.yaml             |   2 +
 conda/recipes/cudf/meta.yaml                  |   2 +
 conda/recipes/cudf_kafka/meta.yaml            |   4 +-
 conda/recipes/pylibcudf/build.sh              |   4 +
 .../recipes/pylibcudf/conda_build_config.yaml |  20 +++
 conda/recipes/pylibcudf/meta.yaml             | 108 ++++++++++++
 dependencies.yaml                             | 165 +++++++++++++++++-
 .../api_docs/pylibcudf/aggregation.rst        |   2 +-
 .../api_docs/pylibcudf/binaryop.rst           |   2 +-
 .../user_guide/api_docs/pylibcudf/column.rst  |   2 +-
 .../api_docs/pylibcudf/column_factories.rst   |   2 +-
 .../api_docs/pylibcudf/concatenate.rst        |   2 +-
 .../user_guide/api_docs/pylibcudf/copying.rst |   2 +-
 .../api_docs/pylibcudf/datetime.rst           |   2 +-
 .../api_docs/pylibcudf/expressions.rst        |   2 +-
 .../user_guide/api_docs/pylibcudf/filling.rst |   2 +-
 .../api_docs/pylibcudf/gpumemoryview.rst      |   2 +-
 .../user_guide/api_docs/pylibcudf/groupby.rst |   2 +-
 .../user_guide/api_docs/pylibcudf/interop.rst |   2 +-
 .../user_guide/api_docs/pylibcudf/io/avro.rst |   2 +-
 .../user_guide/api_docs/pylibcudf/io/csv.rst  |   2 +-
 .../api_docs/pylibcudf/io/index.rst           |   2 +-
 .../user_guide/api_docs/pylibcudf/io/json.rst |   2 +-
 .../api_docs/pylibcudf/io/parquet.rst         |   2 +-
 .../user_guide/api_docs/pylibcudf/join.rst    |   2 +-
 .../user_guide/api_docs/pylibcudf/lists.rst   |   2 +-
 .../user_guide/api_docs/pylibcudf/merge.rst   |   2 +-
 .../api_docs/pylibcudf/quantiles.rst          |   2 +-
 .../user_guide/api_docs/pylibcudf/reduce.rst  |   2 +-
 .../user_guide/api_docs/pylibcudf/replace.rst |   2 +-
 .../user_guide/api_docs/pylibcudf/reshape.rst |   2 +-
 .../user_guide/api_docs/pylibcudf/rolling.rst |   2 +-
 .../user_guide/api_docs/pylibcudf/round.rst   |   2 +-
 .../user_guide/api_docs/pylibcudf/scalar.rst  |   2 +-
 .../user_guide/api_docs/pylibcudf/search.rst  |   2 +-
 .../user_guide/api_docs/pylibcudf/sorting.rst |   2 +-
 .../api_docs/pylibcudf/stream_compaction.rst  |   2 +-
 .../api_docs/pylibcudf/strings/capitalize.rst |   2 +-
 .../api_docs/pylibcudf/strings/char_types.rst |   2 +-
 .../api_docs/pylibcudf/strings/contains.rst   |   2 +-
 .../api_docs/pylibcudf/strings/find.rst       |   2 +-
 .../pylibcudf/strings/regex_flags.rst         |   2 +-
 .../pylibcudf/strings/regex_program.rst       |   2 +-
 .../api_docs/pylibcudf/strings/replace.rst    |   2 +-
 .../api_docs/pylibcudf/strings/slice.rst      |   2 +-
 .../user_guide/api_docs/pylibcudf/table.rst   |   2 +-
 .../user_guide/api_docs/pylibcudf/traits.rst  |   2 +-
 .../api_docs/pylibcudf/transform.rst          |   2 +-
 .../user_guide/api_docs/pylibcudf/types.rst   |   2 +-
 .../user_guide/api_docs/pylibcudf/unary.rst   |   2 +-
 python/cudf/CMakeLists.txt                    |   4 +-
 python/cudf/cudf/_lib/CMakeLists.txt          |   1 -
 python/cudf/cudf/_lib/__init__.py             |   1 -
 python/cudf/cudf/_lib/aggregation.pyx         |   3 +-
 python/cudf/cudf/_lib/avro.pyx                |   4 +-
 python/cudf/cudf/_lib/binaryop.pyx            |   3 +-
 python/cudf/cudf/_lib/column.pxd              |   9 +-
 python/cudf/cudf/_lib/column.pyx              |  20 +--
 python/cudf/cudf/_lib/concat.pyx              |   3 +-
 python/cudf/cudf/_lib/copying.pxd             |   2 +-
 python/cudf/cudf/_lib/copying.pyx             |  20 +--
 python/cudf/cudf/_lib/csv.pyx                 |  21 ++-
 python/cudf/cudf/_lib/datetime.pyx            |  13 +-
 python/cudf/cudf/_lib/filling.pyx             |   4 +-
 python/cudf/cudf/_lib/groupby.pyx             |   7 +-
 python/cudf/cudf/_lib/hash.pyx                |  15 +-
 python/cudf/cudf/_lib/interop.pyx             |   9 +-
 python/cudf/cudf/_lib/io/utils.pxd            |   7 +-
 python/cudf/cudf/_lib/io/utils.pyx            |  11 +-
 python/cudf/cudf/_lib/join.pyx                |   2 +-
 python/cudf/cudf/_lib/json.pyx                |  15 +-
 python/cudf/cudf/_lib/labeling.pyx            |  10 +-
 python/cudf/cudf/_lib/lists.pyx               |   7 +-
 python/cudf/cudf/_lib/merge.pyx               |   2 +-
 python/cudf/cudf/_lib/null_mask.pyx           |  11 +-
 .../cudf/_lib/nvtext/byte_pair_encode.pyx     |  11 +-
 .../cudf/cudf/_lib/nvtext/edit_distance.pyx   |   9 +-
 .../cudf/cudf/_lib/nvtext/generate_ngrams.pyx |  13 +-
 python/cudf/cudf/_lib/nvtext/jaccard.pyx      |  11 +-
 python/cudf/cudf/_lib/nvtext/minhash.pyx      |  11 +-
 .../cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx |  13 +-
 python/cudf/cudf/_lib/nvtext/normalize.pyx    |   9 +-
 python/cudf/cudf/_lib/nvtext/replace.pyx      |  13 +-
 python/cudf/cudf/_lib/nvtext/stemmer.pyx      |  11 +-
 .../cudf/_lib/nvtext/subword_tokenize.pyx     |   7 +-
 python/cudf/cudf/_lib/nvtext/tokenize.pyx     |  13 +-
 python/cudf/cudf/_lib/orc.pyx                 |  33 ++--
 python/cudf/cudf/_lib/parquet.pyx             |  39 +++--
 python/cudf/cudf/_lib/partitioning.pyx        |  13 +-
 python/cudf/cudf/_lib/pylibcudf/io/avro.pxd   |  12 --
 .../pylibcudf/libcudf/strings/extract.pxd     |  15 --
 .../_lib/pylibcudf/strings/char_types.pxd     |   5 -
 .../cudf/_lib/pylibcudf/strings/contains.pxd  |   7 -
 .../_lib/pylibcudf/strings/regex_flags.pxd    |   2 -
 python/cudf/cudf/_lib/quantiles.pyx           |   5 +-
 python/cudf/cudf/_lib/reduce.pyx              |   3 +-
 python/cudf/cudf/_lib/replace.pyx             |   3 +-
 python/cudf/cudf/_lib/reshape.pyx             |   5 +-
 python/cudf/cudf/_lib/rolling.pyx             |   3 +-
 python/cudf/cudf/_lib/round.pyx               |   4 +-
 python/cudf/cudf/_lib/scalar.pxd              |   3 +-
 python/cudf/cudf/_lib/scalar.pyx              |  14 +-
 python/cudf/cudf/_lib/search.pyx              |   2 +-
 python/cudf/cudf/_lib/sort.pyx                |  13 +-
 python/cudf/cudf/_lib/stream_compaction.pyx   |   2 +-
 python/cudf/cudf/_lib/string_casting.pyx      |  21 +--
 python/cudf/cudf/_lib/strings/attributes.pyx  |   9 +-
 python/cudf/cudf/_lib/strings/capitalize.pyx  |   2 +-
 python/cudf/cudf/_lib/strings/case.pyx        |   2 +-
 python/cudf/cudf/_lib/strings/char_types.pyx  |  11 +-
 python/cudf/cudf/_lib/strings/combine.pyx     |  13 +-
 python/cudf/cudf/_lib/strings/contains.pyx    |  19 +-
 .../strings/convert/convert_fixed_point.pyx   |  11 +-
 .../_lib/strings/convert/convert_floats.pyx   |   9 +-
 .../_lib/strings/convert/convert_integers.pyx |   9 +-
 .../_lib/strings/convert/convert_lists.pyx    |  11 +-
 .../_lib/strings/convert/convert_urls.pyx     |   9 +-
 python/cudf/cudf/_lib/strings/extract.pyx     |  11 +-
 python/cudf/cudf/_lib/strings/find.pyx        |   6 +-
 .../cudf/cudf/_lib/strings/find_multiple.pyx  |   9 +-
 python/cudf/cudf/_lib/strings/findall.pyx     |  11 +-
 python/cudf/cudf/_lib/strings/json.pyx        |  11 +-
 python/cudf/cudf/_lib/strings/padding.pyx     |  11 +-
 python/cudf/cudf/_lib/strings/repeat.pyx      |   9 +-
 python/cudf/cudf/_lib/strings/replace.pyx     |   5 +-
 python/cudf/cudf/_lib/strings/replace_re.pyx  |  17 +-
 .../cudf/_lib/strings/split/partition.pyx     |  11 +-
 python/cudf/cudf/_lib/strings/split/split.pyx |  19 +-
 python/cudf/cudf/_lib/strings/strip.pyx       |  11 +-
 python/cudf/cudf/_lib/strings/substring.pyx   |   2 +-
 python/cudf/cudf/_lib/strings/translate.pyx   |  13 +-
 python/cudf/cudf/_lib/strings/wrap.pyx        |   9 +-
 python/cudf/cudf/_lib/strings_udf.pyx         |  12 +-
 python/cudf/cudf/_lib/text.pyx                |   7 +-
 python/cudf/cudf/_lib/timezone.pyx            |   5 +-
 python/cudf/cudf/_lib/transform.pyx           |  24 +--
 python/cudf/cudf/_lib/transpose.pyx           |   7 +-
 python/cudf/cudf/_lib/types.pxd               |   8 +-
 python/cudf/cudf/_lib/types.pyx               |  12 +-
 python/cudf/cudf/_lib/unary.pyx               |   3 +-
 python/cudf/cudf/_lib/utils.pxd               |   4 +-
 python/cudf/cudf/_lib/utils.pyx               |   9 +-
 .../cudf/cudf/core/_internals/expressions.py  |   4 +-
 python/cudf/cudf/core/buffer/buffer.py        |   3 +-
 python/cudf/cudf/core/column/numerical.py     |   3 +-
 python/cudf/cudf/core/indexed_frame.py        |   4 +-
 python/cudf/cudf/pandas/__init__.py           |   3 +-
 python/cudf/pyproject.toml                    |   3 +
 .../cudf_kafka/cudf_kafka/_lib/CMakeLists.txt |   4 +-
 python/cudf_kafka/cudf_kafka/_lib/kafka.pxd   |   5 +-
 python/cudf_kafka/cudf_kafka/_lib/kafka.pyx   |   3 +-
 .../cudf_polars/containers/column.py          |   2 +-
 .../cudf_polars/containers/dataframe.py       |   3 +-
 python/cudf_polars/cudf_polars/dsl/expr.py    |   3 +-
 python/cudf_polars/cudf_polars/dsl/ir.py      |   3 +-
 .../cudf_polars/cudf_polars/dsl/translate.py  |   3 +-
 .../cudf_polars/typing/__init__.py            |   4 +-
 .../cudf_polars/cudf_polars/utils/dtypes.py   |   3 +-
 .../cudf_polars/cudf_polars/utils/sorting.py  |   2 +-
 python/cudf_polars/pyproject.toml             |   2 +-
 .../tests/containers/test_column.py           |   3 +-
 .../tests/containers/test_dataframe.py        |   3 +-
 python/cudf_polars/tests/dsl/test_expr.py     |   3 +-
 .../tests/expressions/test_literal.py         |   3 +-
 .../tests/expressions/test_sort.py            |   3 +-
 .../cudf_polars/tests/utils/test_broadcast.py |   3 +-
 python/pylibcudf/CMakeLists.txt               | 100 +++++++++++
 python/pylibcudf/README.md                    |   1 +
 .../cmake/Modules/LinkPyarrowHeaders.cmake    |   0
 .../cmake/Modules/WheelHelpers.cmake          |   0
 .../pylibcudf/CMakeLists.txt                  |   2 +-
 python/pylibcudf/pylibcudf/VERSION            |   1 +
 .../_lib => pylibcudf}/pylibcudf/__init__.pxd |   0
 .../_lib => pylibcudf}/pylibcudf/__init__.py  |   1 +
 python/pylibcudf/pylibcudf/_version.py        |  24 +++
 .../pylibcudf/aggregation.pxd                 |   5 +-
 .../pylibcudf/aggregation.pyx                 |  21 +--
 .../_lib => pylibcudf}/pylibcudf/binaryop.pxd |   3 +-
 .../_lib => pylibcudf}/pylibcudf/binaryop.pyx |  15 +-
 .../_lib => pylibcudf}/pylibcudf/column.pxd   |  11 +-
 .../_lib => pylibcudf}/pylibcudf/column.pyx   |  11 +-
 .../pylibcudf/column_factories.pxd            |   3 +-
 .../pylibcudf/column_factories.pyx            |   7 +-
 .../pylibcudf/concatenate.pxd                 |   0
 .../pylibcudf/concatenate.pyx                 |  11 +-
 .../_lib => pylibcudf}/pylibcudf/copying.pxd  |   5 +-
 .../_lib => pylibcudf}/pylibcudf/copying.pyx  |  21 ++-
 .../_lib => pylibcudf}/pylibcudf/datetime.pxd |   0
 .../_lib => pylibcudf}/pylibcudf/datetime.pyx |   7 +-
 .../pylibcudf/exception_handler.pxd           |   0
 .../pylibcudf/experimental.pxd                |   0
 .../pylibcudf/experimental.pyx                |   3 +-
 .../pylibcudf/expressions.pxd                 |   3 +-
 .../pylibcudf/expressions.pyx                 |  15 +-
 .../_lib => pylibcudf}/pylibcudf/filling.pxd  |   2 +-
 .../_lib => pylibcudf}/pylibcudf/filling.pyx  |   9 +-
 .../pylibcudf/gpumemoryview.pxd               |   0
 .../pylibcudf/gpumemoryview.pyx               |   0
 .../_lib => pylibcudf}/pylibcudf/groupby.pxd  |   9 +-
 .../_lib => pylibcudf}/pylibcudf/groupby.pyx  |  15 +-
 .../_lib => pylibcudf}/pylibcudf/interop.pyx  |   6 +-
 .../pylibcudf/io/CMakeLists.txt               |   0
 .../pylibcudf/io/__init__.pxd                 |   0
 .../pylibcudf/io/__init__.py                  |   0
 python/pylibcudf/pylibcudf/io/avro.pxd        |  12 ++
 .../_lib => pylibcudf}/pylibcudf/io/avro.pyx  |   7 +-
 .../_lib => pylibcudf}/pylibcudf/io/csv.pyx   |  11 +-
 .../pylibcudf/io/datasource.pxd               |   5 +-
 .../pylibcudf/io/datasource.pyx               |   5 +-
 .../_lib => pylibcudf}/pylibcudf/io/json.pxd  |   7 +-
 .../_lib => pylibcudf}/pylibcudf/io/json.pyx  |  19 +-
 .../pylibcudf/io/parquet.pxd                  |  11 +-
 .../pylibcudf/io/parquet.pyx                  |  15 +-
 .../_lib => pylibcudf}/pylibcudf/io/types.pxd |   7 +-
 .../_lib => pylibcudf}/pylibcudf/io/types.pyx |  13 +-
 .../_lib => pylibcudf}/pylibcudf/join.pxd     |   2 +-
 .../_lib => pylibcudf}/pylibcudf/join.pyx     |   9 +-
 .../pylibcudf/libcudf/CMakeLists.txt          |   0
 .../pylibcudf/libcudf/__init__.pxd            |   0
 .../pylibcudf/libcudf/__init__.py             |   0
 .../pylibcudf/libcudf/aggregation.pxd         |   3 +-
 .../pylibcudf/libcudf/aggregation.pyx         |   0
 .../pylibcudf/libcudf/binaryop.pxd            |  11 +-
 .../pylibcudf/libcudf/binaryop.pyx            |   0
 .../pylibcudf/libcudf/column/__init__.pxd     |   0
 .../pylibcudf/libcudf/column/__init__.py      |   0
 .../pylibcudf/libcudf/column/column.pxd       |   9 +-
 .../libcudf/column/column_factories.pxd       |  11 +-
 .../pylibcudf/libcudf/column/column_view.pxd  |   7 +-
 .../pylibcudf/libcudf/concatenate.pxd         |   7 +-
 .../pylibcudf/libcudf/contiguous_split.pxd    |   5 +-
 .../pylibcudf/libcudf/copying.pxd             |  19 +-
 .../pylibcudf/libcudf/copying.pyx             |   0
 .../pylibcudf/libcudf/datetime.pxd            |   7 +-
 .../pylibcudf/libcudf/experimental.pxd        |   0
 .../pylibcudf/libcudf/expressions.pxd         |   9 +-
 .../pylibcudf/libcudf/expressions.pyx         |   0
 .../pylibcudf/libcudf/filling.pxd             |  13 +-
 .../pylibcudf/libcudf/groupby.pxd             |  19 +-
 .../pylibcudf/libcudf/hash.pxd                |   7 +-
 .../pylibcudf/libcudf/interop.pxd             |  13 +-
 .../pylibcudf/libcudf/io/CMakeLists.txt       |   0
 .../pylibcudf/libcudf/io/__init__.pxd         |   0
 .../pylibcudf/libcudf/io/__init__.py          |   0
 .../pylibcudf/libcudf/io/arrow_io_source.pxd  |   3 +-
 .../pylibcudf/libcudf/io/avro.pxd             |   5 +-
 .../pylibcudf/libcudf/io/csv.pxd              |   7 +-
 .../pylibcudf/libcudf/io/data_sink.pxd        |   0
 .../pylibcudf/libcudf/io/datasource.pxd       |   0
 .../pylibcudf/libcudf/io/json.pxd             |   7 +-
 .../pylibcudf/libcudf/io/json.pyx             |   0
 .../pylibcudf/libcudf/io/orc.pxd              |   7 +-
 .../pylibcudf/libcudf/io/orc_metadata.pxd     |   5 +-
 .../pylibcudf/libcudf/io/parquet.pxd          |  76 ++++----
 .../pylibcudf/libcudf/io/parquet_metadata.pxd |   5 +-
 .../pylibcudf/libcudf/io/text.pxd             |   3 +-
 .../pylibcudf/libcudf/io/timezone.pxd         |   3 +-
 .../pylibcudf/libcudf/io/types.pxd            |  11 +-
 .../pylibcudf/libcudf/io/types.pyx            |   0
 .../pylibcudf/libcudf/join.pxd                |   9 +-
 .../pylibcudf/libcudf/labeling.pxd            |   5 +-
 .../pylibcudf/libcudf/lists/__init__.pxd      |   0
 .../pylibcudf/libcudf/lists/__init__.py       |   0
 .../pylibcudf/libcudf/lists/combine.pxd       |   7 +-
 .../pylibcudf/libcudf/lists/contains.pxd      |  13 +-
 .../libcudf/lists/count_elements.pxd          |   7 +-
 .../pylibcudf/libcudf/lists/explode.pxd       |   7 +-
 .../pylibcudf/libcudf/lists/extract.pxd       |   9 +-
 .../pylibcudf/libcudf/lists/filling.pxd       |   5 +-
 .../pylibcudf/libcudf/lists/gather.pxd        |   7 +-
 .../libcudf/lists/lists_column_view.pxd       |   4 +-
 .../pylibcudf/libcudf/lists/reverse.pxd       |   7 +-
 .../libcudf/lists/set_operations.pxd          |   9 +-
 .../pylibcudf/libcudf/lists/sorting.pxd       |   9 +-
 .../libcudf/lists/stream_compaction.pxd       |   9 +-
 .../pylibcudf/libcudf/merge.pxd               |   7 +-
 .../pylibcudf/libcudf/null_mask.pxd           |  11 +-
 .../pylibcudf/libcudf/nvtext/__init__.pxd     |   0
 .../pylibcudf/libcudf/nvtext/__init__.py      |   0
 .../libcudf/nvtext/byte_pair_encode.pxd       |   7 +-
 .../libcudf/nvtext/edit_distance.pxd          |   5 +-
 .../libcudf/nvtext/generate_ngrams.pxd        |   9 +-
 .../pylibcudf/libcudf/nvtext/jaccard.pxd      |   7 +-
 .../pylibcudf/libcudf/nvtext/minhash.pxd      |   7 +-
 .../libcudf/nvtext/ngrams_tokenize.pxd        |   9 +-
 .../pylibcudf/libcudf/nvtext/normalize.pxd    |   5 +-
 .../pylibcudf/libcudf/nvtext/replace.pxd      |   9 +-
 .../pylibcudf/libcudf/nvtext/stemmer.pxd      |   7 +-
 .../libcudf/nvtext/subword_tokenize.pxd       |   5 +-
 .../pylibcudf/libcudf/nvtext/tokenize.pxd     |   9 +-
 .../pylibcudf/libcudf/partitioning.pxd        |  11 +-
 .../pylibcudf/libcudf/quantiles.pxd           |  11 +-
 .../pylibcudf/libcudf/reduce.pxd              |  14 +-
 .../pylibcudf/libcudf/reduce.pyx              |   0
 .../pylibcudf/libcudf/replace.pxd             |   9 +-
 .../pylibcudf/libcudf/replace.pyx             |   0
 .../pylibcudf/libcudf/reshape.pxd             |   9 +-
 .../pylibcudf/libcudf/rolling.pxd             |  11 +-
 .../pylibcudf/libcudf/round.pxd               |   5 +-
 .../pylibcudf/libcudf/round.pyx               |   0
 .../pylibcudf/libcudf/scalar/__init__.pxd     |   0
 .../pylibcudf/libcudf/scalar/__init__.py      |   0
 .../pylibcudf/libcudf/scalar/scalar.pxd       |   9 +-
 .../libcudf/scalar/scalar_factories.pxd       |   5 +-
 .../pylibcudf/libcudf/search.pxd              |   9 +-
 .../pylibcudf/libcudf/sorting.pxd             |  15 +-
 .../pylibcudf/libcudf/stream_compaction.pxd   |  13 +-
 .../pylibcudf/libcudf/stream_compaction.pyx   |   0
 .../pylibcudf/libcudf/strings/CMakeLists.txt  |   0
 .../pylibcudf/libcudf/strings/__init__.pxd    |   0
 .../pylibcudf/libcudf/strings/__init__.py     |   0
 .../pylibcudf/libcudf/strings/attributes.pxd  |   5 +-
 .../pylibcudf/libcudf/strings/capitalize.pxd  |  11 +-
 .../pylibcudf/libcudf/strings/case.pxd        |   5 +-
 .../pylibcudf/libcudf/strings/char_types.pxd  |   7 +-
 .../pylibcudf/libcudf/strings/char_types.pyx  |   0
 .../pylibcudf/libcudf/strings/combine.pxd     |   9 +-
 .../pylibcudf/libcudf/strings/contains.pxd    |   9 +-
 .../libcudf/strings/convert/__init__.pxd      |   0
 .../libcudf/strings/convert/__init__.py       |   0
 .../strings/convert/convert_booleans.pxd      |   7 +-
 .../strings/convert/convert_datetime.pxd      |   7 +-
 .../strings/convert/convert_durations.pxd     |   7 +-
 .../strings/convert/convert_fixed_point.pxd   |   7 +-
 .../strings/convert/convert_floats.pxd        |   7 +-
 .../strings/convert/convert_integers.pxd      |   7 +-
 .../libcudf/strings/convert/convert_ipv4.pxd  |   5 +-
 .../libcudf/strings/convert/convert_lists.pxd |   7 +-
 .../libcudf/strings/convert/convert_urls.pxd  |   5 +-
 .../pylibcudf/libcudf/strings/extract.pxd     |  14 ++
 .../pylibcudf/libcudf/strings/find.pxd        |   9 +-
 .../libcudf/strings/find_multiple.pxd         |   5 +-
 .../pylibcudf/libcudf/strings/findall.pxd     |   7 +-
 .../pylibcudf/libcudf/strings/json.pxd        |   7 +-
 .../pylibcudf/libcudf/strings/padding.pxd     |  11 +-
 .../pylibcudf/libcudf/strings/regex_flags.pxd |   0
 .../pylibcudf/libcudf/strings/regex_flags.pyx |   0
 .../libcudf/strings/regex_program.pxd         |   3 +-
 .../pylibcudf/libcudf/strings/repeat.pxd      |   7 +-
 .../pylibcudf/libcudf/strings/replace.pxd     |   9 +-
 .../pylibcudf/libcudf/strings/replace_re.pxd  |  13 +-
 .../pylibcudf/libcudf/strings/side_type.pxd   |   0
 .../libcudf/strings/split/__init__.pxd        |   0
 .../libcudf/strings/split/__init__.py         |   0
 .../libcudf/strings/split/partition.pxd       |   9 +-
 .../pylibcudf/libcudf/strings/split/split.pxd |  13 +-
 .../pylibcudf/libcudf/strings/strip.pxd       |   9 +-
 .../pylibcudf/libcudf/strings/substring.pxd   |   9 +-
 .../pylibcudf/libcudf/strings/translate.pxd   |   9 +-
 .../pylibcudf/libcudf/strings/wrap.pxd        |   7 +-
 .../pylibcudf/libcudf/strings_udf.pxd         |   7 +-
 .../pylibcudf/libcudf/table/__init__.pxd      |   0
 .../pylibcudf/libcudf/table/__init__.py       |   0
 .../pylibcudf/libcudf/table/table.pxd         |  10 +-
 .../pylibcudf/libcudf/table/table_view.pxd    |   5 +-
 .../pylibcudf/libcudf/transform.pxd           |  17 +-
 .../pylibcudf/libcudf/transpose.pxd           |   5 +-
 .../pylibcudf/libcudf/types.pxd               |   0
 .../pylibcudf/libcudf/types.pyx               |   0
 .../pylibcudf/libcudf/unary.pxd               |   7 +-
 .../pylibcudf/libcudf/unary.pyx               |   0
 .../pylibcudf/libcudf/utilities/__init__.pxd  |   0
 .../pylibcudf/libcudf/utilities/__init__.py   |   0
 .../pylibcudf/libcudf/utilities/host_span.pxd |   0
 .../pylibcudf/libcudf/utilities/traits.pxd    |   3 +-
 .../libcudf/utilities/type_dispatcher.pxd     |   2 +-
 .../pylibcudf/libcudf/wrappers/__init__.pxd   |   0
 .../pylibcudf/libcudf/wrappers/__init__.py    |   0
 .../pylibcudf/libcudf/wrappers/decimals.pxd   |   3 +-
 .../pylibcudf/libcudf/wrappers/durations.pxd  |   0
 .../pylibcudf/libcudf/wrappers/timestamps.pxd |   0
 .../_lib => pylibcudf}/pylibcudf/lists.pxd    |   3 +-
 .../_lib => pylibcudf}/pylibcudf/lists.pyx    |  29 ++-
 .../_lib => pylibcudf}/pylibcudf/merge.pxd    |   0
 .../_lib => pylibcudf}/pylibcudf/merge.pyx    |   9 +-
 .../pylibcudf/quantiles.pxd                   |   3 +-
 .../pylibcudf/quantiles.pyx                   |  11 +-
 .../_lib => pylibcudf}/pylibcudf/reduce.pxd   |   2 +-
 .../_lib => pylibcudf}/pylibcudf/reduce.pyx   |  17 +-
 .../_lib => pylibcudf}/pylibcudf/replace.pxd  |   3 +-
 .../_lib => pylibcudf}/pylibcudf/replace.pyx  |   7 +-
 .../_lib => pylibcudf}/pylibcudf/reshape.pxd  |   2 +-
 .../_lib => pylibcudf}/pylibcudf/reshape.pyx  |   9 +-
 .../_lib => pylibcudf}/pylibcudf/rolling.pxd  |   2 +-
 .../_lib => pylibcudf}/pylibcudf/rolling.pyx  |   9 +-
 .../_lib => pylibcudf}/pylibcudf/round.pxd    |   3 +-
 .../_lib => pylibcudf}/pylibcudf/round.pyx    |  10 +-
 .../_lib => pylibcudf}/pylibcudf/scalar.pxd   |   3 +-
 .../_lib => pylibcudf}/pylibcudf/scalar.pyx   |   7 +-
 .../_lib => pylibcudf}/pylibcudf/search.pxd   |   0
 .../_lib => pylibcudf}/pylibcudf/search.pyx   |   7 +-
 .../_lib => pylibcudf}/pylibcudf/sorting.pxd  |  10 +-
 .../_lib => pylibcudf}/pylibcudf/sorting.pyx  |  11 +-
 .../pylibcudf/stream_compaction.pxd           |   6 +-
 .../pylibcudf/stream_compaction.pyx           |  17 +-
 .../pylibcudf/strings/CMakeLists.txt          |   0
 .../pylibcudf/strings/__init__.pxd            |   0
 .../pylibcudf/strings/__init__.py             |   0
 .../pylibcudf/strings/capitalize.pxd          |   4 +-
 .../pylibcudf/strings/capitalize.pyx          |  15 +-
 .../pylibcudf/strings/case.pxd                |   2 +-
 .../pylibcudf/strings/case.pyx                |   7 +-
 .../pylibcudf/strings/char_types.pxd          |   3 +
 .../pylibcudf/strings/char_types.pyx          |   2 +-
 .../pylibcudf/pylibcudf/strings/contains.pxd  |   7 +
 .../pylibcudf/strings/contains.pyx            |   9 +-
 .../pylibcudf/strings/find.pxd                |   6 +-
 .../pylibcudf/strings/find.pyx                |  27 ++-
 .../pylibcudf/strings/regex_flags.pxd         |   2 +
 .../pylibcudf/strings/regex_flags.pyx         |   2 +-
 .../pylibcudf/strings/regex_program.pxd       |   3 +-
 .../pylibcudf/strings/regex_program.pyx       |   8 +-
 .../pylibcudf/strings/replace.pxd             |   6 +-
 .../pylibcudf/strings/replace.pyx             |  15 +-
 .../pylibcudf/strings/slice.pxd               |   4 +-
 .../pylibcudf/strings/slice.pyx               |  21 ++-
 .../_lib => pylibcudf}/pylibcudf/table.pxd    |   5 +-
 .../_lib => pylibcudf}/pylibcudf/table.pyx    |   7 +-
 .../pylibcudf/tests}/common/utils.py          |  14 +-
 .../pylibcudf/tests}/conftest.py              |   5 +-
 .../pylibcudf/tests}/io/test_avro.py          |   3 +-
 .../pylibcudf/tests}/io/test_csv.py           |   5 +-
 .../pylibcudf/tests}/io/test_json.py          |   5 +-
 .../pylibcudf/tests}/io/test_parquet.py       |   7 +-
 .../tests}/io/test_source_sink_info.py        |   3 +-
 .../pylibcudf/tests}/pytest.ini               |   0
 .../pylibcudf/tests}/test_binaryops.py        |   3 +-
 .../pylibcudf/tests}/test_column_factories.py |   3 +-
 .../tests}/test_column_from_device.py         |   3 +-
 .../pylibcudf/tests}/test_copying.py          |   3 +-
 .../pylibcudf/tests}/test_datetime.py         |   6 +-
 .../pylibcudf/tests}/test_expressions.py      |   3 +-
 .../pylibcudf/tests}/test_interop.py          |   3 +-
 .../pylibcudf/tests}/test_join.py             |   3 +-
 .../pylibcudf/tests}/test_lists.py            |   6 +-
 .../pylibcudf/tests}/test_quantiles.py        |   3 +-
 .../pylibcudf/tests}/test_regex_program.py    |   3 +-
 .../pylibcudf/tests}/test_reshape.py          |   3 +-
 .../pylibcudf/tests}/test_round.py            |   8 +-
 .../tests}/test_string_capitalize.py          |  10 +-
 .../pylibcudf/tests}/test_string_case.py      |  10 +-
 .../pylibcudf/tests}/test_string_contains.py  |   6 +-
 .../pylibcudf/tests}/test_string_find.py      |   8 +-
 .../pylibcudf/tests}/test_string_replace.py   |   8 +-
 .../pylibcudf/tests}/test_string_slice.py     |   3 +-
 .../pylibcudf/tests}/test_table.py            |   3 +-
 .../pylibcudf/tests}/test_traits.py           |   2 +-
 .../pylibcudf/tests}/test_transform.py        |   3 +-
 .../pylibcudf/tests}/test_unary.py            |   2 +-
 .../_lib => pylibcudf}/pylibcudf/traits.pxd   |   0
 .../_lib => pylibcudf}/pylibcudf/traits.pyx   |   3 +-
 .../pylibcudf/transform.pxd                   |   0
 .../pylibcudf/transform.pyx                   |   5 +-
 .../_lib => pylibcudf}/pylibcudf/types.pxd    |   3 +-
 .../_lib => pylibcudf}/pylibcudf/types.pyx    |  27 ++-
 .../_lib => pylibcudf}/pylibcudf/unary.pxd    |   3 +-
 .../_lib => pylibcudf}/pylibcudf/unary.pyx    |   9 +-
 .../_lib => pylibcudf}/pylibcudf/utils.pxd    |   5 +-
 .../_lib => pylibcudf}/pylibcudf/utils.pyx    |   5 +-
 .../_lib => pylibcudf/pylibcudf}/variant.pxd  |   0
 python/pylibcudf/pyproject.toml               | 123 +++++++++++++
 475 files changed, 1916 insertions(+), 1522 deletions(-)
 create mode 100755 ci/build_wheel_pylibcudf.sh
 create mode 100644 conda/recipes/pylibcudf/build.sh
 create mode 100644 conda/recipes/pylibcudf/conda_build_config.yaml
 create mode 100644 conda/recipes/pylibcudf/meta.yaml
 delete mode 100644 python/cudf/cudf/_lib/pylibcudf/io/avro.pxd
 delete mode 100644 python/cudf/cudf/_lib/pylibcudf/libcudf/strings/extract.pxd
 delete mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/char_types.pxd
 delete mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/contains.pxd
 delete mode 100644 python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pxd
 create mode 100644 python/pylibcudf/CMakeLists.txt
 create mode 120000 python/pylibcudf/README.md
 rename python/{cudf => pylibcudf}/cmake/Modules/LinkPyarrowHeaders.cmake (100%)
 rename python/{cudf => pylibcudf}/cmake/Modules/WheelHelpers.cmake (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/CMakeLists.txt (96%)
 create mode 120000 python/pylibcudf/pylibcudf/VERSION
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/__init__.pxd (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/__init__.py (99%)
 create mode 100644 python/pylibcudf/pylibcudf/_version.py
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/aggregation.pxd (96%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/aggregation.pyx (96%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/binaryop.pxd (90%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/binaryop.pyx (86%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/column.pxd (84%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/column.pyx (98%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/column_factories.pxd (92%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/column_factories.pyx (96%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/concatenate.pxd (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/concatenate.pyx (80%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/copying.pxd (94%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/copying.pyx (96%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/datetime.pxd (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/datetime.pyx (78%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/exception_handler.pxd (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/experimental.pxd (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/experimental.pyx (92%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/expressions.pxd (91%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/expressions.pyx (94%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/filling.pxd (90%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/filling.pyx (94%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/gpumemoryview.pxd (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/gpumemoryview.pyx (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/groupby.pxd (87%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/groupby.pyx (96%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/interop.pyx (98%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/io/CMakeLists.txt (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/io/__init__.pxd (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/io/__init__.py (100%)
 create mode 100644 python/pylibcudf/pylibcudf/io/avro.pxd
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/io/avro.pyx (89%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/io/csv.pyx (97%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/io/datasource.pxd (69%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/io/datasource.pyx (87%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/io/json.pxd (85%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/io/json.pyx (95%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/io/parquet.pxd (72%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/io/parquet.pyx (93%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/io/types.pxd (87%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/io/types.pyx (96%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/join.pxd (91%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/join.pyx (95%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/CMakeLists.txt (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/__init__.pxd (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/__init__.py (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/aggregation.pxd (98%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/aggregation.pyx (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/binaryop.pxd (85%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/binaryop.pyx (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/column/__init__.pxd (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/column/__init__.py (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/column/column.pxd (87%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/column/column_factories.pxd (93%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/column/column_view.pxd (97%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/concatenate.pxd (77%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/contiguous_split.pxd (85%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/copying.pxd (90%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/copying.pyx (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/datetime.pxd (92%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/experimental.pxd (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/expressions.pxd (90%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/expressions.pyx (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/filling.pxd (74%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/groupby.pxd (83%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/hash.pxd (86%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/interop.pxd (87%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/io/CMakeLists.txt (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/io/__init__.pxd (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/io/__init__.py (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/io/arrow_io_source.pxd (86%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/io/avro.pxd (91%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/io/csv.pxd (98%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/io/data_sink.pxd (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/io/datasource.pxd (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/io/json.pxd (96%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/io/json.pyx (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/io/orc.pxd (97%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/io/orc_metadata.pxd (94%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/io/parquet.pxd (80%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/io/parquet_metadata.pxd (89%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/io/text.pxd (96%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/io/timezone.pxd (86%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/io/types.pxd (92%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/io/types.pyx (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/join.pxd (88%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/labeling.pxd (78%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/lists/__init__.pxd (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/lists/__init__.py (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/lists/combine.pxd (78%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/lists/contains.pxd (75%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/lists/count_elements.pxd (61%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/lists/explode.pxd (59%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/lists/extract.pxd (64%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/lists/filling.pxd (76%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/lists/gather.pxd (67%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/lists/lists_column_view.pxd (86%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/lists/reverse.pxd (62%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/lists/set_operations.pxd (81%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/lists/sorting.pxd (69%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/lists/stream_compaction.pxd (68%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/merge.pxd (69%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/null_mask.pxd (80%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/nvtext/__init__.pxd (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/nvtext/__init__.py (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/nvtext/byte_pair_encode.pxd (73%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/nvtext/edit_distance.pxd (75%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/nvtext/generate_ngrams.pxd (69%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/nvtext/jaccard.pxd (61%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/nvtext/minhash.pxd (70%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/nvtext/ngrams_tokenize.pxd (58%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/nvtext/normalize.pxd (75%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/nvtext/replace.pxd (69%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/nvtext/stemmer.pxd (79%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/nvtext/subword_tokenize.pxd (92%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/nvtext/tokenize.pxd (84%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/partitioning.pxd (69%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/quantiles.pxd (70%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/reduce.pxd (69%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/reduce.pyx (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/replace.pxd (83%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/replace.pyx (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/reshape.pxd (57%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/rolling.pxd (64%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/round.pxd (75%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/round.pyx (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/scalar/__init__.pxd (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/scalar/__init__.py (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/scalar/scalar.pxd (91%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/scalar/scalar_factories.pxd (76%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/search.pxd (73%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/sorting.pxd (84%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/stream_compaction.pxd (85%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/stream_compaction.pyx (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/CMakeLists.txt (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/__init__.pxd (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/__init__.py (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/attributes.pxd (76%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/capitalize.pxd (63%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/case.pxd (81%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/char_types.pxd (82%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/char_types.pyx (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/combine.pxd (83%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/contains.pxd (69%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/convert/__init__.pxd (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/convert/__init__.py (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/convert/convert_booleans.pxd (69%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/convert/convert_datetime.pxd (76%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/convert/convert_durations.pxd (72%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd (73%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/convert/convert_floats.pxd (71%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/convert/convert_integers.pxd (80%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd (76%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/convert/convert_lists.pxd (62%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/convert/convert_urls.pxd (72%)
 create mode 100644 python/pylibcudf/pylibcudf/libcudf/strings/extract.pxd
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/find.pxd (83%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/find_multiple.pxd (68%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/findall.pxd (56%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/json.pxd (79%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/padding.pxd (59%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/regex_flags.pxd (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/regex_flags.pyx (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/regex_program.pxd (84%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/repeat.pxd (67%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/replace.pxd (73%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/replace_re.pxd (63%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/side_type.pxd (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/split/__init__.pxd (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/split/__init__.py (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/split/partition.pxd (63%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/split/split.pxd (78%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/strip.pxd (52%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/substring.pxd (66%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/translate.pxd (73%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings/wrap.pxd (58%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/strings_udf.pxd (85%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/table/__init__.pxd (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/table/__init__.py (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/table/table.pxd (69%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/table/table_view.pxd (87%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/transform.pxd (73%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/transpose.pxd (69%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/types.pxd (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/types.pyx (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/unary.pxd (85%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/unary.pyx (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/utilities/__init__.pxd (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/utilities/__init__.py (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/utilities/host_span.pxd (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/utilities/traits.pxd (93%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/utilities/type_dispatcher.pxd (73%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/wrappers/__init__.pxd (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/wrappers/__init__.py (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/wrappers/decimals.pxd (90%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/wrappers/durations.pxd (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/libcudf/wrappers/timestamps.pxd (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/lists.pxd (94%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/lists.pyx (95%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/merge.pxd (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/merge.pyx (83%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/quantiles.pxd (86%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/quantiles.pyx (93%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/reduce.pxd (85%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/reduce.pyx (85%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/replace.pxd (92%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/replace.pyx (97%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/reshape.pxd (80%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/reshape.pyx (86%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/rolling.pxd (85%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/rolling.pyx (89%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/round.pxd (77%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/round.pyx (85%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/scalar.pxd (92%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/scalar.pyx (94%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/search.pxd (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/search.pyx (93%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/sorting.pxd (87%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/sorting.pyx (96%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/stream_compaction.pxd (89%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/stream_compaction.pyx (95%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/strings/CMakeLists.txt (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/strings/__init__.pxd (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/strings/__init__.py (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/strings/capitalize.pxd (64%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/strings/capitalize.pyx (84%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/strings/case.pxd (76%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/strings/case.pyx (79%)
 create mode 100644 python/pylibcudf/pylibcudf/strings/char_types.pxd
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/strings/char_types.pyx (64%)
 create mode 100644 python/pylibcudf/pylibcudf/strings/contains.pxd
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/strings/contains.pyx (75%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/strings/find.pxd (77%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/strings/find.pyx (90%)
 create mode 100644 python/pylibcudf/pylibcudf/strings/regex_flags.pxd
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/strings/regex_flags.pyx (59%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/strings/regex_program.pxd (70%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/strings/regex_program.pyx (84%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/strings/replace.pxd (71%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/strings/replace.pyx (90%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/strings/slice.pxd (69%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/strings/slice.pyx (81%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/table.pxd (78%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/table.pyx (93%)
 rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/common/utils.py (97%)
 rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/conftest.py (98%)
 rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/io/test_avro.py (98%)
 rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/io/test_csv.py (98%)
 rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/io/test_json.py (99%)
 rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/io/test_parquet.py (97%)
 rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/io/test_source_sink_info.py (98%)
 rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/pytest.ini (100%)
 rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/test_binaryops.py (99%)
 rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/test_column_factories.py (99%)
 rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/test_column_from_device.py (97%)
 rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/test_copying.py (99%)
 rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/test_datetime.py (83%)
 rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/test_expressions.py (97%)
 rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/test_interop.py (98%)
 rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/test_join.py (94%)
 rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/test_lists.py (99%)
 rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/test_quantiles.py (99%)
 rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/test_regex_program.py (89%)
 rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/test_reshape.py (96%)
 rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/test_round.py (86%)
 rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/test_string_capitalize.py (86%)
 rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/test_string_case.py (80%)
 rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/test_string_contains.py (92%)
 rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/test_string_find.py (97%)
 rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/test_string_replace.py (95%)
 rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/test_string_slice.py (98%)
 rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/test_table.py (93%)
 rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/test_traits.py (98%)
 rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/test_transform.py (95%)
 rename python/{cudf/cudf/pylibcudf_tests => pylibcudf/pylibcudf/tests}/test_unary.py (93%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/traits.pxd (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/traits.pyx (98%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/transform.pxd (100%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/transform.pyx (87%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/types.pxd (91%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/types.pyx (66%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/unary.pxd (87%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/unary.pyx (94%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/utils.pxd (71%)
 rename python/{cudf/cudf/_lib => pylibcudf}/pylibcudf/utils.pyx (93%)
 rename python/{cudf/cudf/_lib => pylibcudf/pylibcudf}/variant.pxd (100%)
 create mode 100644 python/pylibcudf/pyproject.toml

diff --git a/.github/labeler.yml b/.github/labeler.yml
index 48967417af3..90cdda4d3ca 100644
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -12,7 +12,7 @@ cudf.polars:
   - 'python/cudf_polars/**'
 
 pylibcudf:
-  - 'python/cudf/cudf/_lib/pylibcudf/**'
+  - 'python/cudf/pylibcudf/**'
 
 libcudf:
   - 'cpp/**'
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index ea8a1762b2c..74bdc666c68 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -23,6 +23,7 @@ jobs:
       - static-configure
       - conda-notebook-tests
       - docs-build
+      - wheel-build-pylibcudf
       - wheel-build-cudf
       - wheel-tests-cudf
       - wheel-build-cudf-polars
@@ -120,10 +121,17 @@ jobs:
       arch: "amd64"
       container_image: "rapidsai/ci-conda:latest"
       run_script: "ci/build_docs.sh"
-  wheel-build-cudf:
+  wheel-build-pylibcudf:
     needs: checks
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
+    with:
+      build_type: pull-request
+      script: "ci/build_wheel_pylibcudf.sh"
+  wheel-build-cudf:
+    needs: wheel-build-pylibcudf
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
     with:
       build_type: pull-request
       script: "ci/build_wheel_cudf.sh"
@@ -135,7 +143,7 @@ jobs:
       build_type: pull-request
       script: ci/test_wheel_cudf.sh
   wheel-build-cudf-polars:
-    needs: wheel-build-cudf
+    needs: wheel-build-pylibcudf
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
     with:
diff --git a/build.sh b/build.sh
index 52bb1e64d16..957f41aedac 100755
--- a/build.sh
+++ b/build.sh
@@ -17,11 +17,12 @@ ARGS=$*
 # script, and that this script resides in the repo dir!
 REPODIR=$(cd $(dirname $0); pwd)
 
-VALIDARGS="clean libcudf cudf cudfjar dask_cudf benchmarks tests libcudf_kafka cudf_kafka custreamz -v -g -n --pydevelop -l --allgpuarch --disable_nvtx --opensource_nvcomp  --show_depr_warn --ptds -h --build_metrics --incl_cache_stats --disable_large_strings"
-HELP="$0 [clean] [libcudf] [cudf] [cudfjar] [dask_cudf] [benchmarks] [tests] [libcudf_kafka] [cudf_kafka] [custreamz] [-v] [-g] [-n] [-h] [--cmake-args=\\\"<args>\\\"]
+VALIDARGS="clean libcudf pylibcudf cudf cudfjar dask_cudf benchmarks tests libcudf_kafka cudf_kafka custreamz -v -g -n --pydevelop -l --allgpuarch --disable_nvtx --opensource_nvcomp  --show_depr_warn --ptds -h --build_metrics --incl_cache_stats --disable_large_strings"
+HELP="$0 [clean] [libcudf] [pylibcudf] [cudf] [cudfjar] [dask_cudf] [benchmarks] [tests] [libcudf_kafka] [cudf_kafka] [custreamz] [-v] [-g] [-n] [-h] [--cmake-args=\\\"<args>\\\"]
    clean                         - remove all existing build artifacts and configuration (start
                                    over)
    libcudf                       - build the cudf C++ code only
+   pylibcudf                     - build the pylibcudf Python package
    cudf                          - build the cudf Python package
    cudfjar                       - build cudf JAR with static libcudf using devtoolset toolchain
    dask_cudf                     - build the dask_cudf Python package
@@ -268,7 +269,7 @@ fi
 ################################################################################
 # Configure, build, and install libcudf
 
-if buildAll || hasArg libcudf || hasArg cudf || hasArg cudfjar; then
+if buildAll || hasArg libcudf || hasArg pylibcudf || hasArg cudf || hasArg cudfjar; then
     if (( ${BUILD_ALL_GPU_ARCH} == 0 )); then
         CUDF_CMAKE_CUDA_ARCHITECTURES="${CUDF_CMAKE_CUDA_ARCHITECTURES:-NATIVE}"
         if [[ "$CUDF_CMAKE_CUDA_ARCHITECTURES" == "NATIVE" ]]; then
@@ -340,6 +341,14 @@ if buildAll || hasArg libcudf; then
     fi
 fi
 
+# Build and install the pylibcudf Python package
+if buildAll || hasArg pylibcudf; then
+
+    cd ${REPODIR}/python/pylibcudf
+    SKBUILD_CMAKE_ARGS="-DCMAKE_PREFIX_PATH=${INSTALL_PREFIX};-DCMAKE_LIBRARY_PATH=${LIBCUDF_BUILD_DIR};-DCMAKE_CUDA_ARCHITECTURES=${CUDF_CMAKE_CUDA_ARCHITECTURES};${EXTRA_CMAKE_ARGS}" \
+        python ${PYTHON_ARGS_FOR_INSTALL} .
+fi
+
 # Build and install the cudf Python package
 if buildAll || hasArg cudf; then
 
diff --git a/ci/build_docs.sh b/ci/build_docs.sh
index 14dc7a59048..c67d127e635 100755
--- a/ci/build_docs.sh
+++ b/ci/build_docs.sh
@@ -29,7 +29,7 @@ PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python)
 rapids-mamba-retry install \
   --channel "${CPP_CHANNEL}" \
   --channel "${PYTHON_CHANNEL}" \
-  libcudf cudf dask-cudf
+  libcudf pylibcudf cudf dask-cudf
 
 export RAPIDS_DOCS_DIR="$(mktemp -d)"
 
diff --git a/ci/build_python.sh b/ci/build_python.sh
index 79e09432779..2e3f70ba767 100755
--- a/ci/build_python.sh
+++ b/ci/build_python.sh
@@ -22,9 +22,16 @@ CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp)
 # TODO: Remove `--no-test` flag once importing on a CPU
 # node works correctly
 # With boa installed conda build forwards to the boa builder
+
+RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \
+  --no-test \
+  --channel "${CPP_CHANNEL}" \
+  conda/recipes/pylibcudf
+
 RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \
   --no-test \
   --channel "${CPP_CHANNEL}" \
+  --channel "${RAPIDS_CONDA_BLD_OUTPUT_DIR}" \
   conda/recipes/cudf
 
 RAPIDS_PACKAGE_VERSION=$(head -1 ./VERSION) rapids-conda-retry mambabuild \
diff --git a/ci/build_wheel_cudf.sh b/ci/build_wheel_cudf.sh
index 1b563bc499c..7c0fb1efebe 100755
--- a/ci/build_wheel_cudf.sh
+++ b/ci/build_wheel_cudf.sh
@@ -7,10 +7,14 @@ package_dir="python/cudf"
 
 export SKBUILD_CMAKE_ARGS="-DUSE_LIBARROW_FROM_PYARROW=ON"
 
+# Download the pylibcudf built in the previous step
+RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
+RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 /tmp/pylibcudf_dist
+
+echo "pylibcudf-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo /tmp/pylibcudf_dist/pylibcudf_*.whl)" > /tmp/constraints.txt
+export PIP_CONSTRAINT="/tmp/constraints.txt"
 ./ci/build_wheel.sh ${package_dir}
 
 python -m auditwheel repair -w ${package_dir}/final_dist ${package_dir}/dist/*
 
-
-RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
 RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 ${package_dir}/final_dist
diff --git a/ci/build_wheel_pylibcudf.sh b/ci/build_wheel_pylibcudf.sh
new file mode 100755
index 00000000000..b25d118ff81
--- /dev/null
+++ b/ci/build_wheel_pylibcudf.sh
@@ -0,0 +1,16 @@
+#!/bin/bash
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+
+set -euo pipefail
+
+package_dir="python/pylibcudf"
+
+export SKBUILD_CMAKE_ARGS="-DUSE_LIBARROW_FROM_PYARROW=ON"
+
+./ci/build_wheel.sh ${package_dir}
+
+python -m auditwheel repair -w ${package_dir}/final_dist ${package_dir}/dist/*
+
+
+RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
+RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 ${package_dir}/final_dist
diff --git a/ci/cudf_pandas_scripts/pandas-tests/run.sh b/ci/cudf_pandas_scripts/pandas-tests/run.sh
index 48ee4a05628..8deaeab78a3 100755
--- a/ci/cudf_pandas_scripts/pandas-tests/run.sh
+++ b/ci/cudf_pandas_scripts/pandas-tests/run.sh
@@ -11,7 +11,9 @@ rapids-logger "Running Pandas tests using $PANDAS_TESTS_BRANCH branch and rapids
 rapids-logger "PR number: ${RAPIDS_REF_NAME:-"unknown"}"
 
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
+RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-pylibcudf-dep
 RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep
+python -m pip install $(ls ./local-pylibcudf-dep/pylibcudf*.whl)
 python -m pip install $(ls ./local-cudf-dep/cudf*.whl)[test,pandas-tests]
 
 RESULTS_DIR=${RAPIDS_TESTS_DIR:-"$(mktemp -d)"}
diff --git a/ci/cudf_pandas_scripts/run_tests.sh b/ci/cudf_pandas_scripts/run_tests.sh
index 1c3b99953fb..bfb655db3ca 100755
--- a/ci/cudf_pandas_scripts/run_tests.sh
+++ b/ci/cudf_pandas_scripts/run_tests.sh
@@ -36,7 +36,9 @@ if [ "$no_cudf" = true ]; then
     echo "Skipping cudf install"
 else
     RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
+    RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-pylibcudf-dep
     RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep
+    python -m pip install $(ls ./local-pylibcudf-dep/pylibcudf*.whl)
     python -m pip install $(ls ./local-cudf-dep/cudf*.whl)[test,cudf-pandas-tests]
 fi
 
diff --git a/ci/test_python_cudf.sh b/ci/test_python_cudf.sh
index 217dd2fd9a8..ae34047e87f 100755
--- a/ci/test_python_cudf.sh
+++ b/ci/test_python_cudf.sh
@@ -15,7 +15,7 @@ trap "EXITCODE=1" ERR
 set +e
 
 rapids-logger "pytest pylibcudf"
-pushd python/cudf/cudf/pylibcudf_tests
+pushd python/pylibcudf/pylibcudf/tests
 python -m pytest \
   --cache-clear \
   --dist=worksteal \
diff --git a/ci/test_wheel_cudf.sh b/ci/test_wheel_cudf.sh
index fdb61278d36..5a2c3ccac8f 100755
--- a/ci/test_wheel_cudf.sh
+++ b/ci/test_wheel_cudf.sh
@@ -3,11 +3,15 @@
 
 set -eou pipefail
 
+# Download the pylibcudf built in the previous step
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
+RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-pylibcudf-dep
 RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist
 
-# echo to expand wildcard before adding `[extra]` requires for pip
-python -m pip install $(echo ./dist/cudf*.whl)[test]
+# Install both pylibcudf and cudf
+python -m pip install \
+    "$(echo ./local-pylibcudf-dep/pylibcudf*.whl)[test]" \
+    "$(echo ./dist/cudf*.whl)[test]"
 
 RESULTS_DIR=${RAPIDS_TESTS_DIR:-"$(mktemp -d)"}
 RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${RESULTS_DIR}/test-results"}/
@@ -15,7 +19,7 @@ mkdir -p "${RAPIDS_TESTS_DIR}"
 
 
 rapids-logger "pytest pylibcudf"
-pushd python/cudf/cudf/pylibcudf_tests
+pushd python/pylibcudf/pylibcudf/tests
 python -m pytest \
   --cache-clear \
   --dist=worksteal \
diff --git a/ci/test_wheel_cudf_polars.sh b/ci/test_wheel_cudf_polars.sh
index cc9f5788685..357d4170d47 100755
--- a/ci/test_wheel_cudf_polars.sh
+++ b/ci/test_wheel_cudf_polars.sh
@@ -10,7 +10,7 @@ set -eou pipefail
 # files in cudf_polars/pylibcudf", rather than "are there changes
 # between upstream and this branch which touch cudf_polars/pylibcudf"
 # TODO: is the target branch exposed anywhere in an environment variable?
-if [ -n "$(git diff --name-only origin/branch-24.10...HEAD -- python/cudf_polars/ python/cudf/cudf/_lib/pylibcudf/)" ];
+if [ -n "$(git diff --name-only origin/branch-24.10...HEAD -- python/cudf_polars/ python/pylibcudf/)" ];
 then
     HAS_CHANGES=1
 else
@@ -21,8 +21,8 @@ RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
 RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 ./dist
 
 # Download the cudf built in the previous step
-RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep
-python -m pip install ./local-cudf-dep/cudf*.whl
+RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-pylibcudf-dep
+python -m pip install ./local-pylibcudf-dep/pylibcudf*.whl
 
 rapids-logger "Install cudf_polars"
 python -m pip install $(echo ./dist/cudf_polars*.whl)[test]
diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh
index c3800d3cc25..4d045472604 100755
--- a/ci/test_wheel_dask_cudf.sh
+++ b/ci/test_wheel_dask_cudf.sh
@@ -7,8 +7,11 @@ RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
 RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 ./dist
 
 # Download the cudf built in the previous step
+RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-pylibcudf-dep
 RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep
-python -m pip install ./local-cudf-dep/cudf*.whl
+python -m pip install \
+    "$(echo ./local-pylibcudf-dep/pylibcudf*.whl)" \
+    "$(echo ./local-cudf-dep/cudf*.whl)"
 
 # echo to expand wildcard before adding `[extra]` requires for pip
 python -m pip install $(echo ./dist/dask_cudf*.whl)[test]
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 8d5fc2e31d9..d0d18e57abc 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -56,12 +56,14 @@ dependencies:
 - ninja
 - notebook
 - numba>=0.57
+- numpy
 - numpy>=1.23,<2.0a0
 - numpydoc
 - nvcc_linux-64=11.8
 - nvcomp==3.0.6
 - nvtx>=0.2.1
 - packaging
+- pandas
 - pandas>=2.0,<2.2.3dev0
 - pandoc
 - pip
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index 7b0485d7f29..caf39a32d79 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -55,11 +55,13 @@ dependencies:
 - ninja
 - notebook
 - numba>=0.57
+- numpy
 - numpy>=1.23,<2.0a0
 - numpydoc
 - nvcomp==3.0.6
 - nvtx>=0.2.1
 - packaging
+- pandas
 - pandas>=2.0,<2.2.3dev0
 - pandoc
 - pip
diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml
index 8d7ef63715b..7e86147732e 100644
--- a/conda/recipes/cudf/meta.yaml
+++ b/conda/recipes/cudf/meta.yaml
@@ -68,6 +68,7 @@ requirements:
     - numpy 1.23
     - pyarrow ==16.1.0.*
     - libcudf ={{ version }}
+    - pylibcudf ={{ version }}
     - rmm ={{ minor_version }}
     {% if cuda_major == "11" %}
     - cudatoolkit
@@ -87,6 +88,7 @@ requirements:
     - numpy >=1.23,<2.0a0
     - {{ pin_compatible('pyarrow', max_pin='x.x') }}
     - libcudf ={{ version }}
+    - pylibcudf ={{ version }}
     - {{ pin_compatible('rmm', max_pin='x.x') }}
     - fsspec >=0.6.0
     {% if cuda_major == "11" %}
diff --git a/conda/recipes/cudf_kafka/meta.yaml b/conda/recipes/cudf_kafka/meta.yaml
index 748a32e5518..d04d9b21a46 100644
--- a/conda/recipes/cudf_kafka/meta.yaml
+++ b/conda/recipes/cudf_kafka/meta.yaml
@@ -58,7 +58,7 @@ requirements:
     - python
     - cython >=3.0.3
     - cuda-version ={{ cuda_version }}
-    - cudf ={{ version }}
+    - pylibcudf ={{ version }}
     - libcudf_kafka ={{ version }}
     - rapids-build-backend >=0.3.0,<0.4.0.dev0
     - scikit-build-core >=0.10.0
@@ -69,7 +69,7 @@ requirements:
     - python
     - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}
     - libcudf_kafka ={{ version }}
-    - cudf ={{ version }}
+    - pylibcudf ={{ version }}
     {% if cuda_major != "11" %}
     - cuda-cudart
     {% endif %}
diff --git a/conda/recipes/pylibcudf/build.sh b/conda/recipes/pylibcudf/build.sh
new file mode 100644
index 00000000000..483346504db
--- /dev/null
+++ b/conda/recipes/pylibcudf/build.sh
@@ -0,0 +1,4 @@
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
+
+# This assumes the script is executed from the root of the repo directory
+./build.sh pylibcudf
diff --git a/conda/recipes/pylibcudf/conda_build_config.yaml b/conda/recipes/pylibcudf/conda_build_config.yaml
new file mode 100644
index 00000000000..af894cccda0
--- /dev/null
+++ b/conda/recipes/pylibcudf/conda_build_config.yaml
@@ -0,0 +1,20 @@
+c_compiler_version:
+  - 11
+
+cxx_compiler_version:
+  - 11
+
+c_stdlib:
+  - sysroot
+
+c_stdlib_version:
+  - "2.17"
+
+cmake_version:
+  - ">=3.26.4,!=3.30.0"
+
+cuda_compiler:
+  - cuda-nvcc
+
+cuda11_compiler:
+  - nvcc
diff --git a/conda/recipes/pylibcudf/meta.yaml b/conda/recipes/pylibcudf/meta.yaml
new file mode 100644
index 00000000000..f405fd10f5d
--- /dev/null
+++ b/conda/recipes/pylibcudf/meta.yaml
@@ -0,0 +1,108 @@
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
+
+{% set version = environ['RAPIDS_PACKAGE_VERSION'].lstrip('v') %}
+{% set minor_version = version.split('.')[0] + '.' + version.split('.')[1] %}
+{% set py_version = environ['CONDA_PY'] %}
+{% set cuda_version = '.'.join(environ['RAPIDS_CUDA_VERSION'].split('.')[:2]) %}
+{% set cuda_major = cuda_version.split('.')[0] %}
+{% set date_string = environ['RAPIDS_DATE_STRING'] %}
+
+package:
+  name: pylibcudf
+  version: {{ version }}
+
+source:
+  path: ../../..
+
+build:
+  number: {{ GIT_DESCRIBE_NUMBER }}
+  string: cuda{{ cuda_major }}_py{{ py_version }}_{{ date_string }}_{{ GIT_DESCRIBE_HASH }}_{{ GIT_DESCRIBE_NUMBER }}
+  script_env:
+    - AWS_ACCESS_KEY_ID
+    - AWS_SECRET_ACCESS_KEY
+    - AWS_SESSION_TOKEN
+    - CMAKE_C_COMPILER_LAUNCHER
+    - CMAKE_CUDA_COMPILER_LAUNCHER
+    - CMAKE_CXX_COMPILER_LAUNCHER
+    - CMAKE_GENERATOR
+    - PARALLEL_LEVEL
+    - SCCACHE_BUCKET
+    - SCCACHE_IDLE_TIMEOUT
+    - SCCACHE_REGION
+    - SCCACHE_S3_KEY_PREFIX=pylibcudf-aarch64 # [aarch64]
+    - SCCACHE_S3_KEY_PREFIX=pylibcudf-linux64 # [linux64]
+    - SCCACHE_S3_USE_SSL
+    - SCCACHE_S3_NO_CREDENTIALS
+  ignore_run_exports:
+    # libcudf's run_exports pinning is looser than we would like
+    - libcudf
+  ignore_run_exports_from:
+    {% if cuda_major == "11" %}
+    - {{ compiler('cuda11') }}
+    {% else %}
+    - {{ compiler('cuda') }}
+    - cuda-cudart-dev
+    - libcufile-dev  # [linux64]
+    {% endif %}
+
+requirements:
+  build:
+    - cmake {{ cmake_version }}
+    - ninja
+    - {{ compiler('c') }}
+    - {{ compiler('cxx') }}
+    {% if cuda_major == "11" %}
+    - {{ compiler('cuda11') }} ={{ cuda_version }}
+    {% else %}
+    - {{ compiler('cuda') }}
+    {% endif %}
+    - cuda-version ={{ cuda_version }}
+    - {{ stdlib("c") }}
+  host:
+    - python
+    - cython >=3.0.3
+    - rapids-build-backend >=0.3.0,<0.4.0.dev0
+    - scikit-build-core >=0.10.0
+    - dlpack >=0.8,<1.0
+    # TODO: Change to `2.0` for NumPy 2
+    - numpy 1.23
+    - pyarrow ==16.1.0.*
+    - libcudf ={{ version }}
+    - rmm ={{ minor_version }}
+    {% if cuda_major == "11" %}
+    - cudatoolkit
+    {% else %}
+    - cuda-cudart-dev
+    - cuda-nvrtc
+    - libcufile-dev  # [linux64]
+    {% endif %}
+    - cuda-version ={{ cuda_version }}
+  run:
+    - python
+    - typing_extensions >=4.0.0
+    - pandas >=2.0,<2.2.3dev0
+    # TODO: Update `numpy` in `host` when dropping `<2.0a0`
+    - numpy >=1.23,<2.0a0
+    - {{ pin_compatible('pyarrow', max_pin='x.x') }}
+    - {{ pin_compatible('rmm', max_pin='x.x') }}
+    - fsspec >=0.6.0
+    {% if cuda_major == "11" %}
+    - cuda-python >=11.7.1,<12.0a0
+    {% else %}
+    - cuda-python >=12.0,<13.0a0
+    {% endif %}
+    - nvtx >=0.2.1
+    - packaging
+
+test:
+  requires:
+    - cuda-version ={{ cuda_version }}
+  imports:
+    - pylibcudf
+
+about:
+  home: https://rapids.ai/
+  license: Apache-2.0
+  license_family: APACHE
+  license_file: LICENSE
+  summary: pylibcudf library
diff --git a/dependencies.yaml b/dependencies.yaml
index b0d62a9fb0d..ca615905a15 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -10,6 +10,7 @@ files:
       - build_all
       - build_cpp
       - build_python_common
+      - build_python_pylibcudf
       - build_python_cudf
       - cuda
       - cuda_version
@@ -22,12 +23,14 @@ files:
       - rapids_build_setuptools
       - run_common
       - run_cudf
+      - run_pylibcudf
       - run_dask_cudf
       - run_custreamz
       - test_cpp
       - test_python_common
       - test_python_cudf
       - test_python_dask_cudf
+      - test_python_pylibcudf
       - depends_on_cupy
   test_static_build:
     output: none
@@ -76,14 +79,14 @@ files:
       - docs
       - libarrow_run
       - py_version
-  py_rapids_build_cudf:
+  py_build_cudf:
     output: pyproject
     pyproject_dir: python/cudf
     extras:
       table: build-system
     includes:
       - rapids_build_skbuild
-  py_build_cudf:
+  py_rapids_build_cudf:
     output: pyproject
     pyproject_dir: python/cudf
     extras:
@@ -93,6 +96,7 @@ files:
       - build_base
       - build_python_common
       - build_python_cudf
+      - pylibcudf_build_dep
   py_run_cudf:
     output: pyproject
     pyproject_dir: python/cudf
@@ -103,6 +107,7 @@ files:
       - run_cudf
       - pyarrow_run
       - depends_on_cupy
+      - depends_on_pylibcudf
   py_test_cudf:
     output: pyproject
     pyproject_dir: python/cudf
@@ -112,6 +117,40 @@ files:
     includes:
       - test_python_common
       - test_python_cudf
+  py_rapids_build_pylibcudf:
+    output: pyproject
+    pyproject_dir: python/pylibcudf
+    extras:
+      table: build-system
+    includes:
+      - rapids_build_skbuild
+  py_build_pylibcudf:
+    output: pyproject
+    pyproject_dir: python/pylibcudf
+    extras:
+      table: tool.rapids-build-backend
+      key: requires
+    includes:
+      - build_base
+      - build_python_common
+      - build_python_pylibcudf
+  py_run_pylibcudf:
+    output: pyproject
+    pyproject_dir: python/pylibcudf
+    extras:
+      table: project
+    includes:
+      - run_pylibcudf
+      - pyarrow_run
+  py_test_pylibcudf:
+    output: pyproject
+    pyproject_dir: python/pylibcudf
+    extras:
+      table: project.optional-dependencies
+      key: test
+    includes:
+      - test_python_common
+      - test_python_pylibcudf
   py_test_pandas_cudf:
     output: pyproject
     pyproject_dir: python/cudf
@@ -142,7 +181,7 @@ files:
       table: project
     includes:
       - run_cudf_polars
-      - depends_on_cudf
+      - depends_on_pylibcudf
   py_test_cudf_polars:
     output: pyproject
     pyproject_dir: python/cudf_polars
@@ -326,11 +365,36 @@ dependencies:
           # Sync with conda build constraint & wheel run constraint.
           # TODO: Change to `2.0.*` for NumPy 2
           - numpy==1.23.*
-  build_python_cudf:
+  build_python_pylibcudf:
     common:
       - output_types: conda
         packages:
           - &rmm_unsuffixed rmm==24.10.*,>=0.0.0a0
+      - output_types: requirements
+        packages:
+          # pip recognizes the index as a global option for the requirements.txt file
+          # This index is needed for rmm-cu{11,12}.
+          - --extra-index-url=https://pypi.nvidia.com
+          - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
+    specific:
+      - output_types: [requirements, pyproject]
+        matrices:
+          - matrix:
+              cuda: "12.*"
+              cuda_suffixed: "true"
+            packages:
+              - rmm-cu12==24.10.*,>=0.0.0a0
+          - matrix:
+              cuda: "11.*"
+              cuda_suffixed: "true"
+            packages:
+              - rmm-cu11==24.10.*,>=0.0.0a0
+          - {matrix: null, packages: [*rmm_unsuffixed]}
+  build_python_cudf:
+    common:
+      - output_types: conda
+        packages:
+          - *rmm_unsuffixed
           - pip
           - pip:
               - git+https://github.com/python-streamz/streamz.git@master
@@ -349,12 +413,33 @@ dependencies:
               cuda_suffixed: "true"
             packages:
               - rmm-cu12==24.10.*,>=0.0.0a0
+              - pylibcudf-cu12==24.10.*,>=0.0.0a0
           - matrix:
               cuda: "11.*"
               cuda_suffixed: "true"
             packages:
               - rmm-cu11==24.10.*,>=0.0.0a0
+              - pylibcudf-cu11==24.10.*,>=0.0.0a0
           - {matrix: null, packages: [*rmm_unsuffixed]}
+  pylibcudf_build_dep:
+    common:
+      - output_types: conda
+        packages:
+          - &pylibcudf_unsuffixed pylibcudf==24.10.*,>=0.0.0a0
+    specific:
+      - output_types: [pyproject]
+        matrices:
+          - matrix:
+              cuda: "12.*"
+              cuda_suffixed: "true"
+            packages:
+              - pylibcudf-cu12==24.10.*,>=0.0.0a0
+          - matrix:
+              cuda: "11.*"
+              cuda_suffixed: "true"
+            packages:
+              - pylibcudf-cu11==24.10.*,>=0.0.0a0
+          - {matrix: null, packages: [*pylibcudf_unsuffixed]}
   libarrow_build:
     common:
       - output_types: conda
@@ -560,6 +645,45 @@ dependencies:
           # TODO: Update `numpy` in `build_python_common` when dropping `<2.0a0`
           - numpy>=1.23,<2.0a0
           - pandas>=2.0,<2.2.3dev0
+  run_pylibcudf:
+    common:
+      - output_types: [conda, requirements, pyproject]
+        packages:
+          - nvtx>=0.2.1
+          - packaging
+          - typing_extensions>=4.0.0
+      - output_types: conda
+        packages:
+          - *rmm_unsuffixed
+      - output_types: requirements
+        packages:
+          # pip recognizes the index as a global option for the requirements.txt file
+          # This index is needed for rmm.
+          - --extra-index-url=https://pypi.nvidia.com
+          - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
+    specific:
+      - output_types: [conda, requirements, pyproject]
+        matrices:
+          - matrix: {cuda: "12.*"}
+            packages:
+              - cuda-python>=12.0,<13.0a0
+          - matrix: {cuda: "11.*"}
+            packages: &run_pylibcudf_packages_all_cu11
+              - cuda-python>=11.7.1,<12.0a0
+          - {matrix: null, packages: *run_pylibcudf_packages_all_cu11}
+      - output_types: [requirements, pyproject]
+        matrices:
+          - matrix:
+              cuda: "12.*"
+              cuda_suffixed: "true"
+            packages:
+              - rmm-cu12==24.10.*,>=0.0.0a0
+          - matrix:
+              cuda: "11.*"
+              cuda_suffixed: "true"
+            packages:
+              - rmm-cu11==24.10.*,>=0.0.0a0
+          - {matrix: null, packages: [*rmm_unsuffixed]}
   run_cudf:
     common:
       - output_types: [conda, requirements, pyproject]
@@ -680,6 +804,14 @@ dependencies:
           - pytest<8
           - pytest-cov
           - pytest-xdist
+  test_python_pylibcudf:
+    common:
+      - output_types: [conda, requirements, pyproject]
+        packages:
+          - fastavro>=0.22.9
+          - hypothesis
+          - numpy
+          - pandas
   test_python_cudf:
     common:
       - output_types: [conda, requirements, pyproject]
@@ -724,6 +856,31 @@ dependencies:
         packages:
           - dask-cuda==24.10.*,>=0.0.0a0
           - *numba
+  depends_on_pylibcudf:
+    common:
+      - output_types: conda
+        packages:
+          - *pylibcudf_unsuffixed
+      - output_types: requirements
+        packages:
+          # pip recognizes the index as a global option for the requirements.txt file
+          # This index is needed for rmm, cubinlinker, ptxcompiler.
+          - --extra-index-url=https://pypi.nvidia.com
+          - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
+    specific:
+      - output_types: [requirements, pyproject]
+        matrices:
+          - matrix:
+              cuda: "12.*"
+              cuda_suffixed: "true"
+            packages:
+              - pylibcudf-cu12==24.10.*,>=0.0.0a0
+          - matrix:
+              cuda: "11.*"
+              cuda_suffixed: "true"
+            packages:
+              - pylibcudf-cu11==24.10.*,>=0.0.0a0
+          - {matrix: null, packages: [*pylibcudf_unsuffixed]}
   depends_on_cudf:
     common:
       - output_types: conda
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/aggregation.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/aggregation.rst
index 739305af5d4..4b2b213b6c3 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/aggregation.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/aggregation.rst
@@ -2,5 +2,5 @@
 aggregation
 ===========
 
-.. automodule:: cudf._lib.pylibcudf.aggregation
+.. automodule:: pylibcudf.aggregation
    :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/binaryop.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/binaryop.rst
index e5bc6aa7cda..8bbbfbf88c1 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/binaryop.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/binaryop.rst
@@ -2,5 +2,5 @@
 binaryop
 ========
 
-.. automodule:: cudf._lib.pylibcudf.binaryop
+.. automodule:: pylibcudf.binaryop
    :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/column.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/column.rst
index d1105d356b4..d26c8737cf4 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/column.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/column.rst
@@ -2,5 +2,5 @@
 Column
 ======
 
-.. automodule:: cudf._lib.pylibcudf.column
+.. automodule:: pylibcudf.column
    :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/column_factories.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/column_factories.rst
index c858135b6ce..8dfaa4bae03 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/column_factories.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/column_factories.rst
@@ -2,5 +2,5 @@
 column_factories
 ================
 
-.. automodule:: cudf._lib.pylibcudf.column_factories
+.. automodule:: pylibcudf.column_factories
    :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/concatenate.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/concatenate.rst
index e83739056f4..7912cb83767 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/concatenate.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/concatenate.rst
@@ -2,5 +2,5 @@
 concatenate
 ===========
 
-.. automodule:: cudf._lib.pylibcudf.concatenate
+.. automodule:: pylibcudf.concatenate
    :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/copying.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/copying.rst
index fddd3ea440f..25e3ef50e6a 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/copying.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/copying.rst
@@ -2,5 +2,5 @@
 copying
 =======
 
-.. automodule:: cudf._lib.pylibcudf.copying
+.. automodule:: pylibcudf.copying
    :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/datetime.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/datetime.rst
index 558268ea495..71f7874cfbe 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/datetime.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/datetime.rst
@@ -2,5 +2,5 @@
 datetime
 ========
 
-.. automodule:: cudf._lib.pylibcudf.datetime
+.. automodule:: pylibcudf.datetime
    :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/expressions.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/expressions.rst
index 03f769ee861..5493d4662a9 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/expressions.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/expressions.rst
@@ -2,5 +2,5 @@
 expressions
 ===========
 
-.. automodule:: cudf._lib.pylibcudf.expressions
+.. automodule:: pylibcudf.expressions
    :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/filling.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/filling.rst
index 542a5e12bc4..0d328a0b0e9 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/filling.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/filling.rst
@@ -2,5 +2,5 @@
 filling
 ========
 
-.. automodule:: cudf._lib.pylibcudf.filling
+.. automodule:: pylibcudf.filling
    :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/gpumemoryview.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/gpumemoryview.rst
index dffc7c24e02..5515a74adcc 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/gpumemoryview.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/gpumemoryview.rst
@@ -2,5 +2,5 @@
 gpumemoryview
 =============
 
-.. automodule:: cudf._lib.pylibcudf.gpumemoryview
+.. automodule:: pylibcudf.gpumemoryview
    :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/groupby.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/groupby.rst
index d6e994f7dbc..27cda383818 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/groupby.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/groupby.rst
@@ -2,5 +2,5 @@
 groupby
 =======
 
-.. automodule:: cudf._lib.pylibcudf.groupby
+.. automodule:: pylibcudf.groupby
    :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/interop.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/interop.rst
index 881ab8d7be4..0d2cb55212e 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/interop.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/interop.rst
@@ -2,5 +2,5 @@
 interop
 =======
 
-.. automodule:: cudf._lib.pylibcudf.interop
+.. automodule:: pylibcudf.interop
    :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/avro.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/avro.rst
index 495bd505fdc..1c57a6157f5 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/avro.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/avro.rst
@@ -2,5 +2,5 @@
 Avro
 ====
 
-.. automodule:: cudf._lib.pylibcudf.io.avro
+.. automodule:: pylibcudf.io.avro
    :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/csv.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/csv.rst
index 5a2276f8b2d..59f7d8fe54c 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/csv.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/csv.rst
@@ -2,5 +2,5 @@
 CSV
 ===
 
-.. automodule:: cudf._lib.pylibcudf.io.csv
+.. automodule:: pylibcudf.io.csv
    :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst
index e2d342ffe47..c8933981736 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst
@@ -5,7 +5,7 @@ I/O
 I/O Utility Classes
 ===================
 
-.. automodule:: cudf._lib.pylibcudf.io.types
+.. automodule:: pylibcudf.io.types
    :members:
 
 
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/json.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/json.rst
index 6aeae1f322a..a4626f43cc3 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/json.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/json.rst
@@ -2,5 +2,5 @@
 JSON
 ====
 
-.. automodule:: cudf._lib.pylibcudf.io.json
+.. automodule:: pylibcudf.io.json
    :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/parquet.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/parquet.rst
index 9dfbadfa216..07c2503ab28 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/parquet.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/parquet.rst
@@ -2,5 +2,5 @@
 Parquet
 =======
 
-.. automodule:: cudf._lib.pylibcudf.io.parquet
+.. automodule:: pylibcudf.io.parquet
    :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/join.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/join.rst
index 05b9709d116..de065e4fc40 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/join.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/join.rst
@@ -2,5 +2,5 @@
 join
 ====
 
-.. automodule:: cudf._lib.pylibcudf.join
+.. automodule:: pylibcudf.join
    :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/lists.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/lists.rst
index a127dd6006a..0fe1a876073 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/lists.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/lists.rst
@@ -2,5 +2,5 @@
 lists
 =====
 
-.. automodule:: cudf._lib.pylibcudf.lists
+.. automodule:: pylibcudf.lists
    :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/merge.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/merge.rst
index ef1189a064a..3f634ffcfd7 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/merge.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/merge.rst
@@ -2,5 +2,5 @@
 merge
 =====
 
-.. automodule:: cudf._lib.pylibcudf.merge
+.. automodule:: pylibcudf.merge
    :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/quantiles.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/quantiles.rst
index 3417c1ff59d..0f0f701b5dc 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/quantiles.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/quantiles.rst
@@ -2,5 +2,5 @@
 quantiles
 =========
 
-.. automodule:: cudf._lib.pylibcudf.quantiles
+.. automodule:: pylibcudf.quantiles
    :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/reduce.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/reduce.rst
index e6f1b02331d..047f217c276 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/reduce.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/reduce.rst
@@ -2,5 +2,5 @@
 reduce
 ======
 
-.. automodule:: cudf._lib.pylibcudf.reduce
+.. automodule:: pylibcudf.reduce
    :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/replace.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/replace.rst
index 7f846872fca..7410b20e1b0 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/replace.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/replace.rst
@@ -2,5 +2,5 @@
 replace
 =======
 
-.. automodule:: cudf._lib.pylibcudf.replace
+.. automodule:: pylibcudf.replace
    :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/reshape.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/reshape.rst
index 964cef04923..09ec0501bb9 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/reshape.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/reshape.rst
@@ -2,5 +2,5 @@
 reshape
 =======
 
-.. automodule:: cudf._lib.pylibcudf.reshape
+.. automodule:: pylibcudf.reshape
    :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/rolling.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/rolling.rst
index 0817d117a94..1f8da467e84 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/rolling.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/rolling.rst
@@ -2,5 +2,5 @@
 rolling
 =======
 
-.. automodule:: cudf._lib.pylibcudf.rolling
+.. automodule:: pylibcudf.rolling
    :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/round.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/round.rst
index c97fda12301..e064357cbd1 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/round.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/round.rst
@@ -2,5 +2,5 @@
 round
 =====
 
-.. automodule:: cudf._lib.pylibcudf.round
+.. automodule:: pylibcudf.round
    :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/scalar.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/scalar.rst
index b12f47618fb..a9100c6bb2d 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/scalar.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/scalar.rst
@@ -2,5 +2,5 @@
 Scalar
 ======
 
-.. automodule:: cudf._lib.pylibcudf.scalar
+.. automodule:: pylibcudf.scalar
    :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/search.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/search.rst
index aa57bcd9d92..02307037994 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/search.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/search.rst
@@ -2,5 +2,5 @@
 search
 ======
 
-.. automodule:: cudf._lib.pylibcudf.search
+.. automodule:: pylibcudf.search
    :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/sorting.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/sorting.rst
index e9441366eeb..b8fd8fda9bd 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/sorting.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/sorting.rst
@@ -2,5 +2,5 @@
 sorting
 =======
 
-.. automodule:: cudf._lib.pylibcudf.sorting
+.. automodule:: pylibcudf.sorting
    :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/stream_compaction.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/stream_compaction.rst
index 00b479446d8..0252d0684d9 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/stream_compaction.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/stream_compaction.rst
@@ -2,5 +2,5 @@
 stream_compaction
 =================
 
-.. automodule:: cudf._lib.pylibcudf.stream_compaction
+.. automodule:: pylibcudf.stream_compaction
    :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/capitalize.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/capitalize.rst
index 578b2b75e37..6b9ed8d47e7 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/capitalize.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/capitalize.rst
@@ -2,5 +2,5 @@
 capitalize
 ==========
 
-.. automodule:: cudf._lib.pylibcudf.strings.capitalize
+.. automodule:: pylibcudf.strings.capitalize
    :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/char_types.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/char_types.rst
index 577ec34915b..896fa6086db 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/char_types.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/char_types.rst
@@ -2,5 +2,5 @@
 char_types
 ==========
 
-.. automodule:: cudf._lib.pylibcudf.strings.char_types
+.. automodule:: pylibcudf.strings.char_types
    :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/contains.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/contains.rst
index e5745331bc7..d2d164be638 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/contains.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/contains.rst
@@ -2,5 +2,5 @@
 contains
 ========
 
-.. automodule:: cudf._lib.pylibcudf.strings.contains
+.. automodule:: pylibcudf.strings.contains
    :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find.rst
index 61d4079e9a3..7c540e99929 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/find.rst
@@ -2,5 +2,5 @@
 find
 ====
 
-.. automodule:: cudf._lib.pylibcudf.strings.find
+.. automodule:: pylibcudf.strings.find
    :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/regex_flags.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/regex_flags.rst
index 0126b6a3706..53fd712d864 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/regex_flags.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/regex_flags.rst
@@ -2,5 +2,5 @@
 regex_flags
 ===========
 
-.. automodule:: cudf._lib.pylibcudf.strings.regex_flags
+.. automodule:: pylibcudf.strings.regex_flags
    :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/regex_program.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/regex_program.rst
index 2f398186d51..6f3d2f6681c 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/regex_program.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/regex_program.rst
@@ -2,5 +2,5 @@
 regex_program
 =============
 
-.. automodule:: cudf._lib.pylibcudf.strings.regex_program
+.. automodule:: pylibcudf.strings.regex_program
    :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/replace.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/replace.rst
index 9575ec226a7..d5417adac43 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/replace.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/replace.rst
@@ -2,5 +2,5 @@
 replace
 =======
 
-.. automodule:: cudf._lib.pylibcudf.strings.replace
+.. automodule:: pylibcudf.strings.replace
    :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/slice.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/slice.rst
index 0ee5af71c03..e9908904512 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/slice.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/strings/slice.rst
@@ -2,5 +2,5 @@
 slice
 =====
 
-.. automodule:: cudf._lib.pylibcudf.strings.slice
+.. automodule:: pylibcudf.strings.slice
    :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/table.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/table.rst
index d8337b6596d..e39ca18a12b 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/table.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/table.rst
@@ -2,5 +2,5 @@
 Table
 =====
 
-.. automodule:: cudf._lib.pylibcudf.table
+.. automodule:: pylibcudf.table
    :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/traits.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/traits.rst
index 294ca8dc78c..2cce7b9d7d7 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/traits.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/traits.rst
@@ -2,5 +2,5 @@
 traits
 ======
 
-.. automodule:: cudf._lib.pylibcudf.traits
+.. automodule:: pylibcudf.traits
    :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/transform.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/transform.rst
index ef04bbad7e6..839163f83fc 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/transform.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/transform.rst
@@ -2,5 +2,5 @@
 transform
 =========
 
-.. automodule:: cudf._lib.pylibcudf.transform
+.. automodule:: pylibcudf.transform
    :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/types.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/types.rst
index 8d5409bbd97..75521ac2f4d 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/types.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/types.rst
@@ -2,5 +2,5 @@
 types
 =====
 
-.. automodule:: cudf._lib.pylibcudf.types
+.. automodule:: pylibcudf.types
    :members:
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/unary.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/unary.rst
index add4baa0a54..34077242b90 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/unary.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/unary.rst
@@ -2,5 +2,5 @@
 unary
 =====
 
-.. automodule:: cudf._lib.pylibcudf.unary
+.. automodule:: pylibcudf.unary
    :members:
diff --git a/python/cudf/CMakeLists.txt b/python/cudf/CMakeLists.txt
index ecadbf5cbbc..e11d62b3bd5 100644
--- a/python/cudf/CMakeLists.txt
+++ b/python/cudf/CMakeLists.txt
@@ -79,7 +79,7 @@ if(NOT cudf_FOUND)
   # require access to libcudf, we place the library and all its dependent artifacts in the cudf
   # directory as a single source of truth and modify the other rpaths appropriately.
   set(cython_lib_dir cudf)
-  include(cmake/Modules/WheelHelpers.cmake)
+  include(../pylibcudf/cmake/Modules/WheelHelpers.cmake)
   # TODO: This install is currently overzealous. We should only install the libraries that are
   # downloaded by CPM during the build, not libraries that were found on the system.  However, in
   # practice right this would only be a problem is if libcudf was not found but some of the
@@ -92,7 +92,7 @@ endif()
 
 rapids_cython_init()
 
-include(cmake/Modules/LinkPyarrowHeaders.cmake)
+include(../pylibcudf/cmake/Modules/LinkPyarrowHeaders.cmake)
 add_subdirectory(cudf/_lib)
 add_subdirectory(udf_cpp)
 
diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt
index d32a2d8e3f8..d6182673308 100644
--- a/python/cudf/cudf/_lib/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/CMakeLists.txt
@@ -73,5 +73,4 @@ target_link_libraries(interop PUBLIC nanoarrow)
 
 add_subdirectory(io)
 add_subdirectory(nvtext)
-add_subdirectory(pylibcudf)
 add_subdirectory(strings)
diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py
index 34c0e29d0b1..918edb6d3f1 100644
--- a/python/cudf/cudf/_lib/__init__.py
+++ b/python/cudf/cudf/_lib/__init__.py
@@ -21,7 +21,6 @@
     orc,
     parquet,
     partitioning,
-    pylibcudf,
     quantiles,
     reduce,
     replace,
diff --git a/python/cudf/cudf/_lib/aggregation.pyx b/python/cudf/cudf/_lib/aggregation.pyx
index 1616c24eec2..7c91533cf93 100644
--- a/python/cudf/cudf/_lib/aggregation.pyx
+++ b/python/cudf/cudf/_lib/aggregation.pyx
@@ -3,8 +3,9 @@
 import pandas as pd
 from numba.np import numpy_support
 
+import pylibcudf
+
 import cudf
-from cudf._lib import pylibcudf
 from cudf._lib.types import SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES
 from cudf.utils import cudautils
 
diff --git a/python/cudf/cudf/_lib/avro.pyx b/python/cudf/cudf/_lib/avro.pyx
index 3c132b22880..b1759635a36 100644
--- a/python/cudf/cudf/_lib/avro.pyx
+++ b/python/cudf/cudf/_lib/avro.pyx
@@ -2,8 +2,8 @@
 
 from cudf._lib.utils cimport data_from_pylibcudf_io
 
-import cudf._lib.pylibcudf as plc
-from cudf._lib.pylibcudf.io.types import SourceInfo
+import pylibcudf as plc
+from pylibcudf.io.types import SourceInfo
 
 
 cpdef read_avro(datasource, columns=None, skip_rows=0, num_rows=-1):
diff --git a/python/cudf/cudf/_lib/binaryop.pyx b/python/cudf/cudf/_lib/binaryop.pyx
index 2e352dd7904..e2547476849 100644
--- a/python/cudf/cudf/_lib/binaryop.pyx
+++ b/python/cudf/cudf/_lib/binaryop.pyx
@@ -4,7 +4,8 @@ from cudf._lib.column cimport Column
 from cudf._lib.scalar cimport DeviceScalar
 from cudf._lib.types cimport dtype_to_pylibcudf_type
 
-from cudf._lib import pylibcudf
+import pylibcudf
+
 from cudf._lib.scalar import as_device_scalar
 from cudf.core.buffer import acquire_spill_lock
 
diff --git a/python/cudf/cudf/_lib/column.pxd b/python/cudf/cudf/_lib/column.pxd
index 437f44af9f0..8ceea4920e2 100644
--- a/python/cudf/cudf/_lib/column.pxd
+++ b/python/cudf/cudf/_lib/column.pxd
@@ -5,14 +5,13 @@ from typing import Literal
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 
-from rmm._lib.device_buffer cimport device_buffer
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport (
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport (
     column_view,
     mutable_column_view,
 )
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.types cimport size_type
+from rmm._lib.device_buffer cimport device_buffer
 
 
 cdef class Column:
diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx
index f0c07dfbc1b..2e400f775d3 100644
--- a/python/cudf/cudf/_lib/column.pyx
+++ b/python/cudf/cudf/_lib/column.pyx
@@ -7,11 +7,11 @@ import cupy as cp
 import numpy as np
 import pandas as pd
 
+import pylibcudf
 import rmm
 
 import cudf
 import cudf._lib as libcudf
-from cudf._lib import pylibcudf
 from cudf.core.buffer import (
     Buffer,
     ExposureTrackedBuffer,
@@ -39,18 +39,18 @@ from cudf._lib.types cimport (
 from cudf._lib.null_mask import bitmask_allocation_size_bytes
 from cudf._lib.types import dtype_from_pylibcudf_column
 
-
-cimport cudf._lib.pylibcudf.libcudf.copying as cpp_copying
-cimport cudf._lib.pylibcudf.libcudf.types as libcudf_types
-cimport cudf._lib.pylibcudf.libcudf.unary as libcudf_unary
-from cudf._lib.pylibcudf.libcudf.column.column cimport column, column_contents
-from cudf._lib.pylibcudf.libcudf.column.column_factories cimport (
+cimport pylibcudf.libcudf.copying as cpp_copying
+cimport pylibcudf.libcudf.types as libcudf_types
+cimport pylibcudf.libcudf.unary as libcudf_unary
+from pylibcudf.libcudf.column.column cimport column, column_contents
+from pylibcudf.libcudf.column.column_factories cimport (
     make_column_from_scalar as cpp_make_column_from_scalar,
     make_numeric_column,
 )
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.null_mask cimport null_count as cpp_null_count
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.null_mask cimport null_count as cpp_null_count
+from pylibcudf.libcudf.scalar.scalar cimport scalar
+
 from cudf._lib.scalar cimport DeviceScalar
 
 
diff --git a/python/cudf/cudf/_lib/concat.pyx b/python/cudf/cudf/_lib/concat.pyx
index 89ddcfee99e..e661059faa3 100644
--- a/python/cudf/cudf/_lib/concat.pyx
+++ b/python/cudf/cudf/_lib/concat.pyx
@@ -5,7 +5,8 @@ from libcpp cimport bool
 from cudf._lib.column cimport Column
 from cudf._lib.utils cimport data_from_pylibcudf_table
 
-from cudf._lib import pylibcudf
+import pylibcudf
+
 from cudf.core.buffer import acquire_spill_lock
 
 
diff --git a/python/cudf/cudf/_lib/copying.pxd b/python/cudf/cudf/_lib/copying.pxd
index 8fc7f4e1da0..14c7d2066d8 100644
--- a/python/cudf/cudf/_lib/copying.pxd
+++ b/python/cudf/cudf/_lib/copying.pxd
@@ -1,6 +1,6 @@
 # Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
-from cudf._lib.pylibcudf.libcudf.contiguous_split cimport packed_columns
+from pylibcudf.libcudf.contiguous_split cimport packed_columns
 
 
 cdef class _CPackedColumns:
diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx
index 796c70e615c..16182e31c08 100644
--- a/python/cudf/cudf/_lib/copying.pyx
+++ b/python/cudf/cudf/_lib/copying.pyx
@@ -10,8 +10,9 @@ from libcpp.vector cimport vector
 
 from rmm._lib.device_buffer cimport DeviceBuffer
 
+import pylibcudf
+
 import cudf
-from cudf._lib import pylibcudf
 from cudf.core.buffer import Buffer, acquire_spill_lock, as_buffer
 
 from cudf._lib.column cimport Column
@@ -26,17 +27,16 @@ from cudf.core.abc import Serializable
 
 from libcpp.memory cimport make_unique
 
-cimport cudf._lib.pylibcudf.libcudf.contiguous_split as cpp_contiguous_split
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.lists.gather cimport (
+cimport pylibcudf.libcudf.contiguous_split as cpp_contiguous_split
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.lists.gather cimport (
     segmented_gather as cpp_segmented_gather,
 )
-from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport (
-    lists_column_view,
-)
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view
+from pylibcudf.libcudf.scalar.scalar cimport scalar
+from pylibcudf.libcudf.types cimport size_type
+
 from cudf._lib.utils cimport columns_from_pylibcudf_table, data_from_table_view
 
 # workaround for https://github.com/cython/cython/issues/3885
diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx
index 099b61d62ae..e9aa97ecbc9 100644
--- a/python/cudf/cudf/_lib/csv.pyx
+++ b/python/cudf/cudf/_lib/csv.pyx
@@ -6,8 +6,9 @@ from libcpp.string cimport string
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
 
-cimport cudf._lib.pylibcudf.libcudf.types as libcudf_types
-from cudf._lib.pylibcudf.io.datasource cimport Datasource, NativeFileDatasource
+cimport pylibcudf.libcudf.types as libcudf_types
+from pylibcudf.io.datasource cimport Datasource, NativeFileDatasource
+
 from cudf._lib.types cimport dtype_to_pylibcudf_type
 
 import errno
@@ -23,22 +24,24 @@ from cudf.core.buffer import acquire_spill_lock
 
 from libcpp cimport bool
 
-from cudf._lib.io.utils cimport make_sink_info
-from cudf._lib.pylibcudf.libcudf.io.csv cimport (
+from pylibcudf.libcudf.io.csv cimport (
     csv_writer_options,
     write_csv as cpp_write_csv,
 )
-from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink
-from cudf._lib.pylibcudf.libcudf.io.types cimport compression_type, sink_info
-from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
+from pylibcudf.libcudf.io.data_sink cimport data_sink
+from pylibcudf.libcudf.io.types cimport compression_type, sink_info
+from pylibcudf.libcudf.table.table_view cimport table_view
+
+from cudf._lib.io.utils cimport make_sink_info
 from cudf._lib.utils cimport data_from_pylibcudf_io, table_view_from_table
 
 from pyarrow.lib import NativeFile
 
-import cudf._lib.pylibcudf as plc
+import pylibcudf as plc
+
 from cudf.api.types import is_hashable
 
-from cudf._lib.pylibcudf.types cimport DataType
+from pylibcudf.types cimport DataType
 
 CSV_HEX_TYPE_MAP = {
     "hex": np.dtype("int64"),
diff --git a/python/cudf/cudf/_lib/datetime.pyx b/python/cudf/cudf/_lib/datetime.pyx
index b30ef875a7b..483250dd36f 100644
--- a/python/cudf/cudf/_lib/datetime.pyx
+++ b/python/cudf/cudf/_lib/datetime.pyx
@@ -7,13 +7,14 @@ from cudf.core.buffer import acquire_spill_lock
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 
-cimport cudf._lib.pylibcudf.libcudf.datetime as libcudf_datetime
+cimport pylibcudf.libcudf.datetime as libcudf_datetime
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.filling cimport calendrical_month_sequence
+from pylibcudf.libcudf.scalar.scalar cimport scalar
+from pylibcudf.libcudf.types cimport size_type
+
 from cudf._lib.column cimport Column
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.filling cimport calendrical_month_sequence
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
 from cudf._lib.scalar cimport DeviceScalar
 
 
diff --git a/python/cudf/cudf/_lib/filling.pyx b/python/cudf/cudf/_lib/filling.pyx
index b7302f3d07a..b2f4c620144 100644
--- a/python/cudf/cudf/_lib/filling.pyx
+++ b/python/cudf/cudf/_lib/filling.pyx
@@ -2,12 +2,12 @@
 
 from cudf.core.buffer import acquire_spill_lock
 
-
 from cudf._lib.column cimport Column
 from cudf._lib.scalar cimport DeviceScalar
 from cudf._lib.utils cimport columns_from_pylibcudf_table
 
-from cudf._lib import pylibcudf
+import pylibcudf
+
 from cudf._lib.scalar import as_device_scalar
 
 
diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx
index 9d18e023fe8..c199ed96d4f 100644
--- a/python/cudf/cudf/_lib/groupby.pyx
+++ b/python/cudf/cudf/_lib/groupby.pyx
@@ -18,10 +18,11 @@ from cudf._lib.utils cimport columns_from_pylibcudf_table
 
 from cudf._lib.scalar import as_device_scalar
 
-from cudf._lib.pylibcudf.libcudf.replace cimport replace_policy
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
+from pylibcudf.libcudf.replace cimport replace_policy
+from pylibcudf.libcudf.scalar.scalar cimport scalar
+
+import pylibcudf
 
-from cudf._lib import pylibcudf
 from cudf._lib.aggregation import make_aggregation
 
 # The sets below define the possible aggregations that can be performed on
diff --git a/python/cudf/cudf/_lib/hash.pyx b/python/cudf/cudf/_lib/hash.pyx
index b8331d5a226..48f75b12a73 100644
--- a/python/cudf/cudf/_lib/hash.pyx
+++ b/python/cudf/cudf/_lib/hash.pyx
@@ -7,10 +7,9 @@ from libcpp.pair cimport pair
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
 
-cimport cudf._lib.pylibcudf.libcudf.types as libcudf_types
-from cudf._lib.column cimport Column
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.hash cimport (
+cimport pylibcudf.libcudf.types as libcudf_types
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.hash cimport (
     md5,
     murmurhash3_x86_32,
     sha1,
@@ -20,11 +19,13 @@ from cudf._lib.pylibcudf.libcudf.hash cimport (
     sha512,
     xxhash_64,
 )
-from cudf._lib.pylibcudf.libcudf.partitioning cimport (
+from pylibcudf.libcudf.partitioning cimport (
     hash_partition as cpp_hash_partition,
 )
-from cudf._lib.pylibcudf.libcudf.table.table cimport table
-from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.libcudf.table.table_view cimport table_view
+
+from cudf._lib.column cimport Column
 from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns
 
 
diff --git a/python/cudf/cudf/_lib/interop.pyx b/python/cudf/cudf/_lib/interop.pyx
index 37595b65e65..1dc586bb257 100644
--- a/python/cudf/cudf/_lib/interop.pyx
+++ b/python/cudf/cudf/_lib/interop.pyx
@@ -4,15 +4,16 @@ from cpython cimport pycapsule
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 
-from cudf._lib import pylibcudf
+import pylibcudf
 
-from cudf._lib.pylibcudf.libcudf.interop cimport (
+from pylibcudf.libcudf.interop cimport (
     DLManagedTensor,
     from_dlpack as cpp_from_dlpack,
     to_dlpack as cpp_to_dlpack,
 )
-from cudf._lib.pylibcudf.libcudf.table.table cimport table
-from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.libcudf.table.table_view cimport table_view
+
 from cudf._lib.utils cimport (
     columns_from_pylibcudf_table,
     columns_from_unique_ptr,
diff --git a/python/cudf/cudf/_lib/io/utils.pxd b/python/cudf/cudf/_lib/io/utils.pxd
index 680a87c789e..1938f00c179 100644
--- a/python/cudf/cudf/_lib/io/utils.pxd
+++ b/python/cudf/cudf/_lib/io/utils.pxd
@@ -3,14 +3,15 @@
 from libcpp.memory cimport unique_ptr
 from libcpp.vector cimport vector
 
-from cudf._lib.column cimport Column
-from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink
-from cudf._lib.pylibcudf.libcudf.io.types cimport (
+from pylibcudf.libcudf.io.data_sink cimport data_sink
+from pylibcudf.libcudf.io.types cimport (
     column_name_info,
     sink_info,
     source_info,
 )
 
+from cudf._lib.column cimport Column
+
 
 cdef source_info make_source_info(list src) except*
 cdef sink_info make_sinks_info(
diff --git a/python/cudf/cudf/_lib/io/utils.pyx b/python/cudf/cudf/_lib/io/utils.pyx
index 58956b9e9b7..b1900138d94 100644
--- a/python/cudf/cudf/_lib/io/utils.pyx
+++ b/python/cudf/cudf/_lib/io/utils.pyx
@@ -7,17 +7,18 @@ from libcpp.string cimport string
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
 
-from cudf._lib.column cimport Column
-from cudf._lib.pylibcudf.io.datasource cimport Datasource
-from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink
-from cudf._lib.pylibcudf.libcudf.io.datasource cimport datasource
-from cudf._lib.pylibcudf.libcudf.io.types cimport (
+from pylibcudf.io.datasource cimport Datasource
+from pylibcudf.libcudf.io.data_sink cimport data_sink
+from pylibcudf.libcudf.io.datasource cimport datasource
+from pylibcudf.libcudf.io.types cimport (
     column_name_info,
     host_buffer,
     sink_info,
     source_info,
 )
 
+from cudf._lib.column cimport Column
+
 import codecs
 import errno
 import io
diff --git a/python/cudf/cudf/_lib/join.pyx b/python/cudf/cudf/_lib/join.pyx
index 0a54f0d67a0..2559358c21f 100644
--- a/python/cudf/cudf/_lib/join.pyx
+++ b/python/cudf/cudf/_lib/join.pyx
@@ -4,7 +4,7 @@ from cudf.core.buffer import acquire_spill_lock
 
 from cudf._lib.column cimport Column
 
-from cudf._lib import pylibcudf
+import pylibcudf
 
 # The functions below return the *gathermaps* that represent
 # the join result when joining on the keys `lhs` and `rhs`.
diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx
index 03bf9ed8b75..9bbbcf60dcf 100644
--- a/python/cudf/cudf/_lib/json.pyx
+++ b/python/cudf/cudf/_lib/json.pyx
@@ -9,18 +9,19 @@ from cudf.core.buffer import acquire_spill_lock
 
 from libcpp cimport bool
 
-cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types
+cimport pylibcudf.libcudf.io.types as cudf_io_types
+from pylibcudf.io.types cimport compression_type
+from pylibcudf.libcudf.io.json cimport json_recovery_mode_t
+from pylibcudf.libcudf.io.types cimport compression_type
+from pylibcudf.libcudf.types cimport data_type, type_id
+from pylibcudf.types cimport DataType
+
 from cudf._lib.column cimport Column
 from cudf._lib.io.utils cimport add_df_col_struct_names
-from cudf._lib.pylibcudf.io.types cimport compression_type
-from cudf._lib.pylibcudf.libcudf.io.json cimport json_recovery_mode_t
-from cudf._lib.pylibcudf.libcudf.io.types cimport compression_type
-from cudf._lib.pylibcudf.libcudf.types cimport data_type, type_id
-from cudf._lib.pylibcudf.types cimport DataType
 from cudf._lib.types cimport dtype_to_data_type
 from cudf._lib.utils cimport _data_from_columns, data_from_pylibcudf_io
 
-import cudf._lib.pylibcudf as plc
+import pylibcudf as plc
 
 
 cdef json_recovery_mode_t _get_json_recovery_mode(object on_bad_lines):
diff --git a/python/cudf/cudf/_lib/labeling.pyx b/python/cudf/cudf/_lib/labeling.pyx
index 439a727a9ca..2e1959a348d 100644
--- a/python/cudf/cudf/_lib/labeling.pyx
+++ b/python/cudf/cudf/_lib/labeling.pyx
@@ -6,13 +6,11 @@ from libcpp cimport bool as cbool
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.labeling cimport inclusive, label_bins as cpp_label_bins
+
 from cudf._lib.column cimport Column
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.labeling cimport (
-    inclusive,
-    label_bins as cpp_label_bins,
-)
 
 
 # Note that the parameter input shadows a Python built-in in the local scope,
diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx
index f6d9c8c404c..7e8710bedb6 100644
--- a/python/cudf/cudf/_lib/lists.pyx
+++ b/python/cudf/cudf/_lib/lists.pyx
@@ -4,13 +4,14 @@ from cudf.core.buffer import acquire_spill_lock
 
 from libcpp cimport bool
 
+from pylibcudf.libcudf.types cimport null_order, size_type
+
 from cudf._lib.column cimport Column
-from cudf._lib.pylibcudf.libcudf.types cimport null_order, size_type
 from cudf._lib.utils cimport columns_from_pylibcudf_table
 
-from cudf._lib import pylibcudf
+import pylibcudf
 
-from cudf._lib.pylibcudf cimport Scalar
+from pylibcudf cimport Scalar
 
 
 @acquire_spill_lock()
diff --git a/python/cudf/cudf/_lib/merge.pyx b/python/cudf/cudf/_lib/merge.pyx
index fe7f7ad2918..9372acdab44 100644
--- a/python/cudf/cudf/_lib/merge.pyx
+++ b/python/cudf/cudf/_lib/merge.pyx
@@ -4,7 +4,7 @@ from libcpp cimport bool
 
 from cudf._lib.utils cimport columns_from_pylibcudf_table
 
-from cudf._lib import pylibcudf
+import pylibcudf
 
 
 def merge_sorted(
diff --git a/python/cudf/cudf/_lib/null_mask.pyx b/python/cudf/cudf/_lib/null_mask.pyx
index b00deae2270..3a7b6a59bf3 100644
--- a/python/cudf/cudf/_lib/null_mask.pyx
+++ b/python/cudf/cudf/_lib/null_mask.pyx
@@ -10,9 +10,8 @@ from libcpp.memory cimport make_unique, unique_ptr
 from libcpp.pair cimport pair
 from libcpp.utility cimport move
 
-from cudf._lib.column cimport Column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.null_mask cimport (
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.null_mask cimport (
     bitmask_allocation_size_bytes as cpp_bitmask_allocation_size_bytes,
     bitmask_and as cpp_bitmask_and,
     bitmask_or as cpp_bitmask_or,
@@ -20,8 +19,10 @@ from cudf._lib.pylibcudf.libcudf.null_mask cimport (
     create_null_mask as cpp_create_null_mask,
     underlying_type_t_mask_state,
 )
-from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
-from cudf._lib.pylibcudf.libcudf.types cimport mask_state, size_type
+from pylibcudf.libcudf.table.table_view cimport table_view
+from pylibcudf.libcudf.types cimport mask_state, size_type
+
+from cudf._lib.column cimport Column
 from cudf._lib.utils cimport table_view_from_columns
 
 
diff --git a/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx b/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx
index d60162d0656..0d768e24f39 100644
--- a/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx
+++ b/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx
@@ -6,15 +6,16 @@ from cudf.core.buffer import acquire_spill_lock
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 
-from cudf._lib.column cimport Column
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.nvtext.byte_pair_encode cimport (
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.nvtext.byte_pair_encode cimport (
     bpe_merge_pairs as cpp_bpe_merge_pairs,
     byte_pair_encoding as cpp_byte_pair_encoding,
     load_merge_pairs as cpp_load_merge_pairs,
 )
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+
+from cudf._lib.column cimport Column
 from cudf._lib.scalar cimport DeviceScalar
 
 
diff --git a/python/cudf/cudf/_lib/nvtext/edit_distance.pyx b/python/cudf/cudf/_lib/nvtext/edit_distance.pyx
index 514b6610575..e3c2273345a 100644
--- a/python/cudf/cudf/_lib/nvtext/edit_distance.pyx
+++ b/python/cudf/cudf/_lib/nvtext/edit_distance.pyx
@@ -5,14 +5,15 @@ from cudf.core.buffer import acquire_spill_lock
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 
-from cudf._lib.column cimport Column
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.nvtext.edit_distance cimport (
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.nvtext.edit_distance cimport (
     edit_distance as cpp_edit_distance,
     edit_distance_matrix as cpp_edit_distance_matrix,
 )
 
+from cudf._lib.column cimport Column
+
 
 @acquire_spill_lock()
 def edit_distance(Column strings, Column targets):
diff --git a/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx b/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx
index a6b9a1e4f7a..6591b527eec 100644
--- a/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx
+++ b/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx
@@ -5,16 +5,17 @@ from cudf.core.buffer import acquire_spill_lock
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 
-from cudf._lib.column cimport Column
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.nvtext.generate_ngrams cimport (
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.nvtext.generate_ngrams cimport (
     generate_character_ngrams as cpp_generate_character_ngrams,
     generate_ngrams as cpp_generate_ngrams,
     hash_character_ngrams as cpp_hash_character_ngrams,
 )
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.types cimport size_type
+
+from cudf._lib.column cimport Column
 from cudf._lib.scalar cimport DeviceScalar
 
 
diff --git a/python/cudf/cudf/_lib/nvtext/jaccard.pyx b/python/cudf/cudf/_lib/nvtext/jaccard.pyx
index 42fe15d6869..0ebf7c281e3 100644
--- a/python/cudf/cudf/_lib/nvtext/jaccard.pyx
+++ b/python/cudf/cudf/_lib/nvtext/jaccard.pyx
@@ -5,13 +5,14 @@ from cudf.core.buffer import acquire_spill_lock
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 
-from cudf._lib.column cimport Column
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.nvtext.jaccard cimport (
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.nvtext.jaccard cimport (
     jaccard_index as cpp_jaccard_index,
 )
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.types cimport size_type
+
+from cudf._lib.column cimport Column
 
 
 @acquire_spill_lock()
diff --git a/python/cudf/cudf/_lib/nvtext/minhash.pyx b/python/cudf/cudf/_lib/nvtext/minhash.pyx
index 4c92999e190..5ee15d0e409 100644
--- a/python/cudf/cudf/_lib/nvtext/minhash.pyx
+++ b/python/cudf/cudf/_lib/nvtext/minhash.pyx
@@ -5,14 +5,15 @@ from cudf.core.buffer import acquire_spill_lock
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 
-from cudf._lib.column cimport Column
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.nvtext.minhash cimport (
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.nvtext.minhash cimport (
     minhash as cpp_minhash,
     minhash64 as cpp_minhash64,
 )
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.types cimport size_type
+
+from cudf._lib.column cimport Column
 
 
 @acquire_spill_lock()
diff --git a/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx b/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx
index ccd8de8c96f..dec4f037d98 100644
--- a/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx
+++ b/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx
@@ -5,14 +5,15 @@ from cudf.core.buffer import acquire_spill_lock
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 
-from cudf._lib.column cimport Column
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.nvtext.ngrams_tokenize cimport (
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.nvtext.ngrams_tokenize cimport (
     ngrams_tokenize as cpp_ngrams_tokenize,
 )
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.types cimport size_type
+
+from cudf._lib.column cimport Column
 from cudf._lib.scalar cimport DeviceScalar
 
 
diff --git a/python/cudf/cudf/_lib/nvtext/normalize.pyx b/python/cudf/cudf/_lib/nvtext/normalize.pyx
index 9f81f865bb7..5e86a9ce959 100644
--- a/python/cudf/cudf/_lib/nvtext/normalize.pyx
+++ b/python/cudf/cudf/_lib/nvtext/normalize.pyx
@@ -6,14 +6,15 @@ from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 
-from cudf._lib.column cimport Column
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.nvtext.normalize cimport (
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.nvtext.normalize cimport (
     normalize_characters as cpp_normalize_characters,
     normalize_spaces as cpp_normalize_spaces,
 )
 
+from cudf._lib.column cimport Column
+
 
 @acquire_spill_lock()
 def normalize_spaces(Column strings):
diff --git a/python/cudf/cudf/_lib/nvtext/replace.pyx b/python/cudf/cudf/_lib/nvtext/replace.pyx
index ce2edc58d19..61ae3da5782 100644
--- a/python/cudf/cudf/_lib/nvtext/replace.pyx
+++ b/python/cudf/cudf/_lib/nvtext/replace.pyx
@@ -5,15 +5,16 @@ from cudf.core.buffer import acquire_spill_lock
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 
-from cudf._lib.column cimport Column
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.nvtext.replace cimport (
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.nvtext.replace cimport (
     filter_tokens as cpp_filter_tokens,
     replace_tokens as cpp_replace_tokens,
 )
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.types cimport size_type
+
+from cudf._lib.column cimport Column
 from cudf._lib.scalar cimport DeviceScalar
 
 
diff --git a/python/cudf/cudf/_lib/nvtext/stemmer.pyx b/python/cudf/cudf/_lib/nvtext/stemmer.pyx
index 8f75953ae99..5bf25562fed 100644
--- a/python/cudf/cudf/_lib/nvtext/stemmer.pyx
+++ b/python/cudf/cudf/_lib/nvtext/stemmer.pyx
@@ -7,16 +7,17 @@ from libcpp.utility cimport move
 
 from enum import IntEnum
 
-from cudf._lib.column cimport Column
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.nvtext.stemmer cimport (
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.nvtext.stemmer cimport (
     is_letter as cpp_is_letter,
     letter_type,
     porter_stemmer_measure as cpp_porter_stemmer_measure,
     underlying_type_t_letter_type,
 )
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.types cimport size_type
+
+from cudf._lib.column cimport Column
 
 
 class LetterType(IntEnum):
diff --git a/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx b/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx
index 1112667a087..ee442ece5c6 100644
--- a/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx
+++ b/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx
@@ -9,9 +9,8 @@ from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 from libcpp.utility cimport move
 
-from cudf._lib.column cimport Column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.nvtext.subword_tokenize cimport (
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.nvtext.subword_tokenize cimport (
     hashed_vocabulary as cpp_hashed_vocabulary,
     load_vocabulary_file as cpp_load_vocabulary_file,
     move as tr_move,
@@ -19,6 +18,8 @@ from cudf._lib.pylibcudf.libcudf.nvtext.subword_tokenize cimport (
     tokenizer_result as cpp_tokenizer_result,
 )
 
+from cudf._lib.column cimport Column
+
 
 cdef class Hashed_Vocabulary:
     cdef unique_ptr[cpp_hashed_vocabulary] c_obj
diff --git a/python/cudf/cudf/_lib/nvtext/tokenize.pyx b/python/cudf/cudf/_lib/nvtext/tokenize.pyx
index 98afd94ab1c..a7e63f1e9ae 100644
--- a/python/cudf/cudf/_lib/nvtext/tokenize.pyx
+++ b/python/cudf/cudf/_lib/nvtext/tokenize.pyx
@@ -5,10 +5,9 @@ from cudf.core.buffer import acquire_spill_lock
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 
-from cudf._lib.column cimport Column
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.nvtext.tokenize cimport (
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.nvtext.tokenize cimport (
     character_tokenize as cpp_character_tokenize,
     count_tokens as cpp_count_tokens,
     detokenize as cpp_detokenize,
@@ -17,8 +16,10 @@ from cudf._lib.pylibcudf.libcudf.nvtext.tokenize cimport (
     tokenize_vocabulary as cpp_tokenize_vocabulary,
     tokenize_with_vocabulary as cpp_tokenize_with_vocabulary,
 )
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.types cimport size_type
+
+from cudf._lib.column cimport Column
 from cudf._lib.scalar cimport DeviceScalar
 
 
diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx
index 9609e3131b4..d506dcd4346 100644
--- a/python/cudf/cudf/_lib/orc.pyx
+++ b/python/cudf/cudf/_lib/orc.pyx
@@ -14,23 +14,17 @@ from libcpp.vector cimport vector
 import datetime
 from collections import OrderedDict
 
-cimport cudf._lib.pylibcudf.libcudf.lists.lists_column_view as cpp_lists_column_view
+cimport pylibcudf.libcudf.lists.lists_column_view as cpp_lists_column_view
 
 try:
     import ujson as json
 except ImportError:
     import json
 
-cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types
-from cudf._lib.column cimport Column
-from cudf._lib.io.utils cimport (
-    make_sink_info,
-    make_source_info,
-    update_column_struct_field_names,
-)
-from cudf._lib.pylibcudf.io.datasource cimport NativeFileDatasource
-from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink
-from cudf._lib.pylibcudf.libcudf.io.orc cimport (
+cimport pylibcudf.libcudf.io.types as cudf_io_types
+from pylibcudf.io.datasource cimport NativeFileDatasource
+from pylibcudf.libcudf.io.data_sink cimport data_sink
+from pylibcudf.libcudf.io.orc cimport (
     chunked_orc_writer_options,
     orc_chunked_writer,
     orc_reader_options,
@@ -38,7 +32,7 @@ from cudf._lib.pylibcudf.libcudf.io.orc cimport (
     read_orc as libcudf_read_orc,
     write_orc as libcudf_write_orc,
 )
-from cudf._lib.pylibcudf.libcudf.io.orc_metadata cimport (
+from pylibcudf.libcudf.io.orc_metadata cimport (
     binary_statistics,
     bucket_statistics,
     column_statistics,
@@ -53,7 +47,7 @@ from cudf._lib.pylibcudf.libcudf.io.orc_metadata cimport (
     string_statistics,
     timestamp_statistics,
 )
-from cudf._lib.pylibcudf.libcudf.io.types cimport (
+from pylibcudf.libcudf.io.types cimport (
     column_in_metadata,
     compression_type,
     sink_info,
@@ -61,9 +55,16 @@ from cudf._lib.pylibcudf.libcudf.io.types cimport (
     table_input_metadata,
     table_with_metadata,
 )
-from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
-from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type, type_id
-from cudf._lib.variant cimport get_if as std_get_if, holds_alternative
+from pylibcudf.libcudf.table.table_view cimport table_view
+from pylibcudf.libcudf.types cimport data_type, size_type, type_id
+from pylibcudf.variant cimport get_if as std_get_if, holds_alternative
+
+from cudf._lib.column cimport Column
+from cudf._lib.io.utils cimport (
+    make_sink_info,
+    make_source_info,
+    update_column_struct_field_names,
+)
 
 from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES
 
diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
index 0fffb6ade58..4bfb79ff651 100644
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ b/python/cudf/cudf/_lib/parquet.pyx
@@ -31,40 +31,43 @@ from libcpp.unordered_map cimport unordered_map
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
 
-cimport cudf._lib.pylibcudf.libcudf.io.data_sink as cudf_io_data_sink
-cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types
-from cudf._lib.column cimport Column
-from cudf._lib.io.utils cimport (
-    add_df_col_struct_names,
-    make_sinks_info,
-    make_source_info,
-)
-from cudf._lib.pylibcudf.expressions cimport Expression
-from cudf._lib.pylibcudf.io.datasource cimport NativeFileDatasource
-from cudf._lib.pylibcudf.io.parquet cimport ChunkedParquetReader
-from cudf._lib.pylibcudf.libcudf.io.parquet cimport (
+cimport pylibcudf.libcudf.io.data_sink as cudf_io_data_sink
+cimport pylibcudf.libcudf.io.types as cudf_io_types
+from pylibcudf.expressions cimport Expression
+from pylibcudf.io.datasource cimport NativeFileDatasource
+from pylibcudf.io.parquet cimport ChunkedParquetReader
+from pylibcudf.libcudf.io.parquet cimport (
     chunked_parquet_writer_options,
     merge_row_group_metadata as parquet_merge_metadata,
     parquet_chunked_writer as cpp_parquet_chunked_writer,
     parquet_writer_options,
     write_parquet as parquet_writer,
 )
-from cudf._lib.pylibcudf.libcudf.io.parquet_metadata cimport (
+from pylibcudf.libcudf.io.parquet_metadata cimport (
     parquet_metadata,
     read_parquet_metadata as parquet_metadata_reader,
 )
-from cudf._lib.pylibcudf.libcudf.io.types cimport (
+from pylibcudf.libcudf.io.types cimport (
     column_in_metadata,
     table_input_metadata,
 )
-from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.table.table_view cimport table_view
+from pylibcudf.libcudf.types cimport size_type
+
+from cudf._lib.column cimport Column
+from cudf._lib.io.utils cimport (
+    add_df_col_struct_names,
+    make_sinks_info,
+    make_source_info,
+)
 from cudf._lib.utils cimport table_view_from_table
 
 from pyarrow.lib import NativeFile
 
-import cudf._lib.pylibcudf as plc
-from cudf._lib.pylibcudf cimport Table
+import pylibcudf as plc
+
+from pylibcudf cimport Table
+
 from cudf.utils.ioutils import _ROW_GROUP_SIZE_BYTES_DEFAULT
 
 
diff --git a/python/cudf/cudf/_lib/partitioning.pyx b/python/cudf/cudf/_lib/partitioning.pyx
index 708ec4174aa..d94f0e1b564 100644
--- a/python/cudf/cudf/_lib/partitioning.pyx
+++ b/python/cudf/cudf/_lib/partitioning.pyx
@@ -7,19 +7,18 @@ from libcpp.pair cimport pair
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
 
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.partitioning cimport partition as cpp_partition
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.libcudf.table.table_view cimport table_view
+
 from cudf._lib.column cimport Column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.partitioning cimport (
-    partition as cpp_partition,
-)
-from cudf._lib.pylibcudf.libcudf.table.table cimport table
-from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
 from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns
 
 from cudf._lib.reduce import minmax
 from cudf._lib.stream_compaction import distinct_count as cpp_distinct_count
 
-cimport cudf._lib.pylibcudf.libcudf.types as libcudf_types
+cimport pylibcudf.libcudf.types as libcudf_types
 
 
 @acquire_spill_lock()
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/avro.pxd b/python/cudf/cudf/_lib/pylibcudf/io/avro.pxd
deleted file mode 100644
index 3695f36a6e7..00000000000
--- a/python/cudf/cudf/_lib/pylibcudf/io/avro.pxd
+++ /dev/null
@@ -1,12 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-from cudf._lib.pylibcudf.io.types cimport SourceInfo, TableWithMetadata
-from cudf._lib.pylibcudf.libcudf.io.avro cimport avro_reader_options
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
-
-
-cpdef TableWithMetadata read_avro(
-    SourceInfo source_info,
-    list columns = *,
-    size_type skip_rows = *,
-    size_type num_rows = *
-)
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/extract.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/extract.pxd
deleted file mode 100644
index 57903ca27de..00000000000
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/extract.pxd
+++ /dev/null
@@ -1,15 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
-
-from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program
-from cudf._lib.pylibcudf.libcudf.table.table cimport table
-
-
-cdef extern from "cudf/strings/extract.hpp" namespace "cudf::strings" nogil:
-
-    cdef unique_ptr[table] extract(
-        column_view source_strings,
-        regex_program) except +
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/char_types.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/char_types.pxd
deleted file mode 100644
index a80e02f520c..00000000000
--- a/python/cudf/cudf/_lib/pylibcudf/strings/char_types.pxd
+++ /dev/null
@@ -1,5 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-
-from cudf._lib.pylibcudf.libcudf.strings.char_types cimport (
-    string_character_types,
-)
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/contains.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/contains.pxd
deleted file mode 100644
index 275aa95d97e..00000000000
--- a/python/cudf/cudf/_lib/pylibcudf/strings/contains.pxd
+++ /dev/null
@@ -1,7 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-
-from cudf._lib.pylibcudf.column cimport Column
-from cudf._lib.pylibcudf.strings.regex_program cimport RegexProgram
-
-
-cpdef Column contains_re(Column input, RegexProgram prog)
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pxd b/python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pxd
deleted file mode 100644
index 79937bf574a..00000000000
--- a/python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pxd
+++ /dev/null
@@ -1,2 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
-from cudf._lib.pylibcudf.libcudf.strings.regex_flags cimport regex_flags
diff --git a/python/cudf/cudf/_lib/quantiles.pyx b/python/cudf/cudf/_lib/quantiles.pyx
index 7b50c00919a..7666b7ff8da 100644
--- a/python/cudf/cudf/_lib/quantiles.pyx
+++ b/python/cudf/cudf/_lib/quantiles.pyx
@@ -13,10 +13,11 @@ from cudf._lib.types cimport (
 
 from cudf._lib.types import Interpolation
 
-from cudf._lib.pylibcudf.libcudf.types cimport interpolation, sorted
+from pylibcudf.libcudf.types cimport interpolation, sorted
+
 from cudf._lib.utils cimport columns_from_pylibcudf_table
 
-import cudf._lib.pylibcudf as plc
+import pylibcudf as plc
 
 
 @acquire_spill_lock()
diff --git a/python/cudf/cudf/_lib/reduce.pyx b/python/cudf/cudf/_lib/reduce.pyx
index 511bba20ef5..944753d28b8 100644
--- a/python/cudf/cudf/_lib/reduce.pyx
+++ b/python/cudf/cudf/_lib/reduce.pyx
@@ -8,7 +8,8 @@ from cudf._lib.column cimport Column
 from cudf._lib.scalar cimport DeviceScalar
 from cudf._lib.types cimport dtype_to_pylibcudf_type, is_decimal_type_id
 
-from cudf._lib import pylibcudf
+import pylibcudf
+
 from cudf._lib.aggregation import make_aggregation
 
 
diff --git a/python/cudf/cudf/_lib/replace.pyx b/python/cudf/cudf/_lib/replace.pyx
index 2b5f32c7675..b50c6dd25e3 100644
--- a/python/cudf/cudf/_lib/replace.pyx
+++ b/python/cudf/cudf/_lib/replace.pyx
@@ -6,7 +6,8 @@ from cudf.core.buffer import acquire_spill_lock
 from cudf._lib.column cimport Column
 from cudf._lib.scalar cimport DeviceScalar
 
-from cudf._lib import pylibcudf
+import pylibcudf
+
 from cudf._lib.scalar import as_device_scalar
 
 
diff --git a/python/cudf/cudf/_lib/reshape.pyx b/python/cudf/cudf/_lib/reshape.pyx
index 6bba8f0df35..6cebeb2bc16 100644
--- a/python/cudf/cudf/_lib/reshape.pyx
+++ b/python/cudf/cudf/_lib/reshape.pyx
@@ -2,11 +2,12 @@
 
 from cudf.core.buffer import acquire_spill_lock
 
+from pylibcudf.libcudf.types cimport size_type
+
 from cudf._lib.column cimport Column
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
 from cudf._lib.utils cimport columns_from_pylibcudf_table
 
-import cudf._lib.pylibcudf as plc
+import pylibcudf as plc
 
 
 @acquire_spill_lock()
diff --git a/python/cudf/cudf/_lib/rolling.pyx b/python/cudf/cudf/_lib/rolling.pyx
index 5439e70fdce..687b261c2c7 100644
--- a/python/cudf/cudf/_lib/rolling.pyx
+++ b/python/cudf/cudf/_lib/rolling.pyx
@@ -4,7 +4,8 @@ from cudf.core.buffer import acquire_spill_lock
 
 from cudf._lib.column cimport Column
 
-from cudf._lib import pylibcudf
+import pylibcudf
+
 from cudf._lib.aggregation import make_aggregation
 
 
diff --git a/python/cudf/cudf/_lib/round.pyx b/python/cudf/cudf/_lib/round.pyx
index f8ad57947c8..f961c09e6f6 100644
--- a/python/cudf/cudf/_lib/round.pyx
+++ b/python/cudf/cudf/_lib/round.pyx
@@ -4,8 +4,8 @@ from cudf.core.buffer import acquire_spill_lock
 
 from cudf._lib.column cimport Column
 
-import cudf._lib.pylibcudf as plc
-from cudf._lib.pylibcudf.round import RoundingMethod
+import pylibcudf as plc
+from pylibcudf.round import RoundingMethod
 
 
 @acquire_spill_lock()
diff --git a/python/cudf/cudf/_lib/scalar.pxd b/python/cudf/cudf/_lib/scalar.pxd
index b57acbb37f1..27095ca02d4 100644
--- a/python/cudf/cudf/_lib/scalar.pxd
+++ b/python/cudf/cudf/_lib/scalar.pxd
@@ -3,10 +3,9 @@
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 
+from pylibcudf.libcudf.scalar.scalar cimport scalar
 from rmm._lib.memory_resource cimport DeviceMemoryResource
 
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
-
 
 cdef class DeviceScalar:
     cdef public object c_value
diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx
index e68398498d1..0dde91316fb 100644
--- a/python/cudf/cudf/_lib/scalar.pyx
+++ b/python/cudf/cudf/_lib/scalar.pyx
@@ -11,38 +11,40 @@ from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 
+import pylibcudf
+
 import cudf
-from cudf._lib import pylibcudf
 from cudf._lib.types import LIBCUDF_TO_SUPPORTED_NUMPY_TYPES
 from cudf.core.dtypes import ListDtype, StructDtype
 from cudf.core.missing import NA, NaT
 
-cimport cudf._lib.pylibcudf.libcudf.types as libcudf_types
+cimport pylibcudf.libcudf.types as libcudf_types
 # We currently need this cimport because some of the implementations here
 # access the c_obj of the scalar, and because we need to be able to call
 # pylibcudf.Scalar.from_libcudf. Both of those are temporarily acceptable until
 # DeviceScalar is phased out entirely from cuDF Cython (at which point
 # cudf.Scalar will be directly backed by pylibcudf.Scalar).
-from cudf._lib.pylibcudf cimport Scalar as plc_Scalar
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport (
+from pylibcudf cimport Scalar as plc_Scalar
+from pylibcudf.libcudf.scalar.scalar cimport (
     duration_scalar,
     list_scalar,
     scalar,
     struct_scalar,
     timestamp_scalar,
 )
-from cudf._lib.pylibcudf.libcudf.wrappers.durations cimport (
+from pylibcudf.libcudf.wrappers.durations cimport (
     duration_ms,
     duration_ns,
     duration_s,
     duration_us,
 )
-from cudf._lib.pylibcudf.libcudf.wrappers.timestamps cimport (
+from pylibcudf.libcudf.wrappers.timestamps cimport (
     timestamp_ms,
     timestamp_ns,
     timestamp_s,
     timestamp_us,
 )
+
 from cudf._lib.types cimport dtype_from_column_view, underlying_type_t_type_id
 
 
diff --git a/python/cudf/cudf/_lib/search.pyx b/python/cudf/cudf/_lib/search.pyx
index 1ee73949fd3..8108361052b 100644
--- a/python/cudf/cudf/_lib/search.pyx
+++ b/python/cudf/cudf/_lib/search.pyx
@@ -4,7 +4,7 @@ from cudf.core.buffer import acquire_spill_lock
 
 from cudf._lib.column cimport Column
 
-from cudf._lib import pylibcudf
+import pylibcudf
 
 
 @acquire_spill_lock()
diff --git a/python/cudf/cudf/_lib/sort.pyx b/python/cudf/cudf/_lib/sort.pyx
index ff9565b9a89..185552ede82 100644
--- a/python/cudf/cudf/_lib/sort.pyx
+++ b/python/cudf/cudf/_lib/sort.pyx
@@ -9,18 +9,19 @@ from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
 
+from pylibcudf.libcudf.aggregation cimport rank_method
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.search cimport lower_bound, upper_bound
+from pylibcudf.libcudf.table.table_view cimport table_view
+from pylibcudf.libcudf.types cimport null_order, order as cpp_order
+
 from cudf._lib.column cimport Column
-from cudf._lib.pylibcudf.libcudf.aggregation cimport rank_method
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.search cimport lower_bound, upper_bound
-from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
-from cudf._lib.pylibcudf.libcudf.types cimport null_order, order as cpp_order
 from cudf._lib.utils cimport (
     columns_from_pylibcudf_table,
     table_view_from_columns,
 )
 
-from cudf._lib import pylibcudf
+import pylibcudf
 
 
 @acquire_spill_lock()
diff --git a/python/cudf/cudf/_lib/stream_compaction.pyx b/python/cudf/cudf/_lib/stream_compaction.pyx
index 834f91f48d9..1b8831940e3 100644
--- a/python/cudf/cudf/_lib/stream_compaction.pyx
+++ b/python/cudf/cudf/_lib/stream_compaction.pyx
@@ -7,7 +7,7 @@ from libcpp cimport bool
 from cudf._lib.column cimport Column
 from cudf._lib.utils cimport columns_from_pylibcudf_table
 
-from cudf._lib import pylibcudf
+import pylibcudf
 
 
 @acquire_spill_lock()
diff --git a/python/cudf/cudf/_lib/string_casting.pyx b/python/cudf/cudf/_lib/string_casting.pyx
index dfad7fd101c..8d463829a19 100644
--- a/python/cudf/cudf/_lib/string_casting.pyx
+++ b/python/cudf/cudf/_lib/string_casting.pyx
@@ -12,39 +12,40 @@ from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 from libcpp.utility cimport move
 
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from cudf._lib.pylibcudf.libcudf.strings.convert.convert_booleans cimport (
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.strings.convert.convert_booleans cimport (
     from_booleans as cpp_from_booleans,
     to_booleans as cpp_to_booleans,
 )
-from cudf._lib.pylibcudf.libcudf.strings.convert.convert_datetime cimport (
+from pylibcudf.libcudf.strings.convert.convert_datetime cimport (
     from_timestamps as cpp_from_timestamps,
     is_timestamp as cpp_is_timestamp,
     to_timestamps as cpp_to_timestamps,
 )
-from cudf._lib.pylibcudf.libcudf.strings.convert.convert_durations cimport (
+from pylibcudf.libcudf.strings.convert.convert_durations cimport (
     from_durations as cpp_from_durations,
     to_durations as cpp_to_durations,
 )
-from cudf._lib.pylibcudf.libcudf.strings.convert.convert_floats cimport (
+from pylibcudf.libcudf.strings.convert.convert_floats cimport (
     from_floats as cpp_from_floats,
     to_floats as cpp_to_floats,
 )
-from cudf._lib.pylibcudf.libcudf.strings.convert.convert_integers cimport (
+from pylibcudf.libcudf.strings.convert.convert_integers cimport (
     from_integers as cpp_from_integers,
     hex_to_integers as cpp_hex_to_integers,
     integers_to_hex as cpp_integers_to_hex,
     is_hex as cpp_is_hex,
     to_integers as cpp_to_integers,
 )
-from cudf._lib.pylibcudf.libcudf.strings.convert.convert_ipv4 cimport (
+from pylibcudf.libcudf.strings.convert.convert_ipv4 cimport (
     integers_to_ipv4 as cpp_integers_to_ipv4,
     ipv4_to_integers as cpp_ipv4_to_integers,
     is_ipv4 as cpp_is_ipv4,
 )
-from cudf._lib.pylibcudf.libcudf.types cimport data_type, type_id
+from pylibcudf.libcudf.types cimport data_type, type_id
+
 from cudf._lib.types cimport underlying_type_t_type_id
 
 import cudf
diff --git a/python/cudf/cudf/_lib/strings/attributes.pyx b/python/cudf/cudf/_lib/strings/attributes.pyx
index 1f3d7c4eb1b..fe8c17c9e31 100644
--- a/python/cudf/cudf/_lib/strings/attributes.pyx
+++ b/python/cudf/cudf/_lib/strings/attributes.pyx
@@ -5,15 +5,16 @@ from cudf.core.buffer import acquire_spill_lock
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 
-from cudf._lib.column cimport Column
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.strings.attributes cimport (
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.strings.attributes cimport (
     code_points as cpp_code_points,
     count_bytes as cpp_count_bytes,
     count_characters as cpp_count_characters,
 )
 
+from cudf._lib.column cimport Column
+
 
 @acquire_spill_lock()
 def count_characters(Column source_strings):
diff --git a/python/cudf/cudf/_lib/strings/capitalize.pyx b/python/cudf/cudf/_lib/strings/capitalize.pyx
index b3ca6a5ac8f..42c40e2e753 100644
--- a/python/cudf/cudf/_lib/strings/capitalize.pyx
+++ b/python/cudf/cudf/_lib/strings/capitalize.pyx
@@ -4,7 +4,7 @@ from cudf.core.buffer import acquire_spill_lock
 
 from cudf._lib.column cimport Column
 
-import cudf._lib.pylibcudf as plc
+import pylibcudf as plc
 
 
 @acquire_spill_lock()
diff --git a/python/cudf/cudf/_lib/strings/case.pyx b/python/cudf/cudf/_lib/strings/case.pyx
index 38f242a67d6..ad4cbb6f088 100644
--- a/python/cudf/cudf/_lib/strings/case.pyx
+++ b/python/cudf/cudf/_lib/strings/case.pyx
@@ -4,7 +4,7 @@ from cudf.core.buffer import acquire_spill_lock
 
 from cudf._lib.column cimport Column
 
-from cudf._lib.pylibcudf.strings import case
+from pylibcudf.strings import case
 
 
 @acquire_spill_lock()
diff --git a/python/cudf/cudf/_lib/strings/char_types.pyx b/python/cudf/cudf/_lib/strings/char_types.pyx
index 5b7b6d19d9e..376a6f8af97 100644
--- a/python/cudf/cudf/_lib/strings/char_types.pyx
+++ b/python/cudf/cudf/_lib/strings/char_types.pyx
@@ -7,15 +7,16 @@ from libcpp.utility cimport move
 
 from cudf.core.buffer import acquire_spill_lock
 
-from cudf._lib.column cimport Column
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from cudf._lib.pylibcudf.libcudf.strings.char_types cimport (
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.strings.char_types cimport (
     all_characters_of_type as cpp_all_characters_of_type,
     filter_characters_of_type as cpp_filter_characters_of_type,
     string_character_types,
 )
+
+from cudf._lib.column cimport Column
 from cudf._lib.scalar cimport DeviceScalar
 
 
diff --git a/python/cudf/cudf/_lib/strings/combine.pyx b/python/cudf/cudf/_lib/strings/combine.pyx
index 288f333d4d8..76cc13db0da 100644
--- a/python/cudf/cudf/_lib/strings/combine.pyx
+++ b/python/cudf/cudf/_lib/strings/combine.pyx
@@ -5,18 +5,19 @@ from cudf.core.buffer import acquire_spill_lock
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 
-from cudf._lib.column cimport Column
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from cudf._lib.pylibcudf.libcudf.strings.combine cimport (
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.strings.combine cimport (
     concatenate as cpp_concatenate,
     join_list_elements as cpp_join_list_elements,
     join_strings as cpp_join_strings,
     output_if_empty_list,
     separator_on_nulls,
 )
-from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
+from pylibcudf.libcudf.table.table_view cimport table_view
+
+from cudf._lib.column cimport Column
 from cudf._lib.scalar cimport DeviceScalar
 from cudf._lib.utils cimport table_view_from_columns
 
diff --git a/python/cudf/cudf/_lib/strings/contains.pyx b/python/cudf/cudf/_lib/strings/contains.pyx
index 502a1d14696..82f5e06c547 100644
--- a/python/cudf/cudf/_lib/strings/contains.pyx
+++ b/python/cudf/cudf/_lib/strings/contains.pyx
@@ -9,21 +9,22 @@ from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 from libcpp.utility cimport move
 
-from cudf._lib.column cimport Column
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from cudf._lib.pylibcudf.libcudf.strings.contains cimport (
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.strings.contains cimport (
     count_re as cpp_count_re,
     like as cpp_like,
     matches_re as cpp_matches_re,
 )
-from cudf._lib.pylibcudf.libcudf.strings.regex_flags cimport regex_flags
-from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program
+from pylibcudf.libcudf.strings.regex_flags cimport regex_flags
+from pylibcudf.libcudf.strings.regex_program cimport regex_program
+
+from cudf._lib.column cimport Column
 from cudf._lib.scalar cimport DeviceScalar
 
-from cudf._lib.pylibcudf.strings import contains
-from cudf._lib.pylibcudf.strings.regex_program import RegexProgram
+from pylibcudf.strings import contains
+from pylibcudf.strings.regex_program import RegexProgram
 
 
 @acquire_spill_lock()
diff --git a/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx b/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx
index 6faff606226..a8df8c9a92c 100644
--- a/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx
+++ b/python/cudf/cudf/_lib/strings/convert/convert_fixed_point.pyx
@@ -7,15 +7,16 @@ from libcpp.utility cimport move
 
 from cudf.core.buffer import acquire_spill_lock
 
-from cudf._lib.column cimport Column
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.strings.convert.convert_fixed_point cimport (
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.strings.convert.convert_fixed_point cimport (
     from_fixed_point as cpp_from_fixed_point,
     is_fixed_point as cpp_is_fixed_point,
     to_fixed_point as cpp_to_fixed_point,
 )
-from cudf._lib.pylibcudf.libcudf.types cimport data_type, type_id
+from pylibcudf.libcudf.types cimport data_type, type_id
+
+from cudf._lib.column cimport Column
 
 
 @acquire_spill_lock()
diff --git a/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx b/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx
index 341cbc99dab..7965b588703 100644
--- a/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx
+++ b/python/cudf/cudf/_lib/strings/convert/convert_floats.pyx
@@ -5,13 +5,14 @@ from libcpp.utility cimport move
 
 from cudf.core.buffer import acquire_spill_lock
 
-from cudf._lib.column cimport Column
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.strings.convert.convert_floats cimport (
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.strings.convert.convert_floats cimport (
     is_float as cpp_is_float,
 )
 
+from cudf._lib.column cimport Column
+
 
 @acquire_spill_lock()
 def is_float(Column source_strings):
diff --git a/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx b/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx
index 081b03cdc0d..8b6da2bfa1c 100644
--- a/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx
+++ b/python/cudf/cudf/_lib/strings/convert/convert_integers.pyx
@@ -5,13 +5,14 @@ from libcpp.utility cimport move
 
 from cudf.core.buffer import acquire_spill_lock
 
-from cudf._lib.column cimport Column
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.strings.convert.convert_integers cimport (
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.strings.convert.convert_integers cimport (
     is_integer as cpp_is_integer,
 )
 
+from cudf._lib.column cimport Column
+
 
 @acquire_spill_lock()
 def is_integer(Column source_strings):
diff --git a/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx b/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx
index 4418bf2a72d..73aebf8ab35 100644
--- a/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx
+++ b/python/cudf/cudf/_lib/strings/convert/convert_lists.pyx
@@ -5,14 +5,15 @@ from libcpp.utility cimport move
 
 from cudf.core.buffer import acquire_spill_lock
 
-from cudf._lib.column cimport Column
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from cudf._lib.pylibcudf.libcudf.strings.convert.convert_lists cimport (
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.strings.convert.convert_lists cimport (
     format_list_column as cpp_format_list_column,
 )
 
+from cudf._lib.column cimport Column
+
 from cudf._lib.scalar import as_device_scalar
 
 from cudf._lib.scalar cimport DeviceScalar
diff --git a/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx b/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx
index 5f62efe5c00..e52116d6247 100644
--- a/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx
+++ b/python/cudf/cudf/_lib/strings/convert/convert_urls.pyx
@@ -5,14 +5,15 @@ from libcpp.utility cimport move
 
 from cudf.core.buffer import acquire_spill_lock
 
-from cudf._lib.column cimport Column
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.strings.convert.convert_urls cimport (
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.strings.convert.convert_urls cimport (
     url_decode as cpp_url_decode,
     url_encode as cpp_url_encode,
 )
 
+from cudf._lib.column cimport Column
+
 
 @acquire_spill_lock()
 def url_decode(Column source_strings):
diff --git a/python/cudf/cudf/_lib/strings/extract.pyx b/python/cudf/cudf/_lib/strings/extract.pyx
index 3b80c4f6368..63f4d57e562 100644
--- a/python/cudf/cudf/_lib/strings/extract.pyx
+++ b/python/cudf/cudf/_lib/strings/extract.pyx
@@ -8,12 +8,13 @@ from libcpp.utility cimport move
 
 from cudf.core.buffer import acquire_spill_lock
 
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.strings.extract cimport extract as cpp_extract
+from pylibcudf.libcudf.strings.regex_flags cimport regex_flags
+from pylibcudf.libcudf.strings.regex_program cimport regex_program
+from pylibcudf.libcudf.table.table cimport table
+
 from cudf._lib.column cimport Column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.strings.extract cimport extract as cpp_extract
-from cudf._lib.pylibcudf.libcudf.strings.regex_flags cimport regex_flags
-from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program
-from cudf._lib.pylibcudf.libcudf.table.table cimport table
 from cudf._lib.utils cimport data_from_unique_ptr
 
 
diff --git a/python/cudf/cudf/_lib/strings/find.pyx b/python/cudf/cudf/_lib/strings/find.pyx
index 3c0009ee569..2d284d1aa9d 100644
--- a/python/cudf/cudf/_lib/strings/find.pyx
+++ b/python/cudf/cudf/_lib/strings/find.pyx
@@ -1,10 +1,12 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
-import cudf._lib.pylibcudf as plc
+import pylibcudf as plc
+
 from cudf.core.buffer import acquire_spill_lock
 
+from pylibcudf.libcudf.types cimport size_type
+
 from cudf._lib.column cimport Column
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 
 @acquire_spill_lock()
diff --git a/python/cudf/cudf/_lib/strings/find_multiple.pyx b/python/cudf/cudf/_lib/strings/find_multiple.pyx
index c75f28db21b..1358f8e3c2c 100644
--- a/python/cudf/cudf/_lib/strings/find_multiple.pyx
+++ b/python/cudf/cudf/_lib/strings/find_multiple.pyx
@@ -5,13 +5,14 @@ from libcpp.utility cimport move
 
 from cudf.core.buffer import acquire_spill_lock
 
-from cudf._lib.column cimport Column
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.strings.find_multiple cimport (
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.strings.find_multiple cimport (
     find_multiple as cpp_find_multiple,
 )
 
+from cudf._lib.column cimport Column
+
 
 @acquire_spill_lock()
 def find_multiple(Column source_strings, Column target_strings):
diff --git a/python/cudf/cudf/_lib/strings/findall.pyx b/python/cudf/cudf/_lib/strings/findall.pyx
index 0d409889bc8..3cf2084e30a 100644
--- a/python/cudf/cudf/_lib/strings/findall.pyx
+++ b/python/cudf/cudf/_lib/strings/findall.pyx
@@ -8,12 +8,13 @@ from libcpp.utility cimport move
 
 from cudf.core.buffer import acquire_spill_lock
 
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.strings.findall cimport findall as cpp_findall
+from pylibcudf.libcudf.strings.regex_flags cimport regex_flags
+from pylibcudf.libcudf.strings.regex_program cimport regex_program
+
 from cudf._lib.column cimport Column
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.strings.findall cimport findall as cpp_findall
-from cudf._lib.pylibcudf.libcudf.strings.regex_flags cimport regex_flags
-from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program
 
 
 @acquire_spill_lock()
diff --git a/python/cudf/cudf/_lib/strings/json.pyx b/python/cudf/cudf/_lib/strings/json.pyx
index 560f284b56c..c9b0bba088d 100644
--- a/python/cudf/cudf/_lib/strings/json.pyx
+++ b/python/cudf/cudf/_lib/strings/json.pyx
@@ -5,14 +5,15 @@ from libcpp.utility cimport move
 
 from cudf.core.buffer import acquire_spill_lock
 
-from cudf._lib.column cimport Column
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from cudf._lib.pylibcudf.libcudf.strings.json cimport (
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.strings.json cimport (
     get_json_object as cpp_get_json_object,
     get_json_object_options,
 )
+
+from cudf._lib.column cimport Column
 from cudf._lib.scalar cimport DeviceScalar
 
 
diff --git a/python/cudf/cudf/_lib/strings/padding.pyx b/python/cudf/cudf/_lib/strings/padding.pyx
index 9226810951f..d0239e91ec3 100644
--- a/python/cudf/cudf/_lib/strings/padding.pyx
+++ b/python/cudf/cudf/_lib/strings/padding.pyx
@@ -6,18 +6,19 @@ from libcpp.utility cimport move
 
 from cudf.core.buffer import acquire_spill_lock
 
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.types cimport size_type
+
 from cudf._lib.column cimport Column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 from enum import IntEnum
 
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.strings.padding cimport (
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.strings.padding cimport (
     pad as cpp_pad,
     zfill as cpp_zfill,
 )
-from cudf._lib.pylibcudf.libcudf.strings.side_type cimport (
+from pylibcudf.libcudf.strings.side_type cimport (
     side_type,
     underlying_type_t_side_type,
 )
diff --git a/python/cudf/cudf/_lib/strings/repeat.pyx b/python/cudf/cudf/_lib/strings/repeat.pyx
index 2b8116848cf..42fcfa5d94e 100644
--- a/python/cudf/cudf/_lib/strings/repeat.pyx
+++ b/python/cudf/cudf/_lib/strings/repeat.pyx
@@ -5,11 +5,12 @@ from libcpp.utility cimport move
 
 from cudf.core.buffer import acquire_spill_lock
 
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.strings cimport repeat as cpp_repeat
+from pylibcudf.libcudf.types cimport size_type
+
 from cudf._lib.column cimport Column
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.strings cimport repeat as cpp_repeat
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 
 @acquire_spill_lock()
diff --git a/python/cudf/cudf/_lib/strings/replace.pyx b/python/cudf/cudf/_lib/strings/replace.pyx
index 374831f1833..a260c4e4f45 100644
--- a/python/cudf/cudf/_lib/strings/replace.pyx
+++ b/python/cudf/cudf/_lib/strings/replace.pyx
@@ -4,11 +4,12 @@ from libc.stdint cimport int32_t
 
 from cudf.core.buffer import acquire_spill_lock
 
+from pylibcudf.libcudf.types cimport size_type
+
 from cudf._lib.column cimport Column
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
 from cudf._lib.scalar cimport DeviceScalar
 
-import cudf._lib.pylibcudf as plc
+import pylibcudf as plc
 
 
 @acquire_spill_lock()
diff --git a/python/cudf/cudf/_lib/strings/replace_re.pyx b/python/cudf/cudf/_lib/strings/replace_re.pyx
index e13880a6186..fffc8b7c3f6 100644
--- a/python/cudf/cudf/_lib/strings/replace_re.pyx
+++ b/python/cudf/cudf/_lib/strings/replace_re.pyx
@@ -8,17 +8,18 @@ from libcpp.vector cimport vector
 
 from cudf.core.buffer import acquire_spill_lock
 
-from cudf._lib.column cimport Column
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from cudf._lib.pylibcudf.libcudf.strings.regex_flags cimport regex_flags
-from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program
-from cudf._lib.pylibcudf.libcudf.strings.replace_re cimport (
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.strings.regex_flags cimport regex_flags
+from pylibcudf.libcudf.strings.regex_program cimport regex_program
+from pylibcudf.libcudf.strings.replace_re cimport (
     replace_re as cpp_replace_re,
     replace_with_backrefs as cpp_replace_with_backrefs,
 )
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.types cimport size_type
+
+from cudf._lib.column cimport Column
 from cudf._lib.scalar cimport DeviceScalar
 
 
diff --git a/python/cudf/cudf/_lib/strings/split/partition.pyx b/python/cudf/cudf/_lib/strings/split/partition.pyx
index be377c0f86b..a81fb18e752 100644
--- a/python/cudf/cudf/_lib/strings/split/partition.pyx
+++ b/python/cudf/cudf/_lib/strings/split/partition.pyx
@@ -5,14 +5,15 @@ from libcpp.utility cimport move
 
 from cudf.core.buffer import acquire_spill_lock
 
-from cudf._lib.column cimport Column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from cudf._lib.pylibcudf.libcudf.strings.split.partition cimport (
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.strings.split.partition cimport (
     partition as cpp_partition,
     rpartition as cpp_rpartition,
 )
-from cudf._lib.pylibcudf.libcudf.table.table cimport table
+from pylibcudf.libcudf.table.table cimport table
+
+from cudf._lib.column cimport Column
 from cudf._lib.scalar cimport DeviceScalar
 from cudf._lib.utils cimport data_from_unique_ptr
 
diff --git a/python/cudf/cudf/_lib/strings/split/split.pyx b/python/cudf/cudf/_lib/strings/split/split.pyx
index 942235686d7..f481fea4c51 100644
--- a/python/cudf/cudf/_lib/strings/split/split.pyx
+++ b/python/cudf/cudf/_lib/strings/split/split.pyx
@@ -7,13 +7,12 @@ from libcpp.utility cimport move
 
 from cudf.core.buffer import acquire_spill_lock
 
-from cudf._lib.column cimport Column
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from cudf._lib.pylibcudf.libcudf.strings.regex_flags cimport regex_flags
-from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program
-from cudf._lib.pylibcudf.libcudf.strings.split.split cimport (
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.strings.regex_flags cimport regex_flags
+from pylibcudf.libcudf.strings.regex_program cimport regex_program
+from pylibcudf.libcudf.strings.split.split cimport (
     rsplit as cpp_rsplit,
     rsplit_re as cpp_rsplit_re,
     rsplit_record as cpp_rsplit_record,
@@ -23,8 +22,10 @@ from cudf._lib.pylibcudf.libcudf.strings.split.split cimport (
     split_record as cpp_split_record,
     split_record_re as cpp_split_record_re,
 )
-from cudf._lib.pylibcudf.libcudf.table.table cimport table
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.libcudf.types cimport size_type
+
+from cudf._lib.column cimport Column
 from cudf._lib.scalar cimport DeviceScalar
 from cudf._lib.utils cimport data_from_unique_ptr
 
diff --git a/python/cudf/cudf/_lib/strings/strip.pyx b/python/cudf/cudf/_lib/strings/strip.pyx
index 199fa5fc3b6..acf52cb7b9f 100644
--- a/python/cudf/cudf/_lib/strings/strip.pyx
+++ b/python/cudf/cudf/_lib/strings/strip.pyx
@@ -5,12 +5,13 @@ from libcpp.utility cimport move
 
 from cudf.core.buffer import acquire_spill_lock
 
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.strings.side_type cimport side_type
+from pylibcudf.libcudf.strings.strip cimport strip as cpp_strip
+
 from cudf._lib.column cimport Column
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from cudf._lib.pylibcudf.libcudf.strings.side_type cimport side_type
-from cudf._lib.pylibcudf.libcudf.strings.strip cimport strip as cpp_strip
 from cudf._lib.scalar cimport DeviceScalar
 
 
diff --git a/python/cudf/cudf/_lib/strings/substring.pyx b/python/cudf/cudf/_lib/strings/substring.pyx
index 706c21c0634..db96d99c7b6 100644
--- a/python/cudf/cudf/_lib/strings/substring.pyx
+++ b/python/cudf/cudf/_lib/strings/substring.pyx
@@ -10,7 +10,7 @@ from cudf._lib.scalar import as_device_scalar
 
 from cudf._lib.scalar cimport DeviceScalar
 
-import cudf._lib.pylibcudf as plc
+import pylibcudf as plc
 
 
 @acquire_spill_lock()
diff --git a/python/cudf/cudf/_lib/strings/translate.pyx b/python/cudf/cudf/_lib/strings/translate.pyx
index 8846e2e280d..3fad91bbfc0 100644
--- a/python/cudf/cudf/_lib/strings/translate.pyx
+++ b/python/cudf/cudf/_lib/strings/translate.pyx
@@ -8,16 +8,17 @@ from libcpp.vector cimport vector
 
 from cudf.core.buffer import acquire_spill_lock
 
-from cudf._lib.column cimport Column
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from cudf._lib.pylibcudf.libcudf.strings.translate cimport (
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.strings.translate cimport (
     filter_characters as cpp_filter_characters,
     filter_type,
     translate as cpp_translate,
 )
-from cudf._lib.pylibcudf.libcudf.types cimport char_utf8
+from pylibcudf.libcudf.types cimport char_utf8
+
+from cudf._lib.column cimport Column
 from cudf._lib.scalar cimport DeviceScalar
 
 
diff --git a/python/cudf/cudf/_lib/strings/wrap.pyx b/python/cudf/cudf/_lib/strings/wrap.pyx
index 92750f21e4d..eed5cf33b10 100644
--- a/python/cudf/cudf/_lib/strings/wrap.pyx
+++ b/python/cudf/cudf/_lib/strings/wrap.pyx
@@ -5,11 +5,12 @@ from libcpp.utility cimport move
 
 from cudf.core.buffer import acquire_spill_lock
 
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.strings.wrap cimport wrap as cpp_wrap
+from pylibcudf.libcudf.types cimport size_type
+
 from cudf._lib.column cimport Column
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.strings.wrap cimport wrap as cpp_wrap
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 
 @acquire_spill_lock()
diff --git a/python/cudf/cudf/_lib/strings_udf.pyx b/python/cudf/cudf/_lib/strings_udf.pyx
index 7610cad0b40..78fc9f08bd8 100644
--- a/python/cudf/cudf/_lib/strings_udf.pyx
+++ b/python/cudf/cudf/_lib/strings_udf.pyx
@@ -2,7 +2,7 @@
 
 from libc.stdint cimport uint8_t, uint16_t, uintptr_t
 
-from cudf._lib.pylibcudf.libcudf.strings_udf cimport (
+from pylibcudf.libcudf.strings_udf cimport (
     get_character_cases_table as cpp_get_character_cases_table,
     get_character_flags_table as cpp_get_character_flags_table,
     get_special_case_mapping_table as cpp_get_special_case_mapping_table,
@@ -15,17 +15,17 @@ from libcpp.utility cimport move
 
 from cudf.core.buffer import as_buffer
 
-from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer
-
-from cudf._lib.column cimport Column
-from cudf._lib.pylibcudf.libcudf.column.column cimport column, column_view
-from cudf._lib.pylibcudf.libcudf.strings_udf cimport (
+from pylibcudf.libcudf.column.column cimport column, column_view
+from pylibcudf.libcudf.strings_udf cimport (
     column_from_udf_string_array as cpp_column_from_udf_string_array,
     free_udf_string_array as cpp_free_udf_string_array,
     get_cuda_build_version as cpp_get_cuda_build_version,
     to_string_view_array as cpp_to_string_view_array,
     udf_string,
 )
+from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer
+
+from cudf._lib.column cimport Column
 
 
 def get_cuda_build_version():
diff --git a/python/cudf/cudf/_lib/text.pyx b/python/cudf/cudf/_lib/text.pyx
index 6e63b8758b8..ece69b424bb 100644
--- a/python/cudf/cudf/_lib/text.pyx
+++ b/python/cudf/cudf/_lib/text.pyx
@@ -8,9 +8,8 @@ from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 from libcpp.utility cimport move
 
-from cudf._lib.column cimport Column
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.io.text cimport (
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.io.text cimport (
     byte_range_info,
     data_chunk_source,
     make_source,
@@ -20,6 +19,8 @@ from cudf._lib.pylibcudf.libcudf.io.text cimport (
     parse_options,
 )
 
+from cudf._lib.column cimport Column
+
 
 def read_text(object filepaths_or_buffers,
               object delimiter=None,
diff --git a/python/cudf/cudf/_lib/timezone.pyx b/python/cudf/cudf/_lib/timezone.pyx
index 53977e984c2..bff3b2c4ce4 100644
--- a/python/cudf/cudf/_lib/timezone.pyx
+++ b/python/cudf/cudf/_lib/timezone.pyx
@@ -5,10 +5,11 @@ from libcpp.optional cimport make_optional
 from libcpp.string cimport string
 from libcpp.utility cimport move
 
-from cudf._lib.pylibcudf.libcudf.io.timezone cimport (
+from pylibcudf.libcudf.io.timezone cimport (
     make_timezone_transition_table as cpp_make_timezone_transition_table,
 )
-from cudf._lib.pylibcudf.libcudf.table.table cimport table
+from pylibcudf.libcudf.table.table cimport table
+
 from cudf._lib.utils cimport columns_from_unique_ptr
 
 
diff --git a/python/cudf/cudf/_lib/transform.pyx b/python/cudf/cudf/_lib/transform.pyx
index 622725e06a3..baa08a545ec 100644
--- a/python/cudf/cudf/_lib/transform.pyx
+++ b/python/cudf/cudf/_lib/transform.pyx
@@ -15,23 +15,23 @@ from libcpp.pair cimport pair
 from libcpp.string cimport string
 from libcpp.utility cimport move
 
-from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer
-
-cimport cudf._lib.pylibcudf.libcudf.transform as libcudf_transform
-from cudf._lib.column cimport Column
-from cudf._lib.pylibcudf cimport transform as plc_transform
-from cudf._lib.pylibcudf.expressions cimport Expression
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.expressions cimport expression
-from cudf._lib.pylibcudf.libcudf.table.table cimport table
-from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
-from cudf._lib.pylibcudf.libcudf.types cimport (
+cimport pylibcudf.libcudf.transform as libcudf_transform
+from pylibcudf cimport transform as plc_transform
+from pylibcudf.expressions cimport Expression
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.expressions cimport expression
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.libcudf.table.table_view cimport table_view
+from pylibcudf.libcudf.types cimport (
     bitmask_type,
     data_type,
     size_type,
     type_id,
 )
+from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer
+
+from cudf._lib.column cimport Column
 from cudf._lib.types cimport underlying_type_t_type_id
 from cudf._lib.utils cimport (
     columns_from_unique_ptr,
diff --git a/python/cudf/cudf/_lib/transpose.pyx b/python/cudf/cudf/_lib/transpose.pyx
index 82b23439e6a..f78fbd4c844 100644
--- a/python/cudf/cudf/_lib/transpose.pyx
+++ b/python/cudf/cudf/_lib/transpose.pyx
@@ -4,10 +4,11 @@ from libcpp.memory cimport unique_ptr
 from libcpp.pair cimport pair
 from libcpp.utility cimport move
 
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.table.table_view cimport table_view
+from pylibcudf.libcudf.transpose cimport transpose as cpp_transpose
+
 from cudf._lib.column cimport Column
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
-from cudf._lib.pylibcudf.libcudf.transpose cimport transpose as cpp_transpose
 from cudf._lib.utils cimport columns_from_table_view, table_view_from_columns
 
 
diff --git a/python/cudf/cudf/_lib/types.pxd b/python/cudf/cudf/_lib/types.pxd
index 519d5ff8554..4fd3d31841e 100644
--- a/python/cudf/cudf/_lib/types.pxd
+++ b/python/cudf/cudf/_lib/types.pxd
@@ -3,11 +3,9 @@
 from libc.stdint cimport int32_t
 from libcpp cimport bool
 
-cimport cudf._lib.pylibcudf.libcudf.types as libcudf_types
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport (
-    lists_column_view,
-)
+cimport pylibcudf.libcudf.types as libcudf_types
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view
 
 ctypedef bool underlying_type_t_order
 ctypedef bool underlying_type_t_null_order
diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx
index 253fdf7b0d9..861bb063707 100644
--- a/python/cudf/cudf/_lib/types.pyx
+++ b/python/cudf/cudf/_lib/types.pyx
@@ -7,19 +7,19 @@ import pandas as pd
 
 from libcpp.memory cimport make_shared, shared_ptr
 
-cimport cudf._lib.pylibcudf.libcudf.types as libcudf_types
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport (
-    lists_column_view,
-)
+cimport pylibcudf.libcudf.types as libcudf_types
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view
+
 from cudf._lib.types cimport (
     underlying_type_t_interpolation,
     underlying_type_t_order,
     underlying_type_t_sorted,
 )
 
+import pylibcudf
+
 import cudf
-from cudf._lib import pylibcudf
 
 
 class TypeId(IntEnum):
diff --git a/python/cudf/cudf/_lib/unary.pyx b/python/cudf/cudf/_lib/unary.pyx
index 2f58c4512d6..d5602fd5a1c 100644
--- a/python/cudf/cudf/_lib/unary.pyx
+++ b/python/cudf/cudf/_lib/unary.pyx
@@ -5,7 +5,8 @@ from cudf._lib.types cimport dtype_to_pylibcudf_type
 
 import numpy as np
 
-from cudf._lib import pylibcudf
+import pylibcudf
+
 from cudf.api.types import is_decimal_dtype
 from cudf.core.buffer import acquire_spill_lock
 
diff --git a/python/cudf/cudf/_lib/utils.pxd b/python/cudf/cudf/_lib/utils.pxd
index 1d55f7218dc..ff97fe80310 100644
--- a/python/cudf/cudf/_lib/utils.pxd
+++ b/python/cudf/cudf/_lib/utils.pxd
@@ -4,8 +4,8 @@ from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 from libcpp.vector cimport vector
 
-from cudf._lib.pylibcudf.libcudf.column.column cimport column_view
-from cudf._lib.pylibcudf.libcudf.table.table cimport table, table_view
+from pylibcudf.libcudf.column.column cimport column_view
+from pylibcudf.libcudf.table.table cimport table, table_view
 
 
 cdef data_from_unique_ptr(
diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx
index 267432a0182..cae28d02ef4 100644
--- a/python/cudf/cudf/_lib/utils.pyx
+++ b/python/cudf/cudf/_lib/utils.pyx
@@ -10,11 +10,12 @@ from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
 
+from pylibcudf.libcudf.column.column cimport column, column_view
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.libcudf.table.table_view cimport table_view
+from pylibcudf.libcudf.types cimport size_type
+
 from cudf._lib.column cimport Column
-from cudf._lib.pylibcudf.libcudf.column.column cimport column, column_view
-from cudf._lib.pylibcudf.libcudf.table.table cimport table
-from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
 
 try:
     import ujson as json
diff --git a/python/cudf/cudf/core/_internals/expressions.py b/python/cudf/cudf/core/_internals/expressions.py
index 63714a78572..67bde5a72b2 100644
--- a/python/cudf/cudf/core/_internals/expressions.py
+++ b/python/cudf/cudf/core/_internals/expressions.py
@@ -6,8 +6,8 @@
 
 import pyarrow as pa
 
-import cudf._lib.pylibcudf as plc
-from cudf._lib.pylibcudf.expressions import (
+import pylibcudf as plc
+from pylibcudf.expressions import (
     ASTOperator,
     ColumnReference,
     Expression,
diff --git a/python/cudf/cudf/core/buffer/buffer.py b/python/cudf/cudf/core/buffer/buffer.py
index 80dbbe4c048..32ae8c5ee53 100644
--- a/python/cudf/cudf/core/buffer/buffer.py
+++ b/python/cudf/cudf/core/buffer/buffer.py
@@ -11,6 +11,7 @@
 import numpy
 from typing_extensions import Self
 
+import pylibcudf
 import rmm
 
 import cudf
@@ -501,7 +502,7 @@ def get_ptr_and_size(array_interface: Mapping) -> tuple[int, int]:
     shape = array_interface["shape"] or (1,)
     strides = array_interface["strides"]
     itemsize = cudf.dtype(array_interface["typestr"]).itemsize
-    if strides is None or cudf._lib.pylibcudf.column.is_c_contiguous(
+    if strides is None or pylibcudf.column.is_c_contiguous(
         shape, strides, itemsize
     ):
         nelem = math.prod(shape)
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index ac36813202a..a37355dfcda 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -9,9 +9,10 @@
 import pandas as pd
 from typing_extensions import Self
 
+import pylibcudf
+
 import cudf
 from cudf import _lib as libcudf
-from cudf._lib import pylibcudf
 from cudf.api.types import is_integer, is_scalar
 from cudf.core.column import ColumnBase, as_column, column, string
 from cudf.core.dtypes import CategoricalDtype
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 8eb6de79bce..2263dfd5c98 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -24,6 +24,8 @@
 import pandas as pd
 from typing_extensions import Self
 
+import pylibcudf
+
 import cudf
 import cudf._lib as libcudf
 import cudf.core
@@ -6311,7 +6313,7 @@ def rank(
         if method not in {"average", "min", "max", "first", "dense"}:
             raise KeyError(method)
 
-        method_enum = libcudf.pylibcudf.aggregation.RankMethod[method.upper()]
+        method_enum = pylibcudf.aggregation.RankMethod[method.upper()]
         if na_option not in {"keep", "top", "bottom"}:
             raise ValueError(
                 "na_option must be one of 'keep', 'top', or 'bottom'"
diff --git a/python/cudf/cudf/pandas/__init__.py b/python/cudf/cudf/pandas/__init__.py
index e88e795671e..bacf1f7e77b 100644
--- a/python/cudf/cudf/pandas/__init__.py
+++ b/python/cudf/cudf/pandas/__init__.py
@@ -5,10 +5,9 @@
 import os
 import warnings
 
+import pylibcudf
 import rmm.mr
 
-from cudf._lib import pylibcudf
-
 from .fast_slow_proxy import is_proxy_object
 from .magics import load_ipython_extension
 from .profiler import Profiler
diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml
index 60ac171f3d7..9db52164eca 100644
--- a/python/cudf/pyproject.toml
+++ b/python/cudf/pyproject.toml
@@ -30,6 +30,7 @@ dependencies = [
     "pandas>=2.0,<2.2.3dev0",
     "ptxcompiler",
     "pyarrow>=16.1.0,<16.2.0a0",
+    "pylibcudf==24.10.*,>=0.0.0a0",
     "rich",
     "rmm==24.10.*,>=0.0.0a0",
     "typing_extensions>=4.0.0",
@@ -88,6 +89,7 @@ known_dask = [
 ]
 known_rapids = [
     "rmm",
+    "pylibcudf"
 ]
 known_first_party = [
     "cudf",
@@ -127,6 +129,7 @@ requires = [
     "ninja",
     "numpy==1.23.*",
     "pyarrow==16.1.0.*",
+    "pylibcudf==24.10.*,>=0.0.0a0",
     "rmm==24.10.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 
diff --git a/python/cudf_kafka/cudf_kafka/_lib/CMakeLists.txt b/python/cudf_kafka/cudf_kafka/_lib/CMakeLists.txt
index 4f3b9220a4f..1b205537d73 100644
--- a/python/cudf_kafka/cudf_kafka/_lib/CMakeLists.txt
+++ b/python/cudf_kafka/cudf_kafka/_lib/CMakeLists.txt
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -20,5 +20,5 @@ rapids_cython_create_modules(
   SOURCE_FILES "${cython_sources}"
   LINKED_LIBRARIES "${linked_libraries}"
 )
-include(../../../cudf/cmake/Modules/LinkPyarrowHeaders.cmake)
+include(../../../pylibcudf/cmake/Modules/LinkPyarrowHeaders.cmake)
 link_to_pyarrow_headers("${RAPIDS_CYTHON_CREATED_TARGETS}")
diff --git a/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd b/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd
index 2de0bf39785..e65b0d233b9 100644
--- a/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd
+++ b/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd
@@ -6,9 +6,8 @@ from libcpp.map cimport map
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 from libcpp.vector cimport vector
-
-from cudf._lib.pylibcudf.io.datasource cimport Datasource
-from cudf._lib.pylibcudf.libcudf.io.datasource cimport datasource
+from pylibcudf.io.datasource cimport Datasource
+from pylibcudf.libcudf.io.datasource cimport datasource
 
 
 cdef extern from "cudf_kafka/kafka_callback.hpp" \
diff --git a/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx b/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx
index 2927dc0aa9a..20aa43b0134 100644
--- a/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx
+++ b/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx
@@ -6,8 +6,7 @@ from libcpp.map cimport map
 from libcpp.memory cimport make_unique, unique_ptr
 from libcpp.string cimport string
 from libcpp.utility cimport move
-
-from cudf._lib.pylibcudf.libcudf.io.datasource cimport datasource
+from pylibcudf.libcudf.io.datasource cimport datasource
 
 from cudf_kafka._lib.kafka cimport kafka_consumer
 
diff --git a/python/cudf_polars/cudf_polars/containers/column.py b/python/cudf_polars/cudf_polars/containers/column.py
index 02018548b2c..dd3b771e305 100644
--- a/python/cudf_polars/cudf_polars/containers/column.py
+++ b/python/cudf_polars/cudf_polars/containers/column.py
@@ -8,7 +8,7 @@
 import functools
 from typing import TYPE_CHECKING
 
-import cudf._lib.pylibcudf as plc
+import pylibcudf as plc
 
 if TYPE_CHECKING:
     from typing_extensions import Self
diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py
index dba76855329..7c28e7b9a6c 100644
--- a/python/cudf_polars/cudf_polars/containers/dataframe.py
+++ b/python/cudf_polars/cudf_polars/containers/dataframe.py
@@ -10,11 +10,10 @@
 from typing import TYPE_CHECKING, cast
 
 import pyarrow as pa
+import pylibcudf as plc
 
 import polars as pl
 
-import cudf._lib.pylibcudf as plc
-
 from cudf_polars.containers.column import NamedColumn
 from cudf_polars.utils import dtypes
 
diff --git a/python/cudf_polars/cudf_polars/dsl/expr.py b/python/cudf_polars/cudf_polars/dsl/expr.py
index 9e0fca3f52f..e1b4d30b76b 100644
--- a/python/cudf_polars/cudf_polars/dsl/expr.py
+++ b/python/cudf_polars/cudf_polars/dsl/expr.py
@@ -21,11 +21,10 @@
 from typing import TYPE_CHECKING, Any, ClassVar, NamedTuple
 
 import pyarrow as pa
+import pylibcudf as plc
 
 from polars.polars import _expr_nodes as pl_expr
 
-import cudf._lib.pylibcudf as plc
-
 from cudf_polars.containers import Column, NamedColumn
 from cudf_polars.utils import dtypes, sorting
 
diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py
index 3754addeb11..019f00f4fca 100644
--- a/python/cudf_polars/cudf_polars/dsl/ir.py
+++ b/python/cudf_polars/cudf_polars/dsl/ir.py
@@ -21,12 +21,11 @@
 from typing import TYPE_CHECKING, Any, Callable, ClassVar
 
 import pyarrow as pa
+import pylibcudf as plc
 from typing_extensions import assert_never
 
 import polars as pl
 
-import cudf._lib.pylibcudf as plc
-
 import cudf_polars.dsl.expr as expr
 from cudf_polars.containers import DataFrame, NamedColumn
 from cudf_polars.utils import sorting
diff --git a/python/cudf_polars/cudf_polars/dsl/translate.py b/python/cudf_polars/cudf_polars/dsl/translate.py
index dec45679c75..6dc97c7cb51 100644
--- a/python/cudf_polars/cudf_polars/dsl/translate.py
+++ b/python/cudf_polars/cudf_polars/dsl/translate.py
@@ -11,14 +11,13 @@
 from typing import Any
 
 import pyarrow as pa
+import pylibcudf as plc
 from typing_extensions import assert_never
 
 import polars as pl
 import polars.polars as plrs
 from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir
 
-import cudf._lib.pylibcudf as plc
-
 from cudf_polars.dsl import expr, ir
 from cudf_polars.typing import NodeTraverser
 from cudf_polars.utils import dtypes
diff --git a/python/cudf_polars/cudf_polars/typing/__init__.py b/python/cudf_polars/cudf_polars/typing/__init__.py
index c04eac41bb7..02440e67fde 100644
--- a/python/cudf_polars/cudf_polars/typing/__init__.py
+++ b/python/cudf_polars/cudf_polars/typing/__init__.py
@@ -8,9 +8,9 @@
 from collections.abc import Mapping
 from typing import TYPE_CHECKING, Literal, Protocol, Union
 
-from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir
+import pylibcudf as plc
 
-import cudf._lib.pylibcudf as plc
+from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir
 
 if TYPE_CHECKING:
     from typing import Callable
diff --git a/python/cudf_polars/cudf_polars/utils/dtypes.py b/python/cudf_polars/cudf_polars/utils/dtypes.py
index cd68d021286..7f6ea1edfd9 100644
--- a/python/cudf_polars/cudf_polars/utils/dtypes.py
+++ b/python/cudf_polars/cudf_polars/utils/dtypes.py
@@ -8,12 +8,11 @@
 from functools import cache
 
 import pyarrow as pa
+import pylibcudf as plc
 from typing_extensions import assert_never
 
 import polars as pl
 
-import cudf._lib.pylibcudf as plc
-
 __all__ = ["from_polars", "downcast_arrow_lists"]
 
 
diff --git a/python/cudf_polars/cudf_polars/utils/sorting.py b/python/cudf_polars/cudf_polars/utils/sorting.py
index 57f94c4ec4c..17ea44e5b1b 100644
--- a/python/cudf_polars/cudf_polars/utils/sorting.py
+++ b/python/cudf_polars/cudf_polars/utils/sorting.py
@@ -7,7 +7,7 @@
 
 from typing import TYPE_CHECKING
 
-import cudf._lib.pylibcudf as plc
+import pylibcudf as plc
 
 if TYPE_CHECKING:
     from collections.abc import Sequence
diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml
index 424c83a5199..c380853035d 100644
--- a/python/cudf_polars/pyproject.toml
+++ b/python/cudf_polars/pyproject.toml
@@ -19,8 +19,8 @@ authors = [
 license = { text = "Apache 2.0" }
 requires-python = ">=3.9"
 dependencies = [
-    "cudf==24.10.*,>=0.0.0a0",
     "polars>=1.0,<1.3",
+    "pylibcudf==24.10.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
     "Intended Audience :: Developers",
diff --git a/python/cudf_polars/tests/containers/test_column.py b/python/cudf_polars/tests/containers/test_column.py
index 4f3c0de5975..19919877f84 100644
--- a/python/cudf_polars/tests/containers/test_column.py
+++ b/python/cudf_polars/tests/containers/test_column.py
@@ -6,10 +6,9 @@
 from functools import partial
 
 import pyarrow
+import pylibcudf as plc
 import pytest
 
-import cudf._lib.pylibcudf as plc
-
 from cudf_polars.containers import Column, NamedColumn
 
 
diff --git a/python/cudf_polars/tests/containers/test_dataframe.py b/python/cudf_polars/tests/containers/test_dataframe.py
index 87508e17407..6b470268084 100644
--- a/python/cudf_polars/tests/containers/test_dataframe.py
+++ b/python/cudf_polars/tests/containers/test_dataframe.py
@@ -3,12 +3,11 @@
 
 from __future__ import annotations
 
+import pylibcudf as plc
 import pytest
 
 import polars as pl
 
-import cudf._lib.pylibcudf as plc
-
 from cudf_polars.containers import DataFrame, NamedColumn
 
 
diff --git a/python/cudf_polars/tests/dsl/test_expr.py b/python/cudf_polars/tests/dsl/test_expr.py
index ddc3ca66d86..b7d4672daca 100644
--- a/python/cudf_polars/tests/dsl/test_expr.py
+++ b/python/cudf_polars/tests/dsl/test_expr.py
@@ -3,10 +3,9 @@
 
 from __future__ import annotations
 
+import pylibcudf as plc
 import pytest
 
-import cudf._lib.pylibcudf as plc
-
 from cudf_polars.dsl import expr
 
 
diff --git a/python/cudf_polars/tests/expressions/test_literal.py b/python/cudf_polars/tests/expressions/test_literal.py
index 5bd3131d1d7..ced49bdc254 100644
--- a/python/cudf_polars/tests/expressions/test_literal.py
+++ b/python/cudf_polars/tests/expressions/test_literal.py
@@ -2,12 +2,11 @@
 # SPDX-License-Identifier: Apache-2.0
 from __future__ import annotations
 
+import pylibcudf as plc
 import pytest
 
 import polars as pl
 
-import cudf._lib.pylibcudf as plc
-
 from cudf_polars.testing.asserts import (
     assert_gpu_result_equal,
     assert_ir_translation_raises,
diff --git a/python/cudf_polars/tests/expressions/test_sort.py b/python/cudf_polars/tests/expressions/test_sort.py
index d46df92db94..76c7648813a 100644
--- a/python/cudf_polars/tests/expressions/test_sort.py
+++ b/python/cudf_polars/tests/expressions/test_sort.py
@@ -4,12 +4,11 @@
 
 import itertools
 
+import pylibcudf as plc
 import pytest
 
 import polars as pl
 
-import cudf._lib.pylibcudf as plc
-
 from cudf_polars import translate_ir
 from cudf_polars.testing.asserts import assert_gpu_result_equal
 
diff --git a/python/cudf_polars/tests/utils/test_broadcast.py b/python/cudf_polars/tests/utils/test_broadcast.py
index 69ad1e519e2..35aaef44e1f 100644
--- a/python/cudf_polars/tests/utils/test_broadcast.py
+++ b/python/cudf_polars/tests/utils/test_broadcast.py
@@ -3,10 +3,9 @@
 
 from __future__ import annotations
 
+import pylibcudf as plc
 import pytest
 
-import cudf._lib.pylibcudf as plc
-
 from cudf_polars.containers import NamedColumn
 from cudf_polars.dsl.ir import broadcast
 
diff --git a/python/pylibcudf/CMakeLists.txt b/python/pylibcudf/CMakeLists.txt
new file mode 100644
index 00000000000..424d8372280
--- /dev/null
+++ b/python/pylibcudf/CMakeLists.txt
@@ -0,0 +1,100 @@
+# =============================================================================
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+# =============================================================================
+
+cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR)
+
+include(../../rapids_config.cmake)
+include(rapids-cuda)
+rapids_cuda_init_architectures(pylibcudf)
+
+project(
+  pylibcudf
+  VERSION "${RAPIDS_VERSION}"
+  LANGUAGES CXX CUDA
+)
+
+option(FIND_CUDF_CPP "Search for existing CUDF C++ installations before defaulting to local files"
+       OFF
+)
+option(USE_LIBARROW_FROM_PYARROW "Only use the libarrow contained in pyarrow" OFF)
+mark_as_advanced(USE_LIBARROW_FROM_PYARROW)
+
+# Find Python early so that later commands can use it
+find_package(Python 3.9 REQUIRED COMPONENTS Interpreter)
+
+# If the user requested it we attempt to find CUDF.
+if(FIND_CUDF_CPP)
+  include(rapids-cpm)
+  include(rapids-export)
+  include(rapids-find)
+  rapids_cpm_init()
+
+  if(USE_LIBARROW_FROM_PYARROW)
+    # We need to find arrow before libcudf since libcudf requires it but doesn't bundle arrow
+    # libraries. These variables have no effect because we are always searching for arrow via
+    # pyarrow, but they must be set as they are required arguments to the function in
+    # get_arrow.cmake.
+    set(CUDF_USE_ARROW_STATIC OFF)
+    set(CUDF_ENABLE_ARROW_S3 OFF)
+    set(CUDF_ENABLE_ARROW_ORC OFF)
+    set(CUDF_ENABLE_ARROW_PYTHON OFF)
+    set(CUDF_ENABLE_ARROW_PARQUET OFF)
+    include(../../cpp/cmake/thirdparty/get_arrow.cmake)
+  endif()
+
+  find_package(cudf "${RAPIDS_VERSION}" REQUIRED)
+
+  # an installed version of libcudf doesn't provide the dlpack headers so we need to download dlpack
+  # for the interop.pyx
+  include(../../cpp/cmake/thirdparty/get_dlpack.cmake)
+else()
+  set(cudf_FOUND OFF)
+endif()
+
+include(rapids-cython-core)
+
+if(NOT cudf_FOUND)
+  set(BUILD_TESTS OFF)
+  set(BUILD_BENCHMARKS OFF)
+  set(CUDF_BUILD_TESTUTIL OFF)
+  set(CUDF_BUILD_STREAMS_TEST_UTIL OFF)
+  set(CUDA_STATIC_RUNTIME ON)
+
+  add_subdirectory(../../cpp cudf-cpp EXCLUDE_FROM_ALL)
+
+  # libcudf targets are excluded by default above via EXCLUDE_FROM_ALL to remove extraneous
+  # components like headers from libcudacxx, but we do need the libraries. However, we want to
+  # control where they are installed to. Since there are multiple subpackages of pylibcudf that
+  # require access to libcudf, we place the library and all its dependent artifacts in the cudf
+  # directory as a single source of truth and modify the other rpaths appropriately.
+  set(cython_lib_dir pylibcudf)
+  include(cmake/Modules/WheelHelpers.cmake)
+  # TODO: This install is currently overzealous. We should only install the libraries that are
+  # downloaded by CPM during the build, not libraries that were found on the system.  However, in
+  # practice right this would only be a problem is if libcudf was not found but some of the
+  # dependencies were, and we have no real use cases where that happens.
+  install_aliased_imported_targets(
+    TARGETS cudf arrow_shared nvcomp::nvcomp nvcomp::nvcomp_gdeflate nvcomp::nvcomp_bitcomp
+    DESTINATION ${cython_lib_dir}
+  )
+endif()
+
+rapids_cython_init()
+
+include(cmake/Modules/LinkPyarrowHeaders.cmake)
+add_subdirectory(pylibcudf)
+
+if(DEFINED cython_lib_dir)
+  rapids_cython_add_rpath_entries(TARGET cudf PATHS "${cython_lib_dir}")
+endif()
diff --git a/python/pylibcudf/README.md b/python/pylibcudf/README.md
new file mode 120000
index 00000000000..fe840054137
--- /dev/null
+++ b/python/pylibcudf/README.md
@@ -0,0 +1 @@
+../../README.md
\ No newline at end of file
diff --git a/python/cudf/cmake/Modules/LinkPyarrowHeaders.cmake b/python/pylibcudf/cmake/Modules/LinkPyarrowHeaders.cmake
similarity index 100%
rename from python/cudf/cmake/Modules/LinkPyarrowHeaders.cmake
rename to python/pylibcudf/cmake/Modules/LinkPyarrowHeaders.cmake
diff --git a/python/cudf/cmake/Modules/WheelHelpers.cmake b/python/pylibcudf/cmake/Modules/WheelHelpers.cmake
similarity index 100%
rename from python/cudf/cmake/Modules/WheelHelpers.cmake
rename to python/pylibcudf/cmake/Modules/WheelHelpers.cmake
diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/pylibcudf/pylibcudf/CMakeLists.txt
similarity index 96%
rename from python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
rename to python/pylibcudf/pylibcudf/CMakeLists.txt
index da32d530928..ab21bfe97ab 100644
--- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
+++ b/python/pylibcudf/pylibcudf/CMakeLists.txt
@@ -54,7 +54,7 @@ rapids_cython_create_modules(
 )
 
 include(${rapids-cmake-dir}/export/find_package_root.cmake)
-include(../../../../../cpp/cmake/thirdparty/get_nanoarrow.cmake)
+include(../../../cpp/cmake/thirdparty/get_nanoarrow.cmake)
 target_link_libraries(pylibcudf_interop PUBLIC nanoarrow)
 
 add_subdirectory(libcudf)
diff --git a/python/pylibcudf/pylibcudf/VERSION b/python/pylibcudf/pylibcudf/VERSION
new file mode 120000
index 00000000000..d62dc733efd
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/VERSION
@@ -0,0 +1 @@
+../../../VERSION
\ No newline at end of file
diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/pylibcudf/pylibcudf/__init__.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/__init__.pxd
rename to python/pylibcudf/pylibcudf/__init__.pxd
diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/pylibcudf/pylibcudf/__init__.py
similarity index 99%
rename from python/cudf/cudf/_lib/pylibcudf/__init__.py
rename to python/pylibcudf/pylibcudf/__init__.py
index 9705eba84b1..677fdaf80d0 100644
--- a/python/cudf/cudf/_lib/pylibcudf/__init__.py
+++ b/python/pylibcudf/pylibcudf/__init__.py
@@ -12,6 +12,7 @@
     filling,
     groupby,
     interop,
+    io,
     join,
     lists,
     merge,
diff --git a/python/pylibcudf/pylibcudf/_version.py b/python/pylibcudf/pylibcudf/_version.py
new file mode 100644
index 00000000000..d2765e5d53c
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/_version.py
@@ -0,0 +1,24 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+import importlib.resources
+
+__version__ = (
+    importlib.resources.files(__package__)
+    .joinpath("VERSION")
+    .read_text()
+    .strip()
+)
+try:
+    __git_commit__ = (
+        importlib.resources.files(__package__)
+        .joinpath("GIT_COMMIT")
+        .read_text()
+        .strip()
+    )
+except FileNotFoundError:
+    __git_commit__ = ""
+
+__all__ = ["__git_commit__", "__version__"]
diff --git a/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd b/python/pylibcudf/pylibcudf/aggregation.pxd
similarity index 96%
rename from python/cudf/cudf/_lib/pylibcudf/aggregation.pxd
rename to python/pylibcudf/pylibcudf/aggregation.pxd
index 0981d0e855a..c9ab1eab21c 100644
--- a/python/cudf/cudf/_lib/pylibcudf/aggregation.pxd
+++ b/python/pylibcudf/pylibcudf/aggregation.pxd
@@ -1,8 +1,7 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.aggregation cimport (
+from pylibcudf.libcudf.aggregation cimport (
     Kind as kind_t,
     aggregation,
     correlation_type,
@@ -15,7 +14,7 @@ from cudf._lib.pylibcudf.libcudf.aggregation cimport (
     rolling_aggregation,
     scan_aggregation,
 )
-from cudf._lib.pylibcudf.libcudf.types cimport (
+from pylibcudf.libcudf.types cimport (
     interpolation,
     nan_equality,
     null_equality,
diff --git a/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx b/python/pylibcudf/pylibcudf/aggregation.pyx
similarity index 96%
rename from python/cudf/cudf/_lib/pylibcudf/aggregation.pyx
rename to python/pylibcudf/pylibcudf/aggregation.pyx
index eed2f6de585..e510b738f70 100644
--- a/python/cudf/cudf/_lib/pylibcudf/aggregation.pyx
+++ b/python/pylibcudf/pylibcudf/aggregation.pyx
@@ -4,8 +4,7 @@ from cython.operator cimport dereference
 from libcpp.cast cimport dynamic_cast
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
-
-from cudf._lib.pylibcudf.libcudf.aggregation cimport (
+from pylibcudf.libcudf.aggregation cimport (
     aggregation,
     correlation_type,
     ewm_history,
@@ -41,7 +40,7 @@ from cudf._lib.pylibcudf.libcudf.aggregation cimport (
     rolling_aggregation,
     scan_aggregation,
 )
-from cudf._lib.pylibcudf.libcudf.types cimport (
+from pylibcudf.libcudf.types cimport (
     interpolation,
     nan_equality,
     null_equality,
@@ -51,18 +50,16 @@ from cudf._lib.pylibcudf.libcudf.types cimport (
     size_type,
 )
 
-from cudf._lib.pylibcudf.libcudf.aggregation import Kind  # no-cython-lint
-from cudf._lib.pylibcudf.libcudf.aggregation import \
+from pylibcudf.libcudf.aggregation import Kind  # no-cython-lint
+from pylibcudf.libcudf.aggregation import \
     correlation_type as CorrelationType  # no-cython-lint
-from cudf._lib.pylibcudf.libcudf.aggregation import \
+from pylibcudf.libcudf.aggregation import \
     ewm_history as EWMHistory  # no-cython-lint
-from cudf._lib.pylibcudf.libcudf.aggregation import \
+from pylibcudf.libcudf.aggregation import \
     rank_method as RankMethod  # no-cython-lint
-from cudf._lib.pylibcudf.libcudf.aggregation import \
+from pylibcudf.libcudf.aggregation import \
     rank_percentage as RankPercentage  # no-cython-lint
-from cudf._lib.pylibcudf.libcudf.aggregation import (  # no-cython-lint
-    udf_type as UdfType,
-)
+from pylibcudf.libcudf.aggregation import udf_type as UdfType  # no-cython-lint
 
 from .types cimport DataType
 
@@ -71,7 +68,7 @@ cdef class Aggregation:
     """A type of aggregation to perform.
 
     Aggregations are passed to APIs like
-    :py:func:`~cudf._lib.pylibcudf.groupby.GroupBy.aggregate` to indicate what
+    :py:func:`~pylibcudf.groupby.GroupBy.aggregate` to indicate what
     operations to perform. Using a class for aggregations provides a unified
     API for handling parametrizable aggregations. This class should never be
     instantiated directly, only via one of the factory functions.
diff --git a/python/cudf/cudf/_lib/pylibcudf/binaryop.pxd b/python/pylibcudf/pylibcudf/binaryop.pxd
similarity index 90%
rename from python/cudf/cudf/_lib/pylibcudf/binaryop.pxd
rename to python/pylibcudf/pylibcudf/binaryop.pxd
index 2411e28ac66..06625e9e2db 100644
--- a/python/cudf/cudf/_lib/pylibcudf/binaryop.pxd
+++ b/python/pylibcudf/pylibcudf/binaryop.pxd
@@ -1,8 +1,7 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 from libcpp cimport bool
-
-from cudf._lib.pylibcudf.libcudf.binaryop cimport binary_operator
+from pylibcudf.libcudf.binaryop cimport binary_operator
 
 from .column cimport Column
 from .scalar cimport Scalar
diff --git a/python/cudf/cudf/_lib/pylibcudf/binaryop.pyx b/python/pylibcudf/pylibcudf/binaryop.pyx
similarity index 86%
rename from python/cudf/cudf/_lib/pylibcudf/binaryop.pyx
rename to python/pylibcudf/pylibcudf/binaryop.pyx
index 44d9f4ad04a..5a67f4d6cdb 100644
--- a/python/cudf/cudf/_lib/pylibcudf/binaryop.pyx
+++ b/python/pylibcudf/pylibcudf/binaryop.pyx
@@ -5,12 +5,11 @@ from cython.operator import dereference
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
+from pylibcudf.libcudf cimport binaryop as cpp_binaryop
+from pylibcudf.libcudf.binaryop cimport binary_operator
+from pylibcudf.libcudf.column.column cimport column
 
-from cudf._lib.pylibcudf.libcudf cimport binaryop as cpp_binaryop
-from cudf._lib.pylibcudf.libcudf.binaryop cimport binary_operator
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-
-from cudf._lib.pylibcudf.libcudf.binaryop import \
+from pylibcudf.libcudf.binaryop import \
     binary_operator as BinaryOperator  # no-cython-lint
 
 from .column cimport Column
@@ -27,9 +26,9 @@ cpdef Column binary_operation(
     """Perform a binary operation between a column and another column or scalar.
 
     ``lhs`` and ``rhs`` may be a
-    :py:class:`~cudf._lib.pylibcudf.column.Column` or a
-    :py:class:`~cudf._lib.pylibcudf.scalar.Scalar`, but at least one must be a
-    :py:class:`~cudf._lib.pylibcudf.column.Column`.
+    :py:class:`~pylibcudf.column.Column` or a
+    :py:class:`~pylibcudf.scalar.Scalar`, but at least one must be a
+    :py:class:`~pylibcudf.column.Column`.
 
     For details, see :cpp:func:`binary_operation`.
 
diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pxd b/python/pylibcudf/pylibcudf/column.pxd
similarity index 84%
rename from python/cudf/cudf/_lib/pylibcudf/column.pxd
rename to python/pylibcudf/pylibcudf/column.pxd
index 13ee0a70681..92d63e4e495 100644
--- a/python/cudf/cudf/_lib/pylibcudf/column.pxd
+++ b/python/pylibcudf/pylibcudf/column.pxd
@@ -2,16 +2,13 @@
 
 from libcpp.memory cimport unique_ptr
 from libcpp.vector cimport vector
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport (
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport (
     column_view,
     mutable_column_view,
 )
-from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport (
-    lists_column_view,
-)
-from cudf._lib.pylibcudf.libcudf.types cimport bitmask_type, size_type
+from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view
+from pylibcudf.libcudf.types cimport bitmask_type, size_type
 
 from .gpumemoryview cimport gpumemoryview
 from .types cimport DataType
diff --git a/python/cudf/cudf/_lib/pylibcudf/column.pyx b/python/pylibcudf/pylibcudf/column.pyx
similarity index 98%
rename from python/cudf/cudf/_lib/pylibcudf/column.pyx
rename to python/pylibcudf/pylibcudf/column.pyx
index 1d9902b0374..a37a12fc7e1 100644
--- a/python/cudf/cudf/_lib/pylibcudf/column.pyx
+++ b/python/pylibcudf/pylibcudf/column.pyx
@@ -3,16 +3,13 @@
 from cython.operator cimport dereference
 from libcpp.memory cimport make_unique, unique_ptr
 from libcpp.utility cimport move
+from pylibcudf.libcudf.column.column cimport column, column_contents
+from pylibcudf.libcudf.column.column_factories cimport make_column_from_scalar
+from pylibcudf.libcudf.scalar.scalar cimport scalar
+from pylibcudf.libcudf.types cimport size_type
 
 from rmm._lib.device_buffer cimport DeviceBuffer
 
-from cudf._lib.pylibcudf.libcudf.column.column cimport column, column_contents
-from cudf._lib.pylibcudf.libcudf.column.column_factories cimport (
-    make_column_from_scalar,
-)
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
-
 from .gpumemoryview cimport gpumemoryview
 from .scalar cimport Scalar
 from .types cimport DataType, size_of, type_id
diff --git a/python/cudf/cudf/_lib/pylibcudf/column_factories.pxd b/python/pylibcudf/pylibcudf/column_factories.pxd
similarity index 92%
rename from python/cudf/cudf/_lib/pylibcudf/column_factories.pxd
rename to python/pylibcudf/pylibcudf/column_factories.pxd
index 9dbd74ab16c..fef02359240 100644
--- a/python/cudf/cudf/_lib/pylibcudf/column_factories.pxd
+++ b/python/pylibcudf/pylibcudf/column_factories.pxd
@@ -1,8 +1,7 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
-
-from cudf._lib.pylibcudf.libcudf.types cimport mask_state, size_type
+from pylibcudf.libcudf.types cimport mask_state, size_type
 
 from .column cimport Column
 from .types cimport DataType, size_type, type_id
diff --git a/python/cudf/cudf/_lib/pylibcudf/column_factories.pyx b/python/pylibcudf/pylibcudf/column_factories.pyx
similarity index 96%
rename from python/cudf/cudf/_lib/pylibcudf/column_factories.pyx
rename to python/pylibcudf/pylibcudf/column_factories.pyx
index ef7f512f0e5..4601cba515a 100644
--- a/python/cudf/cudf/_lib/pylibcudf/column_factories.pyx
+++ b/python/pylibcudf/pylibcudf/column_factories.pyx
@@ -1,9 +1,8 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_factories cimport (
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_factories cimport (
     make_duration_column as cpp_make_duration_column,
     make_empty_column as cpp_make_empty_column,
     make_fixed_point_column as cpp_make_fixed_point_column,
@@ -11,7 +10,7 @@ from cudf._lib.pylibcudf.libcudf.column.column_factories cimport (
     make_numeric_column as cpp_make_numeric_column,
     make_timestamp_column as cpp_make_timestamp_column,
 )
-from cudf._lib.pylibcudf.libcudf.types cimport mask_state, size_type
+from pylibcudf.libcudf.types cimport mask_state, size_type
 
 from .types cimport DataType, type_id
 
diff --git a/python/cudf/cudf/_lib/pylibcudf/concatenate.pxd b/python/pylibcudf/pylibcudf/concatenate.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/concatenate.pxd
rename to python/pylibcudf/pylibcudf/concatenate.pxd
diff --git a/python/cudf/cudf/_lib/pylibcudf/concatenate.pyx b/python/pylibcudf/pylibcudf/concatenate.pyx
similarity index 80%
rename from python/cudf/cudf/_lib/pylibcudf/concatenate.pyx
rename to python/pylibcudf/pylibcudf/concatenate.pyx
index 5e40f921b2c..8bdcc086e0f 100644
--- a/python/cudf/cudf/_lib/pylibcudf/concatenate.pyx
+++ b/python/pylibcudf/pylibcudf/concatenate.pyx
@@ -3,12 +3,11 @@
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
-
-from cudf._lib.pylibcudf.libcudf cimport concatenate as cpp_concatenate
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.table.table cimport table
-from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
+from pylibcudf.libcudf cimport concatenate as cpp_concatenate
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.libcudf.table.table_view cimport table_view
 
 from .column cimport Column
 from .table cimport Table
diff --git a/python/cudf/cudf/_lib/pylibcudf/copying.pxd b/python/pylibcudf/pylibcudf/copying.pxd
similarity index 94%
rename from python/cudf/cudf/_lib/pylibcudf/copying.pxd
rename to python/pylibcudf/pylibcudf/copying.pxd
index 06543d3ca92..7dfed437673 100644
--- a/python/cudf/cudf/_lib/pylibcudf/copying.pxd
+++ b/python/pylibcudf/pylibcudf/copying.pxd
@@ -1,12 +1,11 @@
 # Copyright (c) 2023-2024, NVIDIA CORPORATION.
 
 from libcpp cimport bool as cbool
-
-from cudf._lib.pylibcudf.libcudf.copying cimport (
+from pylibcudf.libcudf.copying cimport (
     mask_allocation_policy,
     out_of_bounds_policy,
 )
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.types cimport size_type
 
 from .column cimport Column
 from .scalar cimport Scalar
diff --git a/python/cudf/cudf/_lib/pylibcudf/copying.pyx b/python/pylibcudf/pylibcudf/copying.pyx
similarity index 96%
rename from python/cudf/cudf/_lib/pylibcudf/copying.pyx
rename to python/pylibcudf/pylibcudf/copying.pyx
index 2d59deb3864..9743119d92a 100644
--- a/python/cudf/cudf/_lib/pylibcudf/copying.pyx
+++ b/python/pylibcudf/pylibcudf/copying.pyx
@@ -6,29 +6,28 @@ from libcpp.functional cimport reference_wrapper
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
-
 # TODO: We want to make cpp a more full-featured package so that we can access
 # directly from that. It will make namespacing much cleaner in pylibcudf. What
 # we really want here would be
 # cimport libcudf... libcudf.copying.algo(...)
-from cudf._lib.pylibcudf.libcudf cimport copying as cpp_copying
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport (
+from pylibcudf.libcudf cimport copying as cpp_copying
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport (
     column_view,
     mutable_column_view,
 )
-from cudf._lib.pylibcudf.libcudf.copying cimport (
+from pylibcudf.libcudf.copying cimport (
     mask_allocation_policy,
     out_of_bounds_policy,
 )
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
-from cudf._lib.pylibcudf.libcudf.table.table cimport table
-from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.scalar.scalar cimport scalar
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.libcudf.table.table_view cimport table_view
+from pylibcudf.libcudf.types cimport size_type
 
-from cudf._lib.pylibcudf.libcudf.copying import \
+from pylibcudf.libcudf.copying import \
     mask_allocation_policy as MaskAllocationPolicy  # no-cython-lint
-from cudf._lib.pylibcudf.libcudf.copying import \
+from pylibcudf.libcudf.copying import \
     out_of_bounds_policy as OutOfBoundsPolicy  # no-cython-lint
 
 from .column cimport Column
diff --git a/python/cudf/cudf/_lib/pylibcudf/datetime.pxd b/python/pylibcudf/pylibcudf/datetime.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/datetime.pxd
rename to python/pylibcudf/pylibcudf/datetime.pxd
diff --git a/python/cudf/cudf/_lib/pylibcudf/datetime.pyx b/python/pylibcudf/pylibcudf/datetime.pyx
similarity index 78%
rename from python/cudf/cudf/_lib/pylibcudf/datetime.pyx
rename to python/pylibcudf/pylibcudf/datetime.pyx
index 82351327de6..0ddc68bcb9d 100644
--- a/python/cudf/cudf/_lib/pylibcudf/datetime.pyx
+++ b/python/pylibcudf/pylibcudf/datetime.pyx
@@ -1,11 +1,8 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.datetime cimport (
-    extract_year as cpp_extract_year,
-)
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.datetime cimport extract_year as cpp_extract_year
 
 from .column cimport Column
 
diff --git a/python/cudf/cudf/_lib/pylibcudf/exception_handler.pxd b/python/pylibcudf/pylibcudf/exception_handler.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/exception_handler.pxd
rename to python/pylibcudf/pylibcudf/exception_handler.pxd
diff --git a/python/cudf/cudf/_lib/pylibcudf/experimental.pxd b/python/pylibcudf/pylibcudf/experimental.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/experimental.pxd
rename to python/pylibcudf/pylibcudf/experimental.pxd
diff --git a/python/cudf/cudf/_lib/pylibcudf/experimental.pyx b/python/pylibcudf/pylibcudf/experimental.pyx
similarity index 92%
rename from python/cudf/cudf/_lib/pylibcudf/experimental.pyx
rename to python/pylibcudf/pylibcudf/experimental.pyx
index 1e2a682d879..b25a53e13b2 100644
--- a/python/cudf/cudf/_lib/pylibcudf/experimental.pyx
+++ b/python/pylibcudf/pylibcudf/experimental.pyx
@@ -2,8 +2,7 @@
 
 from libcpp cimport bool
 from libcpp.string cimport string
-
-from cudf._lib.pylibcudf.libcudf cimport experimental as cpp_experimental
+from pylibcudf.libcudf cimport experimental as cpp_experimental
 
 
 cpdef enable_prefetching(str key):
diff --git a/python/cudf/cudf/_lib/pylibcudf/expressions.pxd b/python/pylibcudf/pylibcudf/expressions.pxd
similarity index 91%
rename from python/cudf/cudf/_lib/pylibcudf/expressions.pxd
rename to python/pylibcudf/pylibcudf/expressions.pxd
index 64825b89d9f..65660b7c449 100644
--- a/python/cudf/cudf/_lib/pylibcudf/expressions.pxd
+++ b/python/pylibcudf/pylibcudf/expressions.pxd
@@ -1,8 +1,7 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
-
-from cudf._lib.pylibcudf.libcudf.expressions cimport (
+from pylibcudf.libcudf.expressions cimport (
     ast_operator,
     expression,
     table_reference,
diff --git a/python/cudf/cudf/_lib/pylibcudf/expressions.pyx b/python/pylibcudf/pylibcudf/expressions.pyx
similarity index 94%
rename from python/cudf/cudf/_lib/pylibcudf/expressions.pyx
rename to python/pylibcudf/pylibcudf/expressions.pyx
index b983a617533..a44c9e25987 100644
--- a/python/cudf/cudf/_lib/pylibcudf/expressions.pyx
+++ b/python/pylibcudf/pylibcudf/expressions.pyx
@@ -1,7 +1,7 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
-from cudf._lib.pylibcudf.libcudf.expressions import \
+from pylibcudf.libcudf.expressions import \
     ast_operator as ASTOperator  # no-cython-lint
-from cudf._lib.pylibcudf.libcudf.expressions import \
+from pylibcudf.libcudf.expressions import \
     table_reference as TableReference  # no-cython-lint
 
 from cython.operator cimport dereference
@@ -9,22 +9,21 @@ from libc.stdint cimport int32_t, int64_t
 from libcpp.memory cimport make_unique, unique_ptr
 from libcpp.string cimport string
 from libcpp.utility cimport move
-
-from cudf._lib.pylibcudf.libcudf cimport expressions as libcudf_exp
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport (
+from pylibcudf.libcudf cimport expressions as libcudf_exp
+from pylibcudf.libcudf.scalar.scalar cimport (
     duration_scalar,
     numeric_scalar,
     string_scalar,
     timestamp_scalar,
 )
-from cudf._lib.pylibcudf.libcudf.types cimport size_type, type_id
-from cudf._lib.pylibcudf.libcudf.wrappers.durations cimport (
+from pylibcudf.libcudf.types cimport size_type, type_id
+from pylibcudf.libcudf.wrappers.durations cimport (
     duration_ms,
     duration_ns,
     duration_s,
     duration_us,
 )
-from cudf._lib.pylibcudf.libcudf.wrappers.timestamps cimport (
+from pylibcudf.libcudf.wrappers.timestamps cimport (
     timestamp_ms,
     timestamp_ns,
     timestamp_s,
diff --git a/python/cudf/cudf/_lib/pylibcudf/filling.pxd b/python/pylibcudf/pylibcudf/filling.pxd
similarity index 90%
rename from python/cudf/cudf/_lib/pylibcudf/filling.pxd
rename to python/pylibcudf/pylibcudf/filling.pxd
index 3560ebf2ea2..b9345f8cd42 100644
--- a/python/cudf/cudf/_lib/pylibcudf/filling.pxd
+++ b/python/pylibcudf/pylibcudf/filling.pxd
@@ -1,5 +1,5 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.types cimport size_type
 
 from .column cimport Column
 from .scalar cimport Scalar
diff --git a/python/cudf/cudf/_lib/pylibcudf/filling.pyx b/python/pylibcudf/pylibcudf/filling.pyx
similarity index 94%
rename from python/cudf/cudf/_lib/pylibcudf/filling.pyx
rename to python/pylibcudf/pylibcudf/filling.pyx
index 05f67681428..61b430e64aa 100644
--- a/python/cudf/cudf/_lib/pylibcudf/filling.pyx
+++ b/python/pylibcudf/pylibcudf/filling.pyx
@@ -3,16 +3,15 @@
 from cython.operator cimport dereference
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.filling cimport (
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.filling cimport (
     fill as cpp_fill,
     fill_in_place as cpp_fill_in_place,
     repeat as cpp_repeat,
     sequence as cpp_sequence,
 )
-from cudf._lib.pylibcudf.libcudf.table.table cimport table
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.libcudf.types cimport size_type
 
 from .column cimport Column
 from .scalar cimport Scalar
diff --git a/python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pxd b/python/pylibcudf/pylibcudf/gpumemoryview.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pxd
rename to python/pylibcudf/pylibcudf/gpumemoryview.pxd
diff --git a/python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pyx b/python/pylibcudf/pylibcudf/gpumemoryview.pyx
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pyx
rename to python/pylibcudf/pylibcudf/gpumemoryview.pyx
diff --git a/python/cudf/cudf/_lib/pylibcudf/groupby.pxd b/python/pylibcudf/pylibcudf/groupby.pxd
similarity index 87%
rename from python/cudf/cudf/_lib/pylibcudf/groupby.pxd
rename to python/pylibcudf/pylibcudf/groupby.pxd
index eaa05c26986..79af2f1b746 100644
--- a/python/cudf/cudf/_lib/pylibcudf/groupby.pxd
+++ b/python/pylibcudf/pylibcudf/groupby.pxd
@@ -3,20 +3,19 @@
 from libcpp.memory cimport unique_ptr
 from libcpp.pair cimport pair
 from libcpp.vector cimport vector
-
-from cudf._lib.pylibcudf.libcudf.aggregation cimport (
+from pylibcudf.libcudf.aggregation cimport (
     aggregation,
     groupby_aggregation,
     groupby_scan_aggregation,
 )
-from cudf._lib.pylibcudf.libcudf.groupby cimport (
+from pylibcudf.libcudf.groupby cimport (
     aggregation_request,
     aggregation_result,
     groupby,
     scan_request,
 )
-from cudf._lib.pylibcudf.libcudf.table.table cimport table
-from cudf._lib.pylibcudf.libcudf.types cimport null_order, order
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.libcudf.types cimport null_order, order
 
 from .column cimport Column
 from .table cimport Table
diff --git a/python/cudf/cudf/_lib/pylibcudf/groupby.pyx b/python/pylibcudf/pylibcudf/groupby.pyx
similarity index 96%
rename from python/cudf/cudf/_lib/pylibcudf/groupby.pyx
rename to python/pylibcudf/pylibcudf/groupby.pyx
index f5bb46ca6a2..ae5d33aaa46 100644
--- a/python/cudf/cudf/_lib/pylibcudf/groupby.pyx
+++ b/python/pylibcudf/pylibcudf/groupby.pyx
@@ -6,18 +6,17 @@ from libcpp.memory cimport make_unique, unique_ptr
 from libcpp.pair cimport pair
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
-
-from cudf._lib.pylibcudf.libcudf.groupby cimport (
+from pylibcudf.libcudf.groupby cimport (
     aggregation_request,
     aggregation_result,
     groupby,
     groups,
     scan_request,
 )
-from cudf._lib.pylibcudf.libcudf.replace cimport replace_policy
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
-from cudf._lib.pylibcudf.libcudf.table.table cimport table
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.replace cimport replace_policy
+from pylibcudf.libcudf.scalar.scalar cimport scalar
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.libcudf.types cimport size_type
 
 from .aggregation cimport Aggregation
 from .column cimport Column
@@ -156,7 +155,7 @@ cdef class GroupBy:
         Parameters
         ----------
         requests : List[GroupByRequest]
-            The list of `~.cudf._lib.pylibcudf.groupby.GroupByRequest` , each
+            The list of `~.pylibcudf.groupby.GroupByRequest` , each
             representing a set of aggregations to perform on a given column of values.
 
         Returns
@@ -188,7 +187,7 @@ cdef class GroupBy:
         Parameters
         ----------
         requests : List[GroupByRequest]
-            The list of `~.cudf._lib.pylibcudf.groupby.GroupByRequest` , each
+            The list of `~.pylibcudf.groupby.GroupByRequest` , each
             representing a set of aggregations to perform on a given column of values.
 
         Returns
diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pyx b/python/pylibcudf/pylibcudf/interop.pyx
similarity index 98%
rename from python/cudf/cudf/_lib/pylibcudf/interop.pyx
rename to python/pylibcudf/pylibcudf/interop.pyx
index caa19724786..d54e5b7ba1f 100644
--- a/python/cudf/cudf/_lib/pylibcudf/interop.pyx
+++ b/python/pylibcudf/pylibcudf/interop.pyx
@@ -11,8 +11,8 @@ from functools import singledispatch
 
 from pyarrow import lib as pa
 
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.interop cimport (
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.interop cimport (
     ArrowArray,
     ArrowArrayStream,
     ArrowSchema,
@@ -22,7 +22,7 @@ from cudf._lib.pylibcudf.libcudf.interop cimport (
     to_arrow_host_raw,
     to_arrow_schema_raw,
 )
-from cudf._lib.pylibcudf.libcudf.table.table cimport table
+from pylibcudf.libcudf.table.table cimport table
 
 from . cimport copying
 from .column cimport Column
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt b/python/pylibcudf/pylibcudf/io/CMakeLists.txt
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/io/CMakeLists.txt
rename to python/pylibcudf/pylibcudf/io/CMakeLists.txt
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd b/python/pylibcudf/pylibcudf/io/__init__.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/io/__init__.pxd
rename to python/pylibcudf/pylibcudf/io/__init__.pxd
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/__init__.py b/python/pylibcudf/pylibcudf/io/__init__.py
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/io/__init__.py
rename to python/pylibcudf/pylibcudf/io/__init__.py
diff --git a/python/pylibcudf/pylibcudf/io/avro.pxd b/python/pylibcudf/pylibcudf/io/avro.pxd
new file mode 100644
index 00000000000..8696fcb3c15
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/io/avro.pxd
@@ -0,0 +1,12 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from pylibcudf.io.types cimport SourceInfo, TableWithMetadata
+from pylibcudf.libcudf.io.avro cimport avro_reader_options
+from pylibcudf.libcudf.types cimport size_type
+
+
+cpdef TableWithMetadata read_avro(
+    SourceInfo source_info,
+    list columns = *,
+    size_type skip_rows = *,
+    size_type num_rows = *
+)
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx b/python/pylibcudf/pylibcudf/io/avro.pyx
similarity index 89%
rename from python/cudf/cudf/_lib/pylibcudf/io/avro.pyx
rename to python/pylibcudf/pylibcudf/io/avro.pyx
index 538bd8aa322..667c67f4c36 100644
--- a/python/cudf/cudf/_lib/pylibcudf/io/avro.pyx
+++ b/python/pylibcudf/pylibcudf/io/avro.pyx
@@ -3,13 +3,12 @@
 from libcpp.string cimport string
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
-
-from cudf._lib.pylibcudf.io.types cimport SourceInfo, TableWithMetadata
-from cudf._lib.pylibcudf.libcudf.io.avro cimport (
+from pylibcudf.io.types cimport SourceInfo, TableWithMetadata
+from pylibcudf.libcudf.io.avro cimport (
     avro_reader_options,
     read_avro as cpp_read_avro,
 )
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.types cimport size_type
 
 
 cpdef TableWithMetadata read_avro(
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/csv.pyx b/python/pylibcudf/pylibcudf/io/csv.pyx
similarity index 97%
rename from python/cudf/cudf/_lib/pylibcudf/io/csv.pyx
rename to python/pylibcudf/pylibcudf/io/csv.pyx
index e9efb5befee..b53d6771cd6 100644
--- a/python/cudf/cudf/_lib/pylibcudf/io/csv.pyx
+++ b/python/pylibcudf/pylibcudf/io/csv.pyx
@@ -5,19 +5,18 @@ from libcpp.map cimport map
 from libcpp.string cimport string
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
-
-from cudf._lib.pylibcudf.io.types cimport SourceInfo, TableWithMetadata
-from cudf._lib.pylibcudf.libcudf.io.csv cimport (
+from pylibcudf.io.types cimport SourceInfo, TableWithMetadata
+from pylibcudf.libcudf.io.csv cimport (
     csv_reader_options,
     read_csv as cpp_read_csv,
 )
-from cudf._lib.pylibcudf.libcudf.io.types cimport (
+from pylibcudf.libcudf.io.types cimport (
     compression_type,
     quote_style,
     table_with_metadata,
 )
-from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type
-from cudf._lib.pylibcudf.types cimport DataType
+from pylibcudf.libcudf.types cimport data_type, size_type
+from pylibcudf.types cimport DataType
 
 
 cdef tuple _process_parse_dates_hex(list cols):
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/datasource.pxd b/python/pylibcudf/pylibcudf/io/datasource.pxd
similarity index 69%
rename from python/cudf/cudf/_lib/pylibcudf/io/datasource.pxd
rename to python/pylibcudf/pylibcudf/io/datasource.pxd
index a0a9c3fa0d4..05c03dceee2 100644
--- a/python/cudf/cudf/_lib/pylibcudf/io/datasource.pxd
+++ b/python/pylibcudf/pylibcudf/io/datasource.pxd
@@ -1,9 +1,8 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport shared_ptr
-
-from cudf._lib.pylibcudf.libcudf.io.arrow_io_source cimport arrow_io_source
-from cudf._lib.pylibcudf.libcudf.io.datasource cimport datasource
+from pylibcudf.libcudf.io.arrow_io_source cimport arrow_io_source
+from pylibcudf.libcudf.io.datasource cimport datasource
 
 
 cdef class Datasource:
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/datasource.pyx b/python/pylibcudf/pylibcudf/io/datasource.pyx
similarity index 87%
rename from python/cudf/cudf/_lib/pylibcudf/io/datasource.pyx
rename to python/pylibcudf/pylibcudf/io/datasource.pyx
index 8f265f585de..6cc509b74cb 100644
--- a/python/cudf/cudf/_lib/pylibcudf/io/datasource.pyx
+++ b/python/pylibcudf/pylibcudf/io/datasource.pyx
@@ -3,9 +3,8 @@
 from libcpp.memory cimport shared_ptr
 from pyarrow.includes.libarrow cimport CRandomAccessFile
 from pyarrow.lib cimport NativeFile
-
-from cudf._lib.pylibcudf.libcudf.io.arrow_io_source cimport arrow_io_source
-from cudf._lib.pylibcudf.libcudf.io.datasource cimport datasource
+from pylibcudf.libcudf.io.arrow_io_source cimport arrow_io_source
+from pylibcudf.libcudf.io.datasource cimport datasource
 
 import warnings
 
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/json.pxd b/python/pylibcudf/pylibcudf/io/json.pxd
similarity index 85%
rename from python/cudf/cudf/_lib/pylibcudf/io/json.pxd
rename to python/pylibcudf/pylibcudf/io/json.pxd
index 2e0e92a054f..ab9b5b99ce2 100644
--- a/python/cudf/cudf/_lib/pylibcudf/io/json.pxd
+++ b/python/pylibcudf/pylibcudf/io/json.pxd
@@ -1,14 +1,13 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 from libcpp cimport bool
-
-from cudf._lib.pylibcudf.io.types cimport (
+from pylibcudf.io.types cimport (
     SinkInfo,
     SourceInfo,
     TableWithMetadata,
     compression_type,
 )
-from cudf._lib.pylibcudf.libcudf.io.json cimport json_recovery_mode_t
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.io.json cimport json_recovery_mode_t
+from pylibcudf.libcudf.types cimport size_type
 
 
 cpdef TableWithMetadata read_json(
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/json.pyx b/python/pylibcudf/pylibcudf/io/json.pyx
similarity index 95%
rename from python/cudf/cudf/_lib/pylibcudf/io/json.pyx
rename to python/pylibcudf/pylibcudf/io/json.pyx
index 2710ee60075..ce086f4a489 100644
--- a/python/cudf/cudf/_lib/pylibcudf/io/json.pyx
+++ b/python/pylibcudf/pylibcudf/io/json.pyx
@@ -5,14 +5,9 @@ from libcpp.map cimport map
 from libcpp.string cimport string
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
-
-from cudf._lib.pylibcudf.concatenate cimport concatenate
-from cudf._lib.pylibcudf.io.types cimport (
-    SinkInfo,
-    SourceInfo,
-    TableWithMetadata,
-)
-from cudf._lib.pylibcudf.libcudf.io.json cimport (
+from pylibcudf.concatenate cimport concatenate
+from pylibcudf.io.types cimport SinkInfo, SourceInfo, TableWithMetadata
+from pylibcudf.libcudf.io.json cimport (
     json_reader_options,
     json_recovery_mode_t,
     json_writer_options,
@@ -20,13 +15,13 @@ from cudf._lib.pylibcudf.libcudf.io.json cimport (
     schema_element,
     write_json as cpp_write_json,
 )
-from cudf._lib.pylibcudf.libcudf.io.types cimport (
+from pylibcudf.libcudf.io.types cimport (
     compression_type,
     table_metadata,
     table_with_metadata,
 )
-from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type
-from cudf._lib.pylibcudf.types cimport DataType
+from pylibcudf.libcudf.types cimport data_type, size_type
+from pylibcudf.types cimport DataType
 
 
 cdef map[string, schema_element] _generate_schema_map(list dtypes):
@@ -270,7 +265,7 @@ cpdef void write_json(
     str false_value = "false"
 ):
     """
-    Writes a :py:class:`~cudf._lib.pylibcudf.table.Table` to JSON format.
+    Writes a :py:class:`~pylibcudf.table.Table` to JSON format.
 
     Parameters
     ----------
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/parquet.pxd b/python/pylibcudf/pylibcudf/io/parquet.pxd
similarity index 72%
rename from python/cudf/cudf/_lib/pylibcudf/io/parquet.pxd
rename to python/pylibcudf/pylibcudf/io/parquet.pxd
index 93ef849b813..47458b00159 100644
--- a/python/cudf/cudf/_lib/pylibcudf/io/parquet.pxd
+++ b/python/pylibcudf/pylibcudf/io/parquet.pxd
@@ -3,14 +3,13 @@
 from libc.stdint cimport int64_t
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.expressions cimport Expression
-from cudf._lib.pylibcudf.io.types cimport SourceInfo, TableWithMetadata
-from cudf._lib.pylibcudf.libcudf.io.parquet cimport (
+from pylibcudf.expressions cimport Expression
+from pylibcudf.io.types cimport SourceInfo, TableWithMetadata
+from pylibcudf.libcudf.io.parquet cimport (
     chunked_parquet_reader as cpp_chunked_parquet_reader,
 )
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
-from cudf._lib.pylibcudf.types cimport DataType
+from pylibcudf.libcudf.types cimport size_type
+from pylibcudf.types cimport DataType
 
 
 cdef class ChunkedParquetReader:
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/parquet.pyx b/python/pylibcudf/pylibcudf/io/parquet.pyx
similarity index 93%
rename from python/cudf/cudf/_lib/pylibcudf/io/parquet.pyx
rename to python/pylibcudf/pylibcudf/io/parquet.pyx
index 84a79f9565f..fb5244a2a9e 100644
--- a/python/cudf/cudf/_lib/pylibcudf/io/parquet.pyx
+++ b/python/pylibcudf/pylibcudf/io/parquet.pyx
@@ -5,17 +5,16 @@ from libcpp cimport bool
 from libcpp.string cimport string
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
-
-from cudf._lib.pylibcudf.expressions cimport Expression
-from cudf._lib.pylibcudf.io.types cimport SourceInfo, TableWithMetadata
-from cudf._lib.pylibcudf.libcudf.expressions cimport expression
-from cudf._lib.pylibcudf.libcudf.io.parquet cimport (
+from pylibcudf.expressions cimport Expression
+from pylibcudf.io.types cimport SourceInfo, TableWithMetadata
+from pylibcudf.libcudf.expressions cimport expression
+from pylibcudf.libcudf.io.parquet cimport (
     chunked_parquet_reader as cpp_chunked_parquet_reader,
     parquet_reader_options,
     read_parquet as cpp_read_parquet,
 )
-from cudf._lib.pylibcudf.libcudf.io.types cimport table_with_metadata
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.io.types cimport table_with_metadata
+from pylibcudf.libcudf.types cimport size_type
 
 
 cdef parquet_reader_options _setup_parquet_reader_options(
@@ -169,7 +168,7 @@ cpdef read_parquet(
     row_groups : list[list[size_type]], default None
         List of row groups to be read.
     filters : Expression, default None
-        An AST :py:class:`cudf._lib.pylibcudf.expressions.Expression`
+        An AST :py:class:`pylibcudf.expressions.Expression`
         to use for predicate pushdown.
     convert_strings_to_categories : bool, default False
         Whether to convert string columns to the category type
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pxd b/python/pylibcudf/pylibcudf/io/types.pxd
similarity index 87%
rename from python/cudf/cudf/_lib/pylibcudf/io/types.pxd
rename to python/pylibcudf/pylibcudf/io/types.pxd
index 0094bf6032c..0ab28cb0973 100644
--- a/python/cudf/cudf/_lib/pylibcudf/io/types.pxd
+++ b/python/pylibcudf/pylibcudf/io/types.pxd
@@ -1,9 +1,8 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 from libcpp.memory cimport unique_ptr
 from libcpp.vector cimport vector
-
-from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink
-from cudf._lib.pylibcudf.libcudf.io.types cimport (
+from pylibcudf.libcudf.io.data_sink cimport data_sink
+from pylibcudf.libcudf.io.types cimport (
     column_encoding,
     column_in_metadata,
     column_name_info,
@@ -19,7 +18,7 @@ from cudf._lib.pylibcudf.libcudf.io.types cimport (
     table_metadata,
     table_with_metadata,
 )
-from cudf._lib.pylibcudf.table cimport Table
+from pylibcudf.table cimport Table
 
 
 cdef class TableWithMetadata:
diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx b/python/pylibcudf/pylibcudf/io/types.pyx
similarity index 96%
rename from python/cudf/cudf/_lib/pylibcudf/io/types.pyx
rename to python/pylibcudf/pylibcudf/io/types.pyx
index 95fa7d4c2ee..1600a805b37 100644
--- a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx
+++ b/python/pylibcudf/pylibcudf/io/types.pyx
@@ -6,11 +6,10 @@ from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
-
-from cudf._lib.pylibcudf.io.datasource cimport Datasource
-from cudf._lib.pylibcudf.libcudf.io.data_sink cimport data_sink
-from cudf._lib.pylibcudf.libcudf.io.datasource cimport datasource
-from cudf._lib.pylibcudf.libcudf.io.types cimport (
+from pylibcudf.io.datasource cimport Datasource
+from pylibcudf.libcudf.io.data_sink cimport data_sink
+from pylibcudf.libcudf.io.datasource cimport datasource
+from pylibcudf.libcudf.io.types cimport (
     column_name_info,
     host_buffer,
     source_info,
@@ -22,9 +21,9 @@ import errno
 import io
 import os
 
-from cudf._lib.pylibcudf.libcudf.io.json import \
+from pylibcudf.libcudf.io.json import \
     json_recovery_mode_t as JSONRecoveryMode  # no-cython-lint
-from cudf._lib.pylibcudf.libcudf.io.types import \
+from pylibcudf.libcudf.io.types import \
     compression_type as CompressionType  # no-cython-lint
 
 
diff --git a/python/cudf/cudf/_lib/pylibcudf/join.pxd b/python/pylibcudf/pylibcudf/join.pxd
similarity index 91%
rename from python/cudf/cudf/_lib/pylibcudf/join.pxd
rename to python/pylibcudf/pylibcudf/join.pxd
index 83b4776c16e..06969b4a2db 100644
--- a/python/cudf/cudf/_lib/pylibcudf/join.pxd
+++ b/python/pylibcudf/pylibcudf/join.pxd
@@ -1,6 +1,6 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from cudf._lib.pylibcudf.libcudf.types cimport null_equality
+from pylibcudf.libcudf.types cimport null_equality
 
 from .column cimport Column
 from .table cimport Table
diff --git a/python/cudf/cudf/_lib/pylibcudf/join.pyx b/python/pylibcudf/pylibcudf/join.pyx
similarity index 95%
rename from python/cudf/cudf/_lib/pylibcudf/join.pyx
rename to python/pylibcudf/pylibcudf/join.pyx
index 2ded84d84d1..25664286f19 100644
--- a/python/cudf/cudf/_lib/pylibcudf/join.pyx
+++ b/python/pylibcudf/pylibcudf/join.pyx
@@ -4,14 +4,13 @@ from cython.operator import dereference
 
 from libcpp.memory cimport make_unique, unique_ptr
 from libcpp.utility cimport move
+from pylibcudf.libcudf cimport join as cpp_join
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.libcudf.types cimport null_equality
 
 from rmm._lib.device_buffer cimport device_buffer
 
-from cudf._lib.pylibcudf.libcudf cimport join as cpp_join
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.table.table cimport table
-from cudf._lib.pylibcudf.libcudf.types cimport null_equality
-
 from .column cimport Column
 from .table cimport Table
 
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt b/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/CMakeLists.txt
rename to python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/__init__.pxd b/python/pylibcudf/pylibcudf/libcudf/__init__.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/__init__.pxd
rename to python/pylibcudf/pylibcudf/libcudf/__init__.pxd
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/__init__.py b/python/pylibcudf/pylibcudf/libcudf/__init__.py
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/__init__.py
rename to python/pylibcudf/pylibcudf/libcudf/__init__.py
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/aggregation.pxd b/python/pylibcudf/pylibcudf/libcudf/aggregation.pxd
similarity index 98%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/aggregation.pxd
rename to python/pylibcudf/pylibcudf/libcudf/aggregation.pxd
index fe04db52094..58c579b86de 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/aggregation.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/aggregation.pxd
@@ -5,8 +5,7 @@ from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 from libcpp.vector cimport vector
-
-from cudf._lib.pylibcudf.libcudf.types cimport (
+from pylibcudf.libcudf.types cimport (
     data_type,
     interpolation,
     nan_equality,
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/aggregation.pyx b/python/pylibcudf/pylibcudf/libcudf/aggregation.pyx
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/aggregation.pyx
rename to python/pylibcudf/pylibcudf/libcudf/aggregation.pyx
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/binaryop.pxd b/python/pylibcudf/pylibcudf/libcudf/binaryop.pxd
similarity index 85%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/binaryop.pxd
rename to python/pylibcudf/pylibcudf/libcudf/binaryop.pxd
index 78da5980db4..d39767b4aa8 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/binaryop.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/binaryop.pxd
@@ -4,12 +4,11 @@ from libc.stdint cimport int32_t
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
-
-from cudf._lib.pylibcudf.exception_handler cimport libcudf_exception_handler
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
-from cudf._lib.pylibcudf.libcudf.types cimport data_type
+from pylibcudf.exception_handler cimport libcudf_exception_handler
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.scalar.scalar cimport scalar
+from pylibcudf.libcudf.types cimport data_type
 
 
 cdef extern from "cudf/binaryop.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/binaryop.pyx b/python/pylibcudf/pylibcudf/libcudf/binaryop.pyx
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/binaryop.pyx
rename to python/pylibcudf/pylibcudf/libcudf/binaryop.pyx
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/column/__init__.pxd b/python/pylibcudf/pylibcudf/libcudf/column/__init__.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/column/__init__.pxd
rename to python/pylibcudf/pylibcudf/libcudf/column/__init__.pxd
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/column/__init__.py b/python/pylibcudf/pylibcudf/libcudf/column/__init__.py
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/column/__init__.py
rename to python/pylibcudf/pylibcudf/libcudf/column/__init__.py
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/column/column.pxd b/python/pylibcudf/pylibcudf/libcudf/column/column.pxd
similarity index 87%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/column/column.pxd
rename to python/pylibcudf/pylibcudf/libcudf/column/column.pxd
index dd184d31cc6..7a369701bbd 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/column/column.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/column/column.pxd
@@ -3,14 +3,13 @@
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.vector cimport vector
-
-from rmm._lib.device_buffer cimport device_buffer
-
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport (
+from pylibcudf.libcudf.column.column_view cimport (
     column_view,
     mutable_column_view,
 )
-from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type
+from pylibcudf.libcudf.types cimport data_type, size_type
+
+from rmm._lib.device_buffer cimport device_buffer
 
 
 cdef extern from "cudf/column/column.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/column/column_factories.pxd b/python/pylibcudf/pylibcudf/libcudf/column/column_factories.pxd
similarity index 93%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/column/column_factories.pxd
rename to python/pylibcudf/pylibcudf/libcudf/column/column_factories.pxd
index 2faff21a77b..f1a326bcd40 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/column/column_factories.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/column/column_factories.pxd
@@ -1,12 +1,9 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
-
-from rmm._lib.device_buffer cimport device_buffer
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
-from cudf._lib.pylibcudf.libcudf.types cimport (
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.scalar.scalar cimport scalar
+from pylibcudf.libcudf.types cimport (
     bitmask_type,
     data_type,
     mask_state,
@@ -14,6 +11,8 @@ from cudf._lib.pylibcudf.libcudf.types cimport (
     type_id,
 )
 
+from rmm._lib.device_buffer cimport device_buffer
+
 
 cdef extern from "cudf/column/column_factories.hpp" namespace "cudf" nogil:
     cdef unique_ptr[column] make_numeric_column(data_type type,
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/column/column_view.pxd b/python/pylibcudf/pylibcudf/libcudf/column/column_view.pxd
similarity index 97%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/column/column_view.pxd
rename to python/pylibcudf/pylibcudf/libcudf/column/column_view.pxd
index c6403babe89..c0e971eb5bd 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/column/column_view.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/column/column_view.pxd
@@ -2,12 +2,7 @@
 
 from libcpp cimport bool
 from libcpp.vector cimport vector
-
-from cudf._lib.pylibcudf.libcudf.types cimport (
-    bitmask_type,
-    data_type,
-    size_type,
-)
+from pylibcudf.libcudf.types cimport bitmask_type, data_type, size_type
 
 
 cdef extern from "cudf/column/column_view.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/concatenate.pxd b/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd
similarity index 77%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/concatenate.pxd
rename to python/pylibcudf/pylibcudf/libcudf/concatenate.pxd
index 0c362390ff2..92f5a185a54 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/concatenate.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/concatenate.pxd
@@ -2,13 +2,12 @@
 
 from libcpp.memory cimport unique_ptr
 from libcpp.vector cimport vector
+from pylibcudf.libcudf.column.column cimport column, column_view
+from pylibcudf.libcudf.table.table cimport table, table_view
+from pylibcudf.libcudf.utilities.host_span cimport host_span
 
 from rmm._lib.device_buffer cimport device_buffer
 
-from cudf._lib.pylibcudf.libcudf.column.column cimport column, column_view
-from cudf._lib.pylibcudf.libcudf.table.table cimport table, table_view
-from cudf._lib.pylibcudf.libcudf.utilities.host_span cimport host_span
-
 
 cdef extern from "cudf/concatenate.hpp" namespace "cudf" nogil:
     # The versions of concatenate taking vectors don't exist in libcudf
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/contiguous_split.pxd b/python/pylibcudf/pylibcudf/libcudf/contiguous_split.pxd
similarity index 85%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/contiguous_split.pxd
rename to python/pylibcudf/pylibcudf/libcudf/contiguous_split.pxd
index b06feacb016..cadac6a0022 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/contiguous_split.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/contiguous_split.pxd
@@ -3,12 +3,11 @@
 from libc.stdint cimport uint8_t
 from libcpp.memory cimport unique_ptr
 from libcpp.vector cimport vector
+from pylibcudf.libcudf.table.table_view cimport table_view
+from pylibcudf.libcudf.types cimport size_type
 
 from rmm._lib.device_buffer cimport device_buffer
 
-from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
-
 
 cdef extern from "cudf/contiguous_split.hpp" namespace "cudf" nogil:
     cdef cppclass packed_columns:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/copying.pxd b/python/pylibcudf/pylibcudf/libcudf/copying.pxd
similarity index 90%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/copying.pxd
rename to python/pylibcudf/pylibcudf/libcudf/copying.pxd
index af3a16ad01b..4d4a4ba9b89 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/copying.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/copying.pxd
@@ -5,19 +5,18 @@ from libcpp cimport bool
 from libcpp.functional cimport reference_wrapper
 from libcpp.memory cimport unique_ptr
 from libcpp.vector cimport vector
-
-from rmm._lib.device_buffer cimport device_buffer
-
-from cudf._lib.pylibcudf.exception_handler cimport libcudf_exception_handler
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport (
+from pylibcudf.exception_handler cimport libcudf_exception_handler
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport (
     column_view,
     mutable_column_view,
 )
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
-from cudf._lib.pylibcudf.libcudf.table.table cimport table
-from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.scalar.scalar cimport scalar
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.libcudf.table.table_view cimport table_view
+from pylibcudf.libcudf.types cimport size_type
+
+from rmm._lib.device_buffer cimport device_buffer
 
 ctypedef const scalar constscalar
 
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/copying.pyx b/python/pylibcudf/pylibcudf/libcudf/copying.pyx
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/copying.pyx
rename to python/pylibcudf/pylibcudf/libcudf/copying.pyx
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/datetime.pxd b/python/pylibcudf/pylibcudf/libcudf/datetime.pxd
similarity index 92%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/datetime.pxd
rename to python/pylibcudf/pylibcudf/libcudf/datetime.pxd
index 7db77b9c7c5..a4465343197 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/datetime.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/datetime.pxd
@@ -1,10 +1,9 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.scalar.scalar cimport scalar
 
 
 cdef extern from "cudf/datetime.hpp" namespace "cudf::datetime" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/experimental.pxd b/python/pylibcudf/pylibcudf/libcudf/experimental.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/experimental.pxd
rename to python/pylibcudf/pylibcudf/libcudf/experimental.pxd
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/expressions.pxd b/python/pylibcudf/pylibcudf/libcudf/expressions.pxd
similarity index 90%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/expressions.pxd
rename to python/pylibcudf/pylibcudf/libcudf/expressions.pxd
index 427e16d4ff8..5ba2dff6074 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/expressions.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/expressions.pxd
@@ -3,15 +3,14 @@
 from libc.stdint cimport int32_t
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport (
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.scalar.scalar cimport (
     duration_scalar,
     numeric_scalar,
     timestamp_scalar,
 )
-from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.table.table_view cimport table_view
+from pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "cudf/ast/expressions.hpp" namespace "cudf::ast" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/expressions.pyx b/python/pylibcudf/pylibcudf/libcudf/expressions.pyx
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/expressions.pyx
rename to python/pylibcudf/pylibcudf/libcudf/expressions.pyx
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/filling.pxd b/python/pylibcudf/pylibcudf/libcudf/filling.pxd
similarity index 74%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/filling.pxd
rename to python/pylibcudf/pylibcudf/libcudf/filling.pxd
index 16ed682f930..7bed80050d2 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/filling.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/filling.pxd
@@ -2,16 +2,15 @@
 
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport (
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport (
     column_view,
     mutable_column_view,
 )
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
-from cudf._lib.pylibcudf.libcudf.table.table cimport table
-from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.scalar.scalar cimport scalar
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.libcudf.table.table_view cimport table_view
+from pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "cudf/filling.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/groupby.pxd b/python/pylibcudf/pylibcudf/libcudf/groupby.pxd
similarity index 83%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/groupby.pxd
rename to python/pylibcudf/pylibcudf/libcudf/groupby.pxd
index 16607cc3711..848462131fe 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/groupby.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/groupby.pxd
@@ -5,25 +5,24 @@ from libcpp.functional cimport reference_wrapper
 from libcpp.memory cimport unique_ptr
 from libcpp.pair cimport pair
 from libcpp.vector cimport vector
-
-from cudf._lib.pylibcudf.libcudf.aggregation cimport (
+from pylibcudf.libcudf.aggregation cimport (
     groupby_aggregation,
     groupby_scan_aggregation,
 )
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.replace cimport replace_policy
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
-from cudf._lib.pylibcudf.libcudf.table.table cimport table
-from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
-from cudf._lib.pylibcudf.libcudf.types cimport (
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.replace cimport replace_policy
+from pylibcudf.libcudf.scalar.scalar cimport scalar
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.libcudf.table.table_view cimport table_view
+from pylibcudf.libcudf.types cimport (
     null_order,
     null_policy,
     order,
     size_type,
     sorted,
 )
-from cudf._lib.pylibcudf.libcudf.utilities.host_span cimport host_span
+from pylibcudf.libcudf.utilities.host_span cimport host_span
 
 # workaround for https://github.com/cython/cython/issues/3885
 ctypedef const scalar constscalar
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/hash.pxd b/python/pylibcudf/pylibcudf/libcudf/hash.pxd
similarity index 86%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/hash.pxd
rename to python/pylibcudf/pylibcudf/libcudf/hash.pxd
index 5346252df69..51678ba69d8 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/hash.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/hash.pxd
@@ -3,10 +3,9 @@
 from libc.stdint cimport uint32_t, uint64_t
 from libcpp.memory cimport unique_ptr
 from libcpp.vector cimport vector
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.table.table cimport table
-from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.libcudf.table.table_view cimport table_view
 
 
 cdef extern from "cudf/hashing.hpp" namespace "cudf::hashing" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/interop.pxd b/python/pylibcudf/pylibcudf/libcudf/interop.pxd
similarity index 87%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/interop.pxd
rename to python/pylibcudf/pylibcudf/libcudf/interop.pxd
index 24d96b602dc..c7efff2340d 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/interop.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/interop.pxd
@@ -3,14 +3,11 @@
 from libcpp.memory cimport shared_ptr, unique_ptr
 from libcpp.string cimport string
 from libcpp.vector cimport vector
-
-from cudf._lib.types import cudf_to_np_types, np_to_cudf_types
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
-from cudf._lib.pylibcudf.libcudf.table.table cimport table
-from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.scalar.scalar cimport scalar
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.libcudf.table.table_view cimport table_view
 
 
 cdef extern from "dlpack/dlpack.h" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/CMakeLists.txt b/python/pylibcudf/pylibcudf/libcudf/io/CMakeLists.txt
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/io/CMakeLists.txt
rename to python/pylibcudf/pylibcudf/libcudf/io/CMakeLists.txt
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/__init__.pxd b/python/pylibcudf/pylibcudf/libcudf/io/__init__.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/io/__init__.pxd
rename to python/pylibcudf/pylibcudf/libcudf/io/__init__.pxd
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/__init__.py b/python/pylibcudf/pylibcudf/libcudf/io/__init__.py
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/io/__init__.py
rename to python/pylibcudf/pylibcudf/libcudf/io/__init__.py
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/arrow_io_source.pxd b/python/pylibcudf/pylibcudf/libcudf/io/arrow_io_source.pxd
similarity index 86%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/io/arrow_io_source.pxd
rename to python/pylibcudf/pylibcudf/libcudf/io/arrow_io_source.pxd
index 1d2138f8d10..54a913a9ce3 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/arrow_io_source.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/io/arrow_io_source.pxd
@@ -1,11 +1,10 @@
 # Copyright (c) 2023-2024, NVIDIA CORPORATION.
 
+cimport pylibcudf.libcudf.io.datasource as cudf_io_datasource
 from libcpp.memory cimport shared_ptr
 from libcpp.string cimport string
 from pyarrow.includes.libarrow cimport CRandomAccessFile
 
-cimport cudf._lib.pylibcudf.libcudf.io.datasource as cudf_io_datasource
-
 
 cdef extern from "cudf/io/arrow_io_source.hpp" \
         namespace "cudf::io" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/avro.pxd b/python/pylibcudf/pylibcudf/libcudf/io/avro.pxd
similarity index 91%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/io/avro.pxd
rename to python/pylibcudf/pylibcudf/libcudf/io/avro.pxd
index 530df5aa8f1..2d76e2f6c80 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/avro.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/io/avro.pxd
@@ -1,10 +1,9 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
+cimport pylibcudf.libcudf.io.types as cudf_io_types
 from libcpp.string cimport string
 from libcpp.vector cimport vector
-
-cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "cudf/io/avro.hpp" \
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/csv.pxd b/python/pylibcudf/pylibcudf/libcudf/io/csv.pxd
similarity index 98%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/io/csv.pxd
rename to python/pylibcudf/pylibcudf/libcudf/io/csv.pxd
index b5ff6558cd8..73a6d98650c 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/csv.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/io/csv.pxd
@@ -1,15 +1,14 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
+cimport pylibcudf.libcudf.io.types as cudf_io_types
+cimport pylibcudf.libcudf.table.table_view as cudf_table_view
 from libc.stdint cimport uint8_t
 from libcpp cimport bool
 from libcpp.map cimport map
 from libcpp.memory cimport shared_ptr, unique_ptr
 from libcpp.string cimport string
 from libcpp.vector cimport vector
-
-cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types
-cimport cudf._lib.pylibcudf.libcudf.table.table_view as cudf_table_view
-from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type
+from pylibcudf.libcudf.types cimport data_type, size_type
 
 
 cdef extern from "cudf/io/csv.hpp" \
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/data_sink.pxd b/python/pylibcudf/pylibcudf/libcudf/io/data_sink.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/io/data_sink.pxd
rename to python/pylibcudf/pylibcudf/libcudf/io/data_sink.pxd
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/datasource.pxd b/python/pylibcudf/pylibcudf/libcudf/io/datasource.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/io/datasource.pxd
rename to python/pylibcudf/pylibcudf/libcudf/io/datasource.pxd
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd b/python/pylibcudf/pylibcudf/libcudf/io/json.pxd
similarity index 96%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd
rename to python/pylibcudf/pylibcudf/libcudf/io/json.pxd
index 86621ae184f..7514e6c5258 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/io/json.pxd
@@ -1,15 +1,14 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
+cimport pylibcudf.libcudf.io.types as cudf_io_types
+cimport pylibcudf.libcudf.table.table_view as cudf_table_view
 from libc.stdint cimport int32_t, uint8_t
 from libcpp cimport bool
 from libcpp.map cimport map
 from libcpp.memory cimport shared_ptr, unique_ptr
 from libcpp.string cimport string
 from libcpp.vector cimport vector
-
-cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types
-cimport cudf._lib.pylibcudf.libcudf.table.table_view as cudf_table_view
-from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type
+from pylibcudf.libcudf.types cimport data_type, size_type
 
 
 cdef extern from "cudf/io/json.hpp" \
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pyx b/python/pylibcudf/pylibcudf/libcudf/io/json.pyx
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pyx
rename to python/pylibcudf/pylibcudf/libcudf/io/json.pyx
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/orc.pxd b/python/pylibcudf/pylibcudf/libcudf/io/orc.pxd
similarity index 97%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/io/orc.pxd
rename to python/pylibcudf/pylibcudf/libcudf/io/orc.pxd
index 25f91849dea..e4a09b8feb2 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/orc.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/io/orc.pxd
@@ -1,5 +1,7 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
+cimport pylibcudf.libcudf.io.types as cudf_io_types
+cimport pylibcudf.libcudf.table.table_view as cudf_table_view
 from libc.stdint cimport int64_t, uint8_t
 from libcpp cimport bool
 from libcpp.map cimport map
@@ -7,10 +9,7 @@ from libcpp.memory cimport shared_ptr, unique_ptr
 from libcpp.optional cimport optional
 from libcpp.string cimport string
 from libcpp.vector cimport vector
-
-cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types
-cimport cudf._lib.pylibcudf.libcudf.table.table_view as cudf_table_view
-from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type
+from pylibcudf.libcudf.types cimport data_type, size_type
 
 
 cdef extern from "cudf/io/orc.hpp" \
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/orc_metadata.pxd b/python/pylibcudf/pylibcudf/libcudf/io/orc_metadata.pxd
similarity index 94%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/io/orc_metadata.pxd
rename to python/pylibcudf/pylibcudf/libcudf/io/orc_metadata.pxd
index a23655b06f8..db6cb0cdfa5 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/orc_metadata.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/io/orc_metadata.pxd
@@ -1,13 +1,12 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
+cimport pylibcudf.libcudf.io.types as cudf_io_types
 from libc.stdint cimport int32_t, int64_t, uint32_t, uint64_t
 from libcpp cimport bool
 from libcpp.optional cimport optional
 from libcpp.string cimport string
 from libcpp.vector cimport vector
-
-cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types
-from cudf._lib.variant cimport monostate, variant
+from pylibcudf.variant cimport monostate, variant
 
 
 cdef extern from "cudf/io/orc_metadata.hpp" \
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd b/python/pylibcudf/pylibcudf/libcudf/io/parquet.pxd
similarity index 80%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd
rename to python/pylibcudf/pylibcudf/libcudf/io/parquet.pxd
index d86915c7da9..222d87defa0 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/io/parquet.pxd
@@ -8,17 +8,25 @@ from libcpp.memory cimport shared_ptr, unique_ptr
 from libcpp.optional cimport optional
 from libcpp.string cimport string
 from libcpp.vector cimport vector
-
-cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types
-cimport cudf._lib.pylibcudf.libcudf.table.table_view as cudf_table_view
-from cudf._lib.pylibcudf.libcudf.expressions cimport expression
-from cudf._lib.pylibcudf.libcudf.types cimport data_type, size_type
+from pylibcudf.libcudf.expressions cimport expression
+from pylibcudf.libcudf.io.types cimport (
+    compression_type,
+    dictionary_policy,
+    partition_info,
+    sink_info,
+    source_info,
+    statistics_freq,
+    table_input_metadata,
+    table_with_metadata,
+)
+from pylibcudf.libcudf.table.table_view cimport table_view
+from pylibcudf.libcudf.types cimport data_type, size_type
 
 
 cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
     cdef cppclass parquet_reader_options:
         parquet_reader_options() except +
-        cudf_io_types.source_info get_source_info() except +
+        source_info get_source_info() except +
         vector[vector[size_type]] get_row_groups() except +
         const optional[reference_wrapper[expression]]& get_filter() except +
         data_type get_timestamp_type() except +
@@ -38,13 +46,13 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
 
         @staticmethod
         parquet_reader_options_builder builder(
-            cudf_io_types.source_info src
+            source_info src
         ) except +
 
     cdef cppclass parquet_reader_options_builder:
         parquet_reader_options_builder() except +
         parquet_reader_options_builder(
-            cudf_io_types.source_info src
+            source_info src
         ) except +
         parquet_reader_options_builder& columns(
             vector[string] col_names
@@ -69,15 +77,15 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
         ) except +
         parquet_reader_options build() except +
 
-    cdef cudf_io_types.table_with_metadata read_parquet(
+    cdef table_with_metadata read_parquet(
         parquet_reader_options args) except +
 
     cdef cppclass parquet_writer_options_base:
         parquet_writer_options_base() except +
-        cudf_io_types.sink_info get_sink_info() except +
-        cudf_io_types.compression_type get_compression() except +
-        cudf_io_types.statistics_freq get_stats_level() except +
-        const optional[cudf_io_types.table_input_metadata]& get_metadata(
+        sink_info get_sink_info() except +
+        compression_type get_compression() except +
+        statistics_freq get_stats_level() except +
+        const optional[table_input_metadata]& get_metadata(
         ) except +
         size_t get_row_group_size_bytes() except +
         size_type get_row_group_size_rows() except +
@@ -87,16 +95,16 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
         bool is_enabled_write_arrow_schema() except +
 
         void set_metadata(
-            cudf_io_types.table_input_metadata m
+            table_input_metadata m
         ) except +
         void set_key_value_metadata(
             vector[map[string, string]] kvm
         ) except +
         void set_stats_level(
-            cudf_io_types.statistics_freq sf
+            statistics_freq sf
         ) except +
         void set_compression(
-            cudf_io_types.compression_type compression
+            compression_type compression
         ) except +
         void set_int96_timestamps(
             bool enabled
@@ -111,14 +119,14 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
         void set_max_dictionary_size(size_t val) except +
         void enable_write_v2_headers(bool val) except +
         void enable_write_arrow_schema(bool val) except +
-        void set_dictionary_policy(cudf_io_types.dictionary_policy policy) except +
+        void set_dictionary_policy(dictionary_policy policy) except +
 
     cdef cppclass parquet_writer_options(parquet_writer_options_base):
         parquet_writer_options() except +
-        cudf_table_view.table_view get_table() except +
+        table_view get_table() except +
         string get_column_chunks_file_paths() except +
         void set_partitions(
-            vector[cudf_io_types.partition_info] partitions
+            vector[partition_info] partitions
         ) except +
         void set_column_chunks_file_paths(
             vector[string] column_chunks_file_paths
@@ -126,24 +134,24 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
 
         @staticmethod
         parquet_writer_options_builder builder(
-            cudf_io_types.sink_info sink_,
-            cudf_table_view.table_view table_
+            sink_info sink_,
+            table_view table_
         ) except +
 
     cdef cppclass parquet_writer_options_builder_base[BuilderT, OptionsT]:
         parquet_writer_options_builder_base() except +
 
         BuilderT& metadata(
-            cudf_io_types.table_input_metadata m
+            table_input_metadata m
         ) except +
         BuilderT& key_value_metadata(
             vector[map[string, string]] kvm
         ) except +
         BuilderT& stats_level(
-            cudf_io_types.statistics_freq sf
+            statistics_freq sf
         ) except +
         BuilderT& compression(
-            cudf_io_types.compression_type compression
+            compression_type compression
         ) except +
         BuilderT& int96_timestamps(
             bool enabled
@@ -173,7 +181,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
             bool val
         ) except +
         BuilderT& dictionary_policy(
-            cudf_io_types.dictionary_policy val
+            dictionary_policy val
         ) except +
         OptionsT build() except +
 
@@ -182,11 +190,11 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
                                                 parquet_writer_options]):
         parquet_writer_options_builder() except +
         parquet_writer_options_builder(
-            cudf_io_types.sink_info sink_,
-            cudf_table_view.table_view table_
+            sink_info sink_,
+            table_view table_
         ) except +
         parquet_writer_options_builder& partitions(
-            vector[cudf_io_types.partition_info] partitions
+            vector[partition_info] partitions
         ) except +
         parquet_writer_options_builder& column_chunks_file_paths(
             vector[string] column_chunks_file_paths
@@ -201,7 +209,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
 
         @staticmethod
         chunked_parquet_writer_options_builder builder(
-            cudf_io_types.sink_info sink_,
+            sink_info sink_,
         ) except +
 
     cdef cppclass chunked_parquet_writer_options_builder(
@@ -210,18 +218,18 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
             ):
         chunked_parquet_writer_options_builder() except +
         chunked_parquet_writer_options_builder(
-            cudf_io_types.sink_info sink_,
+            sink_info sink_,
         ) except +
 
     cdef cppclass parquet_chunked_writer:
         parquet_chunked_writer() except +
         parquet_chunked_writer(chunked_parquet_writer_options args) except +
         parquet_chunked_writer& write(
-            cudf_table_view.table_view table_,
+            table_view table_,
         ) except +
         parquet_chunked_writer& write(
-            const cudf_table_view.table_view& table_,
-            const vector[cudf_io_types.partition_info]& partitions,
+            const table_view& table_,
+            const vector[partition_info]& partitions,
         ) except +
         unique_ptr[vector[uint8_t]] close(
             vector[string] column_chunks_file_paths,
@@ -237,7 +245,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
             size_t pass_read_limit,
             const parquet_reader_options& options) except +
         bool has_next() except +
-        cudf_io_types.table_with_metadata read_chunk() except +
+        table_with_metadata read_chunk() except +
 
     cdef unique_ptr[vector[uint8_t]] merge_row_group_metadata(
         const vector[unique_ptr[vector[uint8_t]]]& metadata_list
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet_metadata.pxd b/python/pylibcudf/pylibcudf/libcudf/io/parquet_metadata.pxd
similarity index 89%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet_metadata.pxd
rename to python/pylibcudf/pylibcudf/libcudf/io/parquet_metadata.pxd
index 34a299b73ab..8e6da56c9a6 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/parquet_metadata.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/io/parquet_metadata.pxd
@@ -1,12 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
+cimport pylibcudf.libcudf.io.types as cudf_io_types
 from libc.stdint cimport int64_t
 from libcpp.string cimport string
 from libcpp.unordered_map cimport unordered_map
 from libcpp.vector cimport vector
-
-cimport cudf._lib.pylibcudf.libcudf.io.types as cudf_io_types
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "cudf/io/parquet_metadata.hpp" namespace "cudf::io" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/text.pxd b/python/pylibcudf/pylibcudf/libcudf/io/text.pxd
similarity index 96%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/io/text.pxd
rename to python/pylibcudf/pylibcudf/libcudf/io/text.pxd
index bec223d4079..14397ef970d 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/text.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/io/text.pxd
@@ -4,8 +4,7 @@ from libc.stdint cimport uint64_t
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column cimport column
 
 
 cdef extern from "cudf/io/text/byte_range_info.hpp" \
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/timezone.pxd b/python/pylibcudf/pylibcudf/libcudf/io/timezone.pxd
similarity index 86%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/io/timezone.pxd
rename to python/pylibcudf/pylibcudf/libcudf/io/timezone.pxd
index 88cb5544dc1..676901efcec 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/timezone.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/io/timezone.pxd
@@ -4,8 +4,7 @@ from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.optional cimport optional
 from libcpp.string cimport string
-
-from cudf._lib.pylibcudf.libcudf.table.table cimport table
+from pylibcudf.libcudf.table.table cimport table
 
 
 cdef extern from "cudf/timezone.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd b/python/pylibcudf/pylibcudf/libcudf/io/types.pxd
similarity index 92%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd
rename to python/pylibcudf/pylibcudf/libcudf/io/types.pxd
index 0a6bddcd907..a3d99807876 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/io/types.pxd
@@ -1,5 +1,8 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
+cimport pylibcudf.libcudf.io.data_sink as cudf_io_data_sink
+cimport pylibcudf.libcudf.io.datasource as cudf_io_datasource
+cimport pylibcudf.libcudf.table.table_view as cudf_table_view
 from libc.stdint cimport int32_t, uint8_t
 from libcpp cimport bool
 from libcpp.map cimport map
@@ -9,12 +12,8 @@ from libcpp.string cimport string
 from libcpp.unordered_map cimport unordered_map
 from libcpp.vector cimport vector
 from pyarrow.includes.libarrow cimport CRandomAccessFile
-
-cimport cudf._lib.pylibcudf.libcudf.io.data_sink as cudf_io_data_sink
-cimport cudf._lib.pylibcudf.libcudf.io.datasource as cudf_io_datasource
-cimport cudf._lib.pylibcudf.libcudf.table.table_view as cudf_table_view
-from cudf._lib.pylibcudf.libcudf.table.table cimport table
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "cudf/io/types.hpp" \
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pyx b/python/pylibcudf/pylibcudf/libcudf/io/types.pyx
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pyx
rename to python/pylibcudf/pylibcudf/libcudf/io/types.pyx
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/join.pxd b/python/pylibcudf/pylibcudf/libcudf/join.pxd
similarity index 88%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/join.pxd
rename to python/pylibcudf/pylibcudf/libcudf/join.pxd
index 32cd17f7c11..6f6c145b23c 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/join.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/join.pxd
@@ -4,14 +4,13 @@ from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.pair cimport pair
 from libcpp.vector cimport vector
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.libcudf.table.table_view cimport table_view
+from pylibcudf.libcudf.types cimport null_equality, size_type
 
 from rmm._lib.device_uvector cimport device_uvector
 
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.table.table cimport table
-from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
-from cudf._lib.pylibcudf.libcudf.types cimport null_equality, size_type
-
 ctypedef unique_ptr[device_uvector[size_type]] gather_map_type
 ctypedef pair[gather_map_type, gather_map_type] gather_map_pair_type
 
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/labeling.pxd b/python/pylibcudf/pylibcudf/libcudf/labeling.pxd
similarity index 78%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/labeling.pxd
rename to python/pylibcudf/pylibcudf/libcudf/labeling.pxd
index 54731bf29af..ec6ef6b2a41 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/labeling.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/labeling.pxd
@@ -1,9 +1,8 @@
 # Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
 
 
 cdef extern from "cudf/labeling/label_bins.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/__init__.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/__init__.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/lists/__init__.pxd
rename to python/pylibcudf/pylibcudf/libcudf/lists/__init__.pxd
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/__init__.py b/python/pylibcudf/pylibcudf/libcudf/lists/__init__.py
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/lists/__init__.py
rename to python/pylibcudf/pylibcudf/libcudf/lists/__init__.py
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/combine.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/combine.pxd
similarity index 78%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/lists/combine.pxd
rename to python/pylibcudf/pylibcudf/libcudf/lists/combine.pxd
index 728bd840f71..d077958ce03 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/combine.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/lists/combine.pxd
@@ -1,10 +1,9 @@
 # Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.table.table_view cimport table_view
 
 
 cdef extern from "cudf/lists/combine.hpp" namespace \
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/contains.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/contains.pxd
similarity index 75%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/lists/contains.pxd
rename to python/pylibcudf/pylibcudf/libcudf/lists/contains.pxd
index 40bb2e78970..81a5ad46389 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/contains.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/lists/contains.pxd
@@ -2,14 +2,11 @@
 
 from libc.stdint cimport int32_t
 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.exception_handler cimport libcudf_exception_handler
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport (
-    lists_column_view,
-)
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
+from pylibcudf.exception_handler cimport libcudf_exception_handler
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view
+from pylibcudf.libcudf.scalar.scalar cimport scalar
 
 
 cdef extern from "cudf/lists/contains.hpp" namespace "cudf::lists" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/count_elements.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/count_elements.pxd
similarity index 61%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/lists/count_elements.pxd
rename to python/pylibcudf/pylibcudf/libcudf/lists/count_elements.pxd
index ba57a839fbc..e283551ed0c 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/count_elements.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/lists/count_elements.pxd
@@ -1,11 +1,8 @@
 # Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport (
-    lists_column_view,
-)
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view
 
 
 cdef extern from "cudf/lists/count_elements.hpp" namespace "cudf::lists" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/explode.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/explode.pxd
similarity index 59%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/lists/explode.pxd
rename to python/pylibcudf/pylibcudf/libcudf/lists/explode.pxd
index 622a866f593..c64b2715cca 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/explode.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/lists/explode.pxd
@@ -1,10 +1,9 @@
 # Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.table.table cimport table
-from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.libcudf.table.table_view cimport table_view
+from pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "cudf/lists/explode.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/extract.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/extract.pxd
similarity index 64%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/lists/extract.pxd
rename to python/pylibcudf/pylibcudf/libcudf/lists/extract.pxd
index 53609ba8830..2ea060d87de 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/extract.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/lists/extract.pxd
@@ -1,12 +1,9 @@
 # Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column, column_view
-from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport (
-    lists_column_view,
-)
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.column.column cimport column, column_view
+from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view
+from pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "cudf/lists/extract.hpp" namespace "cudf::lists" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/filling.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/filling.pxd
similarity index 76%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/lists/filling.pxd
rename to python/pylibcudf/pylibcudf/libcudf/lists/filling.pxd
index 8403fd179f7..54f5a8409b6 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/filling.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/lists/filling.pxd
@@ -1,9 +1,8 @@
 # Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
 
 
 cdef extern from "cudf/lists/filling.hpp" namespace "cudf::lists" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/gather.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/gather.pxd
similarity index 67%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/lists/gather.pxd
rename to python/pylibcudf/pylibcudf/libcudf/lists/gather.pxd
index ab7ed141365..a762c6aa333 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/gather.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/lists/gather.pxd
@@ -1,11 +1,8 @@
 # Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport (
-    lists_column_view,
-)
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view
 
 
 cdef extern from "cudf/lists/gather.hpp" namespace "cudf::lists" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/lists_column_view.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/lists_column_view.pxd
similarity index 86%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/lists/lists_column_view.pxd
rename to python/pylibcudf/pylibcudf/libcudf/lists/lists_column_view.pxd
index 8917a6ac899..f43340a78b0 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/lists_column_view.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/lists/lists_column_view.pxd
@@ -1,10 +1,10 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport (
+from pylibcudf.libcudf.column.column_view cimport (
     column_view,
     mutable_column_view,
 )
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "cudf/lists/lists_column_view.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/reverse.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/reverse.pxd
similarity index 62%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/lists/reverse.pxd
rename to python/pylibcudf/pylibcudf/libcudf/lists/reverse.pxd
index 0382a5d42c3..43b671ebfa0 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/reverse.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/lists/reverse.pxd
@@ -1,11 +1,8 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport (
-    lists_column_view,
-)
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view
 
 
 cdef extern from "cudf/lists/reverse.hpp" namespace "cudf::lists" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/set_operations.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/set_operations.pxd
similarity index 81%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/lists/set_operations.pxd
rename to python/pylibcudf/pylibcudf/libcudf/lists/set_operations.pxd
index eb796897f87..266f04ef6b3 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/set_operations.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/lists/set_operations.pxd
@@ -1,12 +1,9 @@
 # Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport (
-    lists_column_view,
-)
-from cudf._lib.pylibcudf.libcudf.types cimport nan_equality, null_equality
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view
+from pylibcudf.libcudf.types cimport nan_equality, null_equality
 
 
 cdef extern from "cudf/lists/set_operations.hpp" namespace "cudf::lists" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/sorting.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/sorting.pxd
similarity index 69%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/lists/sorting.pxd
rename to python/pylibcudf/pylibcudf/libcudf/lists/sorting.pxd
index 337ac73908b..ea45f999c47 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/sorting.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/lists/sorting.pxd
@@ -1,12 +1,9 @@
 # Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport (
-    lists_column_view,
-)
-from cudf._lib.pylibcudf.libcudf.types cimport null_order, order
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view
+from pylibcudf.libcudf.types cimport null_order, order
 
 
 cdef extern from "cudf/lists/sorting.hpp" namespace "cudf::lists" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/stream_compaction.pxd b/python/pylibcudf/pylibcudf/libcudf/lists/stream_compaction.pxd
similarity index 68%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/lists/stream_compaction.pxd
rename to python/pylibcudf/pylibcudf/libcudf/lists/stream_compaction.pxd
index b1fcf7800b0..d9df7c3ca2e 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/lists/stream_compaction.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/lists/stream_compaction.pxd
@@ -1,12 +1,9 @@
 # Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.lists.lists_column_view cimport (
-    lists_column_view,
-)
-from cudf._lib.pylibcudf.libcudf.types cimport nan_equality, null_equality
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view
+from pylibcudf.libcudf.types cimport nan_equality, null_equality
 
 
 cdef extern from "cudf/lists/stream_compaction.hpp" \
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/merge.pxd b/python/pylibcudf/pylibcudf/libcudf/merge.pxd
similarity index 69%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/merge.pxd
rename to python/pylibcudf/pylibcudf/libcudf/merge.pxd
index dacb3dc2d74..6930b7a0d06 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/merge.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/merge.pxd
@@ -1,11 +1,10 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
+cimport pylibcudf.libcudf.types as libcudf_types
 from libcpp.memory cimport unique_ptr
 from libcpp.vector cimport vector
-
-cimport cudf._lib.pylibcudf.libcudf.types as libcudf_types
-from cudf._lib.pylibcudf.libcudf.table.table cimport table
-from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.libcudf.table.table_view cimport table_view
 
 
 cdef extern from "cudf/merge.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/null_mask.pxd b/python/pylibcudf/pylibcudf/libcudf/null_mask.pxd
similarity index 80%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/null_mask.pxd
rename to python/pylibcudf/pylibcudf/libcudf/null_mask.pxd
index 0cab404c05f..3fc2c7e8f1e 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/null_mask.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/null_mask.pxd
@@ -2,17 +2,12 @@
 
 from libc.stdint cimport int32_t
 from libcpp.pair cimport pair
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.table.table_view cimport table_view
+from pylibcudf.libcudf.types cimport bitmask_type, mask_state, size_type
 
 from rmm._lib.device_buffer cimport device_buffer
 
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
-from cudf._lib.pylibcudf.libcudf.types cimport (
-    bitmask_type,
-    mask_state,
-    size_type,
-)
-
 ctypedef int32_t underlying_type_t_mask_state
 
 
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/__init__.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/__init__.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/__init__.pxd
rename to python/pylibcudf/pylibcudf/libcudf/nvtext/__init__.pxd
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/__init__.py b/python/pylibcudf/pylibcudf/libcudf/nvtext/__init__.py
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/__init__.py
rename to python/pylibcudf/pylibcudf/libcudf/nvtext/__init__.py
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/byte_pair_encode.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/byte_pair_encode.pxd
similarity index 73%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/byte_pair_encode.pxd
rename to python/pylibcudf/pylibcudf/libcudf/nvtext/byte_pair_encode.pxd
index 033a820d2ef..fd768d22704 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/byte_pair_encode.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/byte_pair_encode.pxd
@@ -2,10 +2,9 @@
 
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
 
 
 cdef extern from "nvtext/byte_pair_encoding.hpp" namespace "nvtext" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/edit_distance.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/edit_distance.pxd
similarity index 75%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/edit_distance.pxd
rename to python/pylibcudf/pylibcudf/libcudf/nvtext/edit_distance.pxd
index ca1f6650a5a..d459372fb8f 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/edit_distance.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/edit_distance.pxd
@@ -2,9 +2,8 @@
 
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
 
 
 cdef extern from "nvtext/edit_distance.hpp" namespace "nvtext" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/generate_ngrams.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/generate_ngrams.pxd
similarity index 69%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/generate_ngrams.pxd
rename to python/pylibcudf/pylibcudf/libcudf/nvtext/generate_ngrams.pxd
index 2034b1c1ee5..eefae746662 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/generate_ngrams.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/generate_ngrams.pxd
@@ -1,11 +1,10 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "nvtext/generate_ngrams.hpp" namespace "nvtext" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/jaccard.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/jaccard.pxd
similarity index 61%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/jaccard.pxd
rename to python/pylibcudf/pylibcudf/libcudf/nvtext/jaccard.pxd
index 789a1a2c35a..16c5f7f575e 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/jaccard.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/jaccard.pxd
@@ -1,10 +1,9 @@
 # Copyright (c) 2023-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "nvtext/jaccard.hpp" namespace "nvtext" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd
similarity index 70%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/minhash.pxd
rename to python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd
index fc5577bf3f9..0c352a5068b 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/minhash.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd
@@ -1,10 +1,9 @@
 # Copyright (c) 2023-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/ngrams_tokenize.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/ngrams_tokenize.pxd
similarity index 58%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/ngrams_tokenize.pxd
rename to python/pylibcudf/pylibcudf/libcudf/nvtext/ngrams_tokenize.pxd
index 229f4d8f5a3..89f6e5edfc4 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/ngrams_tokenize.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/ngrams_tokenize.pxd
@@ -1,11 +1,10 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "nvtext/ngrams_tokenize.hpp" namespace "nvtext" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/normalize.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/normalize.pxd
similarity index 75%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/normalize.pxd
rename to python/pylibcudf/pylibcudf/libcudf/nvtext/normalize.pxd
index 65c63b089df..cbf121920e1 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/normalize.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/normalize.pxd
@@ -2,9 +2,8 @@
 
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
 
 
 cdef extern from "nvtext/normalize.hpp" namespace "nvtext" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/replace.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/replace.pxd
similarity index 69%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/replace.pxd
rename to python/pylibcudf/pylibcudf/libcudf/nvtext/replace.pxd
index aaad28d2684..6bcfa1d9380 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/replace.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/replace.pxd
@@ -1,11 +1,10 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "nvtext/replace.hpp" namespace "nvtext" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/stemmer.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/stemmer.pxd
similarity index 79%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/stemmer.pxd
rename to python/pylibcudf/pylibcudf/libcudf/nvtext/stemmer.pxd
index 040d4c9de63..673bffa28ae 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/stemmer.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/stemmer.pxd
@@ -2,10 +2,9 @@
 
 from libc.stdint cimport int32_t
 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "nvtext/stemmer.hpp" namespace "nvtext" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/subword_tokenize.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/subword_tokenize.pxd
similarity index 92%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/subword_tokenize.pxd
rename to python/pylibcudf/pylibcudf/libcudf/nvtext/subword_tokenize.pxd
index cce40bcd3f6..aabac0a617b 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/subword_tokenize.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/subword_tokenize.pxd
@@ -4,9 +4,8 @@ from libc.stdint cimport uint16_t, uint32_t
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
 
 
 cdef extern from "nvtext/subword_tokenize.hpp" namespace "nvtext" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/tokenize.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/tokenize.pxd
similarity index 84%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/tokenize.pxd
rename to python/pylibcudf/pylibcudf/libcudf/nvtext/tokenize.pxd
index 721a6cabd01..34c054cf36f 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/nvtext/tokenize.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/tokenize.pxd
@@ -1,11 +1,10 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "nvtext/tokenize.hpp" namespace "nvtext" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/partitioning.pxd b/python/pylibcudf/pylibcudf/libcudf/partitioning.pxd
similarity index 69%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/partitioning.pxd
rename to python/pylibcudf/pylibcudf/libcudf/partitioning.pxd
index babb167d2a0..1ea10e8a194 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/partitioning.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/partitioning.pxd
@@ -1,15 +1,14 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
+cimport pylibcudf.libcudf.types as libcudf_types
 from libc.stdint cimport uint32_t
 from libcpp.memory cimport unique_ptr
 from libcpp.pair cimport pair
 from libcpp.vector cimport vector
-
-cimport cudf._lib.pylibcudf.libcudf.types as libcudf_types
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.table.table cimport table
-from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.libcudf.table.table_view cimport table_view
 
 
 cdef extern from "cudf/partitioning.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/quantiles.pxd b/python/pylibcudf/pylibcudf/libcudf/quantiles.pxd
similarity index 70%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/quantiles.pxd
rename to python/pylibcudf/pylibcudf/libcudf/quantiles.pxd
index 32cfec2d4fc..cf2350fc36c 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/quantiles.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/quantiles.pxd
@@ -3,12 +3,11 @@
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.vector cimport vector
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.table.table cimport table
-from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
-from cudf._lib.pylibcudf.libcudf.types cimport (
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.libcudf.table.table_view cimport table_view
+from pylibcudf.libcudf.types cimport (
     interpolation,
     null_order,
     order,
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/reduce.pxd b/python/pylibcudf/pylibcudf/libcudf/reduce.pxd
similarity index 69%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/reduce.pxd
rename to python/pylibcudf/pylibcudf/libcudf/reduce.pxd
index 3ae1f1a2906..6d2f4bd23d1 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/reduce.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/reduce.pxd
@@ -3,15 +3,11 @@
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport pair
-
-from cudf._lib.pylibcudf.libcudf.aggregation cimport (
-    reduce_aggregation,
-    scan_aggregation,
-)
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
-from cudf._lib.pylibcudf.libcudf.types cimport data_type
+from pylibcudf.libcudf.aggregation cimport reduce_aggregation, scan_aggregation
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.scalar.scalar cimport scalar
+from pylibcudf.libcudf.types cimport data_type
 
 
 cdef extern from "cudf/reduction.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/reduce.pyx b/python/pylibcudf/pylibcudf/libcudf/reduce.pyx
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/reduce.pyx
rename to python/pylibcudf/pylibcudf/libcudf/reduce.pyx
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/replace.pxd b/python/pylibcudf/pylibcudf/libcudf/replace.pxd
similarity index 83%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/replace.pxd
rename to python/pylibcudf/pylibcudf/libcudf/replace.pxd
index e67efbdaba0..4ac44fc946e 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/replace.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/replace.pxd
@@ -2,15 +2,12 @@
 
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.types import cudf_to_np_types, np_to_cudf_types
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport (
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport (
     column_view,
     mutable_column_view,
 )
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
+from pylibcudf.libcudf.scalar.scalar cimport scalar
 
 
 cdef extern from "cudf/replace.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/replace.pyx b/python/pylibcudf/pylibcudf/libcudf/replace.pyx
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/replace.pyx
rename to python/pylibcudf/pylibcudf/libcudf/replace.pyx
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/reshape.pxd b/python/pylibcudf/pylibcudf/libcudf/reshape.pxd
similarity index 57%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/reshape.pxd
rename to python/pylibcudf/pylibcudf/libcudf/reshape.pxd
index dfd9a71c3d3..446a082ab1b 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/reshape.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/reshape.pxd
@@ -1,11 +1,10 @@
 # Copyright (c) 2019-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.table.table cimport table
-from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.libcudf.table.table_view cimport table_view
+from pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "cudf/reshape.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/rolling.pxd b/python/pylibcudf/pylibcudf/libcudf/rolling.pxd
similarity index 64%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/rolling.pxd
rename to python/pylibcudf/pylibcudf/libcudf/rolling.pxd
index d7844f99a73..9e76faa0eba 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/rolling.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/rolling.pxd
@@ -1,13 +1,10 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.types import cudf_to_np_types, np_to_cudf_types
-
-from cudf._lib.pylibcudf.libcudf.aggregation cimport rolling_aggregation
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.aggregation cimport rolling_aggregation
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "cudf/rolling.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/round.pxd b/python/pylibcudf/pylibcudf/libcudf/round.pxd
similarity index 75%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/round.pxd
rename to python/pylibcudf/pylibcudf/libcudf/round.pxd
index 027c4634c9f..1b65133f275 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/round.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/round.pxd
@@ -2,9 +2,8 @@
 
 from libc.stdint cimport int32_t
 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
 
 
 cdef extern from "cudf/round.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/round.pyx b/python/pylibcudf/pylibcudf/libcudf/round.pyx
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/round.pyx
rename to python/pylibcudf/pylibcudf/libcudf/round.pyx
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/__init__.pxd b/python/pylibcudf/pylibcudf/libcudf/scalar/__init__.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/__init__.pxd
rename to python/pylibcudf/pylibcudf/libcudf/scalar/__init__.pxd
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/__init__.py b/python/pylibcudf/pylibcudf/libcudf/scalar/__init__.py
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/__init__.py
rename to python/pylibcudf/pylibcudf/libcudf/scalar/__init__.py
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/scalar.pxd b/python/pylibcudf/pylibcudf/libcudf/scalar/scalar.pxd
similarity index 91%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/scalar.pxd
rename to python/pylibcudf/pylibcudf/libcudf/scalar/scalar.pxd
index 662eb90096e..4b40a8a26f6 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/scalar.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/scalar/scalar.pxd
@@ -3,11 +3,10 @@
 from libc.stdint cimport int32_t, int64_t
 from libcpp cimport bool
 from libcpp.string cimport string
-
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
-from cudf._lib.pylibcudf.libcudf.types cimport data_type
-from cudf._lib.pylibcudf.libcudf.wrappers.decimals cimport scale_type
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.table.table_view cimport table_view
+from pylibcudf.libcudf.types cimport data_type
+from pylibcudf.libcudf.wrappers.decimals cimport scale_type
 
 
 cdef extern from "cudf/scalar/scalar.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/scalar_factories.pxd b/python/pylibcudf/pylibcudf/libcudf/scalar/scalar_factories.pxd
similarity index 76%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/scalar_factories.pxd
rename to python/pylibcudf/pylibcudf/libcudf/scalar/scalar_factories.pxd
index 8092c3d637d..ee4b47935b2 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/scalar/scalar_factories.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/scalar/scalar_factories.pxd
@@ -2,9 +2,8 @@
 
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
-
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.scalar.scalar cimport scalar
 
 
 cdef extern from "cudf/scalar/scalar_factories.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/search.pxd b/python/pylibcudf/pylibcudf/libcudf/search.pxd
similarity index 73%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/search.pxd
rename to python/pylibcudf/pylibcudf/libcudf/search.pxd
index e2247a1366f..5a6ad5384c9 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/search.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/search.pxd
@@ -1,12 +1,11 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
+cimport pylibcudf.libcudf.types as libcudf_types
 from libcpp.memory cimport unique_ptr
 from libcpp.vector cimport vector
-
-cimport cudf._lib.pylibcudf.libcudf.types as libcudf_types
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.table.table_view cimport table_view
 
 
 cdef extern from "cudf/search.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/sorting.pxd b/python/pylibcudf/pylibcudf/libcudf/sorting.pxd
similarity index 84%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/sorting.pxd
rename to python/pylibcudf/pylibcudf/libcudf/sorting.pxd
index 3d7d3aa2790..9e899855486 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/sorting.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/sorting.pxd
@@ -1,17 +1,14 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
+cimport pylibcudf.libcudf.types as libcudf_types
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.vector cimport vector
-
-from cudf._lib.types import cudf_to_np_types, np_to_cudf_types
-
-cimport cudf._lib.pylibcudf.libcudf.types as libcudf_types
-from cudf._lib.pylibcudf.libcudf.aggregation cimport rank_method
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.table.table cimport table
-from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
+from pylibcudf.libcudf.aggregation cimport rank_method
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.libcudf.table.table_view cimport table_view
 
 
 cdef extern from "cudf/sorting.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/stream_compaction.pxd b/python/pylibcudf/pylibcudf/libcudf/stream_compaction.pxd
similarity index 85%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/stream_compaction.pxd
rename to python/pylibcudf/pylibcudf/libcudf/stream_compaction.pxd
index 11d803e5b76..7830c9478c2 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/stream_compaction.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/stream_compaction.pxd
@@ -3,14 +3,11 @@
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.vector cimport vector
-
-from cudf._lib.types import cudf_to_np_types, np_to_cudf_types
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.table.table cimport table
-from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
-from cudf._lib.pylibcudf.libcudf.types cimport (
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.libcudf.table.table_view cimport table_view
+from pylibcudf.libcudf.types cimport (
     nan_equality,
     nan_policy,
     null_equality,
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/stream_compaction.pyx b/python/pylibcudf/pylibcudf/libcudf/stream_compaction.pyx
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/stream_compaction.pyx
rename to python/pylibcudf/pylibcudf/libcudf/stream_compaction.pyx
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/CMakeLists.txt
rename to python/pylibcudf/pylibcudf/libcudf/strings/CMakeLists.txt
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/__init__.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/__init__.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/__init__.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/__init__.pxd
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/libcudf/strings/__init__.py
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/__init__.py
rename to python/pylibcudf/pylibcudf/libcudf/strings/__init__.py
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/attributes.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/attributes.pxd
similarity index 76%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/attributes.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/attributes.pxd
index c4d52c83663..5e510339834 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/attributes.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/attributes.pxd
@@ -1,9 +1,8 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
 
 
 cdef extern from "cudf/strings/attributes.hpp" namespace "cudf::strings" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/capitalize.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/capitalize.pxd
similarity index 63%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/capitalize.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/capitalize.pxd
index b0771e16680..77e3f46d7ee 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/capitalize.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/capitalize.pxd
@@ -1,12 +1,9 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from cudf._lib.pylibcudf.libcudf.strings.char_types cimport (
-    string_character_types,
-)
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.strings.char_types cimport string_character_types
 
 
 cdef extern from "cudf/strings/capitalize.hpp" namespace "cudf::strings" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/case.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/case.pxd
similarity index 81%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/case.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/case.pxd
index 82c146b0023..7869e90f387 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/case.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/case.pxd
@@ -1,8 +1,7 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
 
 
 cdef extern from "cudf/strings/case.hpp" namespace "cudf::strings" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/char_types.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/char_types.pxd
similarity index 82%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/char_types.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/char_types.pxd
index f63e1a93f91..5d54c1c3593 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/char_types.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/char_types.pxd
@@ -2,10 +2,9 @@
 
 from libc.stdint cimport uint32_t
 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
 
 
 cdef extern from "cudf/strings/char_types/char_types.hpp" \
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/char_types.pyx b/python/pylibcudf/pylibcudf/libcudf/strings/char_types.pyx
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/char_types.pyx
rename to python/pylibcudf/pylibcudf/libcudf/strings/char_types.pyx
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/combine.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/combine.pxd
similarity index 83%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/combine.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/combine.pxd
index b05e46af0d6..e4c9fa5817a 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/combine.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/combine.pxd
@@ -1,11 +1,10 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.table.table_view cimport table_view
 
 
 cdef extern from "cudf/strings/combine.hpp" namespace "cudf::strings" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/contains.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/contains.pxd
similarity index 69%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/contains.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/contains.pxd
index f8ed253ff3c..c2fb5f0dce4 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/contains.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/contains.pxd
@@ -1,11 +1,10 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.strings.regex_program cimport regex_program
 
 
 cdef extern from "cudf/strings/contains.hpp" namespace "cudf::strings" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/__init__.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/__init__.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/__init__.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/convert/__init__.pxd
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/__init__.py b/python/pylibcudf/pylibcudf/libcudf/strings/convert/__init__.py
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/__init__.py
rename to python/pylibcudf/pylibcudf/libcudf/strings/convert/__init__.py
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_booleans.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_booleans.pxd
similarity index 69%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_booleans.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_booleans.pxd
index daac2b5be28..83a9573baad 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_booleans.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_booleans.pxd
@@ -1,9 +1,8 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
 
 
 cdef extern from "cudf/strings/convert/convert_booleans.hpp" namespace \
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_datetime.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_datetime.pxd
similarity index 76%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_datetime.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_datetime.pxd
index 263cee4fe1e..fa8975c4df9 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_datetime.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_datetime.pxd
@@ -2,10 +2,9 @@
 
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.types cimport data_type
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.types cimport data_type
 
 
 cdef extern from "cudf/strings/convert/convert_datetime.hpp" namespace \
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_durations.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_durations.pxd
similarity index 72%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_durations.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_durations.pxd
index af357b9bde4..ebe10574353 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_durations.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_durations.pxd
@@ -2,10 +2,9 @@
 
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.types cimport data_type
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.types cimport data_type
 
 
 cdef extern from "cudf/strings/convert/convert_durations.hpp" namespace \
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd
similarity index 73%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd
index 91c1abdb5e4..6f820f3c9a4 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_fixed_point.pxd
@@ -1,10 +1,9 @@
 # Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.types cimport data_type
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.types cimport data_type
 
 
 cdef extern from "cudf/strings/convert/convert_fixed_point.hpp" namespace \
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_floats.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_floats.pxd
similarity index 71%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_floats.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_floats.pxd
index 5fbf2be0244..f4fc4674506 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_floats.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_floats.pxd
@@ -1,10 +1,9 @@
 # Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.types cimport data_type
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.types cimport data_type
 
 
 cdef extern from "cudf/strings/convert/convert_floats.hpp" namespace \
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_integers.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_integers.pxd
similarity index 80%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_integers.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_integers.pxd
index 3d6c59cbfcf..f12aab0a2e4 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_integers.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_integers.pxd
@@ -1,10 +1,9 @@
 # Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.types cimport data_type
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.types cimport data_type
 
 
 cdef extern from "cudf/strings/convert/convert_integers.hpp" namespace \
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd
similarity index 76%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd
index 86de956b6b6..fe571cfced6 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_ipv4.pxd
@@ -1,9 +1,8 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
 
 
 cdef extern from "cudf/strings/convert/convert_ipv4.hpp" namespace \
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_lists.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_lists.pxd
similarity index 62%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_lists.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_lists.pxd
index aba2dbcca64..109111568d8 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_lists.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_lists.pxd
@@ -1,9 +1,8 @@
 # Copyright (c) 2021-2024, NVIDIA CORPORATION.
 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
 
 
 cdef extern from "cudf/strings/convert/convert_lists.hpp" namespace \
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_urls.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_urls.pxd
similarity index 72%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_urls.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_urls.pxd
index fb7e0cae6de..5c07b698454 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/convert/convert_urls.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/convert/convert_urls.pxd
@@ -1,9 +1,8 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
 
 
 cdef extern from "cudf/strings/convert/convert_urls.hpp" namespace \
diff --git a/python/pylibcudf/pylibcudf/libcudf/strings/extract.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/extract.pxd
new file mode 100644
index 00000000000..12cd628fc1f
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/extract.pxd
@@ -0,0 +1,14 @@
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.strings.regex_program cimport regex_program
+from pylibcudf.libcudf.table.table cimport table
+
+
+cdef extern from "cudf/strings/extract.hpp" namespace "cudf::strings" nogil:
+
+    cdef unique_ptr[table] extract(
+        column_view source_strings,
+        regex_program) except +
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/find.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/find.pxd
similarity index 83%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/find.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/find.pxd
index 04e2ed554ee..1d1df1b8b8e 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/find.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/find.pxd
@@ -2,11 +2,10 @@
 
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "cudf/strings/find.hpp" namespace "cudf::strings" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/find_multiple.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/find_multiple.pxd
similarity index 68%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/find_multiple.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/find_multiple.pxd
index 1f1adc8e99f..0491644a10a 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/find_multiple.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/find_multiple.pxd
@@ -1,9 +1,8 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
 
 
 cdef extern from "cudf/strings/find_multiple.hpp" namespace "cudf::strings" \
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/findall.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/findall.pxd
similarity index 56%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/findall.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/findall.pxd
index 4bc450b8911..b25724586e1 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/findall.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/findall.pxd
@@ -1,10 +1,9 @@
 # Copyright (c) 2019-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.strings.regex_program cimport regex_program
 
 
 cdef extern from "cudf/strings/findall.hpp" namespace "cudf::strings" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/json.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/json.pxd
similarity index 79%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/json.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/json.pxd
index 5926fa1d29f..571ba7be7af 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/json.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/json.pxd
@@ -3,10 +3,9 @@
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar, string_scalar
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.scalar.scalar cimport scalar, string_scalar
 
 
 cdef extern from "cudf/json/json.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/padding.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/padding.pxd
similarity index 59%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/padding.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/padding.pxd
index 26681a1aa00..657fe61eb14 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/padding.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/padding.pxd
@@ -2,12 +2,11 @@
 from libc.stdint cimport int32_t
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from cudf._lib.pylibcudf.libcudf.strings.side_type cimport side_type
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.strings.side_type cimport side_type
+from pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "cudf/strings/padding.hpp" namespace "cudf::strings" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_flags.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/regex_flags.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_flags.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/regex_flags.pxd
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_flags.pyx b/python/pylibcudf/pylibcudf/libcudf/strings/regex_flags.pyx
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_flags.pyx
rename to python/pylibcudf/pylibcudf/libcudf/strings/regex_flags.pyx
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_program.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/regex_program.pxd
similarity index 84%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_program.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/regex_program.pxd
index e92c8bd7737..5d1d9e583d5 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/regex_program.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/regex_program.pxd
@@ -2,8 +2,7 @@
 
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
-
-from cudf._lib.pylibcudf.libcudf.strings.regex_flags cimport regex_flags
+from pylibcudf.libcudf.strings.regex_flags cimport regex_flags
 
 
 cdef extern from "cudf/strings/regex/regex_program.hpp" \
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/repeat.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/repeat.pxd
similarity index 67%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/repeat.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/repeat.pxd
index 9e128529406..410ff58f299 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/repeat.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/repeat.pxd
@@ -1,10 +1,9 @@
 # Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "cudf/strings/repeat_strings.hpp" namespace "cudf::strings" \
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/replace.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/replace.pxd
similarity index 73%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/replace.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/replace.pxd
index 34e03eec638..fd5f4fc4751 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/replace.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/replace.pxd
@@ -3,11 +3,10 @@
 from libc.stdint cimport int32_t
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "cudf/strings/replace.hpp" namespace "cudf::strings" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/replace_re.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/replace_re.pxd
similarity index 63%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/replace_re.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/replace_re.pxd
index 739505cd51d..40f0e2fa50c 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/replace_re.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/replace_re.pxd
@@ -3,13 +3,12 @@
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 from libcpp.vector cimport vector
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program
-from cudf._lib.pylibcudf.libcudf.table.table cimport table
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.strings.regex_program cimport regex_program
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "cudf/strings/replace_re.hpp" namespace "cudf::strings" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/side_type.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/side_type.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/side_type.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/side_type.pxd
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/split/__init__.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/split/__init__.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/split/__init__.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/split/__init__.pxd
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/split/__init__.py b/python/pylibcudf/pylibcudf/libcudf/strings/split/__init__.py
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/split/__init__.py
rename to python/pylibcudf/pylibcudf/libcudf/strings/split/__init__.py
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/split/partition.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/split/partition.pxd
similarity index 63%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/split/partition.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/split/partition.pxd
index 5119124b3e3..4162e886a7d 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/split/partition.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/split/partition.pxd
@@ -2,11 +2,10 @@
 
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from cudf._lib.pylibcudf.libcudf.table.table cimport table
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.table.table cimport table
 
 
 cdef extern from "cudf/strings/split/partition.hpp" namespace \
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/split/split.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/split/split.pxd
similarity index 78%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/split/split.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/split/split.pxd
index 4f75664e47a..3046149aebb 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/split/split.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/split/split.pxd
@@ -2,13 +2,12 @@
 
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program
-from cudf._lib.pylibcudf.libcudf.table.table cimport table
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.strings.regex_program cimport regex_program
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "cudf/strings/split/split.hpp" namespace \
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/strip.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/strip.pxd
similarity index 52%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/strip.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/strip.pxd
index 2d6fd6a9e89..b0ca771762d 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/strip.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/strip.pxd
@@ -1,11 +1,10 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from cudf._lib.pylibcudf.libcudf.strings.side_type cimport side_type
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.strings.side_type cimport side_type
 
 
 cdef extern from "cudf/strings/strip.hpp" namespace "cudf::strings" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/substring.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/substring.pxd
similarity index 66%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/substring.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/substring.pxd
index 02123cc0807..576dae9387f 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/substring.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/substring.pxd
@@ -1,11 +1,10 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport numeric_scalar
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.scalar.scalar cimport numeric_scalar
+from pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "cudf/strings/slice.hpp" namespace "cudf::strings" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/translate.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/translate.pxd
similarity index 73%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/translate.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/translate.pxd
index b23ac277216..85fa719128a 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/translate.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/translate.pxd
@@ -4,11 +4,10 @@ from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.pair cimport pair
 from libcpp.vector cimport vector
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from cudf._lib.pylibcudf.libcudf.types cimport char_utf8
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.types cimport char_utf8
 
 
 cdef extern from "cudf/strings/translate.hpp" namespace "cudf::strings" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/wrap.pxd b/python/pylibcudf/pylibcudf/libcudf/strings/wrap.pxd
similarity index 58%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings/wrap.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings/wrap.pxd
index 1d92d445634..c0053391328 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings/wrap.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings/wrap.pxd
@@ -1,10 +1,9 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "cudf/strings/wrap.hpp" namespace "cudf::strings" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings_udf.pxd b/python/pylibcudf/pylibcudf/libcudf/strings_udf.pxd
similarity index 85%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/strings_udf.pxd
rename to python/pylibcudf/pylibcudf/libcudf/strings_udf.pxd
index 804ad30dfb1..0c8fe1060ac 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/strings_udf.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/strings_udf.pxd
@@ -4,13 +4,12 @@ from libc.stdint cimport uint8_t, uint16_t
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 from libcpp.vector cimport vector
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.types cimport size_type
 
 from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer
 
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
-
 
 cdef extern from "cudf/strings/udf/udf_string.hpp" namespace \
         "cudf::strings::udf" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/table/__init__.pxd b/python/pylibcudf/pylibcudf/libcudf/table/__init__.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/table/__init__.pxd
rename to python/pylibcudf/pylibcudf/libcudf/table/__init__.pxd
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/table/__init__.py b/python/pylibcudf/pylibcudf/libcudf/table/__init__.py
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/table/__init__.py
rename to python/pylibcudf/pylibcudf/libcudf/table/__init__.py
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/table/table.pxd b/python/pylibcudf/pylibcudf/libcudf/table/table.pxd
similarity index 69%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/table/table.pxd
rename to python/pylibcudf/pylibcudf/libcudf/table/table.pxd
index 737a1327d45..654c29b083a 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/table/table.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/table/table.pxd
@@ -2,13 +2,9 @@
 
 from libcpp.memory cimport unique_ptr
 from libcpp.vector cimport vector
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.table.table_view cimport (
-    mutable_table_view,
-    table_view,
-)
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.table.table_view cimport mutable_table_view, table_view
+from pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "cudf/table/table.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/table/table_view.pxd b/python/pylibcudf/pylibcudf/libcudf/table/table_view.pxd
similarity index 87%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/table/table_view.pxd
rename to python/pylibcudf/pylibcudf/libcudf/table/table_view.pxd
index 00e1a89c025..3af2f6a6c2c 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/table/table_view.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/table/table_view.pxd
@@ -1,12 +1,11 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 from libcpp.vector cimport vector
-
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport (
+from pylibcudf.libcudf.column.column_view cimport (
     column_view,
     mutable_column_view,
 )
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.types cimport size_type
 
 
 cdef extern from "cudf/table/table_view.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/transform.pxd b/python/pylibcudf/pylibcudf/libcudf/transform.pxd
similarity index 73%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/transform.pxd
rename to python/pylibcudf/pylibcudf/libcudf/transform.pxd
index b0a978fe5c5..38298a7c1f1 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/transform.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/transform.pxd
@@ -4,20 +4,15 @@ from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.pair cimport pair
 from libcpp.string cimport string
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.expressions cimport expression
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.libcudf.table.table_view cimport table_view
+from pylibcudf.libcudf.types cimport bitmask_type, data_type, size_type
 
 from rmm._lib.device_buffer cimport device_buffer
 
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.expressions cimport expression
-from cudf._lib.pylibcudf.libcudf.table.table cimport table
-from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
-from cudf._lib.pylibcudf.libcudf.types cimport (
-    bitmask_type,
-    data_type,
-    size_type,
-)
-
 
 cdef extern from "cudf/transform.hpp" namespace "cudf" nogil:
     cdef pair[unique_ptr[device_buffer], size_type] bools_to_mask (
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/transpose.pxd b/python/pylibcudf/pylibcudf/libcudf/transpose.pxd
similarity index 69%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/transpose.pxd
rename to python/pylibcudf/pylibcudf/libcudf/transpose.pxd
index 5dcb9c165ad..9c0e3c073b0 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/transpose.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/transpose.pxd
@@ -2,9 +2,8 @@
 
 from libcpp.memory cimport unique_ptr
 from libcpp.pair cimport pair
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.table.table_view cimport table_view
 
 
 cdef extern from "cudf/transpose.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/types.pxd b/python/pylibcudf/pylibcudf/libcudf/types.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/types.pxd
rename to python/pylibcudf/pylibcudf/libcudf/types.pxd
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/types.pyx b/python/pylibcudf/pylibcudf/libcudf/types.pyx
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/types.pyx
rename to python/pylibcudf/pylibcudf/libcudf/types.pyx
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/unary.pxd b/python/pylibcudf/pylibcudf/libcudf/unary.pxd
similarity index 85%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/unary.pxd
rename to python/pylibcudf/pylibcudf/libcudf/unary.pxd
index 2a1b189af51..887f8c7fca4 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/unary.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/unary.pxd
@@ -3,10 +3,9 @@
 from libc.stdint cimport int32_t
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.types cimport data_type
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.types cimport data_type
 
 
 cdef extern from "cudf/unary.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/unary.pyx b/python/pylibcudf/pylibcudf/libcudf/unary.pyx
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/unary.pyx
rename to python/pylibcudf/pylibcudf/libcudf/unary.pyx
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/__init__.pxd b/python/pylibcudf/pylibcudf/libcudf/utilities/__init__.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/__init__.pxd
rename to python/pylibcudf/pylibcudf/libcudf/utilities/__init__.pxd
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/__init__.py b/python/pylibcudf/pylibcudf/libcudf/utilities/__init__.py
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/__init__.py
rename to python/pylibcudf/pylibcudf/libcudf/utilities/__init__.py
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/host_span.pxd b/python/pylibcudf/pylibcudf/libcudf/utilities/host_span.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/host_span.pxd
rename to python/pylibcudf/pylibcudf/libcudf/utilities/host_span.pxd
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/traits.pxd b/python/pylibcudf/pylibcudf/libcudf/utilities/traits.pxd
similarity index 93%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/traits.pxd
rename to python/pylibcudf/pylibcudf/libcudf/utilities/traits.pxd
index 0cc58af735b..69765e44274 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/traits.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/utilities/traits.pxd
@@ -2,8 +2,7 @@
 
 from libcpp cimport bool
 from libcpp.vector cimport vector
-
-from cudf._lib.pylibcudf.libcudf.types cimport data_type
+from pylibcudf.libcudf.types cimport data_type
 
 
 cdef extern from "cudf/utilities/traits.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/type_dispatcher.pxd b/python/pylibcudf/pylibcudf/libcudf/utilities/type_dispatcher.pxd
similarity index 73%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/type_dispatcher.pxd
rename to python/pylibcudf/pylibcudf/libcudf/utilities/type_dispatcher.pxd
index 890fca3a662..fbeb6e9db90 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/utilities/type_dispatcher.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/utilities/type_dispatcher.pxd
@@ -1,6 +1,6 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from cudf._lib.pylibcudf.libcudf.types cimport type_id
+from pylibcudf.libcudf.types cimport type_id
 
 
 cdef extern from "cudf/utilities/type_dispatcher.hpp" namespace "cudf" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/wrappers/__init__.pxd b/python/pylibcudf/pylibcudf/libcudf/wrappers/__init__.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/wrappers/__init__.pxd
rename to python/pylibcudf/pylibcudf/libcudf/wrappers/__init__.pxd
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/wrappers/__init__.py b/python/pylibcudf/pylibcudf/libcudf/wrappers/__init__.py
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/wrappers/__init__.py
rename to python/pylibcudf/pylibcudf/libcudf/wrappers/__init__.py
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/wrappers/decimals.pxd b/python/pylibcudf/pylibcudf/libcudf/wrappers/decimals.pxd
similarity index 90%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/wrappers/decimals.pxd
rename to python/pylibcudf/pylibcudf/libcudf/wrappers/decimals.pxd
index 09b0c87e4b8..558299501d6 100644
--- a/python/cudf/cudf/_lib/pylibcudf/libcudf/wrappers/decimals.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/wrappers/decimals.pxd
@@ -1,8 +1,7 @@
 # Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
 from libc.stdint cimport int32_t, int64_t
-
-from cudf._lib.pylibcudf.libcudf.types cimport int128
+from pylibcudf.libcudf.types cimport int128
 
 
 cdef extern from "cudf/fixed_point/fixed_point.hpp" namespace "numeric" nogil:
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/wrappers/durations.pxd b/python/pylibcudf/pylibcudf/libcudf/wrappers/durations.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/wrappers/durations.pxd
rename to python/pylibcudf/pylibcudf/libcudf/wrappers/durations.pxd
diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/wrappers/timestamps.pxd b/python/pylibcudf/pylibcudf/libcudf/wrappers/timestamps.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/libcudf/wrappers/timestamps.pxd
rename to python/pylibcudf/pylibcudf/libcudf/wrappers/timestamps.pxd
diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pxd b/python/pylibcudf/pylibcudf/lists.pxd
similarity index 94%
rename from python/cudf/cudf/_lib/pylibcudf/lists.pxd
rename to python/pylibcudf/pylibcudf/lists.pxd
index 17619b489d2..e7d006e6e2e 100644
--- a/python/cudf/cudf/_lib/pylibcudf/lists.pxd
+++ b/python/pylibcudf/pylibcudf/lists.pxd
@@ -1,8 +1,7 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 from libcpp cimport bool
-
-from cudf._lib.pylibcudf.libcudf.types cimport null_order, size_type
+from pylibcudf.libcudf.types cimport null_order, size_type
 
 from .column cimport Column
 from .scalar cimport Scalar
diff --git a/python/cudf/cudf/_lib/pylibcudf/lists.pyx b/python/pylibcudf/pylibcudf/lists.pyx
similarity index 95%
rename from python/cudf/cudf/_lib/pylibcudf/lists.pyx
rename to python/pylibcudf/pylibcudf/lists.pyx
index c944fc35800..947caddc485 100644
--- a/python/cudf/cudf/_lib/pylibcudf/lists.pyx
+++ b/python/pylibcudf/pylibcudf/lists.pyx
@@ -4,9 +4,8 @@ from cython.operator cimport dereference
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.lists cimport (
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.lists cimport (
     contains as cpp_contains,
     explode as cpp_explode,
     filling as cpp_filling,
@@ -14,34 +13,34 @@ from cudf._lib.pylibcudf.libcudf.lists cimport (
     reverse as cpp_reverse,
     set_operations as cpp_set_operations,
 )
-from cudf._lib.pylibcudf.libcudf.lists.combine cimport (
+from pylibcudf.libcudf.lists.combine cimport (
     concatenate_list_elements as cpp_concatenate_list_elements,
     concatenate_null_policy,
     concatenate_rows as cpp_concatenate_rows,
 )
-from cudf._lib.pylibcudf.libcudf.lists.count_elements cimport (
+from pylibcudf.libcudf.lists.count_elements cimport (
     count_elements as cpp_count_elements,
 )
-from cudf._lib.pylibcudf.libcudf.lists.extract cimport (
+from pylibcudf.libcudf.lists.extract cimport (
     extract_list_element as cpp_extract_list_element,
 )
-from cudf._lib.pylibcudf.libcudf.lists.sorting cimport (
+from pylibcudf.libcudf.lists.sorting cimport (
     sort_lists as cpp_sort_lists,
     stable_sort_lists as cpp_stable_sort_lists,
 )
-from cudf._lib.pylibcudf.libcudf.lists.stream_compaction cimport (
+from pylibcudf.libcudf.lists.stream_compaction cimport (
     apply_boolean_mask as cpp_apply_boolean_mask,
     distinct as cpp_distinct,
 )
-from cudf._lib.pylibcudf.libcudf.table.table cimport table
-from cudf._lib.pylibcudf.libcudf.types cimport (
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.libcudf.types cimport (
     nan_equality,
     null_equality,
     null_order,
     order,
     size_type,
 )
-from cudf._lib.pylibcudf.lists cimport ColumnOrScalar, ColumnOrSizeType
+from pylibcudf.lists cimport ColumnOrScalar, ColumnOrSizeType
 
 from .column cimport Column, ListColumnView
 from .scalar cimport Scalar
@@ -131,8 +130,8 @@ cpdef Column contains(Column input, ColumnOrScalar search_key):
     the search_key is contained in the input.
 
     ``search_key`` may be a
-    :py:class:`~cudf._lib.pylibcudf.column.Column` or a
-    :py:class:`~cudf._lib.pylibcudf.scalar.Scalar`.
+    :py:class:`~pylibcudf.column.Column` or a
+    :py:class:`~pylibcudf.scalar.Scalar`.
 
     For details, see :cpp:func:`contains`.
 
@@ -192,8 +191,8 @@ cpdef Column index_of(Column input, ColumnOrScalar search_key, bool find_first_o
     key row within the corresponding list row in the lists column.
 
     ``search_key`` may be a
-    :py:class:`~cudf._lib.pylibcudf.column.Column` or a
-    :py:class:`~cudf._lib.pylibcudf.scalar.Scalar`.
+    :py:class:`~pylibcudf.column.Column` or a
+    :py:class:`~pylibcudf.scalar.Scalar`.
 
     For details, see :cpp:func:`index_of`.
 
diff --git a/python/cudf/cudf/_lib/pylibcudf/merge.pxd b/python/pylibcudf/pylibcudf/merge.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/merge.pxd
rename to python/pylibcudf/pylibcudf/merge.pxd
diff --git a/python/cudf/cudf/_lib/pylibcudf/merge.pyx b/python/pylibcudf/pylibcudf/merge.pyx
similarity index 83%
rename from python/cudf/cudf/_lib/pylibcudf/merge.pyx
rename to python/pylibcudf/pylibcudf/merge.pyx
index 5aa46c142f6..a7d43c9d158 100644
--- a/python/cudf/cudf/_lib/pylibcudf/merge.pyx
+++ b/python/pylibcudf/pylibcudf/merge.pyx
@@ -3,11 +3,10 @@
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
-
-from cudf._lib.pylibcudf.libcudf cimport merge as cpp_merge
-from cudf._lib.pylibcudf.libcudf.table.table cimport table
-from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
-from cudf._lib.pylibcudf.libcudf.types cimport null_order, order, size_type
+from pylibcudf.libcudf cimport merge as cpp_merge
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.libcudf.table.table_view cimport table_view
+from pylibcudf.libcudf.types cimport null_order, order, size_type
 
 from .table cimport Table
 
diff --git a/python/cudf/cudf/_lib/pylibcudf/quantiles.pxd b/python/pylibcudf/pylibcudf/quantiles.pxd
similarity index 86%
rename from python/cudf/cudf/_lib/pylibcudf/quantiles.pxd
rename to python/pylibcudf/pylibcudf/quantiles.pxd
index 70ff135ca77..fbc1dfb30a6 100644
--- a/python/cudf/cudf/_lib/pylibcudf/quantiles.pxd
+++ b/python/pylibcudf/pylibcudf/quantiles.pxd
@@ -1,7 +1,6 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 from libcpp.vector cimport vector
-
-from cudf._lib.pylibcudf.libcudf.types cimport interpolation, sorted
+from pylibcudf.libcudf.types cimport interpolation, sorted
 
 from .column cimport Column
 from .table cimport Table
diff --git a/python/cudf/cudf/_lib/pylibcudf/quantiles.pyx b/python/pylibcudf/pylibcudf/quantiles.pyx
similarity index 93%
rename from python/cudf/cudf/_lib/pylibcudf/quantiles.pyx
rename to python/pylibcudf/pylibcudf/quantiles.pyx
index c1f0e30ccd3..b847ade774d 100644
--- a/python/cudf/cudf/_lib/pylibcudf/quantiles.pyx
+++ b/python/pylibcudf/pylibcudf/quantiles.pyx
@@ -4,15 +4,14 @@ from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.quantiles cimport (
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.quantiles cimport (
     quantile as cpp_quantile,
     quantiles as cpp_quantiles,
 )
-from cudf._lib.pylibcudf.libcudf.table.table cimport table
-from cudf._lib.pylibcudf.libcudf.types cimport null_order, order, sorted
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.libcudf.types cimport null_order, order, sorted
 
 from .column cimport Column
 from .table cimport Table
diff --git a/python/cudf/cudf/_lib/pylibcudf/reduce.pxd b/python/pylibcudf/pylibcudf/reduce.pxd
similarity index 85%
rename from python/cudf/cudf/_lib/pylibcudf/reduce.pxd
rename to python/pylibcudf/pylibcudf/reduce.pxd
index 935efd4acf2..047f08297e4 100644
--- a/python/cudf/cudf/_lib/pylibcudf/reduce.pxd
+++ b/python/pylibcudf/pylibcudf/reduce.pxd
@@ -1,6 +1,6 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from cudf._lib.pylibcudf.libcudf.reduce cimport scan_type
+from pylibcudf.libcudf.reduce cimport scan_type
 
 from .aggregation cimport Aggregation
 from .column cimport Column
diff --git a/python/cudf/cudf/_lib/pylibcudf/reduce.pyx b/python/pylibcudf/pylibcudf/reduce.pyx
similarity index 85%
rename from python/cudf/cudf/_lib/pylibcudf/reduce.pyx
rename to python/pylibcudf/pylibcudf/reduce.pyx
index c272f183007..b0212a5b9c1 100644
--- a/python/cudf/cudf/_lib/pylibcudf/reduce.pyx
+++ b/python/pylibcudf/pylibcudf/reduce.pyx
@@ -3,23 +3,18 @@
 from cython.operator cimport dereference
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move, pair
-
-from cudf._lib.pylibcudf.libcudf cimport reduce as cpp_reduce
-from cudf._lib.pylibcudf.libcudf.aggregation cimport (
-    reduce_aggregation,
-    scan_aggregation,
-)
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.reduce cimport scan_type
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
+from pylibcudf.libcudf cimport reduce as cpp_reduce
+from pylibcudf.libcudf.aggregation cimport reduce_aggregation, scan_aggregation
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.reduce cimport scan_type
+from pylibcudf.libcudf.scalar.scalar cimport scalar
 
 from .aggregation cimport Aggregation
 from .column cimport Column
 from .scalar cimport Scalar
 from .types cimport DataType
 
-from cudf._lib.pylibcudf.libcudf.reduce import \
-    scan_type as ScanType  # no-cython-lint
+from pylibcudf.libcudf.reduce import scan_type as ScanType  # no-cython-lint
 
 
 cpdef Scalar reduce(Column col, Aggregation agg, DataType data_type):
diff --git a/python/cudf/cudf/_lib/pylibcudf/replace.pxd b/python/pylibcudf/pylibcudf/replace.pxd
similarity index 92%
rename from python/cudf/cudf/_lib/pylibcudf/replace.pxd
rename to python/pylibcudf/pylibcudf/replace.pxd
index 40484c728db..cb9fa8bf960 100644
--- a/python/cudf/cudf/_lib/pylibcudf/replace.pxd
+++ b/python/pylibcudf/pylibcudf/replace.pxd
@@ -1,8 +1,7 @@
 # Copyright (c) 2023-2024, NVIDIA CORPORATION.
 
 from libcpp cimport bool
-
-from cudf._lib.pylibcudf.libcudf.replace cimport replace_policy
+from pylibcudf.libcudf.replace cimport replace_policy
 
 from .column cimport Column
 from .scalar cimport Scalar
diff --git a/python/cudf/cudf/_lib/pylibcudf/replace.pyx b/python/pylibcudf/pylibcudf/replace.pyx
similarity index 97%
rename from python/cudf/cudf/_lib/pylibcudf/replace.pyx
rename to python/pylibcudf/pylibcudf/replace.pyx
index 6e08e8f64a9..115dee132fd 100644
--- a/python/cudf/cudf/_lib/pylibcudf/replace.pyx
+++ b/python/pylibcudf/pylibcudf/replace.pyx
@@ -6,11 +6,10 @@ from cython.operator import dereference
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
+from pylibcudf.libcudf cimport replace as cpp_replace
+from pylibcudf.libcudf.column.column cimport column
 
-from cudf._lib.pylibcudf.libcudf cimport replace as cpp_replace
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-
-from cudf._lib.pylibcudf.libcudf.replace import \
+from pylibcudf.libcudf.replace import \
     replace_policy as ReplacePolicy  # no-cython-lint
 
 from .column cimport Column
diff --git a/python/cudf/cudf/_lib/pylibcudf/reshape.pxd b/python/pylibcudf/pylibcudf/reshape.pxd
similarity index 80%
rename from python/cudf/cudf/_lib/pylibcudf/reshape.pxd
rename to python/pylibcudf/pylibcudf/reshape.pxd
index a7cc45d7a08..c4d3d375f7a 100644
--- a/python/cudf/cudf/_lib/pylibcudf/reshape.pxd
+++ b/python/pylibcudf/pylibcudf/reshape.pxd
@@ -1,6 +1,6 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.types cimport size_type
 
 from .column cimport Column
 from .scalar cimport Scalar
diff --git a/python/cudf/cudf/_lib/pylibcudf/reshape.pyx b/python/pylibcudf/pylibcudf/reshape.pyx
similarity index 86%
rename from python/cudf/cudf/_lib/pylibcudf/reshape.pyx
rename to python/pylibcudf/pylibcudf/reshape.pyx
index b68eba48cd6..a99145be900 100644
--- a/python/cudf/cudf/_lib/pylibcudf/reshape.pyx
+++ b/python/pylibcudf/pylibcudf/reshape.pyx
@@ -2,14 +2,13 @@
 
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.reshape cimport (
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.reshape cimport (
     interleave_columns as cpp_interleave_columns,
     tile as cpp_tile,
 )
-from cudf._lib.pylibcudf.libcudf.table.table cimport table
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.libcudf.types cimport size_type
 
 from .column cimport Column
 from .table cimport Table
diff --git a/python/cudf/cudf/_lib/pylibcudf/rolling.pxd b/python/pylibcudf/pylibcudf/rolling.pxd
similarity index 85%
rename from python/cudf/cudf/_lib/pylibcudf/rolling.pxd
rename to python/pylibcudf/pylibcudf/rolling.pxd
index cdadee68d43..9fcda21a62f 100644
--- a/python/cudf/cudf/_lib/pylibcudf/rolling.pxd
+++ b/python/pylibcudf/pylibcudf/rolling.pxd
@@ -1,6 +1,6 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf.types cimport size_type
 
 from .aggregation cimport Aggregation
 from .column cimport Column
diff --git a/python/cudf/cudf/_lib/pylibcudf/rolling.pyx b/python/pylibcudf/pylibcudf/rolling.pyx
similarity index 89%
rename from python/cudf/cudf/_lib/pylibcudf/rolling.pyx
rename to python/pylibcudf/pylibcudf/rolling.pyx
index 7aa7828a5dd..a46540d7ffa 100644
--- a/python/cudf/cudf/_lib/pylibcudf/rolling.pyx
+++ b/python/pylibcudf/pylibcudf/rolling.pyx
@@ -3,11 +3,10 @@
 from cython.operator cimport dereference
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
-
-from cudf._lib.pylibcudf.libcudf cimport rolling as cpp_rolling
-from cudf._lib.pylibcudf.libcudf.aggregation cimport rolling_aggregation
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
+from pylibcudf.libcudf cimport rolling as cpp_rolling
+from pylibcudf.libcudf.aggregation cimport rolling_aggregation
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.types cimport size_type
 
 from .aggregation cimport Aggregation
 from .column cimport Column
diff --git a/python/cudf/cudf/_lib/pylibcudf/round.pxd b/python/pylibcudf/pylibcudf/round.pxd
similarity index 77%
rename from python/cudf/cudf/_lib/pylibcudf/round.pxd
rename to python/pylibcudf/pylibcudf/round.pxd
index ccb64fc2847..c8501b03fad 100644
--- a/python/cudf/cudf/_lib/pylibcudf/round.pxd
+++ b/python/pylibcudf/pylibcudf/round.pxd
@@ -1,7 +1,6 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 from libc.stdint cimport int32_t
-
-from cudf._lib.pylibcudf.libcudf.round cimport rounding_method
+from pylibcudf.libcudf.round cimport rounding_method
 
 from .column cimport Column
 
diff --git a/python/cudf/cudf/_lib/pylibcudf/round.pyx b/python/pylibcudf/pylibcudf/round.pyx
similarity index 85%
rename from python/cudf/cudf/_lib/pylibcudf/round.pyx
rename to python/pylibcudf/pylibcudf/round.pyx
index cfcc2aafbb8..dc60d53b07e 100644
--- a/python/cudf/cudf/_lib/pylibcudf/round.pyx
+++ b/python/pylibcudf/pylibcudf/round.pyx
@@ -2,16 +2,12 @@
 from libc.stdint cimport int32_t
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
+from pylibcudf.libcudf.round cimport round as cpp_round, rounding_method
 
-from cudf._lib.pylibcudf.libcudf.round cimport (
-    round as cpp_round,
-    rounding_method,
-)
-
-from cudf._lib.pylibcudf.libcudf.round import \
+from pylibcudf.libcudf.round import \
     rounding_method as RoundingMethod  # no-cython-lint
 
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column cimport column
 
 from .column cimport Column
 
diff --git a/python/cudf/cudf/_lib/pylibcudf/scalar.pxd b/python/pylibcudf/pylibcudf/scalar.pxd
similarity index 92%
rename from python/cudf/cudf/_lib/pylibcudf/scalar.pxd
rename to python/pylibcudf/pylibcudf/scalar.pxd
index e6c9db2f1ac..8664dfa4b7e 100644
--- a/python/cudf/cudf/_lib/pylibcudf/scalar.pxd
+++ b/python/pylibcudf/pylibcudf/scalar.pxd
@@ -2,11 +2,10 @@
 
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
+from pylibcudf.libcudf.scalar.scalar cimport scalar
 
 from rmm._lib.memory_resource cimport DeviceMemoryResource
 
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
-
 from .column cimport Column
 from .types cimport DataType
 
diff --git a/python/cudf/cudf/_lib/pylibcudf/scalar.pyx b/python/pylibcudf/pylibcudf/scalar.pyx
similarity index 94%
rename from python/cudf/cudf/_lib/pylibcudf/scalar.pyx
rename to python/pylibcudf/pylibcudf/scalar.pyx
index 67730be07d8..3e20938af0c 100644
--- a/python/cudf/cudf/_lib/pylibcudf/scalar.pyx
+++ b/python/pylibcudf/pylibcudf/scalar.pyx
@@ -3,14 +3,11 @@
 from cython cimport no_gc_clear
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
+from pylibcudf.libcudf.scalar.scalar cimport scalar
+from pylibcudf.libcudf.scalar.scalar_factories cimport make_empty_scalar_like
 
 from rmm._lib.memory_resource cimport get_current_device_resource
 
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
-from cudf._lib.pylibcudf.libcudf.scalar.scalar_factories cimport (
-    make_empty_scalar_like,
-)
-
 from .column cimport Column
 from .types cimport DataType
 
diff --git a/python/cudf/cudf/_lib/pylibcudf/search.pxd b/python/pylibcudf/pylibcudf/search.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/search.pxd
rename to python/pylibcudf/pylibcudf/search.pxd
diff --git a/python/cudf/cudf/_lib/pylibcudf/search.pyx b/python/pylibcudf/pylibcudf/search.pyx
similarity index 93%
rename from python/cudf/cudf/_lib/pylibcudf/search.pyx
rename to python/pylibcudf/pylibcudf/search.pyx
index 151a39f204f..ff2468f3f9c 100644
--- a/python/cudf/cudf/_lib/pylibcudf/search.pyx
+++ b/python/pylibcudf/pylibcudf/search.pyx
@@ -3,10 +3,9 @@
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
-
-from cudf._lib.pylibcudf.libcudf cimport search as cpp_search
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.types cimport null_order, order
+from pylibcudf.libcudf cimport search as cpp_search
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.types cimport null_order, order
 
 from .column cimport Column
 from .table cimport Table
diff --git a/python/cudf/cudf/_lib/pylibcudf/sorting.pxd b/python/pylibcudf/pylibcudf/sorting.pxd
similarity index 87%
rename from python/cudf/cudf/_lib/pylibcudf/sorting.pxd
rename to python/pylibcudf/pylibcudf/sorting.pxd
index a4ea541a03b..8127ab21ad1 100644
--- a/python/cudf/cudf/_lib/pylibcudf/sorting.pxd
+++ b/python/pylibcudf/pylibcudf/sorting.pxd
@@ -1,14 +1,8 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 from libcpp cimport bool
-
-from cudf._lib.pylibcudf.libcudf.aggregation cimport rank_method
-from cudf._lib.pylibcudf.libcudf.types cimport (
-    null_order,
-    null_policy,
-    order,
-    size_type,
-)
+from pylibcudf.libcudf.aggregation cimport rank_method
+from pylibcudf.libcudf.types cimport null_order, null_policy, order, size_type
 
 from .column cimport Column
 from .table cimport Table
diff --git a/python/cudf/cudf/_lib/pylibcudf/sorting.pyx b/python/pylibcudf/pylibcudf/sorting.pyx
similarity index 96%
rename from python/cudf/cudf/_lib/pylibcudf/sorting.pyx
rename to python/pylibcudf/pylibcudf/sorting.pyx
index 8c5a8e26899..bd173eebacb 100644
--- a/python/cudf/cudf/_lib/pylibcudf/sorting.pyx
+++ b/python/pylibcudf/pylibcudf/sorting.pyx
@@ -3,12 +3,11 @@
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
-
-from cudf._lib.pylibcudf.libcudf cimport sorting as cpp_sorting
-from cudf._lib.pylibcudf.libcudf.aggregation cimport rank_method
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.table.table cimport table
-from cudf._lib.pylibcudf.libcudf.types cimport null_order, null_policy, order
+from pylibcudf.libcudf cimport sorting as cpp_sorting
+from pylibcudf.libcudf.aggregation cimport rank_method
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.libcudf.types cimport null_order, null_policy, order
 
 from .column cimport Column
 from .table cimport Table
diff --git a/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pxd b/python/pylibcudf/pylibcudf/stream_compaction.pxd
similarity index 89%
rename from python/cudf/cudf/_lib/pylibcudf/stream_compaction.pxd
rename to python/pylibcudf/pylibcudf/stream_compaction.pxd
index 6f89aaf90e7..a4f39792f0c 100644
--- a/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pxd
+++ b/python/pylibcudf/pylibcudf/stream_compaction.pxd
@@ -1,9 +1,7 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from cudf._lib.pylibcudf.libcudf.stream_compaction cimport (
-    duplicate_keep_option,
-)
-from cudf._lib.pylibcudf.libcudf.types cimport (
+from pylibcudf.libcudf.stream_compaction cimport duplicate_keep_option
+from pylibcudf.libcudf.types cimport (
     nan_equality,
     nan_policy,
     null_equality,
diff --git a/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pyx b/python/pylibcudf/pylibcudf/stream_compaction.pyx
similarity index 95%
rename from python/cudf/cudf/_lib/pylibcudf/stream_compaction.pyx
rename to python/pylibcudf/pylibcudf/stream_compaction.pyx
index 43449d3690a..b574bfa9fa2 100644
--- a/python/cudf/cudf/_lib/pylibcudf/stream_compaction.pyx
+++ b/python/pylibcudf/pylibcudf/stream_compaction.pyx
@@ -3,16 +3,11 @@
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
-
-from cudf._lib.pylibcudf.libcudf cimport (
-    stream_compaction as cpp_stream_compaction,
-)
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.stream_compaction cimport (
-    duplicate_keep_option,
-)
-from cudf._lib.pylibcudf.libcudf.table.table cimport table
-from cudf._lib.pylibcudf.libcudf.types cimport (
+from pylibcudf.libcudf cimport stream_compaction as cpp_stream_compaction
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.stream_compaction cimport duplicate_keep_option
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.libcudf.types cimport (
     nan_equality,
     nan_policy,
     null_equality,
@@ -20,7 +15,7 @@ from cudf._lib.pylibcudf.libcudf.types cimport (
     size_type,
 )
 
-from cudf._lib.pylibcudf.libcudf.stream_compaction import \
+from pylibcudf.libcudf.stream_compaction import \
     duplicate_keep_option as DuplicateKeepOption  # no-cython-lint, isort:skip
 
 from .column cimport Column
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/strings/CMakeLists.txt
rename to python/pylibcudf/pylibcudf/strings/CMakeLists.txt
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd b/python/pylibcudf/pylibcudf/strings/__init__.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/strings/__init__.pxd
rename to python/pylibcudf/pylibcudf/strings/__init__.pxd
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/strings/__init__.py
rename to python/pylibcudf/pylibcudf/strings/__init__.py
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pxd b/python/pylibcudf/pylibcudf/strings/capitalize.pxd
similarity index 64%
rename from python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pxd
rename to python/pylibcudf/pylibcudf/strings/capitalize.pxd
index 9acf189fc23..b45949d4eb4 100644
--- a/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pxd
+++ b/python/pylibcudf/pylibcudf/strings/capitalize.pxd
@@ -1,7 +1,7 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from cudf._lib.pylibcudf.column cimport Column
-from cudf._lib.pylibcudf.scalar cimport Scalar
+from pylibcudf.column cimport Column
+from pylibcudf.scalar cimport Scalar
 
 
 cpdef Column capitalize(Column input, Scalar delimiters=*)
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx b/python/pylibcudf/pylibcudf/strings/capitalize.pyx
similarity index 84%
rename from python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx
rename to python/pylibcudf/pylibcudf/strings/capitalize.pyx
index ccf84d25572..06b991c3cf1 100644
--- a/python/cudf/cudf/_lib/pylibcudf/strings/capitalize.pyx
+++ b/python/pylibcudf/pylibcudf/strings/capitalize.pyx
@@ -2,16 +2,15 @@
 
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
-
-from cudf._lib.pylibcudf.column cimport Column
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from cudf._lib.pylibcudf.libcudf.scalar.scalar_factories cimport (
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.scalar.scalar_factories cimport (
     make_string_scalar as cpp_make_string_scalar,
 )
-from cudf._lib.pylibcudf.libcudf.strings cimport capitalize as cpp_capitalize
-from cudf._lib.pylibcudf.scalar cimport Scalar
-from cudf._lib.pylibcudf.strings.char_types cimport string_character_types
+from pylibcudf.libcudf.strings cimport capitalize as cpp_capitalize
+from pylibcudf.scalar cimport Scalar
+from pylibcudf.strings.char_types cimport string_character_types
 
 from cython.operator import dereference
 
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/case.pxd b/python/pylibcudf/pylibcudf/strings/case.pxd
similarity index 76%
rename from python/cudf/cudf/_lib/pylibcudf/strings/case.pxd
rename to python/pylibcudf/pylibcudf/strings/case.pxd
index 225d566fe06..d3c98d5e3dc 100644
--- a/python/cudf/cudf/_lib/pylibcudf/strings/case.pxd
+++ b/python/pylibcudf/pylibcudf/strings/case.pxd
@@ -1,6 +1,6 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from cudf._lib.pylibcudf.column cimport Column
+from pylibcudf.column cimport Column
 
 
 cpdef Column to_lower(Column input)
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/case.pyx b/python/pylibcudf/pylibcudf/strings/case.pyx
similarity index 79%
rename from python/cudf/cudf/_lib/pylibcudf/strings/case.pyx
rename to python/pylibcudf/pylibcudf/strings/case.pyx
index 3a360fd6b10..9e6cd7717d3 100644
--- a/python/cudf/cudf/_lib/pylibcudf/strings/case.pyx
+++ b/python/pylibcudf/pylibcudf/strings/case.pyx
@@ -2,10 +2,9 @@
 
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
-
-from cudf._lib.pylibcudf.column cimport Column
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.strings cimport case as cpp_case
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.strings cimport case as cpp_case
 
 
 cpdef Column to_lower(Column input):
diff --git a/python/pylibcudf/pylibcudf/strings/char_types.pxd b/python/pylibcudf/pylibcudf/strings/char_types.pxd
new file mode 100644
index 00000000000..ad4e4cf61d8
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/char_types.pxd
@@ -0,0 +1,3 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.libcudf.strings.char_types cimport string_character_types
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/char_types.pyx b/python/pylibcudf/pylibcudf/strings/char_types.pyx
similarity index 64%
rename from python/cudf/cudf/_lib/pylibcudf/strings/char_types.pyx
rename to python/pylibcudf/pylibcudf/strings/char_types.pyx
index d96161951c6..e7621fb4d84 100644
--- a/python/cudf/cudf/_lib/pylibcudf/strings/char_types.pyx
+++ b/python/pylibcudf/pylibcudf/strings/char_types.pyx
@@ -1,4 +1,4 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from cudf._lib.pylibcudf.libcudf.strings.char_types import \
+from pylibcudf.libcudf.strings.char_types import \
     string_character_types as StringCharacterTypes  # no-cython-lint
diff --git a/python/pylibcudf/pylibcudf/strings/contains.pxd b/python/pylibcudf/pylibcudf/strings/contains.pxd
new file mode 100644
index 00000000000..2cd4891a0ea
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/contains.pxd
@@ -0,0 +1,7 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.column cimport Column
+from pylibcudf.strings.regex_program cimport RegexProgram
+
+
+cpdef Column contains_re(Column input, RegexProgram prog)
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/contains.pyx b/python/pylibcudf/pylibcudf/strings/contains.pyx
similarity index 75%
rename from python/cudf/cudf/_lib/pylibcudf/strings/contains.pyx
rename to python/pylibcudf/pylibcudf/strings/contains.pyx
index 8c598b7c953..1a2446f6e2c 100644
--- a/python/cudf/cudf/_lib/pylibcudf/strings/contains.pyx
+++ b/python/pylibcudf/pylibcudf/strings/contains.pyx
@@ -1,11 +1,10 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
-
-from cudf._lib.pylibcudf.column cimport Column
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.strings cimport contains as cpp_contains
-from cudf._lib.pylibcudf.strings.regex_program cimport RegexProgram
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.strings cimport contains as cpp_contains
+from pylibcudf.strings.regex_program cimport RegexProgram
 
 
 cpdef Column contains_re(
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/find.pxd b/python/pylibcudf/pylibcudf/strings/find.pxd
similarity index 77%
rename from python/cudf/cudf/_lib/pylibcudf/strings/find.pxd
rename to python/pylibcudf/pylibcudf/strings/find.pxd
index bb43069f190..e7524a9360b 100644
--- a/python/cudf/cudf/_lib/pylibcudf/strings/find.pxd
+++ b/python/pylibcudf/pylibcudf/strings/find.pxd
@@ -1,8 +1,8 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from cudf._lib.pylibcudf.column cimport Column
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
-from cudf._lib.pylibcudf.scalar cimport Scalar
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.types cimport size_type
+from pylibcudf.scalar cimport Scalar
 
 ctypedef fused ColumnOrScalar:
     Column
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/find.pyx b/python/pylibcudf/pylibcudf/strings/find.pyx
similarity index 90%
rename from python/cudf/cudf/_lib/pylibcudf/strings/find.pyx
rename to python/pylibcudf/pylibcudf/strings/find.pyx
index a0214efd0a1..22d370bf7e8 100644
--- a/python/cudf/cudf/_lib/pylibcudf/strings/find.pyx
+++ b/python/pylibcudf/pylibcudf/strings/find.pyx
@@ -1,15 +1,14 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
-
-from cudf._lib.pylibcudf.column cimport Column
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.strings cimport find as cpp_find
-from cudf._lib.pylibcudf.scalar cimport Scalar
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.strings cimport find as cpp_find
+from pylibcudf.scalar cimport Scalar
 
 from cython.operator import dereference
 
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
 
 
 cpdef Column find(
@@ -22,8 +21,8 @@ cpdef Column find(
     first found in each string of the provided column.
 
     ``target`` may be a
-    :py:class:`~cudf._lib.pylibcudf.column.Column` or a
-    :py:class:`~cudf._lib.pylibcudf.scalar.Scalar`.
+    :py:class:`~pylibcudf.column.Column` or a
+    :py:class:`~pylibcudf.scalar.Scalar`.
 
     If ``target`` is a scalar, the scalar will be searched for in each string.
     If ``target`` is a column, the corresponding string in the column will be
@@ -126,8 +125,8 @@ cpdef Column contains(
     column.
 
     ``target`` may be a
-    :py:class:`~cudf._lib.pylibcudf.column.Column` or a
-    :py:class:`~cudf._lib.pylibcudf.scalar.Scalar`.
+    :py:class:`~pylibcudf.column.Column` or a
+    :py:class:`~pylibcudf.scalar.Scalar`.
 
     If ``target`` is a scalar, the scalar will be searched for in each string.
     If ``target`` is a column, the corresponding string in the column will be
@@ -180,8 +179,8 @@ cpdef Column starts_with(
     column.
 
     ``target`` may be a
-    :py:class:`~cudf._lib.pylibcudf.column.Column` or a
-    :py:class:`~cudf._lib.pylibcudf.scalar.Scalar`.
+    :py:class:`~pylibcudf.column.Column` or a
+    :py:class:`~pylibcudf.scalar.Scalar`.
 
     If ``target`` is a scalar, the scalar will be searched for in each string.
     If ``target`` is a column, the corresponding string in the column will be
@@ -233,8 +232,8 @@ cpdef Column ends_with(
     target string was found at the end of the string in the provided column.
 
     ``target`` may be a
-    :py:class:`~cudf._lib.pylibcudf.column.Column` or a
-    :py:class:`~cudf._lib.pylibcudf.scalar.Scalar`.
+    :py:class:`~pylibcudf.column.Column` or a
+    :py:class:`~pylibcudf.scalar.Scalar`.
 
     If ``target`` is a scalar, the scalar will be searched for in each string.
     If ``target`` is a column, the corresponding string in the column will be
diff --git a/python/pylibcudf/pylibcudf/strings/regex_flags.pxd b/python/pylibcudf/pylibcudf/strings/regex_flags.pxd
new file mode 100644
index 00000000000..1ce3cd07df8
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/strings/regex_flags.pxd
@@ -0,0 +1,2 @@
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+from pylibcudf.libcudf.strings.regex_flags cimport regex_flags
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pyx b/python/pylibcudf/pylibcudf/strings/regex_flags.pyx
similarity index 59%
rename from python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pyx
rename to python/pylibcudf/pylibcudf/strings/regex_flags.pyx
index 903c2ddd503..ce3b6b10a42 100644
--- a/python/cudf/cudf/_lib/pylibcudf/strings/regex_flags.pyx
+++ b/python/pylibcudf/pylibcudf/strings/regex_flags.pyx
@@ -1,4 +1,4 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from cudf._lib.pylibcudf.libcudf.strings.regex_flags import \
+from pylibcudf.libcudf.strings.regex_flags import \
     regex_flags as RegexFlags  # no-cython-lint
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pxd b/python/pylibcudf/pylibcudf/strings/regex_program.pxd
similarity index 70%
rename from python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pxd
rename to python/pylibcudf/pylibcudf/strings/regex_program.pxd
index 61ed268fb2d..045cc1e1c6b 100644
--- a/python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pxd
+++ b/python/pylibcudf/pylibcudf/strings/regex_program.pxd
@@ -2,8 +2,7 @@
 
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
-
-from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program
+from pylibcudf.libcudf.strings.regex_program cimport regex_program
 
 
 cdef class RegexProgram:
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pyx b/python/pylibcudf/pylibcudf/strings/regex_program.pyx
similarity index 84%
rename from python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pyx
rename to python/pylibcudf/pylibcudf/strings/regex_program.pyx
index 5f0b8868452..f426b6888ae 100644
--- a/python/cudf/cudf/_lib/pylibcudf/strings/regex_program.pyx
+++ b/python/pylibcudf/pylibcudf/strings/regex_program.pyx
@@ -4,12 +4,12 @@
 from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 from libcpp.utility cimport move
+from pylibcudf.libcudf.strings.regex_flags cimport regex_flags
+from pylibcudf.libcudf.strings.regex_program cimport regex_program
 
-from cudf._lib.pylibcudf.libcudf.strings.regex_flags cimport regex_flags
-from cudf._lib.pylibcudf.libcudf.strings.regex_program cimport regex_program
+from pylibcudf.strings.regex_flags import RegexFlags
 
-from cudf._lib.pylibcudf.strings.regex_flags import RegexFlags
-from cudf._lib.pylibcudf.strings.regex_flags cimport regex_flags
+from pylibcudf.strings.regex_flags cimport regex_flags
 
 
 cdef class RegexProgram:
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/replace.pxd b/python/pylibcudf/pylibcudf/strings/replace.pxd
similarity index 71%
rename from python/cudf/cudf/_lib/pylibcudf/strings/replace.pxd
rename to python/pylibcudf/pylibcudf/strings/replace.pxd
index 52e2dc3c738..26273b96c57 100644
--- a/python/cudf/cudf/_lib/pylibcudf/strings/replace.pxd
+++ b/python/pylibcudf/pylibcudf/strings/replace.pxd
@@ -1,8 +1,8 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from cudf._lib.pylibcudf.column cimport Column
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
-from cudf._lib.pylibcudf.scalar cimport Scalar
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.types cimport size_type
+from pylibcudf.scalar cimport Scalar
 
 
 cpdef Column replace(
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/replace.pyx b/python/pylibcudf/pylibcudf/strings/replace.pyx
similarity index 90%
rename from python/cudf/cudf/_lib/pylibcudf/strings/replace.pyx
rename to python/pylibcudf/pylibcudf/strings/replace.pyx
index c757150a600..9d0ebf4a814 100644
--- a/python/cudf/cudf/_lib/pylibcudf/strings/replace.pyx
+++ b/python/pylibcudf/pylibcudf/strings/replace.pyx
@@ -2,20 +2,19 @@
 
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
-
-from cudf._lib.pylibcudf.column cimport Column
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport string_scalar
-from cudf._lib.pylibcudf.libcudf.scalar.scalar_factories cimport (
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.scalar.scalar cimport string_scalar
+from pylibcudf.libcudf.scalar.scalar_factories cimport (
     make_string_scalar as cpp_make_string_scalar,
 )
-from cudf._lib.pylibcudf.libcudf.strings.replace cimport (
+from pylibcudf.libcudf.strings.replace cimport (
     replace as cpp_replace,
     replace_multiple as cpp_replace_multiple,
     replace_slice as cpp_replace_slice,
 )
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
-from cudf._lib.pylibcudf.scalar cimport Scalar
+from pylibcudf.libcudf.types cimport size_type
+from pylibcudf.scalar cimport Scalar
 
 
 cpdef Column replace(
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/slice.pxd b/python/pylibcudf/pylibcudf/strings/slice.pxd
similarity index 69%
rename from python/cudf/cudf/_lib/pylibcudf/strings/slice.pxd
rename to python/pylibcudf/pylibcudf/strings/slice.pxd
index 7d8d0006ef4..01e9f2b3c88 100644
--- a/python/cudf/cudf/_lib/pylibcudf/strings/slice.pxd
+++ b/python/pylibcudf/pylibcudf/strings/slice.pxd
@@ -1,7 +1,7 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from cudf._lib.pylibcudf.column cimport Column
-from cudf._lib.pylibcudf.scalar cimport Scalar
+from pylibcudf.column cimport Column
+from pylibcudf.scalar cimport Scalar
 
 ctypedef fused ColumnOrScalar:
     Column
diff --git a/python/cudf/cudf/_lib/pylibcudf/strings/slice.pyx b/python/pylibcudf/pylibcudf/strings/slice.pyx
similarity index 81%
rename from python/cudf/cudf/_lib/pylibcudf/strings/slice.pyx
rename to python/pylibcudf/pylibcudf/strings/slice.pyx
index df75134fb71..70d10cab36c 100644
--- a/python/cudf/cudf/_lib/pylibcudf/strings/slice.pyx
+++ b/python/pylibcudf/pylibcudf/strings/slice.pyx
@@ -2,16 +2,15 @@
 
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
-
-from cudf._lib.pylibcudf.column cimport Column
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport numeric_scalar
-from cudf._lib.pylibcudf.libcudf.scalar.scalar_factories cimport (
+from pylibcudf.column cimport Column
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.scalar.scalar cimport numeric_scalar
+from pylibcudf.libcudf.scalar.scalar_factories cimport (
     make_fixed_width_scalar as cpp_make_fixed_width_scalar,
 )
-from cudf._lib.pylibcudf.libcudf.strings cimport substring as cpp_slice
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
-from cudf._lib.pylibcudf.scalar cimport Scalar
+from pylibcudf.libcudf.strings cimport substring as cpp_slice
+from pylibcudf.libcudf.types cimport size_type
+from pylibcudf.scalar cimport Scalar
 
 from cython.operator import dereference
 
@@ -25,9 +24,9 @@ cpdef Column slice_strings(
     """Perform a slice operation on a strings column.
 
     ``start`` and ``stop`` may be a
-    :py:class:`~cudf._lib.pylibcudf.column.Column` or a
-    :py:class:`~cudf._lib.pylibcudf.scalar.Scalar`. But ``step`` must be a
-    :py:class:`~cudf._lib.pylibcudf.scalar.Scalar`.
+    :py:class:`~pylibcudf.column.Column` or a
+    :py:class:`~pylibcudf.scalar.Scalar`. But ``step`` must be a
+    :py:class:`~pylibcudf.scalar.Scalar`.
 
     For details, see :cpp:func:`cudf::strings::slice_strings`.
 
diff --git a/python/cudf/cudf/_lib/pylibcudf/table.pxd b/python/pylibcudf/pylibcudf/table.pxd
similarity index 78%
rename from python/cudf/cudf/_lib/pylibcudf/table.pxd
rename to python/pylibcudf/pylibcudf/table.pxd
index e476fc770e3..cf5c0aa80f2 100644
--- a/python/cudf/cudf/_lib/pylibcudf/table.pxd
+++ b/python/pylibcudf/pylibcudf/table.pxd
@@ -1,9 +1,8 @@
 # Copyright (c) 2023-2024, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
-
-from cudf._lib.pylibcudf.libcudf.table.table cimport table
-from cudf._lib.pylibcudf.libcudf.table.table_view cimport table_view
+from pylibcudf.libcudf.table.table cimport table
+from pylibcudf.libcudf.table.table_view cimport table_view
 
 
 cdef class Table:
diff --git a/python/cudf/cudf/_lib/pylibcudf/table.pyx b/python/pylibcudf/pylibcudf/table.pyx
similarity index 93%
rename from python/cudf/cudf/_lib/pylibcudf/table.pyx
rename to python/pylibcudf/pylibcudf/table.pyx
index d91fa0474b0..5f77b89a605 100644
--- a/python/cudf/cudf/_lib/pylibcudf/table.pyx
+++ b/python/pylibcudf/pylibcudf/table.pyx
@@ -4,10 +4,9 @@ from cython.operator cimport dereference
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
-
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.column.column_view cimport column_view
-from cudf._lib.pylibcudf.libcudf.table.table cimport table
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.column.column_view cimport column_view
+from pylibcudf.libcudf.table.table cimport table
 
 from .column cimport Column
 
diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/pylibcudf/pylibcudf/tests/common/utils.py
similarity index 97%
rename from python/cudf/cudf/pylibcudf_tests/common/utils.py
rename to python/pylibcudf/pylibcudf/tests/common/utils.py
index acb2b5be85c..babe6634318 100644
--- a/python/cudf/cudf/pylibcudf_tests/common/utils.py
+++ b/python/pylibcudf/pylibcudf/tests/common/utils.py
@@ -6,11 +6,11 @@
 
 import numpy as np
 import pyarrow as pa
+import pyarrow.compute as pc
+import pylibcudf as plc
 import pytest
 from pyarrow.parquet import write_table as pq_write_table
-
-from cudf._lib import pylibcudf as plc
-from cudf._lib.pylibcudf.io.types import CompressionType
+from pylibcudf.io.types import CompressionType
 
 
 def metadata_from_arrow_type(
@@ -157,13 +157,13 @@ def _flatten_arrays(arr):
         for lh_arr, rh_arr in zip(lhs, rhs):
             # Check NaNs positions match
             # and then filter out nans
-            lhs_nans = pa.compute.is_nan(lh_arr)
-            rhs_nans = pa.compute.is_nan(rh_arr)
+            lhs_nans = pc.is_nan(lh_arr)
+            rhs_nans = pc.is_nan(rh_arr)
             assert lhs_nans.equals(rhs_nans)
 
-            if pa.compute.any(lhs_nans) or pa.compute.any(rhs_nans):
+            if pc.any(lhs_nans) or pc.any(rhs_nans):
                 # masks must be equal at this point
-                mask = pa.compute.fill_null(pa.compute.invert(lhs_nans), True)
+                mask = pc.fill_null(pc.invert(lhs_nans), True)
                 lh_arr = lh_arr.filter(mask)
                 rh_arr = rh_arr.filter(mask)
 
diff --git a/python/cudf/cudf/pylibcudf_tests/conftest.py b/python/pylibcudf/pylibcudf/tests/conftest.py
similarity index 98%
rename from python/cudf/cudf/pylibcudf_tests/conftest.py
rename to python/pylibcudf/pylibcudf/tests/conftest.py
index 945e1689229..fdce6f353ca 100644
--- a/python/cudf/cudf/pylibcudf_tests/conftest.py
+++ b/python/pylibcudf/pylibcudf/tests/conftest.py
@@ -8,10 +8,9 @@
 
 import numpy as np
 import pyarrow as pa
+import pylibcudf as plc
 import pytest
-
-import cudf._lib.pylibcudf as plc
-from cudf._lib.pylibcudf.io.types import CompressionType
+from pylibcudf.io.types import CompressionType
 
 sys.path.insert(0, os.path.join(os.path.dirname(__file__), "common"))
 
diff --git a/python/cudf/cudf/pylibcudf_tests/io/test_avro.py b/python/pylibcudf/pylibcudf/tests/io/test_avro.py
similarity index 98%
rename from python/cudf/cudf/pylibcudf_tests/io/test_avro.py
rename to python/pylibcudf/pylibcudf/tests/io/test_avro.py
index 061d6792ce3..0cd5064a697 100644
--- a/python/cudf/cudf/pylibcudf_tests/io/test_avro.py
+++ b/python/pylibcudf/pylibcudf/tests/io/test_avro.py
@@ -5,11 +5,10 @@
 
 import fastavro
 import pyarrow as pa
+import pylibcudf as plc
 import pytest
 from utils import assert_table_and_meta_eq
 
-import cudf._lib.pylibcudf as plc
-
 avro_dtype_pairs = [
     ("boolean", pa.bool_()),
     ("int", pa.int32()),
diff --git a/python/cudf/cudf/pylibcudf_tests/io/test_csv.py b/python/pylibcudf/pylibcudf/tests/io/test_csv.py
similarity index 98%
rename from python/cudf/cudf/pylibcudf_tests/io/test_csv.py
rename to python/pylibcudf/pylibcudf/tests/io/test_csv.py
index 95326a8b681..ccd7eef54f3 100644
--- a/python/cudf/cudf/pylibcudf_tests/io/test_csv.py
+++ b/python/pylibcudf/pylibcudf/tests/io/test_csv.py
@@ -5,7 +5,9 @@
 
 import pandas as pd
 import pyarrow as pa
+import pylibcudf as plc
 import pytest
+from pylibcudf.io.types import CompressionType
 from utils import (
     _convert_numeric_types_to_floating,
     assert_table_and_meta_eq,
@@ -13,9 +15,6 @@
     write_source_str,
 )
 
-import cudf._lib.pylibcudf as plc
-from cudf._lib.pylibcudf.io.types import CompressionType
-
 # Shared kwargs to pass to make_source
 _COMMON_CSV_SOURCE_KWARGS = {
     "format": "csv",
diff --git a/python/cudf/cudf/pylibcudf_tests/io/test_json.py b/python/pylibcudf/pylibcudf/tests/io/test_json.py
similarity index 99%
rename from python/cudf/cudf/pylibcudf_tests/io/test_json.py
rename to python/pylibcudf/pylibcudf/tests/io/test_json.py
index 4239f2438bb..9d976fedf00 100644
--- a/python/cudf/cudf/pylibcudf_tests/io/test_json.py
+++ b/python/pylibcudf/pylibcudf/tests/io/test_json.py
@@ -3,7 +3,9 @@
 
 import pandas as pd
 import pyarrow as pa
+import pylibcudf as plc
 import pytest
+from pylibcudf.io.types import CompressionType
 from utils import (
     assert_table_and_meta_eq,
     make_source,
@@ -11,9 +13,6 @@
     write_source_str,
 )
 
-import cudf._lib.pylibcudf as plc
-from cudf._lib.pylibcudf.io.types import CompressionType
-
 # Shared kwargs to pass to make_source
 _COMMON_JSON_SOURCE_KWARGS = {"format": "json", "orient": "records"}
 
diff --git a/python/cudf/cudf/pylibcudf_tests/io/test_parquet.py b/python/pylibcudf/pylibcudf/tests/io/test_parquet.py
similarity index 97%
rename from python/cudf/cudf/pylibcudf_tests/io/test_parquet.py
rename to python/pylibcudf/pylibcudf/tests/io/test_parquet.py
index dbd20cd473e..f6e843ccf66 100644
--- a/python/cudf/cudf/pylibcudf_tests/io/test_parquet.py
+++ b/python/pylibcudf/pylibcudf/tests/io/test_parquet.py
@@ -1,18 +1,17 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 import pyarrow as pa
 import pyarrow.compute as pc
+import pylibcudf as plc
 import pytest
 from pyarrow.parquet import read_table
-from utils import assert_table_and_meta_eq, make_source
-
-import cudf._lib.pylibcudf as plc
-from cudf._lib.pylibcudf.expressions import (
+from pylibcudf.expressions import (
     ASTOperator,
     ColumnNameReference,
     ColumnReference,
     Literal,
     Operation,
 )
+from utils import assert_table_and_meta_eq, make_source
 
 # Shared kwargs to pass to make_source
 _COMMON_PARQUET_SOURCE_KWARGS = {"format": "parquet"}
diff --git a/python/cudf/cudf/pylibcudf_tests/io/test_source_sink_info.py b/python/pylibcudf/pylibcudf/tests/io/test_source_sink_info.py
similarity index 98%
rename from python/cudf/cudf/pylibcudf_tests/io/test_source_sink_info.py
rename to python/pylibcudf/pylibcudf/tests/io/test_source_sink_info.py
index 438c482b77a..747f58ec8cf 100644
--- a/python/cudf/cudf/pylibcudf_tests/io/test_source_sink_info.py
+++ b/python/pylibcudf/pylibcudf/tests/io/test_source_sink_info.py
@@ -2,10 +2,9 @@
 
 import io
 
+import pylibcudf as plc
 import pytest
 
-import cudf._lib.pylibcudf as plc
-
 
 @pytest.fixture(params=[plc.io.SourceInfo, plc.io.SinkInfo])
 def io_class(request):
diff --git a/python/cudf/cudf/pylibcudf_tests/pytest.ini b/python/pylibcudf/pylibcudf/tests/pytest.ini
similarity index 100%
rename from python/cudf/cudf/pylibcudf_tests/pytest.ini
rename to python/pylibcudf/pylibcudf/tests/pytest.ini
diff --git a/python/cudf/cudf/pylibcudf_tests/test_binaryops.py b/python/pylibcudf/pylibcudf/tests/test_binaryops.py
similarity index 99%
rename from python/cudf/cudf/pylibcudf_tests/test_binaryops.py
rename to python/pylibcudf/pylibcudf/tests/test_binaryops.py
index a83caf39ead..f784cb3c191 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_binaryops.py
+++ b/python/pylibcudf/pylibcudf/tests/test_binaryops.py
@@ -4,11 +4,10 @@
 
 import numpy as np
 import pyarrow as pa
+import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
-from cudf._lib import pylibcudf as plc
-
 
 def idfn(param):
     ltype, rtype, outtype, plc_op, _ = param
diff --git a/python/cudf/cudf/pylibcudf_tests/test_column_factories.py b/python/pylibcudf/pylibcudf/tests/test_column_factories.py
similarity index 99%
rename from python/cudf/cudf/pylibcudf_tests/test_column_factories.py
rename to python/pylibcudf/pylibcudf/tests/test_column_factories.py
index 4c05770a41f..8cedbc6d42f 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_column_factories.py
+++ b/python/pylibcudf/pylibcudf/tests/test_column_factories.py
@@ -1,11 +1,10 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
+import pylibcudf as plc
 import pytest
 from utils import DEFAULT_STRUCT_TESTING_TYPE, assert_column_eq
 
-from cudf._lib import pylibcudf as plc
-
 EMPTY_COL_SIZE = 3
 
 NUMERIC_TYPES = [
diff --git a/python/cudf/cudf/pylibcudf_tests/test_column_from_device.py b/python/pylibcudf/pylibcudf/tests/test_column_from_device.py
similarity index 97%
rename from python/cudf/cudf/pylibcudf_tests/test_column_from_device.py
rename to python/pylibcudf/pylibcudf/tests/test_column_from_device.py
index 78ee2cb100e..0e129fdf0ef 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_column_from_device.py
+++ b/python/pylibcudf/pylibcudf/tests/test_column_from_device.py
@@ -1,13 +1,12 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
+import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
 import rmm
 
-from cudf._lib import pylibcudf as plc
-
 VALID_TYPES = [
     pa.int8(),
     pa.int16(),
diff --git a/python/cudf/cudf/pylibcudf_tests/test_copying.py b/python/pylibcudf/pylibcudf/tests/test_copying.py
similarity index 99%
rename from python/cudf/cudf/pylibcudf_tests/test_copying.py
rename to python/pylibcudf/pylibcudf/tests/test_copying.py
index f27fe4e942e..628682d0a66 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_copying.py
+++ b/python/pylibcudf/pylibcudf/tests/test_copying.py
@@ -2,6 +2,7 @@
 
 import pyarrow as pa
 import pyarrow.compute as pc
+import pylibcudf as plc
 import pytest
 from utils import (
     DEFAULT_STRUCT_TESTING_TYPE,
@@ -15,8 +16,6 @@
     metadata_from_arrow_type,
 )
 
-from cudf._lib import pylibcudf as plc
-
 
 # TODO: consider moving this to conftest and "pairing"
 # it with pa_type, so that they don't get out of sync
diff --git a/python/cudf/cudf/pylibcudf_tests/test_datetime.py b/python/pylibcudf/pylibcudf/tests/test_datetime.py
similarity index 83%
rename from python/cudf/cudf/pylibcudf_tests/test_datetime.py
rename to python/pylibcudf/pylibcudf/tests/test_datetime.py
index 75af0fa6ca1..d3aa6101e2d 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_datetime.py
+++ b/python/pylibcudf/pylibcudf/tests/test_datetime.py
@@ -3,11 +3,11 @@
 import datetime
 
 import pyarrow as pa
+import pyarrow.compute as pc
+import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
-import cudf._lib.pylibcudf as plc
-
 
 @pytest.fixture
 def column(has_nulls):
@@ -25,6 +25,6 @@ def column(has_nulls):
 def test_extract_year(column):
     got = plc.datetime.extract_year(column)
     # libcudf produces an int16, arrow produces an int64
-    expect = pa.compute.year(plc.interop.to_arrow(column)).cast(pa.int16())
+    expect = pc.year(plc.interop.to_arrow(column)).cast(pa.int16())
 
     assert_column_eq(expect, got)
diff --git a/python/cudf/cudf/pylibcudf_tests/test_expressions.py b/python/pylibcudf/pylibcudf/tests/test_expressions.py
similarity index 97%
rename from python/cudf/cudf/pylibcudf_tests/test_expressions.py
rename to python/pylibcudf/pylibcudf/tests/test_expressions.py
index f661512caad..5894ef4624c 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_expressions.py
+++ b/python/pylibcudf/pylibcudf/tests/test_expressions.py
@@ -1,9 +1,8 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 import pyarrow as pa
+import pylibcudf as plc
 import pytest
 
-import cudf._lib.pylibcudf as plc
-
 # We can't really evaluate these expressions, so just make sure
 # construction works properly
 
diff --git a/python/cudf/cudf/pylibcudf_tests/test_interop.py b/python/pylibcudf/pylibcudf/tests/test_interop.py
similarity index 98%
rename from python/cudf/cudf/pylibcudf_tests/test_interop.py
rename to python/pylibcudf/pylibcudf/tests/test_interop.py
index 5c05f460e28..01c998f16d4 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_interop.py
+++ b/python/pylibcudf/pylibcudf/tests/test_interop.py
@@ -1,10 +1,9 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
+import pylibcudf as plc
 import pytest
 
-import cudf._lib.pylibcudf as plc
-
 
 def test_list_dtype_roundtrip():
     list_type = pa.list_(pa.int32())
diff --git a/python/cudf/cudf/pylibcudf_tests/test_join.py b/python/pylibcudf/pylibcudf/tests/test_join.py
similarity index 94%
rename from python/cudf/cudf/pylibcudf_tests/test_join.py
rename to python/pylibcudf/pylibcudf/tests/test_join.py
index eb25ed915b1..61e02f4d28d 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_join.py
+++ b/python/pylibcudf/pylibcudf/tests/test_join.py
@@ -2,10 +2,9 @@
 
 import numpy as np
 import pyarrow as pa
+import pylibcudf as plc
 from utils import assert_table_eq
 
-from cudf._lib import pylibcudf as plc
-
 
 def test_cross_join():
     left = pa.Table.from_arrays([[0, 1, 2], [3, 4, 5]], names=["a", "b"])
diff --git a/python/cudf/cudf/pylibcudf_tests/test_lists.py b/python/pylibcudf/pylibcudf/tests/test_lists.py
similarity index 99%
rename from python/cudf/cudf/pylibcudf_tests/test_lists.py
rename to python/pylibcudf/pylibcudf/tests/test_lists.py
index 33f95a7d364..2353a6ff8f9 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_lists.py
+++ b/python/pylibcudf/pylibcudf/tests/test_lists.py
@@ -2,11 +2,11 @@
 
 import numpy as np
 import pyarrow as pa
+import pyarrow.compute as pc
+import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
-from cudf._lib import pylibcudf as plc
-
 
 @pytest.fixture
 def test_data():
@@ -184,7 +184,7 @@ def test_extract_list_element_scalar(list_column):
     plc_column = plc.interop.from_arrow(pa.array(list_column))
 
     res = plc.lists.extract_list_element(plc_column, 0)
-    expect = pa.compute.list_element(list_column, 0)
+    expect = pc.list_element(list_column, 0)
 
     assert_column_eq(expect, res)
 
diff --git a/python/cudf/cudf/pylibcudf_tests/test_quantiles.py b/python/pylibcudf/pylibcudf/tests/test_quantiles.py
similarity index 99%
rename from python/cudf/cudf/pylibcudf_tests/test_quantiles.py
rename to python/pylibcudf/pylibcudf/tests/test_quantiles.py
index 13f3b037606..bac56691306 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_quantiles.py
+++ b/python/pylibcudf/pylibcudf/tests/test_quantiles.py
@@ -3,11 +3,10 @@
 import numpy as np
 import pyarrow as pa
 import pyarrow.compute as pc
+import pylibcudf as plc
 import pytest
 from utils import assert_column_eq, assert_table_eq
 
-import cudf._lib.pylibcudf as plc
-
 # Map pylibcudf interpolation options to pyarrow options
 interp_mapping = {
     plc.types.Interpolation.LINEAR: "linear",
diff --git a/python/cudf/cudf/pylibcudf_tests/test_regex_program.py b/python/pylibcudf/pylibcudf/tests/test_regex_program.py
similarity index 89%
rename from python/cudf/cudf/pylibcudf_tests/test_regex_program.py
rename to python/pylibcudf/pylibcudf/tests/test_regex_program.py
index 3a9bcec3616..777315df538 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_regex_program.py
+++ b/python/pylibcudf/pylibcudf/tests/test_regex_program.py
@@ -1,9 +1,8 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
+import pylibcudf as plc
 import pytest
 
-import cudf._lib.pylibcudf as plc
-
 
 @pytest.mark.parametrize("pat", ["(", "*", "\\"])
 def test_regex_program_invalid(pat):
diff --git a/python/cudf/cudf/pylibcudf_tests/test_reshape.py b/python/pylibcudf/pylibcudf/tests/test_reshape.py
similarity index 96%
rename from python/cudf/cudf/pylibcudf_tests/test_reshape.py
rename to python/pylibcudf/pylibcudf/tests/test_reshape.py
index da1157e5832..01115bc363a 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_reshape.py
+++ b/python/pylibcudf/pylibcudf/tests/test_reshape.py
@@ -1,11 +1,10 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
+import pylibcudf as plc
 import pytest
 from utils import assert_column_eq, assert_table_eq
 
-from cudf._lib import pylibcudf as plc
-
 
 @pytest.fixture(scope="module")
 def reshape_data():
diff --git a/python/cudf/cudf/pylibcudf_tests/test_round.py b/python/pylibcudf/pylibcudf/tests/test_round.py
similarity index 86%
rename from python/cudf/cudf/pylibcudf_tests/test_round.py
rename to python/pylibcudf/pylibcudf/tests/test_round.py
index 991e6ed310d..0b30316b9a0 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_round.py
+++ b/python/pylibcudf/pylibcudf/tests/test_round.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
+import pyarrow.compute as pc
+import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
-import cudf._lib.pylibcudf as plc
-
 
 @pytest.fixture(params=["float32", "float64"])
 def column(request, has_nulls):
@@ -26,8 +26,6 @@ def test_round(column, round_mode, decimals):
         "half_to_even": plc.round.RoundingMethod.HALF_EVEN,
     }[round_mode]
     got = plc.round.round(column, decimals, method)
-    expect = pa.compute.round(
-        plc.interop.to_arrow(column), decimals, round_mode
-    )
+    expect = pc.round(plc.interop.to_arrow(column), decimals, round_mode)
 
     assert_column_eq(expect, got)
diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py b/python/pylibcudf/pylibcudf/tests/test_string_capitalize.py
similarity index 86%
rename from python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py
rename to python/pylibcudf/pylibcudf/tests/test_string_capitalize.py
index c4e437fe5d9..176ccc55b96 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_string_capitalize.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_capitalize.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
+import pyarrow.compute as pc
+import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
-import cudf._lib.pylibcudf as plc
-
 
 @pytest.fixture(scope="module")
 def str_data():
@@ -34,7 +34,7 @@ def str_data():
 def test_capitalize(str_data):
     pa_data, plc_data = str_data
     got = plc.strings.capitalize.capitalize(plc_data)
-    expected = pa.compute.utf8_capitalize(pa_data)
+    expected = pc.utf8_capitalize(pa_data)
     assert_column_eq(expected, got)
 
 
@@ -43,12 +43,12 @@ def test_title(str_data):
     got = plc.strings.capitalize.title(
         plc_data, plc.strings.char_types.StringCharacterTypes.CASE_TYPES
     )
-    expected = pa.compute.utf8_title(pa_data)
+    expected = pc.utf8_title(pa_data)
     assert_column_eq(expected, got)
 
 
 def test_is_title(str_data):
     pa_data, plc_data = str_data
     got = plc.strings.capitalize.is_title(plc_data)
-    expected = pa.compute.utf8_is_title(pa_data)
+    expected = pc.utf8_is_title(pa_data)
     assert_column_eq(expected, got)
diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_case.py b/python/pylibcudf/pylibcudf/tests/test_string_case.py
similarity index 80%
rename from python/cudf/cudf/pylibcudf_tests/test_string_case.py
rename to python/pylibcudf/pylibcudf/tests/test_string_case.py
index 1039859b2cf..233cc253b14 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_string_case.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_case.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
+import pyarrow.compute as pc
+import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
-import cudf._lib.pylibcudf as plc
-
 
 @pytest.fixture(scope="module")
 def string_col():
@@ -17,19 +17,19 @@ def string_col():
 def test_to_upper(string_col):
     plc_col = plc.interop.from_arrow(string_col)
     got = plc.strings.case.to_upper(plc_col)
-    expected = pa.compute.utf8_upper(string_col)
+    expected = pc.utf8_upper(string_col)
     assert_column_eq(expected, got)
 
 
 def test_to_lower(string_col):
     plc_col = plc.interop.from_arrow(string_col)
     got = plc.strings.case.to_lower(plc_col)
-    expected = pa.compute.utf8_lower(string_col)
+    expected = pc.utf8_lower(string_col)
     assert_column_eq(expected, got)
 
 
 def test_swapcase(string_col):
     plc_col = plc.interop.from_arrow(string_col)
     got = plc.strings.case.swapcase(plc_col)
-    expected = pa.compute.utf8_swapcase(string_col)
+    expected = pc.utf8_swapcase(string_col)
     assert_column_eq(expected, got)
diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_contains.py b/python/pylibcudf/pylibcudf/tests/test_string_contains.py
similarity index 92%
rename from python/cudf/cudf/pylibcudf_tests/test_string_contains.py
rename to python/pylibcudf/pylibcudf/tests/test_string_contains.py
index fc8c6656b5d..4f88e09183f 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_string_contains.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_contains.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
+import pyarrow.compute as pc
+import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
-import cudf._lib.pylibcudf as plc
-
 
 @pytest.fixture(scope="module")
 def target_col():
@@ -44,7 +44,7 @@ def plc_target_pat(pa_target_scalar):
 def test_contains_re(target_col, pa_target_scalar, plc_target_pat):
     pa_target_col, plc_target_col = target_col
     got = plc.strings.contains.contains_re(plc_target_col, plc_target_pat)
-    expected = pa.compute.match_substring_regex(
+    expected = pc.match_substring_regex(
         pa_target_col, pa_target_scalar.as_py()
     )
     assert_column_eq(got, expected)
diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_find.py b/python/pylibcudf/pylibcudf/tests/test_string_find.py
similarity index 97%
rename from python/cudf/cudf/pylibcudf_tests/test_string_find.py
rename to python/pylibcudf/pylibcudf/tests/test_string_find.py
index 95a1a3cf731..db3b13a5aae 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_string_find.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_find.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
+import pyarrow.compute as pc
+import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
-import cudf._lib.pylibcudf as plc
-
 
 @pytest.fixture(scope="module")
 def data_col():
@@ -223,7 +223,7 @@ def test_starts_with(data_col, target_scalar):
     pa_target_scalar, plc_target_scalar = target_scalar
     py_target = pa_target_scalar.as_py()
     got = plc.strings.find.starts_with(plc_data_col, plc_target_scalar)
-    expected = pa.compute.starts_with(pa_data_col, py_target)
+    expected = pc.starts_with(pa_data_col, py_target)
     assert_column_eq(expected, got)
 
 
@@ -242,7 +242,7 @@ def test_ends_with(data_col, target_scalar):
     pa_target_scalar, plc_target_scalar = target_scalar
     py_target = pa_target_scalar.as_py()
     got = plc.strings.find.ends_with(plc_data_col, plc_target_scalar)
-    expected = pa.compute.ends_with(pa_data_col, py_target)
+    expected = pc.ends_with(pa_data_col, py_target)
     assert_column_eq(expected, got)
 
 
diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_replace.py b/python/pylibcudf/pylibcudf/tests/test_string_replace.py
similarity index 95%
rename from python/cudf/cudf/pylibcudf_tests/test_string_replace.py
rename to python/pylibcudf/pylibcudf/tests/test_string_replace.py
index f20edf6a506..5a9c2007b73 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_string_replace.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_replace.py
@@ -1,11 +1,11 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
+import pyarrow.compute as pc
+import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
-import cudf._lib.pylibcudf as plc
-
 
 @pytest.fixture(scope="module")
 def data_col():
@@ -64,7 +64,7 @@ def test_replace(data_col, scalar_repl_target, scalar_repl, maxrepl):
         plc_data_col, plc_target, plc_repl, maxrepl
     )
 
-    expected = pa.compute.replace_substring(
+    expected = pc.replace_substring(
         pa_data_col,
         pattern=pa_target,
         replacement=pa_repl,
@@ -90,7 +90,7 @@ def test_replace_slice(data_col, scalar_repl, startstop):
         # count_characters on the input, take the max and set stop to that
         stop = 1000
 
-    expected = pa.compute.utf8_replace_slice(pa_data_col, start, stop, pa_repl)
+    expected = pc.utf8_replace_slice(pa_data_col, start, stop, pa_repl)
 
     assert_column_eq(expected, got)
 
diff --git a/python/cudf/cudf/pylibcudf_tests/test_string_slice.py b/python/pylibcudf/pylibcudf/tests/test_string_slice.py
similarity index 98%
rename from python/cudf/cudf/pylibcudf_tests/test_string_slice.py
rename to python/pylibcudf/pylibcudf/tests/test_string_slice.py
index bd63987b30f..d9ce5591b98 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_string_slice.py
+++ b/python/pylibcudf/pylibcudf/tests/test_string_slice.py
@@ -1,11 +1,10 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
+import pylibcudf as plc
 import pytest
 from utils import assert_column_eq
 
-import cudf._lib.pylibcudf as plc
-
 
 @pytest.fixture(scope="module")
 def pa_col():
diff --git a/python/cudf/cudf/pylibcudf_tests/test_table.py b/python/pylibcudf/pylibcudf/tests/test_table.py
similarity index 93%
rename from python/cudf/cudf/pylibcudf_tests/test_table.py
rename to python/pylibcudf/pylibcudf/tests/test_table.py
index cf1d51f6491..e822d6a97a8 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_table.py
+++ b/python/pylibcudf/pylibcudf/tests/test_table.py
@@ -1,10 +1,9 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 import pyarrow as pa
+import pylibcudf as plc
 import pytest
 
-import cudf._lib.pylibcudf as plc
-
 
 @pytest.mark.parametrize(
     "arrow_tbl",
diff --git a/python/cudf/cudf/pylibcudf_tests/test_traits.py b/python/pylibcudf/pylibcudf/tests/test_traits.py
similarity index 98%
rename from python/cudf/cudf/pylibcudf_tests/test_traits.py
rename to python/pylibcudf/pylibcudf/tests/test_traits.py
index 6c22cb02f21..2570e8abd51 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_traits.py
+++ b/python/pylibcudf/pylibcudf/tests/test_traits.py
@@ -1,6 +1,6 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from cudf._lib import pylibcudf as plc
+import pylibcudf as plc
 
 
 def test_is_relationally_comparable():
diff --git a/python/cudf/cudf/pylibcudf_tests/test_transform.py b/python/pylibcudf/pylibcudf/tests/test_transform.py
similarity index 95%
rename from python/cudf/cudf/pylibcudf_tests/test_transform.py
rename to python/pylibcudf/pylibcudf/tests/test_transform.py
index 312939888dd..06fc35d8835 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_transform.py
+++ b/python/pylibcudf/pylibcudf/tests/test_transform.py
@@ -3,10 +3,9 @@
 import math
 
 import pyarrow as pa
+import pylibcudf as plc
 from utils import assert_column_eq
 
-from cudf._lib import pylibcudf as plc
-
 
 def test_nans_to_nulls(has_nans):
     if has_nans:
diff --git a/python/cudf/cudf/pylibcudf_tests/test_unary.py b/python/pylibcudf/pylibcudf/tests/test_unary.py
similarity index 93%
rename from python/cudf/cudf/pylibcudf_tests/test_unary.py
rename to python/pylibcudf/pylibcudf/tests/test_unary.py
index b5e4f0cb0e8..9b8085d5c52 100644
--- a/python/cudf/cudf/pylibcudf_tests/test_unary.py
+++ b/python/pylibcudf/pylibcudf/tests/test_unary.py
@@ -1,6 +1,6 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
-from cudf._lib import pylibcudf as plc
+import pylibcudf as plc
 
 
 def test_is_supported_cast():
diff --git a/python/cudf/cudf/_lib/pylibcudf/traits.pxd b/python/pylibcudf/pylibcudf/traits.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/traits.pxd
rename to python/pylibcudf/pylibcudf/traits.pxd
diff --git a/python/cudf/cudf/_lib/pylibcudf/traits.pyx b/python/pylibcudf/pylibcudf/traits.pyx
similarity index 98%
rename from python/cudf/cudf/_lib/pylibcudf/traits.pyx
rename to python/pylibcudf/pylibcudf/traits.pyx
index d2370f8d641..5a1c67e1f6c 100644
--- a/python/cudf/cudf/_lib/pylibcudf/traits.pyx
+++ b/python/pylibcudf/pylibcudf/traits.pyx
@@ -1,8 +1,7 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 from libcpp cimport bool
-
-from cudf._lib.pylibcudf.libcudf.utilities cimport traits
+from pylibcudf.libcudf.utilities cimport traits
 
 from .types cimport DataType
 
diff --git a/python/cudf/cudf/_lib/pylibcudf/transform.pxd b/python/pylibcudf/pylibcudf/transform.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/pylibcudf/transform.pxd
rename to python/pylibcudf/pylibcudf/transform.pxd
diff --git a/python/cudf/cudf/_lib/pylibcudf/transform.pyx b/python/pylibcudf/pylibcudf/transform.pyx
similarity index 87%
rename from python/cudf/cudf/_lib/pylibcudf/transform.pyx
rename to python/pylibcudf/pylibcudf/transform.pyx
index a734e71b820..100ccb580ce 100644
--- a/python/cudf/cudf/_lib/pylibcudf/transform.pyx
+++ b/python/pylibcudf/pylibcudf/transform.pyx
@@ -2,12 +2,11 @@
 
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move, pair
+from pylibcudf.libcudf cimport transform as cpp_transform
+from pylibcudf.libcudf.types cimport size_type
 
 from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer
 
-from cudf._lib.pylibcudf.libcudf cimport transform as cpp_transform
-from cudf._lib.pylibcudf.libcudf.types cimport size_type
-
 from .column cimport Column
 from .gpumemoryview cimport gpumemoryview
 
diff --git a/python/cudf/cudf/_lib/pylibcudf/types.pxd b/python/pylibcudf/pylibcudf/types.pxd
similarity index 91%
rename from python/cudf/cudf/_lib/pylibcudf/types.pxd
rename to python/pylibcudf/pylibcudf/types.pxd
index 1f3e1aa2fbb..aa48979d961 100644
--- a/python/cudf/cudf/_lib/pylibcudf/types.pxd
+++ b/python/pylibcudf/pylibcudf/types.pxd
@@ -2,8 +2,7 @@
 
 from libc.stdint cimport int32_t
 from libcpp cimport bool as cbool
-
-from cudf._lib.pylibcudf.libcudf.types cimport (
+from pylibcudf.libcudf.types cimport (
     data_type,
     interpolation,
     mask_state,
diff --git a/python/cudf/cudf/_lib/pylibcudf/types.pyx b/python/pylibcudf/pylibcudf/types.pyx
similarity index 66%
rename from python/cudf/cudf/_lib/pylibcudf/types.pyx
rename to python/pylibcudf/pylibcudf/types.pyx
index 311f9ce4046..58c7d97e9bc 100644
--- a/python/cudf/cudf/_lib/pylibcudf/types.pyx
+++ b/python/pylibcudf/pylibcudf/types.pyx
@@ -1,25 +1,24 @@
 # Copyright (c) 2023-2024, NVIDIA CORPORATION.
 
 from libc.stdint cimport int32_t
-
-from cudf._lib.pylibcudf.libcudf.types cimport (
+from pylibcudf.libcudf.types cimport (
     data_type,
     size_of as cpp_size_of,
     size_type,
     type_id,
 )
-from cudf._lib.pylibcudf.libcudf.utilities.type_dispatcher cimport type_to_id
-
-from cudf._lib.pylibcudf.libcudf.types import type_id as TypeId  # no-cython-lint, isort:skip
-from cudf._lib.pylibcudf.libcudf.types import nan_policy as NanPolicy  # no-cython-lint, isort:skip
-from cudf._lib.pylibcudf.libcudf.types import null_policy as NullPolicy  # no-cython-lint, isort:skip
-from cudf._lib.pylibcudf.libcudf.types import interpolation as Interpolation  # no-cython-lint, isort:skip
-from cudf._lib.pylibcudf.libcudf.types import mask_state as MaskState  # no-cython-lint, isort:skip
-from cudf._lib.pylibcudf.libcudf.types import nan_equality as NanEquality  # no-cython-lint, isort:skip
-from cudf._lib.pylibcudf.libcudf.types import null_equality as NullEquality  # no-cython-lint, isort:skip
-from cudf._lib.pylibcudf.libcudf.types import null_order as NullOrder  # no-cython-lint, isort:skip
-from cudf._lib.pylibcudf.libcudf.types import order as Order  # no-cython-lint, isort:skip
-from cudf._lib.pylibcudf.libcudf.types import sorted as Sorted  # no-cython-lint, isort:skip
+from pylibcudf.libcudf.utilities.type_dispatcher cimport type_to_id
+
+from pylibcudf.libcudf.types import type_id as TypeId  # no-cython-lint, isort:skip
+from pylibcudf.libcudf.types import nan_policy as NanPolicy  # no-cython-lint, isort:skip
+from pylibcudf.libcudf.types import null_policy as NullPolicy  # no-cython-lint, isort:skip
+from pylibcudf.libcudf.types import interpolation as Interpolation  # no-cython-lint, isort:skip
+from pylibcudf.libcudf.types import mask_state as MaskState  # no-cython-lint, isort:skip
+from pylibcudf.libcudf.types import nan_equality as NanEquality  # no-cython-lint, isort:skip
+from pylibcudf.libcudf.types import null_equality as NullEquality  # no-cython-lint, isort:skip
+from pylibcudf.libcudf.types import null_order as NullOrder  # no-cython-lint, isort:skip
+from pylibcudf.libcudf.types import order as Order  # no-cython-lint, isort:skip
+from pylibcudf.libcudf.types import sorted as Sorted  # no-cython-lint, isort:skip
 
 
 cdef class DataType:
diff --git a/python/cudf/cudf/_lib/pylibcudf/unary.pxd b/python/pylibcudf/pylibcudf/unary.pxd
similarity index 87%
rename from python/cudf/cudf/_lib/pylibcudf/unary.pxd
rename to python/pylibcudf/pylibcudf/unary.pxd
index d07df838172..9ee08653599 100644
--- a/python/cudf/cudf/_lib/pylibcudf/unary.pxd
+++ b/python/pylibcudf/pylibcudf/unary.pxd
@@ -1,8 +1,7 @@
 # Copyright (c) 2024, NVIDIA CORPORATION.
 
 from libcpp cimport bool
-
-from cudf._lib.pylibcudf.libcudf.unary cimport unary_operator
+from pylibcudf.libcudf.unary cimport unary_operator
 
 from .column cimport Column
 from .types cimport DataType
diff --git a/python/cudf/cudf/_lib/pylibcudf/unary.pyx b/python/pylibcudf/pylibcudf/unary.pyx
similarity index 94%
rename from python/cudf/cudf/_lib/pylibcudf/unary.pyx
rename to python/pylibcudf/pylibcudf/unary.pyx
index 8da46f0a832..839360ef406 100644
--- a/python/cudf/cudf/_lib/pylibcudf/unary.pyx
+++ b/python/pylibcudf/pylibcudf/unary.pyx
@@ -3,12 +3,11 @@
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
 from libcpp.utility cimport move
+from pylibcudf.libcudf cimport unary as cpp_unary
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.unary cimport unary_operator
 
-from cudf._lib.pylibcudf.libcudf cimport unary as cpp_unary
-from cudf._lib.pylibcudf.libcudf.column.column cimport column
-from cudf._lib.pylibcudf.libcudf.unary cimport unary_operator
-
-from cudf._lib.pylibcudf.libcudf.unary import \
+from pylibcudf.libcudf.unary import \
     unary_operator as UnaryOperator  # no-cython-lint
 
 from .column cimport Column
diff --git a/python/cudf/cudf/_lib/pylibcudf/utils.pxd b/python/pylibcudf/pylibcudf/utils.pxd
similarity index 71%
rename from python/cudf/cudf/_lib/pylibcudf/utils.pxd
rename to python/pylibcudf/pylibcudf/utils.pxd
index 77c05086397..6b994f20b61 100644
--- a/python/cudf/cudf/_lib/pylibcudf/utils.pxd
+++ b/python/pylibcudf/pylibcudf/utils.pxd
@@ -2,9 +2,8 @@
 
 from libcpp.functional cimport reference_wrapper
 from libcpp.vector cimport vector
-
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
-from cudf._lib.pylibcudf.libcudf.types cimport bitmask_type
+from pylibcudf.libcudf.scalar.scalar cimport scalar
+from pylibcudf.libcudf.types cimport bitmask_type
 
 
 cdef void * int_to_void_ptr(Py_ssize_t ptr) nogil
diff --git a/python/cudf/cudf/_lib/pylibcudf/utils.pyx b/python/pylibcudf/pylibcudf/utils.pyx
similarity index 93%
rename from python/cudf/cudf/_lib/pylibcudf/utils.pyx
rename to python/pylibcudf/pylibcudf/utils.pyx
index 42e3575ed44..ee4421ddeaf 100644
--- a/python/cudf/cudf/_lib/pylibcudf/utils.pyx
+++ b/python/pylibcudf/pylibcudf/utils.pyx
@@ -5,11 +5,10 @@ from cython.operator import dereference
 from libc.stdint cimport uintptr_t
 from libcpp.functional cimport reference_wrapper
 from libcpp.vector cimport vector
-
 from cuda import cudart
 
-from cudf._lib.pylibcudf.libcudf.scalar.scalar cimport scalar
-from cudf._lib.pylibcudf.libcudf.types cimport bitmask_type
+from pylibcudf.libcudf.scalar.scalar cimport scalar
+from pylibcudf.libcudf.types cimport bitmask_type
 
 from .scalar cimport Scalar
 
diff --git a/python/cudf/cudf/_lib/variant.pxd b/python/pylibcudf/pylibcudf/variant.pxd
similarity index 100%
rename from python/cudf/cudf/_lib/variant.pxd
rename to python/pylibcudf/pylibcudf/variant.pxd
diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml
new file mode 100644
index 00000000000..b037508d03f
--- /dev/null
+++ b/python/pylibcudf/pyproject.toml
@@ -0,0 +1,123 @@
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
+
+[build-system]
+build-backend = "rapids_build_backend.build"
+requires = [
+    "rapids-build-backend>=0.3.0,<0.4.0.dev0",
+    "scikit-build-core[pyproject]>=0.10.0",
+] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
+
+[project]
+name = "pylibcudf"
+dynamic = ["version"]
+description = "pylibcudf - Python bindings for libcudf"
+readme = { file = "README.md", content-type = "text/markdown" }
+authors = [
+    { name = "NVIDIA Corporation" },
+]
+license = { text = "Apache 2.0" }
+requires-python = ">=3.9"
+dependencies = [
+    "cuda-python>=11.7.1,<12.0a0",
+    "nvtx>=0.2.1",
+    "packaging",
+    "pyarrow>=16.1.0,<16.2.0a0",
+    "rmm==24.10.*,>=0.0.0a0",
+    "typing_extensions>=4.0.0",
+] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
+classifiers = [
+    "Intended Audience :: Developers",
+    "Topic :: Database",
+    "Topic :: Scientific/Engineering",
+    "License :: OSI Approved :: Apache Software License",
+    "Programming Language :: Python",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+]
+
+[project.optional-dependencies]
+test = [
+    "fastavro>=0.22.9",
+    "hypothesis",
+    "numpy",
+    "pandas",
+    "pytest-cov",
+    "pytest-xdist",
+    "pytest<8",
+] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
+
+[project.urls]
+Homepage = "https://github.com/rapidsai/cudf"
+Documentation = "https://docs.rapids.ai/api/cudf/stable/"
+
+[tool.isort]
+line_length = 79
+multi_line_output = 3
+include_trailing_comma = true
+force_grid_wrap = 0
+combine_as_imports = true
+order_by_type = true
+known_dask = [
+    "dask",
+    "distributed",
+    "dask_cuda",
+]
+known_rapids = [
+    "rmm",
+]
+known_first_party = [
+    "cudf",
+]
+default_section = "THIRDPARTY"
+sections = [
+    "FUTURE",
+    "STDLIB",
+    "THIRDPARTY",
+    "DASK",
+    "RAPIDS",
+    "FIRSTPARTY",
+    "LOCALFOLDER",
+]
+skip = [
+    "thirdparty",
+    ".eggs",
+    ".git",
+    ".hg",
+    ".mypy_cache",
+    ".tox",
+    ".venv",
+    "_build",
+    "buck-out",
+    "build",
+    "dist",
+    "__init__.py",
+]
+
+[tool.rapids-build-backend]
+build-backend = "scikit_build_core.build"
+dependencies-file = "../../dependencies.yaml"
+matrix-entry = "cuda_suffixed=true"
+requires = [
+    "cmake>=3.26.4,!=3.30.0",
+    "cython>=3.0.3",
+    "ninja",
+    "numpy==1.23.*",
+    "pyarrow==16.1.0.*",
+    "rmm==24.10.*,>=0.0.0a0",
+] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
+
+[tool.scikit-build]
+build-dir = "build/{wheel_tag}"
+cmake.build-type = "Release"
+cmake.version = "CMakeLists.txt"
+minimum-version = "build-system.requires"
+ninja.make-fallback = true
+sdist.exclude = ["*tests*"]
+sdist.reproducible = true
+wheel.packages = ["pylibcudf"]
+
+[tool.scikit-build.metadata.version]
+provider = "scikit_build_core.metadata.regex"
+input = "pylibcudf/VERSION"
+regex = "(?P<value>.*)"

From 10cdd5fc5dcfc73404ae825f5d4bcf357c69ff24 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Fri, 16 Aug 2024 12:49:28 -0700
Subject: [PATCH 681/842] Reenable arrow tests (#16556)

This PR reenables the tests that were disabled in #16379, converting them to use the new C data interface functions instead of the old libarrow-based ones.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/16556
---
 cpp/tests/CMakeLists.txt              |   4 -
 cpp/tests/interop/arrow_utils.hpp     |   5 +-
 cpp/tests/interop/from_arrow_test.cpp | 145 +++++++++++++------
 cpp/tests/interop/to_arrow_test.cpp   | 192 ++++++++++++++++----------
 cpp/tests/streams/interop_test.cpp    |  78 -----------
 5 files changed, 224 insertions(+), 200 deletions(-)
 delete mode 100644 cpp/tests/streams/interop_test.cpp

diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 8c4b0f1e367..006b36add0e 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -690,10 +690,6 @@ ConfigureTest(STREAM_DICTIONARY_TEST streams/dictionary_test.cpp STREAM_MODE tes
 ConfigureTest(STREAM_FILLING_TEST streams/filling_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_GROUPBY_TEST streams/groupby_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_HASHING_TEST streams/hash_test.cpp STREAM_MODE testing)
-# Deprecation from 16297 and fixes in 16379 caused this test to be empty This will be reenabled once
-# the deprecated APIs have been replaced in 24.10.
-#
-# ConfigureTest(STREAM_INTEROP_TEST streams/interop_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_JSONIO_TEST streams/io/json_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_LABELING_BINS_TEST streams/labeling_bins_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_LISTS_TEST streams/lists_test.cpp STREAM_MODE testing)
diff --git a/cpp/tests/interop/arrow_utils.hpp b/cpp/tests/interop/arrow_utils.hpp
index 1fdf02e02f1..08eada632a5 100644
--- a/cpp/tests/interop/arrow_utils.hpp
+++ b/cpp/tests/interop/arrow_utils.hpp
@@ -32,6 +32,8 @@
 
 #include <arrow/util/bitmap_builders.h>
 
+#include <algorithm>
+
 #pragma once
 
 template <typename T>
@@ -154,8 +156,9 @@ std::shared_ptr<arrow::Array> get_arrow_list_array(std::vector<T> data,
                "Failed to append values to buffer builder");
   CUDF_EXPECTS(buff_builder.Finish(&offset_buffer).ok(), "Failed to allocate buffer");
 
+  auto nullable = std::accumulate(list_validity.begin(), list_validity.end(), 0) > 0;
   return std::make_shared<arrow::ListArray>(
-    arrow::list(data_array->type()),
+    arrow::list(arrow::field("", data_array->type(), nullable)),
     offsets.size() - 1,
     offset_buffer,
     data_array,
diff --git a/cpp/tests/interop/from_arrow_test.cpp b/cpp/tests/interop/from_arrow_test.cpp
index 733e5814425..81c406c0faf 100644
--- a/cpp/tests/interop/from_arrow_test.cpp
+++ b/cpp/tests/interop/from_arrow_test.cpp
@@ -14,13 +14,6 @@
  * limitations under the License.
  */
 
-// These interop functions are deprecated. We keep the code in this
-// test and will migrate the tests to export the arrow C data
-// interface which we consume with from_arrow_host. For now, the tests
-// are commented out.
-
-#if 0
-
 #include <tests/interop/arrow_utils.hpp>
 
 #include <cudf_test/base_fixture.hpp>
@@ -43,6 +36,10 @@
 
 #include <thrust/iterator/counting_iterator.h>
 
+#include <arrow/c/bridge.h>
+#include <nanoarrow/nanoarrow.h>
+#include <nanoarrow/nanoarrow_device.h>
+
 std::unique_ptr<cudf::table> get_cudf_table()
 {
   std::vector<std::unique_ptr<cudf::column>> columns;
@@ -93,6 +90,45 @@ struct FromArrowTest : public cudf::test::BaseFixture {};
 template <typename T>
 struct FromArrowTestDurationsTest : public cudf::test::BaseFixture {};
 
+std::optional<std::unique_ptr<cudf::table>> export_table(std::shared_ptr<arrow::Table> arrow_table)
+{
+  ArrowSchema schema;
+  if (!arrow::ExportSchema(*arrow_table->schema(), &schema).ok()) { return std::nullopt; }
+  auto batch = arrow_table->CombineChunksToBatch().ValueOrDie();
+  ArrowArray arr;
+  if (!arrow::ExportRecordBatch(*batch, &arr).ok()) { return std::nullopt; }
+  auto ret = cudf::from_arrow(&schema, &arr);
+  arr.release(&arr);
+  schema.release(&schema);
+  return {std::move(ret)};
+}
+
+std::optional<std::unique_ptr<cudf::scalar>> export_scalar(arrow::Scalar const& arrow_scalar)
+{
+  auto maybe_array = arrow::MakeArrayFromScalar(arrow_scalar, 1);
+  if (!maybe_array.ok()) { return std::nullopt; }
+  auto array = *maybe_array;
+
+  ArrowSchema schema;
+  if (!arrow::ExportType(*array->type(), &schema).ok()) { return std::nullopt; }
+
+  ArrowArray arr;
+  if (!arrow::ExportArray(*array, &arr).ok()) { return std::nullopt; }
+
+  auto col = cudf::from_arrow_column(&schema, &arr);
+  auto ret = cudf::get_element(col->view(), 0);
+
+  arr.release(&arr);
+  schema.release(&schema);
+  return {std::move(ret)};
+}
+
+std::optional<std::unique_ptr<cudf::scalar>> export_scalar(
+  std::shared_ptr<arrow::Scalar> const arrow_scalar)
+{
+  return export_scalar(*arrow_scalar);
+}
+
 TYPED_TEST_SUITE(FromArrowTestDurationsTest, cudf::test::DurationTypes);
 
 TEST_F(FromArrowTest, EmptyTable)
@@ -102,9 +138,10 @@ TEST_F(FromArrowTest, EmptyTable)
   auto expected_cudf_table = tables.first->view();
   auto arrow_table         = tables.second;
 
-  auto got_cudf_table = cudf::from_arrow(*arrow_table);
+  auto got_cudf_table = export_table(arrow_table);
+  ASSERT_TRUE(got_cudf_table.has_value());
 
-  CUDF_TEST_EXPECT_TABLES_EQUAL(expected_cudf_table, got_cudf_table->view());
+  CUDF_TEST_EXPECT_TABLES_EQUAL(expected_cudf_table, got_cudf_table.value()->view());
 }
 
 TEST_F(FromArrowTest, DateTimeTable)
@@ -127,9 +164,10 @@ TEST_F(FromArrowTest, DateTimeTable)
 
   auto arrow_table = arrow::Table::Make(schema, {arr});
 
-  auto got_cudf_table = cudf::from_arrow(*arrow_table);
+  auto got_cudf_table = export_table(arrow_table);
+  ASSERT_TRUE(got_cudf_table.has_value());
 
-  CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table_view, got_cudf_table->view());
+  CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table_view, got_cudf_table.value()->view());
 }
 
 TYPED_TEST(FromArrowTestDurationsTest, DurationTable)
@@ -160,9 +198,10 @@ TYPED_TEST(FromArrowTestDurationsTest, DurationTable)
 
   auto arrow_table = arrow::Table::Make(schema, {arr});
 
-  auto got_cudf_table = cudf::from_arrow(*arrow_table);
+  auto got_cudf_table = export_table(arrow_table);
+  ASSERT_TRUE(got_cudf_table.has_value());
 
-  CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table_view, got_cudf_table->view());
+  CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table_view, got_cudf_table.value()->view());
 }
 
 TEST_F(FromArrowTest, NestedList)
@@ -188,8 +227,9 @@ TEST_F(FromArrowTest, NestedList)
 
   auto arrow_table = arrow::Table::Make(schema, {nested_list_arr});
 
-  auto got_cudf_table = cudf::from_arrow(*arrow_table);
-  CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table_view, got_cudf_table->view());
+  auto got_cudf_table = export_table(arrow_table);
+  ASSERT_TRUE(got_cudf_table.has_value());
+  CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table_view, got_cudf_table.value()->view());
 }
 
 TEST_F(FromArrowTest, StructColumn)
@@ -274,9 +314,10 @@ TEST_F(FromArrowTest, StructColumn)
   auto schema = std::make_shared<arrow::Schema>(schema_vector);
   auto input  = arrow::Table::Make(schema, {struct_array});
 
-  auto got_cudf_table = cudf::from_arrow(*input);
+  auto got_cudf_table = export_table(input);
+  ASSERT_TRUE(got_cudf_table.has_value());
 
-  CUDF_TEST_EXPECT_TABLES_EQUAL(expected_cudf_table, got_cudf_table->view());
+  CUDF_TEST_EXPECT_TABLES_EQUAL(expected_cudf_table, got_cudf_table.value()->view());
 }
 
 TEST_F(FromArrowTest, DictionaryIndicesType)
@@ -304,9 +345,10 @@ TEST_F(FromArrowTest, DictionaryIndicesType)
 
   cudf::table expected_table(std::move(columns));
 
-  auto got_cudf_table = cudf::from_arrow(*arrow_table);
+  auto got_cudf_table = export_table(arrow_table);
+  ASSERT_TRUE(got_cudf_table.has_value());
 
-  CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table.view(), got_cudf_table->view());
+  CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table.view(), got_cudf_table.value()->view());
 }
 
 TEST_F(FromArrowTest, ChunkedArray)
@@ -369,9 +411,10 @@ TEST_F(FromArrowTest, ChunkedArray)
 
   auto expected_cudf_table = get_cudf_table();
 
-  auto got_cudf_table = cudf::from_arrow(*arrow_table);
+  auto got_cudf_table = export_table(arrow_table);
+  ASSERT_TRUE(got_cudf_table.has_value());
 
-  CUDF_TEST_EXPECT_TABLES_EQUAL(expected_cudf_table->view(), got_cudf_table->view());
+  CUDF_TEST_EXPECT_TABLES_EQUAL(expected_cudf_table->view(), got_cudf_table.value()->view());
 }
 
 struct FromArrowTestSlice
@@ -388,13 +431,14 @@ TEST_P(FromArrowTestSlice, SliceTest)
   auto sliced_cudf_table   = cudf::slice(cudf_table_view, {start, end})[0];
   auto expected_cudf_table = cudf::table{sliced_cudf_table};
   auto sliced_arrow_table  = arrow_table->Slice(start, end - start);
-  auto got_cudf_table      = cudf::from_arrow(*sliced_arrow_table);
+  auto got_cudf_table      = export_table(sliced_arrow_table);
+  ASSERT_TRUE(got_cudf_table.has_value());
 
   // This has been added to take-care of empty string column issue with no children
-  if (got_cudf_table->num_rows() == 0 and expected_cudf_table.num_rows() == 0) {
-    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected_cudf_table.view(), got_cudf_table->view());
+  if (got_cudf_table.value()->num_rows() == 0 and expected_cudf_table.num_rows() == 0) {
+    CUDF_TEST_EXPECT_TABLES_EQUIVALENT(expected_cudf_table.view(), got_cudf_table.value()->view());
   } else {
-    CUDF_TEST_EXPECT_TABLES_EQUAL(expected_cudf_table.view(), got_cudf_table->view());
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected_cudf_table.view(), got_cudf_table.value()->view());
   }
 }
 
@@ -417,9 +461,10 @@ TEST_F(FromArrowTest, FixedPoint128Table)
     auto const schema        = std::make_shared<arrow::Schema>(schema_vector);
     auto const arrow_table   = arrow::Table::Make(schema, {arr});
 
-    auto got_cudf_table = cudf::from_arrow(*arrow_table);
+    auto got_cudf_table = export_table(arrow_table);
+    ASSERT_TRUE(got_cudf_table.has_value());
 
-    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, got_cudf_table->view());
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, got_cudf_table.value()->view());
   }
 }
 
@@ -441,9 +486,10 @@ TEST_F(FromArrowTest, FixedPoint128TableLarge)
     auto const schema        = std::make_shared<arrow::Schema>(schema_vector);
     auto const arrow_table   = arrow::Table::Make(schema, {arr});
 
-    auto got_cudf_table = cudf::from_arrow(*arrow_table);
+    auto got_cudf_table = export_table(arrow_table);
+    ASSERT_TRUE(got_cudf_table.has_value());
 
-    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, got_cudf_table->view());
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, got_cudf_table.value()->view());
   }
 }
 
@@ -466,9 +512,10 @@ TEST_F(FromArrowTest, FixedPoint128TableNulls)
     auto const schema        = std::make_shared<arrow::Schema>(schema_vector);
     auto const arrow_table   = arrow::Table::Make(schema, {arr});
 
-    auto got_cudf_table = cudf::from_arrow(*arrow_table);
+    auto got_cudf_table = export_table(arrow_table);
+    ASSERT_TRUE(got_cudf_table.has_value());
 
-    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, got_cudf_table->view());
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, got_cudf_table.value()->view());
   }
 }
 
@@ -493,9 +540,10 @@ TEST_F(FromArrowTest, FixedPoint128TableNullsLarge)
     auto const schema        = std::make_shared<arrow::Schema>(schema_vector);
     auto const arrow_table   = arrow::Table::Make(schema, {arr});
 
-    auto got_cudf_table = cudf::from_arrow(*arrow_table);
+    auto got_cudf_table = export_table(arrow_table);
+    ASSERT_TRUE(got_cudf_table.has_value());
 
-    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, got_cudf_table->view());
+    CUDF_TEST_EXPECT_TABLES_EQUAL(expected, got_cudf_table.value()->view());
   }
 }
 
@@ -519,9 +567,12 @@ TYPED_TEST(FromArrowNumericScalarTest, Basic)
 {
   TypeParam const value{42};
   auto const arrow_scalar = arrow::MakeScalar(value);
-  auto const cudf_scalar  = cudf::from_arrow(*arrow_scalar);
+
+  auto const cudf_scalar = export_scalar(arrow_scalar);
+  ASSERT_TRUE(cudf_scalar.has_value());
+
   auto const cudf_numeric_scalar =
-    dynamic_cast<cudf::numeric_scalar<TypeParam>*>(cudf_scalar.get());
+    dynamic_cast<cudf::numeric_scalar<TypeParam>*>(cudf_scalar.value().get());
   if (cudf_numeric_scalar == nullptr) { CUDF_FAIL("Attempted to test with a non-numeric type."); }
   EXPECT_EQ(cudf_numeric_scalar->type(), cudf::data_type(cudf::type_to_id<TypeParam>()));
   EXPECT_EQ(cudf_numeric_scalar->value(), value);
@@ -535,12 +586,13 @@ TEST_F(FromArrowDecimalScalarTest, Basic)
   auto const value{42};
   auto const precision{8};
   auto const scale{4};
-  auto arrow_scalar = arrow::Decimal128Scalar(value, arrow::decimal128(precision, -scale));
-  auto cudf_scalar  = cudf::from_arrow(arrow_scalar);
+  auto arrow_scalar      = arrow::Decimal128Scalar(value, arrow::decimal128(precision, -scale));
+  auto const cudf_scalar = export_scalar(arrow_scalar);
+  ASSERT_TRUE(cudf_scalar.has_value());
 
   // Arrow offers a minimum of 128 bits for the Decimal type.
   auto const cudf_decimal_scalar =
-    dynamic_cast<cudf::fixed_point_scalar<numeric::decimal128>*>(cudf_scalar.get());
+    dynamic_cast<cudf::fixed_point_scalar<numeric::decimal128>*>(cudf_scalar.value().get());
   EXPECT_EQ(cudf_decimal_scalar->type(),
             cudf::data_type(cudf::type_to_id<numeric::decimal128>(), scale));
   EXPECT_EQ(cudf_decimal_scalar->value(), value);
@@ -552,9 +604,10 @@ TEST_F(FromArrowStringScalarTest, Basic)
 {
   auto const value        = std::string("hello world");
   auto const arrow_scalar = arrow::StringScalar(value);
-  auto const cudf_scalar  = cudf::from_arrow(arrow_scalar);
+  auto const cudf_scalar  = export_scalar(arrow_scalar);
+  ASSERT_TRUE(cudf_scalar.has_value());
 
-  auto const cudf_string_scalar = dynamic_cast<cudf::string_scalar*>(cudf_scalar.get());
+  auto const cudf_string_scalar = dynamic_cast<cudf::string_scalar*>(cudf_scalar.value().get());
   EXPECT_EQ(cudf_string_scalar->type(), cudf::data_type(cudf::type_id::STRING));
   EXPECT_EQ(cudf_string_scalar->to_string(), value);
 }
@@ -572,9 +625,10 @@ TEST_F(FromArrowListScalarTest, Basic)
   auto const array       = *maybe_array;
 
   auto const arrow_scalar = arrow::ListScalar(array);
-  auto const cudf_scalar  = cudf::from_arrow(arrow_scalar);
+  auto const cudf_scalar  = export_scalar(arrow_scalar);
+  ASSERT_TRUE(cudf_scalar.has_value());
 
-  auto const cudf_list_scalar = dynamic_cast<cudf::list_scalar*>(cudf_scalar.get());
+  auto const cudf_list_scalar = dynamic_cast<cudf::list_scalar*>(cudf_scalar.value().get());
   EXPECT_EQ(cudf_list_scalar->type(), cudf::data_type(cudf::type_id::LIST));
 
   cudf::test::fixed_width_column_wrapper<int64_t> const lhs(
@@ -592,9 +646,10 @@ TEST_F(FromArrowStructScalarTest, Basic)
   auto const field        = arrow::field("", underlying_arrow_scalar->type);
   auto const arrow_type   = arrow::struct_({field});
   auto const arrow_scalar = arrow::StructScalar({underlying_arrow_scalar}, arrow_type);
-  auto const cudf_scalar  = cudf::from_arrow(arrow_scalar);
+  auto const cudf_scalar  = export_scalar(arrow_scalar);
+  ASSERT_TRUE(cudf_scalar.has_value());
 
-  auto const cudf_struct_scalar = dynamic_cast<cudf::struct_scalar*>(cudf_scalar.get());
+  auto const cudf_struct_scalar = dynamic_cast<cudf::struct_scalar*>(cudf_scalar.value().get());
   EXPECT_EQ(cudf_struct_scalar->type(), cudf::data_type(cudf::type_id::STRUCT));
 
   cudf::test::fixed_width_column_wrapper<int64_t> const col({value});
@@ -602,5 +657,3 @@ TEST_F(FromArrowStructScalarTest, Basic)
 
   CUDF_TEST_EXPECT_TABLES_EQUAL(lhs, cudf_struct_scalar->view());
 }
-
-#endif
diff --git a/cpp/tests/interop/to_arrow_test.cpp b/cpp/tests/interop/to_arrow_test.cpp
index 328ba210a3f..90ae12cdd90 100644
--- a/cpp/tests/interop/to_arrow_test.cpp
+++ b/cpp/tests/interop/to_arrow_test.cpp
@@ -14,13 +14,6 @@
  * limitations under the License.
  */
 
-// These interop functions are deprecated. We keep the code in this
-// test and will migrate the tests to export via the arrow C data
-// interface with to_arrow_host which arrow can consume. For now, the
-// test is commented out.
-
-#if 0
-
 #include <tests/interop/arrow_utils.hpp>
 
 #include <cudf_test/base_fixture.hpp>
@@ -38,6 +31,7 @@
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/dictionary/encode.hpp>
 #include <cudf/interop.hpp>
+#include <cudf/lists/lists_column_view.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
@@ -45,6 +39,8 @@
 
 #include <thrust/iterator/counting_iterator.h>
 
+#include <arrow/c/bridge.h>
+
 using vector_of_columns = std::vector<std::unique_ptr<cudf::column>>;
 
 std::pair<std::unique_ptr<cudf::table>, std::shared_ptr<arrow::Table>> get_tables(
@@ -130,7 +126,7 @@ std::pair<std::unique_ptr<cudf::table>, std::shared_ptr<arrow::Table>> get_table
   auto keys       = cudf::test::to_host<int64_t>(view.keys()).first;
   auto indices    = cudf::test::to_host<uint32_t>(view.indices()).first;
   auto dict_array = get_arrow_dict_array(std::vector<int64_t>(keys.begin(), keys.end()),
-                                         std::vector<int32_t>(indices.begin(), indices.end()),
+                                         std::vector<uint32_t>(indices.begin(), indices.end()),
                                          validity);
   auto boolarray  = get_arrow_array<bool>(bool_data, bool_validity);
   auto list_array = get_arrow_list_array<int64_t>(
@@ -168,6 +164,21 @@ struct ToArrowTest : public cudf::test::BaseFixture {};
 template <typename T>
 struct ToArrowTestDurationsTest : public cudf::test::BaseFixture {};
 
+auto is_equal(cudf::table_view const& table,
+              cudf::host_span<cudf::column_metadata const> metadata,
+              std::shared_ptr<arrow::Table> expected_arrow_table)
+{
+  auto got_arrow_schema = cudf::to_arrow_schema(table, metadata);
+  auto got_arrow_table  = cudf::to_arrow_host(table);
+
+  for (auto i = 0; i < got_arrow_schema->n_children; ++i) {
+    auto arr = arrow::ImportArray(got_arrow_table->array.children[i], got_arrow_schema->children[i])
+                 .ValueOrDie();
+    if (!expected_arrow_table->column(i)->Equals(arrow::ChunkedArray(arr))) { return false; }
+  }
+  return true;
+}
+
 TYPED_TEST_SUITE(ToArrowTestDurationsTest, cudf::test::DurationTypes);
 
 TEST_F(ToArrowTest, EmptyTable)
@@ -179,10 +190,9 @@ TEST_F(ToArrowTest, EmptyTable)
   auto struct_meta          = cudf::column_metadata{"f"};
   struct_meta.children_meta = {{"integral"}, {"string"}};
 
-  auto got_arrow_table =
-    cudf::to_arrow(cudf_table_view, {{"a"}, {"b"}, {"c"}, {"d"}, {"e"}, struct_meta});
-
-  ASSERT_EQ(expected_arrow_table->Equals(*got_arrow_table, true), true);
+  std::vector<cudf::column_metadata> const metadata = {
+    {"a"}, {"b"}, {"c"}, {"d"}, {"e"}, struct_meta};
+  ASSERT_TRUE(is_equal(cudf_table_view, metadata, expected_arrow_table));
 }
 
 TEST_F(ToArrowTest, DateTimeTable)
@@ -203,12 +213,10 @@ TEST_F(ToArrowTest, DateTimeTable)
   std::vector<std::shared_ptr<arrow::Field>> schema_vector({arrow::field("a", arr->type())});
   auto schema = std::make_shared<arrow::Schema>(schema_vector);
 
-
   auto expected_arrow_table = arrow::Table::Make(schema, {arr});
 
-  auto got_arrow_table = cudf::to_arrow(input_view, {{"a"}});
-
-  ASSERT_EQ(expected_arrow_table->Equals(*got_arrow_table, true), true);
+  std::vector<cudf::column_metadata> const metadata = {{"a"}};
+  ASSERT_TRUE(is_equal(input_view, metadata, expected_arrow_table));
 }
 
 TYPED_TEST(ToArrowTestDurationsTest, DurationTable)
@@ -239,9 +247,8 @@ TYPED_TEST(ToArrowTestDurationsTest, DurationTable)
 
   auto expected_arrow_table = arrow::Table::Make(schema, {arr});
 
-  auto got_arrow_table = cudf::to_arrow(input_view, {{"a"}});
-
-  ASSERT_EQ(expected_arrow_table->Equals(*got_arrow_table, true), true);
+  std::vector<cudf::column_metadata> const metadata = {{"a"}};
+  ASSERT_TRUE(is_equal(input_view, metadata, expected_arrow_table));
 }
 
 TEST_F(ToArrowTest, NestedList)
@@ -255,20 +262,20 @@ TEST_F(ToArrowTest, NestedList)
   auto list_arr = get_arrow_list_array<int64_t>({6, 7, 8, 9}, {0, 1, 4}, {1, 0, 1, 1});
   std::vector<int32_t> offset{0, 0, 2};
   auto mask_buffer     = arrow::internal::BytesToBits({0, 1}).ValueOrDie();
-  auto nested_list_arr = std::make_shared<arrow::ListArray>(arrow::list(list(arrow::int64())),
-                                                            offset.size() - 1,
-                                                            arrow::Buffer::Wrap(offset),
-                                                            list_arr,
-                                                            mask_buffer);
+  auto nested_list_arr = std::make_shared<arrow::ListArray>(
+    arrow::list(arrow::field("a", arrow::list(arrow::int64()), false)),
+    offset.size() - 1,
+    arrow::Buffer::Wrap(offset),
+    list_arr,
+    mask_buffer);
 
   std::vector<std::shared_ptr<arrow::Field>> schema_vector(
     {arrow::field("a", nested_list_arr->type())});
   auto schema = std::make_shared<arrow::Schema>(schema_vector);
 
-  auto expected_arrow_table = arrow::Table::Make(schema, {nested_list_arr});
-  auto got_arrow_table      = cudf::to_arrow(input_view, {{"a"}});
-
-  ASSERT_TRUE(expected_arrow_table->Equals(*got_arrow_table, true));
+  auto expected_arrow_table                         = arrow::Table::Make(schema, {nested_list_arr});
+  std::vector<cudf::column_metadata> const metadata = {{"a"}};
+  ASSERT_TRUE(is_equal(input_view, metadata, expected_arrow_table));
 }
 
 TEST_F(ToArrowTest, StructColumn)
@@ -324,7 +331,10 @@ TEST_F(ToArrowTest, StructColumn)
   auto list_arr = get_arrow_list_array<int64_t>({1, 2, 3, 4, 5, 6, 7, 8, 9}, {0, 2, 4, 5, 6, 7, 9});
   std::vector<int32_t> offset{0, 3, 4, 6};
   auto nested_list_arr = std::make_shared<arrow::ListArray>(
-    arrow::list(list(arrow::int64())), offset.size() - 1, arrow::Buffer::Wrap(offset), list_arr);
+    arrow::list(arrow::field("a", arrow::list(arrow::field("a", arrow::int64(), false)), false)),
+    offset.size() - 1,
+    arrow::Buffer::Wrap(offset),
+    list_arr);
 
   std::vector<std::shared_ptr<arrow::Array>> child_arrays2({str2_array, int2_array});
   auto fields2 = std::vector<std::shared_ptr<arrow::Field>>{
@@ -356,9 +366,8 @@ TEST_F(ToArrowTest, StructColumn)
 
   auto expected_arrow_table = arrow::Table::Make(schema, {struct_array});
 
-  auto got_arrow_table = cudf::to_arrow(input_view, {metadata});
-
-  ASSERT_TRUE(expected_arrow_table->Equals(*got_arrow_table, true));
+  std::vector<cudf::column_metadata> const meta = {metadata};
+  ASSERT_TRUE(is_equal(input_view, meta, expected_arrow_table));
 }
 
 template <typename T>
@@ -380,9 +389,8 @@ TEST_F(ToArrowTest, FixedPoint64Table)
     auto const schema               = std::make_shared<arrow::Schema>(schema_vector);
     auto const expected_arrow_table = arrow::Table::Make(schema, {arr});
 
-    auto got_arrow_table = cudf::to_arrow(input, {{"a"}});
-
-    ASSERT_TRUE(expected_arrow_table->Equals(*got_arrow_table, true));
+    std::vector<cudf::column_metadata> const metadata = {{"a"}};
+    ASSERT_TRUE(is_equal(input, metadata, expected_arrow_table));
   }
 }
 
@@ -402,9 +410,8 @@ TEST_F(ToArrowTest, FixedPoint128Table)
     auto const schema               = std::make_shared<arrow::Schema>(schema_vector);
     auto const expected_arrow_table = arrow::Table::Make(schema, {arr});
 
-    auto got_arrow_table = cudf::to_arrow(input, {{"a"}});
-
-    ASSERT_TRUE(expected_arrow_table->Equals(*got_arrow_table, true));
+    std::vector<cudf::column_metadata> const metadata = {{"a"}};
+    ASSERT_TRUE(is_equal(input, metadata, expected_arrow_table));
   }
 }
 
@@ -431,9 +438,8 @@ TEST_F(ToArrowTest, FixedPoint64TableLarge)
     auto const schema               = std::make_shared<arrow::Schema>(schema_vector);
     auto const expected_arrow_table = arrow::Table::Make(schema, {arr});
 
-    auto got_arrow_table = cudf::to_arrow(input, {{"a"}});
-
-    ASSERT_TRUE(expected_arrow_table->Equals(*got_arrow_table, true));
+    std::vector<cudf::column_metadata> const metadata = {{"a"}};
+    ASSERT_TRUE(is_equal(input, metadata, expected_arrow_table));
   }
 }
 
@@ -455,9 +461,8 @@ TEST_F(ToArrowTest, FixedPoint128TableLarge)
     auto const schema               = std::make_shared<arrow::Schema>(schema_vector);
     auto const expected_arrow_table = arrow::Table::Make(schema, {arr});
 
-    auto got_arrow_table = cudf::to_arrow(input, {{"a"}});
-
-    ASSERT_TRUE(expected_arrow_table->Equals(*got_arrow_table, true));
+    std::vector<cudf::column_metadata> const metadata = {{"a"}};
+    ASSERT_TRUE(is_equal(input, metadata, expected_arrow_table));
   }
 }
 
@@ -479,9 +484,8 @@ TEST_F(ToArrowTest, FixedPoint64TableNullsSimple)
     auto const schema        = std::make_shared<arrow::Schema>(schema_vector);
     auto const arrow_table   = arrow::Table::Make(schema, {arr});
 
-    auto got_arrow_table = cudf::to_arrow(input, {{"a"}});
-
-    ASSERT_TRUE(arrow_table->Equals(*got_arrow_table, true));
+    std::vector<cudf::column_metadata> const metadata = {{"a"}};
+    ASSERT_TRUE(is_equal(input, metadata, arrow_table));
   }
 }
 
@@ -503,9 +507,8 @@ TEST_F(ToArrowTest, FixedPoint128TableNullsSimple)
     auto const schema        = std::make_shared<arrow::Schema>(schema_vector);
     auto const arrow_table   = arrow::Table::Make(schema, {arr});
 
-    auto got_arrow_table = cudf::to_arrow(input, {{"a"}});
-
-    ASSERT_TRUE(arrow_table->Equals(*got_arrow_table, true));
+    std::vector<cudf::column_metadata> const metadata = {{"a"}};
+    ASSERT_TRUE(is_equal(input, metadata, arrow_table));
   }
 }
 
@@ -529,9 +532,8 @@ TEST_F(ToArrowTest, FixedPoint64TableNulls)
     auto const schema               = std::make_shared<arrow::Schema>(schema_vector);
     auto const expected_arrow_table = arrow::Table::Make(schema, {arr});
 
-    auto got_arrow_table = cudf::to_arrow(input, {{"a"}});
-
-    ASSERT_TRUE(expected_arrow_table->Equals(*got_arrow_table, true));
+    std::vector<cudf::column_metadata> const metadata = {{"a"}};
+    ASSERT_TRUE(is_equal(input, metadata, expected_arrow_table));
   }
 }
 
@@ -554,9 +556,8 @@ TEST_F(ToArrowTest, FixedPoint128TableNulls)
     auto const schema               = std::make_shared<arrow::Schema>(schema_vector);
     auto const expected_arrow_table = arrow::Table::Make(schema, {arr});
 
-    auto const got_arrow_table = cudf::to_arrow(input, {{"a"}});
-
-    ASSERT_TRUE(expected_arrow_table->Equals(*got_arrow_table, true));
+    std::vector<cudf::column_metadata> const metadata = {{"a"}};
+    ASSERT_TRUE(is_equal(input, metadata, expected_arrow_table));
   }
 }
 
@@ -575,10 +576,10 @@ TEST_P(ToArrowTestSlice, SliceTest)
   auto expected_arrow_table = arrow_table->Slice(start, end - start);
   auto struct_meta          = cudf::column_metadata{"f"};
   struct_meta.children_meta = {{"integral"}, {"string"}};
-  auto got_arrow_table =
-    cudf::to_arrow(sliced_cudf_table, {{"a"}, {"b"}, {"c"}, {"d"}, {"e"}, struct_meta});
 
-  ASSERT_EQ(expected_arrow_table->Equals(*got_arrow_table, true), true);
+  std::vector<cudf::column_metadata> const metadata = {
+    {"a"}, {"b"}, {"c"}, {"d"}, {"e"}, struct_meta};
+  ASSERT_TRUE(is_equal(sliced_cudf_table, metadata, expected_arrow_table));
 }
 
 INSTANTIATE_TEST_CASE_P(ToArrowTest,
@@ -595,13 +596,58 @@ using NumericTypesNotBool =
   cudf::test::Concat<cudf::test::IntegralTypesNotBool, cudf::test::FloatingPointTypes>;
 TYPED_TEST_SUITE(ToArrowNumericScalarTest, NumericTypesNotBool);
 
+auto col_to_arrow_type(cudf::column_view const& col)
+{
+  switch (col.type().id()) {
+    case cudf::type_id::BOOL8: return arrow::boolean();
+    case cudf::type_id::INT8: return arrow::int8();
+    case cudf::type_id::INT16: return arrow::int16();
+    case cudf::type_id::INT32: return arrow::int32();
+    case cudf::type_id::INT64: return arrow::int64();
+    case cudf::type_id::UINT8: return arrow::uint8();
+    case cudf::type_id::UINT16: return arrow::uint16();
+    case cudf::type_id::UINT32: return arrow::uint32();
+    case cudf::type_id::UINT64: return arrow::uint64();
+    case cudf::type_id::FLOAT32: return arrow::float32();
+    case cudf::type_id::FLOAT64: return arrow::float64();
+    case cudf::type_id::TIMESTAMP_DAYS: return arrow::date32();
+    case cudf::type_id::STRING: return arrow::utf8();
+    case cudf::type_id::LIST:
+      return arrow::list(col_to_arrow_type(col.child(cudf::lists_column_view::child_column_index)));
+    case cudf::type_id::DECIMAL128: return arrow::decimal(38, -col.type().scale());
+    default: CUDF_FAIL("Unsupported type_id conversion to arrow type", cudf::data_type_error);
+  }
+}
+
+std::optional<std::shared_ptr<arrow::Scalar>> cudf_scalar_to_arrow(
+  cudf::scalar const& scalar, std::optional<cudf::column_metadata> metadata = std::nullopt)
+{
+  auto const cudf_column   = cudf::make_column_from_scalar(scalar, 1);
+  auto const c_arrow_array = cudf::to_arrow_host(*cudf_column);
+  auto const arrow_array   = [&]() {
+    if (metadata.has_value()) {
+      auto const table = cudf::table_view({cudf_column->view()});
+      std::vector<cudf::column_metadata> const table_metadata = {metadata.value()};
+      auto const arrow_schema = cudf::to_arrow_schema(table, table_metadata);
+      return arrow::ImportArray(&c_arrow_array->array, arrow_schema->children[0]).ValueOrDie();
+    } else {
+      auto const arrow_type = col_to_arrow_type(cudf_column->view());
+      return arrow::ImportArray(&c_arrow_array->array, arrow_type).ValueOrDie();
+    }
+  }();
+  auto const maybe_scalar = arrow_array->GetScalar(0);
+  if (!maybe_scalar.ok()) { return std::nullopt; }
+  return maybe_scalar.ValueOrDie();
+}
+
 TYPED_TEST(ToArrowNumericScalarTest, Basic)
 {
   TypeParam const value{42};
   auto const cudf_scalar = cudf::make_fixed_width_scalar<TypeParam>(value);
 
-  cudf::column_metadata const metadata{""};
-  auto const arrow_scalar = cudf::to_arrow(*cudf_scalar, metadata);
+  auto const maybe_scalar = cudf_scalar_to_arrow(*cudf_scalar);
+  ASSERT_TRUE(maybe_scalar.has_value());
+  auto const arrow_scalar = *maybe_scalar;
 
   auto const ref_arrow_scalar = arrow::MakeScalar(value);
   EXPECT_TRUE(arrow_scalar->Equals(*ref_arrow_scalar));
@@ -621,8 +667,9 @@ TEST_F(ToArrowDecimalScalarTest, Basic)
   auto const cudf_scalar =
     cudf::make_fixed_point_scalar<numeric::decimal128>(value, numeric::scale_type{scale});
 
-  cudf::column_metadata const metadata{""};
-  auto const arrow_scalar = cudf::to_arrow(*cudf_scalar, metadata);
+  auto const maybe_scalar = cudf_scalar_to_arrow(*cudf_scalar);
+  ASSERT_TRUE(maybe_scalar.has_value());
+  auto const arrow_scalar = *maybe_scalar;
 
   auto const maybe_ref_arrow_scalar =
     arrow::MakeScalar(arrow::decimal128(precision, -scale), value);
@@ -636,9 +683,10 @@ struct ToArrowStringScalarTest : public cudf::test::BaseFixture {};
 TEST_F(ToArrowStringScalarTest, Basic)
 {
   std::string const value{"hello world"};
-  auto const cudf_scalar = cudf::make_string_scalar(value);
-  cudf::column_metadata const metadata{""};
-  auto const arrow_scalar = cudf::to_arrow(*cudf_scalar, metadata);
+  auto const cudf_scalar  = cudf::make_string_scalar(value);
+  auto const maybe_scalar = cudf_scalar_to_arrow(*cudf_scalar);
+  ASSERT_TRUE(maybe_scalar.has_value());
+  auto const arrow_scalar = *maybe_scalar;
 
   auto const ref_arrow_scalar = arrow::MakeScalar(value);
   EXPECT_TRUE(arrow_scalar->Equals(*ref_arrow_scalar));
@@ -656,8 +704,9 @@ TEST_F(ToArrowListScalarTest, Basic)
 
   auto const cudf_scalar = cudf::make_list_scalar(col);
 
-  cudf::column_metadata const metadata{""};
-  auto const arrow_scalar = cudf::to_arrow(*cudf_scalar, metadata);
+  auto const maybe_scalar = cudf_scalar_to_arrow(*cudf_scalar);
+  ASSERT_TRUE(maybe_scalar.has_value());
+  auto const arrow_scalar = *maybe_scalar;
 
   arrow::Int64Builder builder;
   auto const status      = builder.AppendValues(host_values, host_validity);
@@ -682,7 +731,10 @@ TEST_F(ToArrowStructScalarTest, Basic)
 
   cudf::column_metadata metadata{""};
   metadata.children_meta.emplace_back(field_name);
-  auto const arrow_scalar = cudf::to_arrow(*cudf_scalar, metadata);
+
+  auto const maybe_scalar = cudf_scalar_to_arrow(*cudf_scalar, metadata);
+  ASSERT_TRUE(maybe_scalar.has_value());
+  auto const arrow_scalar = *maybe_scalar;
 
   auto const underlying_arrow_scalar = arrow::MakeScalar(value);
   auto const field            = arrow::field(field_name, underlying_arrow_scalar->type, false);
@@ -693,5 +745,3 @@ TEST_F(ToArrowStructScalarTest, Basic)
 }
 
 CUDF_TEST_PROGRAM_MAIN()
-
-#endif
diff --git a/cpp/tests/streams/interop_test.cpp b/cpp/tests/streams/interop_test.cpp
deleted file mode 100644
index 9ba862585d0..00000000000
--- a/cpp/tests/streams/interop_test.cpp
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-// These interop functions are deprecated. We keep the code in this
-// test and will migrate the tests to export via the arrow C data
-// interface with to_arrow_host which arrow can consume. For now, the
-// test is commented out.
-
-#if 0
-
-#include <cudf_test/base_fixture.hpp>
-#include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/default_stream.hpp>
-
-#include <cudf/interop.hpp>
-#include <cudf/scalar/scalar.hpp>
-#include <cudf/scalar/scalar_factories.hpp>
-#include <cudf/table/table_view.hpp>
-
-struct ArrowTest : public cudf::test::BaseFixture {};
-
-TEST_F(ArrowTest, ToArrow)
-{
-  int32_t const value{42};
-  auto col = cudf::test::fixed_width_column_wrapper<int32_t>{{value}};
-  cudf::table_view tbl{{col}};
-
-  std::vector<cudf::column_metadata> metadata{{""}};
-  cudf::to_arrow(tbl, metadata, cudf::test::get_default_stream());
-}
-
-TEST_F(ArrowTest, FromArrow)
-{
-  std::vector<int64_t> host_values = {1, 2, 3, 5, 6, 7, 8};
-  std::vector<bool> host_validity  = {true, true, true, false, true, true, true};
-
-  arrow::Int64Builder builder;
-  auto status      = builder.AppendValues(host_values, host_validity);
-  auto maybe_array = builder.Finish();
-  auto array       = *maybe_array;
-
-  auto field  = arrow::field("", arrow::int32());
-  auto schema = arrow::schema({field});
-  auto table  = arrow::Table::Make(schema, {array});
-  cudf::from_arrow(*table, cudf::test::get_default_stream());
-}
-
-TEST_F(ArrowTest, ToArrowScalar)
-{
-  int32_t const value{42};
-  auto cudf_scalar =
-    cudf::make_fixed_width_scalar<int32_t>(value, cudf::test::get_default_stream());
-
-  cudf::column_metadata metadata{""};
-  cudf::to_arrow(*cudf_scalar, metadata, cudf::test::get_default_stream());
-}
-
-TEST_F(ArrowTest, FromArrowScalar)
-{
-  int32_t const value{42};
-  auto arrow_scalar = arrow::MakeScalar(value);
-  cudf::from_arrow(*arrow_scalar, cudf::test::get_default_stream());
-}
-
-#endif

From cb843dbdc2fc0c73c8af98909304c768bb65c16f Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 16 Aug 2024 11:02:06 -1000
Subject: [PATCH 682/842] Fix DataFrame reductions with median returning scalar
 instead of Series (#16527)

xref https://github.com/rapidsai/cudf/issues/16507

This turned into a little bit of a refactor that also fixes the following:

* `cudf.DataFrame.from_pandas` not preserving the `pandas.DataFrame.column.dtype`
* `cudf.DataFrame.<reduction>(axis=0)` not preserving the `.column` properties in the resulting `.index`

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16527
---
 python/cudf/cudf/core/column_accessor.py  |   3 +
 python/cudf/cudf/core/dataframe.py        | 120 ++++++++--------------
 python/cudf/cudf/core/indexed_frame.py    |  36 +------
 python/cudf/cudf/tests/test_dataframe.py  |   6 ++
 python/cudf/cudf/tests/test_reductions.py |  35 +++++++
 5 files changed, 90 insertions(+), 110 deletions(-)

diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py
index 48bc84070b1..67c19f11e41 100644
--- a/python/cudf/cudf/core/column_accessor.py
+++ b/python/cudf/cudf/core/column_accessor.py
@@ -352,6 +352,9 @@ def insert(
             new_values = self.columns[:loc] + (value,) + self.columns[loc:]
             self._data = self._data.__class__(zip(new_keys, new_values))
         self._clear_cache(old_ncols, old_ncols + 1)
+        if old_ncols == 0:
+            # The type(name) may no longer match the prior label_dtype
+            self.label_dtype = None
 
     def copy(self, deep=False) -> ColumnAccessor:
         """
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 6ee3d69441f..97684129203 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -5497,14 +5497,9 @@ def from_pandas(cls, dataframe, nan_as_null=no_default):
             )
 
         if isinstance(dataframe, pd.DataFrame):
-            if not dataframe.columns.is_unique:
-                raise ValueError("Duplicate column names are not allowed")
-
             data = {
-                col_name: column.as_column(
-                    col_value.array, nan_as_null=nan_as_null
-                )
-                for col_name, col_value in dataframe.items()
+                i: column.as_column(col_value.array, nan_as_null=nan_as_null)
+                for i, (_, col_value) in enumerate(dataframe.items())
             }
             if isinstance(dataframe.index, pd.MultiIndex):
                 index = cudf.MultiIndex.from_pandas(
@@ -5515,14 +5510,8 @@ def from_pandas(cls, dataframe, nan_as_null=no_default):
                     dataframe.index, nan_as_null=nan_as_null
                 )
             df = cls._from_data(data, index)
-            df._data._level_names = tuple(dataframe.columns.names)
-
-            if isinstance(dataframe.columns, pd.RangeIndex):
-                df._data.rangeindex = True
-            # Set columns only if it is a MultiIndex
-            elif isinstance(dataframe.columns, pd.MultiIndex):
-                df.columns = dataframe.columns
-
+            # Checks duplicate columns and sets column metadata
+            df.columns = dataframe.columns
             return df
         elif hasattr(dataframe, "__dataframe__"):
             # TODO: Probably should be handled in the constructor as
@@ -6382,8 +6371,11 @@ def _reduce(
         source = self
 
         if axis is None:
+            assert PANDAS_LT_300, "Replace if/else with just axis=2"
+            # TODO(pandas3.0): Remove if/else for just axis = 2
             if op in {"sum", "product", "std", "var"}:
-                # Do not remove until pandas 2.0 support is added.
+                # pandas only raises FutureWarning for these ops
+                # though it applies for all reductions
                 warnings.warn(
                     f"In a future version, {type(self).__name__}"
                     f".{op}(axis=None) will return a scalar {op} over "
@@ -6402,9 +6394,7 @@ def _reduce(
 
         if numeric_only:
             numeric_cols = (
-                name
-                for name in self._data.names
-                if is_numeric_dtype(self._data[name].dtype)
+                name for name, dtype in self._dtypes if is_numeric_dtype(dtype)
             )
             source = self._get_columns_by_label(numeric_cols)
             if source.empty:
@@ -6414,62 +6404,41 @@ def _reduce(
                     else source.index,
                     dtype="float64",
                 )
-        if axis in {0, 2}:
-            if axis == 2 and op in ("kurtosis", "kurt", "skew"):
-                # TODO: concat + op can probably be done in the general case
-                # for axis == 2.
-                # https://github.com/rapidsai/cudf/issues/14930
-                return getattr(concat_columns(source._data.columns), op)(
-                    **kwargs
-                )
-            try:
-                result = [
-                    getattr(source._data[col], op)(**kwargs)
-                    for col in source._data.names
-                ]
-            except AttributeError:
-                numeric_ops = (
-                    "mean",
-                    "min",
-                    "max",
-                    "sum",
-                    "product",
-                    "prod",
-                    "std",
-                    "var",
-                    "kurtosis",
-                    "kurt",
-                    "skew",
-                )
-
-                if op in numeric_ops:
+        if (
+            axis == 2
+            and op in {"kurtosis", "skew"}
+            and self._num_rows < 4
+            and self._num_columns > 1
+        ):
+            # Total number of elements may satisfy the min number of values
+            # to compute skew/kurtosis
+            return getattr(concat_columns(source._columns), op)(**kwargs)
+        elif axis == 1:
+            return source._apply_cupy_method_axis_1(op, **kwargs)
+        else:
+            axis_0_results = []
+            for col_label, col in source._data.items():
+                try:
+                    axis_0_results.append(getattr(col, op)(**kwargs))
+                except AttributeError as err:
                     if numeric_only:
-                        try:
-                            result = [
-                                getattr(source._data[col], op)(**kwargs)
-                                for col in source._data.names
-                            ]
-                        except AttributeError:
-                            raise NotImplementedError(
-                                f"Not all column dtypes support op {op}"
-                            )
-                    elif any(
-                        not is_numeric_dtype(self._data[name].dtype)
-                        for name in self._data.names
-                    ):
+                        raise NotImplementedError(
+                            f"Column {col_label} with type {col.dtype} does not support {op}"
+                        ) from err
+                    elif not is_numeric_dtype(col.dtype):
                         raise TypeError(
                             "Non numeric columns passed with "
                             "`numeric_only=False`, pass `numeric_only=True` "
                             f"to perform DataFrame.{op}"
-                        )
-                else:
-                    raise
+                        ) from err
+                    else:
+                        raise
             if axis == 2:
-                return getattr(as_column(result, nan_as_null=False), op)(
-                    **kwargs
-                )
+                return getattr(
+                    as_column(axis_0_results, nan_as_null=False), op
+                )(**kwargs)
             else:
-                source_dtypes = [c.dtype for c in source._data.columns]
+                source_dtypes = [dtype for _, dtype in source._dtypes]
                 common_dtype = find_common_type(source_dtypes)
                 if (
                     is_object_dtype(common_dtype)
@@ -6483,17 +6452,14 @@ def _reduce(
                         "Columns must all have the same dtype to "
                         f"perform {op=} with {axis=}"
                     )
+                pd_index = source._data.to_pandas_index()
                 if source._data.multiindex:
-                    idx = MultiIndex.from_tuples(
-                        source._data.names, names=source._data.level_names
-                    )
+                    idx = MultiIndex.from_pandas(pd_index)
                 else:
-                    idx = cudf.Index(source._data.names)
-                return Series._from_column(as_column(result), index=idx)
-        elif axis == 1:
-            return source._apply_cupy_method_axis_1(op, **kwargs)
-        else:
-            raise ValueError(f"Invalid value of {axis=} received for {op}")
+                    idx = cudf.Index.from_pandas(pd_index)
+                return Series._from_column(
+                    as_column(axis_0_results), index=idx
+                )
 
     @_performance_tracking
     def _scan(
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 2263dfd5c98..e46e24dd0d8 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -1386,11 +1386,6 @@ def sum(
         a    10
         b    34
         dtype: int64
-
-        .. pandas-compat::
-           :meth:`pandas.DataFrame.sum`, :meth:`pandas.Series.sum`
-
-            Parameters currently not supported are `level`, `numeric_only`.
         """
         return self._reduce(
             "sum",
@@ -1447,11 +1442,6 @@ def product(
         a      24
         b    5040
         dtype: int64
-
-        .. pandas-compat::
-            :meth:`pandas.DataFrame.product`, :meth:`pandas.Series.product`
-
-            Parameters currently not supported are level`, `numeric_only`.
         """
 
         return self._reduce(
@@ -1508,7 +1498,9 @@ def mean(self, axis=0, skipna=True, numeric_only=False, **kwargs):
             **kwargs,
         )
 
-    def median(self, axis=None, skipna=True, numeric_only=None, **kwargs):
+    def median(
+        self, axis=no_default, skipna=True, numeric_only=None, **kwargs
+    ):
         """
         Return the median of the values for the requested axis.
 
@@ -1542,11 +1534,6 @@ def median(self, axis=None, skipna=True, numeric_only=None, **kwargs):
         dtype: int64
         >>> ser.median()
         17.0
-
-        .. pandas-compat::
-            :meth:`pandas.DataFrame.median`, :meth:`pandas.Series.median`
-
-            Parameters currently not supported are `level` and `numeric_only`.
         """
         return self._reduce(
             "median",
@@ -1598,12 +1585,6 @@ def std(
         a    1.290994
         b    1.290994
         dtype: float64
-
-        .. pandas-compat::
-            :meth:`pandas.DataFrame.std`, :meth:`pandas.Series.std`
-
-            Parameters currently not supported are `level` and
-            `numeric_only`
         """
 
         return self._reduce(
@@ -1657,12 +1638,6 @@ def var(
         a    1.666667
         b    1.666667
         dtype: float64
-
-        .. pandas-compat::
-            :meth:`pandas.DataFrame.var`, :meth:`pandas.Series.var`
-
-            Parameters currently not supported are `level` and
-            `numeric_only`
         """
         return self._reduce(
             "var",
@@ -1713,11 +1688,6 @@ def kurtosis(self, axis=0, skipna=True, numeric_only=False, **kwargs):
         a   -1.2
         b   -1.2
         dtype: float64
-
-        .. pandas-compat::
-            :meth:`pandas.DataFrame.kurtosis`
-
-            Parameters currently not supported are `level` and `numeric_only`
         """
         if axis not in (0, "index", None, no_default):
             raise NotImplementedError("Only axis=0 is currently supported.")
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 89eb5a12c71..9122a1074ac 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -11114,3 +11114,9 @@ def test_bool_raises():
         lfunc_args_and_kwargs=[[cudf.DataFrame()]],
         rfunc_args_and_kwargs=[[pd.DataFrame()]],
     )
+
+
+def test_from_pandas_preserve_column_dtype():
+    df = pd.DataFrame([[1, 2]], columns=pd.Index([1, 2], dtype="int8"))
+    result = cudf.DataFrame.from_pandas(df)
+    pd.testing.assert_index_equal(result.columns, df.columns, exact=True)
diff --git a/python/cudf/cudf/tests/test_reductions.py b/python/cudf/cudf/tests/test_reductions.py
index 8be6463c699..a70a2ea15dd 100644
--- a/python/cudf/cudf/tests/test_reductions.py
+++ b/python/cudf/cudf/tests/test_reductions.py
@@ -358,6 +358,30 @@ def test_reductions_axis_none_warning(op):
     assert_eq(expected, actual, check_dtype=False)
 
 
+@pytest.mark.parametrize(
+    "op",
+    [
+        "sum",
+        "product",
+        "std",
+        "var",
+        "kurt",
+        "kurtosis",
+        "skew",
+        "min",
+        "max",
+        "mean",
+        "median",
+    ],
+)
+def test_dataframe_reduction_no_args(op):
+    df = cudf.DataFrame({"a": range(10), "b": range(10)})
+    pdf = df.to_pandas()
+    result = getattr(df, op)()
+    expected = getattr(pdf, op)()
+    assert_eq(result, expected)
+
+
 def test_reduction_column_multiindex():
     idx = cudf.MultiIndex.from_tuples(
         [("a", 1), ("a", 2)], names=["foo", "bar"]
@@ -374,3 +398,14 @@ def test_dtype_deprecated(op):
     with pytest.warns(FutureWarning):
         result = getattr(ser, op)(dtype=np.dtype(np.int8))
     assert isinstance(result, np.int8)
+
+
+@pytest.mark.parametrize(
+    "columns", [pd.RangeIndex(2), pd.Index([0, 1], dtype="int8")]
+)
+def test_dataframe_axis_0_preserve_column_type_in_index(columns):
+    pd_df = pd.DataFrame([[1, 2]], columns=columns)
+    cudf_df = cudf.DataFrame.from_pandas(pd_df)
+    result = cudf_df.sum(axis=0)
+    expected = pd_df.sum(axis=0)
+    assert_eq(result, expected, check_index_type=True)

From fd44adc9e02dec4cdde9626f46ba231bda4a7ea6 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 16 Aug 2024 13:02:49 -1000
Subject: [PATCH 683/842] Make CategoricalColumn.__init__ strict (#16456)

This PR transfers some of the validation logic in `build_column` directly into `CategoricalColumn` just in case `CategoricalColumn` is called independently of `build_column`. Additionally adds stricter validation of `data`, `dtype` and `children` so the column doesn't represent an invalid state

xref https://github.com/rapidsai/cudf/issues/16469

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16456
---
 python/cudf/cudf/_lib/column.pyx            |  6 +--
 python/cudf/cudf/core/column/categorical.py | 56 +++++++++++++--------
 python/cudf/cudf/core/column/column.py      |  9 +---
 3 files changed, 40 insertions(+), 31 deletions(-)

diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx
index 2e400f775d3..e27c595edda 100644
--- a/python/cudf/cudf/_lib/column.pyx
+++ b/python/cudf/cudf/_lib/column.pyx
@@ -86,7 +86,7 @@ cdef class Column:
         object mask=None,
         int offset=0,
         object null_count=None,
-        object children=()
+        tuple children=()
     ):
         if size < 0:
             raise ValueError("size must be >=0")
@@ -297,11 +297,11 @@ cdef class Column:
                 dtypes = [
                     base_child.dtype for base_child in self.base_children
                 ]
-                self._children = [
+                self._children = tuple(
                     child._with_type_metadata(dtype) for child, dtype in zip(
                         children, dtypes
                     )
-                ]
+                )
         return self._children
 
     def set_base_children(self, value):
diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index 66aed38bffd..1fdaf9f8c07 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -465,6 +465,18 @@ def reorder_categories(
         )
 
 
+def validate_categorical_children(children) -> None:
+    if not (
+        len(children) == 1
+        and isinstance(children[0], cudf.core.column.numerical.NumericalColumn)
+        and children[0].dtype.kind in "iu"
+    ):
+        # TODO: Enforce unsigned integer?
+        raise ValueError(
+            "Must specify exactly one child NumericalColumn of integers for representing the codes."
+        )
+
+
 class CategoricalColumn(column.ColumnBase):
     """
     Implements operations for Columns of Categorical type
@@ -481,8 +493,7 @@ class CategoricalColumn(column.ColumnBase):
         respectively
     """
 
-    dtype: cudf.core.dtypes.CategoricalDtype
-    _codes: NumericalColumn | None
+    dtype: CategoricalDtype
     _children: tuple[NumericalColumn]
     _VALID_REDUCTIONS = {
         "max",
@@ -499,25 +510,29 @@ class CategoricalColumn(column.ColumnBase):
 
     def __init__(
         self,
+        data: None,
+        size: int | None,
         dtype: CategoricalDtype,
         mask: Buffer | None = None,
-        size: int | None = None,
         offset: int = 0,
         null_count: int | None = None,
-        children: tuple["column.ColumnBase", ...] = (),
+        children: tuple[NumericalColumn] = (),  # type: ignore[assignment]
     ):
+        if data is not None:
+            raise ValueError(f"{data=} must be None")
+        validate_categorical_children(children)
         if size is None:
-            for child in children:
-                assert child.offset == 0
-                assert child.base_mask is None
-            size = children[0].size
+            child = children[0]
+            assert child.offset == 0
+            assert child.base_mask is None
+            size = child.size
             size = size - offset
-        if isinstance(dtype, pd.api.types.CategoricalDtype):
-            dtype = CategoricalDtype.from_pandas(dtype)
         if not isinstance(dtype, CategoricalDtype):
-            raise ValueError("dtype must be instance of CategoricalDtype")
+            raise ValueError(
+                f"{dtype=} must be cudf.CategoricalDtype instance."
+            )
         super().__init__(
-            data=None,
+            data=data,
             size=size,
             dtype=dtype,
             mask=mask,
@@ -525,7 +540,7 @@ def __init__(
             null_count=null_count,
             children=children,
         )
-        self._codes = None
+        self._codes = self.children[0].set_mask(self.mask)
 
     @property
     def base_size(self) -> int:
@@ -558,13 +573,14 @@ def _process_values_for_isin(
         rhs = cudf.core.column.as_column(values, dtype=self.dtype)
         return lhs, rhs
 
-    def set_base_mask(self, value: Buffer | None):
+    def set_base_mask(self, value: Buffer | None) -> None:
         super().set_base_mask(value)
-        self._codes = None
+        self._codes = self.children[0].set_mask(self.mask)
 
-    def set_base_children(self, value: tuple[ColumnBase, ...]):
+    def set_base_children(self, value: tuple[NumericalColumn]) -> None:  # type: ignore[override]
         super().set_base_children(value)
-        self._codes = None
+        validate_categorical_children(value)
+        self._codes = value[0].set_mask(self.mask)
 
     @property
     def children(self) -> tuple[NumericalColumn]:
@@ -586,9 +602,7 @@ def categories(self) -> ColumnBase:
 
     @property
     def codes(self) -> NumericalColumn:
-        if self._codes is None:
-            self._codes = self.children[0].set_mask(self.mask)
-        return cast(cudf.core.column.NumericalColumn, self._codes)
+        return self._codes
 
     @property
     def ordered(self) -> bool:
@@ -1131,7 +1145,7 @@ def _mimic_inplace(
     ) -> Self | None:
         out = super()._mimic_inplace(other_col, inplace=inplace)
         if inplace and isinstance(other_col, CategoricalColumn):
-            self._codes = other_col._codes
+            self._codes = other_col.codes
         return out
 
     def view(self, dtype: Dtype) -> ColumnBase:
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 090c02da990..19d6bf84d3f 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -1578,19 +1578,14 @@ def build_column(
         return col
 
     if isinstance(dtype, CategoricalDtype):
-        if not len(children) == 1:
-            raise ValueError(
-                "Must specify exactly one child column for CategoricalColumn"
-            )
-        if not isinstance(children[0], ColumnBase):
-            raise TypeError("children must be a tuple of Columns")
         return cudf.core.column.CategoricalColumn(
+            data=data,  # type: ignore[arg-type]
             dtype=dtype,
             mask=mask,
             size=size,
             offset=offset,
             null_count=null_count,
-            children=children,
+            children=children,  # type: ignore[arg-type]
         )
     elif dtype.type is np.datetime64:
         return cudf.core.column.DatetimeColumn(

From b63ba70f2cf3724eeb118f9d2ec03a370c135f23 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Fri, 16 Aug 2024 18:27:07 -0700
Subject: [PATCH 684/842] Add build job for pylibcudf (#16587)

This was missed in #16299 and is necessary to get builds published.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16587
---
 .github/workflows/build.yaml | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 2fc39c06fad..9943b02a521 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -67,7 +67,27 @@ jobs:
       node_type: "gpu-v100-latest-1"
       run_script: "ci/build_docs.sh"
       sha: ${{ inputs.sha }}
+  wheel-build-pylibcudf:
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
+    with:
+      build_type: ${{ inputs.build_type || 'branch' }}
+      branch: ${{ inputs.branch }}
+      sha: ${{ inputs.sha }}
+      date: ${{ inputs.date }}
+      script: ci/build_wheel_pylibcudf.sh
+  wheel-publish-pylibcudf:
+    needs: wheel-build-pylibcudf
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10
+    with:
+      build_type: ${{ inputs.build_type || 'branch' }}
+      branch: ${{ inputs.branch }}
+      sha: ${{ inputs.sha }}
+      date: ${{ inputs.date }}
+      package-name: pylibcudf
   wheel-build-cudf:
+    needs: wheel-publish-pylibcudf
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
     with:
@@ -109,7 +129,7 @@ jobs:
       date: ${{ inputs.date }}
       package-name: dask_cudf
   wheel-build-cudf-polars:
-    needs: wheel-publish-cudf
+    needs: wheel-publish-pylibcudf
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
     with:

From dd2c12dd8a8682b562bb3b420e0982f79a99438d Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Fri, 16 Aug 2024 23:18:42 -0400
Subject: [PATCH 685/842] Revert "Make proxy NumPy arrays pass isinstance check
 in `cudf.pandas`" (#16586)

Reverts rapidsai/cudf#16286

Authors:
  - Matthew Murray (https://github.com/Matt711)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16586
---
 python/cudf/cudf/pandas/_wrappers/numpy.py    |  3 ---
 python/cudf/cudf/pandas/fast_slow_proxy.py    | 20 +---------------
 python/cudf/cudf/pandas/proxy_base.py         | 23 -------------------
 .../cudf_pandas_tests/test_cudf_pandas.py     |  8 -------
 4 files changed, 1 insertion(+), 53 deletions(-)
 delete mode 100644 python/cudf/cudf/pandas/proxy_base.py

diff --git a/python/cudf/cudf/pandas/_wrappers/numpy.py b/python/cudf/cudf/pandas/_wrappers/numpy.py
index eabea9713f1..3b012169676 100644
--- a/python/cudf/cudf/pandas/_wrappers/numpy.py
+++ b/python/cudf/cudf/pandas/_wrappers/numpy.py
@@ -14,7 +14,6 @@
     make_final_proxy_type,
     make_intermediate_proxy_type,
 )
-from ..proxy_base import ProxyNDarrayBase
 from .common import (
     array_interface,
     array_method,
@@ -112,14 +111,12 @@ def wrap_ndarray(cls, arr: cupy.ndarray | numpy.ndarray, constructor):
     numpy.ndarray,
     fast_to_slow=cupy.ndarray.get,
     slow_to_fast=cupy.asarray,
-    bases=(ProxyNDarrayBase,),
     additional_attributes={
         "__array__": array_method,
         # So that pa.array(wrapped-numpy-array) works
         "__arrow_array__": arrow_array_method,
         "__cuda_array_interface__": cuda_array_interface,
         "__array_interface__": array_interface,
-        "__array_ufunc__": _FastSlowAttribute("__array_ufunc__"),
         # ndarrays are unhashable
         "__hash__": None,
         # iter(cupy-array) produces an iterable of zero-dim device
diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py
index 61aa6310082..bb678fd1efe 100644
--- a/python/cudf/cudf/pandas/fast_slow_proxy.py
+++ b/python/cudf/cudf/pandas/fast_slow_proxy.py
@@ -19,7 +19,6 @@
 from ..options import _env_get_bool
 from ..testing import assert_eq
 from .annotation import nvtx
-from .proxy_base import ProxyNDarrayBase
 
 
 def call_operator(fn, args, kwargs):
@@ -565,11 +564,7 @@ def _fsproxy_wrap(cls, value, func):
         _FinalProxy subclasses can override this classmethod if they
         need particular behaviour when wrapped up.
         """
-        base_class = _get_proxy_base_class(cls)
-        if base_class is object:
-            proxy = base_class.__new__(cls)
-        else:
-            proxy = base_class.__new__(cls, value)
+        proxy = object.__new__(cls)
         proxy._fsproxy_wrapped = value
         return proxy
 
@@ -1198,19 +1193,6 @@ def is_proxy_object(obj: Any) -> bool:
     return False
 
 
-def _get_proxy_base_class(cls):
-    """Returns the proxy base class if one exists"""
-    for proxy_class in PROXY_BASE_CLASSES:
-        if proxy_class in cls.__mro__:
-            return proxy_class
-    return object
-
-
-PROXY_BASE_CLASSES: set[type] = {
-    ProxyNDarrayBase,
-}
-
-
 NUMPY_TYPES: set[str] = set(np.sctypeDict.values())
 
 
diff --git a/python/cudf/cudf/pandas/proxy_base.py b/python/cudf/cudf/pandas/proxy_base.py
deleted file mode 100644
index 61d9cde127c..00000000000
--- a/python/cudf/cudf/pandas/proxy_base.py
+++ /dev/null
@@ -1,23 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
-# All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-import cupy as cp
-import numpy as np
-
-
-class ProxyNDarrayBase(np.ndarray):
-    def __new__(cls, arr):
-        if isinstance(arr, cp.ndarray):
-            obj = np.asarray(arr.get()).view(cls)
-            return obj
-        elif isinstance(arr, np.ndarray):
-            obj = np.asarray(arr).view(cls)
-            return obj
-        else:
-            raise TypeError(
-                "Unsupported array type. Must be numpy.ndarray or cupy.ndarray"
-            )
-
-    def __array_finalize__(self, obj):
-        self._fsproxy_wrapped = getattr(obj, "_fsproxy_wrapped", None)
diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
index e5483fff913..6292022d8e4 100644
--- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
+++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
@@ -1632,11 +1632,3 @@ def test_change_index_name(index):
 
         assert s.index.name == name
         assert df.index.name == name
-
-
-def test_numpy_ndarray_isinstancecheck(series):
-    s1, s2 = series
-    arr1 = s1.values
-    arr2 = s2.values
-    assert isinstance(arr1, np.ndarray)
-    assert isinstance(arr2, np.ndarray)

From 592342c152af743390a923f125a380fe3b8f41c1 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Mon, 19 Aug 2024 09:28:35 -0400
Subject: [PATCH 686/842] Remove invalid column_view usage in
 string-scalar-to-column function (#16530)

Fixes the `make_column_from_scalar` function for `string_scalar` internal usage of a temporary `column_view` with non-zero size but no data or children to call `cudf::strings::detail::fill`. This relied too much on fragile internal logic which has cause several headaches including the recent work adding prefetch logic to libcudf.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16530
---
 cpp/src/column/column_factories.cu | 28 ++++++++++++++++++++--------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/cpp/src/column/column_factories.cu b/cpp/src/column/column_factories.cu
index bad20d6817c..ad9c5e4d3a0 100644
--- a/cpp/src/column/column_factories.cu
+++ b/cpp/src/column/column_factories.cu
@@ -20,11 +20,12 @@
 #include <cudf/dictionary/dictionary_factories.hpp>
 #include <cudf/lists/detail/lists_column_factories.hpp>
 #include <cudf/scalar/scalar.hpp>
-#include <cudf/strings/detail/fill.hpp>
+#include <cudf/strings/detail/strings_column_factories.cuh>
 
 #include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/constant_iterator.h>
+#include <thrust/uninitialized_fill.h>
 
 namespace cudf {
 
@@ -57,15 +58,26 @@ std::unique_ptr<cudf::column> column_from_scalar_dispatch::operator()<cudf::stri
 {
   if (size == 0) return make_empty_column(value.type());
 
-  // Since we are setting every row to the scalar, the fill() never needs to access
-  // any of the children in the strings column which would otherwise cause an exception.
-  column_view sc{value.type(), size, nullptr, nullptr, 0};
-  auto& sv = static_cast<scalar_type_t<cudf::string_view> const&>(value);
+  if (!value.is_valid(stream)) {
+    return make_strings_column(
+      size,
+      make_column_from_scalar(numeric_scalar<int32_t>(0), size + 1, stream, mr),
+      rmm::device_buffer{},
+      size,
+      cudf::detail::create_null_mask(size, mask_state::ALL_NULL, stream, mr));
+  }
 
-  // fill the column with the scalar
-  auto output = strings::detail::fill(strings_column_view(sc), 0, size, sv, stream, mr);
+  auto& ss         = static_cast<scalar_type_t<cudf::string_view> const&>(value);
+  auto const d_str = ss.value(stream);  // no actual data is copied
 
-  return output;
+  // fill the column with the scalar
+  rmm::device_uvector<cudf::strings::detail::string_index_pair> indices(size, stream);
+  auto const row_value =
+    d_str.empty() ? cudf::strings::detail::string_index_pair{"", 0}
+                  : cudf::strings::detail::string_index_pair{d_str.data(), d_str.size_bytes()};
+  thrust::uninitialized_fill(
+    rmm::exec_policy_nosync(stream), indices.begin(), indices.end(), row_value);
+  return cudf::strings::detail::make_strings_column(indices.begin(), indices.end(), stream, mr);
 }
 
 template <>

From 1b18cbc1e0b0e5dd7109228ce34c0fde5a2ddcb8 Mon Sep 17 00:00:00 2001
From: "Richard (Rick) Zamora" <rzamora217@gmail.com>
Date: Mon, 19 Aug 2024 08:03:54 -0700
Subject: [PATCH 687/842] Add `ToCudfBackend` expression to dask-cudf (#16573)

Adds a `ToCudfBackend` expression for "pandas" to "cudf" conversion, preventing `to_backend("cudf")` operations from blocking useful optimizations like predicate pushdown.

This is the dask-cudf component of https://github.com/dask/dask-expr/pull/1115

Authors:
  - Richard (Rick) Zamora (https://github.com/rjzamora)

Approvers:
  - Mads R. B. Kristensen (https://github.com/madsbk)

URL: https://github.com/rapidsai/cudf/pull/16573
---
 python/dask_cudf/dask_cudf/backends.py        | 20 +++++++-----
 python/dask_cudf/dask_cudf/expr/_expr.py      | 31 ++++++++++++++++++-
 python/dask_cudf/dask_cudf/tests/test_core.py | 16 +++++++++-
 python/dask_cudf/dask_cudf/tests/utils.py     |  4 +++
 4 files changed, 62 insertions(+), 9 deletions(-)

diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py
index a65ae819b44..16b2c8959e2 100644
--- a/python/dask_cudf/dask_cudf/backends.py
+++ b/python/dask_cudf/dask_cudf/backends.py
@@ -537,6 +537,12 @@ def to_cudf_dispatch_from_pandas(data, nan_as_null=None, **kwargs):
     return cudf.from_pandas(data, nan_as_null=nan_as_null)
 
 
+@to_cudf_dispatch.register((cudf.DataFrame, cudf.Series, cudf.Index))
+def to_cudf_dispatch_from_cudf(data, **kwargs):
+    _unsupported_kwargs("cudf", "cudf", kwargs)
+    return data
+
+
 # Define "cudf" backend engine to be registered with Dask
 class CudfBackendEntrypoint(DataFrameBackendEntrypoint):
     """Backend-entrypoint class for Dask-DataFrame
@@ -643,20 +649,20 @@ class CudfDXBackendEntrypoint(DataFrameBackendEntrypoint):
     Examples
     --------
     >>> import dask
-    >>> import dask_expr
+    >>> import dask_expr as dx
     >>> with dask.config.set({"dataframe.backend": "cudf"}):
     ...     ddf = dx.from_dict({"a": range(10)})
     >>> type(ddf._meta)
     <class 'cudf.core.dataframe.DataFrame'>
     """
 
-    @classmethod
-    def to_backend_dispatch(cls):
-        return CudfBackendEntrypoint.to_backend_dispatch()
+    @staticmethod
+    def to_backend(data, **kwargs):
+        import dask_expr as dx
 
-    @classmethod
-    def to_backend(cls, *args, **kwargs):
-        return CudfBackendEntrypoint.to_backend(*args, **kwargs)
+        from dask_cudf.expr._expr import ToCudfBackend
+
+        return dx.new_collection(ToCudfBackend(data, kwargs))
 
     @staticmethod
     def from_dict(
diff --git a/python/dask_cudf/dask_cudf/expr/_expr.py b/python/dask_cudf/dask_cudf/expr/_expr.py
index 8fccaccb695..8a2c50d3fe7 100644
--- a/python/dask_cudf/dask_cudf/expr/_expr.py
+++ b/python/dask_cudf/dask_cudf/expr/_expr.py
@@ -4,12 +4,41 @@
 import dask_expr._shuffle as _shuffle_module
 from dask_expr import new_collection
 from dask_expr._cumulative import CumulativeBlockwise
-from dask_expr._expr import Expr, VarColumns
+from dask_expr._expr import Elemwise, Expr, VarColumns
 from dask_expr._reductions import Reduction, Var
 
 from dask.dataframe.core import is_dataframe_like, make_meta, meta_nonempty
 from dask.dataframe.dispatch import is_categorical_dtype
 
+import cudf
+
+##
+## Custom expressions
+##
+
+
+class ToCudfBackend(Elemwise):
+    # TODO: Inherit from ToBackend when rapids-dask-dependency
+    # is pinned to dask>=2024.8.1
+    _parameters = ["frame", "options"]
+    _projection_passthrough = True
+    _filter_passthrough = True
+    _preserves_partitioning_information = True
+
+    @staticmethod
+    def operation(df, options):
+        from dask_cudf.backends import to_cudf_dispatch
+
+        return to_cudf_dispatch(df, **options)
+
+    def _simplify_down(self):
+        if isinstance(
+            self.frame._meta, (cudf.DataFrame, cudf.Series, cudf.Index)
+        ):
+            # We already have cudf data
+            return self.frame
+
+
 ##
 ## Custom expression patching
 ##
diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py
index 174923c2c7e..905d8c08135 100644
--- a/python/dask_cudf/dask_cudf/tests/test_core.py
+++ b/python/dask_cudf/dask_cudf/tests/test_core.py
@@ -15,7 +15,11 @@
 import cudf
 
 import dask_cudf
-from dask_cudf.tests.utils import skip_dask_expr, xfail_dask_expr
+from dask_cudf.tests.utils import (
+    require_dask_expr,
+    skip_dask_expr,
+    xfail_dask_expr,
+)
 
 
 def test_from_dict_backend_dispatch():
@@ -993,3 +997,13 @@ def test_series_isin_error():
         ser.isin([1, 5, "a"])
     with pytest.raises(TypeError):
         ddf.isin([1, 5, "a"]).compute()
+
+
+@require_dask_expr()
+def test_to_backend_simplify():
+    # Check that column projection is not blocked by to_backend
+    with dask.config.set({"dataframe.backend": "pandas"}):
+        df = dd.from_dict({"x": [1, 2, 3], "y": [4, 5, 6]}, npartitions=2)
+        df2 = df.to_backend("cudf")[["y"]].simplify()
+        df3 = df[["y"]].to_backend("cudf").to_backend("cudf").simplify()
+        assert df2._name == df3._name
diff --git a/python/dask_cudf/dask_cudf/tests/utils.py b/python/dask_cudf/dask_cudf/tests/utils.py
index c7dedbb6b4a..cc0c6899804 100644
--- a/python/dask_cudf/dask_cudf/tests/utils.py
+++ b/python/dask_cudf/dask_cudf/tests/utils.py
@@ -48,3 +48,7 @@ def xfail_dask_expr(reason=_default_reason, lt_version=None):
     else:
         xfail = QUERY_PLANNING_ON
     return pytest.mark.xfail(xfail, reason=reason)
+
+
+def require_dask_expr(reason="requires dask-expr"):
+    return pytest.mark.skipif(not QUERY_PLANNING_ON, reason=reason)

From 049177839e79dd28c776b5edfb2fd3f6c1b884a2 Mon Sep 17 00:00:00 2001
From: Sebastian Berg <sebastianb@nvidia.com>
Date: Mon, 19 Aug 2024 17:05:16 +0200
Subject: [PATCH 688/842] MAINT: Adapt to numpy hiding flagsobject away
 (#16593)

Authors:
  - Sebastian Berg (https://github.com/seberg)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16593
---
 python/cudf/cudf/pandas/_wrappers/numpy.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/pandas/_wrappers/numpy.py b/python/cudf/cudf/pandas/_wrappers/numpy.py
index 3b012169676..90ac5198270 100644
--- a/python/cudf/cudf/pandas/_wrappers/numpy.py
+++ b/python/cudf/cudf/pandas/_wrappers/numpy.py
@@ -7,7 +7,7 @@
 import cupy
 import cupy._core.flags
 import numpy
-import numpy.core.multiarray
+from packaging import version
 
 from ..fast_slow_proxy import (
     _FastSlowAttribute,
@@ -141,10 +141,15 @@ def wrap_ndarray(cls, arr: cupy.ndarray | numpy.ndarray, constructor):
     },
 )
 
+if version.parse(numpy.__version__) >= version.parse("2.0"):
+    # NumPy 2 introduced `_core` and gives warnings for access to `core`.
+    from numpy._core.multiarray import flagsobj as _numpy_flagsobj
+else:
+    from numpy.core.multiarray import flagsobj as _numpy_flagsobj
 
 # Mapping flags between slow and fast types
 _ndarray_flags = make_intermediate_proxy_type(
     "_ndarray_flags",
     cupy._core.flags.Flags,
-    numpy.core.multiarray.flagsobj,
+    _numpy_flagsobj,
 )

From c516fc48694b6bdbeeb5b31ebdc760034efdb285 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 19 Aug 2024 06:55:03 -1000
Subject: [PATCH 689/842] Make ListColumn.__init__ strict (#16465)

This PR makes `ListColumn.__init__` strict putting restrictions on data, dtype, size and children so these columns cannot be constructed into to an invalid state. It also aligns the signature with the base class.

xref https://github.com/rapidsai/cudf/issues/16469

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16465
---
 python/cudf/cudf/core/column/column.py |  5 +-
 python/cudf/cudf/core/column/lists.py  | 64 +++++++++++++++++---------
 python/cudf/cudf/core/column/string.py |  1 +
 python/cudf/cudf/tests/test_list.py    |  2 +
 4 files changed, 47 insertions(+), 25 deletions(-)

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 19d6bf84d3f..0857727d23f 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -1625,12 +1625,13 @@ def build_column(
         )
     elif isinstance(dtype, ListDtype):
         return cudf.core.column.ListColumn(
-            size=size,
+            data=None,
+            size=size,  # type: ignore[arg-type]
             dtype=dtype,
             mask=mask,
             offset=offset,
             null_count=null_count,
-            children=children,
+            children=children,  # type: ignore[arg-type]
         )
     elif isinstance(dtype, IntervalDtype):
         return cudf.core.column.IntervalColumn(
diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py
index 1b7cd95b3d0..302f04a0e71 100644
--- a/python/cudf/cudf/core/column/lists.py
+++ b/python/cudf/cudf/core/column/lists.py
@@ -3,7 +3,7 @@
 from __future__ import annotations
 
 from functools import cached_property
-from typing import TYPE_CHECKING, Sequence
+from typing import TYPE_CHECKING, Sequence, cast
 
 import numpy as np
 import pandas as pd
@@ -29,30 +29,46 @@
 from cudf.api.types import _is_non_decimal_numeric_dtype, is_scalar
 from cudf.core.column import ColumnBase, as_column, column
 from cudf.core.column.methods import ColumnMethods, ParentType
+from cudf.core.column.numerical import NumericalColumn
 from cudf.core.dtypes import ListDtype
 from cudf.core.missing import NA
 
 if TYPE_CHECKING:
     from cudf._typing import ColumnBinaryOperand, ColumnLike, Dtype, ScalarLike
+    from cudf.core.buffer import Buffer
 
 
 class ListColumn(ColumnBase):
-    dtype: ListDtype
     _VALID_BINARY_OPERATIONS = {"__add__", "__radd__"}
 
     def __init__(
         self,
-        size,
-        dtype,
-        mask=None,
-        offset=0,
-        null_count=None,
-        children=(),
+        data: None,
+        size: int,
+        dtype: ListDtype,
+        mask: Buffer | None = None,
+        offset: int = 0,
+        null_count: int | None = None,
+        children: tuple[NumericalColumn, ColumnBase] = (),  # type: ignore[assignment]
     ):
+        if data is not None:
+            raise ValueError("data must be None")
+        if not isinstance(dtype, ListDtype):
+            raise ValueError("dtype must be a cudf.ListDtype")
+        if not (
+            len(children) == 2
+            and isinstance(children[0], NumericalColumn)
+            # TODO: Enforce int32_t (size_type) used in libcudf?
+            and children[0].dtype.kind == "i"
+            and isinstance(children[1], ColumnBase)
+        ):
+            raise ValueError(
+                "children must a tuple of 2 columns of (signed integer offsets, list values)"
+            )
         super().__init__(
-            None,
-            size,
-            dtype,
+            data=data,
+            size=size,
+            dtype=dtype,
             mask=mask,
             offset=offset,
             null_count=null_count,
@@ -131,7 +147,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
             raise TypeError("can only concatenate list to list")
 
     @property
-    def elements(self):
+    def elements(self) -> ColumnBase:
         """
         Column containing the elements of each list (may itself be a
         ListColumn)
@@ -139,11 +155,11 @@ def elements(self):
         return self.children[1]
 
     @property
-    def offsets(self):
+    def offsets(self) -> NumericalColumn:
         """
         Integer offsets to elements specifying each row of the ListColumn
         """
-        return self.children[0]
+        return cast(NumericalColumn, self.children[0])
 
     def to_arrow(self):
         offsets = self.offsets.to_arrow()
@@ -172,10 +188,9 @@ def set_base_data(self, value):
         else:
             super().set_base_data(value)
 
-    def set_base_children(self, value: tuple[ColumnBase, ...]):
+    def set_base_children(self, value: tuple[NumericalColumn, ColumnBase]):  # type: ignore[override]
         super().set_base_children(value)
-        _, values = value
-        self._dtype = cudf.ListDtype(element_type=values.dtype)
+        self._dtype = cudf.ListDtype(element_type=value[1].dtype)
 
     @property
     def __cuda_array_interface__(self):
@@ -196,12 +211,13 @@ def _with_type_metadata(
                 dtype.element_type
             )
             return ListColumn(
+                data=None,
                 dtype=dtype,
                 mask=self.base_mask,
                 size=self.size,
                 offset=self.offset,
                 null_count=self.null_count,
-                children=(self.base_children[0], elements),
+                children=(self.base_children[0], elements),  # type: ignore[arg-type]
             )
 
         return self
@@ -226,24 +242,25 @@ def from_sequences(
         """
         data_col = column.column_empty(0)
         mask_col = []
-        offset_col = [0]
+        offset_vals = [0]
         offset = 0
 
         # Build Data, Mask & Offsets
         for data in arbitrary:
             if cudf._lib.scalar._is_null_host_scalar(data):
                 mask_col.append(False)
-                offset_col.append(offset)
+                offset_vals.append(offset)
             else:
                 mask_col.append(True)
                 data_col = data_col.append(as_column(data))
                 offset += len(data)
-                offset_col.append(offset)
+                offset_vals.append(offset)
 
-        offset_col = column.as_column(offset_col, dtype=size_type_dtype)
+        offset_col = column.as_column(offset_vals, dtype=size_type_dtype)
 
         # Build ListColumn
         res = cls(
+            data=None,
             size=len(arbitrary),
             dtype=cudf.ListDtype(data_col.dtype),
             mask=cudf._lib.transform.bools_to_mask(as_column(mask_col)),
@@ -283,12 +300,13 @@ def _transform_leaves(self, func, *args, **kwargs) -> Self:
         for c in cc:
             o = c.children[0]
             lc = cudf.core.column.ListColumn(  # type: ignore
+                data=None,
                 size=c.size,
                 dtype=cudf.ListDtype(lc.dtype),
                 mask=c.mask,
                 offset=c.offset,
                 null_count=c.null_count,
-                children=(o, lc),
+                children=(o, lc),  # type: ignore[arg-type]
             )
         return lc
 
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index a710a9f46c2..6f7508822d4 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -549,6 +549,7 @@ def _split_by_character(self):
         offset_col = col.children[0]
 
         return cudf.core.column.ListColumn(
+            data=None,
             size=len(col),
             dtype=cudf.ListDtype(col.dtype),
             mask=col.mask,
diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py
index c4c883ca9f9..7d87fc73621 100644
--- a/python/cudf/cudf/tests/test_list.py
+++ b/python/cudf/cudf/tests/test_list.py
@@ -928,6 +928,7 @@ def test_empty_nested_list_uninitialized_offsets_memory_usage():
     col = column_empty(0, cudf.ListDtype(cudf.ListDtype("int64")))
     nested_col = col.children[1]
     empty_inner = type(nested_col)(
+        data=None,
         size=nested_col.size,
         dtype=nested_col.dtype,
         mask=nested_col.mask,
@@ -939,6 +940,7 @@ def test_empty_nested_list_uninitialized_offsets_memory_usage():
         ),
     )
     col_empty_offset = type(col)(
+        data=None,
         size=col.size,
         dtype=col.dtype,
         mask=col.mask,

From 074abcc0fa9eb9d2944b145f29fa02eb9edddc55 Mon Sep 17 00:00:00 2001
From: Nghia Truong <7416935+ttnghia@users.noreply.github.com>
Date: Mon, 19 Aug 2024 13:37:21 -0700
Subject: [PATCH 690/842] Add `public` qualifier for some member functions in
 Java class `Schema` (#16583)

This adds the public qualifier for some member functions of `Schema` class in Java code, allowing them to be accessed outside of the `ai.rapids.cudf` package such as from spark-rapids-jni or Spark plugin.

Java docs are also added for the newly became public functions as well as some existing public functions.

Authors:
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - Robert (Bobby) Evans (https://github.com/revans2)

URL: https://github.com/rapidsai/cudf/pull/16583
---
 java/src/main/java/ai/rapids/cudf/Schema.java | 56 +++++++++++++++----
 1 file changed, 46 insertions(+), 10 deletions(-)

diff --git a/java/src/main/java/ai/rapids/cudf/Schema.java b/java/src/main/java/ai/rapids/cudf/Schema.java
index 43603386649..76b2799aad6 100644
--- a/java/src/main/java/ai/rapids/cudf/Schema.java
+++ b/java/src/main/java/ai/rapids/cudf/Schema.java
@@ -120,7 +120,7 @@ private void flattenIfNeeded() {
 
   private int flattenedLength(int startingLength) {
     if (childSchemas != null) {
-      for (Schema child: childSchemas) {
+      for (Schema child : childSchemas) {
         startingLength++;
         startingLength = child.flattenedLength(startingLength);
       }
@@ -150,11 +150,19 @@ public static Builder builder() {
     return new Builder(DType.STRUCT);
   }
 
+  /**
+   * Get names of the columns flattened from all levels in schema by depth-first traversal.
+   * @return An array containing names of all columns in schema.
+   */
   public String[] getFlattenedColumnNames() {
     flattenIfNeeded();
     return flattenedNames;
   }
 
+  /**
+   * Get names of the top level child columns in schema.
+   * @return An array containing names of top level child columns.
+   */
   public String[] getColumnNames() {
     if (childNames == null) {
       return null;
@@ -162,6 +170,10 @@ public String[] getColumnNames() {
     return childNames.toArray(new String[childNames.size()]);
   }
 
+  /**
+   * Check if the schema is nested (i.e., top level type is LIST or STRUCT).
+   * @return true if the schema is nested, false otherwise.
+   */
   public boolean isNested() {
     return childSchemas != null && childSchemas.size() > 0;
   }
@@ -173,7 +185,7 @@ public boolean isNested() {
    */
   public boolean hasNestedChildren() {
     if (childSchemas != null) {
-      for (Schema child: childSchemas) {
+      for (Schema child : childSchemas) {
         if (child.isNested()) {
           return true;
         }
@@ -182,7 +194,11 @@ public boolean hasNestedChildren() {
     return false;
   }
 
-  int[] getFlattenedTypeIds() {
+  /**
+   * Get type ids of the columns flattened from all levels in schema by depth-first traversal.
+   * @return An array containing type ids of all columns in schema.
+   */
+  public int[] getFlattenedTypeIds() {
     flattenIfNeeded();
     if (flattenedTypes == null) {
       return null;
@@ -194,7 +210,11 @@ int[] getFlattenedTypeIds() {
     return ret;
   }
 
-  int[] getFlattenedTypeScales() {
+  /**
+   * Get scales of the columns' types flattened from all levels in schema by depth-first traversal.
+   * @return An array containing type scales of all columns in schema.
+   */
+  public int[] getFlattenedTypeScales() {
     flattenIfNeeded();
     if (flattenedTypes == null) {
       return null;
@@ -206,11 +226,19 @@ int[] getFlattenedTypeScales() {
     return ret;
   }
 
-  DType[] getFlattenedTypes() {
+  /**
+   * Get the types of the columns in schema flattened from all levels by depth-first traversal.
+   * @return An array containing types of all columns in schema.
+   */
+  public DType[] getFlattenedTypes() {
     flattenIfNeeded();
     return flattenedTypes;
   }
 
+  /**
+   * Get types of the top level child columns in schema.
+   * @return An array containing types of top level child columns.
+   */
   public DType[] getChildTypes() {
     if (childSchemas == null) {
       return null;
@@ -222,6 +250,10 @@ public DType[] getChildTypes() {
     return ret;
   }
 
+  /**
+   * Get number of top level child columns in schema.
+   * @return Number of child columns.
+   */
   public int getNumChildren() {
     if (childSchemas == null) {
       return 0;
@@ -229,7 +261,11 @@ public int getNumChildren() {
     return childSchemas.size();
   }
 
-  int[] getFlattenedNumChildren() {
+  /**
+   * Get numbers of child columns for each level in schema.
+   * @return Numbers of child columns for all levels flattened by depth-first traversal.
+   */
+  public int[] getFlattenedNumChildren() {
     flattenIfNeeded();
     return flattenedCounts;
   }
@@ -253,7 +289,7 @@ public boolean isStructOrHasStructDescendant() {
 
   public HostColumnVector.DataType asHostDataType() {
     if (topLevelType == DType.LIST) {
-      assert(childSchemas != null && childSchemas.size() == 1);
+      assert (childSchemas != null && childSchemas.size() == 1);
       HostColumnVector.DataType element = childSchemas.get(0).asHostDataType();
       return new HostColumnVector.ListType(true, element);
     } else if (topLevelType == DType.STRUCT) {
@@ -261,7 +297,7 @@ public HostColumnVector.DataType asHostDataType() {
         return new HostColumnVector.StructType(true);
       } else {
         List<HostColumnVector.DataType> childTypes =
-                childSchemas.stream().map(Schema::asHostDataType).collect(Collectors.toList());
+            childSchemas.stream().map(Schema::asHostDataType).collect(Collectors.toList());
         return new HostColumnVector.StructType(true, childTypes);
       }
     } else {
@@ -269,7 +305,7 @@ public HostColumnVector.DataType asHostDataType() {
     }
   }
 
-    public static class Builder {
+  public static class Builder {
     private final DType topLevelType;
     private final List<String> names;
     private final List<Builder> types;
@@ -326,7 +362,7 @@ public Schema build() {
       List<Schema> children = null;
       if (types != null) {
         children = new ArrayList<>(types.size());
-        for (Builder b: types) {
+        for (Builder b : types) {
           children.add(b.build());
         }
       }

From 79a5a97b2662bab6862ed895a6d802edd17d2502 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Mon, 19 Aug 2024 13:59:10 -0700
Subject: [PATCH 691/842] Remove NativeFile support from cudf Python (#16589)

This PR removes all support for passing NativeFile objects through cudf's I/O routines.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Richard (Rick) Zamora (https://github.com/rjzamora)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16589
---
 python/cudf/cudf/_lib/csv.pyx                 |   9 +-
 python/cudf/cudf/_lib/orc.pyx                 |  10 -
 python/cudf/cudf/_lib/parquet.pyx             |  43 +---
 python/cudf/cudf/io/csv.py                    |  11 +-
 python/cudf/cudf/io/orc.py                    |  33 +--
 python/cudf/cudf/io/parquet.py                | 102 ++------
 python/cudf/cudf/tests/test_csv.py            |  13 -
 python/cudf/cudf/tests/test_gcs.py            |   6 +-
 python/cudf/cudf/tests/test_parquet.py        |  33 +--
 python/cudf/cudf/tests/test_s3.py             | 168 +++----------
 python/cudf/cudf/utils/ioutils.py             | 234 ++----------------
 python/cudf/cudf/utils/utils.py               |  26 --
 .../dask_cudf/dask_cudf/io/tests/test_s3.py   |  48 ----
 python/pylibcudf/pylibcudf/io/datasource.pxd  |   7 -
 python/pylibcudf/pylibcudf/io/datasource.pyx  |  24 --
 15 files changed, 86 insertions(+), 681 deletions(-)

diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx
index e9aa97ecbc9..a90fe0f9ac6 100644
--- a/python/cudf/cudf/_lib/csv.pyx
+++ b/python/cudf/cudf/_lib/csv.pyx
@@ -7,7 +7,6 @@ from libcpp.utility cimport move
 from libcpp.vector cimport vector
 
 cimport pylibcudf.libcudf.types as libcudf_types
-from pylibcudf.io.datasource cimport Datasource, NativeFileDatasource
 
 from cudf._lib.types cimport dtype_to_pylibcudf_type
 
@@ -35,8 +34,6 @@ from pylibcudf.libcudf.table.table_view cimport table_view
 from cudf._lib.io.utils cimport make_sink_info
 from cudf._lib.utils cimport data_from_pylibcudf_io, table_view_from_table
 
-from pyarrow.lib import NativeFile
-
 import pylibcudf as plc
 
 from cudf.api.types import is_hashable
@@ -127,9 +124,7 @@ def read_csv(
     cudf.read_csv
     """
 
-    if not isinstance(datasource, (BytesIO, StringIO, bytes,
-                                   Datasource,
-                                   NativeFile)):
+    if not isinstance(datasource, (BytesIO, StringIO, bytes)):
         if not os.path.isfile(datasource):
             raise FileNotFoundError(
                 errno.ENOENT, os.strerror(errno.ENOENT), datasource
@@ -139,8 +134,6 @@ def read_csv(
         datasource = datasource.read().encode()
     elif isinstance(datasource, str) and not os.path.isfile(datasource):
         datasource = datasource.encode()
-    elif isinstance(datasource, NativeFile):
-        datasource = NativeFileDatasource(datasource)
 
     validate_args(delimiter, sep, delim_whitespace, decimal, thousands,
                   nrows, skipfooter, byte_range, skiprows)
diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx
index d506dcd4346..adeba6fffb1 100644
--- a/python/cudf/cudf/_lib/orc.pyx
+++ b/python/cudf/cudf/_lib/orc.pyx
@@ -22,7 +22,6 @@ except ImportError:
     import json
 
 cimport pylibcudf.libcudf.io.types as cudf_io_types
-from pylibcudf.io.datasource cimport NativeFileDatasource
 from pylibcudf.libcudf.io.data_sink cimport data_sink
 from pylibcudf.libcudf.io.orc cimport (
     chunked_orc_writer_options,
@@ -71,8 +70,6 @@ from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES
 from cudf._lib.types cimport underlying_type_t_type_id
 from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table
 
-from pyarrow.lib import NativeFile
-
 from cudf._lib.utils import _index_level_name, generate_pandas_metadata
 
 
@@ -204,10 +201,6 @@ cpdef read_parsed_orc_statistics(filepath_or_buffer):
     cudf.io.orc.read_orc_statistics
     """
 
-    # Handle NativeFile input
-    if isinstance(filepath_or_buffer, NativeFile):
-        filepath_or_buffer = NativeFileDatasource(filepath_or_buffer)
-
     cdef parsed_orc_statistics parsed = (
         libcudf_read_parsed_orc_statistics(make_source_info([filepath_or_buffer]))
     )
@@ -490,9 +483,6 @@ cdef orc_reader_options make_orc_reader_options(
     bool use_index
 ) except*:
 
-    for i, datasource in enumerate(filepaths_or_buffers):
-        if isinstance(datasource, NativeFile):
-            filepaths_or_buffers[i] = NativeFileDatasource(datasource)
     cdef vector[vector[size_type]] strps = stripes
     cdef orc_reader_options opts
     cdef source_info src = make_source_info(filepaths_or_buffers)
diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
index 4bfb79ff651..c874a51e220 100644
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ b/python/cudf/cudf/_lib/parquet.pyx
@@ -34,7 +34,6 @@ from libcpp.vector cimport vector
 cimport pylibcudf.libcudf.io.data_sink as cudf_io_data_sink
 cimport pylibcudf.libcudf.io.types as cudf_io_types
 from pylibcudf.expressions cimport Expression
-from pylibcudf.io.datasource cimport NativeFileDatasource
 from pylibcudf.io.parquet cimport ChunkedParquetReader
 from pylibcudf.libcudf.io.parquet cimport (
     chunked_parquet_writer_options,
@@ -62,8 +61,6 @@ from cudf._lib.io.utils cimport (
 )
 from cudf._lib.utils cimport table_view_from_table
 
-from pyarrow.lib import NativeFile
-
 import pylibcudf as plc
 
 from pylibcudf cimport Table
@@ -133,7 +130,6 @@ cdef object _process_metadata(object df,
                               list per_file_user_data,
                               object row_groups,
                               object filepaths_or_buffers,
-                              list pa_buffers,
                               bool allow_range_index,
                               bool use_pandas_metadata,
                               size_type nrows=-1,
@@ -199,9 +195,7 @@ cdef object _process_metadata(object df,
                     pa.parquet.read_metadata(
                         # Pyarrow cannot read directly from bytes
                         io.BytesIO(s) if isinstance(s, bytes) else s
-                    ) for s in (
-                        pa_buffers or filepaths_or_buffers
-                    )
+                    ) for s in filepaths_or_buffers
                 ]
 
                 filtered_idx = []
@@ -274,27 +268,13 @@ def read_parquet_chunked(
     size_type nrows=-1,
     int64_t skip_rows=0
 ):
-    # Convert NativeFile buffers to NativeFileDatasource,
-    # but save original buffers in case we need to use
-    # pyarrow for metadata processing
-    # (See: https://github.com/rapidsai/cudf/issues/9599)
-
-    pa_buffers = []
-
-    new_bufs = []
-    for i, datasource in enumerate(filepaths_or_buffers):
-        if isinstance(datasource, NativeFile):
-            new_bufs.append(NativeFileDatasource(datasource))
-        else:
-            new_bufs.append(datasource)
-
     # Note: If this function ever takes accepts filters
     # allow_range_index needs to be False when a filter is passed
     # (see read_parquet)
     allow_range_index = columns is not None and len(columns) != 0
 
     reader = ChunkedParquetReader(
-        plc.io.SourceInfo(new_bufs),
+        plc.io.SourceInfo(filepaths_or_buffers),
         columns,
         row_groups,
         use_pandas_metadata,
@@ -333,7 +313,7 @@ def read_parquet_chunked(
     )
     df = _process_metadata(df, column_names, child_names,
                            per_file_user_data, row_groups,
-                           filepaths_or_buffers, pa_buffers,
+                           filepaths_or_buffers,
                            allow_range_index, use_pandas_metadata,
                            nrows=nrows, skip_rows=skip_rows)
     return df
@@ -356,16 +336,6 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
     cudf.io.parquet.to_parquet
     """
 
-    # Convert NativeFile buffers to NativeFileDatasource,
-    # but save original buffers in case we need to use
-    # pyarrow for metadata processing
-    # (See: https://github.com/rapidsai/cudf/issues/9599)
-    pa_buffers = []
-    for i, datasource in enumerate(filepaths_or_buffers):
-        if isinstance(datasource, NativeFile):
-            pa_buffers.append(datasource)
-            filepaths_or_buffers[i] = NativeFileDatasource(datasource)
-
     allow_range_index = True
     if columns is not None and len(columns) == 0 or filters:
         allow_range_index = False
@@ -389,7 +359,7 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
 
     df = _process_metadata(df, tbl_w_meta.column_names(include_children=False),
                            tbl_w_meta.child_names, tbl_w_meta.per_file_user_data,
-                           row_groups, filepaths_or_buffers, pa_buffers,
+                           row_groups, filepaths_or_buffers,
                            allow_range_index, use_pandas_metadata,
                            nrows=nrows, skip_rows=skip_rows)
     return df
@@ -403,11 +373,6 @@ cpdef read_parquet_metadata(filepaths_or_buffers):
     cudf.io.parquet.read_parquet
     cudf.io.parquet.to_parquet
     """
-    # Convert NativeFile buffers to NativeFileDatasource
-    for i, datasource in enumerate(filepaths_or_buffers):
-        if isinstance(datasource, NativeFile):
-            filepaths_or_buffers[i] = NativeFileDatasource(datasource)
-
     cdef cudf_io_types.source_info source = make_source_info(filepaths_or_buffers)
 
     args = move(source)
diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py
index 0f2820a01e9..e61fc5063dc 100644
--- a/python/cudf/cudf/io/csv.py
+++ b/python/cudf/cudf/io/csv.py
@@ -5,7 +5,6 @@
 from io import BytesIO, StringIO
 
 import numpy as np
-from pyarrow.lib import NativeFile
 
 import cudf
 from cudf import _lib as libcudf
@@ -50,7 +49,6 @@ def read_csv(
     comment=None,
     delim_whitespace=False,
     byte_range=None,
-    use_python_file_object=None,
     storage_options=None,
     bytes_per_thread=None,
 ):
@@ -63,12 +61,6 @@ def read_csv(
             FutureWarning,
         )
 
-    if use_python_file_object and bytes_per_thread is not None:
-        raise ValueError(
-            "bytes_per_thread is only supported when "
-            "`use_python_file_object=False`"
-        )
-
     if bytes_per_thread is None:
         bytes_per_thread = ioutils._BYTES_PER_THREAD_DEFAULT
 
@@ -84,8 +76,7 @@ def read_csv(
     filepath_or_buffer, compression = ioutils.get_reader_filepath_or_buffer(
         path_or_data=filepath_or_buffer,
         compression=compression,
-        iotypes=(BytesIO, StringIO, NativeFile),
-        use_python_file_object=use_python_file_object,
+        iotypes=(BytesIO, StringIO),
         storage_options=storage_options,
         bytes_per_thread=bytes_per_thread,
     )
diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py
index 289292b5182..4f04caafc5d 100644
--- a/python/cudf/cudf/io/orc.py
+++ b/python/cudf/cudf/io/orc.py
@@ -10,7 +10,6 @@
 from cudf._lib import orc as liborc
 from cudf.api.types import is_list_like
 from cudf.utils import ioutils
-from cudf.utils.utils import maybe_filter_deprecation
 
 
 def _make_empty_df(filepath_or_buffer, columns):
@@ -281,7 +280,6 @@ def read_orc(
     num_rows=None,
     use_index=True,
     timestamp_type=None,
-    use_python_file_object=None,
     storage_options=None,
     bytes_per_thread=None,
 ):
@@ -321,9 +319,6 @@ def read_orc(
             )
 
     filepaths_or_buffers = []
-    have_nativefile = any(
-        isinstance(source, pa.NativeFile) for source in filepath_or_buffer
-    )
     for source in filepath_or_buffer:
         if ioutils.is_directory(
             path_or_data=source, storage_options=storage_options
@@ -339,7 +334,6 @@ def read_orc(
         tmp_source, compression = ioutils.get_reader_filepath_or_buffer(
             path_or_data=source,
             compression=None,
-            use_python_file_object=use_python_file_object,
             storage_options=storage_options,
             bytes_per_thread=bytes_per_thread,
         )
@@ -364,24 +358,17 @@ def read_orc(
             stripes = selected_stripes
 
     if engine == "cudf":
-        # Don't want to warn if use_python_file_object causes us to get
-        # a NativeFile (there is a separate deprecation warning for that)
-        with maybe_filter_deprecation(
-            not have_nativefile,
-            message="Support for reading pyarrow's NativeFile is deprecated",
-            category=FutureWarning,
-        ):
-            return DataFrame._from_data(
-                *liborc.read_orc(
-                    filepaths_or_buffers,
-                    columns,
-                    stripes,
-                    skiprows,
-                    num_rows,
-                    use_index,
-                    timestamp_type,
-                )
+        return DataFrame._from_data(
+            *liborc.read_orc(
+                filepaths_or_buffers,
+                columns,
+                stripes,
+                skiprows,
+                num_rows,
+                use_index,
+                timestamp_type,
             )
+        )
     else:
         from pyarrow import orc
 
diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
index 4a419a2fbb6..fac51a9e471 100644
--- a/python/cudf/cudf/io/parquet.py
+++ b/python/cudf/cudf/io/parquet.py
@@ -15,7 +15,6 @@
 
 import numpy as np
 import pandas as pd
-import pyarrow as pa
 from pyarrow import dataset as ds
 
 import cudf
@@ -24,7 +23,6 @@
 from cudf.core.column import as_column, build_categorical_column, column_empty
 from cudf.utils import ioutils
 from cudf.utils.performance_tracking import _performance_tracking
-from cudf.utils.utils import maybe_filter_deprecation
 
 BYTE_SIZES = {
     "kb": 1000,
@@ -352,8 +350,6 @@ def read_parquet_metadata(filepath_or_buffer):
             path_or_data=source,
             compression=None,
             fs=fs,
-            use_python_file_object=None,
-            open_file_options=None,
             storage_options=None,
             bytes_per_thread=None,
         )
@@ -534,9 +530,7 @@ def read_parquet(
     filters=None,
     row_groups=None,
     use_pandas_metadata=True,
-    use_python_file_object=None,
     categorical_partitions=True,
-    open_file_options=None,
     bytes_per_thread=None,
     dataset_kwargs=None,
     nrows=None,
@@ -549,16 +543,6 @@ def read_parquet(
         raise ValueError(
             f"Only supported engines are {{'cudf', 'pyarrow'}}, got {engine=}"
         )
-    # Do not allow the user to set file-opening options
-    # when `use_python_file_object=False` is specified
-    if use_python_file_object is False:
-        if open_file_options:
-            raise ValueError(
-                "open_file_options is not currently supported when "
-                "use_python_file_object is set to False."
-            )
-        open_file_options = {}
-
     if bytes_per_thread is None:
         bytes_per_thread = ioutils._BYTES_PER_THREAD_DEFAULT
 
@@ -612,23 +596,11 @@ def read_parquet(
     filepath_or_buffer = paths if paths else filepath_or_buffer
 
     filepaths_or_buffers = []
-    if use_python_file_object:
-        open_file_options = _default_open_file_options(
-            open_file_options=open_file_options,
-            columns=columns,
-            row_groups=row_groups,
-            fs=fs,
-        )
-    have_nativefile = any(
-        isinstance(source, pa.NativeFile) for source in filepath_or_buffer
-    )
     for source in filepath_or_buffer:
         tmp_source, compression = ioutils.get_reader_filepath_or_buffer(
             path_or_data=source,
             compression=None,
             fs=fs,
-            use_python_file_object=use_python_file_object,
-            open_file_options=open_file_options,
             storage_options=storage_options,
             bytes_per_thread=bytes_per_thread,
         )
@@ -669,28 +641,20 @@ def read_parquet(
         )
 
     # Convert parquet data to a cudf.DataFrame
-
-    # Don't want to warn if use_python_file_object causes us to get
-    # a NativeFile (there is a separate deprecation warning for that)
-    with maybe_filter_deprecation(
-        not have_nativefile,
-        message="Support for reading pyarrow's NativeFile is deprecated",
-        category=FutureWarning,
-    ):
-        df = _parquet_to_frame(
-            filepaths_or_buffers,
-            engine,
-            *args,
-            columns=columns,
-            row_groups=row_groups,
-            use_pandas_metadata=use_pandas_metadata,
-            partition_keys=partition_keys,
-            partition_categories=partition_categories,
-            dataset_kwargs=dataset_kwargs,
-            nrows=nrows,
-            skip_rows=skip_rows,
-            **kwargs,
-        )
+    df = _parquet_to_frame(
+        filepaths_or_buffers,
+        engine,
+        *args,
+        columns=columns,
+        row_groups=row_groups,
+        use_pandas_metadata=use_pandas_metadata,
+        partition_keys=partition_keys,
+        partition_categories=partition_categories,
+        dataset_kwargs=dataset_kwargs,
+        nrows=nrows,
+        skip_rows=skip_rows,
+        **kwargs,
+    )
     # Apply filters row-wise (if any are defined), and return
     df = _apply_post_filters(df, filters)
     if projected_columns:
@@ -1570,44 +1534,6 @@ def __exit__(self, *args):
         self.close()
 
 
-def _default_open_file_options(
-    open_file_options, columns, row_groups, fs=None
-):
-    """
-    Set default fields in open_file_options.
-
-    Copies and updates `open_file_options` to
-    include column and row-group information
-    under the "precache_options" key. By default,
-    we set "method" to "parquet", but precaching
-    will be disabled if the user chooses `method=None`
-
-    Parameters
-    ----------
-    open_file_options : dict or None
-    columns : list
-    row_groups : list
-    fs : fsspec.AbstractFileSystem, Optional
-    """
-    if fs and ioutils._is_local_filesystem(fs):
-        # Quick return for local fs
-        return open_file_options or {}
-    # Assume remote storage if `fs` was not specified
-    open_file_options = (open_file_options or {}).copy()
-    precache_options = open_file_options.pop("precache_options", {}).copy()
-    if precache_options.get("method", "parquet") == "parquet":
-        precache_options.update(
-            {
-                "method": "parquet",
-                "engine": precache_options.get("engine", "pyarrow"),
-                "columns": columns,
-                "row_groups": row_groups,
-            }
-        )
-    open_file_options["precache_options"] = precache_options
-    return open_file_options
-
-
 def _hive_dirname(name, val):
     # Simple utility to produce hive directory name
     if pd.isna(val):
diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
index 6a21cb1b9d7..40ba415e681 100644
--- a/python/cudf/cudf/tests/test_csv.py
+++ b/python/cudf/cudf/tests/test_csv.py
@@ -13,7 +13,6 @@
 import numpy as np
 import pandas as pd
 import pytest
-from pyarrow import fs as pa_fs
 
 import cudf
 from cudf import read_csv
@@ -1080,18 +1079,6 @@ def test_csv_reader_filepath_or_buffer(tmpdir, path_or_buf, src):
     assert_eq(expect, got)
 
 
-def test_csv_reader_arrow_nativefile(path_or_buf):
-    # Check that we can read a file opened with the
-    # Arrow FileSystem interface
-    expect = cudf.read_csv(path_or_buf("filepath"))
-    fs, path = pa_fs.FileSystem.from_uri(path_or_buf("filepath"))
-    with pytest.warns(FutureWarning):
-        with fs.open_input_file(path) as fil:
-            got = cudf.read_csv(fil)
-
-    assert_eq(expect, got)
-
-
 def test_small_zip(tmpdir):
     df = pd.DataFrame(
         {
diff --git a/python/cudf/cudf/tests/test_gcs.py b/python/cudf/cudf/tests/test_gcs.py
index 28fdfb5c2f1..82ecd356bbf 100644
--- a/python/cudf/cudf/tests/test_gcs.py
+++ b/python/cudf/cudf/tests/test_gcs.py
@@ -42,12 +42,8 @@ def mock_size(*args):
     monkeypatch.setattr(gcsfs.core.GCSFileSystem, "size", mock_size)
 
     # Test read from explicit path.
-    # Since we are monkey-patching, we cannot use
-    # use_python_file_object=True, because the pyarrow
-    # `open_input_file` command will fail (since it doesn't
-    # use the monkey-patched `open` definition)
     with pytest.warns(FutureWarning):
-        got = cudf.read_csv(f"gcs://{fpath}", use_python_file_object=False)
+        got = cudf.read_csv(f"gcs://{fpath}")
     assert_eq(pdf, got)
 
     # AbstractBufferedFile -> PythonFile conversion
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index 879a2c50db7..db4f1c9c8bd 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -19,7 +19,7 @@
 import pytest
 from fsspec.core import get_fs_token_paths
 from packaging import version
-from pyarrow import fs as pa_fs, parquet as pq
+from pyarrow import parquet as pq
 
 import cudf
 from cudf._lib.parquet import read_parquet_chunked
@@ -705,40 +705,17 @@ def test_parquet_reader_filepath_or_buffer(parquet_path_or_buf, src):
     assert_eq(expect, got)
 
 
-def test_parquet_reader_arrow_nativefile(parquet_path_or_buf):
-    # Check that we can read a file opened with the
-    # Arrow FileSystem interface
-    expect = cudf.read_parquet(parquet_path_or_buf("filepath"))
-    fs, path = pa_fs.FileSystem.from_uri(parquet_path_or_buf("filepath"))
-    with fs.open_input_file(path) as fil:
-        with pytest.warns(FutureWarning):
-            got = cudf.read_parquet(fil)
-
-    assert_eq(expect, got)
-
-
-@pytest.mark.parametrize("use_python_file_object", [True, False])
-def test_parquet_reader_use_python_file_object(
-    parquet_path_or_buf, use_python_file_object
-):
-    # Check that the non-default `use_python_file_object=True`
-    # option works as expected
+def test_parquet_reader_file_types(parquet_path_or_buf):
     expect = cudf.read_parquet(parquet_path_or_buf("filepath"))
     fs, _, paths = get_fs_token_paths(parquet_path_or_buf("filepath"))
 
     # Pass open fsspec file
-    with pytest.warns(FutureWarning):
-        with fs.open(paths[0], mode="rb") as fil:
-            got1 = cudf.read_parquet(
-                fil, use_python_file_object=use_python_file_object
-            )
+    with fs.open(paths[0], mode="rb") as fil:
+        got1 = cudf.read_parquet(fil)
     assert_eq(expect, got1)
 
     # Pass path only
-    with pytest.warns(FutureWarning):
-        got2 = cudf.read_parquet(
-            paths[0], use_python_file_object=use_python_file_object
-        )
+    got2 = cudf.read_parquet(paths[0])
     assert_eq(expect, got2)
 
 
diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py
index 3ae318d3bf5..6579fd23634 100644
--- a/python/cudf/cudf/tests/test_s3.py
+++ b/python/cudf/cudf/tests/test_s3.py
@@ -7,7 +7,6 @@
 
 import numpy as np
 import pandas as pd
-import pyarrow.fs as pa_fs
 import pytest
 from fsspec.core import get_fs_token_paths
 
@@ -138,48 +137,17 @@ def test_read_csv(s3_base, s3so, pdf, bytes_per_thread):
     buffer = pdf.to_csv(index=False)
 
     # Use fsspec file object
-    with pytest.warns(FutureWarning):
-        with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}):
-            got = cudf.read_csv(
-                f"s3://{bucket}/{fname}",
-                storage_options=s3so,
-                bytes_per_thread=bytes_per_thread,
-                use_python_file_object=False,
-            )
-    assert_eq(pdf, got)
-
-    # Use Arrow PythonFile object
-    with pytest.warns(FutureWarning):
-        with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}):
-            got = cudf.read_csv(
-                f"s3://{bucket}/{fname}",
-                storage_options=s3so,
-                use_python_file_object=True,
-            )
-    assert_eq(pdf, got)
-
-
-def test_read_csv_arrow_nativefile(s3_base, s3so, pdf):
-    # Write to buffer
-    fname = "test_csv_reader_arrow_nativefile.csv"
-    bucket = "csv"
-    buffer = pdf.to_csv(index=False)
     with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}):
-        fs = pa_fs.S3FileSystem(
-            endpoint_override=s3so["client_kwargs"]["endpoint_url"],
+        got = cudf.read_csv(
+            f"s3://{bucket}/{fname}",
+            storage_options=s3so,
+            bytes_per_thread=bytes_per_thread,
         )
-        with pytest.warns(FutureWarning):
-            with fs.open_input_file(f"{bucket}/{fname}") as fil:
-                got = cudf.read_csv(fil)
-
     assert_eq(pdf, got)
 
 
 @pytest.mark.parametrize("bytes_per_thread", [32, 1024])
-@pytest.mark.parametrize("use_python_file_object", [True, False])
-def test_read_csv_byte_range(
-    s3_base, s3so, pdf, bytes_per_thread, use_python_file_object
-):
+def test_read_csv_byte_range(s3_base, s3so, pdf, bytes_per_thread):
     # Write to buffer
     fname = "test_csv_reader_byte_range.csv"
     bucket = "csv"
@@ -187,18 +155,14 @@ def test_read_csv_byte_range(
 
     # Use fsspec file object
     with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}):
-        with pytest.warns(FutureWarning):
-            got = cudf.read_csv(
-                f"s3://{bucket}/{fname}",
-                storage_options=s3so,
-                byte_range=(74, 73),
-                bytes_per_thread=bytes_per_thread
-                if not use_python_file_object
-                else None,
-                header=None,
-                names=["Integer", "Float", "Integer2", "String", "Boolean"],
-                use_python_file_object=use_python_file_object,
-            )
+        got = cudf.read_csv(
+            f"s3://{bucket}/{fname}",
+            storage_options=s3so,
+            byte_range=(74, 73),
+            bytes_per_thread=bytes_per_thread,
+            header=None,
+            names=["Integer", "Float", "Integer2", "String", "Boolean"],
+        )
 
     assert_eq(pdf.iloc[-2:].reset_index(drop=True), got)
 
@@ -226,16 +190,12 @@ def test_write_csv(s3_base, s3so, pdf, chunksize):
 
 @pytest.mark.parametrize("bytes_per_thread", [32, 1024])
 @pytest.mark.parametrize("columns", [None, ["Float", "String"]])
-@pytest.mark.parametrize("precache", [None, "parquet"])
-@pytest.mark.parametrize("use_python_file_object", [True, False])
 def test_read_parquet(
     s3_base,
     s3so,
     pdf,
     bytes_per_thread,
     columns,
-    precache,
-    use_python_file_object,
 ):
     fname = "test_parquet_reader.parquet"
     bucket = "parquet"
@@ -245,19 +205,12 @@ def test_read_parquet(
     # Check direct path handling
     buffer.seek(0)
     with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}):
-        with pytest.warns(FutureWarning):
-            got1 = cudf.read_parquet(
-                f"s3://{bucket}/{fname}",
-                open_file_options=(
-                    {"precache_options": {"method": precache}}
-                    if use_python_file_object
-                    else None
-                ),
-                storage_options=s3so,
-                bytes_per_thread=bytes_per_thread,
-                columns=columns,
-                use_python_file_object=use_python_file_object,
-            )
+        got1 = cudf.read_parquet(
+            f"s3://{bucket}/{fname}",
+            storage_options=s3so,
+            bytes_per_thread=bytes_per_thread,
+            columns=columns,
+        )
     expect = pdf[columns] if columns else pdf
     assert_eq(expect, got1)
 
@@ -268,13 +221,11 @@ def test_read_parquet(
             f"s3://{bucket}/{fname}", storage_options=s3so
         )[0]
         with fs.open(f"s3://{bucket}/{fname}", mode="rb") as f:
-            with pytest.warns(FutureWarning):
-                got2 = cudf.read_parquet(
-                    f,
-                    bytes_per_thread=bytes_per_thread,
-                    columns=columns,
-                    use_python_file_object=use_python_file_object,
-                )
+            got2 = cudf.read_parquet(
+                f,
+                bytes_per_thread=bytes_per_thread,
+                columns=columns,
+            )
     assert_eq(expect, got2)
 
 
@@ -350,28 +301,7 @@ def test_read_parquet_multi_file(s3_base, s3so, pdf):
     assert_eq(expect, got)
 
 
-@pytest.mark.parametrize("columns", [None, ["Float", "String"]])
-def test_read_parquet_arrow_nativefile(s3_base, s3so, pdf, columns):
-    # Write to buffer
-    fname = "test_parquet_reader_arrow_nativefile.parquet"
-    bucket = "parquet"
-    buffer = BytesIO()
-    pdf.to_parquet(path=buffer)
-    buffer.seek(0)
-    with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}):
-        with pytest.warns(FutureWarning):
-            fs = pa_fs.S3FileSystem(
-                endpoint_override=s3so["client_kwargs"]["endpoint_url"],
-            )
-            with fs.open_input_file(f"{bucket}/{fname}") as fil:
-                got = cudf.read_parquet(fil, columns=columns)
-
-    expect = pdf[columns] if columns else pdf
-    assert_eq(expect, got)
-
-
-@pytest.mark.parametrize("precache", [None, "parquet"])
-def test_read_parquet_filters(s3_base, s3so, pdf_ext, precache):
+def test_read_parquet_filters(s3_base, s3so, pdf_ext):
     fname = "test_parquet_reader_filters.parquet"
     bucket = "parquet"
     buffer = BytesIO()
@@ -379,13 +309,11 @@ def test_read_parquet_filters(s3_base, s3so, pdf_ext, precache):
     buffer.seek(0)
     filters = [("String", "==", "Omega")]
     with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}):
-        with pytest.warns(FutureWarning):
-            got = cudf.read_parquet(
-                f"s3://{bucket}/{fname}",
-                storage_options=s3so,
-                filters=filters,
-                open_file_options={"precache_options": {"method": precache}},
-            )
+        got = cudf.read_parquet(
+            f"s3://{bucket}/{fname}",
+            storage_options=s3so,
+            filters=filters,
+        )
 
     # All row-groups should be filtered out
     assert_eq(pdf_ext.iloc[:0], got.reset_index(drop=True))
@@ -445,33 +373,8 @@ def test_read_json(s3_base, s3so):
     assert_eq(expect, got)
 
 
-@pytest.mark.parametrize("use_python_file_object", [False, True])
-@pytest.mark.parametrize("columns", [None, ["string1"]])
-def test_read_orc(s3_base, s3so, datadir, use_python_file_object, columns):
-    source_file = str(datadir / "orc" / "TestOrcFile.testSnappy.orc")
-    fname = "test_orc_reader.orc"
-    bucket = "orc"
-    expect = pd.read_orc(source_file)
-
-    with open(source_file, "rb") as f:
-        buffer = f.read()
-
-    with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}):
-        with pytest.warns(FutureWarning):
-            got = cudf.read_orc(
-                f"s3://{bucket}/{fname}",
-                columns=columns,
-                storage_options=s3so,
-                use_python_file_object=use_python_file_object,
-            )
-
-    if columns:
-        expect = expect[columns]
-    assert_eq(expect, got)
-
-
 @pytest.mark.parametrize("columns", [None, ["string1"]])
-def test_read_orc_arrow_nativefile(s3_base, s3so, datadir, columns):
+def test_read_orc(s3_base, s3so, datadir, columns):
     source_file = str(datadir / "orc" / "TestOrcFile.testSnappy.orc")
     fname = "test_orc_reader.orc"
     bucket = "orc"
@@ -481,12 +384,11 @@ def test_read_orc_arrow_nativefile(s3_base, s3so, datadir, columns):
         buffer = f.read()
 
     with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}):
-        fs = pa_fs.S3FileSystem(
-            endpoint_override=s3so["client_kwargs"]["endpoint_url"],
+        got = cudf.read_orc(
+            f"s3://{bucket}/{fname}",
+            columns=columns,
+            storage_options=s3so,
         )
-        with pytest.warns(FutureWarning):
-            with fs.open_input_file(f"{bucket}/{fname}") as fil:
-                got = cudf.read_orc(fil, columns=columns)
 
     if columns:
         expect = expect[columns]
diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
index 448a815fe1b..4ac9b63985f 100644
--- a/python/cudf/cudf/utils/ioutils.py
+++ b/python/cudf/cudf/utils/ioutils.py
@@ -13,19 +13,10 @@
 import numpy as np
 import pandas as pd
 from fsspec.core import get_fs_token_paths
-from pyarrow import PythonFile as ArrowPythonFile
-from pyarrow.lib import NativeFile
 
-from cudf.api.extensions import no_default
 from cudf.core._compat import PANDAS_LT_300
 from cudf.utils.docutils import docfmt_partial
 
-try:
-    import fsspec.parquet as fsspec_parquet
-
-except ImportError:
-    fsspec_parquet = None
-
 _BYTES_PER_THREAD_DEFAULT = 256 * 1024 * 1024
 _ROW_GROUP_SIZE_BYTES_DEFAULT = 128 * 1024 * 1024
 
@@ -173,32 +164,12 @@
 use_pandas_metadata : boolean, default True
     If True and dataset has custom PANDAS schema metadata, ensure that index
     columns are also loaded.
-use_python_file_object : boolean, default True
-    If True, Arrow-backed PythonFile objects will be used in place of fsspec
-    AbstractBufferedFile objects at IO time.
-
-    .. deprecated:: 24.08
-        `use_python_file_object` is deprecated and will be removed in a future
-        version of cudf, as PyArrow NativeFiles will no longer be accepted as
-        input/output in cudf readers/writers in the future.
-open_file_options : dict, optional
-    Dictionary of key-value pairs to pass to the function used to open remote
-    files. By default, this will be `fsspec.parquet.open_parquet_file`. To
-    deactivate optimized precaching, set the "method" to `None` under the
-    "precache_options" key. Note that the `open_file_func` key can also be
-    used to specify a custom file-open function.
-
-    .. deprecated:: 24.08
-        `open_file_options` is deprecated as it was intended for
-        pyarrow file inputs, which will no longer be accepted as
-        input/output cudf readers/writers in the future.
 bytes_per_thread : int, default None
     Determines the number of bytes to be allocated per thread to read the
     files in parallel. When there is a file of large size, we get slightly
     better throughput by decomposing it and transferring multiple "blocks"
     in parallel (using a python thread pool). Default allocation is
     {bytes_per_thread} bytes.
-    This parameter is functional only when `use_python_file_object=False`.
 skiprows : int, default None
     If not None, the number of rows to skip from the start of the file.
 
@@ -485,14 +456,6 @@
     This parameter is deprecated.
 use_index : bool, default True
     If True, use row index if available for faster seeking.
-use_python_file_object : boolean, default True
-    If True, Arrow-backed PythonFile objects will be used in place of fsspec
-    AbstractBufferedFile objects at IO time.
-
-    .. deprecated:: 24.08
-        `use_python_file_object` is deprecated and will be removed in a future
-        version of cudf, as PyArrow NativeFiles will no longer be accepted as
-        input/output in cudf readers/writers in the future.
 storage_options : dict, optional, default None
     Extra options that make sense for a particular storage connection,
     e.g. host, port, username, password, etc. For HTTP(S) URLs the key-value
@@ -506,7 +469,6 @@
     better throughput by decomposing it and transferring multiple "blocks"
     in parallel (using a python thread pool). Default allocation is
     {bytes_per_thread} bytes.
-    This parameter is functional only when `use_python_file_object=False`.
 
 Returns
 -------
@@ -1209,14 +1171,6 @@
     size to zero to read all data after the offset location. Reads the row
     that starts before or at the end of the range, even if it ends after
     the end of the range.
-use_python_file_object : boolean, default True
-    If True, Arrow-backed PythonFile objects will be used in place of fsspec
-    AbstractBufferedFile objects at IO time.
-
-    .. deprecated:: 24.08
-        `use_python_file_object` is deprecated and will be removed in a future
-        version of cudf, as PyArrow NativeFiles will no longer be accepted as
-        input/output in cudf readers/writers in the future.
 storage_options : dict, optional, default None
     Extra options that make sense for a particular storage connection,
     e.g. host, port, username, password, etc. For HTTP(S) URLs the key-value
@@ -1230,7 +1184,6 @@
     better throughput by decomposing it and transferring multiple "blocks"
     in parallel (using a python thread pool). Default allocation is
     {bytes_per_thread} bytes.
-    This parameter is functional only when `use_python_file_object=False`.
 Returns
 -------
 GPU ``DataFrame`` object.
@@ -1454,22 +1407,6 @@
     Mode in which file is opened
 iotypes : (), default (BytesIO)
     Object type to exclude from file-like check
-use_python_file_object : boolean, default False
-    If True, Arrow-backed PythonFile objects will be used in place
-    of fsspec AbstractBufferedFile objects.
-
-    .. deprecated:: 24.08
-        `use_python_file_object` is deprecated and will be removed in a future
-        version of cudf, as PyArrow NativeFiles will no longer be accepted as
-        input/output in cudf readers/writers.
-open_file_options : dict, optional
-    Optional dictionary of keyword arguments to pass to
-    `_open_remote_files` (used for remote storage only).
-
-    .. deprecated:: 24.08
-        `open_file_options` is deprecated as it was intended for
-        pyarrow file inputs, which will no longer be accepted as
-        input/output cudf readers/writers in the future.
 allow_raw_text_input : boolean, default False
     If True, this indicates the input `path_or_data` could be a raw text
     input and will not check for its existence in the filesystem. If False,
@@ -1490,7 +1427,6 @@
     better throughput by decomposing it and transferring multiple "blocks"
     in parallel (using a Python thread pool). Default allocation is
     {bytes_per_thread} bytes.
-    This parameter is functional only when `use_python_file_object=False`.
 
 Returns
 -------
@@ -1635,119 +1571,13 @@ def _get_filesystem_and_paths(path_or_data, storage_options):
     return fs, return_paths
 
 
-def _set_context(obj, stack):
-    # Helper function to place open file on context stack
-    if stack is None:
-        return obj
-    return stack.enter_context(obj)
-
-
-def _open_remote_files(
-    paths,
-    fs,
-    context_stack=None,
-    open_file_func=None,
-    precache_options=None,
-    **kwargs,
-):
-    """Return a list of open file-like objects given
-    a list of remote file paths.
-
-    Parameters
-    ----------
-    paths : list(str)
-        List of file-path strings.
-    fs : fsspec.AbstractFileSystem
-        Fsspec file-system object.
-    context_stack : contextlib.ExitStack, Optional
-        Context manager to use for open files.
-    open_file_func : Callable, Optional
-        Call-back function to use for opening. If this argument
-        is specified, all other arguments will be ignored.
-    precache_options : dict, optional
-        Dictionary of key-word arguments to pass to use for
-        precaching. Unless the input contains ``{"method": None}``,
-        ``fsspec.parquet.open_parquet_file`` will be used for remote
-        storage.
-    **kwargs :
-        Key-word arguments to be passed to format-specific
-        open functions.
-    """
-
-    # Just use call-back function if one was specified
-    if open_file_func is not None:
-        return [
-            _set_context(open_file_func(path, **kwargs), context_stack)
-            for path in paths
-        ]
-
-    # Check if the "precache" option is supported.
-    # In the future, fsspec should do this check for us
-    precache_options = (precache_options or {}).copy()
-    precache = precache_options.pop("method", None)
-    if precache not in ("parquet", None):
-        raise ValueError(f"{precache} not a supported `precache` option.")
-
-    # Check that "parts" caching (used for all format-aware file handling)
-    # is supported by the installed fsspec/s3fs version
-    if precache == "parquet" and not fsspec_parquet:
-        warnings.warn(
-            f"This version of fsspec ({fsspec.__version__}) does "
-            f"not support parquet-optimized precaching. Please upgrade "
-            f"to the latest fsspec version for better performance."
-        )
-        precache = None
-
-    if precache == "parquet":
-        # Use fsspec.parquet module.
-        # TODO: Use `cat_ranges` to collect "known"
-        # parts for all files at once.
-        row_groups = precache_options.pop("row_groups", None) or (
-            [None] * len(paths)
-        )
-        return [
-            ArrowPythonFile(
-                _set_context(
-                    fsspec_parquet.open_parquet_file(
-                        path,
-                        fs=fs,
-                        row_groups=rgs,
-                        **precache_options,
-                        **kwargs,
-                    ),
-                    context_stack,
-                )
-            )
-            for path, rgs in zip(paths, row_groups)
-        ]
-
-    # Avoid top-level pyarrow.fs import.
-    # Importing pyarrow.fs initializes a S3 SDK with a finalizer
-    # that runs atexit. In some circumstances it appears this
-    # runs a call into a logging system that is already shutdown.
-    # To avoid this, we only import this subsystem if it is
-    # really needed.
-    # See https://github.com/aws/aws-sdk-cpp/issues/2681
-    from pyarrow.fs import FSSpecHandler, PyFileSystem
-
-    # Default open - Use pyarrow filesystem API
-    pa_fs = PyFileSystem(FSSpecHandler(fs))
-    return [
-        _set_context(pa_fs.open_input_file(fpath), context_stack)
-        for fpath in paths
-    ]
-
-
 @doc_get_reader_filepath_or_buffer()
 def get_reader_filepath_or_buffer(
     path_or_data,
     compression,
     mode="rb",
     fs=None,
-    iotypes=(BytesIO, NativeFile),
-    # no_default aliases to False
-    use_python_file_object=no_default,
-    open_file_options=None,
+    iotypes=(BytesIO,),
     allow_raw_text_input=False,
     storage_options=None,
     bytes_per_thread=_BYTES_PER_THREAD_DEFAULT,
@@ -1758,30 +1588,6 @@ def get_reader_filepath_or_buffer(
 
     path_or_data = stringify_pathlike(path_or_data)
 
-    if use_python_file_object is no_default:
-        use_python_file_object = False
-    elif use_python_file_object is not None:
-        warnings.warn(
-            "The 'use_python_file_object' keyword is deprecated and "
-            "will be removed in a future version.",
-            FutureWarning,
-        )
-    else:
-        # Preserve the readers (e.g. read_csv) default of True
-        # if no use_python_file_object option is specified by the user
-        # for now (note: this is different from the default for this
-        # function of False)
-        # TODO: when non-pyarrow file reading perf is good enough
-        # we can default this to False
-        use_python_file_object = True
-
-    if open_file_options is not None:
-        warnings.warn(
-            "The 'open_file_options' keyword is deprecated and "
-            "will be removed in a future version.",
-            FutureWarning,
-        )
-
     if isinstance(path_or_data, str):
         # Get a filesystem object if one isn't already available
         paths = [path_or_data]
@@ -1866,38 +1672,28 @@ def get_reader_filepath_or_buffer(
                 raise FileNotFoundError(
                     f"{path_or_data} could not be resolved to any files"
                 )
-            if use_python_file_object:
-                path_or_data = _open_remote_files(
-                    paths,
-                    fs,
-                    **(open_file_options or {}),
-                )
-            else:
-                path_or_data = [
-                    BytesIO(
-                        _fsspec_data_transfer(
-                            fpath,
-                            fs=fs,
-                            mode=mode,
-                            bytes_per_thread=bytes_per_thread,
-                        )
+            path_or_data = [
+                BytesIO(
+                    _fsspec_data_transfer(
+                        fpath,
+                        fs=fs,
+                        mode=mode,
+                        bytes_per_thread=bytes_per_thread,
                     )
-                    for fpath in paths
-                ]
+                )
+                for fpath in paths
+            ]
             if len(path_or_data) == 1:
                 path_or_data = path_or_data[0]
 
     elif not isinstance(path_or_data, iotypes) and is_file_like(path_or_data):
         if isinstance(path_or_data, TextIOWrapper):
             path_or_data = path_or_data.buffer
-        if use_python_file_object:
-            path_or_data = ArrowPythonFile(path_or_data)
-        else:
-            path_or_data = BytesIO(
-                _fsspec_data_transfer(
-                    path_or_data, mode=mode, bytes_per_thread=bytes_per_thread
-                )
+        path_or_data = BytesIO(
+            _fsspec_data_transfer(
+                path_or_data, mode=mode, bytes_per_thread=bytes_per_thread
             )
+        )
 
     return path_or_data, compression
 
diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py
index c9b343e0f9f..7347ec7866a 100644
--- a/python/cudf/cudf/utils/utils.py
+++ b/python/cudf/cudf/utils/utils.py
@@ -6,7 +6,6 @@
 import os
 import traceback
 import warnings
-from contextlib import contextmanager
 
 import numpy as np
 import pandas as pd
@@ -404,28 +403,3 @@ def _all_bools_with_nulls(lhs, rhs, bool_fill_value):
     if result_mask is not None:
         result_col = result_col.set_mask(result_mask.as_mask())
     return result_col
-
-
-@contextmanager
-def maybe_filter_deprecation(
-    condition: bool, message: str, category: type[Warning]
-):
-    """Conditionally filter a warning category.
-
-    Parameters
-    ----------
-    condition
-        If true, filter the warning
-    message
-        Message to match, passed to :func:`warnings.filterwarnings`
-    category
-        Category of warning, passed to :func:`warnings.filterwarnings`
-    """
-    with warnings.catch_warnings():
-        if condition:
-            warnings.filterwarnings(
-                "ignore",
-                message,
-                category=category,
-            )
-        yield
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_s3.py b/python/dask_cudf/dask_cudf/io/tests/test_s3.py
index 99f19917424..a14ffbc37dc 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_s3.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_s3.py
@@ -119,22 +119,6 @@ def test_read_csv(s3_base, s3so):
         assert df.a.sum().compute() == 4
 
 
-def test_read_csv_warns(s3_base, s3so):
-    with s3_context(
-        s3_base=s3_base,
-        bucket="daskcsv_warns",
-        files={"a.csv": b"a,b\n1,2\n3,4\n"},
-    ):
-        with pytest.warns(FutureWarning):
-            df = dask_cudf.read_csv(
-                "s3://daskcsv_warns/*.csv",
-                blocksize="50 B",
-                storage_options=s3so,
-                use_python_file_object=True,
-            )
-            assert df.a.sum().compute() == 4
-
-
 def test_read_parquet_open_file_options_raises():
     with pytest.raises(ValueError):
         dask_cudf.read_parquet(
@@ -198,22 +182,6 @@ def test_read_parquet(s3_base, s3so, pdf):
         assert_eq(pdf, got)
 
 
-def test_read_parquet_use_python_file_object(s3_base, s3so, pdf):
-    fname = "test_parquet_use_python_file_object.parquet"
-    bucket = "parquet"
-    buffer = BytesIO()
-    pdf.to_parquet(path=buffer)
-    buffer.seek(0)
-    with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}):
-        with pytest.warns(FutureWarning):
-            got = dask_cudf.read_parquet(
-                f"s3://{bucket}/{fname}",
-                storage_options=s3so,
-                read={"use_python_file_object": True},
-            ).head()
-            assert_eq(pdf, got)
-
-
 def test_read_orc(s3_base, s3so, pdf):
     fname = "test_orc_reader_dask.orc"
     bucket = "orc"
@@ -226,19 +194,3 @@ def test_read_orc(s3_base, s3so, pdf):
             storage_options=s3so,
         )
         assert_eq(pdf, got)
-
-
-def test_read_orc_use_python_file_object(s3_base, s3so, pdf):
-    fname = "test_orc_use_python_file_object.orc"
-    bucket = "orc"
-    buffer = BytesIO()
-    pdf.to_orc(path=buffer)
-    buffer.seek(0)
-    with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}):
-        with pytest.warns(FutureWarning):
-            got = dask_cudf.read_orc(
-                f"s3://{bucket}/{fname}",
-                storage_options=s3so,
-                use_python_file_object=True,
-            ).head()
-            assert_eq(pdf, got)
diff --git a/python/pylibcudf/pylibcudf/io/datasource.pxd b/python/pylibcudf/pylibcudf/io/datasource.pxd
index 05c03dceee2..c08f36693c7 100644
--- a/python/pylibcudf/pylibcudf/io/datasource.pxd
+++ b/python/pylibcudf/pylibcudf/io/datasource.pxd
@@ -1,14 +1,7 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
-from libcpp.memory cimport shared_ptr
-from pylibcudf.libcudf.io.arrow_io_source cimport arrow_io_source
 from pylibcudf.libcudf.io.datasource cimport datasource
 
 
 cdef class Datasource:
     cdef datasource* get_datasource(self) except * nogil
-
-
-cdef class NativeFileDatasource(Datasource):
-    cdef shared_ptr[arrow_io_source] c_datasource
-    cdef datasource* get_datasource(self) nogil
diff --git a/python/pylibcudf/pylibcudf/io/datasource.pyx b/python/pylibcudf/pylibcudf/io/datasource.pyx
index 6cc509b74cb..02418444caa 100644
--- a/python/pylibcudf/pylibcudf/io/datasource.pyx
+++ b/python/pylibcudf/pylibcudf/io/datasource.pyx
@@ -1,34 +1,10 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
-from libcpp.memory cimport shared_ptr
-from pyarrow.includes.libarrow cimport CRandomAccessFile
-from pyarrow.lib cimport NativeFile
-from pylibcudf.libcudf.io.arrow_io_source cimport arrow_io_source
 from pylibcudf.libcudf.io.datasource cimport datasource
 
-import warnings
-
 
 cdef class Datasource:
     cdef datasource* get_datasource(self) except * nogil:
         with gil:
             raise NotImplementedError("get_datasource() should not "
                                       + "be directly invoked here")
-
-cdef class NativeFileDatasource(Datasource):
-
-    def __cinit__(self, NativeFile native_file):
-
-        cdef shared_ptr[CRandomAccessFile] ra_src
-
-        warnings.warn(
-            "Support for reading pyarrow's NativeFile is deprecated "
-            "and will be removed in a future release of cudf.",
-            FutureWarning,
-        )
-
-        ra_src = native_file.get_random_access_file()
-        self.c_datasource.reset(new arrow_io_source(ra_src))
-
-    cdef datasource* get_datasource(self) nogil:
-        return <datasource *> (self.c_datasource.get())

From 6ccc2c2e4d7b4cda0bb4f844a28d69254049b795 Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Mon, 19 Aug 2024 16:39:34 -0500
Subject: [PATCH 692/842] standardize and consolidate wheel installations in
 testing scripts (#16575)

I noticed some common changes to wheel-testing scripts in the PRs splitting off `pylibcudf` (#16299) and `libcudf` (#15483).

* consolidating multiple `pip install`'s into 1
  - *(this is safer, as it removes the risk of `pip` replacing a previously-installed CI package with another one from a remote package repository)*
* standardizing the approach used for "install some wheels built earlier in this same CI run"

These can go onto `branch-24.10` right now, so proposing them in a separate PR so that `cudf` CI can benefit from them without having to wait on those large PRs.

Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16575
---
 ci/cudf_pandas_scripts/pandas-tests/run.sh | 13 +++++++++----
 ci/cudf_pandas_scripts/run_tests.sh        | 13 +++++++++----
 ci/test_wheel_cudf.sh                      | 11 ++++++-----
 ci/test_wheel_cudf_polars.sh               | 15 ++++++++++-----
 ci/test_wheel_dask_cudf.sh                 | 14 +++++++-------
 5 files changed, 41 insertions(+), 25 deletions(-)

diff --git a/ci/cudf_pandas_scripts/pandas-tests/run.sh b/ci/cudf_pandas_scripts/pandas-tests/run.sh
index 8deaeab78a3..97c3139080f 100755
--- a/ci/cudf_pandas_scripts/pandas-tests/run.sh
+++ b/ci/cudf_pandas_scripts/pandas-tests/run.sh
@@ -11,10 +11,15 @@ rapids-logger "Running Pandas tests using $PANDAS_TESTS_BRANCH branch and rapids
 rapids-logger "PR number: ${RAPIDS_REF_NAME:-"unknown"}"
 
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
-RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-pylibcudf-dep
-RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep
-python -m pip install $(ls ./local-pylibcudf-dep/pylibcudf*.whl)
-python -m pip install $(ls ./local-cudf-dep/cudf*.whl)[test,pandas-tests]
+
+# Download the cudf and pylibcudf built in the previous step
+RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist
+RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist
+
+# echo to expand wildcard before adding `[extra]` requires for pip
+python -m pip install \
+  "$(echo ./dist/cudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test,pandas-tests]" \
+  "$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)"
 
 RESULTS_DIR=${RAPIDS_TESTS_DIR:-"$(mktemp -d)"}
 RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${RESULTS_DIR}/test-results"}/
diff --git a/ci/cudf_pandas_scripts/run_tests.sh b/ci/cudf_pandas_scripts/run_tests.sh
index bfb655db3ca..8215ce729b3 100755
--- a/ci/cudf_pandas_scripts/run_tests.sh
+++ b/ci/cudf_pandas_scripts/run_tests.sh
@@ -36,10 +36,15 @@ if [ "$no_cudf" = true ]; then
     echo "Skipping cudf install"
 else
     RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
-    RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-pylibcudf-dep
-    RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep
-    python -m pip install $(ls ./local-pylibcudf-dep/pylibcudf*.whl)
-    python -m pip install $(ls ./local-cudf-dep/cudf*.whl)[test,cudf-pandas-tests]
+
+    # Download the cudf and pylibcudf built in the previous step
+    RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist
+    RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist
+
+    # echo to expand wildcard before adding `[extra]` requires for pip
+    python -m pip install \
+        "$(echo ./dist/cudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test,cudf-pandas-tests]" \
+        "$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)"
 fi
 
 python -m pytest -p cudf.pandas \
diff --git a/ci/test_wheel_cudf.sh b/ci/test_wheel_cudf.sh
index 5a2c3ccac8f..19131952098 100755
--- a/ci/test_wheel_cudf.sh
+++ b/ci/test_wheel_cudf.sh
@@ -3,15 +3,16 @@
 
 set -eou pipefail
 
-# Download the pylibcudf built in the previous step
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
-RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-pylibcudf-dep
+
+# Download the cudf and pylibcudf built in the previous step
 RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist
+RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist
 
-# Install both pylibcudf and cudf
+# echo to expand wildcard before adding `[extra]` requires for pip
 python -m pip install \
-    "$(echo ./local-pylibcudf-dep/pylibcudf*.whl)[test]" \
-    "$(echo ./dist/cudf*.whl)[test]"
+  "$(echo ./dist/cudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" \
+  "$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]"
 
 RESULTS_DIR=${RAPIDS_TESTS_DIR:-"$(mktemp -d)"}
 RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${RESULTS_DIR}/test-results"}/
diff --git a/ci/test_wheel_cudf_polars.sh b/ci/test_wheel_cudf_polars.sh
index 357d4170d47..6438d13c4b7 100755
--- a/ci/test_wheel_cudf_polars.sh
+++ b/ci/test_wheel_cudf_polars.sh
@@ -20,12 +20,17 @@ fi
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
 RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 ./dist
 
-# Download the cudf built in the previous step
-RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-pylibcudf-dep
-python -m pip install ./local-pylibcudf-dep/pylibcudf*.whl
+# Download the cudf and pylibcudf built in the previous step
+RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist
+RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist
 
-rapids-logger "Install cudf_polars"
-python -m pip install $(echo ./dist/cudf_polars*.whl)[test]
+rapids-logger "Installing cudf_polars and its dependencies"
+
+# echo to expand wildcard before adding `[extra]` requires for pip
+python -m pip install \
+    "$(echo ./dist/cudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \
+    "$(echo ./dist/cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" \
+    "$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)"
 
 rapids-logger "Run cudf_polars tests"
 
diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh
index 4d045472604..ff893a08e27 100755
--- a/ci/test_wheel_dask_cudf.sh
+++ b/ci/test_wheel_dask_cudf.sh
@@ -6,15 +6,15 @@ set -eou pipefail
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
 RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 ./dist
 
-# Download the cudf built in the previous step
-RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-pylibcudf-dep
-RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep
-python -m pip install \
-    "$(echo ./local-pylibcudf-dep/pylibcudf*.whl)" \
-    "$(echo ./local-cudf-dep/cudf*.whl)"
+# Download the cudf and pylibcudf built in the previous step
+RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist
+RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist
 
 # echo to expand wildcard before adding `[extra]` requires for pip
-python -m pip install $(echo ./dist/dask_cudf*.whl)[test]
+python -m pip install \
+  "$(echo ./dist/cudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \
+  "$(echo ./dist/dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" \
+  "$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)"
 
 RESULTS_DIR=${RAPIDS_TESTS_DIR:-"$(mktemp -d)"}
 RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${RESULTS_DIR}/test-results"}/

From f2d13c9dbe957cd2a5cbf93a339149ab3edc0240 Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Mon, 19 Aug 2024 17:23:23 -0500
Subject: [PATCH 693/842] make more use of YAML anchors in dependencies.yaml
 (#16597)

Contributes to https://github.com/rapidsai/build-planning/issues/33

Follow-up to #16299

This proposes some simplifications to `dependencies.yaml`. It's not intended to change any behavior.

* more use of YAML anchors for requirements that are intended to be identical to each other
* eliminating the `pylibcudf_build_dep` dependency group that was introduced in #16299, in favor of just tracking the `pylibcudf` build dependency alongside `cudf`'s `rmm` build dependency in the existing `build_python_cudf` group
  - *(sorry I'd missed that in the review on #16299)*

I found myself starting to make similar changes in the PR breaking up these packages into more (splitting out a `libcudf` in #15483) and thought they'd be better as a standalone PR.

Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16597
---
 dependencies.yaml | 45 ++++++++++++++-------------------------------
 1 file changed, 14 insertions(+), 31 deletions(-)

diff --git a/dependencies.yaml b/dependencies.yaml
index ca615905a15..a774345fe95 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -96,7 +96,6 @@ files:
       - build_base
       - build_python_common
       - build_python_cudf
-      - pylibcudf_build_dep
   py_run_cudf:
     output: pyproject
     pyproject_dir: python/cudf
@@ -383,12 +382,12 @@ dependencies:
               cuda: "12.*"
               cuda_suffixed: "true"
             packages:
-              - rmm-cu12==24.10.*,>=0.0.0a0
+              - &rmm_cu12 rmm-cu12==24.10.*,>=0.0.0a0
           - matrix:
               cuda: "11.*"
               cuda_suffixed: "true"
             packages:
-              - rmm-cu11==24.10.*,>=0.0.0a0
+              - &rmm_cu11 rmm-cu11==24.10.*,>=0.0.0a0
           - {matrix: null, packages: [*rmm_unsuffixed]}
   build_python_cudf:
     common:
@@ -412,34 +411,18 @@ dependencies:
               cuda: "12.*"
               cuda_suffixed: "true"
             packages:
-              - rmm-cu12==24.10.*,>=0.0.0a0
-              - pylibcudf-cu12==24.10.*,>=0.0.0a0
+              - &pylibcudf_cu12 pylibcudf-cu12==24.10.*,>=0.0.0a0
+              - *rmm_cu12
           - matrix:
               cuda: "11.*"
               cuda_suffixed: "true"
             packages:
-              - rmm-cu11==24.10.*,>=0.0.0a0
-              - pylibcudf-cu11==24.10.*,>=0.0.0a0
-          - {matrix: null, packages: [*rmm_unsuffixed]}
-  pylibcudf_build_dep:
-    common:
-      - output_types: conda
-        packages:
-          - &pylibcudf_unsuffixed pylibcudf==24.10.*,>=0.0.0a0
-    specific:
-      - output_types: [pyproject]
-        matrices:
-          - matrix:
-              cuda: "12.*"
-              cuda_suffixed: "true"
-            packages:
-              - pylibcudf-cu12==24.10.*,>=0.0.0a0
+              - &pylibcudf_cu11 pylibcudf-cu11==24.10.*,>=0.0.0a0
+              - *rmm_cu11
           - matrix:
-              cuda: "11.*"
-              cuda_suffixed: "true"
             packages:
-              - pylibcudf-cu11==24.10.*,>=0.0.0a0
-          - {matrix: null, packages: [*pylibcudf_unsuffixed]}
+              - &pylibcudf_unsuffixed pylibcudf==24.10.*,>=0.0.0a0
+              - *rmm_unsuffixed
   libarrow_build:
     common:
       - output_types: conda
@@ -677,12 +660,12 @@ dependencies:
               cuda: "12.*"
               cuda_suffixed: "true"
             packages:
-              - rmm-cu12==24.10.*,>=0.0.0a0
+              - *rmm_cu12
           - matrix:
               cuda: "11.*"
               cuda_suffixed: "true"
             packages:
-              - rmm-cu11==24.10.*,>=0.0.0a0
+              - *rmm_cu11
           - {matrix: null, packages: [*rmm_unsuffixed]}
   run_cudf:
     common:
@@ -728,7 +711,7 @@ dependencies:
               cuda: "12.*"
               cuda_suffixed: "true"
             packages:
-              - rmm-cu12==24.10.*,>=0.0.0a0
+              - *rmm_cu12
               - pynvjitlink-cu12>=0.0.0a0
           - matrix:
               cuda: "12.*"
@@ -740,7 +723,7 @@ dependencies:
               cuda: "11.*"
               cuda_suffixed: "true"
             packages:
-              - rmm-cu11==24.10.*,>=0.0.0a0
+              - *rmm_cu11
               - cubinlinker-cu11
               - ptxcompiler-cu11
           - matrix:
@@ -874,12 +857,12 @@ dependencies:
               cuda: "12.*"
               cuda_suffixed: "true"
             packages:
-              - pylibcudf-cu12==24.10.*,>=0.0.0a0
+              - *pylibcudf_cu12
           - matrix:
               cuda: "11.*"
               cuda_suffixed: "true"
             packages:
-              - pylibcudf-cu11==24.10.*,>=0.0.0a0
+              - *pylibcudf_cu11
           - {matrix: null, packages: [*pylibcudf_unsuffixed]}
   depends_on_cudf:
     common:

From 3f6dd14e26deccc761ed06790cf806edc266d5e4 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 19 Aug 2024 12:29:15 -1000
Subject: [PATCH 694/842] Make StructColumn.__init__ strict (#16467)

This PR makes `StructColumn.__init__` strict putting restrictions on data, dtype, size and children so these columns cannot be constructed into to an invalid state. It also aligns the signature with the base class.

xref https://github.com/rapidsai/cudf/issues/16469

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16467
---
 python/cudf/cudf/core/column/column.py   | 13 +++--
 python/cudf/cudf/core/column/interval.py | 71 ++++++++++++++++--------
 python/cudf/cudf/core/column/struct.py   | 50 ++++++++++++++---
 python/cudf/cudf/core/index.py           |  6 +-
 4 files changed, 100 insertions(+), 40 deletions(-)

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 0857727d23f..27278120abb 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -1635,22 +1635,23 @@ def build_column(
         )
     elif isinstance(dtype, IntervalDtype):
         return cudf.core.column.IntervalColumn(
+            data=None,
+            size=size,  # type: ignore[arg-type]
             dtype=dtype,
             mask=mask,
-            size=size,
             offset=offset,
-            children=children,
             null_count=null_count,
+            children=children,  # type: ignore[arg-type]
         )
     elif isinstance(dtype, StructDtype):
         return cudf.core.column.StructColumn(
-            data=data,
-            dtype=dtype,
+            data=None,
             size=size,  # type: ignore[arg-type]
-            offset=offset,
+            dtype=dtype,
             mask=mask,
+            offset=offset,
             null_count=null_count,
-            children=children,
+            children=children,  # type: ignore[arg-type]
         )
     elif isinstance(dtype, cudf.Decimal64Dtype):
         return cudf.core.column.Decimal64Column(
diff --git a/python/cudf/cudf/core/column/interval.py b/python/cudf/cudf/core/column/interval.py
index d9fc96a9f3e..9147270c289 100644
--- a/python/cudf/cudf/core/column/interval.py
+++ b/python/cudf/cudf/core/column/interval.py
@@ -11,32 +11,46 @@
 from cudf.core.dtypes import IntervalDtype
 
 if TYPE_CHECKING:
+    from typing_extensions import Self
+
     from cudf._typing import ScalarLike
+    from cudf.core.buffer import Buffer
     from cudf.core.column import ColumnBase
 
 
 class IntervalColumn(StructColumn):
     def __init__(
         self,
-        dtype,
-        mask=None,
-        size=None,
-        offset=0,
-        null_count=None,
-        children=(),
+        data: None,
+        size: int,
+        dtype: IntervalDtype,
+        mask: Buffer | None = None,
+        offset: int = 0,
+        null_count: int | None = None,
+        children: tuple[ColumnBase, ColumnBase] = (),  # type: ignore[assignment]
     ):
+        if len(children) != 2:
+            raise ValueError(
+                "children must be a tuple of two columns (left edges, right edges)."
+            )
         super().__init__(
-            data=None,
+            data=data,
+            size=size,
             dtype=dtype,
             mask=mask,
-            size=size,
             offset=offset,
             null_count=null_count,
             children=children,
         )
 
+    @staticmethod
+    def _validate_dtype_instance(dtype: IntervalDtype) -> IntervalDtype:
+        if not isinstance(dtype, IntervalDtype):
+            raise ValueError("dtype must be a IntervalDtype.")
+        return dtype
+
     @classmethod
-    def from_arrow(cls, data):
+    def from_arrow(cls, data: pa.Array) -> Self:
         new_col = super().from_arrow(data.storage)
         size = len(data)
         dtype = IntervalDtype.from_arrow(data.type)
@@ -48,16 +62,17 @@ def from_arrow(cls, data):
         null_count = data.null_count
         children = new_col.children
 
-        return IntervalColumn(
+        return cls(
+            data=None,
             size=size,
             dtype=dtype,
             mask=mask,
             offset=offset,
             null_count=null_count,
-            children=children,
+            children=children,  # type: ignore[arg-type]
         )
 
-    def to_arrow(self):
+    def to_arrow(self) -> pa.Array:
         typ = self.dtype.to_arrow()
         struct_arrow = super().to_arrow()
         if len(struct_arrow) == 0:
@@ -67,9 +82,14 @@ def to_arrow(self):
         return pa.ExtensionArray.from_storage(typ, struct_arrow)
 
     @classmethod
-    def from_struct_column(cls, struct_column: StructColumn, closed="right"):
+    def from_struct_column(
+        cls,
+        struct_column: StructColumn,
+        closed: Literal["left", "right", "both", "neither"] = "right",
+    ) -> Self:
         first_field_name = next(iter(struct_column.dtype.fields.keys()))
-        return IntervalColumn(
+        return cls(
+            data=None,
             size=struct_column.size,
             dtype=IntervalDtype(
                 struct_column.dtype.fields[first_field_name], closed
@@ -77,12 +97,13 @@ def from_struct_column(cls, struct_column: StructColumn, closed="right"):
             mask=struct_column.base_mask,
             offset=struct_column.offset,
             null_count=struct_column.null_count,
-            children=struct_column.base_children,
+            children=struct_column.base_children,  # type: ignore[arg-type]
         )
 
-    def copy(self, deep=True):
+    def copy(self, deep: bool = True) -> Self:
         struct_copy = super().copy(deep=deep)
-        return IntervalColumn(
+        return IntervalColumn(  # type: ignore[return-value]
+            data=None,
             size=struct_copy.size,
             dtype=IntervalDtype(
                 struct_copy.dtype.fields["left"], self.dtype.closed
@@ -90,7 +111,7 @@ def copy(self, deep=True):
             mask=struct_copy.base_mask,
             offset=struct_copy.offset,
             null_count=struct_copy.null_count,
-            children=struct_copy.base_children,
+            children=struct_copy.base_children,  # type: ignore[arg-type]
         )
 
     @property
@@ -138,25 +159,27 @@ def overlaps(other) -> ColumnBase:
 
     def set_closed(
         self, closed: Literal["left", "right", "both", "neither"]
-    ) -> IntervalColumn:
-        return IntervalColumn(
+    ) -> Self:
+        return IntervalColumn(  # type: ignore[return-value]
+            data=None,
             size=self.size,
             dtype=IntervalDtype(self.dtype.fields["left"], closed),
             mask=self.base_mask,
             offset=self.offset,
             null_count=self.null_count,
-            children=self.base_children,
+            children=self.base_children,  # type: ignore[arg-type]
         )
 
-    def as_interval_column(self, dtype):
+    def as_interval_column(self, dtype: IntervalDtype) -> Self:  # type: ignore[override]
         if isinstance(dtype, IntervalDtype):
-            return IntervalColumn(
+            return IntervalColumn(  # type: ignore[return-value]
+                data=None,
                 size=self.size,
                 dtype=dtype,
                 mask=self.mask,
                 offset=self.offset,
                 null_count=self.null_count,
-                children=tuple(
+                children=tuple(  # type: ignore[arg-type]
                     child.astype(dtype.subtype) for child in self.children
                 ),
             )
diff --git a/python/cudf/cudf/core/column/struct.py b/python/cudf/cudf/core/column/struct.py
index c2ce787eeae..2fda3b2c434 100644
--- a/python/cudf/cudf/core/column/struct.py
+++ b/python/cudf/cudf/core/column/struct.py
@@ -14,7 +14,10 @@
 from cudf.core.missing import NA
 
 if TYPE_CHECKING:
+    from typing_extensions import Self
+
     from cudf._typing import Dtype
+    from cudf.core.buffer import Buffer
 
 
 class StructColumn(ColumnBase):
@@ -23,10 +26,39 @@ class StructColumn(ColumnBase):
 
     Every column has n children, where n is
     the number of fields in the Struct Dtype.
-
     """
 
-    dtype: StructDtype
+    def __init__(
+        self,
+        data: None,
+        size: int,
+        dtype: StructDtype,
+        mask: Buffer | None = None,
+        offset: int = 0,
+        null_count: int | None = None,
+        children: tuple[ColumnBase, ...] = (),
+    ):
+        if data is not None:
+            raise ValueError("data must be None.")
+        dtype = self._validate_dtype_instance(dtype)
+        super().__init__(
+            data=data,
+            size=size,
+            dtype=dtype,
+            mask=mask,
+            offset=offset,
+            null_count=null_count,
+            children=children,
+        )
+
+    @staticmethod
+    def _validate_dtype_instance(dtype: StructDtype) -> StructDtype:
+        # IntervalDtype is a subclass of StructDtype, so compare types exactly
+        if type(dtype) is not StructDtype:
+            raise ValueError(
+                f"{type(dtype).__name__} must be a StructDtype exactly."
+            )
+        return dtype
 
     @property
     def base_size(self):
@@ -35,7 +67,7 @@ def base_size(self):
         else:
             return self.size + self.offset
 
-    def to_arrow(self):
+    def to_arrow(self) -> pa.Array:
         children = [
             pa.nulls(len(child))
             if len(child) == child.null_count
@@ -50,7 +82,7 @@ def to_arrow(self):
             }
         )
 
-        if self.nullable:
+        if self.mask is not None:
             buffers = (pa.py_buffer(self.mask.memoryview()),)
         else:
             buffers = (None,)
@@ -73,7 +105,7 @@ def to_pandas(
             return pd.Index(self.to_arrow().tolist(), dtype="object")
 
     @cached_property
-    def memory_usage(self):
+    def memory_usage(self) -> int:
         n = 0
         if self.nullable:
             n += cudf._lib.null_mask.bitmask_allocation_size_bytes(self.size)
@@ -99,7 +131,7 @@ def __setitem__(self, key, value):
             value = cudf.Scalar(value, self.dtype)
         super().__setitem__(key, value)
 
-    def copy(self, deep=True):
+    def copy(self, deep: bool = True) -> Self:
         # Since struct columns are immutable, both deep and
         # shallow copies share the underlying device data and mask.
         result = super().copy(deep=False)
@@ -107,15 +139,15 @@ def copy(self, deep=True):
             result = result._rename_fields(self.dtype.fields.keys())
         return result
 
-    def _rename_fields(self, names):
+    def _rename_fields(self, names) -> Self:
         """
         Return a StructColumn with the same field values as this StructColumn,
         but with the field names equal to `names`.
         """
-        dtype = cudf.core.dtypes.StructDtype(
+        dtype = StructDtype(
             {name: col.dtype for name, col in zip(names, self.children)}
         )
-        return StructColumn(
+        return StructColumn(  # type: ignore[return-value]
             data=None,
             size=self.size,
             dtype=dtype,
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index ee2f0317f8d..6a5e718c2c5 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -3354,6 +3354,7 @@ def interval_range(
         return IntervalIndex(data, closed=closed, name=name)
 
     interval_col = IntervalColumn(
+        data=None,
         dtype=IntervalDtype(left_col.dtype, closed),
         size=len(left_col),
         children=(left_col, right_col),
@@ -3425,6 +3426,7 @@ def __init__(
             elif isinstance(data.dtype, (pd.IntervalDtype, IntervalDtype)):
                 data = np.array([], dtype=data.dtype.subtype)
             interval_col = IntervalColumn(
+                None,
                 dtype=IntervalDtype(data.dtype, closed),
                 size=len(data),
                 children=(as_column(data), as_column(data)),
@@ -3436,12 +3438,13 @@ def __init__(
             if copy:
                 col = col.copy()
             interval_col = IntervalColumn(
+                data=None,
                 dtype=IntervalDtype(col.dtype.subtype, closed),
                 mask=col.mask,
                 size=col.size,
                 offset=col.offset,
                 null_count=col.null_count,
-                children=col.children,
+                children=col.children,  # type: ignore[arg-type]
             )
 
         if dtype:
@@ -3517,6 +3520,7 @@ def from_breaks(
         )
 
         interval_col = IntervalColumn(
+            data=None,
             dtype=IntervalDtype(left_col.dtype, closed),
             size=len(left_col),
             children=(left_col, right_col),

From a45af4a61ba582d6af839702148e9a6e2da69bc9 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Mon, 19 Aug 2024 19:06:28 -0700
Subject: [PATCH 695/842] Remove arrow_io_source (#16607)

The `arrow_io_source` in libcudf only existed to support Python's pyarrow NativeFile integration, which was deprecated and removed in #16589.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Thomas Li (https://github.com/lithomas1)
  - Bradley Dice (https://github.com/bdice)
  - Yunsong Wang (https://github.com/PointKernel)

URL: https://github.com/rapidsai/cudf/pull/16607
---
 cpp/CMakeLists.txt                            |   1 -
 cpp/include/cudf/io/arrow_io_source.hpp       |  93 ----------------
 cpp/src/io/utilities/arrow_io_source.cpp      |  87 ---------------
 cpp/tests/CMakeLists.txt                      |   4 -
 cpp/tests/io/arrow_io_source_test.cpp         | 103 ------------------
 cpp/tests/io/csv_test.cpp                     |  26 +----
 cpp/tests/io/json/json_test.cpp               |  26 -----
 .../pylibcudf/libcudf/io/arrow_io_source.pxd  |  14 ---
 8 files changed, 1 insertion(+), 353 deletions(-)
 delete mode 100644 cpp/include/cudf/io/arrow_io_source.hpp
 delete mode 100644 cpp/src/io/utilities/arrow_io_source.cpp
 delete mode 100644 cpp/tests/io/arrow_io_source_test.cpp
 delete mode 100644 python/pylibcudf/pylibcudf/libcudf/io/arrow_io_source.pxd

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index eeafc411874..ff00c484501 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -438,7 +438,6 @@ add_library(
   src/io/text/bgzip_data_chunk_source.cu
   src/io/text/bgzip_utils.cpp
   src/io/text/multibyte_split.cu
-  src/io/utilities/arrow_io_source.cpp
   src/io/utilities/base64_utilities.cpp
   src/io/utilities/column_buffer.cpp
   src/io/utilities/column_buffer_strings.cu
diff --git a/cpp/include/cudf/io/arrow_io_source.hpp b/cpp/include/cudf/io/arrow_io_source.hpp
deleted file mode 100644
index ed5c839cbb4..00000000000
--- a/cpp/include/cudf/io/arrow_io_source.hpp
+++ /dev/null
@@ -1,93 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "datasource.hpp"
-
-#include <cudf/utilities/export.hpp>
-
-#include <arrow/filesystem/filesystem.h>
-#include <arrow/io/interfaces.h>
-
-#include <memory>
-#include <string>
-#include <utility>
-
-namespace CUDF_EXPORT cudf {
-namespace io {
-/**
- * @addtogroup io_datasources
- * @{
- * @file
- */
-
-/**
- * @brief Implementation class for reading from an Apache Arrow file. The file
- * could be a memory-mapped file or other implementation supported by Arrow.
- */
-class arrow_io_source : public datasource {
- public:
-  /**
-   * @brief Constructs an object from an Apache Arrow Filesystem URI
-   *
-   * @param arrow_uri Apache Arrow Filesystem URI
-   */
-  explicit arrow_io_source(std::string const& arrow_uri);
-
-  /**
-   * @brief Constructs an object from an `arrow` source object.
-   *
-   * @param file The `arrow` object from which the data is read
-   */
-  explicit arrow_io_source(std::shared_ptr<arrow::io::RandomAccessFile> file)
-    : arrow_file(std::move(file))
-  {
-  }
-
-  /**
-   * @brief Returns a buffer with a subset of data from the `arrow` source.
-   *
-   * @param offset The offset in bytes from which to read
-   * @param size The number of bytes to read
-   * @return A buffer with the read data
-   */
-  std::unique_ptr<buffer> host_read(size_t offset, size_t size) override;
-
-  /**
-   * @brief Reads a selected range from the `arrow` source into a preallocated buffer.
-   *
-   * @param[in] offset The offset in bytes from which to read
-   * @param[in] size The number of bytes to read
-   * @param[out] dst The preallocated buffer to read into
-   * @return The number of bytes read
-   */
-  size_t host_read(size_t offset, size_t size, uint8_t* dst) override;
-  /**
-   * @brief Returns the size of the data in the `arrow` source.
-   *
-   * @return The size of the data in the `arrow` source
-   */
-  [[nodiscard]] size_t size() const override;
-
- private:
-  std::shared_ptr<arrow::fs::FileSystem> filesystem;
-  std::shared_ptr<arrow::io::RandomAccessFile> arrow_file;
-};
-
-/** @} */  // end of group
-}  // namespace io
-}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/src/io/utilities/arrow_io_source.cpp b/cpp/src/io/utilities/arrow_io_source.cpp
deleted file mode 100644
index 157240b8b08..00000000000
--- a/cpp/src/io/utilities/arrow_io_source.cpp
+++ /dev/null
@@ -1,87 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cudf/io/arrow_io_source.hpp>
-
-#include <arrow/buffer.h>
-#include <arrow/filesystem/filesystem.h>
-#include <arrow/result.h>
-
-#include <memory>
-#include <string>
-#include <utility>
-
-namespace cudf::io {
-
-/**
- * @brief Implementation for an owning buffer where `arrow::Buffer` holds the data.
- */
-class arrow_io_buffer : public datasource::buffer {
-  std::shared_ptr<arrow::Buffer> arrow_buffer;
-
- public:
-  explicit arrow_io_buffer(std::shared_ptr<arrow::Buffer> arrow_buffer)
-    : arrow_buffer(std::move(arrow_buffer))
-  {
-  }
-  [[nodiscard]] size_t size() const override { return arrow_buffer->size(); }
-  [[nodiscard]] uint8_t const* data() const override { return arrow_buffer->data(); }
-};
-
-arrow_io_source::arrow_io_source(std::string const& arrow_uri)
-{
-  std::string const uri_start_delimiter = "//";
-  std::string const uri_end_delimiter   = "?";
-
-  auto const result = arrow::fs::FileSystemFromUri(arrow_uri);
-  CUDF_EXPECTS(result.ok(), "Failed to generate Arrow Filesystem instance from URI.");
-  filesystem = result.ValueOrDie();
-
-  // Parse the path from the URI
-  auto const start = [&]() {
-    auto const delim_start = arrow_uri.find(uri_start_delimiter);
-    return delim_start == std::string::npos ? 0 : delim_start + uri_start_delimiter.size();
-  }();
-  auto const end  = arrow_uri.find(uri_end_delimiter) - start;
-  auto const path = arrow_uri.substr(start, end);
-
-  auto const in_stream = filesystem->OpenInputFile(path);
-  CUDF_EXPECTS(in_stream.ok(), "Failed to open Arrow RandomAccessFile");
-  arrow_file = in_stream.ValueOrDie();
-}
-
-std::unique_ptr<datasource::buffer> arrow_io_source::host_read(size_t offset, size_t size)
-{
-  auto const result = arrow_file->ReadAt(offset, size);
-  CUDF_EXPECTS(result.ok(), "Cannot read file data");
-  return std::make_unique<arrow_io_buffer>(result.ValueOrDie());
-}
-
-size_t arrow_io_source::host_read(size_t offset, size_t size, uint8_t* dst)
-{
-  auto const result = arrow_file->ReadAt(offset, size, dst);
-  CUDF_EXPECTS(result.ok(), "Cannot read file data");
-  return result.ValueOrDie();
-}
-
-[[nodiscard]] size_t arrow_io_source::size() const
-{
-  auto const result = arrow_file->GetSize();
-  CUDF_EXPECTS(result.ok(), "Cannot get file size");
-  return result.ValueOrDie();
-}
-
-}  // namespace cudf::io
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 006b36add0e..ac77a362e1c 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -321,7 +321,6 @@ ConfigureTest(
 ConfigureTest(JSON_WRITER_TEST io/json/json_writer.cpp)
 ConfigureTest(JSON_TYPE_CAST_TEST io/json/json_type_cast_test.cu)
 ConfigureTest(NESTED_JSON_TEST io/json/nested_json_test.cpp io/json/json_tree.cpp)
-ConfigureTest(ARROW_IO_SOURCE_TEST io/arrow_io_source_test.cpp)
 ConfigureTest(MULTIBYTE_SPLIT_TEST io/text/multibyte_split_test.cpp)
 ConfigureTest(JSON_QUOTE_NORMALIZATION io/json/json_quote_normalization_test.cpp)
 ConfigureTest(JSON_WHITESPACE_NORMALIZATION io/json/json_whitespace_normalization_test.cu)
@@ -334,9 +333,6 @@ target_link_libraries(DATA_CHUNK_SOURCE_TEST PRIVATE ZLIB::ZLIB)
 ConfigureTest(LOGICAL_STACK_TEST io/fst/logical_stack_test.cu)
 ConfigureTest(FST_TEST io/fst/fst_test.cu)
 ConfigureTest(TYPE_INFERENCE_TEST io/type_inference_test.cu)
-if(CUDF_ENABLE_ARROW_S3)
-  target_compile_definitions(ARROW_IO_SOURCE_TEST PRIVATE "S3_ENABLED")
-endif()
 
 # ##################################################################################################
 # * sort tests ------------------------------------------------------------------------------------
diff --git a/cpp/tests/io/arrow_io_source_test.cpp b/cpp/tests/io/arrow_io_source_test.cpp
deleted file mode 100644
index ffdf2c7e00f..00000000000
--- a/cpp/tests/io/arrow_io_source_test.cpp
+++ /dev/null
@@ -1,103 +0,0 @@
-/*
- * Copyright (c) 2021-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cudf_test/base_fixture.hpp>
-#include <cudf_test/column_utilities.hpp>
-#include <cudf_test/column_wrapper.hpp>
-#include <cudf_test/cudf_gtest.hpp>
-#include <cudf_test/table_utilities.hpp>
-#include <cudf_test/testing_main.hpp>
-#include <cudf_test/type_lists.hpp>
-
-#include <cudf/io/arrow_io_source.hpp>
-#include <cudf/io/json.hpp>
-#include <cudf/io/parquet.hpp>
-
-#include <arrow/filesystem/filesystem.h>
-#include <arrow/filesystem/s3fs.h>
-#include <arrow/io/api.h>
-#include <arrow/util/config.h>
-
-#include <fstream>
-#include <memory>
-#include <string>
-
-// Global environment for temporary files
-auto const temp_env = static_cast<cudf::test::TempDirTestEnvironment*>(
-  ::testing::AddGlobalTestEnvironment(new cudf::test::TempDirTestEnvironment));
-
-// Base test fixture for tests
-struct ArrowIOTest : public cudf::test::BaseFixture {};
-
-TEST_F(ArrowIOTest, URIFileSystem)
-{
-  const std::string file_name = temp_env->get_temp_dir() + "JsonLinesFileTest.json";
-  std::ofstream outfile(file_name, std::ofstream::out);
-  outfile << "{\"a\":11, \"b\":1.1}\n{\"a\":22, \"b\":2.2}";
-  outfile.close();
-
-  std::string file_uri = "file://" + file_name;
-  auto datasource      = std::make_unique<cudf::io::arrow_io_source>(file_uri);
-
-  // Populate the JSON Reader Options
-  cudf::io::json_reader_options options =
-    cudf::io::json_reader_options::builder(cudf::io::source_info(datasource.get())).lines(true);
-
-  // Read the JSON file from the LocalFileSystem
-  cudf::io::table_with_metadata tbl = cudf::io::read_json(options);
-
-  ASSERT_EQ(2, tbl.tbl->num_columns());
-  ASSERT_EQ(2, tbl.tbl->num_rows());
-}
-
-TEST_F(ArrowIOTest, S3FileSystem)
-{
-  std::string s3_uri = "s3://rapidsai-data/cudf/test/tips.parquet?region=us-east-2";
-
-  // Check to see if Arrow was built with support for S3. If not, ensure this
-  // test throws. If so, validate the S3 file contents.
-  auto const s3_unsupported = arrow::fs::FileSystemFromUri(s3_uri).status().IsNotImplemented();
-  if (s3_unsupported) {
-    EXPECT_THROW(std::make_unique<cudf::io::arrow_io_source>(s3_uri), cudf::logic_error);
-  } else {
-    auto datasource = std::make_unique<cudf::io::arrow_io_source>(s3_uri);
-
-    // Populate the Parquet Reader Options
-    cudf::io::source_info src(datasource.get());
-    std::vector<std::string> single_column;
-    single_column.insert(single_column.begin(), "total_bill");
-    cudf::io::parquet_reader_options_builder builder(src);
-    cudf::io::parquet_reader_options options = builder.columns(single_column).build();
-
-    // Read the Parquet file from S3
-    cudf::io::table_with_metadata tbl = cudf::io::read_parquet(options);
-
-    ASSERT_EQ(1, tbl.tbl->num_columns());  // Only single column specified in reader_options
-    ASSERT_EQ(244, tbl.tbl->num_rows());   // known number of rows from the S3 file
-  }
-
-#ifdef ARROW_S3
-  if (!s3_unsupported) {
-    // Verify that we are using Arrow with S3, and call finalize
-    // https://github.com/apache/arrow/issues/36974
-    // This needs to be in a separate conditional to ensure we call
-    // finalize after all arrow_io_source instances have been deleted.
-    [[maybe_unused]] auto _ = arrow::fs::EnsureS3Finalized();
-  }
-#endif
-}
-
-CUDF_TEST_PROGRAM_MAIN()
diff --git a/cpp/tests/io/csv_test.cpp b/cpp/tests/io/csv_test.cpp
index ff433264446..dc14824d834 100644
--- a/cpp/tests/io/csv_test.cpp
+++ b/cpp/tests/io/csv_test.cpp
@@ -25,8 +25,8 @@
 
 #include <cudf/detail/iterator.cuh>
 #include <cudf/fixed_point/fixed_point.hpp>
-#include <cudf/io/arrow_io_source.hpp>
 #include <cudf/io/csv.hpp>
+#include <cudf/io/datasource.hpp>
 #include <cudf/strings/convert/convert_datetime.hpp>
 #include <cudf/strings/convert/convert_fixed_point.hpp>
 #include <cudf/strings/strings_column_view.hpp>
@@ -1197,30 +1197,6 @@ TEST_F(CsvReaderTest, HeaderOnlyFile)
   EXPECT_EQ(3, view.num_columns());
 }
 
-TEST_F(CsvReaderTest, ArrowFileSource)
-{
-  auto filepath = temp_env->get_temp_dir() + "ArrowFileSource.csv";
-  {
-    std::ofstream outfile(filepath, std::ofstream::out);
-    outfile << "A\n9\n8\n7\n6\n5\n4\n3\n2\n";
-  }
-
-  std::shared_ptr<arrow::io::ReadableFile> infile;
-  ASSERT_TRUE(arrow::io::ReadableFile::Open(filepath).Value(&infile).ok());
-
-  auto arrow_source = cudf::io::arrow_io_source{infile};
-  cudf::io::csv_reader_options in_opts =
-    cudf::io::csv_reader_options::builder(cudf::io::source_info{&arrow_source})
-      .dtypes({dtype<int8_t>()});
-  auto result = cudf::io::read_csv(in_opts);
-
-  auto const view = result.tbl->view();
-  EXPECT_EQ(1, view.num_columns());
-  ASSERT_EQ(type_id::INT8, view.column(0).type().id());
-
-  expect_column_data_equal(std::vector<int8_t>{9, 8, 7, 6, 5, 4, 3, 2}, view.column(0));
-}
-
 TEST_F(CsvReaderTest, InvalidFloatingPoint)
 {
   auto const filepath = temp_env->get_temp_dir() + "InvalidFloatingPoint.csv";
diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp
index 0a485e26b71..576a698ba31 100644
--- a/cpp/tests/io/json/json_test.cpp
+++ b/cpp/tests/io/json/json_test.cpp
@@ -26,7 +26,6 @@
 #include <cudf_test/type_lists.hpp>
 
 #include <cudf/detail/iterator.cuh>
-#include <cudf/io/arrow_io_source.hpp>
 #include <cudf/io/json.hpp>
 #include <cudf/strings/convert/convert_fixed_point.hpp>
 #include <cudf/strings/repeat_strings.hpp>
@@ -958,31 +957,6 @@ TEST_F(JsonReaderTest, NoDataFileValues)
   EXPECT_EQ(0, view.num_columns());
 }
 
-TEST_F(JsonReaderTest, ArrowFileSource)
-{
-  const std::string fname = temp_env->get_temp_dir() + "ArrowFileSource.csv";
-
-  std::ofstream outfile(fname, std::ofstream::out);
-  outfile << "[9]\n[8]\n[7]\n[6]\n[5]\n[4]\n[3]\n[2]\n";
-  outfile.close();
-
-  std::shared_ptr<arrow::io::ReadableFile> infile;
-  ASSERT_TRUE(arrow::io::ReadableFile::Open(fname).Value(&infile).ok());
-
-  auto arrow_source = cudf::io::arrow_io_source{infile};
-  cudf::io::json_reader_options in_options =
-    cudf::io::json_reader_options::builder(cudf::io::source_info{&arrow_source})
-      .dtypes({dtype<int8_t>()})
-      .lines(true);
-
-  cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
-
-  EXPECT_EQ(result.tbl->num_columns(), 1);
-  EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT8);
-
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), int8_wrapper{{9, 8, 7, 6, 5, 4, 3, 2}});
-}
-
 TEST_P(JsonReaderParamTest, InvalidFloatingPoint)
 {
   auto const test_opt       = GetParam();
diff --git a/python/pylibcudf/pylibcudf/libcudf/io/arrow_io_source.pxd b/python/pylibcudf/pylibcudf/libcudf/io/arrow_io_source.pxd
deleted file mode 100644
index 54a913a9ce3..00000000000
--- a/python/pylibcudf/pylibcudf/libcudf/io/arrow_io_source.pxd
+++ /dev/null
@@ -1,14 +0,0 @@
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-
-cimport pylibcudf.libcudf.io.datasource as cudf_io_datasource
-from libcpp.memory cimport shared_ptr
-from libcpp.string cimport string
-from pyarrow.includes.libarrow cimport CRandomAccessFile
-
-
-cdef extern from "cudf/io/arrow_io_source.hpp" \
-        namespace "cudf::io" nogil:
-
-    cdef cppclass arrow_io_source(cudf_io_datasource.datasource):
-        arrow_io_source(const string& arrow_uri) except +
-        arrow_io_source(shared_ptr[CRandomAccessFile]) except +

From 3ac409dc26437deb77d30f64ec148121394878e4 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Mon, 19 Aug 2024 21:18:11 -0700
Subject: [PATCH 696/842] Fix C++ and Cython io types (#16610)

The C++ I/O types were previously not specifying a base type despite the fact that the Cython code was relying on the base being an int32. This has apparently never bitten us before, but in theory this could go very wrong since it leaves the underlying type up to the compiler and if the C++ binary used something other than an int32 that would result in an ABI incompatibility with the Python build that would produce spurious results.

While fixing this, I also noticed that the Cython contained a number of erroneous (likely outdated) declarations. Since Cython extern declarations are simply an indicate to Cython of how to resolve a function call _if_ it appears in compiled Cython code, these were not causing any build failures because these were all unused APIs, so I removed them from the Cython with no further changes needed.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Bradley Dice (https://github.com/bdice)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/16610
---
 cpp/include/cudf/io/types.hpp                 | 12 ++---
 .../pylibcudf/pylibcudf/libcudf/io/types.pxd  | 50 ++++++++-----------
 2 files changed, 27 insertions(+), 35 deletions(-)

diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp
index 3df737413fa..a34881942ce 100644
--- a/cpp/include/cudf/io/types.hpp
+++ b/cpp/include/cudf/io/types.hpp
@@ -54,7 +54,7 @@ namespace io {
 /**
  * @brief Compression algorithms
  */
-enum class compression_type {
+enum class compression_type : int32_t {
   NONE,    ///< No compression
   AUTO,    ///< Automatically detect or select compression format
   SNAPPY,  ///< Snappy format, using byte-oriented LZ77
@@ -72,7 +72,7 @@ enum class compression_type {
 /**
  * @brief Data source or destination types
  */
-enum class io_type {
+enum class io_type : int32_t {
   FILEPATH,          ///< Input/output is a file path
   HOST_BUFFER,       ///< Input/output is a buffer in host memory
   DEVICE_BUFFER,     ///< Input/output is a buffer in device memory
@@ -83,7 +83,7 @@ enum class io_type {
 /**
  * @brief Behavior when handling quotations in field data
  */
-enum class quote_style {
+enum class quote_style : int32_t {
   MINIMAL,     ///< Quote only fields which contain special characters
   ALL,         ///< Quote all fields
   NONNUMERIC,  ///< Quote all non-numeric fields
@@ -93,7 +93,7 @@ enum class quote_style {
 /**
  * @brief Column statistics granularity type for parquet/orc writers
  */
-enum statistics_freq {
+enum statistics_freq : int32_t {
   STATISTICS_NONE     = 0,  ///< No column statistics
   STATISTICS_ROWGROUP = 1,  ///< Per-Rowgroup column statistics
   STATISTICS_PAGE     = 2,  ///< Per-page column statistics
@@ -103,7 +103,7 @@ enum statistics_freq {
 /**
  * @brief Valid encodings for use with `column_in_metadata::set_encoding()`
  */
-enum class column_encoding {
+enum class column_encoding : int32_t {
   // Common encodings:
   USE_DEFAULT = -1,  ///< No encoding has been requested, use default encoding
   DICTIONARY,        ///< Use dictionary encoding
@@ -222,7 +222,7 @@ class writer_compression_statistics {
 /**
  * @brief Control use of dictionary encoding for parquet writer
  */
-enum dictionary_policy {
+enum dictionary_policy : int32_t {
   NEVER    = 0,  ///< Never use dictionary encoding
   ADAPTIVE = 1,  ///< Use dictionary when it will not impact compression
   ALWAYS   = 2   ///< Use dictionary regardless of impact on compression
diff --git a/python/pylibcudf/pylibcudf/libcudf/io/types.pxd b/python/pylibcudf/pylibcudf/libcudf/io/types.pxd
index a3d99807876..5f3be2f0727 100644
--- a/python/pylibcudf/pylibcudf/libcudf/io/types.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/io/types.pxd
@@ -6,12 +6,10 @@ cimport pylibcudf.libcudf.table.table_view as cudf_table_view
 from libc.stdint cimport int32_t, uint8_t
 from libcpp cimport bool
 from libcpp.map cimport map
-from libcpp.memory cimport shared_ptr, unique_ptr
-from libcpp.pair cimport pair
+from libcpp.memory cimport unique_ptr
 from libcpp.string cimport string
 from libcpp.unordered_map cimport unordered_map
 from libcpp.vector cimport vector
-from pyarrow.includes.libarrow cimport CRandomAccessFile
 from pylibcudf.libcudf.table.table cimport table
 from pylibcudf.libcudf.types cimport size_type
 
@@ -42,32 +40,32 @@ cdef extern from "cudf/io/types.hpp" \
     cpdef enum class io_type(int32_t):
         FILEPATH
         HOST_BUFFER
+        DEVICE_BUFFER
         VOID
         USER_IMPLEMENTED
 
     cpdef enum class statistics_freq(int32_t):
-        STATISTICS_NONE = 0,
-        STATISTICS_ROWGROUP = 1,
-        STATISTICS_PAGE = 2,
-        STATISTICS_COLUMN = 3,
+        STATISTICS_NONE,
+        STATISTICS_ROWGROUP,
+        STATISTICS_PAGE,
+        STATISTICS_COLUMN,
 
     cpdef enum class dictionary_policy(int32_t):
-        NEVER = 0,
-        ADAPTIVE = 1,
-        ALWAYS = 2,
-
-    cdef extern from "cudf/io/types.hpp" namespace "cudf::io" nogil:
-        cpdef enum class column_encoding(int32_t):
-            USE_DEFAULT = -1
-            DICTIONARY = 0
-            PLAIN = 1
-            DELTA_BINARY_PACKED = 2
-            DELTA_LENGTH_BYTE_ARRAY =3
-            DELTA_BYTE_ARRAY = 4
-            BYTE_STREAM_SPLIT = 5
-            DIRECT = 6
-            DIRECT_V2 = 7
-            DICTIONARY_V2 = 8
+        NEVER,
+        ADAPTIVE,
+        ALWAYS,
+
+    cpdef enum class column_encoding(int32_t):
+        USE_DEFAULT
+        DICTIONARY
+        PLAIN
+        DELTA_BINARY_PACKED
+        DELTA_LENGTH_BYTE_ARRAY
+        DELTA_BYTE_ARRAY
+        BYTE_STREAM_SPLIT
+        DIRECT
+        DIRECT_V2
+        DICTIONARY_V2
 
     cdef cppclass column_name_info:
         string name
@@ -76,7 +74,6 @@ cdef extern from "cudf/io/types.hpp" \
     cdef cppclass table_metadata:
         table_metadata() except +
 
-        vector[string] column_names
         map[string, string] user_data
         vector[unordered_map[string, string]] per_file_user_data
         vector[column_name_info] schema_info
@@ -120,10 +117,7 @@ cdef extern from "cudf/io/types.hpp" \
         host_buffer(const char* data, size_t size)
 
     cdef cppclass source_info:
-        io_type type
         const vector[string]& filepaths() except +
-        const vector[host_buffer]& buffers() except +
-        vector[shared_ptr[CRandomAccessFile]] files
 
         source_info() except +
         source_info(const vector[string] &filepaths) except +
@@ -132,9 +126,7 @@ cdef extern from "cudf/io/types.hpp" \
         source_info(const vector[cudf_io_datasource.datasource*] &datasources) except +
 
     cdef cppclass sink_info:
-        io_type type
         const vector[string]& filepaths()
-        const vector[vector[char] *]& buffers()
         const vector[cudf_io_data_sink.data_sink *]& user_sinks()
 
         sink_info() except +

From 2f7d35435db2b5ed9ead96cf43e2a710db5e5e6d Mon Sep 17 00:00:00 2001
From: Nicolas <denoyelle.nicolas@gmail.com>
Date: Tue, 20 Aug 2024 03:52:34 -0500
Subject: [PATCH 697/842] bug-fix: cudf/io/json.hpp use after move (#16609)

This PR fixes a use after move in json header.
The fix simply shifts the attributes to access the object value before moving it.
Closes https://github.com/rapidsai/cudf/issues/16608

Authors:
  - Nicolas (https://github.com/NicolasDenoyelle)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - David Wendt (https://github.com/davidwendt)
  - Muhammad Haseeb (https://github.com/mhaseeb123)

URL: https://github.com/rapidsai/cudf/pull/16609
---
 cpp/include/cudf/io/json.hpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
index 0cb39d15cd5..fde1857cb7f 100644
--- a/cpp/include/cudf/io/json.hpp
+++ b/cpp/include/cudf/io/json.hpp
@@ -696,6 +696,8 @@ class json_writer_options_builder;
 class json_writer_options {
   // Specify the sink to use for writer output
   sink_info _sink;
+  // maximum number of rows to write in each chunk (limits memory use)
+  size_type _rows_per_chunk = std::numeric_limits<size_type>::max();
   // Set of columns to output
   table_view _table;
   // string to use for null entries
@@ -704,8 +706,6 @@ class json_writer_options {
   bool _include_nulls = false;
   // Indicates whether to use JSON lines for records format
   bool _lines = false;
-  // maximum number of rows to write in each chunk (limits memory use)
-  size_type _rows_per_chunk = std::numeric_limits<size_type>::max();
   // string to use for values != 0 in INT8 types (default 'true')
   std::string _true_value = std::string{"true"};
   // string to use for values == 0 in INT8 types (default 'false')
@@ -720,7 +720,7 @@ class json_writer_options {
    * @param table Table to be written to output
    */
   explicit json_writer_options(sink_info sink, table_view table)
-    : _sink(std::move(sink)), _table(std::move(table)), _rows_per_chunk(table.num_rows())
+    : _sink(std::move(sink)), _rows_per_chunk(table.num_rows()), _table(std::move(table))
   {
   }
 

From 1cccf3eeaee50cd69107b3c54ee349720233d8c6 Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Tue, 20 Aug 2024 16:32:29 +0200
Subject: [PATCH 698/842] Replace usages of `thrust::optional` with
 `std::optional` (#15091)

We want to get rid of thrust types in API boundaries so replace them by the better suited std types

Authors:
  - Michael Schellenberger Costa (https://github.com/miscco)
  - Bradley Dice (https://github.com/bdice)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - https://github.com/nvdbaranec
  - Nghia Truong (https://github.com/ttnghia)
  - Bradley Dice (https://github.com/bdice)
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/15091
---
 .../cudf/column/column_device_view.cuh        | 28 ++++-----
 cpp/include/cudf/detail/copy_if_else.cuh      |  6 +-
 cpp/include/cudf/detail/indexalator.cuh       | 12 ++--
 cpp/include/cudf/detail/iterator.cuh          | 26 ++++----
 cpp/include/cudf/json/json.hpp                |  2 -
 .../strings/detail/convert/fixed_point.cuh    |  8 +--
 .../cudf/strings/detail/copy_if_else.cuh      |  6 +-
 .../cudf/table/experimental/row_operators.cuh |  6 +-
 cpp/src/binaryop/binaryop.cpp                 |  4 +-
 cpp/src/io/orc/orc.hpp                        |  7 ++-
 cpp/src/io/orc/writer_impl.cu                 |  6 +-
 .../io/parquet/compact_protocol_reader.cpp    |  8 +--
 cpp/src/io/parquet/parquet.hpp                | 62 +++++++++----------
 cpp/src/io/parquet/parquet_gpu.hpp            | 14 ++---
 cpp/src/io/parquet/predicate_pushdown.cpp     |  6 +-
 cpp/src/io/parquet/reader_impl.cpp            |  2 +-
 cpp/src/io/parquet/reader_impl_chunking.cu    |  6 +-
 cpp/src/io/parquet/reader_impl_helpers.cpp    |  6 +-
 cpp/src/io/parquet/writer_impl.cu             |  8 +--
 cpp/src/json/json_path.cu                     | 22 +++----
 cpp/src/lists/contains.cu                     |  1 -
 cpp/src/lists/explode.cu                      | 14 ++---
 cpp/src/strings/convert/convert_datetime.cu   | 10 +--
 cpp/src/strings/regex/regex.cuh               |  4 +-
 cpp/src/strings/regex/regex.inl               |  6 +-
 cpp/src/strings/replace/multi_re.cu           |  2 +-
 cpp/src/transform/row_bit_count.cu            | 18 +++---
 cpp/tests/io/parquet_common.cpp               |  2 +-
 cpp/tests/io/parquet_common.hpp               |  2 +-
 cpp/tests/iterator/indexalator_test.cu        | 11 ++--
 cpp/tests/iterator/offsetalator_test.cu       |  3 +-
 cpp/tests/iterator/optional_iterator_test.cuh | 25 ++++----
 .../optional_iterator_test_numeric.cu         | 10 +--
 33 files changed, 176 insertions(+), 177 deletions(-)

diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh
index 89fe59bfeaa..c3238cb94fd 100644
--- a/cpp/include/cudf/column/column_device_view.cuh
+++ b/cpp/include/cudf/column/column_device_view.cuh
@@ -32,9 +32,9 @@
 
 #include <rmm/cuda_stream_view.hpp>
 
+#include <cuda/std/optional>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
-#include <thrust/optional.h>
 #include <thrust/pair.h>
 
 #include <algorithm>
@@ -614,7 +614,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base {
   /**
    * @brief Return an optional iterator to the first element of the column.
    *
-   * Dereferencing the returned iterator returns a `thrust::optional<T>`.
+   * Dereferencing the returned iterator returns a `cuda::std::optional<T>`.
    *
    * The element of this iterator contextually converts to bool. The conversion returns true
    * if the object contains a value and false if it does not contain a value.
@@ -739,7 +739,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base {
   /**
    * @brief Return an optional iterator to the element following the last element of the column.
    *
-   * The returned iterator represents a `thrust::optional<T>` element.
+   * The returned iterator represents a `cuda::std::optional<T>` element.
    *
    * This function does not participate in overload resolution if
    * `column_device_view::has_element_accessor<T>()` is false.
@@ -1272,21 +1272,21 @@ struct value_accessor {
  * @brief optional accessor of a column
  *
  *
- * The optional_accessor always returns a `thrust::optional` of `column[i]`. The validity
+ * The optional_accessor always returns a `cuda::std::optional` of `column[i]`. The validity
  * of the optional is determined by the `Nullate` parameter which may be one of the following:
  *
  * - `nullate::YES` means that the column supports nulls and the optional returned
  *    might be valid or invalid.
  *
  * - `nullate::NO` means the caller attests that the column has no null values,
- *    no checks will occur and `thrust::optional{column[i]}` will be
+ *    no checks will occur and `cuda::std::optional{column[i]}` will be
  *    return for each `i`.
  *
  * - `nullate::DYNAMIC` defers the assumption of nullability to runtime and the caller
  *    specifies if the column has nulls at runtime.
- *    For `DYNAMIC{true}` the return value will be `thrust::optional{column[i]}` if
- *      element `i` is not null and `thrust::optional{}` if element `i` is null.
- *    For `DYNAMIC{false}` the return value will always be `thrust::optional{column[i]}`.
+ *    For `DYNAMIC{true}` the return value will be `cuda::std::optional{column[i]}` if
+ *      element `i` is not null and `cuda::std::optional{}` if element `i` is null.
+ *    For `DYNAMIC{false}` the return value will always be `cuda::std::optional{column[i]}`.
  *
  * @throws cudf::logic_error if column datatype and template T type mismatch.
  * @throws cudf::logic_error if the column is not nullable and `with_nulls` evaluates to true
@@ -1312,19 +1312,19 @@ struct optional_accessor {
   }
 
   /**
-   * @brief Returns a `thrust::optional` of `column[i]`.
+   * @brief Returns a `cuda::std::optional` of `column[i]`.
    *
    * @param i The index of the element to return
-   * @return A `thrust::optional` that contains the value of `column[i]` is not null. If that
+   * @return A `cuda::std::optional` that contains the value of `column[i]` is not null. If that
    * element is null, the resulting optional will not contain a value.
    */
-  __device__ inline thrust::optional<T> operator()(cudf::size_type i) const
+  __device__ inline cuda::std::optional<T> operator()(cudf::size_type i) const
   {
     if (has_nulls) {
-      return (col.is_valid_nocheck(i)) ? thrust::optional<T>{col.element<T>(i)}
-                                       : thrust::optional<T>{thrust::nullopt};
+      return (col.is_valid_nocheck(i)) ? cuda::std::optional<T>{col.element<T>(i)}
+                                       : cuda::std::optional<T>{cuda::std::nullopt};
     }
-    return thrust::optional<T>{col.element<T>(i)};
+    return cuda::std::optional<T>{col.element<T>(i)};
   }
 
   Nullate has_nulls{};  ///< Indicates if the `col` should be checked for nulls.
diff --git a/cpp/include/cudf/detail/copy_if_else.cuh b/cpp/include/cudf/detail/copy_if_else.cuh
index 8418e279ce7..d260a4591b7 100644
--- a/cpp/include/cudf/detail/copy_if_else.cuh
+++ b/cpp/include/cudf/detail/copy_if_else.cuh
@@ -25,8 +25,8 @@
 #include <rmm/device_scalar.hpp>
 #include <rmm/resource_ref.hpp>
 
+#include <cuda/std/optional>
 #include <thrust/iterator/iterator_traits.h>
-#include <thrust/optional.h>
 
 namespace cudf {
 namespace detail {
@@ -70,7 +70,7 @@ __launch_bounds__(block_size) CUDF_KERNEL
   while (warp_cur <= warp_end) {
     auto const index = static_cast<size_type>(tidx);
     auto const opt_value =
-      (index < end) ? (filter(index) ? lhs[index] : rhs[index]) : thrust::nullopt;
+      (index < end) ? (filter(index) ? lhs[index] : rhs[index]) : cuda::std::nullopt;
     if (opt_value) { out.element<T>(index) = static_cast<T>(*opt_value); }
 
     // update validity
@@ -156,7 +156,7 @@ std::unique_ptr<column> copy_if_else(bool nullable,
                                      rmm::cuda_stream_view stream,
                                      rmm::device_async_resource_ref mr)
 {
-  // This is the type of the thrust::optional element in the passed iterators
+  // This is the type of the cuda::std::optional element in the passed iterators
   using Element = typename thrust::iterator_traits<LeftIter>::value_type::value_type;
 
   size_type size           = std::distance(lhs_begin, lhs_end);
diff --git a/cpp/include/cudf/detail/indexalator.cuh b/cpp/include/cudf/detail/indexalator.cuh
index b5d57da6cd5..ec7b1c3e6b6 100644
--- a/cpp/include/cudf/detail/indexalator.cuh
+++ b/cpp/include/cudf/detail/indexalator.cuh
@@ -22,9 +22,9 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/utilities/traits.hpp>
 
+#include <cuda/std/optional>
 #include <thrust/iterator/constant_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
-#include <thrust/optional.h>
 #include <thrust/pair.h>
 
 namespace cudf {
@@ -376,10 +376,10 @@ struct indexalator_factory {
       iter = make_input_iterator(col);
     }
 
-    __device__ thrust::optional<size_type> operator()(size_type i) const
+    __device__ cuda::std::optional<size_type> operator()(size_type i) const
     {
-      return has_nulls && !bit_is_set(null_mask, i + offset) ? thrust::nullopt
-                                                             : thrust::make_optional(iter[i]);
+      return has_nulls && !bit_is_set(null_mask, i + offset) ? cuda::std::nullopt
+                                                             : cuda::std::make_optional(iter[i]);
     }
   };
 
@@ -400,9 +400,9 @@ struct indexalator_factory {
       iter = indexalator_factory::make_input_iterator(input);
     }
 
-    __device__ thrust::optional<size_type> operator()(size_type) const
+    __device__ cuda::std::optional<size_type> operator()(size_type) const
     {
-      return is_null ? thrust::nullopt : thrust::make_optional(*iter);
+      return is_null ? cuda::std::nullopt : cuda::std::make_optional(*iter);
     }
   };
 
diff --git a/cpp/include/cudf/detail/iterator.cuh b/cpp/include/cudf/detail/iterator.cuh
index 9e6227ec19b..4349e1b70fd 100644
--- a/cpp/include/cudf/detail/iterator.cuh
+++ b/cpp/include/cudf/detail/iterator.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -37,10 +37,10 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/scalar/scalar_device_view.cuh>
 
+#include <cuda/std/optional>
 #include <thrust/iterator/constant_iterator.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
-#include <thrust/optional.h>
 #include <thrust/pair.h>
 
 #include <utility>
@@ -186,7 +186,7 @@ auto make_null_replacement_iterator(column_device_view const& column,
 /**
  * @brief Constructs an optional iterator over a column's values and its validity.
  *
- * Dereferencing the returned iterator returns a `thrust::optional<Element>`.
+ * Dereferencing the returned iterator returns a `cuda::std::optional<Element>`.
  *
  * The element of this iterator contextually converts to bool. The conversion returns true
  * if the object contains a value and false if it does not contain a value.
@@ -237,7 +237,7 @@ auto make_null_replacement_iterator(column_device_view const& column,
  * @param column The column to iterate
  * @param has_nulls Indicates whether `column` is checked for nulls.
  * @return Iterator that returns valid column elements and the validity of the
- * element in a `thrust::optional`
+ * element in a `cuda::std::optional`
  */
 template <typename Element, typename Nullate>
 auto make_optional_iterator(column_device_view const& column, Nullate has_nulls)
@@ -393,7 +393,7 @@ auto inline make_scalar_iterator(scalar const& scalar_value)
 /**
  * @brief Optional accessor for a scalar
  *
- * The `scalar_optional_accessor` always returns a `thrust::optional` of the scalar.
+ * The `scalar_optional_accessor` always returns a `cuda::std::optional` of the scalar.
  * The validity of the optional is determined by the `Nullate` parameter which may
  * be one of the following:
  *
@@ -401,14 +401,14 @@ auto inline make_scalar_iterator(scalar const& scalar_value)
  *    will contain a value only if the scalar is valid.
  *
  * - `nullate::NO` means the caller attests that the scalar will always be valid,
- *    no checks will occur and `thrust::optional{column[i]}` will return a value
+ *    no checks will occur and `cuda::std::optional{column[i]}` will return a value
  *    for each `i`.
  *
  * - `nullate::DYNAMIC` defers the assumption of nullability to runtime and the caller
  *    specifies if the scalar may be valid or invalid.
- *    For `DYNAMIC{true}` the return value will be a `thrust::optional{scalar}` when the
- *      scalar is valid and a `thrust::optional{}` when the scalar is invalid.
- *    For `DYNAMIC{false}` the return value will always be a `thrust::optional{scalar}`.
+ *    For `DYNAMIC{true}` the return value will be a `cuda::std::optional{scalar}` when the
+ *      scalar is valid and a `cuda::std::optional{}` when the scalar is invalid.
+ *    For `DYNAMIC{false}` the return value will always be a `cuda::std::optional{scalar}`.
  *
  * @throws `cudf::logic_error` if scalar datatype and Element type mismatch.
  *
@@ -418,7 +418,7 @@ auto inline make_scalar_iterator(scalar const& scalar_value)
 template <typename Element, typename Nullate>
 struct scalar_optional_accessor : public scalar_value_accessor<Element> {
   using super_t    = scalar_value_accessor<Element>;
-  using value_type = thrust::optional<Element>;
+  using value_type = cuda::std::optional<Element>;
 
   scalar_optional_accessor(scalar const& scalar_value, Nullate with_nulls)
     : scalar_value_accessor<Element>(scalar_value), has_nulls{with_nulls}
@@ -427,7 +427,7 @@ struct scalar_optional_accessor : public scalar_value_accessor<Element> {
 
   __device__ inline value_type const operator()(size_type) const
   {
-    if (has_nulls && !super_t::dscalar.is_valid()) { return value_type{thrust::nullopt}; }
+    if (has_nulls && !super_t::dscalar.is_valid()) { return value_type{cuda::std::nullopt}; }
 
     if constexpr (cudf::is_fixed_point<Element>()) {
       using namespace numeric;
@@ -519,7 +519,7 @@ struct scalar_representation_pair_accessor : public scalar_value_accessor<Elemen
 /**
  * @brief Constructs an optional iterator over a scalar's values and its validity.
  *
- * Dereferencing the returned iterator returns a `thrust::optional<Element>`.
+ * Dereferencing the returned iterator returns a `cuda::std::optional<Element>`.
  *
  * The element of this iterator contextually converts to bool. The conversion returns true
  * if the object contains a value and false if it does not contain a value.
@@ -575,7 +575,7 @@ struct scalar_representation_pair_accessor : public scalar_value_accessor<Elemen
  *
  * @param scalar_value The scalar to be returned by the iterator.
  * @param has_nulls Indicates if the scalar value may be invalid.
- * @return Iterator that returns scalar and the validity of the scalar in a thrust::optional
+ * @return Iterator that returns scalar and the validity of the scalar in a cuda::std::optional
  */
 template <typename Element, typename Nullate>
 auto inline make_optional_iterator(scalar const& scalar_value, Nullate has_nulls)
diff --git a/cpp/include/cudf/json/json.hpp b/cpp/include/cudf/json/json.hpp
index 48d5dcf7727..403374c536d 100644
--- a/cpp/include/cudf/json/json.hpp
+++ b/cpp/include/cudf/json/json.hpp
@@ -22,8 +22,6 @@
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
-#include <thrust/optional.h>
-
 namespace CUDF_EXPORT cudf {
 
 /**
diff --git a/cpp/include/cudf/strings/detail/convert/fixed_point.cuh b/cpp/include/cudf/strings/detail/convert/fixed_point.cuh
index 5f51da967d3..8440805960e 100644
--- a/cpp/include/cudf/strings/detail/convert/fixed_point.cuh
+++ b/cpp/include/cudf/strings/detail/convert/fixed_point.cuh
@@ -17,8 +17,8 @@
 
 #include <cudf/fixed_point/temporary.hpp>
 
+#include <cuda/std/optional>
 #include <cuda/std/type_traits>
-#include <thrust/optional.h>
 #include <thrust/pair.h>
 
 namespace cudf {
@@ -88,7 +88,7 @@ __device__ inline thrust::pair<UnsignedDecimalType, int32_t> parse_integer(
  * @return Integer value of the exponent
  */
 template <bool check_only = false>
-__device__ thrust::optional<int32_t> parse_exponent(char const* iter, char const* iter_end)
+__device__ cuda::std::optional<int32_t> parse_exponent(char const* iter, char const* iter_end)
 {
   constexpr uint32_t exponent_max = static_cast<uint32_t>(std::numeric_limits<int32_t>::max());
 
@@ -105,12 +105,12 @@ __device__ thrust::optional<int32_t> parse_exponent(char const* iter, char const
   while (iter < iter_end) {
     auto const ch = *iter++;
     if (ch < '0' || ch > '9') {
-      if (check_only) { return thrust::nullopt; }
+      if (check_only) { return cuda::std::nullopt; }
       break;
     }
 
     uint32_t exp_check = static_cast<uint32_t>(exp_ten * 10) + static_cast<uint32_t>(ch - '0');
-    if (check_only && (exp_check > exponent_max)) { return thrust::nullopt; }  // check overflow
+    if (check_only && (exp_check > exponent_max)) { return cuda::std::nullopt; }  // check overflow
     exp_ten = static_cast<int32_t>(exp_check);
   }
 
diff --git a/cpp/include/cudf/strings/detail/copy_if_else.cuh b/cpp/include/cudf/strings/detail/copy_if_else.cuh
index 4db7651330b..213a41ca596 100644
--- a/cpp/include/cudf/strings/detail/copy_if_else.cuh
+++ b/cpp/include/cudf/strings/detail/copy_if_else.cuh
@@ -25,8 +25,8 @@
 #include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
+#include <cuda/std/optional>
 #include <thrust/iterator/counting_iterator.h>
-#include <thrust/optional.h>
 #include <thrust/transform.h>
 
 namespace cudf {
@@ -41,9 +41,9 @@ namespace detail {
  * ```
  *
  * @tparam StringIterLeft A random access iterator whose value_type is
- * `thrust::optional<string_view>` where the `optional` has a value iff the element is valid.
+ * `cuda::std::optional<string_view>` where the `optional` has a value iff the element is valid.
  * @tparam StringIterRight A random access iterator whose value_type is
- * `thrust::optional<string_view>` where the `optional` has a value iff the element is valid.
+ * `cuda::std::optional<string_view>` where the `optional` has a value iff the element is valid.
  * @tparam Filter Functor that takes an index and returns a boolean.
  *
  * @param lhs_begin Start of first set of data. Used when `filter_fn` returns true.
diff --git a/cpp/include/cudf/table/experimental/row_operators.cuh b/cpp/include/cudf/table/experimental/row_operators.cuh
index f05e5f4ca5c..3f33c70c29a 100644
--- a/cpp/include/cudf/table/experimental/row_operators.cuh
+++ b/cpp/include/cudf/table/experimental/row_operators.cuh
@@ -211,7 +211,7 @@ struct sorting_physical_element_comparator {
   }
 };
 
-using optional_dremel_view = thrust::optional<detail::dremel_device_view const>;
+using optional_dremel_view = cuda::std::optional<detail::dremel_device_view const>;
 
 // The has_nested_columns template parameter of the device_row_comparator is
 // necessary to help the compiler optimize our code. Without it, the list and
@@ -223,12 +223,12 @@ using optional_dremel_view = thrust::optional<detail::dremel_device_view const>;
 // std::optional<device_span<dremel_device_view>> in the
 // preprocessed_table/device_row_comparator (which is always valid when
 // has_nested_columns and is otherwise invalid) that is then unpacked to a
-// thrust::optional<dremel_device_view> at the element_comparator level (which
+// cuda::std::optional<dremel_device_view> at the element_comparator level (which
 // is always valid for a list column and otherwise invalid).  We cannot use an
 // additional template parameter for the element_comparator on a per-column
 // basis because we cannot conditionally define dremel_device_view member
 // variables without jumping through extra hoops with inheritance, so the
-// thrust::optional<dremel_device_view> member must be an optional rather than
+// cuda::std::optional<dremel_device_view> member must be an optional rather than
 // a raw dremel_device_view.
 /**
  * @brief Computes the lexicographic comparison between 2 rows.
diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp
index 3ac8547baad..25b0f68aaa8 100644
--- a/cpp/src/binaryop/binaryop.cpp
+++ b/cpp/src/binaryop/binaryop.cpp
@@ -41,7 +41,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
 
-#include <thrust/optional.h>
+#include <cuda/std/optional>
 
 #include <jit_preprocessed_files/binaryop/jit/kernel.cu.jit.hpp>
 
@@ -173,7 +173,7 @@ template <typename Lhs, typename Rhs>
 void fixed_point_binary_operation_validation(binary_operator op,
                                              Lhs lhs,
                                              Rhs rhs,
-                                             thrust::optional<cudf::data_type> output_type = {})
+                                             cuda::std::optional<cudf::data_type> output_type = {})
 {
   CUDF_EXPECTS((is_fixed_point(lhs) or is_fixed_point(rhs)),
                "One of the inputs must have fixed_point data_type.");
diff --git a/cpp/src/io/orc/orc.hpp b/cpp/src/io/orc/orc.hpp
index e1403acd455..790532c9d54 100644
--- a/cpp/src/io/orc/orc.hpp
+++ b/cpp/src/io/orc/orc.hpp
@@ -24,7 +24,7 @@
 #include <cudf/io/orc_types.hpp>
 #include <cudf/utilities/error.hpp>
 
-#include <thrust/optional.h>
+#include <cuda/std/optional>
 
 #include <algorithm>
 #include <cstddef>
@@ -692,11 +692,12 @@ class metadata {
  * @brief `column_device_view` and additional, ORC specific, information on the column.
  */
 struct orc_column_device_view : public column_device_view {
-  __device__ orc_column_device_view(column_device_view col, thrust::optional<uint32_t> parent_idx)
+  __device__ orc_column_device_view(column_device_view col,
+                                    cuda::std::optional<uint32_t> parent_idx)
     : column_device_view{col}, parent_index{parent_idx}
   {
   }
-  thrust::optional<uint32_t> parent_index;
+  cuda::std::optional<uint32_t> parent_index;
   bitmask_type const* pushdown_mask = nullptr;
 };
 
diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu
index f3b8cfbc836..04eee68e757 100644
--- a/cpp/src/io/orc/writer_impl.cu
+++ b/cpp/src/io/orc/writer_impl.cu
@@ -42,6 +42,7 @@
 #include <cooperative_groups/memcpy_async.h>
 #include <cuda/std/climits>
 #include <cuda/std/limits>
+#include <cuda/std/optional>
 #include <thrust/execution_policy.h>
 #include <thrust/extrema.h>
 #include <thrust/for_each.h>
@@ -50,7 +51,6 @@
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/reverse_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
-#include <thrust/optional.h>
 #include <thrust/pair.h>
 #include <thrust/reduce.h>
 #include <thrust/scan.h>
@@ -1831,7 +1831,7 @@ orc_table_view make_orc_table_view(table_view const& table,
     type_kinds, stream, rmm::mr::get_current_device_resource());
 
   rmm::device_uvector<orc_column_device_view> d_orc_columns(orc_columns.size(), stream);
-  using stack_value_type = thrust::pair<column_device_view const*, thrust::optional<uint32_t>>;
+  using stack_value_type = thrust::pair<column_device_view const*, cuda::std::optional<uint32_t>>;
   rmm::device_uvector<stack_value_type> stack_storage(orc_columns.size(), stream);
 
   // pre-order append ORC device columns
@@ -1847,7 +1847,7 @@ orc_table_view make_orc_table_view(table_view const& table,
                        thrust::make_reverse_iterator(d_table.end()),
                        thrust::make_reverse_iterator(d_table.begin()),
                        [&stack](column_device_view const& c) {
-                         stack.push({&c, thrust::nullopt});
+                         stack.push({&c, cuda::std::nullopt});
                        });
 
       uint32_t idx = 0;
diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp
index e13ed5e85e5..afcf6b373a9 100644
--- a/cpp/src/io/parquet/compact_protocol_reader.cpp
+++ b/cpp/src/io/parquet/compact_protocol_reader.cpp
@@ -304,10 +304,10 @@ class parquet_field_struct : public parquet_field {
 template <typename E, typename T>
 class parquet_field_union_struct : public parquet_field {
   E& enum_val;
-  thrust::optional<T>& val;  // union structs are always wrapped in std::optional
+  cuda::std::optional<T>& val;  // union structs are always wrapped in std::optional
 
  public:
-  parquet_field_union_struct(int f, E& ev, thrust::optional<T>& v)
+  parquet_field_union_struct(int f, E& ev, cuda::std::optional<T>& v)
     : parquet_field(f), enum_val(ev), val(v)
   {
   }
@@ -431,10 +431,10 @@ class parquet_field_struct_blob : public parquet_field {
  */
 template <typename T, typename FieldFunctor>
 class parquet_field_optional : public parquet_field {
-  thrust::optional<T>& val;
+  cuda::std::optional<T>& val;
 
  public:
-  parquet_field_optional(int f, thrust::optional<T>& v) : parquet_field(f), val(v) {}
+  parquet_field_optional(int f, cuda::std::optional<T>& v) : parquet_field(f), val(v) {}
 
   inline void operator()(CompactProtocolReader* cpr, int field_type)
   {
diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp
index 8ee4c175e09..5d10472b0ae 100644
--- a/cpp/src/io/parquet/parquet.hpp
+++ b/cpp/src/io/parquet/parquet.hpp
@@ -20,7 +20,7 @@
 
 #include <cudf/types.hpp>
 
-#include <thrust/optional.h>
+#include <cuda/std/optional>
 
 #include <cstdint>
 #include <optional>
@@ -94,10 +94,10 @@ struct LogicalType {
     BSON
   };
   Type type;
-  thrust::optional<DecimalType> decimal_type;
-  thrust::optional<TimeType> time_type;
-  thrust::optional<TimestampType> timestamp_type;
-  thrust::optional<IntType> int_type;
+  cuda::std::optional<DecimalType> decimal_type;
+  cuda::std::optional<TimeType> time_type;
+  cuda::std::optional<TimestampType> timestamp_type;
+  cuda::std::optional<IntType> int_type;
 
   LogicalType(Type tp = UNDEFINED) : type(tp) {}
   LogicalType(DecimalType&& dt) : type(DECIMAL), decimal_type(dt) {}
@@ -178,21 +178,21 @@ struct SchemaElement {
   // 5: nested fields
   int32_t num_children = 0;
   // 6: DEPRECATED: record the original type before conversion to parquet type
-  thrust::optional<ConvertedType> converted_type;
+  cuda::std::optional<ConvertedType> converted_type;
   // 7: DEPRECATED: record the scale for DECIMAL converted type
   int32_t decimal_scale = 0;
   // 8: DEPRECATED: record the precision for DECIMAL converted type
   int32_t decimal_precision = 0;
   // 9: save field_id from original schema
-  thrust::optional<int32_t> field_id;
+  cuda::std::optional<int32_t> field_id;
   // 10: replaces converted type
-  thrust::optional<LogicalType> logical_type;
+  cuda::std::optional<LogicalType> logical_type;
 
   // extra cudf specific fields
   bool output_as_byte_array = false;
 
   // cudf type determined from arrow:schema
-  thrust::optional<type_id> arrow_type;
+  cuda::std::optional<type_id> arrow_type;
 
   // The following fields are filled in later during schema initialization
   int max_definition_level = 0;
@@ -259,21 +259,21 @@ struct SchemaElement {
  */
 struct Statistics {
   // deprecated max value in signed comparison order
-  thrust::optional<std::vector<uint8_t>> max;
+  cuda::std::optional<std::vector<uint8_t>> max;
   // deprecated min value in signed comparison order
-  thrust::optional<std::vector<uint8_t>> min;
+  cuda::std::optional<std::vector<uint8_t>> min;
   // count of null values in the column
-  thrust::optional<int64_t> null_count;
+  cuda::std::optional<int64_t> null_count;
   // count of distinct values occurring
-  thrust::optional<int64_t> distinct_count;
+  cuda::std::optional<int64_t> distinct_count;
   // max value for column determined by ColumnOrder
-  thrust::optional<std::vector<uint8_t>> max_value;
+  cuda::std::optional<std::vector<uint8_t>> max_value;
   // min value for column determined by ColumnOrder
-  thrust::optional<std::vector<uint8_t>> min_value;
+  cuda::std::optional<std::vector<uint8_t>> min_value;
   // If true, max_value is the actual maximum value for a column
-  thrust::optional<bool> is_max_value_exact;
+  cuda::std::optional<bool> is_max_value_exact;
   // If true, min_value is the actual minimum value for a column
-  thrust::optional<bool> is_min_value_exact;
+  cuda::std::optional<bool> is_min_value_exact;
 };
 
 /**
@@ -282,7 +282,7 @@ struct Statistics {
 struct SizeStatistics {
   // Number of variable-width bytes stored for the page/chunk. Should not be set for anything
   // but the BYTE_ARRAY physical type.
-  thrust::optional<int64_t> unencoded_byte_array_data_bytes;
+  cuda::std::optional<int64_t> unencoded_byte_array_data_bytes;
   /**
    * When present, there is expected to be one element corresponding to each
    * repetition (i.e. size=max repetition_level+1) where each element
@@ -291,14 +291,14 @@ struct SizeStatistics {
    *
    * This value should not be written if max_repetition_level is 0.
    */
-  thrust::optional<std::vector<int64_t>> repetition_level_histogram;
+  cuda::std::optional<std::vector<int64_t>> repetition_level_histogram;
 
   /**
    * Same as repetition_level_histogram except for definition levels.
    *
    * This value should not be written if max_definition_level is 0 or 1.
    */
-  thrust::optional<std::vector<int64_t>> definition_level_histogram;
+  cuda::std::optional<std::vector<int64_t>> definition_level_histogram;
 };
 
 /**
@@ -319,7 +319,7 @@ struct OffsetIndex {
   std::vector<PageLocation> page_locations;
   // per-page size info. see description of the same field in SizeStatistics. only present for
   // columns with a BYTE_ARRAY physical type.
-  thrust::optional<std::vector<int64_t>> unencoded_byte_array_data_bytes;
+  cuda::std::optional<std::vector<int64_t>> unencoded_byte_array_data_bytes;
 };
 
 /**
@@ -331,10 +331,10 @@ struct ColumnIndex {
   std::vector<std::vector<uint8_t>> max_values;  // upper bound for values in each page
   BoundaryOrder boundary_order =
     BoundaryOrder::UNORDERED;  // Indicates if min and max values are ordered
-  thrust::optional<std::vector<int64_t>> null_counts;  // Optional count of null values per page
+  cuda::std::optional<std::vector<int64_t>> null_counts;  // Optional count of null values per page
   // Repetition/definition level histograms for the column chunk
-  thrust::optional<std::vector<int64_t>> repetition_level_histogram;
-  thrust::optional<std::vector<int64_t>> definition_level_histogram;
+  cuda::std::optional<std::vector<int64_t>> repetition_level_histogram;
+  cuda::std::optional<std::vector<int64_t>> definition_level_histogram;
 };
 
 /**
@@ -384,11 +384,11 @@ struct ColumnChunkMetaData {
   Statistics statistics;
   // Set of all encodings used for pages in this column chunk. This information can be used to
   // determine if all data pages are dictionary encoded for example.
-  thrust::optional<std::vector<PageEncodingStats>> encoding_stats;
+  cuda::std::optional<std::vector<PageEncodingStats>> encoding_stats;
   // Optional statistics to help estimate total memory when converted to in-memory representations.
   // The histograms contained in these statistics can also be useful in some cases for more
   // fine-grained nullability/list length filter pushdown.
-  thrust::optional<SizeStatistics> size_statistics;
+  cuda::std::optional<SizeStatistics> size_statistics;
 };
 
 /**
@@ -430,13 +430,13 @@ struct RowGroup {
   int64_t num_rows = 0;
   // If set, specifies a sort ordering of the rows in this RowGroup.
   // The sorting columns can be a subset of all the columns.
-  thrust::optional<std::vector<SortingColumn>> sorting_columns;
+  cuda::std::optional<std::vector<SortingColumn>> sorting_columns;
   // Byte offset from beginning of file to first page (data or dictionary) in this row group
-  thrust::optional<int64_t> file_offset;
+  cuda::std::optional<int64_t> file_offset;
   // Total byte size of all compressed (and potentially encrypted) column data in this row group
-  thrust::optional<int64_t> total_compressed_size;
+  cuda::std::optional<int64_t> total_compressed_size;
   // Row group ordinal in the file
-  thrust::optional<int16_t> ordinal;
+  cuda::std::optional<int16_t> ordinal;
 };
 
 /**
@@ -461,7 +461,7 @@ struct FileMetaData {
   std::vector<RowGroup> row_groups;
   std::vector<KeyValue> key_value_metadata;
   std::string created_by = "";
-  thrust::optional<std::vector<ColumnOrder>> column_orders;
+  cuda::std::optional<std::vector<ColumnOrder>> column_orders;
 };
 
 /**
diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp
index efc1f5ebab1..8f52f073833 100644
--- a/cpp/src/io/parquet/parquet_gpu.hpp
+++ b/cpp/src/io/parquet/parquet_gpu.hpp
@@ -394,7 +394,7 @@ struct ColumnChunkDesc {
                            uint8_t def_level_bits_,
                            uint8_t rep_level_bits_,
                            Compression codec_,
-                           thrust::optional<LogicalType> logical_type_,
+                           cuda::std::optional<LogicalType> logical_type_,
                            int32_t ts_clock_rate_,
                            int32_t src_col_index_,
                            int32_t src_col_schema_,
@@ -438,12 +438,12 @@ struct ColumnChunkDesc {
   int32_t num_data_pages{};                     // number of data pages
   int32_t num_dict_pages{};                     // number of dictionary pages
   PageInfo const* dict_page{};
-  string_index_pair* str_dict_index{};           // index for string dictionary
-  bitmask_type** valid_map_base{};               // base pointers of valid bit map for this column
-  void** column_data_base{};                     // base pointers of column data
-  void** column_string_base{};                   // base pointers of column string data
-  Compression codec{};                           // compressed codec enum
-  thrust::optional<LogicalType> logical_type{};  // logical type
+  string_index_pair* str_dict_index{};  // index for string dictionary
+  bitmask_type** valid_map_base{};      // base pointers of valid bit map for this column
+  void** column_data_base{};            // base pointers of column data
+  void** column_string_base{};          // base pointers of column string data
+  Compression codec{};                  // compressed codec enum
+  cuda::std::optional<LogicalType> logical_type{};  // logical type
   int32_t ts_clock_rate{};  // output timestamp clock frequency (0=default, 1000=ms, 1000000000=ns)
 
   int32_t src_col_index{};   // my input column index
diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp
index 481c1e9fcdd..5ca090b05b3 100644
--- a/cpp/src/io/parquet/predicate_pushdown.cpp
+++ b/cpp/src/io/parquet/predicate_pushdown.cpp
@@ -154,7 +154,7 @@ struct stats_caster {
         }
 
         void set_index(size_type index,
-                       thrust::optional<std::vector<uint8_t>> const& binary_value,
+                       cuda::std::optional<std::vector<uint8_t>> const& binary_value,
                        Type const type)
         {
           if (binary_value.has_value()) {
@@ -236,8 +236,8 @@ struct stats_caster {
             max.set_index(stats_idx, max_value, colchunk.meta_data.type);
           } else {
             // Marking it null, if column present in row group
-            min.set_index(stats_idx, thrust::nullopt, {});
-            max.set_index(stats_idx, thrust::nullopt, {});
+            min.set_index(stats_idx, cuda::std::nullopt, {});
+            max.set_index(stats_idx, cuda::std::nullopt, {});
           }
           stats_idx++;
         }
diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp
index 68ec61ead0a..2648a1f41ab 100644
--- a/cpp/src/io/parquet/reader_impl.cpp
+++ b/cpp/src/io/parquet/reader_impl.cpp
@@ -39,7 +39,7 @@ namespace {
 // be treated as a string. Currently the only logical type that has special handling is DECIMAL.
 // Other valid types in the future would be UUID (still treated as string) and FLOAT16 (which
 // for now would also be treated as a string).
-inline bool is_treat_fixed_length_as_string(thrust::optional<LogicalType> const& logical_type)
+inline bool is_treat_fixed_length_as_string(cuda::std::optional<LogicalType> const& logical_type)
 {
   if (!logical_type.has_value()) { return true; }
   return logical_type->type != LogicalType::DECIMAL;
diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu
index 794750ab6d2..54ba898b058 100644
--- a/cpp/src/io/parquet/reader_impl_chunking.cu
+++ b/cpp/src/io/parquet/reader_impl_chunking.cu
@@ -370,11 +370,11 @@ int64_t find_next_split(int64_t cur_pos,
  *
  * @return A tuple of Parquet clock rate and Parquet decimal type.
  */
-[[nodiscard]] std::tuple<int32_t, thrust::optional<LogicalType>> conversion_info(
+[[nodiscard]] std::tuple<int32_t, cuda::std::optional<LogicalType>> conversion_info(
   type_id column_type_id,
   type_id timestamp_type_id,
   Type physical,
-  thrust::optional<LogicalType> logical_type)
+  cuda::std::optional<LogicalType> logical_type)
 {
   int32_t const clock_rate =
     is_chrono(data_type{column_type_id}) ? to_clockrate(timestamp_type_id) : 0;
@@ -385,7 +385,7 @@ int64_t find_next_split(int64_t cur_pos,
     // if decimal but not outputting as float or decimal, then convert to no logical type
     if (column_type_id != type_id::FLOAT64 and
         not cudf::is_fixed_point(data_type{column_type_id})) {
-      return std::make_tuple(clock_rate, thrust::nullopt);
+      return std::make_tuple(clock_rate, cuda::std::nullopt);
     }
   }
 
diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp
index 581c44d024b..00f75e4e828 100644
--- a/cpp/src/io/parquet/reader_impl_helpers.cpp
+++ b/cpp/src/io/parquet/reader_impl_helpers.cpp
@@ -38,7 +38,7 @@ namespace flatbuf = cudf::io::parquet::flatbuf;
 
 namespace {
 
-thrust::optional<LogicalType> converted_to_logical_type(SchemaElement const& schema)
+cuda::std::optional<LogicalType> converted_to_logical_type(SchemaElement const& schema)
 {
   if (schema.converted_type.has_value()) {
     switch (schema.converted_type.value()) {
@@ -66,7 +66,7 @@ thrust::optional<LogicalType> converted_to_logical_type(SchemaElement const& sch
       default: return LogicalType{LogicalType::UNDEFINED};
     }
   }
-  return thrust::nullopt;
+  return cuda::std::nullopt;
 }
 
 }  // namespace
@@ -246,7 +246,7 @@ void metadata::sanitize_schema()
         struct_elem.repetition_type = REQUIRED;
         struct_elem.num_children    = schema_elem.num_children;
         struct_elem.type            = UNDEFINED_TYPE;
-        struct_elem.converted_type  = thrust::nullopt;
+        struct_elem.converted_type  = cuda::std::nullopt;
 
         // swap children
         struct_elem.children_idx = std::move(schema_elem.children_idx);
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index 36a1d8377bf..c2c5dbb4a56 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -185,7 +185,7 @@ struct aggregate_writer_metadata {
     std::vector<std::vector<uint8_t>> column_indexes;
   };
   std::vector<per_file_metadata> files;
-  thrust::optional<std::vector<ColumnOrder>> column_orders = thrust::nullopt;
+  cuda::std::optional<std::vector<ColumnOrder>> column_orders = cuda::std::nullopt;
 };
 
 namespace {
@@ -471,7 +471,7 @@ struct leaf_schema_fn {
   std::enable_if_t<std::is_same_v<T, cudf::timestamp_ns>, void> operator()()
   {
     col_schema.type           = (timestamp_is_int96) ? Type::INT96 : Type::INT64;
-    col_schema.converted_type = thrust::nullopt;
+    col_schema.converted_type = cuda::std::nullopt;
     col_schema.stats_dtype    = statistics_dtype::dtype_timestamp64;
     if (timestamp_is_int96) {
       col_schema.ts_scale = -1000;  // negative value indicates division by absolute value
@@ -749,7 +749,7 @@ std::vector<schema_tree_node> construct_parquet_schema_tree(
           col_schema.type = Type::BYTE_ARRAY;
         }
 
-        col_schema.converted_type  = thrust::nullopt;
+        col_schema.converted_type  = cuda::std::nullopt;
         col_schema.stats_dtype     = statistics_dtype::dtype_byte_array;
         col_schema.repetition_type = col_nullable ? OPTIONAL : REQUIRED;
         col_schema.name = (schema[parent_idx].name == "list") ? "element" : col_meta.get_name();
@@ -2776,7 +2776,7 @@ std::unique_ptr<std::vector<uint8_t>> writer::merge_row_group_metadata(
   // See https://github.com/rapidsai/cudf/pull/14264#issuecomment-1778311615
   for (auto& se : md.schema) {
     if (se.logical_type.has_value() && se.logical_type.value().type == LogicalType::UNKNOWN) {
-      se.logical_type = thrust::nullopt;
+      se.logical_type = cuda::std::nullopt;
     }
   }
 
diff --git a/cpp/src/json/json_path.cu b/cpp/src/json/json_path.cu
index d1a1097de35..1bf4bf3b153 100644
--- a/cpp/src/json/json_path.cu
+++ b/cpp/src/json/json_path.cu
@@ -39,7 +39,7 @@
 #include <rmm/exec_policy.hpp>
 #include <rmm/resource_ref.hpp>
 
-#include <thrust/optional.h>
+#include <cuda/std/optional>
 #include <thrust/pair.h>
 #include <thrust/scan.h>
 #include <thrust/tuple.h>
@@ -207,7 +207,7 @@ class parser {
 struct json_output {
   size_t output_max_len;
   char* output;
-  thrust::optional<size_t> output_len;
+  cuda::std::optional<size_t> output_len;
 
   __device__ void add_output(char const* str, size_t len)
   {
@@ -656,7 +656,7 @@ class path_state : private parser {
  * @param stream Cuda stream to perform any gpu actions on
  * @returns A pair containing the command buffer, and maximum stack depth required.
  */
-std::pair<thrust::optional<rmm::device_uvector<path_operator>>, int> build_command_buffer(
+std::pair<cuda::std::optional<rmm::device_uvector<path_operator>>, int> build_command_buffer(
   cudf::string_scalar const& json_path, rmm::cuda_stream_view stream)
 {
   std::string h_json_path = json_path.to_string(stream);
@@ -690,8 +690,8 @@ std::pair<thrust::optional<rmm::device_uvector<path_operator>>, int> build_comma
   } while (op.type != path_operator_type::END);
 
   auto const is_empty = h_operators.size() == 1 && h_operators[0].type == path_operator_type::END;
-  return is_empty ? std::pair(thrust::nullopt, 0)
-                  : std::pair(thrust::make_optional(cudf::detail::make_device_uvector_sync(
+  return is_empty ? std::pair(cuda::std::nullopt, 0)
+                  : std::pair(cuda::std::make_optional(cudf::detail::make_device_uvector_sync(
                                 h_operators, stream, rmm::mr::get_current_device_resource())),
                               max_stack_depth);
 }
@@ -920,9 +920,9 @@ __launch_bounds__(block_size) CUDF_KERNEL
                               path_operator const* const commands,
                               size_type* d_sizes,
                               cudf::detail::input_offsetalator output_offsets,
-                              thrust::optional<char*> out_buf,
-                              thrust::optional<bitmask_type*> out_validity,
-                              thrust::optional<size_type*> out_valid_count,
+                              cuda::std::optional<char*> out_buf,
+                              cuda::std::optional<bitmask_type*> out_validity,
+                              cuda::std::optional<size_type*> out_valid_count,
                               get_json_object_options options)
 {
   auto tid          = cudf::detail::grid_1d::global_thread_id();
@@ -1012,9 +1012,9 @@ std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& c
       std::get<0>(preprocess).value().data(),
       sizes.data(),
       d_offsets,
-      thrust::nullopt,
-      thrust::nullopt,
-      thrust::nullopt,
+      cuda::std::nullopt,
+      cuda::std::nullopt,
+      cuda::std::nullopt,
       options);
 
   // convert sizes to offsets
diff --git a/cpp/src/lists/contains.cu b/cpp/src/lists/contains.cu
index 30c03a8cd68..11703527d26 100644
--- a/cpp/src/lists/contains.cu
+++ b/cpp/src/lists/contains.cu
@@ -40,7 +40,6 @@
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/reverse_iterator.h>
 #include <thrust/logical.h>
-#include <thrust/optional.h>
 #include <thrust/pair.h>
 #include <thrust/tabulate.h>
 #include <thrust/transform.h>
diff --git a/cpp/src/lists/explode.cu b/cpp/src/lists/explode.cu
index 46c4fc78a6f..74a0d842aad 100644
--- a/cpp/src/lists/explode.cu
+++ b/cpp/src/lists/explode.cu
@@ -29,6 +29,7 @@
 #include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
+#include <cuda/std/optional>
 #include <thrust/advance.h>
 #include <thrust/binary_search.h>
 #include <thrust/distance.h>
@@ -36,7 +37,6 @@
 #include <thrust/for_each.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
-#include <thrust/optional.h>
 #include <thrust/scan.h>
 #include <thrust/transform.h>
 
@@ -57,8 +57,8 @@ std::unique_ptr<table> build_table(
   size_type const explode_column_idx,
   column_view const& sliced_child,
   cudf::device_span<size_type const> gather_map,
-  thrust::optional<cudf::device_span<size_type const>> explode_col_gather_map,
-  thrust::optional<rmm::device_uvector<size_type>> position_array,
+  cuda::std::optional<cudf::device_span<size_type const>> explode_col_gather_map,
+  cuda::std::optional<rmm::device_uvector<size_type>> position_array,
   rmm::cuda_stream_view stream,
   rmm::device_async_resource_ref mr)
 {
@@ -143,8 +143,8 @@ std::unique_ptr<table> explode(table_view const& input_table,
                      explode_column_idx,
                      sliced_child,
                      gather_map,
-                     thrust::nullopt,
-                     thrust::nullopt,
+                     cuda::std::nullopt,
+                     cuda::std::nullopt,
                      stream,
                      mr);
 }
@@ -193,7 +193,7 @@ std::unique_ptr<table> explode_position(table_view const& input_table,
                      explode_column_idx,
                      sliced_child,
                      gather_map,
-                     thrust::nullopt,
+                     cuda::std::nullopt,
                      std::move(pos),
                      stream,
                      mr);
@@ -292,7 +292,7 @@ std::unique_ptr<table> explode_outer(table_view const& input_table,
     sliced_child,
     gather_map,
     explode_col_gather_map,
-    include_position ? std::move(pos) : thrust::optional<rmm::device_uvector<size_type>>{},
+    include_position ? std::move(pos) : cuda::std::optional<rmm::device_uvector<size_type>>{},
     stream,
     mr);
 }
diff --git a/cpp/src/strings/convert/convert_datetime.cu b/cpp/src/strings/convert/convert_datetime.cu
index 64a2107e17a..99c40f00b00 100644
--- a/cpp/src/strings/convert/convert_datetime.cu
+++ b/cpp/src/strings/convert/convert_datetime.cu
@@ -36,11 +36,11 @@
 #include <rmm/device_uvector.hpp>
 #include <rmm/resource_ref.hpp>
 
+#include <cuda/std/optional>
 #include <thrust/execution_policy.h>
 #include <thrust/functional.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/logical.h>
-#include <thrust/optional.h>
 #include <thrust/pair.h>
 #include <thrust/transform.h>
 
@@ -519,7 +519,7 @@ struct check_datetime_format {
    * The checking here is a little more strict than the actual
    * parser used for conversion.
    */
-  __device__ thrust::optional<timestamp_components> check_string(string_view const& d_string)
+  __device__ cuda::std::optional<timestamp_components> check_string(string_view const& d_string)
   {
     timestamp_components dateparts = {1970, 1, 1, 0};  // init to epoch time
 
@@ -529,7 +529,7 @@ struct check_datetime_format {
       // eliminate static character values first
       if (item.item_type == format_char_type::literal) {
         // check static character matches
-        if (*ptr != item.value) return thrust::nullopt;
+        if (*ptr != item.value) return cuda::std::nullopt;
         ptr += item.length;
         length -= item.length;
         continue;
@@ -645,7 +645,7 @@ struct check_datetime_format {
         case 'Z': result = true;  // skip
         default: break;
       }
-      if (!result) return thrust::nullopt;
+      if (!result) return cuda::std::nullopt;
       ptr += bytes_read;
       length -= bytes_read;
     }
@@ -821,7 +821,7 @@ struct datetime_formatter_fn {
     // We only dissect the timestamp into components if needed
     // by a specifier. And then we only do it once and reuse it.
     // This can improve performance when not using uncommon specifiers.
-    thrust::optional<cuda::std::chrono::sys_days> days;
+    cuda::std::optional<cuda::std::chrono::sys_days> days;
 
     auto days_from_timestamp = [tstamp]() {
       auto const count = tstamp.time_since_epoch().count();
diff --git a/cpp/src/strings/regex/regex.cuh b/cpp/src/strings/regex/regex.cuh
index e6134296e45..2df404048f7 100644
--- a/cpp/src/strings/regex/regex.cuh
+++ b/cpp/src/strings/regex/regex.cuh
@@ -23,8 +23,8 @@
 
 #include <rmm/cuda_stream_view.hpp>
 
+#include <cuda/std/optional>
 #include <cuda_runtime.h>
-#include <thrust/optional.h>
 #include <thrust/pair.h>
 
 #include <memory>
@@ -36,7 +36,7 @@ namespace detail {
 struct relist;
 
 using match_pair   = thrust::pair<cudf::size_type, cudf::size_type>;
-using match_result = thrust::optional<match_pair>;
+using match_result = cuda::std::optional<match_pair>;
 
 constexpr int32_t MAX_SHARED_MEM      = 2048;  ///< Memory size for storing prog instruction data
 constexpr std::size_t MAX_WORKING_MEM = 0x01'FFFF'FFFF;  ///< Memory size for state data
diff --git a/cpp/src/strings/regex/regex.inl b/cpp/src/strings/regex/regex.inl
index 23e1944cda4..3b899e4edc1 100644
--- a/cpp/src/strings/regex/regex.inl
+++ b/cpp/src/strings/regex/regex.inl
@@ -260,12 +260,12 @@ __device__ __forceinline__ match_result reprog_device::regexec(string_view const
       switch (jnk.starttype) {
         case BOL:
           if (pos == 0) break;
-          if (jnk.startchar != '^') { return thrust::nullopt; }
+          if (jnk.startchar != '^') { return cuda::std::nullopt; }
           --itr;
           startchar = static_cast<char_utf8>('\n');
         case CHAR: {
           auto const find_itr = find_char(startchar, dstr, itr);
-          if (find_itr.byte_offset() >= dstr.size_bytes()) { return thrust::nullopt; }
+          if (find_itr.byte_offset() >= dstr.size_bytes()) { return cuda::std::nullopt; }
           itr = find_itr + (jnk.starttype == BOL);
           pos = itr.position();
           break;
@@ -396,7 +396,7 @@ __device__ __forceinline__ match_result reprog_device::regexec(string_view const
     checkstart = jnk.list1->get_size() == 0;
   } while (!last_character && (!checkstart || !match));
 
-  return match ? match_result({begin, end}) : thrust::nullopt;
+  return match ? match_result({begin, end}) : cuda::std::nullopt;
 }
 
 __device__ __forceinline__ match_result reprog_device::find(int32_t const thread_idx,
diff --git a/cpp/src/strings/replace/multi_re.cu b/cpp/src/strings/replace/multi_re.cu
index 31234ea42ec..0ad3ab2305c 100644
--- a/cpp/src/strings/replace/multi_re.cu
+++ b/cpp/src/strings/replace/multi_re.cu
@@ -92,7 +92,7 @@ struct replace_multi_regex_fn {
         }
         reprog_device prog = progs[ptn_idx];
 
-        auto const result = !prog.is_empty() ? prog.find(idx, d_str, itr) : thrust::nullopt;
+        auto const result = !prog.is_empty() ? prog.find(idx, d_str, itr) : cuda::std::nullopt;
         d_ranges[ptn_idx] =
           result ? found_range{result->first, result->second} : found_range{nchars, nchars};
       }
diff --git a/cpp/src/transform/row_bit_count.cu b/cpp/src/transform/row_bit_count.cu
index 4530fabf889..6a965d10184 100644
--- a/cpp/src/transform/row_bit_count.cu
+++ b/cpp/src/transform/row_bit_count.cu
@@ -35,8 +35,8 @@
 #include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
+#include <cuda/std/optional>
 #include <thrust/iterator/counting_iterator.h>
-#include <thrust/optional.h>
 #include <thrust/tabulate.h>
 
 namespace cudf {
@@ -159,9 +159,9 @@ void flatten_hierarchy(ColIter begin,
                        std::vector<column_info>& info,
                        hierarchy_info& h_info,
                        rmm::cuda_stream_view stream,
-                       size_type cur_depth                = 0,
-                       size_type cur_branch_depth         = 0,
-                       thrust::optional<int> parent_index = {});
+                       size_type cur_depth                   = 0,
+                       size_type cur_branch_depth            = 0,
+                       cuda::std::optional<int> parent_index = {});
 
 /**
  * @brief Type-dispatched functor called by flatten_hierarchy.
@@ -177,7 +177,7 @@ struct flatten_functor {
                   rmm::cuda_stream_view,
                   size_type cur_depth,
                   size_type cur_branch_depth,
-                  thrust::optional<int>)
+                  cuda::std::optional<int>)
   {
     out.push_back(col);
     info.push_back({cur_depth, cur_branch_depth, cur_branch_depth});
@@ -194,7 +194,7 @@ struct flatten_functor {
                   rmm::cuda_stream_view,
                   size_type cur_depth,
                   size_type cur_branch_depth,
-                  thrust::optional<int>)
+                  cuda::std::optional<int>)
   {
     out.push_back(col);
     info.push_back({cur_depth, cur_branch_depth, cur_branch_depth});
@@ -210,7 +210,7 @@ struct flatten_functor {
                   rmm::cuda_stream_view stream,
                   size_type cur_depth,
                   size_type cur_branch_depth,
-                  thrust::optional<int> parent_index)
+                  cuda::std::optional<int> parent_index)
   {
     // track branch depth as we reach this list and after we pass it
     auto const branch_depth_start = cur_branch_depth;
@@ -243,7 +243,7 @@ struct flatten_functor {
                   rmm::cuda_stream_view stream,
                   size_type cur_depth,
                   size_type cur_branch_depth,
-                  thrust::optional<int>)
+                  cuda::std::optional<int>)
   {
     out.push_back(col);
     info.push_back({cur_depth, cur_branch_depth, cur_branch_depth});
@@ -284,7 +284,7 @@ void flatten_hierarchy(ColIter begin,
                        rmm::cuda_stream_view stream,
                        size_type cur_depth,
                        size_type cur_branch_depth,
-                       thrust::optional<int> parent_index)
+                       cuda::std::optional<int> parent_index)
 {
   std::for_each(begin, end, [&](column_view const& col) {
     cudf::type_dispatcher(col.type(),
diff --git a/cpp/tests/io/parquet_common.cpp b/cpp/tests/io/parquet_common.cpp
index c1211869bcc..3dd5ad145ea 100644
--- a/cpp/tests/io/parquet_common.cpp
+++ b/cpp/tests/io/parquet_common.cpp
@@ -744,7 +744,7 @@ int32_t compare(T& v1, T& v2)
 int32_t compare_binary(std::vector<uint8_t> const& v1,
                        std::vector<uint8_t> const& v2,
                        cudf::io::parquet::detail::Type ptype,
-                       thrust::optional<cudf::io::parquet::detail::ConvertedType> const& ctype)
+                       cuda::std::optional<cudf::io::parquet::detail::ConvertedType> const& ctype)
 {
   auto ctype_val = ctype.value_or(cudf::io::parquet::detail::UNKNOWN);
   switch (ptype) {
diff --git a/cpp/tests/io/parquet_common.hpp b/cpp/tests/io/parquet_common.hpp
index 59ee85444f2..bc6145d77da 100644
--- a/cpp/tests/io/parquet_common.hpp
+++ b/cpp/tests/io/parquet_common.hpp
@@ -172,7 +172,7 @@ std::pair<cudf::table, std::string> create_parquet_typed_with_stats(std::string
 int32_t compare_binary(std::vector<uint8_t> const& v1,
                        std::vector<uint8_t> const& v2,
                        cudf::io::parquet::detail::Type ptype,
-                       thrust::optional<cudf::io::parquet::detail::ConvertedType> const& ctype);
+                       cuda::std::optional<cudf::io::parquet::detail::ConvertedType> const& ctype);
 
 void expect_compression_stats_empty(std::shared_ptr<cudf::io::writer_compression_statistics> stats);
 
diff --git a/cpp/tests/iterator/indexalator_test.cu b/cpp/tests/iterator/indexalator_test.cu
index 0c10853ec02..dac2356dcb0 100644
--- a/cpp/tests/iterator/indexalator_test.cu
+++ b/cpp/tests/iterator/indexalator_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,10 +20,10 @@
 
 #include <cudf/detail/indexalator.cuh>
 
+#include <cuda/std/optional>
 #include <thrust/binary_search.h>
 #include <thrust/gather.h>
 #include <thrust/host_vector.h>
-#include <thrust/optional.h>
 #include <thrust/pair.h>
 #include <thrust/scatter.h>
 #include <thrust/sequence.h>
@@ -84,15 +84,16 @@ TYPED_TEST(IndexalatorTest, optional_iterator)
   auto d_col = cudf::test::fixed_width_column_wrapper<T>(
     host_values.begin(), host_values.end(), validity.begin());
 
-  auto expected_values = thrust::host_vector<thrust::optional<cudf::size_type>>(host_values.size());
+  auto expected_values =
+    thrust::host_vector<cuda::std::optional<cudf::size_type>>(host_values.size());
 
   std::transform(host_values.begin(),
                  host_values.end(),
                  validity.begin(),
                  expected_values.begin(),
                  [](T v, bool b) {
-                   return (b) ? thrust::make_optional(static_cast<cudf::size_type>(v))
-                              : thrust::nullopt;
+                   return (b) ? cuda::std::make_optional(static_cast<cudf::size_type>(v))
+                              : cuda::std::nullopt;
                  });
 
   auto it_dev = cudf::detail::indexalator_factory::make_input_optional_iterator(d_col);
diff --git a/cpp/tests/iterator/offsetalator_test.cu b/cpp/tests/iterator/offsetalator_test.cu
index e569e58f42a..b206ff947bb 100644
--- a/cpp/tests/iterator/offsetalator_test.cu
+++ b/cpp/tests/iterator/offsetalator_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -23,7 +23,6 @@
 #include <thrust/binary_search.h>
 #include <thrust/gather.h>
 #include <thrust/host_vector.h>
-#include <thrust/optional.h>
 #include <thrust/pair.h>
 #include <thrust/scatter.h>
 #include <thrust/sequence.h>
diff --git a/cpp/tests/iterator/optional_iterator_test.cuh b/cpp/tests/iterator/optional_iterator_test.cuh
index 6a264cee9a8..04f5410a44f 100644
--- a/cpp/tests/iterator/optional_iterator_test.cuh
+++ b/cpp/tests/iterator/optional_iterator_test.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,8 +16,8 @@
 
 #include <tests/iterator/iterator_tests.cuh>
 
+#include <cuda/std/optional>
 #include <thrust/host_vector.h>
-#include <thrust/optional.h>
 
 template <typename T>
 void nonull_optional_iterator(IteratorTest<T>& testFixture)
@@ -32,9 +32,9 @@ void nonull_optional_iterator(IteratorTest<T>& testFixture)
   auto d_col = cudf::column_device_view::create(w_col);
 
   // calculate the expected value by CPU.
-  thrust::host_vector<thrust::optional<T>> replaced_array(host_values.size());
+  thrust::host_vector<cuda::std::optional<T>> replaced_array(host_values.size());
   std::transform(host_values.begin(), host_values.end(), replaced_array.begin(), [](auto s) {
-    return thrust::optional<T>{s};
+    return cuda::std::optional<T>{s};
   });
 
   // GPU test
@@ -61,19 +61,20 @@ void null_optional_iterator(IteratorTest<T>& testFixture)
   auto d_col = cudf::column_device_view::create(w_col);
 
   // calculate the expected value by CPU.
-  thrust::host_vector<thrust::optional<T>> optional_values(host_values.size());
-  std::transform(host_values.begin(),
-                 host_values.end(),
-                 host_bools.begin(),
-                 optional_values.begin(),
-                 [](auto s, bool b) { return b ? thrust::optional<T>{s} : thrust::optional<T>{}; });
+  thrust::host_vector<cuda::std::optional<T>> optional_values(host_values.size());
+  std::transform(
+    host_values.begin(),
+    host_values.end(),
+    host_bools.begin(),
+    optional_values.begin(),
+    [](auto s, bool b) { return b ? cuda::std::optional<T>{s} : cuda::std::optional<T>{}; });
 
-  thrust::host_vector<thrust::optional<T>> value_all_valid(host_values.size());
+  thrust::host_vector<cuda::std::optional<T>> value_all_valid(host_values.size());
   std::transform(host_values.begin(),
                  host_values.end(),
                  host_bools.begin(),
                  value_all_valid.begin(),
-                 [](auto s, bool b) { return thrust::optional<T>{s}; });
+                 [](auto s, bool b) { return cuda::std::optional<T>{s}; });
 
   // GPU test for correct null mapping
   testFixture.iterator_test_thrust(
diff --git a/cpp/tests/iterator/optional_iterator_test_numeric.cu b/cpp/tests/iterator/optional_iterator_test_numeric.cu
index 98befb0a3ee..257c0979017 100644
--- a/cpp/tests/iterator/optional_iterator_test_numeric.cu
+++ b/cpp/tests/iterator/optional_iterator_test_numeric.cu
@@ -18,9 +18,9 @@
 
 #include <cudf/utilities/default_stream.hpp>
 
+#include <cuda/std/optional>
 #include <thrust/execution_policy.h>
 #include <thrust/iterator/transform_iterator.h>
-#include <thrust/optional.h>
 #include <thrust/reduce.h>
 #include <thrust/transform.h>
 
@@ -49,21 +49,21 @@ TYPED_TEST(NumericOptionalIteratorTest, null_optional_iterator) { null_optional_
 // Transformers and Operators for optional_iterator test
 template <typename ElementType>
 struct transformer_optional_meanvar {
-  using ResultType = thrust::optional<cudf::meanvar<ElementType>>;
+  using ResultType = cuda::std::optional<cudf::meanvar<ElementType>>;
 
-  CUDF_HOST_DEVICE inline ResultType operator()(thrust::optional<ElementType> const& optional)
+  CUDF_HOST_DEVICE inline ResultType operator()(cuda::std::optional<ElementType> const& optional)
   {
     if (optional.has_value()) {
       auto v = *optional;
       return cudf::meanvar<ElementType>{v, static_cast<ElementType>(v * v), 1};
     }
-    return thrust::nullopt;
+    return cuda::std::nullopt;
   }
 };
 
 template <typename T>
 struct optional_to_meanvar {
-  CUDF_HOST_DEVICE inline T operator()(thrust::optional<T> const& v) { return v.value_or(T{0}); }
+  CUDF_HOST_DEVICE inline T operator()(cuda::std::optional<T> const& v) { return v.value_or(T{0}); }
 };
 
 // TODO: enable this test also at __CUDACC_DEBUG__

From 555734dee7a8fb10f50c8609a8e4fb2c025e6305 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Tue, 20 Aug 2024 09:32:59 -0500
Subject: [PATCH 699/842] Remove thrust::optional from expression evaluator
 (#16604)

This PR follows up on a request from @davidwendt in https://github.com/rapidsai/cudf/pull/15091#discussion_r1722183142.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/16604
---
 cpp/include/cudf/ast/detail/expression_evaluator.cuh | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/cpp/include/cudf/ast/detail/expression_evaluator.cuh b/cpp/include/cudf/ast/detail/expression_evaluator.cuh
index 105d87ff96f..9d8762555d7 100644
--- a/cpp/include/cudf/ast/detail/expression_evaluator.cuh
+++ b/cpp/include/cudf/ast/detail/expression_evaluator.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -29,8 +29,6 @@
 
 #include <rmm/cuda_stream_view.hpp>
 
-#include <thrust/optional.h>
-
 namespace cudf {
 
 namespace ast {
@@ -278,7 +276,7 @@ struct expression_evaluator {
     detail::device_data_reference const& input_reference,
     IntermediateDataType<has_nulls>* thread_intermediate_storage,
     cudf::size_type left_row_index,
-    thrust::optional<cudf::size_type> right_row_index = {}) const
+    cudf::size_type right_row_index = {}) const
   {
     // TODO: Everywhere in the code assumes that the table reference is either
     // left or right. Should we error-check somewhere to prevent
@@ -291,7 +289,7 @@ struct expression_evaluator {
       // any case where input_reference.table_source == table_reference::RIGHT.
       // Otherwise, behavior is undefined.
       auto const row_index =
-        (input_reference.table_source == table_reference::LEFT) ? left_row_index : *right_row_index;
+        (input_reference.table_source == table_reference::LEFT) ? left_row_index : right_row_index;
       if constexpr (has_nulls) {
         return table.column(input_reference.data_index).is_valid(row_index)
                  ? ReturnType(table.column(input_reference.data_index).element<Element>(row_index))
@@ -329,7 +327,7 @@ struct expression_evaluator {
     detail::device_data_reference const& device_data_reference,
     IntermediateDataType<has_nulls>* thread_intermediate_storage,
     cudf::size_type left_row_index,
-    thrust::optional<cudf::size_type> right_row_index = {}) const
+    cudf::size_type right_row_index = {}) const
   {
     CUDF_UNREACHABLE("Unsupported type in resolve_input.");
   }

From b32bc10ee9795ba94df9a79d6fa5bfd2a53455d6 Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Tue, 20 Aug 2024 12:43:51 -0500
Subject: [PATCH 700/842] do not install cudf in cudf_polars wheel tests
 (#16612)

Removes unnecessary installation of `cudf` wheels in wheel testing for `cudf_polars`.

`cudf_polars` doesn't depend on `cudf`, and neither do its tests. However, right now it's downloading `cudf` during it's wheel tests. I mistakenly introduced that in #16575.

This introduced a race condition that could lead to CI failures whenever the `cudf` wheels aren't published yet by the time the `cudf_polars` tests. Because the `cudf_polars` wheel tests (rightly) do not wait for `cudf` wheels to be available:

https://github.com/rapidsai/cudf/blob/555734dee7a8fb10f50c8609a8e4fb2c025e6305/.github/workflows/pr.yaml#L154-L155

https://github.com/rapidsai/cudf/blob/555734dee7a8fb10f50c8609a8e4fb2c025e6305/.github/workflows/pr.yaml#L145-L146

Noticed this in #16611

```text
[rapids-download-from-s3] Downloading and decompressing s3://rapids-downloads/ci/cudf/pull-request/16611/a6b7eff/cudf_wheel_python_cudf_cu12_py310_x86_64.tar.gz into ./dist
download failed: s3://rapids-downloads/ci/cudf/pull-request/16611/a6b7eff/cudf_wheel_python_cudf_cu12_py310_x86_64.tar.gz to - An error occurred (404) when calling the HeadObject operation: Not Found
```

([build link](https://github.com/rapidsai/cudf/actions/runs/10472939821/job/29004728278?pr=16611))

Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16612
---
 ci/test_wheel_cudf_polars.sh | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/ci/test_wheel_cudf_polars.sh b/ci/test_wheel_cudf_polars.sh
index 6438d13c4b7..e9c6188502c 100755
--- a/ci/test_wheel_cudf_polars.sh
+++ b/ci/test_wheel_cudf_polars.sh
@@ -20,15 +20,13 @@ fi
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
 RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 ./dist
 
-# Download the cudf and pylibcudf built in the previous step
-RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist
+# Download pylibcudf built in the previous step
 RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist
 
 rapids-logger "Installing cudf_polars and its dependencies"
 
 # echo to expand wildcard before adding `[extra]` requires for pip
 python -m pip install \
-    "$(echo ./dist/cudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \
     "$(echo ./dist/cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" \
     "$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)"
 

From e450baf1d748a4a361797ee18a1372095212b816 Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Tue, 20 Aug 2024 13:59:58 -0500
Subject: [PATCH 701/842] remove streamz git dependency, standardize build
 dependency names, consolidate some dependency lists (#16611)

Proposes some additional cleanup in `dependencies.yaml`, for things I noticed while working through #15483.

* standardizes the naming of keys in the `files:` section for build dependencies
  - *`py_build_{project}` = dependencies for the `[build-system]` table*
  - *`py_rapids_build_{project}` = dependencies for the `[tool.rapids-build-backend]` table*
  - *this is how it was done over most of the other repos in https://github.com/rapidsai/build-planning/issues/31, it was just missed because `cudf` was one of the first repos to add `rapids-build-backend`*
* removes the dependency on building `streamz` from latest source on GitHub
  - *`custreamz` conda packages and wheels depend on packages for those, not this git dependency*
    - https://github.com/rapidsai/cudf/blob/2f7d35435db2b5ed9ead96cf43e2a710db5e5e6d/dependencies.yaml#L752-L754
    - https://github.com/rapidsai/cudf/blob/2f7d35435db2b5ed9ead96cf43e2a710db5e5e6d/conda/recipes/custreamz/meta.yaml#L45-L47
  - *if this is really needed, I don't think it belongs in the `build_python_cudf` set*
  - *the last commit to `streamz` was 2 years ago (https://github.com/python-streamz/streamz), this doesn't seem like a `rapids-dask-dependency`, try-to-always-test-against-latest, situation to me*
  - *I'm guessing this is left over from a time before `streamz` was regularly publishing wheels... it's been in `dependencies.yaml` since that file was first introduced here in November 2022 (#11674)*
  - *the last release, v0.6.4, was made on July 27, 2022. There have been around 20 commits to `master` since then ([history link](https://github.com/python-streamz/streamz/commits/master/)) ... but if `custreamz` really needed those, I'd expect `custreamz` to depend on the version built from GitHub sources. I strongly suspect that that isn't the case.*
* removes `build_python_cudf` and `build_python_libcudf` lists in `dependencies.yaml`, in favor of re-using the `depends_on_rmm` and `depends_on_pylibcudf` lists

Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16611
---
 .../all_cuda-118_arch-x86_64.yaml             |   3 -
 .../all_cuda-125_arch-x86_64.yaml             |   3 -
 dependencies.yaml                             | 137 ++++++------------
 3 files changed, 42 insertions(+), 101 deletions(-)

diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index d0d18e57abc..018162bd848 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -66,7 +66,6 @@ dependencies:
 - pandas
 - pandas>=2.0,<2.2.3dev0
 - pandoc
-- pip
 - pre-commit
 - ptxcompiler
 - pyarrow==16.1.0.*
@@ -99,6 +98,4 @@ dependencies:
 - transformers==4.39.3
 - typing_extensions>=4.0.0
 - zlib>=1.2.13
-- pip:
-  - git+https://github.com/python-streamz/streamz.git@master
 name: all_cuda-118_arch-x86_64
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index caf39a32d79..c60ffa7aaa5 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -64,7 +64,6 @@ dependencies:
 - pandas
 - pandas>=2.0,<2.2.3dev0
 - pandoc
-- pip
 - pre-commit
 - pyarrow==16.1.0.*
 - pydata-sphinx-theme!=0.14.2
@@ -97,6 +96,4 @@ dependencies:
 - transformers==4.39.3
 - typing_extensions>=4.0.0
 - zlib>=1.2.13
-- pip:
-  - git+https://github.com/python-streamz/streamz.git@master
 name: all_cuda-125_arch-x86_64
diff --git a/dependencies.yaml b/dependencies.yaml
index a774345fe95..150d03be021 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -10,10 +10,10 @@ files:
       - build_all
       - build_cpp
       - build_python_common
-      - build_python_pylibcudf
-      - build_python_cudf
       - cuda
       - cuda_version
+      - depends_on_cupy
+      - depends_on_rmm
       - develop
       - docs
       - libarrow_build
@@ -31,7 +31,6 @@ files:
       - test_python_cudf
       - test_python_dask_cudf
       - test_python_pylibcudf
-      - depends_on_cupy
   test_static_build:
     output: none
     includes:
@@ -95,7 +94,8 @@ files:
     includes:
       - build_base
       - build_python_common
-      - build_python_cudf
+      - depends_on_pylibcudf
+      - depends_on_rmm
   py_run_cudf:
     output: pyproject
     pyproject_dir: python/cudf
@@ -107,6 +107,7 @@ files:
       - pyarrow_run
       - depends_on_cupy
       - depends_on_pylibcudf
+      - depends_on_rmm
   py_test_cudf:
     output: pyproject
     pyproject_dir: python/cudf
@@ -116,14 +117,14 @@ files:
     includes:
       - test_python_common
       - test_python_cudf
-  py_rapids_build_pylibcudf:
+  py_build_pylibcudf:
     output: pyproject
     pyproject_dir: python/pylibcudf
     extras:
       table: build-system
     includes:
       - rapids_build_skbuild
-  py_build_pylibcudf:
+  py_rapids_build_pylibcudf:
     output: pyproject
     pyproject_dir: python/pylibcudf
     extras:
@@ -132,15 +133,16 @@ files:
     includes:
       - build_base
       - build_python_common
-      - build_python_pylibcudf
+      - depends_on_rmm
   py_run_pylibcudf:
     output: pyproject
     pyproject_dir: python/pylibcudf
     extras:
       table: project
     includes:
-      - run_pylibcudf
+      - depends_on_rmm
       - pyarrow_run
+      - run_pylibcudf
   py_test_pylibcudf:
     output: pyproject
     pyproject_dir: python/pylibcudf
@@ -215,14 +217,14 @@ files:
     includes:
       - test_python_common
       - test_python_dask_cudf
-  py_rapids_build_cudf_kafka:
+  py_build_cudf_kafka:
     output: pyproject
     pyproject_dir: python/cudf_kafka
     extras:
       table: build-system
     includes:
       - rapids_build_skbuild
-  py_build_cudf_kafka:
+  py_rapids_build_cudf_kafka:
     output: pyproject
     pyproject_dir: python/cudf_kafka
     extras:
@@ -364,65 +366,6 @@ dependencies:
           # Sync with conda build constraint & wheel run constraint.
           # TODO: Change to `2.0.*` for NumPy 2
           - numpy==1.23.*
-  build_python_pylibcudf:
-    common:
-      - output_types: conda
-        packages:
-          - &rmm_unsuffixed rmm==24.10.*,>=0.0.0a0
-      - output_types: requirements
-        packages:
-          # pip recognizes the index as a global option for the requirements.txt file
-          # This index is needed for rmm-cu{11,12}.
-          - --extra-index-url=https://pypi.nvidia.com
-          - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
-    specific:
-      - output_types: [requirements, pyproject]
-        matrices:
-          - matrix:
-              cuda: "12.*"
-              cuda_suffixed: "true"
-            packages:
-              - &rmm_cu12 rmm-cu12==24.10.*,>=0.0.0a0
-          - matrix:
-              cuda: "11.*"
-              cuda_suffixed: "true"
-            packages:
-              - &rmm_cu11 rmm-cu11==24.10.*,>=0.0.0a0
-          - {matrix: null, packages: [*rmm_unsuffixed]}
-  build_python_cudf:
-    common:
-      - output_types: conda
-        packages:
-          - *rmm_unsuffixed
-          - pip
-          - pip:
-              - git+https://github.com/python-streamz/streamz.git@master
-      - output_types: requirements
-        packages:
-          # pip recognizes the index as a global option for the requirements.txt file
-          # This index is needed for rmm-cu{11,12}.
-          - --extra-index-url=https://pypi.nvidia.com
-          - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
-          - git+https://github.com/python-streamz/streamz.git@master
-    specific:
-      - output_types: [requirements, pyproject]
-        matrices:
-          - matrix:
-              cuda: "12.*"
-              cuda_suffixed: "true"
-            packages:
-              - &pylibcudf_cu12 pylibcudf-cu12==24.10.*,>=0.0.0a0
-              - *rmm_cu12
-          - matrix:
-              cuda: "11.*"
-              cuda_suffixed: "true"
-            packages:
-              - &pylibcudf_cu11 pylibcudf-cu11==24.10.*,>=0.0.0a0
-              - *rmm_cu11
-          - matrix:
-            packages:
-              - &pylibcudf_unsuffixed pylibcudf==24.10.*,>=0.0.0a0
-              - *rmm_unsuffixed
   libarrow_build:
     common:
       - output_types: conda
@@ -635,9 +578,6 @@ dependencies:
           - nvtx>=0.2.1
           - packaging
           - typing_extensions>=4.0.0
-      - output_types: conda
-        packages:
-          - *rmm_unsuffixed
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
@@ -654,19 +594,6 @@ dependencies:
             packages: &run_pylibcudf_packages_all_cu11
               - cuda-python>=11.7.1,<12.0a0
           - {matrix: null, packages: *run_pylibcudf_packages_all_cu11}
-      - output_types: [requirements, pyproject]
-        matrices:
-          - matrix:
-              cuda: "12.*"
-              cuda_suffixed: "true"
-            packages:
-              - *rmm_cu12
-          - matrix:
-              cuda: "11.*"
-              cuda_suffixed: "true"
-            packages:
-              - *rmm_cu11
-          - {matrix: null, packages: [*rmm_unsuffixed]}
   run_cudf:
     common:
       - output_types: [conda, requirements, pyproject]
@@ -677,9 +604,6 @@ dependencies:
           - packaging
           - rich
           - typing_extensions>=4.0.0
-      - output_types: conda
-        packages:
-          - *rmm_unsuffixed
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
@@ -711,19 +635,16 @@ dependencies:
               cuda: "12.*"
               cuda_suffixed: "true"
             packages:
-              - *rmm_cu12
               - pynvjitlink-cu12>=0.0.0a0
           - matrix:
               cuda: "12.*"
               cuda_suffixed: "false"
             packages:
-              - *rmm_unsuffixed
               - *pynvjitlink_unsuffixed
           - matrix:
               cuda: "11.*"
               cuda_suffixed: "true"
             packages:
-              - *rmm_cu11
               - cubinlinker-cu11
               - ptxcompiler-cu11
           - matrix:
@@ -732,7 +653,6 @@ dependencies:
             packages: &run_cudf_cu11_unsuffixed
               - *cubinlinker_unsuffixed
               - *ptxcompiler_unsuffixed
-              - *rmm_unsuffixed
           - {matrix: null, packages: *run_cudf_cu11_unsuffixed}
   run_cudf_polars:
     common:
@@ -843,7 +763,7 @@ dependencies:
     common:
       - output_types: conda
         packages:
-          - *pylibcudf_unsuffixed
+          - &pylibcudf_unsuffixed pylibcudf==24.10.*,>=0.0.0a0
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
@@ -857,12 +777,12 @@ dependencies:
               cuda: "12.*"
               cuda_suffixed: "true"
             packages:
-              - *pylibcudf_cu12
+              - pylibcudf-cu12==24.10.*,>=0.0.0a0
           - matrix:
               cuda: "11.*"
               cuda_suffixed: "true"
             packages:
-              - *pylibcudf_cu11
+              - pylibcudf-cu11==24.10.*,>=0.0.0a0
           - {matrix: null, packages: [*pylibcudf_unsuffixed]}
   depends_on_cudf:
     common:
@@ -929,6 +849,33 @@ dependencies:
             packages: &cupy_packages_cu11
               - cupy-cuda11x>=12.0.0
           - {matrix: null, packages: *cupy_packages_cu11}
+  depends_on_rmm:
+    common:
+      - output_types: conda
+        packages:
+          - &rmm_unsuffixed rmm==24.10.*,>=0.0.0a0
+      - output_types: requirements
+        packages:
+          # pip recognizes the index as a global option for the requirements.txt file
+          # This index is needed for rmm-cu{11,12}.
+          - --extra-index-url=https://pypi.nvidia.com
+          - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
+    specific:
+      - output_types: [requirements, pyproject]
+        matrices:
+          - matrix:
+              cuda: "12.*"
+              cuda_suffixed: "true"
+            packages:
+              - rmm-cu12==24.10.*,>=0.0.0a0
+          - matrix:
+              cuda: "11.*"
+              cuda_suffixed: "true"
+            packages:
+              - rmm-cu11==24.10.*,>=0.0.0a0
+          - matrix:
+            packages:
+              - *rmm_unsuffixed
   test_python_pandas_cudf:
     common:
       - output_types: [requirements, pyproject]

From 28fee97c24bcb5f6c61241058c7c3f824687f654 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 20 Aug 2024 17:02:49 -0400
Subject: [PATCH 702/842] Enable gtests previously disabled for
 compute-sanitizer bug (#16581)

Enables tests disable in https://github.com/rapidsai/cudf/pull/15259 due to a `compute-sanitizer` bug. This has been fixed in the CUDA 12.5 release and the nightly memchecks should pass again with these enabled.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16581
---
 .../iterator/value_iterator_test_numeric.cu   | 16 ++-----------
 cpp/tests/reductions/reduction_tests.cpp      |  3 ---
 .../reductions/segmented_reduction_tests.cpp  | 24 -------------------
 3 files changed, 2 insertions(+), 41 deletions(-)

diff --git a/cpp/tests/iterator/value_iterator_test_numeric.cu b/cpp/tests/iterator/value_iterator_test_numeric.cu
index d3d1c12bdc7..39e05ff6832 100644
--- a/cpp/tests/iterator/value_iterator_test_numeric.cu
+++ b/cpp/tests/iterator/value_iterator_test_numeric.cu
@@ -23,17 +23,5 @@ template <typename T>
 struct NumericValueIteratorTest : public IteratorTest<T> {};
 
 TYPED_TEST_SUITE(NumericValueIteratorTest, TestingTypes);
-TYPED_TEST(NumericValueIteratorTest, non_null_iterator)
-{
-  if constexpr (std::is_same_v<TypeParam, int16_t> || std::is_same_v<TypeParam, uint16_t>) {
-    if (getenv("LIBCUDF_MEMCHECK_ENABLED")) { return; }
-  }
-  non_null_iterator(*this);
-}
-TYPED_TEST(NumericValueIteratorTest, null_iterator)
-{
-  if constexpr (std::is_same_v<TypeParam, int16_t> || std::is_same_v<TypeParam, uint16_t>) {
-    if (getenv("LIBCUDF_MEMCHECK_ENABLED")) { return; }
-  }
-  null_iterator(*this);
-}
+TYPED_TEST(NumericValueIteratorTest, non_null_iterator) { non_null_iterator(*this); }
+TYPED_TEST(NumericValueIteratorTest, null_iterator) { null_iterator(*this); }
diff --git a/cpp/tests/reductions/reduction_tests.cpp b/cpp/tests/reductions/reduction_tests.cpp
index 0ec4cfa34c4..949ffcc26a6 100644
--- a/cpp/tests/reductions/reduction_tests.cpp
+++ b/cpp/tests/reductions/reduction_tests.cpp
@@ -300,9 +300,6 @@ TYPED_TEST_SUITE(ReductionTest, cudf::test::NumericTypes);
 TYPED_TEST(ReductionTest, Product)
 {
   using T = TypeParam;
-  if constexpr (std::is_same_v<T, int16_t> || std::is_same_v<T, uint16_t>) {
-    if (getenv("LIBCUDF_MEMCHECK_ENABLED")) { return; }
-  }
 
   std::vector<int> int_values({5, -1, 1, 0, 3, 2, 4});
   std::vector<bool> host_bools({true, true, false, false, true, true, true});
diff --git a/cpp/tests/reductions/segmented_reduction_tests.cpp b/cpp/tests/reductions/segmented_reduction_tests.cpp
index 37efc116d2a..668690639a6 100644
--- a/cpp/tests/reductions/segmented_reduction_tests.cpp
+++ b/cpp/tests/reductions/segmented_reduction_tests.cpp
@@ -87,10 +87,6 @@ TYPED_TEST(SegmentedReductionTest, SumExcludeNulls)
 
 TYPED_TEST(SegmentedReductionTest, ProductExcludeNulls)
 {
-  if constexpr (std::is_same_v<TypeParam, int16_t> || std::is_same_v<TypeParam, uint16_t>) {
-    if (getenv("LIBCUDF_MEMCHECK_ENABLED")) { return; }
-  }
-
   // [1, 3, 5], [null, 3, 5], [1], [null], [null, null], []
   // values:    {1, 3, 5, XXX, 3, 5, 1, XXX, XXX, XXX}
   // offsets:   {0, 3, 6, 7, 8, 10, 10}
@@ -141,10 +137,6 @@ TYPED_TEST(SegmentedReductionTest, ProductExcludeNulls)
 
 TYPED_TEST(SegmentedReductionTest, MaxExcludeNulls)
 {
-  if constexpr (std::is_same_v<TypeParam, int16_t> || std::is_same_v<TypeParam, uint16_t>) {
-    if (getenv("LIBCUDF_MEMCHECK_ENABLED")) { return; }
-  }
-
   // [1, 2, 3], [1, null, 3], [1], [null], [null, null], []
   // values:    {1, 2, 3, 1, XXX, 3, 1, XXX, XXX, XXX}
   // offsets:   {0, 3, 6, 7, 8, 10, 10}
@@ -193,10 +185,6 @@ TYPED_TEST(SegmentedReductionTest, MaxExcludeNulls)
 
 TYPED_TEST(SegmentedReductionTest, MinExcludeNulls)
 {
-  if constexpr (std::is_same_v<TypeParam, int16_t> || std::is_same_v<TypeParam, uint16_t>) {
-    if (getenv("LIBCUDF_MEMCHECK_ENABLED")) { return; }
-  }
-
   // [1, 2, 3], [1, null, 3], [1], [null], [null, null], []
   // values:   {1, 2, 3, 1, XXX, 3, 1, XXX, XXX, XXX}
   // offsets:  {0, 3, 6, 7, 8, 10, 10}
@@ -388,10 +376,6 @@ TYPED_TEST(SegmentedReductionTest, SumIncludeNulls)
 
 TYPED_TEST(SegmentedReductionTest, ProductIncludeNulls)
 {
-  if constexpr (std::is_same_v<TypeParam, int16_t> || std::is_same_v<TypeParam, uint16_t>) {
-    if (getenv("LIBCUDF_MEMCHECK_ENABLED")) { return; }
-  }
-
   // [1, 3, 5], [null, 3, 5], [1], [null], [null, null], []
   // values:    {1, 3, 5, XXX, 3, 5, 1, XXX, XXX, XXX}
   // offsets:   {0, 3, 6, 7, 8, 10, 10}
@@ -445,10 +429,6 @@ TYPED_TEST(SegmentedReductionTest, ProductIncludeNulls)
 
 TYPED_TEST(SegmentedReductionTest, MaxIncludeNulls)
 {
-  if constexpr (std::is_same_v<TypeParam, int16_t> || std::is_same_v<TypeParam, uint16_t>) {
-    if (getenv("LIBCUDF_MEMCHECK_ENABLED")) { return; }
-  }
-
   // [1, 2, 3], [1, null, 3], [1], [null], [null, null], []
   // values:    {1, 2, 3, 1, XXX, 3, 1, XXX, XXX, XXX}
   // offsets:   {0, 3, 6, 7, 8, 10, 10}
@@ -500,10 +480,6 @@ TYPED_TEST(SegmentedReductionTest, MaxIncludeNulls)
 
 TYPED_TEST(SegmentedReductionTest, MinIncludeNulls)
 {
-  if constexpr (std::is_same_v<TypeParam, int16_t> || std::is_same_v<TypeParam, uint16_t>) {
-    if (getenv("LIBCUDF_MEMCHECK_ENABLED")) { return; }
-  }
-
   // [1, 2, 3], [1, null, 3], [1], [null], [null, null], []
   // values:   {1, 2, 3, 1, XXX, 3, 1, XXX, XXX}
   // offsets:  {0, 3, 6, 7, 8, 10, 10}

From 58799d698d861866b5650d368f5195174fc9644e Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 20 Aug 2024 11:29:27 -1000
Subject: [PATCH 703/842] Add stricter typing and validation to ColumnAccessor
 (#16602)

* Added typing annotations that are generally a little stricter on when `Column`s should be passed. Added error handling for these cases
* Moved some argument checking that was performed on `DataFrame` to `ColumnAccessor`
* Adding more `verify=False` to `ColumnAccessor` calls and preserving `.label_dtype` more when we're just selecting columns from the prior `ColumnAccessor`

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16602
---
 python/cudf/cudf/_lib/csv.pyx                 |   2 +-
 python/cudf/cudf/core/_base_index.py          |   2 +-
 python/cudf/cudf/core/column_accessor.py      | 114 ++++++++++--------
 python/cudf/cudf/core/dataframe.py            |  14 +--
 python/cudf/cudf/core/frame.py                |   4 +-
 python/cudf/cudf/core/indexing_utils.py       |   4 -
 python/cudf/cudf/core/join/_join_helpers.py   |   8 +-
 python/cudf/cudf/core/join/join.py            |   5 +-
 .../cudf/cudf/tests/test_column_accessor.py   |   2 +-
 9 files changed, 80 insertions(+), 75 deletions(-)

diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx
index a90fe0f9ac6..e0f57df1368 100644
--- a/python/cudf/cudf/_lib/csv.pyx
+++ b/python/cudf/cudf/_lib/csv.pyx
@@ -282,7 +282,7 @@ def read_csv(
     # Set index if the index_col parameter is passed
     if index_col is not None and index_col is not False:
         if isinstance(index_col, int):
-            index_col_name = df._data.select_by_index(index_col).names[0]
+            index_col_name = df._data.get_labels_by_index(index_col)[0]
             df = df.set_index(index_col_name)
             if isinstance(index_col_name, str) and \
                     names is None and orig_header == "infer":
diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py
index d13351c49dd..a224e0ce0d0 100644
--- a/python/cudf/cudf/core/_base_index.py
+++ b/python/cudf/cudf/core/_base_index.py
@@ -1698,7 +1698,7 @@ def join(
         # in case of MultiIndex
         if isinstance(lhs, cudf.MultiIndex):
             on = (
-                lhs._data.select_by_index(level).names[0]
+                lhs._data.get_labels_by_index(level)[0]
                 if isinstance(level, int)
                 else level
             )
diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py
index 67c19f11e41..7aa3e5f8163 100644
--- a/python/cudf/cudf/core/column_accessor.py
+++ b/python/cudf/cudf/core/column_accessor.py
@@ -102,7 +102,7 @@ def __init__(
         rangeindex: bool = False,
         label_dtype: Dtype | None = None,
         verify: bool = True,
-    ):
+    ) -> None:
         if isinstance(data, ColumnAccessor):
             self._data = data._data
             self._level_names = data.level_names
@@ -147,10 +147,10 @@ def __iter__(self):
     def __getitem__(self, key: Any) -> ColumnBase:
         return self._data[key]
 
-    def __setitem__(self, key: Any, value: Any):
+    def __setitem__(self, key: Any, value: ColumnBase) -> None:
         self.set_by_label(key, value)
 
-    def __delitem__(self, key: Any):
+    def __delitem__(self, key: Any) -> None:
         old_ncols = len(self._data)
         del self._data[key]
         new_ncols = len(self._data)
@@ -174,7 +174,7 @@ def __repr__(self) -> str:
 
     def _from_columns_like_self(
         self, columns: abc.Iterable[ColumnBase], verify: bool = True
-    ):
+    ) -> Self:
         """
         Return a new ColumnAccessor with columns and the properties of self.
 
@@ -250,7 +250,7 @@ def _grouped_data(self) -> abc.MutableMapping:
         else:
             return self._data
 
-    def _clear_cache(self, old_ncols: int, new_ncols: int):
+    def _clear_cache(self, old_ncols: int, new_ncols: int) -> None:
         """
         Clear cached attributes.
 
@@ -310,16 +310,14 @@ def to_pandas_index(self) -> pd.Index:
             )
         return result
 
-    def insert(
-        self, name: Any, value: Any, loc: int = -1, validate: bool = True
-    ):
+    def insert(self, name: Any, value: ColumnBase, loc: int = -1) -> None:
         """
         Insert column into the ColumnAccessor at the specified location.
 
         Parameters
         ----------
         name : Name corresponding to the new column
-        value : column-like
+        value : ColumnBase
         loc : int, optional
             The location to insert the new value at.
             Must be (0 <= loc <= ncols). By default, the column is added
@@ -330,33 +328,35 @@ def insert(
         None, this function operates in-place.
         """
         name = self._pad_key(name)
+        if name in self._data:
+            raise ValueError(f"Cannot insert '{name}', already exists")
 
         old_ncols = len(self._data)
         if loc == -1:
             loc = old_ncols
-        if not (0 <= loc <= old_ncols):
+        elif not (0 <= loc <= old_ncols):
             raise ValueError(
                 f"insert: loc out of bounds: must be  0 <= loc <= {old_ncols}"
             )
+
+        if not isinstance(value, column.ColumnBase):
+            raise ValueError("value must be a Column")
+        elif old_ncols > 0 and len(value) != self.nrows:
+            raise ValueError("All columns must be of equal length")
+
         # TODO: we should move all insert logic here
-        if name in self._data:
-            raise ValueError(f"Cannot insert '{name}', already exists")
         if loc == old_ncols:
-            if validate:
-                value = column.as_column(value)
-                if old_ncols > 0 and len(value) != self.nrows:
-                    raise ValueError("All columns must be of equal length")
             self._data[name] = value
         else:
             new_keys = self.names[:loc] + (name,) + self.names[loc:]
             new_values = self.columns[:loc] + (value,) + self.columns[loc:]
-            self._data = self._data.__class__(zip(new_keys, new_values))
+            self._data = dict(zip(new_keys, new_values))
         self._clear_cache(old_ncols, old_ncols + 1)
         if old_ncols == 0:
             # The type(name) may no longer match the prior label_dtype
             self.label_dtype = None
 
-    def copy(self, deep=False) -> ColumnAccessor:
+    def copy(self, deep: bool = False) -> Self:
         """
         Make a copy of this ColumnAccessor.
         """
@@ -373,7 +373,7 @@ def copy(self, deep=False) -> ColumnAccessor:
             verify=False,
         )
 
-    def select_by_label(self, key: Any) -> ColumnAccessor:
+    def select_by_label(self, key: Any) -> Self:
         """
         Return a subset of this column accessor,
         composed of the keys specified by `key`.
@@ -389,7 +389,7 @@ def select_by_label(self, key: Any) -> ColumnAccessor:
         if isinstance(key, slice):
             return self._select_by_label_slice(key)
         elif pd.api.types.is_list_like(key) and not isinstance(key, tuple):
-            return self._select_by_label_list_like(key)
+            return self._select_by_label_list_like(tuple(key))
         else:
             if isinstance(key, tuple):
                 if any(isinstance(k, slice) for k in key):
@@ -427,9 +427,13 @@ def get_labels_by_index(self, index: Any) -> tuple:
             # TODO: Doesn't handle on-device columns
             return tuple(n for n, keep in zip(self.names, index) if keep)
         else:
+            if len(set(index)) != len(index):
+                raise NotImplementedError(
+                    "Selecting duplicate column labels is not supported."
+                )
             return tuple(self.names[i] for i in index)
 
-    def select_by_index(self, index: Any) -> ColumnAccessor:
+    def select_by_index(self, index: Any) -> Self:
         """
         Return a ColumnAccessor composed of the columns
         specified by index.
@@ -445,13 +449,15 @@ def select_by_index(self, index: Any) -> ColumnAccessor:
         """
         keys = self.get_labels_by_index(index)
         data = {k: self._data[k] for k in keys}
-        return self.__class__(
+        return type(self)(
             data,
             multiindex=self.multiindex,
             level_names=self.level_names,
+            label_dtype=self.label_dtype,
+            verify=False,
         )
 
-    def swaplevel(self, i=-2, j=-1):
+    def swaplevel(self, i=-2, j=-1) -> Self:
         """
         Swap level i with level j.
         Calling this method does not change the ordering of the values.
@@ -467,6 +473,10 @@ def swaplevel(self, i=-2, j=-1):
         -------
         ColumnAccessor
         """
+        if not self.multiindex:
+            raise ValueError(
+                "swaplevel is only valid for self.multiindex=True"
+            )
 
         i = _get_level(i, self.nlevels, self.level_names)
         j = _get_level(j, self.nlevels, self.level_names)
@@ -486,13 +496,16 @@ def swaplevel(self, i=-2, j=-1):
         new_names = list(self.level_names)
         new_names[i], new_names[j] = new_names[j], new_names[i]
 
-        return self.__class__(
+        return type(self)(
             new_data,
-            multiindex=True,
+            multiindex=self.multiindex,
             level_names=new_names,
+            rangeindex=self.rangeindex,
+            label_dtype=self.label_dtype,
+            verify=False,
         )
 
-    def set_by_label(self, key: Any, value: Any, validate: bool = True):
+    def set_by_label(self, key: Any, value: ColumnBase) -> None:
         """
         Add (or modify) column by name.
 
@@ -500,26 +513,21 @@ def set_by_label(self, key: Any, value: Any, validate: bool = True):
         ----------
         key
             name of the column
-        value : column-like
+        value : Column
             The value to insert into the column.
-        validate : bool
-            If True, the provided value will be coerced to a column and
-            validated before setting (Default value = True).
         """
         key = self._pad_key(key)
-        if validate:
-            value = column.as_column(value)
-            if len(self._data) > 0 and len(value) != self.nrows:
-                raise ValueError("All columns must be of equal length")
+        if not isinstance(value, column.ColumnBase):
+            raise ValueError("value must be a Column")
+        if len(self) > 0 and len(value) != self.nrows:
+            raise ValueError("All columns must be of equal length")
 
         old_ncols = len(self._data)
         self._data[key] = value
         new_ncols = len(self._data)
         self._clear_cache(old_ncols, new_ncols)
 
-    def _select_by_label_list_like(self, key: Any) -> ColumnAccessor:
-        # Might be a generator
-        key = tuple(key)
+    def _select_by_label_list_like(self, key: tuple) -> Self:
         # Special-casing for boolean mask
         if (bn := len(key)) > 0 and all(map(is_bool, key)):
             if bn != (n := len(self.names)):
@@ -539,19 +547,22 @@ def _select_by_label_list_like(self, key: Any) -> ColumnAccessor:
                 )
         if self.multiindex:
             data = dict(_to_flat_dict_inner(data))
-        return self.__class__(
+        return type(self)(
             data,
             multiindex=self.multiindex,
             level_names=self.level_names,
+            label_dtype=self.label_dtype,
+            verify=False,
         )
 
-    def _select_by_label_grouped(self, key: Any) -> ColumnAccessor:
+    def _select_by_label_grouped(self, key: Any) -> Self:
         result = self._grouped_data[key]
         if isinstance(result, column.ColumnBase):
             # self._grouped_data[key] = self._data[key] so skip validation
-            return self.__class__(
+            return type(self)(
                 data={key: result},
                 multiindex=self.multiindex,
+                label_dtype=self.label_dtype,
                 verify=False,
             )
         else:
@@ -563,9 +574,10 @@ def _select_by_label_grouped(self, key: Any) -> ColumnAccessor:
                 result,
                 multiindex=self.nlevels - len(key) > 1,
                 level_names=self.level_names[len(key) :],
+                verify=False,
             )
 
-    def _select_by_label_slice(self, key: slice) -> ColumnAccessor:
+    def _select_by_label_slice(self, key: slice) -> Self:
         start, stop = key.start, key.stop
         if key.step is not None:
             raise TypeError("Label slicing with step is not supported")
@@ -585,19 +597,22 @@ def _select_by_label_slice(self, key: slice) -> ColumnAccessor:
                 stop_idx = len(self.names) - idx
                 break
         keys = self.names[start_idx:stop_idx]
-        return self.__class__(
+        return type(self)(
             {k: self._data[k] for k in keys},
             multiindex=self.multiindex,
             level_names=self.level_names,
+            label_dtype=self.label_dtype,
             verify=False,
         )
 
-    def _select_by_label_with_wildcard(self, key: Any) -> ColumnAccessor:
+    def _select_by_label_with_wildcard(self, key: tuple) -> Self:
         key = self._pad_key(key, slice(None))
-        return self.__class__(
-            {k: self._data[k] for k in self._data if _keys_equal(k, key)},
+        data = {k: self._data[k] for k in self.names if _keys_equal(k, key)}
+        return type(self)(
+            data,
             multiindex=self.multiindex,
             level_names=self.level_names,
+            label_dtype=self.label_dtype,
             verify=False,
         )
 
@@ -614,7 +629,7 @@ def _pad_key(self, key: Any, pad_value="") -> Any:
 
     def rename_levels(
         self, mapper: Mapping[Any, Any] | Callable, level: int | None = None
-    ) -> ColumnAccessor:
+    ) -> Self:
         """
         Rename the specified levels of the given ColumnAccessor
 
@@ -686,7 +701,7 @@ def rename_column(x):
             verify=False,
         )
 
-    def droplevel(self, level):
+    def droplevel(self, level) -> None:
         # drop the nth level
         if level < 0:
             level += self.nlevels
@@ -701,9 +716,8 @@ def droplevel(self, level):
             self._level_names[:level] + self._level_names[level + 1 :]
         )
 
-        if (
-            len(self._level_names) == 1
-        ):  # can't use nlevels, as it depends on multiindex
+        if len(self._level_names) == 1:
+            # can't use nlevels, as it depends on multiindex
             self.multiindex = False
         self._clear_cache(old_ncols, new_ncols)
 
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 97684129203..43693ec20b1 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -382,19 +382,19 @@ def _setitem_tuple_arg(self, key, value):
                 value = as_column(value, length=length)
 
             if isinstance(value, ColumnBase):
-                new_col = cudf.Series._from_column(value, index=idx)
+                new_ser = cudf.Series._from_column(value, index=idx)
             else:
-                new_col = cudf.Series(value, index=idx)
+                new_ser = cudf.Series(value, index=idx)
             if len(self._frame.index) != 0:
-                new_col = new_col._align_to_index(
+                new_ser = new_ser._align_to_index(
                     self._frame.index, how="right"
                 )
 
             if len(self._frame.index) == 0:
                 self._frame.index = (
-                    idx if idx is not None else cudf.RangeIndex(len(new_col))
+                    idx if idx is not None else cudf.RangeIndex(len(new_ser))
                 )
-            self._frame._data.insert(key[1], new_col)
+            self._frame._data.insert(key[1], new_ser._column)
         else:
             if is_scalar(value):
                 for col in columns_df._column_names:
@@ -981,6 +981,7 @@ def _init_from_series_list(self, data, columns, index):
             self._data.rangeindex = isinstance(
                 columns, (range, cudf.RangeIndex, pd.RangeIndex)
             )
+            self._data.label_dtype = pd.Index(columns).dtype
         else:
             self._data.rangeindex = True
 
@@ -3272,9 +3273,6 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True):
             If False, a reindexing operation is performed if
             `value.index` is not equal to `self.index`.
         """
-        if name in self._data:
-            raise NameError(f"duplicated column name {name}")
-
         num_cols = self._num_columns
         if loc < 0:
             loc += num_cols + 1
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index ce23d671a6c..3e1efd7c97a 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -1010,9 +1010,7 @@ def _copy_type_metadata(self: Self, other: Self) -> Self:
         See `ColumnBase._with_type_metadata` for more information.
         """
         for (name, col), (_, dtype) in zip(self._data.items(), other._dtypes):
-            self._data.set_by_label(
-                name, col._with_type_metadata(dtype), validate=False
-            )
+            self._data.set_by_label(name, col._with_type_metadata(dtype))
 
         return self
 
diff --git a/python/cudf/cudf/core/indexing_utils.py b/python/cudf/cudf/core/indexing_utils.py
index a0089242909..8182e5cede2 100644
--- a/python/cudf/cudf/core/indexing_utils.py
+++ b/python/cudf/cudf/core/indexing_utils.py
@@ -152,10 +152,6 @@ def destructure_dataframe_iloc_indexer(
         column_names: ColumnLabels = list(
             frame._data.get_labels_by_index(cols)
         )
-        if len(set(column_names)) != len(column_names):
-            raise NotImplementedError(
-                "cudf DataFrames do not support repeated column names"
-            )
     except TypeError:
         raise TypeError(
             "Column indices must be integers, slices, "
diff --git a/python/cudf/cudf/core/join/_join_helpers.py b/python/cudf/cudf/core/join/_join_helpers.py
index 32c84763401..854c44ff1a1 100644
--- a/python/cudf/cudf/core/join/_join_helpers.py
+++ b/python/cudf/cudf/core/join/_join_helpers.py
@@ -37,16 +37,16 @@ class _ColumnIndexer(_Indexer):
     def get(self, obj: cudf.DataFrame) -> ColumnBase:
         return obj._data[self.name]
 
-    def set(self, obj: cudf.DataFrame, value: ColumnBase, validate=False):
-        obj._data.set_by_label(self.name, value, validate=validate)
+    def set(self, obj: cudf.DataFrame, value: ColumnBase):
+        obj._data.set_by_label(self.name, value)
 
 
 class _IndexIndexer(_Indexer):
     def get(self, obj: cudf.DataFrame) -> ColumnBase:
         return obj.index._data[self.name]
 
-    def set(self, obj: cudf.DataFrame, value: ColumnBase, validate=False):
-        obj.index._data.set_by_label(self.name, value, validate=validate)
+    def set(self, obj: cudf.DataFrame, value: ColumnBase):
+        obj.index._data.set_by_label(self.name, value)
 
 
 def _match_join_keys(
diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py
index ce81c1fc5b1..b65bc7af832 100644
--- a/python/cudf/cudf/core/join/join.py
+++ b/python/cudf/cudf/core/join/join.py
@@ -272,8 +272,8 @@ def perform_merge(self) -> cudf.DataFrame:
                 lcol_casted = lcol_casted.astype("category")
                 rcol_casted = rcol_casted.astype("category")
 
-            left_key.set(self.lhs, lcol_casted, validate=False)
-            right_key.set(self.rhs, rcol_casted, validate=False)
+            left_key.set(self.lhs, lcol_casted)
+            right_key.set(self.rhs, rcol_casted)
 
         left_rows, right_rows = self._gather_maps(
             left_join_cols, right_join_cols
@@ -329,7 +329,6 @@ def _merge_results(
                     lkey.set(
                         left_result,
                         lkey.get(left_result).fillna(rkey.get(right_result)),
-                        validate=False,
                     )
 
         # All columns from the left table make it into the output. Non-key
diff --git a/python/cudf/cudf/tests/test_column_accessor.py b/python/cudf/cudf/tests/test_column_accessor.py
index 2d7bc809d4d..5cef077c18d 100644
--- a/python/cudf/cudf/tests/test_column_accessor.py
+++ b/python/cudf/cudf/tests/test_column_accessor.py
@@ -370,7 +370,7 @@ def test_replace_level_values_MultiColumn():
 def test_clear_nrows_empty_before():
     ca = ColumnAccessor({})
     assert ca.nrows == 0
-    ca.insert("new", [1])
+    ca.insert("new", as_column([1]))
     assert ca.nrows == 1
 
 
From 8ab553c7835b21c2d5fcc76cb24960db03722b15 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Wed, 21 Aug 2024 08:47:13 -0400
Subject: [PATCH 704/842] Move libcudf reduction google-benchmarks to nvbench
 (#16564)

Reworks the reduction benchmarks currently coded with google-bench to use nvbench instead.
This removes the need to support `row_bit_count` for dictionary column types.
#16121

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Mark Harris (https://github.com/harrism)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/16564
---
 cpp/benchmarks/CMakeLists.txt           |  14 ++-
 cpp/benchmarks/reduction/anyall.cpp     |  80 +++++++---------
 cpp/benchmarks/reduction/dictionary.cpp | 111 ++++++++++++-----------
 cpp/benchmarks/reduction/minmax.cpp     |  63 +++++--------
 cpp/benchmarks/reduction/reduce.cpp     | 116 ++++++++++++------------
 cpp/benchmarks/reduction/scan.cpp       |  65 ++++++-------
 6 files changed, 210 insertions(+), 239 deletions(-)

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 483b7b0a539..6db282a7728 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -207,12 +207,16 @@ ConfigureBench(TYPE_DISPATCHER_BENCH type_dispatcher/type_dispatcher.cu)
 
 # ##################################################################################################
 # * reduction benchmark ---------------------------------------------------------------------------
-ConfigureBench(
-  REDUCTION_BENCH reduction/anyall.cpp reduction/dictionary.cpp reduction/minmax.cpp
-  reduction/reduce.cpp reduction/scan.cpp
-)
 ConfigureNVBench(
-  REDUCTION_NVBENCH reduction/rank.cpp reduction/scan_structs.cpp reduction/segmented_reduce.cpp
+  REDUCTION_NVBENCH
+  reduction/anyall.cpp
+  reduction/dictionary.cpp
+  reduction/minmax.cpp
+  reduction/rank.cpp
+  reduction/reduce.cpp
+  reduction/scan.cpp
+  reduction/scan_structs.cpp
+  reduction/segmented_reduce.cpp
 )
 
 # ##################################################################################################
diff --git a/cpp/benchmarks/reduction/anyall.cpp b/cpp/benchmarks/reduction/anyall.cpp
index e9d23881764..1e578fab181 100644
--- a/cpp/benchmarks/reduction/anyall.cpp
+++ b/cpp/benchmarks/reduction/anyall.cpp
@@ -16,65 +16,51 @@
 
 #include <benchmarks/common/benchmark_utilities.hpp>
 #include <benchmarks/common/generate_input.hpp>
-#include <benchmarks/common/table_utilities.hpp>
-#include <benchmarks/fixture/benchmark_fixture.hpp>
-#include <benchmarks/synchronization/synchronization.hpp>
+#include <benchmarks/common/nvbench_utilities.hpp>
 
 #include <cudf/column/column_view.hpp>
 #include <cudf/reduction.hpp>
 #include <cudf/types.hpp>
 
-#include <memory>
+#include <nvbench/nvbench.cuh>
 
-class Reduction : public cudf::benchmark {};
+#include <memory>
 
-template <typename type>
-void BM_reduction_anyall(benchmark::State& state,
-                         std::unique_ptr<cudf::reduce_aggregation> const& agg)
+template <typename DataType>
+static void reduction_anyall(nvbench::state& state, nvbench::type_list<DataType>)
 {
-  cudf::size_type const column_size{static_cast<cudf::size_type>(state.range(0))};
-  auto const dtype           = cudf::type_to_id<type>();
-  data_profile const profile = data_profile_builder().no_validity().distribution(
-    dtype, distribution_id::UNIFORM, 0, agg->kind == cudf::aggregation::ANY ? 0 : 100);
-  auto const values = create_random_column(dtype, row_count{column_size}, profile);
+  auto const size     = static_cast<cudf::size_type>(state.get_int64("size"));
+  auto const kind_str = state.get_string("kind");
 
-  cudf::data_type output_dtype{cudf::type_id::BOOL8};
+  auto const input_type = cudf::type_to_id<DataType>();
+  auto const agg        = kind_str == "any" ? cudf::make_any_aggregation<cudf::reduce_aggregation>()
+                                            : cudf::make_all_aggregation<cudf::reduce_aggregation>();
 
-  for (auto _ : state) {
-    cuda_event_timer timer(state, true);
-    auto result = cudf::reduce(*values, *agg, output_dtype);
-  }
+  data_profile const profile =
+    data_profile_builder().no_validity().distribution(input_type,
+                                                      distribution_id::UNIFORM,
+                                                      (kind_str == "all" ? 1 : 0),
+                                                      (kind_str == "any" ? 0 : 100));
+  auto const values = create_random_column(input_type, row_count{size}, profile);
 
-  // The benchmark takes a column and produces one scalar.
-  set_items_processed(state, column_size + 1);
-  set_bytes_processed(state, estimate_size(values->view()) + cudf::size_of(output_dtype));
-}
+  auto const output_type = cudf::data_type{cudf::type_id::BOOL8};
+  auto stream            = cudf::get_default_stream();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
+  state.add_element_count(size);
+  state.add_global_memory_reads<DataType>(size);
+  state.add_global_memory_writes<nvbench::int8_t>(1);
 
-#define concat(a, b, c) a##b##c
-#define get_agg(op)     concat(cudf::make_, op, _aggregation<cudf::reduce_aggregation>())
+  state.exec(nvbench::exec_tag::sync, [&values, output_type, &agg](nvbench::launch& launch) {
+    cudf::reduce(*values, *agg, output_type);
+  });
 
-// TYPE, OP
-#define RBM_BENCHMARK_DEFINE(name, type, aggregation)             \
-  BENCHMARK_DEFINE_F(Reduction, name)(::benchmark::State & state) \
-  {                                                               \
-    BM_reduction_anyall<type>(state, get_agg(aggregation));       \
-  }                                                               \
-  BENCHMARK_REGISTER_F(Reduction, name)                           \
-    ->UseManualTime()                                             \
-    ->Arg(10000)      /* 10k */                                   \
-    ->Arg(100000)     /* 100k */                                  \
-    ->Arg(1000000)    /* 1M */                                    \
-    ->Arg(10000000)   /* 10M */                                   \
-    ->Arg(100000000); /* 100M */
+  set_throughputs(state);
+}
 
-#define REDUCE_BENCHMARK_DEFINE(type, aggregation) \
-  RBM_BENCHMARK_DEFINE(concat(type, _, aggregation), type, aggregation)
+using Types = nvbench::type_list<bool, int8_t, int32_t, float>;
 
-REDUCE_BENCHMARK_DEFINE(bool, all);
-REDUCE_BENCHMARK_DEFINE(int8_t, all);
-REDUCE_BENCHMARK_DEFINE(int32_t, all);
-REDUCE_BENCHMARK_DEFINE(float, all);
-REDUCE_BENCHMARK_DEFINE(bool, any);
-REDUCE_BENCHMARK_DEFINE(int8_t, any);
-REDUCE_BENCHMARK_DEFINE(int32_t, any);
-REDUCE_BENCHMARK_DEFINE(float, any);
+NVBENCH_BENCH_TYPES(reduction_anyall, NVBENCH_TYPE_AXES(Types))
+  .set_name("anyall")
+  .set_type_axes_names({"DataType"})
+  .add_string_axis("kind", {"any", "all"})
+  .add_int64_axis("size", {100'000, 1'000'000, 10'000'000, 100'000'000});
diff --git a/cpp/benchmarks/reduction/dictionary.cpp b/cpp/benchmarks/reduction/dictionary.cpp
index 5095337dbb3..1bdb50a539a 100644
--- a/cpp/benchmarks/reduction/dictionary.cpp
+++ b/cpp/benchmarks/reduction/dictionary.cpp
@@ -16,79 +16,84 @@
 
 #include <benchmarks/common/benchmark_utilities.hpp>
 #include <benchmarks/common/generate_input.hpp>
-#include <benchmarks/fixture/benchmark_fixture.hpp>
-#include <benchmarks/synchronization/synchronization.hpp>
+#include <benchmarks/common/nvbench_utilities.hpp>
 
+#include <cudf/aggregation.hpp>
 #include <cudf/dictionary/encode.hpp>
 #include <cudf/reduction.hpp>
 #include <cudf/types.hpp>
 #include <cudf/unary.hpp>
 
-class ReductionDictionary : public cudf::benchmark {};
+#include <nvbench/nvbench.cuh>
 
-template <typename T>
-void BM_reduction_dictionary(benchmark::State& state,
-                             std::unique_ptr<cudf::reduce_aggregation> const& agg)
+template <cudf::reduce_aggregation::Kind kind>
+static std::unique_ptr<cudf::reduce_aggregation> make_reduce_aggregation()
 {
-  cudf::size_type const column_size{static_cast<cudf::size_type>(state.range(0))};
+  switch (kind) {
+    case cudf::reduce_aggregation::ANY:
+      return cudf::make_any_aggregation<cudf::reduce_aggregation>();
+    case cudf::reduce_aggregation::ALL:
+      return cudf::make_all_aggregation<cudf::reduce_aggregation>();
+    case cudf::reduce_aggregation::MIN:
+      return cudf::make_min_aggregation<cudf::reduce_aggregation>();
+    case cudf::reduce_aggregation::MAX:
+      return cudf::make_max_aggregation<cudf::reduce_aggregation>();
+    case cudf::reduce_aggregation::MEAN:
+      return cudf::make_mean_aggregation<cudf::reduce_aggregation>();
+    default: CUDF_FAIL("Unsupported reduce aggregation in this benchmark");
+  }
+}
+
+template <typename DataType, cudf::reduce_aggregation::Kind kind>
+static void reduction_dictionary(nvbench::state& state,
+                                 nvbench::type_list<DataType, nvbench::enum_type<kind>>)
+{
+  cudf::size_type const size{static_cast<cudf::size_type>(state.get_int64("size"))};
 
-  // int column and encoded dictionary column
   data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution(
     cudf::type_to_id<long>(),
     distribution_id::UNIFORM,
-    (agg->kind == cudf::aggregation::ALL ? 1 : 0),
-    (agg->kind == cudf::aggregation::ANY ? 0 : 100));
-  auto int_column = create_random_column(cudf::type_to_id<long>(), row_count{column_size}, profile);
-  auto number_col = cudf::cast(*int_column, cudf::data_type{cudf::type_to_id<T>()});
+    (kind == cudf::aggregation::ALL ? 1 : 0),
+    (kind == cudf::aggregation::ANY ? 0 : 100));
+  auto int_column = create_random_column(cudf::type_to_id<long>(), row_count{size}, profile);
+  auto number_col = cudf::cast(*int_column, cudf::data_type{cudf::type_to_id<DataType>()});
   auto values     = cudf::dictionary::encode(*number_col);
 
-  cudf::data_type output_dtype = [&] {
-    if (agg->kind == cudf::aggregation::ANY || agg->kind == cudf::aggregation::ALL)
+  cudf::data_type output_type = [&] {
+    if (kind == cudf::aggregation::ANY || kind == cudf::aggregation::ALL) {
       return cudf::data_type{cudf::type_id::BOOL8};
-    if (agg->kind == cudf::aggregation::MEAN) return cudf::data_type{cudf::type_id::FLOAT64};
-    return cudf::data_type{cudf::type_to_id<T>()};
+    }
+    if (kind == cudf::aggregation::MEAN) { return cudf::data_type{cudf::type_id::FLOAT64}; }
+    return cudf::data_type{cudf::type_to_id<DataType>()};
   }();
 
-  for (auto _ : state) {
-    cuda_event_timer timer(state, true);
-    auto result = cudf::reduce(*values, *agg, output_dtype);
+  auto agg = make_reduce_aggregation<kind>();
+
+  auto stream = cudf::get_default_stream();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
+  state.add_element_count(size);
+  state.add_global_memory_reads<DataType>(size);
+  if (kind == cudf::aggregation::ANY || kind == cudf::aggregation::ALL) {
+    state.add_global_memory_writes<nvbench::int8_t>(1);  // BOOL8s
+  } else {
+    state.add_global_memory_writes<DataType>(1);
   }
 
-  // The benchmark takes a column and produces two scalars.
-  set_items_processed(state, column_size + 1);
+  state.exec(nvbench::exec_tag::sync, [&values, output_type, &agg](nvbench::launch& launch) {
+    cudf::reduce(*values, *agg, output_type);
+  });
 
-  // We don't set the metrics for the size read/written as row_bit_count() doesn't
-  // support the dictionary type yet (and so is estimate_size()).
-  // See https://github.com/rapidsai/cudf/issues/16121 for details.
+  set_throughputs(state);
 }
 
-#define concat(a, b, c) a##b##c
-#define get_agg(op)     concat(cudf::make_, op, _aggregation<cudf::reduce_aggregation>())
-
-// TYPE, OP
-#define RBM_BENCHMARK_DEFINE(name, type, aggregation)                       \
-  BENCHMARK_DEFINE_F(ReductionDictionary, name)(::benchmark::State & state) \
-  {                                                                         \
-    BM_reduction_dictionary<type>(state, get_agg(aggregation));             \
-  }                                                                         \
-  BENCHMARK_REGISTER_F(ReductionDictionary, name)                           \
-    ->UseManualTime()                                                       \
-    ->Arg(10000)      /* 10k */                                             \
-    ->Arg(100000)     /* 100k */                                            \
-    ->Arg(1000000)    /* 1M */                                              \
-    ->Arg(10000000)   /* 10M */                                             \
-    ->Arg(100000000); /* 100M */
-
-#define REDUCE_BENCHMARK_DEFINE(type, aggregation) \
-  RBM_BENCHMARK_DEFINE(concat(type, _, aggregation), type, aggregation)
+using Types    = nvbench::type_list<int32_t, float>;
+using AggKinds = nvbench::enum_type_list<cudf::reduce_aggregation::ALL,
+                                         cudf::reduce_aggregation::ANY,
+                                         cudf::reduce_aggregation::MIN,
+                                         cudf::reduce_aggregation::MAX,
+                                         cudf::reduce_aggregation::MEAN>;
 
-REDUCE_BENCHMARK_DEFINE(int32_t, all);
-REDUCE_BENCHMARK_DEFINE(float, all);
-REDUCE_BENCHMARK_DEFINE(int32_t, any);
-REDUCE_BENCHMARK_DEFINE(float, any);
-REDUCE_BENCHMARK_DEFINE(int32_t, min);
-REDUCE_BENCHMARK_DEFINE(float, min);
-REDUCE_BENCHMARK_DEFINE(int32_t, max);
-REDUCE_BENCHMARK_DEFINE(float, max);
-REDUCE_BENCHMARK_DEFINE(int32_t, mean);
-REDUCE_BENCHMARK_DEFINE(float, mean);
+NVBENCH_BENCH_TYPES(reduction_dictionary, NVBENCH_TYPE_AXES(Types, AggKinds))
+  .set_name("reduction_dictionary")
+  .set_type_axes_names({"DataType", "AggKinds"})
+  .add_int64_axis("size", {100'000, 1'000'000, 10'000'000, 100'000'000});
diff --git a/cpp/benchmarks/reduction/minmax.cpp b/cpp/benchmarks/reduction/minmax.cpp
index 050f2887221..c89e22d3f44 100644
--- a/cpp/benchmarks/reduction/minmax.cpp
+++ b/cpp/benchmarks/reduction/minmax.cpp
@@ -16,55 +16,40 @@
 
 #include <benchmarks/common/benchmark_utilities.hpp>
 #include <benchmarks/common/generate_input.hpp>
-#include <benchmarks/common/table_utilities.hpp>
-#include <benchmarks/fixture/benchmark_fixture.hpp>
-#include <benchmarks/synchronization/synchronization.hpp>
+#include <benchmarks/common/nvbench_utilities.hpp>
 
 #include <cudf/column/column_view.hpp>
 #include <cudf/reduction.hpp>
 #include <cudf/types.hpp>
 
-class Reduction : public cudf::benchmark {};
+#include <nvbench/nvbench.cuh>
 
-template <typename type>
-void BM_reduction(benchmark::State& state)
+template <typename DataType>
+static void reduction_minmax(nvbench::state& state, nvbench::type_list<DataType>)
 {
-  cudf::size_type const column_size{(cudf::size_type)state.range(0)};
-  auto const dtype_id = cudf::type_to_id<type>();
-  auto const input_column =
-    create_random_column(dtype_id, row_count{column_size}, data_profile_builder().no_validity());
+  auto const size = static_cast<cudf::size_type>(state.get_int64("size"));
 
-  for (auto _ : state) {
-    cuda_event_timer timer(state, true);
-    auto result = cudf::minmax(*input_column);
-  }
+  auto const input_type = cudf::type_to_id<DataType>();
 
-  // The benchmark takes a column and produces two scalars.
-  set_items_processed(state, column_size + 2);
-  cudf::data_type dtype = cudf::data_type{dtype_id};
-  set_bytes_processed(state, estimate_size(input_column->view()) + 2 * cudf::size_of(dtype));
-}
+  data_profile const profile =
+    data_profile_builder().no_validity().distribution(input_type, distribution_id::UNIFORM, 0, 100);
+  auto const input_column = create_random_column(input_type, row_count{size}, profile);
+
+  auto stream = cudf::get_default_stream();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
+  state.add_element_count(size);
+  state.add_global_memory_reads<DataType>(size);
+  state.add_global_memory_writes<DataType>(2);
 
-#define concat(a, b, c) a##b##c
-#define get_agg(op)     concat(cudf::make_, op, _aggregation())
+  state.exec(nvbench::exec_tag::sync,
+             [&input_column](nvbench::launch& launch) { cudf::minmax(*input_column); });
 
-// TYPE, OP
-#define RBM_BENCHMARK_DEFINE(name, type, aggregation)                                            \
-  BENCHMARK_DEFINE_F(Reduction, name)(::benchmark::State & state) { BM_reduction<type>(state); } \
-  BENCHMARK_REGISTER_F(Reduction, name)                                                          \
-    ->UseManualTime()                                                                            \
-    ->Arg(10000)      /* 10k */                                                                  \
-    ->Arg(100000)     /* 100k */                                                                 \
-    ->Arg(1000000)    /* 1M */                                                                   \
-    ->Arg(10000000)   /* 10M */                                                                  \
-    ->Arg(100000000); /* 100M */
+  set_throughputs(state);
+}
 
-#define REDUCE_BENCHMARK_DEFINE(type, aggregation) \
-  RBM_BENCHMARK_DEFINE(concat(type, _, aggregation), type, aggregation)
+using Types = nvbench::type_list<bool, int8_t, int32_t, float, cudf::timestamp_ms>;
 
-REDUCE_BENCHMARK_DEFINE(bool, minmax);
-REDUCE_BENCHMARK_DEFINE(int8_t, minmax);
-REDUCE_BENCHMARK_DEFINE(int32_t, minmax);
-using cudf::timestamp_ms;
-REDUCE_BENCHMARK_DEFINE(timestamp_ms, minmax);
-REDUCE_BENCHMARK_DEFINE(float, minmax);
+NVBENCH_BENCH_TYPES(reduction_minmax, NVBENCH_TYPE_AXES(Types))
+  .set_name("minmax")
+  .set_type_axes_names({"DataType"})
+  .add_int64_axis("size", {100'000, 1'000'000, 10'000'000, 100'000'000});
diff --git a/cpp/benchmarks/reduction/reduce.cpp b/cpp/benchmarks/reduction/reduce.cpp
index 63c96f4fe9e..14bf90c4943 100644
--- a/cpp/benchmarks/reduction/reduce.cpp
+++ b/cpp/benchmarks/reduction/reduce.cpp
@@ -16,82 +16,80 @@
 
 #include <benchmarks/common/benchmark_utilities.hpp>
 #include <benchmarks/common/generate_input.hpp>
-#include <benchmarks/common/table_utilities.hpp>
-#include <benchmarks/fixture/benchmark_fixture.hpp>
-#include <benchmarks/synchronization/synchronization.hpp>
+#include <benchmarks/common/nvbench_utilities.hpp>
 
 #include <cudf/column/column_view.hpp>
 #include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/reduction.hpp>
 #include <cudf/types.hpp>
 
+#include <nvbench/nvbench.cuh>
+
 #include <memory>
 
-class Reduction : public cudf::benchmark {};
+template <cudf::reduce_aggregation::Kind kind>
+static std::unique_ptr<cudf::reduce_aggregation> make_reduce_aggregation()
+{
+  switch (kind) {
+    case cudf::reduce_aggregation::MIN:
+      return cudf::make_min_aggregation<cudf::reduce_aggregation>();
+    case cudf::reduce_aggregation::SUM:
+      return cudf::make_sum_aggregation<cudf::reduce_aggregation>();
+    case cudf::reduce_aggregation::MEAN:
+      return cudf::make_mean_aggregation<cudf::reduce_aggregation>();
+    case cudf::reduce_aggregation::PRODUCT:
+      return cudf::make_product_aggregation<cudf::reduce_aggregation>();
+    case cudf::reduce_aggregation::VARIANCE:
+      return cudf::make_variance_aggregation<cudf::reduce_aggregation>();
+    case cudf::reduce_aggregation::STD:
+      return cudf::make_std_aggregation<cudf::reduce_aggregation>();
+    default: CUDF_FAIL("Unsupported reduce aggregation in this benchmark");
+  }
+}
 
-template <typename type>
-void BM_reduction(benchmark::State& state, std::unique_ptr<cudf::reduce_aggregation> const& agg)
+template <typename DataType, cudf::reduce_aggregation::Kind kind>
+static void reduction(nvbench::state& state, nvbench::type_list<DataType, nvbench::enum_type<kind>>)
 {
-  cudf::size_type const column_size{(cudf::size_type)state.range(0)};
-  auto const dtype = cudf::type_to_id<type>();
+  auto const size = static_cast<cudf::size_type>(state.get_int64("size"));
+  if (cudf::is_chrono<DataType>() && kind != cudf::aggregation::MIN) {
+    state.skip("Skip chrono types for some aggregations");
+  }
+
+  auto const input_type = cudf::type_to_id<DataType>();
   data_profile const profile =
-    data_profile_builder().no_validity().distribution(dtype, distribution_id::UNIFORM, 0, 100);
-  auto const input_column = create_random_column(dtype, row_count{column_size}, profile);
+    data_profile_builder().no_validity().distribution(input_type, distribution_id::UNIFORM, 0, 100);
+  auto const input_column = create_random_column(input_type, row_count{size}, profile);
 
-  cudf::data_type output_dtype =
-    (agg->kind == cudf::aggregation::MEAN || agg->kind == cudf::aggregation::VARIANCE ||
-     agg->kind == cudf::aggregation::STD)
+  cudf::data_type output_type =
+    (kind == cudf::aggregation::MEAN || kind == cudf::aggregation::VARIANCE ||
+     kind == cudf::aggregation::STD)
       ? cudf::data_type{cudf::type_id::FLOAT64}
       : input_column->type();
 
-  for (auto _ : state) {
-    cuda_event_timer timer(state, true);
-    auto result = cudf::reduce(*input_column, *agg, output_dtype);
-  }
+  auto agg = make_reduce_aggregation<kind>();
 
-  // The benchmark takes a column and produces two scalars.
-  set_items_processed(state, column_size + 1);
-  set_bytes_processed(state, estimate_size(input_column->view()) + cudf::size_of(output_dtype));
-}
+  auto stream = cudf::get_default_stream();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
+  state.add_element_count(size);
+  state.add_global_memory_reads<DataType>(size);
+  state.add_global_memory_writes<DataType>(1);
 
-#define concat(a, b, c) a##b##c
-#define get_agg(op)     concat(cudf::make_, op, _aggregation<cudf::reduce_aggregation>())
+  state.exec(nvbench::exec_tag::sync, [&input_column, output_type, &agg](nvbench::launch& launch) {
+    cudf::reduce(*input_column, *agg, output_type);
+  });
 
-// TYPE, OP
-#define RBM_BENCHMARK_DEFINE(name, type, aggregation)             \
-  BENCHMARK_DEFINE_F(Reduction, name)(::benchmark::State & state) \
-  {                                                               \
-    BM_reduction<type>(state, get_agg(aggregation));              \
-  }                                                               \
-  BENCHMARK_REGISTER_F(Reduction, name)                           \
-    ->UseManualTime()                                             \
-    ->Arg(10000)      /* 10k */                                   \
-    ->Arg(100000)     /* 100k */                                  \
-    ->Arg(1000000)    /* 1M */                                    \
-    ->Arg(10000000)   /* 10M */                                   \
-    ->Arg(100000000); /* 100M */
-
-#define REDUCE_BENCHMARK_DEFINE(type, aggregation) \
-  RBM_BENCHMARK_DEFINE(concat(type, _, aggregation), type, aggregation)
+  set_throughputs(state);
+}
 
-#define REDUCE_BENCHMARK_NUMERIC(aggregation)    \
-  REDUCE_BENCHMARK_DEFINE(bool, aggregation);    \
-  REDUCE_BENCHMARK_DEFINE(int8_t, aggregation);  \
-  REDUCE_BENCHMARK_DEFINE(int32_t, aggregation); \
-  REDUCE_BENCHMARK_DEFINE(int64_t, aggregation); \
-  REDUCE_BENCHMARK_DEFINE(float, aggregation);   \
-  REDUCE_BENCHMARK_DEFINE(double, aggregation);
+using Types    = nvbench::type_list<int32_t, int64_t, double, cudf::timestamp_ms>;
+using AggKinds = nvbench::enum_type_list<cudf::reduce_aggregation::MIN,
+                                         cudf::reduce_aggregation::SUM,
+                                         cudf::reduce_aggregation::PRODUCT,
+                                         cudf::reduce_aggregation::VARIANCE,
+                                         cudf::reduce_aggregation::STD,
+                                         cudf::reduce_aggregation::MEAN>;
 
-REDUCE_BENCHMARK_NUMERIC(sum);
-REDUCE_BENCHMARK_DEFINE(int32_t, product);
-REDUCE_BENCHMARK_DEFINE(float, product);
-REDUCE_BENCHMARK_DEFINE(int64_t, min);
-REDUCE_BENCHMARK_DEFINE(double, min);
-using cudf::timestamp_ms;
-REDUCE_BENCHMARK_DEFINE(timestamp_ms, min);
-REDUCE_BENCHMARK_DEFINE(int8_t, mean);
-REDUCE_BENCHMARK_DEFINE(float, mean);
-REDUCE_BENCHMARK_DEFINE(int32_t, variance);
-REDUCE_BENCHMARK_DEFINE(double, variance);
-REDUCE_BENCHMARK_DEFINE(int64_t, std);
-REDUCE_BENCHMARK_DEFINE(float, std);
+NVBENCH_BENCH_TYPES(reduction, NVBENCH_TYPE_AXES(Types, AggKinds))
+  .set_name("reduction")
+  .set_type_axes_names({"DataType", "AggKinds"})
+  .add_int64_axis("size", {100'000, 1'000'000, 10'000'000, 100'000'000});
diff --git a/cpp/benchmarks/reduction/scan.cpp b/cpp/benchmarks/reduction/scan.cpp
index dc05aad9807..f3d67a79498 100644
--- a/cpp/benchmarks/reduction/scan.cpp
+++ b/cpp/benchmarks/reduction/scan.cpp
@@ -16,9 +16,7 @@
 
 #include <benchmarks/common/benchmark_utilities.hpp>
 #include <benchmarks/common/generate_input.hpp>
-#include <benchmarks/common/table_utilities.hpp>
-#include <benchmarks/fixture/benchmark_fixture.hpp>
-#include <benchmarks/synchronization/synchronization.hpp>
+#include <benchmarks/common/nvbench_utilities.hpp>
 
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_view.hpp>
@@ -26,43 +24,38 @@
 #include <cudf/table/table.hpp>
 #include <cudf/types.hpp>
 
-class ReductionScan : public cudf::benchmark {};
+#include <nvbench/nvbench.cuh>
 
-template <typename type>
-static void BM_reduction_scan(benchmark::State& state, bool include_nulls)
+template <typename DataType>
+static void reduction_scan(nvbench::state& state, nvbench::type_list<DataType>)
 {
-  cudf::size_type const n_rows{(cudf::size_type)state.range(0)};
-  auto const dtype  = cudf::type_to_id<type>();
-  auto const column = create_random_column(dtype, row_count{n_rows});
-  if (!include_nulls) column->set_null_mask(rmm::device_buffer{}, 0);
+  auto const size       = static_cast<cudf::size_type>(state.get_int64("size"));
+  auto const nulls      = state.get_float64("nulls");
+  auto const input_type = cudf::type_to_id<DataType>();
 
-  std::unique_ptr<cudf::column> result = nullptr;
-  for (auto _ : state) {
-    cuda_event_timer timer(state, true);
-    result = cudf::scan(
-      *column, *cudf::make_min_aggregation<cudf::scan_aggregation>(), cudf::scan_type::INCLUSIVE);
-  }
+  data_profile const profile = data_profile_builder().null_probability(nulls).distribution(
+    input_type, distribution_id::UNIFORM, 0, 100);
+  auto const input_column = create_random_column(input_type, row_count{size}, profile);
 
-  // The benchmark takes a column and produces a new column of the same size as input.
-  set_items_processed(state, n_rows * 2);
-  set_bytes_processed(state, estimate_size(column->view()) + estimate_size(result->view()));
+  auto agg = cudf::make_min_aggregation<cudf::scan_aggregation>();
+
+  auto stream = cudf::get_default_stream();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
+  state.add_element_count(size);
+  state.add_global_memory_reads<DataType>(size);
+  state.add_global_memory_writes<DataType>(1);
+
+  state.exec(nvbench::exec_tag::sync, [&input_column, &agg](nvbench::launch& launch) {
+    cudf::scan(*input_column, *agg, cudf::scan_type::INCLUSIVE);
+  });
+
+  set_throughputs(state);
 }
 
-#define SCAN_BENCHMARK_DEFINE(name, type, nulls)                          \
-  BENCHMARK_DEFINE_F(ReductionScan, name)                                 \
-  (::benchmark::State & state) { BM_reduction_scan<type>(state, nulls); } \
-  BENCHMARK_REGISTER_F(ReductionScan, name)                               \
-    ->UseManualTime()                                                     \
-    ->Arg(10000)      /* 10k */                                           \
-    ->Arg(100000)     /* 100k */                                          \
-    ->Arg(1000000)    /* 1M */                                            \
-    ->Arg(10000000)   /* 10M */                                           \
-    ->Arg(100000000); /* 100M */
+using Types = nvbench::type_list<int8_t, int32_t, uint64_t, float, int16_t, uint32_t, double>;
 
-SCAN_BENCHMARK_DEFINE(int8_no_nulls, int8_t, false);
-SCAN_BENCHMARK_DEFINE(int32_no_nulls, int32_t, false);
-SCAN_BENCHMARK_DEFINE(uint64_no_nulls, uint64_t, false);
-SCAN_BENCHMARK_DEFINE(float_no_nulls, float, false);
-SCAN_BENCHMARK_DEFINE(int16_nulls, int16_t, true);
-SCAN_BENCHMARK_DEFINE(uint32_nulls, uint32_t, true);
-SCAN_BENCHMARK_DEFINE(double_nulls, double, true);
+NVBENCH_BENCH_TYPES(reduction_scan, NVBENCH_TYPE_AXES(Types))
+  .set_name("scan")
+  .set_type_axes_names({"DataType"})
+  .add_float64_axis("nulls", {0.0, 0.1})
+  .add_int64_axis("size", {100'000, 1'000'000, 10'000'000, 100'000'000});

From 6a2f323ac2c53b32d8a1d47b36dd0d0786027a7c Mon Sep 17 00:00:00 2001
From: Nghia Truong <7416935+ttnghia@users.noreply.github.com>
Date: Wed, 21 Aug 2024 07:35:44 -0700
Subject: [PATCH 705/842] Fix function parameters with common dependency
 modified during their evaluation (#16620)

This fixes an issue in JNI C++ code. In particular, during a function call, the two passing parameters are evaluated using an index value, but that index is modified during evaluating one of the parameters, leading to out-of-bound access when evaluating the other.

Authors:
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - Jason Lowe (https://github.com/jlowe)

URL: https://github.com/rapidsai/cudf/pull/16620
---
 java/src/main/native/src/TableJni.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
index 76ca8c533ce..ecc551f1143 100644
--- a/java/src/main/native/src/TableJni.cpp
+++ b/java/src/main/native/src/TableJni.cpp
@@ -1037,9 +1037,9 @@ cudf::io::schema_element read_schema_element(int& index,
     // go to the next entry, so recursion can parse it.
     index++;
     for (int i = 0; i < num_children; i++) {
+      auto const name = std::string{names.get(index).get()};
       child_elems.insert(
-        std::pair{names.get(index).get(),
-                  cudf::jni::read_schema_element(index, children, names, types, scales)});
+        std::pair{name, cudf::jni::read_schema_element(index, children, names, types, scales)});
     }
     return cudf::io::schema_element{d_type, std::move(child_elems)};
   } else {
@@ -1830,9 +1830,9 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env,
       std::map<std::string, cudf::io::schema_element> data_types;
       int at = 0;
       while (at < n_types.size()) {
+        auto const name = std::string{n_col_names.get(at).get()};
         data_types.insert(std::pair{
-          n_col_names.get(at).get(),
-          cudf::jni::read_schema_element(at, n_children, n_col_names, n_types, n_scales)});
+          name, cudf::jni::read_schema_element(at, n_children, n_col_names, n_types, n_scales)});
       }
       opts.dtypes(data_types);
     } else {
@@ -1929,9 +1929,9 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env,
       std::map<std::string, cudf::io::schema_element> data_types;
       int at = 0;
       while (at < n_types.size()) {
+        auto const name = std::string{n_col_names.get(at).get()};
         data_types.insert(std::pair{
-          n_col_names.get(at).get(),
-          cudf::jni::read_schema_element(at, n_children, n_col_names, n_types, n_scales)});
+          name, cudf::jni::read_schema_element(at, n_children, n_col_names, n_types, n_scales)});
       }
       opts.dtypes(data_types);
     } else {

From bf2ee328f99cae51c8bdbc240e0ceedb102c24ca Mon Sep 17 00:00:00 2001
From: Thomas Li <47963215+lithomas1@users.noreply.github.com>
Date: Wed, 21 Aug 2024 14:47:11 -0700
Subject: [PATCH 706/842] DOC: Refresh pylibcudf guide (#15856)

This PR updates the pylibcudf dev guide with some more recent recommendations.

Authors:
  - Thomas Li (https://github.com/lithomas1)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/15856
---
 docs/cudf/source/developer_guide/pylibcudf.md | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/docs/cudf/source/developer_guide/pylibcudf.md b/docs/cudf/source/developer_guide/pylibcudf.md
index 2ae545a4955..4e10459fe2b 100644
--- a/docs/cudf/source/developer_guide/pylibcudf.md
+++ b/docs/cudf/source/developer_guide/pylibcudf.md
@@ -13,10 +13,8 @@ To satisfy the goals of pylibcudf, we impose the following set of design princip
 - Every public function or method should be `cpdef`ed. This allows it to be used in both Cython and Python code. This incurs some slight overhead over `cdef` functions, but we assume that this is acceptable because 1) the vast majority of users will be using pure Python rather than Cython, and 2) the overhead of a `cpdef` function over a `cdef` function is on the order of a nanosecond, while CUDA kernel launch overhead is on the order of a microsecond, so these function overheads should be washed out by typical usage of pylibcudf.
 - Every variable used should be strongly typed and either be a primitive type (int, float, etc) or a cdef class. Any enums in C++ should be mirrored using `cpdef enum`, which will create both a C-style enum in Cython and a PEP 435-style Python enum that will automatically be used in Python.
 - All typing in code should be written using Cython syntax, not PEP 484 Python typing syntax. Not only does this ensure compatibility with Cython < 3, but even with Cython 3 PEP 484 support remains incomplete as of this writing.
-- All cudf code should interact only with pylibcudf, never with libcudf directly.
-- All imports should be relative so that pylibcudf can be easily extracted from cudf later
-  - Exception: All imports of libcudf API bindings in `cudf._lib.cpp` should use absolute imports of `cudf._lib.cpp as libcudf`. We should convert the `cpp` directory into a proper package so that it can be imported as `libcudf` in that fashion. When moving pylibcudf into a separate package, it will be renamed to `libcudf` and only the imports will need to change.
-- Ideally, pylibcudf should depend on nothing other than rmm and pyarrow. This will allow it to be extracted into a a largely standalone library and used in environments where the larger dependency tree of cudf may be cumbersome.
+- All cudf code should interact only with pylibcudf, never with libcudf directly. This is not currently the case, but is the direction that the library is moving towards.
+- Ideally, pylibcudf should depend on no RAPIDS component other than rmm, and should in general have minimal runtime dependencies.
 
 
 ## Relationship to libcudf
@@ -112,6 +110,9 @@ Then, a corresponding pylibcudf fixture may be created using a simple `from_arro
 This approach ensures consistent global coverage across types for various tests.
 
 In general, pylibcudf tests should prefer validating against a corresponding pyarrow implementation rather than hardcoding data.
+If there is no pyarrow implementation, another alternative is to write a pure Python implementation that loops over the values
+of the Table/Column, if a scalar Python equivalent of the pylibcudf implementation exists (this is especially relevant for string methods).
+
 This approach is more resilient to changes to input data, particularly given the fixture strategy outlined above.
 Standard tools for comparing between pylibcudf and pyarrow types are provided in the utils module.
 
@@ -242,3 +243,8 @@ cpdef ColumnOrTable empty_like(ColumnOrTable input)
 
 [Cython supports specializing the contents of fused-type functions based on the argument types](https://cython.readthedocs.io/en/latest/src/userguide/fusedtypes.html#type-checking-specializations), so any type-specific logic may be encoded using the appropriate conditionals.
 See the pylibcudf source for examples of how to implement such functions.
+
+In the event that libcudf provides multiple overloads for the same function with differing numbers of arguments, specify the maximum number of arguments in the Cython definition,
+and set arguments not shared between overloads to `None`. If a user tries to pass in an unsupported argument for a specific overload type, you should raise `ValueError`.
+
+Finally, consider making an libcudf issue if you think this inconsistency can be addressed on the libcudf side.

From 6c4905da22ad5b3d5007f45f38a3fa8449f7f8e1 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Wed, 21 Aug 2024 21:03:12 -0700
Subject: [PATCH 707/842] Remove legacy Arrow interop APIs (#16590)

Contributes to #15193.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)
  - Robert (Bobby) Evans (https://github.com/revans2)
  - Bradley Dice (https://github.com/bdice)
  - David Wendt (https://github.com/davidwendt)
  - Yunsong Wang (https://github.com/PointKernel)

URL: https://github.com/rapidsai/cudf/pull/16590
---
 cpp/CMakeLists.txt                           |   3 -
 cpp/include/cudf/detail/interop.hpp          | 101 +---
 cpp/include/cudf/interop.hpp                 | 101 ----
 cpp/src/interop/detail/arrow_allocator.cpp   |  83 ---
 cpp/src/interop/detail/arrow_allocator.hpp   |  31 --
 cpp/src/interop/from_arrow.cu                | 524 -------------------
 cpp/src/interop/to_arrow.cu                  | 490 -----------------
 cpp/tests/interop/arrow_utils.hpp            |  64 ++-
 java/src/main/native/src/ColumnVectorJni.cpp |  76 ++-
 java/src/main/native/src/TableJni.cpp        |  35 +-
 10 files changed, 167 insertions(+), 1341 deletions(-)
 delete mode 100644 cpp/src/interop/detail/arrow_allocator.cpp
 delete mode 100644 cpp/src/interop/detail/arrow_allocator.hpp
 delete mode 100644 cpp/src/interop/from_arrow.cu
 delete mode 100644 cpp/src/interop/to_arrow.cu

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index ff00c484501..6b8bb26825b 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -363,17 +363,14 @@ add_library(
   src/hash/sha512_hash.cu
   src/hash/xxhash_64.cu
   src/interop/dlpack.cpp
-  src/interop/from_arrow.cu
   src/interop/arrow_utilities.cpp
   src/interop/decimal_conversion_utilities.cu
-  src/interop/to_arrow.cu
   src/interop/to_arrow_device.cu
   src/interop/to_arrow_host.cu
   src/interop/from_arrow_device.cu
   src/interop/from_arrow_host.cu
   src/interop/from_arrow_stream.cu
   src/interop/to_arrow_schema.cpp
-  src/interop/detail/arrow_allocator.cpp
   src/io/avro/avro.cpp
   src/io/avro/avro_gpu.cu
   src/io/avro/reader_impl.cu
diff --git a/cpp/include/cudf/detail/interop.hpp b/cpp/include/cudf/detail/interop.hpp
index 0b9319ba663..0d8f078c9d1 100644
--- a/cpp/include/cudf/detail/interop.hpp
+++ b/cpp/include/cudf/detail/interop.hpp
@@ -16,29 +16,13 @@
 
 #pragma once
 
-// We disable warning 611 because the `arrow::TableBatchReader` only partially
-// override the `ReadNext` method of `arrow::RecordBatchReader::ReadNext`
-// triggering warning 611-D from nvcc.
-#ifdef __CUDACC__
-#pragma nv_diag_suppress 611
-#pragma nv_diag_suppress 2810
-#endif
-#include <rmm/resource_ref.hpp>
-
-#include <arrow/api.h>
-#ifdef __CUDACC__
-#pragma nv_diag_default 611
-#pragma nv_diag_default 2810
-#endif
-
 #include <cudf/interop.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/export.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-
-#include <string>
+#include <rmm/resource_ref.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace detail {
@@ -61,89 +45,6 @@ DLManagedTensor* to_dlpack(table_view const& input,
                            rmm::cuda_stream_view stream,
                            rmm::device_async_resource_ref mr);
 
-// Creating arrow as per given type_id and buffer arguments
-template <typename... Ts>
-std::shared_ptr<arrow::Array> to_arrow_array(cudf::type_id id, Ts&&... args)
-{
-  switch (id) {
-    case type_id::BOOL8: return std::make_shared<arrow::BooleanArray>(std::forward<Ts>(args)...);
-    case type_id::INT8: return std::make_shared<arrow::Int8Array>(std::forward<Ts>(args)...);
-    case type_id::INT16: return std::make_shared<arrow::Int16Array>(std::forward<Ts>(args)...);
-    case type_id::INT32: return std::make_shared<arrow::Int32Array>(std::forward<Ts>(args)...);
-    case type_id::INT64: return std::make_shared<arrow::Int64Array>(std::forward<Ts>(args)...);
-    case type_id::UINT8: return std::make_shared<arrow::UInt8Array>(std::forward<Ts>(args)...);
-    case type_id::UINT16: return std::make_shared<arrow::UInt16Array>(std::forward<Ts>(args)...);
-    case type_id::UINT32: return std::make_shared<arrow::UInt32Array>(std::forward<Ts>(args)...);
-    case type_id::UINT64: return std::make_shared<arrow::UInt64Array>(std::forward<Ts>(args)...);
-    case type_id::FLOAT32: return std::make_shared<arrow::FloatArray>(std::forward<Ts>(args)...);
-    case type_id::FLOAT64: return std::make_shared<arrow::DoubleArray>(std::forward<Ts>(args)...);
-    case type_id::TIMESTAMP_DAYS:
-      return std::make_shared<arrow::Date32Array>(std::make_shared<arrow::Date32Type>(),
-                                                  std::forward<Ts>(args)...);
-    case type_id::TIMESTAMP_SECONDS:
-      return std::make_shared<arrow::TimestampArray>(arrow::timestamp(arrow::TimeUnit::SECOND),
-                                                     std::forward<Ts>(args)...);
-    case type_id::TIMESTAMP_MILLISECONDS:
-      return std::make_shared<arrow::TimestampArray>(arrow::timestamp(arrow::TimeUnit::MILLI),
-                                                     std::forward<Ts>(args)...);
-    case type_id::TIMESTAMP_MICROSECONDS:
-      return std::make_shared<arrow::TimestampArray>(arrow::timestamp(arrow::TimeUnit::MICRO),
-                                                     std::forward<Ts>(args)...);
-    case type_id::TIMESTAMP_NANOSECONDS:
-      return std::make_shared<arrow::TimestampArray>(arrow::timestamp(arrow::TimeUnit::NANO),
-                                                     std::forward<Ts>(args)...);
-    case type_id::DURATION_SECONDS:
-      return std::make_shared<arrow::DurationArray>(arrow::duration(arrow::TimeUnit::SECOND),
-                                                    std::forward<Ts>(args)...);
-    case type_id::DURATION_MILLISECONDS:
-      return std::make_shared<arrow::DurationArray>(arrow::duration(arrow::TimeUnit::MILLI),
-                                                    std::forward<Ts>(args)...);
-    case type_id::DURATION_MICROSECONDS:
-      return std::make_shared<arrow::DurationArray>(arrow::duration(arrow::TimeUnit::MICRO),
-                                                    std::forward<Ts>(args)...);
-    case type_id::DURATION_NANOSECONDS:
-      return std::make_shared<arrow::DurationArray>(arrow::duration(arrow::TimeUnit::NANO),
-                                                    std::forward<Ts>(args)...);
-    default: CUDF_FAIL("Unsupported type_id conversion to arrow");
-  }
-}
-
-// Converting arrow type to cudf type
-data_type arrow_to_cudf_type(arrow::DataType const& arrow_type);
-
-/**
- * @copydoc cudf::to_arrow(table_view input, std::vector<column_metadata> const& metadata,
- * rmm::cuda_stream_view stream, arrow::MemoryPool* ar_mr)
- */
-std::shared_ptr<arrow::Table> to_arrow(table_view input,
-                                       std::vector<column_metadata> const& metadata,
-                                       rmm::cuda_stream_view stream,
-                                       arrow::MemoryPool* ar_mr);
-
-/**
- * @copydoc cudf::to_arrow(cudf::scalar const& input, column_metadata const& metadata,
- * rmm::cuda_stream_view stream, arrow::MemoryPool* ar_mr)
- */
-std::shared_ptr<arrow::Scalar> to_arrow(cudf::scalar const& input,
-                                        column_metadata const& metadata,
-                                        rmm::cuda_stream_view stream,
-                                        arrow::MemoryPool* ar_mr);
-/**
- * @copydoc cudf::from_arrow(arrow::Table const& input_table, rmm::cuda_stream_view stream,
- * rmm::device_async_resource_ref mr)
- */
-std::unique_ptr<table> from_arrow(arrow::Table const& input_table,
-                                  rmm::cuda_stream_view stream,
-                                  rmm::device_async_resource_ref mr);
-
-/**
- * @copydoc cudf::from_arrow(arrow::Scalar const& input, rmm::cuda_stream_view stream,
- * rmm::device_async_resource_ref mr)
- */
-std::unique_ptr<cudf::scalar> from_arrow(arrow::Scalar const& input,
-                                         rmm::cuda_stream_view stream,
-                                         rmm::device_async_resource_ref mr);
-
 /**
  * @brief Return a maximum precision for a given type.
  *
diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp
index 9a8f87b4a46..0f52b0f7b31 100644
--- a/cpp/include/cudf/interop.hpp
+++ b/cpp/include/cudf/interop.hpp
@@ -16,21 +16,6 @@
 
 #pragma once
 
-// We disable warning 611 because the `arrow::TableBatchReader` only partially
-// override the `ReadNext` method of `arrow::RecordBatchReader::ReadNext`
-// triggering warning 611-D from nvcc.
-#ifdef __CUDACC__
-#pragma nv_diag_suppress 611
-#pragma nv_diag_suppress 2810
-#endif
-#include <rmm/resource_ref.hpp>
-
-#include <arrow/api.h>
-#ifdef __CUDACC__
-#pragma nv_diag_default 611
-#pragma nv_diag_default 2810
-#endif
-
 #include <cudf/column/column.hpp>
 #include <cudf/detail/transform.hpp>
 #include <cudf/table/table.hpp>
@@ -131,59 +116,6 @@ struct column_metadata {
   column_metadata() = default;
 };
 
-/**
- * @brief Create `arrow::Table` from cudf table `input`
- *
- * Converts the `cudf::table_view` to `arrow::Table` with the provided
- * metadata `column_names`.
- *
- * @deprecated Since 24.08. Use cudf::to_arrow_host instead.
- *
- * @throws cudf::logic_error if `column_names` size doesn't match with number of columns.
- *
- * @param input table_view that needs to be converted to arrow Table
- * @param metadata Contains hierarchy of names of columns and children
- * @param stream CUDA stream used for device memory operations and kernel launches
- * @param ar_mr arrow memory pool to allocate memory for arrow Table
- * @return arrow Table generated from `input`
- *
- * @note For decimals, since the precision is not stored for them in libcudf,
- * it will be converted to an Arrow decimal128 that has the widest-precision the cudf decimal type
- * supports. For example, numeric::decimal32 will be converted to Arrow decimal128 of the precision
- * 9 which is the maximum precision for 32-bit types. Similarly, numeric::decimal128 will be
- * converted to Arrow decimal128 of the precision 38.
- */
-[[deprecated("Use cudf::to_arrow_host")]] std::shared_ptr<arrow::Table> to_arrow(
-  table_view input,
-  std::vector<column_metadata> const& metadata = {},
-  rmm::cuda_stream_view stream                 = cudf::get_default_stream(),
-  arrow::MemoryPool* ar_mr                     = arrow::default_memory_pool());
-
-/**
- * @brief Create `arrow::Scalar` from cudf scalar `input`
- *
- * Converts the `cudf::scalar` to `arrow::Scalar`.
- *
- * @deprecated Since 24.08.
- *
- * @param input scalar that needs to be converted to arrow Scalar
- * @param metadata Contains hierarchy of names of columns and children
- * @param stream CUDA stream used for device memory operations and kernel launches
- * @param ar_mr arrow memory pool to allocate memory for arrow Scalar
- * @return arrow Scalar generated from `input`
- *
- * @note For decimals, since the precision is not stored for them in libcudf,
- * it will be converted to an Arrow decimal128 that has the widest-precision the cudf decimal type
- * supports. For example, numeric::decimal32 will be converted to Arrow decimal128 of the precision
- * 9 which is the maximum precision for 32-bit types. Similarly, numeric::decimal128 will be
- * converted to Arrow decimal128 of the precision 38.
- */
-[[deprecated("Use cudf::to_arrow_host")]] std::shared_ptr<arrow::Scalar> to_arrow(
-  cudf::scalar const& input,
-  column_metadata const& metadata = {},
-  rmm::cuda_stream_view stream    = cudf::get_default_stream(),
-  arrow::MemoryPool* ar_mr        = arrow::default_memory_pool());
-
 /**
  * @brief typedef for a unique_ptr to an ArrowSchema with custom deleter
  *
@@ -386,39 +318,6 @@ unique_device_array_t to_arrow_host(
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
 
-/**
- * @brief Create `cudf::table` from given arrow Table input
- *
- * @deprecated Since 24.08. Use cudf::from_arrow_host instead.
- *
- * @param input arrow:Table that needs to be converted to `cudf::table`
- * @param stream CUDA stream used for device memory operations and kernel launches
- * @param mr    Device memory resource used to allocate `cudf::table`
- * @return cudf table generated from given arrow Table
- */
-[[deprecated("Use cudf::from_arrow_host")]] std::unique_ptr<table> from_arrow(
-  arrow::Table const& input,
-  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
-
-/**
- * @brief Create `cudf::scalar` from given arrow Scalar input
- *
- * @deprecated Since 24.08. Use arrow's `MakeArrayFromScalar` on the
- * input, followed by `ExportArray` to obtain something that can be
- * consumed by `from_arrow_host`. Then use `cudf::get_element` to
- * extract a device scalar from the column.
- *
- * @param input `arrow::Scalar` that needs to be converted to `cudf::scalar`
- * @param stream CUDA stream used for device memory operations and kernel launches
- * @param mr    Device memory resource used to allocate `cudf::scalar`
- * @return cudf scalar generated from given arrow Scalar
- */
-[[deprecated("See docstring for migration strategies")]] std::unique_ptr<cudf::scalar> from_arrow(
-  arrow::Scalar const& input,
-  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
-
 /**
  * @brief Create `cudf::table` from given ArrowArray and ArrowSchema input
  *
diff --git a/cpp/src/interop/detail/arrow_allocator.cpp b/cpp/src/interop/detail/arrow_allocator.cpp
deleted file mode 100644
index 2a19a5360fe..00000000000
--- a/cpp/src/interop/detail/arrow_allocator.cpp
+++ /dev/null
@@ -1,83 +0,0 @@
-/*
- * Copyright (c) 2021-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cudf/detail/interop.hpp>
-
-#include <sys/mman.h>
-#include <unistd.h>
-
-#include <memory>
-
-namespace cudf {
-namespace detail {
-
-/*
-  Enable Transparent Huge Pages (THP) for large (>4MB) allocations.
-  `buf` is returned untouched.
-  Enabling THP can improve performance of device-host memory transfers
-  significantly, see <https://github.com/rapidsai/cudf/pull/13914>.
-*/
-template <typename T>
-T enable_hugepage(T&& buf)
-{
-  if (buf->size() < (1u << 22u)) {  // Smaller than 4 MB
-    return std::move(buf);
-  }
-
-#ifdef MADV_HUGEPAGE
-  auto const pagesize = sysconf(_SC_PAGESIZE);
-  void* addr          = const_cast<uint8_t*>(buf->data());
-  if (addr == nullptr) { return std::move(buf); }
-  auto length{static_cast<std::size_t>(buf->size())};
-  if (std::align(pagesize, pagesize, addr, length)) {
-    // Intentionally not checking for errors that may be returned by older kernel versions;
-    // optimistically tries enabling huge pages.
-    madvise(addr, length, MADV_HUGEPAGE);
-  }
-#endif
-  return std::move(buf);
-}
-
-std::unique_ptr<arrow::Buffer> allocate_arrow_buffer(int64_t const size, arrow::MemoryPool* ar_mr)
-{
-  /*
-  nvcc 11.0 generates Internal Compiler Error during codegen when arrow::AllocateBuffer
-  and `ValueOrDie` are used inside a CUDA compilation unit.
-
-  To work around this issue we compile an allocation shim in C++ and use
-  that from our cuda sources
-  */
-  arrow::Result<std::unique_ptr<arrow::Buffer>> result = arrow::AllocateBuffer(size, ar_mr);
-  CUDF_EXPECTS(result.ok(), "Failed to allocate Arrow buffer");
-  return enable_hugepage(std::move(result).ValueOrDie());
-}
-
-std::shared_ptr<arrow::Buffer> allocate_arrow_bitmap(int64_t const size, arrow::MemoryPool* ar_mr)
-{
-  /*
-  nvcc 11.0 generates Internal Compiler Error during codegen when arrow::AllocateBuffer
-  and `ValueOrDie` are used inside a CUDA compilation unit.
-
-  To work around this issue we compile an allocation shim in C++ and use
-  that from our cuda sources
-  */
-  arrow::Result<std::shared_ptr<arrow::Buffer>> result = arrow::AllocateBitmap(size, ar_mr);
-  CUDF_EXPECTS(result.ok(), "Failed to allocate Arrow bitmap");
-  return enable_hugepage(std::move(result).ValueOrDie());
-}
-
-}  // namespace detail
-}  // namespace cudf
diff --git a/cpp/src/interop/detail/arrow_allocator.hpp b/cpp/src/interop/detail/arrow_allocator.hpp
deleted file mode 100644
index 75c1baa0dca..00000000000
--- a/cpp/src/interop/detail/arrow_allocator.hpp
+++ /dev/null
@@ -1,31 +0,0 @@
-/*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <cudf/detail/interop.hpp>
-
-namespace cudf {
-namespace detail {
-
-// unique_ptr because that is what AllocateBuffer returns
-std::unique_ptr<arrow::Buffer> allocate_arrow_buffer(int64_t const size, arrow::MemoryPool* ar_mr);
-
-// shared_ptr because that is what AllocateBitmap returns
-std::shared_ptr<arrow::Buffer> allocate_arrow_bitmap(int64_t const size, arrow::MemoryPool* ar_mr);
-
-}  // namespace detail
-}  // namespace cudf
diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu
deleted file mode 100644
index 579820cbae3..00000000000
--- a/cpp/src/interop/from_arrow.cu
+++ /dev/null
@@ -1,524 +0,0 @@
-/*
- * Copyright (c) 2020-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#include <cudf/column/column_factories.hpp>
-#include <cudf/column/column_view.hpp>
-#include <cudf/detail/concatenate.hpp>
-#include <cudf/detail/copy.hpp>
-#include <cudf/detail/interop.hpp>
-#include <cudf/detail/iterator.cuh>
-#include <cudf/detail/null_mask.hpp>
-#include <cudf/detail/nvtx/ranges.hpp>
-#include <cudf/detail/transform.hpp>
-#include <cudf/detail/unary.hpp>
-#include <cudf/dictionary/dictionary_factories.hpp>
-#include <cudf/interop.hpp>
-#include <cudf/null_mask.hpp>
-#include <cudf/table/table_view.hpp>
-#include <cudf/types.hpp>
-#include <cudf/utilities/default_stream.hpp>
-#include <cudf/utilities/traits.hpp>
-#include <cudf/utilities/type_dispatcher.hpp>
-
-#include <rmm/cuda_stream_view.hpp>
-#include <rmm/device_buffer.hpp>
-#include <rmm/resource_ref.hpp>
-
-#include <thrust/gather.h>
-
-namespace cudf {
-
-namespace detail {
-data_type arrow_to_cudf_type(arrow::DataType const& arrow_type)
-{
-  switch (arrow_type.id()) {
-    case arrow::Type::NA: return data_type(type_id::EMPTY);
-    case arrow::Type::BOOL: return data_type(type_id::BOOL8);
-    case arrow::Type::INT8: return data_type(type_id::INT8);
-    case arrow::Type::INT16: return data_type(type_id::INT16);
-    case arrow::Type::INT32: return data_type(type_id::INT32);
-    case arrow::Type::INT64: return data_type(type_id::INT64);
-    case arrow::Type::UINT8: return data_type(type_id::UINT8);
-    case arrow::Type::UINT16: return data_type(type_id::UINT16);
-    case arrow::Type::UINT32: return data_type(type_id::UINT32);
-    case arrow::Type::UINT64: return data_type(type_id::UINT64);
-    case arrow::Type::FLOAT: return data_type(type_id::FLOAT32);
-    case arrow::Type::DOUBLE: return data_type(type_id::FLOAT64);
-    case arrow::Type::DATE32: return data_type(type_id::TIMESTAMP_DAYS);
-    case arrow::Type::TIMESTAMP: {
-      auto type = static_cast<arrow::TimestampType const*>(&arrow_type);
-      switch (type->unit()) {
-        case arrow::TimeUnit::type::SECOND: return data_type(type_id::TIMESTAMP_SECONDS);
-        case arrow::TimeUnit::type::MILLI: return data_type(type_id::TIMESTAMP_MILLISECONDS);
-        case arrow::TimeUnit::type::MICRO: return data_type(type_id::TIMESTAMP_MICROSECONDS);
-        case arrow::TimeUnit::type::NANO: return data_type(type_id::TIMESTAMP_NANOSECONDS);
-        default: CUDF_FAIL("Unsupported timestamp unit in arrow");
-      }
-    }
-    case arrow::Type::DURATION: {
-      auto type = static_cast<arrow::DurationType const*>(&arrow_type);
-      switch (type->unit()) {
-        case arrow::TimeUnit::type::SECOND: return data_type(type_id::DURATION_SECONDS);
-        case arrow::TimeUnit::type::MILLI: return data_type(type_id::DURATION_MILLISECONDS);
-        case arrow::TimeUnit::type::MICRO: return data_type(type_id::DURATION_MICROSECONDS);
-        case arrow::TimeUnit::type::NANO: return data_type(type_id::DURATION_NANOSECONDS);
-        default: CUDF_FAIL("Unsupported duration unit in arrow");
-      }
-    }
-    case arrow::Type::STRING: return data_type(type_id::STRING);
-    case arrow::Type::LARGE_STRING: return data_type(type_id::STRING);
-    case arrow::Type::DICTIONARY: return data_type(type_id::DICTIONARY32);
-    case arrow::Type::LIST: return data_type(type_id::LIST);
-    case arrow::Type::DECIMAL: {
-      auto const type = static_cast<arrow::Decimal128Type const*>(&arrow_type);
-      return data_type{type_id::DECIMAL128, -type->scale()};
-    }
-    case arrow::Type::STRUCT: return data_type(type_id::STRUCT);
-    default: CUDF_FAIL("Unsupported type_id conversion to cudf");
-  }
-}
-
-namespace {
-/**
- * @brief Functor to return column for a corresponding arrow array. column
- * is formed from buffer underneath the arrow array along with any offset and
- * change in length that array has.
- */
-struct dispatch_to_cudf_column {
-  /**
-   * @brief Returns mask from an array without any offsets.
-   */
-  std::unique_ptr<rmm::device_buffer> get_mask_buffer(arrow::Array const& array,
-                                                      rmm::cuda_stream_view stream,
-                                                      rmm::device_async_resource_ref mr)
-  {
-    if (array.null_bitmap_data() == nullptr) {
-      return std::make_unique<rmm::device_buffer>(0, stream, mr);
-    }
-    auto const null_bitmap_size = array.null_bitmap()->size();
-    auto const allocation_size =
-      bitmask_allocation_size_bytes(static_cast<size_type>(null_bitmap_size * CHAR_BIT));
-    auto mask        = std::make_unique<rmm::device_buffer>(allocation_size, stream, mr);
-    auto mask_buffer = array.null_bitmap();
-    CUDF_CUDA_TRY(cudaMemcpyAsync(mask->data(),
-                                  reinterpret_cast<uint8_t const*>(mask_buffer->address()),
-                                  null_bitmap_size,
-                                  cudaMemcpyDefault,
-                                  stream.value()));
-    // Zero-initialize trailing padding bytes
-    auto const num_trailing_bytes = allocation_size - null_bitmap_size;
-    if (num_trailing_bytes > 0) {
-      auto trailing_bytes = static_cast<uint8_t*>(mask->data()) + null_bitmap_size;
-      CUDF_CUDA_TRY(cudaMemsetAsync(trailing_bytes, 0, num_trailing_bytes, stream.value()));
-    }
-    return mask;
-  }
-
-  template <typename T, CUDF_ENABLE_IF(not is_rep_layout_compatible<T>())>
-  std::unique_ptr<column> operator()(
-    arrow::Array const&, data_type, bool, rmm::cuda_stream_view, rmm::device_async_resource_ref)
-  {
-    CUDF_FAIL("Unsupported type in from_arrow.");
-  }
-
-  template <typename T, CUDF_ENABLE_IF(is_rep_layout_compatible<T>())>
-  std::unique_ptr<column> operator()(arrow::Array const& array,
-                                     data_type type,
-                                     bool skip_mask,
-                                     rmm::cuda_stream_view stream,
-                                     rmm::device_async_resource_ref mr)
-  {
-    auto data_buffer         = array.data()->buffers[1];
-    size_type const num_rows = array.length();
-    auto const has_nulls     = skip_mask ? false : array.null_bitmap_data() != nullptr;
-    auto col = make_fixed_width_column(type, num_rows, mask_state::UNALLOCATED, stream, mr);
-    auto mutable_column_view = col->mutable_view();
-    CUDF_CUDA_TRY(cudaMemcpyAsync(
-      mutable_column_view.data<T>(),
-      reinterpret_cast<uint8_t const*>(data_buffer->address()) + array.offset() * sizeof(T),
-      sizeof(T) * num_rows,
-      cudaMemcpyDefault,
-      stream.value()));
-    if (has_nulls) {
-      auto tmp_mask = get_mask_buffer(array, stream, mr);
-
-      // If array is sliced, we have to copy whole mask and then take copy.
-      auto out_mask = (num_rows == static_cast<size_type>(data_buffer->size() / sizeof(T)))
-                        ? std::move(*tmp_mask)
-                        : cudf::detail::copy_bitmask(static_cast<bitmask_type*>(tmp_mask->data()),
-                                                     array.offset(),
-                                                     array.offset() + num_rows,
-                                                     stream,
-                                                     mr);
-
-      col->set_null_mask(std::move(out_mask), array.null_count());
-    }
-
-    return col;
-  }
-};
-
-std::unique_ptr<column> get_empty_type_column(size_type size)
-{
-  // this abomination is required by cuDF Python, which needs to handle
-  // [PyArrow null arrays](https://arrow.apache.org/docs/python/generated/pyarrow.NullArray.html)
-  // of finite length
-  return std::make_unique<column>(
-    data_type(type_id::EMPTY), size, rmm::device_buffer{}, rmm::device_buffer{}, size);
-}
-
-/**
- * @brief Returns cudf column formed from given arrow array
- * This has been introduced to take care of compiler error "error: explicit specialization of
- * function must precede its first use"
- */
-std::unique_ptr<column> get_column(arrow::Array const& array,
-                                   data_type type,
-                                   bool skip_mask,
-                                   rmm::cuda_stream_view stream,
-                                   rmm::device_async_resource_ref mr);
-
-template <>
-std::unique_ptr<column> dispatch_to_cudf_column::operator()<numeric::decimal128>(
-  arrow::Array const& array,
-  data_type type,
-  bool skip_mask,
-  rmm::cuda_stream_view stream,
-  rmm::device_async_resource_ref mr)
-{
-  using DeviceType = __int128_t;
-
-  auto data_buffer    = array.data()->buffers[1];
-  auto const num_rows = static_cast<size_type>(array.length());
-  auto col = make_fixed_width_column(type, num_rows, mask_state::UNALLOCATED, stream, mr);
-  auto mutable_column_view = col->mutable_view();
-
-  CUDF_CUDA_TRY(cudaMemcpyAsync(
-    mutable_column_view.data<DeviceType>(),
-    reinterpret_cast<uint8_t const*>(data_buffer->address()) + array.offset() * sizeof(DeviceType),
-    sizeof(DeviceType) * num_rows,
-    cudaMemcpyDefault,
-    stream.value()));
-
-  auto null_mask = [&] {
-    if (not skip_mask and array.null_bitmap_data()) {
-      auto temp_mask = get_mask_buffer(array, stream, mr);
-      // If array is sliced, we have to copy whole mask and then take copy.
-      return (num_rows == static_cast<size_type>(data_buffer->size() / sizeof(DeviceType)))
-               ? std::move(*temp_mask.release())
-               : cudf::detail::copy_bitmask(static_cast<bitmask_type*>(temp_mask->data()),
-                                            array.offset(),
-                                            array.offset() + num_rows,
-                                            stream,
-                                            mr);
-    }
-    return rmm::device_buffer{};
-  }();
-
-  col->set_null_mask(std::move(null_mask), array.null_count());
-  return col;
-}
-
-template <>
-std::unique_ptr<column> dispatch_to_cudf_column::operator()<bool>(arrow::Array const& array,
-                                                                  data_type,
-                                                                  bool skip_mask,
-                                                                  rmm::cuda_stream_view stream,
-                                                                  rmm::device_async_resource_ref mr)
-{
-  auto data_buffer = array.data()->buffers[1];
-  // mask-to-bools expects the mask to be bitmask_type aligned/padded
-  auto data = rmm::device_buffer(
-    cudf::bitmask_allocation_size_bytes(data_buffer->size() * CHAR_BIT), stream, mr);
-  CUDF_CUDA_TRY(cudaMemcpyAsync(data.data(),
-                                reinterpret_cast<uint8_t const*>(data_buffer->address()),
-                                data_buffer->size(),
-                                cudaMemcpyDefault,
-                                stream.value()));
-  auto out_col = mask_to_bools(static_cast<bitmask_type*>(data.data()),
-                               array.offset(),
-                               array.offset() + array.length(),
-                               stream,
-                               mr);
-
-  auto const has_nulls = skip_mask ? false : array.null_bitmap_data() != nullptr;
-  if (has_nulls) {
-    auto out_mask =
-      detail::copy_bitmask(static_cast<bitmask_type*>(get_mask_buffer(array, stream, mr)->data()),
-                           array.offset(),
-                           array.offset() + array.length(),
-                           stream,
-                           mr);
-
-    out_col->set_null_mask(std::move(out_mask), array.null_count());
-  }
-
-  return out_col;
-}
-
-template <>
-std::unique_ptr<column> dispatch_to_cudf_column::operator()<cudf::string_view>(
-  arrow::Array const& array,
-  data_type,
-  bool,
-  rmm::cuda_stream_view stream,
-  rmm::device_async_resource_ref mr)
-{
-  if (array.length() == 0) { return make_empty_column(type_id::STRING); }
-
-  std::unique_ptr<column> offsets_column;
-  std::unique_ptr<arrow::Array> char_array;
-
-  if (array.type_id() == arrow::Type::LARGE_STRING) {
-    auto str_array    = static_cast<arrow::LargeStringArray const*>(&array);
-    auto offset_array = std::make_unique<arrow::Int64Array>(
-      str_array->value_offsets()->size() / sizeof(int64_t), str_array->value_offsets(), nullptr);
-    offsets_column = dispatch_to_cudf_column{}.operator()<int64_t>(
-      *offset_array, data_type(type_id::INT64), true, stream, mr);
-    char_array = std::make_unique<arrow::Int8Array>(
-      str_array->value_data()->size(), str_array->value_data(), nullptr);
-  } else if (array.type_id() == arrow::Type::STRING) {
-    auto str_array    = static_cast<arrow::StringArray const*>(&array);
-    auto offset_array = std::make_unique<arrow::Int32Array>(
-      str_array->value_offsets()->size() / sizeof(int32_t), str_array->value_offsets(), nullptr);
-    offsets_column = dispatch_to_cudf_column{}.operator()<int32_t>(
-      *offset_array, data_type(type_id::INT32), true, stream, mr);
-    char_array = std::make_unique<arrow::Int8Array>(
-      str_array->value_data()->size(), str_array->value_data(), nullptr);
-  } else {
-    throw std::runtime_error("Unsupported array type");
-  }
-
-  rmm::device_buffer chars(char_array->length(), stream, mr);
-  auto data_buffer = char_array->data()->buffers[1];
-  CUDF_CUDA_TRY(cudaMemcpyAsync(chars.data(),
-                                reinterpret_cast<uint8_t const*>(data_buffer->address()),
-                                chars.size(),
-                                cudaMemcpyDefault,
-                                stream.value()));
-
-  auto const num_rows = offsets_column->size() - 1;
-  auto out_col        = make_strings_column(num_rows,
-                                     std::move(offsets_column),
-                                     std::move(chars),
-                                     array.null_count(),
-                                     std::move(*get_mask_buffer(array, stream, mr)));
-
-  return num_rows == array.length()
-           ? std::move(out_col)
-           : std::make_unique<column>(
-               cudf::detail::slice(out_col->view(),
-                                   static_cast<size_type>(array.offset()),
-                                   static_cast<size_type>(array.offset() + array.length()),
-                                   stream),
-               stream,
-               mr);
-}
-
-template <>
-std::unique_ptr<column> dispatch_to_cudf_column::operator()<cudf::dictionary32>(
-  arrow::Array const& array,
-  data_type,
-  bool,
-  rmm::cuda_stream_view stream,
-  rmm::device_async_resource_ref mr)
-{
-  auto dict_array  = static_cast<arrow::DictionaryArray const*>(&array);
-  auto dict_type   = arrow_to_cudf_type(*(dict_array->dictionary()->type()));
-  auto keys_column = get_column(*(dict_array->dictionary()), dict_type, true, stream, mr);
-  auto ind_type    = arrow_to_cudf_type(*(dict_array->indices()->type()));
-
-  auto indices_column = get_column(*(dict_array->indices()), ind_type, false, stream, mr);
-  // If index type is not of type uint32_t, then cast it to uint32_t
-  auto const dict_indices_type = data_type{type_id::UINT32};
-  if (indices_column->type().id() != dict_indices_type.id())
-    indices_column = cudf::detail::cast(indices_column->view(), dict_indices_type, stream, mr);
-
-  // Child columns shouldn't have masks and we need the mask in main column
-  auto column_contents = indices_column->release();
-  indices_column       = std::make_unique<column>(dict_indices_type,
-                                            static_cast<size_type>(array.length()),
-                                            std::move(*(column_contents.data)),
-                                            rmm::device_buffer{},
-                                            0);
-
-  return make_dictionary_column(std::move(keys_column),
-                                std::move(indices_column),
-                                std::move(*(column_contents.null_mask)),
-                                array.null_count());
-}
-
-template <>
-std::unique_ptr<column> dispatch_to_cudf_column::operator()<cudf::struct_view>(
-  arrow::Array const& array,
-  data_type,
-  bool,
-  rmm::cuda_stream_view stream,
-  rmm::device_async_resource_ref mr)
-{
-  auto struct_array = static_cast<arrow::StructArray const*>(&array);
-  std::vector<std::unique_ptr<column>> child_columns;
-  // Offsets have already been applied to child
-  arrow::ArrayVector array_children = struct_array->fields();
-  std::transform(array_children.cbegin(),
-                 array_children.cend(),
-                 std::back_inserter(child_columns),
-                 [&mr, &stream](auto const& child_array) {
-                   auto type = arrow_to_cudf_type(*(child_array->type()));
-                   return get_column(*child_array, type, false, stream, mr);
-                 });
-
-  auto out_mask = std::move(*(get_mask_buffer(array, stream, mr)));
-  if (struct_array->null_bitmap_data() != nullptr) {
-    out_mask = detail::copy_bitmask(static_cast<bitmask_type*>(out_mask.data()),
-                                    array.offset(),
-                                    array.offset() + array.length(),
-                                    stream,
-                                    mr);
-  }
-
-  return make_structs_column(
-    array.length(), move(child_columns), array.null_count(), std::move(out_mask), stream, mr);
-}
-
-template <>
-std::unique_ptr<column> dispatch_to_cudf_column::operator()<cudf::list_view>(
-  arrow::Array const& array,
-  data_type,
-  bool,
-  rmm::cuda_stream_view stream,
-  rmm::device_async_resource_ref mr)
-{
-  auto list_array   = static_cast<arrow::ListArray const*>(&array);
-  auto offset_array = std::make_unique<arrow::Int32Array>(
-    list_array->value_offsets()->size() / sizeof(int32_t), list_array->value_offsets(), nullptr);
-  auto offsets_column = dispatch_to_cudf_column{}.operator()<int32_t>(
-    *offset_array, data_type(type_id::INT32), true, stream, mr);
-
-  auto child_type   = arrow_to_cudf_type(*(list_array->values()->type()));
-  auto child_column = get_column(*(list_array->values()), child_type, false, stream, mr);
-
-  auto const num_rows = offsets_column->size() - 1;
-  auto out_col        = make_lists_column(num_rows,
-                                   std::move(offsets_column),
-                                   std::move(child_column),
-                                   array.null_count(),
-                                   std::move(*get_mask_buffer(array, stream, mr)),
-                                   stream,
-                                   mr);
-
-  return num_rows == array.length()
-           ? std::move(out_col)
-           : std::make_unique<column>(
-               cudf::detail::slice(out_col->view(),
-                                   static_cast<size_type>(array.offset()),
-                                   static_cast<size_type>(array.offset() + array.length()),
-                                   stream),
-               stream,
-               mr);
-}
-
-std::unique_ptr<column> get_column(arrow::Array const& array,
-                                   data_type type,
-                                   bool skip_mask,
-                                   rmm::cuda_stream_view stream,
-                                   rmm::device_async_resource_ref mr)
-{
-  return type.id() != type_id::EMPTY
-           ? type_dispatcher(type, dispatch_to_cudf_column{}, array, type, skip_mask, stream, mr)
-           : get_empty_type_column(array.length());
-}
-
-}  // namespace
-
-std::unique_ptr<table> from_arrow(arrow::Table const& input_table,
-                                  rmm::cuda_stream_view stream,
-                                  rmm::device_async_resource_ref mr)
-{
-  if (input_table.num_columns() == 0) { return std::make_unique<table>(); }
-  std::vector<std::unique_ptr<column>> columns;
-  auto chunked_arrays = input_table.columns();
-  std::transform(chunked_arrays.begin(),
-                 chunked_arrays.end(),
-                 std::back_inserter(columns),
-                 [&mr, &stream](auto const& chunked_array) {
-                   std::vector<std::unique_ptr<column>> concat_columns;
-                   auto cudf_type    = arrow_to_cudf_type(*(chunked_array->type()));
-                   auto array_chunks = chunked_array->chunks();
-                   if (cudf_type.id() == type_id::EMPTY) {
-                     return get_empty_type_column(chunked_array->length());
-                   }
-                   std::transform(array_chunks.begin(),
-                                  array_chunks.end(),
-                                  std::back_inserter(concat_columns),
-                                  [&cudf_type, &mr, &stream](auto const& array_chunk) {
-                                    return get_column(*array_chunk, cudf_type, false, stream, mr);
-                                  });
-                   if (concat_columns.empty()) {
-                     return std::make_unique<column>(
-                       cudf_type, 0, rmm::device_buffer{}, rmm::device_buffer{}, 0);
-                   } else if (concat_columns.size() == 1) {
-                     return std::move(concat_columns[0]);
-                   }
-
-                   std::vector<cudf::column_view> column_views;
-                   std::transform(concat_columns.begin(),
-                                  concat_columns.end(),
-                                  std::back_inserter(column_views),
-                                  [](auto const& col) { return col->view(); });
-                   return cudf::detail::concatenate(column_views, stream, mr);
-                 });
-
-  return std::make_unique<table>(std::move(columns));
-}
-
-std::unique_ptr<cudf::scalar> from_arrow(arrow::Scalar const& input,
-                                         rmm::cuda_stream_view stream,
-                                         rmm::device_async_resource_ref mr)
-{
-  auto maybe_array = arrow::MakeArrayFromScalar(input, 1);
-  if (!maybe_array.ok()) { CUDF_FAIL("Failed to create array"); }
-  auto array = *maybe_array;
-
-  auto field = arrow::field("", input.type);
-
-  auto table = arrow::Table::Make(arrow::schema({field}), {array});
-
-  auto cudf_table = detail::from_arrow(*table, stream, mr);
-
-  auto cv = cudf_table->view().column(0);
-  return get_element(cv, 0, stream);
-}
-
-}  // namespace detail
-
-std::unique_ptr<table> from_arrow(arrow::Table const& input_table,
-                                  rmm::cuda_stream_view stream,
-                                  rmm::device_async_resource_ref mr)
-{
-  CUDF_FUNC_RANGE();
-
-  return detail::from_arrow(input_table, stream, mr);
-}
-
-std::unique_ptr<cudf::scalar> from_arrow(arrow::Scalar const& input,
-                                         rmm::cuda_stream_view stream,
-                                         rmm::device_async_resource_ref mr)
-{
-  CUDF_FUNC_RANGE();
-
-  return detail::from_arrow(input, stream, mr);
-}
-}  // namespace cudf
diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu
deleted file mode 100644
index a867d4adfa1..00000000000
--- a/cpp/src/interop/to_arrow.cu
+++ /dev/null
@@ -1,490 +0,0 @@
-/*
- * Copyright (c) 2020-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include "arrow_utilities.hpp"
-#include "decimal_conversion_utilities.cuh"
-#include "detail/arrow_allocator.hpp"
-
-#include <cudf/column/column.hpp>
-#include <cudf/column/column_factories.hpp>
-#include <cudf/column/column_view.hpp>
-#include <cudf/copying.hpp>
-#include <cudf/detail/interop.hpp>
-#include <cudf/detail/iterator.cuh>
-#include <cudf/detail/null_mask.hpp>
-#include <cudf/detail/nvtx/ranges.hpp>
-#include <cudf/detail/unary.hpp>
-#include <cudf/dictionary/dictionary_column_view.hpp>
-#include <cudf/interop.hpp>
-#include <cudf/null_mask.hpp>
-#include <cudf/strings/strings_column_view.hpp>
-#include <cudf/table/table_view.hpp>
-#include <cudf/types.hpp>
-#include <cudf/utilities/default_stream.hpp>
-#include <cudf/utilities/traits.hpp>
-#include <cudf/utilities/type_dispatcher.hpp>
-
-#include <rmm/cuda_stream_view.hpp>
-#include <rmm/exec_policy.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
-
-#include <thrust/copy.h>
-#include <thrust/for_each.h>
-#include <thrust/iterator/counting_iterator.h>
-
-namespace cudf {
-namespace detail {
-namespace {
-
-/**
- * @brief Create arrow data buffer from given cudf column
- */
-template <typename T>
-std::shared_ptr<arrow::Buffer> fetch_data_buffer(device_span<T const> input,
-                                                 arrow::MemoryPool* ar_mr,
-                                                 rmm::cuda_stream_view stream)
-{
-  int64_t const data_size_in_bytes = sizeof(T) * input.size();
-
-  auto data_buffer = allocate_arrow_buffer(data_size_in_bytes, ar_mr);
-
-  CUDF_CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(),
-                                input.data(),
-                                data_size_in_bytes,
-                                cudaMemcpyDefault,
-                                stream.value()));
-
-  return std::move(data_buffer);
-}
-
-/**
- * @brief Create arrow buffer of mask from given cudf column
- */
-std::shared_ptr<arrow::Buffer> fetch_mask_buffer(column_view input_view,
-                                                 arrow::MemoryPool* ar_mr,
-                                                 rmm::cuda_stream_view stream)
-{
-  int64_t const mask_size_in_bytes = cudf::bitmask_allocation_size_bytes(input_view.size());
-
-  if (input_view.has_nulls()) {
-    auto mask_buffer = allocate_arrow_bitmap(static_cast<int64_t>(input_view.size()), ar_mr);
-    CUDF_CUDA_TRY(cudaMemcpyAsync(
-      mask_buffer->mutable_data(),
-      (input_view.offset() > 0)
-        ? cudf::detail::copy_bitmask(input_view, stream, rmm::mr::get_current_device_resource())
-            .data()
-        : input_view.null_mask(),
-      mask_size_in_bytes,
-      cudaMemcpyDefault,
-      stream.value()));
-
-    // Resets all padded bits to 0
-    mask_buffer->ZeroPadding();
-
-    return mask_buffer;
-  }
-
-  return nullptr;
-}
-
-/**
- * @brief Functor to convert cudf column to arrow array
- */
-struct dispatch_to_arrow {
-  /**
-   * @brief Creates vector Arrays from given cudf column children
-   */
-  std::vector<std::shared_ptr<arrow::Array>> fetch_child_array(
-    column_view input_view,
-    std::vector<column_metadata> const& metadata,
-    arrow::MemoryPool* ar_mr,
-    rmm::cuda_stream_view stream)
-  {
-    std::vector<std::shared_ptr<arrow::Array>> child_arrays;
-    std::transform(
-      input_view.child_begin(),
-      input_view.child_end(),
-      metadata.begin(),
-      std::back_inserter(child_arrays),
-      [&ar_mr, &stream](auto const& child, auto const& meta) {
-        return type_dispatcher(
-          child.type(), dispatch_to_arrow{}, child, child.type().id(), meta, ar_mr, stream);
-      });
-    return child_arrays;
-  }
-
-  template <typename T, CUDF_ENABLE_IF(not is_rep_layout_compatible<T>())>
-  std::shared_ptr<arrow::Array> operator()(
-    column_view, cudf::type_id, column_metadata const&, arrow::MemoryPool*, rmm::cuda_stream_view)
-  {
-    CUDF_FAIL("Unsupported type for to_arrow.");
-  }
-
-  template <typename T, CUDF_ENABLE_IF(is_rep_layout_compatible<T>())>
-  std::shared_ptr<arrow::Array> operator()(column_view input_view,
-                                           cudf::type_id id,
-                                           column_metadata const&,
-                                           arrow::MemoryPool* ar_mr,
-                                           rmm::cuda_stream_view stream)
-  {
-    return to_arrow_array(
-      id,
-      static_cast<int64_t>(input_view.size()),
-      fetch_data_buffer<T>(
-        device_span<T const>(input_view.data<T>(), input_view.size()), ar_mr, stream),
-      fetch_mask_buffer(input_view, ar_mr, stream),
-      static_cast<int64_t>(input_view.null_count()));
-  }
-};
-
-// Convert decimal types from libcudf to arrow where those types are not
-// directly supported by Arrow. These types must be fit into 128 bits, the
-// smallest decimal resolution supported by Arrow.
-template <typename DeviceType>
-std::shared_ptr<arrow::Array> unsupported_decimals_to_arrow(column_view input,
-                                                            int32_t precision,
-                                                            arrow::MemoryPool* ar_mr,
-                                                            rmm::cuda_stream_view stream)
-{
-  auto buf = detail::convert_decimals_to_decimal128<DeviceType>(
-    input, stream, rmm::mr::get_current_device_resource());
-
-  // Synchronize stream here to ensure the decimal128 buffer is ready.
-  stream.synchronize();
-
-  auto const buf_size_in_bytes = buf->size();
-  auto data_buffer             = allocate_arrow_buffer(buf_size_in_bytes, ar_mr);
-
-  CUDF_CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(),
-                                buf->data(),
-                                buf_size_in_bytes,
-                                cudaMemcpyDefault,
-                                stream.value()));
-
-  auto type    = arrow::decimal(precision, -input.type().scale());
-  auto mask    = fetch_mask_buffer(input, ar_mr, stream);
-  auto buffers = std::vector<std::shared_ptr<arrow::Buffer>>{mask, std::move(data_buffer)};
-  auto data    = std::make_shared<arrow::ArrayData>(type, input.size(), buffers);
-
-  return std::make_shared<arrow::Decimal128Array>(data);
-}
-
-template <>
-std::shared_ptr<arrow::Array> dispatch_to_arrow::operator()<numeric::decimal32>(
-  column_view input,
-  cudf::type_id,
-  column_metadata const&,
-  arrow::MemoryPool* ar_mr,
-  rmm::cuda_stream_view stream)
-{
-  using DeviceType = int32_t;
-  return unsupported_decimals_to_arrow<DeviceType>(
-    input, cudf::detail::max_precision<DeviceType>(), ar_mr, stream);
-}
-
-template <>
-std::shared_ptr<arrow::Array> dispatch_to_arrow::operator()<numeric::decimal64>(
-  column_view input,
-  cudf::type_id,
-  column_metadata const&,
-  arrow::MemoryPool* ar_mr,
-  rmm::cuda_stream_view stream)
-{
-  using DeviceType = int64_t;
-  return unsupported_decimals_to_arrow<DeviceType>(
-    input, cudf::detail::max_precision<DeviceType>(), ar_mr, stream);
-}
-
-template <>
-std::shared_ptr<arrow::Array> dispatch_to_arrow::operator()<numeric::decimal128>(
-  column_view input,
-  cudf::type_id,
-  column_metadata const&,
-  arrow::MemoryPool* ar_mr,
-  rmm::cuda_stream_view stream)
-{
-  using DeviceType         = __int128_t;
-  auto const max_precision = cudf::detail::max_precision<DeviceType>();
-
-  rmm::device_uvector<DeviceType> buf(input.size(), stream);
-
-  thrust::copy(rmm::exec_policy(stream),  //
-               input.begin<DeviceType>(),
-               input.end<DeviceType>(),
-               buf.begin());
-
-  auto const buf_size_in_bytes = buf.size() * sizeof(DeviceType);
-  auto data_buffer             = allocate_arrow_buffer(buf_size_in_bytes, ar_mr);
-
-  CUDF_CUDA_TRY(cudaMemcpyAsync(
-    data_buffer->mutable_data(), buf.data(), buf_size_in_bytes, cudaMemcpyDefault, stream.value()));
-
-  auto type    = arrow::decimal(max_precision, -input.type().scale());
-  auto mask    = fetch_mask_buffer(input, ar_mr, stream);
-  auto buffers = std::vector<std::shared_ptr<arrow::Buffer>>{mask, std::move(data_buffer)};
-  auto data    = std::make_shared<arrow::ArrayData>(type, input.size(), buffers);
-
-  return std::make_shared<arrow::Decimal128Array>(data);
-}
-
-template <>
-std::shared_ptr<arrow::Array> dispatch_to_arrow::operator()<bool>(column_view input,
-                                                                  cudf::type_id id,
-                                                                  column_metadata const&,
-                                                                  arrow::MemoryPool* ar_mr,
-                                                                  rmm::cuda_stream_view stream)
-{
-  auto bitmask = detail::bools_to_mask(input, stream, rmm::mr::get_current_device_resource());
-
-  auto data_buffer = allocate_arrow_buffer(static_cast<int64_t>(bitmask.first->size()), ar_mr);
-
-  CUDF_CUDA_TRY(cudaMemcpyAsync(data_buffer->mutable_data(),
-                                bitmask.first->data(),
-                                bitmask.first->size(),
-                                cudaMemcpyDefault,
-                                stream.value()));
-  return to_arrow_array(id,
-                        static_cast<int64_t>(input.size()),
-                        std::move(data_buffer),
-                        fetch_mask_buffer(input, ar_mr, stream),
-                        static_cast<int64_t>(input.null_count()));
-}
-
-template <>
-std::shared_ptr<arrow::Array> dispatch_to_arrow::operator()<cudf::string_view>(
-  column_view input,
-  cudf::type_id,
-  column_metadata const&,
-  arrow::MemoryPool* ar_mr,
-  rmm::cuda_stream_view stream)
-{
-  std::unique_ptr<column> tmp_column =
-    ((input.offset() != 0) or
-     ((input.num_children() == 1) and (input.child(0).size() - 1 != input.size())))
-      ? std::make_unique<cudf::column>(input, stream)
-      : nullptr;
-
-  column_view input_view = (tmp_column != nullptr) ? tmp_column->view() : input;
-  auto child_arrays      = fetch_child_array(input_view, {{}, {}}, ar_mr, stream);
-  if (child_arrays.empty()) {
-    // Empty string will have only one value in offset of 4 bytes
-    auto tmp_offset_buffer = allocate_arrow_buffer(sizeof(int32_t), ar_mr);
-    auto tmp_data_buffer   = allocate_arrow_buffer(0, ar_mr);
-    memset(tmp_offset_buffer->mutable_data(), 0, sizeof(int32_t));
-
-    return std::make_shared<arrow::StringArray>(
-      0, std::move(tmp_offset_buffer), std::move(tmp_data_buffer));
-  }
-  auto offset_buffer = child_arrays[strings_column_view::offsets_column_index]->data()->buffers[1];
-  auto const sview   = strings_column_view{input_view};
-  auto data_buffer   = fetch_data_buffer<char>(
-    device_span<char const>{sview.chars_begin(stream),
-                              static_cast<std::size_t>(sview.chars_size(stream))},
-    ar_mr,
-    stream);
-  if (sview.offsets().type().id() == cudf::type_id::INT64) {
-    return std::make_shared<arrow::LargeStringArray>(static_cast<int64_t>(input_view.size()),
-                                                     offset_buffer,
-                                                     data_buffer,
-                                                     fetch_mask_buffer(input_view, ar_mr, stream),
-                                                     static_cast<int64_t>(input_view.null_count()));
-  } else {
-    return std::make_shared<arrow::StringArray>(static_cast<int64_t>(input_view.size()),
-                                                offset_buffer,
-                                                data_buffer,
-                                                fetch_mask_buffer(input_view, ar_mr, stream),
-                                                static_cast<int64_t>(input_view.null_count()));
-  }
-}
-
-template <>
-std::shared_ptr<arrow::Array> dispatch_to_arrow::operator()<cudf::struct_view>(
-  column_view input,
-  cudf::type_id,
-  column_metadata const& metadata,
-  arrow::MemoryPool* ar_mr,
-  rmm::cuda_stream_view stream)
-{
-  CUDF_EXPECTS(metadata.children_meta.size() == static_cast<std::size_t>(input.num_children()),
-               "Number of field names and number of children doesn't match\n");
-  std::unique_ptr<column> tmp_column = nullptr;
-
-  if (input.offset() != 0) { tmp_column = std::make_unique<cudf::column>(input, stream); }
-
-  column_view input_view = (tmp_column != nullptr) ? tmp_column->view() : input;
-  auto child_arrays      = fetch_child_array(input_view, metadata.children_meta, ar_mr, stream);
-  auto mask              = fetch_mask_buffer(input_view, ar_mr, stream);
-
-  std::vector<std::shared_ptr<arrow::Field>> fields;
-  std::transform(child_arrays.cbegin(),
-                 child_arrays.cend(),
-                 metadata.children_meta.cbegin(),
-                 std::back_inserter(fields),
-                 [](auto const array, auto const meta) {
-                   return std::make_shared<arrow::Field>(
-                     meta.name, array->type(), array->null_count() > 0);
-                 });
-  auto dtype = std::make_shared<arrow::StructType>(fields);
-
-  return std::make_shared<arrow::StructArray>(dtype,
-                                              static_cast<int64_t>(input_view.size()),
-                                              child_arrays,
-                                              mask,
-                                              static_cast<int64_t>(input_view.null_count()));
-}
-
-template <>
-std::shared_ptr<arrow::Array> dispatch_to_arrow::operator()<cudf::list_view>(
-  column_view input,
-  cudf::type_id,
-  column_metadata const& metadata,
-  arrow::MemoryPool* ar_mr,
-  rmm::cuda_stream_view stream)
-{
-  CUDF_EXPECTS(metadata.children_meta.empty() ||
-                 metadata.children_meta.size() == static_cast<std::size_t>(input.num_children()),
-               "Number of field names and number of children do not match\n");
-  std::unique_ptr<column> tmp_column = nullptr;
-  if ((input.offset() != 0) or
-      ((input.num_children() == 2) and (input.child(0).size() - 1 != input.size()))) {
-    tmp_column = std::make_unique<cudf::column>(input, stream);
-  }
-
-  column_view input_view = (tmp_column != nullptr) ? tmp_column->view() : input;
-  auto children_meta =
-    metadata.children_meta.empty() ? std::vector<column_metadata>{{}, {}} : metadata.children_meta;
-  auto child_arrays = fetch_child_array(input_view, children_meta, ar_mr, stream);
-  if (child_arrays.empty() || child_arrays[0]->data()->length == 0) {
-    auto element_type = child_arrays.empty() ? arrow::null() : child_arrays[1]->type();
-    auto result       = arrow::MakeEmptyArray(arrow::list(element_type), ar_mr);
-    CUDF_EXPECTS(result.ok(), "Failed to construct empty arrow list array\n");
-    return result.ValueUnsafe();
-  }
-
-  auto offset_buffer = child_arrays[0]->data()->buffers[1];
-  auto data          = child_arrays[1];
-  return std::make_shared<arrow::ListArray>(arrow::list(data->type()),
-                                            static_cast<int64_t>(input_view.size()),
-                                            offset_buffer,
-                                            data,
-                                            fetch_mask_buffer(input_view, ar_mr, stream),
-                                            static_cast<int64_t>(input_view.null_count()));
-}
-
-template <>
-std::shared_ptr<arrow::Array> dispatch_to_arrow::operator()<cudf::dictionary32>(
-  column_view input,
-  cudf::type_id,
-  column_metadata const& metadata,
-  arrow::MemoryPool* ar_mr,
-  rmm::cuda_stream_view stream)
-{
-  // Arrow dictionary requires indices to be signed integer
-  std::unique_ptr<column> dict_indices =
-    detail::cast(cudf::dictionary_column_view(input).get_indices_annotated(),
-                 cudf::data_type{type_id::INT32},
-                 stream,
-                 rmm::mr::get_current_device_resource());
-  auto indices = dispatch_to_arrow{}.operator()<int32_t>(
-    dict_indices->view(), dict_indices->type().id(), {}, ar_mr, stream);
-  auto dict_keys = cudf::dictionary_column_view(input).keys();
-  auto dictionary =
-    type_dispatcher(dict_keys.type(),
-                    dispatch_to_arrow{},
-                    dict_keys,
-                    dict_keys.type().id(),
-                    metadata.children_meta.empty() ? column_metadata{} : metadata.children_meta[0],
-                    ar_mr,
-                    stream);
-
-  return std::make_shared<arrow::DictionaryArray>(
-    arrow::dictionary(indices->type(), dictionary->type()), indices, dictionary);
-}
-}  // namespace
-
-std::shared_ptr<arrow::Table> to_arrow(table_view input,
-                                       std::vector<column_metadata> const& metadata,
-                                       rmm::cuda_stream_view stream,
-                                       arrow::MemoryPool* ar_mr)
-{
-  CUDF_EXPECTS((metadata.size() == static_cast<std::size_t>(input.num_columns())),
-               "columns' metadata should be equal to number of columns in table");
-
-  std::vector<std::shared_ptr<arrow::Array>> arrays;
-  std::vector<std::shared_ptr<arrow::Field>> fields;
-
-  std::transform(
-    input.begin(),
-    input.end(),
-    metadata.begin(),
-    std::back_inserter(arrays),
-    [&](auto const& c, auto const& meta) {
-      return c.type().id() != type_id::EMPTY
-               ? type_dispatcher(
-                   c.type(), detail::dispatch_to_arrow{}, c, c.type().id(), meta, ar_mr, stream)
-               : std::make_shared<arrow::NullArray>(c.size());
-    });
-
-  std::transform(
-    arrays.begin(),
-    arrays.end(),
-    metadata.begin(),
-    std::back_inserter(fields),
-    [](auto const& array, auto const& meta) { return arrow::field(meta.name, array->type()); });
-
-  auto result = arrow::Table::Make(arrow::schema(fields), arrays);
-
-  // synchronize the stream because after the return the data may be accessed from the host before
-  // the above `cudaMemcpyAsync` calls have completed their copies (especially if pinned host
-  // memory is used).
-  stream.synchronize();
-
-  return result;
-}
-
-std::shared_ptr<arrow::Scalar> to_arrow(cudf::scalar const& input,
-                                        column_metadata const& metadata,
-                                        rmm::cuda_stream_view stream,
-                                        arrow::MemoryPool* ar_mr)
-{
-  auto const column = cudf::make_column_from_scalar(input, 1, stream);
-  cudf::table_view const tv{{column->view()}};
-  auto const arrow_table  = detail::to_arrow(tv, {metadata}, stream, ar_mr);
-  auto const ac           = arrow_table->column(0);
-  auto const maybe_scalar = ac->GetScalar(0);
-  if (!maybe_scalar.ok()) { CUDF_FAIL("Failed to produce a scalar"); }
-  return maybe_scalar.ValueOrDie();
-}
-}  // namespace detail
-
-std::shared_ptr<arrow::Table> to_arrow(table_view input,
-                                       std::vector<column_metadata> const& metadata,
-                                       rmm::cuda_stream_view stream,
-                                       arrow::MemoryPool* ar_mr)
-{
-  CUDF_FUNC_RANGE();
-  return detail::to_arrow(input, metadata, stream, ar_mr);
-}
-
-std::shared_ptr<arrow::Scalar> to_arrow(cudf::scalar const& input,
-                                        column_metadata const& metadata,
-                                        rmm::cuda_stream_view stream,
-                                        arrow::MemoryPool* ar_mr)
-{
-  CUDF_FUNC_RANGE();
-  return detail::to_arrow(input, metadata, stream, ar_mr);
-}
-}  // namespace cudf
diff --git a/cpp/tests/interop/arrow_utils.hpp b/cpp/tests/interop/arrow_utils.hpp
index 08eada632a5..70a9fe64d70 100644
--- a/cpp/tests/interop/arrow_utils.hpp
+++ b/cpp/tests/interop/arrow_utils.hpp
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#pragma once
+
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
@@ -30,11 +32,65 @@
 #include <cudf/transform.hpp>
 #include <cudf/types.hpp>
 
+#include <arrow/api.h>
 #include <arrow/util/bitmap_builders.h>
 
-#include <algorithm>
-
-#pragma once
+// Creating arrow as per given type_id and buffer arguments
+template <typename... Ts>
+std::shared_ptr<arrow::Array> to_arrow_array(cudf::type_id id, Ts&&... args)
+{
+  switch (id) {
+    case cudf::type_id::BOOL8:
+      return std::make_shared<arrow::BooleanArray>(std::forward<Ts>(args)...);
+    case cudf::type_id::INT8: return std::make_shared<arrow::Int8Array>(std::forward<Ts>(args)...);
+    case cudf::type_id::INT16:
+      return std::make_shared<arrow::Int16Array>(std::forward<Ts>(args)...);
+    case cudf::type_id::INT32:
+      return std::make_shared<arrow::Int32Array>(std::forward<Ts>(args)...);
+    case cudf::type_id::INT64:
+      return std::make_shared<arrow::Int64Array>(std::forward<Ts>(args)...);
+    case cudf::type_id::UINT8:
+      return std::make_shared<arrow::UInt8Array>(std::forward<Ts>(args)...);
+    case cudf::type_id::UINT16:
+      return std::make_shared<arrow::UInt16Array>(std::forward<Ts>(args)...);
+    case cudf::type_id::UINT32:
+      return std::make_shared<arrow::UInt32Array>(std::forward<Ts>(args)...);
+    case cudf::type_id::UINT64:
+      return std::make_shared<arrow::UInt64Array>(std::forward<Ts>(args)...);
+    case cudf::type_id::FLOAT32:
+      return std::make_shared<arrow::FloatArray>(std::forward<Ts>(args)...);
+    case cudf::type_id::FLOAT64:
+      return std::make_shared<arrow::DoubleArray>(std::forward<Ts>(args)...);
+    case cudf::type_id::TIMESTAMP_DAYS:
+      return std::make_shared<arrow::Date32Array>(std::make_shared<arrow::Date32Type>(),
+                                                  std::forward<Ts>(args)...);
+    case cudf::type_id::TIMESTAMP_SECONDS:
+      return std::make_shared<arrow::TimestampArray>(arrow::timestamp(arrow::TimeUnit::SECOND),
+                                                     std::forward<Ts>(args)...);
+    case cudf::type_id::TIMESTAMP_MILLISECONDS:
+      return std::make_shared<arrow::TimestampArray>(arrow::timestamp(arrow::TimeUnit::MILLI),
+                                                     std::forward<Ts>(args)...);
+    case cudf::type_id::TIMESTAMP_MICROSECONDS:
+      return std::make_shared<arrow::TimestampArray>(arrow::timestamp(arrow::TimeUnit::MICRO),
+                                                     std::forward<Ts>(args)...);
+    case cudf::type_id::TIMESTAMP_NANOSECONDS:
+      return std::make_shared<arrow::TimestampArray>(arrow::timestamp(arrow::TimeUnit::NANO),
+                                                     std::forward<Ts>(args)...);
+    case cudf::type_id::DURATION_SECONDS:
+      return std::make_shared<arrow::DurationArray>(arrow::duration(arrow::TimeUnit::SECOND),
+                                                    std::forward<Ts>(args)...);
+    case cudf::type_id::DURATION_MILLISECONDS:
+      return std::make_shared<arrow::DurationArray>(arrow::duration(arrow::TimeUnit::MILLI),
+                                                    std::forward<Ts>(args)...);
+    case cudf::type_id::DURATION_MICROSECONDS:
+      return std::make_shared<arrow::DurationArray>(arrow::duration(arrow::TimeUnit::MICRO),
+                                                    std::forward<Ts>(args)...);
+    case cudf::type_id::DURATION_NANOSECONDS:
+      return std::make_shared<arrow::DurationArray>(arrow::duration(arrow::TimeUnit::NANO),
+                                                    std::forward<Ts>(args)...);
+    default: CUDF_FAIL("Unsupported type_id conversion to arrow");
+  }
+}
 
 template <typename T>
 std::enable_if_t<cudf::is_fixed_width<T>() and !std::is_same_v<T, bool>,
@@ -50,7 +106,7 @@ get_arrow_array(std::vector<T> const& data, std::vector<uint8_t> const& mask = {
   std::shared_ptr<arrow::Buffer> mask_buffer =
     mask.empty() ? nullptr : arrow::internal::BytesToBits(mask).ValueOrDie();
 
-  return cudf::detail::to_arrow_array(cudf::type_to_id<T>(), data.size(), data_buffer, mask_buffer);
+  return to_arrow_array(cudf::type_to_id<T>(), data.size(), data_buffer, mask_buffer);
 }
 
 template <typename T>
diff --git a/java/src/main/native/src/ColumnVectorJni.cpp b/java/src/main/native/src/ColumnVectorJni.cpp
index cdc5aa41abe..9b718b2ed83 100644
--- a/java/src/main/native/src/ColumnVectorJni.cpp
+++ b/java/src/main/native/src/ColumnVectorJni.cpp
@@ -38,12 +38,70 @@
 #include <rmm/mr/device/per_device_resource.hpp>
 
 #include <arrow/api.h>
+#include <arrow/c/bridge.h>
 
 #include <algorithm>
 
 using cudf::jni::ptr_as_jlong;
 using cudf::jni::release_as_jlong;
 
+// Creating arrow as per given type_id and buffer arguments
+template <typename... Ts>
+std::shared_ptr<arrow::Array> to_arrow_array(cudf::type_id id, Ts&&... args)
+{
+  switch (id) {
+    case cudf::type_id::BOOL8:
+      return std::make_shared<arrow::BooleanArray>(std::forward<Ts>(args)...);
+    case cudf::type_id::INT8: return std::make_shared<arrow::Int8Array>(std::forward<Ts>(args)...);
+    case cudf::type_id::INT16:
+      return std::make_shared<arrow::Int16Array>(std::forward<Ts>(args)...);
+    case cudf::type_id::INT32:
+      return std::make_shared<arrow::Int32Array>(std::forward<Ts>(args)...);
+    case cudf::type_id::INT64:
+      return std::make_shared<arrow::Int64Array>(std::forward<Ts>(args)...);
+    case cudf::type_id::UINT8:
+      return std::make_shared<arrow::UInt8Array>(std::forward<Ts>(args)...);
+    case cudf::type_id::UINT16:
+      return std::make_shared<arrow::UInt16Array>(std::forward<Ts>(args)...);
+    case cudf::type_id::UINT32:
+      return std::make_shared<arrow::UInt32Array>(std::forward<Ts>(args)...);
+    case cudf::type_id::UINT64:
+      return std::make_shared<arrow::UInt64Array>(std::forward<Ts>(args)...);
+    case cudf::type_id::FLOAT32:
+      return std::make_shared<arrow::FloatArray>(std::forward<Ts>(args)...);
+    case cudf::type_id::FLOAT64:
+      return std::make_shared<arrow::DoubleArray>(std::forward<Ts>(args)...);
+    case cudf::type_id::TIMESTAMP_DAYS:
+      return std::make_shared<arrow::Date32Array>(std::make_shared<arrow::Date32Type>(),
+                                                  std::forward<Ts>(args)...);
+    case cudf::type_id::TIMESTAMP_SECONDS:
+      return std::make_shared<arrow::TimestampArray>(arrow::timestamp(arrow::TimeUnit::SECOND),
+                                                     std::forward<Ts>(args)...);
+    case cudf::type_id::TIMESTAMP_MILLISECONDS:
+      return std::make_shared<arrow::TimestampArray>(arrow::timestamp(arrow::TimeUnit::MILLI),
+                                                     std::forward<Ts>(args)...);
+    case cudf::type_id::TIMESTAMP_MICROSECONDS:
+      return std::make_shared<arrow::TimestampArray>(arrow::timestamp(arrow::TimeUnit::MICRO),
+                                                     std::forward<Ts>(args)...);
+    case cudf::type_id::TIMESTAMP_NANOSECONDS:
+      return std::make_shared<arrow::TimestampArray>(arrow::timestamp(arrow::TimeUnit::NANO),
+                                                     std::forward<Ts>(args)...);
+    case cudf::type_id::DURATION_SECONDS:
+      return std::make_shared<arrow::DurationArray>(arrow::duration(arrow::TimeUnit::SECOND),
+                                                    std::forward<Ts>(args)...);
+    case cudf::type_id::DURATION_MILLISECONDS:
+      return std::make_shared<arrow::DurationArray>(arrow::duration(arrow::TimeUnit::MILLI),
+                                                    std::forward<Ts>(args)...);
+    case cudf::type_id::DURATION_MICROSECONDS:
+      return std::make_shared<arrow::DurationArray>(arrow::duration(arrow::TimeUnit::MICRO),
+                                                    std::forward<Ts>(args)...);
+    case cudf::type_id::DURATION_NANOSECONDS:
+      return std::make_shared<arrow::DurationArray>(arrow::duration(arrow::TimeUnit::NANO),
+                                                    std::forward<Ts>(args)...);
+    default: CUDF_FAIL("Unsupported type_id conversion to arrow");
+  }
+}
+
 extern "C" {
 
 JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_sequence(
@@ -141,15 +199,27 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_fromArrow(JNIEnv* env,
         break;
       default:
         // this handles the primitive types
-        arrow_array = cudf::detail::to_arrow_array(
-          n_type, j_col_length, data_buffer, null_buffer, j_null_count);
+        arrow_array = to_arrow_array(n_type, j_col_length, data_buffer, null_buffer, j_null_count);
     }
     auto name_and_type                                = arrow::field("col", arrow_array->type());
     std::vector<std::shared_ptr<arrow::Field>> fields = {name_and_type};
     std::shared_ptr<arrow::Schema> schema             = std::make_shared<arrow::Schema>(fields);
     auto arrow_table =
       arrow::Table::Make(schema, std::vector<std::shared_ptr<arrow::Array>>{arrow_array});
-    auto retCols = cudf::from_arrow(*(arrow_table))->release();
+
+    ArrowSchema sch;
+    if (!arrow::ExportSchema(*arrow_table->schema(), &sch).ok()) {
+      JNI_THROW_NEW(env, "java/lang/RuntimeException", "Unable to produce an ArrowSchema", 0)
+    }
+    auto batch = arrow_table->CombineChunksToBatch().ValueOrDie();
+    ArrowArray arr;
+    if (!arrow::ExportRecordBatch(*batch, &arr).ok()) {
+      JNI_THROW_NEW(env, "java/lang/RuntimeException", "Unable to produce an ArrowArray", 0)
+    }
+    auto retCols = cudf::from_arrow(&sch, &arr)->release();
+    arr.release(&arr);
+    sch.release(&sch);
+
     if (retCols.size() != 1) {
       JNI_THROW_NEW(env, "java/lang/IllegalArgumentException", "Must result in one column", 0);
     }
diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
index ecc551f1143..c749c8c84bf 100644
--- a/java/src/main/native/src/TableJni.cpp
+++ b/java/src/main/native/src/TableJni.cpp
@@ -54,6 +54,8 @@
 
 #include <thrust/iterator/counting_iterator.h>
 
+#include <arrow/api.h>
+#include <arrow/c/bridge.h>
 #include <arrow/io/api.h>
 #include <arrow/ipc/api.h>
 
@@ -1069,6 +1071,15 @@ void append_flattened_child_names(cudf::io::column_name_info const& info,
   }
 }
 
+// Recursively make schema and its children nullable
+void set_nullable(ArrowSchema* schema)
+{
+  schema->flags |= ARROW_FLAG_NULLABLE;
+  for (int i = 0; i < schema->n_children; ++i) {
+    set_nullable(schema->children[i]);
+  }
+}
+
 }  // namespace
 
 }  // namespace jni
@@ -2635,7 +2646,13 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_convertCudfToArrowTable(JNIEnv
     // The pointer to the shared_ptr<> is returned as a jlong.
     using result_t = std::shared_ptr<arrow::Table>;
 
-    auto result = cudf::to_arrow(*tview, state->get_column_metadata(*tview));
+    auto got_arrow_schema = cudf::to_arrow_schema(*tview, state->get_column_metadata(*tview));
+    cudf::jni::set_nullable(got_arrow_schema.get());
+    auto got_arrow_array = cudf::to_arrow_host(*tview);
+    auto batch =
+      arrow::ImportRecordBatch(&got_arrow_array->array, got_arrow_schema.get()).ValueOrDie();
+    auto result = arrow::Table::FromRecordBatches({batch}).ValueOrDie();
+
     return ptr_as_jlong(new result_t{result});
   }
   CATCH_STD(env, 0)
@@ -2746,7 +2763,21 @@ Java_ai_rapids_cudf_Table_convertArrowTableToCudf(JNIEnv* env, jclass, jlong arr
 
   try {
     cudf::jni::auto_set_device(env);
-    return convert_table_for_return(env, cudf::from_arrow(*(handle->get())));
+
+    ArrowSchema sch;
+    if (!arrow::ExportSchema(*handle->get()->schema(), &sch).ok()) {
+      JNI_THROW_NEW(env, "java/lang/RuntimeException", "Unable to produce an ArrowSchema", 0)
+    }
+    auto batch = handle->get()->CombineChunksToBatch().ValueOrDie();
+    ArrowArray arr;
+    if (!arrow::ExportRecordBatch(*batch, &arr).ok()) {
+      JNI_THROW_NEW(env, "java/lang/RuntimeException", "Unable to produce an ArrowArray", 0)
+    }
+    auto ret = cudf::from_arrow(&sch, &arr);
+    arr.release(&arr);
+    sch.release(&sch);
+
+    return convert_table_for_return(env, ret);
   }
   CATCH_STD(env, 0)
 }

From 1fd96756daf90b8d2f901fe19a168e9d11974c0b Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar <shruti.shivakumar@gmail.com>
Date: Wed, 21 Aug 2024 21:10:20 -0700
Subject: [PATCH 708/842] Fix overflow bug in low-memory JSON reader (#16632)

Bug fix for #16627.
Changes byte range offsets and sizes from `size_type` to `size_t` in pylibcudf.

Authors:
  - Shruti Shivakumar (https://github.com/shrshi)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16632
---
 python/pylibcudf/pylibcudf/io/json.pxd         |  4 ++--
 python/pylibcudf/pylibcudf/io/json.pyx         | 12 ++++++------
 python/pylibcudf/pylibcudf/libcudf/io/json.pxd | 12 ++++++------
 3 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/python/pylibcudf/pylibcudf/io/json.pxd b/python/pylibcudf/pylibcudf/io/json.pxd
index ab9b5b99ce2..f65c1034598 100644
--- a/python/pylibcudf/pylibcudf/io/json.pxd
+++ b/python/pylibcudf/pylibcudf/io/json.pxd
@@ -15,8 +15,8 @@ cpdef TableWithMetadata read_json(
     list dtypes = *,
     compression_type compression = *,
     bool lines = *,
-    size_type byte_range_offset = *,
-    size_type byte_range_size = *,
+    size_t byte_range_offset = *,
+    size_t byte_range_size = *,
     bool keep_quotes = *,
     bool mixed_types_as_string = *,
     bool prune_columns = *,
diff --git a/python/pylibcudf/pylibcudf/io/json.pyx b/python/pylibcudf/pylibcudf/io/json.pyx
index ce086f4a489..29e49083bc6 100644
--- a/python/pylibcudf/pylibcudf/io/json.pyx
+++ b/python/pylibcudf/pylibcudf/io/json.pyx
@@ -51,8 +51,8 @@ cdef json_reader_options _setup_json_reader_options(
         list dtypes,
         compression_type compression,
         bool lines,
-        size_type byte_range_offset,
-        size_type byte_range_size,
+        size_t byte_range_offset,
+        size_t byte_range_size,
         bool keep_quotes,
         bool mixed_types_as_string,
         bool prune_columns,
@@ -189,8 +189,8 @@ cpdef TableWithMetadata read_json(
     list dtypes = None,
     compression_type compression = compression_type.AUTO,
     bool lines = False,
-    size_type byte_range_offset = 0,
-    size_type byte_range_size = 0,
+    size_t byte_range_offset = 0,
+    size_t byte_range_size = 0,
     bool keep_quotes = False,
     bool mixed_types_as_string = False,
     bool prune_columns = False,
@@ -212,9 +212,9 @@ cpdef TableWithMetadata read_json(
         (column_child_name, column_child_type, list of grandchild dtypes).
     compression: CompressionType, default CompressionType.AUTO
         The compression format of the JSON source.
-    byte_range_offset : size_type, default 0
+    byte_range_offset : size_t, default 0
         Number of bytes to skip from source start.
-    byte_range_size : size_type, default 0
+    byte_range_size : size_t, default 0
         Number of bytes to read. By default, will read all bytes.
     keep_quotes : bool, default False
         Whether the reader should keep quotes of string values.
diff --git a/python/pylibcudf/pylibcudf/libcudf/io/json.pxd b/python/pylibcudf/pylibcudf/libcudf/io/json.pxd
index 7514e6c5258..1c74f8ca3ac 100644
--- a/python/pylibcudf/pylibcudf/libcudf/io/json.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/io/json.pxd
@@ -27,8 +27,8 @@ cdef extern from "cudf/io/json.hpp" \
         cudf_io_types.source_info get_source() except +
         vector[string] get_dtypes() except +
         cudf_io_types.compression_type get_compression() except +
-        size_type get_byte_range_offset() except +
-        size_type get_byte_range_size() except +
+        size_t get_byte_range_offset() except +
+        size_t get_byte_range_size() except +
         bool is_enabled_lines() except +
         bool is_enabled_mixed_types_as_string() except +
         bool is_enabled_prune_columns() except +
@@ -41,8 +41,8 @@ cdef extern from "cudf/io/json.hpp" \
         void set_compression(
             cudf_io_types.compression_type compression
         ) except +
-        void set_byte_range_offset(size_type offset) except +
-        void set_byte_range_size(size_type size) except +
+        void set_byte_range_offset(size_t offset) except +
+        void set_byte_range_size(size_t size) except +
         void enable_lines(bool val) except +
         void enable_mixed_types_as_string(bool val) except +
         void enable_prune_columns(bool val) except +
@@ -73,10 +73,10 @@ cdef extern from "cudf/io/json.hpp" \
             cudf_io_types.compression_type compression
         ) except +
         json_reader_options_builder& byte_range_offset(
-            size_type offset
+            size_t offset
         ) except +
         json_reader_options_builder& byte_range_size(
-            size_type size
+            size_t size
         ) except +
         json_reader_options_builder& lines(
             bool val

From 00ff2ee5ec2fd23c65e759dc2f9d2907a1c9cb00 Mon Sep 17 00:00:00 2001
From: "Richard (Rick) Zamora" <rzamora217@gmail.com>
Date: Thu, 22 Aug 2024 10:35:27 -0700
Subject: [PATCH 709/842] [FEA] Add filesystem argument to `cudf.read_parquet`
 (#16577)

This PR adds a `filesystem` kwarg to `cudf.read_parquet` (in alignment with [the pandas API](https://pandas.pydata.org/docs/reference/api/pandas.read_parquet.html)).

When a user has already constructed an `fsspec.AbstractFileSystem` object outside of cudf, they can now pass that object in to `read_parquet` to avoid redundant (and possibly inconsistent) filesystem inference. This PR also makes it possible for us to remove [explicit remote-IO handling from dask-cudf](https://github.com/rapidsai/cudf/blob/623dfceb42eb3e73b352b295898ff3e6cfe7c865/python/dask_cudf/dask_cudf/io/parquet.py#L100) (and consolidate the logic in cudf/ioutils).

Authors:
  - Richard (Rick) Zamora (https://github.com/rjzamora)

Approvers:
  - Mads R. B. Kristensen (https://github.com/madsbk)
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/16577
---
 python/cudf/cudf/io/parquet.py           |  5 ++-
 python/cudf/cudf/tests/test_s3.py        | 22 ++++++++++
 python/cudf/cudf/utils/ioutils.py        | 54 ++++++++++++++++++------
 python/dask_cudf/dask_cudf/io/parquet.py | 23 +++-------
 4 files changed, 75 insertions(+), 29 deletions(-)

diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
index fac51a9e471..560f257c115 100644
--- a/python/cudf/cudf/io/parquet.py
+++ b/python/cudf/cudf/io/parquet.py
@@ -527,6 +527,7 @@ def read_parquet(
     engine="cudf",
     columns=None,
     storage_options=None,
+    filesystem=None,
     filters=None,
     row_groups=None,
     use_pandas_metadata=True,
@@ -567,7 +568,9 @@ def read_parquet(
     # Start by trying construct a filesystem object, so we
     # can apply filters on remote file-systems
     fs, paths = ioutils._get_filesystem_and_paths(
-        path_or_data=filepath_or_buffer, storage_options=storage_options
+        path_or_data=filepath_or_buffer,
+        storage_options=storage_options,
+        filesystem=filesystem,
     )
 
     # Normalize and validate filters
diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py
index 6579fd23634..3b23a53091e 100644
--- a/python/cudf/cudf/tests/test_s3.py
+++ b/python/cudf/cudf/tests/test_s3.py
@@ -269,6 +269,28 @@ def test_read_parquet_ext(
     assert_eq(expect, got1)
 
 
+def test_read_parquet_filesystem(s3_base, s3so, pdf):
+    fname = "data.0.parquet"
+    # NOTE: Need a unique bucket name when a glob pattern
+    # is used, otherwise fsspec seems to cache the bucket
+    # contents, and later tests using the same bucket name
+    # will fail.
+    bucket = "test_read_parquet_filesystem"
+    buffer = BytesIO()
+    pdf.to_parquet(path=buffer)
+    buffer.seek(0)
+    fs = get_fs_token_paths("s3://", mode="rb", storage_options=s3so)[0]
+    with s3_context(
+        s3_base=s3_base,
+        bucket=bucket,
+        files={fname: buffer},
+    ):
+        # Check that a glob pattern works
+        path = f"s3://{bucket}/{'data.*.parquet'}"
+        got = cudf.read_parquet(path, filesystem=fs)
+    assert_eq(pdf, got)
+
+
 def test_read_parquet_multi_file(s3_base, s3so, pdf):
     fname_1 = "test_parquet_reader_multi_file_1.parquet"
     buffer_1 = BytesIO()
diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
index 4ac9b63985f..18106e7475b 100644
--- a/python/cudf/cudf/utils/ioutils.py
+++ b/python/cudf/cudf/utils/ioutils.py
@@ -12,7 +12,7 @@
 import fsspec.implementations.local
 import numpy as np
 import pandas as pd
-from fsspec.core import get_fs_token_paths
+from fsspec.core import expand_paths_if_needed, get_fs_token_paths
 
 from cudf.core._compat import PANDAS_LT_300
 from cudf.utils.docutils import docfmt_partial
@@ -139,6 +139,9 @@
     For other URLs (e.g. starting with "s3://", and "gcs://") the key-value
     pairs are forwarded to ``fsspec.open``. Please see ``fsspec`` and
     ``urllib`` for more details.
+filesystem : fsspec.AbstractFileSystem, default None
+    Filesystem object to use when reading the parquet data. This argument
+    should not be used at the same time as `storage_options`.
 filters : list of tuple, list of lists of tuples, default None
     If not None, specifies a filter predicate used to filter out row groups
     using statistics stored for each row group as Parquet metadata. Row groups
@@ -1536,11 +1539,18 @@ def is_directory(path_or_data, storage_options=None):
     return False
 
 
-def _get_filesystem_and_paths(path_or_data, storage_options):
+def _get_filesystem_and_paths(
+    path_or_data,
+    storage_options,
+    *,
+    filesystem=None,
+):
     # Returns a filesystem object and the filesystem-normalized
     # paths. If `path_or_data` does not correspond to a path or
     # list of paths (or if the protocol is not supported), the
     # return will be `None` for the fs and `[]` for the paths.
+    # If a filesystem object is already available, it can be
+    # passed with the `filesystem` argument.
 
     fs = None
     return_paths = path_or_data
@@ -1557,16 +1567,36 @@ def _get_filesystem_and_paths(path_or_data, storage_options):
         else:
             path_or_data = [path_or_data]
 
-        try:
-            fs, _, fs_paths = get_fs_token_paths(
-                path_or_data, mode="rb", storage_options=storage_options
-            )
-            return_paths = fs_paths
-        except ValueError as e:
-            if str(e).startswith("Protocol not known"):
-                return None, []
-            else:
-                raise e
+        if filesystem is None:
+            try:
+                fs, _, fs_paths = get_fs_token_paths(
+                    path_or_data, mode="rb", storage_options=storage_options
+                )
+                return_paths = fs_paths
+            except ValueError as e:
+                if str(e).startswith("Protocol not known"):
+                    return None, []
+                else:
+                    raise e
+        else:
+            if not isinstance(filesystem, fsspec.AbstractFileSystem):
+                raise ValueError(
+                    f"Expected fsspec.AbstractFileSystem. Got {filesystem}"
+                )
+
+            if storage_options:
+                raise ValueError(
+                    f"Cannot specify storage_options when an explicit "
+                    f"filesystem object is specified. Got: {storage_options}"
+                )
+
+            fs = filesystem
+            return_paths = [
+                fs._strip_protocol(u)
+                for u in expand_paths_if_needed(
+                    path_or_data, "rb", 1, fs, None
+                )
+            ]
 
     return fs, return_paths
 
diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py
index 8f52fce7818..c025280c240 100644
--- a/python/dask_cudf/dask_cudf/io/parquet.py
+++ b/python/dask_cudf/dask_cudf/io/parquet.py
@@ -23,11 +23,7 @@
 from cudf.io import write_to_dataset
 from cudf.io.parquet import _apply_post_filters, _normalize_filters
 from cudf.utils.dtypes import cudf_dtype_from_pa_type
-from cudf.utils.ioutils import (
-    _ROW_GROUP_SIZE_BYTES_DEFAULT,
-    _fsspec_data_transfer,
-    _is_local_filesystem,
-)
+from cudf.utils.ioutils import _ROW_GROUP_SIZE_BYTES_DEFAULT
 
 
 class CudfEngine(ArrowDatasetEngine):
@@ -93,40 +89,35 @@ def _read_paths(
         dataset_kwargs = dataset_kwargs or {}
         dataset_kwargs["partitioning"] = partitioning or "hive"
 
-        # Non-local filesystem handling
-        paths_or_fobs = paths
-        if not _is_local_filesystem(fs):
-            paths_or_fobs = [
-                _fsspec_data_transfer(fpath, fs=fs) for fpath in paths
-            ]
-
         # Use cudf to read in data
         try:
             df = cudf.read_parquet(
-                paths_or_fobs,
+                paths,
                 engine="cudf",
                 columns=columns,
                 row_groups=row_groups if row_groups else None,
                 dataset_kwargs=dataset_kwargs,
                 categorical_partitions=False,
+                filesystem=fs,
                 **kwargs,
             )
         except RuntimeError as err:
             # TODO: Remove try/except after null-schema issue is resolved
             # (See: https://github.com/rapidsai/cudf/issues/12702)
-            if len(paths_or_fobs) > 1:
+            if len(paths) > 1:
                 df = cudf.concat(
                     [
                         cudf.read_parquet(
-                            pof,
+                            path,
                             engine="cudf",
                             columns=columns,
                             row_groups=row_groups[i] if row_groups else None,
                             dataset_kwargs=dataset_kwargs,
                             categorical_partitions=False,
+                            filesystem=fs,
                             **kwargs,
                         )
-                        for i, pof in enumerate(paths_or_fobs)
+                        for i, path in enumerate(paths)
                     ]
                 )
             else:

From 81d71fce73306ae88bee1c78ed1f88e10916ad17 Mon Sep 17 00:00:00 2001
From: Jake Awe <50372925+AyodeAwe@users.noreply.github.com>
Date: Thu, 22 Aug 2024 12:56:09 -0500
Subject: [PATCH 710/842] update-version.sh fix (#16629)

Updates the `update-version.sh` script to include missed version
updates.
---
 ci/release/update-version.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index 132e58249e6..e79a91510b8 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -51,6 +51,7 @@ DEPENDENCIES=(
   kvikio
   libkvikio
   librmm
+  pylibcudf
   rapids-dask-dependency
   rmm
 )
@@ -77,7 +78,7 @@ for FILE in .github/workflows/*.yaml .github/workflows/*.yml; do
   sed_runner "/shared-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}"
   sed_runner "s/dask-cuda.git@branch-[^\"\s]\+/dask-cuda.git@branch-${NEXT_SHORT_TAG}/g" "${FILE}"
 done
-sed_runner "s/branch-[0-9]+\.[0-9]+/branch-${NEXT_SHORT_TAG}/g" ci/test_wheel_cudf_polars.sh
+sed_runner "s/branch-[0-9]\+\.[0-9]\+/branch-${NEXT_SHORT_TAG}/g" ci/test_wheel_cudf_polars.sh
 
 # Java files
 NEXT_FULL_JAVA_TAG="${NEXT_SHORT_TAG}.${PATCH_PEP440}-SNAPSHOT"

From e4e867aace96b80fccf030cc02a11f89cbb9c05f Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 22 Aug 2024 09:29:44 -1000
Subject: [PATCH 711/842] Annotate `ColumnAccessor._data` labels as `Hashable`
 (#16623)

The motivating change here is that since we store a dictionary of columns in `ColumnAccessor`, the labels should be `collections.abc.Hashable` and therefore we can type methods that select by key with this annotation.

This led to a mypy-typing-validation cascade that made me type the output of `def as_column(...) -> ColumnBase` which also lead to typing validation in several other files.

Namely there no logic changes here.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16623
---
 python/cudf/cudf/_lib/column.pyi              |  2 +-
 python/cudf/cudf/core/_internals/timezones.py |  2 +-
 python/cudf/cudf/core/column/categorical.py   |  6 +-
 python/cudf/cudf/core/column/column.py        | 22 ++++--
 python/cudf/cudf/core/column/lists.py         |  7 +-
 python/cudf/cudf/core/column/numerical.py     |  4 +-
 python/cudf/cudf/core/column/string.py        |  8 +-
 python/cudf/cudf/core/column_accessor.py      | 76 +++++++++++--------
 python/cudf/cudf/core/copy_types.py           | 19 +++--
 python/cudf/cudf/core/dataframe.py            |  2 +-
 python/cudf/cudf/core/indexed_frame.py        | 26 ++++---
 11 files changed, 105 insertions(+), 69 deletions(-)

diff --git a/python/cudf/cudf/_lib/column.pyi b/python/cudf/cudf/_lib/column.pyi
index bcab009c102..bb38488eefb 100644
--- a/python/cudf/cudf/_lib/column.pyi
+++ b/python/cudf/cudf/_lib/column.pyi
@@ -54,7 +54,7 @@ class Column:
     @property
     def mask_ptr(self) -> int: ...
     def set_base_mask(self, value: Buffer | None) -> None: ...
-    def set_mask(self, value: Buffer | None) -> Self: ...
+    def set_mask(self, value: ColumnBase | Buffer | None) -> Self: ...
     @property
     def null_count(self) -> int: ...
     @property
diff --git a/python/cudf/cudf/core/_internals/timezones.py b/python/cudf/cudf/core/_internals/timezones.py
index 29cb9d7bd12..fd89904e766 100644
--- a/python/cudf/cudf/core/_internals/timezones.py
+++ b/python/cudf/cudf/core/_internals/timezones.py
@@ -120,7 +120,7 @@ def _read_tzfile_as_columns(
 
         # this happens for UTC-like zones
         min_date = np.int64(np.iinfo("int64").min + 1).astype("M8[s]")
-        return (as_column([min_date]), as_column([np.timedelta64(0, "s")]))
+        return (as_column([min_date]), as_column([np.timedelta64(0, "s")]))  # type: ignore[return-value]
     return tuple(transition_times_and_offsets)  # type: ignore[return-value]
 
 
diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index 1fdaf9f8c07..a7e98e5218f 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -984,9 +984,9 @@ def find_and_replace(
         )
         replacement_col = catmap._data["index"].astype(replaced.codes.dtype)
 
-        replaced = column.as_column(replaced.codes)
+        replaced_codes = column.as_column(replaced.codes)
         output = libcudf.replace.replace(
-            replaced, to_replace_col, replacement_col
+            replaced_codes, to_replace_col, replacement_col
         )
 
         result = column.build_categorical_column(
@@ -1064,7 +1064,7 @@ def _validate_fillna_value(
                 raise TypeError(
                     "Cannot set a categorical with non-categorical data"
                 )
-            fill_value = fill_value._set_categories(
+            fill_value = cast(CategoricalColumn, fill_value)._set_categories(
                 self.categories,
             )
             return fill_value.codes.astype(self.codes.dtype)
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 27278120abb..60b4126ddd4 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -553,7 +553,7 @@ def __setitem__(self, key: Any, value: Any):
         """
 
         # Normalize value to scalar/column
-        value_normalized = (
+        value_normalized: cudf.Scalar | ColumnBase = (
             cudf.Scalar(value, dtype=self.dtype)
             if is_scalar(value)
             else as_column(value, dtype=self.dtype)
@@ -609,9 +609,12 @@ def _scatter_by_slice(
                 )
 
         # step != 1, create a scatter map with arange
-        scatter_map = as_column(
-            rng,
-            dtype=cudf.dtype(np.int32),
+        scatter_map = cast(
+            cudf.core.column.NumericalColumn,
+            as_column(
+                rng,
+                dtype=cudf.dtype(np.int32),
+            ),
         )
 
         return self._scatter_by_column(scatter_map, value)
@@ -1111,11 +1114,16 @@ def argsort(
         if (ascending and self.is_monotonic_increasing) or (
             not ascending and self.is_monotonic_decreasing
         ):
-            return as_column(range(len(self)))
+            return cast(
+                cudf.core.column.NumericalColumn, as_column(range(len(self)))
+            )
         elif (ascending and self.is_monotonic_decreasing) or (
             not ascending and self.is_monotonic_increasing
         ):
-            return as_column(range(len(self) - 1, -1, -1))
+            return cast(
+                cudf.core.column.NumericalColumn,
+                as_column(range(len(self) - 1, -1, -1)),
+            )
         else:
             return libcudf.sort.order_by(
                 [self], [ascending], na_position, stable=True
@@ -1752,7 +1760,7 @@ def as_column(
     nan_as_null: bool | None = None,
     dtype: Dtype | None = None,
     length: int | None = None,
-):
+) -> ColumnBase:
     """Create a Column from an arbitrary object
 
     Parameters
diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py
index 302f04a0e71..c6a39199e3b 100644
--- a/python/cudf/cudf/core/column/lists.py
+++ b/python/cudf/cudf/core/column/lists.py
@@ -256,7 +256,10 @@ def from_sequences(
                 offset += len(data)
                 offset_vals.append(offset)
 
-        offset_col = column.as_column(offset_vals, dtype=size_type_dtype)
+        offset_col = cast(
+            NumericalColumn,
+            column.as_column(offset_vals, dtype=size_type_dtype),
+        )
 
         # Build ListColumn
         res = cls(
@@ -338,7 +341,7 @@ def __init__(self, parent: ParentType):
 
     def get(
         self,
-        index: int,
+        index: int | ColumnLike,
         default: ScalarLike | ColumnLike | None = None,
     ) -> ParentType:
         """
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index a37355dfcda..90bec049831 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -142,7 +142,7 @@ def __setitem__(self, key: Any, value: Any):
         """
 
         # Normalize value to scalar/column
-        device_value = (
+        device_value: cudf.Scalar | ColumnBase = (
             cudf.Scalar(
                 value,
                 dtype=self.dtype
@@ -552,7 +552,7 @@ def _validate_fillna_value(
     ) -> cudf.Scalar | ColumnBase:
         """Align fill_value for .fillna based on column type."""
         if is_scalar(fill_value):
-            cudf_obj = cudf.Scalar(fill_value)
+            cudf_obj: cudf.Scalar | ColumnBase = cudf.Scalar(fill_value)
             if not as_column(cudf_obj).can_cast_safely(self.dtype):
                 raise TypeError(
                     f"Cannot safely cast non-equivalent "
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 6f7508822d4..16e6908f308 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -776,11 +776,13 @@ def contains(
             # TODO: we silently ignore the `regex=` flag here
             if case is False:
                 input_column = libstrings.to_lower(self._column)
-                pat = libstrings.to_lower(column.as_column(pat, dtype="str"))
+                col_pat = libstrings.to_lower(
+                    column.as_column(pat, dtype="str")
+                )
             else:
                 input_column = self._column
-                pat = column.as_column(pat, dtype="str")
-            result_col = libstrings.contains_multiple(input_column, pat)
+                col_pat = column.as_column(pat, dtype="str")
+            result_col = libstrings.contains_multiple(input_column, col_pat)
         return self._return_or_inplace(result_col)
 
     def like(self, pat: str, esc: str | None = None) -> SeriesOrIndex:
diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py
index 7aa3e5f8163..34076fa0060 100644
--- a/python/cudf/cudf/core/column_accessor.py
+++ b/python/cudf/cudf/core/column_accessor.py
@@ -6,7 +6,7 @@
 import sys
 from collections import abc
 from functools import cached_property, reduce
-from typing import TYPE_CHECKING, Any, Callable, Mapping
+from typing import TYPE_CHECKING, Any, Callable, Mapping, cast
 
 import numpy as np
 import pandas as pd
@@ -35,7 +35,7 @@ class _NestedGetItemDict(dict):
     """
 
     @classmethod
-    def from_zip(cls, data):
+    def from_zip(cls, data: abc.Iterator):
         """Create from zip, specialized factory for nesting."""
         obj = cls()
         for key, value in data:
@@ -91,12 +91,12 @@ class ColumnAccessor(abc.MutableMapping):
         column length and data.values() are all Columns
     """
 
-    _data: dict[Any, ColumnBase]
-    _level_names: tuple[Any, ...]
+    _data: dict[abc.Hashable, ColumnBase]
+    _level_names: tuple[abc.Hashable, ...]
 
     def __init__(
         self,
-        data: abc.MutableMapping[Any, ColumnBase] | Self,
+        data: abc.MutableMapping[abc.Hashable, ColumnBase] | Self,
         multiindex: bool = False,
         level_names=None,
         rangeindex: bool = False,
@@ -141,16 +141,16 @@ def __init__(
                 f"data must be a ColumnAccessor or MutableMapping, not {type(data).__name__}"
             )
 
-    def __iter__(self):
+    def __iter__(self) -> abc.Iterator:
         return iter(self._data)
 
-    def __getitem__(self, key: Any) -> ColumnBase:
+    def __getitem__(self, key: abc.Hashable) -> ColumnBase:
         return self._data[key]
 
-    def __setitem__(self, key: Any, value: ColumnBase) -> None:
+    def __setitem__(self, key: abc.Hashable, value: ColumnBase) -> None:
         self.set_by_label(key, value)
 
-    def __delitem__(self, key: Any) -> None:
+    def __delitem__(self, key: abc.Hashable) -> None:
         old_ncols = len(self._data)
         del self._data[key]
         new_ncols = len(self._data)
@@ -186,7 +186,7 @@ def _from_columns_like_self(
             Whether to verify column length and type.
         """
         if sys.version_info.major >= 3 and sys.version_info.minor >= 10:
-            data = zip(self.names, columns, strict=True)
+            data = zip(self.names, columns, strict=True)  # type: ignore[call-overload]
         else:
             columns = list(columns)
             if len(columns) != len(self.names):
@@ -205,7 +205,7 @@ def _from_columns_like_self(
         )
 
     @property
-    def level_names(self) -> tuple[Any, ...]:
+    def level_names(self) -> tuple[abc.Hashable, ...]:
         if self._level_names is None or len(self._level_names) == 0:
             return tuple((None,) * max(1, self.nlevels))
         else:
@@ -221,7 +221,7 @@ def nlevels(self) -> int:
             return len(next(iter(self.keys())))
 
     @property
-    def name(self) -> Any:
+    def name(self) -> abc.Hashable:
         return self.level_names[-1]
 
     @cached_property
@@ -232,7 +232,7 @@ def nrows(self) -> int:
             return len(next(iter(self.values())))
 
     @cached_property
-    def names(self) -> tuple[Any, ...]:
+    def names(self) -> tuple[abc.Hashable, ...]:
         return tuple(self.keys())
 
     @cached_property
@@ -291,7 +291,7 @@ def to_pandas_index(self) -> pd.Index:
                     )
                 elif cudf.api.types.infer_dtype(self.names) == "integer":
                     if len(self.names) == 1:
-                        start = self.names[0]
+                        start = cast(int, self.names[0])
                         return pd.RangeIndex(
                             start=start, stop=start + 1, step=1, name=self.name
                         )
@@ -299,7 +299,9 @@ def to_pandas_index(self) -> pd.Index:
                     if len(uniques) == 1 and uniques[0] != 0:
                         diff = uniques[0]
                         new_range = range(
-                            self.names[0], self.names[-1] + diff, diff
+                            cast(int, self.names[0]),
+                            cast(int, self.names[-1]) + diff,
+                            diff,
                         )
                         return pd.RangeIndex(new_range, name=self.name)
             result = pd.Index(
@@ -310,7 +312,9 @@ def to_pandas_index(self) -> pd.Index:
             )
         return result
 
-    def insert(self, name: Any, value: ColumnBase, loc: int = -1) -> None:
+    def insert(
+        self, name: abc.Hashable, value: ColumnBase, loc: int = -1
+    ) -> None:
         """
         Insert column into the ColumnAccessor at the specified location.
 
@@ -457,7 +461,7 @@ def select_by_index(self, index: Any) -> Self:
             verify=False,
         )
 
-    def swaplevel(self, i=-2, j=-1) -> Self:
+    def swaplevel(self, i: abc.Hashable = -2, j: abc.Hashable = -1) -> Self:
         """
         Swap level i with level j.
         Calling this method does not change the ordering of the values.
@@ -486,7 +490,7 @@ def swaplevel(self, i=-2, j=-1) -> Self:
 
         # swap old keys for i and j
         for n, row in enumerate(self.names):
-            new_keys[n][i], new_keys[n][j] = row[j], row[i]
+            new_keys[n][i], new_keys[n][j] = row[j], row[i]  # type: ignore[call-overload, index]
             new_dict.update({row: tuple(new_keys[n])})
 
         # TODO: Change to deep=False when copy-on-write is default
@@ -494,10 +498,10 @@ def swaplevel(self, i=-2, j=-1) -> Self:
 
         # swap level_names for i and j
         new_names = list(self.level_names)
-        new_names[i], new_names[j] = new_names[j], new_names[i]
+        new_names[i], new_names[j] = new_names[j], new_names[i]  # type: ignore[call-overload]
 
         return type(self)(
-            new_data,
+            new_data,  # type: ignore[arg-type]
             multiindex=self.multiindex,
             level_names=new_names,
             rangeindex=self.rangeindex,
@@ -505,7 +509,7 @@ def swaplevel(self, i=-2, j=-1) -> Self:
             verify=False,
         )
 
-    def set_by_label(self, key: Any, value: ColumnBase) -> None:
+    def set_by_label(self, key: abc.Hashable, value: ColumnBase) -> None:
         """
         Add (or modify) column by name.
 
@@ -555,7 +559,7 @@ def _select_by_label_list_like(self, key: tuple) -> Self:
             verify=False,
         )
 
-    def _select_by_label_grouped(self, key: Any) -> Self:
+    def _select_by_label_grouped(self, key: abc.Hashable) -> Self:
         result = self._grouped_data[key]
         if isinstance(result, column.ColumnBase):
             # self._grouped_data[key] = self._data[key] so skip validation
@@ -606,8 +610,12 @@ def _select_by_label_slice(self, key: slice) -> Self:
         )
 
     def _select_by_label_with_wildcard(self, key: tuple) -> Self:
-        key = self._pad_key(key, slice(None))
-        data = {k: self._data[k] for k in self.names if _keys_equal(k, key)}
+        pad_key = self._pad_key(key, slice(None))
+        data = {
+            k: self._data[k]
+            for k in self.names
+            if _keys_equal(k, pad_key)  # type: ignore[arg-type]
+        }
         return type(self)(
             data,
             multiindex=self.multiindex,
@@ -616,7 +624,9 @@ def _select_by_label_with_wildcard(self, key: tuple) -> Self:
             verify=False,
         )
 
-    def _pad_key(self, key: Any, pad_value="") -> Any:
+    def _pad_key(
+        self, key: abc.Hashable, pad_value: str | slice = ""
+    ) -> abc.Hashable:
         """
         Pad the provided key to a length equal to the number
         of levels.
@@ -628,7 +638,9 @@ def _pad_key(self, key: Any, pad_value="") -> Any:
         return key + (pad_value,) * (self.nlevels - len(key))
 
     def rename_levels(
-        self, mapper: Mapping[Any, Any] | Callable, level: int | None = None
+        self,
+        mapper: Mapping[abc.Hashable, abc.Hashable] | Callable,
+        level: int | None = None,
     ) -> Self:
         """
         Rename the specified levels of the given ColumnAccessor
@@ -701,14 +713,14 @@ def rename_column(x):
             verify=False,
         )
 
-    def droplevel(self, level) -> None:
+    def droplevel(self, level: int) -> None:
         # drop the nth level
         if level < 0:
             level += self.nlevels
 
         old_ncols = len(self._data)
         self._data = {
-            _remove_key_level(key, level): value
+            _remove_key_level(key, level): value  # type: ignore[arg-type]
             for key, value in self._data.items()
         }
         new_ncols = len(self._data)
@@ -722,7 +734,7 @@ def droplevel(self, level) -> None:
         self._clear_cache(old_ncols, new_ncols)
 
 
-def _keys_equal(target: Any, key: Any) -> bool:
+def _keys_equal(target: abc.Hashable, key: abc.Iterable) -> bool:
     """
     Compare `key` to `target`.
 
@@ -740,7 +752,7 @@ def _keys_equal(target: Any, key: Any) -> bool:
     return True
 
 
-def _remove_key_level(key: Any, level: int) -> Any:
+def _remove_key_level(key: tuple, level: int) -> abc.Hashable:
     """
     Remove a level from key. If detupleize is True, and if only a
     single level remains, convert the tuple to a scalar.
@@ -751,7 +763,9 @@ def _remove_key_level(key: Any, level: int) -> Any:
     return result
 
 
-def _get_level(x, nlevels, level_names):
+def _get_level(
+    x: abc.Hashable, nlevels: int, level_names: tuple[abc.Hashable, ...]
+) -> abc.Hashable:
     """Get the level index from a level number or name.
 
     If given an integer, this function will handle wraparound for
diff --git a/python/cudf/cudf/core/copy_types.py b/python/cudf/cudf/core/copy_types.py
index 6afbc0bbc65..16d8964f083 100644
--- a/python/cudf/cudf/core/copy_types.py
+++ b/python/cudf/cudf/core/copy_types.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, cast
 
@@ -44,15 +44,17 @@ class GatherMap:
         If the map is not in bounds.
     """
 
-    #: The gather map
-    column: "NumericalColumn"
     #: The number of rows the gather map has been validated for
     nrows: int
     #: Was the validation for nullify=True?
     nullify: bool
 
     def __init__(self, column: Any, nrows: int, *, nullify: bool):
-        self.column = cudf.core.column.as_column(column)
+        #: The gather map
+        self.column = cast(
+            cudf.core.column.NumericalColumn,
+            cudf.core.column.as_column(column),
+        )
         self.nrows = nrows
         self.nullify = nullify
         if len(self.column) == 0:
@@ -135,11 +137,12 @@ class BooleanMask:
         If the mask has the wrong number of rows
     """
 
-    #: The boolean mask
-    column: "NumericalColumn"
-
     def __init__(self, column: Any, nrows: int):
-        self.column = cudf.core.column.as_column(column)
+        #: The boolean mask
+        self.column = cast(
+            cudf.core.column.NumericalColumn,
+            cudf.core.column.as_column(column),
+        )
         if self.column.dtype.kind != "b":
             raise TypeError("Boolean mask must have bool dtype")
         if len(column) != nrows:
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 43693ec20b1..14b63c2b0d7 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -5830,7 +5830,7 @@ def from_records(
 
         df = cls._from_data(
             ColumnAccessor(
-                data=ca_data,
+                data=ca_data,  # type: ignore[arg-type]
                 multiindex=isinstance(
                     columns, (pd.MultiIndex, cudf.MultiIndex)
                 ),
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index e46e24dd0d8..60253b9ae5d 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -40,7 +40,7 @@
 from cudf.core._base_index import BaseIndex
 from cudf.core._compat import PANDAS_LT_300
 from cudf.core.buffer import acquire_spill_lock
-from cudf.core.column import ColumnBase, as_column
+from cudf.core.column import ColumnBase, NumericalColumn, as_column
 from cudf.core.column_accessor import ColumnAccessor
 from cudf.core.copy_types import BooleanMask, GatherMap
 from cudf.core.dtypes import ListDtype
@@ -3008,9 +3008,12 @@ def _slice(self, arg: slice, keep_index: bool = True) -> Self:
         if stride != 1:
             return self._gather(
                 GatherMap.from_column_unchecked(
-                    as_column(
-                        range(start, stop, stride),
-                        dtype=libcudf.types.size_type_dtype,
+                    cast(
+                        NumericalColumn,
+                        as_column(
+                            range(start, stop, stride),
+                            dtype=libcudf.types.size_type_dtype,
+                        ),
                     ),
                     len(self),
                     nullify=False,
@@ -4761,10 +4764,13 @@ def _sample_axis_0(
     ):
         try:
             gather_map = GatherMap.from_column_unchecked(
-                cudf.core.column.as_column(
-                    random_state.choice(
-                        len(self), size=n, replace=replace, p=weights
-                    )
+                cast(
+                    NumericalColumn,
+                    cudf.core.column.as_column(
+                        random_state.choice(
+                            len(self), size=n, replace=replace, p=weights
+                        )
+                    ),
                 ),
                 len(self),
                 nullify=False,
@@ -6599,7 +6605,7 @@ def _drop_rows_by_labels(
             level = 0
 
         levels_index = obj.index.get_level_values(level)
-        if errors == "raise" and not labels.isin(levels_index).all():
+        if errors == "raise" and not labels.isin(levels_index).all():  # type: ignore[union-attr]
             raise KeyError("One or more values not found in axis")
 
         if isinstance(level, int):
@@ -6649,7 +6655,7 @@ def _drop_rows_by_labels(
             )
 
     else:
-        if errors == "raise" and not labels.isin(obj.index).all():
+        if errors == "raise" and not labels.isin(obj.index).all():  # type: ignore[union-attr]
             raise KeyError("One or more values not found in axis")
 
         if isinstance(labels, ColumnBase):

From 8b20298c960387c825cfd1476bcf0bc9119df58e Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Thu, 22 Aug 2024 16:48:22 -0500
Subject: [PATCH 712/842] Move pragma once in rolling/jit/operation.hpp.
 (#16636)

I noticed from https://github.com/rapidsai/cudf/pull/16590#discussion_r1725842333 that there was one other file where `#pragma once` was not at the top. This PR fixes that.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16636
---
 cpp/src/rolling/jit/operation.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/rolling/jit/operation.hpp b/cpp/src/rolling/jit/operation.hpp
index f8a52c03d4e..3be739ec5bf 100644
--- a/cpp/src/rolling/jit/operation.hpp
+++ b/cpp/src/rolling/jit/operation.hpp
@@ -14,12 +14,12 @@
  * limitations under the License.
  */
 
+#pragma once
+
 #include "rolling/jit/operation-udf.hpp"
 
 #include <cudf/types.hpp>
 
-#pragma once
-
 struct rolling_udf_ptx {
   template <typename OutType, typename InType>
   static OutType operate(InType const* in_col, cudf::size_type start, cudf::size_type count)

From eaefcb4e9baa587f40bc6daa5452c170b9f9616b Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 22 Aug 2024 12:33:13 -1000
Subject: [PATCH 713/842] Support DecimalDtype meta in dask_cudf (#16634)

To enable some tpch benchmarking for dask-cudf

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Richard (Rick) Zamora (https://github.com/rjzamora)

URL: https://github.com/rapidsai/cudf/pull/16634
---
 python/dask_cudf/dask_cudf/backends.py        |  2 ++
 python/dask_cudf/dask_cudf/tests/test_join.py | 11 +++++++++++
 2 files changed, 13 insertions(+)

diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py
index 16b2c8959e2..5bd3eb5fa7f 100644
--- a/python/dask_cudf/dask_cudf/backends.py
+++ b/python/dask_cudf/dask_cudf/backends.py
@@ -134,6 +134,8 @@ def _get_non_empty_data(
         return cudf.core.column.as_column(
             np.arange(start=0, stop=2, dtype=s.dtype)
         )
+    elif isinstance(s.dtype, cudf.core.dtypes.DecimalDtype):
+        return cudf.core.column.as_column(range(2), dtype=s.dtype)
     else:
         raise TypeError(
             f"Don't know how to handle column of type {type(s).__name__}"
diff --git a/python/dask_cudf/dask_cudf/tests/test_join.py b/python/dask_cudf/dask_cudf/tests/test_join.py
index ed291ef31a7..3e078c47cdd 100644
--- a/python/dask_cudf/dask_cudf/tests/test_join.py
+++ b/python/dask_cudf/dask_cudf/tests/test_join.py
@@ -386,3 +386,14 @@ def test_issue_12773():
         expected.to_pandas(),
         check_index=False,
     )
+
+
+@pytest.mark.parametrize(
+    "typ", [cudf.Decimal32Dtype, cudf.Decimal64Dtype, cudf.Decimal128Dtype]
+)
+def test_merge_on_decimal(typ):
+    df = cudf.DataFrame({"a": [1], "b": [2]}, dtype=typ(1))
+    ddf = dask_cudf.from_cudf(df, npartitions=1)
+    result = ddf.merge(ddf, left_on="a", right_on="a")
+    expected = df.merge(df, left_on="a", right_on="a")
+    dd.assert_eq(result, expected)

From 83f68c920f51f9e69f2a5bf0fddf26babac2483b Mon Sep 17 00:00:00 2001
From: Robert Maynard <rmaynard@nvidia.com>
Date: Thu, 22 Aug 2024 18:59:47 -0400
Subject: [PATCH 714/842] Revert "Hide all gtest symbols in cudftestutil
 (#16546)" (#16644)

This reverts commit ac42bc870a65d807784cae63e25b9e9ca788eb23.

We need to revert #16546 as it broke the gtest builds for cudf. Therefore gtests that actually fail wouldn't properly report an error but silently continue and report as passed.

Authors:
  - Robert Maynard (https://github.com/robertmaynard)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16644
---
 cpp/cmake/thirdparty/get_gtest.cmake | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/cpp/cmake/thirdparty/get_gtest.cmake b/cpp/cmake/thirdparty/get_gtest.cmake
index ec8cbd8c568..10e6b026d9a 100644
--- a/cpp/cmake/thirdparty/get_gtest.cmake
+++ b/cpp/cmake/thirdparty/get_gtest.cmake
@@ -16,18 +16,9 @@
 function(find_and_configure_gtest)
   include(${rapids-cmake-dir}/cpm/gtest.cmake)
 
-  # Mark all the non explicit googletest symbols as hidden. This ensures that libcudftestutil can be
-  # used by consumers with a different shared gtest.
-  set(gtest_hide_internal_symbols ON)
-
   # Find or install GoogleTest
   rapids_cpm_gtest(BUILD_STATIC)
 
-  # Mark all the explicit googletest symbols as hidden. This ensures that libcudftestutil can be
-  # used by consumers with a different shared gtest.
-  if(TARGET gtest)
-    target_compile_definitions(gtest PUBLIC "$<BUILD_LOCAL_INTERFACE:GTEST_API_=>")
-  endif()
 endfunction()
 
 find_and_configure_gtest()

From 91f304ecb16dbe06c1405df42ada9b66875f61c8 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Fri, 23 Aug 2024 07:51:23 -0500
Subject: [PATCH 715/842] Enable testing `cudf.pandas` unit tests for all minor
 versions of pandas (#16595)

Fixes: https://github.com/rapidsai/cudf/issues/16537

This PR enables testing `cudf.pandas` unit tests with all minor versions of pandas-2

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16595
---
 .../fetch_pandas_versions.py                  | 24 +++++++++++++
 ci/cudf_pandas_scripts/run_tests.sh           | 36 ++++++++++++++++---
 .../cudf_pandas_tests/test_cudf_pandas.py     | 18 ++++++++++
 .../cudf/cudf_pandas_tests/test_profiler.py   |  8 +++++
 4 files changed, 82 insertions(+), 4 deletions(-)
 create mode 100644 ci/cudf_pandas_scripts/fetch_pandas_versions.py

diff --git a/ci/cudf_pandas_scripts/fetch_pandas_versions.py b/ci/cudf_pandas_scripts/fetch_pandas_versions.py
new file mode 100644
index 00000000000..b6913f947e8
--- /dev/null
+++ b/ci/cudf_pandas_scripts/fetch_pandas_versions.py
@@ -0,0 +1,24 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import requests
+from packaging.version import Version
+from packaging.specifiers import SpecifierSet
+import argparse
+
+def get_pandas_versions(pandas_range):
+    url = "https://pypi.org/pypi/pandas/json"
+    response = requests.get(url)
+    data = response.json()
+    versions = [Version(v) for v in data['releases']]
+    specifier = SpecifierSet(pandas_range.lstrip("pandas"))
+    matching_versions = [v for v in versions if v in specifier]
+    matching_minors = sorted(set(".".join((str(v.major), str(v.minor))) for v in matching_versions), key=Version)
+    return matching_minors
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Filter pandas versions by prefix.")
+    parser.add_argument("pandas_range", type=str, help="The version prefix to filter by.")
+    args = parser.parse_args()
+
+    versions = get_pandas_versions(args.pandas_range)
+    print(','.join(versions))
diff --git a/ci/cudf_pandas_scripts/run_tests.sh b/ci/cudf_pandas_scripts/run_tests.sh
index 8215ce729b3..5bfc083bcd3 100755
--- a/ci/cudf_pandas_scripts/run_tests.sh
+++ b/ci/cudf_pandas_scripts/run_tests.sh
@@ -9,13 +9,20 @@ RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${PWD}/test-results"}
 RAPIDS_COVERAGE_DIR=${RAPIDS_COVERAGE_DIR:-"${PWD}/coverage-results"}
 mkdir -p "${RAPIDS_TESTS_DIR}" "${RAPIDS_COVERAGE_DIR}"
 
+DEPENDENCIES_PATH="dependencies.yaml"
+package_name="pandas"
+
+# Use grep to find the line containing the package name and version constraint
+pandas_version_constraint=$(grep -oP "pandas>=\d+\.\d+,\<\d+\.\d+\.\d+dev\d+" $DEPENDENCIES_PATH)
+
 # Function to display script usage
 function display_usage {
-    echo "Usage: $0 [--no-cudf]"
+    echo "Usage: $0 [--no-cudf] [pandas-version]"
 }
 
 # Default value for the --no-cudf option
 no_cudf=false
+PANDAS_VERSION=""
 
 # Parse command-line arguments
 while [[ $# -gt 0 ]]; do
@@ -25,9 +32,14 @@ while [[ $# -gt 0 ]]; do
             shift
             ;;
         *)
-            echo "Error: Unknown option $1"
-            display_usage
-            exit 1
+            if [[ -z "$PANDAS_VERSION" ]]; then
+                PANDAS_VERSION=$1
+                shift
+            else
+                echo "Error: Unknown option $1"
+                display_usage
+                exit 1
+            fi
             ;;
     esac
 done
@@ -53,3 +65,19 @@ python -m pytest -p cudf.pandas \
     --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cudf-pandas-coverage.xml" \
     --cov-report=term \
     ./python/cudf/cudf_pandas_tests/
+
+output=$(python ci/cudf_pandas_scripts/fetch_pandas_versions.py $pandas_version_constraint)
+
+# Convert the comma-separated list into an array
+IFS=',' read -r -a versions <<< "$output"
+
+for version in "${versions[@]}"; do
+    echo "Installing pandas version: ${version}"
+    python -m pip install "pandas==${version}"
+    python -m pytest -p cudf.pandas \
+    --cov-config=./python/cudf/.coveragerc \
+    --cov=cudf \
+    --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cudf-pandas-coverage.xml" \
+    --cov-report=term \
+    ./python/cudf/cudf_pandas_tests/
+done
diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
index 6292022d8e4..028f5f173ac 100644
--- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
+++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
@@ -42,6 +42,8 @@
     get_calendar,
 )
 
+from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
+
 # Accelerated pandas has the real pandas and cudf modules as attributes
 pd = xpd._fsproxy_slow
 cudf = xpd._fsproxy_fast
@@ -607,6 +609,10 @@ def test_array_function_series_fallback(series):
     tm.assert_equal(expect, got)
 
 
+@pytest.mark.xfail(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="Fails in older versions of pandas",
+)
 def test_timedeltaproperties(series):
     psr, sr = series
     psr, sr = psr.astype("timedelta64[ns]"), sr.astype("timedelta64[ns]")
@@ -666,6 +672,10 @@ def test_maintain_container_subclasses(multiindex):
     assert isinstance(got, xpd.core.indexes.frozen.FrozenList)
 
 
+@pytest.mark.xfail(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="Fails in older versions of pandas due to unsupported boxcar window type",
+)
 def test_rolling_win_type():
     pdf = pd.DataFrame(range(5))
     df = xpd.DataFrame(range(5))
@@ -1281,6 +1291,10 @@ def max_times_two(self):
     assert s.max_times_two() == 6
 
 
+@pytest.mark.xfail(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="DatetimeArray.__floordiv__ missing in pandas-2.0.0",
+)
 def test_floordiv_array_vs_df():
     xarray = xpd.Series([1, 2, 3], dtype="datetime64[ns]").array
     parray = pd.Series([1, 2, 3], dtype="datetime64[ns]").array
@@ -1552,6 +1566,10 @@ def test_numpy_cupy_flatiter(series):
     assert type(arr.flat._fsproxy_slow) == np.flatiter
 
 
+@pytest.mark.xfail(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="pyarrow_numpy storage type was not supported in pandas-2.0.0",
+)
 def test_arrow_string_arrays():
     cu_s = xpd.Series(["a", "b", "c"])
     pd_s = pd.Series(["a", "b", "c"])
diff --git a/python/cudf/cudf_pandas_tests/test_profiler.py b/python/cudf/cudf_pandas_tests/test_profiler.py
index 588398265f2..5b7bde06d1d 100644
--- a/python/cudf/cudf_pandas_tests/test_profiler.py
+++ b/python/cudf/cudf_pandas_tests/test_profiler.py
@@ -5,6 +5,8 @@
 import os
 import subprocess
 
+import pytest
+
 from cudf.pandas import LOADED, Profiler
 
 if not LOADED:
@@ -13,7 +15,13 @@
 import numpy as np
 import pandas as pd
 
+from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
+
 
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="function names change across versions of pandas, so making sure it only runs on latest version of pandas",
+)
 def test_profiler():
     np.random.seed(42)
     with Profiler() as profiler:

From 8d6b2616af8aeec6dfd02d787084c583e2447791 Mon Sep 17 00:00:00 2001
From: Mike Sarahan <msarahan@gmail.com>
Date: Fri, 23 Aug 2024 10:47:40 -0500
Subject: [PATCH 716/842] adding wheel build for libcudf (#15483)

Contributes to https://github.com/rapidsai/build-planning/issues/33

Adds a standalone `libcudf` wheel, containing the `libcudf` C++ shared library.

Fixes #16588

Authors:
  - Mike Sarahan (https://github.com/msarahan)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15483
---
 .github/workflows/build.yaml                  | 20 ++++
 .github/workflows/pr.yaml                     | 11 ++-
 ci/build_wheel_cudf.sh                        | 26 ++++--
 ci/build_wheel_libcudf.sh                     | 15 +++
 ci/build_wheel_pylibcudf.sh                   | 22 ++++-
 ci/cudf_pandas_scripts/pandas-tests/run.sh    |  8 +-
 ci/cudf_pandas_scripts/run_tests.sh           |  8 +-
 ci/release/update-version.sh                  |  1 +
 ci/test_wheel_cudf.sh                         |  8 +-
 ci/test_wheel_cudf_polars.sh                  |  8 +-
 ci/test_wheel_dask_cudf.sh                    | 10 +-
 dependencies.yaml                             | 91 ++++++++++++++++++-
 python/cudf/CMakeLists.txt                    | 69 ++------------
 python/cudf/cudf/__init__.py                  | 10 ++
 python/cudf/cudf/_lib/CMakeLists.txt          |  1 +
 python/cudf/pyproject.toml                    |  3 +
 python/libcudf/CMakeLists.txt                 | 58 ++++++++++++
 python/libcudf/LICENSE                        |  1 +
 python/libcudf/README.md                      |  1 +
 .../cmake/Modules/WheelHelpers.cmake          |  0
 python/libcudf/libcudf/VERSION                |  1 +
 python/libcudf/libcudf/__init__.py            | 16 ++++
 python/libcudf/libcudf/_version.py            | 33 +++++++
 python/libcudf/libcudf/load.py                | 51 +++++++++++
 python/libcudf/pyproject.toml                 | 75 +++++++++++++++
 python/pylibcudf/CMakeLists.txt               | 68 ++------------
 python/pylibcudf/pylibcudf/CMakeLists.txt     |  2 +
 python/pylibcudf/pylibcudf/__init__.py        | 10 ++
 python/pylibcudf/pyproject.toml               |  3 +
 29 files changed, 476 insertions(+), 154 deletions(-)
 create mode 100755 ci/build_wheel_libcudf.sh
 create mode 100644 python/libcudf/CMakeLists.txt
 create mode 120000 python/libcudf/LICENSE
 create mode 120000 python/libcudf/README.md
 rename python/{pylibcudf => libcudf}/cmake/Modules/WheelHelpers.cmake (100%)
 create mode 120000 python/libcudf/libcudf/VERSION
 create mode 100644 python/libcudf/libcudf/__init__.py
 create mode 100644 python/libcudf/libcudf/_version.py
 create mode 100644 python/libcudf/libcudf/load.py
 create mode 100644 python/libcudf/pyproject.toml

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 9943b02a521..0ea4d5c54dc 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -67,7 +67,27 @@ jobs:
       node_type: "gpu-v100-latest-1"
       run_script: "ci/build_docs.sh"
       sha: ${{ inputs.sha }}
+  wheel-build-libcudf:
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
+    with:
+      build_type: ${{ inputs.build_type || 'branch' }}
+      branch: ${{ inputs.branch }}
+      sha: ${{ inputs.sha }}
+      date: ${{ inputs.date }}
+      script: ci/build_wheel_libcudf.sh
+  wheel-publish-libcudf:
+    needs: wheel-build-libcudf
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10
+    with:
+      build_type: ${{ inputs.build_type || 'branch' }}
+      branch: ${{ inputs.branch }}
+      sha: ${{ inputs.sha }}
+      date: ${{ inputs.date }}
+      package-name: libcudf
   wheel-build-pylibcudf:
+    needs: [wheel-publish-libcudf]
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
     with:
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 74bdc666c68..2e2a8b6b9bc 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -23,6 +23,7 @@ jobs:
       - static-configure
       - conda-notebook-tests
       - docs-build
+      - wheel-build-libcudf
       - wheel-build-pylibcudf
       - wheel-build-cudf
       - wheel-tests-cudf
@@ -121,10 +122,18 @@ jobs:
       arch: "amd64"
       container_image: "rapidsai/ci-conda:latest"
       run_script: "ci/build_docs.sh"
-  wheel-build-pylibcudf:
+  wheel-build-libcudf:
     needs: checks
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
+    with:
+      matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber)))
+      build_type: pull-request
+      script: "ci/build_wheel_libcudf.sh"
+  wheel-build-pylibcudf:
+    needs: [checks, wheel-build-libcudf]
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
     with:
       build_type: pull-request
       script: "ci/build_wheel_pylibcudf.sh"
diff --git a/ci/build_wheel_cudf.sh b/ci/build_wheel_cudf.sh
index 7c0fb1efebe..cf33703f544 100755
--- a/ci/build_wheel_cudf.sh
+++ b/ci/build_wheel_cudf.sh
@@ -5,16 +5,28 @@ set -euo pipefail
 
 package_dir="python/cudf"
 
-export SKBUILD_CMAKE_ARGS="-DUSE_LIBARROW_FROM_PYARROW=ON"
-
-# Download the pylibcudf built in the previous step
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
-RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 /tmp/pylibcudf_dist
 
-echo "pylibcudf-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo /tmp/pylibcudf_dist/pylibcudf_*.whl)" > /tmp/constraints.txt
+# Downloads libcudf and pylibcudf wheels from this current build,
+# then ensures 'cudf' wheel builds always use the 'libcudf' and 'pylibcudf' just built in the same CI run.
+#
+# Using env variable PIP_CONSTRAINT is necessary to ensure the constraints
+# are used when creating the isolated build environment.
+RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp /tmp/libcudf_dist
+RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python /tmp/pylibcudf_dist
+echo "libcudf-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo /tmp/libcudf_dist/libcudf_*.whl)" > /tmp/constraints.txt
+echo "pylibcudf-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo /tmp/pylibcudf_dist/pylibcudf_*.whl)" >> /tmp/constraints.txt
 export PIP_CONSTRAINT="/tmp/constraints.txt"
+
 ./ci/build_wheel.sh ${package_dir}
 
-python -m auditwheel repair -w ${package_dir}/final_dist ${package_dir}/dist/*
+python -m auditwheel repair \
+    --exclude libcudf.so \
+    --exclude libarrow.so.1601 \
+    --exclude libnvcomp.so \
+    --exclude libnvcomp_bitcomp.so \
+    --exclude libnvcomp_gdeflate.so \
+    -w ${package_dir}/final_dist \
+    ${package_dir}/dist/*
 
-RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 ${package_dir}/final_dist
+RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 python ${package_dir}/final_dist
diff --git a/ci/build_wheel_libcudf.sh b/ci/build_wheel_libcudf.sh
new file mode 100755
index 00000000000..9694c3f6144
--- /dev/null
+++ b/ci/build_wheel_libcudf.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+
+set -euo pipefail
+
+package_dir="python/libcudf"
+
+./ci/build_wheel.sh ${package_dir}
+
+RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
+
+mkdir -p ${package_dir}/final_dist
+python -m auditwheel repair --exclude libarrow.so.1601 -w ${package_dir}/final_dist ${package_dir}/dist/*
+
+RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 cpp ${package_dir}/final_dist
diff --git a/ci/build_wheel_pylibcudf.sh b/ci/build_wheel_pylibcudf.sh
index b25d118ff81..7181a49d397 100755
--- a/ci/build_wheel_pylibcudf.sh
+++ b/ci/build_wheel_pylibcudf.sh
@@ -5,12 +5,26 @@ set -euo pipefail
 
 package_dir="python/pylibcudf"
 
-export SKBUILD_CMAKE_ARGS="-DUSE_LIBARROW_FROM_PYARROW=ON"
+RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
 
-./ci/build_wheel.sh ${package_dir}
+# Downloads libcudf wheel from this current build,
+# then ensures 'pylibcudf' wheel builds always use the 'libcudf' just built in the same CI run.
+#
+# Using env variable PIP_CONSTRAINT is necessary to ensure the constraints
+# are used when creating the isolated build environment.
+RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp /tmp/libcudf_dist
+echo "libcudf-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo /tmp/libcudf_dist/libcudf_*.whl)" > /tmp/constraints.txt
+export PIP_CONSTRAINT="/tmp/constraints.txt"
 
-python -m auditwheel repair -w ${package_dir}/final_dist ${package_dir}/dist/*
+./ci/build_wheel.sh ${package_dir}
 
+python -m auditwheel repair \
+    --exclude libcudf.so \
+    --exclude libarrow.so.1601 \
+    --exclude libnvcomp.so \
+    --exclude libnvcomp_bitcomp.so \
+    --exclude libnvcomp_gdeflate.so \
+    -w ${package_dir}/final_dist \
+    ${package_dir}/dist/*
 
-RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
 RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 ${package_dir}/final_dist
diff --git a/ci/cudf_pandas_scripts/pandas-tests/run.sh b/ci/cudf_pandas_scripts/pandas-tests/run.sh
index 97c3139080f..e5cd4436a3a 100755
--- a/ci/cudf_pandas_scripts/pandas-tests/run.sh
+++ b/ci/cudf_pandas_scripts/pandas-tests/run.sh
@@ -12,13 +12,15 @@ rapids-logger "PR number: ${RAPIDS_REF_NAME:-"unknown"}"
 
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
 
-# Download the cudf and pylibcudf built in the previous step
-RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist
-RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist
+# Download the cudf, libcudf, and pylibcudf built in the previous step
+RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist
+RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./dist
+RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist
 
 # echo to expand wildcard before adding `[extra]` requires for pip
 python -m pip install \
   "$(echo ./dist/cudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test,pandas-tests]" \
+  "$(echo ./dist/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \
   "$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)"
 
 RESULTS_DIR=${RAPIDS_TESTS_DIR:-"$(mktemp -d)"}
diff --git a/ci/cudf_pandas_scripts/run_tests.sh b/ci/cudf_pandas_scripts/run_tests.sh
index 5bfc083bcd3..90ea1afbe6a 100755
--- a/ci/cudf_pandas_scripts/run_tests.sh
+++ b/ci/cudf_pandas_scripts/run_tests.sh
@@ -49,13 +49,15 @@ if [ "$no_cudf" = true ]; then
 else
     RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
 
-    # Download the cudf and pylibcudf built in the previous step
-    RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist
-    RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist
+    # Download the cudf, libcudf, and pylibcudf built in the previous step
+    RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist
+    RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./dist
+    RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist
 
     # echo to expand wildcard before adding `[extra]` requires for pip
     python -m pip install \
         "$(echo ./dist/cudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test,cudf-pandas-tests]" \
+        "$(echo ./dist/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \
         "$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)"
 fi
 
diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index e79a91510b8..be55b49870f 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -49,6 +49,7 @@ DEPENDENCIES=(
   dask-cuda
   dask-cudf
   kvikio
+  libcudf
   libkvikio
   librmm
   pylibcudf
diff --git a/ci/test_wheel_cudf.sh b/ci/test_wheel_cudf.sh
index 19131952098..6861d699695 100755
--- a/ci/test_wheel_cudf.sh
+++ b/ci/test_wheel_cudf.sh
@@ -5,13 +5,15 @@ set -eou pipefail
 
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
 
-# Download the cudf and pylibcudf built in the previous step
-RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist
-RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist
+# Download the cudf, libcudf, and pylibcudf built in the previous step
+RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist
+RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./dist
+RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist
 
 # echo to expand wildcard before adding `[extra]` requires for pip
 python -m pip install \
   "$(echo ./dist/cudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" \
+  "$(echo ./dist/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \
   "$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]"
 
 RESULTS_DIR=${RAPIDS_TESTS_DIR:-"$(mktemp -d)"}
diff --git a/ci/test_wheel_cudf_polars.sh b/ci/test_wheel_cudf_polars.sh
index e9c6188502c..0baf6c9e277 100755
--- a/ci/test_wheel_cudf_polars.sh
+++ b/ci/test_wheel_cudf_polars.sh
@@ -18,16 +18,18 @@ else
 fi
 
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
-RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 ./dist
+RAPIDS_PY_WHEEL_NAME="cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 python ./dist
 
-# Download pylibcudf built in the previous step
-RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist
+# Download libcudf and pylibcudf built in the previous step
+RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./dist
+RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist
 
 rapids-logger "Installing cudf_polars and its dependencies"
 
 # echo to expand wildcard before adding `[extra]` requires for pip
 python -m pip install \
     "$(echo ./dist/cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" \
+    "$(echo ./dist/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \
     "$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)"
 
 rapids-logger "Run cudf_polars tests"
diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh
index ff893a08e27..fa74b2398f7 100755
--- a/ci/test_wheel_dask_cudf.sh
+++ b/ci/test_wheel_dask_cudf.sh
@@ -4,16 +4,18 @@
 set -eou pipefail
 
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
-RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 ./dist
+RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" RAPIDS_PY_WHEEL_PURE="1" rapids-download-wheels-from-s3 python ./dist
 
-# Download the cudf and pylibcudf built in the previous step
-RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist
-RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist
+# Download the cudf, libcudf, and pylibcudf built in the previous step
+RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist
+RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./dist
+RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist
 
 # echo to expand wildcard before adding `[extra]` requires for pip
 python -m pip install \
   "$(echo ./dist/cudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \
   "$(echo ./dist/dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" \
+  "$(echo ./dist/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \
   "$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)"
 
 RESULTS_DIR=${RAPIDS_TESTS_DIR:-"$(mktemp -d)"}
diff --git a/dependencies.yaml b/dependencies.yaml
index 150d03be021..553d01735b2 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -13,6 +13,7 @@ files:
       - cuda
       - cuda_version
       - depends_on_cupy
+      - depends_on_librmm
       - depends_on_rmm
       - develop
       - docs
@@ -95,6 +96,8 @@ files:
       - build_base
       - build_python_common
       - depends_on_pylibcudf
+      - depends_on_libcudf
+      - depends_on_librmm
       - depends_on_rmm
   py_run_cudf:
     output: pyproject
@@ -106,6 +109,7 @@ files:
       - run_cudf
       - pyarrow_run
       - depends_on_cupy
+      - depends_on_libcudf
       - depends_on_pylibcudf
       - depends_on_rmm
   py_test_cudf:
@@ -117,6 +121,31 @@ files:
     includes:
       - test_python_common
       - test_python_cudf
+  py_build_libcudf:
+    output: pyproject
+    pyproject_dir: python/libcudf
+    extras:
+      table: build-system
+    includes:
+      - rapids_build_skbuild
+  py_rapids_build_libcudf:
+    output: pyproject
+    pyproject_dir: python/libcudf
+    extras:
+      table: tool.rapids-build-backend
+      key: requires
+    includes:
+      - build_base
+      - build_cpp
+      - build_python_libcudf
+      - depends_on_librmm
+  py_run_libcudf:
+    output: pyproject
+    pyproject_dir: python/libcudf
+    extras:
+      table: project
+    includes:
+      - pyarrow_run
   py_build_pylibcudf:
     output: pyproject
     pyproject_dir: python/pylibcudf
@@ -133,6 +162,8 @@ files:
     includes:
       - build_base
       - build_python_common
+      - depends_on_libcudf
+      - depends_on_librmm
       - depends_on_rmm
   py_run_pylibcudf:
     output: pyproject
@@ -140,6 +171,7 @@ files:
     extras:
       table: project
     includes:
+      - depends_on_libcudf
       - depends_on_rmm
       - pyarrow_run
       - run_pylibcudf
@@ -359,13 +391,18 @@ dependencies:
           - cython>=3.0.3
           # Hard pin the patch version used during the build. This must be kept
           # in sync with the version pinned in get_arrow.cmake.
-          - pyarrow==16.1.0.*
+          - &pyarrow_build pyarrow==16.1.0.*
       - output_types: pyproject
         packages:
           # Hard pin the patch version used during the build.
           # Sync with conda build constraint & wheel run constraint.
           # TODO: Change to `2.0.*` for NumPy 2
           - numpy==1.23.*
+  build_python_libcudf:
+    common:
+      - output_types: [conda, requirements, pyproject]
+        packages:
+          - *pyarrow_build
   libarrow_build:
     common:
       - output_types: conda
@@ -759,6 +796,31 @@ dependencies:
         packages:
           - dask-cuda==24.10.*,>=0.0.0a0
           - *numba
+  depends_on_libcudf:
+    common:
+      - output_types: conda
+        packages:
+          - &libcudf_unsuffixed libcudf==24.10.*,>=0.0.0a0
+      - output_types: requirements
+        packages:
+          # pip recognizes the index as a global option for the requirements.txt file
+          # This index is needed for libcudf-cu{11,12}.
+          - --extra-index-url=https://pypi.nvidia.com
+          - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
+    specific:
+      - output_types: [requirements, pyproject]
+        matrices:
+          - matrix:
+              cuda: "12.*"
+              cuda_suffixed: "true"
+            packages:
+              - libcudf-cu12==24.10.*,>=0.0.0a0
+          - matrix:
+              cuda: "11.*"
+              cuda_suffixed: "true"
+            packages:
+              - libcudf-cu11==24.10.*,>=0.0.0a0
+          - {matrix: null, packages: [*libcudf_unsuffixed]}
   depends_on_pylibcudf:
     common:
       - output_types: conda
@@ -849,6 +911,33 @@ dependencies:
             packages: &cupy_packages_cu11
               - cupy-cuda11x>=12.0.0
           - {matrix: null, packages: *cupy_packages_cu11}
+  depends_on_librmm:
+    common:
+      - output_types: conda
+        packages:
+          - &librmm_unsuffixed librmm==24.10.*,>=0.0.0a0
+      - output_types: requirements
+        packages:
+          # pip recognizes the index as a global option for the requirements.txt file
+          # This index is needed for librmm-cu{11,12}.
+          - --extra-index-url=https://pypi.nvidia.com
+          - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
+    specific:
+      - output_types: [requirements, pyproject]
+        matrices:
+          - matrix:
+              cuda: "12.*"
+              cuda_suffixed: "true"
+            packages:
+              - librmm-cu12==24.10.*,>=0.0.0a0
+          - matrix:
+              cuda: "11.*"
+              cuda_suffixed: "true"
+            packages:
+              - librmm-cu11==24.10.*,>=0.0.0a0
+          - matrix:
+            packages:
+              - *librmm_unsuffixed
   depends_on_rmm:
     common:
       - output_types: conda
diff --git a/python/cudf/CMakeLists.txt b/python/cudf/CMakeLists.txt
index e11d62b3bd5..72f20b30052 100644
--- a/python/cudf/CMakeLists.txt
+++ b/python/cudf/CMakeLists.txt
@@ -24,72 +24,15 @@ project(
   LANGUAGES CXX CUDA
 )
 
-option(FIND_CUDF_CPP "Search for existing CUDF C++ installations before defaulting to local files"
-       OFF
-)
-option(USE_LIBARROW_FROM_PYARROW "Only use the libarrow contained in pyarrow" OFF)
-mark_as_advanced(USE_LIBARROW_FROM_PYARROW)
-
-# Find Python early so that later commands can use it
-find_package(Python 3.9 REQUIRED COMPONENTS Interpreter)
-
-# If the user requested it we attempt to find CUDF.
-if(FIND_CUDF_CPP)
-  include(rapids-cpm)
-  include(rapids-export)
-  include(rapids-find)
-  rapids_cpm_init()
+find_package(cudf "${RAPIDS_VERSION}" REQUIRED)
 
-  if(USE_LIBARROW_FROM_PYARROW)
-    # We need to find arrow before libcudf since libcudf requires it but doesn't bundle arrow
-    # libraries. These variables have no effect because we are always searching for arrow via
-    # pyarrow, but they must be set as they are required arguments to the function in
-    # get_arrow.cmake.
-    set(CUDF_USE_ARROW_STATIC OFF)
-    set(CUDF_ENABLE_ARROW_S3 OFF)
-    set(CUDF_ENABLE_ARROW_ORC OFF)
-    set(CUDF_ENABLE_ARROW_PYTHON OFF)
-    set(CUDF_ENABLE_ARROW_PARQUET OFF)
-    include(../../cpp/cmake/thirdparty/get_arrow.cmake)
-  endif()
-
-  find_package(cudf "${RAPIDS_VERSION}" REQUIRED)
-
-  # an installed version of libcudf doesn't provide the dlpack headers so we need to download dlpack
-  # for the interop.pyx
-  include(../../cpp/cmake/thirdparty/get_dlpack.cmake)
-else()
-  set(cudf_FOUND OFF)
-endif()
+# an installed version of libcudf doesn't provide the dlpack headers so we need to download dlpack
+# for the interop.pyx
+include(rapids-cpm)
+rapids_cpm_init()
+include(../../cpp/cmake/thirdparty/get_dlpack.cmake)
 
 include(rapids-cython-core)
-
-if(NOT cudf_FOUND)
-  set(BUILD_TESTS OFF)
-  set(BUILD_BENCHMARKS OFF)
-  set(CUDF_BUILD_TESTUTIL OFF)
-  set(CUDF_BUILD_STREAMS_TEST_UTIL OFF)
-  set(CUDA_STATIC_RUNTIME ON)
-
-  add_subdirectory(../../cpp cudf-cpp EXCLUDE_FROM_ALL)
-
-  # libcudf targets are excluded by default above via EXCLUDE_FROM_ALL to remove extraneous
-  # components like headers from libcudacxx, but we do need the libraries. However, we want to
-  # control where they are installed to. Since there are multiple subpackages of cudf._lib that
-  # require access to libcudf, we place the library and all its dependent artifacts in the cudf
-  # directory as a single source of truth and modify the other rpaths appropriately.
-  set(cython_lib_dir cudf)
-  include(../pylibcudf/cmake/Modules/WheelHelpers.cmake)
-  # TODO: This install is currently overzealous. We should only install the libraries that are
-  # downloaded by CPM during the build, not libraries that were found on the system.  However, in
-  # practice right this would only be a problem is if libcudf was not found but some of the
-  # dependencies were, and we have no real use cases where that happens.
-  install_aliased_imported_targets(
-    TARGETS cudf arrow_shared nvcomp::nvcomp nvcomp::nvcomp_gdeflate nvcomp::nvcomp_bitcomp
-    DESTINATION ${cython_lib_dir}
-  )
-endif()
-
 rapids_cython_init()
 
 include(../pylibcudf/cmake/Modules/LinkPyarrowHeaders.cmake)
diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py
index ccc45413de4..d7da42a1708 100644
--- a/python/cudf/cudf/__init__.py
+++ b/python/cudf/cudf/__init__.py
@@ -1,5 +1,15 @@
 # Copyright (c) 2018-2024, NVIDIA CORPORATION.
 
+# If libcudf was installed as a wheel, we must request it to load the library symbols.
+# Otherwise, we assume that the library was installed in a system path that ld can find.
+try:
+    import libcudf
+except ModuleNotFoundError:
+    pass
+else:
+    libcudf.load_library()
+    del libcudf
+
 # _setup_numba _must be called before numba.cuda is imported, because
 # it sets the numba config variable responsible for enabling
 # Minor Version Compatibility. Setting it after importing numba.cuda has no effect.
diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt
index d6182673308..5ea378fc0e5 100644
--- a/python/cudf/cudf/_lib/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/CMakeLists.txt
@@ -63,6 +63,7 @@ rapids_cython_create_modules(
 )
 
 target_link_libraries(strings_udf PUBLIC cudf_strings_udf)
+target_include_directories(interop PUBLIC "$<BUILD_INTERFACE:${DLPACK_INCLUDE_DIR}>")
 
 set(targets_using_arrow_headers avro csv orc json parquet)
 link_to_pyarrow_headers("${targets_using_arrow_headers}")
diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml
index 9db52164eca..cb9fa30afab 100644
--- a/python/cudf/pyproject.toml
+++ b/python/cudf/pyproject.toml
@@ -23,6 +23,7 @@ dependencies = [
     "cuda-python>=11.7.1,<12.0a0",
     "cupy-cuda11x>=12.0.0",
     "fsspec>=0.6.0",
+    "libcudf==24.10.*,>=0.0.0a0",
     "numba>=0.57",
     "numpy>=1.23,<2.0a0",
     "nvtx>=0.2.1",
@@ -126,6 +127,8 @@ matrix-entry = "cuda_suffixed=true"
 requires = [
     "cmake>=3.26.4,!=3.30.0",
     "cython>=3.0.3",
+    "libcudf==24.10.*,>=0.0.0a0",
+    "librmm==24.10.*,>=0.0.0a0",
     "ninja",
     "numpy==1.23.*",
     "pyarrow==16.1.0.*",
diff --git a/python/libcudf/CMakeLists.txt b/python/libcudf/CMakeLists.txt
new file mode 100644
index 00000000000..09c7ed2e217
--- /dev/null
+++ b/python/libcudf/CMakeLists.txt
@@ -0,0 +1,58 @@
+# =============================================================================
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+# =============================================================================
+
+cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR)
+
+include(../../rapids_config.cmake)
+
+project(
+  libcudf-python
+  VERSION "${RAPIDS_VERSION}"
+  LANGUAGES CXX
+)
+
+# Check if cudf is already available. If so, it is the user's responsibility to ensure that the
+# CMake package is also available at build time of the Python cudf package.
+find_package(cudf "${RAPIDS_VERSION}")
+
+if(cudf_FOUND)
+  return()
+endif()
+
+unset(cudf_FOUND)
+
+# For wheels, this should always be true
+set(USE_LIBARROW_FROM_PYARROW ON)
+
+# Find Python early so that later commands can use it
+find_package(Python 3.10 REQUIRED COMPONENTS Interpreter)
+
+set(BUILD_TESTS OFF)
+set(BUILD_BENCHMARKS OFF)
+set(CUDF_BUILD_TESTUTIL OFF)
+set(CUDF_BUILD_STREAMS_TEST_UTIL OFF)
+set(CUDA_STATIC_RUNTIME ON)
+
+set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
+
+include(../pylibcudf/cmake/Modules/LinkPyarrowHeaders.cmake)
+
+add_subdirectory(../../cpp cudf-cpp)
+
+# Ensure other libraries needed by libcudf.so get installed alongside it.
+include(cmake/Modules/WheelHelpers.cmake)
+install_aliased_imported_targets(
+  TARGETS cudf arrow_shared nvcomp::nvcomp nvcomp::nvcomp_gdeflate nvcomp::nvcomp_bitcomp
+  DESTINATION ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}
+)
diff --git a/python/libcudf/LICENSE b/python/libcudf/LICENSE
new file mode 120000
index 00000000000..30cff7403da
--- /dev/null
+++ b/python/libcudf/LICENSE
@@ -0,0 +1 @@
+../../LICENSE
\ No newline at end of file
diff --git a/python/libcudf/README.md b/python/libcudf/README.md
new file mode 120000
index 00000000000..fe840054137
--- /dev/null
+++ b/python/libcudf/README.md
@@ -0,0 +1 @@
+../../README.md
\ No newline at end of file
diff --git a/python/pylibcudf/cmake/Modules/WheelHelpers.cmake b/python/libcudf/cmake/Modules/WheelHelpers.cmake
similarity index 100%
rename from python/pylibcudf/cmake/Modules/WheelHelpers.cmake
rename to python/libcudf/cmake/Modules/WheelHelpers.cmake
diff --git a/python/libcudf/libcudf/VERSION b/python/libcudf/libcudf/VERSION
new file mode 120000
index 00000000000..d62dc733efd
--- /dev/null
+++ b/python/libcudf/libcudf/VERSION
@@ -0,0 +1 @@
+../../../VERSION
\ No newline at end of file
diff --git a/python/libcudf/libcudf/__init__.py b/python/libcudf/libcudf/__init__.py
new file mode 100644
index 00000000000..10c476cbe89
--- /dev/null
+++ b/python/libcudf/libcudf/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from libcudf._version import __git_commit__, __version__
+from libcudf.load import load_library
diff --git a/python/libcudf/libcudf/_version.py b/python/libcudf/libcudf/_version.py
new file mode 100644
index 00000000000..7dd732b4905
--- /dev/null
+++ b/python/libcudf/libcudf/_version.py
@@ -0,0 +1,33 @@
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import importlib.resources
+
+__version__ = (
+    importlib.resources.files(__package__)
+    .joinpath("VERSION")
+    .read_text()
+    .strip()
+)
+try:
+    __git_commit__ = (
+        importlib.resources.files(__package__)
+        .joinpath("GIT_COMMIT")
+        .read_text()
+        .strip()
+    )
+except FileNotFoundError:
+    __git_commit__ = ""
+
+__all__ = ["__git_commit__", "__version__"]
diff --git a/python/libcudf/libcudf/load.py b/python/libcudf/libcudf/load.py
new file mode 100644
index 00000000000..f6ba0d51bdb
--- /dev/null
+++ b/python/libcudf/libcudf/load.py
@@ -0,0 +1,51 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import ctypes
+import os
+
+
+def load_library():
+    # This is loading the libarrow shared library in situations where it comes from the
+    # pyarrow package (i.e. when installed as a wheel).
+    import pyarrow  # noqa: F401
+
+    # Dynamically load libcudf.so. Prefer a system library if one is present to
+    # avoid clobbering symbols that other packages might expect, but if no
+    # other library is present use the one in the wheel.
+    libcudf_lib = None
+    try:
+        libcudf_lib = ctypes.CDLL("libcudf.so", ctypes.RTLD_GLOBAL)
+    except OSError:
+        # If neither of these directories contain the library, we assume we are in an
+        # environment where the C++ library is already installed somewhere else and the
+        # CMake build of the libcudf Python package was a no-op.
+        #
+        # Note that this approach won't work for real editable installs of the libcudf package.
+        # scikit-build-core has limited support for importlib.resources so there isn't a clean
+        # way to support that case yet.
+        for lib_dir in ("lib", "lib64"):
+            if os.path.isfile(
+                lib := os.path.join(
+                    os.path.dirname(__file__), lib_dir, "libcudf.so"
+                )
+            ):
+                libcudf_lib = ctypes.CDLL(lib, ctypes.RTLD_GLOBAL)
+                break
+
+    # The caller almost never needs to do anything with this library, but no
+    # harm in offering the option since this object at least provides a handle
+    # to inspect where libcudf was loaded from.
+    return libcudf_lib
diff --git a/python/libcudf/pyproject.toml b/python/libcudf/pyproject.toml
new file mode 100644
index 00000000000..fd01f7f6e2f
--- /dev/null
+++ b/python/libcudf/pyproject.toml
@@ -0,0 +1,75 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[build-system]
+build-backend = "rapids_build_backend.build"
+requires = [
+    "rapids-build-backend>=0.3.0,<0.4.0.dev0",
+    "scikit-build-core[pyproject]>=0.10.0",
+] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
+
+[project]
+name = "libcudf"
+dynamic = ["version"]
+description = "cuDF - GPU Dataframe (C++)"
+readme = { file = "README.md", content-type = "text/markdown" }
+authors = [
+    { name = "NVIDIA Corporation" },
+]
+license = { text = "Apache 2.0" }
+requires-python = ">=3.10"
+classifiers = [
+    "Intended Audience :: Developers",
+    "Topic :: Database",
+    "Topic :: Scientific/Engineering",
+    "License :: OSI Approved :: Apache Software License",
+    "Programming Language :: C++",
+    "Environment :: GPU :: NVIDIA CUDA",
+]
+dependencies = [
+    "pyarrow>=16.1.0,<16.2.0a0",
+] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
+
+[project.urls]
+Homepage = "https://github.com/rapidsai/cudf"
+
+[project.entry-points."cmake.prefix"]
+libcudf = "libcudf"
+
+[tool.scikit-build]
+build-dir = "build/{wheel_tag}"
+cmake.build-type = "Release"
+cmake.version = "CMakeLists.txt"
+minimum-version = "build-system.requires"
+ninja.make-fallback = true
+sdist.reproducible = true
+wheel.packages = ["libcudf"]
+wheel.install-dir = "libcudf"
+wheel.py-api = "py3"
+
+[tool.scikit-build.metadata.version]
+provider = "scikit_build_core.metadata.regex"
+input = "libcudf/VERSION"
+regex = "(?P<value>.*)"
+
+[tool.rapids-build-backend]
+build-backend = "scikit_build_core.build"
+dependencies-file = "../../dependencies.yaml"
+matrix-entry = "cuda_suffixed=true"
+requires = [
+    "cmake>=3.26.4,!=3.30.0",
+    "librmm==24.10.*,>=0.0.0a0",
+    "ninja",
+    "pyarrow==16.1.0.*",
+] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
diff --git a/python/pylibcudf/CMakeLists.txt b/python/pylibcudf/CMakeLists.txt
index 424d8372280..340ad120377 100644
--- a/python/pylibcudf/CMakeLists.txt
+++ b/python/pylibcudf/CMakeLists.txt
@@ -24,72 +24,16 @@ project(
   LANGUAGES CXX CUDA
 )
 
-option(FIND_CUDF_CPP "Search for existing CUDF C++ installations before defaulting to local files"
-       OFF
-)
-option(USE_LIBARROW_FROM_PYARROW "Only use the libarrow contained in pyarrow" OFF)
-mark_as_advanced(USE_LIBARROW_FROM_PYARROW)
-
-# Find Python early so that later commands can use it
-find_package(Python 3.9 REQUIRED COMPONENTS Interpreter)
-
-# If the user requested it we attempt to find CUDF.
-if(FIND_CUDF_CPP)
-  include(rapids-cpm)
-  include(rapids-export)
-  include(rapids-find)
-  rapids_cpm_init()
-
-  if(USE_LIBARROW_FROM_PYARROW)
-    # We need to find arrow before libcudf since libcudf requires it but doesn't bundle arrow
-    # libraries. These variables have no effect because we are always searching for arrow via
-    # pyarrow, but they must be set as they are required arguments to the function in
-    # get_arrow.cmake.
-    set(CUDF_USE_ARROW_STATIC OFF)
-    set(CUDF_ENABLE_ARROW_S3 OFF)
-    set(CUDF_ENABLE_ARROW_ORC OFF)
-    set(CUDF_ENABLE_ARROW_PYTHON OFF)
-    set(CUDF_ENABLE_ARROW_PARQUET OFF)
-    include(../../cpp/cmake/thirdparty/get_arrow.cmake)
-  endif()
-
-  find_package(cudf "${RAPIDS_VERSION}" REQUIRED)
+find_package(cudf "${RAPIDS_VERSION}" REQUIRED)
 
-  # an installed version of libcudf doesn't provide the dlpack headers so we need to download dlpack
-  # for the interop.pyx
-  include(../../cpp/cmake/thirdparty/get_dlpack.cmake)
-else()
-  set(cudf_FOUND OFF)
-endif()
+# an installed version of libcudf doesn't provide the dlpack headers so we need to download dlpack
+# for the interop.pyx
+include(rapids-cpm)
+rapids_cpm_init()
+include(../../cpp/cmake/thirdparty/get_dlpack.cmake)
 
 include(rapids-cython-core)
 
-if(NOT cudf_FOUND)
-  set(BUILD_TESTS OFF)
-  set(BUILD_BENCHMARKS OFF)
-  set(CUDF_BUILD_TESTUTIL OFF)
-  set(CUDF_BUILD_STREAMS_TEST_UTIL OFF)
-  set(CUDA_STATIC_RUNTIME ON)
-
-  add_subdirectory(../../cpp cudf-cpp EXCLUDE_FROM_ALL)
-
-  # libcudf targets are excluded by default above via EXCLUDE_FROM_ALL to remove extraneous
-  # components like headers from libcudacxx, but we do need the libraries. However, we want to
-  # control where they are installed to. Since there are multiple subpackages of pylibcudf that
-  # require access to libcudf, we place the library and all its dependent artifacts in the cudf
-  # directory as a single source of truth and modify the other rpaths appropriately.
-  set(cython_lib_dir pylibcudf)
-  include(cmake/Modules/WheelHelpers.cmake)
-  # TODO: This install is currently overzealous. We should only install the libraries that are
-  # downloaded by CPM during the build, not libraries that were found on the system.  However, in
-  # practice right this would only be a problem is if libcudf was not found but some of the
-  # dependencies were, and we have no real use cases where that happens.
-  install_aliased_imported_targets(
-    TARGETS cudf arrow_shared nvcomp::nvcomp nvcomp::nvcomp_gdeflate nvcomp::nvcomp_bitcomp
-    DESTINATION ${cython_lib_dir}
-  )
-endif()
-
 rapids_cython_init()
 
 include(cmake/Modules/LinkPyarrowHeaders.cmake)
diff --git a/python/pylibcudf/pylibcudf/CMakeLists.txt b/python/pylibcudf/pylibcudf/CMakeLists.txt
index ab21bfe97ab..f81a32e07f9 100644
--- a/python/pylibcudf/pylibcudf/CMakeLists.txt
+++ b/python/pylibcudf/pylibcudf/CMakeLists.txt
@@ -53,6 +53,8 @@ rapids_cython_create_modules(
   LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_ ASSOCIATED_TARGETS cudf
 )
 
+target_include_directories(pylibcudf_interop PUBLIC "$<BUILD_INTERFACE:${DLPACK_INCLUDE_DIR}>")
+
 include(${rapids-cmake-dir}/export/find_package_root.cmake)
 include(../../../cpp/cmake/thirdparty/get_nanoarrow.cmake)
 target_link_libraries(pylibcudf_interop PUBLIC nanoarrow)
diff --git a/python/pylibcudf/pylibcudf/__init__.py b/python/pylibcudf/pylibcudf/__init__.py
index 677fdaf80d0..e784c6c6dd5 100644
--- a/python/pylibcudf/pylibcudf/__init__.py
+++ b/python/pylibcudf/pylibcudf/__init__.py
@@ -1,5 +1,15 @@
 # Copyright (c) 2023-2024, NVIDIA CORPORATION.
 
+# If libcudf was installed as a wheel, we must request it to load the library symbols.
+# Otherwise, we assume that the library was installed in a system path that ld can find.
+try:
+    import libcudf
+except ModuleNotFoundError:
+    pass
+else:
+    libcudf.load_library()
+    del libcudf
+
 from . import (
     aggregation,
     binaryop,
diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml
index b037508d03f..63d76e9fd4e 100644
--- a/python/pylibcudf/pyproject.toml
+++ b/python/pylibcudf/pyproject.toml
@@ -19,6 +19,7 @@ license = { text = "Apache 2.0" }
 requires-python = ">=3.9"
 dependencies = [
     "cuda-python>=11.7.1,<12.0a0",
+    "libcudf==24.10.*,>=0.0.0a0",
     "nvtx>=0.2.1",
     "packaging",
     "pyarrow>=16.1.0,<16.2.0a0",
@@ -101,6 +102,8 @@ matrix-entry = "cuda_suffixed=true"
 requires = [
     "cmake>=3.26.4,!=3.30.0",
     "cython>=3.0.3",
+    "libcudf==24.10.*,>=0.0.0a0",
+    "librmm==24.10.*,>=0.0.0a0",
     "ninja",
     "numpy==1.23.*",
     "pyarrow==16.1.0.*",

From a7ca3afb251805face3dd3248381f4cc9503e143 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Fri, 23 Aug 2024 12:24:30 -0700
Subject: [PATCH 717/842] Add the missing `num_aggregations` axis for
 `groupby_max_cardinality` (#16630)

This PR fixes a minor bug where the `num_aggregations` axis was missed when working on #16154.

Authors:
  - Yunsong Wang (https://github.com/PointKernel)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/16630
---
 cpp/benchmarks/groupby/group_max.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/cpp/benchmarks/groupby/group_max.cpp b/cpp/benchmarks/groupby/group_max.cpp
index f41285008c4..b9a701a71f4 100644
--- a/cpp/benchmarks/groupby/group_max.cpp
+++ b/cpp/benchmarks/groupby/group_max.cpp
@@ -101,4 +101,5 @@ NVBENCH_BENCH_TYPES(bench_groupby_max,
 
 NVBENCH_BENCH_TYPES(bench_groupby_max_cardinality, NVBENCH_TYPE_AXES(nvbench::type_list<int32_t>))
   .set_name("groupby_max_cardinality")
+  .add_int64_axis("num_aggregations", {1})
   .add_int64_axis("cardinality", {10, 20, 50, 100, 1'000, 10'000, 100'000, 1'000'000, 10'000'000});

From 7bd14a58cd10504c99044a2d33159bc3d59e7139 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Fri, 23 Aug 2024 15:27:12 -0500
Subject: [PATCH 718/842] Add pylibcudf build dir in build.sh for `clean`
 (#16648)

This PR adds `pylibcudf` build dir in `build.sh` for `clean` to properly delete the pylibcudf build files and folders.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16648
---
 build.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/build.sh b/build.sh
index 957f41aedac..211e1db9fbf 100755
--- a/build.sh
+++ b/build.sh
@@ -54,10 +54,11 @@ KAFKA_LIB_BUILD_DIR=${KAFKA_LIB_BUILD_DIR:=${REPODIR}/cpp/libcudf_kafka/build}
 CUDF_KAFKA_BUILD_DIR=${REPODIR}/python/cudf_kafka/build
 CUDF_BUILD_DIR=${REPODIR}/python/cudf/build
 DASK_CUDF_BUILD_DIR=${REPODIR}/python/dask_cudf/build
+PYLIBCUDF_BUILD_DIR=${REPODIR}/python/pylibcudf/build
 CUSTREAMZ_BUILD_DIR=${REPODIR}/python/custreamz/build
 CUDF_JAR_JAVA_BUILD_DIR="$REPODIR/java/target"
 
-BUILD_DIRS="${LIB_BUILD_DIR} ${CUDF_BUILD_DIR} ${DASK_CUDF_BUILD_DIR} ${KAFKA_LIB_BUILD_DIR} ${CUDF_KAFKA_BUILD_DIR} ${CUSTREAMZ_BUILD_DIR} ${CUDF_JAR_JAVA_BUILD_DIR}"
+BUILD_DIRS="${LIB_BUILD_DIR} ${CUDF_BUILD_DIR} ${DASK_CUDF_BUILD_DIR} ${KAFKA_LIB_BUILD_DIR} ${CUDF_KAFKA_BUILD_DIR} ${CUSTREAMZ_BUILD_DIR} ${CUDF_JAR_JAVA_BUILD_DIR} ${PYLIBCUDF_BUILD_DIR}"
 
 # Set defaults for vars modified by flags to this script
 VERBOSE_FLAG=""

From 7ca6a8cfb40291d28dbd0a99e00275e1b4fc869b Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Fri, 23 Aug 2024 16:53:59 -0400
Subject: [PATCH 719/842] fix libcudf wheel publishing, make package-type
 explicit in wheel publishing (#16650)

Follow-up to #15483.

Contributes to https://github.com/rapidsai/build-planning/issues/33

Wheel publishing for `libcudf` is failing like this:

```text
Error:  File "./dist/*.whl" does not exist
```

([build link](https://github.com/rapidsai/cudf/actions/runs/10528569930/job/29176811683))

Because the `package-type` was not set to `cpp` in the `wheels-publish` CI workflow, and that workflow defaults to `python`. ([shared-workflows code link](https://github.com/rapidsai/shared-workflows/blob/157e9824e6e2181fca9aa5c4bea4defd4cc322b0/.github/workflows/wheels-publish.yaml#L23-L26)).

This fixes that, and makes that choice explicit for all wheel publishing jobs.

References for this `package-type` argument:

* https://github.com/rapidsai/shared-workflows/pull/209
* https://github.com/rapidsai/gha-tools/pull/105

Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16650
---
 .github/workflows/build.yaml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 0ea4d5c54dc..72daff7b66b 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -86,6 +86,7 @@ jobs:
       sha: ${{ inputs.sha }}
       date: ${{ inputs.date }}
       package-name: libcudf
+      package-type: cpp
   wheel-build-pylibcudf:
     needs: [wheel-publish-libcudf]
     secrets: inherit
@@ -106,6 +107,7 @@ jobs:
       sha: ${{ inputs.sha }}
       date: ${{ inputs.date }}
       package-name: pylibcudf
+      package-type: python
   wheel-build-cudf:
     needs: wheel-publish-pylibcudf
     secrets: inherit
@@ -126,6 +128,7 @@ jobs:
       sha: ${{ inputs.sha }}
       date: ${{ inputs.date }}
       package-name: cudf
+      package-type: python
   wheel-build-dask-cudf:
     needs: wheel-publish-cudf
     secrets: inherit
@@ -148,6 +151,7 @@ jobs:
       sha: ${{ inputs.sha }}
       date: ${{ inputs.date }}
       package-name: dask_cudf
+      package-type: python
   wheel-build-cudf-polars:
     needs: wheel-publish-pylibcudf
     secrets: inherit
@@ -170,6 +174,7 @@ jobs:
       sha: ${{ inputs.sha }}
       date: ${{ inputs.date }}
       package-name: cudf_polars
+      package-type: python
   trigger-pandas-tests:
     if: inputs.build_type == 'nightly'
     needs: wheel-build-cudf

From 508bdea0dac581d5a33ceb609766c419ef51bbbb Mon Sep 17 00:00:00 2001
From: jakirkham <jakirkham@gmail.com>
Date: Fri, 23 Aug 2024 19:15:07 -0700
Subject: [PATCH 720/842] Rebuild for & Support NumPy 2 (#16300)

Part of issue: https://github.com/rapidsai/build-planning/issues/38

Start building `cudf` with `numpy` version `2.0`. This remains compatible with `numpy` version `1.x` and `2.x`. Allows us to test building with `numpy` version `2.0` (and make sure we catch any issues that show up). Also relaxes the `numpy` `1.x` pin. Pulls in the RDFG changes that are rolling out for broader RAPIDS NumPy 2 support.

Authors:
  - https://github.com/jakirkham
  - Sebastian Berg (https://github.com/seberg)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Ray Douglass (https://github.com/raydouglass)
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/cudf/pull/16300
---
 ci/cudf_pandas_scripts/run_tests.sh              | 2 +-
 conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +-
 conda/environments/all_cuda-125_arch-x86_64.yaml | 2 +-
 conda/recipes/cudf/meta.yaml                     | 6 ++----
 conda/recipes/pylibcudf/meta.yaml                | 6 ++----
 dependencies.yaml                                | 8 +++-----
 python/cudf/pyproject.toml                       | 4 ++--
 python/cudf_kafka/pyproject.toml                 | 2 +-
 python/dask_cudf/pyproject.toml                  | 2 +-
 python/pylibcudf/pyproject.toml                  | 2 +-
 10 files changed, 15 insertions(+), 21 deletions(-)

diff --git a/ci/cudf_pandas_scripts/run_tests.sh b/ci/cudf_pandas_scripts/run_tests.sh
index 90ea1afbe6a..39056d58d56 100755
--- a/ci/cudf_pandas_scripts/run_tests.sh
+++ b/ci/cudf_pandas_scripts/run_tests.sh
@@ -75,7 +75,7 @@ IFS=',' read -r -a versions <<< "$output"
 
 for version in "${versions[@]}"; do
     echo "Installing pandas version: ${version}"
-    python -m pip install "pandas==${version}"
+    python -m pip install "numpy>=1.23,<2.0a0" "pandas==${version}"
     python -m pytest -p cudf.pandas \
     --cov-config=./python/cudf/.coveragerc \
     --cov=cudf \
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 018162bd848..5cf7508ba51 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -57,7 +57,7 @@ dependencies:
 - notebook
 - numba>=0.57
 - numpy
-- numpy>=1.23,<2.0a0
+- numpy>=1.23,<3.0a0
 - numpydoc
 - nvcc_linux-64=11.8
 - nvcomp==3.0.6
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index c60ffa7aaa5..28b927254f7 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -56,7 +56,7 @@ dependencies:
 - notebook
 - numba>=0.57
 - numpy
-- numpy>=1.23,<2.0a0
+- numpy>=1.23,<3.0a0
 - numpydoc
 - nvcomp==3.0.6
 - nvtx>=0.2.1
diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml
index 7e86147732e..b2dad767da4 100644
--- a/conda/recipes/cudf/meta.yaml
+++ b/conda/recipes/cudf/meta.yaml
@@ -64,8 +64,7 @@ requirements:
     - rapids-build-backend >=0.3.0,<0.4.0.dev0
     - scikit-build-core >=0.10.0
     - dlpack >=0.8,<1.0
-    # TODO: Change to `2.0` for NumPy 2
-    - numpy 1.23
+    - numpy 2.0
     - pyarrow ==16.1.0.*
     - libcudf ={{ version }}
     - pylibcudf ={{ version }}
@@ -84,8 +83,7 @@ requirements:
     - pandas >=2.0,<2.2.3dev0
     - cupy >=12.0.0
     - numba >=0.57
-    # TODO: Update `numpy` in `host` when dropping `<2.0a0`
-    - numpy >=1.23,<2.0a0
+    - numpy >=1.23,<3.0a0
     - {{ pin_compatible('pyarrow', max_pin='x.x') }}
     - libcudf ={{ version }}
     - pylibcudf ={{ version }}
diff --git a/conda/recipes/pylibcudf/meta.yaml b/conda/recipes/pylibcudf/meta.yaml
index f405fd10f5d..fef78467027 100644
--- a/conda/recipes/pylibcudf/meta.yaml
+++ b/conda/recipes/pylibcudf/meta.yaml
@@ -64,8 +64,7 @@ requirements:
     - rapids-build-backend >=0.3.0,<0.4.0.dev0
     - scikit-build-core >=0.10.0
     - dlpack >=0.8,<1.0
-    # TODO: Change to `2.0` for NumPy 2
-    - numpy 1.23
+    - numpy 2.0
     - pyarrow ==16.1.0.*
     - libcudf ={{ version }}
     - rmm ={{ minor_version }}
@@ -81,8 +80,7 @@ requirements:
     - python
     - typing_extensions >=4.0.0
     - pandas >=2.0,<2.2.3dev0
-    # TODO: Update `numpy` in `host` when dropping `<2.0a0`
-    - numpy >=1.23,<2.0a0
+    - numpy >=1.23,<3.0a0
     - {{ pin_compatible('pyarrow', max_pin='x.x') }}
     - {{ pin_compatible('rmm', max_pin='x.x') }}
     - fsspec >=0.6.0
diff --git a/dependencies.yaml b/dependencies.yaml
index 553d01735b2..194577817db 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -394,10 +394,9 @@ dependencies:
           - &pyarrow_build pyarrow==16.1.0.*
       - output_types: pyproject
         packages:
-          # Hard pin the patch version used during the build.
+          # Hard pin the version used during the build.
           # Sync with conda build constraint & wheel run constraint.
-          # TODO: Change to `2.0.*` for NumPy 2
-          - numpy==1.23.*
+          - numpy==2.0.*
   build_python_libcudf:
     common:
       - output_types: [conda, requirements, pyproject]
@@ -605,8 +604,7 @@ dependencies:
       - output_types: [conda, requirements, pyproject]
         packages:
           - fsspec>=0.6.0
-          # TODO: Update `numpy` in `build_python_common` when dropping `<2.0a0`
-          - numpy>=1.23,<2.0a0
+          - numpy>=1.23,<3.0a0
           - pandas>=2.0,<2.2.3dev0
   run_pylibcudf:
     common:
diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml
index cb9fa30afab..e7bac17f8ba 100644
--- a/python/cudf/pyproject.toml
+++ b/python/cudf/pyproject.toml
@@ -25,7 +25,7 @@ dependencies = [
     "fsspec>=0.6.0",
     "libcudf==24.10.*,>=0.0.0a0",
     "numba>=0.57",
-    "numpy>=1.23,<2.0a0",
+    "numpy>=1.23,<3.0a0",
     "nvtx>=0.2.1",
     "packaging",
     "pandas>=2.0,<2.2.3dev0",
@@ -130,7 +130,7 @@ requires = [
     "libcudf==24.10.*,>=0.0.0a0",
     "librmm==24.10.*,>=0.0.0a0",
     "ninja",
-    "numpy==1.23.*",
+    "numpy==2.0.*",
     "pyarrow==16.1.0.*",
     "pylibcudf==24.10.*,>=0.0.0a0",
     "rmm==24.10.*,>=0.0.0a0",
diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml
index 63c5b07c5f3..2d0222a3fe9 100644
--- a/python/cudf_kafka/pyproject.toml
+++ b/python/cudf_kafka/pyproject.toml
@@ -106,6 +106,6 @@ requires = [
     "cmake>=3.26.4,!=3.30.0",
     "cython>=3.0.3",
     "ninja",
-    "numpy==1.23.*",
+    "numpy==2.0.*",
     "pyarrow==16.1.0.*",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml
index 872ecd35c28..d5da7030a75 100644
--- a/python/dask_cudf/pyproject.toml
+++ b/python/dask_cudf/pyproject.toml
@@ -22,7 +22,7 @@ dependencies = [
     "cudf==24.10.*,>=0.0.0a0",
     "cupy-cuda11x>=12.0.0",
     "fsspec>=0.6.0",
-    "numpy>=1.23,<2.0a0",
+    "numpy>=1.23,<3.0a0",
     "pandas>=2.0,<2.2.3dev0",
     "rapids-dask-dependency==24.10.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml
index 63d76e9fd4e..5f5594b462b 100644
--- a/python/pylibcudf/pyproject.toml
+++ b/python/pylibcudf/pyproject.toml
@@ -105,7 +105,7 @@ requires = [
     "libcudf==24.10.*,>=0.0.0a0",
     "librmm==24.10.*,>=0.0.0a0",
     "ninja",
-    "numpy==1.23.*",
+    "numpy==2.0.*",
     "pyarrow==16.1.0.*",
     "rmm==24.10.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.

From 96f2cc5262e5b6b0f50109d327857e306214b3a4 Mon Sep 17 00:00:00 2001
From: Robert Maynard <rmaynard@nvidia.com>
Date: Mon, 26 Aug 2024 10:21:48 -0400
Subject: [PATCH 721/842] Remove CUDA whole compilation ODR violations (#16603)

CUDA whole compilation mode requires that all kernels are only launched from TUs that compile them. Previously libcudf would compile a subset of kernels in separate TUs from where they are launched.
To keep compile times ( and library size ) as low as possible I have introduced a single C++ function call between the original call site and the kernel launch. In testing this neglibile differences on compile time and binary size.

Authors:
  - Robert Maynard (https://github.com/robertmaynard)

Approvers:
  - Mark Harris (https://github.com/harrism)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16603
---
 cpp/src/join/mixed_join.cu                   | 191 ++++++++++---------
 cpp/src/join/mixed_join_kernel.cu            |  10 +-
 cpp/src/join/mixed_join_kernel.cuh           |  64 +++++--
 cpp/src/join/mixed_join_kernel.hpp           |  80 ++++++++
 cpp/src/join/mixed_join_kernel_nulls.cu      |  10 +-
 cpp/src/join/mixed_join_kernels.cuh          | 124 ------------
 cpp/src/join/mixed_join_kernels_semi.cu      |  86 +++++----
 cpp/src/join/mixed_join_kernels_semi.cuh     |  29 +--
 cpp/src/join/mixed_join_semi.cu              |  38 ++--
 cpp/src/join/mixed_join_size_kernel.cu       |  12 +-
 cpp/src/join/mixed_join_size_kernel.cuh      |  64 +++++--
 cpp/src/join/mixed_join_size_kernel.hpp      |  85 +++++++++
 cpp/src/join/mixed_join_size_kernel_nulls.cu |  12 +-
 13 files changed, 472 insertions(+), 333 deletions(-)
 create mode 100644 cpp/src/join/mixed_join_kernel.hpp
 delete mode 100644 cpp/src/join/mixed_join_kernels.cuh
 create mode 100644 cpp/src/join/mixed_join_size_kernel.hpp

diff --git a/cpp/src/join/mixed_join.cu b/cpp/src/join/mixed_join.cu
index 48b94c777de..eb12065c6a9 100644
--- a/cpp/src/join/mixed_join.cu
+++ b/cpp/src/join/mixed_join.cu
@@ -16,7 +16,8 @@
 
 #include "join_common_utils.cuh"
 #include "join_common_utils.hpp"
-#include "mixed_join_kernels.cuh"
+#include "mixed_join_kernel.hpp"
+#include "mixed_join_size_kernel.hpp"
 
 #include <cudf/ast/detail/expression_parser.hpp>
 #include <cudf/ast/expressions.hpp>
@@ -178,9 +179,6 @@ mixed_join(
     join_size            = output_size_data->first;
     matches_per_row_span = output_size_data->second;
   } else {
-    // Allocate storage for the counter used to get the size of the join output
-    rmm::device_scalar<std::size_t> size(0, stream, mr);
-
     matches_per_row =
       rmm::device_uvector<size_type>{static_cast<std::size_t>(outer_num_rows), stream, mr};
     // Note that the view goes out of scope after this else statement, but the
@@ -190,37 +188,38 @@ mixed_join(
     matches_per_row_span = cudf::device_span<size_type const>{
       matches_per_row->begin(), static_cast<std::size_t>(outer_num_rows)};
     if (has_nulls) {
-      compute_mixed_join_output_size<DEFAULT_JOIN_BLOCK_SIZE, true>
-        <<<config.num_blocks, config.num_threads_per_block, shmem_size_per_block, stream.value()>>>(
-          *left_conditional_view,
-          *right_conditional_view,
-          *probe_view,
-          *build_view,
-          hash_probe,
-          equality_probe,
-          kernel_join_type,
-          hash_table_view,
-          parser.device_expression_data,
-          swap_tables,
-          size.data(),
-          mutable_matches_per_row_span);
+      join_size = launch_compute_mixed_join_output_size<true>(*left_conditional_view,
+                                                              *right_conditional_view,
+                                                              *probe_view,
+                                                              *build_view,
+                                                              hash_probe,
+                                                              equality_probe,
+                                                              kernel_join_type,
+                                                              hash_table_view,
+                                                              parser.device_expression_data,
+                                                              swap_tables,
+                                                              mutable_matches_per_row_span,
+                                                              config,
+                                                              shmem_size_per_block,
+                                                              stream,
+                                                              mr);
     } else {
-      compute_mixed_join_output_size<DEFAULT_JOIN_BLOCK_SIZE, false>
-        <<<config.num_blocks, config.num_threads_per_block, shmem_size_per_block, stream.value()>>>(
-          *left_conditional_view,
-          *right_conditional_view,
-          *probe_view,
-          *build_view,
-          hash_probe,
-          equality_probe,
-          kernel_join_type,
-          hash_table_view,
-          parser.device_expression_data,
-          swap_tables,
-          size.data(),
-          mutable_matches_per_row_span);
+      join_size = launch_compute_mixed_join_output_size<false>(*left_conditional_view,
+                                                               *right_conditional_view,
+                                                               *probe_view,
+                                                               *build_view,
+                                                               hash_probe,
+                                                               equality_probe,
+                                                               kernel_join_type,
+                                                               hash_table_view,
+                                                               parser.device_expression_data,
+                                                               swap_tables,
+                                                               mutable_matches_per_row_span,
+                                                               config,
+                                                               shmem_size_per_block,
+                                                               stream,
+                                                               mr);
     }
-    join_size = size.value(stream);
   }
 
   // The initial early exit clauses guarantee that we will not reach this point
@@ -249,37 +248,39 @@ mixed_join(
   auto const& join_output_r = right_indices->data();
 
   if (has_nulls) {
-    mixed_join<DEFAULT_JOIN_BLOCK_SIZE, true>
-      <<<config.num_blocks, config.num_threads_per_block, shmem_size_per_block, stream.value()>>>(
-        *left_conditional_view,
-        *right_conditional_view,
-        *probe_view,
-        *build_view,
-        hash_probe,
-        equality_probe,
-        kernel_join_type,
-        hash_table_view,
-        join_output_l,
-        join_output_r,
-        parser.device_expression_data,
-        join_result_offsets.data(),
-        swap_tables);
+    launch_mixed_join<true>(*left_conditional_view,
+                            *right_conditional_view,
+                            *probe_view,
+                            *build_view,
+                            hash_probe,
+                            equality_probe,
+                            kernel_join_type,
+                            hash_table_view,
+                            join_output_l,
+                            join_output_r,
+                            parser.device_expression_data,
+                            join_result_offsets.data(),
+                            swap_tables,
+                            config,
+                            shmem_size_per_block,
+                            stream);
   } else {
-    mixed_join<DEFAULT_JOIN_BLOCK_SIZE, false>
-      <<<config.num_blocks, config.num_threads_per_block, shmem_size_per_block, stream.value()>>>(
-        *left_conditional_view,
-        *right_conditional_view,
-        *probe_view,
-        *build_view,
-        hash_probe,
-        equality_probe,
-        kernel_join_type,
-        hash_table_view,
-        join_output_l,
-        join_output_r,
-        parser.device_expression_data,
-        join_result_offsets.data(),
-        swap_tables);
+    launch_mixed_join<false>(*left_conditional_view,
+                             *right_conditional_view,
+                             *probe_view,
+                             *build_view,
+                             hash_probe,
+                             equality_probe,
+                             kernel_join_type,
+                             hash_table_view,
+                             join_output_l,
+                             join_output_r,
+                             parser.device_expression_data,
+                             join_result_offsets.data(),
+                             swap_tables,
+                             config,
+                             shmem_size_per_block,
+                             stream);
   }
 
   auto join_indices = std::pair(std::move(left_indices), std::move(right_indices));
@@ -423,9 +424,6 @@ compute_mixed_join_output_size(table_view const& left_equality,
   detail::grid_1d const config(outer_num_rows, DEFAULT_JOIN_BLOCK_SIZE);
   auto const shmem_size_per_block = parser.shmem_per_thread * config.num_threads_per_block;
 
-  // Allocate storage for the counter used to get the size of the join output
-  rmm::device_scalar<std::size_t> size(0, stream, mr);
-
   auto const preprocessed_probe =
     experimental::row::equality::preprocessed_table::create(probe, stream);
   auto const row_hash   = cudf::experimental::row::hash::row_hasher{preprocessed_probe};
@@ -436,39 +434,42 @@ compute_mixed_join_output_size(table_view const& left_equality,
 
   // Determine number of output rows without actually building the output to simply
   // find what the size of the output will be.
+  std::size_t size = 0;
   if (has_nulls) {
-    compute_mixed_join_output_size<DEFAULT_JOIN_BLOCK_SIZE, true>
-      <<<config.num_blocks, config.num_threads_per_block, shmem_size_per_block, stream.value()>>>(
-        *left_conditional_view,
-        *right_conditional_view,
-        *probe_view,
-        *build_view,
-        hash_probe,
-        equality_probe,
-        join_type,
-        hash_table_view,
-        parser.device_expression_data,
-        swap_tables,
-        size.data(),
-        matches_per_row_span);
+    size = launch_compute_mixed_join_output_size<true>(*left_conditional_view,
+                                                       *right_conditional_view,
+                                                       *probe_view,
+                                                       *build_view,
+                                                       hash_probe,
+                                                       equality_probe,
+                                                       join_type,
+                                                       hash_table_view,
+                                                       parser.device_expression_data,
+                                                       swap_tables,
+                                                       matches_per_row_span,
+                                                       config,
+                                                       shmem_size_per_block,
+                                                       stream,
+                                                       mr);
   } else {
-    compute_mixed_join_output_size<DEFAULT_JOIN_BLOCK_SIZE, false>
-      <<<config.num_blocks, config.num_threads_per_block, shmem_size_per_block, stream.value()>>>(
-        *left_conditional_view,
-        *right_conditional_view,
-        *probe_view,
-        *build_view,
-        hash_probe,
-        equality_probe,
-        join_type,
-        hash_table_view,
-        parser.device_expression_data,
-        swap_tables,
-        size.data(),
-        matches_per_row_span);
+    size = launch_compute_mixed_join_output_size<false>(*left_conditional_view,
+                                                        *right_conditional_view,
+                                                        *probe_view,
+                                                        *build_view,
+                                                        hash_probe,
+                                                        equality_probe,
+                                                        join_type,
+                                                        hash_table_view,
+                                                        parser.device_expression_data,
+                                                        swap_tables,
+                                                        matches_per_row_span,
+                                                        config,
+                                                        shmem_size_per_block,
+                                                        stream,
+                                                        mr);
   }
 
-  return {size.value(stream), std::move(matches_per_row)};
+  return {size, std::move(matches_per_row)};
 }
 
 }  // namespace detail
diff --git a/cpp/src/join/mixed_join_kernel.cu b/cpp/src/join/mixed_join_kernel.cu
index 61cfa168b03..cd4016837cc 100644
--- a/cpp/src/join/mixed_join_kernel.cu
+++ b/cpp/src/join/mixed_join_kernel.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,11 +15,12 @@
  */
 
 #include "mixed_join_kernel.cuh"
+#include "mixed_join_kernel.hpp"
 
 namespace cudf {
 namespace detail {
 
-template __global__ void mixed_join<DEFAULT_JOIN_BLOCK_SIZE, false>(
+template void launch_mixed_join<false>(
   table_device_view left_table,
   table_device_view right_table,
   table_device_view probe,
@@ -32,7 +33,10 @@ template __global__ void mixed_join<DEFAULT_JOIN_BLOCK_SIZE, false>(
   size_type* join_output_r,
   cudf::ast::detail::expression_device_view device_expression_data,
   cudf::size_type const* join_result_offsets,
-  bool const swap_tables);
+  bool const swap_tables,
+  detail::grid_1d const config,
+  int64_t shmem_size_per_block,
+  rmm::cuda_stream_view stream);
 
 }  // namespace detail
 
diff --git a/cpp/src/join/mixed_join_kernel.cuh b/cpp/src/join/mixed_join_kernel.cuh
index ea59f23c77f..9d011d43de6 100644
--- a/cpp/src/join/mixed_join_kernel.cuh
+++ b/cpp/src/join/mixed_join_kernel.cuh
@@ -19,6 +19,7 @@
 #include "join_common_utils.cuh"
 #include "join_common_utils.hpp"
 #include "mixed_join_common_utils.cuh"
+#include "mixed_join_kernel.hpp"
 
 #include <cudf/ast/detail/expression_evaluator.cuh>
 #include <cudf/ast/detail/expression_parser.hpp>
@@ -39,20 +40,20 @@ namespace cg = cooperative_groups;
 #pragma GCC diagnostic ignored "-Wattributes"
 
 template <cudf::size_type block_size, bool has_nulls>
-CUDF_HIDDEN __launch_bounds__(block_size) __global__
-  void mixed_join(table_device_view left_table,
-                  table_device_view right_table,
-                  table_device_view probe,
-                  table_device_view build,
-                  row_hash const hash_probe,
-                  row_equality const equality_probe,
-                  join_kind const join_type,
-                  cudf::detail::mixed_multimap_type::device_view hash_table_view,
-                  size_type* join_output_l,
-                  size_type* join_output_r,
-                  cudf::ast::detail::expression_device_view device_expression_data,
-                  cudf::size_type const* join_result_offsets,
-                  bool const swap_tables)
+CUDF_KERNEL void __launch_bounds__(block_size)
+  mixed_join(table_device_view left_table,
+             table_device_view right_table,
+             table_device_view probe,
+             table_device_view build,
+             row_hash const hash_probe,
+             row_equality const equality_probe,
+             join_kind const join_type,
+             cudf::detail::mixed_multimap_type::device_view hash_table_view,
+             size_type* join_output_l,
+             size_type* join_output_r,
+             cudf::ast::detail::expression_device_view device_expression_data,
+             cudf::size_type const* join_result_offsets,
+             bool const swap_tables)
 {
   // Normally the casting of a shared memory array is used to create multiple
   // arrays of different types from the shared memory buffer, but here it is
@@ -111,6 +112,41 @@ CUDF_HIDDEN __launch_bounds__(block_size) __global__
   }
 }
 
+template <bool has_nulls>
+void launch_mixed_join(table_device_view left_table,
+                       table_device_view right_table,
+                       table_device_view probe,
+                       table_device_view build,
+                       row_hash const hash_probe,
+                       row_equality const equality_probe,
+                       join_kind const join_type,
+                       cudf::detail::mixed_multimap_type::device_view hash_table_view,
+                       size_type* join_output_l,
+                       size_type* join_output_r,
+                       cudf::ast::detail::expression_device_view device_expression_data,
+                       cudf::size_type const* join_result_offsets,
+                       bool const swap_tables,
+                       detail::grid_1d const config,
+                       int64_t shmem_size_per_block,
+                       rmm::cuda_stream_view stream)
+{
+  mixed_join<DEFAULT_JOIN_BLOCK_SIZE, true>
+    <<<config.num_blocks, config.num_threads_per_block, shmem_size_per_block, stream.value()>>>(
+      left_table,
+      right_table,
+      probe,
+      build,
+      hash_probe,
+      equality_probe,
+      join_type,
+      hash_table_view,
+      join_output_l,
+      join_output_r,
+      device_expression_data,
+      join_result_offsets,
+      swap_tables);
+}
+
 }  // namespace detail
 
 }  // namespace cudf
diff --git a/cpp/src/join/mixed_join_kernel.hpp b/cpp/src/join/mixed_join_kernel.hpp
new file mode 100644
index 00000000000..cc92e9d8ba4
--- /dev/null
+++ b/cpp/src/join/mixed_join_kernel.hpp
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "join/join_common_utils.hpp"
+#include "join/mixed_join_common_utils.cuh"
+
+#include <cudf/ast/detail/expression_parser.hpp>
+#include <cudf/table/table_device_view.cuh>
+#include <cudf/utilities/span.hpp>
+
+namespace CUDF_EXPORT cudf {
+namespace detail {
+
+/**
+ * @brief Performs a join using the combination of a hash lookup to identify
+ * equal rows between one pair of tables and the evaluation of an expression
+ * containing an arbitrary expression.
+ *
+ * This method probes the hash table with each row in the probe table using a
+ * custom equality comparator that also checks that the conditional expression
+ * evaluates to true between the left/right tables when a match is found
+ * between probe and build rows.
+ *
+ * @tparam block_size The number of threads per block for this kernel
+ * @tparam has_nulls Whether or not the inputs may contain nulls.
+ *
+ * @param[in] left_table The left table
+ * @param[in] right_table The right table
+ * @param[in] probe The table with which to probe the hash table for matches.
+ * @param[in] build The table with which the hash table was built.
+ * @param[in] hash_probe The hasher used for the probe table.
+ * @param[in] equality_probe The equality comparator used when probing the hash table.
+ * @param[in] join_type The type of join to be performed
+ * @param[in] hash_table_view The hash table built from `build`.
+ * @param[out] join_output_l The left result of the join operation
+ * @param[out] join_output_r The right result of the join operation
+ * @param[in] device_expression_data Container of device data required to evaluate the desired
+ * expression.
+ * @param[in] join_result_offsets The starting indices in join_output[l|r]
+ * where the matches for each row begin. Equivalent to a prefix sum of
+ * matches_per_row.
+ * @param[in] swap_tables If true, the kernel was launched with one thread per right row and
+ * the kernel needs to internally loop over left rows. Otherwise, loop over right rows.
+ */
+template <bool has_nulls>
+void launch_mixed_join(table_device_view left_table,
+                       table_device_view right_table,
+                       table_device_view probe,
+                       table_device_view build,
+                       row_hash const hash_probe,
+                       row_equality const equality_probe,
+                       join_kind const join_type,
+                       cudf::detail::mixed_multimap_type::device_view hash_table_view,
+                       size_type* join_output_l,
+                       size_type* join_output_r,
+                       cudf::ast::detail::expression_device_view device_expression_data,
+                       cudf::size_type const* join_result_offsets,
+                       bool const swap_tables,
+                       detail::grid_1d const config,
+                       int64_t shmem_size_per_block,
+                       rmm::cuda_stream_view stream);
+
+}  // namespace detail
+
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/src/join/mixed_join_kernel_nulls.cu b/cpp/src/join/mixed_join_kernel_nulls.cu
index 518f8ed8555..185aa133f2d 100644
--- a/cpp/src/join/mixed_join_kernel_nulls.cu
+++ b/cpp/src/join/mixed_join_kernel_nulls.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,11 +15,12 @@
  */
 
 #include "mixed_join_kernel.cuh"
+#include "mixed_join_kernel.hpp"
 
 namespace cudf {
 namespace detail {
 
-template __global__ void mixed_join<DEFAULT_JOIN_BLOCK_SIZE, true>(
+template void launch_mixed_join<true>(
   table_device_view left_table,
   table_device_view right_table,
   table_device_view probe,
@@ -32,7 +33,10 @@ template __global__ void mixed_join<DEFAULT_JOIN_BLOCK_SIZE, true>(
   size_type* join_output_r,
   cudf::ast::detail::expression_device_view device_expression_data,
   cudf::size_type const* join_result_offsets,
-  bool const swap_tables);
+  bool const swap_tables,
+  detail::grid_1d const config,
+  int64_t shmem_size_per_block,
+  rmm::cuda_stream_view stream);
 
 }  // namespace detail
 
diff --git a/cpp/src/join/mixed_join_kernels.cuh b/cpp/src/join/mixed_join_kernels.cuh
deleted file mode 100644
index 037c02666d4..00000000000
--- a/cpp/src/join/mixed_join_kernels.cuh
+++ /dev/null
@@ -1,124 +0,0 @@
-/*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "join/join_common_utils.hpp"
-#include "join/mixed_join_common_utils.cuh"
-
-#include <cudf/ast/detail/expression_parser.hpp>
-#include <cudf/table/table_device_view.cuh>
-#include <cudf/utilities/span.hpp>
-
-namespace cudf {
-namespace detail {
-
-/**
- * @brief Computes the output size of joining the left table to the right table.
- *
- * This method probes the hash table with each row in the probe table using a
- * custom equality comparator that also checks that the conditional expression
- * evaluates to true between the left/right tables when a match is found
- * between probe and build rows.
- *
- * @tparam block_size The number of threads per block for this kernel
- * @tparam has_nulls Whether or not the inputs may contain nulls.
- *
- * @param[in] left_table The left table
- * @param[in] right_table The right table
- * @param[in] probe The table with which to probe the hash table for matches.
- * @param[in] build The table with which the hash table was built.
- * @param[in] hash_probe The hasher used for the probe table.
- * @param[in] equality_probe The equality comparator used when probing the hash table.
- * @param[in] join_type The type of join to be performed
- * @param[in] hash_table_view The hash table built from `build`.
- * @param[in] device_expression_data Container of device data required to evaluate the desired
- * expression.
- * @param[in] swap_tables If true, the kernel was launched with one thread per right row and
- * the kernel needs to internally loop over left rows. Otherwise, loop over right rows.
- * @param[out] output_size The resulting output size
- * @param[out] matches_per_row The number of matches in one pair of
- * equality/conditional tables for each row in the other pair of tables. If
- * swap_tables is true, matches_per_row corresponds to the right_table,
- * otherwise it corresponds to the left_table. Note that corresponding swap of
- * left/right tables to determine which is the build table and which is the
- * probe table has already happened on the host.
- */
-
-template <int block_size, bool has_nulls>
-__global__ void compute_mixed_join_output_size(
-  table_device_view left_table,
-  table_device_view right_table,
-  table_device_view probe,
-  table_device_view build,
-  row_hash const hash_probe,
-  row_equality const equality_probe,
-  join_kind const join_type,
-  cudf::detail::mixed_multimap_type::device_view hash_table_view,
-  ast::detail::expression_device_view device_expression_data,
-  bool const swap_tables,
-  std::size_t* output_size,
-  cudf::device_span<cudf::size_type> matches_per_row);
-
-/**
- * @brief Performs a join using the combination of a hash lookup to identify
- * equal rows between one pair of tables and the evaluation of an expression
- * containing an arbitrary expression.
- *
- * This method probes the hash table with each row in the probe table using a
- * custom equality comparator that also checks that the conditional expression
- * evaluates to true between the left/right tables when a match is found
- * between probe and build rows.
- *
- * @tparam block_size The number of threads per block for this kernel
- * @tparam has_nulls Whether or not the inputs may contain nulls.
- *
- * @param[in] left_table The left table
- * @param[in] right_table The right table
- * @param[in] probe The table with which to probe the hash table for matches.
- * @param[in] build The table with which the hash table was built.
- * @param[in] hash_probe The hasher used for the probe table.
- * @param[in] equality_probe The equality comparator used when probing the hash table.
- * @param[in] join_type The type of join to be performed
- * @param[in] hash_table_view The hash table built from `build`.
- * @param[out] join_output_l The left result of the join operation
- * @param[out] join_output_r The right result of the join operation
- * @param[in] device_expression_data Container of device data required to evaluate the desired
- * expression.
- * @param[in] join_result_offsets The starting indices in join_output[l|r]
- * where the matches for each row begin. Equivalent to a prefix sum of
- * matches_per_row.
- * @param[in] swap_tables If true, the kernel was launched with one thread per right row and
- * the kernel needs to internally loop over left rows. Otherwise, loop over right rows.
- */
-template <cudf::size_type block_size, bool has_nulls>
-__global__ void mixed_join(table_device_view left_table,
-                           table_device_view right_table,
-                           table_device_view probe,
-                           table_device_view build,
-                           row_hash const hash_probe,
-                           row_equality const equality_probe,
-                           join_kind const join_type,
-                           cudf::detail::mixed_multimap_type::device_view hash_table_view,
-                           size_type* join_output_l,
-                           size_type* join_output_r,
-                           cudf::ast::detail::expression_device_view device_expression_data,
-                           cudf::size_type const* join_result_offsets,
-                           bool const swap_tables);
-
-}  // namespace detail
-
-}  // namespace cudf
diff --git a/cpp/src/join/mixed_join_kernels_semi.cu b/cpp/src/join/mixed_join_kernels_semi.cu
index 1f31eaa7878..7459ac3e99c 100644
--- a/cpp/src/join/mixed_join_kernels_semi.cu
+++ b/cpp/src/join/mixed_join_kernels_semi.cu
@@ -14,9 +14,7 @@
  * limitations under the License.
  */
 
-#include "join/join_common_utils.cuh"
-#include "join/join_common_utils.hpp"
-#include "join/mixed_join_common_utils.cuh"
+#include "join/mixed_join_kernels_semi.cuh"
 
 #include <cudf/ast/detail/expression_evaluator.cuh>
 #include <cudf/ast/detail/expression_parser.hpp>
@@ -35,16 +33,16 @@ namespace cg = cooperative_groups;
 #pragma GCC diagnostic ignored "-Wattributes"
 
 template <cudf::size_type block_size, bool has_nulls>
-CUDF_HIDDEN __launch_bounds__(block_size) __global__
-  void mixed_join_semi(table_device_view left_table,
-                       table_device_view right_table,
-                       table_device_view probe,
-                       table_device_view build,
-                       row_hash const hash_probe,
-                       row_equality const equality_probe,
-                       cudf::detail::semi_map_type::device_view hash_table_view,
-                       cudf::device_span<bool> left_table_keep_mask,
-                       cudf::ast::detail::expression_device_view device_expression_data)
+CUDF_KERNEL void __launch_bounds__(block_size)
+  mixed_join_semi(table_device_view left_table,
+                  table_device_view right_table,
+                  table_device_view probe,
+                  table_device_view build,
+                  row_hash const hash_probe,
+                  row_equality const equality_probe,
+                  cudf::detail::semi_map_type::device_view hash_table_view,
+                  cudf::device_span<bool> left_table_keep_mask,
+                  cudf::ast::detail::expression_device_view device_expression_data)
 {
   // Normally the casting of a shared memory array is used to create multiple
   // arrays of different types from the shared memory buffer, but here it is
@@ -75,28 +73,46 @@ CUDF_HIDDEN __launch_bounds__(block_size) __global__
   }
 }
 
-template __global__ void mixed_join_semi<DEFAULT_JOIN_BLOCK_SIZE, true>(
-  table_device_view left_table,
-  table_device_view right_table,
-  table_device_view probe,
-  table_device_view build,
-  row_hash const hash_probe,
-  row_equality const equality_probe,
-  cudf::detail::semi_map_type::device_view hash_table_view,
-  cudf::device_span<bool> left_table_keep_mask,
-  cudf::ast::detail::expression_device_view device_expression_data);
-
-template __global__ void mixed_join_semi<DEFAULT_JOIN_BLOCK_SIZE, false>(
-  table_device_view left_table,
-  table_device_view right_table,
-  table_device_view probe,
-  table_device_view build,
-  row_hash const hash_probe,
-  row_equality const equality_probe,
-  cudf::detail::semi_map_type::device_view hash_table_view,
-  cudf::device_span<bool> left_table_keep_mask,
-  cudf::ast::detail::expression_device_view device_expression_data);
+void launch_mixed_join_semi(bool has_nulls,
+                            table_device_view left_table,
+                            table_device_view right_table,
+                            table_device_view probe,
+                            table_device_view build,
+                            row_hash const hash_probe,
+                            row_equality const equality_probe,
+                            cudf::detail::semi_map_type::device_view hash_table_view,
+                            cudf::device_span<bool> left_table_keep_mask,
+                            cudf::ast::detail::expression_device_view device_expression_data,
+                            detail::grid_1d const config,
+                            int64_t shmem_size_per_block,
+                            rmm::cuda_stream_view stream)
+{
+  if (has_nulls) {
+    mixed_join_semi<DEFAULT_JOIN_BLOCK_SIZE, true>
+      <<<config.num_blocks, config.num_threads_per_block, shmem_size_per_block, stream.value()>>>(
+        left_table,
+        right_table,
+        probe,
+        build,
+        hash_probe,
+        equality_probe,
+        hash_table_view,
+        left_table_keep_mask,
+        device_expression_data);
+  } else {
+    mixed_join_semi<DEFAULT_JOIN_BLOCK_SIZE, false>
+      <<<config.num_blocks, config.num_threads_per_block, shmem_size_per_block, stream.value()>>>(
+        left_table,
+        right_table,
+        probe,
+        build,
+        hash_probe,
+        equality_probe,
+        hash_table_view,
+        left_table_keep_mask,
+        device_expression_data);
+  }
+}
 
 }  // namespace detail
-
 }  // namespace cudf
diff --git a/cpp/src/join/mixed_join_kernels_semi.cuh b/cpp/src/join/mixed_join_kernels_semi.cuh
index 4ea404d451c..43714ffb36a 100644
--- a/cpp/src/join/mixed_join_kernels_semi.cuh
+++ b/cpp/src/join/mixed_join_kernels_semi.cuh
@@ -16,8 +16,9 @@
 
 #pragma once
 
-#include "join/join_common_utils.hpp"
-#include "join/mixed_join_common_utils.cuh"
+#include "join_common_utils.cuh"
+#include "join_common_utils.hpp"
+#include "mixed_join_common_utils.cuh"
 
 #include <cudf/ast/detail/expression_parser.hpp>
 #include <cudf/table/table_device_view.cuh>
@@ -39,6 +40,7 @@ namespace detail {
  * @tparam block_size The number of threads per block for this kernel
  * @tparam has_nulls Whether or not the inputs may contain nulls.
  *
+ * @param[in] has_nulls If the input has nulls
  * @param[in] left_table The left table
  * @param[in] right_table The right table
  * @param[in] probe The table with which to probe the hash table for matches.
@@ -51,16 +53,19 @@ namespace detail {
  * @param[in] device_expression_data Container of device data required to evaluate the desired
  * expression.
  */
-template <cudf::size_type block_size, bool has_nulls>
-__global__ void mixed_join_semi(table_device_view left_table,
-                                table_device_view right_table,
-                                table_device_view probe,
-                                table_device_view build,
-                                row_hash const hash_probe,
-                                row_equality const equality_probe,
-                                cudf::detail::semi_map_type::device_view hash_table_view,
-                                cudf::device_span<bool> left_table_keep_mask,
-                                cudf::ast::detail::expression_device_view device_expression_data);
+void launch_mixed_join_semi(bool has_nulls,
+                            table_device_view left_table,
+                            table_device_view right_table,
+                            table_device_view probe,
+                            table_device_view build,
+                            row_hash const hash_probe,
+                            row_equality const equality_probe,
+                            cudf::detail::semi_map_type::device_view hash_table_view,
+                            cudf::device_span<bool> left_table_keep_mask,
+                            cudf::ast::detail::expression_device_view device_expression_data,
+                            detail::grid_1d const config,
+                            int64_t shmem_size_per_block,
+                            rmm::cuda_stream_view stream);
 
 }  // namespace detail
 
diff --git a/cpp/src/join/mixed_join_semi.cu b/cpp/src/join/mixed_join_semi.cu
index 3e4188a0fbd..a79aa6673d6 100644
--- a/cpp/src/join/mixed_join_semi.cu
+++ b/cpp/src/join/mixed_join_semi.cu
@@ -227,31 +227,19 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_join_semi(
   // Vector used to indicate indices from left/probe table which are present in output
   auto left_table_keep_mask = rmm::device_uvector<bool>(probe.num_rows(), stream);
 
-  if (has_nulls) {
-    mixed_join_semi<DEFAULT_JOIN_BLOCK_SIZE, true>
-      <<<config.num_blocks, config.num_threads_per_block, shmem_size_per_block, stream.value()>>>(
-        *left_conditional_view,
-        *right_conditional_view,
-        *probe_view,
-        *build_view,
-        hash_probe,
-        equality_probe,
-        hash_table_view,
-        cudf::device_span<bool>(left_table_keep_mask),
-        parser.device_expression_data);
-  } else {
-    mixed_join_semi<DEFAULT_JOIN_BLOCK_SIZE, false>
-      <<<config.num_blocks, config.num_threads_per_block, shmem_size_per_block, stream.value()>>>(
-        *left_conditional_view,
-        *right_conditional_view,
-        *probe_view,
-        *build_view,
-        hash_probe,
-        equality_probe,
-        hash_table_view,
-        cudf::device_span<bool>(left_table_keep_mask),
-        parser.device_expression_data);
-  }
+  launch_mixed_join_semi(has_nulls,
+                         *left_conditional_view,
+                         *right_conditional_view,
+                         *probe_view,
+                         *build_view,
+                         hash_probe,
+                         equality_probe,
+                         hash_table_view,
+                         cudf::device_span<bool>(left_table_keep_mask),
+                         parser.device_expression_data,
+                         config,
+                         shmem_size_per_block,
+                         stream);
 
   auto gather_map = std::make_unique<rmm::device_uvector<size_type>>(probe.num_rows(), stream, mr);
 
diff --git a/cpp/src/join/mixed_join_size_kernel.cu b/cpp/src/join/mixed_join_size_kernel.cu
index 4011acb65d6..4882c8769e6 100644
--- a/cpp/src/join/mixed_join_size_kernel.cu
+++ b/cpp/src/join/mixed_join_size_kernel.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,11 +15,12 @@
  */
 
 #include "mixed_join_size_kernel.cuh"
+#include "mixed_join_size_kernel.hpp"
 
 namespace cudf {
 namespace detail {
 
-template __global__ void compute_mixed_join_output_size<DEFAULT_JOIN_BLOCK_SIZE, false>(
+template std::size_t launch_compute_mixed_join_output_size<false>(
   table_device_view left_table,
   table_device_view right_table,
   table_device_view probe,
@@ -30,8 +31,11 @@ template __global__ void compute_mixed_join_output_size<DEFAULT_JOIN_BLOCK_SIZE,
   cudf::detail::mixed_multimap_type::device_view hash_table_view,
   ast::detail::expression_device_view device_expression_data,
   bool const swap_tables,
-  std::size_t* output_size,
-  cudf::device_span<cudf::size_type> matches_per_row);
+  cudf::device_span<cudf::size_type> matches_per_row,
+  detail::grid_1d const config,
+  int64_t shmem_size_per_block,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr);
 
 }  // namespace detail
 }  // namespace cudf
diff --git a/cpp/src/join/mixed_join_size_kernel.cuh b/cpp/src/join/mixed_join_size_kernel.cuh
index 00a90f8273f..a1066e32331 100644
--- a/cpp/src/join/mixed_join_size_kernel.cuh
+++ b/cpp/src/join/mixed_join_size_kernel.cuh
@@ -36,19 +36,19 @@ namespace cg = cooperative_groups;
 #pragma GCC diagnostic ignored "-Wattributes"
 
 template <int block_size, bool has_nulls>
-CUDF_HIDDEN __launch_bounds__(block_size) __global__ void compute_mixed_join_output_size(
-  table_device_view left_table,
-  table_device_view right_table,
-  table_device_view probe,
-  table_device_view build,
-  row_hash const hash_probe,
-  row_equality const equality_probe,
-  join_kind const join_type,
-  cudf::detail::mixed_multimap_type::device_view hash_table_view,
-  ast::detail::expression_device_view device_expression_data,
-  bool const swap_tables,
-  std::size_t* output_size,
-  cudf::device_span<cudf::size_type> matches_per_row)
+CUDF_KERNEL void __launch_bounds__(block_size)
+  compute_mixed_join_output_size(table_device_view left_table,
+                                 table_device_view right_table,
+                                 table_device_view probe,
+                                 table_device_view build,
+                                 row_hash const hash_probe,
+                                 row_equality const equality_probe,
+                                 join_kind const join_type,
+                                 cudf::detail::mixed_multimap_type::device_view hash_table_view,
+                                 ast::detail::expression_device_view device_expression_data,
+                                 bool const swap_tables,
+                                 std::size_t* output_size,
+                                 cudf::device_span<cudf::size_type> matches_per_row)
 {
   // The (required) extern storage of the shared memory array leads to
   // conflicting declarations between different templates. The easiest
@@ -103,5 +103,43 @@ CUDF_HIDDEN __launch_bounds__(block_size) __global__ void compute_mixed_join_out
   }
 }
 
+template <bool has_nulls>
+std::size_t launch_compute_mixed_join_output_size(
+  table_device_view left_table,
+  table_device_view right_table,
+  table_device_view probe,
+  table_device_view build,
+  row_hash const hash_probe,
+  row_equality const equality_probe,
+  join_kind const join_type,
+  cudf::detail::mixed_multimap_type::device_view hash_table_view,
+  ast::detail::expression_device_view device_expression_data,
+  bool const swap_tables,
+  cudf::device_span<cudf::size_type> matches_per_row,
+  detail::grid_1d const config,
+  int64_t shmem_size_per_block,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr)
+{
+  // Allocate storage for the counter used to get the size of the join output
+  rmm::device_scalar<std::size_t> size(0, stream, mr);
+
+  compute_mixed_join_output_size<DEFAULT_JOIN_BLOCK_SIZE, true>
+    <<<config.num_blocks, config.num_threads_per_block, shmem_size_per_block, stream.value()>>>(
+      left_table,
+      right_table,
+      probe,
+      build,
+      hash_probe,
+      equality_probe,
+      join_type,
+      hash_table_view,
+      device_expression_data,
+      swap_tables,
+      size.data(),
+      matches_per_row);
+  return size.value(stream);
+}
+
 }  // namespace detail
 }  // namespace cudf
diff --git a/cpp/src/join/mixed_join_size_kernel.hpp b/cpp/src/join/mixed_join_size_kernel.hpp
new file mode 100644
index 00000000000..b09805c14dc
--- /dev/null
+++ b/cpp/src/join/mixed_join_size_kernel.hpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "join_common_utils.cuh"
+#include "join_common_utils.hpp"
+#include "mixed_join_common_utils.cuh"
+
+#include <cudf/ast/detail/expression_evaluator.cuh>
+#include <cudf/ast/detail/expression_parser.hpp>
+#include <cudf/detail/utilities/cuda.cuh>
+#include <cudf/table/table_device_view.cuh>
+#include <cudf/utilities/export.hpp>
+#include <cudf/utilities/span.hpp>
+
+#include <cooperative_groups.h>
+#include <cub/cub.cuh>
+#include <thrust/iterator/discard_iterator.h>
+
+namespace CUDF_EXPORT cudf {
+namespace detail {
+
+/**
+ * @brief Computes the output size of joining the left table to the right table.
+ *
+ * This method probes the hash table with each row in the probe table using a
+ * custom equality comparator that also checks that the conditional expression
+ * evaluates to true between the left/right tables when a match is found
+ * between probe and build rows.
+ *
+ * @tparam block_size The number of threads per block for this kernel
+ * @tparam has_nulls Whether or not the inputs may contain nulls.
+ *
+ * @param[in] left_table The left table
+ * @param[in] right_table The right table
+ * @param[in] probe The table with which to probe the hash table for matches.
+ * @param[in] build The table with which the hash table was built.
+ * @param[in] hash_probe The hasher used for the probe table.
+ * @param[in] equality_probe The equality comparator used when probing the hash table.
+ * @param[in] join_type The type of join to be performed
+ * @param[in] hash_table_view The hash table built from `build`.
+ * @param[in] device_expression_data Container of device data required to evaluate the desired
+ * expression.
+ * @param[in] swap_tables If true, the kernel was launched with one thread per right row and
+ * the kernel needs to internally loop over left rows. Otherwise, loop over right rows.
+ * @param[out] output_size The resulting output size
+ * @param[out] matches_per_row The number of matches in one pair of
+ * equality/conditional tables for each row in the other pair of tables. If
+ * swap_tables is true, matches_per_row corresponds to the right_table,
+ * otherwise it corresponds to the left_table. Note that corresponding swap of
+ * left/right tables to determine which is the build table and which is the
+ * probe table has already happened on the host.
+ */
+
+template <bool has_nulls>
+std::size_t launch_compute_mixed_join_output_size(
+  cudf::table_device_view left_table,
+  cudf::table_device_view right_table,
+  cudf::table_device_view probe,
+  cudf::table_device_view build,
+  row_hash const hash_probe,
+  row_equality const equality_probe,
+  join_kind const join_type,
+  cudf::detail::mixed_multimap_type::device_view hash_table_view,
+  ast::detail::expression_device_view device_expression_data,
+  bool const swap_tables,
+  cudf::device_span<cudf::size_type> matches_per_row,
+  detail::grid_1d const config,
+  int64_t shmem_size_per_block,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr);
+}  // namespace detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/src/join/mixed_join_size_kernel_nulls.cu b/cpp/src/join/mixed_join_size_kernel_nulls.cu
index 2868113bf33..11f9103da4d 100644
--- a/cpp/src/join/mixed_join_size_kernel_nulls.cu
+++ b/cpp/src/join/mixed_join_size_kernel_nulls.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,7 +19,7 @@
 namespace cudf {
 namespace detail {
 
-template __global__ void compute_mixed_join_output_size<DEFAULT_JOIN_BLOCK_SIZE, true>(
+template std::size_t launch_compute_mixed_join_output_size<true>(
   table_device_view left_table,
   table_device_view right_table,
   table_device_view probe,
@@ -30,8 +30,10 @@ template __global__ void compute_mixed_join_output_size<DEFAULT_JOIN_BLOCK_SIZE,
   cudf::detail::mixed_multimap_type::device_view hash_table_view,
   ast::detail::expression_device_view device_expression_data,
   bool const swap_tables,
-  std::size_t* output_size,
-  cudf::device_span<cudf::size_type> matches_per_row);
-
+  cudf::device_span<cudf::size_type> matches_per_row,
+  detail::grid_1d const config,
+  int64_t shmem_size_per_block,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr);
 }  // namespace detail
 }  // namespace cudf

From a2503913bb362e43fa77615748ed4b4e31ac5055 Mon Sep 17 00:00:00 2001
From: "Richard (Rick) Zamora" <rzamora217@gmail.com>
Date: Mon, 26 Aug 2024 09:26:32 -0700
Subject: [PATCH 722/842] Revise `get_reader_filepath_or_buffer` to handle a
 list of data sources (#16613)

The cudf read APIs (e.g. `cudf.read_parquet`, `cudf.read_json`, etc...) currently iterate over data sources, calling `get_reader_filepath_or_buffer` on each source independently when multiple files are mapped to the same `cudf.DataFrame`. This is suboptimal when the data sources are remote-file paths (e.g. in S3).  In this case, we **should** be initiating network transfers for all files in parallel (and as early as possible).

This PR makes it easier to optimize multi-file data transfer in follow-up work. It also simplifies and centralizes some of the common logic used by the various read APIs.

Authors:
  - Richard (Rick) Zamora (https://github.com/rjzamora)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16613
---
 python/cudf/cudf/io/avro.py       |  17 +--
 python/cudf/cudf/io/csv.py        |  15 +--
 python/cudf/cudf/io/json.py       |  58 ++-------
 python/cudf/cudf/io/orc.py        |  41 ++----
 python/cudf/cudf/io/parquet.py    |  59 ++-------
 python/cudf/cudf/io/text.py       |   6 +-
 python/cudf/cudf/utils/ioutils.py | 210 ++++++++++++++++--------------
 7 files changed, 161 insertions(+), 245 deletions(-)

diff --git a/python/cudf/cudf/io/avro.py b/python/cudf/cudf/io/avro.py
index 728b34045bf..964bd02b03e 100644
--- a/python/cudf/cudf/io/avro.py
+++ b/python/cudf/cudf/io/avro.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2022, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 
 import cudf
 from cudf import _lib as libcudf
@@ -15,22 +15,13 @@ def read_avro(
 ):
     """{docstring}"""
 
-    is_single_filepath_or_buffer = ioutils.ensure_single_filepath_or_buffer(
+    filepath_or_buffer = ioutils.get_reader_filepath_or_buffer(
         path_or_data=filepath_or_buffer,
         storage_options=storage_options,
     )
-    if not is_single_filepath_or_buffer:
-        raise NotImplementedError(
-            "`read_avro` does not yet support reading multiple files"
-        )
-
-    filepath_or_buffer, compression = ioutils.get_reader_filepath_or_buffer(
-        path_or_data=filepath_or_buffer,
-        compression=None,
-        storage_options=storage_options,
+    filepath_or_buffer = ioutils._select_single_source(
+        filepath_or_buffer, "read_avro"
     )
-    if compression is not None:
-        ValueError("URL content-encoding decompression is not supported")
 
     return cudf.DataFrame._from_data(
         *libcudf.avro.read_avro(
diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py
index e61fc5063dc..a9c20150930 100644
--- a/python/cudf/cudf/io/csv.py
+++ b/python/cudf/cudf/io/csv.py
@@ -64,22 +64,15 @@ def read_csv(
     if bytes_per_thread is None:
         bytes_per_thread = ioutils._BYTES_PER_THREAD_DEFAULT
 
-    is_single_filepath_or_buffer = ioutils.ensure_single_filepath_or_buffer(
+    filepath_or_buffer = ioutils.get_reader_filepath_or_buffer(
         path_or_data=filepath_or_buffer,
-        storage_options=storage_options,
-    )
-    if not is_single_filepath_or_buffer:
-        raise NotImplementedError(
-            "`read_csv` does not yet support reading multiple files"
-        )
-
-    filepath_or_buffer, compression = ioutils.get_reader_filepath_or_buffer(
-        path_or_data=filepath_or_buffer,
-        compression=compression,
         iotypes=(BytesIO, StringIO),
         storage_options=storage_options,
         bytes_per_thread=bytes_per_thread,
     )
+    filepath_or_buffer = ioutils._select_single_source(
+        filepath_or_buffer, "read_csv"
+    )
 
     if na_values is not None and is_scalar(na_values):
         na_values = [na_values]
diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py
index fc3387d5117..d86db656fd0 100644
--- a/python/cudf/cudf/io/json.py
+++ b/python/cudf/cudf/io/json.py
@@ -9,7 +9,6 @@
 
 import cudf
 from cudf._lib import json as libjson
-from cudf.api.types import is_list_like
 from cudf.utils import ioutils
 from cudf.utils.dtypes import _maybe_convert_to_default_type
 
@@ -62,37 +61,15 @@ def read_json(
                 f"following positional arguments: {list(args)}"
             )
 
-        # Multiple sources are passed as a list. If a single source is passed,
-        # wrap it in a list for unified processing downstream.
-        if not is_list_like(path_or_buf):
-            path_or_buf = [path_or_buf]
-
-        filepaths_or_buffers = []
-        for source in path_or_buf:
-            if ioutils.is_directory(
-                path_or_data=source, storage_options=storage_options
-            ):
-                fs = ioutils._ensure_filesystem(
-                    passed_filesystem=None,
-                    path=source,
-                    storage_options=storage_options,
-                )
-                source = ioutils.stringify_pathlike(source)
-                source = fs.sep.join([source, "*.json"])
-
-            tmp_source, compression = ioutils.get_reader_filepath_or_buffer(
-                path_or_data=source,
-                compression=compression,
-                iotypes=(BytesIO, StringIO),
-                allow_raw_text_input=True,
-                storage_options=storage_options,
-                warn_on_raw_text_input=True,
-                warn_meta=("json", "read_json"),
-            )
-            if isinstance(tmp_source, list):
-                filepaths_or_buffers.extend(tmp_source)
-            else:
-                filepaths_or_buffers.append(tmp_source)
+        filepaths_or_buffers = ioutils.get_reader_filepath_or_buffer(
+            path_or_buf,
+            iotypes=(BytesIO, StringIO),
+            allow_raw_text_input=True,
+            storage_options=storage_options,
+            warn_on_raw_text_input=True,
+            warn_meta=("json", "read_json"),
+            expand_dir_pattern="*.json",
+        )
 
         df = libjson.read_json(
             filepaths_or_buffers=filepaths_or_buffers,
@@ -111,25 +88,18 @@ def read_json(
             "be GPU accelerated in the future"
         )
 
-        if not ioutils.ensure_single_filepath_or_buffer(
-            path_or_data=path_or_buf,
-            storage_options=storage_options,
-        ):
-            raise NotImplementedError(
-                "`read_json` does not yet support reading "
-                "multiple files via pandas"
-            )
-
-        path_or_buf, compression = ioutils.get_reader_filepath_or_buffer(
+        filepath_or_buffer = ioutils.get_reader_filepath_or_buffer(
             path_or_data=path_or_buf,
-            compression=compression,
             iotypes=(BytesIO, StringIO),
             allow_raw_text_input=True,
             storage_options=storage_options,
         )
+        filepath_or_buffer = ioutils._select_single_source(
+            filepath_or_buffer, "read_json (via pandas)"
+        )
 
         pd_value = pd.read_json(
-            path_or_buf,
+            filepath_or_buffer,
             lines=lines,
             dtype=dtype,
             compression=compression,
diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py
index 4f04caafc5d..fd246c6215f 100644
--- a/python/cudf/cudf/io/orc.py
+++ b/python/cudf/cudf/io/orc.py
@@ -4,7 +4,6 @@
 import warnings
 
 import pyarrow as pa
-from fsspec.utils import stringify_path
 
 import cudf
 from cudf._lib import orc as liborc
@@ -170,8 +169,11 @@ def read_orc_statistics(
     files_statistics = []
     stripes_statistics = []
     for source in filepaths_or_buffers:
-        path_or_buf, _ = ioutils.get_reader_filepath_or_buffer(
-            path_or_data=source, compression=None, **kwargs
+        path_or_buf = ioutils.get_reader_filepath_or_buffer(
+            path_or_data=source, **kwargs
+        )
+        path_or_buf = ioutils._select_single_source(
+            path_or_buf, "read_orc_statistics"
         )
         (
             column_names,
@@ -318,33 +320,12 @@ def read_orc(
                 "A list of stripes must be provided for each input source"
             )
 
-    filepaths_or_buffers = []
-    for source in filepath_or_buffer:
-        if ioutils.is_directory(
-            path_or_data=source, storage_options=storage_options
-        ):
-            fs = ioutils._ensure_filesystem(
-                passed_filesystem=None,
-                path=source,
-                storage_options=storage_options,
-            )
-            source = stringify_path(source)
-            source = fs.sep.join([source, "*.orc"])
-
-        tmp_source, compression = ioutils.get_reader_filepath_or_buffer(
-            path_or_data=source,
-            compression=None,
-            storage_options=storage_options,
-            bytes_per_thread=bytes_per_thread,
-        )
-        if compression is not None:
-            raise ValueError(
-                "URL content-encoding decompression is not supported"
-            )
-        if isinstance(tmp_source, list):
-            filepaths_or_buffers.extend(tmp_source)
-        else:
-            filepaths_or_buffers.append(tmp_source)
+    filepaths_or_buffers = ioutils.get_reader_filepath_or_buffer(
+        path_or_data=filepath_or_buffer,
+        storage_options=storage_options,
+        bytes_per_thread=bytes_per_thread,
+        expand_dir_pattern="*.orc",
+    )
 
     if filters is not None:
         selected_stripes = _filter_stripes(
diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
index 560f257c115..6b895abbf66 100644
--- a/python/cudf/cudf/io/parquet.py
+++ b/python/cudf/cudf/io/parquet.py
@@ -329,39 +329,12 @@ def write_to_dataset(
 @_performance_tracking
 def read_parquet_metadata(filepath_or_buffer):
     """{docstring}"""
-    # Multiple sources are passed as a list. If a single source is passed,
-    # wrap it in a list for unified processing downstream.
-    if not is_list_like(filepath_or_buffer):
-        filepath_or_buffer = [filepath_or_buffer]
-
-    # Start by trying to construct a filesystem object
-    fs, paths = ioutils._get_filesystem_and_paths(
-        path_or_data=filepath_or_buffer, storage_options=None
-    )
-
-    # Check if filepath or buffer
-    filepath_or_buffer = paths if paths else filepath_or_buffer
 
     # List of filepaths or buffers
-    filepaths_or_buffers = []
-
-    for source in filepath_or_buffer:
-        tmp_source, compression = ioutils.get_reader_filepath_or_buffer(
-            path_or_data=source,
-            compression=None,
-            fs=fs,
-            storage_options=None,
-            bytes_per_thread=None,
-        )
-
-        if compression is not None:
-            raise ValueError(
-                "URL content-encoding decompression is not supported"
-            )
-        if isinstance(tmp_source, list):
-            filepath_or_buffer.extend(tmp_source)
-        else:
-            filepaths_or_buffers.append(tmp_source)
+    filepaths_or_buffers = ioutils.get_reader_filepath_or_buffer(
+        path_or_data=filepath_or_buffer,
+        bytes_per_thread=None,
+    )
 
     return libparquet.read_parquet_metadata(filepaths_or_buffers)
 
@@ -598,24 +571,12 @@ def read_parquet(
         )
     filepath_or_buffer = paths if paths else filepath_or_buffer
 
-    filepaths_or_buffers = []
-    for source in filepath_or_buffer:
-        tmp_source, compression = ioutils.get_reader_filepath_or_buffer(
-            path_or_data=source,
-            compression=None,
-            fs=fs,
-            storage_options=storage_options,
-            bytes_per_thread=bytes_per_thread,
-        )
-
-        if compression is not None:
-            raise ValueError(
-                "URL content-encoding decompression is not supported"
-            )
-        if isinstance(tmp_source, list):
-            filepath_or_buffer.extend(tmp_source)
-        else:
-            filepaths_or_buffers.append(tmp_source)
+    filepaths_or_buffers = ioutils.get_reader_filepath_or_buffer(
+        path_or_data=filepath_or_buffer,
+        fs=fs,
+        storage_options=storage_options,
+        bytes_per_thread=bytes_per_thread,
+    )
 
     # Warn user if they are not using cudf for IO
     # (There is a good chance this was not the intention)
diff --git a/python/cudf/cudf/io/text.py b/python/cudf/cudf/io/text.py
index 4329480bb2c..0043efce1e4 100644
--- a/python/cudf/cudf/io/text.py
+++ b/python/cudf/cudf/io/text.py
@@ -24,12 +24,14 @@ def read_text(
     if delimiter is None:
         raise ValueError("delimiter needs to be provided")
 
-    filepath_or_buffer, _ = ioutils.get_reader_filepath_or_buffer(
+    filepath_or_buffer = ioutils.get_reader_filepath_or_buffer(
         path_or_data=filepath_or_buffer,
-        compression=None,
         iotypes=(BytesIO, StringIO),
         storage_options=storage_options,
     )
+    filepath_or_buffer = ioutils._select_single_source(
+        filepath_or_buffer, "read_text"
+    )
 
     return cudf.Series._from_data(
         libtext.read_text(
diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
index 18106e7475b..e5944d7093c 100644
--- a/python/cudf/cudf/utils/ioutils.py
+++ b/python/cudf/cudf/utils/ioutils.py
@@ -14,6 +14,7 @@
 import pandas as pd
 from fsspec.core import expand_paths_if_needed, get_fs_token_paths
 
+from cudf.api.types import is_list_like
 from cudf.core._compat import PANDAS_LT_300
 from cudf.utils.docutils import docfmt_partial
 
@@ -799,7 +800,7 @@
     k1   k2
 0  1.0  [1]
 """  # noqa: E501
-doc_read_json = docfmt_partial(docstring=_docstring_read_json)
+doc_read_json: Callable = docfmt_partial(docstring=_docstring_read_json)
 
 _docstring_to_json = """
 Convert the cuDF object to a JSON string.
@@ -869,7 +870,7 @@
 --------
 cudf.read_json
 """
-doc_to_json = docfmt_partial(docstring=_docstring_to_json)
+doc_to_json: Callable = docfmt_partial(docstring=_docstring_to_json)
 
 _docstring_read_hdf = """
 Read from the store, close it if we opened it.
@@ -1399,13 +1400,14 @@
 Return either a filepath string to data, or a memory buffer of data.
 If filepath, then the source filepath is expanded to user's environment.
 If buffer, then data is returned in-memory as bytes or a ByteIO object.
+This function is designed to process multiple data sources of the same
+type at once. If path_or_data is a list, the output will also be a list.
 
 Parameters
 ----------
-path_or_data : str, file-like object, bytes, ByteIO
-    Path to data or the data itself.
-compression : str
-    Type of compression algorithm for the content
+path_or_data : str, file-like object, bytes, ByteIO, list
+    Path to data or the data itself. Pass in a list to process multiple
+    sources of the same type at once.
 mode : str
     Mode in which file is opened
 iotypes : (), default (BytesIO)
@@ -1430,14 +1432,15 @@
     better throughput by decomposing it and transferring multiple "blocks"
     in parallel (using a Python thread pool). Default allocation is
     {bytes_per_thread} bytes.
+expand_dir_pattern : str, default None
+    Glob pattern to use when expanding directories into file paths
+    (e.g. "*.json"). If this parameter is not specified, directories
+    will not be expanded.
 
 Returns
 -------
-filepath_or_buffer : str, bytes, BytesIO, list
-    Filepath string or in-memory buffer of data or a
-    list of Filepath strings or in-memory buffers of data.
-compression : str
-    Type of compression algorithm for the content
+List[str, bytes, BytesIO]
+    List of filepath strings or in-memory data buffers.
     """.format(bytes_per_thread=_BYTES_PER_THREAD_DEFAULT)
 
 
@@ -1494,29 +1497,15 @@ def _is_local_filesystem(fs):
     return isinstance(fs, fsspec.implementations.local.LocalFileSystem)
 
 
-def ensure_single_filepath_or_buffer(path_or_data, storage_options=None):
-    """Return False if `path_or_data` resolves to multiple filepaths or
-    buffers.
+def _select_single_source(sources: list, caller: str):
+    """Select the first element from a list of sources.
+    Raise an error if sources contains multiple elements
     """
-    path_or_data = stringify_pathlike(path_or_data)
-    if isinstance(path_or_data, str):
-        path_or_data = os.path.expanduser(path_or_data)
-        try:
-            fs, _, paths = get_fs_token_paths(
-                path_or_data, mode="rb", storage_options=storage_options
-            )
-        except ValueError as e:
-            if str(e).startswith("Protocol not known"):
-                return True
-            else:
-                raise e
-
-        if len(paths) > 1:
-            return False
-    elif isinstance(path_or_data, (list, tuple)) and len(path_or_data) > 1:
-        return False
-
-    return True
+    if len(sources) > 1:
+        raise ValueError(
+            f"{caller} does not support multiple sources, got: {sources}"
+        )
+    return sources[0]
 
 
 def is_directory(path_or_data, storage_options=None):
@@ -1601,10 +1590,24 @@ def _get_filesystem_and_paths(
     return fs, return_paths
 
 
+def _maybe_expand_directories(paths, glob_pattern, fs):
+    # Expand directory paths using a glob pattern.
+    # This is a no-op if either glob_pattern or fs are None
+    if fs is None or glob_pattern is None:
+        return paths
+    expanded_paths = []
+    for path in paths:
+        if fs.isdir(path):
+            expanded_paths.extend(fs.glob(fs.sep.join([path, glob_pattern])))
+        else:
+            expanded_paths.append(path)
+    return expanded_paths
+
+
 @doc_get_reader_filepath_or_buffer()
 def get_reader_filepath_or_buffer(
     path_or_data,
-    compression,
+    *,
     mode="rb",
     fs=None,
     iotypes=(BytesIO,),
@@ -1613,32 +1616,38 @@ def get_reader_filepath_or_buffer(
     bytes_per_thread=_BYTES_PER_THREAD_DEFAULT,
     warn_on_raw_text_input=None,
     warn_meta=None,
+    expand_dir_pattern=None,
 ):
     """{docstring}"""
 
-    path_or_data = stringify_pathlike(path_or_data)
-
-    if isinstance(path_or_data, str):
-        # Get a filesystem object if one isn't already available
-        paths = [path_or_data]
+    # Convert path_or_data to a list of input data sources
+    input_sources = [
+        stringify_pathlike(source)
+        for source in (
+            path_or_data if is_list_like(path_or_data) else [path_or_data]
+        )
+    ]
+    if not input_sources:
+        raise ValueError("Empty input source list: {input_sources}.")
+
+    filepaths_or_buffers = []
+    string_paths = [isinstance(source, str) for source in input_sources]
+    if any(string_paths):
+        # Sources are all strings. Thes strings are typically
+        # file paths, but they may also be raw text strings.
+
+        # Don't allow a mix of source types
+        if not all(string_paths):
+            raise ValueError("Invalid input source list: {input_sources}.")
+
+        # Make sure we define a filesystem (if possible)
+        paths = input_sources
+        raw_text_input = False
         if fs is None:
-            fs, paths = _get_filesystem_and_paths(
-                path_or_data, storage_options
-            )
-            if fs is None:
-                if warn_on_raw_text_input:
-                    # Do not remove until pandas 3.0 support is added.
-                    assert (
-                        PANDAS_LT_300
-                    ), "Need to drop after pandas-3.0 support is added."
-                    warnings.warn(
-                        f"Passing literal {warn_meta[0]} to {warn_meta[1]} is "
-                        "deprecated and will be removed in a future version. "
-                        "To read from a literal string, wrap it in a "
-                        "'StringIO' object.",
-                        FutureWarning,
-                    )
-                return path_or_data, compression
+            fs, paths = _get_filesystem_and_paths(paths, storage_options)
+
+        # Expand directories (if necessary)
+        paths = _maybe_expand_directories(paths, expand_dir_pattern, fs)
 
         if _is_local_filesystem(fs):
             # Doing this as `read_json` accepts a json string
@@ -1660,7 +1669,7 @@ def get_reader_filepath_or_buffer(
 
             if len(paths):
                 if fs.exists(paths[0]):
-                    path_or_data = paths if len(paths) > 1 else paths[0]
+                    filepaths_or_buffers = paths
 
                 # raise FileNotFound if path looks like json
                 # following pandas
@@ -1670,21 +1679,40 @@ def get_reader_filepath_or_buffer(
                     tuple(f".json{c}" for c in compression_extensions)
                 ):
                     raise FileNotFoundError(
-                        f"{path_or_data} could not be resolved to any files"
+                        f"{input_sources} could not be resolved to any files"
                     )
-                elif warn_on_raw_text_input:
-                    # Do not remove until pandas 3.0 support is added.
-                    assert (
-                        PANDAS_LT_300
-                    ), "Need to drop after pandas-3.0 support is added."
-                    warnings.warn(
-                        f"Passing literal {warn_meta[0]} to {warn_meta[1]} is "
-                        "deprecated and will be removed in a future version. "
-                        "To read from a literal string, wrap it in a "
-                        "'StringIO' object.",
-                        FutureWarning,
+                else:
+                    raw_text_input = True
+            else:
+                raw_text_input = True
+
+        elif fs is not None:
+            # TODO: We can use cat_ranges and/or parquet-aware logic
+            # to copy all remote data into host memory at once here.
+            # The current solution iterates over files, and copies
+            # ALL data from each file (even when we are performing
+            # partial IO, and don't need the entire file)
+            if len(paths) == 0:
+                raise FileNotFoundError(
+                    f"{input_sources} could not be resolved to any files"
+                )
+            filepaths_or_buffers = [
+                BytesIO(
+                    _fsspec_data_transfer(
+                        fpath,
+                        fs=fs,
+                        mode=mode,
+                        bytes_per_thread=bytes_per_thread,
                     )
-            elif warn_on_raw_text_input:
+                )
+                for fpath in paths
+            ]
+        else:
+            raw_text_input = True
+
+        if raw_text_input:
+            filepaths_or_buffers = input_sources
+            if warn_on_raw_text_input:
                 # Do not remove until pandas 3.0 support is added.
                 assert (
                     PANDAS_LT_300
@@ -1697,35 +1725,25 @@ def get_reader_filepath_or_buffer(
                     FutureWarning,
                 )
 
-        else:
-            if len(paths) == 0:
-                raise FileNotFoundError(
-                    f"{path_or_data} could not be resolved to any files"
-                )
-            path_or_data = [
-                BytesIO(
-                    _fsspec_data_transfer(
-                        fpath,
-                        fs=fs,
-                        mode=mode,
-                        bytes_per_thread=bytes_per_thread,
+    else:
+        # Sources are already buffers or file-like objects
+        for source in input_sources:
+            if not isinstance(source, iotypes) and is_file_like(source):
+                if isinstance(source, TextIOWrapper):
+                    source = source.buffer
+                filepaths_or_buffers.append(
+                    BytesIO(
+                        _fsspec_data_transfer(
+                            source,
+                            mode=mode,
+                            bytes_per_thread=bytes_per_thread,
+                        )
                     )
                 )
-                for fpath in paths
-            ]
-            if len(path_or_data) == 1:
-                path_or_data = path_or_data[0]
-
-    elif not isinstance(path_or_data, iotypes) and is_file_like(path_or_data):
-        if isinstance(path_or_data, TextIOWrapper):
-            path_or_data = path_or_data.buffer
-        path_or_data = BytesIO(
-            _fsspec_data_transfer(
-                path_or_data, mode=mode, bytes_per_thread=bytes_per_thread
-            )
-        )
+            else:
+                filepaths_or_buffers.append(source)
 
-    return path_or_data, compression
+    return filepaths_or_buffers
 
 
 def get_writer_filepath_or_buffer(path_or_data, mode, storage_options=None):

From d15d470e526de205bed8808a9c15d0a4d7642667 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Mon, 26 Aug 2024 12:03:41 -0500
Subject: [PATCH 723/842] Preserve Series name in duplicated method. (#16655)

Closes #16654.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/16655
---
 python/cudf/cudf/core/indexed_frame.py | 4 +++-
 python/cudf/cudf/tests/test_series.py  | 5 +++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 60253b9ae5d..ad6aa56d472 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -3198,8 +3198,10 @@ def duplicated(self, subset=None, keep="first"):
         """
         subset = self._preprocess_subset(subset)
 
+        name = None
         if isinstance(self, cudf.Series):
             columns = [self._column]
+            name = self.name
         else:
             columns = [self._data[n] for n in subset]
         distinct = libcudf.stream_compaction.distinct_indices(
@@ -3211,7 +3213,7 @@ def duplicated(self, subset=None, keep="first"):
             [as_column(True, length=len(self), dtype=bool)],
             bounds_check=False,
         )[0]
-        return cudf.Series._from_column(result, index=self.index)
+        return cudf.Series._from_column(result, index=self.index, name=name)
 
     @_performance_tracking
     def _empty_like(self, keep_index=True) -> Self:
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index c7aea563535..8d673e23ab2 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -2115,8 +2115,9 @@ def test_series_hasnans(data):
     ],
 )
 @pytest.mark.parametrize("keep", ["first", "last", False])
-def test_series_duplicated(data, index, keep):
-    gs = cudf.Series(data, index=index)
+@pytest.mark.parametrize("name", [None, "a"])
+def test_series_duplicated(data, index, keep, name):
+    gs = cudf.Series(data, index=index, name=name)
     ps = gs.to_pandas()
 
     assert_eq(gs.duplicated(keep=keep), ps.duplicated(keep=keep))

From f5113228c3aa89d49e71d42d11c38afe52695aa6 Mon Sep 17 00:00:00 2001
From: "Marcus D. Hanwell" <mhanwell@gmail.com>
Date: Mon, 26 Aug 2024 16:19:30 -0400
Subject: [PATCH 724/842] bug-fix: Don't enable the CUDA language if testing
 was requested when finding cudf (#16615)

This PR removes CMake code enabling the CUDA language if the testing component was requested.
Closes #16614

Authors:
  - Marcus D. Hanwell (https://github.com/cryos)

Approvers:
  - Robert Maynard (https://github.com/robertmaynard)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16615
---
 cpp/CMakeLists.txt | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 6b8bb26825b..a6f72ed6b75 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -1069,23 +1069,12 @@ if(CUDF_ENABLE_ARROW_PARQUET)
   )
 endif()
 
-string(
-  APPEND
-  install_code_string
-  [=[
-if(testing IN_LIST cudf_FIND_COMPONENTS)
-  enable_language(CUDA)
-endif()
-]=]
-)
-
 rapids_export(
   INSTALL cudf
   EXPORT_SET cudf-exports ${_components_export_string}
   GLOBAL_TARGETS cudf cudftestutil
   NAMESPACE cudf::
   DOCUMENTATION doc_string
-  FINAL_CODE_BLOCK install_code_string
 )
 
 # ##################################################################################################

From c4591c06db5347ea2bf6e37ead678343042a7932 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 27 Aug 2024 09:23:43 -0400
Subject: [PATCH 725/842] Use non-mangled type names in nvbench output (#16649)

Uses the `NVBENCH_DECLARE_TYPE_STRINGS` feature to produce readable type names in the nvbench output.
Example previous output for `cudf::timestamp_ms` would appear like this:
```
| cuda::std::__4::chrono::time_point<cuda::std::__4::chrono::system_clock, cuda::std::__4::chrono::duration<long, cuda::std::__4::ratio<1l, 1000l> > > | 100000 |  23840x | 25.138 us | 21.98% | 20.979 us |  9.54% | 4.767G |  38.134 GB/s |  4.38% |
```
Adding the nvbench name feature changes this to:
```
| cudf::timestamp_ms | 100000 |  24752x | 24.387 us | 21.58% | 20.208 us | 3.79% | 4.948G |  39.588 GB/s |  4.55% |
```

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Mark Harris (https://github.com/harrism)

URL: https://github.com/rapidsai/cudf/pull/16649
---
 cpp/benchmarks/reduction/minmax.cpp | 2 ++
 cpp/benchmarks/reduction/reduce.cpp | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/cpp/benchmarks/reduction/minmax.cpp b/cpp/benchmarks/reduction/minmax.cpp
index c89e22d3f44..636de303cc4 100644
--- a/cpp/benchmarks/reduction/minmax.cpp
+++ b/cpp/benchmarks/reduction/minmax.cpp
@@ -47,6 +47,8 @@ static void reduction_minmax(nvbench::state& state, nvbench::type_list<DataType>
   set_throughputs(state);
 }
 
+NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_ms, "cudf::timestamp_ms", "cudf::timestamp_ms");
+
 using Types = nvbench::type_list<bool, int8_t, int32_t, float, cudf::timestamp_ms>;
 
 NVBENCH_BENCH_TYPES(reduction_minmax, NVBENCH_TYPE_AXES(Types))
diff --git a/cpp/benchmarks/reduction/reduce.cpp b/cpp/benchmarks/reduction/reduce.cpp
index 14bf90c4943..a30c27c519c 100644
--- a/cpp/benchmarks/reduction/reduce.cpp
+++ b/cpp/benchmarks/reduction/reduce.cpp
@@ -81,6 +81,8 @@ static void reduction(nvbench::state& state, nvbench::type_list<DataType, nvbenc
   set_throughputs(state);
 }
 
+NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_ms, "cudf::timestamp_ms", "cudf::timestamp_ms");
+
 using Types    = nvbench::type_list<int32_t, int64_t, double, cudf::timestamp_ms>;
 using AggKinds = nvbench::enum_type_list<cudf::reduce_aggregation::MIN,
                                          cudf::reduce_aggregation::SUM,

From 115ddcef6451ec7befad69affdafd6a2c8304660 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 27 Aug 2024 09:34:51 -0400
Subject: [PATCH 726/842] Fix integer overflow in indexalator pointer logic
 (#16643)

Fixes integer overflow in the indexalator logic when incrementing/decrementing its data pointer. Any sufficiently large int32 input values used in computing the byte-pointer position causes an overflow when multiplying the value by the byte-width of the underlying index type. For example, this overflow would occur when accessing rows greater than 536,870,912 with an underlying index type of int32 (4-bytes).

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Paul Mattione (https://github.com/pmattione-nvidia)

URL: https://github.com/rapidsai/cudf/pull/16643
---
 cpp/include/cudf/detail/indexalator.cuh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cpp/include/cudf/detail/indexalator.cuh b/cpp/include/cudf/detail/indexalator.cuh
index ec7b1c3e6b6..f0510c86c3a 100644
--- a/cpp/include/cudf/detail/indexalator.cuh
+++ b/cpp/include/cudf/detail/indexalator.cuh
@@ -93,7 +93,7 @@ struct input_indexalator : base_normalator<input_indexalator, cudf::size_type> {
    */
   __device__ inline cudf::size_type operator[](size_type idx) const
   {
-    void const* tp = p_ + (idx * this->width_);
+    void const* tp = p_ + (static_cast<std::ptrdiff_t>(idx) * this->width_);
     return type_dispatcher(this->dtype_, normalize_type{}, tp);
   }
 
@@ -109,7 +109,7 @@ struct input_indexalator : base_normalator<input_indexalator, cudf::size_type> {
   CUDF_HOST_DEVICE input_indexalator(void const* data, data_type dtype, cudf::size_type offset = 0)
     : base_normalator<input_indexalator, cudf::size_type>(dtype), p_{static_cast<char const*>(data)}
   {
-    p_ += offset * this->width_;
+    p_ += static_cast<std::ptrdiff_t>(offset) * this->width_;
   }
 
  protected:
@@ -165,7 +165,7 @@ struct output_indexalator : base_normalator<output_indexalator, cudf::size_type>
   __device__ inline output_indexalator const operator[](size_type idx) const
   {
     output_indexalator tmp{*this};
-    tmp.p_ += (idx * this->width_);
+    tmp.p_ += static_cast<std::ptrdiff_t>(idx) * this->width_;
     return tmp;
   }
 

From efa97704d0c1ee83d04ab59f1746194c86743656 Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Tue, 27 Aug 2024 11:02:22 -0500
Subject: [PATCH 727/842] Drop Python 3.9 support (#16637)

Contributes to https://github.com/rapidsai/build-planning/issues/88

Finishes the work of dropping Python 3.9 support.

This project stopped building / testing against Python 3.9 as of https://github.com/rapidsai/shared-workflows/pull/235.
This PR updates configuration and docs to reflect that.

## Notes for Reviewers

### How I tested this

Checked that there were no remaining uses like this:

```shell
git grep -E '3\.9'
git grep '39'
git grep 'py39'
```

And similar for variations on Python 3.8 (to catch things that were missed the last time this was done).

Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/16637
---
 README.md                                     |  2 +-
 .../all_cuda-118_arch-x86_64.yaml             |  2 +-
 .../all_cuda-125_arch-x86_64.yaml             |  2 +-
 cpp/cmake/thirdparty/get_arrow.cmake          |  2 +-
 dependencies.yaml                             |  6 +----
 python/cudf/pyproject.toml                    |  3 +--
 python/cudf_kafka/pyproject.toml              |  2 +-
 .../cudf_polars/containers/dataframe.py       | 13 +++++----
 python/cudf_polars/cudf_polars/dsl/ir.py      | 27 ++++++++++++-------
 .../cudf_polars/typing/__init__.py            |  4 +--
 .../cudf_polars/cudf_polars/utils/sorting.py  |  2 +-
 python/cudf_polars/pyproject.toml             | 12 ++++++---
 python/custreamz/pyproject.toml               |  3 +--
 python/dask_cudf/pyproject.toml               |  3 +--
 python/pylibcudf/pyproject.toml               |  3 +--
 15 files changed, 47 insertions(+), 39 deletions(-)

diff --git a/README.md b/README.md
index fd8b0365807..f1b010394d6 100644
--- a/README.md
+++ b/README.md
@@ -89,7 +89,7 @@ conda install -c rapidsai -c conda-forge -c nvidia \
 We also provide [nightly Conda packages](https://anaconda.org/rapidsai-nightly) built from the HEAD
 of our latest development branch.
 
-Note: cuDF is supported only on Linux, and with Python versions 3.9 and later.
+Note: cuDF is supported only on Linux, and with Python versions 3.10 and later.
 
 See the [RAPIDS installation guide](https://docs.rapids.ai/install) for more OS and version info.
 
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 5cf7508ba51..fcd6e27a7f6 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -76,7 +76,7 @@ dependencies:
 - pytest-xdist
 - pytest<8
 - python-confluent-kafka>=1.9.0,<1.10.0a0
-- python>=3.9,<3.12
+- python>=3.10,<3.12
 - pytorch>=2.1.0
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
 - rapids-dask-dependency==24.10.*,>=0.0.0a0
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index 28b927254f7..bedc3a90885 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -74,7 +74,7 @@ dependencies:
 - pytest-xdist
 - pytest<8
 - python-confluent-kafka>=1.9.0,<1.10.0a0
-- python>=3.9,<3.12
+- python>=3.10,<3.12
 - pytorch>=2.1.0
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
 - rapids-dask-dependency==24.10.*,>=0.0.0a0
diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake
index 0afdc526981..e3e6a07661a 100644
--- a/cpp/cmake/thirdparty/get_arrow.cmake
+++ b/cpp/cmake/thirdparty/get_arrow.cmake
@@ -45,7 +45,7 @@ function(find_libarrow_in_python_wheel PYARROW_VERSION)
     APPEND
     initial_code_block
     [=[
-find_package(Python 3.9 REQUIRED COMPONENTS Interpreter)
+find_package(Python 3.10 REQUIRED COMPONENTS Interpreter)
 execute_process(
     COMMAND "${Python_EXECUTABLE}" -c "import pyarrow; print(pyarrow.get_library_dirs()[0])"
     OUTPUT_VARIABLE CUDF_PYARROW_WHEEL_DIR
diff --git a/dependencies.yaml b/dependencies.yaml
index 194577817db..04b5940c9fb 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -584,10 +584,6 @@ dependencies:
     specific:
       - output_types: conda
         matrices:
-          - matrix:
-              py: "3.9"
-            packages:
-              - python=3.9
           - matrix:
               py: "3.10"
             packages:
@@ -598,7 +594,7 @@ dependencies:
               - python=3.11
           - matrix:
             packages:
-              - python>=3.9,<3.12
+              - python>=3.10,<3.12
   run_common:
     common:
       - output_types: [conda, requirements, pyproject]
diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml
index e7bac17f8ba..a6d26d17d46 100644
--- a/python/cudf/pyproject.toml
+++ b/python/cudf/pyproject.toml
@@ -16,7 +16,7 @@ authors = [
     { name = "NVIDIA Corporation" },
 ]
 license = { text = "Apache 2.0" }
-requires-python = ">=3.9"
+requires-python = ">=3.10"
 dependencies = [
     "cachetools",
     "cubinlinker",
@@ -42,7 +42,6 @@ classifiers = [
     "Topic :: Scientific/Engineering",
     "License :: OSI Approved :: Apache Software License",
     "Programming Language :: Python",
-    "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
 ]
diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml
index 2d0222a3fe9..01e7299a33a 100644
--- a/python/cudf_kafka/pyproject.toml
+++ b/python/cudf_kafka/pyproject.toml
@@ -16,7 +16,7 @@ authors = [
     { name = "NVIDIA Corporation" },
 ]
 license = { text = "Apache 2.0" }
-requires-python = ">=3.9"
+requires-python = ">=3.10"
 dependencies = [
     "cudf==24.10.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
diff --git a/python/cudf_polars/cudf_polars/containers/dataframe.py b/python/cudf_polars/cudf_polars/containers/dataframe.py
index 7c28e7b9a6c..a5c99e2bc11 100644
--- a/python/cudf_polars/cudf_polars/containers/dataframe.py
+++ b/python/cudf_polars/cudf_polars/containers/dataframe.py
@@ -105,7 +105,9 @@ def from_polars(cls, df: pl.DataFrame) -> Self:
         return cls(
             [
                 NamedColumn(column, h_col.name).copy_metadata(h_col)
-                for column, h_col in zip(d_table.columns(), df.iter_columns())
+                for column, h_col in zip(
+                    d_table.columns(), df.iter_columns(), strict=True
+                )
             ]
         )
 
@@ -134,8 +136,10 @@ def from_table(cls, table: plc.Table, names: Sequence[str]) -> Self:
         if table.num_columns() != len(names):
             raise ValueError("Mismatching name and table length.")
         return cls(
-            # TODO: strict=True when we drop py39
-            [NamedColumn(c, name) for c, name in zip(table.columns(), names)]
+            [
+                NamedColumn(c, name)
+                for c, name in zip(table.columns(), names, strict=True)
+            ]
         )
 
     def sorted_like(
@@ -165,8 +169,7 @@ def sorted_like(
         subset = self.column_names_set if subset is None else subset
         self.columns = [
             c.sorted_like(other) if c.name in subset else c
-            # TODO: strict=True when we drop py39
-            for c, other in zip(self.columns, like.columns)
+            for c, other in zip(self.columns, like.columns, strict=True)
         ]
         return self
 
diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py
index 019f00f4fca..ebc7dee6bfb 100644
--- a/python/cudf_polars/cudf_polars/dsl/ir.py
+++ b/python/cudf_polars/cudf_polars/dsl/ir.py
@@ -310,7 +310,8 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
                 *(
                     (piece.tbl, piece.column_names(include_children=False))
                     for piece in pieces
-                )
+                ),
+                strict=True,
             )
             df = DataFrame.from_table(
                 plc.concatenate.concatenate(list(tables)),
@@ -426,7 +427,8 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
             pdf = pdf.select(self.projection)
         df = DataFrame.from_polars(pdf)
         assert all(
-            c.obj.type() == dtype for c, dtype in zip(df.columns, self.schema.values())
+            c.obj.type() == dtype
+            for c, dtype in zip(df.columns, self.schema.values(), strict=True)
         )
         if self.predicate is not None:
             (mask,) = broadcast(self.predicate.evaluate(df), target_length=df.num_rows)
@@ -600,9 +602,10 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         for i, table in enumerate(raw_tables):
             (column,) = table.columns()
             raw_columns.append(NamedColumn(column, f"tmp{i}"))
-        mapping = dict(zip(replacements, raw_columns))
+        mapping = dict(zip(replacements, raw_columns, strict=True))
         result_keys = [
-            NamedColumn(gk, k.name) for gk, k in zip(group_keys.columns(), keys)
+            NamedColumn(gk, k.name)
+            for gk, k in zip(group_keys.columns(), keys, strict=True)
         ]
         result_subs = DataFrame(raw_columns)
         results = [
@@ -752,7 +755,9 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
             columns = plc.join.cross_join(left.table, right.table).columns()
             left_cols = [
                 NamedColumn(new, old.name).sorted_like(old)
-                for new, old in zip(columns[: left.num_columns], left.columns)
+                for new, old in zip(
+                    columns[: left.num_columns], left.columns, strict=True
+                )
             ]
             right_cols = [
                 NamedColumn(
@@ -761,7 +766,9 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
                     if old.name not in left.column_names_set
                     else f"{old.name}{suffix}",
                 )
-                for new, old in zip(columns[left.num_columns :], right.columns)
+                for new, old in zip(
+                    columns[left.num_columns :], right.columns, strict=True
+                )
             ]
             return DataFrame([*left_cols, *right_cols])
         # TODO: Waiting on clarity based on https://github.com/pola-rs/polars/issues/17184
@@ -803,6 +810,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
                         for left_col, right_col in zip(
                             left.select_columns(left_on.column_names_set),
                             right.select_columns(right_on.column_names_set),
+                            strict=True,
                         )
                     )
                 )
@@ -909,7 +917,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
         result = DataFrame(
             [
                 NamedColumn(c, old.name).sorted_like(old)
-                for c, old in zip(table.columns(), df.columns)
+                for c, old in zip(table.columns(), df.columns, strict=True)
             ]
         )
         if keys_sorted or self.stable:
@@ -974,7 +982,8 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
             self.null_order,
         )
         columns = [
-            NamedColumn(c, old.name) for c, old in zip(table.columns(), df.columns)
+            NamedColumn(c, old.name)
+            for c, old in zip(table.columns(), df.columns, strict=True)
         ]
         # If a sort key is in the result table, set the sortedness property
         for k, i in enumerate(keys_in_result):
@@ -1089,7 +1098,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame:
             # final tag is "swapping" which is useful for the
             # optimiser (it blocks some pushdown operations)
             old, new, _ = self.options
-            return df.rename_columns(dict(zip(old, new)))
+            return df.rename_columns(dict(zip(old, new, strict=True)))
         elif self.name == "explode":
             df = self.df.evaluate(cache=cache)
             ((to_explode,),) = self.options
diff --git a/python/cudf_polars/cudf_polars/typing/__init__.py b/python/cudf_polars/cudf_polars/typing/__init__.py
index 02440e67fde..5276073e62a 100644
--- a/python/cudf_polars/cudf_polars/typing/__init__.py
+++ b/python/cudf_polars/cudf_polars/typing/__init__.py
@@ -13,9 +13,7 @@
 from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir
 
 if TYPE_CHECKING:
-    from typing import Callable
-
-    from typing_extensions import TypeAlias
+    from typing import Callable, TypeAlias
 
     import polars as pl
 
diff --git a/python/cudf_polars/cudf_polars/utils/sorting.py b/python/cudf_polars/cudf_polars/utils/sorting.py
index 17ea44e5b1b..6ce216cbf8f 100644
--- a/python/cudf_polars/cudf_polars/utils/sorting.py
+++ b/python/cudf_polars/cudf_polars/utils/sorting.py
@@ -45,7 +45,7 @@ def sort_order(
     null_precedence = []
     if len(descending) != len(nulls_last) or len(descending) != num_keys:
         raise ValueError("Mismatching length of arguments in sort_order")
-    for asc, null_last in zip(column_order, nulls_last):
+    for asc, null_last in zip(column_order, nulls_last, strict=True):
         if (asc == plc.types.Order.ASCENDING) ^ (not null_last):
             null_precedence.append(plc.types.NullOrder.AFTER)
         elif (asc == plc.types.Order.ASCENDING) ^ null_last:
diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml
index c380853035d..0382e3ce6a2 100644
--- a/python/cudf_polars/pyproject.toml
+++ b/python/cudf_polars/pyproject.toml
@@ -17,7 +17,7 @@ authors = [
     { name = "NVIDIA Corporation" },
 ]
 license = { text = "Apache 2.0" }
-requires-python = ">=3.9"
+requires-python = ">=3.10"
 dependencies = [
     "polars>=1.0,<1.3",
     "pylibcudf==24.10.*,>=0.0.0a0",
@@ -28,7 +28,6 @@ classifiers = [
     "Topic :: Scientific/Engineering",
     "License :: OSI Approved :: Apache Software License",
     "Programming Language :: Python",
-    "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
 ]
@@ -62,7 +61,7 @@ exclude_also = [
 [tool.ruff]
 line-length = 88
 indent-width = 4
-target-version = "py39"
+target-version = "py310"
 fix = true
 
 [tool.ruff.lint]
@@ -115,6 +114,9 @@ ignore = [
   "TD003", # Missing issue link on the line following this TODO
   # tryceratops
   "TRY003", # Avoid specifying long messages outside the exception class
+  # pyupgrade
+  "UP035",  # Import from `collections.abc` instead: `Callable`
+  "UP038",  # Use `X | Y` in `isinstance` call instead of `(X, Y)`
   # Lints below are turned off because of conflicts with the ruff
   # formatter
   # See https://docs.astral.sh/ruff/formatter/#conflicting-lint-rules
@@ -137,6 +139,10 @@ fixable = ["ALL"]
 
 [tool.ruff.lint.per-file-ignores]
 "**/tests/**/*.py" = ["D"]
+"**/cudf_polars/typing/__init__.py" = [
+  # pyupgrade
+  "UP007", # Use `X | Y` for type annotations
+]
 
 [tool.ruff.lint.flake8-pytest-style]
 # https://docs.astral.sh/ruff/settings/#lintflake8-pytest-style
diff --git a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml
index d6b88167262..be5331236a5 100644
--- a/python/custreamz/pyproject.toml
+++ b/python/custreamz/pyproject.toml
@@ -17,7 +17,7 @@ authors = [
     { name = "NVIDIA Corporation" },
 ]
 license = { text = "Apache 2.0" }
-requires-python = ">=3.9"
+requires-python = ">=3.10"
 dependencies = [
     "confluent-kafka>=1.9.0,<1.10.0a0",
     "cudf==24.10.*,>=0.0.0a0",
@@ -31,7 +31,6 @@ classifiers = [
     "Topic :: Apache Kafka",
     "License :: OSI Approved :: Apache Software License",
     "Programming Language :: Python",
-    "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
 ]
diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml
index d5da7030a75..93bf532d67f 100644
--- a/python/dask_cudf/pyproject.toml
+++ b/python/dask_cudf/pyproject.toml
@@ -17,7 +17,7 @@ authors = [
     { name = "NVIDIA Corporation" },
 ]
 license = { text = "Apache 2.0" }
-requires-python = ">=3.9"
+requires-python = ">=3.10"
 dependencies = [
     "cudf==24.10.*,>=0.0.0a0",
     "cupy-cuda11x>=12.0.0",
@@ -32,7 +32,6 @@ classifiers = [
     "Topic :: Scientific/Engineering",
     "License :: OSI Approved :: Apache Software License",
     "Programming Language :: Python",
-    "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
 ]
diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml
index 5f5594b462b..0d673ea4cc3 100644
--- a/python/pylibcudf/pyproject.toml
+++ b/python/pylibcudf/pyproject.toml
@@ -16,7 +16,7 @@ authors = [
     { name = "NVIDIA Corporation" },
 ]
 license = { text = "Apache 2.0" }
-requires-python = ">=3.9"
+requires-python = ">=3.10"
 dependencies = [
     "cuda-python>=11.7.1,<12.0a0",
     "libcudf==24.10.*,>=0.0.0a0",
@@ -32,7 +32,6 @@ classifiers = [
     "Topic :: Scientific/Engineering",
     "License :: OSI Approved :: Apache Software License",
     "Programming Language :: Python",
-    "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
 ]

From f1cc962df38b1fc113b579bef57a27f93d11cec2 Mon Sep 17 00:00:00 2001
From: Jayjeet Chakraborty <jc.github@rediffmail.com>
Date: Tue, 27 Aug 2024 09:16:39 -0700
Subject: [PATCH 728/842] Fix `cudf::rank` not getting enough params (#16666)

Fix issue #16624

Authors:
  - Jayjeet Chakraborty (https://github.com/JayjeetAtGithub)

Approvers:
  - Mark Harris (https://github.com/harrism)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16666
---
 cpp/benchmarks/sort/rank_lists.cpp   | 2 ++
 cpp/benchmarks/sort/rank_structs.cpp | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/cpp/benchmarks/sort/rank_lists.cpp b/cpp/benchmarks/sort/rank_lists.cpp
index fbdb40b3537..7015fe08089 100644
--- a/cpp/benchmarks/sort/rank_lists.cpp
+++ b/cpp/benchmarks/sort/rank_lists.cpp
@@ -37,6 +37,8 @@ void nvbench_rank_lists(nvbench::state& state, nvbench::type_list<nvbench::enum_
                cudf::order::ASCENDING,
                null_frequency ? cudf::null_policy::INCLUDE : cudf::null_policy::EXCLUDE,
                cudf::null_order::AFTER,
+               false,
+               cudf::get_default_stream(),
                rmm::mr::get_current_device_resource());
   });
 }
diff --git a/cpp/benchmarks/sort/rank_structs.cpp b/cpp/benchmarks/sort/rank_structs.cpp
index 4b0da29df9d..8b4b09464d8 100644
--- a/cpp/benchmarks/sort/rank_structs.cpp
+++ b/cpp/benchmarks/sort/rank_structs.cpp
@@ -35,6 +35,8 @@ void nvbench_rank_structs(nvbench::state& state, nvbench::type_list<nvbench::enu
                cudf::order::ASCENDING,
                nulls ? cudf::null_policy::INCLUDE : cudf::null_policy::EXCLUDE,
                cudf::null_order::AFTER,
+               false,
+               cudf::get_default_stream(),
                rmm::mr::get_current_device_resource());
   });
 }

From 2d494ed7860c4c3295c5a9f4dc3a605565f30494 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Tue, 27 Aug 2024 09:30:16 -0700
Subject: [PATCH 729/842] Add `num_multiprocessors` utility (#16628)

This PR introduces a new `num_multiprocessors` utility and moves the existing `elements_per_thread` host utility to the new `cuda.hpp` header.

Needed by #16619.

Authors:
  - Yunsong Wang (https://github.com/PointKernel)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Mark Harris (https://github.com/harrism)

URL: https://github.com/rapidsai/cudf/pull/16628
---
 cpp/CMakeLists.txt                            |  1 +
 cpp/benchmarks/join/generate_input_tables.cuh | 10 +---
 cpp/include/cudf/detail/copy_if.cuh           |  1 +
 cpp/include/cudf/detail/utilities/cuda.cuh    | 29 ---------
 cpp/include/cudf/detail/utilities/cuda.hpp    | 59 +++++++++++++++++++
 cpp/src/io/comp/debrotli.cu                   | 18 +++---
 cpp/src/utilities/cuda.cpp                    | 34 +++++++++++
 7 files changed, 105 insertions(+), 47 deletions(-)
 create mode 100644 cpp/include/cudf/detail/utilities/cuda.hpp
 create mode 100644 cpp/src/utilities/cuda.cpp

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index a6f72ed6b75..4080c5d02da 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -666,6 +666,7 @@ add_library(
   src/unary/math_ops.cu
   src/unary/nan_ops.cu
   src/unary/null_ops.cu
+  src/utilities/cuda.cpp
   src/utilities/cuda_memcpy.cu
   src/utilities/default_stream.cpp
   src/utilities/host_memory.cpp
diff --git a/cpp/benchmarks/join/generate_input_tables.cuh b/cpp/benchmarks/join/generate_input_tables.cuh
index f7984b29d6b..75bbe8174d3 100644
--- a/cpp/benchmarks/join/generate_input_tables.cuh
+++ b/cpp/benchmarks/join/generate_input_tables.cuh
@@ -17,6 +17,7 @@
 #pragma once
 
 #include <cudf/detail/utilities/cuda.cuh>
+#include <cudf/detail/utilities/cuda.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
@@ -150,13 +151,8 @@ void generate_input_tables(key_type* const build_tbl,
   CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
     &num_blocks_init_probe_tbl, init_probe_tbl<key_type, size_type>, block_size, 0));
 
-  int dev_id{-1};
-  CUDF_CUDA_TRY(cudaGetDevice(&dev_id));
-
-  int num_sms{-1};
-  CUDF_CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id));
-
-  int const num_states =
+  auto const num_sms = cudf::detail::num_multiprocessors();
+  auto const num_states =
     num_sms * std::max(num_blocks_init_build_tbl, num_blocks_init_probe_tbl) * block_size;
   rmm::device_uvector<curandState> devStates(num_states, cudf::get_default_stream());
 
diff --git a/cpp/include/cudf/detail/copy_if.cuh b/cpp/include/cudf/detail/copy_if.cuh
index b6310e6cd2f..4071fa01fb2 100644
--- a/cpp/include/cudf/detail/copy_if.cuh
+++ b/cpp/include/cudf/detail/copy_if.cuh
@@ -22,6 +22,7 @@
 #include <cudf/detail/gather.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/utilities/cuda.cuh>
+#include <cudf/detail/utilities/cuda.hpp>
 #include <cudf/null_mask.hpp>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/table/table.hpp>
diff --git a/cpp/include/cudf/detail/utilities/cuda.cuh b/cpp/include/cudf/detail/utilities/cuda.cuh
index 5007af7f9f1..d31ca3d92d1 100644
--- a/cpp/include/cudf/detail/utilities/cuda.cuh
+++ b/cpp/include/cudf/detail/utilities/cuda.cuh
@@ -189,35 +189,6 @@ __device__ T single_lane_block_sum_reduce(T lane_value)
   return result;
 }
 
-/**
- * @brief Get the number of elements that can be processed per thread.
- *
- * @param[in] kernel The kernel for which the elements per thread needs to be assessed
- * @param[in] total_size Number of elements
- * @param[in] block_size Expected block size
- *
- * @return cudf::size_type Elements per thread that can be processed for given specification.
- */
-template <typename Kernel>
-cudf::size_type elements_per_thread(Kernel kernel,
-                                    cudf::size_type total_size,
-                                    cudf::size_type block_size,
-                                    cudf::size_type max_per_thread = 32)
-{
-  CUDF_FUNC_RANGE();
-
-  // calculate theoretical occupancy
-  int max_blocks = 0;
-  CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks, kernel, block_size, 0));
-
-  int device = 0;
-  CUDF_CUDA_TRY(cudaGetDevice(&device));
-  int num_sms = 0;
-  CUDF_CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, device));
-  int per_thread = total_size / (max_blocks * num_sms * block_size);
-  return std::clamp(per_thread, 1, max_per_thread);
-}
-
 /**
  * @brief Finds the smallest value not less than `number_to_round` and modulo `modulus` is
  * zero. Expects modulus to be a power of 2.
diff --git a/cpp/include/cudf/detail/utilities/cuda.hpp b/cpp/include/cudf/detail/utilities/cuda.hpp
new file mode 100644
index 00000000000..58c7ae8ed6a
--- /dev/null
+++ b/cpp/include/cudf/detail/utilities/cuda.hpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/types.hpp>
+#include <cudf/utilities/error.hpp>
+
+#include <algorithm>
+
+namespace CUDF_EXPORT cudf {
+namespace detail {
+
+/**
+ * @brief Get the number of multiprocessors on the device
+ */
+cudf::size_type num_multiprocessors();
+
+/**
+ * @brief Get the number of elements that can be processed per thread.
+ *
+ * @param[in] kernel The kernel for which the elements per thread needs to be assessed
+ * @param[in] total_size Number of elements
+ * @param[in] block_size Expected block size
+ *
+ * @return cudf::size_type Elements per thread that can be processed for given specification.
+ */
+template <typename Kernel>
+cudf::size_type elements_per_thread(Kernel kernel,
+                                    cudf::size_type total_size,
+                                    cudf::size_type block_size,
+                                    cudf::size_type max_per_thread = 32)
+{
+  CUDF_FUNC_RANGE();
+
+  // calculate theoretical occupancy
+  int max_blocks = 0;
+  CUDF_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_blocks, kernel, block_size, 0));
+
+  int per_thread = total_size / (max_blocks * num_multiprocessors() * block_size);
+  return std::clamp(per_thread, 1, max_per_thread);
+}
+
+}  // namespace detail
+}  // namespace CUDF_EXPORT cudf
diff --git a/cpp/src/io/comp/debrotli.cu b/cpp/src/io/comp/debrotli.cu
index 861820f47e7..72649dbe427 100644
--- a/cpp/src/io/comp/debrotli.cu
+++ b/cpp/src/io/comp/debrotli.cu
@@ -58,6 +58,7 @@ THE SOFTWARE.
 #include "gpuinflate.hpp"
 #include "io/utilities/block_utils.cuh"
 
+#include <cudf/detail/utilities/cuda.hpp>
 #include <cudf/utilities/error.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -2047,19 +2048,14 @@ CUDF_KERNEL void __launch_bounds__(block_size, 2)
  */
 size_t __host__ get_gpu_debrotli_scratch_size(int max_num_inputs)
 {
-  int sm_count = 0;
-  int dev      = 0;
   uint32_t max_fb_size, min_fb_size, fb_size;
-  CUDF_CUDA_TRY(cudaGetDevice(&dev));
-  if (cudaSuccess == cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev)) {
-    // printf("%d SMs on device %d\n", sm_count, dev);
-    max_num_inputs =
-      min(max_num_inputs, sm_count * 3);  // no more than 3 blocks/sm at most due to 32KB smem use
-    if (max_num_inputs <= 0) {
-      max_num_inputs = sm_count * 2;  // Target 2 blocks/SM by default for scratch mem computation
-    }
+  auto const sm_count = cudf::detail::num_multiprocessors();
+  // no more than 3 blocks/sm at most due to 32KB smem use
+  max_num_inputs = std::min(max_num_inputs, sm_count * 3);
+  if (max_num_inputs <= 0) {
+    max_num_inputs = sm_count * 2;  // Target 2 blocks/SM by default for scratch mem computation
   }
-  max_num_inputs = min(max(max_num_inputs, 1), 512);
+  max_num_inputs = std::min(std::max(max_num_inputs, 1), 512);
   // Max fb size per block occurs if all huffman tables for all 3 group types fail local_alloc()
   // with num_htrees=256 (See HuffmanTreeGroupAlloc)
   max_fb_size = 256 * (630 + 1080 + 920) * 2;  // 1.3MB
diff --git a/cpp/src/utilities/cuda.cpp b/cpp/src/utilities/cuda.cpp
new file mode 100644
index 00000000000..53ca0608170
--- /dev/null
+++ b/cpp/src/utilities/cuda.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/detail/utilities/cuda.hpp>
+#include <cudf/types.hpp>
+#include <cudf/utilities/error.hpp>
+
+#include <cuda_runtime.h>
+
+namespace cudf::detail {
+
+cudf::size_type num_multiprocessors()
+{
+  int device = 0;
+  CUDF_CUDA_TRY(cudaGetDevice(&device));
+  int num_sms = 0;
+  CUDF_CUDA_TRY(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, device));
+  return num_sms;
+}
+
+}  // namespace cudf::detail

From dd585e84756992bee0ecbae6f77107d64cddaede Mon Sep 17 00:00:00 2001
From: Kyle Edwards <kyedwards@nvidia.com>
Date: Tue, 27 Aug 2024 12:58:02 -0400
Subject: [PATCH 730/842] Prune workflows based on changed files (#16642)

Only run tests based on things that have actually changed. For example, if only Python files have changed, we don't need to run the C++ tests.

Contributes to https://github.com/rapidsai/build-planning/issues/94

Authors:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Robert Maynard (https://github.com/robertmaynard)

URL: https://github.com/rapidsai/cudf/pull/16642
---
 .github/workflows/pr.yaml | 88 ++++++++++++++++++++++++++++++++++-----
 1 file changed, 78 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 2e2a8b6b9bc..35c7e3d95b6 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -12,6 +12,7 @@ concurrency:
 jobs:
   pr-builder:
     needs:
+      - changed-files
       - checks
       - conda-cpp-build
       - conda-cpp-checks
@@ -37,6 +38,63 @@ jobs:
       - pandas-tests-diff
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.10
+    if: always()
+    with:
+      needs: ${{ toJSON(needs) }}
+  changed-files:
+    runs-on: ubuntu-latest
+    name: "Check changed files"
+    outputs:
+      test_cpp: ${{ steps.changed-files.outputs.cpp_any_changed == 'true' }}
+      test_java: ${{ steps.changed-files.outputs.java_any_changed == 'true' }}
+      test_notebooks: ${{ steps.changed-files.outputs.notebooks_any_changed == 'true' }}
+      test_python: ${{ steps.changed-files.outputs.python_any_changed == 'true' }}
+    steps:
+      - name: Get PR info
+        id: get-pr-info
+        uses: rapidsai/shared-actions/get-pr-info@main
+      - name: Checkout code repo
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.sha }}
+          fetch-depth: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).commits }}
+          persist-credentials: false
+      - name: Get changed files
+        id: changed-files
+        uses: tj-actions/changed-files@v45
+        with:
+          base_sha: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.sha }}
+          files_yaml: |
+            cpp:
+              - '**'
+              - '!CONTRIBUTING.md'
+              - '!README.md'
+              - '!docs/**'
+              - '!img/**'
+              - '!java/**'
+              - '!notebooks/**'
+              - '!python/**'
+            java:
+              - '**'
+              - '!CONTRIBUTING.md'
+              - '!README.md'
+              - '!docs/**'
+              - '!img/**'
+              - '!notebooks/**'
+              - '!python/**'
+            notebooks:
+              - '**'
+              - '!CONTRIBUTING.md'
+              - '!README.md'
+              - '!java/**'
+            python:
+              - '**'
+              - '!CONTRIBUTING.md'
+              - '!README.md'
+              - '!docs/**'
+              - '!img/**'
+              - '!java/**'
+              - '!notebooks/**'
   checks:
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.10
@@ -56,9 +114,10 @@ jobs:
       build_type: pull-request
       enable_check_symbols: true
   conda-cpp-tests:
-    needs: conda-cpp-build
+    needs: [conda-cpp-build, changed-files]
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10
+    if: needs.changed-files.outputs.test_cpp == 'true'
     with:
       build_type: pull-request
   conda-python-build:
@@ -68,24 +127,27 @@ jobs:
     with:
       build_type: pull-request
   conda-python-cudf-tests:
-    needs: conda-python-build
+    needs: [conda-python-build, changed-files]
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10
+    if: needs.changed-files.outputs.test_python == 'true'
     with:
       build_type: pull-request
       script: "ci/test_python_cudf.sh"
   conda-python-other-tests:
     # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism
-    needs: conda-python-build
+    needs: [conda-python-build, changed-files]
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10
+    if: needs.changed-files.outputs.test_python == 'true'
     with:
       build_type: pull-request
       script: "ci/test_python_other.sh"
   conda-java-tests:
-    needs: conda-cpp-build
+    needs: [conda-cpp-build, changed-files]
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
+    if: needs.changed-files.outputs.test_java == 'true'
     with:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
@@ -103,9 +165,10 @@ jobs:
       container_image: "rapidsai/ci-wheel:latest"
       run_script: "ci/configure_cpp_static.sh"
   conda-notebook-tests:
-    needs: conda-python-build
+    needs: [conda-python-build, changed-files]
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
+    if: needs.changed-files.outputs.test_notebooks == 'true'
     with:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
@@ -145,9 +208,10 @@ jobs:
       build_type: pull-request
       script: "ci/build_wheel_cudf.sh"
   wheel-tests-cudf:
-    needs: wheel-build-cudf
+    needs: [wheel-build-cudf, changed-files]
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
+    if: needs.changed-files.outputs.test_python == 'true'
     with:
       build_type: pull-request
       script: ci/test_wheel_cudf.sh
@@ -161,9 +225,10 @@ jobs:
       build_type: pull-request
       script: "ci/build_wheel_cudf_polars.sh"
   wheel-tests-cudf-polars:
-    needs: wheel-build-cudf-polars
+    needs: [wheel-build-cudf-polars, changed-files]
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
+    if: needs.changed-files.outputs.test_python == 'true'
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
@@ -181,9 +246,10 @@ jobs:
       build_type: pull-request
       script: "ci/build_wheel_dask_cudf.sh"
   wheel-tests-dask-cudf:
-    needs: wheel-build-dask-cudf
+    needs: [wheel-build-dask-cudf, changed-files]
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
+    if: needs.changed-files.outputs.test_python == 'true'
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
@@ -200,9 +266,10 @@ jobs:
         build-all -DBUILD_BENCHMARKS=ON --verbose;
         sccache -s;
   unit-tests-cudf-pandas:
-    needs: wheel-build-cudf
+    needs: [wheel-build-cudf, changed-files]
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
+    if: needs.changed-files.outputs.test_python == 'true'
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
@@ -210,9 +277,10 @@ jobs:
       script: ci/cudf_pandas_scripts/run_tests.sh
   pandas-tests:
     # run the Pandas unit tests using PR branch
-    needs: wheel-build-cudf
+    needs: [wheel-build-cudf, changed-files]
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
+    if: needs.changed-files.outputs.test_python == 'true'
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))

From 6747d2dc9d0deb4585b6306fed8a41bdf65e5558 Mon Sep 17 00:00:00 2001
From: Kyle Edwards <kyedwards@nvidia.com>
Date: Tue, 27 Aug 2024 14:48:14 -0400
Subject: [PATCH 731/842] Update rapidsai/pre-commit-hooks (#16669)

This PR updates rapidsai/pre-commit-hooks to the version 0.4.0.

Authors:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)

Approvers:
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/cudf/pull/16669
---
 .pre-commit-config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 1b17eae0842..f861fb57916 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -144,7 +144,7 @@ repos:
       - id: ruff-format
         files: python/.*$
   - repo: https://github.com/rapidsai/pre-commit-hooks
-    rev: v0.3.1
+    rev: v0.4.0
     hooks:
       - id: verify-copyright
         exclude: |

From 1a2aad27b7e136f87be80debed6da7d3528ebda1 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Tue, 27 Aug 2024 13:33:47 -0700
Subject: [PATCH 732/842] Remove arrow dependency (#16640)

This PR removes libarrow as a dependency of libcudf since we no longer use any of its APIs in our C++ code. The following places remain dependent on libarrow:
- tests: We have tests demonstrating how to interoperate with libarrow objects, as well as other tests that leverage Arrow for I/O.
- examples: We have an example demonstrating interop with libarrow arrays.
- JNI: The JNI is still using libarrow to handle ingestion or production of Arrow buffers.

In all three cases above, we are now statically linking libarrow. We also always pull it in via CPM, which means that we never require libarrow to exist on the user's system anymore. Of the above three cases, we should expect the first two to persist indefinitely. The JNI could be updated to use nanoarrow instead if desired, but that is not critical since the primary benefit of removing libarrow as a direct dependency is to remove it as a constraint for package managers such as conda in environments where we must match the version of Arrow required by other dependencies.

pyarrow remains a dependency of the cudf Python packages. For now, this PR retains the tight pinning on 16.1 since we know that this version works. A future PR will loosen this pinning since we are no longer constrained to ABI-compatible versions and can support a range of pyarrow versions that support the necessary Python APIs (I believe pyarrow>=13 will work, but that remains to be tested).

Resolves #15193

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - James Lamb (https://github.com/jameslamb)
  - Robert Maynard (https://github.com/robertmaynard)
  - https://github.com/jakirkham
  - MithunR (https://github.com/mythrocks)

URL: https://github.com/rapidsai/cudf/pull/16640
---
 ci/build_wheel_cudf.sh                        |   1 -
 ci/build_wheel_libcudf.sh                     |   2 +-
 ci/build_wheel_pylibcudf.sh                   |   1 -
 .../all_cuda-118_arch-x86_64.yaml             |   6 -
 .../all_cuda-125_arch-x86_64.yaml             |   6 -
 conda/recipes/cudf/meta.yaml                  |   4 +-
 conda/recipes/libcudf/conda_build_config.yaml |   3 -
 conda/recipes/libcudf/meta.yaml               |   2 -
 conda/recipes/pylibcudf/meta.yaml             |   4 +-
 cpp/CMakeLists.txt                            |  27 +-
 cpp/cmake/thirdparty/get_arrow.cmake          | 285 +++++++-----------
 cpp/examples/interop/CMakeLists.txt           |   7 +
 cpp/tests/CMakeLists.txt                      |  23 +-
 dependencies.yaml                             |  42 +--
 java/src/main/native/CMakeLists.txt           |   4 +
 python/cudf/CMakeLists.txt                    |   1 -
 python/cudf/cudf/_lib/CMakeLists.txt          |   3 -
 python/cudf/cudf/_lib/io/CMakeLists.txt       |   2 -
 python/cudf/pyproject.toml                    |   2 -
 .../cudf_kafka/cudf_kafka/_lib/CMakeLists.txt |   2 -
 python/cudf_kafka/pyproject.toml              |   2 -
 python/libcudf/CMakeLists.txt                 |   9 +-
 python/libcudf/libcudf/load.py                |   4 -
 python/libcudf/pyproject.toml                 |   1 -
 python/pylibcudf/CMakeLists.txt               |   1 -
 .../cmake/Modules/LinkPyarrowHeaders.cmake    |  40 ---
 python/pylibcudf/pylibcudf/io/CMakeLists.txt  |   5 -
 .../pylibcudf/libcudf/io/CMakeLists.txt       |   3 -
 python/pylibcudf/pyproject.toml               |   4 +-
 29 files changed, 145 insertions(+), 351 deletions(-)
 delete mode 100644 python/pylibcudf/cmake/Modules/LinkPyarrowHeaders.cmake

diff --git a/ci/build_wheel_cudf.sh b/ci/build_wheel_cudf.sh
index cf33703f544..e5565c4b53c 100755
--- a/ci/build_wheel_cudf.sh
+++ b/ci/build_wheel_cudf.sh
@@ -22,7 +22,6 @@ export PIP_CONSTRAINT="/tmp/constraints.txt"
 
 python -m auditwheel repair \
     --exclude libcudf.so \
-    --exclude libarrow.so.1601 \
     --exclude libnvcomp.so \
     --exclude libnvcomp_bitcomp.so \
     --exclude libnvcomp_gdeflate.so \
diff --git a/ci/build_wheel_libcudf.sh b/ci/build_wheel_libcudf.sh
index 9694c3f6144..8975381ceba 100755
--- a/ci/build_wheel_libcudf.sh
+++ b/ci/build_wheel_libcudf.sh
@@ -10,6 +10,6 @@ package_dir="python/libcudf"
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
 
 mkdir -p ${package_dir}/final_dist
-python -m auditwheel repair --exclude libarrow.so.1601 -w ${package_dir}/final_dist ${package_dir}/dist/*
+python -m auditwheel repair -w ${package_dir}/final_dist ${package_dir}/dist/*
 
 RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 cpp ${package_dir}/final_dist
diff --git a/ci/build_wheel_pylibcudf.sh b/ci/build_wheel_pylibcudf.sh
index 7181a49d397..0e4745bda28 100755
--- a/ci/build_wheel_pylibcudf.sh
+++ b/ci/build_wheel_pylibcudf.sh
@@ -20,7 +20,6 @@ export PIP_CONSTRAINT="/tmp/constraints.txt"
 
 python -m auditwheel repair \
     --exclude libcudf.so \
-    --exclude libarrow.so.1601 \
     --exclude libnvcomp.so \
     --exclude libnvcomp_bitcomp.so \
     --exclude libnvcomp_gdeflate.so \
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index fcd6e27a7f6..96596958636 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -37,15 +37,11 @@ dependencies:
 - hypothesis
 - identify>=2.5.20
 - ipython
-- libarrow-acero==16.1.0.*
-- libarrow-dataset==16.1.0.*
-- libarrow==16.1.0.*
 - libcufile-dev=1.4.0.31
 - libcufile=1.4.0.31
 - libcurand-dev=10.3.0.86
 - libcurand=10.3.0.86
 - libkvikio==24.10.*,>=0.0.0a0
-- libparquet==16.1.0.*
 - librdkafka>=1.9.0,<1.10.0a0
 - librmm==24.10.*,>=0.0.0a0
 - make
@@ -56,7 +52,6 @@ dependencies:
 - ninja
 - notebook
 - numba>=0.57
-- numpy
 - numpy>=1.23,<3.0a0
 - numpydoc
 - nvcc_linux-64=11.8
@@ -68,7 +63,6 @@ dependencies:
 - pandoc
 - pre-commit
 - ptxcompiler
-- pyarrow==16.1.0.*
 - pydata-sphinx-theme!=0.14.2
 - pytest-benchmark
 - pytest-cases>=3.8.2
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index bedc3a90885..efc5f76b90f 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -38,13 +38,9 @@ dependencies:
 - hypothesis
 - identify>=2.5.20
 - ipython
-- libarrow-acero==16.1.0.*
-- libarrow-dataset==16.1.0.*
-- libarrow==16.1.0.*
 - libcufile-dev
 - libcurand-dev
 - libkvikio==24.10.*,>=0.0.0a0
-- libparquet==16.1.0.*
 - librdkafka>=1.9.0,<1.10.0a0
 - librmm==24.10.*,>=0.0.0a0
 - make
@@ -55,7 +51,6 @@ dependencies:
 - ninja
 - notebook
 - numba>=0.57
-- numpy
 - numpy>=1.23,<3.0a0
 - numpydoc
 - nvcomp==3.0.6
@@ -65,7 +60,6 @@ dependencies:
 - pandas>=2.0,<2.2.3dev0
 - pandoc
 - pre-commit
-- pyarrow==16.1.0.*
 - pydata-sphinx-theme!=0.14.2
 - pynvjitlink>=0.0.0a0
 - pytest-benchmark
diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml
index b2dad767da4..53f52a35651 100644
--- a/conda/recipes/cudf/meta.yaml
+++ b/conda/recipes/cudf/meta.yaml
@@ -64,8 +64,6 @@ requirements:
     - rapids-build-backend >=0.3.0,<0.4.0.dev0
     - scikit-build-core >=0.10.0
     - dlpack >=0.8,<1.0
-    - numpy 2.0
-    - pyarrow ==16.1.0.*
     - libcudf ={{ version }}
     - pylibcudf ={{ version }}
     - rmm ={{ minor_version }}
@@ -84,7 +82,7 @@ requirements:
     - cupy >=12.0.0
     - numba >=0.57
     - numpy >=1.23,<3.0a0
-    - {{ pin_compatible('pyarrow', max_pin='x.x') }}
+    - pyarrow ==16.1.0.*
     - libcudf ={{ version }}
     - pylibcudf ={{ version }}
     - {{ pin_compatible('rmm', max_pin='x.x') }}
diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml
index ff7458caf82..4b1c4cca828 100644
--- a/conda/recipes/libcudf/conda_build_config.yaml
+++ b/conda/recipes/libcudf/conda_build_config.yaml
@@ -19,9 +19,6 @@ c_stdlib_version:
 cmake_version:
   - ">=3.26.4,!=3.30.0"
 
-libarrow_version:
-  - "==16.1.0"
-
 dlpack_version:
   - ">=0.8,<1.0"
 
diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml
index aa1c94a4bca..1c2e9e8dd98 100644
--- a/conda/recipes/libcudf/meta.yaml
+++ b/conda/recipes/libcudf/meta.yaml
@@ -64,7 +64,6 @@ requirements:
     {% endif %}
     - cuda-version ={{ cuda_version }}
     - nvcomp {{ nvcomp_version }}
-    - libarrow {{ libarrow_version }}
     - dlpack {{ dlpack_version }}
     - librdkafka {{ librdkafka_version }}
     - fmt {{ fmt_version }}
@@ -92,7 +91,6 @@ outputs:
         - cmake {{ cmake_version }}
       host:
         - cuda-version ={{ cuda_version }}
-        - libarrow {{ libarrow_version }}
       run:
         - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}
         {% if cuda_major == "11" %}
diff --git a/conda/recipes/pylibcudf/meta.yaml b/conda/recipes/pylibcudf/meta.yaml
index fef78467027..67b9b76bb8c 100644
--- a/conda/recipes/pylibcudf/meta.yaml
+++ b/conda/recipes/pylibcudf/meta.yaml
@@ -64,8 +64,6 @@ requirements:
     - rapids-build-backend >=0.3.0,<0.4.0.dev0
     - scikit-build-core >=0.10.0
     - dlpack >=0.8,<1.0
-    - numpy 2.0
-    - pyarrow ==16.1.0.*
     - libcudf ={{ version }}
     - rmm ={{ minor_version }}
     {% if cuda_major == "11" %}
@@ -81,7 +79,7 @@ requirements:
     - typing_extensions >=4.0.0
     - pandas >=2.0,<2.2.3dev0
     - numpy >=1.23,<3.0a0
-    - {{ pin_compatible('pyarrow', max_pin='x.x') }}
+    - pyarrow ==16.1.0.*
     - {{ pin_compatible('rmm', max_pin='x.x') }}
     - fsspec >=0.6.0
     {% if cuda_major == "11" %}
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 4080c5d02da..1040fcb7b91 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -54,11 +54,6 @@ mark_as_advanced(CUDF_BUILD_TESTUTIL)
 option(CUDF_USE_PROPRIETARY_NVCOMP "Download and use NVCOMP with proprietary extensions" ON)
 option(CUDF_LARGE_STRINGS_DISABLED "Build with large string support disabled" OFF)
 mark_as_advanced(CUDF_LARGE_STRINGS_DISABLED)
-option(CUDF_USE_ARROW_STATIC "Build and statically link Arrow libraries" OFF)
-option(CUDF_ENABLE_ARROW_ORC "Build the Arrow ORC adapter" OFF)
-option(CUDF_ENABLE_ARROW_PYTHON "Find (or build) Arrow with Python support" OFF)
-option(CUDF_ENABLE_ARROW_PARQUET "Find (or build) Arrow with Parquet support" OFF)
-option(CUDF_ENABLE_ARROW_S3 "Build/Enable AWS S3 Arrow filesystem support" OFF)
 option(
   CUDF_USE_PER_THREAD_DEFAULT_STREAM
   "Build cuDF with per-thread default stream, including passing the per-thread default
@@ -81,8 +76,6 @@ option(CUDA_ENABLE_LINEINFO
 option(CUDA_WARNINGS_AS_ERRORS "Enable -Werror=all-warnings for all CUDA compilation" ON)
 # cudart can be statically linked or dynamically linked. The python ecosystem wants dynamic linking
 option(CUDA_STATIC_RUNTIME "Statically link the CUDA runtime" OFF)
-option(USE_LIBARROW_FROM_PYARROW "Only use the libarrow contained in pyarrow" OFF)
-mark_as_advanced(USE_LIBARROW_FROM_PYARROW)
 
 set(DEFAULT_CUDF_BUILD_STREAMS_TEST_UTIL ON)
 if(CUDA_STATIC_RUNTIME OR NOT BUILD_SHARED_LIBS)
@@ -100,8 +93,6 @@ message(VERBOSE "CUDF: Configure CMake to build tests: ${BUILD_TESTS}")
 message(VERBOSE "CUDF: Configure CMake to build (google & nvbench) benchmarks: ${BUILD_BENCHMARKS}")
 message(VERBOSE "CUDF: Build cuDF shared libraries: ${BUILD_SHARED_LIBS}")
 message(VERBOSE "CUDF: Use a file cache for JIT compiled kernels: ${JITIFY_USE_CACHE}")
-message(VERBOSE "CUDF: Build and statically link Arrow libraries: ${CUDF_USE_ARROW_STATIC}")
-message(VERBOSE "CUDF: Build and enable S3 filesystem support for Arrow: ${CUDF_ENABLE_ARROW_S3}")
 message(VERBOSE "CUDF: Build with per-thread default stream: ${CUDF_USE_PER_THREAD_DEFAULT_STREAM}")
 message(
   VERBOSE
@@ -192,8 +183,6 @@ include(cmake/thirdparty/get_nvcomp.cmake)
 include(cmake/thirdparty/get_cccl.cmake)
 # find rmm
 include(cmake/thirdparty/get_rmm.cmake)
-# find arrow
-include(cmake/thirdparty/get_arrow.cmake)
 # find flatbuffers
 include(cmake/thirdparty/get_flatbuffers.cmake)
 # find dlpack
@@ -807,7 +796,7 @@ add_dependencies(cudf jitify_preprocess_run)
 # Specify the target module library dependencies
 target_link_libraries(
   cudf
-  PUBLIC ${ARROW_LIBRARIES} CCCL::CCCL rmm::rmm $<BUILD_LOCAL_INTERFACE:BS::thread_pool>
+  PUBLIC CCCL::CCCL rmm::rmm $<BUILD_LOCAL_INTERFACE:BS::thread_pool>
   PRIVATE $<BUILD_LOCAL_INTERFACE:nvtx3::nvtx3-cpp> cuco::cuco ZLIB::ZLIB nvcomp::nvcomp
           kvikio::kvikio $<TARGET_NAME_IF_EXISTS:cuFile_interface> nanoarrow
 )
@@ -1056,20 +1045,6 @@ following IMPORTED GLOBAL  targets:
     ]=]
 )
 
-if(CUDF_ENABLE_ARROW_PARQUET)
-  string(
-    APPEND
-    install_code_string
-    [=[
-  if(NOT Parquet_DIR)
-    set(Parquet_DIR "${Arrow_DIR}")
-  endif()
-  set(ArrowDataset_DIR "${Arrow_DIR}")
-  find_dependency(ArrowDataset)
-  ]=]
-  )
-endif()
-
 rapids_export(
   INSTALL cudf
   EXPORT_SET cudf-exports ${_components_export_string}
diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake
index e3e6a07661a..07cbf5150f4 100644
--- a/cpp/cmake/thirdparty/get_arrow.cmake
+++ b/cpp/cmake/thirdparty/get_arrow.cmake
@@ -22,82 +22,8 @@
 
 include_guard(GLOBAL)
 
-# Generate a FindArrow module for the case where we need to search for arrow within a pip install
-# pyarrow.
-function(find_libarrow_in_python_wheel PYARROW_VERSION)
-  string(REPLACE "." ";" PYARROW_VER_COMPONENTS "${PYARROW_VERSION}")
-  list(GET PYARROW_VER_COMPONENTS 0 PYARROW_MAJOR_VER)
-  list(GET PYARROW_VER_COMPONENTS 1 PYARROW_MINOR_VER)
-
-  # Ensure that the major and minor versions are two digits long
-  string(LENGTH ${PYARROW_MAJOR_VER} PYARROW_MAJOR_LENGTH)
-  string(LENGTH ${PYARROW_MINOR_VER} PYARROW_MINOR_LENGTH)
-  if(${PYARROW_MAJOR_LENGTH} EQUAL 1)
-    set(PYARROW_MAJOR_VER "0${PYARROW_MAJOR_VER}")
-  endif()
-  if(${PYARROW_MINOR_LENGTH} EQUAL 1)
-    set(PYARROW_MINOR_VER "0${PYARROW_MINOR_VER}")
-  endif()
-
-  set(PYARROW_LIB "libarrow.so.${PYARROW_MAJOR_VER}${PYARROW_MINOR_VER}")
-
-  string(
-    APPEND
-    initial_code_block
-    [=[
-find_package(Python 3.10 REQUIRED COMPONENTS Interpreter)
-execute_process(
-    COMMAND "${Python_EXECUTABLE}" -c "import pyarrow; print(pyarrow.get_library_dirs()[0])"
-    OUTPUT_VARIABLE CUDF_PYARROW_WHEEL_DIR
-    OUTPUT_STRIP_TRAILING_WHITESPACE
-    COMMAND_ERROR_IS_FATAL ANY
-)
-list(APPEND CMAKE_PREFIX_PATH "${CUDF_PYARROW_WHEEL_DIR}")
-]=]
-  )
-  string(
-    APPEND
-    final_code_block
-    [=[
-list(POP_BACK CMAKE_PREFIX_PATH)
-]=]
-  )
-  rapids_find_generate_module(
-    Arrow NO_CONFIG
-    VERSION "${PYARROW_VERSION}"
-    LIBRARY_NAMES "${PYARROW_LIB}"
-    BUILD_EXPORT_SET cudf-exports
-    INSTALL_EXPORT_SET cudf-exports
-    HEADER_NAMES arrow/python/arrow_to_pandas.h INITIAL_CODE_BLOCK initial_code_block
-                 FINAL_CODE_BLOCK final_code_block
-  )
-
-  find_package(Arrow ${PYARROW_VERSION} MODULE REQUIRED GLOBAL)
-  add_library(arrow_shared ALIAS Arrow::Arrow)
-
-  rapids_export_package(BUILD Arrow cudf-exports)
-  rapids_export_package(INSTALL Arrow cudf-exports)
-endfunction()
-
 # This function finds arrow and sets any additional necessary environment variables.
-function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENABLE_PYTHON
-         ENABLE_PARQUET PYARROW_LIBARROW
-)
-
-  if(PYARROW_LIBARROW)
-    # Generate a FindArrow.cmake to find pyarrow's libarrow.so
-    find_libarrow_in_python_wheel(${VERSION})
-    set(ARROW_FOUND
-        TRUE
-        PARENT_SCOPE
-    )
-    set(ARROW_LIBRARIES
-        arrow_shared
-        PARENT_SCOPE
-    )
-    return()
-  endif()
-
+function(find_and_configure_arrow VERSION BUILD_STATIC EXCLUDE_FROM_ALL ENABLE_PARQUET)
   if(BUILD_STATIC)
     if(TARGET arrow_static)
       set(ARROW_FOUND
@@ -124,10 +50,6 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB
     endif()
   endif()
 
-  if(NOT ARROW_ARMV8_ARCH)
-    set(ARROW_ARMV8_ARCH "armv8-a")
-  endif()
-
   if(NOT ARROW_SIMD_LEVEL)
     set(ARROW_SIMD_LEVEL "NONE")
   endif()
@@ -150,14 +72,6 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB
     set(ARROW_OPENSSL_USE_SHARED ON)
   endif()
 
-  set(ARROW_PYTHON_OPTIONS "")
-  if(ENABLE_PYTHON)
-    list(APPEND ARROW_PYTHON_OPTIONS "ARROW_PYTHON ON")
-    # Arrow's logic to build Boost from source is busted, so we have to get it from the system.
-    list(APPEND ARROW_PYTHON_OPTIONS "BOOST_SOURCE SYSTEM")
-    list(APPEND ARROW_PYTHON_OPTIONS "ARROW_DEPENDENCY_SOURCE AUTO")
-  endif()
-
   set(ARROW_PARQUET_OPTIONS "")
   if(ENABLE_PARQUET)
     # Arrow's logic to build Boost from source is busted, so we have to get it from the system.
@@ -174,6 +88,7 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB
     GIT_REPOSITORY https://github.com/apache/arrow.git
     GIT_TAG apache-arrow-${VERSION}
     GIT_SHALLOW TRUE SOURCE_SUBDIR cpp
+    EXCLUDE_FROM_ALL ${EXCLUDE_FROM_ALL}
     OPTIONS "CMAKE_VERBOSE_MAKEFILE ON"
             "ARROW_ACERO ON"
             "ARROW_IPC ON"
@@ -181,16 +96,14 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB
             "ARROW_WITH_BACKTRACE ON"
             "ARROW_CXXFLAGS -w"
             "ARROW_JEMALLOC OFF"
-            "ARROW_S3 ${ENABLE_S3}"
-            "ARROW_ORC ${ENABLE_ORC}"
-            # e.g. needed by blazingsql-io
+            "ARROW_S3 OFF"
+            "ARROW_ORC OFF"
             ${ARROW_PARQUET_OPTIONS}
             "ARROW_PARQUET ${ENABLE_PARQUET}"
             "ARROW_FILESYSTEM ON"
-            ${ARROW_PYTHON_OPTIONS}
+            "ARROW_PYTHON OFF"
             # Arrow modifies CMake's GLOBAL RULE_LAUNCH_COMPILE unless this is off
             "ARROW_USE_CCACHE OFF"
-            "ARROW_ARMV8_ARCH ${ARROW_ARMV8_ARCH}"
             "ARROW_SIMD_LEVEL ${ARROW_SIMD_LEVEL}"
             "ARROW_BUILD_STATIC ${ARROW_BUILD_STATIC}"
             "ARROW_BUILD_SHARED ${ARROW_BUILD_SHARED}"
@@ -269,7 +182,6 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB
   endif()
 
   if(Arrow_ADDED)
-
     set(arrow_code_string
         [=[
           if (TARGET cudf::arrow_shared AND (NOT TARGET arrow_shared))
@@ -324,101 +236,106 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB
         get_target_property(interface_libs arrow_static INTERFACE_LINK_LIBRARIES)
       endif()
     endif()
-    rapids_export(
-      BUILD Arrow
-      VERSION ${VERSION}
-      EXPORT_SET arrow_targets
-      GLOBAL_TARGETS arrow_shared arrow_static
-      NAMESPACE cudf::
-      FINAL_CODE_BLOCK arrow_code_string
-    )
-
-    if(ENABLE_PARQUET)
-
-      set(arrow_acero_code_string
-          [=[
-              if (TARGET cudf::arrow_acero_shared AND (NOT TARGET arrow_acero_shared))
-                  add_library(arrow_acero_shared ALIAS cudf::arrow_acero_shared)
-              endif()
-              if (TARGET cudf::arrow_acero_static AND (NOT TARGET arrow_acero_static))
-                  add_library(arrow_acero_static ALIAS cudf::arrow_acero_static)
-              endif()
-            ]=]
-      )
 
+    include(rapids-export)
+    if(NOT EXCLUDE_FROM_ALL)
       rapids_export(
-        BUILD ArrowAcero
+        BUILD Arrow
         VERSION ${VERSION}
-        EXPORT_SET arrow_acero_targets
-        GLOBAL_TARGETS arrow_acero_shared arrow_acero_static
+        EXPORT_SET arrow_targets
+        GLOBAL_TARGETS arrow_shared arrow_static
         NAMESPACE cudf::
-        FINAL_CODE_BLOCK arrow_acero_code_string
+        FINAL_CODE_BLOCK arrow_code_string
       )
 
-      set(arrow_dataset_code_string
-          [=[
-              if (TARGET cudf::arrow_dataset_shared AND (NOT TARGET arrow_dataset_shared))
-                  add_library(arrow_dataset_shared ALIAS cudf::arrow_dataset_shared)
-              endif()
-              if (TARGET cudf::arrow_dataset_static AND (NOT TARGET arrow_dataset_static))
-                  add_library(arrow_dataset_static ALIAS cudf::arrow_dataset_static)
-              endif()
-            ]=]
-      )
+      if(ENABLE_PARQUET)
+        set(arrow_acero_code_string
+            [=[
+                if (TARGET cudf::arrow_acero_shared AND (NOT TARGET arrow_acero_shared))
+                    add_library(arrow_acero_shared ALIAS cudf::arrow_acero_shared)
+                endif()
+                if (TARGET cudf::arrow_acero_static AND (NOT TARGET arrow_acero_static))
+                    add_library(arrow_acero_static ALIAS cudf::arrow_acero_static)
+                endif()
+              ]=]
+        )
 
-      rapids_export(
-        BUILD ArrowDataset
-        VERSION ${VERSION}
-        EXPORT_SET arrow_dataset_targets
-        GLOBAL_TARGETS arrow_dataset_shared arrow_dataset_static
-        NAMESPACE cudf::
-        FINAL_CODE_BLOCK arrow_dataset_code_string
-      )
+        rapids_export(
+          BUILD ArrowAcero
+          VERSION ${VERSION}
+          EXPORT_SET arrow_acero_targets
+          GLOBAL_TARGETS arrow_acero_shared arrow_acero_static
+          NAMESPACE cudf::
+          FINAL_CODE_BLOCK arrow_acero_code_string
+        )
 
-      set(parquet_code_string
-          [=[
-              if (TARGET cudf::parquet_shared AND (NOT TARGET parquet_shared))
-                  add_library(parquet_shared ALIAS cudf::parquet_shared)
-              endif()
-              if (TARGET cudf::parquet_static AND (NOT TARGET parquet_static))
-                  add_library(parquet_static ALIAS cudf::parquet_static)
-              endif()
-            ]=]
-      )
+        set(arrow_dataset_code_string
+            [=[
+                if (TARGET cudf::arrow_dataset_shared AND (NOT TARGET arrow_dataset_shared))
+                    add_library(arrow_dataset_shared ALIAS cudf::arrow_dataset_shared)
+                endif()
+                if (TARGET cudf::arrow_dataset_static AND (NOT TARGET arrow_dataset_static))
+                    add_library(arrow_dataset_static ALIAS cudf::arrow_dataset_static)
+                endif()
+              ]=]
+        )
 
-      rapids_export(
-        BUILD Parquet
-        VERSION ${VERSION}
-        EXPORT_SET parquet_targets
-        GLOBAL_TARGETS parquet_shared parquet_static
-        NAMESPACE cudf::
-        FINAL_CODE_BLOCK parquet_code_string
-      )
+        rapids_export(
+          BUILD ArrowDataset
+          VERSION ${VERSION}
+          EXPORT_SET arrow_dataset_targets
+          GLOBAL_TARGETS arrow_dataset_shared arrow_dataset_static
+          NAMESPACE cudf::
+          FINAL_CODE_BLOCK arrow_dataset_code_string
+        )
+        set(parquet_code_string
+            [=[
+                if (TARGET cudf::parquet_shared AND (NOT TARGET parquet_shared))
+                    add_library(parquet_shared ALIAS cudf::parquet_shared)
+                endif()
+                if (TARGET cudf::parquet_static AND (NOT TARGET parquet_static))
+                    add_library(parquet_static ALIAS cudf::parquet_static)
+                endif()
+              ]=]
+        )
+
+        rapids_export(
+          BUILD Parquet
+          VERSION ${VERSION}
+          EXPORT_SET parquet_targets
+          GLOBAL_TARGETS parquet_shared parquet_static
+          NAMESPACE cudf::
+          FINAL_CODE_BLOCK parquet_code_string
+        )
+      endif()
     endif()
   endif()
-  # We generate the arrow-configfiles when we built arrow locally, so always do `find_dependency`
-  rapids_export_package(BUILD Arrow cudf-exports)
-  rapids_export_package(INSTALL Arrow cudf-exports)
 
-  if(ENABLE_PARQUET)
-    rapids_export_package(BUILD Parquet cudf-exports)
-    rapids_export_package(BUILD ArrowDataset cudf-exports)
-  endif()
+  if(NOT EXCLUDE_FROM_ALL)
+    # We generate the arrow-configfiles when we built arrow locally, so always do `find_dependency`
+    rapids_export_package(BUILD Arrow cudf-exports)
+    rapids_export_package(INSTALL Arrow cudf-exports)
 
-  include("${rapids-cmake-dir}/export/find_package_root.cmake")
-  rapids_export_find_package_root(
-    BUILD Arrow [=[${CMAKE_CURRENT_LIST_DIR}]=] EXPORT_SET cudf-exports
-  )
-  rapids_export_find_package_root(
-    BUILD Parquet [=[${CMAKE_CURRENT_LIST_DIR}]=]
-    EXPORT_SET cudf-exports
-    CONDITION ENABLE_PARQUET
-  )
-  rapids_export_find_package_root(
-    BUILD ArrowDataset [=[${CMAKE_CURRENT_LIST_DIR}]=]
-    EXPORT_SET cudf-exports
-    CONDITION ENABLE_PARQUET
-  )
+    if(ENABLE_PARQUET)
+      rapids_export_package(BUILD Parquet cudf-exports)
+      rapids_export_package(BUILD ArrowDataset cudf-exports)
+    endif()
+
+    include("${rapids-cmake-dir}/export/find_package_root.cmake")
+    rapids_export_find_package_root(
+      BUILD Arrow [=[${CMAKE_CURRENT_LIST_DIR}]=] EXPORT_SET cudf-exports
+    )
+    rapids_export_find_package_root(
+      BUILD Parquet [=[${CMAKE_CURRENT_LIST_DIR}]=]
+      EXPORT_SET cudf-exports
+      CONDITION ENABLE_PARQUET
+    )
+    rapids_export_find_package_root(
+      BUILD ArrowDataset [=[${CMAKE_CURRENT_LIST_DIR}]=]
+      EXPORT_SET cudf-exports
+      CONDITION ENABLE_PARQUET
+    )
+  endif()
 
   set(ARROW_LIBRARIES
       "${ARROW_LIBRARIES}"
@@ -435,7 +352,21 @@ if(NOT DEFINED CUDF_VERSION_Arrow)
   )
 endif()
 
+# Default to static arrow builds
+if(NOT DEFINED CUDF_USE_ARROW_STATIC)
+  set(CUDF_USE_ARROW_STATIC ON)
+endif()
+
+# Default to excluding from installation since we generally privately and statically link Arrow.
+if(NOT DEFINED CUDF_EXCLUDE_ARROW_FROM_ALL)
+  set(CUDF_EXCLUDE_ARROW_FROM_ALL OFF)
+endif()
+
+if(NOT DEFINED CUDF_ENABLE_ARROW_PARQUET)
+  set(CUDF_ENABLE_ARROW_PARQUET OFF)
+endif()
+
 find_and_configure_arrow(
-  ${CUDF_VERSION_Arrow} ${CUDF_USE_ARROW_STATIC} ${CUDF_ENABLE_ARROW_S3} ${CUDF_ENABLE_ARROW_ORC}
-  ${CUDF_ENABLE_ARROW_PYTHON} ${CUDF_ENABLE_ARROW_PARQUET} ${USE_LIBARROW_FROM_PYARROW}
+  ${CUDF_VERSION_Arrow} ${CUDF_USE_ARROW_STATIC} ${CUDF_EXCLUDE_ARROW_FROM_ALL}
+  ${CUDF_ENABLE_ARROW_PARQUET}
 )
diff --git a/cpp/examples/interop/CMakeLists.txt b/cpp/examples/interop/CMakeLists.txt
index a1f99c1d2fd..2816f613d3d 100644
--- a/cpp/examples/interop/CMakeLists.txt
+++ b/cpp/examples/interop/CMakeLists.txt
@@ -15,6 +15,13 @@ project(
 
 include(../fetch_dependencies.cmake)
 
+# The Arrow CMake is currently broken if the build type is not set
+set(CMAKE_BUILD_TYPE Release)
+# No need to install Arrow libs when only the final example executable is shipped.
+set(CUDF_EXCLUDE_ARROW_FROM_ALL ON)
+include(../../cmake/thirdparty/get_arrow.cmake)
+
 add_executable(interop interop.cpp)
 target_link_libraries(interop PRIVATE cudf::cudf)
 target_compile_features(interop PRIVATE cxx_std_17)
+target_link_libraries(interop PRIVATE ${ARROW_LIBRARIES})
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index ac77a362e1c..f86acbcc51b 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -24,8 +24,8 @@ rapids_test_init()
 # properties and linking to build the test
 function(ConfigureTest CMAKE_TEST_NAME)
   set(options)
-  set(one_value GPUS PERCENT STREAM_MODE EXTRA_LIB)
-  set(multi_value)
+  set(one_value GPUS PERCENT STREAM_MODE)
+  set(multi_value EXTRA_LIBS)
   cmake_parse_arguments(_CUDF_TEST "${options}" "${one_value}" "${multi_value}" ${ARGN})
   if(NOT DEFINED _CUDF_TEST_GPUS AND NOT DEFINED _CUDF_TEST_PERCENT)
     set(_CUDF_TEST_GPUS 1)
@@ -57,7 +57,7 @@ function(ConfigureTest CMAKE_TEST_NAME)
   target_link_libraries(
     ${CMAKE_TEST_NAME}
     PRIVATE cudftestutil GTest::gmock GTest::gmock_main GTest::gtest GTest::gtest_main
-            nvtx3::nvtx3-cpp $<TARGET_NAME_IF_EXISTS:conda_env> "${_CUDF_TEST_EXTRA_LIB}"
+            nvtx3::nvtx3-cpp $<TARGET_NAME_IF_EXISTS:conda_env> "${_CUDF_TEST_EXTRA_LIBS}"
   )
   rapids_cuda_set_runtime(${CMAKE_TEST_NAME} USE_STATIC ${CUDA_STATIC_RUNTIME})
   rapids_test_add(
@@ -78,6 +78,14 @@ function(ConfigureTest CMAKE_TEST_NAME)
   endif()
 endfunction()
 
+# ##################################################################################################
+# dependencies  ###################################################################################
+# ##################################################################################################
+
+# No need to install Arrow libs when only the final test executables are shipped.
+set(CUDF_EXCLUDE_ARROW_FROM_ALL ON)
+include(../cmake/thirdparty/get_arrow.cmake)
+
 # ##################################################################################################
 # test sources ##################################################################################
 # ##################################################################################################
@@ -197,7 +205,7 @@ ConfigureTest(
   QUANTILES_TEST quantiles/percentile_approx_test.cpp quantiles/quantile_test.cpp
   quantiles/quantiles_test.cpp
   GPUS 1
-  PERCENT 70
+  PERCENT 70 EXTRA_LIBS ${ARROW_LIBRARIES}
 )
 
 # ##################################################################################################
@@ -276,8 +284,9 @@ ConfigureTest(
   interop/from_arrow_host_test.cpp
   interop/from_arrow_stream_test.cpp
   interop/dlpack_test.cpp
-  EXTRA_LIB
+  EXTRA_LIBS
   nanoarrow
+  ${ARROW_LIBRARIES}
 )
 
 # ##################################################################################################
@@ -288,7 +297,7 @@ ConfigureTest(ROW_SELECTION_TEST io/row_selection_test.cpp)
 ConfigureTest(
   CSV_TEST io/csv_test.cpp
   GPUS 1
-  PERCENT 30
+  PERCENT 30 EXTRA_LIBS ${ARROW_LIBRARIES}
 )
 ConfigureTest(
   FILE_IO_TEST io/file_io_test.cpp
@@ -316,7 +325,7 @@ ConfigureTest(
 ConfigureTest(
   JSON_TEST io/json/json_test.cpp io/json/json_chunked_reader.cu
   GPUS 1
-  PERCENT 30
+  PERCENT 30 EXTRA_LIBS ${ARROW_LIBRARIES}
 )
 ConfigureTest(JSON_WRITER_TEST io/json/json_writer.cpp)
 ConfigureTest(JSON_TYPE_CAST_TEST io/json/json_type_cast_test.cu)
diff --git a/dependencies.yaml b/dependencies.yaml
index 04b5940c9fb..b55860815bf 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -17,7 +17,6 @@ files:
       - depends_on_rmm
       - develop
       - docs
-      - libarrow_build
       - notebooks
       - py_version
       - rapids_build_skbuild
@@ -40,7 +39,6 @@ files:
     output: none
     includes:
       - cuda_version
-      - libarrow_run
       - test_cpp
   test_python:
     output: none
@@ -58,7 +56,6 @@ files:
       - build_all
       - cuda
       - cuda_version
-      - libarrow_run
       - test_java
   test_notebooks:
     output: none
@@ -77,7 +74,6 @@ files:
       - cuda
       - cuda_version
       - docs
-      - libarrow_run
       - py_version
   py_build_cudf:
     output: pyproject
@@ -137,7 +133,6 @@ files:
     includes:
       - build_base
       - build_cpp
-      - build_python_libcudf
       - depends_on_librmm
   py_run_libcudf:
     output: pyproject
@@ -389,38 +384,6 @@ dependencies:
       - output_types: [conda, requirements, pyproject]
         packages:
           - cython>=3.0.3
-          # Hard pin the patch version used during the build. This must be kept
-          # in sync with the version pinned in get_arrow.cmake.
-          - &pyarrow_build pyarrow==16.1.0.*
-      - output_types: pyproject
-        packages:
-          # Hard pin the version used during the build.
-          # Sync with conda build constraint & wheel run constraint.
-          - numpy==2.0.*
-  build_python_libcudf:
-    common:
-      - output_types: [conda, requirements, pyproject]
-        packages:
-          - *pyarrow_build
-  libarrow_build:
-    common:
-      - output_types: conda
-        packages:
-          # Hard pin the Arrow patch version used during the build. This must
-          # be kept in sync with the version pinned in get_arrow.cmake.
-          - libarrow-acero==16.1.0.*
-          - libarrow-dataset==16.1.0.*
-          - libarrow==16.1.0.*
-          - libparquet==16.1.0.*
-  libarrow_run:
-    common:
-      - output_types: conda
-        packages:
-          # Allow runtime version to float up to patch version
-          - libarrow-acero>=16.1.0,<16.2.0a0
-          - libarrow-dataset>=16.1.0,<16.2.0a0
-          - libarrow>=16.1.0,<16.2.0a0
-          - libparquet>=16.1.0,<16.2.0a0
   pyarrow_run:
     common:
       - output_types: [conda, requirements, pyproject]
@@ -600,7 +563,7 @@ dependencies:
       - output_types: [conda, requirements, pyproject]
         packages:
           - fsspec>=0.6.0
-          - numpy>=1.23,<3.0a0
+          - &numpy numpy>=1.23,<3.0a0
           - pandas>=2.0,<2.2.3dev0
   run_pylibcudf:
     common:
@@ -731,6 +694,7 @@ dependencies:
           - *cmake_ver
           - maven
           - openjdk=8.*
+          - boost
   test_python_common:
     common:
       - output_types: [conda, requirements, pyproject]
@@ -744,7 +708,7 @@ dependencies:
         packages:
           - fastavro>=0.22.9
           - hypothesis
-          - numpy
+          - *numpy
           - pandas
   test_python_cudf:
     common:
diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt
index 22059c5bc7f..c18a90140b6 100644
--- a/java/src/main/native/CMakeLists.txt
+++ b/java/src/main/native/CMakeLists.txt
@@ -212,6 +212,10 @@ target_compile_definitions(
 )
 target_link_options(cudfjni PRIVATE "-Wl,--no-undefined")
 
+set(CUDF_ENABLE_ARROW_PARQUET ON)
+include(../../../../cpp/cmake/thirdparty/get_arrow.cmake)
+target_link_libraries(cudfjni PRIVATE ${ARROW_LIBRARIES})
+
 if(USE_GDS)
   add_library(cufilejni src/CuFileJni.cpp)
   set_target_properties(
diff --git a/python/cudf/CMakeLists.txt b/python/cudf/CMakeLists.txt
index 72f20b30052..7193ada5b93 100644
--- a/python/cudf/CMakeLists.txt
+++ b/python/cudf/CMakeLists.txt
@@ -35,7 +35,6 @@ include(../../cpp/cmake/thirdparty/get_dlpack.cmake)
 include(rapids-cython-core)
 rapids_cython_init()
 
-include(../pylibcudf/cmake/Modules/LinkPyarrowHeaders.cmake)
 add_subdirectory(cudf/_lib)
 add_subdirectory(udf_cpp)
 
diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt
index 5ea378fc0e5..5d4b5421f16 100644
--- a/python/cudf/cudf/_lib/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/CMakeLists.txt
@@ -65,9 +65,6 @@ rapids_cython_create_modules(
 target_link_libraries(strings_udf PUBLIC cudf_strings_udf)
 target_include_directories(interop PUBLIC "$<BUILD_INTERFACE:${DLPACK_INCLUDE_DIR}>")
 
-set(targets_using_arrow_headers avro csv orc json parquet)
-link_to_pyarrow_headers("${targets_using_arrow_headers}")
-
 include(${rapids-cmake-dir}/export/find_package_root.cmake)
 include(../../../../cpp/cmake/thirdparty/get_nanoarrow.cmake)
 target_link_libraries(interop PUBLIC nanoarrow)
diff --git a/python/cudf/cudf/_lib/io/CMakeLists.txt b/python/cudf/cudf/_lib/io/CMakeLists.txt
index 620229a1275..e7408cf2852 100644
--- a/python/cudf/cudf/_lib/io/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/io/CMakeLists.txt
@@ -19,5 +19,3 @@ rapids_cython_create_modules(
   SOURCE_FILES "${cython_sources}"
   LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX io_ ASSOCIATED_TARGETS cudf
 )
-
-link_to_pyarrow_headers("${RAPIDS_CYTHON_CREATED_TARGETS}")
diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml
index a6d26d17d46..8386935fab0 100644
--- a/python/cudf/pyproject.toml
+++ b/python/cudf/pyproject.toml
@@ -129,8 +129,6 @@ requires = [
     "libcudf==24.10.*,>=0.0.0a0",
     "librmm==24.10.*,>=0.0.0a0",
     "ninja",
-    "numpy==2.0.*",
-    "pyarrow==16.1.0.*",
     "pylibcudf==24.10.*,>=0.0.0a0",
     "rmm==24.10.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
diff --git a/python/cudf_kafka/cudf_kafka/_lib/CMakeLists.txt b/python/cudf_kafka/cudf_kafka/_lib/CMakeLists.txt
index 1b205537d73..4490c41c7a9 100644
--- a/python/cudf_kafka/cudf_kafka/_lib/CMakeLists.txt
+++ b/python/cudf_kafka/cudf_kafka/_lib/CMakeLists.txt
@@ -20,5 +20,3 @@ rapids_cython_create_modules(
   SOURCE_FILES "${cython_sources}"
   LINKED_LIBRARIES "${linked_libraries}"
 )
-include(../../../pylibcudf/cmake/Modules/LinkPyarrowHeaders.cmake)
-link_to_pyarrow_headers("${RAPIDS_CYTHON_CREATED_TARGETS}")
diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml
index 01e7299a33a..6ca798bb11c 100644
--- a/python/cudf_kafka/pyproject.toml
+++ b/python/cudf_kafka/pyproject.toml
@@ -106,6 +106,4 @@ requires = [
     "cmake>=3.26.4,!=3.30.0",
     "cython>=3.0.3",
     "ninja",
-    "numpy==2.0.*",
-    "pyarrow==16.1.0.*",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
diff --git a/python/libcudf/CMakeLists.txt b/python/libcudf/CMakeLists.txt
index 09c7ed2e217..96eb6c3bb30 100644
--- a/python/libcudf/CMakeLists.txt
+++ b/python/libcudf/CMakeLists.txt
@@ -32,9 +32,6 @@ endif()
 
 unset(cudf_FOUND)
 
-# For wheels, this should always be true
-set(USE_LIBARROW_FROM_PYARROW ON)
-
 # Find Python early so that later commands can use it
 find_package(Python 3.10 REQUIRED COMPONENTS Interpreter)
 
@@ -46,13 +43,11 @@ set(CUDA_STATIC_RUNTIME ON)
 
 set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib)
 
-include(../pylibcudf/cmake/Modules/LinkPyarrowHeaders.cmake)
-
 add_subdirectory(../../cpp cudf-cpp)
 
 # Ensure other libraries needed by libcudf.so get installed alongside it.
 include(cmake/Modules/WheelHelpers.cmake)
 install_aliased_imported_targets(
-  TARGETS cudf arrow_shared nvcomp::nvcomp nvcomp::nvcomp_gdeflate nvcomp::nvcomp_bitcomp
-  DESTINATION ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}
+  TARGETS cudf nvcomp::nvcomp nvcomp::nvcomp_gdeflate nvcomp::nvcomp_bitcomp DESTINATION
+  ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}
 )
diff --git a/python/libcudf/libcudf/load.py b/python/libcudf/libcudf/load.py
index f6ba0d51bdb..ba134710868 100644
--- a/python/libcudf/libcudf/load.py
+++ b/python/libcudf/libcudf/load.py
@@ -18,10 +18,6 @@
 
 
 def load_library():
-    # This is loading the libarrow shared library in situations where it comes from the
-    # pyarrow package (i.e. when installed as a wheel).
-    import pyarrow  # noqa: F401
-
     # Dynamically load libcudf.so. Prefer a system library if one is present to
     # avoid clobbering symbols that other packages might expect, but if no
     # other library is present use the one in the wheel.
diff --git a/python/libcudf/pyproject.toml b/python/libcudf/pyproject.toml
index fd01f7f6e2f..43878d0aec2 100644
--- a/python/libcudf/pyproject.toml
+++ b/python/libcudf/pyproject.toml
@@ -71,5 +71,4 @@ requires = [
     "cmake>=3.26.4,!=3.30.0",
     "librmm==24.10.*,>=0.0.0a0",
     "ninja",
-    "pyarrow==16.1.0.*",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
diff --git a/python/pylibcudf/CMakeLists.txt b/python/pylibcudf/CMakeLists.txt
index 340ad120377..a4b831790fb 100644
--- a/python/pylibcudf/CMakeLists.txt
+++ b/python/pylibcudf/CMakeLists.txt
@@ -36,7 +36,6 @@ include(rapids-cython-core)
 
 rapids_cython_init()
 
-include(cmake/Modules/LinkPyarrowHeaders.cmake)
 add_subdirectory(pylibcudf)
 
 if(DEFINED cython_lib_dir)
diff --git a/python/pylibcudf/cmake/Modules/LinkPyarrowHeaders.cmake b/python/pylibcudf/cmake/Modules/LinkPyarrowHeaders.cmake
deleted file mode 100644
index d432f9fe1f5..00000000000
--- a/python/pylibcudf/cmake/Modules/LinkPyarrowHeaders.cmake
+++ /dev/null
@@ -1,40 +0,0 @@
-# =============================================================================
-# Copyright (c) 2023, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
-# in compliance with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License
-# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
-# or implied. See the License for the specific language governing permissions and limitations under
-# the License.
-# =============================================================================
-include_guard(GLOBAL)
-
-find_package(Python REQUIRED COMPONENTS Development NumPy)
-
-execute_process(
-  COMMAND "${Python_EXECUTABLE}" -c "import pyarrow; print(pyarrow.get_include())"
-  OUTPUT_VARIABLE PYARROW_INCLUDE_DIR
-  ERROR_VARIABLE PYARROW_ERROR
-  RESULT_VARIABLE PYARROW_RESULT
-  OUTPUT_STRIP_TRAILING_WHITESPACE
-)
-
-if(${PYARROW_RESULT})
-  message(FATAL_ERROR "Error while trying to obtain pyarrow include directory:\n${PYARROW_ERROR}")
-endif()
-
-# Due to cudf's scalar.pyx needing to cimport pylibcudf's scalar.pyx (because there are parts of
-# cudf Cython that need to directly access the c_obj underlying the pylibcudf Scalar) the
-# requirement for arrow headers infects all of cudf. These requirements will go away once all
-# scalar-related Cython code is removed from cudf.
-function(link_to_pyarrow_headers targets)
-  foreach(target IN LISTS targets)
-    # PyArrow headers require numpy headers.
-    target_include_directories(${target} PRIVATE "${Python_NumPy_INCLUDE_DIRS}")
-    target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}")
-  endforeach()
-endfunction()
diff --git a/python/pylibcudf/pylibcudf/io/CMakeLists.txt b/python/pylibcudf/pylibcudf/io/CMakeLists.txt
index 55bea4fc262..bcc2151f5b6 100644
--- a/python/pylibcudf/pylibcudf/io/CMakeLists.txt
+++ b/python/pylibcudf/pylibcudf/io/CMakeLists.txt
@@ -20,8 +20,3 @@ rapids_cython_create_modules(
   SOURCE_FILES "${cython_sources}"
   LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_io_ ASSOCIATED_TARGETS cudf
 )
-
-set(targets_using_arrow_headers pylibcudf_io_avro pylibcudf_io_csv pylibcudf_io_datasource
-                                pylibcudf_io_json pylibcudf_io_parquet pylibcudf_io_types
-)
-link_to_pyarrow_headers("${targets_using_arrow_headers}")
diff --git a/python/pylibcudf/pylibcudf/libcudf/io/CMakeLists.txt b/python/pylibcudf/pylibcudf/libcudf/io/CMakeLists.txt
index 6831063ecb9..9f5f74506e9 100644
--- a/python/pylibcudf/pylibcudf/libcudf/io/CMakeLists.txt
+++ b/python/pylibcudf/pylibcudf/libcudf/io/CMakeLists.txt
@@ -21,6 +21,3 @@ rapids_cython_create_modules(
   SOURCE_FILES "${cython_sources}"
   LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS cudf MODULE_PREFIX cpp_io_
 )
-
-set(targets_using_arrow_headers cpp_io_json cpp_io_types)
-link_to_pyarrow_headers("${targets_using_arrow_headers}")
diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml
index 0d673ea4cc3..e4c6edc6141 100644
--- a/python/pylibcudf/pyproject.toml
+++ b/python/pylibcudf/pyproject.toml
@@ -40,7 +40,7 @@ classifiers = [
 test = [
     "fastavro>=0.22.9",
     "hypothesis",
-    "numpy",
+    "numpy>=1.23,<3.0a0",
     "pandas",
     "pytest-cov",
     "pytest-xdist",
@@ -104,8 +104,6 @@ requires = [
     "libcudf==24.10.*,>=0.0.0a0",
     "librmm==24.10.*,>=0.0.0a0",
     "ninja",
-    "numpy==2.0.*",
-    "pyarrow==16.1.0.*",
     "rmm==24.10.*,>=0.0.0a0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 

From d0e5cdfc4df197bfb4846a243e3d9ea9d7b87aab Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 27 Aug 2024 12:17:51 -1000
Subject: [PATCH 733/842] Allow for binops between two differently sized
 DecimalDtypes (#16638)

Currently cudf Python has some custom logic for determining the resulting dtype of a binop between 2 decimal dtypes since Python decimal dtype support `precision` and libcudf doesn't. But libcudf does require that the 2 operands have the same decimal type when calculating the binop, so we must ensure the inputs are cast to the same, resulting dtype.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16638
---
 python/cudf/cudf/core/column/decimal.py | 17 ++++++++++++++---
 python/cudf/cudf/tests/test_decimal.py  | 10 ++++++++++
 2 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/python/cudf/cudf/core/column/decimal.py b/python/cudf/cudf/core/column/decimal.py
index 3b979ef2e97..8803ebd6791 100644
--- a/python/cudf/cudf/core/column/decimal.py
+++ b/python/cudf/cudf/core/column/decimal.py
@@ -135,9 +135,15 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str):
         # are computed outside of libcudf
         if op in {"__add__", "__sub__", "__mul__", "__div__"}:
             output_type = _get_decimal_type(lhs.dtype, rhs.dtype, op)
+            lhs = lhs.astype(
+                type(output_type)(lhs.dtype.precision, lhs.dtype.scale)
+            )
+            rhs = rhs.astype(
+                type(output_type)(rhs.dtype.precision, rhs.dtype.scale)
+            )
             result = libcudf.binaryop.binaryop(lhs, rhs, op, output_type)
-            # TODO:  Why is this necessary? Why isn't the result's
-            # precision already set correctly based on output_type?
+            # libcudf doesn't support precision, so result.dtype doesn't
+            # maintain output_type.precision
             result.dtype.precision = output_type.precision
         elif op in {
             "__eq__",
@@ -430,7 +436,11 @@ def _with_type_metadata(
         return self
 
 
-def _get_decimal_type(lhs_dtype, rhs_dtype, op):
+def _get_decimal_type(
+    lhs_dtype: DecimalDtype,
+    rhs_dtype: DecimalDtype,
+    op: str,
+) -> DecimalDtype:
     """
     Returns the resulting decimal type after calculating
     precision & scale when performing the binary operation
@@ -441,6 +451,7 @@ def _get_decimal_type(lhs_dtype, rhs_dtype, op):
 
     # This should at some point be hooked up to libcudf's
     # binary_operation_fixed_point_scale
+    # Note: libcudf decimal types don't have a concept of precision
 
     p1, p2 = lhs_dtype.precision, rhs_dtype.precision
     s1, s2 = lhs_dtype.scale, rhs_dtype.scale
diff --git a/python/cudf/cudf/tests/test_decimal.py b/python/cudf/cudf/tests/test_decimal.py
index b63788d20b7..048b3a656e3 100644
--- a/python/cudf/cudf/tests/test_decimal.py
+++ b/python/cudf/cudf/tests/test_decimal.py
@@ -398,3 +398,13 @@ def test_decimal_overflow():
     s = cudf.Series([1, 2], dtype=cudf.Decimal128Dtype(precision=38, scale=0))
     result = s * Decimal("1.0")
     assert_eq(cudf.Decimal128Dtype(precision=38, scale=1), result.dtype)
+
+
+def test_decimal_binop_upcast_operands():
+    ser1 = cudf.Series([0.51, 1.51, 2.51]).astype(cudf.Decimal64Dtype(18, 2))
+    ser2 = cudf.Series([0.90, 0.96, 0.99]).astype(cudf.Decimal128Dtype(19, 2))
+    result = ser1 + ser2
+    expected = cudf.Series([1.41, 2.47, 3.50]).astype(
+        cudf.Decimal128Dtype(20, 2)
+    )
+    assert_eq(result, expected)

From 88de8dd5bc0d2476a554107626d72ceb6d65cbab Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 27 Aug 2024 13:26:27 -1000
Subject: [PATCH 734/842] Fix interval_range right child non-zero offset
 (#16651)

xref https://github.com/rapidsai/cudf/issues/16507

Similar to what is done in `IntervalIndex.from_breaks`, `interval_index` generates the right edges by slicing a range of fencepost edges. However, we don't want to maintain the new `offset` (`1`) on the right edge after slicing as it adversely impacts subsequent indexing operations.

~~Additionally, I noticed that `Index(struct_data)` would automatically convert it to an `IntervalIndex`, but `IntervalIndex` has a strict requirement on the data have `left/right` keys, so making this raise a `NotImplementedError` instead~~
^ Will tackle this in a follow up, looks like there are cases where this is valid

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16651
---
 python/cudf/cudf/core/index.py                  | 12 +++++++++++-
 python/cudf/cudf/tests/indexes/test_interval.py |  6 ++++++
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 6a5e718c2c5..df8af856f4f 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -3250,7 +3250,7 @@ def interval_range(
     freq=None,
     name=None,
     closed="right",
-) -> "IntervalIndex":
+) -> IntervalIndex:
     """
     Returns a fixed frequency IntervalIndex.
 
@@ -3347,6 +3347,16 @@ def interval_range(
     )
     left_col = bin_edges.slice(0, len(bin_edges) - 1)
     right_col = bin_edges.slice(1, len(bin_edges))
+    # For indexing, children should both have 0 offset
+    right_col = type(right_col)(
+        data=right_col.data,
+        dtype=right_col.dtype,
+        size=right_col.size,
+        mask=right_col.mask,
+        offset=0,
+        null_count=right_col.null_count,
+        children=right_col.children,
+    )
 
     if len(right_col) == 0 or len(left_col) == 0:
         dtype = IntervalDtype("int64", closed)
diff --git a/python/cudf/cudf/tests/indexes/test_interval.py b/python/cudf/cudf/tests/indexes/test_interval.py
index a567c27f584..6653a94c9be 100644
--- a/python/cudf/cudf/tests/indexes/test_interval.py
+++ b/python/cudf/cudf/tests/indexes/test_interval.py
@@ -407,3 +407,9 @@ def test_interval_range_name():
     expected = pd.interval_range(start=0, periods=5, freq=2, name="foo")
     result = cudf.interval_range(start=0, periods=5, freq=2, name="foo")
     assert_eq(result, expected)
+
+
+def test_from_interval_range_indexing():
+    result = cudf.interval_range(start=0, end=1, name="a").repeat(2)
+    expected = pd.interval_range(start=0, end=1, name="a").repeat(2)
+    assert_eq(result, expected)

From e2a15cb1ba856616b7de08e2f1a5c06d6d7c4a35 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 27 Aug 2024 19:47:31 -0400
Subject: [PATCH 735/842] Fix strings::detail::copy_range when target contains
 nulls (#16626)

Fixes the logic in `cudf::strings::detail::copy_range` handling of nulls in the target range. The optimization check for nulls is removed simplifying the logic and making it more reliable as well. The benchmark showed no significant change in performance.
Also adds a specific gtest for this case.
Error was introduced in #15010
Closes #16618

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16626
---
 cpp/src/strings/copying/copy_range.cu  | 23 +++--------------------
 cpp/tests/copying/copy_range_tests.cpp | 10 ++++++++++
 2 files changed, 13 insertions(+), 20 deletions(-)

diff --git a/cpp/src/strings/copying/copy_range.cu b/cpp/src/strings/copying/copy_range.cu
index 9f8c47602f8..2434de1795e 100644
--- a/cpp/src/strings/copying/copy_range.cu
+++ b/cpp/src/strings/copying/copy_range.cu
@@ -40,20 +40,14 @@ struct compute_element_size {
   size_type source_begin;
   size_type target_begin;
   size_type target_end;
-  bool source_has_nulls;
-  bool target_has_nulls;
 
   __device__ cudf::size_type operator()(cudf::size_type idx)
   {
     if (idx >= target_begin && idx < target_end) {
       auto const str_idx = source_begin + (idx - target_begin);
-      return source_has_nulls && d_source.is_null_nocheck(str_idx)
-               ? 0
-               : d_source.element<string_view>(str_idx).size_bytes();
+      return d_source.is_null(str_idx) ? 0 : d_source.element<string_view>(str_idx).size_bytes();
     } else {
-      return target_has_nulls && d_target.is_null_nocheck(idx)
-               ? 0
-               : d_target.element<string_view>(idx).size_bytes();
+      return d_target.is_null(idx) ? 0 : d_target.element<string_view>(idx).size_bytes();
     }
   }
 };
@@ -97,20 +91,9 @@ std::unique_ptr<column> copy_range(strings_column_view const& source,
       mr);
   }();
 
-  auto [check_source, check_target] = [target, null_count = null_count] {
-    // check validities for both source & target
-    if (target.has_nulls()) { return std::make_pair(true, true); }
-    // check validities for source only
-    if (null_count > 0) { return std::make_pair(true, false); }
-    // no need to check validities
-    return std::make_pair(false, false);
-  }();
-
   // create offsets
   auto sizes_begin = cudf::detail::make_counting_transform_iterator(
-    0,
-    compute_element_size{
-      d_source, d_target, source_begin, target_begin, target_end, check_source, check_target});
+    0, compute_element_size{d_source, d_target, source_begin, target_begin, target_end});
   auto [offsets_column, chars_bytes] = cudf::strings::detail::make_offsets_child_column(
     sizes_begin, sizes_begin + target.size(), stream, mr);
   auto d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(offsets_column->view());
diff --git a/cpp/tests/copying/copy_range_tests.cpp b/cpp/tests/copying/copy_range_tests.cpp
index 223946ddcee..25d93da277b 100644
--- a/cpp/tests/copying/copy_range_tests.cpp
+++ b/cpp/tests/copying/copy_range_tests.cpp
@@ -232,6 +232,16 @@ TEST_F(CopyRangeTestFixture, CopyWithNullsString)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*p_ret, expected);
 }
 
+TEST_F(CopyRangeTestFixture, CopyWithTargetNullsString)
+{
+  auto target =
+    cudf::test::strings_column_wrapper({"a", "b", "", "d", "", "é"}, {1, 1, 0, 1, 1, 1});
+  auto source   = cudf::test::strings_column_wrapper({"A", "B", "C", "D", "E", "F"});
+  auto result   = cudf::copy_range(source, target, 1, 5, 1);
+  auto expected = cudf::test::strings_column_wrapper({"a", "B", "C", "D", "E", "é"});
+  CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(result->view(), expected);
+}
+
 TEST_F(CopyRangeTestFixture, CopyNoNullsString)
 {
   cudf::size_type size{100};

From d1412e00092d752e4e34371042d7dbfe972ba5d7 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 27 Aug 2024 19:52:57 -0400
Subject: [PATCH 736/842] Rework strings::slice benchmark to use nvbench
 (#16563)

Moves google-benchmark  for `cudf::strings::slice_strings` to nvbench.
This is to help measure performance improvements in follow on work for strings-slice.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16563
---
 cpp/benchmarks/CMakeLists.txt   |  2 +-
 cpp/benchmarks/string/slice.cpp | 89 ++++++++++++++++-----------------
 2 files changed, 44 insertions(+), 47 deletions(-)

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 6db282a7728..7f3edfa0a01 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -325,7 +325,6 @@ ConfigureBench(
   string/filter.cpp
   string/repeat_strings.cpp
   string/replace.cpp
-  string/slice.cpp
   string/translate.cpp
   string/url_decode.cu
 )
@@ -346,6 +345,7 @@ ConfigureNVBench(
   string/like.cpp
   string/replace_re.cpp
   string/reverse.cpp
+  string/slice.cpp
   string/split.cpp
   string/split_re.cpp
 )
diff --git a/cpp/benchmarks/string/slice.cpp b/cpp/benchmarks/string/slice.cpp
index 0f973a7c8b5..1898f0340b6 100644
--- a/cpp/benchmarks/string/slice.cpp
+++ b/cpp/benchmarks/string/slice.cpp
@@ -14,11 +14,8 @@
  * limitations under the License.
  */
 
-#include "string_bench_args.hpp"
-
 #include <benchmarks/common/generate_input.hpp>
-#include <benchmarks/fixture/benchmark_fixture.hpp>
-#include <benchmarks/synchronization/synchronization.hpp>
+#include <benchmarks/common/nvbench_utilities.hpp>
 
 #include <cudf_test/column_wrapper.hpp>
 
@@ -29,56 +26,56 @@
 
 #include <thrust/iterator/constant_iterator.h>
 
+#include <nvbench/nvbench.cuh>
+
 #include <limits>
 
-class StringSlice : public cudf::benchmark {};
+static void bench_slice(nvbench::state& state)
+{
+  auto const num_rows  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
+  auto const stype     = state.get_string("type");
 
-enum slice_type { position, multi_position };
+  if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
+      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
+    state.skip("Skip benchmarks greater than size_type limit");
+  }
 
-static void BM_slice(benchmark::State& state, slice_type rt)
-{
-  cudf::size_type const n_rows{static_cast<cudf::size_type>(state.range(0))};
-  cudf::size_type const max_str_length{static_cast<cudf::size_type>(state.range(1))};
   data_profile const profile = data_profile_builder().distribution(
-    cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length);
-  auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile);
+    cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
+  auto const column = create_random_column(cudf::type_id::STRING, row_count{num_rows}, profile);
   cudf::strings_column_view input(column->view());
-  auto starts_itr = thrust::constant_iterator<cudf::size_type>(max_str_length / 3);
-  auto stops_itr  = thrust::constant_iterator<cudf::size_type>(max_str_length / 2);
-  cudf::test::fixed_width_column_wrapper<int32_t> starts(starts_itr, starts_itr + n_rows);
-  cudf::test::fixed_width_column_wrapper<int32_t> stops(stops_itr, stops_itr + n_rows);
+  auto starts_itr = thrust::constant_iterator<cudf::size_type>(row_width / 4);
+  auto starts =
+    cudf::test::fixed_width_column_wrapper<cudf::size_type>(starts_itr, starts_itr + num_rows);
+  auto stops_itr = thrust::constant_iterator<cudf::size_type>(row_width / 3);
+  auto stops =
+    cudf::test::fixed_width_column_wrapper<cudf::size_type>(stops_itr, stops_itr + num_rows);
 
-  for (auto _ : state) {
-    cuda_event_timer raii(state, true, cudf::get_default_stream());
-    switch (rt) {
-      case position:
-        cudf::strings::slice_strings(input, max_str_length / 3, max_str_length / 2);
-        break;
-      case multi_position: cudf::strings::slice_strings(input, starts, stops); break;
-    }
+  auto stream = cudf::get_default_stream();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
+  // gather some throughput statistics as well
+  auto chars_size = input.chars_size(stream);
+  state.add_element_count(chars_size, "chars_size");           // number of bytes
+  state.add_global_memory_reads<nvbench::int8_t>(chars_size);  // all bytes are read
+  auto output_size = (row_width / 3 - row_width / 4) * num_rows;
+  state.add_global_memory_writes<nvbench::int8_t>(output_size);
+
+  if (stype == "multi") {
+    state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+      cudf::strings::slice_strings(input, starts, stops, stream);
+    });
+  } else {
+    state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+      cudf::strings::slice_strings(input, row_width / 4, row_width / 3, 1, stream);
+    });
   }
 
-  state.SetBytesProcessed(state.iterations() * input.chars_size(cudf::get_default_stream()));
+  set_throughputs(state);
 }
 
-static void generate_bench_args(benchmark::internal::Benchmark* b)
-{
-  int const min_rows   = 1 << 12;
-  int const max_rows   = 1 << 24;
-  int const row_mult   = 8;
-  int const min_rowlen = 1 << 5;
-  int const max_rowlen = 1 << 13;
-  int const len_mult   = 2;
-  generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult);
-}
-
-#define STRINGS_BENCHMARK_DEFINE(name)                          \
-  BENCHMARK_DEFINE_F(StringSlice, name)                         \
-  (::benchmark::State & st) { BM_slice(st, slice_type::name); } \
-  BENCHMARK_REGISTER_F(StringSlice, name)                       \
-    ->Apply(generate_bench_args)                                \
-    ->UseManualTime()                                           \
-    ->Unit(benchmark::kMillisecond);
-
-STRINGS_BENCHMARK_DEFINE(position)
-STRINGS_BENCHMARK_DEFINE(multi_position)
+NVBENCH_BENCH(bench_slice)
+  .set_name("slice")
+  .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048})
+  .add_int64_axis("num_rows", {262144, 2097152, 16777216})
+  .add_string_axis("type", {"position", "multi"});

From 60f30d831325d5816e6968e8037796b8ce1dc579 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Tue, 27 Aug 2024 17:45:33 -0700
Subject: [PATCH 737/842] Use `make_host_vector` instead of `make_std_vector`
 to facilitate pinned memory optimizations (#16386)

Replaced most of `make_std_vector` calls with `make_host_vector` to allow pinned memory and kernel copies, when enabled.
Skipped places where the change would impact the public API.

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Shruti Shivakumar (https://github.com/shrshi)

URL: https://github.com/rapidsai/cudf/pull/16386
---
 cpp/include/cudf/detail/gather.cuh         |  2 +-
 cpp/src/io/csv/csv_gpu.cu                  |  4 +--
 cpp/src/io/csv/csv_gpu.hpp                 |  2 +-
 cpp/src/io/csv/reader_impl.cu              |  2 +-
 cpp/src/io/json/json_column.cu             | 42 +++++++++++-----------
 cpp/src/io/orc/writer_impl.cu              |  8 ++---
 cpp/src/io/orc/writer_impl.hpp             |  5 +--
 cpp/src/io/parquet/predicate_pushdown.cpp  |  2 +-
 cpp/src/io/parquet/reader_impl_chunking.cu | 20 +++++------
 cpp/src/io/parquet/writer_impl.cu          | 22 ++++++------
 cpp/src/io/utilities/datasource.cpp        |  4 +--
 cpp/src/text/jaccard.cu                    |  2 +-
 12 files changed, 57 insertions(+), 58 deletions(-)

diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh
index 41f5494f78f..df6fe6e6ccb 100644
--- a/cpp/include/cudf/detail/gather.cuh
+++ b/cpp/include/cudf/detail/gather.cuh
@@ -609,7 +609,7 @@ void gather_bitmask(table_view const& source,
        stream);
 
   // Copy the valid counts into each column
-  auto const valid_counts = make_std_vector_sync(d_valid_counts, stream);
+  auto const valid_counts = make_host_vector_sync(d_valid_counts, stream);
   for (size_t i = 0; i < target.size(); ++i) {
     if (target[i]->nullable()) {
       auto const null_count = target_rows - valid_counts[i];
diff --git a/cpp/src/io/csv/csv_gpu.cu b/cpp/src/io/csv/csv_gpu.cu
index 7a05d0aebaf..5a0c6decfda 100644
--- a/cpp/src/io/csv/csv_gpu.cu
+++ b/cpp/src/io/csv/csv_gpu.cu
@@ -794,7 +794,7 @@ device_span<uint64_t> __host__ remove_blank_rows(cudf::io::parse_options_view co
   return row_offsets.subspan(0, new_end - row_offsets.begin());
 }
 
-std::vector<column_type_histogram> detect_column_types(
+cudf::detail::host_vector<column_type_histogram> detect_column_types(
   cudf::io::parse_options_view const& options,
   device_span<char const> const data,
   device_span<column_parse::flags const> const column_flags,
@@ -812,7 +812,7 @@ std::vector<column_type_histogram> detect_column_types(
   data_type_detection<<<grid_size, block_size, 0, stream.value()>>>(
     options, data, column_flags, row_starts, d_stats);
 
-  return detail::make_std_vector_sync(d_stats, stream);
+  return detail::make_host_vector_sync(d_stats, stream);
 }
 
 void decode_row_column_data(cudf::io::parse_options_view const& options,
diff --git a/cpp/src/io/csv/csv_gpu.hpp b/cpp/src/io/csv/csv_gpu.hpp
index 06c60319371..aa3d9f6c7b7 100644
--- a/cpp/src/io/csv/csv_gpu.hpp
+++ b/cpp/src/io/csv/csv_gpu.hpp
@@ -199,7 +199,7 @@ device_span<uint64_t> remove_blank_rows(cudf::io::parse_options_view const& opti
  *
  * @return stats Histogram of each dtypes' occurrence for each column
  */
-std::vector<column_type_histogram> detect_column_types(
+cudf::detail::host_vector<column_type_histogram> detect_column_types(
   cudf::io::parse_options_view const& options,
   device_span<char const> data,
   device_span<column_parse::flags const> column_flags,
diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu
index 40d4372ae9d..e27b06682bb 100644
--- a/cpp/src/io/csv/reader_impl.cu
+++ b/cpp/src/io/csv/reader_impl.cu
@@ -614,7 +614,7 @@ std::vector<column_buffer> decode_data(parse_options const& parse_opts,
     d_valid_counts,
     stream);
 
-  auto const h_valid_counts = cudf::detail::make_std_vector_sync(d_valid_counts, stream);
+  auto const h_valid_counts = cudf::detail::make_host_vector_sync(d_valid_counts, stream);
   for (int i = 0; i < num_active_columns; ++i) {
     out_buffers[i].null_count() = num_records - h_valid_counts[i];
   }
diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu
index e5e21e054a6..8d6890045be 100644
--- a/cpp/src/io/json/json_column.cu
+++ b/cpp/src/io/json/json_column.cu
@@ -77,16 +77,16 @@ void print_tree(host_span<SymbolT const> input,
                 tree_meta_t const& d_gpu_tree,
                 rmm::cuda_stream_view stream)
 {
-  print_vec(cudf::detail::make_std_vector_sync(d_gpu_tree.node_categories, stream),
+  print_vec(cudf::detail::make_host_vector_sync(d_gpu_tree.node_categories, stream),
             "node_categories",
             to_cat);
-  print_vec(cudf::detail::make_std_vector_sync(d_gpu_tree.parent_node_ids, stream),
+  print_vec(cudf::detail::make_host_vector_sync(d_gpu_tree.parent_node_ids, stream),
             "parent_node_ids",
             to_int);
   print_vec(
-    cudf::detail::make_std_vector_sync(d_gpu_tree.node_levels, stream), "node_levels", to_int);
-  auto node_range_begin = cudf::detail::make_std_vector_sync(d_gpu_tree.node_range_begin, stream);
-  auto node_range_end   = cudf::detail::make_std_vector_sync(d_gpu_tree.node_range_end, stream);
+    cudf::detail::make_host_vector_sync(d_gpu_tree.node_levels, stream), "node_levels", to_int);
+  auto node_range_begin = cudf::detail::make_host_vector_sync(d_gpu_tree.node_range_begin, stream);
+  auto node_range_end   = cudf::detail::make_host_vector_sync(d_gpu_tree.node_range_end, stream);
   print_vec(node_range_begin, "node_range_begin", to_int);
   print_vec(node_range_end, "node_range_end", to_int);
   for (int i = 0; i < int(node_range_begin.size()); i++) {
@@ -373,9 +373,9 @@ std::vector<std::string> copy_strings_to_host_sync(
   auto to_host        = [stream](auto const& col) {
     if (col.is_empty()) return std::vector<std::string>{};
     auto const scv     = cudf::strings_column_view(col);
-    auto const h_chars = cudf::detail::make_std_vector_async<char>(
+    auto const h_chars = cudf::detail::make_host_vector_async<char>(
       cudf::device_span<char const>(scv.chars_begin(stream), scv.chars_size(stream)), stream);
-    auto const h_offsets = cudf::detail::make_std_vector_async(
+    auto const h_offsets = cudf::detail::make_host_vector_async(
       cudf::device_span<cudf::size_type const>(scv.offsets().data<cudf::size_type>() + scv.offset(),
                                                scv.size() + 1),
       stream);
@@ -523,25 +523,23 @@ void make_device_json_column(device_span<SymbolT const> input,
                           row_array_parent_col_id,
                           stream);
   auto num_columns    = d_unique_col_ids.size();
-  auto unique_col_ids = cudf::detail::make_std_vector_async(d_unique_col_ids, stream);
+  auto unique_col_ids = cudf::detail::make_host_vector_async(d_unique_col_ids, stream);
   auto column_categories =
-    cudf::detail::make_std_vector_async(d_column_tree.node_categories, stream);
-  auto column_parent_ids =
-    cudf::detail::make_std_vector_async(d_column_tree.parent_node_ids, stream);
+    cudf::detail::make_host_vector_async(d_column_tree.node_categories, stream);
+  auto const column_parent_ids =
+    cudf::detail::make_host_vector_async(d_column_tree.parent_node_ids, stream);
   auto column_range_beg =
-    cudf::detail::make_std_vector_async(d_column_tree.node_range_begin, stream);
-  auto max_row_offsets = cudf::detail::make_std_vector_async(d_max_row_offsets, stream);
+    cudf::detail::make_host_vector_async(d_column_tree.node_range_begin, stream);
+  auto const max_row_offsets = cudf::detail::make_host_vector_async(d_max_row_offsets, stream);
   std::vector<std::string> column_names = copy_strings_to_host_sync(
     input, d_column_tree.node_range_begin, d_column_tree.node_range_end, stream);
-  stream.synchronize();
   // array of arrays column names
   if (is_array_of_arrays) {
     TreeDepthT const row_array_children_level = is_enabled_lines ? 1 : 2;
     auto values_column_indices =
       get_values_column_indices(row_array_children_level, tree, col_ids, num_columns, stream);
     auto h_values_column_indices =
-      cudf::detail::make_std_vector_async(values_column_indices, stream);
-    stream.synchronize();
+      cudf::detail::make_host_vector_sync(values_column_indices, stream);
     std::transform(unique_col_ids.begin(),
                    unique_col_ids.end(),
                    column_names.begin(),
@@ -611,11 +609,13 @@ void make_device_json_column(device_span<SymbolT const> input,
     return thrust::get<0>(a) < thrust::get<0>(b);
   });
 
-  std::vector<uint8_t> is_str_column_all_nulls{};
-  if (is_enabled_mixed_types_as_string) {
-    is_str_column_all_nulls = cudf::detail::make_std_vector_sync(
-      is_all_nulls_each_column(input, d_column_tree, tree, col_ids, options, stream), stream);
-  }
+  auto const is_str_column_all_nulls = [&, &column_tree = d_column_tree]() {
+    if (is_enabled_mixed_types_as_string) {
+      return cudf::detail::make_host_vector_sync(
+        is_all_nulls_each_column(input, column_tree, tree, col_ids, options, stream), stream);
+    }
+    return cudf::detail::make_empty_host_vector<uint8_t>(0, stream);
+  }();
 
   // use hash map because we may skip field name's col_ids
   std::unordered_map<NodeIndexT, std::reference_wrapper<device_json_column>> columns;
diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu
index 04eee68e757..ede9fd060b8 100644
--- a/cpp/src/io/orc/writer_impl.cu
+++ b/cpp/src/io/orc/writer_impl.cu
@@ -1978,7 +1978,7 @@ encoder_decimal_info decimal_chunk_sizes(orc_table_view& orc_table,
 
   // Gather the row group sizes and copy to host
   auto d_tmp_rowgroup_sizes = rmm::device_uvector<uint32_t>(segmentation.num_rowgroups(), stream);
-  std::map<uint32_t, std::vector<uint32_t>> rg_sizes;
+  std::map<uint32_t, cudf::detail::host_vector<uint32_t>> rg_sizes;
   for (auto const& [col_idx, esizes] : elem_sizes) {
     // Copy last elem in each row group - equal to row group size
     thrust::tabulate(rmm::exec_policy(stream),
@@ -1991,14 +1991,14 @@ encoder_decimal_info decimal_chunk_sizes(orc_table_view& orc_table,
                        return src[rg_bounds[idx][col_idx].end - 1];
                      });
 
-    rg_sizes[col_idx] = cudf::detail::make_std_vector_async(d_tmp_rowgroup_sizes, stream);
+    rg_sizes.emplace(col_idx, cudf::detail::make_host_vector_async(d_tmp_rowgroup_sizes, stream));
   }
 
   return {std::move(elem_sizes), std::move(rg_sizes)};
 }
 
 std::map<uint32_t, size_t> decimal_column_sizes(
-  std::map<uint32_t, std::vector<uint32_t>> const& chunk_sizes)
+  std::map<uint32_t, cudf::detail::host_vector<uint32_t>> const& chunk_sizes)
 {
   std::map<uint32_t, size_t> column_sizes;
   std::transform(chunk_sizes.cbegin(),
@@ -2056,7 +2056,7 @@ auto set_rowgroup_char_counts(orc_table_view& orc_table,
                             orc_table.d_string_column_indices,
                             stream);
 
-  auto const h_counts = cudf::detail::make_std_vector_sync(counts, stream);
+  auto const h_counts = cudf::detail::make_host_vector_sync(counts, stream);
 
   for (auto col_idx : orc_table.string_column_indices) {
     auto& str_column = orc_table.column(col_idx);
diff --git a/cpp/src/io/orc/writer_impl.hpp b/cpp/src/io/orc/writer_impl.hpp
index f5f8b3cfed9..cae849ee315 100644
--- a/cpp/src/io/orc/writer_impl.hpp
+++ b/cpp/src/io/orc/writer_impl.hpp
@@ -90,8 +90,9 @@ struct stripe_rowgroups {
  */
 struct encoder_decimal_info {
   std::map<uint32_t, rmm::device_uvector<uint32_t>>
-    elem_sizes;                                        ///< Column index -> per-element size map
-  std::map<uint32_t, std::vector<uint32_t>> rg_sizes;  ///< Column index -> per-rowgroup size map
+    elem_sizes;  ///< Column index -> per-element size map
+  std::map<uint32_t, cudf::detail::host_vector<uint32_t>>
+    rg_sizes;  ///< Column index -> per-rowgroup size map
 };
 
 /**
diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp
index 5ca090b05b3..c8b8b7a1193 100644
--- a/cpp/src/io/parquet/predicate_pushdown.cpp
+++ b/cpp/src/io/parquet/predicate_pushdown.cpp
@@ -468,7 +468,7 @@ std::optional<std::vector<std::vector<size_type>>> aggregate_reader_metadata::fi
   auto validity_it = cudf::detail::make_counting_transform_iterator(
     0, [bitmask = host_bitmask.data()](auto bit_index) { return bit_is_set(bitmask, bit_index); });
 
-  auto is_row_group_required = cudf::detail::make_std_vector_sync(
+  auto const is_row_group_required = cudf::detail::make_host_vector_sync(
     device_span<uint8_t const>(predicate.data<uint8_t>(), predicate.size()), stream);
 
   // Return only filtered row groups based on predicate
diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu
index 54ba898b058..00d62c45962 100644
--- a/cpp/src/io/parquet/reader_impl_chunking.cu
+++ b/cpp/src/io/parquet/reader_impl_chunking.cu
@@ -77,9 +77,9 @@ void print_cumulative_page_info(device_span<PageInfo const> d_pages,
                                 device_span<cumulative_page_info const> d_c_info,
                                 rmm::cuda_stream_view stream)
 {
-  std::vector<PageInfo> pages              = cudf::detail::make_std_vector_sync(d_pages, stream);
-  std::vector<ColumnChunkDesc> chunks      = cudf::detail::make_std_vector_sync(d_chunks, stream);
-  std::vector<cumulative_page_info> c_info = cudf::detail::make_std_vector_sync(d_c_info, stream);
+  auto const pages  = cudf::detail::make_host_vector_sync(d_pages, stream);
+  auto const chunks = cudf::detail::make_host_vector_sync(d_chunks, stream);
+  auto const c_info = cudf::detail::make_host_vector_sync(d_c_info, stream);
 
   printf("------------\nCumulative sizes by page\n");
 
@@ -647,7 +647,7 @@ std::tuple<rmm::device_uvector<page_span>, size_t, size_t> compute_next_subpass(
   auto [aggregated_info, page_keys_by_split] = adjust_cumulative_sizes(c_info, pages, stream);
 
   // bring back to the cpu
-  auto const h_aggregated_info = cudf::detail::make_std_vector_sync(aggregated_info, stream);
+  auto const h_aggregated_info = cudf::detail::make_host_vector_sync(aggregated_info, stream);
   // print_cumulative_row_info(h_aggregated_info, "adjusted");
 
   // TODO: if the user has explicitly specified skip_rows/num_rows we could be more intelligent
@@ -694,8 +694,7 @@ std::vector<row_range> compute_page_splits_by_row(device_span<cumulative_page_in
   auto [aggregated_info, page_keys_by_split] = adjust_cumulative_sizes(c_info, pages, stream);
 
   // bring back to the cpu
-  std::vector<cumulative_page_info> h_aggregated_info =
-    cudf::detail::make_std_vector_sync(aggregated_info, stream);
+  auto const h_aggregated_info = cudf::detail::make_host_vector_sync(aggregated_info, stream);
   // print_cumulative_row_info(h_aggregated_info, "adjusted");
 
   std::vector<row_range> splits;
@@ -1304,9 +1303,8 @@ void reader::impl::setup_next_pass(read_mode mode)
     printf("\tskip_rows: %'lu\n", pass.skip_rows);
     printf("\tnum_rows: %'lu\n", pass.num_rows);
     printf("\tbase mem usage: %'lu\n", pass.base_mem_size);
-    auto const num_columns = _input_columns.size();
-    std::vector<size_type> h_page_offsets =
-      cudf::detail::make_std_vector_sync(pass.page_offsets, _stream);
+    auto const num_columns    = _input_columns.size();
+    auto const h_page_offsets = cudf::detail::make_host_vector_sync(pass.page_offsets, _stream);
     for (size_t c_idx = 0; c_idx < num_columns; c_idx++) {
       printf("\t\tColumn %'lu: num_pages(%'d)\n",
              c_idx,
@@ -1426,7 +1424,7 @@ void reader::impl::setup_next_subpass(read_mode mode)
     subpass.pages = subpass.page_buf;
   }
 
-  std::vector<page_span> h_spans = cudf::detail::make_std_vector_async(page_indices, _stream);
+  auto const h_spans = cudf::detail::make_host_vector_async(page_indices, _stream);
   subpass.pages.device_to_host_async(_stream);
 
   _stream.synchronize();
@@ -1464,7 +1462,7 @@ void reader::impl::setup_next_subpass(read_mode mode)
   printf("\t\tTotal expected usage: %'lu\n",
          total_expected_size == 0 ? subpass.decomp_page_data.size() + pass.base_mem_size
                                   : total_expected_size + pass.base_mem_size);
-  std::vector<page_span> h_page_indices = cudf::detail::make_std_vector_sync(page_indices, _stream);
+  auto const h_page_indices = cudf::detail::make_host_vector_sync(page_indices, _stream);
   for (size_t c_idx = 0; c_idx < num_columns; c_idx++) {
     printf("\t\tColumn %'lu: pages(%'lu - %'lu)\n",
            c_idx,
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index c2c5dbb4a56..74992aa733f 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -2230,20 +2230,20 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
     bool need_sync{false};
 
     // need to fetch the histogram data from the device
-    std::vector<uint32_t> h_def_histogram;
-    std::vector<uint32_t> h_rep_histogram;
-    if (stats_granularity == statistics_freq::STATISTICS_COLUMN) {
-      if (def_histogram_bfr_size > 0) {
-        h_def_histogram =
-          std::move(cudf::detail::make_std_vector_async(def_level_histogram, stream));
+    auto const h_def_histogram = [&]() {
+      if (stats_granularity == statistics_freq::STATISTICS_COLUMN && def_histogram_bfr_size > 0) {
         need_sync = true;
+        return cudf::detail::make_host_vector_async(def_level_histogram, stream);
       }
-      if (rep_histogram_bfr_size > 0) {
-        h_rep_histogram =
-          std::move(cudf::detail::make_std_vector_async(rep_level_histogram, stream));
+      return cudf::detail::make_host_vector<uint32_t>(0, stream);
+    }();
+    auto const h_rep_histogram = [&]() {
+      if (stats_granularity == statistics_freq::STATISTICS_COLUMN && rep_histogram_bfr_size > 0) {
         need_sync = true;
+        return cudf::detail::make_host_vector_async(rep_level_histogram, stream);
       }
-    }
+      return cudf::detail::make_host_vector<uint32_t>(0, stream);
+    }();
 
     for (int r = 0; r < num_rowgroups; r++) {
       int p           = rg_to_part[r];
@@ -2265,7 +2265,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
         update_chunk_encoding_stats(column_chunk_meta, ck, write_v2_headers);
 
         if (ck.ck_stat_size != 0) {
-          std::vector<uint8_t> const stats_blob = cudf::detail::make_std_vector_sync(
+          auto const stats_blob = cudf::detail::make_host_vector_sync(
             device_span<uint8_t const>(dev_bfr, ck.ck_stat_size), stream);
           CompactProtocolReader cp(stats_blob.data(), stats_blob.size());
           cp.read(&column_chunk_meta.statistics);
diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp
index 91be154e09d..e4313eba454 100644
--- a/cpp/src/io/utilities/datasource.cpp
+++ b/cpp/src/io/utilities/datasource.cpp
@@ -297,10 +297,10 @@ class device_buffer_source final : public datasource {
   {
     auto const count  = std::min(size, this->size() - offset);
     auto const stream = cudf::get_default_stream();
-    auto h_data       = cudf::detail::make_std_vector_async(
+    auto h_data       = cudf::detail::make_host_vector_async(
       cudf::device_span<std::byte const>{_d_buffer.data() + offset, count}, stream);
     stream.synchronize();
-    return std::make_unique<owning_buffer<std::vector<std::byte>>>(std::move(h_data));
+    return std::make_unique<owning_buffer<cudf::detail::host_vector<std::byte>>>(std::move(h_data));
   }
 
   [[nodiscard]] bool supports_device_read() const override { return true; }
diff --git a/cpp/src/text/jaccard.cu b/cpp/src/text/jaccard.cu
index e465fb79c89..e856b89b836 100644
--- a/cpp/src/text/jaccard.cu
+++ b/cpp/src/text/jaccard.cu
@@ -376,7 +376,7 @@ std::pair<rmm::device_uvector<uint32_t>, rmm::device_uvector<int64_t>> hash_subs
                           sub_offsets.begin(),
                           sub_offsets.end(),
                           indices.begin());
-      return cudf::detail::make_std_vector_sync(indices, stream);
+      return cudf::detail::make_host_vector_sync(indices, stream);
     }();
 
     // Call segmented sort with the sort sections

From 1a96e4cca188f4e0500a87c391ef105b49a42288 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 27 Aug 2024 18:57:48 -1000
Subject: [PATCH 738/842] Fix loc/iloc.__setitem__[:, loc] with non cupy types
 (#16677)

Discovered in https://github.com/rapidsai/cudf/pull/16652, `DataFrame.iloc/loc.__setitem__` with a non-cupy type e.g. `"category"` failed because the indexing path unconditionally tries to `cupy.asarray` the value to be set which only accepts types recognized by cupy.

We can skip this `asarray` if we have a numpy/pandas/cudf object

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16677
---
 python/cudf/cudf/core/dataframe.py      | 10 ++++++----
 python/cudf/cudf/tests/test_indexing.py | 10 ++++++++++
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 14b63c2b0d7..d54a800aedf 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -414,8 +414,9 @@ def _setitem_tuple_arg(self, key, value):
                     )
 
             else:
-                value = cupy.asarray(value)
-                if value.ndim == 2:
+                if not is_column_like(value):
+                    value = cupy.asarray(value)
+                if getattr(value, "ndim", 1) == 2:
                     # If the inner dimension is 1, it's broadcastable to
                     # all columns of the dataframe.
                     indexed_shape = columns_df.loc[key[0]].shape
@@ -558,8 +559,9 @@ def _setitem_tuple_arg(self, key, value):
         else:
             # TODO: consolidate code path with identical counterpart
             # in `_DataFrameLocIndexer._setitem_tuple_arg`
-            value = cupy.asarray(value)
-            if value.ndim == 2:
+            if not is_column_like(value):
+                value = cupy.asarray(value)
+            if getattr(value, "ndim", 1) == 2:
                 indexed_shape = columns_df.iloc[key[0]].shape
                 if value.shape[1] == 1:
                     if value.shape[0] != indexed_shape[0]:
diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py
index 716b4dc6acd..9df2852dde8 100644
--- a/python/cudf/cudf/tests/test_indexing.py
+++ b/python/cudf/cudf/tests/test_indexing.py
@@ -2369,3 +2369,13 @@ def test_duplicate_labels_raises():
         df[["a", "a"]]
     with pytest.raises(ValueError):
         df.loc[:, ["a", "a"]]
+
+
+@pytest.mark.parametrize("indexer", ["iloc", "loc"])
+@pytest.mark.parametrize("dtype", ["category", "timedelta64[ns]"])
+def test_loc_iloc_setitem_col_slice_non_cupy_types(indexer, dtype):
+    df_pd = pd.DataFrame(range(2), dtype=dtype)
+    df_cudf = cudf.DataFrame.from_pandas(df_pd)
+    getattr(df_pd, indexer)[:, 0] = getattr(df_pd, indexer)[:, 0]
+    getattr(df_cudf, indexer)[:, 0] = getattr(df_cudf, indexer)[:, 0]
+    assert_eq(df_pd, df_cudf)

From 569939f40094b266a768a270d8966c5f7277c46a Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Wed, 28 Aug 2024 08:36:14 -0500
Subject: [PATCH 739/842] Fix slowdown in DataFrame repr in jupyter notebook
 (#16656)

Fixes: #15747

This PR fixes slow-down in `DataFrame` repr inside a jupyter notebook.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Matthew Murray (https://github.com/Matt711)
  - Ray Douglass (https://github.com/raydouglass)

URL: https://github.com/rapidsai/cudf/pull/16656
---
 .gitignore                                    |  4 ++
 ci/cudf_pandas_scripts/run_tests.sh           |  3 +
 .../all_cuda-118_arch-x86_64.yaml             |  4 ++
 .../all_cuda-125_arch-x86_64.yaml             |  4 ++
 dependencies.yaml                             |  8 ++-
 python/cudf/cudf/core/dataframe.py            |  4 +-
 python/cudf/cudf/pandas/_wrappers/pandas.py   | 28 ++++++++
 .../data/repr_slow_down_test.ipynb            | 69 +++++++++++++++++++
 .../cudf_pandas_tests/test_cudf_pandas.py     | 36 ++++++++++
 python/cudf/pyproject.toml                    |  4 ++
 10 files changed, 162 insertions(+), 2 deletions(-)
 create mode 100644 python/cudf/cudf_pandas_tests/data/repr_slow_down_test.ipynb

diff --git a/.gitignore b/.gitignore
index 153c7f59744..619e1464b2a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -178,3 +178,7 @@ jupyter_execute
 # clang tooling
 compile_commands.json
 .clangd/
+
+# pytest artifacts
+rmm_log.txt
+python/cudf/cudf_pandas_tests/data/rmm_log.txt
diff --git a/ci/cudf_pandas_scripts/run_tests.sh b/ci/cudf_pandas_scripts/run_tests.sh
index 39056d58d56..52964496b36 100755
--- a/ci/cudf_pandas_scripts/run_tests.sh
+++ b/ci/cudf_pandas_scripts/run_tests.sh
@@ -61,6 +61,9 @@ else
         "$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)"
 fi
 
+python -m pip install ipykernel
+python -m ipykernel install --user --name python3
+
 python -m pytest -p cudf.pandas \
     --cov-config=./python/cudf/.coveragerc \
     --cov=cudf \
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 96596958636..c4c32da8af2 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -37,6 +37,7 @@ dependencies:
 - hypothesis
 - identify>=2.5.20
 - ipython
+- jupyter_client
 - libcufile-dev=1.4.0.31
 - libcufile=1.4.0.31
 - libcurand-dev=10.3.0.86
@@ -48,6 +49,8 @@ dependencies:
 - moto>=4.0.8
 - msgpack-python
 - myst-nb
+- nbconvert
+- nbformat
 - nbsphinx
 - ninja
 - notebook
@@ -57,6 +60,7 @@ dependencies:
 - nvcc_linux-64=11.8
 - nvcomp==3.0.6
 - nvtx>=0.2.1
+- openpyxl
 - packaging
 - pandas
 - pandas>=2.0,<2.2.3dev0
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index efc5f76b90f..7439c9543a5 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -38,6 +38,7 @@ dependencies:
 - hypothesis
 - identify>=2.5.20
 - ipython
+- jupyter_client
 - libcufile-dev
 - libcurand-dev
 - libkvikio==24.10.*,>=0.0.0a0
@@ -47,6 +48,8 @@ dependencies:
 - moto>=4.0.8
 - msgpack-python
 - myst-nb
+- nbconvert
+- nbformat
 - nbsphinx
 - ninja
 - notebook
@@ -55,6 +58,7 @@ dependencies:
 - numpydoc
 - nvcomp==3.0.6
 - nvtx>=0.2.1
+- openpyxl
 - packaging
 - pandas
 - pandas>=2.0,<2.2.3dev0
diff --git a/dependencies.yaml b/dependencies.yaml
index b55860815bf..5be291b3671 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -31,6 +31,7 @@ files:
       - test_python_cudf
       - test_python_dask_cudf
       - test_python_pylibcudf
+      - test_python_cudf_pandas
   test_static_build:
     output: none
     includes:
@@ -49,6 +50,7 @@ files:
       - test_python_common
       - test_python_cudf
       - test_python_dask_cudf
+      - test_python_cudf_pandas
   test_java:
     output: none
     includes:
@@ -934,9 +936,13 @@ dependencies:
           # installation issues with `psycopg2`.
           - pandas[test, pyarrow, performance, computation, fss, excel, parquet, feather, hdf5, spss, html, xml, plot, output-formatting, clipboard, compression]
           - pytest-reportlog
+          - ipython
   test_python_cudf_pandas:
     common:
-      - output_types: [requirements, pyproject]
+      - output_types: [conda, requirements, pyproject]
         packages:
           - ipython
+          - jupyter_client
+          - nbconvert
+          - nbformat
           - openpyxl
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index d54a800aedf..a309b9117eb 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -680,7 +680,9 @@ class DataFrame(IndexedFrame, Serializable, GetAttrGetItemMixin):
     3  3   0.3
     """
 
-    _PROTECTED_KEYS = frozenset(("_data", "_index"))
+    _PROTECTED_KEYS = frozenset(
+        ("_data", "_index", "_ipython_canary_method_should_not_exist_")
+    )
     _accessors: set[Any] = set()
     _loc_indexer_type = _DataFrameLocIndexer
     _iloc_indexer_type = _DataFrameIlocIndexer
diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py
index 478108f36f1..6d03063fa27 100644
--- a/python/cudf/cudf/pandas/_wrappers/pandas.py
+++ b/python/cudf/cudf/pandas/_wrappers/pandas.py
@@ -61,6 +61,12 @@
     TimeGrouper as pd_TimeGrouper,
 )
 
+try:
+    from IPython import get_ipython
+
+    ipython_shell = get_ipython()
+except ImportError:
+    ipython_shell = None
 
 cudf.set_option("mode.pandas_compatible", True)
 
@@ -208,6 +214,12 @@ def _DataFrame__dir__(self):
     ]
 
 
+def ignore_ipython_canary_check(self, **kwargs):
+    raise AttributeError(
+        "_ipython_canary_method_should_not_exist_ doesn't exist"
+    )
+
+
 DataFrame = make_final_proxy_type(
     "DataFrame",
     cudf.DataFrame,
@@ -220,10 +232,26 @@ def _DataFrame__dir__(self):
         "_constructor": _FastSlowAttribute("_constructor"),
         "_constructor_sliced": _FastSlowAttribute("_constructor_sliced"),
         "_accessors": set(),
+        "_ipython_canary_method_should_not_exist_": ignore_ipython_canary_check,
     },
 )
 
 
+def custom_repr_html(obj):
+    # This custom method is need to register a html format
+    # for ipython
+    return _fast_slow_function_call(
+        lambda obj: obj._repr_html_(),
+        obj,
+    )[0]
+
+
+if ipython_shell:
+    # See: https://ipython.readthedocs.io/en/stable/config/integrating.html#formatters-for-third-party-types
+    html_formatter = ipython_shell.display_formatter.formatters["text/html"]
+    html_formatter.for_type(DataFrame, custom_repr_html)
+
+
 Series = make_final_proxy_type(
     "Series",
     cudf.Series,
diff --git a/python/cudf/cudf_pandas_tests/data/repr_slow_down_test.ipynb b/python/cudf/cudf_pandas_tests/data/repr_slow_down_test.ipynb
new file mode 100644
index 00000000000..c7d39b78810
--- /dev/null
+++ b/python/cudf/cudf_pandas_tests/data/repr_slow_down_test.ipynb
@@ -0,0 +1,69 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%load_ext cudf.pandas"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "\n",
+    "np.random.seed(0)\n",
+    "\n",
+    "num_rows = 25_000_000\n",
+    "num_columns = 12\n",
+    "\n",
+    "# Create a DataFrame with random data\n",
+    "df = pd.DataFrame(np.random.randint(0, 100, size=(num_rows, num_columns)),\n",
+    "                  columns=[f'Column_{i}' for i in range(1, num_columns + 1)])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
index 028f5f173ac..0827602852d 100644
--- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
+++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
@@ -14,9 +14,12 @@
 import types
 from io import BytesIO, StringIO
 
+import jupyter_client
+import nbformat
 import numpy as np
 import pyarrow as pa
 import pytest
+from nbconvert.preprocessors import ExecutePreprocessor
 from numba import NumbaDeprecationWarning
 from pytz import utc
 
@@ -1650,3 +1653,36 @@ def test_change_index_name(index):
 
         assert s.index.name == name
         assert df.index.name == name
+
+
+def test_notebook_slow_repr():
+    notebook_filename = (
+        os.path.dirname(os.path.abspath(__file__))
+        + "/data/repr_slow_down_test.ipynb"
+    )
+    with open(notebook_filename, "r", encoding="utf-8") as f:
+        nb = nbformat.read(f, as_version=4)
+
+    ep = ExecutePreprocessor(
+        timeout=20, kernel_name=jupyter_client.KernelManager().kernel_name
+    )
+
+    try:
+        ep.preprocess(nb, {"metadata": {"path": "./"}})
+    except Exception as e:
+        assert False, f"Error executing the notebook: {e}"
+
+    # Collect the outputs
+    html_result = nb.cells[2]["outputs"][0]["data"]["text/html"]
+    for string in {
+        "div",
+        "Column_1",
+        "Column_2",
+        "Column_3",
+        "Column_4",
+        "tbody",
+        "</table>",
+    }:
+        assert (
+            string in html_result
+        ), f"Expected string {string} not found in the output"
diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml
index 8386935fab0..0c1d5015078 100644
--- a/python/cudf/pyproject.toml
+++ b/python/cudf/pyproject.toml
@@ -63,11 +63,15 @@ test = [
     "tzdata",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 pandas-tests = [
+    "ipython",
     "pandas[test, pyarrow, performance, computation, fss, excel, parquet, feather, hdf5, spss, html, xml, plot, output-formatting, clipboard, compression]",
     "pytest-reportlog",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 cudf-pandas-tests = [
     "ipython",
+    "jupyter_client",
+    "nbconvert",
+    "nbformat",
     "openpyxl",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 

From 5491b394921ca3e03f09c9e789f1ba00da9db0b1 Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Wed, 28 Aug 2024 11:03:10 -0500
Subject: [PATCH 740/842] switch from typing.Callable to
 collections.abc.Callable (#16670)

Follow-up to #16637.

Once this project's minimum support Python version was bumped up to Python 3.10, `ruff` started raising this error from `pyupgrade`:

```text
Import from `collections.abc` instead: `Callable`
```

* ruff docs: https://docs.astral.sh/ruff/rules/deprecated-import/
* `typing` docs saying that `typing.Callable` is deprecated starting in Python 3.9 https://docs.python.org/3/library/typing.html#typing.Callable
* context: https://github.com/rapidsai/cudf/pull/16637#discussion_r1727482177

This proposes accepting that suggestion, so that `cudf` won't be broken whenever `Callable` is removed from the `typing` module.

Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)

URL: https://github.com/rapidsai/cudf/pull/16670
---
 pyproject.toml                                    | 4 +++-
 python/cudf/cudf/_typing.py                       | 3 ++-
 python/cudf/cudf/core/column/numerical.py         | 4 +++-
 python/cudf/cudf/core/column_accessor.py          | 4 ++--
 python/cudf/cudf/core/dataframe.py                | 4 ++--
 python/cudf/cudf/core/dtypes.py                   | 4 +++-
 python/cudf/cudf/core/frame.py                    | 4 ++--
 python/cudf/cudf/core/udf/utils.py                | 5 ++++-
 python/cudf/cudf/io/parquet.py                    | 6 +++++-
 python/cudf/cudf/options.py                       | 4 ++--
 python/cudf/cudf/pandas/fast_slow_proxy.py        | 4 ++--
 python/cudf/cudf/utils/ioutils.py                 | 2 +-
 python/cudf_polars/cudf_polars/dsl/ir.py          | 4 ++--
 python/cudf_polars/cudf_polars/typing/__init__.py | 3 ++-
 python/cudf_polars/pyproject.toml                 | 1 -
 python/dask_cudf/dask_cudf/io/json.py             | 2 +-
 16 files changed, 36 insertions(+), 22 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index e15cb7b3cdd..8f9aa165e5a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -87,7 +87,9 @@ select = [
     # non-pep585-annotation
     "UP006",
     # non-pep604-annotation
-    "UP007"
+    "UP007",
+    # Import from `collections.abc` instead: `Callable`
+    "UP035",
 ]
 ignore = [
     # whitespace before :
diff --git a/python/cudf/cudf/_typing.py b/python/cudf/cudf/_typing.py
index 34c96cc8cb3..6e8ad556b08 100644
--- a/python/cudf/cudf/_typing.py
+++ b/python/cudf/cudf/_typing.py
@@ -1,7 +1,8 @@
 # Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
 import sys
-from typing import TYPE_CHECKING, Any, Callable, Dict, Iterable, TypeVar, Union
+from collections.abc import Callable
+from typing import TYPE_CHECKING, Any, Dict, Iterable, TypeVar, Union
 
 import numpy as np
 from pandas import Period, Timedelta, Timestamp
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index 90bec049831..7f391c8a79c 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -3,7 +3,7 @@
 from __future__ import annotations
 
 import functools
-from typing import TYPE_CHECKING, Any, Callable, Sequence, cast
+from typing import TYPE_CHECKING, Any, Sequence, cast
 
 import numpy as np
 import pandas as pd
@@ -28,6 +28,8 @@
 from .numerical_base import NumericalBaseColumn
 
 if TYPE_CHECKING:
+    from collections.abc import Callable
+
     from cudf._typing import (
         ColumnBinaryOperand,
         ColumnLike,
diff --git a/python/cudf/cudf/core/column_accessor.py b/python/cudf/cudf/core/column_accessor.py
index 34076fa0060..09b0f453692 100644
--- a/python/cudf/cudf/core/column_accessor.py
+++ b/python/cudf/cudf/core/column_accessor.py
@@ -6,7 +6,7 @@
 import sys
 from collections import abc
 from functools import cached_property, reduce
-from typing import TYPE_CHECKING, Any, Callable, Mapping, cast
+from typing import TYPE_CHECKING, Any, Mapping, cast
 
 import numpy as np
 import pandas as pd
@@ -639,7 +639,7 @@ def _pad_key(
 
     def rename_levels(
         self,
-        mapper: Mapping[abc.Hashable, abc.Hashable] | Callable,
+        mapper: Mapping[abc.Hashable, abc.Hashable] | abc.Callable,
         level: int | None = None,
     ) -> Self:
         """
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index a309b9117eb..6065e0e1eeb 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -13,8 +13,8 @@
 import textwrap
 import warnings
 from collections import abc, defaultdict
-from collections.abc import Iterator
-from typing import TYPE_CHECKING, Any, Callable, Literal, MutableMapping, cast
+from collections.abc import Callable, Iterator
+from typing import TYPE_CHECKING, Any, Literal, MutableMapping, cast
 
 import cupy
 import numba
diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py
index 6d532e01cba..2110e610c37 100644
--- a/python/cudf/cudf/core/dtypes.py
+++ b/python/cudf/cudf/core/dtypes.py
@@ -7,7 +7,7 @@
 import textwrap
 import warnings
 from functools import cached_property
-from typing import TYPE_CHECKING, Any, Callable
+from typing import TYPE_CHECKING, Any
 
 import numpy as np
 import pandas as pd
@@ -27,6 +27,8 @@
     PANDAS_NUMPY_DTYPE = pd.core.dtypes.dtypes.PandasDtype
 
 if TYPE_CHECKING:
+    from collections.abc import Callable
+
     from cudf._typing import Dtype
     from cudf.core.buffer import Buffer
 
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index 3e1efd7c97a..cbe1e97d834 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -6,7 +6,7 @@
 import pickle
 import warnings
 from collections import abc
-from typing import TYPE_CHECKING, Any, Callable, Literal, MutableMapping
+from typing import TYPE_CHECKING, Any, Literal, MutableMapping
 
 # TODO: The `numpy` import is needed for typing purposes during doc builds
 # only, need to figure out why the `np` alias is insufficient then remove.
@@ -403,7 +403,7 @@ def __arrow_array__(self, type=None):
     @_performance_tracking
     def _to_array(
         self,
-        get_array: Callable,
+        get_array: abc.Callable,
         module: ModuleType,
         copy: bool,
         dtype: Dtype | None = None,
diff --git a/python/cudf/cudf/core/udf/utils.py b/python/cudf/cudf/core/udf/utils.py
index d616761cb3b..6d7362952c9 100644
--- a/python/cudf/cudf/core/udf/utils.py
+++ b/python/cudf/cudf/core/udf/utils.py
@@ -3,7 +3,7 @@
 
 import functools
 import os
-from typing import Any, Callable
+from typing import TYPE_CHECKING, Any
 
 import cachetools
 import cupy as cp
@@ -41,6 +41,9 @@
 from cudf.utils.performance_tracking import _performance_tracking
 from cudf.utils.utils import initfunc
 
+if TYPE_CHECKING:
+    from collections.abc import Callable
+
 # Maximum size of a string column is 2 GiB
 _STRINGS_UDF_DEFAULT_HEAP_SIZE = os.environ.get("STRINGS_UDF_HEAP_SIZE", 2**31)
 _heap_size = 0
diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
index 6b895abbf66..d6b2ae2f31c 100644
--- a/python/cudf/cudf/io/parquet.py
+++ b/python/cudf/cudf/io/parquet.py
@@ -10,7 +10,7 @@
 from collections import defaultdict
 from contextlib import ExitStack
 from functools import partial, reduce
-from typing import Callable
+from typing import TYPE_CHECKING
 from uuid import uuid4
 
 import numpy as np
@@ -24,6 +24,10 @@
 from cudf.utils import ioutils
 from cudf.utils.performance_tracking import _performance_tracking
 
+if TYPE_CHECKING:
+    from collections.abc import Callable
+
+
 BYTE_SIZES = {
     "kb": 1000,
     "mb": 1000000,
diff --git a/python/cudf/cudf/options.py b/python/cudf/cudf/options.py
index 94e73021cec..df7bbe22a61 100644
--- a/python/cudf/cudf/options.py
+++ b/python/cudf/cudf/options.py
@@ -5,10 +5,10 @@
 import textwrap
 from contextlib import ContextDecorator
 from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Callable
+from typing import TYPE_CHECKING, Any
 
 if TYPE_CHECKING:
-    from collections.abc import Container
+    from collections.abc import Callable, Container
 
 
 @dataclass
diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py
index bb678fd1efe..4b0fd9a5b36 100644
--- a/python/cudf/cudf/pandas/fast_slow_proxy.py
+++ b/python/cudf/cudf/pandas/fast_slow_proxy.py
@@ -10,9 +10,9 @@
 import pickle
 import types
 import warnings
-from collections.abc import Iterator
+from collections.abc import Callable, Iterator
 from enum import IntEnum
-from typing import Any, Callable, Literal, Mapping
+from typing import Any, Literal, Mapping
 
 import numpy as np
 
diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
index e5944d7093c..94974e595b1 100644
--- a/python/cudf/cudf/utils/ioutils.py
+++ b/python/cudf/cudf/utils/ioutils.py
@@ -4,9 +4,9 @@
 import os
 import urllib
 import warnings
+from collections.abc import Callable
 from io import BufferedWriter, BytesIO, IOBase, TextIOWrapper
 from threading import Thread
-from typing import Callable
 
 import fsspec
 import fsspec.implementations.local
diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py
index ebc7dee6bfb..e334e6f5cc5 100644
--- a/python/cudf_polars/cudf_polars/dsl/ir.py
+++ b/python/cudf_polars/cudf_polars/dsl/ir.py
@@ -18,7 +18,7 @@
 import types
 from functools import cache
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, ClassVar
+from typing import TYPE_CHECKING, Any, ClassVar
 
 import pyarrow as pa
 import pylibcudf as plc
@@ -31,7 +31,7 @@
 from cudf_polars.utils import sorting
 
 if TYPE_CHECKING:
-    from collections.abc import MutableMapping
+    from collections.abc import Callable, MutableMapping
     from typing import Literal
 
     from cudf_polars.typing import Schema
diff --git a/python/cudf_polars/cudf_polars/typing/__init__.py b/python/cudf_polars/cudf_polars/typing/__init__.py
index 5276073e62a..adab10bdded 100644
--- a/python/cudf_polars/cudf_polars/typing/__init__.py
+++ b/python/cudf_polars/cudf_polars/typing/__init__.py
@@ -13,7 +13,8 @@
 from polars.polars import _expr_nodes as pl_expr, _ir_nodes as pl_ir
 
 if TYPE_CHECKING:
-    from typing import Callable, TypeAlias
+    from collections.abc import Callable
+    from typing import TypeAlias
 
     import polars as pl
 
diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml
index 0382e3ce6a2..f2bab9e6623 100644
--- a/python/cudf_polars/pyproject.toml
+++ b/python/cudf_polars/pyproject.toml
@@ -115,7 +115,6 @@ ignore = [
   # tryceratops
   "TRY003", # Avoid specifying long messages outside the exception class
   # pyupgrade
-  "UP035",  # Import from `collections.abc` instead: `Callable`
   "UP038",  # Use `X | Y` in `isinstance` call instead of `(X, Y)`
   # Lints below are turned off because of conflicts with the ruff
   # formatter
diff --git a/python/dask_cudf/dask_cudf/io/json.py b/python/dask_cudf/dask_cudf/io/json.py
index 8705d98e9d6..98c5ceedb76 100644
--- a/python/dask_cudf/dask_cudf/io/json.py
+++ b/python/dask_cudf/dask_cudf/io/json.py
@@ -81,7 +81,7 @@ def read_json(
 
         If str, this value will be used as the ``engine`` argument
         when :func:`cudf.read_json` is used to create each partition.
-        If a :obj:`~typing.Callable`, this value will be used as the
+        If a :obj:`~collections.abc.Callable`, this value will be used as the
         underlying function used to create each partition from JSON
         data. The default value is "auto", so that
         ``engine=partial(cudf.read_json, engine="auto")`` will be

From c600a65e4fd82a4a6eb00feaee032b62872de761 Mon Sep 17 00:00:00 2001
From: "Richard (Rick) Zamora" <rzamora217@gmail.com>
Date: Wed, 28 Aug 2024 09:55:11 -0700
Subject: [PATCH 741/842] Update documentation for Dask cuDF (#16671)

General documentation update for Dask cuDF:

- Adds `README.md` file to `dask_cudf` (this is currently a symlink to cudf's README, which isn't terribly helpful)
- Emphasizes direct usage of the `dask.dataframe` API (rather than the explicit `dask_cudf` API)
  - Including the `to_backend` API
- Advertises query-planning support
- Includes a simple Dask CUDA example (and best-practices link)

Authors:
  - Richard (Rick) Zamora (https://github.com/rjzamora)

Approvers:
  - Mads R. B. Kristensen (https://github.com/madsbk)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16671
---
 docs/cudf/source/user_guide/10min.ipynb |   6 +-
 python/dask_cudf/README.md              | 136 +++++++++++++++++++++++-
 2 files changed, 140 insertions(+), 2 deletions(-)
 mode change 120000 => 100644 python/dask_cudf/README.md

diff --git a/docs/cudf/source/user_guide/10min.ipynb b/docs/cudf/source/user_guide/10min.ipynb
index c3da2558db8..2eaa75b3189 100644
--- a/docs/cudf/source/user_guide/10min.ipynb
+++ b/docs/cudf/source/user_guide/10min.ipynb
@@ -15,7 +15,11 @@
     "\n",
     "[Dask](https://dask.org/) is a flexible library for parallel computing in Python that makes scaling out your workflow smooth and simple. On the CPU, Dask uses Pandas to execute operations in parallel on DataFrame partitions.\n",
     "\n",
-    "[Dask-cuDF](https://github.com/rapidsai/cudf/tree/main/python/dask_cudf) extends Dask where necessary to allow its DataFrame partitions to be processed using cuDF GPU DataFrames instead of Pandas DataFrames. For instance, when you call `dask_cudf.read_csv(...)`, your cluster's GPUs do the work of parsing the CSV file(s) by calling [`cudf.read_csv()`](https://docs.rapids.ai/api/cudf/stable/api_docs/api/cudf.read_csv.html).\n",
+    "[Dask cuDF](https://github.com/rapidsai/cudf/tree/main/python/dask_cudf) extends Dask where necessary to allow its DataFrame partitions to be processed using cuDF GPU DataFrames instead of Pandas DataFrames. For instance, when you call `dask_cudf.read_csv(...)`, your cluster's GPUs do the work of parsing the CSV file(s) by calling [`cudf.read_csv()`](https://docs.rapids.ai/api/cudf/stable/api_docs/api/cudf.read_csv.html).\n",
+    "\n",
+    "\n",
+    "> [!NOTE]  \n",
+    "> This notebook uses the explicit Dask cuDF API (`dask_cudf`) for clarity. However, we strongly recommend that you use Dask's [configuration infrastructure](https://docs.dask.org/en/latest/configuration.html) to set the `\"dataframe.backend\"` to `\"cudf\"`, and work with the `dask.dataframe` API directly. Please see the [Dask cuDF documentation](https://github.com/rapidsai/cudf/tree/main/python/dask_cudf) for more information.\n",
     "\n",
     "\n",
     "## When to use cuDF and Dask-cuDF\n",
diff --git a/python/dask_cudf/README.md b/python/dask_cudf/README.md
deleted file mode 120000
index fe840054137..00000000000
--- a/python/dask_cudf/README.md
+++ /dev/null
@@ -1 +0,0 @@
-../../README.md
\ No newline at end of file
diff --git a/python/dask_cudf/README.md b/python/dask_cudf/README.md
new file mode 100644
index 00000000000..6edb9f87d48
--- /dev/null
+++ b/python/dask_cudf/README.md
@@ -0,0 +1,135 @@
+# <div align="left"><img src="../../img/rapids_logo.png" width="90px"/>&nbsp;Dask cuDF - A GPU Backend for Dask DataFrame</div>
+
+Dask cuDF (a.k.a. dask-cudf or `dask_cudf`) is an extension library for [Dask DataFrame](https://docs.dask.org/en/stable/dataframe.html). When installed, Dask cuDF is automatically registered as the `"cudf"` [dataframe backend](https://docs.dask.org/en/stable/how-to/selecting-the-collection-backend.html) for Dask DataFrame.
+
+## Using Dask cuDF
+
+### The Dask DataFrame API (Recommended)
+
+Simply set the `"dataframe.backend"` [configuration](https://docs.dask.org/en/stable/configuration.html) to `"cudf"` in Dask, and the public Dask DataFrame API will leverage `cudf` automatically:
+
+```python
+import dask
+dask.config.set({"dataframe.backend": "cudf"})
+
+import dask.dataframe as dd
+# This gives us a cuDF-backed dataframe
+df = dd.read_parquet("data.parquet", ...)
+```
+
+> [!IMPORTANT]
+> The `"dataframe.backend"` configuration will only be used for collection creation when the following APIs are used: `read_parquet`, `read_json`, `read_csv`, `read_orc`, `read_hdf`, and `from_dict`. For example, if `from_map`, `from_pandas`, `from_delayed`, or `from_array` are used, the backend of the new collection will depend on the input to the function:
+
+```python
+import pandas as pd
+import cudf
+
+# This gives us a Pandas-backed dataframe
+dd.from_pandas(pd.DataFrame({"a": range(10)}))
+
+# This gives us a cuDF-backed dataframe
+dd.from_pandas(cudf.DataFrame({"a": range(10)}))
+```
+
+A cuDF-backed DataFrame collection can be moved to the `"pandas"` backend:
+
+```python
+df = df.to_backend("pandas")
+```
+
+Similarly, a Pandas-backed DataFrame collection can be moved to the `"cudf"` backend:
+
+```python
+df = df.to_backend("cudf")
+```
+
+### The Explicit Dask cuDF API
+
+In addition to providing the `"cudf"` backend for Dask DataFrame, Dask cuDF also provides an explicit `dask_cudf` API:
+
+```python
+import dask_cudf
+
+# This always gives us a cuDF-backed dataframe
+df = dask_cudf.read_parquet("data.parquet", ...)
+```
+
+> [!NOTE]
+> This API is used implicitly by the Dask DataFrame API when the `"cudf"` backend is enabled. Therefore, using it directly will not provide any performance benefit over the CPU/GPU-portable `dask.dataframe` API. Also, using some parts of the explicit API are incompatible with automatic query planning (see the next section).
+
+See the [Dask cuDF's API documentation](https://docs.rapids.ai/api/dask-cudf/stable/) for further information.
+
+## Query Planning
+
+Dask cuDF now provides automatic query planning by default (RAPIDS 24.06+). As long as the `"dataframe.query-planning"` configuration is set to `True` (the default) when `dask.dataframe` is first imported, [Dask Expressions](https://github.com/dask/dask-expr) will be used under the hood.
+
+For example, the following user code will automatically benefit from predicate pushdown when the result is computed.
+
+```python
+df = dd.read_parquet("/my/parquet/dataset/")
+result = df.sort_values('B')['A']
+```
+
+Unoptimized expression graph (`df.pprint()`):
+```
+Projection: columns='A'
+  SortValues: by=['B'] shuffle_method='tasks' options={}
+    ReadParquetFSSpec: path='/my/parquet/dataset/' ...
+```
+
+Simplified expression graph (`df.simplify().pprint()`):
+```
+Projection: columns='A'
+  SortValues: by=['B'] shuffle_method='tasks' options={}
+    ReadParquetFSSpec: path='/my/parquet/dataset/' columns=['A', 'B'] ...
+```
+
+> [!NOTE]
+> Dask will automatically simplify the expression graph (within `optimize`) when the result is converted to a task graph (via `compute` or `persist`). The user does not need to call `simplify` themself.
+
+
+## Using Multiple GPUs and Multiple Nodes
+
+Whenever possible, Dask cuDF (i.e. Dask DataFrame) will automatically try to partition your data into small-enough tasks to fit comfortably in the memory of a single GPU. This means the necessary compute tasks needed to compute a query can often be streamed to a single GPU process for out-of-core computing. This also means that the compute tasks can be executed in parallel over a multi-GPU cluster.
+
+> [!IMPORTANT]
+> Neither Dask cuDF nor Dask DataFrame provide support for multi-GPU or multi-node execution on their own. You must deploy a distributed cluster (ideally with [Dask CUDA](https://docs.rapids.ai/api/dask-cuda/stable/)) to leverage multiple GPUs.
+
+In order to execute your Dask workflow on multiple GPUs, you will typically need to use [Dask CUDA](https://docs.rapids.ai/api/dask-cuda/stable/) to deploy distributed Dask cluster, and [Distributed](https://distributed.dask.org/en/stable/client.html) to define a `client` object. For example:
+
+```python
+
+from dask_cuda import LocalCUDACluster
+from distributed import Client
+
+client = Client(
+    LocalCUDACluster(
+        CUDA_VISIBLE_DEVICES="0,1",  # Use two workers (on devices 0 and 1)
+        rmm_pool_size=0.9,  # Use 90% of GPU memory as a pool for faster allocations
+        enable_cudf_spill=True,  # Improve device memory stability
+        local_directory="/fast/scratch/",  # Use fast local storage for spilling
+    )
+)
+
+df = dd.read_parquet("/my/parquet/dataset/")
+agg = df.groupby('B').sum()
+agg.compute()  # This will use the cluster defined above
+```
+
+> [!NOTE]
+> This example uses `compute` to materialize a concrete `cudf.DataFrame` object in local memory. Never call `compute` on a large collection that cannot fit comfortably in the memory of a single GPU! See Dask's [documentation on managing computation](https://distributed.dask.org/en/stable/manage-computation.html) for more details.
+
+Please see the [Dask CUDA](https://docs.rapids.ai/api/dask-cuda/stable/) documentation for more information about deploying GPU-aware clusters (including [best practices](https://docs.rapids.ai/api/dask-cuda/stable/examples/best-practices/)).
+
+## Install
+
+See the [RAPIDS install page](https://docs.rapids.ai/install) for the most up-to-date information and commands for installing Dask cuDF and other RAPIDS packages.
+
+## Resources
+
+- [Dask cuDF API documentation](https://docs.rapids.ai/api/dask-cudf/stable/)
+- [cuDF API documentation](https://docs.rapids.ai/api/cudf/stable/)
+- [10 Minutes to cuDF and Dask cuDF](https://docs.rapids.ai/api/cudf/stable/user_guide/10min/)
+- [Dask CUDA documentation](https://docs.rapids.ai/api/dask-cuda/stable/)
+- [Deployment](https://docs.rapids.ai/deployment/stable/)
+- [RAPIDS Community](https://rapids.ai/learn-more/#get-involved): Get help, contribute, and collaborate.

From 872e01e8c11fe61051d7be46f09f285252f2c6ac Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Wed, 28 Aug 2024 12:08:38 -0500
Subject: [PATCH 742/842] Fix slowdown in `CategoricalIndex.__repr__` (#16665)

Fixes: #13297

This PR fixes a slow-down in performing repr of a `CategoricalIndex` when there are too many unique values. There was no other choice to fix this in a better way by using public APIs, because all the public APIs seem to be performing categories validation even if `fastpath=True`.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Matthew Murray (https://github.com/Matt711)

URL: https://github.com/rapidsai/cudf/pull/16665
---
 python/cudf/cudf/core/index.py      | 16 +++++++++++++++-
 python/cudf/cudf/testing/_utils.py  | 21 +++++++++++++++++++++
 python/cudf/cudf/tests/test_repr.py | 11 +++++++++++
 3 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index df8af856f4f..27c6556f976 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -1443,7 +1443,21 @@ def __repr__(self):
                     output[:break_idx].replace("'", "") + output[break_idx:]
                 )
             else:
-                output = repr(preprocess.to_pandas())
+                # Too many non-unique categories will cause
+                # the output to take too long. In this case, we
+                # split the categories into data and categories
+                # and generate the repr separately and
+                # merge them.
+                pd_cats = pd.Categorical(
+                    preprocess.astype(preprocess.categories.dtype).to_pandas()
+                )
+                pd_preprocess = pd.CategoricalIndex(pd_cats)
+                data_repr = repr(pd_preprocess).split("\n")
+                pd_preprocess.dtype._categories = (
+                    preprocess.categories.to_pandas()
+                )
+                cats_repr = repr(pd_preprocess).split("\n")
+                output = "\n".join(data_repr[:-1] + cats_repr[-1:])
 
             output = output.replace("nan", str(cudf.NA))
         elif preprocess._values.nullable:
diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py
index a6a2d4eea00..540f12c8382 100644
--- a/python/cudf/cudf/testing/_utils.py
+++ b/python/cudf/cudf/testing/_utils.py
@@ -1,6 +1,7 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 import itertools
+import signal
 import string
 from collections import abc
 from contextlib import contextmanager
@@ -368,3 +369,23 @@ def sv_to_udf_str_testing_lowering(context, builder, sig, args):
     return cast_string_view_to_udf_string(
         context, builder, sig.args[0], sig.return_type, args[0]
     )
+
+
+class cudf_timeout:
+    """
+    Context manager to raise a TimeoutError after a specified number of seconds.
+    """
+
+    def __init__(self, seconds, *, timeout_message=""):
+        self.seconds = int(seconds)
+        self.timeout_message = timeout_message
+
+    def _timeout_handler(self, signum, frame):
+        raise TimeoutError(self.timeout_message)
+
+    def __enter__(self):
+        signal.signal(signal.SIGALRM, self._timeout_handler)
+        signal.alarm(self.seconds)
+
+    def __exit__(self, type, value, traceback):
+        signal.alarm(0)
diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py
index a013745f71e..57eef9e3463 100644
--- a/python/cudf/cudf/tests/test_repr.py
+++ b/python/cudf/cudf/tests/test_repr.py
@@ -1480,3 +1480,14 @@ def test_interval_index_repr():
     gi = cudf.from_pandas(pi)
 
     assert repr(pi) == repr(gi)
+
+
+def test_large_unique_categories_repr():
+    # Unfortunately, this is a long running test (takes about 1 minute)
+    # and there is no way we can reduce the time
+    pi = pd.CategoricalIndex(range(100_000_000))
+    gi = cudf.CategoricalIndex(range(100_000_000))
+    expected_repr = repr(pi)
+    with utils.cudf_timeout(2, timeout_message="Failed to repr fast enough"):
+        actual_repr = repr(gi)
+    assert expected_repr == actual_repr

From dba6c1fe37bbc4a3b15123bfd3a5c1d5cf693fe3 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 28 Aug 2024 09:29:38 -1000
Subject: [PATCH 743/842] Remove build_categorical_column in favor of
 CategoricalColumn constructor (#16617)

`build_categorical_column` was largely redundant with the CategoricalColumn constructor, so in the spirit of having One Way to Do Things, replacing the former with the latter.

There is usage of `build_categorical_column` in cugraph that has been replaced in https://github.com/rapidsai/cugraph/pull/4618

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Richard (Rick) Zamora (https://github.com/rjzamora)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16617
---
 python/cudf/cudf/core/_internals/where.py    |  13 -
 python/cudf/cudf/core/column/__init__.py     |   1 -
 python/cudf/cudf/core/column/categorical.py  | 266 +++++++++----------
 python/cudf/cudf/core/column/column.py       |  96 +++----
 python/cudf/cudf/core/column/numerical.py    |  20 +-
 python/cudf/cudf/core/cut.py                 |  17 +-
 python/cudf/cudf/core/dataframe.py           |  33 +--
 python/cudf/cudf/core/df_protocol.py         |  28 +-
 python/cudf/cudf/core/frame.py               |  25 +-
 python/cudf/cudf/core/index.py               |  18 +-
 python/cudf/cudf/core/indexed_frame.py       |  12 +-
 python/cudf/cudf/core/series.py              |  21 +-
 python/cudf/cudf/core/single_column_frame.py |   7 +-
 python/cudf/cudf/io/parquet.py               |  16 +-
 python/dask_cudf/dask_cudf/backends.py       |  19 +-
 python/dask_cudf/dask_cudf/io/parquet.py     |  12 +-
 16 files changed, 284 insertions(+), 320 deletions(-)

diff --git a/python/cudf/cudf/core/_internals/where.py b/python/cudf/cudf/core/_internals/where.py
index 0c754317185..2199d4d5ba5 100644
--- a/python/cudf/cudf/core/_internals/where.py
+++ b/python/cudf/cudf/core/_internals/where.py
@@ -106,19 +106,6 @@ def _check_and_cast_columns_with_other(
     return _normalize_categorical(source_col.astype(common_dtype), other)
 
 
-def _make_categorical_like(result, column):
-    if isinstance(column, cudf.core.column.CategoricalColumn):
-        result = cudf.core.column.build_categorical_column(
-            categories=column.categories,
-            codes=result,
-            mask=result.base_mask,
-            size=result.size,
-            offset=result.offset,
-            ordered=column.ordered,
-        )
-    return result
-
-
 def _can_cast(from_dtype, to_dtype):
     """
     Utility function to determine if we can cast
diff --git a/python/cudf/cudf/core/column/__init__.py b/python/cudf/cudf/core/column/__init__.py
index e7119fcdf47..5781d77ee9a 100644
--- a/python/cudf/cudf/core/column/__init__.py
+++ b/python/cudf/cudf/core/column/__init__.py
@@ -8,7 +8,6 @@
 from cudf.core.column.column import (
     ColumnBase,
     as_column,
-    build_categorical_column,
     build_column,
     column_empty,
     column_empty_like,
diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index a7e98e5218f..de5ed15771d 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -52,6 +52,15 @@
 _DEFAULT_CATEGORICAL_VALUE = np.int8(-1)
 
 
+def as_unsigned_codes(
+    num_cats: int, codes: NumericalColumn
+) -> NumericalColumn:
+    codes_dtype = min_unsigned_type(num_cats)
+    return cast(
+        cudf.core.column.numerical.NumericalColumn, codes.astype(codes_dtype)
+    )
+
+
 class CategoricalAccessor(ColumnMethods):
     """
     Accessor object for categorical properties of the Series values.
@@ -637,13 +646,12 @@ def __setitem__(self, key, value):
             value = value.codes
         codes = self.codes
         codes[key] = value
-        out = cudf.core.column.build_categorical_column(
-            categories=self.categories,
-            codes=codes,
-            mask=codes.base_mask,
+        out = type(self)(
+            data=self.data,
             size=codes.size,
-            offset=self.offset,
-            ordered=self.ordered,
+            dtype=self.dtype,
+            mask=codes.base_mask,
+            children=(codes,),
         )
         self._mimic_inplace(out, inplace=True)
 
@@ -669,16 +677,13 @@ def _fill(
 
     def slice(self, start: int, stop: int, stride: int | None = None) -> Self:
         codes = self.codes.slice(start, stop, stride)
-        return cast(
-            Self,
-            cudf.core.column.build_categorical_column(
-                categories=self.categories,
-                codes=codes,
-                mask=codes.base_mask,
-                ordered=self.ordered,
-                size=codes.size,
-                offset=codes.offset,
-            ),
+        return type(self)(
+            data=self.data,  # type: ignore[arg-type]
+            size=codes.size,
+            dtype=self.dtype,
+            mask=codes.base_mask,
+            offset=codes.offset,
+            children=(codes,),
         )
 
     def _reduce(
@@ -719,7 +724,7 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
             )
         return self.codes._binaryop(other.codes, op)
 
-    def normalize_binop_value(self, other: ScalarLike) -> CategoricalColumn:
+    def normalize_binop_value(self, other: ScalarLike) -> Self:
         if isinstance(other, column.ColumnBase):
             if not isinstance(other, CategoricalColumn):
                 return NotImplemented
@@ -727,30 +732,27 @@ def normalize_binop_value(self, other: ScalarLike) -> CategoricalColumn:
                 raise TypeError(
                     "Categoricals can only compare with the same type"
                 )
-            return other
-
-        ary = column.as_column(
+            return cast(Self, other)
+        codes = column.as_column(
             self._encode(other), length=len(self), dtype=self.codes.dtype
         )
-        return column.build_categorical_column(
-            categories=self.dtype.categories._values,
-            codes=column.as_column(ary),
+        return type(self)(
+            data=None,
+            size=self.size,
+            dtype=self.dtype,
             mask=self.base_mask,
-            ordered=self.dtype.ordered,
+            children=(codes,),  # type: ignore[arg-type]
         )
 
-    def sort_values(
-        self, ascending: bool = True, na_position="last"
-    ) -> CategoricalColumn:
+    def sort_values(self, ascending: bool = True, na_position="last") -> Self:
         codes = self.codes.sort_values(ascending, na_position)
-        col = column.build_categorical_column(
-            categories=self.dtype.categories._values,
-            codes=codes,
-            mask=codes.base_mask,
+        return type(self)(
+            data=self.data,  # type: ignore[arg-type]
             size=codes.size,
-            ordered=self.dtype.ordered,
+            dtype=self.dtype,
+            mask=codes.base_mask,
+            children=(codes,),
         )
-        return col
 
     def element_indexing(self, index: int) -> ScalarLike:
         val = self.codes.element_indexing(index)
@@ -777,12 +779,12 @@ def to_pandas(
 
         if self.categories.dtype.kind == "f":
             new_mask = bools_to_mask(self.notnull())
-            col = column.build_categorical_column(
-                categories=self.categories,
-                codes=column.as_column(self.codes, dtype=self.codes.dtype),
+            col = type(self)(
+                data=self.data,  # type: ignore[arg-type]
+                size=self.size,
+                dtype=self.dtype,
                 mask=new_mask,
-                ordered=self.dtype.ordered,
-                size=self.codes.size,
+                children=self.children,
             )
         else:
             col = self
@@ -849,15 +851,15 @@ def data_array_view(
     ) -> numba.cuda.devicearray.DeviceNDArray:
         return self.codes.data_array_view(mode=mode)
 
-    def unique(self) -> CategoricalColumn:
+    def unique(self) -> Self:
         codes = self.codes.unique()
-        return column.build_categorical_column(
-            categories=self.categories,
-            codes=codes,
+        return type(self)(
+            data=self.data,  # type: ignore[arg-type]
+            size=codes.size,
+            dtype=self.dtype,
             mask=codes.base_mask,
             offset=codes.offset,
-            size=codes.size,
-            ordered=self.ordered,
+            children=(codes,),
         )
 
     def _encode(self, value) -> ScalarLike:
@@ -988,14 +990,17 @@ def find_and_replace(
         output = libcudf.replace.replace(
             replaced_codes, to_replace_col, replacement_col
         )
+        codes = as_unsigned_codes(len(new_cats["cats"]), output)
 
-        result = column.build_categorical_column(
-            categories=new_cats["cats"],
-            codes=output,
-            mask=output.base_mask,
-            offset=output.offset,
-            size=output.size,
-            ordered=self.dtype.ordered,
+        result = type(self)(
+            data=self.data,  # type: ignore[arg-type]
+            size=codes.size,
+            dtype=CategoricalDtype(
+                categories=new_cats["cats"], ordered=self.dtype.ordered
+            ),
+            mask=codes.base_mask,
+            offset=codes.offset,
+            children=(codes,),
         )
         if result.dtype != self.dtype:
             warnings.warn(
@@ -1082,7 +1087,7 @@ def is_monotonic_increasing(self) -> bool:
     def is_monotonic_decreasing(self) -> bool:
         return bool(self.ordered) and self.codes.is_monotonic_decreasing
 
-    def as_categorical_column(self, dtype: Dtype) -> CategoricalColumn:
+    def as_categorical_column(self, dtype: Dtype) -> Self:
         if isinstance(dtype, str) and dtype == "category":
             return self
         if isinstance(dtype, pd.CategoricalDtype):
@@ -1099,7 +1104,23 @@ def as_categorical_column(self, dtype: Dtype) -> CategoricalColumn:
         if not isinstance(self.categories, type(dtype.categories._column)):
             # If both categories are of different Column types,
             # return a column full of Nulls.
-            return _create_empty_categorical_column(self, dtype)
+            codes = cast(
+                cudf.core.column.numerical.NumericalColumn,
+                column.as_column(
+                    _DEFAULT_CATEGORICAL_VALUE,
+                    length=self.size,
+                    dtype=self.codes.dtype,
+                ),
+            )
+            codes = as_unsigned_codes(len(dtype.categories), codes)
+            return type(self)(
+                data=self.data,  # type: ignore[arg-type]
+                size=self.size,
+                dtype=dtype,
+                mask=self.base_mask,
+                offset=self.offset,
+                children=(codes,),
+            )
 
         return self.set_categories(
             new_categories=dtype.categories, ordered=bool(dtype.ordered)
@@ -1185,26 +1206,29 @@ def _concat(
             codes = [o for o in codes if len(o)]
             codes_col = libcudf.concat.concat_columns(objs)
 
-        return column.build_categorical_column(
-            categories=column.as_column(cats),
-            codes=codes_col,
-            mask=codes_col.base_mask,
+        codes_col = as_unsigned_codes(
+            len(cats),
+            cast(cudf.core.column.numerical.NumericalColumn, codes_col),
+        )
+        return CategoricalColumn(
+            data=None,
             size=codes_col.size,
+            dtype=CategoricalDtype(categories=cats),
+            mask=codes_col.base_mask,
             offset=codes_col.offset,
+            children=(codes_col,),  # type: ignore[arg-type]
         )
 
-    def _with_type_metadata(
-        self: CategoricalColumn, dtype: Dtype
-    ) -> CategoricalColumn:
+    def _with_type_metadata(self: Self, dtype: Dtype) -> Self:
         if isinstance(dtype, CategoricalDtype):
-            return column.build_categorical_column(
-                categories=dtype.categories._values,
-                codes=self.codes,
-                mask=self.codes.base_mask,
-                ordered=dtype.ordered,
+            return type(self)(
+                data=self.data,  # type: ignore[arg-type]
                 size=self.codes.size,
+                dtype=dtype,
+                mask=self.codes.base_mask,
                 offset=self.codes.offset,
                 null_count=self.codes.null_count,
+                children=(self.codes,),
             )
         return self
 
@@ -1213,7 +1237,7 @@ def set_categories(
         new_categories: Any,
         ordered: bool = False,
         rename: bool = False,
-    ) -> CategoricalColumn:
+    ) -> Self:
         # See CategoricalAccessor.set_categories.
 
         ordered = ordered if ordered is not None else self.ordered
@@ -1232,25 +1256,39 @@ def set_categories(
                     "new_categories must have the same "
                     "number of items as old categories"
                 )
-
-            out_col = column.build_categorical_column(
-                categories=new_categories,
-                codes=self.base_children[0],
-                mask=self.base_mask,
+            out_col = type(self)(
+                data=self.data,  # type: ignore[arg-type]
                 size=self.size,
+                dtype=CategoricalDtype(
+                    categories=new_categories, ordered=ordered
+                ),
+                mask=self.base_mask,
                 offset=self.offset,
-                ordered=ordered,
+                children=(self.codes,),
             )
         else:
             out_col = self
             if type(out_col.categories) is not type(new_categories):
                 # If both categories are of different Column types,
                 # return a column full of Nulls.
-                out_col = _create_empty_categorical_column(
-                    self,
-                    CategoricalDtype(
+                new_codes = cast(
+                    cudf.core.column.numerical.NumericalColumn,
+                    column.as_column(
+                        _DEFAULT_CATEGORICAL_VALUE,
+                        length=self.size,
+                        dtype=self.codes.dtype,
+                    ),
+                )
+                new_codes = as_unsigned_codes(len(new_categories), new_codes)
+                out_col = type(self)(
+                    data=self.data,  # type: ignore[arg-type]
+                    size=self.size,
+                    dtype=CategoricalDtype(
                         categories=new_categories, ordered=ordered
                     ),
+                    mask=self.base_mask,
+                    offset=self.offset,
+                    children=(new_codes,),
                 )
             elif (
                 not out_col._categories_equal(new_categories, ordered=True)
@@ -1335,19 +1373,19 @@ def _set_categories(
         df.reset_index(drop=True, inplace=True)
 
         ordered = ordered if ordered is not None else self.ordered
-        new_codes = df._data["new_codes"]
+        new_codes = cast(
+            cudf.core.column.numerical.NumericalColumn, df._data["new_codes"]
+        )
 
         # codes can't have masks, so take mask out before moving in
-        return cast(
-            Self,
-            column.build_categorical_column(
-                categories=new_cats,
-                codes=new_codes,
-                mask=new_codes.base_mask,
-                size=new_codes.size,
-                offset=new_codes.offset,
-                ordered=ordered,
-            ),
+        new_codes = as_unsigned_codes(len(new_cats), new_codes)
+        return type(self)(
+            data=self.data,  # type: ignore[arg-type]
+            size=new_codes.size,
+            dtype=CategoricalDtype(categories=new_cats, ordered=ordered),
+            mask=new_codes.base_mask,
+            offset=new_codes.offset,
+            children=(new_codes,),
         )
 
     def add_categories(self, new_categories: Any) -> Self:
@@ -1425,56 +1463,16 @@ def remove_unused_categories(self) -> Self:
             "remove_unused_categories is currently not supported."
         )
 
-    def as_ordered(self, ordered: bool):
+    def as_ordered(self, ordered: bool) -> Self:
         if self.dtype.ordered == ordered:
             return self
-        return column.build_categorical_column(
-            categories=self.categories,
-            codes=self.codes,
-            mask=self.base_mask,
+        return type(self)(
+            data=self.data,  # type: ignore[arg-type]
             size=self.size,
+            dtype=CategoricalDtype(
+                categories=self.categories, ordered=ordered
+            ),
+            mask=self.base_mask,
             offset=self.offset,
-            ordered=ordered,
+            children=self.children,
         )
-
-
-def _create_empty_categorical_column(
-    categorical_column: CategoricalColumn, dtype: "CategoricalDtype"
-) -> CategoricalColumn:
-    return column.build_categorical_column(
-        categories=column.as_column(dtype.categories),
-        codes=column.as_column(
-            _DEFAULT_CATEGORICAL_VALUE,
-            length=categorical_column.size,
-            dtype=categorical_column.codes.dtype,
-        ),
-        offset=categorical_column.offset,
-        size=categorical_column.size,
-        mask=categorical_column.base_mask,
-        ordered=dtype.ordered,
-    )
-
-
-def pandas_categorical_as_column(
-    categorical: ColumnLike, codes: ColumnLike | None = None
-) -> CategoricalColumn:
-    """Creates a CategoricalColumn from a pandas.Categorical
-
-    If ``codes`` is defined, use it instead of ``categorical.codes``
-    """
-    codes = categorical.codes if codes is None else codes
-    codes = column.as_column(codes)
-
-    valid_codes = codes != codes.dtype.type(_DEFAULT_CATEGORICAL_VALUE)
-
-    mask = None
-    if not valid_codes.all():
-        mask = bools_to_mask(valid_codes)
-
-    return column.build_categorical_column(
-        categories=categorical.categories,
-        codes=codes,
-        size=codes.size,
-        mask=mask,
-        ordered=categorical.ordered,
-    )
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 60b4126ddd4..885476a897c 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -352,13 +352,17 @@ def from_arrow(cls, array: pa.Array) -> ColumnBase:
 
             codes = libcudf.interop.from_arrow(indices_table)[0]
             categories = libcudf.interop.from_arrow(dictionaries_table)[0]
-
-            return build_categorical_column(
-                categories=categories,
-                codes=codes,
-                mask=codes.base_mask,
+            codes = cudf.core.column.categorical.as_unsigned_codes(
+                len(categories), codes
+            )
+            return cudf.core.column.CategoricalColumn(
+                data=None,
                 size=codes.size,
-                ordered=array.type.ordered,
+                dtype=CategoricalDtype(
+                    categories=categories, ordered=array.type.ordered
+                ),
+                mask=codes.base_mask,
+                children=(codes,),
             )
 
         result = libcudf.interop.from_arrow(data)[0]
@@ -950,10 +954,10 @@ def is_monotonic_decreasing(self) -> bool:
         )
 
     def sort_values(
-        self: ColumnBase,
+        self: Self,
         ascending: bool = True,
         na_position: str = "last",
-    ) -> ColumnBase:
+    ) -> Self:
         if (not ascending and self.is_monotonic_decreasing) or (
             ascending and self.is_monotonic_increasing
         ):
@@ -1041,12 +1045,16 @@ def as_categorical_column(self, dtype) -> ColumnBase:
             and dtype._categories is not None
         ):
             cat_col = dtype._categories
-            labels = self._label_encoding(cats=cat_col)
-            return build_categorical_column(
-                categories=cat_col,
-                codes=labels,
+            codes = self._label_encoding(cats=cat_col)
+            codes = cudf.core.column.categorical.as_unsigned_codes(
+                len(cat_col), codes
+            )
+            return cudf.core.column.categorical.CategoricalColumn(
+                data=None,
+                size=None,
+                dtype=dtype,
                 mask=self.mask,
-                ordered=dtype.ordered,
+                children=(codes,),
             )
 
         # Categories must be unique and sorted in ascending order.
@@ -1058,15 +1066,16 @@ def as_categorical_column(self, dtype) -> ColumnBase:
         # columns include null index in factorization; remove:
         if self.has_nulls():
             cats = cats.dropna()
-            min_type = min_unsigned_type(len(cats), 8)
-            if cudf.dtype(min_type).itemsize < labels.dtype.itemsize:
-                labels = labels.astype(min_type)
 
-        return build_categorical_column(
-            categories=cats,
-            codes=labels,
+        labels = cudf.core.column.categorical.as_unsigned_codes(
+            len(cats), labels
+        )
+        return cudf.core.column.categorical.CategoricalColumn(
+            data=None,
+            size=None,
+            dtype=CategoricalDtype(categories=cats, ordered=ordered),
             mask=self.mask,
-            ordered=ordered,
+            children=(labels,),
         )
 
     def as_numerical_column(
@@ -1186,7 +1195,7 @@ def searchsorted(
             na_position=na_position,
         )
 
-    def unique(self) -> ColumnBase:
+    def unique(self) -> Self:
         """
         Get unique values in the data
         """
@@ -1695,51 +1704,6 @@ def build_column(
         raise TypeError(f"Unrecognized dtype: {dtype}")
 
 
-def build_categorical_column(
-    categories: ColumnBase,
-    codes: ColumnBase,
-    mask: Buffer | None = None,
-    size: int | None = None,
-    offset: int = 0,
-    null_count: int | None = None,
-    ordered: bool = False,
-) -> "cudf.core.column.CategoricalColumn":
-    """
-    Build a CategoricalColumn
-
-    Parameters
-    ----------
-    categories : Column
-        Column of categories
-    codes : Column
-        Column of codes, the size of the resulting Column will be
-        the size of `codes`
-    mask : Buffer
-        Null mask
-    size : int, optional
-    offset : int, optional
-    ordered : bool, default False
-        Indicates whether the categories are ordered
-    """
-    codes_dtype = min_unsigned_type(len(categories))
-    codes = as_column(codes)
-    if codes.dtype != codes_dtype:
-        codes = codes.astype(codes_dtype)
-
-    dtype = CategoricalDtype(categories=categories, ordered=ordered)
-
-    result = build_column(
-        data=None,
-        dtype=dtype,
-        mask=mask,
-        size=size,
-        offset=offset,
-        null_count=null_count,
-        children=(codes,),
-    )
-    return cast("cudf.core.column.CategoricalColumn", result)
-
-
 def check_invalid_array(shape: tuple, dtype):
     """Invalid ndarrays properties that are not supported"""
     if len(shape) > 1:
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index 7f391c8a79c..78d2814ed26 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -651,22 +651,20 @@ def can_cast_safely(self, to_dtype: DtypeObj) -> bool:
 
         return False
 
-    def _with_type_metadata(self: ColumnBase, dtype: Dtype) -> ColumnBase:
+    def _with_type_metadata(self: Self, dtype: Dtype) -> ColumnBase:
         if isinstance(dtype, CategoricalDtype):
-            return column.build_categorical_column(
-                categories=dtype.categories._values,
-                codes=cudf.core.column.NumericalColumn(
-                    self.base_data,  # type: ignore[arg-type]
-                    self.size,
-                    dtype=self.dtype,
-                ),
-                mask=self.base_mask,
-                ordered=dtype.ordered,
+            codes = cudf.core.column.categorical.as_unsigned_codes(
+                len(dtype.categories), self
+            )
+            return cudf.core.column.CategoricalColumn(
+                data=None,
                 size=self.size,
+                dtype=dtype,
+                mask=self.base_mask,
                 offset=self.offset,
                 null_count=self.null_count,
+                children=(codes,),
             )
-
         return self
 
     def to_pandas(
diff --git a/python/cudf/cudf/core/cut.py b/python/cudf/cudf/core/cut.py
index a4ceea266b4..c9b1fa2669c 100644
--- a/python/cudf/cudf/core/cut.py
+++ b/python/cudf/cudf/core/cut.py
@@ -8,7 +8,8 @@
 
 import cudf
 from cudf.api.types import is_list_like
-from cudf.core.column import as_column, build_categorical_column
+from cudf.core.column import as_column
+from cudf.core.column.categorical import CategoricalColumn, as_unsigned_codes
 from cudf.core.index import IntervalIndex, interval_range
 
 
@@ -282,13 +283,17 @@ def cut(
             # should allow duplicate categories.
             return interval_labels[index_labels]
 
-    col = build_categorical_column(
-        categories=interval_labels,
-        codes=index_labels,
+    index_labels = as_unsigned_codes(len(interval_labels), index_labels)
+
+    col = CategoricalColumn(
+        data=None,
+        size=index_labels.size,
+        dtype=cudf.CategoricalDtype(
+            categories=interval_labels, ordered=ordered
+        ),
         mask=index_labels.base_mask,
         offset=index_labels.offset,
-        size=index_labels.size,
-        ordered=ordered,
+        children=(index_labels,),
     )
 
     # we return a categorical index, as we don't have a Categorical method
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 6065e0e1eeb..0d632f4775f 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -48,10 +48,10 @@
     ColumnBase,
     StructColumn,
     as_column,
-    build_categorical_column,
     column_empty,
     concat_columns,
 )
+from cudf.core.column.categorical import as_unsigned_codes
 from cudf.core.column_accessor import ColumnAccessor
 from cudf.core.copy_types import BooleanMask
 from cudf.core.groupby.groupby import DataFrameGroupBy, groupby_doc_template
@@ -3067,7 +3067,6 @@ def where(self, cond, other=None, inplace=False, axis=None, level=None):
 
         from cudf.core._internals.where import (
             _check_and_cast_columns_with_other,
-            _make_categorical_like,
         )
 
         # First process the condition.
@@ -3119,7 +3118,7 @@ def where(self, cond, other=None, inplace=False, axis=None, level=None):
 
         out = []
         for (name, col), other_col in zip(self._data.items(), other_cols):
-            col, other_col = _check_and_cast_columns_with_other(
+            source_col, other_col = _check_and_cast_columns_with_other(
                 source_col=col,
                 other=other_col,
                 inplace=inplace,
@@ -3127,16 +3126,16 @@ def where(self, cond, other=None, inplace=False, axis=None, level=None):
 
             if cond_col := cond._data.get(name):
                 result = cudf._lib.copying.copy_if_else(
-                    col, other_col, cond_col
+                    source_col, other_col, cond_col
                 )
 
-                out.append(_make_categorical_like(result, self._data[name]))
+                out.append(result._with_type_metadata(col.dtype))
             else:
                 out_mask = cudf._lib.null_mask.create_null_mask(
-                    len(col),
+                    len(source_col),
                     state=cudf._lib.null_mask.MaskState.ALL_NULL,
                 )
-                out.append(col.set_mask(out_mask))
+                out.append(source_col.set_mask(out_mask))
 
         return self._mimic_inplace(
             self._from_data_like_self(self._data._from_columns_like_self(out)),
@@ -3296,9 +3295,7 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True):
         # least require a deprecation cycle because we currently support
         # inserting a pd.Categorical.
         if isinstance(value, pd.Categorical):
-            value = cudf.core.column.categorical.pandas_categorical_as_column(
-                value
-            )
+            value = as_column(value)
 
         if _is_scalar_or_zero_d_array(value):
             dtype = None
@@ -8510,12 +8507,16 @@ def _cast_cols_to_common_dtypes(col_idxs, list_of_columns, dtypes, categories):
 def _reassign_categories(categories, cols, col_idxs):
     for name, idx in zip(cols, col_idxs):
         if idx in categories:
-            cols[name] = build_categorical_column(
-                categories=categories[idx],
-                codes=cols[name],
-                mask=cols[name].base_mask,
-                offset=cols[name].offset,
-                size=cols[name].size,
+            codes = as_unsigned_codes(len(categories[idx]), cols[name])
+            cols[name] = CategoricalColumn(
+                data=None,
+                size=codes.size,
+                dtype=cudf.CategoricalDtype(
+                    categories=categories[idx], ordered=False
+                ),
+                mask=codes.base_mask,
+                offset=codes.offset,
+                children=(codes,),
             )
 
 
diff --git a/python/cudf/cudf/core/df_protocol.py b/python/cudf/cudf/core/df_protocol.py
index a70a42c04af..5250a741d3d 100644
--- a/python/cudf/cudf/core/df_protocol.py
+++ b/python/cudf/cudf/core/df_protocol.py
@@ -13,7 +13,12 @@
 
 import cudf
 from cudf.core.buffer import Buffer, as_buffer
-from cudf.core.column import as_column, build_categorical_column, build_column
+from cudf.core.column import (
+    CategoricalColumn,
+    NumericalColumn,
+    as_column,
+    build_column,
+)
 
 # Implementation of interchange protocol classes
 # ----------------------------------------------
@@ -830,18 +835,19 @@ def _protocol_to_cudf_column_categorical(
     assert buffers["data"] is not None, "data buffer should not be None"
     codes_buffer, codes_dtype = buffers["data"]
     codes_buffer = _ensure_gpu_buffer(codes_buffer, codes_dtype, allow_copy)
-    cdtype = protocol_dtype_to_cupy_dtype(codes_dtype)
-    codes = build_column(
-        codes_buffer._buf,
-        cdtype,
+    cdtype = np.dtype(protocol_dtype_to_cupy_dtype(codes_dtype))
+    codes = NumericalColumn(
+        data=codes_buffer._buf,
+        size=None,
+        dtype=cdtype,
     )
-
-    cudfcol = build_categorical_column(
-        categories=categories,
-        codes=codes,
-        mask=codes.base_mask,
+    cudfcol = CategoricalColumn(
+        data=None,
         size=codes.size,
-        ordered=ordered,
+        dtype=cudf.CategoricalDtype(categories=categories, ordered=ordered),
+        mask=codes.base_mask,
+        offset=codes.offset,
+        children=(codes,),
     )
 
     return _set_missing_values(col, cudfcol, allow_copy), buffers
diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py
index cbe1e97d834..7b2bc85b13b 100644
--- a/python/cudf/cudf/core/frame.py
+++ b/python/cudf/cudf/core/frame.py
@@ -24,10 +24,10 @@
 from cudf.core.column import (
     ColumnBase,
     as_column,
-    build_categorical_column,
     deserialize_columns,
     serialize_columns,
 )
+from cudf.core.column.categorical import CategoricalColumn, as_unsigned_codes
 from cudf.core.column_accessor import ColumnAccessor
 from cudf.core.mixins import BinaryOperand, Scannable
 from cudf.utils import ioutils
@@ -889,18 +889,21 @@ def from_arrow(cls, data: pa.Table) -> Self:
                 for name in dict_dictionaries.keys()
             }
 
-            cudf_category_frame = {
-                name: build_categorical_column(
-                    cudf_dictionaries_columns[name],
-                    codes,
-                    mask=codes.base_mask,
+            for name, codes in zip(
+                dict_indices_table.column_names, indices_columns
+            ):
+                categories = cudf_dictionaries_columns[name]
+                codes = as_unsigned_codes(len(categories), codes)
+                cudf_category_frame[name] = CategoricalColumn(
+                    data=None,
                     size=codes.size,
-                    ordered=dict_ordered[name],
-                )
-                for name, codes in zip(
-                    dict_indices_table.column_names, indices_columns
+                    dtype=cudf.CategoricalDtype(
+                        categories=categories,
+                        ordered=dict_ordered[name],
+                    ),
+                    mask=codes.base_mask,
+                    children=(codes,),
                 )
-            }
 
         # Handle non-dict arrays
         cudf_non_category_frame = {
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 27c6556f976..500fc580097 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -3079,22 +3079,8 @@ def __init__(
         name = _getdefault_name(data, name=name)
         if isinstance(data, CategoricalColumn):
             data = data
-        elif isinstance(data, pd.Series) and (
-            isinstance(data.dtype, pd.CategoricalDtype)
-        ):
-            codes_data = column.as_column(data.cat.codes.values)
-            data = column.build_categorical_column(
-                categories=data.cat.categories,
-                codes=codes_data,
-                ordered=data.cat.ordered,
-            )
-        elif isinstance(data, (pd.Categorical, pd.CategoricalIndex)):
-            codes_data = column.as_column(data.codes)
-            data = column.build_categorical_column(
-                categories=data.categories,
-                codes=codes_data,
-                ordered=data.ordered,
-            )
+        elif isinstance(getattr(data, "dtype", None), pd.CategoricalDtype):
+            data = column.as_column(data)
         else:
             data = column.as_column(
                 data, dtype="category" if dtype is None else dtype
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index ad6aa56d472..fd6bf37f0e6 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -173,17 +173,7 @@ def _drop_columns(f: Frame, columns: abc.Iterable, errors: str):
 def _indices_from_labels(obj, labels):
     if not isinstance(labels, cudf.MultiIndex):
         labels = cudf.core.column.as_column(labels)
-
-        if isinstance(obj.index.dtype, cudf.CategoricalDtype):
-            labels = labels.astype("category")
-            codes = labels.codes.astype(obj.index.codes.dtype)
-            labels = cudf.core.column.build_categorical_column(
-                categories=labels.dtype.categories,
-                codes=codes,
-                ordered=labels.dtype.ordered,
-            )
-        else:
-            labels = labels.astype(obj.index.dtype)
+        labels = labels.astype(obj.index.dtype)
         idx_labels = cudf.Index._from_column(labels)
     else:
         idx_labels = labels
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 4be10752651..a831a798772 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -38,7 +38,9 @@
     as_column,
 )
 from cudf.core.column.categorical import (
+    _DEFAULT_CATEGORICAL_VALUE,
     CategoricalAccessor as CategoricalAccessor,
+    CategoricalColumn,
 )
 from cudf.core.column.column import concat_columns
 from cudf.core.column.lists import ListMethods
@@ -511,9 +513,22 @@ def from_categorical(cls, categorical, codes=None):
         dtype: category
         Categories (3, object): ['a', 'b', 'c']
         """  # noqa: E501
-        col = cudf.core.column.categorical.pandas_categorical_as_column(
-            categorical, codes=codes
-        )
+        col = as_column(categorical)
+        if codes is not None:
+            codes = as_column(codes)
+
+            valid_codes = codes != codes.dtype.type(_DEFAULT_CATEGORICAL_VALUE)
+
+            mask = None
+            if not valid_codes.all():
+                mask = libcudf.transform.bools_to_mask(valid_codes)
+            col = CategoricalColumn(
+                data=col.data,
+                size=codes.size,
+                dtype=col.dtype,
+                mask=mask,
+                children=(codes,),
+            )
         return Series._from_column(col)
 
     @classmethod
diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py
index eb6714029cf..55dda34a576 100644
--- a/python/cudf/cudf/core/single_column_frame.py
+++ b/python/cudf/cudf/core/single_column_frame.py
@@ -350,7 +350,6 @@ def _get_elements_from_column(self, arg) -> ScalarLike | ColumnBase:
     def where(self, cond, other=None, inplace=False):
         from cudf.core._internals.where import (
             _check_and_cast_columns_with_other,
-            _make_categorical_like,
         )
 
         if isinstance(other, cudf.DataFrame):
@@ -366,14 +365,12 @@ def where(self, cond, other=None, inplace=False):
         if not cudf.api.types.is_scalar(other):
             other = cudf.core.column.as_column(other)
 
-        self_column = self._column
         input_col, other = _check_and_cast_columns_with_other(
-            source_col=self_column, other=other, inplace=inplace
+            source_col=self._column, other=other, inplace=inplace
         )
 
         result = cudf._lib.copying.copy_if_else(input_col, other, cond)
-
-        return _make_categorical_like(result, self_column)
+        return result._with_type_metadata(self.dtype)
 
     @_performance_tracking
     def transpose(self):
diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
index d6b2ae2f31c..984115dcbbe 100644
--- a/python/cudf/cudf/io/parquet.py
+++ b/python/cudf/cudf/io/parquet.py
@@ -20,7 +20,8 @@
 import cudf
 from cudf._lib import parquet as libparquet
 from cudf.api.types import is_list_like
-from cudf.core.column import as_column, build_categorical_column, column_empty
+from cudf.core.column import as_column, column_empty
+from cudf.core.column.categorical import CategoricalColumn, as_unsigned_codes
 from cudf.utils import ioutils
 from cudf.utils.performance_tracking import _performance_tracking
 
@@ -811,12 +812,17 @@ def _parquet_to_frame(
                     partition_categories[name].index(value),
                     length=_len,
                 )
-                dfs[-1][name] = build_categorical_column(
-                    categories=partition_categories[name],
-                    codes=codes,
+                codes = as_unsigned_codes(
+                    len(partition_categories[name]), codes
+                )
+                dfs[-1][name] = CategoricalColumn(
+                    data=None,
                     size=codes.size,
+                    dtype=cudf.CategoricalDtype(
+                        categories=partition_categories[name], ordered=False
+                    ),
                     offset=codes.offset,
-                    ordered=False,
+                    children=(codes,),
                 )
             else:
                 # Not building categorical columns, so
diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py
index 5bd3eb5fa7f..9347ebba5de 100644
--- a/python/dask_cudf/dask_cudf/backends.py
+++ b/python/dask_cudf/dask_cudf/backends.py
@@ -64,8 +64,11 @@ def _nonempty_index(idx):
         values = cudf.core.column.as_column(data)
         return cudf.DatetimeIndex(values, name=idx.name)
     elif isinstance(idx, cudf.CategoricalIndex):
-        values = cudf.core.column.build_categorical_column(
-            categories=idx.categories, codes=[0, 0], ordered=idx.ordered
+        values = cudf.core.column.CategoricalColumn(
+            data=None,
+            size=None,
+            dtype=idx.dtype,
+            children=(cudf.core.column.as_column([0, 0], dtype=np.uint8),),
         )
         return cudf.CategoricalIndex(values, name=idx.name)
     elif isinstance(idx, cudf.MultiIndex):
@@ -105,12 +108,16 @@ def _get_non_empty_data(
         )
         codes = cudf.core.column.as_column(
             0,
-            dtype=cudf._lib.types.size_type_dtype,
+            dtype=np.uint8,
             length=2,
         )
-        ordered = s.ordered  # type: ignore[attr-defined]
-        return cudf.core.column.build_categorical_column(
-            categories=categories, codes=codes, ordered=ordered
+        return cudf.core.column.CategoricalColumn(
+            data=None,
+            size=codes.size,
+            dtype=cudf.CategoricalDtype(
+                categories=categories, ordered=s.dtype.ordered
+            ),
+            children=(codes,),  # type: ignore[arg-type]
         )
     elif isinstance(s.dtype, cudf.ListDtype):
         leaf_type = s.dtype.leaf_type
diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py
index c025280c240..e793d4381d1 100644
--- a/python/dask_cudf/dask_cudf/io/parquet.py
+++ b/python/dask_cudf/dask_cudf/io/parquet.py
@@ -19,7 +19,7 @@
     create_metadata_file_dd = None
 
 import cudf
-from cudf.core.column import as_column, build_categorical_column
+from cudf.core.column import CategoricalColumn, as_column
 from cudf.io import write_to_dataset
 from cudf.io.parquet import _apply_post_filters, _normalize_filters
 from cudf.utils.dtypes import cudf_dtype_from_pa_type
@@ -163,12 +163,14 @@ def _read_paths(
                         partitions[i].keys.get_loc(index2),
                         length=len(df),
                     )
-                    df[name] = build_categorical_column(
-                        categories=partitions[i].keys,
-                        codes=codes,
+                    df[name] = CategoricalColumn(
+                        data=None,
                         size=codes.size,
+                        dtype=cudf.CategoricalDtype(
+                            categories=partitions[i].keys, ordered=False
+                        ),
                         offset=codes.offset,
-                        ordered=False,
+                        children=(codes,),
                     )
                 elif name not in df.columns:
                     # Add non-categorical partition column

From 925530afe8178b7e788ea1a8d4df4c0eb4d042dc Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Wed, 28 Aug 2024 13:22:18 -0700
Subject: [PATCH 744/842] Relax Arrow pin (#16681)

With this change, cudf users can install any version of pyarrow greater than 14. This is the minimum version supporting the C Data Interface, which is a requirement for us (it may be possible to relax in principle, but would require changes to the cudf/pylibcudf code). A few tests are skipped due to bugs or missing features in older versions of pyarrow.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - James Lamb (https://github.com/jameslamb)

URL: https://github.com/rapidsai/cudf/pull/16681
---
 .../all_cuda-118_arch-x86_64.yaml             |  1 +
 .../all_cuda-125_arch-x86_64.yaml             |  1 +
 conda/recipes/cudf/meta.yaml                  |  2 +-
 conda/recipes/pylibcudf/meta.yaml             |  2 +-
 dependencies.yaml                             | 12 ++--------
 python/cudf/cudf/tests/test_parquet.py        | 24 +++++++++++++++----
 python/cudf/pyproject.toml                    |  2 +-
 python/libcudf/pyproject.toml                 |  3 ---
 python/pylibcudf/pylibcudf/interop.pyx        |  1 -
 python/pylibcudf/pyproject.toml               |  2 +-
 10 files changed, 28 insertions(+), 22 deletions(-)

diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index c4c32da8af2..7f6967d7287 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -67,6 +67,7 @@ dependencies:
 - pandoc
 - pre-commit
 - ptxcompiler
+- pyarrow>=14.0.0,<18.0.0a0
 - pydata-sphinx-theme!=0.14.2
 - pytest-benchmark
 - pytest-cases>=3.8.2
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index 7439c9543a5..c1315e73f16 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -64,6 +64,7 @@ dependencies:
 - pandas>=2.0,<2.2.3dev0
 - pandoc
 - pre-commit
+- pyarrow>=14.0.0,<18.0.0a0
 - pydata-sphinx-theme!=0.14.2
 - pynvjitlink>=0.0.0a0
 - pytest-benchmark
diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml
index 53f52a35651..e22b4a4eddc 100644
--- a/conda/recipes/cudf/meta.yaml
+++ b/conda/recipes/cudf/meta.yaml
@@ -82,7 +82,7 @@ requirements:
     - cupy >=12.0.0
     - numba >=0.57
     - numpy >=1.23,<3.0a0
-    - pyarrow ==16.1.0.*
+    - pyarrow>=14.0.0,<18.0.0a0
     - libcudf ={{ version }}
     - pylibcudf ={{ version }}
     - {{ pin_compatible('rmm', max_pin='x.x') }}
diff --git a/conda/recipes/pylibcudf/meta.yaml b/conda/recipes/pylibcudf/meta.yaml
index 67b9b76bb8c..7c1efa0176c 100644
--- a/conda/recipes/pylibcudf/meta.yaml
+++ b/conda/recipes/pylibcudf/meta.yaml
@@ -79,7 +79,7 @@ requirements:
     - typing_extensions >=4.0.0
     - pandas >=2.0,<2.2.3dev0
     - numpy >=1.23,<3.0a0
-    - pyarrow ==16.1.0.*
+    - pyarrow>=14.0.0,<18.0.0a0
     - {{ pin_compatible('rmm', max_pin='x.x') }}
     - fsspec >=0.6.0
     {% if cuda_major == "11" %}
diff --git a/dependencies.yaml b/dependencies.yaml
index 5be291b3671..c6851d9cb90 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -19,6 +19,7 @@ files:
       - docs
       - notebooks
       - py_version
+      - pyarrow_run
       - rapids_build_skbuild
       - rapids_build_setuptools
       - run_common
@@ -46,7 +47,6 @@ files:
     includes:
       - cuda_version
       - py_version
-      - pyarrow_run
       - test_python_common
       - test_python_cudf
       - test_python_dask_cudf
@@ -136,13 +136,6 @@ files:
       - build_base
       - build_cpp
       - depends_on_librmm
-  py_run_libcudf:
-    output: pyproject
-    pyproject_dir: python/libcudf
-    extras:
-      table: project
-    includes:
-      - pyarrow_run
   py_build_pylibcudf:
     output: pyproject
     pyproject_dir: python/pylibcudf
@@ -390,8 +383,7 @@ dependencies:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
-          # Allow runtime version to float up to patch version
-          - pyarrow>=16.1.0,<16.2.0a0
+          - pyarrow>=14.0.0,<18.0.0a0
   cuda_version:
     specific:
       - output_types: conda
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index db4f1c9c8bd..879b2bd3d74 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -515,10 +515,6 @@ def test_parquet_read_filtered_multiple_files(tmpdir):
     )
 
 
-@pytest.mark.skipif(
-    version.parse(pa.__version__) < version.parse("1.0.1"),
-    reason="pyarrow 1.0.0 needed for various operators and operand types",
-)
 @pytest.mark.parametrize(
     "predicate,expected_len",
     [
@@ -2393,6 +2389,10 @@ def test_parquet_writer_list_large_mixed(tmpdir):
 
 @pytest.mark.parametrize("store_schema", [True, False])
 def test_parquet_writer_list_chunked(tmpdir, store_schema):
+    if store_schema and version.parse(pa.__version__) < version.parse(
+        "15.0.0"
+    ):
+        pytest.skip("https://github.com/apache/arrow/pull/37792")
     table1 = cudf.DataFrame(
         {
             "a": list_gen(string_gen, 128, 80, 50),
@@ -2578,6 +2578,10 @@ def normalized_equals(value1, value2):
 @pytest.mark.parametrize("add_nulls", [True, False])
 @pytest.mark.parametrize("store_schema", [True, False])
 def test_parquet_writer_statistics(tmpdir, pdf, add_nulls, store_schema):
+    if store_schema and version.parse(pa.__version__) < version.parse(
+        "15.0.0"
+    ):
+        pytest.skip("https://github.com/apache/arrow/pull/37792")
     file_path = tmpdir.join("cudf.parquet")
     if "col_category" in pdf.columns:
         pdf = pdf.drop(columns=["col_category", "col_bool"])
@@ -2957,6 +2961,10 @@ def test_per_column_options_string_col(tmpdir, encoding):
     assert encoding in fmd.row_group(0).column(0).encodings
 
 
+@pytest.mark.skipif(
+    version.parse(pa.__version__) < version.parse("16.0.0"),
+    reason="https://github.com/apache/arrow/pull/39748",
+)
 @pytest.mark.parametrize(
     "num_rows",
     [200, 10000],
@@ -3557,6 +3565,10 @@ def test_parquet_reader_roundtrip_structs_with_arrow_schema(tmpdir, data):
 
 
 @pytest.mark.parametrize("index", [None, True, False])
+@pytest.mark.skipif(
+    version.parse(pa.__version__) < version.parse("15.0.0"),
+    reason="https://github.com/apache/arrow/pull/37792",
+)
 def test_parquet_writer_roundtrip_with_arrow_schema(index):
     # Ensure that the concrete and nested types are faithfully being roundtripped
     # across Parquet with arrow schema
@@ -3707,6 +3719,10 @@ def test_parquet_writer_int96_timestamps_and_arrow_schema():
     ],
 )
 @pytest.mark.parametrize("index", [None, True, False])
+@pytest.mark.skipif(
+    version.parse(pa.__version__) < version.parse("15.0.0"),
+    reason="https://github.com/apache/arrow/pull/37792",
+)
 def test_parquet_writer_roundtrip_structs_with_arrow_schema(
     tmpdir, data, index
 ):
diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml
index 0c1d5015078..17d1292980b 100644
--- a/python/cudf/pyproject.toml
+++ b/python/cudf/pyproject.toml
@@ -30,7 +30,7 @@ dependencies = [
     "packaging",
     "pandas>=2.0,<2.2.3dev0",
     "ptxcompiler",
-    "pyarrow>=16.1.0,<16.2.0a0",
+    "pyarrow>=14.0.0,<18.0.0a0",
     "pylibcudf==24.10.*,>=0.0.0a0",
     "rich",
     "rmm==24.10.*,>=0.0.0a0",
diff --git a/python/libcudf/pyproject.toml b/python/libcudf/pyproject.toml
index 43878d0aec2..5f4b9957fd0 100644
--- a/python/libcudf/pyproject.toml
+++ b/python/libcudf/pyproject.toml
@@ -37,9 +37,6 @@ classifiers = [
     "Programming Language :: C++",
     "Environment :: GPU :: NVIDIA CUDA",
 ]
-dependencies = [
-    "pyarrow>=16.1.0,<16.2.0a0",
-] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 
 [project.urls]
 Homepage = "https://github.com/rapidsai/cudf"
diff --git a/python/pylibcudf/pylibcudf/interop.pyx b/python/pylibcudf/pylibcudf/interop.pyx
index d54e5b7ba1f..1a03fa5b45b 100644
--- a/python/pylibcudf/pylibcudf/interop.pyx
+++ b/python/pylibcudf/pylibcudf/interop.pyx
@@ -152,7 +152,6 @@ def _from_arrow_scalar(pyarrow_object, *, DataType data_type=None):
 
 
 @from_arrow.register(pa.Array)
-@from_arrow.register(pa.ChunkedArray)
 def _from_arrow_column(pyarrow_object, *, DataType data_type=None):
     if data_type is not None:
         raise ValueError("data_type may not be passed for arrays")
diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml
index e4c6edc6141..bfade41353c 100644
--- a/python/pylibcudf/pyproject.toml
+++ b/python/pylibcudf/pyproject.toml
@@ -22,7 +22,7 @@ dependencies = [
     "libcudf==24.10.*,>=0.0.0a0",
     "nvtx>=0.2.1",
     "packaging",
-    "pyarrow>=16.1.0,<16.2.0a0",
+    "pyarrow>=14.0.0,<18.0.0a0",
     "rmm==24.10.*,>=0.0.0a0",
     "typing_extensions>=4.0.0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.

From fbd61142a47bd9ef6f739f97c81c88c1ca9430d4 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Wed, 28 Aug 2024 15:06:10 -0700
Subject: [PATCH 745/842] Support reading matching projected and filter cols
 from Parquet files with otherwise mismatched schemas (#16394)

Closes #16269.

This PR adds support to read (matching) projected/selected and filter columns from Parquet files with otherwise mismatching schemas.

### Solution Description
We create a `std::vector<unordered_maps<int32_t, int32_t>>`, one per file except 0th file. We then co-walk schema trees and populate the map with corresponding (one-to-one mapped) `schema_idx` of valid selected (projection and filter) column between 0th and the rest of the files. The same `unordered_map` is used to get the `schema_idx` of the same columns across files when creating `ColumnChunkDesc` and copying column chunk metadata into the page decoder.

### Known Limitation
- [x] Nullability across files: Each selected column must still be either nullable or non-nullable across all files. See #12702 also described in [#dask/9935](https://github.com/dask/dask/pull/9935)

CC @wence-

Authors:
  - Muhammad Haseeb (https://github.com/mhaseeb123)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Lawrence Mitchell (https://github.com/wence-)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: https://github.com/rapidsai/cudf/pull/16394
---
 cpp/include/cudf/io/parquet.hpp               |  37 +++
 cpp/src/io/parquet/reader_impl.cpp            |  13 +-
 cpp/src/io/parquet/reader_impl_helpers.cpp    | 174 ++++++++++--
 cpp/src/io/parquet/reader_impl_helpers.hpp    |  53 +++-
 python/cudf/cudf/_lib/parquet.pyx             |  10 +-
 python/cudf/cudf/io/parquet.py                |   5 +
 python/cudf/cudf/tests/test_parquet.py        | 248 ++++++++++++++++++
 python/cudf/cudf/utils/ioutils.py             |   3 +
 python/pylibcudf/pylibcudf/io/parquet.pxd     |   1 +
 python/pylibcudf/pylibcudf/io/parquet.pyx     |  14 +-
 .../pylibcudf/libcudf/io/parquet.pxd          |   6 +-
 11 files changed, 534 insertions(+), 30 deletions(-)

diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp
index 12897ac77ef..64c37f9a9df 100644
--- a/cpp/include/cudf/io/parquet.hpp
+++ b/cpp/include/cudf/io/parquet.hpp
@@ -76,6 +76,8 @@ class parquet_reader_options {
   bool _use_pandas_metadata = true;
   // Whether to read and use ARROW schema
   bool _use_arrow_schema = true;
+  // Whether to allow reading matching select columns from mismatched Parquet files.
+  bool _allow_mismatched_pq_schemas = false;
   // Cast timestamp columns to a specific type
   data_type _timestamp_type{type_id::EMPTY};
 
@@ -138,6 +140,18 @@ class parquet_reader_options {
    */
   [[nodiscard]] bool is_enabled_use_arrow_schema() const { return _use_arrow_schema; }
 
+  /**
+   * @brief Returns true/false depending on whether to read matching projected and filter columns
+   * from mismatched Parquet sources.
+   *
+   * @return `true` if mismatched projected and filter columns will be read from mismatched Parquet
+   * sources.
+   */
+  [[nodiscard]] bool is_enabled_allow_mismatched_pq_schemas() const
+  {
+    return _allow_mismatched_pq_schemas;
+  }
+
   /**
    * @brief Returns optional tree of metadata.
    *
@@ -258,6 +272,15 @@ class parquet_reader_options {
    */
   void enable_use_arrow_schema(bool val) { _use_arrow_schema = val; }
 
+  /**
+   * @brief Sets to enable/disable reading of matching projected and filter columns from mismatched
+   * Parquet sources.
+   *
+   * @param val Boolean value whether to read matching projected and filter columns from mismatched
+   * Parquet sources.
+   */
+  void enable_allow_mismatched_pq_schemas(bool val) { _allow_mismatched_pq_schemas = val; }
+
   /**
    * @brief Sets reader column schema.
    *
@@ -382,6 +405,20 @@ class parquet_reader_options_builder {
     return *this;
   }
 
+  /**
+   * @brief Sets to enable/disable reading of matching projected and filter columns from mismatched
+   * Parquet sources.
+   *
+   * @param val Boolean value whether to read matching projected and filter columns from mismatched
+   * Parquet sources.
+   * @return this for chaining.
+   */
+  parquet_reader_options_builder& allow_mismatched_pq_schemas(bool val)
+  {
+    options._allow_mismatched_pq_schemas = val;
+    return *this;
+  }
+
   /**
    * @brief Sets reader metadata.
    *
diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp
index 2648a1f41ab..9950e2f7d7d 100644
--- a/cpp/src/io/parquet/reader_impl.cpp
+++ b/cpp/src/io/parquet/reader_impl.cpp
@@ -470,8 +470,10 @@ reader::impl::impl(std::size_t chunk_read_limit,
     _input_pass_read_limit{pass_read_limit}
 {
   // Open and parse the source dataset metadata
-  _metadata =
-    std::make_unique<aggregate_reader_metadata>(_sources, options.is_enabled_use_arrow_schema());
+  _metadata = std::make_unique<aggregate_reader_metadata>(
+    _sources,
+    options.is_enabled_use_arrow_schema(),
+    options.get_columns().has_value() and options.is_enabled_allow_mismatched_pq_schemas());
 
   // Strings may be returned as either string or categorical columns
   _strings_to_categorical = options.is_enabled_convert_strings_to_categories();
@@ -769,11 +771,14 @@ parquet_column_schema walk_schema(aggregate_reader_metadata const* mt, int idx)
 
 parquet_metadata read_parquet_metadata(host_span<std::unique_ptr<datasource> const> sources)
 {
-  // do not use arrow schema when reading information from parquet metadata.
+  // Do not use arrow schema when reading information from parquet metadata.
   static constexpr auto use_arrow_schema = false;
 
+  // Do not select any columns when only reading the parquet metadata.
+  static constexpr auto has_column_projection = false;
+
   // Open and parse the source dataset metadata
-  auto metadata = aggregate_reader_metadata(sources, use_arrow_schema);
+  auto metadata = aggregate_reader_metadata(sources, use_arrow_schema, has_column_projection);
 
   return parquet_metadata{parquet_schema{walk_schema(&metadata, 0)},
                           metadata.get_num_rows(),
diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp
index 00f75e4e828..8b5678f202b 100644
--- a/cpp/src/io/parquet/reader_impl_helpers.cpp
+++ b/cpp/src/io/parquet/reader_impl_helpers.cpp
@@ -380,6 +380,17 @@ aggregate_reader_metadata::collect_keyval_metadata() const
   return kv_maps;
 }
 
+std::vector<std::unordered_map<int32_t, int32_t>> aggregate_reader_metadata::init_schema_idx_maps(
+  bool const has_cols_from_mismatched_srcs) const
+{
+  // Only initialize if more than 1 data sources and has select columns from mismatched data sources
+  if (has_cols_from_mismatched_srcs and per_file_metadata.size() > 1) {
+    return std::vector<std::unordered_map<int32_t, int32_t>>{per_file_metadata.size() - 1};
+  }
+
+  return {};
+}
+
 int64_t aggregate_reader_metadata::calc_num_rows() const
 {
   return std::accumulate(
@@ -539,13 +550,18 @@ void aggregate_reader_metadata::column_info_for_row_group(row_group_info& rg_inf
 }
 
 aggregate_reader_metadata::aggregate_reader_metadata(
-  host_span<std::unique_ptr<datasource> const> sources, bool use_arrow_schema)
+  host_span<std::unique_ptr<datasource> const> sources,
+  bool use_arrow_schema,
+  bool has_cols_from_mismatched_srcs)
   : per_file_metadata(metadatas_from_sources(sources)),
     keyval_maps(collect_keyval_metadata()),
+    schema_idx_maps(init_schema_idx_maps(has_cols_from_mismatched_srcs)),
     num_rows(calc_num_rows()),
     num_row_groups(calc_num_row_groups())
 {
-  if (per_file_metadata.size() > 0) {
+  // Validate that all sources have the same schema unless we are reading select columns
+  // from mismatched sources, in which case, we will only check the projected columns later.
+  if (per_file_metadata.size() > 1 and not has_cols_from_mismatched_srcs) {
     auto const& first_meta = per_file_metadata.front();
     auto const num_cols =
       first_meta.row_groups.size() > 0 ? first_meta.row_groups.front().columns.size() : 0;
@@ -632,7 +648,7 @@ arrow_schema_data_types aggregate_reader_metadata::collect_arrow_schema() const
       if (field->type_type() == flatbuf::Type::Type_Duration) {
         auto type_data = field->type_as_Duration();
         if (type_data != nullptr) {
-          auto name = (field->name()) ? field->name()->str() : "";
+          auto name = field->name() ? field->name()->str() : "";
           // set the schema_elem type to duration type
           schema_elem.type = duration_from_flatbuffer(type_data);
           arrow_type_col_seen |= (schema_elem.type.id() != type_id::EMPTY);
@@ -868,12 +884,23 @@ ColumnChunkMetaData const& aggregate_reader_metadata::get_column_metadata(size_t
                                                                           size_type src_idx,
                                                                           int schema_idx) const
 {
+  // schema_idx_maps will only have > 0 size when we are reading matching column projection from
+  // mismatched Parquet sources.
+  if (src_idx and not schema_idx_maps.empty()) {
+    auto const& schema_idx_map = schema_idx_maps[src_idx - 1];
+    CUDF_EXPECTS(schema_idx_map.find(schema_idx) != schema_idx_map.end(),
+                 "Unmapped schema index encountered in the specified source tree",
+                 std::range_error);
+    schema_idx = schema_idx_map.at(schema_idx);
+  }
+
   auto col =
     std::find_if(per_file_metadata[src_idx].row_groups[row_group_index].columns.begin(),
                  per_file_metadata[src_idx].row_groups[row_group_index].columns.end(),
                  [schema_idx](ColumnChunk const& col) { return col.schema_idx == schema_idx; });
   CUDF_EXPECTS(col != std::end(per_file_metadata[src_idx].row_groups[row_group_index].columns),
-               "Found no metadata for schema index");
+               "Found no metadata for schema index",
+               std::range_error);
   return col->meta_data;
 }
 
@@ -1041,18 +1068,19 @@ aggregate_reader_metadata::select_columns(
   std::optional<std::vector<std::string>> const& filter_columns_names,
   bool include_index,
   bool strings_to_categorical,
-  type_id timestamp_type_id) const
+  type_id timestamp_type_id)
 {
-  auto find_schema_child = [&](SchemaElement const& schema_elem, std::string const& name) {
-    auto const& col_schema_idx =
-      std::find_if(schema_elem.children_idx.cbegin(),
-                   schema_elem.children_idx.cend(),
-                   [&](size_t col_schema_idx) { return get_schema(col_schema_idx).name == name; });
-
-    return (col_schema_idx != schema_elem.children_idx.end())
-             ? static_cast<size_type>(*col_schema_idx)
-             : -1;
-  };
+  auto const find_schema_child =
+    [&](SchemaElement const& schema_elem, std::string const& name, int const pfm_idx = 0) {
+      auto const& col_schema_idx = std::find_if(
+        schema_elem.children_idx.cbegin(),
+        schema_elem.children_idx.cend(),
+        [&](size_t col_schema_idx) { return get_schema(col_schema_idx, pfm_idx).name == name; });
+
+      return (col_schema_idx != schema_elem.children_idx.end())
+               ? static_cast<size_type>(*col_schema_idx)
+               : -1;
+    };
 
   std::vector<cudf::io::detail::inline_column_buffer> output_columns;
   std::vector<input_column_info> input_columns;
@@ -1074,7 +1102,7 @@ aggregate_reader_metadata::select_columns(
       if (schema_elem.is_stub()) {
         // is this legit?
         CUDF_EXPECTS(schema_elem.num_children == 1, "Unexpected number of children for stub");
-        auto child_col_name_info = (col_name_info) ? &col_name_info->children[0] : nullptr;
+        auto const child_col_name_info = col_name_info ? &col_name_info->children[0] : nullptr;
         return build_column(
           child_col_name_info, schema_elem.children_idx[0], out_col_array, has_list_parent);
       }
@@ -1154,6 +1182,97 @@ aggregate_reader_metadata::select_columns(
       return path_is_valid;
     };
 
+  // Compares two schema elements to be equal except their number of children
+  auto const equal_to_except_num_children = [](SchemaElement const& lhs, SchemaElement const& rhs) {
+    return lhs.type == rhs.type and lhs.converted_type == rhs.converted_type and
+           lhs.type_length == rhs.type_length and lhs.repetition_type == rhs.repetition_type and
+           lhs.name == rhs.name and lhs.decimal_scale == rhs.decimal_scale and
+           lhs.decimal_precision == rhs.decimal_precision and lhs.field_id == rhs.field_id;
+  };
+
+  // Maps a projected column's schema_idx in the zeroth per_file_metadata (source) to the
+  // corresponding schema_idx in pfm_idx'th per_file_metadata (destination). The projected
+  // column's path must match across sources, else an appropriate exception is thrown.
+  std::function<void(column_name_info const*, int const, int const, int const)> map_column =
+    [&](column_name_info const* col_name_info,
+        int const src_schema_idx,
+        int const dst_schema_idx,
+        int const pfm_idx) {
+      auto const& src_schema_elem = get_schema(src_schema_idx);
+      auto const& dst_schema_elem = get_schema(dst_schema_idx, pfm_idx);
+
+      // Check the schema elements to be equal except their number of children as we only care about
+      // the specific column paths in the schema trees. Raise an invalid_argument error if the
+      // schema elements don't match.
+      CUDF_EXPECTS(equal_to_except_num_children(src_schema_elem, dst_schema_elem),
+                   "Encountered mismatching SchemaElement properties for a column in "
+                   "the selected path",
+                   std::invalid_argument);
+
+      // If src_schema_elem is a stub, it does not exist in the column_name_info and column_buffer
+      // hierarchy. So continue on with mapping.
+      if (src_schema_elem.is_stub()) {
+        // Check if dst_schema_elem is also a stub i.e. has num_children == 1 that we didn't
+        // previously check. Raise an invalid_argument error if dst_schema_elem is not a stub.
+        CUDF_EXPECTS(dst_schema_elem.is_stub(),
+                     "Encountered mismatching schemas for stub.",
+                     std::invalid_argument);
+        auto const child_col_name_info = col_name_info ? &col_name_info->children[0] : nullptr;
+        return map_column(child_col_name_info,
+                          src_schema_elem.children_idx[0],
+                          dst_schema_elem.children_idx[0],
+                          pfm_idx);
+      }
+
+      // The path ends here. If this is a list/struct col (has children), then map all its children
+      // which must be identical.
+      if (col_name_info == nullptr or col_name_info->children.empty()) {
+        // Check the number of children to be equal to be mapped. An out_of_range error if the
+        // number of children isn't equal.
+        CUDF_EXPECTS(src_schema_elem.num_children == dst_schema_elem.num_children,
+                     "Encountered mismatching number of children for a "
+                     "column in the selected path",
+                     std::out_of_range);
+
+        std::for_each(thrust::make_counting_iterator(0),
+                      thrust::make_counting_iterator(src_schema_elem.num_children),
+                      [&](auto const child_idx) {
+                        map_column(nullptr,
+                                   src_schema_elem.children_idx[child_idx],
+                                   dst_schema_elem.children_idx[child_idx],
+                                   pfm_idx);
+                      });
+      }
+      // The path goes further down to specific child(ren) of this column so map only those
+      // children.
+      else {
+        std::for_each(
+          col_name_info->children.cbegin(),
+          col_name_info->children.cend(),
+          [&](auto const& child_col_name_info) {
+            // Ensure that each named child column exists in the destination schema tree for the
+            // paths to align up. An out_of_range error otherwise.
+            CUDF_EXPECTS(
+              find_schema_child(dst_schema_elem, child_col_name_info.name, pfm_idx) != -1,
+              "Encountered mismatching schema tree depths across data sources",
+              std::out_of_range);
+            map_column(&child_col_name_info,
+                       find_schema_child(src_schema_elem, child_col_name_info.name),
+                       find_schema_child(dst_schema_elem, child_col_name_info.name, pfm_idx),
+                       pfm_idx);
+          });
+      }
+
+      // We're at a leaf and this is an input column (one with actual data stored) so map it.
+      if (src_schema_elem.num_children == 0) {
+        // Get the schema_idx_map for this data source (pfm)
+        auto& schema_idx_map = schema_idx_maps[pfm_idx - 1];
+
+        // Map the schema index from 0th tree (src) to the one in the current (dst) tree.
+        schema_idx_map[src_schema_idx] = dst_schema_idx;
+      }
+    };
+
   std::vector<int> output_column_schemas;
 
   //
@@ -1287,7 +1406,28 @@ aggregate_reader_metadata::select_columns(
     for (auto& col : selected_columns) {
       auto const& top_level_col_schema_idx = find_schema_child(root, col.name);
       bool valid_column = build_column(&col, top_level_col_schema_idx, output_columns, false);
-      if (valid_column) output_column_schemas.push_back(top_level_col_schema_idx);
+      if (valid_column) {
+        output_column_schemas.push_back(top_level_col_schema_idx);
+
+        // Map the column's schema_idx across the rest of the data sources if required.
+        if (per_file_metadata.size() > 1 and not schema_idx_maps.empty()) {
+          std::for_each(thrust::make_counting_iterator(static_cast<size_t>(1)),
+                        thrust::make_counting_iterator(per_file_metadata.size()),
+                        [&](auto const pfm_idx) {
+                          auto const& dst_root = get_schema(0, pfm_idx);
+                          // Ensure that each top level column exists in the destination schema
+                          // tree. An out_of_range error is thrown otherwise.
+                          CUDF_EXPECTS(
+                            find_schema_child(dst_root, col.name, pfm_idx) != -1,
+                            "Encountered mismatching schema tree depths across data sources",
+                            std::out_of_range);
+                          map_column(&col,
+                                     top_level_col_schema_idx,
+                                     find_schema_child(dst_root, col.name, pfm_idx),
+                                     pfm_idx);
+                        });
+        }
+      }
     }
   }
 
diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp
index 309132a5347..6f2863136b2 100644
--- a/cpp/src/io/parquet/reader_impl_helpers.hpp
+++ b/cpp/src/io/parquet/reader_impl_helpers.hpp
@@ -128,6 +128,7 @@ struct arrow_schema_data_types {
 class aggregate_reader_metadata {
   std::vector<metadata> per_file_metadata;
   std::vector<std::unordered_map<std::string, std::string>> keyval_maps;
+  std::vector<std::unordered_map<int32_t, int32_t>> schema_idx_maps;
 
   int64_t num_rows;
   size_type num_row_groups;
@@ -144,6 +145,19 @@ class aggregate_reader_metadata {
   [[nodiscard]] std::vector<std::unordered_map<std::string, std::string>> collect_keyval_metadata()
     const;
 
+  /**
+   * @brief Initialize the vector of schema_idx maps.
+   *
+   * Initializes a vector of hash maps that will store the one-to-one mappings between the
+   * schema_idx'es of the selected columns in the zeroth per_file_metadata (source) and each
+   * kth per_file_metadata (destination) for k in range: [1, per_file_metadata.size()-1].
+   *
+   * @param has_cols_from_mismatched_srcs True if we are reading select cols from mismatched
+   * parquet schemas.
+   */
+  [[nodiscard]] std::vector<std::unordered_map<int32_t, int32_t>> init_schema_idx_maps(
+    bool has_cols_from_mismatched_srcs) const;
+
   /**
    * @brief Decodes and constructs the arrow schema from the ARROW_SCHEMA_KEY IPC message
    * in key value metadata section of Parquet file footer
@@ -183,10 +197,28 @@ class aggregate_reader_metadata {
 
  public:
   aggregate_reader_metadata(host_span<std::unique_ptr<datasource> const> sources,
-                            bool use_arrow_schema);
+                            bool use_arrow_schema,
+                            bool has_cols_from_mismatched_srcs);
 
   [[nodiscard]] RowGroup const& get_row_group(size_type row_group_index, size_type src_idx) const;
 
+  /**
+   * @brief Extracts the schema_idx'th column chunk metadata from row_group_index'th row group of
+   * the src_idx'th file.
+   *
+   * Extracts the schema_idx'th column chunk metadata from the specified row group index of the
+   * src_idx'th file. Note that the schema_idx is actually the index in the zeroth file which may
+   * not be the same in all files, in which case, the schema_idx is mapped to the corresponding
+   * index in the src_idx'th file and returned. A range_error error is thrown if schema_idx
+   * doesn't exist or isn't mapped to the src_idx file.
+   *
+   * @param row_group_index The row group index in the file to extract column chunk metadata from.
+   * @param src_idx The per_file_metadata index to extract extract column chunk metadata from.
+   * @param schema_idx The schema_idx of the column chunk to be extracted
+   *
+   * @return The requested column chunk metadata or a range_error error if the schema index isn't
+   * valid.
+   */
   [[nodiscard]] ColumnChunkMetaData const& get_column_metadata(size_type row_group_index,
                                                                size_type src_idx,
                                                                int schema_idx) const;
@@ -202,9 +234,22 @@ class aggregate_reader_metadata {
 
   [[nodiscard]] auto get_num_row_groups() const { return num_row_groups; }
 
-  [[nodiscard]] auto const& get_schema(int schema_idx) const
+  /**
+   * @brief Extracts the schema_idx'th SchemaElement from the pfm_idx'th file
+   *
+   * @param schema_idx The index of the SchemaElement to be extracted.
+   * @param pfm_idx The index of the per_file_metadata to extract SchemaElement from, default = 0 if
+   * not specified.
+   *
+   * @return The requested SchemaElement or an error if invalid schema_idx or pfm_idx.
+   */
+  [[nodiscard]] auto const& get_schema(int schema_idx, int pfm_idx = 0) const
   {
-    return per_file_metadata[0].schema[schema_idx];
+    CUDF_EXPECTS(
+      schema_idx >= 0 and pfm_idx >= 0 and pfm_idx < static_cast<int>(per_file_metadata.size()),
+      "Parquet reader encountered an invalid schema_idx or pfm_idx",
+      std::invalid_argument);
+    return per_file_metadata[pfm_idx].schema[schema_idx];
   }
 
   [[nodiscard]] auto const& get_key_value_metadata() const& { return keyval_maps; }
@@ -314,7 +359,7 @@ class aggregate_reader_metadata {
                  std::optional<std::vector<std::string>> const& filter_columns_names,
                  bool include_index,
                  bool strings_to_categorical,
-                 type_id timestamp_type_id) const;
+                 type_id timestamp_type_id);
 };
 
 /**
diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
index c874a51e220..a0155671a26 100644
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ b/python/cudf/cudf/_lib/parquet.pyx
@@ -266,7 +266,8 @@ def read_parquet_chunked(
     size_t chunk_read_limit=0,
     size_t pass_read_limit=1024000000,
     size_type nrows=-1,
-    int64_t skip_rows=0
+    int64_t skip_rows=0,
+    allow_mismatched_pq_schemas=False
 ):
     # Note: If this function ever takes accepts filters
     # allow_range_index needs to be False when a filter is passed
@@ -277,11 +278,12 @@ def read_parquet_chunked(
         plc.io.SourceInfo(filepaths_or_buffers),
         columns,
         row_groups,
-        use_pandas_metadata,
+        use_pandas_metadata=use_pandas_metadata,
         chunk_read_limit=chunk_read_limit,
         pass_read_limit=pass_read_limit,
         skip_rows=skip_rows,
         nrows=nrows,
+        allow_mismatched_pq_schemas=allow_mismatched_pq_schemas,
     )
 
     tbl_w_meta = reader.read_chunk()
@@ -323,7 +325,8 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
                    use_pandas_metadata=True,
                    Expression filters=None,
                    size_type nrows=-1,
-                   int64_t skip_rows=0):
+                   int64_t skip_rows=0,
+                   allow_mismatched_pq_schemas=False):
     """
     Cython function to call into libcudf API, see `read_parquet`.
 
@@ -351,6 +354,7 @@ cpdef read_parquet(filepaths_or_buffers, columns=None, row_groups=None,
         use_pandas_metadata = use_pandas_metadata,
         skip_rows = skip_rows,
         nrows = nrows,
+        allow_mismatched_pq_schemas=allow_mismatched_pq_schemas,
     )
 
     df = cudf.DataFrame._from_data(
diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
index 984115dcbbe..526f12aa94e 100644
--- a/python/cudf/cudf/io/parquet.py
+++ b/python/cudf/cudf/io/parquet.py
@@ -514,6 +514,7 @@ def read_parquet(
     dataset_kwargs=None,
     nrows=None,
     skip_rows=None,
+    allow_mismatched_pq_schemas=False,
     *args,
     **kwargs,
 ):
@@ -622,6 +623,7 @@ def read_parquet(
         dataset_kwargs=dataset_kwargs,
         nrows=nrows,
         skip_rows=skip_rows,
+        allow_mismatched_pq_schemas=allow_mismatched_pq_schemas,
         **kwargs,
     )
     # Apply filters row-wise (if any are defined), and return
@@ -865,6 +867,7 @@ def _read_parquet(
     use_pandas_metadata=None,
     nrows=None,
     skip_rows=None,
+    allow_mismatched_pq_schemas=False,
     *args,
     **kwargs,
 ):
@@ -889,6 +892,7 @@ def _read_parquet(
                 use_pandas_metadata=use_pandas_metadata,
                 nrows=nrows if nrows is not None else -1,
                 skip_rows=skip_rows if skip_rows is not None else 0,
+                allow_mismatched_pq_schemas=allow_mismatched_pq_schemas,
             )
         else:
             if nrows is None:
@@ -902,6 +906,7 @@ def _read_parquet(
                 use_pandas_metadata=use_pandas_metadata,
                 nrows=nrows,
                 skip_rows=skip_rows,
+                allow_mismatched_pq_schemas=allow_mismatched_pq_schemas,
             )
     else:
         if (
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index 879b2bd3d74..6623c537ddf 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -3809,3 +3809,251 @@ def test_parquet_reader_pandas_compatibility():
     with cudf.option_context("io.parquet.low_memory", True):
         expected = cudf.read_parquet(buffer)
     assert_eq(expected, df)
+
+
+@pytest.mark.parametrize("store_schema", [True, False])
+def test_parquet_reader_with_mismatched_tables(store_schema):
+    # cuDF tables with mixed types
+    df1 = cudf.DataFrame(
+        {
+            "i32": cudf.Series([None, None, None], dtype="int32"),
+            "i64": cudf.Series([1234, None, 123], dtype="int64"),
+            "list": list([[1, 2], [None, 4], [5, 6]]),
+            "time": cudf.Series([1234, 123, 4123], dtype="datetime64[ms]"),
+            "str": ["vfd", None, "ghu"],
+            "d_list": list(
+                [
+                    [pd.Timedelta(minutes=1), pd.Timedelta(minutes=2)],
+                    [None, pd.Timedelta(minutes=3)],
+                    [pd.Timedelta(minutes=8), None],
+                ]
+            ),
+        }
+    )
+
+    df2 = cudf.DataFrame(
+        {
+            "str": ["abc", "def", None],
+            "i64": cudf.Series([None, 65, 98], dtype="int64"),
+            "times": cudf.Series([1234, None, 4123], dtype="datetime64[us]"),
+            "list": list([[7, 8], [9, 10], [None, 12]]),
+            "d_list": list(
+                [
+                    [pd.Timedelta(minutes=4), None],
+                    [None, None],
+                    [pd.Timedelta(minutes=6), None],
+                ]
+            ),
+        }
+    )
+
+    # IO buffers
+    buf1 = BytesIO()
+    buf2 = BytesIO()
+
+    # Write Parquet with and without arrow schema
+    df1.to_parquet(buf1, store_schema=store_schema)
+    df2.to_parquet(buf2, store_schema=store_schema)
+
+    # Read mismatched Parquet files
+    got = cudf.read_parquet(
+        [buf1, buf2],
+        columns=["list", "d_list", "str"],
+        filters=[("i64", ">", 20)],
+        allow_mismatched_pq_schemas=True,
+    )
+
+    # Construct the expected table
+    expected = cudf.concat(
+        [
+            df1[df1["i64"] > 20][["list", "d_list", "str"]],
+            df2[df2["i64"] > 20][["list", "d_list", "str"]],
+        ]
+    ).reset_index(drop=True)
+
+    # Read with chunked reader (filter columns not supported)
+    got_chunked = read_parquet_chunked(
+        [buf1, buf2],
+        columns=["list", "d_list", "str"],
+        chunk_read_limit=240,
+        pass_read_limit=240,
+        allow_mismatched_pq_schemas=True,
+    )
+
+    # Construct the expected table without filter columns
+    expected_chunked = cudf.concat(
+        [df1[["list", "d_list", "str"]], df2[["list", "d_list", "str"]]]
+    ).reset_index(drop=True)
+
+    # Check results
+    assert_eq(expected, got)
+    assert_eq(expected_chunked, got_chunked)
+
+
+def test_parquet_reader_with_mismatched_structs():
+    data1 = [
+        {
+            "a": 1,
+            "b": {
+                "inner_a": 10,
+                "inner_b": {"inner_inner_b": 1, "inner_inner_a": 2},
+            },
+            "c": 2,
+        },
+        {
+            "a": 3,
+            "b": {"inner_a": 30, "inner_b": {"inner_inner_a": 210}},
+            "c": 4,
+        },
+        {"a": 5, "b": {"inner_a": 50, "inner_b": None}, "c": 6},
+        {"a": 7, "b": None, "c": 8},
+        {"a": None, "b": {"inner_a": None, "inner_b": None}, "c": None},
+        None,
+        {
+            "a": None,
+            "b": {
+                "inner_a": None,
+                "inner_b": {"inner_inner_b": None, "inner_inner_a": 10},
+            },
+            "c": 10,
+        },
+    ]
+
+    data2 = [
+        {"a": 1, "b": {"inner_b": {"inner_inner_a": None}}},
+        {"a": 3, "b": {"inner_b": {"inner_inner_a": 1}}},
+        {"a": 5, "b": {"inner_b": None}},
+        {"a": 7, "b": {"inner_b": {"inner_inner_b": 1, "inner_inner_a": 0}}},
+        {"a": None, "b": {"inner_b": None}},
+        None,
+        {"a": None, "b": {"inner_b": {"inner_inner_a": 1}}},
+    ]
+
+    # cuDF tables from struct data
+    df1 = cudf.DataFrame.from_arrow(pa.Table.from_pydict({"struct": data1}))
+    df2 = cudf.DataFrame.from_arrow(pa.Table.from_pydict({"struct": data2}))
+
+    # Buffers
+    buf1 = BytesIO()
+    buf2 = BytesIO()
+
+    # Write to parquet
+    df1.to_parquet(buf1)
+    df2.to_parquet(buf2)
+
+    # Read the struct.b.inner_b.inner_inner_a column from parquet
+    got = cudf.read_parquet(
+        [buf1, buf2],
+        columns=["struct.b.inner_b.inner_inner_a"],
+        allow_mismatched_pq_schemas=True,
+    )
+    got = (
+        cudf.Series(got["struct"])
+        .struct.field("b")
+        .struct.field("inner_b")
+        .struct.field("inner_inner_a")
+    )
+
+    # Read with chunked reader
+    got_chunked = read_parquet_chunked(
+        [buf1, buf2],
+        columns=["struct.b.inner_b.inner_inner_a"],
+        chunk_read_limit=240,
+        pass_read_limit=240,
+        allow_mismatched_pq_schemas=True,
+    )
+    got_chunked = (
+        cudf.Series(got_chunked["struct"])
+        .struct.field("b")
+        .struct.field("inner_b")
+        .struct.field("inner_inner_a")
+    )
+
+    # Construct the expected series
+    expected = cudf.concat(
+        [
+            cudf.Series(df1["struct"])
+            .struct.field("b")
+            .struct.field("inner_b")
+            .struct.field("inner_inner_a"),
+            cudf.Series(df2["struct"])
+            .struct.field("b")
+            .struct.field("inner_b")
+            .struct.field("inner_inner_a"),
+        ]
+    ).reset_index(drop=True)
+
+    # Check results
+    assert_eq(expected, got)
+    assert_eq(expected, got_chunked)
+
+
+def test_parquet_reader_with_mismatched_schemas_error():
+    df1 = cudf.DataFrame(
+        {
+            "millis": cudf.Series([123, 3454, 123], dtype="timedelta64[ms]"),
+            "i64": cudf.Series([123, 3454, 123], dtype="int64"),
+            "i32": cudf.Series([123, 3454, 123], dtype="int32"),
+        }
+    )
+    df2 = cudf.DataFrame(
+        {
+            "i64": cudf.Series([123, 3454, 123], dtype="int64"),
+            "millis": cudf.Series([123, 3454, 123], dtype="timedelta64[ms]"),
+        }
+    )
+
+    buf1 = BytesIO()
+    buf2 = BytesIO()
+
+    df1.to_parquet(buf1, store_schema=True)
+    df2.to_parquet(buf2, store_schema=False)
+
+    with pytest.raises(
+        ValueError,
+        match="Encountered mismatching SchemaElement properties for a column in the selected path",
+    ):
+        cudf.read_parquet(
+            [buf1, buf2], columns=["millis"], allow_mismatched_pq_schemas=True
+        )
+
+    data1 = [
+        {"a": 1, "b": {"inner_a": 1, "inner_b": 6}},
+        {"a": 3, "b": {"inner_a": None, "inner_b": 2}},
+    ]
+    data2 = [
+        {"b": {"inner_a": 1}, "c": "str"},
+        {"b": {"inner_a": None}, "c": None},
+    ]
+
+    # cuDF tables from struct data
+    df1 = cudf.DataFrame.from_arrow(pa.Table.from_pydict({"struct": data1}))
+    df2 = cudf.DataFrame.from_arrow(pa.Table.from_pydict({"struct": data2}))
+
+    # Buffers
+    buf1 = BytesIO()
+    buf2 = BytesIO()
+
+    # Write to parquet
+    df1.to_parquet(buf1)
+    df2.to_parquet(buf2)
+
+    with pytest.raises(
+        IndexError,
+        match="Encountered mismatching number of children for a column in the selected path",
+    ):
+        cudf.read_parquet(
+            [buf1, buf2],
+            columns=["struct.b"],
+            allow_mismatched_pq_schemas=True,
+        )
+
+    with pytest.raises(
+        IndexError,
+        match="Encountered mismatching schema tree depths across data sources",
+    ):
+        cudf.read_parquet(
+            [buf1, buf2],
+            columns=["struct.b.inner_b"],
+            allow_mismatched_pq_schemas=True,
+        )
diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
index 94974e595b1..6b146be0fa3 100644
--- a/python/cudf/cudf/utils/ioutils.py
+++ b/python/cudf/cudf/utils/ioutils.py
@@ -184,6 +184,9 @@
 
     .. note:
        This option is not supported when the low-memory mode is on.
+allow_mismatched_pq_schemas : boolean, default False
+    If True, enables reading (matching) columns specified in `columns` and `filters`
+    options from the input files with otherwise mismatched schemas.
 
 Returns
 -------
diff --git a/python/pylibcudf/pylibcudf/io/parquet.pxd b/python/pylibcudf/pylibcudf/io/parquet.pxd
index 47458b00159..9c476030ded 100644
--- a/python/pylibcudf/pylibcudf/io/parquet.pxd
+++ b/python/pylibcudf/pylibcudf/io/parquet.pxd
@@ -28,6 +28,7 @@ cpdef read_parquet(
     bool use_pandas_metadata = *,
     int64_t skip_rows = *,
     size_type nrows = *,
+    bool allow_mismatched_pq_schemas = *,
     # disabled see comment in parquet.pyx for more
     # ReaderColumnSchema reader_column_schema = *,
     # DataType timestamp_type = *
diff --git a/python/pylibcudf/pylibcudf/io/parquet.pyx b/python/pylibcudf/pylibcudf/io/parquet.pyx
index fb5244a2a9e..df1f1b14247 100644
--- a/python/pylibcudf/pylibcudf/io/parquet.pyx
+++ b/python/pylibcudf/pylibcudf/io/parquet.pyx
@@ -26,6 +26,7 @@ cdef parquet_reader_options _setup_parquet_reader_options(
     bool use_pandas_metadata = True,
     int64_t skip_rows = 0,
     size_type nrows = -1,
+    bool allow_mismatched_pq_schemas=False,
     # ReaderColumnSchema reader_column_schema = None,
     # DataType timestamp_type = DataType(type_id.EMPTY)
 ):
@@ -34,6 +35,7 @@ cdef parquet_reader_options _setup_parquet_reader_options(
         parquet_reader_options.builder(source_info.c_obj)
         .convert_strings_to_categories(convert_strings_to_categories)
         .use_pandas_metadata(use_pandas_metadata)
+        .allow_mismatched_pq_schemas(allow_mismatched_pq_schemas)
         .use_arrow_schema(True)
         .build()
     )
@@ -80,6 +82,9 @@ cdef class ChunkedParquetReader:
     pass_read_limit : size_t, default 1024000000
         Limit on the amount of memory used for reading and decompressing data
         or 0 if there is no limit.
+    allow_mismatched_pq_schemas : bool, default False
+        Whether to read (matching) columns specified in `columns` from
+        the input files with otherwise mismatched schemas.
     """
     def __init__(
         self,
@@ -91,7 +96,8 @@ cdef class ChunkedParquetReader:
         int64_t skip_rows = 0,
         size_type nrows = -1,
         size_t chunk_read_limit=0,
-        size_t pass_read_limit=1024000000
+        size_t pass_read_limit=1024000000,
+        bool allow_mismatched_pq_schemas=False
     ):
 
         cdef parquet_reader_options opts = _setup_parquet_reader_options(
@@ -103,6 +109,7 @@ cdef class ChunkedParquetReader:
             use_pandas_metadata=use_pandas_metadata,
             skip_rows=skip_rows,
             nrows=nrows,
+            allow_mismatched_pq_schemas=allow_mismatched_pq_schemas,
         )
 
         with nogil:
@@ -152,6 +159,7 @@ cpdef read_parquet(
     bool use_pandas_metadata = True,
     int64_t skip_rows = 0,
     size_type nrows = -1,
+    bool allow_mismatched_pq_schemas = False,
     # Disabled, these aren't used by cudf-python
     # we should only add them back in if there's user demand
     # ReaderColumnSchema reader_column_schema = None,
@@ -179,6 +187,9 @@ cpdef read_parquet(
         The number of rows to skip from the start of the file.
     nrows : size_type, default -1
         The number of rows to read. By default, read the entire file.
+    allow_mismatched_pq_schemas : bool, default False
+        If True, enable reading (matching) columns specified in `columns`
+        from the input files with otherwise mismatched schemas.
 
     Returns
     -------
@@ -195,6 +206,7 @@ cpdef read_parquet(
         use_pandas_metadata,
         skip_rows,
         nrows,
+        allow_mismatched_pq_schemas,
     )
 
     with nogil:
diff --git a/python/pylibcudf/pylibcudf/libcudf/io/parquet.pxd b/python/pylibcudf/pylibcudf/libcudf/io/parquet.pxd
index 222d87defa0..de6a6c1e82d 100644
--- a/python/pylibcudf/pylibcudf/libcudf/io/parquet.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/io/parquet.pxd
@@ -32,7 +32,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
         data_type get_timestamp_type() except +
         bool is_enabled_use_pandas_metadata() except +
         bool is_enabled_arrow_schema() except +
-
+        bool is_enabled_allow_mismatched_pq_schemas() except +
         # setter
 
         void set_filter(expression &filter) except +
@@ -41,6 +41,7 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
         void set_row_groups(vector[vector[size_type]] row_grp) except +
         void set_skip_rows(int64_t val) except +
         void enable_use_arrow_schema(bool val) except +
+        void enable_allow_mismatched_pq_schemas(bool val) except +
         void enable_use_pandas_metadata(bool val) except +
         void set_timestamp_type(data_type type) except +
 
@@ -69,6 +70,9 @@ cdef extern from "cudf/io/parquet.hpp" namespace "cudf::io" nogil:
         parquet_reader_options_builder& use_arrow_schema(
             bool val
         ) except +
+        parquet_reader_options_builder& allow_mismatched_pq_schemas(
+            bool val
+        ) except +
         parquet_reader_options_builder& timestamp_type(
             data_type type
         ) except +

From 9e9efcc9f5ed8411fb09f4d8384e14612a7f3b10 Mon Sep 17 00:00:00 2001
From: Mark Harris <783069+harrism@users.noreply.github.com>
Date: Thu, 29 Aug 2024 09:19:48 +1000
Subject: [PATCH 746/842] Replace raw device_memory_resource pointer in
 pylibcudf Cython (#16674)

Replaces a single `device_memory_resource*` in pylibcudf Cython inline C++ function with `rmm::device_async_resource_ref` to help smooth RMM refactoring effort.

Authors:
  - Mark Harris (https://github.com/harrism)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/16674
---
 python/pylibcudf/pylibcudf/libcudf/interop.pxd | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/pylibcudf/pylibcudf/libcudf/interop.pxd b/python/pylibcudf/pylibcudf/libcudf/interop.pxd
index c7efff2340d..9228c017d93 100644
--- a/python/pylibcudf/pylibcudf/libcudf/interop.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/interop.pxd
@@ -71,7 +71,7 @@ cdef extern from *:
     ArrowArray* to_arrow_host_raw(
       cudf::table_view const& tbl,
       rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-      rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) {
+      rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) {
       // Assumes the sync event is null and the data is already on the host.
       ArrowArray *arr = new ArrowArray();
       auto device_arr = cudf::to_arrow_host(tbl, stream, mr);

From f6e2355dfefb1a02a984425aabeca7a4fcb2bfde Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Wed, 28 Aug 2024 19:03:34 -0500
Subject: [PATCH 747/842] Handle `ordered` parameter in
 `CategoricalIndex.__repr__` (#16683)

Thanks @mroeschke for catching this in https://github.com/rapidsai/cudf/pull/16665#discussion_r1735277661

This PR factors in the `ordered` parameter while generating the `repr` for `CategoricalIndex`.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/16683
---
 python/cudf/cudf/core/index.py      | 1 +
 python/cudf/cudf/tests/test_repr.py | 8 ++++++++
 2 files changed, 9 insertions(+)

diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 500fc580097..fc35ffa3744 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -1456,6 +1456,7 @@ def __repr__(self):
                 pd_preprocess.dtype._categories = (
                     preprocess.categories.to_pandas()
                 )
+                pd_preprocess.dtype._ordered = preprocess.dtype.ordered
                 cats_repr = repr(pd_preprocess).split("\n")
                 output = "\n".join(data_repr[:-1] + cats_repr[-1:])
 
diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py
index 57eef9e3463..681b467f66c 100644
--- a/python/cudf/cudf/tests/test_repr.py
+++ b/python/cudf/cudf/tests/test_repr.py
@@ -1491,3 +1491,11 @@ def test_large_unique_categories_repr():
     with utils.cudf_timeout(2, timeout_message="Failed to repr fast enough"):
         actual_repr = repr(gi)
     assert expected_repr == actual_repr
+
+
+@pytest.mark.parametrize("ordered", [True, False])
+def test_categorical_index_ordered(ordered):
+    pi = pd.CategoricalIndex(range(10), ordered=ordered)
+    gi = cudf.CategoricalIndex(range(10), ordered=ordered)
+
+    assert repr(pi) == repr(gi)

From f2d153b5e1d0c8410947afb438033468dc84d1b8 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 29 Aug 2024 10:30:21 -1000
Subject: [PATCH 748/842] Have interval_range use IntervalIndex.from_breaks,
 remove column_empty_same_mask (#16694)

To match pandas implementation, `interval_range` dispatches to `IntervalIndex.from_breaks` which allows some code deduplication. This also allows us to remove `column_empty_same_mask` which (luckily) I didn't find any usage across RAPIDS

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16694
---
 python/cudf/cudf/core/column/__init__.py |  1 -
 python/cudf/cudf/core/column/column.py   | 16 -------------
 python/cudf/cudf/core/index.py           | 30 +++---------------------
 3 files changed, 3 insertions(+), 44 deletions(-)

diff --git a/python/cudf/cudf/core/column/__init__.py b/python/cudf/cudf/core/column/__init__.py
index 5781d77ee9a..06791df7dc0 100644
--- a/python/cudf/cudf/core/column/__init__.py
+++ b/python/cudf/cudf/core/column/__init__.py
@@ -11,7 +11,6 @@
     build_column,
     column_empty,
     column_empty_like,
-    column_empty_like_same_mask,
     concat_columns,
     deserialize_columns,
     serialize_columns,
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 885476a897c..7674565e2c3 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -1483,22 +1483,6 @@ def _has_any_nan(arbitrary: pd.Series | np.ndarray) -> bool:
     )
 
 
-def column_empty_like_same_mask(
-    column: ColumnBase, dtype: Dtype
-) -> ColumnBase:
-    """Create a new empty Column with the same length and the same mask.
-
-    Parameters
-    ----------
-    dtype : np.dtype like
-        The dtype of the data buffer.
-    """
-    result = column_empty_like(column, dtype)
-    if column.nullable:
-        result = result.set_mask(column.mask)
-    return result
-
-
 def column_empty(
     row_count: int, dtype: Dtype = "object", masked: bool = False
 ) -> ColumnBase:
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index fc35ffa3744..241a276ebe2 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -3346,31 +3346,7 @@ def interval_range(
         init=start.device_value,
         step=freq.device_value,
     )
-    left_col = bin_edges.slice(0, len(bin_edges) - 1)
-    right_col = bin_edges.slice(1, len(bin_edges))
-    # For indexing, children should both have 0 offset
-    right_col = type(right_col)(
-        data=right_col.data,
-        dtype=right_col.dtype,
-        size=right_col.size,
-        mask=right_col.mask,
-        offset=0,
-        null_count=right_col.null_count,
-        children=right_col.children,
-    )
-
-    if len(right_col) == 0 or len(left_col) == 0:
-        dtype = IntervalDtype("int64", closed)
-        data = column.column_empty_like_same_mask(left_col, dtype)
-        return IntervalIndex(data, closed=closed, name=name)
-
-    interval_col = IntervalColumn(
-        data=None,
-        dtype=IntervalDtype(left_col.dtype, closed),
-        size=len(left_col),
-        children=(left_col, right_col),
-    )
-    return IntervalIndex(interval_col, closed=closed, name=name)
+    return IntervalIndex.from_breaks(bin_edges, closed=closed, name=name)
 
 
 class IntervalIndex(Index):
@@ -3520,7 +3496,7 @@ def from_breaks(
         left_col = breaks.slice(0, len(breaks) - 1)
         right_col = breaks.slice(1, len(breaks))
         # For indexing, children should both have 0 offset
-        right_col = column.build_column(
+        right_col = type(right_col)(
             data=right_col.data,
             dtype=right_col.dtype,
             size=right_col.size,
@@ -3536,7 +3512,7 @@ def from_breaks(
             size=len(left_col),
             children=(left_col, right_col),
         )
-        return IntervalIndex(interval_col, name=name, closed=closed)
+        return IntervalIndex._from_column(interval_col, name=name)
 
     @classmethod
     def from_arrays(

From eca5108d2f3120c83b26ba5e3c9a6cfaa2b0b233 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 29 Aug 2024 16:37:15 -0400
Subject: [PATCH 749/842] Disable gtests/ERROR_TEST during compute-sanitizer
 memcheck test (#16691)

Disables the `gtests/ERROR_TEST` when run under `compute-sanitizer` for memcheck. The `compute-sanitizer` started hanging on some of these tests. There is no value in running memcheck on any of the tests in `ERROR_TEST`

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Paul Mattione (https://github.com/pmattione-nvidia)

URL: https://github.com/rapidsai/cudf/pull/16691
---
 cpp/tests/error/error_handling_test.cu | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/cpp/tests/error/error_handling_test.cu b/cpp/tests/error/error_handling_test.cu
index 1dfe45556c4..9c7459fa69d 100644
--- a/cpp/tests/error/error_handling_test.cu
+++ b/cpp/tests/error/error_handling_test.cu
@@ -50,8 +50,6 @@ CUDF_KERNEL void test_kernel(int* data) { data[threadIdx.x] = threadIdx.x; }
 // calls.
 TEST(StreamCheck, FailedKernel)
 {
-  if (getenv("LIBCUDF_MEMCHECK_ENABLED")) { GTEST_SKIP(); }
-
   rmm::cuda_stream stream;
   int a;
   test_kernel<<<0, 0, 0, stream.value()>>>(&a);
@@ -63,8 +61,6 @@ TEST(StreamCheck, FailedKernel)
 
 TEST(StreamCheck, CatchFailedKernel)
 {
-  if (getenv("LIBCUDF_MEMCHECK_ENABLED")) { GTEST_SKIP(); }
-
   rmm::cuda_stream stream;
   int a;
   test_kernel<<<0, 0, 0, stream.value()>>>(&a);
@@ -131,6 +127,8 @@ TEST(DebugAssert, cudf_assert_true)
 // 2.) The RMM Pool interferes with the death test
 int main(int argc, char** argv)
 {
+  if (getenv("LIBCUDF_MEMCHECK_ENABLED")) { return 0; }
+
   ::testing::InitGoogleTest(&argc, argv);
   auto const cmd_opts = parse_cudf_test_opts(argc, argv);
   auto adaptor        = make_stream_mode_adaptor(cmd_opts);

From 21d05d73a66c0bc0009ff378beb58fb4f0f2bf2d Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 29 Aug 2024 16:40:14 -0400
Subject: [PATCH 750/842] Move apply_boolean_mask benchmark to nvbench (#16616)

Reworks the `apply_booleam_mask` benchmark as an nvbench benchmark under the `STREAM_COMPACTION_NVBENCH` module. `cudf::string_view` was added as a type to help measure the performance improvement in a follow on PR for `apply_boolean_mask` for strings

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16616
---
 cpp/benchmarks/CMakeLists.txt                 |   5 +-
 .../stream_compaction/apply_boolean_mask.cpp  | 138 ++++++------------
 2 files changed, 48 insertions(+), 95 deletions(-)

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 7f3edfa0a01..99ef9e2976f 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -151,14 +151,11 @@ ConfigureBench(COPY_IF_ELSE_BENCH copying/copy_if_else.cpp)
 # * transpose benchmark ---------------------------------------------------------------------------
 ConfigureBench(TRANSPOSE_BENCH transpose/transpose.cpp)
 
-# ##################################################################################################
-# * apply_boolean_mask benchmark ------------------------------------------------------------------
-ConfigureBench(APPLY_BOOLEAN_MASK_BENCH stream_compaction/apply_boolean_mask.cpp)
-
 # ##################################################################################################
 # * stream_compaction benchmark -------------------------------------------------------------------
 ConfigureNVBench(
   STREAM_COMPACTION_NVBENCH
+  stream_compaction/apply_boolean_mask.cpp
   stream_compaction/distinct.cpp
   stream_compaction/distinct_count.cpp
   stream_compaction/stable_distinct.cpp
diff --git a/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp b/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp
index 492237474ff..fa017ca9e29 100644
--- a/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp
+++ b/cpp/benchmarks/stream_compaction/apply_boolean_mask.cpp
@@ -15,120 +15,76 @@
  */
 
 #include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/common/nvbench_utilities.hpp>
 
 #include <cudf/stream_compaction.hpp>
+#include <cudf/strings/string_view.hpp>
 
-#include <fixture/benchmark_fixture.hpp>
-#include <synchronization/synchronization.hpp>
+#include <nvbench/nvbench.cuh>
 
 namespace {
 
-constexpr cudf::size_type hundredM      = 1e8;
-constexpr cudf::size_type tenM          = 1e7;
-constexpr cudf::size_type tenK          = 1e4;
-constexpr cudf::size_type fifty_percent = 50;
-
-void percent_range(benchmark::internal::Benchmark* b)
-{
-  b->Unit(benchmark::kMillisecond);
-  for (int percent = 0; percent <= 100; percent += 10)
-    b->Args({hundredM, percent});
-}
-
-void size_range(benchmark::internal::Benchmark* b)
-{
-  b->Unit(benchmark::kMillisecond);
-  for (int size = tenK; size <= hundredM; size *= 10)
-    b->Args({size, fifty_percent});
-}
-
 template <typename T>
-void calculate_bandwidth(benchmark::State& state, cudf::size_type num_columns)
+void calculate_bandwidth(nvbench::state& state)
 {
-  cudf::size_type const column_size{static_cast<cudf::size_type>(state.range(0))};
-  cudf::size_type const percent_true{static_cast<cudf::size_type>(state.range(1))};
-
-  float const fraction                  = percent_true / 100.f;
-  cudf::size_type const column_size_out = fraction * column_size;
-  int64_t const mask_size =
-    sizeof(bool) * column_size + cudf::bitmask_allocation_size_bytes(column_size);
-  int64_t const validity_bytes_in  = (fraction >= 1.0f / 32)
-                                       ? cudf::bitmask_allocation_size_bytes(column_size)
-                                       : 4 * column_size_out;
-  int64_t const validity_bytes_out = cudf::bitmask_allocation_size_bytes(column_size_out);
-  int64_t const column_bytes_out   = sizeof(T) * column_size_out;
+  auto const n_rows       = static_cast<cudf::size_type>(state.get_int64("rows"));
+  auto const n_cols       = static_cast<cudf::size_type>(state.get_int64("columns"));
+  auto const percent_true = static_cast<cudf::size_type>(state.get_int64("hits_%"));
+
+  double const fraction             = percent_true / 100.0;
+  cudf::size_type const output_size = fraction * n_rows;
+  int64_t const mask_size = sizeof(bool) * n_rows + cudf::bitmask_allocation_size_bytes(n_rows);
+  int64_t const validity_bytes_in =
+    (fraction >= 1.0 / 32) ? cudf::bitmask_allocation_size_bytes(n_rows) : 4 * output_size;
+  int64_t const validity_bytes_out = cudf::bitmask_allocation_size_bytes(output_size);
+  int64_t const column_bytes_out   = sizeof(T) * output_size;
   int64_t const column_bytes_in    = column_bytes_out;  // we only read unmasked inputs
 
-  int64_t const bytes_read =
-    (column_bytes_in + validity_bytes_in) * num_columns +  // reading columns
-    mask_size;                                             // reading boolean mask
+  int64_t const bytes_read = (column_bytes_in + validity_bytes_in) * n_cols +  // reading columns
+                             mask_size;  // reading boolean mask
   int64_t const bytes_written =
-    (column_bytes_out + validity_bytes_out) * num_columns;  // writing columns
+    (column_bytes_out + validity_bytes_out) * n_cols;  // writing columns
 
-  state.SetItemsProcessed(state.iterations() * column_size * num_columns);
-  state.SetBytesProcessed(static_cast<int64_t>(state.iterations()) * (bytes_read + bytes_written));
+  state.add_element_count(n_rows * n_cols);
+  state.add_global_memory_reads<nvbench::int8_t>(bytes_read);
+  state.add_global_memory_writes<nvbench::int8_t>(bytes_written);
 }
 
 }  // namespace
 
-template <class T>
-void BM_apply_boolean_mask(benchmark::State& state, cudf::size_type num_columns)
+template <typename DataType>
+void apply_boolean_mask_benchmark(nvbench::state& state, nvbench::type_list<DataType>)
 {
-  cudf::size_type const column_size{static_cast<cudf::size_type>(state.range(0))};
-  cudf::size_type const percent_true{static_cast<cudf::size_type>(state.range(1))};
+  auto const n_rows       = static_cast<cudf::size_type>(state.get_int64("rows"));
+  auto const n_cols       = static_cast<cudf::size_type>(state.get_int64("columns"));
+  auto const percent_true = static_cast<cudf::size_type>(state.get_int64("hits_%"));
 
-  data_profile profile = data_profile_builder().cardinality(0).null_probability(0.0).distribution(
-    cudf::type_to_id<T>(), distribution_id::UNIFORM, 0, 100);
+  auto const input_type = cudf::type_to_id<DataType>();
+  data_profile profile  = data_profile_builder().cardinality(0).no_validity().distribution(
+    input_type, distribution_id::UNIFORM, 0, 20);
 
-  auto source_table = create_random_table(
-    cycle_dtypes({cudf::type_to_id<T>()}, num_columns), row_count{column_size}, profile);
+  auto source_table =
+    create_random_table(cycle_dtypes({input_type}, n_cols), row_count{n_rows}, profile);
 
   profile.set_bool_probability_true(percent_true / 100.0);
   profile.set_null_probability(std::nullopt);  // no null mask
-  auto mask = create_random_column(cudf::type_id::BOOL8, row_count{column_size}, profile);
+  auto mask = create_random_column(cudf::type_id::BOOL8, row_count{n_rows}, profile);
+
+  auto stream = cudf::get_default_stream();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
+  calculate_bandwidth<DataType>(state);
 
-  for (auto _ : state) {
-    cuda_event_timer raii(state, true);
-    auto result = cudf::apply_boolean_mask(*source_table, mask->view());
-  }
+  state.exec(nvbench::exec_tag::sync, [&source_table, &mask](nvbench::launch& launch) {
+    cudf::apply_boolean_mask(*source_table, mask->view());
+  });
 
-  calculate_bandwidth<T>(state, num_columns);
+  set_throughputs(state);
 }
 
-template <class T>
-class ApplyBooleanMask : public cudf::benchmark {
- public:
-  using TypeParam = T;
-};
-
-#define ABM_BENCHMARK_DEFINE(name, type, n_columns)                                  \
-  BENCHMARK_TEMPLATE_DEFINE_F(ApplyBooleanMask, name, type)(::benchmark::State & st) \
-  {                                                                                  \
-    BM_apply_boolean_mask<TypeParam>(st, n_columns);                                 \
-  }
-
-ABM_BENCHMARK_DEFINE(float_1_col, float, 1);
-ABM_BENCHMARK_DEFINE(float_2_col, float, 2);
-ABM_BENCHMARK_DEFINE(float_4_col, float, 4);
-
-// shmoo 1, 2, 4 column float across percentage true
-BENCHMARK_REGISTER_F(ApplyBooleanMask, float_1_col)->Apply(percent_range);
-BENCHMARK_REGISTER_F(ApplyBooleanMask, float_2_col)->Apply(percent_range);
-BENCHMARK_REGISTER_F(ApplyBooleanMask, float_4_col)->Apply(percent_range);
-
-// shmoo 1, 2, 4 column float across column sizes with 50% true
-BENCHMARK_REGISTER_F(ApplyBooleanMask, float_1_col)->Apply(size_range);
-BENCHMARK_REGISTER_F(ApplyBooleanMask, float_2_col)->Apply(size_range);
-BENCHMARK_REGISTER_F(ApplyBooleanMask, float_4_col)->Apply(size_range);
-
-// spot benchmark other types
-ABM_BENCHMARK_DEFINE(int8_1_col, int8_t, 1);
-ABM_BENCHMARK_DEFINE(int16_1_col, int16_t, 1);
-ABM_BENCHMARK_DEFINE(int32_1_col, int32_t, 1);
-ABM_BENCHMARK_DEFINE(int64_1_col, int64_t, 1);
-ABM_BENCHMARK_DEFINE(double_1_col, double, 1);
-BENCHMARK_REGISTER_F(ApplyBooleanMask, int8_1_col)->Args({tenM, fifty_percent});
-BENCHMARK_REGISTER_F(ApplyBooleanMask, int16_1_col)->Args({tenM, fifty_percent});
-BENCHMARK_REGISTER_F(ApplyBooleanMask, int32_1_col)->Args({tenM, fifty_percent});
-BENCHMARK_REGISTER_F(ApplyBooleanMask, int64_1_col)->Args({tenM, fifty_percent});
-BENCHMARK_REGISTER_F(ApplyBooleanMask, double_1_col)->Args({tenM, fifty_percent});
+using data_type = nvbench::type_list<int32_t, int64_t, double, cudf::string_view>;
+NVBENCH_BENCH_TYPES(apply_boolean_mask_benchmark, NVBENCH_TYPE_AXES(data_type))
+  .set_name("apply_boolean_mask")
+  .set_type_axes_names({"type"})
+  .add_int64_axis("columns", {1, 4})
+  .add_int64_axis("rows", {100'000, 1'000'000, 10'000'000})
+  .add_int64_axis("hits_%", {10, 50, 100});

From 8c7af08073fba49c7a7e62cc30595b2962ae7e65 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Thu, 29 Aug 2024 18:18:05 -0500
Subject: [PATCH 751/842] Increase timeouts for couple of tests (#16692)

This PR increases timeouts for tests.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16692
---
 python/cudf/cudf/testing/_utils.py            | 25 ++++++++++---------
 python/cudf/cudf/tests/test_repr.py           |  2 +-
 .../cudf_pandas_tests/test_cudf_pandas.py     |  2 +-
 3 files changed, 15 insertions(+), 14 deletions(-)

diff --git a/python/cudf/cudf/testing/_utils.py b/python/cudf/cudf/testing/_utils.py
index 540f12c8382..8cb9efa873c 100644
--- a/python/cudf/cudf/testing/_utils.py
+++ b/python/cudf/cudf/testing/_utils.py
@@ -1,8 +1,8 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
 import itertools
-import signal
 import string
+import time
 from collections import abc
 from contextlib import contextmanager
 from decimal import Decimal
@@ -376,16 +376,17 @@ class cudf_timeout:
     Context manager to raise a TimeoutError after a specified number of seconds.
     """
 
-    def __init__(self, seconds, *, timeout_message=""):
-        self.seconds = int(seconds)
-        self.timeout_message = timeout_message
-
-    def _timeout_handler(self, signum, frame):
-        raise TimeoutError(self.timeout_message)
+    def __init__(self, timeout):
+        self.timeout = timeout
 
     def __enter__(self):
-        signal.signal(signal.SIGALRM, self._timeout_handler)
-        signal.alarm(self.seconds)
-
-    def __exit__(self, type, value, traceback):
-        signal.alarm(0)
+        self.start_time = time.perf_counter()
+
+    def __exit__(self, *args):
+        elapsed_time = (
+            time.perf_counter() - self.start_time
+        )  # Calculate elapsed time
+        if elapsed_time >= self.timeout:
+            raise TimeoutError(
+                f"Expected to finish in {self.timeout=} seconds but took {elapsed_time=} seconds"
+            )
diff --git a/python/cudf/cudf/tests/test_repr.py b/python/cudf/cudf/tests/test_repr.py
index 681b467f66c..95e19fae501 100644
--- a/python/cudf/cudf/tests/test_repr.py
+++ b/python/cudf/cudf/tests/test_repr.py
@@ -1488,7 +1488,7 @@ def test_large_unique_categories_repr():
     pi = pd.CategoricalIndex(range(100_000_000))
     gi = cudf.CategoricalIndex(range(100_000_000))
     expected_repr = repr(pi)
-    with utils.cudf_timeout(2, timeout_message="Failed to repr fast enough"):
+    with utils.cudf_timeout(6):
         actual_repr = repr(gi)
     assert expected_repr == actual_repr
 
diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
index 0827602852d..505d5d0b9cc 100644
--- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
+++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
@@ -1664,7 +1664,7 @@ def test_notebook_slow_repr():
         nb = nbformat.read(f, as_version=4)
 
     ep = ExecutePreprocessor(
-        timeout=20, kernel_name=jupyter_client.KernelManager().kernel_name
+        timeout=30, kernel_name=jupyter_client.KernelManager().kernel_name
     )
 
     try:

From 53f488ba2db10bead273b1e5eff5f1a07703a7ae Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 29 Aug 2024 14:09:14 -1000
Subject: [PATCH 752/842] Add type annotations to Index classes, utilize
 _from_column more (#16695)

* Add more type annotations to `index.py`
* More consistently use `Index._from_column` where appropriate
* Remove single used `Index._indices_of` in favor of just accessing the `Column._indicies_of` method

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16695
---
 python/cudf/cudf/core/index.py  | 165 +++++++++++++++-----------------
 python/cudf/cudf/core/series.py |   5 +-
 2 files changed, 83 insertions(+), 87 deletions(-)

diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 241a276ebe2..66d03682de4 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -46,7 +46,6 @@
 from cudf.core.column.column import as_column, concat_columns
 from cudf.core.column.string import StringMethods as StringMethods
 from cudf.core.dtypes import IntervalDtype
-from cudf.core.frame import Frame
 from cudf.core.join._join_helpers import _match_join_keys
 from cudf.core.mixins import BinaryOperand
 from cudf.core.single_column_frame import SingleColumnFrame
@@ -63,6 +62,8 @@
     from collections.abc import Generator, Iterable
     from datetime import tzinfo
 
+    from cudf.core.frame import Frame
+
 
 def ensure_index(index_like: Any) -> BaseIndex:
     """
@@ -316,7 +317,7 @@ def _num_rows(self) -> int:
 
     @cached_property  # type: ignore
     @_performance_tracking
-    def _values(self):
+    def _values(self) -> ColumnBase:
         if len(self) > 0:
             return column.as_column(self._range, dtype=self.dtype)
         else:
@@ -582,7 +583,7 @@ def __rmul__(self, other):
         return self.__mul__(other)
 
     @_performance_tracking
-    def _as_int_index(self):
+    def _as_int_index(self) -> Index:
         # Convert self to an integer index. This method is used to perform ops
         # that are not defined directly on RangeIndex.
         return cudf.Index._from_data(self._data)
@@ -870,12 +871,12 @@ def join(
 
     @property  # type: ignore
     @_performance_tracking
-    def _column(self):
+    def _column(self) -> ColumnBase:
         return self._as_int_index()._column
 
     @property  # type: ignore
     @_performance_tracking
-    def _columns(self):
+    def _columns(self) -> list[ColumnBase]:
         return self._as_int_index()._columns
 
     @property  # type: ignore
@@ -937,7 +938,7 @@ def notna(self) -> cupy.ndarray:
     notnull = isna
 
     @_performance_tracking
-    def _minmax(self, meth: str):
+    def _minmax(self, meth: str) -> int | float:
         no_steps = len(self) - 1
         if no_steps == -1:
             return np.nan
@@ -948,10 +949,10 @@ def _minmax(self, meth: str):
 
         return self.start + self.step * no_steps
 
-    def min(self):
+    def min(self) -> int | float:
         return self._minmax("min")
 
-    def max(self):
+    def max(self) -> int | float:
         return self._minmax("max")
 
     @property
@@ -1115,7 +1116,7 @@ def _from_data_like_self(
 
     @classmethod
     @_performance_tracking
-    def from_arrow(cls, obj):
+    def from_arrow(cls, obj) -> Index | cudf.MultiIndex:
         """Create from PyArrow Array/ChunkedArray.
 
         Parameters
@@ -1145,11 +1146,11 @@ def from_arrow(cls, obj):
             return cudf.MultiIndex.from_arrow(obj)
 
     @cached_property
-    def is_monotonic_increasing(self):
+    def is_monotonic_increasing(self) -> bool:
         return super().is_monotonic_increasing
 
     @cached_property
-    def is_monotonic_decreasing(self):
+    def is_monotonic_decreasing(self) -> bool:
         return super().is_monotonic_decreasing
 
     def _binaryop(
@@ -1191,7 +1192,7 @@ def _binaryop(
 
     @property  # type: ignore
     @_performance_tracking
-    def _values(self):
+    def _values(self) -> ColumnBase:
         return self._column
 
     @classmethod
@@ -1239,12 +1240,12 @@ def _concat(cls, objs):
         return result
 
     @_performance_tracking
-    def memory_usage(self, deep=False):
+    def memory_usage(self, deep: bool = False) -> int:
         return self._column.memory_usage
 
     @cached_property  # type: ignore
     @_performance_tracking
-    def is_unique(self):
+    def is_unique(self) -> bool:
         return self._column.is_unique
 
     @_performance_tracking
@@ -1271,7 +1272,7 @@ def equals(self, other) -> bool:
             return False
 
     @_performance_tracking
-    def copy(self, name=None, deep=False):
+    def copy(self, name: Hashable = None, deep: bool = False) -> Self:
         """
         Make a copy of this object.
 
@@ -1288,13 +1289,11 @@ def copy(self, name=None, deep=False):
         New index instance.
         """
         name = self.name if name is None else name
-
-        return _index_from_data(
-            {name: self._values.copy(True) if deep else self._values}
-        )
+        col = self._column.copy(deep=True) if deep else self._column
+        return type(self)._from_column(col, name=name)
 
     @_performance_tracking
-    def astype(self, dtype, copy: bool = True):
+    def astype(self, dtype, copy: bool = True) -> Index:
         return super().astype({self.name: dtype}, copy)
 
     @_performance_tracking
@@ -1405,7 +1404,7 @@ def get_loc(self, key) -> int | slice | cupy.ndarray:
         return mask
 
     @_performance_tracking
-    def __repr__(self):
+    def __repr__(self) -> str:
         max_seq_items = pd.get_option("max_seq_items") or len(self)
         mr = 0
         if 2 * max_seq_items < len(self):
@@ -1501,8 +1500,8 @@ def __repr__(self):
             keywords.append(
                 f"freq={self._freq._maybe_as_fast_pandas_offset().freqstr!r}"
             )
-        keywords = ", ".join(keywords)
-        lines.append(f"{prior_to_dtype} {keywords})")
+        joined_keywords = ", ".join(keywords)
+        lines.append(f"{prior_to_dtype} {joined_keywords})")
         return "\n".join(lines)
 
     @_performance_tracking
@@ -1518,47 +1517,47 @@ def dtype(self):
         """
         `dtype` of the underlying values in Index.
         """
-        return self._values.dtype
+        return self._column.dtype
 
     @_performance_tracking
-    def isna(self):
+    def isna(self) -> cupy.ndarray:
         return self._column.isnull().values
 
     isnull = isna
 
     @_performance_tracking
-    def notna(self):
+    def notna(self) -> cupy.ndarray:
         return self._column.notnull().values
 
     notnull = notna
 
-    def _is_numeric(self):
+    def _is_numeric(self) -> bool:
         return (
             isinstance(self._values, cudf.core.column.NumericalColumn)
             and self.dtype.kind != "b"
         )
 
-    def _is_boolean(self):
+    def _is_boolean(self) -> bool:
         return self.dtype.kind == "b"
 
-    def _is_integer(self):
+    def _is_integer(self) -> bool:
         return self.dtype.kind in "iu"
 
-    def _is_floating(self):
+    def _is_floating(self) -> bool:
         return self.dtype.kind == "f"
 
-    def _is_object(self):
-        return isinstance(self._values, cudf.core.column.StringColumn)
+    def _is_object(self) -> bool:
+        return isinstance(self._column, cudf.core.column.StringColumn)
 
-    def _is_categorical(self):
+    def _is_categorical(self) -> bool:
         return False
 
-    def _is_interval(self):
+    def _is_interval(self) -> bool:
         return False
 
     @property  # type: ignore
     @_performance_tracking
-    def hasnans(self):
+    def hasnans(self) -> bool:
         return self._column.has_nulls(include_nan=True)
 
     @_performance_tracking
@@ -1600,13 +1599,13 @@ def argsort(
             na_position=na_position,
         )
 
-    def repeat(self, repeats, axis=None):
-        return self._from_columns_like_self(
-            Frame._repeat([*self._columns], repeats, axis), self._column_names
-        )
+    def repeat(self, repeats, axis=None) -> Self:
+        result = super()._repeat([self._column], repeats, axis)[0]
+        result = result._with_type_metadata(self.dtype)
+        return type(self)._from_column(result, name=self.name)
 
     @_performance_tracking
-    def where(self, cond, other=None, inplace=False):
+    def where(self, cond, other=None, inplace=False) -> Index:
         result_col = super().where(cond, other, inplace)
         return self._mimic_inplace(
             _index_from_data({self.name: result_col}),
@@ -1614,14 +1613,14 @@ def where(self, cond, other=None, inplace=False):
         )
 
     @property
-    def values(self):
+    def values(self) -> cupy.ndarray:
         return self._column.values
 
-    def __contains__(self, item):
+    def __contains__(self, item) -> bool:
         hash(item)
-        return item in self._values
+        return item in self._column
 
-    def _clean_nulls_from_index(self):
+    def _clean_nulls_from_index(self) -> Index:
         if self._values.has_nulls():
             fill_value = (
                 str(cudf.NaT)
@@ -1635,8 +1634,8 @@ def _clean_nulls_from_index(self):
 
         return self
 
-    def any(self):
-        return self._values.any()
+    def any(self) -> bool:
+        return self._column.any()
 
     def to_pandas(
         self, *, nullable: bool = False, arrow_type: bool = False
@@ -1691,11 +1690,9 @@ def unique(self, level: int | None = None) -> Self:
             raise IndexError(
                 f"Too many levels: Index has only 1 level, not {level + 1}"
             )
-        return cudf.core.index._index_from_data(
-            {self.name: self._values.unique()}, name=self.name
-        )
+        return type(self)._from_column(self._column.unique(), name=self.name)
 
-    def isin(self, values, level=None):
+    def isin(self, values, level=None) -> cupy.ndarray:
         if level is not None and level > 0:
             raise IndexError(
                 f"Too many levels: Index has only 1 level, not {level + 1}"
@@ -1706,11 +1703,7 @@ def isin(self, values, level=None):
                 f"to isin(), you passed a {type(values).__name__}"
             )
 
-        return self._values.isin(values).values
-
-    def _indices_of(self, value):
-        """Return indices of value in index"""
-        return self._column.indices_of(value)
+        return self._column.isin(values).values
 
     @copy_docstring(StringMethods)  # type: ignore
     @property
@@ -2130,7 +2123,7 @@ def day_of_week(self) -> Index:
 
     @property  # type: ignore
     @_performance_tracking
-    def year(self):
+    def year(self) -> Index:
         """
         The year of the datetime.
 
@@ -2149,7 +2142,7 @@ def year(self):
 
     @property  # type: ignore
     @_performance_tracking
-    def month(self):
+    def month(self) -> Index:
         """
         The month as January=1, December=12.
 
@@ -2168,7 +2161,7 @@ def month(self):
 
     @property  # type: ignore
     @_performance_tracking
-    def day(self):
+    def day(self) -> Index:
         """
         The day of the datetime.
 
@@ -2187,7 +2180,7 @@ def day(self):
 
     @property  # type: ignore
     @_performance_tracking
-    def hour(self):
+    def hour(self) -> Index:
         """
         The hours of the datetime.
 
@@ -2208,7 +2201,7 @@ def hour(self):
 
     @property  # type: ignore
     @_performance_tracking
-    def minute(self):
+    def minute(self) -> Index:
         """
         The minutes of the datetime.
 
@@ -2229,7 +2222,7 @@ def minute(self):
 
     @property  # type: ignore
     @_performance_tracking
-    def second(self):
+    def second(self) -> Index:
         """
         The seconds of the datetime.
 
@@ -2250,7 +2243,7 @@ def second(self):
 
     @property  # type: ignore
     @_performance_tracking
-    def microsecond(self):
+    def microsecond(self) -> Index:
         """
         The microseconds of the datetime.
 
@@ -2281,7 +2274,7 @@ def microsecond(self):
 
     @property  # type: ignore
     @_performance_tracking
-    def nanosecond(self):
+    def nanosecond(self) -> Index:
         """
         The nanoseconds of the datetime.
 
@@ -2303,7 +2296,7 @@ def nanosecond(self):
 
     @property  # type: ignore
     @_performance_tracking
-    def weekday(self):
+    def weekday(self) -> Index:
         """
         The day of the week with Monday=0, Sunday=6.
 
@@ -2325,7 +2318,7 @@ def weekday(self):
 
     @property  # type: ignore
     @_performance_tracking
-    def dayofweek(self):
+    def dayofweek(self) -> Index:
         """
         The day of the week with Monday=0, Sunday=6.
 
@@ -2347,7 +2340,7 @@ def dayofweek(self):
 
     @property  # type: ignore
     @_performance_tracking
-    def dayofyear(self):
+    def dayofyear(self) -> Index:
         """
         The day of the year, from 1-365 in non-leap years and
         from 1-366 in leap years.
@@ -2370,7 +2363,7 @@ def dayofyear(self):
 
     @property  # type: ignore
     @_performance_tracking
-    def day_of_year(self):
+    def day_of_year(self) -> Index:
         """
         The day of the year, from 1-365 in non-leap years and
         from 1-366 in leap years.
@@ -2412,7 +2405,7 @@ def is_leap_year(self) -> cupy.ndarray:
 
     @property  # type: ignore
     @_performance_tracking
-    def quarter(self):
+    def quarter(self) -> Index:
         """
         Integer indicator for which quarter of the year the date belongs in.
 
@@ -2523,11 +2516,11 @@ def _get_dt_field(self, field: str) -> Index:
         )
         return Index._from_column(out_column, name=self.name)
 
-    def _is_boolean(self):
+    def _is_boolean(self) -> bool:
         return False
 
     @_performance_tracking
-    def ceil(self, freq):
+    def ceil(self, freq: str) -> Self:
         """
         Perform ceil operation on the data to the specified freq.
 
@@ -2558,7 +2551,7 @@ def ceil(self, freq):
         return type(self)._from_column(self._column.ceil(freq), name=self.name)
 
     @_performance_tracking
-    def floor(self, freq):
+    def floor(self, freq: str) -> Self:
         """
         Perform floor operation on the data to the specified freq.
 
@@ -2591,7 +2584,7 @@ def floor(self, freq):
         )
 
     @_performance_tracking
-    def round(self, freq):
+    def round(self, freq: str) -> Self:
         """
         Perform round operation on the data to the specified freq.
 
@@ -2635,7 +2628,7 @@ def tz_localize(
         tz: str | None,
         ambiguous: Literal["NaT"] = "NaT",
         nonexistent: Literal["NaT"] = "NaT",
-    ):
+    ) -> Self:
         """
         Localize timezone-naive data to timezone-aware data.
 
@@ -2682,7 +2675,7 @@ def tz_localize(
             result_col, name=self.name, freq=self._freq
         )
 
-    def tz_convert(self, tz: str | None):
+    def tz_convert(self, tz: str | None) -> Self:
         """
         Convert tz-aware datetimes from one time zone to another.
 
@@ -2717,7 +2710,7 @@ def tz_convert(self, tz: str | None):
         result_col = self._column.tz_convert(tz)
         return DatetimeIndex._from_column(result_col, name=self.name)
 
-    def repeat(self, repeats, axis=None):
+    def repeat(self, repeats, axis=None) -> Self:
         res = super().repeat(repeats, axis=axis)
         res._freq = None
         return res
@@ -2982,7 +2975,7 @@ def nanoseconds(self) -> cudf.Index:
 
     @property  # type: ignore
     @_performance_tracking
-    def components(self):
+    def components(self) -> cudf.DataFrame:
         """
         Return a dataframe of the components (days, hours, minutes,
         seconds, milliseconds, microseconds, nanoseconds) of the Timedeltas.
@@ -3003,7 +2996,7 @@ def inferred_freq(self):
         """
         raise NotImplementedError("inferred_freq is not yet supported")
 
-    def _is_boolean(self):
+    def _is_boolean(self) -> bool:
         return False
 
 
@@ -3122,16 +3115,16 @@ def codes(self) -> cudf.Index:
 
     @property  # type: ignore
     @_performance_tracking
-    def categories(self):
+    def categories(self) -> cudf.Index:
         """
         The categories of this categorical.
         """
         return self.dtype.categories
 
-    def _is_boolean(self):
+    def _is_boolean(self) -> bool:
         return False
 
-    def _is_categorical(self):
+    def _is_categorical(self) -> bool:
         return True
 
     def add_categories(self, new_categories) -> Self:
@@ -3440,7 +3433,7 @@ def __init__(
         super().__init__(interval_col, name=name)
 
     @property
-    def closed(self):
+    def closed(self) -> Literal["left", "right", "neither", "both"]:
         return self.dtype.closed
 
     @classmethod
@@ -3461,7 +3454,7 @@ def from_breaks(
         name=None,
         copy: bool = False,
         dtype=None,
-    ):
+    ) -> Self:
         """
         Construct an IntervalIndex from an array of splits.
 
@@ -3533,7 +3526,7 @@ def from_tuples(
         name=None,
         copy: bool = False,
         dtype=None,
-    ) -> IntervalIndex:
+    ) -> Self:
         piidx = pd.IntervalIndex.from_tuples(
             data, closed=closed, name=name, copy=copy, dtype=dtype
         )
@@ -3544,13 +3537,13 @@ def __getitem__(self, index):
             "Getting a scalar from an IntervalIndex is not yet supported"
         )
 
-    def _is_interval(self):
+    def _is_interval(self) -> bool:
         return True
 
-    def _is_boolean(self):
+    def _is_boolean(self) -> bool:
         return False
 
-    def _clean_nulls_from_index(self):
+    def _clean_nulls_from_index(self) -> Self:
         return self
 
     @property
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index a831a798772..837c6872258 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -377,7 +377,10 @@ def _loc_to_iloc(self, arg):
                     warnings.warn(warn_msg, FutureWarning)
                     return arg
             try:
-                indices = self._frame.index._indices_of(arg)
+                if isinstance(self._frame.index, RangeIndex):
+                    indices = self._frame.index._indices_of(arg)
+                else:
+                    indices = self._frame.index._column.indices_of(arg)
                 if (n := len(indices)) == 0:
                     raise KeyError("Label scalar is out of bounds")
                 elif n == 1:

From 8f2d68750f839326343db00debb5735fe14075d3 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Thu, 29 Aug 2024 17:40:20 -0700
Subject: [PATCH 753/842] Refactor dictionary encoding in PQ writer to migrate
 to the new `cuco::static_map` (#16541)

Part of #12261. This PR refactors the dictionary encoding in Parquet writers to migrate from `cuco::legacy::static_map` to `cuco::static_map` to build the dictionaries.

### Performance Results
The changes result in +0.08% average speed improvement and +16.22% average memory footprint increase (stems from the adjusted sizes by `cuco::make_window_extent` due to [prime gap](https://en.wikipedia.org/wiki/Prime_gap)) across the benchmark cases extended from #16591

Currently, we do see a roughly 8% speed improvement in map insert and find kernels which is counteracted by the map init and map collect kernels as they have to process 16.22% more slots. With a cuco bump, the average speed improvement will increase from 0.08% to +3% and the memory footprint change will go back from 16.22% to +0%.

### Hardware used for benchmarking
```
 `NVIDIA RTX 5880 Ada Generation`
* SM Version: 890 (PTX Version: 860)
* Number of SMs: 110
* SM Default Clock Rate: 18446744071874 MHz
* Global Memory: 23879 MiB Free / 48632 MiB Total
* Global Memory Bus Peak: 960 GB/sec (384-bit DDR @10001MHz)
* Max Shared Memory: 100 KiB/SM, 48 KiB/Block
* L2 Cache Size: 98304 KiB
* Maximum Active Blocks: 24/SM
* Maximum Active Threads: 1536/SM, 1024/Block
* Available Registers: 65536/SM, 65536/Block
* ECC Enabled: No
```

Authors:
  - Muhammad Haseeb (https://github.com/mhaseeb123)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/16541
---
 cpp/src/io/parquet/chunk_dict.cu   | 370 ++++++++++++++++-------------
 cpp/src/io/parquet/parquet_gpu.cuh |  73 +++++-
 cpp/src/io/parquet/parquet_gpu.hpp |  44 +---
 cpp/src/io/parquet/writer_impl.cu  |  42 ++--
 4 files changed, 295 insertions(+), 234 deletions(-)

diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu
index a43c6d4cbb6..17ccb73c0a8 100644
--- a/cpp/src/io/parquet/chunk_dict.cu
+++ b/cpp/src/io/parquet/chunk_dict.cu
@@ -22,6 +22,7 @@
 
 #include <rmm/exec_policy.hpp>
 
+#include <cuco/static_map_ref.cuh>
 #include <cuda/atomic>
 
 namespace cudf::io::parquet::detail {
@@ -30,28 +31,14 @@ namespace {
 constexpr int DEFAULT_BLOCK_SIZE = 256;
 }
 
-template <int block_size>
-CUDF_KERNEL void __launch_bounds__(block_size)
-  initialize_chunk_hash_maps_kernel(device_span<EncColumnChunk> chunks)
-{
-  auto const chunk = chunks[blockIdx.x];
-  auto const t     = threadIdx.x;
-  // fut: Now that per-chunk dict is same size as ck.num_values, try to not use one block per chunk
-  for (thread_index_type i = 0; i < chunk.dict_map_size; i += block_size) {
-    if (t + i < chunk.dict_map_size) {
-      new (&chunk.dict_map_slots[t + i].first) map_type::atomic_key_type{KEY_SENTINEL};
-      new (&chunk.dict_map_slots[t + i].second) map_type::atomic_mapped_type{VALUE_SENTINEL};
-    }
-  }
-}
-
 template <typename T>
 struct equality_functor {
   column_device_view const& col;
-  __device__ bool operator()(size_type lhs_idx, size_type rhs_idx)
+  __device__ bool operator()(key_type lhs_idx, key_type rhs_idx) const
   {
-    // We don't call this for nulls so this is fine
-    auto const equal = cudf::experimental::row::equality::nan_equal_physical_equality_comparator{};
+    // We don't call this for nulls so this is fine.
+    auto constexpr equal =
+      cudf::experimental::row::equality::nan_equal_physical_equality_comparator{};
     return equal(col.element<T>(lhs_idx), col.element<T>(rhs_idx));
   }
 };
@@ -59,38 +46,167 @@ struct equality_functor {
 template <typename T>
 struct hash_functor {
   column_device_view const& col;
-  __device__ auto operator()(size_type idx) const
+  uint32_t const seed = 0;
+  __device__ auto operator()(key_type idx) const
   {
-    return cudf::hashing::detail::MurmurHash3_x86_32<T>{}(col.element<T>(idx));
+    return cudf::hashing::detail::MurmurHash3_x86_32<T>{seed}(col.element<T>(idx));
   }
 };
 
+template <int block_size>
 struct map_insert_fn {
-  map_type::device_mutable_view& map;
+  storage_ref_type const& storage_ref;
+  EncColumnChunk* const& chunk;
 
   template <typename T>
-  __device__ bool operator()(column_device_view const& col, size_type i)
+  __device__ void operator()(size_type const s_start_value_idx, size_type const end_value_idx)
   {
     if constexpr (column_device_view::has_element_accessor<T>()) {
-      auto hash_fn     = hash_functor<T>{col};
-      auto equality_fn = equality_functor<T>{col};
-      return map.insert(std::pair(i, i), hash_fn, equality_fn);
+      using block_reduce = cub::BlockReduce<size_type, block_size>;
+      __shared__ typename block_reduce::TempStorage reduce_storage;
+
+      auto const col                     = chunk->col_desc;
+      column_device_view const& data_col = *col->leaf_column;
+      __shared__ size_type total_num_dict_entries;
+
+      using equality_fn_type = equality_functor<T>;
+      using hash_fn_type     = hash_functor<T>;
+      // Choosing `linear_probing` over `double_hashing` for slighhhtly better performance seen in
+      // benchmarks.
+      using probing_scheme_type = cuco::linear_probing<map_cg_size, hash_fn_type>;
+
+      // Make a view of the hash map.
+      auto hash_map_ref = cuco::static_map_ref{cuco::empty_key{KEY_SENTINEL},
+                                               cuco::empty_value{VALUE_SENTINEL},
+                                               equality_fn_type{data_col},
+                                               probing_scheme_type{hash_fn_type{data_col}},
+                                               cuco::thread_scope_block,
+                                               storage_ref};
+
+      // Create a map ref with `cuco::insert` operator
+      auto map_insert_ref = hash_map_ref.with_operators(cuco::insert);
+      auto const t        = threadIdx.x;
+
+      // Create atomic refs to the current chunk's num_dict_entries and uniq_data_size
+      cuda::atomic_ref<size_type, SCOPE> const chunk_num_dict_entries{chunk->num_dict_entries};
+      cuda::atomic_ref<size_type, SCOPE> const chunk_uniq_data_size{chunk->uniq_data_size};
+
+      // Note: Adjust the following loop to use `cg::tile<map_cg_size>` if needed in the future.
+      for (thread_index_type val_idx = s_start_value_idx + t; val_idx - t < end_value_idx;
+           val_idx += block_size) {
+        size_type is_unique      = 0;
+        size_type uniq_elem_size = 0;
+
+        // Check if this index is valid.
+        auto const is_valid =
+          val_idx < end_value_idx and val_idx < data_col.size() and data_col.is_valid(val_idx);
+
+        // Insert tile_val_idx to hash map and count successful insertions.
+        if (is_valid) {
+          // Insert the keys using a single thread for best performance for now.
+          is_unique      = map_insert_ref.insert(cuco::pair{val_idx, val_idx});
+          uniq_elem_size = [&]() -> size_type {
+            if (not is_unique) { return 0; }
+            switch (col->physical_type) {
+              case Type::INT32: return 4;
+              case Type::INT64: return 8;
+              case Type::INT96: return 12;
+              case Type::FLOAT: return 4;
+              case Type::DOUBLE: return 8;
+              case Type::BYTE_ARRAY: {
+                auto const col_type = data_col.type().id();
+                if (col_type == type_id::STRING) {
+                  // Strings are stored as 4 byte length + string bytes
+                  return 4 + data_col.element<string_view>(val_idx).size_bytes();
+                } else if (col_type == type_id::LIST) {
+                  // Binary is stored as 4 byte length + bytes
+                  return 4 +
+                         get_element<statistics::byte_array_view>(data_col, val_idx).size_bytes();
+                }
+                CUDF_UNREACHABLE(
+                  "Byte array only supports string and list<byte> column types for dictionary "
+                  "encoding!");
+              }
+              case Type::FIXED_LEN_BYTE_ARRAY:
+                if (data_col.type().id() == type_id::DECIMAL128) { return sizeof(__int128_t); }
+                CUDF_UNREACHABLE(
+                  "Fixed length byte array only supports decimal 128 column types for dictionary "
+                  "encoding!");
+              default: CUDF_UNREACHABLE("Unsupported type for dictionary encoding");
+            }
+          }();
+        }
+        // Reduce num_unique and uniq_data_size from all tiles.
+        auto num_unique = block_reduce(reduce_storage).Sum(is_unique);
+        __syncthreads();
+        auto uniq_data_size = block_reduce(reduce_storage).Sum(uniq_elem_size);
+        // The first thread in the block atomically updates total num_unique and uniq_data_size
+        if (t == 0) {
+          total_num_dict_entries =
+            chunk_num_dict_entries.fetch_add(num_unique, cuda::std::memory_order_relaxed);
+          total_num_dict_entries += num_unique;
+          chunk_uniq_data_size.fetch_add(uniq_data_size, cuda::std::memory_order_relaxed);
+        }
+        __syncthreads();
+
+        // Check if the num unique values in chunk has already exceeded max dict size and early exit
+        if (total_num_dict_entries > MAX_DICT_SIZE) { return; }
+      }  // for loop
     } else {
       CUDF_UNREACHABLE("Unsupported type to insert in map");
     }
   }
 };
 
+template <int block_size>
 struct map_find_fn {
-  map_type::device_view& map;
-
+  storage_ref_type const& storage_ref;
+  EncColumnChunk* const& chunk;
   template <typename T>
-  __device__ map_type::device_view::iterator operator()(column_device_view const& col, size_type i)
+  __device__ void operator()(size_type const s_start_value_idx,
+                             size_type const end_value_idx,
+                             size_type const s_ck_start_val_idx)
   {
     if constexpr (column_device_view::has_element_accessor<T>()) {
-      auto hash_fn     = hash_functor<T>{col};
-      auto equality_fn = equality_functor<T>{col};
-      return map.find(i, hash_fn, equality_fn);
+      auto const col                     = chunk->col_desc;
+      column_device_view const& data_col = *col->leaf_column;
+
+      using equality_fn_type = equality_functor<T>;
+      using hash_fn_type     = hash_functor<T>;
+      // Choosing `linear_probing` over `double_hashing` for slighhhtly better performance seen in
+      // benchmarks.
+      using probing_scheme_type = cuco::linear_probing<map_cg_size, hash_fn_type>;
+
+      // Make a view of the hash map.
+      auto hash_map_ref = cuco::static_map_ref{cuco::empty_key{KEY_SENTINEL},
+                                               cuco::empty_value{VALUE_SENTINEL},
+                                               equality_fn_type{data_col},
+                                               probing_scheme_type{hash_fn_type{data_col}},
+                                               cuco::thread_scope_block,
+                                               storage_ref};
+
+      // Create a map ref with `cuco::find` operator
+      auto const map_find_ref = hash_map_ref.with_operators(cuco::find);
+      auto const t            = threadIdx.x;
+
+      // Note: Adjust the following loop to use `cg::tiles<map_cg_size>` if needed in the future.
+      for (thread_index_type val_idx = s_start_value_idx + t; val_idx < end_value_idx;
+           val_idx += block_size) {
+        // Find the key using a single thread for best performance for now.
+        if (data_col.is_valid(val_idx)) {
+          // No need for atomic as this is not going to be modified by any other thread.
+          chunk->dict_index[val_idx - s_ck_start_val_idx] = [&]() {
+            auto const found_slot = map_find_ref.find(val_idx);
+
+            // Fail if we didn't find the previously inserted key.
+            cudf_assert(found_slot != map_find_ref.end() &&
+                        "Unable to find value in map in dictionary index construction");
+
+            // Return the found value.
+            return found_slot->second;
+          }();
+        }
+      }
     } else {
       CUDF_UNREACHABLE("Unsupported type to find in map");
     }
@@ -99,124 +215,61 @@ struct map_find_fn {
 
 template <int block_size>
 CUDF_KERNEL void __launch_bounds__(block_size)
-  populate_chunk_hash_maps_kernel(cudf::detail::device_2dspan<PageFragment const> frags)
+  populate_chunk_hash_maps_kernel(device_span<window_type> const map_storage,
+                                  cudf::detail::device_2dspan<PageFragment const> frags)
 {
-  auto col_idx = blockIdx.y;
-  auto block_x = blockIdx.x;
-  auto t       = threadIdx.x;
-  auto frag    = frags[col_idx][block_x];
-  auto chunk   = frag.chunk;
-  auto col     = chunk->col_desc;
+  auto const col_idx = blockIdx.y;
+  auto const block_x = blockIdx.x;
+  auto const frag    = frags[col_idx][block_x];
+  auto chunk         = frag.chunk;
+  auto col           = chunk->col_desc;
 
   if (not chunk->use_dictionary) { return; }
 
-  using block_reduce = cub::BlockReduce<size_type, block_size>;
-  __shared__ typename block_reduce::TempStorage reduce_storage;
-
   size_type start_row = frag.start_row;
   size_type end_row   = frag.start_row + frag.num_rows;
 
-  // Find the bounds of values in leaf column to be inserted into the map for current chunk
+  // Find the bounds of values in leaf column to be inserted into the map for current chunk.
   size_type const s_start_value_idx = row_to_value_idx(start_row, *col);
   size_type const end_value_idx     = row_to_value_idx(end_row, *col);
 
   column_device_view const& data_col = *col->leaf_column;
-
-  // Make a view of the hash map
-  auto hash_map_mutable = map_type::device_mutable_view(chunk->dict_map_slots,
-                                                        chunk->dict_map_size,
-                                                        cuco::empty_key{KEY_SENTINEL},
-                                                        cuco::empty_value{VALUE_SENTINEL});
-
-  __shared__ size_type total_num_dict_entries;
-  thread_index_type val_idx = s_start_value_idx + t;
-  while (val_idx - block_size < end_value_idx) {
-    auto const is_valid =
-      val_idx < end_value_idx and val_idx < data_col.size() and data_col.is_valid(val_idx);
-
-    // insert element at val_idx to hash map and count successful insertions
-    size_type is_unique      = 0;
-    size_type uniq_elem_size = 0;
-    if (is_valid) {
-      is_unique =
-        type_dispatcher(data_col.type(), map_insert_fn{hash_map_mutable}, data_col, val_idx);
-      uniq_elem_size = [&]() -> size_type {
-        if (not is_unique) { return 0; }
-        switch (col->physical_type) {
-          case Type::INT32: return 4;
-          case Type::INT64: return 8;
-          case Type::INT96: return 12;
-          case Type::FLOAT: return 4;
-          case Type::DOUBLE: return 8;
-          case Type::BYTE_ARRAY: {
-            auto const col_type = data_col.type().id();
-            if (col_type == type_id::STRING) {
-              // Strings are stored as 4 byte length + string bytes
-              return 4 + data_col.element<string_view>(val_idx).size_bytes();
-            } else if (col_type == type_id::LIST) {
-              // Binary is stored as 4 byte length + bytes
-              return 4 + get_element<statistics::byte_array_view>(data_col, val_idx).size_bytes();
-            }
-            CUDF_UNREACHABLE(
-              "Byte array only supports string and list<byte> column types for dictionary "
-              "encoding!");
-          }
-          case Type::FIXED_LEN_BYTE_ARRAY:
-            if (data_col.type().id() == type_id::DECIMAL128) { return sizeof(__int128_t); }
-            CUDF_UNREACHABLE(
-              "Fixed length byte array only supports decimal 128 column types for dictionary "
-              "encoding!");
-          default: CUDF_UNREACHABLE("Unsupported type for dictionary encoding");
-        }
-      }();
-    }
-
-    auto num_unique = block_reduce(reduce_storage).Sum(is_unique);
-    __syncthreads();
-    auto uniq_data_size = block_reduce(reduce_storage).Sum(uniq_elem_size);
-    if (t == 0) {
-      total_num_dict_entries = atomicAdd(&chunk->num_dict_entries, num_unique);
-      total_num_dict_entries += num_unique;
-      atomicAdd(&chunk->uniq_data_size, uniq_data_size);
-    }
-    __syncthreads();
-
-    // Check if the num unique values in chunk has already exceeded max dict size and early exit
-    if (total_num_dict_entries > MAX_DICT_SIZE) { return; }
-
-    val_idx += block_size;
-  }  // while
+  storage_ref_type const storage_ref{chunk->dict_map_size,
+                                     map_storage.data() + chunk->dict_map_offset};
+  type_dispatcher(data_col.type(),
+                  map_insert_fn<block_size>{storage_ref, chunk},
+                  s_start_value_idx,
+                  end_value_idx);
 }
 
 template <int block_size>
 CUDF_KERNEL void __launch_bounds__(block_size)
-  collect_map_entries_kernel(device_span<EncColumnChunk> chunks)
+  collect_map_entries_kernel(device_span<window_type> const map_storage,
+                             device_span<EncColumnChunk> chunks)
 {
   auto& chunk = chunks[blockIdx.x];
   if (not chunk.use_dictionary) { return; }
 
-  auto t   = threadIdx.x;
-  auto map = map_type::device_view(chunk.dict_map_slots,
-                                   chunk.dict_map_size,
-                                   cuco::empty_key{KEY_SENTINEL},
-                                   cuco::empty_value{VALUE_SENTINEL});
-
-  __shared__ cuda::atomic<size_type, cuda::thread_scope_block> counter;
+  auto t = threadIdx.x;
+  __shared__ cuda::atomic<size_type, SCOPE> counter;
   using cuda::std::memory_order_relaxed;
-  if (t == 0) { new (&counter) cuda::atomic<size_type, cuda::thread_scope_block>{0}; }
+  if (t == 0) { new (&counter) cuda::atomic<size_type, SCOPE>{0}; }
   __syncthreads();
-  for (size_type i = 0; i < chunk.dict_map_size; i += block_size) {
-    if (t + i < chunk.dict_map_size) {
-      auto* slot = reinterpret_cast<map_type::value_type*>(map.begin_slot() + t + i);
-      auto key   = slot->first;
+
+  // Iterate over all windows in the map.
+  for (; t < chunk.dict_map_size; t += block_size) {
+    auto window = map_storage.data() + chunk.dict_map_offset + t;
+    // Collect all slots from each window.
+    for (auto& slot : *window) {
+      auto const key = slot.first;
       if (key != KEY_SENTINEL) {
-        auto loc = counter.fetch_add(1, memory_order_relaxed);
+        auto const loc = counter.fetch_add(1, memory_order_relaxed);
         cudf_assert(loc < MAX_DICT_SIZE && "Number of filled slots exceeds max dict size");
         chunk.dict_data[loc] = key;
-        // If sorting dict page ever becomes a hard requirement, enable the following statement and
-        // add a dict sorting step before storing into the slot's second field.
-        // chunk.dict_data_idx[loc] = t + i;
-        slot->second = loc;
+        // If sorting dict page ever becomes a hard requirement, enable the following statement
+        // and add a dict sorting step before storing into the slot's second field.
+        // chunk.dict_data_idx[loc] = idx;
+        slot.second = loc;
       }
     }
   }
@@ -224,75 +277,60 @@ CUDF_KERNEL void __launch_bounds__(block_size)
 
 template <int block_size>
 CUDF_KERNEL void __launch_bounds__(block_size)
-  get_dictionary_indices_kernel(cudf::detail::device_2dspan<PageFragment const> frags)
+  get_dictionary_indices_kernel(device_span<window_type> const map_storage,
+                                cudf::detail::device_2dspan<PageFragment const> frags)
 {
-  auto col_idx = blockIdx.y;
-  auto block_x = blockIdx.x;
-  auto t       = threadIdx.x;
-  auto frag    = frags[col_idx][block_x];
-  auto chunk   = frag.chunk;
-  auto col     = chunk->col_desc;
+  auto const col_idx = blockIdx.y;
+  auto const block_x = blockIdx.x;
+  auto const frag    = frags[col_idx][block_x];
+  auto chunk         = frag.chunk;
 
   if (not chunk->use_dictionary) { return; }
 
   size_type start_row = frag.start_row;
   size_type end_row   = frag.start_row + frag.num_rows;
 
+  auto const col = chunk->col_desc;
   // Find the bounds of values in leaf column to be searched in the map for current chunk
   auto const s_start_value_idx  = row_to_value_idx(start_row, *col);
   auto const s_ck_start_val_idx = row_to_value_idx(chunk->start_row, *col);
   auto const end_value_idx      = row_to_value_idx(end_row, *col);
 
   column_device_view const& data_col = *col->leaf_column;
-
-  auto map = map_type::device_view(chunk->dict_map_slots,
-                                   chunk->dict_map_size,
-                                   cuco::empty_key{KEY_SENTINEL},
-                                   cuco::empty_value{VALUE_SENTINEL});
-
-  thread_index_type val_idx = s_start_value_idx + t;
-  while (val_idx < end_value_idx) {
-    if (data_col.is_valid(val_idx)) {
-      auto found_slot = type_dispatcher(data_col.type(), map_find_fn{map}, data_col, val_idx);
-      cudf_assert(found_slot != map.end() &&
-                  "Unable to find value in map in dictionary index construction");
-      if (found_slot != map.end()) {
-        // No need for atomic as this is not going to be modified by any other thread
-        auto* val_ptr = reinterpret_cast<map_type::mapped_type*>(&found_slot->second);
-        chunk->dict_index[val_idx - s_ck_start_val_idx] = *val_ptr;
-      }
-    }
-
-    val_idx += block_size;
-  }
-}
-
-void initialize_chunk_hash_maps(device_span<EncColumnChunk> chunks, rmm::cuda_stream_view stream)
-{
-  constexpr int block_size = 1024;
-  initialize_chunk_hash_maps_kernel<block_size>
-    <<<chunks.size(), block_size, 0, stream.value()>>>(chunks);
+  storage_ref_type const storage_ref{chunk->dict_map_size,
+                                     map_storage.data() + chunk->dict_map_offset};
+
+  type_dispatcher(data_col.type(),
+                  map_find_fn<block_size>{storage_ref, chunk},
+                  s_start_value_idx,
+                  end_value_idx,
+                  s_ck_start_val_idx);
 }
 
-void populate_chunk_hash_maps(cudf::detail::device_2dspan<PageFragment const> frags,
+void populate_chunk_hash_maps(device_span<window_type> const map_storage,
+                              cudf::detail::device_2dspan<PageFragment const> frags,
                               rmm::cuda_stream_view stream)
 {
   dim3 const dim_grid(frags.size().second, frags.size().first);
   populate_chunk_hash_maps_kernel<DEFAULT_BLOCK_SIZE>
-    <<<dim_grid, DEFAULT_BLOCK_SIZE, 0, stream.value()>>>(frags);
+    <<<dim_grid, DEFAULT_BLOCK_SIZE, 0, stream.value()>>>(map_storage, frags);
 }
 
-void collect_map_entries(device_span<EncColumnChunk> chunks, rmm::cuda_stream_view stream)
+void collect_map_entries(device_span<window_type> const map_storage,
+                         device_span<EncColumnChunk> chunks,
+                         rmm::cuda_stream_view stream)
 {
   constexpr int block_size = 1024;
-  collect_map_entries_kernel<block_size><<<chunks.size(), block_size, 0, stream.value()>>>(chunks);
+  collect_map_entries_kernel<block_size>
+    <<<chunks.size(), block_size, 0, stream.value()>>>(map_storage, chunks);
 }
 
-void get_dictionary_indices(cudf::detail::device_2dspan<PageFragment const> frags,
+void get_dictionary_indices(device_span<window_type> const map_storage,
+                            cudf::detail::device_2dspan<PageFragment const> frags,
                             rmm::cuda_stream_view stream)
 {
   dim3 const dim_grid(frags.size().second, frags.size().first);
   get_dictionary_indices_kernel<DEFAULT_BLOCK_SIZE>
-    <<<dim_grid, DEFAULT_BLOCK_SIZE, 0, stream.value()>>>(frags);
+    <<<dim_grid, DEFAULT_BLOCK_SIZE, 0, stream.value()>>>(map_storage, frags);
 }
 }  // namespace cudf::io::parquet::detail
diff --git a/cpp/src/io/parquet/parquet_gpu.cuh b/cpp/src/io/parquet/parquet_gpu.cuh
index e3c44c78898..7c09764da2d 100644
--- a/cpp/src/io/parquet/parquet_gpu.cuh
+++ b/cpp/src/io/parquet/parquet_gpu.cuh
@@ -18,25 +18,37 @@
 
 #include "parquet_gpu.hpp"
 
+#include <cudf/detail/cuco_helpers.hpp>
 #include <cudf/lists/lists_column_device_view.cuh>
 #include <cudf/types.hpp>
 
-#include <cuco/static_map.cuh>
+#include <cuco/pair.cuh>
+#include <cuco/storage.cuh>
 
 namespace cudf::io::parquet::detail {
 
-auto constexpr KEY_SENTINEL   = size_type{-1};
-auto constexpr VALUE_SENTINEL = size_type{-1};
+using key_type    = size_type;
+using mapped_type = size_type;
+using slot_type   = cuco::pair<key_type, mapped_type>;
 
-using map_type = cuco::legacy::static_map<size_type, size_type>;
+auto constexpr map_cg_size =
+  1;  ///< A CUDA Cooperative Group of 1 thread (set for best performance) to handle each subset.
+      ///< Note: Adjust insert and find loops to use `cg::tile<map_cg_size>` if increasing this.
+auto constexpr window_size =
+  1;  ///< Number of concurrent slots (set for best performance) handled by each thread.
+auto constexpr occupancy_factor = 1.43f;  ///< cuCollections suggests using a hash map of size
+                                          ///< N * (1/0.7) = 1.43 to target a 70% occupancy factor.
 
-/**
- * @brief The alias of `map_type::pair_atomic_type` class.
- *
- * Declare this struct by trivial subclassing instead of type aliasing so we can have forward
- * declaration of this struct somewhere else.
- */
-struct slot_type : public map_type::pair_atomic_type {};
+auto constexpr KEY_SENTINEL   = key_type{-1};
+auto constexpr VALUE_SENTINEL = mapped_type{-1};
+auto constexpr SCOPE          = cuda::thread_scope_block;
+
+using storage_type     = cuco::aow_storage<slot_type,
+                                       window_size,
+                                       cuco::extent<std::size_t>,
+                                       cudf::detail::cuco_allocator<char>>;
+using storage_ref_type = typename storage_type::ref_type;
+using window_type      = typename storage_type::window_type;
 
 /**
  * @brief Return the byte length of parquet dtypes that are physically represented by INT32
@@ -81,4 +93,43 @@ inline size_type __device__ row_to_value_idx(size_type idx,
   return idx;
 }
 
+/**
+ * @brief Insert chunk values into their respective hash maps
+ *
+ * @param map_storage Bulk hashmap storage
+ * @param frags Column fragments
+ * @param stream CUDA stream to use
+ */
+void populate_chunk_hash_maps(device_span<window_type> const map_storage,
+                              cudf::detail::device_2dspan<PageFragment const> frags,
+                              rmm::cuda_stream_view stream);
+
+/**
+ * @brief Compact dictionary hash map entries into chunk.dict_data
+ *
+ * @param map_storage Bulk hashmap storage
+ * @param chunks Flat span of chunks to compact hash maps for
+ * @param stream CUDA stream to use
+ */
+void collect_map_entries(device_span<window_type> const map_storage,
+                         device_span<EncColumnChunk> chunks,
+                         rmm::cuda_stream_view stream);
+
+/**
+ * @brief Get the Dictionary Indices for each row
+ *
+ * For each row of a chunk, gets the indices into chunk.dict_data which contains the value otherwise
+ * stored in input column [row]. Stores these indices into chunk.dict_index.
+ *
+ * Since dict_data itself contains indices into the original cudf column, this means that
+ * col[row] == col[dict_data[dict_index[row - chunk.start_row]]]
+ *
+ * @param map_storage Bulk hashmap storage
+ * @param frags Column fragments
+ * @param stream CUDA stream to use
+ */
+void get_dictionary_indices(device_span<window_type> const map_storage,
+                            cudf::detail::device_2dspan<PageFragment const> frags,
+                            rmm::cuda_stream_view stream);
+
 }  // namespace cudf::io::parquet::detail
diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp
index 8f52f073833..125d35f6499 100644
--- a/cpp/src/io/parquet/parquet_gpu.hpp
+++ b/cpp/src/io/parquet/parquet_gpu.hpp
@@ -514,7 +514,6 @@ constexpr unsigned int kDictHashBits = 16;
 constexpr size_t kDictScratchSize    = (1 << kDictHashBits) * sizeof(uint32_t);
 
 struct EncPage;
-struct slot_type;
 
 // convert Encoding to a mask value
 constexpr uint32_t encoding_to_mask(Encoding encoding)
@@ -560,7 +559,8 @@ struct EncColumnChunk {
   uint8_t is_compressed;    //!< Nonzero if the chunk uses compression
   uint32_t dictionary_size;    //!< Size of dictionary page including header
   uint32_t ck_stat_size;       //!< Size of chunk-level statistics (included in 1st page header)
-  slot_type* dict_map_slots;   //!< Hash map storage for calculating dict encoding for this chunk
+  uint32_t dict_map_offset;    //!< Offset of the hash map storage for calculating dict encoding for
+                               //!< this chunk
   size_type dict_map_size;     //!< Size of dict_map_slots
   size_type num_dict_entries;  //!< Total number of entries in dictionary
   size_type
@@ -1001,46 +1001,6 @@ void InitFragmentStatistics(device_span<statistics_group> groups,
                             device_span<PageFragment const> fragments,
                             rmm::cuda_stream_view stream);
 
-/**
- * @brief Initialize per-chunk hash maps used for dictionary with sentinel values
- *
- * @param chunks Flat span of chunks to initialize hash maps for
- * @param stream CUDA stream to use
- */
-void initialize_chunk_hash_maps(device_span<EncColumnChunk> chunks, rmm::cuda_stream_view stream);
-
-/**
- * @brief Insert chunk values into their respective hash maps
- *
- * @param frags Column fragments
- * @param stream CUDA stream to use
- */
-void populate_chunk_hash_maps(cudf::detail::device_2dspan<PageFragment const> frags,
-                              rmm::cuda_stream_view stream);
-
-/**
- * @brief Compact dictionary hash map entries into chunk.dict_data
- *
- * @param chunks Flat span of chunks to compact hash maps for
- * @param stream CUDA stream to use
- */
-void collect_map_entries(device_span<EncColumnChunk> chunks, rmm::cuda_stream_view stream);
-
-/**
- * @brief Get the Dictionary Indices for each row
- *
- * For each row of a chunk, gets the indices into chunk.dict_data which contains the value otherwise
- * stored in input column [row]. Stores these indices into chunk.dict_index.
- *
- * Since dict_data itself contains indices into the original cudf column, this means that
- * col[row] == col[dict_data[dict_index[row - chunk.start_row]]]
- *
- * @param frags Column fragments
- * @param stream CUDA stream to use
- */
-void get_dictionary_indices(cudf::detail::device_2dspan<PageFragment const> frags,
-                            rmm::cuda_stream_view stream);
-
 /**
  * @brief Launches kernel for initializing encoder data pages
  *
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index 74992aa733f..46c3151c731 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -1285,10 +1285,10 @@ build_chunk_dictionaries(hostdevice_2dvector<EncColumnChunk>& chunks,
     return std::pair(std::move(dict_data), std::move(dict_index));
   }
 
-  // Allocate slots for each chunk
-  std::vector<rmm::device_uvector<slot_type>> hash_maps_storage;
-  hash_maps_storage.reserve(h_chunks.size());
-  for (auto& chunk : h_chunks) {
+  // Variable to keep track of the current total map storage size
+  size_t total_map_storage_size = 0;
+  // Populate dict offsets and sizes for each chunk that need to build a dictionary.
+  std::for_each(h_chunks.begin(), h_chunks.end(), [&](auto& chunk) {
     auto const& chunk_col_desc = col_desc[chunk.col_desc_id];
     auto const is_requested_non_dict =
       chunk_col_desc.requested_encoding != column_encoding::USE_DEFAULT &&
@@ -1300,19 +1300,31 @@ build_chunk_dictionaries(hostdevice_2dvector<EncColumnChunk>& chunks,
       chunk.use_dictionary = false;
     } else {
       chunk.use_dictionary = true;
-      // cuCollections suggests using a hash map of size N * (1/0.7) = num_values * 1.43
-      // https://github.com/NVIDIA/cuCollections/blob/3a49fc71/include/cuco/static_map.cuh#L190-L193
-      auto& inserted_map   = hash_maps_storage.emplace_back(chunk.num_values * 1.43, stream);
-      chunk.dict_map_slots = inserted_map.data();
-      chunk.dict_map_size  = inserted_map.size();
+      chunk.dict_map_size =
+        static_cast<cudf::size_type>(cuco::make_window_extent<map_cg_size, window_size>(
+          static_cast<cudf::size_type>(occupancy_factor * chunk.num_values)));
+      chunk.dict_map_offset = total_map_storage_size;
+      total_map_storage_size += chunk.dict_map_size;
     }
-  }
+  });
 
-  chunks.host_to_device_async(stream);
+  // No chunk needs to create a dictionary, exit early
+  if (total_map_storage_size == 0) { return {std::move(dict_data), std::move(dict_index)}; }
 
-  initialize_chunk_hash_maps(chunks.device_view().flat_view(), stream);
-  populate_chunk_hash_maps(frags, stream);
+  // Create a single bulk storage used by all sub-dictionaries
+  auto map_storage = storage_type{
+    total_map_storage_size,
+    cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream}};
+  // Create a span of non-const map_storage as map_storage_ref takes in a non-const pointer.
+  device_span<window_type> const map_storage_data{map_storage.data(), total_map_storage_size};
 
+  // Synchronize
+  chunks.host_to_device_async(stream);
+  // Initialize storage with the given sentinel
+  map_storage.initialize_async({KEY_SENTINEL, VALUE_SENTINEL}, {stream.value()});
+  // Populate the hash map for each chunk
+  populate_chunk_hash_maps(map_storage_data, frags, stream);
+  // Synchronize again
   chunks.device_to_host_sync(stream);
 
   // Make decision about which chunks have dictionary
@@ -1372,8 +1384,8 @@ build_chunk_dictionaries(hostdevice_2dvector<EncColumnChunk>& chunks,
     chunk.dict_index          = inserted_dict_index.data();
   }
   chunks.host_to_device_async(stream);
-  collect_map_entries(chunks.device_view().flat_view(), stream);
-  get_dictionary_indices(frags, stream);
+  collect_map_entries(map_storage_data, chunks.device_view().flat_view(), stream);
+  get_dictionary_indices(map_storage_data, frags, stream);
 
   return std::pair(std::move(dict_data), std::move(dict_index));
 }

From f932bf9c62f73aabee2ac094180036399ce88dcf Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 29 Aug 2024 15:28:37 -1000
Subject: [PATCH 754/842] Fix Series.to_frame(name=None) setting a None name
 (#16698)

In pandas 2.0, `to_frame(name=None)` allowed the resulting column name to be `None` https://github.com/pandas-dev/pandas/pull/45523

Looks like based on the current default of `cudf.Series.to_frame`, this behavior was not reflected.

Additionally, created a `SingleColumnFrame._to_frame` to more easily share the logic between `Series.to_frame` and `Index.to_frame`

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16698
---
 python/cudf/cudf/core/_base_index.py         | 58 --------------------
 python/cudf/cudf/core/index.py               | 57 +++++++++++++++++++
 python/cudf/cudf/core/series.py              | 12 +---
 python/cudf/cudf/core/single_column_frame.py | 11 ++++
 python/cudf/cudf/tests/test_series.py        |  7 +++
 5 files changed, 77 insertions(+), 68 deletions(-)

diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py
index a224e0ce0d0..ff114474aa4 100644
--- a/python/cudf/cudf/core/_base_index.py
+++ b/python/cudf/cudf/core/_base_index.py
@@ -798,64 +798,6 @@ def fillna(self, value, downcast=None):
 
         return super().fillna(value=value)
 
-    def to_frame(self, index=True, name=no_default):
-        """Create a DataFrame with a column containing this Index
-
-        Parameters
-        ----------
-        index : boolean, default True
-            Set the index of the returned DataFrame as the original Index
-        name : object, defaults to index.name
-            The passed name should substitute for the index name (if it has
-            one).
-
-        Returns
-        -------
-        DataFrame
-            DataFrame containing the original Index data.
-
-        See Also
-        --------
-        Index.to_series : Convert an Index to a Series.
-        Series.to_frame : Convert Series to DataFrame.
-
-        Examples
-        --------
-        >>> import cudf
-        >>> idx = cudf.Index(['Ant', 'Bear', 'Cow'], name='animal')
-        >>> idx.to_frame()
-               animal
-        animal
-        Ant       Ant
-        Bear     Bear
-        Cow       Cow
-
-        By default, the original Index is reused. To enforce a new Index:
-
-        >>> idx.to_frame(index=False)
-            animal
-        0   Ant
-        1  Bear
-        2   Cow
-
-        To override the name of the resulting column, specify `name`:
-
-        >>> idx.to_frame(index=False, name='zoo')
-            zoo
-        0   Ant
-        1  Bear
-        2   Cow
-        """
-
-        if name is no_default:
-            col_name = 0 if self.name is None else self.name
-        else:
-            col_name = name
-
-        return cudf.DataFrame(
-            {col_name: self._values}, index=self if index else None
-        )
-
     def to_arrow(self):
         """Convert to a suitable Arrow object."""
         raise NotImplementedError
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 66d03682de4..b2bd20c4982 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -529,6 +529,11 @@ def to_pandas(
             name=self.name,
         )
 
+    def to_frame(
+        self, index: bool = True, name: Hashable = no_default
+    ) -> cudf.DataFrame:
+        return self._as_int_index().to_frame(index=index, name=name)
+
     @property
     def is_unique(self) -> bool:
         return True
@@ -1646,6 +1651,58 @@ def to_pandas(
         result.name = self.name
         return result
 
+    def to_frame(
+        self, index: bool = True, name: Hashable = no_default
+    ) -> cudf.DataFrame:
+        """Create a DataFrame with a column containing this Index
+
+        Parameters
+        ----------
+        index : boolean, default True
+            Set the index of the returned DataFrame as the original Index
+        name : object, defaults to index.name
+            The passed name should substitute for the index name (if it has
+            one).
+
+        Returns
+        -------
+        DataFrame
+            DataFrame containing the original Index data.
+
+        See Also
+        --------
+        Index.to_series : Convert an Index to a Series.
+        Series.to_frame : Convert Series to DataFrame.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> idx = cudf.Index(['Ant', 'Bear', 'Cow'], name='animal')
+        >>> idx.to_frame()
+               animal
+        animal
+        Ant       Ant
+        Bear     Bear
+        Cow       Cow
+
+        By default, the original Index is reused. To enforce a new Index:
+
+        >>> idx.to_frame(index=False)
+            animal
+        0   Ant
+        1  Bear
+        2   Cow
+
+        To override the name of the resulting column, specify `name`:
+
+        >>> idx.to_frame(index=False, name='zoo')
+            zoo
+        0   Ant
+        1  Bear
+        2   Cow
+        """
+        return self._to_frame(name=name, index=self if index else None)
+
     def append(self, other):
         if is_list_like(other):
             to_concat = [self]
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 837c6872258..aadbd80f4b4 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -1160,7 +1160,7 @@ def reset_index(
         )
 
     @_performance_tracking
-    def to_frame(self, name=None):
+    def to_frame(self, name: abc.Hashable = no_default) -> cudf.DataFrame:
         """Convert Series into a DataFrame
 
         Parameters
@@ -1192,15 +1192,7 @@ def to_frame(self, name=None):
         13   <NA>
         15      d
         """  # noqa: E501
-
-        if name is not None:
-            col = name
-        elif self.name is None:
-            col = 0
-        else:
-            col = self.name
-
-        return cudf.DataFrame({col: self._column}, index=self.index)
+        return self._to_frame(name=name, index=self.index)
 
     @_performance_tracking
     def memory_usage(self, index=True, deep=False):
diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py
index 55dda34a576..0e66f383ca0 100644
--- a/python/cudf/cudf/core/single_column_frame.py
+++ b/python/cudf/cudf/core/single_column_frame.py
@@ -158,6 +158,17 @@ def to_arrow(self) -> pa.Array:
         """
         return self._column.to_arrow()
 
+    def _to_frame(
+        self, name: Hashable, index: cudf.Index | None
+    ) -> cudf.DataFrame:
+        """Helper function for Series.to_frame, Index.to_frame"""
+        if name is no_default:
+            col_name = 0 if self.name is None else self.name
+        else:
+            col_name = name
+        ca = ColumnAccessor({col_name: self._column}, verify=False)
+        return cudf.DataFrame._from_data(ca, index=index)
+
     @property  # type: ignore
     @_performance_tracking
     def is_unique(self) -> bool:
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index 8d673e23ab2..a24002dc38e 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -2557,6 +2557,13 @@ def test_series_arrow_list_types_roundtrip():
             cudf.from_pandas(pdf)
 
 
+@pytest.mark.parametrize("base_name", [None, "a"])
+def test_series_to_frame_none_name(base_name):
+    result = cudf.Series(range(1), name=base_name).to_frame(name=None)
+    expected = pd.Series(range(1), name=base_name).to_frame(name=None)
+    assert_eq(result, expected)
+
+
 @pytest.mark.parametrize("klass", [cudf.Index, cudf.Series])
 @pytest.mark.parametrize(
     "data", [pa.array([float("nan")]), pa.chunked_array([[float("nan")]])]

From 62a53b34f6c5c9145e908403d674cc6c16bab7f2 Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Thu, 29 Aug 2024 21:54:02 -0400
Subject: [PATCH 755/842] [FEA] Add third-party library integration testing of
 cudf.pandas to cudf (#16645)

Closes #16580

Authors:
  - Matthew Murray (https://github.com/Matt711)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16645
---
 .github/workflows/test.yaml                   |  11 +
 ci/cudf_pandas_scripts/run_tests.sh           |   3 +
 .../ci_run_library_tests.sh                   |  56 +++
 .../third-party-integration/test.sh           |  83 ++++
 .../dependencies.yaml                         | 276 +++++++++++++
 .../tests/conftest.py                         | 173 +++++++++
 .../tests/pytest.ini                          |   7 +
 .../tests/test_cugraph.py                     |  94 +++++
 .../tests/test_cuml.py                        | 152 ++++++++
 .../tests/test_dask.py                        |  10 +
 .../tests/test_featureengine.py               |  47 +++
 .../tests/test_holoviews.py                   |  79 ++++
 .../tests/test_hvplot.py                      |  72 ++++
 .../tests/test_ibis.py                        | 169 ++++++++
 .../tests/test_matplotlib.py                  |  70 ++++
 .../tests/test_numpy.py                       |  59 +++
 .../tests/test_plotly.py                      |  67 ++++
 .../tests/test_pytorch.py                     | 128 ++++++
 .../tests/test_scipy.py                       |  65 ++++
 .../tests/test_seaborn.py                     |  60 +++
 .../tests/test_sklearn.py                     |  82 ++++
 .../tests/test_stumpy.py                      |  94 +++++
 .../tests/test_stumpy_distributed.py          |  48 +++
 .../tests/test_tensorflow.py                  | 367 ++++++++++++++++++
 .../tests/test_xgboost.py                     | 135 +++++++
 25 files changed, 2407 insertions(+)
 create mode 100755 ci/cudf_pandas_scripts/third-party-integration/ci_run_library_tests.sh
 create mode 100755 ci/cudf_pandas_scripts/third-party-integration/test.sh
 create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml
 create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/conftest.py
 create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/pytest.ini
 create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_cugraph.py
 create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_cuml.py
 create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_dask.py
 create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_featureengine.py
 create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_holoviews.py
 create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_hvplot.py
 create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_ibis.py
 create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_matplotlib.py
 create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_numpy.py
 create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_plotly.py
 create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_pytorch.py
 create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_scipy.py
 create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_seaborn.py
 create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_sklearn.py
 create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy.py
 create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy_distributed.py
 create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_tensorflow.py
 create mode 100644 python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_xgboost.py

diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 9feea050b19..2c68f2861bb 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -124,3 +124,14 @@ jobs:
       date: ${{ inputs.date }}
       sha: ${{ inputs.sha }}
       script: ci/cudf_pandas_scripts/run_tests.sh
+  third-party-integration-tests-cudf-pandas:
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
+    with:
+      build_type: nightly
+      branch: ${{ inputs.branch }}
+      date: ${{ inputs.date }}
+      sha: ${{ inputs.sha }}
+      container_image: "rapidsai/ci-conda:latest"
+      run_script: |
+        ci/cudf_pandas_scripts/third-party-integration/test.sh python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml
diff --git a/ci/cudf_pandas_scripts/run_tests.sh b/ci/cudf_pandas_scripts/run_tests.sh
index 52964496b36..8b85695c861 100755
--- a/ci/cudf_pandas_scripts/run_tests.sh
+++ b/ci/cudf_pandas_scripts/run_tests.sh
@@ -64,7 +64,9 @@ fi
 python -m pip install ipykernel
 python -m ipykernel install --user --name python3
 
+# The third-party integration tests are ignored because they are run nightly in seperate CI job
 python -m pytest -p cudf.pandas \
+    --ignore=./python/cudf/cudf_pandas_tests/third_party_integration_tests/ \
     --cov-config=./python/cudf/.coveragerc \
     --cov=cudf \
     --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cudf-pandas-coverage.xml" \
@@ -80,6 +82,7 @@ for version in "${versions[@]}"; do
     echo "Installing pandas version: ${version}"
     python -m pip install "numpy>=1.23,<2.0a0" "pandas==${version}"
     python -m pytest -p cudf.pandas \
+    --ignore=./python/cudf/cudf_pandas_tests/third_party_integration_tests/ \
     --cov-config=./python/cudf/.coveragerc \
     --cov=cudf \
     --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/cudf-pandas-coverage.xml" \
diff --git a/ci/cudf_pandas_scripts/third-party-integration/ci_run_library_tests.sh b/ci/cudf_pandas_scripts/third-party-integration/ci_run_library_tests.sh
new file mode 100755
index 00000000000..54a56508cdc
--- /dev/null
+++ b/ci/cudf_pandas_scripts/third-party-integration/ci_run_library_tests.sh
@@ -0,0 +1,56 @@
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+cleanup() {
+    rm ${TEST_DIR}/results-*.pickle
+}
+
+trap cleanup EXIT
+
+runtest_gold() {
+    local lib=$1
+
+    pytest \
+    -v \
+    --continue-on-collection-errors \
+    --cache-clear \
+    --numprocesses=${NUM_PROCESSES} \
+    --dist=worksteal \
+    ${TEST_DIR}/test_${lib}*.py
+}
+
+runtest_cudf_pandas() {
+    local lib=$1
+
+    pytest \
+    -p cudf.pandas \
+    -v \
+    --continue-on-collection-errors \
+    --cache-clear \
+    --numprocesses=${NUM_PROCESSES} \
+    --dist=worksteal \
+    ${TEST_DIR}/test_${lib}*.py
+}
+
+main() {
+    local lib=$1
+
+    # generation phase
+    runtest_gold ${lib}
+    runtest_cudf_pandas ${lib}
+
+    # assertion phase
+    pytest \
+    --compare \
+    -p cudf.pandas \
+    -v \
+    --continue-on-collection-errors \
+    --cache-clear \
+    --numprocesses=${NUM_PROCESSES} \
+    --dist=worksteal \
+    ${TEST_DIR}/test_${lib}*.py
+}
+
+main $@
diff --git a/ci/cudf_pandas_scripts/third-party-integration/test.sh b/ci/cudf_pandas_scripts/third-party-integration/test.sh
new file mode 100755
index 00000000000..89b28c30e39
--- /dev/null
+++ b/ci/cudf_pandas_scripts/third-party-integration/test.sh
@@ -0,0 +1,83 @@
+#!/bin/bash
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+
+# Common setup steps shared by Python test jobs
+
+set -euo pipefail
+
+write_output() {
+  local key="$1"
+  local value="$2"
+  echo "$key=$value" | tee --append "${GITHUB_OUTPUT:-/dev/null}"
+}
+
+extract_lib_from_dependencies_yaml() {
+    local file=$1
+    # Parse all keys in dependencies.yaml under the "files" section,
+    # extract all the keys that start with "test_", and extract the rest
+    local extracted_libs="$(yq -o json $file | jq -rc '.files | with_entries(select(.key | contains("test_"))) | keys | map(sub("^test_"; ""))')"
+    echo $extracted_libs
+}
+
+main() {
+    local dependencies_yaml="$1"
+
+    LIBS=$(extract_lib_from_dependencies_yaml "$dependencies_yaml")
+    LIBS=${LIBS#[}
+    LIBS=${LIBS%]}
+
+    for lib in ${LIBS//,/ }; do
+        lib=$(echo "$lib" | tr -d '""')
+        echo "Running tests for library $lib"
+
+        CUDA_MAJOR=$(if [ "$lib" = "tensorflow" ]; then echo "11"; else echo "12"; fi)
+
+        . /opt/conda/etc/profile.d/conda.sh
+
+        rapids-logger "Generate Python testing dependencies"
+        rapids-dependency-file-generator \
+          --config "$dependencies_yaml" \
+          --output conda \
+          --file-key test_${lib} \
+          --matrix "cuda=${CUDA_MAJOR};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml
+
+        rapids-mamba-retry env create --yes -f env.yaml -n test
+
+        # Temporarily allow unbound variables for conda activation.
+        set +u
+        conda activate test
+        set -u
+
+        repo_root=$(git rev-parse --show-toplevel)
+        TEST_DIR=${repo_root}/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests
+
+        rapids-print-env
+
+        rapids-logger "Check GPU usage"
+        nvidia-smi
+
+        EXITCODE=0
+        trap "EXITCODE=1" ERR
+        set +e
+
+        rapids-logger "pytest ${lib}"
+
+        NUM_PROCESSES=8
+        serial_libraries=(
+            "tensorflow"
+        )
+        for serial_library in "${serial_libraries[@]}"; do
+            if [ "${lib}" = "${serial_library}" ]; then
+                NUM_PROCESSES=1
+            fi
+        done
+
+        TEST_DIR=${TEST_DIR} NUM_PROCESSES=${NUM_PROCESSES} ci/cudf_pandas_scripts/third-party-integration/ci_run_library_tests.sh ${lib}
+
+        rapids-logger "Test script exiting with value: ${EXITCODE}"
+    done
+
+    exit ${EXITCODE}
+}
+
+main "$@"
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml b/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml
new file mode 100644
index 00000000000..05e1d8178d5
--- /dev/null
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml
@@ -0,0 +1,276 @@
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+# Dependency list for https://github.com/rapidsai/dependency-file-generator
+files:
+  checks:
+    output: none
+    includes:
+      - develop
+      - py_version
+  test_dask:
+    output: none
+    includes:
+      - cuda_version
+      - py_version
+      - test_base
+      - test_dask
+  test_matplotlib:
+    output: none
+    includes:
+      - cuda_version
+      - py_version
+      - test_base
+      - test_matplotlib
+  test_numpy:
+    output: none
+    includes:
+      - cuda_version
+      - py_version
+      - test_base
+      - test_numpy
+  test_pytorch:
+    output: none
+    includes:
+      - cuda_version
+      - py_version
+      - test_base
+      - test_pytorch
+  test_seaborn:
+    output: none
+    includes:
+      - cuda_version
+      - py_version
+      - test_base
+      - test_seaborn
+  test_scipy:
+    output: none
+    includes:
+      - cuda_version
+      - py_version
+      - test_base
+      - test_scipy
+  test_sklearn:
+    output: none
+    includes:
+      - cuda_version
+      - py_version
+      - test_base
+      - test_sklearn
+  test_stumpy:
+    output: none
+    includes:
+      - cuda_version
+      - py_version
+      - test_base
+      - test_stumpy
+  test_tensorflow:
+    output: none
+    includes:
+      - cuda_version
+      - py_version
+      - test_base
+      - test_tensorflow
+  test_xgboost:
+    output: none
+    includes:
+      - cuda_version
+      - py_version
+      - test_base
+      - test_xgboost
+  test_cuml:
+    output: none
+    includes:
+      - cuda_version
+      - py_version
+      - test_base
+      - test_cuml
+  test_cugraph:
+    output: none
+    includes:
+      - cuda_version
+      - py_version
+      - test_base
+      - test_cugraph
+  test_ibis:
+    output: none
+    includes:
+      - cuda_version
+      - py_version
+      - test_base
+      - test_ibis
+  test_hvplot:
+    output: none
+    includes:
+      - cuda_version
+      - py_version
+      - test_base
+      - test_hvplot
+  test_holoviews:
+    output: none
+    includes:
+      - cuda_version
+      - py_version
+      - test_base
+      - test_holoviews
+  test_plotly:
+    output: none
+    includes:
+      - cuda_version
+      - py_version
+      - test_base
+      - test_plotly
+
+channels:
+  - rapidsai-nightly
+  - rapidsai
+  - conda-forge
+  - nvidia
+
+dependencies:
+  develop:
+    common:
+      - output_types: conda
+        packages:
+          - pre-commit
+  cuda_version:
+    specific:
+      - output_types: conda
+        matrices:
+          - matrix:
+              cuda: "11"
+            packages:
+              - cuda-version=11.8
+          - matrix:
+              cuda: "11.8"
+            packages:
+              - cuda-version=11.8
+          - matrix:
+              cuda: "12.0"
+            packages:
+              - cuda-version=12.0
+          - matrix:
+              cuda: "12.2"
+            packages:
+              - cuda-version=12.2
+          - matrix:
+              cuda: "12.5"
+            packages:
+              - cuda-version=12.5
+          - matrix:
+              cuda: "12"
+            packages:
+              - cuda-version=12.5
+  py_version:
+    specific:
+      - output_types: conda
+        matrices:
+          - matrix:
+              py: "3.10"
+            packages:
+              - python=3.10
+          - matrix:
+              py: "3.11"
+            packages:
+              - python=3.11
+          - matrix:
+            packages:
+              - python>=3.10,<3.12
+  test_base:
+    common:
+      - output_types: conda
+        packages:
+          - cudf==24.10.*,>=0.0.0a0
+          - pandas
+          - pytest
+          - pytest-xdist
+  test_dask:
+    common:
+      - output_types: conda
+        packages:
+          - dask
+  test_matplotlib:
+    common:
+      - output_types: conda
+        packages:
+          - matplotlib-base
+  test_numpy:
+    common:
+      - output_types: conda
+        packages:
+          - numpy
+  test_pytorch:
+    common:
+      - output_types: conda
+        packages:
+          - numpy
+          - pytorch>=2.1.0
+  test_seaborn:
+    common:
+      - output_types: conda
+        packages:
+          - seaborn
+  test_scipy:
+    common:
+      - output_types: conda
+        packages:
+          - scipy
+  test_sklearn:
+    common:
+      - output_types: conda
+        packages:
+          - scikit-learn
+  test_stumpy:
+    common:
+      - output_types: conda
+        packages:
+          - dask
+          - stumpy
+  test_tensorflow:
+    common:
+      - output_types: conda
+        packages:
+          - tensorflow
+  test_xgboost:
+    common:
+      - output_types: conda
+        packages:
+          - hypothesis
+          - numpy
+          - scipy
+          - scikit-learn
+          - pip
+          - pip:
+            - xgboost>=2.0.1
+  test_cuml:
+    common:
+      - output_types: conda
+        packages:
+          - cuml==24.10.*,>=0.0.0a0
+          - scikit-learn
+  test_cugraph:
+    common:
+      - output_types: conda
+        packages:
+          - cugraph==24.10.*,>=0.0.0a0
+          - networkx
+  test_ibis:
+    common:
+      - output_types: conda
+        packages:
+          - pip
+          - pip:
+              - ibis-framework[pandas]
+  test_hvplot:
+    common:
+      - output_types: conda
+        packages:
+          - hvplot
+  test_holoviews:
+    common:
+      - output_types: conda
+        packages:
+          - holoviews
+  test_plotly:
+    common:
+      - output_types: conda
+        packages:
+          - plotly
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/conftest.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/conftest.py
new file mode 100644
index 00000000000..33b6ffdbd5c
--- /dev/null
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/conftest.py
@@ -0,0 +1,173 @@
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+
+from __future__ import annotations
+
+import os
+import pickle
+from typing import TYPE_CHECKING, BinaryIO
+
+import _pytest
+import _pytest.config
+import _pytest.nodes
+import pytest
+
+if TYPE_CHECKING:
+    import _pytest.python
+
+from _pytest.stash import StashKey
+
+from cudf.pandas.module_accelerator import disable_module_accelerator
+
+file_handle_key = StashKey[BinaryIO]()
+basename_key = StashKey[str]()
+test_folder_key = StashKey[str]()
+results = StashKey[tuple[dict, dict]]()
+
+
+def pytest_addoption(parser):
+    parser.addoption(
+        "--compare",
+        action="store_true",
+        default=False,
+        help="Run comparison step?",
+    )
+
+
+def read_results(f):
+    while True:
+        try:
+            yield pickle.load(f)
+        except EOFError:
+            return
+
+
+def pytest_collection_modifyitems(
+    session, config: _pytest.config.Config, items: list[_pytest.nodes.Item]
+):
+    if config.getoption("--compare"):
+        current_pass = "compare"
+    elif "cudf.pandas" in config.option.plugins:
+        current_pass = "cudf_pandas"
+    else:
+        current_pass = "gold"
+
+    def swap_xfail(item: _pytest.nodes.Item, name: str):
+        """Replace custom `xfail_**` mark with a `xfail` mark having the same kwargs."""
+
+        old_mark = item.keywords[name]
+        new_mark = pytest.mark.xfail(**old_mark.kwargs)
+
+        # Replace all "xfail_**" mark in the node chain with the "xfail" mark
+        # if not found, the node chain is not modified.
+        for node, mark in item.iter_markers_with_node(name):
+            idx = node.own_markers.index(mark)
+            node.own_markers[idx] = new_mark
+
+    for item in items:
+        if current_pass == "gold" and "xfail_gold" in item.keywords:
+            swap_xfail(item, "xfail_gold")
+        elif (
+            current_pass == "cudf_pandas"
+            and "xfail_cudf_pandas" in item.keywords
+        ):
+            swap_xfail(item, "xfail_cudf_pandas")
+        elif current_pass == "compare" and "xfail_compare" in item.keywords:
+            swap_xfail(item, "xfail_compare")
+
+
+def pytest_configure(config: _pytest.config.Config):
+    gold_basename = "results-gold"
+    cudf_basename = "results-cudf-pandas"
+    test_folder = os.path.join(os.path.dirname(__file__))
+
+    if config.getoption("--compare"):
+        # Everyone reads everything
+        gold_path = os.path.join(test_folder, f"{gold_basename}.pickle")
+        cudf_path = os.path.join(test_folder, f"{cudf_basename}.pickle")
+        with disable_module_accelerator():
+            with open(gold_path, "rb") as f:
+                gold_results = dict(read_results(f))
+        with open(cudf_path, "rb") as f:
+            cudf_results = dict(read_results(f))
+        config.stash[results] = (gold_results, cudf_results)
+    else:
+        if "cudf.pandas" in config.option.plugins:
+            basename = cudf_basename
+        else:
+            basename = gold_basename
+
+        if hasattr(config, "workerinput"):
+            # If we're on an xdist worker, open a worker-unique pickle file.
+            worker = config.workerinput["workerid"]
+            filename = f"{basename}-{worker}.pickle"
+        else:
+            filename = f"{basename}.pickle"
+
+        pickle_path = os.path.join(test_folder, filename)
+        config.stash[file_handle_key] = open(pickle_path, "wb")
+        config.stash[test_folder_key] = test_folder
+        config.stash[basename_key] = basename
+
+
+def pytest_pyfunc_call(pyfuncitem: _pytest.python.Function):
+    if pyfuncitem.config.getoption("--compare"):
+        gold_results, cudf_results = pyfuncitem.config.stash[results]
+        key = pyfuncitem.nodeid
+        try:
+            gold = gold_results[key]
+        except KeyError:
+            assert False, "pickled gold result is not available"
+        try:
+            cudf = cudf_results[key]
+        except KeyError:
+            assert False, "pickled cudf result is not available"
+        if gold is None and cudf is None:
+            raise ValueError(f"Integration test {key} did not return a value")
+        asserter = pyfuncitem.get_closest_marker("assert_eq")
+        if asserter is None:
+            assert gold == cudf, "Test failed"
+        else:
+            asserter.kwargs["fn"](gold, cudf)
+    else:
+        # Replace default call of test function with one that captures the
+        # result
+        testfunction = pyfuncitem.obj
+        funcargs = pyfuncitem.funcargs
+        testargs = {
+            arg: funcargs[arg] for arg in pyfuncitem._fixtureinfo.argnames
+        }
+        result = testfunction(**testargs)
+        # Tuple-based key-value pairs, key is the node-id
+        try:
+            pickle.dump(
+                (pyfuncitem.nodeid, result),
+                pyfuncitem.config.stash[file_handle_key],
+            )
+        except pickle.PicklingError:
+            pass
+    return True
+
+
+def pytest_unconfigure(config):
+    if config.getoption("--compare"):
+        return
+    if file_handle_key not in config.stash:
+        # We didn't open a pickle file
+        return
+    if not hasattr(config, "workerinput"):
+        # If we're the controlling process
+        if (
+            hasattr(config.option, "numprocesses")
+            and config.option.numprocesses is not None
+        ):
+            # Concat the worker partial pickle results and remove them
+            for i in range(config.option.numprocesses):
+                worker_result = os.path.join(
+                    config.stash[test_folder_key],
+                    f"{config.stash[basename_key]}-gw{i}.pickle",
+                )
+                with open(worker_result, "rb") as f:
+                    config.stash[file_handle_key].write(f.read())
+                os.remove(worker_result)
+    # Close our file
+    del config.stash[file_handle_key]
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/pytest.ini b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/pytest.ini
new file mode 100644
index 00000000000..817d98e6ba2
--- /dev/null
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/pytest.ini
@@ -0,0 +1,7 @@
+[pytest]
+xfail_strict=true
+markers=
+    assert_eq: custom binary asserter for a test
+    xfail_gold: this test is expected to fail in the gold pass
+    xfail_cudf_pandas: this test is expected to fail in the cudf_pandas pass
+    xfail_compare: this test is expected to fail in the comparison pass
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_cugraph.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_cugraph.py
new file mode 100644
index 00000000000..7acc8672063
--- /dev/null
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_cugraph.py
@@ -0,0 +1,94 @@
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+import cugraph
+import cupy as cp
+import networkx as nx
+import numpy as np
+import pandas as pd
+import pytest
+
+cugraph_algos = [
+    "betweenness_centrality",
+    "degree_centrality",
+    "katz_centrality",
+    "sorensen_coefficient",
+    "jaccard_coefficient",
+]
+
+nx_algos = [
+    "betweenness_centrality",
+    "degree_centrality",
+    "katz_centrality",
+]
+
+
+def assert_cugraph_equal(expect, got):
+    if isinstance(expect, cp.ndarray):
+        expect = expect.get()
+    if isinstance(got, cp.ndarray):
+        got = got.get()
+    elif isinstance(expect, np.ndarray) and isinstance(got, np.ndarray):
+        assert np.array_equal(expect, got)
+    else:
+        assert expect == got
+
+
+pytestmark = pytest.mark.assert_eq(fn=assert_cugraph_equal)
+
+
+@pytest.fixture(scope="session")
+def df():
+    return pd.DataFrame({"source": [0, 1, 2], "destination": [1, 2, 3]})
+
+
+@pytest.fixture(scope="session")
+def adjacency_matrix():
+    data = {
+        "A": [0, 1, 1, 0],
+        "B": [1, 0, 0, 1],
+        "C": [1, 0, 0, 1],
+        "D": [0, 1, 1, 0],
+    }
+    df = pd.DataFrame(data, index=["A", "B", "C", "D"])
+    return df
+
+
+@pytest.mark.parametrize("algo", cugraph_algos)
+def test_cugraph_from_pandas_edgelist(df, algo):
+    G = cugraph.Graph()
+    G.from_pandas_edgelist(df)
+    return getattr(cugraph, algo)(G).to_pandas().values
+
+
+@pytest.mark.parametrize("algo", cugraph_algos)
+def test_cugraph_from_pandas_adjacency(adjacency_matrix, algo):
+    G = cugraph.Graph()
+    G.from_pandas_adjacency(adjacency_matrix)
+    res = getattr(cugraph, algo)(G).to_pandas()
+    return res.sort_values(list(res.columns)).values
+
+
+@pytest.mark.parametrize("algo", cugraph_algos)
+def test_cugraph_from_numpy_array(df, algo):
+    G = cugraph.Graph()
+    G.from_numpy_array(df.values)
+    return getattr(cugraph, algo)(G).to_pandas().values
+
+
+@pytest.mark.parametrize("algo", nx_algos)
+def test_networkx_from_pandas_edgelist(df, algo):
+    G = nx.from_pandas_edgelist(
+        df, "source", "destination", ["source", "destination"]
+    )
+    return getattr(nx, algo)(G)
+
+
+@pytest.mark.parametrize("algo", nx_algos)
+def test_networkx_from_pandas_adjacency(adjacency_matrix, algo):
+    G = nx.from_pandas_adjacency(adjacency_matrix)
+    return getattr(nx, algo)(G)
+
+
+@pytest.mark.parametrize("algo", nx_algos)
+def test_networkx_from_numpy_array(adjacency_matrix, algo):
+    G = nx.from_numpy_array(adjacency_matrix.values)
+    return getattr(nx, algo)(G)
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_cuml.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_cuml.py
new file mode 100644
index 00000000000..892d0886596
--- /dev/null
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_cuml.py
@@ -0,0 +1,152 @@
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+import cupy as cp
+import numpy as np
+import pandas as pd
+import pytest
+from cuml.cluster import KMeans
+from cuml.decomposition import PCA
+from cuml.ensemble import RandomForestClassifier
+from cuml.linear_model import LinearRegression, LogisticRegression
+from cuml.metrics import accuracy_score
+from cuml.model_selection import train_test_split
+from cuml.pipeline import Pipeline
+from cuml.preprocessing import StandardScaler
+
+
+def assert_cuml_equal(expect, got):
+    # Coerce GPU arrays to CPU
+    if isinstance(expect, cp.ndarray):
+        expect = expect.get()
+    if isinstance(got, cp.ndarray):
+        got = got.get()
+
+    # Handle equality
+    if isinstance(expect, KMeans) and isinstance(got, KMeans):
+        # same clusters
+        np.testing.assert_allclose(
+            expect.cluster_centers_, got.cluster_centers_
+        )
+    elif isinstance(expect, np.ndarray) and isinstance(got, np.ndarray):
+        np.testing.assert_allclose(expect, got)
+    elif isinstance(expect, tuple) and isinstance(got, tuple):
+        assert len(expect) == len(got)
+        for e, g in zip(expect, got):
+            assert_cuml_equal(e, g)
+    elif isinstance(expect, pd.DataFrame):
+        assert pd.testing.assert_frame_equal(expect, got)
+    elif isinstance(expect, pd.Series):
+        assert pd.testing.assert_series_equal(expect, got)
+    else:
+        assert expect == got
+
+
+pytestmark = pytest.mark.assert_eq(fn=assert_cuml_equal)
+
+
+@pytest.fixture
+def binary_classification_data():
+    data = {
+        "feature1": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0],
+        "feature2": [2.0, 4.0, 1.0, 3.0, 5.0, 7.0, 6.0, 8.0, 10.0, 9.0],
+        "target": [0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
+    }
+    df = pd.DataFrame(data)
+    return df
+
+
+def test_linear_regression():
+    lr = LinearRegression(fit_intercept=True, normalize=False, algorithm="eig")
+    X = pd.DataFrame()
+    X["col1"] = np.array([1, 1, 2, 2], dtype=np.float32)
+    X["col2"] = np.array([1, 2, 2, 3], dtype=np.float32)
+    y = pd.Series(np.array([6.0, 8.0, 9.0, 11.0], dtype=np.float32))
+    lr.fit(X, y)
+
+    X_new = pd.DataFrame()
+    X_new["col1"] = np.array([3, 2], dtype=np.float32)
+    X_new["col2"] = np.array([5, 5], dtype=np.float32)
+    preds = lr.predict(X_new)
+    return preds.values
+
+
+def test_logistic_regression(binary_classification_data):
+    X = binary_classification_data[["feature1", "feature2"]]
+    y = binary_classification_data["target"]
+
+    (X_train, X_test, y_train, y_test) = train_test_split(
+        X, y, test_size=0.2, random_state=42
+    )
+
+    model = LogisticRegression()
+    model.fit(X_train, y_train)
+
+    y_pred = model.predict(X_test)
+    accuracy = accuracy_score(y_test, y_pred)
+
+    return accuracy
+
+
+def test_random_forest(binary_classification_data):
+    X = binary_classification_data[["feature1", "feature2"]]
+    y = binary_classification_data["target"]
+
+    X_train, X_test, y_train, y_test = train_test_split(
+        X, y, test_size=0.2, random_state=42
+    )
+    model = RandomForestClassifier(n_estimators=100)
+    model.fit(X_train, y_train)
+    preds = model.predict(X_test)
+    return preds.values
+
+
+def test_clustering():
+    rng = np.random.default_rng(42)
+    nsamps = 300
+    X = rng.random((nsamps, 2))
+    data = pd.DataFrame(X, columns=["x", "y"])
+
+    kmeans = KMeans(n_clusters=3, random_state=42)
+    kmeans.fit(data)
+    return kmeans
+
+
+def test_data_scaling():
+    data = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0])
+    scaler = StandardScaler()
+
+    scaled_data = scaler.fit_transform(data.values.reshape(-1, 1))
+    return scaled_data
+
+
+def test_pipeline(binary_classification_data):
+    X = binary_classification_data[["feature1", "feature2"]]
+    y = binary_classification_data["target"]
+
+    pipe = Pipeline(
+        [
+            ("scaler", StandardScaler()),
+            ("pca", PCA()),
+            ("random_forest", LogisticRegression()),
+        ]
+    )
+
+    pipe.fit(X, y)
+    results = pipe.predict(X)
+    return results.values
+
+
+@pytest.mark.parametrize(
+    "X, y",
+    [
+        (pd.DataFrame({"a": range(10), "b": range(10)}), pd.Series(range(10))),
+        (
+            pd.DataFrame({"a": range(10), "b": range(10)}).values,
+            pd.Series(range(10)).values,
+        ),  # cudf.pandas wrapped numpy arrays
+    ],
+)
+def test_train_test_split(X, y):
+    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
+
+    # Compare only the size of the data splits
+    return len(X_train), len(X_test), len(y_train), len(y_test)
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_dask.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_dask.py
new file mode 100644
index 00000000000..c34778dfded
--- /dev/null
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_dask.py
@@ -0,0 +1,10 @@
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+import pandas as pd
+
+import dask.dataframe as dd
+
+
+def test_sum():
+    data = {"x": range(1, 11)}
+    ddf = dd.from_pandas(pd.DataFrame(data), npartitions=2)
+    return ddf["x"].sum().compute()
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_featureengine.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_featureengine.py
new file mode 100644
index 00000000000..3e247291fad
--- /dev/null
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_featureengine.py
@@ -0,0 +1,47 @@
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+import numpy as np
+import pandas as pd
+from feature_engine.imputation import DropMissingData
+from feature_engine.preprocessing import MatchVariables
+
+
+def test_drop_missing_data():
+    data = {
+        "x": [np.nan, 1, 1, 0, np.nan],
+        "y": ["a", np.nan, "b", np.nan, "a"],
+    }
+    df = pd.DataFrame(data)
+
+    dmd = DropMissingData()
+    dmd.fit(df)
+    dmd.transform(df)
+
+    return dmd
+
+
+def test_match_variables():
+    train = pd.DataFrame(
+        {
+            "Name": ["tom", "nick", "krish", "jack"],
+            "City": ["London", "Manchester", "Liverpool", "Bristol"],
+            "Age": [20, 21, 19, 18],
+            "Marks": [0.9, 0.8, 0.7, 0.6],
+        }
+    )
+
+    test = pd.DataFrame(
+        {
+            "Name": ["tom", "sam", "nick"],
+            "Age": [20, 22, 23],
+            "Marks": [0.9, 0.7, 0.6],
+            "Hobbies": ["tennis", "rugby", "football"],
+        }
+    )
+
+    match_columns = MatchVariables()
+
+    match_columns.fit(train)
+
+    df_transformed = match_columns.transform(test)
+
+    return df_transformed
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_holoviews.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_holoviews.py
new file mode 100644
index 00000000000..bef02c86355
--- /dev/null
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_holoviews.py
@@ -0,0 +1,79 @@
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+import holoviews as hv
+import numpy as np
+import pandas as pd
+import pytest
+
+nsamps = 1000
+hv.extension("bokeh")  # load holoviews extension
+
+
+def assert_holoviews_equal(expect, got):
+    expect_data, expect_ndims, expect_kdims, expect_vdims, expect_shape = (
+        expect
+    )
+    got_data, got_ndims, got_kdims, got_vdims, got_shape = got
+
+    if isinstance(expect_data, dict):
+        np.testing.assert_allclose(expect_data["x"], got_data["x"])
+        np.testing.assert_allclose(
+            expect_data["Frequency"], got_data["Frequency"]
+        )
+    else:
+        pd._testing.assert_frame_equal(expect_data, got_data)
+    assert expect_ndims == got_ndims
+    assert expect_kdims == got_kdims
+    assert expect_vdims == got_vdims
+    assert expect_shape == got_shape
+
+
+pytestmark = pytest.mark.assert_eq(fn=assert_holoviews_equal)
+
+
+@pytest.fixture(scope="module")
+def df():
+    rng = np.random.default_rng(42)
+    return pd.DataFrame(
+        {
+            "x": rng.random(nsamps),
+            "y": rng.random(nsamps),
+            "category": rng.integers(0, 10, nsamps),
+            "category2": rng.integers(0, 10, nsamps),
+        }
+    )
+
+
+def get_plot_info(plot):
+    return (
+        plot.data,
+        plot.ndims,
+        plot.kdims,
+        plot.vdims,
+        plot.shape,
+    )
+
+
+def test_holoviews_barplot(df):
+    return get_plot_info(hv.Bars(df, kdims="category", vdims="y"))
+
+
+def test_holoviews_scatterplot(df):
+    return get_plot_info(hv.Scatter(df, kdims="x", vdims="y"))
+
+
+def test_holoviews_curve(df):
+    return get_plot_info(hv.Curve(df, kdims="category", vdims="y"))
+
+
+def test_holoviews_heatmap(df):
+    return get_plot_info(
+        hv.HeatMap(df, kdims=["category", "category2"], vdims="y")
+    )
+
+
+def test_holoviews_histogram(df):
+    return get_plot_info(hv.Histogram(df.values))
+
+
+def test_holoviews_hexbin(df):
+    return get_plot_info(hv.HexTiles(df, kdims=["x", "y"], vdims="y"))
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_hvplot.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_hvplot.py
new file mode 100644
index 00000000000..0f0d2f8bcbd
--- /dev/null
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_hvplot.py
@@ -0,0 +1,72 @@
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+import hvplot.pandas  # noqa: F401, needs to monkey patch pandas with this.
+import numpy as np
+import pandas as pd
+import pytest
+
+nsamps = 1000
+
+
+def assert_hvplot_equal(expect, got):
+    expect_data, expect_ndims, expect_kdims, expect_vdims, expect_shape = (
+        expect
+    )
+    got_data, got_ndims, got_kdims, got_vdims, got_shape = got
+
+    if isinstance(expect_data, dict):
+        np.testing.assert_allclose(expect_data["x"], got_data["x"])
+        np.testing.assert_allclose(
+            expect_data["Frequency"], got_data["Frequency"]
+        )
+    else:
+        pd._testing.assert_frame_equal(expect_data, got_data)
+    assert expect_ndims == got_ndims
+    assert expect_kdims == got_kdims
+    assert expect_vdims == got_vdims
+    assert expect_shape == got_shape
+
+
+pytestmark = pytest.mark.assert_eq(fn=assert_hvplot_equal)
+
+
+@pytest.fixture(scope="module")
+def df():
+    rng = np.random.default_rng(42)
+    return pd.DataFrame(
+        {
+            "x": rng.random(nsamps),
+            "y": rng.random(nsamps),
+            "category": rng.integers(0, 10, nsamps),
+            "category2": rng.integers(0, 10, nsamps),
+        }
+    )
+
+
+def get_plot_info(plot):
+    return (
+        plot.data,
+        plot.ndims,
+        plot.kdims,
+        plot.vdims,
+        plot.shape,
+    )
+
+
+def test_hvplot_barplot(df):
+    return get_plot_info(df.hvplot.bar(x="category", y="y"))
+
+
+def test_hvplot_scatterplot(df):
+    return get_plot_info(df.hvplot.scatter(x="x", y="y"))
+
+
+def test_hvplot_lineplot(df):
+    return get_plot_info(df.hvplot.line(x="x", y="y"))
+
+
+def test_hvplot_heatmap(df):
+    return get_plot_info(df.hvplot.heatmap(x="x", y="y", C="y"))
+
+
+def test_hvplot_hexbin(df):
+    return get_plot_info(df.hvplot.hexbin(x="x", y="y", C="y"))
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_ibis.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_ibis.py
new file mode 100644
index 00000000000..2a8cf7c6ac2
--- /dev/null
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_ibis.py
@@ -0,0 +1,169 @@
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+
+import ibis
+import numpy as np
+import pandas as pd
+import pytest
+
+ibis.set_backend("pandas")
+ibis.options.interactive = False
+
+
+def ibis_assert_equal(expect, got, rtol: float = 1e-7, atol: float = 0.0):
+    pd._testing.assert_almost_equal(expect, got, rtol=rtol, atol=atol)
+
+
+pytestmark = pytest.mark.assert_eq(fn=ibis_assert_equal)
+
+
+COLUMN_REDUCTIONS = ["sum", "min", "max", "mean", "var", "std"]
+ELEMENTWISE_UFUNCS = [
+    "sin",
+    "cos",
+    "atan",
+    "exp",
+    "log",
+    "abs",
+]
+STRING_UNARY_FUNCS = [
+    "lower",
+    "upper",
+    "capitalize",
+    "reverse",
+]
+
+
+@pytest.fixture
+def ibis_table_num_str():
+    N = 1000
+    K = 5
+    rng = np.random.default_rng(42)
+
+    df = pd.DataFrame(
+        rng.integers(0, 100, (N, K)), columns=[f"col{x}" for x in np.arange(K)]
+    )
+    df["key"] = rng.choice(np.arange(10), N)
+    df["str_col"] = rng.choice(["Hello", "World", "It's", "Me", "Again"], N)
+    table = ibis.memtable(df, name="t")
+    return table
+
+
+@pytest.fixture
+def ibis_table_num():
+    N = 100
+    K = 2
+    rng = np.random.default_rng(42)
+
+    df = pd.DataFrame(
+        rng.integers(0, 100, (N, K)), columns=[f"val{x}" for x in np.arange(K)]
+    )
+    df["key"] = rng.choice(np.arange(10), N)
+    table = ibis.memtable(df, name="t")
+    return table
+
+
+@pytest.mark.parametrize("op", COLUMN_REDUCTIONS)
+def test_column_reductions(ibis_table_num_str, op):
+    t = ibis_table_num_str
+    return getattr(t.col1, op)().to_pandas()
+
+
+@pytest.mark.parametrize("op", ["mean", "sum", "min", "max"])
+def test_groupby_reductions(ibis_table_num_str, op):
+    t = ibis_table_num_str
+    return getattr(t.group_by("key").col1, op)().to_pandas()
+
+
+@pytest.mark.parametrize("op", ELEMENTWISE_UFUNCS)
+def test_mutate_ufunc(ibis_table_num_str, op):
+    t = ibis_table_num_str
+    expr = getattr(t.col1, op)()
+    return t.mutate(col1_sin=expr).to_pandas()
+
+
+@pytest.mark.parametrize("op", STRING_UNARY_FUNCS)
+def test_string_unary(ibis_table_num_str, op):
+    t = ibis_table_num_str
+    return getattr(t.str_col, op)().to_pandas()
+
+
+def test_nunique(ibis_table_num_str):
+    t = ibis_table_num_str
+    return t.col1.nunique().to_pandas()
+
+
+def test_count(ibis_table_num_str):
+    t = ibis_table_num_str
+    return t.col1.count().to_pandas()
+
+
+def test_select(ibis_table_num_str):
+    t = ibis_table_num_str
+    return t.select("col0", "col1").to_pandas()
+
+
+def test_between(ibis_table_num_str):
+    t = ibis_table_num_str
+    return t.key.between(4, 8).to_pandas()
+
+
+def test_notin(ibis_table_num_str):
+    t = ibis_table_num_str
+    return t.key.notin([0, 1, 8, 3]).to_pandas()
+
+
+def test_window(ibis_table_num_str):
+    t = ibis_table_num_str
+    return (
+        t.group_by("key").mutate(demeaned=t.col1 - t.col1.mean()).to_pandas()
+    )
+
+
+def test_limit(ibis_table_num_str):
+    t = ibis_table_num_str
+    return t.limit(5).to_pandas()
+
+
+def test_filter(ibis_table_num_str):
+    t = ibis_table_num_str
+    return t.filter([t.key == 4, t.col0 > 15]).to_pandas()
+
+
+@pytest.mark.skip(reason="Join ordering not currently guaranteed, i.e., flaky")
+@pytest.mark.parametrize("join_type", ["inner", "left", "right"])
+def test_join_exact_ordering(ibis_table_num_str, ibis_table_num, join_type):
+    t1 = ibis_table_num_str
+    t2 = ibis_table_num
+    res = t1.join(t2, "key", how=join_type).to_pandas()
+    return res
+
+
+@pytest.mark.parametrize("join_type", ["inner", "left", "right"])
+def test_join_sort_correctness(ibis_table_num_str, ibis_table_num, join_type):
+    """
+    While we don't currently guarantee exact row ordering
+    we can still test join correctness with ex-post sorting.
+    """
+    t1 = ibis_table_num_str
+    t2 = ibis_table_num
+    res = t1.join(t2, "key", how=join_type).to_pandas()
+
+    res_sorted = res.sort_values(by=res.columns.tolist()).reset_index(
+        drop=True
+    )
+    return res_sorted
+
+
+def test_order_by(ibis_table_num_str):
+    t = ibis_table_num_str
+    return t.order_by(ibis.desc("col1")).to_pandas()
+
+
+def test_aggregate_having(ibis_table_num_str):
+    t = ibis_table_num_str
+    return t.aggregate(
+        by=["key"],
+        sum_c0=t.col0.sum(),
+        avg_c0=t.col0.mean(),
+        having=t.col1.mean() > 50,
+    ).to_pandas()
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_matplotlib.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_matplotlib.py
new file mode 100644
index 00000000000..665b9d6fb08
--- /dev/null
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_matplotlib.py
@@ -0,0 +1,70 @@
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import pytest
+from matplotlib.axes import Axes
+from matplotlib.collections import PathCollection
+from matplotlib.lines import Line2D
+from matplotlib.patches import Rectangle
+from pandas._testing import assert_equal
+
+
+def assert_plots_equal(expect, got):
+    if isinstance(expect, Axes) and isinstance(got, Axes):
+        for expect_ch, got_ch in zip(
+            expect.get_children(), got.get_children()
+        ):
+            assert type(expect_ch) == type(got_ch)
+            if isinstance(expect_ch, Line2D):
+                assert_equal(expect_ch.get_xdata(), got_ch.get_xdata())
+                assert_equal(expect_ch.get_ydata(), got_ch.get_ydata())
+            elif isinstance(expect_ch, Rectangle):
+                assert expect_ch.get_height() == got_ch.get_height()
+    elif isinstance(expect, PathCollection) and isinstance(
+        got, PathCollection
+    ):
+        assert_equal(expect.get_offsets()[:, 0], got.get_offsets()[:, 0])
+        assert_equal(expect.get_offsets()[:, 1], got.get_offsets()[:, 1])
+    else:
+        assert_equal(expect, got)
+
+
+pytestmark = pytest.mark.assert_eq(fn=assert_plots_equal)
+
+
+def test_line():
+    df = pd.DataFrame({"x": [1, 2, 3, 4, 5], "y": [2, 4, 6, 8, 10]})
+    (data,) = plt.plot(df["x"], df["y"], marker="o", linestyle="-")
+
+    return plt.gca()
+
+
+def test_bar():
+    data = pd.Series([1, 2, 3, 4, 5], index=["a", "b", "c", "d", "e"])
+    ax = data.plot(kind="bar")
+    return ax
+
+
+def test_scatter():
+    df = pd.DataFrame({"x": [1, 2, 3, 4, 5], "y": [5, 4, 3, 2, 1]})
+
+    fig, ax = plt.subplots(figsize=(8, 6))
+    ax.scatter(df["x"], df["y"])
+
+    return plt.gca()
+
+
+def test_dataframe_plot():
+    rng = np.random.default_rng(42)
+    df = pd.DataFrame(rng.random((10, 5)), columns=["a", "b", "c", "d", "e"])
+    ax = df.plot()
+
+    return ax
+
+
+def test_series_plot():
+    sr = pd.Series([1, 2, 3, 4, 5])
+    ax = sr.plot()
+
+    return ax
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_numpy.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_numpy.py
new file mode 100644
index 00000000000..472f1889354
--- /dev/null
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_numpy.py
@@ -0,0 +1,59 @@
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+
+import numpy as np
+import pandas as pd
+import pytest
+
+nsamps = 1000
+reductions = ["sum", "min", "max", "mean", "var", "std"]
+
+
+pytestmark = pytest.mark.assert_eq(fn=np.testing.assert_allclose)
+
+
+@pytest.fixture(scope="module")
+def sr():
+    rng = np.random.default_rng(42)
+    return pd.Series(rng.random(nsamps))
+
+
+@pytest.mark.parametrize("op", reductions)
+def test_numpy_series_reductions(sr, op):
+    return getattr(np, op)(sr)
+
+
+@pytest.fixture(scope="module")
+def df():
+    rng = np.random.default_rng(42)
+    return pd.DataFrame({"A": rng.random(nsamps), "B": rng.random(nsamps)})
+
+
+@pytest.mark.parametrize("op", reductions)
+def test_numpy_dataframe_reductions(df, op):
+    return getattr(np, op)(df)
+
+
+def test_numpy_dot(df):
+    return np.dot(df, df.T)
+
+
+def test_numpy_fft(sr):
+    fft = np.fft.fft(sr)
+    return fft
+
+
+def test_numpy_sort(df):
+    return np.sort(df)
+
+
+@pytest.mark.parametrize("percentile", [0, 25, 50, 75, 100])
+def test_numpy_percentile(df, percentile):
+    return np.percentile(df, percentile)
+
+
+def test_numpy_unique(df):
+    return np.unique(df)
+
+
+def test_numpy_transpose(df):
+    return np.transpose(df)
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_plotly.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_plotly.py
new file mode 100644
index 00000000000..27d9df83476
--- /dev/null
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_plotly.py
@@ -0,0 +1,67 @@
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+import numpy as np
+import pandas as pd
+import plotly.express as px
+import pytest
+
+nsamps = 100
+
+
+def assert_plotly_equal(expect, got):
+    assert type(expect) == type(got)
+    if isinstance(expect, dict):
+        assert expect.keys() == got.keys()
+        for k in expect.keys():
+            assert_plotly_equal(expect[k], got[k])
+    elif isinstance(got, list):
+        assert len(expect) == len(got)
+        for i in range(len(expect)):
+            assert_plotly_equal(expect[i], got[i])
+    elif isinstance(expect, np.ndarray):
+        np.testing.assert_allclose(expect, got)
+    else:
+        assert expect == got
+
+
+pytestmark = pytest.mark.assert_eq(fn=assert_plotly_equal)
+
+
+@pytest.fixture(scope="module")
+def df():
+    rng = np.random.default_rng(42)
+    return pd.DataFrame(
+        {
+            "x": rng.random(nsamps),
+            "y": rng.random(nsamps),
+            "category": rng.integers(0, 10, nsamps),
+            "category2": rng.integers(0, 10, nsamps),
+        }
+    )
+
+
+def test_plotly_scatterplot(df):
+    return px.scatter(df, x="x", y="y").to_plotly_json()
+
+
+def test_plotly_lineplot(df):
+    return px.line(df, x="category", y="y").to_plotly_json()
+
+
+def test_plotly_barplot(df):
+    return px.bar(df, x="category", y="y").to_plotly_json()
+
+
+def test_plotly_histogram(df):
+    return px.histogram(df, x="category").to_plotly_json()
+
+
+def test_plotly_pie(df):
+    return px.pie(df, values="category", names="category2").to_plotly_json()
+
+
+def test_plotly_heatmap(df):
+    return px.density_heatmap(df, x="category", y="category2").to_plotly_json()
+
+
+def test_plotly_boxplot(df):
+    return px.box(df, x="category", y="y").to_plotly_json()
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_pytorch.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_pytorch.py
new file mode 100644
index 00000000000..ae9db3836a6
--- /dev/null
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_pytorch.py
@@ -0,0 +1,128 @@
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+
+import numpy as np
+import pandas as pd
+import pytest
+import torch
+
+pytestmark = pytest.mark.assert_eq(fn=torch.testing.assert_close)
+
+
+@pytest.fixture
+def data():
+    rng = np.random.default_rng(0)
+    x1 = rng.random(100, dtype=np.float32)
+    x2 = rng.random(100, dtype=np.float32)
+    y = np.zeros(100).astype(np.int64)
+
+    y[(x1 > x2) & (x1 > 0)] = 0
+    y[(x1 < x2) & (x1 > 0)] = 1
+    y[(x1 > x2) & (x1 < 0)] = 2
+    y[(x1 < x2) & (x1 < 0)] = 3
+
+    return x1, x2, y
+
+
+class Dataset(torch.utils.data.Dataset):
+    def __init__(self, x1, x2, y):
+        self.x1 = x1
+        self.x2 = x2
+        self.y = y
+
+    def __getitem__(self, idx):
+        x1 = self.x1[idx]
+        x2 = self.x2[idx]
+        y = self.y[idx]
+        return (x1, x2), y
+
+    def __len__(self):
+        return len(self.x1)
+
+
+def test_dataloader_auto_batching(data):
+    x1, x2, y = (pd.Series(i) for i in data)
+
+    dataset = Dataset(x1, x2, y)
+
+    # default collate_fn
+    dataloader = torch.utils.data.DataLoader(dataset, batch_size=10)
+
+    (x1, x2), y = next(iter(dataloader))
+    return x1, x2, y
+
+
+def test_dataloader_manual_batching(data):
+    x1, x2, y = (pd.Series(i) for i in data)
+
+    dataset = Dataset(x1, x2, y)
+
+    # default collate_fn
+    dataloader = torch.utils.data.DataLoader(dataset, batch_size=None)
+
+    (x1, x2), y = next(iter(dataloader))
+    return x1, x2, y
+
+
+class Model(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.fc1 = torch.nn.Linear(2, 10)
+        self.relu1 = torch.nn.ReLU()
+        self.fc2 = torch.nn.Linear(10, 10)
+        self.relu2 = torch.nn.ReLU()
+        self.output = torch.nn.Linear(10, 4)
+
+    def forward(self, x1, x2):
+        x = torch.stack([x1, x2], dim=0).T
+        x = self.fc1(x)
+        x = self.relu1(x)
+        x = self.fc2(x)
+        x = self.relu2(x)
+        return torch.nn.functional.softmax(x, dim=1)
+
+
+def train(model, dataloader, optimizer, criterion):
+    model.train()
+    for (x1, x2), y in dataloader:
+        x1 = x1.to("cuda")
+        x2 = x2.to("cuda")
+        y = y.to("cuda")
+
+        optimizer.zero_grad()
+        y_pred = model(x1, x2)
+        loss = criterion(y_pred, y)
+        loss.backward()
+        optimizer.step()
+
+
+def test_torch_train(data):
+    torch.manual_seed(0)
+
+    x1, x2, y = (pd.Series(i) for i in data)
+    dataset = Dataset(x1, x2, y)
+    # default collate_fn
+    dataloader = torch.utils.data.DataLoader(dataset, batch_size=10)
+
+    model = Model().to("cuda")
+    optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
+    criterion = torch.nn.CrossEntropyLoss()
+
+    train(model, dataloader, optimizer, criterion)
+
+    test_x1, test_x2 = next(iter(dataloader))[0]
+    test_x1 = test_x1.to("cuda")
+    test_x2 = test_x2.to("cuda")
+
+    return model(test_x1, test_x2)
+
+
+def test_torch_tensor_ctor():
+    s = pd.Series(range(5))
+    return torch.tensor(s.values)
+
+
+@pytest.mark.xfail_cudf_pandas(reason="Known failure, see xdf/#210")
+@pytest.mark.xfail_compare
+def test_torch_tensor_from_numpy():
+    s = pd.Series(range(5))
+    return torch.from_numpy(s.values)
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_scipy.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_scipy.py
new file mode 100644
index 00000000000..963a8549000
--- /dev/null
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_scipy.py
@@ -0,0 +1,65 @@
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+
+import numpy as np
+import pandas as pd
+import pytest
+import scipy
+
+
+@pytest.mark.parametrize("func", ["hmean", "tvar", "gstd"])
+def test_scipy_stats(func):
+    rng = np.random.default_rng(42)
+    data = pd.Series(rng.random(1000))
+    return getattr(scipy.stats, func)(data)
+
+
+@pytest.mark.parametrize("func", ["norm"])
+def test_scipy_linalg(func):
+    rng = np.random.default_rng(42)
+    data = pd.Series(rng.random(1000))
+    return getattr(scipy.linalg, func)(data)
+
+
+pytestmark = pytest.mark.assert_eq(fn=pd._testing.assert_almost_equal)
+
+
+def test_compute_pi():
+    def circle(x):
+        return (1 - x**2) ** 0.5
+
+    x = pd.Series(np.linspace(0, 1, 100))
+    y = pd.Series(circle(np.linspace(0, 1, 100)))
+
+    result = scipy.integrate.trapezoid(y, x)
+    return result * 4
+
+
+def test_matrix_solve():
+    A = pd.DataFrame([[2, 3], [1, 2]])
+    b = pd.Series([1, 2])
+
+    return scipy.linalg.solve(A, b)
+
+
+def test_correlation():
+    data = pd.DataFrame({"A": [1, 2, 3, 4, 5], "B": [5, 4, 3, 2, 1]})
+
+    return scipy.stats.pearsonr(data["A"], data["B"])
+
+
+def test_optimization():
+    x = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0])
+
+    def rosen(x):  # banana function from scipy tutorial
+        return sum(
+            100.0 * (x[1:] - x[:-1] ** 2.0) ** 2.0 + (1 - x[:-1]) ** 2.0
+        )
+
+    result = scipy.optimize.fmin(rosen, x)
+    return result
+
+
+def test_regression():
+    data = pd.DataFrame({"x": [1, 2, 3, 4, 5], "y": [2, 4, 5, 4, 5]})
+    result = scipy.stats.linregress(data["y"], data["y"])
+    return result.slope, result.intercept
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_seaborn.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_seaborn.py
new file mode 100644
index 00000000000..4b272900acd
--- /dev/null
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_seaborn.py
@@ -0,0 +1,60 @@
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+import pandas as pd
+import pytest
+import seaborn as sns
+from matplotlib.axes import Axes
+from matplotlib.collections import PathCollection
+from matplotlib.lines import Line2D
+from matplotlib.patches import Rectangle
+from pandas._testing import assert_equal
+
+
+def assert_plots_equal(expect, got):
+    if isinstance(expect, Axes) and isinstance(got, Axes):
+        for expect_ch, got_ch in zip(
+            expect.get_children(), got.get_children()
+        ):
+            assert type(expect_ch) == type(got_ch)
+            if isinstance(expect_ch, Line2D):
+                assert_equal(expect_ch.get_xdata(), got_ch.get_xdata())
+                assert_equal(expect_ch.get_ydata(), got_ch.get_ydata())
+            elif isinstance(expect_ch, Rectangle):
+                assert expect_ch.get_height() == got_ch.get_height()
+    elif isinstance(expect, PathCollection) and isinstance(
+        got, PathCollection
+    ):
+        assert_equal(expect.get_offsets()[:, 0], got.get_offsets()[:, 0])
+        assert_equal(expect.get_offsets()[:, 1], got.get_offsets()[:, 1])
+    else:
+        assert_equal(expect, got)
+
+
+pytestmark = pytest.mark.assert_eq(fn=assert_plots_equal)
+
+
+@pytest.fixture(scope="module")
+def df():
+    df = pd.DataFrame(
+        {
+            "x": [2, 3, 4, 5, 11],
+            "y": [4, 3, 2, 1, 15],
+            "hue": ["c", "a", "b", "b", "a"],
+        }
+    )
+    return df
+
+
+def test_bar(df):
+    ax = sns.barplot(data=df, x="x", y="y")
+    return ax
+
+
+def test_scatter(df):
+    ax = sns.scatterplot(data=df, x="x", y="y", hue="hue")
+    return ax
+
+
+def test_lineplot_with_sns_data():
+    df = sns.load_dataset("flights")
+    ax = sns.lineplot(data=df, x="month", y="passengers")
+    return ax
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_sklearn.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_sklearn.py
new file mode 100644
index 00000000000..1635fd3dcda
--- /dev/null
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_sklearn.py
@@ -0,0 +1,82 @@
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+import numpy as np
+import pandas as pd
+import pytest
+from sklearn.cluster import KMeans
+from sklearn.feature_selection import SelectKBest, f_classif
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import accuracy_score
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import StandardScaler
+
+
+def test_regression():
+    data = {
+        "feature1": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
+        "feature2": [2, 4, 1, 3, 5, 7, 6, 8, 10, 9],
+        "target": [0, 0, 0, 0, 1, 1, 1, 1, 1, 1],
+    }
+    df = pd.DataFrame(data)
+
+    X = df[["feature1", "feature2"]]
+    y = df["target"]
+
+    # Data Splitting
+    (X_train, X_test, y_train, y_test) = train_test_split(
+        X, y, test_size=0.2, random_state=42
+    )
+
+    # Basic deterministic LR model
+    model = LogisticRegression()
+    model.fit(X_train, y_train)
+
+    # predction phase
+    y_pred = model.predict(X_test)
+    accuracy = accuracy_score(y_test, y_pred)
+
+    return accuracy
+
+
+@pytest.mark.assert_eq(fn=np.testing.assert_allclose)
+def test_clustering():
+    rng = np.random.default_rng(42)
+    nsamps = 300
+    X = rng.random((nsamps, 2))
+    data = pd.DataFrame(X, columns=["x", "y"])
+
+    # Create and fit a KMeans clustering model
+    kmeans = KMeans(n_clusters=3, random_state=42)
+    kmeans.fit(data)
+    return kmeans.cluster_centers_
+
+
+def test_feature_selection():
+    rng = np.random.default_rng(42)
+    n_samples = 100
+    n_features = 10
+
+    X = rng.random((n_samples, n_features))
+    y = rng.integers(0, 2, size=n_samples)
+
+    data = pd.DataFrame(
+        X, columns=[f"feature{i}" for i in range(1, n_features + 1)]
+    )
+    data["target"] = y
+
+    # Select the top k features
+    k_best = SelectKBest(score_func=f_classif, k=5)
+    k_best.fit_transform(X, y)
+
+    feat_inds = k_best.get_support(indices=True)
+    features = data.iloc[:, feat_inds]
+
+    return sorted(features.columns.tolist())
+
+
+@pytest.mark.assert_eq(fn=np.testing.assert_allclose)
+def test_data_scaling():
+    data = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0])
+    scaler = StandardScaler()
+
+    scaled_data = scaler.fit_transform(data.values.reshape(-1, 1))
+    return scaled_data
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy.py
new file mode 100644
index 00000000000..69248002a58
--- /dev/null
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy.py
@@ -0,0 +1,94 @@
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+
+import numpy as np
+import pandas as pd
+import pytest
+import stumpy
+from numba import cuda
+from pandas._testing import assert_equal
+
+
+def stumpy_assert_equal(expected, got):
+    def as_float64(x):
+        if isinstance(x, (tuple, list)):
+            return [as_float64(y) for y in x]
+        else:
+            return x.astype(np.float64)
+
+    assert_equal(as_float64(expected), as_float64(got))
+
+
+pytestmark = pytest.mark.assert_eq(fn=stumpy_assert_equal)
+
+
+def test_1d_time_series():
+    rng = np.random.default_rng(42)
+    ts = pd.Series(rng.random(10))
+    m = 3
+
+    return stumpy.stump(ts, m)
+
+
+def test_1d_gpu():
+    rng = np.random.default_rng(42)
+    your_time_series = rng.random(10000)
+    window_size = (
+        50  # Approximately, how many data points might be found in a pattern
+    )
+    all_gpu_devices = [
+        device.id for device in cuda.list_devices()
+    ]  # Get a list of all available GPU devices
+
+    return stumpy.gpu_stump(
+        your_time_series, m=window_size, device_id=all_gpu_devices
+    )
+
+
+def test_multidimensional_timeseries():
+    rng = np.random.default_rng(42)
+    # Each row represents data from a different dimension while each column represents
+    # data from the same dimension
+    your_time_series = rng.random((3, 1000))
+    # Approximately, how many data points might be found in a pattern
+    window_size = 50
+
+    return stumpy.mstump(your_time_series, m=window_size)
+
+
+def test_anchored_time_series_chains():
+    rng = np.random.default_rng(42)
+    your_time_series = rng.random(10000)
+    window_size = (
+        50  # Approximately, how many data points might be found in a pattern
+    )
+
+    matrix_profile = stumpy.stump(your_time_series, m=window_size)
+
+    left_matrix_profile_index = matrix_profile[:, 2]
+    right_matrix_profile_index = matrix_profile[:, 3]
+    idx = 10  # Subsequence index for which to retrieve the anchored time series chain for
+
+    anchored_chain = stumpy.atsc(
+        left_matrix_profile_index, right_matrix_profile_index, idx
+    )
+
+    all_chain_set, longest_unanchored_chain = stumpy.allc(
+        left_matrix_profile_index, right_matrix_profile_index
+    )
+
+    return anchored_chain, all_chain_set, longest_unanchored_chain
+
+
+def test_semantic_segmentation():
+    rng = np.random.default_rng(42)
+    your_time_series = rng.random(10000)
+    window_size = (
+        50  # Approximately, how many data points might be found in a pattern
+    )
+
+    matrix_profile = stumpy.stump(your_time_series, m=window_size)
+
+    subseq_len = 50
+    return stumpy.fluss(
+        matrix_profile[:, 1], L=subseq_len, n_regimes=2, excl_factor=1
+    )
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy_distributed.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy_distributed.py
new file mode 100644
index 00000000000..37e3cc34856
--- /dev/null
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_stumpy_distributed.py
@@ -0,0 +1,48 @@
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+
+import numpy as np
+import pandas as pd
+import pytest
+import stumpy
+from pandas._testing import assert_equal
+
+from dask.distributed import Client, LocalCluster
+
+
+def stumpy_assert_equal(expected, got):
+    def as_float64(x):
+        if isinstance(x, (tuple, list)):
+            return [as_float64(y) for y in x]
+        else:
+            return x.astype(np.float64)
+
+    assert_equal(as_float64(expected), as_float64(got))
+
+
+pytestmark = pytest.mark.assert_eq(fn=stumpy_assert_equal)
+
+
+# Shared dask client for all tests in this module
+@pytest.fixture(scope="module")
+def dask_client():
+    with LocalCluster(n_workers=4, threads_per_worker=1) as cluster:
+        with Client(cluster) as dask_client:
+            yield dask_client
+
+
+def test_1d_distributed(dask_client):
+    np.random.seed(42)
+    ts = pd.Series(np.random.rand(100))
+    m = 10
+    return stumpy.stumped(dask_client, ts, m)
+
+
+def test_multidimensional_distributed_timeseries(dask_client):
+    np.random.seed(42)
+    # Each row represents data from a different dimension while each column represents
+    # data from the same dimension
+    your_time_series = np.random.rand(3, 1000)
+    # Approximately, how many data points might be found in a pattern
+    window_size = 50
+
+    return stumpy.mstumped(dask_client, your_time_series, m=window_size)
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_tensorflow.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_tensorflow.py
new file mode 100644
index 00000000000..ba1f518cbfd
--- /dev/null
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_tensorflow.py
@@ -0,0 +1,367 @@
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+
+import numpy as np
+import pandas as pd
+import pytest
+import tensorflow as tf
+
+SHUFFLE_BUFFER = 500
+BATCH_SIZE = 2
+
+pytestmark = pytest.mark.assert_eq(fn=pd._testing.assert_equal)
+
+
+@pytest.fixture(scope="module")
+def df():
+    rng = np.random.RandomState(42)
+
+    nrows = 303
+    columns = {
+        "age": rng.randint(29, 78, size=(nrows,), dtype="int64"),
+        "sex": rng.randint(0, 2, size=(nrows,), dtype="int64"),
+        "cp": rng.randint(0, 5, size=(nrows,), dtype="int64"),
+        "trestbps": rng.randint(94, 201, size=(nrows,), dtype="int64"),
+        "chol": rng.randint(126, 565, size=(nrows,), dtype="int64"),
+        "fbs": rng.randint(0, 2, size=(nrows,), dtype="int64"),
+        "restecg": rng.randint(0, 3, size=(nrows,), dtype="int64"),
+        "thalach": rng.randint(71, 203, size=(nrows,), dtype="int64"),
+        "exang": rng.randint(0, 2, size=(nrows,), dtype="int64"),
+        "oldpeak": rng.uniform(0.0, 6.2, size=(nrows,)),
+        "slope": rng.randint(1, 4, size=(nrows,), dtype="int64"),
+        "ca": rng.randint(0, 4, size=(nrows,), dtype="int64"),
+        "thal": rng.choice(
+            ["fixed", "normal", "reversible", "1", "2"], size=(nrows,)
+        ),
+        "target": rng.randint(0, 2, size=(nrows,), dtype="int64"),
+    }
+
+    return pd.DataFrame(columns)
+
+
+@pytest.fixture(scope="module")
+def target(df):
+    return df.pop("target")
+
+
+@pytest.fixture
+def model_gen():
+    def make_model(numeric_features):
+        normalizer = tf.keras.layers.Normalization(axis=-1)
+        normalizer.adapt(numeric_features)
+        model = tf.keras.Sequential(
+            [
+                normalizer,
+                tf.keras.layers.Dense(10, activation="relu"),
+                tf.keras.layers.Dense(1),
+            ]
+        )
+
+        model.compile(
+            optimizer="adam",
+            loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
+            metrics=["accuracy"],
+        )
+        return model
+
+    return make_model
+
+
+def test_dataframe_as_array(model_gen, df, target):
+    tf.keras.utils.set_random_seed(42)
+
+    numeric_feature_names = ["age", "thalach", "trestbps", "chol", "oldpeak"]
+    numeric_features = df[numeric_feature_names]
+
+    numeric_features = tf.convert_to_tensor(
+        numeric_features.values, dtype=tf.float32
+    )
+
+    model = model_gen(numeric_features)
+    model.fit(numeric_features, target, epochs=1, batch_size=BATCH_SIZE)
+
+    test_data = numeric_features[:BATCH_SIZE]
+    return model.predict(test_data)
+
+
+def test_dataframe_as_dataset(model_gen, df, target):
+    tf.keras.utils.set_random_seed(42)
+
+    numeric_feature_names = ["age", "thalach", "trestbps", "chol", "oldpeak"]
+    numeric_features = df[numeric_feature_names]
+
+    numeric_features = tf.convert_to_tensor(
+        numeric_features.values, dtype=tf.float32
+    )
+
+    dataset = tf.data.Dataset.from_tensor_slices((numeric_features, target))
+    dataset = dataset.shuffle(SHUFFLE_BUFFER).batch(BATCH_SIZE)
+
+    model = model_gen(numeric_features)
+    model.fit(dataset, epochs=1)
+
+    test_data = dataset.take(1)
+    return model.predict(test_data)
+
+
+def stack_dict(inputs, func=tf.stack):
+    values = []
+    for key in sorted(inputs.keys()):
+        values.append(CastLayer()(inputs[key]))
+
+    class MyLayer(tf.keras.layers.Layer):
+        def call(self, val):
+            return func(val, axis=-1)
+
+    return MyLayer()(values)
+
+
+def test_dataframe_as_dictionary_with_keras_input_layer(df, target):
+    # ensure deterministic results
+    tf.keras.utils.set_random_seed(42)
+
+    numeric_feature_names = ["age", "thalach", "trestbps", "chol", "oldpeak"]
+    numeric_features = df[numeric_feature_names]
+
+    inputs = {}
+    for name in numeric_features:
+        inputs[name] = tf.keras.Input(shape=(1,), name=name, dtype=tf.float32)
+
+    x = stack_dict(inputs, func=tf.concat)
+
+    normalizer = tf.keras.layers.Normalization(axis=-1)
+    normalizer.adapt(stack_dict(dict(numeric_features)))
+
+    x = normalizer(x)
+    x = tf.keras.layers.Dense(10, activation="relu")(x)
+    x = tf.keras.layers.Dense(1)(x)
+
+    model = tf.keras.Model(inputs, x)
+
+    model.compile(
+        optimizer="adam",
+        loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
+        metrics=["accuracy"],
+        run_eagerly=True,
+    )
+
+    # Train with dictionary of columns as input:
+    model.fit(dict(numeric_features), target, epochs=1, batch_size=BATCH_SIZE)
+
+    # Train with a dataset of dictionary-elements
+    numeric_dict_ds = tf.data.Dataset.from_tensor_slices(
+        (dict(numeric_features), target)
+    )
+    numeric_dict_batches = numeric_dict_ds.shuffle(SHUFFLE_BUFFER).batch(
+        BATCH_SIZE
+    )
+    model.fit(numeric_dict_batches, epochs=1)
+
+    # Predict
+    return model.predict(numeric_dict_batches.take(1))
+
+
+def test_full_example_train_with_ds(df, target):
+    # https://www.tensorflow.org/tutorials/load_data/pandas_dataframe#full_example
+    # Inputs are converted to tf.dataset and then batched
+
+    # ensure deterministic results
+    tf.keras.utils.set_random_seed(42)
+
+    numeric_feature_names = ["age", "thalach", "trestbps", "chol", "oldpeak"]
+    binary_feature_names = ["sex", "fbs", "exang"]
+    categorical_feature_names = ["cp", "restecg", "slope", "thal", "ca"]
+
+    numeric_features = df[numeric_feature_names]
+
+    inputs = {}
+    for name, column in df.items():
+        if isinstance(column[0], str):
+            dtype = tf.string
+        elif name in categorical_feature_names or name in binary_feature_names:
+            dtype = tf.int64
+        else:
+            dtype = tf.float32
+
+        inputs[name] = tf.keras.Input(shape=(), name=name, dtype=dtype)
+
+    preprocessed = []
+
+    # Process binary features
+    for name in binary_feature_names:
+        inp = inputs[name]
+        inp = inp[:, tf.newaxis]
+        float_value = CastLayer()(inp)
+        preprocessed.append(float_value)
+
+    normalizer = tf.keras.layers.Normalization(axis=-1)
+    normalizer.adapt(stack_dict(dict(numeric_features)))
+
+    # Process numeric features
+    numeric_inputs = {}
+    for name in numeric_feature_names:
+        numeric_inputs[name] = inputs[name]
+
+    numeric_inputs = stack_dict(numeric_inputs)
+    numeric_normalized = normalizer(numeric_inputs)
+
+    preprocessed.append(numeric_normalized)
+
+    # Process categorical features
+    for name in categorical_feature_names:
+        vocab = sorted(set(df[name]))
+        print(f"name: {name}")
+        print(f"vocab: {vocab}\n")
+
+        if isinstance(vocab[0], str):
+            lookup = tf.keras.layers.StringLookup(
+                vocabulary=vocab, output_mode="one_hot"
+            )
+        else:
+            lookup = tf.keras.layers.IntegerLookup(
+                vocabulary=vocab, output_mode="one_hot"
+            )
+
+        x = inputs[name][:, tf.newaxis]
+        x = lookup(x)
+        preprocessed.append(x)
+
+    # Concatenate all tensors
+    preprocesssed_result = MyConcatLayer()(preprocessed)
+
+    preprocessor = tf.keras.Model(inputs, preprocesssed_result)
+
+    # Create the model
+    body = tf.keras.Sequential(
+        [
+            tf.keras.layers.Dense(10, activation="relu"),
+            tf.keras.layers.Dense(10, activation="relu"),
+            tf.keras.layers.Dense(1),
+        ]
+    )
+
+    x = preprocessor(inputs)
+    result = body(x)
+
+    model = tf.keras.Model(inputs, result)
+
+    model.compile(
+        optimizer="adam",
+        loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
+        metrics=["accuracy"],
+    )
+
+    ds = tf.data.Dataset.from_tensor_slices((dict(df), target))
+    ds = ds.batch(BATCH_SIZE)
+    model.fit(ds, epochs=1)
+
+    return model.predict(ds.take(1))
+
+
+class CastLayer(tf.keras.layers.Layer):
+    def __init__(self, **kwargs):
+        super(CastLayer, self).__init__(**kwargs)
+
+    def call(self, inp):
+        return tf.cast(inp, tf.float32)
+
+
+class MyConcatLayer(tf.keras.layers.Layer):
+    def call(self, values):
+        values = [tf.cast(v, tf.float32) for v in values]
+        return tf.concat(values, axis=-1)
+
+
+def test_full_example_train_with_df(df, target):
+    # https://www.tensorflow.org/tutorials/load_data/pandas_dataframe#full_example
+    # Inputs are directly passed as dictionary of series
+
+    # ensure deterministic results
+    tf.keras.utils.set_random_seed(42)
+
+    numeric_feature_names = ["age", "thalach", "trestbps", "chol", "oldpeak"]
+    binary_feature_names = ["sex", "fbs", "exang"]
+    categorical_feature_names = ["cp", "restecg", "slope", "thal", "ca"]
+
+    numeric_features = df[numeric_feature_names]
+
+    inputs = {}
+
+    for name, column in df.items():
+        if isinstance(column[0], str):
+            dtype = tf.string
+        elif name in categorical_feature_names or name in binary_feature_names:
+            dtype = tf.int64
+        else:
+            dtype = tf.float32
+
+        inputs[name] = tf.keras.Input(shape=(), name=name, dtype=dtype)
+
+    preprocessed = []
+
+    # Process binary features
+    for name in binary_feature_names:
+        inp = inputs[name]
+        inp = inp[:, tf.newaxis]
+        float_value = CastLayer()(inp)
+        preprocessed.append(float_value)
+
+    normalizer = tf.keras.layers.Normalization(axis=-1)
+    normalizer.adapt(stack_dict(dict(numeric_features)))
+
+    # Process numeric features
+    numeric_inputs = {}
+    for name in numeric_feature_names:
+        numeric_inputs[name] = inputs[name]
+
+    numeric_inputs = stack_dict(numeric_inputs)
+    numeric_normalized = normalizer(numeric_inputs)
+
+    preprocessed.append(numeric_normalized)
+
+    # Process categorical features
+    for name in categorical_feature_names:
+        vocab = sorted(set(df[name]))
+        print(f"name: {name}")
+        print(f"vocab: {vocab}\n")
+
+        if isinstance(vocab[0], str):
+            lookup = tf.keras.layers.StringLookup(
+                vocabulary=vocab, output_mode="one_hot"
+            )
+        else:
+            lookup = tf.keras.layers.IntegerLookup(
+                vocabulary=vocab, output_mode="one_hot"
+            )
+
+        x = inputs[name][:, tf.newaxis]
+        x = lookup(x)
+        preprocessed.append(x)
+
+    # Concatenate all tensors
+    preprocesssed_result = MyConcatLayer()(preprocessed)
+
+    preprocessor = tf.keras.Model(inputs, preprocesssed_result)
+
+    # Create the model
+    body = tf.keras.Sequential(
+        [
+            tf.keras.layers.Dense(10, activation="relu"),
+            tf.keras.layers.Dense(10, activation="relu"),
+            tf.keras.layers.Dense(1),
+        ]
+    )
+
+    x = preprocessor(inputs)
+    result = body(x)
+
+    model = tf.keras.Model(inputs, result)
+
+    model.compile(
+        optimizer="adam",
+        loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
+        metrics=["accuracy"],
+    )
+
+    model.fit(dict(df), target, epochs=1, batch_size=BATCH_SIZE)
+
+    return model.predict(dict(df[:BATCH_SIZE]))
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_xgboost.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_xgboost.py
new file mode 100644
index 00000000000..70f1e6a4250
--- /dev/null
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_xgboost.py
@@ -0,0 +1,135 @@
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+
+from __future__ import annotations
+
+import numpy as np
+import pandas as pd
+import pytest
+import scipy.sparse
+import xgboost as xgb
+from sklearn.datasets import make_regression
+from xgboost.testing import IteratorForTest, make_categorical
+
+n_samples = 128
+n_features = 16
+
+
+def xgboost_assert_equal(expect, got, rtol: float = 1e-7, atol: float = 0.0):
+    if isinstance(expect, (tuple, list)):
+        assert len(expect) == len(got)
+        for e, g in zip(expect, got):
+            xgboost_assert_equal(e, g, rtol, atol)
+    elif isinstance(expect, scipy.sparse.csr_matrix):
+        np.testing.assert_allclose(expect.data, got.data, rtol=rtol, atol=atol)
+        np.testing.assert_equal(expect.indices, got.indices)
+        np.testing.assert_equal(expect.indptr, got.indptr)
+    else:
+        pd._testing.assert_almost_equal(expect, got, rtol=rtol, atol=atol)
+
+
+pytestmark = pytest.mark.assert_eq(fn=xgboost_assert_equal)
+
+
+@pytest.fixture
+def reg_data() -> tuple[np.ndarray, np.ndarray]:
+    X, y = make_regression(n_samples, n_features, random_state=11)
+    return X, y
+
+
+@pytest.fixture
+def reg_batches_data() -> tuple[list[pd.DataFrame], list[pd.Series]]:
+    cov = []
+    res = []
+    for i in range(3):
+        X, y = make_regression(n_samples, n_features, random_state=i + 1)
+        cov.append(pd.DataFrame(X))
+        res.append(pd.Series(y))
+    return cov, res
+
+
+def test_with_dmatrix(
+    reg_data: tuple[np.ndarray, np.ndarray],
+) -> tuple[scipy.sparse.csr_matrix, scipy.sparse.csr_matrix]:
+    """DMatrix is the primary interface for XGBoost."""
+    X, y = reg_data
+    X_df = pd.DataFrame(X)
+    y_ser = pd.Series(y)
+    Xy = xgb.DMatrix(X_df, y_ser)
+    assert Xy.feature_names == list(map(str, X_df.columns))
+    csr_0 = Xy.get_data()
+
+    Xc, yc = make_categorical(
+        n_samples, n_features, n_categories=13, onehot=False
+    )
+    Xy = xgb.DMatrix(Xc, yc, enable_categorical=True)
+    csr_1 = Xy.get_data()
+    return csr_0, csr_1
+
+
+def test_with_quantile_dmatrix(
+    reg_data: tuple[np.ndarray, np.ndarray],
+) -> tuple[scipy.sparse.csr_matrix, scipy.sparse.csr_matrix]:
+    """QuantileDMatrix is an optimization for the `hist` tree method for XGBoost."""
+    from xgboost.testing.data import memory
+
+    memory.clear(warn=False)
+
+    X, y = reg_data
+    X_df = pd.DataFrame(X)
+    y_ser = pd.Series(y)
+    Xy = xgb.QuantileDMatrix(X_df, y_ser)
+    assert Xy.feature_names == list(map(str, X_df.columns))
+    csr_0 = Xy.get_data()
+
+    Xc, yc = make_categorical(
+        n_samples, n_features, n_categories=13, onehot=False
+    )
+    Xy = xgb.QuantileDMatrix(Xc, yc, enable_categorical=True)
+    csr_1 = Xy.get_data()
+    return csr_0, csr_1
+
+
+def test_with_iter_quantile_dmatrix(
+    reg_batches_data: tuple[list[pd.DataFrame], list[pd.DataFrame]],
+) -> scipy.sparse.csr_matrix:
+    """Using iterator to initialize QuantileDMatrix."""
+    cov, res = reg_batches_data
+    it = IteratorForTest(cov, res, w=None, cache=None)
+    Xy = xgb.QuantileDMatrix(it)
+    csr = Xy.get_data()
+    return csr
+
+
+@pytest.mark.parametrize("device", ["cpu", "cuda"])
+def test_with_external_memory(
+    device: str,
+    reg_batches_data: tuple[list[pd.DataFrame], list[pd.DataFrame]],
+) -> np.ndarray:
+    """Test with iterator-based external memory."""
+    cov, res = reg_batches_data
+    it = IteratorForTest(cov, res, w=None, cache="cache")
+    Xy = xgb.DMatrix(it)
+    predt = xgb.train({"device": device}, Xy, num_boost_round=1).predict(Xy)
+    return predt
+
+
+@pytest.mark.parametrize("device", ["cpu", "cuda"])
+def test_predict(device: str) -> np.ndarray:
+    reg = xgb.XGBRegressor(n_estimators=2, device=device)
+    X, y = make_regression(n_samples, n_features, random_state=11)
+    X_df = pd.DataFrame(X)
+    reg.fit(X_df, y)
+    booster = reg.get_booster()
+
+    predt0 = reg.predict(X_df)
+
+    predt1 = booster.inplace_predict(X_df)
+    np.testing.assert_allclose(predt0, predt1)
+
+    predt2 = booster.predict(xgb.DMatrix(X_df))
+    np.testing.assert_allclose(predt0, predt2)
+
+    predt3 = booster.inplace_predict(X)
+    np.testing.assert_allclose(predt0, predt3)
+
+    return predt0

From 23fb31e7af5e768722f640601034a9d490c2e54c Mon Sep 17 00:00:00 2001
From: Jayjeet Chakraborty <jc.github@rediffmail.com>
Date: Thu, 29 Aug 2024 19:27:06 -0700
Subject: [PATCH 756/842] Add a libcudf/thrust-based TPC-H derived datagen
 (#16294)

This PR adds a TPC-H (according to spec 3.0.1) inspired datagen written using `libcudf` and `thrust`

### Implementation Status

- [x] lineitem
- [x] orders
- [x] region
- [x] nation
- [x] supplier
- [x] customer
- [x] part
- [x] partsupp

Authors:
  - Jayjeet Chakraborty (https://github.com/JayjeetAtGithub)
  - Karthikeyan (https://github.com/karthikeyann)

Approvers:
  - Mark Harris (https://github.com/harrism)
  - Yunsong Wang (https://github.com/PointKernel)
  - Karthikeyan (https://github.com/karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/16294
---
 cpp/benchmarks/CMakeLists.txt                 |  24 +
 .../random_column_generator.cu                | 246 +++++
 .../random_column_generator.hpp               | 150 +++
 .../tpch_data_generator/table_helpers.cpp     | 386 +++++++
 .../tpch_data_generator/table_helpers.hpp     | 155 +++
 .../tpch_data_generator.cpp                   | 987 ++++++++++++++++++
 .../tpch_data_generator.hpp                   |  94 ++
 7 files changed, 2042 insertions(+)
 create mode 100644 cpp/benchmarks/common/tpch_data_generator/random_column_generator.cu
 create mode 100644 cpp/benchmarks/common/tpch_data_generator/random_column_generator.hpp
 create mode 100644 cpp/benchmarks/common/tpch_data_generator/table_helpers.cpp
 create mode 100644 cpp/benchmarks/common/tpch_data_generator/table_helpers.hpp
 create mode 100644 cpp/benchmarks/common/tpch_data_generator/tpch_data_generator.cpp
 create mode 100644 cpp/benchmarks/common/tpch_data_generator/tpch_data_generator.hpp

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 99ef9e2976f..d2c22b788cb 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -35,6 +35,30 @@ target_include_directories(
          "$<BUILD_INTERFACE:${CUDF_SOURCE_DIR}/src>"
 )
 
+add_library(
+  tpch_data_generator STATIC
+  common/tpch_data_generator/tpch_data_generator.cpp common/tpch_data_generator/table_helpers.cpp
+  common/tpch_data_generator/random_column_generator.cu
+)
+target_compile_features(tpch_data_generator PUBLIC cxx_std_17 cuda_std_17)
+
+target_compile_options(
+  tpch_data_generator PUBLIC "$<$<COMPILE_LANGUAGE:CXX>:${CUDF_CXX_FLAGS}>"
+                             "$<$<COMPILE_LANGUAGE:CUDA>:${CUDF_CUDA_FLAGS}>"
+)
+
+target_link_libraries(
+  tpch_data_generator
+  PUBLIC cudf cudftestutil nvtx3::nvtx3-cpp
+  PRIVATE $<TARGET_NAME_IF_EXISTS:conda_env>
+)
+
+target_include_directories(
+  tpch_data_generator
+  PUBLIC "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>" "$<BUILD_INTERFACE:${CUDF_SOURCE_DIR}>"
+         "$<BUILD_INTERFACE:${CUDF_SOURCE_DIR}/src>"
+)
+
 # ##################################################################################################
 # * compiler function -----------------------------------------------------------------------------
 
diff --git a/cpp/benchmarks/common/tpch_data_generator/random_column_generator.cu b/cpp/benchmarks/common/tpch_data_generator/random_column_generator.cu
new file mode 100644
index 00000000000..4246bd1a83b
--- /dev/null
+++ b/cpp/benchmarks/common/tpch_data_generator/random_column_generator.cu
@@ -0,0 +1,246 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "random_column_generator.hpp"
+
+#include <cudf_test/column_wrapper.hpp>
+
+#include <cudf/binaryop.hpp>
+#include <cudf/detail/iterator.cuh>
+#include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/filling.hpp>
+#include <cudf/strings/detail/strings_children.cuh>
+
+#include <rmm/exec_policy.hpp>
+
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/random.h>
+#include <thrust/transform.h>
+
+#include <string>
+
+namespace cudf::datagen {
+
+namespace {
+
+// Functor for generating random strings
+struct random_string_generator {
+  char* chars;
+  thrust::default_random_engine engine;
+  thrust::uniform_int_distribution<unsigned char> char_dist;
+
+  CUDF_HOST_DEVICE random_string_generator(char* c) : chars(c), char_dist(44, 122) {}
+
+  __device__ void operator()(thrust::tuple<int64_t, int64_t> str_begin_end)
+  {
+    auto begin = thrust::get<0>(str_begin_end);
+    auto end   = thrust::get<1>(str_begin_end);
+    engine.discard(begin);
+    for (auto i = begin; i < end; ++i) {
+      auto ch = char_dist(engine);
+      if (i == end - 1 && ch >= '\x7F') ch = ' ';  // last element ASCII only.
+      if (ch >= '\x7F')                            // x7F is at the top edge of ASCII
+        chars[i++] = '\xC4';                       // these characters are assigned two bytes
+      chars[i] = static_cast<char>(ch + (ch >= '\x7F'));
+    }
+  }
+};
+
+// Functor for generating random numbers
+template <typename T>
+struct random_number_generator {
+  T lower;
+  T upper;
+
+  CUDF_HOST_DEVICE random_number_generator(T lower, T upper) : lower(lower), upper(upper) {}
+
+  __device__ T operator()(const int64_t idx) const
+  {
+    if constexpr (cudf::is_integral<T>()) {
+      thrust::default_random_engine engine;
+      thrust::uniform_int_distribution<T> dist(lower, upper);
+      engine.discard(idx);
+      return dist(engine);
+    } else {
+      thrust::default_random_engine engine;
+      thrust::uniform_real_distribution<T> dist(lower, upper);
+      engine.discard(idx);
+      return dist(engine);
+    }
+  }
+};
+
+}  // namespace
+
+std::unique_ptr<cudf::column> generate_random_string_column(cudf::size_type lower,
+                                                            cudf::size_type upper,
+                                                            cudf::size_type num_rows,
+                                                            rmm::cuda_stream_view stream,
+                                                            rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  auto offsets_begin = cudf::detail::make_counting_transform_iterator(
+    0, random_number_generator<cudf::size_type>(lower, upper));
+  auto [offsets_column, computed_bytes] = cudf::strings::detail::make_offsets_child_column(
+    offsets_begin, offsets_begin + num_rows, stream, mr);
+  rmm::device_uvector<char> chars(computed_bytes, stream);
+
+  auto const offset_itr =
+    cudf::detail::offsetalator_factory::make_input_iterator(offsets_column->view());
+
+  // We generate the strings in parallel into the `chars` vector using the
+  // offsets vector generated above.
+  thrust::for_each_n(rmm::exec_policy(stream),
+                     thrust::make_zip_iterator(offset_itr, offset_itr + 1),
+                     num_rows,
+                     random_string_generator(chars.data()));
+
+  return cudf::make_strings_column(
+    num_rows, std::move(offsets_column), chars.release(), 0, rmm::device_buffer{});
+}
+
+template <typename T>
+std::unique_ptr<cudf::column> generate_random_numeric_column(T lower,
+                                                             T upper,
+                                                             cudf::size_type num_rows,
+                                                             rmm::cuda_stream_view stream,
+                                                             rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  auto col = cudf::make_numeric_column(
+    cudf::data_type{cudf::type_to_id<T>()}, num_rows, cudf::mask_state::UNALLOCATED, stream, mr);
+  cudf::size_type begin = 0;
+  cudf::size_type end   = num_rows;
+  thrust::transform(rmm::exec_policy(stream),
+                    thrust::make_counting_iterator(begin),
+                    thrust::make_counting_iterator(end),
+                    col->mutable_view().begin<T>(),
+                    random_number_generator<T>(lower, upper));
+  return col;
+}
+
+template std::unique_ptr<cudf::column> generate_random_numeric_column<int8_t>(
+  int8_t lower,
+  int8_t upper,
+  cudf::size_type num_rows,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr);
+
+template std::unique_ptr<cudf::column> generate_random_numeric_column<int16_t>(
+  int16_t lower,
+  int16_t upper,
+  cudf::size_type num_rows,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr);
+
+template std::unique_ptr<cudf::column> generate_random_numeric_column<cudf::size_type>(
+  cudf::size_type lower,
+  cudf::size_type upper,
+  cudf::size_type num_rows,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr);
+
+template std::unique_ptr<cudf::column> generate_random_numeric_column<double>(
+  double lower,
+  double upper,
+  cudf::size_type num_rows,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr);
+
+std::unique_ptr<cudf::column> generate_primary_key_column(cudf::scalar const& start,
+                                                          cudf::size_type num_rows,
+                                                          rmm::cuda_stream_view stream,
+                                                          rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  return cudf::sequence(num_rows, start, stream, mr);
+}
+
+std::unique_ptr<cudf::column> generate_repeat_string_column(std::string const& value,
+                                                            cudf::size_type num_rows,
+                                                            rmm::cuda_stream_view stream,
+                                                            rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  auto const scalar = cudf::string_scalar(value);
+  return cudf::make_column_from_scalar(scalar, num_rows, stream, mr);
+}
+
+std::unique_ptr<cudf::column> generate_random_string_column_from_set(
+  cudf::host_span<const char* const> set,
+  cudf::size_type num_rows,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  // Build a gather map of random strings to choose from
+  // The size of the string sets always fits within 16-bit integers
+  auto const indices =
+    generate_primary_key_column(cudf::numeric_scalar<int16_t>(0), set.size(), stream, mr);
+  auto const keys       = cudf::test::strings_column_wrapper(set.begin(), set.end()).release();
+  auto const gather_map = cudf::table_view({indices->view(), keys->view()});
+
+  // Build a column of random keys to gather from the set
+  auto const gather_keys =
+    generate_random_numeric_column<int16_t>(0, set.size() - 1, num_rows, stream, mr);
+
+  // Perform the gather operation
+  auto const gathered_table = cudf::gather(
+    gather_map, gather_keys->view(), cudf::out_of_bounds_policy::DONT_CHECK, stream, mr);
+  auto gathered_table_columns = gathered_table->release();
+  return std::move(gathered_table_columns[1]);
+}
+
+template <typename T>
+std::unique_ptr<cudf::column> generate_repeat_sequence_column(T seq_length,
+                                                              bool zero_indexed,
+                                                              cudf::size_type num_rows,
+                                                              rmm::cuda_stream_view stream,
+                                                              rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  auto pkey =
+    generate_primary_key_column(cudf::numeric_scalar<cudf::size_type>(0), num_rows, stream, mr);
+  auto repeat_seq_zero_indexed = cudf::binary_operation(pkey->view(),
+                                                        cudf::numeric_scalar<T>(seq_length),
+                                                        cudf::binary_operator::MOD,
+                                                        cudf::data_type{cudf::type_to_id<T>()},
+                                                        stream,
+                                                        mr);
+  if (zero_indexed) { return repeat_seq_zero_indexed; }
+  return cudf::binary_operation(repeat_seq_zero_indexed->view(),
+                                cudf::numeric_scalar<T>(1),
+                                cudf::binary_operator::ADD,
+                                cudf::data_type{cudf::type_to_id<T>()},
+                                stream,
+                                mr);
+}
+
+template std::unique_ptr<cudf::column> generate_repeat_sequence_column<int8_t>(
+  int8_t seq_length,
+  bool zero_indexed,
+  cudf::size_type num_rows,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr);
+
+template std::unique_ptr<cudf::column> generate_repeat_sequence_column<cudf::size_type>(
+  cudf::size_type seq_length,
+  bool zero_indexed,
+  cudf::size_type num_rows,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr);
+
+}  // namespace cudf::datagen
diff --git a/cpp/benchmarks/common/tpch_data_generator/random_column_generator.hpp b/cpp/benchmarks/common/tpch_data_generator/random_column_generator.hpp
new file mode 100644
index 00000000000..3e254f49805
--- /dev/null
+++ b/cpp/benchmarks/common/tpch_data_generator/random_column_generator.hpp
@@ -0,0 +1,150 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/column/column.hpp>
+
+#include <string>
+
+namespace cudf::datagen {
+
+/**
+ * @brief Generate a column of random strings
+ *
+ * @param lower The lower bound of the length of the strings
+ * @param upper The upper bound of the length of the strings
+ * @param num_rows The number of rows in the column
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+std::unique_ptr<cudf::column> generate_random_string_column(
+  cudf::size_type lower,
+  cudf::size_type upper,
+  cudf::size_type num_rows,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Generate a column of random numbers
+ *
+ * Example:
+ *
+ * lower = 10
+ * upper = 15
+ * num_rows = 10
+ * result = [10, 11, 14, 14, 13, 12, 11, 11, 12, 14]
+
+ *
+ * @param lower The lower bound of the random numbers
+ * @param upper The upper bound of the random numbers
+ * @param num_rows The number of rows in the column
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+template <typename T>
+std::unique_ptr<cudf::column> generate_random_numeric_column(
+  T lower,
+  T upper,
+  cudf::size_type num_rows,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Generate a primary key column
+ *
+ * Example:
+ *
+ * start = 1
+ * num_rows = 10
+ * result = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+ *
+ * @param start The starting value of the primary key
+ * @param num_rows The number of rows in the column
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+std::unique_ptr<cudf::column> generate_primary_key_column(
+  cudf::scalar const& start,
+  cudf::size_type num_rows,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Generate a column where all the rows have the same string value
+ *
+ * Example:
+ *
+ * value = "abc"
+ * num_rows = 5
+ * result = ["abc", "abc", "abc", "abc", "abc"]
+ *
+ * @param value The string value to fill the column with
+ * @param num_rows The number of rows in the column
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+std::unique_ptr<cudf::column> generate_repeat_string_column(
+  std::string const& value,
+  cudf::size_type num_rows,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Generate a column by randomly choosing from set of strings
+ *
+ * Example:
+ *
+ * set = {"s1", "s2", "s3"}
+ * num_rows = 10
+ * result = ["s1", "s2", "s2", "s1", "s3", "s3", "s3", "s2", "s1", "s1"]
+ *
+ * @param set The set of strings to choose from
+ * @param num_rows The number of rows in the column
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+std::unique_ptr<cudf::column> generate_random_string_column_from_set(
+  cudf::host_span<const char* const> set,
+  cudf::size_type num_rows,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Generate a column consisting of a repeating sequence of integers
+ *
+ * Example:
+ *
+ * seq_length = 3
+ * zero_indexed = false
+ * num_rows = 10
+ * result = [1, 2, 3, 1, 2, 3, 1, 2, 3, 1]
+ *
+ * @param seq_length The length of the repeating sequence
+ * @param zero_indexed Whether the sequence is zero or one indexed
+ * @param num_rows The number of rows in the column
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+template <typename T>
+std::unique_ptr<cudf::column> generate_repeat_sequence_column(
+  T seq_length,
+  bool zero_indexed,
+  cudf::size_type num_rows,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+
+}  // namespace cudf::datagen
diff --git a/cpp/benchmarks/common/tpch_data_generator/table_helpers.cpp b/cpp/benchmarks/common/tpch_data_generator/table_helpers.cpp
new file mode 100644
index 00000000000..36bf9c49cea
--- /dev/null
+++ b/cpp/benchmarks/common/tpch_data_generator/table_helpers.cpp
@@ -0,0 +1,386 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "table_helpers.hpp"
+
+#include "random_column_generator.hpp"
+
+#include <cudf/aggregation.hpp>
+#include <cudf/ast/detail/operators.hpp>
+#include <cudf/ast/expressions.hpp>
+#include <cudf/binaryop.hpp>
+#include <cudf/column/column_factories.hpp>
+#include <cudf/copying.hpp>
+#include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/filling.hpp>
+#include <cudf/join.hpp>
+#include <cudf/reduction.hpp>
+#include <cudf/scalar/scalar.hpp>
+#include <cudf/strings/combine.hpp>
+#include <cudf/strings/convert/convert_integers.hpp>
+#include <cudf/table/table_view.hpp>
+#include <cudf/transform.hpp>
+#include <cudf/unary.hpp>
+
+#include <vector>
+
+namespace cudf::datagen {
+
+/**
+ * @brief Add a column of days to a column of timestamp_days
+ *
+ * @param timestamp_days The column of timestamp_days
+ * @param days The column of days to add
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+std::unique_ptr<cudf::column> add_calendrical_days(cudf::column_view const& timestamp_days,
+                                                   cudf::column_view const& days,
+                                                   rmm::cuda_stream_view stream,
+                                                   rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  auto const days_duration_type = cudf::cast(days, cudf::data_type{cudf::type_id::DURATION_DAYS});
+  auto const data_type          = cudf::data_type{cudf::type_id::TIMESTAMP_DAYS};
+  return cudf::binary_operation(
+    timestamp_days, days_duration_type->view(), cudf::binary_operator::ADD, data_type, stream, mr);
+}
+
+/**
+ * @brief Perform a left join operation between two tables
+ *
+ * @param left_input The left table
+ * @param right_input The right table
+ * @param left_on The indices of the columns to join on in the left table
+ * @param right_on The indices of the columns to join on in the right table
+ * @param compare_nulls The null equality comparison
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned table's device memory
+ */
+std::unique_ptr<cudf::table> perform_left_join(cudf::table_view const& left_input,
+                                               cudf::table_view const& right_input,
+                                               std::vector<cudf::size_type> const& left_on,
+                                               std::vector<cudf::size_type> const& right_on,
+                                               rmm::cuda_stream_view stream,
+                                               rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  constexpr auto oob_policy = cudf::out_of_bounds_policy::NULLIFY;
+  auto const left_selected  = left_input.select(left_on);
+  auto const right_selected = right_input.select(right_on);
+  auto const [left_join_indices, right_join_indices] =
+    cudf::left_join(left_selected, right_selected, cudf::null_equality::EQUAL, mr);
+
+  auto const left_indices_span  = cudf::device_span<cudf::size_type const>{*left_join_indices};
+  auto const right_indices_span = cudf::device_span<cudf::size_type const>{*right_join_indices};
+
+  auto const left_indices_col  = cudf::column_view{left_indices_span};
+  auto const right_indices_col = cudf::column_view{right_indices_span};
+
+  auto const left_result  = cudf::gather(left_input, left_indices_col, oob_policy, stream, mr);
+  auto const right_result = cudf::gather(right_input, right_indices_col, oob_policy, stream, mr);
+
+  auto joined_cols = left_result->release();
+  auto right_cols  = right_result->release();
+  joined_cols.insert(joined_cols.end(),
+                     std::make_move_iterator(right_cols.begin()),
+                     std::make_move_iterator(right_cols.end()));
+  return std::make_unique<cudf::table>(std::move(joined_cols));
+}
+
+/**
+ * @brief Generate the `p_retailprice` column of the `part` table
+ *
+ * @param p_partkey The `p_partkey` column of the `part` table
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+[[nodiscard]] std::unique_ptr<cudf::column> calculate_p_retailprice(
+  cudf::column_view const& p_partkey,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  // Expression: (90000 + ((p_partkey/10) modulo 20001) + 100 * (p_partkey modulo 1000)) / 100
+  auto table             = cudf::table_view({p_partkey});
+  auto p_partkey_col_ref = cudf::ast::column_reference(0);
+
+  auto scalar_10    = cudf::numeric_scalar<cudf::size_type>(10);
+  auto scalar_100   = cudf::numeric_scalar<cudf::size_type>(100);
+  auto scalar_1000  = cudf::numeric_scalar<cudf::size_type>(1000);
+  auto scalar_20001 = cudf::numeric_scalar<cudf::size_type>(20001);
+  auto scalar_90000 = cudf::numeric_scalar<cudf::size_type>(90000);
+
+  auto literal_10    = cudf::ast::literal(scalar_10);
+  auto literal_100   = cudf::ast::literal(scalar_100);
+  auto literal_1000  = cudf::ast::literal(scalar_1000);
+  auto literal_20001 = cudf::ast::literal(scalar_20001);
+  auto literal_90000 = cudf::ast::literal(scalar_90000);
+
+  auto expr_a = cudf::ast::operation(cudf::ast::ast_operator::DIV, p_partkey_col_ref, literal_10);
+  auto expr_b = cudf::ast::operation(cudf::ast::ast_operator::MOD, expr_a, literal_20001);
+  auto expr_c = cudf::ast::operation(cudf::ast::ast_operator::MOD, p_partkey_col_ref, literal_1000);
+  auto expr_d = cudf::ast::operation(cudf::ast::ast_operator::MUL, expr_c, literal_100);
+  auto expr_e = cudf::ast::operation(cudf::ast::ast_operator::ADD, expr_b, expr_d);
+  auto expr_f = cudf::ast::operation(cudf::ast::ast_operator::ADD, expr_e, literal_90000);
+  auto final_expr = cudf::ast::operation(cudf::ast::ast_operator::TRUE_DIV, expr_f, literal_100);
+
+  // Execute the AST expression
+  return cudf::compute_column(table, final_expr, stream, mr);
+}
+
+/**
+ * @brief Generate the `l_suppkey` column of the `lineitem` table
+ *
+ * @param l_partkey The `l_partkey` column of the `lineitem` table
+ * @param scale_factor The scale factor to use
+ * @param num_rows The number of rows in the `lineitem` table
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+[[nodiscard]] std::unique_ptr<cudf::column> calculate_l_suppkey(cudf::column_view const& l_partkey,
+                                                                cudf::size_type scale_factor,
+                                                                cudf::size_type num_rows,
+                                                                rmm::cuda_stream_view stream,
+                                                                rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  // Expression: (l_partkey + (i * (s/4 + (int)(l_partkey - 1)/s))) % s + 1
+
+  // Generate the `s` col
+  auto s_empty = cudf::make_numeric_column(
+    cudf::data_type{cudf::type_id::INT32}, num_rows, cudf::mask_state::UNALLOCATED, stream);
+
+  auto s = cudf::fill(s_empty->view(),
+                      0,
+                      num_rows,
+                      cudf::numeric_scalar<cudf::size_type>(scale_factor * 10'000),
+                      stream,
+                      mr);
+
+  // Generate the `i` col
+  auto i = generate_repeat_sequence_column<cudf::size_type>(4, true, num_rows, stream, mr);
+
+  // Create a table view out of `l_partkey`, `s`, and `i`
+  auto table = cudf::table_view({l_partkey, s->view(), i->view()});
+
+  // Create the AST expression
+  auto scalar_1  = cudf::numeric_scalar<cudf::size_type>(1);
+  auto scalar_4  = cudf::numeric_scalar<cudf::size_type>(4);
+  auto literal_1 = cudf::ast::literal(scalar_1);
+  auto literal_4 = cudf::ast::literal(scalar_4);
+
+  auto l_partkey_col_ref = cudf::ast::column_reference(0);
+  auto s_col_ref         = cudf::ast::column_reference(1);
+  auto i_col_ref         = cudf::ast::column_reference(2);
+
+  // (int)(l_partkey - 1)/s
+  auto expr_a = cudf::ast::operation(cudf::ast::ast_operator::SUB, l_partkey_col_ref, literal_1);
+  auto expr_b = cudf::ast::operation(cudf::ast::ast_operator::DIV, expr_a, s_col_ref);
+
+  // s/4
+  auto expr_c = cudf::ast::operation(cudf::ast::ast_operator::DIV, s_col_ref, literal_4);
+
+  // (s/4 + (int)(l_partkey - 1)/s)
+  auto expr_d = cudf::ast::operation(cudf::ast::ast_operator::ADD, expr_c, expr_b);
+
+  // (i * (s/4 + (int)(l_partkey - 1)/s))
+  auto expr_e = cudf::ast::operation(cudf::ast::ast_operator::MUL, i_col_ref, expr_d);
+
+  // (l_partkey + (i * (s/4 + (int)(l_partkey - 1)/s)))
+  auto expr_f = cudf::ast::operation(cudf::ast::ast_operator::ADD, l_partkey_col_ref, expr_e);
+
+  // (l_partkey + (i * (s/4 + (int)(l_partkey - 1)/s))) % s
+  auto expr_g = cudf::ast::operation(cudf::ast::ast_operator::MOD, expr_f, s_col_ref);
+
+  // (l_partkey + (i * (s/4 + (int)(l_partkey - 1)/s))) % s + 1
+  auto final_expr = cudf::ast::operation(cudf::ast::ast_operator::ADD, expr_g, literal_1);
+
+  // Execute the AST expression
+  return cudf::compute_column(table, final_expr, stream, mr);
+}
+
+/**
+ * @brief Generate the `ps_suppkey` column of the `partsupp` table
+ *
+ * @param ps_partkey The `ps_partkey` column of the `partsupp` table
+ * @param scale_factor The scale factor to use
+ * @param num_rows The number of rows in the `partsupp` table
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+[[nodiscard]] std::unique_ptr<cudf::column> calculate_ps_suppkey(
+  cudf::column_view const& ps_partkey,
+  cudf::size_type scale_factor,
+  cudf::size_type num_rows,
+  rmm::cuda_stream_view stream,
+  rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  // Expression: ps_suppkey = (ps_partkey + (i * (s/4 + (int)(ps_partkey - 1)/s))) % s + 1
+
+  // Generate the `s` col
+  auto s_empty = cudf::make_numeric_column(
+    cudf::data_type{cudf::type_id::INT32}, num_rows, cudf::mask_state::UNALLOCATED, stream);
+
+  auto s = cudf::fill(s_empty->view(),
+                      0,
+                      num_rows,
+                      cudf::numeric_scalar<cudf::size_type>(scale_factor * 10'000),
+                      stream,
+                      mr);
+
+  // Generate the `i` col
+  auto i = generate_repeat_sequence_column<cudf::size_type>(4, true, num_rows, stream, mr);
+
+  // Create a table view out of `p_partkey`, `s`, and `i`
+  auto table = cudf::table_view({ps_partkey, s->view(), i->view()});
+
+  // Create the AST expression
+  auto scalar_1  = cudf::numeric_scalar<cudf::size_type>(1);
+  auto scalar_4  = cudf::numeric_scalar<cudf::size_type>(4);
+  auto literal_1 = cudf::ast::literal(scalar_1);
+  auto literal_4 = cudf::ast::literal(scalar_4);
+
+  auto ps_partkey_col_ref = cudf::ast::column_reference(0);
+  auto s_col_ref          = cudf::ast::column_reference(1);
+  auto i_col_ref          = cudf::ast::column_reference(2);
+
+  // (int)(ps_partkey - 1)/s
+  auto expr_a = cudf::ast::operation(cudf::ast::ast_operator::SUB, ps_partkey_col_ref, literal_1);
+  auto expr_b = cudf::ast::operation(cudf::ast::ast_operator::DIV, expr_a, s_col_ref);
+
+  // s/4
+  auto expr_c = cudf::ast::operation(cudf::ast::ast_operator::DIV, s_col_ref, literal_4);
+
+  // (s/4 + (int)(ps_partkey - 1)/s)
+  auto expr_d = cudf::ast::operation(cudf::ast::ast_operator::ADD, expr_c, expr_b);
+
+  // (i * (s/4 + (int)(ps_partkey - 1)/s))
+  auto expr_e = cudf::ast::operation(cudf::ast::ast_operator::MUL, i_col_ref, expr_d);
+
+  // (ps_partkey + (i * (s/4 + (int)(ps_partkey - 1)/s)))
+  auto expr_f = cudf::ast::operation(cudf::ast::ast_operator::ADD, ps_partkey_col_ref, expr_e);
+
+  // (ps_partkey + (i * (s/4 + (int)(ps_partkey - 1)/s))) % s
+  auto expr_g = cudf::ast::operation(cudf::ast::ast_operator::MOD, expr_f, s_col_ref);
+
+  // (ps_partkey + (i * (s/4 + (int)(ps_partkey - 1)/s))) % s + 1
+  auto final_expr = cudf::ast::operation(cudf::ast::ast_operator::ADD, expr_g, literal_1);
+
+  // Execute the AST expression
+  return cudf::compute_column(table, final_expr, stream, mr);
+}
+
+/**
+ * @brief Calculate the cardinality of the `lineitem` table
+ *
+ * @param o_rep_freqs The frequency of each `o_orderkey` value in the `lineitem` table
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+[[nodiscard]] cudf::size_type calculate_l_cardinality(cudf::column_view const& o_rep_freqs,
+                                                      rmm::cuda_stream_view stream,
+                                                      rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  auto const sum_agg = cudf::make_sum_aggregation<cudf::reduce_aggregation>();
+  auto const l_num_rows_scalar =
+    cudf::reduce(o_rep_freqs, *sum_agg, cudf::data_type{cudf::type_id::INT32}, stream, mr);
+  return reinterpret_cast<cudf::numeric_scalar<cudf::size_type>*>(l_num_rows_scalar.get())
+    ->value(stream);
+}
+
+/**
+ * @brief Calculate the charge column for the `lineitem` table
+ *
+ * @param extendedprice The `l_extendedprice` column
+ * @param tax The `l_tax` column
+ * @param discount The `l_discount` column
+ * @param stream The CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+[[nodiscard]] std::unique_ptr<cudf::column> calculate_charge(cudf::column_view const& extendedprice,
+                                                             cudf::column_view const& tax,
+                                                             cudf::column_view const& discount,
+                                                             rmm::cuda_stream_view stream,
+                                                             rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  auto const one                = cudf::numeric_scalar<double>(1);
+  auto const one_minus_discount = cudf::binary_operation(
+    one, discount, cudf::binary_operator::SUB, cudf::data_type{cudf::type_id::FLOAT64}, stream, mr);
+  auto disc_price = cudf::binary_operation(extendedprice,
+                                           one_minus_discount->view(),
+                                           cudf::binary_operator::MUL,
+                                           cudf::data_type{cudf::type_id::FLOAT64},
+                                           stream,
+                                           mr);
+  auto const one_plus_tax =
+    cudf::binary_operation(one, tax, cudf::binary_operator::ADD, tax.type(), stream, mr);
+  return cudf::binary_operation(disc_price->view(),
+                                one_plus_tax->view(),
+                                cudf::binary_operator::MUL,
+                                cudf::data_type{cudf::type_id::FLOAT64},
+                                stream,
+                                mr);
+}
+
+/**
+ * @brief Generate a column of random addresses according to TPC-H specification clause 4.2.2.7
+ *
+ * @param num_rows The number of rows in the column
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+[[nodiscard]] std::unique_ptr<cudf::column> generate_address_column(
+  cudf::size_type num_rows, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  return generate_random_string_column(10, 40, num_rows, stream, mr);
+}
+
+/**
+ * @brief Generate a phone number column according to TPC-H specification clause 4.2.2.9
+ *
+ * @param num_rows The number of rows in the column
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+[[nodiscard]] std::unique_ptr<cudf::column> generate_phone_column(cudf::size_type num_rows,
+                                                                  rmm::cuda_stream_view stream,
+                                                                  rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  auto const part_a = cudf::strings::from_integers(
+    generate_random_numeric_column<int16_t>(10, 34, num_rows, stream, mr)->view());
+  auto const part_b = cudf::strings::from_integers(
+    generate_random_numeric_column<int16_t>(100, 999, num_rows, stream, mr)->view());
+  auto const part_c = cudf::strings::from_integers(
+    generate_random_numeric_column<int16_t>(100, 999, num_rows, stream, mr)->view());
+  auto const part_d = cudf::strings::from_integers(
+    generate_random_numeric_column<int16_t>(1000, 9999, num_rows, stream, mr)->view());
+  auto const phone_parts_table =
+    cudf::table_view({part_a->view(), part_b->view(), part_c->view(), part_d->view()});
+  return cudf::strings::concatenate(phone_parts_table,
+                                    cudf::string_scalar("-"),
+                                    cudf::string_scalar("", false),
+                                    cudf::strings::separator_on_nulls::NO,
+                                    stream,
+                                    mr);
+}
+
+}  // namespace cudf::datagen
diff --git a/cpp/benchmarks/common/tpch_data_generator/table_helpers.hpp b/cpp/benchmarks/common/tpch_data_generator/table_helpers.hpp
new file mode 100644
index 00000000000..11091689469
--- /dev/null
+++ b/cpp/benchmarks/common/tpch_data_generator/table_helpers.hpp
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/column/column.hpp>
+#include <cudf/column/column_view.hpp>
+#include <cudf/table/table.hpp>
+#include <cudf/table/table_view.hpp>
+
+#include <vector>
+
+namespace cudf::datagen {
+
+/**
+ * @brief Add a column of days to a column of timestamp_days
+ *
+ * @param timestamp_days The column of timestamp_days
+ * @param days The column of days to add
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+std::unique_ptr<cudf::column> add_calendrical_days(
+  cudf::column_view const& timestamp_days,
+  cudf::column_view const& days,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Perform a left join operation between two tables
+ *
+ * @param left_input The left table
+ * @param right_input The right table
+ * @param left_on The indices of the columns to join on in the left table
+ * @param right_on The indices of the columns to join on in the right table
+ * @param compare_nulls The null equality comparison
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned table's device memory
+ */
+std::unique_ptr<cudf::table> perform_left_join(
+  cudf::table_view const& left_input,
+  cudf::table_view const& right_input,
+  std::vector<cudf::size_type> const& left_on,
+  std::vector<cudf::size_type> const& right_on,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Generate the `p_retailprice` column of the `part` table
+ *
+ * @param p_partkey The `p_partkey` column of the `part` table
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+[[nodiscard]] std::unique_ptr<cudf::column> calculate_p_retailprice(
+  cudf::column_view const& p_partkey,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Generate the `l_suppkey` column of the `lineitem` table
+ *
+ * @param l_partkey The `l_partkey` column of the `lineitem` table
+ * @param scale_factor The scale factor to use
+ * @param num_rows The number of rows in the `lineitem` table
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+[[nodiscard]] std::unique_ptr<cudf::column> calculate_l_suppkey(
+  cudf::column_view const& l_partkey,
+  cudf::size_type scale_factor,
+  cudf::size_type num_rows,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Generate the `ps_suppkey` column of the `partsupp` table
+ *
+ * @param ps_partkey The `ps_partkey` column of the `partsupp` table
+ * @param scale_factor The scale factor to use
+ * @param num_rows The number of rows in the `partsupp` table
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+[[nodiscard]] std::unique_ptr<cudf::column> calculate_ps_suppkey(
+  cudf::column_view const& ps_partkey,
+  cudf::size_type scale_factor,
+  cudf::size_type num_rows,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+/**
+ * @brief Calculate the cardinality of the `lineitem` table
+ *
+ * @param o_rep_freqs The frequency of each `o_orderkey` value in the `lineitem` table
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+[[nodiscard]] cudf::size_type calculate_l_cardinality(
+  cudf::column_view const& o_rep_freqs,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+/**
+ * @brief Calculate the charge column for the `lineitem` table
+ *
+ * @param extendedprice The `l_extendedprice` column
+ * @param tax The `l_tax` column
+ * @param discount The `l_discount` column
+ * @param stream The CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+[[nodiscard]] std::unique_ptr<cudf::column> calculate_charge(
+  cudf::column_view const& extendedprice,
+  cudf::column_view const& tax,
+  cudf::column_view const& discount,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Generate a column of random addresses according to TPC-H specification clause 4.2.2.7
+ *
+ * @param num_rows The number of rows in the column
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+[[nodiscard]] std::unique_ptr<cudf::column> generate_address_column(
+  cudf::size_type num_rows,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Generate a phone number column according to TPC-H specification clause 4.2.2.9
+ *
+ * @param num_rows The number of rows in the column
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+[[nodiscard]] std::unique_ptr<cudf::column> generate_phone_column(
+  cudf::size_type num_rows,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+
+}  // namespace cudf::datagen
diff --git a/cpp/benchmarks/common/tpch_data_generator/tpch_data_generator.cpp b/cpp/benchmarks/common/tpch_data_generator/tpch_data_generator.cpp
new file mode 100644
index 00000000000..9001c50c5a5
--- /dev/null
+++ b/cpp/benchmarks/common/tpch_data_generator/tpch_data_generator.cpp
@@ -0,0 +1,987 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "tpch_data_generator.hpp"
+
+#include "random_column_generator.hpp"
+#include "table_helpers.hpp"
+
+#include <cudf_test/column_wrapper.hpp>
+
+#include <cudf/ast/detail/operators.hpp>
+#include <cudf/ast/expressions.hpp>
+#include <cudf/binaryop.hpp>
+#include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/filling.hpp>
+#include <cudf/groupby.hpp>
+#include <cudf/round.hpp>
+#include <cudf/sorting.hpp>
+#include <cudf/strings/combine.hpp>
+#include <cudf/strings/convert/convert_datetime.hpp>
+#include <cudf/strings/convert/convert_integers.hpp>
+#include <cudf/strings/padding.hpp>
+#include <cudf/transform.hpp>
+#include <cudf/unary.hpp>
+
+#include <array>
+#include <string>
+#include <vector>
+
+namespace cudf::datagen {
+
+namespace {
+constexpr std::array nations{
+  "ALGERIA", "ARGENTINA", "BRAZIL",         "CANADA",       "EGYPT", "ETHIOPIA", "FRANCE",
+  "GERMANY", "INDIA",     "INDONESIA",      "IRAN",         "IRAQ",  "JAPAN",    "JORDAN",
+  "KENYA",   "MOROCCO",   "MOZAMBIQUE",     "PERU",         "CHINA", "ROMANIA",  "SAUDI ARABIA",
+  "VIETNAM", "RUSSIA",    "UNITED KINGDOM", "UNITED STATES"};
+
+constexpr std::array years{"1992", "1993", "1994", "1995", "1996", "1997", "1998"};
+constexpr std::array months{"1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12"};
+constexpr std::array days{"1",  "2",  "3",  "4",  "5",  "6",  "7",  "8",  "9",  "10", "11",
+                          "12", "13", "14", "15", "16", "17", "18", "19", "20", "21", "22",
+                          "23", "24", "25", "26", "27", "28", "29", "30", "31"};
+
+constexpr std::array vocab_p_name{
+  "almond",   "antique",   "aquamarine", "azure",      "beige",     "bisque",    "black",
+  "blanched", "blue",      "blush",      "brown",      "burlywood", "burnished", "chartreuse",
+  "chiffon",  "chocolate", "coral",      "cornflower", "cornsilk",  "cream",     "cyan",
+  "dark",     "deep",      "dim",        "dodger",     "drab",      "firebrick", "floral",
+  "forest",   "frosted",   "gainsboro",  "ghost",      "goldenrod", "green",     "grey",
+  "honeydew", "hot",       "indian",     "ivory",      "khaki",     "lace",      "lavender",
+  "lawn",     "lemon",     "light",      "lime",       "linen",     "magenta",   "maroon",
+  "medium",   "metallic",  "midnight",   "mint",       "misty",     "moccasin",  "navajo",
+  "navy",     "olive",     "orange",     "orchid",     "pale",      "papaya",    "peach",
+  "peru",     "pink",      "plum",       "powder",     "puff",      "purple",    "red",
+  "rose",     "rosy",      "royal",      "saddle",     "salmon",    "sandy",     "seashell",
+  "sienna",   "sky",       "slate",      "smoke",      "snow",      "spring",    "steel",
+  "tan",      "thistle",   "tomato",     "turquoise",  "violet",    "wheat",     "white",
+  "yellow"};
+
+constexpr std::array vocab_modes{"REG AIR", "AIR", "RAIL", "SHIP", "TRUCK", "MAIL", "FOB"};
+
+constexpr std::array vocab_instructions{
+  "DELIVER IN PERSON", "COLLECT COD", "NONE", "TAKE BACK RETURN"};
+
+constexpr std::array vocab_priorities{"1-URGENT", "2-HIGH", "3-MEDIUM", "4-NOT SPECIFIED", "5-LOW"};
+
+constexpr std::array vocab_segments{
+  "AUTOMOBILE", "BUILDING", "FURNITURE", "MACHINERY", "HOUSEHOLD"};
+
+constexpr std::array vocab_types{
+  "STANDARD ANODIZED TIN",     "STANDARD ANODIZED NICKEL", "STANDARD ANODIZED BRASS",
+  "STANDARD ANODIZED STEEL",   "STANDARD ANODIZED COPPER", "STANDARD BURNISHED TIN",
+  "STANDARD BURNISHED NICKEL", "STANDARD BURNISHED BRASS", "STANDARD BURNISHED STEEL",
+  "STANDARD BURNISHED COPPER", "STANDARD PLATED TIN",      "STANDARD PLATED NICKEL",
+  "STANDARD PLATED BRASS",     "STANDARD PLATED STEEL",    "STANDARD PLATED COPPER",
+  "STANDARD POLISHED TIN",     "STANDARD POLISHED NICKEL", "STANDARD POLISHED BRASS",
+  "STANDARD POLISHED STEEL",   "STANDARD POLISHED COPPER", "STANDARD BRUSHED TIN",
+  "STANDARD BRUSHED NICKEL",   "STANDARD BRUSHED BRASS",   "STANDARD BRUSHED STEEL",
+  "STANDARD BRUSHED COPPER",   "SMALL ANODIZED TIN",       "SMALL ANODIZED NICKEL",
+  "SMALL ANODIZED BRASS",      "SMALL ANODIZED STEEL",     "SMALL ANODIZED COPPER",
+  "SMALL BURNISHED TIN",       "SMALL BURNISHED NICKEL",   "SMALL BURNISHED BRASS",
+  "SMALL BURNISHED STEEL",     "SMALL BURNISHED COPPER",   "SMALL PLATED TIN",
+  "SMALL PLATED NICKEL",       "SMALL PLATED BRASS",       "SMALL PLATED STEEL",
+  "SMALL PLATED COPPER",       "SMALL POLISHED TIN",       "SMALL POLISHED NICKEL",
+  "SMALL POLISHED BRASS",      "SMALL POLISHED STEEL",     "SMALL POLISHED COPPER",
+  "SMALL BRUSHED TIN",         "SMALL BRUSHED NICKEL",     "SMALL BRUSHED BRASS",
+  "SMALL BRUSHED STEEL",       "SMALL BRUSHED COPPER",     "MEDIUM ANODIZED TIN",
+  "MEDIUM ANODIZED NICKEL",    "MEDIUM ANODIZED BRASS",    "MEDIUM ANODIZED STEEL",
+  "MEDIUM ANODIZED COPPER",    "MEDIUM BURNISHED TIN",     "MEDIUM BURNISHED NICKEL",
+  "MEDIUM BURNISHED BRASS",    "MEDIUM BURNISHED STEEL",   "MEDIUM BURNISHED COPPER",
+  "MEDIUM PLATED TIN",         "MEDIUM PLATED NICKEL",     "MEDIUM PLATED BRASS",
+  "MEDIUM PLATED STEEL",       "MEDIUM PLATED COPPER",     "MEDIUM POLISHED TIN",
+  "MEDIUM POLISHED NICKEL",    "MEDIUM POLISHED BRASS",    "MEDIUM POLISHED STEEL",
+  "MEDIUM POLISHED COPPER",    "MEDIUM BRUSHED TIN",       "MEDIUM BRUSHED NICKEL",
+  "MEDIUM BRUSHED BRASS",      "MEDIUM BRUSHED STEEL",     "MEDIUM BRUSHED COPPER",
+  "LARGE ANODIZED TIN",        "LARGE ANODIZED NICKEL",    "LARGE ANODIZED BRASS",
+  "LARGE ANODIZED STEEL",      "LARGE ANODIZED COPPER",    "LARGE BURNISHED TIN",
+  "LARGE BURNISHED NICKEL",    "LARGE BURNISHED BRASS",    "LARGE BURNISHED STEEL",
+  "LARGE BURNISHED COPPER",    "LARGE PLATED TIN",         "LARGE PLATED NICKEL",
+  "LARGE PLATED BRASS",        "LARGE PLATED STEEL",       "LARGE PLATED COPPER",
+  "LARGE POLISHED TIN",        "LARGE POLISHED NICKEL",    "LARGE POLISHED BRASS",
+  "LARGE POLISHED STEEL",      "LARGE POLISHED COPPER",    "LARGE BRUSHED TIN",
+  "LARGE BRUSHED NICKEL",      "LARGE BRUSHED BRASS",      "LARGE BRUSHED STEEL",
+  "LARGE BRUSHED COPPER",      "ECONOMY ANODIZED TIN",     "ECONOMY ANODIZED NICKEL",
+  "ECONOMY ANODIZED BRASS",    "ECONOMY ANODIZED STEEL",   "ECONOMY ANODIZED COPPER",
+  "ECONOMY BURNISHED TIN",     "ECONOMY BURNISHED NICKEL", "ECONOMY BURNISHED BRASS",
+  "ECONOMY BURNISHED STEEL",   "ECONOMY BURNISHED COPPER", "ECONOMY PLATED TIN",
+  "ECONOMY PLATED NICKEL",     "ECONOMY PLATED BRASS",     "ECONOMY PLATED STEEL",
+  "ECONOMY PLATED COPPER",     "ECONOMY POLISHED TIN",     "ECONOMY POLISHED NICKEL",
+  "ECONOMY POLISHED BRASS",    "ECONOMY POLISHED STEEL",   "ECONOMY POLISHED COPPER",
+  "ECONOMY BRUSHED TIN",       "ECONOMY BRUSHED NICKEL",   "ECONOMY BRUSHED BRASS",
+  "ECONOMY BRUSHED STEEL",     "ECONOMY BRUSHED COPPER",   "PROMO ANODIZED TIN",
+  "PROMO ANODIZED NICKEL",     "PROMO ANODIZED BRASS",     "PROMO ANODIZED STEEL",
+  "PROMO ANODIZED COPPER",     "PROMO BURNISHED TIN",      "PROMO BURNISHED NICKEL",
+  "PROMO BURNISHED BRASS",     "PROMO BURNISHED STEEL",    "PROMO BURNISHED COPPER",
+  "PROMO PLATED TIN",          "PROMO PLATED NICKEL",      "PROMO PLATED BRASS",
+  "PROMO PLATED STEEL",        "PROMO PLATED COPPER",      "PROMO POLISHED TIN",
+  "PROMO POLISHED NICKEL",     "PROMO POLISHED BRASS",     "PROMO POLISHED STEEL",
+  "PROMO POLISHED COPPER",     "PROMO BRUSHED TIN",        "PROMO BRUSHED NICKEL",
+  "PROMO BRUSHED BRASS",       "PROMO BRUSHED STEEL",      "PROMO BRUSHED COPPER"};
+
+constexpr std::array vocab_containers{
+  "SM CASE",   "SM BOX",     "SM BAG",    "SM JAR",     "SM PKG",    "SM PACK",   "SM CAN",
+  "SM DRUM",   "LG CASE",    "LG BOX",    "LG BAG",     "LG JAR",    "LG PKG",    "LG PACK",
+  "LG CAN",    "LG DRUM",    "MED CASE",  "MED BOX",    "MED BAG",   "MED JAR",   "MED PKG",
+  "MED PACK",  "MED CAN",    "MED DRUM",  "JUMBO CASE", "JUMBO BOX", "JUMBO BAG", "JUMBO JAR",
+  "JUMBO PKG", "JUMBO PACK", "JUMBO CAN", "JUMBO DRUM", "WRAP CASE", "WRAP BOX",  "WRAP BAG",
+  "WRAP JAR",  "WRAP PKG",   "WRAP PACK", "WRAP CAN",   "WRAP DRUM"};
+
+}  // namespace
+
+/**
+ * @brief Generate a table out of the independent columns of the `orders` table
+ *
+ * @param scale_factor The scale factor to generate
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+std::unique_ptr<cudf::table> generate_orders_independent(double scale_factor,
+                                                         rmm::cuda_stream_view stream,
+                                                         rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  cudf::size_type const o_num_rows = scale_factor * 1'500'000;
+
+  // Generate the `o_orderkey` column
+  auto o_orderkey = [&]() {
+    auto const o_orderkey_candidates = generate_primary_key_column(
+      cudf::numeric_scalar<cudf::size_type>(1), 4 * o_num_rows, stream, mr);
+    auto const o_orderkey_unsorted = cudf::sample(cudf::table_view({o_orderkey_candidates->view()}),
+                                                  o_num_rows,
+                                                  cudf::sample_with_replacement::FALSE,
+                                                  0,
+                                                  stream,
+                                                  mr);
+    auto const sort_result =
+      cudf::sort_by_key(o_orderkey_unsorted->view(),
+                        cudf::table_view({o_orderkey_unsorted->view().column(0)}),
+                        {},
+                        {},
+                        stream,
+                        mr);
+    return std::move(sort_result->release()[0]);
+  }();
+
+  // Generate the `o_custkey` column
+  auto o_custkey = [&]() {
+    auto const col = generate_random_numeric_column<cudf::size_type>(
+      1, scale_factor * 49'000, o_num_rows, stream, mr);
+    auto const col_mul_3 = cudf::binary_operation(col->view(),
+                                                  cudf::numeric_scalar<cudf::size_type>(3),
+                                                  cudf::binary_operator::MUL,
+                                                  cudf::data_type{cudf::type_id::INT32},
+                                                  stream,
+                                                  mr);
+    return cudf::binary_operation(col_mul_3->view(),
+                                  cudf::numeric_scalar<cudf::size_type>(1),
+                                  cudf::binary_operator::ADD,
+                                  cudf::data_type{cudf::type_id::INT32},
+                                  stream,
+                                  mr);
+  }();
+
+  // Generate the `o_orderdate` column
+  auto o_orderdate_ts = [&]() {
+    auto const o_orderdate_year = generate_random_string_column_from_set(
+      cudf::host_span<const char* const>(years.data(), years.size()), o_num_rows, stream, mr);
+    auto const o_orderdate_month = generate_random_string_column_from_set(
+      cudf::host_span<const char* const>(months.data(), months.size()), o_num_rows, stream, mr);
+    auto const o_orderdate_day = generate_random_string_column_from_set(
+      cudf::host_span<const char* const>(days.data(), days.size()), o_num_rows, stream, mr);
+    auto const o_orderdate_str = cudf::strings::concatenate(
+      cudf::table_view(
+        {o_orderdate_year->view(), o_orderdate_month->view(), o_orderdate_day->view()}),
+      cudf::string_scalar("-"),
+      cudf::string_scalar("", false),
+      cudf::strings::separator_on_nulls::NO,
+      stream,
+      mr);
+
+    return cudf::strings::to_timestamps(o_orderdate_str->view(),
+                                        cudf::data_type{cudf::type_id::TIMESTAMP_DAYS},
+                                        std::string("%Y-%m-%d"),
+                                        stream,
+                                        mr);
+  }();
+
+  // Generate the `o_orderpriority` column
+  auto o_orderpriority = generate_random_string_column_from_set(
+    cudf::host_span<const char* const>(vocab_priorities.data(), vocab_priorities.size()),
+    o_num_rows,
+    stream,
+    mr);
+
+  // Generate the `o_clerk` column
+  auto o_clerk = [&]() {
+    auto const clerk_repeat = generate_repeat_string_column("Clerk#", o_num_rows, stream, mr);
+    auto const random_c     = generate_random_numeric_column<cudf::size_type>(
+      1, scale_factor * 1'000, o_num_rows, stream, mr);
+    auto const random_c_str        = cudf::strings::from_integers(random_c->view(), stream, mr);
+    auto const random_c_str_padded = cudf::strings::zfill(random_c_str->view(), 9, stream, mr);
+    return cudf::strings::concatenate(
+      cudf::table_view({clerk_repeat->view(), random_c_str_padded->view()}),
+      cudf::string_scalar(""),
+      cudf::string_scalar("", false),
+      cudf::strings::separator_on_nulls::NO,
+      stream,
+      mr);
+  }();
+
+  // Generate the `o_shippriority` column
+  auto o_shippriority = [&]() {
+    auto const empty = cudf::make_numeric_column(
+      cudf::data_type{cudf::type_id::INT8}, o_num_rows, cudf::mask_state::UNALLOCATED, stream);
+    return cudf::fill(empty->view(), 0, o_num_rows, cudf::numeric_scalar<int8_t>(0), stream, mr);
+  }();
+
+  // Generate the `o_comment` column
+  // NOTE: This column is not compliant with clause 4.2.2.10 of the TPC-H specification
+  auto o_comment = generate_random_string_column(19, 78, o_num_rows, stream, mr);
+
+  // Generate the `orders_independent` table
+  std::vector<std::unique_ptr<cudf::column>> columns;
+  columns.push_back(std::move(o_orderkey));
+  columns.push_back(std::move(o_custkey));
+  columns.push_back(std::move(o_orderdate_ts));
+  columns.push_back(std::move(o_orderpriority));
+  columns.push_back(std::move(o_clerk));
+  columns.push_back(std::move(o_shippriority));
+  columns.push_back(std::move(o_comment));
+  return std::make_unique<cudf::table>(std::move(columns));
+}
+
+/**
+ * @brief Generate the `lineitem` table partially
+ *
+ * @param orders_independent Table with the independent columns of the `orders` table
+ * @param scale_factor The scale factor to generate
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+std::unique_ptr<cudf::table> generate_lineitem_partial(cudf::table_view const& orders_independent,
+                                                       double scale_factor,
+                                                       rmm::cuda_stream_view stream,
+                                                       rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  auto const o_num_rows = orders_independent.num_rows();
+  // Generate the `lineitem` table. For each row in the `orders` table,
+  // we have a random number (between 1 and 7) of rows in the `lineitem` table
+
+  // For each `o_orderkey`, generate a random number (between 1 and 7),
+  // which will be the number of rows in the `lineitem` table that will
+  // have the same `l_orderkey`
+  auto const o_rep_freqs = generate_random_numeric_column<int8_t>(1, 7, o_num_rows, stream, mr);
+
+  // Sum up the `o_rep_freqs` to get the number of rows in the
+  // `lineitem` table. This is required to generate the independent columns
+  // in the `lineitem` table
+  auto const l_num_rows = calculate_l_cardinality(o_rep_freqs->view(), stream, mr);
+
+  // We create a table out of `o_orderkey` and `o_orderdate_ts` by repeating
+  // the rows of `orders` according to the frequencies in `o_rep_freqs`
+  auto const o_orderkey     = orders_independent.column(0);
+  auto const o_orderdate_ts = orders_independent.column(2);
+  auto const l_base =
+    cudf::repeat(cudf::table_view({o_orderkey, o_orderdate_ts}), o_rep_freqs->view(), stream, mr);
+  auto l_base_columns = l_base->release();
+
+  // Generate the `l_orderkey` column
+  auto l_orderkey = std::move(l_base_columns[0]);
+
+  // Generate the `l_partkey` column
+  auto l_partkey = generate_random_numeric_column<cudf::size_type>(
+    1, scale_factor * 200'000, l_num_rows, stream, mr);
+
+  // Generate the `l_suppkey` column
+  auto l_suppkey = calculate_l_suppkey(l_partkey->view(), scale_factor, l_num_rows, stream, mr);
+
+  // Generate the `l_linenumber` column
+  auto l_linenumber = generate_repeat_sequence_column<int8_t>(7, false, l_num_rows, stream, mr);
+
+  // Generate the `l_quantity` column
+  auto l_quantity = generate_random_numeric_column<int8_t>(1, 50, l_num_rows, stream, mr);
+
+  // Generate the `l_discount` column
+  auto l_discount = [&]() {
+    auto const col = generate_random_numeric_column<double>(0.00, 0.10, l_num_rows, stream, mr);
+    return cudf::round(col->view(), 2);
+  }();
+
+  // Generate the `l_tax` column
+  auto l_tax = [&]() {
+    auto const col = generate_random_numeric_column<double>(0.00, 0.08, l_num_rows, stream, mr);
+    return cudf::round(col->view(), 2);
+  }();
+
+  // Get the orderdate column from the `l_base` table
+  auto const ol_orderdate_ts = std::move(l_base_columns[1]);
+
+  // Generate the `l_shipdate` column
+  auto l_shipdate_ts = [&]() {
+    auto const l_shipdate_rand_add_days =
+      generate_random_numeric_column<int8_t>(1, 121, l_num_rows, stream, mr);
+    return add_calendrical_days(
+      ol_orderdate_ts->view(), l_shipdate_rand_add_days->view(), stream, mr);
+  }();
+
+  // Generate the `l_commitdate` column
+  auto l_commitdate_ts = [&]() {
+    auto const l_commitdate_rand_add_days =
+      generate_random_numeric_column<int8_t>(30, 90, l_num_rows, stream, mr);
+    return add_calendrical_days(
+      ol_orderdate_ts->view(), l_commitdate_rand_add_days->view(), stream, mr);
+  }();
+
+  // Generate the `l_receiptdate` column
+  auto l_receiptdate_ts = [&]() {
+    auto const l_receiptdate_rand_add_days =
+      generate_random_numeric_column<int8_t>(1, 30, l_num_rows, stream, mr);
+    return add_calendrical_days(
+      l_shipdate_ts->view(), l_receiptdate_rand_add_days->view(), stream, mr);
+  }();
+
+  // Define the current date as per clause 4.2.2.12 of the TPC-H specification
+  constexpr cudf::size_type current_date_days_since_epoch = 9'298;
+  auto current_date =
+    cudf::timestamp_scalar<cudf::timestamp_D>(current_date_days_since_epoch, true);
+  auto current_date_literal = cudf::ast::literal(current_date);
+
+  // Generate the `l_returnflag` column
+  // if `l_receiptdate` <= current_date then "R" or "A" else "N"
+  auto l_returnflag = [&]() {
+    auto const col_ref = cudf::ast::column_reference(0);
+    auto const pred =
+      cudf::ast::operation(cudf::ast::ast_operator::LESS_EQUAL, col_ref, current_date_literal);
+    auto const binary_mask =
+      cudf::compute_column(cudf::table_view({l_receiptdate_ts->view()}), pred, stream, mr);
+
+    auto const multiplier =
+      generate_repeat_sequence_column<int8_t>(2, false, l_num_rows, stream, mr);
+    auto const ternary_mask   = cudf::binary_operation(binary_mask->view(),
+                                                     multiplier->view(),
+                                                     cudf::binary_operator::MUL,
+                                                     cudf::data_type{cudf::type_id::INT8},
+                                                     stream,
+                                                     mr);
+    auto const indices        = cudf::test::fixed_width_column_wrapper<int8_t>({0, 1, 2}).release();
+    auto const keys           = cudf::test::strings_column_wrapper({"N", "A", "R"}).release();
+    auto const gather_map     = cudf::table_view({indices->view(), keys->view()});
+    auto const gathered_table = cudf::gather(
+      gather_map, ternary_mask->view(), cudf::out_of_bounds_policy::DONT_CHECK, stream, mr);
+    return std::move(gathered_table->release()[1]);
+  }();
+
+  // Generate the `l_linestatus` column
+  // if `l_shipdate` > current_date then "F" else "O"
+  auto [l_linestatus, l_linestatus_mask] = [&]() {
+    auto const col_ref = cudf::ast::column_reference(0);
+    auto const pred =
+      cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref, current_date_literal);
+    auto mask = cudf::compute_column(cudf::table_view({l_shipdate_ts->view()}), pred, stream, mr);
+    auto mask_index_type      = cudf::cast(mask->view(), cudf::data_type{cudf::type_id::INT8});
+    auto const indices        = cudf::test::fixed_width_column_wrapper<int8_t>({0, 1}).release();
+    auto const keys           = cudf::test::strings_column_wrapper({"O", "F"}).release();
+    auto const gather_map     = cudf::table_view({indices->view(), keys->view()});
+    auto const gathered_table = cudf::gather(
+      gather_map, mask_index_type->view(), cudf::out_of_bounds_policy::DONT_CHECK, stream, mr);
+    return std::make_tuple(std::move(gathered_table->release()[1]), std::move(mask_index_type));
+  }();
+
+  // Generate the `l_shipinstruct` column
+  auto l_shipinstruct = generate_random_string_column_from_set(
+    cudf::host_span<const char* const>(vocab_instructions.data(), vocab_instructions.size()),
+    l_num_rows,
+    stream,
+    mr);
+
+  // Generate the `l_shipmode` column
+  auto l_shipmode = generate_random_string_column_from_set(
+    cudf::host_span<const char* const>(vocab_modes.data(), vocab_modes.size()),
+    l_num_rows,
+    stream,
+    mr);
+
+  // Generate the `l_comment` column
+  // NOTE: This column is not compliant with
+  // clause 4.2.2.10 of the TPC-H specification
+  auto l_comment = generate_random_string_column(10, 43, l_num_rows, stream, mr);
+
+  // Generate the `lineitem_partial` table
+  std::vector<std::unique_ptr<cudf::column>> columns;
+  columns.push_back(std::move(l_linestatus_mask));
+  columns.push_back(std::move(l_orderkey));
+  columns.push_back(std::move(l_partkey));
+  columns.push_back(std::move(l_suppkey));
+  columns.push_back(std::move(l_linenumber));
+  columns.push_back(std::move(l_quantity));
+  columns.push_back(std::move(l_discount));
+  columns.push_back(std::move(l_tax));
+  columns.push_back(std::move(l_shipdate_ts));
+  columns.push_back(std::move(l_commitdate_ts));
+  columns.push_back(std::move(l_receiptdate_ts));
+  columns.push_back(std::move(l_returnflag));
+  columns.push_back(std::move(l_linestatus));
+  columns.push_back(std::move(l_shipinstruct));
+  columns.push_back(std::move(l_shipmode));
+  columns.push_back(std::move(l_comment));
+  return std::make_unique<cudf::table>(std::move(columns));
+}
+
+std::unique_ptr<cudf::table> generate_orders_dependent(cudf::table_view const& lineitem,
+                                                       rmm::cuda_stream_view stream,
+                                                       rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  auto const l_linestatus_mask = lineitem.column(0);
+  auto const l_orderkey        = lineitem.column(1);
+  auto const l_discount        = lineitem.column(6);
+  auto const l_tax             = lineitem.column(7);
+  auto const l_extendedprice   = lineitem.column(16);
+
+  std::vector<std::unique_ptr<cudf::column>> orders_dependent_columns;
+
+  // Generate the `o_totalprice` column
+  // We calculate the `charge` column, which is a function of `l_extendedprice`,
+  // `l_tax`, and `l_discount` and then group by `l_orderkey` and sum the `charge`
+  auto const l_charge = calculate_charge(l_extendedprice, l_tax, l_discount, stream, mr);
+  auto o_totalprice   = [&]() {
+    auto const keys = cudf::table_view({l_orderkey});
+    cudf::groupby::groupby gb(keys);
+    std::vector<cudf::groupby::aggregation_request> requests;
+    requests.push_back(cudf::groupby::aggregation_request());
+    requests[0].aggregations.push_back(cudf::make_sum_aggregation<cudf::groupby_aggregation>());
+    requests[0].values = l_charge->view();
+    auto agg_result    = gb.aggregate(requests);
+    return cudf::round(agg_result.second[0].results[0]->view(), 2);
+  }();
+  orders_dependent_columns.push_back(std::move(o_totalprice));
+
+  // Generate the `o_orderstatus` column
+  auto o_orderstatus = [&]() {
+    auto const keys = cudf::table_view({l_orderkey});
+    cudf::groupby::groupby gb(keys);
+    std::vector<cudf::groupby::aggregation_request> requests;
+
+    // Perform a `count` aggregation on `l_orderkey`
+    requests.push_back(cudf::groupby::aggregation_request());
+    requests[0].aggregations.push_back(cudf::make_count_aggregation<cudf::groupby_aggregation>());
+    requests[0].values = l_orderkey;
+
+    // Perform a `sum` aggregation on `l_linestatus_mask`
+    requests.push_back(cudf::groupby::aggregation_request());
+    requests[1].aggregations.push_back(cudf::make_sum_aggregation<cudf::groupby_aggregation>());
+    requests[1].values = l_linestatus_mask;
+
+    // Perform the aggregations
+    auto agg_result = gb.aggregate(requests);
+
+    // Create a `table_view` out of the `l_orderkey`, `count`, and `sum` columns
+    auto const count = std::move(agg_result.second[0].results[0]);
+    auto const sum   = cudf::cast(
+      agg_result.second[1].results[0]->view(), cudf::data_type{cudf::type_id::INT32}, stream, mr);
+
+    auto const table =
+      cudf::table_view({agg_result.first->get_column(0).view(), count->view(), sum->view()});
+
+    // Now on this table,
+    // if `sum` == `count` then "O",
+    // if `sum` == 0, then "F",
+    // else "P"
+
+    // So, we first evaluate an expression `sum == count` and generate a boolean mask
+    auto const count_ref = cudf::ast::column_reference(1);
+    auto const sum_ref   = cudf::ast::column_reference(2);
+    auto const expr_a    = cudf::ast::operation(cudf::ast::ast_operator::EQUAL, sum_ref, count_ref);
+    auto const mask_a    = cudf::compute_column(table, expr_a);
+    auto const o_orderstatus_intermediate =
+      cudf::copy_if_else(cudf::string_scalar("O"), cudf::string_scalar("F"), mask_a->view());
+
+    // Then, we evaluate an expression `sum == 0` and generate a boolean mask
+    auto zero_scalar        = cudf::numeric_scalar<cudf::size_type>(0);
+    auto const zero_literal = cudf::ast::literal(zero_scalar);
+    auto const expr_b_left =
+      cudf::ast::operation(cudf::ast::ast_operator::NOT_EQUAL, sum_ref, count_ref);
+    auto const expr_b_right =
+      cudf::ast::operation(cudf::ast::ast_operator::NOT_EQUAL, sum_ref, zero_literal);
+    auto const expr_b =
+      cudf::ast::operation(cudf::ast::ast_operator::LOGICAL_AND, expr_b_left, expr_b_right);
+    auto const mask_b = cudf::compute_column(table, expr_b);
+    return cudf::copy_if_else(
+      cudf::string_scalar("P"), o_orderstatus_intermediate->view(), mask_b->view());
+  }();
+  orders_dependent_columns.push_back(std::move(o_orderstatus));
+  return std::make_unique<cudf::table>(std::move(orders_dependent_columns));
+}
+
+/**
+ * @brief Generate the `partsupp` table
+ *
+ * @param scale_factor The scale factor to generate
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+std::unique_ptr<cudf::table> generate_partsupp(double scale_factor,
+                                               rmm::cuda_stream_view stream,
+                                               rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  // Define the number of rows in the `part` and `partsupp` tables
+  cudf::size_type const p_num_rows  = scale_factor * 200'000;
+  cudf::size_type const ps_num_rows = scale_factor * 800'000;
+
+  // Generate the `ps_partkey` column
+  auto ps_partkey = [&]() {
+    auto const p_partkey =
+      generate_primary_key_column(cudf::numeric_scalar<cudf::size_type>(1), p_num_rows, stream, mr);
+    auto const rep_table = cudf::repeat(cudf::table_view({p_partkey->view()}), 4, stream, mr);
+    return std::move(rep_table->release()[0]);
+  }();
+
+  // Generate the `ps_suppkey` column
+  auto ps_suppkey = calculate_ps_suppkey(ps_partkey->view(), scale_factor, ps_num_rows, stream, mr);
+
+  // Generate the `ps_availqty` column
+  auto ps_availqty = generate_random_numeric_column<int16_t>(1, 9999, ps_num_rows, stream, mr);
+
+  // Generate the `ps_supplycost` column
+  auto ps_supplycost = [&]() {
+    auto const col = generate_random_numeric_column<double>(1.00, 1000.00, ps_num_rows, stream, mr);
+    return cudf::round(col->view(), 2);
+  }();
+
+  // Generate the `ps_comment` column
+  // NOTE: This column is not compliant with clause 4.2.2.10 of the TPC-H specification
+  auto ps_comment = generate_random_string_column(49, 198, ps_num_rows, stream, mr);
+
+  // Create the `partsupp` table
+  std::vector<std::unique_ptr<cudf::column>> columns;
+  columns.push_back(std::move(ps_partkey));
+  columns.push_back(std::move(ps_suppkey));
+  columns.push_back(std::move(ps_availqty));
+  columns.push_back(std::move(ps_supplycost));
+  columns.push_back(std::move(ps_comment));
+  return std::make_unique<cudf::table>(std::move(columns));
+}
+
+/**
+ * @brief Generate the `part` table
+ *
+ * @param scale_factor The scale factor to generate
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+std::unique_ptr<cudf::table> generate_part(double scale_factor,
+                                           rmm::cuda_stream_view stream,
+                                           rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  cudf::size_type const num_rows = scale_factor * 200'000;
+
+  // Generate the `p_partkey` column
+  auto p_partkey =
+    generate_primary_key_column(cudf::numeric_scalar<cudf::size_type>(1), num_rows, stream, mr);
+
+  // Generate the `p_name` column
+  auto p_name = [&]() {
+    auto const p_name_a = generate_random_string_column_from_set(
+      cudf::host_span<const char* const>(vocab_p_name.data(), vocab_p_name.size()),
+      num_rows,
+      stream,
+      mr);
+    auto const p_name_b = generate_random_string_column_from_set(
+      cudf::host_span<const char* const>(vocab_p_name.data(), vocab_p_name.size()),
+      num_rows,
+      stream,
+      mr);
+    auto const p_name_c = generate_random_string_column_from_set(
+      cudf::host_span<const char* const>(vocab_p_name.data(), vocab_p_name.size()),
+      num_rows,
+      stream,
+      mr);
+    auto const p_name_d = generate_random_string_column_from_set(
+      cudf::host_span<const char* const>(vocab_p_name.data(), vocab_p_name.size()),
+      num_rows,
+      stream,
+      mr);
+    auto const p_name_e = generate_random_string_column_from_set(
+      cudf::host_span<const char* const>(vocab_p_name.data(), vocab_p_name.size()),
+      num_rows,
+      stream,
+      mr);
+    return cudf::strings::concatenate(
+      cudf::table_view(
+        {p_name_a->view(), p_name_b->view(), p_name_c->view(), p_name_d->view(), p_name_e->view()}),
+      cudf::string_scalar(" "),
+      cudf::string_scalar("", false),
+      cudf::strings::separator_on_nulls::NO,
+      stream,
+      mr);
+  }();
+
+  // Generate the `p_mfgr` and `p_brand` columns
+  auto const random_values_m = generate_random_numeric_column<int8_t>(1, 5, num_rows, stream, mr);
+  auto const random_values_m_str =
+    cudf::strings::from_integers(random_values_m->view(), stream, mr);
+
+  auto const random_values_n = generate_random_numeric_column<int8_t>(1, 5, num_rows, stream, mr);
+  auto const random_values_n_str =
+    cudf::strings::from_integers(random_values_n->view(), stream, mr);
+
+  auto p_mfgr = [&]() {
+    auto const mfgr_repeat = generate_repeat_string_column("Manufacturer#", num_rows, stream, mr);
+    return cudf::strings::concatenate(
+      cudf::table_view({mfgr_repeat->view(), random_values_m_str->view()}),
+      cudf::string_scalar(""),
+      cudf::string_scalar("", false),
+      cudf::strings::separator_on_nulls::NO,
+      stream,
+      mr);
+  }();
+
+  auto p_brand = [&]() {
+    auto const brand_repeat = generate_repeat_string_column("Brand#", num_rows, stream, mr);
+    return cudf::strings::concatenate(
+      cudf::table_view(
+        {brand_repeat->view(), random_values_m_str->view(), random_values_n_str->view()}),
+      cudf::string_scalar(""),
+      cudf::string_scalar("", false),
+      cudf::strings::separator_on_nulls::NO,
+      stream,
+      mr);
+  }();
+
+  // Generate the `p_type` column
+  auto p_type = generate_random_string_column_from_set(
+    cudf::host_span<const char* const>(vocab_types.data(), vocab_types.size()),
+    num_rows,
+    stream,
+    mr);
+
+  // Generate the `p_size` column
+  auto p_size = generate_random_numeric_column<int8_t>(1, 50, num_rows, stream, mr);
+
+  // Generate the `p_container` column
+  auto p_container = generate_random_string_column_from_set(
+    cudf::host_span<const char* const>(vocab_containers.data(), vocab_containers.size()),
+    num_rows,
+    stream,
+    mr);
+
+  // Generate the `p_retailprice` column
+  auto p_retailprice = calculate_p_retailprice(p_partkey->view(), stream, mr);
+
+  // Generate the `p_comment` column
+  // NOTE: This column is not compliant with clause 4.2.2.10 of the TPC-H specification
+  auto p_comment = generate_random_string_column(5, 22, num_rows, stream, mr);
+
+  // Create the `part` table
+  std::vector<std::unique_ptr<cudf::column>> columns;
+  columns.push_back(std::move(p_partkey));
+  columns.push_back(std::move(p_name));
+  columns.push_back(std::move(p_mfgr));
+  columns.push_back(std::move(p_brand));
+  columns.push_back(std::move(p_type));
+  columns.push_back(std::move(p_size));
+  columns.push_back(std::move(p_container));
+  columns.push_back(std::move(p_retailprice));
+  columns.push_back(std::move(p_comment));
+  return std::make_unique<cudf::table>(std::move(columns));
+}
+
+/**
+ * @brief Generate the `orders`, `lineitem`, and `part` tables
+ *
+ * @param scale_factor The scale factor to generate
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+std::tuple<std::unique_ptr<cudf::table>, std::unique_ptr<cudf::table>, std::unique_ptr<cudf::table>>
+generate_orders_lineitem_part(double scale_factor,
+                              rmm::cuda_stream_view stream,
+                              rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  // Generate a table with the independent columns of the `orders` table
+  auto orders_independent = generate_orders_independent(scale_factor, stream, mr);
+
+  // Generate the `lineitem` table partially
+  auto lineitem_partial =
+    generate_lineitem_partial(orders_independent->view(), scale_factor, stream, mr);
+
+  // Generate the `part` table
+  auto part = generate_part(scale_factor, stream, mr);
+
+  // Join the `part` and partial `lineitem` tables, then calculate the `l_extendedprice` column,
+  // add the column to the `lineitem` table, and write the `lineitem` table to a parquet file
+
+  auto l_extendedprice = [&]() {
+    auto const left = cudf::table_view(
+      {lineitem_partial->get_column(2).view(), lineitem_partial->get_column(5).view()});
+    auto const right = cudf::table_view({part->get_column(0).view(), part->get_column(7).view()});
+    auto const joined_table   = perform_left_join(left, right, {0}, {0}, stream, mr);
+    auto joined_table_columns = joined_table->release();
+    auto const l_quantity     = std::move(joined_table_columns[1]);
+    auto const l_quantity_fp =
+      cudf::cast(l_quantity->view(), cudf::data_type{cudf::type_id::FLOAT64});
+    auto const p_retailprice = std::move(joined_table_columns[3]);
+    auto const col           = cudf::binary_operation(l_quantity_fp->view(),
+                                            p_retailprice->view(),
+                                            cudf::binary_operator::MUL,
+                                            cudf::data_type{cudf::type_id::FLOAT64},
+                                            stream,
+                                            mr);
+    return cudf::round(col->view(), 2);
+  }();
+
+  auto lineitem_partial_columns = lineitem_partial->release();
+  lineitem_partial_columns.push_back(std::move(l_extendedprice));
+  auto lineitem_temp = std::make_unique<cudf::table>(std::move(lineitem_partial_columns));
+
+  // Generate the dependent columns of the `orders` table
+  // and merge them with the independent columns
+  auto orders_dependent = generate_orders_dependent(lineitem_temp->view(), stream, mr);
+
+  auto orders_independent_columns = orders_independent->release();
+  auto orders_dependent_columns   = orders_dependent->release();
+  orders_independent_columns.insert(orders_independent_columns.end(),
+                                    std::make_move_iterator(orders_dependent_columns.begin()),
+                                    std::make_move_iterator(orders_dependent_columns.end()));
+
+  // Create the `orders` table
+  auto orders = std::make_unique<cudf::table>(std::move(orders_independent_columns));
+
+  // Create the `lineitem` table
+  auto lineitem_temp_columns = lineitem_temp->release();
+  lineitem_temp_columns.erase(lineitem_temp_columns.begin());
+  auto lineitem = std::make_unique<cudf::table>(std::move(lineitem_temp_columns));
+
+  return std::make_tuple(std::move(orders), std::move(lineitem), std::move(part));
+}
+
+/**
+ * @brief Generate the `supplier` table
+ *
+ * @param scale_factor The scale factor to generate
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+std::unique_ptr<cudf::table> generate_supplier(double scale_factor,
+                                               rmm::cuda_stream_view stream,
+                                               rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  // Calculate the number of rows based on the scale factor
+  cudf::size_type const num_rows = scale_factor * 10'000;
+
+  // Generate the `s_suppkey` column
+  auto s_suppkey =
+    generate_primary_key_column(cudf::numeric_scalar<cudf::size_type>(1), num_rows, stream, mr);
+
+  // Generate the `s_name` column
+  auto s_name = [&]() {
+    auto const supplier_repeat = generate_repeat_string_column("Supplier#", num_rows, stream, mr);
+    auto const s_suppkey_str   = cudf::strings::from_integers(s_suppkey->view(), stream, mr);
+    auto const s_suppkey_str_padded = cudf::strings::zfill(s_suppkey_str->view(), 9, stream, mr);
+    return cudf::strings::concatenate(
+      cudf::table_view({supplier_repeat->view(), s_suppkey_str_padded->view()}),
+      cudf::string_scalar(""),
+      cudf::string_scalar("", false),
+      cudf::strings::separator_on_nulls::NO,
+      stream,
+      mr);
+  }();
+
+  // Generate the `s_address` column
+  auto s_address = generate_address_column(num_rows, stream, mr);
+
+  // Generate the `s_nationkey` column
+  auto s_nationkey = generate_random_numeric_column<int8_t>(0, 24, num_rows, stream, mr);
+
+  // Generate the `s_phone` column
+  auto s_phone = generate_phone_column(num_rows, stream, mr);
+
+  // Generate the `s_acctbal` column
+  auto s_acctbal = [&]() {
+    auto const col = generate_random_numeric_column<double>(-999.99, 9999.99, num_rows, stream, mr);
+    return cudf::round(col->view(), 2);
+  }();
+
+  // Generate the `s_comment` column
+  // NOTE: This column is not compliant with clause 4.2.2.10 of the TPC-H specification
+  auto s_comment = generate_random_string_column(25, 100, num_rows, stream, mr);
+
+  // Create the `supplier` table
+  std::vector<std::unique_ptr<cudf::column>> columns;
+  columns.push_back(std::move(s_suppkey));
+  columns.push_back(std::move(s_name));
+  columns.push_back(std::move(s_address));
+  columns.push_back(std::move(s_nationkey));
+  columns.push_back(std::move(s_phone));
+  columns.push_back(std::move(s_acctbal));
+  columns.push_back(std::move(s_comment));
+  return std::make_unique<cudf::table>(std::move(columns));
+}
+
+/**
+ * @brief Generate the `customer` table
+ *
+ * @param scale_factor The scale factor to generate
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+std::unique_ptr<cudf::table> generate_customer(double scale_factor,
+                                               rmm::cuda_stream_view stream,
+                                               rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  // Calculate the number of rows based on the scale factor
+  cudf::size_type const num_rows = scale_factor * 150'000;
+
+  // Generate the `c_custkey` column
+  auto c_custkey =
+    generate_primary_key_column(cudf::numeric_scalar<cudf::size_type>(1), num_rows, stream, mr);
+
+  // Generate the `c_name` column
+  auto c_name = [&]() {
+    auto const customer_repeat = generate_repeat_string_column("Customer#", num_rows, stream, mr);
+    auto const c_custkey_str   = cudf::strings::from_integers(c_custkey->view(), stream, mr);
+    auto const c_custkey_str_padded = cudf::strings::zfill(c_custkey_str->view(), 9, stream, mr);
+    return cudf::strings::concatenate(
+      cudf::table_view({customer_repeat->view(), c_custkey_str_padded->view()}),
+      cudf::string_scalar(""),
+      cudf::string_scalar("", false),
+      cudf::strings::separator_on_nulls::NO,
+      stream,
+      mr);
+  }();
+
+  // Generate the `c_address` column
+  auto c_address = generate_address_column(num_rows, stream, mr);
+
+  // Generate the `c_nationkey` column
+  auto c_nationkey = generate_random_numeric_column<int8_t>(0, 24, num_rows, stream, mr);
+
+  // Generate the `c_phone` column
+  auto c_phone = generate_phone_column(num_rows, stream, mr);
+
+  // Generate the `c_acctbal` column
+  auto c_acctbal = [&]() {
+    auto const col = generate_random_numeric_column<double>(-999.99, 9999.99, num_rows, stream, mr);
+    return cudf::round(col->view(), 2);
+  }();
+
+  // Generate the `c_mktsegment` column
+  auto c_mktsegment = generate_random_string_column_from_set(
+    cudf::host_span<const char* const>(vocab_segments.data(), vocab_segments.size()),
+    num_rows,
+    stream,
+    mr);
+
+  // Generate the `c_comment` column
+  // NOTE: This column is not compliant with clause 4.2.2.10 of the TPC-H specification
+  auto c_comment = generate_random_string_column(29, 116, num_rows, stream, mr);
+
+  // Create the `customer` table
+  std::vector<std::unique_ptr<cudf::column>> columns;
+  columns.push_back(std::move(c_custkey));
+  columns.push_back(std::move(c_name));
+  columns.push_back(std::move(c_address));
+  columns.push_back(std::move(c_nationkey));
+  columns.push_back(std::move(c_phone));
+  columns.push_back(std::move(c_acctbal));
+  columns.push_back(std::move(c_mktsegment));
+  columns.push_back(std::move(c_comment));
+  return std::make_unique<cudf::table>(std::move(columns));
+}
+
+/**
+ * @brief Generate the `nation` table
+ *
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+std::unique_ptr<cudf::table> generate_nation(rmm::cuda_stream_view stream,
+                                             rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  // Define the number of rows
+  constexpr cudf::size_type num_rows = 25;
+
+  // Generate the `n_nationkey` column
+  auto n_nationkey =
+    generate_primary_key_column(cudf::numeric_scalar<int8_t>(0), num_rows, stream, mr);
+
+  // Generate the `n_name` column
+  auto n_name = cudf::test::strings_column_wrapper(nations.begin(), nations.end()).release();
+
+  // Generate the `n_regionkey` column
+  std::vector<int8_t> region_keys{0, 1, 1, 1, 4, 0, 3, 3, 2, 2, 4, 4, 2,
+                                  4, 0, 0, 0, 1, 2, 3, 4, 2, 3, 3, 1};
+  auto n_regionkey =
+    cudf::test::fixed_width_column_wrapper<int8_t>(region_keys.begin(), region_keys.end())
+      .release();
+
+  // Generate the `n_comment` column
+  // NOTE: This column is not compliant with clause 4.2.2.10 of the TPC-H specification
+  auto n_comment = generate_random_string_column(31, 114, num_rows, stream, mr);
+
+  // Create the `nation` table
+  std::vector<std::unique_ptr<cudf::column>> columns;
+  columns.push_back(std::move(n_nationkey));
+  columns.push_back(std::move(n_name));
+  columns.push_back(std::move(n_regionkey));
+  columns.push_back(std::move(n_comment));
+  return std::make_unique<cudf::table>(std::move(columns));
+}
+
+/**
+ * @brief Generate the `region` table
+ *
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+std::unique_ptr<cudf::table> generate_region(rmm::cuda_stream_view stream,
+                                             rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  // Define the number of rows
+  constexpr cudf::size_type num_rows = 5;
+
+  // Generate the `r_regionkey` column
+  auto r_regionkey =
+    generate_primary_key_column(cudf::numeric_scalar<int8_t>(0), num_rows, stream, mr);
+
+  // Generate the `r_name` column
+  auto r_name =
+    cudf::test::strings_column_wrapper({"AFRICA", "AMERICA", "ASIA", "EUROPE", "MIDDLE EAST"})
+      .release();
+
+  // Generate the `r_comment` column
+  // NOTE: This column is not compliant with clause 4.2.2.10 of the TPC-H specification
+  auto r_comment = generate_random_string_column(31, 115, num_rows, stream, mr);
+
+  // Create the `region` table
+  std::vector<std::unique_ptr<cudf::column>> columns;
+  columns.push_back(std::move(r_regionkey));
+  columns.push_back(std::move(r_name));
+  columns.push_back(std::move(r_comment));
+  return std::make_unique<cudf::table>(std::move(columns));
+}
+
+}  // namespace cudf::datagen
diff --git a/cpp/benchmarks/common/tpch_data_generator/tpch_data_generator.hpp b/cpp/benchmarks/common/tpch_data_generator/tpch_data_generator.hpp
new file mode 100644
index 00000000000..a6286dd8dba
--- /dev/null
+++ b/cpp/benchmarks/common/tpch_data_generator/tpch_data_generator.hpp
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/table/table.hpp>
+
+namespace CUDF_EXPORT cudf {
+namespace datagen {
+
+/**
+ * @brief Generate the `orders`, `lineitem`, and `part` tables
+ *
+ * @param scale_factor The scale factor to generate
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+std::tuple<std::unique_ptr<cudf::table>, std::unique_ptr<cudf::table>, std::unique_ptr<cudf::table>>
+generate_orders_lineitem_part(
+  double scale_factor,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Generate the `partsupp` table
+ *
+ * @param scale_factor The scale factor to generate
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+std::unique_ptr<cudf::table> generate_partsupp(
+  double scale_factor,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Generate the `supplier` table
+ *
+ * @param scale_factor The scale factor to generate
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+std::unique_ptr<cudf::table> generate_supplier(
+  double scale_factor,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Generate the `customer` table
+ *
+ * @param scale_factor The scale factor to generate
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+std::unique_ptr<cudf::table> generate_customer(
+  double scale_factor,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Generate the `nation` table
+ *
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+std::unique_ptr<cudf::table> generate_nation(
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @brief Generate the `region` table
+ *
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+std::unique_ptr<cudf::table> generate_region(
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+
+}  // namespace datagen
+}  // namespace CUDF_EXPORT cudf

From 5a81a80cef59649f059d55004f745001a59b3f6f Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Fri, 30 Aug 2024 11:57:53 -0400
Subject: [PATCH 757/842] [BUG] Add gpu node type to cudf-pandas 3rd-party
 integration nightly CI job (#16704)

Following up #16645, and adding a gpu node type to the nightly CI job

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16704
---
 .github/workflows/test.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 2c68f2861bb..8605fa46f68 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -132,6 +132,7 @@ jobs:
       branch: ${{ inputs.branch }}
       date: ${{ inputs.date }}
       sha: ${{ inputs.sha }}
+      node_type: "gpu-v100-latest-1"
       container_image: "rapidsai/ci-conda:latest"
       run_script: |
         ci/cudf_pandas_scripts/third-party-integration/test.sh python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml

From 2d6758f39592e6296a042eb8e771171c50899013 Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar <shruti.shivakumar@gmail.com>
Date: Fri, 30 Aug 2024 13:22:58 -0700
Subject: [PATCH 758/842] Enable batched multi-source reading of JSONL files
 with large records (#16687)

Addresses #16664

Implements reallocate-and-retry logic when the initial buffer size estimate fails for byte range reading.
Chunked reader test checks for correct reallocation for different chunk sizes.

Authors:
  - Shruti Shivakumar (https://github.com/shrshi)

Approvers:
  - Karthikeyan (https://github.com/karthikeyann)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: https://github.com/rapidsai/cudf/pull/16687
---
 cpp/src/io/json/read_json.cu    | 48 ++++++++++++++++++++++++---------
 cpp/tests/io/json/json_test.cpp | 47 ++++++++++++++++++++++++++++++++
 2 files changed, 82 insertions(+), 13 deletions(-)

diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu
index 2658cbbed2f..98e8e8d3c7e 100644
--- a/cpp/src/io/json/read_json.cu
+++ b/cpp/src/io/json/read_json.cu
@@ -138,14 +138,14 @@ datasource::owning_buffer<rmm::device_buffer> get_record_range_raw_input(
   auto should_load_all_sources = !chunk_size || chunk_size >= total_source_size - chunk_offset;
   chunk_size = should_load_all_sources ? total_source_size - chunk_offset : chunk_size;
 
-  int const num_subchunks_prealloced  = should_load_all_sources ? 0 : max_subchunks_prealloced;
+  int num_subchunks_prealloced        = should_load_all_sources ? 0 : max_subchunks_prealloced;
   std::size_t const size_per_subchunk = estimate_size_per_subchunk(chunk_size);
 
   // The allocation for single source compressed input is estimated by assuming a ~4:1
   // compression ratio. For uncompressed inputs, we can getter a better estimate using the idea
   // of subchunks.
   auto constexpr header_size = 4096;
-  std::size_t const buffer_size =
+  std::size_t buffer_size =
     reader_compression != compression_type::NONE
       ? total_source_size * estimated_compression_ratio + header_size
       : std::min(total_source_size, chunk_size + num_subchunks_prealloced * size_per_subchunk) +
@@ -169,18 +169,40 @@ datasource::owning_buffer<rmm::device_buffer> get_record_range_raw_input(
     // Find next delimiter
     std::int64_t next_delim_pos     = -1;
     std::size_t next_subchunk_start = chunk_offset + chunk_size;
-    while (next_subchunk_start < total_source_size && next_delim_pos < buffer_offset) {
-      buffer_offset += readbufspan.size();
-      readbufspan    = ingest_raw_input(bufspan.last(buffer_size - buffer_offset),
-                                     sources,
-                                     reader_compression,
-                                     next_subchunk_start,
-                                     size_per_subchunk,
-                                     stream);
-      next_delim_pos = find_first_delimiter(readbufspan, '\n', stream) + buffer_offset;
-      if (next_delim_pos < buffer_offset) { next_subchunk_start += size_per_subchunk; }
+    while (next_delim_pos < buffer_offset) {
+      for (int subchunk = 0;
+           subchunk < num_subchunks_prealloced && next_delim_pos < buffer_offset &&
+           next_subchunk_start < total_source_size;
+           subchunk++) {
+        buffer_offset += readbufspan.size();
+        readbufspan    = ingest_raw_input(bufspan.last(buffer_size - buffer_offset),
+                                       sources,
+                                       reader_compression,
+                                       next_subchunk_start,
+                                       size_per_subchunk,
+                                       stream);
+        next_delim_pos = find_first_delimiter(readbufspan, '\n', stream) + buffer_offset;
+        next_subchunk_start += size_per_subchunk;
+      }
+      if (next_delim_pos < buffer_offset) {
+        if (next_subchunk_start >= total_source_size) {
+          // If we have reached the end of source list but the source does not terminate with a
+          // newline character
+          next_delim_pos = buffer_offset + readbufspan.size();
+        } else {
+          // Our buffer_size estimate is insufficient to read until the end of the line! We need to
+          // allocate more memory and try again!
+          num_subchunks_prealloced *= 2;
+          buffer_size = reader_compression != compression_type::NONE
+                          ? 2 * buffer_size
+                          : std::min(total_source_size,
+                                     buffer_size + num_subchunks_prealloced * size_per_subchunk) +
+                              num_extra_delimiters;
+          buffer.resize(buffer_size, stream);
+          bufspan = device_span<char>(reinterpret_cast<char*>(buffer.data()), buffer.size());
+        }
+      }
     }
-    if (next_delim_pos < buffer_offset) next_delim_pos = buffer_offset + readbufspan.size();
 
     return datasource::owning_buffer<rmm::device_buffer>(
       std::move(buffer),
diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp
index 576a698ba31..c26e5ca3edb 100644
--- a/cpp/tests/io/json/json_test.cpp
+++ b/cpp/tests/io/json/json_test.cpp
@@ -680,6 +680,53 @@ TEST_F(JsonReaderTest, JsonLinesByteRange)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), int64_wrapper{{3000, 4000, 5000}});
 }
 
+TEST_F(JsonReaderTest, JsonLinesByteRangeWithRealloc)
+{
+  std::string long_string     = "haha";
+  std::size_t log_repetitions = 12;
+  long_string.reserve(long_string.size() * (1UL << log_repetitions));
+  for (std::size_t i = 0; i < log_repetitions; i++) {
+    long_string += long_string;
+  }
+
+  auto json_string = [&long_string]() {
+    std::string json_string   = R"(
+      { "a": { "y" : 6}, "b" : [1, 2, 3], "c": 11 }
+      { "a": { "y" : 6}, "b" : [4, 5   ], "c": 12 }
+      { "a": { "y" : 6}, "b" : [6      ], "c": 13 }
+      { "a": { "y" : 6}, "b" : [7      ], "c": 14 })";
+    std::string replace_chars = "c";
+    std::size_t pos           = json_string.find(replace_chars);
+    while (pos != std::string::npos) {
+      // Replace the substring with the specified string
+      json_string.replace(pos, replace_chars.size(), long_string);
+
+      // Find the next occurrence of the substring
+      pos = json_string.find(replace_chars, pos + long_string.size());
+    }
+    return json_string;
+  }();
+
+  // Initialize parsing options (reading json lines). Set byte range offset and size so as to read
+  // the second row of input
+  cudf::io::json_reader_options json_lines_options =
+    cudf::io::json_reader_options::builder(
+      cudf::io::source_info{cudf::host_span<std::byte>(
+        reinterpret_cast<std::byte*>(json_string.data()), json_string.size())})
+      .lines(true)
+      .compression(cudf::io::compression_type::NONE)
+      .recovery_mode(cudf::io::json_recovery_mode_t::FAIL)
+      .byte_range_offset(16430)
+      .byte_range_size(30);
+
+  // Read full test data via existing, nested JSON lines reader
+  cudf::io::table_with_metadata result = cudf::io::read_json(json_lines_options);
+
+  EXPECT_EQ(result.tbl->num_columns(), 3);
+  EXPECT_EQ(result.tbl->num_rows(), 1);
+  EXPECT_EQ(result.metadata.schema_info[2].name, long_string);
+}
+
 TEST_F(JsonReaderTest, JsonLinesMultipleFilesByteRange_AcrossFiles)
 {
   const std::string file1 = temp_env->get_temp_dir() + "JsonLinesMultipleFilesByteRangeTest1.json";

From c6c720f48815ec93a543cb42fbb128d3c0eb983e Mon Sep 17 00:00:00 2001
From: Charles Blackmon-Luca <20627856+charlesbluca@users.noreply.github.com>
Date: Fri, 30 Aug 2024 16:47:26 -0400
Subject: [PATCH 759/842] Implement exposed null mask APIs in pylibcudf
 (#15908)

Contributes to https://github.com/rapidsai/cudf/issues/15162

Authors:
  - Charles Blackmon-Luca (https://github.com/charlesbluca)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/15908
---
 docs/cudf/source/conf.py                      |   2 +
 .../user_guide/api_docs/pylibcudf/index.rst   |   1 +
 .../api_docs/pylibcudf/null_mask.rst          |   6 +
 python/cudf/cudf/_lib/null_mask.pyx           | 103 +++----------
 python/pylibcudf/pylibcudf/CMakeLists.txt     |   1 +
 python/pylibcudf/pylibcudf/__init__.pxd       |   2 +
 python/pylibcudf/pylibcudf/__init__.py        |   2 +
 .../pylibcudf/pylibcudf/libcudf/null_mask.pxd |   2 -
 python/pylibcudf/pylibcudf/null_mask.pxd      |  18 +++
 python/pylibcudf/pylibcudf/null_mask.pyx      | 142 ++++++++++++++++++
 .../pylibcudf/tests/test_null_mask.py         |  59 ++++++++
 11 files changed, 252 insertions(+), 86 deletions(-)
 create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/null_mask.rst
 create mode 100644 python/pylibcudf/pylibcudf/null_mask.pxd
 create mode 100644 python/pylibcudf/pylibcudf/null_mask.pyx
 create mode 100644 python/pylibcudf/pylibcudf/tests/test_null_mask.py

diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py
index 43e2d6031bc..c58bc42327c 100644
--- a/docs/cudf/source/conf.py
+++ b/docs/cudf/source/conf.py
@@ -342,6 +342,7 @@ def clean_all_xml_files(path):
     "cudf.Series": ("cudf.core.series.Series", "cudf.Series"),
     "cudf.Index": ("cudf.core.index.Index", "cudf.Index"),
     "cupy.core.core.ndarray": ("cupy.ndarray", "cupy.ndarray"),
+    "DeviceBuffer": ("rmm._lib.device_buffer.DeviceBuffer", "rmm.DeviceBuffer"),
 }
 
 
@@ -383,6 +384,7 @@ def _generate_namespaces(namespaces):
     # Cython types that don't alias cleanly because of
     # https://github.com/cython/cython/issues/5609
     "size_type",
+    "size_t",
     "type_id",
     # Unknown base types
     "int32_t",
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
index 505765bba0f..6a2b66e8ea0 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
@@ -23,6 +23,7 @@ This page provides API documentation for pylibcudf.
     join
     lists
     merge
+    null_mask
     quantiles
     reduce
     replace
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/null_mask.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/null_mask.rst
new file mode 100644
index 00000000000..4799c62eace
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/null_mask.rst
@@ -0,0 +1,6 @@
+=========
+null_mask
+=========
+
+.. automodule:: pylibcudf.null_mask
+   :members:
diff --git a/python/cudf/cudf/_lib/null_mask.pyx b/python/cudf/cudf/_lib/null_mask.pyx
index 3a7b6a59bf3..d54e8e66281 100644
--- a/python/cudf/cudf/_lib/null_mask.pyx
+++ b/python/cudf/cudf/_lib/null_mask.pyx
@@ -1,39 +1,11 @@
 # Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
-from enum import Enum
-
-from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer
+import pylibcudf
+from pylibcudf.null_mask import MaskState
 
 from cudf.core.buffer import acquire_spill_lock, as_buffer
 
-from libcpp.memory cimport make_unique, unique_ptr
-from libcpp.pair cimport pair
-from libcpp.utility cimport move
-
-from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.null_mask cimport (
-    bitmask_allocation_size_bytes as cpp_bitmask_allocation_size_bytes,
-    bitmask_and as cpp_bitmask_and,
-    bitmask_or as cpp_bitmask_or,
-    copy_bitmask as cpp_copy_bitmask,
-    create_null_mask as cpp_create_null_mask,
-    underlying_type_t_mask_state,
-)
-from pylibcudf.libcudf.table.table_view cimport table_view
-from pylibcudf.libcudf.types cimport mask_state, size_type
-
 from cudf._lib.column cimport Column
-from cudf._lib.utils cimport table_view_from_columns
-
-
-class MaskState(Enum):
-    """
-    Enum for null mask creation state
-    """
-    UNALLOCATED = <underlying_type_t_mask_state> mask_state.UNALLOCATED
-    UNINITIALIZED = <underlying_type_t_mask_state> mask_state.UNINITIALIZED
-    ALL_VALID = <underlying_type_t_mask_state> mask_state.ALL_VALID
-    ALL_NULL = <underlying_type_t_mask_state> mask_state.ALL_NULL
 
 
 @acquire_spill_lock()
@@ -45,33 +17,20 @@ def copy_bitmask(Column col):
     if col.base_mask is None:
         return None
 
-    cdef column_view col_view = col.view()
-    cdef device_buffer db
-    cdef unique_ptr[device_buffer] up_db
-
-    with nogil:
-        db = move(cpp_copy_bitmask(col_view))
-        up_db = move(make_unique[device_buffer](move(db)))
-
-    rmm_db = DeviceBuffer.c_from_unique_ptr(move(up_db))
+    rmm_db = pylibcudf.null_mask.copy_bitmask(col.to_pylibcudf(mode="read"))
     buf = as_buffer(rmm_db)
     return buf
 
 
-def bitmask_allocation_size_bytes(size_type num_bits):
+def bitmask_allocation_size_bytes(num_bits):
     """
     Given a size, calculates the number of bytes that should be allocated for a
     column validity mask
     """
-    cdef size_t output_size
-
-    with nogil:
-        output_size = cpp_bitmask_allocation_size_bytes(num_bits)
+    return pylibcudf.null_mask.bitmask_allocation_size_bytes(num_bits)
 
-    return output_size
 
-
-def create_null_mask(size_type size, state=MaskState.UNINITIALIZED):
+def create_null_mask(size, state=MaskState.UNINITIALIZED):
     """
     Given a size and a mask state, allocate a mask that can properly represent
     the given size with the given mask state
@@ -83,48 +42,24 @@ def create_null_mask(size_type size, state=MaskState.UNINITIALIZED):
     state : ``MaskState``, default ``MaskState.UNINITIALIZED``
         State the null mask should be created in
     """
-    if not isinstance(state, MaskState):
-        raise TypeError(
-            "`state` is required to be of type `MaskState`, got "
-            + (type(state).__name__)
-        )
-
-    cdef device_buffer db
-    cdef unique_ptr[device_buffer] up_db
-    cdef mask_state c_mask_state = <mask_state>(
-        <underlying_type_t_mask_state>(state.value)
-    )
-
-    with nogil:
-        db = move(cpp_create_null_mask(size, c_mask_state))
-        up_db = move(make_unique[device_buffer](move(db)))
-
-    rmm_db = DeviceBuffer.c_from_unique_ptr(move(up_db))
+    rmm_db = pylibcudf.null_mask.create_null_mask(size, state)
     buf = as_buffer(rmm_db)
     return buf
 
 
 @acquire_spill_lock()
-def bitmask_and(columns: list):
-    cdef table_view c_view = table_view_from_columns(columns)
-    cdef pair[device_buffer, size_type] c_result
-    cdef unique_ptr[device_buffer] up_db
-    with nogil:
-        c_result = move(cpp_bitmask_and(c_view))
-        up_db = move(make_unique[device_buffer](move(c_result.first)))
-    dbuf = DeviceBuffer.c_from_unique_ptr(move(up_db))
-    buf = as_buffer(dbuf)
-    return buf, c_result.second
+def bitmask_and(list columns):
+    rmm_db, other = pylibcudf.null_mask.bitmask_and(
+        [col.to_pylibcudf(mode="read") for col in columns]
+    )
+    buf = as_buffer(rmm_db)
+    return buf, other
 
 
 @acquire_spill_lock()
-def bitmask_or(columns: list):
-    cdef table_view c_view = table_view_from_columns(columns)
-    cdef pair[device_buffer, size_type] c_result
-    cdef unique_ptr[device_buffer] up_db
-    with nogil:
-        c_result = move(cpp_bitmask_or(c_view))
-        up_db = move(make_unique[device_buffer](move(c_result.first)))
-    dbuf = DeviceBuffer.c_from_unique_ptr(move(up_db))
-    buf = as_buffer(dbuf)
-    return buf, c_result.second
+def bitmask_or(list columns):
+    rmm_db, other = pylibcudf.null_mask.bitmask_or(
+        [col.to_pylibcudf(mode="read") for col in columns]
+    )
+    buf = as_buffer(rmm_db)
+    return buf, other
diff --git a/python/pylibcudf/pylibcudf/CMakeLists.txt b/python/pylibcudf/pylibcudf/CMakeLists.txt
index f81a32e07f9..a4f17344cb0 100644
--- a/python/pylibcudf/pylibcudf/CMakeLists.txt
+++ b/python/pylibcudf/pylibcudf/CMakeLists.txt
@@ -29,6 +29,7 @@ set(cython_sources
     join.pyx
     lists.pyx
     merge.pyx
+    null_mask.pyx
     quantiles.pyx
     reduce.pyx
     replace.pyx
diff --git a/python/pylibcudf/pylibcudf/__init__.pxd b/python/pylibcudf/pylibcudf/__init__.pxd
index 71f523fc3cd..841efa59bda 100644
--- a/python/pylibcudf/pylibcudf/__init__.pxd
+++ b/python/pylibcudf/pylibcudf/__init__.pxd
@@ -15,6 +15,7 @@ from . cimport (
     join,
     lists,
     merge,
+    null_mask,
     quantiles,
     reduce,
     replace,
@@ -57,6 +58,7 @@ __all__ = [
     "join",
     "lists",
     "merge",
+    "null_mask",
     "quantiles",
     "reduce",
     "replace",
diff --git a/python/pylibcudf/pylibcudf/__init__.py b/python/pylibcudf/pylibcudf/__init__.py
index e784c6c6dd5..d3878a89a6a 100644
--- a/python/pylibcudf/pylibcudf/__init__.py
+++ b/python/pylibcudf/pylibcudf/__init__.py
@@ -26,6 +26,7 @@
     join,
     lists,
     merge,
+    null_mask,
     quantiles,
     reduce,
     replace,
@@ -69,6 +70,7 @@
     "join",
     "lists",
     "merge",
+    "null_mask",
     "quantiles",
     "reduce",
     "replace",
diff --git a/python/pylibcudf/pylibcudf/libcudf/null_mask.pxd b/python/pylibcudf/pylibcudf/libcudf/null_mask.pxd
index 3fc2c7e8f1e..5f582091b06 100644
--- a/python/pylibcudf/pylibcudf/libcudf/null_mask.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/null_mask.pxd
@@ -8,8 +8,6 @@ from pylibcudf.libcudf.types cimport bitmask_type, mask_state, size_type
 
 from rmm._lib.device_buffer cimport device_buffer
 
-ctypedef int32_t underlying_type_t_mask_state
-
 
 cdef extern from "cudf/null_mask.hpp" namespace "cudf" nogil:
     cdef device_buffer copy_bitmask "cudf::copy_bitmask" (
diff --git a/python/pylibcudf/pylibcudf/null_mask.pxd b/python/pylibcudf/pylibcudf/null_mask.pxd
new file mode 100644
index 00000000000..ab5c0080312
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/null_mask.pxd
@@ -0,0 +1,18 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from pylibcudf.libcudf.types cimport mask_state, size_type
+
+from rmm._lib.device_buffer cimport DeviceBuffer
+
+from .column cimport Column
+
+
+cpdef DeviceBuffer copy_bitmask(Column col)
+
+cpdef size_t bitmask_allocation_size_bytes(size_type number_of_bits)
+
+cpdef DeviceBuffer create_null_mask(size_type size, mask_state state = *)
+
+cpdef tuple bitmask_and(list columns)
+
+cpdef tuple bitmask_or(list columns)
diff --git a/python/pylibcudf/pylibcudf/null_mask.pyx b/python/pylibcudf/pylibcudf/null_mask.pyx
new file mode 100644
index 00000000000..5bdde06f21f
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/null_mask.pyx
@@ -0,0 +1,142 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport make_unique
+from libcpp.pair cimport pair
+from libcpp.utility cimport move
+from pylibcudf.libcudf cimport null_mask as cpp_null_mask
+from pylibcudf.libcudf.types cimport mask_state, size_type
+
+from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer
+
+from pylibcudf.libcudf.types import mask_state as MaskState  # no-cython-lint
+
+from .column cimport Column
+from .table cimport Table
+
+
+cdef DeviceBuffer buffer_to_python(device_buffer buf):
+    return DeviceBuffer.c_from_unique_ptr(make_unique[device_buffer](move(buf)))
+
+
+cpdef DeviceBuffer copy_bitmask(Column col):
+    """Copies ``col``'s bitmask into a ``DeviceBuffer``.
+
+    For details, see :cpp:func:`copy_bitmask`.
+
+    Parameters
+    ----------
+    col : Column
+        Column whose bitmask needs to be copied
+
+    Returns
+    -------
+    rmm.DeviceBuffer
+        A ``DeviceBuffer`` containing ``col``'s bitmask, or an empty ``DeviceBuffer``
+        if ``col`` is not nullable
+    """
+    cdef device_buffer db
+
+    with nogil:
+        db = move(cpp_null_mask.copy_bitmask(col.view()))
+
+    return buffer_to_python(move(db))
+
+cpdef size_t bitmask_allocation_size_bytes(size_type number_of_bits):
+    """
+    Computes the required bytes necessary to represent the specified number of bits
+    with a 64B padding boundary.
+
+    For details, see :cpp:func:`bitmask_allocation_size_bytes`.
+
+    Parameters
+    ----------
+    number_of_bits : size_type
+        The number of bits that need to be represented
+
+    Returns
+    -------
+    size_t
+        The necessary number of bytes
+    """
+    with nogil:
+        return cpp_null_mask.bitmask_allocation_size_bytes(number_of_bits)
+
+
+cpdef DeviceBuffer create_null_mask(
+    size_type size,
+    mask_state state = mask_state.UNINITIALIZED
+):
+    """Creates a ``DeviceBuffer`` for use as a null value indicator bitmask of a
+    ``Column``.
+
+    For details, see :cpp:func:`create_null_mask`.
+
+    Parameters
+    ----------
+    size : size_type
+        The number of elements to be represented by the mask
+    state : mask_state, optional
+        The desired state of the mask. Can be one of { MaskState.UNALLOCATED,
+        MaskState.UNINITIALIZED, MaskState.ALL_VALID, MaskState.ALL_NULL }
+        (default MaskState.UNINITIALIZED)
+
+    Returns
+    -------
+    rmm.DeviceBuffer
+        A ``DeviceBuffer`` for use as a null bitmask satisfying the desired size and
+        state
+    """
+    cdef device_buffer db
+
+    with nogil:
+        db = move(cpp_null_mask.create_null_mask(size, state))
+
+    return buffer_to_python(move(db))
+
+
+cpdef tuple bitmask_and(list columns):
+    """Performs bitwise AND of the bitmasks of a list of columns.
+
+    For details, see :cpp:func:`bitmask_and`.
+
+    Parameters
+    ----------
+    columns : list
+        The list of columns
+
+    Returns
+    -------
+    tuple[DeviceBuffer, size_type]
+        A tuple of the resulting mask and count of unset bits
+    """
+    cdef Table c_table = Table(columns)
+    cdef pair[device_buffer, size_type] c_result
+
+    with nogil:
+        c_result = move(cpp_null_mask.bitmask_and(c_table.view()))
+
+    return buffer_to_python(move(c_result.first)), c_result.second
+
+
+cpdef tuple bitmask_or(list columns):
+    """Performs bitwise OR of the bitmasks of a list of columns.
+
+    For details, see :cpp:func:`bitmask_or`.
+
+    Parameters
+    ----------
+    columns : list
+        The list of columns
+
+    Returns
+    -------
+    tuple[DeviceBuffer, size_type]
+        A tuple of the resulting mask and count of unset bits
+    """
+    cdef Table c_table = Table(columns)
+    cdef pair[device_buffer, size_type] c_result
+
+    with nogil:
+        c_result = move(cpp_null_mask.bitmask_or(c_table.view()))
+
+    return buffer_to_python(move(c_result.first)), c_result.second
diff --git a/python/pylibcudf/pylibcudf/tests/test_null_mask.py b/python/pylibcudf/pylibcudf/tests/test_null_mask.py
new file mode 100644
index 00000000000..3edcae59edc
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_null_mask.py
@@ -0,0 +1,59 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pylibcudf as plc
+import pytest
+from pylibcudf.null_mask import MaskState
+
+import rmm
+
+
+@pytest.fixture(params=[False, True])
+def nullable(request):
+    return request.param
+
+
+@pytest.fixture(params=["float32", "float64"])
+def column(request, nullable):
+    values = [2.5, 2.49, 1.6, 8, -1.5, -1.7, -0.5, 0.5]
+    typ = {"float32": pa.float32(), "float64": pa.float64()}[request.param]
+    if nullable:
+        values[2] = None
+    return plc.interop.from_arrow(pa.array(values, type=typ))
+
+
+def test_copy_bitmask(column, nullable):
+    expected = column.null_mask().obj if nullable else rmm.DeviceBuffer()
+    got = plc.null_mask.copy_bitmask(column)
+
+    assert expected.size == got.size
+    assert expected.tobytes() == got.tobytes()
+
+
+def test_bitmask_allocation_size_bytes():
+    assert plc.null_mask.bitmask_allocation_size_bytes(0) == 0
+    assert plc.null_mask.bitmask_allocation_size_bytes(1) == 64
+    assert plc.null_mask.bitmask_allocation_size_bytes(512) == 64
+    assert plc.null_mask.bitmask_allocation_size_bytes(513) == 128
+    assert plc.null_mask.bitmask_allocation_size_bytes(1024) == 128
+    assert plc.null_mask.bitmask_allocation_size_bytes(1025) == 192
+
+
+@pytest.mark.parametrize("size", [0, 1, 512, 1024])
+@pytest.mark.parametrize(
+    "state",
+    [
+        MaskState.UNALLOCATED,
+        MaskState.UNINITIALIZED,
+        MaskState.ALL_VALID,
+        MaskState.ALL_NULL,
+    ],
+)
+def test_create_null_mask(size, state):
+    mask = plc.null_mask.create_null_mask(size, state)
+
+    assert mask.size == (
+        0
+        if state == MaskState.UNALLOCATED
+        else plc.null_mask.bitmask_allocation_size_bytes(size)
+    )

From 5e420ff63ba2997a37bf5dfbfaa73c5f05225f9d Mon Sep 17 00:00:00 2001
From: Kyle Edwards <kyedwards@nvidia.com>
Date: Fri, 30 Aug 2024 17:44:03 -0400
Subject: [PATCH 760/842] Use merge base when calculating changed files
 (#16709)

`get-pr-info.outputs.base.sha` does not actually give the merge base, but merely the tip of the target branch. Calculate the merge base and pass it to the `changed-files` step.

Authors:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)

Approvers:
  - Ray Douglass (https://github.com/raydouglass)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16709
---
 .github/workflows/pr.yaml | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 35c7e3d95b6..0d79568f589 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -56,14 +56,21 @@ jobs:
       - name: Checkout code repo
         uses: actions/checkout@v4
         with:
-          ref: ${{ inputs.sha }}
-          fetch-depth: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).commits }}
+          fetch-depth: 0
           persist-credentials: false
+      - name: Calculate merge base
+        id: calculate-merge-base
+        env:
+          PR_SHA: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).head.sha }}
+          BASE_SHA: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.sha }}
+        run: |
+          (echo -n "merge-base="; git merge-base "$BASE_SHA" "$PR_SHA") > "$GITHUB_OUTPUT"
       - name: Get changed files
         id: changed-files
         uses: tj-actions/changed-files@v45
         with:
-          base_sha: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).base.sha }}
+          base_sha: ${{ steps.calculate-merge-base.outputs.merge-base }}
+          sha: ${{ fromJSON(steps.get-pr-info.outputs.pr-info).head.sha }}
           files_yaml: |
             cpp:
               - '**'

From 4ad4b2347160212b10f394719f575c6e477f129e Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Sat, 31 Aug 2024 11:39:01 -0500
Subject: [PATCH 761/842] remove some unnecessary libcudf nightly builds
 (#16714)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Follow-up to #16650 and #15483.

`libcudf` wheels are identical (same content, same filename) across Python versions, but due to an oversight in the PRs linked above, we're currently building nightlies of them once per Python version supported by RAPIDS 😭

You can see this on recent runs of the `build` workflow:

<img width="752" alt="image" src="https://github.com/user-attachments/assets/ba3a2192-1752-4d32-a79b-6f238fae9f18">

([build link](https://github.com/rapidsai/cudf/actions/runs/10627299703/job/29460218854))

This PR fixes that by applying the same matrix filter to `libcudf` nightly build jobs as is currently applied to PR jobs:

https://github.com/rapidsai/cudf/blob/5e420ff63ba2997a37bf5dfbfaa73c5f05225f9d/.github/workflows/pr.yaml#L195-L200

Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16714
---
 .github/workflows/build.yaml | 2 ++
 .github/workflows/pr.yaml    | 1 +
 2 files changed, 3 insertions(+)

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 72daff7b66b..b5d17022a3a 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -71,6 +71,8 @@ jobs:
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
     with:
+      # build for every combination of arch and CUDA version, but only for the latest Python
+      matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber)))
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
       sha: ${{ inputs.sha }}
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 0d79568f589..8730804e8b6 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -197,6 +197,7 @@ jobs:
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
     with:
+      # build for every combination of arch and CUDA version, but only for the latest Python
       matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber)))
       build_type: pull-request
       script: "ci/build_wheel_libcudf.sh"

From 76059580abb7a60128545d6ed977c942ea39b3be Mon Sep 17 00:00:00 2001
From: "Robert (Bobby) Evans" <bobby@apache.org>
Date: Sun, 1 Sep 2024 11:56:27 -0500
Subject: [PATCH 762/842] Remove java
 ColumnView.copyWithBooleanColumnAsValidity (#16660)

This depends on https://github.com/NVIDIA/spark-rapids/pull/11399

Essentially ifElse is faster than this API and this API is not safe to use generically.

https://github.com/NVIDIA/spark-rapids/issues/11397#issuecomment-2310570124

So I am removing it after replacing all calls to it with calls to `ifElse/cudf::copy_if_else`

Authors:
  - Robert (Bobby) Evans (https://github.com/revans2)

Approvers:
  - Alessandro Bellina (https://github.com/abellina)
  - Mike Wilson (https://github.com/hyperbolic2346)

URL: https://github.com/rapidsai/cudf/pull/16660
---
 .../main/java/ai/rapids/cudf/ColumnView.java  | 38 -----------------
 java/src/main/native/src/ColumnViewJni.cpp    | 15 -------
 java/src/main/native/src/ColumnViewJni.cu     | 31 --------------
 java/src/main/native/src/ColumnViewJni.hpp    | 16 -------
 .../java/ai/rapids/cudf/ColumnVectorTest.java | 42 +------------------
 5 files changed, 1 insertion(+), 141 deletions(-)

diff --git a/java/src/main/java/ai/rapids/cudf/ColumnView.java b/java/src/main/java/ai/rapids/cudf/ColumnView.java
index 8ff2f0f0a73..6bd4e06c47e 100644
--- a/java/src/main/java/ai/rapids/cudf/ColumnView.java
+++ b/java/src/main/java/ai/rapids/cudf/ColumnView.java
@@ -913,25 +913,6 @@ public final ColumnVector mergeAndSetValidity(BinaryOp mergeOp, ColumnView... co
     return new ColumnVector(bitwiseMergeAndSetValidity(getNativeView(), columnViews, mergeOp.nativeId));
   }
 
-  /**
-   * Creates a deep copy of a column while replacing the validity mask. The validity mask is the
-   * device_vector equivalent of the boolean column given as argument.
-   *
-   * The boolColumn must have the same number of rows as the current column.
-   * The result column will have the same number of rows as the current column.
-   * For all indices `i` where the boolColumn is `true`, the result column will have a valid value at index i.
-   * For all other values (i.e. `false` or `null`), the result column will have nulls.
-   *
-   * If the current column has a null at a given index `i`, and the new validity mask is `true` at index `i`,
-   * then the row value is undefined.
-   *
-   * @param boolColumn bool column whose value is to be used as the validity mask.
-   * @return Deep copy of the column with replaced validity mask.
-   */
-  public final ColumnVector copyWithBooleanColumnAsValidity(ColumnView boolColumn) {
-    return new ColumnVector(copyWithBooleanColumnAsValidity(getNativeView(), boolColumn.getNativeView()));
-  }
-
   /////////////////////////////////////////////////////////////////////////////
   // DATE/TIME
   /////////////////////////////////////////////////////////////////////////////
@@ -4767,25 +4748,6 @@ private static native long clamper(long nativeView, long loScalarHandle, long lo
   private static native long bitwiseMergeAndSetValidity(long baseHandle, long[] viewHandles,
                                                         int nullConfig) throws CudfException;
 
-  /**
-   * Native method to deep copy a column while replacing the null mask. The null mask is the
-   * device_vector equivalent of the boolean column given as argument.
-   *
-   * The boolColumn must have the same number of rows as the exemplar column.
-   * The result column will have the same number of rows as the exemplar.
-   * For all indices `i` where the boolean column is `true`, the result column will have a valid value at index i.
-   * For all other values (i.e. `false` or `null`), the result column will have nulls.
-   *
-   * If the exemplar column has a null at a given index `i`, and the new validity mask is `true` at index `i`,
-   * then the resultant row value is undefined.
-   *
-   * @param exemplarViewHandle column view of the column that is deep copied.
-   * @param boolColumnViewHandle bool column whose value is to be used as the null mask.
-   * @return Deep copy of the column with replaced null mask.
-   */
-  private static native long copyWithBooleanColumnAsValidity(long exemplarViewHandle,
-                                                             long boolColumnViewHandle) throws CudfException;
-
   ////////
   // Native cudf::column_view life cycle and metadata access methods. Life cycle methods
   // should typically only be called from the OffHeap inner class.
diff --git a/java/src/main/native/src/ColumnViewJni.cpp b/java/src/main/native/src/ColumnViewJni.cpp
index 4551325ebb1..72f0ad19912 100644
--- a/java/src/main/native/src/ColumnViewJni.cpp
+++ b/java/src/main/native/src/ColumnViewJni.cpp
@@ -2090,21 +2090,6 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_bitwiseMergeAndSetValidit
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_copyWithBooleanColumnAsValidity(
-  JNIEnv* env, jobject j_object, jlong exemplar_handle, jlong validity_column_handle)
-{
-  JNI_NULL_CHECK(env, exemplar_handle, "ColumnView handle is null", 0);
-  JNI_NULL_CHECK(env, validity_column_handle, "Validity column handle is null", 0);
-  try {
-    cudf::jni::auto_set_device(env);
-    auto const exemplar = *reinterpret_cast<cudf::column_view*>(exemplar_handle);
-    auto const validity = *reinterpret_cast<cudf::column_view*>(validity_column_handle);
-    return release_as_jlong(
-      cudf::jni::new_column_with_boolean_column_as_validity(exemplar, validity));
-  }
-  CATCH_STD(env, 0);
-}
-
 ////////
 // Native cudf::column_view life cycle and metadata access methods. Life cycle methods
 // should typically only be called from the CudfColumn inner class.
diff --git a/java/src/main/native/src/ColumnViewJni.cu b/java/src/main/native/src/ColumnViewJni.cu
index 2dbff923544..46261b087ae 100644
--- a/java/src/main/native/src/ColumnViewJni.cu
+++ b/java/src/main/native/src/ColumnViewJni.cu
@@ -43,37 +43,6 @@
 
 namespace cudf::jni {
 
-std::unique_ptr<cudf::column> new_column_with_boolean_column_as_validity(
-  cudf::column_view const& exemplar, cudf::column_view const& validity_column)
-{
-  CUDF_EXPECTS(validity_column.type().id() == type_id::BOOL8,
-               "Validity column must be of type bool");
-  CUDF_EXPECTS(validity_column.size() == exemplar.size(),
-               "Exemplar and validity columns must have the same size");
-
-  auto validity_device_view = cudf::column_device_view::create(validity_column);
-  auto validity_begin       = cudf::detail::make_optional_iterator<bool>(
-    *validity_device_view, cudf::nullate::DYNAMIC{validity_column.has_nulls()});
-  auto validity_end            = validity_begin + validity_device_view->size();
-  auto [null_mask, null_count] = cudf::detail::valid_if(
-    validity_begin,
-    validity_end,
-    [] __device__(auto optional_bool) { return optional_bool.value_or(false); },
-    cudf::get_default_stream(),
-    rmm::mr::get_current_device_resource());
-  auto const exemplar_without_null_mask =
-    cudf::column_view{exemplar.type(),
-                      exemplar.size(),
-                      exemplar.head<void>(),
-                      nullptr,
-                      0,
-                      exemplar.offset(),
-                      std::vector<cudf::column_view>{exemplar.child_begin(), exemplar.child_end()}};
-  auto deep_copy = std::make_unique<cudf::column>(exemplar_without_null_mask);
-  deep_copy->set_null_mask(std::move(null_mask), null_count);
-  return deep_copy;
-}
-
 std::unique_ptr<cudf::column> generate_list_offsets(cudf::column_view const& list_length,
                                                     rmm::cuda_stream_view stream)
 {
diff --git a/java/src/main/native/src/ColumnViewJni.hpp b/java/src/main/native/src/ColumnViewJni.hpp
index c9eef0139ea..c8c441e8fae 100644
--- a/java/src/main/native/src/ColumnViewJni.hpp
+++ b/java/src/main/native/src/ColumnViewJni.hpp
@@ -22,22 +22,6 @@
 
 namespace cudf::jni {
 
-/**
- * @brief Creates a deep copy of the exemplar column, with its validity set to the equivalent
- * of the boolean `validity` column's value.
- *
- * The bool_column must have the same number of rows as the exemplar column.
- * The result column will have the same number of rows as the exemplar.
- * For all indices `i` where the boolean column is `true`, the result column will have a valid value
- * at index i. For all other values (i.e. `false` or `null`), the result column will have nulls.
- *
- * @param exemplar The column to be deep copied.
- * @param bool_column bool column whose value is to be used as the validity.
- * @return Deep copy of the exemplar, with the replaced validity.
- */
-std::unique_ptr<cudf::column> new_column_with_boolean_column_as_validity(
-  cudf::column_view const& exemplar, cudf::column_view const& bool_column);
-
 /**
  * @brief Generates list offsets with lengths of each list.
  *
diff --git a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
index 7136b162c13..708744569df 100644
--- a/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
+++ b/java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
@@ -1,6 +1,6 @@
 /*
  *
- *  Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ *  Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -6395,46 +6395,6 @@ void testReplaceSameIndexColumnInStruct() {
     assertTrue(e.getMessage().contains("Duplicate mapping found for replacing child index"));
   }
 
-  @Test
-  void testCopyWithBooleanColumnAsValidity() {
-    final Boolean T = true;
-    final Boolean F = false;
-    final Integer X = null;
-
-    // Straight-line: Invalidate every other row.
-    try (ColumnVector exemplar = ColumnVector.fromBoxedInts(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
-         ColumnVector validity = ColumnVector.fromBoxedBooleans(F, T, F, T, F, T, F, T, F, T);
-         ColumnVector expected = ColumnVector.fromBoxedInts(X, 2, X, 4, X, 6, X, 8, X, 10);
-         ColumnVector result = exemplar.copyWithBooleanColumnAsValidity(validity)) {
-      assertColumnsAreEqual(expected, result);
-    }
-
-    // Straight-line: Invalidate all Rows.
-    try (ColumnVector exemplar = ColumnVector.fromBoxedInts(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
-         ColumnVector validity = ColumnVector.fromBoxedBooleans(F, F, F, F, F, F, F, F, F, F);
-         ColumnVector expected = ColumnVector.fromBoxedInts(X, X, X, X, X, X, X, X, X, X);
-         ColumnVector result = exemplar.copyWithBooleanColumnAsValidity(validity)) {
-      assertColumnsAreEqual(expected, result);
-    }
-
-    // Nulls in the validity column are treated as invalid.
-    try (ColumnVector exemplar = ColumnVector.fromBoxedInts(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
-         ColumnVector validity = ColumnVector.fromBoxedBooleans(F, T, F, T, F, T, F, null, F, null);
-         ColumnVector expected = ColumnVector.fromBoxedInts(X, 2, X, 4, X, 6, X, X, X, X);
-         ColumnVector result = exemplar.copyWithBooleanColumnAsValidity(validity)) {
-      assertColumnsAreEqual(expected, result);
-    }
-
-    // Negative case: Mismatch in row count.
-    Exception x = assertThrows(CudfException.class, () ->  {
-      try (ColumnVector exemplar = ColumnVector.fromBoxedInts(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
-         ColumnVector validity = ColumnVector.fromBoxedBooleans(F, T, F, T);
-         ColumnVector result = exemplar.copyWithBooleanColumnAsValidity(validity)) {
-      }
-    });
-    assertTrue(x.getMessage().contains("Exemplar and validity columns must have the same size"));
-  }
-
   @Test
   void testSegmentedGather() {
     HostColumnVector.DataType dt = new ListType(true, new BasicType(true, DType.STRING));

From 557aabf8d0be528881aadb9795e6d92790a085a8 Mon Sep 17 00:00:00 2001
From: Alessandro Bellina <abellina@nvidia.com>
Date: Tue, 3 Sep 2024 11:43:05 -0500
Subject: [PATCH 763/842] Ensure we pass the has_nulls tparam to mixed_join
 kernels (#16708)

Fixes https://github.com/rapidsai/cudf/issues/16706

I'll build/test our stack with this change, but it looks like a typo.

If there's a quick unit test we can add I'd be happy to hear recommendations or for someone else to follow on with such a test.

Authors:
  - Alessandro Bellina (https://github.com/abellina)

Approvers:
  - Mike Wilson (https://github.com/hyperbolic2346)
  - Nghia Truong (https://github.com/ttnghia)
  - David Wendt (https://github.com/davidwendt)
  - Bradley Dice (https://github.com/bdice)
  - MithunR (https://github.com/mythrocks)

URL: https://github.com/rapidsai/cudf/pull/16708
---
 cpp/src/join/mixed_join_kernel.cuh      | 2 +-
 cpp/src/join/mixed_join_size_kernel.cuh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/join/mixed_join_kernel.cuh b/cpp/src/join/mixed_join_kernel.cuh
index 9d011d43de6..368b1fba870 100644
--- a/cpp/src/join/mixed_join_kernel.cuh
+++ b/cpp/src/join/mixed_join_kernel.cuh
@@ -130,7 +130,7 @@ void launch_mixed_join(table_device_view left_table,
                        int64_t shmem_size_per_block,
                        rmm::cuda_stream_view stream)
 {
-  mixed_join<DEFAULT_JOIN_BLOCK_SIZE, true>
+  mixed_join<DEFAULT_JOIN_BLOCK_SIZE, has_nulls>
     <<<config.num_blocks, config.num_threads_per_block, shmem_size_per_block, stream.value()>>>(
       left_table,
       right_table,
diff --git a/cpp/src/join/mixed_join_size_kernel.cuh b/cpp/src/join/mixed_join_size_kernel.cuh
index a1066e32331..84e9be45030 100644
--- a/cpp/src/join/mixed_join_size_kernel.cuh
+++ b/cpp/src/join/mixed_join_size_kernel.cuh
@@ -124,7 +124,7 @@ std::size_t launch_compute_mixed_join_output_size(
   // Allocate storage for the counter used to get the size of the join output
   rmm::device_scalar<std::size_t> size(0, stream, mr);
 
-  compute_mixed_join_output_size<DEFAULT_JOIN_BLOCK_SIZE, true>
+  compute_mixed_join_output_size<DEFAULT_JOIN_BLOCK_SIZE, has_nulls>
     <<<config.num_blocks, config.num_threads_per_block, shmem_size_per_block, stream.value()>>>(
       left_table,
       right_table,

From 25779d95d413e0ddf9379dee22e36eea7bf5f08e Mon Sep 17 00:00:00 2001
From: Jason Lowe <jlowe@nvidia.com>
Date: Tue, 3 Sep 2024 12:24:36 -0500
Subject: [PATCH 764/842] Add boost-devel to Java CI Docker image (#16707)

Fixes #16678.  Adds the boost-devel package to the Java CI Docker environment now that the Boost headers are not being picked up implicitly after libcudf dropped the Arrow dependency in #16640.  libcudfjni still requires Arrow for now, and thus requires Boost headers.

Authors:
  - Jason Lowe (https://github.com/jlowe)

Approvers:
  - Alessandro Bellina (https://github.com/abellina)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16707
---
 java/ci/Dockerfile.rocky | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/java/ci/Dockerfile.rocky b/java/ci/Dockerfile.rocky
index 6b87f3ed34e..152af22f7e4 100644
--- a/java/ci/Dockerfile.rocky
+++ b/java/ci/Dockerfile.rocky
@@ -28,7 +28,7 @@ ARG TARGETPLATFORM=linux/amd64
 FROM --platform=$TARGETPLATFORM nvidia/cuda:$CUDA_VERSION-devel-rockylinux$OS_RELEASE
 ARG TOOLSET_VERSION=11
 ### Install basic requirements
-RUN dnf --enablerepo=powertools install -y  scl-utils gcc-toolset-${TOOLSET_VERSION} git zlib-devel maven tar wget patch ninja-build
+RUN dnf --enablerepo=powertools install -y  scl-utils gcc-toolset-${TOOLSET_VERSION} git zlib-devel maven tar wget patch ninja-build boost-devel
 ## pre-create the CMAKE_INSTALL_PREFIX folder, set writable by any user for Jenkins
 RUN mkdir /usr/local/rapids /rapids && chmod 777 /usr/local/rapids /rapids
 

From 0097b454254ac30739c59dee8f29a91e6643360b Mon Sep 17 00:00:00 2001
From: Hirota Akio <33370421+a-hirota@users.noreply.github.com>
Date: Wed, 4 Sep 2024 02:28:16 +0900
Subject: [PATCH 765/842] Fix typo in column_factories.hpp comment from 'depth
 1' to 'depth 2' (#16700)

This PR fixes a typo in the `cpp/include/cudf/column/column_factories.hpp` file. The comment incorrectly mentioned "data (depth 1)" instead of "data (depth 2)". This correction improves code clarity and documentation accuracy.

Authors:
  - Hirota Akio (https://github.com/a-hirota)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/16700
---
 cpp/include/cudf/column/column_factories.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/include/cudf/column/column_factories.hpp b/cpp/include/cudf/column/column_factories.hpp
index c1f295b7ea8..b2dcb25acb5 100644
--- a/cpp/include/cudf/column/column_factories.hpp
+++ b/cpp/include/cudf/column/column_factories.hpp
@@ -469,7 +469,7 @@ std::unique_ptr<column> make_strings_column(size_type num_strings,
  * offsets (depth 1)   {0, 2, 5, 7}
  * data    (depth 1)
  * offsets (depth 2)
- * data    (depth 1)   {1, 2, 3, 4, 5, 6, 7}
+ * data    (depth 2)   {1, 2, 3, 4, 5, 6, 7}
  * @endcode
  *
  * @param[in] num_rows The number of lists the column represents.

From e18b537315c07b73d1eb26354208249605e3e8be Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 3 Sep 2024 08:30:15 -1000
Subject: [PATCH 766/842] Use Series._from_column more consistently to avoid
 validation (#16716)

This modifies cases where `_from_column` provided the same logic or where 1 column was produced so `._from_column` was valid to use

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16716
---
 python/cudf/cudf/_lib/text.pyx     |  2 +-
 python/cudf/cudf/core/dataframe.py | 15 ++++-----------
 python/cudf/cudf/core/series.py    | 14 ++++++--------
 python/cudf/cudf/io/text.py        |  2 +-
 4 files changed, 12 insertions(+), 21 deletions(-)

diff --git a/python/cudf/cudf/_lib/text.pyx b/python/cudf/cudf/_lib/text.pyx
index ece69b424bb..b2c7232f549 100644
--- a/python/cudf/cudf/_lib/text.pyx
+++ b/python/cudf/cudf/_lib/text.pyx
@@ -86,4 +86,4 @@ def read_text(object filepaths_or_buffers,
             delim,
             c_options))
 
-    return {None: Column.from_unique_ptr(move(c_col))}
+    return Column.from_unique_ptr(move(c_col))
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 0d632f4775f..7a171fe9e05 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -473,15 +473,8 @@ def __getitem__(self, arg):
         ca = self._frame._data
         index = self._frame.index
         if col_is_scalar:
-            s = Series._from_data(
-                data=ColumnAccessor(
-                    {key: ca._data[key] for key in column_names},
-                    multiindex=ca.multiindex,
-                    level_names=ca.level_names,
-                    verify=False,
-                ),
-                index=index,
-            )
+            name = column_names[0]
+            s = Series._from_column(ca._data[name], name=name, index=index)
             return s._getitem_preprocessed(row_spec)
         if column_names != list(self._frame._column_names):
             frame = self._frame._from_data(
@@ -7770,8 +7763,8 @@ def interleave_columns(self):
                 "interleave_columns does not support 'category' dtype."
             )
 
-        return self._constructor_sliced._from_data(
-            {None: libcudf.reshape.interleave_columns([*self._columns])}
+        return self._constructor_sliced._from_column(
+            libcudf.reshape.interleave_columns([*self._columns])
         )
 
     @_performance_tracking
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index aadbd80f4b4..48445f018d3 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -611,9 +611,7 @@ def from_masked_array(cls, data, mask, null_count=None):
         4      14
         dtype: int64
         """
-        col = as_column(data).set_mask(mask)
-        ca = ColumnAccessor({None: col}, verify=False)
-        return cls._from_data(ca)
+        return cls._from_column(as_column(data).set_mask(mask))
 
     @_performance_tracking
     def __init__(
@@ -1150,7 +1148,7 @@ def reset_index(
             if name is no_default:
                 name = 0 if self.name is None else self.name
             data[name] = data.pop(self.name)
-            return cudf.core.dataframe.DataFrame._from_data(data, index)
+            return self._constructor_expanddim._from_data(data, index)
         # For ``name`` behavior, see:
         # https://github.com/pandas-dev/pandas/issues/44575
         # ``name`` has to be ignored when `drop=True`
@@ -1661,9 +1659,7 @@ def _concat(cls, objs, axis=0, index: bool = True):
         if len(objs):
             col = col._with_type_metadata(objs[0].dtype)
 
-        return cls._from_data(
-            ColumnAccessor({name: col}, verify=False), index=result_index
-        )
+        return cls._from_column(col, name=name, index=result_index)
 
     @property  # type: ignore
     @_performance_tracking
@@ -1977,7 +1973,9 @@ def between(self, left, right, inclusive="both") -> Series:
                 "Inclusive has to be either string of 'both', "
                 "'left', 'right', or 'neither'."
             )
-        return self._from_data({self.name: lmask & rmask}, self.index)
+        return self._from_column(
+            lmask & rmask, name=self.name, index=self.index
+        )
 
     @_performance_tracking
     def all(self, axis=0, bool_only=None, skipna=True, **kwargs):
diff --git a/python/cudf/cudf/io/text.py b/python/cudf/cudf/io/text.py
index 0043efce1e4..5ce738cae0e 100644
--- a/python/cudf/cudf/io/text.py
+++ b/python/cudf/cudf/io/text.py
@@ -33,7 +33,7 @@ def read_text(
         filepath_or_buffer, "read_text"
     )
 
-    return cudf.Series._from_data(
+    return cudf.Series._from_column(
         libtext.read_text(
             filepath_or_buffer,
             delimiter=delimiter,

From a83ac6f27254b2ebf99397d81b776c74f93469bf Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 3 Sep 2024 10:07:49 -1000
Subject: [PATCH 767/842] Add return type annotations to MultiIndex (#16696)

Mostly just return type annotations. No logic changes.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Matthew Murray (https://github.com/Matt711)

URL: https://github.com/rapidsai/cudf/pull/16696
---
 docs/cudf/source/conf.py            |   2 +
 python/cudf/cudf/core/multiindex.py | 109 ++++++++++++++++------------
 2 files changed, 63 insertions(+), 48 deletions(-)

diff --git a/docs/cudf/source/conf.py b/docs/cudf/source/conf.py
index c58bc42327c..95813907bf4 100644
--- a/docs/cudf/source/conf.py
+++ b/docs/cudf/source/conf.py
@@ -566,6 +566,8 @@ def on_missing_reference(app, env, node, contnode):
     ("py:obj", "cudf.Index.to_flat_index"),
     ("py:obj", "cudf.MultiIndex.to_flat_index"),
     ("py:meth", "pyarrow.Table.to_pandas"),
+    ("py:class", "pd.DataFrame"),
+    ("py:class", "pandas.core.indexes.frozen.FrozenList"),
     ("py:class", "pa.Array"),
     ("py:class", "ScalarLike"),
     ("py:class", "ParentType"),
diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
index a66e2936e3b..e00890ac5c3 100644
--- a/python/cudf/cudf/core/multiindex.py
+++ b/python/cudf/cudf/core/multiindex.py
@@ -247,7 +247,7 @@ def to_series(self, index=None, name=None):
         )
 
     @_performance_tracking
-    def astype(self, dtype, copy: bool = True):
+    def astype(self, dtype, copy: bool = True) -> Self:
         if not is_object_dtype(dtype):
             raise TypeError(
                 "Setting a MultiIndex dtype to anything other than object is "
@@ -256,7 +256,7 @@ def astype(self, dtype, copy: bool = True):
         return self
 
     @_performance_tracking
-    def rename(self, names, inplace=False):
+    def rename(self, names, inplace: bool = False) -> Self | None:
         """
         Alter MultiIndex level names
 
@@ -303,7 +303,9 @@ def rename(self, names, inplace=False):
         return self.set_names(names, level=None, inplace=inplace)
 
     @_performance_tracking
-    def set_names(self, names, level=None, inplace=False):
+    def set_names(
+        self, names, level=None, inplace: bool = False
+    ) -> Self | None:
         names_is_list_like = is_list_like(names)
         level_is_list_like = is_list_like(level)
 
@@ -345,7 +347,7 @@ def _from_data(
         cls,
         data: MutableMapping,
         name: Any = None,
-    ) -> MultiIndex:
+    ) -> Self:
         """
         Use when you have a ColumnAccessor-like mapping but no codes and levels.
         """
@@ -394,7 +396,7 @@ def copy(
         names=None,
         deep=False,
         name=None,
-    ):
+    ) -> Self:
         """Returns copy of MultiIndex object.
 
         Returns a copy of `MultiIndex`. The `levels` and `codes` value can be
@@ -457,7 +459,7 @@ def copy(
         )
 
     @_performance_tracking
-    def __repr__(self):
+    def __repr__(self) -> str:
         max_seq_items = pd.get_option("display.max_seq_items") or len(self)
 
         if len(self) > max_seq_items:
@@ -503,7 +505,7 @@ def __repr__(self):
     @property  # type: ignore
     @_external_only_api("Use ._codes instead")
     @_performance_tracking
-    def codes(self):
+    def codes(self) -> pd.core.indexes.frozen.FrozenList:
         """
         Returns the codes of the underlying MultiIndex.
 
@@ -531,7 +533,7 @@ def get_slice_bound(self, label, side):
 
     @property  # type: ignore
     @_performance_tracking
-    def nlevels(self):
+    def nlevels(self) -> int:
         """Integer number of levels in this MultiIndex."""
         return self._num_columns
 
@@ -590,7 +592,7 @@ def _get_level_label(self, level):
             return self.names[level]
 
     @_performance_tracking
-    def isin(self, values, level=None):
+    def isin(self, values, level=None) -> cp.ndarray:
         """Return a boolean array where the index values are in values.
 
         Compute boolean array of whether each index value is found in
@@ -864,7 +866,7 @@ def _validate_indexer(
         | slice
         | tuple[Any, ...]
         | list[tuple[Any, ...]],
-    ):
+    ) -> None:
         if isinstance(indexer, numbers.Number):
             return
         if isinstance(indexer, tuple):
@@ -900,12 +902,12 @@ def __eq__(self, other):
 
     @property  # type: ignore
     @_performance_tracking
-    def size(self):
+    def size(self) -> int:
         # The size of a MultiIndex is only dependent on the number of rows.
         return self._num_rows
 
     @_performance_tracking
-    def take(self, indices):
+    def take(self, indices) -> Self:
         if isinstance(indices, cudf.Series) and indices.has_nulls:
             raise ValueError("Column must have no nulls.")
         obj = super().take(indices)
@@ -957,7 +959,12 @@ def __getitem__(self, index):
             return result
 
     @_performance_tracking
-    def to_frame(self, index=True, name=no_default, allow_duplicates=False):
+    def to_frame(
+        self,
+        index: bool = True,
+        name=no_default,
+        allow_duplicates: bool = False,
+    ) -> cudf.DataFrame:
         """
         Create a DataFrame with the levels of the MultiIndex as columns.
 
@@ -1034,7 +1041,7 @@ def to_frame(self, index=True, name=no_default, allow_duplicates=False):
         )
 
     @_performance_tracking
-    def get_level_values(self, level):
+    def get_level_values(self, level) -> cudf.Index:
         """
         Return the values at the requested level
 
@@ -1067,30 +1074,30 @@ def get_level_values(self, level):
         )
         return level_values
 
-    def _is_numeric(self):
+    def _is_numeric(self) -> bool:
         return False
 
-    def _is_boolean(self):
+    def _is_boolean(self) -> bool:
         return False
 
-    def _is_integer(self):
+    def _is_integer(self) -> bool:
         return False
 
-    def _is_floating(self):
+    def _is_floating(self) -> bool:
         return False
 
-    def _is_object(self):
+    def _is_object(self) -> bool:
         return False
 
-    def _is_categorical(self):
+    def _is_categorical(self) -> bool:
         return False
 
-    def _is_interval(self):
+    def _is_interval(self) -> bool:
         return False
 
     @classmethod
     @_performance_tracking
-    def _concat(cls, objs):
+    def _concat(cls, objs) -> Self:
         source_data = [o.to_frame(index=False) for o in objs]
 
         # TODO: Verify if this is really necessary or if we can rely on
@@ -1100,17 +1107,19 @@ def _concat(cls, objs):
             for obj in source_data[1:]:
                 obj.columns = colnames
 
-        source_data = cudf.DataFrame._concat(source_data)
+        source_df = cudf.DataFrame._concat(source_data)
         try:
             # Only set names if all objs have the same names
             (names,) = {o.names for o in objs} - {None}
         except ValueError:
-            names = [None] * source_data._num_columns
-        return cudf.MultiIndex.from_frame(source_data, names=names)
+            names = [None] * source_df._num_columns
+        return cudf.MultiIndex.from_frame(source_df, names=names)
 
     @classmethod
     @_performance_tracking
-    def from_tuples(cls, tuples, sortorder: int | None = None, names=None):
+    def from_tuples(
+        cls, tuples, sortorder: int | None = None, names=None
+    ) -> Self:
         """
         Convert list of tuples to MultiIndex.
 
@@ -1153,7 +1162,7 @@ def from_tuples(cls, tuples, sortorder: int | None = None, names=None):
         return cls.from_pandas(pdi)
 
     @_performance_tracking
-    def to_numpy(self):
+    def to_numpy(self) -> np.ndarray:
         return self.values_host
 
     def to_flat_index(self):
@@ -1167,7 +1176,7 @@ def to_flat_index(self):
 
     @property  # type: ignore
     @_performance_tracking
-    def values_host(self):
+    def values_host(self) -> np.ndarray:
         """
         Return a numpy representation of the MultiIndex.
 
@@ -1195,7 +1204,7 @@ def values_host(self):
 
     @property  # type: ignore
     @_performance_tracking
-    def values(self):
+    def values(self) -> cp.ndarray:
         """
         Return a CuPy representation of the MultiIndex.
 
@@ -1236,7 +1245,7 @@ def from_frame(
         df: pd.DataFrame | cudf.DataFrame,
         sortorder: int | None = None,
         names=None,
-    ):
+    ) -> Self:
         """
         Make a MultiIndex from a DataFrame.
 
@@ -1303,7 +1312,9 @@ def from_frame(
 
     @classmethod
     @_performance_tracking
-    def from_product(cls, iterables, sortorder: int | None = None, names=None):
+    def from_product(
+        cls, iterables, sortorder: int | None = None, names=None
+    ) -> Self:
         """
         Make a MultiIndex from the cartesian product of multiple iterables.
 
@@ -1355,7 +1366,7 @@ def from_arrays(
         arrays,
         sortorder=None,
         names=None,
-    ) -> MultiIndex:
+    ) -> Self:
         """
         Convert arrays to MultiIndex.
 
@@ -1410,7 +1421,7 @@ def from_arrays(
         )
 
     @_performance_tracking
-    def _poplevels(self, level):
+    def _poplevels(self, level) -> None | MultiIndex | cudf.Index:
         """
         Remove and return the specified levels from self.
 
@@ -1461,7 +1472,7 @@ def _poplevels(self, level):
         return popped
 
     @_performance_tracking
-    def swaplevel(self, i=-2, j=-1):
+    def swaplevel(self, i=-2, j=-1) -> Self:
         """
         Swap level i with level j.
         Calling this method does not change the ordering of the values.
@@ -1512,7 +1523,7 @@ def swaplevel(self, i=-2, j=-1):
         return midx
 
     @_performance_tracking
-    def droplevel(self, level=-1):
+    def droplevel(self, level=-1) -> MultiIndex | cudf.Index:
         """
         Removes the specified levels from the MultiIndex.
 
@@ -1598,7 +1609,9 @@ def to_pandas(
 
     @classmethod
     @_performance_tracking
-    def from_pandas(cls, multiindex: pd.MultiIndex, nan_as_null=no_default):
+    def from_pandas(
+        cls, multiindex: pd.MultiIndex, nan_as_null=no_default
+    ) -> Self:
         """
         Convert from a Pandas MultiIndex
 
@@ -1633,11 +1646,11 @@ def from_pandas(cls, multiindex: pd.MultiIndex, nan_as_null=no_default):
 
     @cached_property  # type: ignore
     @_performance_tracking
-    def is_unique(self):
+    def is_unique(self) -> bool:
         return len(self) == len(self.unique())
 
     @property
-    def dtype(self):
+    def dtype(self) -> np.dtype:
         return np.dtype("O")
 
     @_performance_tracking
@@ -1706,7 +1719,7 @@ def is_monotonic_decreasing(self) -> bool:
         )
 
     @_performance_tracking
-    def fillna(self, value):
+    def fillna(self, value) -> Self:
         """
         Fill null values with the specified value.
 
@@ -1758,7 +1771,7 @@ def nunique(self, dropna: bool = True) -> int:
         mi = self.dropna(how="all") if dropna else self
         return len(mi.unique())
 
-    def _clean_nulls_from_index(self):
+    def _clean_nulls_from_index(self) -> Self:
         """
         Convert all na values(if any) in MultiIndex object
         to `<NA>` as a preprocessing step to `__repr__` methods.
@@ -1769,20 +1782,20 @@ def _clean_nulls_from_index(self):
         )
 
     @_performance_tracking
-    def memory_usage(self, deep=False):
+    def memory_usage(self, deep: bool = False) -> int:
         usage = sum(col.memory_usage for col in self._columns)
         usage += sum(level.memory_usage(deep=deep) for level in self._levels)
         usage += sum(code.memory_usage for code in self._codes)
         return usage
 
     @_performance_tracking
-    def difference(self, other, sort=None):
+    def difference(self, other, sort=None) -> Self:
         if hasattr(other, "to_pandas"):
             other = other.to_pandas()
         return cudf.from_pandas(self.to_pandas().difference(other, sort))
 
     @_performance_tracking
-    def append(self, other):
+    def append(self, other) -> Self:
         """
         Append a collection of MultiIndex objects together
 
@@ -2000,7 +2013,7 @@ def get_loc(self, key):
         mask[true_inds] = True
         return mask
 
-    def _get_reconciled_name_object(self, other) -> MultiIndex:
+    def _get_reconciled_name_object(self, other) -> Self:
         """
         If the result of a set operation will be self,
         return self, unless the names change, in which
@@ -2026,7 +2039,7 @@ def _maybe_match_names(self, other):
         ]
 
     @_performance_tracking
-    def union(self, other, sort=None):
+    def union(self, other, sort=None) -> Self:
         if not isinstance(other, MultiIndex):
             msg = "other must be a MultiIndex or a list of tuples"
             try:
@@ -2050,7 +2063,7 @@ def union(self, other, sort=None):
         return self._union(other, sort=sort)
 
     @_performance_tracking
-    def _union(self, other, sort=None):
+    def _union(self, other, sort=None) -> Self:
         # TODO: When to_frame is refactored to return a
         # deep copy in future, we should push most of the common
         # logic between MultiIndex._union & BaseIndex._union into
@@ -2076,7 +2089,7 @@ def _union(self, other, sort=None):
         return midx
 
     @_performance_tracking
-    def _intersection(self, other, sort=None):
+    def _intersection(self, other, sort=None) -> Self:
         if self.names != other.names:
             deep = True
             col_names = list(range(0, self.nlevels))
@@ -2167,7 +2180,7 @@ def _columns_for_reset_index(
         else:
             yield from self._split_columns_by_levels(levels, in_levels=True)
 
-    def repeat(self, repeats, axis=None):
+    def repeat(self, repeats, axis=None) -> Self:
         return self._from_data(
             self._data._from_columns_like_self(
                 super()._repeat([*self._columns], repeats, axis)

From fa1486e1d1d09116d2b5f57dfef7d9307ebc76c6 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 3 Sep 2024 16:31:30 -0400
Subject: [PATCH 768/842] Remove ERROR_TEST gtest from libcudf (#16722)

Removes the `ERROR_TEST` gtest from libcudf. This test was only verifying some macros on mostly CUDA behavior and not libcudf specific functions. The tests have become troublesome to support in CI especially in conjunction with other tools like `compute-sanitizer`.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Nghia Truong (https://github.com/ttnghia)
  - Jayjeet Chakraborty (https://github.com/JayjeetAtGithub)

URL: https://github.com/rapidsai/cudf/pull/16722
---
 cpp/tests/CMakeLists.txt               |   4 -
 cpp/tests/error/error_handling_test.cu | 136 -------------------------
 2 files changed, 140 deletions(-)
 delete mode 100644 cpp/tests/error/error_handling_test.cu

diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index f86acbcc51b..1bedb344a01 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -110,10 +110,6 @@ ConfigureTest(SCALAR_TEST scalar/scalar_test.cpp scalar/scalar_device_view_test.
 # * timestamps tests ------------------------------------------------------------------------------
 ConfigureTest(TIMESTAMPS_TEST wrappers/timestamps_test.cu)
 
-# ##################################################################################################
-# * cudf tests ------------------------------------------------------------------------------------
-ConfigureTest(ERROR_TEST error/error_handling_test.cu)
-
 # ##################################################################################################
 # * groupby tests ---------------------------------------------------------------------------------
 ConfigureTest(
diff --git a/cpp/tests/error/error_handling_test.cu b/cpp/tests/error/error_handling_test.cu
deleted file mode 100644
index 9c7459fa69d..00000000000
--- a/cpp/tests/error/error_handling_test.cu
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Copyright (c) 2018-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cudf_test/base_fixture.hpp>
-#include <cudf_test/default_stream.hpp>
-#include <cudf_test/stream_checking_resource_adaptor.hpp>
-#include <cudf_test/testing_main.hpp>
-
-#include <cudf/filling.hpp>
-#include <cudf/utilities/default_stream.hpp>
-#include <cudf/utilities/error.hpp>
-
-#include <rmm/cuda_stream.hpp>
-
-TEST(ExpectsTest, FalseCondition)
-{
-  EXPECT_THROW(CUDF_EXPECTS(false, "condition is false"), cudf::logic_error);
-}
-
-TEST(ExpectsTest, TrueCondition) { EXPECT_NO_THROW(CUDF_EXPECTS(true, "condition is true")); }
-
-TEST(CudaTryTest, Error) { EXPECT_THROW(CUDF_CUDA_TRY(cudaErrorLaunchFailure), cudf::cuda_error); }
-
-TEST(CudaTryTest, Success) { EXPECT_NO_THROW(CUDF_CUDA_TRY(cudaSuccess)); }
-
-TEST(StreamCheck, success) { EXPECT_NO_THROW(CUDF_CHECK_CUDA(0)); }
-
-namespace {
-// Some silly kernel that will cause an error
-CUDF_KERNEL void test_kernel(int* data) { data[threadIdx.x] = threadIdx.x; }
-}  // namespace
-
-// In a release build and without explicit synchronization, CUDF_CHECK_CUDA may
-// or may not fail on erroneous asynchronous CUDA calls. Invoke
-// cudaStreamSynchronize to guarantee failure on error. In a non-release build,
-// CUDF_CHECK_CUDA deterministically fails on erroneous asynchronous CUDA
-// calls.
-TEST(StreamCheck, FailedKernel)
-{
-  rmm::cuda_stream stream;
-  int a;
-  test_kernel<<<0, 0, 0, stream.value()>>>(&a);
-#ifdef NDEBUG
-  stream.synchronize();
-#endif
-  EXPECT_THROW(CUDF_CHECK_CUDA(stream.value()), cudf::cuda_error);
-}
-
-TEST(StreamCheck, CatchFailedKernel)
-{
-  rmm::cuda_stream stream;
-  int a;
-  test_kernel<<<0, 0, 0, stream.value()>>>(&a);
-#ifndef NDEBUG
-  stream.synchronize();
-#endif
-  EXPECT_THROW(CUDF_CHECK_CUDA(stream.value()), cudf::cuda_error);
-}
-
-CUDF_KERNEL void kernel() { asm("trap;"); }
-
-TEST(DeathTest, CudaFatalError)
-{
-  testing::FLAGS_gtest_death_test_style = "threadsafe";
-  auto call_kernel                      = []() {
-    kernel<<<1, 1, 0, cudf::get_default_stream().value()>>>();
-    try {
-      CUDF_CUDA_TRY(cudaDeviceSynchronize());
-    } catch (const cudf::fatal_cuda_error& fe) {
-      std::abort();
-    }
-  };
-  ASSERT_DEATH(call_kernel(), "");
-}
-
-#ifndef NDEBUG
-
-CUDF_KERNEL void assert_false_kernel() { cudf_assert(false && "this kernel should die"); }
-
-CUDF_KERNEL void assert_true_kernel() { cudf_assert(true && "this kernel should live"); }
-
-TEST(DebugAssertDeathTest, cudf_assert_false)
-{
-  testing::FLAGS_gtest_death_test_style = "threadsafe";
-
-  auto call_kernel = []() {
-    auto const stream = cudf::get_default_stream().value();
-    assert_false_kernel<<<1, 1, 0, stream>>>();
-
-    // Kernel should fail with `cudaErrorAssert`
-    // This error invalidates the current device context, so we need to kill
-    // the current process. Running with EXPECT_DEATH spawns a new process for
-    // each attempted kernel launch
-    if (cudaErrorAssert == cudaDeviceSynchronize()) { std::abort(); }
-
-    // If we reach this point, the cudf_assert didn't work so we exit normally, which will cause
-    // EXPECT_DEATH to fail.
-  };
-
-  EXPECT_DEATH(call_kernel(), "this kernel should die");
-}
-
-TEST(DebugAssert, cudf_assert_true)
-{
-  auto const stream = cudf::get_default_stream().value();
-  assert_true_kernel<<<1, 1, 0, stream>>>();
-  ASSERT_EQ(cudaSuccess, cudaDeviceSynchronize());
-}
-
-#endif
-
-// These tests don't use CUDF_TEST_PROGRAM_MAIN because :
-// 1.) They don't need the RMM Pool
-// 2.) The RMM Pool interferes with the death test
-int main(int argc, char** argv)
-{
-  if (getenv("LIBCUDF_MEMCHECK_ENABLED")) { return 0; }
-
-  ::testing::InitGoogleTest(&argc, argv);
-  auto const cmd_opts = parse_cudf_test_opts(argc, argv);
-  auto adaptor        = make_stream_mode_adaptor(cmd_opts);
-  return RUN_ALL_TESTS();
-}

From 26091a44b3dbf0f56fc0dfc5f081077f2d00681f Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Wed, 4 Sep 2024 09:10:24 -0400
Subject: [PATCH 769/842] Refactor cudf pandas integration tests CI (#16728)

Following up #16645 with a couple improvements

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16728
---
 ..._library_tests.sh => run-library-tests.sh} | 24 +++++++------------
 .../third-party-integration/test.sh           |  2 +-
 2 files changed, 10 insertions(+), 16 deletions(-)
 rename ci/cudf_pandas_scripts/third-party-integration/{ci_run_library_tests.sh => run-library-tests.sh} (69%)

diff --git a/ci/cudf_pandas_scripts/third-party-integration/ci_run_library_tests.sh b/ci/cudf_pandas_scripts/third-party-integration/run-library-tests.sh
similarity index 69%
rename from ci/cudf_pandas_scripts/third-party-integration/ci_run_library_tests.sh
rename to ci/cudf_pandas_scripts/third-party-integration/run-library-tests.sh
index 54a56508cdc..d44d25d658c 100755
--- a/ci/cudf_pandas_scripts/third-party-integration/ci_run_library_tests.sh
+++ b/ci/cudf_pandas_scripts/third-party-integration/run-library-tests.sh
@@ -9,23 +9,17 @@ cleanup() {
 
 trap cleanup EXIT
 
-runtest_gold() {
+runtest() {
     local lib=$1
+    local mode=$2
 
-    pytest \
-    -v \
-    --continue-on-collection-errors \
-    --cache-clear \
-    --numprocesses=${NUM_PROCESSES} \
-    --dist=worksteal \
-    ${TEST_DIR}/test_${lib}*.py
-}
-
-runtest_cudf_pandas() {
-    local lib=$1
+    local plugin=""
+    if [ "$mode" = "cudf" ]; then
+        plugin="-p cudf.pandas"
+    fi
 
     pytest \
-    -p cudf.pandas \
+    $plugin \
     -v \
     --continue-on-collection-errors \
     --cache-clear \
@@ -38,8 +32,8 @@ main() {
     local lib=$1
 
     # generation phase
-    runtest_gold ${lib}
-    runtest_cudf_pandas ${lib}
+    runtest ${lib} "gold"
+    runtest ${lib} "cudf"
 
     # assertion phase
     pytest \
diff --git a/ci/cudf_pandas_scripts/third-party-integration/test.sh b/ci/cudf_pandas_scripts/third-party-integration/test.sh
index 89b28c30e39..f8ddbaba0f3 100755
--- a/ci/cudf_pandas_scripts/third-party-integration/test.sh
+++ b/ci/cudf_pandas_scripts/third-party-integration/test.sh
@@ -72,7 +72,7 @@ main() {
             fi
         done
 
-        TEST_DIR=${TEST_DIR} NUM_PROCESSES=${NUM_PROCESSES} ci/cudf_pandas_scripts/third-party-integration/ci_run_library_tests.sh ${lib}
+        TEST_DIR=${TEST_DIR} NUM_PROCESSES=${NUM_PROCESSES} ci/cudf_pandas_scripts/third-party-integration/run-library-tests.sh ${lib}
 
         rapids-logger "Test script exiting with value: ${EXITCODE}"
     done

From 28bf38ec82563a770335d8a8de13c9268b9418cd Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Wed, 4 Sep 2024 15:17:43 +0000
Subject: [PATCH 770/842] working conftest

---
 python/cudf/cudf/pandas/fast_slow_proxy.py    | 10 +++
 .../cudf/pandas/scripts/conftest-patch.py     | 67 ++++++++++++++-
 .../cudf/pandas/scripts/run-pandas-tests.sh   |  2 +-
 python/cudf/cudf_pandas_tests/conftest.py     | 85 +++++++++++++++++++
 4 files changed, 162 insertions(+), 2 deletions(-)
 create mode 100644 python/cudf/cudf_pandas_tests/conftest.py

diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py
index 4b0fd9a5b36..4a71a037d63 100644
--- a/python/cudf/cudf/pandas/fast_slow_proxy.py
+++ b/python/cudf/cudf/pandas/fast_slow_proxy.py
@@ -870,6 +870,14 @@ def _assert_fast_slow_eq(left, right):
         assert_eq(left, right)
 
 
+def _fast_function_call():
+    return 1
+
+
+def _slow_function_call():
+    return 1
+
+
 def _fast_slow_function_call(
     func: Callable,
     /,
@@ -899,6 +907,7 @@ def _fast_slow_function_call(
                 # try slow path
                 raise Exception()
             fast = True
+            _fast_function_call()
             if _env_get_bool("CUDF_PANDAS_DEBUGGING", False):
                 try:
                     with nvtx.annotate(
@@ -941,6 +950,7 @@ def _fast_slow_function_call(
                 from ._logger import log_fallback
 
                 log_fallback(slow_args, slow_kwargs, err)
+            _slow_function_call()
             with disable_module_accelerator():
                 result = func(*slow_args, **slow_kwargs)
     return _maybe_wrap_result(result, func, *args, **kwargs), fast
diff --git a/python/cudf/cudf/pandas/scripts/conftest-patch.py b/python/cudf/cudf/pandas/scripts/conftest-patch.py
index 505a40b0bfa..fc02183f352 100644
--- a/python/cudf/cudf/pandas/scripts/conftest-patch.py
+++ b/python/cudf/cudf/pandas/scripts/conftest-patch.py
@@ -1,10 +1,13 @@
-# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
+# SPDX-FileCopyrightText: Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES.
 # All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
 import contextlib
+import json
+import multiprocessing
 import os
 import sys
+from collections import defaultdict
 from functools import wraps
 
 import pytest
@@ -36,4 +39,66 @@ def patch_testing_functions():
     pytest.raises = replace_kwargs({"match": None})(pytest.raises)
 
 
+# # Dictionary to store function call counts
+# manager = multiprocessing.Manager()
+# function_call_counts = manager.dict()
+
+# # The specific function to track
+# FUNCTION_NAME = {'_slow_function_call', '_fast_function_call'}
+
+# def trace_calls(frame, event, arg):
+#     if event != 'call':
+#         return
+#     code = frame.f_code
+#     func_name = code.co_name
+#     if func_name in FUNCTION_NAME:
+#         function_call_counts[func_name] = function_call_counts.get(func_name, 0) + 1
+
+# def pytest_sessionstart(session):
+#     # Set the profile function to trace calls
+#     sys.setprofile(trace_calls)
+
+# def pytest_sessionfinish(session, exitstatus):
+#     # Remove the profile function
+#     sys.setprofile(None)
+
+# @pytest.hookimpl(tryfirst=True)
+# def pytest_runtest_setup(item):
+#     # Check if this is the first test in the file
+#     if item.nodeid.split("::")[0] != getattr(pytest_runtest_setup, "current_file", None):
+#         # If it's a new file, reset the function call counts
+#         global function_call_counts
+#         function_call_counts = manager.dict()
+#         pytest_runtest_setup.current_file = item.nodeid.split("::")[0]
+
+# @pytest.hookimpl(trylast=True)
+# def pytest_runtest_teardown(item, nextitem):
+#     # Check if this is the last test in the file
+#     if nextitem is None or nextitem.nodeid.split("::")[0] != item.nodeid.split("::")[0]:
+#         # Write the function call counts to a file
+#         worker_id = os.getenv('PYTEST_XDIST_WORKER', 'master')
+#         output_file = f'function_call_counts_{os.path.basename(item.nodeid.split("::")[0])}_{worker_id}.txt'
+#         with open(output_file, 'w') as f:
+#             for func, count in function_call_counts.items():
+#                 f.write(f'{func}: {count}\n')
+#         print(f'Function call counts have been written to {output_file}')
+
+# @pytest.hookimpl(tryfirst=True)
+# def pytest_configure(config):
+#     if hasattr(config, 'workerinput'):
+#         # Running in xdist worker
+#         global function_call_counts
+#         function_call_counts = manager.dict()
+
+# @pytest.hookimpl(trylast=True)
+# def pytest_unconfigure(config):
+#     if hasattr(config, 'workerinput'):
+#         # Running in xdist worker
+#         worker_id = config.workerinput['workerid']
+#         output_file = f'function_call_counts_worker_{worker_id}.txt'
+#         with open(output_file, 'w') as f:
+#             for func, count in function_call_counts.items():
+#                 f.write(f'{func}: {count}\n')
+#         print(f'Function call counts have been written to {output_file}')
+
 sys.path.append(os.path.dirname(__file__))
diff --git a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
index 9c65b74d081..4ba5361b983 100755
--- a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
+++ b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
@@ -137,7 +137,7 @@ and not test_eof_states \
 and not test_array_tz"
 
 # TODO: Remove "not db" once a postgres & mysql container is set up on the CI
-PANDAS_CI="1" timeout 60m python -m pytest -p cudf.pandas \
+PANDAS_CI="1" timeout 600m python -m pytest -p cudf.pandas \
     -v -m "not single_cpu and not db" \
     -k "$TEST_THAT_NEED_MOTO_SERVER and $TEST_THAT_CRASH_PYTEST_WORKERS and not test_groupby_raises_category_on_category and not test_constructor_no_pandas_array and not test_is_monotonic_na and not test_index_contains and not test_index_contains and not test_frame_op_subclass_nonclass_constructor and not test_round_trip_current" \
     --import-mode=importlib \
diff --git a/python/cudf/cudf_pandas_tests/conftest.py b/python/cudf/cudf_pandas_tests/conftest.py
new file mode 100644
index 00000000000..1898a785651
--- /dev/null
+++ b/python/cudf/cudf_pandas_tests/conftest.py
@@ -0,0 +1,85 @@
+import json
+import multiprocessing
+import os
+import sys
+from collections import defaultdict
+
+import pytest
+
+# Dictionary to store function call counts
+manager = multiprocessing.Manager()
+function_call_counts = manager.dict()
+
+# The specific function to track
+FUNCTION_NAME = {"_slow_function_call", "_fast_function_call"}
+
+
+def trace_calls(frame, event, arg):
+    if event != "call":
+        return
+    code = frame.f_code
+    func_name = code.co_name
+    if func_name in FUNCTION_NAME:
+        function_call_counts[func_name] = (
+            function_call_counts.get(func_name, 0) + 1
+        )
+
+
+def pytest_sessionstart(session):
+    # Set the profile function to trace calls
+    sys.setprofile(trace_calls)
+
+
+def pytest_sessionfinish(session, exitstatus):
+    # Remove the profile function
+    sys.setprofile(None)
+
+
+@pytest.hookimpl(tryfirst=True)
+def pytest_runtest_setup(item):
+    # Check if this is the first test in the file
+    if item.nodeid.split("::")[0] != getattr(
+        pytest_runtest_setup, "current_file", None
+    ):
+        # If it's a new file, reset the function call counts
+        global function_call_counts
+        function_call_counts = manager.dict()
+        pytest_runtest_setup.current_file = item.nodeid.split("::")[0]
+
+
+@pytest.hookimpl(trylast=True)
+def pytest_runtest_teardown(item, nextitem):
+    # Check if this is the last test in the file
+    if (
+        nextitem is None
+        or nextitem.nodeid.split("::")[0] != item.nodeid.split("::")[0]
+    ):
+        # Write the function call counts to a file
+        worker_id = os.getenv("PYTEST_XDIST_WORKER", "master")
+        output_file = f'function_call_counts_{os.path.basename(item.nodeid.split("::")[0])}_{worker_id}.json'
+        with open(output_file, "w") as f:
+            json.dump(dict(function_call_counts), f)
+            # for func, count in function_call_counts.items():
+            #     f.write(f'{func}: {count}\n')
+        print(f"Function call counts have been written to {output_file}")
+
+
+@pytest.hookimpl(tryfirst=True)
+def pytest_configure(config):
+    if hasattr(config, "workerinput"):
+        # Running in xdist worker
+        global function_call_counts
+        function_call_counts = manager.dict()
+
+
+@pytest.hookimpl(trylast=True)
+def pytest_unconfigure(config):
+    if hasattr(config, "workerinput"):
+        # Running in xdist worker
+        worker_id = config.workerinput["workerid"]
+        output_file = f"function_call_counts_worker_{worker_id}.json"
+        with open(output_file, "w") as f:
+            json.dump(dict(function_call_counts), f)
+            # for func, count in function_call_counts.items():
+            #     f.write(f'{func}: {count}\n')
+        print(f"Function call counts have been written to {output_file}")

From 1b6f02d536d253465d2c601f222fb0acede8a942 Mon Sep 17 00:00:00 2001
From: "Richard (Rick) Zamora" <rzamora217@gmail.com>
Date: Wed, 4 Sep 2024 12:02:40 -0500
Subject: [PATCH 771/842] Multi-file and Parquet-aware prefetching from remote
 storage (#16657)

Follow up to https://github.com/rapidsai/cudf/pull/16613
Supersedes https://github.com/rapidsai/cudf/pull/16166

Improves remote-IO read performance when multiple files are read at once. Also enables partial IO for remote Parquet files (previously removed in `24.10` by https://github.com/rapidsai/cudf/pull/16589).

Authors:
  - Richard (Rick) Zamora (https://github.com/rjzamora)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/16657
---
 python/cudf/cudf/io/parquet.py    |  40 +++++++++
 python/cudf/cudf/tests/test_s3.py |  47 ++++++++++
 python/cudf/cudf/utils/ioutils.py | 141 ++++++++++++++++++++++++++----
 3 files changed, 212 insertions(+), 16 deletions(-)

diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
index 526f12aa94e..62be7378e9e 100644
--- a/python/cudf/cudf/io/parquet.py
+++ b/python/cudf/cudf/io/parquet.py
@@ -577,11 +577,51 @@ def read_parquet(
         )
     filepath_or_buffer = paths if paths else filepath_or_buffer
 
+    # Prepare remote-IO options
+    prefetch_options = kwargs.pop("prefetch_options", {})
+    if not ioutils._is_local_filesystem(fs):
+        # The default prefetch method depends on the
+        # `row_groups` argument. In most cases we will use
+        # method="all" by default, because it is fastest
+        # when we need to read most of the file(s).
+        # If a (simple) `row_groups` selection is made, we
+        # use method="parquet" to avoid transferring the
+        # entire file over the network
+        method = prefetch_options.get("method")
+        _row_groups = None
+        if method in (None, "parquet"):
+            if row_groups is None:
+                # If the user didn't specify a method, don't use
+                # 'parquet' prefetcher for column projection alone.
+                method = method or "all"
+            elif all(r == row_groups[0] for r in row_groups):
+                # Row group selection means we are probably
+                # reading half the file or less. We should
+                # avoid a full file transfer by default.
+                method = "parquet"
+                _row_groups = row_groups[0]
+            elif (method := method or "all") == "parquet":
+                raise ValueError(
+                    "The 'parquet' prefetcher requires a uniform "
+                    "row-group selection for all paths within the "
+                    "same `read_parquet` call. "
+                    "Got: {row_groups}"
+                )
+        if method == "parquet":
+            prefetch_options = prefetch_options.update(
+                {
+                    "method": method,
+                    "columns": columns,
+                    "row_groups": _row_groups,
+                }
+            )
+
     filepaths_or_buffers = ioutils.get_reader_filepath_or_buffer(
         path_or_data=filepath_or_buffer,
         fs=fs,
         storage_options=storage_options,
         bytes_per_thread=bytes_per_thread,
+        prefetch_options=prefetch_options,
     )
 
     # Warn user if they are not using cudf for IO
diff --git a/python/cudf/cudf/tests/test_s3.py b/python/cudf/cudf/tests/test_s3.py
index 3b23a53091e..0958b68084d 100644
--- a/python/cudf/cudf/tests/test_s3.py
+++ b/python/cudf/cudf/tests/test_s3.py
@@ -229,6 +229,53 @@ def test_read_parquet(
     assert_eq(expect, got2)
 
 
+@pytest.mark.parametrize("method", ["all", "parquet"])
+@pytest.mark.parametrize("blocksize", [1024 * 1024, 1024])
+def test_read_parquet_prefetch_options(
+    s3_base,
+    s3so,
+    pdf,
+    method,
+    blocksize,
+):
+    bucket = "parquet"
+    fname_1 = "test_parquet_reader_prefetch_options_1.parquet"
+    buffer_1 = BytesIO()
+    pdf.to_parquet(path=buffer_1)
+    buffer_1.seek(0)
+
+    fname_2 = "test_parquet_reader_prefetch_options_2.parquet"
+    buffer_2 = BytesIO()
+    pdf_2 = pdf.copy()
+    pdf_2["Integer"] += 1
+    pdf_2.to_parquet(path=buffer_2)
+    buffer_2.seek(0)
+
+    with s3_context(
+        s3_base=s3_base,
+        bucket=bucket,
+        files={
+            fname_1: buffer_1,
+            fname_2: buffer_2,
+        },
+    ):
+        got = cudf.read_parquet(
+            [
+                f"s3://{bucket}/{fname_1}",
+                f"s3://{bucket}/{fname_2}",
+            ],
+            storage_options=s3so,
+            prefetch_options={
+                "method": method,
+                "blocksize": blocksize,
+            },
+            columns=["String", "Integer"],
+        )
+
+    expect = pd.concat([pdf, pdf_2], ignore_index=True)[["String", "Integer"]]
+    assert_eq(expect, got)
+
+
 @pytest.mark.parametrize("bytes_per_thread", [32, 1024])
 @pytest.mark.parametrize("columns", [None, ["List", "Struct"]])
 @pytest.mark.parametrize("index", [None, "Integer"])
diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
index 6b146be0fa3..1627107b57d 100644
--- a/python/cudf/cudf/utils/ioutils.py
+++ b/python/cudf/cudf/utils/ioutils.py
@@ -1,6 +1,8 @@
 # Copyright (c) 2019-2024, NVIDIA CORPORATION.
 
 import datetime
+import functools
+import operator
 import os
 import urllib
 import warnings
@@ -18,6 +20,12 @@
 from cudf.core._compat import PANDAS_LT_300
 from cudf.utils.docutils import docfmt_partial
 
+try:
+    import fsspec.parquet as fsspec_parquet
+
+except ImportError:
+    fsspec_parquet = None
+
 _BYTES_PER_THREAD_DEFAULT = 256 * 1024 * 1024
 _ROW_GROUP_SIZE_BYTES_DEFAULT = 128 * 1024 * 1024
 
@@ -187,6 +195,11 @@
 allow_mismatched_pq_schemas : boolean, default False
     If True, enables reading (matching) columns specified in `columns` and `filters`
     options from the input files with otherwise mismatched schemas.
+prefetch_options : dict, default None
+    WARNING: This is an experimental feature and may be removed at any
+    time without warning or deprecation period.
+    Dictionary of options to use to prefetch bytes from remote storage.
+    These options are passed through to `get_reader_filepath_or_buffer`.
 
 Returns
 -------
@@ -1439,6 +1452,14 @@
     Glob pattern to use when expanding directories into file paths
     (e.g. "*.json"). If this parameter is not specified, directories
     will not be expanded.
+prefetch_options : dict, default None
+    WARNING: This is an experimental feature and may be removed at any
+    time without warning or deprecation period.
+    Dictionary of options to use to prefetch bytes from remote storage.
+    These options are only used when `path_or_data` is a list of remote
+    paths. If 'method' is set to 'all' (the default), the only supported
+    option is 'blocksize' (default 256 MB). If method is set to 'parquet',
+    'columns' and 'row_groups' are also supported (default None).
 
 Returns
 -------
@@ -1620,6 +1641,7 @@ def get_reader_filepath_or_buffer(
     warn_on_raw_text_input=None,
     warn_meta=None,
     expand_dir_pattern=None,
+    prefetch_options=None,
 ):
     """{docstring}"""
 
@@ -1690,26 +1712,15 @@ def get_reader_filepath_or_buffer(
                 raw_text_input = True
 
         elif fs is not None:
-            # TODO: We can use cat_ranges and/or parquet-aware logic
-            # to copy all remote data into host memory at once here.
-            # The current solution iterates over files, and copies
-            # ALL data from each file (even when we are performing
-            # partial IO, and don't need the entire file)
             if len(paths) == 0:
                 raise FileNotFoundError(
                     f"{input_sources} could not be resolved to any files"
                 )
-            filepaths_or_buffers = [
-                BytesIO(
-                    _fsspec_data_transfer(
-                        fpath,
-                        fs=fs,
-                        mode=mode,
-                        bytes_per_thread=bytes_per_thread,
-                    )
-                )
-                for fpath in paths
-            ]
+            filepaths_or_buffers = _prefetch_remote_buffers(
+                paths,
+                fs,
+                **(prefetch_options or {}),
+            )
         else:
             raw_text_input = True
 
@@ -2099,3 +2110,101 @@ def _read_byte_ranges(
 
     for worker in workers:
         worker.join()
+
+
+def _get_remote_bytes_all(
+    remote_paths, fs, *, blocksize=_BYTES_PER_THREAD_DEFAULT
+):
+    # TODO: Experiment with a heuristic to avoid the fs.sizes
+    # call when we are reading many files at once (the latency
+    # of collecting the file sizes is unnecessary in this case)
+    if max(sizes := fs.sizes(remote_paths)) <= blocksize:
+        # Don't bother breaking up individual files
+        return fs.cat_ranges(remote_paths, None, None)
+    else:
+        # Construct list of paths, starts, and ends
+        paths, starts, ends = map(
+            list,
+            zip(
+                *(
+                    (r, j, min(j + blocksize, s))
+                    for r, s in zip(remote_paths, sizes)
+                    for j in range(0, s, blocksize)
+                )
+            ),
+        )
+
+        # Collect the byte ranges
+        chunks = fs.cat_ranges(paths, starts, ends)
+
+        # Construct local byte buffers
+        # (Need to make sure path offsets are ordered correctly)
+        unique_count = dict(zip(*np.unique(paths, return_counts=True)))
+        offset = np.cumsum([0] + [unique_count[p] for p in remote_paths])
+        buffers = [
+            functools.reduce(operator.add, chunks[offset[i] : offset[i + 1]])
+            for i in range(len(remote_paths))
+        ]
+        return buffers
+
+
+def _get_remote_bytes_parquet(
+    remote_paths,
+    fs,
+    *,
+    columns=None,
+    row_groups=None,
+    blocksize=_BYTES_PER_THREAD_DEFAULT,
+):
+    if fsspec_parquet is None or (columns is None and row_groups is None):
+        return _get_remote_bytes_all(remote_paths, fs, blocksize=blocksize)
+
+    sizes = fs.sizes(remote_paths)
+    data = fsspec_parquet._get_parquet_byte_ranges(
+        remote_paths,
+        fs,
+        columns=columns,
+        row_groups=row_groups,
+        max_block=blocksize,
+    )
+
+    buffers = []
+    for size, path in zip(sizes, remote_paths):
+        path_data = data[path]
+        buf = np.empty(size, dtype="b")
+        for range_offset in path_data.keys():
+            chunk = path_data[range_offset]
+            buf[range_offset[0] : range_offset[1]] = np.frombuffer(
+                chunk, dtype="b"
+            )
+        buffers.append(buf.tobytes())
+    return buffers
+
+
+def _prefetch_remote_buffers(
+    paths,
+    fs,
+    *,
+    method="all",
+    **prefetch_options,
+):
+    # Gather bytes ahead of time for remote filesystems
+    if fs and paths and not _is_local_filesystem(fs):
+        try:
+            prefetcher = {
+                "parquet": _get_remote_bytes_parquet,
+                "all": _get_remote_bytes_all,
+            }[method]
+        except KeyError:
+            raise ValueError(
+                f"{method} is not a supported remote-data prefetcher."
+                " Expected 'parquet' or 'all'."
+            )
+        return prefetcher(
+            paths,
+            fs,
+            **prefetch_options,
+        )
+
+    else:
+        return paths

From ad1369d2d6eabf4b0ae480a10463a74f3034aece Mon Sep 17 00:00:00 2001
From: Sebastian Berg <sebastianb@nvidia.com>
Date: Thu, 5 Sep 2024 01:11:07 +0200
Subject: [PATCH 772/842] CI: Test against old versions of key dependencies
 (#16570)

This adds explicit tests with old versions of key dependencies. Specifically:
- `numba==0.57`
- `numpy==1.23`
- `pandas==2.0`
- ~`fsspec==0.6.0`~ excluded it. `transformers==4.39.3` requires `huggingface_hub` which requires `fsspec>=2023.5.0`.  In principle one could include it e.g. only for conda which doesn't pull in `transformers`, but that seemed not worth the trouble?
- `cupy==12.0.0`
- `pyarrow==16.1.0`

See also https://github.com/rapidsai/build-planning/issues/81

(Marking as draft until I see that things work.)

Authors:
  - Sebastian Berg (https://github.com/seberg)
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Bradley Dice (https://github.com/bdice)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Charles Blackmon-Luca (https://github.com/charlesbluca)

URL: https://github.com/rapidsai/cudf/pull/16570
---
 ci/cudf_pandas_scripts/run_tests.sh           |  13 +-
 ci/test_python_common.sh                      |   3 +-
 ci/test_wheel_cudf.sh                         |  14 ++
 ci/test_wheel_cudf_polars.sh                  |  11 ++
 ci/test_wheel_dask_cudf.sh                    |  13 ++
 dependencies.yaml                             |  22 +++
 .../cudf/cudf/tests/indexes/test_interval.py  |   4 +
 .../test_avro_reader_fastavro_integration.py  |   5 +
 python/cudf/cudf/tests/test_binops.py         |  41 +++++-
 python/cudf/cudf/tests/test_categorical.py    |   5 +
 python/cudf/cudf/tests/test_concat.py         |  99 ++++++++-----
 python/cudf/cudf/tests/test_csv.py            |  12 +-
 python/cudf/cudf/tests/test_dataframe.py      |  19 ++-
 python/cudf/cudf/tests/test_datetime.py       |  35 ++++-
 python/cudf/cudf/tests/test_doctests.py       |   5 +
 python/cudf/cudf/tests/test_groupby.py        | 112 +++++++++++++++
 python/cudf/cudf/tests/test_index.py          |  37 ++++-
 python/cudf/cudf/tests/test_indexing.py       |   8 ++
 python/cudf/cudf/tests/test_interpolate.py    |   4 +
 python/cudf/cudf/tests/test_interval.py       |   5 +
 python/cudf/cudf/tests/test_join_order.py     | 130 +++++++++++++++++-
 python/cudf/cudf/tests/test_mvc.py            |   8 +-
 python/cudf/cudf/tests/test_numerical.py      |   3 +-
 python/cudf/cudf/tests/test_orc.py            |   8 +-
 python/cudf/cudf/tests/test_parquet.py        |   5 +
 python/cudf/cudf/tests/test_reductions.py     |   5 +
 python/cudf/cudf/tests/test_replace.py        |  20 ++-
 python/cudf/cudf/tests/test_resampling.py     |   9 ++
 python/cudf/cudf/tests/test_reshape.py        |  17 ++-
 python/cudf/cudf/tests/test_stats.py          |   8 ++
 .../cudf_pandas_tests/test_cudf_pandas.py     |  12 +-
 .../dask_cudf/tests/test_applymap.py          |   6 +
 .../dask_cudf/tests/test_distributed.py       |   5 +
 .../dask_cudf/dask_cudf/tests/test_groupby.py |   5 +
 34 files changed, 638 insertions(+), 70 deletions(-)

diff --git a/ci/cudf_pandas_scripts/run_tests.sh b/ci/cudf_pandas_scripts/run_tests.sh
index 8b85695c861..1c2724a9a5d 100755
--- a/ci/cudf_pandas_scripts/run_tests.sh
+++ b/ci/cudf_pandas_scripts/run_tests.sh
@@ -54,8 +54,19 @@ else
     RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./dist
     RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist
 
-    # echo to expand wildcard before adding `[extra]` requires for pip
+    echo "" > ./constraints.txt
+    if [[ $RAPIDS_DEPENDENCIES == "oldest" ]]; then
+        # `test_python` constraints are for `[test]` not `[cudf-pandas-tests]`
+        rapids-dependency-file-generator \
+            --output requirements \
+            --file-key test_python \
+            --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \
+        | tee ./constraints.txt
+    fi
+
     python -m pip install \
+        -v \
+        --constraint ./constraints.txt \
         "$(echo ./dist/cudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test,cudf-pandas-tests]" \
         "$(echo ./dist/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \
         "$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)"
diff --git a/ci/test_python_common.sh b/ci/test_python_common.sh
index e8849588aa5..d0675b0431a 100755
--- a/ci/test_python_common.sh
+++ b/ci/test_python_common.sh
@@ -14,7 +14,8 @@ ENV_YAML_DIR="$(mktemp -d)"
 rapids-dependency-file-generator \
   --output conda \
   --file-key test_python \
-  --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee "${ENV_YAML_DIR}/env.yaml"
+  --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \
+    | tee "${ENV_YAML_DIR}/env.yaml"
 
 rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n test
 
diff --git a/ci/test_wheel_cudf.sh b/ci/test_wheel_cudf.sh
index 6861d699695..28ded2f8e0f 100755
--- a/ci/test_wheel_cudf.sh
+++ b/ci/test_wheel_cudf.sh
@@ -10,8 +10,22 @@ RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from
 RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./dist
 RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist
 
+rapids-logger "Install cudf, pylibcudf, and test requirements"
+
+# Constrain to minimum dependency versions if job is set up as "oldest"
+echo "" > ./constraints.txt
+if [[ $RAPIDS_DEPENDENCIES == "oldest" ]]; then
+    rapids-dependency-file-generator \
+        --output requirements \
+        --file-key py_test_cudf \
+        --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \
+      | tee ./constraints.txt
+fi
+
 # echo to expand wildcard before adding `[extra]` requires for pip
 python -m pip install \
+    -v \
+    --constraint ./constraints.txt \
   "$(echo ./dist/cudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" \
   "$(echo ./dist/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \
   "$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]"
diff --git a/ci/test_wheel_cudf_polars.sh b/ci/test_wheel_cudf_polars.sh
index 0baf6c9e277..9844090258a 100755
--- a/ci/test_wheel_cudf_polars.sh
+++ b/ci/test_wheel_cudf_polars.sh
@@ -25,9 +25,20 @@ RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-f
 RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist
 
 rapids-logger "Installing cudf_polars and its dependencies"
+# Constraint to minimum dependency versions if job is set up as "oldest"
+echo "" > ./constraints.txt
+if [[ $RAPIDS_DEPENDENCIES == "oldest" ]]; then
+    rapids-dependency-file-generator \
+        --output requirements \
+        --file-key py_test_cudf_polars \
+        --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \
+      | tee ./constraints.txt
+fi
 
 # echo to expand wildcard before adding `[extra]` requires for pip
 python -m pip install \
+    -v \
+    --constraint ./constraints.txt \
     "$(echo ./dist/cudf_polars_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" \
     "$(echo ./dist/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \
     "$(echo ./dist/pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)"
diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh
index fa74b2398f7..0d39807d56c 100755
--- a/ci/test_wheel_dask_cudf.sh
+++ b/ci/test_wheel_dask_cudf.sh
@@ -11,8 +11,21 @@ RAPIDS_PY_WHEEL_NAME="cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from
 RAPIDS_PY_WHEEL_NAME="libcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./dist
 RAPIDS_PY_WHEEL_NAME="pylibcudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python ./dist
 
+rapids-logger "Install dask_cudf, cudf, pylibcudf, and test requirements"
+# Constraint to minimum dependency versions if job is set up as "oldest"
+echo "" > ./constraints.txt
+if [[ $RAPIDS_DEPENDENCIES == "oldest" ]]; then
+    rapids-dependency-file-generator \
+        --output requirements \
+        --file-key py_test_dask_cudf \
+        --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \
+      | tee ./constraints.txt
+fi
+
 # echo to expand wildcard before adding `[extra]` requires for pip
 python -m pip install \
+  -v \
+  --constraint ./constraints.txt \
   "$(echo ./dist/cudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \
   "$(echo ./dist/dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)[test]" \
   "$(echo ./dist/libcudf_${RAPIDS_PY_CUDA_SUFFIX}*.whl)" \
diff --git a/dependencies.yaml b/dependencies.yaml
index c6851d9cb90..f8b231efd6d 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -696,6 +696,28 @@ dependencies:
           - pytest<8
           - pytest-cov
           - pytest-xdist
+    specific:
+      # Define additional constraints for testing with oldest dependencies.
+      - output_types: [conda, requirements]
+        matrices:
+          - matrix: {dependencies: "oldest"}
+            packages:
+              - numba==0.57.*
+              - numpy==1.23.*
+              - pandas==2.0.*
+              - pyarrow==14.0.0
+              - cupy==12.0.0  # ignored as pip constraint
+          - matrix:
+            packages:
+      - output_types: requirements
+        # Using --constraints for pip install, so we list cupy multiple times
+        matrices:
+          - matrix: {dependencies: "oldest"}
+            packages:
+              - cupy-cuda11x==12.0.0
+              - cupy-cuda12x==12.0.0
+          - matrix:
+            packages:
   test_python_pylibcudf:
     common:
       - output_types: [conda, requirements, pyproject]
diff --git a/python/cudf/cudf/tests/indexes/test_interval.py b/python/cudf/cudf/tests/indexes/test_interval.py
index 6653a94c9be..25edf788daf 100644
--- a/python/cudf/cudf/tests/indexes/test_interval.py
+++ b/python/cudf/cudf/tests/indexes/test_interval.py
@@ -149,6 +149,10 @@ def test_interval_range_periods_basic_dtype(start_t, end_t, periods_t):
     assert_eq(pindex, gindex)
 
 
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="Does not warn on older versions of pandas",
+)
 def test_interval_range_periods_warnings():
     start_val, end_val, periods_val = 0, 4, 1.0
 
diff --git a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py
index 2ec1d1d2f28..9d69e626c3d 100644
--- a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py
+++ b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py
@@ -23,6 +23,7 @@
 import pytest
 
 import cudf
+from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
 from cudf.testing import assert_eq
 from cudf.testing.dataset_generator import rand_dataframe
 
@@ -302,6 +303,10 @@ def get_days_from_epoch(date: datetime.date | None) -> int | None:
 @pytest.mark.parametrize("namespace", [None, "root_ns"])
 @pytest.mark.parametrize("nullable", [True, False])
 @pytest.mark.parametrize("prepend_null", [True, False])
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="Fails in older versions of pandas (datetime(9999, ...) too large)",
+)
 def test_can_parse_avro_date_logical_type(namespace, nullable, prepend_null):
     avro_type = {"logicalType": "date", "type": "int"}
     if nullable:
diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py
index 4256ec872e6..2e8519509e2 100644
--- a/python/cudf/cudf/tests/test_binops.py
+++ b/python/cudf/cudf/tests/test_binops.py
@@ -13,7 +13,11 @@
 
 import cudf
 from cudf import Index, Series
-from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
+from cudf.core._compat import (
+    PANDAS_CURRENT_SUPPORTED_VERSION,
+    PANDAS_GE_220,
+    PANDAS_VERSION,
+)
 from cudf.core.buffer.spill_manager import get_global_manager
 from cudf.testing import _utils as utils, assert_eq
 from cudf.utils.dtypes import (
@@ -1781,6 +1785,20 @@ def test_datetime_dateoffset_binaryop(
             reason="https://github.com/pandas-dev/pandas/issues/57448",
         )
     )
+    if (
+        not PANDAS_GE_220
+        and dtype in {"datetime64[ms]", "datetime64[s]"}
+        and frequency in ("microseconds", "nanoseconds")
+        and n_periods != 0
+    ):
+        pytest.skip(reason="https://github.com/pandas-dev/pandas/pull/55595")
+    if (
+        not PANDAS_GE_220
+        and dtype == "datetime64[us]"
+        and frequency == "nanoseconds"
+        and n_periods != 0
+    ):
+        pytest.skip(reason="https://github.com/pandas-dev/pandas/pull/55595")
 
     date_col = [
         f"2000-01-01 00:00:{components}",
@@ -1834,7 +1852,11 @@ def test_datetime_dateoffset_binaryop(
     "ignore:Discarding nonzero nanoseconds:UserWarning"
 )
 @pytest.mark.parametrize("op", [operator.add, operator.sub])
-def test_datetime_dateoffset_binaryop_multiple(date_col, kwargs, op):
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="Fails in older versions of pandas",
+)
+def test_datetime_dateoffset_binaryop_multiple(request, date_col, kwargs, op):
     gsr = cudf.Series(date_col, dtype="datetime64[ns]")
     psr = gsr.to_pandas()
 
@@ -1873,6 +1895,21 @@ def test_datetime_dateoffset_binaryop_multiple(date_col, kwargs, op):
 def test_datetime_dateoffset_binaryop_reflected(
     n_periods, frequency, dtype, components
 ):
+    if (
+        not PANDAS_GE_220
+        and dtype in {"datetime64[ms]", "datetime64[s]"}
+        and frequency in ("microseconds", "nanoseconds")
+        and n_periods != 0
+    ):
+        pytest.skip(reason="https://github.com/pandas-dev/pandas/pull/55595")
+    if (
+        not PANDAS_GE_220
+        and dtype == "datetime64[us]"
+        and frequency == "nanoseconds"
+        and n_periods != 0
+    ):
+        pytest.skip(reason="https://github.com/pandas-dev/pandas/pull/55595")
+
     date_col = [
         f"2000-01-01 00:00:{components}",
         f"2000-01-31 00:00:{components}",
diff --git a/python/cudf/cudf/tests/test_categorical.py b/python/cudf/cudf/tests/test_categorical.py
index ae58af8ebce..cd1ad21ae59 100644
--- a/python/cudf/cudf/tests/test_categorical.py
+++ b/python/cudf/cudf/tests/test_categorical.py
@@ -11,6 +11,7 @@
 import pytest
 
 import cudf
+from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
 from cudf.testing import assert_eq
 from cudf.testing._utils import NUMERIC_TYPES, assert_exceptions_equal
 
@@ -858,6 +859,10 @@ def test_cat_from_scalar(scalar):
     assert_eq(ps, gs)
 
 
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="Does not warn on older versions of pandas",
+)
 def test_cat_groupby_fillna():
     ps = pd.Series(["a", "b", "c"], dtype="category")
     gs = cudf.from_pandas(ps)
diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py
index c1c03de48d4..8da589ba45b 100644
--- a/python/cudf/cudf/tests/test_concat.py
+++ b/python/cudf/cudf/tests/test_concat.py
@@ -9,6 +9,7 @@
 import pytest
 
 import cudf
+from cudf.core._compat import PANDAS_GE_220
 from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype
 from cudf.testing import assert_eq
 from cudf.testing._utils import assert_exceptions_equal, expect_warning_if
@@ -451,45 +452,75 @@ def test_concat_mixed_input():
         [pd.Series([1, 2, 3]), pd.DataFrame({"a": []})],
         [pd.Series([], dtype="float64"), pd.DataFrame({"a": []})],
         [pd.Series([], dtype="float64"), pd.DataFrame({"a": [1, 2]})],
-        [
-            pd.Series([1, 2, 3.0, 1.2], name="abc"),
-            pd.DataFrame({"a": [1, 2]}),
-        ],
-        [
-            pd.Series(
-                [1, 2, 3.0, 1.2], name="abc", index=[100, 110, 120, 130]
-            ),
-            pd.DataFrame({"a": [1, 2]}),
-        ],
-        [
-            pd.Series(
-                [1, 2, 3.0, 1.2], name="abc", index=["a", "b", "c", "d"]
+        pytest.param(
+            [
+                pd.Series([1, 2, 3.0, 1.2], name="abc"),
+                pd.DataFrame({"a": [1, 2]}),
+            ],
+            marks=pytest.mark.skipif(
+                not PANDAS_GE_220,
+                reason="https://github.com/pandas-dev/pandas/pull/56365",
             ),
-            pd.DataFrame({"a": [1, 2]}, index=["a", "b"]),
-        ],
-        [
-            pd.Series(
-                [1, 2, 3.0, 1.2, 8, 100],
-                name="New name",
-                index=["a", "b", "c", "d", "e", "f"],
+        ),
+        pytest.param(
+            [
+                pd.Series(
+                    [1, 2, 3.0, 1.2], name="abc", index=[100, 110, 120, 130]
+                ),
+                pd.DataFrame({"a": [1, 2]}),
+            ],
+            marks=pytest.mark.skipif(
+                not PANDAS_GE_220,
+                reason="https://github.com/pandas-dev/pandas/pull/56365",
             ),
-            pd.DataFrame(
-                {"a": [1, 2, 4, 10, 11, 12]},
-                index=["a", "b", "c", "d", "e", "f"],
+        ),
+        pytest.param(
+            [
+                pd.Series(
+                    [1, 2, 3.0, 1.2], name="abc", index=["a", "b", "c", "d"]
+                ),
+                pd.DataFrame({"a": [1, 2]}, index=["a", "b"]),
+            ],
+            marks=pytest.mark.skipif(
+                not PANDAS_GE_220,
+                reason="https://github.com/pandas-dev/pandas/pull/56365",
             ),
-        ],
-        [
-            pd.Series(
-                [1, 2, 3.0, 1.2, 8, 100],
-                name="New name",
-                index=["a", "b", "c", "d", "e", "f"],
+        ),
+        pytest.param(
+            [
+                pd.Series(
+                    [1, 2, 3.0, 1.2, 8, 100],
+                    name="New name",
+                    index=["a", "b", "c", "d", "e", "f"],
+                ),
+                pd.DataFrame(
+                    {"a": [1, 2, 4, 10, 11, 12]},
+                    index=["a", "b", "c", "d", "e", "f"],
+                ),
+            ],
+            marks=pytest.mark.skipif(
+                not PANDAS_GE_220,
+                reason="https://github.com/pandas-dev/pandas/pull/56365",
             ),
-            pd.DataFrame(
-                {"a": [1, 2, 4, 10, 11, 12]},
-                index=["a", "b", "c", "d", "e", "f"],
+        ),
+        pytest.param(
+            [
+                pd.Series(
+                    [1, 2, 3.0, 1.2, 8, 100],
+                    name="New name",
+                    index=["a", "b", "c", "d", "e", "f"],
+                ),
+                pd.DataFrame(
+                    {"a": [1, 2, 4, 10, 11, 12]},
+                    index=["a", "b", "c", "d", "e", "f"],
+                ),
+            ]
+            * 7,
+            marks=pytest.mark.skipif(
+                not PANDAS_GE_220,
+                reason="https://github.com/pandas-dev/pandas/pull/56365",
             ),
-        ]
-        * 7,
+        ),
     ],
 )
 def test_concat_series_dataframe_input(objs):
diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
index 40ba415e681..cee3d23eadc 100644
--- a/python/cudf/cudf/tests/test_csv.py
+++ b/python/cudf/cudf/tests/test_csv.py
@@ -16,9 +16,13 @@
 
 import cudf
 from cudf import read_csv
-from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
+from cudf.core._compat import (
+    PANDAS_CURRENT_SUPPORTED_VERSION,
+    PANDAS_GE_220,
+    PANDAS_VERSION,
+)
 from cudf.testing import assert_eq
-from cudf.testing._utils import assert_exceptions_equal
+from cudf.testing._utils import assert_exceptions_equal, expect_warning_if
 
 
 def make_numeric_dataframe(nrows, dtype):
@@ -1270,14 +1274,14 @@ def test_csv_reader_delim_whitespace():
     # with header row
     with pytest.warns(FutureWarning):
         cu_df = read_csv(StringIO(buffer), delim_whitespace=True)
-    with pytest.warns(FutureWarning):
+    with expect_warning_if(PANDAS_GE_220):
         pd_df = pd.read_csv(StringIO(buffer), delim_whitespace=True)
     assert_eq(pd_df, cu_df)
 
     # without header row
     with pytest.warns(FutureWarning):
         cu_df = read_csv(StringIO(buffer), delim_whitespace=True, header=None)
-    with pytest.warns(FutureWarning):
+    with expect_warning_if(PANDAS_GE_220):
         pd_df = pd.read_csv(
             StringIO(buffer), delim_whitespace=True, header=None
         )
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 9122a1074ac..f4d1578bda7 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -26,7 +26,11 @@
 
 import cudf
 from cudf.api.extensions import no_default
-from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
+from cudf.core._compat import (
+    PANDAS_CURRENT_SUPPORTED_VERSION,
+    PANDAS_GE_220,
+    PANDAS_VERSION,
+)
 from cudf.core.buffer.spill_manager import get_global_manager
 from cudf.core.column import column
 from cudf.errors import MixedTypeError
@@ -3561,8 +3565,11 @@ def test_dataframe_empty_sort_index():
 @pytest.mark.parametrize("inplace", [True, False])
 @pytest.mark.parametrize("na_position", ["first", "last"])
 def test_dataframe_sort_index(
-    index, axis, ascending, inplace, ignore_index, na_position
+    request, index, axis, ascending, inplace, ignore_index, na_position
 ):
+    if not PANDAS_GE_220 and axis in (1, "columns") and ignore_index:
+        pytest.skip(reason="Bug fixed in pandas-2.2")
+
     pdf = pd.DataFrame(
         {"b": [1, 3, 2], "a": [1, 4, 3], "c": [4, 1, 5]},
         index=index,
@@ -3612,6 +3619,10 @@ def test_dataframe_sort_index(
 @pytest.mark.parametrize("ignore_index", [True, False])
 @pytest.mark.parametrize("inplace", [True, False])
 @pytest.mark.parametrize("na_position", ["first", "last"])
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="Fails in older versions of pandas",
+)
 def test_dataframe_mulitindex_sort_index(
     request, axis, level, ascending, inplace, ignore_index, na_position
 ):
@@ -6747,6 +6758,10 @@ def test_dataframe_init_from_arrays_cols(data, cols, index):
         None,
     ],
 )
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="Fails in older versions of pandas",
+)
 def test_dataframe_assign_scalar(request, col_data, assign_val):
     request.applymarker(
         pytest.mark.xfail(
diff --git a/python/cudf/cudf/tests/test_datetime.py b/python/cudf/cudf/tests/test_datetime.py
index 7be4faa42c3..4a2345fc009 100644
--- a/python/cudf/cudf/tests/test_datetime.py
+++ b/python/cudf/cudf/tests/test_datetime.py
@@ -14,7 +14,11 @@
 import cudf
 import cudf.testing.dataset_generator as dataset_generator
 from cudf import DataFrame, Series
-from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
+from cudf.core._compat import (
+    PANDAS_CURRENT_SUPPORTED_VERSION,
+    PANDAS_GE_220,
+    PANDAS_VERSION,
+)
 from cudf.core.index import DatetimeIndex
 from cudf.testing import assert_eq
 from cudf.testing._utils import (
@@ -801,6 +805,10 @@ def test_to_datetime_different_formats_notimplemented():
         cudf.to_datetime(["2015-02-01", "2015-02-01 10:10:10"])
 
 
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="Fails in older versions of pandas.",
+)
 def test_datetime_can_cast_safely():
     sr = cudf.Series(
         ["1679-01-01", "2000-01-31", "2261-01-01"], dtype="datetime64[ms]"
@@ -847,6 +855,10 @@ def test_datetime_array_timeunit_cast(dtype):
 
 
 @pytest.mark.parametrize("timeunit", ["D", "W", "M", "Y"])
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="Fails in older versions of pandas",
+)
 def test_datetime_scalar_timeunit_cast(timeunit):
     testscalar = np.datetime64("2016-11-20", timeunit)
 
@@ -1535,6 +1547,10 @@ def test_date_range_start_end_periods(start, end, periods):
     )
 
 
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="Fails in older versions of pandas",
+)
 def test_date_range_start_end_freq(start, end, freq):
     if isinstance(freq, str):
         _gfreq = _pfreq = freq
@@ -1551,6 +1567,10 @@ def test_date_range_start_end_freq(start, end, freq):
     )
 
 
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="Fails in older versions of pandas",
+)
 def test_date_range_start_freq_periods(start, freq, periods):
     if isinstance(freq, str):
         _gfreq = _pfreq = freq
@@ -1643,6 +1663,9 @@ def test_date_range_raise_overflow():
     ],
 )
 def test_date_range_raise_unsupported(freqstr_unsupported):
+    if not PANDAS_GE_220 and freqstr_unsupported.endswith("E"):
+        pytest.skip(reason="YE, etc. support was added in pandas 2.2")
+
     s, e = "2001-01-01", "2008-01-31"
     pd.date_range(start=s, end=e, freq=freqstr_unsupported)
     with pytest.raises(ValueError, match="does not yet support"):
@@ -1654,7 +1677,7 @@ def test_date_range_raise_unsupported(freqstr_unsupported):
     if freqstr_unsupported != "3MS":
         freqstr_unsupported = freqstr_unsupported.lower()
         with pytest.raises(ValueError, match="does not yet support"):
-            with pytest.warns(FutureWarning):
+            with expect_warning_if(PANDAS_GE_220):
                 cudf.date_range(start=s, end=e, freq=freqstr_unsupported)
 
 
@@ -1995,6 +2018,10 @@ def test_first(idx, offset):
         )
     ],
 )
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="warning not present in older pandas versions",
+)
 def test_first_start_at_end_of_month(idx, offset):
     p = pd.Series(range(len(idx)), index=idx)
     g = cudf.from_pandas(p)
@@ -2319,6 +2346,10 @@ def test_datetime_to_str(data, dtype):
     assert_eq(actual.to_pandas(nullable=True), expected)
 
 
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="Fails in older versions of pandas",
+)
 def test_datetime_string_to_datetime_resolution_loss_raises():
     data = ["2020-01-01 00:00:00.00001"]
     dtype = "datetime64[s]"
diff --git a/python/cudf/cudf/tests/test_doctests.py b/python/cudf/cudf/tests/test_doctests.py
index 794660cffcb..5d3d18cbe95 100644
--- a/python/cudf/cudf/tests/test_doctests.py
+++ b/python/cudf/cudf/tests/test_doctests.py
@@ -11,6 +11,7 @@
 from packaging import version
 
 import cudf
+from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
 
 pytestmark = pytest.mark.filterwarnings("ignore::FutureWarning")
 
@@ -96,6 +97,10 @@ def prinoptions(cls):
         itertools.chain(*[_find_doctests_in_obj(mod) for mod in tests]),
         ids=lambda docstring: docstring.name,
     )
+    @pytest.mark.skipif(
+        PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+        reason="Doctests not expected to pass on older versions of pandas",
+    )
     def test_docstring(self, docstring):
         # We ignore differences in whitespace in the doctest output, and enable
         # the use of an ellipsis "..." to match any string in the doctest
diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py
index 74f04c0584f..0aaa71e50d7 100644
--- a/python/cudf/cudf/tests/test_groupby.py
+++ b/python/cudf/cudf/tests/test_groupby.py
@@ -188,6 +188,10 @@ def test_groupby_as_index_single_agg(pdf, gdf, as_index):
 
 @pytest.mark.parametrize("engine", ["cudf", "jit"])
 @pytest.mark.parametrize("as_index", [True, False])
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="Include groups missing on old versions of pandas",
+)
 def test_groupby_as_index_apply(pdf, gdf, as_index, engine):
     gdf = gdf.groupby("y", as_index=as_index).apply(
         lambda df: df["x"].mean(), engine=engine
@@ -298,6 +302,10 @@ def assert_values_equal(arr):
             assert_values_equal(pddf[k].values)
 
 
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="Fails in older versions of pandas",
+)
 def test_groupby_apply():
     np.random.seed(0)
     df = DataFrame()
@@ -338,6 +346,10 @@ def f3(df, k, L, m):
 
 
 @pytest.mark.parametrize("func,args", create_test_groupby_apply_args_params())
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="Fails in older versions of pandas",
+)
 def test_groupby_apply_args(func, args):
     np.random.seed(0)
     df = DataFrame()
@@ -500,6 +512,10 @@ def func(df):
     "func", ["min", "max", "sum", "mean", "var", "std", "idxmin", "idxmax"]
 )
 @pytest.mark.parametrize("dataset", ["small", "large", "nans"])
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="Include groups missing on old versions of pandas",
+)
 def test_groupby_apply_jit_unary_reductions(
     func, dtype, dataset, groupby_jit_datasets
 ):
@@ -530,6 +546,10 @@ def func(df):
 
 
 # test unary index reductions for special values
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="Fails in older versions of pandas",
+)
 def groupby_apply_jit_idx_reductions_special_vals_inner(
     func, data, dtype, special_val
 ):
@@ -555,6 +575,10 @@ def func(df):
 @pytest.mark.parametrize("func", ["min", "max", "sum", "mean", "var", "std"])
 @pytest.mark.parametrize("special_val", [np.nan, np.inf, -np.inf])
 @pytest.mark.parametrize("dataset", ["small", "large", "nans"])
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="Include groups missing on old versions of pandas",
+)
 def test_groupby_apply_jit_reductions_special_vals(
     func, dtype, dataset, groupby_jit_datasets, special_val
 ):
@@ -583,6 +607,10 @@ def test_groupby_apply_jit_reductions_special_vals(
     ],
 )
 @pytest.mark.parametrize("dataset", ["small", "large", "nans"])
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="include_groups keyword new in pandas 2.2",
+)
 def test_groupby_apply_jit_idx_reductions_special_vals(
     func, dtype, dataset, groupby_jit_datasets, special_val
 ):
@@ -593,6 +621,10 @@ def test_groupby_apply_jit_idx_reductions_special_vals(
 
 
 @pytest.mark.parametrize("dtype", ["int32"])
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="Fails in older versions of pandas",
+)
 def test_groupby_apply_jit_sum_integer_overflow(dtype):
     max = np.iinfo(dtype).max
 
@@ -627,6 +659,10 @@ def func(group):
         "large",
     ],
 )
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="Fails in older versions of pandas",
+)
 def test_groupby_apply_jit_correlation(dataset, groupby_jit_datasets, dtype):
     dataset = groupby_jit_datasets[dataset]
 
@@ -653,6 +689,10 @@ def func(group):
 
 
 @pytest.mark.parametrize("dtype", ["int32", "int64"])
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="Fails in older versions of pandas",
+)
 def test_groupby_apply_jit_correlation_zero_variance(dtype):
     # pearson correlation is undefined when the variance of either
     # variable is zero. This test ensures that the jit implementation
@@ -711,6 +751,10 @@ def func(group):
 
 
 @pytest.mark.parametrize("dtype", ["uint8", "str"])
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="Fails in older versions of pandas",
+)
 def test_groupby_apply_unsupported_dtype(dtype):
     df = cudf.DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [7, 8, 9]})
     df["b"] = df["b"].astype(dtype)
@@ -739,6 +783,10 @@ def func(group):
         lambda df: df["val1"].mean() + df["val2"].std(),
     ],
 )
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="Fails in older versions of pandas",
+)
 def test_groupby_apply_jit_basic(func, groupby_jit_data_small):
     run_groupby_apply_jit_test(groupby_jit_data_small, func, ["key1", "key2"])
 
@@ -759,12 +807,20 @@ def f3(df, k, L, m):
 @pytest.mark.parametrize(
     "func,args", create_test_groupby_apply_jit_args_params()
 )
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="Fails in older versions of pandas",
+)
 def test_groupby_apply_jit_args(func, args, groupby_jit_data_small):
     run_groupby_apply_jit_test(
         groupby_jit_data_small, func, ["key1", "key2"], *args
     )
 
 
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="Fails in older versions of pandas",
+)
 def test_groupby_apply_jit_block_divergence():
     # https://github.com/rapidsai/cudf/issues/12686
     df = cudf.DataFrame(
@@ -782,6 +838,10 @@ def diverging_block(grp_df):
     run_groupby_apply_jit_test(df, diverging_block, ["a"])
 
 
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="Fails in older versions of pandas",
+)
 def test_groupby_apply_caching():
     # Make sure similar functions that differ
     # by simple things like constants actually
@@ -818,6 +878,10 @@ def f(group):
     assert precompiled.currsize == 3
 
 
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="Fails in older versions of pandas",
+)
 def test_groupby_apply_no_bytecode_fallback():
     # tests that a function which contains no bytecode
     # attribute, but would still be executable using
@@ -836,6 +900,10 @@ def f(group):
     assert_groupby_results_equal(expect, got)
 
 
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="Fails in older versions of pandas",
+)
 def test_groupby_apply_return_col_from_df():
     # tests a UDF that consists of purely colwise
     # ops, such as `lambda group: group.x + group.y`
@@ -862,6 +930,10 @@ def func(df):
 
 
 @pytest.mark.parametrize("func", [lambda group: group.sum()])
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="Fails in older versions of pandas",
+)
 def test_groupby_apply_return_df(func):
     # tests a UDF that reduces over a dataframe
     # and produces a series with the original column names
@@ -1940,6 +2012,10 @@ def test_groupby_agg_combinations(agg):
     )
 
 
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="Include groups missing on old versions of pandas",
+)
 def test_groupby_apply_noempty_group():
     pdf = pd.DataFrame(
         {"a": [1, 1, 2, 2], "b": [1, 2, 1, 2], "c": [1, 2, 3, 4]}
@@ -2208,6 +2284,10 @@ def f3(x, k, L, m):
 @pytest.mark.parametrize(
     "func,args", create_test_groupby_apply_return_scalars_params()
 )
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="Fails in older versions of pandas",
+)
 def test_groupby_apply_return_scalars(func, args):
     pdf = pd.DataFrame(
         {
@@ -2266,6 +2346,10 @@ def f5(x, k, L, m):
 @pytest.mark.parametrize(
     "func,args", create_test_groupby_apply_return_series_dataframe_params()
 )
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="Include groups missing on old versions of pandas",
+)
 def test_groupby_apply_return_series_dataframe(func, args):
     pdf = pd.DataFrame(
         {"key": [0, 0, 1, 1, 2, 2, 2], "val": [0, 1, 2, 3, 4, 5, 6]}
@@ -2744,6 +2828,10 @@ def test_groupby_diff_row_zero_shift(nelem):
 
 # TODO: test for category columns when cudf.Scalar supports category type
 @pytest.mark.parametrize("nelem", [10, 100, 1000])
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="warning not present in older pandas versions",
+)
 def test_groupby_fillna_multi_value(nelem):
     t = rand_dataframe(
         dtypes_meta=[
@@ -2790,6 +2878,10 @@ def test_groupby_fillna_multi_value(nelem):
 # TODO: test for category columns when cudf.Scalar supports category type
 # TODO: cudf.fillna does not support decimal column to column fill yet
 @pytest.mark.parametrize("nelem", [10, 100, 1000])
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="warning not present in older pandas versions",
+)
 def test_groupby_fillna_multi_value_df(nelem):
     t = rand_dataframe(
         dtypes_meta=[
@@ -2843,6 +2935,10 @@ def test_groupby_fillna_multi_value_df(nelem):
     "data", [[1, None, 2, None, 3, None], [1, 2, 3, 4, 5, 6]]
 )
 @pytest.mark.parametrize("args", [{"value": 42}, {"method": "ffill"}])
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="warning not present in older pandas versions",
+)
 def test_groupby_various_by_fillna(by, data, args):
     ps = pd.Series(data)
     gs = cudf.from_pandas(ps)
@@ -3146,6 +3242,10 @@ def test_groupby_freq_s(label, closed):
         ),
     ],
 )
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="Warnings only given on newer versions.",
+)
 def test_groupby_get_group(pdf, group, name, obj):
     gdf = cudf.from_pandas(pdf)
 
@@ -3644,6 +3744,10 @@ def test_group_by_pandas_sort_order(groups, sort):
         "last",
     ],
 )
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="Fails in older versions of pandas",
+)
 def test_group_by_empty_reduction(dtype, reduce_op):
     gdf = cudf.DataFrame({"a": [], "b": [], "c": []}, dtype=dtype)
     pdf = gdf.to_pandas()
@@ -3664,6 +3768,10 @@ def test_group_by_empty_reduction(dtype, reduce_op):
     "apply_op",
     ["sum", "min", "max", "idxmax"],
 )
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="Fails in older versions of pandas",
+)
 def test_group_by_empty_apply(request, dtype, apply_op):
     request.applymarker(
         pytest.mark.xfail(
@@ -3719,6 +3827,10 @@ def test_groupby_consecutive_operations():
     assert_groupby_results_equal(actual, expected, check_dtype=False)
 
 
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="Warning only given on newer versions.",
+)
 def test_categorical_grouping_pandas_compatibility():
     gdf = cudf.DataFrame(
         {
diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py
index 722a64cb553..3f483219423 100644
--- a/python/cudf/cudf/tests/test_index.py
+++ b/python/cudf/cudf/tests/test_index.py
@@ -16,6 +16,11 @@
 
 import cudf
 from cudf.api.extensions import no_default
+from cudf.core._compat import (
+    PANDAS_CURRENT_SUPPORTED_VERSION,
+    PANDAS_GE_220,
+    PANDAS_VERSION,
+)
 from cudf.core.index import CategoricalIndex, DatetimeIndex, Index, RangeIndex
 from cudf.testing import assert_eq
 from cudf.testing._utils import (
@@ -791,9 +796,27 @@ def test_index_to_series(data):
     "name_data,name_other",
     [("abc", "c"), (None, "abc"), ("abc", pd.NA), ("abc", "abc")],
 )
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="Fails in older versions of pandas",
+)
 def test_index_difference(data, other, sort, name_data, name_other):
     pd_data = pd.Index(data, name=name_data)
     pd_other = pd.Index(other, name=name_other)
+    if (
+        not PANDAS_GE_220
+        and isinstance(pd_data.dtype, pd.CategoricalDtype)
+        and not isinstance(pd_other.dtype, pd.CategoricalDtype)
+        and pd_other.isnull().any()
+    ):
+        pytest.skip(reason="https://github.com/pandas-dev/pandas/issues/57318")
+
+    if (
+        not PANDAS_GE_220
+        and len(pd_other) == 0
+        and len(pd_data) != len(pd_data.unique())
+    ):
+        pytest.skip(reason="Bug fixed in pandas-2.2+")
 
     gd_data = cudf.from_pandas(pd_data)
     gd_other = cudf.from_pandas(pd_other)
@@ -1017,6 +1040,10 @@ def test_index_equal_misc(data, other):
         ["abcd", "defgh", "werty", "poiu"],
     ],
 )
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="Does not warn on older versions of pandas",
+)
 def test_index_append(data, other):
     pd_data = pd.Index(data)
     pd_other = pd.Index(other)
@@ -1220,6 +1247,10 @@ def test_index_append_error(data, other):
         ),
     ],
 )
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="Does not warn on older versions of pandas",
+)
 def test_index_append_list(data, other):
     pd_data = data
     pd_other = other
@@ -2084,6 +2115,10 @@ def test_get_indexer_multi_numeric_deviate(key, method):
 
 
 @pytest.mark.parametrize("method", ["ffill", "bfill"])
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="Fails in older versions of pandas",
+)
 def test_get_indexer_multi_error(method):
     pi = pd.MultiIndex.from_tuples(
         [(2, 1, 1), (1, 2, 3), (1, 2, 1), (1, 1, 10), (1, 1, 1), (2, 2, 1)]
@@ -2527,7 +2562,7 @@ def test_isin_index(index, values):
     )
     with expect_warning_if(is_dt_str):
         got = gidx.isin(values)
-    with expect_warning_if(is_dt_str):
+    with expect_warning_if(PANDAS_GE_220 and is_dt_str):
         expected = pidx.isin(values)
 
     assert_eq(got, expected)
diff --git a/python/cudf/cudf/tests/test_indexing.py b/python/cudf/cudf/tests/test_indexing.py
index 9df2852dde8..00ae99466bb 100644
--- a/python/cudf/cudf/tests/test_indexing.py
+++ b/python/cudf/cudf/tests/test_indexing.py
@@ -1016,6 +1016,10 @@ def test_series_setitem_iloc(key, value, nulls):
         (slice(0, 2), [0.5, 0.25]),
     ],
 )
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="Fails in older versions of pandas",
+)
 def test_series_setitem_dtype(key, value):
     psr = pd.Series([1, 2, 3], dtype="int32")
     gsr = cudf.from_pandas(psr)
@@ -1634,6 +1638,10 @@ def test_dataframe_loc_iloc_inplace_update_with_RHS_dataframe(
     assert_eq(expected, actual)
 
 
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="No warning in older versions of pandas",
+)
 def test_dataframe_loc_inplace_update_with_invalid_RHS_df_columns():
     gdf = cudf.DataFrame({"x": [1, 2, 3], "y": [4, 5, 6]})
     pdf = gdf.to_pandas()
diff --git a/python/cudf/cudf/tests/test_interpolate.py b/python/cudf/cudf/tests/test_interpolate.py
index a4f0b9fc97e..c76a49103e2 100644
--- a/python/cudf/cudf/tests/test_interpolate.py
+++ b/python/cudf/cudf/tests/test_interpolate.py
@@ -125,6 +125,10 @@ def test_interpolate_series_values_or_index(data, index, method):
         ),
     ],
 )
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="Does not fail on older versions of pandas",
+)
 def test_interpolate_dataframe_error_cases(data, kwargs):
     gsr = cudf.DataFrame(data)
     psr = gsr.to_pandas()
diff --git a/python/cudf/cudf/tests/test_interval.py b/python/cudf/cudf/tests/test_interval.py
index 2d194107658..5e1dd33fbf1 100644
--- a/python/cudf/cudf/tests/test_interval.py
+++ b/python/cudf/cudf/tests/test_interval.py
@@ -6,6 +6,7 @@
 import pytest
 
 import cudf
+from cudf.core._compat import PANDAS_GE_220
 from cudf.testing import assert_eq
 
 
@@ -168,6 +169,10 @@ def test_interval_index_unique():
 
 @pytest.mark.parametrize("box", [pd.Series, pd.IntervalIndex])
 @pytest.mark.parametrize("tz", ["US/Eastern", None])
+@pytest.mark.skipif(
+    condition=not PANDAS_GE_220,
+    reason="ME frequency new in pandas 2.2",
+)
 def test_interval_with_datetime(tz, box):
     dti = pd.date_range(
         start=pd.Timestamp("20180101", tz=tz),
diff --git a/python/cudf/cudf/tests/test_join_order.py b/python/cudf/cudf/tests/test_join_order.py
index 9ea4ba007d2..9a95f0e01ab 100644
--- a/python/cudf/cudf/tests/test_join_order.py
+++ b/python/cudf/cudf/tests/test_join_order.py
@@ -1,13 +1,19 @@
 # Copyright (c) 2023-2024, NVIDIA CORPORATION.
 
 import itertools
+import operator
 import string
+from collections import defaultdict
 
 import numpy as np
 import pytest
 
 import cudf
-from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
+from cudf.core._compat import (
+    PANDAS_CURRENT_SUPPORTED_VERSION,
+    PANDAS_GE_220,
+    PANDAS_VERSION,
+)
 from cudf.testing import assert_eq
 
 
@@ -35,10 +41,124 @@ def right():
 # Behaviour in sort=False case didn't match documentation in many
 # cases prior to https://github.com/pandas-dev/pandas/pull/54611
 # (released as part of pandas 2.2)
-def expected(left, right, sort, *, how):
-    left = left.to_pandas()
-    right = right.to_pandas()
-    return left.merge(right, on="key", how=how, sort=sort)
+if PANDAS_GE_220:
+    # Behaviour in sort=False case didn't match documentation in many
+    # cases prior to https://github.com/pandas-dev/pandas/pull/54611
+    # (released as part of pandas 2.2)
+    def expected(left, right, sort, *, how):
+        left = left.to_pandas()
+        right = right.to_pandas()
+        return left.merge(right, on="key", how=how, sort=sort)
+
+else:
+
+    def expect_inner(left, right, sort):
+        left_key = left.key.values_host.tolist()
+        left_val = left.val.values_host.tolist()
+        right_key = right.key.values_host.tolist()
+        right_val = right.val.values_host.tolist()
+
+        right_have = defaultdict(list)
+        for i, k in enumerate(right_key):
+            right_have[k].append(i)
+        keys = []
+        val_x = []
+        val_y = []
+        for k, v in zip(left_key, left_val):
+            if k not in right_have:
+                continue
+            for i in right_have[k]:
+                keys.append(k)
+                val_x.append(v)
+                val_y.append(right_val[i])
+
+        if sort:
+            # Python sort is stable, so this will preserve input order for
+            # equal items.
+            keys, val_x, val_y = zip(
+                *sorted(zip(keys, val_x, val_y), key=operator.itemgetter(0))
+            )
+        return cudf.DataFrame({"key": keys, "val_x": val_x, "val_y": val_y})
+
+    def expect_left(left, right, sort):
+        left_key = left.key.values_host.tolist()
+        left_val = left.val.values_host.tolist()
+        right_key = right.key.values_host.tolist()
+        right_val = right.val.values_host.tolist()
+
+        right_have = defaultdict(list)
+        for i, k in enumerate(right_key):
+            right_have[k].append(i)
+        keys = []
+        val_x = []
+        val_y = []
+        for k, v in zip(left_key, left_val):
+            if k not in right_have:
+                right_vals = [None]
+            else:
+                right_vals = [right_val[i] for i in right_have[k]]
+
+            for rv in right_vals:
+                keys.append(k)
+                val_x.append(v)
+                val_y.append(rv)
+
+        if sort:
+            # Python sort is stable, so this will preserve input order for
+            # equal items.
+            keys, val_x, val_y = zip(
+                *sorted(zip(keys, val_x, val_y), key=operator.itemgetter(0))
+            )
+        return cudf.DataFrame({"key": keys, "val_x": val_x, "val_y": val_y})
+
+    def expect_outer(left, right, sort):
+        left_key = left.key.values_host.tolist()
+        left_val = left.val.values_host.tolist()
+        right_key = right.key.values_host.tolist()
+        right_val = right.val.values_host.tolist()
+        right_have = defaultdict(list)
+        for i, k in enumerate(right_key):
+            right_have[k].append(i)
+        keys = []
+        val_x = []
+        val_y = []
+        for k, v in zip(left_key, left_val):
+            if k not in right_have:
+                right_vals = [None]
+            else:
+                right_vals = [right_val[i] for i in right_have[k]]
+            for rv in right_vals:
+                keys.append(k)
+                val_x.append(v)
+                val_y.append(rv)
+        left_have = set(left_key)
+        for k, v in zip(right_key, right_val):
+            if k not in left_have:
+                keys.append(k)
+                val_x.append(None)
+                val_y.append(v)
+
+        # Python sort is stable, so this will preserve input order for
+        # equal items.
+        # outer joins are always sorted, but we test both sort values
+        keys, val_x, val_y = zip(
+            *sorted(zip(keys, val_x, val_y), key=operator.itemgetter(0))
+        )
+        return cudf.DataFrame({"key": keys, "val_x": val_x, "val_y": val_y})
+
+    def expected(left, right, sort, *, how):
+        if how == "inner":
+            return expect_inner(left, right, sort)
+        elif how == "outer":
+            return expect_outer(left, right, sort)
+        elif how == "left":
+            return expect_left(left, right, sort)
+        elif how == "right":
+            return expect_left(right, left, sort).rename(
+                {"val_x": "val_y", "val_y": "val_x"}, axis=1
+            )
+        else:
+            raise NotImplementedError()
 
 
 @pytest.mark.parametrize("how", ["inner", "left", "right", "outer"])
diff --git a/python/cudf/cudf/tests/test_mvc.py b/python/cudf/cudf/tests/test_mvc.py
index 7dd25ebc500..055bc5757b3 100644
--- a/python/cudf/cudf/tests/test_mvc.py
+++ b/python/cudf/cudf/tests/test_mvc.py
@@ -1,8 +1,9 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 import subprocess
 import sys
 
 import pytest
+from packaging import version
 
 IS_CUDA_11 = False
 IS_CUDA_12 = False
@@ -14,9 +15,12 @@
 # do not test cuda 12 if pynvjitlink isn't present
 HAVE_PYNVJITLINK = False
 try:
+    import numba
     import pynvjitlink  # noqa: F401
 
-    HAVE_PYNVJITLINK = True
+    HAVE_PYNVJITLINK = version.parse(numba.__version__) >= version.parse(
+        "0.58"
+    )
 except ModuleNotFoundError:
     pass
 
diff --git a/python/cudf/cudf/tests/test_numerical.py b/python/cudf/cudf/tests/test_numerical.py
index 1b0589254f5..b1a2f081cd2 100644
--- a/python/cudf/cudf/tests/test_numerical.py
+++ b/python/cudf/cudf/tests/test_numerical.py
@@ -5,6 +5,7 @@
 import pytest
 
 import cudf
+from cudf.core._compat import PANDAS_GE_220
 from cudf.testing import assert_eq
 from cudf.testing._utils import NUMERIC_TYPES, expect_warning_if
 from cudf.utils.dtypes import np_dtypes_to_pandas_dtypes
@@ -373,7 +374,7 @@ def test_to_numeric_error(data, errors):
         ):
             cudf.to_numeric(data, errors=errors)
     else:
-        with expect_warning_if(errors == "ignore"):
+        with expect_warning_if(PANDAS_GE_220 and errors == "ignore"):
             expect = pd.to_numeric(data, errors=errors)
         with expect_warning_if(errors == "ignore"):
             got = cudf.to_numeric(data, errors=errors)
diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py
index e0884a5819a..c2a30b76bea 100644
--- a/python/cudf/cudf/tests/test_orc.py
+++ b/python/cudf/cudf/tests/test_orc.py
@@ -1679,7 +1679,13 @@ def run_orc_columns_and_index_param(index_obj, index, columns):
     "columns",
     [
         None,
-        [],
+        pytest.param(
+            [],
+            marks=pytest.mark.skipif(
+                PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+                reason="Bug in older version of pandas",
+            ),
+        ),
     ],
 )
 def test_orc_columns_and_index_param(index_obj, index, columns):
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index 6623c537ddf..8b59a7eef08 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -23,6 +23,7 @@
 
 import cudf
 from cudf._lib.parquet import read_parquet_chunked
+from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
 from cudf.io.parquet import (
     ParquetDatasetWriter,
     ParquetWriter,
@@ -3034,6 +3035,10 @@ def test_parquet_reader_rle_boolean(datadir):
 #                a list column in a schema, the cudf reader was confusing
 #                nesting information between a list column and a subsequent
 #                string column, ultimately causing a crash.
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="Older versions of pandas do not have DataFrame.map()",
+)
 def test_parquet_reader_one_level_list2(datadir):
     # we are reading in a file containing binary types, but cudf returns
     # those as strings. so we have to massage the pandas data to get
diff --git a/python/cudf/cudf/tests/test_reductions.py b/python/cudf/cudf/tests/test_reductions.py
index a70a2ea15dd..f276f394cd0 100644
--- a/python/cudf/cudf/tests/test_reductions.py
+++ b/python/cudf/cudf/tests/test_reductions.py
@@ -10,6 +10,7 @@
 
 import cudf
 from cudf import Series
+from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
 from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype
 from cudf.testing import _utils as utils, assert_eq
 from cudf.testing._utils import NUMERIC_TYPES, expect_warning_if, gen_rand
@@ -342,6 +343,10 @@ def test_any_all_axis_none(data, op):
         "median",
     ],
 )
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="Warning not given on older versions of pandas",
+)
 def test_reductions_axis_none_warning(op):
     df = cudf.DataFrame({"a": [1, 2, 3], "b": [10, 2, 3]})
     pdf = df.to_pandas()
diff --git a/python/cudf/cudf/tests/test_replace.py b/python/cudf/cudf/tests/test_replace.py
index e5ee0127a74..3a8928297c0 100644
--- a/python/cudf/cudf/tests/test_replace.py
+++ b/python/cudf/cudf/tests/test_replace.py
@@ -10,7 +10,11 @@
 import pytest
 
 import cudf
-from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
+from cudf.core._compat import (
+    PANDAS_CURRENT_SUPPORTED_VERSION,
+    PANDAS_GE_220,
+    PANDAS_VERSION,
+)
 from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype, Decimal128Dtype
 from cudf.testing import assert_eq
 from cudf.testing._utils import (
@@ -66,7 +70,7 @@ def test_series_replace_all(gsr, to_replace, value):
     )
     with expect_warning_if(expect_warn):
         actual = gsr.replace(to_replace=gd_to_replace, value=gd_value)
-    with expect_warning_if(expect_warn):
+    with expect_warning_if(expect_warn and PANDAS_GE_220):
         if pd_value is None:
             # TODO: Remove this workaround once cudf
             # introduces `no_default` values
@@ -91,7 +95,7 @@ def test_series_replace():
 
     # Categorical
     psr3 = pd.Series(["one", "two", "three"], dtype="category")
-    with pytest.warns(FutureWarning):
+    with expect_warning_if(PANDAS_GE_220, FutureWarning):
         psr4 = psr3.replace("one", "two")
     sr3 = cudf.from_pandas(psr3)
     with pytest.warns(FutureWarning):
@@ -100,7 +104,7 @@ def test_series_replace():
         psr4.sort_values().reset_index(drop=True),
         sr4.sort_values().reset_index(drop=True),
     )
-    with pytest.warns(FutureWarning):
+    with expect_warning_if(PANDAS_GE_220, FutureWarning):
         psr5 = psr3.replace("one", "five")
     with pytest.warns(FutureWarning):
         sr5 = sr3.replace("one", "five")
@@ -517,7 +521,7 @@ def test_fillna_categorical(psr_data, fill_value, inplace):
             pd.date_range(
                 "2010-01-01",
                 "2020-01-10",
-                freq="1YE",
+                freq="1YE" if PANDAS_GE_220 else "1y",
             )
         ),
         pd.Series(["2010-01-01", None, "2011-10-10"], dtype="datetime64[ns]"),
@@ -564,7 +568,7 @@ def test_fillna_categorical(psr_data, fill_value, inplace):
             pd.date_range(
                 "2010-01-01",
                 "2020-01-10",
-                freq="1YE",
+                freq="1YE" if PANDAS_GE_220 else "1y",
             )
         )
         + pd.Timedelta("1d"),
@@ -1069,6 +1073,10 @@ def test_numeric_series_replace_dtype(series_dtype, replacement):
         ),
     ],
 )
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="Warning not given on older versions of pandas",
+)
 def test_replace_inplace(pframe, replace_args):
     gpu_frame = cudf.from_pandas(pframe)
     pandas_frame = pframe.copy()
diff --git a/python/cudf/cudf/tests/test_resampling.py b/python/cudf/cudf/tests/test_resampling.py
index 95fa8e9a50a..a61477981f8 100644
--- a/python/cudf/cudf/tests/test_resampling.py
+++ b/python/cudf/cudf/tests/test_resampling.py
@@ -5,6 +5,7 @@
 import pytest
 
 import cudf
+from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
 from cudf.testing import assert_eq
 
 
@@ -147,6 +148,10 @@ def test_dataframe_resample_level():
         ("10D", "1D", "s"),
     ],
 )
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="Fails in older versions of pandas",
+)
 def test_resampling_frequency_conversion(in_freq, sampling_freq, out_freq):
     # test that we cast to the appropriate frequency
     # when resampling:
@@ -164,6 +169,10 @@ def test_resampling_frequency_conversion(in_freq, sampling_freq, out_freq):
     assert got.index.dtype == np.dtype(f"datetime64[{out_freq}]")
 
 
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="Fails in older versions of pandas",
+)
 def test_resampling_downsampling_ms():
     pdf = pd.DataFrame(
         {
diff --git a/python/cudf/cudf/tests/test_reshape.py b/python/cudf/cudf/tests/test_reshape.py
index 50db4302b75..4235affd4d1 100644
--- a/python/cudf/cudf/tests/test_reshape.py
+++ b/python/cudf/cudf/tests/test_reshape.py
@@ -8,10 +8,19 @@
 import pytest
 
 import cudf
-from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
+from cudf.core._compat import (
+    PANDAS_CURRENT_SUPPORTED_VERSION,
+    PANDAS_GE_220,
+    PANDAS_VERSION,
+)
 from cudf.core.buffer.spill_manager import get_global_manager
 from cudf.testing import assert_eq
-from cudf.testing._utils import ALL_TYPES, DATETIME_TYPES, NUMERIC_TYPES
+from cudf.testing._utils import (
+    ALL_TYPES,
+    DATETIME_TYPES,
+    NUMERIC_TYPES,
+    expect_warning_if,
+)
 
 pytest_xfail = pytest.mark.xfail
 pytestmark = pytest.mark.spilling
@@ -220,7 +229,7 @@ def test_df_stack_multiindex_column_axis(columns, index, level, dropna):
 
     with pytest.warns(FutureWarning):
         got = gdf.stack(level=level, dropna=dropna, future_stack=False)
-    with pytest.warns(FutureWarning):
+    with expect_warning_if(PANDAS_GE_220, FutureWarning):
         expect = pdf.stack(level=level, dropna=dropna, future_stack=False)
 
     assert_eq(expect, got, check_dtype=False)
@@ -265,7 +274,7 @@ def test_df_stack_multiindex_column_axis_pd_example(level):
 
     df = pd.DataFrame(np.random.randn(4, 4), columns=columns)
 
-    with pytest.warns(FutureWarning):
+    with expect_warning_if(PANDAS_GE_220, FutureWarning):
         expect = df.stack(level=level, future_stack=False)
     gdf = cudf.from_pandas(df)
     with pytest.warns(FutureWarning):
diff --git a/python/cudf/cudf/tests/test_stats.py b/python/cudf/cudf/tests/test_stats.py
index d5f63fdab77..f952cea07f8 100644
--- a/python/cudf/cudf/tests/test_stats.py
+++ b/python/cudf/cudf/tests/test_stats.py
@@ -447,6 +447,10 @@ def test_cov1d(data1, data2):
     ],
 )
 @pytest.mark.parametrize("method", ["spearman", "pearson"])
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="Warnings missing on older pandas (scipy version seems unrelated?)",
+)
 def test_corr1d(data1, data2, method):
     if method == "spearman":
         # Pandas uses scipy.stats.spearmanr code-path
@@ -585,6 +589,10 @@ def test_min_count_ops(data, ops, skipna, min_count):
     ],
 )
 @pytest.mark.parametrize("dtype", ["datetime64[ns]", "timedelta64[ns]"])
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="Fails in older versions of pandas",
+)
 def test_cov_corr_datetime_timedelta(data1, data2, dtype):
     gsr1 = cudf.Series(data1, dtype=dtype)
     gsr2 = cudf.Series(data2, dtype=dtype)
diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
index 505d5d0b9cc..d10c531d757 100644
--- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
+++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
@@ -23,6 +23,7 @@
 from numba import NumbaDeprecationWarning
 from pytz import utc
 
+from cudf.core._compat import PANDAS_GE_220
 from cudf.pandas import LOADED, Profiler
 from cudf.pandas.fast_slow_proxy import _Unusable, is_proxy_object
 
@@ -536,12 +537,15 @@ def test_array_ufunc(series):
 @pytest.mark.xfail(strict=False, reason="Fails in CI, passes locally.")
 def test_groupby_apply_func_returns_series(dataframe):
     pdf, df = dataframe
+    if PANDAS_GE_220:
+        kwargs = {"include_groups": False}
+    else:
+        kwargs = {}
+
     expect = pdf.groupby("a").apply(
-        lambda group: pd.Series({"x": 1}), include_groups=False
-    )
-    got = df.groupby("a").apply(
-        lambda group: xpd.Series({"x": 1}), include_groups=False
+        lambda group: pd.Series({"x": 1}), **kwargs
     )
+    got = df.groupby("a").apply(lambda group: xpd.Series({"x": 1}), **kwargs)
     tm.assert_equal(expect, got)
 
 
diff --git a/python/dask_cudf/dask_cudf/tests/test_applymap.py b/python/dask_cudf/dask_cudf/tests/test_applymap.py
index d84235481c3..e4e79b7b8cf 100644
--- a/python/dask_cudf/dask_cudf/tests/test_applymap.py
+++ b/python/dask_cudf/dask_cudf/tests/test_applymap.py
@@ -5,6 +5,8 @@
 
 from dask import dataframe as dd
 
+from cudf.core._compat import PANDAS_GE_210
+
 from dask_cudf.tests.utils import _make_random_frame
 
 
@@ -18,6 +20,10 @@
     ],
 )
 @pytest.mark.parametrize("has_na", [True, False])
+@pytest.mark.skipif(
+    not PANDAS_GE_210,
+    reason="DataFrame.map requires pandas>=2.1.0",
+)
 def test_applymap_basic(func, has_na):
     size = 2000
     pdf, dgdf = _make_random_frame(size, include_na=False)
diff --git a/python/dask_cudf/dask_cudf/tests/test_distributed.py b/python/dask_cudf/dask_cudf/tests/test_distributed.py
index be10b0d4843..d03180852eb 100644
--- a/python/dask_cudf/dask_cudf/tests/test_distributed.py
+++ b/python/dask_cudf/dask_cudf/tests/test_distributed.py
@@ -80,6 +80,11 @@ def test_str_series_roundtrip():
 
 
 def test_p2p_shuffle():
+    pytest.importorskip(
+        "pyarrow",
+        minversion="14.0.1",
+        reason="P2P shuffling requires pyarrow>=14.0.1",
+    )
     # Check that we can use `shuffle_method="p2p"`
     with dask_cuda.LocalCUDACluster(n_workers=1) as cluster:
         with Client(cluster):
diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py
index cf916b713b2..7b9f0ca328a 100644
--- a/python/dask_cudf/dask_cudf/tests/test_groupby.py
+++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py
@@ -9,6 +9,7 @@
 from dask.utils_test import hlg_layer
 
 import cudf
+from cudf.core._compat import PANDAS_CURRENT_SUPPORTED_VERSION, PANDAS_VERSION
 from cudf.testing._utils import expect_warning_if
 
 import dask_cudf
@@ -316,6 +317,10 @@ def test_groupby_dropna_cudf(dropna, by):
         (None, ["a", "d"]),
     ],
 )
+@pytest.mark.skipif(
+    PANDAS_VERSION < PANDAS_CURRENT_SUPPORTED_VERSION,
+    reason="Fails in older versions of pandas",
+)
 def test_groupby_dropna_dask(dropna, by):
     # NOTE: This test is borrowed from upstream dask
     #       (dask/dask/dataframe/tests/test_groupby.py)

From e1ab1e799d7a29289419014e19ec5c6f2e99ae91 Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Thu, 5 Sep 2024 09:48:03 -0400
Subject: [PATCH 773/842] Make isinstance check pass for proxy ndarrays
 (#16601)

Closes #14537.

Authors:
  - Matthew Murray (https://github.com/Matt711)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Matthew Roeschke (https://github.com/mroeschke)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/16601
---
 python/cudf/cudf/pandas/_wrappers/numpy.py    | 23 +++++++++
 python/cudf/cudf/pandas/fast_slow_proxy.py    | 26 +++++++++-
 python/cudf/cudf/pandas/proxy_base.py         | 22 ++++++++
 .../cudf_pandas_tests/test_cudf_pandas.py     | 50 ++++++++++++++++++-
 4 files changed, 119 insertions(+), 2 deletions(-)
 create mode 100644 python/cudf/cudf/pandas/proxy_base.py

diff --git a/python/cudf/cudf/pandas/_wrappers/numpy.py b/python/cudf/cudf/pandas/_wrappers/numpy.py
index 90ac5198270..d5e669cb58f 100644
--- a/python/cudf/cudf/pandas/_wrappers/numpy.py
+++ b/python/cudf/cudf/pandas/_wrappers/numpy.py
@@ -10,10 +10,13 @@
 from packaging import version
 
 from ..fast_slow_proxy import (
+    _fast_slow_function_call,
     _FastSlowAttribute,
+    is_proxy_object,
     make_final_proxy_type,
     make_intermediate_proxy_type,
 )
+from ..proxy_base import ProxyNDarrayBase
 from .common import (
     array_interface,
     array_method,
@@ -105,18 +108,38 @@ def wrap_ndarray(cls, arr: cupy.ndarray | numpy.ndarray, constructor):
         return super(cls, cls)._fsproxy_wrap(arr, constructor)
 
 
+def ndarray__array_ufunc__(self, ufunc, method, *inputs, **kwargs):
+    result, _ = _fast_slow_function_call(
+        getattr(ufunc, method),
+        *inputs,
+        **kwargs,
+    )
+    if isinstance(result, tuple):
+        if is_proxy_object(result[0]) and isinstance(
+            result[0]._fsproxy_wrapped, numpy.ndarray
+        ):
+            return tuple(numpy.asarray(x) for x in result)
+    elif is_proxy_object(result) and isinstance(
+        result._fsproxy_wrapped, numpy.ndarray
+    ):
+        return numpy.asarray(result)
+    return result
+
+
 ndarray = make_final_proxy_type(
     "ndarray",
     cupy.ndarray,
     numpy.ndarray,
     fast_to_slow=cupy.ndarray.get,
     slow_to_fast=cupy.asarray,
+    bases=(ProxyNDarrayBase,),
     additional_attributes={
         "__array__": array_method,
         # So that pa.array(wrapped-numpy-array) works
         "__arrow_array__": arrow_array_method,
         "__cuda_array_interface__": cuda_array_interface,
         "__array_interface__": array_interface,
+        "__array_ufunc__": ndarray__array_ufunc__,
         # ndarrays are unhashable
         "__hash__": None,
         # iter(cupy-array) produces an iterable of zero-dim device
diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py
index 4b0fd9a5b36..afa1ce5f86c 100644
--- a/python/cudf/cudf/pandas/fast_slow_proxy.py
+++ b/python/cudf/cudf/pandas/fast_slow_proxy.py
@@ -19,6 +19,7 @@
 from ..options import _env_get_bool
 from ..testing import assert_eq
 from .annotation import nvtx
+from .proxy_base import ProxyNDarrayBase
 
 
 def call_operator(fn, args, kwargs):
@@ -564,7 +565,17 @@ def _fsproxy_wrap(cls, value, func):
         _FinalProxy subclasses can override this classmethod if they
         need particular behaviour when wrapped up.
         """
-        proxy = object.__new__(cls)
+        # TODO: Replace the if-elif-else using singledispatch helper function
+        base_class = _get_proxy_base_class(cls)
+        if base_class is object:
+            proxy = base_class.__new__(cls)
+        elif base_class is ProxyNDarrayBase:
+            proxy = base_class.__new__(cls, value)
+        else:
+            raise TypeError(
+                f"Cannot create an proxy instance of {cls.__name__} using base class {base_class.__name__}. "
+                f"Expected either 'object' or another type in 'PROXY_BASE_CLASSES'"
+            )
         proxy._fsproxy_wrapped = value
         return proxy
 
@@ -1193,6 +1204,19 @@ def is_proxy_object(obj: Any) -> bool:
     return False
 
 
+def _get_proxy_base_class(cls):
+    """Returns the proxy base class if one exists"""
+    for proxy_class in PROXY_BASE_CLASSES:
+        if proxy_class in cls.__mro__:
+            return proxy_class
+    return object
+
+
+PROXY_BASE_CLASSES: set[type] = {
+    ProxyNDarrayBase,
+}
+
+
 NUMPY_TYPES: set[str] = set(np.sctypeDict.values())
 
 
diff --git a/python/cudf/cudf/pandas/proxy_base.py b/python/cudf/cudf/pandas/proxy_base.py
new file mode 100644
index 00000000000..6f732834e94
--- /dev/null
+++ b/python/cudf/cudf/pandas/proxy_base.py
@@ -0,0 +1,22 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import cupy as cp
+import numpy as np
+
+
+class ProxyNDarrayBase(np.ndarray):
+    def __new__(cls, arr):
+        if isinstance(arr, cp.ndarray):
+            arr = arr.get()
+        if not isinstance(arr, np.ndarray):
+            raise TypeError(
+                "Unsupported array type. Must be numpy.ndarray or cupy.ndarray"
+            )
+        return np.asarray(arr, dtype=arr.dtype).view(cls)
+
+    def __array_finalize__(self, obj):
+        if obj is None:
+            return
+        self._fsproxy_wrapped = getattr(obj, "_fsproxy_wrapped", obj)
diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
index d10c531d757..c4ab4b0a853 100644
--- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
+++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py
@@ -14,18 +14,20 @@
 import types
 from io import BytesIO, StringIO
 
+import cupy as cp
 import jupyter_client
 import nbformat
 import numpy as np
 import pyarrow as pa
 import pytest
 from nbconvert.preprocessors import ExecutePreprocessor
-from numba import NumbaDeprecationWarning
+from numba import NumbaDeprecationWarning, vectorize
 from pytz import utc
 
 from cudf.core._compat import PANDAS_GE_220
 from cudf.pandas import LOADED, Profiler
 from cudf.pandas.fast_slow_proxy import _Unusable, is_proxy_object
+from cudf.testing import assert_eq
 
 if not LOADED:
     raise ImportError("These tests must be run with cudf.pandas loaded")
@@ -1690,3 +1692,49 @@ def test_notebook_slow_repr():
         assert (
             string in html_result
         ), f"Expected string {string} not found in the output"
+
+
+def test_numpy_ndarray_isinstancecheck(array):
+    arr1, arr2 = array
+    assert isinstance(arr1, np.ndarray)
+    assert isinstance(arr2, np.ndarray)
+
+
+def test_numpy_ndarray_np_ufunc(array):
+    arr1, arr2 = array
+
+    @np.vectorize
+    def add_one_ufunc(arr):
+        return arr + 1
+
+    assert_eq(add_one_ufunc(arr1), add_one_ufunc(arr2))
+
+
+def test_numpy_ndarray_cp_ufunc(array):
+    arr1, arr2 = array
+
+    @cp.vectorize
+    def add_one_ufunc(arr):
+        return arr + 1
+
+    assert_eq(add_one_ufunc(cp.asarray(arr1)), add_one_ufunc(arr2))
+
+
+def test_numpy_ndarray_numba_ufunc(array):
+    arr1, arr2 = array
+
+    @vectorize
+    def add_one_ufunc(arr):
+        return arr + 1
+
+    assert_eq(add_one_ufunc(arr1), add_one_ufunc(arr2))
+
+
+def test_numpy_ndarray_numba_cuda_ufunc(array):
+    arr1, arr2 = array
+
+    @vectorize(["int64(int64)"], target="cuda")
+    def add_one_ufunc(a):
+        return a + 1
+
+    assert_eq(cp.asarray(add_one_ufunc(arr1)), cp.asarray(add_one_ufunc(arr2)))

From 949f1719226f0b27a4df8fedbf4624f46fb0589d Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 5 Sep 2024 09:52:01 -0400
Subject: [PATCH 774/842] Performance improvement for strings::slice for wide
 strings (#16574)

Improves performance of wide strings (avg > 64 bytes) when using `cudf::strings::slice_strings`.
Addresses some concerns from issue #15924

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Muhammad Haseeb (https://github.com/mhaseeb123)

URL: https://github.com/rapidsai/cudf/pull/16574
---
 cpp/src/strings/slice.cu | 182 ++++++++++++++++++++++++++++++---------
 1 file changed, 141 insertions(+), 41 deletions(-)

diff --git a/cpp/src/strings/slice.cu b/cpp/src/strings/slice.cu
index cf82a837c51..d8324a9b08e 100644
--- a/cpp/src/strings/slice.cu
+++ b/cpp/src/strings/slice.cu
@@ -22,6 +22,7 @@
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/scalar/scalar_device_view.cuh>
 #include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/strings/detail/strings_column_factories.cuh>
 #include <cudf/strings/slice.hpp>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
@@ -32,6 +33,8 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/resource_ref.hpp>
 
+#include <cooperative_groups.h>
+#include <cooperative_groups/reduce.h>
 #include <thrust/iterator/constant_iterator.h>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/transform.h>
@@ -40,6 +43,9 @@ namespace cudf {
 namespace strings {
 namespace detail {
 namespace {
+
+constexpr size_type AVG_CHAR_BYTES_THRESHOLD = 128;
+
 /**
  * @brief Function logic for compute_substrings_from_fn API
  *
@@ -51,17 +57,19 @@ struct substring_from_fn {
   IndexIterator const starts;
   IndexIterator const stops;
 
-  __device__ string_view operator()(size_type idx) const
+  __device__ string_index_pair operator()(size_type idx) const
   {
-    if (d_column.is_null(idx)) { return string_view{nullptr, 0}; }
+    if (d_column.is_null(idx)) { return string_index_pair{nullptr, 0}; }
     auto const d_str  = d_column.template element<string_view>(idx);
     auto const length = d_str.length();
     auto const start  = std::max(starts[idx], 0);
-    if (start >= length) { return string_view{}; }
+    if (start >= length) { return string_index_pair{"", 0}; }
 
-    auto const stop = stops[idx];
-    auto const end  = (((stop < 0) || (stop > length)) ? length : stop);
-    return start < end ? d_str.substr(start, end - start) : string_view{};
+    auto const stop    = stops[idx];
+    auto const end     = (((stop < 0) || (stop > length)) ? length : stop);
+    auto const sub_str = start < end ? d_str.substr(start, end - start) : string_view{};
+    return sub_str.empty() ? string_index_pair{"", 0}
+                           : string_index_pair{sub_str.data(), sub_str.size_bytes()};
   }
 
   substring_from_fn(column_device_view const& d_column, IndexIterator starts, IndexIterator stops)
@@ -70,6 +78,82 @@ struct substring_from_fn {
   }
 };
 
+template <typename IndexIterator>
+CUDF_KERNEL void substring_from_kernel(column_device_view const d_strings,
+                                       IndexIterator starts,
+                                       IndexIterator stops,
+                                       string_index_pair* d_output)
+{
+  auto const idx     = cudf::detail::grid_1d::global_thread_id();
+  auto const str_idx = idx / cudf::detail::warp_size;
+  if (str_idx >= d_strings.size()) { return; }
+
+  namespace cg    = cooperative_groups;
+  auto const warp = cg::tiled_partition<cudf::detail::warp_size>(cg::this_thread_block());
+
+  if (d_strings.is_null(str_idx)) {
+    if (warp.thread_rank() == 0) { d_output[str_idx] = string_index_pair{nullptr, 0}; }
+    return;
+  }
+  auto const d_str = d_strings.element<cudf::string_view>(str_idx);
+  if (d_str.empty()) {
+    if (warp.thread_rank() == 0) { d_output[str_idx] = string_index_pair{"", 0}; }
+    return;
+  }
+
+  auto const start = max(starts[str_idx], 0);
+  auto stop        = [stop = stops[str_idx]] {
+    return (stop < 0) ? std::numeric_limits<size_type>::max() : stop;
+  }();
+  auto const end = d_str.data() + d_str.size_bytes();
+
+  auto start_counts = thrust::make_pair(0, 0);
+  auto stop_counts  = thrust::make_pair(0, 0);
+
+  auto itr = d_str.data() + warp.thread_rank();
+
+  size_type char_count = 0;
+  size_type byte_count = 0;
+  while (byte_count < d_str.size_bytes()) {
+    if (char_count <= start) { start_counts = {char_count, byte_count}; }
+    if (char_count <= stop) {
+      stop_counts = {char_count, byte_count};
+    } else {
+      break;
+    }
+    size_type const cc = (itr < end) && is_begin_utf8_char(*itr);
+    size_type const bc = (itr < end);
+    char_count += cg::reduce(warp, cc, cg::plus<int>());
+    byte_count += cg::reduce(warp, bc, cg::plus<int>());
+    itr += cudf::detail::warp_size;
+  }
+
+  if (warp.thread_rank() == 0) {
+    if (start >= char_count) {
+      d_output[str_idx] = string_index_pair{"", 0};
+      return;
+    }
+
+    // we are just below start/stop and must now increment up to it from here
+    auto first_byte = start_counts.second;
+    if (start_counts.first < start) {
+      auto const sub_str = string_view(d_str.data() + first_byte, d_str.size_bytes() - first_byte);
+      first_byte += std::get<0>(bytes_to_character_position(sub_str, start - start_counts.first));
+    }
+
+    stop           = max(stop, char_count);
+    auto last_byte = stop_counts.second;
+    if (stop_counts.first < stop) {
+      auto const sub_str = string_view(d_str.data() + last_byte, d_str.size_bytes() - last_byte);
+      last_byte += std::get<0>(bytes_to_character_position(sub_str, stop - stop_counts.first));
+    }
+
+    d_output[str_idx] = (first_byte < last_byte)
+                          ? string_index_pair{d_str.data() + first_byte, last_byte - first_byte}
+                          : string_index_pair{"", 0};
+  }
+}
+
 /**
  * @brief Function logic for the substring API.
  *
@@ -149,54 +233,67 @@ struct substring_fn {
  *
  * @tparam IndexIterator Iterator type for character position values
  *
- * @param d_column Input strings column to substring
+ * @param input Input strings column to substring
  * @param starts Start positions index iterator
  * @param stops Stop positions index iterator
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned column's device memory
  */
 template <typename IndexIterator>
-std::unique_ptr<column> compute_substrings_from_fn(column_device_view const& d_column,
+std::unique_ptr<column> compute_substrings_from_fn(strings_column_view const& input,
                                                    IndexIterator starts,
                                                    IndexIterator stops,
                                                    rmm::cuda_stream_view stream,
                                                    rmm::device_async_resource_ref mr)
 {
-  auto results = rmm::device_uvector<string_view>(d_column.size(), stream);
-  thrust::transform(rmm::exec_policy(stream),
-                    thrust::counting_iterator<size_type>(0),
-                    thrust::counting_iterator<size_type>(d_column.size()),
-                    results.begin(),
-                    substring_from_fn{d_column, starts, stops});
-  return make_strings_column(results, string_view{nullptr, 0}, stream, mr);
+  auto results = rmm::device_uvector<string_index_pair>(input.size(), stream);
+
+  auto const d_column = column_device_view::create(input.parent(), stream);
+
+  if ((input.chars_size(stream) / (input.size() - input.null_count())) < AVG_CHAR_BYTES_THRESHOLD) {
+    thrust::transform(rmm::exec_policy(stream),
+                      thrust::counting_iterator<size_type>(0),
+                      thrust::counting_iterator<size_type>(input.size()),
+                      results.begin(),
+                      substring_from_fn{*d_column, starts, stops});
+  } else {
+    constexpr thread_index_type block_size = 512;
+    auto const threads =
+      static_cast<cudf::thread_index_type>(input.size()) * cudf::detail::warp_size;
+    auto const num_blocks = util::div_rounding_up_safe(threads, block_size);
+    substring_from_kernel<IndexIterator>
+      <<<num_blocks, block_size, 0, stream.value()>>>(*d_column, starts, stops, results.data());
+  }
+  return make_strings_column(results.begin(), results.end(), stream, mr);
 }
 
 }  // namespace
 
 //
-std::unique_ptr<column> slice_strings(strings_column_view const& strings,
+std::unique_ptr<column> slice_strings(strings_column_view const& input,
                                       numeric_scalar<size_type> const& start,
                                       numeric_scalar<size_type> const& stop,
                                       numeric_scalar<size_type> const& step,
                                       rmm::cuda_stream_view stream,
                                       rmm::device_async_resource_ref mr)
 {
-  if (strings.is_empty()) return make_empty_column(type_id::STRING);
+  if (input.size() == input.null_count()) {
+    return std::make_unique<column>(input.parent(), stream, mr);
+  }
 
   auto const step_valid = step.is_valid(stream);
-  auto const step_value = step_valid ? step.value(stream) : 0;
+  auto const step_value = step_valid ? step.value(stream) : 1;
   if (step_valid) { CUDF_EXPECTS(step_value != 0, "Step parameter must not be 0"); }
 
-  auto const d_column = column_device_view::create(strings.parent(), stream);
-
   // optimization for (step==1 and start < stop) -- expect this to be most common
-  if (step_value == 1 and start.is_valid(stream) and stop.is_valid(stream)) {
-    auto const start_value = start.value(stream);
-    auto const stop_value  = stop.value(stream);
+  if (step_value == 1) {
+    auto const start_value = start.is_valid(stream) ? start.value(stream) : 0;
+    auto const stop_value =
+      stop.is_valid(stream) ? stop.value(stream) : std::numeric_limits<size_type>::max();
     // note that any negative values here must use the alternate function below
     if ((start_value >= 0) && (start_value < stop_value)) {
       // this is about 2x faster on long strings for this common case
-      return compute_substrings_from_fn(*d_column,
+      return compute_substrings_from_fn(input,
                                         thrust::constant_iterator<size_type>(start_value),
                                         thrust::constant_iterator<size_type>(stop_value),
                                         stream,
@@ -204,31 +301,35 @@ std::unique_ptr<column> slice_strings(strings_column_view const& strings,
     }
   }
 
+  auto const d_column = column_device_view::create(input.parent(), stream);
+
   auto const d_start = get_scalar_device_view(const_cast<numeric_scalar<size_type>&>(start));
   auto const d_stop  = get_scalar_device_view(const_cast<numeric_scalar<size_type>&>(stop));
   auto const d_step  = get_scalar_device_view(const_cast<numeric_scalar<size_type>&>(step));
 
   auto [offsets, chars] = make_strings_children(
-    substring_fn{*d_column, d_start, d_stop, d_step}, strings.size(), stream, mr);
+    substring_fn{*d_column, d_start, d_stop, d_step}, input.size(), stream, mr);
 
-  return make_strings_column(strings.size(),
+  return make_strings_column(input.size(),
                              std::move(offsets),
                              chars.release(),
-                             strings.null_count(),
-                             cudf::detail::copy_bitmask(strings.parent(), stream, mr));
+                             input.null_count(),
+                             cudf::detail::copy_bitmask(input.parent(), stream, mr));
 }
 
-std::unique_ptr<column> slice_strings(strings_column_view const& strings,
+std::unique_ptr<column> slice_strings(strings_column_view const& input,
                                       column_view const& starts_column,
                                       column_view const& stops_column,
                                       rmm::cuda_stream_view stream,
                                       rmm::device_async_resource_ref mr)
 {
-  size_type strings_count = strings.size();
-  if (strings_count == 0) return make_empty_column(type_id::STRING);
-  CUDF_EXPECTS(starts_column.size() == strings_count,
+  if (input.size() == input.null_count()) {
+    return std::make_unique<column>(input.parent(), stream, mr);
+  }
+
+  CUDF_EXPECTS(starts_column.size() == input.size(),
                "Parameter starts must have the same number of rows as strings.");
-  CUDF_EXPECTS(stops_column.size() == strings_count,
+  CUDF_EXPECTS(stops_column.size() == input.size(),
                "Parameter stops must have the same number of rows as strings.");
   CUDF_EXPECTS(cudf::have_same_types(starts_column, stops_column),
                "Parameters starts and stops must be of the same type.",
@@ -242,17 +343,16 @@ std::unique_ptr<column> slice_strings(strings_column_view const& strings,
                "Positions values must be fixed width type.",
                cudf::data_type_error);
 
-  auto strings_column = column_device_view::create(strings.parent(), stream);
-  auto starts_iter    = cudf::detail::indexalator_factory::make_input_iterator(starts_column);
-  auto stops_iter     = cudf::detail::indexalator_factory::make_input_iterator(stops_column);
-  return compute_substrings_from_fn(*strings_column, starts_iter, stops_iter, stream, mr);
+  auto starts_iter = cudf::detail::indexalator_factory::make_input_iterator(starts_column);
+  auto stops_iter  = cudf::detail::indexalator_factory::make_input_iterator(stops_column);
+  return compute_substrings_from_fn(input, starts_iter, stops_iter, stream, mr);
 }
 
 }  // namespace detail
 
 // external API
 
-std::unique_ptr<column> slice_strings(strings_column_view const& strings,
+std::unique_ptr<column> slice_strings(strings_column_view const& input,
                                       numeric_scalar<size_type> const& start,
                                       numeric_scalar<size_type> const& stop,
                                       numeric_scalar<size_type> const& step,
@@ -260,17 +360,17 @@ std::unique_ptr<column> slice_strings(strings_column_view const& strings,
                                       rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::slice_strings(strings, start, stop, step, stream, mr);
+  return detail::slice_strings(input, start, stop, step, stream, mr);
 }
 
-std::unique_ptr<column> slice_strings(strings_column_view const& strings,
+std::unique_ptr<column> slice_strings(strings_column_view const& input,
                                       column_view const& starts_column,
                                       column_view const& stops_column,
                                       rmm::cuda_stream_view stream,
                                       rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::slice_strings(strings, starts_column, stops_column, stream, mr);
+  return detail::slice_strings(input, starts_column, stops_column, stream, mr);
 }
 
 }  // namespace strings

From 0cc059fb2b81adbdc9593052292838995dc78b10 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Thu, 5 Sep 2024 15:07:29 -0700
Subject: [PATCH 775/842] Upgrade to nvcomp 4.0.1 (#16076)

This PR bumps nvcomp to 4.0.1.

Depends on:
- https://github.com/conda-forge/nvcomp-feedstock/pull/15
- https://github.com/rapidsai/rapids-cmake/pull/633
- https://github.com/rapidsai/kvikio/pull/449

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Robert Maynard (https://github.com/robertmaynard)
  - Peixin (https://github.com/pxLi)
  - Bradley Dice (https://github.com/bdice)
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Bradley Dice (https://github.com/bdice)
  - Robert Maynard (https://github.com/robertmaynard)

URL: https://github.com/rapidsai/cudf/pull/16076
---
 ci/build_wheel_cudf.sh                                  | 2 --
 ci/build_wheel_pylibcudf.sh                             | 2 --
 conda/environments/all_cuda-118_arch-x86_64.yaml        | 2 +-
 conda/environments/all_cuda-125_arch-x86_64.yaml        | 2 +-
 conda/recipes/libcudf/conda_build_config.yaml           | 2 +-
 dependencies.yaml                                       | 2 +-
 java/pom.xml                                            | 4 +---
 java/src/main/java/ai/rapids/cudf/NativeDepsLoader.java | 3 ---
 java/src/main/native/CMakeLists.txt                     | 5 ++---
 python/libcudf/CMakeLists.txt                           | 3 +--
 10 files changed, 8 insertions(+), 19 deletions(-)

diff --git a/ci/build_wheel_cudf.sh b/ci/build_wheel_cudf.sh
index e5565c4b53c..fb93b06dbe2 100755
--- a/ci/build_wheel_cudf.sh
+++ b/ci/build_wheel_cudf.sh
@@ -23,8 +23,6 @@ export PIP_CONSTRAINT="/tmp/constraints.txt"
 python -m auditwheel repair \
     --exclude libcudf.so \
     --exclude libnvcomp.so \
-    --exclude libnvcomp_bitcomp.so \
-    --exclude libnvcomp_gdeflate.so \
     -w ${package_dir}/final_dist \
     ${package_dir}/dist/*
 
diff --git a/ci/build_wheel_pylibcudf.sh b/ci/build_wheel_pylibcudf.sh
index 0e4745bda28..5e9f7f8a0c4 100755
--- a/ci/build_wheel_pylibcudf.sh
+++ b/ci/build_wheel_pylibcudf.sh
@@ -21,8 +21,6 @@ export PIP_CONSTRAINT="/tmp/constraints.txt"
 python -m auditwheel repair \
     --exclude libcudf.so \
     --exclude libnvcomp.so \
-    --exclude libnvcomp_bitcomp.so \
-    --exclude libnvcomp_gdeflate.so \
     -w ${package_dir}/final_dist \
     ${package_dir}/dist/*
 
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 7f6967d7287..fa4c77d67b4 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -58,7 +58,7 @@ dependencies:
 - numpy>=1.23,<3.0a0
 - numpydoc
 - nvcc_linux-64=11.8
-- nvcomp==3.0.6
+- nvcomp==4.0.1
 - nvtx>=0.2.1
 - openpyxl
 - packaging
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index c1315e73f16..9b487347a5e 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -56,7 +56,7 @@ dependencies:
 - numba>=0.57
 - numpy>=1.23,<3.0a0
 - numpydoc
-- nvcomp==3.0.6
+- nvcomp==4.0.1
 - nvtx>=0.2.1
 - openpyxl
 - packaging
diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml
index 4b1c4cca828..dae04c08aca 100644
--- a/conda/recipes/libcudf/conda_build_config.yaml
+++ b/conda/recipes/libcudf/conda_build_config.yaml
@@ -35,7 +35,7 @@ spdlog_version:
   - ">=1.12.0,<1.13"
 
 nvcomp_version:
-  - "=3.0.6"
+  - "=4.0.1"
 
 zlib_version:
   - ">=1.2.13"
diff --git a/dependencies.yaml b/dependencies.yaml
index f8b231efd6d..a3f0ffeec82 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -354,7 +354,7 @@ dependencies:
           - flatbuffers==24.3.25
           - librdkafka>=1.9.0,<1.10.0a0
           # Align nvcomp version with rapids-cmake
-          - nvcomp==3.0.6
+          - nvcomp==4.0.1
           - spdlog>=1.12.0,<1.13
   rapids_build_skbuild:
     common:
diff --git a/java/pom.xml b/java/pom.xml
index 9694e741f16..e4f1cdf64e7 100644
--- a/java/pom.xml
+++ b/java/pom.xml
@@ -1,6 +1,6 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <!--
-  Copyright (c) 2019-2023, NVIDIA CORPORATION.
+  Copyright (c) 2019-2024, NVIDIA CORPORATION.
 
   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
@@ -590,8 +590,6 @@
                                         <include>libcudfjni.so</include>
                                         <include>libcufilejni.so</include>
                                         <include>libnvcomp.so</include>
-                                        <include>libnvcomp_gdeflate.so</include>
-                                        <include>libnvcomp_bitcomp.so</include>
                                     </includes>
                                 </resource>
                                 <resource>
diff --git a/java/src/main/java/ai/rapids/cudf/NativeDepsLoader.java b/java/src/main/java/ai/rapids/cudf/NativeDepsLoader.java
index 7ee590e3c82..58182c3e62e 100755
--- a/java/src/main/java/ai/rapids/cudf/NativeDepsLoader.java
+++ b/java/src/main/java/ai/rapids/cudf/NativeDepsLoader.java
@@ -54,9 +54,6 @@ public class NativeDepsLoader {
    * subsequent stages are loaded.
    */
   private static final String[][] loadOrder = new String[][]{
-      new String[]{
-          "nvcomp_bitcomp", "nvcomp_gdeflate"
-      },
       new String[]{
           "nvcomp"
       },
diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt
index c18a90140b6..32045f3c50e 100644
--- a/java/src/main/native/CMakeLists.txt
+++ b/java/src/main/native/CMakeLists.txt
@@ -267,9 +267,8 @@ if(TARGET nvcomp::nvcomp)
   add_custom_command(
     TARGET cudfjni
     PRE_LINK
-    COMMAND
-      ${CMAKE_COMMAND} -E copy $<TARGET_FILE:nvcomp::nvcomp> $<TARGET_FILE:nvcomp::nvcomp_gdeflate>
-      $<TARGET_FILE:nvcomp::nvcomp_bitcomp> "${PROJECT_BINARY_DIR}"
+    COMMAND ${CMAKE_COMMAND} -E copy $<TARGET_FILE:nvcomp::nvcomp>
+            "${PROJECT_BINARY_DIR}/libnvcomp.so"
     COMMENT "Copying nvcomp libraries to ${PROJECT_BINARY_DIR}"
   )
 endif()
diff --git a/python/libcudf/CMakeLists.txt b/python/libcudf/CMakeLists.txt
index 96eb6c3bb30..0a8f5c4807d 100644
--- a/python/libcudf/CMakeLists.txt
+++ b/python/libcudf/CMakeLists.txt
@@ -48,6 +48,5 @@ add_subdirectory(../../cpp cudf-cpp)
 # Ensure other libraries needed by libcudf.so get installed alongside it.
 include(cmake/Modules/WheelHelpers.cmake)
 install_aliased_imported_targets(
-  TARGETS cudf nvcomp::nvcomp nvcomp::nvcomp_gdeflate nvcomp::nvcomp_bitcomp DESTINATION
-  ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}
+  TARGETS cudf nvcomp::nvcomp DESTINATION ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}
 )

From 0e86f621bbf32c6b5a72fa95afd1f74d6fa50aba Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Thu, 5 Sep 2024 17:10:27 -0500
Subject: [PATCH 776/842] Add performance tips to cudf.pandas FAQ. (#16693)

This PR adds a section with performance tips to the `cudf.pandas` FAQ.

I based this section on some common user questions about performance, to make it clearer that `cudf.pandas` is designed for optimal performance with large data sizes and provide some alternatives for common needs where `cudf` or `cudf.pandas` aren't the best fit. See these links for examples:

- https://github.com/rapidsai/cudf/issues/14548#issuecomment-1838529130
- https://github.com/rapidsai/cudf/issues/16065
- https://stackoverflow.com/questions/78626099/cudf-is-very-slow

Authors:
  - Bradley Dice (https://github.com/bdice)
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16693
---
 docs/cudf/source/cudf_pandas/faq.md | 38 ++++++++++++++++++++++++++---
 1 file changed, 35 insertions(+), 3 deletions(-)

diff --git a/docs/cudf/source/cudf_pandas/faq.md b/docs/cudf/source/cudf_pandas/faq.md
index cdf32216619..fa5d203f52c 100644
--- a/docs/cudf/source/cudf_pandas/faq.md
+++ b/docs/cudf/source/cudf_pandas/faq.md
@@ -32,7 +32,7 @@ pandas. You can learn more about these edge cases in
 
 We also run nightly tests that track interactions between
 `cudf.pandas` and other third party libraries. See
-[Third-Party Library Compatibility](#does-it-work-with-third-party-libraries).
+[Third-Party Library Compatibility](#does-cudf-pandas-work-with-third-party-libraries).
 
 ## How can I tell if `cudf.pandas` is active?
 
@@ -69,7 +69,38 @@ performance, try to use only functionality that can run entirely on GPU.
 This helps reduce the number of memory transfers needed to fallback to
 CPU.
 
-## Does it work with third-party libraries?
+## How can I improve performance of my workflow with `cudf.pandas`?
+
+Most workflows will see significant performance improvements with
+`cudf.pandas`. However, sometimes things can be slower than expected.
+First, it's important to note that GPUs are good at parallel processing
+of large amounts of data. Small data sizes may be slower on GPU than
+CPU, because of the cost of data transfers. cuDF achieves the highest
+performance with many rows of data. As a _very rough_ rule of thumb,
+`cudf.pandas` shines on workflows with more than 10,000 - 100,000 rows
+of data, depending on the algorithms, data types, and other factors.
+Datasets that are several gigabytes in size and/or have millions of
+rows are a great fit for `cudf.pandas`.
+
+Here are some more tips to improve workflow performance:
+
+- Reshape data so it is long rather than wide (more rows, fewer
+  columns). This improves cuDF's ability to execute in parallel on the
+  entire GPU!
+- Avoid element-wise iteration and mutation. If you can, use pandas
+  functions to manipulate an entire column at once rather than writing
+  raw `for` loops that compute and assign.
+- If your data is really an n-dimensional array with lots of columns
+  where you aim to do lots of math (like adding matrices),
+  [CuPy](https://cupy.dev/) or [NumPy](https://numpy.org/) may be a
+  better choice than pandas or `cudf.pandas`. Array libraries are built
+  for different use cases than DataFrame libraries, and will get optimal
+  performance from using contiguous memory for multidimensional array
+  storage. Use the `.values` method to convert a DataFrame or Series to
+  an array.
+
+(does-cudf-pandas-work-with-third-party-libraries)=
+## Does `cudf.pandas` work with third-party libraries?
 
 `cudf.pandas` is tested with numerous popular third-party libraries.
 `cudf.pandas` will not only work but will accelerate pandas operations
@@ -97,7 +128,7 @@ common interactions with the following Python libraries:
 Please review the section on [Known Limitations](#are-there-any-known-limitations)
 for details about what is expected not to work (and why).
 
-## Can I use this with Dask or PySpark?
+## Can I use `cudf.pandas` with Dask or PySpark?
 
 `cudf.pandas` is not designed for distributed or out-of-core computing
 (OOC) workflows today. If you are looking for accelerated OOC and
@@ -111,6 +142,7 @@ cuDF (learn more in [this
 blog](https://medium.com/rapids-ai/easy-cpu-gpu-arrays-and-dataframes-run-your-dask-code-where-youd-like-e349d92351d)) and the [RAPIDS Accelerator for Apache Spark](https://nvidia.github.io/spark-rapids/)
 provides a similar configuration-based plugin for Spark.
 
+(are-there-any-known-limitations)=
 ## Are there any known limitations?
 
 There are a few known limitations that you should be aware of:

From 715677e2d23f2f5981af51d10e6fb9bd7faa292a Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Thu, 5 Sep 2024 18:16:26 -0400
Subject: [PATCH 777/842] Add libcudf example with large strings (#15983)

Creating an example that shows reading large strings columns. This uses the 1 billion row challenge input data and provides three examples of loading this data:
- `brc` uses the CSV reader to load the input file in one call and aggregates the results using `groupby`
- `brc_chunks` uses the CSV reader to load the input file in chunks, aggregates each chunk, and computes the results
- `brc_pipeline` same as `brc_chunks` but input chunks are processed in separate threads/streams.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Gregory Kimball (https://github.com/GregoryKimball)
  - Bradley Dice (https://github.com/bdice)
  - Vukasin Milovanovic (https://github.com/vuule)
  - Nghia Truong (https://github.com/ttnghia)
  - Karthikeyan (https://github.com/karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/15983
---
 cpp/examples/billion_rows/CMakeLists.txt      |  34 ++++
 cpp/examples/billion_rows/README.md           |  44 +++++
 cpp/examples/billion_rows/brc.cpp             |  94 ++++++++++
 cpp/examples/billion_rows/brc_chunks.cpp      | 116 ++++++++++++
 cpp/examples/billion_rows/brc_pipeline.cpp    | 171 ++++++++++++++++++
 cpp/examples/billion_rows/common.hpp          |  47 +++++
 cpp/examples/billion_rows/groupby_results.cpp | 112 ++++++++++++
 cpp/examples/billion_rows/groupby_results.hpp |  55 ++++++
 cpp/examples/build.sh                         |   1 +
 9 files changed, 674 insertions(+)
 create mode 100644 cpp/examples/billion_rows/CMakeLists.txt
 create mode 100644 cpp/examples/billion_rows/README.md
 create mode 100644 cpp/examples/billion_rows/brc.cpp
 create mode 100644 cpp/examples/billion_rows/brc_chunks.cpp
 create mode 100644 cpp/examples/billion_rows/brc_pipeline.cpp
 create mode 100644 cpp/examples/billion_rows/common.hpp
 create mode 100644 cpp/examples/billion_rows/groupby_results.cpp
 create mode 100644 cpp/examples/billion_rows/groupby_results.hpp

diff --git a/cpp/examples/billion_rows/CMakeLists.txt b/cpp/examples/billion_rows/CMakeLists.txt
new file mode 100644
index 00000000000..d95bb73b258
--- /dev/null
+++ b/cpp/examples/billion_rows/CMakeLists.txt
@@ -0,0 +1,34 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+cmake_minimum_required(VERSION 3.26.4)
+
+include(../set_cuda_architecture.cmake)
+
+# initialize cuda architecture
+rapids_cuda_init_architectures(billion_rows)
+rapids_cuda_set_architectures(RAPIDS)
+
+project(
+  billion_rows
+  VERSION 0.0.1
+  LANGUAGES CXX CUDA
+)
+
+include(../fetch_dependencies.cmake)
+
+list(APPEND CUDF_CUDA_FLAGS --expt-extended-lambda --expt-relaxed-constexpr)
+
+add_library(groupby_results OBJECT groupby_results.cpp)
+target_link_libraries(groupby_results PRIVATE cudf::cudf)
+
+add_executable(brc brc.cpp)
+target_link_libraries(brc PRIVATE cudf::cudf nvToolsExt $<TARGET_OBJECTS:groupby_results>)
+install(TARGETS brc DESTINATION bin/examples/libcudf)
+
+add_executable(brc_chunks brc_chunks.cpp)
+target_link_libraries(brc_chunks PRIVATE cudf::cudf nvToolsExt $<TARGET_OBJECTS:groupby_results>)
+install(TARGETS brc_chunks DESTINATION bin/examples/libcudf)
+
+add_executable(brc_pipeline brc_pipeline.cpp)
+target_link_libraries(brc_pipeline PRIVATE cudf::cudf nvToolsExt $<TARGET_OBJECTS:groupby_results>)
+install(TARGETS brc_pipeline DESTINATION bin/examples/libcudf)
diff --git a/cpp/examples/billion_rows/README.md b/cpp/examples/billion_rows/README.md
new file mode 100644
index 00000000000..73ff7aa19f0
--- /dev/null
+++ b/cpp/examples/billion_rows/README.md
@@ -0,0 +1,44 @@
+# libcudf C++ example for the 1 billion row challenge
+
+This C++ example demonstrates using libcudf APIs to read and process
+a table with 1 billion rows. The 1 billion row challenge is described here:
+https://github.com/gunnarmorling/1brc
+
+The examples load the 1 billion row text file using the CSV reader.
+The file contains around 400 unique city names (string type) along with
+random temperature values (float type).
+Once loaded, the examples performs groupby aggregations to find the
+minimum, maximum, and average temperature for each city.
+
+There are three examples included:
+1. `brc.cpp`
+   Loads the file in one call to the CSV reader.
+   This generally requires a large amount of available GPU memory.
+2. `brc_chunks.cpp`
+   Loads and processes the file in chunks.
+   The number of chunks to use is a parameter to the executable.
+3. `brc_pipeline.cpp`
+   Loads and processes the file in chunks with separate threads/streams.
+   The number of chunks and number of threads to use are parameters to the executable.
+
+An input file can be generated using the instructions from
+https://github.com/gunnarmorling/1brc.
+
+## Compile and execute
+
+```bash
+# Configure project
+cmake -S . -B build/
+# Build
+cmake --build build/ --parallel $PARALLEL_LEVEL
+# Execute
+build/brc input.txt
+# Execute in chunked mode with 25 chunks (default)
+build/brc_chunks input.txt 25
+# Execute in pipeline mode with 25 chunks and 2 threads (defaults)
+build/brc_pipeline input.txt 25 2
+```
+
+If your machine does not come with a pre-built libcudf binary, expect the
+first build to take some time, as it would build libcudf on the host machine.
+It may be sped up by configuring the proper `PARALLEL_LEVEL` number.
diff --git a/cpp/examples/billion_rows/brc.cpp b/cpp/examples/billion_rows/brc.cpp
new file mode 100644
index 00000000000..b7b292cf16e
--- /dev/null
+++ b/cpp/examples/billion_rows/brc.cpp
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "common.hpp"
+#include "groupby_results.hpp"
+
+#include <cudf/column/column.hpp>
+#include <cudf/column/column_view.hpp>
+#include <cudf/io/csv.hpp>
+#include <cudf/io/types.hpp>
+#include <cudf/sorting.hpp>
+#include <cudf/table/table.hpp>
+#include <cudf/table/table_view.hpp>
+
+#include <rmm/mr/device/statistics_resource_adaptor.hpp>
+
+#include <chrono>
+#include <iostream>
+#include <memory>
+#include <string>
+
+using elapsed_t = std::chrono::duration<double>;
+
+int main(int argc, char const** argv)
+{
+  if (argc < 2) {
+    std::cout << "required parameter: input-file-path\n";
+    return 1;
+  }
+
+  auto const input_file = std::string{argv[1]};
+  std::cout << "Input: " << input_file << std::endl;
+
+  auto const mr_name = std::string("pool");
+  auto resource      = create_memory_resource(mr_name);
+  auto stats_mr =
+    rmm::mr::statistics_resource_adaptor<rmm::mr::device_memory_resource>(resource.get());
+  rmm::mr::set_current_device_resource(&stats_mr);
+  auto stream = cudf::get_default_stream();
+
+  auto start = std::chrono::steady_clock::now();
+
+  auto const csv_result = [input_file, stream] {
+    cudf::io::csv_reader_options in_opts =
+      cudf::io::csv_reader_options::builder(cudf::io::source_info{input_file})
+        .header(-1)
+        .delimiter(';')
+        .doublequote(false)
+        .dtypes(std::vector<cudf::data_type>{cudf::data_type{cudf::type_id::STRING},
+                                             cudf::data_type{cudf::type_id::FLOAT32}})
+        .na_filter(false);
+    return cudf::io::read_csv(in_opts, stream).tbl;
+  }();
+  elapsed_t elapsed = std::chrono::steady_clock::now() - start;
+  std::cout << "File load time: " << elapsed.count() << " seconds\n";
+  auto const csv_table = csv_result->view();
+  std::cout << "Input rows: " << csv_table.num_rows() << std::endl;
+
+  auto const cities = csv_table.column(0);
+  auto const temps  = csv_table.column(1);
+
+  std::vector<std::unique_ptr<cudf::groupby_aggregation>> aggregations;
+  aggregations.emplace_back(cudf::make_min_aggregation<cudf::groupby_aggregation>());
+  aggregations.emplace_back(cudf::make_max_aggregation<cudf::groupby_aggregation>());
+  aggregations.emplace_back(cudf::make_mean_aggregation<cudf::groupby_aggregation>());
+
+  auto result = compute_results(cities, temps, std::move(aggregations), stream);
+
+  // The other 2 examples employ sorting for the sub-aggregates so enabling
+  // the following line may be more comparable in performance with them.
+  //
+  // result      = cudf::sort_by_key(result->view(), result->view().select({0}), {}, {}, stream);
+
+  stream.synchronize();
+
+  elapsed = std::chrono::steady_clock::now() - start;
+  std::cout << "Number of keys: " << result->num_rows() << std::endl;
+  std::cout << "Process time: " << elapsed.count() << " seconds\n";
+  std::cout << "Peak memory: " << (stats_mr.get_bytes_counter().peak / 1048576.0) << " MB\n";
+
+  return 0;
+}
diff --git a/cpp/examples/billion_rows/brc_chunks.cpp b/cpp/examples/billion_rows/brc_chunks.cpp
new file mode 100644
index 00000000000..4a65c59e461
--- /dev/null
+++ b/cpp/examples/billion_rows/brc_chunks.cpp
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "common.hpp"
+#include "groupby_results.hpp"
+
+#include <cudf/column/column.hpp>
+#include <cudf/column/column_view.hpp>
+#include <cudf/io/csv.hpp>
+#include <cudf/sorting.hpp>
+#include <cudf/table/table.hpp>
+#include <cudf/table/table_view.hpp>
+
+#include <rmm/mr/device/statistics_resource_adaptor.hpp>
+
+#include <chrono>
+#include <filesystem>
+#include <iostream>
+#include <memory>
+#include <string>
+
+using elapsed_t = std::chrono::duration<double>;
+
+std::unique_ptr<cudf::table> load_chunk(std::string const& input_file,
+                                        std::size_t start,
+                                        std::size_t size,
+                                        rmm::cuda_stream_view stream)
+{
+  cudf::io::csv_reader_options in_opts =
+    cudf::io::csv_reader_options::builder(cudf::io::source_info{input_file})
+      .header(-1)
+      .delimiter(';')
+      .doublequote(false)
+      .byte_range_offset(start)
+      .byte_range_size(size)
+      .dtypes(std::vector<cudf::data_type>{cudf::data_type{cudf::type_id::STRING},
+                                           cudf::data_type{cudf::type_id::FLOAT32}})
+      .na_filter(false);
+  return cudf::io::read_csv(in_opts, stream).tbl;
+}
+
+int main(int argc, char const** argv)
+{
+  if (argc < 2) {
+    std::cout << "required parameter: input-file-path\n";
+    std::cout << "optional parameter: chunk-count\n";
+    return 1;
+  }
+
+  auto const input_file = std::string{argv[1]};
+  auto const divider    = (argc < 3) ? 25 : std::stoi(std::string(argv[2]));
+
+  std::cout << "Input: " << input_file << std::endl;
+  std::cout << "Chunks: " << divider << std::endl;
+
+  auto const mr_name = std::string("pool");
+  auto resource      = create_memory_resource(mr_name);
+  auto stats_mr =
+    rmm::mr::statistics_resource_adaptor<rmm::mr::device_memory_resource>(resource.get());
+  rmm::mr::set_current_device_resource(&stats_mr);
+  auto stream = cudf::get_default_stream();
+
+  std::filesystem::path p = input_file;
+  auto const file_size    = std::filesystem::file_size(p);
+
+  auto start = std::chrono::steady_clock::now();
+
+  std::vector<std::unique_ptr<cudf::table>> agg_data;
+  std::size_t chunk_size     = file_size / divider + ((file_size % divider) != 0);
+  std::size_t start_pos      = 0;
+  cudf::size_type total_rows = 0;
+  do {
+    auto const input_table = load_chunk(input_file, start_pos, chunk_size, stream);
+    auto const read_rows   = input_table->num_rows();
+    if (read_rows == 0) break;
+
+    auto const cities = input_table->view().column(0);
+    auto const temps  = input_table->view().column(1);
+
+    std::vector<std::unique_ptr<cudf::groupby_aggregation>> aggregations;
+    aggregations.emplace_back(cudf::make_min_aggregation<cudf::groupby_aggregation>());
+    aggregations.emplace_back(cudf::make_max_aggregation<cudf::groupby_aggregation>());
+    aggregations.emplace_back(cudf::make_sum_aggregation<cudf::groupby_aggregation>());
+    aggregations.emplace_back(cudf::make_count_aggregation<cudf::groupby_aggregation>());
+    auto result = compute_results(cities, temps, std::move(aggregations), stream);
+
+    agg_data.emplace_back(
+      cudf::sort_by_key(result->view(), result->view().select({0}), {}, {}, stream));
+    start_pos += chunk_size;
+    chunk_size = std::min(chunk_size, file_size - start_pos);
+    total_rows += read_rows;
+  } while (start_pos < file_size && chunk_size > 0);
+
+  // now aggregate the aggregate results
+  auto results = compute_final_aggregates(agg_data, stream);
+  stream.synchronize();
+
+  elapsed_t elapsed = std::chrono::steady_clock::now() - start;
+  std::cout << "Number of keys: " << results->num_rows() << std::endl;
+  std::cout << "Process time: " << elapsed.count() << " seconds\n";
+  std::cout << "Peak memory: " << (stats_mr.get_bytes_counter().peak / 1048576.0) << " MB\n";
+
+  return 0;
+}
diff --git a/cpp/examples/billion_rows/brc_pipeline.cpp b/cpp/examples/billion_rows/brc_pipeline.cpp
new file mode 100644
index 00000000000..c65edc163e1
--- /dev/null
+++ b/cpp/examples/billion_rows/brc_pipeline.cpp
@@ -0,0 +1,171 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "common.hpp"
+#include "groupby_results.hpp"
+
+#include <cudf/column/column.hpp>
+#include <cudf/column/column_view.hpp>
+#include <cudf/io/csv.hpp>
+#include <cudf/sorting.hpp>
+#include <cudf/table/table.hpp>
+#include <cudf/table/table_view.hpp>
+
+#include <rmm/cuda_stream_pool.hpp>
+#include <rmm/mr/device/statistics_resource_adaptor.hpp>
+
+#include <chrono>
+#include <filesystem>
+#include <iostream>
+#include <memory>
+#include <string>
+
+using elapsed_t  = std::chrono::duration<double>;
+using byte_range = std::pair<std::size_t, std::size_t>;
+using result_t   = std::unique_ptr<cudf::table>;
+
+std::unique_ptr<cudf::table> load_chunk(std::string const& input_file,
+                                        std::size_t start,
+                                        std::size_t size,
+                                        rmm::cuda_stream_view stream)
+{
+  cudf::io::csv_reader_options in_opts =
+    cudf::io::csv_reader_options::builder(cudf::io::source_info{input_file})
+      .header(-1)
+      .delimiter(';')
+      .doublequote(false)
+      .byte_range_offset(start)
+      .byte_range_size(size)
+      .dtypes(std::vector<cudf::data_type>{cudf::data_type{cudf::type_id::STRING},
+                                           cudf::data_type{cudf::type_id::FLOAT32}})
+      .na_filter(false);
+  return cudf::io::read_csv(in_opts, stream).tbl;
+}
+
+struct chunk_fn {
+  std::string input_file;
+  std::vector<result_t>& agg_data;
+  rmm::cuda_stream_view stream;
+
+  std::vector<byte_range> byte_ranges{};
+  bool first_range{};
+
+  void add_range(std::size_t start, std::size_t size)
+  {
+    byte_ranges.push_back(byte_range{start, size});
+    if (!first_range) { first_range = (start == 0); }
+  }
+
+  void operator()()
+  {
+    using namespace std::chrono_literals;
+
+    // process each byte range assigned to this thread
+    for (auto& br : byte_ranges) {
+      auto const input_table = load_chunk(input_file, br.first, br.second, stream);
+      auto const read_rows   = input_table->num_rows();
+      if (read_rows == 0) continue;
+
+      auto const cities = input_table->view().column(0);
+      auto const temps  = input_table->view().column(1);
+
+      std::vector<std::unique_ptr<cudf::groupby_aggregation>> aggregations;
+      aggregations.emplace_back(cudf::make_min_aggregation<cudf::groupby_aggregation>());
+      aggregations.emplace_back(cudf::make_max_aggregation<cudf::groupby_aggregation>());
+      aggregations.emplace_back(cudf::make_sum_aggregation<cudf::groupby_aggregation>());
+      aggregations.emplace_back(cudf::make_count_aggregation<cudf::groupby_aggregation>());
+      auto result = compute_results(cities, temps, std::move(aggregations), stream);
+
+      agg_data.emplace_back(
+        cudf::sort_by_key(result->view(), result->view().select({0}), {}, {}, stream));
+    }
+    // done with this stream
+    stream.synchronize_no_throw();
+  }
+};
+
+int main(int argc, char const** argv)
+{
+  if (argc < 2) {
+    std::cout << "required parameter: input-file-path\n";
+    std::cout << "optional parameters: chunk-count thread-count\n";
+    return 1;
+  }
+
+  auto const input_file   = std::string{argv[1]};
+  auto const divider      = (argc < 3) ? 25 : std::stoi(std::string(argv[2]));
+  auto const thread_count = (argc < 4) ? 2 : std::stoi(std::string(argv[3]));
+
+  std::cout << "Input: " << input_file << std::endl;
+  std::cout << "Chunks: " << divider << std::endl;
+  std::cout << "Threads: " << thread_count << std::endl;
+
+  auto const mr_name = std::string("pool");
+  auto resource      = create_memory_resource(mr_name);
+  auto stats_mr =
+    rmm::mr::statistics_resource_adaptor<rmm::mr::device_memory_resource>(resource.get());
+  rmm::mr::set_current_device_resource(&stats_mr);
+  auto stream = cudf::get_default_stream();
+
+  std::filesystem::path p = input_file;
+  auto const file_size    = std::filesystem::file_size(p);
+
+  auto start = std::chrono::steady_clock::now();
+
+  std::size_t chunk_size = file_size / divider + ((file_size % divider) != 0);
+  std::size_t start_pos  = 0;
+
+  auto stream_pool = rmm::cuda_stream_pool(thread_count);
+  std::vector<std::vector<result_t>> chunk_results(thread_count);
+
+  std::vector<chunk_fn> chunk_tasks;
+  for (auto& cr : chunk_results) {
+    chunk_tasks.emplace_back(chunk_fn{input_file, cr, stream_pool.get_stream()});
+  }
+  for (std::size_t i = 0; i < divider; ++i) {
+    auto start = i * chunk_size;
+    auto size  = std::min(chunk_size, file_size - start);
+    chunk_tasks[i % thread_count].add_range(start, size);
+  }
+  std::vector<std::thread> threads;
+  for (auto& c : chunk_tasks) {
+    threads.emplace_back(std::thread{c});
+  }
+  for (auto& t : threads) {
+    t.join();
+  }
+
+  // in case some kernels are still running on the default stream
+  stream.synchronize();
+
+  // combine each thread's agg data into a single vector
+  std::vector<result_t> agg_data(divider);
+  auto begin = agg_data.begin();
+  for (auto& c : chunk_results) {
+    std::move(c.begin(), c.end(), begin);
+    begin += c.size();
+  }
+
+  // now aggregate the aggregate results
+  auto results = compute_final_aggregates(agg_data, stream);
+  stream.synchronize();
+
+  elapsed_t elapsed = std::chrono::steady_clock::now() - start;
+  std::cout << "Number of keys: " << results->num_rows() << std::endl;
+  std::cout << "Process time: " << elapsed.count() << " seconds\n";
+  std::cout << "Peak memory: " << (stats_mr.get_bytes_counter().peak / 1048576.0) << " MB\n";
+
+  return 0;
+}
diff --git a/cpp/examples/billion_rows/common.hpp b/cpp/examples/billion_rows/common.hpp
new file mode 100644
index 00000000000..d3063034d28
--- /dev/null
+++ b/cpp/examples/billion_rows/common.hpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <rmm/cuda_device.hpp>
+#include <rmm/mr/device/cuda_memory_resource.hpp>
+#include <rmm/mr/device/device_memory_resource.hpp>
+#include <rmm/mr/device/owning_wrapper.hpp>
+#include <rmm/mr/device/pool_memory_resource.hpp>
+
+#include <string>
+
+/**
+ * @brief Create CUDA memory resource
+ */
+auto make_cuda_mr() { return std::make_shared<rmm::mr::cuda_memory_resource>(); }
+
+/**
+ * @brief Create a pool device memory resource
+ */
+auto make_pool_mr()
+{
+  return rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>(
+    make_cuda_mr(), rmm::percent_of_free_device_memory(50));
+}
+
+/**
+ * @brief Create memory resource for libcudf functions
+ */
+std::shared_ptr<rmm::mr::device_memory_resource> create_memory_resource(std::string const& name)
+{
+  if (name == "pool") { return make_pool_mr(); }
+  return make_cuda_mr();
+}
diff --git a/cpp/examples/billion_rows/groupby_results.cpp b/cpp/examples/billion_rows/groupby_results.cpp
new file mode 100644
index 00000000000..0a7f24830f6
--- /dev/null
+++ b/cpp/examples/billion_rows/groupby_results.cpp
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "groupby_results.hpp"
+
+#include <cudf/aggregation.hpp>
+#include <cudf/binaryop.hpp>
+#include <cudf/copying.hpp>
+#include <cudf/filling.hpp>
+#include <cudf/groupby.hpp>
+#include <cudf/reduction.hpp>
+#include <cudf/reshape.hpp>
+#include <cudf/scalar/scalar.hpp>
+
+std::unique_ptr<cudf::table> compute_results(
+  cudf::column_view const& cities,
+  cudf::column_view const& temperatures,
+  std::vector<std::unique_ptr<cudf::groupby_aggregation>>&& aggregations,
+  rmm::cuda_stream_view stream)
+{
+  auto groupby_obj      = cudf::groupby::groupby(cudf::table_view({cities}));
+  auto aggregation_reqs = std::vector<cudf::groupby::aggregation_request>{};
+  auto& req             = aggregation_reqs.emplace_back();
+  req.values            = temperatures;
+  req.aggregations      = std::move(aggregations);
+
+  auto result = groupby_obj.aggregate(aggregation_reqs, stream);
+
+  auto rtn = result.first->release();
+  for (auto& r : result.second.front().results) {
+    rtn.emplace_back(std::move(r));
+  }
+
+  return std::make_unique<cudf::table>(std::move(rtn));
+}
+
+std::unique_ptr<cudf::table> compute_final_aggregates(
+  std::vector<std::unique_ptr<cudf::table>>& agg_data, rmm::cuda_stream_view stream)
+{
+  // first combine all the results into a vectors of columns
+  std::vector<cudf::column_view> min_cols, max_cols, sum_cols, count_cols;
+  for (auto& tbl : agg_data) {
+    auto const tv = tbl->view();
+    min_cols.push_back(tv.column(1));
+    max_cols.push_back(tv.column(2));
+    sum_cols.push_back(tv.column(3));
+    count_cols.push_back(tv.column(4));
+  }
+
+  // Create single columns out of the aggregate table results.
+  // This relies on every key appearing in every chunk segment.
+  // All the values for each key become contiguous within the output column.
+  // For example, for N=min_cols.size() (number of unique cities):
+  //   All of the mins for city[i] are in row[i] of each column of vector min_cols.
+  //   The interleave_columns API transposes these into a single column where
+  //   the first N rows are values for city[0],
+  //   the next N rows are values for city[1],
+  //   ...
+  //   the last N rows are values for city[N-1]
+  // The final result for each city is computed using segmented_reduce.
+  auto mins   = cudf::interleave_columns(cudf::table_view{min_cols});
+  auto maxes  = cudf::interleave_columns(cudf::table_view{max_cols});
+  auto sums   = cudf::interleave_columns(cudf::table_view{sum_cols});
+  auto counts = cudf::interleave_columns(cudf::table_view{count_cols});
+
+  // Build the offsets needed for segmented reduce.
+  // These are increasing integer values spaced evenly as per the number of cities (keys).
+  auto const num_keys = agg_data.front()->num_rows();
+  auto const size     = static_cast<cudf::size_type>(num_keys) + 1;
+  auto const start    = cudf::numeric_scalar<cudf::size_type>(0, true, stream);
+  auto const step     = cudf::numeric_scalar<cudf::size_type>(agg_data.size(), true, stream);
+  auto seg_offsets    = cudf::sequence(size, start, step, stream);
+  auto offsets_span   = cudf::device_span<cudf::size_type const>(seg_offsets->view());
+
+  // compute the min/max for each key by using segmented reduce
+  auto min_agg = cudf::make_min_aggregation<cudf::segmented_reduce_aggregation>();
+  mins         = cudf::segmented_reduce(
+    mins->view(), offsets_span, *min_agg, mins->type(), cudf::null_policy::EXCLUDE, stream);
+  auto max_agg = cudf::make_max_aggregation<cudf::segmented_reduce_aggregation>();
+  maxes        = cudf::segmented_reduce(
+    maxes->view(), offsets_span, *max_agg, maxes->type(), cudf::null_policy::EXCLUDE, stream);
+
+  // compute the sum and total counts in the same way
+  auto sum_agg = cudf::make_sum_aggregation<cudf::segmented_reduce_aggregation>();
+  sums         = cudf::segmented_reduce(
+    sums->view(), offsets_span, *sum_agg, sums->type(), cudf::null_policy::EXCLUDE, stream);
+  counts = cudf::segmented_reduce(
+    counts->view(), offsets_span, *sum_agg, counts->type(), cudf::null_policy::EXCLUDE, stream);
+
+  // compute the means using binary-operation to divide the individual rows sum/count
+  auto means = cudf::binary_operation(
+    sums->view(), counts->view(), cudf::binary_operator::DIV, sums->type(), stream);
+
+  std::vector<std::unique_ptr<cudf::column>> results;
+  results.emplace_back(std::move(mins));
+  results.emplace_back(std::move(maxes));
+  results.emplace_back(std::move(means));
+  return std::make_unique<cudf::table>(std::move(results));
+}
diff --git a/cpp/examples/billion_rows/groupby_results.hpp b/cpp/examples/billion_rows/groupby_results.hpp
new file mode 100644
index 00000000000..d5a88428329
--- /dev/null
+++ b/cpp/examples/billion_rows/groupby_results.hpp
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/aggregation.hpp>
+#include <cudf/groupby.hpp>
+#include <cudf/table/table.hpp>
+#include <cudf/utilities/default_stream.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+
+#include <vector>
+
+/**
+ * @brief Process the cities and temperatures
+ *
+ * Perform the given aggregations using the cities as the keys and the
+ * temperatures as values.
+ *
+ * @param cities The city names
+ * @param temperatures The temperature values
+ * @param aggregations Which groupby aggregations to perform
+ * @param stream CUDA stream to use for launching kernels
+ * @return aggregated results
+ */
+std::unique_ptr<cudf::table> compute_results(
+  cudf::column_view const& cities,
+  cudf::column_view const& temperatures,
+  std::vector<std::unique_ptr<cudf::groupby_aggregation>>&& aggregations,
+  rmm::cuda_stream_view stream = cudf::get_default_stream());
+
+/**
+ * @brief Produce the final aggregations from sub-aggregate results
+ *
+ * @param agg_data Sub-aggregations to summarize
+ * @param stream CUDA stream to use for launching kernels
+ * @return final results
+ */
+std::unique_ptr<cudf::table> compute_final_aggregates(
+  std::vector<std::unique_ptr<cudf::table>>& agg_data,
+  rmm::cuda_stream_view stream = cudf::get_default_stream());
diff --git a/cpp/examples/build.sh b/cpp/examples/build.sh
index 2d6f6f316c7..8e8d8bd0b78 100755
--- a/cpp/examples/build.sh
+++ b/cpp/examples/build.sh
@@ -61,4 +61,5 @@ build_example tpch
 build_example strings
 build_example nested_types
 build_example parquet_io
+build_example billion_rows
 build_example interop

From 7018a33be752da9363db5431560d8d12bf378920 Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Thu, 5 Sep 2024 19:21:00 -0500
Subject: [PATCH 778/842] Add support for Python 3.12, update Kafka
 dependencies to 2.5.x (#16745)

Contributes to https://github.com/rapidsai/build-planning/issues/40

This PR adds support for Python 3.12.

Other changes required to add that support:

* updating `librdkafka` / `python-confluent-kafka`, `1.9.* -> 2.5.*` ([link to thread](https://github.com/rapidsai/cudf/pull/16745#discussion_r1745871756))
* removing use of `ast.Num` in syntax tree parsing, in favor of checking the `.value` of an `ast.Constant` against a hard-coded list of builtin types ([link to thread](https://github.com/rapidsai/cudf/pull/16745/files#r1745876846))
* ignoring deprecation warnings about `datetime.datetime.utcnow()` ([link to thread](https://github.com/rapidsai/cudf/pull/16745/files#r1746075083))
* skipping doctests that end up running `repr()` on an `OrderedDict` ([link to thread](https://github.com/rapidsai/cudf/pull/16745/files#r1746079415))

## Notes for Reviewers

This is part of ongoing work to add Python 3.12 support across RAPIDS.
It temporarily introduces a build/test matrix including Python 3.12, from https://github.com/rapidsai/shared-workflows/pull/213.

A follow-up PR will revert back to pointing at the `branch-24.10` branch of `shared-workflows` once all
RAPIDS repos have added Python 3.12 support.

### This will fail until all dependencies have been updates to Python 3.12

CI here is expected to fail until all of this project's upstream dependencies support Python 3.12.

This can be merged whenever all CI jobs are passing.

Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16745
---
 .github/workflows/build.yaml                  | 28 +++++------
 .github/workflows/pandas-tests.yaml           |  2 +-
 .github/workflows/pr.yaml                     | 48 +++++++++----------
 .../workflows/pr_issue_status_automation.yml  |  6 +--
 .github/workflows/test.yaml                   | 24 +++++-----
 README.md                                     |  2 +-
 .../all_cuda-118_arch-x86_64.yaml             |  6 +--
 .../all_cuda-125_arch-x86_64.yaml             |  6 +--
 conda/recipes/custreamz/meta.yaml             |  4 +-
 conda/recipes/libcudf/conda_build_config.yaml |  2 +-
 dependencies.yaml                             | 12 +++--
 .../cudf/cudf/core/_internals/expressions.py  |  2 +-
 python/cudf/cudf/core/dataframe.py            |  2 +-
 python/cudf/cudf/core/series.py               |  2 +-
 python/cudf/cudf/tests/pytest.ini             |  2 +
 .../dependencies.yaml                         |  6 ++-
 python/cudf/pyproject.toml                    |  1 +
 python/cudf_polars/pyproject.toml             |  1 +
 python/custreamz/pyproject.toml               |  3 +-
 python/dask_cudf/pyproject.toml               |  3 ++
 python/pylibcudf/pyproject.toml               |  1 +
 21 files changed, 90 insertions(+), 73 deletions(-)

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index b5d17022a3a..d6d3e3fdd33 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -28,7 +28,7 @@ concurrency:
 jobs:
   cpp-build:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@python-3.12
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -37,7 +37,7 @@ jobs:
   python-build:
     needs: [cpp-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@python-3.12
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -46,7 +46,7 @@ jobs:
   upload-conda:
     needs: [cpp-build, python-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@python-3.12
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -57,7 +57,7 @@ jobs:
     if: github.ref_type == 'branch'
     needs: python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
     with:
       arch: "amd64"
       branch: ${{ inputs.branch }}
@@ -69,7 +69,7 @@ jobs:
       sha: ${{ inputs.sha }}
   wheel-build-libcudf:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
     with:
       # build for every combination of arch and CUDA version, but only for the latest Python
       matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber)))
@@ -81,7 +81,7 @@ jobs:
   wheel-publish-libcudf:
     needs: wheel-build-libcudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.12
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -92,7 +92,7 @@ jobs:
   wheel-build-pylibcudf:
     needs: [wheel-publish-libcudf]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -102,7 +102,7 @@ jobs:
   wheel-publish-pylibcudf:
     needs: wheel-build-pylibcudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.12
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -113,7 +113,7 @@ jobs:
   wheel-build-cudf:
     needs: wheel-publish-pylibcudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -123,7 +123,7 @@ jobs:
   wheel-publish-cudf:
     needs: wheel-build-cudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.12
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -134,7 +134,7 @@ jobs:
   wheel-build-dask-cudf:
     needs: wheel-publish-cudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
@@ -146,7 +146,7 @@ jobs:
   wheel-publish-dask-cudf:
     needs: wheel-build-dask-cudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.12
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -157,7 +157,7 @@ jobs:
   wheel-build-cudf-polars:
     needs: wheel-publish-pylibcudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
@@ -169,7 +169,7 @@ jobs:
   wheel-publish-cudf-polars:
     needs: wheel-build-cudf-polars
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.12
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
diff --git a/.github/workflows/pandas-tests.yaml b/.github/workflows/pandas-tests.yaml
index 10c803f7921..d670132cca9 100644
--- a/.github/workflows/pandas-tests.yaml
+++ b/.github/workflows/pandas-tests.yaml
@@ -17,7 +17,7 @@ jobs:
   pandas-tests:
       # run the Pandas unit tests
       secrets: inherit
-      uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
+      uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
       with:
         # This selects "ARCH=amd64 + the latest supported Python + CUDA".
         matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 8730804e8b6..a4a8f036174 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -37,7 +37,7 @@ jobs:
       - pandas-tests
       - pandas-tests-diff
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@python-3.12
     if: always()
     with:
       needs: ${{ toJSON(needs) }}
@@ -104,39 +104,39 @@ jobs:
               - '!notebooks/**'
   checks:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@python-3.12
     with:
       enable_check_generated_files: false
   conda-cpp-build:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@python-3.12
     with:
       build_type: pull-request
   conda-cpp-checks:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@python-3.12
     with:
       build_type: pull-request
       enable_check_symbols: true
   conda-cpp-tests:
     needs: [conda-cpp-build, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@python-3.12
     if: needs.changed-files.outputs.test_cpp == 'true'
     with:
       build_type: pull-request
   conda-python-build:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@python-3.12
     with:
       build_type: pull-request
   conda-python-cudf-tests:
     needs: [conda-python-build, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12
     if: needs.changed-files.outputs.test_python == 'true'
     with:
       build_type: pull-request
@@ -145,7 +145,7 @@ jobs:
     # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism
     needs: [conda-python-build, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12
     if: needs.changed-files.outputs.test_python == 'true'
     with:
       build_type: pull-request
@@ -153,7 +153,7 @@ jobs:
   conda-java-tests:
     needs: [conda-cpp-build, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
     if: needs.changed-files.outputs.test_java == 'true'
     with:
       build_type: pull-request
@@ -164,7 +164,7 @@ jobs:
   static-configure:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
     with:
       build_type: pull-request
       # Use the wheel container so we can skip conda solves and since our
@@ -174,7 +174,7 @@ jobs:
   conda-notebook-tests:
     needs: [conda-python-build, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
     if: needs.changed-files.outputs.test_notebooks == 'true'
     with:
       build_type: pull-request
@@ -185,7 +185,7 @@ jobs:
   docs-build:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
     with:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
@@ -195,7 +195,7 @@ jobs:
   wheel-build-libcudf:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
     with:
       # build for every combination of arch and CUDA version, but only for the latest Python
       matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber)))
@@ -204,21 +204,21 @@ jobs:
   wheel-build-pylibcudf:
     needs: [checks, wheel-build-libcudf]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
     with:
       build_type: pull-request
       script: "ci/build_wheel_pylibcudf.sh"
   wheel-build-cudf:
     needs: wheel-build-pylibcudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
     with:
       build_type: pull-request
       script: "ci/build_wheel_cudf.sh"
   wheel-tests-cudf:
     needs: [wheel-build-cudf, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
     if: needs.changed-files.outputs.test_python == 'true'
     with:
       build_type: pull-request
@@ -226,7 +226,7 @@ jobs:
   wheel-build-cudf-polars:
     needs: wheel-build-pylibcudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
@@ -235,7 +235,7 @@ jobs:
   wheel-tests-cudf-polars:
     needs: [wheel-build-cudf-polars, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
     if: needs.changed-files.outputs.test_python == 'true'
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
@@ -247,7 +247,7 @@ jobs:
   wheel-build-dask-cudf:
     needs: wheel-build-cudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
@@ -256,7 +256,7 @@ jobs:
   wheel-tests-dask-cudf:
     needs: [wheel-build-dask-cudf, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
     if: needs.changed-files.outputs.test_python == 'true'
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
@@ -265,7 +265,7 @@ jobs:
       script: ci/test_wheel_dask_cudf.sh
   devcontainer:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@python-3.12
     with:
       arch: '["amd64"]'
       cuda: '["12.5"]'
@@ -276,7 +276,7 @@ jobs:
   unit-tests-cudf-pandas:
     needs: [wheel-build-cudf, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
     if: needs.changed-files.outputs.test_python == 'true'
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
@@ -287,7 +287,7 @@ jobs:
     # run the Pandas unit tests using PR branch
     needs: [wheel-build-cudf, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
     if: needs.changed-files.outputs.test_python == 'true'
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
@@ -299,7 +299,7 @@ jobs:
   pandas-tests-diff:
     # diff the results of running the Pandas unit tests and publish a job summary
     needs: pandas-tests
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
     with:
         node_type: cpu4
         build_type: pull-request
diff --git a/.github/workflows/pr_issue_status_automation.yml b/.github/workflows/pr_issue_status_automation.yml
index 45e5191eb54..fe77ad4b6b2 100644
--- a/.github/workflows/pr_issue_status_automation.yml
+++ b/.github/workflows/pr_issue_status_automation.yml
@@ -23,7 +23,7 @@ on:
 
 jobs:
     get-project-id:
-      uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@branch-24.10
+      uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@python-3.12
       if: github.event.pull_request.state == 'open'
       secrets: inherit
       permissions:
@@ -34,7 +34,7 @@ jobs:
 
     update-status:
       # This job sets the PR and its linked issues to "In Progress" status
-      uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-24.10
+      uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@python-3.12
       if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }}
       needs: get-project-id
       with:
@@ -50,7 +50,7 @@ jobs:
 
     update-sprint:
       # This job sets the PR and its linked issues to the current "Weekly Sprint"
-      uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@branch-24.10
+      uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@python-3.12
       if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }}
       needs: get-project-id
       with:
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 8605fa46f68..4af6a0d690d 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -16,7 +16,7 @@ on:
 jobs:
   conda-cpp-checks:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@python-3.12
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -25,7 +25,7 @@ jobs:
       enable_check_symbols: true
   conda-cpp-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@python-3.12
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -33,7 +33,7 @@ jobs:
       sha: ${{ inputs.sha }}
   conda-cpp-memcheck-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -45,7 +45,7 @@ jobs:
       run_script: "ci/test_cpp_memcheck.sh"
   static-configure:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
     with:
       build_type: pull-request
       # Use the wheel container so we can skip conda solves and since our
@@ -54,7 +54,7 @@ jobs:
       run_script: "ci/configure_cpp_static.sh"
   conda-python-cudf-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -64,7 +64,7 @@ jobs:
   conda-python-other-tests:
     # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -73,7 +73,7 @@ jobs:
       script: "ci/test_python_other.sh"
   conda-java-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -85,7 +85,7 @@ jobs:
       run_script: "ci/test_java.sh"
   conda-notebook-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -97,7 +97,7 @@ jobs:
       run_script: "ci/test_notebooks.sh"
   wheel-tests-cudf:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -106,7 +106,7 @@ jobs:
       script: ci/test_wheel_cudf.sh
   wheel-tests-dask-cudf:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
@@ -117,7 +117,7 @@ jobs:
       script: ci/test_wheel_dask_cudf.sh
   unit-tests-cudf-pandas:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -126,7 +126,7 @@ jobs:
       script: ci/cudf_pandas_scripts/run_tests.sh
   third-party-integration-tests-cudf-pandas:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
diff --git a/README.md b/README.md
index f1b010394d6..f62f7885d63 100644
--- a/README.md
+++ b/README.md
@@ -83,7 +83,7 @@ cuDF can be installed with conda (via [miniconda](https://docs.conda.io/projects
 
 ```bash
 conda install -c rapidsai -c conda-forge -c nvidia \
-    cudf=24.10 python=3.11 cuda-version=12.5
+    cudf=24.10 python=3.12 cuda-version=12.5
 ```
 
 We also provide [nightly Conda packages](https://anaconda.org/rapidsai-nightly) built from the HEAD
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index fa4c77d67b4..c96e8706d27 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -43,7 +43,7 @@ dependencies:
 - libcurand-dev=10.3.0.86
 - libcurand=10.3.0.86
 - libkvikio==24.10.*,>=0.0.0a0
-- librdkafka>=1.9.0,<1.10.0a0
+- librdkafka>=2.5.0,<2.6.0a0
 - librmm==24.10.*,>=0.0.0a0
 - make
 - moto>=4.0.8
@@ -74,8 +74,8 @@ dependencies:
 - pytest-cov
 - pytest-xdist
 - pytest<8
-- python-confluent-kafka>=1.9.0,<1.10.0a0
-- python>=3.10,<3.12
+- python-confluent-kafka>=2.5.0,<2.6.0a0
+- python>=3.10,<3.13
 - pytorch>=2.1.0
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
 - rapids-dask-dependency==24.10.*,>=0.0.0a0
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index 9b487347a5e..e54a44d9f6e 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -42,7 +42,7 @@ dependencies:
 - libcufile-dev
 - libcurand-dev
 - libkvikio==24.10.*,>=0.0.0a0
-- librdkafka>=1.9.0,<1.10.0a0
+- librdkafka>=2.5.0,<2.6.0a0
 - librmm==24.10.*,>=0.0.0a0
 - make
 - moto>=4.0.8
@@ -72,8 +72,8 @@ dependencies:
 - pytest-cov
 - pytest-xdist
 - pytest<8
-- python-confluent-kafka>=1.9.0,<1.10.0a0
-- python>=3.10,<3.12
+- python-confluent-kafka>=2.5.0,<2.6.0a0
+- python>=3.10,<3.13
 - pytorch>=2.1.0
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
 - rapids-dask-dependency==24.10.*,>=0.0.0a0
diff --git a/conda/recipes/custreamz/meta.yaml b/conda/recipes/custreamz/meta.yaml
index f5ea426e0b1..a031f05a73a 100644
--- a/conda/recipes/custreamz/meta.yaml
+++ b/conda/recipes/custreamz/meta.yaml
@@ -39,7 +39,7 @@ requirements:
     - python
     - rapids-build-backend >=0.3.0,<0.4.0.dev0
     - setuptools
-    - python-confluent-kafka >=1.9.0,<1.10.0a0
+    - python-confluent-kafka >=2.5.0,<2.6.0a0
     - cudf_kafka ={{ version }}
     - cuda-version ={{ cuda_version }}
   run:
@@ -48,7 +48,7 @@ requirements:
     - cudf ={{ version }}
     - cudf_kafka ={{ version }}
     - rapids-dask-dependency ={{ minor_version }}
-    - python-confluent-kafka >=1.9.0,<1.10.0a0
+    - python-confluent-kafka >=2.5.0,<2.6.0a0
     - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}
 
 test:
diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml
index dae04c08aca..33fa4b4eccf 100644
--- a/conda/recipes/libcudf/conda_build_config.yaml
+++ b/conda/recipes/libcudf/conda_build_config.yaml
@@ -23,7 +23,7 @@ dlpack_version:
   - ">=0.8,<1.0"
 
 librdkafka_version:
-  - ">=1.9.0,<1.10.0a0"
+  - ">=2.5.0,<2.6.0a0"
 
 fmt_version:
   - ">=10.1.1,<11"
diff --git a/dependencies.yaml b/dependencies.yaml
index a3f0ffeec82..32c1d7a0845 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -352,7 +352,7 @@ dependencies:
           - librmm==24.10.*,>=0.0.0a0
           - libkvikio==24.10.*,>=0.0.0a0
           - flatbuffers==24.3.25
-          - librdkafka>=1.9.0,<1.10.0a0
+          - librdkafka>=2.5.0,<2.6.0a0
           # Align nvcomp version with rapids-cmake
           - nvcomp==4.0.1
           - spdlog>=1.12.0,<1.13
@@ -550,8 +550,12 @@ dependencies:
             packages:
               - python=3.11
           - matrix:
+              py: "3.12"
             packages:
-              - python>=3.10,<3.12
+              - python=3.12
+          - matrix:
+            packages:
+              - python>=3.10,<3.13
   run_common:
     common:
       - output_types: [conda, requirements, pyproject]
@@ -656,13 +660,13 @@ dependencies:
     common:
       - output_types: conda
         packages:
-          - python-confluent-kafka>=1.9.0,<1.10.0a0
+          - python-confluent-kafka>=2.5.0,<2.6.0a0
       - output_types: [conda, requirements, pyproject]
         packages:
           - streamz
       - output_types: [requirements, pyproject]
         packages:
-          - confluent-kafka>=1.9.0,<1.10.0a0
+          - confluent-kafka>=2.5.0,<2.6.0a0
   test_cpp:
     common:
       - output_types: conda
diff --git a/python/cudf/cudf/core/_internals/expressions.py b/python/cudf/cudf/core/_internals/expressions.py
index 67bde5a72b2..90d9118027a 100644
--- a/python/cudf/cudf/core/_internals/expressions.py
+++ b/python/cudf/cudf/core/_internals/expressions.py
@@ -120,7 +120,7 @@ def visit_Name(self, node):
         self.stack.append(ColumnReference(col_id))
 
     def visit_Constant(self, node):
-        if not isinstance(node, (ast.Num, ast.Str)):
+        if not isinstance(node.value, (float, int, str, complex)):
             raise ValueError(
                 f"Unsupported literal {repr(node.value)} of type "
                 "{type(node.value).__name__}"
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 7a171fe9e05..58a16a6d504 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -2359,7 +2359,7 @@ def to_dict(
         You can also specify the mapping type.
 
         >>> from collections import OrderedDict, defaultdict
-        >>> df.to_dict(into=OrderedDict)
+        >>> df.to_dict(into=OrderedDict)  # doctest: +SKIP
         OrderedDict([('col1', OrderedDict([('row1', 1), ('row2', 2)])),
                      ('col2', OrderedDict([('row1', 0.5), ('row2', 0.75)]))])
 
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
index 48445f018d3..acd97c2047c 100644
--- a/python/cudf/cudf/core/series.py
+++ b/python/cudf/cudf/core/series.py
@@ -975,7 +975,7 @@ def to_dict(self, into: type[dict] = dict) -> dict:
         >>> s.to_dict()
         {0: 1, 1: 2, 2: 3, 3: 4}
         >>> from collections import OrderedDict, defaultdict
-        >>> s.to_dict(OrderedDict)
+        >>> s.to_dict(OrderedDict)  # doctest: +SKIP
         OrderedDict([(0, 1), (1, 2), (2, 3), (3, 4)])
         >>> dd = defaultdict(list)
         >>> s.to_dict(dd)
diff --git a/python/cudf/cudf/tests/pytest.ini b/python/cudf/cudf/tests/pytest.ini
index 710473acb85..2136bca0e28 100644
--- a/python/cudf/cudf/tests/pytest.ini
+++ b/python/cudf/cudf/tests/pytest.ini
@@ -8,6 +8,8 @@ filterwarnings =
     error
     ignore:::.*xdist.*
     ignore:::.*pytest.*
+    # some third-party dependencies (e.g. 'boto3') still using datetime.datetime.utcnow()
+    ignore:.*datetime.*utcnow.*scheduled for removal.*:DeprecationWarning
     # Deprecation warning from Pyarrow Table.to_pandas() with pandas-2.2+
     ignore:Passing a BlockManager to DataFrame is deprecated:DeprecationWarning
     # PerformanceWarning from cupy warming up the JIT cache
diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml b/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml
index 05e1d8178d5..f742f46c7ed 100644
--- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/dependencies.yaml
@@ -172,8 +172,12 @@ dependencies:
             packages:
               - python=3.11
           - matrix:
+              py: "3.12"
             packages:
-              - python>=3.10,<3.12
+              - python=3.12
+          - matrix:
+            packages:
+              - python>=3.10,<3.13
   test_base:
     common:
       - output_types: conda
diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml
index 17d1292980b..5833ee43c07 100644
--- a/python/cudf/pyproject.toml
+++ b/python/cudf/pyproject.toml
@@ -44,6 +44,7 @@ classifiers = [
     "Programming Language :: Python",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
 ]
 
 [project.optional-dependencies]
diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml
index f2bab9e6623..984b5487b98 100644
--- a/python/cudf_polars/pyproject.toml
+++ b/python/cudf_polars/pyproject.toml
@@ -30,6 +30,7 @@ classifiers = [
     "Programming Language :: Python",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
 ]
 
 [project.optional-dependencies]
diff --git a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml
index be5331236a5..5aa474e2862 100644
--- a/python/custreamz/pyproject.toml
+++ b/python/custreamz/pyproject.toml
@@ -19,7 +19,7 @@ authors = [
 license = { text = "Apache 2.0" }
 requires-python = ">=3.10"
 dependencies = [
-    "confluent-kafka>=1.9.0,<1.10.0a0",
+    "confluent-kafka>=2.5.0,<2.6.0a0",
     "cudf==24.10.*,>=0.0.0a0",
     "cudf_kafka==24.10.*,>=0.0.0a0",
     "streamz",
@@ -33,6 +33,7 @@ classifiers = [
     "Programming Language :: Python",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
 ]
 
 [project.optional-dependencies]
diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml
index 93bf532d67f..9ac834586a6 100644
--- a/python/dask_cudf/pyproject.toml
+++ b/python/dask_cudf/pyproject.toml
@@ -34,6 +34,7 @@ classifiers = [
     "Programming Language :: Python",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
 ]
 
 [project.entry-points."dask.dataframe.backends"]
@@ -117,6 +118,8 @@ skip = [
 filterwarnings = [
     "error::FutureWarning",
     "error::DeprecationWarning",
+    # some third-party dependencies (e.g. 'boto3') still using datetime.datetime.utcnow()
+    "ignore:.*datetime.*utcnow.*scheduled for removal:DeprecationWarning",
     "ignore:create_block_manager_from_blocks is deprecated and will be removed in a future version. Use public APIs instead.:DeprecationWarning",
     # https://github.com/dask/partd/blob/main/partd/pandas.py#L198
     "ignore:Passing a BlockManager to DataFrame is deprecated and will raise in a future version. Use public APIs instead.:DeprecationWarning",
diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml
index bfade41353c..3aaca09d8bd 100644
--- a/python/pylibcudf/pyproject.toml
+++ b/python/pylibcudf/pyproject.toml
@@ -34,6 +34,7 @@ classifiers = [
     "Programming Language :: Python",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
 ]
 
 [project.optional-dependencies]

From 8d8faefddd72981c6e868a3504d2baf5a37ef8e2 Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Fri, 6 Sep 2024 15:00:36 -0500
Subject: [PATCH 779/842] allow pandas patch version to float in cudf-pandas
 unit tests (#16763)

#16745 added support for Python 3.12 in this project.

When that was merged, nightly `unit-tests-cudf-pandas` jobs on Python 3.12 started failing, with errors from compiling `pandas`: ([build link](https://github.com/rapidsai/cudf/actions/runs/10733915866/job/29768130164))

That's only happening because we're running `pip install pandas==2.1` in those jobs, which matches exactly `pandas==2.1.0`, which does not have Python 3.12 wheels on PyPI (https://pypi.org/project/pandas/2.1.0/#files).

```text
Collecting pandas==2.1
  Downloading pandas-2.1.0.tar.gz (4.3 MB)
```

`pandas==2.1.1` DOES have Python 3.12 wheels on PyPI (https://pypi.org/project/pandas/2.1.1/#files).

To fix those jobs, this proposes allowing the patch version of `pandas` installed in those CI jobs to float:

* before: `pip install pandas==2.1`
* after: `pip install pandas==2.1.*`

Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Mike Sarahan (https://github.com/msarahan)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16763
---
 ci/cudf_pandas_scripts/run_tests.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/cudf_pandas_scripts/run_tests.sh b/ci/cudf_pandas_scripts/run_tests.sh
index 1c2724a9a5d..c6228a4ef33 100755
--- a/ci/cudf_pandas_scripts/run_tests.sh
+++ b/ci/cudf_pandas_scripts/run_tests.sh
@@ -91,7 +91,7 @@ IFS=',' read -r -a versions <<< "$output"
 
 for version in "${versions[@]}"; do
     echo "Installing pandas version: ${version}"
-    python -m pip install "numpy>=1.23,<2.0a0" "pandas==${version}"
+    python -m pip install "numpy>=1.23,<2.0a0" "pandas==${version}.*"
     python -m pytest -p cudf.pandas \
     --ignore=./python/cudf/cudf_pandas_tests/third_party_integration_tests/ \
     --cov-config=./python/cudf/.coveragerc \

From f97f61c60fe9e387982e4824290bd8903b508b6e Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Fri, 6 Sep 2024 18:38:11 -0400
Subject: [PATCH 780/842] Remove xfail from torch-cudf.pandas integration test
 (#16705)

The torch test should no longer fail after #16601.

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - James Lamb (https://github.com/jameslamb)
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/16705
---
 .../third_party_integration_tests/tests/test_pytorch.py         | 2 --
 1 file changed, 2 deletions(-)

diff --git a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_pytorch.py b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_pytorch.py
index ae9db3836a6..ad287471aa0 100644
--- a/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_pytorch.py
+++ b/python/cudf/cudf_pandas_tests/third_party_integration_tests/tests/test_pytorch.py
@@ -121,8 +121,6 @@ def test_torch_tensor_ctor():
     return torch.tensor(s.values)
 
 
-@pytest.mark.xfail_cudf_pandas(reason="Known failure, see xdf/#210")
-@pytest.mark.xfail_compare
 def test_torch_tensor_from_numpy():
     s = pd.Series(range(5))
     return torch.from_numpy(s.values)

From aa08fdb0d09b90e8bc4b640ea326712cb1a5b868 Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Fri, 6 Sep 2024 18:39:11 -0400
Subject: [PATCH 781/842] [DOC] Remove out of date section from cudf.pandas
 docs (#16697)

Proxy numpy arrays now instances of real numpy arrays (#16601), so libraries (eg. numba, torch) which utilize NumPy's C API should now be able to use proxy arrays. This PR updates the cudf.pandas documentation to reflect this.

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/16697
---
 docs/cudf/source/cudf_pandas/faq.md | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/docs/cudf/source/cudf_pandas/faq.md b/docs/cudf/source/cudf_pandas/faq.md
index fa5d203f52c..34b657488c1 100644
--- a/docs/cudf/source/cudf_pandas/faq.md
+++ b/docs/cudf/source/cudf_pandas/faq.md
@@ -151,11 +151,6 @@ There are a few known limitations that you should be aware of:
   [value mutability](https://pandas.pydata.org/pandas-docs/stable/getting_started/overview.html#mutability-and-copying-of-data)
   of Pandas objects is not always guaranteed. You should follow the
   pandas recommendation to favor immutable operations.
-- `cudf.pandas` can't currently interface smoothly with functions that
-  interact with objects using a C API (such as the Python or NumPy C
-  API)
-  - For example, you can write `torch.tensor(df.values)` but not
-    `torch.from_numpy(df.values)`, as the latter uses the NumPy C API
 - For performance reasons, joins and join-based operations are not
   currently implemented to maintain the same row ordering as standard
   pandas

From 478406740a500ce74d8cd4b4bea07fd163256796 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Sat, 7 Sep 2024 03:51:16 -0500
Subject: [PATCH 782/842] Check index bounds in compact protocol reader.
 (#16493)

This adds bounds checking to the compact protocol reader's read function.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Nghia Truong (https://github.com/ttnghia)
  - Muhammad Haseeb (https://github.com/mhaseeb123)

URL: https://github.com/rapidsai/cudf/pull/16493
---
 cpp/src/io/parquet/compact_protocol_reader.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/cpp/src/io/parquet/compact_protocol_reader.cpp b/cpp/src/io/parquet/compact_protocol_reader.cpp
index afcf6b373a9..b978799b8bc 100644
--- a/cpp/src/io/parquet/compact_protocol_reader.cpp
+++ b/cpp/src/io/parquet/compact_protocol_reader.cpp
@@ -140,6 +140,7 @@ struct parquet_field_bool_list : public parquet_field_list<bool, FieldType::BOOL
     auto const read_value = [&val = v](uint32_t i, CompactProtocolReader* cpr) {
       auto const current_byte = cpr->getb();
       assert_bool_field_type(current_byte);
+      CUDF_EXPECTS(i < val.size(), "Index out of bounds");
       val[i] = current_byte == static_cast<int>(FieldType::BOOLEAN_TRUE);
     };
     bind_read_func(read_value);
@@ -189,6 +190,7 @@ struct parquet_field_int_list : public parquet_field_list<T, EXPECTED_TYPE> {
   parquet_field_int_list(int f, std::vector<T>& v) : parquet_field_list<T, EXPECTED_TYPE>(f, v)
   {
     auto const read_value = [&val = v](uint32_t i, CompactProtocolReader* cpr) {
+      CUDF_EXPECTS(i < val.size(), "Index out of bounds");
       val[i] = cpr->get_zigzag<T>();
     };
     this->bind_read_func(read_value);
@@ -233,6 +235,7 @@ struct parquet_field_string_list : public parquet_field_list<std::string, FieldT
       auto const l = cpr->get_u32();
       CUDF_EXPECTS(l < static_cast<size_t>(cpr->m_end - cpr->m_cur), "string length mismatch");
 
+      CUDF_EXPECTS(i < val.size(), "Index out of bounds");
       val[i].assign(reinterpret_cast<char const*>(cpr->m_cur), l);
       cpr->m_cur += l;
     };
@@ -270,6 +273,7 @@ struct parquet_field_enum_list : public parquet_field_list<Enum, FieldType::I32>
     : parquet_field_list<Enum, FieldType::I32>(f, v)
   {
     auto const read_value = [&val = v](uint32_t i, CompactProtocolReader* cpr) {
+      CUDF_EXPECTS(i < val.size(), "Index out of bounds");
       val[i] = static_cast<Enum>(cpr->get_i32());
     };
     this->bind_read_func(read_value);
@@ -355,6 +359,7 @@ struct parquet_field_struct_list : public parquet_field_list<T, FieldType::STRUC
     : parquet_field_list<T, FieldType::STRUCT>(f, v)
   {
     auto const read_value = [&val = v](uint32_t i, CompactProtocolReader* cpr) {
+      CUDF_EXPECTS(i < val.size(), "Index out of bounds");
       cpr->read(&val[i]);
     };
     this->bind_read_func(read_value);
@@ -399,6 +404,7 @@ struct parquet_field_binary_list
       auto const l = cpr->get_u32();
       CUDF_EXPECTS(l <= static_cast<size_t>(cpr->m_end - cpr->m_cur), "binary length mismatch");
 
+      CUDF_EXPECTS(i < val.size(), "Index out of bounds");
       val[i].resize(l);
       val[i].assign(cpr->m_cur, cpr->m_cur + l);
       cpr->m_cur += l;

From 26a81b66181bab3171ca62f3a4afcbb1b8c9b403 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 9 Sep 2024 05:16:48 -1000
Subject: [PATCH 783/842] Allow read_csv(header=None) to return int column
 labels in `mode.pandas_compatible` (#16769)

closes https://github.com/rapidsai/cudf/issues/16766

If the cudf `read_csv` behavior of always returning string column labels is long standing behavior, we can match the pandas behavior of returning integer column labels in `mode.pandas_compatible`

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16769
---
 python/cudf/cudf/_lib/csv.pyx      | 4 +++-
 python/cudf/cudf/tests/test_csv.py | 8 ++++++++
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx
index e0f57df1368..058e884e08b 100644
--- a/python/cudf/cudf/_lib/csv.pyx
+++ b/python/cudf/cudf/_lib/csv.pyx
@@ -276,8 +276,10 @@ def read_csv(
                     col_name = df._data.names[index]
                     df._data[col_name] = df._data[col_name].astype(col_dtype)
 
-    if names is not None and len(names) and isinstance(names[0], (int)):
+    if names is not None and len(names) and isinstance(names[0], int):
         df.columns = [int(x) for x in df._data]
+    elif names is None and header == -1 and cudf.get_option("mode.pandas_compatible"):
+        df.columns = [int(x) for x in df._column_names]
 
     # Set index if the index_col parameter is passed
     if index_col is not None and index_col is not False:
diff --git a/python/cudf/cudf/tests/test_csv.py b/python/cudf/cudf/tests/test_csv.py
index cee3d23eadc..b6efc8ebd88 100644
--- a/python/cudf/cudf/tests/test_csv.py
+++ b/python/cudf/cudf/tests/test_csv.py
@@ -2269,3 +2269,11 @@ def test_read_compressed_BOM(tmpdir):
         f.write(buffer)
 
     assert_eq(pd.read_csv(fname), cudf.read_csv(fname))
+
+
+def test_read_header_none_pandas_compat_column_type():
+    data = "1\n2\n"
+    with cudf.option_context("mode.pandas_compatible", True):
+        result = cudf.read_csv(StringIO(data), header=None).columns
+    expected = pd.read_csv(StringIO(data), header=None).columns
+    pd.testing.assert_index_equal(result, expected, exact=True)

From 150f1b10ed9c702d5283216b746df685e1708716 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 9 Sep 2024 10:15:57 -1000
Subject: [PATCH 784/842] Add labeling APIs to pylibcudf (#16761)

Contributes to https://github.com/rapidsai/cudf/issues/15162

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/16761
---
 docs/cudf/source/developer_guide/pylibcudf.md | 17 ++---
 python/cudf/cudf/_lib/labeling.pyx            | 40 +++---------
 python/pylibcudf/pylibcudf/CMakeLists.txt     |  1 +
 python/pylibcudf/pylibcudf/__init__.pxd       |  1 +
 python/pylibcudf/pylibcudf/__init__.py        |  3 +
 python/pylibcudf/pylibcudf/labeling.pxd       | 14 ++++
 python/pylibcudf/pylibcudf/labeling.pyx       | 65 +++++++++++++++++++
 .../pylibcudf/libcudf/CMakeLists.txt          |  4 +-
 .../pylibcudf/pylibcudf/libcudf/labeling.pxd  |  8 +--
 .../pylibcudf/pylibcudf/libcudf/labeling.pyx  |  0
 .../pylibcudf/tests/test_labeling.py          | 25 +++++++
 11 files changed, 134 insertions(+), 44 deletions(-)
 create mode 100644 python/pylibcudf/pylibcudf/labeling.pxd
 create mode 100644 python/pylibcudf/pylibcudf/labeling.pyx
 create mode 100644 python/pylibcudf/pylibcudf/libcudf/labeling.pyx
 create mode 100644 python/pylibcudf/pylibcudf/tests/test_labeling.py

diff --git a/docs/cudf/source/developer_guide/pylibcudf.md b/docs/cudf/source/developer_guide/pylibcudf.md
index 4e10459fe2b..39840e72e21 100644
--- a/docs/cudf/source/developer_guide/pylibcudf.md
+++ b/docs/cudf/source/developer_guide/pylibcudf.md
@@ -186,7 +186,7 @@ Here is an example of appropriate enum usage.
 
 
 ```cython
-# cpp/copying.pxd
+# pylibcudf/libcudf/copying.pxd
 cdef extern from "cudf/copying.hpp" namespace "cudf" nogil:
     # cpdef here so that we export both a cdef enum class and a Python enum.Enum.
     cpdef enum class out_of_bounds_policy(bool):
@@ -194,8 +194,9 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil:
         DONT_CHECK
 
 
-# cpp/copying.pyx
-# This file is empty, but is required to compile the Python enum in cpp/copying.pxd
+# pylibcudf/libcudf/copying.pyx
+# This file is empty, but is required to compile the Python enum in pylibcudf/libcudf/copying.pxd
+# Ensure this file is included in pylibcudf/libcudf/CMakeLists.txt
 
 
 # pylibcudf/copying.pxd
@@ -203,21 +204,21 @@ cdef extern from "cudf/copying.hpp" namespace "cudf" nogil:
 # cimport the enum using the exact name
 # Once https://github.com/cython/cython/issues/5609 is resolved,
 # this import should instead be
-# from cudf._lib.cpp.copying cimport out_of_bounds_policy as OutOfBoundsPolicy
-from cudf._lib.cpp.copying cimport out_of_bounds_policy
+# from pylibcudf.libcudf.copying cimport out_of_bounds_policy as OutOfBoundsPolicy
+from pylibcudf.libcudf.copying cimport out_of_bounds_policy
 
 
 # pylibcudf/copying.pyx
 # Access cpp.copying members that aren't part of this module's public API via
 # this module alias
-from cudf._lib.cpp cimport copying as cpp_copying
-from cudf._lib.cpp.copying cimport out_of_bounds_policy
+from pylibcudf.libcudf cimport copying as cpp_copying
+from pylibcudf.libcudf.copying cimport out_of_bounds_policy
 
 # This import exposes the enum in the public API of this module.
 # It requires a no-cython-lint tag because it will be unused: all typing of
 # parameters etc will need to use the Cython name `out_of_bounds_policy` until
 # the Cython bug is resolved.
-from cudf._lib.cpp.copying import \
+from pylibcudf.libcudf.copying import \
     out_of_bounds_policy as OutOfBoundsPolicy  # no-cython-lint
 ```
 
diff --git a/python/cudf/cudf/_lib/labeling.pyx b/python/cudf/cudf/_lib/labeling.pyx
index 2e1959a348d..3966cce8981 100644
--- a/python/cudf/cudf/_lib/labeling.pyx
+++ b/python/cudf/cudf/_lib/labeling.pyx
@@ -1,16 +1,11 @@
 # Copyright (c) 2021-2024, NVIDIA CORPORATION.
 
-from cudf.core.buffer import acquire_spill_lock
-
 from libcpp cimport bool as cbool
-from libcpp.memory cimport unique_ptr
-from libcpp.utility cimport move
 
-from pylibcudf.libcudf.column.column cimport column
-from pylibcudf.libcudf.column.column_view cimport column_view
-from pylibcudf.libcudf.labeling cimport inclusive, label_bins as cpp_label_bins
+import pylibcudf as plc
 
 from cudf._lib.column cimport Column
+from cudf.core.buffer import acquire_spill_lock
 
 
 # Note that the parameter input shadows a Python built-in in the local scope,
@@ -19,26 +14,11 @@ from cudf._lib.column cimport Column
 @acquire_spill_lock()
 def label_bins(Column input, Column left_edges, cbool left_inclusive,
                Column right_edges, cbool right_inclusive):
-    cdef inclusive c_left_inclusive = \
-        inclusive.YES if left_inclusive else inclusive.NO
-    cdef inclusive c_right_inclusive = \
-        inclusive.YES if right_inclusive else inclusive.NO
-
-    cdef column_view input_view = input.view()
-    cdef column_view left_edges_view = left_edges.view()
-    cdef column_view right_edges_view = right_edges.view()
-
-    cdef unique_ptr[column] c_result
-
-    with nogil:
-        c_result = move(
-            cpp_label_bins(
-                input_view,
-                left_edges_view,
-                c_left_inclusive,
-                right_edges_view,
-                c_right_inclusive,
-            )
-        )
-
-    return Column.from_unique_ptr(move(c_result))
+    plc_column = plc.labeling.label_bins(
+        input.to_pylibcudf(mode="read"),
+        left_edges.to_pylibcudf(mode="read"),
+        left_inclusive,
+        right_edges.to_pylibcudf(mode="read"),
+        right_inclusive
+    )
+    return Column.from_pylibcudf(plc_column)
diff --git a/python/pylibcudf/pylibcudf/CMakeLists.txt b/python/pylibcudf/pylibcudf/CMakeLists.txt
index a4f17344cb0..f07c8897e34 100644
--- a/python/pylibcudf/pylibcudf/CMakeLists.txt
+++ b/python/pylibcudf/pylibcudf/CMakeLists.txt
@@ -27,6 +27,7 @@ set(cython_sources
     groupby.pyx
     interop.pyx
     join.pyx
+    labeling.pyx
     lists.pyx
     merge.pyx
     null_mask.pyx
diff --git a/python/pylibcudf/pylibcudf/__init__.pxd b/python/pylibcudf/pylibcudf/__init__.pxd
index 841efa59bda..b7cf6413c05 100644
--- a/python/pylibcudf/pylibcudf/__init__.pxd
+++ b/python/pylibcudf/pylibcudf/__init__.pxd
@@ -13,6 +13,7 @@ from . cimport (
     filling,
     groupby,
     join,
+    labeling,
     lists,
     merge,
     null_mask,
diff --git a/python/pylibcudf/pylibcudf/__init__.py b/python/pylibcudf/pylibcudf/__init__.py
index d3878a89a6a..84b1c29f791 100644
--- a/python/pylibcudf/pylibcudf/__init__.py
+++ b/python/pylibcudf/pylibcudf/__init__.py
@@ -24,6 +24,7 @@
     interop,
     io,
     join,
+    labeling,
     lists,
     merge,
     null_mask,
@@ -67,7 +68,9 @@
     "gpumemoryview",
     "groupby",
     "interop",
+    "io",
     "join",
+    "labeling",
     "lists",
     "merge",
     "null_mask",
diff --git a/python/pylibcudf/pylibcudf/labeling.pxd b/python/pylibcudf/pylibcudf/labeling.pxd
new file mode 100644
index 00000000000..6f8797ae7d3
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/labeling.pxd
@@ -0,0 +1,14 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+from libcpp cimport bool
+from pylibcudf.libcudf.labeling cimport inclusive
+
+from .column cimport Column
+
+
+cpdef Column label_bins(
+    Column input,
+    Column left_edges,
+    bool left_inclusive,
+    Column right_edges,
+    bool right_inclusive
+)
diff --git a/python/pylibcudf/pylibcudf/labeling.pyx b/python/pylibcudf/pylibcudf/labeling.pyx
new file mode 100644
index 00000000000..b5a7445df36
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/labeling.pyx
@@ -0,0 +1,65 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+from pylibcudf.libcudf cimport labeling as cpp_labeling
+from pylibcudf.libcudf.column.column cimport column
+from pylibcudf.libcudf.labeling cimport inclusive
+
+from pylibcudf.libcudf.labeling import inclusive as Inclusive  # no-cython-lint
+
+from .column cimport Column
+
+
+cpdef Column label_bins(
+    Column input,
+    Column left_edges,
+    bool left_inclusive,
+    Column right_edges,
+    bool right_inclusive
+):
+    """Labels elements based on membership in the specified bins.
+
+    Parameters
+    ----------
+    input : Column
+        Column of input elements to label according to the specified bins.
+    left_edges : Column
+        Column of the left edge of each bin.
+    left_inclusive : bool
+        Whether or not the left edge is inclusive.
+    right_edges : Column
+        Column of the right edge of each bin.
+    right_inclusive : bool
+        Whether or not the right edge is inclusive.
+
+    Returns
+    -------
+    Column
+        Column of integer labels of the elements in `input`
+        according to the specified bins.
+    """
+    cdef unique_ptr[column] c_result
+    cdef inclusive c_left_inclusive = (
+        inclusive.YES
+        if left_inclusive
+        else inclusive.NO
+    )
+    cdef inclusive c_right_inclusive = (
+        inclusive.YES
+        if right_inclusive
+        else inclusive.NO
+    )
+
+    with nogil:
+        c_result = move(
+            cpp_labeling.label_bins(
+                input.view(),
+                left_edges.view(),
+                c_left_inclusive,
+                right_edges.view(),
+                c_right_inclusive,
+            )
+        )
+
+    return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt b/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt
index b04e94f1546..2167616690f 100644
--- a/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt
+++ b/python/pylibcudf/pylibcudf/libcudf/CMakeLists.txt
@@ -12,8 +12,8 @@
 # the License.
 # =============================================================================
 
-set(cython_sources aggregation.pyx binaryop.pyx copying.pyx expressions.pyx reduce.pyx replace.pyx
-                   round.pyx stream_compaction.pyx types.pyx unary.pyx
+set(cython_sources aggregation.pyx binaryop.pyx copying.pyx expressions.pyx labeling.pyx reduce.pyx
+                   replace.pyx round.pyx stream_compaction.pyx types.pyx unary.pyx
 )
 
 set(linked_libraries cudf::cudf)
diff --git a/python/pylibcudf/pylibcudf/libcudf/labeling.pxd b/python/pylibcudf/pylibcudf/libcudf/labeling.pxd
index ec6ef6b2a41..400c4282f7a 100644
--- a/python/pylibcudf/pylibcudf/libcudf/labeling.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/labeling.pxd
@@ -1,14 +1,14 @@
 # Copyright (c) 2021-2024, NVIDIA CORPORATION.
-
+from libcpp cimport int
 from libcpp.memory cimport unique_ptr
 from pylibcudf.libcudf.column.column cimport column
 from pylibcudf.libcudf.column.column_view cimport column_view
 
 
 cdef extern from "cudf/labeling/label_bins.hpp" namespace "cudf" nogil:
-    ctypedef enum inclusive:
-        YES "cudf::inclusive::YES"
-        NO "cudf::inclusive::NO"
+    cpdef enum class inclusive(int):
+        YES
+        NO
 
     cdef unique_ptr[column] label_bins (
         const column_view &input,
diff --git a/python/pylibcudf/pylibcudf/libcudf/labeling.pyx b/python/pylibcudf/pylibcudf/libcudf/labeling.pyx
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/python/pylibcudf/pylibcudf/tests/test_labeling.py b/python/pylibcudf/pylibcudf/tests/test_labeling.py
new file mode 100644
index 00000000000..f7fb7463b50
--- /dev/null
+++ b/python/pylibcudf/pylibcudf/tests/test_labeling.py
@@ -0,0 +1,25 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+import pyarrow as pa
+import pylibcudf as plc
+import pytest
+
+
+@pytest.mark.parametrize("left_inclusive", [True, False])
+@pytest.mark.parametrize("right_inclusive", [True, False])
+def test_label_bins(left_inclusive, right_inclusive):
+    in_col = plc.interop.from_arrow(pa.array([1, 2, 3]))
+    left_edges = plc.interop.from_arrow(pa.array([0, 5]))
+    right_edges = plc.interop.from_arrow(pa.array([4, 6]))
+    result = plc.interop.to_arrow(
+        plc.labeling.label_bins(
+            in_col, left_edges, left_inclusive, right_edges, right_inclusive
+        )
+    )
+    expected = pa.chunked_array([[0, 0, 0]], type=pa.int32())
+    assert result.equals(expected)
+
+
+def test_inclusive_enum():
+    assert plc.labeling.Inclusive.YES == 0
+    assert plc.labeling.Inclusive.NO == 1

From a6b3de50eff6208ca9ef4047d915ef9f3acd6636 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Mon, 9 Sep 2024 20:54:15 +0000
Subject: [PATCH 785/842] enable logging

---
 .../cudf/pandas/scripts/conftest-patch.py     | 139 ++++++++++--------
 1 file changed, 76 insertions(+), 63 deletions(-)

diff --git a/python/cudf/cudf/pandas/scripts/conftest-patch.py b/python/cudf/cudf/pandas/scripts/conftest-patch.py
index fc02183f352..d214ec8defc 100644
--- a/python/cudf/cudf/pandas/scripts/conftest-patch.py
+++ b/python/cudf/cudf/pandas/scripts/conftest-patch.py
@@ -3,11 +3,9 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import contextlib
-import json
 import multiprocessing
 import os
 import sys
-from collections import defaultdict
 from functools import wraps
 
 import pytest
@@ -39,66 +37,81 @@ def patch_testing_functions():
     pytest.raises = replace_kwargs({"match": None})(pytest.raises)
 
 
-# # Dictionary to store function call counts
-# manager = multiprocessing.Manager()
-# function_call_counts = manager.dict()
-
-# # The specific function to track
-# FUNCTION_NAME = {'_slow_function_call', '_fast_function_call'}
-
-# def trace_calls(frame, event, arg):
-#     if event != 'call':
-#         return
-#     code = frame.f_code
-#     func_name = code.co_name
-#     if func_name in FUNCTION_NAME:
-#         function_call_counts[func_name] = function_call_counts.get(func_name, 0) + 1
-
-# def pytest_sessionstart(session):
-#     # Set the profile function to trace calls
-#     sys.setprofile(trace_calls)
-
-# def pytest_sessionfinish(session, exitstatus):
-#     # Remove the profile function
-#     sys.setprofile(None)
-
-# @pytest.hookimpl(tryfirst=True)
-# def pytest_runtest_setup(item):
-#     # Check if this is the first test in the file
-#     if item.nodeid.split("::")[0] != getattr(pytest_runtest_setup, "current_file", None):
-#         # If it's a new file, reset the function call counts
-#         global function_call_counts
-#         function_call_counts = manager.dict()
-#         pytest_runtest_setup.current_file = item.nodeid.split("::")[0]
-
-# @pytest.hookimpl(trylast=True)
-# def pytest_runtest_teardown(item, nextitem):
-#     # Check if this is the last test in the file
-#     if nextitem is None or nextitem.nodeid.split("::")[0] != item.nodeid.split("::")[0]:
-#         # Write the function call counts to a file
-#         worker_id = os.getenv('PYTEST_XDIST_WORKER', 'master')
-#         output_file = f'function_call_counts_{os.path.basename(item.nodeid.split("::")[0])}_{worker_id}.txt'
-#         with open(output_file, 'w') as f:
-#             for func, count in function_call_counts.items():
-#                 f.write(f'{func}: {count}\n')
-#         print(f'Function call counts have been written to {output_file}')
-
-# @pytest.hookimpl(tryfirst=True)
-# def pytest_configure(config):
-#     if hasattr(config, 'workerinput'):
-#         # Running in xdist worker
-#         global function_call_counts
-#         function_call_counts = manager.dict()
-
-# @pytest.hookimpl(trylast=True)
-# def pytest_unconfigure(config):
-#     if hasattr(config, 'workerinput'):
-#         # Running in xdist worker
-#         worker_id = config.workerinput['workerid']
-#         output_file = f'function_call_counts_worker_{worker_id}.txt'
-#         with open(output_file, 'w') as f:
-#             for func, count in function_call_counts.items():
-#                 f.write(f'{func}: {count}\n')
-#         print(f'Function call counts have been written to {output_file}')
+# Dictionary to store function call counts
+manager = multiprocessing.Manager()
+function_call_counts = manager.dict()
+
+# The specific function to track
+FUNCTION_NAME = {"_slow_function_call", "_fast_function_call"}
+
+
+def trace_calls(frame, event, arg):
+    if event != "call":
+        return
+    code = frame.f_code
+    func_name = code.co_name
+    if func_name in FUNCTION_NAME:
+        function_call_counts[func_name] = (
+            function_call_counts.get(func_name, 0) + 1
+        )
+
+
+def pytest_sessionstart(session):
+    # Set the profile function to trace calls
+    sys.setprofile(trace_calls)
+
+
+def pytest_sessionfinish(session, exitstatus):
+    # Remove the profile function
+    sys.setprofile(None)
+
+
+@pytest.hookimpl(tryfirst=True)
+def pytest_runtest_setup(item):
+    # Check if this is the first test in the file
+    if item.nodeid.split("::")[0] != getattr(
+        pytest_runtest_setup, "current_file", None
+    ):
+        # If it's a new file, reset the function call counts
+        global function_call_counts
+        function_call_counts = manager.dict()
+        pytest_runtest_setup.current_file = item.nodeid.split("::")[0]
+
+
+@pytest.hookimpl(trylast=True)
+def pytest_runtest_teardown(item, nextitem):
+    # Check if this is the last test in the file
+    if (
+        nextitem is None
+        or nextitem.nodeid.split("::")[0] != item.nodeid.split("::")[0]
+    ):
+        # Write the function call counts to a file
+        worker_id = os.getenv("PYTEST_XDIST_WORKER", "master")
+        output_file = f'function_call_counts_{os.path.basename(item.nodeid.split("::")[0])}_{worker_id}.txt'
+        with open(output_file, "w") as f:
+            for func, count in function_call_counts.items():
+                f.write(f"{func}: {count}\n")
+        print(f"Function call counts have been written to {output_file}")
+
+
+@pytest.hookimpl(tryfirst=True)
+def pytest_configure(config):
+    if hasattr(config, "workerinput"):
+        # Running in xdist worker
+        global function_call_counts
+        function_call_counts = manager.dict()
+
+
+@pytest.hookimpl(trylast=True)
+def pytest_unconfigure(config):
+    if hasattr(config, "workerinput"):
+        # Running in xdist worker
+        worker_id = config.workerinput["workerid"]
+        output_file = f"function_call_counts_worker_{worker_id}.txt"
+        with open(output_file, "w") as f:
+            for func, count in function_call_counts.items():
+                f.write(f"{func}: {count}\n")
+        print(f"Function call counts have been written to {output_file}")
+
 
 sys.path.append(os.path.dirname(__file__))

From 0206872e4a79cb26f1c60e50957c9606e9e5c654 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Mon, 9 Sep 2024 21:58:12 +0000
Subject: [PATCH 786/842] test

---
 python/cudf/cudf_pandas_tests/conftest.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf_pandas_tests/conftest.py b/python/cudf/cudf_pandas_tests/conftest.py
index 1898a785651..7c4bec4e8a8 100644
--- a/python/cudf/cudf_pandas_tests/conftest.py
+++ b/python/cudf/cudf_pandas_tests/conftest.py
@@ -1,8 +1,8 @@
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 import json
 import multiprocessing
 import os
 import sys
-from collections import defaultdict
 
 import pytest
 

From 92f0197197df9e1defbd49903d0b3c25b071d805 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Mon, 9 Sep 2024 16:16:17 -0700
Subject: [PATCH 787/842] Simplify the nvCOMP adapter (#16762)

This PR removes the adapter code that allow running with older nvCOMP versions.
Feature status checking has been significantly simplified, and compile-time checks for newer compression types have been removed.
Also removed the fallback to the old version of get_temp_size, since we are now guaranteed to have access to the extended version.

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Nghia Truong (https://github.com/ttnghia)
  - MithunR (https://github.com/mythrocks)

URL: https://github.com/rapidsai/cudf/pull/16762
---
 cpp/include/cudf/io/nvcomp_adapter.hpp     |  24 +-
 cpp/src/io/comp/nvcomp_adapter.cpp         | 334 +++------------------
 cpp/src/io/comp/nvcomp_adapter.hpp         |  14 +-
 cpp/src/io/orc/writer_impl.cu              |   8 +-
 cpp/src/io/parquet/writer_impl_helpers.cpp |   2 +-
 cpp/tests/io/comp/decomp_test.cpp          |  46 ++-
 6 files changed, 79 insertions(+), 349 deletions(-)

diff --git a/cpp/include/cudf/io/nvcomp_adapter.hpp b/cpp/include/cudf/io/nvcomp_adapter.hpp
index f3260d0cb53..e7fe3cc7214 100644
--- a/cpp/include/cudf/io/nvcomp_adapter.hpp
+++ b/cpp/include/cudf/io/nvcomp_adapter.hpp
@@ -36,33 +36,20 @@ struct feature_status_parameters {
   int lib_patch_version;                 ///< patch version
   bool are_all_integrations_enabled;     ///< all integrations
   bool are_stable_integrations_enabled;  ///< stable integrations
-  int compute_capability_major;          ///< cuda compute major version
 
   /**
-   * @brief Default Constructor
+   * @brief Default constructor using the current version of nvcomp and current environment
+   * variables
    */
   feature_status_parameters();
 
   /**
-   * @brief feature_status_parameters Constructor
+   * @brief Constructor using the current version of nvcomp
    *
-   * @param major positive integer representing major value of nvcomp
-   * @param minor positive integer representing minor value of nvcomp
-   * @param patch positive integer representing patch value of nvcomp
    * @param all_enabled if all integrations are enabled
    * @param stable_enabled if stable integrations are enabled
-   * @param cc_major CUDA compute capability
    */
-  feature_status_parameters(
-    int major, int minor, int patch, bool all_enabled, bool stable_enabled, int cc_major)
-    : lib_major_version{major},
-      lib_minor_version{minor},
-      lib_patch_version{patch},
-      are_all_integrations_enabled{all_enabled},
-      are_stable_integrations_enabled{stable_enabled},
-      compute_capability_major{cc_major}
-  {
-  }
+  feature_status_parameters(bool all_enabled, bool stable_enabled);
 };
 
 /**
@@ -74,8 +61,7 @@ inline bool operator==(feature_status_parameters const& lhs, feature_status_para
          lhs.lib_minor_version == rhs.lib_minor_version and
          lhs.lib_patch_version == rhs.lib_patch_version and
          lhs.are_all_integrations_enabled == rhs.are_all_integrations_enabled and
-         lhs.are_stable_integrations_enabled == rhs.are_stable_integrations_enabled and
-         lhs.compute_capability_major == rhs.compute_capability_major;
+         lhs.are_stable_integrations_enabled == rhs.are_stable_integrations_enabled;
 }
 
 /**
diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp
index 5d0c6a8c83b..261a8eb401d 100644
--- a/cpp/src/io/comp/nvcomp_adapter.cpp
+++ b/cpp/src/io/comp/nvcomp_adapter.cpp
@@ -22,95 +22,44 @@
 #include <cudf/io/config_utils.hpp>
 #include <cudf/utilities/error.hpp>
 
+#include <nvcomp/deflate.h>
 #include <nvcomp/lz4.h>
 #include <nvcomp/snappy.h>
+#include <nvcomp/zstd.h>
 
 #include <mutex>
 
-#define NVCOMP_DEFLATE_HEADER <nvcomp/deflate.h>
-#if __has_include(NVCOMP_DEFLATE_HEADER)
-#include NVCOMP_DEFLATE_HEADER
-#endif
-
-#define NVCOMP_ZSTD_HEADER <nvcomp/zstd.h>
-#if __has_include(NVCOMP_ZSTD_HEADER)
-#include NVCOMP_ZSTD_HEADER
-#endif
-
-// When building with nvcomp 4.0 or newer, map the new version macros to the old ones
-#ifndef NVCOMP_MAJOR_VERSION
-#define NVCOMP_MAJOR_VERSION NVCOMP_VER_MAJOR
-#define NVCOMP_MINOR_VERSION NVCOMP_VER_MINOR
-#define NVCOMP_PATCH_VERSION NVCOMP_VER_PATCH
-#endif
-
-#define NVCOMP_HAS_ZSTD_DECOMP(MAJOR, MINOR, PATCH) (MAJOR > 2 or (MAJOR == 2 and MINOR >= 3))
-
-#define NVCOMP_HAS_ZSTD_COMP(MAJOR, MINOR, PATCH) (MAJOR > 2 or (MAJOR == 2 and MINOR >= 4))
-
-#define NVCOMP_HAS_DEFLATE(MAJOR, MINOR, PATCH) (MAJOR > 2 or (MAJOR == 2 and MINOR >= 5))
-
-#define NVCOMP_HAS_DECOMP_TEMPSIZE_EX(MAJOR, MINOR, PATCH) \
-  (MAJOR > 2 or (MAJOR == 2 and MINOR > 3) or (MAJOR == 2 and MINOR == 3 and PATCH >= 1))
-
-#define NVCOMP_HAS_COMP_TEMPSIZE_EX(MAJOR, MINOR, PATCH) (MAJOR > 2 or (MAJOR == 2 and MINOR >= 6))
-
-// ZSTD is stable for nvcomp 2.3.2 or newer
-#define NVCOMP_ZSTD_DECOMP_IS_STABLE(MAJOR, MINOR, PATCH) \
-  (MAJOR > 2 or (MAJOR == 2 and MINOR > 3) or (MAJOR == 2 and MINOR == 3 and PATCH >= 2))
-
 namespace cudf::io::nvcomp {
 
 // Dispatcher for nvcompBatched<format>DecompressGetTempSizeEx
 template <typename... Args>
-std::optional<nvcompStatus_t> batched_decompress_get_temp_size_ex(compression_type compression,
-                                                                  Args&&... args)
+auto batched_decompress_get_temp_size_ex(compression_type compression, Args&&... args)
 {
-#if NVCOMP_HAS_DECOMP_TEMPSIZE_EX(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION)
   switch (compression) {
     case compression_type::SNAPPY:
       return nvcompBatchedSnappyDecompressGetTempSizeEx(std::forward<Args>(args)...);
     case compression_type::ZSTD:
-#if NVCOMP_HAS_ZSTD_DECOMP(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION)
       return nvcompBatchedZstdDecompressGetTempSizeEx(std::forward<Args>(args)...);
-#else
-      return std::nullopt;
-#endif
     case compression_type::LZ4:
       return nvcompBatchedLZ4DecompressGetTempSizeEx(std::forward<Args>(args)...);
-    case compression_type::DEFLATE: [[fallthrough]];
-    default: return std::nullopt;
-  }
-#endif
-  return std::nullopt;
-}
-
-// Dispatcher for nvcompBatched<format>DecompressGetTempSize
-template <typename... Args>
-auto batched_decompress_get_temp_size(compression_type compression, Args&&... args)
-{
-  switch (compression) {
-    case compression_type::SNAPPY:
-      return nvcompBatchedSnappyDecompressGetTempSize(std::forward<Args>(args)...);
-    case compression_type::ZSTD:
-#if NVCOMP_HAS_ZSTD_DECOMP(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION)
-      return nvcompBatchedZstdDecompressGetTempSize(std::forward<Args>(args)...);
-#else
-      CUDF_FAIL("Decompression error: " +
-                nvcomp::is_decompression_disabled(nvcomp::compression_type::ZSTD).value());
-#endif
     case compression_type::DEFLATE:
-#if NVCOMP_HAS_DEFLATE(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION)
-      return nvcompBatchedDeflateDecompressGetTempSize(std::forward<Args>(args)...);
-#else
-      CUDF_FAIL("Decompression error: " +
-                nvcomp::is_decompression_disabled(nvcomp::compression_type::DEFLATE).value());
-#endif
-    case compression_type::LZ4:
-      return nvcompBatchedLZ4DecompressGetTempSize(std::forward<Args>(args)...);
+      return nvcompBatchedDeflateDecompressGetTempSizeEx(std::forward<Args>(args)...);
     default: CUDF_FAIL("Unsupported compression type");
   }
 }
+size_t batched_decompress_temp_size(compression_type compression,
+                                    size_t num_chunks,
+                                    size_t max_uncomp_chunk_size,
+                                    size_t max_total_uncomp_size)
+{
+  size_t temp_size             = 0;
+  nvcompStatus_t nvcomp_status = batched_decompress_get_temp_size_ex(
+    compression, num_chunks, max_uncomp_chunk_size, &temp_size, max_total_uncomp_size);
+
+  CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess,
+               "Unable to get scratch size for decompression");
+  return temp_size;
+}
 
 // Dispatcher for nvcompBatched<format>DecompressAsync
 template <typename... Args>
@@ -120,19 +69,9 @@ auto batched_decompress_async(compression_type compression, Args&&... args)
     case compression_type::SNAPPY:
       return nvcompBatchedSnappyDecompressAsync(std::forward<Args>(args)...);
     case compression_type::ZSTD:
-#if NVCOMP_HAS_ZSTD_DECOMP(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION)
       return nvcompBatchedZstdDecompressAsync(std::forward<Args>(args)...);
-#else
-      CUDF_FAIL("Decompression error: " +
-                nvcomp::is_decompression_disabled(nvcomp::compression_type::ZSTD).value());
-#endif
     case compression_type::DEFLATE:
-#if NVCOMP_HAS_DEFLATE(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION)
       return nvcompBatchedDeflateDecompressAsync(std::forward<Args>(args)...);
-#else
-      CUDF_FAIL("Decompression error: " +
-                nvcomp::is_decompression_disabled(nvcomp::compression_type::DEFLATE).value());
-#endif
     case compression_type::LZ4: return nvcompBatchedLZ4DecompressAsync(std::forward<Args>(args)...);
     default: CUDF_FAIL("Unsupported compression type");
   }
@@ -149,27 +88,6 @@ std::string compression_type_name(compression_type compression)
   return "compression_type(" + std::to_string(static_cast<int>(compression)) + ")";
 }
 
-size_t batched_decompress_temp_size(compression_type compression,
-                                    size_t num_chunks,
-                                    size_t max_uncomp_chunk_size,
-                                    size_t max_total_uncomp_size)
-{
-  size_t temp_size   = 0;
-  auto nvcomp_status = batched_decompress_get_temp_size_ex(
-    compression, num_chunks, max_uncomp_chunk_size, &temp_size, max_total_uncomp_size);
-
-  if (nvcomp_status.value_or(nvcompStatus_t::nvcompErrorInternal) !=
-      nvcompStatus_t::nvcompSuccess) {
-    nvcomp_status =
-      batched_decompress_get_temp_size(compression, num_chunks, max_uncomp_chunk_size, &temp_size);
-  }
-
-  CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess,
-               "Unable to get scratch size for decompression");
-
-  return temp_size;
-}
-
 void batched_decompress(compression_type compression,
                         device_span<device_span<uint8_t const> const> inputs,
                         device_span<device_span<uint8_t> const> outputs,
@@ -204,54 +122,10 @@ void batched_decompress(compression_type compression,
   update_compression_results(nvcomp_statuses, actual_uncompressed_data_sizes, results, stream);
 }
 
-// Wrapper for nvcompBatched<format>CompressGetTempSize
-auto batched_compress_get_temp_size(compression_type compression,
-                                    size_t batch_size,
-                                    size_t max_uncompressed_chunk_bytes)
-{
-  size_t temp_size             = 0;
-  nvcompStatus_t nvcomp_status = nvcompStatus_t::nvcompSuccess;
-  switch (compression) {
-    case compression_type::SNAPPY:
-      nvcomp_status = nvcompBatchedSnappyCompressGetTempSize(
-        batch_size, max_uncompressed_chunk_bytes, nvcompBatchedSnappyDefaultOpts, &temp_size);
-      break;
-    case compression_type::DEFLATE:
-#if NVCOMP_HAS_DEFLATE(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION)
-      nvcomp_status = nvcompBatchedDeflateCompressGetTempSize(
-        batch_size, max_uncompressed_chunk_bytes, nvcompBatchedDeflateDefaultOpts, &temp_size);
-      break;
-#else
-      CUDF_FAIL("Compression error: " +
-                nvcomp::is_compression_disabled(nvcomp::compression_type::DEFLATE).value());
-#endif
-    case compression_type::ZSTD:
-#if NVCOMP_HAS_ZSTD_COMP(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION)
-      nvcomp_status = nvcompBatchedZstdCompressGetTempSize(
-        batch_size, max_uncompressed_chunk_bytes, nvcompBatchedZstdDefaultOpts, &temp_size);
-      break;
-#else
-      CUDF_FAIL("Compression error: " +
-                nvcomp::is_compression_disabled(nvcomp::compression_type::ZSTD).value());
-#endif
-    case compression_type::LZ4:
-      nvcomp_status = nvcompBatchedLZ4CompressGetTempSize(
-        batch_size, max_uncompressed_chunk_bytes, nvcompBatchedLZ4DefaultOpts, &temp_size);
-      break;
-    default: CUDF_FAIL("Unsupported compression type");
-  }
-
-  CUDF_EXPECTS(nvcomp_status == nvcompStatus_t::nvcompSuccess,
-               "Unable to get scratch size for compression");
-  return temp_size;
-}
-
-#if NVCOMP_HAS_COMP_TEMPSIZE_EX(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION)
-// Wrapper for nvcompBatched<format>CompressGetTempSizeEx
-auto batched_compress_get_temp_size_ex(compression_type compression,
-                                       size_t batch_size,
-                                       size_t max_uncompressed_chunk_bytes,
-                                       size_t max_total_uncompressed_bytes)
+size_t batched_compress_temp_size(compression_type compression,
+                                  size_t batch_size,
+                                  size_t max_uncompressed_chunk_bytes,
+                                  size_t max_total_uncompressed_bytes)
 {
   size_t temp_size             = 0;
   nvcompStatus_t nvcomp_status = nvcompStatus_t::nvcompSuccess;
@@ -291,28 +165,8 @@ auto batched_compress_get_temp_size_ex(compression_type compression,
                "Unable to get scratch size for compression");
   return temp_size;
 }
-#endif
-
-size_t batched_compress_temp_size(compression_type compression,
-                                  size_t num_chunks,
-                                  size_t max_uncomp_chunk_size,
-                                  size_t max_total_uncomp_size)
-{
-#if NVCOMP_HAS_COMP_TEMPSIZE_EX(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION)
-  try {
-    return batched_compress_get_temp_size_ex(
-      compression, num_chunks, max_uncomp_chunk_size, max_total_uncomp_size);
-  } catch (...) {
-    // Ignore errors in the expanded version; fall back to the old API in case of failure
-    CUDF_LOG_WARN(
-      "CompressGetTempSizeEx call failed, falling back to CompressGetTempSize; this may increase "
-      "the memory usage");
-  }
-#endif
-
-  return batched_compress_get_temp_size(compression, num_chunks, max_uncomp_chunk_size);
-}
 
+// Wrapper for nvcompBatched<format>CompressGetMaxOutputChunkSize
 size_t compress_max_output_chunk_size(compression_type compression,
                                       uint32_t max_uncompressed_chunk_bytes)
 {
@@ -328,23 +182,13 @@ size_t compress_max_output_chunk_size(compression_type compression,
         capped_uncomp_bytes, nvcompBatchedSnappyDefaultOpts, &max_comp_chunk_size);
       break;
     case compression_type::DEFLATE:
-#if NVCOMP_HAS_DEFLATE(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION)
       status = nvcompBatchedDeflateCompressGetMaxOutputChunkSize(
         capped_uncomp_bytes, nvcompBatchedDeflateDefaultOpts, &max_comp_chunk_size);
       break;
-#else
-      CUDF_FAIL("Compression error: " +
-                nvcomp::is_compression_disabled(nvcomp::compression_type::DEFLATE).value());
-#endif
     case compression_type::ZSTD:
-#if NVCOMP_HAS_ZSTD_COMP(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION)
       status = nvcompBatchedZstdCompressGetMaxOutputChunkSize(
         capped_uncomp_bytes, nvcompBatchedZstdDefaultOpts, &max_comp_chunk_size);
       break;
-#else
-      CUDF_FAIL("Compression error: " +
-                nvcomp::is_compression_disabled(nvcomp::compression_type::ZSTD).value());
-#endif
     case compression_type::LZ4:
       status = nvcompBatchedLZ4CompressGetMaxOutputChunkSize(
         capped_uncomp_bytes, nvcompBatchedLZ4DefaultOpts, &max_comp_chunk_size);
@@ -384,7 +228,6 @@ static void batched_compress_async(compression_type compression,
                                                        stream.value());
       break;
     case compression_type::DEFLATE:
-#if NVCOMP_HAS_DEFLATE(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION)
       nvcomp_status = nvcompBatchedDeflateCompressAsync(device_uncompressed_ptrs,
                                                         device_uncompressed_bytes,
                                                         max_uncompressed_chunk_bytes,
@@ -396,12 +239,7 @@ static void batched_compress_async(compression_type compression,
                                                         nvcompBatchedDeflateDefaultOpts,
                                                         stream.value());
       break;
-#else
-      CUDF_FAIL("Compression error: " +
-                nvcomp::is_compression_disabled(nvcomp::compression_type::DEFLATE).value());
-#endif
     case compression_type::ZSTD:
-#if NVCOMP_HAS_ZSTD_COMP(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION)
       nvcomp_status = nvcompBatchedZstdCompressAsync(device_uncompressed_ptrs,
                                                      device_uncompressed_bytes,
                                                      max_uncompressed_chunk_bytes,
@@ -413,10 +251,6 @@ static void batched_compress_async(compression_type compression,
                                                      nvcompBatchedZstdDefaultOpts,
                                                      stream.value());
       break;
-#else
-      CUDF_FAIL("Compression error: " +
-                nvcomp::is_compression_disabled(nvcomp::compression_type::ZSTD).value());
-#endif
     case compression_type::LZ4:
       nvcomp_status = nvcompBatchedLZ4CompressAsync(device_uncompressed_ptrs,
                                                     device_uncompressed_bytes,
@@ -478,16 +312,18 @@ void batched_compress(compression_type compression,
 }
 
 feature_status_parameters::feature_status_parameters()
-  : lib_major_version{NVCOMP_MAJOR_VERSION},
-    lib_minor_version{NVCOMP_MINOR_VERSION},
-    lib_patch_version{NVCOMP_PATCH_VERSION},
-    are_all_integrations_enabled{nvcomp_integration::is_all_enabled()},
-    are_stable_integrations_enabled{nvcomp_integration::is_stable_enabled()}
+  : feature_status_parameters(nvcomp_integration::is_all_enabled(),
+                              nvcomp_integration::is_stable_enabled())
+{
+}
+
+feature_status_parameters::feature_status_parameters(bool all_enabled, bool stable_enabled)
+  : lib_major_version{NVCOMP_VER_MAJOR},
+    lib_minor_version{NVCOMP_VER_MINOR},
+    lib_patch_version{NVCOMP_VER_PATCH},
+    are_all_integrations_enabled{all_enabled},
+    are_stable_integrations_enabled{stable_enabled}
 {
-  int device;
-  CUDF_CUDA_TRY(cudaGetDevice(&device));
-  CUDF_CUDA_TRY(
-    cudaDeviceGetAttribute(&compute_capability_major, cudaDevAttrComputeCapabilityMajor, device));
 }
 
 // Represents all parameters required to determine status of a compression/decompression feature
@@ -510,41 +346,19 @@ std::optional<std::string> is_compression_disabled_impl(compression_type compres
 {
   switch (compression) {
     case compression_type::DEFLATE: {
-      if (not NVCOMP_HAS_DEFLATE(
-            params.lib_major_version, params.lib_minor_version, params.lib_patch_version)) {
-        return "nvCOMP 2.5 or newer is required for Deflate compression";
-      }
       if (not params.are_all_integrations_enabled) {
         return "DEFLATE compression is experimental, you can enable it through "
                "`LIBCUDF_NVCOMP_POLICY` environment variable.";
       }
       return std::nullopt;
     }
-    case compression_type::SNAPPY: {
-      if (not params.are_stable_integrations_enabled) {
-        return "Snappy compression has been disabled through the `LIBCUDF_NVCOMP_POLICY` "
-               "environment variable.";
-      }
-      return std::nullopt;
-    }
-    case compression_type::ZSTD: {
-      if (not NVCOMP_HAS_ZSTD_COMP(
-            params.lib_major_version, params.lib_minor_version, params.lib_patch_version)) {
-        return "nvCOMP 2.4 or newer is required for Zstandard compression";
-      }
-      if (not params.are_stable_integrations_enabled) {
-        return "Zstandard compression is experimental, you can enable it through "
-               "`LIBCUDF_NVCOMP_POLICY` environment variable.";
-      }
-      return std::nullopt;
-    }
     case compression_type::LZ4:
+    case compression_type::SNAPPY:
+    case compression_type::ZSTD:
       if (not params.are_stable_integrations_enabled) {
-        return "LZ4 compression has been disabled through the `LIBCUDF_NVCOMP_POLICY` "
-               "environment variable.";
+        return "nvCOMP use is disabled through the `LIBCUDF_NVCOMP_POLICY` environment variable.";
       }
       return std::nullopt;
-    default: return "Unsupported compression type";
   }
   return "Unsupported compression type";
 }
@@ -578,58 +392,25 @@ std::optional<std::string> is_compression_disabled(compression_type compression,
   return reason;
 }
 
-std::optional<std::string> is_zstd_decomp_disabled(feature_status_parameters const& params)
-{
-  if (not NVCOMP_HAS_ZSTD_DECOMP(
-        params.lib_major_version, params.lib_minor_version, params.lib_patch_version)) {
-    return "nvCOMP 2.3 or newer is required for Zstandard decompression";
-  }
-
-  if (NVCOMP_ZSTD_DECOMP_IS_STABLE(
-        params.lib_major_version, params.lib_minor_version, params.lib_patch_version)) {
-    if (not params.are_stable_integrations_enabled) {
-      return "Zstandard decompression has been disabled through the `LIBCUDF_NVCOMP_POLICY` "
-             "environment variable.";
-    }
-  } else if (not params.are_all_integrations_enabled) {
-    return "Zstandard decompression is experimental, you can enable it through "
-           "`LIBCUDF_NVCOMP_POLICY` environment variable.";
-  }
-
-  return std::nullopt;
-}
-
 std::optional<std::string> is_decompression_disabled_impl(compression_type compression,
                                                           feature_status_parameters params)
 {
   switch (compression) {
     case compression_type::DEFLATE: {
-      if (not NVCOMP_HAS_DEFLATE(
-            params.lib_major_version, params.lib_minor_version, params.lib_patch_version)) {
-        return "nvCOMP 2.5 or newer is required for Deflate decompression";
-      }
       if (not params.are_all_integrations_enabled) {
         return "DEFLATE decompression is experimental, you can enable it through "
                "`LIBCUDF_NVCOMP_POLICY` environment variable.";
       }
       return std::nullopt;
     }
-    case compression_type::SNAPPY: {
-      if (not params.are_stable_integrations_enabled) {
-        return "Snappy decompression has been disabled through the `LIBCUDF_NVCOMP_POLICY` "
-               "environment variable.";
-      }
-      return std::nullopt;
-    }
-    case compression_type::ZSTD: return is_zstd_decomp_disabled(params);
-    case compression_type::LZ4: {
+    case compression_type::LZ4:
+    case compression_type::SNAPPY:
+    case compression_type::ZSTD: {
       if (not params.are_stable_integrations_enabled) {
-        return "LZ4 decompression has been disabled through the `LIBCUDF_NVCOMP_POLICY` "
-               "environment variable.";
+        return "nvCOMP use is disabled through the `LIBCUDF_NVCOMP_POLICY` environment variable.";
       }
       return std::nullopt;
     }
-    default: return "Unsupported compression type";
   }
   return "Unsupported compression type";
 }
@@ -663,24 +444,13 @@ std::optional<std::string> is_decompression_disabled(compression_type compressio
   return reason;
 }
 
-size_t compress_input_alignment_bits(compression_type compression)
+size_t required_alignment(compression_type compression)
 {
   switch (compression) {
-    case compression_type::DEFLATE: return 0;
-    case compression_type::SNAPPY: return 0;
-    case compression_type::ZSTD: return 2;
-    case compression_type::LZ4: return 2;
-    default: CUDF_FAIL("Unsupported compression type");
-  }
-}
-
-size_t compress_output_alignment_bits(compression_type compression)
-{
-  switch (compression) {
-    case compression_type::DEFLATE: return 3;
-    case compression_type::SNAPPY: return 0;
-    case compression_type::ZSTD: return 0;
-    case compression_type::LZ4: return 2;
+    case compression_type::DEFLATE: return nvcompDeflateRequiredAlignment;
+    case compression_type::SNAPPY: return nvcompSnappyRequiredAlignment;
+    case compression_type::ZSTD: return nvcompZstdRequiredAlignment;
+    case compression_type::LZ4: return nvcompLZ4RequiredAlignment;
     default: CUDF_FAIL("Unsupported compression type");
   }
 }
@@ -688,16 +458,10 @@ size_t compress_output_alignment_bits(compression_type compression)
 std::optional<size_t> compress_max_allowed_chunk_size(compression_type compression)
 {
   switch (compression) {
-    case compression_type::DEFLATE: return 64 * 1024;
-    case compression_type::SNAPPY: return std::nullopt;
-    case compression_type::ZSTD:
-#if NVCOMP_HAS_ZSTD_COMP(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION)
-      return nvcompZstdCompressionMaxAllowedChunkSize;
-#else
-      CUDF_FAIL("Compression error: " +
-                nvcomp::is_compression_disabled(nvcomp::compression_type::ZSTD).value());
-#endif
-    case compression_type::LZ4: return 16 * 1024 * 1024;
+    case compression_type::DEFLATE: return nvcompDeflateCompressionMaxAllowedChunkSize;
+    case compression_type::SNAPPY: return nvcompSnappyCompressionMaxAllowedChunkSize;
+    case compression_type::ZSTD: return nvcompZstdCompressionMaxAllowedChunkSize;
+    case compression_type::LZ4: return nvcompLZ4CompressionMaxAllowedChunkSize;
     default: return std::nullopt;
   }
 }
diff --git a/cpp/src/io/comp/nvcomp_adapter.hpp b/cpp/src/io/comp/nvcomp_adapter.hpp
index 43c79e32375..583bd6a3523 100644
--- a/cpp/src/io/comp/nvcomp_adapter.hpp
+++ b/cpp/src/io/comp/nvcomp_adapter.hpp
@@ -75,20 +75,12 @@ size_t batched_decompress_temp_size(compression_type compression,
                                                     uint32_t max_uncomp_chunk_size);
 
 /**
- * @brief Gets input alignment requirements for the given compression type.
+ * @brief Gets input and output alignment requirements for the given compression type.
  *
  * @param compression Compression type
- * @returns required alignment, in bits
+ * @returns required alignment
  */
-[[nodiscard]] size_t compress_input_alignment_bits(compression_type compression);
-
-/**
- * @brief Gets output alignment requirements for the given compression type.
- *
- * @param compression Compression type
- * @returns required alignment, in bits
- */
-[[nodiscard]] size_t compress_output_alignment_bits(compression_type compression);
+[[nodiscard]] size_t required_alignment(compression_type compression);
 
 /**
  * @brief Maximum size of uncompressed chunks that can be compressed with nvCOMP.
diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu
index ede9fd060b8..ebdf9f3f249 100644
--- a/cpp/src/io/orc/writer_impl.cu
+++ b/cpp/src/io/orc/writer_impl.cu
@@ -532,20 +532,20 @@ auto uncomp_block_alignment(CompressionKind compression_kind)
 {
   if (compression_kind == NONE or
       nvcomp::is_compression_disabled(to_nvcomp_compression_type(compression_kind))) {
-    return 1u;
+    return 1ul;
   }
 
-  return 1u << nvcomp::compress_input_alignment_bits(to_nvcomp_compression_type(compression_kind));
+  return nvcomp::required_alignment(to_nvcomp_compression_type(compression_kind));
 }
 
 auto comp_block_alignment(CompressionKind compression_kind)
 {
   if (compression_kind == NONE or
       nvcomp::is_compression_disabled(to_nvcomp_compression_type(compression_kind))) {
-    return 1u;
+    return 1ul;
   }
 
-  return 1u << nvcomp::compress_output_alignment_bits(to_nvcomp_compression_type(compression_kind));
+  return nvcomp::required_alignment(to_nvcomp_compression_type(compression_kind));
 }
 
 /**
diff --git a/cpp/src/io/parquet/writer_impl_helpers.cpp b/cpp/src/io/parquet/writer_impl_helpers.cpp
index e2f09f872d3..396d44c0763 100644
--- a/cpp/src/io/parquet/writer_impl_helpers.cpp
+++ b/cpp/src/io/parquet/writer_impl_helpers.cpp
@@ -62,7 +62,7 @@ uint32_t page_alignment(Compression codec)
     return 1u;
   }
 
-  return 1u << nvcomp::compress_input_alignment_bits(to_nvcomp_compression_type(codec));
+  return nvcomp::required_alignment(to_nvcomp_compression_type(codec));
 }
 
 size_t max_compression_output_size(Compression codec, uint32_t compression_blocksize)
diff --git a/cpp/tests/io/comp/decomp_test.cpp b/cpp/tests/io/comp/decomp_test.cpp
index 38c1a57eca9..840cf263ed9 100644
--- a/cpp/tests/io/comp/decomp_test.cpp
+++ b/cpp/tests/io/comp/decomp_test.cpp
@@ -176,23 +176,19 @@ TEST_F(NvcompConfigTest, Compression)
   using cudf::io::nvcomp::compression_type;
   auto const& comp_disabled = cudf::io::nvcomp::is_compression_disabled;
 
-  EXPECT_FALSE(comp_disabled(compression_type::DEFLATE, {2, 5, 0, true, true, 0}));
-  // version 2.5 required
-  EXPECT_TRUE(comp_disabled(compression_type::DEFLATE, {2, 4, 0, true, true, 0}));
+  EXPECT_FALSE(comp_disabled(compression_type::DEFLATE, {true, true}));
   // all integrations enabled required
-  EXPECT_TRUE(comp_disabled(compression_type::DEFLATE, {2, 5, 0, false, true, 0}));
+  EXPECT_TRUE(comp_disabled(compression_type::DEFLATE, {false, true}));
 
-  EXPECT_FALSE(comp_disabled(compression_type::ZSTD, {2, 4, 0, true, true, 0}));
-  EXPECT_FALSE(comp_disabled(compression_type::ZSTD, {2, 4, 0, false, true, 0}));
-  // 2.4 version required
-  EXPECT_TRUE(comp_disabled(compression_type::ZSTD, {2, 3, 1, false, true, 0}));
+  EXPECT_FALSE(comp_disabled(compression_type::ZSTD, {true, true}));
+  EXPECT_FALSE(comp_disabled(compression_type::ZSTD, {false, true}));
   // stable integrations enabled required
-  EXPECT_TRUE(comp_disabled(compression_type::ZSTD, {2, 4, 0, false, false, 0}));
+  EXPECT_TRUE(comp_disabled(compression_type::ZSTD, {false, false}));
 
-  EXPECT_FALSE(comp_disabled(compression_type::SNAPPY, {2, 5, 0, true, true, 0}));
-  EXPECT_FALSE(comp_disabled(compression_type::SNAPPY, {2, 4, 0, false, true, 0}));
+  EXPECT_FALSE(comp_disabled(compression_type::SNAPPY, {true, true}));
+  EXPECT_FALSE(comp_disabled(compression_type::SNAPPY, {false, true}));
   // stable integrations enabled required
-  EXPECT_TRUE(comp_disabled(compression_type::SNAPPY, {2, 3, 0, false, false, 0}));
+  EXPECT_TRUE(comp_disabled(compression_type::SNAPPY, {false, false}));
 }
 
 TEST_F(NvcompConfigTest, Decompression)
@@ -200,27 +196,19 @@ TEST_F(NvcompConfigTest, Decompression)
   using cudf::io::nvcomp::compression_type;
   auto const& decomp_disabled = cudf::io::nvcomp::is_decompression_disabled;
 
-  EXPECT_FALSE(decomp_disabled(compression_type::DEFLATE, {2, 5, 0, true, true, 7}));
-  // version 2.5 required
-  EXPECT_TRUE(decomp_disabled(compression_type::DEFLATE, {2, 4, 0, true, true, 7}));
+  EXPECT_FALSE(decomp_disabled(compression_type::DEFLATE, {true, true}));
   // all integrations enabled required
-  EXPECT_TRUE(decomp_disabled(compression_type::DEFLATE, {2, 5, 0, false, true, 7}));
-
-  EXPECT_FALSE(decomp_disabled(compression_type::ZSTD, {2, 4, 0, true, true, 7}));
-  EXPECT_FALSE(decomp_disabled(compression_type::ZSTD, {2, 3, 2, false, true, 6}));
-  EXPECT_FALSE(decomp_disabled(compression_type::ZSTD, {2, 3, 0, true, true, 6}));
-  // 2.3.1 and earlier requires all integrations to be enabled
-  EXPECT_TRUE(decomp_disabled(compression_type::ZSTD, {2, 3, 1, false, true, 7}));
-  // 2.3 version required
-  EXPECT_TRUE(decomp_disabled(compression_type::ZSTD, {2, 2, 0, true, true, 7}));
+  EXPECT_TRUE(decomp_disabled(compression_type::DEFLATE, {false, true}));
+
+  EXPECT_FALSE(decomp_disabled(compression_type::ZSTD, {true, true}));
+  EXPECT_FALSE(decomp_disabled(compression_type::ZSTD, {false, true}));
   // stable integrations enabled required
-  EXPECT_TRUE(decomp_disabled(compression_type::ZSTD, {2, 4, 0, false, false, 7}));
+  EXPECT_TRUE(decomp_disabled(compression_type::ZSTD, {false, false}));
 
-  EXPECT_FALSE(decomp_disabled(compression_type::SNAPPY, {2, 4, 0, true, true, 7}));
-  EXPECT_FALSE(decomp_disabled(compression_type::SNAPPY, {2, 3, 0, false, true, 7}));
-  EXPECT_FALSE(decomp_disabled(compression_type::SNAPPY, {2, 2, 0, false, true, 7}));
+  EXPECT_FALSE(decomp_disabled(compression_type::SNAPPY, {true, true}));
+  EXPECT_FALSE(decomp_disabled(compression_type::SNAPPY, {false, true}));
   // stable integrations enabled required
-  EXPECT_TRUE(decomp_disabled(compression_type::SNAPPY, {2, 2, 0, false, false, 7}));
+  EXPECT_TRUE(decomp_disabled(compression_type::SNAPPY, {false, false}));
 }
 
 CUDF_TEST_PROGRAM_MAIN()

From f21979ec3fbfb97ddab8ee465aadf8e98ad33e65 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Mon, 9 Sep 2024 17:03:37 -0700
Subject: [PATCH 788/842] Extend the Parquet writer's dictionary encoding
 benchmark. (#16591)

This PR extends the data cardinality and run length range for the existing parquet writer's encoding benchmark.

Authors:
  - Muhammad Haseeb (https://github.com/mhaseeb123)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Karthikeyan (https://github.com/karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/16591
---
 cpp/benchmarks/io/parquet/parquet_writer.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/benchmarks/io/parquet/parquet_writer.cpp b/cpp/benchmarks/io/parquet/parquet_writer.cpp
index 46d2927a92b..256e50f0e64 100644
--- a/cpp/benchmarks/io/parquet/parquet_writer.cpp
+++ b/cpp/benchmarks/io/parquet/parquet_writer.cpp
@@ -202,8 +202,8 @@ NVBENCH_BENCH_TYPES(BM_parq_write_encode, NVBENCH_TYPE_AXES(d_type_list))
   .set_name("parquet_write_encode")
   .set_type_axes_names({"data_type"})
   .set_min_samples(4)
-  .add_int64_axis("cardinality", {0, 1000})
-  .add_int64_axis("run_length", {1, 32});
+  .add_int64_axis("cardinality", {0, 1000, 10'000, 100'000})
+  .add_int64_axis("run_length", {1, 8, 32});
 
 NVBENCH_BENCH_TYPES(BM_parq_write_io_compression, NVBENCH_TYPE_AXES(io_list, compression_list))
   .set_name("parquet_write_io_compression")

From afd3a4b4776adf738284c9f0b99e1fc2fcefeec8 Mon Sep 17 00:00:00 2001
From: Mark Harris <783069+harrism@users.noreply.github.com>
Date: Tue, 10 Sep 2024 22:03:48 +1000
Subject: [PATCH 789/842] Add libcudf wrappers around current_device_resource
 functions. (#16679)

Merge after rapidsai/rmm#1661

Creates and uses CUDF internal wrappers around RMM `current_device_resource` functions.

I've marked this PR as breaking because it breaks the ABI, however the API is compatible.

For reviewers, the most substantial additions are in the new file `<cudf/utilities/memory_resource.hpp>`, and in the `DEVELOPER_GUIDE.md` and `*.rst` docs. The rest are all replacements of an include and all calls to `rmm::get_current_device_resource()` with `cudf::get_current_device_resource_ref()`.

Closes #16676

Authors:
  - Mark Harris (https://github.com/harrism)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - https://github.com/nvdbaranec
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/16679
---
 cpp/benchmarks/common/generate_input.cu       | 14 +--
 .../random_column_generator.hpp               | 14 +--
 .../tpch_data_generator/table_helpers.cpp     |  3 +
 .../tpch_data_generator/table_helpers.hpp     | 20 +++--
 .../tpch_data_generator.cpp                   |  3 +
 .../tpch_data_generator.hpp                   | 14 +--
 cpp/benchmarks/copying/contiguous_split.cu    |  5 +-
 cpp/benchmarks/copying/shift.cu               |  5 +-
 cpp/benchmarks/fixture/benchmark_fixture.hpp  | 13 +--
 cpp/benchmarks/fixture/nvbench_fixture.hpp    |  5 +-
 cpp/benchmarks/io/cuio_common.cpp             |  4 +-
 cpp/benchmarks/io/json/nested_json.cpp        |  5 +-
 .../io/orc/orc_reader_multithreaded.cpp       |  3 +-
 .../io/parquet/parquet_reader_multithread.cpp |  3 +-
 cpp/benchmarks/iterator/iterator.cu           |  3 +-
 cpp/benchmarks/join/join_common.hpp           |  3 +-
 cpp/benchmarks/json/json.cu                   |  3 +-
 cpp/benchmarks/lists/copying/scatter_lists.cu |  3 +-
 cpp/benchmarks/lists/set_operations.cpp       |  5 +-
 cpp/benchmarks/merge/merge_lists.cpp          |  9 +-
 cpp/benchmarks/merge/merge_structs.cpp        |  9 +-
 cpp/benchmarks/reduction/rank.cpp             |  3 +-
 cpp/benchmarks/reduction/scan_structs.cpp     |  3 +-
 cpp/benchmarks/search/contains_table.cpp      |  7 +-
 cpp/benchmarks/sort/rank_lists.cpp            |  3 +-
 cpp/benchmarks/sort/rank_structs.cpp          |  3 +-
 cpp/benchmarks/sort/sort_lists.cpp            |  8 +-
 cpp/benchmarks/sort/sort_structs.cpp          |  6 +-
 .../developer_guide/DEVELOPER_GUIDE.md        | 29 +++---
 cpp/examples/basic/src/process_csv.cpp        |  2 +-
 cpp/examples/interop/interop.cpp              |  3 +-
 cpp/examples/nested_types/deduplication.cpp   |  2 +-
 cpp/examples/parquet_io/parquet_io.cpp        |  2 +-
 cpp/examples/strings/common.hpp               |  2 +-
 cpp/examples/tpch/q1.cpp                      |  7 +-
 cpp/examples/tpch/q10.cpp                     |  5 +-
 cpp/examples/tpch/q5.cpp                      |  5 +-
 cpp/examples/tpch/q6.cpp                      |  5 +-
 cpp/examples/tpch/q9.cpp                      |  5 +-
 cpp/examples/tpch/utils.hpp                   |  3 +-
 .../cudf/ast/detail/expression_parser.hpp     |  3 +-
 cpp/include/cudf/binaryop.hpp                 | 14 ++-
 cpp/include/cudf/column/column.hpp            |  7 +-
 cpp/include/cudf/column/column_factories.hpp  | 35 ++++----
 cpp/include/cudf/concatenate.hpp              | 10 +--
 cpp/include/cudf/contiguous_split.hpp         | 13 ++-
 cpp/include/cudf/copying.hpp                  | 40 ++++-----
 cpp/include/cudf/datetime.hpp                 | 44 +++++----
 cpp/include/cudf/detail/binaryop.hpp          |  2 +-
 .../detail/calendrical_month_sequence.cuh     |  2 +-
 cpp/include/cudf/detail/concatenate.hpp       |  2 +-
 cpp/include/cudf/detail/concatenate_masks.hpp |  2 +-
 cpp/include/cudf/detail/contiguous_split.hpp  |  2 +-
 cpp/include/cudf/detail/copy.hpp              |  2 +-
 cpp/include/cudf/detail/copy_if.cuh           |  2 +-
 cpp/include/cudf/detail/copy_if_else.cuh      |  2 +-
 cpp/include/cudf/detail/copy_range.cuh        |  2 +-
 cpp/include/cudf/detail/datetime.hpp          |  3 +-
 .../cudf/detail/distinct_hash_join.cuh        |  2 +-
 cpp/include/cudf/detail/fill.hpp              |  2 +-
 cpp/include/cudf/detail/gather.cuh            |  6 +-
 cpp/include/cudf/detail/gather.hpp            |  2 +-
 cpp/include/cudf/detail/groupby.hpp           |  2 +-
 .../detail/groupby/group_replace_nulls.hpp    |  2 +-
 .../cudf/detail/groupby/sort_helper.hpp       |  2 +-
 .../cudf/detail/hash_reduce_by_row.cuh        |  2 +-
 cpp/include/cudf/detail/interop.hpp           |  2 +-
 cpp/include/cudf/detail/join.hpp              |  2 +-
 cpp/include/cudf/detail/label_bins.hpp        |  3 +-
 cpp/include/cudf/detail/merge.hpp             |  2 +-
 cpp/include/cudf/detail/null_mask.cuh         | 10 +--
 cpp/include/cudf/detail/null_mask.hpp         |  2 +-
 cpp/include/cudf/detail/quantiles.hpp         |  2 +-
 cpp/include/cudf/detail/repeat.hpp            |  2 +-
 cpp/include/cudf/detail/replace.hpp           |  2 +-
 cpp/include/cudf/detail/reshape.hpp           |  2 +-
 cpp/include/cudf/detail/rolling.hpp           |  2 +-
 cpp/include/cudf/detail/round.hpp             |  2 +-
 cpp/include/cudf/detail/scan.hpp              |  2 +-
 cpp/include/cudf/detail/scatter.cuh           |  4 +-
 cpp/include/cudf/detail/scatter.hpp           |  2 +-
 cpp/include/cudf/detail/search.hpp            |  2 +-
 cpp/include/cudf/detail/sequence.hpp          |  6 +-
 .../cudf/detail/sizes_to_offsets_iterator.cuh |  2 +-
 cpp/include/cudf/detail/sorting.hpp           |  2 +-
 cpp/include/cudf/detail/stream_compaction.hpp |  2 +-
 cpp/include/cudf/detail/structs/utilities.hpp |  2 +-
 cpp/include/cudf/detail/tdigest/tdigest.hpp   |  2 +-
 cpp/include/cudf/detail/timezone.hpp          |  4 +-
 cpp/include/cudf/detail/transform.hpp         |  2 +-
 cpp/include/cudf/detail/transpose.hpp         |  2 +-
 cpp/include/cudf/detail/unary.hpp             |  2 +-
 .../cudf/detail/utilities/host_memory.hpp     |  3 +-
 .../cudf/detail/utilities/host_vector.hpp     |  8 +-
 .../detail/utilities/vector_factories.hpp     |  2 +-
 cpp/include/cudf/detail/valid_if.cuh          |  2 +-
 .../cudf/dictionary/detail/concatenate.hpp    |  2 +-
 cpp/include/cudf/dictionary/detail/encode.hpp |  2 +-
 cpp/include/cudf/dictionary/detail/merge.hpp  |  2 +-
 .../cudf/dictionary/detail/replace.hpp        |  2 +-
 cpp/include/cudf/dictionary/detail/search.hpp |  2 +-
 .../cudf/dictionary/detail/update_keys.hpp    |  2 +-
 .../cudf/dictionary/dictionary_factories.hpp  |  9 +-
 cpp/include/cudf/dictionary/encode.hpp        |  8 +-
 cpp/include/cudf/dictionary/search.hpp        |  6 +-
 cpp/include/cudf/dictionary/update_keys.hpp   | 14 ++-
 cpp/include/cudf/filling.hpp                  | 16 ++--
 cpp/include/cudf/groupby.hpp                  | 15 ++--
 cpp/include/cudf/hashing.hpp                  | 22 +++--
 cpp/include/cudf/hashing/detail/hashing.hpp   |  2 +-
 cpp/include/cudf/interop.hpp                  | 34 ++++---
 cpp/include/cudf/io/avro.hpp                  |  6 +-
 cpp/include/cudf/io/csv.hpp                   |  6 +-
 cpp/include/cudf/io/detail/avro.hpp           |  2 +-
 cpp/include/cudf/io/detail/batched_memset.hpp |  6 +-
 cpp/include/cudf/io/detail/csv.hpp            |  2 +-
 cpp/include/cudf/io/detail/json.hpp           |  2 +-
 cpp/include/cudf/io/detail/orc.hpp            |  2 +-
 cpp/include/cudf/io/detail/parquet.hpp        |  3 +-
 cpp/include/cudf/io/detail/tokenize_json.hpp  |  2 +-
 cpp/include/cudf/io/json.hpp                  |  6 +-
 cpp/include/cudf/io/orc.hpp                   | 12 ++-
 cpp/include/cudf/io/parquet.hpp               | 10 +--
 cpp/include/cudf/io/text/detail/trie.hpp      |  1 -
 cpp/include/cudf/io/text/multibyte_split.hpp  |  5 +-
 cpp/include/cudf/join.hpp                     | 59 ++++++------
 cpp/include/cudf/json/json.hpp                |  6 +-
 cpp/include/cudf/labeling/label_bins.hpp      |  5 +-
 cpp/include/cudf/lists/combine.hpp            |  8 +-
 cpp/include/cudf/lists/contains.hpp           | 14 ++-
 cpp/include/cudf/lists/count_elements.hpp     |  6 +-
 cpp/include/cudf/lists/detail/combine.hpp     |  3 +-
 cpp/include/cudf/lists/detail/concatenate.hpp |  2 +-
 cpp/include/cudf/lists/detail/contains.hpp    |  3 +-
 cpp/include/cudf/lists/detail/copying.hpp     |  2 +-
 cpp/include/cudf/lists/detail/extract.hpp     |  3 +-
 cpp/include/cudf/lists/detail/gather.cuh      |  2 +-
 .../cudf/lists/detail/interleave_columns.hpp  |  2 +-
 .../lists/detail/lists_column_factories.hpp   |  4 +-
 cpp/include/cudf/lists/detail/reverse.hpp     |  3 +-
 cpp/include/cudf/lists/detail/scatter.cuh     |  2 +-
 .../cudf/lists/detail/scatter_helper.cuh      |  2 +-
 .../cudf/lists/detail/set_operations.hpp      |  2 +-
 cpp/include/cudf/lists/detail/sorting.hpp     |  2 +-
 .../cudf/lists/detail/stream_compaction.hpp   |  2 +-
 cpp/include/cudf/lists/explode.hpp            | 12 ++-
 cpp/include/cudf/lists/extract.hpp            |  8 +-
 cpp/include/cudf/lists/filling.hpp            |  7 +-
 cpp/include/cudf/lists/gather.hpp             |  6 +-
 cpp/include/cudf/lists/reverse.hpp            |  6 +-
 cpp/include/cudf/lists/set_operations.hpp     | 10 +--
 cpp/include/cudf/lists/sorting.hpp            |  8 +-
 cpp/include/cudf/lists/stream_compaction.hpp  |  7 +-
 cpp/include/cudf/merge.hpp                    |  6 +-
 cpp/include/cudf/null_mask.hpp                | 13 ++-
 cpp/include/cudf/partitioning.hpp             |  9 +-
 cpp/include/cudf/quantiles.hpp                | 10 +--
 cpp/include/cudf/reduction.hpp                | 16 ++--
 .../cudf/reduction/detail/histogram.hpp       |  2 +-
 .../cudf/reduction/detail/reduction.cuh       |  2 +-
 .../cudf/reduction/detail/reduction.hpp       |  3 +-
 .../reduction/detail/reduction_functions.hpp  |  2 +-
 .../detail/segmented_reduction_functions.hpp  |  3 +-
 cpp/include/cudf/replace.hpp                  | 22 +++--
 cpp/include/cudf/reshape.hpp                  | 10 +--
 cpp/include/cudf/rolling.hpp                  | 24 +++--
 cpp/include/cudf/round.hpp                    |  6 +-
 cpp/include/cudf/scalar/scalar.hpp            | 67 +++++++-------
 cpp/include/cudf/scalar/scalar_factories.hpp  | 27 +++---
 cpp/include/cudf/search.hpp                   | 10 +--
 cpp/include/cudf/sorting.hpp                  | 26 +++---
 cpp/include/cudf/stream_compaction.hpp        | 22 +++--
 cpp/include/cudf/strings/attributes.hpp       | 10 +--
 cpp/include/cudf/strings/capitalize.hpp       | 10 +--
 cpp/include/cudf/strings/case.hpp             | 10 +--
 .../cudf/strings/char_types/char_types.hpp    |  8 +-
 cpp/include/cudf/strings/combine.hpp          | 14 ++-
 cpp/include/cudf/strings/contains.hpp         | 14 ++-
 .../cudf/strings/convert/convert_booleans.hpp |  8 +-
 .../cudf/strings/convert/convert_datetime.hpp | 10 +--
 .../strings/convert/convert_durations.hpp     |  8 +-
 .../strings/convert/convert_fixed_point.hpp   | 10 +--
 .../cudf/strings/convert/convert_floats.hpp   | 10 +--
 .../cudf/strings/convert/convert_integers.hpp | 18 ++--
 .../cudf/strings/convert/convert_ipv4.hpp     | 10 +--
 .../cudf/strings/convert/convert_lists.hpp    |  6 +-
 .../cudf/strings/convert/convert_urls.hpp     |  8 +-
 cpp/include/cudf/strings/detail/combine.hpp   |  2 +-
 .../cudf/strings/detail/concatenate.hpp       |  2 +-
 .../cudf/strings/detail/converters.hpp        |  2 +-
 .../cudf/strings/detail/copy_if_else.cuh      |  2 +-
 .../cudf/strings/detail/copy_range.hpp        |  2 +-
 cpp/include/cudf/strings/detail/copying.hpp   |  2 +-
 cpp/include/cudf/strings/detail/fill.hpp      |  2 +-
 cpp/include/cudf/strings/detail/gather.cuh    |  2 +-
 cpp/include/cudf/strings/detail/merge.hpp     |  1 +
 cpp/include/cudf/strings/detail/replace.hpp   |  2 +-
 cpp/include/cudf/strings/detail/scan.hpp      |  2 +-
 cpp/include/cudf/strings/detail/scatter.cuh   |  4 +-
 .../cudf/strings/detail/strings_children.cuh  |  2 +-
 .../detail/strings_column_factories.cuh       |  2 +-
 cpp/include/cudf/strings/detail/utilities.hpp |  2 +-
 cpp/include/cudf/strings/extract.hpp          |  8 +-
 cpp/include/cudf/strings/find.hpp             | 22 +++--
 cpp/include/cudf/strings/find_multiple.hpp    |  6 +-
 cpp/include/cudf/strings/findall.hpp          |  6 +-
 cpp/include/cudf/strings/padding.hpp          |  8 +-
 cpp/include/cudf/strings/repeat_strings.hpp   | 10 +--
 cpp/include/cudf/strings/replace.hpp          | 10 +--
 cpp/include/cudf/strings/replace_re.hpp       | 10 +--
 cpp/include/cudf/strings/reverse.hpp          |  6 +-
 cpp/include/cudf/strings/slice.hpp            |  8 +-
 cpp/include/cudf/strings/split/partition.hpp  |  8 +-
 cpp/include/cudf/strings/split/split.hpp      | 12 ++-
 cpp/include/cudf/strings/split/split_re.hpp   | 12 ++-
 cpp/include/cudf/strings/strip.hpp            |  6 +-
 cpp/include/cudf/strings/translate.hpp        |  8 +-
 cpp/include/cudf/strings/utilities.hpp        |  6 +-
 cpp/include/cudf/strings/wrap.hpp             |  6 +-
 .../cudf/structs/detail/concatenate.hpp       |  3 +-
 cpp/include/cudf/structs/detail/scan.hpp      |  2 +-
 cpp/include/cudf/table/table.hpp              |  7 +-
 cpp/include/cudf/timezone.hpp                 |  6 +-
 cpp/include/cudf/transform.hpp                | 22 +++--
 cpp/include/cudf/transpose.hpp                |  6 +-
 cpp/include/cudf/unary.hpp                    | 16 ++--
 .../cudf/utilities/memory_resource.hpp        | 90 +++++++++++++++++++
 cpp/include/cudf/utilities/pinned_memory.hpp  |  3 +-
 cpp/include/cudf_test/base_fixture.hpp        |  7 +-
 cpp/include/cudf_test/column_wrapper.hpp      | 16 ++--
 .../stream_checking_resource_adaptor.hpp      |  2 +-
 cpp/include/cudf_test/tdigest_utilities.cuh   | 20 ++---
 cpp/include/cudf_test/testing_main.hpp        |  8 +-
 cpp/include/doxygen_groups.h                  |  3 +-
 cpp/include/nvtext/byte_pair_encoding.hpp     | 11 ++-
 cpp/include/nvtext/detail/generate_ngrams.hpp |  3 +-
 cpp/include/nvtext/detail/load_hash_file.hpp  |  2 +-
 cpp/include/nvtext/detail/tokenize.hpp        |  2 +-
 cpp/include/nvtext/edit_distance.hpp          |  7 +-
 cpp/include/nvtext/generate_ngrams.hpp        |  9 +-
 cpp/include/nvtext/jaccard.hpp                |  5 +-
 cpp/include/nvtext/minhash.hpp                | 11 ++-
 cpp/include/nvtext/ngrams_tokenize.hpp        |  5 +-
 cpp/include/nvtext/normalize.hpp              |  7 +-
 cpp/include/nvtext/replace.hpp                |  7 +-
 cpp/include/nvtext/stemmer.hpp                |  9 +-
 cpp/include/nvtext/subword_tokenize.hpp       |  7 +-
 cpp/include/nvtext/tokenize.hpp               | 21 +++--
 cpp/src/binaryop/binaryop.cpp                 |  2 +-
 cpp/src/binaryop/compiled/binary_ops.cu       |  4 +-
 cpp/src/binaryop/compiled/binary_ops.hpp      |  2 +-
 cpp/src/bitmask/null_mask.cu                  |  2 +-
 cpp/src/column/column.cu                      |  2 +-
 cpp/src/column/column_factories.cpp           |  3 +-
 cpp/src/column/column_factories.cu            |  3 +-
 cpp/src/copying/concatenate.cu                |  6 +-
 cpp/src/copying/contiguous_split.cu           |  6 +-
 cpp/src/copying/copy.cpp                      |  2 +-
 cpp/src/copying/copy.cu                       |  5 +-
 cpp/src/copying/copy_range.cu                 |  6 +-
 cpp/src/copying/gather.cu                     |  2 +-
 cpp/src/copying/get_element.cu                |  2 +-
 cpp/src/copying/pack.cpp                      |  2 +-
 cpp/src/copying/purge_nonempty_nulls.cu       |  3 +-
 cpp/src/copying/reverse.cu                    |  3 +-
 cpp/src/copying/sample.cu                     |  2 +-
 cpp/src/copying/scatter.cu                    |  8 +-
 cpp/src/copying/segmented_shift.cu            |  2 +-
 cpp/src/copying/shift.cu                      |  2 +-
 cpp/src/datetime/datetime_ops.cu              |  2 +-
 cpp/src/datetime/timezone.cpp                 |  3 +-
 cpp/src/dictionary/add_keys.cu                |  6 +-
 cpp/src/dictionary/decode.cu                  |  2 +-
 cpp/src/dictionary/detail/concatenate.cu      |  7 +-
 cpp/src/dictionary/detail/merge.cu            |  2 +-
 cpp/src/dictionary/dictionary_factories.cu    |  2 +-
 cpp/src/dictionary/encode.cu                  |  2 +-
 cpp/src/dictionary/remove_keys.cu             |  2 +-
 cpp/src/dictionary/replace.cu                 |  4 +-
 cpp/src/dictionary/search.cu                  |  2 +-
 cpp/src/dictionary/set_keys.cu                |  4 +-
 cpp/src/filling/calendrical_month_sequence.cu |  2 +-
 cpp/src/filling/fill.cu                       |  5 +-
 cpp/src/filling/repeat.cu                     |  3 +-
 cpp/src/filling/sequence.cu                   |  2 +-
 cpp/src/groupby/common/utils.hpp              |  3 +-
 cpp/src/groupby/groupby.cu                    |  7 +-
 cpp/src/groupby/hash/groupby.cu               |  8 +-
 cpp/src/groupby/sort/aggregate.cpp            |  6 +-
 cpp/src/groupby/sort/functors.hpp             |  2 +-
 cpp/src/groupby/sort/group_argmax.cu          |  2 +-
 cpp/src/groupby/sort/group_argmin.cu          |  2 +-
 cpp/src/groupby/sort/group_collect.cu         |  2 +-
 cpp/src/groupby/sort/group_correlation.cu     |  2 +-
 cpp/src/groupby/sort/group_count.cu           |  2 +-
 cpp/src/groupby/sort/group_count_scan.cu      |  2 +-
 cpp/src/groupby/sort/group_histogram.cu       |  2 +-
 cpp/src/groupby/sort/group_m2.cu              |  2 +-
 cpp/src/groupby/sort/group_max.cu             |  3 +-
 cpp/src/groupby/sort/group_max_scan.cu        |  3 +-
 cpp/src/groupby/sort/group_merge_lists.cu     |  2 +-
 cpp/src/groupby/sort/group_merge_m2.cu        |  2 +-
 cpp/src/groupby/sort/group_min.cu             |  3 +-
 cpp/src/groupby/sort/group_min_scan.cu        |  3 +-
 cpp/src/groupby/sort/group_nth_element.cu     |  2 +-
 cpp/src/groupby/sort/group_nunique.cu         |  2 +-
 cpp/src/groupby/sort/group_product.cu         |  2 +-
 cpp/src/groupby/sort/group_product_scan.cu    |  3 +-
 cpp/src/groupby/sort/group_quantiles.cu       |  4 +-
 cpp/src/groupby/sort/group_rank_scan.cu       |  6 +-
 cpp/src/groupby/sort/group_reductions.hpp     |  2 +-
 cpp/src/groupby/sort/group_replace_nulls.cu   |  2 +-
 cpp/src/groupby/sort/group_scan.hpp           |  2 +-
 cpp/src/groupby/sort/group_scan_util.cuh      |  2 +-
 .../sort/group_single_pass_reduction_util.cuh |  2 +-
 cpp/src/groupby/sort/group_std.cu             |  2 +-
 cpp/src/groupby/sort/group_sum.cu             |  2 +-
 cpp/src/groupby/sort/group_sum_scan.cu        |  3 +-
 cpp/src/groupby/sort/scan.cpp                 | 12 +--
 cpp/src/groupby/sort/sort_helper.cu           | 14 +--
 cpp/src/hash/md5_hash.cu                      |  2 +-
 cpp/src/hash/murmurhash3_x64_128.cu           |  2 +-
 cpp/src/hash/murmurhash3_x86_32.cu            |  2 +-
 cpp/src/hash/sha1_hash.cu                     |  2 +-
 cpp/src/hash/sha224_hash.cu                   |  2 +-
 cpp/src/hash/sha256_hash.cu                   |  2 +-
 cpp/src/hash/sha384_hash.cu                   |  2 +-
 cpp/src/hash/sha512_hash.cu                   |  2 +-
 cpp/src/hash/sha_hash.cuh                     |  2 +-
 cpp/src/hash/xxhash_64.cu                     |  2 +-
 cpp/src/interop/arrow_utilities.cpp           |  1 -
 cpp/src/interop/arrow_utilities.hpp           |  3 +-
 .../interop/decimal_conversion_utilities.cuh  |  2 +-
 cpp/src/interop/dlpack.cpp                    |  2 +-
 cpp/src/interop/from_arrow_device.cu          |  2 +-
 cpp/src/interop/from_arrow_host.cu            |  2 +-
 cpp/src/interop/from_arrow_stream.cu          |  1 -
 cpp/src/interop/to_arrow_device.cu            |  3 +-
 cpp/src/interop/to_arrow_host.cu              |  3 +-
 cpp/src/io/avro/reader_impl.cu                |  8 +-
 cpp/src/io/comp/uncomp.cpp                    |  3 +-
 cpp/src/io/csv/csv_gpu.cu                     |  3 +-
 cpp/src/io/csv/durations.cu                   |  2 +-
 cpp/src/io/csv/durations.hpp                  |  3 +-
 cpp/src/io/csv/reader_impl.cu                 | 14 +--
 cpp/src/io/csv/writer_impl.cu                 | 13 ++-
 cpp/src/io/functions.cpp                      |  3 +-
 cpp/src/io/json/json_column.cu                | 14 +--
 cpp/src/io/json/json_normalization.cu         |  2 +-
 cpp/src/io/json/json_tree.cu                  |  2 +-
 cpp/src/io/json/nested_json.hpp               |  3 +-
 cpp/src/io/json/nested_json_gpu.cu            |  8 +-
 cpp/src/io/json/read_json.cu                  | 10 +--
 cpp/src/io/json/read_json.hpp                 |  2 +-
 cpp/src/io/json/write_json.cu                 | 26 +++---
 cpp/src/io/orc/reader_impl.hpp                |  2 +-
 cpp/src/io/orc/reader_impl_decode.cu          |  8 +-
 cpp/src/io/orc/reader_impl_helpers.cpp        |  2 +-
 cpp/src/io/orc/reader_impl_helpers.hpp        |  2 +-
 cpp/src/io/orc/stripe_enc.cu                  |  3 +-
 cpp/src/io/orc/writer_impl.cu                 | 13 +--
 cpp/src/io/parquet/predicate_pushdown.cpp     |  6 +-
 cpp/src/io/parquet/reader.cpp                 |  2 +-
 cpp/src/io/parquet/reader_impl.cpp            |  5 +-
 cpp/src/io/parquet/reader_impl.hpp            |  5 +-
 cpp/src/io/parquet/reader_impl_chunking.cu    | 15 ++--
 cpp/src/io/parquet/reader_impl_preprocess.cu  |  9 +-
 cpp/src/io/parquet/writer_impl.cu             | 13 +--
 cpp/src/io/text/multibyte_split.cu            |  7 +-
 cpp/src/io/utilities/column_buffer.cpp        |  6 +-
 cpp/src/io/utilities/column_buffer.hpp        |  5 +-
 cpp/src/io/utilities/data_casting.cu          |  2 +-
 cpp/src/io/utilities/output_builder.cuh       |  4 +-
 cpp/src/io/utilities/string_parsing.hpp       |  2 +-
 cpp/src/io/utilities/trie.cu                  |  5 +-
 cpp/src/join/conditional_join.cu              |  2 +-
 cpp/src/join/conditional_join.hpp             |  3 +-
 cpp/src/join/cross_join.cu                    |  2 +-
 cpp/src/join/distinct_hash_join.cu            |  5 +-
 cpp/src/join/hash_join.cu                     |  4 +-
 cpp/src/join/join.cu                          |  8 +-
 cpp/src/join/join_common_utils.cuh            |  2 +-
 cpp/src/join/join_utils.cu                    |  3 +-
 cpp/src/join/mixed_join.cu                    |  6 +-
 cpp/src/join/mixed_join_semi.cu               |  4 +-
 cpp/src/join/mixed_join_size_kernel.hpp       |  3 +
 cpp/src/join/semi_join.cu                     |  4 +-
 cpp/src/json/json_path.cu                     |  6 +-
 cpp/src/labeling/label_bins.cu                |  2 +-
 .../combine/concatenate_list_elements.cu      |  2 +-
 cpp/src/lists/combine/concatenate_rows.cu     |  8 +-
 cpp/src/lists/contains.cu                     |  6 +-
 cpp/src/lists/copying/concatenate.cu          |  2 +-
 cpp/src/lists/copying/copying.cu              |  2 +-
 cpp/src/lists/copying/gather.cu               |  2 +-
 cpp/src/lists/copying/scatter_helper.cu       |  3 +-
 cpp/src/lists/copying/segmented_gather.cu     |  2 +-
 cpp/src/lists/count_elements.cu               |  2 +-
 cpp/src/lists/dremel.cu                       |  3 +-
 cpp/src/lists/explode.cu                      |  2 +-
 cpp/src/lists/extract.cu                      |  4 +-
 cpp/src/lists/interleave_columns.cu           |  4 +-
 cpp/src/lists/lists_column_factories.cu       |  4 +-
 cpp/src/lists/reverse.cu                      |  4 +-
 cpp/src/lists/segmented_sort.cu               |  2 +-
 cpp/src/lists/sequences.cu                    |  2 +-
 cpp/src/lists/set_operations.cu               | 26 +++---
 .../stream_compaction/apply_boolean_mask.cu   |  4 +-
 cpp/src/lists/stream_compaction/distinct.cu   |  4 +-
 cpp/src/lists/utilities.cu                    |  3 +-
 cpp/src/lists/utilities.hpp                   |  2 +-
 cpp/src/merge/merge.cu                        | 12 +--
 cpp/src/partitioning/partitioning.cu          |  6 +-
 cpp/src/partitioning/round_robin.cu           |  2 +-
 cpp/src/quantiles/quantile.cu                 |  4 +-
 cpp/src/quantiles/quantiles.cu                |  6 +-
 cpp/src/quantiles/tdigest/tdigest.cu          |  4 +-
 .../quantiles/tdigest/tdigest_aggregation.cu  |  8 +-
 cpp/src/reductions/all.cu                     |  5 +-
 cpp/src/reductions/any.cu                     |  5 +-
 cpp/src/reductions/collect_ops.cu             |  3 +-
 cpp/src/reductions/compound.cuh               |  3 +-
 cpp/src/reductions/histogram.cu               |  5 +-
 cpp/src/reductions/max.cu                     |  2 +-
 cpp/src/reductions/mean.cu                    |  2 +-
 cpp/src/reductions/min.cu                     |  3 +-
 cpp/src/reductions/minmax.cu                  |  2 +-
 .../reductions/nested_type_minmax_util.cuh    |  5 +-
 cpp/src/reductions/nth_element.cu             |  2 +-
 cpp/src/reductions/product.cu                 |  2 +-
 cpp/src/reductions/reductions.cpp             |  6 +-
 cpp/src/reductions/scan/rank_scan.cu          |  4 +-
 cpp/src/reductions/scan/scan.cpp              |  3 +-
 cpp/src/reductions/scan/scan.cuh              |  2 +-
 cpp/src/reductions/scan/scan_exclusive.cu     |  2 +-
 cpp/src/reductions/scan/scan_inclusive.cu     |  2 +-
 cpp/src/reductions/segmented/all.cu           |  3 +-
 cpp/src/reductions/segmented/any.cu           |  3 +-
 cpp/src/reductions/segmented/compound.cuh     |  5 +-
 cpp/src/reductions/segmented/counts.cu        |  3 +-
 cpp/src/reductions/segmented/counts.hpp       |  2 +-
 cpp/src/reductions/segmented/max.cu           |  3 +-
 cpp/src/reductions/segmented/mean.cu          |  2 +-
 cpp/src/reductions/segmented/min.cu           |  3 +-
 cpp/src/reductions/segmented/nunique.cu       |  2 +-
 cpp/src/reductions/segmented/product.cu       |  3 +-
 cpp/src/reductions/segmented/reductions.cpp   |  2 +-
 cpp/src/reductions/segmented/simple.cuh       |  4 +-
 cpp/src/reductions/segmented/std.cu           |  2 +-
 cpp/src/reductions/segmented/sum.cu           |  3 +-
 .../reductions/segmented/sum_of_squares.cu    |  2 +-
 .../reductions/segmented/update_validity.cu   |  3 +-
 .../reductions/segmented/update_validity.hpp  |  2 +-
 cpp/src/reductions/segmented/var.cu           |  2 +-
 cpp/src/reductions/simple.cuh                 |  4 +-
 cpp/src/reductions/std.cu                     |  2 +-
 cpp/src/reductions/sum.cu                     |  2 +-
 cpp/src/reductions/sum_of_squares.cu          |  2 +-
 cpp/src/reductions/var.cu                     |  2 +-
 cpp/src/replace/clamp.cu                      |  4 +-
 cpp/src/replace/nans.cu                       |  2 +-
 cpp/src/replace/nulls.cu                      |  2 +-
 cpp/src/replace/replace.cu                    |  8 +-
 cpp/src/reshape/byte_cast.cu                  |  2 +-
 cpp/src/reshape/interleave_columns.cu         |  2 +-
 cpp/src/reshape/tile.cu                       |  2 +-
 cpp/src/rolling/detail/lead_lag_nested.cuh    |  5 +-
 cpp/src/rolling/detail/nth_element.cuh        |  2 +-
 .../detail/optimized_unbounded_window.cpp     |  5 +-
 .../detail/optimized_unbounded_window.hpp     |  2 +-
 cpp/src/rolling/detail/rolling.cuh            |  4 +-
 cpp/src/rolling/detail/rolling.hpp            |  3 +-
 .../rolling/detail/rolling_collect_list.cu    |  2 +-
 .../rolling/detail/rolling_collect_list.cuh   |  2 +-
 .../rolling/detail/rolling_fixed_window.cu    |  3 +-
 .../rolling/detail/rolling_variable_window.cu |  3 +-
 cpp/src/rolling/grouped_rolling.cu            |  7 +-
 cpp/src/rolling/rolling.cu                    |  3 +-
 cpp/src/round/round.cu                        |  2 +-
 cpp/src/scalar/scalar.cpp                     |  4 +-
 cpp/src/scalar/scalar_factories.cpp           |  2 +-
 cpp/src/search/contains_column.cu             |  6 +-
 cpp/src/search/contains_scalar.cu             |  3 +-
 cpp/src/search/contains_table.cu              |  4 +-
 cpp/src/search/search_ordered.cu              |  4 +-
 cpp/src/sort/rank.cu                          |  2 +-
 cpp/src/sort/segmented_sort.cu                |  2 +-
 cpp/src/sort/segmented_sort_impl.cuh          | 18 ++--
 cpp/src/sort/sort.cu                          |  4 +-
 cpp/src/sort/sort_column.cu                   |  3 +-
 cpp/src/sort/sort_column_impl.cuh             |  2 +-
 cpp/src/sort/sort_impl.cuh                    |  3 +-
 cpp/src/sort/stable_segmented_sort.cu         |  3 +-
 cpp/src/sort/stable_sort.cu                   |  4 +-
 cpp/src/sort/stable_sort_column.cu            |  3 +-
 .../stream_compaction/apply_boolean_mask.cu   |  2 +-
 cpp/src/stream_compaction/distinct.cu         |  5 +-
 cpp/src/stream_compaction/distinct_count.cu   |  3 +-
 .../stream_compaction/distinct_helpers.hpp    |  2 +-
 cpp/src/stream_compaction/drop_nans.cu        |  2 +-
 cpp/src/stream_compaction/drop_nulls.cu       |  2 +-
 cpp/src/stream_compaction/stable_distinct.cu  |  5 +-
 cpp/src/stream_compaction/unique.cu           |  2 +-
 cpp/src/strings/attributes.cu                 |  2 +-
 cpp/src/strings/capitalize.cu                 |  2 +-
 cpp/src/strings/case.cu                       |  2 +-
 cpp/src/strings/char_types/char_types.cu      |  2 +-
 cpp/src/strings/combine/concatenate.cu        |  2 +-
 cpp/src/strings/combine/join.cu               |  2 +-
 cpp/src/strings/combine/join_list_elements.cu |  2 +-
 cpp/src/strings/contains.cu                   |  2 +-
 cpp/src/strings/convert/convert_booleans.cu   |  2 +-
 cpp/src/strings/convert/convert_datetime.cu   |  4 +-
 cpp/src/strings/convert/convert_durations.cu  |  2 +-
 .../strings/convert/convert_fixed_point.cu    |  2 +-
 cpp/src/strings/convert/convert_floats.cu     |  2 +-
 cpp/src/strings/convert/convert_hex.cu        |  2 +-
 cpp/src/strings/convert/convert_integers.cu   |  2 +-
 cpp/src/strings/convert/convert_ipv4.cu       |  2 +-
 cpp/src/strings/convert/convert_lists.cu      |  2 +-
 cpp/src/strings/convert/convert_urls.cu       |  2 +-
 cpp/src/strings/copying/concatenate.cu        |  4 +-
 cpp/src/strings/copying/copy_range.cu         |  2 +-
 cpp/src/strings/copying/copying.cu            |  2 +-
 cpp/src/strings/copying/shift.cu              |  2 +-
 cpp/src/strings/count_matches.cu              |  3 +-
 cpp/src/strings/count_matches.hpp             |  2 +-
 cpp/src/strings/extract/extract.cu            |  2 +-
 cpp/src/strings/extract/extract_all.cu        |  2 +-
 cpp/src/strings/filling/fill.cu               |  2 +-
 cpp/src/strings/filter_chars.cu               |  6 +-
 cpp/src/strings/like.cu                       |  2 +-
 cpp/src/strings/padding.cu                    |  2 +-
 cpp/src/strings/regex/utilities.cuh           |  2 +-
 cpp/src/strings/repeat_strings.cu             |  2 +-
 cpp/src/strings/replace/backref_re.cu         |  4 +-
 cpp/src/strings/replace/find_replace.cu       |  2 +-
 cpp/src/strings/replace/multi.cu              |  8 +-
 cpp/src/strings/replace/multi_re.cu           |  4 +-
 cpp/src/strings/replace/replace.cu            |  4 +-
 cpp/src/strings/replace/replace_nulls.cu      |  2 +-
 cpp/src/strings/replace/replace_re.cu         |  2 +-
 cpp/src/strings/replace/replace_slice.cu      |  2 +-
 cpp/src/strings/reverse.cu                    |  2 +-
 cpp/src/strings/scan/scan_inclusive.cu        |  2 +-
 cpp/src/strings/search/find.cu                |  2 +-
 cpp/src/strings/search/find_multiple.cu       |  2 +-
 cpp/src/strings/search/findall.cu             |  2 +-
 cpp/src/strings/slice.cu                      |  2 +-
 cpp/src/strings/split/partition.cu            |  2 +-
 cpp/src/strings/split/split.cu                |  2 +-
 cpp/src/strings/split/split.cuh               |  2 +-
 cpp/src/strings/split/split_re.cu             |  6 +-
 cpp/src/strings/split/split_record.cu         |  2 +-
 cpp/src/strings/strings_column_factories.cu   |  2 +-
 cpp/src/strings/strings_scalar_factories.cpp  |  2 +-
 cpp/src/strings/strip.cu                      |  2 +-
 cpp/src/strings/translate.cu                  |  6 +-
 cpp/src/strings/utilities.cu                  |  2 +-
 cpp/src/strings/wrap.cu                       |  2 +-
 cpp/src/structs/copying/concatenate.cu        |  2 +-
 cpp/src/structs/scan/scan_inclusive.cu        |  2 +-
 cpp/src/structs/structs_column_factories.cu   |  2 +-
 cpp/src/structs/utilities.cpp                 |  3 +-
 cpp/src/table/row_operators.cu                | 26 +++---
 cpp/src/table/table.cpp                       |  2 +-
 cpp/src/text/bpe/byte_pair_encoding.cu        |  2 +-
 cpp/src/text/bpe/load_merge_pairs.cu          |  2 +-
 cpp/src/text/detokenize.cu                    |  4 +-
 cpp/src/text/edit_distance.cu                 |  2 +-
 cpp/src/text/generate_ngrams.cu               |  4 +-
 cpp/src/text/jaccard.cu                       |  2 +-
 cpp/src/text/minhash.cu                       |  2 +-
 cpp/src/text/ngrams_tokenize.cu               |  8 +-
 cpp/src/text/normalize.cu                     |  2 +-
 cpp/src/text/replace.cu                       |  2 +-
 cpp/src/text/stemmer.cu                       |  2 +-
 cpp/src/text/subword/load_hash_file.cu        |  2 +-
 cpp/src/text/subword/subword_tokenize.cu      |  2 +-
 cpp/src/text/tokenize.cu                      |  6 +-
 cpp/src/text/vocabulary_tokenize.cu           |  2 +-
 cpp/src/transform/bools_to_mask.cu            |  2 +-
 cpp/src/transform/compute_column.cu           |  2 +-
 cpp/src/transform/encode.cu                   |  2 +-
 cpp/src/transform/mask_to_bools.cu            |  2 +-
 cpp/src/transform/nans_to_nulls.cu            |  2 +-
 cpp/src/transform/one_hot_encode.cu           |  2 +-
 cpp/src/transform/row_bit_count.cu            |  4 +-
 cpp/src/transform/transform.cpp               |  2 +-
 cpp/src/transpose/transpose.cu                |  2 +-
 cpp/src/unary/cast_ops.cu                     |  2 +-
 cpp/src/unary/math_ops.cu                     |  4 +-
 cpp/src/unary/nan_ops.cu                      |  2 +-
 cpp/src/unary/null_ops.cu                     |  3 +-
 cpp/src/unary/unary_ops.cuh                   |  2 +-
 cpp/src/utilities/host_memory.cpp             |  2 +-
 cpp/tests/bitmask/bitmask_tests.cpp           |  3 +-
 cpp/tests/bitmask/valid_if_tests.cu           | 11 +--
 cpp/tests/column/column_test.cpp              |  5 +-
 cpp/tests/copying/detail_gather_tests.cu      |  9 +-
 cpp/tests/copying/gather_str_tests.cpp        | 11 ++-
 cpp/tests/copying/shift_tests.cpp             |  6 +-
 cpp/tests/copying/split_tests.cpp             |  9 +-
 .../device_atomics/device_atomics_test.cu     |  5 +-
 cpp/tests/dictionary/search_test.cpp          |  9 +-
 cpp/tests/fixed_point/fixed_point_tests.cu    |  7 +-
 cpp/tests/groupby/histogram_tests.cpp         |  5 +-
 cpp/tests/groupby/tdigest_tests.cu            | 15 ++--
 cpp/tests/io/json/json_chunked_reader.cu      |  6 +-
 .../io/json/json_quote_normalization_test.cpp |  3 +-
 cpp/tests/io/json/json_tree.cpp               | 23 ++---
 cpp/tests/io/json/json_type_cast_test.cu      |  9 +-
 .../json_whitespace_normalization_test.cu     |  5 +-
 cpp/tests/io/json/nested_json_test.cpp        | 41 ++++-----
 cpp/tests/io/orc_chunked_reader_test.cu       |  5 +-
 cpp/tests/io/parquet_chunked_reader_test.cu   |  3 +-
 cpp/tests/io/parquet_writer_test.cpp          |  3 +-
 cpp/tests/io/type_inference_test.cu           | 29 +++---
 cpp/tests/iterator/iterator_tests.cuh         |  3 +-
 cpp/tests/iterator/value_iterator_test.cuh    |  5 +-
 .../iterator/value_iterator_test_strings.cu   | 11 +--
 cpp/tests/join/distinct_join_tests.cpp        |  3 +-
 cpp/tests/join/join_tests.cpp                 |  7 +-
 cpp/tests/join/semi_anti_join_tests.cpp       |  5 +-
 cpp/tests/large_strings/json_tests.cu         |  3 +-
 .../large_strings/large_strings_fixture.cpp   |  2 +-
 .../partitioning/hash_partition_test.cpp      |  3 +-
 .../quantiles/percentile_approx_test.cpp      |  3 +-
 .../reductions/segmented_reduction_tests.cpp  | 71 +++++++--------
 cpp/tests/scalar/scalar_device_view_test.cu   |  3 +-
 cpp/tests/sort/segmented_sort_tests.cpp       |  3 +-
 cpp/tests/streams/reduction_test.cpp          |  5 +-
 cpp/tests/strings/contains_tests.cpp          |  5 +-
 cpp/tests/strings/factories_test.cu           | 11 +--
 cpp/tests/strings/integers_tests.cpp          |  3 +-
 cpp/tests/structs/utilities_tests.cpp         | 39 ++++----
 cpp/tests/table/table_view_tests.cu           |  5 +-
 cpp/tests/types/type_dispatcher_test.cu       |  5 +-
 cpp/tests/utilities/tdigest_utilities.cu      |  7 +-
 .../utilities_tests/batched_memset_tests.cu   |  3 +-
 .../utilities_tests/pinned_memory_tests.cpp   |  1 -
 cpp/tests/utilities_tests/span_tests.cu       |  3 +-
 .../source/libcudf_docs/api_docs/index.rst    |  1 +
 .../libcudf_docs/api_docs/memory_resource.rst |  5 ++
 .../main/native/include/maps_column_view.hpp  | 11 ++-
 java/src/main/native/src/ColumnVectorJni.cpp  |  5 +-
 java/src/main/native/src/ColumnViewJni.cu     | 10 +--
 java/src/main/native/src/RmmJni.cpp           |  9 +-
 java/src/main/native/src/TableJni.cpp         |  5 +-
 java/src/main/native/src/maps_column_view.cu  |  6 +-
 .../strings/src/strings/udf/udf_apis.cu       |  3 +-
 .../pylibcudf/pylibcudf/libcudf/interop.pxd   |  4 +-
 652 files changed, 1771 insertions(+), 1824 deletions(-)
 create mode 100644 cpp/include/cudf/utilities/memory_resource.hpp
 create mode 100644 docs/cudf/source/libcudf_docs/api_docs/memory_resource.rst

diff --git a/cpp/benchmarks/common/generate_input.cu b/cpp/benchmarks/common/generate_input.cu
index 0970003deb2..dc258e32dc5 100644
--- a/cpp/benchmarks/common/generate_input.cu
+++ b/cpp/benchmarks/common/generate_input.cu
@@ -28,10 +28,10 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/device_buffer.hpp>
 #include <rmm/device_uvector.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
 
 #include <cuda/functional>
 #include <thrust/binary_search.h>
@@ -507,7 +507,7 @@ std::unique_ptr<cudf::column> create_random_column(data_profile const& profile,
                            null_mask.end(),
                            thrust::identity<bool>{},
                            cudf::get_default_stream(),
-                           rmm::mr::get_current_device_resource());
+                           cudf::get_current_device_resource_ref());
 
   return std::make_unique<cudf::column>(
     dtype,
@@ -591,7 +591,7 @@ std::unique_ptr<cudf::column> create_random_utf8_string_column(data_profile cons
                            null_mask.end() - 1,
                            thrust::identity<bool>{},
                            cudf::get_default_stream(),
-                           rmm::mr::get_current_device_resource());
+                           cudf::get_current_device_resource_ref());
   return cudf::make_strings_column(
     num_rows,
     std::make_unique<cudf::column>(std::move(offsets), rmm::device_buffer{}, 0),
@@ -626,7 +626,7 @@ std::unique_ptr<cudf::column> create_random_column<cudf::string_view>(data_profi
                                         cudf::out_of_bounds_policy::DONT_CHECK,
                                         cudf::detail::negative_index_policy::NOT_ALLOWED,
                                         cudf::get_default_stream(),
-                                        rmm::mr::get_current_device_resource());
+                                        cudf::get_current_device_resource_ref());
   return std::move(str_table->release()[0]);
 }
 
@@ -688,7 +688,7 @@ std::unique_ptr<cudf::column> create_random_column<cudf::struct_view>(data_profi
                                         valids.end(),
                                         thrust::identity<bool>{},
                                         cudf::get_default_stream(),
-                                        rmm::mr::get_current_device_resource());
+                                        cudf::get_current_device_resource_ref());
         }
         return std::pair<rmm::device_buffer, cudf::size_type>{};
       }();
@@ -782,7 +782,7 @@ std::unique_ptr<cudf::column> create_random_column<cudf::list_view>(data_profile
                                                           valids.end(),
                                                           thrust::identity<bool>{},
                                                           cudf::get_default_stream(),
-                                                          rmm::mr::get_current_device_resource());
+                                                          cudf::get_current_device_resource_ref());
     list_column                  = cudf::make_lists_column(
       current_num_rows,
       std::move(offsets_column),
@@ -933,7 +933,7 @@ std::pair<rmm::device_buffer, cudf::size_type> create_random_null_mask(
                                   thrust::make_counting_iterator<cudf::size_type>(size),
                                   bool_generator{seed, 1.0 - *null_probability},
                                   cudf::get_default_stream(),
-                                  rmm::mr::get_current_device_resource());
+                                  cudf::get_current_device_resource_ref());
   }
 }
 
diff --git a/cpp/benchmarks/common/tpch_data_generator/random_column_generator.hpp b/cpp/benchmarks/common/tpch_data_generator/random_column_generator.hpp
index 3e254f49805..0bf1eee4e85 100644
--- a/cpp/benchmarks/common/tpch_data_generator/random_column_generator.hpp
+++ b/cpp/benchmarks/common/tpch_data_generator/random_column_generator.hpp
@@ -17,6 +17,8 @@
 #pragma once
 
 #include <cudf/column/column.hpp>
+#include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <string>
 
@@ -36,7 +38,7 @@ std::unique_ptr<cudf::column> generate_random_string_column(
   cudf::size_type upper,
   cudf::size_type num_rows,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Generate a column of random numbers
@@ -61,7 +63,7 @@ std::unique_ptr<cudf::column> generate_random_numeric_column(
   T upper,
   cudf::size_type num_rows,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Generate a primary key column
@@ -81,7 +83,7 @@ std::unique_ptr<cudf::column> generate_primary_key_column(
   cudf::scalar const& start,
   cudf::size_type num_rows,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Generate a column where all the rows have the same string value
@@ -101,7 +103,7 @@ std::unique_ptr<cudf::column> generate_repeat_string_column(
   std::string const& value,
   cudf::size_type num_rows,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Generate a column by randomly choosing from set of strings
@@ -121,7 +123,7 @@ std::unique_ptr<cudf::column> generate_random_string_column_from_set(
   cudf::host_span<const char* const> set,
   cudf::size_type num_rows,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Generate a column consisting of a repeating sequence of integers
@@ -145,6 +147,6 @@ std::unique_ptr<cudf::column> generate_repeat_sequence_column(
   bool zero_indexed,
   cudf::size_type num_rows,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 }  // namespace cudf::datagen
diff --git a/cpp/benchmarks/common/tpch_data_generator/table_helpers.cpp b/cpp/benchmarks/common/tpch_data_generator/table_helpers.cpp
index 36bf9c49cea..d4368906702 100644
--- a/cpp/benchmarks/common/tpch_data_generator/table_helpers.cpp
+++ b/cpp/benchmarks/common/tpch_data_generator/table_helpers.cpp
@@ -35,6 +35,9 @@
 #include <cudf/transform.hpp>
 #include <cudf/unary.hpp>
 
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
+
 #include <vector>
 
 namespace cudf::datagen {
diff --git a/cpp/benchmarks/common/tpch_data_generator/table_helpers.hpp b/cpp/benchmarks/common/tpch_data_generator/table_helpers.hpp
index 11091689469..7d862afe755 100644
--- a/cpp/benchmarks/common/tpch_data_generator/table_helpers.hpp
+++ b/cpp/benchmarks/common/tpch_data_generator/table_helpers.hpp
@@ -20,6 +20,8 @@
 #include <cudf/column/column_view.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
+#include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <vector>
 
@@ -37,7 +39,7 @@ std::unique_ptr<cudf::column> add_calendrical_days(
   cudf::column_view const& timestamp_days,
   cudf::column_view const& days,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Perform a left join operation between two tables
@@ -56,7 +58,7 @@ std::unique_ptr<cudf::table> perform_left_join(
   std::vector<cudf::size_type> const& left_on,
   std::vector<cudf::size_type> const& right_on,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Generate the `p_retailprice` column of the `part` table
@@ -68,7 +70,7 @@ std::unique_ptr<cudf::table> perform_left_join(
 [[nodiscard]] std::unique_ptr<cudf::column> calculate_p_retailprice(
   cudf::column_view const& p_partkey,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Generate the `l_suppkey` column of the `lineitem` table
@@ -84,7 +86,7 @@ std::unique_ptr<cudf::table> perform_left_join(
   cudf::size_type scale_factor,
   cudf::size_type num_rows,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Generate the `ps_suppkey` column of the `partsupp` table
@@ -100,7 +102,7 @@ std::unique_ptr<cudf::table> perform_left_join(
   cudf::size_type scale_factor,
   cudf::size_type num_rows,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 /**
  * @brief Calculate the cardinality of the `lineitem` table
  *
@@ -111,7 +113,7 @@ std::unique_ptr<cudf::table> perform_left_join(
 [[nodiscard]] cudf::size_type calculate_l_cardinality(
   cudf::column_view const& o_rep_freqs,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 /**
  * @brief Calculate the charge column for the `lineitem` table
  *
@@ -126,7 +128,7 @@ std::unique_ptr<cudf::table> perform_left_join(
   cudf::column_view const& tax,
   cudf::column_view const& discount,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Generate a column of random addresses according to TPC-H specification clause 4.2.2.7
@@ -138,7 +140,7 @@ std::unique_ptr<cudf::table> perform_left_join(
 [[nodiscard]] std::unique_ptr<cudf::column> generate_address_column(
   cudf::size_type num_rows,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Generate a phone number column according to TPC-H specification clause 4.2.2.9
@@ -150,6 +152,6 @@ std::unique_ptr<cudf::table> perform_left_join(
 [[nodiscard]] std::unique_ptr<cudf::column> generate_phone_column(
   cudf::size_type num_rows,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 }  // namespace cudf::datagen
diff --git a/cpp/benchmarks/common/tpch_data_generator/tpch_data_generator.cpp b/cpp/benchmarks/common/tpch_data_generator/tpch_data_generator.cpp
index 9001c50c5a5..236fe8095ad 100644
--- a/cpp/benchmarks/common/tpch_data_generator/tpch_data_generator.cpp
+++ b/cpp/benchmarks/common/tpch_data_generator/tpch_data_generator.cpp
@@ -36,6 +36,9 @@
 #include <cudf/transform.hpp>
 #include <cudf/unary.hpp>
 
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
+
 #include <array>
 #include <string>
 #include <vector>
diff --git a/cpp/benchmarks/common/tpch_data_generator/tpch_data_generator.hpp b/cpp/benchmarks/common/tpch_data_generator/tpch_data_generator.hpp
index a6286dd8dba..6e09c1e5708 100644
--- a/cpp/benchmarks/common/tpch_data_generator/tpch_data_generator.hpp
+++ b/cpp/benchmarks/common/tpch_data_generator/tpch_data_generator.hpp
@@ -17,6 +17,8 @@
 #pragma once
 
 #include <cudf/table/table.hpp>
+#include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace datagen {
@@ -32,7 +34,7 @@ std::tuple<std::unique_ptr<cudf::table>, std::unique_ptr<cudf::table>, std::uniq
 generate_orders_lineitem_part(
   double scale_factor,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Generate the `partsupp` table
@@ -44,7 +46,7 @@ generate_orders_lineitem_part(
 std::unique_ptr<cudf::table> generate_partsupp(
   double scale_factor,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Generate the `supplier` table
@@ -56,7 +58,7 @@ std::unique_ptr<cudf::table> generate_partsupp(
 std::unique_ptr<cudf::table> generate_supplier(
   double scale_factor,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Generate the `customer` table
@@ -68,7 +70,7 @@ std::unique_ptr<cudf::table> generate_supplier(
 std::unique_ptr<cudf::table> generate_customer(
   double scale_factor,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Generate the `nation` table
@@ -78,7 +80,7 @@ std::unique_ptr<cudf::table> generate_customer(
  */
 std::unique_ptr<cudf::table> generate_nation(
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Generate the `region` table
@@ -88,7 +90,7 @@ std::unique_ptr<cudf::table> generate_nation(
  */
 std::unique_ptr<cudf::table> generate_region(
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 }  // namespace datagen
 }  // namespace CUDF_EXPORT cudf
diff --git a/cpp/benchmarks/copying/contiguous_split.cu b/cpp/benchmarks/copying/contiguous_split.cu
index 910fc689c0b..161f67425c1 100644
--- a/cpp/benchmarks/copying/contiguous_split.cu
+++ b/cpp/benchmarks/copying/contiguous_split.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@
 
 #include <cudf/column/column.hpp>
 #include <cudf/contiguous_split.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 
@@ -32,7 +33,7 @@ void contiguous_split(cudf::table_view const& src_table, std::vector<cudf::size_
 
 void chunked_pack(cudf::table_view const& src_table, std::vector<cudf::size_type> const&)
 {
-  auto const mr     = rmm::mr::get_current_device_resource();
+  auto const mr     = cudf::get_current_device_resource_ref();
   auto const stream = cudf::get_default_stream();
   auto user_buffer  = rmm::device_uvector<std::uint8_t>(100L * 1024 * 1024, stream, mr);
   auto chunked_pack = cudf::chunked_pack::create(src_table, user_buffer.size(), mr);
diff --git a/cpp/benchmarks/copying/shift.cu b/cpp/benchmarks/copying/shift.cu
index efc385cf10b..8f8e17ad4d0 100644
--- a/cpp/benchmarks/copying/shift.cu
+++ b/cpp/benchmarks/copying/shift.cu
@@ -20,14 +20,13 @@
 #include <cudf/copying.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
-
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 template <typename T, typename ScalarType = cudf::scalar_type_t<T>>
 std::unique_ptr<cudf::scalar> make_scalar(
   T value                           = 0,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref())
 {
   auto s = new ScalarType(value, true, stream, mr);
   return std::unique_ptr<cudf::scalar>(s);
diff --git a/cpp/benchmarks/fixture/benchmark_fixture.hpp b/cpp/benchmarks/fixture/benchmark_fixture.hpp
index 8900899f9be..2f697ab0459 100644
--- a/cpp/benchmarks/fixture/benchmark_fixture.hpp
+++ b/cpp/benchmarks/fixture/benchmark_fixture.hpp
@@ -16,10 +16,11 @@
 
 #pragma once
 
+#include <cudf/utilities/memory_resource.hpp>
+
 #include <rmm/cuda_device.hpp>
 #include <rmm/mr/device/cuda_memory_resource.hpp>
 #include <rmm/mr/device/owning_wrapper.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/mr/device/pool_memory_resource.hpp>
 #include <rmm/mr/device/statistics_resource_adaptor.hpp>
 
@@ -83,13 +84,13 @@ class benchmark : public ::benchmark::Fixture {
   void SetUp(::benchmark::State const& state) override
   {
     mr = make_pool_instance();
-    rmm::mr::set_current_device_resource(mr.get());  // set default resource to pool
+    cudf::set_current_device_resource(mr.get());  // set default resource to pool
   }
 
   void TearDown(::benchmark::State const& state) override
   {
     // reset default resource to the initial resource
-    rmm::mr::set_current_device_resource(nullptr);
+    cudf::set_current_device_resource(nullptr);
     mr.reset();
   }
 
@@ -106,13 +107,13 @@ class benchmark : public ::benchmark::Fixture {
 class memory_stats_logger {
  public:
   memory_stats_logger()
-    : existing_mr(rmm::mr::get_current_device_resource()),
+    : existing_mr(cudf::get_current_device_resource()),
       statistics_mr(rmm::mr::statistics_resource_adaptor(existing_mr))
   {
-    rmm::mr::set_current_device_resource(&statistics_mr);
+    cudf::set_current_device_resource(&statistics_mr);
   }
 
-  ~memory_stats_logger() { rmm::mr::set_current_device_resource(existing_mr); }
+  ~memory_stats_logger() { cudf::set_current_device_resource(existing_mr); }
 
   [[nodiscard]] size_t peak_memory_usage() const noexcept
   {
diff --git a/cpp/benchmarks/fixture/nvbench_fixture.hpp b/cpp/benchmarks/fixture/nvbench_fixture.hpp
index df1492690bb..63f09285a26 100644
--- a/cpp/benchmarks/fixture/nvbench_fixture.hpp
+++ b/cpp/benchmarks/fixture/nvbench_fixture.hpp
@@ -16,6 +16,7 @@
 #pragma once
 
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/pinned_memory.hpp>
 
 #include <rmm/cuda_device.hpp>
@@ -24,10 +25,8 @@
 #include <rmm/mr/device/cuda_memory_resource.hpp>
 #include <rmm/mr/device/managed_memory_resource.hpp>
 #include <rmm/mr/device/owning_wrapper.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/mr/device/pool_memory_resource.hpp>
 #include <rmm/mr/pinned_host_memory_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <string>
 
@@ -110,7 +109,7 @@ struct nvbench_base_fixture {
     }
 
     mr = create_memory_resource(rmm_mode);
-    rmm::mr::set_current_device_resource(mr.get());
+    cudf::set_current_device_resource(mr.get());
     std::cout << "RMM memory resource = " << rmm_mode << "\n";
 
     cudf::set_pinned_memory_resource(create_cuio_host_memory_resource(cuio_host_mode));
diff --git a/cpp/benchmarks/io/cuio_common.cpp b/cpp/benchmarks/io/cuio_common.cpp
index 645994f3f0d..fe24fb58728 100644
--- a/cpp/benchmarks/io/cuio_common.cpp
+++ b/cpp/benchmarks/io/cuio_common.cpp
@@ -18,9 +18,9 @@
 
 #include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/detail/utilities/logger.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/mr/pinned_host_memory_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <unistd.h>
 
@@ -34,7 +34,7 @@ temp_directory const cuio_source_sink_pair::tmpdir{"cudf_gbench"};
 // Don't use cudf's pinned pool for the source data
 rmm::host_async_resource_ref pinned_memory_resource()
 {
-  static rmm::mr::pinned_host_memory_resource mr = rmm::mr::pinned_host_memory_resource{};
+  static auto mr = rmm::mr::pinned_host_memory_resource{};
 
   return mr;
 }
diff --git a/cpp/benchmarks/io/json/nested_json.cpp b/cpp/benchmarks/io/json/nested_json.cpp
index 9fd8de172a3..ae3528b783c 100644
--- a/cpp/benchmarks/io/json/nested_json.cpp
+++ b/cpp/benchmarks/io/json/nested_json.cpp
@@ -23,6 +23,7 @@
 #include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/strings/repeat_strings.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <nvbench/nvbench.cuh>
 
@@ -170,7 +171,7 @@ void BM_NESTED_JSON(nvbench::state& state)
       cudf::device_span<char const>{input->data(), static_cast<size_t>(input->size())},
       default_options,
       cudf::get_default_stream(),
-      rmm::mr::get_current_device_resource());
+      cudf::get_current_device_resource_ref());
   });
 
   auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
@@ -201,7 +202,7 @@ void BM_NESTED_JSON_DEPTH(nvbench::state& state)
   state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
     // Allocate device-side temporary storage & run algorithm
     cudf::io::json::detail::device_parse_nested_json(
-      input, default_options, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+      input, default_options, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   });
 
   auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
diff --git a/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp b/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp
index e91bf06fdfa..6f20b4bd457 100644
--- a/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp
+++ b/cpp/benchmarks/io/orc/orc_reader_multithreaded.cpp
@@ -23,6 +23,7 @@
 #include <cudf/detail/utilities/stream_pool.hpp>
 #include <cudf/io/orc.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/pinned_memory.hpp>
 
 #include <BS_thread_pool.hpp>
@@ -109,7 +110,7 @@ void BM_orc_multithreaded_read_common(nvbench::state& state,
                    auto const stream = streams[index % num_threads];
                    cudf::io::orc_reader_options read_opts =
                      cudf::io::orc_reader_options::builder(source_info_vector[index]);
-                   cudf::io::read_orc(read_opts, stream, rmm::mr::get_current_device_resource());
+                   cudf::io::read_orc(read_opts, stream, cudf::get_current_device_resource_ref());
                  };
 
                  threads.pause();
diff --git a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp
index 9e76ebb71ab..3abd4280081 100644
--- a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp
+++ b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp
@@ -22,6 +22,7 @@
 #include <cudf/detail/utilities/stream_pool.hpp>
 #include <cudf/io/parquet.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/pinned_memory.hpp>
 
 #include <nvtx3/nvtx3.hpp>
@@ -111,7 +112,7 @@ void BM_parquet_multithreaded_read_common(nvbench::state& state,
                  auto const stream = streams[index % num_threads];
                  cudf::io::parquet_reader_options read_opts =
                    cudf::io::parquet_reader_options::builder(source_info_vector[index]);
-                 cudf::io::read_parquet(read_opts, stream, rmm::mr::get_current_device_resource());
+                 cudf::io::read_parquet(read_opts, stream, cudf::get_current_device_resource_ref());
                };
 
                threads.pause();
diff --git a/cpp/benchmarks/iterator/iterator.cu b/cpp/benchmarks/iterator/iterator.cu
index fd0cebb12ea..e2576c0d690 100644
--- a/cpp/benchmarks/iterator/iterator.cu
+++ b/cpp/benchmarks/iterator/iterator.cu
@@ -23,6 +23,7 @@
 #include <cudf/detail/utilities/device_operators.cuh>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/device_uvector.hpp>
 
@@ -138,7 +139,7 @@ void BM_iterator(benchmark::State& state)
 
   // Initialize dev_result to false
   auto dev_result = cudf::detail::make_zeroed_device_uvector_sync<TypeParam>(
-    1, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   for (auto _ : state) {
     cuda_event_timer raii(state, true);  // flush_l2_cache = true, stream = 0
     if (cub_or_thrust) {
diff --git a/cpp/benchmarks/join/join_common.hpp b/cpp/benchmarks/join/join_common.hpp
index 3d9d9c57548..1f1ca414ad1 100644
--- a/cpp/benchmarks/join/join_common.hpp
+++ b/cpp/benchmarks/join/join_common.hpp
@@ -29,6 +29,7 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <thrust/functional.h>
 #include <thrust/iterator/counting_iterator.h>
@@ -86,7 +87,7 @@ void BM_join(state_type& state, Join JoinFunc)
                                   validity + size,
                                   thrust::identity<bool>{},
                                   cudf::get_default_stream(),
-                                  rmm::mr::get_current_device_resource());
+                                  cudf::get_current_device_resource_ref());
   };
 
   std::unique_ptr<cudf::column> right_key_column0 = [&]() {
diff --git a/cpp/benchmarks/json/json.cu b/cpp/benchmarks/json/json.cu
index 06b793bf5f1..6d01f132189 100644
--- a/cpp/benchmarks/json/json.cu
+++ b/cpp/benchmarks/json/json.cu
@@ -25,6 +25,7 @@
 #include <cudf/strings/string_view.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <nvbench/nvbench.cuh>
 
@@ -171,7 +172,7 @@ auto build_json_string_column(int desired_bytes, int num_rows)
   json_benchmark_row_builder jb{
     desired_bytes, num_rows, {*d_books, *d_bicycles}, *d_book_pct, *d_misc_order, *d_store_order};
   auto [offsets, chars] = cudf::strings::detail::make_strings_children(
-    jb, num_rows, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    jb, num_rows, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   return cudf::make_strings_column(num_rows, std::move(offsets), chars.release(), 0, {});
 }
 
diff --git a/cpp/benchmarks/lists/copying/scatter_lists.cu b/cpp/benchmarks/lists/copying/scatter_lists.cu
index 570decf410f..526a43d9ff5 100644
--- a/cpp/benchmarks/lists/copying/scatter_lists.cu
+++ b/cpp/benchmarks/lists/copying/scatter_lists.cu
@@ -22,6 +22,7 @@
 #include <cudf/null_mask.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/exec_policy.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
@@ -38,7 +39,7 @@ template <class TypeParam, bool coalesce>
 void BM_lists_scatter(::benchmark::State& state)
 {
   auto stream = cudf::get_default_stream();
-  auto mr     = rmm::mr::get_current_device_resource();
+  auto mr     = cudf::get_current_device_resource_ref();
 
   cudf::size_type const base_size{(cudf::size_type)state.range(0)};
   cudf::size_type const num_elements_per_row{(cudf::size_type)state.range(1)};
diff --git a/cpp/benchmarks/lists/set_operations.cpp b/cpp/benchmarks/lists/set_operations.cpp
index 6bed33d2570..8a94227c23b 100644
--- a/cpp/benchmarks/lists/set_operations.cpp
+++ b/cpp/benchmarks/lists/set_operations.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,6 +17,7 @@
 #include <benchmarks/common/generate_input.hpp>
 
 #include <cudf/lists/set_operations.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <nvbench/nvbench.cuh>
 
@@ -55,7 +56,7 @@ void nvbench_set_op(nvbench::state& state, BenchFuncPtr bfunc)
           cudf::null_equality::EQUAL,
           cudf::nan_equality::ALL_EQUAL,
           cudf::get_default_stream(),
-          rmm::mr::get_current_device_resource());
+          cudf::get_current_device_resource_ref());
   });
 }
 
diff --git a/cpp/benchmarks/merge/merge_lists.cpp b/cpp/benchmarks/merge/merge_lists.cpp
index bcb9f10ac83..2fe8b02055b 100644
--- a/cpp/benchmarks/merge/merge_lists.cpp
+++ b/cpp/benchmarks/merge/merge_lists.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,6 +18,7 @@
 
 #include <cudf/detail/merge.hpp>
 #include <cudf/detail/sorting.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <nvbench/nvbench.cuh>
 
@@ -27,11 +28,11 @@ void nvbench_merge_list(nvbench::state& state)
 
   auto const input1 = create_lists_data(state);
   auto const sorted_input1 =
-    cudf::detail::sort(*input1, {}, {}, stream, rmm::mr::get_current_device_resource());
+    cudf::detail::sort(*input1, {}, {}, stream, cudf::get_current_device_resource_ref());
 
   auto const input2 = create_lists_data(state);
   auto const sorted_input2 =
-    cudf::detail::sort(*input2, {}, {}, stream, rmm::mr::get_current_device_resource());
+    cudf::detail::sort(*input2, {}, {}, stream, cudf::get_current_device_resource_ref());
 
   stream.synchronize();
 
@@ -43,7 +44,7 @@ void nvbench_merge_list(nvbench::state& state)
                         {cudf::order::ASCENDING},
                         {},
                         stream_view,
-                        rmm::mr::get_current_device_resource());
+                        cudf::get_current_device_resource_ref());
   });
 }
 
diff --git a/cpp/benchmarks/merge/merge_structs.cpp b/cpp/benchmarks/merge/merge_structs.cpp
index 9c56b44b623..cfb44d2737f 100644
--- a/cpp/benchmarks/merge/merge_structs.cpp
+++ b/cpp/benchmarks/merge/merge_structs.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,6 +18,7 @@
 
 #include <cudf/detail/merge.hpp>
 #include <cudf/detail/sorting.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <nvbench/nvbench.cuh>
 
@@ -27,11 +28,11 @@ void nvbench_merge_struct(nvbench::state& state)
 
   auto const input1 = create_structs_data(state);
   auto const sorted_input1 =
-    cudf::detail::sort(*input1, {}, {}, stream, rmm::mr::get_current_device_resource());
+    cudf::detail::sort(*input1, {}, {}, stream, cudf::get_current_device_resource_ref());
 
   auto const input2 = create_structs_data(state);
   auto const sorted_input2 =
-    cudf::detail::sort(*input2, {}, {}, stream, rmm::mr::get_current_device_resource());
+    cudf::detail::sort(*input2, {}, {}, stream, cudf::get_current_device_resource_ref());
 
   stream.synchronize();
 
@@ -43,7 +44,7 @@ void nvbench_merge_struct(nvbench::state& state)
                         {cudf::order::ASCENDING},
                         {},
                         stream_view,
-                        rmm::mr::get_current_device_resource());
+                        cudf::get_current_device_resource_ref());
   });
 }
 
diff --git a/cpp/benchmarks/reduction/rank.cpp b/cpp/benchmarks/reduction/rank.cpp
index 14876c80d3e..05aeed47fa6 100644
--- a/cpp/benchmarks/reduction/rank.cpp
+++ b/cpp/benchmarks/reduction/rank.cpp
@@ -21,6 +21,7 @@
 #include <cudf/detail/scan.hpp>
 #include <cudf/filling.hpp>
 #include <cudf/lists/list_view.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <nvbench/nvbench.cuh>
 
@@ -45,7 +46,7 @@ static void nvbench_reduction_scan(nvbench::state& state, nvbench::type_list<typ
   state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
     rmm::cuda_stream_view stream_view{launch.get_stream()};
     result = cudf::detail::inclusive_dense_rank_scan(
-      input, stream_view, rmm::mr::get_current_device_resource());
+      input, stream_view, cudf::get_current_device_resource_ref());
   });
 
   state.add_element_count(input.size());
diff --git a/cpp/benchmarks/reduction/scan_structs.cpp b/cpp/benchmarks/reduction/scan_structs.cpp
index a781f75a314..2de1db6dfe5 100644
--- a/cpp/benchmarks/reduction/scan_structs.cpp
+++ b/cpp/benchmarks/reduction/scan_structs.cpp
@@ -20,6 +20,7 @@
 
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/scan.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <nvbench/nvbench.cuh>
 
@@ -57,7 +58,7 @@ static void nvbench_structs_scan(nvbench::state& state)
   std::unique_ptr<cudf::column> result = nullptr;
   state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
     result = cudf::detail::scan_inclusive(
-      input_view, *agg, null_policy, stream, rmm::mr::get_current_device_resource());
+      input_view, *agg, null_policy, stream, cudf::get_current_device_resource_ref());
   });
 
   state.add_element_count(input_view.size());
diff --git a/cpp/benchmarks/search/contains_table.cpp b/cpp/benchmarks/search/contains_table.cpp
index 17702d0741c..3bc1ac9c70a 100644
--- a/cpp/benchmarks/search/contains_table.cpp
+++ b/cpp/benchmarks/search/contains_table.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,8 +20,7 @@
 #include <cudf/detail/search.hpp>
 #include <cudf/lists/list_view.hpp>
 #include <cudf/types.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <nvbench/nvbench.cuh>
 
@@ -58,7 +57,7 @@ static void nvbench_contains_table(nvbench::state& state, nvbench::type_list<Typ
                              cudf::null_equality::EQUAL,
                              cudf::nan_equality::ALL_EQUAL,
                              stream_view,
-                             rmm::mr::get_current_device_resource());
+                             cudf::get_current_device_resource_ref());
   });
 
   state.add_buffer_size(
diff --git a/cpp/benchmarks/sort/rank_lists.cpp b/cpp/benchmarks/sort/rank_lists.cpp
index 7015fe08089..8dfede3cb3a 100644
--- a/cpp/benchmarks/sort/rank_lists.cpp
+++ b/cpp/benchmarks/sort/rank_lists.cpp
@@ -21,6 +21,7 @@
 #include <cudf_test/column_utilities.hpp>
 
 #include <cudf/sorting.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <nvbench/nvbench.cuh>
 
@@ -39,7 +40,7 @@ void nvbench_rank_lists(nvbench::state& state, nvbench::type_list<nvbench::enum_
                cudf::null_order::AFTER,
                false,
                cudf::get_default_stream(),
-               rmm::mr::get_current_device_resource());
+               cudf::get_current_device_resource_ref());
   });
 }
 
diff --git a/cpp/benchmarks/sort/rank_structs.cpp b/cpp/benchmarks/sort/rank_structs.cpp
index 8b4b09464d8..7575ba48a1a 100644
--- a/cpp/benchmarks/sort/rank_structs.cpp
+++ b/cpp/benchmarks/sort/rank_structs.cpp
@@ -19,6 +19,7 @@
 #include <benchmarks/common/generate_nested_types.hpp>
 
 #include <cudf/sorting.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <nvbench/nvbench.cuh>
 
@@ -37,7 +38,7 @@ void nvbench_rank_structs(nvbench::state& state, nvbench::type_list<nvbench::enu
                cudf::null_order::AFTER,
                false,
                cudf::get_default_stream(),
-               rmm::mr::get_current_device_resource());
+               cudf::get_current_device_resource_ref());
   });
 }
 
diff --git a/cpp/benchmarks/sort/sort_lists.cpp b/cpp/benchmarks/sort/sort_lists.cpp
index 2052de3688c..abc89472538 100644
--- a/cpp/benchmarks/sort/sort_lists.cpp
+++ b/cpp/benchmarks/sort/sort_lists.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,6 +17,7 @@
 #include <benchmarks/common/generate_nested_types.hpp>
 
 #include <cudf/detail/sorting.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <nvbench/nvbench.cuh>
 
@@ -33,7 +34,7 @@ void sort_multiple_lists(nvbench::state& state)
   state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
   state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
     cudf::detail::sorted_order(
-      *input_table, {}, {}, stream, rmm::mr::get_current_device_resource());
+      *input_table, {}, {}, stream, cudf::get_current_device_resource_ref());
   });
 }
 
@@ -76,7 +77,8 @@ void sort_lists_of_structs(nvbench::state& state)
   state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
   state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
     rmm::cuda_stream_view stream_view{launch.get_stream()};
-    cudf::detail::sorted_order(input_table, {}, {}, stream, rmm::mr::get_current_device_resource());
+    cudf::detail::sorted_order(
+      input_table, {}, {}, stream, cudf::get_current_device_resource_ref());
   });
 }
 
diff --git a/cpp/benchmarks/sort/sort_structs.cpp b/cpp/benchmarks/sort/sort_structs.cpp
index 3a3d1080ba0..fa1cf0279dd 100644
--- a/cpp/benchmarks/sort/sort_structs.cpp
+++ b/cpp/benchmarks/sort/sort_structs.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,6 +17,7 @@
 #include <benchmarks/common/generate_nested_types.hpp>
 
 #include <cudf/detail/sorting.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <nvbench/nvbench.cuh>
 
@@ -26,7 +27,8 @@ void nvbench_sort_struct(nvbench::state& state)
 
   state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
     rmm::cuda_stream_view stream_view{launch.get_stream()};
-    cudf::detail::sorted_order(*input, {}, {}, stream_view, rmm::mr::get_current_device_resource());
+    cudf::detail::sorted_order(
+      *input, {}, {}, stream_view, cudf::get_current_device_resource_ref());
   });
 }
 
diff --git a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md
index aa054ba93e9..fce8adb4c06 100644
--- a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md
+++ b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md
@@ -223,17 +223,17 @@ can be passed to libcudf functions via `rmm::device_async_resource_ref` paramete
 
 ### Current Device Memory Resource
 
-RMM provides a "default" memory resource for each device that can be accessed and updated via the
-`rmm::mr::get_current_device_resource()` and `rmm::mr::set_current_device_resource(...)` functions,
-respectively. All memory resource parameters should be defaulted to use the return value of
-`rmm::mr::get_current_device_resource()`.
+RMM provides a "default" memory resource for each device and functions to access and set it. libcudf
+provides wrappers for these functions in `cpp/include/cudf/utilities/memory_resource.hpp`.
+All memory resource parameters should be defaulted to use the return value of
+`cudf::get_current_device_resource_ref()`.
 
 ### Resource Refs
 
 Memory resources are passed via resource ref parameters. A resource ref is a memory resource wrapper
 that enables consumers to specify properties of resources that they expect. These are defined
-in the `cuda::mr` namespace of libcu++, but RMM provides some convenience wrappers in
-`rmm/resource_ref.hpp`:
+in the `cuda::mr` namespace of libcu++, but RMM provides some convenience aliases in
+`rmm/resource_ref.hpp`.
  - `rmm::device_resource_ref` accepts a memory resource that provides synchronous allocation
     of device-accessible memory.
  - `rmm::device_async_resource_ref` accepts a memory resource that provides stream-ordered allocation
@@ -247,7 +247,8 @@ in the `cuda::mr` namespace of libcu++, but RMM provides some convenience wrappe
  - `rmm::host_async_resource_ref` accepts a memory resource that provides stream-ordered allocation
     of host- and device-accessible memory.
 
-See the libcu++ [docs on `resource_ref`](https://nvidia.github.io/cccl/libcudacxx/extended_api/memory_resource/resource_ref.html) for more information.
+See the libcu++ [docs on `resource_ref`](https://nvidia.github.io/cccl/libcudacxx/extended_api/memory_resource/resource_ref.html)
+for more information.
 
 ## cudf::column
 
@@ -515,7 +516,7 @@ For example:
 // cpp/include/cudf/header.hpp
 void external_function(...,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 // cpp/include/cudf/detail/header.hpp
 namespace detail{
@@ -575,7 +576,7 @@ whose outputs will be returned. Example:
 // Returned `column` contains newly allocated memory,
 // therefore the API must accept a memory resource pointer
 std::unique_ptr<column> returns_output_memory(
-  ..., rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  ..., rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 // This API does not allocate any new *output* memory, therefore
 // a memory resource is unnecessary
@@ -586,17 +587,17 @@ This rule automatically applies to all detail APIs that allocate memory. Any det
 called by any public API, and therefore could be allocating memory that is returned to the user.
 To support such uses cases, all detail APIs allocating memory resources should accept an `mr`
 parameter. Callers are responsible for either passing through a provided `mr` or
-`rmm::mr::get_current_device_resource()` as needed.
+`cudf::get_current_device_resource_ref()` as needed.
 
 ### Temporary Memory
 
 Not all memory allocated within a libcudf API is returned to the caller. Often algorithms must
 allocate temporary, scratch memory for intermediate results. Always use the default resource
-obtained from `rmm::mr::get_current_device_resource()` for temporary memory allocations. Example:
+obtained from `cudf::get_current_device_resource_ref()` for temporary memory allocations. Example:
 
 ```c++
 rmm::device_buffer some_function(
-  ..., rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) {
+  ..., rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) {
     rmm::device_buffer returned_buffer(..., mr); // Returned buffer uses the passed in MR
     ...
     rmm::device_buffer temporary_buffer(...); // Temporary buffer uses default MR
@@ -613,7 +614,7 @@ use memory resources for device memory allocation with automated lifetime manage
 #### rmm::device_buffer
 Allocates a specified number of bytes of untyped, uninitialized device memory using a
 memory resource. If no `rmm::device_async_resource_ref` is explicitly provided, it uses
-`rmm::mr::get_current_device_resource()`.
+`cudf::get_current_device_resource_ref()`.
 
 `rmm::device_buffer` is movable and copyable on a stream. A copy performs a deep copy of the
 `device_buffer`'s device memory on the specified stream, whereas a move moves ownership of the
@@ -685,7 +686,7 @@ rmm::device_uvector<int32_t> v(100, s);
 // Initializes the elements to 0
 thrust::uninitialized_fill(thrust::cuda::par.on(s.value()), v.begin(), v.end(), int32_t{0});
 
-rmm::mr::device_memory_resource * mr = new my_custom_resource{...};
+auto mr = new my_custom_resource{...};
 // Allocates uninitialized storage for 100 `int32_t` elements on stream `s` using the resource `mr`
 rmm::device_uvector<int32_t> v2{100, s, mr};
 ```
diff --git a/cpp/examples/basic/src/process_csv.cpp b/cpp/examples/basic/src/process_csv.cpp
index 0d2b6b099ac..d27789a78a6 100644
--- a/cpp/examples/basic/src/process_csv.cpp
+++ b/cpp/examples/basic/src/process_csv.cpp
@@ -90,7 +90,7 @@ int main(int argc, char** argv)
   // it being set as the default
   // Also, call this before the first libcudf API call to ensure all data is allocated by the same
   // memory resource.
-  rmm::mr::set_current_device_resource(&mr);
+  cudf::set_current_device_resource(&mr);
 
   // Read data
   auto stock_table_with_metadata = read_csv("4stock_5day.csv");
diff --git a/cpp/examples/interop/interop.cpp b/cpp/examples/interop/interop.cpp
index 8271c3836e4..133a4e3a514 100644
--- a/cpp/examples/interop/interop.cpp
+++ b/cpp/examples/interop/interop.cpp
@@ -17,6 +17,7 @@
 #include <cudf/column/column_factories.hpp>
 #include <cudf/interop.hpp>
 #include <cudf/io/csv.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/mr/device/device_memory_resource.hpp>
 
@@ -104,7 +105,7 @@ auto make_chars_and_offsets(std::vector<std::string> const& strings)
 std::unique_ptr<cudf::column> arrow_string_view_to_cudf_column(
   std::shared_ptr<arrow::StringViewArray> const& array,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref())
 {
   // Convert the string views into chars and offsets
   std::vector<std::string> strings;
diff --git a/cpp/examples/nested_types/deduplication.cpp b/cpp/examples/nested_types/deduplication.cpp
index c7c54592b70..f067b358f2d 100644
--- a/cpp/examples/nested_types/deduplication.cpp
+++ b/cpp/examples/nested_types/deduplication.cpp
@@ -192,7 +192,7 @@ int main(int argc, char const** argv)
 
   auto pool     = mr_name == "pool";
   auto resource = create_memory_resource(pool);
-  rmm::mr::set_current_device_resource(resource.get());
+  cudf::set_current_device_resource(resource.get());
 
   std::cout << "Reading " << input_filepath << "..." << std::endl;
   // read input file
diff --git a/cpp/examples/parquet_io/parquet_io.cpp b/cpp/examples/parquet_io/parquet_io.cpp
index 274a2599189..442731694fa 100644
--- a/cpp/examples/parquet_io/parquet_io.cpp
+++ b/cpp/examples/parquet_io/parquet_io.cpp
@@ -123,7 +123,7 @@ int main(int argc, char const** argv)
   // Create and use a memory pool
   bool is_pool_used = true;
   auto resource     = create_memory_resource(is_pool_used);
-  rmm::mr::set_current_device_resource(resource.get());
+  cudf::set_current_device_resource(resource.get());
 
   // Read input parquet file
   // We do not want to time the initial read time as it may include
diff --git a/cpp/examples/strings/common.hpp b/cpp/examples/strings/common.hpp
index 65a9c100c7c..1855374803a 100644
--- a/cpp/examples/strings/common.hpp
+++ b/cpp/examples/strings/common.hpp
@@ -93,7 +93,7 @@ int main(int argc, char const** argv)
 
   auto const mr_name = std::string{argc > 2 ? std::string(argv[2]) : std::string("cuda")};
   auto resource      = create_memory_resource(mr_name);
-  rmm::mr::set_current_device_resource(resource.get());
+  cudf::set_current_device_resource(resource.get());
 
   auto const csv_file   = std::string{argv[1]};
   auto const csv_result = [csv_file] {
diff --git a/cpp/examples/tpch/q1.cpp b/cpp/examples/tpch/q1.cpp
index fe03320b888..87b7e613766 100644
--- a/cpp/examples/tpch/q1.cpp
+++ b/cpp/examples/tpch/q1.cpp
@@ -20,6 +20,7 @@
 #include <cudf/ast/expressions.hpp>
 #include <cudf/column/column.hpp>
 #include <cudf/scalar/scalar.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 /**
  * @file q1.cpp
@@ -62,7 +63,7 @@
   cudf::column_view const& discount,
   cudf::column_view const& extendedprice,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref())
 {
   auto const one = cudf::numeric_scalar<double>(1);
   auto const one_minus_discount =
@@ -89,7 +90,7 @@
   cudf::column_view const& tax,
   cudf::column_view const& disc_price,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref())
 {
   auto const one = cudf::numeric_scalar<double>(1);
   auto const one_plus_tax =
@@ -106,7 +107,7 @@ int main(int argc, char const** argv)
 
   // Use a memory pool
   auto resource = create_memory_resource(args.memory_resource_type);
-  rmm::mr::set_current_device_resource(resource.get());
+  cudf::set_current_device_resource(resource.get());
 
   cudf::examples::timer timer;
 
diff --git a/cpp/examples/tpch/q10.cpp b/cpp/examples/tpch/q10.cpp
index 94da46f6930..fdf147b50e0 100644
--- a/cpp/examples/tpch/q10.cpp
+++ b/cpp/examples/tpch/q10.cpp
@@ -20,6 +20,7 @@
 #include <cudf/ast/expressions.hpp>
 #include <cudf/column/column.hpp>
 #include <cudf/scalar/scalar.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 /**
  * @file q10.cpp
@@ -75,7 +76,7 @@
   cudf::column_view const& extendedprice,
   cudf::column_view const& discount,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref())
 {
   auto const one = cudf::numeric_scalar<double>(1);
   auto const one_minus_discount =
@@ -95,7 +96,7 @@ int main(int argc, char const** argv)
 
   // Use a memory pool
   auto resource = create_memory_resource(args.memory_resource_type);
-  rmm::mr::set_current_device_resource(resource.get());
+  cudf::set_current_device_resource(resource.get());
 
   cudf::examples::timer timer;
 
diff --git a/cpp/examples/tpch/q5.cpp b/cpp/examples/tpch/q5.cpp
index 89396a6c968..12c186db10e 100644
--- a/cpp/examples/tpch/q5.cpp
+++ b/cpp/examples/tpch/q5.cpp
@@ -20,6 +20,7 @@
 #include <cudf/ast/expressions.hpp>
 #include <cudf/column/column.hpp>
 #include <cudf/scalar/scalar.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 /**
  * @file q5.cpp
@@ -70,7 +71,7 @@
   cudf::column_view const& extendedprice,
   cudf::column_view const& discount,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref())
 {
   auto const one = cudf::numeric_scalar<double>(1);
   auto const one_minus_discount =
@@ -91,7 +92,7 @@ int main(int argc, char const** argv)
 
   // Use a memory pool
   auto resource = create_memory_resource(args.memory_resource_type);
-  rmm::mr::set_current_device_resource(resource.get());
+  cudf::set_current_device_resource(resource.get());
 
   cudf::examples::timer timer;
 
diff --git a/cpp/examples/tpch/q6.cpp b/cpp/examples/tpch/q6.cpp
index 405b2ac73ca..92dac40c768 100644
--- a/cpp/examples/tpch/q6.cpp
+++ b/cpp/examples/tpch/q6.cpp
@@ -20,6 +20,7 @@
 #include <cudf/ast/expressions.hpp>
 #include <cudf/column/column.hpp>
 #include <cudf/scalar/scalar.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 /**
  * @file q6.cpp
@@ -51,7 +52,7 @@
   cudf::column_view const& extendedprice,
   cudf::column_view const& discount,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref())
 {
   auto const revenue_type = cudf::data_type{cudf::type_id::FLOAT64};
   auto revenue            = cudf::binary_operation(
@@ -65,7 +66,7 @@ int main(int argc, char const** argv)
 
   // Use a memory pool
   auto resource = create_memory_resource(args.memory_resource_type);
-  rmm::mr::set_current_device_resource(resource.get());
+  cudf::set_current_device_resource(resource.get());
 
   cudf::examples::timer timer;
 
diff --git a/cpp/examples/tpch/q9.cpp b/cpp/examples/tpch/q9.cpp
index d3c218253f9..2882182aa2b 100644
--- a/cpp/examples/tpch/q9.cpp
+++ b/cpp/examples/tpch/q9.cpp
@@ -22,6 +22,7 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/contains.hpp>
 #include <cudf/strings/strings_column_view.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 /**
  * @file q9.cpp
@@ -84,7 +85,7 @@
   cudf::column_view const& supplycost,
   cudf::column_view const& quantity,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref())
 {
   auto const one = cudf::numeric_scalar<double>(1);
   auto const one_minus_discount =
@@ -114,7 +115,7 @@ int main(int argc, char const** argv)
 
   // Use a memory pool
   auto resource = create_memory_resource(args.memory_resource_type);
-  rmm::mr::set_current_device_resource(resource.get());
+  cudf::set_current_device_resource(resource.get());
 
   cudf::examples::timer timer;
 
diff --git a/cpp/examples/tpch/utils.hpp b/cpp/examples/tpch/utils.hpp
index e586da2c802..8102fa8f976 100644
--- a/cpp/examples/tpch/utils.hpp
+++ b/cpp/examples/tpch/utils.hpp
@@ -27,6 +27,7 @@
 #include <cudf/table/table.hpp>
 #include <cudf/transform.hpp>
 #include <cudf/unary.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_device.hpp>
 #include <rmm/mr/device/cuda_memory_resource.hpp>
@@ -189,7 +190,7 @@ std::vector<T> concat(std::vector<T> const& lhs, std::vector<T> const& rhs)
   auto const left_selected                           = left_input.select(left_on);
   auto const right_selected                          = right_input.select(right_on);
   auto const [left_join_indices, right_join_indices] = cudf::inner_join(
-    left_selected, right_selected, compare_nulls, rmm::mr::get_current_device_resource());
+    left_selected, right_selected, compare_nulls, cudf::get_current_device_resource_ref());
 
   auto const left_indices_span  = cudf::device_span<cudf::size_type const>{*left_join_indices};
   auto const right_indices_span = cudf::device_span<cudf::size_type const>{*right_join_indices};
diff --git a/cpp/include/cudf/ast/detail/expression_parser.hpp b/cpp/include/cudf/ast/detail/expression_parser.hpp
index da552d95421..a254171ef11 100644
--- a/cpp/include/cudf/ast/detail/expression_parser.hpp
+++ b/cpp/include/cudf/ast/detail/expression_parser.hpp
@@ -20,8 +20,7 @@
 #include <cudf/scalar/scalar_device_view.cuh>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
-
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <thrust/scan.h>
 
diff --git a/cpp/include/cudf/binaryop.hpp b/cpp/include/cudf/binaryop.hpp
index 51199bb5792..63908f6c870 100644
--- a/cpp/include/cudf/binaryop.hpp
+++ b/cpp/include/cudf/binaryop.hpp
@@ -19,9 +19,7 @@
 #include <cudf/column/column.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/utilities/export.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <memory>
 
@@ -171,7 +169,7 @@ std::unique_ptr<column> binary_operation(
   binary_operator op,
   data_type output_type,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Performs a binary operation between a column and a scalar.
@@ -202,7 +200,7 @@ std::unique_ptr<column> binary_operation(
   binary_operator op,
   data_type output_type,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Performs a binary operation between two columns.
@@ -232,7 +230,7 @@ std::unique_ptr<column> binary_operation(
   binary_operator op,
   data_type output_type,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Performs a binary operation between two columns using a
@@ -263,7 +261,7 @@ std::unique_ptr<column> binary_operation(
   std::string const& ptx,
   data_type output_type,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Computes the `scale` for a `fixed_point` number based on given binary operator `op`
@@ -315,7 +313,7 @@ std::pair<rmm::device_buffer, size_type> scalar_col_valid_mask_and(
   column_view const& col,
   scalar const& s,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 }  // namespace binops
 
diff --git a/cpp/include/cudf/column/column.hpp b/cpp/include/cudf/column/column.hpp
index 5d1d74c3f28..de19a076cc4 100644
--- a/cpp/include/cudf/column/column.hpp
+++ b/cpp/include/cudf/column/column.hpp
@@ -19,12 +19,11 @@
 #include <cudf/null_mask.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
 #include <rmm/device_uvector.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <memory>
 #include <type_traits>
@@ -65,7 +64,7 @@ class column {
    */
   column(column const& other,
          rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-         rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+         rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
   /**
    * @brief Move the contents from `other` to create a new column.
@@ -143,7 +142,7 @@ class column {
    */
   explicit column(column_view view,
                   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-                  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+                  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
   /**
    * @brief Returns the column's logical element type
diff --git a/cpp/include/cudf/column/column_factories.hpp b/cpp/include/cudf/column/column_factories.hpp
index b2dcb25acb5..c3b68b52c36 100644
--- a/cpp/include/cudf/column/column_factories.hpp
+++ b/cpp/include/cudf/column/column_factories.hpp
@@ -18,12 +18,11 @@
 #include <cudf/column/column.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 #include <cudf/utilities/traits.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/pair.h>
 
@@ -78,7 +77,7 @@ std::unique_ptr<column> make_numeric_column(
   size_type size,
   mask_state state                  = mask_state::UNALLOCATED,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Construct column with sufficient uninitialized storage to hold `size` elements of the
@@ -104,7 +103,7 @@ std::unique_ptr<column> make_numeric_column(
   B&& null_mask,
   size_type null_count,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref())
 {
   CUDF_EXPECTS(is_numeric(type), "Invalid, non-numeric type.");
   return std::make_unique<column>(type,
@@ -136,7 +135,7 @@ std::unique_ptr<column> make_fixed_point_column(
   size_type size,
   mask_state state                  = mask_state::UNALLOCATED,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Construct column with sufficient uninitialized storage to hold `size` elements of the
@@ -161,7 +160,7 @@ std::unique_ptr<column> make_fixed_point_column(
   B&& null_mask,
   size_type null_count,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref())
 {
   CUDF_EXPECTS(is_fixed_point(type), "Invalid, non-fixed_point type.");
   return std::make_unique<column>(type,
@@ -194,7 +193,7 @@ std::unique_ptr<column> make_timestamp_column(
   size_type size,
   mask_state state                  = mask_state::UNALLOCATED,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Construct column with sufficient uninitialized storage to hold `size` elements of the
@@ -220,7 +219,7 @@ std::unique_ptr<column> make_timestamp_column(
   B&& null_mask,
   size_type null_count,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref())
 {
   CUDF_EXPECTS(is_timestamp(type), "Invalid, non-timestamp type.");
   return std::make_unique<column>(type,
@@ -253,7 +252,7 @@ std::unique_ptr<column> make_duration_column(
   size_type size,
   mask_state state                  = mask_state::UNALLOCATED,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Construct column with sufficient uninitialized storage to hold `size` elements of the
@@ -279,7 +278,7 @@ std::unique_ptr<column> make_duration_column(
   B&& null_mask,
   size_type null_count,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref())
 {
   CUDF_EXPECTS(is_duration(type), "Invalid, non-duration type.");
   return std::make_unique<column>(type,
@@ -312,7 +311,7 @@ std::unique_ptr<column> make_fixed_width_column(
   size_type size,
   mask_state state                  = mask_state::UNALLOCATED,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Construct column with sufficient uninitialized storage to hold `size` elements of the
@@ -338,7 +337,7 @@ std::unique_ptr<column> make_fixed_width_column(
   B&& null_mask,
   size_type null_count,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref())
 {
   CUDF_EXPECTS(is_fixed_width(type), "Invalid, non-fixed-width type.");
   if (is_timestamp(type)) {
@@ -377,7 +376,7 @@ std::unique_ptr<column> make_fixed_width_column(
 std::unique_ptr<column> make_strings_column(
   cudf::device_span<thrust::pair<char const*, size_type> const> strings,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Construct a STRING type column given a device span of string_view.
@@ -409,7 +408,7 @@ std::unique_ptr<column> make_strings_column(
   cudf::device_span<string_view const> string_views,
   string_view const null_placeholder,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Construct a STRING type column given offsets column, chars columns, and null mask and null
@@ -497,7 +496,7 @@ std::unique_ptr<cudf::column> make_lists_column(
   size_type null_count,
   rmm::device_buffer&& null_mask,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Construct a STRUCT column using specified child columns as members.
@@ -528,7 +527,7 @@ std::unique_ptr<cudf::column> make_structs_column(
   size_type null_count,
   rmm::device_buffer&& null_mask,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Construct a column with size elements that are all equal to the given scalar.
@@ -548,7 +547,7 @@ std::unique_ptr<column> make_column_from_scalar(
   scalar const& s,
   size_type size,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Construct a dictionary column with size elements that are all equal to the given scalar.
@@ -568,7 +567,7 @@ std::unique_ptr<column> make_dictionary_from_scalar(
   scalar const& s,
   size_type size,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of group
 }  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/concatenate.hpp b/cpp/include/cudf/concatenate.hpp
index 0935bdf7def..155740dc29e 100644
--- a/cpp/include/cudf/concatenate.hpp
+++ b/cpp/include/cudf/concatenate.hpp
@@ -19,11 +19,9 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
-
 #include <memory>
 
 namespace CUDF_EXPORT cudf {
@@ -49,7 +47,7 @@ namespace CUDF_EXPORT cudf {
 rmm::device_buffer concatenate_masks(
   host_span<column_view const> views,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Concatenates multiple columns into a single column
@@ -66,7 +64,7 @@ rmm::device_buffer concatenate_masks(
 std::unique_ptr<column> concatenate(
   host_span<column_view const> columns_to_concat,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Columns of `tables_to_concat` are concatenated vertically to return a
@@ -95,7 +93,7 @@ std::unique_ptr<column> concatenate(
 std::unique_ptr<table> concatenate(
   host_span<table_view const> tables_to_concat,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of group
 }  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/contiguous_split.hpp b/cpp/include/cudf/contiguous_split.hpp
index 195dac25268..41eef9559b8 100644
--- a/cpp/include/cudf/contiguous_split.hpp
+++ b/cpp/include/cudf/contiguous_split.hpp
@@ -19,8 +19,7 @@
 #include <cudf/table/table.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/export.hpp>
-
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <memory>
 #include <vector>
@@ -122,7 +121,7 @@ struct packed_table {
 std::vector<packed_table> contiguous_split(
   cudf::table_view const& input,
   std::vector<size_type> const& splits,
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 namespace detail {
 
@@ -154,7 +153,7 @@ struct contiguous_split_state;
  * // Choose a memory resource (optional). This memory resource is used for scratch/thrust temporary
  * // data. In memory constrained cases, this can be used to set aside scratch memory
  * // for `chunked_pack` at the beginning of a program.
- * auto mr = rmm::mr::get_current_device_resource();
+ * auto mr = cudf::get_current_device_resource_ref();
  *
  * // Define a buffer size for each chunk: the larger the buffer is, the more SMs can be
  * // occupied by this algorithm.
@@ -205,7 +204,7 @@ class chunked_pack {
   explicit chunked_pack(
     cudf::table_view const& input,
     std::size_t user_buffer_size,
-    rmm::device_async_resource_ref temp_mr = rmm::mr::get_current_device_resource());
+    rmm::device_async_resource_ref temp_mr = cudf::get_current_device_resource_ref());
 
   /**
    * @brief Destructor that will be implemented as default. Declared with definition here because
@@ -270,7 +269,7 @@ class chunked_pack {
   [[nodiscard]] static std::unique_ptr<chunked_pack> create(
     cudf::table_view const& input,
     std::size_t user_buffer_size,
-    rmm::device_async_resource_ref temp_mr = rmm::mr::get_current_device_resource());
+    rmm::device_async_resource_ref temp_mr = cudf::get_current_device_resource_ref());
 
  private:
   // internal state of contiguous split
@@ -290,7 +289,7 @@ class chunked_pack {
  *         and device memory respectively
  */
 packed_columns pack(cudf::table_view const& input,
-                    rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+                    rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Produce the metadata used for packing a table stored in a contiguous buffer.
diff --git a/cpp/include/cudf/copying.hpp b/cpp/include/cudf/copying.hpp
index 3c44ff48fdf..388f19abea2 100644
--- a/cpp/include/cudf/copying.hpp
+++ b/cpp/include/cudf/copying.hpp
@@ -24,9 +24,7 @@
 #include <cudf/table/table.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/export.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <memory>
 #include <vector>
@@ -88,7 +86,7 @@ std::unique_ptr<table> gather(
   column_view const& gather_map,
   out_of_bounds_policy bounds_policy = out_of_bounds_policy::DONT_CHECK,
   rmm::cuda_stream_view stream       = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr  = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr  = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Reverses the rows within a table.
@@ -108,7 +106,7 @@ std::unique_ptr<table> gather(
 std::unique_ptr<table> reverse(
   table_view const& source_table,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Reverses the elements of a column
@@ -128,7 +126,7 @@ std::unique_ptr<table> reverse(
 std::unique_ptr<column> reverse(
   column_view const& source_column,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Scatters the rows of the source table into a copy of the target table
@@ -177,7 +175,7 @@ std::unique_ptr<table> scatter(
   column_view const& scatter_map,
   table_view const& target,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Scatters a row of scalar values into a copy of the target table
@@ -220,7 +218,7 @@ std::unique_ptr<table> scatter(
   column_view const& indices,
   table_view const& target,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Indicates when to allocate a mask, based on an existing mask.
@@ -268,7 +266,7 @@ std::unique_ptr<column> allocate_like(
   column_view const& input,
   mask_allocation_policy mask_alloc = mask_allocation_policy::RETAIN,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Creates an uninitialized new column of the specified size and same type as the `input`.
@@ -291,7 +289,7 @@ std::unique_ptr<column> allocate_like(
   size_type size,
   mask_allocation_policy mask_alloc = mask_allocation_policy::RETAIN,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Creates a table of empty columns with the same types as the `input_table`
@@ -383,7 +381,7 @@ std::unique_ptr<column> copy_range(
   size_type source_end,
   size_type target_begin,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Creates a new column by shifting all values by an offset.
@@ -427,7 +425,7 @@ std::unique_ptr<column> shift(
   size_type offset,
   scalar const& fill_value,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Slices a `column_view` into a set of `column_view`s according to a set of indices.
@@ -630,7 +628,7 @@ std::unique_ptr<column> copy_if_else(
   column_view const& rhs,
   column_view const& boolean_mask,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief   Returns a new column, where each element is selected from either @p lhs or
@@ -656,7 +654,7 @@ std::unique_ptr<column> copy_if_else(
   column_view const& rhs,
   column_view const& boolean_mask,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief   Returns a new column, where each element is selected from either @p lhs or
@@ -682,7 +680,7 @@ std::unique_ptr<column> copy_if_else(
   scalar const& rhs,
   column_view const& boolean_mask,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief   Returns a new column, where each element is selected from either @p lhs or
@@ -706,7 +704,7 @@ std::unique_ptr<column> copy_if_else(
   scalar const& rhs,
   column_view const& boolean_mask,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Scatters rows from the input table to rows of the output corresponding
@@ -750,7 +748,7 @@ std::unique_ptr<table> boolean_mask_scatter(
   table_view const& target,
   column_view const& boolean_mask,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Scatters scalar values to rows of the output corresponding
@@ -789,7 +787,7 @@ std::unique_ptr<table> boolean_mask_scatter(
   table_view const& target,
   column_view const& boolean_mask,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Get the element at specified index from a column
@@ -809,7 +807,7 @@ std::unique_ptr<scalar> get_element(
   column_view const& input,
   size_type index,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Indicates whether a row can be sampled more than once.
@@ -853,7 +851,7 @@ std::unique_ptr<table> sample(
   sample_with_replacement replacement = sample_with_replacement::FALSE,
   int64_t const seed                  = 0,
   rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr   = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr   = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Checks if a column or its descendants have non-empty null rows
@@ -970,7 +968,7 @@ bool may_have_nonempty_nulls(column_view const& input);
 std::unique_ptr<column> purge_nonempty_nulls(
   column_view const& input,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */
 }  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/datetime.hpp b/cpp/include/cudf/datetime.hpp
index f7bed8bdc7e..c7523c80b2b 100644
--- a/cpp/include/cudf/datetime.hpp
+++ b/cpp/include/cudf/datetime.hpp
@@ -18,9 +18,7 @@
 
 #include <cudf/types.hpp>
 #include <cudf/utilities/export.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <memory>
 
@@ -49,7 +47,7 @@ namespace datetime {
  */
 std::unique_ptr<cudf::column> extract_year(
   cudf::column_view const& column,
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief  Extracts month from any datetime type and returns an int16_t
@@ -63,7 +61,7 @@ std::unique_ptr<cudf::column> extract_year(
  */
 std::unique_ptr<cudf::column> extract_month(
   cudf::column_view const& column,
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief  Extracts day from any datetime type and returns an int16_t
@@ -77,7 +75,7 @@ std::unique_ptr<cudf::column> extract_month(
  */
 std::unique_ptr<cudf::column> extract_day(
   cudf::column_view const& column,
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief  Extracts a weekday from any datetime type and returns an int16_t
@@ -91,7 +89,7 @@ std::unique_ptr<cudf::column> extract_day(
  */
 std::unique_ptr<cudf::column> extract_weekday(
   cudf::column_view const& column,
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief  Extracts hour from any datetime type and returns an int16_t
@@ -105,7 +103,7 @@ std::unique_ptr<cudf::column> extract_weekday(
  */
 std::unique_ptr<cudf::column> extract_hour(
   cudf::column_view const& column,
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief  Extracts minute from any datetime type and returns an int16_t
@@ -119,7 +117,7 @@ std::unique_ptr<cudf::column> extract_hour(
  */
 std::unique_ptr<cudf::column> extract_minute(
   cudf::column_view const& column,
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief  Extracts second from any datetime type and returns an int16_t
@@ -133,7 +131,7 @@ std::unique_ptr<cudf::column> extract_minute(
  */
 std::unique_ptr<cudf::column> extract_second(
   cudf::column_view const& column,
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief  Extracts millisecond fraction from any datetime type and returns an int16_t
@@ -150,7 +148,7 @@ std::unique_ptr<cudf::column> extract_second(
  */
 std::unique_ptr<cudf::column> extract_millisecond_fraction(
   cudf::column_view const& column,
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief  Extracts microsecond fraction from any datetime type and returns an int16_t
@@ -167,7 +165,7 @@ std::unique_ptr<cudf::column> extract_millisecond_fraction(
  */
 std::unique_ptr<cudf::column> extract_microsecond_fraction(
   cudf::column_view const& column,
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief  Extracts nanosecond fraction from any datetime type and returns an int16_t
@@ -184,7 +182,7 @@ std::unique_ptr<cudf::column> extract_microsecond_fraction(
  */
 std::unique_ptr<cudf::column> extract_nanosecond_fraction(
   cudf::column_view const& column,
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of group
 /**
@@ -205,7 +203,7 @@ std::unique_ptr<cudf::column> extract_nanosecond_fraction(
  */
 std::unique_ptr<cudf::column> last_day_of_month(
   cudf::column_view const& column,
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief  Computes the day number since the start of the year from the datetime and
@@ -219,7 +217,7 @@ std::unique_ptr<cudf::column> last_day_of_month(
  */
 std::unique_ptr<cudf::column> day_of_year(
   cudf::column_view const& column,
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief  Adds or subtracts a number of months from the datetime type and returns a
@@ -254,7 +252,7 @@ std::unique_ptr<cudf::column> day_of_year(
 std::unique_ptr<cudf::column> add_calendrical_months(
   cudf::column_view const& timestamps,
   cudf::column_view const& months,
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief  Adds or subtracts a number of months from the datetime type and returns a
@@ -289,7 +287,7 @@ std::unique_ptr<cudf::column> add_calendrical_months(
 std::unique_ptr<cudf::column> add_calendrical_months(
   cudf::column_view const& timestamps,
   cudf::scalar const& months,
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief  Check if the year of the given date is a leap year
@@ -306,7 +304,7 @@ std::unique_ptr<cudf::column> add_calendrical_months(
  */
 std::unique_ptr<cudf::column> is_leap_year(
   cudf::column_view const& column,
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Extract the number of days in the month
@@ -322,7 +320,7 @@ std::unique_ptr<cudf::column> is_leap_year(
  */
 std::unique_ptr<cudf::column> days_in_month(
   cudf::column_view const& column,
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief  Returns the quarter of the date
@@ -338,7 +336,7 @@ std::unique_ptr<cudf::column> days_in_month(
  */
 std::unique_ptr<cudf::column> extract_quarter(
   cudf::column_view const& column,
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Fixed frequencies supported by datetime rounding functions ceil, floor, round.
@@ -367,7 +365,7 @@ enum class rounding_frequency : int32_t {
 std::unique_ptr<cudf::column> ceil_datetimes(
   cudf::column_view const& column,
   rounding_frequency freq,
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Round datetimes down to the nearest multiple of the given frequency.
@@ -382,7 +380,7 @@ std::unique_ptr<cudf::column> ceil_datetimes(
 std::unique_ptr<cudf::column> floor_datetimes(
   cudf::column_view const& column,
   rounding_frequency freq,
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Round datetimes to the nearest multiple of the given frequency.
@@ -397,7 +395,7 @@ std::unique_ptr<cudf::column> floor_datetimes(
 std::unique_ptr<cudf::column> round_datetimes(
   cudf::column_view const& column,
   rounding_frequency freq,
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of group
 
diff --git a/cpp/include/cudf/detail/binaryop.hpp b/cpp/include/cudf/detail/binaryop.hpp
index fe739327a08..91f774839d9 100644
--- a/cpp/include/cudf/detail/binaryop.hpp
+++ b/cpp/include/cudf/detail/binaryop.hpp
@@ -18,9 +18,9 @@
 #include <cudf/binaryop.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace CUDF_EXPORT cudf {
 //! Inner interfaces and implementations
diff --git a/cpp/include/cudf/detail/calendrical_month_sequence.cuh b/cpp/include/cudf/detail/calendrical_month_sequence.cuh
index a9cf54e29b8..2097411357d 100644
--- a/cpp/include/cudf/detail/calendrical_month_sequence.cuh
+++ b/cpp/include/cudf/detail/calendrical_month_sequence.cuh
@@ -21,11 +21,11 @@
 #include <cudf/detail/datetime_ops.cuh>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/scalar/scalar_device_view.cuh>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/transform.h>
diff --git a/cpp/include/cudf/detail/concatenate.hpp b/cpp/include/cudf/detail/concatenate.hpp
index 1be269710b2..51166f6054b 100644
--- a/cpp/include/cudf/detail/concatenate.hpp
+++ b/cpp/include/cudf/detail/concatenate.hpp
@@ -20,10 +20,10 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <vector>
 
diff --git a/cpp/include/cudf/detail/concatenate_masks.hpp b/cpp/include/cudf/detail/concatenate_masks.hpp
index fc829361fde..4f9e7f9cd13 100644
--- a/cpp/include/cudf/detail/concatenate_masks.hpp
+++ b/cpp/include/cudf/detail/concatenate_masks.hpp
@@ -18,12 +18,12 @@
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_view.hpp>
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace CUDF_EXPORT cudf {
 //! Inner interfaces and implementations
diff --git a/cpp/include/cudf/detail/contiguous_split.hpp b/cpp/include/cudf/detail/contiguous_split.hpp
index 52c51daa917..52ca091e1cd 100644
--- a/cpp/include/cudf/detail/contiguous_split.hpp
+++ b/cpp/include/cudf/detail/contiguous_split.hpp
@@ -19,9 +19,9 @@
 #include <cudf/contiguous_split.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace detail {
diff --git a/cpp/include/cudf/detail/copy.hpp b/cpp/include/cudf/detail/copy.hpp
index 2be432c0825..60aa500f129 100644
--- a/cpp/include/cudf/detail/copy.hpp
+++ b/cpp/include/cudf/detail/copy.hpp
@@ -20,11 +20,11 @@
 #include <cudf/copying.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 #include <cudf/utilities/traits.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <initializer_list>
 
diff --git a/cpp/include/cudf/detail/copy_if.cuh b/cpp/include/cudf/detail/copy_if.cuh
index 4071fa01fb2..dfb646c66c4 100644
--- a/cpp/include/cudf/detail/copy_if.cuh
+++ b/cpp/include/cudf/detail/copy_if.cuh
@@ -30,6 +30,7 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
@@ -38,7 +39,6 @@
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cub/cub.cuh>
 #include <cuda/atomic>
diff --git a/cpp/include/cudf/detail/copy_if_else.cuh b/cpp/include/cudf/detail/copy_if_else.cuh
index d260a4591b7..a70cd5a0661 100644
--- a/cpp/include/cudf/detail/copy_if_else.cuh
+++ b/cpp/include/cudf/detail/copy_if_else.cuh
@@ -21,9 +21,9 @@
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/detail/utilities/integer_utils.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/device_scalar.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/std/optional>
 #include <thrust/iterator/iterator_traits.h>
diff --git a/cpp/include/cudf/detail/copy_range.cuh b/cpp/include/cudf/detail/copy_range.cuh
index 1b3b2056c6c..3aa136d630b 100644
--- a/cpp/include/cudf/detail/copy_range.cuh
+++ b/cpp/include/cudf/detail/copy_range.cuh
@@ -23,11 +23,11 @@
 #include <cudf/utilities/bit.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_scalar.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cub/cub.cuh>
 #include <cuda_runtime.h>
diff --git a/cpp/include/cudf/detail/datetime.hpp b/cpp/include/cudf/detail/datetime.hpp
index 95469de8ae6..31782cbaf8a 100644
--- a/cpp/include/cudf/detail/datetime.hpp
+++ b/cpp/include/cudf/detail/datetime.hpp
@@ -18,8 +18,7 @@
 
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
-
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <memory>
 
diff --git a/cpp/include/cudf/detail/distinct_hash_join.cuh b/cpp/include/cudf/detail/distinct_hash_join.cuh
index 0b3d7ac58bf..2acc10105cf 100644
--- a/cpp/include/cudf/detail/distinct_hash_join.cuh
+++ b/cpp/include/cudf/detail/distinct_hash_join.cuh
@@ -18,10 +18,10 @@
 #include <cudf/hashing/detail/helper_functions.cuh>
 #include <cudf/table/experimental/row_operators.cuh>
 #include <cudf/types.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuco/static_set.cuh>
 
diff --git a/cpp/include/cudf/detail/fill.hpp b/cpp/include/cudf/detail/fill.hpp
index 82c6af8b611..04b3b63a9ed 100644
--- a/cpp/include/cudf/detail/fill.hpp
+++ b/cpp/include/cudf/detail/fill.hpp
@@ -19,9 +19,9 @@
 #include <cudf/filling.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <memory>
 
diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh
index df6fe6e6ccb..d91c3df719a 100644
--- a/cpp/include/cudf/detail/gather.cuh
+++ b/cpp/include/cudf/detail/gather.cuh
@@ -33,12 +33,12 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/bit.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/functional.h>
 #include <thrust/gather.h>
@@ -582,11 +582,11 @@ void gather_bitmask(table_view const& source,
     return col->mutable_view().null_mask();
   });
   auto d_target_masks =
-    make_device_uvector_async(target_masks, stream, rmm::mr::get_current_device_resource());
+    make_device_uvector_async(target_masks, stream, cudf::get_current_device_resource_ref());
 
   auto const device_source = table_device_view::create(source, stream);
   auto d_valid_counts      = make_zeroed_device_uvector_async<size_type>(
-    target.size(), stream, rmm::mr::get_current_device_resource());
+    target.size(), stream, cudf::get_current_device_resource_ref());
 
   // Dispatch operation enum to get implementation
   auto const impl = [op]() {
diff --git a/cpp/include/cudf/detail/gather.hpp b/cpp/include/cudf/detail/gather.hpp
index 39cd43934e3..48fb60aa5dd 100644
--- a/cpp/include/cudf/detail/gather.hpp
+++ b/cpp/include/cudf/detail/gather.hpp
@@ -21,10 +21,10 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <memory>
 
diff --git a/cpp/include/cudf/detail/groupby.hpp b/cpp/include/cudf/detail/groupby.hpp
index 36eae05ce39..3e9511de5e4 100644
--- a/cpp/include/cudf/detail/groupby.hpp
+++ b/cpp/include/cudf/detail/groupby.hpp
@@ -17,10 +17,10 @@
 
 #include <cudf/groupby.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <memory>
 #include <utility>
diff --git a/cpp/include/cudf/detail/groupby/group_replace_nulls.hpp b/cpp/include/cudf/detail/groupby/group_replace_nulls.hpp
index c0910b4d5ae..e3a6f7db2b5 100644
--- a/cpp/include/cudf/detail/groupby/group_replace_nulls.hpp
+++ b/cpp/include/cudf/detail/groupby/group_replace_nulls.hpp
@@ -20,10 +20,10 @@
 #include <cudf/replace.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 namespace CUDF_EXPORT cudf {
 namespace groupby {
 namespace detail {
diff --git a/cpp/include/cudf/detail/groupby/sort_helper.hpp b/cpp/include/cudf/detail/groupby/sort_helper.hpp
index a411a890622..ce8783d8b79 100644
--- a/cpp/include/cudf/detail/groupby/sort_helper.hpp
+++ b/cpp/include/cudf/detail/groupby/sort_helper.hpp
@@ -20,10 +20,10 @@
 #include <cudf/column/column_view.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace groupby::detail::sort {
diff --git a/cpp/include/cudf/detail/hash_reduce_by_row.cuh b/cpp/include/cudf/detail/hash_reduce_by_row.cuh
index 7a1e38eefe0..7de79b31bc7 100644
--- a/cpp/include/cudf/detail/hash_reduce_by_row.cuh
+++ b/cpp/include/cudf/detail/hash_reduce_by_row.cuh
@@ -18,11 +18,11 @@
 #include <cudf/hashing/detail/helper_functions.cuh>
 #include <cudf/table/experimental/row_operators.cuh>
 #include <cudf/types.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuco/static_map.cuh>
 #include <thrust/for_each.h>
diff --git a/cpp/include/cudf/detail/interop.hpp b/cpp/include/cudf/detail/interop.hpp
index 0d8f078c9d1..938d0e95097 100644
--- a/cpp/include/cudf/detail/interop.hpp
+++ b/cpp/include/cudf/detail/interop.hpp
@@ -20,9 +20,9 @@
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace detail {
diff --git a/cpp/include/cudf/detail/join.hpp b/cpp/include/cudf/detail/join.hpp
index af46dd79cdb..b084a94cbc8 100644
--- a/cpp/include/cudf/detail/join.hpp
+++ b/cpp/include/cudf/detail/join.hpp
@@ -20,11 +20,11 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
 #include <rmm/device_uvector.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuco/static_multimap.cuh>
 
diff --git a/cpp/include/cudf/detail/label_bins.hpp b/cpp/include/cudf/detail/label_bins.hpp
index 92a417b0132..44fcba0d2d6 100644
--- a/cpp/include/cudf/detail/label_bins.hpp
+++ b/cpp/include/cudf/detail/label_bins.hpp
@@ -21,11 +21,10 @@
 #include <cudf/labeling/label_bins.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace CUDF_EXPORT cudf {
 
diff --git a/cpp/include/cudf/detail/merge.hpp b/cpp/include/cudf/detail/merge.hpp
index 72e34b76158..43a0387ab99 100644
--- a/cpp/include/cudf/detail/merge.hpp
+++ b/cpp/include/cudf/detail/merge.hpp
@@ -17,9 +17,9 @@
 #pragma once
 
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/device_uvector.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/pair.h>
 
diff --git a/cpp/include/cudf/detail/null_mask.cuh b/cpp/include/cudf/detail/null_mask.cuh
index ae6db5409cc..327c732716c 100644
--- a/cpp/include/cudf/detail/null_mask.cuh
+++ b/cpp/include/cudf/detail/null_mask.cuh
@@ -21,12 +21,12 @@
 #include <cudf/detail/valid_if.cuh>
 #include <cudf/null_mask.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cub/block/block_reduce.cuh>
 #include <cub/device/device_segmented_reduce.cuh>
@@ -164,7 +164,7 @@ size_type inplace_bitmask_binop(Binop op,
   CUDF_EXPECTS(std::all_of(masks.begin(), masks.end(), [](auto p) { return p != nullptr; }),
                "Mask pointer cannot be null");
 
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource();
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref();
   rmm::device_scalar<size_type> d_counter{0, stream, mr};
   rmm::device_uvector<bitmask_type const*> d_masks(masks.size(), stream, mr);
   rmm::device_uvector<size_type> d_begin_bits(masks_begin_bits.size(), stream, mr);
@@ -434,7 +434,7 @@ std::vector<size_type> segmented_count_bits(bitmask_type const* bitmask,
     std::distance(indices_begin, indices_end), stream);
   std::copy(indices_begin, indices_end, std::back_inserter(h_indices));
   auto const d_indices =
-    make_device_uvector_async(h_indices, stream, rmm::mr::get_current_device_resource());
+    make_device_uvector_async(h_indices, stream, cudf::get_current_device_resource_ref());
 
   // Compute the bit counts over each segment.
   auto first_bit_indices_begin = thrust::make_transform_iterator(
@@ -449,7 +449,7 @@ std::vector<size_type> segmented_count_bits(bitmask_type const* bitmask,
                                        last_bit_indices_begin,
                                        count_bits,
                                        stream,
-                                       rmm::mr::get_current_device_resource());
+                                       cudf::get_current_device_resource_ref());
 
   // Copy the results back to the host.
   return make_std_vector_sync(d_bit_counts, stream);
@@ -576,7 +576,7 @@ std::pair<rmm::device_buffer, size_type> segmented_null_mask_reduction(
                                        last_bit_indices_begin,
                                        cudf::detail::count_bits_policy::SET_BITS,
                                        stream,
-                                       rmm::mr::get_current_device_resource());
+                                       cudf::get_current_device_resource_ref());
   auto const length_and_valid_count =
     thrust::make_zip_iterator(segment_length_iterator, segment_valid_counts.begin());
   return cudf::detail::valid_if(
diff --git a/cpp/include/cudf/detail/null_mask.hpp b/cpp/include/cudf/detail/null_mask.hpp
index 67e3617d873..b8c52a4ae2c 100644
--- a/cpp/include/cudf/detail/null_mask.hpp
+++ b/cpp/include/cudf/detail/null_mask.hpp
@@ -18,10 +18,10 @@
 #include <cudf/column/column.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <vector>
 
diff --git a/cpp/include/cudf/detail/quantiles.hpp b/cpp/include/cudf/detail/quantiles.hpp
index 23d5fb73ba3..4f912077e59 100644
--- a/cpp/include/cudf/detail/quantiles.hpp
+++ b/cpp/include/cudf/detail/quantiles.hpp
@@ -19,9 +19,9 @@
 #include <cudf/tdigest/tdigest_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace detail {
diff --git a/cpp/include/cudf/detail/repeat.hpp b/cpp/include/cudf/detail/repeat.hpp
index e17f1b7c5fd..81ac5bf2b14 100644
--- a/cpp/include/cudf/detail/repeat.hpp
+++ b/cpp/include/cudf/detail/repeat.hpp
@@ -18,9 +18,9 @@
 
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <memory>
 
diff --git a/cpp/include/cudf/detail/replace.hpp b/cpp/include/cudf/detail/replace.hpp
index e2bd729861b..3b18b95ce75 100644
--- a/cpp/include/cudf/detail/replace.hpp
+++ b/cpp/include/cudf/detail/replace.hpp
@@ -18,9 +18,9 @@
 #include <cudf/replace.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <memory>
 
diff --git a/cpp/include/cudf/detail/reshape.hpp b/cpp/include/cudf/detail/reshape.hpp
index 68a856373bf..aeeed282d8b 100644
--- a/cpp/include/cudf/detail/reshape.hpp
+++ b/cpp/include/cudf/detail/reshape.hpp
@@ -18,9 +18,9 @@
 
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <memory>
 
diff --git a/cpp/include/cudf/detail/rolling.hpp b/cpp/include/cudf/detail/rolling.hpp
index 5bfa5679531..d8d5506969b 100644
--- a/cpp/include/cudf/detail/rolling.hpp
+++ b/cpp/include/cudf/detail/rolling.hpp
@@ -20,9 +20,9 @@
 #include <cudf/rolling.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <memory>
 
diff --git a/cpp/include/cudf/detail/round.hpp b/cpp/include/cudf/detail/round.hpp
index ba3ef1c1ce7..df1faf05dbd 100644
--- a/cpp/include/cudf/detail/round.hpp
+++ b/cpp/include/cudf/detail/round.hpp
@@ -18,9 +18,9 @@
 
 #include <cudf/round.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace CUDF_EXPORT cudf {
 //! Inner interfaces and implementations
diff --git a/cpp/include/cudf/detail/scan.hpp b/cpp/include/cudf/detail/scan.hpp
index bd60309c5c3..313964a6341 100644
--- a/cpp/include/cudf/detail/scan.hpp
+++ b/cpp/include/cudf/detail/scan.hpp
@@ -18,9 +18,9 @@
 #include <cudf/column/column_view.hpp>
 #include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace detail {
diff --git a/cpp/include/cudf/detail/scatter.cuh b/cpp/include/cudf/detail/scatter.cuh
index 80bc87731ca..fa93ce4e13c 100644
--- a/cpp/include/cudf/detail/scatter.cuh
+++ b/cpp/include/cudf/detail/scatter.cuh
@@ -30,12 +30,12 @@
 #include <cudf/strings/string_view.cuh>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/count.h>
 #include <thrust/distance.h>
@@ -223,7 +223,7 @@ struct column_scatterer_impl<dictionary32> {
     auto target_matched    = dictionary::detail::add_keys(target, source.keys(), stream, mr);
     auto const target_view = dictionary_column_view(target_matched->view());
     auto source_matched    = dictionary::detail::set_keys(
-      source, target_view.keys(), stream, rmm::mr::get_current_device_resource());
+      source, target_view.keys(), stream, cudf::get_current_device_resource_ref());
     auto const source_view = dictionary_column_view(source_matched->view());
 
     // now build the new indices by doing a scatter on just the matched indices
diff --git a/cpp/include/cudf/detail/scatter.hpp b/cpp/include/cudf/detail/scatter.hpp
index 6691ddc5c09..39f973bb611 100644
--- a/cpp/include/cudf/detail/scatter.hpp
+++ b/cpp/include/cudf/detail/scatter.hpp
@@ -20,10 +20,10 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <memory>
 
diff --git a/cpp/include/cudf/detail/search.hpp b/cpp/include/cudf/detail/search.hpp
index 72e2cf074bc..da3b98660dc 100644
--- a/cpp/include/cudf/detail/search.hpp
+++ b/cpp/include/cudf/detail/search.hpp
@@ -20,10 +20,10 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace detail {
diff --git a/cpp/include/cudf/detail/sequence.hpp b/cpp/include/cudf/detail/sequence.hpp
index a08010a610f..41d9fe41080 100644
--- a/cpp/include/cudf/detail/sequence.hpp
+++ b/cpp/include/cudf/detail/sequence.hpp
@@ -19,16 +19,16 @@
 #include <cudf/filling.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace detail {
 /**
  * @copydoc cudf::sequence(size_type size, scalar const& init, scalar const& step,
  *                                       rmm::device_async_resource_ref mr =
- *rmm::mr::get_current_device_resource())
+ *cudf::get_current_device_resource_ref())
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
@@ -41,7 +41,7 @@ std::unique_ptr<column> sequence(size_type size,
 /**
  * @copydoc cudf::sequence(size_type size, scalar const& init,
                                          rmm::device_async_resource_ref mr =
- rmm::mr::get_current_device_resource())
+ cudf::get_current_device_resource_ref())
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
diff --git a/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh b/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh
index 63e4fca8915..88ec0c07dc5 100644
--- a/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh
+++ b/cpp/include/cudf/detail/sizes_to_offsets_iterator.cuh
@@ -19,11 +19,11 @@
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/types.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/distance.h>
diff --git a/cpp/include/cudf/detail/sorting.hpp b/cpp/include/cudf/detail/sorting.hpp
index 08cf329f199..185855e1fc0 100644
--- a/cpp/include/cudf/detail/sorting.hpp
+++ b/cpp/include/cudf/detail/sorting.hpp
@@ -19,9 +19,9 @@
 #include <cudf/sorting.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <memory>
 #include <vector>
diff --git a/cpp/include/cudf/detail/stream_compaction.hpp b/cpp/include/cudf/detail/stream_compaction.hpp
index 85d2ee9790f..8a4366bdd63 100644
--- a/cpp/include/cudf/detail/stream_compaction.hpp
+++ b/cpp/include/cudf/detail/stream_compaction.hpp
@@ -20,10 +20,10 @@
 #include <cudf/stream_compaction.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace detail {
diff --git a/cpp/include/cudf/detail/structs/utilities.hpp b/cpp/include/cudf/detail/structs/utilities.hpp
index 7de68035b19..261c54afd51 100644
--- a/cpp/include/cudf/detail/structs/utilities.hpp
+++ b/cpp/include/cudf/detail/structs/utilities.hpp
@@ -19,11 +19,11 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace structs::detail {
diff --git a/cpp/include/cudf/detail/tdigest/tdigest.hpp b/cpp/include/cudf/detail/tdigest/tdigest.hpp
index 10eb3d389c7..80a4460023f 100644
--- a/cpp/include/cudf/detail/tdigest/tdigest.hpp
+++ b/cpp/include/cudf/detail/tdigest/tdigest.hpp
@@ -19,10 +19,10 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace tdigest::detail {
diff --git a/cpp/include/cudf/detail/timezone.hpp b/cpp/include/cudf/detail/timezone.hpp
index c7798ff60ed..5738f9ec8e9 100644
--- a/cpp/include/cudf/detail/timezone.hpp
+++ b/cpp/include/cudf/detail/timezone.hpp
@@ -17,9 +17,9 @@
 
 #include <cudf/timezone.hpp>
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace detail {
@@ -34,7 +34,7 @@ std::unique_ptr<table> make_timezone_transition_table(
   std::optional<std::string_view> tzif_dir,
   std::string_view timezone_name,
   rmm::cuda_stream_view stream,
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 }  // namespace detail
 }  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/detail/transform.hpp b/cpp/include/cudf/detail/transform.hpp
index 02849ef023c..4cfa95468f2 100644
--- a/cpp/include/cudf/detail/transform.hpp
+++ b/cpp/include/cudf/detail/transform.hpp
@@ -20,9 +20,9 @@
 #include <cudf/transform.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace detail {
diff --git a/cpp/include/cudf/detail/transpose.hpp b/cpp/include/cudf/detail/transpose.hpp
index 559b2c32996..22382fa0713 100644
--- a/cpp/include/cudf/detail/transpose.hpp
+++ b/cpp/include/cudf/detail/transpose.hpp
@@ -19,9 +19,9 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace detail {
diff --git a/cpp/include/cudf/detail/unary.hpp b/cpp/include/cudf/detail/unary.hpp
index bb05138bc8c..18b1e9b2d2e 100644
--- a/cpp/include/cudf/detail/unary.hpp
+++ b/cpp/include/cudf/detail/unary.hpp
@@ -20,10 +20,10 @@
 #include <cudf/unary.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/transform.h>
 
diff --git a/cpp/include/cudf/detail/utilities/host_memory.hpp b/cpp/include/cudf/detail/utilities/host_memory.hpp
index c6775a950c9..c661faf1fbe 100644
--- a/cpp/include/cudf/detail/utilities/host_memory.hpp
+++ b/cpp/include/cudf/detail/utilities/host_memory.hpp
@@ -18,10 +18,9 @@
 
 #include <cudf/detail/utilities/host_vector.hpp>
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/pinned_memory.hpp>
 
-#include <rmm/resource_ref.hpp>
-
 #include <cstddef>
 
 namespace cudf::detail {
diff --git a/cpp/include/cudf/detail/utilities/host_vector.hpp b/cpp/include/cudf/detail/utilities/host_vector.hpp
index d4dd7b0d626..ecb8f910463 100644
--- a/cpp/include/cudf/detail/utilities/host_vector.hpp
+++ b/cpp/include/cudf/detail/utilities/host_vector.hpp
@@ -19,9 +19,9 @@
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/aligned.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/host_vector.h>
 
@@ -33,7 +33,7 @@ namespace CUDF_EXPORT cudf {
 namespace detail {
 
 /*! \p rmm_host_allocator is a CUDA-specific host memory allocator
- *  that employs \c a `rmm::host_async_resource_ref` for allocation.
+ *  that employs \c a `cudf::host_async_resource_ref` for allocation.
  *
  *  \see https://en.cppreference.com/w/cpp/memory/allocator
  */
@@ -68,10 +68,10 @@ inline constexpr bool contains_property =
   (cuda::std::is_same_v<DesiredProperty, Properties> || ... || false);
 
 /*! \p rmm_host_allocator is a CUDA-specific host memory allocator
- *  that employs \c `rmm::host_async_resource_ref` for allocation.
+ *  that employs \c `cudf::host_async_resource_ref` for allocation.
  *
  * The \p rmm_host_allocator provides an interface for host memory allocation through the user
- * provided \c `rmm::host_async_resource_ref`. The \p rmm_host_allocator does not take ownership of
+ * provided \c `cudf::host_async_resource_ref`. The \p rmm_host_allocator does not take ownership of
  * this reference and therefore it is the user's responsibility to ensure its lifetime for the
  * duration of the lifetime of the \p rmm_host_allocator.
  *
diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp
index a9d91cdeee1..953ae5b9308 100644
--- a/cpp/include/cudf/detail/utilities/vector_factories.hpp
+++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp
@@ -27,13 +27,13 @@
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/pinned_memory.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <vector>
 
diff --git a/cpp/include/cudf/detail/valid_if.cuh b/cpp/include/cudf/detail/valid_if.cuh
index 56a2c76b741..cfb2e70bfed 100644
--- a/cpp/include/cudf/detail/valid_if.cuh
+++ b/cpp/include/cudf/detail/valid_if.cuh
@@ -22,10 +22,10 @@
 #include <cudf/utilities/bit.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_scalar.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/distance.h>
 
diff --git a/cpp/include/cudf/dictionary/detail/concatenate.hpp b/cpp/include/cudf/dictionary/detail/concatenate.hpp
index 0eb17aa06f4..12f09616295 100644
--- a/cpp/include/cudf/dictionary/detail/concatenate.hpp
+++ b/cpp/include/cudf/dictionary/detail/concatenate.hpp
@@ -18,10 +18,10 @@
 #include <cudf/column/column.hpp>
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace dictionary::detail {
diff --git a/cpp/include/cudf/dictionary/detail/encode.hpp b/cpp/include/cudf/dictionary/detail/encode.hpp
index cc7ffbd397f..600ba8d6c67 100644
--- a/cpp/include/cudf/dictionary/detail/encode.hpp
+++ b/cpp/include/cudf/dictionary/detail/encode.hpp
@@ -19,9 +19,9 @@
 #include <cudf/column/column_view.hpp>
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace dictionary::detail {
diff --git a/cpp/include/cudf/dictionary/detail/merge.hpp b/cpp/include/cudf/dictionary/detail/merge.hpp
index a1777d412fe..69d0d9fa9b0 100644
--- a/cpp/include/cudf/dictionary/detail/merge.hpp
+++ b/cpp/include/cudf/dictionary/detail/merge.hpp
@@ -18,9 +18,9 @@
 #include <cudf/column/column.hpp>
 #include <cudf/detail/merge.hpp>
 #include <cudf/dictionary/dictionary_column_view.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace dictionary::detail {
diff --git a/cpp/include/cudf/dictionary/detail/replace.hpp b/cpp/include/cudf/dictionary/detail/replace.hpp
index 1e1ee182fc5..c854e794b17 100644
--- a/cpp/include/cudf/dictionary/detail/replace.hpp
+++ b/cpp/include/cudf/dictionary/detail/replace.hpp
@@ -19,9 +19,9 @@
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace dictionary::detail {
diff --git a/cpp/include/cudf/dictionary/detail/search.hpp b/cpp/include/cudf/dictionary/detail/search.hpp
index 921acc258a9..09907c9070d 100644
--- a/cpp/include/cudf/dictionary/detail/search.hpp
+++ b/cpp/include/cudf/dictionary/detail/search.hpp
@@ -19,9 +19,9 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace dictionary {
diff --git a/cpp/include/cudf/dictionary/detail/update_keys.hpp b/cpp/include/cudf/dictionary/detail/update_keys.hpp
index 9eb812eb8ee..0848df64596 100644
--- a/cpp/include/cudf/dictionary/detail/update_keys.hpp
+++ b/cpp/include/cudf/dictionary/detail/update_keys.hpp
@@ -19,10 +19,10 @@
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace dictionary::detail {
diff --git a/cpp/include/cudf/dictionary/dictionary_factories.hpp b/cpp/include/cudf/dictionary/dictionary_factories.hpp
index 2f663c4af61..4a63ee05479 100644
--- a/cpp/include/cudf/dictionary/dictionary_factories.hpp
+++ b/cpp/include/cudf/dictionary/dictionary_factories.hpp
@@ -18,10 +18,9 @@
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace CUDF_EXPORT cudf {
 /**
@@ -67,7 +66,7 @@ std::unique_ptr<column> make_dictionary_column(
   column_view const& keys_column,
   column_view const& indices_column,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Construct a dictionary column by taking ownership of the provided keys
@@ -97,7 +96,7 @@ std::unique_ptr<column> make_dictionary_column(
   rmm::device_buffer&& null_mask,
   size_type null_count,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Construct a dictionary column by taking ownership of the provided keys
@@ -124,7 +123,7 @@ std::unique_ptr<column> make_dictionary_column(
   std::unique_ptr<column> keys_column,
   std::unique_ptr<column> indices_column,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of group
 }  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/dictionary/encode.hpp b/cpp/include/cudf/dictionary/encode.hpp
index 9e68c947793..dc81fd74992 100644
--- a/cpp/include/cudf/dictionary/encode.hpp
+++ b/cpp/include/cudf/dictionary/encode.hpp
@@ -18,9 +18,7 @@
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_view.hpp>
 #include <cudf/dictionary/dictionary_column_view.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace dictionary {
@@ -62,7 +60,7 @@ std::unique_ptr<column> encode(
   column_view const& column,
   data_type indices_type            = data_type{type_id::UINT32},
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Create a column by gathering the keys from the provided
@@ -82,7 +80,7 @@ std::unique_ptr<column> encode(
 std::unique_ptr<column> decode(
   dictionary_column_view const& dictionary_column,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of group
 }  // namespace dictionary
diff --git a/cpp/include/cudf/dictionary/search.hpp b/cpp/include/cudf/dictionary/search.hpp
index 66275de33e9..16d59318dd0 100644
--- a/cpp/include/cudf/dictionary/search.hpp
+++ b/cpp/include/cudf/dictionary/search.hpp
@@ -17,9 +17,7 @@
 
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/scalar/scalar.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace dictionary {
@@ -46,7 +44,7 @@ std::unique_ptr<scalar> get_index(
   dictionary_column_view const& dictionary,
   scalar const& key,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of group
 }  // namespace dictionary
diff --git a/cpp/include/cudf/dictionary/update_keys.hpp b/cpp/include/cudf/dictionary/update_keys.hpp
index c02e91f8d78..85e5af8cf22 100644
--- a/cpp/include/cudf/dictionary/update_keys.hpp
+++ b/cpp/include/cudf/dictionary/update_keys.hpp
@@ -17,11 +17,9 @@
 
 #include <cudf/column/column.hpp>
 #include <cudf/dictionary/dictionary_column_view.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
-
 namespace CUDF_EXPORT cudf {
 namespace dictionary {
 /**
@@ -61,7 +59,7 @@ std::unique_ptr<column> add_keys(
   dictionary_column_view const& dictionary_column,
   column_view const& new_keys,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Create a new dictionary column by removing the specified keys
@@ -93,7 +91,7 @@ std::unique_ptr<column> remove_keys(
   dictionary_column_view const& dictionary_column,
   column_view const& keys_to_remove,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Create a new dictionary column by removing any keys
@@ -115,7 +113,7 @@ std::unique_ptr<column> remove_keys(
 std::unique_ptr<column> remove_unused_keys(
   dictionary_column_view const& dictionary_column,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Create a new dictionary column by applying only the specified keys
@@ -149,7 +147,7 @@ std::unique_ptr<column> set_keys(
   dictionary_column_view const& dictionary_column,
   column_view const& keys,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Create new dictionaries that have keys merged from the input dictionaries.
@@ -165,7 +163,7 @@ std::unique_ptr<column> set_keys(
 std::vector<std::unique_ptr<column>> match_dictionaries(
   cudf::host_span<dictionary_column_view const> input,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of group
 }  // namespace dictionary
diff --git a/cpp/include/cudf/filling.hpp b/cpp/include/cudf/filling.hpp
index 054f1e859f4..15a21b44f3b 100644
--- a/cpp/include/cudf/filling.hpp
+++ b/cpp/include/cudf/filling.hpp
@@ -19,9 +19,7 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/export.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <memory>
 
@@ -94,7 +92,7 @@ std::unique_ptr<column> fill(
   size_type end,
   scalar const& value,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Repeat rows of a Table.
@@ -128,7 +126,7 @@ std::unique_ptr<table> repeat(
   table_view const& input_table,
   column_view const& count,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Repeat rows of a Table.
@@ -153,7 +151,7 @@ std::unique_ptr<table> repeat(
   table_view const& input_table,
   size_type count,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Fills a column with a sequence of value specified by an initial value and a step.
@@ -184,7 +182,7 @@ std::unique_ptr<column> sequence(
   scalar const& init,
   scalar const& step,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Fills a column with a sequence of value specified by an initial value and a step of 1.
@@ -211,7 +209,7 @@ std::unique_ptr<column> sequence(
   size_type size,
   scalar const& init,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Generate a sequence of timestamps beginning at `init` and incrementing by `months` for
@@ -242,7 +240,7 @@ std::unique_ptr<cudf::column> calendrical_month_sequence(
   scalar const& init,
   size_type months,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of group
 }  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/groupby.hpp b/cpp/include/cudf/groupby.hpp
index f7df9c1aa9b..11c778408fe 100644
--- a/cpp/include/cudf/groupby.hpp
+++ b/cpp/include/cudf/groupby.hpp
@@ -22,11 +22,10 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <memory>
 #include <utility>
@@ -186,7 +185,7 @@ class groupby {
    */
   std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> aggregate(
     host_span<aggregation_request const> requests,
-    rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+    rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
   /**
    * @copydoc aggregate(host_span<aggregation_request const>, rmm::device_async_resource_ref)
@@ -196,7 +195,7 @@ class groupby {
   std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> aggregate(
     host_span<aggregation_request const> requests,
     rmm::cuda_stream_view stream,
-    rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+    rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
   /**
    * @brief Performs grouped scans on the specified values.
    *
@@ -250,7 +249,7 @@ class groupby {
    */
   std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> scan(
     host_span<scan_request const> requests,
-    rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+    rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
   /**
    * @brief Performs grouped shifts for specified values.
@@ -306,7 +305,7 @@ class groupby {
     table_view const& values,
     host_span<size_type const> offsets,
     std::vector<std::reference_wrapper<scalar const>> const& fill_values,
-    rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+    rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
   /**
    * @brief The grouped data corresponding to a groupby operation on a set of values.
@@ -335,7 +334,7 @@ class groupby {
    * @return A `groups` object representing grouped keys and values
    */
   groups get_groups(cudf::table_view values           = {},
-                    rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+                    rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
   /**
    * @brief Performs grouped replace nulls on @p value
@@ -375,7 +374,7 @@ class groupby {
   std::pair<std::unique_ptr<table>, std::unique_ptr<table>> replace_nulls(
     table_view const& values,
     host_span<cudf::replace_policy const> replace_policies,
-    rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+    rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
  private:
   table_view _keys;                                      ///< Keys that determine grouping
diff --git a/cpp/include/cudf/hashing.hpp b/cpp/include/cudf/hashing.hpp
index b8be2af6967..0c5327edb91 100644
--- a/cpp/include/cudf/hashing.hpp
+++ b/cpp/include/cudf/hashing.hpp
@@ -18,9 +18,7 @@
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/export.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace CUDF_EXPORT cudf {
 
@@ -62,7 +60,7 @@ std::unique_ptr<column> murmurhash3_x86_32(
   table_view const& input,
   uint32_t seed                     = DEFAULT_HASH_SEED,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Computes the MurmurHash3 64-bit hash value of each row in the given table
@@ -81,7 +79,7 @@ std::unique_ptr<table> murmurhash3_x64_128(
   table_view const& input,
   uint64_t seed                     = DEFAULT_HASH_SEED,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Computes the MD5 hash value of each row in the given table
@@ -95,7 +93,7 @@ std::unique_ptr<table> murmurhash3_x64_128(
 std::unique_ptr<column> md5(
   table_view const& input,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Computes the SHA-1 hash value of each row in the given table
@@ -109,7 +107,7 @@ std::unique_ptr<column> md5(
 std::unique_ptr<column> sha1(
   table_view const& input,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Computes the SHA-224 hash value of each row in the given table
@@ -123,7 +121,7 @@ std::unique_ptr<column> sha1(
 std::unique_ptr<column> sha224(
   table_view const& input,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Computes the SHA-256 hash value of each row in the given table
@@ -137,7 +135,7 @@ std::unique_ptr<column> sha224(
 std::unique_ptr<column> sha256(
   table_view const& input,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Computes the SHA-384 hash value of each row in the given table
@@ -151,7 +149,7 @@ std::unique_ptr<column> sha256(
 std::unique_ptr<column> sha384(
   table_view const& input,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Computes the SHA-512 hash value of each row in the given table
@@ -165,7 +163,7 @@ std::unique_ptr<column> sha384(
 std::unique_ptr<column> sha512(
   table_view const& input,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Computes the XXHash_64 hash value of each row in the given table
@@ -183,7 +181,7 @@ std::unique_ptr<column> xxhash_64(
   table_view const& input,
   uint64_t seed                     = DEFAULT_HASH_SEED,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 }  // namespace hashing
 
diff --git a/cpp/include/cudf/hashing/detail/hashing.hpp b/cpp/include/cudf/hashing/detail/hashing.hpp
index 1a459430346..a978e54a1b9 100644
--- a/cpp/include/cudf/hashing/detail/hashing.hpp
+++ b/cpp/include/cudf/hashing/detail/hashing.hpp
@@ -17,9 +17,9 @@
 
 #include <cudf/hashing.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cstddef>
 #include <functional>
diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp
index 0f52b0f7b31..f789d950e51 100644
--- a/cpp/include/cudf/interop.hpp
+++ b/cpp/include/cudf/interop.hpp
@@ -22,11 +22,9 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
-
 #include <utility>
 
 struct DLManagedTensor;
@@ -65,7 +63,7 @@ namespace CUDF_EXPORT cudf {
  */
 std::unique_ptr<table> from_dlpack(
   DLManagedTensor const* managed_tensor,
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Convert a cudf table into a DLPack DLTensor
@@ -87,7 +85,7 @@ std::unique_ptr<table> from_dlpack(
  */
 DLManagedTensor* to_dlpack(
   table_view const& input,
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of group
 
@@ -173,7 +171,7 @@ unique_schema_t to_arrow_schema(cudf::table_view const& input,
 unique_device_array_t to_arrow_device(
   cudf::table&& table,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Create `ArrowDeviceArray` from cudf column and metadata
@@ -202,7 +200,7 @@ unique_device_array_t to_arrow_device(
 unique_device_array_t to_arrow_device(
   cudf::column&& col,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Create `ArrowDeviceArray` from a table view
@@ -234,7 +232,7 @@ unique_device_array_t to_arrow_device(
 unique_device_array_t to_arrow_device(
   cudf::table_view const& table,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Create `ArrowDeviceArray` from a column view
@@ -266,7 +264,7 @@ unique_device_array_t to_arrow_device(
 unique_device_array_t to_arrow_device(
   cudf::column_view const& col,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Copy table view data to host and create `ArrowDeviceArray` for it
@@ -291,7 +289,7 @@ unique_device_array_t to_arrow_device(
 unique_device_array_t to_arrow_host(
   cudf::table_view const& table,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Copy column view data to host and create `ArrowDeviceArray` for it
@@ -316,7 +314,7 @@ unique_device_array_t to_arrow_host(
 unique_device_array_t to_arrow_host(
   cudf::column_view const& col,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Create `cudf::table` from given ArrowArray and ArrowSchema input
@@ -337,7 +335,7 @@ std::unique_ptr<cudf::table> from_arrow(
   ArrowSchema const* schema,
   ArrowArray const* input,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Create `cudf::column` from a given ArrowArray and ArrowSchema input
@@ -356,7 +354,7 @@ std::unique_ptr<cudf::column> from_arrow_column(
   ArrowSchema const* schema,
   ArrowArray const* input,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Create `cudf::table` from given ArrowDeviceArray input
@@ -380,7 +378,7 @@ std::unique_ptr<table> from_arrow_host(
   ArrowSchema const* schema,
   ArrowDeviceArray const* input,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Create `cudf::table` from given ArrowArrayStream input
@@ -398,7 +396,7 @@ std::unique_ptr<table> from_arrow_host(
 std::unique_ptr<table> from_arrow_stream(
   ArrowArrayStream* input,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Create `cudf::column` from given ArrowDeviceArray input
@@ -421,7 +419,7 @@ std::unique_ptr<column> from_arrow_host_column(
   ArrowSchema const* schema,
   ArrowDeviceArray const* input,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief typedef for a vector of owning columns, used for conversion from ArrowDeviceArray
@@ -502,7 +500,7 @@ unique_table_view_t from_arrow_device(
   ArrowSchema const* schema,
   ArrowDeviceArray const* input,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief typedef for a unique_ptr to a `cudf::column_view` with custom deleter
@@ -545,7 +543,7 @@ unique_column_view_t from_arrow_device_column(
   ArrowSchema const* schema,
   ArrowDeviceArray const* input,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of group
 }  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/io/avro.hpp b/cpp/include/cudf/io/avro.hpp
index 63f9ea3a624..b307d05c09d 100644
--- a/cpp/include/cudf/io/avro.hpp
+++ b/cpp/include/cudf/io/avro.hpp
@@ -20,9 +20,7 @@
 
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <memory>
 #include <string>
@@ -217,7 +215,7 @@ class avro_reader_options_builder {
  */
 table_with_metadata read_avro(
   avro_reader_options const& options,
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of group
 }  // namespace io
diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp
index bbb4636a5a3..dae056ef157 100644
--- a/cpp/include/cudf/io/csv.hpp
+++ b/cpp/include/cudf/io/csv.hpp
@@ -20,9 +20,7 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/error.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <memory>
 #include <string>
@@ -1354,7 +1352,7 @@ class csv_reader_options_builder {
 table_with_metadata read_csv(
   csv_reader_options options,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of group
 /**
diff --git a/cpp/include/cudf/io/detail/avro.hpp b/cpp/include/cudf/io/detail/avro.hpp
index 13f695d6866..ab6cb422296 100644
--- a/cpp/include/cudf/io/detail/avro.hpp
+++ b/cpp/include/cudf/io/detail/avro.hpp
@@ -19,9 +19,9 @@
 #include <cudf/io/avro.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace io::detail::avro {
diff --git a/cpp/include/cudf/io/detail/batched_memset.hpp b/cpp/include/cudf/io/detail/batched_memset.hpp
index d0922cc64ee..1c74be4a9fe 100644
--- a/cpp/include/cudf/io/detail/batched_memset.hpp
+++ b/cpp/include/cudf/io/detail/batched_memset.hpp
@@ -16,10 +16,10 @@
 
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/utilities/vector_factories.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cub/device/device_copy.cuh>
 #include <cuda/functional>
@@ -50,7 +50,7 @@ void batched_memset(std::vector<cudf::device_span<T>> const& bufs,
 
   // copy bufs into device memory and then get sizes
   auto gpu_bufs =
-    cudf::detail::make_device_uvector_async(bufs, stream, rmm::mr::get_current_device_resource());
+    cudf::detail::make_device_uvector_async(bufs, stream, cudf::get_current_device_resource_ref());
 
   // get a vector with the sizes of all buffers
   auto sizes = cudf::detail::make_counting_transform_iterator(
@@ -72,7 +72,7 @@ void batched_memset(std::vector<cudf::device_span<T>> const& bufs,
   cub::DeviceCopy::Batched(nullptr, temp_storage_bytes, iter_in, iter_out, sizes, num_bufs, stream);
 
   rmm::device_buffer d_temp_storage(
-    temp_storage_bytes, stream, rmm::mr::get_current_device_resource());
+    temp_storage_bytes, stream, cudf::get_current_device_resource_ref());
 
   cub::DeviceCopy::Batched(
     d_temp_storage.data(), temp_storage_bytes, iter_in, iter_out, sizes, num_bufs, stream);
diff --git a/cpp/include/cudf/io/detail/csv.hpp b/cpp/include/cudf/io/detail/csv.hpp
index d4cad2f70fd..409663938a9 100644
--- a/cpp/include/cudf/io/detail/csv.hpp
+++ b/cpp/include/cudf/io/detail/csv.hpp
@@ -18,9 +18,9 @@
 
 #include <cudf/io/csv.hpp>
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace io::detail::csv {
diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp
index 38ba4f675c3..73ff17b2b93 100644
--- a/cpp/include/cudf/io/detail/json.hpp
+++ b/cpp/include/cudf/io/detail/json.hpp
@@ -19,9 +19,9 @@
 #include <cudf/io/datasource.hpp>
 #include <cudf/io/json.hpp>
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace io::json::detail {
diff --git a/cpp/include/cudf/io/detail/orc.hpp b/cpp/include/cudf/io/detail/orc.hpp
index 7538cf7d29c..4a240d76696 100644
--- a/cpp/include/cudf/io/detail/orc.hpp
+++ b/cpp/include/cudf/io/detail/orc.hpp
@@ -22,9 +22,9 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <memory>
 #include <string>
diff --git a/cpp/include/cudf/io/detail/parquet.hpp b/cpp/include/cudf/io/detail/parquet.hpp
index a6945e0b7ab..1528ac0124a 100644
--- a/cpp/include/cudf/io/detail/parquet.hpp
+++ b/cpp/include/cudf/io/detail/parquet.hpp
@@ -25,10 +25,9 @@
 #include <cudf/io/types.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <string>
 #include <vector>
diff --git a/cpp/include/cudf/io/detail/tokenize_json.hpp b/cpp/include/cudf/io/detail/tokenize_json.hpp
index 715eb855daa..a5b5caf300f 100644
--- a/cpp/include/cudf/io/detail/tokenize_json.hpp
+++ b/cpp/include/cudf/io/detail/tokenize_json.hpp
@@ -18,11 +18,11 @@
 
 #include <cudf/io/json.hpp>
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace cudf::io::json {
 
diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
index fde1857cb7f..a3d6533705e 100644
--- a/cpp/include/cudf/io/json.hpp
+++ b/cpp/include/cudf/io/json.hpp
@@ -20,9 +20,7 @@
 
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <map>
 #include <string>
@@ -675,7 +673,7 @@ class json_reader_options_builder {
 table_with_metadata read_json(
   json_reader_options options,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of group
 
diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp
index 8d484b15872..163fa20806d 100644
--- a/cpp/include/cudf/io/orc.hpp
+++ b/cpp/include/cudf/io/orc.hpp
@@ -21,9 +21,7 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/export.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <memory>
 #include <optional>
@@ -409,7 +407,7 @@ class orc_reader_options_builder {
 table_with_metadata read_orc(
   orc_reader_options const& options,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief The chunked orc reader class to read an ORC file iteratively into a series of
@@ -479,7 +477,7 @@ class chunked_orc_reader {
     size_type output_row_granularity,
     orc_reader_options const& options,
     rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-    rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+    rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
   /**
    * @brief Construct the reader from input/output size limits along with other ORC reader options.
@@ -500,7 +498,7 @@ class chunked_orc_reader {
     std::size_t pass_read_limit,
     orc_reader_options const& options,
     rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-    rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+    rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
   /**
    * @brief Construct the reader from output size limits along with other ORC reader options.
@@ -518,7 +516,7 @@ class chunked_orc_reader {
     std::size_t chunk_read_limit,
     orc_reader_options const& options,
     rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-    rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+    rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
   /**
    * @brief Destructor, destroying the internal reader instance.
diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp
index 64c37f9a9df..ed7b2ac0850 100644
--- a/cpp/include/cudf/io/parquet.hpp
+++ b/cpp/include/cudf/io/parquet.hpp
@@ -22,9 +22,7 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/export.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <iostream>
 #include <memory>
@@ -502,7 +500,7 @@ class parquet_reader_options_builder {
 table_with_metadata read_parquet(
   parquet_reader_options const& options,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief The chunked parquet reader class to read Parquet file iteratively in to a series of
@@ -540,7 +538,7 @@ class chunked_parquet_reader {
     std::size_t chunk_read_limit,
     parquet_reader_options const& options,
     rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-    rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+    rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
   /**
    * @brief Constructor for chunked reader.
@@ -566,7 +564,7 @@ class chunked_parquet_reader {
     std::size_t pass_read_limit,
     parquet_reader_options const& options,
     rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-    rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+    rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
   /**
    * @brief Destructor, destroying the internal reader instance.
diff --git a/cpp/include/cudf/io/text/detail/trie.hpp b/cpp/include/cudf/io/text/detail/trie.hpp
index eee3fefc79f..70e06eeac93 100644
--- a/cpp/include/cudf/io/text/detail/trie.hpp
+++ b/cpp/include/cudf/io/text/detail/trie.hpp
@@ -22,7 +22,6 @@
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/mr/device/device_memory_resource.hpp>
 #include <rmm/resource_ref.hpp>
 
 #include <algorithm>
diff --git a/cpp/include/cudf/io/text/multibyte_split.hpp b/cpp/include/cudf/io/text/multibyte_split.hpp
index 3a1f9611324..99f9e7534ac 100644
--- a/cpp/include/cudf/io/text/multibyte_split.hpp
+++ b/cpp/include/cudf/io/text/multibyte_split.hpp
@@ -19,10 +19,9 @@
 #include <cudf/column/column.hpp>
 #include <cudf/io/text/byte_range_info.hpp>
 #include <cudf/io/text/data_chunk_source.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/mr/device/device_memory_resource.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <memory>
 #include <optional>
@@ -94,7 +93,7 @@ std::unique_ptr<cudf::column> multibyte_split(
   std::string const& delimiter,
   parse_options options             = {},
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of group
 
diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp
index f4139721475..cc8912cb022 100644
--- a/cpp/include/cudf/join.hpp
+++ b/cpp/include/cudf/join.hpp
@@ -22,12 +22,11 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <optional>
 #include <utility>
@@ -109,7 +108,7 @@ std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
 inner_join(cudf::table_view const& left_keys,
            cudf::table_view const& right_keys,
            null_equality compare_nulls       = null_equality::EQUAL,
-           rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+           rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns a pair of row index vectors corresponding to a
@@ -149,7 +148,7 @@ std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
 left_join(cudf::table_view const& left_keys,
           cudf::table_view const& right_keys,
           null_equality compare_nulls       = null_equality::EQUAL,
-          rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+          rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns a pair of row index vectors corresponding to a
@@ -188,7 +187,7 @@ std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
 full_join(cudf::table_view const& left_keys,
           cudf::table_view const& right_keys,
           null_equality compare_nulls       = null_equality::EQUAL,
-          rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+          rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns a vector of row indices corresponding to a left semi-join
@@ -216,7 +215,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> left_semi_join(
   cudf::table_view const& left_keys,
   cudf::table_view const& right_keys,
   null_equality compare_nulls       = null_equality::EQUAL,
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns a vector of row indices corresponding to a left anti join
@@ -247,7 +246,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> left_anti_join(
   cudf::table_view const& left_keys,
   cudf::table_view const& right_keys,
   null_equality compare_nulls       = null_equality::EQUAL,
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Performs a cross join on two tables (`left`, `right`)
@@ -274,7 +273,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> left_anti_join(
 std::unique_ptr<cudf::table> cross_join(
   cudf::table_view const& left,
   cudf::table_view const& right,
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief The enum class to specify if any of the input join tables (`build` table and any later
@@ -353,7 +352,7 @@ class hash_join {
   inner_join(cudf::table_view const& probe,
              std::optional<std::size_t> output_size = {},
              rmm::cuda_stream_view stream           = cudf::get_default_stream(),
-             rmm::device_async_resource_ref mr      = rmm::mr::get_current_device_resource()) const;
+             rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) const;
 
   /**
    * Returns the row indices that can be used to construct the result of performing
@@ -378,7 +377,7 @@ class hash_join {
   left_join(cudf::table_view const& probe,
             std::optional<std::size_t> output_size = {},
             rmm::cuda_stream_view stream           = cudf::get_default_stream(),
-            rmm::device_async_resource_ref mr      = rmm::mr::get_current_device_resource()) const;
+            rmm::device_async_resource_ref mr      = cudf::get_current_device_resource_ref()) const;
 
   /**
    * Returns the row indices that can be used to construct the result of performing
@@ -403,7 +402,7 @@ class hash_join {
   full_join(cudf::table_view const& probe,
             std::optional<std::size_t> output_size = {},
             rmm::cuda_stream_view stream           = cudf::get_default_stream(),
-            rmm::device_async_resource_ref mr      = rmm::mr::get_current_device_resource()) const;
+            rmm::device_async_resource_ref mr      = cudf::get_current_device_resource_ref()) const;
 
   /**
    * Returns the exact number of matches (rows) when performing an inner join with the specified
@@ -455,7 +454,7 @@ class hash_join {
   [[nodiscard]] std::size_t full_join_size(
     cudf::table_view const& probe,
     rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-    rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) const;
+    rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) const;
 
  private:
   const std::unique_ptr<impl_type const> _impl;
@@ -511,7 +510,7 @@ class distinct_hash_join {
   [[nodiscard]] std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
                           std::unique_ptr<rmm::device_uvector<size_type>>>
   inner_join(rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-             rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) const;
+             rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) const;
 
   /**
    * @brief Returns the build table indices that can be used to construct the result of performing
@@ -530,7 +529,7 @@ class distinct_hash_join {
    */
   [[nodiscard]] std::unique_ptr<rmm::device_uvector<size_type>> left_join(
     rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-    rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) const;
+    rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) const;
 
  private:
   using impl_type = typename cudf::detail::distinct_hash_join<HasNested>;  ///< Implementation type
@@ -579,7 +578,7 @@ conditional_inner_join(table_view const& left,
                        table_view const& right,
                        ast::expression const& binary_predicate,
                        std::optional<std::size_t> output_size = {},
-                       rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+                       rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns a pair of row index vectors corresponding to all pairs
@@ -624,7 +623,7 @@ conditional_left_join(table_view const& left,
                       table_view const& right,
                       ast::expression const& binary_predicate,
                       std::optional<std::size_t> output_size = {},
-                      rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+                      rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns a pair of row index vectors corresponding to all pairs
@@ -666,7 +665,7 @@ std::pair<std::unique_ptr<rmm::device_uvector<size_type>>,
 conditional_full_join(table_view const& left,
                       table_view const& right,
                       ast::expression const& binary_predicate,
-                      rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+                      rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns an index vector corresponding to all rows in the left table
@@ -705,7 +704,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> conditional_left_semi_join(
   table_view const& right,
   ast::expression const& binary_predicate,
   std::optional<std::size_t> output_size = {},
-  rmm::device_async_resource_ref mr      = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr      = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns an index vector corresponding to all rows in the left table
@@ -744,7 +743,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> conditional_left_anti_join(
   table_view const& right,
   ast::expression const& binary_predicate,
   std::optional<std::size_t> output_size = {},
-  rmm::device_async_resource_ref mr      = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr      = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns a pair of row index vectors corresponding to all pairs of
@@ -802,7 +801,7 @@ mixed_inner_join(
   ast::expression const& binary_predicate,
   null_equality compare_nulls = null_equality::EQUAL,
   std::optional<std::pair<std::size_t, device_span<size_type const>>> output_size_data = {},
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns a pair of row index vectors corresponding to all pairs of
@@ -862,7 +861,7 @@ mixed_left_join(
   ast::expression const& binary_predicate,
   null_equality compare_nulls = null_equality::EQUAL,
   std::optional<std::pair<std::size_t, device_span<size_type const>>> output_size_data = {},
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns a pair of row index vectors corresponding to all pairs of
@@ -922,7 +921,7 @@ mixed_full_join(
   ast::expression const& binary_predicate,
   null_equality compare_nulls = null_equality::EQUAL,
   std::optional<std::pair<std::size_t, device_span<size_type const>>> output_size_data = {},
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns an index vector corresponding to all rows in the left tables
@@ -969,7 +968,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_left_semi_join(
   table_view const& right_conditional,
   ast::expression const& binary_predicate,
   null_equality compare_nulls       = null_equality::EQUAL,
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns an index vector corresponding to all rows in the left tables
@@ -1017,7 +1016,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_left_anti_join(
   table_view const& right_conditional,
   ast::expression const& binary_predicate,
   null_equality compare_nulls       = null_equality::EQUAL,
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns the exact number of matches (rows) when performing a
@@ -1057,7 +1056,7 @@ std::pair<std::size_t, std::unique_ptr<rmm::device_uvector<size_type>>> mixed_in
   table_view const& right_conditional,
   ast::expression const& binary_predicate,
   null_equality compare_nulls       = null_equality::EQUAL,
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns the exact number of matches (rows) when performing a
@@ -1097,7 +1096,7 @@ std::pair<std::size_t, std::unique_ptr<rmm::device_uvector<size_type>>> mixed_le
   table_view const& right_conditional,
   ast::expression const& binary_predicate,
   null_equality compare_nulls       = null_equality::EQUAL,
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns the exact number of matches (rows) when performing a
@@ -1120,7 +1119,7 @@ std::size_t conditional_inner_join_size(
   table_view const& left,
   table_view const& right,
   ast::expression const& binary_predicate,
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns the exact number of matches (rows) when performing a
@@ -1143,7 +1142,7 @@ std::size_t conditional_left_join_size(
   table_view const& left,
   table_view const& right,
   ast::expression const& binary_predicate,
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns the exact number of matches (rows) when performing a
@@ -1166,7 +1165,7 @@ std::size_t conditional_left_semi_join_size(
   table_view const& left,
   table_view const& right,
   ast::expression const& binary_predicate,
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns the exact number of matches (rows) when performing a
@@ -1189,6 +1188,6 @@ std::size_t conditional_left_anti_join_size(
   table_view const& left,
   table_view const& right,
   ast::expression const& binary_predicate,
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 /** @} */  // end of group
 }  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/json/json.hpp b/cpp/include/cudf/json/json.hpp
index 403374c536d..2ad3421d27d 100644
--- a/cpp/include/cudf/json/json.hpp
+++ b/cpp/include/cudf/json/json.hpp
@@ -18,9 +18,7 @@
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/export.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace CUDF_EXPORT cudf {
 
@@ -169,7 +167,7 @@ std::unique_ptr<cudf::column> get_json_object(
   cudf::string_scalar const& json_path,
   get_json_object_options options   = get_json_object_options{},
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of doxygen group
 }  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/labeling/label_bins.hpp b/cpp/include/cudf/labeling/label_bins.hpp
index 7eb25134ca5..1d0ead35d96 100644
--- a/cpp/include/cudf/labeling/label_bins.hpp
+++ b/cpp/include/cudf/labeling/label_bins.hpp
@@ -19,10 +19,9 @@
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_view.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/mr/device/device_memory_resource.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace CUDF_EXPORT cudf {
 
@@ -76,7 +75,7 @@ std::unique_ptr<column> label_bins(
   column_view const& right_edges,
   inclusive right_inclusive,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of group
 }  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/lists/combine.hpp b/cpp/include/cudf/lists/combine.hpp
index 5a310e6651f..fd2f42cf649 100644
--- a/cpp/include/cudf/lists/combine.hpp
+++ b/cpp/include/cudf/lists/combine.hpp
@@ -18,9 +18,7 @@
 #include <cudf/column/column.hpp>
 #include <cudf/lists/lists_column_view.hpp>
 #include <cudf/utilities/export.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace CUDF_EXPORT cudf {
 
@@ -68,7 +66,7 @@ std::unique_ptr<column> concatenate_rows(
   table_view const& input,
   concatenate_null_policy null_policy = concatenate_null_policy::IGNORE,
   rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr   = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr   = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Concatenating multiple lists on the same row of a lists column into a single list.
@@ -99,7 +97,7 @@ std::unique_ptr<column> concatenate_list_elements(
   column_view const& input,
   concatenate_null_policy null_policy = concatenate_null_policy::IGNORE,
   rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr   = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr   = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of group
 }  // namespace lists
diff --git a/cpp/include/cudf/lists/contains.hpp b/cpp/include/cudf/lists/contains.hpp
index cd0a216488c..e498c60682e 100644
--- a/cpp/include/cudf/lists/contains.hpp
+++ b/cpp/include/cudf/lists/contains.hpp
@@ -18,9 +18,7 @@
 #include <cudf/column/column.hpp>
 #include <cudf/lists/lists_column_view.hpp>
 #include <cudf/utilities/export.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace lists {
@@ -52,7 +50,7 @@ std::unique_ptr<column> contains(
   cudf::lists_column_view const& lists,
   cudf::scalar const& search_key,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Create a column of `bool` values indicating whether the list rows of the first
@@ -76,7 +74,7 @@ std::unique_ptr<column> contains(
   cudf::lists_column_view const& lists,
   cudf::column_view const& search_keys,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Create a column of `bool` values indicating whether each row in the `lists` column
@@ -98,7 +96,7 @@ std::unique_ptr<column> contains(
 std::unique_ptr<column> contains_nulls(
   cudf::lists_column_view const& lists,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Option to choose whether `index_of()` returns the first or last match
@@ -142,7 +140,7 @@ std::unique_ptr<column> index_of(
   cudf::scalar const& search_key,
   duplicate_find_option find_option = duplicate_find_option::FIND_FIRST,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Create a column of values indicating the position of a search key
@@ -179,7 +177,7 @@ std::unique_ptr<column> index_of(
   cudf::column_view const& search_keys,
   duplicate_find_option find_option = duplicate_find_option::FIND_FIRST,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of group
 }  // namespace lists
diff --git a/cpp/include/cudf/lists/count_elements.hpp b/cpp/include/cudf/lists/count_elements.hpp
index a6f2ea6e68a..e7d50f11099 100644
--- a/cpp/include/cudf/lists/count_elements.hpp
+++ b/cpp/include/cudf/lists/count_elements.hpp
@@ -18,9 +18,7 @@
 #include <cudf/column/column.hpp>
 #include <cudf/lists/lists_column_view.hpp>
 #include <cudf/utilities/export.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace lists {
@@ -54,7 +52,7 @@ namespace lists {
 std::unique_ptr<column> count_elements(
   lists_column_view const& input,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of lists_elements group
 
diff --git a/cpp/include/cudf/lists/detail/combine.hpp b/cpp/include/cudf/lists/detail/combine.hpp
index 07309da2814..ee7a6a465c3 100644
--- a/cpp/include/cudf/lists/detail/combine.hpp
+++ b/cpp/include/cudf/lists/detail/combine.hpp
@@ -18,8 +18,7 @@
 #include <cudf/column/column.hpp>
 #include <cudf/lists/combine.hpp>
 #include <cudf/lists/lists_column_view.hpp>
-
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace lists::detail {
diff --git a/cpp/include/cudf/lists/detail/concatenate.hpp b/cpp/include/cudf/lists/detail/concatenate.hpp
index edfa3355dcd..d3a3a48dbb2 100644
--- a/cpp/include/cudf/lists/detail/concatenate.hpp
+++ b/cpp/include/cudf/lists/detail/concatenate.hpp
@@ -19,10 +19,10 @@
 #include <cudf/lists/lists_column_view.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace lists::detail {
diff --git a/cpp/include/cudf/lists/detail/contains.hpp b/cpp/include/cudf/lists/detail/contains.hpp
index 1ca3651b55a..9d30ef90723 100644
--- a/cpp/include/cudf/lists/detail/contains.hpp
+++ b/cpp/include/cudf/lists/detail/contains.hpp
@@ -17,8 +17,7 @@
 
 #include <cudf/lists/contains.hpp>
 #include <cudf/lists/lists_column_view.hpp>
-
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace lists::detail {
diff --git a/cpp/include/cudf/lists/detail/copying.hpp b/cpp/include/cudf/lists/detail/copying.hpp
index 76154ae7064..04e6b18cd27 100644
--- a/cpp/include/cudf/lists/detail/copying.hpp
+++ b/cpp/include/cudf/lists/detail/copying.hpp
@@ -16,9 +16,9 @@
 #pragma once
 
 #include <cudf/lists/lists_column_view.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace lists::detail {
diff --git a/cpp/include/cudf/lists/detail/extract.hpp b/cpp/include/cudf/lists/detail/extract.hpp
index e14b93ff912..7448f513788 100644
--- a/cpp/include/cudf/lists/detail/extract.hpp
+++ b/cpp/include/cudf/lists/detail/extract.hpp
@@ -17,8 +17,7 @@
 
 #include <cudf/lists/extract.hpp>
 #include <cudf/lists/lists_column_view.hpp>
-
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace lists::detail {
diff --git a/cpp/include/cudf/lists/detail/gather.cuh b/cpp/include/cudf/lists/detail/gather.cuh
index 294282d7caa..31b18c90c68 100644
--- a/cpp/include/cudf/lists/detail/gather.cuh
+++ b/cpp/include/cudf/lists/detail/gather.cuh
@@ -22,11 +22,11 @@
 #include <cudf/utilities/bit.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/functional.h>
diff --git a/cpp/include/cudf/lists/detail/interleave_columns.hpp b/cpp/include/cudf/lists/detail/interleave_columns.hpp
index ae8caa853f3..ebf554f0964 100644
--- a/cpp/include/cudf/lists/detail/interleave_columns.hpp
+++ b/cpp/include/cudf/lists/detail/interleave_columns.hpp
@@ -17,9 +17,9 @@
 
 #include <cudf/column/column.hpp>
 #include <cudf/table/table_view.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace lists::detail {
diff --git a/cpp/include/cudf/lists/detail/lists_column_factories.hpp b/cpp/include/cudf/lists/detail/lists_column_factories.hpp
index 18d66f15b1e..b726264aa65 100644
--- a/cpp/include/cudf/lists/detail/lists_column_factories.hpp
+++ b/cpp/include/cudf/lists/detail/lists_column_factories.hpp
@@ -19,9 +19,7 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace lists::detail {
diff --git a/cpp/include/cudf/lists/detail/reverse.hpp b/cpp/include/cudf/lists/detail/reverse.hpp
index d10d7784e6c..a5a86f4d44d 100644
--- a/cpp/include/cudf/lists/detail/reverse.hpp
+++ b/cpp/include/cudf/lists/detail/reverse.hpp
@@ -17,8 +17,7 @@
 
 #include <cudf/lists/reverse.hpp>
 #include <cudf/utilities/export.hpp>
-
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace lists::detail {
diff --git a/cpp/include/cudf/lists/detail/scatter.cuh b/cpp/include/cudf/lists/detail/scatter.cuh
index be76e456900..51f2fa3cd23 100644
--- a/cpp/include/cudf/lists/detail/scatter.cuh
+++ b/cpp/include/cudf/lists/detail/scatter.cuh
@@ -26,11 +26,11 @@
 #include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/distance.h>
diff --git a/cpp/include/cudf/lists/detail/scatter_helper.cuh b/cpp/include/cudf/lists/detail/scatter_helper.cuh
index fc44e0bc290..49678c97554 100644
--- a/cpp/include/cudf/lists/detail/scatter_helper.cuh
+++ b/cpp/include/cudf/lists/detail/scatter_helper.cuh
@@ -20,10 +20,10 @@
 #include <cudf/column/column_view.hpp>
 #include <cudf/lists/list_device_view.cuh>
 #include <cudf/lists/lists_column_view.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <memory>
 
diff --git a/cpp/include/cudf/lists/detail/set_operations.hpp b/cpp/include/cudf/lists/detail/set_operations.hpp
index abfcef72d47..51293969e58 100644
--- a/cpp/include/cudf/lists/detail/set_operations.hpp
+++ b/cpp/include/cudf/lists/detail/set_operations.hpp
@@ -19,10 +19,10 @@
 #include <cudf/column/column.hpp>
 #include <cudf/lists/lists_column_view.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace lists::detail {
diff --git a/cpp/include/cudf/lists/detail/sorting.hpp b/cpp/include/cudf/lists/detail/sorting.hpp
index 8cbfbbae769..748fb7acfee 100644
--- a/cpp/include/cudf/lists/detail/sorting.hpp
+++ b/cpp/include/cudf/lists/detail/sorting.hpp
@@ -16,9 +16,9 @@
 #pragma once
 
 #include <cudf/lists/lists_column_view.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace lists::detail {
diff --git a/cpp/include/cudf/lists/detail/stream_compaction.hpp b/cpp/include/cudf/lists/detail/stream_compaction.hpp
index be0bd27083c..fa7c0c173d2 100644
--- a/cpp/include/cudf/lists/detail/stream_compaction.hpp
+++ b/cpp/include/cudf/lists/detail/stream_compaction.hpp
@@ -18,9 +18,9 @@
 #include <cudf/column/column.hpp>
 #include <cudf/lists/lists_column_view.hpp>
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/mr/device/device_memory_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace lists::detail {
diff --git a/cpp/include/cudf/lists/explode.hpp b/cpp/include/cudf/lists/explode.hpp
index a3375887815..23745e8a443 100644
--- a/cpp/include/cudf/lists/explode.hpp
+++ b/cpp/include/cudf/lists/explode.hpp
@@ -19,9 +19,7 @@
 #include <cudf/column/column.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <memory>
 
@@ -75,7 +73,7 @@ std::unique_ptr<table> explode(
   table_view const& input_table,
   size_type explode_column_idx,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Explodes a list column's elements and includes a position column.
@@ -121,7 +119,7 @@ std::unique_ptr<table> explode_position(
   table_view const& input_table,
   size_type explode_column_idx,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Explodes a list column's elements retaining any null entries or empty lists inside.
@@ -165,7 +163,7 @@ std::unique_ptr<table> explode_outer(
   table_view const& input_table,
   size_type explode_column_idx,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Explodes a list column's elements retaining any null entries or empty lists and includes a
@@ -211,7 +209,7 @@ std::unique_ptr<table> explode_outer_position(
   table_view const& input_table,
   size_type explode_column_idx,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of group
 
diff --git a/cpp/include/cudf/lists/extract.hpp b/cpp/include/cudf/lists/extract.hpp
index 29a02308c66..f584dff6bed 100644
--- a/cpp/include/cudf/lists/extract.hpp
+++ b/cpp/include/cudf/lists/extract.hpp
@@ -19,9 +19,7 @@
 #include <cudf/column/column_view.hpp>
 #include <cudf/lists/lists_column_view.hpp>
 #include <cudf/utilities/export.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace lists {
@@ -69,7 +67,7 @@ std::unique_ptr<column> extract_list_element(
   lists_column_view const& lists_column,
   size_type index,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Create a column where each row is a single element from the corresponding sublist
@@ -110,7 +108,7 @@ std::unique_ptr<column> extract_list_element(
   lists_column_view const& lists_column,
   column_view const& indices,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of group
 }  // namespace lists
diff --git a/cpp/include/cudf/lists/filling.hpp b/cpp/include/cudf/lists/filling.hpp
index a1f3c37ad9e..d887a844aba 100644
--- a/cpp/include/cudf/lists/filling.hpp
+++ b/cpp/include/cudf/lists/filling.hpp
@@ -18,10 +18,9 @@
 
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <memory>
 
@@ -69,7 +68,7 @@ std::unique_ptr<column> sequences(
   column_view const& starts,
   column_view const& sizes,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Create a lists column in which each row contains a sequence of values specified by a tuple
@@ -111,7 +110,7 @@ std::unique_ptr<column> sequences(
   column_view const& steps,
   column_view const& sizes,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of group
 }  // namespace lists
diff --git a/cpp/include/cudf/lists/gather.hpp b/cpp/include/cudf/lists/gather.hpp
index 6359e0488c9..3e3c09cfea1 100644
--- a/cpp/include/cudf/lists/gather.hpp
+++ b/cpp/include/cudf/lists/gather.hpp
@@ -20,9 +20,7 @@
 #include <cudf/copying.hpp>
 #include <cudf/lists/lists_column_view.hpp>
 #include <cudf/utilities/export.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace lists {
@@ -77,7 +75,7 @@ std::unique_ptr<column> segmented_gather(
   lists_column_view const& gather_map_list,
   out_of_bounds_policy bounds_policy = out_of_bounds_policy::DONT_CHECK,
   rmm::cuda_stream_view stream       = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr  = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr  = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of group
 }  // namespace lists
diff --git a/cpp/include/cudf/lists/reverse.hpp b/cpp/include/cudf/lists/reverse.hpp
index f00e6e5117a..0c99dcbe8ae 100644
--- a/cpp/include/cudf/lists/reverse.hpp
+++ b/cpp/include/cudf/lists/reverse.hpp
@@ -18,9 +18,7 @@
 #include <cudf/column/column.hpp>
 #include <cudf/lists/lists_column_view.hpp>
 #include <cudf/utilities/export.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <memory>
 
@@ -52,7 +50,7 @@ namespace lists {
 std::unique_ptr<column> reverse(
   lists_column_view const& input,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of doxygen group
 
diff --git a/cpp/include/cudf/lists/set_operations.hpp b/cpp/include/cudf/lists/set_operations.hpp
index 55b1591fc44..f8ea972528c 100644
--- a/cpp/include/cudf/lists/set_operations.hpp
+++ b/cpp/include/cudf/lists/set_operations.hpp
@@ -19,9 +19,9 @@
 #include <cudf/column/column.hpp>
 #include <cudf/lists/lists_column_view.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/mr/device/device_memory_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace lists {
@@ -64,7 +64,7 @@ std::unique_ptr<column> have_overlap(
   null_equality nulls_equal         = null_equality::EQUAL,
   nan_equality nans_equal           = nan_equality::ALL_EQUAL,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Create a lists column of distinct elements common to two input lists columns.
@@ -101,7 +101,7 @@ std::unique_ptr<column> intersect_distinct(
   null_equality nulls_equal         = null_equality::EQUAL,
   nan_equality nans_equal           = nan_equality::ALL_EQUAL,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Create a lists column of distinct elements found in either of two input lists columns.
@@ -138,7 +138,7 @@ std::unique_ptr<column> union_distinct(
   null_equality nulls_equal         = null_equality::EQUAL,
   nan_equality nans_equal           = nan_equality::ALL_EQUAL,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Create a lists column of distinct elements found only in the left input column.
@@ -175,7 +175,7 @@ std::unique_ptr<column> difference_distinct(
   null_equality nulls_equal         = null_equality::EQUAL,
   nan_equality nans_equal           = nan_equality::ALL_EQUAL,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of group
 }  // namespace lists
diff --git a/cpp/include/cudf/lists/sorting.hpp b/cpp/include/cudf/lists/sorting.hpp
index 39c71f6e9fa..ee18ed57c57 100644
--- a/cpp/include/cudf/lists/sorting.hpp
+++ b/cpp/include/cudf/lists/sorting.hpp
@@ -19,9 +19,7 @@
 #include <cudf/column/column_view.hpp>
 #include <cudf/lists/lists_column_view.hpp>
 #include <cudf/utilities/export.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace lists {
@@ -58,7 +56,7 @@ std::unique_ptr<column> sort_lists(
   order column_order,
   null_order null_precedence,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Segmented sort of the elements within a list in each row of a list column using stable
@@ -71,7 +69,7 @@ std::unique_ptr<column> stable_sort_lists(
   order column_order,
   null_order null_precedence,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of group
 }  // namespace lists
diff --git a/cpp/include/cudf/lists/stream_compaction.hpp b/cpp/include/cudf/lists/stream_compaction.hpp
index 28ef13cd870..59b53c10ac9 100644
--- a/cpp/include/cudf/lists/stream_compaction.hpp
+++ b/cpp/include/cudf/lists/stream_compaction.hpp
@@ -18,10 +18,9 @@
 #include <cudf/column/column.hpp>
 #include <cudf/lists/lists_column_view.hpp>
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/mr/device/device_memory_resource.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace lists {
@@ -65,7 +64,7 @@ std::unique_ptr<column> apply_boolean_mask(
   lists_column_view const& input,
   lists_column_view const& boolean_mask,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Create a new list column without duplicate elements in each list.
@@ -92,7 +91,7 @@ std::unique_ptr<column> distinct(
   null_equality nulls_equal         = null_equality::EQUAL,
   nan_equality nans_equal           = nan_equality::ALL_EQUAL,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of group
 
diff --git a/cpp/include/cudf/merge.hpp b/cpp/include/cudf/merge.hpp
index 83c6ff04500..18701bf8ec6 100644
--- a/cpp/include/cudf/merge.hpp
+++ b/cpp/include/cudf/merge.hpp
@@ -18,9 +18,7 @@
 
 #include <cudf/types.hpp>
 #include <cudf/utilities/export.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <memory>
 #include <vector>
@@ -109,6 +107,6 @@ std::unique_ptr<cudf::table> merge(
   std::vector<cudf::order> const& column_order,
   std::vector<cudf::null_order> const& null_precedence = {},
   rmm::cuda_stream_view stream                         = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr                    = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr                    = cudf::get_current_device_resource_ref());
 /** @} */  // end of group
 }  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/null_mask.hpp b/cpp/include/cudf/null_mask.hpp
index 70ca6aa29c5..fe719bf2c62 100644
--- a/cpp/include/cudf/null_mask.hpp
+++ b/cpp/include/cudf/null_mask.hpp
@@ -18,11 +18,10 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/device_buffer.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <vector>
 
@@ -92,7 +91,7 @@ rmm::device_buffer create_null_mask(
   size_type size,
   mask_state state,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Sets a pre-allocated bitmask buffer to a given state in the range
@@ -135,7 +134,7 @@ rmm::device_buffer copy_bitmask(
   size_type begin_bit,
   size_type end_bit,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Copies `view`'s bitmask from the bits
@@ -152,7 +151,7 @@ rmm::device_buffer copy_bitmask(
 rmm::device_buffer copy_bitmask(
   column_view const& view,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Performs bitwise AND of the bitmasks of columns of a table. Returns
@@ -169,7 +168,7 @@ rmm::device_buffer copy_bitmask(
 std::pair<rmm::device_buffer, size_type> bitmask_and(
   table_view const& view,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Performs bitwise OR of the bitmasks of columns of a table. Returns
@@ -186,7 +185,7 @@ std::pair<rmm::device_buffer, size_type> bitmask_and(
 std::pair<rmm::device_buffer, size_type> bitmask_or(
   table_view const& view,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Given a validity bitmask, counts the number of null elements (unset bits)
diff --git a/cpp/include/cudf/partitioning.hpp b/cpp/include/cudf/partitioning.hpp
index 6a53553063e..385da993262 100644
--- a/cpp/include/cudf/partitioning.hpp
+++ b/cpp/include/cudf/partitioning.hpp
@@ -19,10 +19,9 @@
 #include <cudf/hashing.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <memory>
 #include <vector>
@@ -80,7 +79,7 @@ std::pair<std::unique_ptr<table>, std::vector<size_type>> partition(
   table_view const& t,
   column_view const& partition_map,
   size_type num_partitions,
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Partitions rows from the input table into multiple output tables.
@@ -109,7 +108,7 @@ std::pair<std::unique_ptr<table>, std::vector<size_type>> hash_partition(
   hash_id hash_function             = hash_id::HASH_MURMUR3,
   uint32_t seed                     = DEFAULT_HASH_SEED,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Round-robin partition.
@@ -252,7 +251,7 @@ std::pair<std::unique_ptr<cudf::table>, std::vector<cudf::size_type>> round_robi
   table_view const& input,
   cudf::size_type num_partitions,
   cudf::size_type start_partition   = 0,
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of group
 }  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/quantiles.hpp b/cpp/include/cudf/quantiles.hpp
index 47eac2e72f9..f6bae170f03 100644
--- a/cpp/include/cudf/quantiles.hpp
+++ b/cpp/include/cudf/quantiles.hpp
@@ -21,9 +21,7 @@
 #include <cudf/tdigest/tdigest_column_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/export.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace CUDF_EXPORT cudf {
 /**
@@ -61,7 +59,7 @@ std::unique_ptr<column> quantile(
   interpolation interp               = interpolation::LINEAR,
   column_view const& ordered_indices = {},
   bool exact                         = true,
-  rmm::device_async_resource_ref mr  = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr  = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns the rows of the input corresponding to the requested quantiles.
@@ -100,7 +98,7 @@ std::unique_ptr<table> quantiles(
   cudf::sorted is_input_sorted                   = sorted::NO,
   std::vector<order> const& column_order         = {},
   std::vector<null_order> const& null_precedence = {},
-  rmm::device_async_resource_ref mr              = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr              = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Calculate approximate percentiles on an input tdigest column.
@@ -127,7 +125,7 @@ std::unique_ptr<table> quantiles(
 std::unique_ptr<column> percentile_approx(
   tdigest::tdigest_column_view const& input,
   column_view const& percentiles,
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of group
 }  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/reduction.hpp b/cpp/include/cudf/reduction.hpp
index e42ff5df15d..41be2e70cc3 100644
--- a/cpp/include/cudf/reduction.hpp
+++ b/cpp/include/cudf/reduction.hpp
@@ -19,9 +19,7 @@
 #include <cudf/aggregation.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/utilities/export.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <optional>
 
@@ -85,7 +83,7 @@ std::unique_ptr<scalar> reduce(
   reduce_aggregation const& agg,
   data_type output_dtype,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief  Computes the reduction of the values in all rows of a column with an initial value
@@ -109,7 +107,7 @@ std::unique_ptr<scalar> reduce(
   data_type output_dtype,
   std::optional<std::reference_wrapper<scalar const>> init,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief  Compute reduction of each segment in the input column
@@ -161,7 +159,7 @@ std::unique_ptr<column> segmented_reduce(
   data_type output_dtype,
   null_policy null_handling,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief  Compute reduction of each segment in the input column with an initial value. Only SUM,
@@ -188,7 +186,7 @@ std::unique_ptr<column> segmented_reduce(
   null_policy null_handling,
   std::optional<std::reference_wrapper<scalar const>> init,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief  Computes the scan of a column.
@@ -214,7 +212,7 @@ std::unique_ptr<column> scan(
   scan_type inclusive,
   null_policy null_handling         = null_policy::EXCLUDE,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Determines the minimum and maximum values of a column.
@@ -229,7 +227,7 @@ std::unique_ptr<column> scan(
 std::pair<std::unique_ptr<scalar>, std::unique_ptr<scalar>> minmax(
   column_view const& col,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of group
 
diff --git a/cpp/include/cudf/reduction/detail/histogram.hpp b/cpp/include/cudf/reduction/detail/histogram.hpp
index 5b17df47ec7..c990db32977 100644
--- a/cpp/include/cudf/reduction/detail/histogram.hpp
+++ b/cpp/include/cudf/reduction/detail/histogram.hpp
@@ -20,10 +20,10 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <memory>
 #include <optional>
diff --git a/cpp/include/cudf/reduction/detail/reduction.cuh b/cpp/include/cudf/reduction/detail/reduction.cuh
index 7d1754d86f2..37e1545bcf2 100644
--- a/cpp/include/cudf/reduction/detail/reduction.cuh
+++ b/cpp/include/cudf/reduction/detail/reduction.cuh
@@ -20,13 +20,13 @@
 
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/utilities/cast_functor.cuh>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cub/device/device_reduce.cuh>
 #include <thrust/for_each.h>
diff --git a/cpp/include/cudf/reduction/detail/reduction.hpp b/cpp/include/cudf/reduction/detail/reduction.hpp
index a15783fb460..fd0e3abb529 100644
--- a/cpp/include/cudf/reduction/detail/reduction.hpp
+++ b/cpp/include/cudf/reduction/detail/reduction.hpp
@@ -20,8 +20,7 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/export.hpp>
-
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <optional>
 
diff --git a/cpp/include/cudf/reduction/detail/reduction_functions.hpp b/cpp/include/cudf/reduction/detail/reduction_functions.hpp
index fa21dc87e64..b40211a54ad 100644
--- a/cpp/include/cudf/reduction/detail/reduction_functions.hpp
+++ b/cpp/include/cudf/reduction/detail/reduction_functions.hpp
@@ -21,9 +21,9 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <optional>
 
diff --git a/cpp/include/cudf/reduction/detail/segmented_reduction_functions.hpp b/cpp/include/cudf/reduction/detail/segmented_reduction_functions.hpp
index 1c55b387454..af45a14874b 100644
--- a/cpp/include/cudf/reduction/detail/segmented_reduction_functions.hpp
+++ b/cpp/include/cudf/reduction/detail/segmented_reduction_functions.hpp
@@ -19,11 +19,10 @@
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_view.hpp>
 #include <cudf/scalar/scalar.hpp>
-#include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <optional>
 
diff --git a/cpp/include/cudf/replace.hpp b/cpp/include/cudf/replace.hpp
index 43aabd6c6c6..8d8510da5ea 100644
--- a/cpp/include/cudf/replace.hpp
+++ b/cpp/include/cudf/replace.hpp
@@ -19,9 +19,7 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/export.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <memory>
 
@@ -58,7 +56,7 @@ std::unique_ptr<column> replace_nulls(
   column_view const& input,
   column_view const& replacement,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Replaces all null values in a column with a scalar.
@@ -77,7 +75,7 @@ std::unique_ptr<column> replace_nulls(
   column_view const& input,
   scalar const& replacement,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Replaces all null values in a column with the first non-null value that precedes/follows.
@@ -96,7 +94,7 @@ std::unique_ptr<column> replace_nulls(
   column_view const& input,
   replace_policy const& replace_policy,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Replaces all NaN values in a column with corresponding values from another column
@@ -124,7 +122,7 @@ std::unique_ptr<column> replace_nans(
   column_view const& input,
   column_view const& replacement,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Replaces all NaN values in a column with a scalar
@@ -151,7 +149,7 @@ std::unique_ptr<column> replace_nans(
   column_view const& input,
   scalar const& replacement,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Return a copy of `input_col` replacing any `values_to_replace[i]`
@@ -170,7 +168,7 @@ std::unique_ptr<column> find_and_replace_all(
   column_view const& values_to_replace,
   column_view const& replacement_values,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Replaces values less than `lo` in `input` with `lo_replace`,
@@ -225,7 +223,7 @@ std::unique_ptr<column> clamp(
   scalar const& hi,
   scalar const& hi_replace,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Replaces values less than `lo` in `input` with `lo`,
@@ -271,7 +269,7 @@ std::unique_ptr<column> clamp(
   scalar const& lo,
   scalar const& hi,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Copies from a column of floating-point elements and replaces `-NaN` and `-0.0` with `+NaN`
@@ -291,7 +289,7 @@ std::unique_ptr<column> clamp(
 std::unique_ptr<column> normalize_nans_and_zeros(
   column_view const& input,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Modifies a column of floating-point elements to replace all `-NaN` and `-0.0` with `+NaN`
diff --git a/cpp/include/cudf/reshape.hpp b/cpp/include/cudf/reshape.hpp
index 07aaf6488ad..e437e7abfca 100644
--- a/cpp/include/cudf/reshape.hpp
+++ b/cpp/include/cudf/reshape.hpp
@@ -20,9 +20,7 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/export.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <memory>
 
@@ -55,7 +53,7 @@ namespace CUDF_EXPORT cudf {
 std::unique_ptr<column> interleave_columns(
   table_view const& input,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Repeats the rows from `input` table `count` times to form a new table.
@@ -80,7 +78,7 @@ std::unique_ptr<table> tile(
   table_view const& input,
   size_type count,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Configures whether byte casting flips endianness
@@ -107,7 +105,7 @@ std::unique_ptr<column> byte_cast(
   column_view const& input_column,
   flip_endianness endian_configuration,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of group
 
diff --git a/cpp/include/cudf/rolling.hpp b/cpp/include/cudf/rolling.hpp
index 5a8c454d8fc..8a717c3f510 100644
--- a/cpp/include/cudf/rolling.hpp
+++ b/cpp/include/cudf/rolling.hpp
@@ -19,9 +19,7 @@
 #include <cudf/rolling/range_window_bounds.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/export.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <memory>
 
@@ -70,7 +68,7 @@ std::unique_ptr<column> rolling_window(
   size_type min_periods,
   rolling_aggregation const& agg,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief  @copybrief rolling_window
@@ -95,7 +93,7 @@ std::unique_ptr<column> rolling_window(
   size_type min_periods,
   rolling_aggregation const& agg,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Abstraction for window boundary sizes
@@ -245,7 +243,7 @@ std::unique_ptr<column> grouped_rolling_window(
   size_type min_periods,
   rolling_aggregation const& aggr,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief  @copybrief grouped_rolling_window
@@ -267,7 +265,7 @@ std::unique_ptr<column> grouped_rolling_window(
   size_type min_periods,
   rolling_aggregation const& aggr,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief  @copybrief grouped_rolling_window
@@ -294,7 +292,7 @@ std::unique_ptr<column> grouped_rolling_window(
   size_type min_periods,
   rolling_aggregation const& aggr,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief  @copybrief grouped_rolling_window
@@ -318,7 +316,7 @@ std::unique_ptr<column> grouped_rolling_window(
   size_type min_periods,
   rolling_aggregation const& aggr,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief  Applies a grouping-aware, timestamp-based rolling window function to the values in a
@@ -415,7 +413,7 @@ std::unique_ptr<column> grouped_time_range_rolling_window(
   size_type min_periods,
   rolling_aggregation const& aggr,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief  Applies a grouping-aware, timestamp-based rolling window function to the values in a
@@ -446,7 +444,7 @@ std::unique_ptr<column> grouped_time_range_rolling_window(
   size_type min_periods,
   rolling_aggregation const& aggr,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief  Applies a grouping-aware, value range-based rolling window function to the values in a
@@ -568,7 +566,7 @@ std::unique_ptr<column> grouped_range_rolling_window(
   size_type min_periods,
   rolling_aggregation const& aggr,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief  Applies a variable-size rolling window function to the values in a column.
@@ -613,7 +611,7 @@ std::unique_ptr<column> rolling_window(
   size_type min_periods,
   rolling_aggregation const& agg,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of group
 }  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/round.hpp b/cpp/include/cudf/round.hpp
index ef144b328f7..ba56ff34b97 100644
--- a/cpp/include/cudf/round.hpp
+++ b/cpp/include/cudf/round.hpp
@@ -18,9 +18,7 @@
 
 #include <cudf/column/column.hpp>
 #include <cudf/utilities/export.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace CUDF_EXPORT cudf {
 
@@ -76,7 +74,7 @@ std::unique_ptr<column> round(
   column_view const& input,
   int32_t decimal_places            = 0,
   rounding_method method            = rounding_method::HALF_UP,
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of group
 }  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/scalar/scalar.hpp b/cpp/include/cudf/scalar/scalar.hpp
index 2c5cc60fc70..e8a498afc09 100644
--- a/cpp/include/cudf/scalar/scalar.hpp
+++ b/cpp/include/cudf/scalar/scalar.hpp
@@ -19,13 +19,12 @@
 #include <cudf/table/table.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
 #include <rmm/device_scalar.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 /**
  * @file
@@ -114,7 +113,7 @@ class scalar {
    */
   scalar(scalar const& other,
          rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-         rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+         rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
   /**
    * @brief Construct a new scalar object.
@@ -130,7 +129,7 @@ class scalar {
   scalar(data_type type,
          bool is_valid                     = false,
          rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-         rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+         rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 };
 
 namespace detail {
@@ -166,7 +165,7 @@ class fixed_width_scalar : public scalar {
    */
   fixed_width_scalar(fixed_width_scalar const& other,
                      rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-                     rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+                     rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
   /**
    * @brief Set the value of the scalar.
@@ -217,7 +216,7 @@ class fixed_width_scalar : public scalar {
   fixed_width_scalar(T value,
                      bool is_valid                     = true,
                      rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-                     rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+                     rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
   /**
    * @brief Construct a new fixed width scalar object from existing device memory.
@@ -230,7 +229,7 @@ class fixed_width_scalar : public scalar {
   fixed_width_scalar(rmm::device_scalar<T>&& data,
                      bool is_valid                     = true,
                      rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-                     rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+                     rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 };
 
 }  // namespace detail
@@ -266,7 +265,7 @@ class numeric_scalar : public detail::fixed_width_scalar<T> {
    */
   numeric_scalar(numeric_scalar const& other,
                  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-                 rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+                 rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
   /**
    * @brief Construct a new numeric scalar object.
@@ -279,7 +278,7 @@ class numeric_scalar : public detail::fixed_width_scalar<T> {
   numeric_scalar(T value,
                  bool is_valid                     = true,
                  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-                 rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+                 rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
   /**
    * @brief Construct a new numeric scalar object from existing device memory.
@@ -292,7 +291,7 @@ class numeric_scalar : public detail::fixed_width_scalar<T> {
   numeric_scalar(rmm::device_scalar<T>&& data,
                  bool is_valid                     = true,
                  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-                 rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+                 rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 };
 
 /**
@@ -329,7 +328,7 @@ class fixed_point_scalar : public scalar {
    */
   fixed_point_scalar(fixed_point_scalar const& other,
                      rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-                     rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+                     rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
   /**
    * @brief Construct a new fixed_point scalar object from already shifted value and scale.
@@ -344,7 +343,7 @@ class fixed_point_scalar : public scalar {
                      numeric::scale_type scale,
                      bool is_valid                     = true,
                      rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-                     rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+                     rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
   /**
    * @brief Construct a new fixed_point scalar object from a value and default 0-scale.
@@ -357,7 +356,7 @@ class fixed_point_scalar : public scalar {
   fixed_point_scalar(rep_type value,
                      bool is_valid                     = true,
                      rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-                     rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+                     rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
   /**
    * @brief Construct a new fixed_point scalar object from a fixed_point number.
@@ -370,7 +369,7 @@ class fixed_point_scalar : public scalar {
   fixed_point_scalar(T value,
                      bool is_valid                     = true,
                      rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-                     rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+                     rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
   /**
    * @brief Construct a new fixed_point scalar object from existing device memory.
@@ -385,7 +384,7 @@ class fixed_point_scalar : public scalar {
                      numeric::scale_type scale,
                      bool is_valid                     = true,
                      rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-                     rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+                     rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
   /**
    * @brief Get the value of the scalar.
@@ -454,7 +453,7 @@ class string_scalar : public scalar {
    */
   string_scalar(string_scalar const& other,
                 rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-                rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+                rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
   /**
    * @brief Construct a new string scalar object.
@@ -469,7 +468,7 @@ class string_scalar : public scalar {
   string_scalar(std::string const& string,
                 bool is_valid                     = true,
                 rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-                rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+                rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
   /**
    * @brief Construct a new string scalar object from string_view.
@@ -484,7 +483,7 @@ class string_scalar : public scalar {
   string_scalar(value_type const& source,
                 bool is_valid                     = true,
                 rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-                rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+                rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
   /**
    * @brief Construct a new string scalar object from string_view in device memory.
@@ -499,7 +498,7 @@ class string_scalar : public scalar {
   string_scalar(rmm::device_scalar<value_type>& data,
                 bool is_valid                     = true,
                 rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-                rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+                rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
   /**
    * @brief Construct a new string scalar object by moving an existing string data buffer.
@@ -515,7 +514,7 @@ class string_scalar : public scalar {
   string_scalar(rmm::device_buffer&& data,
                 bool is_valid                     = true,
                 rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-                rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+                rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
   /**
    * @brief Explicit conversion operator to get the value of the scalar in a host std::string.
@@ -587,7 +586,7 @@ class chrono_scalar : public detail::fixed_width_scalar<T> {
    */
   chrono_scalar(chrono_scalar const& other,
                 rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-                rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+                rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
   /**
    * @brief Construct a new chrono scalar object.
@@ -600,7 +599,7 @@ class chrono_scalar : public detail::fixed_width_scalar<T> {
   chrono_scalar(T value,
                 bool is_valid                     = true,
                 rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-                rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+                rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
   /**
    * @brief Construct a new chrono scalar object from existing device memory.
@@ -613,7 +612,7 @@ class chrono_scalar : public detail::fixed_width_scalar<T> {
   chrono_scalar(rmm::device_scalar<T>&& data,
                 bool is_valid                     = true,
                 rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-                rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+                rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 };
 
 /**
@@ -646,7 +645,7 @@ class timestamp_scalar : public chrono_scalar<T> {
    */
   timestamp_scalar(timestamp_scalar const& other,
                    rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-                   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+                   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
   /**
    * @brief Construct a new timestamp scalar object from a duration that is
@@ -662,7 +661,7 @@ class timestamp_scalar : public chrono_scalar<T> {
   timestamp_scalar(Duration2 const& value,
                    bool is_valid,
                    rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-                   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+                   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
   /**
    * @brief Returns the duration in number of ticks since the UNIX epoch.
@@ -702,7 +701,7 @@ class duration_scalar : public chrono_scalar<T> {
    */
   duration_scalar(duration_scalar const& other,
                   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-                  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+                  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
   /**
    * @brief Construct a new duration scalar object from tick counts.
@@ -715,7 +714,7 @@ class duration_scalar : public chrono_scalar<T> {
   duration_scalar(rep_type value,
                   bool is_valid,
                   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-                  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+                  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
   /**
    * @brief Returns the duration in number of ticks.
@@ -751,7 +750,7 @@ class list_scalar : public scalar {
    */
   list_scalar(list_scalar const& other,
               rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-              rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+              rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
   /**
    * @brief Construct a new list scalar object from column_view.
@@ -766,7 +765,7 @@ class list_scalar : public scalar {
   list_scalar(cudf::column_view const& data,
               bool is_valid                     = true,
               rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-              rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+              rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
   /**
    * @brief Construct a new list scalar object from existing column.
@@ -779,7 +778,7 @@ class list_scalar : public scalar {
   list_scalar(cudf::column&& data,
               bool is_valid                     = true,
               rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-              rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+              rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
   /**
    * @brief Returns a non-owning, immutable view to underlying device data.
@@ -816,7 +815,7 @@ class struct_scalar : public scalar {
    */
   struct_scalar(struct_scalar const& other,
                 rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-                rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+                rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
   /**
    * @brief Construct a new struct scalar object from table_view.
@@ -831,7 +830,7 @@ class struct_scalar : public scalar {
   struct_scalar(table_view const& data,
                 bool is_valid                     = true,
                 rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-                rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+                rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
   /**
    * @brief Construct a new struct scalar object from a host_span of column_views.
@@ -846,7 +845,7 @@ class struct_scalar : public scalar {
   struct_scalar(host_span<column_view const> data,
                 bool is_valid                     = true,
                 rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-                rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+                rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
   /**
    * @brief Construct a new struct scalar object from an existing table in device memory.
@@ -862,7 +861,7 @@ class struct_scalar : public scalar {
   struct_scalar(table&& data,
                 bool is_valid                     = true,
                 rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-                rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+                rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
   /**
    * @brief Returns a non-owning, immutable view to underlying device data.
diff --git a/cpp/include/cudf/scalar/scalar_factories.hpp b/cpp/include/cudf/scalar/scalar_factories.hpp
index a422c3bfbe9..87700115996 100644
--- a/cpp/include/cudf/scalar/scalar_factories.hpp
+++ b/cpp/include/cudf/scalar/scalar_factories.hpp
@@ -17,10 +17,9 @@
 
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace CUDF_EXPORT cudf {
 /**
@@ -45,7 +44,7 @@ namespace CUDF_EXPORT cudf {
 std::unique_ptr<scalar> make_numeric_scalar(
   data_type type,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Construct scalar with uninitialized storage to hold a value of the
@@ -62,7 +61,7 @@ std::unique_ptr<scalar> make_numeric_scalar(
 std::unique_ptr<scalar> make_timestamp_scalar(
   data_type type,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Construct scalar with uninitialized storage to hold a value of the
@@ -79,7 +78,7 @@ std::unique_ptr<scalar> make_timestamp_scalar(
 std::unique_ptr<scalar> make_duration_scalar(
   data_type type,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Construct scalar with uninitialized storage to hold a value of the
@@ -96,7 +95,7 @@ std::unique_ptr<scalar> make_duration_scalar(
 std::unique_ptr<scalar> make_fixed_width_scalar(
   data_type type,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Construct STRING type scalar given a `std::string`.
@@ -113,7 +112,7 @@ std::unique_ptr<scalar> make_fixed_width_scalar(
 std::unique_ptr<scalar> make_string_scalar(
   std::string const& string,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Constructs default constructed scalar of type `type`
@@ -128,7 +127,7 @@ std::unique_ptr<scalar> make_string_scalar(
 std::unique_ptr<scalar> make_default_constructed_scalar(
   data_type type,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Creates an empty (invalid) scalar of the same type as the `input` column_view.
@@ -143,7 +142,7 @@ std::unique_ptr<scalar> make_default_constructed_scalar(
 std::unique_ptr<scalar> make_empty_scalar_like(
   column_view const& input,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Construct scalar using the given value of fixed width type
@@ -158,7 +157,7 @@ template <typename T>
 std::unique_ptr<scalar> make_fixed_width_scalar(
   T value,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref())
 {
   return std::make_unique<scalar_type_t<T>>(value, true, stream, mr);
 }
@@ -178,7 +177,7 @@ std::unique_ptr<scalar> make_fixed_point_scalar(
   typename T::rep value,
   numeric::scale_type scale,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref())
 {
   return std::make_unique<scalar_type_t<T>>(value, scale, true, stream, mr);
 }
@@ -194,7 +193,7 @@ std::unique_ptr<scalar> make_fixed_point_scalar(
 std::unique_ptr<scalar> make_list_scalar(
   column_view elements,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Construct a struct scalar using the given table_view.
@@ -209,7 +208,7 @@ std::unique_ptr<scalar> make_list_scalar(
 std::unique_ptr<scalar> make_struct_scalar(
   table_view const& data,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Construct a struct scalar using the given span of column views.
@@ -224,7 +223,7 @@ std::unique_ptr<scalar> make_struct_scalar(
 std::unique_ptr<scalar> make_struct_scalar(
   host_span<column_view const> data,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of group
 }  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/search.hpp b/cpp/include/cudf/search.hpp
index ad170ec726b..e10c8c8b4d2 100644
--- a/cpp/include/cudf/search.hpp
+++ b/cpp/include/cudf/search.hpp
@@ -21,9 +21,7 @@
 #include <cudf/table/table.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/export.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <vector>
 
@@ -75,7 +73,7 @@ std::unique_ptr<column> lower_bound(
   std::vector<order> const& column_order,
   std::vector<null_order> const& null_precedence,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Find largest indices in a sorted table where values should be inserted to maintain order.
@@ -117,7 +115,7 @@ std::unique_ptr<column> upper_bound(
   std::vector<order> const& column_order,
   std::vector<null_order> const& null_precedence,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Check if the given `needle` value exists in the `haystack` column.
@@ -166,7 +164,7 @@ std::unique_ptr<column> contains(
   column_view const& haystack,
   column_view const& needles,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of group
 }  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/sorting.hpp b/cpp/include/cudf/sorting.hpp
index 4cb265a2a0b..b773f76defe 100644
--- a/cpp/include/cudf/sorting.hpp
+++ b/cpp/include/cudf/sorting.hpp
@@ -20,9 +20,7 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/export.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <memory>
 #include <vector>
@@ -56,7 +54,7 @@ std::unique_ptr<column> sorted_order(
   std::vector<order> const& column_order         = {},
   std::vector<null_order> const& null_precedence = {},
   rmm::cuda_stream_view stream                   = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr              = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr              = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Computes the row indices that would produce `input` in a stable
@@ -71,7 +69,7 @@ std::unique_ptr<column> stable_sorted_order(
   std::vector<order> const& column_order         = {},
   std::vector<null_order> const& null_precedence = {},
   rmm::cuda_stream_view stream                   = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr              = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr              = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Checks whether the rows of a `table` are sorted in a lexicographical
@@ -115,7 +113,7 @@ std::unique_ptr<table> sort(
   std::vector<order> const& column_order         = {},
   std::vector<null_order> const& null_precedence = {},
   rmm::cuda_stream_view stream                   = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr              = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr              = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Performs a stable lexicographic sort of the rows of a table
@@ -127,7 +125,7 @@ std::unique_ptr<table> stable_sort(
   std::vector<order> const& column_order         = {},
   std::vector<null_order> const& null_precedence = {},
   rmm::cuda_stream_view stream                   = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr              = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr              = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Performs a key-value sort.
@@ -157,7 +155,7 @@ std::unique_ptr<table> sort_by_key(
   std::vector<order> const& column_order         = {},
   std::vector<null_order> const& null_precedence = {},
   rmm::cuda_stream_view stream                   = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr              = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr              = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Performs a key-value stable sort.
@@ -170,7 +168,7 @@ std::unique_ptr<table> stable_sort_by_key(
   std::vector<order> const& column_order         = {},
   std::vector<null_order> const& null_precedence = {},
   rmm::cuda_stream_view stream                   = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr              = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr              = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Computes the ranks of input column in sorted order.
@@ -210,7 +208,7 @@ std::unique_ptr<column> rank(
   null_order null_precedence,
   bool percentage,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns sorted order after sorting each segment in the table.
@@ -261,7 +259,7 @@ std::unique_ptr<column> segmented_sorted_order(
   std::vector<order> const& column_order         = {},
   std::vector<null_order> const& null_precedence = {},
   rmm::cuda_stream_view stream                   = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr              = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr              = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns sorted order after stably sorting each segment in the table.
@@ -274,7 +272,7 @@ std::unique_ptr<column> stable_segmented_sorted_order(
   std::vector<order> const& column_order         = {},
   std::vector<null_order> const& null_precedence = {},
   rmm::cuda_stream_view stream                   = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr              = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr              = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Performs a lexicographic segmented sort of a table
@@ -330,7 +328,7 @@ std::unique_ptr<table> segmented_sort_by_key(
   std::vector<order> const& column_order         = {},
   std::vector<null_order> const& null_precedence = {},
   rmm::cuda_stream_view stream                   = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr              = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr              = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Performs a stably lexicographic segmented sort of a table
@@ -344,7 +342,7 @@ std::unique_ptr<table> stable_segmented_sort_by_key(
   std::vector<order> const& column_order         = {},
   std::vector<null_order> const& null_precedence = {},
   rmm::cuda_stream_view stream                   = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr              = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr              = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of group
 }  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/stream_compaction.hpp b/cpp/include/cudf/stream_compaction.hpp
index ced8d5849d0..ed0730d50a4 100644
--- a/cpp/include/cudf/stream_compaction.hpp
+++ b/cpp/include/cudf/stream_compaction.hpp
@@ -19,9 +19,7 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/export.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <memory>
 #include <vector>
@@ -77,7 +75,7 @@ std::unique_ptr<table> drop_nulls(
   std::vector<size_type> const& keys,
   cudf::size_type keep_threshold,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Filters a table to remove null elements.
@@ -110,7 +108,7 @@ std::unique_ptr<table> drop_nulls(
   table_view const& input,
   std::vector<size_type> const& keys,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Filters a table to remove NANs with threshold count.
@@ -155,7 +153,7 @@ std::unique_ptr<table> drop_nans(
   std::vector<size_type> const& keys,
   cudf::size_type keep_threshold,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Filters a table to remove NANs.
@@ -189,7 +187,7 @@ std::unique_ptr<table> drop_nans(
   table_view const& input,
   std::vector<size_type> const& keys,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Filters `input` using `boolean_mask` of boolean values as a mask.
@@ -217,7 +215,7 @@ std::unique_ptr<table> apply_boolean_mask(
   table_view const& input,
   column_view const& boolean_mask,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Choices for drop_duplicates API for retainment of duplicate rows
@@ -263,7 +261,7 @@ std::unique_ptr<table> unique(
   duplicate_keep_option keep,
   null_equality nulls_equal         = null_equality::EQUAL,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Create a new table without duplicate rows.
@@ -292,7 +290,7 @@ std::unique_ptr<table> distinct(
   null_equality nulls_equal         = null_equality::EQUAL,
   nan_equality nans_equal           = nan_equality::ALL_EQUAL,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Create a column of indices of all distinct rows in the input table.
@@ -314,7 +312,7 @@ std::unique_ptr<column> distinct_indices(
   null_equality nulls_equal         = null_equality::EQUAL,
   nan_equality nans_equal           = nan_equality::ALL_EQUAL,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Create a new table without duplicate rows, preserving input order.
@@ -346,7 +344,7 @@ std::unique_ptr<table> stable_distinct(
   null_equality nulls_equal         = null_equality::EQUAL,
   nan_equality nans_equal           = nan_equality::ALL_EQUAL,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Count the number of consecutive groups of equivalent rows in a column.
diff --git a/cpp/include/cudf/strings/attributes.hpp b/cpp/include/cudf/strings/attributes.hpp
index 323290e907c..5f2eda8fa5b 100644
--- a/cpp/include/cudf/strings/attributes.hpp
+++ b/cpp/include/cudf/strings/attributes.hpp
@@ -17,9 +17,7 @@
 
 #include <cudf/column/column.hpp>
 #include <cudf/strings/strings_column_view.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace CUDF_EXPORT cudf {
 
@@ -48,7 +46,7 @@ namespace strings {
  */
 std::unique_ptr<column> count_characters(
   strings_column_view const& input,
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns a column containing byte lengths
@@ -66,7 +64,7 @@ std::unique_ptr<column> count_characters(
  */
 std::unique_ptr<column> count_bytes(
   strings_column_view const& input,
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Creates a numeric column with code point values (integers) for each
@@ -86,7 +84,7 @@ std::unique_ptr<column> count_bytes(
  */
 std::unique_ptr<column> code_points(
   strings_column_view const& input,
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of strings_apis group
 
diff --git a/cpp/include/cudf/strings/capitalize.hpp b/cpp/include/cudf/strings/capitalize.hpp
index 420b46a05b2..312e3a5bef1 100644
--- a/cpp/include/cudf/strings/capitalize.hpp
+++ b/cpp/include/cudf/strings/capitalize.hpp
@@ -19,9 +19,7 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/char_types/char_types.hpp>
 #include <cudf/strings/strings_column_view.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace strings {
@@ -63,7 +61,7 @@ std::unique_ptr<column> capitalize(
   strings_column_view const& input,
   string_scalar const& delimiters   = string_scalar("", true, cudf::get_default_stream()),
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Modifies first character of each word to upper-case and lower-cases the rest.
@@ -96,7 +94,7 @@ std::unique_ptr<column> title(
   strings_column_view const& input,
   string_character_types sequence_type = string_character_types::ALPHA,
   rmm::cuda_stream_view stream         = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr    = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr    = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Checks if the strings in the input column are title formatted.
@@ -125,7 +123,7 @@ std::unique_ptr<column> title(
 std::unique_ptr<column> is_title(
   strings_column_view const& input,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/case.hpp b/cpp/include/cudf/strings/case.hpp
index 45f56a681a6..c2bd559accc 100644
--- a/cpp/include/cudf/strings/case.hpp
+++ b/cpp/include/cudf/strings/case.hpp
@@ -17,9 +17,7 @@
 
 #include <cudf/column/column.hpp>
 #include <cudf/strings/strings_column_view.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace strings {
@@ -46,7 +44,7 @@ namespace strings {
 std::unique_ptr<column> to_lower(
   strings_column_view const& strings,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Converts a column of strings to upper case.
@@ -65,7 +63,7 @@ std::unique_ptr<column> to_lower(
 std::unique_ptr<column> to_upper(
   strings_column_view const& strings,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns a column of strings converting lower case characters to
@@ -85,7 +83,7 @@ std::unique_ptr<column> to_upper(
 std::unique_ptr<column> swapcase(
   strings_column_view const& strings,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/char_types/char_types.hpp b/cpp/include/cudf/strings/char_types/char_types.hpp
index a6af681eec6..3ebe5cb53e9 100644
--- a/cpp/include/cudf/strings/char_types/char_types.hpp
+++ b/cpp/include/cudf/strings/char_types/char_types.hpp
@@ -19,9 +19,7 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/char_types/char_types_enum.hpp>
 #include <cudf/strings/strings_column_view.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace strings {
@@ -68,7 +66,7 @@ std::unique_ptr<column> all_characters_of_type(
   string_character_types types,
   string_character_types verify_types = string_character_types::ALL_TYPES,
   rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr   = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr   = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Filter specific character types from a column of strings.
@@ -115,7 +113,7 @@ std::unique_ptr<column> filter_characters_of_type(
   string_scalar const& replacement     = string_scalar(""),
   string_character_types types_to_keep = string_character_types::ALL_TYPES,
   rmm::cuda_stream_view stream         = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr    = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr    = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/combine.hpp b/cpp/include/cudf/strings/combine.hpp
index 2cade813d78..d766fba0cdc 100644
--- a/cpp/include/cudf/strings/combine.hpp
+++ b/cpp/include/cudf/strings/combine.hpp
@@ -20,9 +20,7 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/table/table_view.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace strings {
@@ -81,7 +79,7 @@ std::unique_ptr<column> join_strings(
   string_scalar const& separator    = string_scalar(""),
   string_scalar const& narep        = string_scalar("", false),
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Concatenates a list of strings columns using separators for each row
@@ -149,7 +147,7 @@ std::unique_ptr<column> concatenate(
   string_scalar const& col_narep       = string_scalar("", false),
   separator_on_nulls separate_nulls    = separator_on_nulls::YES,
   rmm::cuda_stream_view stream         = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr    = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr    = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Row-wise concatenates the given list of strings columns and
@@ -204,7 +202,7 @@ std::unique_ptr<column> concatenate(
   string_scalar const& narep        = string_scalar("", false),
   separator_on_nulls separate_nulls = separator_on_nulls::YES,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Given a lists column of strings (each row is a list of strings), concatenates the strings
@@ -271,7 +269,7 @@ std::unique_ptr<column> join_list_elements(
   separator_on_nulls separate_nulls      = separator_on_nulls::YES,
   output_if_empty_list empty_list_policy = output_if_empty_list::EMPTY_STRING,
   rmm::cuda_stream_view stream           = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr      = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr      = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Given a lists column of strings (each row is a list of strings), concatenates the strings
@@ -330,7 +328,7 @@ std::unique_ptr<column> join_list_elements(
   separator_on_nulls separate_nulls      = separator_on_nulls::YES,
   output_if_empty_list empty_list_policy = output_if_empty_list::EMPTY_STRING,
   rmm::cuda_stream_view stream           = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr      = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr      = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/contains.hpp b/cpp/include/cudf/strings/contains.hpp
index 59c9b2dea40..2a25ac79bbb 100644
--- a/cpp/include/cudf/strings/contains.hpp
+++ b/cpp/include/cudf/strings/contains.hpp
@@ -19,9 +19,7 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/regex/flags.hpp>
 #include <cudf/strings/strings_column_view.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace strings {
@@ -61,7 +59,7 @@ std::unique_ptr<column> contains_re(
   strings_column_view const& input,
   regex_program const& prog,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns a boolean column identifying rows which
@@ -89,7 +87,7 @@ std::unique_ptr<column> matches_re(
   strings_column_view const& input,
   regex_program const& prog,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns the number of times the given regex_program's pattern
@@ -117,7 +115,7 @@ std::unique_ptr<column> count_re(
   strings_column_view const& input,
   regex_program const& prog,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns a boolean column identifying rows which
@@ -164,7 +162,7 @@ std::unique_ptr<column> like(
   string_scalar const& pattern,
   string_scalar const& escape_character = string_scalar(""),
   rmm::cuda_stream_view stream          = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr     = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr     = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns a boolean column identifying rows which
@@ -205,7 +203,7 @@ std::unique_ptr<column> like(
   strings_column_view const& patterns,
   string_scalar const& escape_character = string_scalar(""),
   rmm::cuda_stream_view stream          = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr     = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr     = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/convert/convert_booleans.hpp b/cpp/include/cudf/strings/convert/convert_booleans.hpp
index d79dd4a80ea..bf7b6c1525b 100644
--- a/cpp/include/cudf/strings/convert/convert_booleans.hpp
+++ b/cpp/include/cudf/strings/convert/convert_booleans.hpp
@@ -18,9 +18,7 @@
 #include <cudf/column/column.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/strings_column_view.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace strings {
@@ -46,7 +44,7 @@ std::unique_ptr<column> to_booleans(
   strings_column_view const& input,
   string_scalar const& true_string,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns a new strings column converting the boolean values from the
@@ -68,7 +66,7 @@ std::unique_ptr<column> from_booleans(
   string_scalar const& true_string,
   string_scalar const& false_string,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/convert/convert_datetime.hpp b/cpp/include/cudf/strings/convert/convert_datetime.hpp
index c3b3c91ab35..04eba83925d 100644
--- a/cpp/include/cudf/strings/convert/convert_datetime.hpp
+++ b/cpp/include/cudf/strings/convert/convert_datetime.hpp
@@ -17,9 +17,7 @@
 
 #include <cudf/column/column.hpp>
 #include <cudf/strings/strings_column_view.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <string>
 #include <vector>
@@ -90,7 +88,7 @@ std::unique_ptr<column> to_timestamps(
   data_type timestamp_type,
   std::string_view format,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Verifies the given strings column can be parsed to timestamps using the provided format
@@ -137,7 +135,7 @@ std::unique_ptr<column> is_timestamp(
   strings_column_view const& input,
   std::string_view format,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns a new strings column converting a timestamp column into
@@ -251,7 +249,7 @@ std::unique_ptr<column> from_timestamps(
   strings_column_view const& names  = strings_column_view(column_view{
     data_type{type_id::STRING}, 0, nullptr, nullptr, 0}),
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/convert/convert_durations.hpp b/cpp/include/cudf/strings/convert/convert_durations.hpp
index 8b69968a609..25184cbfd02 100644
--- a/cpp/include/cudf/strings/convert/convert_durations.hpp
+++ b/cpp/include/cudf/strings/convert/convert_durations.hpp
@@ -17,9 +17,7 @@
 
 #include <cudf/column/column.hpp>
 #include <cudf/strings/strings_column_view.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace strings {
@@ -78,7 +76,7 @@ std::unique_ptr<column> to_durations(
   data_type duration_type,
   std::string_view format,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns a new strings column converting a duration column into
@@ -129,7 +127,7 @@ std::unique_ptr<column> from_durations(
   column_view const& durations,
   std::string_view format           = "%D days %H:%M:%S",
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/convert/convert_fixed_point.hpp b/cpp/include/cudf/strings/convert/convert_fixed_point.hpp
index a9c5aea6343..6d5e94a8e02 100644
--- a/cpp/include/cudf/strings/convert/convert_fixed_point.hpp
+++ b/cpp/include/cudf/strings/convert/convert_fixed_point.hpp
@@ -17,9 +17,7 @@
 
 #include <cudf/column/column.hpp>
 #include <cudf/strings/strings_column_view.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace strings {
@@ -64,7 +62,7 @@ std::unique_ptr<column> to_fixed_point(
   strings_column_view const& input,
   data_type output_type,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns a new strings column converting the fixed-point values
@@ -94,7 +92,7 @@ std::unique_ptr<column> to_fixed_point(
 std::unique_ptr<column> from_fixed_point(
   column_view const& input,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns a boolean column identifying strings in which all
@@ -126,7 +124,7 @@ std::unique_ptr<column> is_fixed_point(
   strings_column_view const& input,
   data_type decimal_type            = data_type{type_id::DECIMAL64},
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/convert/convert_floats.hpp b/cpp/include/cudf/strings/convert/convert_floats.hpp
index 64e9bb776f4..52fb47df94f 100644
--- a/cpp/include/cudf/strings/convert/convert_floats.hpp
+++ b/cpp/include/cudf/strings/convert/convert_floats.hpp
@@ -17,9 +17,7 @@
 
 #include <cudf/column/column.hpp>
 #include <cudf/strings/strings_column_view.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace strings {
@@ -50,7 +48,7 @@ std::unique_ptr<column> to_floats(
   strings_column_view const& strings,
   data_type output_type,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns a new strings column converting the float values from the
@@ -73,7 +71,7 @@ std::unique_ptr<column> to_floats(
 std::unique_ptr<column> from_floats(
   column_view const& floats,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns a boolean column identifying strings in which all
@@ -99,7 +97,7 @@ std::unique_ptr<column> from_floats(
 std::unique_ptr<column> is_float(
   strings_column_view const& input,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/convert/convert_integers.hpp b/cpp/include/cudf/strings/convert/convert_integers.hpp
index 62eb1fdda4d..9aad32bfba4 100644
--- a/cpp/include/cudf/strings/convert/convert_integers.hpp
+++ b/cpp/include/cudf/strings/convert/convert_integers.hpp
@@ -17,9 +17,7 @@
 
 #include <cudf/column/column.hpp>
 #include <cudf/strings/strings_column_view.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace strings {
@@ -57,7 +55,7 @@ std::unique_ptr<column> to_integers(
   strings_column_view const& input,
   data_type output_type,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns a new strings column converting the integer values from the
@@ -78,7 +76,7 @@ std::unique_ptr<column> to_integers(
 std::unique_ptr<column> from_integers(
   column_view const& integers,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns a boolean column identifying strings in which all
@@ -107,7 +105,7 @@ std::unique_ptr<column> from_integers(
 std::unique_ptr<column> is_integer(
   strings_column_view const& input,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns a boolean column identifying strings in which all
@@ -141,7 +139,7 @@ std::unique_ptr<column> is_integer(
   strings_column_view const& input,
   data_type int_type,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns a new integer numeric column parsing hexadecimal values from the
@@ -171,7 +169,7 @@ std::unique_ptr<column> hex_to_integers(
   strings_column_view const& input,
   data_type output_type,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns a boolean column identifying strings in which all
@@ -198,7 +196,7 @@ std::unique_ptr<column> hex_to_integers(
 std::unique_ptr<column> is_hex(
   strings_column_view const& input,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns a new strings column converting integer columns to hexadecimal
@@ -231,7 +229,7 @@ std::unique_ptr<column> is_hex(
 std::unique_ptr<column> integers_to_hex(
   column_view const& input,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/convert/convert_ipv4.hpp b/cpp/include/cudf/strings/convert/convert_ipv4.hpp
index 97d1dfee017..2dd82554cee 100644
--- a/cpp/include/cudf/strings/convert/convert_ipv4.hpp
+++ b/cpp/include/cudf/strings/convert/convert_ipv4.hpp
@@ -17,9 +17,7 @@
 
 #include <cudf/column/column.hpp>
 #include <cudf/strings/strings_column_view.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace strings {
@@ -54,7 +52,7 @@ namespace strings {
 std::unique_ptr<column> ipv4_to_integers(
   strings_column_view const& input,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Converts integers into IPv4 addresses as strings.
@@ -77,7 +75,7 @@ std::unique_ptr<column> ipv4_to_integers(
 std::unique_ptr<column> integers_to_ipv4(
   column_view const& integers,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns a boolean column identifying strings in which all
@@ -104,7 +102,7 @@ std::unique_ptr<column> integers_to_ipv4(
 std::unique_ptr<column> is_ipv4(
   strings_column_view const& input,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/convert/convert_lists.hpp b/cpp/include/cudf/strings/convert/convert_lists.hpp
index 85b67907228..80d0511fc1f 100644
--- a/cpp/include/cudf/strings/convert/convert_lists.hpp
+++ b/cpp/include/cudf/strings/convert/convert_lists.hpp
@@ -19,9 +19,7 @@
 #include <cudf/lists/lists_column_view.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/strings_column_view.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace strings {
@@ -64,7 +62,7 @@ std::unique_ptr<column> format_list_column(
   strings_column_view const& separators = strings_column_view(column_view{
     data_type{type_id::STRING}, 0, nullptr, nullptr, 0}),
   rmm::cuda_stream_view stream          = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr     = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr     = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/convert/convert_urls.hpp b/cpp/include/cudf/strings/convert/convert_urls.hpp
index a42a5cd2407..d6e87f9d543 100644
--- a/cpp/include/cudf/strings/convert/convert_urls.hpp
+++ b/cpp/include/cudf/strings/convert/convert_urls.hpp
@@ -17,9 +17,7 @@
 
 #include <cudf/column/column.hpp>
 #include <cudf/strings/strings_column_view.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace strings {
@@ -48,7 +46,7 @@ namespace strings {
 std::unique_ptr<column> url_encode(
   strings_column_view const& input,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Encodes each string using URL encoding.
@@ -71,7 +69,7 @@ std::unique_ptr<column> url_encode(
 std::unique_ptr<column> url_decode(
   strings_column_view const& input,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/detail/combine.hpp b/cpp/include/cudf/strings/detail/combine.hpp
index 962191eae6a..31698457048 100644
--- a/cpp/include/cudf/strings/detail/combine.hpp
+++ b/cpp/include/cudf/strings/detail/combine.hpp
@@ -22,9 +22,9 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace strings::detail {
diff --git a/cpp/include/cudf/strings/detail/concatenate.hpp b/cpp/include/cudf/strings/detail/concatenate.hpp
index e038102ab1f..75762e61afe 100644
--- a/cpp/include/cudf/strings/detail/concatenate.hpp
+++ b/cpp/include/cudf/strings/detail/concatenate.hpp
@@ -20,10 +20,10 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace strings::detail {
diff --git a/cpp/include/cudf/strings/detail/converters.hpp b/cpp/include/cudf/strings/detail/converters.hpp
index 73a97499293..3880b8abc32 100644
--- a/cpp/include/cudf/strings/detail/converters.hpp
+++ b/cpp/include/cudf/strings/detail/converters.hpp
@@ -19,9 +19,9 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace strings::detail {
diff --git a/cpp/include/cudf/strings/detail/copy_if_else.cuh b/cpp/include/cudf/strings/detail/copy_if_else.cuh
index 213a41ca596..6b025e8659d 100644
--- a/cpp/include/cudf/strings/detail/copy_if_else.cuh
+++ b/cpp/include/cudf/strings/detail/copy_if_else.cuh
@@ -18,11 +18,11 @@
 #include <cudf/column/column.hpp>
 #include <cudf/detail/valid_if.cuh>
 #include <cudf/strings/detail/strings_column_factories.cuh>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <cuda/std/optional>
diff --git a/cpp/include/cudf/strings/detail/copy_range.hpp b/cpp/include/cudf/strings/detail/copy_range.hpp
index 71dcf9edaf3..33ac74da97f 100644
--- a/cpp/include/cudf/strings/detail/copy_range.hpp
+++ b/cpp/include/cudf/strings/detail/copy_range.hpp
@@ -17,9 +17,9 @@
 
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace strings::detail {
diff --git a/cpp/include/cudf/strings/detail/copying.hpp b/cpp/include/cudf/strings/detail/copying.hpp
index b4d3362359d..f97cc9f5b5d 100644
--- a/cpp/include/cudf/strings/detail/copying.hpp
+++ b/cpp/include/cudf/strings/detail/copying.hpp
@@ -20,9 +20,9 @@
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace strings::detail {
diff --git a/cpp/include/cudf/strings/detail/fill.hpp b/cpp/include/cudf/strings/detail/fill.hpp
index 1a3ff2c9166..55508b0ac1b 100644
--- a/cpp/include/cudf/strings/detail/fill.hpp
+++ b/cpp/include/cudf/strings/detail/fill.hpp
@@ -20,9 +20,9 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace strings::detail {
diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh
index 4369de317b3..4216523df97 100644
--- a/cpp/include/cudf/strings/detail/gather.cuh
+++ b/cpp/include/cudf/strings/detail/gather.cuh
@@ -24,11 +24,11 @@
 #include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/detail/utilities.hpp>
 #include <cudf/strings/strings_column_view.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/prefetch.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/binary_search.h>
diff --git a/cpp/include/cudf/strings/detail/merge.hpp b/cpp/include/cudf/strings/detail/merge.hpp
index 0aa5c0c2899..92f0fe34576 100644
--- a/cpp/include/cudf/strings/detail/merge.hpp
+++ b/cpp/include/cudf/strings/detail/merge.hpp
@@ -21,6 +21,7 @@
 #include <cudf/utilities/export.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace strings::detail {
diff --git a/cpp/include/cudf/strings/detail/replace.hpp b/cpp/include/cudf/strings/detail/replace.hpp
index ab092555c48..780a0f6a9f5 100644
--- a/cpp/include/cudf/strings/detail/replace.hpp
+++ b/cpp/include/cudf/strings/detail/replace.hpp
@@ -20,9 +20,9 @@
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace strings::detail {
diff --git a/cpp/include/cudf/strings/detail/scan.hpp b/cpp/include/cudf/strings/detail/scan.hpp
index 4991fd633d5..71fbfadf9ec 100644
--- a/cpp/include/cudf/strings/detail/scan.hpp
+++ b/cpp/include/cudf/strings/detail/scan.hpp
@@ -17,9 +17,9 @@
 
 #include <cudf/column/column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace strings::detail {
diff --git a/cpp/include/cudf/strings/detail/scatter.cuh b/cpp/include/cudf/strings/detail/scatter.cuh
index 87f0e7ae47c..e49d6dff40d 100644
--- a/cpp/include/cudf/strings/detail/scatter.cuh
+++ b/cpp/include/cudf/strings/detail/scatter.cuh
@@ -19,12 +19,12 @@
 #include <cudf/strings/detail/utilities.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/distance.h>
@@ -70,7 +70,7 @@ std::unique_ptr<column> scatter(SourceIterator begin,
 
   // create vector of string_view's to scatter into
   rmm::device_uvector<string_view> target_vector =
-    create_string_vector_from_column(target, stream, rmm::mr::get_current_device_resource());
+    create_string_vector_from_column(target, stream, cudf::get_current_device_resource_ref());
 
   // this ensures empty strings are not mapped to nulls in the make_strings_column function
   auto const size = thrust::distance(begin, end);
diff --git a/cpp/include/cudf/strings/detail/strings_children.cuh b/cpp/include/cudf/strings/detail/strings_children.cuh
index 55b59dd4ff2..1283226879b 100644
--- a/cpp/include/cudf/strings/detail/strings_children.cuh
+++ b/cpp/include/cudf/strings/detail/strings_children.cuh
@@ -23,11 +23,11 @@
 #include <cudf/strings/detail/utilities.hpp>
 #include <cudf/strings/utilities.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/prefetch.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/for_each.h>
 #include <thrust/iterator/counting_iterator.h>
diff --git a/cpp/include/cudf/strings/detail/strings_column_factories.cuh b/cpp/include/cudf/strings/detail/strings_column_factories.cuh
index a3221038eed..6b1b453a752 100644
--- a/cpp/include/cudf/strings/detail/strings_column_factories.cuh
+++ b/cpp/include/cudf/strings/detail/strings_column_factories.cuh
@@ -22,10 +22,10 @@
 #include <cudf/strings/detail/gather.cuh>
 #include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/copy.h>
diff --git a/cpp/include/cudf/strings/detail/utilities.hpp b/cpp/include/cudf/strings/detail/utilities.hpp
index 1fa505501d8..d276c5df7dc 100644
--- a/cpp/include/cudf/strings/detail/utilities.hpp
+++ b/cpp/include/cudf/strings/detail/utilities.hpp
@@ -19,11 +19,11 @@
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace strings::detail {
diff --git a/cpp/include/cudf/strings/extract.hpp b/cpp/include/cudf/strings/extract.hpp
index 2ef7308b802..f8bf93b77cf 100644
--- a/cpp/include/cudf/strings/extract.hpp
+++ b/cpp/include/cudf/strings/extract.hpp
@@ -18,9 +18,7 @@
 #include <cudf/strings/regex/flags.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/table/table.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace strings {
@@ -64,7 +62,7 @@ std::unique_ptr<table> extract(
   strings_column_view const& input,
   regex_program const& prog,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns a lists column of strings where each string column row corresponds to the
@@ -100,7 +98,7 @@ std::unique_ptr<column> extract_all_record(
   strings_column_view const& input,
   regex_program const& prog,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/find.hpp b/cpp/include/cudf/strings/find.hpp
index efba6da9454..e024b116a71 100644
--- a/cpp/include/cudf/strings/find.hpp
+++ b/cpp/include/cudf/strings/find.hpp
@@ -18,9 +18,7 @@
 #include <cudf/column/column.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/strings_column_view.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace strings {
@@ -59,7 +57,7 @@ std::unique_ptr<column> find(
   size_type start                   = 0,
   size_type stop                    = -1,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns a column of character position values where the target
@@ -90,7 +88,7 @@ std::unique_ptr<column> rfind(
   size_type start                   = 0,
   size_type stop                    = -1,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns a column of character position values where the target
@@ -117,7 +115,7 @@ std::unique_ptr<column> find(
   strings_column_view const& target,
   size_type start                   = 0,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns a column of boolean values for each string where true indicates
@@ -138,7 +136,7 @@ std::unique_ptr<column> contains(
   strings_column_view const& input,
   string_scalar const& target,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns a column of boolean values for each string where true indicates
@@ -163,7 +161,7 @@ std::unique_ptr<column> contains(
   strings_column_view const& input,
   strings_column_view const& targets,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns a column of boolean values for each string where true indicates
@@ -185,7 +183,7 @@ std::unique_ptr<column> starts_with(
   strings_column_view const& input,
   string_scalar const& target,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns a column of boolean values for each string where true indicates
@@ -211,7 +209,7 @@ std::unique_ptr<column> starts_with(
   strings_column_view const& input,
   strings_column_view const& targets,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns a column of boolean values for each string where true indicates
@@ -233,7 +231,7 @@ std::unique_ptr<column> ends_with(
   strings_column_view const& input,
   string_scalar const& target,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns a column of boolean values for each string where true indicates
@@ -259,7 +257,7 @@ std::unique_ptr<column> ends_with(
   strings_column_view const& input,
   strings_column_view const& targets,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 /** @} */  // end of doxygen group
 }  // namespace strings
 }  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/strings/find_multiple.hpp b/cpp/include/cudf/strings/find_multiple.hpp
index dea08308ff0..1fe446db8da 100644
--- a/cpp/include/cudf/strings/find_multiple.hpp
+++ b/cpp/include/cudf/strings/find_multiple.hpp
@@ -17,9 +17,7 @@
 
 #include <cudf/column/column.hpp>
 #include <cudf/strings/strings_column_view.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace strings {
@@ -59,7 +57,7 @@ std::unique_ptr<column> find_multiple(
   strings_column_view const& input,
   strings_column_view const& targets,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/findall.hpp b/cpp/include/cudf/strings/findall.hpp
index 26249b6842c..c6b9bc7e58a 100644
--- a/cpp/include/cudf/strings/findall.hpp
+++ b/cpp/include/cudf/strings/findall.hpp
@@ -18,9 +18,7 @@
 #include <cudf/strings/regex/flags.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/table/table.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace strings {
@@ -66,7 +64,7 @@ std::unique_ptr<column> findall(
   strings_column_view const& input,
   regex_program const& prog,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/padding.hpp b/cpp/include/cudf/strings/padding.hpp
index 11e35f717ae..606a866cb8a 100644
--- a/cpp/include/cudf/strings/padding.hpp
+++ b/cpp/include/cudf/strings/padding.hpp
@@ -19,9 +19,7 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/side_type.hpp>
 #include <cudf/strings/strings_column_view.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace strings {
@@ -62,7 +60,7 @@ std::unique_ptr<column> pad(
   side_type side                    = side_type::RIGHT,
   std::string_view fill_char        = " ",
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Add '0' as padding to the left of each string.
@@ -92,7 +90,7 @@ std::unique_ptr<column> zfill(
   strings_column_view const& input,
   size_type width,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/repeat_strings.hpp b/cpp/include/cudf/strings/repeat_strings.hpp
index e160f75390b..af419d9501f 100644
--- a/cpp/include/cudf/strings/repeat_strings.hpp
+++ b/cpp/include/cudf/strings/repeat_strings.hpp
@@ -17,9 +17,7 @@
 
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/strings_column_view.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace strings {
@@ -61,7 +59,7 @@ std::unique_ptr<string_scalar> repeat_string(
   string_scalar const& input,
   size_type repeat_times,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Repeat each string in the given strings column a given number of times
@@ -92,7 +90,7 @@ std::unique_ptr<column> repeat_strings(
   strings_column_view const& input,
   size_type repeat_times,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Repeat each string in the given strings column by the numbers of times given in another
@@ -129,7 +127,7 @@ std::unique_ptr<column> repeat_strings(
   strings_column_view const& input,
   column_view const& repeat_times,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/replace.hpp b/cpp/include/cudf/strings/replace.hpp
index f450b77ad7a..c7a87bbb0d0 100644
--- a/cpp/include/cudf/strings/replace.hpp
+++ b/cpp/include/cudf/strings/replace.hpp
@@ -18,9 +18,7 @@
 #include <cudf/column/column.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/strings_column_view.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace strings {
@@ -70,7 +68,7 @@ std::unique_ptr<column> replace(
   string_scalar const& repl,
   cudf::size_type maxrepl           = -1,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief This function replaces each string in the column with the provided
@@ -112,7 +110,7 @@ std::unique_ptr<column> replace_slice(
   size_type start                   = 0,
   size_type stop                    = -1,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Replaces substrings matching a list of targets with the corresponding
@@ -158,7 +156,7 @@ std::unique_ptr<column> replace_multiple(
   strings_column_view const& targets,
   strings_column_view const& repls,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/replace_re.hpp b/cpp/include/cudf/strings/replace_re.hpp
index 6b487072cb2..4a58142cbe6 100644
--- a/cpp/include/cudf/strings/replace_re.hpp
+++ b/cpp/include/cudf/strings/replace_re.hpp
@@ -19,9 +19,7 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/regex/flags.hpp>
 #include <cudf/strings/strings_column_view.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <optional>
 
@@ -60,7 +58,7 @@ std::unique_ptr<column> replace_re(
   string_scalar const& replacement           = string_scalar(""),
   std::optional<size_type> max_replace_count = std::nullopt,
   rmm::cuda_stream_view stream               = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr          = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr          = cudf::get_current_device_resource_ref());
 
 /**
  * @brief For each string, replaces any character sequence matching the given patterns
@@ -84,7 +82,7 @@ std::unique_ptr<column> replace_re(
   strings_column_view const& replacements,
   regex_flags const flags           = regex_flags::DEFAULT,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief For each string, replaces any character sequence matching the given regex
@@ -109,7 +107,7 @@ std::unique_ptr<column> replace_with_backrefs(
   regex_program const& prog,
   std::string_view replacement,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 }  // namespace strings
 }  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/strings/reverse.hpp b/cpp/include/cudf/strings/reverse.hpp
index fbda2e5fe7c..f9ab34373df 100644
--- a/cpp/include/cudf/strings/reverse.hpp
+++ b/cpp/include/cudf/strings/reverse.hpp
@@ -17,9 +17,7 @@
 
 #include <cudf/column/column.hpp>
 #include <cudf/strings/strings_column_view.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace strings {
@@ -49,7 +47,7 @@ namespace strings {
 std::unique_ptr<column> reverse(
   strings_column_view const& input,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/slice.hpp b/cpp/include/cudf/strings/slice.hpp
index b0da6976207..754bee4b1f0 100644
--- a/cpp/include/cudf/strings/slice.hpp
+++ b/cpp/include/cudf/strings/slice.hpp
@@ -18,9 +18,7 @@
 #include <cudf/column/column.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/strings_column_view.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace strings {
@@ -65,7 +63,7 @@ std::unique_ptr<column> slice_strings(
   numeric_scalar<size_type> const& stop  = numeric_scalar<size_type>(0, false),
   numeric_scalar<size_type> const& step  = numeric_scalar<size_type>(1),
   rmm::cuda_stream_view stream           = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr      = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr      = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns a new strings column that contains substrings of the
@@ -110,7 +108,7 @@ std::unique_ptr<column> slice_strings(
   column_view const& starts,
   column_view const& stops,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/split/partition.hpp b/cpp/include/cudf/strings/split/partition.hpp
index 8f5ae752417..92573a665c9 100644
--- a/cpp/include/cudf/strings/split/partition.hpp
+++ b/cpp/include/cudf/strings/split/partition.hpp
@@ -18,9 +18,7 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/table/table.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace strings {
@@ -63,7 +61,7 @@ std::unique_ptr<table> partition(
   strings_column_view const& input,
   string_scalar const& delimiter    = string_scalar(""),
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns a set of 3 columns by splitting each string using the
@@ -97,7 +95,7 @@ std::unique_ptr<table> rpartition(
   strings_column_view const& input,
   string_scalar const& delimiter    = string_scalar(""),
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/split/split.hpp b/cpp/include/cudf/strings/split/split.hpp
index ca371d7abd1..026192d4a0b 100644
--- a/cpp/include/cudf/strings/split/split.hpp
+++ b/cpp/include/cudf/strings/split/split.hpp
@@ -18,9 +18,7 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/table/table.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace strings {
@@ -58,7 +56,7 @@ std::unique_ptr<table> split(
   string_scalar const& delimiter    = string_scalar(""),
   size_type maxsplit                = -1,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns a list of columns by splitting each string using the
@@ -88,7 +86,7 @@ std::unique_ptr<table> rsplit(
   string_scalar const& delimiter    = string_scalar(""),
   size_type maxsplit                = -1,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Splits individual strings elements into a list of strings.
@@ -162,7 +160,7 @@ std::unique_ptr<column> split_record(
   string_scalar const& delimiter    = string_scalar(""),
   size_type maxsplit                = -1,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief  Splits individual strings elements into a list of strings starting
@@ -241,7 +239,7 @@ std::unique_ptr<column> rsplit_record(
   string_scalar const& delimiter    = string_scalar(""),
   size_type maxsplit                = -1,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/split/split_re.hpp b/cpp/include/cudf/strings/split/split_re.hpp
index 96ef0b6e830..ce376ab93cf 100644
--- a/cpp/include/cudf/strings/split/split_re.hpp
+++ b/cpp/include/cudf/strings/split/split_re.hpp
@@ -18,9 +18,7 @@
 #include <cudf/column/column.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/table/table.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace strings {
@@ -85,7 +83,7 @@ std::unique_ptr<table> split_re(
   regex_program const& prog,
   size_type maxsplit                = -1,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Splits strings elements into a table of strings columns using a
@@ -141,7 +139,7 @@ std::unique_ptr<table> rsplit_re(
   regex_program const& prog,
   size_type maxsplit                = -1,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Splits strings elements into a list column of strings
@@ -199,7 +197,7 @@ std::unique_ptr<column> split_record_re(
   regex_program const& prog,
   size_type maxsplit                = -1,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Splits strings elements into a list column of strings using the given
@@ -259,7 +257,7 @@ std::unique_ptr<column> rsplit_record_re(
   regex_program const& prog,
   size_type maxsplit                = -1,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/strip.hpp b/cpp/include/cudf/strings/strip.hpp
index 4cfba59c72c..396940dbb30 100644
--- a/cpp/include/cudf/strings/strip.hpp
+++ b/cpp/include/cudf/strings/strip.hpp
@@ -19,9 +19,7 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/side_type.hpp>
 #include <cudf/strings/strings_column_view.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace strings {
@@ -67,7 +65,7 @@ std::unique_ptr<column> strip(
   side_type side                    = side_type::BOTH,
   string_scalar const& to_strip     = string_scalar(""),
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/translate.hpp b/cpp/include/cudf/strings/translate.hpp
index 531753f4a8c..aa69a2e5679 100644
--- a/cpp/include/cudf/strings/translate.hpp
+++ b/cpp/include/cudf/strings/translate.hpp
@@ -19,9 +19,7 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/string_view.hpp>
 #include <cudf/strings/strings_column_view.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <vector>
 
@@ -58,7 +56,7 @@ std::unique_ptr<column> translate(
   strings_column_view const& input,
   std::vector<std::pair<char_utf8, char_utf8>> const& chars_table,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Removes or keeps the specified character ranges in cudf::strings::filter_characters
@@ -105,7 +103,7 @@ std::unique_ptr<column> filter_characters(
   filter_type keep_characters       = filter_type::KEEP,
   string_scalar const& replacement  = string_scalar(""),
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/strings/utilities.hpp b/cpp/include/cudf/strings/utilities.hpp
index ae445282382..999fff0f4c8 100644
--- a/cpp/include/cudf/strings/utilities.hpp
+++ b/cpp/include/cudf/strings/utilities.hpp
@@ -17,9 +17,7 @@
 
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/export.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace strings {
@@ -35,7 +33,7 @@ namespace strings {
 rmm::device_uvector<string_view> create_string_vector_from_column(
   cudf::strings_column_view const strings,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Return the threshold size for a strings column to use int64 offsets
diff --git a/cpp/include/cudf/strings/wrap.hpp b/cpp/include/cudf/strings/wrap.hpp
index 465a9d15d00..96ae2fb0582 100644
--- a/cpp/include/cudf/strings/wrap.hpp
+++ b/cpp/include/cudf/strings/wrap.hpp
@@ -17,9 +17,7 @@
 
 #include <cudf/column/column.hpp>
 #include <cudf/strings/strings_column_view.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace strings {
@@ -68,7 +66,7 @@ std::unique_ptr<column> wrap(
   strings_column_view const& input,
   size_type width,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of doxygen group
 }  // namespace strings
diff --git a/cpp/include/cudf/structs/detail/concatenate.hpp b/cpp/include/cudf/structs/detail/concatenate.hpp
index 16be868af52..96964eac31f 100644
--- a/cpp/include/cudf/structs/detail/concatenate.hpp
+++ b/cpp/include/cudf/structs/detail/concatenate.hpp
@@ -19,10 +19,9 @@
 #include <cudf/structs/structs_column_view.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
-#include <rmm/resource_ref.hpp>
-
 namespace CUDF_EXPORT cudf {
 namespace structs::detail {
 
diff --git a/cpp/include/cudf/structs/detail/scan.hpp b/cpp/include/cudf/structs/detail/scan.hpp
index 6121f63d42f..e9e721c3335 100644
--- a/cpp/include/cudf/structs/detail/scan.hpp
+++ b/cpp/include/cudf/structs/detail/scan.hpp
@@ -18,9 +18,9 @@
 #include <cudf/column/column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace structs::detail {
diff --git a/cpp/include/cudf/table/table.hpp b/cpp/include/cudf/table/table.hpp
index be2af7ac653..762131a174f 100644
--- a/cpp/include/cudf/table/table.hpp
+++ b/cpp/include/cudf/table/table.hpp
@@ -18,10 +18,9 @@
 #include <cudf/column/column.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <memory>
 #include <vector>
@@ -58,7 +57,7 @@ class table {
    */
   explicit table(table const& other,
                  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-                 rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+                 rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
   /**
    * @brief Moves the contents from a vector of `unique_ptr`s to columns to
    * construct a new table.
@@ -77,7 +76,7 @@ class table {
    */
   table(table_view view,
         rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-        rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+        rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
   /**
    * @brief Returns the number of columns in the table
diff --git a/cpp/include/cudf/timezone.hpp b/cpp/include/cudf/timezone.hpp
index 8329c64e24f..aa903770e26 100644
--- a/cpp/include/cudf/timezone.hpp
+++ b/cpp/include/cudf/timezone.hpp
@@ -16,9 +16,7 @@
 #pragma once
 
 #include <cudf/utilities/export.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <memory>
 #include <optional>
@@ -52,6 +50,6 @@ static constexpr uint32_t solar_cycle_entry_count = 2 * solar_cycle_years;
 std::unique_ptr<table> make_timezone_transition_table(
   std::optional<std::string_view> tzif_dir,
   std::string_view timezone_name,
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 }  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/transform.hpp b/cpp/include/cudf/transform.hpp
index f16214260f7..82b8bee1acf 100644
--- a/cpp/include/cudf/transform.hpp
+++ b/cpp/include/cudf/transform.hpp
@@ -19,9 +19,7 @@
 #include <cudf/ast/expressions.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/export.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <memory>
 
@@ -58,7 +56,7 @@ std::unique_ptr<column> transform(
   data_type output_type,
   bool is_ptx,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Creates a null_mask from `input` by converting `NaN` to null and
@@ -75,7 +73,7 @@ std::unique_ptr<column> transform(
 std::pair<std::unique_ptr<rmm::device_buffer>, size_type> nans_to_nulls(
   column_view const& input,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Compute a new column by evaluating an expression tree on a table.
@@ -95,7 +93,7 @@ std::unique_ptr<column> compute_column(
   table_view const& table,
   ast::expression const& expr,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Creates a bitmask from a column of boolean elements.
@@ -116,7 +114,7 @@ std::unique_ptr<column> compute_column(
 std::pair<std::unique_ptr<rmm::device_buffer>, cudf::size_type> bools_to_mask(
   column_view const& input,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Encode the rows of the given table as integers
@@ -146,7 +144,7 @@ std::pair<std::unique_ptr<rmm::device_buffer>, cudf::size_type> bools_to_mask(
 std::pair<std::unique_ptr<cudf::table>, std::unique_ptr<cudf::column>> encode(
   cudf::table_view const& input,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Encodes `input` by generating a new column for each value in `categories` indicating the
@@ -180,7 +178,7 @@ std::pair<std::unique_ptr<column>, table_view> one_hot_encode(
   column_view const& input,
   column_view const& categories,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Creates a boolean column from given bitmask.
@@ -209,7 +207,7 @@ std::unique_ptr<column> mask_to_bools(
   size_type begin_bit,
   size_type end_bit,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns an approximate cumulative size in bits of all columns in the `table_view` for
@@ -240,7 +238,7 @@ std::unique_ptr<column> mask_to_bools(
 std::unique_ptr<column> row_bit_count(
   table_view const& t,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns an approximate cumulative size in bits of all columns in the `table_view` for
@@ -265,7 +263,7 @@ std::unique_ptr<column> segmented_row_bit_count(
   table_view const& t,
   size_type segment_length,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of group
 }  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/transpose.hpp b/cpp/include/cudf/transpose.hpp
index f4433c46a06..8b680071e71 100644
--- a/cpp/include/cudf/transpose.hpp
+++ b/cpp/include/cudf/transpose.hpp
@@ -18,9 +18,7 @@
 #include <cudf/column/column.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/export.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace CUDF_EXPORT cudf {
 /**
@@ -46,7 +44,7 @@ namespace CUDF_EXPORT cudf {
  */
 std::pair<std::unique_ptr<column>, table_view> transpose(
   table_view const& input,
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of group
 }  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/unary.hpp b/cpp/include/cudf/unary.hpp
index 55f4c1f5a23..53e0f3a15d2 100644
--- a/cpp/include/cudf/unary.hpp
+++ b/cpp/include/cudf/unary.hpp
@@ -21,11 +21,9 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
-
 #include <memory>
 
 namespace CUDF_EXPORT cudf {
@@ -159,7 +157,7 @@ std::unique_ptr<cudf::column> unary_operation(
   cudf::column_view const& input,
   cudf::unary_operator op,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Creates a column of `type_id::BOOL8` elements where for every element in `input` `true`
@@ -175,7 +173,7 @@ std::unique_ptr<cudf::column> unary_operation(
 std::unique_ptr<cudf::column> is_null(
   cudf::column_view const& input,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Creates a column of `type_id::BOOL8` elements where for every element in `input` `true`
@@ -191,7 +189,7 @@ std::unique_ptr<cudf::column> is_null(
 std::unique_ptr<cudf::column> is_valid(
   cudf::column_view const& input,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief  Casts data from dtype specified in input to dtype specified in output.
@@ -210,7 +208,7 @@ std::unique_ptr<column> cast(
   column_view const& input,
   data_type out_type,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Check if a cast between two datatypes is supported.
@@ -238,7 +236,7 @@ bool is_supported_cast(data_type from, data_type to) noexcept;
 std::unique_ptr<column> is_nan(
   cudf::column_view const& input,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Creates a column of `type_id::BOOL8` elements indicating the absence of `NaN` values
@@ -257,7 +255,7 @@ std::unique_ptr<column> is_nan(
 std::unique_ptr<column> is_not_nan(
   cudf::column_view const& input,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of group
 }  // namespace CUDF_EXPORT cudf
diff --git a/cpp/include/cudf/utilities/memory_resource.hpp b/cpp/include/cudf/utilities/memory_resource.hpp
new file mode 100644
index 00000000000..b562574fd79
--- /dev/null
+++ b/cpp/include/cudf/utilities/memory_resource.hpp
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/utilities/memory_resource.hpp>
+
+#include <rmm/cuda_device.hpp>
+#include <rmm/mr/device/device_memory_resource.hpp>
+#include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
+
+namespace cudf {
+
+/**
+ * @addtogroup memory_resource
+ * @{
+ * @file
+ */
+
+/**
+ * @brief Get the current device memory resource.
+ *
+ * @return The current device memory resource.
+ */
+inline rmm::mr::device_memory_resource* get_current_device_resource()
+{
+  return rmm::mr::get_current_device_resource();
+}
+
+/**
+ * @brief Get the current device memory resource reference.
+ *
+ * @return The current device memory resource reference.
+ */
+inline rmm::device_async_resource_ref get_current_device_resource_ref()
+{
+  // For now, match current behavior which is to return current resource pointer
+  return rmm::mr::get_current_device_resource();
+}
+
+/**
+ * @brief Set the current device memory resource.
+ *
+ * @param mr The new device memory resource.
+ * @return The previous device memory resource.
+ */
+inline rmm::mr::device_memory_resource* set_current_device_resource(
+  rmm::mr::device_memory_resource* mr)
+{
+  return rmm::mr::set_current_device_resource(mr);
+}
+
+/**
+ * @brief Set the current device memory resource reference.
+ *
+ * @param mr The new device memory resource reference.
+ * @return The previous device memory resource reference.
+ */
+inline rmm::device_async_resource_ref set_current_device_resource_ref(
+  rmm::device_async_resource_ref mr)
+{
+  return rmm::mr::set_current_device_resource_ref(mr);
+}
+
+/**
+ * @brief Reset the current device memory resource reference to the initial resource.
+ *
+ * @return The previous device memory resource reference.
+ */
+inline rmm::device_async_resource_ref reset_current_device_resource_ref()
+{
+  return rmm::mr::reset_current_device_resource_ref();
+}
+
+/** @} */  // end of group
+}  // namespace cudf
diff --git a/cpp/include/cudf/utilities/pinned_memory.hpp b/cpp/include/cudf/utilities/pinned_memory.hpp
index 623a033698f..2cab0aa363e 100644
--- a/cpp/include/cudf/utilities/pinned_memory.hpp
+++ b/cpp/include/cudf/utilities/pinned_memory.hpp
@@ -17,8 +17,7 @@
 #pragma once
 
 #include <cudf/utilities/export.hpp>
-
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <optional>
 
diff --git a/cpp/include/cudf_test/base_fixture.hpp b/cpp/include/cudf_test/base_fixture.hpp
index 04bd51e9aa3..7b86f971cae 100644
--- a/cpp/include/cudf_test/base_fixture.hpp
+++ b/cpp/include/cudf_test/base_fixture.hpp
@@ -20,11 +20,10 @@
 #include <cudf_test/file_utilities.hpp>
 
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 
 #include <rmm/mr/device/device_memory_resource.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace CUDF_EXPORT cudf {
 namespace test {
@@ -38,7 +37,7 @@ namespace test {
  * ```
  */
 class BaseFixture : public ::testing::Test {
-  rmm::device_async_resource_ref _mr{rmm::mr::get_current_device_resource()};
+  rmm::device_async_resource_ref _mr{cudf::get_current_device_resource_ref()};
 
  public:
   /**
@@ -59,7 +58,7 @@ class BaseFixture : public ::testing::Test {
  */
 template <typename T>
 class BaseFixtureWithParam : public ::testing::TestWithParam<T> {
-  rmm::device_async_resource_ref _mr{rmm::mr::get_current_device_resource()};
+  rmm::device_async_resource_ref _mr{cudf::get_current_device_resource_ref()};
 
  public:
   /**
diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp
index d00db222b62..6206c1311d2 100644
--- a/cpp/include/cudf_test/column_wrapper.hpp
+++ b/cpp/include/cudf_test/column_wrapper.hpp
@@ -33,11 +33,11 @@
 #include <cudf/utilities/bit.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/device_buffer.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
 
 #include <thrust/copy.h>
 #include <thrust/functional.h>
@@ -771,10 +771,10 @@ class strings_column_wrapper : public detail::column_wrapper {
     auto all_valid        = thrust::make_constant_iterator(true);
     auto [chars, offsets] = detail::make_chars_and_offsets(begin, end, all_valid);
     auto d_chars          = cudf::detail::make_device_uvector_async(
-      chars, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource());
+      chars, cudf::test::get_default_stream(), cudf::get_current_device_resource_ref());
     auto d_offsets = std::make_unique<cudf::column>(
       cudf::detail::make_device_uvector_sync(
-        offsets, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()),
+        offsets, cudf::test::get_default_stream(), cudf::get_current_device_resource_ref()),
       rmm::device_buffer{},
       0);
     wrapped =
@@ -821,14 +821,14 @@ class strings_column_wrapper : public detail::column_wrapper {
     auto [chars, offsets]        = detail::make_chars_and_offsets(begin, end, v);
     auto [null_mask, null_count] = detail::make_null_mask_vector(v, v + num_strings);
     auto d_chars                 = cudf::detail::make_device_uvector_async(
-      chars, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource());
+      chars, cudf::test::get_default_stream(), cudf::get_current_device_resource_ref());
     auto d_offsets = std::make_unique<cudf::column>(
       cudf::detail::make_device_uvector_async(
-        offsets, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()),
+        offsets, cudf::test::get_default_stream(), cudf::get_current_device_resource_ref()),
       rmm::device_buffer{},
       0);
     auto d_bitmask = cudf::detail::make_device_uvector_sync(
-      null_mask, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource());
+      null_mask, cudf::test::get_default_stream(), cudf::get_current_device_resource_ref());
     wrapped = cudf::make_strings_column(
       num_strings, std::move(d_offsets), d_chars.release(), null_count, d_bitmask.release());
   }
@@ -1651,7 +1651,7 @@ class lists_column_wrapper : public detail::column_wrapper {
     auto data = children.empty() ? cudf::empty_like(expected_hierarchy)
                                  : cudf::concatenate(children,
                                                      cudf::test::get_default_stream(),
-                                                     rmm::mr::get_current_device_resource());
+                                                     cudf::get_current_device_resource_ref());
 
     // increment depth
     depth = expected_depth + 1;
@@ -1756,7 +1756,7 @@ class lists_column_wrapper : public detail::column_wrapper {
                        lists_column_view(expected_hierarchy).child()),
       col.null_count(),
       cudf::copy_bitmask(
-        col, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()),
+        col, cudf::test::get_default_stream(), cudf::get_current_device_resource_ref()),
       cudf::test::get_default_stream());
   }
 
diff --git a/cpp/include/cudf_test/stream_checking_resource_adaptor.hpp b/cpp/include/cudf_test/stream_checking_resource_adaptor.hpp
index 417bbb3d9ab..b4001babe24 100644
--- a/cpp/include/cudf_test/stream_checking_resource_adaptor.hpp
+++ b/cpp/include/cudf_test/stream_checking_resource_adaptor.hpp
@@ -18,9 +18,9 @@
 #include <cudf_test/default_stream.hpp>
 
 #include <cudf/detail/utilities/stacktrace.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/mr/device/device_memory_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <iostream>
 
diff --git a/cpp/include/cudf_test/tdigest_utilities.cuh b/cpp/include/cudf_test/tdigest_utilities.cuh
index 5fd2403b0f2..1758790cd64 100644
--- a/cpp/include/cudf_test/tdigest_utilities.cuh
+++ b/cpp/include/cudf_test/tdigest_utilities.cuh
@@ -24,9 +24,9 @@
 #include <cudf/tdigest/tdigest_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/exec_policy.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
 
 #include <thrust/device_vector.h>
 #include <thrust/execution_policy.h>
@@ -171,7 +171,7 @@ void tdigest_minmax_compare(cudf::tdigest::tdigest_column_view const& tdv,
   thrust::host_vector<device_span<T const>> h_spans;
   h_spans.push_back({input_values.begin<T>(), static_cast<size_t>(input_values.size())});
   auto spans = cudf::detail::make_device_uvector_async(
-    h_spans, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    h_spans, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
 
   auto expected_min = cudf::make_fixed_width_column(
     data_type{type_id::FLOAT64}, spans.size(), mask_state::UNALLOCATED);
@@ -271,7 +271,7 @@ void tdigest_simple_all_nulls_aggregation(Func op)
 
   // NOTE: an empty tdigest column still has 1 row.
   auto expected = cudf::tdigest::detail::make_empty_tdigest_column(
-    cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, *expected);
 }
@@ -562,12 +562,12 @@ template <typename MergeFunc>
 void tdigest_merge_empty(MergeFunc merge_op)
 {
   // 3 empty tdigests all in the same group
-  auto a = cudf::tdigest::detail::make_empty_tdigest_column(cudf::get_default_stream(),
-                                                            rmm::mr::get_current_device_resource());
-  auto b = cudf::tdigest::detail::make_empty_tdigest_column(cudf::get_default_stream(),
-                                                            rmm::mr::get_current_device_resource());
-  auto c = cudf::tdigest::detail::make_empty_tdigest_column(cudf::get_default_stream(),
-                                                            rmm::mr::get_current_device_resource());
+  auto a = cudf::tdigest::detail::make_empty_tdigest_column(
+    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  auto b = cudf::tdigest::detail::make_empty_tdigest_column(
+    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  auto c = cudf::tdigest::detail::make_empty_tdigest_column(
+    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   std::vector<column_view> cols;
   cols.push_back(*a);
   cols.push_back(*b);
@@ -578,7 +578,7 @@ void tdigest_merge_empty(MergeFunc merge_op)
   auto result      = merge_op(*values, delta);
 
   auto expected = cudf::tdigest::detail::make_empty_tdigest_column(
-    cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *result);
 }
diff --git a/cpp/include/cudf_test/testing_main.hpp b/cpp/include/cudf_test/testing_main.hpp
index ed83ddabb00..272c91133f8 100644
--- a/cpp/include/cudf_test/testing_main.hpp
+++ b/cpp/include/cudf_test/testing_main.hpp
@@ -21,6 +21,7 @@
 
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/aligned.hpp>
 #include <rmm/cuda_stream_view.hpp>
@@ -30,7 +31,6 @@
 #include <rmm/mr/device/cuda_memory_resource.hpp>
 #include <rmm/mr/device/managed_memory_resource.hpp>
 #include <rmm/mr/device/owning_wrapper.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/mr/device/pool_memory_resource.hpp>
 
 namespace CUDF_EXPORT cudf {
@@ -161,7 +161,7 @@ inline auto make_memory_resource_adaptor(cxxopts::ParseResult const& cmd_opts)
 {
   auto const rmm_mode = cmd_opts["rmm_mode"].as<std::string>();
   auto resource       = cudf::test::create_memory_resource(rmm_mode);
-  rmm::mr::set_current_device_resource(resource.get());
+  cudf::set_current_device_resource(resource.get());
   return resource;
 }
 
@@ -178,7 +178,7 @@ inline auto make_memory_resource_adaptor(cxxopts::ParseResult const& cmd_opts)
  */
 inline auto make_stream_mode_adaptor(cxxopts::ParseResult const& cmd_opts)
 {
-  auto resource                      = rmm::mr::get_current_device_resource();
+  auto resource                      = cudf::get_current_device_resource_ref();
   auto const stream_mode             = cmd_opts["stream_mode"].as<std::string>();
   auto const stream_error_mode       = cmd_opts["stream_error_mode"].as<std::string>();
   auto const error_on_invalid_stream = (stream_error_mode == "error");
@@ -186,7 +186,7 @@ inline auto make_stream_mode_adaptor(cxxopts::ParseResult const& cmd_opts)
   auto adaptor                       = cudf::test::stream_checking_resource_adaptor(
     resource, error_on_invalid_stream, check_default_stream);
   if ((stream_mode == "new_cudf_default") || (stream_mode == "new_testing_default")) {
-    rmm::mr::set_current_device_resource(&adaptor);
+    cudf::set_current_device_resource(&adaptor);
   }
   return adaptor;
 }
diff --git a/cpp/include/doxygen_groups.h b/cpp/include/doxygen_groups.h
index 7c395ffee42..5f3e7efbbfe 100644
--- a/cpp/include/doxygen_groups.h
+++ b/cpp/include/doxygen_groups.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -30,6 +30,7 @@
 
 /**
  * @defgroup default_stream Default Stream
+ * @defgroup memory_resource Memory Resource Management
  * @defgroup cudf_classes Classes
  * @{
  *   @defgroup column_classes Column
diff --git a/cpp/include/nvtext/byte_pair_encoding.hpp b/cpp/include/nvtext/byte_pair_encoding.hpp
index 6559933f696..ab862df044d 100644
--- a/cpp/include/nvtext/byte_pair_encoding.hpp
+++ b/cpp/include/nvtext/byte_pair_encoding.hpp
@@ -21,8 +21,7 @@
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/export.hpp>
-
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace CUDF_EXPORT nvtext {
 
@@ -49,7 +48,7 @@ struct bpe_merge_pairs {
    */
   bpe_merge_pairs(std::unique_ptr<cudf::column>&& input,
                   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-                  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+                  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
   /**
    * @brief Construct a new bpe merge pairs object
@@ -60,7 +59,7 @@ struct bpe_merge_pairs {
    */
   bpe_merge_pairs(cudf::strings_column_view const& input,
                   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-                  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+                  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
   ~bpe_merge_pairs();
   bpe_merge_pairs();
@@ -98,7 +97,7 @@ struct bpe_merge_pairs {
 std::unique_ptr<bpe_merge_pairs> load_merge_pairs(
   cudf::strings_column_view const& merge_pairs,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Byte pair encode the input strings.
@@ -130,7 +129,7 @@ std::unique_ptr<cudf::column> byte_pair_encoding(
   cudf::strings_column_view const& input,
   bpe_merge_pairs const& merges_pairs,
   cudf::string_scalar const& separator = cudf::string_scalar(" "),
-  rmm::device_async_resource_ref mr    = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr    = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of group
 }  // namespace CUDF_EXPORT nvtext
diff --git a/cpp/include/nvtext/detail/generate_ngrams.hpp b/cpp/include/nvtext/detail/generate_ngrams.hpp
index 7c49421560d..ae48fed4e79 100644
--- a/cpp/include/nvtext/detail/generate_ngrams.hpp
+++ b/cpp/include/nvtext/detail/generate_ngrams.hpp
@@ -15,10 +15,11 @@
  */
 #pragma once
 
+#include <cudf/utilities/memory_resource.hpp>
+
 #include <nvtext/generate_ngrams.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace CUDF_EXPORT nvtext {
 namespace detail {
diff --git a/cpp/include/nvtext/detail/load_hash_file.hpp b/cpp/include/nvtext/detail/load_hash_file.hpp
index 438a4a9afdd..1334cbf47ea 100644
--- a/cpp/include/nvtext/detail/load_hash_file.hpp
+++ b/cpp/include/nvtext/detail/load_hash_file.hpp
@@ -16,11 +16,11 @@
 #pragma once
 
 #include <cudf/column/column.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <nvtext/subword_tokenize.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cstdint>
 #include <cstring>
diff --git a/cpp/include/nvtext/detail/tokenize.hpp b/cpp/include/nvtext/detail/tokenize.hpp
index 57ad008f1a9..5e5c78e993f 100644
--- a/cpp/include/nvtext/detail/tokenize.hpp
+++ b/cpp/include/nvtext/detail/tokenize.hpp
@@ -19,9 +19,9 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace CUDF_EXPORT nvtext {
 namespace detail {
diff --git a/cpp/include/nvtext/edit_distance.hpp b/cpp/include/nvtext/edit_distance.hpp
index 102f2cffa18..723ba310a1e 100644
--- a/cpp/include/nvtext/edit_distance.hpp
+++ b/cpp/include/nvtext/edit_distance.hpp
@@ -19,8 +19,7 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/export.hpp>
-
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 //! NVText APIs
 namespace CUDF_EXPORT nvtext {
@@ -64,7 +63,7 @@ std::unique_ptr<cudf::column> edit_distance(
   cudf::strings_column_view const& input,
   cudf::strings_column_view const& targets,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Compute the edit distance between all the strings in the input column.
@@ -102,7 +101,7 @@ std::unique_ptr<cudf::column> edit_distance(
 std::unique_ptr<cudf::column> edit_distance_matrix(
   cudf::strings_column_view const& input,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of group
 }  // namespace CUDF_EXPORT nvtext
diff --git a/cpp/include/nvtext/generate_ngrams.hpp b/cpp/include/nvtext/generate_ngrams.hpp
index ce79d985a49..54282b8ef3c 100644
--- a/cpp/include/nvtext/generate_ngrams.hpp
+++ b/cpp/include/nvtext/generate_ngrams.hpp
@@ -19,8 +19,7 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/export.hpp>
-
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace CUDF_EXPORT nvtext {
 /**
@@ -62,7 +61,7 @@ std::unique_ptr<cudf::column> generate_ngrams(
   cudf::size_type ngrams,
   cudf::string_scalar const& separator,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Generates ngrams of characters within each string
@@ -91,7 +90,7 @@ std::unique_ptr<cudf::column> generate_character_ngrams(
   cudf::strings_column_view const& input,
   cudf::size_type ngrams            = 2,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Hashes ngrams of characters within each string
@@ -126,7 +125,7 @@ std::unique_ptr<cudf::column> hash_character_ngrams(
   cudf::strings_column_view const& input,
   cudf::size_type ngrams            = 5,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of group
 }  // namespace CUDF_EXPORT nvtext
diff --git a/cpp/include/nvtext/jaccard.hpp b/cpp/include/nvtext/jaccard.hpp
index 3c3486c079e..e0b924ac658 100644
--- a/cpp/include/nvtext/jaccard.hpp
+++ b/cpp/include/nvtext/jaccard.hpp
@@ -18,8 +18,7 @@
 #include <cudf/column/column.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/export.hpp>
-
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace CUDF_EXPORT nvtext {
 /**
@@ -76,7 +75,7 @@ std::unique_ptr<cudf::column> jaccard_index(
   cudf::strings_column_view const& input2,
   cudf::size_type width,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of group
 }  // namespace CUDF_EXPORT nvtext
diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp
index fc28ecfb199..c83a4260c19 100644
--- a/cpp/include/nvtext/minhash.hpp
+++ b/cpp/include/nvtext/minhash.hpp
@@ -20,10 +20,9 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
-#include <rmm/resource_ref.hpp>
-
 namespace CUDF_EXPORT nvtext {
 /**
  * @addtogroup nvtext_minhash
@@ -56,7 +55,7 @@ std::unique_ptr<cudf::column> minhash(
   cudf::numeric_scalar<uint32_t> seed = 0,
   cudf::size_type width               = 4,
   rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr   = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr   = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns the minhash values for each string per seed
@@ -88,7 +87,7 @@ std::unique_ptr<cudf::column> minhash(
   cudf::device_span<uint32_t const> seeds,
   cudf::size_type width             = 4,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns the minhash value for each string
@@ -117,7 +116,7 @@ std::unique_ptr<cudf::column> minhash64(
   cudf::numeric_scalar<uint64_t> seed = 0,
   cudf::size_type width               = 4,
   rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr   = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr   = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns the minhash values for each string per seed
@@ -149,7 +148,7 @@ std::unique_ptr<cudf::column> minhash64(
   cudf::device_span<uint64_t const> seeds,
   cudf::size_type width             = 4,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of group
 }  // namespace CUDF_EXPORT nvtext
diff --git a/cpp/include/nvtext/ngrams_tokenize.hpp b/cpp/include/nvtext/ngrams_tokenize.hpp
index 1048cd4abad..e3b3c23a7a9 100644
--- a/cpp/include/nvtext/ngrams_tokenize.hpp
+++ b/cpp/include/nvtext/ngrams_tokenize.hpp
@@ -19,8 +19,7 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/export.hpp>
-
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace CUDF_EXPORT nvtext {
 /**
@@ -84,7 +83,7 @@ std::unique_ptr<cudf::column> ngrams_tokenize(
   cudf::string_scalar const& delimiter,
   cudf::string_scalar const& separator,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of group
 }  // namespace CUDF_EXPORT nvtext
diff --git a/cpp/include/nvtext/normalize.hpp b/cpp/include/nvtext/normalize.hpp
index ec0b8981f8f..74325f4a406 100644
--- a/cpp/include/nvtext/normalize.hpp
+++ b/cpp/include/nvtext/normalize.hpp
@@ -18,8 +18,7 @@
 #include <cudf/column/column.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/export.hpp>
-
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 //! NVText APIs
 namespace CUDF_EXPORT nvtext {
@@ -55,7 +54,7 @@ namespace CUDF_EXPORT nvtext {
 std::unique_ptr<cudf::column> normalize_spaces(
   cudf::strings_column_view const& input,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Normalizes strings characters for tokenizing.
@@ -106,7 +105,7 @@ std::unique_ptr<cudf::column> normalize_characters(
   cudf::strings_column_view const& input,
   bool do_lower_case,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of group
 }  // namespace CUDF_EXPORT nvtext
diff --git a/cpp/include/nvtext/replace.hpp b/cpp/include/nvtext/replace.hpp
index eedcd3976ca..bbd0503379b 100644
--- a/cpp/include/nvtext/replace.hpp
+++ b/cpp/include/nvtext/replace.hpp
@@ -19,8 +19,7 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/export.hpp>
-
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 //! NVText APIs
 namespace CUDF_EXPORT nvtext {
@@ -91,7 +90,7 @@ std::unique_ptr<cudf::column> replace_tokens(
   cudf::strings_column_view const& replacements,
   cudf::string_scalar const& delimiter = cudf::string_scalar{""},
   rmm::cuda_stream_view stream         = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr    = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr    = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Removes tokens whose lengths are less than a specified number of characters.
@@ -140,7 +139,7 @@ std::unique_ptr<cudf::column> filter_tokens(
   cudf::string_scalar const& replacement = cudf::string_scalar{""},
   cudf::string_scalar const& delimiter   = cudf::string_scalar{""},
   rmm::cuda_stream_view stream           = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr      = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr      = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of group
 }  // namespace CUDF_EXPORT nvtext
diff --git a/cpp/include/nvtext/stemmer.hpp b/cpp/include/nvtext/stemmer.hpp
index 4607c42ceed..55a4124bfd0 100644
--- a/cpp/include/nvtext/stemmer.hpp
+++ b/cpp/include/nvtext/stemmer.hpp
@@ -19,8 +19,7 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/export.hpp>
-
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace CUDF_EXPORT nvtext {
 /**
@@ -83,7 +82,7 @@ std::unique_ptr<cudf::column> is_letter(
   letter_type ltype,
   cudf::size_type character_index,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns boolean column indicating if character at `indices[i]` of `input[i]`
@@ -136,7 +135,7 @@ std::unique_ptr<cudf::column> is_letter(
   letter_type ltype,
   cudf::column_view const& indices,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns the Porter Stemmer measurements of a strings column.
@@ -170,7 +169,7 @@ std::unique_ptr<cudf::column> is_letter(
 std::unique_ptr<cudf::column> porter_stemmer_measure(
   cudf::strings_column_view const& input,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of group
 }  // namespace CUDF_EXPORT nvtext
diff --git a/cpp/include/nvtext/subword_tokenize.hpp b/cpp/include/nvtext/subword_tokenize.hpp
index b5636c8401b..c4210699975 100644
--- a/cpp/include/nvtext/subword_tokenize.hpp
+++ b/cpp/include/nvtext/subword_tokenize.hpp
@@ -19,8 +19,7 @@
 #include <cudf/column/column_view.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/export.hpp>
-
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace CUDF_EXPORT nvtext {
 
@@ -68,7 +67,7 @@ struct hashed_vocabulary {
  */
 std::unique_ptr<hashed_vocabulary> load_vocabulary_file(
   std::string const& filename_hashed_vocabulary,
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Result object for the subword_tokenize functions.
@@ -158,7 +157,7 @@ tokenizer_result subword_tokenize(
   uint32_t stride,
   bool do_lower_case,
   bool do_truncate,
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of group
 }  // namespace CUDF_EXPORT nvtext
diff --git a/cpp/include/nvtext/tokenize.hpp b/cpp/include/nvtext/tokenize.hpp
index 833b53efcde..e61601c6fea 100644
--- a/cpp/include/nvtext/tokenize.hpp
+++ b/cpp/include/nvtext/tokenize.hpp
@@ -19,8 +19,7 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/export.hpp>
-
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace CUDF_EXPORT nvtext {
 /**
@@ -63,7 +62,7 @@ std::unique_ptr<cudf::column> tokenize(
   cudf::strings_column_view const& input,
   cudf::string_scalar const& delimiter = cudf::string_scalar{""},
   rmm::cuda_stream_view stream         = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr    = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr    = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns a single column of strings by tokenizing the input strings
@@ -99,7 +98,7 @@ std::unique_ptr<cudf::column> tokenize(
   cudf::strings_column_view const& input,
   cudf::strings_column_view const& delimiters,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns the number of tokens in each string of a strings column.
@@ -130,7 +129,7 @@ std::unique_ptr<cudf::column> count_tokens(
   cudf::strings_column_view const& input,
   cudf::string_scalar const& delimiter = cudf::string_scalar{""},
   rmm::cuda_stream_view stream         = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr    = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr    = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns the number of tokens in each string of a strings column
@@ -162,7 +161,7 @@ std::unique_ptr<cudf::column> count_tokens(
   cudf::strings_column_view const& input,
   cudf::strings_column_view const& delimiters,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns a single column of strings by converting each character to a string.
@@ -188,7 +187,7 @@ std::unique_ptr<cudf::column> count_tokens(
 std::unique_ptr<cudf::column> character_tokenize(
   cudf::strings_column_view const& input,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Creates a strings column from a strings column of tokens and an
@@ -229,7 +228,7 @@ std::unique_ptr<cudf::column> detokenize(
   cudf::column_view const& row_indices,
   cudf::string_scalar const& separator = cudf::string_scalar(" "),
   rmm::cuda_stream_view stream         = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr    = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr    = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Vocabulary object to be used with nvtext::tokenize_with_vocabulary
@@ -251,7 +250,7 @@ struct tokenize_vocabulary {
    */
   tokenize_vocabulary(cudf::strings_column_view const& input,
                       rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-                      rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+                      rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
   ~tokenize_vocabulary();
 
   struct tokenize_vocabulary_impl;
@@ -274,7 +273,7 @@ struct tokenize_vocabulary {
 std::unique_ptr<tokenize_vocabulary> load_vocabulary(
   cudf::strings_column_view const& input,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /**
  * @brief Returns the token ids for the input string by looking up each delimited
@@ -307,7 +306,7 @@ std::unique_ptr<cudf::column> tokenize_with_vocabulary(
   cudf::string_scalar const& delimiter,
   cudf::size_type default_id        = -1,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
 /** @} */  // end of tokenize group
 }  // namespace CUDF_EXPORT nvtext
diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp
index 25b0f68aaa8..a6c878efbbc 100644
--- a/cpp/src/binaryop/binaryop.cpp
+++ b/cpp/src/binaryop/binaryop.cpp
@@ -35,11 +35,11 @@
 #include <cudf/unary.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/std/optional>
 
diff --git a/cpp/src/binaryop/compiled/binary_ops.cu b/cpp/src/binaryop/compiled/binary_ops.cu
index 7a0bc312434..3c558f1e264 100644
--- a/cpp/src/binaryop/compiled/binary_ops.cu
+++ b/cpp/src/binaryop/compiled/binary_ops.cu
@@ -24,11 +24,11 @@
 #include <cudf/detail/structs/utilities.hpp>
 #include <cudf/scalar/scalar_device_view.cuh>
 #include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/functional.h>
@@ -116,7 +116,7 @@ scalar_as_column_view::return_type scalar_as_column_view::operator()<cudf::struc
 auto scalar_to_column_view(
   scalar const& scal,
   rmm::cuda_stream_view stream,
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref())
 {
   return type_dispatcher(scal.type(), scalar_as_column_view{}, scal, stream, mr);
 }
diff --git a/cpp/src/binaryop/compiled/binary_ops.hpp b/cpp/src/binaryop/compiled/binary_ops.hpp
index ceeba9cf817..cdcc40331f2 100644
--- a/cpp/src/binaryop/compiled/binary_ops.hpp
+++ b/cpp/src/binaryop/compiled/binary_ops.hpp
@@ -19,9 +19,9 @@
 #include <cudf/binaryop.hpp>
 #include <cudf/null_mask.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <optional>
 
diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu
index d0faeea8336..4ca05f9c335 100644
--- a/cpp/src/bitmask/null_mask.cu
+++ b/cpp/src/bitmask/null_mask.cu
@@ -27,13 +27,13 @@
 #include <cudf/utilities/bit.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cub/cub.cuh>
 #include <thrust/binary_search.h>
diff --git a/cpp/src/column/column.cu b/cpp/src/column/column.cu
index 90f719b9516..973b1ffd133 100644
--- a/cpp/src/column/column.cu
+++ b/cpp/src/column/column.cu
@@ -30,12 +30,12 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/bit.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/transform_iterator.h>
 
diff --git a/cpp/src/column/column_factories.cpp b/cpp/src/column/column_factories.cpp
index 0260068d4db..482413d0ccb 100644
--- a/cpp/src/column/column_factories.cpp
+++ b/cpp/src/column/column_factories.cpp
@@ -23,10 +23,9 @@
 #include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/strings/detail/fill.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 
-#include <rmm/resource_ref.hpp>
-
 #include <thrust/iterator/constant_iterator.h>
 
 namespace cudf {
diff --git a/cpp/src/column/column_factories.cu b/cpp/src/column/column_factories.cu
index ad9c5e4d3a0..60405ae7af1 100644
--- a/cpp/src/column/column_factories.cu
+++ b/cpp/src/column/column_factories.cu
@@ -21,8 +21,7 @@
 #include <cudf/lists/detail/lists_column_factories.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/detail/strings_column_factories.cuh>
-
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <thrust/iterator/constant_iterator.h>
 #include <thrust/uninitialized_fill.h>
diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu
index ac9931335ff..b8e140f1fa5 100644
--- a/cpp/src/copying/concatenate.cu
+++ b/cpp/src/copying/concatenate.cu
@@ -32,11 +32,11 @@
 #include <cudf/table/table_device_view.cuh>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/advance.h>
 #include <thrust/binary_search.h>
@@ -82,7 +82,7 @@ auto create_device_views(host_span<column_view const> views, rmm::cuda_stream_vi
                  [](auto const& col) { return *col; });
 
   auto d_views =
-    make_device_uvector_async(device_views, stream, rmm::mr::get_current_device_resource());
+    make_device_uvector_async(device_views, stream, cudf::get_current_device_resource_ref());
 
   // Compute the partition offsets
   auto offsets = cudf::detail::make_host_vector<size_t>(views.size() + 1, stream);
@@ -94,7 +94,7 @@ auto create_device_views(host_span<column_view const> views, rmm::cuda_stream_vi
     [](auto const& col) { return col.size(); },
     thrust::plus{});
   auto d_offsets =
-    make_device_uvector_async(offsets, stream, rmm::mr::get_current_device_resource());
+    make_device_uvector_async(offsets, stream, cudf::get_current_device_resource_ref());
   auto const output_size = offsets.back();
 
   return std::make_tuple(
diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu
index 95544742fb7..15aa31ff5ee 100644
--- a/cpp/src/copying/contiguous_split.cu
+++ b/cpp/src/copying/contiguous_split.cu
@@ -28,10 +28,10 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/bit.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/binary_search.h>
@@ -1939,8 +1939,8 @@ struct contiguous_split_state {
       std::transform(h_buf_sizes,
                      h_buf_sizes + num_partitions,
                      std::back_inserter(out_buffers),
-                     [stream = stream,
-                      mr = mr.value_or(rmm::mr::get_current_device_resource())](std::size_t bytes) {
+                     [stream = stream, mr = mr.value_or(cudf::get_current_device_resource_ref())](
+                       std::size_t bytes) {
                        return rmm::device_buffer{bytes, stream, mr};
                      });
     }
diff --git a/cpp/src/copying/copy.cpp b/cpp/src/copying/copy.cpp
index bac8dbe5d95..d60fb5ce110 100644
--- a/cpp/src/copying/copy.cpp
+++ b/cpp/src/copying/copy.cpp
@@ -23,10 +23,10 @@
 #include <cudf/lists/lists_column_view.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/transform_iterator.h>
 
diff --git a/cpp/src/copying/copy.cu b/cpp/src/copying/copy.cu
index e86a1f8d6f1..e5e2514d035 100644
--- a/cpp/src/copying/copy.cu
+++ b/cpp/src/copying/copy.cu
@@ -25,13 +25,12 @@
 #include <cudf/strings/detail/copy_if_else.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/copy.h>
 #include <thrust/distance.h>
@@ -180,7 +179,7 @@ std::unique_ptr<column> scatter_gather_based_if_else(cudf::column_view const& lh
                                                     out_of_bounds_policy::DONT_CHECK,
                                                     negative_index_policy::NOT_ALLOWED,
                                                     stream,
-                                                    rmm::mr::get_current_device_resource());
+                                                    cudf::get_current_device_resource_ref());
 
   auto result = cudf::detail::scatter(
     table_view{std::vector<column_view>{scatter_src_lhs->get_column(0).view()}},
diff --git a/cpp/src/copying/copy_range.cu b/cpp/src/copying/copy_range.cu
index dd18f99a3c8..bffb48a8ec0 100644
--- a/cpp/src/copying/copy_range.cu
+++ b/cpp/src/copying/copy_range.cu
@@ -31,11 +31,11 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/constant_iterator.h>
 
@@ -100,7 +100,7 @@ struct out_of_place_copy_range_dispatch {
     cudf::size_type source_end,
     cudf::size_type target_begin,
     rmm::cuda_stream_view stream,
-    rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
+    rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref())
   {
     auto p_ret = std::make_unique<cudf::column>(target, stream, mr);
     if ((!p_ret->nullable()) && source.has_nulls(source_begin, source_end)) {
@@ -157,7 +157,7 @@ std::unique_ptr<cudf::column> out_of_place_copy_range_dispatch::operator()<cudf:
     cudf::dictionary::detail::add_keys(dict_target, dict_source.keys(), stream, mr);
   auto const target_view = cudf::dictionary_column_view(target_matched->view());
   auto source_matched    = cudf::dictionary::detail::set_keys(
-    dict_source, target_view.keys(), stream, rmm::mr::get_current_device_resource());
+    dict_source, target_view.keys(), stream, cudf::get_current_device_resource_ref());
   auto const source_view = cudf::dictionary_column_view(source_matched->view());
 
   // build the new indices by calling in_place_copy_range on just the indices
diff --git a/cpp/src/copying/gather.cu b/cpp/src/copying/gather.cu
index 5eb039419df..d1ab39d665d 100644
--- a/cpp/src/copying/gather.cu
+++ b/cpp/src/copying/gather.cu
@@ -23,9 +23,9 @@
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/iterator/transform_iterator.h>
diff --git a/cpp/src/copying/get_element.cu b/cpp/src/copying/get_element.cu
index b8860da479c..29a28f81d1a 100644
--- a/cpp/src/copying/get_element.cu
+++ b/cpp/src/copying/get_element.cu
@@ -27,9 +27,9 @@
 #include <cudf/scalar/scalar_device_view.cuh>
 #include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <stdexcept>
 
diff --git a/cpp/src/copying/pack.cpp b/cpp/src/copying/pack.cpp
index 819ad593c0a..1282eec6c44 100644
--- a/cpp/src/copying/pack.cpp
+++ b/cpp/src/copying/pack.cpp
@@ -18,9 +18,9 @@
 #include <cudf/detail/contiguous_split.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace detail {
diff --git a/cpp/src/copying/purge_nonempty_nulls.cu b/cpp/src/copying/purge_nonempty_nulls.cu
index 581d0a00924..684deabf038 100644
--- a/cpp/src/copying/purge_nonempty_nulls.cu
+++ b/cpp/src/copying/purge_nonempty_nulls.cu
@@ -18,8 +18,7 @@
 #include <cudf/detail/gather.cuh>
 #include <cudf/detail/offsets_iterator_factory.cuh>
 #include <cudf/utilities/default_stream.hpp>
-
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <thrust/count.h>
 #include <thrust/iterator/counting_iterator.h>
diff --git a/cpp/src/copying/reverse.cu b/cpp/src/copying/reverse.cu
index d3d42e35e26..effbb59f223 100644
--- a/cpp/src/copying/reverse.cu
+++ b/cpp/src/copying/reverse.cu
@@ -21,12 +21,11 @@
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/iterator/constant_iterator.h>
diff --git a/cpp/src/copying/sample.cu b/cpp/src/copying/sample.cu
index ba00527f6b6..dc03856c7cf 100644
--- a/cpp/src/copying/sample.cu
+++ b/cpp/src/copying/sample.cu
@@ -24,9 +24,9 @@
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/iterator/counting_iterator.h>
diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu
index 993ee074f14..cd14eb96ec4 100644
--- a/cpp/src/copying/scatter.cu
+++ b/cpp/src/copying/scatter.cu
@@ -33,10 +33,10 @@
 #include <cudf/table/table_device_view.cuh>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/count.h>
@@ -198,7 +198,7 @@ struct column_scalar_scatterer_impl<dictionary32, MapIterator> {
                                    mr);
     auto dict_view    = dictionary_column_view(dict_target->view());
     auto scalar_index = dictionary::detail::get_index(
-      dict_view, source.get(), stream, rmm::mr::get_current_device_resource());
+      dict_view, source.get(), stream, cudf::get_current_device_resource_ref());
     auto scalar_iter = thrust::make_permutation_iterator(
       indexalator_factory::make_input_iterator(*scalar_index), thrust::make_constant_iterator(0));
     auto new_indices = std::make_unique<column>(dict_view.get_indices_annotated(), stream, mr);
@@ -271,7 +271,7 @@ struct column_scalar_scatterer_impl<struct_view, MapIterator> {
     auto scatter_functor   = column_scalar_scatterer<decltype(scatter_iter)>{};
     auto fields_iter_begin = make_counting_transform_iterator(0, [&](auto const& i) {
       auto row_slr = detail::get_element(
-        typed_s->view().column(i), 0, stream, rmm::mr::get_current_device_resource());
+        typed_s->view().column(i), 0, stream, cudf::get_current_device_resource_ref());
       return type_dispatcher<dispatch_storage_type>(row_slr->type(),
                                                     scatter_functor,
                                                     *row_slr,
@@ -416,7 +416,7 @@ std::unique_ptr<column> boolean_mask_scatter(column_view const& input,
 
   // The scatter map is actually a table with only one column, which is scatter map.
   auto scatter_map = detail::apply_boolean_mask(
-    table_view{{indices->view()}}, boolean_mask, stream, rmm::mr::get_current_device_resource());
+    table_view{{indices->view()}}, boolean_mask, stream, cudf::get_current_device_resource_ref());
   auto output_table = detail::scatter(
     table_view{{input}}, scatter_map->get_column(0).view(), table_view{{target}}, stream, mr);
 
diff --git a/cpp/src/copying/segmented_shift.cu b/cpp/src/copying/segmented_shift.cu
index b7abc60f240..6ea5c5ab38a 100644
--- a/cpp/src/copying/segmented_shift.cu
+++ b/cpp/src/copying/segmented_shift.cu
@@ -21,10 +21,10 @@
 #include <cudf/strings/detail/copy_if_else.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/types.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/binary_search.h>
 #include <thrust/execution_policy.h>
diff --git a/cpp/src/copying/shift.cu b/cpp/src/copying/shift.cu
index 91254f21170..674f6dbd28a 100644
--- a/cpp/src/copying/shift.cu
+++ b/cpp/src/copying/shift.cu
@@ -25,13 +25,13 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_checks.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/copy.h>
 #include <thrust/iterator/counting_iterator.h>
diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu
index 7629cad79a9..fd9a6b8f5fe 100644
--- a/cpp/src/datetime/datetime_ops.cu
+++ b/cpp/src/datetime/datetime_ops.cu
@@ -29,13 +29,13 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 #include <cudf/wrappers/durations.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/constant_iterator.h>
 #include <thrust/iterator/permutation_iterator.h>
diff --git a/cpp/src/datetime/timezone.cpp b/cpp/src/datetime/timezone.cpp
index 7ca1b51df98..6498a5e6c55 100644
--- a/cpp/src/datetime/timezone.cpp
+++ b/cpp/src/datetime/timezone.cpp
@@ -18,8 +18,7 @@
 #include <cudf/detail/timezone.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/table/table.hpp>
-
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <algorithm>
 #include <filesystem>
diff --git a/cpp/src/dictionary/add_keys.cu b/cpp/src/dictionary/add_keys.cu
index 0ed9006f88b..565055009ba 100644
--- a/cpp/src/dictionary/add_keys.cu
+++ b/cpp/src/dictionary/add_keys.cu
@@ -30,11 +30,9 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/type_checks.hpp>
 
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
-
 namespace cudf {
 namespace dictionary {
 namespace detail {
@@ -61,7 +59,7 @@ std::unique_ptr<column> add_keys(dictionary_column_view const& dictionary_column
   // first, concatenate the keys together
   // [a,b,c,d,f] + [d,b,e] = [a,b,c,d,f,d,b,e]
   auto combined_keys = cudf::detail::concatenate(
-    std::vector<column_view>{old_keys, new_keys}, stream, rmm::mr::get_current_device_resource());
+    std::vector<column_view>{old_keys, new_keys}, stream, cudf::get_current_device_resource_ref());
 
   // Drop duplicates from the combined keys, then sort the result.
   // sort(distinct([a,b,c,d,f,d,b,e])) = [a,b,c,d,e,f]
diff --git a/cpp/src/dictionary/decode.cu b/cpp/src/dictionary/decode.cu
index 9f05593fc40..fb013586999 100644
--- a/cpp/src/dictionary/decode.cu
+++ b/cpp/src/dictionary/decode.cu
@@ -23,9 +23,9 @@
 #include <cudf/dictionary/encode.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace dictionary {
diff --git a/cpp/src/dictionary/detail/concatenate.cu b/cpp/src/dictionary/detail/concatenate.cu
index 72828309425..b3a8bb4cd20 100644
--- a/cpp/src/dictionary/detail/concatenate.cu
+++ b/cpp/src/dictionary/detail/concatenate.cu
@@ -27,13 +27,12 @@
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/binary_search.h>
@@ -120,7 +119,7 @@ struct compute_children_offsets_fn {
         return offsets_pair{lhs.first + rhs.first, lhs.second + rhs.second};
       });
     return cudf::detail::make_device_uvector_sync(
-      offsets, stream, rmm::mr::get_current_device_resource());
+      offsets, stream, cudf::get_current_device_resource_ref());
   }
 
  private:
@@ -229,7 +228,7 @@ std::unique_ptr<column> concatenate(host_span<column_view const> columns,
     return keys;
   });
   auto all_keys =
-    cudf::detail::concatenate(keys_views, stream, rmm::mr::get_current_device_resource());
+    cudf::detail::concatenate(keys_views, stream, cudf::get_current_device_resource_ref());
 
   // sort keys and remove duplicates;
   // this becomes the keys child for the output dictionary column
diff --git a/cpp/src/dictionary/detail/merge.cu b/cpp/src/dictionary/detail/merge.cu
index c65aa5d1101..0af71397196 100644
--- a/cpp/src/dictionary/detail/merge.cu
+++ b/cpp/src/dictionary/detail/merge.cu
@@ -22,10 +22,10 @@
 #include <cudf/dictionary/detail/merge.hpp>
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/dictionary/dictionary_factories.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/transform.h>
 
diff --git a/cpp/src/dictionary/dictionary_factories.cu b/cpp/src/dictionary/dictionary_factories.cu
index 0617d71fa51..3e0c98d36ea 100644
--- a/cpp/src/dictionary/dictionary_factories.cu
+++ b/cpp/src/dictionary/dictionary_factories.cu
@@ -20,10 +20,10 @@
 #include <cudf/detail/unary.hpp>
 #include <cudf/dictionary/detail/encode.hpp>
 #include <cudf/dictionary/dictionary_factories.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace {
diff --git a/cpp/src/dictionary/encode.cu b/cpp/src/dictionary/encode.cu
index ff29d83b80a..c8ccb511e8f 100644
--- a/cpp/src/dictionary/encode.cu
+++ b/cpp/src/dictionary/encode.cu
@@ -27,9 +27,9 @@
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace dictionary {
diff --git a/cpp/src/dictionary/remove_keys.cu b/cpp/src/dictionary/remove_keys.cu
index 35387efa56b..119f43a4ae9 100644
--- a/cpp/src/dictionary/remove_keys.cu
+++ b/cpp/src/dictionary/remove_keys.cu
@@ -27,11 +27,11 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/fill.h>
 #include <thrust/iterator/counting_iterator.h>
diff --git a/cpp/src/dictionary/replace.cu b/cpp/src/dictionary/replace.cu
index bc17dfd4bab..fe0b103cc55 100644
--- a/cpp/src/dictionary/replace.cu
+++ b/cpp/src/dictionary/replace.cu
@@ -25,10 +25,10 @@
 #include <cudf/dictionary/detail/update_keys.hpp>
 #include <cudf/dictionary/dictionary_factories.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace dictionary {
@@ -132,7 +132,7 @@ std::unique_ptr<column> replace_nulls(dictionary_column_view const& input,
     input, make_column_from_scalar(replacement, 1, stream)->view(), stream, mr);
   auto const input_view = dictionary_column_view(input_matched->view());
   auto const scalar_index =
-    get_index(input_view, replacement, stream, rmm::mr::get_current_device_resource());
+    get_index(input_view, replacement, stream, cudf::get_current_device_resource_ref());
 
   // now build the new indices by doing replace-null on the updated indices
   auto const input_indices = input_view.get_indices_annotated();
diff --git a/cpp/src/dictionary/search.cu b/cpp/src/dictionary/search.cu
index 231619836f9..04e2c17635d 100644
--- a/cpp/src/dictionary/search.cu
+++ b/cpp/src/dictionary/search.cu
@@ -20,13 +20,13 @@
 #include <cudf/dictionary/search.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_checks.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/binary_search.h>
 #include <thrust/distance.h>
diff --git a/cpp/src/dictionary/set_keys.cu b/cpp/src/dictionary/set_keys.cu
index cf40fda5971..be5c3dd6a26 100644
--- a/cpp/src/dictionary/set_keys.cu
+++ b/cpp/src/dictionary/set_keys.cu
@@ -31,11 +31,11 @@
 #include <cudf/stream_compaction.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/binary_search.h>
 #include <thrust/distance.h>
@@ -185,7 +185,7 @@ std::vector<std::unique_ptr<column>> match_dictionaries(
 {
   std::vector<column_view> keys(input.size());
   std::transform(input.begin(), input.end(), keys.begin(), [](auto& col) { return col.keys(); });
-  auto new_keys  = cudf::detail::concatenate(keys, stream, rmm::mr::get_current_device_resource());
+  auto new_keys  = cudf::detail::concatenate(keys, stream, cudf::get_current_device_resource_ref());
   auto keys_view = new_keys->view();
   std::vector<std::unique_ptr<column>> result(input.size());
   std::transform(input.begin(), input.end(), result.begin(), [keys_view, mr, stream](auto& col) {
diff --git a/cpp/src/filling/calendrical_month_sequence.cu b/cpp/src/filling/calendrical_month_sequence.cu
index f984f307ddd..f5ad211bd0d 100644
--- a/cpp/src/filling/calendrical_month_sequence.cu
+++ b/cpp/src/filling/calendrical_month_sequence.cu
@@ -20,11 +20,11 @@
 #include <cudf/filling.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace detail {
diff --git a/cpp/src/filling/fill.cu b/cpp/src/filling/fill.cu
index 1fc9ed31c09..cfb209c0569 100644
--- a/cpp/src/filling/fill.cu
+++ b/cpp/src/filling/fill.cu
@@ -32,12 +32,11 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/constant_iterator.h>
 
@@ -175,7 +174,7 @@ std::unique_ptr<cudf::column> out_of_place_fill_range_dispatch::operator()<cudf:
 
   // get the index of the key just added
   auto index_of_value = cudf::dictionary::detail::get_index(
-    target_matched->view(), value, stream, rmm::mr::get_current_device_resource());
+    target_matched->view(), value, stream, cudf::get_current_device_resource_ref());
   // now call fill using just the indices column and the new index
   auto new_indices =
     cudf::type_dispatcher(target_indices.type(),
diff --git a/cpp/src/filling/repeat.cu b/cpp/src/filling/repeat.cu
index ff4005d9366..2e78954d78a 100644
--- a/cpp/src/filling/repeat.cu
+++ b/cpp/src/filling/repeat.cu
@@ -27,13 +27,12 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/binary_search.h>
diff --git a/cpp/src/filling/sequence.cu b/cpp/src/filling/sequence.cu
index ee1745b8498..d8fd993bbd1 100644
--- a/cpp/src/filling/sequence.cu
+++ b/cpp/src/filling/sequence.cu
@@ -24,11 +24,11 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/sequence.h>
 #include <thrust/tabulate.h>
diff --git a/cpp/src/groupby/common/utils.hpp b/cpp/src/groupby/common/utils.hpp
index 82c3c08b501..80849357811 100644
--- a/cpp/src/groupby/common/utils.hpp
+++ b/cpp/src/groupby/common/utils.hpp
@@ -18,10 +18,9 @@
 
 #include <cudf/detail/aggregation/result_cache.hpp>
 #include <cudf/detail/groupby.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
-#include <rmm/resource_ref.hpp>
-
 #include <memory>
 #include <vector>
 
diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu
index e43dfcb4d98..cc0682b68b9 100644
--- a/cpp/src/groupby/groupby.cu
+++ b/cpp/src/groupby/groupby.cu
@@ -35,12 +35,11 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 
@@ -284,7 +283,7 @@ std::pair<std::unique_ptr<table>, std::unique_ptr<table>> groupby::replace_nulls
     std::back_inserter(results),
     [&](auto i) {
       bool nullable       = values.column(i).nullable();
-      auto final_mr       = nullable ? rmm::mr::get_current_device_resource() : mr;
+      auto final_mr       = nullable ? cudf::get_current_device_resource_ref() : mr;
       auto grouped_values = helper().grouped_values(values.column(i), stream, final_mr);
       return nullable ? detail::group_replace_nulls(
                           *grouped_values, group_labels, replace_policies[i], stream, mr)
@@ -331,7 +330,7 @@ std::pair<std::unique_ptr<table>, std::unique_ptr<table>> groupby::shift(
     std::back_inserter(results),
     [&](size_type i) {
       auto grouped_values =
-        helper().grouped_values(values.column(i), stream, rmm::mr::get_current_device_resource());
+        helper().grouped_values(values.column(i), stream, cudf::get_current_device_resource_ref());
       return cudf::detail::segmented_shift(
         grouped_values->view(), group_offsets, offsets[i], fill_values[i].get(), stream, mr);
     });
diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu
index 35161eada28..f9a80a048b5 100644
--- a/cpp/src/groupby/hash/groupby.cu
+++ b/cpp/src/groupby/hash/groupby.cu
@@ -39,11 +39,11 @@
 #include <cudf/table/table_device_view.cuh>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.cuh>
 #include <cudf/utilities/traits.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuco/static_set.cuh>
 #include <thrust/for_each.h>
@@ -401,7 +401,7 @@ void sparse_to_dense_results(table_view const& keys,
                              rmm::device_async_resource_ref mr)
 {
   auto row_bitmask =
-    cudf::detail::bitmask_and(keys, stream, rmm::mr::get_current_device_resource()).first;
+    cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first;
   bool skip_key_rows_with_nulls = keys_have_nulls and include_null_keys == null_policy::EXCLUDE;
   bitmask_type const* row_bitmask_ptr =
     skip_key_rows_with_nulls ? static_cast<bitmask_type*>(row_bitmask.data()) : nullptr;
@@ -475,13 +475,13 @@ void compute_single_pass_aggs(table_view const& keys,
   auto d_sparse_table = mutable_table_device_view::create(sparse_table, stream);
   auto d_values       = table_device_view::create(flattened_values, stream);
   auto const d_aggs   = cudf::detail::make_device_uvector_async(
-    agg_kinds, stream, rmm::mr::get_current_device_resource());
+    agg_kinds, stream, cudf::get_current_device_resource_ref());
   auto const skip_key_rows_with_nulls =
     keys_have_nulls and include_null_keys == null_policy::EXCLUDE;
 
   auto row_bitmask =
     skip_key_rows_with_nulls
-      ? cudf::detail::bitmask_and(keys, stream, rmm::mr::get_current_device_resource()).first
+      ? cudf::detail::bitmask_and(keys, stream, cudf::get_current_device_resource_ref()).first
       : rmm::device_buffer{};
 
   thrust::for_each_n(
diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp
index ba59616babe..a9085a1f1fd 100644
--- a/cpp/src/groupby/sort/aggregate.cpp
+++ b/cpp/src/groupby/sort/aggregate.cpp
@@ -35,9 +35,9 @@
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <memory>
 #include <unordered_map>
@@ -435,7 +435,7 @@ void aggregate_result_functor::operator()<aggregation::COLLECT_SET>(aggregation
                                                     helper.num_groups(stream),
                                                     null_handling,
                                                     stream,
-                                                    rmm::mr::get_current_device_resource());
+                                                    cudf::get_current_device_resource_ref());
   auto const nulls_equal =
     dynamic_cast<cudf::detail::collect_set_aggregation const&>(agg)._nulls_equal;
   auto const nans_equal =
@@ -507,7 +507,7 @@ void aggregate_result_functor::operator()<aggregation::MERGE_SETS>(aggregation c
                                                        helper.group_offsets(stream),
                                                        helper.num_groups(stream),
                                                        stream,
-                                                       rmm::mr::get_current_device_resource());
+                                                       cudf::get_current_device_resource_ref());
   auto const& merge_sets_agg = dynamic_cast<cudf::detail::merge_sets_aggregation const&>(agg);
   cache.add_result(values,
                    agg,
diff --git a/cpp/src/groupby/sort/functors.hpp b/cpp/src/groupby/sort/functors.hpp
index 057085fe85d..a13866802be 100644
--- a/cpp/src/groupby/sort/functors.hpp
+++ b/cpp/src/groupby/sort/functors.hpp
@@ -20,9 +20,9 @@
 #include <cudf/detail/aggregation/result_cache.hpp>
 #include <cudf/detail/groupby/sort_helper.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <memory>
 
diff --git a/cpp/src/groupby/sort/group_argmax.cu b/cpp/src/groupby/sort/group_argmax.cu
index a1d197b1307..7dce341130e 100644
--- a/cpp/src/groupby/sort/group_argmax.cu
+++ b/cpp/src/groupby/sort/group_argmax.cu
@@ -17,10 +17,10 @@
 #include "groupby/sort/group_single_pass_reduction_util.cuh"
 
 #include <cudf/detail/gather.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/gather.h>
 
diff --git a/cpp/src/groupby/sort/group_argmin.cu b/cpp/src/groupby/sort/group_argmin.cu
index 03243bef836..c4bed330b9f 100644
--- a/cpp/src/groupby/sort/group_argmin.cu
+++ b/cpp/src/groupby/sort/group_argmin.cu
@@ -17,10 +17,10 @@
 #include "groupby/sort/group_single_pass_reduction_util.cuh"
 
 #include <cudf/detail/gather.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/gather.h>
 
diff --git a/cpp/src/groupby/sort/group_collect.cu b/cpp/src/groupby/sort/group_collect.cu
index 555c5d3ad41..a1cac7ee3bc 100644
--- a/cpp/src/groupby/sort/group_collect.cu
+++ b/cpp/src/groupby/sort/group_collect.cu
@@ -20,10 +20,10 @@
 #include <cudf/detail/copy_if.cuh>
 #include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/types.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/copy.h>
 #include <thrust/count.h>
diff --git a/cpp/src/groupby/sort/group_correlation.cu b/cpp/src/groupby/sort/group_correlation.cu
index 152aa98a8b9..7f2102dc8ee 100644
--- a/cpp/src/groupby/sort/group_correlation.cu
+++ b/cpp/src/groupby/sort/group_correlation.cu
@@ -21,12 +21,12 @@
 #include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/detail/valid_if.cuh>
 #include <cudf/dictionary/dictionary_column_view.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/discard_iterator.h>
diff --git a/cpp/src/groupby/sort/group_count.cu b/cpp/src/groupby/sort/group_count.cu
index 56a4943e272..2e1cb9591c4 100644
--- a/cpp/src/groupby/sort/group_count.cu
+++ b/cpp/src/groupby/sort/group_count.cu
@@ -18,11 +18,11 @@
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/types.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/adjacent_difference.h>
diff --git a/cpp/src/groupby/sort/group_count_scan.cu b/cpp/src/groupby/sort/group_count_scan.cu
index c076f21e1f8..5897cc341d4 100644
--- a/cpp/src/groupby/sort/group_count_scan.cu
+++ b/cpp/src/groupby/sort/group_count_scan.cu
@@ -17,11 +17,11 @@
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_factories.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/constant_iterator.h>
 #include <thrust/scan.h>
diff --git a/cpp/src/groupby/sort/group_histogram.cu b/cpp/src/groupby/sort/group_histogram.cu
index 1000ec0d470..861d801a070 100644
--- a/cpp/src/groupby/sort/group_histogram.cu
+++ b/cpp/src/groupby/sort/group_histogram.cu
@@ -23,10 +23,10 @@
 #include <cudf/reduction/detail/histogram.hpp>
 #include <cudf/structs/structs_column_view.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/device_buffer.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/gather.h>
 
diff --git a/cpp/src/groupby/sort/group_m2.cu b/cpp/src/groupby/sort/group_m2.cu
index 77f33486284..a17a4433d05 100644
--- a/cpp/src/groupby/sort/group_m2.cu
+++ b/cpp/src/groupby/sort/group_m2.cu
@@ -21,13 +21,13 @@
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/dictionary/detail/iterator.cuh>
 #include <cudf/dictionary/dictionary_column_view.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/discard_iterator.h>
 #include <thrust/reduce.h>
diff --git a/cpp/src/groupby/sort/group_max.cu b/cpp/src/groupby/sort/group_max.cu
index 60b071c25ff..06a759dd25a 100644
--- a/cpp/src/groupby/sort/group_max.cu
+++ b/cpp/src/groupby/sort/group_max.cu
@@ -16,8 +16,9 @@
 
 #include "groupby/sort/group_single_pass_reduction_util.cuh"
 
+#include <cudf/utilities/memory_resource.hpp>
+
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace groupby {
diff --git a/cpp/src/groupby/sort/group_max_scan.cu b/cpp/src/groupby/sort/group_max_scan.cu
index 270059cfcad..21e439a2253 100644
--- a/cpp/src/groupby/sort/group_max_scan.cu
+++ b/cpp/src/groupby/sort/group_max_scan.cu
@@ -16,8 +16,9 @@
 
 #include "groupby/sort/group_scan_util.cuh"
 
+#include <cudf/utilities/memory_resource.hpp>
+
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace groupby {
diff --git a/cpp/src/groupby/sort/group_merge_lists.cu b/cpp/src/groupby/sort/group_merge_lists.cu
index 92cce1aa00e..009530a9915 100644
--- a/cpp/src/groupby/sort/group_merge_lists.cu
+++ b/cpp/src/groupby/sort/group_merge_lists.cu
@@ -16,11 +16,11 @@
 
 #include <cudf/column/column_factories.hpp>
 #include <cudf/lists/lists_column_view.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/gather.h>
 
diff --git a/cpp/src/groupby/sort/group_merge_m2.cu b/cpp/src/groupby/sort/group_merge_m2.cu
index 4ad8fa5ff07..746c3fe3962 100644
--- a/cpp/src/groupby/sort/group_merge_m2.cu
+++ b/cpp/src/groupby/sort/group_merge_m2.cu
@@ -20,12 +20,12 @@
 #include <cudf/detail/valid_if.cuh>
 #include <cudf/dictionary/detail/iterator.cuh>
 #include <cudf/structs/structs_column_view.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/functional.h>
 #include <thrust/iterator/counting_iterator.h>
diff --git a/cpp/src/groupby/sort/group_min.cu b/cpp/src/groupby/sort/group_min.cu
index 22aaf664168..f86aa14430a 100644
--- a/cpp/src/groupby/sort/group_min.cu
+++ b/cpp/src/groupby/sort/group_min.cu
@@ -16,8 +16,9 @@
 
 #include "groupby/sort/group_single_pass_reduction_util.cuh"
 
+#include <cudf/utilities/memory_resource.hpp>
+
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace groupby {
diff --git a/cpp/src/groupby/sort/group_min_scan.cu b/cpp/src/groupby/sort/group_min_scan.cu
index 4ddc10a2e5a..96b7ad95a19 100644
--- a/cpp/src/groupby/sort/group_min_scan.cu
+++ b/cpp/src/groupby/sort/group_min_scan.cu
@@ -16,8 +16,9 @@
 
 #include "groupby/sort/group_scan_util.cuh"
 
+#include <cudf/utilities/memory_resource.hpp>
+
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace groupby {
diff --git a/cpp/src/groupby/sort/group_nth_element.cu b/cpp/src/groupby/sort/group_nth_element.cu
index 1bc1eef908c..a4752b6948b 100644
--- a/cpp/src/groupby/sort/group_nth_element.cu
+++ b/cpp/src/groupby/sort/group_nth_element.cu
@@ -22,11 +22,11 @@
 #include <cudf/detail/gather.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/types.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/iterator/constant_iterator.h>
diff --git a/cpp/src/groupby/sort/group_nunique.cu b/cpp/src/groupby/sort/group_nunique.cu
index de11e70719a..348ab366762 100644
--- a/cpp/src/groupby/sort/group_nunique.cu
+++ b/cpp/src/groupby/sort/group_nunique.cu
@@ -18,11 +18,11 @@
 #include <cudf/column/column_factories.hpp>
 #include <cudf/table/experimental/row_operators.cuh>
 #include <cudf/types.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/discard_iterator.h>
diff --git a/cpp/src/groupby/sort/group_product.cu b/cpp/src/groupby/sort/group_product.cu
index 83ca1059325..5e81c8513c8 100644
--- a/cpp/src/groupby/sort/group_product.cu
+++ b/cpp/src/groupby/sort/group_product.cu
@@ -17,10 +17,10 @@
 #include "groupby/sort/group_single_pass_reduction_util.cuh"
 
 #include <cudf/dictionary/dictionary_column_view.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace groupby {
diff --git a/cpp/src/groupby/sort/group_product_scan.cu b/cpp/src/groupby/sort/group_product_scan.cu
index 40c53ceeff1..016f293ac5b 100644
--- a/cpp/src/groupby/sort/group_product_scan.cu
+++ b/cpp/src/groupby/sort/group_product_scan.cu
@@ -16,8 +16,9 @@
 
 #include "groupby/sort/group_scan_util.cuh"
 
+#include <cudf/utilities/memory_resource.hpp>
+
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace groupby {
diff --git a/cpp/src/groupby/sort/group_quantiles.cu b/cpp/src/groupby/sort/group_quantiles.cu
index 3156dfaadd0..82d557b9f7e 100644
--- a/cpp/src/groupby/sort/group_quantiles.cu
+++ b/cpp/src/groupby/sort/group_quantiles.cu
@@ -24,12 +24,12 @@
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/dictionary/detail/iterator.cuh>
 #include <cudf/dictionary/dictionary_column_view.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/execution_policy.h>
 #include <thrust/for_each.h>
@@ -165,7 +165,7 @@ std::unique_ptr<column> group_quantiles(column_view const& values,
                                         rmm::device_async_resource_ref mr)
 {
   auto dv_quantiles = cudf::detail::make_device_uvector_async(
-    quantiles, stream, rmm::mr::get_current_device_resource());
+    quantiles, stream, cudf::get_current_device_resource_ref());
 
   auto values_type = cudf::is_dictionary(values.type())
                        ? dictionary_column_view(values).keys().type()
diff --git a/cpp/src/groupby/sort/group_rank_scan.cu b/cpp/src/groupby/sort/group_rank_scan.cu
index 0b65889f127..65bd5ac408f 100644
--- a/cpp/src/groupby/sort/group_rank_scan.cu
+++ b/cpp/src/groupby/sort/group_rank_scan.cu
@@ -23,11 +23,11 @@
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/utilities/device_operators.cuh>
 #include <cudf/table/experimental/row_operators.cuh>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/functional.h>
 #include <thrust/iterator/reverse_iterator.h>
@@ -226,13 +226,13 @@ std::unique_ptr<column> average_rank_scan(column_view const& grouped_values,
                                 group_labels,
                                 group_offsets,
                                 stream,
-                                rmm::mr::get_current_device_resource());
+                                cudf::get_current_device_resource_ref());
   auto min_rank = min_rank_scan(grouped_values,
                                 value_order,
                                 group_labels,
                                 group_offsets,
                                 stream,
-                                rmm::mr::get_current_device_resource());
+                                cudf::get_current_device_resource_ref());
   auto ranks    = make_fixed_width_column(
     data_type{type_to_id<double>()}, group_labels.size(), mask_state::UNALLOCATED, stream, mr);
   auto mutable_ranks = ranks->mutable_view();
diff --git a/cpp/src/groupby/sort/group_reductions.hpp b/cpp/src/groupby/sort/group_reductions.hpp
index 5e76dc3135a..f8a531094c6 100644
--- a/cpp/src/groupby/sort/group_reductions.hpp
+++ b/cpp/src/groupby/sort/group_reductions.hpp
@@ -18,10 +18,10 @@
 
 #include <cudf/aggregation.hpp>
 #include <cudf/column/column.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <memory>
 
diff --git a/cpp/src/groupby/sort/group_replace_nulls.cu b/cpp/src/groupby/sort/group_replace_nulls.cu
index 566507da230..088ed05e5eb 100644
--- a/cpp/src/groupby/sort/group_replace_nulls.cu
+++ b/cpp/src/groupby/sort/group_replace_nulls.cu
@@ -19,9 +19,9 @@
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/replace/nulls.cuh>
 #include <cudf/replace.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/device_uvector.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/functional.h>
 #include <thrust/iterator/counting_iterator.h>
diff --git a/cpp/src/groupby/sort/group_scan.hpp b/cpp/src/groupby/sort/group_scan.hpp
index 6f2daae5f9d..b5d8ce23a97 100644
--- a/cpp/src/groupby/sort/group_scan.hpp
+++ b/cpp/src/groupby/sort/group_scan.hpp
@@ -18,10 +18,10 @@
 
 #include <cudf/aggregation.hpp>
 #include <cudf/column/column.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <memory>
 
diff --git a/cpp/src/groupby/sort/group_scan_util.cuh b/cpp/src/groupby/sort/group_scan_util.cuh
index b360ba2c45d..86835ea8a67 100644
--- a/cpp/src/groupby/sort/group_scan_util.cuh
+++ b/cpp/src/groupby/sort/group_scan_util.cuh
@@ -29,12 +29,12 @@
 #include <cudf/detail/structs/utilities.hpp>
 #include <cudf/table/table_device_view.cuh>
 #include <cudf/types.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/functional.h>
 #include <thrust/iterator/counting_iterator.h>
diff --git a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
index 5e892710d3b..2358f47bbbb 100644
--- a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
+++ b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh
@@ -26,11 +26,11 @@
 #include <cudf/detail/utilities/element_argminmax.cuh>
 #include <cudf/detail/valid_if.cuh>
 #include <cudf/types.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/functional.h>
 #include <thrust/iterator/counting_iterator.h>
diff --git a/cpp/src/groupby/sort/group_std.cu b/cpp/src/groupby/sort/group_std.cu
index 70f64186f21..86ee20dbbe2 100644
--- a/cpp/src/groupby/sort/group_std.cu
+++ b/cpp/src/groupby/sort/group_std.cu
@@ -22,6 +22,7 @@
 #include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/dictionary/detail/iterator.cuh>
 #include <cudf/dictionary/dictionary_column_view.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
@@ -29,7 +30,6 @@
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/for_each.h>
 #include <thrust/iterator/counting_iterator.h>
diff --git a/cpp/src/groupby/sort/group_sum.cu b/cpp/src/groupby/sort/group_sum.cu
index 316b6f395bb..fbbc9b5fd15 100644
--- a/cpp/src/groupby/sort/group_sum.cu
+++ b/cpp/src/groupby/sort/group_sum.cu
@@ -17,10 +17,10 @@
 #include "groupby/sort/group_single_pass_reduction_util.cuh"
 
 #include <cudf/dictionary/dictionary_column_view.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace groupby {
diff --git a/cpp/src/groupby/sort/group_sum_scan.cu b/cpp/src/groupby/sort/group_sum_scan.cu
index 01c4d0c2c4a..d3af8c8794a 100644
--- a/cpp/src/groupby/sort/group_sum_scan.cu
+++ b/cpp/src/groupby/sort/group_sum_scan.cu
@@ -16,8 +16,9 @@
 
 #include "groupby/sort/group_scan_util.cuh"
 
+#include <cudf/utilities/memory_resource.hpp>
+
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace groupby {
diff --git a/cpp/src/groupby/sort/scan.cpp b/cpp/src/groupby/sort/scan.cpp
index f211c61b3b7..62bceccdf5f 100644
--- a/cpp/src/groupby/sort/scan.cpp
+++ b/cpp/src/groupby/sort/scan.cpp
@@ -33,9 +33,9 @@
 #include <cudf/table/table.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <memory>
 
@@ -145,7 +145,7 @@ void scan_result_functor::operator()<aggregation::RANK>(aggregation const& agg)
       return cudf::detail::sequence(group_labels.size(),
                                     *cudf::make_fixed_width_scalar(size_type{0}, stream),
                                     stream,
-                                    rmm::mr::get_current_device_resource());
+                                    cudf::get_current_device_resource_ref());
     } else {
       auto sort_order = (rank_agg._method == rank_method::FIRST ? cudf::detail::stable_sorted_order
                                                                        : cudf::detail::sorted_order);
@@ -153,7 +153,7 @@ void scan_result_functor::operator()<aggregation::RANK>(aggregation const& agg)
                                {order::ASCENDING, rank_agg._column_order},
                                {null_order::AFTER, rank_agg._null_precedence},
                         stream,
-                        rmm::mr::get_current_device_resource());
+                        cudf::get_current_device_resource_ref());
     }
   }();
 
@@ -172,18 +172,18 @@ void scan_result_functor::operator()<aggregation::RANK>(aggregation const& agg)
                           helper.group_labels(stream),
                           helper.group_offsets(stream),
                           stream,
-                          rmm::mr::get_current_device_resource());
+                          cudf::get_current_device_resource_ref());
   if (rank_agg._percentage != rank_percentage::NONE) {
     auto count = get_grouped_values().nullable() and rank_agg._null_handling == null_policy::EXCLUDE
                    ? detail::group_count_valid(get_grouped_values(),
                                                helper.group_labels(stream),
                                                helper.num_groups(stream),
                                                stream,
-                                               rmm::mr::get_current_device_resource())
+                                               cudf::get_current_device_resource_ref())
                    : detail::group_count_all(helper.group_offsets(stream),
                                              helper.num_groups(stream),
                                              stream,
-                                             rmm::mr::get_current_device_resource());
+                                             cudf::get_current_device_resource_ref());
     result     = detail::group_rank_to_percentage(rank_agg._method,
                                               rank_agg._percentage,
                                               *result,
diff --git a/cpp/src/groupby/sort/sort_helper.cu b/cpp/src/groupby/sort/sort_helper.cu
index 4da1da089cd..35e3e05a364 100644
--- a/cpp/src/groupby/sort/sort_helper.cu
+++ b/cpp/src/groupby/sort/sort_helper.cu
@@ -31,11 +31,11 @@
 #include <cudf/strings/string_view.hpp>
 #include <cudf/table/experimental/row_operators.cuh>
 #include <cudf/table/table_device_view.cuh>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/distance.h>
@@ -100,7 +100,7 @@ column_view sort_groupby_helper::key_sort_order(rmm::cuda_stream_view stream)
                                                numeric_scalar<size_type>(0, true, stream),
                                                numeric_scalar<size_type>(1, true, stream),
                                                stream,
-                                               rmm::mr::get_current_device_resource());
+                                               cudf::get_current_device_resource_ref());
     return sliced_key_sorted_order();
   }
 
@@ -109,7 +109,7 @@ column_view sort_groupby_helper::key_sort_order(rmm::cuda_stream_view stream)
                               ? std::vector(_keys.num_columns(), null_order::AFTER)
                               : _null_precedence;
     _key_sorted_order     = cudf::detail::stable_sorted_order(
-      _keys, {}, precedence, stream, rmm::mr::get_current_device_resource());
+      _keys, {}, precedence, stream, cudf::get_current_device_resource_ref());
   } else {  // Pandas style
     // Temporarily prepend the keys table with a column that indicates the
     // presence of a null value within a row. This allows moving all rows that
@@ -125,7 +125,7 @@ column_view sort_groupby_helper::key_sort_order(rmm::cuda_stream_view stream)
     }();
 
     _key_sorted_order = cudf::detail::stable_sorted_order(
-      augmented_keys, {}, precedence, stream, rmm::mr::get_current_device_resource());
+      augmented_keys, {}, precedence, stream, cudf::get_current_device_resource_ref());
 
     // All rows with one or more null values are at the end of the resulting sorted order.
   }
@@ -223,7 +223,7 @@ column_view sort_groupby_helper::unsorted_keys_labels(rmm::cuda_stream_view stre
                           scatter_map,
                           table_view({temp_labels->view()}),
                           stream,
-                          rmm::mr::get_current_device_resource());
+                          cudf::get_current_device_resource_ref());
 
   _unsorted_keys_labels = std::move(t_unsorted_keys_labels->release()[0]);
 
@@ -235,13 +235,13 @@ column_view sort_groupby_helper::keys_bitmask_column(rmm::cuda_stream_view strea
   if (_keys_bitmask_column) return _keys_bitmask_column->view();
 
   auto [row_bitmask, null_count] =
-    cudf::detail::bitmask_and(_keys, stream, rmm::mr::get_current_device_resource());
+    cudf::detail::bitmask_and(_keys, stream, cudf::get_current_device_resource_ref());
 
   auto const zero = numeric_scalar<int8_t>(0, true, stream);
   // Create a temporary variable and only set _keys_bitmask_column right before the return.
   // This way, a 2nd (parallel) call to this will not be given a partially created object.
   auto keys_bitmask_column = cudf::detail::sequence(
-    _keys.num_rows(), zero, zero, stream, rmm::mr::get_current_device_resource());
+    _keys.num_rows(), zero, zero, stream, cudf::get_current_device_resource_ref());
   keys_bitmask_column->set_null_mask(std::move(row_bitmask), null_count);
 
   _keys_bitmask_column = std::move(keys_bitmask_column);
diff --git a/cpp/src/hash/md5_hash.cu b/cpp/src/hash/md5_hash.cu
index 0b559e8e86c..c7bfd4aecf4 100644
--- a/cpp/src/hash/md5_hash.cu
+++ b/cpp/src/hash/md5_hash.cu
@@ -25,11 +25,11 @@
 #include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/string_view.hpp>
 #include <cudf/table/table_device_view.cuh>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/for_each.h>
 #include <thrust/iterator/constant_iterator.h>
diff --git a/cpp/src/hash/murmurhash3_x64_128.cu b/cpp/src/hash/murmurhash3_x64_128.cu
index 6c91532a193..090bd92af8c 100644
--- a/cpp/src/hash/murmurhash3_x64_128.cu
+++ b/cpp/src/hash/murmurhash3_x64_128.cu
@@ -19,10 +19,10 @@
 #include <cudf/hashing/detail/hashing.hpp>
 #include <cudf/hashing/detail/murmurhash3_x64_128.cuh>
 #include <cudf/table/table_device_view.cuh>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/for_each.h>
 
diff --git a/cpp/src/hash/murmurhash3_x86_32.cu b/cpp/src/hash/murmurhash3_x86_32.cu
index eac72f5d995..dd7b19633be 100644
--- a/cpp/src/hash/murmurhash3_x86_32.cu
+++ b/cpp/src/hash/murmurhash3_x86_32.cu
@@ -20,10 +20,10 @@
 #include <cudf/hashing/detail/murmurhash3_x86_32.cuh>
 #include <cudf/table/experimental/row_operators.cuh>
 #include <cudf/table/table_device_view.cuh>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/tabulate.h>
 
diff --git a/cpp/src/hash/sha1_hash.cu b/cpp/src/hash/sha1_hash.cu
index f7609eb26af..3a0c442ed16 100644
--- a/cpp/src/hash/sha1_hash.cu
+++ b/cpp/src/hash/sha1_hash.cu
@@ -19,11 +19,11 @@
 #include <cudf/column/column.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/table/table_view.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <memory>
 
diff --git a/cpp/src/hash/sha224_hash.cu b/cpp/src/hash/sha224_hash.cu
index cf04504a489..3ac3c5dbbba 100644
--- a/cpp/src/hash/sha224_hash.cu
+++ b/cpp/src/hash/sha224_hash.cu
@@ -19,11 +19,11 @@
 #include <cudf/column/column.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/table/table_view.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <memory>
 
diff --git a/cpp/src/hash/sha256_hash.cu b/cpp/src/hash/sha256_hash.cu
index 664913c0f4c..8036308f09e 100644
--- a/cpp/src/hash/sha256_hash.cu
+++ b/cpp/src/hash/sha256_hash.cu
@@ -19,11 +19,11 @@
 #include <cudf/column/column.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/table/table_view.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <memory>
 
diff --git a/cpp/src/hash/sha384_hash.cu b/cpp/src/hash/sha384_hash.cu
index 92192f501ec..30fe181d55b 100644
--- a/cpp/src/hash/sha384_hash.cu
+++ b/cpp/src/hash/sha384_hash.cu
@@ -19,11 +19,11 @@
 #include <cudf/column/column.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/table/table_view.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <memory>
 
diff --git a/cpp/src/hash/sha512_hash.cu b/cpp/src/hash/sha512_hash.cu
index 244206aeeb9..fd74f38423b 100644
--- a/cpp/src/hash/sha512_hash.cu
+++ b/cpp/src/hash/sha512_hash.cu
@@ -19,11 +19,11 @@
 #include <cudf/column/column.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/table/table_view.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <memory>
 
diff --git a/cpp/src/hash/sha_hash.cuh b/cpp/src/hash/sha_hash.cuh
index 6976241057e..ebaec8e2775 100644
--- a/cpp/src/hash/sha_hash.cuh
+++ b/cpp/src/hash/sha_hash.cuh
@@ -24,11 +24,11 @@
 #include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/string_view.hpp>
 #include <cudf/table/table_device_view.cuh>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/execution_policy.h>
 #include <thrust/fill.h>
diff --git a/cpp/src/hash/xxhash_64.cu b/cpp/src/hash/xxhash_64.cu
index 4366c12b453..fad8383210b 100644
--- a/cpp/src/hash/xxhash_64.cu
+++ b/cpp/src/hash/xxhash_64.cu
@@ -19,11 +19,11 @@
 #include <cudf/hashing/detail/hash_functions.cuh>
 #include <cudf/hashing/detail/hashing.hpp>
 #include <cudf/table/table_device_view.cuh>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/tabulate.h>
 
diff --git a/cpp/src/interop/arrow_utilities.cpp b/cpp/src/interop/arrow_utilities.cpp
index 3776daf41aa..a99262fb3bf 100644
--- a/cpp/src/interop/arrow_utilities.cpp
+++ b/cpp/src/interop/arrow_utilities.cpp
@@ -21,7 +21,6 @@
 #include <cudf/utilities/error.hpp>
 
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/for_each.h>
 #include <thrust/iterator/counting_iterator.h>
diff --git a/cpp/src/interop/arrow_utilities.hpp b/cpp/src/interop/arrow_utilities.hpp
index 1cee3071fcb..1b79fbf9eda 100644
--- a/cpp/src/interop/arrow_utilities.hpp
+++ b/cpp/src/interop/arrow_utilities.hpp
@@ -17,11 +17,10 @@
 #pragma once
 
 #include <cudf/types.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <nanoarrow/nanoarrow.h>
 
diff --git a/cpp/src/interop/decimal_conversion_utilities.cuh b/cpp/src/interop/decimal_conversion_utilities.cuh
index 41263147404..6b62eb0fee4 100644
--- a/cpp/src/interop/decimal_conversion_utilities.cuh
+++ b/cpp/src/interop/decimal_conversion_utilities.cuh
@@ -18,9 +18,9 @@
 
 #include <cudf/column/column_view.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <type_traits>
 
diff --git a/cpp/src/interop/dlpack.cpp b/cpp/src/interop/dlpack.cpp
index 78ddd7f5ad5..ba5b11b90d8 100644
--- a/cpp/src/interop/dlpack.cpp
+++ b/cpp/src/interop/dlpack.cpp
@@ -20,12 +20,12 @@
 #include <cudf/structs/struct_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_checks.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <dlpack/dlpack.h>
 
diff --git a/cpp/src/interop/from_arrow_device.cu b/cpp/src/interop/from_arrow_device.cu
index 440df571de0..057e563c86e 100644
--- a/cpp/src/interop/from_arrow_device.cu
+++ b/cpp/src/interop/from_arrow_device.cu
@@ -28,13 +28,13 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_device.hpp>
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <nanoarrow/nanoarrow.h>
 #include <nanoarrow/nanoarrow.hpp>
diff --git a/cpp/src/interop/from_arrow_host.cu b/cpp/src/interop/from_arrow_host.cu
index efde8f2a463..2e9504a6726 100644
--- a/cpp/src/interop/from_arrow_host.cu
+++ b/cpp/src/interop/from_arrow_host.cu
@@ -31,13 +31,13 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_device.hpp>
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <nanoarrow/nanoarrow.h>
 #include <nanoarrow/nanoarrow.hpp>
diff --git a/cpp/src/interop/from_arrow_stream.cu b/cpp/src/interop/from_arrow_stream.cu
index 578105aa90a..deff62be576 100644
--- a/cpp/src/interop/from_arrow_stream.cu
+++ b/cpp/src/interop/from_arrow_stream.cu
@@ -24,7 +24,6 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
 
 #include <nanoarrow/nanoarrow.h>
 #include <nanoarrow/nanoarrow.hpp>
diff --git a/cpp/src/interop/to_arrow_device.cu b/cpp/src/interop/to_arrow_device.cu
index a5f3f9d87f5..a2874b46b06 100644
--- a/cpp/src/interop/to_arrow_device.cu
+++ b/cpp/src/interop/to_arrow_device.cu
@@ -30,14 +30,13 @@
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/for_each.h>
 #include <thrust/iterator/counting_iterator.h>
diff --git a/cpp/src/interop/to_arrow_host.cu b/cpp/src/interop/to_arrow_host.cu
index 26f7c7e6e53..79fb7550044 100644
--- a/cpp/src/interop/to_arrow_host.cu
+++ b/cpp/src/interop/to_arrow_host.cu
@@ -30,14 +30,13 @@
 #include <cudf/structs/structs_column_view.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/for_each.h>
 #include <thrust/iterator/counting_iterator.h>
diff --git a/cpp/src/io/avro/reader_impl.cu b/cpp/src/io/avro/reader_impl.cu
index 69a0e982a5b..f0a92f7554d 100644
--- a/cpp/src/io/avro/reader_impl.cu
+++ b/cpp/src/io/avro/reader_impl.cu
@@ -26,6 +26,7 @@
 #include <cudf/io/detail/avro.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 #include <cudf/utilities/traits.hpp>
 
@@ -33,7 +34,6 @@
 #include <rmm/device_buffer.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/equal.h>
 #include <thrust/functional.h>
@@ -448,7 +448,7 @@ std::vector<column_buffer> decode_data(metadata& meta,
   }
 
   auto block_list = cudf::detail::make_device_uvector_async(
-    meta.block_list, stream, rmm::mr::get_current_device_resource());
+    meta.block_list, stream, cudf::get_current_device_resource_ref());
 
   schema_desc.host_to_device_async(stream);
 
@@ -578,9 +578,9 @@ table_with_metadata read_avro(std::unique_ptr<cudf::io::datasource>&& source,
         }
 
         d_global_dict = cudf::detail::make_device_uvector_async(
-          h_global_dict, stream, rmm::mr::get_current_device_resource());
+          h_global_dict, stream, cudf::get_current_device_resource_ref());
         d_global_dict_data = cudf::detail::make_device_uvector_async(
-          h_global_dict_data, stream, rmm::mr::get_current_device_resource());
+          h_global_dict_data, stream, cudf::get_current_device_resource_ref());
 
         stream.synchronize();
       }
diff --git a/cpp/src/io/comp/uncomp.cpp b/cpp/src/io/comp/uncomp.cpp
index ab516dd585d..602ff1734b6 100644
--- a/cpp/src/io/comp/uncomp.cpp
+++ b/cpp/src/io/comp/uncomp.cpp
@@ -21,6 +21,7 @@
 
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <cuda_runtime.h>
@@ -510,7 +511,7 @@ size_t decompress_zstd(host_span<uint8_t const> src,
 {
   // Init device span of spans (source)
   auto const d_src =
-    cudf::detail::make_device_uvector_async(src, stream, rmm::mr::get_current_device_resource());
+    cudf::detail::make_device_uvector_async(src, stream, cudf::get_current_device_resource_ref());
   auto hd_srcs = cudf::detail::hostdevice_vector<device_span<uint8_t const>>(1, stream);
   hd_srcs[0]   = d_src;
   hd_srcs.host_to_device_async(stream);
diff --git a/cpp/src/io/csv/csv_gpu.cu b/cpp/src/io/csv/csv_gpu.cu
index 5a0c6decfda..273e82edf8b 100644
--- a/cpp/src/io/csv/csv_gpu.cu
+++ b/cpp/src/io/csv/csv_gpu.cu
@@ -28,6 +28,7 @@
 #include <cudf/strings/string_view.cuh>
 #include <cudf/utilities/bit.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
@@ -807,7 +808,7 @@ cudf::detail::host_vector<column_type_histogram> detect_column_types(
   int const grid_size  = (row_starts.size() + block_size - 1) / block_size;
 
   auto d_stats = detail::make_zeroed_device_uvector_async<column_type_histogram>(
-    num_active_columns, stream, rmm::mr::get_current_device_resource());
+    num_active_columns, stream, cudf::get_current_device_resource_ref());
 
   data_type_detection<<<grid_size, block_size, 0, stream.value()>>>(
     options, data, column_flags, row_starts, d_stats);
diff --git a/cpp/src/io/csv/durations.cu b/cpp/src/io/csv/durations.cu
index 918951d5902..eac86b2f199 100644
--- a/cpp/src/io/csv/durations.cu
+++ b/cpp/src/io/csv/durations.cu
@@ -22,9 +22,9 @@
 #include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/types.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/for_each.h>
 #include <thrust/iterator/counting_iterator.h>
diff --git a/cpp/src/io/csv/durations.hpp b/cpp/src/io/csv/durations.hpp
index f671f435eeb..62f31dcd09c 100644
--- a/cpp/src/io/csv/durations.hpp
+++ b/cpp/src/io/csv/durations.hpp
@@ -17,10 +17,9 @@
 #pragma once
 
 #include <cudf/types.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <memory>
 
diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu
index e27b06682bb..ebca334a715 100644
--- a/cpp/src/io/csv/reader_impl.cu
+++ b/cpp/src/io/csv/reader_impl.cu
@@ -37,10 +37,10 @@
 #include <cudf/strings/detail/replace.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/host_vector.h>
 #include <thrust/iterator/counting_iterator.h>
@@ -532,7 +532,7 @@ void infer_column_types(parse_options const& parse_opts,
   auto const column_stats = cudf::io::csv::gpu::detect_column_types(
     parse_opts.view(),
     data,
-    make_device_uvector_async(column_flags, stream, rmm::mr::get_current_device_resource()),
+    make_device_uvector_async(column_flags, stream, cudf::get_current_device_resource_ref()),
     row_offsets,
     num_inferred_columns,
     stream);
@@ -601,16 +601,16 @@ std::vector<column_buffer> decode_data(parse_options const& parse_opts,
   }
 
   auto d_valid_counts = cudf::detail::make_zeroed_device_uvector_async<size_type>(
-    num_active_columns, stream, rmm::mr::get_current_device_resource());
+    num_active_columns, stream, cudf::get_current_device_resource_ref());
 
   cudf::io::csv::gpu::decode_row_column_data(
     parse_opts.view(),
     data,
-    make_device_uvector_async(column_flags, stream, rmm::mr::get_current_device_resource()),
+    make_device_uvector_async(column_flags, stream, cudf::get_current_device_resource_ref()),
     row_offsets,
-    make_device_uvector_async(column_types, stream, rmm::mr::get_current_device_resource()),
-    make_device_uvector_async(h_data, stream, rmm::mr::get_current_device_resource()),
-    make_device_uvector_async(h_valid, stream, rmm::mr::get_current_device_resource()),
+    make_device_uvector_async(column_types, stream, cudf::get_current_device_resource_ref()),
+    make_device_uvector_async(h_data, stream, cudf::get_current_device_resource_ref()),
+    make_device_uvector_async(h_valid, stream, cudf::get_current_device_resource_ref()),
     d_valid_counts,
     stream);
 
diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu
index 00a6dcb2286..b84446b5f3e 100644
--- a/cpp/src/io/csv/writer_impl.cu
+++ b/cpp/src/io/csv/writer_impl.cu
@@ -38,11 +38,10 @@
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/execution_policy.h>
 #include <thrust/host_vector.h>
@@ -436,7 +435,7 @@ void write_csv(data_sink* out_sink,
   // (even for tables with no rows)
   //
   write_chunked_begin(
-    out_sink, table, user_column_names, options, stream, rmm::mr::get_current_device_resource());
+    out_sink, table, user_column_names, options, stream, cudf::get_current_device_resource_ref());
 
   if (table.num_rows() > 0) {
     // no need to check same-size columns constraint; auto-enforced by table_view
@@ -470,7 +469,7 @@ void write_csv(data_sink* out_sink,
 
     // convert each chunk to CSV:
     //
-    column_to_strings_fn converter{options, stream, rmm::mr::get_current_device_resource()};
+    column_to_strings_fn converter{options, stream, cudf::get_current_device_resource_ref()};
     for (auto&& sub_view : vector_views) {
       // Skip if the table has no rows
       if (sub_view.num_rows() == 0) continue;
@@ -505,13 +504,13 @@ void write_csv(data_sink* out_sink,
                                                     options_narep,
                                                     strings::separator_on_nulls::YES,
                                                     stream,
-                                                    rmm::mr::get_current_device_resource());
+                                                    cudf::get_current_device_resource_ref());
         return cudf::strings::detail::replace_nulls(
-          str_table_view.column(0), options_narep, stream, rmm::mr::get_current_device_resource());
+          str_table_view.column(0), options_narep, stream, cudf::get_current_device_resource_ref());
       }();
 
       write_chunked(
-        out_sink, str_concat_col->view(), options, stream, rmm::mr::get_current_device_resource());
+        out_sink, str_concat_col->view(), options, stream, cudf::get_current_device_resource_ref());
     }
   }
 }
diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp
index 62c3c5cd245..0ca54da5aaf 100644
--- a/cpp/src/io/functions.cpp
+++ b/cpp/src/io/functions.cpp
@@ -35,8 +35,7 @@
 #include <cudf/table/table.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
-
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <algorithm>
 
diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu
index 8d6890045be..8890c786287 100644
--- a/cpp/src/io/json/json_column.cu
+++ b/cpp/src/io/json/json_column.cu
@@ -26,12 +26,12 @@
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/atomic>
 #include <cuda/functional>
@@ -369,7 +369,7 @@ std::vector<std::string> copy_strings_to_host_sync(
                                    0,
                                    options_view,
                                    stream,
-                                   rmm::mr::get_current_device_resource());
+                                   cudf::get_current_device_resource_ref());
   auto to_host        = [stream](auto const& col) {
     if (col.is_empty()) return std::vector<std::string>{};
     auto const scv     = cudf::strings_column_view(col);
@@ -825,9 +825,9 @@ void make_device_json_column(device_span<SymbolT const> input,
   }
 
   auto d_ignore_vals = cudf::detail::make_device_uvector_async(
-    ignore_vals, stream, rmm::mr::get_current_device_resource());
+    ignore_vals, stream, cudf::get_current_device_resource_ref());
   auto d_columns_data = cudf::detail::make_device_uvector_async(
-    columns_data, stream, rmm::mr::get_current_device_resource());
+    columns_data, stream, cudf::get_current_device_resource_ref());
 
   // 3. scatter string offsets to respective columns, set validity bits
   thrust::for_each_n(
@@ -1118,13 +1118,13 @@ table_with_metadata device_parse_nested_json(device_span<SymbolT const> d_input,
   auto gpu_tree = [&]() {
     // Parse the JSON and get the token stream
     const auto [tokens_gpu, token_indices_gpu] =
-      get_token_stream(d_input, options, stream, rmm::mr::get_current_device_resource());
+      get_token_stream(d_input, options, stream, cudf::get_current_device_resource_ref());
     // gpu tree generation
     return get_tree_representation(tokens_gpu,
                                    token_indices_gpu,
                                    options.is_enabled_mixed_types_as_string(),
                                    stream,
-                                   rmm::mr::get_current_device_resource());
+                                   cudf::get_current_device_resource_ref());
   }();  // IILE used to free memory of token data.
 #ifdef NJP_DEBUG_PRINT
   auto h_input = cudf::detail::make_host_vector_async(d_input, stream);
@@ -1150,7 +1150,7 @@ table_with_metadata device_parse_nested_json(device_span<SymbolT const> d_input,
                                   is_array_of_arrays,
                                   options.is_enabled_lines(),
                                   stream,
-                                  rmm::mr::get_current_device_resource());
+                                  cudf::get_current_device_resource_ref());
 
   device_json_column root_column(stream, mr);
   root_column.type = json_col_t::ListColumn;
diff --git a/cpp/src/io/json/json_normalization.cu b/cpp/src/io/json/json_normalization.cu
index cb8b4e97ebb..7899ea7bac4 100644
--- a/cpp/src/io/json/json_normalization.cu
+++ b/cpp/src/io/json/json_normalization.cu
@@ -18,12 +18,12 @@
 
 #include <cudf/io/detail/json.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream.hpp>
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/discard_iterator.h>
 
diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu
index ee6bc0b9f4b..4d0dc010c57 100644
--- a/cpp/src/io/json/json_tree.cu
+++ b/cpp/src/io/json/json_tree.cu
@@ -26,12 +26,12 @@
 #include <cudf/hashing/detail/hashing.hpp>
 #include <cudf/hashing/detail/helper_functions.cuh>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cub/device/device_radix_sort.cuh>
 #include <cuco/static_set.cuh>
diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp
index 20c143f66c7..b06458e1a8e 100644
--- a/cpp/src/io/json/nested_json.hpp
+++ b/cpp/src/io/json/nested_json.hpp
@@ -22,8 +22,7 @@
 #include <cudf/utilities/bit.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/export.hpp>
-
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <map>
 #include <vector>
diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu
index 1e484d74679..d76e5447c30 100644
--- a/cpp/src/io/json/nested_json_gpu.cu
+++ b/cpp/src/io/json/nested_json_gpu.cu
@@ -31,12 +31,12 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/bit.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/device_vector.h>
 #include <thrust/iterator/discard_iterator.h>
@@ -1517,7 +1517,7 @@ std::pair<rmm::device_uvector<PdaTokenT>, rmm::device_uvector<SymbolOffsetT>> pr
     fst::detail::make_translation_functor<symbol_t, 0, 2>(token_filter::TransduceToken{}),
     stream);
 
-  auto const mr = rmm::mr::get_current_device_resource();
+  auto const mr = cudf::get_current_device_resource_ref();
   rmm::device_scalar<SymbolOffsetT> d_num_selected_tokens(stream, mr);
   rmm::device_uvector<PdaTokenT> filtered_tokens_out{tokens.size(), stream, mr};
   rmm::device_uvector<SymbolOffsetT> filtered_token_indices_out{tokens.size(), stream, mr};
@@ -2125,10 +2125,10 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> json_column_to
       // Move string_offsets and string_lengths to GPU
       rmm::device_uvector<json_column::row_offset_t> d_string_offsets =
         cudf::detail::make_device_uvector_async(
-          json_col.string_offsets, stream, rmm::mr::get_current_device_resource());
+          json_col.string_offsets, stream, cudf::get_current_device_resource_ref());
       rmm::device_uvector<json_column::row_offset_t> d_string_lengths =
         cudf::detail::make_device_uvector_async(
-          json_col.string_lengths, stream, rmm::mr::get_current_device_resource());
+          json_col.string_lengths, stream, cudf::get_current_device_resource_ref());
 
       // Prepare iterator that returns (string_offset, string_length)-tuples
       auto offset_length_it =
diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu
index 98e8e8d3c7e..bd82b040359 100644
--- a/cpp/src/io/json/read_json.cu
+++ b/cpp/src/io/json/read_json.cu
@@ -25,11 +25,11 @@
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/io/detail/json.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/distance.h>
 #include <thrust/iterator/constant_iterator.h>
@@ -229,13 +229,13 @@ table_with_metadata read_batch(host_span<std::unique_ptr<datasource>> sources,
   // If input JSON buffer has single quotes and option to normalize single quotes is enabled,
   // invoke pre-processing FST
   if (reader_opts.is_enabled_normalize_single_quotes()) {
-    normalize_single_quotes(bufview, stream, rmm::mr::get_current_device_resource());
+    normalize_single_quotes(bufview, stream, cudf::get_current_device_resource_ref());
   }
 
   // If input JSON buffer has unquoted spaces and tabs and option to normalize whitespaces is
   // enabled, invoke pre-processing FST
   if (reader_opts.is_enabled_normalize_whitespace()) {
-    normalize_whitespace(bufview, stream, rmm::mr::get_current_device_resource());
+    normalize_whitespace(bufview, stream, cudf::get_current_device_resource_ref());
   }
 
   auto buffer =
@@ -304,7 +304,7 @@ device_span<char> ingest_raw_input(device_span<char> buffer,
                     "Currently only single-character delimiters are supported");
       auto const delimiter_source = thrust::make_constant_iterator('\n');
       auto const d_delimiter_map  = cudf::detail::make_device_uvector_async(
-        delimiter_map, stream, rmm::mr::get_current_device_resource());
+        delimiter_map, stream, cudf::get_current_device_resource_ref());
       thrust::scatter(rmm::exec_policy_nosync(stream),
                       delimiter_source,
                       delimiter_source + d_delimiter_map.size(),
@@ -421,7 +421,7 @@ table_with_metadata read_json(host_span<std::unique_ptr<datasource>> sources,
     batched_reader_opts.set_byte_range_offset(batch_offsets[i]);
     batched_reader_opts.set_byte_range_size(batch_offsets[i + 1] - batch_offsets[i]);
     partial_tables.emplace_back(
-      read_batch(sources, batched_reader_opts, stream, rmm::mr::get_current_device_resource()));
+      read_batch(sources, batched_reader_opts, stream, cudf::get_current_device_resource_ref()));
   }
 
   auto expects_schema_equality =
diff --git a/cpp/src/io/json/read_json.hpp b/cpp/src/io/json/read_json.hpp
index 7e3a920f00d..982190eecb5 100644
--- a/cpp/src/io/json/read_json.hpp
+++ b/cpp/src/io/json/read_json.hpp
@@ -20,11 +20,11 @@
 #include <cudf/io/json.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <memory>
 
diff --git a/cpp/src/io/json/write_json.cu b/cpp/src/io/json/write_json.cu
index 60bb2366e87..dc7199d7ab1 100644
--- a/cpp/src/io/json/write_json.cu
+++ b/cpp/src/io/json/write_json.cu
@@ -42,12 +42,11 @@
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_device_view.cuh>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/for_each.h>
@@ -437,7 +436,7 @@ std::unique_ptr<column> join_list_of_strings(lists_column_view const& lists_stri
 
   // scatter string and separator
   auto labels = cudf::lists::detail::generate_labels(
-    lists_strings, num_strings, stream, rmm::mr::get_current_device_resource());
+    lists_strings, num_strings, stream, cudf::get_current_device_resource_ref());
   auto d_strings_children = cudf::column_device_view::create(strings_children, stream);
   thrust::for_each(rmm::exec_policy(stream),
                    thrust::make_counting_iterator<size_type>(0),
@@ -645,13 +644,13 @@ struct column_to_strings_fn {
       }
     };
     auto new_offsets = cudf::lists::detail::get_normalized_offsets(
-      lists_column_view(column), stream_, rmm::mr::get_current_device_resource());
+      lists_column_view(column), stream_, cudf::get_current_device_resource_ref());
     auto const list_child_string = make_lists_column(
       column.size(),
       std::move(new_offsets),
       child_string_with_null(),
       column.null_count(),
-      cudf::detail::copy_bitmask(column, stream_, rmm::mr::get_current_device_resource()),
+      cudf::detail::copy_bitmask(column, stream_, cudf::get_current_device_resource_ref()),
       stream_);
     return join_list_of_strings(lists_column_view(*list_child_string),
                                 list_row_begin_wrap.value(stream_),
@@ -736,7 +735,7 @@ struct column_to_strings_fn {
                              narep,
                              options_.is_enabled_include_nulls(),
                              stream_,
-                             rmm::mr::get_current_device_resource());
+                             cudf::get_current_device_resource_ref());
   }
 
  private:
@@ -765,17 +764,18 @@ std::unique_ptr<column> make_strings_column_from_host(host_span<std::string cons
   std::string const host_chars =
     std::accumulate(host_strings.begin(), host_strings.end(), std::string(""));
   auto d_chars = cudf::detail::make_device_uvector_async(
-    host_chars, stream, rmm::mr::get_current_device_resource());
+    host_chars, stream, cudf::get_current_device_resource_ref());
   std::vector<cudf::size_type> offsets(host_strings.size() + 1, 0);
   std::transform_inclusive_scan(host_strings.begin(),
                                 host_strings.end(),
                                 offsets.begin() + 1,
                                 std::plus<cudf::size_type>{},
                                 [](auto& str) { return str.size(); });
-  auto d_offsets = std::make_unique<cudf::column>(
-    cudf::detail::make_device_uvector_sync(offsets, stream, rmm::mr::get_current_device_resource()),
-    rmm::device_buffer{},
-    0);
+  auto d_offsets =
+    std::make_unique<cudf::column>(cudf::detail::make_device_uvector_sync(
+                                     offsets, stream, cudf::get_current_device_resource_ref()),
+                                   rmm::device_buffer{},
+                                   0);
   return cudf::make_strings_column(
     host_strings.size(), std::move(d_offsets), d_chars.release(), 0, {});
 }
@@ -798,7 +798,7 @@ std::unique_ptr<column> make_column_names_column(host_span<column_name_info cons
   auto unescaped_string_col = make_strings_column_from_host(unescaped_column_names, stream);
   auto d_column             = column_device_view::create(*unescaped_string_col, stream);
   return escape_strings_fn{*d_column, true}.get_escaped_strings(
-    *unescaped_string_col, stream, rmm::mr::get_current_device_resource());
+    *unescaped_string_col, stream, cudf::get_current_device_resource_ref());
 }
 
 void write_chunked(data_sink* out_sink,
@@ -893,7 +893,7 @@ void write_json(data_sink* out_sink,
     }
 
     // convert each chunk to JSON:
-    column_to_strings_fn converter{options, stream, rmm::mr::get_current_device_resource()};
+    column_to_strings_fn converter{options, stream, cudf::get_current_device_resource_ref()};
 
     for (auto&& sub_view : vector_views) {
       // Skip if the table has no rows
diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp
index 94b294087b8..bb2d6dbcc9f 100644
--- a/cpp/src/io/orc/reader_impl.hpp
+++ b/cpp/src/io/orc/reader_impl.hpp
@@ -22,9 +22,9 @@
 #include <cudf/io/datasource.hpp>
 #include <cudf/io/detail/orc.hpp>
 #include <cudf/io/orc.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <io/utilities/column_buffer.hpp>
 
diff --git a/cpp/src/io/orc/reader_impl_decode.cu b/cpp/src/io/orc/reader_impl_decode.cu
index e3b9a048be8..d628e936cb1 100644
--- a/cpp/src/io/orc/reader_impl_decode.cu
+++ b/cpp/src/io/orc/reader_impl_decode.cu
@@ -28,13 +28,13 @@
 #include <cudf/io/config_utils.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/copy.h>
 #include <thrust/fill.h>
@@ -506,7 +506,7 @@ void scan_null_counts(cudf::detail::hostdevice_2dvector<gpu::ColumnDesc> const&
     }
   }
   auto const d_prefix_sums_to_update = cudf::detail::make_device_uvector_async(
-    prefix_sums_to_update, stream, rmm::mr::get_current_device_resource());
+    prefix_sums_to_update, stream, cudf::get_current_device_resource_ref());
 
   thrust::for_each(
     rmm::exec_policy_nosync(stream),
@@ -683,7 +683,7 @@ std::vector<range> find_table_splits(table_view const& input,
   segment_length = std::min(segment_length, input.num_rows());
 
   auto const d_segmented_sizes = cudf::detail::segmented_row_bit_count(
-    input, segment_length, stream, rmm::mr::get_current_device_resource());
+    input, segment_length, stream, cudf::get_current_device_resource_ref());
 
   auto segmented_sizes =
     cudf::detail::hostdevice_vector<cumulative_size>(d_segmented_sizes->size(), stream);
@@ -777,7 +777,7 @@ void reader_impl::decompress_and_decode_stripes(read_mode mode)
       [](auto const& sum, auto const& cols_level) { return sum + cols_level.size(); });
 
     return cudf::detail::make_zeroed_device_uvector_async<uint32_t>(
-      num_total_cols * stripe_count, _stream, rmm::mr::get_current_device_resource());
+      num_total_cols * stripe_count, _stream, cudf::get_current_device_resource_ref());
   }();
   std::size_t num_processed_lvl_columns      = 0;
   std::size_t num_processed_prev_lvl_columns = 0;
diff --git a/cpp/src/io/orc/reader_impl_helpers.cpp b/cpp/src/io/orc/reader_impl_helpers.cpp
index c943ae17d97..4c1079cffe8 100644
--- a/cpp/src/io/orc/reader_impl_helpers.cpp
+++ b/cpp/src/io/orc/reader_impl_helpers.cpp
@@ -16,7 +16,7 @@
 
 #include "reader_impl_helpers.hpp"
 
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace cudf::io::orc::detail {
 
diff --git a/cpp/src/io/orc/reader_impl_helpers.hpp b/cpp/src/io/orc/reader_impl_helpers.hpp
index a563fb19e15..5528b2ee763 100644
--- a/cpp/src/io/orc/reader_impl_helpers.hpp
+++ b/cpp/src/io/orc/reader_impl_helpers.hpp
@@ -21,9 +21,9 @@
 #include "io/utilities/column_buffer.hpp"
 
 #include <cudf/io/orc.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <memory>
 #include <vector>
diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu
index 80f32512b98..5c70e35fd2e 100644
--- a/cpp/src/io/orc/stripe_enc.cu
+++ b/cpp/src/io/orc/stripe_enc.cu
@@ -26,6 +26,7 @@
 #include <cudf/io/orc_types.hpp>
 #include <cudf/lists/lists_column_view.hpp>
 #include <cudf/utilities/bit.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
@@ -1425,7 +1426,7 @@ void decimal_sizes_to_offsets(device_2dspan<rowgroup_rows const> rg_bounds,
 
   // Copy the vector of views to the device so that we can pass it to the kernel
   auto d_sizes = cudf::detail::make_device_uvector_async<decimal_column_element_sizes>(
-    h_sizes, stream, rmm::mr::get_current_device_resource());
+    h_sizes, stream, cudf::get_current_device_resource_ref());
 
   constexpr int block_size = 256;
   dim3 const grid_size{static_cast<unsigned int>(elem_sizes.size()),        // num decimal columns
diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu
index ebdf9f3f249..60a64fb0ee6 100644
--- a/cpp/src/io/orc/writer_impl.cu
+++ b/cpp/src/io/orc/writer_impl.cu
@@ -32,6 +32,7 @@
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/bit.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -728,7 +729,7 @@ std::vector<std::vector<rowgroup_rows>> calculate_aligned_rowgroup_bounds(
                                 cudaMemcpyDefault,
                                 stream.value()));
   auto const d_stripes = cudf::detail::make_device_uvector_async(
-    segmentation.stripes, stream, rmm::mr::get_current_device_resource());
+    segmentation.stripes, stream, cudf::get_current_device_resource_ref());
 
   // One thread per column, per stripe
   thrust::for_each_n(
@@ -1354,7 +1355,7 @@ encoded_footer_statistics finish_statistic_blobs(Footer const& footer,
     }
     //  Copy to device
     auto const d_stat_chunks = cudf::detail::make_device_uvector_async<statistics_chunk>(
-      h_stat_chunks, stream, rmm::mr::get_current_device_resource());
+      h_stat_chunks, stream, cudf::get_current_device_resource_ref());
     stats_merge.host_to_device_async(stream);
 
     // Encode and return
@@ -1738,7 +1739,7 @@ pushdown_null_masks init_pushdown_null_masks(orc_table_view& orc_table,
 
   // Attach null masks to device column views (async)
   auto const d_mask_ptrs = cudf::detail::make_device_uvector_async(
-    mask_ptrs, stream, rmm::mr::get_current_device_resource());
+    mask_ptrs, stream, cudf::get_current_device_resource_ref());
   thrust::for_each_n(
     rmm::exec_policy(stream),
     thrust::make_counting_iterator(0ul),
@@ -1828,7 +1829,7 @@ orc_table_view make_orc_table_view(table_view const& table,
       return orc_column.orc_kind();
     });
   auto const d_type_kinds = cudf::detail::make_device_uvector_async(
-    type_kinds, stream, rmm::mr::get_current_device_resource());
+    type_kinds, stream, cudf::get_current_device_resource_ref());
 
   rmm::device_uvector<orc_column_device_view> d_orc_columns(orc_columns.size(), stream);
   using stack_value_type = thrust::pair<column_device_view const*, cuda::std::optional<uint32_t>>;
@@ -1879,7 +1880,7 @@ orc_table_view make_orc_table_view(table_view const& table,
           std::move(d_orc_columns),
           str_col_indexes,
           cudf::detail::make_device_uvector_sync(
-            str_col_indexes, stream, rmm::mr::get_current_device_resource())};
+            str_col_indexes, stream, cudf::get_current_device_resource_ref())};
 }
 
 hostdevice_2dvector<rowgroup_rows> calculate_rowgroup_bounds(orc_table_view const& orc_table,
@@ -2239,7 +2240,7 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table,
 
       // Create the inverse permutation - i.e. the mapping from the original order to the sorted
       auto order_copy = cudf::detail::make_device_uvector_async<uint32_t>(
-        sd.data_order, current_stream, rmm::mr::get_current_device_resource());
+        sd.data_order, current_stream, cudf::get_current_device_resource_ref());
       thrust::scatter(rmm::exec_policy_nosync(current_stream),
                       thrust::counting_iterator<uint32_t>(0),
                       thrust::counting_iterator<uint32_t>(sd.data_order.size()),
diff --git a/cpp/src/io/parquet/predicate_pushdown.cpp b/cpp/src/io/parquet/predicate_pushdown.cpp
index c8b8b7a1193..b90ca36c8c7 100644
--- a/cpp/src/io/parquet/predicate_pushdown.cpp
+++ b/cpp/src/io/parquet/predicate_pushdown.cpp
@@ -25,12 +25,10 @@
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
-
 #include <thrust/iterator/counting_iterator.h>
 
 #include <algorithm>
@@ -399,7 +397,7 @@ std::optional<std::vector<std::vector<size_type>>> aggregate_reader_metadata::fi
   std::reference_wrapper<ast::expression const> filter,
   rmm::cuda_stream_view stream) const
 {
-  auto mr = rmm::mr::get_current_device_resource();
+  auto mr = cudf::get_current_device_resource_ref();
   // Create row group indices.
   std::vector<std::vector<size_type>> filtered_row_group_indices;
   std::vector<std::vector<size_type>> all_row_group_indices;
diff --git a/cpp/src/io/parquet/reader.cpp b/cpp/src/io/parquet/reader.cpp
index 65dafb568c0..dd354b905f3 100644
--- a/cpp/src/io/parquet/reader.cpp
+++ b/cpp/src/io/parquet/reader.cpp
@@ -16,7 +16,7 @@
 
 #include "reader_impl.hpp"
 
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace cudf::io::parquet::detail {
 
diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp
index 9950e2f7d7d..7d817bde7af 100644
--- a/cpp/src/io/parquet/reader_impl.cpp
+++ b/cpp/src/io/parquet/reader_impl.cpp
@@ -23,8 +23,7 @@
 #include <cudf/detail/utilities/stream_pool.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/strings/detail/utilities.hpp>
-
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <thrust/binary_search.h>
 #include <thrust/iterator/counting_iterator.h>
@@ -706,7 +705,7 @@ table_with_metadata reader::impl::finalize_output(read_mode mode,
     auto predicate  = cudf::detail::compute_column(*read_table,
                                                   _expr_conv.get_converted_expr().value().get(),
                                                   _stream,
-                                                  rmm::mr::get_current_device_resource());
+                                                  cudf::get_current_device_resource_ref());
     CUDF_EXPECTS(predicate->view().type().id() == type_id::BOOL8,
                  "Predicate filter should return a boolean");
     // Exclude columns present in filter only in output
diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp
index 5e3cc4301f9..2d46da14bec 100644
--- a/cpp/src/io/parquet/reader_impl.hpp
+++ b/cpp/src/io/parquet/reader_impl.hpp
@@ -28,11 +28,10 @@
 #include <cudf/io/datasource.hpp>
 #include <cudf/io/detail/parquet.hpp>
 #include <cudf/io/parquet.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <memory>
 #include <optional>
@@ -369,7 +368,7 @@ class reader::impl {
                                                                          size_t chunk_num_rows);
 
   rmm::cuda_stream_view _stream;
-  rmm::device_async_resource_ref _mr{rmm::mr::get_current_device_resource()};
+  rmm::device_async_resource_ref _mr{cudf::get_current_device_resource_ref()};
 
   // Reader configs.
   struct {
diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu
index 00d62c45962..84f0dab0d8b 100644
--- a/cpp/src/io/parquet/reader_impl_chunking.cu
+++ b/cpp/src/io/parquet/reader_impl_chunking.cu
@@ -25,6 +25,7 @@
 #include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/io/config_utils.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/exec_policy.hpp>
 
@@ -441,7 +442,7 @@ adjust_cumulative_sizes(device_span<cumulative_page_info const> c_info,
 {
   // sort by row count
   rmm::device_uvector<cumulative_page_info> c_info_sorted =
-    make_device_uvector_async(c_info, stream, rmm::mr::get_current_device_resource());
+    make_device_uvector_async(c_info, stream, cudf::get_current_device_resource_ref());
   thrust::sort(
     rmm::exec_policy_nosync(stream), c_info_sorted.begin(), c_info_sorted.end(), row_count_less{});
 
@@ -846,9 +847,9 @@ std::vector<row_range> compute_page_splits_by_row(device_span<cumulative_page_in
     });
   }
   auto d_comp_in = cudf::detail::make_device_uvector_async(
-    comp_in, stream, rmm::mr::get_current_device_resource());
+    comp_in, stream, cudf::get_current_device_resource_ref());
   auto d_comp_out = cudf::detail::make_device_uvector_async(
-    comp_out, stream, rmm::mr::get_current_device_resource());
+    comp_out, stream, cudf::get_current_device_resource_ref());
 
   int32_t start_pos = 0;
   for (auto const& codec : codecs) {
@@ -922,9 +923,9 @@ std::vector<row_range> compute_page_splits_by_row(device_span<cumulative_page_in
   // now copy the uncompressed V2 def and rep level data
   if (not copy_in.empty()) {
     auto const d_copy_in = cudf::detail::make_device_uvector_async(
-      copy_in, stream, rmm::mr::get_current_device_resource());
+      copy_in, stream, cudf::get_current_device_resource_ref());
     auto const d_copy_out = cudf::detail::make_device_uvector_async(
-      copy_out, stream, rmm::mr::get_current_device_resource());
+      copy_out, stream, cudf::get_current_device_resource_ref());
 
     gpu_copy_uncompressed_blocks(d_copy_in, d_copy_out, stream);
     stream.synchronize();
@@ -1143,7 +1144,7 @@ void include_decompression_scratch_size(device_span<ColumnChunkDesc const> chunk
 
   // add to the cumulative_page_info data
   rmm::device_uvector<size_t> d_temp_cost = cudf::detail::make_device_uvector_async(
-    temp_cost, stream, rmm::mr::get_current_device_resource());
+    temp_cost, stream, cudf::get_current_device_resource_ref());
   auto iter = thrust::make_counting_iterator(size_t{0});
   thrust::for_each(rmm::exec_policy_nosync(stream),
                    iter,
@@ -1346,7 +1347,7 @@ void reader::impl::setup_next_subpass(read_mode mode)
     [&]() -> std::tuple<rmm::device_uvector<page_span>, size_t, size_t> {
     if (!pass.has_compressed_data || _input_pass_read_limit == 0) {
       rmm::device_uvector<page_span> page_indices(
-        num_columns, _stream, rmm::mr::get_current_device_resource());
+        num_columns, _stream, cudf::get_current_device_resource_ref());
       auto iter = thrust::make_counting_iterator(0);
       thrust::transform(rmm::exec_policy_nosync(_stream),
                         iter,
diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu
index 557b1a45c1f..52918f5bc80 100644
--- a/cpp/src/io/parquet/reader_impl_preprocess.cu
+++ b/cpp/src/io/parquet/reader_impl_preprocess.cu
@@ -22,6 +22,7 @@
 #include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/io/detail/batched_memset.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/exec_policy.hpp>
 
@@ -392,7 +393,7 @@ void fill_in_page_info(host_span<ColumnChunkDesc> chunks,
   }
 
   auto d_page_indexes = cudf::detail::make_device_uvector_async(
-    page_indexes, stream, rmm::mr::get_current_device_resource());
+    page_indexes, stream, cudf::get_current_device_resource_ref());
 
   auto iter = thrust::make_counting_iterator<size_type>(0);
   thrust::for_each(
@@ -754,7 +755,7 @@ void reader::impl::build_string_dict_indices()
 
   // allocate and distribute pointers
   pass.str_dict_index = cudf::detail::make_zeroed_device_uvector_async<string_index_pair>(
-    total_str_dict_indexes, _stream, rmm::mr::get_current_device_resource());
+    total_str_dict_indexes, _stream, cudf::get_current_device_resource_ref());
 
   auto iter = thrust::make_counting_iterator(0);
   thrust::for_each(
@@ -907,7 +908,7 @@ void reader::impl::allocate_level_decode_space()
   size_t const per_page_decode_buf_size = LEVEL_DECODE_BUF_SIZE * 2 * pass.level_type_size;
   auto const decode_buf_size            = per_page_decode_buf_size * pages.size();
   subpass.level_decode_data =
-    rmm::device_buffer(decode_buf_size, _stream, rmm::mr::get_current_device_resource());
+    rmm::device_buffer(decode_buf_size, _stream, cudf::get_current_device_resource_ref());
 
   // distribute the buffers
   uint8_t* buf = static_cast<uint8_t*>(subpass.level_decode_data.data());
@@ -1551,7 +1552,7 @@ void reader::impl::allocate_columns(read_mode mode, size_t skip_rows, size_t num
         .nesting_depth;
 
     auto const d_cols_info = cudf::detail::make_device_uvector_async(
-      h_cols_info, _stream, rmm::mr::get_current_device_resource());
+      h_cols_info, _stream, cudf::get_current_device_resource_ref());
 
     auto const num_keys = _input_columns.size() * max_depth * subpass.pages.size();
     // size iterator. indexes pages by sorted order
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index 46c3151c731..81fd4ab9f82 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -43,6 +43,7 @@
 #include <cudf/lists/detail/dremel.hpp>
 #include <cudf/lists/lists_column_view.hpp>
 #include <cudf/table/table_device_view.cuh>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
@@ -1048,7 +1049,7 @@ parquet_column_view::parquet_column_view(schema_tree_node const& schema_node,
   // TODO(cp): Explore doing this for all columns in a single go outside this ctor. Maybe using
   // hostdevice_vector. Currently this involves a cudaMemcpyAsync for each column.
   _d_nullability = cudf::detail::make_device_uvector_async(
-    _nullability, stream, rmm::mr::get_current_device_resource());
+    _nullability, stream, cudf::get_current_device_resource_ref());
 
   _is_list = (_max_rep_level > 0);
 
@@ -1120,7 +1121,7 @@ void init_row_group_fragments(cudf::detail::hostdevice_2dvector<PageFragment>& f
                               rmm::cuda_stream_view stream)
 {
   auto d_partitions = cudf::detail::make_device_uvector_async(
-    partitions, stream, rmm::mr::get_current_device_resource());
+    partitions, stream, cudf::get_current_device_resource_ref());
   InitRowGroupFragments(frag, col_desc, d_partitions, part_frag_offset, fragment_size, stream);
   frag.device_to_host_sync(stream);
 }
@@ -1140,7 +1141,7 @@ void calculate_page_fragments(device_span<PageFragment> frag,
                               rmm::cuda_stream_view stream)
 {
   auto d_frag_sz = cudf::detail::make_device_uvector_async(
-    frag_sizes, stream, rmm::mr::get_current_device_resource());
+    frag_sizes, stream, cudf::get_current_device_resource_ref());
   CalculatePageFragments(frag, d_frag_sz, stream);
 }
 
@@ -1649,7 +1650,7 @@ std::vector<column_view> convert_decimal_columns_and_metadata(
       case type_id::DECIMAL32:
         // Convert data to decimal128 type
         d128_buffers.emplace_back(cudf::detail::convert_decimals_to_decimal128<int32_t>(
-          column, stream, rmm::mr::get_current_device_resource()));
+          column, stream, cudf::get_current_device_resource_ref()));
         // Update metadata
         metadata.set_decimal_precision(MAX_DECIMAL32_PRECISION);
         metadata.set_type_length(size_of(data_type{type_id::DECIMAL128, column.type().scale()}));
@@ -1664,7 +1665,7 @@ std::vector<column_view> convert_decimal_columns_and_metadata(
       case type_id::DECIMAL64:
         // Convert data to decimal128 type
         d128_buffers.emplace_back(cudf::detail::convert_decimals_to_decimal128<int64_t>(
-          column, stream, rmm::mr::get_current_device_resource()));
+          column, stream, cudf::get_current_device_resource_ref()));
         // Update metadata
         metadata.set_decimal_precision(MAX_DECIMAL64_PRECISION);
         metadata.set_type_length(size_of(data_type{type_id::DECIMAL128, column.type().scale()}));
@@ -1869,7 +1870,7 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
   part_frag_offset.push_back(part_frag_offset.back() + num_frag_in_part.back());
 
   auto d_part_frag_offset = cudf::detail::make_device_uvector_async(
-    part_frag_offset, stream, rmm::mr::get_current_device_resource());
+    part_frag_offset, stream, cudf::get_current_device_resource_ref());
   cudf::detail::hostdevice_2dvector<PageFragment> row_group_fragments(
     num_columns, num_fragments, stream);
 
diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu
index e3435a24b18..028f922bec3 100644
--- a/cpp/src/io/text/multibyte_split.cu
+++ b/cpp/src/io/text/multibyte_split.cu
@@ -33,13 +33,12 @@
 #include <cudf/strings/detail/strings_column_factories.cuh>
 #include <cudf/strings/detail/utilities.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cub/block/block_load.cuh>
 #include <cub/block/block_scan.cuh>
@@ -345,9 +344,9 @@ std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source
     auto const concurrency = 2;
     auto num_tile_states   = std::max(32, TILES_PER_CHUNK * concurrency + 32);
     auto tile_multistates =
-      scan_tile_state<multistate>(num_tile_states, stream, rmm::mr::get_current_device_resource());
+      scan_tile_state<multistate>(num_tile_states, stream, cudf::get_current_device_resource_ref());
     auto tile_offsets = scan_tile_state<output_offset>(
-      num_tile_states, stream, rmm::mr::get_current_device_resource());
+      num_tile_states, stream, cudf::get_current_device_resource_ref());
 
     multibyte_split_init_kernel<<<TILES_PER_CHUNK,
                                   THREADS_PER_TILE,
diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp
index 8abfb000b94..249dc3b5875 100644
--- a/cpp/src/io/utilities/column_buffer.cpp
+++ b/cpp/src/io/utilities/column_buffer.cpp
@@ -24,9 +24,7 @@
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/types.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <iomanip>
 #include <sstream>
@@ -44,7 +42,7 @@ void gather_column_buffer::allocate_strings_data(bool memset_data, rmm::cuda_str
   // default rmm memory resource.
   _strings = std::make_unique<rmm::device_uvector<string_index_pair>>(
     cudf::detail::make_zeroed_device_uvector_async<string_index_pair>(
-      size, stream, rmm::mr::get_current_device_resource()));
+      size, stream, cudf::get_current_device_resource_ref()));
 }
 
 std::unique_ptr<column> gather_column_buffer::make_string_column_impl(rmm::cuda_stream_view stream)
diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp
index b2290965bb9..e73b2bc88de 100644
--- a/cpp/src/io/utilities/column_buffer.hpp
+++ b/cpp/src/io/utilities/column_buffer.hpp
@@ -26,13 +26,12 @@
 #include <cudf/io/types.hpp>
 #include <cudf/null_mask.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
 #include <rmm/device_uvector.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/pair.h>
 
@@ -167,7 +166,7 @@ class column_buffer_base {
   rmm::device_buffer _data{};
   rmm::device_buffer _null_mask{};
   size_type _null_count{0};
-  rmm::device_async_resource_ref _mr{rmm::mr::get_current_device_resource()};
+  rmm::device_async_resource_ref _mr{cudf::get_current_device_resource_ref()};
 
  public:
   data_type type{type_id::EMPTY};
diff --git a/cpp/src/io/utilities/data_casting.cu b/cpp/src/io/utilities/data_casting.cu
index 73362334e26..f70171eef68 100644
--- a/cpp/src/io/utilities/data_casting.cu
+++ b/cpp/src/io/utilities/data_casting.cu
@@ -28,11 +28,11 @@
 #include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/detail/utf8.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cub/cub.cuh>
 #include <cuda/functional>
diff --git a/cpp/src/io/utilities/output_builder.cuh b/cpp/src/io/utilities/output_builder.cuh
index 3bc5ccf41ef..f7e6de03354 100644
--- a/cpp/src/io/utilities/output_builder.cuh
+++ b/cpp/src/io/utilities/output_builder.cuh
@@ -16,12 +16,12 @@
 
 #include <cudf/types.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/copy.h>
 
@@ -207,7 +207,7 @@ class output_builder {
   output_builder(size_type max_write_size,
                  size_type max_growth,
                  rmm::cuda_stream_view stream,
-                 rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
+                 rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref())
     : _max_write_size{max_write_size}, _max_growth{max_growth}
   {
     CUDF_EXPECTS(max_write_size > 0, "Internal error");
diff --git a/cpp/src/io/utilities/string_parsing.hpp b/cpp/src/io/utilities/string_parsing.hpp
index 0d9e7e40e4e..1d6d5a0a570 100644
--- a/cpp/src/io/utilities/string_parsing.hpp
+++ b/cpp/src/io/utilities/string_parsing.hpp
@@ -19,10 +19,10 @@
 
 #include <cudf/types.hpp>
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/zip_iterator.h>
 #include <thrust/tuple.h>
diff --git a/cpp/src/io/utilities/trie.cu b/cpp/src/io/utilities/trie.cu
index 3be1a8332ca..504e72147e5 100644
--- a/cpp/src/io/utilities/trie.cu
+++ b/cpp/src/io/utilities/trie.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@
 #include "trie.cuh"
 
 #include <cudf/detail/utilities/vector_factories.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <cuda_runtime.h>
@@ -104,7 +105,7 @@ rmm::device_uvector<serial_trie_node> create_serialized_trie(std::vector<std::st
     if (has_children) { nodes.push_back(serial_trie_node(trie_terminating_character)); }
   }
   return cudf::detail::make_device_uvector_sync(
-    nodes, stream, rmm::mr::get_current_device_resource());
+    nodes, stream, cudf::get_current_device_resource_ref());
 }
 
 }  // namespace detail
diff --git a/cpp/src/join/conditional_join.cu b/cpp/src/join/conditional_join.cu
index 789702ce538..748691fb7d1 100644
--- a/cpp/src/join/conditional_join.cu
+++ b/cpp/src/join/conditional_join.cu
@@ -28,9 +28,9 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <optional>
 
diff --git a/cpp/src/join/conditional_join.hpp b/cpp/src/join/conditional_join.hpp
index 06eb83d6ba8..4f6a9484e8c 100644
--- a/cpp/src/join/conditional_join.hpp
+++ b/cpp/src/join/conditional_join.hpp
@@ -20,10 +20,9 @@
 #include <cudf/ast/expressions.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <optional>
 
diff --git a/cpp/src/join/cross_join.cu b/cpp/src/join/cross_join.cu
index a2ee3a7796b..eeb49736bac 100644
--- a/cpp/src/join/cross_join.cu
+++ b/cpp/src/join/cross_join.cu
@@ -27,9 +27,9 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace detail {
diff --git a/cpp/src/join/distinct_hash_join.cu b/cpp/src/join/distinct_hash_join.cu
index 3d95b0c5a5c..c7294152982 100644
--- a/cpp/src/join/distinct_hash_join.cu
+++ b/cpp/src/join/distinct_hash_join.cu
@@ -24,11 +24,11 @@
 #include <cudf/table/experimental/row_operators.cuh>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cooperative_groups.h>
 #include <cub/block/block_scan.cuh>
@@ -139,7 +139,8 @@ distinct_hash_join<HasNested>::distinct_hash_join(cudf::table_view const& build,
   } else {
     auto stencil = thrust::counting_iterator<size_type>{0};
     auto const row_bitmask =
-      cudf::detail::bitmask_and(this->_build, stream, rmm::mr::get_current_device_resource()).first;
+      cudf::detail::bitmask_and(this->_build, stream, cudf::get_current_device_resource_ref())
+        .first;
     auto const pred =
       cudf::detail::row_is_valid{reinterpret_cast<bitmask_type const*>(row_bitmask.data())};
 
diff --git a/cpp/src/join/hash_join.cu b/cpp/src/join/hash_join.cu
index 5d01482f44a..beeaabfdaab 100644
--- a/cpp/src/join/hash_join.cu
+++ b/cpp/src/join/hash_join.cu
@@ -22,13 +22,13 @@
 #include <cudf/hashing/detail/helper_functions.cuh>
 #include <cudf/join.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/count.h>
 #include <thrust/functional.h>
@@ -385,7 +385,7 @@ hash_join<Hasher>::hash_join(cudf::table_view const& build,
   if (_is_empty) { return; }
 
   auto const row_bitmask =
-    cudf::detail::bitmask_and(build, stream, rmm::mr::get_current_device_resource()).first;
+    cudf::detail::bitmask_and(build, stream, cudf::get_current_device_resource_ref()).first;
   cudf::detail::build_join_hash_table(_build,
                                       _preprocessed_build,
                                       _hash_table,
diff --git a/cpp/src/join/join.cu b/cpp/src/join/join.cu
index bc7f09763ec..0abff27667b 100644
--- a/cpp/src/join/join.cu
+++ b/cpp/src/join/join.cu
@@ -21,9 +21,9 @@
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace detail {
@@ -41,7 +41,7 @@ inner_join(table_view const& left_input,
   auto matched = cudf::dictionary::detail::match_dictionaries(
     {left_input, right_input},
     stream,
-    rmm::mr::get_current_device_resource());  // temporary objects returned
+    cudf::get_current_device_resource_ref());  // temporary objects returned
 
   // now rebuild the table views with the updated ones
   auto const left      = matched.second.front();
@@ -76,7 +76,7 @@ left_join(table_view const& left_input,
   auto matched = cudf::dictionary::detail::match_dictionaries(
     {left_input, right_input},  // these should match
     stream,
-    rmm::mr::get_current_device_resource());  // temporary objects returned
+    cudf::get_current_device_resource_ref());  // temporary objects returned
   // now rebuild the table views with the updated ones
   table_view const left  = matched.second.front();
   table_view const right = matched.second.back();
@@ -101,7 +101,7 @@ full_join(table_view const& left_input,
   auto matched = cudf::dictionary::detail::match_dictionaries(
     {left_input, right_input},  // these should match
     stream,
-    rmm::mr::get_current_device_resource());  // temporary objects returned
+    cudf::get_current_device_resource_ref());  // temporary objects returned
   // now rebuild the table views with the updated ones
   table_view const left  = matched.second.front();
   table_view const right = matched.second.back();
diff --git a/cpp/src/join/join_common_utils.cuh b/cpp/src/join/join_common_utils.cuh
index 3d0f3e4340d..4f75908fe72 100644
--- a/cpp/src/join/join_common_utils.cuh
+++ b/cpp/src/join/join_common_utils.cuh
@@ -21,10 +21,10 @@
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/table/experimental/row_operators.cuh>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cub/cub.cuh>
 #include <thrust/iterator/counting_iterator.h>
diff --git a/cpp/src/join/join_utils.cu b/cpp/src/join/join_utils.cu
index 8d916da9f2c..16302657ac2 100644
--- a/cpp/src/join/join_utils.cu
+++ b/cpp/src/join/join_utils.cu
@@ -16,8 +16,9 @@
 
 #include "join_common_utils.cuh"
 
+#include <cudf/utilities/memory_resource.hpp>
+
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/copy.h>
 #include <thrust/functional.h>
diff --git a/cpp/src/join/mixed_join.cu b/cpp/src/join/mixed_join.cu
index eb12065c6a9..8ff78dd47f4 100644
--- a/cpp/src/join/mixed_join.cu
+++ b/cpp/src/join/mixed_join.cu
@@ -29,11 +29,11 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/fill.h>
 #include <thrust/scan.h>
@@ -138,7 +138,7 @@ mixed_join(
   // places. However, this probably isn't worth adding any time soon since we
   // won't be able to support AST conditions for those types anyway.
   auto const row_bitmask =
-    cudf::detail::bitmask_and(build, stream, rmm::mr::get_current_device_resource()).first;
+    cudf::detail::bitmask_and(build, stream, cudf::get_current_device_resource_ref()).first;
   auto const preprocessed_build =
     experimental::row::equality::preprocessed_table::create(build, stream);
   build_join_hash_table(build,
@@ -404,7 +404,7 @@ compute_mixed_join_output_size(table_view const& left_equality,
   // places. However, this probably isn't worth adding any time soon since we
   // won't be able to support AST conditions for those types anyway.
   auto const row_bitmask =
-    cudf::detail::bitmask_and(build, stream, rmm::mr::get_current_device_resource()).first;
+    cudf::detail::bitmask_and(build, stream, cudf::get_current_device_resource_ref()).first;
   auto const preprocessed_build =
     experimental::row::equality::preprocessed_table::create(build, stream);
   build_join_hash_table(build,
diff --git a/cpp/src/join/mixed_join_semi.cu b/cpp/src/join/mixed_join_semi.cu
index a79aa6673d6..cfb785e242c 100644
--- a/cpp/src/join/mixed_join_semi.cu
+++ b/cpp/src/join/mixed_join_semi.cu
@@ -30,11 +30,11 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/fill.h>
 #include <thrust/iterator/counting_iterator.h>
@@ -208,7 +208,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_join_semi(
   } else {
     thrust::counting_iterator<cudf::size_type> stencil(0);
     auto const [row_bitmask, _] =
-      cudf::detail::bitmask_and(build, stream, rmm::mr::get_current_device_resource());
+      cudf::detail::bitmask_and(build, stream, cudf::get_current_device_resource_ref());
     row_is_valid pred{static_cast<bitmask_type const*>(row_bitmask.data())};
 
     // insert valid rows
diff --git a/cpp/src/join/mixed_join_size_kernel.hpp b/cpp/src/join/mixed_join_size_kernel.hpp
index b09805c14dc..0f570c601d7 100644
--- a/cpp/src/join/mixed_join_size_kernel.hpp
+++ b/cpp/src/join/mixed_join_size_kernel.hpp
@@ -25,6 +25,9 @@
 #include <cudf/utilities/export.hpp>
 #include <cudf/utilities/span.hpp>
 
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/resource_ref.hpp>
+
 #include <cooperative_groups.h>
 #include <cub/cub.cuh>
 #include <thrust/iterator/discard_iterator.h>
diff --git a/cpp/src/join/semi_join.cu b/cpp/src/join/semi_join.cu
index 91d98d5e8d3..f69ded73e8d 100644
--- a/cpp/src/join/semi_join.cu
+++ b/cpp/src/join/semi_join.cu
@@ -25,11 +25,11 @@
 #include <cudf/table/table.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/copy.h>
 #include <thrust/distance.h>
@@ -72,7 +72,7 @@ std::unique_ptr<rmm::device_uvector<cudf::size_type>> left_semi_anti_join(
                                               compare_nulls,
                                               nan_equality::ALL_EQUAL,
                                               stream,
-                                              rmm::mr::get_current_device_resource());
+                                              cudf::get_current_device_resource_ref());
 
   auto const left_num_rows = left_keys.num_rows();
   auto gather_map =
diff --git a/cpp/src/json/json_path.cu b/cpp/src/json/json_path.cu
index 1bf4bf3b153..59fdbedf089 100644
--- a/cpp/src/json/json_path.cu
+++ b/cpp/src/json/json_path.cu
@@ -34,10 +34,10 @@
 #include <cudf/utilities/bit.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/std/optional>
 #include <thrust/pair.h>
@@ -692,7 +692,7 @@ std::pair<cuda::std::optional<rmm::device_uvector<path_operator>>, int> build_co
   auto const is_empty = h_operators.size() == 1 && h_operators[0].type == path_operator_type::END;
   return is_empty ? std::pair(cuda::std::nullopt, 0)
                   : std::pair(cuda::std::make_optional(cudf::detail::make_device_uvector_sync(
-                                h_operators, stream, rmm::mr::get_current_device_resource())),
+                                h_operators, stream, cudf::get_current_device_resource_ref())),
                               max_stack_depth);
 }
 
@@ -999,7 +999,7 @@ std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& c
 
   // compute output sizes
   auto sizes =
-    rmm::device_uvector<size_type>(col.size(), stream, rmm::mr::get_current_device_resource());
+    rmm::device_uvector<size_type>(col.size(), stream, cudf::get_current_device_resource_ref());
   auto d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(col.offsets());
 
   constexpr int block_size = 512;
diff --git a/cpp/src/labeling/label_bins.cu b/cpp/src/labeling/label_bins.cu
index 7ee1d540831..18a500069ad 100644
--- a/cpp/src/labeling/label_bins.cu
+++ b/cpp/src/labeling/label_bins.cu
@@ -24,6 +24,7 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_checks.hpp>
@@ -32,7 +33,6 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/advance.h>
 #include <thrust/binary_search.h>
diff --git a/cpp/src/lists/combine/concatenate_list_elements.cu b/cpp/src/lists/combine/concatenate_list_elements.cu
index 58ec053712d..7ae5db3e84b 100644
--- a/cpp/src/lists/combine/concatenate_list_elements.cu
+++ b/cpp/src/lists/combine/concatenate_list_elements.cu
@@ -27,10 +27,10 @@
 #include <cudf/lists/lists_column_view.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/execution_policy.h>
diff --git a/cpp/src/lists/combine/concatenate_rows.cu b/cpp/src/lists/combine/concatenate_rows.cu
index bc1b48b11cd..790c99c494d 100644
--- a/cpp/src/lists/combine/concatenate_rows.cu
+++ b/cpp/src/lists/combine/concatenate_rows.cu
@@ -23,11 +23,11 @@
 #include <cudf/lists/combine.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/iterator/counting_iterator.h>
@@ -219,7 +219,7 @@ std::unique_ptr<column> concatenate_rows(table_view const& input,
   // concatenate the input table into one column.
   std::vector<column_view> cols(input.num_columns());
   std::copy(input.begin(), input.end(), cols.begin());
-  auto concat = cudf::detail::concatenate(cols, stream, rmm::mr::get_current_device_resource());
+  auto concat = cudf::detail::concatenate(cols, stream, cudf::get_current_device_resource_ref());
 
   // whether or not we should be generating a null mask at all
   auto const build_null_mask = concat->has_nulls();
@@ -251,7 +251,7 @@ std::unique_ptr<column> concatenate_rows(table_view const& input,
               return row_null_counts[row_index] != num_columns;
             }),
           stream,
-          rmm::mr::get_current_device_resource());
+          cudf::get_current_device_resource_ref());
       }
       // NULLIFY_OUTPUT_ROW.  Output row is nullfied if any input row is null
       return cudf::detail::valid_if(
@@ -264,7 +264,7 @@ std::unique_ptr<column> concatenate_rows(table_view const& input,
             return row_null_counts[row_index] == 0;
           }),
         stream,
-        rmm::mr::get_current_device_resource());
+        cudf::get_current_device_resource_ref());
     }();
     concat->set_null_mask(std::move(null_mask), null_count);
   }
diff --git a/cpp/src/lists/contains.cu b/cpp/src/lists/contains.cu
index 11703527d26..9556ef23784 100644
--- a/cpp/src/lists/contains.cu
+++ b/cpp/src/lists/contains.cu
@@ -28,11 +28,11 @@
 #include <cudf/table/experimental/row_operators.cuh>
 #include <cudf/table/row_operators.cuh>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/type_checks.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/execution_policy.h>
@@ -316,7 +316,7 @@ std::unique_ptr<column> contains(lists_column_view const& lists,
                                       search_key,
                                       duplicate_find_option::FIND_FIRST,
                                       stream,
-                                      rmm::mr::get_current_device_resource());
+                                      cudf::get_current_device_resource_ref());
   return to_contains(std::move(key_indices), stream, mr);
 }
 
@@ -332,7 +332,7 @@ std::unique_ptr<column> contains(lists_column_view const& lists,
                                       search_keys,
                                       duplicate_find_option::FIND_FIRST,
                                       stream,
-                                      rmm::mr::get_current_device_resource());
+                                      cudf::get_current_device_resource_ref());
   return to_contains(std::move(key_indices), stream, mr);
 }
 
diff --git a/cpp/src/lists/copying/concatenate.cu b/cpp/src/lists/copying/concatenate.cu
index 8cd58e7eff2..c8bc4799688 100644
--- a/cpp/src/lists/copying/concatenate.cu
+++ b/cpp/src/lists/copying/concatenate.cu
@@ -25,10 +25,10 @@
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/lists/detail/concatenate.hpp>
 #include <cudf/lists/lists_column_view.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/transform.h>
 
diff --git a/cpp/src/lists/copying/copying.cu b/cpp/src/lists/copying/copying.cu
index 162c6140656..b4c0fb12b8e 100644
--- a/cpp/src/lists/copying/copying.cu
+++ b/cpp/src/lists/copying/copying.cu
@@ -20,10 +20,10 @@
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/lists/lists_column_view.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/transform.h>
diff --git a/cpp/src/lists/copying/gather.cu b/cpp/src/lists/copying/gather.cu
index cadeb273a65..0df1801b99b 100644
--- a/cpp/src/lists/copying/gather.cu
+++ b/cpp/src/lists/copying/gather.cu
@@ -16,9 +16,9 @@
 
 #include <cudf/detail/gather.cuh>
 #include <cudf/lists/detail/gather.cuh>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/binary_search.h>
 #include <thrust/distance.h>
diff --git a/cpp/src/lists/copying/scatter_helper.cu b/cpp/src/lists/copying/scatter_helper.cu
index b754fef24e5..9cbb3c59510 100644
--- a/cpp/src/lists/copying/scatter_helper.cu
+++ b/cpp/src/lists/copying/scatter_helper.cu
@@ -21,10 +21,9 @@
 #include <cudf/lists/detail/copying.hpp>
 #include <cudf/lists/detail/scatter_helper.cuh>
 #include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
-#include <rmm/resource_ref.hpp>
-
 #include <cuda/functional>
 #include <thrust/binary_search.h>
 #include <thrust/distance.h>
diff --git a/cpp/src/lists/copying/segmented_gather.cu b/cpp/src/lists/copying/segmented_gather.cu
index 90f7994b21d..f6e48f141e1 100644
--- a/cpp/src/lists/copying/segmented_gather.cu
+++ b/cpp/src/lists/copying/segmented_gather.cu
@@ -22,9 +22,9 @@
 #include <cudf/lists/detail/gather.cuh>
 #include <cudf/lists/gather.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/binary_search.h>
diff --git a/cpp/src/lists/count_elements.cu b/cpp/src/lists/count_elements.cu
index 19c434d10e1..78f78ff6246 100644
--- a/cpp/src/lists/count_elements.cu
+++ b/cpp/src/lists/count_elements.cu
@@ -24,10 +24,10 @@
 #include <cudf/lists/lists_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/transform.h>
diff --git a/cpp/src/lists/dremel.cu b/cpp/src/lists/dremel.cu
index 50f40924478..469442d46d4 100644
--- a/cpp/src/lists/dremel.cu
+++ b/cpp/src/lists/dremel.cu
@@ -22,6 +22,7 @@
 #include <cudf/lists/detail/dremel.hpp>
 #include <cudf/lists/lists_column_view.hpp>
 #include <cudf/table/table_device_view.cuh>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/exec_policy.hpp>
 
@@ -267,7 +268,7 @@ dremel_data get_encoding(column_view h_col,
   }
 
   auto d_nullability = cudf::detail::make_device_uvector_async(
-    nullability, stream, rmm::mr::get_current_device_resource());
+    nullability, stream, cudf::get_current_device_resource_ref());
 
   rmm::device_uvector<uint8_t> rep_level(max_vals_size, stream);
   rmm::device_uvector<uint8_t> def_level(max_vals_size, stream);
diff --git a/cpp/src/lists/explode.cu b/cpp/src/lists/explode.cu
index 74a0d842aad..00e19e2e2cb 100644
--- a/cpp/src/lists/explode.cu
+++ b/cpp/src/lists/explode.cu
@@ -21,12 +21,12 @@
 #include <cudf/lists/lists_column_view.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <cuda/std/optional>
diff --git a/cpp/src/lists/extract.cu b/cpp/src/lists/extract.cu
index c0ce86fb56e..b6d22955e67 100644
--- a/cpp/src/lists/extract.cu
+++ b/cpp/src/lists/extract.cu
@@ -26,10 +26,10 @@
 #include <cudf/lists/extract.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/copy.h>
 #include <thrust/fill.h>
@@ -105,7 +105,7 @@ std::unique_ptr<cudf::column> make_index_offsets(size_type num_lists, rmm::cuda_
   return cudf::detail::sequence(num_lists + 1,
                                 cudf::scalar_type_t<size_type>(0, true, stream),
                                 stream,
-                                rmm::mr::get_current_device_resource());
+                                cudf::get_current_device_resource_ref());
 }
 
 }  // namespace
diff --git a/cpp/src/lists/interleave_columns.cu b/cpp/src/lists/interleave_columns.cu
index 45ae3671d4e..3d6fdda957b 100644
--- a/cpp/src/lists/interleave_columns.cu
+++ b/cpp/src/lists/interleave_columns.cu
@@ -24,12 +24,12 @@
 #include <cudf/lists/lists_column_view.hpp>
 #include <cudf/strings/detail/strings_column_factories.cuh>
 #include <cudf/table/table_device_view.cuh>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/copy.h>
@@ -104,7 +104,7 @@ std::unique_ptr<column> concatenate_and_gather_lists(host_span<column_view const
 {
   // Concatenate all columns into a single (temporary) column.
   auto const concatenated_col =
-    cudf::detail::concatenate(columns_to_concat, stream, rmm::mr::get_current_device_resource());
+    cudf::detail::concatenate(columns_to_concat, stream, cudf::get_current_device_resource_ref());
 
   // The number of input columns is known to be non-zero thus it's safe to call `front()` here.
   auto const num_cols       = columns_to_concat.size();
diff --git a/cpp/src/lists/lists_column_factories.cu b/cpp/src/lists/lists_column_factories.cu
index 66ad1c35c33..dea38947a54 100644
--- a/cpp/src/lists/lists_column_factories.cu
+++ b/cpp/src/lists/lists_column_factories.cu
@@ -22,10 +22,10 @@
 #include <cudf/detail/sizes_to_offsets_iterator.cuh>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/lists/detail/lists_column_factories.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/constant_iterator.h>
 #include <thrust/sequence.h>
@@ -48,7 +48,7 @@ std::unique_ptr<cudf::column> make_lists_column_from_scalar(list_scalar const& v
                              stream,
                              mr);
   }
-  auto mr_final = size == 1 ? mr : rmm::mr::get_current_device_resource();
+  auto mr_final = size == 1 ? mr : cudf::get_current_device_resource_ref();
 
   // Handcraft a 1-row column
   auto sizes_itr = thrust::constant_iterator<size_type>(value.view().size());
diff --git a/cpp/src/lists/reverse.cu b/cpp/src/lists/reverse.cu
index d913ce070ae..b80f6c882c8 100644
--- a/cpp/src/lists/reverse.cu
+++ b/cpp/src/lists/reverse.cu
@@ -23,11 +23,11 @@
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/lists/reverse.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/for_each.h>
 #include <thrust/iterator/counting_iterator.h>
@@ -45,7 +45,7 @@ std::unique_ptr<column> reverse(lists_column_view const& input,
 
   // The labels are also a map from each list element to its corresponding zero-based list index.
   auto const labels =
-    generate_labels(input, child.size(), stream, rmm::mr::get_current_device_resource());
+    generate_labels(input, child.size(), stream, cudf::get_current_device_resource_ref());
 
   // The offsets of the output lists column.
   auto out_offsets = get_normalized_offsets(input, stream, mr);
diff --git a/cpp/src/lists/segmented_sort.cu b/cpp/src/lists/segmented_sort.cu
index f920fb916eb..c78b6d793d4 100644
--- a/cpp/src/lists/segmented_sort.cu
+++ b/cpp/src/lists/segmented_sort.cu
@@ -24,10 +24,10 @@
 #include <cudf/lists/sorting.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/transform.h>
 
diff --git a/cpp/src/lists/sequences.cu b/cpp/src/lists/sequences.cu
index 7d57d8ddb60..4b50bf626f2 100644
--- a/cpp/src/lists/sequences.cu
+++ b/cpp/src/lists/sequences.cu
@@ -24,11 +24,11 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/binary_search.h>
 #include <thrust/distance.h>
diff --git a/cpp/src/lists/set_operations.cu b/cpp/src/lists/set_operations.cu
index 5c7ab68d64b..c0bc10dd266 100644
--- a/cpp/src/lists/set_operations.cu
+++ b/cpp/src/lists/set_operations.cu
@@ -27,12 +27,12 @@
 #include <cudf/lists/detail/set_operations.hpp>
 #include <cudf/lists/detail/stream_compaction.hpp>
 #include <cudf/lists/set_operations.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/distance.h>
 #include <thrust/functional.h>
@@ -78,15 +78,15 @@ std::unique_ptr<column> have_overlap(lists_column_view const& lhs,
   auto const lhs_child = lhs.get_sliced_child(stream);
   auto const rhs_child = rhs.get_sliced_child(stream);
   auto const lhs_labels =
-    generate_labels(lhs, lhs_child.size(), stream, rmm::mr::get_current_device_resource());
+    generate_labels(lhs, lhs_child.size(), stream, cudf::get_current_device_resource_ref());
   auto const rhs_labels =
-    generate_labels(rhs, rhs_child.size(), stream, rmm::mr::get_current_device_resource());
+    generate_labels(rhs, rhs_child.size(), stream, cudf::get_current_device_resource_ref());
   auto const lhs_table = table_view{{lhs_labels->view(), lhs_child}};
   auto const rhs_table = table_view{{rhs_labels->view(), rhs_child}};
 
   // Check existence for each row of the rhs_table in lhs_table.
   auto const contained = cudf::detail::contains(
-    lhs_table, rhs_table, nulls_equal, nans_equal, stream, rmm::mr::get_current_device_resource());
+    lhs_table, rhs_table, nulls_equal, nans_equal, stream, cudf::get_current_device_resource_ref());
 
   auto const num_rows = lhs.size();
 
@@ -148,20 +148,20 @@ std::unique_ptr<column> intersect_distinct(lists_column_view const& lhs,
   auto const lhs_child = lhs.get_sliced_child(stream);
   auto const rhs_child = rhs.get_sliced_child(stream);
   auto const lhs_labels =
-    generate_labels(lhs, lhs_child.size(), stream, rmm::mr::get_current_device_resource());
+    generate_labels(lhs, lhs_child.size(), stream, cudf::get_current_device_resource_ref());
   auto const rhs_labels =
-    generate_labels(rhs, rhs_child.size(), stream, rmm::mr::get_current_device_resource());
+    generate_labels(rhs, rhs_child.size(), stream, cudf::get_current_device_resource_ref());
   auto const lhs_table = table_view{{lhs_labels->view(), lhs_child}};
   auto const rhs_table = table_view{{rhs_labels->view(), rhs_child}};
 
   auto const contained = cudf::detail::contains(
-    lhs_table, rhs_table, nulls_equal, nans_equal, stream, rmm::mr::get_current_device_resource());
+    lhs_table, rhs_table, nulls_equal, nans_equal, stream, cudf::get_current_device_resource_ref());
 
   auto const intersect_table = cudf::detail::copy_if(
     rhs_table,
     [contained = contained.begin()] __device__(auto const idx) { return contained[idx]; },
     stream,
-    rmm::mr::get_current_device_resource());
+    cudf::get_current_device_resource_ref());
 
   // A stable algorithm is required to ensure that list labels remain contiguous.
   auto out_table = cudf::detail::stable_distinct(intersect_table->view(),
@@ -205,7 +205,7 @@ std::unique_ptr<column> union_distinct(lists_column_view const& lhs,
     lists::detail::concatenate_rows(table_view{{lhs.parent(), rhs.parent()}},
                                     concatenate_null_policy::NULLIFY_OUTPUT_ROW,
                                     stream,
-                                    rmm::mr::get_current_device_resource());
+                                    cudf::get_current_device_resource_ref());
 
   return cudf::lists::detail::distinct(
     lists_column_view{union_col->view()}, nulls_equal, nans_equal, stream, mr);
@@ -231,20 +231,20 @@ std::unique_ptr<column> difference_distinct(lists_column_view const& lhs,
   auto const lhs_child = lhs.get_sliced_child(stream);
   auto const rhs_child = rhs.get_sliced_child(stream);
   auto const lhs_labels =
-    generate_labels(lhs, lhs_child.size(), stream, rmm::mr::get_current_device_resource());
+    generate_labels(lhs, lhs_child.size(), stream, cudf::get_current_device_resource_ref());
   auto const rhs_labels =
-    generate_labels(rhs, rhs_child.size(), stream, rmm::mr::get_current_device_resource());
+    generate_labels(rhs, rhs_child.size(), stream, cudf::get_current_device_resource_ref());
   auto const lhs_table = table_view{{lhs_labels->view(), lhs_child}};
   auto const rhs_table = table_view{{rhs_labels->view(), rhs_child}};
 
   auto const contained = cudf::detail::contains(
-    rhs_table, lhs_table, nulls_equal, nans_equal, stream, rmm::mr::get_current_device_resource());
+    rhs_table, lhs_table, nulls_equal, nans_equal, stream, cudf::get_current_device_resource_ref());
 
   auto const difference_table = cudf::detail::copy_if(
     lhs_table,
     [contained = contained.begin()] __device__(auto const idx) { return !contained[idx]; },
     stream,
-    rmm::mr::get_current_device_resource());
+    cudf::get_current_device_resource_ref());
 
   // A stable algorithm is required to ensure that list labels remain contiguous.
   auto out_table = cudf::detail::stable_distinct(difference_table->view(),
diff --git a/cpp/src/lists/stream_compaction/apply_boolean_mask.cu b/cpp/src/lists/stream_compaction/apply_boolean_mask.cu
index 71aafa3ce12..c78e9c22e2a 100644
--- a/cpp/src/lists/stream_compaction/apply_boolean_mask.cu
+++ b/cpp/src/lists/stream_compaction/apply_boolean_mask.cu
@@ -27,9 +27,9 @@
 #include <cudf/reduction/detail/segmented_reduction_functions.hpp>
 #include <cudf/utilities/bit.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/reduce.h>
 #include <thrust/scan.h>
@@ -73,7 +73,7 @@ std::unique_ptr<column> apply_boolean_mask(lists_column_view const& input,
                                              null_policy::EXCLUDE,
                                              std::nullopt,
                                              stream,
-                                             rmm::mr::get_current_device_resource());
+                                             cudf::get_current_device_resource_ref());
     auto const d_sizes     = column_device_view::create(*sizes, stream);
     auto const sizes_begin = cudf::detail::make_null_replacement_iterator(*d_sizes, size_type{0});
     auto const sizes_end   = sizes_begin + sizes->size();
diff --git a/cpp/src/lists/stream_compaction/distinct.cu b/cpp/src/lists/stream_compaction/distinct.cu
index cdcb4aa957f..ab750de9ef2 100644
--- a/cpp/src/lists/stream_compaction/distinct.cu
+++ b/cpp/src/lists/stream_compaction/distinct.cu
@@ -25,9 +25,9 @@
 #include <cudf/lists/stream_compaction.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <memory>
 #include <utility>
@@ -50,7 +50,7 @@ std::unique_ptr<column> distinct(lists_column_view const& input,
 
   auto const child = input.get_sliced_child(stream);
   auto const labels =
-    generate_labels(input, child.size(), stream, rmm::mr::get_current_device_resource());
+    generate_labels(input, child.size(), stream, cudf::get_current_device_resource_ref());
 
   auto const distinct_table =
     cudf::detail::stable_distinct(table_view{{labels->view(), child}},  // input table
diff --git a/cpp/src/lists/utilities.cu b/cpp/src/lists/utilities.cu
index 7fb960f02ca..53ddc27a8a5 100644
--- a/cpp/src/lists/utilities.cu
+++ b/cpp/src/lists/utilities.cu
@@ -19,8 +19,7 @@
 #include <cudf/column/column_factories.hpp>
 #include <cudf/copying.hpp>
 #include <cudf/detail/labeling/label_segments.cuh>
-
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace cudf::lists::detail {
 
diff --git a/cpp/src/lists/utilities.hpp b/cpp/src/lists/utilities.hpp
index 218ad7872e9..c0fcf7b7182 100644
--- a/cpp/src/lists/utilities.hpp
+++ b/cpp/src/lists/utilities.hpp
@@ -18,10 +18,10 @@
 
 #include <cudf/column/column_view.hpp>
 #include <cudf/lists/lists_column_view.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace cudf::lists::detail {
 
diff --git a/cpp/src/merge/merge.cu b/cpp/src/merge/merge.cu
index e2c8d49a4ab..b9e0da0a3fe 100644
--- a/cpp/src/merge/merge.cu
+++ b/cpp/src/merge/merge.cu
@@ -34,13 +34,13 @@
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_device_view.cuh>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_checks.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/binary_search.h>
@@ -247,7 +247,7 @@ index_vector generate_merged_indices(table_view const& left_table,
   auto rhs_device_view = table_device_view::create(right_table, stream);
 
   auto d_column_order = cudf::detail::make_device_uvector_async(
-    column_order, stream, rmm::mr::get_current_device_resource());
+    column_order, stream, cudf::get_current_device_resource_ref());
 
   if (has_nulls) {
     auto const new_null_precedence = [&]() {
@@ -261,7 +261,7 @@ index_vector generate_merged_indices(table_view const& left_table,
     }();
 
     auto d_null_precedence = cudf::detail::make_device_uvector_async(
-      new_null_precedence, stream, rmm::mr::get_current_device_resource());
+      new_null_precedence, stream, cudf::get_current_device_resource_ref());
 
     auto ineq_op = detail::row_lexicographic_tagged_comparator<true>(
       *lhs_device_view, *rhs_device_view, d_column_order, d_null_precedence);
@@ -307,7 +307,7 @@ index_vector generate_merged_indices_nested(table_view const& left_table,
                                                           column_order,
                                                           null_precedence,
                                                           stream,
-                                                          rmm::mr::get_current_device_resource());
+                                                          cudf::get_current_device_resource_ref());
   auto const left_indices         = left_indices_col->view();
   auto left_indices_mutable       = left_indices_col->mutable_view();
   auto const left_indices_begin   = left_indices.begin<cudf::size_type>();
@@ -647,7 +647,7 @@ table_ptr_type merge(std::vector<table_view> const& tables_to_merge,
   // This utility will ensure all corresponding dictionary columns have matching keys.
   // It will return any new dictionary columns created as well as updated table_views.
   auto matched = cudf::dictionary::detail::match_dictionaries(
-    tables_to_merge, stream, rmm::mr::get_current_device_resource());
+    tables_to_merge, stream, cudf::get_current_device_resource_ref());
   auto merge_tables = matched.second;
 
   // A queue of (table view, table) pairs
@@ -673,7 +673,7 @@ table_ptr_type merge(std::vector<table_view> const& tables_to_merge,
     auto const right_table = top_and_pop(merge_queue);
 
     // Only use mr for the output table
-    auto const& new_tbl_mr = merge_queue.empty() ? mr : rmm::mr::get_current_device_resource();
+    auto const& new_tbl_mr = merge_queue.empty() ? mr : cudf::get_current_device_resource_ref();
     auto merged_table      = merge(left_table.view,
                               right_table.view,
                               key_cols,
diff --git a/cpp/src/partitioning/partitioning.cu b/cpp/src/partitioning/partitioning.cu
index f10388794fc..17008e80e79 100644
--- a/cpp/src/partitioning/partitioning.cu
+++ b/cpp/src/partitioning/partitioning.cu
@@ -27,11 +27,11 @@
 #include <cudf/table/experimental/row_operators.cuh>
 #include <cudf/table/table_device_view.cuh>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cub/block/block_scan.cuh>
 #include <cub/device/device_histogram.cuh>
@@ -501,10 +501,10 @@ std::pair<std::unique_ptr<table>, std::vector<size_type>> hash_partition_table(
 
   // Holds the total number of rows in each partition
   auto global_partition_sizes = cudf::detail::make_zeroed_device_uvector_async<size_type>(
-    num_partitions, stream, rmm::mr::get_current_device_resource());
+    num_partitions, stream, cudf::get_current_device_resource_ref());
 
   auto row_partition_offset = cudf::detail::make_zeroed_device_uvector_async<size_type>(
-    num_rows, stream, rmm::mr::get_current_device_resource());
+    num_rows, stream, cudf::get_current_device_resource_ref());
 
   auto const row_hasher = experimental::row::hash::row_hasher(table_to_hash, stream);
   auto const hasher =
diff --git a/cpp/src/partitioning/round_robin.cu b/cpp/src/partitioning/round_robin.cu
index 9810373b751..5a4c90a67a5 100644
--- a/cpp/src/partitioning/round_robin.cu
+++ b/cpp/src/partitioning/round_robin.cu
@@ -26,12 +26,12 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/bit.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/copy.h>
diff --git a/cpp/src/quantiles/quantile.cu b/cpp/src/quantiles/quantile.cu
index 5d748de0019..80fd72a3088 100644
--- a/cpp/src/quantiles/quantile.cu
+++ b/cpp/src/quantiles/quantile.cu
@@ -30,11 +30,11 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/iterator/counting_iterator.h>
@@ -89,7 +89,7 @@ struct quantile_functor {
     auto d_output = mutable_column_device_view::create(output->mutable_view(), stream);
 
     auto q_device =
-      cudf::detail::make_device_uvector_sync(q, stream, rmm::mr::get_current_device_resource());
+      cudf::detail::make_device_uvector_sync(q, stream, cudf::get_current_device_resource_ref());
 
     if (!cudf::is_dictionary(input.type())) {
       auto sorted_data =
diff --git a/cpp/src/quantiles/quantiles.cu b/cpp/src/quantiles/quantiles.cu
index 0b0e6701304..69421f3bfc4 100644
--- a/cpp/src/quantiles/quantiles.cu
+++ b/cpp/src/quantiles/quantiles.cu
@@ -26,9 +26,9 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/iterator/counting_iterator.h>
@@ -55,7 +55,7 @@ std::unique_ptr<table> quantiles(table_view const& input,
     });
 
   auto const q_device =
-    cudf::detail::make_device_uvector_async(q, stream, rmm::mr::get_current_device_resource());
+    cudf::detail::make_device_uvector_async(q, stream, cudf::get_current_device_resource_ref());
 
   auto quantile_idx_iter = thrust::make_transform_iterator(q_device.begin(), quantile_idx_lookup);
 
@@ -90,7 +90,7 @@ std::unique_ptr<table> quantiles(table_view const& input,
       input, thrust::make_counting_iterator<size_type>(0), q, interp, stream, mr);
   } else {
     auto sorted_idx = detail::sorted_order(
-      input, column_order, null_precedence, stream, rmm::mr::get_current_device_resource());
+      input, column_order, null_precedence, stream, cudf::get_current_device_resource_ref());
     return detail::quantiles(input, sorted_idx->view().data<size_type>(), q, interp, stream, mr);
   }
 }
diff --git a/cpp/src/quantiles/tdigest/tdigest.cu b/cpp/src/quantiles/tdigest/tdigest.cu
index 421ed26e26d..0d017cf1f13 100644
--- a/cpp/src/quantiles/tdigest/tdigest.cu
+++ b/cpp/src/quantiles/tdigest/tdigest.cu
@@ -25,10 +25,10 @@
 #include <cudf/quantiles.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/advance.h>
@@ -199,7 +199,7 @@ std::unique_ptr<column> compute_approx_percentiles(tdigest_column_view const& in
                                                           weight.size(),
                                                           mask_state::UNALLOCATED,
                                                           stream,
-                                                          rmm::mr::get_current_device_resource());
+                                                          cudf::get_current_device_resource_ref());
   auto keys               = cudf::detail::make_counting_transform_iterator(
     0,
     cuda::proclaim_return_type<std::ptrdiff_t>(
diff --git a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu
index 229af89fc46..2dd25a7b890 100644
--- a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu
+++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu
@@ -29,11 +29,11 @@
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/lists/lists_column_view.hpp>
 #include <cudf/unary.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/advance.h>
@@ -1082,7 +1082,7 @@ std::unique_ptr<column> merge_tdigests(tdigest_column_view const& tdv,
                                               {order::ASCENDING},
                                               {},
                                               stream,
-                                              rmm::mr::get_current_device_resource());
+                                              cudf::get_current_device_resource_ref());
                  });
 
   // generate min and max values
@@ -1143,7 +1143,7 @@ std::unique_ptr<column> merge_tdigests(tdigest_column_view const& tdv,
                  std::back_inserter(tdigest_views),
                  [](std::unique_ptr<table> const& t) { return t->view(); });
   auto merged =
-    cudf::detail::concatenate(tdigest_views, stream, rmm::mr::get_current_device_resource());
+    cudf::detail::concatenate(tdigest_views, stream, cudf::get_current_device_resource_ref());
 
   // generate cumulative weights
   auto merged_weights     = merged->get_column(1).view();
@@ -1220,7 +1220,7 @@ std::unique_ptr<scalar> reduce_tdigest(column_view const& col,
   // order with nulls at the end.
   table_view t({col});
   auto sorted = cudf::detail::sort(
-    t, {order::ASCENDING}, {null_order::AFTER}, stream, rmm::mr::get_current_device_resource());
+    t, {order::ASCENDING}, {null_order::AFTER}, stream, cudf::get_current_device_resource_ref());
 
   auto const delta = max_centroids;
   return cudf::type_dispatcher(
diff --git a/cpp/src/reductions/all.cu b/cpp/src/reductions/all.cu
index 11b0e2732fe..67ea29a2cb1 100644
--- a/cpp/src/reductions/all.cu
+++ b/cpp/src/reductions/all.cu
@@ -18,8 +18,7 @@
 
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/reduction/detail/reduction_functions.hpp>
-
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <cuda/atomic>
 #include <thrust/for_each.h>
@@ -66,7 +65,7 @@ struct all_fn {
         cudf::dictionary::detail::make_dictionary_pair_iterator<T>(*d_dict, input.has_nulls());
       return thrust::make_transform_iterator(pair_iter, null_iter);
     }();
-    auto d_result = rmm::device_scalar<int32_t>(1, stream, rmm::mr::get_current_device_resource());
+    auto d_result = rmm::device_scalar<int32_t>(1, stream, cudf::get_current_device_resource_ref());
     thrust::for_each_n(rmm::exec_policy(stream),
                        thrust::make_counting_iterator<size_type>(0),
                        input.size(),
diff --git a/cpp/src/reductions/any.cu b/cpp/src/reductions/any.cu
index 0ebeb7a48b9..057f038c622 100644
--- a/cpp/src/reductions/any.cu
+++ b/cpp/src/reductions/any.cu
@@ -18,8 +18,7 @@
 
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/reduction/detail/reduction_functions.hpp>
-
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <cuda/atomic>
 #include <thrust/for_each.h>
@@ -66,7 +65,7 @@ struct any_fn {
         cudf::dictionary::detail::make_dictionary_pair_iterator<T>(*d_dict, input.has_nulls());
       return thrust::make_transform_iterator(pair_iter, null_iter);
     }();
-    auto d_result = rmm::device_scalar<int32_t>(0, stream, rmm::mr::get_current_device_resource());
+    auto d_result = rmm::device_scalar<int32_t>(0, stream, cudf::get_current_device_resource_ref());
     thrust::for_each_n(rmm::exec_policy(stream),
                        thrust::make_counting_iterator<size_type>(0),
                        input.size(),
diff --git a/cpp/src/reductions/collect_ops.cu b/cpp/src/reductions/collect_ops.cu
index c1a1f117ee1..01dfb8f2c7d 100644
--- a/cpp/src/reductions/collect_ops.cu
+++ b/cpp/src/reductions/collect_ops.cu
@@ -22,8 +22,7 @@
 #include <cudf/reduction/detail/reduction_functions.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
-
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace cudf {
 namespace reduction {
diff --git a/cpp/src/reductions/compound.cuh b/cpp/src/reductions/compound.cuh
index aa71546f049..6bc8b48832f 100644
--- a/cpp/src/reductions/compound.cuh
+++ b/cpp/src/reductions/compound.cuh
@@ -19,11 +19,10 @@
 #include <cudf/dictionary/detail/iterator.cuh>
 #include <cudf/reduction/detail/reduction.cuh>
 #include <cudf/scalar/scalar_factories.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
-#include <rmm/resource_ref.hpp>
-
 #include <thrust/iterator/transform_iterator.h>
 
 namespace cudf {
diff --git a/cpp/src/reductions/histogram.cu b/cpp/src/reductions/histogram.cu
index d49c0c6f0d2..362b5f74c46 100644
--- a/cpp/src/reductions/histogram.cu
+++ b/cpp/src/reductions/histogram.cu
@@ -20,8 +20,7 @@
 #include <cudf/detail/iterator.cuh>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/structs/structs_column_view.hpp>
-
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <cuda/atomic>
 #include <cuda/functional>
@@ -223,7 +222,7 @@ compute_row_frequencies(table_view const& input,
       partial_counts ? partial_counts.value().begin<histogram_count_type>() : nullptr},
     histogram_count_type{0},
     stream,
-    rmm::mr::get_current_device_resource());
+    cudf::get_current_device_resource_ref());
 
   auto const input_it = thrust::make_zip_iterator(
     thrust::make_tuple(thrust::make_counting_iterator(0), reduction_results.begin()));
diff --git a/cpp/src/reductions/max.cu b/cpp/src/reductions/max.cu
index 682889f0fee..0434d043240 100644
--- a/cpp/src/reductions/max.cu
+++ b/cpp/src/reductions/max.cu
@@ -18,9 +18,9 @@
 
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/reduction/detail/reduction_functions.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace reduction {
diff --git a/cpp/src/reductions/mean.cu b/cpp/src/reductions/mean.cu
index e8a10f02cc1..c5ab501f607 100644
--- a/cpp/src/reductions/mean.cu
+++ b/cpp/src/reductions/mean.cu
@@ -18,9 +18,9 @@
 
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/reduction/detail/reduction_functions.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace reduction {
diff --git a/cpp/src/reductions/min.cu b/cpp/src/reductions/min.cu
index 7986bda5751..26b91ebe868 100644
--- a/cpp/src/reductions/min.cu
+++ b/cpp/src/reductions/min.cu
@@ -18,8 +18,7 @@
 
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/reduction/detail/reduction_functions.hpp>
-
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace cudf {
 namespace reduction {
diff --git a/cpp/src/reductions/minmax.cu b/cpp/src/reductions/minmax.cu
index 6cb58786971..139de068050 100644
--- a/cpp/src/reductions/minmax.cu
+++ b/cpp/src/reductions/minmax.cu
@@ -24,9 +24,9 @@
 #include <cudf/reduction.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/extrema.h>
 #include <thrust/functional.h>
diff --git a/cpp/src/reductions/nested_type_minmax_util.cuh b/cpp/src/reductions/nested_type_minmax_util.cuh
index 3cf390d3574..6a2c4c44553 100644
--- a/cpp/src/reductions/nested_type_minmax_util.cuh
+++ b/cpp/src/reductions/nested_type_minmax_util.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@
 #include <cudf/reduction/detail/reduction_operators.cuh>
 #include <cudf/table/experimental/row_operators.cuh>
 #include <cudf/table/table_view.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace cudf {
 namespace reduction {
@@ -104,7 +105,7 @@ class comparison_binop_generator {
         std::vector<null_order>{DEFAULT_NULL_ORDER},
         cudf::structs::detail::column_nullability::MATCH_INCOMING,
         stream,
-        rmm::mr::get_current_device_resource())},
+        cudf::get_current_device_resource_ref())},
       row_comparator{[&input_,
                       &input_tview     = input_tview,
                       &flattened_input = flattened_input,
diff --git a/cpp/src/reductions/nth_element.cu b/cpp/src/reductions/nth_element.cu
index e266f477c5d..4f6198696bd 100644
--- a/cpp/src/reductions/nth_element.cu
+++ b/cpp/src/reductions/nth_element.cu
@@ -19,11 +19,11 @@
 #include <cudf/detail/copy.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/reduction/detail/reduction_functions.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/binary_search.h>
diff --git a/cpp/src/reductions/product.cu b/cpp/src/reductions/product.cu
index 28ff8db3708..f5fd735a9f4 100644
--- a/cpp/src/reductions/product.cu
+++ b/cpp/src/reductions/product.cu
@@ -18,9 +18,9 @@
 
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/reduction/detail/reduction_functions.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace reduction {
diff --git a/cpp/src/reductions/reductions.cpp b/cpp/src/reductions/reductions.cpp
index d4ea84742c7..d187375b69f 100644
--- a/cpp/src/reductions/reductions.cpp
+++ b/cpp/src/reductions/reductions.cpp
@@ -29,10 +29,10 @@
 #include <cudf/structs/structs_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <utility>
 
@@ -78,7 +78,7 @@ struct reduce_dispatch_functor {
         return standard_deviation(col, output_dtype, var_agg._ddof, stream, mr);
       }
       case aggregation::MEDIAN: {
-        auto current_mr     = rmm::mr::get_current_device_resource();
+        auto current_mr     = cudf::get_current_device_resource_ref();
         auto sorted_indices = cudf::detail::sorted_order(
           table_view{{col}}, {}, {null_order::AFTER}, stream, current_mr);
         auto valid_sorted_indices =
@@ -91,7 +91,7 @@ struct reduce_dispatch_functor {
         auto quantile_agg = static_cast<cudf::detail::quantile_aggregation const&>(agg);
         CUDF_EXPECTS(quantile_agg._quantiles.size() == 1,
                      "Reduction quantile accepts only one quantile value");
-        auto current_mr     = rmm::mr::get_current_device_resource();
+        auto current_mr     = cudf::get_current_device_resource_ref();
         auto sorted_indices = cudf::detail::sorted_order(
           table_view{{col}}, {}, {null_order::AFTER}, stream, current_mr);
         auto valid_sorted_indices =
diff --git a/cpp/src/reductions/scan/rank_scan.cu b/cpp/src/reductions/scan/rank_scan.cu
index 0dbfc271a25..6d0adc83359 100644
--- a/cpp/src/reductions/scan/rank_scan.cu
+++ b/cpp/src/reductions/scan/rank_scan.cu
@@ -21,10 +21,10 @@
 #include <cudf/detail/structs/utilities.hpp>
 #include <cudf/detail/utilities/device_operators.cuh>
 #include <cudf/table/experimental/row_operators.cuh>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/scan.h>
 #include <thrust/tabulate.h>
@@ -135,7 +135,7 @@ std::unique_ptr<column> inclusive_one_normalized_percent_rank_scan(
   column_view const& order_by, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr)
 {
   auto const rank_column =
-    inclusive_rank_scan(order_by, stream, rmm::mr::get_current_device_resource());
+    inclusive_rank_scan(order_by, stream, cudf::get_current_device_resource_ref());
   auto const rank_view = rank_column->view();
 
   // Result type for min 0-index percent rank is independent of input type.
diff --git a/cpp/src/reductions/scan/scan.cpp b/cpp/src/reductions/scan/scan.cpp
index de4dcf1de52..d3c0b54f286 100644
--- a/cpp/src/reductions/scan/scan.cpp
+++ b/cpp/src/reductions/scan/scan.cpp
@@ -20,8 +20,7 @@
 #include <cudf/detail/scan.hpp>
 #include <cudf/reduction.hpp>
 #include <cudf/utilities/default_stream.hpp>
-
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace cudf {
 
diff --git a/cpp/src/reductions/scan/scan.cuh b/cpp/src/reductions/scan/scan.cuh
index 6c237741ac3..76f98fe9a28 100644
--- a/cpp/src/reductions/scan/scan.cuh
+++ b/cpp/src/reductions/scan/scan.cuh
@@ -20,10 +20,10 @@
 #include <cudf/detail/utilities/device_operators.cuh>
 #include <cudf/reduction.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <utility>
 
diff --git a/cpp/src/reductions/scan/scan_exclusive.cu b/cpp/src/reductions/scan/scan_exclusive.cu
index 7224bf47390..38ed0a68901 100644
--- a/cpp/src/reductions/scan/scan_exclusive.cu
+++ b/cpp/src/reductions/scan/scan_exclusive.cu
@@ -23,10 +23,10 @@
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/utilities/cast_functor.cuh>
 #include <cudf/null_mask.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/scan.h>
diff --git a/cpp/src/reductions/scan/scan_inclusive.cu b/cpp/src/reductions/scan/scan_inclusive.cu
index ee35d716d6e..a876d54d45f 100644
--- a/cpp/src/reductions/scan/scan_inclusive.cu
+++ b/cpp/src/reductions/scan/scan_inclusive.cu
@@ -28,10 +28,10 @@
 #include <cudf/reduction.hpp>
 #include <cudf/strings/detail/scan.hpp>
 #include <cudf/structs/detail/scan.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/find.h>
 #include <thrust/functional.h>
diff --git a/cpp/src/reductions/segmented/all.cu b/cpp/src/reductions/segmented/all.cu
index 489fc6a283c..e59e6a6896b 100644
--- a/cpp/src/reductions/segmented/all.cu
+++ b/cpp/src/reductions/segmented/all.cu
@@ -17,8 +17,7 @@
 #include "simple.cuh"
 
 #include <cudf/reduction/detail/reduction_functions.hpp>
-
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace cudf {
 namespace reduction {
diff --git a/cpp/src/reductions/segmented/any.cu b/cpp/src/reductions/segmented/any.cu
index a9a8528548a..444ab689c39 100644
--- a/cpp/src/reductions/segmented/any.cu
+++ b/cpp/src/reductions/segmented/any.cu
@@ -17,8 +17,7 @@
 #include "simple.cuh"
 
 #include <cudf/reduction/detail/reduction_functions.hpp>
-
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace cudf {
 namespace reduction {
diff --git a/cpp/src/reductions/segmented/compound.cuh b/cpp/src/reductions/segmented/compound.cuh
index 035a8bdcd75..77fabbe485f 100644
--- a/cpp/src/reductions/segmented/compound.cuh
+++ b/cpp/src/reductions/segmented/compound.cuh
@@ -22,11 +22,10 @@
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/null_mask.cuh>
 #include <cudf/reduction/detail/segmented_reduction.cuh>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
-#include <rmm/resource_ref.hpp>
-
 #include <thrust/adjacent_difference.h>
 #include <thrust/iterator/transform_iterator.h>
 
@@ -73,7 +72,7 @@ std::unique_ptr<column> compound_segmented_reduction(column_view const& col,
                                               offsets,
                                               null_handling,
                                               stream,
-                                              rmm::mr::get_current_device_resource());
+                                              cudf::get_current_device_resource_ref());
 
   // Run segmented reduction
   if (col.has_nulls()) {
diff --git a/cpp/src/reductions/segmented/counts.cu b/cpp/src/reductions/segmented/counts.cu
index 79737828678..5a072d6ca0a 100644
--- a/cpp/src/reductions/segmented/counts.cu
+++ b/cpp/src/reductions/segmented/counts.cu
@@ -17,8 +17,7 @@
 #include "counts.hpp"
 
 #include <cudf/detail/null_mask.cuh>
-
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <thrust/adjacent_difference.h>
 
diff --git a/cpp/src/reductions/segmented/counts.hpp b/cpp/src/reductions/segmented/counts.hpp
index f249644e564..c3f3e935f9a 100644
--- a/cpp/src/reductions/segmented/counts.hpp
+++ b/cpp/src/reductions/segmented/counts.hpp
@@ -17,11 +17,11 @@
 #pragma once
 
 #include <cudf/types.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 class column_device_view;
diff --git a/cpp/src/reductions/segmented/max.cu b/cpp/src/reductions/segmented/max.cu
index 1c79edcc08c..49d0fe5f01c 100644
--- a/cpp/src/reductions/segmented/max.cu
+++ b/cpp/src/reductions/segmented/max.cu
@@ -17,8 +17,7 @@
 #include "simple.cuh"
 
 #include <cudf/reduction/detail/reduction_functions.hpp>
-
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace cudf {
 namespace reduction {
diff --git a/cpp/src/reductions/segmented/mean.cu b/cpp/src/reductions/segmented/mean.cu
index 8df6bee97e9..a9919086c8d 100644
--- a/cpp/src/reductions/segmented/mean.cu
+++ b/cpp/src/reductions/segmented/mean.cu
@@ -17,9 +17,9 @@
 #include "compound.cuh"
 
 #include <cudf/reduction/detail/segmented_reduction_functions.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace reduction {
diff --git a/cpp/src/reductions/segmented/min.cu b/cpp/src/reductions/segmented/min.cu
index ae1d5ae42a4..052c81bc2c7 100644
--- a/cpp/src/reductions/segmented/min.cu
+++ b/cpp/src/reductions/segmented/min.cu
@@ -17,8 +17,7 @@
 #include "simple.cuh"
 
 #include <cudf/reduction/detail/reduction_functions.hpp>
-
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace cudf {
 namespace reduction {
diff --git a/cpp/src/reductions/segmented/nunique.cu b/cpp/src/reductions/segmented/nunique.cu
index d4fcf89e161..9b7e6f9fe57 100644
--- a/cpp/src/reductions/segmented/nunique.cu
+++ b/cpp/src/reductions/segmented/nunique.cu
@@ -24,9 +24,9 @@
 #include <cudf/table/experimental/row_operators.cuh>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/transform.h>
diff --git a/cpp/src/reductions/segmented/product.cu b/cpp/src/reductions/segmented/product.cu
index 1b82e7e5aec..84e54ce6b6c 100644
--- a/cpp/src/reductions/segmented/product.cu
+++ b/cpp/src/reductions/segmented/product.cu
@@ -17,8 +17,7 @@
 #include "simple.cuh"
 
 #include <cudf/reduction/detail/reduction_functions.hpp>
-
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace cudf {
 namespace reduction {
diff --git a/cpp/src/reductions/segmented/reductions.cpp b/cpp/src/reductions/segmented/reductions.cpp
index e6de065dabb..40d1d8a0a53 100644
--- a/cpp/src/reductions/segmented/reductions.cpp
+++ b/cpp/src/reductions/segmented/reductions.cpp
@@ -22,10 +22,10 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace reduction {
diff --git a/cpp/src/reductions/segmented/simple.cuh b/cpp/src/reductions/segmented/simple.cuh
index da59df6b314..6c35e750e6b 100644
--- a/cpp/src/reductions/segmented/simple.cuh
+++ b/cpp/src/reductions/segmented/simple.cuh
@@ -28,12 +28,12 @@
 #include <cudf/detail/valid_if.cuh>
 #include <cudf/reduction/detail/segmented_reduction.cuh>
 #include <cudf/types.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/iterator/counting_iterator.h>
@@ -243,7 +243,7 @@ std::unique_ptr<column> fixed_point_segmented_reduction(
                                                   offsets,
                                                   null_policy::EXCLUDE,  // do not count nulls
                                                   stream,
-                                                  rmm::mr::get_current_device_resource());
+                                                  cudf::get_current_device_resource_ref());
 
       auto const max_count = thrust::reduce(rmm::exec_policy(stream),
                                             counts.begin(),
diff --git a/cpp/src/reductions/segmented/std.cu b/cpp/src/reductions/segmented/std.cu
index 0a7eb007f68..1d1a26e5176 100644
--- a/cpp/src/reductions/segmented/std.cu
+++ b/cpp/src/reductions/segmented/std.cu
@@ -17,9 +17,9 @@
 #include "compound.cuh"
 
 #include <cudf/reduction/detail/segmented_reduction_functions.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace reduction {
diff --git a/cpp/src/reductions/segmented/sum.cu b/cpp/src/reductions/segmented/sum.cu
index bb06f6d7c8e..220148a7841 100644
--- a/cpp/src/reductions/segmented/sum.cu
+++ b/cpp/src/reductions/segmented/sum.cu
@@ -17,8 +17,7 @@
 #include "simple.cuh"
 
 #include <cudf/reduction/detail/reduction_functions.hpp>
-
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace cudf {
 namespace reduction {
diff --git a/cpp/src/reductions/segmented/sum_of_squares.cu b/cpp/src/reductions/segmented/sum_of_squares.cu
index 25d52f9bc79..6f3c1abd942 100644
--- a/cpp/src/reductions/segmented/sum_of_squares.cu
+++ b/cpp/src/reductions/segmented/sum_of_squares.cu
@@ -17,9 +17,9 @@
 #include "simple.cuh"
 
 #include <cudf/reduction/detail/segmented_reduction_functions.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace reduction {
diff --git a/cpp/src/reductions/segmented/update_validity.cu b/cpp/src/reductions/segmented/update_validity.cu
index 92cfe5417ef..f0c3f0a0f0b 100644
--- a/cpp/src/reductions/segmented/update_validity.cu
+++ b/cpp/src/reductions/segmented/update_validity.cu
@@ -18,10 +18,9 @@
 
 #include <cudf/detail/null_mask.cuh>
 #include <cudf/scalar/scalar.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
-#include <rmm/resource_ref.hpp>
-
 namespace cudf {
 namespace reduction {
 namespace detail {
diff --git a/cpp/src/reductions/segmented/update_validity.hpp b/cpp/src/reductions/segmented/update_validity.hpp
index c143e1a4761..d60be8e92f4 100644
--- a/cpp/src/reductions/segmented/update_validity.hpp
+++ b/cpp/src/reductions/segmented/update_validity.hpp
@@ -19,10 +19,10 @@
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_view.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <optional>
 
diff --git a/cpp/src/reductions/segmented/var.cu b/cpp/src/reductions/segmented/var.cu
index 35f2771dfcf..f70943c19fc 100644
--- a/cpp/src/reductions/segmented/var.cu
+++ b/cpp/src/reductions/segmented/var.cu
@@ -17,9 +17,9 @@
 #include "compound.cuh"
 
 #include <cudf/reduction/detail/segmented_reduction_functions.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace reduction {
diff --git a/cpp/src/reductions/simple.cuh b/cpp/src/reductions/simple.cuh
index 372ceccf60b..e897deee8a3 100644
--- a/cpp/src/reductions/simple.cuh
+++ b/cpp/src/reductions/simple.cuh
@@ -27,12 +27,12 @@
 #include <cudf/scalar/scalar_device_view.cuh>
 #include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/structs/struct_view.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
@@ -344,7 +344,7 @@ struct same_element_type_dispatcher {
       dictionary_column_view(col).get_indices_annotated(),
       init,
       stream,
-      rmm::mr::get_current_device_resource());
+      cudf::get_current_device_resource_ref());
     return resolve_key<ElementType>(dictionary_column_view(col).keys(), *index, stream, mr);
   }
 
diff --git a/cpp/src/reductions/std.cu b/cpp/src/reductions/std.cu
index 9c78b35313b..38076b52b14 100644
--- a/cpp/src/reductions/std.cu
+++ b/cpp/src/reductions/std.cu
@@ -18,9 +18,9 @@
 
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/reduction/detail/reduction_functions.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace reduction {
diff --git a/cpp/src/reductions/sum.cu b/cpp/src/reductions/sum.cu
index 51b251a836e..898eadb8435 100644
--- a/cpp/src/reductions/sum.cu
+++ b/cpp/src/reductions/sum.cu
@@ -18,9 +18,9 @@
 
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/reduction/detail/reduction_functions.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace reduction {
diff --git a/cpp/src/reductions/sum_of_squares.cu b/cpp/src/reductions/sum_of_squares.cu
index dc0eae56e98..49917f3009e 100644
--- a/cpp/src/reductions/sum_of_squares.cu
+++ b/cpp/src/reductions/sum_of_squares.cu
@@ -18,9 +18,9 @@
 
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/reduction/detail/reduction_functions.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace reduction {
diff --git a/cpp/src/reductions/var.cu b/cpp/src/reductions/var.cu
index aaab9dd4604..0e7b2fea9f8 100644
--- a/cpp/src/reductions/var.cu
+++ b/cpp/src/reductions/var.cu
@@ -18,9 +18,9 @@
 
 #include <cudf/dictionary/dictionary_column_view.hpp>
 #include <cudf/reduction/detail/reduction_functions.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace reduction {
diff --git a/cpp/src/replace/clamp.cu b/cpp/src/replace/clamp.cu
index cb3caf9d068..7f605f08d8d 100644
--- a/cpp/src/replace/clamp.cu
+++ b/cpp/src/replace/clamp.cu
@@ -34,11 +34,11 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/for_each.h>
@@ -258,7 +258,7 @@ std::unique_ptr<column> dispatch_clamp::operator()<cudf::dictionary32>(
     return result;
   }();
   auto matched_view = dictionary_column_view(matched_column->view());
-  auto default_mr   = rmm::mr::get_current_device_resource();
+  auto default_mr   = cudf::get_current_device_resource_ref();
 
   // get the indexes for lo_replace and for hi_replace
   auto lo_replace_index =
diff --git a/cpp/src/replace/nans.cu b/cpp/src/replace/nans.cu
index eba6f6b436e..394c2a2de80 100644
--- a/cpp/src/replace/nans.cu
+++ b/cpp/src/replace/nans.cu
@@ -24,11 +24,11 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/transform.h>
diff --git a/cpp/src/replace/nulls.cu b/cpp/src/replace/nulls.cu
index 13e130588c1..1df1549432f 100644
--- a/cpp/src/replace/nulls.cu
+++ b/cpp/src/replace/nulls.cu
@@ -37,13 +37,13 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_checks.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/functional.h>
 #include <thrust/iterator/counting_iterator.h>
diff --git a/cpp/src/replace/replace.cu b/cpp/src/replace/replace.cu
index c2cd03cd761..86ec8cfc91e 100644
--- a/cpp/src/replace/replace.cu
+++ b/cpp/src/replace/replace.cu
@@ -48,12 +48,12 @@
 #include <cudf/strings/detail/replace.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/type_checks.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_scalar.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/distance.h>
 #include <thrust/execution_policy.h>
@@ -262,14 +262,14 @@ std::unique_ptr<cudf::column> replace_kernel_forwarder::operator()<cudf::diction
     auto new_keys = cudf::detail::concatenate(
       std::vector<cudf::column_view>({values.keys(), replacements.keys()}),
       stream,
-      rmm::mr::get_current_device_resource());
+      cudf::get_current_device_resource_ref());
     return cudf::dictionary::detail::add_keys(input, new_keys->view(), stream, mr);
   }();
   auto matched_view   = cudf::dictionary_column_view(matched_input->view());
   auto matched_values = cudf::dictionary::detail::set_keys(
-    values, matched_view.keys(), stream, rmm::mr::get_current_device_resource());
+    values, matched_view.keys(), stream, cudf::get_current_device_resource_ref());
   auto matched_replacements = cudf::dictionary::detail::set_keys(
-    replacements, matched_view.keys(), stream, rmm::mr::get_current_device_resource());
+    replacements, matched_view.keys(), stream, cudf::get_current_device_resource_ref());
 
   auto indices_type = matched_view.indices().type();
   auto new_indices  = cudf::type_dispatcher<cudf::dispatch_storage_type>(
diff --git a/cpp/src/reshape/byte_cast.cu b/cpp/src/reshape/byte_cast.cu
index 2a03a5504c1..0526594cbef 100644
--- a/cpp/src/reshape/byte_cast.cu
+++ b/cpp/src/reshape/byte_cast.cu
@@ -23,11 +23,11 @@
 #include <cudf/reshape.hpp>
 #include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/copy.h>
 #include <thrust/for_each.h>
diff --git a/cpp/src/reshape/interleave_columns.cu b/cpp/src/reshape/interleave_columns.cu
index 7473b6045af..6c47d6f2216 100644
--- a/cpp/src/reshape/interleave_columns.cu
+++ b/cpp/src/reshape/interleave_columns.cu
@@ -29,10 +29,10 @@
 #include <cudf/table/table_device_view.cuh>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/for_each.h>
diff --git a/cpp/src/reshape/tile.cu b/cpp/src/reshape/tile.cu
index 3d4fb73c000..45c40df3aeb 100644
--- a/cpp/src/reshape/tile.cu
+++ b/cpp/src/reshape/tile.cu
@@ -24,9 +24,9 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
diff --git a/cpp/src/rolling/detail/lead_lag_nested.cuh b/cpp/src/rolling/detail/lead_lag_nested.cuh
index cfedcac8ae4..5d5fe9e4aa3 100644
--- a/cpp/src/rolling/detail/lead_lag_nested.cuh
+++ b/cpp/src/rolling/detail/lead_lag_nested.cuh
@@ -24,12 +24,11 @@
 #include <cudf/detail/gather.hpp>
 #include <cudf/detail/scatter.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/exec_policy.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/binary_search.h>
@@ -200,7 +199,7 @@ std::unique_ptr<column> compute_lead_lag_for_nested(aggregation::Kind op,
                          out_of_bounds_policy::DONT_CHECK,
                          cudf::detail::negative_index_policy::NOT_ALLOWED,
                          stream,
-                         rmm::mr::get_current_device_resource());
+                         cudf::get_current_device_resource_ref());
 
   // Scatter defaults into locations where LEAD/LAG computed nulls.
   auto scattered_results = cudf::detail::scatter(
diff --git a/cpp/src/rolling/detail/nth_element.cuh b/cpp/src/rolling/detail/nth_element.cuh
index 571f4c02cb5..ce1e666d5a0 100644
--- a/cpp/src/rolling/detail/nth_element.cuh
+++ b/cpp/src/rolling/detail/nth_element.cuh
@@ -21,9 +21,9 @@
 #include <cudf/detail/gather.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/utilities/bit.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/copy.h>
 #include <thrust/execution_policy.h>
diff --git a/cpp/src/rolling/detail/optimized_unbounded_window.cpp b/cpp/src/rolling/detail/optimized_unbounded_window.cpp
index 4175c6e34c1..72c23395a93 100644
--- a/cpp/src/rolling/detail/optimized_unbounded_window.cpp
+++ b/cpp/src/rolling/detail/optimized_unbounded_window.cpp
@@ -25,8 +25,7 @@
 #include <cudf/types.hpp>
 #include <cudf/unary.hpp>
 #include <cudf/utilities/default_stream.hpp>
-
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace cudf::detail {
 
@@ -143,7 +142,7 @@ std::unique_ptr<column> reduction_based_rolling_window(column_view const& input,
                                              return_dtype,
                                              std::nullopt,
                                              stream,
-                                             rmm::mr::get_current_device_resource());
+                                             cudf::get_current_device_resource_ref());
     }
   }();
   // Blow up results into separate column.
diff --git a/cpp/src/rolling/detail/optimized_unbounded_window.hpp b/cpp/src/rolling/detail/optimized_unbounded_window.hpp
index 153586b187f..5adba764e9d 100644
--- a/cpp/src/rolling/detail/optimized_unbounded_window.hpp
+++ b/cpp/src/rolling/detail/optimized_unbounded_window.hpp
@@ -16,9 +16,9 @@
 
 #include <cudf/column/column.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace rmm::mr {
 class device_memory_resource;
diff --git a/cpp/src/rolling/detail/rolling.cuh b/cpp/src/rolling/detail/rolling.cuh
index c18bb9d9885..528700137bf 100644
--- a/cpp/src/rolling/detail/rolling.cuh
+++ b/cpp/src/rolling/detail/rolling.cuh
@@ -44,13 +44,13 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/bit.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/std/climits>
 #include <cuda/std/limits>
@@ -928,7 +928,7 @@ class rolling_aggregation_postprocessor final : public cudf::detail::aggregation
                                                      min_periods,
                                                      agg._null_handling,
                                                      stream,
-                                                     rmm::mr::get_current_device_resource());
+                                                     cudf::get_current_device_resource_ref());
 
     result = lists::detail::distinct(
       lists_column_view{collected_list->view()}, agg._nulls_equal, agg._nans_equal, stream, mr);
diff --git a/cpp/src/rolling/detail/rolling.hpp b/cpp/src/rolling/detail/rolling.hpp
index 2624d982712..8820a6264e7 100644
--- a/cpp/src/rolling/detail/rolling.hpp
+++ b/cpp/src/rolling/detail/rolling.hpp
@@ -18,10 +18,9 @@
 
 #include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/detail/utilities/device_operators.cuh>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 
-#include <rmm/resource_ref.hpp>
-
 namespace cudf {
 // helper functions - used in the rolling window implementation and tests
 
diff --git a/cpp/src/rolling/detail/rolling_collect_list.cu b/cpp/src/rolling/detail/rolling_collect_list.cu
index b259bd51fc4..8a98b65b406 100644
--- a/cpp/src/rolling/detail/rolling_collect_list.cu
+++ b/cpp/src/rolling/detail/rolling_collect_list.cu
@@ -18,9 +18,9 @@
 
 #include <cudf/detail/get_value.cuh>
 #include <cudf/detail/iterator.cuh>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/device_uvector.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/copy.h>
 #include <thrust/count.h>
diff --git a/cpp/src/rolling/detail/rolling_collect_list.cuh b/cpp/src/rolling/detail/rolling_collect_list.cuh
index 7630898f820..f3eff6b0689 100644
--- a/cpp/src/rolling/detail/rolling_collect_list.cuh
+++ b/cpp/src/rolling/detail/rolling_collect_list.cuh
@@ -21,10 +21,10 @@
 #include <cudf/detail/gather.hpp>
 #include <cudf/detail/valid_if.cuh>
 #include <cudf/strings/detail/strings_children.cuh>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/extrema.h>
diff --git a/cpp/src/rolling/detail/rolling_fixed_window.cu b/cpp/src/rolling/detail/rolling_fixed_window.cu
index df0e72748ce..23424da13cd 100644
--- a/cpp/src/rolling/detail/rolling_fixed_window.cu
+++ b/cpp/src/rolling/detail/rolling_fixed_window.cu
@@ -20,8 +20,7 @@
 
 #include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/utilities/default_stream.hpp>
-
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <cuda/functional>
 #include <thrust/extrema.h>
diff --git a/cpp/src/rolling/detail/rolling_variable_window.cu b/cpp/src/rolling/detail/rolling_variable_window.cu
index 83e8faec291..c2324947ef6 100644
--- a/cpp/src/rolling/detail/rolling_variable_window.cu
+++ b/cpp/src/rolling/detail/rolling_variable_window.cu
@@ -18,8 +18,7 @@
 
 #include <cudf/detail/aggregation/aggregation.hpp>
 #include <cudf/utilities/default_stream.hpp>
-
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <cuda/functional>
 #include <thrust/extrema.h>
diff --git a/cpp/src/rolling/grouped_rolling.cu b/cpp/src/rolling/grouped_rolling.cu
index 1158bf22494..ac6c7b11ef5 100644
--- a/cpp/src/rolling/grouped_rolling.cu
+++ b/cpp/src/rolling/grouped_rolling.cu
@@ -28,8 +28,7 @@
 #include <cudf/types.hpp>
 #include <cudf/unary.hpp>
 #include <cudf/utilities/default_stream.hpp>
-
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <cuda/functional>
 #include <thrust/binary_search.h>
@@ -605,9 +604,9 @@ get_null_bounds_for_orderby_column(column_view const& orderby_column,
 
     // When there are no nulls, just copy the input group offsets to the output.
     return std::make_tuple(cudf::detail::make_device_uvector_async(
-                             group_offsets_span, stream, rmm::mr::get_current_device_resource()),
+                             group_offsets_span, stream, cudf::get_current_device_resource_ref()),
                            cudf::detail::make_device_uvector_async(
-                             group_offsets_span, stream, rmm::mr::get_current_device_resource()));
+                             group_offsets_span, stream, cudf::get_current_device_resource_ref()));
   }
 }
 
diff --git a/cpp/src/rolling/rolling.cu b/cpp/src/rolling/rolling.cu
index 5dff40a3396..651bf26b8d9 100644
--- a/cpp/src/rolling/rolling.cu
+++ b/cpp/src/rolling/rolling.cu
@@ -20,8 +20,7 @@
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/rolling.hpp>
 #include <cudf/utilities/default_stream.hpp>
-
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace cudf {
 
diff --git a/cpp/src/round/round.cu b/cpp/src/round/round.cu
index 369ed039b66..8988d73fb02 100644
--- a/cpp/src/round/round.cu
+++ b/cpp/src/round/round.cu
@@ -30,11 +30,11 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/transform.h>
 #include <thrust/uninitialized_fill.h>
diff --git a/cpp/src/scalar/scalar.cpp b/cpp/src/scalar/scalar.cpp
index 83209c55c8a..31535198c58 100644
--- a/cpp/src/scalar/scalar.cpp
+++ b/cpp/src/scalar/scalar.cpp
@@ -21,10 +21,10 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/string_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 
@@ -591,7 +591,7 @@ table struct_scalar::init_data(table&& data,
 
   // push validity mask down
   auto const validity = cudf::detail::create_null_mask(
-    1, mask_state::ALL_NULL, stream, rmm::mr::get_current_device_resource());
+    1, mask_state::ALL_NULL, stream, cudf::get_current_device_resource_ref());
   for (auto& col : data_cols) {
     col = cudf::structs::detail::superimpose_nulls(
       static_cast<bitmask_type const*>(validity.data()), 1, std::move(col), stream, mr);
diff --git a/cpp/src/scalar/scalar_factories.cpp b/cpp/src/scalar/scalar_factories.cpp
index d59c5c9fc85..656fe61fbbe 100644
--- a/cpp/src/scalar/scalar_factories.cpp
+++ b/cpp/src/scalar/scalar_factories.cpp
@@ -19,11 +19,11 @@
 #include <cudf/null_mask.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace {
diff --git a/cpp/src/search/contains_column.cu b/cpp/src/search/contains_column.cu
index 57f2c59de40..5d21e8f662c 100644
--- a/cpp/src/search/contains_column.cu
+++ b/cpp/src/search/contains_column.cu
@@ -21,9 +21,9 @@
 #include <cudf/dictionary/detail/update_keys.hpp>
 #include <cudf/search.hpp>
 #include <cudf/table/table_view.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace detail {
@@ -59,10 +59,10 @@ std::unique_ptr<column> contains_column_dispatch::operator()<dictionary32>(
   dictionary_column_view const needles(needles_in);
   // first combine keys so both dictionaries have the same set
   auto needles_matched = dictionary::detail::add_keys(
-    needles, haystack.keys(), stream, rmm::mr::get_current_device_resource());
+    needles, haystack.keys(), stream, cudf::get_current_device_resource_ref());
   auto const needles_view = dictionary_column_view(needles_matched->view());
   auto haystack_matched   = dictionary::detail::set_keys(
-    haystack, needles_view.keys(), stream, rmm::mr::get_current_device_resource());
+    haystack, needles_view.keys(), stream, cudf::get_current_device_resource_ref());
   auto const haystack_view = dictionary_column_view(haystack_matched->view());
 
   // now just use the indices for the contains
diff --git a/cpp/src/search/contains_scalar.cu b/cpp/src/search/contains_scalar.cu
index 2aa9e24174b..21f2d601d6b 100644
--- a/cpp/src/search/contains_scalar.cu
+++ b/cpp/src/search/contains_scalar.cu
@@ -27,6 +27,7 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -146,7 +147,7 @@ bool contains_scalar_dispatch::operator()<cudf::dictionary32>(column_view const&
   auto const dict_col = cudf::dictionary_column_view(haystack);
   // first, find the needle in the dictionary's key set
   auto const index = cudf::dictionary::detail::get_index(
-    dict_col, needle, stream, rmm::mr::get_current_device_resource());
+    dict_col, needle, stream, cudf::get_current_device_resource_ref());
   // if found, check the index is actually in the indices column
   return index->is_valid(stream) && cudf::type_dispatcher(dict_col.indices().type(),
                                                           contains_scalar_dispatch{},
diff --git a/cpp/src/search/contains_table.cu b/cpp/src/search/contains_table.cu
index 66cefd0aa2f..2f6d23b7f7d 100644
--- a/cpp/src/search/contains_table.cu
+++ b/cpp/src/search/contains_table.cu
@@ -23,11 +23,11 @@
 #include <cudf/table/experimental/row_operators.cuh>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuco/static_set.cuh>
 #include <cuda/functional>
@@ -119,7 +119,7 @@ std::pair<rmm::device_buffer, bitmask_type const*> build_row_bitmask(table_view
   if (nullable_columns.size() > 1) {
     auto row_bitmask =
       cudf::detail::bitmask_and(
-        table_view{nullable_columns}, stream, rmm::mr::get_current_device_resource())
+        table_view{nullable_columns}, stream, cudf::get_current_device_resource_ref())
         .first;
     auto const row_bitmask_ptr = static_cast<bitmask_type const*>(row_bitmask.data());
     return std::pair(std::move(row_bitmask), row_bitmask_ptr);
diff --git a/cpp/src/search/search_ordered.cu b/cpp/src/search/search_ordered.cu
index 80651a4ec44..ac93e24b254 100644
--- a/cpp/src/search/search_ordered.cu
+++ b/cpp/src/search/search_ordered.cu
@@ -23,10 +23,10 @@
 #include <cudf/table/table_device_view.cuh>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/binary_search.h>
 
@@ -64,7 +64,7 @@ std::unique_ptr<column> search_ordered(table_view const& haystack,
   // This utility will ensure all corresponding dictionary columns have matching keys.
   // It will return any new dictionary columns created as well as updated table_views.
   auto const matched = dictionary::detail::match_dictionaries(
-    {haystack, needles}, stream, rmm::mr::get_current_device_resource());
+    {haystack, needles}, stream, cudf::get_current_device_resource_ref());
   auto const& matched_haystack = matched.second.front();
   auto const& matched_needles  = matched.second.back();
 
diff --git a/cpp/src/sort/rank.cu b/cpp/src/sort/rank.cu
index c5dcc7c240d..cbde87198bd 100644
--- a/cpp/src/sort/rank.cu
+++ b/cpp/src/sort/rank.cu
@@ -27,10 +27,10 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <cuda/std/type_traits>
diff --git a/cpp/src/sort/segmented_sort.cu b/cpp/src/sort/segmented_sort.cu
index 408ac29b8a9..5dc5c39f2bc 100644
--- a/cpp/src/sort/segmented_sort.cu
+++ b/cpp/src/sort/segmented_sort.cu
@@ -20,10 +20,10 @@
 #include <cudf/detail/sorting.hpp>
 #include <cudf/sorting.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/binary_search.h>
 #include <thrust/iterator/counting_iterator.h>
diff --git a/cpp/src/sort/segmented_sort_impl.cuh b/cpp/src/sort/segmented_sort_impl.cuh
index 281fdfa6b8f..a397d4c6630 100644
--- a/cpp/src/sort/segmented_sort_impl.cuh
+++ b/cpp/src/sort/segmented_sort_impl.cuh
@@ -23,11 +23,10 @@
 #include <cudf/detail/gather.hpp>
 #include <cudf/detail/sequence.hpp>
 #include <cudf/detail/sorting.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cub/device/device_segmented_sort.cuh>
 
@@ -77,7 +76,7 @@ struct column_fast_sort_fn {
                                                 input.size(),
                                                 mask_allocation_policy::NEVER,
                                                 stream,
-                                                rmm::mr::get_current_device_resource());
+                                                cudf::get_current_device_resource_ref());
     mutable_column_view output_view = temp_col->mutable_view();
     auto temp_indices               = cudf::column(
       cudf::column_view(indices.type(), indices.size(), indices.head(), nullptr, 0), stream);
@@ -311,12 +310,13 @@ std::unique_ptr<table> segmented_sort_by_key_common(table_view const& values,
 {
   CUDF_EXPECTS(values.num_rows() == keys.num_rows(),
                "Mismatch in number of rows for values and keys");
-  auto sorted_order = segmented_sorted_order_common<method>(keys,
-                                                            segment_offsets,
-                                                            column_order,
-                                                            null_precedence,
-                                                            stream,
-                                                            rmm::mr::get_current_device_resource());
+  auto sorted_order =
+    segmented_sorted_order_common<method>(keys,
+                                          segment_offsets,
+                                          column_order,
+                                          null_precedence,
+                                          stream,
+                                          cudf::get_current_device_resource_ref());
   // Gather segmented sort of child value columns
   return detail::gather(values,
                         sorted_order->view(),
diff --git a/cpp/src/sort/sort.cu b/cpp/src/sort/sort.cu
index 7216bc99e08..ac6fef17952 100644
--- a/cpp/src/sort/sort.cu
+++ b/cpp/src/sort/sort.cu
@@ -24,9 +24,9 @@
 #include <cudf/sorting.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/functional.h>
 #include <thrust/sort.h>
@@ -53,7 +53,7 @@ std::unique_ptr<table> sort_by_key(table_view const& values,
                "Mismatch in number of rows for values and keys");
 
   auto sorted_order = detail::sorted_order(
-    keys, column_order, null_precedence, stream, rmm::mr::get_current_device_resource());
+    keys, column_order, null_precedence, stream, cudf::get_current_device_resource_ref());
 
   return detail::gather(values,
                         sorted_order->view(),
diff --git a/cpp/src/sort/sort_column.cu b/cpp/src/sort/sort_column.cu
index 99a45bf91a3..212f4728c05 100644
--- a/cpp/src/sort/sort_column.cu
+++ b/cpp/src/sort/sort_column.cu
@@ -19,10 +19,9 @@
 
 #include <cudf/column/column_factories.hpp>
 #include <cudf/column/column_view.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
-#include <rmm/resource_ref.hpp>
-
 #include <thrust/sequence.h>
 
 namespace cudf {
diff --git a/cpp/src/sort/sort_column_impl.cuh b/cpp/src/sort/sort_column_impl.cuh
index 564791e0b49..906cfb23894 100644
--- a/cpp/src/sort/sort_column_impl.cuh
+++ b/cpp/src/sort/sort_column_impl.cuh
@@ -21,11 +21,11 @@
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/table/experimental/row_operators.cuh>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/sequence.h>
 #include <thrust/sort.h>
diff --git a/cpp/src/sort/sort_impl.cuh b/cpp/src/sort/sort_impl.cuh
index 20e977e9fd5..d5efebf26e2 100644
--- a/cpp/src/sort/sort_impl.cuh
+++ b/cpp/src/sort/sort_impl.cuh
@@ -20,8 +20,7 @@
 #include "sort_column_impl.cuh"
 
 #include <cudf/column/column_factories.hpp>
-
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace cudf {
 namespace detail {
diff --git a/cpp/src/sort/stable_segmented_sort.cu b/cpp/src/sort/stable_segmented_sort.cu
index 61e37205c98..e814386db66 100644
--- a/cpp/src/sort/stable_segmented_sort.cu
+++ b/cpp/src/sort/stable_segmented_sort.cu
@@ -20,8 +20,7 @@
 #include <cudf/detail/sorting.hpp>
 #include <cudf/sorting.hpp>
 #include <cudf/utilities/default_stream.hpp>
-
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace cudf {
 namespace detail {
diff --git a/cpp/src/sort/stable_sort.cu b/cpp/src/sort/stable_sort.cu
index ce05a755756..6ce4dfbead8 100644
--- a/cpp/src/sort/stable_sort.cu
+++ b/cpp/src/sort/stable_sort.cu
@@ -24,9 +24,9 @@
 #include <cudf/sorting.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace detail {
@@ -69,7 +69,7 @@ std::unique_ptr<table> stable_sort_by_key(table_view const& values,
                "Mismatch in number of rows for values and keys");
 
   auto sorted_order = detail::stable_sorted_order(
-    keys, column_order, null_precedence, stream, rmm::mr::get_current_device_resource());
+    keys, column_order, null_precedence, stream, cudf::get_current_device_resource_ref());
 
   return detail::gather(values,
                         sorted_order->view(),
diff --git a/cpp/src/sort/stable_sort_column.cu b/cpp/src/sort/stable_sort_column.cu
index bdb631a8154..e1aca9d9fe3 100644
--- a/cpp/src/sort/stable_sort_column.cu
+++ b/cpp/src/sort/stable_sort_column.cu
@@ -19,10 +19,9 @@
 
 #include <cudf/column/column_factories.hpp>
 #include <cudf/column/column_view.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
-#include <rmm/resource_ref.hpp>
-
 #include <thrust/sequence.h>
 
 namespace cudf {
diff --git a/cpp/src/stream_compaction/apply_boolean_mask.cu b/cpp/src/stream_compaction/apply_boolean_mask.cu
index 9812f4ffbd7..2c60687b92c 100644
--- a/cpp/src/stream_compaction/apply_boolean_mask.cu
+++ b/cpp/src/stream_compaction/apply_boolean_mask.cu
@@ -24,10 +24,10 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <algorithm>
 
diff --git a/cpp/src/stream_compaction/distinct.cu b/cpp/src/stream_compaction/distinct.cu
index 24e2692cb6f..7d11b02d3e1 100644
--- a/cpp/src/stream_compaction/distinct.cu
+++ b/cpp/src/stream_compaction/distinct.cu
@@ -26,11 +26,10 @@
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <utility>
 #include <vector>
@@ -134,7 +133,7 @@ std::unique_ptr<table> distinct(table_view const& input,
                                                    nulls_equal,
                                                    nans_equal,
                                                    stream,
-                                                   rmm::mr::get_current_device_resource());
+                                                   cudf::get_current_device_resource_ref());
   return detail::gather(input,
                         gather_map,
                         out_of_bounds_policy::DONT_CHECK,
diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu
index 78eb0fa5212..46a7f088298 100644
--- a/cpp/src/stream_compaction/distinct_count.cu
+++ b/cpp/src/stream_compaction/distinct_count.cu
@@ -30,6 +30,7 @@
 #include <cudf/table/experimental/row_operators.cuh>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -159,7 +160,7 @@ cudf::size_type distinct_count(table_view const& keys,
       // We must consider a row if any of its column entries is valid,
       // hence OR together the validities of the columns.
       auto const [row_bitmask, null_count] =
-        cudf::detail::bitmask_or(keys, stream, rmm::mr::get_current_device_resource());
+        cudf::detail::bitmask_or(keys, stream, cudf::get_current_device_resource_ref());
 
       // Unless all columns have a null mask, row_bitmask will be
       // null, and null_count will be zero. Equally, unless there is
diff --git a/cpp/src/stream_compaction/distinct_helpers.hpp b/cpp/src/stream_compaction/distinct_helpers.hpp
index bea02e3dbe8..f15807c2434 100644
--- a/cpp/src/stream_compaction/distinct_helpers.hpp
+++ b/cpp/src/stream_compaction/distinct_helpers.hpp
@@ -18,10 +18,10 @@
 #include <cudf/stream_compaction.hpp>
 #include <cudf/table/experimental/row_operators.cuh>
 #include <cudf/types.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuco/static_set.cuh>
 #include <cuda/functional>
diff --git a/cpp/src/stream_compaction/drop_nans.cu b/cpp/src/stream_compaction/drop_nans.cu
index b98ebbc2ecc..8a53a2e8360 100644
--- a/cpp/src/stream_compaction/drop_nans.cu
+++ b/cpp/src/stream_compaction/drop_nans.cu
@@ -22,10 +22,10 @@
 #include <cudf/table/table_device_view.cuh>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/count.h>
 #include <thrust/execution_policy.h>
diff --git a/cpp/src/stream_compaction/drop_nulls.cu b/cpp/src/stream_compaction/drop_nulls.cu
index 2497e4e5065..22da762a0dd 100644
--- a/cpp/src/stream_compaction/drop_nulls.cu
+++ b/cpp/src/stream_compaction/drop_nulls.cu
@@ -22,9 +22,9 @@
 #include <cudf/table/table_device_view.cuh>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/count.h>
 #include <thrust/execution_policy.h>
diff --git a/cpp/src/stream_compaction/stable_distinct.cu b/cpp/src/stream_compaction/stable_distinct.cu
index 074d4fd7d1a..2097b7bd3d2 100644
--- a/cpp/src/stream_compaction/stable_distinct.cu
+++ b/cpp/src/stream_compaction/stable_distinct.cu
@@ -19,10 +19,9 @@
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
-#include <rmm/resource_ref.hpp>
-
 #include <thrust/iterator/constant_iterator.h>
 #include <thrust/scatter.h>
 #include <thrust/uninitialized_fill.h>
@@ -47,7 +46,7 @@ std::unique_ptr<table> stable_distinct(table_view const& input,
                                                          nulls_equal,
                                                          nans_equal,
                                                          stream,
-                                                         rmm::mr::get_current_device_resource());
+                                                         cudf::get_current_device_resource_ref());
 
   // The only difference between this implementation and the unstable version
   // is that the stable implementation must retain the input order. The
diff --git a/cpp/src/stream_compaction/unique.cu b/cpp/src/stream_compaction/unique.cu
index 93de0e60b6d..eaabc6f1272 100644
--- a/cpp/src/stream_compaction/unique.cu
+++ b/cpp/src/stream_compaction/unique.cu
@@ -31,11 +31,11 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/copy.h>
 #include <thrust/distance.h>
diff --git a/cpp/src/strings/attributes.cu b/cpp/src/strings/attributes.cu
index 778f546990d..c56d25fde2b 100644
--- a/cpp/src/strings/attributes.cu
+++ b/cpp/src/strings/attributes.cu
@@ -26,11 +26,11 @@
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cub/warp/warp_reduce.cuh>
 #include <cuda/functional>
diff --git a/cpp/src/strings/capitalize.cu b/cpp/src/strings/capitalize.cu
index 3f7a98381b8..45e80cc780d 100644
--- a/cpp/src/strings/capitalize.cu
+++ b/cpp/src/strings/capitalize.cu
@@ -25,9 +25,9 @@
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/pair.h>
diff --git a/cpp/src/strings/case.cu b/cpp/src/strings/case.cu
index 27befdea209..4c015f3cbed 100644
--- a/cpp/src/strings/case.cu
+++ b/cpp/src/strings/case.cu
@@ -29,10 +29,10 @@
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cub/cub.cuh>
 #include <cuda/atomic>
diff --git a/cpp/src/strings/char_types/char_types.cu b/cpp/src/strings/char_types/char_types.cu
index 58137aced0f..c3b4938da1a 100644
--- a/cpp/src/strings/char_types/char_types.cu
+++ b/cpp/src/strings/char_types/char_types.cu
@@ -27,9 +27,9 @@
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/transform.h>
diff --git a/cpp/src/strings/combine/concatenate.cu b/cpp/src/strings/combine/concatenate.cu
index a2c77c5e77f..617ff41a043 100644
--- a/cpp/src/strings/combine/concatenate.cu
+++ b/cpp/src/strings/combine/concatenate.cu
@@ -29,11 +29,11 @@
 #include <cudf/table/table_device_view.cuh>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/execution_policy.h>
 #include <thrust/iterator/counting_iterator.h>
diff --git a/cpp/src/strings/combine/join.cu b/cpp/src/strings/combine/join.cu
index b534e9b2e5b..07e659e380e 100644
--- a/cpp/src/strings/combine/join.cu
+++ b/cpp/src/strings/combine/join.cu
@@ -29,11 +29,11 @@
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/for_each.h>
 #include <thrust/iterator/counting_iterator.h>
diff --git a/cpp/src/strings/combine/join_list_elements.cu b/cpp/src/strings/combine/join_list_elements.cu
index f5dfc1a2012..663dc9dda73 100644
--- a/cpp/src/strings/combine/join_list_elements.cu
+++ b/cpp/src/strings/combine/join_list_elements.cu
@@ -27,10 +27,10 @@
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 
diff --git a/cpp/src/strings/contains.cu b/cpp/src/strings/contains.cu
index 79d241205df..67531fea579 100644
--- a/cpp/src/strings/contains.cu
+++ b/cpp/src/strings/contains.cu
@@ -27,9 +27,9 @@
 #include <cudf/strings/detail/utilities.hpp>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace strings {
diff --git a/cpp/src/strings/convert/convert_booleans.cu b/cpp/src/strings/convert/convert_booleans.cu
index d4ccb685061..3ba17fdb872 100644
--- a/cpp/src/strings/convert/convert_booleans.cu
+++ b/cpp/src/strings/convert/convert_booleans.cu
@@ -24,10 +24,10 @@
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/transform.h>
diff --git a/cpp/src/strings/convert/convert_datetime.cu b/cpp/src/strings/convert/convert_datetime.cu
index 99c40f00b00..4c9eba5b526 100644
--- a/cpp/src/strings/convert/convert_datetime.cu
+++ b/cpp/src/strings/convert/convert_datetime.cu
@@ -28,13 +28,13 @@
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 #include <cudf/wrappers/timestamps.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/std/optional>
 #include <thrust/execution_policy.h>
@@ -161,7 +161,7 @@ struct format_compiler {
 
     // copy format_items to device memory
     d_items = cudf::detail::make_device_uvector_async(
-      items, stream, rmm::mr::get_current_device_resource());
+      items, stream, cudf::get_current_device_resource_ref());
   }
 
   device_span<format_item const> format_items() { return device_span<format_item const>(d_items); }
diff --git a/cpp/src/strings/convert/convert_durations.cu b/cpp/src/strings/convert/convert_durations.cu
index 514ab965fc5..0db1adf1223 100644
--- a/cpp/src/strings/convert/convert_durations.cu
+++ b/cpp/src/strings/convert/convert_durations.cu
@@ -21,10 +21,10 @@
 #include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/execution_policy.h>
 #include <thrust/functional.h>
diff --git a/cpp/src/strings/convert/convert_fixed_point.cu b/cpp/src/strings/convert/convert_fixed_point.cu
index 73089ad407e..9848c1f605e 100644
--- a/cpp/src/strings/convert/convert_fixed_point.cu
+++ b/cpp/src/strings/convert/convert_fixed_point.cu
@@ -27,11 +27,11 @@
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/std/climits>
 #include <cuda/std/limits>
diff --git a/cpp/src/strings/convert/convert_floats.cu b/cpp/src/strings/convert/convert_floats.cu
index bd7b411d3c3..d3d90104252 100644
--- a/cpp/src/strings/convert/convert_floats.cu
+++ b/cpp/src/strings/convert/convert_floats.cu
@@ -25,12 +25,12 @@
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/transform.h>
diff --git a/cpp/src/strings/convert/convert_hex.cu b/cpp/src/strings/convert/convert_hex.cu
index a34b148a951..fce83e87645 100644
--- a/cpp/src/strings/convert/convert_hex.cu
+++ b/cpp/src/strings/convert/convert_hex.cu
@@ -24,12 +24,12 @@
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/distance.h>
 #include <thrust/execution_policy.h>
diff --git a/cpp/src/strings/convert/convert_integers.cu b/cpp/src/strings/convert/convert_integers.cu
index aeabc71d300..b4eead05ce5 100644
--- a/cpp/src/strings/convert/convert_integers.cu
+++ b/cpp/src/strings/convert/convert_integers.cu
@@ -27,11 +27,11 @@
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/execution_policy.h>
 #include <thrust/iterator/counting_iterator.h>
diff --git a/cpp/src/strings/convert/convert_ipv4.cu b/cpp/src/strings/convert/convert_ipv4.cu
index 13d6e9bc3ba..c0c890341ae 100644
--- a/cpp/src/strings/convert/convert_ipv4.cu
+++ b/cpp/src/strings/convert/convert_ipv4.cu
@@ -24,9 +24,9 @@
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/transform.h>
diff --git a/cpp/src/strings/convert/convert_lists.cu b/cpp/src/strings/convert/convert_lists.cu
index 604f928430b..f574f091ab5 100644
--- a/cpp/src/strings/convert/convert_lists.cu
+++ b/cpp/src/strings/convert/convert_lists.cu
@@ -21,9 +21,9 @@
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace strings {
diff --git a/cpp/src/strings/convert/convert_urls.cu b/cpp/src/strings/convert/convert_urls.cu
index 39907a38f2f..520f5897415 100644
--- a/cpp/src/strings/convert/convert_urls.cu
+++ b/cpp/src/strings/convert/convert_urls.cu
@@ -28,10 +28,10 @@
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cub/cub.cuh>
 
diff --git a/cpp/src/strings/copying/concatenate.cu b/cpp/src/strings/copying/concatenate.cu
index 352e0f9f41a..1d9d12686eb 100644
--- a/cpp/src/strings/copying/concatenate.cu
+++ b/cpp/src/strings/copying/concatenate.cu
@@ -24,11 +24,11 @@
 #include <cudf/strings/detail/utilities.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/table/table_device_view.cuh>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/advance.h>
 #include <thrust/binary_search.h>
@@ -87,7 +87,7 @@ auto create_strings_device_views(host_span<column_view const> views, rmm::cuda_s
     });
   thrust::inclusive_scan(thrust::host, offset_it, input_offsets.end(), offset_it);
   auto d_input_offsets = cudf::detail::make_device_uvector_async(
-    input_offsets, stream, rmm::mr::get_current_device_resource());
+    input_offsets, stream, cudf::get_current_device_resource_ref());
   auto const output_size = input_offsets.back();
 
   // Compute the partition offsets and size of chars column
diff --git a/cpp/src/strings/copying/copy_range.cu b/cpp/src/strings/copying/copy_range.cu
index 2434de1795e..90865a4b73e 100644
--- a/cpp/src/strings/copying/copy_range.cu
+++ b/cpp/src/strings/copying/copy_range.cu
@@ -22,10 +22,10 @@
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/for_each.h>
 #include <thrust/iterator/counting_iterator.h>
diff --git a/cpp/src/strings/copying/copying.cu b/cpp/src/strings/copying/copying.cu
index e8b411d50a6..f923f99c131 100644
--- a/cpp/src/strings/copying/copying.cu
+++ b/cpp/src/strings/copying/copying.cu
@@ -21,11 +21,11 @@
 #include <cudf/strings/detail/copying.hpp>
 #include <cudf/strings/detail/utilities.hpp>
 #include <cudf/strings/strings_column_view.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/transform.h>
diff --git a/cpp/src/strings/copying/shift.cu b/cpp/src/strings/copying/shift.cu
index b386c0860d1..e36d5f9f14e 100644
--- a/cpp/src/strings/copying/shift.cu
+++ b/cpp/src/strings/copying/shift.cu
@@ -22,10 +22,10 @@
 #include <cudf/strings/detail/copying.hpp>
 #include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/detail/utilities.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/transform.h>
diff --git a/cpp/src/strings/count_matches.cu b/cpp/src/strings/count_matches.cu
index 4ad3a75baf7..ae4e623a9e8 100644
--- a/cpp/src/strings/count_matches.cu
+++ b/cpp/src/strings/count_matches.cu
@@ -20,8 +20,7 @@
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
 #include <cudf/strings/string_view.cuh>
-
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 namespace cudf {
 namespace strings {
diff --git a/cpp/src/strings/count_matches.hpp b/cpp/src/strings/count_matches.hpp
index eab9863b975..f46168a3389 100644
--- a/cpp/src/strings/count_matches.hpp
+++ b/cpp/src/strings/count_matches.hpp
@@ -17,9 +17,9 @@
 #pragma once
 
 #include <cudf/column/column.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 
diff --git a/cpp/src/strings/extract/extract.cu b/cpp/src/strings/extract/extract.cu
index b18b50d1b43..7323918dcff 100644
--- a/cpp/src/strings/extract/extract.cu
+++ b/cpp/src/strings/extract/extract.cu
@@ -26,10 +26,10 @@
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/execution_policy.h>
diff --git a/cpp/src/strings/extract/extract_all.cu b/cpp/src/strings/extract/extract_all.cu
index 897eba58833..a9fbb375e37 100644
--- a/cpp/src/strings/extract/extract_all.cu
+++ b/cpp/src/strings/extract/extract_all.cu
@@ -27,10 +27,10 @@
 #include <cudf/strings/extract.hpp>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/functional.h>
 #include <thrust/transform_scan.h>
diff --git a/cpp/src/strings/filling/fill.cu b/cpp/src/strings/filling/fill.cu
index 878d0fe11ba..6a2da3542c7 100644
--- a/cpp/src/strings/filling/fill.cu
+++ b/cpp/src/strings/filling/fill.cu
@@ -20,9 +20,9 @@
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/transform.h>
diff --git a/cpp/src/strings/filter_chars.cu b/cpp/src/strings/filter_chars.cu
index 48620af8cad..3e8b5e2af57 100644
--- a/cpp/src/strings/filter_chars.cu
+++ b/cpp/src/strings/filter_chars.cu
@@ -28,10 +28,10 @@
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/strings/translate.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/execution_policy.h>
 #include <thrust/find.h>
@@ -134,8 +134,8 @@ std::unique_ptr<column> filter_characters(
     characters_to_filter.begin(), characters_to_filter.end(), htable.begin(), [](auto entry) {
       return char_range{entry.first, entry.second};
     });
-  rmm::device_uvector<char_range> table =
-    cudf::detail::make_device_uvector_async(htable, stream, rmm::mr::get_current_device_resource());
+  rmm::device_uvector<char_range> table = cudf::detail::make_device_uvector_async(
+    htable, stream, cudf::get_current_device_resource_ref());
 
   auto d_strings = column_device_view::create(strings.parent(), stream);
 
diff --git a/cpp/src/strings/like.cu b/cpp/src/strings/like.cu
index 4df1b9b4ffe..f8db66f998b 100644
--- a/cpp/src/strings/like.cu
+++ b/cpp/src/strings/like.cu
@@ -21,10 +21,10 @@
 #include <cudf/strings/contains.hpp>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/constant_iterator.h>
 #include <thrust/iterator/counting_iterator.h>
diff --git a/cpp/src/strings/padding.cu b/cpp/src/strings/padding.cu
index 0d146108436..fb2ce9a251a 100644
--- a/cpp/src/strings/padding.cu
+++ b/cpp/src/strings/padding.cu
@@ -24,9 +24,9 @@
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace strings {
diff --git a/cpp/src/strings/regex/utilities.cuh b/cpp/src/strings/regex/utilities.cuh
index afbfe9de049..679907788bb 100644
--- a/cpp/src/strings/regex/utilities.cuh
+++ b/cpp/src/strings/regex/utilities.cuh
@@ -24,10 +24,10 @@
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/strings/detail/strings_children.cuh>
 #include <cudf/strings/detail/utilities.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/scan.h>
 
diff --git a/cpp/src/strings/repeat_strings.cu b/cpp/src/strings/repeat_strings.cu
index 022f1eb3232..eae4839b3e4 100644
--- a/cpp/src/strings/repeat_strings.cu
+++ b/cpp/src/strings/repeat_strings.cu
@@ -27,9 +27,9 @@
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/for_each.h>
 #include <thrust/functional.h>
diff --git a/cpp/src/strings/replace/backref_re.cu b/cpp/src/strings/replace/backref_re.cu
index 86afe4c8b9b..a46b5ebad4f 100644
--- a/cpp/src/strings/replace/backref_re.cu
+++ b/cpp/src/strings/replace/backref_re.cu
@@ -28,9 +28,9 @@
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <regex>
 
@@ -120,7 +120,7 @@ std::unique_ptr<column> replace_with_backrefs(strings_column_view const& input,
   auto group_count = std::min(99, d_prog->group_counts());  // group count should NOT exceed 99
   auto const parse_result                    = parse_backrefs(replacement, group_count);
   rmm::device_uvector<backref_type> backrefs = cudf::detail::make_device_uvector_async(
-    parse_result.second, stream, rmm::mr::get_current_device_resource());
+    parse_result.second, stream, cudf::get_current_device_resource_ref());
   string_scalar repl_scalar(parse_result.first, true, stream);
   string_view const d_repl_template = repl_scalar.value(stream);
 
diff --git a/cpp/src/strings/replace/find_replace.cu b/cpp/src/strings/replace/find_replace.cu
index 79bf6e3c910..8a8001dd81a 100644
--- a/cpp/src/strings/replace/find_replace.cu
+++ b/cpp/src/strings/replace/find_replace.cu
@@ -18,10 +18,10 @@
 #include <cudf/strings/detail/strings_column_factories.cuh>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_scalar.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/execution_policy.h>
 #include <thrust/transform.h>
diff --git a/cpp/src/strings/replace/multi.cu b/cpp/src/strings/replace/multi.cu
index b5248700d53..352d883bdc5 100644
--- a/cpp/src/strings/replace/multi.cu
+++ b/cpp/src/strings/replace/multi.cu
@@ -30,10 +30,10 @@
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/copy.h>
@@ -321,9 +321,9 @@ std::unique_ptr<column> replace_character_parallel(strings_column_view const& in
     get_offset_value(input.offsets(), input.offset(), stream);
 
   auto d_targets =
-    create_string_vector_from_column(targets, stream, rmm::mr::get_current_device_resource());
+    create_string_vector_from_column(targets, stream, cudf::get_current_device_resource_ref());
   auto d_replacements =
-    create_string_vector_from_column(repls, stream, rmm::mr::get_current_device_resource());
+    create_string_vector_from_column(repls, stream, cudf::get_current_device_resource_ref());
 
   replace_multi_parallel_fn fn{
     *d_strings,
@@ -361,7 +361,7 @@ std::unique_ptr<column> replace_character_parallel(strings_column_view const& in
 
   // create a vector of offsets to each string's set of target positions
   auto const targets_offsets = create_offsets_from_positions(
-    input, targets_positions, stream, rmm::mr::get_current_device_resource());
+    input, targets_positions, stream, cudf::get_current_device_resource_ref());
   auto const d_targets_offsets =
     cudf::detail::offsetalator_factory::make_input_iterator(targets_offsets->view());
 
diff --git a/cpp/src/strings/replace/multi_re.cu b/cpp/src/strings/replace/multi_re.cu
index 0ad3ab2305c..0777253bb38 100644
--- a/cpp/src/strings/replace/multi_re.cu
+++ b/cpp/src/strings/replace/multi_re.cu
@@ -29,9 +29,9 @@
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/execution_policy.h>
 #include <thrust/fill.h>
@@ -180,7 +180,7 @@ std::unique_ptr<column> replace_re(strings_column_view const& input,
                    return *prog;
                  });
   auto d_progs =
-    cudf::detail::make_device_uvector_async(progs, stream, rmm::mr::get_current_device_resource());
+    cudf::detail::make_device_uvector_async(progs, stream, cudf::get_current_device_resource_ref());
 
   auto const d_strings = column_device_view::create(input.parent(), stream);
   auto const d_repls   = column_device_view::create(replacements.parent(), stream);
diff --git a/cpp/src/strings/replace/replace.cu b/cpp/src/strings/replace/replace.cu
index f7a3a3aea5c..16df0dbabdf 100644
--- a/cpp/src/strings/replace/replace.cu
+++ b/cpp/src/strings/replace/replace.cu
@@ -29,10 +29,10 @@
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/binary_search.h>
@@ -312,7 +312,7 @@ std::unique_ptr<column> replace_character_parallel(strings_column_view const& in
 
   // create a vector of offsets to each string's set of target positions
   auto const targets_offsets = create_offsets_from_positions(
-    input, targets_positions, stream, rmm::mr::get_current_device_resource());
+    input, targets_positions, stream, cudf::get_current_device_resource_ref());
   auto const d_targets_offsets =
     cudf::detail::offsetalator_factory::make_input_iterator(targets_offsets->view());
 
diff --git a/cpp/src/strings/replace/replace_nulls.cu b/cpp/src/strings/replace/replace_nulls.cu
index ffd9e6c2553..ff86501f02c 100644
--- a/cpp/src/strings/replace/replace_nulls.cu
+++ b/cpp/src/strings/replace/replace_nulls.cu
@@ -25,10 +25,10 @@
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/for_each.h>
diff --git a/cpp/src/strings/replace/replace_re.cu b/cpp/src/strings/replace/replace_re.cu
index fd988855424..19d660e312e 100644
--- a/cpp/src/strings/replace/replace_re.cu
+++ b/cpp/src/strings/replace/replace_re.cu
@@ -27,9 +27,9 @@
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace strings {
diff --git a/cpp/src/strings/replace/replace_slice.cu b/cpp/src/strings/replace/replace_slice.cu
index 04d81218a16..938e3c0270b 100644
--- a/cpp/src/strings/replace/replace_slice.cu
+++ b/cpp/src/strings/replace/replace_slice.cu
@@ -25,9 +25,9 @@
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 
diff --git a/cpp/src/strings/reverse.cu b/cpp/src/strings/reverse.cu
index cbd231bc5f3..a207215523d 100644
--- a/cpp/src/strings/reverse.cu
+++ b/cpp/src/strings/reverse.cu
@@ -24,10 +24,10 @@
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace strings {
diff --git a/cpp/src/strings/scan/scan_inclusive.cu b/cpp/src/strings/scan/scan_inclusive.cu
index b3e45f65a21..84cc87bad3e 100644
--- a/cpp/src/strings/scan/scan_inclusive.cu
+++ b/cpp/src/strings/scan/scan_inclusive.cu
@@ -20,11 +20,11 @@
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/utilities/device_operators.cuh>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/scan.h>
diff --git a/cpp/src/strings/search/find.cu b/cpp/src/strings/search/find.cu
index 45eba39f413..9bd1abb5542 100644
--- a/cpp/src/strings/search/find.cu
+++ b/cpp/src/strings/search/find.cu
@@ -27,10 +27,10 @@
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/atomic>
 #include <thrust/binary_search.h>
diff --git a/cpp/src/strings/search/find_multiple.cu b/cpp/src/strings/search/find_multiple.cu
index 223a941a88a..ec7015878dd 100644
--- a/cpp/src/strings/search/find_multiple.cu
+++ b/cpp/src/strings/search/find_multiple.cu
@@ -24,10 +24,10 @@
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/transform.h>
diff --git a/cpp/src/strings/search/findall.cu b/cpp/src/strings/search/findall.cu
index 2f7e7352458..067a513af96 100644
--- a/cpp/src/strings/search/findall.cu
+++ b/cpp/src/strings/search/findall.cu
@@ -28,10 +28,10 @@
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/pair.h>
 
diff --git a/cpp/src/strings/slice.cu b/cpp/src/strings/slice.cu
index d8324a9b08e..978a844c476 100644
--- a/cpp/src/strings/slice.cu
+++ b/cpp/src/strings/slice.cu
@@ -28,10 +28,10 @@
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cooperative_groups.h>
 #include <cooperative_groups/reduce.h>
diff --git a/cpp/src/strings/split/partition.cu b/cpp/src/strings/split/partition.cu
index 93d55c494fe..df1cdcc9d79 100644
--- a/cpp/src/strings/split/partition.cu
+++ b/cpp/src/strings/split/partition.cu
@@ -24,10 +24,10 @@
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/for_each.h>
 #include <thrust/iterator/counting_iterator.h>
diff --git a/cpp/src/strings/split/split.cu b/cpp/src/strings/split/split.cu
index bc01a46ca6d..352ca83c8b2 100644
--- a/cpp/src/strings/split/split.cu
+++ b/cpp/src/strings/split/split.cu
@@ -28,10 +28,10 @@
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/binary_search.h>
diff --git a/cpp/src/strings/split/split.cuh b/cpp/src/strings/split/split.cuh
index af70367678e..81aca001d53 100644
--- a/cpp/src/strings/split/split.cuh
+++ b/cpp/src/strings/split/split.cuh
@@ -24,10 +24,10 @@
 #include <cudf/strings/detail/strings_column_factories.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/atomic>
 #include <thrust/copy.h>
diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu
index d273c93ec12..ef96b9d3f36 100644
--- a/cpp/src/strings/split/split_re.cu
+++ b/cpp/src/strings/split/split_re.cu
@@ -27,9 +27,9 @@
 #include <cudf/strings/split/split_re.hpp>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/distance.h>
 #include <thrust/functional.h>
@@ -152,7 +152,7 @@ std::pair<rmm::device_uvector<string_index_pair>, std::unique_ptr<column>> gener
   auto const end   = begin + strings_count;
 
   auto [offsets, total_tokens] = cudf::detail::make_offsets_child_column(
-    begin, end, stream, rmm::mr::get_current_device_resource());
+    begin, end, stream, cudf::get_current_device_resource_ref());
   auto const d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(offsets->view());
 
   // build a vector of tokens
@@ -211,7 +211,7 @@ std::unique_ptr<table> split_re(strings_column_view const& input,
 
   // count the number of delimiters matched in each string
   auto const counts =
-    count_matches(*d_strings, *d_prog, stream, rmm::mr::get_current_device_resource());
+    count_matches(*d_strings, *d_prog, stream, cudf::get_current_device_resource_ref());
 
   // get the split tokens from the input column; this also converts the counts into offsets
   auto [tokens, offsets] =
diff --git a/cpp/src/strings/split/split_record.cu b/cpp/src/strings/split/split_record.cu
index 3e8be750b9e..6f14462faf1 100644
--- a/cpp/src/strings/split/split_record.cu
+++ b/cpp/src/strings/split/split_record.cu
@@ -27,9 +27,9 @@
 #include <cudf/strings/split/split.hpp>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/for_each.h>
diff --git a/cpp/src/strings/strings_column_factories.cu b/cpp/src/strings/strings_column_factories.cu
index a298285f841..07516f91dcf 100644
--- a/cpp/src/strings/strings_column_factories.cu
+++ b/cpp/src/strings/strings_column_factories.cu
@@ -21,11 +21,11 @@
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/transform_iterator.h>
 #include <thrust/pair.h>
diff --git a/cpp/src/strings/strings_scalar_factories.cpp b/cpp/src/strings/strings_scalar_factories.cpp
index cf973638cc4..219d1174d42 100644
--- a/cpp/src/strings/strings_scalar_factories.cpp
+++ b/cpp/src/strings/strings_scalar_factories.cpp
@@ -16,9 +16,9 @@
 
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 // Create a strings-type column from array of pointer/size pairs
diff --git a/cpp/src/strings/strip.cu b/cpp/src/strings/strip.cu
index 639097abe63..0dc4c038a02 100644
--- a/cpp/src/strings/strip.cu
+++ b/cpp/src/strings/strip.cu
@@ -23,10 +23,10 @@
 #include <cudf/strings/strip.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace strings {
diff --git a/cpp/src/strings/translate.cu b/cpp/src/strings/translate.cu
index a242b008a54..22ab5d4fe81 100644
--- a/cpp/src/strings/translate.cu
+++ b/cpp/src/strings/translate.cu
@@ -25,10 +25,10 @@
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/strings/translate.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/binary_search.h>
 #include <thrust/execution_policy.h>
@@ -107,8 +107,8 @@ std::unique_ptr<column> translate(strings_column_view const& strings,
     return lhs.first < rhs.first;
   });
   // copy translate table to device memory
-  rmm::device_uvector<translate_table> table =
-    cudf::detail::make_device_uvector_async(htable, stream, rmm::mr::get_current_device_resource());
+  rmm::device_uvector<translate_table> table = cudf::detail::make_device_uvector_async(
+    htable, stream, cudf::get_current_device_resource_ref());
 
   auto d_strings = column_device_view::create(strings.parent(), stream);
 
diff --git a/cpp/src/strings/utilities.cu b/cpp/src/strings/utilities.cu
index 068d89a52dc..45bd4615435 100644
--- a/cpp/src/strings/utilities.cu
+++ b/cpp/src/strings/utilities.cu
@@ -26,11 +26,11 @@
 #include <cudf/strings/detail/utilities.hpp>
 #include <cudf/strings/utilities.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/transform.h>
diff --git a/cpp/src/strings/wrap.cu b/cpp/src/strings/wrap.cu
index dff1891c3cc..38a18aff98d 100644
--- a/cpp/src/strings/wrap.cu
+++ b/cpp/src/strings/wrap.cu
@@ -25,11 +25,11 @@
 #include <cudf/strings/wrap.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/for_each.h>
 #include <thrust/iterator/counting_iterator.h>
diff --git a/cpp/src/structs/copying/concatenate.cu b/cpp/src/structs/copying/concatenate.cu
index 2ccf071711a..2120b4f08c4 100644
--- a/cpp/src/structs/copying/concatenate.cu
+++ b/cpp/src/structs/copying/concatenate.cu
@@ -23,9 +23,9 @@
 #include <cudf/detail/get_value.cuh>
 #include <cudf/detail/structs/utilities.hpp>
 #include <cudf/structs/structs_column_view.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <algorithm>
 #include <memory>
diff --git a/cpp/src/structs/scan/scan_inclusive.cu b/cpp/src/structs/scan/scan_inclusive.cu
index a6ccea5fca1..28756b25c89 100644
--- a/cpp/src/structs/scan/scan_inclusive.cu
+++ b/cpp/src/structs/scan/scan_inclusive.cu
@@ -20,11 +20,11 @@
 #include <cudf/detail/gather.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/utilities/device_operators.cuh>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/scan.h>
diff --git a/cpp/src/structs/structs_column_factories.cu b/cpp/src/structs/structs_column_factories.cu
index bbe2bb96fde..86b30d0ccbd 100644
--- a/cpp/src/structs/structs_column_factories.cu
+++ b/cpp/src/structs/structs_column_factories.cu
@@ -17,9 +17,9 @@
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/structs/utilities.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 
diff --git a/cpp/src/structs/utilities.cpp b/cpp/src/structs/utilities.cpp
index 81806c92e23..5df9943303d 100644
--- a/cpp/src/structs/utilities.cpp
+++ b/cpp/src/structs/utilities.cpp
@@ -25,11 +25,10 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 #include <cudf/utilities/traits.hpp>
 
-#include <rmm/resource_ref.hpp>
-
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
 
diff --git a/cpp/src/table/row_operators.cu b/cpp/src/table/row_operators.cu
index 2969557c78f..990c4855a14 100644
--- a/cpp/src/table/row_operators.cu
+++ b/cpp/src/table/row_operators.cu
@@ -27,12 +27,10 @@
 #include <cudf/lists/lists_column_view.hpp>
 #include <cudf/table/experimental/row_operators.cuh>
 #include <cudf/table/table_view.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/type_checks.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
-
 #include <thrust/iterator/transform_iterator.h>
 
 namespace cudf {
@@ -319,7 +317,7 @@ auto list_lex_preprocess(table_view const& table, rmm::cuda_stream_view stream)
     }
   }
   auto d_dremel_device_views = detail::make_device_uvector_sync(
-    dremel_device_views, stream, rmm::mr::get_current_device_resource());
+    dremel_device_views, stream, cudf::get_current_device_resource_ref());
   return std::make_tuple(std::move(dremel_data), std::move(d_dremel_device_views));
 }
 
@@ -588,12 +586,12 @@ transform_lists_of_structs(column_view const& lhs,
       auto const concatenated_children =
         cudf::detail::concatenate(std::vector<column_view>{child_lhs, child_rhs},
                                   stream,
-                                  rmm::mr::get_current_device_resource());
+                                  cudf::get_current_device_resource_ref());
 
       auto const ranks        = compute_ranks(concatenated_children->view(),
                                        column_null_order,
                                        stream,
-                                       rmm::mr::get_current_device_resource());
+                                       cudf::get_current_device_resource_ref());
       auto const ranks_slices = cudf::detail::slice(
         ranks->view(),
         {0, child_lhs.size(), child_lhs.size(), child_lhs.size() + child_rhs.size()},
@@ -647,13 +645,13 @@ std::shared_ptr<preprocessed_table> preprocessed_table::create(
 {
   check_lex_compatibility(preprocessed_input);
 
-  auto d_table = table_device_view::create(preprocessed_input, stream);
-  auto d_column_order =
-    detail::make_device_uvector_async(column_order, stream, rmm::mr::get_current_device_resource());
+  auto d_table        = table_device_view::create(preprocessed_input, stream);
+  auto d_column_order = detail::make_device_uvector_async(
+    column_order, stream, cudf::get_current_device_resource_ref());
   auto d_null_precedence = detail::make_device_uvector_async(
-    null_precedence, stream, rmm::mr::get_current_device_resource());
+    null_precedence, stream, cudf::get_current_device_resource_ref());
   auto d_depths = detail::make_device_uvector_async(
-    verticalized_col_depths, stream, rmm::mr::get_current_device_resource());
+    verticalized_col_depths, stream, cudf::get_current_device_resource_ref());
 
   if (detail::has_nested_columns(preprocessed_input)) {
     auto [dremel_data, d_dremel_device_view] = list_lex_preprocess(preprocessed_input, stream);
@@ -699,7 +697,7 @@ std::shared_ptr<preprocessed_table> preprocessed_table::create(
           lhs_col,
           null_precedence.empty() ? null_order::BEFORE : new_null_precedence[col_idx],
           stream,
-          rmm::mr::get_current_device_resource());
+          cudf::get_current_device_resource_ref());
 
         transformed_cvs.emplace_back(std::move(transformed));
         transformed_columns.insert(transformed_columns.end(),
@@ -761,7 +759,7 @@ preprocessed_table::create(table_view const& lhs,
           rhs_col,
           null_precedence.empty() ? null_order::BEFORE : null_precedence[col_idx],
           stream,
-          rmm::mr::get_current_device_resource());
+          cudf::get_current_device_resource_ref());
 
       transformed_lhs_cvs.emplace_back(std::move(transformed_lhs));
       transformed_rhs_cvs.emplace_back(std::move(transformed_rhs));
@@ -854,7 +852,7 @@ std::shared_ptr<preprocessed_table> preprocessed_table::create(table_view const&
   check_eq_compatibility(t);
 
   auto [null_pushed_table, nullable_data] =
-    structs::detail::push_down_nulls(t, stream, rmm::mr::get_current_device_resource());
+    structs::detail::push_down_nulls(t, stream, cudf::get_current_device_resource_ref());
   auto struct_offset_removed_table = remove_struct_child_offsets(null_pushed_table);
   auto verticalized_t =
     std::get<0>(decompose_structs(struct_offset_removed_table, decompose_lists_column::YES));
diff --git a/cpp/src/table/table.cpp b/cpp/src/table/table.cpp
index 9dac7be5efe..cb707c94288 100644
--- a/cpp/src/table/table.cpp
+++ b/cpp/src/table/table.cpp
@@ -18,9 +18,9 @@
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 
diff --git a/cpp/src/text/bpe/byte_pair_encoding.cu b/cpp/src/text/bpe/byte_pair_encoding.cu
index e196eee275f..f46f49ddc0e 100644
--- a/cpp/src/text/bpe/byte_pair_encoding.cu
+++ b/cpp/src/text/bpe/byte_pair_encoding.cu
@@ -29,12 +29,12 @@
 #include <cudf/strings/detail/utilities.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <nvtext/byte_pair_encoding.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/copy.h>
 #include <thrust/distance.h>
diff --git a/cpp/src/text/bpe/load_merge_pairs.cu b/cpp/src/text/bpe/load_merge_pairs.cu
index 9fb86aecce3..cd68566bdec 100644
--- a/cpp/src/text/bpe/load_merge_pairs.cu
+++ b/cpp/src/text/bpe/load_merge_pairs.cu
@@ -23,12 +23,12 @@
 #include <cudf/strings/split/split.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <nvtext/byte_pair_encoding.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 
diff --git a/cpp/src/text/detokenize.cu b/cpp/src/text/detokenize.cu
index 6635b61093e..15cb53c7c28 100644
--- a/cpp/src/text/detokenize.cu
+++ b/cpp/src/text/detokenize.cu
@@ -27,12 +27,12 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <nvtext/tokenize.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/copy.h>
 #include <thrust/count.h>
@@ -148,7 +148,7 @@ std::unique_ptr<cudf::column> detokenize(cudf::strings_column_view const& string
   auto strings_column = cudf::column_device_view::create(strings.parent(), stream);
   // the indices may not be in order so we need to build a sorted map
   auto sorted_rows = cudf::detail::stable_sorted_order(
-    cudf::table_view({row_indices}), {}, {}, stream, rmm::mr::get_current_device_resource());
+    cudf::table_view({row_indices}), {}, {}, stream, cudf::get_current_device_resource_ref());
   auto const d_row_map = sorted_rows->view().data<cudf::size_type>();
 
   // create offsets for the tokens for each output string
diff --git a/cpp/src/text/edit_distance.cu b/cpp/src/text/edit_distance.cu
index 8d857175407..b04e9961e01 100644
--- a/cpp/src/text/edit_distance.cu
+++ b/cpp/src/text/edit_distance.cu
@@ -22,13 +22,13 @@
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <nvtext/edit_distance.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/execution_policy.h>
 #include <thrust/for_each.h>
diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu
index 6f700f84ec4..a87ecb81b9d 100644
--- a/cpp/src/text/generate_ngrams.cu
+++ b/cpp/src/text/generate_ngrams.cu
@@ -29,12 +29,12 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <nvtext/detail/generate_ngrams.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cooperative_groups.h>
 #include <cooperative_groups/reduce.h>
@@ -122,7 +122,7 @@ std::unique_ptr<cudf::column> generate_ngrams(cudf::strings_column_view const& s
                              return !d_strings.element<cudf::string_view>(idx).empty();
                            },
                            stream,
-                           rmm::mr::get_current_device_resource())
+                           cudf::get_current_device_resource_ref())
                            ->release();
     strings_count = table_offsets.front()->size() - 1;
     auto result   = std::move(table_offsets.front());
diff --git a/cpp/src/text/jaccard.cu b/cpp/src/text/jaccard.cu
index e856b89b836..2de94a4eb59 100644
--- a/cpp/src/text/jaccard.cu
+++ b/cpp/src/text/jaccard.cu
@@ -26,6 +26,7 @@
 #include <cudf/hashing/detail/murmurhash3_x86_32.cuh>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <nvtext/jaccard.hpp>
@@ -33,7 +34,6 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cub/cub.cuh>
 #include <thrust/binary_search.h>
diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu
index 4318123627d..605582f28a6 100644
--- a/cpp/src/text/minhash.cu
+++ b/cpp/src/text/minhash.cu
@@ -28,12 +28,12 @@
 #include <cudf/strings/string_view.cuh>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <nvtext/minhash.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/atomic>
 #include <thrust/execution_policy.h>
diff --git a/cpp/src/text/ngrams_tokenize.cu b/cpp/src/text/ngrams_tokenize.cu
index 95dd8ff3d6c..eee293268a2 100644
--- a/cpp/src/text/ngrams_tokenize.cu
+++ b/cpp/src/text/ngrams_tokenize.cu
@@ -27,13 +27,13 @@
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <nvtext/detail/tokenize.hpp>
 #include <nvtext/ngrams_tokenize.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <thrust/for_each.h>
@@ -166,7 +166,7 @@ std::unique_ptr<cudf::column> ngrams_tokenize(cudf::strings_column_view const& s
   auto const count_itr =
     cudf::detail::make_counting_transform_iterator(0, strings_tokenizer{d_strings, d_delimiter});
   auto [token_offsets, total_tokens] = cudf::strings::detail::make_offsets_child_column(
-    count_itr, count_itr + strings_count, stream, rmm::mr::get_current_device_resource());
+    count_itr, count_itr + strings_count, stream, cudf::get_current_device_resource_ref());
   auto d_token_offsets =
     cudf::detail::offsetalator_factory::make_input_iterator(token_offsets->view());
 
@@ -191,7 +191,7 @@ std::unique_ptr<cudf::column> ngrams_tokenize(cudf::strings_column_view const& s
         return (token_count >= ngrams) ? token_count - ngrams + 1 : 0;
       }));
   auto [ngram_offsets, total_ngrams] = cudf::detail::make_offsets_child_column(
-    ngram_counts, ngram_counts + strings_count, stream, rmm::mr::get_current_device_resource());
+    ngram_counts, ngram_counts + strings_count, stream, cudf::get_current_device_resource_ref());
   auto d_ngram_offsets = ngram_offsets->view().begin<cudf::size_type>();
 
   // Compute the total size of the ngrams for each string (not for each ngram)
@@ -207,7 +207,7 @@ std::unique_ptr<cudf::column> ngrams_tokenize(cudf::strings_column_view const& s
   auto const sizes_itr = cudf::detail::make_counting_transform_iterator(
     0, ngram_builder_fn{d_strings, d_separator, ngrams, d_token_offsets, d_token_positions});
   auto [chars_offsets, output_chars_size] = cudf::strings::detail::make_offsets_child_column(
-    sizes_itr, sizes_itr + strings_count, stream, rmm::mr::get_current_device_resource());
+    sizes_itr, sizes_itr + strings_count, stream, cudf::get_current_device_resource_ref());
   auto d_chars_offsets =
     cudf::detail::offsetalator_factory::make_input_iterator(chars_offsets->view());
 
diff --git a/cpp/src/text/normalize.cu b/cpp/src/text/normalize.cu
index 4db11dc5beb..7e2b766862d 100644
--- a/cpp/src/text/normalize.cu
+++ b/cpp/src/text/normalize.cu
@@ -32,11 +32,11 @@
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <nvtext/normalize.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/execution_policy.h>
 #include <thrust/for_each.h>
diff --git a/cpp/src/text/replace.cu b/cpp/src/text/replace.cu
index 81c787caf86..943bcbe9b3a 100644
--- a/cpp/src/text/replace.cu
+++ b/cpp/src/text/replace.cu
@@ -28,11 +28,11 @@
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <nvtext/replace.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/atomic>
 #include <thrust/binary_search.h>
diff --git a/cpp/src/text/stemmer.cu b/cpp/src/text/stemmer.cu
index 4746b6b74b9..379e68b891b 100644
--- a/cpp/src/text/stemmer.cu
+++ b/cpp/src/text/stemmer.cu
@@ -25,12 +25,12 @@
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <nvtext/stemmer.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/for_each.h>
 #include <thrust/iterator/constant_iterator.h>
diff --git a/cpp/src/text/subword/load_hash_file.cu b/cpp/src/text/subword/load_hash_file.cu
index a08fdea3e84..eca703e2604 100644
--- a/cpp/src/text/subword/load_hash_file.cu
+++ b/cpp/src/text/subword/load_hash_file.cu
@@ -22,13 +22,13 @@
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <nvtext/detail/load_hash_file.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/fill.h>
 
diff --git a/cpp/src/text/subword/subword_tokenize.cu b/cpp/src/text/subword/subword_tokenize.cu
index e05427eb6ac..d7e04a0c208 100644
--- a/cpp/src/text/subword/subword_tokenize.cu
+++ b/cpp/src/text/subword/subword_tokenize.cu
@@ -25,13 +25,13 @@
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <nvtext/detail/load_hash_file.hpp>
 #include <nvtext/subword_tokenize.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/for_each.h>
 #include <thrust/functional.h>
diff --git a/cpp/src/text/tokenize.cu b/cpp/src/text/tokenize.cu
index 3ce6064d9c2..df25950e6d5 100644
--- a/cpp/src/text/tokenize.cu
+++ b/cpp/src/text/tokenize.cu
@@ -27,6 +27,7 @@
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <nvtext/detail/tokenize.hpp>
 #include <nvtext/tokenize.hpp>
@@ -34,7 +35,6 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/atomic>
 #include <thrust/copy.h>
@@ -79,14 +79,14 @@ std::unique_ptr<cudf::column> tokenize_fn(cudf::size_type strings_count,
 {
   // get the number of tokens in each string
   auto const token_counts =
-    token_count_fn(strings_count, tokenizer, stream, rmm::mr::get_current_device_resource());
+    token_count_fn(strings_count, tokenizer, stream, cudf::get_current_device_resource_ref());
   auto d_token_counts = token_counts->view();
   // create token-index offsets from the counts
   auto [token_offsets, total_tokens] =
     cudf::detail::make_offsets_child_column(d_token_counts.template begin<cudf::size_type>(),
                                             d_token_counts.template end<cudf::size_type>(),
                                             stream,
-                                            rmm::mr::get_current_device_resource());
+                                            cudf::get_current_device_resource_ref());
   //  build a list of pointers to each token
   rmm::device_uvector<string_index_pair> tokens(total_tokens, stream);
   // now go get the tokens
diff --git a/cpp/src/text/vocabulary_tokenize.cu b/cpp/src/text/vocabulary_tokenize.cu
index 5945921ed9d..a2297987732 100644
--- a/cpp/src/text/vocabulary_tokenize.cu
+++ b/cpp/src/text/vocabulary_tokenize.cu
@@ -32,11 +32,11 @@
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <nvtext/tokenize.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cub/cub.cuh>
 #include <cuco/static_map.cuh>
diff --git a/cpp/src/transform/bools_to_mask.cu b/cpp/src/transform/bools_to_mask.cu
index 452aebf4428..f365d690fde 100644
--- a/cpp/src/transform/bools_to_mask.cu
+++ b/cpp/src/transform/bools_to_mask.cu
@@ -23,11 +23,11 @@
 #include <cudf/null_mask.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace detail {
diff --git a/cpp/src/transform/compute_column.cu b/cpp/src/transform/compute_column.cu
index c4fc8d58552..93105b321dd 100644
--- a/cpp/src/transform/compute_column.cu
+++ b/cpp/src/transform/compute_column.cu
@@ -30,11 +30,11 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace detail {
diff --git a/cpp/src/transform/encode.cu b/cpp/src/transform/encode.cu
index 1c9d52bce1b..cffb77ba776 100644
--- a/cpp/src/transform/encode.cu
+++ b/cpp/src/transform/encode.cu
@@ -27,11 +27,11 @@
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/bit.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <memory>
 #include <numeric>
diff --git a/cpp/src/transform/mask_to_bools.cu b/cpp/src/transform/mask_to_bools.cu
index be0b80a2633..fe1f6674e8b 100644
--- a/cpp/src/transform/mask_to_bools.cu
+++ b/cpp/src/transform/mask_to_bools.cu
@@ -21,10 +21,10 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/bit.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/constant_iterator.h>
 #include <thrust/iterator/counting_iterator.h>
diff --git a/cpp/src/transform/nans_to_nulls.cu b/cpp/src/transform/nans_to_nulls.cu
index a24ba304004..adb8852c6e6 100644
--- a/cpp/src/transform/nans_to_nulls.cu
+++ b/cpp/src/transform/nans_to_nulls.cu
@@ -22,11 +22,11 @@
 #include <cudf/null_mask.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 
diff --git a/cpp/src/transform/one_hot_encode.cu b/cpp/src/transform/one_hot_encode.cu
index 46e6e55b0b7..e1a784a985e 100644
--- a/cpp/src/transform/one_hot_encode.cu
+++ b/cpp/src/transform/one_hot_encode.cu
@@ -24,12 +24,12 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_checks.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/transform.h>
diff --git a/cpp/src/transform/row_bit_count.cu b/cpp/src/transform/row_bit_count.cu
index 6a965d10184..66bbe532e46 100644
--- a/cpp/src/transform/row_bit_count.cu
+++ b/cpp/src/transform/row_bit_count.cu
@@ -28,11 +28,11 @@
 #include <cudf/transform.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <cuda/functional>
 #include <cuda/std/optional>
@@ -526,7 +526,7 @@ std::unique_ptr<column> segmented_row_bit_count(table_view const& t,
 
   // move stack info to the gpu
   rmm::device_uvector<column_info> d_info =
-    cudf::detail::make_device_uvector_async(info, stream, rmm::mr::get_current_device_resource());
+    cudf::detail::make_device_uvector_async(info, stream, cudf::get_current_device_resource_ref());
 
   // each thread needs to maintain a stack of row spans of size max_branch_depth. we will use
   // shared memory to do this rather than allocating a potentially gigantic temporary buffer
diff --git a/cpp/src/transform/transform.cpp b/cpp/src/transform/transform.cpp
index f5e9048fa0a..52b96bc9039 100644
--- a/cpp/src/transform/transform.cpp
+++ b/cpp/src/transform/transform.cpp
@@ -24,11 +24,11 @@
 #include <cudf/detail/transform.hpp>
 #include <cudf/null_mask.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <jit_preprocessed_files/transform/jit/kernel.cu.jit.hpp>
 
diff --git a/cpp/src/transpose/transpose.cu b/cpp/src/transpose/transpose.cu
index abde43535be..810fd8afd73 100644
--- a/cpp/src/transpose/transpose.cu
+++ b/cpp/src/transpose/transpose.cu
@@ -22,11 +22,11 @@
 #include <cudf/table/table_device_view.cuh>
 #include <cudf/transpose.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
diff --git a/cpp/src/unary/cast_ops.cu b/cpp/src/unary/cast_ops.cu
index ec21813705a..0913796a527 100644
--- a/cpp/src/unary/cast_ops.cu
+++ b/cpp/src/unary/cast_ops.cu
@@ -27,12 +27,12 @@
 #include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/unary.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/transform.h>
 
diff --git a/cpp/src/unary/math_ops.cu b/cpp/src/unary/math_ops.cu
index ab17da5f8c4..1d506c59cd9 100644
--- a/cpp/src/unary/math_ops.cu
+++ b/cpp/src/unary/math_ops.cu
@@ -22,10 +22,10 @@
 #include <cudf/dictionary/detail/encode.hpp>
 #include <cudf/dictionary/detail/iterator.cuh>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/transform.h>
 
@@ -349,7 +349,7 @@ std::unique_ptr<cudf::column> transform_fn(cudf::dictionary_column_view const& i
 {
   auto dictionary_view = cudf::column_device_view::create(input.parent(), stream);
   auto dictionary_itr  = dictionary::detail::make_dictionary_iterator<T>(*dictionary_view);
-  auto default_mr      = rmm::mr::get_current_device_resource();
+  auto default_mr      = cudf::get_current_device_resource_ref();
   // call unary-op using temporary output buffer
   auto output = transform_fn<T, UFN>(dictionary_itr,
                                      dictionary_itr + input.size(),
diff --git a/cpp/src/unary/nan_ops.cu b/cpp/src/unary/nan_ops.cu
index 08aa8755624..17a90a14248 100644
--- a/cpp/src/unary/nan_ops.cu
+++ b/cpp/src/unary/nan_ops.cu
@@ -21,10 +21,10 @@
 #include <cudf/detail/unary.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 namespace detail {
diff --git a/cpp/src/unary/null_ops.cu b/cpp/src/unary/null_ops.cu
index a223a090128..f6514ea265b 100644
--- a/cpp/src/unary/null_ops.cu
+++ b/cpp/src/unary/null_ops.cu
@@ -18,8 +18,7 @@
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/unary.hpp>
 #include <cudf/utilities/default_stream.hpp>
-
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 
diff --git a/cpp/src/unary/unary_ops.cuh b/cpp/src/unary/unary_ops.cuh
index 61c41705665..34a20d88f37 100644
--- a/cpp/src/unary/unary_ops.cuh
+++ b/cpp/src/unary/unary_ops.cuh
@@ -21,10 +21,10 @@
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/unary.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <thrust/transform.h>
 
diff --git a/cpp/src/utilities/host_memory.cpp b/cpp/src/utilities/host_memory.cpp
index 7c3cea42023..125b98c4a67 100644
--- a/cpp/src/utilities/host_memory.cpp
+++ b/cpp/src/utilities/host_memory.cpp
@@ -18,12 +18,12 @@
 #include <cudf/detail/utilities/stream_pool.hpp>
 #include <cudf/utilities/error.hpp>
 #include <cudf/utilities/export.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/pinned_memory.hpp>
 
 #include <rmm/cuda_device.hpp>
 #include <rmm/mr/device/pool_memory_resource.hpp>
 #include <rmm/mr/pinned_host_memory_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 
diff --git a/cpp/tests/bitmask/bitmask_tests.cpp b/cpp/tests/bitmask/bitmask_tests.cpp
index 4bf648bed5a..fe221fb1c48 100644
--- a/cpp/tests/bitmask/bitmask_tests.cpp
+++ b/cpp/tests/bitmask/bitmask_tests.cpp
@@ -28,6 +28,7 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
@@ -90,7 +91,7 @@ rmm::device_uvector<cudf::bitmask_type> make_mask(cudf::size_type size, bool fil
 {
   if (!fill_valid) {
     return cudf::detail::make_zeroed_device_uvector_sync<cudf::bitmask_type>(
-      size, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+      size, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   } else {
     auto ret = rmm::device_uvector<cudf::bitmask_type>(size, cudf::get_default_stream());
     CUDF_CUDA_TRY(cudaMemsetAsync(ret.data(),
diff --git a/cpp/tests/bitmask/valid_if_tests.cu b/cpp/tests/bitmask/valid_if_tests.cu
index 65143ec17f1..96f122f21a8 100644
--- a/cpp/tests/bitmask/valid_if_tests.cu
+++ b/cpp/tests/bitmask/valid_if_tests.cu
@@ -22,6 +22,7 @@
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/valid_if.cuh>
 #include <cudf/types.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 
@@ -43,7 +44,7 @@ TEST_F(ValidIfTest, EmptyRange)
                                        thrust::make_counting_iterator(0),
                                        odds_valid{},
                                        cudf::get_default_stream(),
-                                       rmm::mr::get_current_device_resource());
+                                       cudf::get_current_device_resource_ref());
   auto const& buffer = actual.first;
   EXPECT_EQ(0u, buffer.size());
   EXPECT_EQ(nullptr, buffer.data());
@@ -56,7 +57,7 @@ TEST_F(ValidIfTest, InvalidRange)
                                       thrust::make_counting_iterator(0),
                                       odds_valid{},
                                       cudf::get_default_stream(),
-                                      rmm::mr::get_current_device_resource()),
+                                      cudf::get_current_device_resource_ref()),
                cudf::logic_error);
 }
 
@@ -68,7 +69,7 @@ TEST_F(ValidIfTest, OddsValid)
                                        thrust::make_counting_iterator(10000),
                                        odds_valid{},
                                        cudf::get_default_stream(),
-                                       rmm::mr::get_current_device_resource());
+                                       cudf::get_current_device_resource_ref());
   CUDF_TEST_EXPECT_EQUAL_BUFFERS(expected.first.data(), actual.first.data(), expected.first.size());
   EXPECT_EQ(5000, actual.second);
   EXPECT_EQ(expected.second, actual.second);
@@ -82,7 +83,7 @@ TEST_F(ValidIfTest, AllValid)
                                        thrust::make_counting_iterator(10000),
                                        all_valid{},
                                        cudf::get_default_stream(),
-                                       rmm::mr::get_current_device_resource());
+                                       cudf::get_current_device_resource_ref());
   CUDF_TEST_EXPECT_EQUAL_BUFFERS(expected.first.data(), actual.first.data(), expected.first.size());
   EXPECT_EQ(0, actual.second);
   EXPECT_EQ(expected.second, actual.second);
@@ -96,7 +97,7 @@ TEST_F(ValidIfTest, AllNull)
                                        thrust::make_counting_iterator(10000),
                                        all_null{},
                                        cudf::get_default_stream(),
-                                       rmm::mr::get_current_device_resource());
+                                       cudf::get_current_device_resource_ref());
   CUDF_TEST_EXPECT_EQUAL_BUFFERS(expected.first.data(), actual.first.data(), expected.first.size());
   EXPECT_EQ(10000, actual.second);
   EXPECT_EQ(expected.second, actual.second);
diff --git a/cpp/tests/column/column_test.cpp b/cpp/tests/column/column_test.cpp
index 1ba9b14dc1f..14b4197de71 100644
--- a/cpp/tests/column/column_test.cpp
+++ b/cpp/tests/column/column_test.cpp
@@ -33,6 +33,7 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <numeric>
 #include <random>
@@ -373,7 +374,7 @@ TYPED_TEST(TypedColumnTest, DeviceUvectorConstructorNoMask)
                                                  this->num_elements());
 
   auto original = cudf::detail::make_device_uvector_async(
-    data, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    data, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto original_data = original.data();
   cudf::column moved_to{std::move(original), rmm::device_buffer{}, 0};
   verify_column_views(moved_to);
@@ -389,7 +390,7 @@ TYPED_TEST(TypedColumnTest, DeviceUvectorConstructorWithMask)
                                                  this->num_elements());
 
   auto original = cudf::detail::make_device_uvector_async(
-    data, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    data, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto original_data = original.data();
   auto original_mask = this->all_valid_mask.data();
   cudf::column moved_to{std::move(original), std::move(this->all_valid_mask), 0};
diff --git a/cpp/tests/copying/detail_gather_tests.cu b/cpp/tests/copying/detail_gather_tests.cu
index 17ced5ccd34..b9ae91afd1e 100644
--- a/cpp/tests/copying/detail_gather_tests.cu
+++ b/cpp/tests/copying/detail_gather_tests.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -29,6 +29,7 @@
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/device_uvector.hpp>
 
@@ -62,7 +63,7 @@ TYPED_TEST(GatherTest, GatherDetailDeviceVectorTest)
                            gather_map.end(),
                            cudf::out_of_bounds_policy::DONT_CHECK,
                            cudf::get_default_stream(),
-                           rmm::mr::get_current_device_resource());
+                           cudf::get_current_device_resource_ref());
 
     for (auto i = 0; i < source_table.num_columns(); ++i) {
       CUDF_TEST_EXPECT_COLUMNS_EQUAL(source_table.column(i), result->view().column(i));
@@ -79,7 +80,7 @@ TYPED_TEST(GatherTest, GatherDetailDeviceVectorTest)
                            gather_map.data() + gather_map.size(),
                            cudf::out_of_bounds_policy::DONT_CHECK,
                            cudf::get_default_stream(),
-                           rmm::mr::get_current_device_resource());
+                           cudf::get_current_device_resource_ref());
 
     for (auto i = 0; i < source_table.num_columns(); ++i) {
       CUDF_TEST_EXPECT_COLUMNS_EQUAL(source_table.column(i), result->view().column(i));
@@ -107,7 +108,7 @@ TYPED_TEST(GatherTest, GatherDetailInvalidIndexTest)
                          cudf::out_of_bounds_policy::NULLIFY,
                          cudf::detail::negative_index_policy::NOT_ALLOWED,
                          cudf::get_default_stream(),
-                         rmm::mr::get_current_device_resource());
+                         cudf::get_current_device_resource_ref());
 
   auto expect_data =
     cudf::detail::make_counting_transform_iterator(0, [](auto i) { return (i % 2) ? 0 : i; });
diff --git a/cpp/tests/copying/gather_str_tests.cpp b/cpp/tests/copying/gather_str_tests.cpp
index b31f34504e7..28098878086 100644
--- a/cpp/tests/copying/gather_str_tests.cpp
+++ b/cpp/tests/copying/gather_str_tests.cpp
@@ -24,8 +24,7 @@
 #include <cudf/detail/gather.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 class GatherTestStr : public cudf::test::BaseFixture {};
 
@@ -91,7 +90,7 @@ TEST_F(GatherTestStr, Gather)
                                       cudf::out_of_bounds_policy::NULLIFY,
                                       cudf::detail::negative_index_policy::NOT_ALLOWED,
                                       cudf::get_default_stream(),
-                                      rmm::mr::get_current_device_resource());
+                                      cudf::get_current_device_resource_ref());
 
   std::vector<char const*> h_expected;
   std::vector<int32_t> expected_validity;
@@ -122,7 +121,7 @@ TEST_F(GatherTestStr, GatherDontCheckOutOfBounds)
                                       cudf::out_of_bounds_policy::DONT_CHECK,
                                       cudf::detail::negative_index_policy::NOT_ALLOWED,
                                       cudf::get_default_stream(),
-                                      rmm::mr::get_current_device_resource());
+                                      cudf::get_current_device_resource_ref());
 
   std::vector<char const*> h_expected;
   for (int itr : h_map) {
@@ -141,7 +140,7 @@ TEST_F(GatherTestStr, GatherEmptyMapStringsColumn)
                                       cudf::out_of_bounds_policy::NULLIFY,
                                       cudf::detail::negative_index_policy::NOT_ALLOWED,
                                       cudf::get_default_stream(),
-                                      rmm::mr::get_current_device_resource());
+                                      cudf::get_current_device_resource_ref());
   cudf::test::expect_column_empty(results->get_column(0).view());
 }
 
@@ -155,6 +154,6 @@ TEST_F(GatherTestStr, GatherZeroSizeStringsColumn)
                                       cudf::out_of_bounds_policy::NULLIFY,
                                       cudf::detail::negative_index_policy::NOT_ALLOWED,
                                       cudf::get_default_stream(),
-                                      rmm::mr::get_current_device_resource());
+                                      cudf::get_current_device_resource_ref());
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, results->get_column(0).view());
 }
diff --git a/cpp/tests/copying/shift_tests.cpp b/cpp/tests/copying/shift_tests.cpp
index 01ad4f2247c..ff6808d9a79 100644
--- a/cpp/tests/copying/shift_tests.cpp
+++ b/cpp/tests/copying/shift_tests.cpp
@@ -23,10 +23,10 @@
 #include <cudf/copying.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <limits>
 #include <memory>
@@ -37,7 +37,7 @@ using TestTypes = cudf::test::Types<int32_t>;
 template <typename T, typename ScalarType = cudf::scalar_type_t<T>>
 std::unique_ptr<cudf::scalar> make_scalar(
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref())
 {
   auto s = new ScalarType(cudf::test::make_type_param_scalar<T>(0), false, stream, mr);
   return std::unique_ptr<cudf::scalar>(s);
@@ -47,7 +47,7 @@ template <typename T, typename ScalarType = cudf::scalar_type_t<T>>
 std::unique_ptr<cudf::scalar> make_scalar(
   T value,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref())
 {
   auto s = new ScalarType(value, true, stream, mr);
   return std::unique_ptr<cudf::scalar>(s);
diff --git a/cpp/tests/copying/split_tests.cpp b/cpp/tests/copying/split_tests.cpp
index 7ff159cf896..ee3e7da5e0f 100644
--- a/cpp/tests/copying/split_tests.cpp
+++ b/cpp/tests/copying/split_tests.cpp
@@ -28,6 +28,7 @@
 #include <cudf/copying.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/filling.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/device_buffer.hpp>
 
@@ -1383,7 +1384,7 @@ struct ContiguousSplitTest : public cudf::test::BaseFixture {};
 
 std::vector<cudf::packed_table> do_chunked_pack(cudf::table_view const& input)
 {
-  auto mr = rmm::mr::get_current_device_resource();
+  auto mr = cudf::get_current_device_resource_ref();
 
   rmm::device_buffer bounce_buff(1 * 1024 * 1024, cudf::get_default_stream(), mr);
   auto bounce_buff_span =
@@ -2383,7 +2384,7 @@ TEST_F(ContiguousSplitTableCornerCases, ChunkSpanTooSmall)
 {
   auto chunked_pack = cudf::chunked_pack::create({}, 1 * 1024 * 1024);
   rmm::device_buffer buff(
-    1 * 1024, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource());
+    1 * 1024, cudf::test::get_default_stream(), cudf::get_current_device_resource_ref());
   cudf::device_span<uint8_t> too_small(static_cast<uint8_t*>(buff.data()), buff.size());
   std::size_t copied = 0;
   // throws because we created chunked_contig_split with 1MB, but we are giving
@@ -2396,7 +2397,7 @@ TEST_F(ContiguousSplitTableCornerCases, EmptyTableHasNextFalse)
 {
   auto chunked_pack = cudf::chunked_pack::create({}, 1 * 1024 * 1024);
   rmm::device_buffer buff(
-    1 * 1024 * 1024, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource());
+    1 * 1024 * 1024, cudf::test::get_default_stream(), cudf::get_current_device_resource_ref());
   cudf::device_span<uint8_t> bounce_buff(static_cast<uint8_t*>(buff.data()), buff.size());
   EXPECT_EQ(chunked_pack->has_next(), false);  // empty input table
   std::size_t copied = 0;
@@ -2409,7 +2410,7 @@ TEST_F(ContiguousSplitTableCornerCases, ExhaustedHasNextFalse)
   cudf::test::strings_column_wrapper a{"abc", "def", "ghi", "jkl", "mno", "", "st", "uvwx"};
   cudf::table_view t({a});
   rmm::device_buffer buff(
-    1 * 1024 * 1024, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource());
+    1 * 1024 * 1024, cudf::test::get_default_stream(), cudf::get_current_device_resource_ref());
   cudf::device_span<uint8_t> bounce_buff(static_cast<uint8_t*>(buff.data()), buff.size());
   auto chunked_pack = cudf::chunked_pack::create(t, buff.size());
   EXPECT_EQ(chunked_pack->has_next(), true);
diff --git a/cpp/tests/device_atomics/device_atomics_test.cu b/cpp/tests/device_atomics/device_atomics_test.cu
index ccf5ccae187..b81f8196d89 100644
--- a/cpp/tests/device_atomics/device_atomics_test.cu
+++ b/cpp/tests/device_atomics/device_atomics_test.cu
@@ -23,6 +23,7 @@
 #include <cudf/detail/utilities/device_atomics.cuh>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/traits.hpp>
 #include <cudf/wrappers/timestamps.hpp>
 
@@ -144,9 +145,9 @@ struct AtomicsTest : public cudf::test::BaseFixture {
     result_init[5] = result_init[2];
 
     auto dev_data = cudf::detail::make_device_uvector_sync(
-      v, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+      v, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
     auto dev_result = cudf::detail::make_device_uvector_sync(
-      result_init, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+      result_init, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
 
     if (block_size == 0) { block_size = vec_size; }
 
diff --git a/cpp/tests/dictionary/search_test.cpp b/cpp/tests/dictionary/search_test.cpp
index 1b73576e083..25501b4fde7 100644
--- a/cpp/tests/dictionary/search_test.cpp
+++ b/cpp/tests/dictionary/search_test.cpp
@@ -20,6 +20,7 @@
 
 #include <cudf/dictionary/detail/search.hpp>
 #include <cudf/dictionary/search.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 struct DictionarySearchTest : public cudf::test::BaseFixture {};
 
@@ -39,7 +40,7 @@ TEST_F(DictionarySearchTest, StringsColumn)
   result   = cudf::dictionary::detail::get_insert_index(dictionary,
                                                       cudf::string_scalar("eee"),
                                                       cudf::get_default_stream(),
-                                                      rmm::mr::get_current_device_resource());
+                                                      cudf::get_current_device_resource_ref());
   n_result = dynamic_cast<cudf::numeric_scalar<uint32_t>*>(result.get());
   EXPECT_EQ(uint32_t{5}, n_result->value());
 }
@@ -59,7 +60,7 @@ TEST_F(DictionarySearchTest, WithNulls)
   result   = cudf::dictionary::detail::get_insert_index(dictionary,
                                                       cudf::numeric_scalar<int64_t>(5),
                                                       cudf::get_default_stream(),
-                                                      rmm::mr::get_current_device_resource());
+                                                      cudf::get_current_device_resource_ref());
   n_result = dynamic_cast<cudf::numeric_scalar<uint32_t>*>(result.get());
   EXPECT_EQ(uint32_t{1}, n_result->value());
 }
@@ -71,7 +72,7 @@ TEST_F(DictionarySearchTest, EmptyColumn)
   auto result = cudf::dictionary::get_index(dictionary, key);
   EXPECT_FALSE(result->is_valid());
   result = cudf::dictionary::detail::get_insert_index(
-    dictionary, key, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    dictionary, key, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   EXPECT_FALSE(result->is_valid());
 }
 
@@ -82,6 +83,6 @@ TEST_F(DictionarySearchTest, Errors)
   EXPECT_THROW(cudf::dictionary::get_index(dictionary, key), cudf::data_type_error);
   EXPECT_THROW(
     cudf::dictionary::detail::get_insert_index(
-      dictionary, key, cudf::get_default_stream(), rmm::mr::get_current_device_resource()),
+      dictionary, key, cudf::get_default_stream(), cudf::get_current_device_resource_ref()),
     cudf::data_type_error);
 }
diff --git a/cpp/tests/fixed_point/fixed_point_tests.cu b/cpp/tests/fixed_point/fixed_point_tests.cu
index 24b4e335840..f34760341d8 100644
--- a/cpp/tests/fixed_point/fixed_point_tests.cu
+++ b/cpp/tests/fixed_point/fixed_point_tests.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -22,6 +22,7 @@
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/fixed_point/fixed_point.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/exec_policy.hpp>
@@ -82,7 +83,7 @@ TEST_F(FixedPointTest, DecimalXXThrustOnDevice)
 
   std::vector<decimal32> vec1(1000, decimal32{1, scale_type{-2}});
   auto d_vec1 = cudf::detail::make_device_uvector_sync(
-    vec1, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    vec1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
 
   auto const sum = thrust::reduce(rmm::exec_policy(cudf::get_default_stream()),
                                   std::cbegin(d_vec1),
@@ -96,7 +97,7 @@ TEST_F(FixedPointTest, DecimalXXThrustOnDevice)
   thrust::inclusive_scan(std::cbegin(vec1), std::cend(vec1), std::begin(vec1));
 
   d_vec1 = cudf::detail::make_device_uvector_sync(
-    vec1, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    vec1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
 
   std::vector<int32_t> vec2(1000);
   std::iota(std::begin(vec2), std::end(vec2), 1);
diff --git a/cpp/tests/groupby/histogram_tests.cpp b/cpp/tests/groupby/histogram_tests.cpp
index 612486d8e5c..2d447025919 100644
--- a/cpp/tests/groupby/histogram_tests.cpp
+++ b/cpp/tests/groupby/histogram_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,6 +24,7 @@
 #include <cudf/groupby.hpp>
 #include <cudf/lists/sorting.hpp>
 #include <cudf/sorting.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 using int32s_col  = cudf::test::fixed_width_column_wrapper<int32_t>;
 using int64s_col  = cudf::test::fixed_width_column_wrapper<int64_t>;
@@ -68,7 +69,7 @@ auto groupby_histogram(cudf::column_view const& keys,
                                                    cudf::order::ASCENDING,
                                                    cudf::null_order::BEFORE,
                                                    cudf::get_default_stream(),
-                                                   rmm::mr::get_current_device_resource());
+                                                   cudf::get_current_device_resource_ref());
 
   return std::pair{std::move(sorted_keys), std::move(sorted_histograms)};
 }
diff --git a/cpp/tests/groupby/tdigest_tests.cu b/cpp/tests/groupby/tdigest_tests.cu
index 97edc1c45a7..baa59026b07 100644
--- a/cpp/tests/groupby/tdigest_tests.cu
+++ b/cpp/tests/groupby/tdigest_tests.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,6 +24,7 @@
 #include <cudf/detail/tdigest/tdigest.hpp>
 #include <cudf/tdigest/tdigest_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <thrust/copy.h>
 #include <thrust/fill.h>
@@ -468,16 +469,16 @@ TEST_F(TDigestMergeTest, EmptyGroups)
   cudf::test::fixed_width_column_wrapper<int> keys{0, 0, 0, 0, 0, 0, 0};
   int const delta = 1000;
 
-  auto a = cudf::tdigest::detail::make_empty_tdigest_column(cudf::get_default_stream(),
-                                                            rmm::mr::get_current_device_resource());
+  auto a = cudf::tdigest::detail::make_empty_tdigest_column(
+    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto b = cudf::type_dispatcher(
     static_cast<cudf::column_view>(values_b).type(), tdigest_gen_grouped{}, keys, values_b, delta);
-  auto c = cudf::tdigest::detail::make_empty_tdigest_column(cudf::get_default_stream(),
-                                                            rmm::mr::get_current_device_resource());
+  auto c = cudf::tdigest::detail::make_empty_tdigest_column(
+    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto d = cudf::type_dispatcher(
     static_cast<cudf::column_view>(values_d).type(), tdigest_gen_grouped{}, keys, values_d, delta);
-  auto e = cudf::tdigest::detail::make_empty_tdigest_column(cudf::get_default_stream(),
-                                                            rmm::mr::get_current_device_resource());
+  auto e = cudf::tdigest::detail::make_empty_tdigest_column(
+    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
 
   std::vector<cudf::column_view> cols;
   cols.push_back(*a);
diff --git a/cpp/tests/io/json/json_chunked_reader.cu b/cpp/tests/io/json/json_chunked_reader.cu
index b9dee54752c..c9ee6542a4d 100644
--- a/cpp/tests/io/json/json_chunked_reader.cu
+++ b/cpp/tests/io/json/json_chunked_reader.cu
@@ -22,7 +22,7 @@
 #include <cudf_test/cudf_gtest.hpp>
 #include <cudf_test/table_utilities.hpp>
 
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <fstream>
 #include <string>
@@ -63,7 +63,7 @@ TEST_F(JsonReaderTest, ByteRange_SingleSource)
                                                  json_lines_options,
                                                  chunk_size,
                                                  cudf::get_default_stream(),
-                                                 rmm::mr::get_current_device_resource());
+                                                 cudf::get_current_device_resource_ref());
 
     auto table_views = std::vector<cudf::table_view>(tables.size());
     std::transform(tables.begin(), tables.end(), table_views.begin(), [](auto& table) {
@@ -158,7 +158,7 @@ TEST_F(JsonReaderTest, ByteRange_MultiSource)
                                                  json_lines_options,
                                                  chunk_size,
                                                  cudf::get_default_stream(),
-                                                 rmm::mr::get_current_device_resource());
+                                                 cudf::get_current_device_resource_ref());
 
     auto table_views = std::vector<cudf::table_view>(tables.size());
     std::transform(tables.begin(), tables.end(), table_views.begin(), [](auto& table) {
diff --git a/cpp/tests/io/json/json_quote_normalization_test.cpp b/cpp/tests/io/json/json_quote_normalization_test.cpp
index 3a9ba8d9f3b..d23acf3ae00 100644
--- a/cpp/tests/io/json/json_quote_normalization_test.cpp
+++ b/cpp/tests/io/json/json_quote_normalization_test.cpp
@@ -25,6 +25,7 @@
 #include <cudf/io/detail/json.hpp>
 #include <cudf/io/json.hpp>
 #include <cudf/io/types.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/device_buffer.hpp>
 #include <rmm/mr/device/cuda_memory_resource.hpp>
@@ -43,7 +44,7 @@ void run_test(std::string const& host_input, std::string const& expected_host_ou
 
   auto stream_view  = cudf::test::get_default_stream();
   auto device_input = rmm::device_buffer(
-    host_input.c_str(), host_input.size(), stream_view, rmm::mr::get_current_device_resource());
+    host_input.c_str(), host_input.size(), stream_view, cudf::get_current_device_resource_ref());
 
   // Preprocessing FST
   cudf::io::datasource::owning_buffer<rmm::device_buffer> device_data(std::move(device_input));
diff --git a/cpp/tests/io/json/json_tree.cpp b/cpp/tests/io/json/json_tree.cpp
index 8bcd5790e99..875cc467b6a 100644
--- a/cpp/tests/io/json/json_tree.cpp
+++ b/cpp/tests/io/json/json_tree.cpp
@@ -26,6 +26,7 @@
 #include <cudf/hashing/detail/hashing.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream.hpp>
@@ -590,11 +591,11 @@ TEST_F(JsonTest, TreeRepresentation)
 
   // Parse the JSON and get the token stream
   auto const [tokens_gpu, token_indices_gpu] = cudf::io::json::detail::get_token_stream(
-    d_input, options, stream, rmm::mr::get_current_device_resource());
+    d_input, options, stream, cudf::get_current_device_resource_ref());
 
   // Get the JSON's tree representation
   auto gpu_tree = cuio_json::detail::get_tree_representation(
-    tokens_gpu, token_indices_gpu, false, stream, rmm::mr::get_current_device_resource());
+    tokens_gpu, token_indices_gpu, false, stream, cudf::get_current_device_resource_ref());
   // host tree generation
   auto cpu_tree = get_tree_representation_cpu(tokens_gpu, token_indices_gpu, options, stream);
   compare_trees(cpu_tree, gpu_tree);
@@ -678,11 +679,11 @@ TEST_F(JsonTest, TreeRepresentation2)
 
   // Parse the JSON and get the token stream
   auto const [tokens_gpu, token_indices_gpu] = cudf::io::json::detail::get_token_stream(
-    d_input, options, stream, rmm::mr::get_current_device_resource());
+    d_input, options, stream, cudf::get_current_device_resource_ref());
 
   // Get the JSON's tree representation
   auto gpu_tree = cuio_json::detail::get_tree_representation(
-    tokens_gpu, token_indices_gpu, false, stream, rmm::mr::get_current_device_resource());
+    tokens_gpu, token_indices_gpu, false, stream, cudf::get_current_device_resource_ref());
   // host tree generation
   auto cpu_tree = get_tree_representation_cpu(tokens_gpu, token_indices_gpu, options, stream);
   compare_trees(cpu_tree, gpu_tree);
@@ -753,11 +754,11 @@ TEST_F(JsonTest, TreeRepresentation3)
 
   // Parse the JSON and get the token stream
   auto const [tokens_gpu, token_indices_gpu] = cudf::io::json::detail::get_token_stream(
-    d_input, options, stream, rmm::mr::get_current_device_resource());
+    d_input, options, stream, cudf::get_current_device_resource_ref());
 
   // Get the JSON's tree representation
   auto gpu_tree = cuio_json::detail::get_tree_representation(
-    tokens_gpu, token_indices_gpu, false, stream, rmm::mr::get_current_device_resource());
+    tokens_gpu, token_indices_gpu, false, stream, cudf::get_current_device_resource_ref());
   // host tree generation
   auto cpu_tree = get_tree_representation_cpu(tokens_gpu, token_indices_gpu, options, stream);
   compare_trees(cpu_tree, gpu_tree);
@@ -779,13 +780,13 @@ TEST_F(JsonTest, TreeRepresentationError)
 
   // Parse the JSON and get the token stream
   auto const [tokens_gpu, token_indices_gpu] = cudf::io::json::detail::get_token_stream(
-    d_input, options, stream, rmm::mr::get_current_device_resource());
+    d_input, options, stream, cudf::get_current_device_resource_ref());
 
   // Get the JSON's tree representation
   // This JSON is invalid and will raise an exception.
   EXPECT_THROW(
     cuio_json::detail::get_tree_representation(
-      tokens_gpu, token_indices_gpu, false, stream, rmm::mr::get_current_device_resource()),
+      tokens_gpu, token_indices_gpu, false, stream, cudf::get_current_device_resource_ref()),
     cudf::logic_error);
 }
 
@@ -862,7 +863,7 @@ TEST_P(JsonTreeTraversalTest, CPUvsGPUTraversal)
 
   // Parse the JSON and get the token stream
   auto const [tokens_gpu, token_indices_gpu] = cudf::io::json::detail::get_token_stream(
-    d_input, options, stream, rmm::mr::get_current_device_resource());
+    d_input, options, stream, cudf::get_current_device_resource_ref());
   // host tree generation
   auto cpu_tree = get_tree_representation_cpu(tokens_gpu, token_indices_gpu, options, stream);
   bool const is_array_of_arrays =
@@ -875,7 +876,7 @@ TEST_P(JsonTreeTraversalTest, CPUvsGPUTraversal)
     records_orient_tree_traversal_cpu(input, cpu_tree, is_array_of_arrays, json_lines, stream);
   // gpu tree generation
   auto gpu_tree = cuio_json::detail::get_tree_representation(
-    tokens_gpu, token_indices_gpu, false, stream, rmm::mr::get_current_device_resource());
+    tokens_gpu, token_indices_gpu, false, stream, cudf::get_current_device_resource_ref());
 
 #if LIBCUDF_JSON_DEBUG_DUMP
   printf("BEFORE traversal (gpu_tree):\n");
@@ -889,7 +890,7 @@ TEST_P(JsonTreeTraversalTest, CPUvsGPUTraversal)
                                                      is_array_of_arrays,
                                                      json_lines,
                                                      stream,
-                                                     rmm::mr::get_current_device_resource());
+                                                     cudf::get_current_device_resource_ref());
 #if LIBCUDF_JSON_DEBUG_DUMP
   printf("AFTER  traversal (gpu_tree):\n");
   print_tree(gpu_tree);
diff --git a/cpp/tests/io/json/json_type_cast_test.cu b/cpp/tests/io/json/json_type_cast_test.cu
index fe430010f4b..c18d4189626 100644
--- a/cpp/tests/io/json/json_type_cast_test.cu
+++ b/cpp/tests/io/json/json_type_cast_test.cu
@@ -32,6 +32,7 @@
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/exec_policy.hpp>
 
@@ -73,7 +74,7 @@ auto default_json_options()
 TEST_F(JSONTypeCastTest, String)
 {
   auto const stream = cudf::get_default_stream();
-  auto mr           = rmm::mr::get_current_device_resource();
+  auto mr           = cudf::get_current_device_resource_ref();
   auto const type   = cudf::data_type{cudf::type_id::STRING};
 
   auto in_valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i != 4; });
@@ -110,7 +111,7 @@ TEST_F(JSONTypeCastTest, String)
 TEST_F(JSONTypeCastTest, Int)
 {
   auto const stream = cudf::get_default_stream();
-  auto mr           = rmm::mr::get_current_device_resource();
+  auto mr           = cudf::get_current_device_resource_ref();
   auto const type   = cudf::data_type{cudf::type_id::INT64};
 
   cudf::test::strings_column_wrapper data({"1", "null", "3", "true", "5", "false"});
@@ -141,7 +142,7 @@ TEST_F(JSONTypeCastTest, Int)
 TEST_F(JSONTypeCastTest, StringEscapes)
 {
   auto const stream = cudf::get_default_stream();
-  auto mr           = rmm::mr::get_current_device_resource();
+  auto mr           = cudf::get_current_device_resource_ref();
   auto const type   = cudf::data_type{cudf::type_id::STRING};
 
   cudf::test::strings_column_wrapper data({
@@ -183,7 +184,7 @@ TEST_F(JSONTypeCastTest, StringEscapes)
 TEST_F(JSONTypeCastTest, ErrorNulls)
 {
   auto const stream = cudf::get_default_stream();
-  auto mr           = rmm::mr::get_current_device_resource();
+  auto mr           = cudf::get_current_device_resource_ref();
   auto const type   = cudf::data_type{cudf::type_id::STRING};
 
   // error in decoding
diff --git a/cpp/tests/io/json/json_whitespace_normalization_test.cu b/cpp/tests/io/json/json_whitespace_normalization_test.cu
index 01dd17fab98..6d79fdc98ef 100644
--- a/cpp/tests/io/json/json_whitespace_normalization_test.cu
+++ b/cpp/tests/io/json/json_whitespace_normalization_test.cu
@@ -23,6 +23,7 @@
 #include <cudf/io/detail/json.hpp>
 #include <cudf/io/json.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -39,12 +40,12 @@ void run_test(std::string const& host_input, std::string const& expected_host_ou
   auto stream_view = cudf::test::get_default_stream();
 
   auto device_input = rmm::device_buffer(
-    host_input.c_str(), host_input.size(), stream_view, rmm::mr::get_current_device_resource());
+    host_input.c_str(), host_input.size(), stream_view, cudf::get_current_device_resource_ref());
 
   // Preprocessing FST
   cudf::io::datasource::owning_buffer<rmm::device_buffer> device_data(std::move(device_input));
   cudf::io::json::detail::normalize_whitespace(
-    device_data, stream_view, rmm::mr::get_current_device_resource());
+    device_data, stream_view, cudf::get_current_device_resource_ref());
 
   std::string preprocessed_host_output(device_data.size(), 0);
   CUDF_CUDA_TRY(cudaMemcpyAsync(preprocessed_host_output.data(),
diff --git a/cpp/tests/io/json/nested_json_test.cpp b/cpp/tests/io/json/nested_json_test.cpp
index 5dc25133719..327169ae563 100644
--- a/cpp/tests/io/json/nested_json_test.cpp
+++ b/cpp/tests/io/json/nested_json_test.cpp
@@ -32,6 +32,7 @@
 #include <cudf/lists/lists_column_view.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/exec_policy.hpp>
@@ -447,7 +448,7 @@ TEST_F(JsonNewlineDelimiterTest, TokenStream)
 
   // Parse the JSON and get the token stream
   auto [d_tokens_gpu, d_token_indices_gpu] = cuio_json::detail::get_token_stream(
-    d_input, default_options, stream, rmm::mr::get_current_device_resource());
+    d_input, default_options, stream, cudf::get_current_device_resource_ref());
   // Copy back the number of tokens that were written
   auto const tokens_gpu        = cudf::detail::make_std_vector_async(d_tokens_gpu, stream);
   auto const token_indices_gpu = cudf::detail::make_std_vector_async(d_token_indices_gpu, stream);
@@ -581,7 +582,7 @@ TEST_F(JsonNewlineDelimiterTest, TokenStream2)
 
   // Parse the JSON and get the token stream
   auto [d_tokens_gpu, d_token_indices_gpu] = cuio_json::detail::get_token_stream(
-    d_input, default_options, stream, rmm::mr::get_current_device_resource());
+    d_input, default_options, stream, cudf::get_current_device_resource_ref());
   // Copy back the number of tokens that were written
   auto const tokens_gpu        = cudf::detail::make_std_vector_async(d_tokens_gpu, stream);
   auto const token_indices_gpu = cudf::detail::make_std_vector_async(d_token_indices_gpu, stream);
@@ -639,7 +640,7 @@ TEST_F(JsonParserTest, ExtractColumn)
 
   // Prepare cuda stream for data transfers & kernels
   auto const stream = cudf::get_default_stream();
-  auto mr           = rmm::mr::get_current_device_resource();
+  auto mr           = cudf::get_current_device_resource_ref();
 
   // Default parsing options
   cudf::io::json_reader_options default_options{};
@@ -648,7 +649,7 @@ TEST_F(JsonParserTest, ExtractColumn)
   auto const d_input      = cudf::detail::make_device_uvector_async(
     cudf::host_span<char const>{input.c_str(), input.size()},
     stream,
-    rmm::mr::get_current_device_resource());
+    cudf::get_current_device_resource_ref());
   // Get the JSON's tree representation
   auto const cudf_table = json_parser(d_input, default_options, stream, mr);
 
@@ -739,7 +740,7 @@ TEST_P(JsonDelimiterParamTest, RecoveringTokenStream)
 
   // Parse the JSON and get the token stream
   auto [d_tokens_gpu, d_token_indices_gpu] = cuio_json::detail::get_token_stream(
-    d_input, default_options, stream, rmm::mr::get_current_device_resource());
+    d_input, default_options, stream, cudf::get_current_device_resource_ref());
   // Copy back the number of tokens that were written
   auto const tokens_gpu        = cudf::detail::make_std_vector_async(d_tokens_gpu, stream);
   auto const token_indices_gpu = cudf::detail::make_std_vector_async(d_token_indices_gpu, stream);
@@ -856,9 +857,9 @@ TEST_F(JsonTest, PostProcessTokenStream)
   auto const d_offsets = cudf::detail::make_device_uvector_async(
     cudf::host_span<token_index_t const>{offsets.data(), offsets.size()},
     stream,
-    rmm::mr::get_current_device_resource());
-  auto const d_tokens =
-    cudf::detail::make_device_uvector_async(tokens, stream, rmm::mr::get_current_device_resource());
+    cudf::get_current_device_resource_ref());
+  auto const d_tokens = cudf::detail::make_device_uvector_async(
+    tokens, stream, cudf::get_current_device_resource_ref());
 
   // Run system-under-test
   auto [d_filtered_tokens, d_filtered_indices] =
@@ -883,7 +884,7 @@ TEST_P(JsonDelimiterParamTest, UTF_JSON)
 {
   // Prepare cuda stream for data transfers & kernels
   auto const stream    = cudf::get_default_stream();
-  auto mr              = rmm::mr::get_current_device_resource();
+  auto mr              = cudf::get_current_device_resource_ref();
   auto json_parser     = cuio_json::detail::device_parse_nested_json;
   char const delimiter = GetParam();
 
@@ -904,7 +905,7 @@ TEST_P(JsonDelimiterParamTest, UTF_JSON)
   auto const d_ascii_pass = cudf::detail::make_device_uvector_sync(
     cudf::host_span<char const>{ascii_pass.c_str(), ascii_pass.size()},
     stream,
-    rmm::mr::get_current_device_resource());
+    cudf::get_current_device_resource_ref());
 
   CUDF_EXPECT_NO_THROW(json_parser(d_ascii_pass, default_options, stream, mr));
 
@@ -921,7 +922,7 @@ TEST_P(JsonDelimiterParamTest, UTF_JSON)
   auto const d_utf_failed = cudf::detail::make_device_uvector_sync(
     cudf::host_span<char const>{utf_failed.c_str(), utf_failed.size()},
     stream,
-    rmm::mr::get_current_device_resource());
+    cudf::get_current_device_resource_ref());
   CUDF_EXPECT_NO_THROW(json_parser(d_utf_failed, default_options, stream, mr));
 
   // utf-8 string that passes parsing.
@@ -938,7 +939,7 @@ TEST_P(JsonDelimiterParamTest, UTF_JSON)
   auto const d_utf_pass = cudf::detail::make_device_uvector_sync(
     cudf::host_span<char const>{utf_pass.c_str(), utf_pass.size()},
     stream,
-    rmm::mr::get_current_device_resource());
+    cudf::get_current_device_resource_ref());
   CUDF_EXPECT_NO_THROW(json_parser(d_utf_pass, default_options, stream, mr));
 }
 
@@ -949,7 +950,7 @@ TEST_F(JsonParserTest, ExtractColumnWithQuotes)
 
   // Prepare cuda stream for data transfers & kernels
   auto const stream = cudf::get_default_stream();
-  auto mr           = rmm::mr::get_current_device_resource();
+  auto mr           = cudf::get_current_device_resource_ref();
 
   // Default parsing options
   cudf::io::json_reader_options options{};
@@ -959,7 +960,7 @@ TEST_F(JsonParserTest, ExtractColumnWithQuotes)
   auto const d_input      = cudf::detail::make_device_uvector_async(
     cudf::host_span<char const>{input.c_str(), input.size()},
     stream,
-    rmm::mr::get_current_device_resource());
+    cudf::get_current_device_resource_ref());
   // Get the JSON's tree representation
   auto const cudf_table = json_parser(d_input, options, stream, mr);
 
@@ -982,7 +983,7 @@ TEST_F(JsonParserTest, ExpectFailMixStructAndList)
 
   // Prepare cuda stream for data transfers & kernels
   auto const stream = cudf::get_default_stream();
-  auto mr           = rmm::mr::get_current_device_resource();
+  auto mr           = cudf::get_current_device_resource_ref();
 
   // Default parsing options
   cudf::io::json_reader_options options{};
@@ -1002,7 +1003,7 @@ TEST_F(JsonParserTest, ExpectFailMixStructAndList)
     auto const d_input = cudf::detail::make_device_uvector_async(
       cudf::host_span<char const>{input.c_str(), input.size()},
       stream,
-      rmm::mr::get_current_device_resource());
+      cudf::get_current_device_resource_ref());
     EXPECT_THROW(auto const cudf_table = json_parser(d_input, options, stream, mr),
                  cudf::logic_error);
   }
@@ -1011,7 +1012,7 @@ TEST_F(JsonParserTest, ExpectFailMixStructAndList)
     auto const d_input = cudf::detail::make_device_uvector_async(
       cudf::host_span<char const>{input.c_str(), input.size()},
       stream,
-      rmm::mr::get_current_device_resource());
+      cudf::get_current_device_resource_ref());
     CUDF_EXPECT_NO_THROW(auto const cudf_table = json_parser(d_input, options, stream, mr));
   }
 }
@@ -1023,7 +1024,7 @@ TEST_F(JsonParserTest, EmptyString)
 
   // Prepare cuda stream for data transfers & kernels
   auto const stream = cudf::get_default_stream();
-  auto mr           = rmm::mr::get_current_device_resource();
+  auto mr           = cudf::get_current_device_resource_ref();
 
   // Default parsing options
   cudf::io::json_reader_options default_options{};
@@ -1032,7 +1033,7 @@ TEST_F(JsonParserTest, EmptyString)
   auto const d_input =
     cudf::detail::make_device_uvector_sync(cudf::host_span<char const>{input.c_str(), input.size()},
                                            stream,
-                                           rmm::mr::get_current_device_resource());
+                                           cudf::get_current_device_resource_ref());
   // Get the JSON's tree representation
   auto const cudf_table = json_parser(d_input, default_options, stream, mr);
 
@@ -1177,7 +1178,7 @@ TEST_P(JsonDelimiterParamTest, RecoveringTokenStreamNewlineAndDelimiter)
 
   // Parse the JSON and get the token stream
   auto [d_tokens_gpu, d_token_indices_gpu] = cuio_json::detail::get_token_stream(
-    d_input, default_options, stream, rmm::mr::get_current_device_resource());
+    d_input, default_options, stream, cudf::get_current_device_resource_ref());
   // Copy back the number of tokens that were written
   auto const tokens_gpu        = cudf::detail::make_std_vector_async(d_tokens_gpu, stream);
   auto const token_indices_gpu = cudf::detail::make_std_vector_async(d_token_indices_gpu, stream);
diff --git a/cpp/tests/io/orc_chunked_reader_test.cu b/cpp/tests/io/orc_chunked_reader_test.cu
index 2b78a5e7251..8ad1fea649d 100644
--- a/cpp/tests/io/orc_chunked_reader_test.cu
+++ b/cpp/tests/io/orc_chunked_reader_test.cu
@@ -37,6 +37,7 @@
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -79,7 +80,7 @@ auto write_file(std::vector<std::unique_ptr<cudf::column>>& input_columns,
         null_count,
         std::move(col),
         cudf::get_default_stream(),
-        rmm::mr::get_current_device_resource());
+        cudf::get_current_device_resource_ref());
 
       // Shift nulls of the next column by one position, to avoid having all nulls
       // in the same table rows.
@@ -121,7 +122,7 @@ auto chunked_read(std::string const& filepath,
 
   // TODO: remove this scope, when we get rid of mem stat in the reader.
   // This is to avoid use-after-free of memory resource created by the mem stat object.
-  auto mr = rmm::mr::get_current_device_resource();
+  auto mr = cudf::get_current_device_resource_ref();
 
   do {
     auto chunk = reader.read_chunk();
diff --git a/cpp/tests/io/parquet_chunked_reader_test.cu b/cpp/tests/io/parquet_chunked_reader_test.cu
index 66b36aeed63..153a8a0c5aa 100644
--- a/cpp/tests/io/parquet_chunked_reader_test.cu
+++ b/cpp/tests/io/parquet_chunked_reader_test.cu
@@ -38,6 +38,7 @@
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -80,7 +81,7 @@ auto write_file(std::vector<std::unique_ptr<cudf::column>>& input_columns,
         null_count,
         std::move(col),
         cudf::get_default_stream(),
-        rmm::mr::get_current_device_resource());
+        cudf::get_current_device_resource_ref());
 
       // Shift nulls of the next column by one position, to avoid having all nulls
       // in the same table rows.
diff --git a/cpp/tests/io/parquet_writer_test.cpp b/cpp/tests/io/parquet_writer_test.cpp
index e07ebe25322..c8100038942 100644
--- a/cpp/tests/io/parquet_writer_test.cpp
+++ b/cpp/tests/io/parquet_writer_test.cpp
@@ -26,6 +26,7 @@
 #include <cudf/io/parquet.hpp>
 #include <cudf/io/types.hpp>
 #include <cudf/unary.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <src/io/parquet/parquet.hpp>
 #include <src/io/parquet/parquet_common.hpp>
@@ -192,7 +193,7 @@ TEST_F(ParquetWriterTest, BufferSource)
       cudf::host_span<uint8_t const>{reinterpret_cast<uint8_t const*>(out_buffer.data()),
                                      out_buffer.size()},
       cudf::get_default_stream(),
-      rmm::mr::get_current_device_resource());
+      cudf::get_current_device_resource_ref());
     auto const d_buffer = cudf::device_span<std::byte const>(
       reinterpret_cast<std::byte const*>(d_input.data()), d_input.size());
     cudf::io::parquet_reader_options in_opts =
diff --git a/cpp/tests/io/type_inference_test.cu b/cpp/tests/io/type_inference_test.cu
index 37156292f44..b20f2024cb9 100644
--- a/cpp/tests/io/type_inference_test.cu
+++ b/cpp/tests/io/type_inference_test.cu
@@ -22,6 +22,7 @@
 
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
@@ -55,9 +56,9 @@ TEST_F(TypeInference, Basic)
   auto const string_offset   = std::vector<cudf::size_type>{1, 4, 7};
   auto const string_length   = std::vector<cudf::size_type>{2, 2, 1};
   auto const d_string_offset = cudf::detail::make_device_uvector_async(
-    string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    string_offset, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto const d_string_length = cudf::detail::make_device_uvector_async(
-    string_length, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    string_length, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
 
   auto d_col_strings =
     thrust::make_zip_iterator(thrust::make_tuple(d_string_offset.begin(), d_string_length.begin()));
@@ -88,9 +89,9 @@ TEST_F(TypeInference, Null)
   auto const string_offset   = std::vector<cudf::size_type>{1, 1, 4};
   auto const string_length   = std::vector<cudf::size_type>{0, 2, 1};
   auto const d_string_offset = cudf::detail::make_device_uvector_async(
-    string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    string_offset, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto const d_string_length = cudf::detail::make_device_uvector_async(
-    string_length, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    string_length, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
 
   auto d_col_strings =
     thrust::make_zip_iterator(thrust::make_tuple(d_string_offset.begin(), d_string_length.begin()));
@@ -121,9 +122,9 @@ TEST_F(TypeInference, AllNull)
   auto const string_offset   = std::vector<cudf::size_type>{1, 1, 1};
   auto const string_length   = std::vector<cudf::size_type>{0, 0, 4};
   auto const d_string_offset = cudf::detail::make_device_uvector_async(
-    string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    string_offset, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto const d_string_length = cudf::detail::make_device_uvector_async(
-    string_length, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    string_length, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
 
   auto d_col_strings =
     thrust::make_zip_iterator(thrust::make_tuple(d_string_offset.begin(), d_string_length.begin()));
@@ -154,9 +155,9 @@ TEST_F(TypeInference, String)
   auto const string_offset   = std::vector<cudf::size_type>{1, 8, 12};
   auto const string_length   = std::vector<cudf::size_type>{6, 3, 4};
   auto const d_string_offset = cudf::detail::make_device_uvector_async(
-    string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    string_offset, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto const d_string_length = cudf::detail::make_device_uvector_async(
-    string_length, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    string_length, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
 
   auto d_col_strings =
     thrust::make_zip_iterator(thrust::make_tuple(d_string_offset.begin(), d_string_length.begin()));
@@ -187,9 +188,9 @@ TEST_F(TypeInference, Bool)
   auto const string_offset   = std::vector<cudf::size_type>{1, 6, 12};
   auto const string_length   = std::vector<cudf::size_type>{4, 5, 5};
   auto const d_string_offset = cudf::detail::make_device_uvector_async(
-    string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    string_offset, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto const d_string_length = cudf::detail::make_device_uvector_async(
-    string_length, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    string_length, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
 
   auto d_col_strings =
     thrust::make_zip_iterator(thrust::make_tuple(d_string_offset.begin(), d_string_length.begin()));
@@ -220,9 +221,9 @@ TEST_F(TypeInference, Timestamp)
   auto const string_offset   = std::vector<cudf::size_type>{1, 10};
   auto const string_length   = std::vector<cudf::size_type>{8, 9};
   auto const d_string_offset = cudf::detail::make_device_uvector_async(
-    string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    string_offset, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto const d_string_length = cudf::detail::make_device_uvector_async(
-    string_length, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    string_length, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
 
   auto d_col_strings =
     thrust::make_zip_iterator(thrust::make_tuple(d_string_offset.begin(), d_string_length.begin()));
@@ -254,9 +255,9 @@ TEST_F(TypeInference, InvalidInput)
   auto const string_offset   = std::vector<cudf::size_type>{1, 3, 5, 7, 9};
   auto const string_length   = std::vector<cudf::size_type>{1, 1, 1, 1, 1};
   auto const d_string_offset = cudf::detail::make_device_uvector_async(
-    string_offset, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    string_offset, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto const d_string_length = cudf::detail::make_device_uvector_async(
-    string_length, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    string_length, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
 
   auto d_col_strings =
     thrust::make_zip_iterator(thrust::make_tuple(d_string_offset.begin(), d_string_length.begin()));
diff --git a/cpp/tests/iterator/iterator_tests.cuh b/cpp/tests/iterator/iterator_tests.cuh
index c6da6b75930..5c9f6114eb5 100644
--- a/cpp/tests/iterator/iterator_tests.cuh
+++ b/cpp/tests/iterator/iterator_tests.cuh
@@ -22,6 +22,7 @@
 #include <cudf/detail/utilities/transform_unary_functions.cuh>  // for meanvar
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
@@ -87,7 +88,7 @@ struct IteratorTest : public cudf::test::BaseFixture {
     InputIterator d_in_last = d_in + num_items;
     EXPECT_EQ(thrust::distance(d_in, d_in_last), num_items);
     auto dev_expected = cudf::detail::make_device_uvector_sync(
-      expected, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+      expected, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
 
     // using a temporary vector and calling transform and all_of separately is
     // equivalent to thrust::equal but compiles ~3x faster
diff --git a/cpp/tests/iterator/value_iterator_test.cuh b/cpp/tests/iterator/value_iterator_test.cuh
index 8252ce88f39..a479a263b09 100644
--- a/cpp/tests/iterator/value_iterator_test.cuh
+++ b/cpp/tests/iterator/value_iterator_test.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,6 +17,7 @@
 #include <tests/iterator/iterator_tests.cuh>
 
 #include <cudf/detail/utilities/vector_factories.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <thrust/host_vector.h>
 
@@ -26,7 +27,7 @@ void non_null_iterator(IteratorTest<T>& testFixture)
 {
   auto host_array = cudf::test::make_type_param_vector<T>({0, 6, 0, -14, 13, 64, -13, -20, 45});
   auto dev_array  = cudf::detail::make_device_uvector_sync(
-    host_array, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    host_array, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
 
   // calculate the expected value by CPU.
   thrust::host_vector<T> replaced_array(host_array);
diff --git a/cpp/tests/iterator/value_iterator_test_strings.cu b/cpp/tests/iterator/value_iterator_test_strings.cu
index 10bb3f21ee1..a965c65aef0 100644
--- a/cpp/tests/iterator/value_iterator_test_strings.cu
+++ b/cpp/tests/iterator/value_iterator_test_strings.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,6 +15,7 @@
 #include "iterator_tests.cuh"
 
 #include <cudf/detail/utilities/vector_factories.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
@@ -31,7 +32,7 @@ auto strings_to_string_views(std::vector<std::string>& input_strings)
   std::tie(chars, offsets) = cudf::test::detail::make_chars_and_offsets(
     input_strings.begin(), input_strings.end(), all_valid);
   auto dev_chars = cudf::detail::make_device_uvector_sync(
-    chars, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    chars, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
 
   // calculate the expected value by CPU. (but contains device pointers)
   thrust::host_vector<cudf::string_view> replaced_array(input_strings.size());
@@ -52,7 +53,7 @@ TEST_F(StringIteratorTest, string_view_null_iterator)
   std::string zero("zero");
   // the char data has to be in GPU
   auto initmsg = cudf::detail::make_device_uvector_sync(
-    zero, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    zero, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   T init = T{initmsg.data(), int(initmsg.size())};
 
   // data and valid arrays
@@ -88,7 +89,7 @@ TEST_F(StringIteratorTest, string_view_no_null_iterator)
   std::string zero("zero");
   // the char data has to be in GPU
   auto initmsg = cudf::detail::make_device_uvector_sync(
-    zero, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    zero, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   T init = T{initmsg.data(), int(initmsg.size())};
 
   // data array
@@ -113,7 +114,7 @@ TEST_F(StringIteratorTest, string_scalar_iterator)
   std::string zero("zero");
   // the char data has to be in GPU
   auto initmsg = cudf::detail::make_device_uvector_sync(
-    zero, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    zero, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   T init = T{initmsg.data(), int(initmsg.size())};
 
   // data array
diff --git a/cpp/tests/join/distinct_join_tests.cpp b/cpp/tests/join/distinct_join_tests.cpp
index 05ae4ea1d04..93754091b3f 100644
--- a/cpp/tests/join/distinct_join_tests.cpp
+++ b/cpp/tests/join/distinct_join_tests.cpp
@@ -29,6 +29,7 @@
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <limits>
 #include <vector>
@@ -44,7 +45,7 @@ std::unique_ptr<rmm::device_uvector<cudf::size_type>> get_left_indices(cudf::siz
   auto sequence = std::vector<cudf::size_type>(size);
   std::iota(sequence.begin(), sequence.end(), 0);
   auto indices = cudf::detail::make_device_uvector_sync(
-    sequence, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    sequence, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   return std::make_unique<rmm::device_uvector<cudf::size_type>>(std::move(indices));
 }
 
diff --git a/cpp/tests/join/join_tests.cpp b/cpp/tests/join/join_tests.cpp
index 4e88414d553..ab387a5c7f5 100644
--- a/cpp/tests/join/join_tests.cpp
+++ b/cpp/tests/join/join_tests.cpp
@@ -37,8 +37,7 @@
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
-
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <limits>
 
@@ -69,7 +68,7 @@ std::unique_ptr<cudf::table> join_and_gather(
   std::vector<cudf::size_type> const& left_on,
   std::vector<cudf::size_type> const& right_on,
   cudf::null_equality compare_nulls,
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref())
 {
   auto left_selected  = left_input.select(left_on);
   auto right_selected = right_input.select(right_on);
@@ -2028,7 +2027,7 @@ struct JoinTestLists : public cudf::test::BaseFixture {
     auto const probe_tv = cudf::table_view{{probe}};
 
     auto const [left_result_map, right_result_map] =
-      join_func(build_tv, probe_tv, nulls_equal, rmm::mr::get_current_device_resource());
+      join_func(build_tv, probe_tv, nulls_equal, cudf::get_current_device_resource_ref());
 
     auto const left_result_table =
       sort_and_gather(build_tv, column_view_from_device_uvector(*left_result_map), oob_policy);
diff --git a/cpp/tests/join/semi_anti_join_tests.cpp b/cpp/tests/join/semi_anti_join_tests.cpp
index de3d8bdaa23..3e279260b99 100644
--- a/cpp/tests/join/semi_anti_join_tests.cpp
+++ b/cpp/tests/join/semi_anti_join_tests.cpp
@@ -28,8 +28,7 @@
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
-
-#include <rmm/resource_ref.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <thrust/iterator/transform_iterator.h>
 
@@ -59,7 +58,7 @@ std::unique_ptr<cudf::table> join_and_gather(
   std::vector<cudf::size_type> const& left_on,
   std::vector<cudf::size_type> const& right_on,
   cudf::null_equality compare_nulls,
-  rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref())
 {
   auto left_selected      = left_input.select(left_on);
   auto right_selected     = right_input.select(right_on);
diff --git a/cpp/tests/large_strings/json_tests.cu b/cpp/tests/large_strings/json_tests.cu
index e34ab991c11..80bde168b75 100644
--- a/cpp/tests/large_strings/json_tests.cu
+++ b/cpp/tests/large_strings/json_tests.cu
@@ -22,6 +22,7 @@
 #include <cudf/concatenate.hpp>
 #include <cudf/io/datasource.hpp>
 #include <cudf/io/json.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 struct JsonLargeReaderTest : public cudf::test::StringsLargeTest {};
@@ -81,7 +82,7 @@ TEST_F(JsonLargeReaderTest, MultiBatch)
                                              json_lines_options,
                                              chunk_size,
                                              cudf::get_default_stream(),
-                                             rmm::mr::get_current_device_resource());
+                                             cudf::get_current_device_resource_ref());
 
     auto table_views = std::vector<cudf::table_view>(tables.size());
     std::transform(tables.begin(), tables.end(), table_views.begin(), [](auto& table) {
diff --git a/cpp/tests/large_strings/large_strings_fixture.cpp b/cpp/tests/large_strings/large_strings_fixture.cpp
index ac8159369a1..249319da7f7 100644
--- a/cpp/tests/large_strings/large_strings_fixture.cpp
+++ b/cpp/tests/large_strings/large_strings_fixture.cpp
@@ -126,7 +126,7 @@ int main(int argc, char** argv)
   auto const cmd_opts = parse_cudf_test_opts(argc, argv);
   // hardcoding the CUDA memory resource to keep from exceeding the pool
   auto mr = cudf::test::make_cuda();
-  rmm::mr::set_current_device_resource(mr.get());
+  cudf::set_current_device_resource(mr.get());
   auto adaptor = make_stream_mode_adaptor(cmd_opts);
 
   // create object to automatically be destroyed at the end of main()
diff --git a/cpp/tests/partitioning/hash_partition_test.cpp b/cpp/tests/partitioning/hash_partition_test.cpp
index 24dadf9b520..579d918a31d 100644
--- a/cpp/tests/partitioning/hash_partition_test.cpp
+++ b/cpp/tests/partitioning/hash_partition_test.cpp
@@ -26,6 +26,7 @@
 #include <cudf/partitioning.hpp>
 #include <cudf/sorting.hpp>
 #include <cudf/table/table.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
@@ -290,7 +291,7 @@ void run_fixed_width_test(size_t cols,
   // Make a table view of the partition numbers
   constexpr cudf::data_type dtype{cudf::type_id::INT32};
   auto d_partitions = cudf::detail::make_device_uvector_sync(
-    partitions, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    partitions, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   cudf::column_view partitions_col(dtype, rows, d_partitions.data(), nullptr, 0);
   cudf::table_view partitions_table({partitions_col});
 
diff --git a/cpp/tests/quantiles/percentile_approx_test.cpp b/cpp/tests/quantiles/percentile_approx_test.cpp
index 06c6b9dfbe4..915717713df 100644
--- a/cpp/tests/quantiles/percentile_approx_test.cpp
+++ b/cpp/tests/quantiles/percentile_approx_test.cpp
@@ -29,6 +29,7 @@
 #include <cudf/transform.hpp>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <arrow/util/tdigest.h>
 
@@ -371,7 +372,7 @@ struct PercentileApproxTest : public cudf::test::BaseFixture {};
 TEST_F(PercentileApproxTest, EmptyInput)
 {
   auto empty_ = cudf::tdigest::detail::make_empty_tdigest_column(
-    cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   cudf::test::fixed_width_column_wrapper<double> percentiles{0.0, 0.25, 0.3};
 
   std::vector<cudf::column_view> input;
diff --git a/cpp/tests/reductions/segmented_reduction_tests.cpp b/cpp/tests/reductions/segmented_reduction_tests.cpp
index 668690639a6..19996f827cf 100644
--- a/cpp/tests/reductions/segmented_reduction_tests.cpp
+++ b/cpp/tests/reductions/segmented_reduction_tests.cpp
@@ -23,6 +23,7 @@
 #include <cudf/reduction.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <limits>
 #include <utility>
@@ -49,7 +50,7 @@ TYPED_TEST(SegmentedReductionTest, SumExcludeNulls)
     {1, 2, 3, 1, XXX, 3, 1, XXX, XXX, XXX}, {1, 1, 1, 1, 0, 1, 1, 0, 0, 0}};
   auto const offsets   = std::vector<cudf::size_type>{0, 3, 6, 7, 8, 10, 10};
   auto const d_offsets = cudf::detail::make_device_uvector_async(
-    offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    offsets, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto const expect =
     cudf::test::fixed_width_column_wrapper<TypeParam>{{6, 4, 1, XXX, XXX, XXX}, {1, 1, 1, 0, 0, 0}};
 
@@ -97,7 +98,7 @@ TYPED_TEST(SegmentedReductionTest, ProductExcludeNulls)
     {1, 3, 5, XXX, 3, 5, 1, XXX, XXX, XXX}, {1, 1, 1, 0, 1, 1, 1, 0, 0, 0}};
   auto const offsets   = std::vector<cudf::size_type>{0, 3, 6, 7, 8, 10, 10};
   auto const d_offsets = cudf::detail::make_device_uvector_async(
-    offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    offsets, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto const expect = cudf::test::fixed_width_column_wrapper<TypeParam>{{15, 15, 1, XXX, XXX, XXX},
                                                                         {1, 1, 1, 0, 0, 0}};
 
@@ -147,7 +148,7 @@ TYPED_TEST(SegmentedReductionTest, MaxExcludeNulls)
     {1, 2, 3, 1, XXX, 3, 1, XXX, XXX, XXX}, {1, 1, 1, 1, 0, 1, 1, 0, 0, 0}};
   auto const offsets   = std::vector<cudf::size_type>{0, 3, 6, 7, 8, 10, 10};
   auto const d_offsets = cudf::detail::make_device_uvector_async(
-    offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    offsets, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto const expect =
     cudf::test::fixed_width_column_wrapper<TypeParam>{{3, 3, 1, XXX, XXX, XXX}, {1, 1, 1, 0, 0, 0}};
 
@@ -195,7 +196,7 @@ TYPED_TEST(SegmentedReductionTest, MinExcludeNulls)
     {1, 2, 3, 1, XXX, 3, 1, XXX, XXX, XXX}, {1, 1, 1, 1, 0, 1, 1, 0, 0, 0}};
   auto const offsets   = std::vector<cudf::size_type>{0, 3, 6, 7, 8, 10, 10};
   auto const d_offsets = cudf::detail::make_device_uvector_async(
-    offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    offsets, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto const expect =
     cudf::test::fixed_width_column_wrapper<TypeParam>{{1, 1, 1, XXX, XXX, XXX}, {1, 1, 1, 0, 0, 0}};
 
@@ -244,7 +245,7 @@ TYPED_TEST(SegmentedReductionTest, AnyExcludeNulls)
     {1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0}};
   auto const offsets   = std::vector<cudf::size_type>{0, 3, 6, 9, 12, 12, 13, 14, 15, 17};
   auto const d_offsets = cudf::detail::make_device_uvector_async(
-    offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    offsets, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto const expect = cudf::test::fixed_width_column_wrapper<bool>{
     {false, false, true, true, bool{XXX}, false, true, bool{XXX}, bool{XXX}},
     {true, true, true, true, false, true, true, false, false}};
@@ -284,7 +285,7 @@ TYPED_TEST(SegmentedReductionTest, AllExcludeNulls)
     {1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1}};
   auto const offsets   = std::vector<cudf::size_type>{0, 3, 6, 6, 7, 8, 10, 13, 16, 17};
   auto const d_offsets = cudf::detail::make_device_uvector_async(
-    offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    offsets, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto const expect = cudf::test::fixed_width_column_wrapper<bool>{
     {true, true, bool{XXX}, true, bool{XXX}, bool{XXX}, false, false, false},
     {true, true, false, true, false, false, true, true, true}};
@@ -335,7 +336,7 @@ TYPED_TEST(SegmentedReductionTest, SumIncludeNulls)
     {1, 2, 3, 1, XXX, 3, 1, XXX, XXX, XXX}, {1, 1, 1, 1, 0, 1, 1, 0, 0, 0}};
   auto const offsets   = std::vector<cudf::size_type>{0, 3, 6, 7, 8, 10, 10};
   auto const d_offsets = cudf::detail::make_device_uvector_async(
-    offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    offsets, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto const expect = cudf::test::fixed_width_column_wrapper<TypeParam>{{6, XXX, 1, XXX, XXX, XXX},
                                                                         {1, 0, 1, 0, 0, 0}};
 
@@ -386,7 +387,7 @@ TYPED_TEST(SegmentedReductionTest, ProductIncludeNulls)
     {1, 3, 5, XXX, 3, 5, 1, XXX, XXX, XXX}, {1, 1, 1, 0, 1, 1, 1, 0, 0, 0}};
   auto const offsets   = std::vector<cudf::size_type>{0, 3, 6, 7, 8, 10, 10};
   auto const d_offsets = cudf::detail::make_device_uvector_async(
-    offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    offsets, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto const expect = cudf::test::fixed_width_column_wrapper<TypeParam>{{15, XXX, 1, XXX, XXX, XXX},
                                                                         {1, 0, 1, 0, 0, 0}};
 
@@ -439,7 +440,7 @@ TYPED_TEST(SegmentedReductionTest, MaxIncludeNulls)
     {1, 2, 3, 1, XXX, 3, 1, XXX, XXX, XXX}, {1, 1, 1, 1, 0, 1, 1, 0, 0, 0}};
   auto const offsets   = std::vector<cudf::size_type>{0, 3, 6, 7, 8, 10, 10};
   auto const d_offsets = cudf::detail::make_device_uvector_async(
-    offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    offsets, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto const expect = cudf::test::fixed_width_column_wrapper<TypeParam>{{3, XXX, 1, XXX, XXX, XXX},
                                                                         {1, 0, 1, 0, 0, 0}};
 
@@ -490,7 +491,7 @@ TYPED_TEST(SegmentedReductionTest, MinIncludeNulls)
     {1, 2, 3, 1, XXX, 3, 1, XXX, XXX, XXX}, {1, 1, 1, 1, 0, 1, 1, 0, 0}};
   auto const offsets   = std::vector<cudf::size_type>{0, 3, 6, 7, 8, 10, 10};
   auto const d_offsets = cudf::detail::make_device_uvector_async(
-    offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    offsets, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto const expect = cudf::test::fixed_width_column_wrapper<TypeParam>{{1, XXX, 1, XXX, XXX, XXX},
                                                                         {1, 0, 1, 0, 0, 0}};
 
@@ -542,7 +543,7 @@ TYPED_TEST(SegmentedReductionTest, AnyIncludeNulls)
     {1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0}};
   auto const offsets   = std::vector<cudf::size_type>{0, 3, 6, 9, 12, 12, 13, 14, 15, 17};
   auto const d_offsets = cudf::detail::make_device_uvector_async(
-    offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    offsets, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto const expect = cudf::test::fixed_width_column_wrapper<bool>{
     {false, bool{XXX}, true, bool{XXX}, bool{XXX}, false, true, bool{XXX}, bool{XXX}},
     {true, false, true, false, false, true, true, false, false}};
@@ -605,7 +606,7 @@ TYPED_TEST(SegmentedReductionTest, AllIncludeNulls)
     {1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1}};
   auto const offsets   = std::vector<cudf::size_type>{0, 3, 6, 6, 7, 8, 10, 13, 16, 17};
   auto const d_offsets = cudf::detail::make_device_uvector_async(
-    offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    offsets, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto const expect = cudf::test::fixed_width_column_wrapper<bool>{
     {true, bool{XXX}, bool{XXX}, true, bool{XXX}, bool{XXX}, false, bool{XXX}, false},
     {true, false, false, true, false, false, true, false, true}};
@@ -670,7 +671,7 @@ TEST_F(SegmentedReductionTestUntyped, PartialSegmentReduction)
     {1, 2, 3, 4, 5, 6, 7}, {true, true, true, true, true, true, true}};
   auto const offsets   = std::vector<cudf::size_type>{1, 3, 4};
   auto const d_offsets = cudf::detail::make_device_uvector_async(
-    offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    offsets, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto const expect = cudf::test::fixed_width_column_wrapper<int32_t>{{5, 4}, {true, true}};
 
   auto res =
@@ -721,7 +722,7 @@ TEST_F(SegmentedReductionTestUntyped, NonNullableInput)
   auto const input     = cudf::test::fixed_width_column_wrapper<int32_t>{1, 2, 3, 4, 5, 6, 7};
   auto const offsets   = std::vector<cudf::size_type>{0, 1, 1, 3, 7};
   auto const d_offsets = cudf::detail::make_device_uvector_async(
-    offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    offsets, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto const expect =
     cudf::test::fixed_width_column_wrapper<int32_t>{{1, XXX, 5, 22}, {true, false, true, true}};
 
@@ -767,7 +768,7 @@ TEST_F(SegmentedReductionTestUntyped, Mean)
     cudf::test::fixed_width_column_wrapper<int32_t>{10, 20, 30, 40, 50, 60, 70, 80, 90};
   auto const offsets   = std::vector<cudf::size_type>{0, 1, 1, 4, 9};
   auto const d_offsets = cudf::detail::make_device_uvector_async(
-    offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    offsets, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto const agg         = cudf::make_mean_aggregation<cudf::segmented_reduce_aggregation>();
   auto const output_type = cudf::data_type{cudf::type_id::FLOAT32};
 
@@ -786,7 +787,7 @@ TEST_F(SegmentedReductionTestUntyped, MeanNulls)
     {10, 20, 30, 40, 50, 60, 0, 80, 90}, {true, true, true, true, true, true, false, true, true});
   auto const offsets   = std::vector<cudf::size_type>{0, 1, 1, 4, 9};
   auto const d_offsets = cudf::detail::make_device_uvector_async(
-    offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    offsets, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto const agg         = cudf::make_mean_aggregation<cudf::segmented_reduce_aggregation>();
   auto const output_type = cudf::data_type{cudf::type_id::FLOAT64};
 
@@ -808,7 +809,7 @@ TEST_F(SegmentedReductionTestUntyped, SumOfSquares)
     cudf::test::fixed_width_column_wrapper<int32_t>{10, 20, 30, 40, 50, 60, 70, 80, 90};
   auto const offsets   = std::vector<cudf::size_type>{0, 1, 1, 4, 9};
   auto const d_offsets = cudf::detail::make_device_uvector_async(
-    offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    offsets, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto const agg = cudf::make_sum_of_squares_aggregation<cudf::segmented_reduce_aggregation>();
   auto const output_type = cudf::data_type{cudf::type_id::INT32};
 
@@ -828,7 +829,7 @@ TEST_F(SegmentedReductionTestUntyped, SumOfSquaresNulls)
     {10, 20, 30, 40, 50, 60, 0, 80, 90}, {true, true, true, true, true, true, false, true, true});
   auto const offsets   = std::vector<cudf::size_type>{0, 1, 1, 4, 9};
   auto const d_offsets = cudf::detail::make_device_uvector_async(
-    offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    offsets, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto const agg = cudf::make_sum_of_squares_aggregation<cudf::segmented_reduce_aggregation>();
   auto const output_type = cudf::data_type{cudf::type_id::INT64};
 
@@ -851,7 +852,7 @@ TEST_F(SegmentedReductionTestUntyped, StandardDeviation)
     cudf::test::fixed_width_column_wrapper<int32_t>{10, 20, 30, 40, 50, 60, 70, 80, 90};
   auto const offsets   = std::vector<cudf::size_type>{0, 1, 1, 4, 9};
   auto const d_offsets = cudf::detail::make_device_uvector_async(
-    offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    offsets, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto const agg         = cudf::make_std_aggregation<cudf::segmented_reduce_aggregation>();
   auto const output_type = cudf::data_type{cudf::type_id::FLOAT32};
 
@@ -871,7 +872,7 @@ TEST_F(SegmentedReductionTestUntyped, StandardDeviationNulls)
     {10, 0, 20, 30, 54, 63, 0, 72, 81}, {true, false, true, true, true, true, false, true, true});
   auto const offsets   = std::vector<cudf::size_type>{0, 1, 1, 4, 9};
   auto const d_offsets = cudf::detail::make_device_uvector_async(
-    offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    offsets, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto const agg         = cudf::make_std_aggregation<cudf::segmented_reduce_aggregation>();
   auto const output_type = cudf::data_type{cudf::type_id::FLOAT64};
 
@@ -894,7 +895,7 @@ TEST_F(SegmentedReductionTestUntyped, Variance)
     cudf::test::fixed_width_column_wrapper<int32_t>{10, 20, 30, 40, 50, 60, 70, 80, 90};
   auto const offsets   = std::vector<cudf::size_type>{0, 1, 1, 4, 9};
   auto const d_offsets = cudf::detail::make_device_uvector_async(
-    offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    offsets, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto const agg         = cudf::make_variance_aggregation<cudf::segmented_reduce_aggregation>();
   auto const output_type = cudf::data_type{cudf::type_id::FLOAT32};
 
@@ -914,7 +915,7 @@ TEST_F(SegmentedReductionTestUntyped, VarianceNulls)
     {10, 0, 20, 30, 54, 63, 0, 72, 81}, {true, false, true, true, true, true, false, true, true});
   auto const offsets   = std::vector<cudf::size_type>{0, 1, 1, 4, 9};
   auto const d_offsets = cudf::detail::make_device_uvector_async(
-    offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    offsets, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto const agg         = cudf::make_variance_aggregation<cudf::segmented_reduce_aggregation>();
   auto const output_type = cudf::data_type{cudf::type_id::FLOAT64};
 
@@ -936,7 +937,7 @@ TEST_F(SegmentedReductionTestUntyped, NUnique)
     cudf::test::fixed_width_column_wrapper<int32_t>({10, 15, 20, 30, 60, 60, 70, 70, 80});
   auto const offsets   = std::vector<cudf::size_type>{0, 1, 1, 2, 4, 9};
   auto const d_offsets = cudf::detail::make_device_uvector_async(
-    offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    offsets, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto const agg         = cudf::make_nunique_aggregation<cudf::segmented_reduce_aggregation>();
   auto const output_type = cudf::data_type{cudf::type_id::INT32};
 
@@ -956,7 +957,7 @@ TEST_F(SegmentedReductionTestUntyped, NUniqueNulls)
     {10, 0, 20, 30, 60, 60, 70, 70, 0}, {true, false, true, true, true, true, true, true, false});
   auto const offsets   = std::vector<cudf::size_type>{0, 1, 1, 2, 4, 9};
   auto const d_offsets = cudf::detail::make_device_uvector_async(
-    offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    offsets, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto const agg         = cudf::make_nunique_aggregation<cudf::segmented_reduce_aggregation>();
   auto const output_type = cudf::data_type{cudf::type_id::INT32};
 
@@ -978,7 +979,7 @@ TEST_F(SegmentedReductionTestUntyped, Errors)
     {10, 0, 20, 30, 54, 63, 0, 72, 81}, {true, false, true, true, true, true, false, true, true});
   auto const offsets   = std::vector<cudf::size_type>{0, 1, 1, 4, 9};
   auto const d_offsets = cudf::detail::make_device_uvector_async(
-    offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    offsets, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto const null_policy = cudf::null_policy::EXCLUDE;
   auto const output_type = cudf::data_type{cudf::type_id::TIMESTAMP_DAYS};
   auto const str_input =
@@ -1047,7 +1048,7 @@ TEST_F(SegmentedReductionTestUntyped, ReduceEmptyColumn)
   auto const input     = cudf::test::fixed_width_column_wrapper<int32_t>{};
   auto const offsets   = std::vector<cudf::size_type>{0};
   auto const d_offsets = cudf::detail::make_device_uvector_async(
-    offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    offsets, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto const expect = cudf::test::fixed_width_column_wrapper<int32_t>{};
 
   auto res =
@@ -1084,7 +1085,7 @@ TEST_F(SegmentedReductionTestUntyped, EmptyInputWithOffsets)
   auto const input     = cudf::test::fixed_width_column_wrapper<int32_t>{};
   auto const offsets   = std::vector<cudf::size_type>{0, 0, 0, 0, 0, 0};
   auto const d_offsets = cudf::detail::make_device_uvector_async(
-    offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    offsets, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto const expect = cudf::test::fixed_width_column_wrapper<int32_t>{
     {XXX, XXX, XXX, XXX, XXX}, {false, false, false, false, false}};
 
@@ -1133,7 +1134,7 @@ TYPED_TEST(SegmentedReductionFixedPointTest, MaxWithNulls)
 
   auto const offsets   = std::vector<cudf::size_type>{0, 3, 6, 7, 8, 10, 10};
   auto const d_offsets = cudf::detail::make_device_uvector_async(
-    offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    offsets, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto const agg = cudf::make_max_aggregation<cudf::segmented_reduce_aggregation>();
 
   for (auto scale : {-2, 0, 5}) {
@@ -1161,7 +1162,7 @@ TYPED_TEST(SegmentedReductionFixedPointTest, MinWithNulls)
 
   auto const offsets   = std::vector<cudf::size_type>{0, 3, 6, 7, 8, 10, 10};
   auto const d_offsets = cudf::detail::make_device_uvector_async(
-    offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    offsets, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto const agg = cudf::make_min_aggregation<cudf::segmented_reduce_aggregation>();
 
   for (auto scale : {-2, 0, 5}) {
@@ -1189,7 +1190,7 @@ TYPED_TEST(SegmentedReductionFixedPointTest, MaxNonNullableInput)
 
   auto const offsets   = std::vector<cudf::size_type>{0, 3, 4, 4};
   auto const d_offsets = cudf::detail::make_device_uvector_async(
-    offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    offsets, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto const agg = cudf::make_max_aggregation<cudf::segmented_reduce_aggregation>();
 
   for (auto scale : {-2, 0, 5}) {
@@ -1214,7 +1215,7 @@ TYPED_TEST(SegmentedReductionFixedPointTest, MinNonNullableInput)
 
   auto const offsets   = std::vector<cudf::size_type>{0, 3, 4, 4};
   auto const d_offsets = cudf::detail::make_device_uvector_async(
-    offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    offsets, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto const agg = cudf::make_min_aggregation<cudf::segmented_reduce_aggregation>();
 
   for (auto scale : {-2, 0, 5}) {
@@ -1239,7 +1240,7 @@ TYPED_TEST(SegmentedReductionFixedPointTest, Sum)
 
   auto const offsets   = std::vector<cudf::size_type>{0, 3, 6, 7, 8, 10, 10};
   auto const d_offsets = cudf::detail::make_device_uvector_async(
-    offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    offsets, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto const agg = cudf::make_sum_aggregation<cudf::segmented_reduce_aggregation>();
 
   for (auto scale : {-2, 0, 5}) {
@@ -1277,7 +1278,7 @@ TYPED_TEST(SegmentedReductionFixedPointTest, Product)
 
   auto const offsets   = std::vector<cudf::size_type>{0, 3, 6, 7, 8, 12, 12};
   auto const d_offsets = cudf::detail::make_device_uvector_async(
-    offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    offsets, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto const agg = cudf::make_product_aggregation<cudf::segmented_reduce_aggregation>();
 
   for (auto scale : {-2, 0, 5}) {
@@ -1314,7 +1315,7 @@ TYPED_TEST(SegmentedReductionFixedPointTest, SumOfSquares)
 
   auto const offsets   = std::vector<cudf::size_type>{0, 3, 6, 7, 8, 10, 10};
   auto const d_offsets = cudf::detail::make_device_uvector_async(
-    offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    offsets, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto const agg = cudf::make_sum_of_squares_aggregation<cudf::segmented_reduce_aggregation>();
 
   for (auto scale : {-2, 0, 5}) {
@@ -1478,7 +1479,7 @@ TEST_F(SegmentedReductionStringTest, EmptyInputWithOffsets)
   auto const input     = cudf::test::strings_column_wrapper{};
   auto const offsets   = std::vector<cudf::size_type>{0, 0, 0, 0};
   auto const d_offsets = cudf::detail::make_device_uvector_async(
-    offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    offsets, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto const expect = cudf::test::strings_column_wrapper({XXX, XXX, XXX}, {false, false, false});
 
   auto result =
diff --git a/cpp/tests/scalar/scalar_device_view_test.cu b/cpp/tests/scalar/scalar_device_view_test.cu
index 5026954403b..2232aefefcd 100644
--- a/cpp/tests/scalar/scalar_device_view_test.cu
+++ b/cpp/tests/scalar/scalar_device_view_test.cu
@@ -25,6 +25,7 @@
 #include <cudf/strings/string_view.cuh>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <thrust/sequence.h>
@@ -131,7 +132,7 @@ TEST_F(StringScalarDeviceViewTest, Value)
   auto scalar_device_view = cudf::get_scalar_device_view(s);
   rmm::device_scalar<bool> result{cudf::get_default_stream()};
   auto value_v = cudf::detail::make_device_uvector_sync(
-    value, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    value, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
 
   test_string_value<<<1, 1, 0, cudf::get_default_stream().value()>>>(
     scalar_device_view, value_v.data(), value.size(), result.data());
diff --git a/cpp/tests/sort/segmented_sort_tests.cpp b/cpp/tests/sort/segmented_sort_tests.cpp
index f4fe2c5956a..79421a1fa30 100644
--- a/cpp/tests/sort/segmented_sort_tests.cpp
+++ b/cpp/tests/sort/segmented_sort_tests.cpp
@@ -23,6 +23,7 @@
 #include <cudf/copying.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/sorting.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <type_traits>
@@ -350,7 +351,7 @@ TEST_F(SegmentedSortInt, UnbalancedOffsets)
   std::fill_n(h_input.begin(), 4, 0);
   std::fill(h_input.begin() + 3533, h_input.end(), 10000);
   auto d_input = cudf::detail::make_device_uvector_sync(
-    h_input, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    h_input, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto input    = cudf::column_view(cudf::device_span<int64_t const>(d_input));
   auto segments = cudf::test::fixed_width_column_wrapper<int32_t>({0, 4, 3533, 3535});
   // full sort should match handcrafted input data here
diff --git a/cpp/tests/streams/reduction_test.cpp b/cpp/tests/streams/reduction_test.cpp
index e6438ac2834..b4f013fc960 100644
--- a/cpp/tests/streams/reduction_test.cpp
+++ b/cpp/tests/streams/reduction_test.cpp
@@ -23,6 +23,7 @@
 #include <cudf/reduction.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/scalar/scalar_factories.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 class ReductionTest : public cudf::test::BaseFixture {};
 
@@ -53,7 +54,7 @@ TEST_F(ReductionTest, SegmentedReductionSum)
     {true, true, true, true, false, true, true, false, false, false}};
   auto const offsets   = std::vector<cudf::size_type>{0, 3, 6, 7, 8, 10, 10};
   auto const d_offsets = cudf::detail::make_device_uvector_async(
-    offsets, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource());
+    offsets, cudf::test::get_default_stream(), cudf::get_current_device_resource_ref());
 
   auto res =
     cudf::segmented_reduce(input,
@@ -71,7 +72,7 @@ TEST_F(ReductionTest, SegmentedReductionSumScalarInit)
     {true, true, true, true, false, true, true, false, false, false}};
   auto const offsets   = std::vector<cudf::size_type>{0, 3, 6, 7, 8, 10, 10};
   auto const d_offsets = cudf::detail::make_device_uvector_async(
-    offsets, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource());
+    offsets, cudf::test::get_default_stream(), cudf::get_current_device_resource_ref());
   auto const init_scalar = cudf::make_fixed_width_scalar<int>(3, cudf::test::get_default_stream());
   auto res =
     cudf::segmented_reduce(input,
diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp
index 59423d5b927..c816316d0ff 100644
--- a/cpp/tests/strings/contains_tests.cpp
+++ b/cpp/tests/strings/contains_tests.cpp
@@ -23,6 +23,7 @@
 #include <cudf/strings/contains.hpp>
 #include <cudf/strings/regex/regex_program.hpp>
 #include <cudf/strings/strings_column_view.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <thrust/host_vector.h>
 #include <thrust/iterator/counting_iterator.h>
@@ -298,10 +299,10 @@ TEST_F(StringsContainsTests, HexTest)
     {thrust::make_counting_iterator<cudf::size_type>(0),
      thrust::make_counting_iterator<cudf::size_type>(0) + count + 1});
   auto d_chars = cudf::detail::make_device_uvector_sync(
-    ascii_chars, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    ascii_chars, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto d_offsets = std::make_unique<cudf::column>(
     cudf::detail::make_device_uvector_sync(
-      offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()),
+      offsets, cudf::get_default_stream(), cudf::get_current_device_resource_ref()),
     rmm::device_buffer{},
     0);
   auto input = cudf::make_strings_column(count, std::move(d_offsets), d_chars.release(), 0, {});
diff --git a/cpp/tests/strings/factories_test.cu b/cpp/tests/strings/factories_test.cu
index 35d648f16e0..90054e41d36 100644
--- a/cpp/tests/strings/factories_test.cu
+++ b/cpp/tests/strings/factories_test.cu
@@ -28,6 +28,7 @@
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/device_uvector.hpp>
@@ -79,7 +80,7 @@ TEST_F(StringsFactoriesTest, CreateColumnFromPair)
     h_offsets[idx + 1] = offset;
   }
   auto d_strings = cudf::detail::make_device_uvector_sync(
-    strings, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    strings, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   CUDF_CUDA_TRY(cudaMemcpy(d_buffer.data(), h_buffer.data(), memsize, cudaMemcpyDefault));
   auto column = cudf::make_strings_column(d_strings);
   EXPECT_EQ(column->type(), cudf::data_type{cudf::type_id::STRING});
@@ -140,14 +141,14 @@ TEST_F(StringsFactoriesTest, CreateColumnFromOffsets)
 
   std::vector<cudf::bitmask_type> h_nulls{h_null_mask};
   auto d_buffer = cudf::detail::make_device_uvector_sync(
-    h_buffer, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    h_buffer, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto d_offsets = std::make_unique<cudf::column>(
     cudf::detail::make_device_uvector_sync(
-      h_offsets, cudf::get_default_stream(), rmm::mr::get_current_device_resource()),
+      h_offsets, cudf::get_default_stream(), cudf::get_current_device_resource_ref()),
     rmm::device_buffer{},
     0);
   auto d_nulls = cudf::detail::make_device_uvector_sync(
-    h_nulls, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    h_nulls, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto column = cudf::make_strings_column(
     count, std::move(d_offsets), d_buffer.release(), null_count, d_nulls.release());
   EXPECT_EQ(column->type(), cudf::data_type{cudf::type_id::STRING});
@@ -191,7 +192,7 @@ TEST_F(StringsFactoriesTest, EmptyStringsColumn)
   auto d_chars   = rmm::device_uvector<char>(0, cudf::get_default_stream());
   auto d_offsets = std::make_unique<cudf::column>(
     cudf::detail::make_zeroed_device_uvector_sync<cudf::size_type>(
-      1, cudf::get_default_stream(), rmm::mr::get_current_device_resource()),
+      1, cudf::get_default_stream(), cudf::get_current_device_resource_ref()),
     rmm::device_buffer{},
     0);
   rmm::device_uvector<cudf::bitmask_type> d_nulls{0, cudf::get_default_stream()};
diff --git a/cpp/tests/strings/integers_tests.cpp b/cpp/tests/strings/integers_tests.cpp
index 7a038fa6d75..ce5f68de3c9 100644
--- a/cpp/tests/strings/integers_tests.cpp
+++ b/cpp/tests/strings/integers_tests.cpp
@@ -22,6 +22,7 @@
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/strings/convert/convert_integers.hpp>
 #include <cudf/strings/strings_column_view.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
@@ -295,7 +296,7 @@ TYPED_TEST(StringsIntegerConvertTest, FromToInteger)
   h_integers.push_back(std::numeric_limits<TypeParam>::min());
   h_integers.push_back(std::numeric_limits<TypeParam>::max());
   auto const d_integers = cudf::detail::make_device_uvector_sync(
-    h_integers, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    h_integers, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto integers      = cudf::make_numeric_column(cudf::data_type{cudf::type_to_id<TypeParam>()},
                                             (cudf::size_type)d_integers.size());
   auto integers_view = integers->mutable_view();
diff --git a/cpp/tests/structs/utilities_tests.cpp b/cpp/tests/structs/utilities_tests.cpp
index e5ff700a242..c33eedf9bd9 100644
--- a/cpp/tests/structs/utilities_tests.cpp
+++ b/cpp/tests/structs/utilities_tests.cpp
@@ -30,6 +30,7 @@
 #include <cudf/detail/structs/utilities.hpp>
 #include <cudf/null_mask.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 template <typename T>
 using nums = cudf::test::fixed_width_column_wrapper<T, int32_t>;
@@ -60,7 +61,7 @@ TYPED_TEST(TypedStructUtilitiesTest, ListsAtTopLevel)
                                                   {},
                                                   cudf::structs::detail::column_nullability::FORCE,
                                                   cudf::get_default_stream(),
-                                                  rmm::mr::get_current_device_resource());
+                                                  cudf::get_current_device_resource_ref());
 
   CUDF_TEST_EXPECT_TABLES_EQUAL(table, flattened_table->flattened_columns());
 }
@@ -82,7 +83,7 @@ TYPED_TEST(TypedStructUtilitiesTest, NoStructs)
                                                   {},
                                                   cudf::structs::detail::column_nullability::FORCE,
                                                   cudf::get_default_stream(),
-                                                  rmm::mr::get_current_device_resource());
+                                                  cudf::get_current_device_resource_ref());
 
   CUDF_TEST_EXPECT_TABLES_EQUAL(table, flattened_table->flattened_columns());
 }
@@ -114,7 +115,7 @@ TYPED_TEST(TypedStructUtilitiesTest, SingleLevelStruct)
                                                   {},
                                                   cudf::structs::detail::column_nullability::FORCE,
                                                   cudf::get_default_stream(),
-                                                  rmm::mr::get_current_device_resource());
+                                                  cudf::get_current_device_resource_ref());
   CUDF_TEST_EXPECT_TABLES_EQUAL(expected, flattened_table->flattened_columns());
 }
 
@@ -147,7 +148,7 @@ TYPED_TEST(TypedStructUtilitiesTest, SingleLevelStructWithNulls)
                                                   {},
                                                   cudf::structs::detail::column_nullability::FORCE,
                                                   cudf::get_default_stream(),
-                                                  rmm::mr::get_current_device_resource());
+                                                  cudf::get_current_device_resource_ref());
   CUDF_TEST_EXPECT_TABLES_EQUAL(expected, flattened_table->flattened_columns());
 }
 
@@ -196,7 +197,7 @@ TYPED_TEST(TypedStructUtilitiesTest, StructOfStruct)
                                                   {},
                                                   cudf::structs::detail::column_nullability::FORCE,
                                                   cudf::get_default_stream(),
-                                                  rmm::mr::get_current_device_resource());
+                                                  cudf::get_current_device_resource_ref());
   CUDF_TEST_EXPECT_TABLES_EQUAL(expected, flattened_table->flattened_columns());
 }
 
@@ -246,7 +247,7 @@ TYPED_TEST(TypedStructUtilitiesTest, StructOfStructWithNullsAtLeafLevel)
                                                   {},
                                                   cudf::structs::detail::column_nullability::FORCE,
                                                   cudf::get_default_stream(),
-                                                  rmm::mr::get_current_device_resource());
+                                                  cudf::get_current_device_resource_ref());
   CUDF_TEST_EXPECT_TABLES_EQUAL(expected, flattened_table->flattened_columns());
 }
 
@@ -297,7 +298,7 @@ TYPED_TEST(TypedStructUtilitiesTest, StructOfStructWithNullsAtTopLevel)
                                                   {},
                                                   cudf::structs::detail::column_nullability::FORCE,
                                                   cudf::get_default_stream(),
-                                                  rmm::mr::get_current_device_resource());
+                                                  cudf::get_current_device_resource_ref());
   CUDF_TEST_EXPECT_TABLES_EQUAL(expected, flattened_table->flattened_columns());
 }
 
@@ -348,7 +349,7 @@ TYPED_TEST(TypedStructUtilitiesTest, StructOfStructWithNullsAtAllLevels)
                                                   {},
                                                   cudf::structs::detail::column_nullability::FORCE,
                                                   cudf::get_default_stream(),
-                                                  rmm::mr::get_current_device_resource());
+                                                  cudf::get_current_device_resource_ref());
   CUDF_TEST_EXPECT_TABLES_EQUAL(expected, flattened_table->flattened_columns());
 }
 
@@ -363,7 +364,7 @@ void test_non_struct_columns(cudf::column_view const& input)
 {
   // push_down_nulls() on non-struct columns should return the input column, unchanged.
   auto [superimposed, backing_data] = cudf::structs::detail::push_down_nulls(
-    input, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    input, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(input, superimposed);
   EXPECT_TRUE(backing_data.new_null_masks.empty());
@@ -427,7 +428,7 @@ TYPED_TEST(TypedSuperimposeTest, BasicStruct)
                                  make_lists_member<T>(cudf::test::iterators::nulls_at({4, 5})));
 
   auto [output, backing_data] = cudf::structs::detail::push_down_nulls(
-    structs_view, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    structs_view, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
 
   // After push_down_nulls(), the struct nulls (i.e. at index-0) should have been pushed
   // down to the children. All members should have nulls at row-index 0.
@@ -453,7 +454,7 @@ TYPED_TEST(TypedSuperimposeTest, NonNullableParentStruct)
                          .release();
 
   auto [output, backing_data] = cudf::structs::detail::push_down_nulls(
-    structs_input->view(), cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    structs_input->view(), cudf::get_default_stream(), cudf::get_current_device_resource_ref());
 
   // After push_down_nulls(), none of the child structs should have changed,
   // because the parent had no nulls to begin with.
@@ -487,8 +488,10 @@ TYPED_TEST(TypedSuperimposeTest, NestedStruct_ChildNullable_ParentNonNullable)
   auto structs_of_structs =
     cudf::test::structs_column_wrapper{std::move(outer_struct_members)}.release();
 
-  auto [output, backing_data] = cudf::structs::detail::push_down_nulls(
-    structs_of_structs->view(), cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+  auto [output, backing_data] =
+    cudf::structs::detail::push_down_nulls(structs_of_structs->view(),
+                                           cudf::get_default_stream(),
+                                           cudf::get_current_device_resource_ref());
 
   // After push_down_nulls(), outer-struct column should not have pushed nulls to child
   // structs. But the child struct column must push its nulls to its own children.
@@ -530,8 +533,10 @@ TYPED_TEST(TypedSuperimposeTest, NestedStruct_ChildNullable_ParentNullable)
   cudf::detail::set_null_mask(
     structs_of_structs_view.null_mask(), 1, 2, false, cudf::get_default_stream());
 
-  auto [output, backing_data] = cudf::structs::detail::push_down_nulls(
-    structs_of_structs->view(), cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+  auto [output, backing_data] =
+    cudf::structs::detail::push_down_nulls(structs_of_structs->view(),
+                                           cudf::get_default_stream(),
+                                           cudf::get_current_device_resource_ref());
 
   // After push_down_nulls(), outer-struct column should not have pushed nulls to child
   // structs. But the child struct column must push its nulls to its own children.
@@ -587,7 +592,7 @@ TYPED_TEST(TypedSuperimposeTest, Struct_Sliced)
   // lists_member: 00111
 
   auto [output, backing_data] = cudf::structs::detail::push_down_nulls(
-    sliced_structs, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    sliced_structs, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
 
   // After push_down_nulls(), the null masks should be:
   // STRUCT:       11110
@@ -640,7 +645,7 @@ TYPED_TEST(TypedSuperimposeTest, NestedStruct_Sliced)
   // lists_member:   00110
 
   auto [output, backing_data] = cudf::structs::detail::push_down_nulls(
-    sliced_structs, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    sliced_structs, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
 
   // After push_down_nulls(), the null masks will be:
   // STRUCT<STRUCT>: 11101
diff --git a/cpp/tests/table/table_view_tests.cu b/cpp/tests/table/table_view_tests.cu
index 77b3c6c475c..a393c655fbb 100644
--- a/cpp/tests/table/table_view_tests.cu
+++ b/cpp/tests/table/table_view_tests.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -25,6 +25,7 @@
 #include <cudf/table/table_device_view.cuh>
 #include <cudf/table/table_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
@@ -47,7 +48,7 @@ void row_comparison(cudf::table_view input1,
   auto device_table_1 = cudf::table_device_view::create(input1, stream);
   auto device_table_2 = cudf::table_device_view::create(input2, stream);
   auto d_column_order = cudf::detail::make_device_uvector_sync(
-    column_order, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    column_order, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
 
   auto comparator = cudf::row_lexicographic_comparator(
     cudf::nullate::NO{}, *device_table_1, *device_table_2, d_column_order.data());
diff --git a/cpp/tests/types/type_dispatcher_test.cu b/cpp/tests/types/type_dispatcher_test.cu
index 21e56de4621..f18e9afc09c 100644
--- a/cpp/tests/types/type_dispatcher_test.cu
+++ b/cpp/tests/types/type_dispatcher_test.cu
@@ -23,6 +23,7 @@
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/type_dispatcher.hpp>
 
 #include <rmm/device_uvector.hpp>
@@ -70,7 +71,7 @@ CUDF_KERNEL void dispatch_test_kernel(cudf::type_id id, bool* d_result)
 TYPED_TEST(TypedDispatcherTest, DeviceDispatch)
 {
   auto result = cudf::detail::make_zeroed_device_uvector_sync<bool>(
-    1, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   dispatch_test_kernel<<<1, 1, 0, cudf::get_default_stream().value()>>>(
     cudf::type_to_id<TypeParam>(), result.data());
   CUDF_CUDA_TRY(cudaDeviceSynchronize());
@@ -131,7 +132,7 @@ CUDF_KERNEL void double_dispatch_test_kernel(cudf::type_id id1, cudf::type_id id
 TYPED_TEST(TypedDoubleDispatcherTest, DeviceDoubleDispatch)
 {
   auto result = cudf::detail::make_zeroed_device_uvector_sync<bool>(
-    1, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   double_dispatch_test_kernel<<<1, 1, 0, cudf::get_default_stream().value()>>>(
     cudf::type_to_id<TypeParam>(), cudf::type_to_id<TypeParam>(), result.data());
   CUDF_CUDA_TRY(cudaDeviceSynchronize());
diff --git a/cpp/tests/utilities/tdigest_utilities.cu b/cpp/tests/utilities/tdigest_utilities.cu
index ec3ea0d9a83..233a307cde4 100644
--- a/cpp/tests/utilities/tdigest_utilities.cu
+++ b/cpp/tests/utilities/tdigest_utilities.cu
@@ -23,6 +23,7 @@
 #include <cudf/detail/tdigest/tdigest.hpp>
 #include <cudf/tdigest/tdigest_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/exec_policy.hpp>
 
@@ -65,11 +66,11 @@ void tdigest_sample_compare(cudf::tdigest::tdigest_column_view const& tdv,
   }
 
   auto d_expected_src = cudf::detail::make_device_uvector_async(
-    h_expected_src, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    h_expected_src, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto d_expected_mean = cudf::detail::make_device_uvector_async(
-    h_expected_mean, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    h_expected_mean, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto d_expected_weight = cudf::detail::make_device_uvector_async(
-    h_expected_weight, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    h_expected_weight, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
 
   auto iter = thrust::make_counting_iterator(0);
   thrust::for_each(
diff --git a/cpp/tests/utilities_tests/batched_memset_tests.cu b/cpp/tests/utilities_tests/batched_memset_tests.cu
index 9fc5baeec97..bed0f40d70e 100644
--- a/cpp/tests/utilities_tests/batched_memset_tests.cu
+++ b/cpp/tests/utilities_tests/batched_memset_tests.cu
@@ -21,6 +21,7 @@
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/io/detail/batched_memset.hpp>
 #include <cudf/io/parquet.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/device_uvector.hpp>
@@ -41,7 +42,7 @@ TEST(MultiBufferTestIntegral, BasicTest1)
 
   // Device init
   auto stream = cudf::get_default_stream();
-  auto mr     = rmm::mr::get_current_device_resource();
+  auto mr     = cudf::get_current_device_resource_ref();
 
   // Creating base vector for data and setting it to all 0xFF
   std::vector<std::vector<uint64_t>> expected;
diff --git a/cpp/tests/utilities_tests/pinned_memory_tests.cpp b/cpp/tests/utilities_tests/pinned_memory_tests.cpp
index 93259fd63ee..ae7c6fa8b8c 100644
--- a/cpp/tests/utilities_tests/pinned_memory_tests.cpp
+++ b/cpp/tests/utilities_tests/pinned_memory_tests.cpp
@@ -25,7 +25,6 @@
 
 #include <rmm/mr/device/pool_memory_resource.hpp>
 #include <rmm/mr/pinned_host_memory_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 class PinnedMemoryTest : public cudf::test::BaseFixture {
   size_t prev_copy_threshold;
diff --git a/cpp/tests/utilities_tests/span_tests.cu b/cpp/tests/utilities_tests/span_tests.cu
index 30496728083..019d6adc007 100644
--- a/cpp/tests/utilities_tests/span_tests.cu
+++ b/cpp/tests/utilities_tests/span_tests.cu
@@ -23,6 +23,7 @@
 
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/device_buffer.hpp>
@@ -253,7 +254,7 @@ CUDF_KERNEL void simple_device_kernel(device_span<bool> result) { result[0] = tr
 TEST(SpanTest, CanUseDeviceSpan)
 {
   auto d_message = cudf::detail::make_zeroed_device_uvector_async<bool>(
-    1, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
+    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
 
   auto d_span = device_span<bool>(d_message.data(), d_message.size());
 
diff --git a/docs/cudf/source/libcudf_docs/api_docs/index.rst b/docs/cudf/source/libcudf_docs/api_docs/index.rst
index c077a7cd452..96ff0eb7850 100644
--- a/docs/cudf/source/libcudf_docs/api_docs/index.rst
+++ b/docs/cudf/source/libcudf_docs/api_docs/index.rst
@@ -7,6 +7,7 @@ libcudf documentation
 
    cudf_namespace
    default_stream
+   memory_resource
    cudf_classes
    column_apis
    datetime_apis
diff --git a/docs/cudf/source/libcudf_docs/api_docs/memory_resource.rst b/docs/cudf/source/libcudf_docs/api_docs/memory_resource.rst
new file mode 100644
index 00000000000..e32f8a9beb0
--- /dev/null
+++ b/docs/cudf/source/libcudf_docs/api_docs/memory_resource.rst
@@ -0,0 +1,5 @@
+Memory Resource Management
+==========================
+
+.. doxygengroup:: memory_resource
+   :members:
diff --git a/java/src/main/native/include/maps_column_view.hpp b/java/src/main/native/include/maps_column_view.hpp
index be25dbd2e55..93c117aef18 100644
--- a/java/src/main/native/include/maps_column_view.hpp
+++ b/java/src/main/native/include/maps_column_view.hpp
@@ -19,10 +19,9 @@
 #include <cudf/column/column_view.hpp>
 #include <cudf/lists/lists_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 namespace cudf {
 
@@ -87,7 +86,7 @@ class maps_column_view {
   std::unique_ptr<column> get_values_for(
     column_view const& keys,
     rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-    rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) const;
+    rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) const;
 
   /**
    * @brief Map lookup by a scalar key.
@@ -106,7 +105,7 @@ class maps_column_view {
   std::unique_ptr<column> get_values_for(
     scalar const& key,
     rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-    rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) const;
+    rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) const;
 
   /**
    * @brief Check if each map row contains a specified scalar key.
@@ -127,7 +126,7 @@ class maps_column_view {
   std::unique_ptr<column> contains(
     scalar const& key,
     rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-    rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) const;
+    rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) const;
 
   /**
    * @brief Check if each map row contains keys specified by a column
@@ -149,7 +148,7 @@ class maps_column_view {
   std::unique_ptr<column> contains(
     column_view const& key,
     rmm::cuda_stream_view stream      = cudf::get_default_stream(),
-    rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) const;
+    rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) const;
 
  private:
   lists_column_view keys_, values_;
diff --git a/java/src/main/native/src/ColumnVectorJni.cpp b/java/src/main/native/src/ColumnVectorJni.cpp
index 9b718b2ed83..7285a0f1b5c 100644
--- a/java/src/main/native/src/ColumnVectorJni.cpp
+++ b/java/src/main/native/src/ColumnVectorJni.cpp
@@ -34,8 +34,7 @@
 #include <cudf/strings/combine.hpp>
 #include <cudf/structs/structs_column_view.hpp>
 #include <cudf/utilities/bit.hpp>
-
-#include <rmm/mr/device/per_device_resource.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <arrow/api.h>
 #include <arrow/c/bridge.h>
@@ -399,7 +398,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnVector_concatenate(JNIEnv* env
     return release_as_jlong(
       is_lists_column
         ? cudf::lists::detail::concatenate(
-            columns, cudf::get_default_stream(), rmm::mr::get_current_device_resource())
+            columns, cudf::get_default_stream(), cudf::get_current_device_resource_ref())
         : cudf::concatenate(columns));
   }
   CATCH_STD(env, 0);
diff --git a/java/src/main/native/src/ColumnViewJni.cu b/java/src/main/native/src/ColumnViewJni.cu
index 46261b087ae..9558c3ccbeb 100644
--- a/java/src/main/native/src/ColumnViewJni.cu
+++ b/java/src/main/native/src/ColumnViewJni.cu
@@ -28,11 +28,11 @@
 #include <cudf/lists/lists_column_device_view.cuh>
 #include <cudf/table/table.hpp>
 #include <cudf/table/table_view.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
 
 #include <thrust/functional.h>
 #include <thrust/logical.h>
@@ -134,7 +134,7 @@ void post_process_list_overlap(cudf::column_view const& lhs,
                            validity.end(),
                            thrust::identity{},
                            cudf::get_default_stream(),
-                           rmm::mr::get_current_device_resource());
+                           cudf::get_current_device_resource_ref());
 
   if (new_null_count > 0) {
     // If the `overlap_result` column is nullable, perform `bitmask_and` of its nullmask and the
@@ -146,7 +146,7 @@ void post_process_list_overlap(cudf::column_view const& lhs,
         std::vector<cudf::size_type>{0, 0},
         overlap_cv.size(),
         stream,
-        rmm::mr::get_current_device_resource());
+        cudf::get_current_device_resource_ref());
       overlap_result->set_null_mask(std::move(null_mask), null_count);
     } else {
       // Just set the output nullmask as the new nullmask.
@@ -179,7 +179,7 @@ std::unique_ptr<cudf::column> lists_distinct_by_key(cudf::lists_column_view cons
                        cudf::null_equality::EQUAL,
                        cudf::nan_equality::ALL_EQUAL,
                        stream,
-                       rmm::mr::get_current_device_resource())
+                       cudf::get_current_device_resource_ref())
                        ->release();
   auto const out_labels = out_columns.front()->view();
 
@@ -206,7 +206,7 @@ std::unique_ptr<cudf::column> lists_distinct_by_key(cudf::lists_column_view cons
     std::move(out_offsets),
     std::move(out_structs),
     input.null_count(),
-    cudf::detail::copy_bitmask(input.parent(), stream, rmm::mr::get_current_device_resource()),
+    cudf::detail::copy_bitmask(input.parent(), stream, cudf::get_current_device_resource_ref()),
     stream);
 }
 
diff --git a/java/src/main/native/src/RmmJni.cpp b/java/src/main/native/src/RmmJni.cpp
index 09c04a77590..23c7b7fb243 100644
--- a/java/src/main/native/src/RmmJni.cpp
+++ b/java/src/main/native/src/RmmJni.cpp
@@ -16,6 +16,7 @@
 
 #include "cudf_jni_apis.hpp"
 
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/pinned_memory.hpp>
 
 #include <rmm/aligned.hpp>
@@ -27,10 +28,8 @@
 #include <rmm/mr/device/logging_resource_adaptor.hpp>
 #include <rmm/mr/device/managed_memory_resource.hpp>
 #include <rmm/mr/device/owning_wrapper.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/mr/device/pool_memory_resource.hpp>
 #include <rmm/mr/pinned_host_memory_resource.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <atomic>
 #include <ctime>
@@ -617,7 +616,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Rmm_allocInternal(JNIEnv* env,
 {
   try {
     cudf::jni::auto_set_device(env);
-    rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource();
+    rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref();
     auto c_stream = rmm::cuda_stream_view(reinterpret_cast<cudaStream_t>(stream));
     void* ret     = mr.allocate_async(size, rmm::CUDA_ALLOCATION_ALIGNMENT, c_stream);
     return reinterpret_cast<jlong>(ret);
@@ -630,7 +629,7 @@ Java_ai_rapids_cudf_Rmm_free(JNIEnv* env, jclass clazz, jlong ptr, jlong size, j
 {
   try {
     cudf::jni::auto_set_device(env);
-    rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource();
+    rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref();
     void* cptr                        = reinterpret_cast<void*>(ptr);
     auto c_stream = rmm::cuda_stream_view(reinterpret_cast<cudaStream_t>(stream));
     mr.deallocate_async(cptr, size, rmm::CUDA_ALLOCATION_ALIGNMENT, c_stream);
@@ -1002,7 +1001,7 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_Rmm_setCurrentDeviceResourceInternal(
   try {
     cudf::jni::auto_set_device(env);
     auto mr = reinterpret_cast<rmm::mr::device_memory_resource*>(new_handle);
-    rmm::mr::set_current_device_resource(mr);
+    cudf::set_current_device_resource(mr);
   }
   CATCH_STD(env, )
 }
diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
index c749c8c84bf..c5abf08a59d 100644
--- a/java/src/main/native/src/TableJni.cpp
+++ b/java/src/main/native/src/TableJni.cpp
@@ -47,6 +47,7 @@
 #include <cudf/stream_compaction.hpp>
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 #include <cudf/utilities/span.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -3951,7 +3952,7 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_dropDuplicates(
                      nulls_equal ? cudf::null_equality::EQUAL : cudf::null_equality::UNEQUAL,
                      cudf::nan_equality::ALL_EQUAL,
                      cudf::get_default_stream(),
-                     rmm::mr::get_current_device_resource());
+                     cudf::get_current_device_resource_ref());
     return convert_table_for_return(env, result);
   }
   CATCH_STD(env, 0);
@@ -4116,7 +4117,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_makeChunkedPack(
     // and scratch memory only.
     auto temp_mr      = memoryResourceHandle != 0
                           ? reinterpret_cast<rmm::mr::device_memory_resource*>(memoryResourceHandle)
-                          : rmm::mr::get_current_device_resource();
+                          : cudf::get_current_device_resource_ref();
     auto chunked_pack = cudf::chunked_pack::create(*n_table, bounce_buffer_size, temp_mr);
     return reinterpret_cast<jlong>(chunked_pack.release());
   }
diff --git a/java/src/main/native/src/maps_column_view.cu b/java/src/main/native/src/maps_column_view.cu
index d3ee52c074c..d26ae86f531 100644
--- a/java/src/main/native/src/maps_column_view.cu
+++ b/java/src/main/native/src/maps_column_view.cu
@@ -18,9 +18,9 @@
 #include <cudf/lists/detail/contains.hpp>
 #include <cudf/lists/detail/extract.hpp>
 #include <cudf/scalar/scalar.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/exec_policy.hpp>
-#include <rmm/resource_ref.hpp>
 
 #include <maps_column_view.hpp>
 
@@ -65,7 +65,7 @@ std::unique_ptr<column> get_values_for_impl(maps_column_view const& maps_view,
                                              lookup_keys,
                                              lists::duplicate_find_option::FIND_LAST,
                                              stream,
-                                             rmm::mr::get_current_device_resource());
+                                             cudf::get_current_device_resource_ref());
   auto constexpr absent_offset  = size_type{-1};
   auto constexpr nullity_offset = std::numeric_limits<size_type>::min();
   thrust::replace(rmm::exec_policy(stream),
@@ -103,7 +103,7 @@ std::unique_ptr<column> contains_impl(maps_column_view const& maps_view,
   CUDF_EXPECTS(lookup_keys.type().id() == keys.child().type().id(),
                "Lookup keys must have the same type as the keys of the map column.");
   auto const contains =
-    lists::detail::contains(keys, lookup_keys, stream, rmm::mr::get_current_device_resource());
+    lists::detail::contains(keys, lookup_keys, stream, cudf::get_current_device_resource_ref());
   // Replace nulls with BOOL8{false};
   auto const scalar_false = numeric_scalar<bool>{false, true, stream};
   return detail::replace_nulls(contains->view(), scalar_false, stream, mr);
diff --git a/python/cudf/udf_cpp/strings/src/strings/udf/udf_apis.cu b/python/cudf/udf_cpp/strings/src/strings/udf/udf_apis.cu
index b924995cf4b..6fab2684ce4 100644
--- a/python/cudf/udf_cpp/strings/src/strings/udf/udf_apis.cu
+++ b/python/cudf/udf_cpp/strings/src/strings/udf/udf_apis.cu
@@ -20,6 +20,7 @@
 #include <cudf/strings/udf/udf_string.cuh>
 #include <cudf/strings/utilities.hpp>
 #include <cudf/utilities/default_stream.hpp>
+#include <cudf/utilities/memory_resource.hpp>
 
 #include <rmm/device_uvector.hpp>
 #include <rmm/exec_policy.hpp>
@@ -58,7 +59,7 @@ std::unique_ptr<rmm::device_buffer> to_string_view_array(cudf::column_view const
 {
   return std::make_unique<rmm::device_buffer>(
     std::move(cudf::strings::create_string_vector_from_column(
-                cudf::strings_column_view(input), stream, rmm::mr::get_current_device_resource())
+                cudf::strings_column_view(input), stream, cudf::get_current_device_resource_ref())
                 .release()));
 }
 
diff --git a/python/pylibcudf/pylibcudf/libcudf/interop.pxd b/python/pylibcudf/pylibcudf/libcudf/interop.pxd
index 9228c017d93..30b97fdec34 100644
--- a/python/pylibcudf/pylibcudf/libcudf/interop.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/interop.pxd
@@ -70,8 +70,8 @@ cdef extern from *:
 
     ArrowArray* to_arrow_host_raw(
       cudf::table_view const& tbl,
-      rmm::cuda_stream_view stream        = cudf::get_default_stream(),
-      rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()) {
+      rmm::cuda_stream_view stream       = cudf::get_default_stream(),
+      rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) {
       // Assumes the sync event is null and the data is already on the host.
       ArrowArray *arr = new ArrowArray();
       auto device_arr = cudf::to_arrow_host(tbl, stream, mr);

From afc9f4f84e4031fd028046a7668afec27d79627e Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 10 Sep 2024 07:21:28 -1000
Subject: [PATCH 790/842] Add labeling pylibcudf doc pages (#16779)

Follow up to https://github.com/rapidsai/cudf/pull/16761, I forgot to add the doc pages for the labeling pylibcudf APIs

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Matthew Murray (https://github.com/Matt711)

URL: https://github.com/rapidsai/cudf/pull/16779
---
 docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst    | 1 +
 docs/cudf/source/user_guide/api_docs/pylibcudf/labeling.rst | 6 ++++++
 2 files changed, 7 insertions(+)
 create mode 100644 docs/cudf/source/user_guide/api_docs/pylibcudf/labeling.rst

diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
index 6a2b66e8ea0..d6f8cd2a1ff 100644
--- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst
@@ -21,6 +21,7 @@ This page provides API documentation for pylibcudf.
     groupby
     interop
     join
+    labeling
     lists
     merge
     null_mask
diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/labeling.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/labeling.rst
new file mode 100644
index 00000000000..3f3ae4c5a77
--- /dev/null
+++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/labeling.rst
@@ -0,0 +1,6 @@
+========
+labeling
+========
+
+.. automodule:: pylibcudf.labeling
+   :members:

From 6dd5689d123bdb68be849fd15ff4cb6348535c72 Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Tue, 10 Sep 2024 13:13:32 -0500
Subject: [PATCH 791/842] use libkvikio wheels in wheel builds (#16778)

Follow-up to #15483.
Contributes to https://github.com/rapidsai/build-planning/issues/33.

Adds a build-time dependency on `libkvikio` wheels for `libcudf` wheels (per https://github.com/rapidsai/cudf/pull/15483#discussion_r1583969365).

With this change, CPM is no longer used to download and install the kvikio headers.

Before:

```text
  -- Found cuFile: /usr/local/cuda/lib64/libcufile.so
  -- CPM: Adding package KvikIO@24.10 (branch-24.10)
```

([recent build link from branch-24.10](https://github.com/rapidsai/cudf/actions/runs/10780576194/job/29896649202#step:9:7673))

After:

```text
  -- KvikIO: Found cuFile Batch API: TRUE
  -- KvikIO: Found cuFile Stream API: TRUE
  -- CPM: Using local package KvikIO@24.10.0
```

([build link from this PR](https://github.com/rapidsai/cudf/actions/runs/10780504202/job/29896555443?pr=16778#step:9:7754))

## Notes for Reviewers

### This removes kvikio headers/CMake files from libcudf wheels

Cuts around 0.8 MB (23 files) out of `libcudf` wheels.

As of this PR, these would no longer be vendored in `libcudf` wheels:

```text
    0  09-08-2024 06:17   libcudf/include/kvikio/
    0  09-08-2024 06:17   libcudf/include/kvikio/shim/
 6356  09-08-2024 06:17   libcudf/include/kvikio/batch.hpp
 3812  09-08-2024 06:17   libcudf/include/kvikio/buffer.hpp
10499  09-08-2024 06:17   libcudf/include/kvikio/utils.hpp
 1399  09-08-2024 06:17   libcudf/include/kvikio/cufile_config.hpp
33385  09-08-2024 06:17   libcudf/include/kvikio/file_handle.hpp
 7299  09-08-2024 06:17   libcudf/include/kvikio/driver.hpp
 9678  09-08-2024 06:17   libcudf/include/kvikio/defaults.hpp
 5352  09-08-2024 06:17   libcudf/include/kvikio/stream.hpp
 6002  09-08-2024 06:17   libcudf/include/kvikio/error.hpp
 4501  09-08-2024 06:17   libcudf/include/kvikio/bounce_buffer.hpp
 3197  09-08-2024 06:17   libcudf/include/kvikio/parallel_operation.hpp
 9864  09-08-2024 06:17   libcudf/include/kvikio/posix_io.hpp
  717  09-08-2024 06:17   libcudf/include/kvikio/version_config.hpp
 4529  09-08-2024 06:17   libcudf/include/kvikio/shim/cuda.hpp
 3331  09-08-2024 06:17   libcudf/include/kvikio/shim/utils.hpp
 4055  09-08-2024 06:17   libcudf/include/kvikio/shim/cufile_h_wrapper.hpp
 2242  09-08-2024 06:17   libcudf/include/kvikio/shim/cuda_h_wrapper.hpp
 7510  09-08-2024 06:17   libcudf/include/kvikio/shim/cufile.hpp
    0  09-08-2024 06:17   libcudf/lib64/cmake/kvikio/
 5031  09-08-2024 06:17   libcudf/lib64/cmake/kvikio/kvikio-targets.cmake
 3681  09-08-2024 06:17   libcudf/lib64/cmake/kvikio/kvikio-config-version.cmake
 6915  09-08-2024 06:17   libcudf/lib64/cmake/kvikio/kvikio-config.cmake
 1529  09-08-2024 06:17   libcudf/lib64/cmake/kvikio/kvikio-dependencies.cmake
 3851  09-08-2024 06:17   libcudf/lib64/cmake/kvikio/FindcuFile.cmake
```

This is safe because kvikio is a PRIVATE dependency of `libcudf`.

https://github.com/rapidsai/cudf/blob/150f1b10ed9c702d5283216b746df685e1708716/cpp/CMakeLists.txt#L796-L802


#

Authors:
  - James Lamb (https://github.com/jameslamb)
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16778
---
 dependencies.yaml             | 29 +++++++++++++++++++++++++++--
 python/libcudf/pyproject.toml |  1 +
 2 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/dependencies.yaml b/dependencies.yaml
index 32c1d7a0845..483335c02ff 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -13,6 +13,7 @@ files:
       - cuda
       - cuda_version
       - depends_on_cupy
+      - depends_on_libkvikio
       - depends_on_librmm
       - depends_on_rmm
       - develop
@@ -135,6 +136,7 @@ files:
     includes:
       - build_base
       - build_cpp
+      - depends_on_libkvikio
       - depends_on_librmm
   py_build_pylibcudf:
     output: pyproject
@@ -349,8 +351,6 @@ dependencies:
       - output_types: conda
         packages:
           - fmt>=10.1.1,<11
-          - librmm==24.10.*,>=0.0.0a0
-          - libkvikio==24.10.*,>=0.0.0a0
           - flatbuffers==24.3.25
           - librdkafka>=2.5.0,<2.6.0a0
           # Align nvcomp version with rapids-cmake
@@ -889,6 +889,31 @@ dependencies:
             packages: &cupy_packages_cu11
               - cupy-cuda11x>=12.0.0
           - {matrix: null, packages: *cupy_packages_cu11}
+  depends_on_libkvikio:
+    common:
+      - output_types: conda
+        packages:
+          - &libkvikio_unsuffixed libkvikio==24.10.*,>=0.0.0a0
+      - output_types: requirements
+        packages:
+          - --extra-index-url=https://pypi.nvidia.com
+          - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
+    specific:
+      - output_types: [requirements, pyproject]
+        matrices:
+          - matrix:
+              cuda: "12.*"
+              cuda_suffixed: "true"
+            packages:
+              - libkvikio-cu12==24.10.*,>=0.0.0a0
+          - matrix:
+              cuda: "11.*"
+              cuda_suffixed: "true"
+            packages:
+              - libkvikio-cu11==24.10.*,>=0.0.0a0
+          - matrix:
+            packages:
+              - *libkvikio_unsuffixed
   depends_on_librmm:
     common:
       - output_types: conda
diff --git a/python/libcudf/pyproject.toml b/python/libcudf/pyproject.toml
index 5f4b9957fd0..2c98b97eddf 100644
--- a/python/libcudf/pyproject.toml
+++ b/python/libcudf/pyproject.toml
@@ -66,6 +66,7 @@ dependencies-file = "../../dependencies.yaml"
 matrix-entry = "cuda_suffixed=true"
 requires = [
     "cmake>=3.26.4,!=3.30.0",
+    "libkvikio==24.10.*,>=0.0.0a0",
     "librmm==24.10.*,>=0.0.0a0",
     "ninja",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.

From f4364f8841a20d1feea23bb0879c5f90f0271f42 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Tue, 10 Sep 2024 19:19:12 +0000
Subject: [PATCH 792/842] test

---
 python/cudf/cudf/pandas/scripts/run-pandas-tests.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
index 4ba5361b983..a7aeca0c58c 100755
--- a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
+++ b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
@@ -145,6 +145,7 @@ PANDAS_CI="1" timeout 600m python -m pytest -p cudf.pandas \
     "$@" || [ $? = 1 ]  # Exit success if exit code was 1 (permit test failures but not other errors)
 
 mv *.json ..
+ls -al
 cd ..
 
 rm -rf pandas-testing/pandas-tests/

From c6a44a16c32dde0988fc7f71eae463e25c979a3a Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Tue, 10 Sep 2024 19:53:36 +0000
Subject: [PATCH 793/842] test

---
 .github/workflows/pr.yaml                     | 318 +++++++++---------
 .../cudf/pandas/scripts/conftest-patch.py     |  11 +-
 2 files changed, 164 insertions(+), 165 deletions(-)

diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index a4a8f036174..18d1ea39d59 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -14,26 +14,26 @@ jobs:
     needs:
       - changed-files
       - checks
-      - conda-cpp-build
-      - conda-cpp-checks
-      - conda-cpp-tests
-      - conda-python-build
-      - conda-python-cudf-tests
-      - conda-python-other-tests
-      - conda-java-tests
+      # - conda-cpp-build
+      # - conda-cpp-checks
+      # - conda-cpp-tests
+      # - conda-python-build
+      # - conda-python-cudf-tests
+      # - conda-python-other-tests
+      # - conda-java-tests
       - static-configure
-      - conda-notebook-tests
-      - docs-build
+      # - conda-notebook-tests
+      # - docs-build
       - wheel-build-libcudf
       - wheel-build-pylibcudf
       - wheel-build-cudf
-      - wheel-tests-cudf
-      - wheel-build-cudf-polars
-      - wheel-tests-cudf-polars
-      - wheel-build-dask-cudf
-      - wheel-tests-dask-cudf
-      - devcontainer
-      - unit-tests-cudf-pandas
+      # - wheel-tests-cudf
+      # - wheel-build-cudf-polars
+      # - wheel-tests-cudf-polars
+      # - wheel-build-dask-cudf
+      # - wheel-tests-dask-cudf
+      # - devcontainer
+      # - unit-tests-cudf-pandas
       - pandas-tests
       - pandas-tests-diff
     secrets: inherit
@@ -107,60 +107,60 @@ jobs:
     uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@python-3.12
     with:
       enable_check_generated_files: false
-  conda-cpp-build:
-    needs: checks
-    secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@python-3.12
-    with:
-      build_type: pull-request
-  conda-cpp-checks:
-    needs: conda-cpp-build
-    secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@python-3.12
-    with:
-      build_type: pull-request
-      enable_check_symbols: true
-  conda-cpp-tests:
-    needs: [conda-cpp-build, changed-files]
-    secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@python-3.12
-    if: needs.changed-files.outputs.test_cpp == 'true'
-    with:
-      build_type: pull-request
-  conda-python-build:
-    needs: conda-cpp-build
-    secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@python-3.12
-    with:
-      build_type: pull-request
-  conda-python-cudf-tests:
-    needs: [conda-python-build, changed-files]
-    secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12
-    if: needs.changed-files.outputs.test_python == 'true'
-    with:
-      build_type: pull-request
-      script: "ci/test_python_cudf.sh"
-  conda-python-other-tests:
-    # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism
-    needs: [conda-python-build, changed-files]
-    secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12
-    if: needs.changed-files.outputs.test_python == 'true'
-    with:
-      build_type: pull-request
-      script: "ci/test_python_other.sh"
-  conda-java-tests:
-    needs: [conda-cpp-build, changed-files]
-    secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
-    if: needs.changed-files.outputs.test_java == 'true'
-    with:
-      build_type: pull-request
-      node_type: "gpu-v100-latest-1"
-      arch: "amd64"
-      container_image: "rapidsai/ci-conda:latest"
-      run_script: "ci/test_java.sh"
+  # conda-cpp-build:
+  #   needs: checks
+  #   secrets: inherit
+  #   uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@python-3.12
+  #   with:
+  #     build_type: pull-request
+  # conda-cpp-checks:
+  #   needs: conda-cpp-build
+  #   secrets: inherit
+  #   uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@python-3.12
+  #   with:
+  #     build_type: pull-request
+  #     enable_check_symbols: true
+  # conda-cpp-tests:
+  #   needs: [conda-cpp-build, changed-files]
+  #   secrets: inherit
+  #   uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@python-3.12
+  #   if: needs.changed-files.outputs.test_cpp == 'true'
+  #   with:
+  #     build_type: pull-request
+  # conda-python-build:
+  #   needs: conda-cpp-build
+  #   secrets: inherit
+  #   uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@python-3.12
+  #   with:
+  #     build_type: pull-request
+  # conda-python-cudf-tests:
+  #   needs: [conda-python-build, changed-files]
+  #   secrets: inherit
+  #   uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12
+  #   if: needs.changed-files.outputs.test_python == 'true'
+  #   with:
+  #     build_type: pull-request
+  #     script: "ci/test_python_cudf.sh"
+  # conda-python-other-tests:
+  #   # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism
+  #   needs: [conda-python-build, changed-files]
+  #   secrets: inherit
+  #   uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12
+  #   if: needs.changed-files.outputs.test_python == 'true'
+  #   with:
+  #     build_type: pull-request
+  #     script: "ci/test_python_other.sh"
+  # conda-java-tests:
+  #   needs: [conda-cpp-build, changed-files]
+  #   secrets: inherit
+  #   uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
+  #   if: needs.changed-files.outputs.test_java == 'true'
+  #   with:
+  #     build_type: pull-request
+  #     node_type: "gpu-v100-latest-1"
+  #     arch: "amd64"
+  #     container_image: "rapidsai/ci-conda:latest"
+  #     run_script: "ci/test_java.sh"
   static-configure:
     needs: checks
     secrets: inherit
@@ -171,27 +171,27 @@ jobs:
       # primary static consumers (Spark) are not in conda anyway.
       container_image: "rapidsai/ci-wheel:latest"
       run_script: "ci/configure_cpp_static.sh"
-  conda-notebook-tests:
-    needs: [conda-python-build, changed-files]
-    secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
-    if: needs.changed-files.outputs.test_notebooks == 'true'
-    with:
-      build_type: pull-request
-      node_type: "gpu-v100-latest-1"
-      arch: "amd64"
-      container_image: "rapidsai/ci-conda:latest"
-      run_script: "ci/test_notebooks.sh"
-  docs-build:
-    needs: conda-python-build
-    secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
-    with:
-      build_type: pull-request
-      node_type: "gpu-v100-latest-1"
-      arch: "amd64"
-      container_image: "rapidsai/ci-conda:latest"
-      run_script: "ci/build_docs.sh"
+  # conda-notebook-tests:
+  #   needs: [conda-python-build, changed-files]
+  #   secrets: inherit
+  #   uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
+  #   if: needs.changed-files.outputs.test_notebooks == 'true'
+  #   with:
+  #     build_type: pull-request
+  #     node_type: "gpu-v100-latest-1"
+  #     arch: "amd64"
+  #     container_image: "rapidsai/ci-conda:latest"
+  #     run_script: "ci/test_notebooks.sh"
+  # docs-build:
+  #   needs: conda-python-build
+  #   secrets: inherit
+  #   uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
+  #   with:
+  #     build_type: pull-request
+  #     node_type: "gpu-v100-latest-1"
+  #     arch: "amd64"
+  #     container_image: "rapidsai/ci-conda:latest"
+  #     run_script: "ci/build_docs.sh"
   wheel-build-libcudf:
     needs: checks
     secrets: inherit
@@ -215,74 +215,74 @@ jobs:
     with:
       build_type: pull-request
       script: "ci/build_wheel_cudf.sh"
-  wheel-tests-cudf:
-    needs: [wheel-build-cudf, changed-files]
-    secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
-    if: needs.changed-files.outputs.test_python == 'true'
-    with:
-      build_type: pull-request
-      script: ci/test_wheel_cudf.sh
-  wheel-build-cudf-polars:
-    needs: wheel-build-pylibcudf
-    secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
-    with:
-      # This selects "ARCH=amd64 + the latest supported Python + CUDA".
-      matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
-      build_type: pull-request
-      script: "ci/build_wheel_cudf_polars.sh"
-  wheel-tests-cudf-polars:
-    needs: [wheel-build-cudf-polars, changed-files]
-    secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
-    if: needs.changed-files.outputs.test_python == 'true'
-    with:
-      # This selects "ARCH=amd64 + the latest supported Python + CUDA".
-      matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
-      build_type: pull-request
-      # This always runs, but only fails if this PR touches code in
-      # pylibcudf or cudf_polars
-      script: "ci/test_wheel_cudf_polars.sh"
-  wheel-build-dask-cudf:
-    needs: wheel-build-cudf
-    secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
-    with:
-      # This selects "ARCH=amd64 + the latest supported Python + CUDA".
-      matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
-      build_type: pull-request
-      script: "ci/build_wheel_dask_cudf.sh"
-  wheel-tests-dask-cudf:
-    needs: [wheel-build-dask-cudf, changed-files]
-    secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
-    if: needs.changed-files.outputs.test_python == 'true'
-    with:
-      # This selects "ARCH=amd64 + the latest supported Python + CUDA".
-      matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
-      build_type: pull-request
-      script: ci/test_wheel_dask_cudf.sh
-  devcontainer:
-    secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@python-3.12
-    with:
-      arch: '["amd64"]'
-      cuda: '["12.5"]'
-      build_command: |
-        sccache -z;
-        build-all -DBUILD_BENCHMARKS=ON --verbose;
-        sccache -s;
-  unit-tests-cudf-pandas:
-    needs: [wheel-build-cudf, changed-files]
-    secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
-    if: needs.changed-files.outputs.test_python == 'true'
-    with:
-      # This selects "ARCH=amd64 + the latest supported Python + CUDA".
-      matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
-      build_type: pull-request
-      script: ci/cudf_pandas_scripts/run_tests.sh
+  # wheel-tests-cudf:
+  #   needs: [wheel-build-cudf, changed-files]
+  #   secrets: inherit
+  #   uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
+  #   if: needs.changed-files.outputs.test_python == 'true'
+  #   with:
+  #     build_type: pull-request
+  #     script: ci/test_wheel_cudf.sh
+  # wheel-build-cudf-polars:
+  #   needs: wheel-build-pylibcudf
+  #   secrets: inherit
+  #   uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
+  #   with:
+  #     # This selects "ARCH=amd64 + the latest supported Python + CUDA".
+  #     matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
+  #     build_type: pull-request
+  #     script: "ci/build_wheel_cudf_polars.sh"
+  # wheel-tests-cudf-polars:
+  #   needs: [wheel-build-cudf-polars, changed-files]
+  #   secrets: inherit
+  #   uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
+  #   if: needs.changed-files.outputs.test_python == 'true'
+  #   with:
+  #     # This selects "ARCH=amd64 + the latest supported Python + CUDA".
+  #     matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
+  #     build_type: pull-request
+  #     # This always runs, but only fails if this PR touches code in
+  #     # pylibcudf or cudf_polars
+  #     script: "ci/test_wheel_cudf_polars.sh"
+  # wheel-build-dask-cudf:
+  #   needs: wheel-build-cudf
+  #   secrets: inherit
+  #   uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
+  #   with:
+  #     # This selects "ARCH=amd64 + the latest supported Python + CUDA".
+  #     matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
+  #     build_type: pull-request
+  #     script: "ci/build_wheel_dask_cudf.sh"
+  # wheel-tests-dask-cudf:
+  #   needs: [wheel-build-dask-cudf, changed-files]
+  #   secrets: inherit
+  #   uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
+  #   if: needs.changed-files.outputs.test_python == 'true'
+  #   with:
+  #     # This selects "ARCH=amd64 + the latest supported Python + CUDA".
+  #     matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
+  #     build_type: pull-request
+  #     script: ci/test_wheel_dask_cudf.sh
+  # devcontainer:
+  #   secrets: inherit
+  #   uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@python-3.12
+  #   with:
+  #     arch: '["amd64"]'
+  #     cuda: '["12.5"]'
+  #     build_command: |
+  #       sccache -z;
+  #       build-all -DBUILD_BENCHMARKS=ON --verbose;
+  #       sccache -s;
+  # unit-tests-cudf-pandas:
+  #   needs: [wheel-build-cudf, changed-files]
+  #   secrets: inherit
+  #   uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
+  #   if: needs.changed-files.outputs.test_python == 'true'
+  #   with:
+  #     # This selects "ARCH=amd64 + the latest supported Python + CUDA".
+  #     matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
+  #     build_type: pull-request
+  #     script: ci/cudf_pandas_scripts/run_tests.sh
   pandas-tests:
     # run the Pandas unit tests using PR branch
     needs: [wheel-build-cudf, changed-files]
diff --git a/python/cudf/cudf/pandas/scripts/conftest-patch.py b/python/cudf/cudf/pandas/scripts/conftest-patch.py
index d214ec8defc..6bda9b4d2a3 100644
--- a/python/cudf/cudf/pandas/scripts/conftest-patch.py
+++ b/python/cudf/cudf/pandas/scripts/conftest-patch.py
@@ -3,6 +3,7 @@
 # SPDX-License-Identifier: Apache-2.0
 
 import contextlib
+import json
 import multiprocessing
 import os
 import sys
@@ -87,10 +88,9 @@ def pytest_runtest_teardown(item, nextitem):
     ):
         # Write the function call counts to a file
         worker_id = os.getenv("PYTEST_XDIST_WORKER", "master")
-        output_file = f'function_call_counts_{os.path.basename(item.nodeid.split("::")[0])}_{worker_id}.txt'
+        output_file = f'function_call_counts_{os.path.basename(item.nodeid.split("::")[0])}_{worker_id}.json'
         with open(output_file, "w") as f:
-            for func, count in function_call_counts.items():
-                f.write(f"{func}: {count}\n")
+            json.dump(function_call_counts, f, indent=4)
         print(f"Function call counts have been written to {output_file}")
 
 
@@ -107,10 +107,9 @@ def pytest_unconfigure(config):
     if hasattr(config, "workerinput"):
         # Running in xdist worker
         worker_id = config.workerinput["workerid"]
-        output_file = f"function_call_counts_worker_{worker_id}.txt"
+        output_file = f"function_call_counts_worker_{worker_id}.json"
         with open(output_file, "w") as f:
-            for func, count in function_call_counts.items():
-                f.write(f"{func}: {count}\n")
+            json.dump(function_call_counts, f, indent=4)
         print(f"Function call counts have been written to {output_file}")
 
 
From 5192b885bba82039823da687bc0a013ee74566a7 Mon Sep 17 00:00:00 2001
From: Jihoon Son <ghoonson@gmail.com>
Date: Tue, 10 Sep 2024 14:19:27 -0700
Subject: [PATCH 794/842] Fix empty cluster handling in tdigest merge (#16675)

This PR fixes an edge case bug in the tdigest merge. When there are multiple distinct keys but all values are empty clusters, the value column is currently merged into a single empty cluster after merge, which leads to an error while creating a result table because of the mismatching number of rows in the key and value columns. This bug can be reproduced only when all values are empty clusters. If some values are empty but some are not, the current implementation returns a valid result. This bug was originally reported in https://github.com/NVIDIA/spark-rapids/issues/11367.

The bug exists in `merge_tdigests()` as it assumes that there is no empty cluster in the merge stage even when there are (`has_nulls` are fixed to `false`). It is rather safe to assume that always there could be empty clusters. This PR fixes the flag by fixing it to true. Also, `has_nulls` has been renamed to a more descriptive name, `may_have_empty_clusters`.

The tdigest reduce does not have the same issue as it does not call `merge_tdigests()`.

Authors:
  - Jihoon Son (https://github.com/jihoonson)
  - Muhammad Haseeb (https://github.com/mhaseeb123)

Approvers:
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - https://github.com/nvdbaranec

URL: https://github.com/rapidsai/cudf/pull/16675
---
 cpp/include/cudf/detail/tdigest/tdigest.hpp   | 17 ++--
 cpp/include/cudf_test/tdigest_utilities.cuh   | 20 ++---
 cpp/src/quantiles/tdigest/tdigest.cu          | 23 ++---
 .../quantiles/tdigest/tdigest_aggregation.cu  | 70 +++++++++------
 cpp/tests/groupby/tdigest_tests.cu            | 90 +++++++++++++++++--
 .../quantiles/percentile_approx_test.cpp      |  4 +-
 6 files changed, 162 insertions(+), 62 deletions(-)

diff --git a/cpp/include/cudf/detail/tdigest/tdigest.hpp b/cpp/include/cudf/detail/tdigest/tdigest.hpp
index 80a4460023f..672b95e2d01 100644
--- a/cpp/include/cudf/detail/tdigest/tdigest.hpp
+++ b/cpp/include/cudf/detail/tdigest/tdigest.hpp
@@ -143,28 +143,29 @@ std::unique_ptr<column> make_tdigest_column(size_type num_rows,
                                             rmm::device_async_resource_ref mr);
 
 /**
- * @brief Create an empty tdigest column.
+ * @brief Create a tdigest column of empty clusters.
  *
- * An empty tdigest column contains a single row of length 0
+ * The column created contains the specified number of rows of empty clusters.
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  * @param mr Device memory resource used to allocate the returned column's device memory.
  *
- * @returns An empty tdigest column.
+ * @returns A tdigest column of empty clusters.
  */
 CUDF_EXPORT
-std::unique_ptr<column> make_empty_tdigest_column(rmm::cuda_stream_view stream,
-                                                  rmm::device_async_resource_ref mr);
+std::unique_ptr<column> make_tdigest_column_of_empty_clusters(size_type num_rows,
+                                                              rmm::cuda_stream_view stream,
+                                                              rmm::device_async_resource_ref mr);
 
 /**
- * @brief Create an empty tdigest scalar.
+ * @brief Create a scalar of an empty tdigest cluster.
  *
- * An empty tdigest scalar is a struct_scalar that contains a single row of length 0
+ * The returned scalar is a struct_scalar that contains a single row of an empty cluster.
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  * @param mr Device memory resource used to allocate the returned column's device memory.
  *
- * @returns An empty tdigest scalar.
+ * @returns A scalar of an empty tdigest cluster.
  */
 std::unique_ptr<scalar> make_empty_tdigest_scalar(rmm::cuda_stream_view stream,
                                                   rmm::device_async_resource_ref mr);
diff --git a/cpp/include/cudf_test/tdigest_utilities.cuh b/cpp/include/cudf_test/tdigest_utilities.cuh
index 1758790cd64..be7d19b2227 100644
--- a/cpp/include/cudf_test/tdigest_utilities.cuh
+++ b/cpp/include/cudf_test/tdigest_utilities.cuh
@@ -270,8 +270,8 @@ void tdigest_simple_all_nulls_aggregation(Func op)
     static_cast<column_view>(values).type(), tdigest_gen{}, op, values, delta);
 
   // NOTE: an empty tdigest column still has 1 row.
-  auto expected = cudf::tdigest::detail::make_empty_tdigest_column(
-    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  auto expected = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(
+    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, *expected);
 }
@@ -562,12 +562,12 @@ template <typename MergeFunc>
 void tdigest_merge_empty(MergeFunc merge_op)
 {
   // 3 empty tdigests all in the same group
-  auto a = cudf::tdigest::detail::make_empty_tdigest_column(
-    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
-  auto b = cudf::tdigest::detail::make_empty_tdigest_column(
-    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
-  auto c = cudf::tdigest::detail::make_empty_tdigest_column(
-    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  auto a = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(
+    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  auto b = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(
+    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  auto c = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(
+    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   std::vector<column_view> cols;
   cols.push_back(*a);
   cols.push_back(*b);
@@ -577,8 +577,8 @@ void tdigest_merge_empty(MergeFunc merge_op)
   auto const delta = 1000;
   auto result      = merge_op(*values, delta);
 
-  auto expected = cudf::tdigest::detail::make_empty_tdigest_column(
-    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  auto expected = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(
+    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *result);
 }
diff --git a/cpp/src/quantiles/tdigest/tdigest.cu b/cpp/src/quantiles/tdigest/tdigest.cu
index 0d017cf1f13..76cd55bf994 100644
--- a/cpp/src/quantiles/tdigest/tdigest.cu
+++ b/cpp/src/quantiles/tdigest/tdigest.cu
@@ -292,32 +292,33 @@ std::unique_ptr<column> make_tdigest_column(size_type num_rows,
   return make_structs_column(num_rows, std::move(children), 0, {}, stream, mr);
 }
 
-std::unique_ptr<column> make_empty_tdigest_column(rmm::cuda_stream_view stream,
-                                                  rmm::device_async_resource_ref mr)
+std::unique_ptr<column> make_tdigest_column_of_empty_clusters(size_type num_rows,
+                                                              rmm::cuda_stream_view stream,
+                                                              rmm::device_async_resource_ref mr)
 {
   auto offsets = cudf::make_fixed_width_column(
-    data_type(type_id::INT32), 2, mask_state::UNALLOCATED, stream, mr);
+    data_type(type_id::INT32), num_rows + 1, mask_state::UNALLOCATED, stream, mr);
   thrust::fill(rmm::exec_policy(stream),
                offsets->mutable_view().begin<size_type>(),
                offsets->mutable_view().end<size_type>(),
                0);
 
-  auto min_col =
-    cudf::make_numeric_column(data_type(type_id::FLOAT64), 1, mask_state::UNALLOCATED, stream, mr);
+  auto min_col = cudf::make_numeric_column(
+    data_type(type_id::FLOAT64), num_rows, mask_state::UNALLOCATED, stream, mr);
   thrust::fill(rmm::exec_policy(stream),
                min_col->mutable_view().begin<double>(),
                min_col->mutable_view().end<double>(),
                0);
-  auto max_col =
-    cudf::make_numeric_column(data_type(type_id::FLOAT64), 1, mask_state::UNALLOCATED, stream, mr);
+  auto max_col = cudf::make_numeric_column(
+    data_type(type_id::FLOAT64), num_rows, mask_state::UNALLOCATED, stream, mr);
   thrust::fill(rmm::exec_policy(stream),
                max_col->mutable_view().begin<double>(),
                max_col->mutable_view().end<double>(),
                0);
 
-  return make_tdigest_column(1,
-                             make_empty_column(type_id::FLOAT64),
-                             make_empty_column(type_id::FLOAT64),
+  return make_tdigest_column(num_rows,
+                             cudf::make_empty_column(type_id::FLOAT64),
+                             cudf::make_empty_column(type_id::FLOAT64),
                              std::move(offsets),
                              std::move(min_col),
                              std::move(max_col),
@@ -338,7 +339,7 @@ std::unique_ptr<column> make_empty_tdigest_column(rmm::cuda_stream_view stream,
 std::unique_ptr<scalar> make_empty_tdigest_scalar(rmm::cuda_stream_view stream,
                                                   rmm::device_async_resource_ref mr)
 {
-  auto contents = make_empty_tdigest_column(stream, mr)->release();
+  auto contents = make_tdigest_column_of_empty_clusters(1, stream, mr)->release();
   return std::make_unique<struct_scalar>(
     std::move(*std::make_unique<table>(std::move(contents.children))), true, stream, mr);
 }
diff --git a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu
index 2dd25a7b890..d591fb5c171 100644
--- a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu
+++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu
@@ -366,8 +366,8 @@ std::unique_ptr<scalar> to_tdigest_scalar(std::unique_ptr<column>&& tdigest,
  * @param group_cluster_wl    Output.  The set of cluster weight limits for each group.
  * @param group_num_clusters  Output.  The number of output clusters for each input group.
  * @param group_cluster_offsets  Offsets per-group to the start of it's clusters
- * @param has_nulls Whether or not the input contains nulls
- *
+ * @param may_have_empty_clusters Whether or not there could be empty clusters. Must only be
+ * set to false when there is no empty cluster, true otherwise.
  */
 
 template <typename GroupInfo, typename NearestWeightFunc, typename CumulativeWeight>
@@ -379,7 +379,7 @@ CUDF_KERNEL void generate_cluster_limits_kernel(int delta,
                                                 double* group_cluster_wl,
                                                 size_type* group_num_clusters,
                                                 size_type const* group_cluster_offsets,
-                                                bool has_nulls)
+                                                bool may_have_empty_clusters)
 {
   int const tid = threadIdx.x + blockIdx.x * blockDim.x;
 
@@ -399,11 +399,12 @@ CUDF_KERNEL void generate_cluster_limits_kernel(int delta,
   // a group with nothing in it.
   group_num_clusters[group_index] = 0;
   if (total_weight <= 0) {
-    // if the input contains nulls we can potentially have a group that generates no
-    // clusters because -all- of the input values are null.  in that case, the reduce_by_key call
-    // in the tdigest generation step will need a location to store the unused reduction value for
-    // that group of nulls. these "stubs" will be postprocessed out afterwards.
-    if (has_nulls) { group_num_clusters[group_index] = 1; }
+    // If the input contains empty clusters, we can potentially have a group that also generates
+    // empty clusters because -all- of the input values are null or empty cluster. In that case, the
+    // `reduce_by_key` call in the tdigest generation step will need a location to store the unused
+    // reduction value for that group of nulls and empty clusters. These "stubs" will be
+    // postprocessed out afterwards.
+    if (may_have_empty_clusters) { group_num_clusters[group_index] = 1; }
     return;
   }
 
@@ -502,7 +503,8 @@ CUDF_KERNEL void generate_cluster_limits_kernel(int delta,
  * stream that falls before our current cluster limit
  * @param group_info         A functor which returns the info for the specified group (total weight,
  * size and start offset)
- * @param has_nulls          Whether or not the input data contains nulls
+ * @param may_have_empty_clusters Whether or not there could be empty clusters. It should be
+ * set to false only when there is no empty cluster.
  * @param stream CUDA stream used for device memory operations and kernel launches.
  * @param mr Device memory resource used to allocate the returned column's device memory
  *
@@ -516,7 +518,7 @@ generate_group_cluster_info(int delta,
                             NearestWeight nearest_weight,
                             GroupInfo group_info,
                             CumulativeWeight cumulative_weight,
-                            bool has_nulls,
+                            bool may_have_empty_clusters,
                             rmm::cuda_stream_view stream,
                             rmm::device_async_resource_ref mr)
 {
@@ -535,7 +537,7 @@ generate_group_cluster_info(int delta,
     nullptr,
     group_num_clusters.begin(),
     nullptr,
-    has_nulls);
+    may_have_empty_clusters);
 
   // generate group cluster offsets (where the clusters for a given group start and end)
   auto group_cluster_offsets = cudf::make_numeric_column(
@@ -567,7 +569,7 @@ generate_group_cluster_info(int delta,
     group_cluster_wl.begin(),
     group_num_clusters.begin(),
     group_cluster_offsets->view().begin<size_type>(),
-    has_nulls);
+    may_have_empty_clusters);
 
   return {std::move(group_cluster_wl),
           std::move(group_cluster_offsets),
@@ -580,7 +582,7 @@ std::unique_ptr<column> build_output_column(size_type num_rows,
                                             std::unique_ptr<column>&& offsets,
                                             std::unique_ptr<column>&& min_col,
                                             std::unique_ptr<column>&& max_col,
-                                            bool has_nulls,
+                                            bool may_have_empty_clusters,
                                             rmm::cuda_stream_view stream,
                                             rmm::device_async_resource_ref mr)
 {
@@ -595,7 +597,7 @@ std::unique_ptr<column> build_output_column(size_type num_rows,
                           size_type i) { return is_stub_weight(offsets[i]) ? 1 : 0; };
 
   size_type const num_stubs = [&]() {
-    if (!has_nulls) { return 0; }
+    if (!may_have_empty_clusters) { return 0; }
     auto iter = cudf::detail::make_counting_transform_iterator(
       0, cuda::proclaim_return_type<size_type>(is_stub_digest));
     return thrust::reduce(rmm::exec_policy(stream), iter, iter + num_rows);
@@ -661,6 +663,10 @@ std::unique_ptr<column> build_output_column(size_type num_rows,
                                                     mr);
 }
 
+/**
+ * @brief A functor which returns the cluster index within a group that the value at
+ * the given value index falls into.
+ */
 template <typename CumulativeWeight>
 struct compute_tdigests_keys_fn {
   int const delta;
@@ -706,8 +712,8 @@ struct compute_tdigests_keys_fn {
  * boundaries.
  *
  * @param delta              tdigest compression level
- * @param values_begin       Beginning of the range of input values.
- * @param values_end         End of the range of input values.
+ * @param centroids_begin    Beginning of the range of centroids.
+ * @param centroids_end      End of the range of centroids.
  * @param cumulative_weight  Functor which returns cumulative weight and group information for
  * an absolute input value index.
  * @param min_col            Column containing the minimum value per group.
@@ -715,7 +721,8 @@ struct compute_tdigests_keys_fn {
  * @param group_cluster_wl   Cluster weight limits for each group.
  * @param group_cluster_offsets R-value reference of offsets into the cluster weight limits.
  * @param total_clusters     Total number of clusters in all groups.
- * @param has_nulls          Whether or not the input contains nulls
+ * @param may_have_empty_clusters Whether or not there could be empty clusters. It should be
+ * set to false only when there is no empty cluster.
  * @param stream CUDA stream used for device memory operations and kernel launches.
  * @param mr Device memory resource used to allocate the returned column's device memory
  *
@@ -731,7 +738,7 @@ std::unique_ptr<column> compute_tdigests(int delta,
                                          rmm::device_uvector<double> const& group_cluster_wl,
                                          std::unique_ptr<column>&& group_cluster_offsets,
                                          size_type total_clusters,
-                                         bool has_nulls,
+                                         bool may_have_empty_clusters,
                                          rmm::cuda_stream_view stream,
                                          rmm::device_async_resource_ref mr)
 {
@@ -750,7 +757,9 @@ std::unique_ptr<column> compute_tdigests(int delta,
   //   double       // max
   // }
   //
-  if (total_clusters == 0) { return cudf::tdigest::detail::make_empty_tdigest_column(stream, mr); }
+  if (total_clusters == 0) {
+    return cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(1, stream, mr);
+  }
 
   // each input group represents an individual tdigest.  within each tdigest, we want the keys
   // to represent cluster indices (for example, if a tdigest had 100 clusters, the keys should fall
@@ -793,7 +802,7 @@ std::unique_ptr<column> compute_tdigests(int delta,
                              std::move(group_cluster_offsets),
                              std::move(min_col),
                              std::move(max_col),
-                             has_nulls,
+                             may_have_empty_clusters,
                              stream,
                              mr);
 }
@@ -1145,8 +1154,13 @@ std::unique_ptr<column> merge_tdigests(tdigest_column_view const& tdv,
   auto merged =
     cudf::detail::concatenate(tdigest_views, stream, cudf::get_current_device_resource_ref());
 
+  auto merged_weights = merged->get_column(1).view();
+  // If there are no values, we can simply return a column that has only empty tdigests.
+  if (merged_weights.size() == 0) {
+    return cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(num_groups, stream, mr);
+  }
+
   // generate cumulative weights
-  auto merged_weights     = merged->get_column(1).view();
   auto cumulative_weights = cudf::make_numeric_column(
     data_type{type_id::FLOAT64}, merged_weights.size(), mask_state::UNALLOCATED, stream);
   auto keys = cudf::detail::make_counting_transform_iterator(
@@ -1161,6 +1175,10 @@ std::unique_ptr<column> merge_tdigests(tdigest_column_view const& tdv,
 
   auto const delta = max_centroids;
 
+  // We do not know whether there is any empty cluster in the input without actually reading the
+  // data, which could be expensive. So, we just assume that there could be empty clusters.
+  auto const may_have_empty_clusters = true;
+
   // generate cluster info
   auto [group_cluster_wl, group_cluster_offsets, total_clusters] = generate_group_cluster_info(
     delta,
@@ -1177,7 +1195,7 @@ std::unique_ptr<column> merge_tdigests(tdigest_column_view const& tdv,
       group_labels,
       group_offsets,
       {tdigest_offsets.begin<size_type>(), static_cast<size_t>(tdigest_offsets.size())}},
-    false,
+    may_have_empty_clusters,
     stream,
     mr);
 
@@ -1202,7 +1220,7 @@ std::unique_ptr<column> merge_tdigests(tdigest_column_view const& tdv,
     group_cluster_wl,
     std::move(group_cluster_offsets),
     total_clusters,
-    false,
+    may_have_empty_clusters,
     stream,
     mr);
 }
@@ -1267,7 +1285,9 @@ std::unique_ptr<column> group_tdigest(column_view const& col,
                                       rmm::cuda_stream_view stream,
                                       rmm::device_async_resource_ref mr)
 {
-  if (col.size() == 0) { return cudf::tdigest::detail::make_empty_tdigest_column(stream, mr); }
+  if (col.size() == 0) {
+    return cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(1, stream, mr);
+  }
 
   auto const delta = max_centroids;
   return cudf::type_dispatcher(col.type(),
@@ -1293,7 +1313,7 @@ std::unique_ptr<column> group_merge_tdigest(column_view const& input,
   tdigest_column_view tdv(input);
 
   if (num_groups == 0 || input.size() == 0) {
-    return cudf::tdigest::detail::make_empty_tdigest_column(stream, mr);
+    return cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(1, stream, mr);
   }
 
   // bring group offsets back to the host
diff --git a/cpp/tests/groupby/tdigest_tests.cu b/cpp/tests/groupby/tdigest_tests.cu
index baa59026b07..3780dbb1d95 100644
--- a/cpp/tests/groupby/tdigest_tests.cu
+++ b/cpp/tests/groupby/tdigest_tests.cu
@@ -469,16 +469,16 @@ TEST_F(TDigestMergeTest, EmptyGroups)
   cudf::test::fixed_width_column_wrapper<int> keys{0, 0, 0, 0, 0, 0, 0};
   int const delta = 1000;
 
-  auto a = cudf::tdigest::detail::make_empty_tdigest_column(
-    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  auto a = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(
+    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto b = cudf::type_dispatcher(
     static_cast<cudf::column_view>(values_b).type(), tdigest_gen_grouped{}, keys, values_b, delta);
-  auto c = cudf::tdigest::detail::make_empty_tdigest_column(
-    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  auto c = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(
+    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto d = cudf::type_dispatcher(
     static_cast<cudf::column_view>(values_d).type(), tdigest_gen_grouped{}, keys, values_d, delta);
-  auto e = cudf::tdigest::detail::make_empty_tdigest_column(
-    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  auto e = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(
+    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
 
   std::vector<cudf::column_view> cols;
   cols.push_back(*a);
@@ -507,3 +507,81 @@ TEST_F(TDigestMergeTest, EmptyGroups)
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *result.second[0].results[0]);
 }
+
+std::unique_ptr<cudf::table> do_agg(
+  cudf::column_view key,
+  cudf::column_view val,
+  std::function<std::unique_ptr<cudf::groupby_aggregation>()> make_agg)
+{
+  std::vector<cudf::column_view> keys;
+  keys.push_back(key);
+  cudf::table_view const key_table(keys);
+
+  cudf::groupby::groupby gb(key_table);
+  std::vector<cudf::groupby::aggregation_request> requests;
+  cudf::groupby::aggregation_request req;
+  req.values = val;
+  req.aggregations.push_back(make_agg());
+  requests.push_back(std::move(req));
+
+  auto result = gb.aggregate(std::move(requests));
+
+  std::vector<std::unique_ptr<cudf::column>> result_columns;
+  for (auto&& c : result.first->release()) {
+    result_columns.push_back(std::move(c));
+  }
+
+  EXPECT_EQ(result.second.size(), 1);
+  EXPECT_EQ(result.second[0].results.size(), 1);
+  result_columns.push_back(std::move(result.second[0].results[0]));
+
+  return std::make_unique<cudf::table>(std::move(result_columns));
+}
+
+TEST_F(TDigestMergeTest, AllGroupsHaveEmptyClusters)
+{
+  // The input must be sorted by the key.
+  // See `aggregate_result_functor::operator()<aggregation::TDIGEST>` for details.
+  auto const keys      = cudf::test::fixed_width_column_wrapper<int32_t>{{0, 0, 1, 1, 2}};
+  auto const keys_view = cudf::column_view(keys);
+  auto val_elems  = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i; });
+  auto val_valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) {
+    // All values are null
+    return false;
+  });
+  auto const vals = cudf::test::fixed_width_column_wrapper<int32_t>{
+    val_elems, val_elems + keys_view.size(), val_valids};
+
+  auto const delta = 10000;
+
+  // Compute tdigest. The result should have 3 empty clusters, one per group.
+  auto const compute_result = do_agg(keys_view, cudf::column_view(vals), [&delta]() {
+    return cudf::make_tdigest_aggregation<cudf::groupby_aggregation>(delta);
+  });
+
+  auto const expected_computed_keys = cudf::test::fixed_width_column_wrapper<int32_t>{{0, 1, 2}};
+  cudf::column_view const expected_computed_keys_view{expected_computed_keys};
+  auto const expected_computed_vals = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(
+    expected_computed_keys_view.size(),
+    cudf::get_default_stream(),
+    rmm::mr::get_current_device_resource());
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_computed_keys_view, compute_result->get_column(0).view());
+  // The computed values are nullable even though the input values are not.
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_computed_vals->view(),
+                                 compute_result->get_column(1).view());
+
+  // Merge tdigest. The result should have 3 empty clusters, one per group.
+  auto const merge_result =
+    do_agg(compute_result->get_column(0).view(), compute_result->get_column(1).view(), [&delta]() {
+      return cudf::make_merge_tdigest_aggregation<cudf::groupby_aggregation>(delta);
+    });
+
+  auto const expected_merged_keys = cudf::test::fixed_width_column_wrapper<int32_t>{{0, 1, 2}};
+  cudf::column_view const expected_merged_keys_view{expected_merged_keys};
+  auto const expected_merged_vals = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(
+    expected_merged_keys_view.size(),
+    cudf::get_default_stream(),
+    rmm::mr::get_current_device_resource());
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_merged_keys_view, merge_result->get_column(0).view());
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_merged_vals->view(), merge_result->get_column(1).view());
+}
diff --git a/cpp/tests/quantiles/percentile_approx_test.cpp b/cpp/tests/quantiles/percentile_approx_test.cpp
index 915717713df..7359f0406fc 100644
--- a/cpp/tests/quantiles/percentile_approx_test.cpp
+++ b/cpp/tests/quantiles/percentile_approx_test.cpp
@@ -371,8 +371,8 @@ struct PercentileApproxTest : public cudf::test::BaseFixture {};
 
 TEST_F(PercentileApproxTest, EmptyInput)
 {
-  auto empty_ = cudf::tdigest::detail::make_empty_tdigest_column(
-    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  auto empty_ = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(
+    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   cudf::test::fixed_width_column_wrapper<double> percentiles{0.0, 0.25, 0.3};
 
   std::vector<cudf::column_view> input;

From 2cc6e0dd3dff6645943934eb23805cf164a73caa Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Tue, 10 Sep 2024 23:02:35 +0000
Subject: [PATCH 795/842] test

---
 python/cudf/cudf/pandas/scripts/run-pandas-tests.sh | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
index a7aeca0c58c..74f52dee4fc 100755
--- a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
+++ b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
@@ -146,6 +146,9 @@ PANDAS_CI="1" timeout 600m python -m pytest -p cudf.pandas \
 
 mv *.json ..
 ls -al
+ls -al tests/
 cd ..
-
+ls -al
+ls -al pandas-testing/
+ls -al pandas-testing/pandas-tests/
 rm -rf pandas-testing/pandas-tests/

From c3d323df1df5ba4c5377374b5b4ffdc06829c02b Mon Sep 17 00:00:00 2001
From: Jayjeet Chakraborty <jc.github@rediffmail.com>
Date: Tue, 10 Sep 2024 17:14:28 -0700
Subject: [PATCH 796/842] Move NDS-H examples into benchmarks (#16663)

Moving the TPC-H examples into benchmarks by converting each of them into NVBench's. The benchmarks can be built by

```bash
./build.sh libcudf benchmarks
```

Also, addresses #16711

Authors:
  - Jayjeet Chakraborty (https://github.com/JayjeetAtGithub)

Approvers:
  - Mark Harris (https://github.com/harrism)

URL: https://github.com/rapidsai/cudf/pull/16663
---
 .gitignore                                    |   1 -
 cpp/benchmarks/CMakeLists.txt                 |  26 +-
 .../ndsh_data_generator.cpp}                  |  66 +--
 .../ndsh_data_generator.hpp}                  |   0
 .../random_column_generator.cu                |   0
 .../random_column_generator.hpp               |   0
 .../table_helpers.cpp                         |   0
 .../table_helpers.hpp                         |   0
 cpp/benchmarks/ndsh/README.md                 |  11 +
 .../tpch/q1.cpp => benchmarks/ndsh/q01.cpp}   |  49 +-
 .../tpch/q5.cpp => benchmarks/ndsh/q05.cpp}   |  56 ++-
 .../tpch/q6.cpp => benchmarks/ndsh/q06.cpp}   |  52 +-
 .../tpch/q9.cpp => benchmarks/ndsh/q09.cpp}   |  62 +--
 .../tpch => benchmarks/ndsh}/q10.cpp          |  51 +-
 cpp/benchmarks/ndsh/utilities.cpp             | 400 +++++++++++++++
 cpp/benchmarks/ndsh/utilities.hpp             | 227 +++++++++
 cpp/examples/build.sh                         |   1 -
 cpp/examples/tpch/CMakeLists.txt              |  36 --
 cpp/examples/tpch/README.md                   |  39 --
 .../tpch/datagen/correct_datatypes.py         |  60 ---
 cpp/examples/tpch/datagen/datagen.sh          |  31 --
 cpp/examples/tpch/datagen/tpch.patch          |  33 --
 cpp/examples/tpch/utils.hpp                   | 458 ------------------
 23 files changed, 846 insertions(+), 813 deletions(-)
 rename cpp/benchmarks/common/{tpch_data_generator/tpch_data_generator.cpp => ndsh_data_generator/ndsh_data_generator.cpp} (97%)
 rename cpp/benchmarks/common/{tpch_data_generator/tpch_data_generator.hpp => ndsh_data_generator/ndsh_data_generator.hpp} (100%)
 rename cpp/benchmarks/common/{tpch_data_generator => ndsh_data_generator}/random_column_generator.cu (100%)
 rename cpp/benchmarks/common/{tpch_data_generator => ndsh_data_generator}/random_column_generator.hpp (100%)
 rename cpp/benchmarks/common/{tpch_data_generator => ndsh_data_generator}/table_helpers.cpp (100%)
 rename cpp/benchmarks/common/{tpch_data_generator => ndsh_data_generator}/table_helpers.hpp (100%)
 create mode 100644 cpp/benchmarks/ndsh/README.md
 rename cpp/{examples/tpch/q1.cpp => benchmarks/ndsh/q01.cpp} (82%)
 rename cpp/{examples/tpch/q5.cpp => benchmarks/ndsh/q05.cpp} (80%)
 rename cpp/{examples/tpch/q6.cpp => benchmarks/ndsh/q06.cpp} (79%)
 rename cpp/{examples/tpch/q9.cpp => benchmarks/ndsh/q09.cpp} (78%)
 rename cpp/{examples/tpch => benchmarks/ndsh}/q10.cpp (81%)
 create mode 100644 cpp/benchmarks/ndsh/utilities.cpp
 create mode 100644 cpp/benchmarks/ndsh/utilities.hpp
 delete mode 100644 cpp/examples/tpch/CMakeLists.txt
 delete mode 100644 cpp/examples/tpch/README.md
 delete mode 100644 cpp/examples/tpch/datagen/correct_datatypes.py
 delete mode 100755 cpp/examples/tpch/datagen/datagen.sh
 delete mode 100644 cpp/examples/tpch/datagen/tpch.patch
 delete mode 100644 cpp/examples/tpch/utils.hpp

diff --git a/.gitignore b/.gitignore
index 619e1464b2a..180a6a286e2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -80,7 +80,6 @@ build/
 cpp/build/
 cpp/examples/*/install/
 cpp/examples/*/build/
-cpp/examples/tpch/datagen/datafusion
 cpp/include/cudf/ipc_generated/*.h
 cpp/thirdparty/googletest/
 
diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index d2c22b788cb..3bf9d02b384 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -36,25 +36,25 @@ target_include_directories(
 )
 
 add_library(
-  tpch_data_generator STATIC
-  common/tpch_data_generator/tpch_data_generator.cpp common/tpch_data_generator/table_helpers.cpp
-  common/tpch_data_generator/random_column_generator.cu
+  ndsh_data_generator STATIC
+  common/ndsh_data_generator/ndsh_data_generator.cpp common/ndsh_data_generator/table_helpers.cpp
+  common/ndsh_data_generator/random_column_generator.cu
 )
-target_compile_features(tpch_data_generator PUBLIC cxx_std_17 cuda_std_17)
+target_compile_features(ndsh_data_generator PUBLIC cxx_std_17 cuda_std_17)
 
 target_compile_options(
-  tpch_data_generator PUBLIC "$<$<COMPILE_LANGUAGE:CXX>:${CUDF_CXX_FLAGS}>"
+  ndsh_data_generator PUBLIC "$<$<COMPILE_LANGUAGE:CXX>:${CUDF_CXX_FLAGS}>"
                              "$<$<COMPILE_LANGUAGE:CUDA>:${CUDF_CUDA_FLAGS}>"
 )
 
 target_link_libraries(
-  tpch_data_generator
+  ndsh_data_generator
   PUBLIC cudf cudftestutil nvtx3::nvtx3-cpp
   PRIVATE $<TARGET_NAME_IF_EXISTS:conda_env>
 )
 
 target_include_directories(
-  tpch_data_generator
+  ndsh_data_generator
   PUBLIC "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>" "$<BUILD_INTERFACE:${CUDF_SOURCE_DIR}>"
          "$<BUILD_INTERFACE:${CUDF_SOURCE_DIR}/src>"
 )
@@ -127,8 +127,8 @@ function(ConfigureNVBench CMAKE_BENCH_NAME)
                INSTALL_RPATH "\$ORIGIN/../../../lib"
   )
   target_link_libraries(
-    ${CMAKE_BENCH_NAME} PRIVATE cudf_benchmark_common cudf_datagen nvbench::nvbench
-                                $<TARGET_NAME_IF_EXISTS:conda_env>
+    ${CMAKE_BENCH_NAME} PRIVATE cudf_benchmark_common ndsh_data_generator cudf_datagen
+                                nvbench::nvbench $<TARGET_NAME_IF_EXISTS:conda_env>
   )
   install(
     TARGETS ${CMAKE_BENCH_NAME}
@@ -175,6 +175,14 @@ ConfigureBench(COPY_IF_ELSE_BENCH copying/copy_if_else.cpp)
 # * transpose benchmark ---------------------------------------------------------------------------
 ConfigureBench(TRANSPOSE_BENCH transpose/transpose.cpp)
 
+# ##################################################################################################
+# * nds-h benchmark --------------------------------------------------------------------------------
+ConfigureNVBench(NDSH_Q1 ndsh/q01.cpp ndsh/utilities.cpp)
+ConfigureNVBench(NDSH_Q5 ndsh/q05.cpp ndsh/utilities.cpp)
+ConfigureNVBench(NDSH_Q6 ndsh/q06.cpp ndsh/utilities.cpp)
+ConfigureNVBench(NDSH_Q9 ndsh/q09.cpp ndsh/utilities.cpp)
+ConfigureNVBench(NDSH_Q10 ndsh/q10.cpp ndsh/utilities.cpp)
+
 # ##################################################################################################
 # * stream_compaction benchmark -------------------------------------------------------------------
 ConfigureNVBench(
diff --git a/cpp/benchmarks/common/tpch_data_generator/tpch_data_generator.cpp b/cpp/benchmarks/common/ndsh_data_generator/ndsh_data_generator.cpp
similarity index 97%
rename from cpp/benchmarks/common/tpch_data_generator/tpch_data_generator.cpp
rename to cpp/benchmarks/common/ndsh_data_generator/ndsh_data_generator.cpp
index 236fe8095ad..fa7edd225ba 100644
--- a/cpp/benchmarks/common/tpch_data_generator/tpch_data_generator.cpp
+++ b/cpp/benchmarks/common/ndsh_data_generator/ndsh_data_generator.cpp
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-#include "tpch_data_generator.hpp"
+#include "ndsh_data_generator.hpp"
 
 #include "random_column_generator.hpp"
 #include "table_helpers.hpp"
@@ -435,46 +435,37 @@ std::unique_ptr<cudf::table> generate_lineitem_partial(cudf::table_view const& o
   columns.push_back(std::move(l_quantity));
   columns.push_back(std::move(l_discount));
   columns.push_back(std::move(l_tax));
+  columns.push_back(std::move(l_returnflag));
+  columns.push_back(std::move(l_linestatus));
   columns.push_back(std::move(l_shipdate_ts));
   columns.push_back(std::move(l_commitdate_ts));
   columns.push_back(std::move(l_receiptdate_ts));
-  columns.push_back(std::move(l_returnflag));
-  columns.push_back(std::move(l_linestatus));
   columns.push_back(std::move(l_shipinstruct));
   columns.push_back(std::move(l_shipmode));
   columns.push_back(std::move(l_comment));
   return std::make_unique<cudf::table>(std::move(columns));
 }
 
-std::unique_ptr<cudf::table> generate_orders_dependent(cudf::table_view const& lineitem,
+/**
+ * @brief Generate the part of the `orders` table dependent on the `lineitem` table
+ *
+ * @param lineitem_partial The partially generated `lineitem` table
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ */
+std::unique_ptr<cudf::table> generate_orders_dependent(cudf::table_view const& lineitem_partial,
                                                        rmm::cuda_stream_view stream,
                                                        rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  auto const l_linestatus_mask = lineitem.column(0);
-  auto const l_orderkey        = lineitem.column(1);
-  auto const l_discount        = lineitem.column(6);
-  auto const l_tax             = lineitem.column(7);
-  auto const l_extendedprice   = lineitem.column(16);
+  auto const l_linestatus_mask = lineitem_partial.column(0);
+  auto const l_orderkey        = lineitem_partial.column(1);
+  auto const l_extendedprice   = lineitem_partial.column(6);
+  auto const l_discount        = lineitem_partial.column(7);
+  auto const l_tax             = lineitem_partial.column(8);
 
   std::vector<std::unique_ptr<cudf::column>> orders_dependent_columns;
 
-  // Generate the `o_totalprice` column
-  // We calculate the `charge` column, which is a function of `l_extendedprice`,
-  // `l_tax`, and `l_discount` and then group by `l_orderkey` and sum the `charge`
-  auto const l_charge = calculate_charge(l_extendedprice, l_tax, l_discount, stream, mr);
-  auto o_totalprice   = [&]() {
-    auto const keys = cudf::table_view({l_orderkey});
-    cudf::groupby::groupby gb(keys);
-    std::vector<cudf::groupby::aggregation_request> requests;
-    requests.push_back(cudf::groupby::aggregation_request());
-    requests[0].aggregations.push_back(cudf::make_sum_aggregation<cudf::groupby_aggregation>());
-    requests[0].values = l_charge->view();
-    auto agg_result    = gb.aggregate(requests);
-    return cudf::round(agg_result.second[0].results[0]->view(), 2);
-  }();
-  orders_dependent_columns.push_back(std::move(o_totalprice));
-
   // Generate the `o_orderstatus` column
   auto o_orderstatus = [&]() {
     auto const keys = cudf::table_view({l_orderkey});
@@ -529,6 +520,22 @@ std::unique_ptr<cudf::table> generate_orders_dependent(cudf::table_view const& l
       cudf::string_scalar("P"), o_orderstatus_intermediate->view(), mask_b->view());
   }();
   orders_dependent_columns.push_back(std::move(o_orderstatus));
+
+  // Generate the `o_totalprice` column
+  // We calculate the `charge` column, which is a function of `l_extendedprice`,
+  // `l_tax`, and `l_discount` and then group by `l_orderkey` and sum the `charge`
+  auto const l_charge = calculate_charge(l_extendedprice, l_tax, l_discount, stream, mr);
+  auto o_totalprice   = [&]() {
+    auto const keys = cudf::table_view({l_orderkey});
+    cudf::groupby::groupby gb(keys);
+    std::vector<cudf::groupby::aggregation_request> requests;
+    requests.push_back(cudf::groupby::aggregation_request());
+    requests[0].aggregations.push_back(cudf::make_sum_aggregation<cudf::groupby_aggregation>());
+    requests[0].values = l_charge->view();
+    auto agg_result    = gb.aggregate(requests);
+    return cudf::round(agg_result.second[0].results[0]->view(), 2);
+  }();
+  orders_dependent_columns.push_back(std::move(o_totalprice));
   return std::make_unique<cudf::table>(std::move(orders_dependent_columns));
 }
 
@@ -730,9 +737,7 @@ generate_orders_lineitem_part(double scale_factor,
   // Generate the `part` table
   auto part = generate_part(scale_factor, stream, mr);
 
-  // Join the `part` and partial `lineitem` tables, then calculate the `l_extendedprice` column,
-  // add the column to the `lineitem` table, and write the `lineitem` table to a parquet file
-
+  // Join the `part` and partial `lineitem` tables, then calculate the `l_extendedprice` column
   auto l_extendedprice = [&]() {
     auto const left = cudf::table_view(
       {lineitem_partial->get_column(2).view(), lineitem_partial->get_column(5).view()});
@@ -752,8 +757,9 @@ generate_orders_lineitem_part(double scale_factor,
     return cudf::round(col->view(), 2);
   }();
 
+  // Insert the `l_extendedprice` column into the partial columns of the `lineitem` table
   auto lineitem_partial_columns = lineitem_partial->release();
-  lineitem_partial_columns.push_back(std::move(l_extendedprice));
+  lineitem_partial_columns.insert(lineitem_partial_columns.begin() + 6, std::move(l_extendedprice));
   auto lineitem_temp = std::make_unique<cudf::table>(std::move(lineitem_partial_columns));
 
   // Generate the dependent columns of the `orders` table
@@ -762,7 +768,7 @@ generate_orders_lineitem_part(double scale_factor,
 
   auto orders_independent_columns = orders_independent->release();
   auto orders_dependent_columns   = orders_dependent->release();
-  orders_independent_columns.insert(orders_independent_columns.end(),
+  orders_independent_columns.insert(orders_independent_columns.begin() + 2,
                                     std::make_move_iterator(orders_dependent_columns.begin()),
                                     std::make_move_iterator(orders_dependent_columns.end()));
 
diff --git a/cpp/benchmarks/common/tpch_data_generator/tpch_data_generator.hpp b/cpp/benchmarks/common/ndsh_data_generator/ndsh_data_generator.hpp
similarity index 100%
rename from cpp/benchmarks/common/tpch_data_generator/tpch_data_generator.hpp
rename to cpp/benchmarks/common/ndsh_data_generator/ndsh_data_generator.hpp
diff --git a/cpp/benchmarks/common/tpch_data_generator/random_column_generator.cu b/cpp/benchmarks/common/ndsh_data_generator/random_column_generator.cu
similarity index 100%
rename from cpp/benchmarks/common/tpch_data_generator/random_column_generator.cu
rename to cpp/benchmarks/common/ndsh_data_generator/random_column_generator.cu
diff --git a/cpp/benchmarks/common/tpch_data_generator/random_column_generator.hpp b/cpp/benchmarks/common/ndsh_data_generator/random_column_generator.hpp
similarity index 100%
rename from cpp/benchmarks/common/tpch_data_generator/random_column_generator.hpp
rename to cpp/benchmarks/common/ndsh_data_generator/random_column_generator.hpp
diff --git a/cpp/benchmarks/common/tpch_data_generator/table_helpers.cpp b/cpp/benchmarks/common/ndsh_data_generator/table_helpers.cpp
similarity index 100%
rename from cpp/benchmarks/common/tpch_data_generator/table_helpers.cpp
rename to cpp/benchmarks/common/ndsh_data_generator/table_helpers.cpp
diff --git a/cpp/benchmarks/common/tpch_data_generator/table_helpers.hpp b/cpp/benchmarks/common/ndsh_data_generator/table_helpers.hpp
similarity index 100%
rename from cpp/benchmarks/common/tpch_data_generator/table_helpers.hpp
rename to cpp/benchmarks/common/ndsh_data_generator/table_helpers.hpp
diff --git a/cpp/benchmarks/ndsh/README.md b/cpp/benchmarks/ndsh/README.md
new file mode 100644
index 00000000000..0a462e1684e
--- /dev/null
+++ b/cpp/benchmarks/ndsh/README.md
@@ -0,0 +1,11 @@
+# NDS-H Benchmarks for `libcudf`
+
+## Disclaimer
+
+NDS-H is derived from the TPC-H Benchmarks and as such any results obtained using NDS-H are not
+comparable to published TPC-H Benchmark results, as the results obtained from using NDS-H do not
+comply with the TPC-H Benchmarks.
+
+## Current Status
+
+For now, only Q1, Q5, Q6, Q9, and Q10 have been implemented
diff --git a/cpp/examples/tpch/q1.cpp b/cpp/benchmarks/ndsh/q01.cpp
similarity index 82%
rename from cpp/examples/tpch/q1.cpp
rename to cpp/benchmarks/ndsh/q01.cpp
index 87b7e613766..ef709926ae9 100644
--- a/cpp/examples/tpch/q1.cpp
+++ b/cpp/benchmarks/ndsh/q01.cpp
@@ -14,17 +14,19 @@
  * limitations under the License.
  */
 
-#include "../utilities/timer.hpp"
-#include "utils.hpp"
+#include "utilities.hpp"
 
 #include <cudf/ast/expressions.hpp>
+#include <cudf/binaryop.hpp>
 #include <cudf/column/column.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/utilities/memory_resource.hpp>
 
+#include <nvbench/nvbench.cuh>
+
 /**
- * @file q1.cpp
- * @brief Implement query 1 of the TPC-H benchmark.
+ * @file q01.cpp
+ * @brief Implement query 1 of the NDS-H benchmark.
  *
  * create view lineitem as select * from '/tables/scale-1/lineitem.parquet';
  *
@@ -59,7 +61,7 @@
  * @param stream The CUDA stream used for device memory operations and kernel launches.
  * @param mr Device memory resource used to allocate the returned column's device memory.
  */
-[[nodiscard]] std::unique_ptr<cudf::column> calc_disc_price(
+[[nodiscard]] std::unique_ptr<cudf::column> calculate_disc_price(
   cudf::column_view const& discount,
   cudf::column_view const& extendedprice,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
@@ -86,7 +88,7 @@
  * @param stream The CUDA stream used for device memory operations and kernel launches.
  * @param mr Device memory resource used to allocate the returned column's device memory.
  */
-[[nodiscard]] std::unique_ptr<cudf::column> calc_charge(
+[[nodiscard]] std::unique_ptr<cudf::column> calculate_charge(
   cudf::column_view const& tax,
   cudf::column_view const& disc_price,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
@@ -101,16 +103,9 @@
   return charge;
 }
 
-int main(int argc, char const** argv)
+void run_ndsh_q1(nvbench::state& state,
+                 std::unordered_map<std::string, parquet_device_buffer>& sources)
 {
-  auto const args = parse_args(argc, argv);
-
-  // Use a memory pool
-  auto resource = create_memory_resource(args.memory_resource_type);
-  cudf::set_current_device_resource(resource.get());
-
-  cudf::examples::timer timer;
-
   // Define the column projections and filter predicate for `lineitem` table
   std::vector<std::string> const lineitem_cols = {"l_returnflag",
                                                   "l_linestatus",
@@ -130,12 +125,12 @@ int main(int argc, char const** argv)
 
   // Read out the `lineitem` table from parquet file
   auto lineitem =
-    read_parquet(args.dataset_dir + "/lineitem.parquet", lineitem_cols, std::move(lineitem_pred));
+    read_parquet(sources["lineitem"].make_source_info(), lineitem_cols, std::move(lineitem_pred));
 
   // Calculate the discount price and charge columns and append to lineitem table
   auto disc_price =
-    calc_disc_price(lineitem->column("l_discount"), lineitem->column("l_extendedprice"));
-  auto charge = calc_charge(lineitem->column("l_tax"), disc_price->view());
+    calculate_disc_price(lineitem->column("l_discount"), lineitem->column("l_extendedprice"));
+  auto charge = calculate_charge(lineitem->column("l_tax"), disc_price->view());
   (*lineitem).append(disc_price, "disc_price").append(charge, "charge");
 
   // Perform the group by operation
@@ -167,9 +162,21 @@ int main(int argc, char const** argv)
                                              {"l_returnflag", "l_linestatus"},
                                              {cudf::order::ASCENDING, cudf::order::ASCENDING});
 
-  timer.print_elapsed_millis();
-
   // Write query result to a parquet file
   orderedby_table->to_parquet("q1.parquet");
-  return 0;
 }
+
+void ndsh_q1(nvbench::state& state)
+{
+  // Generate the required parquet files in device buffers
+  double const scale_factor = state.get_float64("scale_factor");
+  std::unordered_map<std::string, parquet_device_buffer> sources;
+  generate_parquet_data_sources(scale_factor, {"lineitem"}, sources);
+
+  auto stream = cudf::get_default_stream();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
+  state.exec(nvbench::exec_tag::sync,
+             [&](nvbench::launch& launch) { run_ndsh_q1(state, sources); });
+}
+
+NVBENCH_BENCH(ndsh_q1).set_name("ndsh_q1").add_float64_axis("scale_factor", {0.01, 0.1, 1});
diff --git a/cpp/examples/tpch/q5.cpp b/cpp/benchmarks/ndsh/q05.cpp
similarity index 80%
rename from cpp/examples/tpch/q5.cpp
rename to cpp/benchmarks/ndsh/q05.cpp
index 12c186db10e..522bc4789c2 100644
--- a/cpp/examples/tpch/q5.cpp
+++ b/cpp/benchmarks/ndsh/q05.cpp
@@ -14,17 +14,19 @@
  * limitations under the License.
  */
 
-#include "../utilities/timer.hpp"
-#include "utils.hpp"
+#include "utilities.hpp"
 
 #include <cudf/ast/expressions.hpp>
+#include <cudf/binaryop.hpp>
 #include <cudf/column/column.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/utilities/memory_resource.hpp>
 
+#include <nvbench/nvbench.cuh>
+
 /**
- * @file q5.cpp
- * @brief Implement query 5 of the TPC-H benchmark.
+ * @file q05.cpp
+ * @brief Implement query 5 of the NDS-H benchmark.
  *
  * create view customer as select * from '/tables/scale-1/customer.parquet';
  * create view orders as select * from '/tables/scale-1/orders.parquet';
@@ -67,7 +69,7 @@
  * @param stream The CUDA stream used for device memory operations and kernel launches.
  * @param mr Device memory resource used to allocate the returned column's device memory.
  */
-[[nodiscard]] std::unique_ptr<cudf::column> calc_revenue(
+[[nodiscard]] std::unique_ptr<cudf::column> calculate_revenue(
   cudf::column_view const& extendedprice,
   cudf::column_view const& discount,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
@@ -86,16 +88,9 @@
   return revenue;
 }
 
-int main(int argc, char const** argv)
+void run_ndsh_q5(nvbench::state& state,
+                 std::unordered_map<std::string, parquet_device_buffer>& sources)
 {
-  auto const args = parse_args(argc, argv);
-
-  // Use a memory pool
-  auto resource = create_memory_resource(args.memory_resource_type);
-  cudf::set_current_device_resource(resource.get());
-
-  cudf::examples::timer timer;
-
   // Define the column projection and filter predicate for the `orders` table
   std::vector<std::string> const orders_cols = {"o_custkey", "o_orderkey", "o_orderdate"};
   auto const o_orderdate_ref                 = cudf::ast::column_reference(std::distance(
@@ -125,17 +120,17 @@ int main(int argc, char const** argv)
   // Read out the tables from parquet files
   // while pushing down the column projections and filter predicates
   auto const customer =
-    read_parquet(args.dataset_dir + "/customer.parquet", {"c_custkey", "c_nationkey"});
+    read_parquet(sources["customer"].make_source_info(), {"c_custkey", "c_nationkey"});
   auto const orders =
-    read_parquet(args.dataset_dir + "/orders.parquet", orders_cols, std::move(orders_pred));
-  auto const lineitem = read_parquet(args.dataset_dir + "/lineitem.parquet",
+    read_parquet(sources["orders"].make_source_info(), orders_cols, std::move(orders_pred));
+  auto const lineitem = read_parquet(sources["lineitem"].make_source_info(),
                                      {"l_orderkey", "l_suppkey", "l_extendedprice", "l_discount"});
   auto const supplier =
-    read_parquet(args.dataset_dir + "/supplier.parquet", {"s_suppkey", "s_nationkey"});
+    read_parquet(sources["supplier"].make_source_info(), {"s_suppkey", "s_nationkey"});
   auto const nation =
-    read_parquet(args.dataset_dir + "/nation.parquet", {"n_nationkey", "n_regionkey", "n_name"});
+    read_parquet(sources["nation"].make_source_info(), {"n_nationkey", "n_regionkey", "n_name"});
   auto const region =
-    read_parquet(args.dataset_dir + "/region.parquet", region_cols, std::move(region_pred));
+    read_parquet(sources["region"].make_source_info(), region_cols, std::move(region_pred));
 
   // Perform the joins
   auto const join_a = apply_inner_join(region, nation, {"r_regionkey"}, {"n_regionkey"});
@@ -147,7 +142,7 @@ int main(int argc, char const** argv)
 
   // Calculate and append the `revenue` column
   auto revenue =
-    calc_revenue(joined_table->column("l_extendedprice"), joined_table->column("l_discount"));
+    calculate_revenue(joined_table->column("l_extendedprice"), joined_table->column("l_discount"));
   (*joined_table).append(revenue, "revenue");
 
   // Perform the groupby operation
@@ -162,9 +157,22 @@ int main(int argc, char const** argv)
   auto const orderedby_table =
     apply_orderby(groupedby_table, {"revenue"}, {cudf::order::DESCENDING});
 
-  timer.print_elapsed_millis();
-
   // Write query result to a parquet file
   orderedby_table->to_parquet("q5.parquet");
-  return 0;
 }
+
+void ndsh_q5(nvbench::state& state)
+{
+  // Generate the required parquet files in device buffers
+  double const scale_factor = state.get_float64("scale_factor");
+  std::unordered_map<std::string, parquet_device_buffer> sources;
+  generate_parquet_data_sources(
+    scale_factor, {"customer", "orders", "lineitem", "supplier", "nation", "region"}, sources);
+
+  auto stream = cudf::get_default_stream();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
+  state.exec(nvbench::exec_tag::sync,
+             [&](nvbench::launch& launch) { run_ndsh_q5(state, sources); });
+}
+
+NVBENCH_BENCH(ndsh_q5).set_name("ndsh_q5").add_float64_axis("scale_factor", {0.01, 0.1, 1});
diff --git a/cpp/examples/tpch/q6.cpp b/cpp/benchmarks/ndsh/q06.cpp
similarity index 79%
rename from cpp/examples/tpch/q6.cpp
rename to cpp/benchmarks/ndsh/q06.cpp
index 92dac40c768..04078547973 100644
--- a/cpp/examples/tpch/q6.cpp
+++ b/cpp/benchmarks/ndsh/q06.cpp
@@ -14,17 +14,20 @@
  * limitations under the License.
  */
 
-#include "../utilities/timer.hpp"
-#include "utils.hpp"
+#include "utilities.hpp"
 
 #include <cudf/ast/expressions.hpp>
+#include <cudf/binaryop.hpp>
 #include <cudf/column/column.hpp>
 #include <cudf/scalar/scalar.hpp>
+#include <cudf/unary.hpp>
 #include <cudf/utilities/memory_resource.hpp>
 
+#include <nvbench/nvbench.cuh>
+
 /**
- * @file q6.cpp
- * @brief Implement query 6 of the TPC-H benchmark.
+ * @file q06.cpp
+ * @brief Implement query 6 of the NDS-H benchmark.
  *
  * create view lineitem as select * from '/tables/scale-1/lineitem.parquet';
  *
@@ -48,7 +51,7 @@
  * @param stream The CUDA stream used for device memory operations and kernel launches.
  * @param mr Device memory resource used to allocate the returned column's device memory.
  */
-[[nodiscard]] std::unique_ptr<cudf::column> calc_revenue(
+[[nodiscard]] std::unique_ptr<cudf::column> calculate_revenue(
   cudf::column_view const& extendedprice,
   cudf::column_view const& discount,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
@@ -60,16 +63,9 @@
   return revenue;
 }
 
-int main(int argc, char const** argv)
+void run_ndsh_q6(nvbench::state& state,
+                 std::unordered_map<std::string, parquet_device_buffer>& sources)
 {
-  auto const args = parse_args(argc, argv);
-
-  // Use a memory pool
-  auto resource = create_memory_resource(args.memory_resource_type);
-  cudf::set_current_device_resource(resource.get());
-
-  cudf::examples::timer timer;
-
   // Read out the `lineitem` table from parquet file
   std::vector<std::string> const lineitem_cols = {
     "l_extendedprice", "l_discount", "l_shipdate", "l_quantity"};
@@ -88,7 +84,7 @@ int main(int argc, char const** argv)
   auto const lineitem_pred = std::make_unique<cudf::ast::operation>(
     cudf::ast::ast_operator::LOGICAL_AND, shipdate_pred_a, shipdate_pred_b);
   auto lineitem =
-    read_parquet(args.dataset_dir + "/lineitem.parquet", lineitem_cols, std::move(lineitem_pred));
+    read_parquet(sources["lineitem"].make_source_info(), lineitem_cols, std::move(lineitem_pred));
 
   // Cast the discount and quantity columns to float32 and append to lineitem table
   auto discout_float =
@@ -99,8 +95,8 @@ int main(int argc, char const** argv)
   (*lineitem).append(discout_float, "l_discount_float").append(quantity_float, "l_quantity_float");
 
   // Apply the filters
-  auto const discount_ref = cudf::ast::column_reference(lineitem->col_id("l_discount_float"));
-  auto const quantity_ref = cudf::ast::column_reference(lineitem->col_id("l_quantity_float"));
+  auto const discount_ref = cudf::ast::column_reference(lineitem->column_id("l_discount_float"));
+  auto const quantity_ref = cudf::ast::column_reference(lineitem->column_id("l_quantity_float"));
 
   auto discount_lower               = cudf::numeric_scalar<float_t>(0.05);
   auto const discount_lower_literal = cudf::ast::literal(discount_lower);
@@ -123,16 +119,28 @@ int main(int argc, char const** argv)
   auto const filtered_table = apply_filter(lineitem, discount_quantity_pred);
 
   // Calculate the `revenue` column
-  auto revenue =
-    calc_revenue(filtered_table->column("l_extendedprice"), filtered_table->column("l_discount"));
+  auto revenue = calculate_revenue(filtered_table->column("l_extendedprice"),
+                                   filtered_table->column("l_discount"));
 
   // Sum the `revenue` column
   auto const revenue_view = revenue->view();
   auto const result_table = apply_reduction(revenue_view, cudf::aggregation::Kind::SUM, "revenue");
 
-  timer.print_elapsed_millis();
-
   // Write query result to a parquet file
   result_table->to_parquet("q6.parquet");
-  return 0;
 }
+
+void ndsh_q6(nvbench::state& state)
+{
+  // Generate the required parquet files in device buffers
+  double const scale_factor = state.get_float64("scale_factor");
+  std::unordered_map<std::string, parquet_device_buffer> sources;
+  generate_parquet_data_sources(scale_factor, {"lineitem"}, sources);
+
+  auto stream = cudf::get_default_stream();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
+  state.exec(nvbench::exec_tag::sync,
+             [&](nvbench::launch& launch) { run_ndsh_q6(state, sources); });
+}
+
+NVBENCH_BENCH(ndsh_q6).set_name("ndsh_q6").add_float64_axis("scale_factor", {0.01, 0.1, 1});
diff --git a/cpp/examples/tpch/q9.cpp b/cpp/benchmarks/ndsh/q09.cpp
similarity index 78%
rename from cpp/examples/tpch/q9.cpp
rename to cpp/benchmarks/ndsh/q09.cpp
index 2882182aa2b..59218ab8912 100644
--- a/cpp/examples/tpch/q9.cpp
+++ b/cpp/benchmarks/ndsh/q09.cpp
@@ -14,9 +14,9 @@
  * limitations under the License.
  */
 
-#include "../utilities/timer.hpp"
-#include "utils.hpp"
+#include "utilities.hpp"
 
+#include <cudf/binaryop.hpp>
 #include <cudf/column/column.hpp>
 #include <cudf/datetime.hpp>
 #include <cudf/scalar/scalar.hpp>
@@ -24,9 +24,11 @@
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/memory_resource.hpp>
 
+#include <nvbench/nvbench.cuh>
+
 /**
- * @file q9.cpp
- * @brief Implement query 9 of the TPC-H benchmark.
+ * @file q09.cpp
+ * @brief Implement query 9 of the NDS-H benchmark.
  *
  * create view part as select * from '/tables/scale-1/part.parquet';
  * create view supplier as select * from '/tables/scale-1/supplier.parquet';
@@ -79,7 +81,7 @@
  * @param stream The CUDA stream used for device memory operations and kernel launches.
  * @param mr Device memory resource used to allocate the returned column's device memory.
  */
-[[nodiscard]] std::unique_ptr<cudf::column> calc_amount(
+[[nodiscard]] std::unique_ptr<cudf::column> calculate_amount(
   cudf::column_view const& discount,
   cudf::column_view const& extendedprice,
   cudf::column_view const& supplycost,
@@ -109,28 +111,21 @@
   return amount;
 }
 
-int main(int argc, char const** argv)
+void run_ndsh_q9(nvbench::state& state,
+                 std::unordered_map<std::string, parquet_device_buffer>& sources)
 {
-  auto const args = parse_args(argc, argv);
-
-  // Use a memory pool
-  auto resource = create_memory_resource(args.memory_resource_type);
-  cudf::set_current_device_resource(resource.get());
-
-  cudf::examples::timer timer;
-
   // Read out the table from parquet files
   auto const lineitem = read_parquet(
-    args.dataset_dir + "/lineitem.parquet",
+    sources["lineitem"].make_source_info(),
     {"l_suppkey", "l_partkey", "l_orderkey", "l_extendedprice", "l_discount", "l_quantity"});
-  auto const nation = read_parquet(args.dataset_dir + "/nation.parquet", {"n_nationkey", "n_name"});
+  auto const nation = read_parquet(sources["nation"].make_source_info(), {"n_nationkey", "n_name"});
   auto const orders =
-    read_parquet(args.dataset_dir + "/orders.parquet", {"o_orderkey", "o_orderdate"});
-  auto const part     = read_parquet(args.dataset_dir + "/part.parquet", {"p_partkey", "p_name"});
-  auto const partsupp = read_parquet(args.dataset_dir + "/partsupp.parquet",
+    read_parquet(sources["orders"].make_source_info(), {"o_orderkey", "o_orderdate"});
+  auto const part     = read_parquet(sources["part"].make_source_info(), {"p_partkey", "p_name"});
+  auto const partsupp = read_parquet(sources["partsupp"].make_source_info(),
                                      {"ps_suppkey", "ps_partkey", "ps_supplycost"});
   auto const supplier =
-    read_parquet(args.dataset_dir + "/supplier.parquet", {"s_suppkey", "s_nationkey"});
+    read_parquet(sources["supplier"].make_source_info(), {"s_suppkey", "s_nationkey"});
 
   // Generating the `profit` table
   // Filter the part table using `p_name like '%green%'`
@@ -150,10 +145,10 @@ int main(int argc, char const** argv)
   // Calculate the `nation`, `o_year`, and `amount` columns
   auto n_name = std::make_unique<cudf::column>(joined_table->column("n_name"));
   auto o_year = cudf::datetime::extract_year(joined_table->column("o_orderdate"));
-  auto amount = calc_amount(joined_table->column("l_discount"),
-                            joined_table->column("l_extendedprice"),
-                            joined_table->column("ps_supplycost"),
-                            joined_table->column("l_quantity"));
+  auto amount = calculate_amount(joined_table->column("l_discount"),
+                                 joined_table->column("l_extendedprice"),
+                                 joined_table->column("ps_supplycost"),
+                                 joined_table->column("l_quantity"));
 
   // Put together the `profit` table
   std::vector<std::unique_ptr<cudf::column>> profit_columns;
@@ -175,9 +170,22 @@ int main(int argc, char const** argv)
   auto const orderedby_table = apply_orderby(
     groupedby_table, {"nation", "o_year"}, {cudf::order::ASCENDING, cudf::order::DESCENDING});
 
-  timer.print_elapsed_millis();
-
   // Write query result to a parquet file
   orderedby_table->to_parquet("q9.parquet");
-  return 0;
 }
+
+void ndsh_q9(nvbench::state& state)
+{
+  // Generate the required parquet files in device buffers
+  double const scale_factor = state.get_float64("scale_factor");
+  std::unordered_map<std::string, parquet_device_buffer> sources;
+  generate_parquet_data_sources(
+    scale_factor, {"part", "supplier", "lineitem", "partsupp", "orders", "nation"}, sources);
+
+  auto stream = cudf::get_default_stream();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
+  state.exec(nvbench::exec_tag::sync,
+             [&](nvbench::launch& launch) { run_ndsh_q9(state, sources); });
+}
+
+NVBENCH_BENCH(ndsh_q9).set_name("ndsh_q9").add_float64_axis("scale_factor", {0.01, 0.1, 1});
diff --git a/cpp/examples/tpch/q10.cpp b/cpp/benchmarks/ndsh/q10.cpp
similarity index 81%
rename from cpp/examples/tpch/q10.cpp
rename to cpp/benchmarks/ndsh/q10.cpp
index fdf147b50e0..a520480020a 100644
--- a/cpp/examples/tpch/q10.cpp
+++ b/cpp/benchmarks/ndsh/q10.cpp
@@ -14,17 +14,19 @@
  * limitations under the License.
  */
 
-#include "../utilities/timer.hpp"
-#include "utils.hpp"
+#include "utilities.hpp"
 
 #include <cudf/ast/expressions.hpp>
+#include <cudf/binaryop.hpp>
 #include <cudf/column/column.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/utilities/memory_resource.hpp>
 
+#include <nvbench/nvbench.cuh>
+
 /**
  * @file q10.cpp
- * @brief Implement query 10 of the TPC-H benchmark.
+ * @brief Implement query 10 of the NDS-H benchmark.
  *
  * create view customer as select * from '/tables/scale-1/customer.parquet';
  * create view orders as select * from '/tables/scale-1/orders.parquet';
@@ -72,7 +74,7 @@
  * @param stream The CUDA stream used for device memory operations and kernel launches.
  * @param mr Device memory resource used to allocate the returned column's device memory.
  */
-[[nodiscard]] std::unique_ptr<cudf::column> calc_revenue(
+[[nodiscard]] std::unique_ptr<cudf::column> calculate_revenue(
   cudf::column_view const& extendedprice,
   cudf::column_view const& discount,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
@@ -90,16 +92,10 @@
                                         mr);
   return revenue;
 }
-int main(int argc, char const** argv)
-{
-  auto const args = parse_args(argc, argv);
-
-  // Use a memory pool
-  auto resource = create_memory_resource(args.memory_resource_type);
-  cudf::set_current_device_resource(resource.get());
-
-  cudf::examples::timer timer;
 
+void run_ndsh_q10(nvbench::state& state,
+                  std::unordered_map<std::string, parquet_device_buffer>& sources)
+{
   // Define the column projection and filter predicate for the `orders` table
   std::vector<std::string> const orders_cols = {"o_custkey", "o_orderkey", "o_orderdate"};
   auto const o_orderdate_ref                 = cudf::ast::column_reference(std::distance(
@@ -126,15 +122,15 @@ int main(int argc, char const** argv)
   // Read out the tables from parquet files
   // while pushing down the column projections and filter predicates
   auto const customer = read_parquet(
-    args.dataset_dir + "/customer.parquet",
+    sources["customer"].make_source_info(),
     {"c_custkey", "c_name", "c_nationkey", "c_acctbal", "c_address", "c_phone", "c_comment"});
   auto const orders =
-    read_parquet(args.dataset_dir + "/orders.parquet", orders_cols, std::move(orders_pred));
+    read_parquet(sources["orders"].make_source_info(), orders_cols, std::move(orders_pred));
   auto const lineitem =
-    read_parquet(args.dataset_dir + "/lineitem.parquet",
+    read_parquet(sources["lineitem"].make_source_info(),
                  {"l_extendedprice", "l_discount", "l_orderkey", "l_returnflag"},
                  std::move(lineitem_pred));
-  auto const nation = read_parquet(args.dataset_dir + "/nation.parquet", {"n_name", "n_nationkey"});
+  auto const nation = read_parquet(sources["nation"].make_source_info(), {"n_name", "n_nationkey"});
 
   // Perform the joins
   auto const join_a       = apply_inner_join(customer, nation, {"c_nationkey"}, {"n_nationkey"});
@@ -143,7 +139,7 @@ int main(int argc, char const** argv)
 
   // Calculate and append the `revenue` column
   auto revenue =
-    calc_revenue(joined_table->column("l_extendedprice"), joined_table->column("l_discount"));
+    calculate_revenue(joined_table->column("l_extendedprice"), joined_table->column("l_discount"));
   (*joined_table).append(revenue, "revenue");
 
   // Perform the groupby operation
@@ -159,9 +155,22 @@ int main(int argc, char const** argv)
   auto const orderedby_table =
     apply_orderby(groupedby_table, {"revenue"}, {cudf::order::DESCENDING});
 
-  timer.print_elapsed_millis();
-
   // Write query result to a parquet file
   orderedby_table->to_parquet("q10.parquet");
-  return 0;
 }
+
+void ndsh_q10(nvbench::state& state)
+{
+  // Generate the required parquet files in device buffers
+  double const scale_factor = state.get_float64("scale_factor");
+  std::unordered_map<std::string, parquet_device_buffer> sources;
+  generate_parquet_data_sources(
+    scale_factor, {"customer", "orders", "lineitem", "nation"}, sources);
+
+  auto stream = cudf::get_default_stream();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
+  state.exec(nvbench::exec_tag::sync,
+             [&](nvbench::launch& launch) { run_ndsh_q10(state, sources); });
+}
+
+NVBENCH_BENCH(ndsh_q10).set_name("ndsh_q10").add_float64_axis("scale_factor", {0.01, 0.1, 1});
diff --git a/cpp/benchmarks/ndsh/utilities.cpp b/cpp/benchmarks/ndsh/utilities.cpp
new file mode 100644
index 00000000000..2d514764fc2
--- /dev/null
+++ b/cpp/benchmarks/ndsh/utilities.cpp
@@ -0,0 +1,400 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "utilities.hpp"
+
+#include "common/ndsh_data_generator/ndsh_data_generator.hpp"
+
+#include <cudf/column/column_factories.hpp>
+#include <cudf/copying.hpp>
+#include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/groupby.hpp>
+#include <cudf/join.hpp>
+#include <cudf/reduction.hpp>
+#include <cudf/sorting.hpp>
+#include <cudf/stream_compaction.hpp>
+#include <cudf/table/table.hpp>
+#include <cudf/transform.hpp>
+
+#include <cstdlib>
+#include <ctime>
+
+namespace {
+
+std::vector<std::string> const ORDERS_SCHEMA   = {"o_orderkey",
+                                                  "o_custkey",
+                                                  "o_orderstatus",
+                                                  "o_totalprice",
+                                                  "o_orderdate",
+                                                  "o_orderpriority",
+                                                  "o_clerk",
+                                                  "o_shippriority",
+                                                  "o_comment"};
+std::vector<std::string> const LINEITEM_SCHEMA = {"l_orderkey",
+                                                  "l_partkey",
+                                                  "l_suppkey",
+                                                  "l_linenumber",
+                                                  "l_quantity",
+                                                  "l_extendedprice",
+                                                  "l_discount",
+                                                  "l_tax",
+                                                  "l_returnflag",
+                                                  "l_linestatus",
+                                                  "l_shipdate",
+                                                  "l_commitdate",
+                                                  "l_receiptdate",
+                                                  "l_shipinstruct",
+                                                  "l_shipmode",
+                                                  "l_comment"};
+std::vector<std::string> const PART_SCHEMA     = {"p_partkey",
+                                                  "p_name",
+                                                  "p_mfgr",
+                                                  "p_brand",
+                                                  "p_type",
+                                                  "p_size",
+                                                  "p_container",
+                                                  "p_retailprice",
+                                                  "p_comment"};
+std::vector<std::string> const PARTSUPP_SCHEMA = {
+  "ps_partkey", "ps_suppkey", "ps_availqty", "ps_supplycost", "ps_comment"};
+std::vector<std::string> const SUPPLIER_SCHEMA = {
+  "s_suppkey", "s_name", "s_address", "s_nationkey", "s_phone", "s_acctbal", "s_comment"};
+std::vector<std::string> const CUSTOMER_SCHEMA = {"c_custkey",
+                                                  "c_name",
+                                                  "c_address",
+                                                  "c_nationkey",
+                                                  "c_phone",
+                                                  "c_acctbal",
+                                                  "c_mktsegment",
+                                                  "c_comment"};
+std::vector<std::string> const NATION_SCHEMA   = {
+  "n_nationkey", "n_name", "n_regionkey", "n_comment"};
+std::vector<std::string> const REGION_SCHEMA = {"r_regionkey", "r_name", "r_comment"};
+
+}  // namespace
+
+cudf::table_view table_with_names::table() const { return tbl->view(); }
+
+cudf::column_view table_with_names::column(std::string const& col_name) const
+{
+  return tbl->view().column(column_id(col_name));
+}
+
+std::vector<std::string> const& table_with_names::column_names() const { return col_names; }
+
+cudf::size_type table_with_names::column_id(std::string const& col_name) const
+{
+  auto it = std::find(col_names.begin(), col_names.end(), col_name);
+  if (it == col_names.end()) {
+    std::string err_msg = "Column `" + col_name + "` not found";
+    throw std::runtime_error(err_msg);
+  }
+  return std::distance(col_names.begin(), it);
+}
+
+table_with_names& table_with_names::append(std::unique_ptr<cudf::column>& col,
+                                           std::string const& col_name)
+{
+  auto cols = tbl->release();
+  cols.push_back(std::move(col));
+  tbl = std::make_unique<cudf::table>(std::move(cols));
+  col_names.push_back(col_name);
+  return (*this);
+}
+
+cudf::table_view table_with_names::select(std::vector<std::string> const& col_names) const
+{
+  CUDF_FUNC_RANGE();
+  std::vector<cudf::size_type> col_indices;
+  for (auto const& col_name : col_names) {
+    col_indices.push_back(column_id(col_name));
+  }
+  return tbl->select(col_indices);
+}
+
+void table_with_names::to_parquet(std::string const& filepath) const
+{
+  CUDF_FUNC_RANGE();
+  auto const sink_info = cudf::io::sink_info(filepath);
+  cudf::io::table_metadata metadata;
+  metadata.schema_info =
+    std::vector<cudf::io::column_name_info>(col_names.begin(), col_names.end());
+  auto const table_input_metadata = cudf::io::table_input_metadata{metadata};
+  auto builder = cudf::io::parquet_writer_options::builder(sink_info, tbl->view());
+  builder.metadata(table_input_metadata);
+  auto const options = builder.build();
+  cudf::io::write_parquet(options);
+}
+
+std::unique_ptr<cudf::table> join_and_gather(cudf::table_view const& left_input,
+                                             cudf::table_view const& right_input,
+                                             std::vector<cudf::size_type> const& left_on,
+                                             std::vector<cudf::size_type> const& right_on,
+                                             cudf::null_equality compare_nulls)
+{
+  CUDF_FUNC_RANGE();
+  constexpr auto oob_policy                          = cudf::out_of_bounds_policy::DONT_CHECK;
+  auto const left_selected                           = left_input.select(left_on);
+  auto const right_selected                          = right_input.select(right_on);
+  auto const [left_join_indices, right_join_indices] = cudf::inner_join(
+    left_selected, right_selected, compare_nulls, cudf::get_current_device_resource_ref());
+
+  auto const left_indices_span  = cudf::device_span<cudf::size_type const>{*left_join_indices};
+  auto const right_indices_span = cudf::device_span<cudf::size_type const>{*right_join_indices};
+
+  auto const left_indices_col  = cudf::column_view{left_indices_span};
+  auto const right_indices_col = cudf::column_view{right_indices_span};
+
+  auto const left_result  = cudf::gather(left_input, left_indices_col, oob_policy);
+  auto const right_result = cudf::gather(right_input, right_indices_col, oob_policy);
+
+  auto joined_cols = left_result->release();
+  auto right_cols  = right_result->release();
+  joined_cols.insert(joined_cols.end(),
+                     std::make_move_iterator(right_cols.begin()),
+                     std::make_move_iterator(right_cols.end()));
+  return std::make_unique<cudf::table>(std::move(joined_cols));
+}
+
+std::unique_ptr<table_with_names> apply_inner_join(
+  std::unique_ptr<table_with_names> const& left_input,
+  std::unique_ptr<table_with_names> const& right_input,
+  std::vector<std::string> const& left_on,
+  std::vector<std::string> const& right_on,
+  cudf::null_equality compare_nulls)
+{
+  CUDF_FUNC_RANGE();
+  std::vector<cudf::size_type> left_on_indices;
+  std::vector<cudf::size_type> right_on_indices;
+  std::transform(
+    left_on.begin(), left_on.end(), std::back_inserter(left_on_indices), [&](auto const& col_name) {
+      return left_input->column_id(col_name);
+    });
+  std::transform(right_on.begin(),
+                 right_on.end(),
+                 std::back_inserter(right_on_indices),
+                 [&](auto const& col_name) { return right_input->column_id(col_name); });
+  auto table = join_and_gather(
+    left_input->table(), right_input->table(), left_on_indices, right_on_indices, compare_nulls);
+  ;
+  std::vector<std::string> merged_column_names;
+  merged_column_names.reserve(left_input->column_names().size() +
+                              right_input->column_names().size());
+  std::copy(left_input->column_names().begin(),
+            left_input->column_names().end(),
+            std::back_inserter(merged_column_names));
+  std::copy(right_input->column_names().begin(),
+            right_input->column_names().end(),
+            std::back_inserter(merged_column_names));
+  return std::make_unique<table_with_names>(std::move(table), merged_column_names);
+  return std::make_unique<table_with_names>(std::move(table), merged_column_names);
+}
+
+std::unique_ptr<table_with_names> apply_filter(std::unique_ptr<table_with_names> const& table,
+                                               cudf::ast::operation const& predicate)
+{
+  CUDF_FUNC_RANGE();
+  auto const boolean_mask = cudf::compute_column(table->table(), predicate);
+  auto result_table       = cudf::apply_boolean_mask(table->table(), boolean_mask->view());
+  return std::make_unique<table_with_names>(std::move(result_table), table->column_names());
+}
+
+std::unique_ptr<table_with_names> apply_mask(std::unique_ptr<table_with_names> const& table,
+                                             std::unique_ptr<cudf::column> const& mask)
+{
+  CUDF_FUNC_RANGE();
+  auto result_table = cudf::apply_boolean_mask(table->table(), mask->view());
+  return std::make_unique<table_with_names>(std::move(result_table), table->column_names());
+}
+
+std::unique_ptr<table_with_names> apply_groupby(std::unique_ptr<table_with_names> const& table,
+                                                groupby_context_t const& ctx)
+{
+  CUDF_FUNC_RANGE();
+  auto const keys = table->select(ctx.keys);
+  cudf::groupby::groupby groupby_obj(keys);
+  std::vector<std::string> result_column_names;
+  result_column_names.insert(result_column_names.end(), ctx.keys.begin(), ctx.keys.end());
+  std::vector<cudf::groupby::aggregation_request> requests;
+  for (auto& [value_col, aggregations] : ctx.values) {
+    requests.emplace_back(cudf::groupby::aggregation_request());
+    for (auto& agg : aggregations) {
+      if (agg.first == cudf::aggregation::Kind::SUM) {
+        requests.back().aggregations.push_back(
+          cudf::make_sum_aggregation<cudf::groupby_aggregation>());
+      } else if (agg.first == cudf::aggregation::Kind::MEAN) {
+        requests.back().aggregations.push_back(
+          cudf::make_mean_aggregation<cudf::groupby_aggregation>());
+      } else if (agg.first == cudf::aggregation::Kind::COUNT_ALL) {
+        requests.back().aggregations.push_back(
+          cudf::make_count_aggregation<cudf::groupby_aggregation>());
+      } else {
+        throw std::runtime_error("Unsupported aggregation");
+      }
+      result_column_names.push_back(agg.second);
+    }
+    requests.back().values = table->column(value_col);
+  }
+  auto agg_results = groupby_obj.aggregate(requests);
+  std::vector<std::unique_ptr<cudf::column>> result_columns;
+  for (auto i = 0; i < agg_results.first->num_columns(); i++) {
+    auto col = std::make_unique<cudf::column>(agg_results.first->get_column(i));
+    result_columns.push_back(std::move(col));
+  }
+  for (size_t i = 0; i < agg_results.second.size(); i++) {
+    for (size_t j = 0; j < agg_results.second[i].results.size(); j++) {
+      result_columns.push_back(std::move(agg_results.second[i].results[j]));
+    }
+  }
+  auto result_table = std::make_unique<cudf::table>(std::move(result_columns));
+  return std::make_unique<table_with_names>(std::move(result_table), result_column_names);
+}
+
+std::unique_ptr<table_with_names> apply_orderby(std::unique_ptr<table_with_names> const& table,
+                                                std::vector<std::string> const& sort_keys,
+                                                std::vector<cudf::order> const& sort_key_orders)
+{
+  CUDF_FUNC_RANGE();
+  std::vector<cudf::column_view> column_views;
+  for (auto& key : sort_keys) {
+    column_views.push_back(table->column(key));
+  }
+  auto result_table =
+    cudf::sort_by_key(table->table(), cudf::table_view{column_views}, sort_key_orders);
+  return std::make_unique<table_with_names>(std::move(result_table), table->column_names());
+}
+
+std::unique_ptr<table_with_names> apply_reduction(cudf::column_view const& column,
+                                                  cudf::aggregation::Kind const& agg_kind,
+                                                  std::string const& col_name)
+{
+  CUDF_FUNC_RANGE();
+  auto const agg            = cudf::make_sum_aggregation<cudf::reduce_aggregation>();
+  auto const result         = cudf::reduce(column, *agg, column.type());
+  cudf::size_type const len = 1;
+  auto col                  = cudf::make_column_from_scalar(*result, len);
+  std::vector<std::unique_ptr<cudf::column>> columns;
+  columns.push_back(std::move(col));
+  auto result_table                  = std::make_unique<cudf::table>(std::move(columns));
+  std::vector<std::string> col_names = {col_name};
+  return std::make_unique<table_with_names>(std::move(result_table), col_names);
+}
+
+std::unique_ptr<table_with_names> read_parquet(
+  cudf::io::source_info const& source_info,
+  std::vector<std::string> const& columns,
+  std::unique_ptr<cudf::ast::operation> const& predicate)
+{
+  CUDF_FUNC_RANGE();
+  auto builder = cudf::io::parquet_reader_options_builder(source_info);
+  if (!columns.empty()) { builder.columns(columns); }
+  if (predicate) { builder.filter(*predicate); }
+  auto const options       = builder.build();
+  auto table_with_metadata = cudf::io::read_parquet(options);
+  std::vector<std::string> column_names;
+  for (auto const& col_info : table_with_metadata.metadata.schema_info) {
+    column_names.push_back(col_info.name);
+  }
+  return std::make_unique<table_with_names>(std::move(table_with_metadata.tbl), column_names);
+}
+
+std::tm make_tm(int year, int month, int day)
+{
+  std::tm tm{};
+  tm.tm_year = year - 1900;
+  tm.tm_mon  = month - 1;
+  tm.tm_mday = day;
+  return tm;
+}
+
+int32_t days_since_epoch(int year, int month, int day)
+{
+  std::tm tm             = make_tm(year, month, day);
+  std::tm epoch          = make_tm(1970, 1, 1);
+  std::time_t time       = std::mktime(&tm);
+  std::time_t epoch_time = std::mktime(&epoch);
+  double diff            = std::difftime(time, epoch_time) / (60 * 60 * 24);
+  return static_cast<int32_t>(diff);
+}
+
+void write_to_parquet_device_buffer(std::unique_ptr<cudf::table> const& table,
+                                    std::vector<std::string> const& col_names,
+                                    parquet_device_buffer& source)
+{
+  CUDF_FUNC_RANGE();
+  auto const stream = cudf::get_default_stream();
+
+  // Prepare the table metadata
+  cudf::io::table_metadata metadata;
+  std::vector<cudf::io::column_name_info> col_name_infos;
+  for (auto& col_name : col_names) {
+    col_name_infos.push_back(cudf::io::column_name_info(col_name));
+  }
+  metadata.schema_info            = col_name_infos;
+  auto const table_input_metadata = cudf::io::table_input_metadata{metadata};
+
+  // Declare a host and device buffer
+  std::vector<char> h_buffer;
+
+  // Write parquet data to host buffer
+  auto builder =
+    cudf::io::parquet_writer_options::builder(cudf::io::sink_info(&h_buffer), table->view());
+  builder.metadata(table_input_metadata);
+  auto const options = builder.build();
+  cudf::io::write_parquet(options);
+
+  // Copy host buffer to device buffer
+  source.d_buffer.resize(h_buffer.size(), stream);
+  CUDF_CUDA_TRY(cudaMemcpyAsync(
+    source.d_buffer.data(), h_buffer.data(), h_buffer.size(), cudaMemcpyDefault, stream.value()));
+}
+
+void generate_parquet_data_sources(double scale_factor,
+                                   std::vector<std::string> const& table_names,
+                                   std::unordered_map<std::string, parquet_device_buffer>& sources)
+{
+  CUDF_FUNC_RANGE();
+  std::for_each(table_names.begin(), table_names.end(), [&](auto const& table_name) {
+    sources[table_name] = parquet_device_buffer();
+  });
+
+  auto [orders, lineitem, part] = cudf::datagen::generate_orders_lineitem_part(
+    scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+
+  auto partsupp = cudf::datagen::generate_partsupp(
+    scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+
+  auto supplier = cudf::datagen::generate_supplier(
+    scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+
+  auto customer = cudf::datagen::generate_customer(
+    scale_factor, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+
+  auto nation = cudf::datagen::generate_nation(cudf::get_default_stream(),
+                                               cudf::get_current_device_resource_ref());
+
+  auto region = cudf::datagen::generate_region(cudf::get_default_stream(),
+                                               cudf::get_current_device_resource_ref());
+
+  write_to_parquet_device_buffer(std::move(orders), ORDERS_SCHEMA, sources["orders"]);
+  write_to_parquet_device_buffer(std::move(lineitem), LINEITEM_SCHEMA, sources["lineitem"]);
+  write_to_parquet_device_buffer(std::move(part), PART_SCHEMA, sources["part"]);
+  write_to_parquet_device_buffer(std::move(partsupp), PARTSUPP_SCHEMA, sources["partsupp"]);
+  write_to_parquet_device_buffer(std::move(customer), CUSTOMER_SCHEMA, sources["customer"]);
+  write_to_parquet_device_buffer(std::move(supplier), SUPPLIER_SCHEMA, sources["supplier"]);
+  write_to_parquet_device_buffer(std::move(nation), NATION_SCHEMA, sources["nation"]);
+  write_to_parquet_device_buffer(std::move(region), REGION_SCHEMA, sources["region"]);
+}
diff --git a/cpp/benchmarks/ndsh/utilities.hpp b/cpp/benchmarks/ndsh/utilities.hpp
new file mode 100644
index 00000000000..762e43deccf
--- /dev/null
+++ b/cpp/benchmarks/ndsh/utilities.hpp
@@ -0,0 +1,227 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/groupby.hpp>
+#include <cudf/io/parquet.hpp>
+
+#include <rmm/device_uvector.hpp>
+
+/**
+ * @brief A class to represent a table with column names attached
+ */
+class table_with_names {
+ public:
+  table_with_names(std::unique_ptr<cudf::table> tbl, std::vector<std::string> col_names)
+    : tbl(std::move(tbl)), col_names(col_names){};
+  /**
+   * @brief Return the table view
+   */
+  [[nodiscard]] cudf::table_view table() const;
+  /**
+   * @brief Return the column view for a given column name
+   *
+   * @param col_name The name of the column
+   */
+  [[nodiscard]] cudf::column_view column(std::string const& col_name) const;
+  /**
+   * @param Return the column names of the table
+   */
+  [[nodiscard]] std::vector<std::string> const& column_names() const;
+  /**
+   * @brief Translate a column name to a column index
+   *
+   * @param col_name The name of the column
+   */
+  [[nodiscard]] cudf::size_type column_id(std::string const& col_name) const;
+  /**
+   * @brief Append a column to the table
+   *
+   * @param col The column to append
+   * @param col_name The name of the appended column
+   */
+  table_with_names& append(std::unique_ptr<cudf::column>& col, std::string const& col_name);
+  /**
+   * @brief Select a subset of columns from the table
+   *
+   * @param col_names The names of the columns to select
+   */
+  [[nodiscard]] cudf::table_view select(std::vector<std::string> const& col_names) const;
+  /**
+   * @brief Write the table to a parquet file
+   *
+   * @param filepath The path to the parquet file
+   */
+  void to_parquet(std::string const& filepath) const;
+
+ private:
+  std::unique_ptr<cudf::table> tbl;
+  std::vector<std::string> col_names;
+};
+
+/**
+ * @brief Inner join two tables and gather the result
+ *
+ * @param left_input The left input table
+ * @param right_input The right input table
+ * @param left_on The columns to join on in the left table
+ * @param right_on The columns to join on in the right table
+ * @param compare_nulls The null equality policy
+ */
+[[nodiscard]] std::unique_ptr<cudf::table> join_and_gather(
+  cudf::table_view const& left_input,
+  cudf::table_view const& right_input,
+  std::vector<cudf::size_type> const& left_on,
+  std::vector<cudf::size_type> const& right_on,
+  cudf::null_equality compare_nulls);
+
+/**
+ * @brief Apply an inner join operation to two tables
+ *
+ * @param left_input The left input table
+ * @param right_input The right input table
+ * @param left_on The columns to join on in the left table
+ * @param right_on The columns to join on in the right table
+ * @param compare_nulls The null equality policy
+ */
+[[nodiscard]] std::unique_ptr<table_with_names> apply_inner_join(
+  std::unique_ptr<table_with_names> const& left_input,
+  std::unique_ptr<table_with_names> const& right_input,
+  std::vector<std::string> const& left_on,
+  std::vector<std::string> const& right_on,
+  cudf::null_equality compare_nulls = cudf::null_equality::EQUAL);
+
+/**
+ * @brief Apply a filter predicate to a table
+ *
+ * @param table The input table
+ * @param predicate The filter predicate
+ */
+[[nodiscard]] std::unique_ptr<table_with_names> apply_filter(
+  std::unique_ptr<table_with_names> const& table, cudf::ast::operation const& predicate);
+
+/**
+ * @brief Apply a boolean mask to a table
+ *
+ * @param table The input table
+ * @param mask The boolean mask
+ */
+[[nodiscard]] std::unique_ptr<table_with_names> apply_mask(
+  std::unique_ptr<table_with_names> const& table, std::unique_ptr<cudf::column> const& mask);
+
+/**
+ * Struct representing group by key columns, value columns, and the type of aggregations to perform
+ * on the value columns
+ */
+struct groupby_context_t {
+  std::vector<std::string> keys;
+  std::unordered_map<std::string, std::vector<std::pair<cudf::aggregation::Kind, std::string>>>
+    values;
+};
+
+/**
+ * @brief Apply a groupby operation to a table
+ *
+ * @param table The input table
+ * @param ctx The groupby context
+ */
+[[nodiscard]] std::unique_ptr<table_with_names> apply_groupby(
+  std::unique_ptr<table_with_names> const& table, groupby_context_t const& ctx);
+
+/**
+ * @brief Apply an order by operation to a table
+ *
+ * @param table The input table
+ * @param sort_keys The sort keys
+ * @param sort_key_orders The sort key orders
+ */
+[[nodiscard]] std::unique_ptr<table_with_names> apply_orderby(
+  std::unique_ptr<table_with_names> const& table,
+  std::vector<std::string> const& sort_keys,
+  std::vector<cudf::order> const& sort_key_orders);
+
+/**
+ * @brief Apply a reduction operation to a column
+ *
+ * @param column The input column
+ * @param agg_kind The aggregation kind
+ * @param col_name The name of the output column
+ */
+[[nodiscard]] std::unique_ptr<table_with_names> apply_reduction(
+  cudf::column_view const& column,
+  cudf::aggregation::Kind const& agg_kind,
+  std::string const& col_name);
+
+/**
+ * @brief Read a parquet file into a table
+ *
+ * @param source_info The source of the parquet file
+ * @param columns The columns to read
+ * @param predicate The filter predicate to pushdown
+ */
+[[nodiscard]] std::unique_ptr<table_with_names> read_parquet(
+  cudf::io::source_info const& source_info,
+  std::vector<std::string> const& columns                = {},
+  std::unique_ptr<cudf::ast::operation> const& predicate = nullptr);
+
+/**
+ * @brief Generate the `std::tm` structure from year, month, and day
+ *
+ * @param year The year
+ * @param month The month
+ * @param day The day
+ */
+std::tm make_tm(int year, int month, int day);
+
+/**
+ * @brief Calculate the number of days since the UNIX epoch
+ *
+ * @param year The year
+ * @param month The month
+ * @param day The day
+ */
+int32_t days_since_epoch(int year, int month, int day);
+
+/**
+ * @brief Struct representing a parquet device buffer
+ */
+struct parquet_device_buffer {
+  parquet_device_buffer() : d_buffer{0, cudf::get_default_stream()} {};
+  cudf::io::source_info make_source_info() { return cudf::io::source_info(d_buffer); }
+  rmm::device_uvector<std::byte> d_buffer;
+};
+
+/**
+ * @brief Write a `cudf::table` to a parquet device buffer
+ *
+ * @param table The `cudf::table` to write
+ * @param col_names The column names of the table
+ * @param parquet_device_buffer The parquet device buffer to write the table to
+ */
+void write_to_parquet_device_buffer(std::unique_ptr<cudf::table> const& table,
+                                    std::vector<std::string> const& col_names,
+                                    parquet_device_buffer& source);
+
+/**
+ * @brief Generate NDS-H tables and write to parquet device buffers
+ *
+ * @param scale_factor The scale factor of NDS-H tables to generate
+ * @param table_names The names of the tables to generate
+ * @param sources The parquet data sources to populate
+ */
+void generate_parquet_data_sources(double scale_factor,
+                                   std::vector<std::string> const& table_names,
+                                   std::unordered_map<std::string, parquet_device_buffer>& sources);
diff --git a/cpp/examples/build.sh b/cpp/examples/build.sh
index 8e8d8bd0b78..25984df1b60 100755
--- a/cpp/examples/build.sh
+++ b/cpp/examples/build.sh
@@ -57,7 +57,6 @@ build_example() {
 }
 
 build_example basic
-build_example tpch
 build_example strings
 build_example nested_types
 build_example parquet_io
diff --git a/cpp/examples/tpch/CMakeLists.txt b/cpp/examples/tpch/CMakeLists.txt
deleted file mode 100644
index 373a6d72d56..00000000000
--- a/cpp/examples/tpch/CMakeLists.txt
+++ /dev/null
@@ -1,36 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-
-cmake_minimum_required(VERSION 3.26.4)
-
-include(../set_cuda_architecture.cmake)
-
-rapids_cuda_init_architectures(tpch_example)
-rapids_cuda_set_architectures(RAPIDS)
-
-project(
-  tpch_example
-  VERSION 0.0.1
-  LANGUAGES CXX CUDA
-)
-
-include(../fetch_dependencies.cmake)
-
-add_executable(tpch_q1 q1.cpp)
-target_link_libraries(tpch_q1 PRIVATE cudf::cudf)
-target_compile_features(tpch_q1 PRIVATE cxx_std_17)
-
-add_executable(tpch_q5 q5.cpp)
-target_link_libraries(tpch_q5 PRIVATE cudf::cudf)
-target_compile_features(tpch_q5 PRIVATE cxx_std_17)
-
-add_executable(tpch_q6 q6.cpp)
-target_link_libraries(tpch_q6 PRIVATE cudf::cudf)
-target_compile_features(tpch_q6 PRIVATE cxx_std_17)
-
-add_executable(tpch_q9 q9.cpp)
-target_link_libraries(tpch_q9 PRIVATE cudf::cudf)
-target_compile_features(tpch_q9 PRIVATE cxx_std_17)
-
-add_executable(tpch_q10 q10.cpp)
-target_link_libraries(tpch_q10 PRIVATE cudf::cudf)
-target_compile_features(tpch_q10 PRIVATE cxx_std_17)
diff --git a/cpp/examples/tpch/README.md b/cpp/examples/tpch/README.md
deleted file mode 100644
index 8c046c3f1e8..00000000000
--- a/cpp/examples/tpch/README.md
+++ /dev/null
@@ -1,39 +0,0 @@
-# TPC-H Derived Examples
-
-Implements TPC-H queries using `libcudf`. We leverage the data generator (wrapper around official TPC-H datagen) from [Apache Datafusion](https://github.com/apache/datafusion) for generating data in Parquet format.
-
-## Requirements
-
-- Rust
-- [libcudf](https://github.com/rapidsai/cudf/blob/branch-24.08/CONTRIBUTING.md#setting-up-your-build-environment)
-
-## Running Queries
-
-1. Build the `libcudf` examples.
-```bash
-cd cudf/cpp/examples
-./build.sh
-```
-The TPC-H query binaries would be built inside `tpch/build`.
-
-2. Generate the dataset.
-```bash
-cd tpch/datagen
-./datagen.sh [scale factor (1/10)]
-```
-
-The parquet files will be generated in `tpch/datagen/datafusion/benchmarks/data/tpch_sf[scale factor]`.
-
-3. Set these environment variables for optimized runtimes.
-```bash
-export KVIKIO_COMPAT_MODE="on"
-export LIBCUDF_CUFILE_POLICY="KVIKIO"
-export CUDA_MODULE_LOADING="EAGER"
-```
-
-4. Execute the queries.
-```bash
-./tpch/build/tpch_q[query no] [path to dataset] [memory resource type (cuda/pool/managed/managed_pool)]
-```
-
-A parquet file named `q[query no].parquet` would be generated containing the results of the query.
diff --git a/cpp/examples/tpch/datagen/correct_datatypes.py b/cpp/examples/tpch/datagen/correct_datatypes.py
deleted file mode 100644
index 8564774647b..00000000000
--- a/cpp/examples/tpch/datagen/correct_datatypes.py
+++ /dev/null
@@ -1,60 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-
-import os
-import sys
-
-import pyarrow as pa
-import pyarrow.parquet as pq
-import pandas as pd
-
-if __name__ == "__main__":
-    dataset_path = str(sys.argv[1])
-    tables = ["lineitem", "part", "partsupp", "orders", "supplier", "customer", "nation", "region"]
-    for table in tables:
-        filepath = os.path.join(dataset_path, f"{table}.parquet")
-        print("Reading file ", filepath)
-
-        if filepath.endswith("lineitem.parquet"):
-            df = pd.read_parquet(filepath)
-            df["l_linenumber"] = df["l_linenumber"].astype("int64")
-            df["l_quantity"] = df["l_quantity"].astype("int64")
-            df["l_extendedprice"] = df["l_extendedprice"].astype("float64")
-            df["l_discount"] = df["l_discount"].astype("float64")
-            df["l_tax"] = df["l_tax"].astype("float64")
-            pq.write_table(pa.Table.from_pandas(df), filepath, compression="snappy")
-
-        elif filepath.endswith("part.parquet"):
-            df = pd.read_parquet(filepath)
-            df["p_size"] = df["p_size"].astype("int64")
-            df["p_retailprice"] = df["p_retailprice"].astype("float64")
-            pq.write_table(pa.Table.from_pandas(df), filepath, compression="snappy")
-
-        elif filepath.endswith("partsupp.parquet"):
-            df = pd.read_parquet(filepath)
-            df["ps_availqty"] = df["ps_availqty"].astype("int64")
-            df["ps_supplycost"] = df["ps_supplycost"].astype("float64")
-            pq.write_table(pa.Table.from_pandas(df), filepath, compression="snappy")
-
-        elif filepath.endswith("orders.parquet"):
-            df = pd.read_parquet(filepath)
-            df["o_totalprice"] = df["o_totalprice"].astype("float64")
-            df["o_shippriority"] = df["o_shippriority"].astype("int64")
-            pq.write_table(pa.Table.from_pandas(df), filepath, compression="snappy")
-
-        elif filepath.endswith("supplier.parquet"):
-            df = pd.read_parquet(filepath)
-            df["s_acctbal"] = df["s_acctbal"].astype("float64")
-            pq.write_table(pa.Table.from_pandas(df), filepath, compression="snappy")
-
-        elif filepath.endswith("customer.parquet"):
-            df = pd.read_parquet(filepath)
-            df["c_acctbal"] = df["c_acctbal"].astype("float64")
-            pq.write_table(pa.Table.from_pandas(df), filepath, compression="snappy")
-
-        elif filepath.endswith("nation.parquet"):
-            df = pd.read_parquet(filepath)
-            pq.write_table(pa.Table.from_pandas(df), filepath, compression="snappy")
-
-        elif filepath.endswith("region.parquet"):
-            df = pd.read_parquet(filepath)
-            pq.write_table(pa.Table.from_pandas(df), filepath, compression="snappy")
diff --git a/cpp/examples/tpch/datagen/datagen.sh b/cpp/examples/tpch/datagen/datagen.sh
deleted file mode 100755
index 0b03753daea..00000000000
--- a/cpp/examples/tpch/datagen/datagen.sh
+++ /dev/null
@@ -1,31 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2024, NVIDIA CORPORATION.
-
-set -e
-
-scale_factor=$1
-script_dir=$(pwd)
-
-# Clone the datafusion repository and apply a patch
-# for single threaded data generation so that a
-# single parquet file is generated for each table
-rm -rf datafusion
-git clone https://github.com/apache/datafusion.git datafusion
-cd datafusion/
-git checkout 679a85f
-git apply ${script_dir}/tpch.patch
-cd benchmarks/
-
-# Generate the data
-# Currently, we support only scale factor 1 and 10
-if [ ${scale_factor} -eq 1 ]; then
-    ./bench.sh data tpch
-elif [ ${scale_factor} -eq 10 ]; then
-    ./bench.sh data tpch10
-else
-    echo "Unsupported scale factor"
-    exit 1
-fi
-
-# Correct the datatypes of the parquet files
-python3 ${script_dir}/correct_datatypes.py data/tpch_sf${scale_factor}
diff --git a/cpp/examples/tpch/datagen/tpch.patch b/cpp/examples/tpch/datagen/tpch.patch
deleted file mode 100644
index 42727aa9904..00000000000
--- a/cpp/examples/tpch/datagen/tpch.patch
+++ /dev/null
@@ -1,33 +0,0 @@
-diff --git a/benchmarks/bench.sh b/benchmarks/bench.sh
-index 3b854f6dc..f000f09c0 100755
---- a/benchmarks/bench.sh
-+++ b/benchmarks/bench.sh
-@@ -311,6 +311,15 @@ data_tpch() {
-         $CARGO_COMMAND --bin tpch -- convert --input "${TPCH_DIR}" --output "${TPCH_DIR}" --format parquet
-         popd > /dev/null
-     fi
-+
-+    cp ${TPCH_DIR}/lineitem/part-0.parquet ${TPCH_DIR}/lineitem.parquet
-+    cp ${TPCH_DIR}/orders/part-0.parquet ${TPCH_DIR}/orders.parquet
-+    cp ${TPCH_DIR}/part/part-0.parquet ${TPCH_DIR}/part.parquet
-+    cp ${TPCH_DIR}/partsupp/part-0.parquet ${TPCH_DIR}/partsupp.parquet
-+    cp ${TPCH_DIR}/customer/part-0.parquet ${TPCH_DIR}/customer.parquet
-+    cp ${TPCH_DIR}/supplier/part-0.parquet ${TPCH_DIR}/supplier.parquet
-+    cp ${TPCH_DIR}/nation/part-0.parquet ${TPCH_DIR}/nation.parquet
-+    cp ${TPCH_DIR}/region/part-0.parquet ${TPCH_DIR}/region.parquet
- }
-
- # Runs the tpch benchmark
-diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs
-index b5204b343..84fd2e78d 100644
---- a/datafusion/common/src/config.rs
-+++ b/datafusion/common/src/config.rs
-@@ -250,7 +250,7 @@ config_namespace! {
-         /// concurrency.
-         ///
-         /// Defaults to the number of CPU cores on the system
--        pub target_partitions: usize, default = num_cpus::get()
-+        pub target_partitions: usize, default = 1
-
-         /// The default time zone
-         ///
diff --git a/cpp/examples/tpch/utils.hpp b/cpp/examples/tpch/utils.hpp
deleted file mode 100644
index 8102fa8f976..00000000000
--- a/cpp/examples/tpch/utils.hpp
+++ /dev/null
@@ -1,458 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cudf/binaryop.hpp>
-#include <cudf/column/column_factories.hpp>
-#include <cudf/copying.hpp>
-#include <cudf/detail/nvtx/ranges.hpp>
-#include <cudf/groupby.hpp>
-#include <cudf/io/parquet.hpp>
-#include <cudf/join.hpp>
-#include <cudf/reduction.hpp>
-#include <cudf/sorting.hpp>
-#include <cudf/stream_compaction.hpp>
-#include <cudf/table/table.hpp>
-#include <cudf/transform.hpp>
-#include <cudf/unary.hpp>
-#include <cudf/utilities/memory_resource.hpp>
-
-#include <rmm/cuda_device.hpp>
-#include <rmm/mr/device/cuda_memory_resource.hpp>
-#include <rmm/mr/device/device_memory_resource.hpp>
-#include <rmm/mr/device/managed_memory_resource.hpp>
-#include <rmm/mr/device/owning_wrapper.hpp>
-#include <rmm/mr/device/pool_memory_resource.hpp>
-
-#include <ctime>
-
-// RMM memory resource creation utilities
-inline auto make_cuda() { return std::make_shared<rmm::mr::cuda_memory_resource>(); }
-inline auto make_pool()
-{
-  return rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>(
-    make_cuda(), rmm::percent_of_free_device_memory(50));
-}
-inline auto make_managed() { return std::make_shared<rmm::mr::managed_memory_resource>(); }
-inline auto make_managed_pool()
-{
-  return rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>(
-    make_managed(), rmm::percent_of_free_device_memory(50));
-}
-inline std::shared_ptr<rmm::mr::device_memory_resource> create_memory_resource(
-  std::string const& mode)
-{
-  if (mode == "cuda") return make_cuda();
-  if (mode == "pool") return make_pool();
-  if (mode == "managed") return make_managed();
-  if (mode == "managed_pool") return make_managed_pool();
-  CUDF_FAIL("Unknown rmm_mode parameter: " + mode +
-            "\nExpecting: cuda, pool, managed, or managed_pool");
-}
-
-/**
- * @brief A class to represent a table with column names attached
- */
-class table_with_names {
- public:
-  table_with_names(std::unique_ptr<cudf::table> tbl, std::vector<std::string> col_names)
-    : tbl(std::move(tbl)), col_names(col_names)
-  {
-  }
-  /**
-   * @brief Return the table view
-   */
-  [[nodiscard]] cudf::table_view table() const { return tbl->view(); }
-  /**
-   * @brief Return the column view for a given column name
-   *
-   * @param col_name The name of the column
-   */
-  [[nodiscard]] cudf::column_view column(std::string const& col_name) const
-  {
-    return tbl->view().column(col_id(col_name));
-  }
-  /**
-   * @param Return the column names of the table
-   */
-  [[nodiscard]] std::vector<std::string> column_names() const { return col_names; }
-  /**
-   * @brief Translate a column name to a column index
-   *
-   * @param col_name The name of the column
-   */
-  [[nodiscard]] cudf::size_type col_id(std::string const& col_name) const
-  {
-    CUDF_FUNC_RANGE();
-    auto it = std::find(col_names.begin(), col_names.end(), col_name);
-    if (it == col_names.end()) { throw std::runtime_error("Column not found"); }
-    return std::distance(col_names.begin(), it);
-  }
-  /**
-   * @brief Append a column to the table
-   *
-   * @param col The column to append
-   * @param col_name The name of the appended column
-   */
-  table_with_names& append(std::unique_ptr<cudf::column>& col, std::string const& col_name)
-  {
-    CUDF_FUNC_RANGE();
-    auto cols = tbl->release();
-    cols.push_back(std::move(col));
-    tbl = std::make_unique<cudf::table>(std::move(cols));
-    col_names.push_back(col_name);
-    return (*this);
-  }
-  /**
-   * @brief Select a subset of columns from the table
-   *
-   * @param col_names The names of the columns to select
-   */
-  [[nodiscard]] cudf::table_view select(std::vector<std::string> const& col_names) const
-  {
-    CUDF_FUNC_RANGE();
-    std::vector<cudf::size_type> col_indices;
-    for (auto const& col_name : col_names) {
-      col_indices.push_back(col_id(col_name));
-    }
-    return tbl->select(col_indices);
-  }
-  /**
-   * @brief Write the table to a parquet file
-   *
-   * @param filepath The path to the parquet file
-   */
-  void to_parquet(std::string const& filepath) const
-  {
-    CUDF_FUNC_RANGE();
-    auto const sink_info = cudf::io::sink_info(filepath);
-    cudf::io::table_metadata metadata;
-    metadata.schema_info =
-      std::vector<cudf::io::column_name_info>(col_names.begin(), col_names.end());
-    auto const table_input_metadata = cudf::io::table_input_metadata{metadata};
-    auto builder = cudf::io::parquet_writer_options::builder(sink_info, tbl->view());
-    builder.metadata(table_input_metadata);
-    auto const options = builder.build();
-    cudf::io::write_parquet(options);
-  }
-
- private:
-  std::unique_ptr<cudf::table> tbl;
-  std::vector<std::string> col_names;
-};
-
-/**
- * @brief Concatenate two vectors
- *
- * @param lhs The left vector
- * @param rhs The right vector
- */
-template <typename T>
-std::vector<T> concat(std::vector<T> const& lhs, std::vector<T> const& rhs)
-{
-  std::vector<T> result;
-  result.reserve(lhs.size() + rhs.size());
-  std::copy(lhs.begin(), lhs.end(), std::back_inserter(result));
-  std::copy(rhs.begin(), rhs.end(), std::back_inserter(result));
-  return result;
-}
-
-/**
- * @brief Inner join two tables and gather the result
- *
- * @param left_input The left input table
- * @param right_input The right input table
- * @param left_on The columns to join on in the left table
- * @param right_on The columns to join on in the right table
- * @param compare_nulls The null equality policy
- */
-[[nodiscard]] std::unique_ptr<cudf::table> join_and_gather(
-  cudf::table_view const& left_input,
-  cudf::table_view const& right_input,
-  std::vector<cudf::size_type> const& left_on,
-  std::vector<cudf::size_type> const& right_on,
-  cudf::null_equality compare_nulls)
-{
-  CUDF_FUNC_RANGE();
-  constexpr auto oob_policy                          = cudf::out_of_bounds_policy::DONT_CHECK;
-  auto const left_selected                           = left_input.select(left_on);
-  auto const right_selected                          = right_input.select(right_on);
-  auto const [left_join_indices, right_join_indices] = cudf::inner_join(
-    left_selected, right_selected, compare_nulls, cudf::get_current_device_resource_ref());
-
-  auto const left_indices_span  = cudf::device_span<cudf::size_type const>{*left_join_indices};
-  auto const right_indices_span = cudf::device_span<cudf::size_type const>{*right_join_indices};
-
-  auto const left_indices_col  = cudf::column_view{left_indices_span};
-  auto const right_indices_col = cudf::column_view{right_indices_span};
-
-  auto const left_result  = cudf::gather(left_input, left_indices_col, oob_policy);
-  auto const right_result = cudf::gather(right_input, right_indices_col, oob_policy);
-
-  auto joined_cols = left_result->release();
-  auto right_cols  = right_result->release();
-  joined_cols.insert(joined_cols.end(),
-                     std::make_move_iterator(right_cols.begin()),
-                     std::make_move_iterator(right_cols.end()));
-  return std::make_unique<cudf::table>(std::move(joined_cols));
-}
-
-/**
- * @brief Apply an inner join operation to two tables
- *
- * @param left_input The left input table
- * @param right_input The right input table
- * @param left_on The columns to join on in the left table
- * @param right_on The columns to join on in the right table
- * @param compare_nulls The null equality policy
- */
-[[nodiscard]] std::unique_ptr<table_with_names> apply_inner_join(
-  std::unique_ptr<table_with_names> const& left_input,
-  std::unique_ptr<table_with_names> const& right_input,
-  std::vector<std::string> const& left_on,
-  std::vector<std::string> const& right_on,
-  cudf::null_equality compare_nulls = cudf::null_equality::EQUAL)
-{
-  CUDF_FUNC_RANGE();
-  std::vector<cudf::size_type> left_on_indices;
-  std::vector<cudf::size_type> right_on_indices;
-  std::transform(
-    left_on.begin(), left_on.end(), std::back_inserter(left_on_indices), [&](auto const& col_name) {
-      return left_input->col_id(col_name);
-    });
-  std::transform(right_on.begin(),
-                 right_on.end(),
-                 std::back_inserter(right_on_indices),
-                 [&](auto const& col_name) { return right_input->col_id(col_name); });
-  auto table = join_and_gather(
-    left_input->table(), right_input->table(), left_on_indices, right_on_indices, compare_nulls);
-  return std::make_unique<table_with_names>(
-    std::move(table), concat(left_input->column_names(), right_input->column_names()));
-}
-
-/**
- * @brief Apply a filter predicated to a table
- *
- * @param table The input table
- * @param predicate The filter predicate
- */
-[[nodiscard]] std::unique_ptr<table_with_names> apply_filter(
-  std::unique_ptr<table_with_names> const& table, cudf::ast::operation const& predicate)
-{
-  CUDF_FUNC_RANGE();
-  auto const boolean_mask = cudf::compute_column(table->table(), predicate);
-  auto result_table       = cudf::apply_boolean_mask(table->table(), boolean_mask->view());
-  return std::make_unique<table_with_names>(std::move(result_table), table->column_names());
-}
-
-/**
- * @brief Apply a boolean mask to a table
- *
- * @param table The input table
- * @param mask The boolean mask
- */
-[[nodiscard]] std::unique_ptr<table_with_names> apply_mask(
-  std::unique_ptr<table_with_names> const& table, std::unique_ptr<cudf::column> const& mask)
-{
-  CUDF_FUNC_RANGE();
-  auto result_table = cudf::apply_boolean_mask(table->table(), mask->view());
-  return std::make_unique<table_with_names>(std::move(result_table), table->column_names());
-}
-
-struct groupby_context_t {
-  std::vector<std::string> keys;
-  std::unordered_map<std::string, std::vector<std::pair<cudf::aggregation::Kind, std::string>>>
-    values;
-};
-
-/**
- * @brief Apply a groupby operation to a table
- *
- * @param table The input table
- * @param ctx The groupby context
- */
-[[nodiscard]] std::unique_ptr<table_with_names> apply_groupby(
-  std::unique_ptr<table_with_names> const& table, groupby_context_t const& ctx)
-{
-  CUDF_FUNC_RANGE();
-  auto const keys = table->select(ctx.keys);
-  cudf::groupby::groupby groupby_obj(keys);
-  std::vector<std::string> result_column_names;
-  result_column_names.insert(result_column_names.end(), ctx.keys.begin(), ctx.keys.end());
-  std::vector<cudf::groupby::aggregation_request> requests;
-  for (auto& [value_col, aggregations] : ctx.values) {
-    requests.emplace_back(cudf::groupby::aggregation_request());
-    for (auto& agg : aggregations) {
-      if (agg.first == cudf::aggregation::Kind::SUM) {
-        requests.back().aggregations.push_back(
-          cudf::make_sum_aggregation<cudf::groupby_aggregation>());
-      } else if (agg.first == cudf::aggregation::Kind::MEAN) {
-        requests.back().aggregations.push_back(
-          cudf::make_mean_aggregation<cudf::groupby_aggregation>());
-      } else if (agg.first == cudf::aggregation::Kind::COUNT_ALL) {
-        requests.back().aggregations.push_back(
-          cudf::make_count_aggregation<cudf::groupby_aggregation>());
-      } else {
-        throw std::runtime_error("Unsupported aggregation");
-      }
-      result_column_names.push_back(agg.second);
-    }
-    requests.back().values = table->column(value_col);
-  }
-  auto agg_results = groupby_obj.aggregate(requests);
-  std::vector<std::unique_ptr<cudf::column>> result_columns;
-  for (size_t i = 0; i < agg_results.first->num_columns(); i++) {
-    auto col = std::make_unique<cudf::column>(agg_results.first->get_column(i));
-    result_columns.push_back(std::move(col));
-  }
-  for (size_t i = 0; i < agg_results.second.size(); i++) {
-    for (size_t j = 0; j < agg_results.second[i].results.size(); j++) {
-      result_columns.push_back(std::move(agg_results.second[i].results[j]));
-    }
-  }
-  auto result_table = std::make_unique<cudf::table>(std::move(result_columns));
-  return std::make_unique<table_with_names>(std::move(result_table), result_column_names);
-}
-
-/**
- * @brief Apply an order by operation to a table
- *
- * @param table The input table
- * @param sort_keys The sort keys
- * @param sort_key_orders The sort key orders
- */
-[[nodiscard]] std::unique_ptr<table_with_names> apply_orderby(
-  std::unique_ptr<table_with_names> const& table,
-  std::vector<std::string> const& sort_keys,
-  std::vector<cudf::order> const& sort_key_orders)
-{
-  CUDF_FUNC_RANGE();
-  std::vector<cudf::column_view> column_views;
-  for (auto& key : sort_keys) {
-    column_views.push_back(table->column(key));
-  }
-  auto result_table =
-    cudf::sort_by_key(table->table(), cudf::table_view{column_views}, sort_key_orders);
-  return std::make_unique<table_with_names>(std::move(result_table), table->column_names());
-}
-
-/**
- * @brief Apply a reduction operation to a column
- *
- * @param column The input column
- * @param agg_kind The aggregation kind
- * @param col_name The name of the output column
- */
-[[nodiscard]] std::unique_ptr<table_with_names> apply_reduction(
-  cudf::column_view const& column,
-  cudf::aggregation::Kind const& agg_kind,
-  std::string const& col_name)
-{
-  CUDF_FUNC_RANGE();
-  auto const agg            = cudf::make_sum_aggregation<cudf::reduce_aggregation>();
-  auto const result         = cudf::reduce(column, *agg, column.type());
-  cudf::size_type const len = 1;
-  auto col                  = cudf::make_column_from_scalar(*result, len);
-  std::vector<std::unique_ptr<cudf::column>> columns;
-  columns.push_back(std::move(col));
-  auto result_table                  = std::make_unique<cudf::table>(std::move(columns));
-  std::vector<std::string> col_names = {col_name};
-  return std::make_unique<table_with_names>(std::move(result_table), col_names);
-}
-
-/**
- * @brief Read a parquet file into a table
- *
- * @param filename The path to the parquet file
- * @param columns The columns to read
- * @param predicate The filter predicate to pushdown
- */
-[[nodiscard]] std::unique_ptr<table_with_names> read_parquet(
-  std::string const& filename,
-  std::vector<std::string> const& columns                = {},
-  std::unique_ptr<cudf::ast::operation> const& predicate = nullptr)
-{
-  CUDF_FUNC_RANGE();
-  auto const source = cudf::io::source_info(filename);
-  auto builder      = cudf::io::parquet_reader_options_builder(source);
-  if (!columns.empty()) { builder.columns(columns); }
-  if (predicate) { builder.filter(*predicate); }
-  auto const options       = builder.build();
-  auto table_with_metadata = cudf::io::read_parquet(options);
-  std::vector<std::string> column_names;
-  for (auto const& col_info : table_with_metadata.metadata.schema_info) {
-    column_names.push_back(col_info.name);
-  }
-  return std::make_unique<table_with_names>(std::move(table_with_metadata.tbl), column_names);
-}
-
-/**
- * @brief Generate the `std::tm` structure from year, month, and day
- *
- * @param year The year
- * @param month The month
- * @param day The day
- */
-std::tm make_tm(int year, int month, int day)
-{
-  std::tm tm{};
-  tm.tm_year = year - 1900;
-  tm.tm_mon  = month - 1;
-  tm.tm_mday = day;
-  return tm;
-}
-
-/**
- * @brief Calculate the number of days since the UNIX epoch
- *
- * @param year The year
- * @param month The month
- * @param day The day
- */
-int32_t days_since_epoch(int year, int month, int day)
-{
-  std::tm tm             = make_tm(year, month, day);
-  std::tm epoch          = make_tm(1970, 1, 1);
-  std::time_t time       = std::mktime(&tm);
-  std::time_t epoch_time = std::mktime(&epoch);
-  double diff            = std::difftime(time, epoch_time) / (60 * 60 * 24);
-  return static_cast<int32_t>(diff);
-}
-
-struct tpch_example_args {
-  std::string dataset_dir;
-  std::string memory_resource_type;
-};
-
-/**
- * @brief Parse command line arguments into a struct
- *
- * @param argc The number of command line arguments
- * @param argv The command line arguments
- */
-tpch_example_args parse_args(int argc, char const** argv)
-{
-  if (argc < 3) {
-    std::string usage_message = "Usage: " + std::string(argv[0]) +
-                                " <dataset_dir> <memory_resource_type>\n The query result will be "
-                                "saved to a parquet file named q{query_no}.parquet in the current "
-                                "working directory ";
-    throw std::runtime_error(usage_message);
-  }
-  tpch_example_args args;
-  args.dataset_dir          = argv[1];
-  args.memory_resource_type = argv[2];
-  return args;
-}

From 4cdb1bf9cf7ad4f19b8abd034513172902d187a3 Mon Sep 17 00:00:00 2001
From: Matthew Murray <41342305+Matt711@users.noreply.github.com>
Date: Tue, 10 Sep 2024 20:20:29 -0400
Subject: [PATCH 797/842] [FEA] Add support for `cudf.NamedAgg` (#16744)

Closes #15118

Authors:
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/16744
---
 python/cudf/cudf/__init__.py               |  2 +-
 python/cudf/cudf/core/groupby/__init__.py  |  5 ++-
 python/cudf/cudf/core/groupby/groupby.py   | 46 ++++++++++++++++++++--
 python/cudf/cudf/tests/groupby/test_agg.py | 16 ++++++++
 4 files changed, 63 insertions(+), 6 deletions(-)

diff --git a/python/cudf/cudf/__init__.py b/python/cudf/cudf/__init__.py
index d7da42a1708..99b759e2166 100644
--- a/python/cudf/cudf/__init__.py
+++ b/python/cudf/cudf/__init__.py
@@ -46,7 +46,7 @@
     ListDtype,
     StructDtype,
 )
-from cudf.core.groupby import Grouper
+from cudf.core.groupby import Grouper, NamedAgg
 from cudf.core.index import (
     BaseIndex,
     CategoricalIndex,
diff --git a/python/cudf/cudf/core/groupby/__init__.py b/python/cudf/cudf/core/groupby/__init__.py
index 4375ed3e3da..621edb316cf 100644
--- a/python/cudf/cudf/core/groupby/__init__.py
+++ b/python/cudf/cudf/core/groupby/__init__.py
@@ -1,8 +1,9 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 
-from cudf.core.groupby.groupby import GroupBy, Grouper
+from cudf.core.groupby.groupby import GroupBy, Grouper, NamedAgg
 
 __all__ = [
     "GroupBy",
     "Grouper",
+    "NamedAgg",
 ]
diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index 4f283d41b17..6424c8af877 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -76,6 +76,34 @@ def _is_row_of(chunk, obj):
     )
 
 
+NamedAgg = pd.NamedAgg
+
+
+NamedAgg.__doc__ = """
+Helper for column specific aggregation with control over output column names.
+
+Subclass of typing.NamedTuple.
+
+Parameters
+----------
+column : Hashable
+    Column label in the DataFrame to apply aggfunc.
+aggfunc : function or str
+    Function to apply to the provided column.
+
+Examples
+--------
+>>> df = cudf.DataFrame({"key": [1, 1, 2], "a": [-1, 0, 1], 1: [10, 11, 12]})
+>>> agg_a = cudf.NamedAgg(column="a", aggfunc="min")
+>>> agg_1 = cudf.NamedAgg(column=1, aggfunc=lambda x: x.mean())
+>>> df.groupby("key").agg(result_a=agg_a, result_1=agg_1)
+        result_a  result_1
+key
+1          -1      10.5
+2           1      12.0
+"""
+
+
 groupby_doc_template = textwrap.dedent(
     """Group using a mapper or by a Series of columns.
 
@@ -1296,9 +1324,21 @@ def _normalize_aggs(
                 columns = values._columns
                 aggs_per_column = (aggs,) * len(columns)
         elif not aggs and kwargs:
-            column_names, aggs_per_column = kwargs.keys(), kwargs.values()
-            columns = tuple(self.obj._data[x[0]] for x in kwargs.values())
-            aggs_per_column = tuple(x[1] for x in kwargs.values())
+            column_names = kwargs.keys()
+
+            def _raise_invalid_type(x):
+                raise TypeError(
+                    f"Invalid keyword argument {x} of type {type(x)} was passed to agg"
+                )
+
+            columns, aggs_per_column = zip(
+                *(
+                    (self.obj._data[x[0]], x[1])
+                    if isinstance(x, tuple)
+                    else _raise_invalid_type(x)
+                    for x in kwargs.values()
+                )
+            )
         else:
             raise TypeError("Must provide at least one aggregation function.")
 
diff --git a/python/cudf/cudf/tests/groupby/test_agg.py b/python/cudf/cudf/tests/groupby/test_agg.py
index 99e7523031b..dc20a27177a 100644
--- a/python/cudf/cudf/tests/groupby/test_agg.py
+++ b/python/cudf/cudf/tests/groupby/test_agg.py
@@ -56,3 +56,19 @@ def test_dataframe_agg(attr, func):
     )
 
     assert_eq(agg, pd_agg)
+
+    agg = getattr(df.groupby("a"), attr)(
+        foo=cudf.NamedAgg(column="b", aggfunc=func),
+        bar=cudf.NamedAgg(column="a", aggfunc=func),
+    )
+    pd_agg = getattr(pdf.groupby(["a"]), attr)(
+        foo=("b", func), bar=("a", func)
+    )
+
+    assert_eq(agg, pd_agg)
+
+
+def test_dataframe_agg_with_invalid_kwarg():
+    with pytest.raises(TypeError, match="Invalid keyword argument"):
+        df = cudf.DataFrame({"a": [1, 2, 1, 2], "b": [0, 0, 0, 0]})
+        df.groupby("a").agg(foo=set())

From 750adca4e4cc7b18ef80ba39950ed1d250919016 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Tue, 10 Sep 2024 17:40:49 -0700
Subject: [PATCH 798/842] nvCOMP GZIP integration (#16770)

nvCOMP GZIP integration. Opt-in for now.

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Mark Harris (https://github.com/harrism)
  - Nghia Truong (https://github.com/ttnghia)
  - Muhammad Haseeb (https://github.com/mhaseeb123)

URL: https://github.com/rapidsai/cudf/pull/16770
---
 cpp/include/cudf/io/nvcomp_adapter.hpp     |  2 +-
 cpp/src/io/comp/nvcomp_adapter.cpp         | 14 +++++++++++---
 cpp/src/io/parquet/reader_impl_chunking.cu | 14 ++++++++++++--
 docs/cudf/source/user_guide/io/io.md       |  6 +++++-
 4 files changed, 29 insertions(+), 7 deletions(-)

diff --git a/cpp/include/cudf/io/nvcomp_adapter.hpp b/cpp/include/cudf/io/nvcomp_adapter.hpp
index e7fe3cc7214..0d74a4158ad 100644
--- a/cpp/include/cudf/io/nvcomp_adapter.hpp
+++ b/cpp/include/cudf/io/nvcomp_adapter.hpp
@@ -24,7 +24,7 @@
 namespace CUDF_EXPORT cudf {
 namespace io::nvcomp {
 
-enum class compression_type { SNAPPY, ZSTD, DEFLATE, LZ4 };
+enum class compression_type { SNAPPY, ZSTD, DEFLATE, LZ4, GZIP };
 
 /**
  * @brief Set of parameters that impact whether nvCOMP features are enabled.
diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp
index 261a8eb401d..c3187f73a95 100644
--- a/cpp/src/io/comp/nvcomp_adapter.cpp
+++ b/cpp/src/io/comp/nvcomp_adapter.cpp
@@ -23,6 +23,7 @@
 #include <cudf/utilities/error.hpp>
 
 #include <nvcomp/deflate.h>
+#include <nvcomp/gzip.h>
 #include <nvcomp/lz4.h>
 #include <nvcomp/snappy.h>
 #include <nvcomp/zstd.h>
@@ -44,6 +45,8 @@ auto batched_decompress_get_temp_size_ex(compression_type compression, Args&&...
       return nvcompBatchedLZ4DecompressGetTempSizeEx(std::forward<Args>(args)...);
     case compression_type::DEFLATE:
       return nvcompBatchedDeflateDecompressGetTempSizeEx(std::forward<Args>(args)...);
+    case compression_type::GZIP:
+      return nvcompBatchedGzipDecompressGetTempSizeEx(std::forward<Args>(args)...);
     default: CUDF_FAIL("Unsupported compression type");
   }
 }
@@ -73,6 +76,8 @@ auto batched_decompress_async(compression_type compression, Args&&... args)
     case compression_type::DEFLATE:
       return nvcompBatchedDeflateDecompressAsync(std::forward<Args>(args)...);
     case compression_type::LZ4: return nvcompBatchedLZ4DecompressAsync(std::forward<Args>(args)...);
+    case compression_type::GZIP:
+      return nvcompBatchedGzipDecompressAsync(std::forward<Args>(args)...);
     default: CUDF_FAIL("Unsupported compression type");
   }
 }
@@ -84,6 +89,7 @@ std::string compression_type_name(compression_type compression)
     case compression_type::ZSTD: return "Zstandard";
     case compression_type::DEFLATE: return "Deflate";
     case compression_type::LZ4: return "LZ4";
+    case compression_type::GZIP: return "GZIP";
   }
   return "compression_type(" + std::to_string(static_cast<int>(compression)) + ")";
 }
@@ -359,8 +365,8 @@ std::optional<std::string> is_compression_disabled_impl(compression_type compres
         return "nvCOMP use is disabled through the `LIBCUDF_NVCOMP_POLICY` environment variable.";
       }
       return std::nullopt;
+    default: return "Unsupported compression type";
   }
-  return "Unsupported compression type";
 }
 
 std::optional<std::string> is_compression_disabled(compression_type compression,
@@ -396,7 +402,8 @@ std::optional<std::string> is_decompression_disabled_impl(compression_type compr
                                                           feature_status_parameters params)
 {
   switch (compression) {
-    case compression_type::DEFLATE: {
+    case compression_type::DEFLATE:
+    case compression_type::GZIP: {
       if (not params.are_all_integrations_enabled) {
         return "DEFLATE decompression is experimental, you can enable it through "
                "`LIBCUDF_NVCOMP_POLICY` environment variable.";
@@ -447,6 +454,7 @@ std::optional<std::string> is_decompression_disabled(compression_type compressio
 size_t required_alignment(compression_type compression)
 {
   switch (compression) {
+    case compression_type::GZIP:
     case compression_type::DEFLATE: return nvcompDeflateRequiredAlignment;
     case compression_type::SNAPPY: return nvcompSnappyRequiredAlignment;
     case compression_type::ZSTD: return nvcompZstdRequiredAlignment;
@@ -462,7 +470,7 @@ std::optional<size_t> compress_max_allowed_chunk_size(compression_type compressi
     case compression_type::SNAPPY: return nvcompSnappyCompressionMaxAllowedChunkSize;
     case compression_type::ZSTD: return nvcompZstdCompressionMaxAllowedChunkSize;
     case compression_type::LZ4: return nvcompLZ4CompressionMaxAllowedChunkSize;
-    default: return std::nullopt;
+    default: CUDF_FAIL("Unsupported compression type");
   }
 }
 
diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu
index 84f0dab0d8b..245e1829c72 100644
--- a/cpp/src/io/parquet/reader_impl_chunking.cu
+++ b/cpp/src/io/parquet/reader_impl_chunking.cu
@@ -865,8 +865,18 @@ std::vector<row_range> compute_page_splits_by_row(device_span<cumulative_page_in
 
     switch (codec.compression_type) {
       case GZIP:
-        gpuinflate(
-          d_comp_in_view, d_comp_out_view, d_comp_res_view, gzip_header_included::YES, stream);
+        if (cudf::io::nvcomp_integration::is_all_enabled()) {
+          nvcomp::batched_decompress(nvcomp::compression_type::GZIP,
+                                     d_comp_in_view,
+                                     d_comp_out_view,
+                                     d_comp_res_view,
+                                     codec.max_decompressed_size,
+                                     codec.total_decomp_size,
+                                     stream);
+        } else {
+          gpuinflate(
+            d_comp_in_view, d_comp_out_view, d_comp_res_view, gzip_header_included::YES, stream);
+        }
         break;
       case SNAPPY:
         if (cudf::io::nvcomp_integration::is_stable_enabled()) {
diff --git a/docs/cudf/source/user_guide/io/io.md b/docs/cudf/source/user_guide/io/io.md
index adcdaa51e7e..97b961b455b 100644
--- a/docs/cudf/source/user_guide/io/io.md
+++ b/docs/cudf/source/user_guide/io/io.md
@@ -75,7 +75,6 @@ IO format.
 
 </div>
 
-
 **Notes:**
 
 - \[¹\] - Not all orientations are GPU-accelerated.
@@ -177,4 +176,9 @@ If no value is set, behavior will be the same as the "STABLE" option.
     +-----------------------+--------+--------+--------------+--------------+---------+--------+--------------+--------------+--------+
     | DEFLATE               | ❌     | ❌     | ❌           | ❌           | ❌      | ❌     | Experimental | Experimental | ❌     |
     +-----------------------+--------+--------+--------------+--------------+---------+--------+--------------+--------------+--------+
+    | LZ4                   | ❌     | ❌     | Stable       | Stable       | ❌      | ❌     | Stable       | Stable       | ❌     |
+    +-----------------------+--------+--------+--------------+--------------+---------+--------+--------------+--------------+--------+
+    | GZIP                  | ❌     | ❌     | Experimental | Experimental | ❌      | ❌     | ❌           | ❌           | ❌     |
+    +-----------------------+--------+--------+--------------+--------------+---------+--------+--------------+--------------+--------+
+
 ```

From ab5ba4eb45d6861d19ca345e471567c50799ed0f Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Wed, 11 Sep 2024 12:16:40 +0000
Subject: [PATCH 799/842] test

---
 python/cudf/cudf/pandas/scripts/conftest-patch.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/pandas/scripts/conftest-patch.py b/python/cudf/cudf/pandas/scripts/conftest-patch.py
index 6bda9b4d2a3..db19d6a6471 100644
--- a/python/cudf/cudf/pandas/scripts/conftest-patch.py
+++ b/python/cudf/cudf/pandas/scripts/conftest-patch.py
@@ -90,7 +90,7 @@ def pytest_runtest_teardown(item, nextitem):
         worker_id = os.getenv("PYTEST_XDIST_WORKER", "master")
         output_file = f'function_call_counts_{os.path.basename(item.nodeid.split("::")[0])}_{worker_id}.json'
         with open(output_file, "w") as f:
-            json.dump(function_call_counts, f, indent=4)
+            json.dump(dict(function_call_counts), f, indent=4)
         print(f"Function call counts have been written to {output_file}")
 
 
@@ -109,7 +109,7 @@ def pytest_unconfigure(config):
         worker_id = config.workerinput["workerid"]
         output_file = f"function_call_counts_worker_{worker_id}.json"
         with open(output_file, "w") as f:
-            json.dump(function_call_counts, f, indent=4)
+            json.dump(dict(function_call_counts), f, indent=4)
         print(f"Function call counts have been written to {output_file}")
 
 
From 3b7d7405729c97420d01acd6613450b7c21f1260 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Wed, 11 Sep 2024 12:31:35 +0000
Subject: [PATCH 800/842] test

---
 python/cudf/cudf/pandas/scripts/run-pandas-tests.sh | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
index 74f52dee4fc..36c73ba3e26 100755
--- a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
+++ b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
@@ -149,6 +149,5 @@ ls -al
 ls -al tests/
 cd ..
 ls -al
-ls -al pandas-testing/
-ls -al pandas-testing/pandas-tests/
+ls -al pandas-tests/
 rm -rf pandas-testing/pandas-tests/

From 9acbaf88cbe025a9dd2ccf1208828de2598e0199 Mon Sep 17 00:00:00 2001
From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com>
Date: Wed, 11 Sep 2024 09:34:21 -0500
Subject: [PATCH 801/842] JSON reader validation of values (#15968)

Addresses part of https://github.com/rapidsai/cudf/issues/15222
This change adds validation stage in JSON reader at tokens level. If any validation fails in a row, it will make the entire row as null.

- [x] validation functor - implement spark validation rules. (@revans2 implemented all validation rules)
- [x] move output iterator to thrust. (already merged by https://github.com/NVIDIA/cccl/pull/2282)
- [x] Fix failing tests and infer data type for Float.

Authors:
  - Karthikeyan (https://github.com/karthikeyann)
  - Robert (Bobby) Evans (https://github.com/revans2)
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - Robert (Bobby) Evans (https://github.com/revans2)
  - Bradley Dice (https://github.com/bdice)
  - MithunR (https://github.com/mythrocks)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/15968
---
 cpp/CMakeLists.txt                            |   1 +
 cpp/include/cudf/io/json.hpp                  | 190 +++++++++++
 cpp/src/io/json/json_normalization.cu         |   3 +
 cpp/src/io/json/nested_json.hpp               |  15 +
 cpp/src/io/json/nested_json_gpu.cu            |   5 +-
 cpp/src/io/json/process_tokens.cu             | 310 ++++++++++++++++++
 cpp/src/io/json/tabulate_output_iterator.cuh  | 132 ++++++++
 cpp/tests/io/json/json_test.cpp               |  80 +++++
 .../main/java/ai/rapids/cudf/JSONOptions.java |  65 +++-
 java/src/main/java/ai/rapids/cudf/Table.java  |  73 ++++-
 java/src/main/native/src/TableJni.cpp         |  66 +++-
 .../test/java/ai/rapids/cudf/TableTest.java   | 201 ++++++++++++
 12 files changed, 1113 insertions(+), 28 deletions(-)
 create mode 100644 cpp/src/io/json/process_tokens.cu
 create mode 100644 cpp/src/io/json/tabulate_output_iterator.cuh

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 1040fcb7b91..7bc01e64441 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -384,6 +384,7 @@ add_library(
   src/io/json/nested_json_gpu.cu
   src/io/json/read_json.cu
   src/io/json/parser_features.cpp
+  src/io/json/process_tokens.cu
   src/io/json/write_json.cu
   src/io/orc/aggregate_orc_metadata.cpp
   src/io/orc/dict_enc.cu
diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
index a3d6533705e..ff25a5bacae 100644
--- a/cpp/include/cudf/io/json.hpp
+++ b/cpp/include/cudf/io/json.hpp
@@ -20,6 +20,7 @@
 
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/error.hpp>
 #include <cudf/utilities/memory_resource.hpp>
 
 #include <map>
@@ -128,6 +129,19 @@ class json_reader_options {
   // Whether to recover after an invalid JSON line
   json_recovery_mode_t _recovery_mode = json_recovery_mode_t::FAIL;
 
+  // Validation checks for spark
+  // Should the json validation be strict or not
+  // Note: strict validation enforces the JSON specification https://www.json.org/json-en.html
+  bool _strict_validation = false;
+  // Allow leading zeros for numeric values.
+  bool _allow_numeric_leading_zeros = true;
+  // Allow non-numeric numbers: NaN, +INF, -INF, +Infinity, Infinity, -Infinity
+  bool _allow_nonnumeric_numbers = true;
+  // Allow unquoted control characters
+  bool _allow_unquoted_control_chars = true;
+  // Additional values to recognize as null values
+  std::vector<std::string> _na_values;
+
   /**
    * @brief Constructor from source info.
    *
@@ -298,6 +312,55 @@ class json_reader_options {
    */
   [[nodiscard]] json_recovery_mode_t recovery_mode() const { return _recovery_mode; }
 
+  /**
+   * @brief Whether json validation should be enforced strictly or not.
+   *
+   * @return true if it should be.
+   */
+  [[nodiscard]] bool is_strict_validation() const { return _strict_validation; }
+
+  /**
+   * @brief Whether leading zeros are allowed in numeric values.
+   *
+   * @note: This validation is enforced only if strict validation is enabled.
+   *
+   * @return true if leading zeros are allowed in numeric values
+   */
+  [[nodiscard]] bool is_allowed_numeric_leading_zeros() const
+  {
+    return _allow_numeric_leading_zeros;
+  }
+
+  /**
+   * @brief Whether unquoted number values should be allowed NaN, +INF, -INF, +Infinity, Infinity,
+   * and -Infinity.
+   *
+   * @note: This validation is enforced only if strict validation is enabled.
+   *
+   * @return true if leading zeros are allowed in numeric values
+   */
+  [[nodiscard]] bool is_allowed_nonnumeric_numbers() const { return _allow_nonnumeric_numbers; }
+
+  /**
+   * @brief Whether in a quoted string should characters greater than or equal to 0 and less than 32
+   * be allowed without some form of escaping.
+   *
+   * @note: This validation is enforced only if strict validation is enabled.
+   *
+   * @return true if unquoted control chars are allowed.
+   */
+  [[nodiscard]] bool is_allowed_unquoted_control_chars() const
+  {
+    return _allow_unquoted_control_chars;
+  }
+
+  /**
+   * @brief Returns additional values to recognize as null values.
+   *
+   * @return Additional values to recognize as null values
+   */
+  [[nodiscard]] std::vector<std::string> const& get_na_values() const { return _na_values; }
+
   /**
    * @brief Set data types for columns to be read.
    *
@@ -427,6 +490,63 @@ class json_reader_options {
    * @param val An enum value to indicate the JSON reader's behavior on invalid JSON lines.
    */
   void set_recovery_mode(json_recovery_mode_t val) { _recovery_mode = val; }
+
+  /**
+   * @brief Set whether strict validation is enabled or not.
+   *
+   * @param val Boolean value to indicate whether strict validation is enabled.
+   */
+  void set_strict_validation(bool val) { _strict_validation = val; }
+
+  /**
+   * @brief Set whether leading zeros are allowed in numeric values. Strict validation
+   * must be enabled for this to work.
+   *
+   * @throw cudf::logic_error if `strict_validation` is not enabled before setting this option.
+   *
+   * @param val Boolean value to indicate whether leading zeros are allowed in numeric values
+   */
+  void allow_numeric_leading_zeros(bool val)
+  {
+    CUDF_EXPECTS(_strict_validation, "Strict validation must be enabled for this to work.");
+    _allow_numeric_leading_zeros = val;
+  }
+
+  /**
+   * @brief Set whether unquoted number values should be allowed NaN, +INF, -INF, +Infinity,
+   * Infinity, and -Infinity. Strict validation must be enabled for this to work.
+   *
+   * @throw cudf::logic_error if `strict_validation` is not enabled before setting this option.
+   *
+   * @param val Boolean value to indicate whether leading zeros are allowed in numeric values
+   */
+  void allow_nonnumeric_numbers(bool val)
+  {
+    CUDF_EXPECTS(_strict_validation, "Strict validation must be enabled for this to work.");
+    _allow_nonnumeric_numbers = val;
+  }
+
+  /**
+   * @brief Set whether in a quoted string should characters greater than or equal to 0
+   * and less than 32 be allowed without some form of escaping. Strict validation must
+   * be enabled for this to work.
+   *
+   * @throw cudf::logic_error if `strict_validation` is not enabled before setting this option.
+   *
+   * @param val true to indicate whether unquoted control chars are allowed.
+   */
+  void allow_unquoted_control_chars(bool val)
+  {
+    CUDF_EXPECTS(_strict_validation, "Strict validation must be enabled for this to work.");
+    _allow_unquoted_control_chars = val;
+  }
+
+  /**
+   * @brief Sets additional values to recognize as null values.
+   *
+   * @param vals Vector of values to be considered to be null
+   */
+  void set_na_values(std::vector<std::string> vals) { _na_values = std::move(vals); }
 };
 
 /**
@@ -638,6 +758,76 @@ class json_reader_options_builder {
     return *this;
   }
 
+  /**
+   * @brief Set whether json validation should be strict or not.
+   *
+   * @param val Boolean value to indicate whether json validation should be strict or not.
+   * @return this for chaining
+   */
+  json_reader_options_builder& strict_validation(bool val)
+  {
+    options.set_strict_validation(val);
+    return *this;
+  }
+
+  /**
+   * @brief Set Whether leading zeros are allowed in numeric values. Strict validation must
+   * be enabled for this to have any effect.
+   *
+   * @throw cudf::logic_error if `strict_validation` is not enabled before setting this option.
+   *
+   * @param val Boolean value to indicate whether leading zeros are allowed in numeric values
+   * @return this for chaining
+   */
+  json_reader_options_builder& numeric_leading_zeros(bool val)
+  {
+    options.allow_numeric_leading_zeros(val);
+    return *this;
+  }
+
+  /**
+   * @brief Set whether specific unquoted number values are valid JSON. The values are NaN,
+   * +INF, -INF, +Infinity, Infinity, and -Infinity.
+   * Strict validation must be enabled for this to have any effect.
+   *
+   * @throw cudf::logic_error if `strict_validation` is not enabled before setting this option.
+   *
+   * @param val Boolean value to indicate if unquoted nonnumeric values are valid json or not.
+   * @return this for chaining
+   */
+  json_reader_options_builder& nonnumeric_numbers(bool val)
+  {
+    options.allow_nonnumeric_numbers(val);
+    return *this;
+  }
+
+  /**
+   * @brief Set whether chars >= 0 and < 32 are allowed in a quoted string without
+   * some form of escaping. Strict validation must be enabled for this to have any effect.
+   *
+   * @throw cudf::logic_error if `strict_validation` is not enabled before setting this option.
+   *
+   * @param val Boolean value to indicate if unquoted control chars are allowed or not.
+   * @return this for chaining
+   */
+  json_reader_options_builder& unquoted_control_chars(bool val)
+  {
+    options.allow_unquoted_control_chars(val);
+    return *this;
+  }
+
+  /**
+   * @brief Sets additional values to recognize as null values.
+   *
+   * @param vals Vector of values to be considered to be null
+   * @return this for chaining
+   */
+  json_reader_options_builder& na_values(std::vector<std::string> vals)
+  {
+    options.set_na_values(std::move(vals));
+    return *this;
+  }
+
   /**
    * @brief move json_reader_options member once it's built.
    */
diff --git a/cpp/src/io/json/json_normalization.cu b/cpp/src/io/json/json_normalization.cu
index 7899ea7bac4..97d5884fef1 100644
--- a/cpp/src/io/json/json_normalization.cu
+++ b/cpp/src/io/json/json_normalization.cu
@@ -16,6 +16,7 @@
 
 #include "io/fst/lookup_tables.cuh"
 
+#include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/io/detail/json.hpp>
 #include <cudf/types.hpp>
 #include <cudf/utilities/memory_resource.hpp>
@@ -302,6 +303,7 @@ void normalize_single_quotes(datasource::owning_buffer<rmm::device_buffer>& inda
                              rmm::cuda_stream_view stream,
                              rmm::device_async_resource_ref mr)
 {
+  CUDF_FUNC_RANGE();
   static constexpr std::int32_t min_out = 0;
   static constexpr std::int32_t max_out = 2;
   auto parser =
@@ -330,6 +332,7 @@ void normalize_whitespace(datasource::owning_buffer<rmm::device_buffer>& indata,
                           rmm::cuda_stream_view stream,
                           rmm::device_async_resource_ref mr)
 {
+  CUDF_FUNC_RANGE();
   static constexpr std::int32_t min_out = 0;
   static constexpr std::int32_t max_out = 2;
   auto parser =
diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp
index b06458e1a8e..75639a0438f 100644
--- a/cpp/src/io/json/nested_json.hpp
+++ b/cpp/src/io/json/nested_json.hpp
@@ -225,6 +225,21 @@ std::pair<rmm::device_uvector<PdaTokenT>, rmm::device_uvector<SymbolOffsetT>> pr
   device_span<SymbolOffsetT const> token_indices,
   rmm::cuda_stream_view stream);
 
+/**
+ * @brief Validate the tokens conforming to behavior given in options.
+ *
+ * @param d_input The string of input characters
+ * @param tokens The tokens to be post-processed
+ * @param token_indices The tokens' corresponding indices that are post-processed
+ * @param options Parsing options specifying the parsing behaviour
+ * @param stream The cuda stream to dispatch GPU kernels to
+ */
+void validate_token_stream(device_span<char const> d_input,
+                           device_span<PdaTokenT> tokens,
+                           device_span<SymbolOffsetT> token_indices,
+                           cudf::io::json_reader_options const& options,
+                           rmm::cuda_stream_view stream);
+
 /**
  * @brief Parses the given JSON string and generates a tree representation of the given input.
  *
diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu
index d76e5447c30..4e513d3495c 100644
--- a/cpp/src/io/json/nested_json_gpu.cu
+++ b/cpp/src/io/json/nested_json_gpu.cu
@@ -1660,6 +1660,7 @@ std::pair<rmm::device_uvector<PdaTokenT>, rmm::device_uvector<SymbolOffsetT>> ge
 
   if (delimiter_offset == 1) {
     tokens.set_element(0, token_t::LineEnd, stream);
+    validate_token_stream(json_in, tokens, tokens_indices, options, stream);
     auto [filtered_tokens, filtered_tokens_indices] =
       process_token_stream(tokens, tokens_indices, stream);
     tokens         = std::move(filtered_tokens);
@@ -2082,7 +2083,9 @@ cudf::io::parse_options parsing_options(cudf::io::json_reader_options const& opt
   parse_opts.keepquotes = options.is_enabled_keep_quotes();
   parse_opts.trie_true  = cudf::detail::create_serialized_trie({"true"}, stream);
   parse_opts.trie_false = cudf::detail::create_serialized_trie({"false"}, stream);
-  parse_opts.trie_na    = cudf::detail::create_serialized_trie({"", "null"}, stream);
+  std::vector<std::string> na_values{"", "null"};
+  na_values.insert(na_values.end(), options.get_na_values().begin(), options.get_na_values().end());
+  parse_opts.trie_na = cudf::detail::create_serialized_trie(na_values, stream);
   return parse_opts;
 }
 
diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu
new file mode 100644
index 00000000000..83c7b663980
--- /dev/null
+++ b/cpp/src/io/json/process_tokens.cu
@@ -0,0 +1,310 @@
+
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "io/utilities/trie.cuh"
+#include "nested_json.hpp"
+#include "tabulate_output_iterator.cuh"
+
+#include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/detail/utilities/vector_factories.hpp>
+#include <cudf/io/detail/tokenize_json.hpp>
+
+#include <rmm/device_uvector.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <cuda/functional>
+#include <thrust/transform_scan.h>
+
+namespace cudf::io::json {
+namespace detail {
+
+struct write_if {
+  using token_t   = cudf::io::json::token_t;
+  using scan_type = thrust::pair<token_t, bool>;
+  PdaTokenT* tokens;
+  size_t n;
+  // Index, value
+  __device__ void operator()(size_type i, scan_type x)
+  {
+    if (i == n - 1 or tokens[i + 1] == token_t::LineEnd) {
+      if (x.first == token_t::ErrorBegin and tokens[i] != token_t::ErrorBegin) {
+        tokens[i] = token_t::ErrorBegin;
+      }
+    }
+  }
+};
+
+enum class number_state {
+  START = 0,
+  SAW_NEG,  // not a complete state
+  LEADING_ZERO,
+  WHOLE,
+  SAW_RADIX,  // not a complete state
+  FRACTION,
+  START_EXPONENT,       // not a complete state
+  AFTER_SIGN_EXPONENT,  // not a complete state
+  EXPONENT
+};
+
+enum class string_state {
+  NORMAL = 0,
+  ESCAPED,   // not a complete state
+  ESCAPED_U  // not a complete state
+};
+
+__device__ inline bool substr_eq(const char* data,
+                                 SymbolOffsetT const start,
+                                 SymbolOffsetT const end,
+                                 SymbolOffsetT const expected_len,
+                                 const char* expected)
+{
+  if (end - start != expected_len) { return false; }
+  for (auto idx = 0; idx < expected_len; idx++) {
+    if (data[start + idx] != expected[idx]) { return false; }
+  }
+  return true;
+}
+
+void validate_token_stream(device_span<char const> d_input,
+                           device_span<PdaTokenT> tokens,
+                           device_span<SymbolOffsetT> token_indices,
+                           cudf::io::json_reader_options const& options,
+                           rmm::cuda_stream_view stream)
+{
+  CUDF_FUNC_RANGE();
+  if (!options.is_strict_validation()) { return; }
+  using token_t = cudf::io::json::token_t;
+  cudf::detail::optional_trie trie_na =
+    cudf::detail::create_serialized_trie(options.get_na_values(), stream);
+  auto trie_na_view    = cudf::detail::make_trie_view(trie_na);
+  auto validate_values = cuda::proclaim_return_type<bool>(
+    [data                        = d_input.data(),
+     trie_na                     = trie_na_view,
+     allow_numeric_leading_zeros = options.is_allowed_numeric_leading_zeros(),
+     allow_nonnumeric =
+       options.is_allowed_nonnumeric_numbers()] __device__(SymbolOffsetT start,
+                                                           SymbolOffsetT end) -> bool {
+      // This validates an unquoted value. A value must match https://www.json.org/json-en.html
+      // but the leading and training whitespace should already have been removed, and is not
+      // a string
+      auto c               = data[start];
+      auto is_null_literal = serialized_trie_contains(trie_na, {data + start, end - start});
+      if (is_null_literal) {
+        return true;
+      } else if ('n' == c) {
+        return substr_eq(data, start, end, 4, "null");
+      } else if ('t' == c) {
+        return substr_eq(data, start, end, 4, "true");
+      } else if ('f' == c) {
+        return substr_eq(data, start, end, 5, "false");
+      } else if (allow_nonnumeric && c == 'N') {
+        return substr_eq(data, start, end, 3, "NaN");
+      } else if (allow_nonnumeric && c == 'I') {
+        return substr_eq(data, start, end, 8, "Infinity");
+      } else if (allow_nonnumeric && c == '+') {
+        return substr_eq(data, start, end, 4, "+INF") ||
+               substr_eq(data, start, end, 9, "+Infinity");
+      } else if ('-' == c || c <= '9' && 'c' >= '0') {
+        // number
+        auto num_state = number_state::START;
+        for (auto at = start; at < end; at++) {
+          c = data[at];
+          switch (num_state) {
+            case number_state::START:
+              if ('-' == c) {
+                num_state = number_state::SAW_NEG;
+              } else if ('0' == c) {
+                num_state = number_state::LEADING_ZERO;
+              } else if (c >= '1' && c <= '9') {
+                num_state = number_state::WHOLE;
+              } else {
+                return false;
+              }
+              break;
+            case number_state::SAW_NEG:
+              if ('0' == c) {
+                num_state = number_state::LEADING_ZERO;
+              } else if (c >= '1' && c <= '9') {
+                num_state = number_state::WHOLE;
+              } else if (allow_nonnumeric && 'I' == c) {
+                return substr_eq(data, start, end, 4, "-INF") ||
+                       substr_eq(data, start, end, 9, "-Infinity");
+              } else {
+                return false;
+              }
+              break;
+            case number_state::LEADING_ZERO:
+              if (allow_numeric_leading_zeros && c >= '0' && c <= '9') {
+                num_state = number_state::WHOLE;
+              } else if ('.' == c) {
+                num_state = number_state::SAW_RADIX;
+              } else if ('e' == c || 'E' == c) {
+                num_state = number_state::START_EXPONENT;
+              } else {
+                return false;
+              }
+              break;
+            case number_state::WHOLE:
+              if (c >= '0' && c <= '9') {
+                num_state = number_state::WHOLE;
+              } else if ('.' == c) {
+                num_state = number_state::SAW_RADIX;
+              } else if ('e' == c || 'E' == c) {
+                num_state = number_state::START_EXPONENT;
+              } else {
+                return false;
+              }
+              break;
+            case number_state::SAW_RADIX:
+              if (c >= '0' && c <= '9') {
+                num_state = number_state::FRACTION;
+              } else if ('e' == c || 'E' == c) {
+                num_state = number_state::START_EXPONENT;
+              } else {
+                return false;
+              }
+              break;
+            case number_state::FRACTION:
+              if (c >= '0' && c <= '9') {
+                num_state = number_state::FRACTION;
+              } else if ('e' == c || 'E' == c) {
+                num_state = number_state::START_EXPONENT;
+              } else {
+                return false;
+              }
+              break;
+            case number_state::START_EXPONENT:
+              if ('+' == c || '-' == c) {
+                num_state = number_state::AFTER_SIGN_EXPONENT;
+              } else if (c >= '0' && c <= '9') {
+                num_state = number_state::EXPONENT;
+              } else {
+                return false;
+              }
+              break;
+            case number_state::AFTER_SIGN_EXPONENT:
+              if (c >= '0' && c <= '9') {
+                num_state = number_state::EXPONENT;
+              } else {
+                return false;
+              }
+              break;
+            case number_state::EXPONENT:
+              if (c >= '0' && c <= '9') {
+                num_state = number_state::EXPONENT;
+              } else {
+                return false;
+              }
+              break;
+          }
+        }
+        return num_state != number_state::AFTER_SIGN_EXPONENT &&
+               num_state != number_state::START_EXPONENT && num_state != number_state::SAW_NEG &&
+               num_state != number_state::SAW_RADIX;
+      } else {
+        return false;
+      }
+    });
+
+  auto validate_strings = cuda::proclaim_return_type<bool>(
+    [data = d_input.data(),
+     allow_unquoted_control_chars =
+       options.is_allowed_unquoted_control_chars()] __device__(SymbolOffsetT start,
+                                                               SymbolOffsetT end) -> bool {
+      // This validates a quoted string. A string must match https://www.json.org/json-en.html
+      // but we already know that it has a starting and ending " and all white space has been
+      // stripped out. Also the base CUDF validation makes sure escaped chars are correct
+      // so we only need to worry about unquoted control chars
+
+      auto state   = string_state::NORMAL;
+      auto u_count = 0;
+      for (SymbolOffsetT idx = start + 1; idx < end; idx++) {
+        auto c = data[idx];
+        if (!allow_unquoted_control_chars && static_cast<int>(c) >= 0 && static_cast<int>(c) < 32) {
+          return false;
+        }
+
+        switch (state) {
+          case string_state::NORMAL:
+            if (c == '\\') { state = string_state::ESCAPED; }
+            break;
+          case string_state::ESCAPED:
+            // in Spark you can allow any char to be escaped, but CUDF
+            // validates it in some cases so we need to also validate it.
+            if (c == 'u') {
+              state   = string_state::ESCAPED_U;
+              u_count = 0;
+            } else if (c == '"' || c == '\\' || c == '/' || c == 'b' || c == 'f' || c == 'n' ||
+                       c == 'r' || c == 't') {
+              state = string_state::NORMAL;
+            } else {
+              return false;
+            }
+            break;
+          case string_state::ESCAPED_U:
+            if ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')) {
+              u_count++;
+              if (u_count == 4) {
+                state   = string_state::NORMAL;
+                u_count = 0;
+              }
+            } else {
+              return false;
+            }
+            break;
+        }
+      }
+      return string_state::NORMAL == state;
+    });
+
+  auto num_tokens = tokens.size();
+  auto count_it   = thrust::make_counting_iterator(0);
+  auto predicate  = [tokens        = tokens.begin(),
+                    token_indices = token_indices.begin(),
+                    validate_values,
+                    validate_strings] __device__(auto i) -> bool {
+    if (tokens[i] == token_t::ValueEnd) {
+      return !validate_values(token_indices[i - 1], token_indices[i]);
+    } else if (tokens[i] == token_t::FieldNameEnd || tokens[i] == token_t::StringEnd) {
+      return !validate_strings(token_indices[i - 1], token_indices[i]);
+    }
+    return false;
+  };
+
+  using scan_type            = write_if::scan_type;
+  auto conditional_write     = write_if{tokens.begin(), num_tokens};
+  auto conditional_output_it = cudf::detail::make_tabulate_output_iterator(conditional_write);
+  auto transform_op          = cuda::proclaim_return_type<scan_type>(
+    [predicate, tokens = tokens.begin()] __device__(auto i) -> scan_type {
+      if (predicate(i)) return {token_t::ErrorBegin, tokens[i] == token_t::LineEnd};
+      return {static_cast<token_t>(tokens[i]), tokens[i] == token_t::LineEnd};
+    });
+  auto binary_op = cuda::proclaim_return_type<scan_type>(
+    [] __device__(scan_type prev, scan_type curr) -> scan_type {
+      auto op_result = (prev.first == token_t::ErrorBegin ? prev.first : curr.first);
+      return scan_type((curr.second ? curr.first : op_result), prev.second | curr.second);
+    });
+
+  thrust::transform_inclusive_scan(rmm::exec_policy(stream),
+                                   count_it,
+                                   count_it + num_tokens,
+                                   conditional_output_it,
+                                   transform_op,
+                                   binary_op);  // in-place scan
+}
+}  // namespace detail
+}  // namespace cudf::io::json
diff --git a/cpp/src/io/json/tabulate_output_iterator.cuh b/cpp/src/io/json/tabulate_output_iterator.cuh
new file mode 100644
index 00000000000..7cf3655e259
--- /dev/null
+++ b/cpp/src/io/json/tabulate_output_iterator.cuh
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/iterator_adaptor.h>
+
+namespace cudf {
+namespace detail {
+
+// Proxy reference that calls BinaryFunction with index value and the rhs of assignment operator
+template <typename BinaryFunction, typename IndexT>
+class tabulate_output_iterator_proxy {
+ public:
+  __host__ __device__ tabulate_output_iterator_proxy(const IndexT index, BinaryFunction fun)
+    : index(index), fun(fun)
+  {
+  }
+  template <typename T>
+  __host__ __device__ tabulate_output_iterator_proxy operator=(const T& rhs_value)
+  {
+    fun(index, rhs_value);
+    return *this;
+  }
+
+ private:
+  IndexT index;
+  BinaryFunction fun;
+};
+
+/**
+ * @brief Tabulate output iterator with custom binary function which takes index and value.
+ *
+ * @code {.cpp}
+ * #include "tabulate_output_iterator.cuh"
+ * #include <thrust/device_vector.h>
+ * #include <thrust/iterator/counting_iterator.h>
+ * #include <thrust/iterator/transform_iterator.h>
+ *
+ * struct set_bits_field {
+ *   int* bitfield;
+ *   __device__ inline void set_bit(size_t bit_index)
+ *   {
+ *     atomicOr(&bitfield[bit_index/32], (int{1} << (bit_index % 32)));
+ *   }
+ *   __device__ inline void clear_bit(size_t bit_index)
+ *   {
+ *     atomicAnd(&bitfield[bit_index / 32], ~(int{1} << (bit_index % 32)));
+ *   }
+ *   // Index, value
+ *   __device__ void operator()(size_t i, bool x)
+ *   {
+ *     if (x)
+ *       set_bit(i);
+ *     else
+ *       clear_bit(i);
+ *   }
+ * };
+ *
+ * thrust::device_vector<int> v(1, 0x00000000);
+ * auto result_begin = thrust::make_tabulate_output_iterator(set_bits_field{v.data().get()});
+ * auto value = thrust::make_transform_iterator(thrust::make_counting_iterator(0),
+ *   [] __device__ (int x) {   return x%2; });
+ * thrust::copy(thrust::device, value, value+32, result_begin);
+ * assert(v[0] == 0xaaaaaaaa);
+ * @endcode
+ *
+ *
+ * @tparam BinaryFunction Binary function to be called with the Iterator value and the rhs of
+ * assignment operator.
+ * @tparam Iterator iterator type that acts as index of the output.
+ */
+template <typename BinaryFunction, typename IndexT = ptrdiff_t>
+class tabulate_output_iterator
+  : public thrust::iterator_adaptor<tabulate_output_iterator<BinaryFunction, IndexT>,
+                                    thrust::counting_iterator<IndexT>,
+                                    thrust::use_default,
+                                    thrust::use_default,
+                                    thrust::use_default,
+                                    tabulate_output_iterator_proxy<BinaryFunction, IndexT>> {
+ public:
+  // parent class.
+  using super_t = thrust::iterator_adaptor<tabulate_output_iterator<BinaryFunction, IndexT>,
+                                           thrust::counting_iterator<IndexT>,
+                                           thrust::use_default,
+                                           thrust::use_default,
+                                           thrust::use_default,
+                                           tabulate_output_iterator_proxy<BinaryFunction, IndexT>>;
+  // friend thrust::iterator_core_access to allow it access to the private interface dereference()
+  friend class thrust::iterator_core_access;
+  __host__ __device__ tabulate_output_iterator(BinaryFunction fun) : fun(fun) {}
+
+ private:
+  BinaryFunction fun;
+
+  // thrust::iterator_core_access accesses this function
+  __host__ __device__ typename super_t::reference dereference() const
+  {
+    return tabulate_output_iterator_proxy<BinaryFunction, IndexT>(*this->base(), fun);
+  }
+};
+
+template <typename BinaryFunction>
+tabulate_output_iterator<BinaryFunction> __host__ __device__
+make_tabulate_output_iterator(BinaryFunction fun)
+{
+  return tabulate_output_iterator<BinaryFunction>(fun);
+}  // end make_tabulate_output_iterator
+
+}  // namespace detail
+}  // namespace cudf
+
+// Register tabulate_output_iterator_proxy with 'is_proxy_reference' from
+// type_traits to enable its use with algorithms.
+template <class BinaryFunction, class IndexT>
+struct thrust::detail::is_proxy_reference<
+  cudf::detail::tabulate_output_iterator_proxy<BinaryFunction, IndexT>>
+  : public thrust::detail::true_type {};
diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp
index c26e5ca3edb..960c19fce2e 100644
--- a/cpp/tests/io/json/json_test.cpp
+++ b/cpp/tests/io/json/json_test.cpp
@@ -2180,6 +2180,86 @@ TEST_F(JsonReaderTest, JSONLinesRecoveringSync)
   cudf::set_pinned_memory_resource(last_mr);
 }
 
+// Validation
+TEST_F(JsonReaderTest, ValueValidation)
+{
+  // parsing error as null rows
+  std::string data =
+    // 0 -> a: -2 (valid)
+    R"({"a":-2 }{})"
+    "\n"
+    // 1 -> (invalid)
+    R"({"b":{}should_be_invalid})"
+    "\n"
+    // 2 -> b (valid)
+    R"({"b":{"a":3} })"
+    "\n"
+    // 3 -> c: (valid/null based on option)
+    R"({"a": 1, "c":nan, "d": "null" } )"
+    "\n"
+    "\n"
+    // 4 -> (valid/null based on option)
+    R"({"a":04, "c": 1.23, "d": "abc"} 123)"
+    "\n"
+    // 5 -> (valid)
+    R"({"a":5}//Comment after record)"
+    "\n"
+    // 6 -> ((valid/null based on option)
+    R"({"a":06} //Comment after whitespace)"
+    "\n"
+    // 7 -> (invalid)
+    R"({"a":5 //Invalid Comment within record})";
+
+  // leadingZeros allowed
+  // na_values,
+  {
+    cudf::io::json_reader_options in_options =
+      cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()})
+        .lines(true)
+        .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL)
+        .strict_validation(true);
+    cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+
+    EXPECT_EQ(result.tbl->num_columns(), 4);
+    EXPECT_EQ(result.tbl->num_rows(), 8);
+    auto b_a_col  = int64_wrapper({0, 0, 3, 0, 0, 0, 0, 0});
+    auto a_column = int64_wrapper{{-2, 0, 0, 0, 4, 5, 6, 0},
+                                  {true, false, false, false, true, true, true, false}};
+    auto b_column = cudf::test::structs_column_wrapper(
+      {b_a_col}, {false, false, true, false, false, false, false, false});
+    auto c_column = float64_wrapper({0.0, 0.0, 0.0, 0.0, 1.23, 0.0, 0.0, 0.0},
+                                    {false, false, false, false, true, false, false, false});
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), a_column);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), b_column);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(2), c_column);
+  }
+  // leadingZeros not allowed, NaN allowed
+  {
+    cudf::io::json_reader_options in_options =
+      cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()})
+        .lines(true)
+        .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL)
+        .strict_validation(true)
+        .numeric_leading_zeros(false)
+        .na_values({"nan"});
+    cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+
+    EXPECT_EQ(result.tbl->num_columns(), 4);
+    EXPECT_EQ(result.tbl->num_rows(), 8);
+    EXPECT_EQ(result.tbl->get_column(2).type().id(), cudf::type_id::INT8);  // empty column
+    auto b_a_col  = int64_wrapper({0, 0, 3, 0, 0, 0, 0, 0});
+    auto a_column = int64_wrapper{{-2, 0, 0, 1, 4, 5, 6, 0},
+                                  {true, false, false, true, false, true, false, false}};
+    auto b_column = cudf::test::structs_column_wrapper(
+      {b_a_col}, {false, false, true, false, false, false, false, false});
+    auto c_column = int8_wrapper({0, 0, 0, 0, 0, 0, 0, 0},
+                                 {false, false, false, false, false, false, false, false});
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), a_column);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), b_column);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(2), c_column);
+  }
+}
+
 TEST_F(JsonReaderTest, MixedTypes)
 {
   using LCWS    = cudf::test::lists_column_wrapper<cudf::string_view>;
diff --git a/java/src/main/java/ai/rapids/cudf/JSONOptions.java b/java/src/main/java/ai/rapids/cudf/JSONOptions.java
index b37d0d88ec9..c8308ca17ec 100644
--- a/java/src/main/java/ai/rapids/cudf/JSONOptions.java
+++ b/java/src/main/java/ai/rapids/cudf/JSONOptions.java
@@ -34,6 +34,10 @@ public final class JSONOptions extends ColumnFilterOptions {
   private final boolean normalizeWhitespace;
   private final boolean mixedTypesAsStrings;
   private final boolean keepStringQuotes;
+  private final boolean strictValidation;
+  private final boolean allowLeadingZeros;
+  private final boolean allowNonNumericNumbers;
+  private final boolean allowUnquotedControlChars;
 
   private JSONOptions(Builder builder) {
     super(builder);
@@ -44,6 +48,10 @@ private JSONOptions(Builder builder) {
     normalizeWhitespace = builder.normalizeWhitespace;
     mixedTypesAsStrings = builder.mixedTypesAsStrings;
     keepStringQuotes = builder.keepQuotes;
+    strictValidation = builder.strictValidation;
+    allowLeadingZeros = builder.allowLeadingZeros;
+    allowNonNumericNumbers = builder.allowNonNumericNumbers;
+    allowUnquotedControlChars = builder.allowUnquotedControlChars;
   }
 
   public boolean isDayFirst() {
@@ -75,6 +83,22 @@ public boolean keepStringQuotes() {
     return keepStringQuotes;
   }
 
+  public boolean strictValidation() {
+    return strictValidation;
+  }
+
+  public boolean leadingZerosAllowed() {
+    return allowLeadingZeros;
+  }
+
+  public boolean nonNumericNumbersAllowed() {
+    return allowNonNumericNumbers;
+  }
+
+  public boolean unquotedControlChars() {
+    return allowUnquotedControlChars;
+  }
+
   @Override
   String[] getIncludeColumnNames() {
     throw new UnsupportedOperationException("JSON reader didn't support column prune");
@@ -85,6 +109,10 @@ public static Builder builder() {
   }
 
   public static final class Builder  extends ColumnFilterOptions.Builder<JSONOptions.Builder> {
+    private boolean strictValidation = false;
+    private boolean allowUnquotedControlChars = true;
+    private boolean allowNonNumericNumbers = false;
+    private boolean allowLeadingZeros = false;
     private boolean dayFirst = false;
     private boolean lines = true;
 
@@ -95,10 +123,45 @@ public static final class Builder  extends ColumnFilterOptions.Builder<JSONOptio
     private boolean mixedTypesAsStrings = false;
     private boolean keepQuotes = false;
 
+    /**
+     * Should json validation be strict or not
+     */
+    public Builder withStrictValidation(boolean isAllowed) {
+      strictValidation = isAllowed;
+      return this;
+    }
+
+    /**
+     * Should leading zeros on numbers be allowed or not. Strict validation
+     * must be enabled for this to have any effect.
+     */
+    public Builder withLeadingZeros(boolean isAllowed) {
+      allowLeadingZeros = isAllowed;
+      return this;
+    }
+
+    /**
+     * Should non-numeric numbers be allowed or not. Strict validation
+     * must be enabled for this to have any effect.
+     */
+    public Builder withNonNumericNumbers(boolean isAllowed) {
+      allowNonNumericNumbers = isAllowed;
+      return this;
+    }
+
+    /**
+     * Should unquoted control chars be allowed in strings. Strict validation
+     * must be enabled for this to have any effect.
+     */
+    public Builder withUnquotedControlChars(boolean isAllowed) {
+      allowUnquotedControlChars = isAllowed;
+      return this;
+    }
+
     /**
      * Whether to parse dates as DD/MM versus MM/DD
      * @param dayFirst true: DD/MM, false, MM/DD
-     * @return
+     * @return builder for chaining
      */
     public Builder withDayFirst(boolean dayFirst) {
       this.dayFirst = dayFirst;
diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java
index 36e342cae13..cbb126d7ee5 100644
--- a/java/src/main/java/ai/rapids/cudf/Table.java
+++ b/java/src/main/java/ai/rapids/cudf/Table.java
@@ -254,7 +254,11 @@ private static native long readJSON(int[] numChildren, String[] columnNames,
                                         boolean normalizeSingleQuotes,
                                         boolean normalizeWhitespace,
                                         boolean mixedTypesAsStrings,
-                                        boolean keepStringQuotes) throws CudfException;
+                                        boolean keepStringQuotes,
+                                        boolean strictValidation,
+                                        boolean allowLeadingZeros,
+                                        boolean allowNonNumericNumbers,
+                                        boolean allowUnquotedControl) throws CudfException;
 
   private static native long readJSONFromDataSource(int[] numChildren, String[] columnNames,
                                       int[] dTypeIds, int[] dTypeScales,
@@ -264,6 +268,10 @@ private static native long readJSONFromDataSource(int[] numChildren, String[] co
                                       boolean normalizeWhitespace,
                                       boolean mixedTypesAsStrings,
                                       boolean keepStringQuotes,
+                                      boolean strictValidation,
+                                      boolean allowLeadingZeros,
+                                      boolean allowNonNumericNumbers,
+                                      boolean allowUnquotedControl,
                                       long dsHandle) throws CudfException;
 
   private static native long readAndInferJSONFromDataSource(boolean dayFirst, boolean lines,
@@ -272,7 +280,12 @@ private static native long readAndInferJSONFromDataSource(boolean dayFirst, bool
                                       boolean normalizeWhitespace,
                                       boolean mixedTypesAsStrings,
                                       boolean keepStringQuotes,
+                                      boolean strictValidation,
+                                      boolean allowLeadingZeros,
+                                      boolean allowNonNumericNumbers,
+                                      boolean allowUnquotedControl,
                                       long dsHandle) throws CudfException;
+
   private static native long readAndInferJSON(long address, long length,
                                               boolean dayFirst,
                                               boolean lines,
@@ -280,7 +293,11 @@ private static native long readAndInferJSON(long address, long length,
                                               boolean normalizeSingleQuotes,
                                               boolean normalizeWhitespace,
                                               boolean mixedTypesAsStrings,
-                                              boolean keepStringQuotes) throws CudfException;
+                                              boolean keepStringQuotes,
+                                              boolean strictValidation,
+                                              boolean allowLeadingZeros,
+                                              boolean allowNonNumericNumbers,
+                                              boolean allowUnquotedControl) throws CudfException;
 
   /**
    * Read in Parquet formatted data.
@@ -1292,7 +1309,11 @@ public static Table readJSON(Schema schema, JSONOptions opts, File path) {
                     opts.isNormalizeSingleQuotes(),
                     opts.isNormalizeWhitespace(),
                     opts.isMixedTypesAsStrings(),
-                opts.keepStringQuotes()))) {
+                    opts.keepStringQuotes(),
+                    opts.strictValidation(),
+                    opts.leadingZerosAllowed(),
+                    opts.nonNumericNumbersAllowed(),
+                    opts.unquotedControlChars()))) {
 
       return gatherJSONColumns(schema, twm, -1);
     }
@@ -1370,7 +1391,12 @@ public static TableWithMeta readJSON(JSONOptions opts, HostMemoryBuffer buffer,
         opts.isDayFirst(), opts.isLines(), opts.isRecoverWithNull(),
         opts.isNormalizeSingleQuotes(),
         opts.isNormalizeWhitespace(),
-        opts.isMixedTypesAsStrings(), opts.keepStringQuotes()));
+        opts.isMixedTypesAsStrings(),
+        opts.keepStringQuotes(),
+        opts.strictValidation(),
+        opts.leadingZerosAllowed(),
+        opts.nonNumericNumbersAllowed(),
+        opts.unquotedControlChars()));
   }
 
   /**
@@ -1388,6 +1414,10 @@ public static TableWithMeta readAndInferJSON(JSONOptions opts, DataSource ds) {
           opts.isNormalizeWhitespace(),
           opts.isMixedTypesAsStrings(),
           opts.keepStringQuotes(),
+          opts.strictValidation(),
+          opts.leadingZerosAllowed(),
+          opts.nonNumericNumbersAllowed(),
+          opts.unquotedControlChars(),
           dsHandle));
         return twm;
       } finally {
@@ -1430,10 +1460,18 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b
     try (TableWithMeta twm = new TableWithMeta(readJSON(
             schema.getFlattenedNumChildren(), schema.getFlattenedColumnNames(),
             schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), null,
-            buffer.getAddress() + offset, len, opts.isDayFirst(), opts.isLines(),
-            opts.isRecoverWithNull(), opts.isNormalizeSingleQuotes(),
+            buffer.getAddress() + offset, len,
+            opts.isDayFirst(),
+            opts.isLines(),
+            opts.isRecoverWithNull(),
+            opts.isNormalizeSingleQuotes(),
             opts.isNormalizeWhitespace(),
-            opts.isMixedTypesAsStrings(), opts.keepStringQuotes()))) {
+            opts.isMixedTypesAsStrings(),
+            opts.keepStringQuotes(),
+            opts.strictValidation(),
+            opts.leadingZerosAllowed(),
+            opts.nonNumericNumbersAllowed(),
+            opts.unquotedControlChars()))) {
       return gatherJSONColumns(schema, twm, emptyRowCount);
     }
   }
@@ -1454,17 +1492,26 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds) {
    * @param schema the schema of the data. You may use Schema.INFERRED to infer the schema.
    * @param opts various JSON parsing options.
    * @param ds the DataSource to read from.
-   * @param emtpyRowCount the number of rows to return if no columns were read.
+   * @param emptyRowCount the number of rows to return if no columns were read.
    * @return the data parsed as a table on the GPU.
    */
-  public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds, int emtpyRowCount) {
+  public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds, int emptyRowCount) {
     long dsHandle = DataSourceHelper.createWrapperDataSource(ds);
     try (TableWithMeta twm = new TableWithMeta(readJSONFromDataSource(schema.getFlattenedNumChildren(),
-        schema.getFlattenedColumnNames(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), opts.isDayFirst(),
-        opts.isLines(), opts.isRecoverWithNull(), opts.isNormalizeSingleQuotes(),
+        schema.getFlattenedColumnNames(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(),
+        opts.isDayFirst(),
+        opts.isLines(),
+        opts.isRecoverWithNull(),
+        opts.isNormalizeSingleQuotes(),
         opts.isNormalizeWhitespace(),
-        opts.isMixedTypesAsStrings(), opts.keepStringQuotes(), dsHandle))) {
-      return gatherJSONColumns(schema, twm, emtpyRowCount);
+        opts.isMixedTypesAsStrings(),
+        opts.keepStringQuotes(),
+        opts.strictValidation(),
+        opts.leadingZerosAllowed(),
+        opts.nonNumericNumbersAllowed(),
+        opts.unquotedControlChars(),
+        dsHandle))) {
+      return gatherJSONColumns(schema, twm, emptyRowCount);
     } finally {
       DataSourceHelper.destroyWrapperDataSource(dsHandle);
     }
diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
index c5abf08a59d..40a111209b0 100644
--- a/java/src/main/native/src/TableJni.cpp
+++ b/java/src/main/native/src/TableJni.cpp
@@ -1623,6 +1623,10 @@ Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource(JNIEnv* env,
                                                          jboolean normalize_whitespace,
                                                          jboolean mixed_types_as_string,
                                                          jboolean keep_quotes,
+                                                         jboolean strict_validation,
+                                                         jboolean allow_leading_zeros,
+                                                         jboolean allow_nonnumeric_numbers,
+                                                         jboolean allow_unquoted_control,
                                                          jlong ds_handle)
 {
   JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0);
@@ -1642,8 +1646,13 @@ Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource(JNIEnv* env,
         .normalize_single_quotes(static_cast<bool>(normalize_single_quotes))
         .normalize_whitespace(static_cast<bool>(normalize_whitespace))
         .mixed_types_as_string(mixed_types_as_string)
+        .strict_validation(strict_validation)
         .keep_quotes(keep_quotes);
-
+    if (strict_validation) {
+      opts.numeric_leading_zeros(allow_leading_zeros)
+        .nonnumeric_numbers(allow_nonnumeric_numbers)
+        .unquoted_control_chars(allow_unquoted_control);
+    }
     auto result =
       std::make_unique<cudf::io::table_with_metadata>(cudf::io::read_json(opts.build()));
 
@@ -1652,17 +1661,22 @@ Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource(JNIEnv* env,
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env,
-                                                                   jclass,
-                                                                   jlong buffer,
-                                                                   jlong buffer_length,
-                                                                   jboolean day_first,
-                                                                   jboolean lines,
-                                                                   jboolean recover_with_null,
-                                                                   jboolean normalize_single_quotes,
-                                                                   jboolean normalize_whitespace,
-                                                                   jboolean mixed_types_as_string,
-                                                                   jboolean keep_quotes)
+JNIEXPORT jlong JNICALL
+Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env,
+                                           jclass,
+                                           jlong buffer,
+                                           jlong buffer_length,
+                                           jboolean day_first,
+                                           jboolean lines,
+                                           jboolean recover_with_null,
+                                           jboolean normalize_single_quotes,
+                                           jboolean normalize_whitespace,
+                                           jboolean mixed_types_as_string,
+                                           jboolean keep_quotes,
+                                           jboolean strict_validation,
+                                           jboolean allow_leading_zeros,
+                                           jboolean allow_nonnumeric_numbers,
+                                           jboolean allow_unquoted_control)
 {
   JNI_NULL_CHECK(env, buffer, "buffer cannot be null", 0);
   if (buffer_length <= 0) {
@@ -1684,8 +1698,14 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env,
         .recovery_mode(recovery_mode)
         .normalize_single_quotes(static_cast<bool>(normalize_single_quotes))
         .normalize_whitespace(static_cast<bool>(normalize_whitespace))
+        .strict_validation(strict_validation)
         .mixed_types_as_string(mixed_types_as_string)
         .keep_quotes(keep_quotes);
+    if (strict_validation) {
+      opts.numeric_leading_zeros(allow_leading_zeros)
+        .nonnumeric_numbers(allow_nonnumeric_numbers)
+        .unquoted_control_chars(allow_unquoted_control);
+    }
 
     auto result =
       std::make_unique<cudf::io::table_with_metadata>(cudf::io::read_json(opts.build()));
@@ -1790,6 +1810,10 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env,
                                                  jboolean normalize_whitespace,
                                                  jboolean mixed_types_as_string,
                                                  jboolean keep_quotes,
+                                                 jboolean strict_validation,
+                                                 jboolean allow_leading_zeros,
+                                                 jboolean allow_nonnumeric_numbers,
+                                                 jboolean allow_unquoted_control,
                                                  jlong ds_handle)
 {
   JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0);
@@ -1824,7 +1848,13 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env,
         .normalize_single_quotes(static_cast<bool>(normalize_single_quotes))
         .normalize_whitespace(static_cast<bool>(normalize_whitespace))
         .mixed_types_as_string(mixed_types_as_string)
+        .strict_validation(strict_validation)
         .keep_quotes(keep_quotes);
+    if (strict_validation) {
+      opts.numeric_leading_zeros(allow_leading_zeros)
+        .nonnumeric_numbers(allow_nonnumeric_numbers)
+        .unquoted_control_chars(allow_unquoted_control);
+    }
 
     if (!n_types.is_null()) {
       if (n_types.size() != n_scales.size()) {
@@ -1874,7 +1904,11 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env,
                                                            jboolean normalize_single_quotes,
                                                            jboolean normalize_whitespace,
                                                            jboolean mixed_types_as_string,
-                                                           jboolean keep_quotes)
+                                                           jboolean keep_quotes,
+                                                           jboolean strict_validation,
+                                                           jboolean allow_leading_zeros,
+                                                           jboolean allow_nonnumeric_numbers,
+                                                           jboolean allow_unquoted_control)
 {
   bool read_buffer = true;
   if (buffer == 0) {
@@ -1923,7 +1957,13 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env,
         .normalize_single_quotes(static_cast<bool>(normalize_single_quotes))
         .normalize_whitespace(static_cast<bool>(normalize_whitespace))
         .mixed_types_as_string(mixed_types_as_string)
+        .strict_validation(strict_validation)
         .keep_quotes(keep_quotes);
+    if (strict_validation) {
+      opts.numeric_leading_zeros(allow_leading_zeros)
+        .nonnumeric_numbers(allow_nonnumeric_numbers)
+        .unquoted_control_chars(allow_unquoted_control);
+    }
 
     if (!n_types.is_null()) {
       if (n_types.size() != n_scales.size()) {
diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java
index 050bcbb268f..56fe63598d9 100644
--- a/java/src/test/java/ai/rapids/cudf/TableTest.java
+++ b/java/src/test/java/ai/rapids/cudf/TableTest.java
@@ -437,6 +437,7 @@ void testReadWhitespacesJSONFile() throws IOException {
     }
   }
 
+  @Test
   void testReadSingleQuotesJSONFileKeepQuotes() throws IOException {
     Schema schema = Schema.builder()
         .column(DType.STRING, "A")
@@ -455,6 +456,206 @@ void testReadSingleQuotesJSONFileKeepQuotes() throws IOException {
     }
   }
 
+  private static final byte[] JSON_VALIDATION_BUFFER = (
+      "{\"a\":true}\n" +
+      "{\"a\":false}\n" +
+      "{\"a\":null}\n" +
+      "{\"a\":true, \"b\":truee}\n" +
+      "{\"a\":true, \"b\":\"nulll\"}\n" +
+      "{\"a\": 1}\n" +
+      "{\"a\": 0}\n" +
+      "{\"a\": -}\n" +
+      "{\"a\": -0}\n" +
+      "{\"a\": -01}\n" +
+
+      "{\"a\": 01}\n" +
+      "{\"a\": -0.1}\n" +
+      "{\"a\": -00.1}\n" +
+      "{\"a\": NaN}\n" +
+      "{\"a\": INF}\n" +
+      "{\"a\": +INF}\n" +
+      "{\"a\": -INF}\n" +
+      "{\"a\": +Infinity}\n" +
+      "{\"a\": Infinity}\n" +
+      "{\"a\": -Infinity}\n" +
+
+      "{\"a\": INFinity}\n" +
+      "{\"a\":\"3710-11-10T02:46:58.732Z\"}\n" +
+      "{\"a\":12.}\n" +
+      "{\"a\": -3.4e+38}\n" +
+      "{\"a\": -3.4e-38}\n" +
+      "{\"a\": 1.4e38}\n" +
+      "{\"a\": -3.4E+38}\n" +
+      "{\"a\": -3.4E-38}\n" +
+      "{\"a\": 1.4E38}\n" +
+      "{\"a\": -3.4E+}\n" +
+
+      "{\"a\": -3.4E-}\n" +
+      "{\"a\": \"A\u0000B\"}\n" +
+      "{\"a\": \"A\\u0000B\"}\n" +
+      "{\"a\": \"A\u0001B\"}\n" +
+      "{\"a\": \"A\\u0001B\"}\n" +
+      "{\"a\": \"A\u001FB\"}\n" +
+      "{\"a\": \"A\\u001FB\"}\n" +
+      "{\"a\": \"A\u0020B\"}\n" +
+      "{\"a\": \"A\\u0020B\"}\n" +
+      "{\"a\": \"\\u12\"}\n" +
+
+      "{\"a\": \"\\z\"}\n" +
+      "{\"a\": \"\\r\"}\n" +
+      "{\"a\": \"something\", \"b\": \"\\z\"}\n"
+  ).getBytes(StandardCharsets.UTF_8);
+
+  @Test
+  void testJSONValidationNoStrict() {
+    Schema schema = Schema.builder()
+        .column(DType.STRING, "a")
+        .build();
+    JSONOptions opts = JSONOptions.builder()
+        .withRecoverWithNull(true)
+        .withMixedTypesAsStrings(true)
+        .withNormalizeWhitespace(true)
+        .withKeepQuotes(true)
+        .withNormalizeSingleQuotes(true)
+        .withStrictValidation(false)
+        .withLeadingZeros(false)
+        .withNonNumericNumbers(false)
+        .withUnquotedControlChars(true)
+        .build();
+    try (Table expected = new Table.TestBuilder()
+        .column(
+            "true", "false", null, "true", "true", "1", "0", "-", "-0", "-01",
+            "01", "-0.1", "-00.1", "NaN", "INF", "+INF", "-INF", "+Infinity", "Infinity", "-Infinity",
+            "INFinity", "\"3710-11-10T02:46:58.732Z\"", "12.", "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", "-3.4E+",
+            "-3.4E-", "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"", null,
+            null, "\"\r\"", "\"something\"")
+        .build();
+         MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER);
+         Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) {
+      assertTablesAreEqual(expected, table);
+    }
+  }
+
+  @Test
+  void testJSONValidation() {
+    Schema schema = Schema.builder()
+        .column(DType.STRING, "a")
+        .build();
+    JSONOptions opts = JSONOptions.builder()
+        .withRecoverWithNull(true)
+        .withMixedTypesAsStrings(true)
+        .withNormalizeWhitespace(true)
+        .withKeepQuotes(true)
+        .withNormalizeSingleQuotes(true)
+        .withStrictValidation(true)
+        .withLeadingZeros(false)
+        .withNonNumericNumbers(false)
+        .withUnquotedControlChars(true)
+        .build();
+    try (Table expected = new Table.TestBuilder()
+        .column(
+            "true", "false", null, null, "true", "1", "0", null, "-0", null,
+            null, "-0.1", null, null, null, null, null, null, null, null,
+            null, "\"3710-11-10T02:46:58.732Z\"", null, "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", null,
+            null, "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"", null,
+            null, "\"\r\"", null)
+        .build();
+         MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER);
+         Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) {
+      assertTablesAreEqual(expected, table);
+    }
+  }
+
+  @Test
+  void testJSONValidationLeadingZeros() {
+    Schema schema = Schema.builder()
+        .column(DType.STRING, "a")
+        .build();
+    JSONOptions opts = JSONOptions.builder()
+        .withRecoverWithNull(true)
+        .withMixedTypesAsStrings(true)
+        .withNormalizeWhitespace(true)
+        .withKeepQuotes(true)
+        .withNormalizeSingleQuotes(true)
+        .withStrictValidation(true)
+        .withLeadingZeros(true)
+        .withNonNumericNumbers(false)
+        .withUnquotedControlChars(true)
+        .build();
+    try (Table expected = new Table.TestBuilder()
+        .column(
+            "true", "false", null, null, "true", "1", "0", null, "-0", "-01",
+            "01", "-0.1", "-00.1", null, null, null, null, null, null, null,
+            null, "\"3710-11-10T02:46:58.732Z\"", null, "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", null,
+            null, "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"", null,
+            null, "\"\r\"", null)
+        .build();
+         MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER);
+         Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) {
+      assertTablesAreEqual(expected, table);
+    }
+  }
+
+  @Test
+  void testJSONValidationNonNumeric() {
+    Schema schema = Schema.builder()
+        .column(DType.STRING, "a")
+        .build();
+    JSONOptions opts = JSONOptions.builder()
+        .withRecoverWithNull(true)
+        .withMixedTypesAsStrings(true)
+        .withNormalizeWhitespace(true)
+        .withKeepQuotes(true)
+        .withNormalizeSingleQuotes(true)
+        .withStrictValidation(true)
+        .withLeadingZeros(false)
+        .withNonNumericNumbers(true)
+        .withUnquotedControlChars(true)
+        .build();
+    try (Table expected = new Table.TestBuilder()
+        .column(
+            "true", "false", null, null, "true", "1", "0", null, "-0", null,
+            null, "-0.1", null, "NaN", null, "+INF", "-INF", "+Infinity", "Infinity", "-Infinity",
+            null, "\"3710-11-10T02:46:58.732Z\"", null, "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", null,
+            null, "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"", null,
+            null, "\"\r\"", null)
+        .build();
+         MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER);
+         Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) {
+      assertTablesAreEqual(expected, table);
+    }
+  }
+
+  @Test
+  void testJSONValidationUnquotedControl() {
+    Schema schema = Schema.builder()
+        .column(DType.STRING, "a")
+        .build();
+    JSONOptions opts = JSONOptions.builder()
+        .withRecoverWithNull(true)
+        .withMixedTypesAsStrings(true)
+        .withNormalizeWhitespace(true)
+        .withKeepQuotes(true)
+        .withNormalizeSingleQuotes(true)
+        .withStrictValidation(true)
+        .withLeadingZeros(false)
+        .withNonNumericNumbers(false)
+        .withUnquotedControlChars(false)
+        .build();
+    try (Table expected = new Table.TestBuilder()
+        .column(
+            "true", "false", null, null, "true", "1", "0", null, "-0", null,
+            null, "-0.1", null, null, null, null, null, null, null, null,
+            null, "\"3710-11-10T02:46:58.732Z\"", null, "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", null,
+            null, null, "\"A\u0000B\"", null, "\"A\u0001B\"", null, "\"A\u001FB\"", "\"A B\"", "\"A B\"", null,
+            null, "\"\r\"", null)
+        .build();
+         MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER);
+         Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) {
+      assertTablesAreEqual(expected, table);
+    }
+  }
+
   private static final byte[] NESTED_JSON_DATA_BUFFER = ("{\"a\":{\"c\":\"C1\"}}\n" +
       "{\"a\":{\"c\":\"C2\", \"b\":\"B2\"}}\n" +
       "{\"d\":[1,2,3]}\n" +

From 985f671e1308c97de992887f3bccedced494fa44 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Wed, 11 Sep 2024 13:03:19 -0400
Subject: [PATCH 802/842] Fix slice_strings wide strings logic with multi-byte
 characters (#16777)

Fixes logic error in computing character and byte counts for slice positions in strings with specific pattern of multi-byte characters.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Jason Lowe (https://github.com/jlowe)
  - Zach Puller (https://github.com/zpuller)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: https://github.com/rapidsai/cudf/pull/16777
---
 cpp/src/strings/slice.cu          |  8 +++++---
 cpp/tests/strings/slice_tests.cpp | 19 +++++++++++++++++++
 2 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/cpp/src/strings/slice.cu b/cpp/src/strings/slice.cu
index 978a844c476..4c39fc96397 100644
--- a/cpp/src/strings/slice.cu
+++ b/cpp/src/strings/slice.cu
@@ -122,26 +122,28 @@ CUDF_KERNEL void substring_from_kernel(column_device_view const d_strings,
       break;
     }
     size_type const cc = (itr < end) && is_begin_utf8_char(*itr);
-    size_type const bc = (itr < end);
+    size_type const bc = (itr < end) ? bytes_in_utf8_byte(*itr) : 0;
     char_count += cg::reduce(warp, cc, cg::plus<int>());
     byte_count += cg::reduce(warp, bc, cg::plus<int>());
     itr += cudf::detail::warp_size;
   }
 
+  __syncwarp();
+
   if (warp.thread_rank() == 0) {
     if (start >= char_count) {
       d_output[str_idx] = string_index_pair{"", 0};
       return;
     }
 
-    // we are just below start/stop and must now increment up to it from here
+    // we are just below start/stop and must now increment up to them from here
     auto first_byte = start_counts.second;
     if (start_counts.first < start) {
       auto const sub_str = string_view(d_str.data() + first_byte, d_str.size_bytes() - first_byte);
       first_byte += std::get<0>(bytes_to_character_position(sub_str, start - start_counts.first));
     }
 
-    stop           = max(stop, char_count);
+    stop           = min(stop, char_count);
     auto last_byte = stop_counts.second;
     if (stop_counts.first < stop) {
       auto const sub_str = string_view(d_str.data() + last_byte, d_str.size_bytes() - last_byte);
diff --git a/cpp/tests/strings/slice_tests.cpp b/cpp/tests/strings/slice_tests.cpp
index 52e439bd93f..7f7fd9d521b 100644
--- a/cpp/tests/strings/slice_tests.cpp
+++ b/cpp/tests/strings/slice_tests.cpp
@@ -268,6 +268,25 @@ TEST_F(StringsSliceTest, MaxPositions)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
 }
 
+TEST_F(StringsSliceTest, MultiByteChars)
+{
+  auto input = cudf::test::strings_column_wrapper({
+    // clang-format off
+    "quick brown fox jumped over the lazy brown dog; the fat cats jump in place without moving "
+    "the following code snippet demonstrates how to use search for values in an ordered range  "
+            // this placement tests proper multi-byte chars handling  ------vvvvv
+    "it returns the last position where value could be inserted without the ééééé ordering ",
+    "algorithms execution is parallelized as determined by an execution policy; this is a 12345"
+    "continuation of previous row to make sure string boundaries are honored 012345678901234567"
+           //   v--- this one also
+    "01234567890é34567890012345678901234567890"
+    // clang-format on
+  });
+
+  auto results = cudf::strings::slice_strings(cudf::strings_column_view(input), 0);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, input);
+}
+
 TEST_F(StringsSliceTest, Error)
 {
   cudf::test::strings_column_wrapper strings{"this string intentionally left blank"};

From 0b32f55b1ed38507437770d21da1e4e1a1c4a17d Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Wed, 11 Sep 2024 13:33:37 -0400
Subject: [PATCH 803/842] Fix nvbench output for sha512 (#16773)

Fixes the `sha512` output for nvbench for `GlobalMem BW`.
Previously:
```
|    65536 |     0 | sha512 |   1216x | 417.898 us |  1.40% | 412.669 us | 0.61% |     24.139 GB/s |           3.14% |
| 16777216 |     0 | sha512 |     11x |  71.392 ms |  0.03% |  71.387 ms | 0.03% | 258404.649 PB/s | 33642233417.78% |
|    65536 |   0.1 | sha512 |   1184x | 433.031 us |  1.58% | 427.815 us | 1.01% |     22.919 GB/s |           2.98% |
| 16777216 |   0.1 | sha512 |     11x |  73.457 ms |  0.03% |  73.452 ms | 0.03% | 251140.174 PB/s | 32696456458.71% |
```
Fixed integer overflow calculation:
```
|    65536 |     0 | sha512 |   1200x | 423.838 us |  1.42% | 418.561 us | 0.66% |  23.799 GB/s |  3.10% |
| 16777216 |     0 | sha512 |     11x |  72.773 ms |  0.11% |  72.767 ms | 0.11% |  35.041 GB/s |  4.56% |
|    65536 |   0.1 | sha512 |   1168x | 439.078 us |  1.60% | 433.843 us | 1.05% |  22.601 GB/s |  2.94% |
| 16777216 |   0.1 | sha512 |     19x |  75.108 ms |  0.49% |  75.102 ms | 0.49% |  33.412 GB/s |  4.35% |
```

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Karthikeyan (https://github.com/karthikeyann)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/16773
---
 cpp/benchmarks/hashing/hash.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/cpp/benchmarks/hashing/hash.cpp b/cpp/benchmarks/hashing/hash.cpp
index 61e79a47a50..e4ff0c8c4a7 100644
--- a/cpp/benchmarks/hashing/hash.cpp
+++ b/cpp/benchmarks/hashing/hash.cpp
@@ -50,7 +50,7 @@ static void bench_hash(nvbench::state& state)
   state.add_global_memory_reads<nvbench::int64_t>(num_rows);
   // add memory read from bitmaks
   if (!no_nulls) {
-    state.add_global_memory_reads<nvbench::int8_t>(2 *
+    state.add_global_memory_reads<nvbench::int8_t>(2L *
                                                    cudf::bitmask_allocation_size_bytes(num_rows));
   }
   // memory written depends on used hash
@@ -63,37 +63,37 @@ static void bench_hash(nvbench::state& state)
     });
   } else if (hash_name == "md5") {
     // md5 creates a 32-byte string
-    state.add_global_memory_writes<nvbench::int8_t>(32 * num_rows);
+    state.add_global_memory_writes<nvbench::int8_t>(32L * num_rows);
 
     state.exec(nvbench::exec_tag::sync,
                [&](nvbench::launch& launch) { auto result = cudf::hashing::md5(data->view()); });
   } else if (hash_name == "sha1") {
     // sha1 creates a 40-byte string
-    state.add_global_memory_writes<nvbench::int8_t>(40 * num_rows);
+    state.add_global_memory_writes<nvbench::int8_t>(40L * num_rows);
 
     state.exec(nvbench::exec_tag::sync,
                [&](nvbench::launch& launch) { auto result = cudf::hashing::sha1(data->view()); });
   } else if (hash_name == "sha224") {
     // sha224 creates a 56-byte string
-    state.add_global_memory_writes<nvbench::int8_t>(56 * num_rows);
+    state.add_global_memory_writes<nvbench::int8_t>(56L * num_rows);
 
     state.exec(nvbench::exec_tag::sync,
                [&](nvbench::launch& launch) { auto result = cudf::hashing::sha224(data->view()); });
   } else if (hash_name == "sha256") {
     // sha256 creates a 64-byte string
-    state.add_global_memory_writes<nvbench::int8_t>(64 * num_rows);
+    state.add_global_memory_writes<nvbench::int8_t>(64L * num_rows);
 
     state.exec(nvbench::exec_tag::sync,
                [&](nvbench::launch& launch) { auto result = cudf::hashing::sha256(data->view()); });
   } else if (hash_name == "sha384") {
     // sha384 creates a 96-byte string
-    state.add_global_memory_writes<nvbench::int8_t>(96 * num_rows);
+    state.add_global_memory_writes<nvbench::int8_t>(96L * num_rows);
 
     state.exec(nvbench::exec_tag::sync,
                [&](nvbench::launch& launch) { auto result = cudf::hashing::sha384(data->view()); });
   } else if (hash_name == "sha512") {
     // sha512 creates a 128-byte string
-    state.add_global_memory_writes<nvbench::int8_t>(128 * num_rows);
+    state.add_global_memory_writes<nvbench::int8_t>(128L * num_rows);
 
     state.exec(nvbench::exec_tag::sync,
                [&](nvbench::launch& launch) { auto result = cudf::hashing::sha512(data->view()); });

From e063baa7a447a8273c213c6fbef2ffc93a95ff99 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Wed, 11 Sep 2024 15:14:26 -0700
Subject: [PATCH 804/842] Support reading multiple PQ sources with mismatching
 nullability for columns (#16639)

Related to #12702.

This PR adds support of reading multiple Parquet files with mismatched nullability for input columns. i.e. A column may not be nullable in one input file and nullable in another file.

Authors:
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Matthew Roeschke (https://github.com/mroeschke)
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Karthikeyan (https://github.com/karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/16639
---
 cpp/src/io/parquet/page_decode.cuh           |   2 +-
 cpp/src/io/parquet/parquet.hpp               |   7 +-
 cpp/src/io/parquet/parquet_gpu.hpp           |   7 +-
 cpp/src/io/parquet/reader_impl_chunking.cu   |  18 +-
 cpp/src/io/parquet/reader_impl_helpers.cpp   | 120 ++++++---
 cpp/src/io/parquet/reader_impl_helpers.hpp   |  27 +-
 cpp/src/io/parquet/reader_impl_preprocess.cu | 104 +++++---
 python/cudf/cudf/tests/test_parquet.py       | 254 ++++++++++++++++---
 8 files changed, 418 insertions(+), 121 deletions(-)

diff --git a/cpp/src/io/parquet/page_decode.cuh b/cpp/src/io/parquet/page_decode.cuh
index a3f91f6859b..9ed2929a70e 100644
--- a/cpp/src/io/parquet/page_decode.cuh
+++ b/cpp/src/io/parquet/page_decode.cuh
@@ -893,7 +893,7 @@ __device__ void gpuDecodeLevels(page_state_s* s,
 {
   bool has_repetition = s->col.max_level[level_type::REPETITION] > 0;
 
-  constexpr int batch_size = 32;
+  constexpr int batch_size = cudf::detail::warp_size;
   int cur_leaf_count       = target_leaf_count;
   while (s->error == 0 && s->nz_count < target_leaf_count &&
          s->input_value_count < s->num_input_values) {
diff --git a/cpp/src/io/parquet/parquet.hpp b/cpp/src/io/parquet/parquet.hpp
index 5d10472b0ae..7c985643887 100644
--- a/cpp/src/io/parquet/parquet.hpp
+++ b/cpp/src/io/parquet/parquet.hpp
@@ -203,10 +203,9 @@ struct SchemaElement {
   bool operator==(SchemaElement const& other) const
   {
     return type == other.type && converted_type == other.converted_type &&
-           type_length == other.type_length && repetition_type == other.repetition_type &&
-           name == other.name && num_children == other.num_children &&
-           decimal_scale == other.decimal_scale && decimal_precision == other.decimal_precision &&
-           field_id == other.field_id;
+           type_length == other.type_length && name == other.name &&
+           num_children == other.num_children && decimal_scale == other.decimal_scale &&
+           decimal_precision == other.decimal_precision && field_id == other.field_id;
   }
 
   // the parquet format is a little squishy when it comes to interpreting
diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp
index 125d35f6499..1390339c1ae 100644
--- a/cpp/src/io/parquet/parquet_gpu.hpp
+++ b/cpp/src/io/parquet/parquet_gpu.hpp
@@ -400,7 +400,8 @@ struct ColumnChunkDesc {
                            int32_t src_col_schema_,
                            column_chunk_info const* chunk_info_,
                            float list_bytes_per_row_est_,
-                           bool strings_to_categorical_)
+                           bool strings_to_categorical_,
+                           int32_t src_file_idx_)
     : compressed_data(compressed_data_),
       compressed_size(compressed_size_),
       num_values(num_values_),
@@ -419,7 +420,8 @@ struct ColumnChunkDesc {
       src_col_schema(src_col_schema_),
       h_chunk_info(chunk_info_),
       list_bytes_per_row_est(list_bytes_per_row_est_),
-      is_strings_to_cat(strings_to_categorical_)
+      is_strings_to_cat(strings_to_categorical_),
+      src_file_idx(src_file_idx_)
 
   {
   }
@@ -456,6 +458,7 @@ struct ColumnChunkDesc {
 
   bool is_strings_to_cat{};    // convert strings to hashes
   bool is_large_string_col{};  // `true` if string data uses 64-bit offsets
+  int32_t src_file_idx{};      // source file index
 };
 
 /**
diff --git a/cpp/src/io/parquet/reader_impl_chunking.cu b/cpp/src/io/parquet/reader_impl_chunking.cu
index 245e1829c72..c588fedb85c 100644
--- a/cpp/src/io/parquet/reader_impl_chunking.cu
+++ b/cpp/src/io/parquet/reader_impl_chunking.cu
@@ -1511,10 +1511,13 @@ void reader::impl::create_global_chunk_info()
     std::transform(
       _input_columns.begin(), _input_columns.end(), column_mapping.begin(), [&](auto const& col) {
         // translate schema_idx into something we can use for the page indexes
-        if (auto it = std::find_if(
-              columns.begin(),
-              columns.end(),
-              [&col](auto const& col_chunk) { return col_chunk.schema_idx == col.schema_idx; });
+        if (auto it = std::find_if(columns.begin(),
+                                   columns.end(),
+                                   [&](auto const& col_chunk) {
+                                     return col_chunk.schema_idx ==
+                                            _metadata->map_schema_index(col.schema_idx,
+                                                                        rg.source_index);
+                                   });
             it != columns.end()) {
           return std::distance(columns.begin(), it);
         }
@@ -1535,7 +1538,8 @@ void reader::impl::create_global_chunk_info()
       auto col = _input_columns[i];
       // look up metadata
       auto& col_meta = _metadata->get_column_metadata(rg.index, rg.source_index, col.schema_idx);
-      auto& schema   = _metadata->get_schema(col.schema_idx);
+      auto& schema   = _metadata->get_schema(
+        _metadata->map_schema_index(col.schema_idx, rg.source_index), rg.source_index);
 
       auto [clock_rate, logical_type] =
         conversion_info(to_type_id(schema, _strings_to_categorical, _options.timestamp_type.id()),
@@ -1574,9 +1578,9 @@ void reader::impl::create_global_chunk_info()
                                        col.schema_idx,
                                        chunk_info,
                                        list_bytes_per_row_est,
-                                       schema.type == BYTE_ARRAY and _strings_to_categorical));
+                                       schema.type == BYTE_ARRAY and _strings_to_categorical,
+                                       rg.source_index));
     }
-
     // Adjust for skip_rows when updating the remaining rows after the first group
     remaining_rows -=
       (skip_rows) ? std::min<int>(rg.start_row + row_group.num_rows - skip_rows, remaining_rows)
diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp
index 8b5678f202b..6d566b5815e 100644
--- a/cpp/src/io/parquet/reader_impl_helpers.cpp
+++ b/cpp/src/io/parquet/reader_impl_helpers.cpp
@@ -423,8 +423,13 @@ void aggregate_reader_metadata::column_info_for_row_group(row_group_info& rg_inf
   std::vector<column_chunk_info> chunks(rg.columns.size());
 
   for (size_t col_idx = 0; col_idx < rg.columns.size(); col_idx++) {
-    auto const& col_chunk    = rg.columns[col_idx];
-    auto& schema             = get_schema(col_chunk.schema_idx);
+    auto const& col_chunk = rg.columns[col_idx];
+    auto const is_schema_idx_mapped =
+      is_schema_index_mapped(col_chunk.schema_idx, rg_info.source_index);
+    auto const mapped_schema_idx = is_schema_idx_mapped
+                                     ? map_schema_index(col_chunk.schema_idx, rg_info.source_index)
+                                     : col_chunk.schema_idx;
+    auto& schema = get_schema(mapped_schema_idx, is_schema_idx_mapped ? rg_info.source_index : 0);
     auto const max_def_level = schema.max_definition_level;
     auto const max_rep_level = schema.max_repetition_level;
 
@@ -559,22 +564,40 @@ aggregate_reader_metadata::aggregate_reader_metadata(
     num_rows(calc_num_rows()),
     num_row_groups(calc_num_row_groups())
 {
-  // Validate that all sources have the same schema unless we are reading select columns
-  // from mismatched sources, in which case, we will only check the projected columns later.
-  if (per_file_metadata.size() > 1 and not has_cols_from_mismatched_srcs) {
-    auto const& first_meta = per_file_metadata.front();
+  if (per_file_metadata.size() > 1) {
+    auto& first_meta = per_file_metadata.front();
     auto const num_cols =
       first_meta.row_groups.size() > 0 ? first_meta.row_groups.front().columns.size() : 0;
-    auto const& schema = first_meta.schema;
-
-    // Verify that the input files have matching numbers of columns and schema.
-    for (auto const& pfm : per_file_metadata) {
-      if (pfm.row_groups.size() > 0) {
-        CUDF_EXPECTS(num_cols == pfm.row_groups.front().columns.size(),
-                     "All sources must have the same number of columns");
+    auto& schema = first_meta.schema;
+
+    // Validate that all sources have the same schema unless we are reading select columns
+    // from mismatched sources, in which case, we will only check the projected columns later.
+    if (not has_cols_from_mismatched_srcs) {
+      // Verify that the input files have matching numbers of columns and schema.
+      for (auto const& pfm : per_file_metadata) {
+        if (pfm.row_groups.size() > 0) {
+          CUDF_EXPECTS(num_cols == pfm.row_groups.front().columns.size(),
+                       "All sources must have the same number of columns");
+        }
+        CUDF_EXPECTS(schema == pfm.schema, "All sources must have the same schema");
       }
-      CUDF_EXPECTS(schema == pfm.schema, "All sources must have the same schema");
     }
+
+    // Mark the column schema in the first (default) source as nullable if it is nullable in any of
+    // the input sources. This avoids recomputing this within build_column() and
+    // populate_metadata().
+    std::for_each(
+      thrust::make_counting_iterator(static_cast<size_t>(1)),
+      thrust::make_counting_iterator(schema.size()),
+      [&](auto const schema_idx) {
+        if (schema[schema_idx].repetition_type == REQUIRED and
+            std::any_of(
+              per_file_metadata.begin() + 1, per_file_metadata.end(), [&](auto const& pfm) {
+                return pfm.schema[schema_idx].repetition_type != REQUIRED;
+              })) {
+          schema[schema_idx].repetition_type = OPTIONAL;
+        }
+      });
   }
 
   // Collect and apply arrow:schema from Parquet's key value metadata section
@@ -884,15 +907,8 @@ ColumnChunkMetaData const& aggregate_reader_metadata::get_column_metadata(size_t
                                                                           size_type src_idx,
                                                                           int schema_idx) const
 {
-  // schema_idx_maps will only have > 0 size when we are reading matching column projection from
-  // mismatched Parquet sources.
-  if (src_idx and not schema_idx_maps.empty()) {
-    auto const& schema_idx_map = schema_idx_maps[src_idx - 1];
-    CUDF_EXPECTS(schema_idx_map.find(schema_idx) != schema_idx_map.end(),
-                 "Unmapped schema index encountered in the specified source tree",
-                 std::range_error);
-    schema_idx = schema_idx_map.at(schema_idx);
-  }
+  // Map schema index to the provided source file index
+  schema_idx = map_schema_index(schema_idx, src_idx);
 
   auto col =
     std::find_if(per_file_metadata[src_idx].row_groups[row_group_index].columns.begin(),
@@ -924,6 +940,46 @@ aggregate_reader_metadata::get_rowgroup_metadata() const
   return rg_metadata;
 }
 
+bool aggregate_reader_metadata::is_schema_index_mapped(int schema_idx, int pfm_idx) const
+{
+  // Check if schema_idx or pfm_idx is invalid
+  CUDF_EXPECTS(
+    schema_idx >= 0 and pfm_idx >= 0 and pfm_idx < static_cast<int>(per_file_metadata.size()),
+    "Parquet reader encountered an invalid schema_idx or pfm_idx",
+    std::out_of_range);
+
+  // True if root index requested or zeroth file index or schema_idx maps doesn't exist. (i.e.
+  // schemas are identical).
+  if (schema_idx == 0 or pfm_idx == 0 or schema_idx_maps.empty()) { return true; }
+
+  // Check if mapped
+  auto const& schema_idx_map = schema_idx_maps[pfm_idx - 1];
+  return schema_idx_map.find(schema_idx) != schema_idx_map.end();
+}
+
+int aggregate_reader_metadata::map_schema_index(int schema_idx, int pfm_idx) const
+{
+  // Check if schema_idx or pfm_idx is invalid
+  CUDF_EXPECTS(
+    schema_idx >= 0 and pfm_idx >= 0 and pfm_idx < static_cast<int>(per_file_metadata.size()),
+    "Parquet reader encountered an invalid schema_idx or pfm_idx",
+    std::out_of_range);
+
+  // Check if pfm_idx is zero or root index requested or schema_idx_maps doesn't exist (i.e.
+  // schemas are identical).
+  if (schema_idx == 0 or pfm_idx == 0 or schema_idx_maps.empty()) { return schema_idx; }
+
+  // schema_idx_maps will only have > 0 size when we are reading matching column projection from
+  // mismatched Parquet sources.
+  auto const& schema_idx_map = schema_idx_maps[pfm_idx - 1];
+  CUDF_EXPECTS(schema_idx_map.find(schema_idx) != schema_idx_map.end(),
+               "Unmapped schema index encountered in the specified source tree",
+               std::out_of_range);
+
+  // Return the mapped schema idx.
+  return schema_idx_map.at(schema_idx);
+}
+
 std::string aggregate_reader_metadata::get_pandas_index() const
 {
   // Assumes that all input files have the same metadata
@@ -1185,8 +1241,8 @@ aggregate_reader_metadata::select_columns(
   // Compares two schema elements to be equal except their number of children
   auto const equal_to_except_num_children = [](SchemaElement const& lhs, SchemaElement const& rhs) {
     return lhs.type == rhs.type and lhs.converted_type == rhs.converted_type and
-           lhs.type_length == rhs.type_length and lhs.repetition_type == rhs.repetition_type and
-           lhs.name == rhs.name and lhs.decimal_scale == rhs.decimal_scale and
+           lhs.type_length == rhs.type_length and lhs.name == rhs.name and
+           lhs.decimal_scale == rhs.decimal_scale and
            lhs.decimal_precision == rhs.decimal_precision and lhs.field_id == rhs.field_id;
   };
 
@@ -1209,6 +1265,11 @@ aggregate_reader_metadata::select_columns(
                    "the selected path",
                    std::invalid_argument);
 
+      // Get the schema_idx_map for this data source (pfm)
+      auto& schema_idx_map = schema_idx_maps[pfm_idx - 1];
+      // Map the schema index from 0th tree (src) to the one in the current (dst) tree.
+      schema_idx_map[src_schema_idx] = dst_schema_idx;
+
       // If src_schema_elem is a stub, it does not exist in the column_name_info and column_buffer
       // hierarchy. So continue on with mapping.
       if (src_schema_elem.is_stub()) {
@@ -1262,15 +1323,6 @@ aggregate_reader_metadata::select_columns(
                        pfm_idx);
           });
       }
-
-      // We're at a leaf and this is an input column (one with actual data stored) so map it.
-      if (src_schema_elem.num_children == 0) {
-        // Get the schema_idx_map for this data source (pfm)
-        auto& schema_idx_map = schema_idx_maps[pfm_idx - 1];
-
-        // Map the schema index from 0th tree (src) to the one in the current (dst) tree.
-        schema_idx_map[src_schema_idx] = dst_schema_idx;
-      }
     };
 
   std::vector<int> output_column_schemas;
diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp
index 6f2863136b2..6487c92f48f 100644
--- a/cpp/src/io/parquet/reader_impl_helpers.hpp
+++ b/cpp/src/io/parquet/reader_impl_helpers.hpp
@@ -234,6 +234,26 @@ class aggregate_reader_metadata {
 
   [[nodiscard]] auto get_num_row_groups() const { return num_row_groups; }
 
+  /**
+   * @brief Checks if a schema index from 0th source is mapped to the specified file index
+   *
+   * @param schema_idx The index of the SchemaElement in the zeroth file.
+   * @param pfm_idx The index of the file (per_file_metadata) to check mappings for.
+   *
+   * @return True if schema index is mapped
+   */
+  [[nodiscard]] bool is_schema_index_mapped(int schema_idx, int pfm_idx) const;
+
+  /**
+   * @brief Maps schema index from 0th source file to the specified file index
+   *
+   * @param schema_idx The index of the SchemaElement in the zeroth file.
+   * @param pfm_idx The index of the file (per_file_metadata) to map the schema_idx to.
+   *
+   * @return Mapped schema index
+   */
+  [[nodiscard]] int map_schema_index(int schema_idx, int pfm_idx) const;
+
   /**
    * @brief Extracts the schema_idx'th SchemaElement from the pfm_idx'th file
    *
@@ -248,7 +268,7 @@ class aggregate_reader_metadata {
     CUDF_EXPECTS(
       schema_idx >= 0 and pfm_idx >= 0 and pfm_idx < static_cast<int>(per_file_metadata.size()),
       "Parquet reader encountered an invalid schema_idx or pfm_idx",
-      std::invalid_argument);
+      std::out_of_range);
     return per_file_metadata[pfm_idx].schema[schema_idx];
   }
 
@@ -256,7 +276,10 @@ class aggregate_reader_metadata {
   [[nodiscard]] auto&& get_key_value_metadata() && { return std::move(keyval_maps); }
 
   /**
-   * @brief Gets the concrete nesting depth of output cudf columns
+   * @brief Gets the concrete nesting depth of output cudf columns.
+   *
+   * Gets the nesting depth of the output cudf column for the given schema.
+   * The nesting depth must be equal for the given schema_index across all sources.
    *
    * @param schema_index Schema index of the input column
    *
diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu
index 52918f5bc80..8e67f233213 100644
--- a/cpp/src/io/parquet/reader_impl_preprocess.cu
+++ b/cpp/src/io/parquet/reader_impl_preprocess.cu
@@ -79,23 +79,30 @@ void print_pages(cudf::detail::hostdevice_vector<PageInfo>& pages, rmm::cuda_str
  * is indicated when adding new values.  This function generates the mappings of
  * the R/D levels to those start/end bounds
  *
- * @param remap Maps column schema index to the R/D remapping vectors for that column
- * @param src_col_schema The column schema to generate the new mapping for
+ * @param remap Maps column schema index to the R/D remapping vectors for that column for a
+ *              particular input source file
+ * @param src_col_schema The source column schema to generate the new mapping for
+ * @param mapped_src_col_schema Mapped column schema for src_file_idx'th file
+ * @param src_file_idx The input source file index for the column schema
  * @param md File metadata information
  */
-void generate_depth_remappings(std::map<int, std::pair<std::vector<int>, std::vector<int>>>& remap,
-                               int src_col_schema,
-                               aggregate_reader_metadata const& md)
+void generate_depth_remappings(
+  std::map<std::pair<int, int>, std::pair<std::vector<int>, std::vector<int>>>& remap,
+  int const src_col_schema,
+  int const mapped_src_col_schema,
+  int const src_file_idx,
+  aggregate_reader_metadata const& md)
 {
   // already generated for this level
-  if (remap.find(src_col_schema) != remap.end()) { return; }
-  auto schema   = md.get_schema(src_col_schema);
-  int max_depth = md.get_output_nesting_depth(src_col_schema);
+  if (remap.find({src_col_schema, src_file_idx}) != remap.end()) { return; }
+  auto const& schema   = md.get_schema(mapped_src_col_schema, src_file_idx);
+  auto const max_depth = md.get_output_nesting_depth(src_col_schema);
 
-  CUDF_EXPECTS(remap.find(src_col_schema) == remap.end(),
+  CUDF_EXPECTS(remap.find({src_col_schema, src_file_idx}) == remap.end(),
                "Attempting to remap a schema more than once");
   auto inserted =
-    remap.insert(std::pair<int, std::pair<std::vector<int>, std::vector<int>>>{src_col_schema, {}});
+    remap.insert(std::pair<std::pair<int, int>, std::pair<std::vector<int>, std::vector<int>>>{
+      {src_col_schema, src_file_idx}, {}});
   auto& depth_remap = inserted.first->second;
 
   std::vector<int>& rep_depth_remap = (depth_remap.first);
@@ -136,15 +143,15 @@ void generate_depth_remappings(std::map<int, std::pair<std::vector<int>, std::ve
     auto find_shallowest = [&](int r) {
       int shallowest = -1;
       int cur_depth  = max_depth - 1;
-      int schema_idx = src_col_schema;
+      int schema_idx = mapped_src_col_schema;
       while (schema_idx > 0) {
-        auto cur_schema = md.get_schema(schema_idx);
+        auto& cur_schema = md.get_schema(schema_idx, src_file_idx);
         if (cur_schema.max_repetition_level == r) {
           // if this is a repeated field, map it one level deeper
           shallowest = cur_schema.is_stub() ? cur_depth + 1 : cur_depth;
         }
         // if it's one-level encoding list
-        else if (cur_schema.is_one_level_list(md.get_schema(cur_schema.parent_idx))) {
+        else if (cur_schema.is_one_level_list(md.get_schema(cur_schema.parent_idx, src_file_idx))) {
           shallowest = cur_depth - 1;
         }
         if (!cur_schema.is_stub()) { cur_depth--; }
@@ -159,10 +166,10 @@ void generate_depth_remappings(std::map<int, std::pair<std::vector<int>, std::ve
   for (int s_idx = schema.max_definition_level; s_idx >= 0; s_idx--) {
     auto find_deepest = [&](int d) {
       SchemaElement prev_schema;
-      int schema_idx = src_col_schema;
+      int schema_idx = mapped_src_col_schema;
       int r1         = 0;
       while (schema_idx > 0) {
-        SchemaElement cur_schema = md.get_schema(schema_idx);
+        SchemaElement cur_schema = md.get_schema(schema_idx, src_file_idx);
         if (cur_schema.max_definition_level == d) {
           // if this is a repeated field, map it one level deeper
           r1 = cur_schema.is_stub() ? prev_schema.max_repetition_level
@@ -175,10 +182,10 @@ void generate_depth_remappings(std::map<int, std::pair<std::vector<int>, std::ve
 
       // we now know R1 from above. return the deepest nesting level that has the
       // same repetition level
-      schema_idx = src_col_schema;
+      schema_idx = mapped_src_col_schema;
       int depth  = max_depth - 1;
       while (schema_idx > 0) {
-        SchemaElement cur_schema = md.get_schema(schema_idx);
+        SchemaElement cur_schema = md.get_schema(schema_idx, src_file_idx);
         if (cur_schema.max_repetition_level == r1) {
           // if this is a repeated field, map it one level deeper
           depth = cur_schema.is_stub() ? depth + 1 : depth;
@@ -783,9 +790,20 @@ void reader::impl::allocate_nesting_info()
   std::vector<int> per_page_nesting_info_size(num_columns);
   auto iter = thrust::make_counting_iterator(size_type{0});
   std::transform(iter, iter + num_columns, per_page_nesting_info_size.begin(), [&](size_type i) {
+    // Schema index of the current input column
     auto const schema_idx = _input_columns[i].schema_idx;
-    auto const& schema    = _metadata->get_schema(schema_idx);
-    return max(schema.max_definition_level + 1, _metadata->get_output_nesting_depth(schema_idx));
+    // Get the max_definition_level of this column across all sources.
+    auto max_definition_level = _metadata->get_schema(schema_idx).max_definition_level + 1;
+    std::for_each(thrust::make_counting_iterator(static_cast<size_t>(1)),
+                  thrust::make_counting_iterator(_sources.size()),
+                  [&](auto const src_file_idx) {
+                    auto const& schema = _metadata->get_schema(
+                      _metadata->map_schema_index(schema_idx, src_file_idx), src_file_idx);
+                    max_definition_level =
+                      std::max(max_definition_level, schema.max_definition_level + 1);
+                  });
+
+    return std::max(max_definition_level, _metadata->get_output_nesting_depth(schema_idx));
   });
 
   // compute total # of page_nesting infos needed and allocate space. doing this in one
@@ -813,6 +831,8 @@ void reader::impl::allocate_nesting_info()
         page_nesting_decode_info.device_ptr() + src_info_index;
 
       pages[target_page_index + p_idx].nesting_info_size = per_page_nesting_info_size[idx];
+      // Set the number of output nesting levels from the zeroth source as nesting must be
+      // identical across sources.
       pages[target_page_index + p_idx].num_output_nesting_levels =
         _metadata->get_output_nesting_depth(src_col_schema);
 
@@ -821,25 +841,36 @@ void reader::impl::allocate_nesting_info()
     target_page_index += subpass.column_page_count[idx];
   }
 
+  // Reset the target_page_index
+  target_page_index = 0;
+
   // fill in
   int nesting_info_index = 0;
-  std::map<int, std::pair<std::vector<int>, std::vector<int>>> depth_remapping;
   for (size_t idx = 0; idx < _input_columns.size(); idx++) {
     auto const src_col_schema = _input_columns[idx].schema_idx;
 
-    // schema of the input column
-    auto& schema = _metadata->get_schema(src_col_schema);
     // real depth of the output cudf column hierarchy (1 == no nesting, 2 == 1 level, etc)
+    // nesting depth must be same across sources so getting it from the zeroth source is ok
     int const max_output_depth = _metadata->get_output_nesting_depth(src_col_schema);
 
+    // Map to store depths if this column has lists
+    std::map<std::pair<int, int>, std::pair<std::vector<int>, std::vector<int>>> depth_remapping;
     // if this column has lists, generate depth remapping
-    std::map<int, std::pair<std::vector<int>, std::vector<int>>> depth_remapping;
-    if (schema.max_repetition_level > 0) {
-      generate_depth_remappings(depth_remapping, src_col_schema, *_metadata);
-    }
+    std::for_each(
+      thrust::make_counting_iterator(static_cast<size_t>(0)),
+      thrust::make_counting_iterator(_sources.size()),
+      [&](auto const src_file_idx) {
+        auto const mapped_schema_idx = _metadata->map_schema_index(src_col_schema, src_file_idx);
+        if (_metadata->get_schema(mapped_schema_idx, src_file_idx).max_repetition_level > 0) {
+          generate_depth_remappings(
+            depth_remapping, src_col_schema, mapped_schema_idx, src_file_idx, *_metadata);
+        }
+      });
 
     // fill in host-side nesting info
-    int schema_idx  = src_col_schema;
+    int schema_idx = src_col_schema;
+    // This is okay as we only use this to check stubness of cur_schema and
+    // to get its parent's indices, both of which are one to one mapped.
     auto cur_schema = _metadata->get_schema(schema_idx);
     int cur_depth   = max_output_depth - 1;
     while (schema_idx > 0) {
@@ -848,6 +879,9 @@ void reader::impl::allocate_nesting_info()
       if (!cur_schema.is_stub()) {
         // initialize each page within the chunk
         for (size_t p_idx = 0; p_idx < subpass.column_page_count[idx]; p_idx++) {
+          // Source file index for the current page.
+          auto const src_file_idx =
+            pass.chunks[pages[target_page_index + p_idx].chunk_idx].src_file_idx;
           PageNestingInfo* pni =
             &page_nesting_info[nesting_info_index + (p_idx * per_page_nesting_info_size[idx])];
 
@@ -855,9 +889,11 @@ void reader::impl::allocate_nesting_info()
             &page_nesting_decode_info[nesting_info_index +
                                       (p_idx * per_page_nesting_info_size[idx])];
 
+          auto const mapped_src_col_schema =
+            _metadata->map_schema_index(src_col_schema, src_file_idx);
           // if we have lists, set our start and end depth remappings
-          if (schema.max_repetition_level > 0) {
-            auto remap = depth_remapping.find(src_col_schema);
+          if (_metadata->get_schema(mapped_src_col_schema, src_file_idx).max_repetition_level > 0) {
+            auto remap = depth_remapping.find({src_col_schema, src_file_idx});
             CUDF_EXPECTS(remap != depth_remapping.end(),
                          "Could not find depth remapping for schema");
             std::vector<int> const& rep_depth_remap = (remap->second.first);
@@ -871,11 +907,15 @@ void reader::impl::allocate_nesting_info()
             }
           }
 
+          // Get the schema from the current input source.
+          auto& actual_cur_schema = _metadata->get_schema(
+            _metadata->map_schema_index(schema_idx, src_file_idx), src_file_idx);
+
           // values indexed by output column index
-          nesting_info[cur_depth].max_def_level = cur_schema.max_definition_level;
+          nesting_info[cur_depth].max_def_level = actual_cur_schema.max_definition_level;
           pni[cur_depth].size                   = 0;
           pni[cur_depth].type =
-            to_type_id(cur_schema, _strings_to_categorical, _options.timestamp_type.id());
+            to_type_id(actual_cur_schema, _strings_to_categorical, _options.timestamp_type.id());
           pni[cur_depth].nullable = cur_schema.repetition_type == OPTIONAL;
         }
 
@@ -888,6 +928,8 @@ void reader::impl::allocate_nesting_info()
       cur_schema = _metadata->get_schema(schema_idx);
     }
 
+    // Offset the page and nesting info indices
+    target_page_index += subpass.column_page_count[idx];
     nesting_info_index += (per_page_nesting_info_size[idx] * subpass.column_page_count[idx]);
   }
 
diff --git a/python/cudf/cudf/tests/test_parquet.py b/python/cudf/cudf/tests/test_parquet.py
index 8b59a7eef08..7f1b0b1cd46 100644
--- a/python/cudf/cudf/tests/test_parquet.py
+++ b/python/cudf/cudf/tests/test_parquet.py
@@ -3822,8 +3822,8 @@ def test_parquet_reader_with_mismatched_tables(store_schema):
     df1 = cudf.DataFrame(
         {
             "i32": cudf.Series([None, None, None], dtype="int32"),
-            "i64": cudf.Series([1234, None, 123], dtype="int64"),
-            "list": list([[1, 2], [None, 4], [5, 6]]),
+            "i64": cudf.Series([1234, 467, 123], dtype="int64"),
+            "list": list([[1, 2], None, [None, 6]]),
             "time": cudf.Series([1234, 123, 4123], dtype="datetime64[ms]"),
             "str": ["vfd", None, "ghu"],
             "d_list": list(
@@ -3838,14 +3838,14 @@ def test_parquet_reader_with_mismatched_tables(store_schema):
 
     df2 = cudf.DataFrame(
         {
-            "str": ["abc", "def", None],
+            "str": ["abc", "def", "ghi"],
             "i64": cudf.Series([None, 65, 98], dtype="int64"),
             "times": cudf.Series([1234, None, 4123], dtype="datetime64[us]"),
-            "list": list([[7, 8], [9, 10], [None, 12]]),
+            "list": list([[7, 8], [9, 10], [11, 12]]),
             "d_list": list(
                 [
                     [pd.Timedelta(minutes=4), None],
-                    [None, None],
+                    None,
                     [pd.Timedelta(minutes=6), None],
                 ]
             ),
@@ -3900,38 +3900,27 @@ def test_parquet_reader_with_mismatched_structs():
         {
             "a": 1,
             "b": {
-                "inner_a": 10,
-                "inner_b": {"inner_inner_b": 1, "inner_inner_a": 2},
+                "a_a": 10,
+                "b_b": {"b_b_b": 1, "b_b_a": 2},
             },
             "c": 2,
         },
         {
             "a": 3,
-            "b": {"inner_a": 30, "inner_b": {"inner_inner_a": 210}},
+            "b": {"b_a": 30, "b_b": {"b_b_a": 210}},
             "c": 4,
         },
-        {"a": 5, "b": {"inner_a": 50, "inner_b": None}, "c": 6},
+        {"a": 5, "b": {"b_a": 50, "b_b": None}, "c": 6},
         {"a": 7, "b": None, "c": 8},
-        {"a": None, "b": {"inner_a": None, "inner_b": None}, "c": None},
-        None,
-        {
-            "a": None,
-            "b": {
-                "inner_a": None,
-                "inner_b": {"inner_inner_b": None, "inner_inner_a": 10},
-            },
-            "c": 10,
-        },
+        {"a": 5, "b": {"b_a": None, "b_b": None}, "c": None},
     ]
 
     data2 = [
-        {"a": 1, "b": {"inner_b": {"inner_inner_a": None}}},
-        {"a": 3, "b": {"inner_b": {"inner_inner_a": 1}}},
-        {"a": 5, "b": {"inner_b": None}},
-        {"a": 7, "b": {"inner_b": {"inner_inner_b": 1, "inner_inner_a": 0}}},
-        {"a": None, "b": {"inner_b": None}},
+        {"a": 1, "b": {"b_b": {"b_b_a": None}}},
+        {"a": 5, "b": {"b_b": None}},
+        {"a": 7, "b": {"b_b": {"b_b_b": 1, "b_b_a": 0}}},
+        {"a": None, "b": {"b_b": None}},
         None,
-        {"a": None, "b": {"inner_b": {"inner_inner_a": 1}}},
     ]
 
     # cuDF tables from struct data
@@ -3949,20 +3938,20 @@ def test_parquet_reader_with_mismatched_structs():
     # Read the struct.b.inner_b.inner_inner_a column from parquet
     got = cudf.read_parquet(
         [buf1, buf2],
-        columns=["struct.b.inner_b.inner_inner_a"],
+        columns=["struct.b.b_b.b_b_a"],
         allow_mismatched_pq_schemas=True,
     )
     got = (
         cudf.Series(got["struct"])
         .struct.field("b")
-        .struct.field("inner_b")
-        .struct.field("inner_inner_a")
+        .struct.field("b_b")
+        .struct.field("b_b_a")
     )
 
     # Read with chunked reader
     got_chunked = read_parquet_chunked(
         [buf1, buf2],
-        columns=["struct.b.inner_b.inner_inner_a"],
+        columns=["struct.b.b_b.b_b_a"],
         chunk_read_limit=240,
         pass_read_limit=240,
         allow_mismatched_pq_schemas=True,
@@ -3970,8 +3959,8 @@ def test_parquet_reader_with_mismatched_structs():
     got_chunked = (
         cudf.Series(got_chunked["struct"])
         .struct.field("b")
-        .struct.field("inner_b")
-        .struct.field("inner_inner_a")
+        .struct.field("b_b")
+        .struct.field("b_b_a")
     )
 
     # Construct the expected series
@@ -3979,12 +3968,12 @@ def test_parquet_reader_with_mismatched_structs():
         [
             cudf.Series(df1["struct"])
             .struct.field("b")
-            .struct.field("inner_b")
-            .struct.field("inner_inner_a"),
+            .struct.field("b_b")
+            .struct.field("b_b_a"),
             cudf.Series(df2["struct"])
             .struct.field("b")
-            .struct.field("inner_b")
-            .struct.field("inner_inner_a"),
+            .struct.field("b_b")
+            .struct.field("b_b_a"),
         ]
     ).reset_index(drop=True)
 
@@ -4023,12 +4012,12 @@ def test_parquet_reader_with_mismatched_schemas_error():
         )
 
     data1 = [
-        {"a": 1, "b": {"inner_a": 1, "inner_b": 6}},
-        {"a": 3, "b": {"inner_a": None, "inner_b": 2}},
+        {"a": 1, "b": {"b_a": 1, "b_b": 6}},
+        {"a": 3, "b": {"b_a": None, "b_b": 2}},
     ]
     data2 = [
-        {"b": {"inner_a": 1}, "c": "str"},
-        {"b": {"inner_a": None}, "c": None},
+        {"b": {"b_a": 1}, "c": "str"},
+        {"b": {"b_a": None}, "c": None},
     ]
 
     # cuDF tables from struct data
@@ -4059,6 +4048,191 @@ def test_parquet_reader_with_mismatched_schemas_error():
     ):
         cudf.read_parquet(
             [buf1, buf2],
-            columns=["struct.b.inner_b"],
+            columns=["struct.b.b_b"],
             allow_mismatched_pq_schemas=True,
         )
+
+
+def test_parquet_reader_mismatched_nullability():
+    # Ensure that we can faithfully read the tables with mismatched nullabilities
+    df1 = cudf.DataFrame(
+        {
+            "timedelta": cudf.Series([12, 54, 1231], dtype="timedelta64[ms]"),
+            "duration_list": list(
+                [
+                    [
+                        [
+                            [pd.Timedelta(minutes=1), pd.Timedelta(minutes=2)],
+                            None,
+                            [pd.Timedelta(minutes=8), None],
+                        ],
+                        None,
+                    ],
+                    None,
+                    [
+                        [
+                            [pd.Timedelta(minutes=1), pd.Timedelta(minutes=2)],
+                            [pd.Timedelta(minutes=5), pd.Timedelta(minutes=3)],
+                            [pd.Timedelta(minutes=8), pd.Timedelta(minutes=4)],
+                        ]
+                    ],
+                ]
+            ),
+            "int64": cudf.Series([1234, None, 4123], dtype="int64"),
+            "int32": cudf.Series([1234, 123, 4123], dtype="int32"),
+            "list": list([[1, 2], [1, 2], [1, 2]]),
+            "datetime": cudf.Series([1234, 123, 4123], dtype="datetime64[ms]"),
+            "string": cudf.Series(["kitten", "puppy", "cub"]),
+        }
+    )
+
+    df2 = cudf.DataFrame(
+        {
+            "timedelta": cudf.Series(
+                [None, None, None], dtype="timedelta64[ms]"
+            ),
+            "duration_list": list(
+                [
+                    [
+                        [
+                            [pd.Timedelta(minutes=1), pd.Timedelta(minutes=2)],
+                            [pd.Timedelta(minutes=8), pd.Timedelta(minutes=1)],
+                        ],
+                    ],
+                    [
+                        [
+                            [pd.Timedelta(minutes=1), pd.Timedelta(minutes=2)],
+                            [pd.Timedelta(minutes=5), pd.Timedelta(minutes=3)],
+                            [pd.Timedelta(minutes=8), pd.Timedelta(minutes=4)],
+                        ]
+                    ],
+                    [
+                        [
+                            [pd.Timedelta(minutes=1), pd.Timedelta(minutes=2)],
+                            [pd.Timedelta(minutes=5), pd.Timedelta(minutes=3)],
+                            [pd.Timedelta(minutes=8), pd.Timedelta(minutes=4)],
+                        ]
+                    ],
+                ]
+            ),
+            "int64": cudf.Series([1234, 123, 4123], dtype="int64"),
+            "int32": cudf.Series([1234, None, 4123], dtype="int32"),
+            "list": list([[1, 2], None, [1, 2]]),
+            "datetime": cudf.Series(
+                [1234, None, 4123], dtype="datetime64[ms]"
+            ),
+            "string": cudf.Series(["kitten", None, "cub"]),
+        }
+    )
+
+    # Write tables to parquet with arrow schema for compatibility for duration column(s)
+    fname1 = BytesIO()
+    df1.to_parquet(fname1, store_schema=True)
+    fname2 = BytesIO()
+    df2.to_parquet(fname2, store_schema=True)
+
+    # Read tables back with cudf and arrow in either order and compare
+    assert_eq(
+        cudf.read_parquet([fname1, fname2]),
+        cudf.concat([df1, df2]).reset_index(drop=True),
+    )
+    assert_eq(
+        cudf.read_parquet([fname2, fname1]),
+        cudf.concat([df2, df1]).reset_index(drop=True),
+    )
+
+
+def test_parquet_reader_mismatched_nullability_structs(tmpdir):
+    data1 = [
+        {
+            "a": "a",
+            "b": {
+                "b_a": 10,
+                "b_b": {"b_b_b": 1, "b_b_a": 12},
+            },
+            "c": [1, 2],
+        },
+        {
+            "a": "b",
+            "b": {
+                "b_a": 30,
+                "b_b": {"b_b_b": 2, "b_b_a": 2},
+            },
+            "c": [3, 4],
+        },
+        {
+            "a": "c",
+            "b": {
+                "b_a": 50,
+                "b_b": {"b_b_b": 4, "b_b_a": 5},
+            },
+            "c": [5, 6],
+        },
+        {
+            "a": "d",
+            "b": {
+                "b_a": 135,
+                "b_b": {"b_b_b": 12, "b_b_a": 32},
+            },
+            "c": [7, 8],
+        },
+        {
+            "a": "e",
+            "b": {
+                "b_a": 1,
+                "b_b": {"b_b_b": 1, "b_b_a": 5},
+            },
+            "c": [9, 10],
+        },
+        {
+            "a": "f",
+            "b": {
+                "b_a": 32,
+                "b_b": {"b_b_b": 1, "b_b_a": 6},
+            },
+            "c": [11, 12],
+        },
+    ]
+
+    data2 = [
+        {
+            "a": "g",
+            "b": {
+                "b_a": 10,
+                "b_b": {"b_b_b": None, "b_b_a": 2},
+            },
+            "c": None,
+        },
+        {"a": None, "b": {"b_a": None, "b_b": None}, "c": [15, 16]},
+        {"a": "j", "b": None, "c": [8, 10]},
+        {"a": None, "b": {"b_a": None, "b_b": None}, "c": None},
+        None,
+        {
+            "a": None,
+            "b": {"b_a": None, "b_b": {"b_b_b": 1}},
+            "c": [18, 19],
+        },
+        {"a": None, "b": None, "c": None},
+    ]
+
+    pa_table1 = pa.Table.from_pydict({"struct": data1})
+    df1 = cudf.DataFrame.from_arrow(pa_table1)
+
+    pa_table2 = pa.Table.from_pydict({"struct": data2})
+    df2 = cudf.DataFrame.from_arrow(pa_table2)
+
+    # Write tables to parquet
+    buf1 = BytesIO()
+    df1.to_parquet(buf1)
+    buf2 = BytesIO()
+    df2.to_parquet(buf2)
+
+    # Read tables back with cudf and compare with expected.
+    assert_eq(
+        cudf.read_parquet([buf1, buf2]),
+        cudf.concat([df1, df2]).reset_index(drop=True),
+    )
+    assert_eq(
+        cudf.read_parquet([buf2, buf1]),
+        cudf.concat([df2, df1]).reset_index(drop=True),
+    )

From 1b402dfc2f078656bcbbb8a0386008601620e6e2 Mon Sep 17 00:00:00 2001
From: Mike McCarty <mmccarty@nvidia.com>
Date: Wed, 11 Sep 2024 19:00:45 -0400
Subject: [PATCH 805/842] Recommending `miniforge` for conda install (#16782)

Recommending miniforge for conda install in README

Authors:
  - Mike McCarty (https://github.com/mmccarty)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16782
---
 README.md                  | 2 +-
 python/custreamz/README.md | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index f62f7885d63..8f8c2adac2f 100644
--- a/README.md
+++ b/README.md
@@ -79,7 +79,7 @@ pip install --extra-index-url=https://pypi.nvidia.com cudf-cu12
 
 ### Conda
 
-cuDF can be installed with conda (via [miniconda](https://docs.conda.io/projects/miniconda/en/latest/) or the full [Anaconda distribution](https://www.anaconda.com/download) from the `rapidsai` channel:
+cuDF can be installed with conda (via [miniforge](https://github.com/conda-forge/miniforge)) from the `rapidsai` channel:
 
 ```bash
 conda install -c rapidsai -c conda-forge -c nvidia \
diff --git a/python/custreamz/README.md b/python/custreamz/README.md
index 1509dac9e61..8da17ef09dc 100644
--- a/python/custreamz/README.md
+++ b/python/custreamz/README.md
@@ -54,7 +54,7 @@ Please see the [Demo Docker Repository](https://hub.docker.com/r/rapidsai/rapids
 
 ### Conda
 
-cuStreamz is installed with conda ([miniconda](https://conda.io/miniconda.html), or the full [Anaconda distribution](https://www.anaconda.com/download)) from the `rapidsai` or `rapidsai-nightly` channel:
+cuStraamz can be installed with conda (via [miniforge](https://github.com/conda-forge/miniforge)) from the `rapidsai` channel:
 
 Release:
 ```bash

From 3dbc33a5cb1cf7052cd67f5654b34594403fbfef Mon Sep 17 00:00:00 2001
From: Jihoon Son <ghoonson@gmail.com>
Date: Wed, 11 Sep 2024 19:11:20 -0700
Subject: [PATCH 806/842] Revert "Fix empty cluster handling in tdigest merge
 (#16675)" (#16800)

This PR reverts #16675, which has introduced another bug. Our nightly CI pipeline is failing because of this bug (https://github.com/NVIDIA/spark-rapids/issues/11463). I can reproduce the bug within a libcudf unit test. I will make another PR to fix both the original issue and the new bug.

Authors:
  - Jihoon Son (https://github.com/jihoonson)

Approvers:
  - Alessandro Bellina (https://github.com/abellina)
  - Nghia Truong (https://github.com/ttnghia)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/16800
---
 cpp/include/cudf/detail/tdigest/tdigest.hpp   | 17 ++--
 cpp/include/cudf_test/tdigest_utilities.cuh   | 20 ++---
 cpp/src/quantiles/tdigest/tdigest.cu          | 23 +++--
 .../quantiles/tdigest/tdigest_aggregation.cu  | 70 ++++++---------
 cpp/tests/groupby/tdigest_tests.cu            | 90 ++-----------------
 .../quantiles/percentile_approx_test.cpp      |  4 +-
 6 files changed, 62 insertions(+), 162 deletions(-)

diff --git a/cpp/include/cudf/detail/tdigest/tdigest.hpp b/cpp/include/cudf/detail/tdigest/tdigest.hpp
index 672b95e2d01..80a4460023f 100644
--- a/cpp/include/cudf/detail/tdigest/tdigest.hpp
+++ b/cpp/include/cudf/detail/tdigest/tdigest.hpp
@@ -143,29 +143,28 @@ std::unique_ptr<column> make_tdigest_column(size_type num_rows,
                                             rmm::device_async_resource_ref mr);
 
 /**
- * @brief Create a tdigest column of empty clusters.
+ * @brief Create an empty tdigest column.
  *
- * The column created contains the specified number of rows of empty clusters.
+ * An empty tdigest column contains a single row of length 0
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  * @param mr Device memory resource used to allocate the returned column's device memory.
  *
- * @returns A tdigest column of empty clusters.
+ * @returns An empty tdigest column.
  */
 CUDF_EXPORT
-std::unique_ptr<column> make_tdigest_column_of_empty_clusters(size_type num_rows,
-                                                              rmm::cuda_stream_view stream,
-                                                              rmm::device_async_resource_ref mr);
+std::unique_ptr<column> make_empty_tdigest_column(rmm::cuda_stream_view stream,
+                                                  rmm::device_async_resource_ref mr);
 
 /**
- * @brief Create a scalar of an empty tdigest cluster.
+ * @brief Create an empty tdigest scalar.
  *
- * The returned scalar is a struct_scalar that contains a single row of an empty cluster.
+ * An empty tdigest scalar is a struct_scalar that contains a single row of length 0
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  * @param mr Device memory resource used to allocate the returned column's device memory.
  *
- * @returns A scalar of an empty tdigest cluster.
+ * @returns An empty tdigest scalar.
  */
 std::unique_ptr<scalar> make_empty_tdigest_scalar(rmm::cuda_stream_view stream,
                                                   rmm::device_async_resource_ref mr);
diff --git a/cpp/include/cudf_test/tdigest_utilities.cuh b/cpp/include/cudf_test/tdigest_utilities.cuh
index be7d19b2227..1758790cd64 100644
--- a/cpp/include/cudf_test/tdigest_utilities.cuh
+++ b/cpp/include/cudf_test/tdigest_utilities.cuh
@@ -270,8 +270,8 @@ void tdigest_simple_all_nulls_aggregation(Func op)
     static_cast<column_view>(values).type(), tdigest_gen{}, op, values, delta);
 
   // NOTE: an empty tdigest column still has 1 row.
-  auto expected = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(
-    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  auto expected = cudf::tdigest::detail::make_empty_tdigest_column(
+    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, *expected);
 }
@@ -562,12 +562,12 @@ template <typename MergeFunc>
 void tdigest_merge_empty(MergeFunc merge_op)
 {
   // 3 empty tdigests all in the same group
-  auto a = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(
-    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
-  auto b = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(
-    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
-  auto c = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(
-    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  auto a = cudf::tdigest::detail::make_empty_tdigest_column(
+    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  auto b = cudf::tdigest::detail::make_empty_tdigest_column(
+    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  auto c = cudf::tdigest::detail::make_empty_tdigest_column(
+    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   std::vector<column_view> cols;
   cols.push_back(*a);
   cols.push_back(*b);
@@ -577,8 +577,8 @@ void tdigest_merge_empty(MergeFunc merge_op)
   auto const delta = 1000;
   auto result      = merge_op(*values, delta);
 
-  auto expected = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(
-    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  auto expected = cudf::tdigest::detail::make_empty_tdigest_column(
+    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *result);
 }
diff --git a/cpp/src/quantiles/tdigest/tdigest.cu b/cpp/src/quantiles/tdigest/tdigest.cu
index 76cd55bf994..0d017cf1f13 100644
--- a/cpp/src/quantiles/tdigest/tdigest.cu
+++ b/cpp/src/quantiles/tdigest/tdigest.cu
@@ -292,33 +292,32 @@ std::unique_ptr<column> make_tdigest_column(size_type num_rows,
   return make_structs_column(num_rows, std::move(children), 0, {}, stream, mr);
 }
 
-std::unique_ptr<column> make_tdigest_column_of_empty_clusters(size_type num_rows,
-                                                              rmm::cuda_stream_view stream,
-                                                              rmm::device_async_resource_ref mr)
+std::unique_ptr<column> make_empty_tdigest_column(rmm::cuda_stream_view stream,
+                                                  rmm::device_async_resource_ref mr)
 {
   auto offsets = cudf::make_fixed_width_column(
-    data_type(type_id::INT32), num_rows + 1, mask_state::UNALLOCATED, stream, mr);
+    data_type(type_id::INT32), 2, mask_state::UNALLOCATED, stream, mr);
   thrust::fill(rmm::exec_policy(stream),
                offsets->mutable_view().begin<size_type>(),
                offsets->mutable_view().end<size_type>(),
                0);
 
-  auto min_col = cudf::make_numeric_column(
-    data_type(type_id::FLOAT64), num_rows, mask_state::UNALLOCATED, stream, mr);
+  auto min_col =
+    cudf::make_numeric_column(data_type(type_id::FLOAT64), 1, mask_state::UNALLOCATED, stream, mr);
   thrust::fill(rmm::exec_policy(stream),
                min_col->mutable_view().begin<double>(),
                min_col->mutable_view().end<double>(),
                0);
-  auto max_col = cudf::make_numeric_column(
-    data_type(type_id::FLOAT64), num_rows, mask_state::UNALLOCATED, stream, mr);
+  auto max_col =
+    cudf::make_numeric_column(data_type(type_id::FLOAT64), 1, mask_state::UNALLOCATED, stream, mr);
   thrust::fill(rmm::exec_policy(stream),
                max_col->mutable_view().begin<double>(),
                max_col->mutable_view().end<double>(),
                0);
 
-  return make_tdigest_column(num_rows,
-                             cudf::make_empty_column(type_id::FLOAT64),
-                             cudf::make_empty_column(type_id::FLOAT64),
+  return make_tdigest_column(1,
+                             make_empty_column(type_id::FLOAT64),
+                             make_empty_column(type_id::FLOAT64),
                              std::move(offsets),
                              std::move(min_col),
                              std::move(max_col),
@@ -339,7 +338,7 @@ std::unique_ptr<column> make_tdigest_column_of_empty_clusters(size_type num_rows
 std::unique_ptr<scalar> make_empty_tdigest_scalar(rmm::cuda_stream_view stream,
                                                   rmm::device_async_resource_ref mr)
 {
-  auto contents = make_tdigest_column_of_empty_clusters(1, stream, mr)->release();
+  auto contents = make_empty_tdigest_column(stream, mr)->release();
   return std::make_unique<struct_scalar>(
     std::move(*std::make_unique<table>(std::move(contents.children))), true, stream, mr);
 }
diff --git a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu
index d591fb5c171..2dd25a7b890 100644
--- a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu
+++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu
@@ -366,8 +366,8 @@ std::unique_ptr<scalar> to_tdigest_scalar(std::unique_ptr<column>&& tdigest,
  * @param group_cluster_wl    Output.  The set of cluster weight limits for each group.
  * @param group_num_clusters  Output.  The number of output clusters for each input group.
  * @param group_cluster_offsets  Offsets per-group to the start of it's clusters
- * @param may_have_empty_clusters Whether or not there could be empty clusters. Must only be
- * set to false when there is no empty cluster, true otherwise.
+ * @param has_nulls Whether or not the input contains nulls
+ *
  */
 
 template <typename GroupInfo, typename NearestWeightFunc, typename CumulativeWeight>
@@ -379,7 +379,7 @@ CUDF_KERNEL void generate_cluster_limits_kernel(int delta,
                                                 double* group_cluster_wl,
                                                 size_type* group_num_clusters,
                                                 size_type const* group_cluster_offsets,
-                                                bool may_have_empty_clusters)
+                                                bool has_nulls)
 {
   int const tid = threadIdx.x + blockIdx.x * blockDim.x;
 
@@ -399,12 +399,11 @@ CUDF_KERNEL void generate_cluster_limits_kernel(int delta,
   // a group with nothing in it.
   group_num_clusters[group_index] = 0;
   if (total_weight <= 0) {
-    // If the input contains empty clusters, we can potentially have a group that also generates
-    // empty clusters because -all- of the input values are null or empty cluster. In that case, the
-    // `reduce_by_key` call in the tdigest generation step will need a location to store the unused
-    // reduction value for that group of nulls and empty clusters. These "stubs" will be
-    // postprocessed out afterwards.
-    if (may_have_empty_clusters) { group_num_clusters[group_index] = 1; }
+    // if the input contains nulls we can potentially have a group that generates no
+    // clusters because -all- of the input values are null.  in that case, the reduce_by_key call
+    // in the tdigest generation step will need a location to store the unused reduction value for
+    // that group of nulls. these "stubs" will be postprocessed out afterwards.
+    if (has_nulls) { group_num_clusters[group_index] = 1; }
     return;
   }
 
@@ -503,8 +502,7 @@ CUDF_KERNEL void generate_cluster_limits_kernel(int delta,
  * stream that falls before our current cluster limit
  * @param group_info         A functor which returns the info for the specified group (total weight,
  * size and start offset)
- * @param may_have_empty_clusters Whether or not there could be empty clusters. It should be
- * set to false only when there is no empty cluster.
+ * @param has_nulls          Whether or not the input data contains nulls
  * @param stream CUDA stream used for device memory operations and kernel launches.
  * @param mr Device memory resource used to allocate the returned column's device memory
  *
@@ -518,7 +516,7 @@ generate_group_cluster_info(int delta,
                             NearestWeight nearest_weight,
                             GroupInfo group_info,
                             CumulativeWeight cumulative_weight,
-                            bool may_have_empty_clusters,
+                            bool has_nulls,
                             rmm::cuda_stream_view stream,
                             rmm::device_async_resource_ref mr)
 {
@@ -537,7 +535,7 @@ generate_group_cluster_info(int delta,
     nullptr,
     group_num_clusters.begin(),
     nullptr,
-    may_have_empty_clusters);
+    has_nulls);
 
   // generate group cluster offsets (where the clusters for a given group start and end)
   auto group_cluster_offsets = cudf::make_numeric_column(
@@ -569,7 +567,7 @@ generate_group_cluster_info(int delta,
     group_cluster_wl.begin(),
     group_num_clusters.begin(),
     group_cluster_offsets->view().begin<size_type>(),
-    may_have_empty_clusters);
+    has_nulls);
 
   return {std::move(group_cluster_wl),
           std::move(group_cluster_offsets),
@@ -582,7 +580,7 @@ std::unique_ptr<column> build_output_column(size_type num_rows,
                                             std::unique_ptr<column>&& offsets,
                                             std::unique_ptr<column>&& min_col,
                                             std::unique_ptr<column>&& max_col,
-                                            bool may_have_empty_clusters,
+                                            bool has_nulls,
                                             rmm::cuda_stream_view stream,
                                             rmm::device_async_resource_ref mr)
 {
@@ -597,7 +595,7 @@ std::unique_ptr<column> build_output_column(size_type num_rows,
                           size_type i) { return is_stub_weight(offsets[i]) ? 1 : 0; };
 
   size_type const num_stubs = [&]() {
-    if (!may_have_empty_clusters) { return 0; }
+    if (!has_nulls) { return 0; }
     auto iter = cudf::detail::make_counting_transform_iterator(
       0, cuda::proclaim_return_type<size_type>(is_stub_digest));
     return thrust::reduce(rmm::exec_policy(stream), iter, iter + num_rows);
@@ -663,10 +661,6 @@ std::unique_ptr<column> build_output_column(size_type num_rows,
                                                     mr);
 }
 
-/**
- * @brief A functor which returns the cluster index within a group that the value at
- * the given value index falls into.
- */
 template <typename CumulativeWeight>
 struct compute_tdigests_keys_fn {
   int const delta;
@@ -712,8 +706,8 @@ struct compute_tdigests_keys_fn {
  * boundaries.
  *
  * @param delta              tdigest compression level
- * @param centroids_begin    Beginning of the range of centroids.
- * @param centroids_end      End of the range of centroids.
+ * @param values_begin       Beginning of the range of input values.
+ * @param values_end         End of the range of input values.
  * @param cumulative_weight  Functor which returns cumulative weight and group information for
  * an absolute input value index.
  * @param min_col            Column containing the minimum value per group.
@@ -721,8 +715,7 @@ struct compute_tdigests_keys_fn {
  * @param group_cluster_wl   Cluster weight limits for each group.
  * @param group_cluster_offsets R-value reference of offsets into the cluster weight limits.
  * @param total_clusters     Total number of clusters in all groups.
- * @param may_have_empty_clusters Whether or not there could be empty clusters. It should be
- * set to false only when there is no empty cluster.
+ * @param has_nulls          Whether or not the input contains nulls
  * @param stream CUDA stream used for device memory operations and kernel launches.
  * @param mr Device memory resource used to allocate the returned column's device memory
  *
@@ -738,7 +731,7 @@ std::unique_ptr<column> compute_tdigests(int delta,
                                          rmm::device_uvector<double> const& group_cluster_wl,
                                          std::unique_ptr<column>&& group_cluster_offsets,
                                          size_type total_clusters,
-                                         bool may_have_empty_clusters,
+                                         bool has_nulls,
                                          rmm::cuda_stream_view stream,
                                          rmm::device_async_resource_ref mr)
 {
@@ -757,9 +750,7 @@ std::unique_ptr<column> compute_tdigests(int delta,
   //   double       // max
   // }
   //
-  if (total_clusters == 0) {
-    return cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(1, stream, mr);
-  }
+  if (total_clusters == 0) { return cudf::tdigest::detail::make_empty_tdigest_column(stream, mr); }
 
   // each input group represents an individual tdigest.  within each tdigest, we want the keys
   // to represent cluster indices (for example, if a tdigest had 100 clusters, the keys should fall
@@ -802,7 +793,7 @@ std::unique_ptr<column> compute_tdigests(int delta,
                              std::move(group_cluster_offsets),
                              std::move(min_col),
                              std::move(max_col),
-                             may_have_empty_clusters,
+                             has_nulls,
                              stream,
                              mr);
 }
@@ -1154,13 +1145,8 @@ std::unique_ptr<column> merge_tdigests(tdigest_column_view const& tdv,
   auto merged =
     cudf::detail::concatenate(tdigest_views, stream, cudf::get_current_device_resource_ref());
 
-  auto merged_weights = merged->get_column(1).view();
-  // If there are no values, we can simply return a column that has only empty tdigests.
-  if (merged_weights.size() == 0) {
-    return cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(num_groups, stream, mr);
-  }
-
   // generate cumulative weights
+  auto merged_weights     = merged->get_column(1).view();
   auto cumulative_weights = cudf::make_numeric_column(
     data_type{type_id::FLOAT64}, merged_weights.size(), mask_state::UNALLOCATED, stream);
   auto keys = cudf::detail::make_counting_transform_iterator(
@@ -1175,10 +1161,6 @@ std::unique_ptr<column> merge_tdigests(tdigest_column_view const& tdv,
 
   auto const delta = max_centroids;
 
-  // We do not know whether there is any empty cluster in the input without actually reading the
-  // data, which could be expensive. So, we just assume that there could be empty clusters.
-  auto const may_have_empty_clusters = true;
-
   // generate cluster info
   auto [group_cluster_wl, group_cluster_offsets, total_clusters] = generate_group_cluster_info(
     delta,
@@ -1195,7 +1177,7 @@ std::unique_ptr<column> merge_tdigests(tdigest_column_view const& tdv,
       group_labels,
       group_offsets,
       {tdigest_offsets.begin<size_type>(), static_cast<size_t>(tdigest_offsets.size())}},
-    may_have_empty_clusters,
+    false,
     stream,
     mr);
 
@@ -1220,7 +1202,7 @@ std::unique_ptr<column> merge_tdigests(tdigest_column_view const& tdv,
     group_cluster_wl,
     std::move(group_cluster_offsets),
     total_clusters,
-    may_have_empty_clusters,
+    false,
     stream,
     mr);
 }
@@ -1285,9 +1267,7 @@ std::unique_ptr<column> group_tdigest(column_view const& col,
                                       rmm::cuda_stream_view stream,
                                       rmm::device_async_resource_ref mr)
 {
-  if (col.size() == 0) {
-    return cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(1, stream, mr);
-  }
+  if (col.size() == 0) { return cudf::tdigest::detail::make_empty_tdigest_column(stream, mr); }
 
   auto const delta = max_centroids;
   return cudf::type_dispatcher(col.type(),
@@ -1313,7 +1293,7 @@ std::unique_ptr<column> group_merge_tdigest(column_view const& input,
   tdigest_column_view tdv(input);
 
   if (num_groups == 0 || input.size() == 0) {
-    return cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(1, stream, mr);
+    return cudf::tdigest::detail::make_empty_tdigest_column(stream, mr);
   }
 
   // bring group offsets back to the host
diff --git a/cpp/tests/groupby/tdigest_tests.cu b/cpp/tests/groupby/tdigest_tests.cu
index 3780dbb1d95..baa59026b07 100644
--- a/cpp/tests/groupby/tdigest_tests.cu
+++ b/cpp/tests/groupby/tdigest_tests.cu
@@ -469,16 +469,16 @@ TEST_F(TDigestMergeTest, EmptyGroups)
   cudf::test::fixed_width_column_wrapper<int> keys{0, 0, 0, 0, 0, 0, 0};
   int const delta = 1000;
 
-  auto a = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(
-    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  auto a = cudf::tdigest::detail::make_empty_tdigest_column(
+    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto b = cudf::type_dispatcher(
     static_cast<cudf::column_view>(values_b).type(), tdigest_gen_grouped{}, keys, values_b, delta);
-  auto c = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(
-    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  auto c = cudf::tdigest::detail::make_empty_tdigest_column(
+    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   auto d = cudf::type_dispatcher(
     static_cast<cudf::column_view>(values_d).type(), tdigest_gen_grouped{}, keys, values_d, delta);
-  auto e = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(
-    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  auto e = cudf::tdigest::detail::make_empty_tdigest_column(
+    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
 
   std::vector<cudf::column_view> cols;
   cols.push_back(*a);
@@ -507,81 +507,3 @@ TEST_F(TDigestMergeTest, EmptyGroups)
 
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *result.second[0].results[0]);
 }
-
-std::unique_ptr<cudf::table> do_agg(
-  cudf::column_view key,
-  cudf::column_view val,
-  std::function<std::unique_ptr<cudf::groupby_aggregation>()> make_agg)
-{
-  std::vector<cudf::column_view> keys;
-  keys.push_back(key);
-  cudf::table_view const key_table(keys);
-
-  cudf::groupby::groupby gb(key_table);
-  std::vector<cudf::groupby::aggregation_request> requests;
-  cudf::groupby::aggregation_request req;
-  req.values = val;
-  req.aggregations.push_back(make_agg());
-  requests.push_back(std::move(req));
-
-  auto result = gb.aggregate(std::move(requests));
-
-  std::vector<std::unique_ptr<cudf::column>> result_columns;
-  for (auto&& c : result.first->release()) {
-    result_columns.push_back(std::move(c));
-  }
-
-  EXPECT_EQ(result.second.size(), 1);
-  EXPECT_EQ(result.second[0].results.size(), 1);
-  result_columns.push_back(std::move(result.second[0].results[0]));
-
-  return std::make_unique<cudf::table>(std::move(result_columns));
-}
-
-TEST_F(TDigestMergeTest, AllGroupsHaveEmptyClusters)
-{
-  // The input must be sorted by the key.
-  // See `aggregate_result_functor::operator()<aggregation::TDIGEST>` for details.
-  auto const keys      = cudf::test::fixed_width_column_wrapper<int32_t>{{0, 0, 1, 1, 2}};
-  auto const keys_view = cudf::column_view(keys);
-  auto val_elems  = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i; });
-  auto val_valids = cudf::detail::make_counting_transform_iterator(0, [](auto i) {
-    // All values are null
-    return false;
-  });
-  auto const vals = cudf::test::fixed_width_column_wrapper<int32_t>{
-    val_elems, val_elems + keys_view.size(), val_valids};
-
-  auto const delta = 10000;
-
-  // Compute tdigest. The result should have 3 empty clusters, one per group.
-  auto const compute_result = do_agg(keys_view, cudf::column_view(vals), [&delta]() {
-    return cudf::make_tdigest_aggregation<cudf::groupby_aggregation>(delta);
-  });
-
-  auto const expected_computed_keys = cudf::test::fixed_width_column_wrapper<int32_t>{{0, 1, 2}};
-  cudf::column_view const expected_computed_keys_view{expected_computed_keys};
-  auto const expected_computed_vals = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(
-    expected_computed_keys_view.size(),
-    cudf::get_default_stream(),
-    rmm::mr::get_current_device_resource());
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_computed_keys_view, compute_result->get_column(0).view());
-  // The computed values are nullable even though the input values are not.
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_computed_vals->view(),
-                                 compute_result->get_column(1).view());
-
-  // Merge tdigest. The result should have 3 empty clusters, one per group.
-  auto const merge_result =
-    do_agg(compute_result->get_column(0).view(), compute_result->get_column(1).view(), [&delta]() {
-      return cudf::make_merge_tdigest_aggregation<cudf::groupby_aggregation>(delta);
-    });
-
-  auto const expected_merged_keys = cudf::test::fixed_width_column_wrapper<int32_t>{{0, 1, 2}};
-  cudf::column_view const expected_merged_keys_view{expected_merged_keys};
-  auto const expected_merged_vals = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(
-    expected_merged_keys_view.size(),
-    cudf::get_default_stream(),
-    rmm::mr::get_current_device_resource());
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_merged_keys_view, merge_result->get_column(0).view());
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_merged_vals->view(), merge_result->get_column(1).view());
-}
diff --git a/cpp/tests/quantiles/percentile_approx_test.cpp b/cpp/tests/quantiles/percentile_approx_test.cpp
index 7359f0406fc..915717713df 100644
--- a/cpp/tests/quantiles/percentile_approx_test.cpp
+++ b/cpp/tests/quantiles/percentile_approx_test.cpp
@@ -371,8 +371,8 @@ struct PercentileApproxTest : public cudf::test::BaseFixture {};
 
 TEST_F(PercentileApproxTest, EmptyInput)
 {
-  auto empty_ = cudf::tdigest::detail::make_tdigest_column_of_empty_clusters(
-    1, cudf::get_default_stream(), cudf::get_current_device_resource_ref());
+  auto empty_ = cudf::tdigest::detail::make_empty_tdigest_column(
+    cudf::get_default_stream(), cudf::get_current_device_resource_ref());
   cudf::test::fixed_width_column_wrapper<double> percentiles{0.0, 0.25, 0.3};
 
   std::vector<cudf::column_view> input;

From 264a4445fd3db48b5e09094c828de87abb7edba9 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Thu, 12 Sep 2024 12:06:32 +0000
Subject: [PATCH 807/842] test

---
 .../cudf/pandas/scripts/conftest-patch.py     |  2 +-
 .../cudf/pandas/scripts/run-pandas-tests.sh   |  7 +++++-
 .../pandas/scripts/summarize-test-results.py  | 23 +++++++++++++++++++
 python/cudf/cudf_pandas_tests/conftest.py     |  2 +-
 4 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/python/cudf/cudf/pandas/scripts/conftest-patch.py b/python/cudf/cudf/pandas/scripts/conftest-patch.py
index db19d6a6471..d855b3468a0 100644
--- a/python/cudf/cudf/pandas/scripts/conftest-patch.py
+++ b/python/cudf/cudf/pandas/scripts/conftest-patch.py
@@ -88,7 +88,7 @@ def pytest_runtest_teardown(item, nextitem):
     ):
         # Write the function call counts to a file
         worker_id = os.getenv("PYTEST_XDIST_WORKER", "master")
-        output_file = f'function_call_counts_{os.path.basename(item.nodeid.split("::")[0])}_{worker_id}.json'
+        output_file = f'{item.nodeid.split("::")[0].replace("/", "__")}_{worker_id}_metrics.json'
         with open(output_file, "w") as f:
             json.dump(dict(function_call_counts), f, indent=4)
         print(f"Function call counts have been written to {output_file}")
diff --git a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
index 36c73ba3e26..4dc3ac03cc7 100755
--- a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
+++ b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
@@ -134,7 +134,8 @@ TEST_THAT_CRASH_PYTEST_WORKERS="not test_bitmasks_pyarrow \
 and not test_large_string_pyarrow \
 and not test_interchange_from_corrected_buffer_dtypes \
 and not test_eof_states \
-and not test_array_tz"
+and not test_array_tz \
+and not test_groupby_raises_category"
 
 # TODO: Remove "not db" once a postgres & mysql container is set up on the CI
 PANDAS_CI="1" timeout 600m python -m pytest -p cudf.pandas \
@@ -145,9 +146,13 @@ PANDAS_CI="1" timeout 600m python -m pytest -p cudf.pandas \
     "$@" || [ $? = 1 ]  # Exit success if exit code was 1 (permit test failures but not other errors)
 
 mv *.json ..
+pwd
 ls -al
+pwd
 ls -al tests/
 cd ..
+pwd
 ls -al
+pwd
 ls -al pandas-tests/
 rm -rf pandas-testing/pandas-tests/
diff --git a/python/cudf/cudf/pandas/scripts/summarize-test-results.py b/python/cudf/cudf/pandas/scripts/summarize-test-results.py
index ffd2abb960d..6c238de0f12 100644
--- a/python/cudf/cudf/pandas/scripts/summarize-test-results.py
+++ b/python/cudf/cudf/pandas/scripts/summarize-test-results.py
@@ -12,7 +12,9 @@
 """
 
 import argparse
+import glob
 import json
+import os
 
 from rich.console import Console
 from rich.table import Table
@@ -57,6 +59,27 @@ def get_per_module_results(log_file_name):
                 per_module_results[module_name].setdefault(outcome, 0)
                 per_module_results[module_name]["total"] += 1
                 per_module_results[module_name][outcome] += 1
+
+    for key, value in per_module_results.items():
+        processed_name = key.replace("/", "__") + "_*_metrics.json"
+        # Assuming the directory is the same as the module name's directory
+        # directory = os.path.dirname(os.getcwd())
+        pattern = os.path.join(
+            "/nvme/0/pgali/cudf/pandas-testing", processed_name
+        )
+        matching_files = glob.glob(pattern)
+        for file in matching_files:
+            with open(file) as f:
+                function_call_counts = json.load(f)
+            per_module_results[key]["_slow_function_call"] = (
+                per_module_results[key].get("_slow_function_call", 0)
+                + function_call_counts.get("_slow_function_call", 0)
+            )
+            per_module_results[key]["_fast_function_call"] = (
+                per_module_results[key].get("_fast_function_call", 0)
+                + function_call_counts.get("_fast_function_call", 0)
+            )
+            # value["function_call_counts"] = function_call_counts
     return per_module_results
 
 
diff --git a/python/cudf/cudf_pandas_tests/conftest.py b/python/cudf/cudf_pandas_tests/conftest.py
index 7c4bec4e8a8..d4dc4047df1 100644
--- a/python/cudf/cudf_pandas_tests/conftest.py
+++ b/python/cudf/cudf_pandas_tests/conftest.py
@@ -56,7 +56,7 @@ def pytest_runtest_teardown(item, nextitem):
     ):
         # Write the function call counts to a file
         worker_id = os.getenv("PYTEST_XDIST_WORKER", "master")
-        output_file = f'function_call_counts_{os.path.basename(item.nodeid.split("::")[0])}_{worker_id}.json'
+        output_file = f'{item.nodeid.split("::")[0].replace("/", "__")}_{worker_id}_metrics.json'
         with open(output_file, "w") as f:
             json.dump(dict(function_call_counts), f)
             # for func, count in function_call_counts.items():

From 337cef83d6b253456f9650987dd69d50b3e1c731 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Thu, 12 Sep 2024 15:23:59 +0000
Subject: [PATCH 808/842] test

---
 python/cudf/cudf/pandas/scripts/summarize-test-results.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/python/cudf/cudf/pandas/scripts/summarize-test-results.py b/python/cudf/cudf/pandas/scripts/summarize-test-results.py
index 6c238de0f12..babda8313af 100644
--- a/python/cudf/cudf/pandas/scripts/summarize-test-results.py
+++ b/python/cudf/cudf/pandas/scripts/summarize-test-results.py
@@ -63,10 +63,8 @@ def get_per_module_results(log_file_name):
     for key, value in per_module_results.items():
         processed_name = key.replace("/", "__") + "_*_metrics.json"
         # Assuming the directory is the same as the module name's directory
-        # directory = os.path.dirname(os.getcwd())
-        pattern = os.path.join(
-            "/nvme/0/pgali/cudf/pandas-testing", processed_name
-        )
+        directory = os.path.dirname(log_file_name)
+        pattern = os.path.join(directory, processed_name)
         matching_files = glob.glob(pattern)
         for file in matching_files:
             with open(file) as f:

From 5efca92297798e0179ec5acc9828c7b5e8c2c252 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Thu, 12 Sep 2024 17:57:17 +0000
Subject: [PATCH 809/842] test

---
 ci/cudf_pandas_scripts/pandas-tests/job-summary.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/ci/cudf_pandas_scripts/pandas-tests/job-summary.py b/ci/cudf_pandas_scripts/pandas-tests/job-summary.py
index 93a815838b7..34742728ed8 100644
--- a/ci/cudf_pandas_scripts/pandas-tests/job-summary.py
+++ b/ci/cudf_pandas_scripts/pandas-tests/job-summary.py
@@ -69,7 +69,7 @@ def emoji_failed(x):
 main_df = pd.DataFrame.from_dict(main_results, orient="index").sort_index()
 diff_df = pr_df - main_df
 
-pr_df = pr_df[["total", "passed", "failed", "skipped"]]
+pr_df = pr_df[["total", "passed", "failed", "skipped", "_slow_function_call", "_fast_function_call"]]
 diff_df = diff_df[["total", "passed", "failed", "skipped"]]
 diff_df.columns = diff_df.columns + "_diff"
 diff_df["passed_diff"] = diff_df["passed_diff"].map(emoji_passed)
@@ -89,6 +89,8 @@ def emoji_failed(x):
         "passed_diff": "Passed delta",
         "failed_diff": "Failed delta",
         "skipped_diff": "Skipped delta",
+        "_slow_function_call" : "Slow function calls",
+        "_fast_function_call" : "Fast function calls",
     }
 )
 df = df.sort_values(by=["Failed tests", "Skipped tests"], ascending=False)

From 5e6ec9871eb1974e004d45e1f7ad24218d1967c6 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Fri, 13 Sep 2024 21:27:49 +0000
Subject: [PATCH 810/842] test

---
 ci/cudf_pandas_scripts/pandas-tests/job-summary.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/ci/cudf_pandas_scripts/pandas-tests/job-summary.py b/ci/cudf_pandas_scripts/pandas-tests/job-summary.py
index 34742728ed8..3c8a5b9cc96 100644
--- a/ci/cudf_pandas_scripts/pandas-tests/job-summary.py
+++ b/ci/cudf_pandas_scripts/pandas-tests/job-summary.py
@@ -68,8 +68,10 @@ def emoji_failed(x):
 pr_df = pd.DataFrame.from_dict(pr_results, orient="index").sort_index()
 main_df = pd.DataFrame.from_dict(main_results, orient="index").sort_index()
 diff_df = pr_df - main_df
+pr_df['Slow calls %'] = (pr_df['_slow_function_call']/(pr_df['_slow_function_call'] + pr_df['_fast_function_call']))*100.0
+pr_df['Fast calls %'] = (pr_df['_fast_function_call']/(pr_df['_slow_function_call'] + pr_df['_fast_function_call']))*100.0
+pr_df = pr_df[["total", "passed", "failed", "skipped", 'Slow calls %', 'Fast calls %']]
 
-pr_df = pr_df[["total", "passed", "failed", "skipped", "_slow_function_call", "_fast_function_call"]]
 diff_df = diff_df[["total", "passed", "failed", "skipped"]]
 diff_df.columns = diff_df.columns + "_diff"
 diff_df["passed_diff"] = diff_df["passed_diff"].map(emoji_passed)
@@ -89,8 +91,8 @@ def emoji_failed(x):
         "passed_diff": "Passed delta",
         "failed_diff": "Failed delta",
         "skipped_diff": "Skipped delta",
-        "_slow_function_call" : "Slow function calls",
-        "_fast_function_call" : "Fast function calls",
+        # "_slow_function_call" : "Slow function calls",
+        # "_fast_function_call" : "Fast function calls",
     }
 )
 df = df.sort_values(by=["Failed tests", "Skipped tests"], ascending=False)

From 2200ec2e9b4893f7bc9627de7ccca7c0e3f204c6 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Sat, 14 Sep 2024 00:19:18 +0000
Subject: [PATCH 811/842] test

---
 ci/cudf_pandas_scripts/pandas-tests/job-summary.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/ci/cudf_pandas_scripts/pandas-tests/job-summary.py b/ci/cudf_pandas_scripts/pandas-tests/job-summary.py
index 3c8a5b9cc96..b6c234d6ad3 100644
--- a/ci/cudf_pandas_scripts/pandas-tests/job-summary.py
+++ b/ci/cudf_pandas_scripts/pandas-tests/job-summary.py
@@ -68,8 +68,13 @@ def emoji_failed(x):
 pr_df = pd.DataFrame.from_dict(pr_results, orient="index").sort_index()
 main_df = pd.DataFrame.from_dict(main_results, orient="index").sort_index()
 diff_df = pr_df - main_df
-pr_df['Slow calls %'] = (pr_df['_slow_function_call']/(pr_df['_slow_function_call'] + pr_df['_fast_function_call']))*100.0
-pr_df['Fast calls %'] = (pr_df['_fast_function_call']/(pr_df['_slow_function_call'] + pr_df['_fast_function_call']))*100.0
+pr_df['Slow calls %'] = ((pr_df['_slow_function_call']/(pr_df['_slow_function_call'] + pr_df['_fast_function_call']))*100.0).round(2)
+pr_df['Fast calls %'] = ((pr_df['_fast_function_call']/(pr_df['_slow_function_call'] + pr_df['_fast_function_call']))*100.0).round(2)
+
+# Add '%' suffix to 'Slow calls %' and 'Fast calls %' columns
+pr_df['Slow calls %'] = pr_df['Slow calls %'].astype(str) + ' %'
+pr_df['Fast calls %'] = pr_df['Fast calls %'].astype(str) + ' %'
+
 pr_df = pr_df[["total", "passed", "failed", "skipped", 'Slow calls %', 'Fast calls %']]
 
 diff_df = diff_df[["total", "passed", "failed", "skipped"]]

From 1b7b5a9a2e2bbccb68dbefbfdf4c21521546235e Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Sat, 14 Sep 2024 03:51:01 +0000
Subject: [PATCH 812/842] test

---
 ci/cudf_pandas_scripts/pandas-tests/job-summary.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/ci/cudf_pandas_scripts/pandas-tests/job-summary.py b/ci/cudf_pandas_scripts/pandas-tests/job-summary.py
index b6c234d6ad3..588a5f1bd04 100644
--- a/ci/cudf_pandas_scripts/pandas-tests/job-summary.py
+++ b/ci/cudf_pandas_scripts/pandas-tests/job-summary.py
@@ -68,13 +68,16 @@ def emoji_failed(x):
 pr_df = pd.DataFrame.from_dict(pr_results, orient="index").sort_index()
 main_df = pd.DataFrame.from_dict(main_results, orient="index").sort_index()
 diff_df = pr_df - main_df
-pr_df['Slow calls %'] = ((pr_df['_slow_function_call']/(pr_df['_slow_function_call'] + pr_df['_fast_function_call']))*100.0).round(2)
-pr_df['Fast calls %'] = ((pr_df['_fast_function_call']/(pr_df['_slow_function_call'] + pr_df['_fast_function_call']))*100.0).round(2)
+pr_df['Slow calls %'] = ((pr_df['_slow_function_call']/(pr_df['_slow_function_call'] + pr_df['_fast_function_call']))*100.0).round(1)
+pr_df['Fast calls %'] = ((pr_df['_fast_function_call']/(pr_df['_slow_function_call'] + pr_df['_fast_function_call']))*100.0).round(1)
 
 # Add '%' suffix to 'Slow calls %' and 'Fast calls %' columns
 pr_df['Slow calls %'] = pr_df['Slow calls %'].astype(str) + ' %'
 pr_df['Fast calls %'] = pr_df['Fast calls %'].astype(str) + ' %'
 
+pr_df['Slow calls %'] = pr_df['Slow calls %'].replace('nan %', '0 %')
+pr_df['Fast calls %'] = pr_df['Fast calls %'].replace('nan %', '0 %')
+
 pr_df = pr_df[["total", "passed", "failed", "skipped", 'Slow calls %', 'Fast calls %']]
 
 diff_df = diff_df[["total", "passed", "failed", "skipped"]]

From b0e4955f05672cfa274a88af00570b70b755fe7e Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Sun, 15 Sep 2024 01:51:56 +0000
Subject: [PATCH 813/842] test

---
 .../pandas-tests/job-summary.py                 | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/ci/cudf_pandas_scripts/pandas-tests/job-summary.py b/ci/cudf_pandas_scripts/pandas-tests/job-summary.py
index 588a5f1bd04..7e85aa0a178 100644
--- a/ci/cudf_pandas_scripts/pandas-tests/job-summary.py
+++ b/ci/cudf_pandas_scripts/pandas-tests/job-summary.py
@@ -68,18 +68,17 @@ def emoji_failed(x):
 pr_df = pd.DataFrame.from_dict(pr_results, orient="index").sort_index()
 main_df = pd.DataFrame.from_dict(main_results, orient="index").sort_index()
 diff_df = pr_df - main_df
-pr_df['Slow calls %'] = ((pr_df['_slow_function_call']/(pr_df['_slow_function_call'] + pr_df['_fast_function_call']))*100.0).round(1)
-pr_df['Fast calls %'] = ((pr_df['_fast_function_call']/(pr_df['_slow_function_call'] + pr_df['_fast_function_call']))*100.0).round(1)
+pr_df['CPU Usage'] = ((pr_df['_slow_function_call']/(pr_df['_slow_function_call'] + pr_df['_fast_function_call']))*100.0).round(1)
+pr_df['GPU Usage'] = ((pr_df['_fast_function_call']/(pr_df['_slow_function_call'] + pr_df['_fast_function_call']))*100.0).round(1)
 
-# Add '%' suffix to 'Slow calls %' and 'Fast calls %' columns
-pr_df['Slow calls %'] = pr_df['Slow calls %'].astype(str) + ' %'
-pr_df['Fast calls %'] = pr_df['Fast calls %'].astype(str) + ' %'
+# Add '%' suffix to 'CPU Usage' and 'GPU Usage' columns
+pr_df['CPU Usage'] = pr_df['CPU Usage'].astype(str) + '%'
+pr_df['GPU Usage'] = pr_df['GPU Usage'].astype(str) + '%'
 
-pr_df['Slow calls %'] = pr_df['Slow calls %'].replace('nan %', '0 %')
-pr_df['Fast calls %'] = pr_df['Fast calls %'].replace('nan %', '0 %')
-
-pr_df = pr_df[["total", "passed", "failed", "skipped", 'Slow calls %', 'Fast calls %']]
+pr_df['CPU Usage'] = pr_df['CPU Usage'].replace('nan%', '0%')
+pr_df['GPU Usage'] = pr_df['GPU Usage'].replace('nan%', '0%')
 
+pr_df = pr_df[["total", "passed", "failed", "skipped", 'CPU Usage', 'GPU Usage']]
 diff_df = diff_df[["total", "passed", "failed", "skipped"]]
 diff_df.columns = diff_df.columns + "_diff"
 diff_df["passed_diff"] = diff_df["passed_diff"].map(emoji_passed)

From d2344dcdf0752cef5ed4eb0657e62f5797fae98a Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Sun, 15 Sep 2024 12:34:47 +0000
Subject: [PATCH 814/842] test

---
 ci/cudf_pandas_scripts/pandas-tests/job-summary.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/ci/cudf_pandas_scripts/pandas-tests/job-summary.py b/ci/cudf_pandas_scripts/pandas-tests/job-summary.py
index 7e85aa0a178..9e8c29e1637 100644
--- a/ci/cudf_pandas_scripts/pandas-tests/job-summary.py
+++ b/ci/cudf_pandas_scripts/pandas-tests/job-summary.py
@@ -71,6 +71,9 @@ def emoji_failed(x):
 pr_df['CPU Usage'] = ((pr_df['_slow_function_call']/(pr_df['_slow_function_call'] + pr_df['_fast_function_call']))*100.0).round(1)
 pr_df['GPU Usage'] = ((pr_df['_fast_function_call']/(pr_df['_slow_function_call'] + pr_df['_fast_function_call']))*100.0).round(1)
 
+cpu_usage_mean = pr_df['CPU Usage'].mean()
+gpu_usage_mean = pr_df['GPU Usage'].mean()
+
 # Add '%' suffix to 'CPU Usage' and 'GPU Usage' columns
 pr_df['CPU Usage'] = pr_df['CPU Usage'].astype(str) + '%'
 pr_df['GPU Usage'] = pr_df['GPU Usage'].astype(str) + '%'
@@ -98,14 +101,13 @@ def emoji_failed(x):
         "passed_diff": "Passed delta",
         "failed_diff": "Failed delta",
         "skipped_diff": "Skipped delta",
-        # "_slow_function_call" : "Slow function calls",
-        # "_fast_function_call" : "Fast function calls",
     }
 )
 df = df.sort_values(by=["Failed tests", "Skipped tests"], ascending=False)
 
 print(comment)
 print()
+print(f"Average CPU and GPU usage for the tests: {cpu_usage_mean} % and {gpu_usage_mean} %")
 print("Here are the results of running the Pandas tests against this PR:")
 print()
 print(df.to_markdown())

From 32d3a307d550a7be1d7883e035c5a1aa87442650 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Sun, 15 Sep 2024 18:36:31 +0000
Subject: [PATCH 815/842] cleanup

---
 ci/cudf_pandas_scripts/pandas-tests/job-summary.py  | 5 +++--
 python/cudf/cudf/pandas/scripts/run-pandas-tests.sh | 5 ++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/ci/cudf_pandas_scripts/pandas-tests/job-summary.py b/ci/cudf_pandas_scripts/pandas-tests/job-summary.py
index 9e8c29e1637..6944aee98ae 100644
--- a/ci/cudf_pandas_scripts/pandas-tests/job-summary.py
+++ b/ci/cudf_pandas_scripts/pandas-tests/job-summary.py
@@ -71,8 +71,8 @@ def emoji_failed(x):
 pr_df['CPU Usage'] = ((pr_df['_slow_function_call']/(pr_df['_slow_function_call'] + pr_df['_fast_function_call']))*100.0).round(1)
 pr_df['GPU Usage'] = ((pr_df['_fast_function_call']/(pr_df['_slow_function_call'] + pr_df['_fast_function_call']))*100.0).round(1)
 
-cpu_usage_mean = pr_df['CPU Usage'].mean()
-gpu_usage_mean = pr_df['GPU Usage'].mean()
+cpu_usage_mean = pr_df['CPU Usage'].mean().round(2)
+gpu_usage_mean = pr_df['GPU Usage'].mean().round(2)
 
 # Add '%' suffix to 'CPU Usage' and 'GPU Usage' columns
 pr_df['CPU Usage'] = pr_df['CPU Usage'].astype(str) + '%'
@@ -108,6 +108,7 @@ def emoji_failed(x):
 print(comment)
 print()
 print(f"Average CPU and GPU usage for the tests: {cpu_usage_mean} % and {gpu_usage_mean} %")
+print()
 print("Here are the results of running the Pandas tests against this PR:")
 print()
 print(df.to_markdown())
diff --git a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
index 4dc3ac03cc7..b1ac2166e9f 100755
--- a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
+++ b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
@@ -134,11 +134,10 @@ TEST_THAT_CRASH_PYTEST_WORKERS="not test_bitmasks_pyarrow \
 and not test_large_string_pyarrow \
 and not test_interchange_from_corrected_buffer_dtypes \
 and not test_eof_states \
-and not test_array_tz \
-and not test_groupby_raises_category"
+and not test_array_tz"
 
 # TODO: Remove "not db" once a postgres & mysql container is set up on the CI
-PANDAS_CI="1" timeout 600m python -m pytest -p cudf.pandas \
+PANDAS_CI="1" timeout 90m python -m pytest -p cudf.pandas \
     -v -m "not single_cpu and not db" \
     -k "$TEST_THAT_NEED_MOTO_SERVER and $TEST_THAT_CRASH_PYTEST_WORKERS and not test_groupby_raises_category_on_category and not test_constructor_no_pandas_array and not test_is_monotonic_na and not test_index_contains and not test_index_contains and not test_frame_op_subclass_nonclass_constructor and not test_round_trip_current" \
     --import-mode=importlib \

From 84c58e1697b44da35c8a1d3834e8e4a9821728a1 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Mon, 16 Sep 2024 14:48:04 +0000
Subject: [PATCH 816/842] update and cleanup

---
 python/cudf/cudf/pandas/fast_slow_proxy.py    | 10 ++-
 .../cudf/pandas/scripts/run-pandas-tests.sh   |  8 --
 .../pandas/scripts/summarize-test-results.py  |  1 -
 python/cudf/cudf_pandas_tests/conftest.py     | 85 -------------------
 4 files changed, 8 insertions(+), 96 deletions(-)
 delete mode 100644 python/cudf/cudf_pandas_tests/conftest.py

diff --git a/python/cudf/cudf/pandas/fast_slow_proxy.py b/python/cudf/cudf/pandas/fast_slow_proxy.py
index 9a6a63c2d35..bf2ee6ae624 100644
--- a/python/cudf/cudf/pandas/fast_slow_proxy.py
+++ b/python/cudf/cudf/pandas/fast_slow_proxy.py
@@ -882,11 +882,17 @@ def _assert_fast_slow_eq(left, right):
 
 
 def _fast_function_call():
-    return 1
+    """
+    Placeholder fast function for pytest profiling purposes.
+    """
+    return None
 
 
 def _slow_function_call():
-    return 1
+    """
+    Placeholder slow function for pytest profiling purposes.
+    """
+    return None
 
 
 def _fast_slow_function_call(
diff --git a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
index b1ac2166e9f..416264ea04d 100755
--- a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
+++ b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
@@ -145,13 +145,5 @@ PANDAS_CI="1" timeout 90m python -m pytest -p cudf.pandas \
     "$@" || [ $? = 1 ]  # Exit success if exit code was 1 (permit test failures but not other errors)
 
 mv *.json ..
-pwd
-ls -al
-pwd
-ls -al tests/
 cd ..
-pwd
-ls -al
-pwd
-ls -al pandas-tests/
 rm -rf pandas-testing/pandas-tests/
diff --git a/python/cudf/cudf/pandas/scripts/summarize-test-results.py b/python/cudf/cudf/pandas/scripts/summarize-test-results.py
index babda8313af..53159a50909 100644
--- a/python/cudf/cudf/pandas/scripts/summarize-test-results.py
+++ b/python/cudf/cudf/pandas/scripts/summarize-test-results.py
@@ -77,7 +77,6 @@ def get_per_module_results(log_file_name):
                 per_module_results[key].get("_fast_function_call", 0)
                 + function_call_counts.get("_fast_function_call", 0)
             )
-            # value["function_call_counts"] = function_call_counts
     return per_module_results
 
 
diff --git a/python/cudf/cudf_pandas_tests/conftest.py b/python/cudf/cudf_pandas_tests/conftest.py
deleted file mode 100644
index d4dc4047df1..00000000000
--- a/python/cudf/cudf_pandas_tests/conftest.py
+++ /dev/null
@@ -1,85 +0,0 @@
-# Copyright (c) 2022-2024, NVIDIA CORPORATION.
-import json
-import multiprocessing
-import os
-import sys
-
-import pytest
-
-# Dictionary to store function call counts
-manager = multiprocessing.Manager()
-function_call_counts = manager.dict()
-
-# The specific function to track
-FUNCTION_NAME = {"_slow_function_call", "_fast_function_call"}
-
-
-def trace_calls(frame, event, arg):
-    if event != "call":
-        return
-    code = frame.f_code
-    func_name = code.co_name
-    if func_name in FUNCTION_NAME:
-        function_call_counts[func_name] = (
-            function_call_counts.get(func_name, 0) + 1
-        )
-
-
-def pytest_sessionstart(session):
-    # Set the profile function to trace calls
-    sys.setprofile(trace_calls)
-
-
-def pytest_sessionfinish(session, exitstatus):
-    # Remove the profile function
-    sys.setprofile(None)
-
-
-@pytest.hookimpl(tryfirst=True)
-def pytest_runtest_setup(item):
-    # Check if this is the first test in the file
-    if item.nodeid.split("::")[0] != getattr(
-        pytest_runtest_setup, "current_file", None
-    ):
-        # If it's a new file, reset the function call counts
-        global function_call_counts
-        function_call_counts = manager.dict()
-        pytest_runtest_setup.current_file = item.nodeid.split("::")[0]
-
-
-@pytest.hookimpl(trylast=True)
-def pytest_runtest_teardown(item, nextitem):
-    # Check if this is the last test in the file
-    if (
-        nextitem is None
-        or nextitem.nodeid.split("::")[0] != item.nodeid.split("::")[0]
-    ):
-        # Write the function call counts to a file
-        worker_id = os.getenv("PYTEST_XDIST_WORKER", "master")
-        output_file = f'{item.nodeid.split("::")[0].replace("/", "__")}_{worker_id}_metrics.json'
-        with open(output_file, "w") as f:
-            json.dump(dict(function_call_counts), f)
-            # for func, count in function_call_counts.items():
-            #     f.write(f'{func}: {count}\n')
-        print(f"Function call counts have been written to {output_file}")
-
-
-@pytest.hookimpl(tryfirst=True)
-def pytest_configure(config):
-    if hasattr(config, "workerinput"):
-        # Running in xdist worker
-        global function_call_counts
-        function_call_counts = manager.dict()
-
-
-@pytest.hookimpl(trylast=True)
-def pytest_unconfigure(config):
-    if hasattr(config, "workerinput"):
-        # Running in xdist worker
-        worker_id = config.workerinput["workerid"]
-        output_file = f"function_call_counts_worker_{worker_id}.json"
-        with open(output_file, "w") as f:
-            json.dump(dict(function_call_counts), f)
-            # for func, count in function_call_counts.items():
-            #     f.write(f'{func}: {count}\n')
-        print(f"Function call counts have been written to {output_file}")

From c4f4cbf661a5f49e65ceb5faa52c9d827986f261 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Mon, 16 Sep 2024 14:50:18 +0000
Subject: [PATCH 817/842] revert

---
 .github/workflows/pr.yaml | 318 +++++++++++++++++++-------------------
 1 file changed, 159 insertions(+), 159 deletions(-)

diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 18d1ea39d59..a4a8f036174 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -14,26 +14,26 @@ jobs:
     needs:
       - changed-files
       - checks
-      # - conda-cpp-build
-      # - conda-cpp-checks
-      # - conda-cpp-tests
-      # - conda-python-build
-      # - conda-python-cudf-tests
-      # - conda-python-other-tests
-      # - conda-java-tests
+      - conda-cpp-build
+      - conda-cpp-checks
+      - conda-cpp-tests
+      - conda-python-build
+      - conda-python-cudf-tests
+      - conda-python-other-tests
+      - conda-java-tests
       - static-configure
-      # - conda-notebook-tests
-      # - docs-build
+      - conda-notebook-tests
+      - docs-build
       - wheel-build-libcudf
       - wheel-build-pylibcudf
       - wheel-build-cudf
-      # - wheel-tests-cudf
-      # - wheel-build-cudf-polars
-      # - wheel-tests-cudf-polars
-      # - wheel-build-dask-cudf
-      # - wheel-tests-dask-cudf
-      # - devcontainer
-      # - unit-tests-cudf-pandas
+      - wheel-tests-cudf
+      - wheel-build-cudf-polars
+      - wheel-tests-cudf-polars
+      - wheel-build-dask-cudf
+      - wheel-tests-dask-cudf
+      - devcontainer
+      - unit-tests-cudf-pandas
       - pandas-tests
       - pandas-tests-diff
     secrets: inherit
@@ -107,60 +107,60 @@ jobs:
     uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@python-3.12
     with:
       enable_check_generated_files: false
-  # conda-cpp-build:
-  #   needs: checks
-  #   secrets: inherit
-  #   uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@python-3.12
-  #   with:
-  #     build_type: pull-request
-  # conda-cpp-checks:
-  #   needs: conda-cpp-build
-  #   secrets: inherit
-  #   uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@python-3.12
-  #   with:
-  #     build_type: pull-request
-  #     enable_check_symbols: true
-  # conda-cpp-tests:
-  #   needs: [conda-cpp-build, changed-files]
-  #   secrets: inherit
-  #   uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@python-3.12
-  #   if: needs.changed-files.outputs.test_cpp == 'true'
-  #   with:
-  #     build_type: pull-request
-  # conda-python-build:
-  #   needs: conda-cpp-build
-  #   secrets: inherit
-  #   uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@python-3.12
-  #   with:
-  #     build_type: pull-request
-  # conda-python-cudf-tests:
-  #   needs: [conda-python-build, changed-files]
-  #   secrets: inherit
-  #   uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12
-  #   if: needs.changed-files.outputs.test_python == 'true'
-  #   with:
-  #     build_type: pull-request
-  #     script: "ci/test_python_cudf.sh"
-  # conda-python-other-tests:
-  #   # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism
-  #   needs: [conda-python-build, changed-files]
-  #   secrets: inherit
-  #   uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12
-  #   if: needs.changed-files.outputs.test_python == 'true'
-  #   with:
-  #     build_type: pull-request
-  #     script: "ci/test_python_other.sh"
-  # conda-java-tests:
-  #   needs: [conda-cpp-build, changed-files]
-  #   secrets: inherit
-  #   uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
-  #   if: needs.changed-files.outputs.test_java == 'true'
-  #   with:
-  #     build_type: pull-request
-  #     node_type: "gpu-v100-latest-1"
-  #     arch: "amd64"
-  #     container_image: "rapidsai/ci-conda:latest"
-  #     run_script: "ci/test_java.sh"
+  conda-cpp-build:
+    needs: checks
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@python-3.12
+    with:
+      build_type: pull-request
+  conda-cpp-checks:
+    needs: conda-cpp-build
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@python-3.12
+    with:
+      build_type: pull-request
+      enable_check_symbols: true
+  conda-cpp-tests:
+    needs: [conda-cpp-build, changed-files]
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@python-3.12
+    if: needs.changed-files.outputs.test_cpp == 'true'
+    with:
+      build_type: pull-request
+  conda-python-build:
+    needs: conda-cpp-build
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@python-3.12
+    with:
+      build_type: pull-request
+  conda-python-cudf-tests:
+    needs: [conda-python-build, changed-files]
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12
+    if: needs.changed-files.outputs.test_python == 'true'
+    with:
+      build_type: pull-request
+      script: "ci/test_python_cudf.sh"
+  conda-python-other-tests:
+    # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism
+    needs: [conda-python-build, changed-files]
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12
+    if: needs.changed-files.outputs.test_python == 'true'
+    with:
+      build_type: pull-request
+      script: "ci/test_python_other.sh"
+  conda-java-tests:
+    needs: [conda-cpp-build, changed-files]
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
+    if: needs.changed-files.outputs.test_java == 'true'
+    with:
+      build_type: pull-request
+      node_type: "gpu-v100-latest-1"
+      arch: "amd64"
+      container_image: "rapidsai/ci-conda:latest"
+      run_script: "ci/test_java.sh"
   static-configure:
     needs: checks
     secrets: inherit
@@ -171,27 +171,27 @@ jobs:
       # primary static consumers (Spark) are not in conda anyway.
       container_image: "rapidsai/ci-wheel:latest"
       run_script: "ci/configure_cpp_static.sh"
-  # conda-notebook-tests:
-  #   needs: [conda-python-build, changed-files]
-  #   secrets: inherit
-  #   uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
-  #   if: needs.changed-files.outputs.test_notebooks == 'true'
-  #   with:
-  #     build_type: pull-request
-  #     node_type: "gpu-v100-latest-1"
-  #     arch: "amd64"
-  #     container_image: "rapidsai/ci-conda:latest"
-  #     run_script: "ci/test_notebooks.sh"
-  # docs-build:
-  #   needs: conda-python-build
-  #   secrets: inherit
-  #   uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
-  #   with:
-  #     build_type: pull-request
-  #     node_type: "gpu-v100-latest-1"
-  #     arch: "amd64"
-  #     container_image: "rapidsai/ci-conda:latest"
-  #     run_script: "ci/build_docs.sh"
+  conda-notebook-tests:
+    needs: [conda-python-build, changed-files]
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
+    if: needs.changed-files.outputs.test_notebooks == 'true'
+    with:
+      build_type: pull-request
+      node_type: "gpu-v100-latest-1"
+      arch: "amd64"
+      container_image: "rapidsai/ci-conda:latest"
+      run_script: "ci/test_notebooks.sh"
+  docs-build:
+    needs: conda-python-build
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
+    with:
+      build_type: pull-request
+      node_type: "gpu-v100-latest-1"
+      arch: "amd64"
+      container_image: "rapidsai/ci-conda:latest"
+      run_script: "ci/build_docs.sh"
   wheel-build-libcudf:
     needs: checks
     secrets: inherit
@@ -215,74 +215,74 @@ jobs:
     with:
       build_type: pull-request
       script: "ci/build_wheel_cudf.sh"
-  # wheel-tests-cudf:
-  #   needs: [wheel-build-cudf, changed-files]
-  #   secrets: inherit
-  #   uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
-  #   if: needs.changed-files.outputs.test_python == 'true'
-  #   with:
-  #     build_type: pull-request
-  #     script: ci/test_wheel_cudf.sh
-  # wheel-build-cudf-polars:
-  #   needs: wheel-build-pylibcudf
-  #   secrets: inherit
-  #   uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
-  #   with:
-  #     # This selects "ARCH=amd64 + the latest supported Python + CUDA".
-  #     matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
-  #     build_type: pull-request
-  #     script: "ci/build_wheel_cudf_polars.sh"
-  # wheel-tests-cudf-polars:
-  #   needs: [wheel-build-cudf-polars, changed-files]
-  #   secrets: inherit
-  #   uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
-  #   if: needs.changed-files.outputs.test_python == 'true'
-  #   with:
-  #     # This selects "ARCH=amd64 + the latest supported Python + CUDA".
-  #     matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
-  #     build_type: pull-request
-  #     # This always runs, but only fails if this PR touches code in
-  #     # pylibcudf or cudf_polars
-  #     script: "ci/test_wheel_cudf_polars.sh"
-  # wheel-build-dask-cudf:
-  #   needs: wheel-build-cudf
-  #   secrets: inherit
-  #   uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
-  #   with:
-  #     # This selects "ARCH=amd64 + the latest supported Python + CUDA".
-  #     matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
-  #     build_type: pull-request
-  #     script: "ci/build_wheel_dask_cudf.sh"
-  # wheel-tests-dask-cudf:
-  #   needs: [wheel-build-dask-cudf, changed-files]
-  #   secrets: inherit
-  #   uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
-  #   if: needs.changed-files.outputs.test_python == 'true'
-  #   with:
-  #     # This selects "ARCH=amd64 + the latest supported Python + CUDA".
-  #     matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
-  #     build_type: pull-request
-  #     script: ci/test_wheel_dask_cudf.sh
-  # devcontainer:
-  #   secrets: inherit
-  #   uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@python-3.12
-  #   with:
-  #     arch: '["amd64"]'
-  #     cuda: '["12.5"]'
-  #     build_command: |
-  #       sccache -z;
-  #       build-all -DBUILD_BENCHMARKS=ON --verbose;
-  #       sccache -s;
-  # unit-tests-cudf-pandas:
-  #   needs: [wheel-build-cudf, changed-files]
-  #   secrets: inherit
-  #   uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
-  #   if: needs.changed-files.outputs.test_python == 'true'
-  #   with:
-  #     # This selects "ARCH=amd64 + the latest supported Python + CUDA".
-  #     matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
-  #     build_type: pull-request
-  #     script: ci/cudf_pandas_scripts/run_tests.sh
+  wheel-tests-cudf:
+    needs: [wheel-build-cudf, changed-files]
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
+    if: needs.changed-files.outputs.test_python == 'true'
+    with:
+      build_type: pull-request
+      script: ci/test_wheel_cudf.sh
+  wheel-build-cudf-polars:
+    needs: wheel-build-pylibcudf
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
+    with:
+      # This selects "ARCH=amd64 + the latest supported Python + CUDA".
+      matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
+      build_type: pull-request
+      script: "ci/build_wheel_cudf_polars.sh"
+  wheel-tests-cudf-polars:
+    needs: [wheel-build-cudf-polars, changed-files]
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
+    if: needs.changed-files.outputs.test_python == 'true'
+    with:
+      # This selects "ARCH=amd64 + the latest supported Python + CUDA".
+      matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
+      build_type: pull-request
+      # This always runs, but only fails if this PR touches code in
+      # pylibcudf or cudf_polars
+      script: "ci/test_wheel_cudf_polars.sh"
+  wheel-build-dask-cudf:
+    needs: wheel-build-cudf
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
+    with:
+      # This selects "ARCH=amd64 + the latest supported Python + CUDA".
+      matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
+      build_type: pull-request
+      script: "ci/build_wheel_dask_cudf.sh"
+  wheel-tests-dask-cudf:
+    needs: [wheel-build-dask-cudf, changed-files]
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
+    if: needs.changed-files.outputs.test_python == 'true'
+    with:
+      # This selects "ARCH=amd64 + the latest supported Python + CUDA".
+      matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
+      build_type: pull-request
+      script: ci/test_wheel_dask_cudf.sh
+  devcontainer:
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@python-3.12
+    with:
+      arch: '["amd64"]'
+      cuda: '["12.5"]'
+      build_command: |
+        sccache -z;
+        build-all -DBUILD_BENCHMARKS=ON --verbose;
+        sccache -s;
+  unit-tests-cudf-pandas:
+    needs: [wheel-build-cudf, changed-files]
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
+    if: needs.changed-files.outputs.test_python == 'true'
+    with:
+      # This selects "ARCH=amd64 + the latest supported Python + CUDA".
+      matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
+      build_type: pull-request
+      script: ci/cudf_pandas_scripts/run_tests.sh
   pandas-tests:
     # run the Pandas unit tests using PR branch
     needs: [wheel-build-cudf, changed-files]

From 23545de4f8d653ff0d3251fb049db55f1fb8eeea Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Mon, 16 Sep 2024 14:51:10 +0000
Subject: [PATCH 818/842] cleanup

---
 ci/cudf_pandas_scripts/pandas-tests/job-summary.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ci/cudf_pandas_scripts/pandas-tests/job-summary.py b/ci/cudf_pandas_scripts/pandas-tests/job-summary.py
index 6944aee98ae..3a3da7dfa1d 100644
--- a/ci/cudf_pandas_scripts/pandas-tests/job-summary.py
+++ b/ci/cudf_pandas_scripts/pandas-tests/job-summary.py
@@ -107,7 +107,7 @@ def emoji_failed(x):
 
 print(comment)
 print()
-print(f"Average CPU and GPU usage for the tests: {cpu_usage_mean} % and {gpu_usage_mean} %")
+print(f"Average CPU and GPU usage for the tests: {cpu_usage_mean}% and {gpu_usage_mean}%")
 print()
 print("Here are the results of running the Pandas tests against this PR:")
 print()

From 124d3e353eeebd595da113dbef3d5bad842a791d Mon Sep 17 00:00:00 2001
From: "Richard (Rick) Zamora" <rzamora217@gmail.com>
Date: Mon, 16 Sep 2024 12:17:58 -0500
Subject: [PATCH 819/842] Migrate dask-cudf README improvements to dask-cudf
 sphinx docs (#16765)

Follow up to https://github.com/rapidsai/cudf/pull/16671

- Moves most of the information recently added to the dask-cudf README into the dask-cudf Sphinx documentation
- Adds a "Quick-start" example to the simplified dask-cudf README

Authors:
  - Richard (Rick) Zamora (https://github.com/rjzamora)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Benjamin Zaitlen (https://github.com/quasiben)
  - Peter Andreas Entschev (https://github.com/pentschev)

URL: https://github.com/rapidsai/cudf/pull/16765
---
 docs/cudf/source/user_guide/10min.ipynb |  31 ++--
 docs/dask_cudf/source/index.rst         | 210 ++++++++++++++++++------
 python/dask_cudf/README.md              | 148 +++++------------
 3 files changed, 213 insertions(+), 176 deletions(-)

diff --git a/docs/cudf/source/user_guide/10min.ipynb b/docs/cudf/source/user_guide/10min.ipynb
index 2eaa75b3189..95f5f9734dd 100644
--- a/docs/cudf/source/user_guide/10min.ipynb
+++ b/docs/cudf/source/user_guide/10min.ipynb
@@ -5,9 +5,9 @@
    "id": "4c6c548b",
    "metadata": {},
    "source": [
-    "# 10 Minutes to cuDF and Dask-cuDF\n",
+    "# 10 Minutes to cuDF and Dask cuDF\n",
     "\n",
-    "Modelled after 10 Minutes to Pandas, this is a short introduction to cuDF and Dask-cuDF, geared mainly towards new users.\n",
+    "Modelled after 10 Minutes to Pandas, this is a short introduction to cuDF and Dask cuDF, geared mainly towards new users.\n",
     "\n",
     "## What are these Libraries?\n",
     "\n",
@@ -18,13 +18,14 @@
     "[Dask cuDF](https://github.com/rapidsai/cudf/tree/main/python/dask_cudf) extends Dask where necessary to allow its DataFrame partitions to be processed using cuDF GPU DataFrames instead of Pandas DataFrames. For instance, when you call `dask_cudf.read_csv(...)`, your cluster's GPUs do the work of parsing the CSV file(s) by calling [`cudf.read_csv()`](https://docs.rapids.ai/api/cudf/stable/api_docs/api/cudf.read_csv.html).\n",
     "\n",
     "\n",
-    "> [!NOTE]  \n",
-    "> This notebook uses the explicit Dask cuDF API (`dask_cudf`) for clarity. However, we strongly recommend that you use Dask's [configuration infrastructure](https://docs.dask.org/en/latest/configuration.html) to set the `\"dataframe.backend\"` to `\"cudf\"`, and work with the `dask.dataframe` API directly. Please see the [Dask cuDF documentation](https://github.com/rapidsai/cudf/tree/main/python/dask_cudf) for more information.\n",
+    "<div class=\"alert alert-block alert-info\">\n",
+    "<b>Note:</b> This notebook uses the explicit Dask cuDF API (dask_cudf) for clarity. However, we strongly recommend that you use Dask's <a href=\"https://docs.dask.org/en/latest/configuration.html\">configuration infrastructure</a> to set the \"dataframe.backend\" option to \"cudf\", and work with the Dask DataFrame API directly. Please see the <a href=\"https://github.com/rapidsai/cudf/tree/main/python/dask_cudf\">Dask cuDF documentation</a> for more information.\n",
+    "</div>\n",
     "\n",
     "\n",
-    "## When to use cuDF and Dask-cuDF\n",
+    "## When to use cuDF and Dask cuDF\n",
     "\n",
-    "If your workflow is fast enough on a single GPU or your data comfortably fits in memory on a single GPU, you would want to use cuDF. If you want to distribute your workflow across multiple GPUs, have more data than you can fit in memory on a single GPU, or want to analyze data spread across many files at once, you would want to use Dask-cuDF."
+    "If your workflow is fast enough on a single GPU or your data comfortably fits in memory on a single GPU, you would want to use cuDF. If you want to distribute your workflow across multiple GPUs, have more data than you can fit in memory on a single GPU, or want to analyze data spread across many files at once, you would want to use Dask cuDF."
    ]
   },
   {
@@ -115,7 +116,7 @@
    "source": [
     "ds = dask_cudf.from_cudf(s, npartitions=2)\n",
     "# Note the call to head here to show the first few entries, unlike\n",
-    "# cuDF objects, dask-cuDF objects do not have a printing\n",
+    "# cuDF objects, Dask-cuDF objects do not have a printing\n",
     "# representation that shows values since they may not be in local\n",
     "# memory.\n",
     "ds.head(n=3)"
@@ -331,11 +332,11 @@
    "id": "b17db919",
    "metadata": {},
    "source": [
-    "Now we will convert our cuDF dataframe into a dask-cuDF equivalent. Here we call out a key difference: to inspect the data we must call a method (here `.head()` to look at the first few values). In the general case (see the end of this notebook), the data in `ddf` will be distributed across multiple GPUs.\n",
+    "Now we will convert our cuDF dataframe into a Dask-cuDF equivalent. Here we call out a key difference: to inspect the data we must call a method (here `.head()` to look at the first few values). In the general case (see the end of this notebook), the data in `ddf` will be distributed across multiple GPUs.\n",
     "\n",
-    "In this small case, we could call `ddf.compute()` to obtain a cuDF object from the dask-cuDF object. In general, we should avoid calling `.compute()` on large dataframes, and restrict ourselves to using it when we have some (relatively) small postprocessed result that we wish to inspect. Hence, throughout this notebook we will generally call `.head()` to inspect the first few values of a dask-cuDF dataframe, occasionally calling out places where we use `.compute()` and why.\n",
+    "In this small case, we could call `ddf.compute()` to obtain a cuDF object from the Dask-cuDF object. In general, we should avoid calling `.compute()` on large dataframes, and restrict ourselves to using it when we have some (relatively) small postprocessed result that we wish to inspect. Hence, throughout this notebook we will generally call `.head()` to inspect the first few values of a Dask-cuDF dataframe, occasionally calling out places where we use `.compute()` and why.\n",
     "\n",
-    "*To understand more of the differences between how cuDF and dask-cuDF behave here, visit the [10 Minutes to Dask](https://docs.dask.org/en/stable/10-minutes-to-dask.html) tutorial after this one.*"
+    "*To understand more of the differences between how cuDF and Dask cuDF behave here, visit the [10 Minutes to Dask](https://docs.dask.org/en/stable/10-minutes-to-dask.html) tutorial after this one.*"
    ]
   },
   {
@@ -1680,7 +1681,7 @@
    "id": "7aa0089f",
    "metadata": {},
    "source": [
-    "Note here we call `compute()` rather than `head()` on the dask-cuDF dataframe since we are happy that the number of matching rows will be small (and hence it is reasonable to bring the entire result back)."
+    "Note here we call `compute()` rather than `head()` on the Dask-cuDF dataframe since we are happy that the number of matching rows will be small (and hence it is reasonable to bring the entire result back)."
    ]
   },
   {
@@ -2393,7 +2394,7 @@
    "id": "f6094cbe",
    "metadata": {},
    "source": [
-    "Applying functions to a `Series`. Note that applying user defined functions directly with Dask-cuDF is not yet implemented. For now, you can use [map_partitions](http://docs.dask.org/en/stable/generated/dask.dataframe.DataFrame.map_partitions.html) to apply a function to each partition of the distributed dataframe."
+    "Applying functions to a `Series`. Note that applying user defined functions directly with Dask cuDF is not yet implemented. For now, you can use [map_partitions](http://docs.dask.org/en/stable/generated/dask.dataframe.DataFrame.map_partitions.html) to apply a function to each partition of the distributed dataframe."
    ]
   },
   {
@@ -3492,7 +3493,7 @@
    "id": "5ac3b004",
    "metadata": {},
    "source": [
-    "Transposing a dataframe, using either the `transpose` method or `T` property. Currently, all columns must have the same type. Transposing is not currently implemented in Dask-cuDF."
+    "Transposing a dataframe, using either the `transpose` method or `T` property. Currently, all columns must have the same type. Transposing is not currently implemented in Dask cuDF."
    ]
   },
   {
@@ -4181,7 +4182,7 @@
    "id": "aa8a445b",
    "metadata": {},
    "source": [
-    "To convert the first few entries to pandas, we similarly call `.head()` on the dask-cuDF dataframe to obtain a local cuDF dataframe, which we can then convert."
+    "To convert the first few entries to pandas, we similarly call `.head()` on the Dask-cuDF dataframe to obtain a local cuDF dataframe, which we can then convert."
    ]
   },
   {
@@ -4899,7 +4900,7 @@
    "id": "787eae14",
    "metadata": {},
    "source": [
-    "Note that for the dask-cuDF case, we use `dask_cudf.read_csv` in preference to `dask_cudf.from_cudf(cudf.read_csv)` since the former can parallelize across multiple GPUs and handle larger CSV files that would fit in memory on a single GPU."
+    "Note that for the Dask-cuDF case, we use `dask_cudf.read_csv` in preference to `dask_cudf.from_cudf(cudf.read_csv)` since the former can parallelize across multiple GPUs and handle larger CSV files that would fit in memory on a single GPU."
    ]
   },
   {
diff --git a/docs/dask_cudf/source/index.rst b/docs/dask_cudf/source/index.rst
index 9a216690384..7fe6cbd45fa 100644
--- a/docs/dask_cudf/source/index.rst
+++ b/docs/dask_cudf/source/index.rst
@@ -3,39 +3,42 @@
    You can adapt this file completely to your liking, but it should at least
    contain the root `toctree` directive.
 
-Welcome to dask-cudf's documentation!
+Welcome to Dask cuDF's documentation!
 =====================================
 
-**Dask-cuDF** (pronounced "DASK KOO-dee-eff") is an extension
+**Dask cuDF** (pronounced "DASK KOO-dee-eff") is an extension
 library for the `Dask <https://dask.org>`__ parallel computing
-framework that provides a `cuDF
-<https://docs.rapids.ai/api/cudf/stable/>`__-backed distributed
-dataframe with the same API as `Dask dataframes
-<https://docs.dask.org/en/stable/dataframe.html>`__.
+framework. When installed, Dask cuDF is automatically registered
+as the ``"cudf"`` dataframe backend for
+`Dask DataFrame <https://docs.dask.org/en/stable/dataframe.html>`__.
+
+.. note::
+  Neither Dask cuDF nor Dask DataFrame provide support for multi-GPU
+  or multi-node execution on their own. You must also deploy a
+  `dask.distributed <https://distributed.dask.org/en/stable/>` cluster
+  to leverage multiple GPUs. We strongly recommend using `Dask-CUDA
+  <https://docs.rapids.ai/api/dask-cuda/stable/>`__ to simplify the
+  setup of the cluster, taking advantage of all features of the GPU
+  and networking hardware.
 
 If you are familiar with Dask and `pandas <pandas.pydata.org>`__ or
-`cuDF <https://docs.rapids.ai/api/cudf/stable/>`__, then Dask-cuDF
+`cuDF <https://docs.rapids.ai/api/cudf/stable/>`__, then Dask cuDF
 should feel familiar to you. If not, we recommend starting with `10
 minutes to Dask
 <https://docs.dask.org/en/stable/10-minutes-to-dask.html>`__ followed
-by `10 minutes to cuDF and Dask-cuDF
+by `10 minutes to cuDF and Dask cuDF
 <https://docs.rapids.ai/api/cudf/stable/user_guide/10min.html>`__.
 
-When running on multi-GPU systems, `Dask-CUDA
-<https://docs.rapids.ai/api/dask-cuda/stable/>`__ is recommended to
-simplify the setup of the cluster, taking advantage of all features of
-the GPU and networking hardware.
 
-Using Dask-cuDF
+Using Dask cuDF
 ---------------
 
-When installed, Dask-cuDF registers itself as a dataframe backend for
-Dask. This means that in many cases, using cuDF-backed dataframes requires
-only small changes to an existing workflow. The minimal change is to
-select cuDF as the dataframe backend in :doc:`Dask's
-configuration <dask:configuration>`. To do so, we must set the option
-``dataframe.backend`` to ``cudf``. From Python, this can be achieved
-like so::
+The Dask DataFrame API (Recommended)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Simply use the `Dask configuration <dask:configuration>` system to
+set the ``"dataframe.backend"`` option to ``"cudf"``. From Python,
+this can be achieved like so::
 
   import dask
 
@@ -44,52 +47,157 @@ like so::
 Alternatively, you can set ``DASK_DATAFRAME__BACKEND=cudf`` in the
 environment before running your code.
 
-Dataframe creation from on-disk formats
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-If your workflow creates Dask dataframes from on-disk formats
-(for example using :func:`dask.dataframe.read_parquet`), then setting
-the backend may well be enough to migrate your workflow.
-
-For example, consider reading a dataframe from parquet::
+Once this is done, the public Dask DataFrame API will leverage
+``cudf`` automatically when a new DataFrame collection is created
+from an on-disk format using any of the following ``dask.dataframe``
+functions::
 
-   import dask.dataframe as dd
+* :func:`dask.dataframe.read_parquet`
+* :func:`dask.dataframe.read_json`
+* :func:`dask.dataframe.read_csv`
+* :func:`dask.dataframe.read_orc`
+* :func:`dask.dataframe.read_hdf`
+* :func:`dask.dataframe.from_dict`
 
-   # By default, we obtain a pandas-backed dataframe
-   df = dd.read_parquet("data.parquet", ...)
+For example::
 
+  import dask.dataframe as dd
 
-To obtain a cuDF-backed dataframe, we must set the
-``dataframe.backend`` configuration option::
+  # By default, we obtain a pandas-backed dataframe
+  df = dd.read_parquet("data.parquet", ...)
 
   import dask
-  import dask.dataframe as dd
 
   dask.config.set({"dataframe.backend": "cudf"})
-  # This gives us a cuDF-backed dataframe
+  # This now gives us a cuDF-backed dataframe
   df = dd.read_parquet("data.parquet", ...)
 
-This code will use cuDF's GPU-accelerated :func:`parquet reader
-<cudf.read_parquet>` to read partitions of the data.
+When other functions are used to create a new collection
+(e.g. :func:`from_map`, :func:`from_pandas`, :func:`from_delayed`,
+and :func:`from_array`), the backend of the new collection will
+depend on the inputs to those functions. For example::
+
+  import pandas as pd
+  import cudf
+
+  # This gives us a pandas-backed dataframe
+  dd.from_pandas(pd.DataFrame({"a": range(10)}))
+
+  # This gives us a cuDF-backed dataframe
+  dd.from_pandas(cudf.DataFrame({"a": range(10)}))
+
+An existing collection can always be moved to a specific backend
+using the :func:`dask.dataframe.DataFrame.to_backend` API::
+
+  # This ensures that we have a cuDF-backed dataframe
+  df = df.to_backend("cudf")
+
+  # This ensures that we have a pandas-backed dataframe
+  df = df.to_backend("pandas")
+
+The explicit Dask cuDF API
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In addition to providing the ``"cudf"`` backend for Dask DataFrame,
+Dask cuDF also provides an explicit ``dask_cudf`` API::
+
+  import dask_cudf
+
+  # This always gives us a cuDF-backed dataframe
+  df = dask_cudf.read_parquet("data.parquet", ...)
+
+This API is used implicitly by the Dask DataFrame API when the ``"cudf"``
+backend is enabled. Therefore, using it directly will not provide any
+performance benefit over the CPU/GPU-portable ``dask.dataframe`` API.
+Also, using some parts of the explicit API are incompatible with
+automatic query planning (see the next section).
+
+The explicit Dask cuDF API
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Dask cuDF now provides automatic query planning by default (RAPIDS 24.06+).
+As long as the ``"dataframe.query-planning"`` configuration is set to
+``True`` (the default) when ``dask.dataframe`` is first imported, `Dask
+Expressions <https://github.com/dask/dask-expr>`__ will be used under the hood.
+
+For example, the following code will automatically benefit from predicate
+pushdown when the result is computed::
+
+  df = dd.read_parquet("/my/parquet/dataset/")
+  result = df.sort_values('B')['A']
+
+Unoptimized expression graph (``df.pprint()``)::
+
+  Projection: columns='A'
+    SortValues: by=['B'] shuffle_method='tasks' options={}
+      ReadParquetFSSpec: path='/my/parquet/dataset/' ...
+
+Simplified expression graph (``df.simplify().pprint()``)::
+
+  Projection: columns='A'
+    SortValues: by=['B'] shuffle_method='tasks' options={}
+      ReadParquetFSSpec: path='/my/parquet/dataset/' columns=['A', 'B'] ...
+
+.. note::
+  Dask will automatically simplify the expression graph (within
+  :func:`optimize`) when the result is converted to a task graph
+  (via :func:`compute` or :func:`persist`). You do not need to call
+  :func:`simplify` yourself.
+
+
+Using Multiple GPUs and Multiple Nodes
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Whenever possible, Dask cuDF (i.e. Dask DataFrame) will automatically try
+to partition your data into small-enough tasks to fit comfortably in the
+memory of a single GPU. This means the necessary compute tasks needed to
+compute a query can often be streamed to a single GPU process for
+out-of-core computing. This also means that the compute tasks can be
+executed in parallel over a multi-GPU cluster.
+
+In order to execute your Dask workflow on multiple GPUs, you will
+typically need to use `Dask-CUDA <https://docs.rapids.ai/api/dask-cuda/stable/>`__
+to deploy distributed Dask cluster, and
+`Distributed <https://distributed.dask.org/en/stable/client.html>`__
+to define a client object. For example::
+
+  from dask_cuda import LocalCUDACluster
+  from distributed import Client
+
+  if __name__ == "__main__":
+
+    client = Client(
+      LocalCUDACluster(
+        CUDA_VISIBLE_DEVICES="0,1",  # Use two workers (on devices 0 and 1)
+        rmm_pool_size=0.9,  # Use 90% of GPU memory as a pool for faster allocations
+        enable_cudf_spill=True,  # Improve device memory stability
+        local_directory="/fast/scratch/",  # Use fast local storage for spilling
+      )
+    )
+
+    df = dd.read_parquet("/my/parquet/dataset/")
+    agg = df.groupby('B').sum()
+    agg.compute()  # This will use the cluster defined above
+
+.. note::
+  This example uses :func:`compute` to materialize a concrete
+  ``cudf.DataFrame`` object in local memory. Never call :func:`compute`
+  on a large collection that cannot fit comfortably in the memory of a
+  single GPU! See Dask's `documentation on managing computation
+  <https://distributed.dask.org/en/stable/manage-computation.html>`__
+  for more details.
 
-Dataframe creation from in-memory formats
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Please see the `Dask-CUDA <https://docs.rapids.ai/api/dask-cuda/stable/>`__
+documentation for more information about deploying GPU-aware clusters
+(including `best practices
+<https://docs.rapids.ai/api/dask-cuda/stable/examples/best-practices/>`__).
 
-If you already have a dataframe in memory and want to convert it to a
-cuDF-backend one, there are two options depending on whether the
-dataframe is already a Dask one or not. If you have a Dask dataframe,
-then you can call :func:`dask.dataframe.to_backend` passing ``"cudf"``
-as the backend; if you have a pandas dataframe then you can either
-call :func:`dask.dataframe.from_pandas` followed by
-:func:`~dask.dataframe.to_backend` or first convert the dataframe with
-:func:`cudf.from_pandas` and then parallelise this with
-:func:`dask_cudf.from_cudf`.
 
 API Reference
 -------------
 
-Generally speaking, Dask-cuDF tries to offer exactly the same API as
-Dask itself. There are, however, some minor differences mostly because
+Generally speaking, Dask cuDF tries to offer exactly the same API as
+Dask DataFrame. There are, however, some minor differences mostly because
 cuDF does not :doc:`perfectly mirror <cudf:user_guide/PandasCompat>`
 the pandas API, or because cuDF provides additional configuration
 flags (these mostly occur in data reading and writing interfaces).
@@ -97,7 +205,7 @@ flags (these mostly occur in data reading and writing interfaces).
 As a result, straightforward workflows can be migrated without too
 much trouble, but more complex ones that utilise more features may
 need a bit of tweaking. The API documentation describes details of the
-differences and all functionality that Dask-cuDF supports.
+differences and all functionality that Dask cuDF supports.
 
 .. toctree::
    :maxdepth: 2
diff --git a/python/dask_cudf/README.md b/python/dask_cudf/README.md
index 6edb9f87d48..4655d2165f0 100644
--- a/python/dask_cudf/README.md
+++ b/python/dask_cudf/README.md
@@ -1,135 +1,63 @@
 # <div align="left"><img src="../../img/rapids_logo.png" width="90px"/>&nbsp;Dask cuDF - A GPU Backend for Dask DataFrame</div>
 
-Dask cuDF (a.k.a. dask-cudf or `dask_cudf`) is an extension library for [Dask DataFrame](https://docs.dask.org/en/stable/dataframe.html). When installed, Dask cuDF is automatically registered as the `"cudf"` [dataframe backend](https://docs.dask.org/en/stable/how-to/selecting-the-collection-backend.html) for Dask DataFrame.
-
-## Using Dask cuDF
-
-### The Dask DataFrame API (Recommended)
-
-Simply set the `"dataframe.backend"` [configuration](https://docs.dask.org/en/stable/configuration.html) to `"cudf"` in Dask, and the public Dask DataFrame API will leverage `cudf` automatically:
-
-```python
-import dask
-dask.config.set({"dataframe.backend": "cudf"})
-
-import dask.dataframe as dd
-# This gives us a cuDF-backed dataframe
-df = dd.read_parquet("data.parquet", ...)
-```
+Dask cuDF (a.k.a. dask-cudf or `dask_cudf`) is an extension library for [Dask DataFrame](https://docs.dask.org/en/stable/dataframe.html) that provides a Pandas-like API for parallel and larger-than-memory DataFrame computing on GPUs. When installed, Dask cuDF is automatically registered as the `"cudf"` [dataframe backend](https://docs.dask.org/en/stable/how-to/selecting-the-collection-backend.html) for Dask DataFrame.
 
 > [!IMPORTANT]
-> The `"dataframe.backend"` configuration will only be used for collection creation when the following APIs are used: `read_parquet`, `read_json`, `read_csv`, `read_orc`, `read_hdf`, and `from_dict`. For example, if `from_map`, `from_pandas`, `from_delayed`, or `from_array` are used, the backend of the new collection will depend on the input to the function:
-
-```python
-import pandas as pd
-import cudf
-
-# This gives us a Pandas-backed dataframe
-dd.from_pandas(pd.DataFrame({"a": range(10)}))
-
-# This gives us a cuDF-backed dataframe
-dd.from_pandas(cudf.DataFrame({"a": range(10)}))
-```
-
-A cuDF-backed DataFrame collection can be moved to the `"pandas"` backend:
-
-```python
-df = df.to_backend("pandas")
-```
-
-Similarly, a Pandas-backed DataFrame collection can be moved to the `"cudf"` backend:
-
-```python
-df = df.to_backend("cudf")
-```
-
-### The Explicit Dask cuDF API
-
-In addition to providing the `"cudf"` backend for Dask DataFrame, Dask cuDF also provides an explicit `dask_cudf` API:
-
-```python
-import dask_cudf
-
-# This always gives us a cuDF-backed dataframe
-df = dask_cudf.read_parquet("data.parquet", ...)
-```
-
-> [!NOTE]
-> This API is used implicitly by the Dask DataFrame API when the `"cudf"` backend is enabled. Therefore, using it directly will not provide any performance benefit over the CPU/GPU-portable `dask.dataframe` API. Also, using some parts of the explicit API are incompatible with automatic query planning (see the next section).
+> Dask cuDF does not provide support for multi-GPU or multi-node execution on its own. You must also deploy a distributed cluster (ideally with [Dask-CUDA](https://docs.rapids.ai/api/dask-cuda/stable/)) to leverage multiple GPUs efficiently.
 
-See the [Dask cuDF's API documentation](https://docs.rapids.ai/api/dask-cudf/stable/) for further information.
-
-## Query Planning
-
-Dask cuDF now provides automatic query planning by default (RAPIDS 24.06+). As long as the `"dataframe.query-planning"` configuration is set to `True` (the default) when `dask.dataframe` is first imported, [Dask Expressions](https://github.com/dask/dask-expr) will be used under the hood.
-
-For example, the following user code will automatically benefit from predicate pushdown when the result is computed.
-
-```python
-df = dd.read_parquet("/my/parquet/dataset/")
-result = df.sort_values('B')['A']
-```
-
-Unoptimized expression graph (`df.pprint()`):
-```
-Projection: columns='A'
-  SortValues: by=['B'] shuffle_method='tasks' options={}
-    ReadParquetFSSpec: path='/my/parquet/dataset/' ...
-```
+## Using Dask cuDF
 
-Simplified expression graph (`df.simplify().pprint()`):
-```
-Projection: columns='A'
-  SortValues: by=['B'] shuffle_method='tasks' options={}
-    ReadParquetFSSpec: path='/my/parquet/dataset/' columns=['A', 'B'] ...
-```
+Please visit [the official documentation page](https://docs.rapids.ai/api/dask-cudf/stable/) for detailed information about using Dask cuDF.
 
-> [!NOTE]
-> Dask will automatically simplify the expression graph (within `optimize`) when the result is converted to a task graph (via `compute` or `persist`). The user does not need to call `simplify` themself.
+## Installation
 
+See the [RAPIDS install page](https://docs.rapids.ai/install) for the most up-to-date information and commands for installing Dask cuDF and other RAPIDS packages.
 
-## Using Multiple GPUs and Multiple Nodes
+## Resources
 
-Whenever possible, Dask cuDF (i.e. Dask DataFrame) will automatically try to partition your data into small-enough tasks to fit comfortably in the memory of a single GPU. This means the necessary compute tasks needed to compute a query can often be streamed to a single GPU process for out-of-core computing. This also means that the compute tasks can be executed in parallel over a multi-GPU cluster.
+- [Dask cuDF documentation](https://docs.rapids.ai/api/dask-cudf/stable/)
+- [cuDF documentation](https://docs.rapids.ai/api/cudf/stable/)
+- [10 Minutes to cuDF and Dask cuDF](https://docs.rapids.ai/api/cudf/stable/user_guide/10min/)
+- [Dask-CUDA documentation](https://docs.rapids.ai/api/dask-cuda/stable/)
+- [Deployment](https://docs.rapids.ai/deployment/stable/)
+- [RAPIDS Community](https://rapids.ai/learn-more/#get-involved): Get help, contribute, and collaborate.
 
-> [!IMPORTANT]
-> Neither Dask cuDF nor Dask DataFrame provide support for multi-GPU or multi-node execution on their own. You must deploy a distributed cluster (ideally with [Dask CUDA](https://docs.rapids.ai/api/dask-cuda/stable/)) to leverage multiple GPUs.
+### Quick-start example
 
-In order to execute your Dask workflow on multiple GPUs, you will typically need to use [Dask CUDA](https://docs.rapids.ai/api/dask-cuda/stable/) to deploy distributed Dask cluster, and [Distributed](https://distributed.dask.org/en/stable/client.html) to define a `client` object. For example:
+A very common Dask cuDF use case is single-node multi-GPU data processing. These workflows typically use the following pattern:
 
 ```python
-
+import dask
+import dask.dataframe as dd
 from dask_cuda import LocalCUDACluster
 from distributed import Client
 
-client = Client(
+if __name__ == "__main__":
+
+  # Define a GPU-aware cluster to leverage multiple GPUs
+  client = Client(
     LocalCUDACluster(
-        CUDA_VISIBLE_DEVICES="0,1",  # Use two workers (on devices 0 and 1)
-        rmm_pool_size=0.9,  # Use 90% of GPU memory as a pool for faster allocations
-        enable_cudf_spill=True,  # Improve device memory stability
-        local_directory="/fast/scratch/",  # Use fast local storage for spilling
+      CUDA_VISIBLE_DEVICES="0,1",  # Use two workers (on devices 0 and 1)
+      rmm_pool_size=0.9,  # Use 90% of GPU memory as a pool for faster allocations
+      enable_cudf_spill=True,  # Improve device memory stability
+      local_directory="/fast/scratch/",  # Use fast local storage for spilling
     )
-)
+  )
 
-df = dd.read_parquet("/my/parquet/dataset/")
-agg = df.groupby('B').sum()
-agg.compute()  # This will use the cluster defined above
-```
+  # Set the default dataframe backend to "cudf"
+  dask.config.set({"dataframe.backend": "cudf"})
 
-> [!NOTE]
-> This example uses `compute` to materialize a concrete `cudf.DataFrame` object in local memory. Never call `compute` on a large collection that cannot fit comfortably in the memory of a single GPU! See Dask's [documentation on managing computation](https://distributed.dask.org/en/stable/manage-computation.html) for more details.
+  # Create your DataFrame collection from on-disk
+  # or in-memory data
+  df = dd.read_parquet("/my/parquet/dataset/")
 
-Please see the [Dask CUDA](https://docs.rapids.ai/api/dask-cuda/stable/) documentation for more information about deploying GPU-aware clusters (including [best practices](https://docs.rapids.ai/api/dask-cuda/stable/examples/best-practices/)).
+  # Use cudf-like syntax to transform and/or query your data
+  query = df.groupby('item')['price'].mean()
 
-## Install
-
-See the [RAPIDS install page](https://docs.rapids.ai/install) for the most up-to-date information and commands for installing Dask cuDF and other RAPIDS packages.
+  # Compute, persist, or write out the result
+  query.head()
+```
 
-## Resources
+If you do not have multiple GPUs available, using `LocalCUDACluster` is optional. However, it is still a good idea to [enable cuDF spilling](https://docs.rapids.ai/api/cudf/stable/developer_guide/library_design/#spilling-to-host-memory).
 
-- [Dask cuDF API documentation](https://docs.rapids.ai/api/dask-cudf/stable/)
-- [cuDF API documentation](https://docs.rapids.ai/api/cudf/stable/)
-- [10 Minutes to cuDF and Dask cuDF](https://docs.rapids.ai/api/cudf/stable/user_guide/10min/)
-- [Dask CUDA documentation](https://docs.rapids.ai/api/dask-cuda/stable/)
-- [Deployment](https://docs.rapids.ai/deployment/stable/)
-- [RAPIDS Community](https://rapids.ai/learn-more/#get-involved): Get help, contribute, and collaborate.
+If you wish to scale across multiple nodes, you will need to use a different mechanism to deploy your Dask-CUDA workers. Please see [the RAPIDS deployment documentation](https://docs.rapids.ai/deployment/stable/) for more instructions.

From 40333854b5efadb5b482ec80663b837680af1598 Mon Sep 17 00:00:00 2001
From: Jason Lowe <jlowe@nvidia.com>
Date: Mon, 16 Sep 2024 17:04:47 -0500
Subject: [PATCH 820/842] Java: Make
 ColumnVector.fromViewWithContiguousAllocation public (#16784)

Exposes ColumnVector's fromViewWithContiguousAllocation method so code outside of cudf that builds contiguous table views can expose those columns in Java.

Authors:
  - Jason Lowe (https://github.com/jlowe)

Approvers:
  - Alessandro Bellina (https://github.com/abellina)

URL: https://github.com/rapidsai/cudf/pull/16784
---
 java/src/main/java/ai/rapids/cudf/ColumnVector.java | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/java/src/main/java/ai/rapids/cudf/ColumnVector.java b/java/src/main/java/ai/rapids/cudf/ColumnVector.java
index 5a0fbd224ad..6a0f0f6f169 100644
--- a/java/src/main/java/ai/rapids/cudf/ColumnVector.java
+++ b/java/src/main/java/ai/rapids/cudf/ColumnVector.java
@@ -218,7 +218,13 @@ static long initViewHandle(DType type, int numRows, int nullCount,
         od, vd, nullCount, numRows, childHandles);
   }
 
-  static ColumnVector fromViewWithContiguousAllocation(long columnViewAddress, DeviceMemoryBuffer buffer) {
+  /**
+   * Creates a ColumnVector from a native column_view using a contiguous device allocation.
+   *
+   * @param columnViewAddress address of the native column_view
+   * @param buffer device buffer containing the data referenced by the column view
+   */
+  public static ColumnVector fromViewWithContiguousAllocation(long columnViewAddress, DeviceMemoryBuffer buffer) {
     return new ColumnVector(columnViewAddress, buffer);
   }
 

From 86861e08d9f7b1ae0a61d6b05bbfc6690107ca0f Mon Sep 17 00:00:00 2001
From: "Richard (Rick) Zamora" <rzamora217@gmail.com>
Date: Mon, 16 Sep 2024 19:14:18 -0500
Subject: [PATCH 821/842] Fix `cov`/`corr` bug in dask-cudf (#16786)

Closes https://github.com/rapidsai/cudf/issues/14935

Overrides `_prepare_cov_corr` method to avoid cudf compatibility issues in dask-cudf.

Authors:
  - Richard (Rick) Zamora (https://github.com/rjzamora)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/16786
---
 python/dask_cudf/dask_cudf/expr/_collection.py | 18 +++++++++++++++++-
 python/dask_cudf/dask_cudf/tests/test_core.py  | 17 +++++++++++++++++
 2 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/python/dask_cudf/dask_cudf/expr/_collection.py b/python/dask_cudf/dask_cudf/expr/_collection.py
index f60e4ff81ef..97e1dffc65b 100644
--- a/python/dask_cudf/dask_cudf/expr/_collection.py
+++ b/python/dask_cudf/dask_cudf/expr/_collection.py
@@ -49,8 +49,24 @@ def to_dask_dataframe(self, **kwargs):
 
         return self.to_backend("pandas", **kwargs)
 
+    def _prepare_cov_corr(self, min_periods, numeric_only):
+        # Upstream version of this method sets min_periods
+        # to 2 by default (which is not supported by cudf)
+        # TODO: Remove when cudf supports both min_periods
+        # and numeric_only
+        # See: https://github.com/rapidsai/cudf/issues/12626
+        # See: https://github.com/rapidsai/cudf/issues/9009
+        self._meta.cov(min_periods=min_periods)
+
+        frame = self
+        if numeric_only:
+            numerics = self._meta._get_numeric_data()
+            if len(numerics.columns) != len(self.columns):
+                frame = frame[list(numerics.columns)]
+        return frame, min_periods
+
     # var can be removed if cudf#15179 is addressed.
-    # See: https://github.com/rapidsai/cudf/issues/15179
+    # See: https://github.com/rapidsai/cudf/issues/14935
     def var(
         self,
         axis=0,
diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py
index 905d8c08135..7aa0f6320f2 100644
--- a/python/dask_cudf/dask_cudf/tests/test_core.py
+++ b/python/dask_cudf/dask_cudf/tests/test_core.py
@@ -1007,3 +1007,20 @@ def test_to_backend_simplify():
         df2 = df.to_backend("cudf")[["y"]].simplify()
         df3 = df[["y"]].to_backend("cudf").to_backend("cudf").simplify()
         assert df2._name == df3._name
+
+
+@pytest.mark.parametrize("numeric_only", [True, False])
+@pytest.mark.parametrize("op", ["corr", "cov"])
+def test_cov_corr(op, numeric_only):
+    df = cudf.DataFrame.from_dict(
+        {
+            "x": np.random.randint(0, 5, size=10),
+            "y": np.random.normal(size=10),
+        }
+    )
+    ddf = dd.from_pandas(df, npartitions=2)
+    res = getattr(ddf, op)(numeric_only=numeric_only)
+    # Use to_pandas until cudf supports numeric_only
+    # (See: https://github.com/rapidsai/cudf/issues/12626)
+    expect = getattr(df.to_pandas(), op)(numeric_only=numeric_only)
+    dd.assert_eq(res, expect)

From f8d50639fffb541dee3b860c19756af2c4a5a850 Mon Sep 17 00:00:00 2001
From: Paul Mattione <156858817+pmattione-nvidia@users.noreply.github.com>
Date: Mon, 16 Sep 2024 21:27:38 -0400
Subject: [PATCH 822/842] Add ability to set parquet row group max #rows and
 #bytes in java (#16805)

Adds the ability to set the max # rows per row group and max # bytes per row group for parquet files.

Authors:
  - Paul Mattione (https://github.com/pmattione-nvidia)

Approvers:
  - Robert (Bobby) Evans (https://github.com/revans2)
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/16805
---
 .../ai/rapids/cudf/ParquetWriterOptions.java  | 26 ++++++-
 java/src/main/java/ai/rapids/cudf/Table.java  | 68 +++++++++++--------
 java/src/main/native/src/TableJni.cpp         |  8 +++
 .../test/java/ai/rapids/cudf/TableTest.java   |  8 ++-
 4 files changed, 80 insertions(+), 30 deletions(-)

diff --git a/java/src/main/java/ai/rapids/cudf/ParquetWriterOptions.java b/java/src/main/java/ai/rapids/cudf/ParquetWriterOptions.java
index 7b58817550d..8c8180436e6 100644
--- a/java/src/main/java/ai/rapids/cudf/ParquetWriterOptions.java
+++ b/java/src/main/java/ai/rapids/cudf/ParquetWriterOptions.java
@@ -1,6 +1,6 @@
 /*
  *
- *  Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ *  Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -24,9 +24,13 @@
  */
 public final class ParquetWriterOptions extends CompressionMetadataWriterOptions {
   private final StatisticsFrequency statsGranularity;
+  private int rowGroupSizeRows;
+  private long rowGroupSizeBytes;
 
   private ParquetWriterOptions(Builder builder) {
     super(builder);
+    this.rowGroupSizeRows = builder.rowGroupSizeRows;
+    this.rowGroupSizeBytes = builder.rowGroupSizeBytes;
     this.statsGranularity = builder.statsGranularity;
   }
 
@@ -51,18 +55,38 @@ public static Builder builder() {
     return new Builder();
   }
 
+  public int getRowGroupSizeRows() {
+    return rowGroupSizeRows;
+  }
+
+  public long getRowGroupSizeBytes() {
+    return rowGroupSizeBytes;
+  }
+
   public StatisticsFrequency getStatisticsFrequency() {
     return statsGranularity;
   }
 
   public static class Builder extends CompressionMetadataWriterOptions.Builder
         <Builder, ParquetWriterOptions> {
+    private int rowGroupSizeRows = 1000000; //Max of 1 million rows per row group
+    private long rowGroupSizeBytes = 128 * 1024 * 1024; //Max of 128MB per row group
     private StatisticsFrequency statsGranularity = StatisticsFrequency.ROWGROUP;
 
     public Builder() {
       super();
     }
 
+    public Builder withRowGroupSizeRows(int rowGroupSizeRows) {
+      this.rowGroupSizeRows = rowGroupSizeRows;
+      return this;
+    }
+
+    public Builder withRowGroupSizeBytes(long rowGroupSizeBytes) {
+      this.rowGroupSizeBytes = rowGroupSizeBytes;
+      return this;
+    }
+
     public Builder withStatisticsFrequency(StatisticsFrequency statsGranularity) {
       this.statsGranularity = statsGranularity;
       return this;
diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java
index cbb126d7ee5..09da43374ae 100644
--- a/java/src/main/java/ai/rapids/cudf/Table.java
+++ b/java/src/main/java/ai/rapids/cudf/Table.java
@@ -332,20 +332,22 @@ private static native long[] readAvroFromDataSource(String[] filterColumnNames,
 
   /**
    * Setup everything to write parquet formatted data to a file.
-   * @param columnNames     names that correspond to the table columns
-   * @param numChildren     Children of the top level
-   * @param flatNumChildren flattened list of children per column
-   * @param nullable        true if the column can have nulls else false
-   * @param metadataKeys    Metadata key names to place in the Parquet file
-   * @param metadataValues  Metadata values corresponding to metadataKeys
-   * @param compression     native compression codec ID
-   * @param statsFreq       native statistics frequency ID
-   * @param isInt96         true if timestamp type is int96
-   * @param precisions      precision list containing all the precisions of the decimal types in
-   *                        the columns
-   * @param isMapValues     true if a column is a map
-   * @param isBinaryValues  true if a column is a binary
-   * @param filename        local output path
+   * @param columnNames       names that correspond to the table columns
+   * @param numChildren       Children of the top level
+   * @param flatNumChildren   flattened list of children per column
+   * @param nullable          true if the column can have nulls else false
+   * @param metadataKeys      Metadata key names to place in the Parquet file
+   * @param metadataValues    Metadata values corresponding to metadataKeys
+   * @param compression       native compression codec ID
+   * @param rowGroupSizeRows  max #rows in a row group
+   * @param rowGroupSizeBytes max #bytes in a row group
+   * @param statsFreq         native statistics frequency ID
+   * @param isInt96           true if timestamp type is int96
+   * @param precisions        precision list containing all the precisions of the decimal types in
+   *                          the columns
+   * @param isMapValues       true if a column is a map
+   * @param isBinaryValues    true if a column is a binary
+   * @param filename          local output path
    * @return a handle that is used in later calls to writeParquetChunk and writeParquetEnd.
    */
   private static native long writeParquetFileBegin(String[] columnNames,
@@ -355,6 +357,8 @@ private static native long writeParquetFileBegin(String[] columnNames,
                                                    String[] metadataKeys,
                                                    String[] metadataValues,
                                                    int compression,
+                                                   int rowGroupSizeRows,
+                                                   long rowGroupSizeBytes,
                                                    int statsFreq,
                                                    boolean[] isInt96,
                                                    int[] precisions,
@@ -366,20 +370,22 @@ private static native long writeParquetFileBegin(String[] columnNames,
 
   /**
    * Setup everything to write parquet formatted data to a buffer.
-   * @param columnNames     names that correspond to the table columns
-   * @param numChildren     Children of the top level
-   * @param flatNumChildren flattened list of children per column
-   * @param nullable        true if the column can have nulls else false
-   * @param metadataKeys    Metadata key names to place in the Parquet file
-   * @param metadataValues  Metadata values corresponding to metadataKeys
-   * @param compression     native compression codec ID
-   * @param statsFreq       native statistics frequency ID
-   * @param isInt96         true if timestamp type is int96
-   * @param precisions      precision list containing all the precisions of the decimal types in
-   *                        the columns
-   * @param isMapValues     true if a column is a map
-   * @param isBinaryValues  true if a column is a binary
-   * @param consumer        consumer of host buffers produced.
+   * @param columnNames       names that correspond to the table columns
+   * @param numChildren       Children of the top level
+   * @param flatNumChildren   flattened list of children per column
+   * @param nullable          true if the column can have nulls else false
+   * @param metadataKeys      Metadata key names to place in the Parquet file
+   * @param metadataValues    Metadata values corresponding to metadataKeys
+   * @param compression       native compression codec ID
+   * @param rowGroupSizeRows  max #rows in a row group
+   * @param rowGroupSizeBytes max #bytes in a row group
+   * @param statsFreq         native statistics frequency ID
+   * @param isInt96           true if timestamp type is int96
+   * @param precisions        precision list containing all the precisions of the decimal types in
+   *                          the columns
+   * @param isMapValues       true if a column is a map
+   * @param isBinaryValues    true if a column is a binary
+   * @param consumer          consumer of host buffers produced.
    * @return a handle that is used in later calls to writeParquetChunk and writeParquetEnd.
    */
   private static native long writeParquetBufferBegin(String[] columnNames,
@@ -389,6 +395,8 @@ private static native long writeParquetBufferBegin(String[] columnNames,
                                                      String[] metadataKeys,
                                                      String[] metadataValues,
                                                      int compression,
+                                                     int rowGroupSizeRows,
+                                                     long rowGroupSizeBytes,
                                                      int statsFreq,
                                                      boolean[] isInt96,
                                                      int[] precisions,
@@ -1820,6 +1828,8 @@ private ParquetTableWriter(ParquetWriterOptions options, File outputFile) {
           options.getMetadataKeys(),
           options.getMetadataValues(),
           options.getCompressionType().nativeId,
+          options.getRowGroupSizeRows(),
+          options.getRowGroupSizeBytes(),
           options.getStatisticsFrequency().nativeId,
           options.getFlatIsTimeTypeInt96(),
           options.getFlatPrecision(),
@@ -1840,6 +1850,8 @@ private ParquetTableWriter(ParquetWriterOptions options, HostBufferConsumer cons
           options.getMetadataKeys(),
           options.getMetadataValues(),
           options.getCompressionType().nativeId,
+          options.getRowGroupSizeRows(),
+          options.getRowGroupSizeBytes(),
           options.getStatisticsFrequency().nativeId,
           options.getFlatIsTimeTypeInt96(),
           options.getFlatPrecision(),
diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
index 40a111209b0..92e213bcb60 100644
--- a/java/src/main/native/src/TableJni.cpp
+++ b/java/src/main/native/src/TableJni.cpp
@@ -2150,6 +2150,8 @@ Java_ai_rapids_cudf_Table_writeParquetBufferBegin(JNIEnv* env,
                                                   jobjectArray j_metadata_keys,
                                                   jobjectArray j_metadata_values,
                                                   jint j_compression,
+                                                  jint j_row_group_size_rows,
+                                                  jlong j_row_group_size_bytes,
                                                   jint j_stats_freq,
                                                   jbooleanArray j_isInt96,
                                                   jintArray j_precisions,
@@ -2205,6 +2207,8 @@ Java_ai_rapids_cudf_Table_writeParquetBufferBegin(JNIEnv* env,
       chunked_parquet_writer_options::builder(sink)
         .metadata(std::move(metadata))
         .compression(static_cast<compression_type>(j_compression))
+        .row_group_size_rows(j_row_group_size_rows)
+        .row_group_size_bytes(j_row_group_size_bytes)
         .stats_level(static_cast<statistics_freq>(j_stats_freq))
         .key_value_metadata({kv_metadata})
         .compression_statistics(stats)
@@ -2227,6 +2231,8 @@ Java_ai_rapids_cudf_Table_writeParquetFileBegin(JNIEnv* env,
                                                 jobjectArray j_metadata_keys,
                                                 jobjectArray j_metadata_values,
                                                 jint j_compression,
+                                                jint j_row_group_size_rows,
+                                                jlong j_row_group_size_bytes,
                                                 jint j_stats_freq,
                                                 jbooleanArray j_isInt96,
                                                 jintArray j_precisions,
@@ -2280,6 +2286,8 @@ Java_ai_rapids_cudf_Table_writeParquetFileBegin(JNIEnv* env,
       chunked_parquet_writer_options::builder(sink)
         .metadata(std::move(metadata))
         .compression(static_cast<compression_type>(j_compression))
+        .row_group_size_rows(j_row_group_size_rows)
+        .row_group_size_bytes(j_row_group_size_bytes)
         .stats_level(static_cast<statistics_freq>(j_stats_freq))
         .key_value_metadata({kv_metadata})
         .compression_statistics(stats)
diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java
index 56fe63598d9..830f2b33b32 100644
--- a/java/src/test/java/ai/rapids/cudf/TableTest.java
+++ b/java/src/test/java/ai/rapids/cudf/TableTest.java
@@ -9122,7 +9122,11 @@ void testParquetWriteToBufferChunked() {
     columns.add(Columns.STRUCT.name);
     WriteUtils.buildWriterOptions(optBuilder, columns);
     ParquetWriterOptions options = optBuilder.build();
-    ParquetWriterOptions optionsNoCompress = optBuilder.withCompressionType(CompressionType.NONE).build();
+    ParquetWriterOptions optionsNoCompress =
+      optBuilder.withCompressionType(CompressionType.NONE)
+      .withRowGroupSizeRows(10000)
+      .withRowGroupSizeBytes(10000)
+      .build();
     try (Table table0 = getExpectedFileTable(columns);
          MyBufferConsumer consumer = new MyBufferConsumer()) {
       try (TableWriter writer = Table.writeParquetChunked(options, consumer)) {
@@ -9208,6 +9212,8 @@ void testParquetWriteToFileUncompressedNoStats() throws IOException {
           .withDecimalColumn("_c7", 4)
           .withDecimalColumn("_c8", 6)
           .withCompressionType(CompressionType.NONE)
+          .withRowGroupSizeRows(10000)
+          .withRowGroupSizeBytes(10000)
           .withStatisticsFrequency(ParquetWriterOptions.StatisticsFrequency.NONE)
           .build();
       try (TableWriter writer = Table.writeParquetChunked(options, tempFile.getAbsoluteFile())) {

From 7285efbeee12fa7f327933bcf6a52726bfa07790 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 16 Sep 2024 18:41:27 -1000
Subject: [PATCH 823/842] Support drop_first in get_dummies (#16795)

closes #16791

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Matthew Murray (https://github.com/Matt711)

URL: https://github.com/rapidsai/cudf/pull/16795
---
 python/cudf/cudf/core/reshape.py      | 11 +++++++----
 python/cudf/cudf/tests/test_onehot.py | 17 +++++++++++++++++
 2 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py
index 3d205957126..c026579b8b5 100644
--- a/python/cudf/cudf/core/reshape.py
+++ b/python/cudf/cudf/core/reshape.py
@@ -738,7 +738,8 @@ def get_dummies(
     sparse : boolean, optional
         Right now this is NON-FUNCTIONAL argument in rapids.
     drop_first : boolean, optional
-        Right now this is NON-FUNCTIONAL argument in rapids.
+        Whether to get k-1 dummies out of k categorical levels by removing the
+        first level.
     columns : sequence of str, optional
         Names of columns to encode. If not provided, will attempt to encode all
         columns. Note this is different from pandas default behavior, which
@@ -806,9 +807,6 @@ def get_dummies(
     if sparse:
         raise NotImplementedError("sparse is not supported yet")
 
-    if drop_first:
-        raise NotImplementedError("drop_first is not supported yet")
-
     if isinstance(data, cudf.DataFrame):
         encode_fallback_dtypes = ["object", "category"]
 
@@ -862,6 +860,7 @@ def get_dummies(
                     prefix=prefix_map.get(name, prefix),
                     prefix_sep=prefix_sep_map.get(name, prefix_sep),
                     dtype=dtype,
+                    drop_first=drop_first,
                 )
                 result_data.update(col_enc_data)
             return cudf.DataFrame._from_data(result_data, index=data.index)
@@ -874,6 +873,7 @@ def get_dummies(
             prefix=prefix,
             prefix_sep=prefix_sep,
             dtype=dtype,
+            drop_first=drop_first,
         )
         return cudf.DataFrame._from_data(data, index=ser.index)
 
@@ -1256,6 +1256,7 @@ def _one_hot_encode_column(
     prefix: str | None,
     prefix_sep: str | None,
     dtype: Dtype | None,
+    drop_first: bool,
 ) -> dict[str, ColumnBase]:
     """Encode a single column with one hot encoding. The return dictionary
     contains pairs of (category, encodings). The keys may be prefixed with
@@ -1276,6 +1277,8 @@ def _one_hot_encode_column(
         )
     data = one_hot_encode(column, categories)
 
+    if drop_first and len(data):
+        data.pop(next(iter(data)))
     if prefix is not None and prefix_sep is not None:
         data = {f"{prefix}{prefix_sep}{col}": enc for col, enc in data.items()}
     if dtype:
diff --git a/python/cudf/cudf/tests/test_onehot.py b/python/cudf/cudf/tests/test_onehot.py
index cc17dc46e0a..e054143b438 100644
--- a/python/cudf/cudf/tests/test_onehot.py
+++ b/python/cudf/cudf/tests/test_onehot.py
@@ -161,3 +161,20 @@ def test_get_dummies_cats_deprecated():
     df = cudf.DataFrame(range(3))
     with pytest.warns(FutureWarning):
         cudf.get_dummies(df, cats={0: [0, 1, 2]})
+
+
+def test_get_dummies_drop_first_series():
+    result = cudf.get_dummies(cudf.Series(list("abcaa")), drop_first=True)
+    expected = pd.get_dummies(pd.Series(list("abcaa")), drop_first=True)
+    assert_eq(result, expected)
+
+
+def test_get_dummies_drop_first_dataframe():
+    result = cudf.get_dummies(
+        cudf.DataFrame({"A": list("abcaa"), "B": list("bcaab")}),
+        drop_first=True,
+    )
+    expected = pd.get_dummies(
+        pd.DataFrame({"A": list("abcaa"), "B": list("bcaab")}), drop_first=True
+    )
+    assert_eq(result, expected)

From 27c29ebd81864d1662dd8a3e8e807955bd8fd9c5 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Tue, 17 Sep 2024 09:17:43 -0500
Subject: [PATCH 824/842] Use cupy 12.2.0 as oldest dependency pinning on CUDA
 12 ARM (#16808)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Uses cupy 12.2.0 as oldest dependency pinning on ARM to ensure CUDA 12 support.

This will fix nightly CI failures that look like:

```
LibMambaUnsatisfiableError: Encountered problems while solving:
  - package cupy-12.0.0-py311h308989c_2 requires python_abi 3.11.* *_cp311, but none of the providers can be installed

Could not solve for environment specs
The following packages are incompatible
├─ cuda-version 12.2**  is installable and it requires
│  └─ cudatoolkit 12.2|12.2.* , which can be installed;
├─ cupy 12.0.0  is installable with the potential options
│  ├─ cupy 12.0.0 would require
│  │  └─ cudatoolkit >=11.2,<12 , which conflicts with any installable versions previously reported;
...
```

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)

URL: https://github.com/rapidsai/cudf/pull/16808
---
 dependencies.yaml | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/dependencies.yaml b/dependencies.yaml
index 483335c02ff..7a13043cc5f 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -710,7 +710,16 @@ dependencies:
               - numpy==1.23.*
               - pandas==2.0.*
               - pyarrow==14.0.0
-              - cupy==12.0.0  # ignored as pip constraint
+          - matrix:
+            packages:
+      - output_types: conda
+        matrices:
+          - matrix: {dependencies: "oldest", arch: "aarch64", cuda: "12.*"}
+            packages:
+              - cupy==12.2.0  # cupy 12.2.0 is the earliest with CUDA 12 ARM packages.
+          - matrix: {dependencies: "oldest"}
+            packages:
+              - cupy==12.0.0
           - matrix:
             packages:
       - output_types: requirements

From 23351aa15f5334b7582c53d4cb6b7421c5c2fd74 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 17 Sep 2024 13:14:32 -0400
Subject: [PATCH 825/842] Word-based nvtext::minhash function (#15368)

Experimental implementation for #15055
The input is a lists column of strings where each string in each row is expected as a word to be hashed. The minimum hash for that row is returned in a lists column where each row contains a minhash per input hash seed.
Here the caller is expected to produce the words to be hashed.

```
std::unique_ptr<cudf::column> word_minhash(
  cudf::lists_column_view const& input,
  cudf::device_span<uint32_t const> seeds,
  rmm::cuda_stream_view stream,
  rmm::device_async_resource_ref mr);
```

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Nghia Truong (https://github.com/ttnghia)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: https://github.com/rapidsai/cudf/pull/15368
---
 cpp/benchmarks/CMakeLists.txt                 |   2 +-
 cpp/benchmarks/text/word_minhash.cpp          |  77 +++++++++
 cpp/include/nvtext/minhash.hpp                |  61 +++++++-
 cpp/src/text/minhash.cu                       | 147 +++++++++++++++++-
 cpp/tests/text/minhash_tests.cpp              |  35 +++++
 python/cudf/cudf/_lib/nvtext/minhash.pyx      |  38 +++++
 python/cudf/cudf/_lib/strings/__init__.py     |   9 +-
 python/cudf/cudf/core/column/string.py        |  70 +++++++++
 .../cudf/cudf/tests/text/test_text_methods.py |  60 +++++++
 .../pylibcudf/libcudf/nvtext/minhash.pxd      |  10 ++
 10 files changed, 498 insertions(+), 11 deletions(-)
 create mode 100644 cpp/benchmarks/text/word_minhash.cpp

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 3bf9d02b384..6c5f4a68a4c 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -337,7 +337,7 @@ ConfigureBench(TEXT_BENCH text/ngrams.cpp text/subword.cpp)
 
 ConfigureNVBench(
   TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp
-  text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp
+  text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp text/word_minhash.cpp
 )
 
 # ##################################################################################################
diff --git a/cpp/benchmarks/text/word_minhash.cpp b/cpp/benchmarks/text/word_minhash.cpp
new file mode 100644
index 00000000000..adc3dddc59c
--- /dev/null
+++ b/cpp/benchmarks/text/word_minhash.cpp
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+
+#include <cudf/column/column_factories.hpp>
+#include <cudf/filling.hpp>
+#include <cudf/scalar/scalar.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+
+#include <nvtext/minhash.hpp>
+
+#include <rmm/device_buffer.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+static void bench_word_minhash(nvbench::state& state)
+{
+  auto const num_rows   = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const row_width  = static_cast<cudf::size_type>(state.get_int64("row_width"));
+  auto const seed_count = static_cast<cudf::size_type>(state.get_int64("seed_count"));
+  auto const base64     = state.get_int64("hash_type") == 64;
+
+  data_profile const strings_profile =
+    data_profile_builder().distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, 5);
+  auto strings_table =
+    create_random_table({cudf::type_id::STRING}, row_count{num_rows}, strings_profile);
+
+  auto const num_offsets = (num_rows / row_width) + 1;
+  auto offsets           = cudf::sequence(num_offsets,
+                                cudf::numeric_scalar<cudf::size_type>(0),
+                                cudf::numeric_scalar<cudf::size_type>(row_width));
+
+  auto source = cudf::make_lists_column(num_offsets - 1,
+                                        std::move(offsets),
+                                        std::move(strings_table->release().front()),
+                                        0,
+                                        rmm::device_buffer{});
+
+  data_profile const seeds_profile = data_profile_builder().no_validity().distribution(
+    cudf::type_to_id<cudf::hash_value_type>(), distribution_id::NORMAL, 0, 256);
+  auto const seed_type   = base64 ? cudf::type_id::UINT64 : cudf::type_id::UINT32;
+  auto const seeds_table = create_random_table({seed_type}, row_count{seed_count}, seeds_profile);
+  auto seeds             = seeds_table->get_column(0);
+
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
+
+  cudf::strings_column_view input(cudf::lists_column_view(source->view()).child());
+  auto chars_size = input.chars_size(cudf::get_default_stream());
+  state.add_global_memory_reads<nvbench::int8_t>(chars_size);
+  state.add_global_memory_writes<nvbench::int32_t>(num_rows);  // output are hashes
+
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    auto result = base64 ? nvtext::word_minhash64(source->view(), seeds.view())
+                         : nvtext::word_minhash(source->view(), seeds.view());
+  });
+}
+
+NVBENCH_BENCH(bench_word_minhash)
+  .set_name("word_minhash")
+  .add_int64_axis("num_rows", {131072, 262144, 524288, 1048576, 2097152})
+  .add_int64_axis("row_width", {10, 100, 1000})
+  .add_int64_axis("seed_count", {2, 25})
+  .add_int64_axis("hash_type", {32, 64});
diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp
index c83a4260c19..7c909f1a948 100644
--- a/cpp/include/nvtext/minhash.hpp
+++ b/cpp/include/nvtext/minhash.hpp
@@ -17,6 +17,7 @@
 
 #include <cudf/column/column.hpp>
 #include <cudf/hashing.hpp>
+#include <cudf/lists/lists_column_view.hpp>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/export.hpp>
@@ -72,7 +73,7 @@ std::unique_ptr<cudf::column> minhash(
  *
  * @throw std::invalid_argument if the width < 2
  * @throw std::invalid_argument if seeds is empty
- * @throw std::overflow_error if `seeds * input.size()` exceeds the column size limit
+ * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit
  *
  * @param input Strings column to compute minhash
  * @param seeds Seed values used for the hash algorithm
@@ -133,7 +134,7 @@ std::unique_ptr<cudf::column> minhash64(
  *
  * @throw std::invalid_argument if the width < 2
  * @throw std::invalid_argument if seeds is empty
- * @throw std::overflow_error if `seeds * input.size()` exceeds the column size limit
+ * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit
  *
  * @param input Strings column to compute minhash
  * @param seeds Seed values used for the hash algorithm
@@ -150,5 +151,61 @@ std::unique_ptr<cudf::column> minhash64(
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
+/**
+ * @brief Returns the minhash values for each row of strings per seed
+ *
+ * Hash values are computed from each string in each row and the
+ * minimum hash value is returned for each row for each seed.
+ * Each row of the output list column are seed results for the corresponding
+ * input row. The order of the elements in each row match the order of
+ * the seeds provided in the `seeds` parameter.
+ *
+ * This function uses MurmurHash3_x86_32 for the hash algorithm.
+ *
+ * Any null row entries result in corresponding null output rows.
+ *
+ * @throw std::invalid_argument if seeds is empty
+ * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit
+ *
+ * @param input Lists column of strings to compute minhash
+ * @param seeds Seed values used for the hash algorithm
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return List column of minhash values for each string per seed
+ */
+std::unique_ptr<cudf::column> word_minhash(
+  cudf::lists_column_view const& input,
+  cudf::device_span<uint32_t const> seeds,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
+
+/**
+ * @brief Returns the minhash values for each row of strings per seed
+ *
+ * Hash values are computed from each string in each row and the
+ * minimum hash value is returned for each row for each seed.
+ * Each row of the output list column are seed results for the corresponding
+ * input row. The order of the elements in each row match the order of
+ * the seeds provided in the `seeds` parameter.
+ *
+ * This function uses MurmurHash3_x64_128 for the hash algorithm though
+ * only the first 64-bits of the hash are used in computing the output.
+ *
+ * Any null row entries result in corresponding null output rows.
+ *
+ * @throw std::invalid_argument if seeds is empty
+ * @throw std::overflow_error if `seeds.size() * input.size()` exceeds the column size limit
+ *
+ * @param input Lists column of strings to compute minhash
+ * @param seeds Seed values used for the hash algorithm
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return List column of minhash values for each string per seed
+ */
+std::unique_ptr<cudf::column> word_minhash64(
+  cudf::lists_column_view const& input,
+  cudf::device_span<uint64_t const> seeds,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 /** @} */  // end of group
 }  // namespace CUDF_EXPORT nvtext
diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu
index 605582f28a6..a03a34f5fa7 100644
--- a/cpp/src/text/minhash.cu
+++ b/cpp/src/text/minhash.cu
@@ -25,6 +25,8 @@
 #include <cudf/hashing/detail/hashing.hpp>
 #include <cudf/hashing/detail/murmurhash3_x64_128.cuh>
 #include <cudf/hashing/detail/murmurhash3_x86_32.cuh>
+#include <cudf/lists/list_device_view.cuh>
+#include <cudf/lists/lists_column_device_view.cuh>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/utilities/default_stream.hpp>
 #include <cudf/utilities/error.hpp>
@@ -151,15 +153,111 @@ std::unique_ptr<cudf::column> minhash_fn(cudf::strings_column_view const& input,
                                           mr);
   auto d_hashes = hashes->mutable_view().data<hash_value_type>();
 
-  constexpr int block_size = 256;
-  cudf::detail::grid_1d grid{input.size() * cudf::detail::warp_size, block_size};
+  constexpr cudf::thread_index_type block_size = 256;
+  cudf::detail::grid_1d grid{
+    static_cast<cudf::thread_index_type>(input.size()) * cudf::detail::warp_size, block_size};
   minhash_kernel<HashFunction><<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
     *d_strings, seeds, width, d_hashes);
 
   return hashes;
 }
 
-std::unique_ptr<cudf::column> build_list_result(cudf::strings_column_view const& input,
+/**
+ * @brief Compute the minhash of each list row of strings for each seed
+ *
+ * This is a warp-per-row algorithm where parallel threads within a warp
+ * work on strings in a single list row.
+ *
+ * @tparam HashFunction hash function to use on each string
+ *
+ * @param d_input List of strings to process
+ * @param seeds Seeds for hashing each string
+ * @param d_hashes Minhash output values (one per row)
+ */
+template <
+  typename HashFunction,
+  typename hash_value_type = std::
+    conditional_t<std::is_same_v<typename HashFunction::result_type, uint32_t>, uint32_t, uint64_t>>
+CUDF_KERNEL void minhash_word_kernel(cudf::detail::lists_column_device_view const d_input,
+                                     cudf::device_span<hash_value_type const> seeds,
+                                     hash_value_type* d_hashes)
+{
+  auto const idx     = cudf::detail::grid_1d::global_thread_id();
+  auto const row_idx = idx / cudf::detail::warp_size;
+
+  if (row_idx >= d_input.size()) { return; }
+  if (d_input.is_null(row_idx)) { return; }
+
+  auto const d_row    = cudf::list_device_view(d_input, row_idx);
+  auto const d_output = d_hashes + (row_idx * seeds.size());
+
+  // initialize hashes output for this row
+  auto const lane_idx = static_cast<cudf::size_type>(idx % cudf::detail::warp_size);
+  if (lane_idx == 0) {
+    auto const init = d_row.size() == 0 ? 0 : std::numeric_limits<hash_value_type>::max();
+    thrust::fill(thrust::seq, d_output, d_output + seeds.size(), init);
+  }
+  __syncwarp();
+
+  // each lane hashes a string from the input row
+  for (auto str_idx = lane_idx; str_idx < d_row.size(); str_idx += cudf::detail::warp_size) {
+    auto const hash_str =
+      d_row.is_null(str_idx) ? cudf::string_view{} : d_row.element<cudf::string_view>(str_idx);
+    for (std::size_t seed_idx = 0; seed_idx < seeds.size(); ++seed_idx) {
+      auto const hasher = HashFunction(seeds[seed_idx]);
+      // hash string and store the min value
+      hash_value_type hv;
+      if constexpr (std::is_same_v<hash_value_type, uint32_t>) {
+        hv = hasher(hash_str);
+      } else {
+        // This code path assumes the use of MurmurHash3_x64_128 which produces 2 uint64 values
+        // but only uses the first uint64 value as requested by the LLM team.
+        hv = thrust::get<0>(hasher(hash_str));
+      }
+      cuda::atomic_ref<hash_value_type, cuda::thread_scope_block> ref{*(d_output + seed_idx)};
+      ref.fetch_min(hv, cuda::std::memory_order_relaxed);
+    }
+  }
+}
+
+template <
+  typename HashFunction,
+  typename hash_value_type = std::
+    conditional_t<std::is_same_v<typename HashFunction::result_type, uint32_t>, uint32_t, uint64_t>>
+std::unique_ptr<cudf::column> word_minhash_fn(cudf::lists_column_view const& input,
+                                              cudf::device_span<hash_value_type const> seeds,
+                                              rmm::cuda_stream_view stream,
+                                              rmm::device_async_resource_ref mr)
+{
+  CUDF_EXPECTS(!seeds.empty(), "Parameter seeds cannot be empty", std::invalid_argument);
+  CUDF_EXPECTS((static_cast<std::size_t>(input.size()) * seeds.size()) <
+                 static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max()),
+               "The number of seeds times the number of input rows exceeds the column size limit",
+               std::overflow_error);
+
+  auto const output_type = cudf::data_type{cudf::type_to_id<hash_value_type>()};
+  if (input.is_empty()) { return cudf::make_empty_column(output_type); }
+
+  auto const d_input = cudf::column_device_view::create(input.parent(), stream);
+
+  auto hashes   = cudf::make_numeric_column(output_type,
+                                          input.size() * static_cast<cudf::size_type>(seeds.size()),
+                                          cudf::mask_state::UNALLOCATED,
+                                          stream,
+                                          mr);
+  auto d_hashes = hashes->mutable_view().data<hash_value_type>();
+  auto lcdv     = cudf::detail::lists_column_device_view(*d_input);
+
+  constexpr cudf::thread_index_type block_size = 256;
+  cudf::detail::grid_1d grid{
+    static_cast<cudf::thread_index_type>(input.size()) * cudf::detail::warp_size, block_size};
+  minhash_word_kernel<HashFunction>
+    <<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(lcdv, seeds, d_hashes);
+
+  return hashes;
+}
+
+std::unique_ptr<cudf::column> build_list_result(cudf::column_view const& input,
                                                 std::unique_ptr<cudf::column>&& hashes,
                                                 cudf::size_type seeds_size,
                                                 rmm::cuda_stream_view stream,
@@ -176,7 +274,7 @@ std::unique_ptr<cudf::column> build_list_result(cudf::strings_column_view const&
                                   std::move(offsets),
                                   std::move(hashes),
                                   input.null_count(),
-                                  cudf::detail::copy_bitmask(input.parent(), stream, mr),
+                                  cudf::detail::copy_bitmask(input, stream, mr),
                                   stream,
                                   mr);
   // expect this condition to be very rare
@@ -208,7 +306,7 @@ std::unique_ptr<cudf::column> minhash(cudf::strings_column_view const& input,
 {
   using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32<cudf::string_view>;
   auto hashes        = detail::minhash_fn<HashFunction>(input, seeds, width, stream, mr);
-  return build_list_result(input, std::move(hashes), seeds.size(), stream, mr);
+  return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr);
 }
 
 std::unique_ptr<cudf::column> minhash64(cudf::strings_column_view const& input,
@@ -232,7 +330,27 @@ std::unique_ptr<cudf::column> minhash64(cudf::strings_column_view const& input,
 {
   using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128<cudf::string_view>;
   auto hashes        = detail::minhash_fn<HashFunction>(input, seeds, width, stream, mr);
-  return build_list_result(input, std::move(hashes), seeds.size(), stream, mr);
+  return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr);
+}
+
+std::unique_ptr<cudf::column> word_minhash(cudf::lists_column_view const& input,
+                                           cudf::device_span<uint32_t const> seeds,
+                                           rmm::cuda_stream_view stream,
+                                           rmm::device_async_resource_ref mr)
+{
+  using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32<cudf::string_view>;
+  auto hashes        = detail::word_minhash_fn<HashFunction>(input, seeds, stream, mr);
+  return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr);
+}
+
+std::unique_ptr<cudf::column> word_minhash64(cudf::lists_column_view const& input,
+                                             cudf::device_span<uint64_t const> seeds,
+                                             rmm::cuda_stream_view stream,
+                                             rmm::device_async_resource_ref mr)
+{
+  using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128<cudf::string_view>;
+  auto hashes        = detail::word_minhash_fn<HashFunction>(input, seeds, stream, mr);
+  return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr);
 }
 }  // namespace detail
 
@@ -276,4 +394,21 @@ std::unique_ptr<cudf::column> minhash64(cudf::strings_column_view const& input,
   return detail::minhash64(input, seeds, width, stream, mr);
 }
 
+std::unique_ptr<cudf::column> word_minhash(cudf::lists_column_view const& input,
+                                           cudf::device_span<uint32_t const> seeds,
+                                           rmm::cuda_stream_view stream,
+                                           rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::word_minhash(input, seeds, stream, mr);
+}
+
+std::unique_ptr<cudf::column> word_minhash64(cudf::lists_column_view const& input,
+                                             cudf::device_span<uint64_t const> seeds,
+                                             rmm::cuda_stream_view stream,
+                                             rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::word_minhash64(input, seeds, stream, mr);
+}
 }  // namespace nvtext
diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp
index 7575a3ba846..e23f3f6e7d8 100644
--- a/cpp/tests/text/minhash_tests.cpp
+++ b/cpp/tests/text/minhash_tests.cpp
@@ -139,6 +139,41 @@ TEST_F(MinHashTest, MultiSeedWithNullInputRow)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64);
 }
 
+TEST_F(MinHashTest, WordsMinHash)
+{
+  using LCWS    = cudf::test::lists_column_wrapper<cudf::string_view>;
+  auto validity = cudf::test::iterators::null_at(1);
+
+  LCWS input(
+    {LCWS({"hello", "abcdéfgh"}),
+     LCWS{},
+     LCWS({"rapids", "moré", "test", "text"}),
+     LCWS({"The", "quick", "brown", "fox", "jumpéd", "over", "the", "lazy", "brown", "dog"})},
+    validity);
+
+  auto view = cudf::lists_column_view(input);
+
+  auto seeds   = cudf::test::fixed_width_column_wrapper<uint32_t>({1, 2});
+  auto results = nvtext::word_minhash(view, cudf::column_view(seeds));
+  using LCW32  = cudf::test::lists_column_wrapper<uint32_t>;
+  LCW32 expected({LCW32{2069617641u, 1975382903u},
+                  LCW32{},
+                  LCW32{657297235u, 1010955999u},
+                  LCW32{644643885u, 310002789u}},
+                 validity);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+
+  auto seeds64   = cudf::test::fixed_width_column_wrapper<uint64_t>({11, 22});
+  auto results64 = nvtext::word_minhash64(view, cudf::column_view(seeds64));
+  using LCW64    = cudf::test::lists_column_wrapper<uint64_t>;
+  LCW64 expected64({LCW64{1940333969930105370ul, 272615362982418219ul},
+                    LCW64{},
+                    LCW64{5331949571924938590ul, 2088583894581919741ul},
+                    LCW64{3400468157617183341ul, 2398577492366130055ul}},
+                   validity);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64);
+}
+
 TEST_F(MinHashTest, EmptyTest)
 {
   auto input   = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});
diff --git a/python/cudf/cudf/_lib/nvtext/minhash.pyx b/python/cudf/cudf/_lib/nvtext/minhash.pyx
index 5ee15d0e409..59cb8d51440 100644
--- a/python/cudf/cudf/_lib/nvtext/minhash.pyx
+++ b/python/cudf/cudf/_lib/nvtext/minhash.pyx
@@ -10,6 +10,8 @@ from pylibcudf.libcudf.column.column_view cimport column_view
 from pylibcudf.libcudf.nvtext.minhash cimport (
     minhash as cpp_minhash,
     minhash64 as cpp_minhash64,
+    word_minhash as cpp_word_minhash,
+    word_minhash64 as cpp_word_minhash64,
 )
 from pylibcudf.libcudf.types cimport size_type
 
@@ -54,3 +56,39 @@ def minhash64(Column strings, Column seeds, int width):
         )
 
     return Column.from_unique_ptr(move(c_result))
+
+
+@acquire_spill_lock()
+def word_minhash(Column input, Column seeds):
+
+    cdef column_view c_input = input.view()
+    cdef column_view c_seeds = seeds.view()
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_word_minhash(
+                c_input,
+                c_seeds
+            )
+        )
+
+    return Column.from_unique_ptr(move(c_result))
+
+
+@acquire_spill_lock()
+def word_minhash64(Column input, Column seeds):
+
+    cdef column_view c_input = input.view()
+    cdef column_view c_seeds = seeds.view()
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_word_minhash64(
+                c_input,
+                c_seeds
+            )
+        )
+
+    return Column.from_unique_ptr(move(c_result))
diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py
index 47a194c4fda..4bf8a9b1a8f 100644
--- a/python/cudf/cudf/_lib/strings/__init__.py
+++ b/python/cudf/cudf/_lib/strings/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 from cudf._lib.nvtext.edit_distance import edit_distance, edit_distance_matrix
 from cudf._lib.nvtext.generate_ngrams import (
     generate_character_ngrams,
@@ -6,7 +6,12 @@
     hash_character_ngrams,
 )
 from cudf._lib.nvtext.jaccard import jaccard_index
-from cudf._lib.nvtext.minhash import minhash, minhash64
+from cudf._lib.nvtext.minhash import (
+    minhash,
+    minhash64,
+    word_minhash,
+    word_minhash64,
+)
 from cudf._lib.nvtext.ngrams_tokenize import ngrams_tokenize
 from cudf._lib.nvtext.normalize import normalize_characters, normalize_spaces
 from cudf._lib.nvtext.replace import filter_tokens, replace_tokens
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index 16e6908f308..e059917b0b8 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -5349,6 +5349,76 @@ def minhash64(
             libstrings.minhash64(self._column, seeds_column, width)
         )
 
+    def word_minhash(self, seeds: ColumnLike | None = None) -> SeriesOrIndex:
+        """
+        Compute the minhash of a list column of strings.
+        This uses the MurmurHash3_x86_32 algorithm for the hash function.
+
+        Parameters
+        ----------
+        seeds : ColumnLike
+            The seeds used for the hash algorithm.
+            Must be of type uint32.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> import numpy as np
+        >>> ls = cudf.Series([["this", "is", "my"], ["favorite", "book"]])
+        >>> seeds = cudf.Series([0, 1, 2], dtype=np.uint32)
+        >>> ls.str.word_minhash(seeds=seeds)
+        0     [21141582, 1232889953, 1268336794]
+        1    [962346254, 2321233602, 1354839212]
+        dtype: list
+        """
+        if seeds is None:
+            seeds_column = column.as_column(0, dtype=np.uint32, length=1)
+        else:
+            seeds_column = column.as_column(seeds)
+            if seeds_column.dtype != np.uint32:
+                raise ValueError(
+                    f"Expecting a Series with dtype uint32, got {type(seeds)}"
+                )
+        return self._return_or_inplace(
+            libstrings.word_minhash(self._column, seeds_column)
+        )
+
+    def word_minhash64(self, seeds: ColumnLike | None = None) -> SeriesOrIndex:
+        """
+        Compute the minhash of a list column of strings.
+        This uses the MurmurHash3_x64_128 algorithm for the hash function.
+        This function generates 2 uint64 values but only the first
+        uint64 value is used.
+
+        Parameters
+        ----------
+        seeds : ColumnLike
+            The seeds used for the hash algorithm.
+            Must be of type uint64.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> import numpy as np
+        >>> ls = cudf.Series([["this", "is", "my"], ["favorite", "book"]])
+        >>> seeds = cudf.Series([0, 1, 2], dtype=np.uint64)
+        >>> ls.str.word_minhash64(seeds)
+        0    [2603139454418834912, 8644371945174847701, 5541030711534384340]
+        1    [5240044617220523711, 5847101123925041457, 153762819128779913]
+        dtype: list
+        """
+        if seeds is None:
+            seeds_column = column.as_column(0, dtype=np.uint64, length=1)
+        else:
+            seeds_column = column.as_column(seeds)
+            if seeds_column.dtype != np.uint64:
+                raise ValueError(
+                    f"Expecting a Series with dtype uint64, got {type(seeds)}"
+                )
+        return self._return_or_inplace(
+            libstrings.word_minhash64(self._column, seeds_column)
+        )
+
     def jaccard_index(self, input: cudf.Series, width: int) -> SeriesOrIndex:
         """
         Compute the Jaccard index between this column and the given
diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py
index 52179f55da3..997ca357986 100644
--- a/python/cudf/cudf/tests/text/test_text_methods.py
+++ b/python/cudf/cudf/tests/text/test_text_methods.py
@@ -946,6 +946,66 @@ def test_minhash():
         strings.str.minhash64(seeds=seeds)
 
 
+def test_word_minhash():
+    ls = cudf.Series([["this", "is", "my"], ["favorite", "book"]])
+
+    expected = cudf.Series(
+        [
+            cudf.Series([21141582], dtype=np.uint32),
+            cudf.Series([962346254], dtype=np.uint32),
+        ]
+    )
+    actual = ls.str.word_minhash()
+    assert_eq(expected, actual)
+    seeds = cudf.Series([0, 1, 2], dtype=np.uint32)
+    expected = cudf.Series(
+        [
+            cudf.Series([21141582, 1232889953, 1268336794], dtype=np.uint32),
+            cudf.Series([962346254, 2321233602, 1354839212], dtype=np.uint32),
+        ]
+    )
+    actual = ls.str.word_minhash(seeds=seeds)
+    assert_eq(expected, actual)
+
+    expected = cudf.Series(
+        [
+            cudf.Series([2603139454418834912], dtype=np.uint64),
+            cudf.Series([5240044617220523711], dtype=np.uint64),
+        ]
+    )
+    actual = ls.str.word_minhash64()
+    assert_eq(expected, actual)
+    seeds = cudf.Series([0, 1, 2], dtype=np.uint64)
+    expected = cudf.Series(
+        [
+            cudf.Series(
+                [
+                    2603139454418834912,
+                    8644371945174847701,
+                    5541030711534384340,
+                ],
+                dtype=np.uint64,
+            ),
+            cudf.Series(
+                [5240044617220523711, 5847101123925041457, 153762819128779913],
+                dtype=np.uint64,
+            ),
+        ]
+    )
+    actual = ls.str.word_minhash64(seeds=seeds)
+    assert_eq(expected, actual)
+
+    # test wrong seed types
+    with pytest.raises(ValueError):
+        ls.str.word_minhash(seeds="a")
+    with pytest.raises(ValueError):
+        seeds = cudf.Series([0, 1, 2], dtype=np.int32)
+        ls.str.word_minhash(seeds=seeds)
+    with pytest.raises(ValueError):
+        seeds = cudf.Series([0, 1, 2], dtype=np.uint32)
+        ls.str.word_minhash64(seeds=seeds)
+
+
 def test_jaccard_index():
     str1 = cudf.Series(["the brown dog", "jumped about"])
     str2 = cudf.Series(["the black cat", "jumped around"])
diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd
index 0c352a5068b..f2dd22f43aa 100644
--- a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd
@@ -19,3 +19,13 @@ cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil:
         const column_view &seeds,
         const size_type width,
     ) except +
+
+    cdef unique_ptr[column] word_minhash(
+        const column_view &input,
+        const column_view &seeds
+    ) except +
+
+    cdef unique_ptr[column] word_minhash64(
+        const column_view &input,
+        const column_view &seeds
+    ) except +

From e98e10981fc245a6837a51e9b6c2b933a5d7acd8 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 17 Sep 2024 13:19:40 -0400
Subject: [PATCH 826/842] Support multiple new-line characters in regex APIs
 (#15961)

Add support for multiple new-line characters for BOL (`^` / `\A`) and EOL (`$` / `\Z`):
-  `\n` line-feed (already supported)
-  `\r` carriage-return
-  `\u0085` next line (NEL)
-  `\u2028` line separator
-  `\u2029` paragraph separator

Reference #15746

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Nghia Truong (https://github.com/ttnghia)
  - Navin Kumar (https://github.com/NVnavkumar)

URL: https://github.com/rapidsai/cudf/pull/15961
---
 cpp/doxygen/regex.md                      |  6 +++
 cpp/include/cudf/strings/regex/flags.hpp  | 20 ++++++--
 cpp/include/cudf/strings/string_view.cuh  | 11 +++--
 cpp/src/strings/regex/regcomp.cpp         | 21 ++++++--
 cpp/src/strings/regex/regex.inl           | 46 +++++++++++++-----
 cpp/tests/strings/contains_tests.cpp      | 59 +++++++++++++++++++++++
 cpp/tests/strings/extract_tests.cpp       | 40 +++++++++++++++
 cpp/tests/strings/findall_tests.cpp       | 28 +++++++++++
 cpp/tests/strings/replace_regex_tests.cpp | 49 +++++++++++++++++++
 cpp/tests/strings/special_chars.h         | 25 ++++++++++
 10 files changed, 281 insertions(+), 24 deletions(-)
 create mode 100644 cpp/tests/strings/special_chars.h

diff --git a/cpp/doxygen/regex.md b/cpp/doxygen/regex.md
index 8d206f245dc..6d1c91a5752 100644
--- a/cpp/doxygen/regex.md
+++ b/cpp/doxygen/regex.md
@@ -17,6 +17,12 @@ The details are based on features documented at https://www.regular-expressions.
 
 **Note:** The alternation character is the pipe character `|` and not the character included in the tables on this page. There is an issue including the pipe character inside the table markdown that is rendered by doxygen.
 
+By default, only the `\n` character is recognized as a line break. The [cudf::strings::regex_flags::EXT_NEWLINE](@ref cudf::strings::regex_flags) increases the set of line break characters to include:
+- Paragraph separator (Unicode: `2029`, UTF-8: `E280A9`)
+- Line separator (Unicode: `2028`, UTF-8: `E280A8`)
+- Next line (Unicode: `0085`, UTF-8: `C285`)
+- Carriage return (Unicode: `000D`, UTF-8: `0D`)
+
 **Invalid regex patterns will result in undefined behavior**. This includes but is not limited to the following:
 - Unescaped special characters (listed in the third row of the Characters table below) when they are intended to match as literals.
 - Unmatched paired special characters like `()`, `[]`, and `{}`.
diff --git a/cpp/include/cudf/strings/regex/flags.hpp b/cpp/include/cudf/strings/regex/flags.hpp
index f7108129dee..4f3fc7086f2 100644
--- a/cpp/include/cudf/strings/regex/flags.hpp
+++ b/cpp/include/cudf/strings/regex/flags.hpp
@@ -35,10 +35,11 @@ namespace strings {
  * and to match the Python flag values.
  */
 enum regex_flags : uint32_t {
-  DEFAULT   = 0,   ///< default
-  MULTILINE = 8,   ///< the '^' and '$' honor new-line characters
-  DOTALL    = 16,  ///< the '.' matching includes new-line characters
-  ASCII     = 256  ///< use only ASCII when matching built-in character classes
+  DEFAULT     = 0,    ///< default
+  MULTILINE   = 8,    ///< the '^' and '$' honor new-line characters
+  DOTALL      = 16,   ///< the '.' matching includes new-line characters
+  ASCII       = 256,  ///< use only ASCII when matching built-in character classes
+  EXT_NEWLINE = 512   ///< new-line matches extended characters
 };
 
 /**
@@ -74,6 +75,17 @@ constexpr bool is_ascii(regex_flags const f)
   return (f & regex_flags::ASCII) == regex_flags::ASCII;
 }
 
+/**
+ * @brief Returns true if the given flags contain EXT_NEWLINE
+ *
+ * @param f Regex flags to check
+ * @return true if `f` includes EXT_NEWLINE
+ */
+constexpr bool is_ext_newline(regex_flags const f)
+{
+  return (f & regex_flags::EXT_NEWLINE) == regex_flags::EXT_NEWLINE;
+}
+
 /**
  * @brief Capture groups setting
  *
diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh
index abb26d7ccb4..14695c3bb27 100644
--- a/cpp/include/cudf/strings/string_view.cuh
+++ b/cpp/include/cudf/strings/string_view.cuh
@@ -191,9 +191,14 @@ __device__ inline string_view::const_iterator& string_view::const_iterator::oper
 
 __device__ inline string_view::const_iterator& string_view::const_iterator::operator--()
 {
-  if (byte_pos > 0)
-    while (strings::detail::bytes_in_utf8_byte(static_cast<uint8_t>(p[--byte_pos])) == 0)
-      ;
+  if (byte_pos > 0) {
+    if (byte_pos == char_pos) {
+      --byte_pos;
+    } else {
+      while (strings::detail::bytes_in_utf8_byte(static_cast<uint8_t>(p[--byte_pos])) == 0)
+        ;
+    }
+  }
   --char_pos;
   return *this;
 }
diff --git a/cpp/src/strings/regex/regcomp.cpp b/cpp/src/strings/regex/regcomp.cpp
index adf650a4f27..7c4c89bd3fb 100644
--- a/cpp/src/strings/regex/regcomp.cpp
+++ b/cpp/src/strings/regex/regcomp.cpp
@@ -539,15 +539,26 @@ class regex_parser {
                                                          : static_cast<int32_t>(LBRA);
       case ')': return RBRA;
       case '^': {
-        _chr = is_multiline(_flags) ? chr : '\n';
+        if (is_ext_newline(_flags)) {
+          _chr = is_multiline(_flags) ? 'S' : 'N';
+        } else {
+          _chr = is_multiline(_flags) ? chr : '\n';
+        }
         return BOL;
       }
       case '$': {
-        _chr = is_multiline(_flags) ? chr : '\n';
+        if (is_ext_newline(_flags)) {
+          _chr = is_multiline(_flags) ? 'S' : 'N';
+        } else {
+          _chr = is_multiline(_flags) ? chr : '\n';
+        }
         return EOL;
       }
       case '[': return build_cclass();
-      case '.': return dot_type;
+      case '.': {
+        _chr = is_ext_newline(_flags) ? 'N' : chr;
+        return dot_type;
+      }
     }
 
     if (std::find(quantifiers.begin(), quantifiers.end(), static_cast<char>(chr)) ==
@@ -959,7 +970,7 @@ class regex_compiler {
       _prog.inst_at(inst_id).u1.cls_id = class_id;
     } else if (token == CHAR) {
       _prog.inst_at(inst_id).u1.c = yy;
-    } else if (token == BOL || token == EOL) {
+    } else if (token == BOL || token == EOL || token == ANY) {
       _prog.inst_at(inst_id).u1.c = yy;
     }
     push_and(inst_id, inst_id);
@@ -1194,7 +1205,7 @@ void reprog::print(regex_flags const flags)
       case STAR: printf("   STAR next=%d", inst.u2.next_id); break;
       case PLUS: printf("   PLUS next=%d", inst.u2.next_id); break;
       case QUEST: printf("  QUEST next=%d", inst.u2.next_id); break;
-      case ANY: printf("    ANY next=%d", inst.u2.next_id); break;
+      case ANY: printf("    ANY '%c', next=%d", inst.u1.c, inst.u2.next_id); break;
       case ANYNL: printf("  ANYNL next=%d", inst.u2.next_id); break;
       case NOP: printf("    NOP next=%d", inst.u2.next_id); break;
       case BOL: {
diff --git a/cpp/src/strings/regex/regex.inl b/cpp/src/strings/regex/regex.inl
index 3b899e4edc1..e34a1e12015 100644
--- a/cpp/src/strings/regex/regex.inl
+++ b/cpp/src/strings/regex/regex.inl
@@ -126,6 +126,16 @@ __device__ __forceinline__ void reprog_device::reljunk::swaplist()
   list2    = tmp;
 }
 
+/**
+ * @brief Check for supported new-line characters
+ *
+ * '\n, \r, \u0085, \u2028, or \u2029'
+ */
+constexpr bool is_newline(char32_t const ch)
+{
+  return (ch == '\n' || ch == '\r' || ch == 0x00c285 || ch == 0x00e280a8 || ch == 0x00e280a9);
+}
+
 /**
  * @brief Utility to check a specific character against this class instance.
  *
@@ -258,11 +268,14 @@ __device__ __forceinline__ match_result reprog_device::regexec(string_view const
     if (checkstart) {
       auto startchar = static_cast<char_utf8>(jnk.startchar);
       switch (jnk.starttype) {
-        case BOL:
-          if (pos == 0) break;
-          if (jnk.startchar != '^') { return cuda::std::nullopt; }
+        case BOL: {
+          if (pos == 0) { break; }
+          if (startchar != '^' && startchar != 'S') { return cuda::std::nullopt; }
+          if (startchar != '\n') { break; }
           --itr;
           startchar = static_cast<char_utf8>('\n');
+          [[fallthrough]];
+        }
         case CHAR: {
           auto const find_itr = find_char(startchar, dstr, itr);
           if (find_itr.byte_offset() >= dstr.size_bytes()) { return cuda::std::nullopt; }
@@ -312,26 +325,34 @@ __device__ __forceinline__ match_result reprog_device::regexec(string_view const
             id_activate = inst.u2.next_id;
             expanded    = true;
             break;
-          case BOL:
-            if ((pos == 0) || ((inst.u1.c == '^') && (dstr[pos - 1] == '\n'))) {
+          case BOL: {
+            auto titr         = itr;
+            auto const prev_c = pos > 0 ? *(--titr) : 0;
+            if ((pos == 0) || ((inst.u1.c == '^') && (prev_c == '\n')) ||
+                ((inst.u1.c == 'S') && (is_newline(prev_c)))) {
               id_activate = inst.u2.next_id;
               expanded    = true;
             }
             break;
-          case EOL:
+          }
+          case EOL: {
             // after the last character OR:
             // - for MULTILINE, if current character is new-line
             // - for non-MULTILINE, the very last character of the string can also be a new-line
+            bool const nl = (inst.u1.c == 'S' || inst.u1.c == 'N') ? is_newline(c) : (c == '\n');
             if (last_character ||
-                ((c == '\n') && (inst.u1.c != 'Z') &&
-                 ((inst.u1.c == '$') || (itr.byte_offset() + 1 == dstr.size_bytes())))) {
+                (nl && (inst.u1.c != 'Z') &&
+                 ((inst.u1.c == '$' || inst.u1.c == 'S') ||
+                  (itr.byte_offset() + bytes_in_char_utf8(c) == dstr.size_bytes())))) {
               id_activate = inst.u2.next_id;
               expanded    = true;
             }
             break;
+          }
           case BOW:
           case NBOW: {
-            auto const prev_c       = pos > 0 ? dstr[pos - 1] : 0;
+            auto titr               = itr;
+            auto const prev_c       = pos > 0 ? *(--titr) : 0;
             auto const word_class   = reclass_device{CCLASS_W};
             bool const curr_is_word = word_class.is_match(c, _codepoint_flags);
             bool const prev_is_word = word_class.is_match(prev_c, _codepoint_flags);
@@ -366,9 +387,10 @@ __device__ __forceinline__ match_result reprog_device::regexec(string_view const
         case CHAR:
           if (inst.u1.c == c) id_activate = inst.u2.next_id;
           break;
-        case ANY:
-          if (c != '\n') id_activate = inst.u2.next_id;
-          break;
+        case ANY: {
+          if ((c == '\n') || ((inst.u1.c == 'N') && is_newline(c))) { break; }
+          [[fallthrough]];
+        }
         case ANYNL: id_activate = inst.u2.next_id; break;
         case NCCLASS:
         case CCLASS: {
diff --git a/cpp/tests/strings/contains_tests.cpp b/cpp/tests/strings/contains_tests.cpp
index c816316d0ff..acf850c7a66 100644
--- a/cpp/tests/strings/contains_tests.cpp
+++ b/cpp/tests/strings/contains_tests.cpp
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#include "special_chars.h"
+
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
@@ -613,6 +615,63 @@ TEST_F(StringsContainsTests, MultiLine)
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected_count);
 }
 
+TEST_F(StringsContainsTests, SpecialNewLines)
+{
+  auto input = cudf::test::strings_column_wrapper({"zzé" LINE_SEPARATOR "qqq" NEXT_LINE "zzé",
+                                                   "qqq\rzzé" LINE_SEPARATOR "lll",
+                                                   "zzé",
+                                                   "",
+                                                   "zzé" PARAGRAPH_SEPARATOR,
+                                                   "abc\nzzé" NEXT_LINE});
+  auto view  = cudf::strings_column_view(input);
+
+  auto pattern = std::string("^zzé$");
+  auto prog =
+    cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::EXT_NEWLINE);
+  auto ml_flags = static_cast<cudf::strings::regex_flags>(cudf::strings::regex_flags::EXT_NEWLINE |
+                                                          cudf::strings::regex_flags::MULTILINE);
+  auto prog_ml  = cudf::strings::regex_program::create(pattern, ml_flags);
+
+  auto expected = cudf::test::fixed_width_column_wrapper<bool>({0, 0, 1, 0, 1, 0});
+  auto results  = cudf::strings::contains_re(view, *prog);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+  expected = cudf::test::fixed_width_column_wrapper<bool>({1, 1, 1, 0, 1, 1});
+  results  = cudf::strings::contains_re(view, *prog_ml);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+
+  expected = cudf::test::fixed_width_column_wrapper<bool>({0, 0, 1, 0, 1, 0});
+  results  = cudf::strings::matches_re(view, *prog);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+  expected = cudf::test::fixed_width_column_wrapper<bool>({1, 0, 1, 0, 1, 0});
+  results  = cudf::strings::matches_re(view, *prog_ml);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+
+  auto counts = cudf::test::fixed_width_column_wrapper<int32_t>({0, 0, 1, 0, 1, 0});
+  results     = cudf::strings::count_re(view, *prog);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, counts);
+  counts  = cudf::test::fixed_width_column_wrapper<int32_t>({2, 1, 1, 0, 1, 1});
+  results = cudf::strings::count_re(view, *prog_ml);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, counts);
+
+  pattern  = std::string("q.*l");
+  prog     = cudf::strings::regex_program::create(pattern);
+  expected = cudf::test::fixed_width_column_wrapper<bool>({0, 1, 0, 0, 0, 0});
+  results  = cudf::strings::contains_re(view, *prog);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+  // inst ANY will stop matching on first 'newline' and so should not match anything here
+  prog     = cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::EXT_NEWLINE);
+  expected = cudf::test::fixed_width_column_wrapper<bool>({0, 0, 0, 0, 0, 0});
+  results  = cudf::strings::contains_re(view, *prog);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+  // including the DOTALL flag accepts the newline characters
+  auto dot_flags = static_cast<cudf::strings::regex_flags>(cudf::strings::regex_flags::EXT_NEWLINE |
+                                                           cudf::strings::regex_flags::DOTALL);
+  prog           = cudf::strings::regex_program::create(pattern, dot_flags);
+  expected       = cudf::test::fixed_width_column_wrapper<bool>({0, 1, 0, 0, 0, 0});
+  results        = cudf::strings::contains_re(view, *prog);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+}
+
 TEST_F(StringsContainsTests, EndOfString)
 {
   auto input = cudf::test::strings_column_wrapper(
diff --git a/cpp/tests/strings/extract_tests.cpp b/cpp/tests/strings/extract_tests.cpp
index b26cbd5a549..1491da758d5 100644
--- a/cpp/tests/strings/extract_tests.cpp
+++ b/cpp/tests/strings/extract_tests.cpp
@@ -14,9 +14,12 @@
  * limitations under the License.
  */
 
+#include "special_chars.h"
+
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/debug_utilities.hpp>
 #include <cudf_test/table_utilities.hpp>
 
 #include <cudf/detail/iterator.cuh>
@@ -200,6 +203,43 @@ TEST_F(StringsExtractTests, DotAll)
   CUDF_TEST_EXPECT_TABLES_EQUAL(*results, expected);
 }
 
+TEST_F(StringsExtractTests, SpecialNewLines)
+{
+  auto input = cudf::test::strings_column_wrapper({"zzé" NEXT_LINE "qqq" LINE_SEPARATOR "zzé",
+                                                   "qqq" LINE_SEPARATOR "zzé\rlll",
+                                                   "zzé",
+                                                   "",
+                                                   "zzé" NEXT_LINE,
+                                                   "abc" PARAGRAPH_SEPARATOR "zzé\n"});
+  auto view  = cudf::strings_column_view(input);
+
+  auto prog =
+    cudf::strings::regex_program::create("(^zzé$)", cudf::strings::regex_flags::EXT_NEWLINE);
+  auto results = cudf::strings::extract(view, *prog);
+  auto expected =
+    cudf::test::strings_column_wrapper({"", "", "zzé", "", "zzé", ""}, {0, 0, 1, 0, 1, 0});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected);
+
+  auto both_flags = static_cast<cudf::strings::regex_flags>(
+    cudf::strings::regex_flags::EXT_NEWLINE | cudf::strings::regex_flags::MULTILINE);
+  auto prog_ml = cudf::strings::regex_program::create("^(zzé)$", both_flags);
+  results      = cudf::strings::extract(view, *prog_ml);
+  expected =
+    cudf::test::strings_column_wrapper({"zzé", "zzé", "zzé", "", "zzé", "zzé"}, {1, 1, 1, 0, 1, 1});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected);
+
+  prog = cudf::strings::regex_program::create("q(q.*l)l");
+  expected = cudf::test::strings_column_wrapper({"", "qq" LINE_SEPARATOR "zzé\rll", "", "", "", ""},
+                                                {0, 1, 0, 0, 0, 0});
+  results = cudf::strings::extract(view, *prog);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected);
+  // expect no matches here since the newline(s) interrupts the pattern
+  prog = cudf::strings::regex_program::create("q(q.*l)l", cudf::strings::regex_flags::EXT_NEWLINE);
+  expected = cudf::test::strings_column_wrapper({"", "", "", "", "", ""}, {0, 0, 0, 0, 0, 0});
+  results  = cudf::strings::extract(view, *prog);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view().column(0), expected);
+}
+
 TEST_F(StringsExtractTests, EmptyExtractTest)
 {
   std::vector<char const*> h_strings{nullptr, "AAA", "AAA_A", "AAA_AAA_", "A__", ""};
diff --git a/cpp/tests/strings/findall_tests.cpp b/cpp/tests/strings/findall_tests.cpp
index 4582dcb1e38..47606b9b3ed 100644
--- a/cpp/tests/strings/findall_tests.cpp
+++ b/cpp/tests/strings/findall_tests.cpp
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#include "special_chars.h"
+
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
@@ -80,6 +82,32 @@ TEST_F(StringsFindallTests, DotAll)
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(results->view(), expected);
 }
 
+TEST_F(StringsFindallTests, SpecialNewLines)
+{
+  auto input = cudf::test::strings_column_wrapper({"zzé" PARAGRAPH_SEPARATOR "qqq\nzzé",
+                                                   "qqq\nzzé" PARAGRAPH_SEPARATOR "lll",
+                                                   "zzé",
+                                                   "",
+                                                   "zzé\r",
+                                                   "zzé" LINE_SEPARATOR "zzé" NEXT_LINE});
+  auto view  = cudf::strings_column_view(input);
+
+  auto prog =
+    cudf::strings::regex_program::create("(^zzé$)", cudf::strings::regex_flags::EXT_NEWLINE);
+  auto results = cudf::strings::findall(view, *prog);
+  using LCW    = cudf::test::lists_column_wrapper<cudf::string_view>;
+  LCW expected({LCW{}, LCW{}, LCW{"zzé"}, LCW{}, LCW{"zzé"}, LCW{}});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected);
+
+  auto both_flags = static_cast<cudf::strings::regex_flags>(
+    cudf::strings::regex_flags::EXT_NEWLINE | cudf::strings::regex_flags::MULTILINE);
+  auto prog_ml = cudf::strings::regex_program::create("^(zzé)$", both_flags);
+  results      = cudf::strings::findall(view, *prog_ml);
+  LCW expected_ml(
+    {LCW{"zzé", "zzé"}, LCW{"zzé"}, LCW{"zzé"}, LCW{}, LCW{"zzé"}, LCW{"zzé", "zzé"}});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected_ml);
+}
+
 TEST_F(StringsFindallTests, MediumRegex)
 {
   // This results in 15 regex instructions and falls in the 'medium' range.
diff --git a/cpp/tests/strings/replace_regex_tests.cpp b/cpp/tests/strings/replace_regex_tests.cpp
index 8c0482653fb..9847d8d6bb5 100644
--- a/cpp/tests/strings/replace_regex_tests.cpp
+++ b/cpp/tests/strings/replace_regex_tests.cpp
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#include "special_chars.h"
+
 #include <cudf_test/base_fixture.hpp>
 #include <cudf_test/column_utilities.hpp>
 #include <cudf_test/column_wrapper.hpp>
@@ -245,6 +247,53 @@ TEST_F(StringsReplaceRegexTest, Multiline)
   CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, br_expected);
 }
 
+TEST_F(StringsReplaceRegexTest, SpecialNewLines)
+{
+  auto input   = cudf::test::strings_column_wrapper({"zzé" NEXT_LINE "qqq" NEXT_LINE "zzé",
+                                                     "qqq" NEXT_LINE "zzé" NEXT_LINE "lll",
+                                                     "zzé",
+                                                     "",
+                                                     "zzé" PARAGRAPH_SEPARATOR,
+                                                     "abc\rzzé\r"});
+  auto view    = cudf::strings_column_view(input);
+  auto repl    = cudf::string_scalar("_");
+  auto pattern = std::string("^zzé$");
+  auto prog =
+    cudf::strings::regex_program::create(pattern, cudf::strings::regex_flags::EXT_NEWLINE);
+  auto results  = cudf::strings::replace_re(view, *prog, repl);
+  auto expected = cudf::test::strings_column_wrapper({"zzé" NEXT_LINE "qqq" NEXT_LINE "zzé",
+                                                      "qqq" NEXT_LINE "zzé" NEXT_LINE "lll",
+                                                      "_",
+                                                      "",
+                                                      "_" PARAGRAPH_SEPARATOR,
+                                                      "abc\rzzé\r"});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected);
+
+  auto both_flags = static_cast<cudf::strings::regex_flags>(
+    cudf::strings::regex_flags::EXT_NEWLINE | cudf::strings::regex_flags::MULTILINE);
+  auto prog_ml = cudf::strings::regex_program::create(pattern, both_flags);
+  results      = cudf::strings::replace_re(view, *prog_ml, repl);
+  expected     = cudf::test::strings_column_wrapper({"_" NEXT_LINE "qqq" NEXT_LINE "_",
+                                                     "qqq" NEXT_LINE "_" NEXT_LINE "lll",
+                                                     "_",
+                                                     "",
+                                                     "_" PARAGRAPH_SEPARATOR,
+                                                     "abc\r_\r"});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected);
+
+  auto repl_template = std::string("[\\1]");
+  pattern            = std::string("(^zzé$)");
+  prog               = cudf::strings::regex_program::create(pattern, both_flags);
+  results            = cudf::strings::replace_with_backrefs(view, *prog, repl_template);
+  expected = cudf::test::strings_column_wrapper({"[zzé]" NEXT_LINE "qqq" NEXT_LINE "[zzé]",
+                                                 "qqq" NEXT_LINE "[zzé]" NEXT_LINE "lll",
+                                                 "[zzé]",
+                                                 "",
+                                                 "[zzé]" PARAGRAPH_SEPARATOR,
+                                                 "abc\r[zzé]\r"});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(results->view(), expected);
+}
+
 TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexTest)
 {
   std::vector<char const*> h_strings{"the quick brown fox jumps over the lazy dog",
diff --git a/cpp/tests/strings/special_chars.h b/cpp/tests/strings/special_chars.h
new file mode 100644
index 00000000000..0d630f6bb52
--- /dev/null
+++ b/cpp/tests/strings/special_chars.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+namespace cudf::test {
+
+// special new-line characters for use with regex_flags::EXT_NEWLINE
+#define NEXT_LINE           "\xC2\x85"
+#define LINE_SEPARATOR      "\xE2\x80\xA8"
+#define PARAGRAPH_SEPARATOR "\xE2\x80\xA9"
+
+}  // namespace cudf::test

From a112f684318e24b2321df48004ca58180f169410 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Tue, 17 Sep 2024 11:31:38 -0700
Subject: [PATCH 827/842] Add io_type axis with default `PINNED_BUFFER` to
 nvbench PQ multithreaded reader (#16809)

Closes #16758

This PR adds an `io_type` axis to the benchmarks in `PARQUET_MULTITHREAD_READER_NVBENCH` with `PINNED_BUFFER` as default value. More description at #16758.

Authors:
  - Muhammad Haseeb (https://github.com/mhaseeb123)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - David Wendt (https://github.com/davidwendt)
  - Tianyu Liu (https://github.com/kingcrimsontianyu)

URL: https://github.com/rapidsai/cudf/pull/16809
---
 .../io/parquet/parquet_reader_multithread.cpp | 36 ++++++++++++-------
 1 file changed, 24 insertions(+), 12 deletions(-)

diff --git a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp
index 3abd4280081..7121cb9f034 100644
--- a/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp
+++ b/cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp
@@ -50,7 +50,7 @@ std::string get_label(std::string const& test_name, nvbench::state const& state)
 }
 
 std::tuple<std::vector<cuio_source_sink_pair>, size_t, size_t> write_file_data(
-  nvbench::state& state, std::vector<cudf::type_id> const& d_types)
+  nvbench::state& state, std::vector<cudf::type_id> const& d_types, io_type io_source_type)
 {
   cudf::size_type const cardinality = state.get_int64("cardinality");
   cudf::size_type const run_length  = state.get_int64("run_length");
@@ -63,7 +63,7 @@ std::tuple<std::vector<cuio_source_sink_pair>, size_t, size_t> write_file_data(
   size_t total_file_size = 0;
 
   for (size_t i = 0; i < num_files; ++i) {
-    cuio_source_sink_pair source_sink{io_type::HOST_BUFFER};
+    cuio_source_sink_pair source_sink{io_source_type};
 
     auto const tbl = create_random_table(
       cycle_dtypes(d_types, num_cols),
@@ -92,11 +92,13 @@ void BM_parquet_multithreaded_read_common(nvbench::state& state,
 {
   size_t const data_size = state.get_int64("total_data_size");
   auto const num_threads = state.get_int64("num_threads");
+  auto const source_type = retrieve_io_type_enum(state.get_string("io_type"));
 
   auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads);
   BS::thread_pool threads(num_threads);
 
-  auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types);
+  auto [source_sink_vector, total_file_size, num_files] =
+    write_file_data(state, d_types, source_type);
   std::vector<cudf::io::source_info> source_info_vector;
   std::transform(source_sink_vector.begin(),
                  source_sink_vector.end(),
@@ -173,10 +175,12 @@ void BM_parquet_multithreaded_read_chunked_common(nvbench::state& state,
   auto const num_threads    = state.get_int64("num_threads");
   size_t const input_limit  = state.get_int64("input_limit");
   size_t const output_limit = state.get_int64("output_limit");
+  auto const source_type    = retrieve_io_type_enum(state.get_string("io_type"));
 
   auto streams = cudf::detail::fork_streams(cudf::get_default_stream(), num_threads);
   BS::thread_pool threads(num_threads);
-  auto [source_sink_vector, total_file_size, num_files] = write_file_data(state, d_types);
+  auto [source_sink_vector, total_file_size, num_files] =
+    write_file_data(state, d_types, source_type);
   std::vector<cudf::io::source_info> source_info_vector;
   std::transform(source_sink_vector.begin(),
                  source_sink_vector.end(),
@@ -264,7 +268,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_mixed)
   .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024})
   .add_int64_axis("num_threads", {1, 2, 4, 8})
   .add_int64_axis("num_cols", {4})
-  .add_int64_axis("run_length", {8});
+  .add_int64_axis("run_length", {8})
+  .add_string_axis("io_type", {"PINNED_BUFFER"});
 
 NVBENCH_BENCH(BM_parquet_multithreaded_read_fixed_width)
   .set_name("parquet_multithreaded_read_decode_fixed_width")
@@ -273,7 +278,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_fixed_width)
   .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024})
   .add_int64_axis("num_threads", {1, 2, 4, 8})
   .add_int64_axis("num_cols", {4})
-  .add_int64_axis("run_length", {8});
+  .add_int64_axis("run_length", {8})
+  .add_string_axis("io_type", {"PINNED_BUFFER"});
 
 NVBENCH_BENCH(BM_parquet_multithreaded_read_string)
   .set_name("parquet_multithreaded_read_decode_string")
@@ -282,7 +288,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_string)
   .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024})
   .add_int64_axis("num_threads", {1, 2, 4, 8})
   .add_int64_axis("num_cols", {4})
-  .add_int64_axis("run_length", {8});
+  .add_int64_axis("run_length", {8})
+  .add_string_axis("io_type", {"PINNED_BUFFER"});
 
 NVBENCH_BENCH(BM_parquet_multithreaded_read_list)
   .set_name("parquet_multithreaded_read_decode_list")
@@ -291,7 +298,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_list)
   .add_int64_axis("total_data_size", {512 * 1024 * 1024, 1024 * 1024 * 1024})
   .add_int64_axis("num_threads", {1, 2, 4, 8})
   .add_int64_axis("num_cols", {4})
-  .add_int64_axis("run_length", {8});
+  .add_int64_axis("run_length", {8})
+  .add_string_axis("io_type", {"PINNED_BUFFER"});
 
 // mixed data types: fixed width, strings
 NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_mixed)
@@ -303,7 +311,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_mixed)
   .add_int64_axis("num_cols", {4})
   .add_int64_axis("run_length", {8})
   .add_int64_axis("input_limit", {640 * 1024 * 1024})
-  .add_int64_axis("output_limit", {640 * 1024 * 1024});
+  .add_int64_axis("output_limit", {640 * 1024 * 1024})
+  .add_string_axis("io_type", {"PINNED_BUFFER"});
 
 NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_fixed_width)
   .set_name("parquet_multithreaded_read_decode_chunked_fixed_width")
@@ -314,7 +323,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_fixed_width)
   .add_int64_axis("num_cols", {4})
   .add_int64_axis("run_length", {8})
   .add_int64_axis("input_limit", {640 * 1024 * 1024})
-  .add_int64_axis("output_limit", {640 * 1024 * 1024});
+  .add_int64_axis("output_limit", {640 * 1024 * 1024})
+  .add_string_axis("io_type", {"PINNED_BUFFER"});
 
 NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_string)
   .set_name("parquet_multithreaded_read_decode_chunked_string")
@@ -325,7 +335,8 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_string)
   .add_int64_axis("num_cols", {4})
   .add_int64_axis("run_length", {8})
   .add_int64_axis("input_limit", {640 * 1024 * 1024})
-  .add_int64_axis("output_limit", {640 * 1024 * 1024});
+  .add_int64_axis("output_limit", {640 * 1024 * 1024})
+  .add_string_axis("io_type", {"PINNED_BUFFER"});
 
 NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_list)
   .set_name("parquet_multithreaded_read_decode_chunked_list")
@@ -336,4 +347,5 @@ NVBENCH_BENCH(BM_parquet_multithreaded_read_chunked_list)
   .add_int64_axis("num_cols", {4})
   .add_int64_axis("run_length", {8})
   .add_int64_axis("input_limit", {640 * 1024 * 1024})
-  .add_int64_axis("output_limit", {640 * 1024 * 1024});
+  .add_int64_axis("output_limit", {640 * 1024 * 1024})
+  .add_string_axis("io_type", {"PINNED_BUFFER"});

From 124cd803fd06525164fa7a80394348eea6ebc792 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Wed, 18 Sep 2024 02:34:18 +0000
Subject: [PATCH 828/842] test

---
 ci/cudf_pandas_scripts/pandas-tests/run.sh    |  2 +-
 .../cudf/pandas/scripts/conftest-patch.py     | 31 ++++++++++++++-----
 .../cudf/pandas/scripts/run-pandas-tests.sh   |  5 +--
 3 files changed, 28 insertions(+), 10 deletions(-)

diff --git a/ci/cudf_pandas_scripts/pandas-tests/run.sh b/ci/cudf_pandas_scripts/pandas-tests/run.sh
index e5cd4436a3a..f71ac83e407 100755
--- a/ci/cudf_pandas_scripts/pandas-tests/run.sh
+++ b/ci/cudf_pandas_scripts/pandas-tests/run.sh
@@ -33,7 +33,7 @@ bash python/cudf/cudf/pandas/scripts/run-pandas-tests.sh \
   -m "not slow" \
   --max-worker-restart=3 \
   --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf-pandas.xml" \
-  --dist worksteal \
+  --dist loadfile \
   --report-log=${PANDAS_TESTS_BRANCH}.json 2>&1
 
 SUMMARY_FILE_NAME=${PANDAS_TESTS_BRANCH}-${RAPIDS_FULL_VERSION}-results.json
diff --git a/python/cudf/cudf/pandas/scripts/conftest-patch.py b/python/cudf/cudf/pandas/scripts/conftest-patch.py
index d855b3468a0..4dfdea72178 100644
--- a/python/cudf/cudf/pandas/scripts/conftest-patch.py
+++ b/python/cudf/cudf/pandas/scripts/conftest-patch.py
@@ -7,6 +7,7 @@
 import multiprocessing
 import os
 import sys
+from collections import defaultdict
 from functools import wraps
 
 import pytest
@@ -40,21 +41,35 @@ def patch_testing_functions():
 
 # Dictionary to store function call counts
 manager = multiprocessing.Manager()
-function_call_counts = manager.dict()
+function_call_counts = defaultdict(int)  # type: ignore
 
 # The specific function to track
 FUNCTION_NAME = {"_slow_function_call", "_fast_function_call"}
 
 
+def find_pytest_file(frame):
+    new_f = frame
+    while new_f:
+        if "pandas-testing/pandas-tests/tests" in new_f.f_globals.get(
+            "__file__", ""
+        ):
+            return os.path.abspath(new_f.f_globals.get("__file__", ""))
+        new_f = new_f.f_back
+    return None
+
+
 def trace_calls(frame, event, arg):
     if event != "call":
         return
     code = frame.f_code
     func_name = code.co_name
+
     if func_name in FUNCTION_NAME:
-        function_call_counts[func_name] = (
-            function_call_counts.get(func_name, 0) + 1
-        )
+        # filename = find_pytest_file(frame)
+        # if filename not in function_call_counts:
+        #     function_call_counts[filename] = defaultdict(int)
+        # function_call_counts[filename][func_name] += 1
+        function_call_counts[func_name] += 1
 
 
 def pytest_sessionstart(session):
@@ -75,7 +90,7 @@ def pytest_runtest_setup(item):
     ):
         # If it's a new file, reset the function call counts
         global function_call_counts
-        function_call_counts = manager.dict()
+        function_call_counts = defaultdict(int)
         pytest_runtest_setup.current_file = item.nodeid.split("::")[0]
 
 
@@ -89,6 +104,8 @@ def pytest_runtest_teardown(item, nextitem):
         # Write the function call counts to a file
         worker_id = os.getenv("PYTEST_XDIST_WORKER", "master")
         output_file = f'{item.nodeid.split("::")[0].replace("/", "__")}_{worker_id}_metrics.json'
+        # if os.path.exists(output_file):
+        #     output_file = f'{item.nodeid.split("::")[0].replace("/", "__")}_{worker_id}_metrics_1.json'
         with open(output_file, "w") as f:
             json.dump(dict(function_call_counts), f, indent=4)
         print(f"Function call counts have been written to {output_file}")
@@ -99,7 +116,7 @@ def pytest_configure(config):
     if hasattr(config, "workerinput"):
         # Running in xdist worker
         global function_call_counts
-        function_call_counts = manager.dict()
+        function_call_counts = defaultdict(int)
 
 
 @pytest.hookimpl(trylast=True)
@@ -110,7 +127,7 @@ def pytest_unconfigure(config):
         output_file = f"function_call_counts_worker_{worker_id}.json"
         with open(output_file, "w") as f:
             json.dump(dict(function_call_counts), f, indent=4)
-        print(f"Function call counts have been written to {output_file}")
+        # print(f"Function call counts have been written to {output_file}")
 
 
 sys.path.append(os.path.dirname(__file__))
diff --git a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
index 416264ea04d..781bde87f7d 100755
--- a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
+++ b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
@@ -65,7 +65,7 @@ markers = [
 ]
 EOF
     # append the contents of patch-confest.py to conftest.py
-    cat ../python/cudf/cudf/pandas/scripts/conftest-patch.py >> pandas-tests/conftest.py
+    # cat ../python/cudf/cudf/pandas/scripts/conftest-patch.py >> pandas-tests/conftest.py
 
     # Substitute `pandas.tests` with a relative import.
     # This will depend on the location of the test module relative to
@@ -134,7 +134,8 @@ TEST_THAT_CRASH_PYTEST_WORKERS="not test_bitmasks_pyarrow \
 and not test_large_string_pyarrow \
 and not test_interchange_from_corrected_buffer_dtypes \
 and not test_eof_states \
-and not test_array_tz"
+and not test_array_tz \
+and not test_groupby_raises_category"
 
 # TODO: Remove "not db" once a postgres & mysql container is set up on the CI
 PANDAS_CI="1" timeout 90m python -m pytest -p cudf.pandas \

From 35d6a3af9d968138d6d1a7ab3c88e16799b50f4e Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Tue, 17 Sep 2024 21:52:43 -0500
Subject: [PATCH 829/842] Update
 python/cudf/cudf/pandas/scripts/run-pandas-tests.sh

---
 python/cudf/cudf/pandas/scripts/run-pandas-tests.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
index 781bde87f7d..be83086a7dd 100755
--- a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
+++ b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
@@ -138,7 +138,7 @@ and not test_array_tz \
 and not test_groupby_raises_category"
 
 # TODO: Remove "not db" once a postgres & mysql container is set up on the CI
-PANDAS_CI="1" timeout 90m python -m pytest -p cudf.pandas \
+PANDAS_CI="1" timeout 900m python -m pytest -p cudf.pandas \
     -v -m "not single_cpu and not db" \
     -k "$TEST_THAT_NEED_MOTO_SERVER and $TEST_THAT_CRASH_PYTEST_WORKERS and not test_groupby_raises_category_on_category and not test_constructor_no_pandas_array and not test_is_monotonic_na and not test_index_contains and not test_index_contains and not test_frame_op_subclass_nonclass_constructor and not test_round_trip_current" \
     --import-mode=importlib \

From 57ae3e372e93a16db8aef143759ef58392c4215f Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Wed, 18 Sep 2024 02:10:58 -0500
Subject: [PATCH 830/842] Enable cudf.pandas REPL and -c command support
 (#16428)

This PR enables support for two features:
- `python -m cudf.pandas` gives a REPL experience (previously it raised an error)
- `python -m cudf.pandas -c "<commands>"` runs the provided commands (previously unsupported)

Authors:
  - Bradley Dice (https://github.com/bdice)
  - Matthew Murray (https://github.com/Matt711)

Approvers:
  - Matthew Murray (https://github.com/Matt711)

URL: https://github.com/rapidsai/cudf/pull/16428
---
 docs/cudf/source/cudf_pandas/usage.md      |  20 +++++
 python/cudf/cudf/pandas/__main__.py        |  36 +++++++-
 python/cudf/cudf_pandas_tests/test_main.py | 100 +++++++++++++++++++++
 3 files changed, 154 insertions(+), 2 deletions(-)
 create mode 100644 python/cudf/cudf_pandas_tests/test_main.py

diff --git a/docs/cudf/source/cudf_pandas/usage.md b/docs/cudf/source/cudf_pandas/usage.md
index 0398a8d7086..41838e01dd9 100644
--- a/docs/cudf/source/cudf_pandas/usage.md
+++ b/docs/cudf/source/cudf_pandas/usage.md
@@ -120,3 +120,23 @@ To profile a script being run from the command line, pass the
 ```bash
 python -m cudf.pandas --profile script.py
 ```
+
+### cudf.pandas CLI Features
+
+Several of the ways to provide input to the `python` interpreter also work with `python -m cudf.pandas`, such as the REPL, the `-c` flag, and reading from stdin.
+
+Executing `python -m cudf.pandas` with no script name will enter a REPL (read-eval-print loop) similar to the behavior of the normal `python` interpreter.
+
+The `-c` flag accepts a code string to run, like this:
+
+```bash
+$ python -m cudf.pandas -c "import pandas; print(pandas)"
+<module 'pandas' (ModuleAccelerator(fast=cudf, slow=pandas))>
+```
+
+Users can also provide code to execute from stdin, like this:
+
+```bash
+$ echo "import pandas; print(pandas)" | python -m cudf.pandas
+<module 'pandas' (ModuleAccelerator(fast=cudf, slow=pandas))>
+```
diff --git a/python/cudf/cudf/pandas/__main__.py b/python/cudf/cudf/pandas/__main__.py
index 3a82829eb7a..e0d3d9101a9 100644
--- a/python/cudf/cudf/pandas/__main__.py
+++ b/python/cudf/cudf/pandas/__main__.py
@@ -10,6 +10,7 @@
 """
 
 import argparse
+import code
 import runpy
 import sys
 import tempfile
@@ -21,6 +22,8 @@
 
 @contextmanager
 def profile(function_profile, line_profile, fn):
+    if fn is None and (line_profile or function_profile):
+        raise RuntimeError("Enabling the profiler requires a script name.")
     if line_profile:
         with open(fn) as f:
             lines = f.readlines()
@@ -54,6 +57,11 @@ def main():
         dest="module",
         nargs=1,
     )
+    parser.add_argument(
+        "-c",
+        dest="cmd",
+        nargs=1,
+    )
     parser.add_argument(
         "--profile",
         action="store_true",
@@ -72,9 +80,18 @@ def main():
 
     args = parser.parse_args()
 
+    if args.cmd:
+        f = tempfile.NamedTemporaryFile(mode="w+b", suffix=".py")
+        f.write(args.cmd[0].encode())
+        f.seek(0)
+        args.args.insert(0, f.name)
+
     install()
-    with profile(args.profile, args.line_profile, args.args[0]) as fn:
-        args.args[0] = fn
+
+    script_name = args.args[0] if len(args.args) > 0 else None
+    with profile(args.profile, args.line_profile, script_name) as fn:
+        if script_name is not None:
+            args.args[0] = fn
         if args.module:
             (module,) = args.module
             # run the module passing the remaining arguments
@@ -85,6 +102,21 @@ def main():
             # Remove ourself from argv and continue
             sys.argv[:] = args.args
             runpy.run_path(args.args[0], run_name="__main__")
+        else:
+            if sys.stdin.isatty():
+                banner = f"Python {sys.version} on {sys.platform}"
+                site_import = not sys.flags.no_site
+                if site_import:
+                    cprt = 'Type "help", "copyright", "credits" or "license" for more information.'
+                    banner += "\n" + cprt
+            else:
+                # Don't show prompts or banners if stdin is not a TTY
+                sys.ps1 = ""
+                sys.ps2 = ""
+                banner = ""
+
+            # Launch an interactive interpreter
+            code.interact(banner=banner, exitmsg="")
 
 
 if __name__ == "__main__":
diff --git a/python/cudf/cudf_pandas_tests/test_main.py b/python/cudf/cudf_pandas_tests/test_main.py
new file mode 100644
index 00000000000..326224c8fc0
--- /dev/null
+++ b/python/cudf/cudf_pandas_tests/test_main.py
@@ -0,0 +1,100 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import subprocess
+import tempfile
+import textwrap
+
+
+def _run_python(*, cudf_pandas, command):
+    executable = "python "
+    if cudf_pandas:
+        executable += "-m cudf.pandas "
+    return subprocess.run(
+        executable + command,
+        shell=True,
+        capture_output=True,
+        check=True,
+        text=True,
+    )
+
+
+def test_run_cudf_pandas_with_script():
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".py", delete=True) as f:
+        code = textwrap.dedent(
+            """
+            import pandas as pd
+            df = pd.DataFrame({'a': [1, 2, 3]})
+            print(df['a'].sum())
+            """
+        )
+        f.write(code)
+        f.flush()
+
+        res = _run_python(cudf_pandas=True, command=f.name)
+        expect = _run_python(cudf_pandas=False, command=f.name)
+
+    assert res.stdout != ""
+    assert res.stdout == expect.stdout
+
+
+def test_run_cudf_pandas_with_script_with_cmd_args():
+    input_args_and_code = """-c 'import pandas as pd; df = pd.DataFrame({"a": [1, 2, 3]}); print(df["a"].sum())'"""
+
+    res = _run_python(cudf_pandas=True, command=input_args_and_code)
+    expect = _run_python(cudf_pandas=False, command=input_args_and_code)
+
+    assert res.stdout != ""
+    assert res.stdout == expect.stdout
+
+
+def test_run_cudf_pandas_with_script_with_cmd_args_check_cudf():
+    """Verify that cudf is active with -m cudf.pandas."""
+    input_args_and_code = """-c 'import pandas as pd; print(pd)'"""
+
+    res = _run_python(cudf_pandas=True, command=input_args_and_code)
+    expect = _run_python(cudf_pandas=False, command=input_args_and_code)
+
+    assert "cudf" in res.stdout
+    assert "cudf" not in expect.stdout
+
+
+def test_cudf_pandas_script_repl():
+    def start_repl_process(cmd):
+        return subprocess.Popen(
+            cmd.split(),
+            stdin=subprocess.PIPE,
+            stdout=subprocess.PIPE,
+            text=True,
+        )
+
+    def get_repl_output(process, commands):
+        for command in commands:
+            process.stdin.write(command)
+            process.stdin.flush()
+        return process.communicate()
+
+    p1 = start_repl_process("python -m cudf.pandas")
+    p2 = start_repl_process("python")
+    commands = [
+        "import pandas as pd\n",
+        "print(pd.Series(range(2)).sum())\n",
+        "print(pd.Series(range(5)).sum())\n",
+        "import sys\n",
+        "print(pd.Series(list('abcd')), out=sys.stderr)\n",
+    ]
+
+    res = get_repl_output(p1, commands)
+    expect = get_repl_output(p2, commands)
+
+    # Check stdout
+    assert res[0] != ""
+    assert res[0] == expect[0]
+
+    # Check stderr
+    assert res[1] != ""
+    assert res[1] == expect[1]
+
+    p1.kill()
+    p2.kill()

From 44a9c10105ab06538264e727188a04d623b0811e Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Wed, 18 Sep 2024 01:25:59 -0700
Subject: [PATCH 831/842] Add a benchmark to study Parquet reader's performance
 for wide tables (#16751)

Related to #16750

This PR adds a benchmark to study read throughput of Parquet reader for wide tables.

Authors:
  - Muhammad Haseeb (https://github.com/mhaseeb123)

Approvers:
  - Paul Mattione (https://github.com/pmattione-nvidia)
  - Vukasin Milovanovic (https://github.com/vuule)

URL: https://github.com/rapidsai/cudf/pull/16751
---
 .../io/parquet/parquet_reader_input.cpp       | 87 ++++++++++++++++++-
 1 file changed, 85 insertions(+), 2 deletions(-)

diff --git a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp
index 7563c823454..ce115fd7723 100644
--- a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp
+++ b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp
@@ -32,7 +32,8 @@ constexpr cudf::size_type num_cols = 64;
 void parquet_read_common(cudf::size_type num_rows_to_read,
                          cudf::size_type num_cols_to_read,
                          cuio_source_sink_pair& source_sink,
-                         nvbench::state& state)
+                         nvbench::state& state,
+                         size_t table_data_size = data_size)
 {
   cudf::io::parquet_reader_options read_opts =
     cudf::io::parquet_reader_options::builder(source_sink.make_source_info());
@@ -52,7 +53,7 @@ void parquet_read_common(cudf::size_type num_rows_to_read,
     });
 
   auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
-  state.add_element_count(static_cast<double>(data_size) / time, "bytes_per_second");
+  state.add_element_count(static_cast<double>(table_data_size) / time, "bytes_per_second");
   state.add_buffer_size(
     mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
   state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size");
@@ -231,6 +232,70 @@ void BM_parquet_read_chunks(nvbench::state& state, nvbench::type_list<nvbench::e
   state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size");
 }
 
+template <data_type DataType>
+void BM_parquet_read_wide_tables(nvbench::state& state,
+                                 nvbench::type_list<nvbench::enum_type<DataType>> type_list)
+{
+  auto const d_type = get_type_or_group(static_cast<int32_t>(DataType));
+
+  auto const n_col           = static_cast<cudf::size_type>(state.get_int64("num_cols"));
+  auto const data_size_bytes = static_cast<size_t>(state.get_int64("data_size_mb") << 20);
+  auto const cardinality     = static_cast<cudf::size_type>(state.get_int64("cardinality"));
+  auto const run_length      = static_cast<cudf::size_type>(state.get_int64("run_length"));
+  auto const source_type     = io_type::DEVICE_BUFFER;
+  cuio_source_sink_pair source_sink(source_type);
+
+  auto const num_rows_written = [&]() {
+    auto const tbl = create_random_table(
+      cycle_dtypes(d_type, n_col),
+      table_size_bytes{data_size_bytes},
+      data_profile_builder().cardinality(cardinality).avg_run_length(run_length));
+    auto const view = tbl->view();
+
+    cudf::io::parquet_writer_options write_opts =
+      cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view)
+        .compression(cudf::io::compression_type::NONE);
+    cudf::io::write_parquet(write_opts);
+    return view.num_rows();
+  }();
+
+  parquet_read_common(num_rows_written, n_col, source_sink, state, data_size_bytes);
+}
+
+void BM_parquet_read_wide_tables_mixed(nvbench::state& state)
+{
+  auto const d_type = []() {
+    auto d_type1 = get_type_or_group(static_cast<int32_t>(data_type::INTEGRAL));
+    auto d_type2 = get_type_or_group(static_cast<int32_t>(data_type::FLOAT));
+    d_type1.reserve(d_type1.size() + d_type2.size());
+    std::move(d_type2.begin(), d_type2.end(), std::back_inserter(d_type1));
+    return d_type1;
+  }();
+
+  auto const n_col           = static_cast<cudf::size_type>(state.get_int64("num_cols"));
+  auto const data_size_bytes = static_cast<size_t>(state.get_int64("data_size_mb") << 20);
+  auto const cardinality     = static_cast<cudf::size_type>(state.get_int64("cardinality"));
+  auto const run_length      = static_cast<cudf::size_type>(state.get_int64("run_length"));
+  auto const source_type     = io_type::DEVICE_BUFFER;
+  cuio_source_sink_pair source_sink(source_type);
+
+  auto const num_rows_written = [&]() {
+    auto const tbl = create_random_table(
+      cycle_dtypes(d_type, n_col),
+      table_size_bytes{data_size_bytes},
+      data_profile_builder().cardinality(cardinality).avg_run_length(run_length));
+    auto const view = tbl->view();
+
+    cudf::io::parquet_writer_options write_opts =
+      cudf::io::parquet_writer_options::builder(source_sink.make_sink_info(), view)
+        .compression(cudf::io::compression_type::NONE);
+    cudf::io::write_parquet(write_opts);
+    return view.num_rows();
+  }();
+
+  parquet_read_common(num_rows_written, n_col, source_sink, state, data_size_bytes);
+}
+
 using d_type_list = nvbench::enum_type_list<data_type::INTEGRAL,
                                             data_type::FLOAT,
                                             data_type::DECIMAL,
@@ -272,6 +337,24 @@ NVBENCH_BENCH(BM_parquet_read_io_small_mixed)
   .add_int64_axis("run_length", {1, 32})
   .add_int64_axis("num_string_cols", {1, 2, 3});
 
+using d_type_list_wide_table = nvbench::enum_type_list<data_type::DECIMAL, data_type::STRING>;
+NVBENCH_BENCH_TYPES(BM_parquet_read_wide_tables, NVBENCH_TYPE_AXES(d_type_list_wide_table))
+  .set_name("parquet_read_wide_tables")
+  .set_min_samples(4)
+  .set_type_axes_names({"data_type"})
+  .add_int64_axis("data_size_mb", {1024, 2048, 4096})
+  .add_int64_axis("num_cols", {256, 512, 1024})
+  .add_int64_axis("cardinality", {0, 1000})
+  .add_int64_axis("run_length", {1, 32});
+
+NVBENCH_BENCH(BM_parquet_read_wide_tables_mixed)
+  .set_name("parquet_read_wide_tables_mixed")
+  .set_min_samples(4)
+  .add_int64_axis("data_size_mb", {1024, 2048, 4096})
+  .add_int64_axis("num_cols", {256, 512, 1024})
+  .add_int64_axis("cardinality", {0, 1000})
+  .add_int64_axis("run_length", {1, 32});
+
 // a benchmark for structs that only contain fixed-width types
 using d_type_list_struct_only = nvbench::enum_type_list<data_type::STRUCT>;
 NVBENCH_BENCH_TYPES(BM_parquet_read_fixed_width_struct, NVBENCH_TYPE_AXES(d_type_list_struct_only))

From bc0fc18e3a06ae1c7288cabcbd06eaeb411256bc Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Wed, 18 Sep 2024 12:54:28 +0000
Subject: [PATCH 832/842] test

---
 .../cudf/pandas/scripts/conftest-patch.py     | 102 ++++++++++--------
 1 file changed, 55 insertions(+), 47 deletions(-)

diff --git a/python/cudf/cudf/pandas/scripts/conftest-patch.py b/python/cudf/cudf/pandas/scripts/conftest-patch.py
index 4dfdea72178..52a6fa89bef 100644
--- a/python/cudf/cudf/pandas/scripts/conftest-patch.py
+++ b/python/cudf/cudf/pandas/scripts/conftest-patch.py
@@ -7,6 +7,7 @@
 import multiprocessing
 import os
 import sys
+import traceback
 from collections import defaultdict
 from functools import wraps
 
@@ -48,14 +49,20 @@ def patch_testing_functions():
 
 
 def find_pytest_file(frame):
-    new_f = frame
-    while new_f:
-        if "pandas-testing/pandas-tests/tests" in new_f.f_globals.get(
-            "__file__", ""
-        ):
-            return os.path.abspath(new_f.f_globals.get("__file__", ""))
-        new_f = new_f.f_back
+    stack = traceback.extract_stack()
+    absolute_paths = [frame.filename for frame in stack]
+    for file in absolute_paths:
+        if "pandas-testing/pandas-tests/tests" in file and file.rsplit("/", 1)[
+            -1
+        ].startswith("test_"):
+            return file
     return None
+    # new_f = frame
+    # while new_f:
+    #     if "pandas-testing/pandas-tests/tests" in new_f.f_globals.get("__file__", ""):
+    #         return os.path.abspath(new_f.f_globals.get("__file__", ""))
+    #     new_f = new_f.f_back
+    # return None
 
 
 def trace_calls(frame, event, arg):
@@ -65,11 +72,12 @@ def trace_calls(frame, event, arg):
     func_name = code.co_name
 
     if func_name in FUNCTION_NAME:
-        # filename = find_pytest_file(frame)
-        # if filename not in function_call_counts:
-        #     function_call_counts[filename] = defaultdict(int)
-        # function_call_counts[filename][func_name] += 1
-        function_call_counts[func_name] += 1
+        filename = find_pytest_file(frame)
+        if filename is None:
+            return
+        if filename not in function_call_counts:
+            function_call_counts[filename] = defaultdict(int)
+        function_call_counts[filename][func_name] += 1
 
 
 def pytest_sessionstart(session):
@@ -82,41 +90,41 @@ def pytest_sessionfinish(session, exitstatus):
     sys.setprofile(None)
 
 
-@pytest.hookimpl(tryfirst=True)
-def pytest_runtest_setup(item):
-    # Check if this is the first test in the file
-    if item.nodeid.split("::")[0] != getattr(
-        pytest_runtest_setup, "current_file", None
-    ):
-        # If it's a new file, reset the function call counts
-        global function_call_counts
-        function_call_counts = defaultdict(int)
-        pytest_runtest_setup.current_file = item.nodeid.split("::")[0]
-
-
-@pytest.hookimpl(trylast=True)
-def pytest_runtest_teardown(item, nextitem):
-    # Check if this is the last test in the file
-    if (
-        nextitem is None
-        or nextitem.nodeid.split("::")[0] != item.nodeid.split("::")[0]
-    ):
-        # Write the function call counts to a file
-        worker_id = os.getenv("PYTEST_XDIST_WORKER", "master")
-        output_file = f'{item.nodeid.split("::")[0].replace("/", "__")}_{worker_id}_metrics.json'
-        # if os.path.exists(output_file):
-        #     output_file = f'{item.nodeid.split("::")[0].replace("/", "__")}_{worker_id}_metrics_1.json'
-        with open(output_file, "w") as f:
-            json.dump(dict(function_call_counts), f, indent=4)
-        print(f"Function call counts have been written to {output_file}")
-
-
-@pytest.hookimpl(tryfirst=True)
-def pytest_configure(config):
-    if hasattr(config, "workerinput"):
-        # Running in xdist worker
-        global function_call_counts
-        function_call_counts = defaultdict(int)
+# @pytest.hookimpl(tryfirst=True)
+# def pytest_runtest_setup(item):
+#     # Check if this is the first test in the file
+#     if item.nodeid.split("::")[0] != getattr(
+#         pytest_runtest_setup, "current_file", None
+#     ):
+#         # If it's a new file, reset the function call counts
+#         global function_call_counts
+#         function_call_counts = defaultdict(int)
+#         pytest_runtest_setup.current_file = item.nodeid.split("::")[0]
+
+
+# @pytest.hookimpl(trylast=True)
+# def pytest_runtest_teardown(item, nextitem):
+#     # Check if this is the last test in the file
+#     if (
+#         nextitem is None
+#         or nextitem.nodeid.split("::")[0] != item.nodeid.split("::")[0]
+#     ):
+#         # Write the function call counts to a file
+#         worker_id = os.getenv("PYTEST_XDIST_WORKER", "master")
+#         output_file = f'{item.nodeid.split("::")[0].replace("/", "__")}_{worker_id}_metrics.json'
+#         # if os.path.exists(output_file):
+#         #     output_file = f'{item.nodeid.split("::")[0].replace("/", "__")}_{worker_id}_metrics_1.json'
+#         with open(output_file, "w") as f:
+#             json.dump(dict(function_call_counts), f, indent=4)
+#         print(f"Function call counts have been written to {output_file}")
+
+
+# @pytest.hookimpl(tryfirst=True)
+# def pytest_configure(config):
+#     if hasattr(config, "workerinput"):
+#         # Running in xdist worker
+#         global function_call_counts
+#         function_call_counts = defaultdict(int)
 
 
 @pytest.hookimpl(trylast=True)

From 91b03c6d245576ee612e6e2b2aff92b9d8f79fd2 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Wed, 18 Sep 2024 13:02:09 +0000
Subject: [PATCH 833/842] test

---
 ci/cudf_pandas_scripts/pandas-tests/run.sh    |  2 +-
 .../pandas/scripts/summarize-test-results.py  | 55 +++++++++++++------
 2 files changed, 40 insertions(+), 17 deletions(-)

diff --git a/ci/cudf_pandas_scripts/pandas-tests/run.sh b/ci/cudf_pandas_scripts/pandas-tests/run.sh
index f71ac83e407..e5cd4436a3a 100755
--- a/ci/cudf_pandas_scripts/pandas-tests/run.sh
+++ b/ci/cudf_pandas_scripts/pandas-tests/run.sh
@@ -33,7 +33,7 @@ bash python/cudf/cudf/pandas/scripts/run-pandas-tests.sh \
   -m "not slow" \
   --max-worker-restart=3 \
   --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf-pandas.xml" \
-  --dist loadfile \
+  --dist worksteal \
   --report-log=${PANDAS_TESTS_BRANCH}.json 2>&1
 
 SUMMARY_FILE_NAME=${PANDAS_TESTS_BRANCH}-${RAPIDS_FULL_VERSION}-results.json
diff --git a/python/cudf/cudf/pandas/scripts/summarize-test-results.py b/python/cudf/cudf/pandas/scripts/summarize-test-results.py
index 53159a50909..347118b290d 100644
--- a/python/cudf/cudf/pandas/scripts/summarize-test-results.py
+++ b/python/cudf/cudf/pandas/scripts/summarize-test-results.py
@@ -60,23 +60,46 @@ def get_per_module_results(log_file_name):
                 per_module_results[module_name]["total"] += 1
                 per_module_results[module_name][outcome] += 1
 
+    directory = os.path.dirname(log_file_name)
+    pattern = os.path.join(directory, "function_call_counts_worker_*.json")
+    matching_files = glob.glob(pattern)
+    function_call_counts = {}
+    for file in matching_files:
+        with open(file) as f:
+            function_call_count = json.load(f)
+        if not function_call_counts:
+            function_call_counts.update(function_call_count)
+        else:
+            for key, value in function_call_count.items():
+                function_call_counts[key]["_slow_function_call"] += value.get(
+                    "_slow_function_call", 0
+                )
+                function_call_counts[key]["_fast_function_call"] += value.get(
+                    "_fast_function_call", 0
+                )
+            # per_module_results[key]["_slow_function_call"] = (
+            #     per_module_results[key].get("_slow_function_call", 0)
+            #     + function_call_counts.get("_slow_function_call", 0)
+            # )
+            # per_module_results[key]["_fast_function_call"] = (
+            #     per_module_results[key].get("_fast_function_call", 0)
+            #     + function_call_counts.get("_fast_function_call", 0)
+            # )
     for key, value in per_module_results.items():
-        processed_name = key.replace("/", "__") + "_*_metrics.json"
-        # Assuming the directory is the same as the module name's directory
-        directory = os.path.dirname(log_file_name)
-        pattern = os.path.join(directory, processed_name)
-        matching_files = glob.glob(pattern)
-        for file in matching_files:
-            with open(file) as f:
-                function_call_counts = json.load(f)
-            per_module_results[key]["_slow_function_call"] = (
-                per_module_results[key].get("_slow_function_call", 0)
-                + function_call_counts.get("_slow_function_call", 0)
-            )
-            per_module_results[key]["_fast_function_call"] = (
-                per_module_results[key].get("_fast_function_call", 0)
-                + function_call_counts.get("_fast_function_call", 0)
-            )
+        # processed_name = key.replace("/", "__") + "_*_metrics.json"
+        # # Assuming the directory is the same as the module name's directory
+        # directory = os.path.dirname(log_file_name)
+        # pattern = os.path.join(directory, processed_name)
+        # matching_files = glob.glob(pattern)
+        # for file in matching_files:
+        #     with open(file) as f:
+        #         function_call_counts = json.load(f)
+        per_module_results[key]["_slow_function_call"] = function_call_counts[
+            key
+        ].get("_slow_function_call", 0)
+        per_module_results[key]["_fast_function_call"] = function_call_counts[
+            key
+        ].get("_fast_function_call", 0)
     return per_module_results
 
 
From 2a9a8f5b95ea62824147f1629de1fe52fdbf1254 Mon Sep 17 00:00:00 2001
From: Jake Awe <50372925+AyodeAwe@users.noreply.github.com>
Date: Wed, 18 Sep 2024 09:02:41 -0500
Subject: [PATCH 834/842] use get-pr-info from nv-gha-runners (#16819)

There are two implementations of the same action; one in
[rapidsai/shared-actions](https://github.com/rapidsai/shared-actions/tree/main/get-pr-info)
and [the other](https://github.com/nv-gha-runners/get-pr-info) in the
nv-gha-runners org. This PR switches to the implementation in the
nv-gha-runners group in order to keep a single source of truth.

Tested in
https://github.com/rapidsai/cudf/actions/runs/10906617425/job/30268277178?pr=16819#step:4:5
---
 .github/workflows/pr.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index a4a8f036174..d7d14ea12ff 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -52,7 +52,7 @@ jobs:
     steps:
       - name: Get PR info
         id: get-pr-info
-        uses: rapidsai/shared-actions/get-pr-info@main
+        uses: nv-gha-runners/get-pr-info@main
       - name: Checkout code repo
         uses: actions/checkout@v4
         with:

From 2a3026dec9dca553c2be7d49f2d0e6c09a9f4589 Mon Sep 17 00:00:00 2001
From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com>
Date: Wed, 18 Sep 2024 10:04:31 -0700
Subject: [PATCH 835/842] Change the Parquet writer's
 `default_row_group_size_bytes` from 128MB to inf (#16750)

Closes #16733.

This PR changes the default value of Parquet writer's default max row group size from 128MB to 1Million rows. This allows avoiding thin row group strips when writing wide (> 512 cols) tables resulting in a significantly improved read throughput for wide tables (especially when low cardinality) with virtually no impact to narrow-tables read performance.

Benchmarked using: #16751

## Results

### Hardware
```
GPU: NVIDIA RTX 5880 Ada Generation
SM Version: 890 (PTX Version: 860)
Number of SMs: 110
SM Default Clock Rate: 18446744071874 MHz
Global Memory: 23879 MiB Free / 48632 MiB Total
Global Memory Bus Peak: 960 GB/sec (384-bit DDR @10001MHz)
Max Shared Memory: 100 KiB/SM, 48 KiB/Block
L2 Cache Size: 98304 KiB
Maximum Active Blocks: 24/SM
Maximum Active Threads: 1536/SM, 1024/Block
Available Registers: 65536/SM, 65536/Block
ECC Enabled: No
```

### Read Throughput
```
## parquet_read_wide_tables_mixed

|     T     | num_rows | num_cols |  GPU Time_old  |  GPU Time_new  | bytes_per_second_old | bytes_per_second_new | peak_memory_usage_old | peak_memory_usage_new | encoded_file_size_old | encoded_file_size_new |
|-----------|----------|----------|----------------|----------------|----------------------|----------------------|-----------------------|-----------------------|-----------------------|-----------------------|
|  INTEGRAL |    10000 |       64 |     940.690 us |     928.387 us |         570720378014 |         578283256754 |             3.405 MiB |             3.405 MiB |           748.248 KiB |           748.248 KiB |
|  INTEGRAL |   100000 |       64 |       2.053 ms |       2.037 ms |         261541794543 |         263500220325 |            28.308 MiB |            28.308 MiB |             5.164 MiB |             5.164 MiB |
|  INTEGRAL |   500000 |       64 |       5.783 ms |       5.693 ms |          92838553328 |          94296134644 |           139.928 MiB |           139.042 MiB |            24.698 MiB |            24.325 MiB |
|  INTEGRAL |  1000000 |       64 |      11.400 ms |      10.775 ms |          47092763803 |          49824643807 |           279.254 MiB |           277.470 MiB |            49.042 MiB |            48.284 MiB |
|  INTEGRAL |    10000 |      256 |       1.718 ms |       1.732 ms |         312407306091 |         309935794547 |            13.752 MiB |            13.752 MiB |             2.956 MiB |             2.956 MiB |
|  INTEGRAL |   100000 |      256 |       5.726 ms |       5.818 ms |          93765292338 |          92275580643 |           114.366 MiB |           114.366 MiB |            20.743 MiB |            20.743 MiB |
|  INTEGRAL |   500000 |      256 |      25.179 ms |      22.159 ms |          21322289603 |          24228371776 |           572.905 MiB |           561.786 MiB |           103.796 MiB |            97.677 MiB |
|  INTEGRAL |  1000000 |      256 |      48.259 ms |      42.428 ms |          11124725758 |          12653746472 |             1.117 GiB |             1.095 GiB |           206.155 MiB |           193.886 MiB |
|  INTEGRAL |    10000 |      512 |       2.741 ms |       2.758 ms |         195853280055 |         194632437549 |            27.508 MiB |            27.508 MiB |             5.918 MiB |             5.918 MiB |
|  INTEGRAL |   100000 |      512 |      11.197 ms |      10.600 ms |          47945685016 |          50646524148 |           235.910 MiB |           228.755 MiB |            44.559 MiB |            41.510 MiB |
|  INTEGRAL |   500000 |      512 |      54.929 ms |      43.554 ms |           9773962645 |          12326557981 |             1.146 GiB |             1.097 GiB |           221.266 MiB |           195.384 MiB |
|  INTEGRAL |  1000000 |      512 |     103.779 ms |      82.403 ms |           5173195193 |           6515218035 |             2.288 GiB |             2.190 GiB |           442.101 MiB |           387.861 MiB |
|  INTEGRAL |    10000 |     1024 |       5.210 ms |       5.405 ms |         103040438112 |          99319591295 |            54.937 MiB |            54.937 MiB |            11.829 MiB |            11.829 MiB |
|  INTEGRAL |   100000 |     1024 |      26.891 ms |      20.194 ms |          19964357393 |          26585391032 |           498.410 MiB |           456.756 MiB |            99.962 MiB |            82.939 MiB |
|  INTEGRAL |   500000 |     1024 |     135.404 ms |      84.676 ms |           3964957208 |           6340314329 |             2.434 GiB |             2.191 GiB |           500.554 MiB |           390.418 MiB |
|  INTEGRAL |  1000000 |     1024 |     256.033 ms |     162.217 ms |           2096879057 |           3309593393 |             4.869 GiB |             4.372 GiB |          1001.573 MiB |           775.040 MiB |
|     FLOAT |    10000 |       64 |     962.219 us |     951.565 us |         557950915640 |         564197923891 |             5.275 MiB |             5.275 MiB |          1012.101 KiB |          1012.101 KiB |
|     FLOAT |   100000 |       64 |       2.032 ms |       2.032 ms |         264218700681 |         264250413360 |            45.321 MiB |            45.321 MiB |             6.316 MiB |             6.316 MiB |
|     FLOAT |   500000 |       64 |       6.660 ms |       6.693 ms |          80611279094 |          80219014175 |           224.129 MiB |           222.946 MiB |            29.685 MiB |            29.044 MiB |
|     FLOAT |  1000000 |       64 |      13.560 ms |      13.758 ms |          39591771965 |          39023315442 |           447.103 MiB |           445.007 MiB |            58.762 MiB |            57.482 MiB |
|     FLOAT |    10000 |      256 |       1.808 ms |       1.825 ms |         297020886609 |         294226222306 |            21.109 MiB |            21.109 MiB |             3.968 MiB |             3.968 MiB |
|     FLOAT |   100000 |      256 |       6.921 ms |       6.307 ms |          77571490752 |          85116522574 |           185.578 MiB |           181.271 MiB |            27.393 MiB |            25.256 MiB |
|     FLOAT |   500000 |      256 |      30.064 ms |      25.955 ms |          17857874786 |          20684696586 |           914.366 MiB |           891.787 MiB |           128.981 MiB |           116.186 MiB |
|     FLOAT |  1000000 |      256 |      59.189 ms |      48.592 ms |           9070460126 |          11048464794 |             1.787 GiB |             1.738 GiB |           258.075 MiB |           229.920 MiB |
|     FLOAT |    10000 |      512 |       2.998 ms |       3.006 ms |         179078195058 |         178594968077 |            42.222 MiB |            42.222 MiB |             7.941 MiB |             7.941 MiB |
|     FLOAT |   100000 |      512 |      14.160 ms |      12.314 ms |          37915291403 |          43597041127 |           376.553 MiB |           362.567 MiB |            60.136 MiB |            50.537 MiB |
|     FLOAT |   500000 |      512 |      69.524 ms |      50.251 ms |           7722076774 |          10683715204 |             1.826 GiB |             1.742 GiB |           292.552 MiB |           232.393 MiB |
|     FLOAT |  1000000 |      512 |     130.729 ms |      95.458 ms |           4106742786 |           5624164002 |             3.647 GiB |             3.477 GiB |           581.180 MiB |           459.927 MiB |
|     FLOAT |    10000 |     1024 |       6.351 ms |       6.492 ms |          84532884515 |          82693769317 |            84.452 MiB |            84.452 MiB |            15.893 MiB |            15.893 MiB |
|     FLOAT |   100000 |     1024 |      36.898 ms |      26.302 ms |          14550146722 |          20411596018 |           778.441 MiB |           725.125 MiB |           136.809 MiB |           101.066 MiB |
|     FLOAT |   500000 |     1024 |     166.699 ms |      98.340 ms |           3220600409 |           5459311820 |             3.802 GiB |             3.484 GiB |           685.702 MiB |           464.775 MiB |
|     FLOAT |  1000000 |     1024 |     339.687 ms |     188.463 ms |           1580487011 |           2848673918 |             7.606 GiB |             6.953 GiB |             1.340 GiB |           919.840 MiB |
|   DECIMAL |    10000 |       64 |       1.076 ms |       1.092 ms |         498752693210 |         491676757508 |             7.485 MiB |             7.485 MiB |             1.216 MiB |             1.216 MiB |
|   DECIMAL |   100000 |       64 |       2.166 ms |       2.172 ms |         247840684988 |         247198078197 |            65.498 MiB |            65.498 MiB |             6.658 MiB |             6.658 MiB |
|   DECIMAL |   500000 |       64 |       7.421 ms |       7.058 ms |          72343289850 |          76066836305 |           325.515 MiB |           322.466 MiB |            31.349 MiB |            29.384 MiB |
|   DECIMAL |  1000000 |       64 |      15.239 ms |      14.020 ms |          35230516583 |          38291860266 |           649.547 MiB |           643.714 MiB |            61.759 MiB |            57.826 MiB |
|   DECIMAL |    10000 |      256 |       1.989 ms |       1.989 ms |         269930562597 |         269886680781 |            30.119 MiB |            30.119 MiB |             4.896 MiB |             4.896 MiB |
|   DECIMAL |   100000 |      256 |       7.839 ms |       6.966 ms |          68483613468 |          77073587059 |           269.638 MiB |           263.547 MiB |            30.588 MiB |            26.664 MiB |
|   DECIMAL |   500000 |      256 |      35.199 ms |      26.893 ms |          15252335676 |          19963411264 |             1.312 GiB |             1.267 GiB |           150.948 MiB |           117.601 MiB |
|   DECIMAL |  1000000 |      256 |      72.584 ms |      50.944 ms |           7396511691 |          10538553316 |             2.622 GiB |             2.529 GiB |           301.231 MiB |           231.353 MiB |
|   DECIMAL |    10000 |      512 |       3.612 ms |       3.595 ms |         148642296188 |         149335059500 |            60.283 MiB |            60.283 MiB |             9.801 MiB |             9.801 MiB |
|   DECIMAL |   100000 |      512 |      19.820 ms |      14.084 ms |          27087819156 |          38119174003 |           562.417 MiB |           527.494 MiB |            75.263 MiB |            53.349 MiB |
|   DECIMAL |   500000 |      512 |      94.913 ms |      51.910 ms |           5656452419 |          10342308581 |             2.747 GiB |             2.536 GiB |           377.112 MiB |           235.187 MiB |
|   DECIMAL |  1000000 |      512 |     180.513 ms |      98.562 ms |           2974131976 |           5447057883 |             5.494 GiB |             5.063 GiB |           754.738 MiB |           462.785 MiB |
|   DECIMAL |    10000 |     1024 |       7.667 ms |       6.777 ms |          70025338013 |          79218913933 |           120.656 MiB |           120.656 MiB |            19.616 MiB |            19.616 MiB |
|   DECIMAL |   100000 |     1024 |      61.182 ms |      26.946 ms |           8775038947 |          19923803470 |             1.184 GiB |             1.031 GiB |           201.928 MiB |           106.705 MiB |
|   DECIMAL |   500000 |     1024 |     261.218 ms |     102.314 ms |           2055261558 |           5247292283 |             5.921 GiB |             5.076 GiB |          1012.826 MiB |           470.402 MiB |
|   DECIMAL |  1000000 |     1024 |     513.386 ms |     196.347 ms |           1045744543 |           2734301880 |            11.843 GiB |            10.133 GiB |             1.980 GiB |           925.576 MiB |
| TIMESTAMP |    10000 |       64 |       1.014 ms |       1.016 ms |         529606978079 |         528414399822 |             6.079 MiB |             6.079 MiB |             1.068 MiB |             1.068 MiB |
| TIMESTAMP |   100000 |       64 |       2.057 ms |       2.053 ms |         261019684779 |         261455248599 |            52.688 MiB |            52.688 MiB |             6.436 MiB |             6.436 MiB |
| TIMESTAMP |   500000 |       64 |       6.950 ms |       6.761 ms |          77245644716 |          79410211533 |           260.606 MiB |           259.304 MiB |            29.924 MiB |            29.164 MiB |
| TIMESTAMP |  1000000 |       64 |      14.506 ms |      13.832 ms |          37010291008 |          38813599633 |           521.240 MiB |           517.604 MiB |            59.878 MiB |            57.601 MiB |
| TIMESTAMP |    10000 |      256 |       1.878 ms |       1.889 ms |         285887176743 |         284275145551 |            24.328 MiB |            24.328 MiB |             4.290 MiB |             4.290 MiB |
| TIMESTAMP |   100000 |      256 |       7.198 ms |       6.458 ms |          74586920018 |          83128450019 |           215.854 MiB |           210.739 MiB |            28.681 MiB |            25.734 MiB |
| TIMESTAMP |   500000 |      256 |      34.185 ms |      26.654 ms |          15705060785 |          20142331826 |             1.044 GiB |             1.013 GiB |           137.016 MiB |           116.663 MiB |
| TIMESTAMP |  1000000 |      256 |      66.420 ms |      49.599 ms |           8083007343 |          10824295857 |             2.085 GiB |             2.022 GiB |           272.580 MiB |           230.395 MiB |
| TIMESTAMP |    10000 |      512 |       3.143 ms |       3.150 ms |         170821086658 |         170446277893 |            48.702 MiB |            48.702 MiB |             8.591 MiB |             8.591 MiB |
| TIMESTAMP |   100000 |      512 |      17.652 ms |      12.615 ms |          30413872283 |          42557024194 |           440.115 MiB |           421.891 MiB |            63.197 MiB |            51.502 MiB |
| TIMESTAMP |   500000 |      512 |      75.454 ms |      50.955 ms |           7115233856 |          10536117334 |             2.146 GiB |             2.028 GiB |           315.073 MiB |           233.355 MiB |
| TIMESTAMP |  1000000 |      512 |     140.692 ms |      95.964 ms |           3815935506 |           5594485106 |             4.285 GiB |             4.048 GiB |           627.348 MiB |           460.885 MiB |
| TIMESTAMP |    10000 |     1024 |       6.436 ms |       6.975 ms |          83411903593 |          76971777095 |            97.454 MiB |            97.454 MiB |            17.196 MiB |            17.196 MiB |
| TIMESTAMP |   100000 |     1024 |      45.659 ms |      26.728 ms |          11758159876 |          20086145129 |           936.005 MiB |           844.159 MiB |           159.908 MiB |           103.000 MiB |
| TIMESTAMP |   500000 |     1024 |     199.636 ms |      99.231 ms |           2689242353 |           5410303529 |             4.557 GiB |             4.057 GiB |           794.728 MiB |           466.703 MiB |
| TIMESTAMP |  1000000 |     1024 |     372.691 ms |     192.598 ms |           1440523696 |           2787517681 |             9.104 GiB |             8.099 GiB |             1.551 GiB |           921.760 MiB |
|  DURATION |    10000 |       64 |     986.208 us |     989.153 us |         544379023579 |         542758221495 |             6.417 MiB |             6.417 MiB |           932.501 KiB |           932.501 KiB |
|  DURATION |   100000 |       64 |       2.222 ms |       2.018 ms |         241594183626 |         266034888500 |            57.291 MiB |            57.291 MiB |             6.079 MiB |             6.079 MiB |
|  DURATION |   500000 |       64 |       6.642 ms |       6.673 ms |          80830328889 |          80453377113 |           284.029 MiB |           283.224 MiB |            28.819 MiB |            28.288 MiB |
|  DURATION |  1000000 |       64 |      13.150 ms |      13.488 ms |          40828039129 |          39804805295 |           567.280 MiB |           565.669 MiB |            57.137 MiB |            56.075 MiB |
|  DURATION |    10000 |      256 |       1.805 ms |       1.815 ms |         297459887040 |         295856879191 |            25.686 MiB |            25.686 MiB |             3.665 MiB |             3.665 MiB |
|  DURATION |   100000 |      256 |       6.839 ms |       6.270 ms |          78502421937 |          85630914910 |           232.874 MiB |           229.165 MiB |            25.863 MiB |            24.323 MiB |
|  DURATION |   500000 |      256 |      29.886 ms |      26.234 ms |          17964080662 |          20464503730 |             1.125 GiB |             1.106 GiB |           123.885 MiB |           113.179 MiB |
|  DURATION |  1000000 |      256 |      58.290 ms |      48.418 ms |           9210348188 |          11088351436 |             2.250 GiB |             2.210 GiB |           247.272 MiB |           224.312 MiB |
|  DURATION |    10000 |      512 |       3.035 ms |       2.964 ms |         176885037888 |         181108374773 |            51.383 MiB |            51.383 MiB |             7.342 MiB |             7.342 MiB |
|  DURATION |   100000 |      512 |      14.492 ms |      12.136 ms |          37044853523 |          44237579412 |           474.355 MiB |           458.371 MiB |            55.996 MiB |            48.689 MiB |
|  DURATION |   500000 |      512 |      70.131 ms |      51.095 ms |           7655286246 |          10507294503 |             2.299 GiB |             2.213 GiB |           271.064 MiB |           226.438 MiB |
|  DURATION |  1000000 |      512 |     132.495 ms |      95.019 ms |           4051999205 |           5650150759 |             4.593 GiB |             4.419 GiB |           541.495 MiB |           448.815 MiB |
|  DURATION |    10000 |     1024 |       6.576 ms |       6.318 ms |          81638807422 |          84977253627 |           102.782 MiB |           102.782 MiB |            14.701 MiB |            14.701 MiB |
|  DURATION |   100000 |     1024 |      38.001 ms |      26.011 ms |          14127627316 |          20640219375 |           964.471 MiB |           916.755 MiB |           127.532 MiB |            97.394 MiB |
|  DURATION |   500000 |     1024 |     159.928 ms |      98.126 ms |           3356945213 |           5471258270 |             4.711 GiB |             4.426 GiB |           639.050 MiB |           452.925 MiB |
|  DURATION |  1000000 |     1024 |     305.818 ms |     188.647 ms |           1755524869 |           2845895428 |             9.422 GiB |             8.839 GiB |             1.249 GiB |           897.737 MiB |
|    STRING |    10000 |       64 |       2.241 ms |       2.244 ms |         239611491431 |         239240518530 |            15.926 MiB |            15.926 MiB |             2.075 MiB |             2.075 MiB |
|    STRING |   100000 |       64 |       4.862 ms |       4.822 ms |         110419679907 |         111346705245 |           132.646 MiB |           132.646 MiB |             8.087 MiB |             8.087 MiB |
|    STRING |   500000 |       64 |      20.498 ms |      17.812 ms |          26191957819 |          30140554720 |           664.294 MiB |           645.028 MiB |            40.456 MiB |            30.817 MiB |
|    STRING |  1000000 |       64 |      37.773 ms |      34.985 ms |          14213079575 |          15345709268 |             1.298 GiB |             1.255 GiB |            80.941 MiB |            59.259 MiB |
|    STRING |    10000 |      256 |       4.125 ms |       4.171 ms |         130163506067 |         128706550148 |            63.789 MiB |            63.789 MiB |             8.319 MiB |             8.319 MiB |
|    STRING |   100000 |      256 |      22.074 ms |      17.799 ms |          24321103825 |          30162947098 |           584.754 MiB |           530.912 MiB |            58.602 MiB |            32.330 MiB |
|    STRING |   500000 |      256 |      93.278 ms |      66.770 ms |           5755572906 |           8040584271 |             2.857 GiB |             2.521 GiB |           294.130 MiB |           123.271 MiB |
|    STRING |  1000000 |      256 |     190.999 ms |     122.359 ms |           2810851154 |           4387682165 |             5.715 GiB |             5.023 GiB |           588.586 MiB |           237.018 MiB |
|    STRING |    10000 |      512 |       7.520 ms |       8.010 ms |          71390390607 |          67021971176 |           127.538 MiB |           127.538 MiB |            16.634 MiB |            16.634 MiB |
|    STRING |   100000 |      512 |      51.666 ms |      32.251 ms |          10391219810 |          16646741143 |             1.259 GiB |             1.037 GiB |           173.940 MiB |            64.682 MiB |
|    STRING |   500000 |      512 |     251.723 ms |     125.963 ms |           2132782858 |           4262141577 |             6.300 GiB |             5.040 GiB |           873.437 MiB |           246.559 MiB |
|    STRING |  1000000 |      512 |     477.668 ms |     244.912 ms |           1123940871 |           2192101011 |            12.602 GiB |            10.044 GiB |             1.707 GiB |           474.121 MiB |
|    STRING |    10000 |     1024 |      17.184 ms |      16.128 ms |          31242201518 |          33288874029 |           276.395 MiB |           254.971 MiB |            40.126 MiB |            33.243 MiB |
|    STRING |   100000 |     1024 |     132.094 ms |      63.304 ms |           4064323158 |           8480799642 |             2.721 GiB |             2.073 GiB |           414.092 MiB |           129.316 MiB |
|    STRING |   500000 |     1024 |     608.283 ms |     251.026 ms |            882600977 |           2138709222 |            13.618 GiB |            10.076 GiB |             2.028 GiB |           493.067 MiB |
|    STRING |  1000000 |     1024 |        1.249 s |     485.734 ms |            429750505 |           1105276473 |            27.239 GiB |            20.079 GiB |             4.059 GiB |           948.185 MiB |
```

Authors:
  - Muhammad Haseeb (https://github.com/mhaseeb123)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Vukasin Milovanovic (https://github.com/vuule)
  - Bradley Dice (https://github.com/bdice)
  - Charles Blackmon-Luca (https://github.com/charlesbluca)

URL: https://github.com/rapidsai/cudf/pull/16750
---
 cpp/include/cudf/io/parquet.hpp          |  5 +++--
 cpp/src/io/parquet/writer_impl.cu        | 10 ++++++++--
 python/cudf/cudf/_lib/parquet.pyx        | 16 ++++++++--------
 python/cudf/cudf/core/dataframe.py       |  2 +-
 python/cudf/cudf/io/parquet.py           |  8 ++++----
 python/cudf/cudf/utils/ioutils.py        | 12 ++++--------
 python/dask_cudf/dask_cudf/io/parquet.py |  7 ++-----
 7 files changed, 30 insertions(+), 30 deletions(-)

diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp
index ed7b2ac0850..ee03a382bec 100644
--- a/cpp/include/cudf/io/parquet.hpp
+++ b/cpp/include/cudf/io/parquet.hpp
@@ -39,8 +39,9 @@ namespace io {
  * @file
  */
 
-constexpr size_t default_row_group_size_bytes   = 128 * 1024 * 1024;  ///< 128MB per row group
-constexpr size_type default_row_group_size_rows = 1000000;     ///< 1 million rows per row group
+constexpr size_t default_row_group_size_bytes =
+  std::numeric_limits<size_t>::max();                          ///< Infinite bytes per row group
+constexpr size_type default_row_group_size_rows = 1'000'000;   ///< 1 million rows per row group
 constexpr size_t default_max_page_size_bytes    = 512 * 1024;  ///< 512KB per page
 constexpr size_type default_max_page_size_rows  = 20000;       ///< 20k rows per page
 constexpr int32_t default_column_index_truncate_length = 64;   ///< truncate to 64 bytes
diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu
index 81fd4ab9f82..ec05f35d405 100644
--- a/cpp/src/io/parquet/writer_impl.cu
+++ b/cpp/src/io/parquet/writer_impl.cu
@@ -1819,8 +1819,14 @@ auto convert_table_to_parquet_data(table_input_metadata& table_meta,
     auto const table_size  = std::reduce(column_sizes.begin(), column_sizes.end());
     auto const avg_row_len = util::div_rounding_up_safe<size_t>(table_size, input.num_rows());
     if (avg_row_len > 0) {
-      auto const rg_frag_size = util::div_rounding_up_safe(max_row_group_size, avg_row_len);
-      max_page_fragment_size  = std::min<size_type>(rg_frag_size, max_page_fragment_size);
+      // Ensure `rg_frag_size` is not bigger than size_type::max for default max_row_group_size
+      // value (=uint64::max) to avoid a sign overflow when comparing
+      auto const rg_frag_size =
+        std::min<size_t>(std::numeric_limits<size_type>::max(),
+                         util::div_rounding_up_safe(max_row_group_size, avg_row_len));
+      // Safe comparison as rg_frag_size fits in size_type
+      max_page_fragment_size =
+        std::min<size_type>(static_cast<size_type>(rg_frag_size), max_page_fragment_size);
     }
 
     // dividing page size by average row length will tend to overshoot the desired
diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
index a0155671a26..e6c9d60b05b 100644
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ b/python/cudf/cudf/_lib/parquet.pyx
@@ -438,7 +438,7 @@ def write_parquet(
     object statistics="ROWGROUP",
     object metadata_file_path=None,
     object int96_timestamps=False,
-    object row_group_size_bytes=_ROW_GROUP_SIZE_BYTES_DEFAULT,
+    object row_group_size_bytes=None,
     object row_group_size_rows=None,
     object max_page_size_bytes=None,
     object max_page_size_rows=None,
@@ -616,9 +616,9 @@ cdef class ParquetWriter:
         Name of the compression to use. Use ``None`` for no compression.
     statistics : {'ROWGROUP', 'PAGE', 'COLUMN', 'NONE'}, default 'ROWGROUP'
         Level at which column statistics should be included in file.
-    row_group_size_bytes: int, default 134217728
+    row_group_size_bytes: int, default ``uint64 max``
         Maximum size of each stripe of the output.
-        By default, 134217728 (128MB) will be used.
+        By default, a virtually infinite size equal to ``uint64 max`` will be used.
     row_group_size_rows: int, default 1000000
         Maximum number of rows of each stripe of the output.
         By default, 1000000 (10^6 rows) will be used.
@@ -661,11 +661,11 @@ cdef class ParquetWriter:
 
     def __cinit__(self, object filepath_or_buffer, object index=None,
                   object compression="snappy", str statistics="ROWGROUP",
-                  int row_group_size_bytes=_ROW_GROUP_SIZE_BYTES_DEFAULT,
-                  int row_group_size_rows=1000000,
-                  int max_page_size_bytes=524288,
-                  int max_page_size_rows=20000,
-                  int max_dictionary_size=1048576,
+                  size_t row_group_size_bytes=_ROW_GROUP_SIZE_BYTES_DEFAULT,
+                  size_type row_group_size_rows=1000000,
+                  size_t max_page_size_bytes=524288,
+                  size_type max_page_size_rows=20000,
+                  size_t max_dictionary_size=1048576,
                   bool use_dictionary=True,
                   bool store_schema=False):
         filepaths_or_buffers = (
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 58a16a6d504..d73ad8225ca 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -6840,7 +6840,7 @@ def to_parquet(
         statistics="ROWGROUP",
         metadata_file_path=None,
         int96_timestamps=False,
-        row_group_size_bytes=ioutils._ROW_GROUP_SIZE_BYTES_DEFAULT,
+        row_group_size_bytes=None,
         row_group_size_rows=None,
         max_page_size_bytes=None,
         max_page_size_rows=None,
diff --git a/python/cudf/cudf/io/parquet.py b/python/cudf/cudf/io/parquet.py
index 62be7378e9e..ce99f98b559 100644
--- a/python/cudf/cudf/io/parquet.py
+++ b/python/cudf/cudf/io/parquet.py
@@ -64,7 +64,7 @@ def _write_parquet(
     statistics="ROWGROUP",
     metadata_file_path=None,
     int96_timestamps=False,
-    row_group_size_bytes=ioutils._ROW_GROUP_SIZE_BYTES_DEFAULT,
+    row_group_size_bytes=None,
     row_group_size_rows=None,
     max_page_size_bytes=None,
     max_page_size_rows=None,
@@ -149,7 +149,7 @@ def write_to_dataset(
     return_metadata=False,
     statistics="ROWGROUP",
     int96_timestamps=False,
-    row_group_size_bytes=ioutils._ROW_GROUP_SIZE_BYTES_DEFAULT,
+    row_group_size_bytes=None,
     row_group_size_rows=None,
     max_page_size_bytes=None,
     max_page_size_rows=None,
@@ -205,7 +205,7 @@ def write_to_dataset(
         If ``False``, timestamps will not be altered.
     row_group_size_bytes: integer or None, default None
         Maximum size of each stripe of the output.
-        If None, 134217728 (128MB) will be used.
+        If None, no limit on row group stripe size will be used.
     row_group_size_rows: integer or None, default None
         Maximum number of rows of each stripe of the output.
         If None, 1000000 will be used.
@@ -980,7 +980,7 @@ def to_parquet(
     statistics="ROWGROUP",
     metadata_file_path=None,
     int96_timestamps=False,
-    row_group_size_bytes=ioutils._ROW_GROUP_SIZE_BYTES_DEFAULT,
+    row_group_size_bytes=None,
     row_group_size_rows=None,
     max_page_size_bytes=None,
     max_page_size_rows=None,
diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py
index 1627107b57d..1180da321e6 100644
--- a/python/cudf/cudf/utils/ioutils.py
+++ b/python/cudf/cudf/utils/ioutils.py
@@ -27,7 +27,7 @@
     fsspec_parquet = None
 
 _BYTES_PER_THREAD_DEFAULT = 256 * 1024 * 1024
-_ROW_GROUP_SIZE_BYTES_DEFAULT = 128 * 1024 * 1024
+_ROW_GROUP_SIZE_BYTES_DEFAULT = np.iinfo(np.uint64).max
 
 _docstring_remote_sources = """
 - cuDF supports local and remote data stores. See configuration details for
@@ -275,10 +275,9 @@
     timestamp[us] to the int96 format, which is the number of Julian
     days and the number of nanoseconds since midnight of 1970-01-01.
     If ``False``, timestamps will not be altered.
-row_group_size_bytes: integer, default {row_group_size_bytes_val}
+row_group_size_bytes: integer, default None
     Maximum size of each stripe of the output.
-    If None, {row_group_size_bytes_val}
-    ({row_group_size_bytes_val_in_mb} MB) will be used.
+    If None, no limit on row group stripe size will be used.
 row_group_size_rows: integer or None, default None
     Maximum number of rows of each stripe of the output.
     If None, 1000000 will be used.
@@ -346,10 +345,7 @@
 See Also
 --------
 cudf.read_parquet
-""".format(
-    row_group_size_bytes_val=_ROW_GROUP_SIZE_BYTES_DEFAULT,
-    row_group_size_bytes_val_in_mb=_ROW_GROUP_SIZE_BYTES_DEFAULT / 1024 / 1024,
-)
+"""
 doc_to_parquet = docfmt_partial(docstring=_docstring_to_parquet)
 
 _docstring_merge_parquet_filemetadata = """
diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py
index e793d4381d1..a781b8242fe 100644
--- a/python/dask_cudf/dask_cudf/io/parquet.py
+++ b/python/dask_cudf/dask_cudf/io/parquet.py
@@ -23,7 +23,6 @@
 from cudf.io import write_to_dataset
 from cudf.io.parquet import _apply_post_filters, _normalize_filters
 from cudf.utils.dtypes import cudf_dtype_from_pa_type
-from cudf.utils.ioutils import _ROW_GROUP_SIZE_BYTES_DEFAULT
 
 
 class CudfEngine(ArrowDatasetEngine):
@@ -341,9 +340,7 @@ def write_partition(
                 return_metadata=return_metadata,
                 statistics=kwargs.get("statistics", "ROWGROUP"),
                 int96_timestamps=kwargs.get("int96_timestamps", False),
-                row_group_size_bytes=kwargs.get(
-                    "row_group_size_bytes", _ROW_GROUP_SIZE_BYTES_DEFAULT
-                ),
+                row_group_size_bytes=kwargs.get("row_group_size_bytes", None),
                 row_group_size_rows=kwargs.get("row_group_size_rows", None),
                 max_page_size_bytes=kwargs.get("max_page_size_bytes", None),
                 max_page_size_rows=kwargs.get("max_page_size_rows", None),
@@ -365,7 +362,7 @@ def write_partition(
                     statistics=kwargs.get("statistics", "ROWGROUP"),
                     int96_timestamps=kwargs.get("int96_timestamps", False),
                     row_group_size_bytes=kwargs.get(
-                        "row_group_size_bytes", _ROW_GROUP_SIZE_BYTES_DEFAULT
+                        "row_group_size_bytes", None
                     ),
                     row_group_size_rows=kwargs.get(
                         "row_group_size_rows", None

From e68f55c98f257bdeedeb31e68c9737264bd0b393 Mon Sep 17 00:00:00 2001
From: Srinivas Yadav <43375352+srinivasyadav18@users.noreply.github.com>
Date: Wed, 18 Sep 2024 12:12:23 -0500
Subject: [PATCH 836/842] Refactor mixed_semi_join using cuco::static_set
 (#16230)

This PR refactors `mixed_semi_join` by replacing **cuco** legacy `static_map` with latest `static_set`.
Contributes to #12261.

Authors:
  - Srinivas Yadav (https://github.com/srinivasyadav18)
  - Muhammad Haseeb (https://github.com/mhaseeb123)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/16230
---
 cpp/src/join/join_common_utils.hpp       |  6 --
 cpp/src/join/mixed_join_common_utils.cuh | 33 +++++++++
 cpp/src/join/mixed_join_kernels_semi.cu  | 35 ++++-----
 cpp/src/join/mixed_join_kernels_semi.cuh |  6 +-
 cpp/src/join/mixed_join_semi.cu          | 90 +++++++-----------------
 cpp/tests/join/mixed_join_tests.cu       | 30 ++++++++
 6 files changed, 109 insertions(+), 91 deletions(-)

diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp
index 86402a0e7de..573101cefd9 100644
--- a/cpp/src/join/join_common_utils.hpp
+++ b/cpp/src/join/join_common_utils.hpp
@@ -22,7 +22,6 @@
 #include <cudf/table/row_operators.cuh>
 #include <cudf/table/table_view.hpp>
 
-#include <cuco/static_map.cuh>
 #include <cuco/static_multimap.cuh>
 #include <cuda/atomic>
 
@@ -51,11 +50,6 @@ using mixed_multimap_type =
                         cudf::detail::cuco_allocator<char>,
                         cuco::legacy::double_hashing<1, hash_type, hash_type>>;
 
-using semi_map_type = cuco::legacy::static_map<hash_value_type,
-                                               size_type,
-                                               cuda::thread_scope_device,
-                                               cudf::detail::cuco_allocator<char>>;
-
 using row_hash_legacy =
   cudf::row_hasher<cudf::hashing::detail::default_hash, cudf::nullate::DYNAMIC>;
 
diff --git a/cpp/src/join/mixed_join_common_utils.cuh b/cpp/src/join/mixed_join_common_utils.cuh
index 19701816867..89c13285cfe 100644
--- a/cpp/src/join/mixed_join_common_utils.cuh
+++ b/cpp/src/join/mixed_join_common_utils.cuh
@@ -25,6 +25,7 @@
 #include <rmm/device_uvector.hpp>
 
 #include <cub/cub.cuh>
+#include <cuco/static_set.cuh>
 
 namespace cudf {
 namespace detail {
@@ -160,6 +161,38 @@ struct pair_expression_equality : public expression_equality<has_nulls> {
   }
 };
 
+/**
+ * @brief Equality comparator that composes two row_equality comparators.
+ */
+struct double_row_equality_comparator {
+  row_equality const equality_comparator;
+  row_equality const conditional_comparator;
+
+  __device__ bool operator()(size_type lhs_row_index, size_type rhs_row_index) const noexcept
+  {
+    using experimental::row::lhs_index_type;
+    using experimental::row::rhs_index_type;
+
+    return equality_comparator(lhs_index_type{lhs_row_index}, rhs_index_type{rhs_row_index}) &&
+           conditional_comparator(lhs_index_type{lhs_row_index}, rhs_index_type{rhs_row_index});
+  }
+};
+
+// A CUDA Cooperative Group of 4 threads for the hash set.
+auto constexpr DEFAULT_MIXED_JOIN_CG_SIZE = 4;
+
+// The hash set type used by mixed_semi_join with the build_table.
+using hash_set_type = cuco::static_set<size_type,
+                                       cuco::extent<size_t>,
+                                       cuda::thread_scope_device,
+                                       double_row_equality_comparator,
+                                       cuco::linear_probing<DEFAULT_MIXED_JOIN_CG_SIZE, row_hash>,
+                                       cudf::detail::cuco_allocator<char>,
+                                       cuco::storage<1>>;
+
+// The hash_set_ref_type used by mixed_semi_join kerenels for probing.
+using hash_set_ref_type = hash_set_type::ref_type<cuco::contains_tag>;
+
 }  // namespace detail
 
 }  // namespace cudf
diff --git a/cpp/src/join/mixed_join_kernels_semi.cu b/cpp/src/join/mixed_join_kernels_semi.cu
index 7459ac3e99c..f2c5ff13638 100644
--- a/cpp/src/join/mixed_join_kernels_semi.cu
+++ b/cpp/src/join/mixed_join_kernels_semi.cu
@@ -38,12 +38,16 @@ CUDF_KERNEL void __launch_bounds__(block_size)
                   table_device_view right_table,
                   table_device_view probe,
                   table_device_view build,
-                  row_hash const hash_probe,
                   row_equality const equality_probe,
-                  cudf::detail::semi_map_type::device_view hash_table_view,
+                  hash_set_ref_type set_ref,
                   cudf::device_span<bool> left_table_keep_mask,
                   cudf::ast::detail::expression_device_view device_expression_data)
 {
+  auto constexpr cg_size = hash_set_ref_type::cg_size;
+
+  auto const tile =
+    cooperative_groups::tiled_partition<cg_size>(cooperative_groups::this_thread_block());
+
   // Normally the casting of a shared memory array is used to create multiple
   // arrays of different types from the shared memory buffer, but here it is
   // used to circumvent conflicts between arrays of different types between
@@ -52,24 +56,24 @@ CUDF_KERNEL void __launch_bounds__(block_size)
   cudf::ast::detail::IntermediateDataType<has_nulls>* intermediate_storage =
     reinterpret_cast<cudf::ast::detail::IntermediateDataType<has_nulls>*>(raw_intermediate_storage);
   auto thread_intermediate_storage =
-    &intermediate_storage[threadIdx.x * device_expression_data.num_intermediates];
-
-  cudf::size_type const left_num_rows  = left_table.num_rows();
-  cudf::size_type const right_num_rows = right_table.num_rows();
-  auto const outer_num_rows            = left_num_rows;
+    &intermediate_storage[tile.meta_group_rank() * device_expression_data.num_intermediates];
 
-  cudf::size_type outer_row_index = threadIdx.x + blockIdx.x * block_size;
+  cudf::size_type const outer_num_rows = left_table.num_rows();
+  auto const outer_row_index = cudf::detail::grid_1d::global_thread_id<block_size>() / cg_size;
 
   auto evaluator = cudf::ast::detail::expression_evaluator<has_nulls>(
     left_table, right_table, device_expression_data);
 
   if (outer_row_index < outer_num_rows) {
+    // Make sure to swap_tables here as hash_set will use probe table as the left one.
+    auto constexpr swap_tables = true;
     // Figure out the number of elements for this key.
     auto equality = single_expression_equality<has_nulls>{
-      evaluator, thread_intermediate_storage, false, equality_probe};
+      evaluator, thread_intermediate_storage, swap_tables, equality_probe};
 
-    left_table_keep_mask[outer_row_index] =
-      hash_table_view.contains(outer_row_index, hash_probe, equality);
+    auto const set_ref_equality = set_ref.with_key_eq(equality);
+    auto const result           = set_ref_equality.contains(tile, outer_row_index);
+    if (tile.thread_rank() == 0) left_table_keep_mask[outer_row_index] = result;
   }
 }
 
@@ -78,9 +82,8 @@ void launch_mixed_join_semi(bool has_nulls,
                             table_device_view right_table,
                             table_device_view probe,
                             table_device_view build,
-                            row_hash const hash_probe,
                             row_equality const equality_probe,
-                            cudf::detail::semi_map_type::device_view hash_table_view,
+                            hash_set_ref_type set_ref,
                             cudf::device_span<bool> left_table_keep_mask,
                             cudf::ast::detail::expression_device_view device_expression_data,
                             detail::grid_1d const config,
@@ -94,9 +97,8 @@ void launch_mixed_join_semi(bool has_nulls,
         right_table,
         probe,
         build,
-        hash_probe,
         equality_probe,
-        hash_table_view,
+        set_ref,
         left_table_keep_mask,
         device_expression_data);
   } else {
@@ -106,9 +108,8 @@ void launch_mixed_join_semi(bool has_nulls,
         right_table,
         probe,
         build,
-        hash_probe,
         equality_probe,
-        hash_table_view,
+        set_ref,
         left_table_keep_mask,
         device_expression_data);
   }
diff --git a/cpp/src/join/mixed_join_kernels_semi.cuh b/cpp/src/join/mixed_join_kernels_semi.cuh
index 43714ffb36a..b08298e64e4 100644
--- a/cpp/src/join/mixed_join_kernels_semi.cuh
+++ b/cpp/src/join/mixed_join_kernels_semi.cuh
@@ -45,9 +45,8 @@ namespace detail {
  * @param[in] right_table The right table
  * @param[in] probe The table with which to probe the hash table for matches.
  * @param[in] build The table with which the hash table was built.
- * @param[in] hash_probe The hasher used for the probe table.
  * @param[in] equality_probe The equality comparator used when probing the hash table.
- * @param[in] hash_table_view The hash table built from `build`.
+ * @param[in] set_ref The hash table device view built from `build`.
  * @param[out] left_table_keep_mask The result of the join operation with "true" element indicating
  * the corresponding index from left table is present in output
  * @param[in] device_expression_data Container of device data required to evaluate the desired
@@ -58,9 +57,8 @@ void launch_mixed_join_semi(bool has_nulls,
                             table_device_view right_table,
                             table_device_view probe,
                             table_device_view build,
-                            row_hash const hash_probe,
                             row_equality const equality_probe,
-                            cudf::detail::semi_map_type::device_view hash_table_view,
+                            hash_set_ref_type set_ref,
                             cudf::device_span<bool> left_table_keep_mask,
                             cudf::ast::detail::expression_device_view device_expression_data,
                             detail::grid_1d const config,
diff --git a/cpp/src/join/mixed_join_semi.cu b/cpp/src/join/mixed_join_semi.cu
index cfb785e242c..719b1d47105 100644
--- a/cpp/src/join/mixed_join_semi.cu
+++ b/cpp/src/join/mixed_join_semi.cu
@@ -46,45 +46,6 @@
 namespace cudf {
 namespace detail {
 
-namespace {
-/**
- * @brief Device functor to create a pair of hash value and index for a given row.
- */
-struct make_pair_function_semi {
-  __device__ __forceinline__ cudf::detail::pair_type operator()(size_type i) const noexcept
-  {
-    // The value is irrelevant since we only ever use the hash map to check for
-    // membership of a particular row index.
-    return cuco::make_pair(static_cast<hash_value_type>(i), 0);
-  }
-};
-
-/**
- * @brief Equality comparator that composes two row_equality comparators.
- */
-class double_row_equality {
- public:
-  double_row_equality(row_equality equality_comparator, row_equality conditional_comparator)
-    : _equality_comparator{equality_comparator}, _conditional_comparator{conditional_comparator}
-  {
-  }
-
-  __device__ bool operator()(size_type lhs_row_index, size_type rhs_row_index) const noexcept
-  {
-    using experimental::row::lhs_index_type;
-    using experimental::row::rhs_index_type;
-
-    return _equality_comparator(lhs_index_type{lhs_row_index}, rhs_index_type{rhs_row_index}) &&
-           _conditional_comparator(lhs_index_type{lhs_row_index}, rhs_index_type{rhs_row_index});
-  }
-
- private:
-  row_equality _equality_comparator;
-  row_equality _conditional_comparator;
-};
-
-}  // namespace
-
 std::unique_ptr<rmm::device_uvector<size_type>> mixed_join_semi(
   table_view const& left_equality,
   table_view const& right_equality,
@@ -96,7 +57,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_join_semi(
   rmm::cuda_stream_view stream,
   rmm::device_async_resource_ref mr)
 {
-  CUDF_EXPECTS((join_type != join_kind::INNER_JOIN) && (join_type != join_kind::LEFT_JOIN) &&
+  CUDF_EXPECTS((join_type != join_kind::INNER_JOIN) and (join_type != join_kind::LEFT_JOIN) and
                  (join_type != join_kind::FULL_JOIN),
                "Inner, left, and full joins should use mixed_join.");
 
@@ -137,7 +98,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_join_semi(
   // output column and follow the null-supporting expression evaluation code
   // path.
   auto const has_nulls = cudf::nullate::DYNAMIC{
-    cudf::has_nulls(left_equality) || cudf::has_nulls(right_equality) ||
+    cudf::has_nulls(left_equality) or cudf::has_nulls(right_equality) or
     binary_predicate.may_evaluate_null(left_conditional, right_conditional, stream)};
 
   auto const parser = ast::detail::expression_parser{
@@ -156,27 +117,20 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_join_semi(
   auto right_conditional_view = table_device_view::create(right_conditional, stream);
 
   auto const preprocessed_build =
-    experimental::row::equality::preprocessed_table::create(build, stream);
+    cudf::experimental::row::equality::preprocessed_table::create(build, stream);
   auto const preprocessed_probe =
-    experimental::row::equality::preprocessed_table::create(probe, stream);
+    cudf::experimental::row::equality::preprocessed_table::create(probe, stream);
   auto const row_comparator =
-    cudf::experimental::row::equality::two_table_comparator{preprocessed_probe, preprocessed_build};
+    cudf::experimental::row::equality::two_table_comparator{preprocessed_build, preprocessed_probe};
   auto const equality_probe = row_comparator.equal_to<false>(has_nulls, compare_nulls);
 
-  semi_map_type hash_table{
-    compute_hash_table_size(build.num_rows()),
-    cuco::empty_key{std::numeric_limits<hash_value_type>::max()},
-    cuco::empty_value{cudf::detail::JoinNoneValue},
-    cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
-    stream.value()};
-
   // Create hash table containing all keys found in right table
   // TODO: To add support for nested columns we will need to flatten in many
   // places. However, this probably isn't worth adding any time soon since we
   // won't be able to support AST conditions for those types anyway.
   auto const build_nulls    = cudf::nullate::DYNAMIC{cudf::has_nulls(build)};
   auto const row_hash_build = cudf::experimental::row::hash::row_hasher{preprocessed_build};
-  auto const hash_build     = row_hash_build.device_hasher(build_nulls);
+
   // Since we may see multiple rows that are identical in the equality tables
   // but differ in the conditional tables, the equality comparator used for
   // insertion must account for both sets of tables. An alternative solution
@@ -191,20 +145,28 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_join_semi(
   auto const equality_build_equality =
     row_comparator_build.equal_to<false>(build_nulls, compare_nulls);
   auto const preprocessed_build_condtional =
-    experimental::row::equality::preprocessed_table::create(right_conditional, stream);
+    cudf::experimental::row::equality::preprocessed_table::create(right_conditional, stream);
   auto const row_comparator_conditional_build =
     cudf::experimental::row::equality::two_table_comparator{preprocessed_build_condtional,
                                                             preprocessed_build_condtional};
   auto const equality_build_conditional =
     row_comparator_conditional_build.equal_to<false>(build_nulls, compare_nulls);
-  double_row_equality equality_build{equality_build_equality, equality_build_conditional};
-  make_pair_function_semi pair_func_build{};
 
-  auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func_build);
+  hash_set_type row_set{
+    {compute_hash_table_size(build.num_rows())},
+    cuco::empty_key{JoinNoneValue},
+    {equality_build_equality, equality_build_conditional},
+    {row_hash_build.device_hasher(build_nulls)},
+    {},
+    {},
+    cudf::detail::cuco_allocator<char>{rmm::mr::polymorphic_allocator<char>{}, stream},
+    {stream.value()}};
+
+  auto iter = thrust::make_counting_iterator(0);
 
   // skip rows that are null here.
   if ((compare_nulls == null_equality::EQUAL) or (not nullable(build))) {
-    hash_table.insert(iter, iter + right_num_rows, hash_build, equality_build, stream.value());
+    row_set.insert(iter, iter + right_num_rows, stream.value());
   } else {
     thrust::counting_iterator<cudf::size_type> stencil(0);
     auto const [row_bitmask, _] =
@@ -212,18 +174,19 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_join_semi(
     row_is_valid pred{static_cast<bitmask_type const*>(row_bitmask.data())};
 
     // insert valid rows
-    hash_table.insert_if(
-      iter, iter + right_num_rows, stencil, pred, hash_build, equality_build, stream.value());
+    row_set.insert_if(iter, iter + right_num_rows, stencil, pred, stream.value());
   }
 
-  auto hash_table_view = hash_table.get_device_view();
-
   detail::grid_1d const config(outer_num_rows, DEFAULT_JOIN_BLOCK_SIZE);
-  auto const shmem_size_per_block = parser.shmem_per_thread * config.num_threads_per_block;
+  auto const shmem_size_per_block =
+    parser.shmem_per_thread *
+    cuco::detail::int_div_ceil(config.num_threads_per_block, hash_set_type::cg_size);
 
   auto const row_hash   = cudf::experimental::row::hash::row_hasher{preprocessed_probe};
   auto const hash_probe = row_hash.device_hasher(has_nulls);
 
+  hash_set_ref_type const row_set_ref = row_set.ref(cuco::contains).with_hash_function(hash_probe);
+
   // Vector used to indicate indices from left/probe table which are present in output
   auto left_table_keep_mask = rmm::device_uvector<bool>(probe.num_rows(), stream);
 
@@ -232,9 +195,8 @@ std::unique_ptr<rmm::device_uvector<size_type>> mixed_join_semi(
                          *right_conditional_view,
                          *probe_view,
                          *build_view,
-                         hash_probe,
                          equality_probe,
-                         hash_table_view,
+                         row_set_ref,
                          cudf::device_span<bool>(left_table_keep_mask),
                          parser.device_expression_data,
                          config,
diff --git a/cpp/tests/join/mixed_join_tests.cu b/cpp/tests/join/mixed_join_tests.cu
index 6c147c8a128..08a0136700d 100644
--- a/cpp/tests/join/mixed_join_tests.cu
+++ b/cpp/tests/join/mixed_join_tests.cu
@@ -778,6 +778,21 @@ TYPED_TEST(MixedLeftSemiJoinTest, BasicEquality)
              {1});
 }
 
+TYPED_TEST(MixedLeftSemiJoinTest, MixedLeftSemiJoinGatherMap)
+{
+  auto const col_ref_left_1  = cudf::ast::column_reference(0, cudf::ast::table_reference::LEFT);
+  auto const col_ref_right_1 = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT);
+  auto left_one_greater_right_one =
+    cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_left_1, col_ref_right_1);
+
+  this->test({{2, 3, 9, 0, 1, 7, 4, 6, 5, 8}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 0}},
+             {{6, 5, 9, 8, 10, 32}, {0, 1, 2, 3, 4, 5}, {7, 8, 9, 0, 1, 2}},
+             {0},
+             {1},
+             left_one_greater_right_one,
+             {2, 7, 8});
+}
+
 TYPED_TEST(MixedLeftSemiJoinTest, BasicEqualityDuplicates)
 {
   this->test({{0, 1, 2, 1}, {3, 4, 5, 6}, {10, 20, 30, 40}},
@@ -900,3 +915,18 @@ TYPED_TEST(MixedLeftAntiJoinTest, AsymmetricLeftLargerEquality)
              left_zero_eq_right_zero,
              {0, 1, 3});
 }
+
+TYPED_TEST(MixedLeftAntiJoinTest, MixedLeftAntiJoinGatherMap)
+{
+  auto const col_ref_left_1  = cudf::ast::column_reference(0, cudf::ast::table_reference::LEFT);
+  auto const col_ref_right_1 = cudf::ast::column_reference(0, cudf::ast::table_reference::RIGHT);
+  auto left_one_greater_right_one =
+    cudf::ast::operation(cudf::ast::ast_operator::GREATER, col_ref_left_1, col_ref_right_1);
+
+  this->test({{2, 3, 9, 0, 1, 7, 4, 6, 5, 8}, {1, 2, 3, 4, 5, 6, 7, 8, 9, 0}},
+             {{6, 5, 9, 8, 10, 32}, {0, 1, 2, 3, 4, 5}, {7, 8, 9, 0, 1, 2}},
+             {0},
+             {1},
+             left_one_greater_right_one,
+             {0, 1, 3, 4, 5, 6, 9});
+}

From 63358cc3d3d263d1f33b574ae6333706b77b6e3e Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Wed, 18 Sep 2024 14:40:05 -0500
Subject: [PATCH 837/842] Update summarize-test-results.py

---
 python/cudf/cudf/pandas/scripts/summarize-test-results.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/python/cudf/cudf/pandas/scripts/summarize-test-results.py b/python/cudf/cudf/pandas/scripts/summarize-test-results.py
index 347118b290d..af11fd196a7 100644
--- a/python/cudf/cudf/pandas/scripts/summarize-test-results.py
+++ b/python/cudf/cudf/pandas/scripts/summarize-test-results.py
@@ -71,6 +71,10 @@ def get_per_module_results(log_file_name):
             function_call_counts.update(function_call_count)
         else:
             for key, value in function_call_count.items():
+                if "_slow_function_call" not in function_call_counts[key]:
+                    function_call_counts[key]["_slow_function_call"] = 0
+                if "_fast_function_call" not in function_call_counts[key]:
+                    function_call_counts[key]["_fast_function_call"] = 0
                 function_call_counts[key]["_slow_function_call"] += value.get(
                     "_slow_function_call", 0
                 )

From 42c53247bd3933c83fde18d378902a76d1506c57 Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Wed, 18 Sep 2024 14:42:09 -0500
Subject: [PATCH 838/842] Use CI workflow branch 'branch-24.10' again (#16832)

All RAPIDS libraries have been updated with Python 3.12 support, so
Python 3.12 changes
have been merged into `branch-24.10` of `shared-workflows`:
https://github.com/rapidsai/shared-workflows/pull/213

This updates GitHub Actions configs here to that branch.
---
 .github/workflows/build.yaml                  | 28 +++++------
 .github/workflows/pandas-tests.yaml           |  2 +-
 .github/workflows/pr.yaml                     | 48 +++++++++----------
 .../workflows/pr_issue_status_automation.yml  |  6 +--
 .github/workflows/test.yaml                   | 24 +++++-----
 5 files changed, 54 insertions(+), 54 deletions(-)

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index d6d3e3fdd33..b5d17022a3a 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -28,7 +28,7 @@ concurrency:
 jobs:
   cpp-build:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -37,7 +37,7 @@ jobs:
   python-build:
     needs: [cpp-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -46,7 +46,7 @@ jobs:
   upload-conda:
     needs: [cpp-build, python-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.10
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -57,7 +57,7 @@ jobs:
     if: github.ref_type == 'branch'
     needs: python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
     with:
       arch: "amd64"
       branch: ${{ inputs.branch }}
@@ -69,7 +69,7 @@ jobs:
       sha: ${{ inputs.sha }}
   wheel-build-libcudf:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
     with:
       # build for every combination of arch and CUDA version, but only for the latest Python
       matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber)))
@@ -81,7 +81,7 @@ jobs:
   wheel-publish-libcudf:
     needs: wheel-build-libcudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -92,7 +92,7 @@ jobs:
   wheel-build-pylibcudf:
     needs: [wheel-publish-libcudf]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -102,7 +102,7 @@ jobs:
   wheel-publish-pylibcudf:
     needs: wheel-build-pylibcudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -113,7 +113,7 @@ jobs:
   wheel-build-cudf:
     needs: wheel-publish-pylibcudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -123,7 +123,7 @@ jobs:
   wheel-publish-cudf:
     needs: wheel-build-cudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -134,7 +134,7 @@ jobs:
   wheel-build-dask-cudf:
     needs: wheel-publish-cudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
@@ -146,7 +146,7 @@ jobs:
   wheel-publish-dask-cudf:
     needs: wheel-build-dask-cudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -157,7 +157,7 @@ jobs:
   wheel-build-cudf-polars:
     needs: wheel-publish-pylibcudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
@@ -169,7 +169,7 @@ jobs:
   wheel-publish-cudf-polars:
     needs: wheel-build-cudf-polars
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.10
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
diff --git a/.github/workflows/pandas-tests.yaml b/.github/workflows/pandas-tests.yaml
index d670132cca9..10c803f7921 100644
--- a/.github/workflows/pandas-tests.yaml
+++ b/.github/workflows/pandas-tests.yaml
@@ -17,7 +17,7 @@ jobs:
   pandas-tests:
       # run the Pandas unit tests
       secrets: inherit
-      uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
+      uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
       with:
         # This selects "ARCH=amd64 + the latest supported Python + CUDA".
         matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index d7d14ea12ff..b515dbff9f3 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -37,7 +37,7 @@ jobs:
       - pandas-tests
       - pandas-tests-diff
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.10
     if: always()
     with:
       needs: ${{ toJSON(needs) }}
@@ -104,39 +104,39 @@ jobs:
               - '!notebooks/**'
   checks:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.10
     with:
       enable_check_generated_files: false
   conda-cpp-build:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.10
     with:
       build_type: pull-request
   conda-cpp-checks:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.10
     with:
       build_type: pull-request
       enable_check_symbols: true
   conda-cpp-tests:
     needs: [conda-cpp-build, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10
     if: needs.changed-files.outputs.test_cpp == 'true'
     with:
       build_type: pull-request
   conda-python-build:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.10
     with:
       build_type: pull-request
   conda-python-cudf-tests:
     needs: [conda-python-build, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10
     if: needs.changed-files.outputs.test_python == 'true'
     with:
       build_type: pull-request
@@ -145,7 +145,7 @@ jobs:
     # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism
     needs: [conda-python-build, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10
     if: needs.changed-files.outputs.test_python == 'true'
     with:
       build_type: pull-request
@@ -153,7 +153,7 @@ jobs:
   conda-java-tests:
     needs: [conda-cpp-build, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
     if: needs.changed-files.outputs.test_java == 'true'
     with:
       build_type: pull-request
@@ -164,7 +164,7 @@ jobs:
   static-configure:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
     with:
       build_type: pull-request
       # Use the wheel container so we can skip conda solves and since our
@@ -174,7 +174,7 @@ jobs:
   conda-notebook-tests:
     needs: [conda-python-build, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
     if: needs.changed-files.outputs.test_notebooks == 'true'
     with:
       build_type: pull-request
@@ -185,7 +185,7 @@ jobs:
   docs-build:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
     with:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
@@ -195,7 +195,7 @@ jobs:
   wheel-build-libcudf:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
     with:
       # build for every combination of arch and CUDA version, but only for the latest Python
       matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber)))
@@ -204,21 +204,21 @@ jobs:
   wheel-build-pylibcudf:
     needs: [checks, wheel-build-libcudf]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
     with:
       build_type: pull-request
       script: "ci/build_wheel_pylibcudf.sh"
   wheel-build-cudf:
     needs: wheel-build-pylibcudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
     with:
       build_type: pull-request
       script: "ci/build_wheel_cudf.sh"
   wheel-tests-cudf:
     needs: [wheel-build-cudf, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
     if: needs.changed-files.outputs.test_python == 'true'
     with:
       build_type: pull-request
@@ -226,7 +226,7 @@ jobs:
   wheel-build-cudf-polars:
     needs: wheel-build-pylibcudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
@@ -235,7 +235,7 @@ jobs:
   wheel-tests-cudf-polars:
     needs: [wheel-build-cudf-polars, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
     if: needs.changed-files.outputs.test_python == 'true'
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
@@ -247,7 +247,7 @@ jobs:
   wheel-build-dask-cudf:
     needs: wheel-build-cudf
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.10
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
@@ -256,7 +256,7 @@ jobs:
   wheel-tests-dask-cudf:
     needs: [wheel-build-dask-cudf, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
     if: needs.changed-files.outputs.test_python == 'true'
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
@@ -265,7 +265,7 @@ jobs:
       script: ci/test_wheel_dask_cudf.sh
   devcontainer:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.10
     with:
       arch: '["amd64"]'
       cuda: '["12.5"]'
@@ -276,7 +276,7 @@ jobs:
   unit-tests-cudf-pandas:
     needs: [wheel-build-cudf, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
     if: needs.changed-files.outputs.test_python == 'true'
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
@@ -287,7 +287,7 @@ jobs:
     # run the Pandas unit tests using PR branch
     needs: [wheel-build-cudf, changed-files]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
     if: needs.changed-files.outputs.test_python == 'true'
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
@@ -299,7 +299,7 @@ jobs:
   pandas-tests-diff:
     # diff the results of running the Pandas unit tests and publish a job summary
     needs: pandas-tests
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
     with:
         node_type: cpu4
         build_type: pull-request
diff --git a/.github/workflows/pr_issue_status_automation.yml b/.github/workflows/pr_issue_status_automation.yml
index fe77ad4b6b2..45e5191eb54 100644
--- a/.github/workflows/pr_issue_status_automation.yml
+++ b/.github/workflows/pr_issue_status_automation.yml
@@ -23,7 +23,7 @@ on:
 
 jobs:
     get-project-id:
-      uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@python-3.12
+      uses: rapidsai/shared-workflows/.github/workflows/project-get-item-id.yaml@branch-24.10
       if: github.event.pull_request.state == 'open'
       secrets: inherit
       permissions:
@@ -34,7 +34,7 @@ jobs:
 
     update-status:
       # This job sets the PR and its linked issues to "In Progress" status
-      uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@python-3.12
+      uses: rapidsai/shared-workflows/.github/workflows/project-get-set-single-select-field.yaml@branch-24.10
       if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }}
       needs: get-project-id
       with:
@@ -50,7 +50,7 @@ jobs:
 
     update-sprint:
       # This job sets the PR and its linked issues to the current "Weekly Sprint"
-      uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@python-3.12
+      uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@branch-24.10
       if: ${{ github.event.pull_request.state == 'open' && needs.get-project-id.outputs.ITEM_PROJECT_ID != '' }}
       needs: get-project-id
       with:
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 4af6a0d690d..8605fa46f68 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -16,7 +16,7 @@ on:
 jobs:
   conda-cpp-checks:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.10
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -25,7 +25,7 @@ jobs:
       enable_check_symbols: true
   conda-cpp-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.10
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -33,7 +33,7 @@ jobs:
       sha: ${{ inputs.sha }}
   conda-cpp-memcheck-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -45,7 +45,7 @@ jobs:
       run_script: "ci/test_cpp_memcheck.sh"
   static-configure:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
     with:
       build_type: pull-request
       # Use the wheel container so we can skip conda solves and since our
@@ -54,7 +54,7 @@ jobs:
       run_script: "ci/configure_cpp_static.sh"
   conda-python-cudf-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -64,7 +64,7 @@ jobs:
   conda-python-other-tests:
     # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.10
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -73,7 +73,7 @@ jobs:
       script: "ci/test_python_other.sh"
   conda-java-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -85,7 +85,7 @@ jobs:
       run_script: "ci/test_java.sh"
   conda-notebook-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -97,7 +97,7 @@ jobs:
       run_script: "ci/test_notebooks.sh"
   wheel-tests-cudf:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -106,7 +106,7 @@ jobs:
       script: ci/test_wheel_cudf.sh
   wheel-tests-dask-cudf:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
     with:
       # This selects "ARCH=amd64 + the latest supported Python + CUDA".
       matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
@@ -117,7 +117,7 @@ jobs:
       script: ci/test_wheel_dask_cudf.sh
   unit-tests-cudf-pandas:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -126,7 +126,7 @@ jobs:
       script: ci/cudf_pandas_scripts/run_tests.sh
   third-party-integration-tests-cudf-pandas:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.10
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}

From 13890fd3abab4221e0e7537b81d9b1e88811fc6d Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Wed, 18 Sep 2024 21:47:33 +0000
Subject: [PATCH 839/842] test

---
 ci/cudf_pandas_scripts/pandas-tests/run.sh    |  2 +-
 .../cudf/pandas/scripts/conftest-patch.py     | 51 ++----------------
 .../cudf/pandas/scripts/run-pandas-tests.sh   |  6 ++-
 .../pandas/scripts/summarize-test-results.py  | 52 +++++++++----------
 4 files changed, 32 insertions(+), 79 deletions(-)

diff --git a/ci/cudf_pandas_scripts/pandas-tests/run.sh b/ci/cudf_pandas_scripts/pandas-tests/run.sh
index e5cd4436a3a..09ceba366b6 100755
--- a/ci/cudf_pandas_scripts/pandas-tests/run.sh
+++ b/ci/cudf_pandas_scripts/pandas-tests/run.sh
@@ -31,7 +31,7 @@ bash python/cudf/cudf/pandas/scripts/run-pandas-tests.sh \
   -n 5 \
   --tb=no \
   -m "not slow" \
-  --max-worker-restart=3 \
+  --max-worker-restart=0 \
   --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf-pandas.xml" \
   --dist worksteal \
   --report-log=${PANDAS_TESTS_BRANCH}.json 2>&1
diff --git a/python/cudf/cudf/pandas/scripts/conftest-patch.py b/python/cudf/cudf/pandas/scripts/conftest-patch.py
index 52a6fa89bef..05b42ecc610 100644
--- a/python/cudf/cudf/pandas/scripts/conftest-patch.py
+++ b/python/cudf/cudf/pandas/scripts/conftest-patch.py
@@ -4,7 +4,6 @@
 
 import contextlib
 import json
-import multiprocessing
 import os
 import sys
 import traceback
@@ -41,8 +40,7 @@ def patch_testing_functions():
 
 
 # Dictionary to store function call counts
-manager = multiprocessing.Manager()
-function_call_counts = defaultdict(int)  # type: ignore
+function_call_counts = {}  # type: ignore
 
 # The specific function to track
 FUNCTION_NAME = {"_slow_function_call", "_fast_function_call"}
@@ -55,14 +53,8 @@ def find_pytest_file(frame):
         if "pandas-testing/pandas-tests/tests" in file and file.rsplit("/", 1)[
             -1
         ].startswith("test_"):
-            return file
+            return str(file).rsplit("pandas-testing/", 1)[-1]
     return None
-    # new_f = frame
-    # while new_f:
-    #     if "pandas-testing/pandas-tests/tests" in new_f.f_globals.get("__file__", ""):
-    #         return os.path.abspath(new_f.f_globals.get("__file__", ""))
-    #     new_f = new_f.f_back
-    # return None
 
 
 def trace_calls(frame, event, arg):
@@ -90,43 +82,6 @@ def pytest_sessionfinish(session, exitstatus):
     sys.setprofile(None)
 
 
-# @pytest.hookimpl(tryfirst=True)
-# def pytest_runtest_setup(item):
-#     # Check if this is the first test in the file
-#     if item.nodeid.split("::")[0] != getattr(
-#         pytest_runtest_setup, "current_file", None
-#     ):
-#         # If it's a new file, reset the function call counts
-#         global function_call_counts
-#         function_call_counts = defaultdict(int)
-#         pytest_runtest_setup.current_file = item.nodeid.split("::")[0]
-
-
-# @pytest.hookimpl(trylast=True)
-# def pytest_runtest_teardown(item, nextitem):
-#     # Check if this is the last test in the file
-#     if (
-#         nextitem is None
-#         or nextitem.nodeid.split("::")[0] != item.nodeid.split("::")[0]
-#     ):
-#         # Write the function call counts to a file
-#         worker_id = os.getenv("PYTEST_XDIST_WORKER", "master")
-#         output_file = f'{item.nodeid.split("::")[0].replace("/", "__")}_{worker_id}_metrics.json'
-#         # if os.path.exists(output_file):
-#         #     output_file = f'{item.nodeid.split("::")[0].replace("/", "__")}_{worker_id}_metrics_1.json'
-#         with open(output_file, "w") as f:
-#             json.dump(dict(function_call_counts), f, indent=4)
-#         print(f"Function call counts have been written to {output_file}")
-
-
-# @pytest.hookimpl(tryfirst=True)
-# def pytest_configure(config):
-#     if hasattr(config, "workerinput"):
-#         # Running in xdist worker
-#         global function_call_counts
-#         function_call_counts = defaultdict(int)
-
-
 @pytest.hookimpl(trylast=True)
 def pytest_unconfigure(config):
     if hasattr(config, "workerinput"):
@@ -135,7 +90,7 @@ def pytest_unconfigure(config):
         output_file = f"function_call_counts_worker_{worker_id}.json"
         with open(output_file, "w") as f:
             json.dump(dict(function_call_counts), f, indent=4)
-        # print(f"Function call counts have been written to {output_file}")
+        print(f"Function call counts have been written to {output_file}")
 
 
 sys.path.append(os.path.dirname(__file__))
diff --git a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
index be83086a7dd..517b990b319 100755
--- a/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
+++ b/python/cudf/cudf/pandas/scripts/run-pandas-tests.sh
@@ -24,7 +24,8 @@ PANDAS_VERSION=$(python -c "import pandas; print(pandas.__version__)")
 
 # tests/io/test_clipboard.py::TestClipboard crashes pytest workers (possibly due to fixture patching clipboard functionality)
 PYTEST_IGNORES="--ignore=tests/io/parser/common/test_read_errors.py \
---ignore=tests/io/test_clipboard.py"
+--ignore=tests/io/test_clipboard.py \
+--ignore=tests/groupby/test_raises.py"
 
 mkdir -p pandas-testing
 cd pandas-testing
@@ -135,7 +136,8 @@ and not test_large_string_pyarrow \
 and not test_interchange_from_corrected_buffer_dtypes \
 and not test_eof_states \
 and not test_array_tz \
-and not test_groupby_raises_category"
+and not test_groupby_raises_category \
+and not test_groupby_raises_datetime"
 
 # TODO: Remove "not db" once a postgres & mysql container is set up on the CI
 PANDAS_CI="1" timeout 900m python -m pytest -p cudf.pandas \
diff --git a/python/cudf/cudf/pandas/scripts/summarize-test-results.py b/python/cudf/cudf/pandas/scripts/summarize-test-results.py
index 347118b290d..fd76c12f78e 100644
--- a/python/cudf/cudf/pandas/scripts/summarize-test-results.py
+++ b/python/cudf/cudf/pandas/scripts/summarize-test-results.py
@@ -71,35 +71,31 @@ def get_per_module_results(log_file_name):
             function_call_counts.update(function_call_count)
         else:
             for key, value in function_call_count.items():
-                function_call_counts[key]["_slow_function_call"] += value.get(
-                    "_slow_function_call", 0
-                )
-                function_call_counts[key]["_fast_function_call"] += value.get(
-                    "_fast_function_call", 0
-                )
-            # per_module_results[key]["_slow_function_call"] = (
-            #     per_module_results[key].get("_slow_function_call", 0)
-            #     + function_call_counts.get("_slow_function_call", 0)
-            # )
-            # per_module_results[key]["_fast_function_call"] = (
-            #     per_module_results[key].get("_fast_function_call", 0)
-            #     + function_call_counts.get("_fast_function_call", 0)
-            # )
+                if key not in function_call_counts:
+                    function_call_counts[key] = value
+                else:
+                    if "_slow_function_call" not in function_call_counts[key]:
+                        function_call_counts[key]["_slow_function_call"] = 0
+                    if "_fast_function_call" not in function_call_counts[key]:
+                        function_call_counts[key]["_fast_function_call"] = 0
+                    function_call_counts[key]["_slow_function_call"] += (
+                        value.get("_slow_function_call", 0)
+                    )
+                    function_call_counts[key]["_fast_function_call"] += (
+                        value.get("_fast_function_call", 0)
+                    )
+
     for key, value in per_module_results.items():
-        # processed_name = key.replace("/", "__") + "_*_metrics.json"
-        # # Assuming the directory is the same as the module name's directory
-        # directory = os.path.dirname(log_file_name)
-        # pattern = os.path.join(directory, processed_name)
-        # matching_files = glob.glob(pattern)
-        # for file in matching_files:
-        #     with open(file) as f:
-        #         function_call_counts = json.load(f)
-        per_module_results[key]["_slow_function_call"] = function_call_counts[
-            key
-        ].get("_slow_function_call", 0)
-        per_module_results[key]["_fast_function_call"] = function_call_counts[
-            key
-        ].get("_fast_function_call", 0)
+        if key in function_call_counts:
+            per_module_results[key]["_slow_function_call"] = (
+                function_call_counts[key].get("_slow_function_call", 0)
+            )
+            per_module_results[key]["_fast_function_call"] = (
+                function_call_counts[key].get("_fast_function_call", 0)
+            )
+        else:
+            per_module_results[key]["_slow_function_call"] = 0
+            per_module_results[key]["_fast_function_call"] = 0
     return per_module_results
 
 
From db1bf0bbaa9b568c4d4fd5b3a043456a1eea45af Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Wed, 18 Sep 2024 21:53:36 +0000
Subject: [PATCH 840/842] test

---
 .github/workflows/pr.yaml | 336 +++++++++++++++++++-------------------
 1 file changed, 168 insertions(+), 168 deletions(-)

diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index d7d14ea12ff..2e1c8f2d2fb 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -14,26 +14,26 @@ jobs:
     needs:
       - changed-files
       - checks
-      - conda-cpp-build
-      - conda-cpp-checks
-      - conda-cpp-tests
-      - conda-python-build
-      - conda-python-cudf-tests
-      - conda-python-other-tests
-      - conda-java-tests
+      # - conda-cpp-build
+      # - conda-cpp-checks
+      # - conda-cpp-tests
+      # - conda-python-build
+      # - conda-python-cudf-tests
+      # - conda-python-other-tests
+      # - conda-java-tests
       - static-configure
-      - conda-notebook-tests
-      - docs-build
+      # - conda-notebook-tests
+      # - docs-build
       - wheel-build-libcudf
       - wheel-build-pylibcudf
       - wheel-build-cudf
-      - wheel-tests-cudf
-      - wheel-build-cudf-polars
-      - wheel-tests-cudf-polars
-      - wheel-build-dask-cudf
-      - wheel-tests-dask-cudf
+      # - wheel-tests-cudf
+      # - wheel-build-cudf-polars
+      # - wheel-tests-cudf-polars
+      # - wheel-build-dask-cudf
+      # - wheel-tests-dask-cudf
       - devcontainer
-      - unit-tests-cudf-pandas
+      # - unit-tests-cudf-pandas
       - pandas-tests
       - pandas-tests-diff
     secrets: inherit
@@ -107,91 +107,91 @@ jobs:
     uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@python-3.12
     with:
       enable_check_generated_files: false
-  conda-cpp-build:
-    needs: checks
-    secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@python-3.12
-    with:
-      build_type: pull-request
-  conda-cpp-checks:
-    needs: conda-cpp-build
-    secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@python-3.12
-    with:
-      build_type: pull-request
-      enable_check_symbols: true
-  conda-cpp-tests:
-    needs: [conda-cpp-build, changed-files]
-    secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@python-3.12
-    if: needs.changed-files.outputs.test_cpp == 'true'
-    with:
-      build_type: pull-request
-  conda-python-build:
-    needs: conda-cpp-build
-    secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@python-3.12
-    with:
-      build_type: pull-request
-  conda-python-cudf-tests:
-    needs: [conda-python-build, changed-files]
-    secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12
-    if: needs.changed-files.outputs.test_python == 'true'
-    with:
-      build_type: pull-request
-      script: "ci/test_python_cudf.sh"
-  conda-python-other-tests:
-    # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism
-    needs: [conda-python-build, changed-files]
-    secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12
-    if: needs.changed-files.outputs.test_python == 'true'
-    with:
-      build_type: pull-request
-      script: "ci/test_python_other.sh"
-  conda-java-tests:
-    needs: [conda-cpp-build, changed-files]
-    secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
-    if: needs.changed-files.outputs.test_java == 'true'
-    with:
-      build_type: pull-request
-      node_type: "gpu-v100-latest-1"
-      arch: "amd64"
-      container_image: "rapidsai/ci-conda:latest"
-      run_script: "ci/test_java.sh"
-  static-configure:
-    needs: checks
-    secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
-    with:
-      build_type: pull-request
-      # Use the wheel container so we can skip conda solves and since our
-      # primary static consumers (Spark) are not in conda anyway.
-      container_image: "rapidsai/ci-wheel:latest"
-      run_script: "ci/configure_cpp_static.sh"
-  conda-notebook-tests:
-    needs: [conda-python-build, changed-files]
-    secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
-    if: needs.changed-files.outputs.test_notebooks == 'true'
-    with:
-      build_type: pull-request
-      node_type: "gpu-v100-latest-1"
-      arch: "amd64"
-      container_image: "rapidsai/ci-conda:latest"
-      run_script: "ci/test_notebooks.sh"
-  docs-build:
-    needs: conda-python-build
-    secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
-    with:
-      build_type: pull-request
-      node_type: "gpu-v100-latest-1"
-      arch: "amd64"
-      container_image: "rapidsai/ci-conda:latest"
-      run_script: "ci/build_docs.sh"
+  # conda-cpp-build:
+  #   needs: checks
+  #   secrets: inherit
+  #   uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@python-3.12
+  #   with:
+  #     build_type: pull-request
+  # conda-cpp-checks:
+  #   needs: conda-cpp-build
+  #   secrets: inherit
+  #   uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@python-3.12
+  #   with:
+  #     build_type: pull-request
+  #     enable_check_symbols: true
+  # conda-cpp-tests:
+  #   needs: [conda-cpp-build, changed-files]
+  #   secrets: inherit
+  #   uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@python-3.12
+  #   if: needs.changed-files.outputs.test_cpp == 'true'
+  #   with:
+  #     build_type: pull-request
+  # conda-python-build:
+  #   needs: conda-cpp-build
+  #   secrets: inherit
+  #   uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@python-3.12
+  #   with:
+  #     build_type: pull-request
+  # conda-python-cudf-tests:
+  #   needs: [conda-python-build, changed-files]
+  #   secrets: inherit
+  #   uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12
+  #   if: needs.changed-files.outputs.test_python == 'true'
+  #   with:
+  #     build_type: pull-request
+  #     script: "ci/test_python_cudf.sh"
+  # conda-python-other-tests:
+  #   # Tests for dask_cudf, custreamz, cudf_kafka are separated for CI parallelism
+  #   needs: [conda-python-build, changed-files]
+  #   secrets: inherit
+  #   uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@python-3.12
+  #   if: needs.changed-files.outputs.test_python == 'true'
+  #   with:
+  #     build_type: pull-request
+  #     script: "ci/test_python_other.sh"
+  # conda-java-tests:
+  #   needs: [conda-cpp-build, changed-files]
+  #   secrets: inherit
+  #   uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
+  #   if: needs.changed-files.outputs.test_java == 'true'
+  #   with:
+  #     build_type: pull-request
+  #     node_type: "gpu-v100-latest-1"
+  #     arch: "amd64"
+  #     container_image: "rapidsai/ci-conda:latest"
+  #     run_script: "ci/test_java.sh"
+  # static-configure:
+  #   needs: checks
+  #   secrets: inherit
+  #   uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
+  #   with:
+  #     build_type: pull-request
+  #     # Use the wheel container so we can skip conda solves and since our
+  #     # primary static consumers (Spark) are not in conda anyway.
+  #     container_image: "rapidsai/ci-wheel:latest"
+  #     run_script: "ci/configure_cpp_static.sh"
+  # conda-notebook-tests:
+  #   needs: [conda-python-build, changed-files]
+  #   secrets: inherit
+  #   uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
+  #   if: needs.changed-files.outputs.test_notebooks == 'true'
+  #   with:
+  #     build_type: pull-request
+  #     node_type: "gpu-v100-latest-1"
+  #     arch: "amd64"
+  #     container_image: "rapidsai/ci-conda:latest"
+  #     run_script: "ci/test_notebooks.sh"
+  # docs-build:
+  #   needs: conda-python-build
+  #   secrets: inherit
+  #   uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@python-3.12
+  #   with:
+  #     build_type: pull-request
+  #     node_type: "gpu-v100-latest-1"
+  #     arch: "amd64"
+  #     container_image: "rapidsai/ci-conda:latest"
+  #     run_script: "ci/build_docs.sh"
   wheel-build-libcudf:
     needs: checks
     secrets: inherit
@@ -215,74 +215,74 @@ jobs:
     with:
       build_type: pull-request
       script: "ci/build_wheel_cudf.sh"
-  wheel-tests-cudf:
-    needs: [wheel-build-cudf, changed-files]
-    secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
-    if: needs.changed-files.outputs.test_python == 'true'
-    with:
-      build_type: pull-request
-      script: ci/test_wheel_cudf.sh
-  wheel-build-cudf-polars:
-    needs: wheel-build-pylibcudf
-    secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
-    with:
-      # This selects "ARCH=amd64 + the latest supported Python + CUDA".
-      matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
-      build_type: pull-request
-      script: "ci/build_wheel_cudf_polars.sh"
-  wheel-tests-cudf-polars:
-    needs: [wheel-build-cudf-polars, changed-files]
-    secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
-    if: needs.changed-files.outputs.test_python == 'true'
-    with:
-      # This selects "ARCH=amd64 + the latest supported Python + CUDA".
-      matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
-      build_type: pull-request
-      # This always runs, but only fails if this PR touches code in
-      # pylibcudf or cudf_polars
-      script: "ci/test_wheel_cudf_polars.sh"
-  wheel-build-dask-cudf:
-    needs: wheel-build-cudf
-    secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
-    with:
-      # This selects "ARCH=amd64 + the latest supported Python + CUDA".
-      matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
-      build_type: pull-request
-      script: "ci/build_wheel_dask_cudf.sh"
-  wheel-tests-dask-cudf:
-    needs: [wheel-build-dask-cudf, changed-files]
-    secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
-    if: needs.changed-files.outputs.test_python == 'true'
-    with:
-      # This selects "ARCH=amd64 + the latest supported Python + CUDA".
-      matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
-      build_type: pull-request
-      script: ci/test_wheel_dask_cudf.sh
-  devcontainer:
-    secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@python-3.12
-    with:
-      arch: '["amd64"]'
-      cuda: '["12.5"]'
-      build_command: |
-        sccache -z;
-        build-all -DBUILD_BENCHMARKS=ON --verbose;
-        sccache -s;
-  unit-tests-cudf-pandas:
-    needs: [wheel-build-cudf, changed-files]
-    secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
-    if: needs.changed-files.outputs.test_python == 'true'
-    with:
-      # This selects "ARCH=amd64 + the latest supported Python + CUDA".
-      matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
-      build_type: pull-request
-      script: ci/cudf_pandas_scripts/run_tests.sh
+  # wheel-tests-cudf:
+  #   needs: [wheel-build-cudf, changed-files]
+  #   secrets: inherit
+  #   uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
+  #   if: needs.changed-files.outputs.test_python == 'true'
+  #   with:
+  #     build_type: pull-request
+  #     script: ci/test_wheel_cudf.sh
+  # wheel-build-cudf-polars:
+  #   needs: wheel-build-pylibcudf
+  #   secrets: inherit
+  #   uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
+  #   with:
+  #     # This selects "ARCH=amd64 + the latest supported Python + CUDA".
+  #     matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
+  #     build_type: pull-request
+  #     script: "ci/build_wheel_cudf_polars.sh"
+  # wheel-tests-cudf-polars:
+  #   needs: [wheel-build-cudf-polars, changed-files]
+  #   secrets: inherit
+  #   uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
+  #   if: needs.changed-files.outputs.test_python == 'true'
+  #   with:
+  #     # This selects "ARCH=amd64 + the latest supported Python + CUDA".
+  #     matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
+  #     build_type: pull-request
+  #     # This always runs, but only fails if this PR touches code in
+  #     # pylibcudf or cudf_polars
+  #     script: "ci/test_wheel_cudf_polars.sh"
+  # wheel-build-dask-cudf:
+  #   needs: wheel-build-cudf
+  #   secrets: inherit
+  #   uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@python-3.12
+  #   with:
+  #     # This selects "ARCH=amd64 + the latest supported Python + CUDA".
+  #     matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
+  #     build_type: pull-request
+  #     script: "ci/build_wheel_dask_cudf.sh"
+  # wheel-tests-dask-cudf:
+  #   needs: [wheel-build-dask-cudf, changed-files]
+  #   secrets: inherit
+  #   uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
+  #   if: needs.changed-files.outputs.test_python == 'true'
+  #   with:
+  #     # This selects "ARCH=amd64 + the latest supported Python + CUDA".
+  #     matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
+  #     build_type: pull-request
+  #     script: ci/test_wheel_dask_cudf.sh
+  # devcontainer:
+  #   secrets: inherit
+  #   uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@python-3.12
+  #   with:
+  #     arch: '["amd64"]'
+  #     cuda: '["12.5"]'
+  #     build_command: |
+  #       sccache -z;
+  #       build-all -DBUILD_BENCHMARKS=ON --verbose;
+  #       sccache -s;
+  # unit-tests-cudf-pandas:
+  #   needs: [wheel-build-cudf, changed-files]
+  #   secrets: inherit
+  #   uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@python-3.12
+  #   if: needs.changed-files.outputs.test_python == 'true'
+  #   with:
+  #     # This selects "ARCH=amd64 + the latest supported Python + CUDA".
+  #     matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))]))
+  #     build_type: pull-request
+  #     script: ci/cudf_pandas_scripts/run_tests.sh
   pandas-tests:
     # run the Pandas unit tests using PR branch
     needs: [wheel-build-cudf, changed-files]

From fe2611cb73a26ec5cb19feb63060acfda4f62529 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Wed, 18 Sep 2024 21:56:49 +0000
Subject: [PATCH 841/842] test

---
 .github/workflows/pr.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 840451d2e25..80c50514897 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -21,7 +21,7 @@ jobs:
       # - conda-python-cudf-tests
       # - conda-python-other-tests
       # - conda-java-tests
-      - static-configure
+      # - static-configure
       # - conda-notebook-tests
       # - docs-build
       - wheel-build-libcudf
@@ -32,7 +32,7 @@ jobs:
       # - wheel-tests-cudf-polars
       # - wheel-build-dask-cudf
       # - wheel-tests-dask-cudf
-      - devcontainer
+      # - devcontainer
       # - unit-tests-cudf-pandas
       - pandas-tests
       - pandas-tests-diff

From b4ce6adb4eb28b46c0f3469b6cedb9fffca70b45 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Wed, 18 Sep 2024 18:24:02 -0500
Subject: [PATCH 842/842] Update
 python/cudf/cudf/pandas/scripts/conftest-patch.py

---
 python/cudf/cudf/pandas/scripts/conftest-patch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/pandas/scripts/conftest-patch.py b/python/cudf/cudf/pandas/scripts/conftest-patch.py
index 05b42ecc610..aefd640fb5c 100644
--- a/python/cudf/cudf/pandas/scripts/conftest-patch.py
+++ b/python/cudf/cudf/pandas/scripts/conftest-patch.py
@@ -53,7 +53,7 @@ def find_pytest_file(frame):
         if "pandas-testing/pandas-tests/tests" in file and file.rsplit("/", 1)[
             -1
         ].startswith("test_"):
-            return str(file).rsplit("pandas-testing/", 1)[-1]
+            return str(file).rsplit("pandas-tests/", 1)[-1]
     return None